{ "schemaVersion": 1, "deviceProperties": [ { "id": 0, "name": "NVIDIA A800 80GB PCIe", "totalGlobalMem": 84987740160, "computeMajor": 8, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 167936, "numSms": 108, "sharedMemPerBlockOptin": 166912 }, { "id": 1, "name": "NVIDIA A800 80GB PCIe", "totalGlobalMem": 84987740160, "computeMajor": 8, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 167936, "numSms": 108, "sharedMemPerBlockOptin": 166912 }, { "id": 2, "name": "NVIDIA A800 80GB PCIe", "totalGlobalMem": 84987740160, "computeMajor": 8, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 167936, "numSms": 108, "sharedMemPerBlockOptin": 166912 }, { "id": 3, "name": "NVIDIA A800 80GB PCIe", "totalGlobalMem": 84987740160, "computeMajor": 8, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 167936, "numSms": 108, "sharedMemPerBlockOptin": 166912 }, { "id": 4, "name": "Tesla V100S-PCIE-32GB", "totalGlobalMem": 34079899648, "computeMajor": 7, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, "numSms": 80, "sharedMemPerBlockOptin": 98304 }, { "id": 5, "name": "Tesla V100S-PCIE-32GB", "totalGlobalMem": 34079899648, "computeMajor": 7, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, "numSms": 80, "sharedMemPerBlockOptin": 98304 }, { "id": 6, "name": "Tesla V100S-PCIE-32GB", "totalGlobalMem": 34079899648, "computeMajor": 7, "computeMinor": 0, "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, "numSms": 80, "sharedMemPerBlockOptin": 98304 } ], "traceEvents": [ { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454215754427, "dur": 7, "args": { "External id": 8, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 8, "pid": 5, "tid": 7, "ts": 1716454215754427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454215754164, "dur": 267, "args": { "External id": 8, "cbid": 211, "correlation": 8 } }, { "ph": "s", "id": 8, "pid": 76337, "tid": -914061504, "ts": 1716454215754164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454215754750, "dur": 5, "args": { "External id": 19, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.15, "warps per SM": 0.6, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 19, "pid": 5, "tid": 7, "ts": 1716454215754750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454215754668, "dur": 83, "args": { "External id": 19, "cbid": 211, "correlation": 19 } }, { "ph": "s", "id": 19, "pid": 76337, "tid": -914061504, "ts": 1716454215754668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454215755400, "dur": 4, "args": { "External id": 29, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29, "registers per thread": 26, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29, "pid": 5, "tid": 7, "ts": 1716454215755400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454215755286, "dur": 114, "args": { "External id": 29, "cbid": 211, "correlation": 29 } }, { "ph": "s", "id": 29, "pid": 76337, "tid": -914061504, "ts": 1716454215755286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454215779029, "dur": 1, "args": { "External id": 39, "device": 5, "context": 1, "stream": 7, "correlation": 39, "bytes": 160, "memory bandwidth (GB/s)": 0.08064516129032258 } }, { "ph": "f", "id": 39, "pid": 5, "tid": 7, "ts": 1716454215779029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454215778938, "dur": 91, "args": { "External id": 39, "cbid": 41, "correlation": 39 } }, { "ph": "s", "id": 39, "pid": 76337, "tid": -914061504, "ts": 1716454215778938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454215779031, "dur": 20, "args": { "External id": 40, "cbid": 131, "correlation": 40 } }, { "ph": "f", "id": 40, "pid": 76337, "tid": -914061504, "ts": 1716454215779031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454215780009, "dur": 94, "args": { "External id": 54, "device": 5, "context": 1, "stream": 7, "correlation": 54, "bytes": 1179648, "memory bandwidth (GB/s)": 12.53464525932144 } }, { "ph": "f", "id": 54, "pid": 5, "tid": 7, "ts": 1716454215780009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454215779738, "dur": 286, "args": { "External id": 54, "cbid": 41, "correlation": 54 } }, { "ph": "s", "id": 54, "pid": 76337, "tid": -914061504, "ts": 1716454215779738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454215780025, "dur": 85, "args": { "External id": 55, "cbid": 131, "correlation": 55 } }, { "ph": "f", "id": 55, "pid": 76337, "tid": -914061504, "ts": 1716454215780025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454215780178, "dur": 7, "args": { "External id": 59, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 59, "pid": 5, "tid": 7, "ts": 1716454215780178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454215780131, "dur": 49, "args": { "External id": 59, "cbid": 211, "correlation": 59 } }, { "ph": "s", "id": 59, "pid": 76337, "tid": -914061504, "ts": 1716454215780131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454215780719, "dur": 2, "args": { "External id": 68, "cbid": 317, "correlation": 68 } }, { "ph": "f", "id": 68, "pid": 76337, "tid": -914061504, "ts": 1716454215780719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454215780722, "dur": 221920, "args": { "External id": 69, "cbid": 20, "correlation": 69 } }, { "ph": "f", "id": 69, "pid": 76337, "tid": -914061504, "ts": 1716454215780722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216002883, "dur": 6, "args": { "External id": 78, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0875, "warps per SM": 0.35, "grid": [7, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 78, "pid": 5, "tid": 7, "ts": 1716454216002883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216002804, "dur": 83, "args": { "External id": 78, "cbid": 211, "correlation": 78 } }, { "ph": "s", "id": 78, "pid": 76337, "tid": -914061504, "ts": 1716454216002804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216002924, "dur": 1, "args": { "External id": 85, "cbid": 317, "correlation": 85 } }, { "ph": "f", "id": 85, "pid": 76337, "tid": -914061504, "ts": 1716454216002924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216002927, "dur": 1, "args": { "External id": 86, "cbid": 203, "correlation": 86 } }, { "ph": "f", "id": 86, "pid": 76337, "tid": -914061504, "ts": 1716454216002927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216002929, "dur": 1, "args": { "External id": 87, "cbid": 205, "correlation": 87 } }, { "ph": "f", "id": 87, "pid": 76337, "tid": -914061504, "ts": 1716454216002929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_tensor8x8x4_aligna2_alignc8_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216008035, "dur": 190, "args": { "External id": 91, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91, "registers per thread": 111, "shared memory": 4096, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [4, 1536, 1], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 91, "pid": 5, "tid": 7, "ts": 1716454216008035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216007881, "dur": 155, "args": { "External id": 91, "cbid": 211, "correlation": 91 } }, { "ph": "s", "id": 91, "pid": 76337, "tid": -914061504, "ts": 1716454216007881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216008851, "dur": 178, "args": { "External id": 97, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97, "pid": 5, "tid": 7, "ts": 1716454216008851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216008756, "dur": 92, "args": { "External id": 97, "cbid": 211, "correlation": 97 } }, { "ph": "s", "id": 97, "pid": 76337, "tid": -914061504, "ts": 1716454216008756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216009361, "dur": 2, "args": { "External id": 105, "cbid": 317, "correlation": 105 } }, { "ph": "f", "id": 105, "pid": 76337, "tid": -914061504, "ts": 1716454216009361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216009364, "dur": 394997, "args": { "External id": 106, "cbid": 20, "correlation": 106 } }, { "ph": "f", "id": 106, "pid": 76337, "tid": -914061504, "ts": 1716454216009364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216404536, "dur": 1802, "args": { "External id": 111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111, "pid": 5, "tid": 7, "ts": 1716454216404536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216404465, "dur": 76, "args": { "External id": 111, "cbid": 211, "correlation": 111 } }, { "ph": "s", "id": 111, "pid": 76337, "tid": -914061504, "ts": 1716454216404465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216404651, "dur": 2, "args": { "External id": 121, "cbid": 317, "correlation": 121 } }, { "ph": "f", "id": 121, "pid": 76337, "tid": -914061504, "ts": 1716454216404651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216404654, "dur": 16045, "args": { "External id": 122, "cbid": 20, "correlation": 122 } }, { "ph": "f", "id": 122, "pid": 76337, "tid": -914061504, "ts": 1716454216404654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216420919, "dur": 703, "args": { "External id": 136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 136, "pid": 5, "tid": 7, "ts": 1716454216420919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216420761, "dur": 160, "args": { "External id": 136, "cbid": 211, "correlation": 136 } }, { "ph": "s", "id": 136, "pid": 76337, "tid": -914061504, "ts": 1716454216420761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216421629, "dur": 5, "args": { "External id": 148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 148, "pid": 5, "tid": 7, "ts": 1716454216421629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216420955, "dur": 75, "args": { "External id": 148, "cbid": 211, "correlation": 148 } }, { "ph": "s", "id": 148, "pid": 76337, "tid": -914061504, "ts": 1716454216420955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216421644, "dur": 176, "args": { "External id": 151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151, "pid": 5, "tid": 7, "ts": 1716454216421644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216421073, "dur": 78, "args": { "External id": 151, "cbid": 211, "correlation": 151 } }, { "ph": "s", "id": 151, "pid": 76337, "tid": -914061504, "ts": 1716454216421073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216421828, "dur": 108, "args": { "External id": 160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160, "pid": 5, "tid": 7, "ts": 1716454216421828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216421444, "dur": 77, "args": { "External id": 160, "cbid": 211, "correlation": 160 } }, { "ph": "s", "id": 160, "pid": 76337, "tid": -914061504, "ts": 1716454216421444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216421664, "dur": 0, "args": { "External id": 170, "cbid": 317, "correlation": 170 } }, { "ph": "f", "id": 170, "pid": 76337, "tid": -914061504, "ts": 1716454216421664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216421665, "dur": 2, "args": { "External id": 171, "cbid": 203, "correlation": 171 } }, { "ph": "f", "id": 171, "pid": 76337, "tid": -914061504, "ts": 1716454216421665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216421668, "dur": 1, "args": { "External id": 172, "cbid": 205, "correlation": 172 } }, { "ph": "f", "id": 172, "pid": 76337, "tid": -914061504, "ts": 1716454216421668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216425170, "dur": 4, "args": { "External id": 178, "cbid": 317, "correlation": 178 } }, { "ph": "f", "id": 178, "pid": 76337, "tid": -914061504, "ts": 1716454216425170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216425175, "dur": 6658, "args": { "External id": 179, "cbid": 20, "correlation": 179 } }, { "ph": "f", "id": 179, "pid": 76337, "tid": -914061504, "ts": 1716454216425175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216431989, "dur": 144, "args": { "External id": 180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180, "pid": 5, "tid": 7, "ts": 1716454216431989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216431881, "dur": 111, "args": { "External id": 180, "cbid": 211, "correlation": 180 } }, { "ph": "s", "id": 180, "pid": 76337, "tid": -914061504, "ts": 1716454216431881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216432135, "dur": 6, "args": { "External id": 182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182, "pid": 5, "tid": 7, "ts": 1716454216432135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216431996, "dur": 8, "args": { "External id": 182, "cbid": 211, "correlation": 182 } }, { "ph": "s", "id": 182, "pid": 76337, "tid": -914061504, "ts": 1716454216431996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216432142, "dur": 6, "args": { "External id": 184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184, "pid": 5, "tid": 7, "ts": 1716454216432142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216432020, "dur": 12, "args": { "External id": 184, "cbid": 211, "correlation": 184 } }, { "ph": "s", "id": 184, "pid": 76337, "tid": -914061504, "ts": 1716454216432020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216432036, "dur": 0, "args": { "External id": 185, "cbid": 51, "correlation": 185 } }, { "ph": "s", "id": 185, "pid": 76337, "tid": -914061504, "ts": 1716454216432036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216432158, "dur": 818, "args": { "External id": 186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186, "pid": 5, "tid": 7, "ts": 1716454216432158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216432038, "dur": 103, "args": { "External id": 186, "cbid": 211, "correlation": 186 } }, { "ph": "s", "id": 186, "pid": 76337, "tid": -914061504, "ts": 1716454216432038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216432978, "dur": 178, "args": { "External id": 191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191, "pid": 5, "tid": 7, "ts": 1716454216432978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216432910, "dur": 14, "args": { "External id": 191, "cbid": 211, "correlation": 191 } }, { "ph": "s", "id": 191, "pid": 76337, "tid": -914061504, "ts": 1716454216432910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216433162, "dur": 708, "args": { "External id": 211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 211, "pid": 5, "tid": 7, "ts": 1716454216433162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433143, "dur": 17, "args": { "External id": 211, "cbid": 211, "correlation": 211 } }, { "ph": "s", "id": 211, "pid": 76337, "tid": -914061504, "ts": 1716454216433143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216433871, "dur": 5, "args": { "External id": 223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 223, "pid": 5, "tid": 7, "ts": 1716454216433871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433173, "dur": 8, "args": { "External id": 223, "cbid": 211, "correlation": 223 } }, { "ph": "s", "id": 223, "pid": 76337, "tid": -914061504, "ts": 1716454216433173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216433878, "dur": 178, "args": { "External id": 226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226, "pid": 5, "tid": 7, "ts": 1716454216433878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433209, "dur": 8, "args": { "External id": 226, "cbid": 211, "correlation": 226 } }, { "ph": "s", "id": 226, "pid": 76337, "tid": -914061504, "ts": 1716454216433209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216434057, "dur": 109, "args": { "External id": 235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235, "pid": 5, "tid": 7, "ts": 1716454216434057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433271, "dur": 11, "args": { "External id": 235, "cbid": 211, "correlation": 235 } }, { "ph": "s", "id": 235, "pid": 76337, "tid": -914061504, "ts": 1716454216433271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216433446, "dur": 0, "args": { "External id": 245, "cbid": 317, "correlation": 245 } }, { "ph": "f", "id": 245, "pid": 76337, "tid": -914061504, "ts": 1716454216433446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216433448, "dur": 0, "args": { "External id": 246, "cbid": 203, "correlation": 246 } }, { "ph": "f", "id": 246, "pid": 76337, "tid": -914061504, "ts": 1716454216433448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216433449, "dur": 0, "args": { "External id": 247, "cbid": 205, "correlation": 247 } }, { "ph": "f", "id": 247, "pid": 76337, "tid": -914061504, "ts": 1716454216433449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216434167, "dur": 147, "args": { "External id": 251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251, "pid": 5, "tid": 7, "ts": 1716454216434167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433465, "dur": 14, "args": { "External id": 251, "cbid": 211, "correlation": 251 } }, { "ph": "s", "id": 251, "pid": 76337, "tid": -914061504, "ts": 1716454216433465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216434316, "dur": 6, "args": { "External id": 253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 253, "pid": 5, "tid": 7, "ts": 1716454216434316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433482, "dur": 5, "args": { "External id": 253, "cbid": 211, "correlation": 253 } }, { "ph": "s", "id": 253, "pid": 76337, "tid": -914061504, "ts": 1716454216433482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216434323, "dur": 5, "args": { "External id": 255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255, "pid": 5, "tid": 7, "ts": 1716454216434323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433492, "dur": 6, "args": { "External id": 255, "cbid": 211, "correlation": 255 } }, { "ph": "s", "id": 255, "pid": 76337, "tid": -914061504, "ts": 1716454216433492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216433501, "dur": 0, "args": { "External id": 256, "cbid": 51, "correlation": 256 } }, { "ph": "s", "id": 256, "pid": 76337, "tid": -914061504, "ts": 1716454216433501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216434330, "dur": 816, "args": { "External id": 257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257, "pid": 5, "tid": 7, "ts": 1716454216434330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433502, "dur": 5, "args": { "External id": 257, "cbid": 211, "correlation": 257 } }, { "ph": "s", "id": 257, "pid": 76337, "tid": -914061504, "ts": 1716454216433502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216435147, "dur": 176, "args": { "External id": 262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262, "pid": 5, "tid": 7, "ts": 1716454216435147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433531, "dur": 10, "args": { "External id": 262, "cbid": 211, "correlation": 262 } }, { "ph": "s", "id": 262, "pid": 76337, "tid": -914061504, "ts": 1716454216433531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216435324, "dur": 751, "args": { "External id": 270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270, "pid": 5, "tid": 7, "ts": 1716454216435324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433578, "dur": 9, "args": { "External id": 270, "cbid": 211, "correlation": 270 } }, { "ph": "s", "id": 270, "pid": 76337, "tid": -914061504, "ts": 1716454216433578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216436082, "dur": 103, "args": { "External id": 278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278, "pid": 5, "tid": 7, "ts": 1716454216436082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433692, "dur": 107, "args": { "External id": 278, "cbid": 211, "correlation": 278 } }, { "ph": "s", "id": 278, "pid": 76337, "tid": -914061504, "ts": 1716454216433692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216436186, "dur": 1802, "args": { "External id": 288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288, "pid": 5, "tid": 7, "ts": 1716454216436186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433897, "dur": 16, "args": { "External id": 288, "cbid": 211, "correlation": 288 } }, { "ph": "s", "id": 288, "pid": 76337, "tid": -914061504, "ts": 1716454216433897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216437990, "dur": 704, "args": { "External id": 309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 309, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 309, "pid": 5, "tid": 7, "ts": 1716454216437990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433938, "dur": 8, "args": { "External id": 309, "cbid": 211, "correlation": 309 } }, { "ph": "s", "id": 309, "pid": 76337, "tid": -914061504, "ts": 1716454216433938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216438695, "dur": 5, "args": { "External id": 321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 321, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 321, "pid": 5, "tid": 7, "ts": 1716454216438695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433964, "dur": 8, "args": { "External id": 321, "cbid": 211, "correlation": 321 } }, { "ph": "s", "id": 321, "pid": 76337, "tid": -914061504, "ts": 1716454216433964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216438701, "dur": 176, "args": { "External id": 324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 324, "pid": 5, "tid": 7, "ts": 1716454216438701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216433995, "dur": 7, "args": { "External id": 324, "cbid": 211, "correlation": 324 } }, { "ph": "s", "id": 324, "pid": 76337, "tid": -914061504, "ts": 1716454216433995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216438879, "dur": 110, "args": { "External id": 333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 333, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 333, "pid": 5, "tid": 7, "ts": 1716454216438879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216434038, "dur": 11, "args": { "External id": 333, "cbid": 211, "correlation": 333 } }, { "ph": "s", "id": 333, "pid": 76337, "tid": -914061504, "ts": 1716454216434038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216434110, "dur": 0, "args": { "External id": 343, "cbid": 317, "correlation": 343 } }, { "ph": "f", "id": 343, "pid": 76337, "tid": -914061504, "ts": 1716454216434110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216434111, "dur": 0, "args": { "External id": 344, "cbid": 203, "correlation": 344 } }, { "ph": "f", "id": 344, "pid": 76337, "tid": -914061504, "ts": 1716454216434111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216434111, "dur": 0, "args": { "External id": 345, "cbid": 205, "correlation": 345 } }, { "ph": "f", "id": 345, "pid": 76337, "tid": -914061504, "ts": 1716454216434111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216434119, "dur": 1, "args": { "External id": 351, "cbid": 317, "correlation": 351 } }, { "ph": "f", "id": 351, "pid": 76337, "tid": -914061504, "ts": 1716454216434119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216434121, "dur": 11381, "args": { "External id": 352, "cbid": 20, "correlation": 352 } }, { "ph": "f", "id": 352, "pid": 76337, "tid": -914061504, "ts": 1716454216434121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216445537, "dur": 141, "args": { "External id": 353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 353, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 353, "pid": 5, "tid": 7, "ts": 1716454216445537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445518, "dur": 19, "args": { "External id": 353, "cbid": 211, "correlation": 353 } }, { "ph": "s", "id": 353, "pid": 76337, "tid": -914061504, "ts": 1716454216445518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216445680, "dur": 6, "args": { "External id": 355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 355, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 355, "pid": 5, "tid": 7, "ts": 1716454216445680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445540, "dur": 6, "args": { "External id": 355, "cbid": 211, "correlation": 355 } }, { "ph": "s", "id": 355, "pid": 76337, "tid": -914061504, "ts": 1716454216445540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216445688, "dur": 5, "args": { "External id": 357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 357, "pid": 5, "tid": 7, "ts": 1716454216445688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445550, "dur": 6, "args": { "External id": 357, "cbid": 211, "correlation": 357 } }, { "ph": "s", "id": 357, "pid": 76337, "tid": -914061504, "ts": 1716454216445550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216445559, "dur": 0, "args": { "External id": 358, "cbid": 51, "correlation": 358 } }, { "ph": "s", "id": 358, "pid": 76337, "tid": -914061504, "ts": 1716454216445559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216445695, "dur": 822, "args": { "External id": 359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 359, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 359, "pid": 5, "tid": 7, "ts": 1716454216445695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445560, "dur": 5, "args": { "External id": 359, "cbid": 211, "correlation": 359 } }, { "ph": "s", "id": 359, "pid": 76337, "tid": -914061504, "ts": 1716454216445560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216446518, "dur": 178, "args": { "External id": 364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 364, "pid": 5, "tid": 7, "ts": 1716454216446518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445595, "dur": 10, "args": { "External id": 364, "cbid": 211, "correlation": 364 } }, { "ph": "s", "id": 364, "pid": 76337, "tid": -914061504, "ts": 1716454216445595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216446697, "dur": 713, "args": { "External id": 384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 384, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 384, "pid": 5, "tid": 7, "ts": 1716454216446697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445706, "dur": 13, "args": { "External id": 384, "cbid": 211, "correlation": 384 } }, { "ph": "s", "id": 384, "pid": 76337, "tid": -914061504, "ts": 1716454216445706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216447412, "dur": 4, "args": { "External id": 396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 396, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 396, "pid": 5, "tid": 7, "ts": 1716454216447412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445729, "dur": 7, "args": { "External id": 396, "cbid": 211, "correlation": 396 } }, { "ph": "s", "id": 396, "pid": 76337, "tid": -914061504, "ts": 1716454216445729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216447418, "dur": 177, "args": { "External id": 399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 399, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 399, "pid": 5, "tid": 7, "ts": 1716454216447418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445750, "dur": 7, "args": { "External id": 399, "cbid": 211, "correlation": 399 } }, { "ph": "s", "id": 399, "pid": 76337, "tid": -914061504, "ts": 1716454216445750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216447596, "dur": 109, "args": { "External id": 408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 408, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 408, "pid": 5, "tid": 7, "ts": 1716454216447596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445791, "dur": 10, "args": { "External id": 408, "cbid": 211, "correlation": 408 } }, { "ph": "s", "id": 408, "pid": 76337, "tid": -914061504, "ts": 1716454216445791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216445883, "dur": 0, "args": { "External id": 418, "cbid": 317, "correlation": 418 } }, { "ph": "f", "id": 418, "pid": 76337, "tid": -914061504, "ts": 1716454216445883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216445884, "dur": 0, "args": { "External id": 419, "cbid": 203, "correlation": 419 } }, { "ph": "f", "id": 419, "pid": 76337, "tid": -914061504, "ts": 1716454216445884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216445885, "dur": 0, "args": { "External id": 420, "cbid": 205, "correlation": 420 } }, { "ph": "f", "id": 420, "pid": 76337, "tid": -914061504, "ts": 1716454216445885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216447707, "dur": 134, "args": { "External id": 424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 424, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 424, "pid": 5, "tid": 7, "ts": 1716454216447707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445898, "dur": 13, "args": { "External id": 424, "cbid": 211, "correlation": 424 } }, { "ph": "s", "id": 424, "pid": 76337, "tid": -914061504, "ts": 1716454216445898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216447843, "dur": 6, "args": { "External id": 426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 426, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 426, "pid": 5, "tid": 7, "ts": 1716454216447843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445914, "dur": 5, "args": { "External id": 426, "cbid": 211, "correlation": 426 } }, { "ph": "s", "id": 426, "pid": 76337, "tid": -914061504, "ts": 1716454216445914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216447850, "dur": 5, "args": { "External id": 428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 428, "pid": 5, "tid": 7, "ts": 1716454216447850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445923, "dur": 5, "args": { "External id": 428, "cbid": 211, "correlation": 428 } }, { "ph": "s", "id": 428, "pid": 76337, "tid": -914061504, "ts": 1716454216445923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216445931, "dur": 0, "args": { "External id": 429, "cbid": 51, "correlation": 429 } }, { "ph": "s", "id": 429, "pid": 76337, "tid": -914061504, "ts": 1716454216445931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216447857, "dur": 818, "args": { "External id": 430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 430, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 430, "pid": 5, "tid": 7, "ts": 1716454216447857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445932, "dur": 5, "args": { "External id": 430, "cbid": 211, "correlation": 430 } }, { "ph": "s", "id": 430, "pid": 76337, "tid": -914061504, "ts": 1716454216445932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216448676, "dur": 177, "args": { "External id": 435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 435, "pid": 5, "tid": 7, "ts": 1716454216448676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216445960, "dur": 9, "args": { "External id": 435, "cbid": 211, "correlation": 435 } }, { "ph": "s", "id": 435, "pid": 76337, "tid": -914061504, "ts": 1716454216445960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216448854, "dur": 751, "args": { "External id": 443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 443, "pid": 5, "tid": 7, "ts": 1716454216448854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216446008, "dur": 10, "args": { "External id": 443, "cbid": 211, "correlation": 443 } }, { "ph": "s", "id": 443, "pid": 76337, "tid": -914061504, "ts": 1716454216446008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216449606, "dur": 103, "args": { "External id": 451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 451, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 451, "pid": 5, "tid": 7, "ts": 1716454216449606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216446053, "dur": 10, "args": { "External id": 451, "cbid": 211, "correlation": 451 } }, { "ph": "s", "id": 451, "pid": 76337, "tid": -914061504, "ts": 1716454216446053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216449716, "dur": 88, "args": { "External id": 461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 617.2125, "warps per SM": 2468.85, "grid": [49377, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 461, "pid": 5, "tid": 7, "ts": 1716454216449716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216446305, "dur": 64, "args": { "External id": 461, "cbid": 211, "correlation": 461 } }, { "ph": "s", "id": 461, "pid": 76337, "tid": -914061504, "ts": 1716454216446305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216449806, "dur": 166, "args": { "External id": 466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 466, "pid": 5, "tid": 7, "ts": 1716454216449806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216446407, "dur": 10, "args": { "External id": 466, "cbid": 211, "correlation": 466 } }, { "ph": "s", "id": 466, "pid": 76337, "tid": -914061504, "ts": 1716454216446407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216449973, "dur": 6, "args": { "External id": 481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.6, "warps per SM": 14.4, "grid": [288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 22 } }, { "ph": "f", "id": 481, "pid": 5, "tid": 7, "ts": 1716454216449973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216446522, "dur": 16, "args": { "External id": 481, "cbid": 211, "correlation": 481 } }, { "ph": "s", "id": 481, "pid": 76337, "tid": -914061504, "ts": 1716454216446522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216446547, "dur": 0, "args": { "External id": 488, "cbid": 317, "correlation": 488 } }, { "ph": "f", "id": 488, "pid": 76337, "tid": -914061504, "ts": 1716454216446547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216446548, "dur": 0, "args": { "External id": 489, "cbid": 203, "correlation": 489 } }, { "ph": "f", "id": 489, "pid": 76337, "tid": -914061504, "ts": 1716454216446548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216446549, "dur": 0, "args": { "External id": 490, "cbid": 205, "correlation": 490 } }, { "ph": "f", "id": 490, "pid": 76337, "tid": -914061504, "ts": 1716454216446549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216450020, "dur": 255, "args": { "External id": 494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 494, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [1, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 494, "pid": 5, "tid": 7, "ts": 1716454216450020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216449771, "dur": 251, "args": { "External id": 494, "cbid": 211, "correlation": 494 } }, { "ph": "s", "id": 494, "pid": 76337, "tid": -914061504, "ts": 1716454216449771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216450803, "dur": 50, "args": { "External id": 500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 500, "pid": 5, "tid": 7, "ts": 1716454216450803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216450788, "dur": 14, "args": { "External id": 500, "cbid": 211, "correlation": 500 } }, { "ph": "s", "id": 500, "pid": 76337, "tid": -914061504, "ts": 1716454216450788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216451030, "dur": 144, "args": { "External id": 510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 510, "pid": 5, "tid": 7, "ts": 1716454216451030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216451014, "dur": 16, "args": { "External id": 510, "cbid": 211, "correlation": 510 } }, { "ph": "s", "id": 510, "pid": 76337, "tid": -914061504, "ts": 1716454216451014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216451176, "dur": 179, "args": { "External id": 531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 531, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 531, "pid": 5, "tid": 7, "ts": 1716454216451176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216451075, "dur": 11, "args": { "External id": 531, "cbid": 211, "correlation": 531 } }, { "ph": "s", "id": 531, "pid": 76337, "tid": -914061504, "ts": 1716454216451075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216451356, "dur": 5, "args": { "External id": 543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 543, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 543, "pid": 5, "tid": 7, "ts": 1716454216451356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216451097, "dur": 7, "args": { "External id": 543, "cbid": 211, "correlation": 543 } }, { "ph": "s", "id": 543, "pid": 76337, "tid": -914061504, "ts": 1716454216451097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216451362, "dur": 49, "args": { "External id": 546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 546, "pid": 5, "tid": 7, "ts": 1716454216451362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216451128, "dur": 8, "args": { "External id": 546, "cbid": 211, "correlation": 546 } }, { "ph": "s", "id": 546, "pid": 76337, "tid": -914061504, "ts": 1716454216451128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216451413, "dur": 31, "args": { "External id": 555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 555, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 555, "pid": 5, "tid": 7, "ts": 1716454216451413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216451188, "dur": 11, "args": { "External id": 555, "cbid": 211, "correlation": 555 } }, { "ph": "s", "id": 555, "pid": 76337, "tid": -914061504, "ts": 1716454216451188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216451287, "dur": 0, "args": { "External id": 565, "cbid": 317, "correlation": 565 } }, { "ph": "f", "id": 565, "pid": 76337, "tid": -914061504, "ts": 1716454216451287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216451288, "dur": 0, "args": { "External id": 566, "cbid": 203, "correlation": 566 } }, { "ph": "f", "id": 566, "pid": 76337, "tid": -914061504, "ts": 1716454216451288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216451290, "dur": 0, "args": { "External id": 567, "cbid": 205, "correlation": 567 } }, { "ph": "f", "id": 567, "pid": 76337, "tid": -914061504, "ts": 1716454216451290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216454315, "dur": 38, "args": { "External id": 571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 571, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [1536, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 571, "pid": 5, "tid": 7, "ts": 1716454216454315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216454288, "dur": 28, "args": { "External id": 571, "cbid": 211, "correlation": 571 } }, { "ph": "s", "id": 571, "pid": 76337, "tid": -914061504, "ts": 1716454216454288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216454354, "dur": 8, "args": { "External id": 573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 573, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12.8, "warps per SM": 102.4, "grid": [1, 4, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 573, "pid": 5, "tid": 7, "ts": 1716454216454354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216454319, "dur": 6, "args": { "External id": 573, "cbid": 211, "correlation": 573 } }, { "ph": "s", "id": 573, "pid": 76337, "tid": -914061504, "ts": 1716454216454319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216454364, "dur": 4, "args": { "External id": 575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 575, "pid": 5, "tid": 7, "ts": 1716454216454364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216454332, "dur": 9, "args": { "External id": 575, "cbid": 211, "correlation": 575 } }, { "ph": "s", "id": 575, "pid": 76337, "tid": -914061504, "ts": 1716454216454332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216454345, "dur": 0, "args": { "External id": 576, "cbid": 51, "correlation": 576 } }, { "ph": "s", "id": 576, "pid": 76337, "tid": -914061504, "ts": 1716454216454345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216454482, "dur": 428, "args": { "External id": 577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 577, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [192, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 577, "pid": 5, "tid": 7, "ts": 1716454216454482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216454369, "dur": 113, "args": { "External id": 577, "cbid": 211, "correlation": 577 } }, { "ph": "s", "id": 577, "pid": 76337, "tid": -914061504, "ts": 1716454216454369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216455262, "dur": 92, "args": { "External id": 582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 582, "pid": 5, "tid": 7, "ts": 1716454216455262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216455247, "dur": 14, "args": { "External id": 582, "cbid": 211, "correlation": 582 } }, { "ph": "s", "id": 582, "pid": 76337, "tid": -914061504, "ts": 1716454216455247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216455405, "dur": 362, "args": { "External id": 602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 602, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 602, "pid": 5, "tid": 7, "ts": 1716454216455405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216455391, "dur": 14, "args": { "External id": 602, "cbid": 211, "correlation": 602 } }, { "ph": "s", "id": 602, "pid": 76337, "tid": -914061504, "ts": 1716454216455391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216455769, "dur": 5, "args": { "External id": 614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 614, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 614, "pid": 5, "tid": 7, "ts": 1716454216455769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216455425, "dur": 10, "args": { "External id": 614, "cbid": 211, "correlation": 614 } }, { "ph": "s", "id": 614, "pid": 76337, "tid": -914061504, "ts": 1716454216455425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216455775, "dur": 92, "args": { "External id": 617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 617, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 617, "pid": 5, "tid": 7, "ts": 1716454216455775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216455455, "dur": 8, "args": { "External id": 617, "cbid": 211, "correlation": 617 } }, { "ph": "s", "id": 617, "pid": 76337, "tid": -914061504, "ts": 1716454216455455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216455868, "dur": 58, "args": { "External id": 626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 626, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 626, "pid": 5, "tid": 7, "ts": 1716454216455868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216455524, "dur": 12, "args": { "External id": 626, "cbid": 211, "correlation": 626 } }, { "ph": "s", "id": 626, "pid": 76337, "tid": -914061504, "ts": 1716454216455524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216455640, "dur": 0, "args": { "External id": 636, "cbid": 317, "correlation": 636 } }, { "ph": "f", "id": 636, "pid": 76337, "tid": -914061504, "ts": 1716454216455640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216455641, "dur": 1, "args": { "External id": 637, "cbid": 203, "correlation": 637 } }, { "ph": "f", "id": 637, "pid": 76337, "tid": -914061504, "ts": 1716454216455641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216455642, "dur": 1, "args": { "External id": 638, "cbid": 205, "correlation": 638 } }, { "ph": "f", "id": 638, "pid": 76337, "tid": -914061504, "ts": 1716454216455642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216458605, "dur": 104, "args": { "External id": 642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 642, "pid": 5, "tid": 7, "ts": 1716454216458605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216458579, "dur": 27, "args": { "External id": 642, "cbid": 211, "correlation": 642 } }, { "ph": "s", "id": 642, "pid": 76337, "tid": -914061504, "ts": 1716454216458579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216458711, "dur": 12, "args": { "External id": 644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 644, "pid": 5, "tid": 7, "ts": 1716454216458711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216458609, "dur": 6, "args": { "External id": 644, "cbid": 211, "correlation": 644 } }, { "ph": "s", "id": 644, "pid": 76337, "tid": -914061504, "ts": 1716454216458609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216458724, "dur": 4, "args": { "External id": 646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 646, "pid": 5, "tid": 7, "ts": 1716454216458724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216458622, "dur": 9, "args": { "External id": 646, "cbid": 211, "correlation": 646 } }, { "ph": "s", "id": 646, "pid": 76337, "tid": -914061504, "ts": 1716454216458622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216458635, "dur": 0, "args": { "External id": 647, "cbid": 51, "correlation": 647 } }, { "ph": "s", "id": 647, "pid": 76337, "tid": -914061504, "ts": 1716454216458635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216458729, "dur": 775, "args": { "External id": 648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 648, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 648, "pid": 5, "tid": 7, "ts": 1716454216458729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216458636, "dur": 7, "args": { "External id": 648, "cbid": 211, "correlation": 648 } }, { "ph": "s", "id": 648, "pid": 76337, "tid": -914061504, "ts": 1716454216458636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216459506, "dur": 91, "args": { "External id": 653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 653, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 653, "pid": 5, "tid": 7, "ts": 1716454216459506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216459386, "dur": 14, "args": { "External id": 653, "cbid": 211, "correlation": 653 } }, { "ph": "s", "id": 653, "pid": 76337, "tid": -914061504, "ts": 1716454216459386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216459489, "dur": 0, "args": { "External id": 663, "cbid": 317, "correlation": 663 } }, { "ph": "f", "id": 663, "pid": 76337, "tid": -914061504, "ts": 1716454216459489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216459490, "dur": 1, "args": { "External id": 664, "cbid": 203, "correlation": 664 } }, { "ph": "f", "id": 664, "pid": 76337, "tid": -914061504, "ts": 1716454216459490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216459491, "dur": 0, "args": { "External id": 665, "cbid": 205, "correlation": 665 } }, { "ph": "f", "id": 665, "pid": 76337, "tid": -914061504, "ts": 1716454216459491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216461618, "dur": 147, "args": { "External id": 669, "cbid": 273, "correlation": 669 } }, { "ph": "f", "id": 669, "pid": 76337, "tid": -914061504, "ts": 1716454216461618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216461767, "dur": 71, "args": { "External id": 670, "cbid": 273, "correlation": 670 } }, { "ph": "f", "id": 670, "pid": 76337, "tid": -914061504, "ts": 1716454216461767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216461838, "dur": 72, "args": { "External id": 671, "cbid": 273, "correlation": 671 } }, { "ph": "f", "id": 671, "pid": 76337, "tid": -914061504, "ts": 1716454216461838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216461912, "dur": 50, "args": { "External id": 672, "cbid": 273, "correlation": 672 } }, { "ph": "f", "id": 672, "pid": 76337, "tid": -914061504, "ts": 1716454216461912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216461962, "dur": 69, "args": { "External id": 673, "cbid": 273, "correlation": 673 } }, { "ph": "f", "id": 673, "pid": 76337, "tid": -914061504, "ts": 1716454216461962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462033, "dur": 48, "args": { "External id": 674, "cbid": 273, "correlation": 674 } }, { "ph": "f", "id": 674, "pid": 76337, "tid": -914061504, "ts": 1716454216462033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462082, "dur": 55, "args": { "External id": 675, "cbid": 273, "correlation": 675 } }, { "ph": "f", "id": 675, "pid": 76337, "tid": -914061504, "ts": 1716454216462082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462138, "dur": 43, "args": { "External id": 676, "cbid": 273, "correlation": 676 } }, { "ph": "f", "id": 676, "pid": 76337, "tid": -914061504, "ts": 1716454216462138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462182, "dur": 59, "args": { "External id": 677, "cbid": 273, "correlation": 677 } }, { "ph": "f", "id": 677, "pid": 76337, "tid": -914061504, "ts": 1716454216462182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462242, "dur": 50, "args": { "External id": 678, "cbid": 273, "correlation": 678 } }, { "ph": "f", "id": 678, "pid": 76337, "tid": -914061504, "ts": 1716454216462242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462292, "dur": 49, "args": { "External id": 679, "cbid": 273, "correlation": 679 } }, { "ph": "f", "id": 679, "pid": 76337, "tid": -914061504, "ts": 1716454216462292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462343, "dur": 43, "args": { "External id": 680, "cbid": 273, "correlation": 680 } }, { "ph": "f", "id": 680, "pid": 76337, "tid": -914061504, "ts": 1716454216462343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462386, "dur": 42, "args": { "External id": 681, "cbid": 273, "correlation": 681 } }, { "ph": "f", "id": 681, "pid": 76337, "tid": -914061504, "ts": 1716454216462386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462429, "dur": 52, "args": { "External id": 682, "cbid": 273, "correlation": 682 } }, { "ph": "f", "id": 682, "pid": 76337, "tid": -914061504, "ts": 1716454216462429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462482, "dur": 62, "args": { "External id": 683, "cbid": 273, "correlation": 683 } }, { "ph": "f", "id": 683, "pid": 76337, "tid": -914061504, "ts": 1716454216462482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462546, "dur": 53, "args": { "External id": 684, "cbid": 273, "correlation": 684 } }, { "ph": "f", "id": 684, "pid": 76337, "tid": -914061504, "ts": 1716454216462546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462600, "dur": 58, "args": { "External id": 685, "cbid": 273, "correlation": 685 } }, { "ph": "f", "id": 685, "pid": 76337, "tid": -914061504, "ts": 1716454216462600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462659, "dur": 44, "args": { "External id": 686, "cbid": 273, "correlation": 686 } }, { "ph": "f", "id": 686, "pid": 76337, "tid": -914061504, "ts": 1716454216462659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462703, "dur": 41, "args": { "External id": 687, "cbid": 273, "correlation": 687 } }, { "ph": "f", "id": 687, "pid": 76337, "tid": -914061504, "ts": 1716454216462703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462746, "dur": 46, "args": { "External id": 688, "cbid": 273, "correlation": 688 } }, { "ph": "f", "id": 688, "pid": 76337, "tid": -914061504, "ts": 1716454216462746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462792, "dur": 44, "args": { "External id": 689, "cbid": 273, "correlation": 689 } }, { "ph": "f", "id": 689, "pid": 76337, "tid": -914061504, "ts": 1716454216462792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462837, "dur": 51, "args": { "External id": 690, "cbid": 273, "correlation": 690 } }, { "ph": "f", "id": 690, "pid": 76337, "tid": -914061504, "ts": 1716454216462837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462890, "dur": 48, "args": { "External id": 691, "cbid": 273, "correlation": 691 } }, { "ph": "f", "id": 691, "pid": 76337, "tid": -914061504, "ts": 1716454216462890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462939, "dur": 53, "args": { "External id": 692, "cbid": 273, "correlation": 692 } }, { "ph": "f", "id": 692, "pid": 76337, "tid": -914061504, "ts": 1716454216462939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216462994, "dur": 57, "args": { "External id": 693, "cbid": 273, "correlation": 693 } }, { "ph": "f", "id": 693, "pid": 76337, "tid": -914061504, "ts": 1716454216462994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216463053, "dur": 44, "args": { "External id": 694, "cbid": 273, "correlation": 694 } }, { "ph": "f", "id": 694, "pid": 76337, "tid": -914061504, "ts": 1716454216463053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216463098, "dur": 60, "args": { "External id": 695, "cbid": 273, "correlation": 695 } }, { "ph": "f", "id": 695, "pid": 76337, "tid": -914061504, "ts": 1716454216463098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216463158, "dur": 43, "args": { "External id": 696, "cbid": 273, "correlation": 696 } }, { "ph": "f", "id": 696, "pid": 76337, "tid": -914061504, "ts": 1716454216463158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216463202, "dur": 10711, "args": { "External id": 697, "cbid": 273, "correlation": 697 } }, { "ph": "f", "id": 697, "pid": 76337, "tid": -914061504, "ts": 1716454216463202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216473915, "dur": 52, "args": { "External id": 698, "cbid": 273, "correlation": 698 } }, { "ph": "f", "id": 698, "pid": 76337, "tid": -914061504, "ts": 1716454216473915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216473968, "dur": 69, "args": { "External id": 699, "cbid": 273, "correlation": 699 } }, { "ph": "f", "id": 699, "pid": 76337, "tid": -914061504, "ts": 1716454216473968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474038, "dur": 52, "args": { "External id": 700, "cbid": 273, "correlation": 700 } }, { "ph": "f", "id": 700, "pid": 76337, "tid": -914061504, "ts": 1716454216474038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474091, "dur": 43, "args": { "External id": 701, "cbid": 273, "correlation": 701 } }, { "ph": "f", "id": 701, "pid": 76337, "tid": -914061504, "ts": 1716454216474091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474134, "dur": 44, "args": { "External id": 702, "cbid": 273, "correlation": 702 } }, { "ph": "f", "id": 702, "pid": 76337, "tid": -914061504, "ts": 1716454216474134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474179, "dur": 45, "args": { "External id": 703, "cbid": 273, "correlation": 703 } }, { "ph": "f", "id": 703, "pid": 76337, "tid": -914061504, "ts": 1716454216474179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474225, "dur": 43, "args": { "External id": 704, "cbid": 273, "correlation": 704 } }, { "ph": "f", "id": 704, "pid": 76337, "tid": -914061504, "ts": 1716454216474225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474270, "dur": 45, "args": { "External id": 705, "cbid": 273, "correlation": 705 } }, { "ph": "f", "id": 705, "pid": 76337, "tid": -914061504, "ts": 1716454216474270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474316, "dur": 43, "args": { "External id": 706, "cbid": 273, "correlation": 706 } }, { "ph": "f", "id": 706, "pid": 76337, "tid": -914061504, "ts": 1716454216474316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474360, "dur": 46, "args": { "External id": 707, "cbid": 273, "correlation": 707 } }, { "ph": "f", "id": 707, "pid": 76337, "tid": -914061504, "ts": 1716454216474360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474407, "dur": 45, "args": { "External id": 708, "cbid": 273, "correlation": 708 } }, { "ph": "f", "id": 708, "pid": 76337, "tid": -914061504, "ts": 1716454216474407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474454, "dur": 47, "args": { "External id": 709, "cbid": 273, "correlation": 709 } }, { "ph": "f", "id": 709, "pid": 76337, "tid": -914061504, "ts": 1716454216474454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474501, "dur": 41, "args": { "External id": 710, "cbid": 273, "correlation": 710 } }, { "ph": "f", "id": 710, "pid": 76337, "tid": -914061504, "ts": 1716454216474501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474543, "dur": 43, "args": { "External id": 711, "cbid": 273, "correlation": 711 } }, { "ph": "f", "id": 711, "pid": 76337, "tid": -914061504, "ts": 1716454216474543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474587, "dur": 45, "args": { "External id": 712, "cbid": 273, "correlation": 712 } }, { "ph": "f", "id": 712, "pid": 76337, "tid": -914061504, "ts": 1716454216474587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474633, "dur": 49, "args": { "External id": 713, "cbid": 273, "correlation": 713 } }, { "ph": "f", "id": 713, "pid": 76337, "tid": -914061504, "ts": 1716454216474633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474683, "dur": 46, "args": { "External id": 714, "cbid": 273, "correlation": 714 } }, { "ph": "f", "id": 714, "pid": 76337, "tid": -914061504, "ts": 1716454216474683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474730, "dur": 48, "args": { "External id": 715, "cbid": 273, "correlation": 715 } }, { "ph": "f", "id": 715, "pid": 76337, "tid": -914061504, "ts": 1716454216474730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216474779, "dur": 47, "args": { "External id": 716, "cbid": 273, "correlation": 716 } }, { "ph": "f", "id": 716, "pid": 76337, "tid": -914061504, "ts": 1716454216474779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216474958, "dur": 72, "args": { "External id": 717, "cbid": 251, "correlation": 717 } }, { "ph": "f", "id": 717, "pid": 76337, "tid": -914061504, "ts": 1716454216474958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216475092, "dur": 80, "args": { "External id": 718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 718, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [2, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 718, "pid": 5, "tid": 7, "ts": 1716454216475092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216475052, "dur": 43, "args": { "External id": 718, "cbid": 211, "correlation": 718 } }, { "ph": "s", "id": 718, "pid": 76337, "tid": -914061504, "ts": 1716454216475052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216475823, "dur": 92, "args": { "External id": 724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 724, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 724, "pid": 5, "tid": 7, "ts": 1716454216475823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216475806, "dur": 15, "args": { "External id": 724, "cbid": 211, "correlation": 724 } }, { "ph": "s", "id": 724, "pid": 76337, "tid": -914061504, "ts": 1716454216475806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216475923, "dur": 253, "args": { "External id": 732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 732, "pid": 5, "tid": 7, "ts": 1716454216475923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216475910, "dur": 11, "args": { "External id": 732, "cbid": 211, "correlation": 732 } }, { "ph": "s", "id": 732, "pid": 76337, "tid": -914061504, "ts": 1716454216475910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216476178, "dur": 53, "args": { "External id": 740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 740, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 740, "pid": 5, "tid": 7, "ts": 1716454216476178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216475988, "dur": 12, "args": { "External id": 740, "cbid": 211, "correlation": 740 } }, { "ph": "s", "id": 740, "pid": 76337, "tid": -914061504, "ts": 1716454216475988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216476232, "dur": 371, "args": { "External id": 750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 750, "pid": 5, "tid": 7, "ts": 1716454216476232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476201, "dur": 19, "args": { "External id": 750, "cbid": 211, "correlation": 750 } }, { "ph": "s", "id": 750, "pid": 76337, "tid": -914061504, "ts": 1716454216476201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216476605, "dur": 375, "args": { "External id": 771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 771, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 771, "pid": 5, "tid": 7, "ts": 1716454216476605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476257, "dur": 10, "args": { "External id": 771, "cbid": 211, "correlation": 771 } }, { "ph": "s", "id": 771, "pid": 76337, "tid": -914061504, "ts": 1716454216476257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216476981, "dur": 5, "args": { "External id": 783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 783, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 783, "pid": 5, "tid": 7, "ts": 1716454216476981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476281, "dur": 7, "args": { "External id": 783, "cbid": 211, "correlation": 783 } }, { "ph": "s", "id": 783, "pid": 76337, "tid": -914061504, "ts": 1716454216476281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216476987, "dur": 91, "args": { "External id": 786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 786, "pid": 5, "tid": 7, "ts": 1716454216476987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476304, "dur": 7, "args": { "External id": 786, "cbid": 211, "correlation": 786 } }, { "ph": "s", "id": 786, "pid": 76337, "tid": -914061504, "ts": 1716454216476304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216477080, "dur": 58, "args": { "External id": 795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 795, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 795, "pid": 5, "tid": 7, "ts": 1716454216477080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476365, "dur": 12, "args": { "External id": 795, "cbid": 211, "correlation": 795 } }, { "ph": "s", "id": 795, "pid": 76337, "tid": -914061504, "ts": 1716454216476365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216476460, "dur": 1, "args": { "External id": 805, "cbid": 317, "correlation": 805 } }, { "ph": "f", "id": 805, "pid": 76337, "tid": -914061504, "ts": 1716454216476460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216476462, "dur": 1, "args": { "External id": 806, "cbid": 203, "correlation": 806 } }, { "ph": "f", "id": 806, "pid": 76337, "tid": -914061504, "ts": 1716454216476462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216476464, "dur": 1, "args": { "External id": 807, "cbid": 205, "correlation": 807 } }, { "ph": "f", "id": 807, "pid": 76337, "tid": -914061504, "ts": 1716454216476464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216477139, "dur": 100, "args": { "External id": 811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 811, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 811, "pid": 5, "tid": 7, "ts": 1716454216477139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476497, "dur": 14, "args": { "External id": 811, "cbid": 211, "correlation": 811 } }, { "ph": "s", "id": 811, "pid": 76337, "tid": -914061504, "ts": 1716454216476497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216477241, "dur": 12, "args": { "External id": 813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 813, "pid": 5, "tid": 7, "ts": 1716454216477241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476514, "dur": 5, "args": { "External id": 813, "cbid": 211, "correlation": 813 } }, { "ph": "s", "id": 813, "pid": 76337, "tid": -914061504, "ts": 1716454216476514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216477254, "dur": 4, "args": { "External id": 815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 815, "pid": 5, "tid": 7, "ts": 1716454216477254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476530, "dur": 9, "args": { "External id": 815, "cbid": 211, "correlation": 815 } }, { "ph": "s", "id": 815, "pid": 76337, "tid": -914061504, "ts": 1716454216476530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216476546, "dur": 0, "args": { "External id": 816, "cbid": 51, "correlation": 816 } }, { "ph": "s", "id": 816, "pid": 76337, "tid": -914061504, "ts": 1716454216476546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216477260, "dur": 777, "args": { "External id": 817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 817, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 817, "pid": 5, "tid": 7, "ts": 1716454216477260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476547, "dur": 8, "args": { "External id": 817, "cbid": 211, "correlation": 817 } }, { "ph": "s", "id": 817, "pid": 76337, "tid": -914061504, "ts": 1716454216476547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216478038, "dur": 91, "args": { "External id": 822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 822, "pid": 5, "tid": 7, "ts": 1716454216478038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476579, "dur": 9, "args": { "External id": 822, "cbid": 211, "correlation": 822 } }, { "ph": "s", "id": 822, "pid": 76337, "tid": -914061504, "ts": 1716454216476579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216478131, "dur": 364, "args": { "External id": 842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 842, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 842, "pid": 5, "tid": 7, "ts": 1716454216478131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476666, "dur": 13, "args": { "External id": 842, "cbid": 211, "correlation": 842 } }, { "ph": "s", "id": 842, "pid": 76337, "tid": -914061504, "ts": 1716454216476666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216478497, "dur": 4, "args": { "External id": 854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 854, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 854, "pid": 5, "tid": 7, "ts": 1716454216478497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476689, "dur": 6, "args": { "External id": 854, "cbid": 211, "correlation": 854 } }, { "ph": "s", "id": 854, "pid": 76337, "tid": -914061504, "ts": 1716454216476689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216478502, "dur": 92, "args": { "External id": 857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 857, "pid": 5, "tid": 7, "ts": 1716454216478502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476707, "dur": 7, "args": { "External id": 857, "cbid": 211, "correlation": 857 } }, { "ph": "s", "id": 857, "pid": 76337, "tid": -914061504, "ts": 1716454216476707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216478595, "dur": 57, "args": { "External id": 866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 866, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 866, "pid": 5, "tid": 7, "ts": 1716454216478595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476748, "dur": 10, "args": { "External id": 866, "cbid": 211, "correlation": 866 } }, { "ph": "s", "id": 866, "pid": 76337, "tid": -914061504, "ts": 1716454216476748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216476837, "dur": 0, "args": { "External id": 876, "cbid": 317, "correlation": 876 } }, { "ph": "f", "id": 876, "pid": 76337, "tid": -914061504, "ts": 1716454216476837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216476837, "dur": 0, "args": { "External id": 877, "cbid": 203, "correlation": 877 } }, { "ph": "f", "id": 877, "pid": 76337, "tid": -914061504, "ts": 1716454216476837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216476838, "dur": 0, "args": { "External id": 878, "cbid": 205, "correlation": 878 } }, { "ph": "f", "id": 878, "pid": 76337, "tid": -914061504, "ts": 1716454216476838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216478654, "dur": 103, "args": { "External id": 882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 882, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 882, "pid": 5, "tid": 7, "ts": 1716454216478654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476852, "dur": 13, "args": { "External id": 882, "cbid": 211, "correlation": 882 } }, { "ph": "s", "id": 882, "pid": 76337, "tid": -914061504, "ts": 1716454216476852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216478758, "dur": 12, "args": { "External id": 884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 884, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 884, "pid": 5, "tid": 7, "ts": 1716454216478758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476867, "dur": 5, "args": { "External id": 884, "cbid": 211, "correlation": 884 } }, { "ph": "s", "id": 884, "pid": 76337, "tid": -914061504, "ts": 1716454216476867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216478772, "dur": 4, "args": { "External id": 886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 886, "pid": 5, "tid": 7, "ts": 1716454216478772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476876, "dur": 5, "args": { "External id": 886, "cbid": 211, "correlation": 886 } }, { "ph": "s", "id": 886, "pid": 76337, "tid": -914061504, "ts": 1716454216476876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216476885, "dur": 0, "args": { "External id": 887, "cbid": 51, "correlation": 887 } }, { "ph": "s", "id": 887, "pid": 76337, "tid": -914061504, "ts": 1716454216476885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216478777, "dur": 771, "args": { "External id": 888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 888, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 888, "pid": 5, "tid": 7, "ts": 1716454216478777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476886, "dur": 6, "args": { "External id": 888, "cbid": 211, "correlation": 888 } }, { "ph": "s", "id": 888, "pid": 76337, "tid": -914061504, "ts": 1716454216476886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216479550, "dur": 91, "args": { "External id": 893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 893, "pid": 5, "tid": 7, "ts": 1716454216479550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476914, "dur": 9, "args": { "External id": 893, "cbid": 211, "correlation": 893 } }, { "ph": "s", "id": 893, "pid": 76337, "tid": -914061504, "ts": 1716454216476914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216479642, "dur": 255, "args": { "External id": 901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 901, "pid": 5, "tid": 7, "ts": 1716454216479642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476950, "dur": 9, "args": { "External id": 901, "cbid": 211, "correlation": 901 } }, { "ph": "s", "id": 901, "pid": 76337, "tid": -914061504, "ts": 1716454216476950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216479899, "dur": 54, "args": { "External id": 909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 909, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 909, "pid": 5, "tid": 7, "ts": 1716454216479899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216476987, "dur": 9, "args": { "External id": 909, "cbid": 211, "correlation": 909 } }, { "ph": "s", "id": 909, "pid": 76337, "tid": -914061504, "ts": 1716454216476987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216479954, "dur": 46, "args": { "External id": 919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 310.0125, "warps per SM": 1240.05, "grid": [24801, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 919, "pid": 5, "tid": 7, "ts": 1716454216479954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216477099, "dur": 15, "args": { "External id": 919, "cbid": 211, "correlation": 919 } }, { "ph": "s", "id": 919, "pid": 76337, "tid": -914061504, "ts": 1716454216477099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216480001, "dur": 87, "args": { "External id": 924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 924, "pid": 5, "tid": 7, "ts": 1716454216480001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216477149, "dur": 8, "args": { "External id": 924, "cbid": 211, "correlation": 924 } }, { "ph": "s", "id": 924, "pid": 76337, "tid": -914061504, "ts": 1716454216477149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216480090, "dur": 10, "args": { "External id": 939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 939, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 939, "pid": 5, "tid": 7, "ts": 1716454216480090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216477218, "dur": 13, "args": { "External id": 939, "cbid": 211, "correlation": 939 } }, { "ph": "s", "id": 939, "pid": 76337, "tid": -914061504, "ts": 1716454216477218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216477239, "dur": 0, "args": { "External id": 946, "cbid": 317, "correlation": 946 } }, { "ph": "f", "id": 946, "pid": 76337, "tid": -914061504, "ts": 1716454216477239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216477240, "dur": 0, "args": { "External id": 947, "cbid": 203, "correlation": 947 } }, { "ph": "f", "id": 947, "pid": 76337, "tid": -914061504, "ts": 1716454216477240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216477240, "dur": 0, "args": { "External id": 948, "cbid": 205, "correlation": 948 } }, { "ph": "f", "id": 948, "pid": 76337, "tid": -914061504, "ts": 1716454216477240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216480485, "dur": 305, "args": { "External id": 952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 952, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [1, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 952, "pid": 5, "tid": 7, "ts": 1716454216480485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216480181, "dur": 308, "args": { "External id": 952, "cbid": 211, "correlation": 952 } }, { "ph": "s", "id": 952, "pid": 76337, "tid": -914061504, "ts": 1716454216480181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216481275, "dur": 26, "args": { "External id": 958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 958, "pid": 5, "tid": 7, "ts": 1716454216481275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481260, "dur": 13, "args": { "External id": 958, "cbid": 211, "correlation": 958 } }, { "ph": "s", "id": 958, "pid": 76337, "tid": -914061504, "ts": 1716454216481260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216481457, "dur": 69, "args": { "External id": 968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 968, "pid": 5, "tid": 7, "ts": 1716454216481457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481441, "dur": 16, "args": { "External id": 968, "cbid": 211, "correlation": 968 } }, { "ph": "s", "id": 968, "pid": 76337, "tid": -914061504, "ts": 1716454216481441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216481527, "dur": 95, "args": { "External id": 989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 989, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 989, "pid": 5, "tid": 7, "ts": 1716454216481527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481484, "dur": 9, "args": { "External id": 989, "cbid": 211, "correlation": 989 } }, { "ph": "s", "id": 989, "pid": 76337, "tid": -914061504, "ts": 1716454216481484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216481624, "dur": 5, "args": { "External id": 1001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1001, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1001, "pid": 5, "tid": 7, "ts": 1716454216481624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481504, "dur": 8, "args": { "External id": 1001, "cbid": 211, "correlation": 1001 } }, { "ph": "s", "id": 1001, "pid": 76337, "tid": -914061504, "ts": 1716454216481504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216481631, "dur": 26, "args": { "External id": 1004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1004, "pid": 5, "tid": 7, "ts": 1716454216481631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481528, "dur": 7, "args": { "External id": 1004, "cbid": 211, "correlation": 1004 } }, { "ph": "s", "id": 1004, "pid": 76337, "tid": -914061504, "ts": 1716454216481528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216481659, "dur": 18, "args": { "External id": 1013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1013, "registers per thread": 24, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1013, "pid": 5, "tid": 7, "ts": 1716454216481659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216481580, "dur": 10, "args": { "External id": 1013, "cbid": 211, "correlation": 1013 } }, { "ph": "s", "id": 1013, "pid": 76337, "tid": -914061504, "ts": 1716454216481580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216481675, "dur": 0, "args": { "External id": 1023, "cbid": 317, "correlation": 1023 } }, { "ph": "f", "id": 1023, "pid": 76337, "tid": -914061504, "ts": 1716454216481675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216481676, "dur": 1, "args": { "External id": 1024, "cbid": 203, "correlation": 1024 } }, { "ph": "f", "id": 1024, "pid": 76337, "tid": -914061504, "ts": 1716454216481676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216481678, "dur": 0, "args": { "External id": 1025, "cbid": 205, "correlation": 1025 } }, { "ph": "f", "id": 1025, "pid": 76337, "tid": -914061504, "ts": 1716454216481678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216484413, "dur": 20, "args": { "External id": 1029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1029, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 38.4, "warps per SM": 307.2, "grid": [384, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1029, "pid": 5, "tid": 7, "ts": 1716454216484413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216484389, "dur": 25, "args": { "External id": 1029, "cbid": 211, "correlation": 1029 } }, { "ph": "s", "id": 1029, "pid": 76337, "tid": -914061504, "ts": 1716454216484389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216484435, "dur": 19, "args": { "External id": 1031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1031, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 51.2, "warps per SM": 409.6, "grid": [1, 8, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1031, "pid": 5, "tid": 7, "ts": 1716454216484435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216484417, "dur": 6, "args": { "External id": 1031, "cbid": 211, "correlation": 1031 } }, { "ph": "s", "id": 1031, "pid": 76337, "tid": -914061504, "ts": 1716454216484417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216484461, "dur": 2, "args": { "External id": 1033, "device": 5, "context": 1, "stream": 7, "correlation": 1033, "bytes": 3072, "memory bandwidth (GB/s)": 1.4328358208955223 } }, { "ph": "f", "id": 1033, "pid": 5, "tid": 7, "ts": 1716454216484461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216484433, "dur": 25, "args": { "External id": 1033, "cbid": 51, "correlation": 1033 } }, { "ph": "s", "id": 1033, "pid": 76337, "tid": -914061504, "ts": 1716454216484433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216484739, "dur": 459, "args": { "External id": 1034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1034, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [4, 96, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1034, "pid": 5, "tid": 7, "ts": 1716454216484739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216484480, "dur": 259, "args": { "External id": 1034, "cbid": 211, "correlation": 1034 } }, { "ph": "s", "id": 1034, "pid": 76337, "tid": -914061504, "ts": 1716454216484480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216485206, "dur": 36, "args": { "External id": 1036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1036, "pid": 5, "tid": 7, "ts": 1716454216485206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216484756, "dur": 74, "args": { "External id": 1036, "cbid": 211, "correlation": 1036 } }, { "ph": "s", "id": 1036, "pid": 76337, "tid": -914061504, "ts": 1716454216484756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216485576, "dur": 49, "args": { "External id": 1042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1042, "pid": 5, "tid": 7, "ts": 1716454216485576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216485561, "dur": 14, "args": { "External id": 1042, "cbid": 211, "correlation": 1042 } }, { "ph": "s", "id": 1042, "pid": 76337, "tid": -914061504, "ts": 1716454216485561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216485721, "dur": 189, "args": { "External id": 1062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1062, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1062, "pid": 5, "tid": 7, "ts": 1716454216485721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216485707, "dur": 14, "args": { "External id": 1062, "cbid": 211, "correlation": 1062 } }, { "ph": "s", "id": 1062, "pid": 76337, "tid": -914061504, "ts": 1716454216485707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216485912, "dur": 5, "args": { "External id": 1074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1074, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1074, "pid": 5, "tid": 7, "ts": 1716454216485912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216485733, "dur": 7, "args": { "External id": 1074, "cbid": 211, "correlation": 1074 } }, { "ph": "s", "id": 1074, "pid": 76337, "tid": -914061504, "ts": 1716454216485733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216485918, "dur": 47, "args": { "External id": 1077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1077, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1077, "pid": 5, "tid": 7, "ts": 1716454216485918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216485759, "dur": 7, "args": { "External id": 1077, "cbid": 211, "correlation": 1077 } }, { "ph": "s", "id": 1077, "pid": 76337, "tid": -914061504, "ts": 1716454216485759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216485966, "dur": 32, "args": { "External id": 1086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1086, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1086, "pid": 5, "tid": 7, "ts": 1716454216485966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216485812, "dur": 11, "args": { "External id": 1086, "cbid": 211, "correlation": 1086 } }, { "ph": "s", "id": 1086, "pid": 76337, "tid": -914061504, "ts": 1716454216485812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216485910, "dur": 0, "args": { "External id": 1096, "cbid": 317, "correlation": 1096 } }, { "ph": "f", "id": 1096, "pid": 76337, "tid": -914061504, "ts": 1716454216485910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216485911, "dur": 0, "args": { "External id": 1097, "cbid": 203, "correlation": 1097 } }, { "ph": "f", "id": 1097, "pid": 76337, "tid": -914061504, "ts": 1716454216485911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216485912, "dur": 1, "args": { "External id": 1098, "cbid": 205, "correlation": 1098 } }, { "ph": "f", "id": 1098, "pid": 76337, "tid": -914061504, "ts": 1716454216485912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216488468, "dur": 3, "args": { "External id": 1104, "cbid": 317, "correlation": 1104 } }, { "ph": "f", "id": 1104, "pid": 76337, "tid": -914061504, "ts": 1716454216488468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216488473, "dur": 7294, "args": { "External id": 1105, "cbid": 20, "correlation": 1105 } }, { "ph": "f", "id": 1105, "pid": 76337, "tid": -914061504, "ts": 1716454216488473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216495826, "dur": 34, "args": { "External id": 1106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1106, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1106, "pid": 5, "tid": 7, "ts": 1716454216495826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216495799, "dur": 28, "args": { "External id": 1106, "cbid": 211, "correlation": 1106 } }, { "ph": "s", "id": 1106, "pid": 76337, "tid": -914061504, "ts": 1716454216495799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216495861, "dur": 33, "args": { "External id": 1108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1108, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1108, "pid": 5, "tid": 7, "ts": 1716454216495861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216495829, "dur": 7, "args": { "External id": 1108, "cbid": 211, "correlation": 1108 } }, { "ph": "s", "id": 1108, "pid": 76337, "tid": -914061504, "ts": 1716454216495829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216495897, "dur": 2, "args": { "External id": 1110, "device": 5, "context": 1, "stream": 7, "correlation": 1110, "bytes": 3072, "memory bandwidth (GB/s)": 1.4328358208955223 } }, { "ph": "f", "id": 1110, "pid": 5, "tid": 7, "ts": 1716454216495897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216495848, "dur": 12, "args": { "External id": 1110, "cbid": 51, "correlation": 1110 } }, { "ph": "s", "id": 1110, "pid": 76337, "tid": -914061504, "ts": 1716454216495848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216495901, "dur": 909, "args": { "External id": 1111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1111, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1111, "pid": 5, "tid": 7, "ts": 1716454216495901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216495862, "dur": 9, "args": { "External id": 1111, "cbid": 211, "correlation": 1111 } }, { "ph": "s", "id": 1111, "pid": 76337, "tid": -914061504, "ts": 1716454216495862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216496812, "dur": 40, "args": { "External id": 1113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1113, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1113, "pid": 5, "tid": 7, "ts": 1716454216496812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216495877, "dur": 8, "args": { "External id": 1113, "cbid": 211, "correlation": 1113 } }, { "ph": "s", "id": 1113, "pid": 76337, "tid": -914061504, "ts": 1716454216495877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216496853, "dur": 48, "args": { "External id": 1119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1119, "pid": 5, "tid": 7, "ts": 1716454216496853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216496624, "dur": 14, "args": { "External id": 1119, "cbid": 211, "correlation": 1119 } }, { "ph": "s", "id": 1119, "pid": 76337, "tid": -914061504, "ts": 1716454216496624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216496764, "dur": 0, "args": { "External id": 1129, "cbid": 317, "correlation": 1129 } }, { "ph": "f", "id": 1129, "pid": 76337, "tid": -914061504, "ts": 1716454216496764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216496765, "dur": 0, "args": { "External id": 1130, "cbid": 203, "correlation": 1130 } }, { "ph": "f", "id": 1130, "pid": 76337, "tid": -914061504, "ts": 1716454216496765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216496766, "dur": 0, "args": { "External id": 1131, "cbid": 205, "correlation": 1131 } }, { "ph": "f", "id": 1131, "pid": 76337, "tid": -914061504, "ts": 1716454216496766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216498615, "dur": 112, "args": { "External id": 1135, "cbid": 251, "correlation": 1135 } }, { "ph": "f", "id": 1135, "pid": 76337, "tid": -914061504, "ts": 1716454216498615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216498753, "dur": 62, "args": { "External id": 1136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1136, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [4, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1136, "pid": 5, "tid": 7, "ts": 1716454216498753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216498735, "dur": 27, "args": { "External id": 1136, "cbid": 211, "correlation": 1136 } }, { "ph": "s", "id": 1136, "pid": 76337, "tid": -914061504, "ts": 1716454216498735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216499360, "dur": 50, "args": { "External id": 1142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1142, "pid": 5, "tid": 7, "ts": 1716454216499360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499345, "dur": 14, "args": { "External id": 1142, "cbid": 211, "correlation": 1142 } }, { "ph": "s", "id": 1142, "pid": 76337, "tid": -914061504, "ts": 1716454216499345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216499416, "dur": 138, "args": { "External id": 1150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1150, "pid": 5, "tid": 7, "ts": 1716454216499416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499404, "dur": 10, "args": { "External id": 1150, "cbid": 211, "correlation": 1150 } }, { "ph": "s", "id": 1150, "pid": 76337, "tid": -914061504, "ts": 1716454216499404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216499555, "dur": 29, "args": { "External id": 1158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1158, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1158, "pid": 5, "tid": 7, "ts": 1716454216499555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499459, "dur": 12, "args": { "External id": 1158, "cbid": 211, "correlation": 1158 } }, { "ph": "s", "id": 1158, "pid": 76337, "tid": -914061504, "ts": 1716454216499459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216499644, "dur": 135, "args": { "External id": 1168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1168, "pid": 5, "tid": 7, "ts": 1716454216499644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499626, "dur": 18, "args": { "External id": 1168, "cbid": 211, "correlation": 1168 } }, { "ph": "s", "id": 1168, "pid": 76337, "tid": -914061504, "ts": 1716454216499626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216499780, "dur": 191, "args": { "External id": 1189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1189, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1189, "pid": 5, "tid": 7, "ts": 1716454216499780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499679, "dur": 11, "args": { "External id": 1189, "cbid": 211, "correlation": 1189 } }, { "ph": "s", "id": 1189, "pid": 76337, "tid": -914061504, "ts": 1716454216499679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216499972, "dur": 5, "args": { "External id": 1201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1201, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1201, "pid": 5, "tid": 7, "ts": 1716454216499972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499702, "dur": 7, "args": { "External id": 1201, "cbid": 211, "correlation": 1201 } }, { "ph": "s", "id": 1201, "pid": 76337, "tid": -914061504, "ts": 1716454216499702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216499978, "dur": 47, "args": { "External id": 1204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1204, "pid": 5, "tid": 7, "ts": 1716454216499978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499727, "dur": 8, "args": { "External id": 1204, "cbid": 211, "correlation": 1204 } }, { "ph": "s", "id": 1204, "pid": 76337, "tid": -914061504, "ts": 1716454216499727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216500027, "dur": 31, "args": { "External id": 1213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1213, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1213, "pid": 5, "tid": 7, "ts": 1716454216500027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499785, "dur": 13, "args": { "External id": 1213, "cbid": 211, "correlation": 1213 } }, { "ph": "s", "id": 1213, "pid": 76337, "tid": -914061504, "ts": 1716454216499785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216499883, "dur": 0, "args": { "External id": 1223, "cbid": 317, "correlation": 1223 } }, { "ph": "f", "id": 1223, "pid": 76337, "tid": -914061504, "ts": 1716454216499883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216499884, "dur": 0, "args": { "External id": 1224, "cbid": 203, "correlation": 1224 } }, { "ph": "f", "id": 1224, "pid": 76337, "tid": -914061504, "ts": 1716454216499884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216499885, "dur": 0, "args": { "External id": 1225, "cbid": 205, "correlation": 1225 } }, { "ph": "f", "id": 1225, "pid": 76337, "tid": -914061504, "ts": 1716454216499885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216500059, "dur": 34, "args": { "External id": 1229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1229, "pid": 5, "tid": 7, "ts": 1716454216500059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499905, "dur": 15, "args": { "External id": 1229, "cbid": 211, "correlation": 1229 } }, { "ph": "s", "id": 1229, "pid": 76337, "tid": -914061504, "ts": 1716454216499905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216500095, "dur": 33, "args": { "External id": 1231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1231, "pid": 5, "tid": 7, "ts": 1716454216500095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499923, "dur": 5, "args": { "External id": 1231, "cbid": 211, "correlation": 1231 } }, { "ph": "s", "id": 1231, "pid": 76337, "tid": -914061504, "ts": 1716454216499923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216500131, "dur": 1, "args": { "External id": 1233, "device": 5, "context": 1, "stream": 7, "correlation": 1233, "bytes": 3072, "memory bandwidth (GB/s)": 1.6 } }, { "ph": "f", "id": 1233, "pid": 5, "tid": 7, "ts": 1716454216500131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216499938, "dur": 12, "args": { "External id": 1233, "cbid": 51, "correlation": 1233 } }, { "ph": "s", "id": 1233, "pid": 76337, "tid": -914061504, "ts": 1716454216499938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216500135, "dur": 905, "args": { "External id": 1234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1234, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1234, "pid": 5, "tid": 7, "ts": 1716454216500135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499952, "dur": 9, "args": { "External id": 1234, "cbid": 211, "correlation": 1234 } }, { "ph": "s", "id": 1234, "pid": 76337, "tid": -914061504, "ts": 1716454216499952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216501042, "dur": 37, "args": { "External id": 1236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1236, "pid": 5, "tid": 7, "ts": 1716454216501042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216499967, "dur": 14, "args": { "External id": 1236, "cbid": 211, "correlation": 1236 } }, { "ph": "s", "id": 1236, "pid": 76337, "tid": -914061504, "ts": 1716454216499967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216501081, "dur": 49, "args": { "External id": 1242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1242, "pid": 5, "tid": 7, "ts": 1716454216501081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500006, "dur": 10, "args": { "External id": 1242, "cbid": 211, "correlation": 1242 } }, { "ph": "s", "id": 1242, "pid": 76337, "tid": -914061504, "ts": 1716454216500006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216501131, "dur": 189, "args": { "External id": 1262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1262, "pid": 5, "tid": 7, "ts": 1716454216501131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500099, "dur": 13, "args": { "External id": 1262, "cbid": 211, "correlation": 1262 } }, { "ph": "s", "id": 1262, "pid": 76337, "tid": -914061504, "ts": 1716454216500099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216501321, "dur": 4, "args": { "External id": 1274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1274, "pid": 5, "tid": 7, "ts": 1716454216501321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500130, "dur": 8, "args": { "External id": 1274, "cbid": 211, "correlation": 1274 } }, { "ph": "s", "id": 1274, "pid": 76337, "tid": -914061504, "ts": 1716454216500130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216501326, "dur": 47, "args": { "External id": 1277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1277, "pid": 5, "tid": 7, "ts": 1716454216501326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500150, "dur": 6, "args": { "External id": 1277, "cbid": 211, "correlation": 1277 } }, { "ph": "s", "id": 1277, "pid": 76337, "tid": -914061504, "ts": 1716454216500150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216501375, "dur": 31, "args": { "External id": 1286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1286, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1286, "pid": 5, "tid": 7, "ts": 1716454216501375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500195, "dur": 11, "args": { "External id": 1286, "cbid": 211, "correlation": 1286 } }, { "ph": "s", "id": 1286, "pid": 76337, "tid": -914061504, "ts": 1716454216500195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216500281, "dur": 0, "args": { "External id": 1296, "cbid": 317, "correlation": 1296 } }, { "ph": "f", "id": 1296, "pid": 76337, "tid": -914061504, "ts": 1716454216500281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216500282, "dur": 0, "args": { "External id": 1297, "cbid": 203, "correlation": 1297 } }, { "ph": "f", "id": 1297, "pid": 76337, "tid": -914061504, "ts": 1716454216500282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216500283, "dur": 0, "args": { "External id": 1298, "cbid": 205, "correlation": 1298 } }, { "ph": "f", "id": 1298, "pid": 76337, "tid": -914061504, "ts": 1716454216500283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216501407, "dur": 34, "args": { "External id": 1302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1302, "pid": 5, "tid": 7, "ts": 1716454216501407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500297, "dur": 13, "args": { "External id": 1302, "cbid": 211, "correlation": 1302 } }, { "ph": "s", "id": 1302, "pid": 76337, "tid": -914061504, "ts": 1716454216500297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216501442, "dur": 32, "args": { "External id": 1304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1304, "pid": 5, "tid": 7, "ts": 1716454216501442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500312, "dur": 5, "args": { "External id": 1304, "cbid": 211, "correlation": 1304 } }, { "ph": "s", "id": 1304, "pid": 76337, "tid": -914061504, "ts": 1716454216500312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216501476, "dur": 1, "args": { "External id": 1306, "device": 5, "context": 1, "stream": 7, "correlation": 1306, "bytes": 3072, "memory bandwidth (GB/s)": 1.5737704918032787 } }, { "ph": "f", "id": 1306, "pid": 5, "tid": 7, "ts": 1716454216501476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216500323, "dur": 6, "args": { "External id": 1306, "cbid": 51, "correlation": 1306 } }, { "ph": "s", "id": 1306, "pid": 76337, "tid": -914061504, "ts": 1716454216500323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216501481, "dur": 892, "args": { "External id": 1307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1307, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1307, "pid": 5, "tid": 7, "ts": 1716454216501481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500331, "dur": 6, "args": { "External id": 1307, "cbid": 211, "correlation": 1307 } }, { "ph": "s", "id": 1307, "pid": 76337, "tid": -914061504, "ts": 1716454216500331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216502374, "dur": 36, "args": { "External id": 1309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1309, "pid": 5, "tid": 7, "ts": 1716454216502374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500341, "dur": 5, "args": { "External id": 1309, "cbid": 211, "correlation": 1309 } }, { "ph": "s", "id": 1309, "pid": 76337, "tid": -914061504, "ts": 1716454216500341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216502411, "dur": 48, "args": { "External id": 1315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1315, "pid": 5, "tid": 7, "ts": 1716454216502411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500370, "dur": 9, "args": { "External id": 1315, "cbid": 211, "correlation": 1315 } }, { "ph": "s", "id": 1315, "pid": 76337, "tid": -914061504, "ts": 1716454216500370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216502461, "dur": 140, "args": { "External id": 1323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1323, "pid": 5, "tid": 7, "ts": 1716454216502461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500404, "dur": 9, "args": { "External id": 1323, "cbid": 211, "correlation": 1323 } }, { "ph": "s", "id": 1323, "pid": 76337, "tid": -914061504, "ts": 1716454216500404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216502603, "dur": 29, "args": { "External id": 1331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1331, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1331, "pid": 5, "tid": 7, "ts": 1716454216502603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500435, "dur": 8, "args": { "External id": 1331, "cbid": 211, "correlation": 1331 } }, { "ph": "s", "id": 1331, "pid": 76337, "tid": -914061504, "ts": 1716454216500435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216502633, "dur": 25, "args": { "External id": 1341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 156.4125, "warps per SM": 625.65, "grid": [12513, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1341, "pid": 5, "tid": 7, "ts": 1716454216502633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500539, "dur": 15, "args": { "External id": 1341, "cbid": 211, "correlation": 1341 } }, { "ph": "s", "id": 1341, "pid": 76337, "tid": -914061504, "ts": 1716454216500539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216502659, "dur": 47, "args": { "External id": 1346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1346, "pid": 5, "tid": 7, "ts": 1716454216502659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500587, "dur": 8, "args": { "External id": 1346, "cbid": 211, "correlation": 1346 } }, { "ph": "s", "id": 1346, "pid": 76337, "tid": -914061504, "ts": 1716454216500587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216502708, "dur": 25, "args": { "External id": 1361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1361, "pid": 5, "tid": 7, "ts": 1716454216502708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216500671, "dur": 13, "args": { "External id": 1361, "cbid": 211, "correlation": 1361 } }, { "ph": "s", "id": 1361, "pid": 76337, "tid": -914061504, "ts": 1716454216500671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216500692, "dur": 0, "args": { "External id": 1368, "cbid": 317, "correlation": 1368 } }, { "ph": "f", "id": 1368, "pid": 76337, "tid": -914061504, "ts": 1716454216500692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216500693, "dur": 0, "args": { "External id": 1369, "cbid": 203, "correlation": 1369 } }, { "ph": "f", "id": 1369, "pid": 76337, "tid": -914061504, "ts": 1716454216500693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216500694, "dur": 0, "args": { "External id": 1370, "cbid": 205, "correlation": 1370 } }, { "ph": "f", "id": 1370, "pid": 76337, "tid": -914061504, "ts": 1716454216500694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216503346, "dur": 310, "args": { "External id": 1374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1374, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [4, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 1374, "pid": 5, "tid": 7, "ts": 1716454216503346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216503323, "dur": 23, "args": { "External id": 1374, "cbid": 211, "correlation": 1374 } }, { "ph": "s", "id": 1374, "pid": 76337, "tid": -914061504, "ts": 1716454216503323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216504060, "dur": 12, "args": { "External id": 1380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1380, "pid": 5, "tid": 7, "ts": 1716454216504060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504045, "dur": 13, "args": { "External id": 1380, "cbid": 211, "correlation": 1380 } }, { "ph": "s", "id": 1380, "pid": 76337, "tid": -914061504, "ts": 1716454216504045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216504256, "dur": 38, "args": { "External id": 1390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1390, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1390, "pid": 5, "tid": 7, "ts": 1716454216504256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504240, "dur": 16, "args": { "External id": 1390, "cbid": 211, "correlation": 1390 } }, { "ph": "s", "id": 1390, "pid": 76337, "tid": -914061504, "ts": 1716454216504240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216504302, "dur": 48, "args": { "External id": 1411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1411, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1411, "pid": 5, "tid": 7, "ts": 1716454216504302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504291, "dur": 10, "args": { "External id": 1411, "cbid": 211, "correlation": 1411 } }, { "ph": "s", "id": 1411, "pid": 76337, "tid": -914061504, "ts": 1716454216504291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216504352, "dur": 4, "args": { "External id": 1423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1423, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1423, "pid": 5, "tid": 7, "ts": 1716454216504352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504312, "dur": 7, "args": { "External id": 1423, "cbid": 211, "correlation": 1423 } }, { "ph": "s", "id": 1423, "pid": 76337, "tid": -914061504, "ts": 1716454216504312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216504357, "dur": 15, "args": { "External id": 1426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1426, "pid": 5, "tid": 7, "ts": 1716454216504357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504337, "dur": 7, "args": { "External id": 1426, "cbid": 211, "correlation": 1426 } }, { "ph": "s", "id": 1426, "pid": 76337, "tid": -914061504, "ts": 1716454216504337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216504399, "dur": 9, "args": { "External id": 1435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1435, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1435, "pid": 5, "tid": 7, "ts": 1716454216504399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216504388, "dur": 10, "args": { "External id": 1435, "cbid": 211, "correlation": 1435 } }, { "ph": "s", "id": 1435, "pid": 76337, "tid": -914061504, "ts": 1716454216504388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216504471, "dur": 0, "args": { "External id": 1445, "cbid": 317, "correlation": 1445 } }, { "ph": "f", "id": 1445, "pid": 76337, "tid": -914061504, "ts": 1716454216504471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216504472, "dur": 1, "args": { "External id": 1446, "cbid": 203, "correlation": 1446 } }, { "ph": "f", "id": 1446, "pid": 76337, "tid": -914061504, "ts": 1716454216504472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216504474, "dur": 0, "args": { "External id": 1447, "cbid": 205, "correlation": 1447 } }, { "ph": "f", "id": 1447, "pid": 76337, "tid": -914061504, "ts": 1716454216504474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216507858, "dur": 11, "args": { "External id": 1451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1451, "pid": 5, "tid": 7, "ts": 1716454216507858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216507836, "dur": 24, "args": { "External id": 1451, "cbid": 211, "correlation": 1451 } }, { "ph": "s", "id": 1451, "pid": 76337, "tid": -914061504, "ts": 1716454216507836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216507872, "dur": 32, "args": { "External id": 1453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1453, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1453, "pid": 5, "tid": 7, "ts": 1716454216507872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216507863, "dur": 6, "args": { "External id": 1453, "cbid": 211, "correlation": 1453 } }, { "ph": "s", "id": 1453, "pid": 76337, "tid": -914061504, "ts": 1716454216507863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216507906, "dur": 2, "args": { "External id": 1455, "device": 5, "context": 1, "stream": 7, "correlation": 1455, "bytes": 768, "memory bandwidth (GB/s)": 0.375 } }, { "ph": "f", "id": 1455, "pid": 5, "tid": 7, "ts": 1716454216507906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216507877, "dur": 16, "args": { "External id": 1455, "cbid": 51, "correlation": 1455 } }, { "ph": "s", "id": 1455, "pid": 76337, "tid": -914061504, "ts": 1716454216507877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216507911, "dur": 276, "args": { "External id": 1456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1456, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1456, "pid": 5, "tid": 7, "ts": 1716454216507911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216507894, "dur": 9, "args": { "External id": 1456, "cbid": 211, "correlation": 1456 } }, { "ph": "s", "id": 1456, "pid": 76337, "tid": -914061504, "ts": 1716454216507894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216508188, "dur": 11, "args": { "External id": 1458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1458, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1458, "pid": 5, "tid": 7, "ts": 1716454216508188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216507910, "dur": 7, "args": { "External id": 1458, "cbid": 211, "correlation": 1458 } }, { "ph": "s", "id": 1458, "pid": 76337, "tid": -914061504, "ts": 1716454216507910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216508693, "dur": 13, "args": { "External id": 1464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1464, "pid": 5, "tid": 7, "ts": 1716454216508693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216508679, "dur": 13, "args": { "External id": 1464, "cbid": 211, "correlation": 1464 } }, { "ph": "s", "id": 1464, "pid": 76337, "tid": -914061504, "ts": 1716454216508679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216508829, "dur": 30, "args": { "External id": 1484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1484, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1484, "pid": 5, "tid": 7, "ts": 1716454216508829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216508815, "dur": 14, "args": { "External id": 1484, "cbid": 211, "correlation": 1484 } }, { "ph": "s", "id": 1484, "pid": 76337, "tid": -914061504, "ts": 1716454216508815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216508861, "dur": 4, "args": { "External id": 1496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1496, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1496, "pid": 5, "tid": 7, "ts": 1716454216508861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216508840, "dur": 8, "args": { "External id": 1496, "cbid": 211, "correlation": 1496 } }, { "ph": "s", "id": 1496, "pid": 76337, "tid": -914061504, "ts": 1716454216508840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216508875, "dur": 13, "args": { "External id": 1499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1499, "pid": 5, "tid": 7, "ts": 1716454216508875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216508866, "dur": 8, "args": { "External id": 1499, "cbid": 211, "correlation": 1499 } }, { "ph": "s", "id": 1499, "pid": 76337, "tid": -914061504, "ts": 1716454216508866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216508931, "dur": 9, "args": { "External id": 1508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1508, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1508, "pid": 5, "tid": 7, "ts": 1716454216508931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216508919, "dur": 11, "args": { "External id": 1508, "cbid": 211, "correlation": 1508 } }, { "ph": "s", "id": 1508, "pid": 76337, "tid": -914061504, "ts": 1716454216508919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216509040, "dur": 0, "args": { "External id": 1518, "cbid": 317, "correlation": 1518 } }, { "ph": "f", "id": 1518, "pid": 76337, "tid": -914061504, "ts": 1716454216509040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216509041, "dur": 0, "args": { "External id": 1519, "cbid": 203, "correlation": 1519 } }, { "ph": "f", "id": 1519, "pid": 76337, "tid": -914061504, "ts": 1716454216509041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216509042, "dur": 0, "args": { "External id": 1520, "cbid": 205, "correlation": 1520 } }, { "ph": "f", "id": 1520, "pid": 76337, "tid": -914061504, "ts": 1716454216509042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509072, "dur": 9, "args": { "External id": 1524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1524, "pid": 5, "tid": 7, "ts": 1716454216509072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509058, "dur": 13, "args": { "External id": 1524, "cbid": 211, "correlation": 1524 } }, { "ph": "s", "id": 1524, "pid": 76337, "tid": -914061504, "ts": 1716454216509058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509083, "dur": 32, "args": { "External id": 1526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1526, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1526, "pid": 5, "tid": 7, "ts": 1716454216509083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509073, "dur": 6, "args": { "External id": 1526, "cbid": 211, "correlation": 1526 } }, { "ph": "s", "id": 1526, "pid": 76337, "tid": -914061504, "ts": 1716454216509073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216509117, "dur": 1, "args": { "External id": 1528, "device": 5, "context": 1, "stream": 7, "correlation": 1528, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 1528, "pid": 5, "tid": 7, "ts": 1716454216509117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216509085, "dur": 8, "args": { "External id": 1528, "cbid": 51, "correlation": 1528 } }, { "ph": "s", "id": 1528, "pid": 76337, "tid": -914061504, "ts": 1716454216509085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216509121, "dur": 266, "args": { "External id": 1529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1529, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1529, "pid": 5, "tid": 7, "ts": 1716454216509121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509094, "dur": 7, "args": { "External id": 1529, "cbid": 211, "correlation": 1529 } }, { "ph": "s", "id": 1529, "pid": 76337, "tid": -914061504, "ts": 1716454216509094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509388, "dur": 10, "args": { "External id": 1531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1531, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1531, "pid": 5, "tid": 7, "ts": 1716454216509388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509106, "dur": 6, "args": { "External id": 1531, "cbid": 211, "correlation": 1531 } }, { "ph": "s", "id": 1531, "pid": 76337, "tid": -914061504, "ts": 1716454216509106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509400, "dur": 12, "args": { "External id": 1537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1537, "pid": 5, "tid": 7, "ts": 1716454216509400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509136, "dur": 9, "args": { "External id": 1537, "cbid": 211, "correlation": 1537 } }, { "ph": "s", "id": 1537, "pid": 76337, "tid": -914061504, "ts": 1716454216509136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509413, "dur": 39, "args": { "External id": 1545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1545, "pid": 5, "tid": 7, "ts": 1716454216509413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509180, "dur": 9, "args": { "External id": 1545, "cbid": 211, "correlation": 1545 } }, { "ph": "s", "id": 1545, "pid": 76337, "tid": -914061504, "ts": 1716454216509180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216509454, "dur": 10, "args": { "External id": 1553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1553, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1553, "pid": 5, "tid": 7, "ts": 1716454216509454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509225, "dur": 9, "args": { "External id": 1553, "cbid": 211, "correlation": 1553 } }, { "ph": "s", "id": 1553, "pid": 76337, "tid": -914061504, "ts": 1716454216509225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509465, "dur": 38, "args": { "External id": 1563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1563, "pid": 5, "tid": 7, "ts": 1716454216509465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509320, "dur": 16, "args": { "External id": 1563, "cbid": 211, "correlation": 1563 } }, { "ph": "s", "id": 1563, "pid": 76337, "tid": -914061504, "ts": 1716454216509320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216509504, "dur": 48, "args": { "External id": 1584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1584, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1584, "pid": 5, "tid": 7, "ts": 1716454216509504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509356, "dur": 8, "args": { "External id": 1584, "cbid": 211, "correlation": 1584 } }, { "ph": "s", "id": 1584, "pid": 76337, "tid": -914061504, "ts": 1716454216509356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216509553, "dur": 4, "args": { "External id": 1596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1596, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1596, "pid": 5, "tid": 7, "ts": 1716454216509553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509385, "dur": 8, "args": { "External id": 1596, "cbid": 211, "correlation": 1596 } }, { "ph": "s", "id": 1596, "pid": 76337, "tid": -914061504, "ts": 1716454216509385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509559, "dur": 13, "args": { "External id": 1599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1599, "pid": 5, "tid": 7, "ts": 1716454216509559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509404, "dur": 7, "args": { "External id": 1599, "cbid": 211, "correlation": 1599 } }, { "ph": "s", "id": 1599, "pid": 76337, "tid": -914061504, "ts": 1716454216509404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216509574, "dur": 10, "args": { "External id": 1608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1608, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1608, "pid": 5, "tid": 7, "ts": 1716454216509574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509447, "dur": 10, "args": { "External id": 1608, "cbid": 211, "correlation": 1608 } }, { "ph": "s", "id": 1608, "pid": 76337, "tid": -914061504, "ts": 1716454216509447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216509518, "dur": 0, "args": { "External id": 1618, "cbid": 317, "correlation": 1618 } }, { "ph": "f", "id": 1618, "pid": 76337, "tid": -914061504, "ts": 1716454216509518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216509519, "dur": 0, "args": { "External id": 1619, "cbid": 203, "correlation": 1619 } }, { "ph": "f", "id": 1619, "pid": 76337, "tid": -914061504, "ts": 1716454216509519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216509520, "dur": 0, "args": { "External id": 1620, "cbid": 205, "correlation": 1620 } }, { "ph": "f", "id": 1620, "pid": 76337, "tid": -914061504, "ts": 1716454216509520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509585, "dur": 9, "args": { "External id": 1624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1624, "pid": 5, "tid": 7, "ts": 1716454216509585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509534, "dur": 13, "args": { "External id": 1624, "cbid": 211, "correlation": 1624 } }, { "ph": "s", "id": 1624, "pid": 76337, "tid": -914061504, "ts": 1716454216509534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509595, "dur": 32, "args": { "External id": 1626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1626, "pid": 5, "tid": 7, "ts": 1716454216509595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509549, "dur": 5, "args": { "External id": 1626, "cbid": 211, "correlation": 1626 } }, { "ph": "s", "id": 1626, "pid": 76337, "tid": -914061504, "ts": 1716454216509549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216509630, "dur": 1, "args": { "External id": 1628, "device": 5, "context": 1, "stream": 7, "correlation": 1628, "bytes": 768, "memory bandwidth (GB/s)": 0.46181599518941674 } }, { "ph": "f", "id": 1628, "pid": 5, "tid": 7, "ts": 1716454216509630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216509560, "dur": 7, "args": { "External id": 1628, "cbid": 51, "correlation": 1628 } }, { "ph": "s", "id": 1628, "pid": 76337, "tid": -914061504, "ts": 1716454216509560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216509634, "dur": 265, "args": { "External id": 1629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1629, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1629, "pid": 5, "tid": 7, "ts": 1716454216509634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509568, "dur": 7, "args": { "External id": 1629, "cbid": 211, "correlation": 1629 } }, { "ph": "s", "id": 1629, "pid": 76337, "tid": -914061504, "ts": 1716454216509568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509901, "dur": 10, "args": { "External id": 1631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1631, "pid": 5, "tid": 7, "ts": 1716454216509901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509578, "dur": 5, "args": { "External id": 1631, "cbid": 211, "correlation": 1631 } }, { "ph": "s", "id": 1631, "pid": 76337, "tid": -914061504, "ts": 1716454216509578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509913, "dur": 12, "args": { "External id": 1637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1637, "pid": 5, "tid": 7, "ts": 1716454216509913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509607, "dur": 9, "args": { "External id": 1637, "cbid": 211, "correlation": 1637 } }, { "ph": "s", "id": 1637, "pid": 76337, "tid": -914061504, "ts": 1716454216509607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216509926, "dur": 30, "args": { "External id": 1657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1657, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1657, "pid": 5, "tid": 7, "ts": 1716454216509926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509700, "dur": 13, "args": { "External id": 1657, "cbid": 211, "correlation": 1657 } }, { "ph": "s", "id": 1657, "pid": 76337, "tid": -914061504, "ts": 1716454216509700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216509958, "dur": 4, "args": { "External id": 1669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1669, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1669, "pid": 5, "tid": 7, "ts": 1716454216509958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509723, "dur": 7, "args": { "External id": 1669, "cbid": 211, "correlation": 1669 } }, { "ph": "s", "id": 1669, "pid": 76337, "tid": -914061504, "ts": 1716454216509723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216509963, "dur": 14, "args": { "External id": 1672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1672, "pid": 5, "tid": 7, "ts": 1716454216509963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509742, "dur": 6, "args": { "External id": 1672, "cbid": 211, "correlation": 1672 } }, { "ph": "s", "id": 1672, "pid": 76337, "tid": -914061504, "ts": 1716454216509742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216509978, "dur": 9, "args": { "External id": 1681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1681, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1681, "pid": 5, "tid": 7, "ts": 1716454216509978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509784, "dur": 15, "args": { "External id": 1681, "cbid": 211, "correlation": 1681 } }, { "ph": "s", "id": 1681, "pid": 76337, "tid": -914061504, "ts": 1716454216509784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216509852, "dur": 0, "args": { "External id": 1691, "cbid": 317, "correlation": 1691 } }, { "ph": "f", "id": 1691, "pid": 76337, "tid": -914061504, "ts": 1716454216509852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216509853, "dur": 0, "args": { "External id": 1692, "cbid": 203, "correlation": 1692 } }, { "ph": "f", "id": 1692, "pid": 76337, "tid": -914061504, "ts": 1716454216509853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216509853, "dur": 0, "args": { "External id": 1693, "cbid": 205, "correlation": 1693 } }, { "ph": "f", "id": 1693, "pid": 76337, "tid": -914061504, "ts": 1716454216509853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509989, "dur": 9, "args": { "External id": 1697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1697, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1697, "pid": 5, "tid": 7, "ts": 1716454216509989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509867, "dur": 13, "args": { "External id": 1697, "cbid": 211, "correlation": 1697 } }, { "ph": "s", "id": 1697, "pid": 76337, "tid": -914061504, "ts": 1716454216509867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216509999, "dur": 32, "args": { "External id": 1699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1699, "pid": 5, "tid": 7, "ts": 1716454216509999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509882, "dur": 5, "args": { "External id": 1699, "cbid": 211, "correlation": 1699 } }, { "ph": "s", "id": 1699, "pid": 76337, "tid": -914061504, "ts": 1716454216509882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216510033, "dur": 1, "args": { "External id": 1701, "device": 5, "context": 1, "stream": 7, "correlation": 1701, "bytes": 768, "memory bandwidth (GB/s)": 0.4528301886792453 } }, { "ph": "f", "id": 1701, "pid": 5, "tid": 7, "ts": 1716454216510033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216509892, "dur": 6, "args": { "External id": 1701, "cbid": 51, "correlation": 1701 } }, { "ph": "s", "id": 1701, "pid": 76337, "tid": -914061504, "ts": 1716454216509892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216510037, "dur": 265, "args": { "External id": 1702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1702, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1702, "pid": 5, "tid": 7, "ts": 1716454216510037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509900, "dur": 6, "args": { "External id": 1702, "cbid": 211, "correlation": 1702 } }, { "ph": "s", "id": 1702, "pid": 76337, "tid": -914061504, "ts": 1716454216509900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510304, "dur": 10, "args": { "External id": 1704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1704, "pid": 5, "tid": 7, "ts": 1716454216510304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509909, "dur": 5, "args": { "External id": 1704, "cbid": 211, "correlation": 1704 } }, { "ph": "s", "id": 1704, "pid": 76337, "tid": -914061504, "ts": 1716454216509909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510315, "dur": 12, "args": { "External id": 1710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1710, "pid": 5, "tid": 7, "ts": 1716454216510315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509937, "dur": 8, "args": { "External id": 1710, "cbid": 211, "correlation": 1710 } }, { "ph": "s", "id": 1710, "pid": 76337, "tid": -914061504, "ts": 1716454216509937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510329, "dur": 39, "args": { "External id": 1718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1718, "pid": 5, "tid": 7, "ts": 1716454216510329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216509971, "dur": 18, "args": { "External id": 1718, "cbid": 211, "correlation": 1718 } }, { "ph": "s", "id": 1718, "pid": 76337, "tid": -914061504, "ts": 1716454216509971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216510370, "dur": 9, "args": { "External id": 1726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1726, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1726, "pid": 5, "tid": 7, "ts": 1716454216510370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510011, "dur": 8, "args": { "External id": 1726, "cbid": 211, "correlation": 1726 } }, { "ph": "s", "id": 1726, "pid": 76337, "tid": -914061504, "ts": 1716454216510011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510380, "dur": 38, "args": { "External id": 1736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1736, "pid": 5, "tid": 7, "ts": 1716454216510380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510245, "dur": 15, "args": { "External id": 1736, "cbid": 211, "correlation": 1736 } }, { "ph": "s", "id": 1736, "pid": 76337, "tid": -914061504, "ts": 1716454216510245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216510420, "dur": 34, "args": { "External id": 1757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1757, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1757, "pid": 5, "tid": 7, "ts": 1716454216510420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510293, "dur": 9, "args": { "External id": 1757, "cbid": 211, "correlation": 1757 } }, { "ph": "s", "id": 1757, "pid": 76337, "tid": -914061504, "ts": 1716454216510293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216510456, "dur": 5, "args": { "External id": 1769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1769, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1769, "pid": 5, "tid": 7, "ts": 1716454216510456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510311, "dur": 6, "args": { "External id": 1769, "cbid": 211, "correlation": 1769 } }, { "ph": "s", "id": 1769, "pid": 76337, "tid": -914061504, "ts": 1716454216510311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510462, "dur": 13, "args": { "External id": 1772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1772, "pid": 5, "tid": 7, "ts": 1716454216510462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510329, "dur": 7, "args": { "External id": 1772, "cbid": 211, "correlation": 1772 } }, { "ph": "s", "id": 1772, "pid": 76337, "tid": -914061504, "ts": 1716454216510329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216510476, "dur": 10, "args": { "External id": 1781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1781, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1781, "pid": 5, "tid": 7, "ts": 1716454216510476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510371, "dur": 11, "args": { "External id": 1781, "cbid": 211, "correlation": 1781 } }, { "ph": "s", "id": 1781, "pid": 76337, "tid": -914061504, "ts": 1716454216510371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216510448, "dur": 0, "args": { "External id": 1791, "cbid": 317, "correlation": 1791 } }, { "ph": "f", "id": 1791, "pid": 76337, "tid": -914061504, "ts": 1716454216510448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216510449, "dur": 0, "args": { "External id": 1792, "cbid": 203, "correlation": 1792 } }, { "ph": "f", "id": 1792, "pid": 76337, "tid": -914061504, "ts": 1716454216510449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216510449, "dur": 0, "args": { "External id": 1793, "cbid": 205, "correlation": 1793 } }, { "ph": "f", "id": 1793, "pid": 76337, "tid": -914061504, "ts": 1716454216510449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510488, "dur": 9, "args": { "External id": 1797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1797, "pid": 5, "tid": 7, "ts": 1716454216510488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510470, "dur": 14, "args": { "External id": 1797, "cbid": 211, "correlation": 1797 } }, { "ph": "s", "id": 1797, "pid": 76337, "tid": -914061504, "ts": 1716454216510470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510498, "dur": 33, "args": { "External id": 1799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1799, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1799, "pid": 5, "tid": 7, "ts": 1716454216510498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510486, "dur": 5, "args": { "External id": 1799, "cbid": 211, "correlation": 1799 } }, { "ph": "s", "id": 1799, "pid": 76337, "tid": -914061504, "ts": 1716454216510486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216510533, "dur": 1, "args": { "External id": 1801, "device": 5, "context": 1, "stream": 7, "correlation": 1801, "bytes": 768, "memory bandwidth (GB/s)": 0.4528301886792453 } }, { "ph": "f", "id": 1801, "pid": 5, "tid": 7, "ts": 1716454216510533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216510497, "dur": 6, "args": { "External id": 1801, "cbid": 51, "correlation": 1801 } }, { "ph": "s", "id": 1801, "pid": 76337, "tid": -914061504, "ts": 1716454216510497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216510537, "dur": 267, "args": { "External id": 1802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1802, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1802, "pid": 5, "tid": 7, "ts": 1716454216510537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510504, "dur": 6, "args": { "External id": 1802, "cbid": 211, "correlation": 1802 } }, { "ph": "s", "id": 1802, "pid": 76337, "tid": -914061504, "ts": 1716454216510504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510805, "dur": 10, "args": { "External id": 1804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1804, "pid": 5, "tid": 7, "ts": 1716454216510805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510515, "dur": 5, "args": { "External id": 1804, "cbid": 211, "correlation": 1804 } }, { "ph": "s", "id": 1804, "pid": 76337, "tid": -914061504, "ts": 1716454216510515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510816, "dur": 12, "args": { "External id": 1810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1810, "pid": 5, "tid": 7, "ts": 1716454216510816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510544, "dur": 9, "args": { "External id": 1810, "cbid": 211, "correlation": 1810 } }, { "ph": "s", "id": 1810, "pid": 76337, "tid": -914061504, "ts": 1716454216510544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216510830, "dur": 30, "args": { "External id": 1830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1830, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1830, "pid": 5, "tid": 7, "ts": 1716454216510830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510625, "dur": 12, "args": { "External id": 1830, "cbid": 211, "correlation": 1830 } }, { "ph": "s", "id": 1830, "pid": 76337, "tid": -914061504, "ts": 1716454216510625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216510861, "dur": 4, "args": { "External id": 1842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1842, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1842, "pid": 5, "tid": 7, "ts": 1716454216510861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510648, "dur": 6, "args": { "External id": 1842, "cbid": 211, "correlation": 1842 } }, { "ph": "s", "id": 1842, "pid": 76337, "tid": -914061504, "ts": 1716454216510648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216510866, "dur": 13, "args": { "External id": 1845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1845, "pid": 5, "tid": 7, "ts": 1716454216510866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510666, "dur": 6, "args": { "External id": 1845, "cbid": 211, "correlation": 1845 } }, { "ph": "s", "id": 1845, "pid": 76337, "tid": -914061504, "ts": 1716454216510666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216510881, "dur": 9, "args": { "External id": 1854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1854, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1854, "pid": 5, "tid": 7, "ts": 1716454216510881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510706, "dur": 9, "args": { "External id": 1854, "cbid": 211, "correlation": 1854 } }, { "ph": "s", "id": 1854, "pid": 76337, "tid": -914061504, "ts": 1716454216510706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216510788, "dur": 0, "args": { "External id": 1864, "cbid": 317, "correlation": 1864 } }, { "ph": "f", "id": 1864, "pid": 76337, "tid": -914061504, "ts": 1716454216510788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216510789, "dur": 0, "args": { "External id": 1865, "cbid": 203, "correlation": 1865 } }, { "ph": "f", "id": 1865, "pid": 76337, "tid": -914061504, "ts": 1716454216510789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216510790, "dur": 0, "args": { "External id": 1866, "cbid": 205, "correlation": 1866 } }, { "ph": "f", "id": 1866, "pid": 76337, "tid": -914061504, "ts": 1716454216510790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510892, "dur": 9, "args": { "External id": 1870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1870, "pid": 5, "tid": 7, "ts": 1716454216510892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510803, "dur": 13, "args": { "External id": 1870, "cbid": 211, "correlation": 1870 } }, { "ph": "s", "id": 1870, "pid": 76337, "tid": -914061504, "ts": 1716454216510803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216510902, "dur": 33, "args": { "External id": 1872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1872, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1872, "pid": 5, "tid": 7, "ts": 1716454216510902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510818, "dur": 5, "args": { "External id": 1872, "cbid": 211, "correlation": 1872 } }, { "ph": "s", "id": 1872, "pid": 76337, "tid": -914061504, "ts": 1716454216510818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216510937, "dur": 1, "args": { "External id": 1874, "device": 5, "context": 1, "stream": 7, "correlation": 1874, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 1874, "pid": 5, "tid": 7, "ts": 1716454216510937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216510829, "dur": 6, "args": { "External id": 1874, "cbid": 51, "correlation": 1874 } }, { "ph": "s", "id": 1874, "pid": 76337, "tid": -914061504, "ts": 1716454216510829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216510941, "dur": 265, "args": { "External id": 1875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1875, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 1875, "pid": 5, "tid": 7, "ts": 1716454216510941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510836, "dur": 6, "args": { "External id": 1875, "cbid": 211, "correlation": 1875 } }, { "ph": "s", "id": 1875, "pid": 76337, "tid": -914061504, "ts": 1716454216510836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216511207, "dur": 10, "args": { "External id": 1877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1877, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1877, "pid": 5, "tid": 7, "ts": 1716454216511207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510846, "dur": 5, "args": { "External id": 1877, "cbid": 211, "correlation": 1877 } }, { "ph": "s", "id": 1877, "pid": 76337, "tid": -914061504, "ts": 1716454216510846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511218, "dur": 12, "args": { "External id": 1883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1883, "pid": 5, "tid": 7, "ts": 1716454216511218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510875, "dur": 8, "args": { "External id": 1883, "cbid": 211, "correlation": 1883 } }, { "ph": "s", "id": 1883, "pid": 76337, "tid": -914061504, "ts": 1716454216510875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511232, "dur": 39, "args": { "External id": 1891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1891, "pid": 5, "tid": 7, "ts": 1716454216511232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510909, "dur": 9, "args": { "External id": 1891, "cbid": 211, "correlation": 1891 } }, { "ph": "s", "id": 1891, "pid": 76337, "tid": -914061504, "ts": 1716454216510909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216511272, "dur": 9, "args": { "External id": 1899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1899, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1899, "pid": 5, "tid": 7, "ts": 1716454216511272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216510939, "dur": 8, "args": { "External id": 1899, "cbid": 211, "correlation": 1899 } }, { "ph": "s", "id": 1899, "pid": 76337, "tid": -914061504, "ts": 1716454216510939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511336, "dur": 38, "args": { "External id": 1909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1909, "pid": 5, "tid": 7, "ts": 1716454216511336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511319, "dur": 17, "args": { "External id": 1909, "cbid": 211, "correlation": 1909 } }, { "ph": "s", "id": 1909, "pid": 76337, "tid": -914061504, "ts": 1716454216511319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216511375, "dur": 31, "args": { "External id": 1930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1930, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 1930, "pid": 5, "tid": 7, "ts": 1716454216511375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511357, "dur": 9, "args": { "External id": 1930, "cbid": 211, "correlation": 1930 } }, { "ph": "s", "id": 1930, "pid": 76337, "tid": -914061504, "ts": 1716454216511357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216511408, "dur": 4, "args": { "External id": 1942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1942, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 1942, "pid": 5, "tid": 7, "ts": 1716454216511408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511376, "dur": 6, "args": { "External id": 1942, "cbid": 211, "correlation": 1942 } }, { "ph": "s", "id": 1942, "pid": 76337, "tid": -914061504, "ts": 1716454216511376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511414, "dur": 14, "args": { "External id": 1945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1945, "pid": 5, "tid": 7, "ts": 1716454216511414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511393, "dur": 6, "args": { "External id": 1945, "cbid": 211, "correlation": 1945 } }, { "ph": "s", "id": 1945, "pid": 76337, "tid": -914061504, "ts": 1716454216511393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216511635, "dur": 93, "args": { "External id": 1957, "cbid": 251, "correlation": 1957 } }, { "ph": "f", "id": 1957, "pid": 76337, "tid": -914061504, "ts": 1716454216511635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216511755, "dur": 37, "args": { "External id": 1958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1958, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 1958, "pid": 5, "tid": 7, "ts": 1716454216511755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511738, "dur": 17, "args": { "External id": 1958, "cbid": 211, "correlation": 1958 } }, { "ph": "s", "id": 1958, "pid": 76337, "tid": -914061504, "ts": 1716454216511738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511804, "dur": 17, "args": { "External id": 1963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1963, "pid": 5, "tid": 7, "ts": 1716454216511804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511792, "dur": 10, "args": { "External id": 1963, "cbid": 211, "correlation": 1963 } }, { "ph": "s", "id": 1963, "pid": 76337, "tid": -914061504, "ts": 1716454216511792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216511893, "dur": 1, "args": { "External id": 1974, "cbid": 251, "correlation": 1974 } }, { "ph": "f", "id": 1974, "pid": 76337, "tid": -914061504, "ts": 1716454216511893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216511912, "dur": 37, "args": { "External id": 1975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1975, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 1975, "pid": 5, "tid": 7, "ts": 1716454216511912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511898, "dur": 13, "args": { "External id": 1975, "cbid": 211, "correlation": 1975 } }, { "ph": "s", "id": 1975, "pid": 76337, "tid": -914061504, "ts": 1716454216511898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216511952, "dur": 14, "args": { "External id": 1980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1980, "pid": 5, "tid": 7, "ts": 1716454216511952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216511939, "dur": 11, "args": { "External id": 1980, "cbid": 211, "correlation": 1980 } }, { "ph": "s", "id": 1980, "pid": 76337, "tid": -914061504, "ts": 1716454216511939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216512036, "dur": 1, "args": { "External id": 1991, "cbid": 251, "correlation": 1991 } }, { "ph": "f", "id": 1991, "pid": 76337, "tid": -914061504, "ts": 1716454216512036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216512056, "dur": 37, "args": { "External id": 1992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1992, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 1992, "pid": 5, "tid": 7, "ts": 1716454216512056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512041, "dur": 15, "args": { "External id": 1992, "cbid": 211, "correlation": 1992 } }, { "ph": "s", "id": 1992, "pid": 76337, "tid": -914061504, "ts": 1716454216512041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216512094, "dur": 13, "args": { "External id": 1997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 1997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 1997, "pid": 5, "tid": 7, "ts": 1716454216512094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512073, "dur": 9, "args": { "External id": 1997, "cbid": 211, "correlation": 1997 } }, { "ph": "s", "id": 1997, "pid": 76337, "tid": -914061504, "ts": 1716454216512073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216512545, "dur": 1205, "args": { "External id": 2022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2022, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 2022, "pid": 5, "tid": 7, "ts": 1716454216512545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512383, "dur": 163, "args": { "External id": 2022, "cbid": 211, "correlation": 2022 } }, { "ph": "s", "id": 2022, "pid": 76337, "tid": -914061504, "ts": 1716454216512383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216512712, "dur": 59, "args": { "External id": 2040, "cbid": 251, "correlation": 2040 } }, { "ph": "f", "id": 2040, "pid": 76337, "tid": -914061504, "ts": 1716454216512712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216513759, "dur": 40, "args": { "External id": 2042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2042, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 2042, "pid": 5, "tid": 7, "ts": 1716454216513759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512778, "dur": 14, "args": { "External id": 2042, "cbid": 211, "correlation": 2042 } }, { "ph": "s", "id": 2042, "pid": 76337, "tid": -914061504, "ts": 1716454216512778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216513806, "dur": 12, "args": { "External id": 2050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2050, "registers per thread": 19, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2050, "pid": 5, "tid": 7, "ts": 1716454216513806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512889, "dur": 58, "args": { "External id": 2050, "cbid": 211, "correlation": 2050 } }, { "ph": "s", "id": 2050, "pid": 76337, "tid": -914061504, "ts": 1716454216512889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216513819, "dur": 9, "args": { "External id": 2058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2058, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2058, "pid": 5, "tid": 7, "ts": 1716454216513819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216512997, "dur": 13, "args": { "External id": 2058, "cbid": 211, "correlation": 2058 } }, { "ph": "s", "id": 2058, "pid": 76337, "tid": -914061504, "ts": 1716454216512997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216513830, "dur": 38, "args": { "External id": 2068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2068, "pid": 5, "tid": 7, "ts": 1716454216513830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513099, "dur": 15, "args": { "External id": 2068, "cbid": 211, "correlation": 2068 } }, { "ph": "s", "id": 2068, "pid": 76337, "tid": -914061504, "ts": 1716454216513099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216513870, "dur": 34, "args": { "External id": 2089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2089, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2089, "pid": 5, "tid": 7, "ts": 1716454216513870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513135, "dur": 8, "args": { "External id": 2089, "cbid": 211, "correlation": 2089 } }, { "ph": "s", "id": 2089, "pid": 76337, "tid": -914061504, "ts": 1716454216513135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216513905, "dur": 5, "args": { "External id": 2101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2101, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2101, "pid": 5, "tid": 7, "ts": 1716454216513905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513160, "dur": 9, "args": { "External id": 2101, "cbid": 211, "correlation": 2101 } }, { "ph": "s", "id": 2101, "pid": 76337, "tid": -914061504, "ts": 1716454216513160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216513911, "dur": 15, "args": { "External id": 2104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2104, "pid": 5, "tid": 7, "ts": 1716454216513911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513181, "dur": 7, "args": { "External id": 2104, "cbid": 211, "correlation": 2104 } }, { "ph": "s", "id": 2104, "pid": 76337, "tid": -914061504, "ts": 1716454216513181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216513928, "dur": 10, "args": { "External id": 2113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2113, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2113, "pid": 5, "tid": 7, "ts": 1716454216513928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513225, "dur": 9, "args": { "External id": 2113, "cbid": 211, "correlation": 2113 } }, { "ph": "s", "id": 2113, "pid": 76337, "tid": -914061504, "ts": 1716454216513225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216513305, "dur": 0, "args": { "External id": 2123, "cbid": 317, "correlation": 2123 } }, { "ph": "f", "id": 2123, "pid": 76337, "tid": -914061504, "ts": 1716454216513305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216513306, "dur": 0, "args": { "External id": 2124, "cbid": 203, "correlation": 2124 } }, { "ph": "f", "id": 2124, "pid": 76337, "tid": -914061504, "ts": 1716454216513306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216513307, "dur": 0, "args": { "External id": 2125, "cbid": 205, "correlation": 2125 } }, { "ph": "f", "id": 2125, "pid": 76337, "tid": -914061504, "ts": 1716454216513307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216513940, "dur": 11, "args": { "External id": 2129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2129, "pid": 5, "tid": 7, "ts": 1716454216513940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513325, "dur": 14, "args": { "External id": 2129, "cbid": 211, "correlation": 2129 } }, { "ph": "s", "id": 2129, "pid": 76337, "tid": -914061504, "ts": 1716454216513325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216513952, "dur": 32, "args": { "External id": 2131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2131, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2131, "pid": 5, "tid": 7, "ts": 1716454216513952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513341, "dur": 5, "args": { "External id": 2131, "cbid": 211, "correlation": 2131 } }, { "ph": "s", "id": 2131, "pid": 76337, "tid": -914061504, "ts": 1716454216513341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216513986, "dur": 2, "args": { "External id": 2133, "device": 5, "context": 1, "stream": 7, "correlation": 2133, "bytes": 768, "memory bandwidth (GB/s)": 0.38095238095238093 } }, { "ph": "f", "id": 2133, "pid": 5, "tid": 7, "ts": 1716454216513986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216513353, "dur": 8, "args": { "External id": 2133, "cbid": 51, "correlation": 2133 } }, { "ph": "s", "id": 2133, "pid": 76337, "tid": -914061504, "ts": 1716454216513353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216513990, "dur": 275, "args": { "External id": 2134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2134, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2134, "pid": 5, "tid": 7, "ts": 1716454216513990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513361, "dur": 6, "args": { "External id": 2134, "cbid": 211, "correlation": 2134 } }, { "ph": "s", "id": 2134, "pid": 76337, "tid": -914061504, "ts": 1716454216513361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216514267, "dur": 11, "args": { "External id": 2136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2136, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2136, "pid": 5, "tid": 7, "ts": 1716454216514267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513372, "dur": 6, "args": { "External id": 2136, "cbid": 211, "correlation": 2136 } }, { "ph": "s", "id": 2136, "pid": 76337, "tid": -914061504, "ts": 1716454216513372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514280, "dur": 14, "args": { "External id": 2142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2142, "pid": 5, "tid": 7, "ts": 1716454216514280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513402, "dur": 9, "args": { "External id": 2142, "cbid": 211, "correlation": 2142 } }, { "ph": "s", "id": 2142, "pid": 76337, "tid": -914061504, "ts": 1716454216513402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216514295, "dur": 31, "args": { "External id": 2162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2162, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2162, "pid": 5, "tid": 7, "ts": 1716454216514295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513482, "dur": 13, "args": { "External id": 2162, "cbid": 211, "correlation": 2162 } }, { "ph": "s", "id": 2162, "pid": 76337, "tid": -914061504, "ts": 1716454216513482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216514327, "dur": 4, "args": { "External id": 2174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2174, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2174, "pid": 5, "tid": 7, "ts": 1716454216514327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513505, "dur": 6, "args": { "External id": 2174, "cbid": 211, "correlation": 2174 } }, { "ph": "s", "id": 2174, "pid": 76337, "tid": -914061504, "ts": 1716454216513505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514332, "dur": 14, "args": { "External id": 2177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2177, "pid": 5, "tid": 7, "ts": 1716454216514332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513523, "dur": 6, "args": { "External id": 2177, "cbid": 211, "correlation": 2177 } }, { "ph": "s", "id": 2177, "pid": 76337, "tid": -914061504, "ts": 1716454216513523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216514347, "dur": 9, "args": { "External id": 2186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2186, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2186, "pid": 5, "tid": 7, "ts": 1716454216514347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513563, "dur": 10, "args": { "External id": 2186, "cbid": 211, "correlation": 2186 } }, { "ph": "s", "id": 2186, "pid": 76337, "tid": -914061504, "ts": 1716454216513563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216513623, "dur": 0, "args": { "External id": 2196, "cbid": 317, "correlation": 2196 } }, { "ph": "f", "id": 2196, "pid": 76337, "tid": -914061504, "ts": 1716454216513623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216513624, "dur": 0, "args": { "External id": 2197, "cbid": 203, "correlation": 2197 } }, { "ph": "f", "id": 2197, "pid": 76337, "tid": -914061504, "ts": 1716454216513624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216513625, "dur": 0, "args": { "External id": 2198, "cbid": 205, "correlation": 2198 } }, { "ph": "f", "id": 2198, "pid": 76337, "tid": -914061504, "ts": 1716454216513625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216514358, "dur": 8, "args": { "External id": 2202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2202, "pid": 5, "tid": 7, "ts": 1716454216514358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513639, "dur": 12, "args": { "External id": 2202, "cbid": 211, "correlation": 2202 } }, { "ph": "s", "id": 2202, "pid": 76337, "tid": -914061504, "ts": 1716454216513639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216514368, "dur": 32, "args": { "External id": 2204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2204, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2204, "pid": 5, "tid": 7, "ts": 1716454216514368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513653, "dur": 6, "args": { "External id": 2204, "cbid": 211, "correlation": 2204 } }, { "ph": "s", "id": 2204, "pid": 76337, "tid": -914061504, "ts": 1716454216513653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216514402, "dur": 1, "args": { "External id": 2206, "device": 5, "context": 1, "stream": 7, "correlation": 2206, "bytes": 768, "memory bandwidth (GB/s)": 0.4528301886792453 } }, { "ph": "f", "id": 2206, "pid": 5, "tid": 7, "ts": 1716454216514402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216513664, "dur": 6, "args": { "External id": 2206, "cbid": 51, "correlation": 2206 } }, { "ph": "s", "id": 2206, "pid": 76337, "tid": -914061504, "ts": 1716454216513664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216514406, "dur": 264, "args": { "External id": 2207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2207, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2207, "pid": 5, "tid": 7, "ts": 1716454216514406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513671, "dur": 6, "args": { "External id": 2207, "cbid": 211, "correlation": 2207 } }, { "ph": "s", "id": 2207, "pid": 76337, "tid": -914061504, "ts": 1716454216513671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216514671, "dur": 11, "args": { "External id": 2209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2209, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2209, "pid": 5, "tid": 7, "ts": 1716454216514671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513681, "dur": 5, "args": { "External id": 2209, "cbid": 211, "correlation": 2209 } }, { "ph": "s", "id": 2209, "pid": 76337, "tid": -914061504, "ts": 1716454216513681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514683, "dur": 12, "args": { "External id": 2215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2215, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2215, "pid": 5, "tid": 7, "ts": 1716454216514683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513709, "dur": 8, "args": { "External id": 2215, "cbid": 211, "correlation": 2215 } }, { "ph": "s", "id": 2215, "pid": 76337, "tid": -914061504, "ts": 1716454216513709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514697, "dur": 40, "args": { "External id": 2223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2223, "pid": 5, "tid": 7, "ts": 1716454216514697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513742, "dur": 9, "args": { "External id": 2223, "cbid": 211, "correlation": 2223 } }, { "ph": "s", "id": 2223, "pid": 76337, "tid": -914061504, "ts": 1716454216513742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216514738, "dur": 9, "args": { "External id": 2231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2231, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2231, "pid": 5, "tid": 7, "ts": 1716454216514738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513773, "dur": 8, "args": { "External id": 2231, "cbid": 211, "correlation": 2231 } }, { "ph": "s", "id": 2231, "pid": 76337, "tid": -914061504, "ts": 1716454216513773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514749, "dur": 38, "args": { "External id": 2241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2241, "pid": 5, "tid": 7, "ts": 1716454216514749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513842, "dur": 12, "args": { "External id": 2241, "cbid": 211, "correlation": 2241 } }, { "ph": "s", "id": 2241, "pid": 76337, "tid": -914061504, "ts": 1716454216513842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216514788, "dur": 47, "args": { "External id": 2262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2262, "pid": 5, "tid": 7, "ts": 1716454216514788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513887, "dur": 9, "args": { "External id": 2262, "cbid": 211, "correlation": 2262 } }, { "ph": "s", "id": 2262, "pid": 76337, "tid": -914061504, "ts": 1716454216513887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216514836, "dur": 4, "args": { "External id": 2274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2274, "pid": 5, "tid": 7, "ts": 1716454216514836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513905, "dur": 6, "args": { "External id": 2274, "cbid": 211, "correlation": 2274 } }, { "ph": "s", "id": 2274, "pid": 76337, "tid": -914061504, "ts": 1716454216513905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216514842, "dur": 14, "args": { "External id": 2277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2277, "pid": 5, "tid": 7, "ts": 1716454216514842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513922, "dur": 7, "args": { "External id": 2277, "cbid": 211, "correlation": 2277 } }, { "ph": "s", "id": 2277, "pid": 76337, "tid": -914061504, "ts": 1716454216513922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216514857, "dur": 9, "args": { "External id": 2286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2286, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2286, "pid": 5, "tid": 7, "ts": 1716454216514857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216513990, "dur": 12, "args": { "External id": 2286, "cbid": 211, "correlation": 2286 } }, { "ph": "s", "id": 2286, "pid": 76337, "tid": -914061504, "ts": 1716454216513990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216514064, "dur": 0, "args": { "External id": 2296, "cbid": 317, "correlation": 2296 } }, { "ph": "f", "id": 2296, "pid": 76337, "tid": -914061504, "ts": 1716454216514064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216514065, "dur": 0, "args": { "External id": 2297, "cbid": 203, "correlation": 2297 } }, { "ph": "f", "id": 2297, "pid": 76337, "tid": -914061504, "ts": 1716454216514065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216514066, "dur": 0, "args": { "External id": 2298, "cbid": 205, "correlation": 2298 } }, { "ph": "f", "id": 2298, "pid": 76337, "tid": -914061504, "ts": 1716454216514066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216516857, "dur": 8, "args": { "External id": 2302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2302, "pid": 5, "tid": 7, "ts": 1716454216516857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216516833, "dur": 24, "args": { "External id": 2302, "cbid": 211, "correlation": 2302 } }, { "ph": "s", "id": 2302, "pid": 76337, "tid": -914061504, "ts": 1716454216516833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216516870, "dur": 4, "args": { "External id": 2304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.6, "warps per SM": 12.8, "grid": [1, 16, 8], "block": [256, 1, 1], "est. achieved occupancy %": 20 } }, { "ph": "f", "id": 2304, "pid": 5, "tid": 7, "ts": 1716454216516870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216516860, "dur": 6, "args": { "External id": 2304, "cbid": 211, "correlation": 2304 } }, { "ph": "s", "id": 2304, "pid": 76337, "tid": -914061504, "ts": 1716454216516860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216516906, "dur": 1, "args": { "External id": 2306, "device": 5, "context": 1, "stream": 7, "correlation": 2306, "bytes": 2688, "memory bandwidth (GB/s)": 1.423728813559322 } }, { "ph": "f", "id": 2306, "pid": 5, "tid": 7, "ts": 1716454216516906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216516875, "dur": 28, "args": { "External id": 2306, "cbid": 51, "correlation": 2306 } }, { "ph": "s", "id": 2306, "pid": 76337, "tid": -914061504, "ts": 1716454216516875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage1_warpsize2x1x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216517106, "dur": 58, "args": { "External id": 2307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2307, "registers per thread": 128, "shared memory": 12288, "blocks per SM": 8.4, "warps per SM": 33.6, "grid": [1, 48, 14], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 2307, "pid": 5, "tid": 7, "ts": 1716454216517106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216516914, "dur": 197, "args": { "External id": 2307, "cbid": 211, "correlation": 2307 } }, { "ph": "s", "id": 2307, "pid": 76337, "tid": -914061504, "ts": 1716454216516914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216517166, "dur": 6, "args": { "External id": 2309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [96, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 2309, "pid": 5, "tid": 7, "ts": 1716454216517166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216517118, "dur": 8, "args": { "External id": 2309, "cbid": 211, "correlation": 2309 } }, { "ph": "s", "id": 2309, "pid": 76337, "tid": -914061504, "ts": 1716454216517118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216517851, "dur": 7, "args": { "External id": 2315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 2315, "pid": 5, "tid": 7, "ts": 1716454216517851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216517838, "dur": 12, "args": { "External id": 2315, "cbid": 211, "correlation": 2315 } }, { "ph": "s", "id": 2315, "pid": 76337, "tid": -914061504, "ts": 1716454216517838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216518013, "dur": 0, "args": { "External id": 2325, "cbid": 317, "correlation": 2325 } }, { "ph": "f", "id": 2325, "pid": 76337, "tid": -914061504, "ts": 1716454216518013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216518014, "dur": 0, "args": { "External id": 2326, "cbid": 203, "correlation": 2326 } }, { "ph": "f", "id": 2326, "pid": 76337, "tid": -914061504, "ts": 1716454216518014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216518015, "dur": 0, "args": { "External id": 2327, "cbid": 205, "correlation": 2327 } }, { "ph": "f", "id": 2327, "pid": 76337, "tid": -914061504, "ts": 1716454216518015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216519949, "dur": 2, "args": { "External id": 2333, "cbid": 317, "correlation": 2333 } }, { "ph": "f", "id": 2333, "pid": 76337, "tid": -914061504, "ts": 1716454216519949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216519952, "dur": 6784, "args": { "External id": 2334, "cbid": 20, "correlation": 2334 } }, { "ph": "f", "id": 2334, "pid": 76337, "tid": -914061504, "ts": 1716454216519952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216526790, "dur": 69, "args": { "External id": 2335, "cbid": 251, "correlation": 2335 } }, { "ph": "f", "id": 2335, "pid": 76337, "tid": -914061504, "ts": 1716454216526790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216526887, "dur": 7, "args": { "External id": 2336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2336, "registers per thread": 106, "shared memory": 16640, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 2336, "pid": 5, "tid": 7, "ts": 1716454216526887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216526866, "dur": 22, "args": { "External id": 2336, "cbid": 211, "correlation": 2336 } }, { "ph": "s", "id": 2336, "pid": 76337, "tid": -914061504, "ts": 1716454216526866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216527553, "dur": 5, "args": { "External id": 2342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 2342, "pid": 5, "tid": 7, "ts": 1716454216527553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216527540, "dur": 13, "args": { "External id": 2342, "cbid": 211, "correlation": 2342 } }, { "ph": "s", "id": 2342, "pid": 76337, "tid": -914061504, "ts": 1716454216527540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216527952, "dur": 3, "args": { "External id": 2350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2350, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 2350, "pid": 5, "tid": 7, "ts": 1716454216527952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216527878, "dur": 75, "args": { "External id": 2350, "cbid": 211, "correlation": 2350 } }, { "ph": "s", "id": 2350, "pid": 76337, "tid": -914061504, "ts": 1716454216527878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216528081, "dur": 3, "args": { "External id": 2358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2358, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 2358, "pid": 5, "tid": 7, "ts": 1716454216528081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528018, "dur": 63, "args": { "External id": 2358, "cbid": 211, "correlation": 2358 } }, { "ph": "s", "id": 2358, "pid": 76337, "tid": -914061504, "ts": 1716454216528018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216528221, "dur": 3, "args": { "External id": 2366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2366, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 2366, "pid": 5, "tid": 7, "ts": 1716454216528221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528159, "dur": 62, "args": { "External id": 2366, "cbid": 211, "correlation": 2366 } }, { "ph": "s", "id": 2366, "pid": 76337, "tid": -914061504, "ts": 1716454216528159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216528254, "dur": 3, "args": { "External id": 2374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2374, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 2374, "pid": 5, "tid": 7, "ts": 1716454216528254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528244, "dur": 9, "args": { "External id": 2374, "cbid": 211, "correlation": 2374 } }, { "ph": "s", "id": 2374, "pid": 76337, "tid": -914061504, "ts": 1716454216528244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216528528, "dur": 5, "args": { "External id": 2383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2383, "registers per thread": 32, "shared memory": 0, "blocks per SM": 8, "warps per SM": 128, "grid": [160, 4, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2383, "pid": 5, "tid": 7, "ts": 1716454216528528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528447, "dur": 85, "args": { "External id": 2383, "cbid": 211, "correlation": 2383 } }, { "ph": "s", "id": 2383, "pid": 76337, "tid": -914061504, "ts": 1716454216528447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216528672, "dur": 3, "args": { "External id": 2394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 2394, "pid": 5, "tid": 7, "ts": 1716454216528672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528657, "dur": 15, "args": { "External id": 2394, "cbid": 211, "correlation": 2394 } }, { "ph": "s", "id": 2394, "pid": 76337, "tid": -914061504, "ts": 1716454216528657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216528708, "dur": 5, "args": { "External id": 2404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2404, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2404, "pid": 5, "tid": 7, "ts": 1716454216528708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216528698, "dur": 9, "args": { "External id": 2404, "cbid": 211, "correlation": 2404 } }, { "ph": "s", "id": 2404, "pid": 76337, "tid": -914061504, "ts": 1716454216528698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454216529589, "dur": 95, "args": { "External id": 2419, "device": 5, "context": 1, "stream": 7, "correlation": 2419, "bytes": 1179648, "memory bandwidth (GB/s)": 12.362302589522443 } }, { "ph": "f", "id": 2419, "pid": 5, "tid": 7, "ts": 1716454216529589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216529336, "dur": 271, "args": { "External id": 2419, "cbid": 41, "correlation": 2419 } }, { "ph": "s", "id": 2419, "pid": 76337, "tid": -914061504, "ts": 1716454216529336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454216529609, "dur": 87, "args": { "External id": 2420, "cbid": 131, "correlation": 2420 } }, { "ph": "f", "id": 2420, "pid": 76337, "tid": -914061504, "ts": 1716454216529609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216529729, "dur": 8, "args": { "External id": 2424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 2424, "pid": 5, "tid": 7, "ts": 1716454216529729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216529710, "dur": 18, "args": { "External id": 2424, "cbid": 211, "correlation": 2424 } }, { "ph": "s", "id": 2424, "pid": 76337, "tid": -914061504, "ts": 1716454216529710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454216530735, "dur": 94, "args": { "External id": 2440, "device": 5, "context": 1, "stream": 7, "correlation": 2440, "bytes": 1179648, "memory bandwidth (GB/s)": 12.496403563597072 } }, { "ph": "f", "id": 2440, "pid": 5, "tid": 7, "ts": 1716454216530735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216530511, "dur": 241, "args": { "External id": 2440, "cbid": 41, "correlation": 2440 } }, { "ph": "s", "id": 2440, "pid": 76337, "tid": -914061504, "ts": 1716454216530511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454216530753, "dur": 83, "args": { "External id": 2441, "cbid": 131, "correlation": 2441 } }, { "ph": "f", "id": 2441, "pid": 76337, "tid": -914061504, "ts": 1716454216530753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216530861, "dur": 6, "args": { "External id": 2445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 2445, "pid": 5, "tid": 7, "ts": 1716454216530861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216530846, "dur": 14, "args": { "External id": 2445, "cbid": 211, "correlation": 2445 } }, { "ph": "s", "id": 2445, "pid": 76337, "tid": -914061504, "ts": 1716454216530846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216531096, "dur": 5, "args": { "External id": 2460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0875, "warps per SM": 0.35, "grid": [7, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 2460, "pid": 5, "tid": 7, "ts": 1716454216531096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531080, "dur": 16, "args": { "External id": 2460, "cbid": 211, "correlation": 2460 } }, { "ph": "s", "id": 2460, "pid": 76337, "tid": -914061504, "ts": 1716454216531080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216531113, "dur": 1, "args": { "External id": 2467, "cbid": 317, "correlation": 2467 } }, { "ph": "f", "id": 2467, "pid": 76337, "tid": -914061504, "ts": 1716454216531113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216531115, "dur": 1, "args": { "External id": 2468, "cbid": 203, "correlation": 2468 } }, { "ph": "f", "id": 2468, "pid": 76337, "tid": -914061504, "ts": 1716454216531115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216531116, "dur": 1, "args": { "External id": 2469, "cbid": 205, "correlation": 2469 } }, { "ph": "f", "id": 2469, "pid": 76337, "tid": -914061504, "ts": 1716454216531116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_tensor8x8x4_aligna2_alignc8_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216531167, "dur": 199, "args": { "External id": 2473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2473, "registers per thread": 111, "shared memory": 4096, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [4, 1536, 1], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 2473, "pid": 5, "tid": 7, "ts": 1716454216531167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531153, "dur": 14, "args": { "External id": 2473, "cbid": 211, "correlation": 2473 } }, { "ph": "s", "id": 2473, "pid": 76337, "tid": -914061504, "ts": 1716454216531153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216531368, "dur": 180, "args": { "External id": 2479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2479, "pid": 5, "tid": 7, "ts": 1716454216531368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531203, "dur": 10, "args": { "External id": 2479, "cbid": 211, "correlation": 2479 } }, { "ph": "s", "id": 2479, "pid": 76337, "tid": -914061504, "ts": 1716454216531203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216531549, "dur": 1803, "args": { "External id": 2489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2489, "pid": 5, "tid": 7, "ts": 1716454216531549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531327, "dur": 12, "args": { "External id": 2489, "cbid": 211, "correlation": 2489 } }, { "ph": "s", "id": 2489, "pid": 76337, "tid": -914061504, "ts": 1716454216531327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216533354, "dur": 704, "args": { "External id": 2510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2510, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2510, "pid": 5, "tid": 7, "ts": 1716454216533354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531372, "dur": 10, "args": { "External id": 2510, "cbid": 211, "correlation": 2510 } }, { "ph": "s", "id": 2510, "pid": 76337, "tid": -914061504, "ts": 1716454216531372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216534059, "dur": 5, "args": { "External id": 2522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2522, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2522, "pid": 5, "tid": 7, "ts": 1716454216534059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531395, "dur": 7, "args": { "External id": 2522, "cbid": 211, "correlation": 2522 } }, { "ph": "s", "id": 2522, "pid": 76337, "tid": -914061504, "ts": 1716454216531395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216534065, "dur": 178, "args": { "External id": 2525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2525, "pid": 5, "tid": 7, "ts": 1716454216534065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531419, "dur": 8, "args": { "External id": 2525, "cbid": 211, "correlation": 2525 } }, { "ph": "s", "id": 2525, "pid": 76337, "tid": -914061504, "ts": 1716454216531419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216534244, "dur": 110, "args": { "External id": 2534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2534, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2534, "pid": 5, "tid": 7, "ts": 1716454216534244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531476, "dur": 11, "args": { "External id": 2534, "cbid": 211, "correlation": 2534 } }, { "ph": "s", "id": 2534, "pid": 76337, "tid": -914061504, "ts": 1716454216531476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216531533, "dur": 0, "args": { "External id": 2544, "cbid": 317, "correlation": 2544 } }, { "ph": "f", "id": 2544, "pid": 76337, "tid": -914061504, "ts": 1716454216531533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216531533, "dur": 0, "args": { "External id": 2545, "cbid": 203, "correlation": 2545 } }, { "ph": "f", "id": 2545, "pid": 76337, "tid": -914061504, "ts": 1716454216531533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216531534, "dur": 0, "args": { "External id": 2546, "cbid": 205, "correlation": 2546 } }, { "ph": "f", "id": 2546, "pid": 76337, "tid": -914061504, "ts": 1716454216531534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216534356, "dur": 143, "args": { "External id": 2550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2550, "pid": 5, "tid": 7, "ts": 1716454216534356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531559, "dur": 14, "args": { "External id": 2550, "cbid": 211, "correlation": 2550 } }, { "ph": "s", "id": 2550, "pid": 76337, "tid": -914061504, "ts": 1716454216531559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216534501, "dur": 6, "args": { "External id": 2552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2552, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 2552, "pid": 5, "tid": 7, "ts": 1716454216534501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531576, "dur": 5, "args": { "External id": 2552, "cbid": 211, "correlation": 2552 } }, { "ph": "s", "id": 2552, "pid": 76337, "tid": -914061504, "ts": 1716454216531576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216534509, "dur": 5, "args": { "External id": 2554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2554, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2554, "pid": 5, "tid": 7, "ts": 1716454216534509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531589, "dur": 8, "args": { "External id": 2554, "cbid": 211, "correlation": 2554 } }, { "ph": "s", "id": 2554, "pid": 76337, "tid": -914061504, "ts": 1716454216531589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216531602, "dur": 0, "args": { "External id": 2555, "cbid": 51, "correlation": 2555 } }, { "ph": "s", "id": 2555, "pid": 76337, "tid": -914061504, "ts": 1716454216531602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216534515, "dur": 824, "args": { "External id": 2556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2556, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2556, "pid": 5, "tid": 7, "ts": 1716454216534515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531604, "dur": 7, "args": { "External id": 2556, "cbid": 211, "correlation": 2556 } }, { "ph": "s", "id": 2556, "pid": 76337, "tid": -914061504, "ts": 1716454216531604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216535340, "dur": 176, "args": { "External id": 2561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2561, "pid": 5, "tid": 7, "ts": 1716454216535340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531634, "dur": 8, "args": { "External id": 2561, "cbid": 211, "correlation": 2561 } }, { "ph": "s", "id": 2561, "pid": 76337, "tid": -914061504, "ts": 1716454216531634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216535518, "dur": 699, "args": { "External id": 2581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2581, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2581, "pid": 5, "tid": 7, "ts": 1716454216535518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531702, "dur": 11, "args": { "External id": 2581, "cbid": 211, "correlation": 2581 } }, { "ph": "s", "id": 2581, "pid": 76337, "tid": -914061504, "ts": 1716454216531702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216536218, "dur": 4, "args": { "External id": 2593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2593, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2593, "pid": 5, "tid": 7, "ts": 1716454216536218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531723, "dur": 6, "args": { "External id": 2593, "cbid": 211, "correlation": 2593 } }, { "ph": "s", "id": 2593, "pid": 76337, "tid": -914061504, "ts": 1716454216531723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216536224, "dur": 177, "args": { "External id": 2596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2596, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2596, "pid": 5, "tid": 7, "ts": 1716454216536224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531741, "dur": 7, "args": { "External id": 2596, "cbid": 211, "correlation": 2596 } }, { "ph": "s", "id": 2596, "pid": 76337, "tid": -914061504, "ts": 1716454216531741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216536403, "dur": 109, "args": { "External id": 2605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2605, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2605, "pid": 5, "tid": 7, "ts": 1716454216536403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531779, "dur": 9, "args": { "External id": 2605, "cbid": 211, "correlation": 2605 } }, { "ph": "s", "id": 2605, "pid": 76337, "tid": -914061504, "ts": 1716454216531779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216531854, "dur": 0, "args": { "External id": 2615, "cbid": 317, "correlation": 2615 } }, { "ph": "f", "id": 2615, "pid": 76337, "tid": -914061504, "ts": 1716454216531854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216531855, "dur": 0, "args": { "External id": 2616, "cbid": 203, "correlation": 2616 } }, { "ph": "f", "id": 2616, "pid": 76337, "tid": -914061504, "ts": 1716454216531855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216531856, "dur": 0, "args": { "External id": 2617, "cbid": 205, "correlation": 2617 } }, { "ph": "f", "id": 2617, "pid": 76337, "tid": -914061504, "ts": 1716454216531856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216536514, "dur": 128, "args": { "External id": 2621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2621, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2621, "pid": 5, "tid": 7, "ts": 1716454216536514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531869, "dur": 11, "args": { "External id": 2621, "cbid": 211, "correlation": 2621 } }, { "ph": "s", "id": 2621, "pid": 76337, "tid": -914061504, "ts": 1716454216531869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216536643, "dur": 6, "args": { "External id": 2623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 2623, "pid": 5, "tid": 7, "ts": 1716454216536643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531882, "dur": 5, "args": { "External id": 2623, "cbid": 211, "correlation": 2623 } }, { "ph": "s", "id": 2623, "pid": 76337, "tid": -914061504, "ts": 1716454216531882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216536651, "dur": 5, "args": { "External id": 2625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2625, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2625, "pid": 5, "tid": 7, "ts": 1716454216536651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531892, "dur": 5, "args": { "External id": 2625, "cbid": 211, "correlation": 2625 } }, { "ph": "s", "id": 2625, "pid": 76337, "tid": -914061504, "ts": 1716454216531892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216531900, "dur": 0, "args": { "External id": 2626, "cbid": 51, "correlation": 2626 } }, { "ph": "s", "id": 2626, "pid": 76337, "tid": -914061504, "ts": 1716454216531900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216536657, "dur": 818, "args": { "External id": 2627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2627, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2627, "pid": 5, "tid": 7, "ts": 1716454216536657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531901, "dur": 5, "args": { "External id": 2627, "cbid": 211, "correlation": 2627 } }, { "ph": "s", "id": 2627, "pid": 76337, "tid": -914061504, "ts": 1716454216531901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216537476, "dur": 177, "args": { "External id": 2632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2632, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2632, "pid": 5, "tid": 7, "ts": 1716454216537476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531928, "dur": 8, "args": { "External id": 2632, "cbid": 211, "correlation": 2632 } }, { "ph": "s", "id": 2632, "pid": 76337, "tid": -914061504, "ts": 1716454216531928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216537655, "dur": 750, "args": { "External id": 2640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2640, "pid": 5, "tid": 7, "ts": 1716454216537655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216531967, "dur": 19, "args": { "External id": 2640, "cbid": 211, "correlation": 2640 } }, { "ph": "s", "id": 2640, "pid": 76337, "tid": -914061504, "ts": 1716454216531967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216538406, "dur": 104, "args": { "External id": 2648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2648, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2648, "pid": 5, "tid": 7, "ts": 1716454216538406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532020, "dur": 12, "args": { "External id": 2648, "cbid": 211, "correlation": 2648 } }, { "ph": "s", "id": 2648, "pid": 76337, "tid": -914061504, "ts": 1716454216532020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216538511, "dur": 1797, "args": { "External id": 2658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2658, "pid": 5, "tid": 7, "ts": 1716454216538511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532094, "dur": 12, "args": { "External id": 2658, "cbid": 211, "correlation": 2658 } }, { "ph": "s", "id": 2658, "pid": 76337, "tid": -914061504, "ts": 1716454216532094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216540310, "dur": 702, "args": { "External id": 2679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2679, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2679, "pid": 5, "tid": 7, "ts": 1716454216540310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532125, "dur": 7, "args": { "External id": 2679, "cbid": 211, "correlation": 2679 } }, { "ph": "s", "id": 2679, "pid": 76337, "tid": -914061504, "ts": 1716454216532125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216541013, "dur": 4, "args": { "External id": 2691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2691, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2691, "pid": 5, "tid": 7, "ts": 1716454216541013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532142, "dur": 6, "args": { "External id": 2691, "cbid": 211, "correlation": 2691 } }, { "ph": "s", "id": 2691, "pid": 76337, "tid": -914061504, "ts": 1716454216532142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216541019, "dur": 178, "args": { "External id": 2694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2694, "pid": 5, "tid": 7, "ts": 1716454216541019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532158, "dur": 6, "args": { "External id": 2694, "cbid": 211, "correlation": 2694 } }, { "ph": "s", "id": 2694, "pid": 76337, "tid": -914061504, "ts": 1716454216532158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216541198, "dur": 109, "args": { "External id": 2703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2703, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2703, "pid": 5, "tid": 7, "ts": 1716454216541198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532197, "dur": 10, "args": { "External id": 2703, "cbid": 211, "correlation": 2703 } }, { "ph": "s", "id": 2703, "pid": 76337, "tid": -914061504, "ts": 1716454216532197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216532250, "dur": 0, "args": { "External id": 2713, "cbid": 317, "correlation": 2713 } }, { "ph": "f", "id": 2713, "pid": 76337, "tid": -914061504, "ts": 1716454216532250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216532250, "dur": 0, "args": { "External id": 2714, "cbid": 203, "correlation": 2714 } }, { "ph": "f", "id": 2714, "pid": 76337, "tid": -914061504, "ts": 1716454216532250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216532251, "dur": 0, "args": { "External id": 2715, "cbid": 205, "correlation": 2715 } }, { "ph": "f", "id": 2715, "pid": 76337, "tid": -914061504, "ts": 1716454216532251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216541309, "dur": 135, "args": { "External id": 2719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2719, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2719, "pid": 5, "tid": 7, "ts": 1716454216541309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532264, "dur": 12, "args": { "External id": 2719, "cbid": 211, "correlation": 2719 } }, { "ph": "s", "id": 2719, "pid": 76337, "tid": -914061504, "ts": 1716454216532264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216541445, "dur": 6, "args": { "External id": 2721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2721, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 2721, "pid": 5, "tid": 7, "ts": 1716454216541445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532279, "dur": 5, "args": { "External id": 2721, "cbid": 211, "correlation": 2721 } }, { "ph": "s", "id": 2721, "pid": 76337, "tid": -914061504, "ts": 1716454216532279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216541453, "dur": 5, "args": { "External id": 2723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2723, "pid": 5, "tid": 7, "ts": 1716454216541453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532288, "dur": 5, "args": { "External id": 2723, "cbid": 211, "correlation": 2723 } }, { "ph": "s", "id": 2723, "pid": 76337, "tid": -914061504, "ts": 1716454216532288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216532296, "dur": 0, "args": { "External id": 2724, "cbid": 51, "correlation": 2724 } }, { "ph": "s", "id": 2724, "pid": 76337, "tid": -914061504, "ts": 1716454216532296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216541460, "dur": 817, "args": { "External id": 2725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2725, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2725, "pid": 5, "tid": 7, "ts": 1716454216541460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532297, "dur": 5, "args": { "External id": 2725, "cbid": 211, "correlation": 2725 } }, { "ph": "s", "id": 2725, "pid": 76337, "tid": -914061504, "ts": 1716454216532297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216542278, "dur": 175, "args": { "External id": 2730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2730, "pid": 5, "tid": 7, "ts": 1716454216542278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532322, "dur": 9, "args": { "External id": 2730, "cbid": 211, "correlation": 2730 } }, { "ph": "s", "id": 2730, "pid": 76337, "tid": -914061504, "ts": 1716454216532322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216542454, "dur": 698, "args": { "External id": 2750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2750, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2750, "pid": 5, "tid": 7, "ts": 1716454216542454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532390, "dur": 11, "args": { "External id": 2750, "cbid": 211, "correlation": 2750 } }, { "ph": "s", "id": 2750, "pid": 76337, "tid": -914061504, "ts": 1716454216532390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216543154, "dur": 4, "args": { "External id": 2762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2762, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2762, "pid": 5, "tid": 7, "ts": 1716454216543154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532410, "dur": 6, "args": { "External id": 2762, "cbid": 211, "correlation": 2762 } }, { "ph": "s", "id": 2762, "pid": 76337, "tid": -914061504, "ts": 1716454216532410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216543159, "dur": 175, "args": { "External id": 2765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2765, "pid": 5, "tid": 7, "ts": 1716454216543159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532427, "dur": 6, "args": { "External id": 2765, "cbid": 211, "correlation": 2765 } }, { "ph": "s", "id": 2765, "pid": 76337, "tid": -914061504, "ts": 1716454216532427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216543336, "dur": 109, "args": { "External id": 2774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2774, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2774, "pid": 5, "tid": 7, "ts": 1716454216543336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532466, "dur": 9, "args": { "External id": 2774, "cbid": 211, "correlation": 2774 } }, { "ph": "s", "id": 2774, "pid": 76337, "tid": -914061504, "ts": 1716454216532466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216532527, "dur": 0, "args": { "External id": 2784, "cbid": 317, "correlation": 2784 } }, { "ph": "f", "id": 2784, "pid": 76337, "tid": -914061504, "ts": 1716454216532527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216532527, "dur": 0, "args": { "External id": 2785, "cbid": 203, "correlation": 2785 } }, { "ph": "f", "id": 2785, "pid": 76337, "tid": -914061504, "ts": 1716454216532527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216532528, "dur": 0, "args": { "External id": 2786, "cbid": 205, "correlation": 2786 } }, { "ph": "f", "id": 2786, "pid": 76337, "tid": -914061504, "ts": 1716454216532528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216543446, "dur": 132, "args": { "External id": 2790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2790, "pid": 5, "tid": 7, "ts": 1716454216543446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532540, "dur": 11, "args": { "External id": 2790, "cbid": 211, "correlation": 2790 } }, { "ph": "s", "id": 2790, "pid": 76337, "tid": -914061504, "ts": 1716454216532540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216543580, "dur": 6, "args": { "External id": 2792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 2792, "pid": 5, "tid": 7, "ts": 1716454216543580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532554, "dur": 5, "args": { "External id": 2792, "cbid": 211, "correlation": 2792 } }, { "ph": "s", "id": 2792, "pid": 76337, "tid": -914061504, "ts": 1716454216532554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216543588, "dur": 5, "args": { "External id": 2794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2794, "pid": 5, "tid": 7, "ts": 1716454216543588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532563, "dur": 6, "args": { "External id": 2794, "cbid": 211, "correlation": 2794 } }, { "ph": "s", "id": 2794, "pid": 76337, "tid": -914061504, "ts": 1716454216532563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216532572, "dur": 0, "args": { "External id": 2795, "cbid": 51, "correlation": 2795 } }, { "ph": "s", "id": 2795, "pid": 76337, "tid": -914061504, "ts": 1716454216532572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216543594, "dur": 817, "args": { "External id": 2796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2796, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2796, "pid": 5, "tid": 7, "ts": 1716454216543594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532573, "dur": 5, "args": { "External id": 2796, "cbid": 211, "correlation": 2796 } }, { "ph": "s", "id": 2796, "pid": 76337, "tid": -914061504, "ts": 1716454216532573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216544413, "dur": 177, "args": { "External id": 2801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2801, "pid": 5, "tid": 7, "ts": 1716454216544413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532598, "dur": 8, "args": { "External id": 2801, "cbid": 211, "correlation": 2801 } }, { "ph": "s", "id": 2801, "pid": 76337, "tid": -914061504, "ts": 1716454216532598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216544591, "dur": 746, "args": { "External id": 2809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2809, "pid": 5, "tid": 7, "ts": 1716454216544591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532632, "dur": 9, "args": { "External id": 2809, "cbid": 211, "correlation": 2809 } }, { "ph": "s", "id": 2809, "pid": 76337, "tid": -914061504, "ts": 1716454216532632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216545339, "dur": 103, "args": { "External id": 2817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2817, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2817, "pid": 5, "tid": 7, "ts": 1716454216545339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532662, "dur": 8, "args": { "External id": 2817, "cbid": 211, "correlation": 2817 } }, { "ph": "s", "id": 2817, "pid": 76337, "tid": -914061504, "ts": 1716454216532662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216545443, "dur": 88, "args": { "External id": 2827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 617.2125, "warps per SM": 2468.85, "grid": [49377, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2827, "pid": 5, "tid": 7, "ts": 1716454216545443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532738, "dur": 12, "args": { "External id": 2827, "cbid": 211, "correlation": 2827 } }, { "ph": "s", "id": 2827, "pid": 76337, "tid": -914061504, "ts": 1716454216532738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216545533, "dur": 166, "args": { "External id": 2832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2832, "pid": 5, "tid": 7, "ts": 1716454216545533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532778, "dur": 7, "args": { "External id": 2832, "cbid": 211, "correlation": 2832 } }, { "ph": "s", "id": 2832, "pid": 76337, "tid": -914061504, "ts": 1716454216532778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216545700, "dur": 6, "args": { "External id": 2847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.6, "warps per SM": 14.4, "grid": [288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 22 } }, { "ph": "f", "id": 2847, "pid": 5, "tid": 7, "ts": 1716454216545700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532845, "dur": 12, "args": { "External id": 2847, "cbid": 211, "correlation": 2847 } }, { "ph": "s", "id": 2847, "pid": 76337, "tid": -914061504, "ts": 1716454216532845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216532866, "dur": 0, "args": { "External id": 2854, "cbid": 317, "correlation": 2854 } }, { "ph": "f", "id": 2854, "pid": 76337, "tid": -914061504, "ts": 1716454216532866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216532867, "dur": 0, "args": { "External id": 2855, "cbid": 203, "correlation": 2855 } }, { "ph": "f", "id": 2855, "pid": 76337, "tid": -914061504, "ts": 1716454216532867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216532868, "dur": 0, "args": { "External id": 2856, "cbid": 205, "correlation": 2856 } }, { "ph": "f", "id": 2856, "pid": 76337, "tid": -914061504, "ts": 1716454216532868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216545707, "dur": 265, "args": { "External id": 2860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2860, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [1, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2860, "pid": 5, "tid": 7, "ts": 1716454216545707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532887, "dur": 12, "args": { "External id": 2860, "cbid": 211, "correlation": 2860 } }, { "ph": "s", "id": 2860, "pid": 76337, "tid": -914061504, "ts": 1716454216532887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216545973, "dur": 49, "args": { "External id": 2866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2866, "pid": 5, "tid": 7, "ts": 1716454216545973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216532921, "dur": 9, "args": { "External id": 2866, "cbid": 211, "correlation": 2866 } }, { "ph": "s", "id": 2866, "pid": 76337, "tid": -914061504, "ts": 1716454216532921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216546023, "dur": 144, "args": { "External id": 2876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2876, "pid": 5, "tid": 7, "ts": 1716454216546023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533014, "dur": 13, "args": { "External id": 2876, "cbid": 211, "correlation": 2876 } }, { "ph": "s", "id": 2876, "pid": 76337, "tid": -914061504, "ts": 1716454216533014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216546169, "dur": 180, "args": { "External id": 2897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2897, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2897, "pid": 5, "tid": 7, "ts": 1716454216546169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533050, "dur": 7, "args": { "External id": 2897, "cbid": 211, "correlation": 2897 } }, { "ph": "s", "id": 2897, "pid": 76337, "tid": -914061504, "ts": 1716454216533050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216546351, "dur": 4, "args": { "External id": 2909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2909, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2909, "pid": 5, "tid": 7, "ts": 1716454216546351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533066, "dur": 6, "args": { "External id": 2909, "cbid": 211, "correlation": 2909 } }, { "ph": "s", "id": 2909, "pid": 76337, "tid": -914061504, "ts": 1716454216533066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216546356, "dur": 48, "args": { "External id": 2912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2912, "pid": 5, "tid": 7, "ts": 1716454216546356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533083, "dur": 6, "args": { "External id": 2912, "cbid": 211, "correlation": 2912 } }, { "ph": "s", "id": 2912, "pid": 76337, "tid": -914061504, "ts": 1716454216533083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216546406, "dur": 31, "args": { "External id": 2921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2921, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2921, "pid": 5, "tid": 7, "ts": 1716454216546406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533123, "dur": 9, "args": { "External id": 2921, "cbid": 211, "correlation": 2921 } }, { "ph": "s", "id": 2921, "pid": 76337, "tid": -914061504, "ts": 1716454216533123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216533176, "dur": 0, "args": { "External id": 2931, "cbid": 317, "correlation": 2931 } }, { "ph": "f", "id": 2931, "pid": 76337, "tid": -914061504, "ts": 1716454216533176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216533177, "dur": 0, "args": { "External id": 2932, "cbid": 203, "correlation": 2932 } }, { "ph": "f", "id": 2932, "pid": 76337, "tid": -914061504, "ts": 1716454216533177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216533177, "dur": 0, "args": { "External id": 2933, "cbid": 205, "correlation": 2933 } }, { "ph": "f", "id": 2933, "pid": 76337, "tid": -914061504, "ts": 1716454216533177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216546439, "dur": 39, "args": { "External id": 2937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2937, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [1536, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2937, "pid": 5, "tid": 7, "ts": 1716454216546439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533192, "dur": 11, "args": { "External id": 2937, "cbid": 211, "correlation": 2937 } }, { "ph": "s", "id": 2937, "pid": 76337, "tid": -914061504, "ts": 1716454216533192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216546480, "dur": 8, "args": { "External id": 2939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2939, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12.8, "warps per SM": 102.4, "grid": [1, 4, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2939, "pid": 5, "tid": 7, "ts": 1716454216546480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533207, "dur": 5, "args": { "External id": 2939, "cbid": 211, "correlation": 2939 } }, { "ph": "s", "id": 2939, "pid": 76337, "tid": -914061504, "ts": 1716454216533207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216546489, "dur": 4, "args": { "External id": 2941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 2941, "pid": 5, "tid": 7, "ts": 1716454216546489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533217, "dur": 6, "args": { "External id": 2941, "cbid": 211, "correlation": 2941 } }, { "ph": "s", "id": 2941, "pid": 76337, "tid": -914061504, "ts": 1716454216533217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216533227, "dur": 0, "args": { "External id": 2942, "cbid": 51, "correlation": 2942 } }, { "ph": "s", "id": 2942, "pid": 76337, "tid": -914061504, "ts": 1716454216533227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216546495, "dur": 435, "args": { "External id": 2943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2943, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [192, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 2943, "pid": 5, "tid": 7, "ts": 1716454216546495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533228, "dur": 6, "args": { "External id": 2943, "cbid": 211, "correlation": 2943 } }, { "ph": "s", "id": 2943, "pid": 76337, "tid": -914061504, "ts": 1716454216533228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216546931, "dur": 91, "args": { "External id": 2948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2948, "pid": 5, "tid": 7, "ts": 1716454216546931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533257, "dur": 9, "args": { "External id": 2948, "cbid": 211, "correlation": 2948 } }, { "ph": "s", "id": 2948, "pid": 76337, "tid": -914061504, "ts": 1716454216533257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216547024, "dur": 367, "args": { "External id": 2968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2968, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 2968, "pid": 5, "tid": 7, "ts": 1716454216547024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533326, "dur": 11, "args": { "External id": 2968, "cbid": 211, "correlation": 2968 } }, { "ph": "s", "id": 2968, "pid": 76337, "tid": -914061504, "ts": 1716454216533326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216547393, "dur": 4, "args": { "External id": 2980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2980, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 2980, "pid": 5, "tid": 7, "ts": 1716454216547393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533349, "dur": 6, "args": { "External id": 2980, "cbid": 211, "correlation": 2980 } }, { "ph": "s", "id": 2980, "pid": 76337, "tid": -914061504, "ts": 1716454216533349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216547398, "dur": 90, "args": { "External id": 2983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2983, "pid": 5, "tid": 7, "ts": 1716454216547398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533366, "dur": 8, "args": { "External id": 2983, "cbid": 211, "correlation": 2983 } }, { "ph": "s", "id": 2983, "pid": 76337, "tid": -914061504, "ts": 1716454216533366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216547490, "dur": 58, "args": { "External id": 2992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 2992, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 2992, "pid": 5, "tid": 7, "ts": 1716454216547490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533406, "dur": 9, "args": { "External id": 2992, "cbid": 211, "correlation": 2992 } }, { "ph": "s", "id": 2992, "pid": 76337, "tid": -914061504, "ts": 1716454216533406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216533468, "dur": 0, "args": { "External id": 3002, "cbid": 317, "correlation": 3002 } }, { "ph": "f", "id": 3002, "pid": 76337, "tid": -914061504, "ts": 1716454216533468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216533469, "dur": 0, "args": { "External id": 3003, "cbid": 203, "correlation": 3003 } }, { "ph": "f", "id": 3003, "pid": 76337, "tid": -914061504, "ts": 1716454216533469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216533470, "dur": 0, "args": { "External id": 3004, "cbid": 205, "correlation": 3004 } }, { "ph": "f", "id": 3004, "pid": 76337, "tid": -914061504, "ts": 1716454216533470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216547549, "dur": 102, "args": { "External id": 3008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3008, "pid": 5, "tid": 7, "ts": 1716454216547549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533485, "dur": 12, "args": { "External id": 3008, "cbid": 211, "correlation": 3008 } }, { "ph": "s", "id": 3008, "pid": 76337, "tid": -914061504, "ts": 1716454216533485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216547653, "dur": 12, "args": { "External id": 3010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3010, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3010, "pid": 5, "tid": 7, "ts": 1716454216547653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533499, "dur": 5, "args": { "External id": 3010, "cbid": 211, "correlation": 3010 } }, { "ph": "s", "id": 3010, "pid": 76337, "tid": -914061504, "ts": 1716454216533499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216547666, "dur": 4, "args": { "External id": 3012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 3012, "pid": 5, "tid": 7, "ts": 1716454216547666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533509, "dur": 5, "args": { "External id": 3012, "cbid": 211, "correlation": 3012 } }, { "ph": "s", "id": 3012, "pid": 76337, "tid": -914061504, "ts": 1716454216533509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216533517, "dur": 0, "args": { "External id": 3013, "cbid": 51, "correlation": 3013 } }, { "ph": "s", "id": 3013, "pid": 76337, "tid": -914061504, "ts": 1716454216533517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216547671, "dur": 777, "args": { "External id": 3014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3014, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3014, "pid": 5, "tid": 7, "ts": 1716454216547671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533518, "dur": 5, "args": { "External id": 3014, "cbid": 211, "correlation": 3014 } }, { "ph": "s", "id": 3014, "pid": 76337, "tid": -914061504, "ts": 1716454216533518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216548449, "dur": 91, "args": { "External id": 3019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3019, "pid": 5, "tid": 7, "ts": 1716454216548449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533546, "dur": 9, "args": { "External id": 3019, "cbid": 211, "correlation": 3019 } }, { "ph": "s", "id": 3019, "pid": 76337, "tid": -914061504, "ts": 1716454216533546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216533604, "dur": 0, "args": { "External id": 3029, "cbid": 317, "correlation": 3029 } }, { "ph": "f", "id": 3029, "pid": 76337, "tid": -914061504, "ts": 1716454216533604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216533604, "dur": 0, "args": { "External id": 3030, "cbid": 203, "correlation": 3030 } }, { "ph": "f", "id": 3030, "pid": 76337, "tid": -914061504, "ts": 1716454216533604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216533605, "dur": 0, "args": { "External id": 3031, "cbid": 205, "correlation": 3031 } }, { "ph": "f", "id": 3031, "pid": 76337, "tid": -914061504, "ts": 1716454216533605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216533667, "dur": 4, "args": { "External id": 3035, "cbid": 251, "correlation": 3035 } }, { "ph": "f", "id": 3035, "pid": 76337, "tid": -914061504, "ts": 1716454216533667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216548541, "dur": 85, "args": { "External id": 3036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3036, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [2, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3036, "pid": 5, "tid": 7, "ts": 1716454216548541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533677, "dur": 15, "args": { "External id": 3036, "cbid": 211, "correlation": 3036 } }, { "ph": "s", "id": 3036, "pid": 76337, "tid": -914061504, "ts": 1716454216533677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216548627, "dur": 92, "args": { "External id": 3042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3042, "pid": 5, "tid": 7, "ts": 1716454216548627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533714, "dur": 9, "args": { "External id": 3042, "cbid": 211, "correlation": 3042 } }, { "ph": "s", "id": 3042, "pid": 76337, "tid": -914061504, "ts": 1716454216533714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216548720, "dur": 254, "args": { "External id": 3050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3050, "pid": 5, "tid": 7, "ts": 1716454216548720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533744, "dur": 8, "args": { "External id": 3050, "cbid": 211, "correlation": 3050 } }, { "ph": "s", "id": 3050, "pid": 76337, "tid": -914061504, "ts": 1716454216533744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216548976, "dur": 54, "args": { "External id": 3058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3058, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3058, "pid": 5, "tid": 7, "ts": 1716454216548976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533774, "dur": 8, "args": { "External id": 3058, "cbid": 211, "correlation": 3058 } }, { "ph": "s", "id": 3058, "pid": 76337, "tid": -914061504, "ts": 1716454216533774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216549031, "dur": 370, "args": { "External id": 3068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3068, "pid": 5, "tid": 7, "ts": 1716454216549031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533843, "dur": 12, "args": { "External id": 3068, "cbid": 211, "correlation": 3068 } }, { "ph": "s", "id": 3068, "pid": 76337, "tid": -914061504, "ts": 1716454216533843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216549403, "dur": 377, "args": { "External id": 3089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3089, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3089, "pid": 5, "tid": 7, "ts": 1716454216549403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533876, "dur": 8, "args": { "External id": 3089, "cbid": 211, "correlation": 3089 } }, { "ph": "s", "id": 3089, "pid": 76337, "tid": -914061504, "ts": 1716454216533876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216549781, "dur": 4, "args": { "External id": 3101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3101, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3101, "pid": 5, "tid": 7, "ts": 1716454216549781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533892, "dur": 6, "args": { "External id": 3101, "cbid": 211, "correlation": 3101 } }, { "ph": "s", "id": 3101, "pid": 76337, "tid": -914061504, "ts": 1716454216533892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216549787, "dur": 90, "args": { "External id": 3104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3104, "pid": 5, "tid": 7, "ts": 1716454216549787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533909, "dur": 6, "args": { "External id": 3104, "cbid": 211, "correlation": 3104 } }, { "ph": "s", "id": 3104, "pid": 76337, "tid": -914061504, "ts": 1716454216533909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216549878, "dur": 57, "args": { "External id": 3113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3113, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3113, "pid": 5, "tid": 7, "ts": 1716454216549878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216533948, "dur": 9, "args": { "External id": 3113, "cbid": 211, "correlation": 3113 } }, { "ph": "s", "id": 3113, "pid": 76337, "tid": -914061504, "ts": 1716454216533948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216534008, "dur": 0, "args": { "External id": 3123, "cbid": 317, "correlation": 3123 } }, { "ph": "f", "id": 3123, "pid": 76337, "tid": -914061504, "ts": 1716454216534008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216534009, "dur": 0, "args": { "External id": 3124, "cbid": 203, "correlation": 3124 } }, { "ph": "f", "id": 3124, "pid": 76337, "tid": -914061504, "ts": 1716454216534009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216534010, "dur": 0, "args": { "External id": 3125, "cbid": 205, "correlation": 3125 } }, { "ph": "f", "id": 3125, "pid": 76337, "tid": -914061504, "ts": 1716454216534010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216549937, "dur": 103, "args": { "External id": 3129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3129, "pid": 5, "tid": 7, "ts": 1716454216549937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534023, "dur": 12, "args": { "External id": 3129, "cbid": 211, "correlation": 3129 } }, { "ph": "s", "id": 3129, "pid": 76337, "tid": -914061504, "ts": 1716454216534023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216550042, "dur": 12, "args": { "External id": 3131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3131, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3131, "pid": 5, "tid": 7, "ts": 1716454216550042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534037, "dur": 5, "args": { "External id": 3131, "cbid": 211, "correlation": 3131 } }, { "ph": "s", "id": 3131, "pid": 76337, "tid": -914061504, "ts": 1716454216534037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216550055, "dur": 4, "args": { "External id": 3133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 3133, "pid": 5, "tid": 7, "ts": 1716454216550055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534046, "dur": 6, "args": { "External id": 3133, "cbid": 211, "correlation": 3133 } }, { "ph": "s", "id": 3133, "pid": 76337, "tid": -914061504, "ts": 1716454216534046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216534055, "dur": 0, "args": { "External id": 3134, "cbid": 51, "correlation": 3134 } }, { "ph": "s", "id": 3134, "pid": 76337, "tid": -914061504, "ts": 1716454216534055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216550060, "dur": 772, "args": { "External id": 3135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3135, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3135, "pid": 5, "tid": 7, "ts": 1716454216550060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534056, "dur": 5, "args": { "External id": 3135, "cbid": 211, "correlation": 3135 } }, { "ph": "s", "id": 3135, "pid": 76337, "tid": -914061504, "ts": 1716454216534056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216550834, "dur": 92, "args": { "External id": 3140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3140, "pid": 5, "tid": 7, "ts": 1716454216550834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534084, "dur": 9, "args": { "External id": 3140, "cbid": 211, "correlation": 3140 } }, { "ph": "s", "id": 3140, "pid": 76337, "tid": -914061504, "ts": 1716454216534084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216550927, "dur": 366, "args": { "External id": 3160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3160, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3160, "pid": 5, "tid": 7, "ts": 1716454216550927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534151, "dur": 12, "args": { "External id": 3160, "cbid": 211, "correlation": 3160 } }, { "ph": "s", "id": 3160, "pid": 76337, "tid": -914061504, "ts": 1716454216534151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216551295, "dur": 4, "args": { "External id": 3172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3172, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3172, "pid": 5, "tid": 7, "ts": 1716454216551295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534173, "dur": 6, "args": { "External id": 3172, "cbid": 211, "correlation": 3172 } }, { "ph": "s", "id": 3172, "pid": 76337, "tid": -914061504, "ts": 1716454216534173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216551300, "dur": 90, "args": { "External id": 3175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3175, "pid": 5, "tid": 7, "ts": 1716454216551300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534190, "dur": 6, "args": { "External id": 3175, "cbid": 211, "correlation": 3175 } }, { "ph": "s", "id": 3175, "pid": 76337, "tid": -914061504, "ts": 1716454216534190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216551391, "dur": 58, "args": { "External id": 3184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3184, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3184, "pid": 5, "tid": 7, "ts": 1716454216551391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534227, "dur": 10, "args": { "External id": 3184, "cbid": 211, "correlation": 3184 } }, { "ph": "s", "id": 3184, "pid": 76337, "tid": -914061504, "ts": 1716454216534227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216534290, "dur": 0, "args": { "External id": 3194, "cbid": 317, "correlation": 3194 } }, { "ph": "f", "id": 3194, "pid": 76337, "tid": -914061504, "ts": 1716454216534290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216534291, "dur": 0, "args": { "External id": 3195, "cbid": 203, "correlation": 3195 } }, { "ph": "f", "id": 3195, "pid": 76337, "tid": -914061504, "ts": 1716454216534291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216534292, "dur": 0, "args": { "External id": 3196, "cbid": 205, "correlation": 3196 } }, { "ph": "f", "id": 3196, "pid": 76337, "tid": -914061504, "ts": 1716454216534292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216551450, "dur": 105, "args": { "External id": 3200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3200, "pid": 5, "tid": 7, "ts": 1716454216551450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534305, "dur": 12, "args": { "External id": 3200, "cbid": 211, "correlation": 3200 } }, { "ph": "s", "id": 3200, "pid": 76337, "tid": -914061504, "ts": 1716454216534305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216551557, "dur": 12, "args": { "External id": 3202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3202, "pid": 5, "tid": 7, "ts": 1716454216551557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534319, "dur": 5, "args": { "External id": 3202, "cbid": 211, "correlation": 3202 } }, { "ph": "s", "id": 3202, "pid": 76337, "tid": -914061504, "ts": 1716454216534319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216551570, "dur": 4, "args": { "External id": 3204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 3204, "pid": 5, "tid": 7, "ts": 1716454216551570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534327, "dur": 5, "args": { "External id": 3204, "cbid": 211, "correlation": 3204 } }, { "ph": "s", "id": 3204, "pid": 76337, "tid": -914061504, "ts": 1716454216534327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216534335, "dur": 0, "args": { "External id": 3205, "cbid": 51, "correlation": 3205 } }, { "ph": "s", "id": 3205, "pid": 76337, "tid": -914061504, "ts": 1716454216534335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216551575, "dur": 770, "args": { "External id": 3206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3206, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3206, "pid": 5, "tid": 7, "ts": 1716454216551575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534336, "dur": 5, "args": { "External id": 3206, "cbid": 211, "correlation": 3206 } }, { "ph": "s", "id": 3206, "pid": 76337, "tid": -914061504, "ts": 1716454216534336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216552346, "dur": 91, "args": { "External id": 3211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3211, "pid": 5, "tid": 7, "ts": 1716454216552346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534364, "dur": 8, "args": { "External id": 3211, "cbid": 211, "correlation": 3211 } }, { "ph": "s", "id": 3211, "pid": 76337, "tid": -914061504, "ts": 1716454216534364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216552439, "dur": 254, "args": { "External id": 3219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3219, "pid": 5, "tid": 7, "ts": 1716454216552439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534397, "dur": 10, "args": { "External id": 3219, "cbid": 211, "correlation": 3219 } }, { "ph": "s", "id": 3219, "pid": 76337, "tid": -914061504, "ts": 1716454216534397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216552695, "dur": 53, "args": { "External id": 3227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3227, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3227, "pid": 5, "tid": 7, "ts": 1716454216552695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534427, "dur": 8, "args": { "External id": 3227, "cbid": 211, "correlation": 3227 } }, { "ph": "s", "id": 3227, "pid": 76337, "tid": -914061504, "ts": 1716454216534427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216552749, "dur": 46, "args": { "External id": 3237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 310.0125, "warps per SM": 1240.05, "grid": [24801, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3237, "pid": 5, "tid": 7, "ts": 1716454216552749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534481, "dur": 11, "args": { "External id": 3237, "cbid": 211, "correlation": 3237 } }, { "ph": "s", "id": 3237, "pid": 76337, "tid": -914061504, "ts": 1716454216534481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216552797, "dur": 85, "args": { "External id": 3242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3242, "pid": 5, "tid": 7, "ts": 1716454216552797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534510, "dur": 8, "args": { "External id": 3242, "cbid": 211, "correlation": 3242 } }, { "ph": "s", "id": 3242, "pid": 76337, "tid": -914061504, "ts": 1716454216534510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216552883, "dur": 9, "args": { "External id": 3257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 3257, "pid": 5, "tid": 7, "ts": 1716454216552883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534575, "dur": 13, "args": { "External id": 3257, "cbid": 211, "correlation": 3257 } }, { "ph": "s", "id": 3257, "pid": 76337, "tid": -914061504, "ts": 1716454216534575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216534595, "dur": 0, "args": { "External id": 3264, "cbid": 317, "correlation": 3264 } }, { "ph": "f", "id": 3264, "pid": 76337, "tid": -914061504, "ts": 1716454216534595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216534596, "dur": 0, "args": { "External id": 3265, "cbid": 203, "correlation": 3265 } }, { "ph": "f", "id": 3265, "pid": 76337, "tid": -914061504, "ts": 1716454216534596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216534597, "dur": 0, "args": { "External id": 3266, "cbid": 205, "correlation": 3266 } }, { "ph": "f", "id": 3266, "pid": 76337, "tid": -914061504, "ts": 1716454216534597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216552893, "dur": 314, "args": { "External id": 3270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3270, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [1, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3270, "pid": 5, "tid": 7, "ts": 1716454216552893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534615, "dur": 10, "args": { "External id": 3270, "cbid": 211, "correlation": 3270 } }, { "ph": "s", "id": 3270, "pid": 76337, "tid": -914061504, "ts": 1716454216534615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216553209, "dur": 26, "args": { "External id": 3276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3276, "pid": 5, "tid": 7, "ts": 1716454216553209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534647, "dur": 8, "args": { "External id": 3276, "cbid": 211, "correlation": 3276 } }, { "ph": "s", "id": 3276, "pid": 76337, "tid": -914061504, "ts": 1716454216534647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216553236, "dur": 70, "args": { "External id": 3286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3286, "pid": 5, "tid": 7, "ts": 1716454216553236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534730, "dur": 12, "args": { "External id": 3286, "cbid": 211, "correlation": 3286 } }, { "ph": "s", "id": 3286, "pid": 76337, "tid": -914061504, "ts": 1716454216534730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216553307, "dur": 95, "args": { "External id": 3307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3307, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3307, "pid": 5, "tid": 7, "ts": 1716454216553307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534764, "dur": 7, "args": { "External id": 3307, "cbid": 211, "correlation": 3307 } }, { "ph": "s", "id": 3307, "pid": 76337, "tid": -914061504, "ts": 1716454216534764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216553404, "dur": 5, "args": { "External id": 3319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3319, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3319, "pid": 5, "tid": 7, "ts": 1716454216553404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534781, "dur": 6, "args": { "External id": 3319, "cbid": 211, "correlation": 3319 } }, { "ph": "s", "id": 3319, "pid": 76337, "tid": -914061504, "ts": 1716454216534781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216553409, "dur": 26, "args": { "External id": 3322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3322, "pid": 5, "tid": 7, "ts": 1716454216553409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534796, "dur": 6, "args": { "External id": 3322, "cbid": 211, "correlation": 3322 } }, { "ph": "s", "id": 3322, "pid": 76337, "tid": -914061504, "ts": 1716454216534796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216553437, "dur": 18, "args": { "External id": 3331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3331, "registers per thread": 24, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3331, "pid": 5, "tid": 7, "ts": 1716454216553437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534835, "dur": 9, "args": { "External id": 3331, "cbid": 211, "correlation": 3331 } }, { "ph": "s", "id": 3331, "pid": 76337, "tid": -914061504, "ts": 1716454216534835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216534887, "dur": 0, "args": { "External id": 3341, "cbid": 317, "correlation": 3341 } }, { "ph": "f", "id": 3341, "pid": 76337, "tid": -914061504, "ts": 1716454216534887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216534888, "dur": 0, "args": { "External id": 3342, "cbid": 203, "correlation": 3342 } }, { "ph": "f", "id": 3342, "pid": 76337, "tid": -914061504, "ts": 1716454216534888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216534888, "dur": 0, "args": { "External id": 3343, "cbid": 205, "correlation": 3343 } }, { "ph": "f", "id": 3343, "pid": 76337, "tid": -914061504, "ts": 1716454216534888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216553456, "dur": 20, "args": { "External id": 3347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 38.4, "warps per SM": 307.2, "grid": [384, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3347, "pid": 5, "tid": 7, "ts": 1716454216553456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534902, "dur": 11, "args": { "External id": 3347, "cbid": 211, "correlation": 3347 } }, { "ph": "s", "id": 3347, "pid": 76337, "tid": -914061504, "ts": 1716454216534902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216553477, "dur": 19, "args": { "External id": 3349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 51.2, "warps per SM": 409.6, "grid": [1, 8, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3349, "pid": 5, "tid": 7, "ts": 1716454216553477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534916, "dur": 5, "args": { "External id": 3349, "cbid": 211, "correlation": 3349 } }, { "ph": "s", "id": 3349, "pid": 76337, "tid": -914061504, "ts": 1716454216534916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216553499, "dur": 1, "args": { "External id": 3351, "device": 5, "context": 1, "stream": 7, "correlation": 3351, "bytes": 3072, "memory bandwidth (GB/s)": 1.6 } }, { "ph": "f", "id": 3351, "pid": 5, "tid": 7, "ts": 1716454216553499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216534929, "dur": 15, "args": { "External id": 3351, "cbid": 51, "correlation": 3351 } }, { "ph": "s", "id": 3351, "pid": 76337, "tid": -914061504, "ts": 1716454216534929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216553503, "dur": 465, "args": { "External id": 3352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3352, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [4, 96, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3352, "pid": 5, "tid": 7, "ts": 1716454216553503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534945, "dur": 8, "args": { "External id": 3352, "cbid": 211, "correlation": 3352 } }, { "ph": "s", "id": 3352, "pid": 76337, "tid": -914061504, "ts": 1716454216534945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216553969, "dur": 38, "args": { "External id": 3354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3354, "pid": 5, "tid": 7, "ts": 1716454216553969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534959, "dur": 7, "args": { "External id": 3354, "cbid": 211, "correlation": 3354 } }, { "ph": "s", "id": 3354, "pid": 76337, "tid": -914061504, "ts": 1716454216534959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216554009, "dur": 49, "args": { "External id": 3360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3360, "pid": 5, "tid": 7, "ts": 1716454216554009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216534999, "dur": 9, "args": { "External id": 3360, "cbid": 211, "correlation": 3360 } }, { "ph": "s", "id": 3360, "pid": 76337, "tid": -914061504, "ts": 1716454216534999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216554059, "dur": 189, "args": { "External id": 3380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3380, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3380, "pid": 5, "tid": 7, "ts": 1716454216554059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535112, "dur": 13, "args": { "External id": 3380, "cbid": 211, "correlation": 3380 } }, { "ph": "s", "id": 3380, "pid": 76337, "tid": -914061504, "ts": 1716454216535112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216554250, "dur": 4, "args": { "External id": 3392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3392, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3392, "pid": 5, "tid": 7, "ts": 1716454216554250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535135, "dur": 6, "args": { "External id": 3392, "cbid": 211, "correlation": 3392 } }, { "ph": "s", "id": 3392, "pid": 76337, "tid": -914061504, "ts": 1716454216535135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216554255, "dur": 47, "args": { "External id": 3395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3395, "pid": 5, "tid": 7, "ts": 1716454216554255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535153, "dur": 6, "args": { "External id": 3395, "cbid": 211, "correlation": 3395 } }, { "ph": "s", "id": 3395, "pid": 76337, "tid": -914061504, "ts": 1716454216535153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216554304, "dur": 31, "args": { "External id": 3404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3404, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3404, "pid": 5, "tid": 7, "ts": 1716454216554304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535194, "dur": 10, "args": { "External id": 3404, "cbid": 211, "correlation": 3404 } }, { "ph": "s", "id": 3404, "pid": 76337, "tid": -914061504, "ts": 1716454216535194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216535259, "dur": 0, "args": { "External id": 3414, "cbid": 317, "correlation": 3414 } }, { "ph": "f", "id": 3414, "pid": 76337, "tid": -914061504, "ts": 1716454216535259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216535260, "dur": 0, "args": { "External id": 3415, "cbid": 203, "correlation": 3415 } }, { "ph": "f", "id": 3415, "pid": 76337, "tid": -914061504, "ts": 1716454216535260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216535261, "dur": 0, "args": { "External id": 3416, "cbid": 205, "correlation": 3416 } }, { "ph": "f", "id": 3416, "pid": 76337, "tid": -914061504, "ts": 1716454216535261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216554336, "dur": 33, "args": { "External id": 3420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3420, "pid": 5, "tid": 7, "ts": 1716454216554336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535278, "dur": 12, "args": { "External id": 3420, "cbid": 211, "correlation": 3420 } }, { "ph": "s", "id": 3420, "pid": 76337, "tid": -914061504, "ts": 1716454216535278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216554371, "dur": 32, "args": { "External id": 3422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3422, "pid": 5, "tid": 7, "ts": 1716454216554371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535292, "dur": 5, "args": { "External id": 3422, "cbid": 211, "correlation": 3422 } }, { "ph": "s", "id": 3422, "pid": 76337, "tid": -914061504, "ts": 1716454216535292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216554406, "dur": 2, "args": { "External id": 3424, "device": 5, "context": 1, "stream": 7, "correlation": 3424, "bytes": 3072, "memory bandwidth (GB/s)": 1.4545454545454546 } }, { "ph": "f", "id": 3424, "pid": 5, "tid": 7, "ts": 1716454216554406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216535303, "dur": 7, "args": { "External id": 3424, "cbid": 51, "correlation": 3424 } }, { "ph": "s", "id": 3424, "pid": 76337, "tid": -914061504, "ts": 1716454216535303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216554410, "dur": 891, "args": { "External id": 3425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3425, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3425, "pid": 5, "tid": 7, "ts": 1716454216554410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535312, "dur": 6, "args": { "External id": 3425, "cbid": 211, "correlation": 3425 } }, { "ph": "s", "id": 3425, "pid": 76337, "tid": -914061504, "ts": 1716454216535312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216555302, "dur": 37, "args": { "External id": 3427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3427, "pid": 5, "tid": 7, "ts": 1716454216555302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535322, "dur": 5, "args": { "External id": 3427, "cbid": 211, "correlation": 3427 } }, { "ph": "s", "id": 3427, "pid": 76337, "tid": -914061504, "ts": 1716454216535322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216555340, "dur": 49, "args": { "External id": 3433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3433, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3433, "pid": 5, "tid": 7, "ts": 1716454216555340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535351, "dur": 8, "args": { "External id": 3433, "cbid": 211, "correlation": 3433 } }, { "ph": "s", "id": 3433, "pid": 76337, "tid": -914061504, "ts": 1716454216535351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216535410, "dur": 0, "args": { "External id": 3443, "cbid": 317, "correlation": 3443 } }, { "ph": "f", "id": 3443, "pid": 76337, "tid": -914061504, "ts": 1716454216535410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216535411, "dur": 0, "args": { "External id": 3444, "cbid": 203, "correlation": 3444 } }, { "ph": "f", "id": 3444, "pid": 76337, "tid": -914061504, "ts": 1716454216535411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216535412, "dur": 0, "args": { "External id": 3445, "cbid": 205, "correlation": 3445 } }, { "ph": "f", "id": 3445, "pid": 76337, "tid": -914061504, "ts": 1716454216535412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216535442, "dur": 1, "args": { "External id": 3449, "cbid": 251, "correlation": 3449 } }, { "ph": "f", "id": 3449, "pid": 76337, "tid": -914061504, "ts": 1716454216535442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216555391, "dur": 68, "args": { "External id": 3450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3450, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [4, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3450, "pid": 5, "tid": 7, "ts": 1716454216555391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535447, "dur": 13, "args": { "External id": 3450, "cbid": 211, "correlation": 3450 } }, { "ph": "s", "id": 3450, "pid": 76337, "tid": -914061504, "ts": 1716454216535447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216555460, "dur": 49, "args": { "External id": 3456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3456, "pid": 5, "tid": 7, "ts": 1716454216555460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535480, "dur": 9, "args": { "External id": 3456, "cbid": 211, "correlation": 3456 } }, { "ph": "s", "id": 3456, "pid": 76337, "tid": -914061504, "ts": 1716454216535480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216555510, "dur": 139, "args": { "External id": 3464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3464, "pid": 5, "tid": 7, "ts": 1716454216555510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535512, "dur": 8, "args": { "External id": 3464, "cbid": 211, "correlation": 3464 } }, { "ph": "s", "id": 3464, "pid": 76337, "tid": -914061504, "ts": 1716454216535512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216555650, "dur": 29, "args": { "External id": 3472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3472, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3472, "pid": 5, "tid": 7, "ts": 1716454216555650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535543, "dur": 8, "args": { "External id": 3472, "cbid": 211, "correlation": 3472 } }, { "ph": "s", "id": 3472, "pid": 76337, "tid": -914061504, "ts": 1716454216535543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216555680, "dur": 135, "args": { "External id": 3482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3482, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3482, "pid": 5, "tid": 7, "ts": 1716454216555680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535614, "dur": 12, "args": { "External id": 3482, "cbid": 211, "correlation": 3482 } }, { "ph": "s", "id": 3482, "pid": 76337, "tid": -914061504, "ts": 1716454216535614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216555817, "dur": 192, "args": { "External id": 3503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3503, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3503, "pid": 5, "tid": 7, "ts": 1716454216555817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535648, "dur": 8, "args": { "External id": 3503, "cbid": 211, "correlation": 3503 } }, { "ph": "s", "id": 3503, "pid": 76337, "tid": -914061504, "ts": 1716454216535648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216556011, "dur": 4, "args": { "External id": 3515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3515, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3515, "pid": 5, "tid": 7, "ts": 1716454216556011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535664, "dur": 6, "args": { "External id": 3515, "cbid": 211, "correlation": 3515 } }, { "ph": "s", "id": 3515, "pid": 76337, "tid": -914061504, "ts": 1716454216535664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216556016, "dur": 48, "args": { "External id": 3518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3518, "pid": 5, "tid": 7, "ts": 1716454216556016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535681, "dur": 6, "args": { "External id": 3518, "cbid": 211, "correlation": 3518 } }, { "ph": "s", "id": 3518, "pid": 76337, "tid": -914061504, "ts": 1716454216535681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216556066, "dur": 31, "args": { "External id": 3527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3527, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3527, "pid": 5, "tid": 7, "ts": 1716454216556066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535720, "dur": 9, "args": { "External id": 3527, "cbid": 211, "correlation": 3527 } }, { "ph": "s", "id": 3527, "pid": 76337, "tid": -914061504, "ts": 1716454216535720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216535772, "dur": 0, "args": { "External id": 3537, "cbid": 317, "correlation": 3537 } }, { "ph": "f", "id": 3537, "pid": 76337, "tid": -914061504, "ts": 1716454216535772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216535773, "dur": 0, "args": { "External id": 3538, "cbid": 203, "correlation": 3538 } }, { "ph": "f", "id": 3538, "pid": 76337, "tid": -914061504, "ts": 1716454216535773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216535774, "dur": 0, "args": { "External id": 3539, "cbid": 205, "correlation": 3539 } }, { "ph": "f", "id": 3539, "pid": 76337, "tid": -914061504, "ts": 1716454216535774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216556098, "dur": 34, "args": { "External id": 3543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3543, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3543, "pid": 5, "tid": 7, "ts": 1716454216556098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535787, "dur": 11, "args": { "External id": 3543, "cbid": 211, "correlation": 3543 } }, { "ph": "s", "id": 3543, "pid": 76337, "tid": -914061504, "ts": 1716454216535787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216556134, "dur": 33, "args": { "External id": 3545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3545, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3545, "pid": 5, "tid": 7, "ts": 1716454216556134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535801, "dur": 6, "args": { "External id": 3545, "cbid": 211, "correlation": 3545 } }, { "ph": "s", "id": 3545, "pid": 76337, "tid": -914061504, "ts": 1716454216535801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216556169, "dur": 2, "args": { "External id": 3547, "device": 5, "context": 1, "stream": 7, "correlation": 3547, "bytes": 3072, "memory bandwidth (GB/s)": 1.476923076923077 } }, { "ph": "f", "id": 3547, "pid": 5, "tid": 7, "ts": 1716454216556169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216535812, "dur": 6, "args": { "External id": 3547, "cbid": 51, "correlation": 3547 } }, { "ph": "s", "id": 3547, "pid": 76337, "tid": -914061504, "ts": 1716454216535812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216556173, "dur": 897, "args": { "External id": 3548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3548, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3548, "pid": 5, "tid": 7, "ts": 1716454216556173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535820, "dur": 6, "args": { "External id": 3548, "cbid": 211, "correlation": 3548 } }, { "ph": "s", "id": 3548, "pid": 76337, "tid": -914061504, "ts": 1716454216535820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216557071, "dur": 37, "args": { "External id": 3550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3550, "pid": 5, "tid": 7, "ts": 1716454216557071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535830, "dur": 5, "args": { "External id": 3550, "cbid": 211, "correlation": 3550 } }, { "ph": "s", "id": 3550, "pid": 76337, "tid": -914061504, "ts": 1716454216535830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216557110, "dur": 50, "args": { "External id": 3556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3556, "pid": 5, "tid": 7, "ts": 1716454216557110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535858, "dur": 9, "args": { "External id": 3556, "cbid": 211, "correlation": 3556 } }, { "ph": "s", "id": 3556, "pid": 76337, "tid": -914061504, "ts": 1716454216535858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216557161, "dur": 191, "args": { "External id": 3576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3576, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3576, "pid": 5, "tid": 7, "ts": 1716454216557161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535926, "dur": 11, "args": { "External id": 3576, "cbid": 211, "correlation": 3576 } }, { "ph": "s", "id": 3576, "pid": 76337, "tid": -914061504, "ts": 1716454216535926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216557353, "dur": 4, "args": { "External id": 3588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3588, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3588, "pid": 5, "tid": 7, "ts": 1716454216557353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535947, "dur": 6, "args": { "External id": 3588, "cbid": 211, "correlation": 3588 } }, { "ph": "s", "id": 3588, "pid": 76337, "tid": -914061504, "ts": 1716454216535947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216557359, "dur": 47, "args": { "External id": 3591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3591, "pid": 5, "tid": 7, "ts": 1716454216557359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216535964, "dur": 7, "args": { "External id": 3591, "cbid": 211, "correlation": 3591 } }, { "ph": "s", "id": 3591, "pid": 76337, "tid": -914061504, "ts": 1716454216535964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216557407, "dur": 31, "args": { "External id": 3600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3600, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3600, "pid": 5, "tid": 7, "ts": 1716454216557407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536013, "dur": 11, "args": { "External id": 3600, "cbid": 211, "correlation": 3600 } }, { "ph": "s", "id": 3600, "pid": 76337, "tid": -914061504, "ts": 1716454216536013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216536078, "dur": 0, "args": { "External id": 3610, "cbid": 317, "correlation": 3610 } }, { "ph": "f", "id": 3610, "pid": 76337, "tid": -914061504, "ts": 1716454216536078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216536078, "dur": 0, "args": { "External id": 3611, "cbid": 203, "correlation": 3611 } }, { "ph": "f", "id": 3611, "pid": 76337, "tid": -914061504, "ts": 1716454216536078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216536079, "dur": 0, "args": { "External id": 3612, "cbid": 205, "correlation": 3612 } }, { "ph": "f", "id": 3612, "pid": 76337, "tid": -914061504, "ts": 1716454216536079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216557439, "dur": 33, "args": { "External id": 3616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3616, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3616, "pid": 5, "tid": 7, "ts": 1716454216557439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536092, "dur": 11, "args": { "External id": 3616, "cbid": 211, "correlation": 3616 } }, { "ph": "s", "id": 3616, "pid": 76337, "tid": -914061504, "ts": 1716454216536092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216557473, "dur": 32, "args": { "External id": 3618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3618, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3618, "pid": 5, "tid": 7, "ts": 1716454216557473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536106, "dur": 5, "args": { "External id": 3618, "cbid": 211, "correlation": 3618 } }, { "ph": "s", "id": 3618, "pid": 76337, "tid": -914061504, "ts": 1716454216536106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216557508, "dur": 1, "args": { "External id": 3620, "device": 5, "context": 1, "stream": 7, "correlation": 3620, "bytes": 3072, "memory bandwidth (GB/s)": 1.5737704918032787 } }, { "ph": "f", "id": 3620, "pid": 5, "tid": 7, "ts": 1716454216557508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216536116, "dur": 7, "args": { "External id": 3620, "cbid": 51, "correlation": 3620 } }, { "ph": "s", "id": 3620, "pid": 76337, "tid": -914061504, "ts": 1716454216536116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216557512, "dur": 889, "args": { "External id": 3621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3621, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3621, "pid": 5, "tid": 7, "ts": 1716454216557512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536124, "dur": 6, "args": { "External id": 3621, "cbid": 211, "correlation": 3621 } }, { "ph": "s", "id": 3621, "pid": 76337, "tid": -914061504, "ts": 1716454216536124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216558402, "dur": 37, "args": { "External id": 3623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3623, "pid": 5, "tid": 7, "ts": 1716454216558402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536134, "dur": 5, "args": { "External id": 3623, "cbid": 211, "correlation": 3623 } }, { "ph": "s", "id": 3623, "pid": 76337, "tid": -914061504, "ts": 1716454216536134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216558441, "dur": 49, "args": { "External id": 3629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3629, "pid": 5, "tid": 7, "ts": 1716454216558441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536161, "dur": 8, "args": { "External id": 3629, "cbid": 211, "correlation": 3629 } }, { "ph": "s", "id": 3629, "pid": 76337, "tid": -914061504, "ts": 1716454216536161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216558491, "dur": 140, "args": { "External id": 3637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3637, "pid": 5, "tid": 7, "ts": 1716454216558491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536194, "dur": 9, "args": { "External id": 3637, "cbid": 211, "correlation": 3637 } }, { "ph": "s", "id": 3637, "pid": 76337, "tid": -914061504, "ts": 1716454216536194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216558632, "dur": 28, "args": { "External id": 3645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3645, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3645, "pid": 5, "tid": 7, "ts": 1716454216558632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536224, "dur": 8, "args": { "External id": 3645, "cbid": 211, "correlation": 3645 } }, { "ph": "s", "id": 3645, "pid": 76337, "tid": -914061504, "ts": 1716454216536224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216558662, "dur": 25, "args": { "External id": 3655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 156.4125, "warps per SM": 625.65, "grid": [12513, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3655, "pid": 5, "tid": 7, "ts": 1716454216558662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536279, "dur": 10, "args": { "External id": 3655, "cbid": 211, "correlation": 3655 } }, { "ph": "s", "id": 3655, "pid": 76337, "tid": -914061504, "ts": 1716454216536279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216558688, "dur": 45, "args": { "External id": 3660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3660, "pid": 5, "tid": 7, "ts": 1716454216558688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536308, "dur": 7, "args": { "External id": 3660, "cbid": 211, "correlation": 3660 } }, { "ph": "s", "id": 3660, "pid": 76337, "tid": -914061504, "ts": 1716454216536308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216558735, "dur": 25, "args": { "External id": 3675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3675, "pid": 5, "tid": 7, "ts": 1716454216558735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536371, "dur": 12, "args": { "External id": 3675, "cbid": 211, "correlation": 3675 } }, { "ph": "s", "id": 3675, "pid": 76337, "tid": -914061504, "ts": 1716454216536371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216536391, "dur": 0, "args": { "External id": 3682, "cbid": 317, "correlation": 3682 } }, { "ph": "f", "id": 3682, "pid": 76337, "tid": -914061504, "ts": 1716454216536391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216536392, "dur": 0, "args": { "External id": 3683, "cbid": 203, "correlation": 3683 } }, { "ph": "f", "id": 3683, "pid": 76337, "tid": -914061504, "ts": 1716454216536392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216536393, "dur": 0, "args": { "External id": 3684, "cbid": 205, "correlation": 3684 } }, { "ph": "f", "id": 3684, "pid": 76337, "tid": -914061504, "ts": 1716454216536393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216558761, "dur": 310, "args": { "External id": 3688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3688, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [4, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 3688, "pid": 5, "tid": 7, "ts": 1716454216558761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536409, "dur": 8, "args": { "External id": 3688, "cbid": 211, "correlation": 3688 } }, { "ph": "s", "id": 3688, "pid": 76337, "tid": -914061504, "ts": 1716454216536409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559072, "dur": 14, "args": { "External id": 3694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3694, "pid": 5, "tid": 7, "ts": 1716454216559072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536440, "dur": 8, "args": { "External id": 3694, "cbid": 211, "correlation": 3694 } }, { "ph": "s", "id": 3694, "pid": 76337, "tid": -914061504, "ts": 1716454216536440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559088, "dur": 37, "args": { "External id": 3704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3704, "pid": 5, "tid": 7, "ts": 1716454216559088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536522, "dur": 13, "args": { "External id": 3704, "cbid": 211, "correlation": 3704 } }, { "ph": "s", "id": 3704, "pid": 76337, "tid": -914061504, "ts": 1716454216536522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216559127, "dur": 47, "args": { "External id": 3725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3725, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3725, "pid": 5, "tid": 7, "ts": 1716454216559127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536556, "dur": 7, "args": { "External id": 3725, "cbid": 211, "correlation": 3725 } }, { "ph": "s", "id": 3725, "pid": 76337, "tid": -914061504, "ts": 1716454216536556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216559176, "dur": 4, "args": { "External id": 3737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3737, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3737, "pid": 5, "tid": 7, "ts": 1716454216559176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536572, "dur": 6, "args": { "External id": 3737, "cbid": 211, "correlation": 3737 } }, { "ph": "s", "id": 3737, "pid": 76337, "tid": -914061504, "ts": 1716454216536572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559181, "dur": 16, "args": { "External id": 3740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3740, "pid": 5, "tid": 7, "ts": 1716454216559181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536590, "dur": 6, "args": { "External id": 3740, "cbid": 211, "correlation": 3740 } }, { "ph": "s", "id": 3740, "pid": 76337, "tid": -914061504, "ts": 1716454216536590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216559198, "dur": 10, "args": { "External id": 3749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3749, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3749, "pid": 5, "tid": 7, "ts": 1716454216559198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536628, "dur": 10, "args": { "External id": 3749, "cbid": 211, "correlation": 3749 } }, { "ph": "s", "id": 3749, "pid": 76337, "tid": -914061504, "ts": 1716454216536628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216536680, "dur": 0, "args": { "External id": 3759, "cbid": 317, "correlation": 3759 } }, { "ph": "f", "id": 3759, "pid": 76337, "tid": -914061504, "ts": 1716454216536680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216536681, "dur": 0, "args": { "External id": 3760, "cbid": 203, "correlation": 3760 } }, { "ph": "f", "id": 3760, "pid": 76337, "tid": -914061504, "ts": 1716454216536681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216536681, "dur": 0, "args": { "External id": 3761, "cbid": 205, "correlation": 3761 } }, { "ph": "f", "id": 3761, "pid": 76337, "tid": -914061504, "ts": 1716454216536681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559209, "dur": 11, "args": { "External id": 3765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3765, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3765, "pid": 5, "tid": 7, "ts": 1716454216559209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536696, "dur": 11, "args": { "External id": 3765, "cbid": 211, "correlation": 3765 } }, { "ph": "s", "id": 3765, "pid": 76337, "tid": -914061504, "ts": 1716454216536696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559222, "dur": 32, "args": { "External id": 3767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3767, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3767, "pid": 5, "tid": 7, "ts": 1716454216559222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536710, "dur": 5, "args": { "External id": 3767, "cbid": 211, "correlation": 3767 } }, { "ph": "s", "id": 3767, "pid": 76337, "tid": -914061504, "ts": 1716454216536710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216559256, "dur": 1, "args": { "External id": 3769, "device": 5, "context": 1, "stream": 7, "correlation": 3769, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 3769, "pid": 5, "tid": 7, "ts": 1716454216559256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216536721, "dur": 7, "args": { "External id": 3769, "cbid": 51, "correlation": 3769 } }, { "ph": "s", "id": 3769, "pid": 76337, "tid": -914061504, "ts": 1716454216536721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216559260, "dur": 273, "args": { "External id": 3770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3770, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3770, "pid": 5, "tid": 7, "ts": 1716454216559260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536729, "dur": 6, "args": { "External id": 3770, "cbid": 211, "correlation": 3770 } }, { "ph": "s", "id": 3770, "pid": 76337, "tid": -914061504, "ts": 1716454216536729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559535, "dur": 11, "args": { "External id": 3772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3772, "pid": 5, "tid": 7, "ts": 1716454216559535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536739, "dur": 6, "args": { "External id": 3772, "cbid": 211, "correlation": 3772 } }, { "ph": "s", "id": 3772, "pid": 76337, "tid": -914061504, "ts": 1716454216536739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559547, "dur": 12, "args": { "External id": 3778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3778, "pid": 5, "tid": 7, "ts": 1716454216559547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536767, "dur": 8, "args": { "External id": 3778, "cbid": 211, "correlation": 3778 } }, { "ph": "s", "id": 3778, "pid": 76337, "tid": -914061504, "ts": 1716454216536767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216559561, "dur": 30, "args": { "External id": 3798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3798, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3798, "pid": 5, "tid": 7, "ts": 1716454216559561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536835, "dur": 12, "args": { "External id": 3798, "cbid": 211, "correlation": 3798 } }, { "ph": "s", "id": 3798, "pid": 76337, "tid": -914061504, "ts": 1716454216536835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216559592, "dur": 4, "args": { "External id": 3810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3810, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3810, "pid": 5, "tid": 7, "ts": 1716454216559592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536857, "dur": 6, "args": { "External id": 3810, "cbid": 211, "correlation": 3810 } }, { "ph": "s", "id": 3810, "pid": 76337, "tid": -914061504, "ts": 1716454216536857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559598, "dur": 13, "args": { "External id": 3813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3813, "pid": 5, "tid": 7, "ts": 1716454216559598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536874, "dur": 6, "args": { "External id": 3813, "cbid": 211, "correlation": 3813 } }, { "ph": "s", "id": 3813, "pid": 76337, "tid": -914061504, "ts": 1716454216536874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216559612, "dur": 9, "args": { "External id": 3822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3822, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3822, "pid": 5, "tid": 7, "ts": 1716454216559612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536913, "dur": 10, "args": { "External id": 3822, "cbid": 211, "correlation": 3822 } }, { "ph": "s", "id": 3822, "pid": 76337, "tid": -914061504, "ts": 1716454216536913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216536983, "dur": 0, "args": { "External id": 3832, "cbid": 317, "correlation": 3832 } }, { "ph": "f", "id": 3832, "pid": 76337, "tid": -914061504, "ts": 1716454216536983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216536984, "dur": 0, "args": { "External id": 3833, "cbid": 203, "correlation": 3833 } }, { "ph": "f", "id": 3833, "pid": 76337, "tid": -914061504, "ts": 1716454216536984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216536984, "dur": 0, "args": { "External id": 3834, "cbid": 205, "correlation": 3834 } }, { "ph": "f", "id": 3834, "pid": 76337, "tid": -914061504, "ts": 1716454216536984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559623, "dur": 9, "args": { "External id": 3838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3838, "pid": 5, "tid": 7, "ts": 1716454216559623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216536998, "dur": 13, "args": { "External id": 3838, "cbid": 211, "correlation": 3838 } }, { "ph": "s", "id": 3838, "pid": 76337, "tid": -914061504, "ts": 1716454216536998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559633, "dur": 32, "args": { "External id": 3840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3840, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3840, "pid": 5, "tid": 7, "ts": 1716454216559633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537013, "dur": 5, "args": { "External id": 3840, "cbid": 211, "correlation": 3840 } }, { "ph": "s", "id": 3840, "pid": 76337, "tid": -914061504, "ts": 1716454216537013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216559667, "dur": 1, "args": { "External id": 3842, "device": 5, "context": 1, "stream": 7, "correlation": 3842, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 3842, "pid": 5, "tid": 7, "ts": 1716454216559667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216537023, "dur": 6, "args": { "External id": 3842, "cbid": 51, "correlation": 3842 } }, { "ph": "s", "id": 3842, "pid": 76337, "tid": -914061504, "ts": 1716454216537023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216559671, "dur": 265, "args": { "External id": 3843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3843, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3843, "pid": 5, "tid": 7, "ts": 1716454216559671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537031, "dur": 6, "args": { "External id": 3843, "cbid": 211, "correlation": 3843 } }, { "ph": "s", "id": 3843, "pid": 76337, "tid": -914061504, "ts": 1716454216537031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216559938, "dur": 10, "args": { "External id": 3845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3845, "pid": 5, "tid": 7, "ts": 1716454216559938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537040, "dur": 5, "args": { "External id": 3845, "cbid": 211, "correlation": 3845 } }, { "ph": "s", "id": 3845, "pid": 76337, "tid": -914061504, "ts": 1716454216537040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559949, "dur": 12, "args": { "External id": 3851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3851, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3851, "pid": 5, "tid": 7, "ts": 1716454216559949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537068, "dur": 8, "args": { "External id": 3851, "cbid": 211, "correlation": 3851 } }, { "ph": "s", "id": 3851, "pid": 76337, "tid": -914061504, "ts": 1716454216537068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216559962, "dur": 39, "args": { "External id": 3859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3859, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3859, "pid": 5, "tid": 7, "ts": 1716454216559962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537102, "dur": 10, "args": { "External id": 3859, "cbid": 211, "correlation": 3859 } }, { "ph": "s", "id": 3859, "pid": 76337, "tid": -914061504, "ts": 1716454216537102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216560003, "dur": 10, "args": { "External id": 3867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3867, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3867, "pid": 5, "tid": 7, "ts": 1716454216560003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537132, "dur": 8, "args": { "External id": 3867, "cbid": 211, "correlation": 3867 } }, { "ph": "s", "id": 3867, "pid": 76337, "tid": -914061504, "ts": 1716454216537132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560014, "dur": 38, "args": { "External id": 3877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3877, "pid": 5, "tid": 7, "ts": 1716454216560014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537200, "dur": 12, "args": { "External id": 3877, "cbid": 211, "correlation": 3877 } }, { "ph": "s", "id": 3877, "pid": 76337, "tid": -914061504, "ts": 1716454216537200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216560053, "dur": 48, "args": { "External id": 3898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3898, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3898, "pid": 5, "tid": 7, "ts": 1716454216560053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537232, "dur": 7, "args": { "External id": 3898, "cbid": 211, "correlation": 3898 } }, { "ph": "s", "id": 3898, "pid": 76337, "tid": -914061504, "ts": 1716454216537232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216560102, "dur": 4, "args": { "External id": 3910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3910, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3910, "pid": 5, "tid": 7, "ts": 1716454216560102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537248, "dur": 6, "args": { "External id": 3910, "cbid": 211, "correlation": 3910 } }, { "ph": "s", "id": 3910, "pid": 76337, "tid": -914061504, "ts": 1716454216537248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560108, "dur": 14, "args": { "External id": 3913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3913, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3913, "pid": 5, "tid": 7, "ts": 1716454216560108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537264, "dur": 7, "args": { "External id": 3913, "cbid": 211, "correlation": 3913 } }, { "ph": "s", "id": 3913, "pid": 76337, "tid": -914061504, "ts": 1716454216537264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216560124, "dur": 9, "args": { "External id": 3922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3922, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3922, "pid": 5, "tid": 7, "ts": 1716454216560124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537303, "dur": 10, "args": { "External id": 3922, "cbid": 211, "correlation": 3922 } }, { "ph": "s", "id": 3922, "pid": 76337, "tid": -914061504, "ts": 1716454216537303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216537354, "dur": 0, "args": { "External id": 3932, "cbid": 317, "correlation": 3932 } }, { "ph": "f", "id": 3932, "pid": 76337, "tid": -914061504, "ts": 1716454216537354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216537355, "dur": 0, "args": { "External id": 3933, "cbid": 203, "correlation": 3933 } }, { "ph": "f", "id": 3933, "pid": 76337, "tid": -914061504, "ts": 1716454216537355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216537356, "dur": 0, "args": { "External id": 3934, "cbid": 205, "correlation": 3934 } }, { "ph": "f", "id": 3934, "pid": 76337, "tid": -914061504, "ts": 1716454216537356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560135, "dur": 9, "args": { "External id": 3938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3938, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3938, "pid": 5, "tid": 7, "ts": 1716454216560135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537370, "dur": 12, "args": { "External id": 3938, "cbid": 211, "correlation": 3938 } }, { "ph": "s", "id": 3938, "pid": 76337, "tid": -914061504, "ts": 1716454216537370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560145, "dur": 32, "args": { "External id": 3940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3940, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3940, "pid": 5, "tid": 7, "ts": 1716454216560145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537384, "dur": 5, "args": { "External id": 3940, "cbid": 211, "correlation": 3940 } }, { "ph": "s", "id": 3940, "pid": 76337, "tid": -914061504, "ts": 1716454216537384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216560179, "dur": 1, "args": { "External id": 3942, "device": 5, "context": 1, "stream": 7, "correlation": 3942, "bytes": 768, "memory bandwidth (GB/s)": 0.42105263157894735 } }, { "ph": "f", "id": 3942, "pid": 5, "tid": 7, "ts": 1716454216560179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216537394, "dur": 6, "args": { "External id": 3942, "cbid": 51, "correlation": 3942 } }, { "ph": "s", "id": 3942, "pid": 76337, "tid": -914061504, "ts": 1716454216537394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216560183, "dur": 266, "args": { "External id": 3943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3943, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 3943, "pid": 5, "tid": 7, "ts": 1716454216560183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537401, "dur": 6, "args": { "External id": 3943, "cbid": 211, "correlation": 3943 } }, { "ph": "s", "id": 3943, "pid": 76337, "tid": -914061504, "ts": 1716454216537401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560451, "dur": 10, "args": { "External id": 3945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3945, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3945, "pid": 5, "tid": 7, "ts": 1716454216560451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537411, "dur": 5, "args": { "External id": 3945, "cbid": 211, "correlation": 3945 } }, { "ph": "s", "id": 3945, "pid": 76337, "tid": -914061504, "ts": 1716454216537411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560462, "dur": 12, "args": { "External id": 3951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3951, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3951, "pid": 5, "tid": 7, "ts": 1716454216560462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537440, "dur": 9, "args": { "External id": 3951, "cbid": 211, "correlation": 3951 } }, { "ph": "s", "id": 3951, "pid": 76337, "tid": -914061504, "ts": 1716454216537440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216560476, "dur": 30, "args": { "External id": 3971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3971, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 3971, "pid": 5, "tid": 7, "ts": 1716454216560476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537508, "dur": 11, "args": { "External id": 3971, "cbid": 211, "correlation": 3971 } }, { "ph": "s", "id": 3971, "pid": 76337, "tid": -914061504, "ts": 1716454216537508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216560507, "dur": 4, "args": { "External id": 3983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3983, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 3983, "pid": 5, "tid": 7, "ts": 1716454216560507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537530, "dur": 6, "args": { "External id": 3983, "cbid": 211, "correlation": 3983 } }, { "ph": "s", "id": 3983, "pid": 76337, "tid": -914061504, "ts": 1716454216537530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560513, "dur": 14, "args": { "External id": 3986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3986, "pid": 5, "tid": 7, "ts": 1716454216560513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537548, "dur": 6, "args": { "External id": 3986, "cbid": 211, "correlation": 3986 } }, { "ph": "s", "id": 3986, "pid": 76337, "tid": -914061504, "ts": 1716454216537548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216560528, "dur": 9, "args": { "External id": 3995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 3995, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 3995, "pid": 5, "tid": 7, "ts": 1716454216560528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537587, "dur": 10, "args": { "External id": 3995, "cbid": 211, "correlation": 3995 } }, { "ph": "s", "id": 3995, "pid": 76337, "tid": -914061504, "ts": 1716454216537587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216537647, "dur": 0, "args": { "External id": 4005, "cbid": 317, "correlation": 4005 } }, { "ph": "f", "id": 4005, "pid": 76337, "tid": -914061504, "ts": 1716454216537647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216537648, "dur": 0, "args": { "External id": 4006, "cbid": 203, "correlation": 4006 } }, { "ph": "f", "id": 4006, "pid": 76337, "tid": -914061504, "ts": 1716454216537648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216537649, "dur": 0, "args": { "External id": 4007, "cbid": 205, "correlation": 4007 } }, { "ph": "f", "id": 4007, "pid": 76337, "tid": -914061504, "ts": 1716454216537649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560538, "dur": 9, "args": { "External id": 4011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4011, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4011, "pid": 5, "tid": 7, "ts": 1716454216560538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537662, "dur": 11, "args": { "External id": 4011, "cbid": 211, "correlation": 4011 } }, { "ph": "s", "id": 4011, "pid": 76337, "tid": -914061504, "ts": 1716454216537662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560549, "dur": 33, "args": { "External id": 4013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4013, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4013, "pid": 5, "tid": 7, "ts": 1716454216560549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537675, "dur": 5, "args": { "External id": 4013, "cbid": 211, "correlation": 4013 } }, { "ph": "s", "id": 4013, "pid": 76337, "tid": -914061504, "ts": 1716454216537675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216560584, "dur": 1, "args": { "External id": 4015, "device": 5, "context": 1, "stream": 7, "correlation": 4015, "bytes": 768, "memory bandwidth (GB/s)": 0.4528301886792453 } }, { "ph": "f", "id": 4015, "pid": 5, "tid": 7, "ts": 1716454216560584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216537686, "dur": 6, "args": { "External id": 4015, "cbid": 51, "correlation": 4015 } }, { "ph": "s", "id": 4015, "pid": 76337, "tid": -914061504, "ts": 1716454216537686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216560588, "dur": 265, "args": { "External id": 4016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4016, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4016, "pid": 5, "tid": 7, "ts": 1716454216560588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537693, "dur": 6, "args": { "External id": 4016, "cbid": 211, "correlation": 4016 } }, { "ph": "s", "id": 4016, "pid": 76337, "tid": -914061504, "ts": 1716454216537693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216560854, "dur": 10, "args": { "External id": 4018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4018, "pid": 5, "tid": 7, "ts": 1716454216560854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537703, "dur": 5, "args": { "External id": 4018, "cbid": 211, "correlation": 4018 } }, { "ph": "s", "id": 4018, "pid": 76337, "tid": -914061504, "ts": 1716454216537703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560866, "dur": 12, "args": { "External id": 4024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4024, "pid": 5, "tid": 7, "ts": 1716454216560866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537730, "dur": 8, "args": { "External id": 4024, "cbid": 211, "correlation": 4024 } }, { "ph": "s", "id": 4024, "pid": 76337, "tid": -914061504, "ts": 1716454216537730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560879, "dur": 40, "args": { "External id": 4032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4032, "pid": 5, "tid": 7, "ts": 1716454216560879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537763, "dur": 9, "args": { "External id": 4032, "cbid": 211, "correlation": 4032 } }, { "ph": "s", "id": 4032, "pid": 76337, "tid": -914061504, "ts": 1716454216537763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216560920, "dur": 10, "args": { "External id": 4040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4040, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4040, "pid": 5, "tid": 7, "ts": 1716454216560920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537792, "dur": 9, "args": { "External id": 4040, "cbid": 211, "correlation": 4040 } }, { "ph": "s", "id": 4040, "pid": 76337, "tid": -914061504, "ts": 1716454216537792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216560931, "dur": 38, "args": { "External id": 4050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4050, "pid": 5, "tid": 7, "ts": 1716454216560931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537887, "dur": 12, "args": { "External id": 4050, "cbid": 211, "correlation": 4050 } }, { "ph": "s", "id": 4050, "pid": 76337, "tid": -914061504, "ts": 1716454216537887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216560971, "dur": 34, "args": { "External id": 4071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4071, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4071, "pid": 5, "tid": 7, "ts": 1716454216560971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537921, "dur": 7, "args": { "External id": 4071, "cbid": 211, "correlation": 4071 } }, { "ph": "s", "id": 4071, "pid": 76337, "tid": -914061504, "ts": 1716454216537921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216561006, "dur": 4, "args": { "External id": 4083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4083, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4083, "pid": 5, "tid": 7, "ts": 1716454216561006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537937, "dur": 6, "args": { "External id": 4083, "cbid": 211, "correlation": 4083 } }, { "ph": "s", "id": 4083, "pid": 76337, "tid": -914061504, "ts": 1716454216537937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561011, "dur": 13, "args": { "External id": 4086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4086, "pid": 5, "tid": 7, "ts": 1716454216561011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216537953, "dur": 7, "args": { "External id": 4086, "cbid": 211, "correlation": 4086 } }, { "ph": "s", "id": 4086, "pid": 76337, "tid": -914061504, "ts": 1716454216537953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216561026, "dur": 10, "args": { "External id": 4095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4095, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4095, "pid": 5, "tid": 7, "ts": 1716454216561026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538001, "dur": 10, "args": { "External id": 4095, "cbid": 211, "correlation": 4095 } }, { "ph": "s", "id": 4095, "pid": 76337, "tid": -914061504, "ts": 1716454216538001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216538054, "dur": 0, "args": { "External id": 4105, "cbid": 317, "correlation": 4105 } }, { "ph": "f", "id": 4105, "pid": 76337, "tid": -914061504, "ts": 1716454216538054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216538054, "dur": 0, "args": { "External id": 4106, "cbid": 203, "correlation": 4106 } }, { "ph": "f", "id": 4106, "pid": 76337, "tid": -914061504, "ts": 1716454216538054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216538055, "dur": 0, "args": { "External id": 4107, "cbid": 205, "correlation": 4107 } }, { "ph": "f", "id": 4107, "pid": 76337, "tid": -914061504, "ts": 1716454216538055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561037, "dur": 9, "args": { "External id": 4111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4111, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4111, "pid": 5, "tid": 7, "ts": 1716454216561037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538069, "dur": 12, "args": { "External id": 4111, "cbid": 211, "correlation": 4111 } }, { "ph": "s", "id": 4111, "pid": 76337, "tid": -914061504, "ts": 1716454216538069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561047, "dur": 32, "args": { "External id": 4113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4113, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4113, "pid": 5, "tid": 7, "ts": 1716454216561047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538083, "dur": 5, "args": { "External id": 4113, "cbid": 211, "correlation": 4113 } }, { "ph": "s", "id": 4113, "pid": 76337, "tid": -914061504, "ts": 1716454216538083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216561082, "dur": 1, "args": { "External id": 4115, "device": 5, "context": 1, "stream": 7, "correlation": 4115, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 4115, "pid": 5, "tid": 7, "ts": 1716454216561082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216538094, "dur": 6, "args": { "External id": 4115, "cbid": 51, "correlation": 4115 } }, { "ph": "s", "id": 4115, "pid": 76337, "tid": -914061504, "ts": 1716454216538094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216561085, "dur": 266, "args": { "External id": 4116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4116, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4116, "pid": 5, "tid": 7, "ts": 1716454216561085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538101, "dur": 6, "args": { "External id": 4116, "cbid": 211, "correlation": 4116 } }, { "ph": "s", "id": 4116, "pid": 76337, "tid": -914061504, "ts": 1716454216538101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561353, "dur": 10, "args": { "External id": 4118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4118, "pid": 5, "tid": 7, "ts": 1716454216561353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538110, "dur": 5, "args": { "External id": 4118, "cbid": 211, "correlation": 4118 } }, { "ph": "s", "id": 4118, "pid": 76337, "tid": -914061504, "ts": 1716454216538110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561365, "dur": 12, "args": { "External id": 4124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4124, "pid": 5, "tid": 7, "ts": 1716454216561365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538138, "dur": 8, "args": { "External id": 4124, "cbid": 211, "correlation": 4124 } }, { "ph": "s", "id": 4124, "pid": 76337, "tid": -914061504, "ts": 1716454216538138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216561379, "dur": 30, "args": { "External id": 4144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4144, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4144, "pid": 5, "tid": 7, "ts": 1716454216561379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538207, "dur": 11, "args": { "External id": 4144, "cbid": 211, "correlation": 4144 } }, { "ph": "s", "id": 4144, "pid": 76337, "tid": -914061504, "ts": 1716454216538207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216561410, "dur": 4, "args": { "External id": 4156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4156, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4156, "pid": 5, "tid": 7, "ts": 1716454216561410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538229, "dur": 6, "args": { "External id": 4156, "cbid": 211, "correlation": 4156 } }, { "ph": "s", "id": 4156, "pid": 76337, "tid": -914061504, "ts": 1716454216538229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561416, "dur": 14, "args": { "External id": 4159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4159, "pid": 5, "tid": 7, "ts": 1716454216561416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538246, "dur": 6, "args": { "External id": 4159, "cbid": 211, "correlation": 4159 } }, { "ph": "s", "id": 4159, "pid": 76337, "tid": -914061504, "ts": 1716454216538246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216561431, "dur": 9, "args": { "External id": 4168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4168, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4168, "pid": 5, "tid": 7, "ts": 1716454216561431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538286, "dur": 9, "args": { "External id": 4168, "cbid": 211, "correlation": 4168 } }, { "ph": "s", "id": 4168, "pid": 76337, "tid": -914061504, "ts": 1716454216538286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216538348, "dur": 0, "args": { "External id": 4178, "cbid": 317, "correlation": 4178 } }, { "ph": "f", "id": 4178, "pid": 76337, "tid": -914061504, "ts": 1716454216538348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216538348, "dur": 0, "args": { "External id": 4179, "cbid": 203, "correlation": 4179 } }, { "ph": "f", "id": 4179, "pid": 76337, "tid": -914061504, "ts": 1716454216538348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216538349, "dur": 0, "args": { "External id": 4180, "cbid": 205, "correlation": 4180 } }, { "ph": "f", "id": 4180, "pid": 76337, "tid": -914061504, "ts": 1716454216538349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561442, "dur": 9, "args": { "External id": 4184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4184, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4184, "pid": 5, "tid": 7, "ts": 1716454216561442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538361, "dur": 12, "args": { "External id": 4184, "cbid": 211, "correlation": 4184 } }, { "ph": "s", "id": 4184, "pid": 76337, "tid": -914061504, "ts": 1716454216538361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561452, "dur": 33, "args": { "External id": 4186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4186, "pid": 5, "tid": 7, "ts": 1716454216561452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538376, "dur": 5, "args": { "External id": 4186, "cbid": 211, "correlation": 4186 } }, { "ph": "s", "id": 4186, "pid": 76337, "tid": -914061504, "ts": 1716454216538376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216561488, "dur": 1, "args": { "External id": 4188, "device": 5, "context": 1, "stream": 7, "correlation": 4188, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 4188, "pid": 5, "tid": 7, "ts": 1716454216561488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216538386, "dur": 7, "args": { "External id": 4188, "cbid": 51, "correlation": 4188 } }, { "ph": "s", "id": 4188, "pid": 76337, "tid": -914061504, "ts": 1716454216538386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216561491, "dur": 267, "args": { "External id": 4189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4189, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4189, "pid": 5, "tid": 7, "ts": 1716454216561491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538394, "dur": 6, "args": { "External id": 4189, "cbid": 211, "correlation": 4189 } }, { "ph": "s", "id": 4189, "pid": 76337, "tid": -914061504, "ts": 1716454216538394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216561760, "dur": 10, "args": { "External id": 4191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4191, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4191, "pid": 5, "tid": 7, "ts": 1716454216561760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538403, "dur": 5, "args": { "External id": 4191, "cbid": 211, "correlation": 4191 } }, { "ph": "s", "id": 4191, "pid": 76337, "tid": -914061504, "ts": 1716454216538403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561772, "dur": 12, "args": { "External id": 4197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4197, "pid": 5, "tid": 7, "ts": 1716454216561772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538431, "dur": 8, "args": { "External id": 4197, "cbid": 211, "correlation": 4197 } }, { "ph": "s", "id": 4197, "pid": 76337, "tid": -914061504, "ts": 1716454216538431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561785, "dur": 39, "args": { "External id": 4205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4205, "pid": 5, "tid": 7, "ts": 1716454216561785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538464, "dur": 8, "args": { "External id": 4205, "cbid": 211, "correlation": 4205 } }, { "ph": "s", "id": 4205, "pid": 76337, "tid": -914061504, "ts": 1716454216538464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216561826, "dur": 10, "args": { "External id": 4213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4213, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4213, "pid": 5, "tid": 7, "ts": 1716454216561826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538493, "dur": 9, "args": { "External id": 4213, "cbid": 211, "correlation": 4213 } }, { "ph": "s", "id": 4213, "pid": 76337, "tid": -914061504, "ts": 1716454216538493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561837, "dur": 38, "args": { "External id": 4223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4223, "pid": 5, "tid": 7, "ts": 1716454216561837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538685, "dur": 16, "args": { "External id": 4223, "cbid": 211, "correlation": 4223 } }, { "ph": "s", "id": 4223, "pid": 76337, "tid": -914061504, "ts": 1716454216538685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216561876, "dur": 31, "args": { "External id": 4244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4244, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4244, "pid": 5, "tid": 7, "ts": 1716454216561876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538722, "dur": 7, "args": { "External id": 4244, "cbid": 211, "correlation": 4244 } }, { "ph": "s", "id": 4244, "pid": 76337, "tid": -914061504, "ts": 1716454216538722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216561908, "dur": 4, "args": { "External id": 4256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4256, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4256, "pid": 5, "tid": 7, "ts": 1716454216561908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538739, "dur": 6, "args": { "External id": 4256, "cbid": 211, "correlation": 4256 } }, { "ph": "s", "id": 4256, "pid": 76337, "tid": -914061504, "ts": 1716454216538739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561914, "dur": 14, "args": { "External id": 4259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4259, "pid": 5, "tid": 7, "ts": 1716454216561914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538756, "dur": 6, "args": { "External id": 4259, "cbid": 211, "correlation": 4259 } }, { "ph": "s", "id": 4259, "pid": 76337, "tid": -914061504, "ts": 1716454216538756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216538901, "dur": 2, "args": { "External id": 4271, "cbid": 251, "correlation": 4271 } }, { "ph": "f", "id": 4271, "pid": 76337, "tid": -914061504, "ts": 1716454216538901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216561929, "dur": 39, "args": { "External id": 4272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4272, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 4272, "pid": 5, "tid": 7, "ts": 1716454216561929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538909, "dur": 14, "args": { "External id": 4272, "cbid": 211, "correlation": 4272 } }, { "ph": "s", "id": 4272, "pid": 76337, "tid": -914061504, "ts": 1716454216538909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216561969, "dur": 16, "args": { "External id": 4277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4277, "pid": 5, "tid": 7, "ts": 1716454216561969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216538948, "dur": 8, "args": { "External id": 4277, "cbid": 211, "correlation": 4277 } }, { "ph": "s", "id": 4277, "pid": 76337, "tid": -914061504, "ts": 1716454216538948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216539033, "dur": 1, "args": { "External id": 4288, "cbid": 251, "correlation": 4288 } }, { "ph": "f", "id": 4288, "pid": 76337, "tid": -914061504, "ts": 1716454216539033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216561987, "dur": 36, "args": { "External id": 4289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4289, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 4289, "pid": 5, "tid": 7, "ts": 1716454216561987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539037, "dur": 13, "args": { "External id": 4289, "cbid": 211, "correlation": 4289 } }, { "ph": "s", "id": 4289, "pid": 76337, "tid": -914061504, "ts": 1716454216539037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216562024, "dur": 15, "args": { "External id": 4294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4294, "pid": 5, "tid": 7, "ts": 1716454216562024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539067, "dur": 8, "args": { "External id": 4294, "cbid": 211, "correlation": 4294 } }, { "ph": "s", "id": 4294, "pid": 76337, "tid": -914061504, "ts": 1716454216539067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216539141, "dur": 1, "args": { "External id": 4305, "cbid": 251, "correlation": 4305 } }, { "ph": "f", "id": 4305, "pid": 76337, "tid": -914061504, "ts": 1716454216539141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216562041, "dur": 37, "args": { "External id": 4306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4306, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 4306, "pid": 5, "tid": 7, "ts": 1716454216562041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539145, "dur": 12, "args": { "External id": 4306, "cbid": 211, "correlation": 4306 } }, { "ph": "s", "id": 4306, "pid": 76337, "tid": -914061504, "ts": 1716454216539145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216562079, "dur": 15, "args": { "External id": 4311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4311, "pid": 5, "tid": 7, "ts": 1716454216562079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539173, "dur": 8, "args": { "External id": 4311, "cbid": 211, "correlation": 4311 } }, { "ph": "s", "id": 4311, "pid": 76337, "tid": -914061504, "ts": 1716454216539173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216562096, "dur": 1221, "args": { "External id": 4336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4336, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 4336, "pid": 5, "tid": 7, "ts": 1716454216562096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539274, "dur": 13, "args": { "External id": 4336, "cbid": 211, "correlation": 4336 } }, { "ph": "s", "id": 4336, "pid": 76337, "tid": -914061504, "ts": 1716454216539274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216539402, "dur": 2, "args": { "External id": 4354, "cbid": 251, "correlation": 4354 } }, { "ph": "f", "id": 4354, "pid": 76337, "tid": -914061504, "ts": 1716454216539402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216563318, "dur": 42, "args": { "External id": 4356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4356, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 4356, "pid": 5, "tid": 7, "ts": 1716454216563318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539409, "dur": 14, "args": { "External id": 4356, "cbid": 211, "correlation": 4356 } }, { "ph": "s", "id": 4356, "pid": 76337, "tid": -914061504, "ts": 1716454216539409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216563361, "dur": 12, "args": { "External id": 4364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4364, "registers per thread": 19, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4364, "pid": 5, "tid": 7, "ts": 1716454216563361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539495, "dur": 14, "args": { "External id": 4364, "cbid": 211, "correlation": 4364 } }, { "ph": "s", "id": 4364, "pid": 76337, "tid": -914061504, "ts": 1716454216539495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216563375, "dur": 9, "args": { "External id": 4372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4372, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4372, "pid": 5, "tid": 7, "ts": 1716454216563375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539533, "dur": 9, "args": { "External id": 4372, "cbid": 211, "correlation": 4372 } }, { "ph": "s", "id": 4372, "pid": 76337, "tid": -914061504, "ts": 1716454216539533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216563385, "dur": 38, "args": { "External id": 4382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4382, "pid": 5, "tid": 7, "ts": 1716454216563385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539609, "dur": 13, "args": { "External id": 4382, "cbid": 211, "correlation": 4382 } }, { "ph": "s", "id": 4382, "pid": 76337, "tid": -914061504, "ts": 1716454216539609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216563425, "dur": 33, "args": { "External id": 4403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4403, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4403, "pid": 5, "tid": 7, "ts": 1716454216563425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539642, "dur": 7, "args": { "External id": 4403, "cbid": 211, "correlation": 4403 } }, { "ph": "s", "id": 4403, "pid": 76337, "tid": -914061504, "ts": 1716454216539642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216563459, "dur": 4, "args": { "External id": 4415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4415, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4415, "pid": 5, "tid": 7, "ts": 1716454216563459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539659, "dur": 7, "args": { "External id": 4415, "cbid": 211, "correlation": 4415 } }, { "ph": "s", "id": 4415, "pid": 76337, "tid": -914061504, "ts": 1716454216539659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216563465, "dur": 14, "args": { "External id": 4418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4418, "pid": 5, "tid": 7, "ts": 1716454216563465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539676, "dur": 6, "args": { "External id": 4418, "cbid": 211, "correlation": 4418 } }, { "ph": "s", "id": 4418, "pid": 76337, "tid": -914061504, "ts": 1716454216539676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216563481, "dur": 10, "args": { "External id": 4427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4427, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4427, "pid": 5, "tid": 7, "ts": 1716454216563481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539715, "dur": 10, "args": { "External id": 4427, "cbid": 211, "correlation": 4427 } }, { "ph": "s", "id": 4427, "pid": 76337, "tid": -914061504, "ts": 1716454216539715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216539770, "dur": 0, "args": { "External id": 4437, "cbid": 317, "correlation": 4437 } }, { "ph": "f", "id": 4437, "pid": 76337, "tid": -914061504, "ts": 1716454216539770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216539771, "dur": 0, "args": { "External id": 4438, "cbid": 203, "correlation": 4438 } }, { "ph": "f", "id": 4438, "pid": 76337, "tid": -914061504, "ts": 1716454216539771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216539772, "dur": 0, "args": { "External id": 4439, "cbid": 205, "correlation": 4439 } }, { "ph": "f", "id": 4439, "pid": 76337, "tid": -914061504, "ts": 1716454216539772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216563492, "dur": 11, "args": { "External id": 4443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4443, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4443, "pid": 5, "tid": 7, "ts": 1716454216563492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539786, "dur": 12, "args": { "External id": 4443, "cbid": 211, "correlation": 4443 } }, { "ph": "s", "id": 4443, "pid": 76337, "tid": -914061504, "ts": 1716454216539786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216563505, "dur": 32, "args": { "External id": 4445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4445, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4445, "pid": 5, "tid": 7, "ts": 1716454216563505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539800, "dur": 5, "args": { "External id": 4445, "cbid": 211, "correlation": 4445 } }, { "ph": "s", "id": 4445, "pid": 76337, "tid": -914061504, "ts": 1716454216539800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216563539, "dur": 1, "args": { "External id": 4447, "device": 5, "context": 1, "stream": 7, "correlation": 4447, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 4447, "pid": 5, "tid": 7, "ts": 1716454216563539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216539811, "dur": 6, "args": { "External id": 4447, "cbid": 51, "correlation": 4447 } }, { "ph": "s", "id": 4447, "pid": 76337, "tid": -914061504, "ts": 1716454216539811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216563543, "dur": 274, "args": { "External id": 4448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4448, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4448, "pid": 5, "tid": 7, "ts": 1716454216563543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539819, "dur": 7, "args": { "External id": 4448, "cbid": 211, "correlation": 4448 } }, { "ph": "s", "id": 4448, "pid": 76337, "tid": -914061504, "ts": 1716454216539819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216563818, "dur": 11, "args": { "External id": 4450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4450, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4450, "pid": 5, "tid": 7, "ts": 1716454216563818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539829, "dur": 5, "args": { "External id": 4450, "cbid": 211, "correlation": 4450 } }, { "ph": "s", "id": 4450, "pid": 76337, "tid": -914061504, "ts": 1716454216539829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216563830, "dur": 13, "args": { "External id": 4456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4456, "pid": 5, "tid": 7, "ts": 1716454216563830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539857, "dur": 8, "args": { "External id": 4456, "cbid": 211, "correlation": 4456 } }, { "ph": "s", "id": 4456, "pid": 76337, "tid": -914061504, "ts": 1716454216539857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216563845, "dur": 31, "args": { "External id": 4476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4476, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4476, "pid": 5, "tid": 7, "ts": 1716454216563845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539925, "dur": 12, "args": { "External id": 4476, "cbid": 211, "correlation": 4476 } }, { "ph": "s", "id": 4476, "pid": 76337, "tid": -914061504, "ts": 1716454216539925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216563877, "dur": 4, "args": { "External id": 4488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4488, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4488, "pid": 5, "tid": 7, "ts": 1716454216563877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539947, "dur": 6, "args": { "External id": 4488, "cbid": 211, "correlation": 4488 } }, { "ph": "s", "id": 4488, "pid": 76337, "tid": -914061504, "ts": 1716454216539947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216563882, "dur": 14, "args": { "External id": 4491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4491, "pid": 5, "tid": 7, "ts": 1716454216563882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216539964, "dur": 6, "args": { "External id": 4491, "cbid": 211, "correlation": 4491 } }, { "ph": "s", "id": 4491, "pid": 76337, "tid": -914061504, "ts": 1716454216539964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216563898, "dur": 9, "args": { "External id": 4500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4500, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4500, "pid": 5, "tid": 7, "ts": 1716454216563898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540013, "dur": 10, "args": { "External id": 4500, "cbid": 211, "correlation": 4500 } }, { "ph": "s", "id": 4500, "pid": 76337, "tid": -914061504, "ts": 1716454216540013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216540078, "dur": 0, "args": { "External id": 4510, "cbid": 317, "correlation": 4510 } }, { "ph": "f", "id": 4510, "pid": 76337, "tid": -914061504, "ts": 1716454216540078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216540078, "dur": 0, "args": { "External id": 4511, "cbid": 203, "correlation": 4511 } }, { "ph": "f", "id": 4511, "pid": 76337, "tid": -914061504, "ts": 1716454216540078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216540079, "dur": 0, "args": { "External id": 4512, "cbid": 205, "correlation": 4512 } }, { "ph": "f", "id": 4512, "pid": 76337, "tid": -914061504, "ts": 1716454216540079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216563908, "dur": 8, "args": { "External id": 4516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4516, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4516, "pid": 5, "tid": 7, "ts": 1716454216563908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540092, "dur": 12, "args": { "External id": 4516, "cbid": 211, "correlation": 4516 } }, { "ph": "s", "id": 4516, "pid": 76337, "tid": -914061504, "ts": 1716454216540092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216563918, "dur": 32, "args": { "External id": 4518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4518, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4518, "pid": 5, "tid": 7, "ts": 1716454216563918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540107, "dur": 5, "args": { "External id": 4518, "cbid": 211, "correlation": 4518 } }, { "ph": "s", "id": 4518, "pid": 76337, "tid": -914061504, "ts": 1716454216540107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216563953, "dur": 1, "args": { "External id": 4520, "device": 5, "context": 1, "stream": 7, "correlation": 4520, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 4520, "pid": 5, "tid": 7, "ts": 1716454216563953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216540117, "dur": 6, "args": { "External id": 4520, "cbid": 51, "correlation": 4520 } }, { "ph": "s", "id": 4520, "pid": 76337, "tid": -914061504, "ts": 1716454216540117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216563957, "dur": 265, "args": { "External id": 4521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4521, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4521, "pid": 5, "tid": 7, "ts": 1716454216563957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540125, "dur": 6, "args": { "External id": 4521, "cbid": 211, "correlation": 4521 } }, { "ph": "s", "id": 4521, "pid": 76337, "tid": -914061504, "ts": 1716454216540125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216564223, "dur": 11, "args": { "External id": 4523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4523, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4523, "pid": 5, "tid": 7, "ts": 1716454216564223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540134, "dur": 5, "args": { "External id": 4523, "cbid": 211, "correlation": 4523 } }, { "ph": "s", "id": 4523, "pid": 76337, "tid": -914061504, "ts": 1716454216540134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564235, "dur": 12, "args": { "External id": 4529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4529, "pid": 5, "tid": 7, "ts": 1716454216564235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540162, "dur": 8, "args": { "External id": 4529, "cbid": 211, "correlation": 4529 } }, { "ph": "s", "id": 4529, "pid": 76337, "tid": -914061504, "ts": 1716454216540162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564248, "dur": 40, "args": { "External id": 4537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4537, "pid": 5, "tid": 7, "ts": 1716454216564248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540196, "dur": 9, "args": { "External id": 4537, "cbid": 211, "correlation": 4537 } }, { "ph": "s", "id": 4537, "pid": 76337, "tid": -914061504, "ts": 1716454216540196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564290, "dur": 9, "args": { "External id": 4545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4545, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4545, "pid": 5, "tid": 7, "ts": 1716454216564290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540226, "dur": 8, "args": { "External id": 4545, "cbid": 211, "correlation": 4545 } }, { "ph": "s", "id": 4545, "pid": 76337, "tid": -914061504, "ts": 1716454216540226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564301, "dur": 38, "args": { "External id": 4555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4555, "pid": 5, "tid": 7, "ts": 1716454216564301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540294, "dur": 13, "args": { "External id": 4555, "cbid": 211, "correlation": 4555 } }, { "ph": "s", "id": 4555, "pid": 76337, "tid": -914061504, "ts": 1716454216540294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216564340, "dur": 47, "args": { "External id": 4576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4576, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4576, "pid": 5, "tid": 7, "ts": 1716454216564340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540328, "dur": 7, "args": { "External id": 4576, "cbid": 211, "correlation": 4576 } }, { "ph": "s", "id": 4576, "pid": 76337, "tid": -914061504, "ts": 1716454216540328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216564389, "dur": 4, "args": { "External id": 4588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4588, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4588, "pid": 5, "tid": 7, "ts": 1716454216564389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540344, "dur": 5, "args": { "External id": 4588, "cbid": 211, "correlation": 4588 } }, { "ph": "s", "id": 4588, "pid": 76337, "tid": -914061504, "ts": 1716454216540344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564395, "dur": 13, "args": { "External id": 4591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4591, "pid": 5, "tid": 7, "ts": 1716454216564395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540361, "dur": 6, "args": { "External id": 4591, "cbid": 211, "correlation": 4591 } }, { "ph": "s", "id": 4591, "pid": 76337, "tid": -914061504, "ts": 1716454216540361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564409, "dur": 9, "args": { "External id": 4600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4600, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4600, "pid": 5, "tid": 7, "ts": 1716454216564409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540403, "dur": 11, "args": { "External id": 4600, "cbid": 211, "correlation": 4600 } }, { "ph": "s", "id": 4600, "pid": 76337, "tid": -914061504, "ts": 1716454216540403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216540455, "dur": 0, "args": { "External id": 4610, "cbid": 317, "correlation": 4610 } }, { "ph": "f", "id": 4610, "pid": 76337, "tid": -914061504, "ts": 1716454216540455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216540456, "dur": 0, "args": { "External id": 4611, "cbid": 203, "correlation": 4611 } }, { "ph": "f", "id": 4611, "pid": 76337, "tid": -914061504, "ts": 1716454216540456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216540457, "dur": 0, "args": { "External id": 4612, "cbid": 205, "correlation": 4612 } }, { "ph": "f", "id": 4612, "pid": 76337, "tid": -914061504, "ts": 1716454216540457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216564420, "dur": 9, "args": { "External id": 4616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4616, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4616, "pid": 5, "tid": 7, "ts": 1716454216564420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540472, "dur": 11, "args": { "External id": 4616, "cbid": 211, "correlation": 4616 } }, { "ph": "s", "id": 4616, "pid": 76337, "tid": -914061504, "ts": 1716454216540472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216564430, "dur": 4, "args": { "External id": 4618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4618, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.6, "warps per SM": 12.8, "grid": [1, 16, 8], "block": [256, 1, 1], "est. achieved occupancy %": 20 } }, { "ph": "f", "id": 4618, "pid": 5, "tid": 7, "ts": 1716454216564430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540487, "dur": 5, "args": { "External id": 4618, "cbid": 211, "correlation": 4618 } }, { "ph": "s", "id": 4618, "pid": 76337, "tid": -914061504, "ts": 1716454216540487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216564437, "dur": 1, "args": { "External id": 4620, "device": 5, "context": 1, "stream": 7, "correlation": 4620, "bytes": 2688, "memory bandwidth (GB/s)": 1.423728813559322 } }, { "ph": "f", "id": 4620, "pid": 5, "tid": 7, "ts": 1716454216564437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216540498, "dur": 7, "args": { "External id": 4620, "cbid": 51, "correlation": 4620 } }, { "ph": "s", "id": 4620, "pid": 76337, "tid": -914061504, "ts": 1716454216540498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage1_warpsize2x1x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216564441, "dur": 60, "args": { "External id": 4621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4621, "registers per thread": 128, "shared memory": 12288, "blocks per SM": 8.4, "warps per SM": 33.6, "grid": [1, 48, 14], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 4621, "pid": 5, "tid": 7, "ts": 1716454216564441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540507, "dur": 9, "args": { "External id": 4621, "cbid": 211, "correlation": 4621 } }, { "ph": "s", "id": 4621, "pid": 76337, "tid": -914061504, "ts": 1716454216540507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216564503, "dur": 5, "args": { "External id": 4623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [96, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 4623, "pid": 5, "tid": 7, "ts": 1716454216564503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540521, "dur": 5, "args": { "External id": 4623, "cbid": 211, "correlation": 4623 } }, { "ph": "s", "id": 4623, "pid": 76337, "tid": -914061504, "ts": 1716454216540521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564509, "dur": 5, "args": { "External id": 4629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 4629, "pid": 5, "tid": 7, "ts": 1716454216564509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540548, "dur": 8, "args": { "External id": 4629, "cbid": 211, "correlation": 4629 } }, { "ph": "s", "id": 4629, "pid": 76337, "tid": -914061504, "ts": 1716454216540548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216540613, "dur": 0, "args": { "External id": 4639, "cbid": 317, "correlation": 4639 } }, { "ph": "f", "id": 4639, "pid": 76337, "tid": -914061504, "ts": 1716454216540613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216540613, "dur": 0, "args": { "External id": 4640, "cbid": 203, "correlation": 4640 } }, { "ph": "f", "id": 4640, "pid": 76337, "tid": -914061504, "ts": 1716454216540613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216540614, "dur": 0, "args": { "External id": 4641, "cbid": 205, "correlation": 4641 } }, { "ph": "f", "id": 4641, "pid": 76337, "tid": -914061504, "ts": 1716454216540614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216540635, "dur": 1, "args": { "External id": 4645, "cbid": 251, "correlation": 4645 } }, { "ph": "f", "id": 4645, "pid": 76337, "tid": -914061504, "ts": 1716454216540635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216564515, "dur": 11, "args": { "External id": 4646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4646, "registers per thread": 106, "shared memory": 16640, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 4646, "pid": 5, "tid": 7, "ts": 1716454216564515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540639, "dur": 13, "args": { "External id": 4646, "cbid": 211, "correlation": 4646 } }, { "ph": "s", "id": 4646, "pid": 76337, "tid": -914061504, "ts": 1716454216540639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564528, "dur": 4, "args": { "External id": 4652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 4652, "pid": 5, "tid": 7, "ts": 1716454216564528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540674, "dur": 9, "args": { "External id": 4652, "cbid": 211, "correlation": 4652 } }, { "ph": "s", "id": 4652, "pid": 76337, "tid": -914061504, "ts": 1716454216540674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564533, "dur": 4, "args": { "External id": 4660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4660, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 4660, "pid": 5, "tid": 7, "ts": 1716454216564533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540746, "dur": 13, "args": { "External id": 4660, "cbid": 211, "correlation": 4660 } }, { "ph": "s", "id": 4660, "pid": 76337, "tid": -914061504, "ts": 1716454216540746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564538, "dur": 3, "args": { "External id": 4668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4668, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 4668, "pid": 5, "tid": 7, "ts": 1716454216564538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540792, "dur": 12, "args": { "External id": 4668, "cbid": 211, "correlation": 4668 } }, { "ph": "s", "id": 4668, "pid": 76337, "tid": -914061504, "ts": 1716454216540792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564543, "dur": 3, "args": { "External id": 4676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4676, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 4676, "pid": 5, "tid": 7, "ts": 1716454216564543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540823, "dur": 10, "args": { "External id": 4676, "cbid": 211, "correlation": 4676 } }, { "ph": "s", "id": 4676, "pid": 76337, "tid": -914061504, "ts": 1716454216540823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216564547, "dur": 3, "args": { "External id": 4684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4684, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 4684, "pid": 5, "tid": 7, "ts": 1716454216564547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540847, "dur": 7, "args": { "External id": 4684, "cbid": 211, "correlation": 4684 } }, { "ph": "s", "id": 4684, "pid": 76337, "tid": -914061504, "ts": 1716454216540847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564552, "dur": 5, "args": { "External id": 4698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0875, "warps per SM": 0.35, "grid": [7, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 4698, "pid": 5, "tid": 7, "ts": 1716454216564552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216540969, "dur": 23, "args": { "External id": 4698, "cbid": 211, "correlation": 4698 } }, { "ph": "s", "id": 4698, "pid": 76337, "tid": -914061504, "ts": 1716454216540969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216541000, "dur": 0, "args": { "External id": 4705, "cbid": 317, "correlation": 4705 } }, { "ph": "f", "id": 4705, "pid": 76337, "tid": -914061504, "ts": 1716454216541000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216541001, "dur": 0, "args": { "External id": 4706, "cbid": 203, "correlation": 4706 } }, { "ph": "f", "id": 4706, "pid": 76337, "tid": -914061504, "ts": 1716454216541001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216541002, "dur": 0, "args": { "External id": 4707, "cbid": 205, "correlation": 4707 } }, { "ph": "f", "id": 4707, "pid": 76337, "tid": -914061504, "ts": 1716454216541002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_tensor8x8x4_aligna2_alignc8_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216564558, "dur": 199, "args": { "External id": 4711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4711, "registers per thread": 111, "shared memory": 4096, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [4, 1536, 1], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 4711, "pid": 5, "tid": 7, "ts": 1716454216564558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541018, "dur": 11, "args": { "External id": 4711, "cbid": 211, "correlation": 4711 } }, { "ph": "s", "id": 4711, "pid": 76337, "tid": -914061504, "ts": 1716454216541018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564759, "dur": 177, "args": { "External id": 4717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4717, "pid": 5, "tid": 7, "ts": 1716454216564759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541052, "dur": 8, "args": { "External id": 4717, "cbid": 211, "correlation": 4717 } }, { "ph": "s", "id": 4717, "pid": 76337, "tid": -914061504, "ts": 1716454216541052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216564938, "dur": 1807, "args": { "External id": 4727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4727, "pid": 5, "tid": 7, "ts": 1716454216564938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541128, "dur": 12, "args": { "External id": 4727, "cbid": 211, "correlation": 4727 } }, { "ph": "s", "id": 4727, "pid": 76337, "tid": -914061504, "ts": 1716454216541128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216566745, "dur": 700, "args": { "External id": 4748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4748, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4748, "pid": 5, "tid": 7, "ts": 1716454216566745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541161, "dur": 8, "args": { "External id": 4748, "cbid": 211, "correlation": 4748 } }, { "ph": "s", "id": 4748, "pid": 76337, "tid": -914061504, "ts": 1716454216541161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216567447, "dur": 4, "args": { "External id": 4760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4760, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4760, "pid": 5, "tid": 7, "ts": 1716454216567447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541178, "dur": 6, "args": { "External id": 4760, "cbid": 211, "correlation": 4760 } }, { "ph": "s", "id": 4760, "pid": 76337, "tid": -914061504, "ts": 1716454216541178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216567453, "dur": 176, "args": { "External id": 4763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4763, "pid": 5, "tid": 7, "ts": 1716454216567453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541194, "dur": 7, "args": { "External id": 4763, "cbid": 211, "correlation": 4763 } }, { "ph": "s", "id": 4763, "pid": 76337, "tid": -914061504, "ts": 1716454216541194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216567630, "dur": 109, "args": { "External id": 4772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4772, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4772, "pid": 5, "tid": 7, "ts": 1716454216567630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541233, "dur": 10, "args": { "External id": 4772, "cbid": 211, "correlation": 4772 } }, { "ph": "s", "id": 4772, "pid": 76337, "tid": -914061504, "ts": 1716454216541233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216541283, "dur": 0, "args": { "External id": 4782, "cbid": 317, "correlation": 4782 } }, { "ph": "f", "id": 4782, "pid": 76337, "tid": -914061504, "ts": 1716454216541283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216541284, "dur": 0, "args": { "External id": 4783, "cbid": 203, "correlation": 4783 } }, { "ph": "f", "id": 4783, "pid": 76337, "tid": -914061504, "ts": 1716454216541284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216541285, "dur": 0, "args": { "External id": 4784, "cbid": 205, "correlation": 4784 } }, { "ph": "f", "id": 4784, "pid": 76337, "tid": -914061504, "ts": 1716454216541285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216567741, "dur": 137, "args": { "External id": 4788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4788, "pid": 5, "tid": 7, "ts": 1716454216567741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541299, "dur": 11, "args": { "External id": 4788, "cbid": 211, "correlation": 4788 } }, { "ph": "s", "id": 4788, "pid": 76337, "tid": -914061504, "ts": 1716454216541299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216567879, "dur": 6, "args": { "External id": 4790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 4790, "pid": 5, "tid": 7, "ts": 1716454216567879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541313, "dur": 5, "args": { "External id": 4790, "cbid": 211, "correlation": 4790 } }, { "ph": "s", "id": 4790, "pid": 76337, "tid": -914061504, "ts": 1716454216541313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216567887, "dur": 5, "args": { "External id": 4792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4792, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4792, "pid": 5, "tid": 7, "ts": 1716454216567887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541322, "dur": 6, "args": { "External id": 4792, "cbid": 211, "correlation": 4792 } }, { "ph": "s", "id": 4792, "pid": 76337, "tid": -914061504, "ts": 1716454216541322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216541331, "dur": 0, "args": { "External id": 4793, "cbid": 51, "correlation": 4793 } }, { "ph": "s", "id": 4793, "pid": 76337, "tid": -914061504, "ts": 1716454216541331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216567893, "dur": 823, "args": { "External id": 4794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4794, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4794, "pid": 5, "tid": 7, "ts": 1716454216567893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541331, "dur": 5, "args": { "External id": 4794, "cbid": 211, "correlation": 4794 } }, { "ph": "s", "id": 4794, "pid": 76337, "tid": -914061504, "ts": 1716454216541331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216568718, "dur": 176, "args": { "External id": 4799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4799, "pid": 5, "tid": 7, "ts": 1716454216568718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541358, "dur": 8, "args": { "External id": 4799, "cbid": 211, "correlation": 4799 } }, { "ph": "s", "id": 4799, "pid": 76337, "tid": -914061504, "ts": 1716454216541358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216568895, "dur": 700, "args": { "External id": 4819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4819, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4819, "pid": 5, "tid": 7, "ts": 1716454216568895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541426, "dur": 11, "args": { "External id": 4819, "cbid": 211, "correlation": 4819 } }, { "ph": "s", "id": 4819, "pid": 76337, "tid": -914061504, "ts": 1716454216541426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216569597, "dur": 4, "args": { "External id": 4831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4831, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4831, "pid": 5, "tid": 7, "ts": 1716454216569597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541446, "dur": 7, "args": { "External id": 4831, "cbid": 211, "correlation": 4831 } }, { "ph": "s", "id": 4831, "pid": 76337, "tid": -914061504, "ts": 1716454216541446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216569602, "dur": 178, "args": { "External id": 4834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4834, "pid": 5, "tid": 7, "ts": 1716454216569602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541463, "dur": 6, "args": { "External id": 4834, "cbid": 211, "correlation": 4834 } }, { "ph": "s", "id": 4834, "pid": 76337, "tid": -914061504, "ts": 1716454216541463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216569781, "dur": 109, "args": { "External id": 4843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4843, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4843, "pid": 5, "tid": 7, "ts": 1716454216569781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541501, "dur": 10, "args": { "External id": 4843, "cbid": 211, "correlation": 4843 } }, { "ph": "s", "id": 4843, "pid": 76337, "tid": -914061504, "ts": 1716454216541501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216541561, "dur": 0, "args": { "External id": 4853, "cbid": 317, "correlation": 4853 } }, { "ph": "f", "id": 4853, "pid": 76337, "tid": -914061504, "ts": 1716454216541561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216541562, "dur": 0, "args": { "External id": 4854, "cbid": 203, "correlation": 4854 } }, { "ph": "f", "id": 4854, "pid": 76337, "tid": -914061504, "ts": 1716454216541562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216541563, "dur": 0, "args": { "External id": 4855, "cbid": 205, "correlation": 4855 } }, { "ph": "f", "id": 4855, "pid": 76337, "tid": -914061504, "ts": 1716454216541563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216569892, "dur": 131, "args": { "External id": 4859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4859, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4859, "pid": 5, "tid": 7, "ts": 1716454216569892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541575, "dur": 12, "args": { "External id": 4859, "cbid": 211, "correlation": 4859 } }, { "ph": "s", "id": 4859, "pid": 76337, "tid": -914061504, "ts": 1716454216541575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216570024, "dur": 6, "args": { "External id": 4861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4861, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 4861, "pid": 5, "tid": 7, "ts": 1716454216570024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541589, "dur": 5, "args": { "External id": 4861, "cbid": 211, "correlation": 4861 } }, { "ph": "s", "id": 4861, "pid": 76337, "tid": -914061504, "ts": 1716454216541589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216570032, "dur": 5, "args": { "External id": 4863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4863, "pid": 5, "tid": 7, "ts": 1716454216570032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541598, "dur": 5, "args": { "External id": 4863, "cbid": 211, "correlation": 4863 } }, { "ph": "s", "id": 4863, "pid": 76337, "tid": -914061504, "ts": 1716454216541598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216541606, "dur": 0, "args": { "External id": 4864, "cbid": 51, "correlation": 4864 } }, { "ph": "s", "id": 4864, "pid": 76337, "tid": -914061504, "ts": 1716454216541606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216570039, "dur": 819, "args": { "External id": 4865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4865, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4865, "pid": 5, "tid": 7, "ts": 1716454216570039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541606, "dur": 5, "args": { "External id": 4865, "cbid": 211, "correlation": 4865 } }, { "ph": "s", "id": 4865, "pid": 76337, "tid": -914061504, "ts": 1716454216541606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216570859, "dur": 177, "args": { "External id": 4870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4870, "pid": 5, "tid": 7, "ts": 1716454216570859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541633, "dur": 8, "args": { "External id": 4870, "cbid": 211, "correlation": 4870 } }, { "ph": "s", "id": 4870, "pid": 76337, "tid": -914061504, "ts": 1716454216541633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216571037, "dur": 748, "args": { "External id": 4878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4878, "pid": 5, "tid": 7, "ts": 1716454216571037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541666, "dur": 10, "args": { "External id": 4878, "cbid": 211, "correlation": 4878 } }, { "ph": "s", "id": 4878, "pid": 76337, "tid": -914061504, "ts": 1716454216541666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216571787, "dur": 103, "args": { "External id": 4886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4886, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4886, "pid": 5, "tid": 7, "ts": 1716454216571787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541697, "dur": 8, "args": { "External id": 4886, "cbid": 211, "correlation": 4886 } }, { "ph": "s", "id": 4886, "pid": 76337, "tid": -914061504, "ts": 1716454216541697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216571891, "dur": 1797, "args": { "External id": 4896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4896, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4896, "pid": 5, "tid": 7, "ts": 1716454216571891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541764, "dur": 15, "args": { "External id": 4896, "cbid": 211, "correlation": 4896 } }, { "ph": "s", "id": 4896, "pid": 76337, "tid": -914061504, "ts": 1716454216541764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216573690, "dur": 699, "args": { "External id": 4917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4917, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4917, "pid": 5, "tid": 7, "ts": 1716454216573690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541800, "dur": 8, "args": { "External id": 4917, "cbid": 211, "correlation": 4917 } }, { "ph": "s", "id": 4917, "pid": 76337, "tid": -914061504, "ts": 1716454216541800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216574390, "dur": 4, "args": { "External id": 4929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4929, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 4929, "pid": 5, "tid": 7, "ts": 1716454216574390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541817, "dur": 6, "args": { "External id": 4929, "cbid": 211, "correlation": 4929 } }, { "ph": "s", "id": 4929, "pid": 76337, "tid": -914061504, "ts": 1716454216541817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216574395, "dur": 177, "args": { "External id": 4932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4932, "pid": 5, "tid": 7, "ts": 1716454216574395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541833, "dur": 7, "args": { "External id": 4932, "cbid": 211, "correlation": 4932 } }, { "ph": "s", "id": 4932, "pid": 76337, "tid": -914061504, "ts": 1716454216541833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216574573, "dur": 110, "args": { "External id": 4941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4941, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4941, "pid": 5, "tid": 7, "ts": 1716454216574573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541871, "dur": 9, "args": { "External id": 4941, "cbid": 211, "correlation": 4941 } }, { "ph": "s", "id": 4941, "pid": 76337, "tid": -914061504, "ts": 1716454216541871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216541922, "dur": 0, "args": { "External id": 4951, "cbid": 317, "correlation": 4951 } }, { "ph": "f", "id": 4951, "pid": 76337, "tid": -914061504, "ts": 1716454216541922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216541923, "dur": 0, "args": { "External id": 4952, "cbid": 203, "correlation": 4952 } }, { "ph": "f", "id": 4952, "pid": 76337, "tid": -914061504, "ts": 1716454216541923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216541923, "dur": 0, "args": { "External id": 4953, "cbid": 205, "correlation": 4953 } }, { "ph": "f", "id": 4953, "pid": 76337, "tid": -914061504, "ts": 1716454216541923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216574685, "dur": 137, "args": { "External id": 4957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4957, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4957, "pid": 5, "tid": 7, "ts": 1716454216574685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541936, "dur": 12, "args": { "External id": 4957, "cbid": 211, "correlation": 4957 } }, { "ph": "s", "id": 4957, "pid": 76337, "tid": -914061504, "ts": 1716454216541936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216574823, "dur": 6, "args": { "External id": 4959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 4959, "pid": 5, "tid": 7, "ts": 1716454216574823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541951, "dur": 5, "args": { "External id": 4959, "cbid": 211, "correlation": 4959 } }, { "ph": "s", "id": 4959, "pid": 76337, "tid": -914061504, "ts": 1716454216541951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216574831, "dur": 5, "args": { "External id": 4961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4961, "pid": 5, "tid": 7, "ts": 1716454216574831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541959, "dur": 5, "args": { "External id": 4961, "cbid": 211, "correlation": 4961 } }, { "ph": "s", "id": 4961, "pid": 76337, "tid": -914061504, "ts": 1716454216541959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216541967, "dur": 0, "args": { "External id": 4962, "cbid": 51, "correlation": 4962 } }, { "ph": "s", "id": 4962, "pid": 76337, "tid": -914061504, "ts": 1716454216541967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216574837, "dur": 817, "args": { "External id": 4963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4963, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 4963, "pid": 5, "tid": 7, "ts": 1716454216574837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216541968, "dur": 14, "args": { "External id": 4963, "cbid": 211, "correlation": 4963 } }, { "ph": "s", "id": 4963, "pid": 76337, "tid": -914061504, "ts": 1716454216541968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216575656, "dur": 176, "args": { "External id": 4968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 4968, "pid": 5, "tid": 7, "ts": 1716454216575656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542005, "dur": 9, "args": { "External id": 4968, "cbid": 211, "correlation": 4968 } }, { "ph": "s", "id": 4968, "pid": 76337, "tid": -914061504, "ts": 1716454216542005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216575833, "dur": 702, "args": { "External id": 4988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 4988, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 4988, "pid": 5, "tid": 7, "ts": 1716454216575833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542073, "dur": 11, "args": { "External id": 4988, "cbid": 211, "correlation": 4988 } }, { "ph": "s", "id": 4988, "pid": 76337, "tid": -914061504, "ts": 1716454216542073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216576536, "dur": 4, "args": { "External id": 5000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5000, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5000, "pid": 5, "tid": 7, "ts": 1716454216576536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542093, "dur": 6, "args": { "External id": 5000, "cbid": 211, "correlation": 5000 } }, { "ph": "s", "id": 5000, "pid": 76337, "tid": -914061504, "ts": 1716454216542093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216576542, "dur": 176, "args": { "External id": 5003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5003, "pid": 5, "tid": 7, "ts": 1716454216576542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542110, "dur": 6, "args": { "External id": 5003, "cbid": 211, "correlation": 5003 } }, { "ph": "s", "id": 5003, "pid": 76337, "tid": -914061504, "ts": 1716454216542110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216576719, "dur": 110, "args": { "External id": 5012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5012, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5012, "pid": 5, "tid": 7, "ts": 1716454216576719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542148, "dur": 10, "args": { "External id": 5012, "cbid": 211, "correlation": 5012 } }, { "ph": "s", "id": 5012, "pid": 76337, "tid": -914061504, "ts": 1716454216542148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216542208, "dur": 0, "args": { "External id": 5022, "cbid": 317, "correlation": 5022 } }, { "ph": "f", "id": 5022, "pid": 76337, "tid": -914061504, "ts": 1716454216542208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216542209, "dur": 0, "args": { "External id": 5023, "cbid": 203, "correlation": 5023 } }, { "ph": "f", "id": 5023, "pid": 76337, "tid": -914061504, "ts": 1716454216542209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216542209, "dur": 0, "args": { "External id": 5024, "cbid": 205, "correlation": 5024 } }, { "ph": "f", "id": 5024, "pid": 76337, "tid": -914061504, "ts": 1716454216542209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216576830, "dur": 141, "args": { "External id": 5028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5028, "pid": 5, "tid": 7, "ts": 1716454216576830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542222, "dur": 12, "args": { "External id": 5028, "cbid": 211, "correlation": 5028 } }, { "ph": "s", "id": 5028, "pid": 76337, "tid": -914061504, "ts": 1716454216542222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216576973, "dur": 6, "args": { "External id": 5030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 5030, "pid": 5, "tid": 7, "ts": 1716454216576973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542237, "dur": 5, "args": { "External id": 5030, "cbid": 211, "correlation": 5030 } }, { "ph": "s", "id": 5030, "pid": 76337, "tid": -914061504, "ts": 1716454216542237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216576980, "dur": 5, "args": { "External id": 5032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5032, "pid": 5, "tid": 7, "ts": 1716454216576980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542246, "dur": 5, "args": { "External id": 5032, "cbid": 211, "correlation": 5032 } }, { "ph": "s", "id": 5032, "pid": 76337, "tid": -914061504, "ts": 1716454216542246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216542253, "dur": 0, "args": { "External id": 5033, "cbid": 51, "correlation": 5033 } }, { "ph": "s", "id": 5033, "pid": 76337, "tid": -914061504, "ts": 1716454216542253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216576986, "dur": 815, "args": { "External id": 5034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5034, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5034, "pid": 5, "tid": 7, "ts": 1716454216576986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542254, "dur": 5, "args": { "External id": 5034, "cbid": 211, "correlation": 5034 } }, { "ph": "s", "id": 5034, "pid": 76337, "tid": -914061504, "ts": 1716454216542254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216577803, "dur": 177, "args": { "External id": 5039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5039, "pid": 5, "tid": 7, "ts": 1716454216577803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542280, "dur": 8, "args": { "External id": 5039, "cbid": 211, "correlation": 5039 } }, { "ph": "s", "id": 5039, "pid": 76337, "tid": -914061504, "ts": 1716454216542280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216577982, "dur": 754, "args": { "External id": 5047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5047, "pid": 5, "tid": 7, "ts": 1716454216577982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542314, "dur": 10, "args": { "External id": 5047, "cbid": 211, "correlation": 5047 } }, { "ph": "s", "id": 5047, "pid": 76337, "tid": -914061504, "ts": 1716454216542314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216578737, "dur": 104, "args": { "External id": 5055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5055, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5055, "pid": 5, "tid": 7, "ts": 1716454216578737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542344, "dur": 8, "args": { "External id": 5055, "cbid": 211, "correlation": 5055 } }, { "ph": "s", "id": 5055, "pid": 76337, "tid": -914061504, "ts": 1716454216542344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216578842, "dur": 88, "args": { "External id": 5065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 617.2125, "warps per SM": 2468.85, "grid": [49377, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5065, "pid": 5, "tid": 7, "ts": 1716454216578842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542401, "dur": 10, "args": { "External id": 5065, "cbid": 211, "correlation": 5065 } }, { "ph": "s", "id": 5065, "pid": 76337, "tid": -914061504, "ts": 1716454216542401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216578932, "dur": 165, "args": { "External id": 5070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5070, "pid": 5, "tid": 7, "ts": 1716454216578932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542431, "dur": 7, "args": { "External id": 5070, "cbid": 211, "correlation": 5070 } }, { "ph": "s", "id": 5070, "pid": 76337, "tid": -914061504, "ts": 1716454216542431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216579098, "dur": 5, "args": { "External id": 5085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.6, "warps per SM": 14.4, "grid": [288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 22 } }, { "ph": "f", "id": 5085, "pid": 5, "tid": 7, "ts": 1716454216579098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542494, "dur": 12, "args": { "External id": 5085, "cbid": 211, "correlation": 5085 } }, { "ph": "s", "id": 5085, "pid": 76337, "tid": -914061504, "ts": 1716454216542494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216542514, "dur": 0, "args": { "External id": 5092, "cbid": 317, "correlation": 5092 } }, { "ph": "f", "id": 5092, "pid": 76337, "tid": -914061504, "ts": 1716454216542514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216542514, "dur": 0, "args": { "External id": 5093, "cbid": 203, "correlation": 5093 } }, { "ph": "f", "id": 5093, "pid": 76337, "tid": -914061504, "ts": 1716454216542514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216542515, "dur": 0, "args": { "External id": 5094, "cbid": 205, "correlation": 5094 } }, { "ph": "f", "id": 5094, "pid": 76337, "tid": -914061504, "ts": 1716454216542515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216579105, "dur": 265, "args": { "External id": 5098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5098, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [1, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5098, "pid": 5, "tid": 7, "ts": 1716454216579105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542530, "dur": 9, "args": { "External id": 5098, "cbid": 211, "correlation": 5098 } }, { "ph": "s", "id": 5098, "pid": 76337, "tid": -914061504, "ts": 1716454216542530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216579372, "dur": 48, "args": { "External id": 5104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5104, "pid": 5, "tid": 7, "ts": 1716454216579372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542561, "dur": 8, "args": { "External id": 5104, "cbid": 211, "correlation": 5104 } }, { "ph": "s", "id": 5104, "pid": 76337, "tid": -914061504, "ts": 1716454216542561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216579421, "dur": 146, "args": { "External id": 5114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5114, "pid": 5, "tid": 7, "ts": 1716454216579421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542641, "dur": 13, "args": { "External id": 5114, "cbid": 211, "correlation": 5114 } }, { "ph": "s", "id": 5114, "pid": 76337, "tid": -914061504, "ts": 1716454216542641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216579568, "dur": 180, "args": { "External id": 5135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5135, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5135, "pid": 5, "tid": 7, "ts": 1716454216579568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542675, "dur": 7, "args": { "External id": 5135, "cbid": 211, "correlation": 5135 } }, { "ph": "s", "id": 5135, "pid": 76337, "tid": -914061504, "ts": 1716454216542675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216579750, "dur": 4, "args": { "External id": 5147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5147, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5147, "pid": 5, "tid": 7, "ts": 1716454216579750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542691, "dur": 6, "args": { "External id": 5147, "cbid": 211, "correlation": 5147 } }, { "ph": "s", "id": 5147, "pid": 76337, "tid": -914061504, "ts": 1716454216542691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216579755, "dur": 48, "args": { "External id": 5150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5150, "pid": 5, "tid": 7, "ts": 1716454216579755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542707, "dur": 7, "args": { "External id": 5150, "cbid": 211, "correlation": 5150 } }, { "ph": "s", "id": 5150, "pid": 76337, "tid": -914061504, "ts": 1716454216542707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216579805, "dur": 31, "args": { "External id": 5159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5159, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5159, "pid": 5, "tid": 7, "ts": 1716454216579805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542747, "dur": 9, "args": { "External id": 5159, "cbid": 211, "correlation": 5159 } }, { "ph": "s", "id": 5159, "pid": 76337, "tid": -914061504, "ts": 1716454216542747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216542797, "dur": 0, "args": { "External id": 5169, "cbid": 317, "correlation": 5169 } }, { "ph": "f", "id": 5169, "pid": 76337, "tid": -914061504, "ts": 1716454216542797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216542798, "dur": 0, "args": { "External id": 5170, "cbid": 203, "correlation": 5170 } }, { "ph": "f", "id": 5170, "pid": 76337, "tid": -914061504, "ts": 1716454216542798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216542799, "dur": 0, "args": { "External id": 5171, "cbid": 205, "correlation": 5171 } }, { "ph": "f", "id": 5171, "pid": 76337, "tid": -914061504, "ts": 1716454216542799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216579837, "dur": 37, "args": { "External id": 5175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5175, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [1536, 4, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5175, "pid": 5, "tid": 7, "ts": 1716454216579837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542812, "dur": 15, "args": { "External id": 5175, "cbid": 211, "correlation": 5175 } }, { "ph": "s", "id": 5175, "pid": 76337, "tid": -914061504, "ts": 1716454216542812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216579875, "dur": 8, "args": { "External id": 5177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5177, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12.8, "warps per SM": 102.4, "grid": [1, 4, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5177, "pid": 5, "tid": 7, "ts": 1716454216579875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542830, "dur": 5, "args": { "External id": 5177, "cbid": 211, "correlation": 5177 } }, { "ph": "s", "id": 5177, "pid": 76337, "tid": -914061504, "ts": 1716454216542830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216579884, "dur": 4, "args": { "External id": 5179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 5179, "pid": 5, "tid": 7, "ts": 1716454216579884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542839, "dur": 5, "args": { "External id": 5179, "cbid": 211, "correlation": 5179 } }, { "ph": "s", "id": 5179, "pid": 76337, "tid": -914061504, "ts": 1716454216542839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216542847, "dur": 0, "args": { "External id": 5180, "cbid": 51, "correlation": 5180 } }, { "ph": "s", "id": 5180, "pid": 76337, "tid": -914061504, "ts": 1716454216542847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216579890, "dur": 434, "args": { "External id": 5181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5181, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [192, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5181, "pid": 5, "tid": 7, "ts": 1716454216579890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542848, "dur": 6, "args": { "External id": 5181, "cbid": 211, "correlation": 5181 } }, { "ph": "s", "id": 5181, "pid": 76337, "tid": -914061504, "ts": 1716454216542848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216580326, "dur": 91, "args": { "External id": 5186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5186, "pid": 5, "tid": 7, "ts": 1716454216580326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542876, "dur": 8, "args": { "External id": 5186, "cbid": 211, "correlation": 5186 } }, { "ph": "s", "id": 5186, "pid": 76337, "tid": -914061504, "ts": 1716454216542876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216580419, "dur": 366, "args": { "External id": 5206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5206, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5206, "pid": 5, "tid": 7, "ts": 1716454216580419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542944, "dur": 12, "args": { "External id": 5206, "cbid": 211, "correlation": 5206 } }, { "ph": "s", "id": 5206, "pid": 76337, "tid": -914061504, "ts": 1716454216542944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216580786, "dur": 4, "args": { "External id": 5218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5218, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5218, "pid": 5, "tid": 7, "ts": 1716454216580786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542965, "dur": 6, "args": { "External id": 5218, "cbid": 211, "correlation": 5218 } }, { "ph": "s", "id": 5218, "pid": 76337, "tid": -914061504, "ts": 1716454216542965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216580792, "dur": 91, "args": { "External id": 5221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5221, "pid": 5, "tid": 7, "ts": 1716454216580792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216542991, "dur": 8, "args": { "External id": 5221, "cbid": 211, "correlation": 5221 } }, { "ph": "s", "id": 5221, "pid": 76337, "tid": -914061504, "ts": 1716454216542991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216580884, "dur": 58, "args": { "External id": 5230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5230, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5230, "pid": 5, "tid": 7, "ts": 1716454216580884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543031, "dur": 10, "args": { "External id": 5230, "cbid": 211, "correlation": 5230 } }, { "ph": "s", "id": 5230, "pid": 76337, "tid": -914061504, "ts": 1716454216543031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216543091, "dur": 0, "args": { "External id": 5240, "cbid": 317, "correlation": 5240 } }, { "ph": "f", "id": 5240, "pid": 76337, "tid": -914061504, "ts": 1716454216543091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216543092, "dur": 0, "args": { "External id": 5241, "cbid": 203, "correlation": 5241 } }, { "ph": "f", "id": 5241, "pid": 76337, "tid": -914061504, "ts": 1716454216543092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216543093, "dur": 0, "args": { "External id": 5242, "cbid": 205, "correlation": 5242 } }, { "ph": "f", "id": 5242, "pid": 76337, "tid": -914061504, "ts": 1716454216543093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216580944, "dur": 101, "args": { "External id": 5246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5246, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5246, "pid": 5, "tid": 7, "ts": 1716454216580944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543106, "dur": 11, "args": { "External id": 5246, "cbid": 211, "correlation": 5246 } }, { "ph": "s", "id": 5246, "pid": 76337, "tid": -914061504, "ts": 1716454216543106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216581046, "dur": 12, "args": { "External id": 5248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5248, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5248, "pid": 5, "tid": 7, "ts": 1716454216581046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543120, "dur": 5, "args": { "External id": 5248, "cbid": 211, "correlation": 5248 } }, { "ph": "s", "id": 5248, "pid": 76337, "tid": -914061504, "ts": 1716454216543120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216581059, "dur": 4, "args": { "External id": 5250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 5250, "pid": 5, "tid": 7, "ts": 1716454216581059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543129, "dur": 5, "args": { "External id": 5250, "cbid": 211, "correlation": 5250 } }, { "ph": "s", "id": 5250, "pid": 76337, "tid": -914061504, "ts": 1716454216543129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216543137, "dur": 0, "args": { "External id": 5251, "cbid": 51, "correlation": 5251 } }, { "ph": "s", "id": 5251, "pid": 76337, "tid": -914061504, "ts": 1716454216543137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216581064, "dur": 774, "args": { "External id": 5252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5252, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5252, "pid": 5, "tid": 7, "ts": 1716454216581064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543138, "dur": 6, "args": { "External id": 5252, "cbid": 211, "correlation": 5252 } }, { "ph": "s", "id": 5252, "pid": 76337, "tid": -914061504, "ts": 1716454216543138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216581839, "dur": 91, "args": { "External id": 5257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5257, "pid": 5, "tid": 7, "ts": 1716454216581839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543166, "dur": 8, "args": { "External id": 5257, "cbid": 211, "correlation": 5257 } }, { "ph": "s", "id": 5257, "pid": 76337, "tid": -914061504, "ts": 1716454216543166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216543223, "dur": 0, "args": { "External id": 5267, "cbid": 317, "correlation": 5267 } }, { "ph": "f", "id": 5267, "pid": 76337, "tid": -914061504, "ts": 1716454216543223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216543223, "dur": 0, "args": { "External id": 5268, "cbid": 203, "correlation": 5268 } }, { "ph": "f", "id": 5268, "pid": 76337, "tid": -914061504, "ts": 1716454216543223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216543224, "dur": 0, "args": { "External id": 5269, "cbid": 205, "correlation": 5269 } }, { "ph": "f", "id": 5269, "pid": 76337, "tid": -914061504, "ts": 1716454216543224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216543254, "dur": 1, "args": { "External id": 5273, "cbid": 251, "correlation": 5273 } }, { "ph": "f", "id": 5273, "pid": 76337, "tid": -914061504, "ts": 1716454216543254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216581932, "dur": 86, "args": { "External id": 5274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5274, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [2, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5274, "pid": 5, "tid": 7, "ts": 1716454216581932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543258, "dur": 12, "args": { "External id": 5274, "cbid": 211, "correlation": 5274 } }, { "ph": "s", "id": 5274, "pid": 76337, "tid": -914061504, "ts": 1716454216543258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216582019, "dur": 91, "args": { "External id": 5280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5280, "pid": 5, "tid": 7, "ts": 1716454216582019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543291, "dur": 9, "args": { "External id": 5280, "cbid": 211, "correlation": 5280 } }, { "ph": "s", "id": 5280, "pid": 76337, "tid": -914061504, "ts": 1716454216543291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216582112, "dur": 253, "args": { "External id": 5288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5288, "pid": 5, "tid": 7, "ts": 1716454216582112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543322, "dur": 8, "args": { "External id": 5288, "cbid": 211, "correlation": 5288 } }, { "ph": "s", "id": 5288, "pid": 76337, "tid": -914061504, "ts": 1716454216543322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216582366, "dur": 54, "args": { "External id": 5296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5296, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5296, "pid": 5, "tid": 7, "ts": 1716454216582366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543350, "dur": 9, "args": { "External id": 5296, "cbid": 211, "correlation": 5296 } }, { "ph": "s", "id": 5296, "pid": 76337, "tid": -914061504, "ts": 1716454216543350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216582421, "dur": 373, "args": { "External id": 5306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5306, "pid": 5, "tid": 7, "ts": 1716454216582421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543419, "dur": 12, "args": { "External id": 5306, "cbid": 211, "correlation": 5306 } }, { "ph": "s", "id": 5306, "pid": 76337, "tid": -914061504, "ts": 1716454216543419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216582796, "dur": 375, "args": { "External id": 5327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5327, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5327, "pid": 5, "tid": 7, "ts": 1716454216582796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543450, "dur": 7, "args": { "External id": 5327, "cbid": 211, "correlation": 5327 } }, { "ph": "s", "id": 5327, "pid": 76337, "tid": -914061504, "ts": 1716454216543450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216583172, "dur": 4, "args": { "External id": 5339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5339, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5339, "pid": 5, "tid": 7, "ts": 1716454216583172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543466, "dur": 6, "args": { "External id": 5339, "cbid": 211, "correlation": 5339 } }, { "ph": "s", "id": 5339, "pid": 76337, "tid": -914061504, "ts": 1716454216543466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216583177, "dur": 90, "args": { "External id": 5342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5342, "pid": 5, "tid": 7, "ts": 1716454216583177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543483, "dur": 7, "args": { "External id": 5342, "cbid": 211, "correlation": 5342 } }, { "ph": "s", "id": 5342, "pid": 76337, "tid": -914061504, "ts": 1716454216543483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216583269, "dur": 57, "args": { "External id": 5351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5351, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5351, "pid": 5, "tid": 7, "ts": 1716454216583269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543521, "dur": 10, "args": { "External id": 5351, "cbid": 211, "correlation": 5351 } }, { "ph": "s", "id": 5351, "pid": 76337, "tid": -914061504, "ts": 1716454216543521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216543571, "dur": 0, "args": { "External id": 5361, "cbid": 317, "correlation": 5361 } }, { "ph": "f", "id": 5361, "pid": 76337, "tid": -914061504, "ts": 1716454216543571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216543571, "dur": 0, "args": { "External id": 5362, "cbid": 203, "correlation": 5362 } }, { "ph": "f", "id": 5362, "pid": 76337, "tid": -914061504, "ts": 1716454216543571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216543572, "dur": 0, "args": { "External id": 5363, "cbid": 205, "correlation": 5363 } }, { "ph": "f", "id": 5363, "pid": 76337, "tid": -914061504, "ts": 1716454216543572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216583327, "dur": 104, "args": { "External id": 5367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5367, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5367, "pid": 5, "tid": 7, "ts": 1716454216583327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543586, "dur": 11, "args": { "External id": 5367, "cbid": 211, "correlation": 5367 } }, { "ph": "s", "id": 5367, "pid": 76337, "tid": -914061504, "ts": 1716454216543586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216583433, "dur": 12, "args": { "External id": 5369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5369, "pid": 5, "tid": 7, "ts": 1716454216583433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543600, "dur": 5, "args": { "External id": 5369, "cbid": 211, "correlation": 5369 } }, { "ph": "s", "id": 5369, "pid": 76337, "tid": -914061504, "ts": 1716454216543600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216583446, "dur": 4, "args": { "External id": 5371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5371, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 5371, "pid": 5, "tid": 7, "ts": 1716454216583446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543609, "dur": 5, "args": { "External id": 5371, "cbid": 211, "correlation": 5371 } }, { "ph": "s", "id": 5371, "pid": 76337, "tid": -914061504, "ts": 1716454216543609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216543617, "dur": 0, "args": { "External id": 5372, "cbid": 51, "correlation": 5372 } }, { "ph": "s", "id": 5372, "pid": 76337, "tid": -914061504, "ts": 1716454216543617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216583451, "dur": 772, "args": { "External id": 5373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5373, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5373, "pid": 5, "tid": 7, "ts": 1716454216583451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543618, "dur": 5, "args": { "External id": 5373, "cbid": 211, "correlation": 5373 } }, { "ph": "s", "id": 5373, "pid": 76337, "tid": -914061504, "ts": 1716454216543618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216584224, "dur": 91, "args": { "External id": 5378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5378, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5378, "pid": 5, "tid": 7, "ts": 1716454216584224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543646, "dur": 8, "args": { "External id": 5378, "cbid": 211, "correlation": 5378 } }, { "ph": "s", "id": 5378, "pid": 76337, "tid": -914061504, "ts": 1716454216543646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216584317, "dur": 364, "args": { "External id": 5398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5398, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5398, "pid": 5, "tid": 7, "ts": 1716454216584317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543712, "dur": 11, "args": { "External id": 5398, "cbid": 211, "correlation": 5398 } }, { "ph": "s", "id": 5398, "pid": 76337, "tid": -914061504, "ts": 1716454216543712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216584683, "dur": 4, "args": { "External id": 5410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5410, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5410, "pid": 5, "tid": 7, "ts": 1716454216584683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543732, "dur": 6, "args": { "External id": 5410, "cbid": 211, "correlation": 5410 } }, { "ph": "s", "id": 5410, "pid": 76337, "tid": -914061504, "ts": 1716454216543732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216584689, "dur": 92, "args": { "External id": 5413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5413, "pid": 5, "tid": 7, "ts": 1716454216584689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543750, "dur": 6, "args": { "External id": 5413, "cbid": 211, "correlation": 5413 } }, { "ph": "s", "id": 5413, "pid": 76337, "tid": -914061504, "ts": 1716454216543750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216584781, "dur": 57, "args": { "External id": 5422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5422, "registers per thread": 24, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5422, "pid": 5, "tid": 7, "ts": 1716454216584781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543787, "dur": 10, "args": { "External id": 5422, "cbid": 211, "correlation": 5422 } }, { "ph": "s", "id": 5422, "pid": 76337, "tid": -914061504, "ts": 1716454216543787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216543847, "dur": 0, "args": { "External id": 5432, "cbid": 317, "correlation": 5432 } }, { "ph": "f", "id": 5432, "pid": 76337, "tid": -914061504, "ts": 1716454216543847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216543848, "dur": 0, "args": { "External id": 5433, "cbid": 203, "correlation": 5433 } }, { "ph": "f", "id": 5433, "pid": 76337, "tid": -914061504, "ts": 1716454216543848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216543849, "dur": 0, "args": { "External id": 5434, "cbid": 205, "correlation": 5434 } }, { "ph": "f", "id": 5434, "pid": 76337, "tid": -914061504, "ts": 1716454216543849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216584840, "dur": 99, "args": { "External id": 5438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 153.6, "warps per SM": 1228.8, "grid": [1536, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5438, "pid": 5, "tid": 7, "ts": 1716454216584840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543862, "dur": 12, "args": { "External id": 5438, "cbid": 211, "correlation": 5438 } }, { "ph": "s", "id": 5438, "pid": 76337, "tid": -914061504, "ts": 1716454216543862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216584940, "dur": 11, "args": { "External id": 5440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5440, "pid": 5, "tid": 7, "ts": 1716454216584940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543876, "dur": 5, "args": { "External id": 5440, "cbid": 211, "correlation": 5440 } }, { "ph": "s", "id": 5440, "pid": 76337, "tid": -914061504, "ts": 1716454216543876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216584953, "dur": 4, "args": { "External id": 5442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 5442, "pid": 5, "tid": 7, "ts": 1716454216584953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543885, "dur": 5, "args": { "External id": 5442, "cbid": 211, "correlation": 5442 } }, { "ph": "s", "id": 5442, "pid": 76337, "tid": -914061504, "ts": 1716454216543885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216543893, "dur": 0, "args": { "External id": 5443, "cbid": 51, "correlation": 5443 } }, { "ph": "s", "id": 5443, "pid": 76337, "tid": -914061504, "ts": 1716454216543893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216584958, "dur": 771, "args": { "External id": 5444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5444, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [384, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5444, "pid": 5, "tid": 7, "ts": 1716454216584958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543893, "dur": 5, "args": { "External id": 5444, "cbid": 211, "correlation": 5444 } }, { "ph": "s", "id": 5444, "pid": 76337, "tid": -914061504, "ts": 1716454216543893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216585731, "dur": 91, "args": { "External id": 5449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5449, "pid": 5, "tid": 7, "ts": 1716454216585731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543921, "dur": 8, "args": { "External id": 5449, "cbid": 211, "correlation": 5449 } }, { "ph": "s", "id": 5449, "pid": 76337, "tid": -914061504, "ts": 1716454216543921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216585824, "dur": 254, "args": { "External id": 5457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5457, "pid": 5, "tid": 7, "ts": 1716454216585824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543953, "dur": 9, "args": { "External id": 5457, "cbid": 211, "correlation": 5457 } }, { "ph": "s", "id": 5457, "pid": 76337, "tid": -914061504, "ts": 1716454216543953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216586079, "dur": 54, "args": { "External id": 5465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5465, "registers per thread": 17, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5465, "pid": 5, "tid": 7, "ts": 1716454216586079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216543992, "dur": 9, "args": { "External id": 5465, "cbid": 211, "correlation": 5465 } }, { "ph": "s", "id": 5465, "pid": 76337, "tid": -914061504, "ts": 1716454216543992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216586134, "dur": 46, "args": { "External id": 5475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 310.0125, "warps per SM": 1240.05, "grid": [24801, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5475, "pid": 5, "tid": 7, "ts": 1716454216586134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544048, "dur": 11, "args": { "External id": 5475, "cbid": 211, "correlation": 5475 } }, { "ph": "s", "id": 5475, "pid": 76337, "tid": -914061504, "ts": 1716454216544048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216586182, "dur": 87, "args": { "External id": 5480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5480, "pid": 5, "tid": 7, "ts": 1716454216586182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544078, "dur": 8, "args": { "External id": 5480, "cbid": 211, "correlation": 5480 } }, { "ph": "s", "id": 5480, "pid": 76337, "tid": -914061504, "ts": 1716454216544078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216586271, "dur": 9, "args": { "External id": 5495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 5495, "pid": 5, "tid": 7, "ts": 1716454216586271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544140, "dur": 12, "args": { "External id": 5495, "cbid": 211, "correlation": 5495 } }, { "ph": "s", "id": 5495, "pid": 76337, "tid": -914061504, "ts": 1716454216544140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216544160, "dur": 0, "args": { "External id": 5502, "cbid": 317, "correlation": 5502 } }, { "ph": "f", "id": 5502, "pid": 76337, "tid": -914061504, "ts": 1716454216544160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216544161, "dur": 0, "args": { "External id": 5503, "cbid": 203, "correlation": 5503 } }, { "ph": "f", "id": 5503, "pid": 76337, "tid": -914061504, "ts": 1716454216544161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216544162, "dur": 0, "args": { "External id": 5504, "cbid": 205, "correlation": 5504 } }, { "ph": "f", "id": 5504, "pid": 76337, "tid": -914061504, "ts": 1716454216544162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216586281, "dur": 314, "args": { "External id": 5508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5508, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [1, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5508, "pid": 5, "tid": 7, "ts": 1716454216586281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544177, "dur": 9, "args": { "External id": 5508, "cbid": 211, "correlation": 5508 } }, { "ph": "s", "id": 5508, "pid": 76337, "tid": -914061504, "ts": 1716454216544177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216586596, "dur": 26, "args": { "External id": 5514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5514, "pid": 5, "tid": 7, "ts": 1716454216586596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544208, "dur": 8, "args": { "External id": 5514, "cbid": 211, "correlation": 5514 } }, { "ph": "s", "id": 5514, "pid": 76337, "tid": -914061504, "ts": 1716454216544208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216586623, "dur": 69, "args": { "External id": 5524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5524, "pid": 5, "tid": 7, "ts": 1716454216586623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544287, "dur": 12, "args": { "External id": 5524, "cbid": 211, "correlation": 5524 } }, { "ph": "s", "id": 5524, "pid": 76337, "tid": -914061504, "ts": 1716454216544287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216586694, "dur": 94, "args": { "External id": 5545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5545, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5545, "pid": 5, "tid": 7, "ts": 1716454216586694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544320, "dur": 7, "args": { "External id": 5545, "cbid": 211, "correlation": 5545 } }, { "ph": "s", "id": 5545, "pid": 76337, "tid": -914061504, "ts": 1716454216544320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216586790, "dur": 5, "args": { "External id": 5557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5557, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5557, "pid": 5, "tid": 7, "ts": 1716454216586790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544336, "dur": 6, "args": { "External id": 5557, "cbid": 211, "correlation": 5557 } }, { "ph": "s", "id": 5557, "pid": 76337, "tid": -914061504, "ts": 1716454216544336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216586796, "dur": 26, "args": { "External id": 5560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5560, "pid": 5, "tid": 7, "ts": 1716454216586796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544353, "dur": 6, "args": { "External id": 5560, "cbid": 211, "correlation": 5560 } }, { "ph": "s", "id": 5560, "pid": 76337, "tid": -914061504, "ts": 1716454216544353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216586824, "dur": 18, "args": { "External id": 5569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5569, "registers per thread": 24, "shared memory": 0, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5569, "pid": 5, "tid": 7, "ts": 1716454216586824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544390, "dur": 10, "args": { "External id": 5569, "cbid": 211, "correlation": 5569 } }, { "ph": "s", "id": 5569, "pid": 76337, "tid": -914061504, "ts": 1716454216544390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216544441, "dur": 0, "args": { "External id": 5579, "cbid": 317, "correlation": 5579 } }, { "ph": "f", "id": 5579, "pid": 76337, "tid": -914061504, "ts": 1716454216544441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216544442, "dur": 0, "args": { "External id": 5580, "cbid": 203, "correlation": 5580 } }, { "ph": "f", "id": 5580, "pid": 76337, "tid": -914061504, "ts": 1716454216544442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216544443, "dur": 0, "args": { "External id": 5581, "cbid": 205, "correlation": 5581 } }, { "ph": "f", "id": 5581, "pid": 76337, "tid": -914061504, "ts": 1716454216544443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216586843, "dur": 20, "args": { "External id": 5585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 38.4, "warps per SM": 307.2, "grid": [384, 8, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5585, "pid": 5, "tid": 7, "ts": 1716454216586843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544456, "dur": 11, "args": { "External id": 5585, "cbid": 211, "correlation": 5585 } }, { "ph": "s", "id": 5585, "pid": 76337, "tid": -914061504, "ts": 1716454216544456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216586865, "dur": 19, "args": { "External id": 5587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 51.2, "warps per SM": 409.6, "grid": [1, 8, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5587, "pid": 5, "tid": 7, "ts": 1716454216586865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544470, "dur": 5, "args": { "External id": 5587, "cbid": 211, "correlation": 5587 } }, { "ph": "s", "id": 5587, "pid": 76337, "tid": -914061504, "ts": 1716454216544470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216586886, "dur": 1, "args": { "External id": 5589, "device": 5, "context": 1, "stream": 7, "correlation": 5589, "bytes": 3072, "memory bandwidth (GB/s)": 1.6 } }, { "ph": "f", "id": 5589, "pid": 5, "tid": 7, "ts": 1716454216586886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216544481, "dur": 7, "args": { "External id": 5589, "cbid": 51, "correlation": 5589 } }, { "ph": "s", "id": 5589, "pid": 76337, "tid": -914061504, "ts": 1716454216544481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216586890, "dur": 465, "args": { "External id": 5590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5590, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [4, 96, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5590, "pid": 5, "tid": 7, "ts": 1716454216586890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544489, "dur": 6, "args": { "External id": 5590, "cbid": 211, "correlation": 5590 } }, { "ph": "s", "id": 5590, "pid": 76337, "tid": -914061504, "ts": 1716454216544489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216587357, "dur": 38, "args": { "External id": 5592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5592, "pid": 5, "tid": 7, "ts": 1716454216587357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544499, "dur": 6, "args": { "External id": 5592, "cbid": 211, "correlation": 5592 } }, { "ph": "s", "id": 5592, "pid": 76337, "tid": -914061504, "ts": 1716454216544499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216587396, "dur": 49, "args": { "External id": 5598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5598, "pid": 5, "tid": 7, "ts": 1716454216587396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544527, "dur": 8, "args": { "External id": 5598, "cbid": 211, "correlation": 5598 } }, { "ph": "s", "id": 5598, "pid": 76337, "tid": -914061504, "ts": 1716454216544527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216587447, "dur": 190, "args": { "External id": 5618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5618, "pid": 5, "tid": 7, "ts": 1716454216587447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544595, "dur": 11, "args": { "External id": 5618, "cbid": 211, "correlation": 5618 } }, { "ph": "s", "id": 5618, "pid": 76337, "tid": -914061504, "ts": 1716454216544595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216587638, "dur": 4, "args": { "External id": 5630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5630, "pid": 5, "tid": 7, "ts": 1716454216587638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544615, "dur": 7, "args": { "External id": 5630, "cbid": 211, "correlation": 5630 } }, { "ph": "s", "id": 5630, "pid": 76337, "tid": -914061504, "ts": 1716454216544615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216587644, "dur": 46, "args": { "External id": 5633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5633, "pid": 5, "tid": 7, "ts": 1716454216587644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544633, "dur": 6, "args": { "External id": 5633, "cbid": 211, "correlation": 5633 } }, { "ph": "s", "id": 5633, "pid": 76337, "tid": -914061504, "ts": 1716454216544633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216587692, "dur": 31, "args": { "External id": 5642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5642, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5642, "pid": 5, "tid": 7, "ts": 1716454216587692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544672, "dur": 11, "args": { "External id": 5642, "cbid": 211, "correlation": 5642 } }, { "ph": "s", "id": 5642, "pid": 76337, "tid": -914061504, "ts": 1716454216544672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216544734, "dur": 0, "args": { "External id": 5652, "cbid": 317, "correlation": 5652 } }, { "ph": "f", "id": 5652, "pid": 76337, "tid": -914061504, "ts": 1716454216544734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216544735, "dur": 0, "args": { "External id": 5653, "cbid": 203, "correlation": 5653 } }, { "ph": "f", "id": 5653, "pid": 76337, "tid": -914061504, "ts": 1716454216544735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216544736, "dur": 0, "args": { "External id": 5654, "cbid": 205, "correlation": 5654 } }, { "ph": "f", "id": 5654, "pid": 76337, "tid": -914061504, "ts": 1716454216544736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216587724, "dur": 34, "args": { "External id": 5658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5658, "pid": 5, "tid": 7, "ts": 1716454216587724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544749, "dur": 12, "args": { "External id": 5658, "cbid": 211, "correlation": 5658 } }, { "ph": "s", "id": 5658, "pid": 76337, "tid": -914061504, "ts": 1716454216544749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216587760, "dur": 32, "args": { "External id": 5660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5660, "pid": 5, "tid": 7, "ts": 1716454216587760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544763, "dur": 5, "args": { "External id": 5660, "cbid": 211, "correlation": 5660 } }, { "ph": "s", "id": 5660, "pid": 76337, "tid": -914061504, "ts": 1716454216544763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216587794, "dur": 1, "args": { "External id": 5662, "device": 5, "context": 1, "stream": 7, "correlation": 5662, "bytes": 3072, "memory bandwidth (GB/s)": 1.5483870967741935 } }, { "ph": "f", "id": 5662, "pid": 5, "tid": 7, "ts": 1716454216587794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216544774, "dur": 6, "args": { "External id": 5662, "cbid": 51, "correlation": 5662 } }, { "ph": "s", "id": 5662, "pid": 76337, "tid": -914061504, "ts": 1716454216544774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216587798, "dur": 884, "args": { "External id": 5663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5663, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5663, "pid": 5, "tid": 7, "ts": 1716454216587798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544781, "dur": 6, "args": { "External id": 5663, "cbid": 211, "correlation": 5663 } }, { "ph": "s", "id": 5663, "pid": 76337, "tid": -914061504, "ts": 1716454216544781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216588684, "dur": 36, "args": { "External id": 5665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5665, "pid": 5, "tid": 7, "ts": 1716454216588684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544792, "dur": 5, "args": { "External id": 5665, "cbid": 211, "correlation": 5665 } }, { "ph": "s", "id": 5665, "pid": 76337, "tid": -914061504, "ts": 1716454216544792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216588722, "dur": 49, "args": { "External id": 5671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5671, "pid": 5, "tid": 7, "ts": 1716454216588722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544820, "dur": 8, "args": { "External id": 5671, "cbid": 211, "correlation": 5671 } }, { "ph": "s", "id": 5671, "pid": 76337, "tid": -914061504, "ts": 1716454216544820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216544877, "dur": 0, "args": { "External id": 5681, "cbid": 317, "correlation": 5681 } }, { "ph": "f", "id": 5681, "pid": 76337, "tid": -914061504, "ts": 1716454216544877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216544878, "dur": 0, "args": { "External id": 5682, "cbid": 203, "correlation": 5682 } }, { "ph": "f", "id": 5682, "pid": 76337, "tid": -914061504, "ts": 1716454216544878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216544879, "dur": 0, "args": { "External id": 5683, "cbid": 205, "correlation": 5683 } }, { "ph": "f", "id": 5683, "pid": 76337, "tid": -914061504, "ts": 1716454216544879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216544906, "dur": 1, "args": { "External id": 5687, "cbid": 251, "correlation": 5687 } }, { "ph": "f", "id": 5687, "pid": 76337, "tid": -914061504, "ts": 1716454216544906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216588773, "dur": 68, "args": { "External id": 5688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5688, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [4, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5688, "pid": 5, "tid": 7, "ts": 1716454216588773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544909, "dur": 12, "args": { "External id": 5688, "cbid": 211, "correlation": 5688 } }, { "ph": "s", "id": 5688, "pid": 76337, "tid": -914061504, "ts": 1716454216544909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216588842, "dur": 47, "args": { "External id": 5694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5694, "pid": 5, "tid": 7, "ts": 1716454216588842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544941, "dur": 9, "args": { "External id": 5694, "cbid": 211, "correlation": 5694 } }, { "ph": "s", "id": 5694, "pid": 76337, "tid": -914061504, "ts": 1716454216544941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216588891, "dur": 140, "args": { "External id": 5702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5702, "pid": 5, "tid": 7, "ts": 1716454216588891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216544972, "dur": 17, "args": { "External id": 5702, "cbid": 211, "correlation": 5702 } }, { "ph": "s", "id": 5702, "pid": 76337, "tid": -914061504, "ts": 1716454216544972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216589032, "dur": 29, "args": { "External id": 5710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5710, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5710, "pid": 5, "tid": 7, "ts": 1716454216589032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545013, "dur": 8, "args": { "External id": 5710, "cbid": 211, "correlation": 5710 } }, { "ph": "s", "id": 5710, "pid": 76337, "tid": -914061504, "ts": 1716454216545013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216589062, "dur": 135, "args": { "External id": 5720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5720, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5720, "pid": 5, "tid": 7, "ts": 1716454216589062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545083, "dur": 12, "args": { "External id": 5720, "cbid": 211, "correlation": 5720 } }, { "ph": "s", "id": 5720, "pid": 76337, "tid": -914061504, "ts": 1716454216545083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216589199, "dur": 191, "args": { "External id": 5741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5741, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5741, "pid": 5, "tid": 7, "ts": 1716454216589199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545115, "dur": 7, "args": { "External id": 5741, "cbid": 211, "correlation": 5741 } }, { "ph": "s", "id": 5741, "pid": 76337, "tid": -914061504, "ts": 1716454216545115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216589392, "dur": 5, "args": { "External id": 5753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5753, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5753, "pid": 5, "tid": 7, "ts": 1716454216589392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545132, "dur": 6, "args": { "External id": 5753, "cbid": 211, "correlation": 5753 } }, { "ph": "s", "id": 5753, "pid": 76337, "tid": -914061504, "ts": 1716454216545132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216589397, "dur": 48, "args": { "External id": 5756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5756, "pid": 5, "tid": 7, "ts": 1716454216589397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545148, "dur": 7, "args": { "External id": 5756, "cbid": 211, "correlation": 5756 } }, { "ph": "s", "id": 5756, "pid": 76337, "tid": -914061504, "ts": 1716454216545148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216589447, "dur": 30, "args": { "External id": 5765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5765, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5765, "pid": 5, "tid": 7, "ts": 1716454216589447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545188, "dur": 9, "args": { "External id": 5765, "cbid": 211, "correlation": 5765 } }, { "ph": "s", "id": 5765, "pid": 76337, "tid": -914061504, "ts": 1716454216545188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216545239, "dur": 0, "args": { "External id": 5775, "cbid": 317, "correlation": 5775 } }, { "ph": "f", "id": 5775, "pid": 76337, "tid": -914061504, "ts": 1716454216545239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216545240, "dur": 0, "args": { "External id": 5776, "cbid": 203, "correlation": 5776 } }, { "ph": "f", "id": 5776, "pid": 76337, "tid": -914061504, "ts": 1716454216545240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216545241, "dur": 0, "args": { "External id": 5777, "cbid": 205, "correlation": 5777 } }, { "ph": "f", "id": 5777, "pid": 76337, "tid": -914061504, "ts": 1716454216545241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216589478, "dur": 33, "args": { "External id": 5781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5781, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5781, "pid": 5, "tid": 7, "ts": 1716454216589478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545254, "dur": 12, "args": { "External id": 5781, "cbid": 211, "correlation": 5781 } }, { "ph": "s", "id": 5781, "pid": 76337, "tid": -914061504, "ts": 1716454216545254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216589513, "dur": 32, "args": { "External id": 5783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5783, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5783, "pid": 5, "tid": 7, "ts": 1716454216589513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545269, "dur": 5, "args": { "External id": 5783, "cbid": 211, "correlation": 5783 } }, { "ph": "s", "id": 5783, "pid": 76337, "tid": -914061504, "ts": 1716454216545269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216589548, "dur": 1, "args": { "External id": 5785, "device": 5, "context": 1, "stream": 7, "correlation": 5785, "bytes": 3072, "memory bandwidth (GB/s)": 1.5737704918032787 } }, { "ph": "f", "id": 5785, "pid": 5, "tid": 7, "ts": 1716454216589548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216545279, "dur": 7, "args": { "External id": 5785, "cbid": 51, "correlation": 5785 } }, { "ph": "s", "id": 5785, "pid": 76337, "tid": -914061504, "ts": 1716454216545279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216589552, "dur": 894, "args": { "External id": 5786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5786, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5786, "pid": 5, "tid": 7, "ts": 1716454216589552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545287, "dur": 6, "args": { "External id": 5786, "cbid": 211, "correlation": 5786 } }, { "ph": "s", "id": 5786, "pid": 76337, "tid": -914061504, "ts": 1716454216545287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216590447, "dur": 38, "args": { "External id": 5788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5788, "pid": 5, "tid": 7, "ts": 1716454216590447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545296, "dur": 5, "args": { "External id": 5788, "cbid": 211, "correlation": 5788 } }, { "ph": "s", "id": 5788, "pid": 76337, "tid": -914061504, "ts": 1716454216545296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216590486, "dur": 49, "args": { "External id": 5794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5794, "pid": 5, "tid": 7, "ts": 1716454216590486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545325, "dur": 8, "args": { "External id": 5794, "cbid": 211, "correlation": 5794 } }, { "ph": "s", "id": 5794, "pid": 76337, "tid": -914061504, "ts": 1716454216545325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216590537, "dur": 189, "args": { "External id": 5814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5814, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5814, "pid": 5, "tid": 7, "ts": 1716454216590537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545392, "dur": 11, "args": { "External id": 5814, "cbid": 211, "correlation": 5814 } }, { "ph": "s", "id": 5814, "pid": 76337, "tid": -914061504, "ts": 1716454216545392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216590728, "dur": 4, "args": { "External id": 5826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5826, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5826, "pid": 5, "tid": 7, "ts": 1716454216590728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545412, "dur": 6, "args": { "External id": 5826, "cbid": 211, "correlation": 5826 } }, { "ph": "s", "id": 5826, "pid": 76337, "tid": -914061504, "ts": 1716454216545412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216590733, "dur": 47, "args": { "External id": 5829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5829, "pid": 5, "tid": 7, "ts": 1716454216590733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545430, "dur": 6, "args": { "External id": 5829, "cbid": 211, "correlation": 5829 } }, { "ph": "s", "id": 5829, "pid": 76337, "tid": -914061504, "ts": 1716454216545430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216590782, "dur": 31, "args": { "External id": 5838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5838, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5838, "pid": 5, "tid": 7, "ts": 1716454216590782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545469, "dur": 10, "args": { "External id": 5838, "cbid": 211, "correlation": 5838 } }, { "ph": "s", "id": 5838, "pid": 76337, "tid": -914061504, "ts": 1716454216545469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216545530, "dur": 0, "args": { "External id": 5848, "cbid": 317, "correlation": 5848 } }, { "ph": "f", "id": 5848, "pid": 76337, "tid": -914061504, "ts": 1716454216545530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216545531, "dur": 0, "args": { "External id": 5849, "cbid": 203, "correlation": 5849 } }, { "ph": "f", "id": 5849, "pid": 76337, "tid": -914061504, "ts": 1716454216545531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216545532, "dur": 0, "args": { "External id": 5850, "cbid": 205, "correlation": 5850 } }, { "ph": "f", "id": 5850, "pid": 76337, "tid": -914061504, "ts": 1716454216545532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216590814, "dur": 34, "args": { "External id": 5854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5854, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5854, "pid": 5, "tid": 7, "ts": 1716454216590814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545545, "dur": 12, "args": { "External id": 5854, "cbid": 211, "correlation": 5854 } }, { "ph": "s", "id": 5854, "pid": 76337, "tid": -914061504, "ts": 1716454216545545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216590849, "dur": 33, "args": { "External id": 5856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5856, "pid": 5, "tid": 7, "ts": 1716454216590849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545559, "dur": 5, "args": { "External id": 5856, "cbid": 211, "correlation": 5856 } }, { "ph": "s", "id": 5856, "pid": 76337, "tid": -914061504, "ts": 1716454216545559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216590884, "dur": 1, "args": { "External id": 5858, "device": 5, "context": 1, "stream": 7, "correlation": 5858, "bytes": 3072, "memory bandwidth (GB/s)": 1.5737704918032787 } }, { "ph": "f", "id": 5858, "pid": 5, "tid": 7, "ts": 1716454216590884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216545569, "dur": 6, "args": { "External id": 5858, "cbid": 51, "correlation": 5858 } }, { "ph": "s", "id": 5858, "pid": 76337, "tid": -914061504, "ts": 1716454216545569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216590888, "dur": 890, "args": { "External id": 5859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5859, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 96, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 5859, "pid": 5, "tid": 7, "ts": 1716454216590888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545577, "dur": 7, "args": { "External id": 5859, "cbid": 211, "correlation": 5859 } }, { "ph": "s", "id": 5859, "pid": 76337, "tid": -914061504, "ts": 1716454216545577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216591779, "dur": 37, "args": { "External id": 5861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5861, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [384, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5861, "pid": 5, "tid": 7, "ts": 1716454216591779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545587, "dur": 5, "args": { "External id": 5861, "cbid": 211, "correlation": 5861 } }, { "ph": "s", "id": 5861, "pid": 76337, "tid": -914061504, "ts": 1716454216545587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216591818, "dur": 49, "args": { "External id": 5867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5867, "pid": 5, "tid": 7, "ts": 1716454216591818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545614, "dur": 8, "args": { "External id": 5867, "cbid": 211, "correlation": 5867 } }, { "ph": "s", "id": 5867, "pid": 76337, "tid": -914061504, "ts": 1716454216545614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216591868, "dur": 141, "args": { "External id": 5875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5875, "pid": 5, "tid": 7, "ts": 1716454216591868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545649, "dur": 9, "args": { "External id": 5875, "cbid": 211, "correlation": 5875 } }, { "ph": "s", "id": 5875, "pid": 76337, "tid": -914061504, "ts": 1716454216545649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216592010, "dur": 28, "args": { "External id": 5883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5883, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5883, "pid": 5, "tid": 7, "ts": 1716454216592010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545677, "dur": 8, "args": { "External id": 5883, "cbid": 211, "correlation": 5883 } }, { "ph": "s", "id": 5883, "pid": 76337, "tid": -914061504, "ts": 1716454216545677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216592039, "dur": 25, "args": { "External id": 5893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 156.4125, "warps per SM": 625.65, "grid": [12513, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5893, "pid": 5, "tid": 7, "ts": 1716454216592039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545731, "dur": 11, "args": { "External id": 5893, "cbid": 211, "correlation": 5893 } }, { "ph": "s", "id": 5893, "pid": 76337, "tid": -914061504, "ts": 1716454216545731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592065, "dur": 47, "args": { "External id": 5898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5898, "pid": 5, "tid": 7, "ts": 1716454216592065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545761, "dur": 8, "args": { "External id": 5898, "cbid": 211, "correlation": 5898 } }, { "ph": "s", "id": 5898, "pid": 76337, "tid": -914061504, "ts": 1716454216545761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592113, "dur": 25, "args": { "External id": 5913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5913, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5913, "pid": 5, "tid": 7, "ts": 1716454216592113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545823, "dur": 12, "args": { "External id": 5913, "cbid": 211, "correlation": 5913 } }, { "ph": "s", "id": 5913, "pid": 76337, "tid": -914061504, "ts": 1716454216545823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216545842, "dur": 0, "args": { "External id": 5920, "cbid": 317, "correlation": 5920 } }, { "ph": "f", "id": 5920, "pid": 76337, "tid": -914061504, "ts": 1716454216545842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216545843, "dur": 0, "args": { "External id": 5921, "cbid": 203, "correlation": 5921 } }, { "ph": "f", "id": 5921, "pid": 76337, "tid": -914061504, "ts": 1716454216545843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216545844, "dur": 0, "args": { "External id": 5922, "cbid": 205, "correlation": 5922 } }, { "ph": "f", "id": 5922, "pid": 76337, "tid": -914061504, "ts": 1716454216545844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216592139, "dur": 312, "args": { "External id": 5926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5926, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [4, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 5926, "pid": 5, "tid": 7, "ts": 1716454216592139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545859, "dur": 8, "args": { "External id": 5926, "cbid": 211, "correlation": 5926 } }, { "ph": "s", "id": 5926, "pid": 76337, "tid": -914061504, "ts": 1716454216545859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592453, "dur": 14, "args": { "External id": 5932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5932, "pid": 5, "tid": 7, "ts": 1716454216592453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545889, "dur": 8, "args": { "External id": 5932, "cbid": 211, "correlation": 5932 } }, { "ph": "s", "id": 5932, "pid": 76337, "tid": -914061504, "ts": 1716454216545889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592468, "dur": 37, "args": { "External id": 5942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5942, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5942, "pid": 5, "tid": 7, "ts": 1716454216592468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216545969, "dur": 20, "args": { "External id": 5942, "cbid": 211, "correlation": 5942 } }, { "ph": "s", "id": 5942, "pid": 76337, "tid": -914061504, "ts": 1716454216545969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216592507, "dur": 49, "args": { "External id": 5963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5963, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 5963, "pid": 5, "tid": 7, "ts": 1716454216592507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546010, "dur": 8, "args": { "External id": 5963, "cbid": 211, "correlation": 5963 } }, { "ph": "s", "id": 5963, "pid": 76337, "tid": -914061504, "ts": 1716454216546010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216592557, "dur": 4, "args": { "External id": 5975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5975, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 5975, "pid": 5, "tid": 7, "ts": 1716454216592557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546028, "dur": 6, "args": { "External id": 5975, "cbid": 211, "correlation": 5975 } }, { "ph": "s", "id": 5975, "pid": 76337, "tid": -914061504, "ts": 1716454216546028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592562, "dur": 15, "args": { "External id": 5978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5978, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5978, "pid": 5, "tid": 7, "ts": 1716454216592562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546044, "dur": 6, "args": { "External id": 5978, "cbid": 211, "correlation": 5978 } }, { "ph": "s", "id": 5978, "pid": 76337, "tid": -914061504, "ts": 1716454216546044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216592579, "dur": 9, "args": { "External id": 5987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 5987, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 5987, "pid": 5, "tid": 7, "ts": 1716454216592579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546085, "dur": 9, "args": { "External id": 5987, "cbid": 211, "correlation": 5987 } }, { "ph": "s", "id": 5987, "pid": 76337, "tid": -914061504, "ts": 1716454216546085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216546136, "dur": 0, "args": { "External id": 5997, "cbid": 317, "correlation": 5997 } }, { "ph": "f", "id": 5997, "pid": 76337, "tid": -914061504, "ts": 1716454216546136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216546136, "dur": 0, "args": { "External id": 5998, "cbid": 203, "correlation": 5998 } }, { "ph": "f", "id": 5998, "pid": 76337, "tid": -914061504, "ts": 1716454216546136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216546137, "dur": 0, "args": { "External id": 5999, "cbid": 205, "correlation": 5999 } }, { "ph": "f", "id": 5999, "pid": 76337, "tid": -914061504, "ts": 1716454216546137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216592590, "dur": 11, "args": { "External id": 6003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6003, "pid": 5, "tid": 7, "ts": 1716454216592590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546150, "dur": 12, "args": { "External id": 6003, "cbid": 211, "correlation": 6003 } }, { "ph": "s", "id": 6003, "pid": 76337, "tid": -914061504, "ts": 1716454216546150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216592602, "dur": 33, "args": { "External id": 6005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6005, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6005, "pid": 5, "tid": 7, "ts": 1716454216592602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546164, "dur": 5, "args": { "External id": 6005, "cbid": 211, "correlation": 6005 } }, { "ph": "s", "id": 6005, "pid": 76337, "tid": -914061504, "ts": 1716454216546164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216592637, "dur": 1, "args": { "External id": 6007, "device": 5, "context": 1, "stream": 7, "correlation": 6007, "bytes": 768, "memory bandwidth (GB/s)": 0.4525633470830878 } }, { "ph": "f", "id": 6007, "pid": 5, "tid": 7, "ts": 1716454216592637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216546175, "dur": 7, "args": { "External id": 6007, "cbid": 51, "correlation": 6007 } }, { "ph": "s", "id": 6007, "pid": 76337, "tid": -914061504, "ts": 1716454216546175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216592641, "dur": 270, "args": { "External id": 6008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6008, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6008, "pid": 5, "tid": 7, "ts": 1716454216592641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546184, "dur": 6, "args": { "External id": 6008, "cbid": 211, "correlation": 6008 } }, { "ph": "s", "id": 6008, "pid": 76337, "tid": -914061504, "ts": 1716454216546184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216592913, "dur": 11, "args": { "External id": 6010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6010, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6010, "pid": 5, "tid": 7, "ts": 1716454216592913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546193, "dur": 5, "args": { "External id": 6010, "cbid": 211, "correlation": 6010 } }, { "ph": "s", "id": 6010, "pid": 76337, "tid": -914061504, "ts": 1716454216546193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592926, "dur": 13, "args": { "External id": 6016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6016, "pid": 5, "tid": 7, "ts": 1716454216592926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546220, "dur": 9, "args": { "External id": 6016, "cbid": 211, "correlation": 6016 } }, { "ph": "s", "id": 6016, "pid": 76337, "tid": -914061504, "ts": 1716454216546220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216592940, "dur": 30, "args": { "External id": 6036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6036, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6036, "pid": 5, "tid": 7, "ts": 1716454216592940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546289, "dur": 11, "args": { "External id": 6036, "cbid": 211, "correlation": 6036 } }, { "ph": "s", "id": 6036, "pid": 76337, "tid": -914061504, "ts": 1716454216546289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216592971, "dur": 4, "args": { "External id": 6048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6048, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6048, "pid": 5, "tid": 7, "ts": 1716454216592971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546310, "dur": 6, "args": { "External id": 6048, "cbid": 211, "correlation": 6048 } }, { "ph": "s", "id": 6048, "pid": 76337, "tid": -914061504, "ts": 1716454216546310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216592977, "dur": 13, "args": { "External id": 6051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6051, "pid": 5, "tid": 7, "ts": 1716454216592977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546326, "dur": 7, "args": { "External id": 6051, "cbid": 211, "correlation": 6051 } }, { "ph": "s", "id": 6051, "pid": 76337, "tid": -914061504, "ts": 1716454216546326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216592991, "dur": 9, "args": { "External id": 6060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6060, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6060, "pid": 5, "tid": 7, "ts": 1716454216592991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546367, "dur": 10, "args": { "External id": 6060, "cbid": 211, "correlation": 6060 } }, { "ph": "s", "id": 6060, "pid": 76337, "tid": -914061504, "ts": 1716454216546367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216546427, "dur": 0, "args": { "External id": 6070, "cbid": 317, "correlation": 6070 } }, { "ph": "f", "id": 6070, "pid": 76337, "tid": -914061504, "ts": 1716454216546427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216546428, "dur": 0, "args": { "External id": 6071, "cbid": 203, "correlation": 6071 } }, { "ph": "f", "id": 6071, "pid": 76337, "tid": -914061504, "ts": 1716454216546428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216546429, "dur": 0, "args": { "External id": 6072, "cbid": 205, "correlation": 6072 } }, { "ph": "f", "id": 6072, "pid": 76337, "tid": -914061504, "ts": 1716454216546429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593002, "dur": 9, "args": { "External id": 6076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6076, "pid": 5, "tid": 7, "ts": 1716454216593002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546442, "dur": 13, "args": { "External id": 6076, "cbid": 211, "correlation": 6076 } }, { "ph": "s", "id": 6076, "pid": 76337, "tid": -914061504, "ts": 1716454216546442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593012, "dur": 32, "args": { "External id": 6078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6078, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6078, "pid": 5, "tid": 7, "ts": 1716454216593012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546457, "dur": 5, "args": { "External id": 6078, "cbid": 211, "correlation": 6078 } }, { "ph": "s", "id": 6078, "pid": 76337, "tid": -914061504, "ts": 1716454216546457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216593047, "dur": 1, "args": { "External id": 6080, "device": 5, "context": 1, "stream": 7, "correlation": 6080, "bytes": 768, "memory bandwidth (GB/s)": 0.4528301886792453 } }, { "ph": "f", "id": 6080, "pid": 5, "tid": 7, "ts": 1716454216593047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216546468, "dur": 6, "args": { "External id": 6080, "cbid": 51, "correlation": 6080 } }, { "ph": "s", "id": 6080, "pid": 76337, "tid": -914061504, "ts": 1716454216546468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216593051, "dur": 264, "args": { "External id": 6081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6081, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6081, "pid": 5, "tid": 7, "ts": 1716454216593051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546475, "dur": 6, "args": { "External id": 6081, "cbid": 211, "correlation": 6081 } }, { "ph": "s", "id": 6081, "pid": 76337, "tid": -914061504, "ts": 1716454216546475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593317, "dur": 11, "args": { "External id": 6083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6083, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6083, "pid": 5, "tid": 7, "ts": 1716454216593317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546484, "dur": 5, "args": { "External id": 6083, "cbid": 211, "correlation": 6083 } }, { "ph": "s", "id": 6083, "pid": 76337, "tid": -914061504, "ts": 1716454216546484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593329, "dur": 12, "args": { "External id": 6089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6089, "pid": 5, "tid": 7, "ts": 1716454216593329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546512, "dur": 9, "args": { "External id": 6089, "cbid": 211, "correlation": 6089 } }, { "ph": "s", "id": 6089, "pid": 76337, "tid": -914061504, "ts": 1716454216546512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593342, "dur": 39, "args": { "External id": 6097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6097, "pid": 5, "tid": 7, "ts": 1716454216593342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546546, "dur": 9, "args": { "External id": 6097, "cbid": 211, "correlation": 6097 } }, { "ph": "s", "id": 6097, "pid": 76337, "tid": -914061504, "ts": 1716454216546546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216593382, "dur": 10, "args": { "External id": 6105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6105, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6105, "pid": 5, "tid": 7, "ts": 1716454216593382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546575, "dur": 8, "args": { "External id": 6105, "cbid": 211, "correlation": 6105 } }, { "ph": "s", "id": 6105, "pid": 76337, "tid": -914061504, "ts": 1716454216546575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593393, "dur": 38, "args": { "External id": 6115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6115, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6115, "pid": 5, "tid": 7, "ts": 1716454216593393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546643, "dur": 12, "args": { "External id": 6115, "cbid": 211, "correlation": 6115 } }, { "ph": "s", "id": 6115, "pid": 76337, "tid": -914061504, "ts": 1716454216546643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216593433, "dur": 49, "args": { "External id": 6136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6136, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6136, "pid": 5, "tid": 7, "ts": 1716454216593433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546676, "dur": 7, "args": { "External id": 6136, "cbid": 211, "correlation": 6136 } }, { "ph": "s", "id": 6136, "pid": 76337, "tid": -914061504, "ts": 1716454216546676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216593483, "dur": 4, "args": { "External id": 6148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6148, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6148, "pid": 5, "tid": 7, "ts": 1716454216593483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546692, "dur": 6, "args": { "External id": 6148, "cbid": 211, "correlation": 6148 } }, { "ph": "s", "id": 6148, "pid": 76337, "tid": -914061504, "ts": 1716454216546692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593489, "dur": 14, "args": { "External id": 6151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6151, "pid": 5, "tid": 7, "ts": 1716454216593489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546708, "dur": 6, "args": { "External id": 6151, "cbid": 211, "correlation": 6151 } }, { "ph": "s", "id": 6151, "pid": 76337, "tid": -914061504, "ts": 1716454216546708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216593504, "dur": 9, "args": { "External id": 6160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6160, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6160, "pid": 5, "tid": 7, "ts": 1716454216593504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546746, "dur": 9, "args": { "External id": 6160, "cbid": 211, "correlation": 6160 } }, { "ph": "s", "id": 6160, "pid": 76337, "tid": -914061504, "ts": 1716454216546746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216546796, "dur": 0, "args": { "External id": 6170, "cbid": 317, "correlation": 6170 } }, { "ph": "f", "id": 6170, "pid": 76337, "tid": -914061504, "ts": 1716454216546796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216546797, "dur": 0, "args": { "External id": 6171, "cbid": 203, "correlation": 6171 } }, { "ph": "f", "id": 6171, "pid": 76337, "tid": -914061504, "ts": 1716454216546797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216546797, "dur": 0, "args": { "External id": 6172, "cbid": 205, "correlation": 6172 } }, { "ph": "f", "id": 6172, "pid": 76337, "tid": -914061504, "ts": 1716454216546797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593515, "dur": 10, "args": { "External id": 6176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6176, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6176, "pid": 5, "tid": 7, "ts": 1716454216593515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546810, "dur": 12, "args": { "External id": 6176, "cbid": 211, "correlation": 6176 } }, { "ph": "s", "id": 6176, "pid": 76337, "tid": -914061504, "ts": 1716454216546810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593526, "dur": 32, "args": { "External id": 6178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6178, "pid": 5, "tid": 7, "ts": 1716454216593526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546824, "dur": 5, "args": { "External id": 6178, "cbid": 211, "correlation": 6178 } }, { "ph": "s", "id": 6178, "pid": 76337, "tid": -914061504, "ts": 1716454216546824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216593560, "dur": 1, "args": { "External id": 6180, "device": 5, "context": 1, "stream": 7, "correlation": 6180, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6180, "pid": 5, "tid": 7, "ts": 1716454216593560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216546835, "dur": 6, "args": { "External id": 6180, "cbid": 51, "correlation": 6180 } }, { "ph": "s", "id": 6180, "pid": 76337, "tid": -914061504, "ts": 1716454216546835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216593564, "dur": 314, "args": { "External id": 6181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6181, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6181, "pid": 5, "tid": 7, "ts": 1716454216593564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546842, "dur": 6, "args": { "External id": 6181, "cbid": 211, "correlation": 6181 } }, { "ph": "s", "id": 6181, "pid": 76337, "tid": -914061504, "ts": 1716454216546842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593880, "dur": 10, "args": { "External id": 6183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6183, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6183, "pid": 5, "tid": 7, "ts": 1716454216593880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546852, "dur": 5, "args": { "External id": 6183, "cbid": 211, "correlation": 6183 } }, { "ph": "s", "id": 6183, "pid": 76337, "tid": -914061504, "ts": 1716454216546852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593891, "dur": 12, "args": { "External id": 6189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6189, "pid": 5, "tid": 7, "ts": 1716454216593891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546879, "dur": 9, "args": { "External id": 6189, "cbid": 211, "correlation": 6189 } }, { "ph": "s", "id": 6189, "pid": 76337, "tid": -914061504, "ts": 1716454216546879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216593905, "dur": 30, "args": { "External id": 6209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6209, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6209, "pid": 5, "tid": 7, "ts": 1716454216593905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546950, "dur": 11, "args": { "External id": 6209, "cbid": 211, "correlation": 6209 } }, { "ph": "s", "id": 6209, "pid": 76337, "tid": -914061504, "ts": 1716454216546950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216593937, "dur": 4, "args": { "External id": 6221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6221, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6221, "pid": 5, "tid": 7, "ts": 1716454216593937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546971, "dur": 16, "args": { "External id": 6221, "cbid": 211, "correlation": 6221 } }, { "ph": "s", "id": 6221, "pid": 76337, "tid": -914061504, "ts": 1716454216546971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216593942, "dur": 13, "args": { "External id": 6224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6224, "pid": 5, "tid": 7, "ts": 1716454216593942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216546999, "dur": 7, "args": { "External id": 6224, "cbid": 211, "correlation": 6224 } }, { "ph": "s", "id": 6224, "pid": 76337, "tid": -914061504, "ts": 1716454216546999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216593957, "dur": 9, "args": { "External id": 6233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6233, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6233, "pid": 5, "tid": 7, "ts": 1716454216593957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547040, "dur": 10, "args": { "External id": 6233, "cbid": 211, "correlation": 6233 } }, { "ph": "s", "id": 6233, "pid": 76337, "tid": -914061504, "ts": 1716454216547040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216547101, "dur": 0, "args": { "External id": 6243, "cbid": 317, "correlation": 6243 } }, { "ph": "f", "id": 6243, "pid": 76337, "tid": -914061504, "ts": 1716454216547101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216547102, "dur": 0, "args": { "External id": 6244, "cbid": 203, "correlation": 6244 } }, { "ph": "f", "id": 6244, "pid": 76337, "tid": -914061504, "ts": 1716454216547102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216547102, "dur": 0, "args": { "External id": 6245, "cbid": 205, "correlation": 6245 } }, { "ph": "f", "id": 6245, "pid": 76337, "tid": -914061504, "ts": 1716454216547102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593967, "dur": 9, "args": { "External id": 6249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6249, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6249, "pid": 5, "tid": 7, "ts": 1716454216593967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547116, "dur": 11, "args": { "External id": 6249, "cbid": 211, "correlation": 6249 } }, { "ph": "s", "id": 6249, "pid": 76337, "tid": -914061504, "ts": 1716454216547116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216593978, "dur": 32, "args": { "External id": 6251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6251, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6251, "pid": 5, "tid": 7, "ts": 1716454216593978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547129, "dur": 5, "args": { "External id": 6251, "cbid": 211, "correlation": 6251 } }, { "ph": "s", "id": 6251, "pid": 76337, "tid": -914061504, "ts": 1716454216547129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216594012, "dur": 1, "args": { "External id": 6253, "device": 5, "context": 1, "stream": 7, "correlation": 6253, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6253, "pid": 5, "tid": 7, "ts": 1716454216594012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216547140, "dur": 6, "args": { "External id": 6253, "cbid": 51, "correlation": 6253 } }, { "ph": "s", "id": 6253, "pid": 76337, "tid": -914061504, "ts": 1716454216547140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216594016, "dur": 266, "args": { "External id": 6254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6254, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6254, "pid": 5, "tid": 7, "ts": 1716454216594016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547147, "dur": 6, "args": { "External id": 6254, "cbid": 211, "correlation": 6254 } }, { "ph": "s", "id": 6254, "pid": 76337, "tid": -914061504, "ts": 1716454216547147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594283, "dur": 10, "args": { "External id": 6256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6256, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6256, "pid": 5, "tid": 7, "ts": 1716454216594283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547157, "dur": 6, "args": { "External id": 6256, "cbid": 211, "correlation": 6256 } }, { "ph": "s", "id": 6256, "pid": 76337, "tid": -914061504, "ts": 1716454216547157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594295, "dur": 12, "args": { "External id": 6262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6262, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6262, "pid": 5, "tid": 7, "ts": 1716454216594295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547185, "dur": 8, "args": { "External id": 6262, "cbid": 211, "correlation": 6262 } }, { "ph": "s", "id": 6262, "pid": 76337, "tid": -914061504, "ts": 1716454216547185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594308, "dur": 39, "args": { "External id": 6270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6270, "pid": 5, "tid": 7, "ts": 1716454216594308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547219, "dur": 9, "args": { "External id": 6270, "cbid": 211, "correlation": 6270 } }, { "ph": "s", "id": 6270, "pid": 76337, "tid": -914061504, "ts": 1716454216547219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216594348, "dur": 9, "args": { "External id": 6278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6278, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6278, "pid": 5, "tid": 7, "ts": 1716454216594348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547248, "dur": 8, "args": { "External id": 6278, "cbid": 211, "correlation": 6278 } }, { "ph": "s", "id": 6278, "pid": 76337, "tid": -914061504, "ts": 1716454216547248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594359, "dur": 39, "args": { "External id": 6288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6288, "pid": 5, "tid": 7, "ts": 1716454216594359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547331, "dur": 13, "args": { "External id": 6288, "cbid": 211, "correlation": 6288 } }, { "ph": "s", "id": 6288, "pid": 76337, "tid": -914061504, "ts": 1716454216547331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216594399, "dur": 34, "args": { "External id": 6309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6309, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6309, "pid": 5, "tid": 7, "ts": 1716454216594399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547364, "dur": 8, "args": { "External id": 6309, "cbid": 211, "correlation": 6309 } }, { "ph": "s", "id": 6309, "pid": 76337, "tid": -914061504, "ts": 1716454216547364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216594435, "dur": 4, "args": { "External id": 6321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6321, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6321, "pid": 5, "tid": 7, "ts": 1716454216594435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547381, "dur": 6, "args": { "External id": 6321, "cbid": 211, "correlation": 6321 } }, { "ph": "s", "id": 6321, "pid": 76337, "tid": -914061504, "ts": 1716454216547381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594441, "dur": 14, "args": { "External id": 6324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6324, "pid": 5, "tid": 7, "ts": 1716454216594441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547397, "dur": 6, "args": { "External id": 6324, "cbid": 211, "correlation": 6324 } }, { "ph": "s", "id": 6324, "pid": 76337, "tid": -914061504, "ts": 1716454216547397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216594456, "dur": 9, "args": { "External id": 6333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6333, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6333, "pid": 5, "tid": 7, "ts": 1716454216594456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547437, "dur": 9, "args": { "External id": 6333, "cbid": 211, "correlation": 6333 } }, { "ph": "s", "id": 6333, "pid": 76337, "tid": -914061504, "ts": 1716454216547437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216547488, "dur": 0, "args": { "External id": 6343, "cbid": 317, "correlation": 6343 } }, { "ph": "f", "id": 6343, "pid": 76337, "tid": -914061504, "ts": 1716454216547488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216547489, "dur": 0, "args": { "External id": 6344, "cbid": 203, "correlation": 6344 } }, { "ph": "f", "id": 6344, "pid": 76337, "tid": -914061504, "ts": 1716454216547489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216547489, "dur": 0, "args": { "External id": 6345, "cbid": 205, "correlation": 6345 } }, { "ph": "f", "id": 6345, "pid": 76337, "tid": -914061504, "ts": 1716454216547489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594466, "dur": 8, "args": { "External id": 6349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6349, "pid": 5, "tid": 7, "ts": 1716454216594466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547502, "dur": 11, "args": { "External id": 6349, "cbid": 211, "correlation": 6349 } }, { "ph": "s", "id": 6349, "pid": 76337, "tid": -914061504, "ts": 1716454216547502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594476, "dur": 32, "args": { "External id": 6351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6351, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6351, "pid": 5, "tid": 7, "ts": 1716454216594476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547515, "dur": 5, "args": { "External id": 6351, "cbid": 211, "correlation": 6351 } }, { "ph": "s", "id": 6351, "pid": 76337, "tid": -914061504, "ts": 1716454216547515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216594511, "dur": 1, "args": { "External id": 6353, "device": 5, "context": 1, "stream": 7, "correlation": 6353, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6353, "pid": 5, "tid": 7, "ts": 1716454216594511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216547526, "dur": 7, "args": { "External id": 6353, "cbid": 51, "correlation": 6353 } }, { "ph": "s", "id": 6353, "pid": 76337, "tid": -914061504, "ts": 1716454216547526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216594515, "dur": 266, "args": { "External id": 6354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6354, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6354, "pid": 5, "tid": 7, "ts": 1716454216594515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547534, "dur": 6, "args": { "External id": 6354, "cbid": 211, "correlation": 6354 } }, { "ph": "s", "id": 6354, "pid": 76337, "tid": -914061504, "ts": 1716454216547534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594782, "dur": 10, "args": { "External id": 6356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6356, "pid": 5, "tid": 7, "ts": 1716454216594782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547544, "dur": 5, "args": { "External id": 6356, "cbid": 211, "correlation": 6356 } }, { "ph": "s", "id": 6356, "pid": 76337, "tid": -914061504, "ts": 1716454216547544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594793, "dur": 13, "args": { "External id": 6362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6362, "pid": 5, "tid": 7, "ts": 1716454216594793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547571, "dur": 9, "args": { "External id": 6362, "cbid": 211, "correlation": 6362 } }, { "ph": "s", "id": 6362, "pid": 76337, "tid": -914061504, "ts": 1716454216547571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216594808, "dur": 30, "args": { "External id": 6382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6382, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6382, "pid": 5, "tid": 7, "ts": 1716454216594808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547641, "dur": 11, "args": { "External id": 6382, "cbid": 211, "correlation": 6382 } }, { "ph": "s", "id": 6382, "pid": 76337, "tid": -914061504, "ts": 1716454216547641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216594840, "dur": 4, "args": { "External id": 6394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6394, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6394, "pid": 5, "tid": 7, "ts": 1716454216594840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547661, "dur": 6, "args": { "External id": 6394, "cbid": 211, "correlation": 6394 } }, { "ph": "s", "id": 6394, "pid": 76337, "tid": -914061504, "ts": 1716454216547661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216594845, "dur": 14, "args": { "External id": 6397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6397, "pid": 5, "tid": 7, "ts": 1716454216594845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547678, "dur": 6, "args": { "External id": 6397, "cbid": 211, "correlation": 6397 } }, { "ph": "s", "id": 6397, "pid": 76337, "tid": -914061504, "ts": 1716454216547678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216594860, "dur": 9, "args": { "External id": 6406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6406, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6406, "pid": 5, "tid": 7, "ts": 1716454216594860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547717, "dur": 10, "args": { "External id": 6406, "cbid": 211, "correlation": 6406 } }, { "ph": "s", "id": 6406, "pid": 76337, "tid": -914061504, "ts": 1716454216547717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216547778, "dur": 0, "args": { "External id": 6416, "cbid": 317, "correlation": 6416 } }, { "ph": "f", "id": 6416, "pid": 76337, "tid": -914061504, "ts": 1716454216547778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216547778, "dur": 0, "args": { "External id": 6417, "cbid": 203, "correlation": 6417 } }, { "ph": "f", "id": 6417, "pid": 76337, "tid": -914061504, "ts": 1716454216547778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216547779, "dur": 0, "args": { "External id": 6418, "cbid": 205, "correlation": 6418 } }, { "ph": "f", "id": 6418, "pid": 76337, "tid": -914061504, "ts": 1716454216547779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594871, "dur": 9, "args": { "External id": 6422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6422, "pid": 5, "tid": 7, "ts": 1716454216594871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547792, "dur": 12, "args": { "External id": 6422, "cbid": 211, "correlation": 6422 } }, { "ph": "s", "id": 6422, "pid": 76337, "tid": -914061504, "ts": 1716454216547792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216594881, "dur": 32, "args": { "External id": 6424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6424, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6424, "pid": 5, "tid": 7, "ts": 1716454216594881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547806, "dur": 5, "args": { "External id": 6424, "cbid": 211, "correlation": 6424 } }, { "ph": "s", "id": 6424, "pid": 76337, "tid": -914061504, "ts": 1716454216547806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216594916, "dur": 1, "args": { "External id": 6426, "device": 5, "context": 1, "stream": 7, "correlation": 6426, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6426, "pid": 5, "tid": 7, "ts": 1716454216594916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216547817, "dur": 6, "args": { "External id": 6426, "cbid": 51, "correlation": 6426 } }, { "ph": "s", "id": 6426, "pid": 76337, "tid": -914061504, "ts": 1716454216547817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216594920, "dur": 265, "args": { "External id": 6427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6427, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6427, "pid": 5, "tid": 7, "ts": 1716454216594920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547824, "dur": 6, "args": { "External id": 6427, "cbid": 211, "correlation": 6427 } }, { "ph": "s", "id": 6427, "pid": 76337, "tid": -914061504, "ts": 1716454216547824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216595187, "dur": 10, "args": { "External id": 6429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6429, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6429, "pid": 5, "tid": 7, "ts": 1716454216595187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547834, "dur": 5, "args": { "External id": 6429, "cbid": 211, "correlation": 6429 } }, { "ph": "s", "id": 6429, "pid": 76337, "tid": -914061504, "ts": 1716454216547834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595199, "dur": 12, "args": { "External id": 6435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6435, "pid": 5, "tid": 7, "ts": 1716454216595199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547862, "dur": 9, "args": { "External id": 6435, "cbid": 211, "correlation": 6435 } }, { "ph": "s", "id": 6435, "pid": 76337, "tid": -914061504, "ts": 1716454216547862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595212, "dur": 40, "args": { "External id": 6443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6443, "pid": 5, "tid": 7, "ts": 1716454216595212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547896, "dur": 9, "args": { "External id": 6443, "cbid": 211, "correlation": 6443 } }, { "ph": "s", "id": 6443, "pid": 76337, "tid": -914061504, "ts": 1716454216547896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216595254, "dur": 10, "args": { "External id": 6451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6451, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6451, "pid": 5, "tid": 7, "ts": 1716454216595254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216547925, "dur": 8, "args": { "External id": 6451, "cbid": 211, "correlation": 6451 } }, { "ph": "s", "id": 6451, "pid": 76337, "tid": -914061504, "ts": 1716454216547925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595265, "dur": 38, "args": { "External id": 6461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6461, "pid": 5, "tid": 7, "ts": 1716454216595265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548079, "dur": 15, "args": { "External id": 6461, "cbid": 211, "correlation": 6461 } }, { "ph": "s", "id": 6461, "pid": 76337, "tid": -914061504, "ts": 1716454216548079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216595304, "dur": 31, "args": { "External id": 6482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6482, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6482, "pid": 5, "tid": 7, "ts": 1716454216595304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548115, "dur": 7, "args": { "External id": 6482, "cbid": 211, "correlation": 6482 } }, { "ph": "s", "id": 6482, "pid": 76337, "tid": -914061504, "ts": 1716454216548115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216595337, "dur": 4, "args": { "External id": 6494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6494, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6494, "pid": 5, "tid": 7, "ts": 1716454216595337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548133, "dur": 6, "args": { "External id": 6494, "cbid": 211, "correlation": 6494 } }, { "ph": "s", "id": 6494, "pid": 76337, "tid": -914061504, "ts": 1716454216548133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595343, "dur": 14, "args": { "External id": 6497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6497, "pid": 5, "tid": 7, "ts": 1716454216595343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548149, "dur": 6, "args": { "External id": 6497, "cbid": 211, "correlation": 6497 } }, { "ph": "s", "id": 6497, "pid": 76337, "tid": -914061504, "ts": 1716454216548149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216548246, "dur": 1, "args": { "External id": 6509, "cbid": 251, "correlation": 6509 } }, { "ph": "f", "id": 6509, "pid": 76337, "tid": -914061504, "ts": 1716454216548246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216595358, "dur": 38, "args": { "External id": 6510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6510, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 6510, "pid": 5, "tid": 7, "ts": 1716454216595358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548252, "dur": 13, "args": { "External id": 6510, "cbid": 211, "correlation": 6510 } }, { "ph": "s", "id": 6510, "pid": 76337, "tid": -914061504, "ts": 1716454216548252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595398, "dur": 15, "args": { "External id": 6515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6515, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6515, "pid": 5, "tid": 7, "ts": 1716454216595398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548284, "dur": 9, "args": { "External id": 6515, "cbid": 211, "correlation": 6515 } }, { "ph": "s", "id": 6515, "pid": 76337, "tid": -914061504, "ts": 1716454216548284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216548358, "dur": 1, "args": { "External id": 6526, "cbid": 251, "correlation": 6526 } }, { "ph": "f", "id": 6526, "pid": 76337, "tid": -914061504, "ts": 1716454216548358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216595414, "dur": 36, "args": { "External id": 6527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6527, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 6527, "pid": 5, "tid": 7, "ts": 1716454216595414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548363, "dur": 12, "args": { "External id": 6527, "cbid": 211, "correlation": 6527 } }, { "ph": "s", "id": 6527, "pid": 76337, "tid": -914061504, "ts": 1716454216548363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595451, "dur": 14, "args": { "External id": 6532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6532, "pid": 5, "tid": 7, "ts": 1716454216595451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548391, "dur": 8, "args": { "External id": 6532, "cbid": 211, "correlation": 6532 } }, { "ph": "s", "id": 6532, "pid": 76337, "tid": -914061504, "ts": 1716454216548391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216548462, "dur": 1, "args": { "External id": 6543, "cbid": 251, "correlation": 6543 } }, { "ph": "f", "id": 6543, "pid": 76337, "tid": -914061504, "ts": 1716454216548462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454216595467, "dur": 36, "args": { "External id": 6544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6544, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 6544, "pid": 5, "tid": 7, "ts": 1716454216595467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548467, "dur": 12, "args": { "External id": 6544, "cbid": 211, "correlation": 6544 } }, { "ph": "s", "id": 6544, "pid": 76337, "tid": -914061504, "ts": 1716454216548467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216595504, "dur": 14, "args": { "External id": 6549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6549, "pid": 5, "tid": 7, "ts": 1716454216595504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548495, "dur": 9, "args": { "External id": 6549, "cbid": 211, "correlation": 6549 } }, { "ph": "s", "id": 6549, "pid": 76337, "tid": -914061504, "ts": 1716454216548495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216595520, "dur": 1210, "args": { "External id": 6574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6574, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 6574, "pid": 5, "tid": 7, "ts": 1716454216595520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548576, "dur": 12, "args": { "External id": 6574, "cbid": 211, "correlation": 6574 } }, { "ph": "s", "id": 6574, "pid": 76337, "tid": -914061504, "ts": 1716454216548576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216548682, "dur": 1, "args": { "External id": 6592, "cbid": 251, "correlation": 6592 } }, { "ph": "f", "id": 6592, "pid": 76337, "tid": -914061504, "ts": 1716454216548682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216596731, "dur": 42, "args": { "External id": 6594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6594, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [8, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 6594, "pid": 5, "tid": 7, "ts": 1716454216596731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548688, "dur": 14, "args": { "External id": 6594, "cbid": 211, "correlation": 6594 } }, { "ph": "s", "id": 6594, "pid": 76337, "tid": -914061504, "ts": 1716454216548688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216596775, "dur": 12, "args": { "External id": 6602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6602, "registers per thread": 19, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6602, "pid": 5, "tid": 7, "ts": 1716454216596775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548771, "dur": 13, "args": { "External id": 6602, "cbid": 211, "correlation": 6602 } }, { "ph": "s", "id": 6602, "pid": 76337, "tid": -914061504, "ts": 1716454216548771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216596789, "dur": 9, "args": { "External id": 6610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6610, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6610, "pid": 5, "tid": 7, "ts": 1716454216596789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548809, "dur": 9, "args": { "External id": 6610, "cbid": 211, "correlation": 6610 } }, { "ph": "s", "id": 6610, "pid": 76337, "tid": -914061504, "ts": 1716454216548809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216596800, "dur": 38, "args": { "External id": 6620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6620, "pid": 5, "tid": 7, "ts": 1716454216596800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548882, "dur": 12, "args": { "External id": 6620, "cbid": 211, "correlation": 6620 } }, { "ph": "s", "id": 6620, "pid": 76337, "tid": -914061504, "ts": 1716454216548882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216596839, "dur": 33, "args": { "External id": 6641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6641, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6641, "pid": 5, "tid": 7, "ts": 1716454216596839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548916, "dur": 8, "args": { "External id": 6641, "cbid": 211, "correlation": 6641 } }, { "ph": "s", "id": 6641, "pid": 76337, "tid": -914061504, "ts": 1716454216548916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216596874, "dur": 4, "args": { "External id": 6653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6653, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6653, "pid": 5, "tid": 7, "ts": 1716454216596874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548933, "dur": 6, "args": { "External id": 6653, "cbid": 211, "correlation": 6653 } }, { "ph": "s", "id": 6653, "pid": 76337, "tid": -914061504, "ts": 1716454216548933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216596879, "dur": 14, "args": { "External id": 6656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6656, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6656, "pid": 5, "tid": 7, "ts": 1716454216596879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548949, "dur": 6, "args": { "External id": 6656, "cbid": 211, "correlation": 6656 } }, { "ph": "s", "id": 6656, "pid": 76337, "tid": -914061504, "ts": 1716454216548949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216596895, "dur": 10, "args": { "External id": 6665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6665, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6665, "pid": 5, "tid": 7, "ts": 1716454216596895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216548997, "dur": 10, "args": { "External id": 6665, "cbid": 211, "correlation": 6665 } }, { "ph": "s", "id": 6665, "pid": 76337, "tid": -914061504, "ts": 1716454216548997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216549050, "dur": 0, "args": { "External id": 6675, "cbid": 317, "correlation": 6675 } }, { "ph": "f", "id": 6675, "pid": 76337, "tid": -914061504, "ts": 1716454216549050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216549051, "dur": 0, "args": { "External id": 6676, "cbid": 203, "correlation": 6676 } }, { "ph": "f", "id": 6676, "pid": 76337, "tid": -914061504, "ts": 1716454216549051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216549052, "dur": 0, "args": { "External id": 6677, "cbid": 205, "correlation": 6677 } }, { "ph": "f", "id": 6677, "pid": 76337, "tid": -914061504, "ts": 1716454216549052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216596907, "dur": 10, "args": { "External id": 6681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6681, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6681, "pid": 5, "tid": 7, "ts": 1716454216596907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549066, "dur": 12, "args": { "External id": 6681, "cbid": 211, "correlation": 6681 } }, { "ph": "s", "id": 6681, "pid": 76337, "tid": -914061504, "ts": 1716454216549066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216596919, "dur": 33, "args": { "External id": 6683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6683, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6683, "pid": 5, "tid": 7, "ts": 1716454216596919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549080, "dur": 5, "args": { "External id": 6683, "cbid": 211, "correlation": 6683 } }, { "ph": "s", "id": 6683, "pid": 76337, "tid": -914061504, "ts": 1716454216549080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216596954, "dur": 1, "args": { "External id": 6685, "device": 5, "context": 1, "stream": 7, "correlation": 6685, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6685, "pid": 5, "tid": 7, "ts": 1716454216596954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216549091, "dur": 7, "args": { "External id": 6685, "cbid": 51, "correlation": 6685 } }, { "ph": "s", "id": 6685, "pid": 76337, "tid": -914061504, "ts": 1716454216549091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216596957, "dur": 273, "args": { "External id": 6686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6686, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6686, "pid": 5, "tid": 7, "ts": 1716454216596957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549099, "dur": 6, "args": { "External id": 6686, "cbid": 211, "correlation": 6686 } }, { "ph": "s", "id": 6686, "pid": 76337, "tid": -914061504, "ts": 1716454216549099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597232, "dur": 11, "args": { "External id": 6688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6688, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6688, "pid": 5, "tid": 7, "ts": 1716454216597232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549108, "dur": 5, "args": { "External id": 6688, "cbid": 211, "correlation": 6688 } }, { "ph": "s", "id": 6688, "pid": 76337, "tid": -914061504, "ts": 1716454216549108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597244, "dur": 14, "args": { "External id": 6694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6694, "pid": 5, "tid": 7, "ts": 1716454216597244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549137, "dur": 8, "args": { "External id": 6694, "cbid": 211, "correlation": 6694 } }, { "ph": "s", "id": 6694, "pid": 76337, "tid": -914061504, "ts": 1716454216549137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216597259, "dur": 30, "args": { "External id": 6714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6714, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6714, "pid": 5, "tid": 7, "ts": 1716454216597259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549204, "dur": 11, "args": { "External id": 6714, "cbid": 211, "correlation": 6714 } }, { "ph": "s", "id": 6714, "pid": 76337, "tid": -914061504, "ts": 1716454216549204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216597291, "dur": 4, "args": { "External id": 6726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6726, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6726, "pid": 5, "tid": 7, "ts": 1716454216597291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549225, "dur": 7, "args": { "External id": 6726, "cbid": 211, "correlation": 6726 } }, { "ph": "s", "id": 6726, "pid": 76337, "tid": -914061504, "ts": 1716454216549225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597296, "dur": 13, "args": { "External id": 6729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6729, "pid": 5, "tid": 7, "ts": 1716454216597296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549243, "dur": 6, "args": { "External id": 6729, "cbid": 211, "correlation": 6729 } }, { "ph": "s", "id": 6729, "pid": 76337, "tid": -914061504, "ts": 1716454216549243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597311, "dur": 9, "args": { "External id": 6738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6738, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6738, "pid": 5, "tid": 7, "ts": 1716454216597311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549281, "dur": 9, "args": { "External id": 6738, "cbid": 211, "correlation": 6738 } }, { "ph": "s", "id": 6738, "pid": 76337, "tid": -914061504, "ts": 1716454216549281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216549340, "dur": 0, "args": { "External id": 6748, "cbid": 317, "correlation": 6748 } }, { "ph": "f", "id": 6748, "pid": 76337, "tid": -914061504, "ts": 1716454216549340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216549341, "dur": 0, "args": { "External id": 6749, "cbid": 203, "correlation": 6749 } }, { "ph": "f", "id": 6749, "pid": 76337, "tid": -914061504, "ts": 1716454216549341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216549342, "dur": 0, "args": { "External id": 6750, "cbid": 205, "correlation": 6750 } }, { "ph": "f", "id": 6750, "pid": 76337, "tid": -914061504, "ts": 1716454216549342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597322, "dur": 9, "args": { "External id": 6754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6754, "pid": 5, "tid": 7, "ts": 1716454216597322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549354, "dur": 12, "args": { "External id": 6754, "cbid": 211, "correlation": 6754 } }, { "ph": "s", "id": 6754, "pid": 76337, "tid": -914061504, "ts": 1716454216549354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597332, "dur": 32, "args": { "External id": 6756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6756, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6756, "pid": 5, "tid": 7, "ts": 1716454216597332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549368, "dur": 5, "args": { "External id": 6756, "cbid": 211, "correlation": 6756 } }, { "ph": "s", "id": 6756, "pid": 76337, "tid": -914061504, "ts": 1716454216549368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216597366, "dur": 1, "args": { "External id": 6758, "device": 5, "context": 1, "stream": 7, "correlation": 6758, "bytes": 768, "memory bandwidth (GB/s)": 0.46153846153846156 } }, { "ph": "f", "id": 6758, "pid": 5, "tid": 7, "ts": 1716454216597366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216549379, "dur": 6, "args": { "External id": 6758, "cbid": 51, "correlation": 6758 } }, { "ph": "s", "id": 6758, "pid": 76337, "tid": -914061504, "ts": 1716454216549379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216597370, "dur": 266, "args": { "External id": 6759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6759, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [4, 24, 2], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 6759, "pid": 5, "tid": 7, "ts": 1716454216597370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549386, "dur": 6, "args": { "External id": 6759, "cbid": 211, "correlation": 6759 } }, { "ph": "s", "id": 6759, "pid": 76337, "tid": -914061504, "ts": 1716454216549386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597637, "dur": 11, "args": { "External id": 6761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6761, "pid": 5, "tid": 7, "ts": 1716454216597637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549396, "dur": 5, "args": { "External id": 6761, "cbid": 211, "correlation": 6761 } }, { "ph": "s", "id": 6761, "pid": 76337, "tid": -914061504, "ts": 1716454216549396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597649, "dur": 12, "args": { "External id": 6767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6767, "pid": 5, "tid": 7, "ts": 1716454216597649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549423, "dur": 8, "args": { "External id": 6767, "cbid": 211, "correlation": 6767 } }, { "ph": "s", "id": 6767, "pid": 76337, "tid": -914061504, "ts": 1716454216549423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597663, "dur": 40, "args": { "External id": 6775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6775, "pid": 5, "tid": 7, "ts": 1716454216597663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549457, "dur": 9, "args": { "External id": 6775, "cbid": 211, "correlation": 6775 } }, { "ph": "s", "id": 6775, "pid": 76337, "tid": -914061504, "ts": 1716454216549457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597704, "dur": 10, "args": { "External id": 6783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6783, "registers per thread": 17, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6783, "pid": 5, "tid": 7, "ts": 1716454216597704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549487, "dur": 9, "args": { "External id": 6783, "cbid": 211, "correlation": 6783 } }, { "ph": "s", "id": 6783, "pid": 76337, "tid": -914061504, "ts": 1716454216549487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597716, "dur": 38, "args": { "External id": 6793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6793, "pid": 5, "tid": 7, "ts": 1716454216597716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549557, "dur": 12, "args": { "External id": 6793, "cbid": 211, "correlation": 6793 } }, { "ph": "s", "id": 6793, "pid": 76337, "tid": -914061504, "ts": 1716454216549557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216597755, "dur": 47, "args": { "External id": 6814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6814, "registers per thread": 25, "shared memory": 768, "blocks per SM": 0.4, "warps per SM": 6.4, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 6814, "pid": 5, "tid": 7, "ts": 1716454216597755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549589, "dur": 7, "args": { "External id": 6814, "cbid": 211, "correlation": 6814 } }, { "ph": "s", "id": 6814, "pid": 76337, "tid": -914061504, "ts": 1716454216549589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216597804, "dur": 5, "args": { "External id": 6826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6826, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 6826, "pid": 5, "tid": 7, "ts": 1716454216597804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549605, "dur": 6, "args": { "External id": 6826, "cbid": 211, "correlation": 6826 } }, { "ph": "s", "id": 6826, "pid": 76337, "tid": -914061504, "ts": 1716454216549605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597809, "dur": 14, "args": { "External id": 6829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6829, "pid": 5, "tid": 7, "ts": 1716454216597809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549622, "dur": 6, "args": { "External id": 6829, "cbid": 211, "correlation": 6829 } }, { "ph": "s", "id": 6829, "pid": 76337, "tid": -914061504, "ts": 1716454216549622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597825, "dur": 9, "args": { "External id": 6838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6838, "registers per thread": 24, "shared memory": 0, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6838, "pid": 5, "tid": 7, "ts": 1716454216597825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549664, "dur": 10, "args": { "External id": 6838, "cbid": 211, "correlation": 6838 } }, { "ph": "s", "id": 6838, "pid": 76337, "tid": -914061504, "ts": 1716454216549664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216549714, "dur": 0, "args": { "External id": 6848, "cbid": 317, "correlation": 6848 } }, { "ph": "f", "id": 6848, "pid": 76337, "tid": -914061504, "ts": 1716454216549714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216549715, "dur": 0, "args": { "External id": 6849, "cbid": 203, "correlation": 6849 } }, { "ph": "f", "id": 6849, "pid": 76337, "tid": -914061504, "ts": 1716454216549715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216549715, "dur": 0, "args": { "External id": 6850, "cbid": 205, "correlation": 6850 } }, { "ph": "f", "id": 6850, "pid": 76337, "tid": -914061504, "ts": 1716454216549715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597835, "dur": 8, "args": { "External id": 6854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6854, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [96, 16, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6854, "pid": 5, "tid": 7, "ts": 1716454216597835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549730, "dur": 11, "args": { "External id": 6854, "cbid": 211, "correlation": 6854 } }, { "ph": "s", "id": 6854, "pid": 76337, "tid": -914061504, "ts": 1716454216549730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597844, "dur": 5, "args": { "External id": 6856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.6, "warps per SM": 12.8, "grid": [1, 16, 8], "block": [256, 1, 1], "est. achieved occupancy %": 20 } }, { "ph": "f", "id": 6856, "pid": 5, "tid": 7, "ts": 1716454216597844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549744, "dur": 5, "args": { "External id": 6856, "cbid": 211, "correlation": 6856 } }, { "ph": "s", "id": 6856, "pid": 76337, "tid": -914061504, "ts": 1716454216549744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216597851, "dur": 1, "args": { "External id": 6858, "device": 5, "context": 1, "stream": 7, "correlation": 6858, "bytes": 2688, "memory bandwidth (GB/s)": 1.4 } }, { "ph": "f", "id": 6858, "pid": 5, "tid": 7, "ts": 1716454216597851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216549755, "dur": 7, "args": { "External id": 6858, "cbid": 51, "correlation": 6858 } }, { "ph": "s", "id": 6858, "pid": 76337, "tid": -914061504, "ts": 1716454216549755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage1_warpsize2x1x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216597856, "dur": 61, "args": { "External id": 6859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6859, "registers per thread": 128, "shared memory": 12288, "blocks per SM": 8.4, "warps per SM": 33.6, "grid": [1, 48, 14], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 6859, "pid": 5, "tid": 7, "ts": 1716454216597856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549762, "dur": 7, "args": { "External id": 6859, "cbid": 211, "correlation": 6859 } }, { "ph": "s", "id": 6859, "pid": 76337, "tid": -914061504, "ts": 1716454216549762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216597918, "dur": 4, "args": { "External id": 6861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6861, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1.2, "warps per SM": 9.6, "grid": [96, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 6861, "pid": 5, "tid": 7, "ts": 1716454216597918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549774, "dur": 5, "args": { "External id": 6861, "cbid": 211, "correlation": 6861 } }, { "ph": "s", "id": 6861, "pid": 76337, "tid": -914061504, "ts": 1716454216549774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597923, "dur": 5, "args": { "External id": 6867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 6867, "pid": 5, "tid": 7, "ts": 1716454216597923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549801, "dur": 8, "args": { "External id": 6867, "cbid": 211, "correlation": 6867 } }, { "ph": "s", "id": 6867, "pid": 76337, "tid": -914061504, "ts": 1716454216549801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216549861, "dur": 0, "args": { "External id": 6877, "cbid": 317, "correlation": 6877 } }, { "ph": "f", "id": 6877, "pid": 76337, "tid": -914061504, "ts": 1716454216549861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216549862, "dur": 0, "args": { "External id": 6878, "cbid": 203, "correlation": 6878 } }, { "ph": "f", "id": 6878, "pid": 76337, "tid": -914061504, "ts": 1716454216549862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216549863, "dur": 0, "args": { "External id": 6879, "cbid": 205, "correlation": 6879 } }, { "ph": "f", "id": 6879, "pid": 76337, "tid": -914061504, "ts": 1716454216549863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216549882, "dur": 1, "args": { "External id": 6883, "cbid": 251, "correlation": 6883 } }, { "ph": "f", "id": 6883, "pid": 76337, "tid": -914061504, "ts": 1716454216549882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216597930, "dur": 11, "args": { "External id": 6884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6884, "registers per thread": 106, "shared memory": 16640, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 6884, "pid": 5, "tid": 7, "ts": 1716454216597930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549886, "dur": 12, "args": { "External id": 6884, "cbid": 211, "correlation": 6884 } }, { "ph": "s", "id": 6884, "pid": 76337, "tid": -914061504, "ts": 1716454216549886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216597942, "dur": 4, "args": { "External id": 6890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.6, "warps per SM": 2.4, "grid": [48, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 4 } }, { "ph": "f", "id": 6890, "pid": 5, "tid": 7, "ts": 1716454216597942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549919, "dur": 9, "args": { "External id": 6890, "cbid": 211, "correlation": 6890 } }, { "ph": "s", "id": 6890, "pid": 76337, "tid": -914061504, "ts": 1716454216549919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597948, "dur": 4, "args": { "External id": 6898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6898, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 6898, "pid": 5, "tid": 7, "ts": 1716454216597948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216549985, "dur": 12, "args": { "External id": 6898, "cbid": 211, "correlation": 6898 } }, { "ph": "s", "id": 6898, "pid": 76337, "tid": -914061504, "ts": 1716454216549985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597953, "dur": 3, "args": { "External id": 6906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6906, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 6906, "pid": 5, "tid": 7, "ts": 1716454216597953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216550025, "dur": 11, "args": { "External id": 6906, "cbid": 211, "correlation": 6906 } }, { "ph": "s", "id": 6906, "pid": 76337, "tid": -914061504, "ts": 1716454216550025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597957, "dur": 3, "args": { "External id": 6914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6914, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 6914, "pid": 5, "tid": 7, "ts": 1716454216597957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216550051, "dur": 8, "args": { "External id": 6914, "cbid": 211, "correlation": 6914 } }, { "ph": "s", "id": 6914, "pid": 76337, "tid": -914061504, "ts": 1716454216550051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216597962, "dur": 3, "args": { "External id": 6922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6922, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 6922, "pid": 5, "tid": 7, "ts": 1716454216597962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216550073, "dur": 7, "args": { "External id": 6922, "cbid": 211, "correlation": 6922 } }, { "ph": "s", "id": 6922, "pid": 76337, "tid": -914061504, "ts": 1716454216550073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454216597967, "dur": 2, "args": { "External id": 6932, "device": 5, "context": 1, "stream": 7, "correlation": 6932, "bytes": 6144, "memory bandwidth (GB/s)": 2.6666666666666665 } }, { "ph": "f", "id": 6932, "pid": 5, "tid": 7, "ts": 1716454216597967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216550367, "dur": 45, "args": { "External id": 6932, "cbid": 41, "correlation": 6932 } }, { "ph": "s", "id": 6932, "pid": 76337, "tid": -914061504, "ts": 1716454216550367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454216550413, "dur": 47573, "args": { "External id": 6933, "cbid": 131, "correlation": 6933 } }, { "ph": "f", "id": 6933, "pid": 76337, "tid": -914061504, "ts": 1716454216550413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216598064, "dur": 5, "args": { "External id": 6941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6941, "registers per thread": 32, "shared memory": 0, "blocks per SM": 8, "warps per SM": 128, "grid": [160, 4, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6941, "pid": 5, "tid": 7, "ts": 1716454216598064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216598044, "dur": 21, "args": { "External id": 6941, "cbid": 211, "correlation": 6941 } }, { "ph": "s", "id": 6941, "pid": 76337, "tid": -914061504, "ts": 1716454216598044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216598102, "dur": 4, "args": { "External id": 6950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6950, "registers per thread": 32, "shared memory": 0, "blocks per SM": 8, "warps per SM": 128, "grid": [160, 4, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6950, "pid": 5, "tid": 7, "ts": 1716454216598102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216598092, "dur": 9, "args": { "External id": 6950, "cbid": 211, "correlation": 6950 } }, { "ph": "s", "id": 6950, "pid": 76337, "tid": -914061504, "ts": 1716454216598092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216598130, "dur": 5, "args": { "External id": 6959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6959, "registers per thread": 32, "shared memory": 0, "blocks per SM": 8, "warps per SM": 128, "grid": [160, 4, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6959, "pid": 5, "tid": 7, "ts": 1716454216598130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216598119, "dur": 10, "args": { "External id": 6959, "cbid": 211, "correlation": 6959 } }, { "ph": "s", "id": 6959, "pid": 76337, "tid": -914061504, "ts": 1716454216598119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216598158, "dur": 5, "args": { "External id": 6968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6968, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 6968, "pid": 5, "tid": 7, "ts": 1716454216598158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216598148, "dur": 8, "args": { "External id": 6968, "cbid": 211, "correlation": 6968 } }, { "ph": "s", "id": 6968, "pid": 76337, "tid": -914061504, "ts": 1716454216598148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454216602686, "dur": 10, "args": { "External id": 6978, "device": 5, "context": 1, "stream": 7, "correlation": 6978, "bytes": 98304, "memory bandwidth (GB/s)": 9.540372670807454 } }, { "ph": "f", "id": 6978, "pid": 5, "tid": 7, "ts": 1716454216602686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216602639, "dur": 47, "args": { "External id": 6978, "cbid": 41, "correlation": 6978 } }, { "ph": "s", "id": 6978, "pid": 76337, "tid": -914061504, "ts": 1716454216602639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454216602687, "dur": 17, "args": { "External id": 6979, "cbid": 131, "correlation": 6979 } }, { "ph": "f", "id": 6979, "pid": 76337, "tid": -914061504, "ts": 1716454216602687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216602756, "dur": 3, "args": { "External id": 6986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 6986, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 6986, "pid": 5, "tid": 7, "ts": 1716454216602756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216602739, "dur": 17, "args": { "External id": 6986, "cbid": 211, "correlation": 6986 } }, { "ph": "s", "id": 6986, "pid": 76337, "tid": -914061504, "ts": 1716454216602739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454216602827, "dur": 3, "args": { "External id": 6995, "device": 5, "context": 1, "stream": 7, "correlation": 6995, "bytes": 98304, "memory bandwidth (GB/s)": 26.94736842105263 } }, { "ph": "f", "id": 6995, "pid": 5, "tid": 7, "ts": 1716454216602827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216602780, "dur": 45, "args": { "External id": 6995, "cbid": 41, "correlation": 6995 } }, { "ph": "s", "id": 6995, "pid": 76337, "tid": -914061504, "ts": 1716454216602780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454216603206, "dur": 1, "args": { "External id": 7005, "device": 5, "context": 1, "stream": 7, "correlation": 7005, "bytes": 8, "memory bandwidth (GB/s)": 0.004032258064516129 } }, { "ph": "f", "id": 7005, "pid": 5, "tid": 7, "ts": 1716454216603206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216603177, "dur": 27, "args": { "External id": 7005, "cbid": 41, "correlation": 7005 } }, { "ph": "s", "id": 7005, "pid": 76337, "tid": -914061504, "ts": 1716454216603177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454216603206, "dur": 9, "args": { "External id": 7006, "cbid": 131, "correlation": 7006 } }, { "ph": "f", "id": 7006, "pid": 76337, "tid": -914061504, "ts": 1716454216603206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454216603476, "dur": 3, "args": { "External id": 7019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7019, "pid": 5, "tid": 7, "ts": 1716454216603476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603394, "dur": 88, "args": { "External id": 7019, "cbid": 211, "correlation": 7019 } }, { "ph": "s", "id": 7019, "pid": 76337, "tid": -914061504, "ts": 1716454216603394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216603524, "dur": 3, "args": { "External id": 7027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7027, "pid": 5, "tid": 7, "ts": 1716454216603524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603511, "dur": 13, "args": { "External id": 7027, "cbid": 211, "correlation": 7027 } }, { "ph": "s", "id": 7027, "pid": 76337, "tid": -914061504, "ts": 1716454216603511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216603643, "dur": 3, "args": { "External id": 7035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7035, "pid": 5, "tid": 7, "ts": 1716454216603643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603581, "dur": 63, "args": { "External id": 7035, "cbid": 211, "correlation": 7035 } }, { "ph": "s", "id": 7035, "pid": 76337, "tid": -914061504, "ts": 1716454216603581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216603727, "dur": 3, "args": { "External id": 7043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7043, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7043, "pid": 5, "tid": 7, "ts": 1716454216603727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603676, "dur": 51, "args": { "External id": 7043, "cbid": 211, "correlation": 7043 } }, { "ph": "s", "id": 7043, "pid": 76337, "tid": -914061504, "ts": 1716454216603676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454216603903, "dur": 4, "args": { "External id": 7055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7055, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7055, "pid": 5, "tid": 7, "ts": 1716454216603903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603802, "dur": 100, "args": { "External id": 7055, "cbid": 211, "correlation": 7055 } }, { "ph": "s", "id": 7055, "pid": 76337, "tid": -914061504, "ts": 1716454216603802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216604028, "dur": 4, "args": { "External id": 7066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7066, "pid": 5, "tid": 7, "ts": 1716454216604028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216603963, "dur": 66, "args": { "External id": 7066, "cbid": 211, "correlation": 7066 } }, { "ph": "s", "id": 7066, "pid": 76337, "tid": -914061504, "ts": 1716454216603963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216604069, "dur": 3, "args": { "External id": 7074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7074, "pid": 5, "tid": 7, "ts": 1716454216604069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604059, "dur": 9, "args": { "External id": 7074, "cbid": 211, "correlation": 7074 } }, { "ph": "s", "id": 7074, "pid": 76337, "tid": -914061504, "ts": 1716454216604059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216604397, "dur": 4, "args": { "External id": 7082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7082, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7082, "pid": 5, "tid": 7, "ts": 1716454216604397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604147, "dur": 251, "args": { "External id": 7082, "cbid": 211, "correlation": 7082 } }, { "ph": "s", "id": 7082, "pid": 76337, "tid": -914061504, "ts": 1716454216604147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216604559, "dur": 4, "args": { "External id": 7090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7090, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7090, "pid": 5, "tid": 7, "ts": 1716454216604559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604481, "dur": 79, "args": { "External id": 7090, "cbid": 211, "correlation": 7090 } }, { "ph": "s", "id": 7090, "pid": 76337, "tid": -914061504, "ts": 1716454216604481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216604663, "dur": 4, "args": { "External id": 7099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7099, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7099, "pid": 5, "tid": 7, "ts": 1716454216604663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604599, "dur": 65, "args": { "External id": 7099, "cbid": 211, "correlation": 7099 } }, { "ph": "s", "id": 7099, "pid": 76337, "tid": -914061504, "ts": 1716454216604599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216604785, "dur": 5, "args": { "External id": 7112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7112, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7112, "pid": 5, "tid": 7, "ts": 1716454216604785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604729, "dur": 62, "args": { "External id": 7112, "cbid": 211, "correlation": 7112 } }, { "ph": "s", "id": 7112, "pid": 76337, "tid": -914061504, "ts": 1716454216604729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454216604847, "dur": 8, "args": { "External id": 7122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7122, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7122, "pid": 5, "tid": 7, "ts": 1716454216604847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216604833, "dur": 14, "args": { "External id": 7122, "cbid": 211, "correlation": 7122 } }, { "ph": "s", "id": 7122, "pid": 76337, "tid": -914061504, "ts": 1716454216604833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216605049, "dur": 98, "args": { "External id": 7139, "cbid": 251, "correlation": 7139 } }, { "ph": "f", "id": 7139, "pid": 76337, "tid": -914061504, "ts": 1716454216605049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454216605174, "dur": 11, "args": { "External id": 7141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7141, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7141, "pid": 5, "tid": 7, "ts": 1716454216605174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216605157, "dur": 24, "args": { "External id": 7141, "cbid": 211, "correlation": 7141 } }, { "ph": "s", "id": 7141, "pid": 76337, "tid": -914061504, "ts": 1716454216605157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216605255, "dur": 4, "args": { "External id": 7149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7149, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7149, "pid": 5, "tid": 7, "ts": 1716454216605255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216605242, "dur": 12, "args": { "External id": 7149, "cbid": 211, "correlation": 7149 } }, { "ph": "s", "id": 7149, "pid": 76337, "tid": -914061504, "ts": 1716454216605242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216647829, "dur": 106, "args": { "External id": 7165, "cbid": 251, "correlation": 7165 } }, { "ph": "f", "id": 7165, "pid": 76337, "tid": -914061504, "ts": 1716454216647829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216647944, "dur": 0, "args": { "External id": 7167, "cbid": 251, "correlation": 7167 } }, { "ph": "f", "id": 7167, "pid": 76337, "tid": -914061504, "ts": 1716454216647944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216647964, "dur": 12, "args": { "External id": 7168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7168, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7168, "pid": 5, "tid": 7, "ts": 1716454216647964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216647947, "dur": 18, "args": { "External id": 7168, "cbid": 211, "correlation": 7168 } }, { "ph": "s", "id": 7168, "pid": 76337, "tid": -914061504, "ts": 1716454216647947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216648121, "dur": 4, "args": { "External id": 7170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7170, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7170, "pid": 5, "tid": 7, "ts": 1716454216648121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216647992, "dur": 129, "args": { "External id": 7170, "cbid": 211, "correlation": 7170 } }, { "ph": "s", "id": 7170, "pid": 76337, "tid": -914061504, "ts": 1716454216647992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216648233, "dur": 1, "args": { "External id": 7180, "cbid": 317, "correlation": 7180 } }, { "ph": "f", "id": 7180, "pid": 76337, "tid": -914061504, "ts": 1716454216648233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216648235, "dur": 0, "args": { "External id": 7181, "cbid": 203, "correlation": 7181 } }, { "ph": "f", "id": 7181, "pid": 76337, "tid": -914061504, "ts": 1716454216648235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216648236, "dur": 1, "args": { "External id": 7182, "cbid": 205, "correlation": 7182 } }, { "ph": "f", "id": 7182, "pid": 76337, "tid": -914061504, "ts": 1716454216648236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216652109, "dur": 3, "args": { "External id": 7186, "device": 5, "context": 1, "stream": 7, "correlation": 7186, "bytes": 393216, "memory bandwidth (GB/s)": 107.78947368421052 } }, { "ph": "f", "id": 7186, "pid": 5, "tid": 7, "ts": 1716454216652109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216652049, "dur": 60, "args": { "External id": 7186, "cbid": 51, "correlation": 7186 } }, { "ph": "s", "id": 7186, "pid": 76337, "tid": -914061504, "ts": 1716454216652049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216652131, "dur": 8, "args": { "External id": 7187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7187, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7187, "pid": 5, "tid": 7, "ts": 1716454216652131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216652116, "dur": 15, "args": { "External id": 7187, "cbid": 211, "correlation": 7187 } }, { "ph": "s", "id": 7187, "pid": 76337, "tid": -914061504, "ts": 1716454216652116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216652145, "dur": 3, "args": { "External id": 7189, "device": 5, "context": 1, "stream": 7, "correlation": 7189, "bytes": 46080, "memory bandwidth (GB/s)": 14.117647058823529 } }, { "ph": "f", "id": 7189, "pid": 5, "tid": 7, "ts": 1716454216652145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216652135, "dur": 9, "args": { "External id": 7189, "cbid": 51, "correlation": 7189 } }, { "ph": "s", "id": 7189, "pid": 76337, "tid": -914061504, "ts": 1716454216652135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216652152, "dur": 4, "args": { "External id": 7190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 7190, "pid": 5, "tid": 7, "ts": 1716454216652152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216652144, "dur": 6, "args": { "External id": 7190, "cbid": 211, "correlation": 7190 } }, { "ph": "s", "id": 7190, "pid": 76337, "tid": -914061504, "ts": 1716454216652144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216652171, "dur": 4, "args": { "External id": 7192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7192, "pid": 5, "tid": 7, "ts": 1716454216652171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216652159, "dur": 10, "args": { "External id": 7192, "cbid": 211, "correlation": 7192 } }, { "ph": "s", "id": 7192, "pid": 76337, "tid": -914061504, "ts": 1716454216652159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216652174, "dur": 0, "args": { "External id": 7193, "cbid": 51, "correlation": 7193 } }, { "ph": "s", "id": 7193, "pid": 76337, "tid": -914061504, "ts": 1716454216652174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216652185, "dur": 95, "args": { "External id": 7194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7194, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7194, "pid": 5, "tid": 7, "ts": 1716454216652185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216652176, "dur": 7, "args": { "External id": 7194, "cbid": 211, "correlation": 7194 } }, { "ph": "s", "id": 7194, "pid": 76337, "tid": -914061504, "ts": 1716454216652176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216652973, "dur": 64, "args": { "External id": 7199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7199, "pid": 5, "tid": 7, "ts": 1716454216652973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216652956, "dur": 25, "args": { "External id": 7199, "cbid": 211, "correlation": 7199 } }, { "ph": "s", "id": 7199, "pid": 76337, "tid": -914061504, "ts": 1716454216652956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216671816, "dur": 56, "args": { "External id": 7219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7219, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 7219, "pid": 5, "tid": 7, "ts": 1716454216671816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216671746, "dur": 75, "args": { "External id": 7219, "cbid": 211, "correlation": 7219 } }, { "ph": "s", "id": 7219, "pid": 76337, "tid": -914061504, "ts": 1716454216671746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216671874, "dur": 5, "args": { "External id": 7231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7231, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7231, "pid": 5, "tid": 7, "ts": 1716454216671874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216671837, "dur": 10, "args": { "External id": 7231, "cbid": 211, "correlation": 7231 } }, { "ph": "s", "id": 7231, "pid": 76337, "tid": -914061504, "ts": 1716454216671837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216671901, "dur": 60, "args": { "External id": 7234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7234, "pid": 5, "tid": 7, "ts": 1716454216671901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216671890, "dur": 10, "args": { "External id": 7234, "cbid": 211, "correlation": 7234 } }, { "ph": "s", "id": 7234, "pid": 76337, "tid": -914061504, "ts": 1716454216671890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216671986, "dur": 38, "args": { "External id": 7243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7243, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7243, "pid": 5, "tid": 7, "ts": 1716454216671986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216671964, "dur": 22, "args": { "External id": 7243, "cbid": 211, "correlation": 7243 } }, { "ph": "s", "id": 7243, "pid": 76337, "tid": -914061504, "ts": 1716454216671964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216672111, "dur": 1, "args": { "External id": 7253, "cbid": 317, "correlation": 7253 } }, { "ph": "f", "id": 7253, "pid": 76337, "tid": -914061504, "ts": 1716454216672111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216672113, "dur": 1, "args": { "External id": 7254, "cbid": 203, "correlation": 7254 } }, { "ph": "f", "id": 7254, "pid": 76337, "tid": -914061504, "ts": 1716454216672113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216672116, "dur": 1, "args": { "External id": 7255, "cbid": 205, "correlation": 7255 } }, { "ph": "f", "id": 7255, "pid": 76337, "tid": -914061504, "ts": 1716454216672116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216675584, "dur": 41, "args": { "External id": 7259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7259, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7259, "pid": 5, "tid": 7, "ts": 1716454216675584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216675559, "dur": 25, "args": { "External id": 7259, "cbid": 211, "correlation": 7259 } }, { "ph": "s", "id": 7259, "pid": 76337, "tid": -914061504, "ts": 1716454216675559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216675626, "dur": 15, "args": { "External id": 7261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7261, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7261, "pid": 5, "tid": 7, "ts": 1716454216675626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216675588, "dur": 7, "args": { "External id": 7261, "cbid": 211, "correlation": 7261 } }, { "ph": "s", "id": 7261, "pid": 76337, "tid": -914061504, "ts": 1716454216675588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216675643, "dur": 3, "args": { "External id": 7263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7263, "pid": 5, "tid": 7, "ts": 1716454216675643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216675604, "dur": 11, "args": { "External id": 7263, "cbid": 211, "correlation": 7263 } }, { "ph": "s", "id": 7263, "pid": 76337, "tid": -914061504, "ts": 1716454216675604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216675619, "dur": 0, "args": { "External id": 7264, "cbid": 51, "correlation": 7264 } }, { "ph": "s", "id": 7264, "pid": 76337, "tid": -914061504, "ts": 1716454216675619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216675647, "dur": 772, "args": { "External id": 7265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7265, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7265, "pid": 5, "tid": 7, "ts": 1716454216675647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216675621, "dur": 8, "args": { "External id": 7265, "cbid": 211, "correlation": 7265 } }, { "ph": "s", "id": 7265, "pid": 76337, "tid": -914061504, "ts": 1716454216675621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216676442, "dur": 63, "args": { "External id": 7270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7270, "pid": 5, "tid": 7, "ts": 1716454216676442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676423, "dur": 18, "args": { "External id": 7270, "cbid": 211, "correlation": 7270 } }, { "ph": "s", "id": 7270, "pid": 76337, "tid": -914061504, "ts": 1716454216676423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216676524, "dur": 4, "args": { "External id": 7278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7278, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7278, "pid": 5, "tid": 7, "ts": 1716454216676524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676512, "dur": 11, "args": { "External id": 7278, "cbid": 211, "correlation": 7278 } }, { "ph": "s", "id": 7278, "pid": 76337, "tid": -914061504, "ts": 1716454216676512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216676699, "dur": 6, "args": { "External id": 7294, "cbid": 251, "correlation": 7294 } }, { "ph": "f", "id": 7294, "pid": 76337, "tid": -914061504, "ts": 1716454216676699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216676711, "dur": 0, "args": { "External id": 7296, "cbid": 251, "correlation": 7296 } }, { "ph": "f", "id": 7296, "pid": 76337, "tid": -914061504, "ts": 1716454216676711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216676732, "dur": 12, "args": { "External id": 7297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7297, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 7297, "pid": 5, "tid": 7, "ts": 1716454216676732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676715, "dur": 17, "args": { "External id": 7297, "cbid": 211, "correlation": 7297 } }, { "ph": "s", "id": 7297, "pid": 76337, "tid": -914061504, "ts": 1716454216676715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216676750, "dur": 4, "args": { "External id": 7299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7299, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 7299, "pid": 5, "tid": 7, "ts": 1716454216676750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676738, "dur": 10, "args": { "External id": 7299, "cbid": 211, "correlation": 7299 } }, { "ph": "s", "id": 7299, "pid": 76337, "tid": -914061504, "ts": 1716454216676738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216676853, "dur": 57, "args": { "External id": 7309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7309, "pid": 5, "tid": 7, "ts": 1716454216676853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676839, "dur": 13, "args": { "External id": 7309, "cbid": 211, "correlation": 7309 } }, { "ph": "s", "id": 7309, "pid": 76337, "tid": -914061504, "ts": 1716454216676839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216676952, "dur": 56, "args": { "External id": 7329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7329, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 7329, "pid": 5, "tid": 7, "ts": 1716454216676952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676938, "dur": 13, "args": { "External id": 7329, "cbid": 211, "correlation": 7329 } }, { "ph": "s", "id": 7329, "pid": 76337, "tid": -914061504, "ts": 1716454216676938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216677009, "dur": 4, "args": { "External id": 7341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7341, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7341, "pid": 5, "tid": 7, "ts": 1716454216677009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676962, "dur": 8, "args": { "External id": 7341, "cbid": 211, "correlation": 7341 } }, { "ph": "s", "id": 7341, "pid": 76337, "tid": -914061504, "ts": 1716454216676962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216677015, "dur": 57, "args": { "External id": 7344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7344, "pid": 5, "tid": 7, "ts": 1716454216677015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216676998, "dur": 8, "args": { "External id": 7344, "cbid": 211, "correlation": 7344 } }, { "ph": "s", "id": 7344, "pid": 76337, "tid": -914061504, "ts": 1716454216676998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216677074, "dur": 37, "args": { "External id": 7353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7353, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7353, "pid": 5, "tid": 7, "ts": 1716454216677074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677045, "dur": 12, "args": { "External id": 7353, "cbid": 211, "correlation": 7353 } }, { "ph": "s", "id": 7353, "pid": 76337, "tid": -914061504, "ts": 1716454216677045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216677158, "dur": 0, "args": { "External id": 7363, "cbid": 317, "correlation": 7363 } }, { "ph": "f", "id": 7363, "pid": 76337, "tid": -914061504, "ts": 1716454216677158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216677159, "dur": 1, "args": { "External id": 7364, "cbid": 203, "correlation": 7364 } }, { "ph": "f", "id": 7364, "pid": 76337, "tid": -914061504, "ts": 1716454216677159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216677161, "dur": 0, "args": { "External id": 7365, "cbid": 205, "correlation": 7365 } }, { "ph": "f", "id": 7365, "pid": 76337, "tid": -914061504, "ts": 1716454216677161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216677190, "dur": 40, "args": { "External id": 7369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7369, "pid": 5, "tid": 7, "ts": 1716454216677190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677176, "dur": 13, "args": { "External id": 7369, "cbid": 211, "correlation": 7369 } }, { "ph": "s", "id": 7369, "pid": 76337, "tid": -914061504, "ts": 1716454216677176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216677232, "dur": 16, "args": { "External id": 7371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7371, "pid": 5, "tid": 7, "ts": 1716454216677232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677192, "dur": 6, "args": { "External id": 7371, "cbid": 211, "correlation": 7371 } }, { "ph": "s", "id": 7371, "pid": 76337, "tid": -914061504, "ts": 1716454216677192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216677249, "dur": 3, "args": { "External id": 7373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7373, "pid": 5, "tid": 7, "ts": 1716454216677249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677203, "dur": 6, "args": { "External id": 7373, "cbid": 211, "correlation": 7373 } }, { "ph": "s", "id": 7373, "pid": 76337, "tid": -914061504, "ts": 1716454216677203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216677212, "dur": 0, "args": { "External id": 7374, "cbid": 51, "correlation": 7374 } }, { "ph": "s", "id": 7374, "pid": 76337, "tid": -914061504, "ts": 1716454216677212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216677254, "dur": 767, "args": { "External id": 7375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7375, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7375, "pid": 5, "tid": 7, "ts": 1716454216677254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677213, "dur": 5, "args": { "External id": 7375, "cbid": 211, "correlation": 7375 } }, { "ph": "s", "id": 7375, "pid": 76337, "tid": -914061504, "ts": 1716454216677213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216678022, "dur": 63, "args": { "External id": 7380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7380, "pid": 5, "tid": 7, "ts": 1716454216678022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677242, "dur": 10, "args": { "External id": 7380, "cbid": 211, "correlation": 7380 } }, { "ph": "s", "id": 7380, "pid": 76337, "tid": -914061504, "ts": 1716454216677242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216678086, "dur": 51, "args": { "External id": 7388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7388, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7388, "pid": 5, "tid": 7, "ts": 1716454216678086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677278, "dur": 9, "args": { "External id": 7388, "cbid": 211, "correlation": 7388 } }, { "ph": "s", "id": 7388, "pid": 76337, "tid": -914061504, "ts": 1716454216677278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216678138, "dur": 36, "args": { "External id": 7396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7396, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7396, "pid": 5, "tid": 7, "ts": 1716454216678138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677328, "dur": 10, "args": { "External id": 7396, "cbid": 211, "correlation": 7396 } }, { "ph": "s", "id": 7396, "pid": 76337, "tid": -914061504, "ts": 1716454216677328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216678176, "dur": 56, "args": { "External id": 7416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7416, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 7416, "pid": 5, "tid": 7, "ts": 1716454216678176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677494, "dur": 15, "args": { "External id": 7416, "cbid": 211, "correlation": 7416 } }, { "ph": "s", "id": 7416, "pid": 76337, "tid": -914061504, "ts": 1716454216677494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216678233, "dur": 4, "args": { "External id": 7428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7428, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7428, "pid": 5, "tid": 7, "ts": 1716454216678233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677519, "dur": 7, "args": { "External id": 7428, "cbid": 211, "correlation": 7428 } }, { "ph": "s", "id": 7428, "pid": 76337, "tid": -914061504, "ts": 1716454216677519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216678239, "dur": 59, "args": { "External id": 7431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7431, "pid": 5, "tid": 7, "ts": 1716454216678239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216677538, "dur": 6, "args": { "External id": 7431, "cbid": 211, "correlation": 7431 } }, { "ph": "s", "id": 7431, "pid": 76337, "tid": -914061504, "ts": 1716454216677538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216677615, "dur": 0, "args": { "External id": 7442, "cbid": 317, "correlation": 7442 } }, { "ph": "f", "id": 7442, "pid": 76337, "tid": -914061504, "ts": 1716454216677615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216677616, "dur": 0, "args": { "External id": 7443, "cbid": 203, "correlation": 7443 } }, { "ph": "f", "id": 7443, "pid": 76337, "tid": -914061504, "ts": 1716454216677616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216677617, "dur": 0, "args": { "External id": 7444, "cbid": 205, "correlation": 7444 } }, { "ph": "f", "id": 7444, "pid": 76337, "tid": -914061504, "ts": 1716454216677617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679525, "dur": 127, "args": { "External id": 7448, "cbid": 251, "correlation": 7448 } }, { "ph": "f", "id": 7448, "pid": 76337, "tid": -914061504, "ts": 1716454216679525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679654, "dur": 1, "args": { "External id": 7449, "cbid": 251, "correlation": 7449 } }, { "ph": "f", "id": 7449, "pid": 76337, "tid": -914061504, "ts": 1716454216679654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679655, "dur": 47, "args": { "External id": 7450, "cbid": 251, "correlation": 7450 } }, { "ph": "f", "id": 7450, "pid": 76337, "tid": -914061504, "ts": 1716454216679655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679704, "dur": 1, "args": { "External id": 7451, "cbid": 251, "correlation": 7451 } }, { "ph": "f", "id": 7451, "pid": 76337, "tid": -914061504, "ts": 1716454216679704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679706, "dur": 59, "args": { "External id": 7452, "cbid": 251, "correlation": 7452 } }, { "ph": "f", "id": 7452, "pid": 76337, "tid": -914061504, "ts": 1716454216679706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679766, "dur": 1, "args": { "External id": 7453, "cbid": 251, "correlation": 7453 } }, { "ph": "f", "id": 7453, "pid": 76337, "tid": -914061504, "ts": 1716454216679766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679768, "dur": 35, "args": { "External id": 7454, "cbid": 251, "correlation": 7454 } }, { "ph": "f", "id": 7454, "pid": 76337, "tid": -914061504, "ts": 1716454216679768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679804, "dur": 1, "args": { "External id": 7455, "cbid": 251, "correlation": 7455 } }, { "ph": "f", "id": 7455, "pid": 76337, "tid": -914061504, "ts": 1716454216679804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216679807, "dur": 0, "args": { "External id": 7456, "cbid": 251, "correlation": 7456 } }, { "ph": "f", "id": 7456, "pid": 76337, "tid": -914061504, "ts": 1716454216679807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216679831, "dur": 123, "args": { "External id": 7457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7457, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7457, "pid": 5, "tid": 7, "ts": 1716454216679831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216679813, "dur": 19, "args": { "External id": 7457, "cbid": 211, "correlation": 7457 } }, { "ph": "s", "id": 7457, "pid": 76337, "tid": -914061504, "ts": 1716454216679813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216680393, "dur": 64, "args": { "External id": 7463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7463, "pid": 5, "tid": 7, "ts": 1716454216680393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216680379, "dur": 14, "args": { "External id": 7463, "cbid": 211, "correlation": 7463 } }, { "ph": "s", "id": 7463, "pid": 76337, "tid": -914061504, "ts": 1716454216680379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216680642, "dur": 164, "args": { "External id": 7474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7474, "pid": 5, "tid": 7, "ts": 1716454216680642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216680623, "dur": 19, "args": { "External id": 7474, "cbid": 211, "correlation": 7474 } }, { "ph": "s", "id": 7474, "pid": 76337, "tid": -914061504, "ts": 1716454216680623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216680814, "dur": 102, "args": { "External id": 7496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7496, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7496, "pid": 5, "tid": 7, "ts": 1716454216680814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216680688, "dur": 56, "args": { "External id": 7496, "cbid": 211, "correlation": 7496 } }, { "ph": "s", "id": 7496, "pid": 76337, "tid": -914061504, "ts": 1716454216680688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216680892, "dur": 2, "args": { "External id": 7507, "cbid": 251, "correlation": 7507 } }, { "ph": "f", "id": 7507, "pid": 76337, "tid": -914061504, "ts": 1716454216680892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216680918, "dur": 119, "args": { "External id": 7508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7508, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7508, "pid": 5, "tid": 7, "ts": 1716454216680918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216680901, "dur": 15, "args": { "External id": 7508, "cbid": 211, "correlation": 7508 } }, { "ph": "s", "id": 7508, "pid": 76337, "tid": -914061504, "ts": 1716454216680901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216680986, "dur": 1, "args": { "External id": 7519, "cbid": 251, "correlation": 7519 } }, { "ph": "f", "id": 7519, "pid": 76337, "tid": -914061504, "ts": 1716454216680986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216681039, "dur": 116, "args": { "External id": 7520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7520, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7520, "pid": 5, "tid": 7, "ts": 1716454216681039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216680990, "dur": 13, "args": { "External id": 7520, "cbid": 211, "correlation": 7520 } }, { "ph": "s", "id": 7520, "pid": 76337, "tid": -914061504, "ts": 1716454216680990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216681073, "dur": 1, "args": { "External id": 7531, "cbid": 251, "correlation": 7531 } }, { "ph": "f", "id": 7531, "pid": 76337, "tid": -914061504, "ts": 1716454216681073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216681156, "dur": 111, "args": { "External id": 7532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7532, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7532, "pid": 5, "tid": 7, "ts": 1716454216681156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681077, "dur": 12, "args": { "External id": 7532, "cbid": 211, "correlation": 7532 } }, { "ph": "s", "id": 7532, "pid": 76337, "tid": -914061504, "ts": 1716454216681077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216681337, "dur": 5218, "args": { "External id": 7553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7553, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7553, "pid": 5, "tid": 7, "ts": 1716454216681337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681219, "dur": 117, "args": { "External id": 7553, "cbid": 211, "correlation": 7553 } }, { "ph": "s", "id": 7553, "pid": 76337, "tid": -914061504, "ts": 1716454216681219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216681464, "dur": 2, "args": { "External id": 7571, "cbid": 251, "correlation": 7571 } }, { "ph": "f", "id": 7571, "pid": 76337, "tid": -914061504, "ts": 1716454216681464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216686556, "dur": 114, "args": { "External id": 7573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7573, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7573, "pid": 5, "tid": 7, "ts": 1716454216686556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681471, "dur": 14, "args": { "External id": 7573, "cbid": 211, "correlation": 7573 } }, { "ph": "s", "id": 7573, "pid": 76337, "tid": -914061504, "ts": 1716454216681471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216686671, "dur": 35, "args": { "External id": 7581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7581, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7581, "pid": 5, "tid": 7, "ts": 1716454216686671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681567, "dur": 15, "args": { "External id": 7581, "cbid": 211, "correlation": 7581 } }, { "ph": "s", "id": 7581, "pid": 76337, "tid": -914061504, "ts": 1716454216681567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216686708, "dur": 172, "args": { "External id": 7589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7589, "pid": 5, "tid": 7, "ts": 1716454216686708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681617, "dur": 10, "args": { "External id": 7589, "cbid": 211, "correlation": 7589 } }, { "ph": "s", "id": 7589, "pid": 76337, "tid": -914061504, "ts": 1716454216681617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216686881, "dur": 103, "args": { "External id": 7611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7611, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7611, "pid": 5, "tid": 7, "ts": 1716454216686881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681686, "dur": 12, "args": { "External id": 7611, "cbid": 211, "correlation": 7611 } }, { "ph": "s", "id": 7611, "pid": 76337, "tid": -914061504, "ts": 1716454216681686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216681777, "dur": 1, "args": { "External id": 7622, "cbid": 251, "correlation": 7622 } }, { "ph": "f", "id": 7622, "pid": 76337, "tid": -914061504, "ts": 1716454216681777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216686986, "dur": 118, "args": { "External id": 7623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7623, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7623, "pid": 5, "tid": 7, "ts": 1716454216686986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681782, "dur": 13, "args": { "External id": 7623, "cbid": 211, "correlation": 7623 } }, { "ph": "s", "id": 7623, "pid": 76337, "tid": -914061504, "ts": 1716454216681782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216681921, "dur": 48, "args": { "External id": 7634, "cbid": 251, "correlation": 7634 } }, { "ph": "f", "id": 7634, "pid": 76337, "tid": -914061504, "ts": 1716454216681921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216681982, "dur": 0, "args": { "External id": 7635, "cbid": 251, "correlation": 7635 } }, { "ph": "f", "id": 7635, "pid": 76337, "tid": -914061504, "ts": 1716454216681982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216687112, "dur": 8, "args": { "External id": 7636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7636, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 7636, "pid": 5, "tid": 7, "ts": 1716454216687112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216681985, "dur": 22, "args": { "External id": 7636, "cbid": 211, "correlation": 7636 } }, { "ph": "s", "id": 7636, "pid": 76337, "tid": -914061504, "ts": 1716454216681985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216687126, "dur": 5, "args": { "External id": 7638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7638, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 7638, "pid": 5, "tid": 7, "ts": 1716454216687126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682015, "dur": 98, "args": { "External id": 7638, "cbid": 211, "correlation": 7638 } }, { "ph": "s", "id": 7638, "pid": 76337, "tid": -914061504, "ts": 1716454216682015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216682192, "dur": 1, "args": { "External id": 7649, "cbid": 251, "correlation": 7649 } }, { "ph": "f", "id": 7649, "pid": 76337, "tid": -914061504, "ts": 1716454216682192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216682196, "dur": 0, "args": { "External id": 7650, "cbid": 251, "correlation": 7650 } }, { "ph": "f", "id": 7650, "pid": 76337, "tid": -914061504, "ts": 1716454216682196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216687132, "dur": 7, "args": { "External id": 7651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7651, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 7651, "pid": 5, "tid": 7, "ts": 1716454216687132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682198, "dur": 14, "args": { "External id": 7651, "cbid": 211, "correlation": 7651 } }, { "ph": "s", "id": 7651, "pid": 76337, "tid": -914061504, "ts": 1716454216682198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216687141, "dur": 4, "args": { "External id": 7653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7653, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 7653, "pid": 5, "tid": 7, "ts": 1716454216687141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682214, "dur": 6, "args": { "External id": 7653, "cbid": 211, "correlation": 7653 } }, { "ph": "s", "id": 7653, "pid": 76337, "tid": -914061504, "ts": 1716454216682214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216687146, "dur": 177, "args": { "External id": 7674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7674, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7674, "pid": 5, "tid": 7, "ts": 1716454216687146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682292, "dur": 13, "args": { "External id": 7674, "cbid": 211, "correlation": 7674 } }, { "ph": "s", "id": 7674, "pid": 76337, "tid": -914061504, "ts": 1716454216682292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216682405, "dur": 1, "args": { "External id": 7692, "cbid": 251, "correlation": 7692 } }, { "ph": "f", "id": 7692, "pid": 76337, "tid": -914061504, "ts": 1716454216682405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216687325, "dur": 117, "args": { "External id": 7694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7694, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7694, "pid": 5, "tid": 7, "ts": 1716454216687325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682411, "dur": 15, "args": { "External id": 7694, "cbid": 211, "correlation": 7694 } }, { "ph": "s", "id": 7694, "pid": 76337, "tid": -914061504, "ts": 1716454216682411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216687443, "dur": 35, "args": { "External id": 7702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7702, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7702, "pid": 5, "tid": 7, "ts": 1716454216687443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682486, "dur": 12, "args": { "External id": 7702, "cbid": 211, "correlation": 7702 } }, { "ph": "s", "id": 7702, "pid": 76337, "tid": -914061504, "ts": 1716454216682486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216687479, "dur": 51, "args": { "External id": 7710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7710, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7710, "pid": 5, "tid": 7, "ts": 1716454216687479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682528, "dur": 9, "args": { "External id": 7710, "cbid": 211, "correlation": 7710 } }, { "ph": "s", "id": 7710, "pid": 76337, "tid": -914061504, "ts": 1716454216682528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216687531, "dur": 103, "args": { "External id": 7732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7732, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7732, "pid": 5, "tid": 7, "ts": 1716454216687531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682581, "dur": 10, "args": { "External id": 7732, "cbid": 211, "correlation": 7732 } }, { "ph": "s", "id": 7732, "pid": 76337, "tid": -914061504, "ts": 1716454216682581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216682731, "dur": 74, "args": { "External id": 7748, "cbid": 251, "correlation": 7748 } }, { "ph": "f", "id": 7748, "pid": 76337, "tid": -914061504, "ts": 1716454216682731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216687646, "dur": 624, "args": { "External id": 7750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7750, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7750, "pid": 5, "tid": 7, "ts": 1716454216687646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216682812, "dur": 20, "args": { "External id": 7750, "cbid": 211, "correlation": 7750 } }, { "ph": "s", "id": 7750, "pid": 76337, "tid": -914061504, "ts": 1716454216682812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216683002, "dur": 2, "args": { "External id": 7758, "cbid": 317, "correlation": 7758 } }, { "ph": "f", "id": 7758, "pid": 76337, "tid": -914061504, "ts": 1716454216683002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216683005, "dur": 9502, "args": { "External id": 7759, "cbid": 20, "correlation": 7759 } }, { "ph": "f", "id": 7759, "pid": 76337, "tid": -914061504, "ts": 1716454216683005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216692659, "dur": 260, "args": { "External id": 7762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7762, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7762, "pid": 5, "tid": 7, "ts": 1716454216692659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216692571, "dur": 88, "args": { "External id": 7762, "cbid": 211, "correlation": 7762 } }, { "ph": "s", "id": 7762, "pid": 76337, "tid": -914061504, "ts": 1716454216692571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216692705, "dur": 1, "args": { "External id": 7770, "cbid": 317, "correlation": 7770 } }, { "ph": "f", "id": 7770, "pid": 76337, "tid": -914061504, "ts": 1716454216692705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216692707, "dur": 3543, "args": { "External id": 7771, "cbid": 20, "correlation": 7771 } }, { "ph": "f", "id": 7771, "pid": 76337, "tid": -914061504, "ts": 1716454216692707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216696349, "dur": 261, "args": { "External id": 7774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7774, "pid": 5, "tid": 7, "ts": 1716454216696349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696271, "dur": 78, "args": { "External id": 7774, "cbid": 211, "correlation": 7774 } }, { "ph": "s", "id": 7774, "pid": 76337, "tid": -914061504, "ts": 1716454216696271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696506, "dur": 3, "args": { "External id": 7790, "cbid": 251, "correlation": 7790 } }, { "ph": "f", "id": 7790, "pid": 76337, "tid": -914061504, "ts": 1716454216696506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696513, "dur": 0, "args": { "External id": 7792, "cbid": 251, "correlation": 7792 } }, { "ph": "f", "id": 7792, "pid": 76337, "tid": -914061504, "ts": 1716454216696513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216696611, "dur": 377, "args": { "External id": 7793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7793, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 7793, "pid": 5, "tid": 7, "ts": 1716454216696611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696531, "dur": 16, "args": { "External id": 7793, "cbid": 211, "correlation": 7793 } }, { "ph": "s", "id": 7793, "pid": 76337, "tid": -914061504, "ts": 1716454216696531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216696990, "dur": 50, "args": { "External id": 7801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7801, "pid": 5, "tid": 7, "ts": 1716454216696990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696589, "dur": 11, "args": { "External id": 7801, "cbid": 211, "correlation": 7801 } }, { "ph": "s", "id": 7801, "pid": 76337, "tid": -914061504, "ts": 1716454216696589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216697042, "dur": 172, "args": { "External id": 7812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7812, "pid": 5, "tid": 7, "ts": 1716454216697042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696687, "dur": 14, "args": { "External id": 7812, "cbid": 211, "correlation": 7812 } }, { "ph": "s", "id": 7812, "pid": 76337, "tid": -914061504, "ts": 1716454216696687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216696793, "dur": 0, "args": { "External id": 7824, "cbid": 317, "correlation": 7824 } }, { "ph": "f", "id": 7824, "pid": 76337, "tid": -914061504, "ts": 1716454216696793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216696794, "dur": 1, "args": { "External id": 7825, "cbid": 203, "correlation": 7825 } }, { "ph": "f", "id": 7825, "pid": 76337, "tid": -914061504, "ts": 1716454216696794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216696796, "dur": 1, "args": { "External id": 7826, "cbid": 205, "correlation": 7826 } }, { "ph": "f", "id": 7826, "pid": 76337, "tid": -914061504, "ts": 1716454216696796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696836, "dur": 1, "args": { "External id": 7830, "cbid": 251, "correlation": 7830 } }, { "ph": "f", "id": 7830, "pid": 76337, "tid": -914061504, "ts": 1716454216696836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696839, "dur": 1, "args": { "External id": 7831, "cbid": 251, "correlation": 7831 } }, { "ph": "f", "id": 7831, "pid": 76337, "tid": -914061504, "ts": 1716454216696839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696840, "dur": 0, "args": { "External id": 7832, "cbid": 251, "correlation": 7832 } }, { "ph": "f", "id": 7832, "pid": 76337, "tid": -914061504, "ts": 1716454216696840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696842, "dur": 0, "args": { "External id": 7833, "cbid": 251, "correlation": 7833 } }, { "ph": "f", "id": 7833, "pid": 76337, "tid": -914061504, "ts": 1716454216696842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696843, "dur": 1, "args": { "External id": 7834, "cbid": 251, "correlation": 7834 } }, { "ph": "f", "id": 7834, "pid": 76337, "tid": -914061504, "ts": 1716454216696843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696845, "dur": 1, "args": { "External id": 7835, "cbid": 251, "correlation": 7835 } }, { "ph": "f", "id": 7835, "pid": 76337, "tid": -914061504, "ts": 1716454216696845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696847, "dur": 1, "args": { "External id": 7836, "cbid": 251, "correlation": 7836 } }, { "ph": "f", "id": 7836, "pid": 76337, "tid": -914061504, "ts": 1716454216696847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696848, "dur": 0, "args": { "External id": 7837, "cbid": 251, "correlation": 7837 } }, { "ph": "f", "id": 7837, "pid": 76337, "tid": -914061504, "ts": 1716454216696848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216696850, "dur": 0, "args": { "External id": 7838, "cbid": 251, "correlation": 7838 } }, { "ph": "f", "id": 7838, "pid": 76337, "tid": -914061504, "ts": 1716454216696850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216697216, "dur": 124, "args": { "External id": 7839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7839, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 7839, "pid": 5, "tid": 7, "ts": 1716454216697216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696853, "dur": 13, "args": { "External id": 7839, "cbid": 211, "correlation": 7839 } }, { "ph": "s", "id": 7839, "pid": 76337, "tid": -914061504, "ts": 1716454216696853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216697342, "dur": 64, "args": { "External id": 7845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7845, "pid": 5, "tid": 7, "ts": 1716454216697342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696907, "dur": 10, "args": { "External id": 7845, "cbid": 211, "correlation": 7845 } }, { "ph": "s", "id": 7845, "pid": 76337, "tid": -914061504, "ts": 1716454216696907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216697407, "dur": 50, "args": { "External id": 7853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7853, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7853, "pid": 5, "tid": 7, "ts": 1716454216697407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216696943, "dur": 9, "args": { "External id": 7853, "cbid": 211, "correlation": 7853 } }, { "ph": "s", "id": 7853, "pid": 76337, "tid": -914061504, "ts": 1716454216696943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216697458, "dur": 58, "args": { "External id": 7873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7873, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 7873, "pid": 5, "tid": 7, "ts": 1716454216697458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697079, "dur": 15, "args": { "External id": 7873, "cbid": 211, "correlation": 7873 } }, { "ph": "s", "id": 7873, "pid": 76337, "tid": -914061504, "ts": 1716454216697079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216697518, "dur": 5, "args": { "External id": 7885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7885, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7885, "pid": 5, "tid": 7, "ts": 1716454216697518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697116, "dur": 9, "args": { "External id": 7885, "cbid": 211, "correlation": 7885 } }, { "ph": "s", "id": 7885, "pid": 76337, "tid": -914061504, "ts": 1716454216697116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216697524, "dur": 59, "args": { "External id": 7888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7888, "pid": 5, "tid": 7, "ts": 1716454216697524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697140, "dur": 8, "args": { "External id": 7888, "cbid": 211, "correlation": 7888 } }, { "ph": "s", "id": 7888, "pid": 76337, "tid": -914061504, "ts": 1716454216697140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216697584, "dur": 38, "args": { "External id": 7897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7897, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7897, "pid": 5, "tid": 7, "ts": 1716454216697584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697196, "dur": 11, "args": { "External id": 7897, "cbid": 211, "correlation": 7897 } }, { "ph": "s", "id": 7897, "pid": 76337, "tid": -914061504, "ts": 1716454216697196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216697268, "dur": 0, "args": { "External id": 7907, "cbid": 317, "correlation": 7907 } }, { "ph": "f", "id": 7907, "pid": 76337, "tid": -914061504, "ts": 1716454216697268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216697269, "dur": 0, "args": { "External id": 7908, "cbid": 203, "correlation": 7908 } }, { "ph": "f", "id": 7908, "pid": 76337, "tid": -914061504, "ts": 1716454216697269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216697270, "dur": 0, "args": { "External id": 7909, "cbid": 205, "correlation": 7909 } }, { "ph": "f", "id": 7909, "pid": 76337, "tid": -914061504, "ts": 1716454216697270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216697623, "dur": 42, "args": { "External id": 7913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7913, "pid": 5, "tid": 7, "ts": 1716454216697623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697293, "dur": 15, "args": { "External id": 7913, "cbid": 211, "correlation": 7913 } }, { "ph": "s", "id": 7913, "pid": 76337, "tid": -914061504, "ts": 1716454216697293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216697667, "dur": 16, "args": { "External id": 7915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7915, "pid": 5, "tid": 7, "ts": 1716454216697667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697311, "dur": 5, "args": { "External id": 7915, "cbid": 211, "correlation": 7915 } }, { "ph": "s", "id": 7915, "pid": 76337, "tid": -914061504, "ts": 1716454216697311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216697684, "dur": 4, "args": { "External id": 7917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7917, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7917, "pid": 5, "tid": 7, "ts": 1716454216697684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697329, "dur": 8, "args": { "External id": 7917, "cbid": 211, "correlation": 7917 } }, { "ph": "s", "id": 7917, "pid": 76337, "tid": -914061504, "ts": 1716454216697329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216697343, "dur": 0, "args": { "External id": 7918, "cbid": 51, "correlation": 7918 } }, { "ph": "s", "id": 7918, "pid": 76337, "tid": -914061504, "ts": 1716454216697343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216697689, "dur": 772, "args": { "External id": 7919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7919, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 7919, "pid": 5, "tid": 7, "ts": 1716454216697689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697345, "dur": 7, "args": { "External id": 7919, "cbid": 211, "correlation": 7919 } }, { "ph": "s", "id": 7919, "pid": 76337, "tid": -914061504, "ts": 1716454216697345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216698462, "dur": 63, "args": { "External id": 7924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7924, "pid": 5, "tid": 7, "ts": 1716454216698462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697376, "dur": 10, "args": { "External id": 7924, "cbid": 211, "correlation": 7924 } }, { "ph": "s", "id": 7924, "pid": 76337, "tid": -914061504, "ts": 1716454216697376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216698527, "dur": 4, "args": { "External id": 7932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7932, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7932, "pid": 5, "tid": 7, "ts": 1716454216698527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697421, "dur": 9, "args": { "External id": 7932, "cbid": 211, "correlation": 7932 } }, { "ph": "s", "id": 7932, "pid": 76337, "tid": -914061504, "ts": 1716454216697421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216697513, "dur": 2, "args": { "External id": 7948, "cbid": 251, "correlation": 7948 } }, { "ph": "f", "id": 7948, "pid": 76337, "tid": -914061504, "ts": 1716454216697513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216697518, "dur": 0, "args": { "External id": 7950, "cbid": 251, "correlation": 7950 } }, { "ph": "f", "id": 7950, "pid": 76337, "tid": -914061504, "ts": 1716454216697518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216698532, "dur": 12, "args": { "External id": 7951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7951, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 7951, "pid": 5, "tid": 7, "ts": 1716454216698532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697521, "dur": 14, "args": { "External id": 7951, "cbid": 211, "correlation": 7951 } }, { "ph": "s", "id": 7951, "pid": 76337, "tid": -914061504, "ts": 1716454216697521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216698546, "dur": 5, "args": { "External id": 7953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7953, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 7953, "pid": 5, "tid": 7, "ts": 1716454216698546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697539, "dur": 7, "args": { "External id": 7953, "cbid": 211, "correlation": 7953 } }, { "ph": "s", "id": 7953, "pid": 76337, "tid": -914061504, "ts": 1716454216697539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216698553, "dur": 57, "args": { "External id": 7963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7963, "pid": 5, "tid": 7, "ts": 1716454216698553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697615, "dur": 12, "args": { "External id": 7963, "cbid": 211, "correlation": 7963 } }, { "ph": "s", "id": 7963, "pid": 76337, "tid": -914061504, "ts": 1716454216697615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216698612, "dur": 56, "args": { "External id": 7983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7983, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 7983, "pid": 5, "tid": 7, "ts": 1716454216698612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697684, "dur": 11, "args": { "External id": 7983, "cbid": 211, "correlation": 7983 } }, { "ph": "s", "id": 7983, "pid": 76337, "tid": -914061504, "ts": 1716454216697684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216698669, "dur": 4, "args": { "External id": 7995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7995, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 7995, "pid": 5, "tid": 7, "ts": 1716454216698669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697705, "dur": 6, "args": { "External id": 7995, "cbid": 211, "correlation": 7995 } }, { "ph": "s", "id": 7995, "pid": 76337, "tid": -914061504, "ts": 1716454216697705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216698675, "dur": 58, "args": { "External id": 7998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 7998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 7998, "pid": 5, "tid": 7, "ts": 1716454216698675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697723, "dur": 7, "args": { "External id": 7998, "cbid": 211, "correlation": 7998 } }, { "ph": "s", "id": 7998, "pid": 76337, "tid": -914061504, "ts": 1716454216697723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216698734, "dur": 38, "args": { "External id": 8007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8007, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8007, "pid": 5, "tid": 7, "ts": 1716454216698734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697764, "dur": 10, "args": { "External id": 8007, "cbid": 211, "correlation": 8007 } }, { "ph": "s", "id": 8007, "pid": 76337, "tid": -914061504, "ts": 1716454216697764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216697842, "dur": 0, "args": { "External id": 8017, "cbid": 317, "correlation": 8017 } }, { "ph": "f", "id": 8017, "pid": 76337, "tid": -914061504, "ts": 1716454216697842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216697843, "dur": 0, "args": { "External id": 8018, "cbid": 203, "correlation": 8018 } }, { "ph": "f", "id": 8018, "pid": 76337, "tid": -914061504, "ts": 1716454216697843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216697844, "dur": 0, "args": { "External id": 8019, "cbid": 205, "correlation": 8019 } }, { "ph": "f", "id": 8019, "pid": 76337, "tid": -914061504, "ts": 1716454216697844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216698774, "dur": 41, "args": { "External id": 8023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8023, "pid": 5, "tid": 7, "ts": 1716454216698774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697858, "dur": 13, "args": { "External id": 8023, "cbid": 211, "correlation": 8023 } }, { "ph": "s", "id": 8023, "pid": 76337, "tid": -914061504, "ts": 1716454216697858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216698816, "dur": 15, "args": { "External id": 8025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8025, "pid": 5, "tid": 7, "ts": 1716454216698816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697874, "dur": 5, "args": { "External id": 8025, "cbid": 211, "correlation": 8025 } }, { "ph": "s", "id": 8025, "pid": 76337, "tid": -914061504, "ts": 1716454216697874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216698833, "dur": 3, "args": { "External id": 8027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 8027, "pid": 5, "tid": 7, "ts": 1716454216698833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697883, "dur": 5, "args": { "External id": 8027, "cbid": 211, "correlation": 8027 } }, { "ph": "s", "id": 8027, "pid": 76337, "tid": -914061504, "ts": 1716454216697883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216697891, "dur": 0, "args": { "External id": 8028, "cbid": 51, "correlation": 8028 } }, { "ph": "s", "id": 8028, "pid": 76337, "tid": -914061504, "ts": 1716454216697891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216698838, "dur": 766, "args": { "External id": 8029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8029, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8029, "pid": 5, "tid": 7, "ts": 1716454216698838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697892, "dur": 5, "args": { "External id": 8029, "cbid": 211, "correlation": 8029 } }, { "ph": "s", "id": 8029, "pid": 76337, "tid": -914061504, "ts": 1716454216697892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216699606, "dur": 63, "args": { "External id": 8034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8034, "pid": 5, "tid": 7, "ts": 1716454216699606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697920, "dur": 9, "args": { "External id": 8034, "cbid": 211, "correlation": 8034 } }, { "ph": "s", "id": 8034, "pid": 76337, "tid": -914061504, "ts": 1716454216697920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216699670, "dur": 50, "args": { "External id": 8042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8042, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8042, "pid": 5, "tid": 7, "ts": 1716454216699670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697953, "dur": 9, "args": { "External id": 8042, "cbid": 211, "correlation": 8042 } }, { "ph": "s", "id": 8042, "pid": 76337, "tid": -914061504, "ts": 1716454216697953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216699721, "dur": 35, "args": { "External id": 8050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8050, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8050, "pid": 5, "tid": 7, "ts": 1716454216699721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216697999, "dur": 10, "args": { "External id": 8050, "cbid": 211, "correlation": 8050 } }, { "ph": "s", "id": 8050, "pid": 76337, "tid": -914061504, "ts": 1716454216697999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216699758, "dur": 57, "args": { "External id": 8070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8070, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 8070, "pid": 5, "tid": 7, "ts": 1716454216699758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698099, "dur": 14, "args": { "External id": 8070, "cbid": 211, "correlation": 8070 } }, { "ph": "s", "id": 8070, "pid": 76337, "tid": -914061504, "ts": 1716454216698099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216699816, "dur": 4, "args": { "External id": 8082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8082, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 8082, "pid": 5, "tid": 7, "ts": 1716454216699816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698129, "dur": 8, "args": { "External id": 8082, "cbid": 211, "correlation": 8082 } }, { "ph": "s", "id": 8082, "pid": 76337, "tid": -914061504, "ts": 1716454216698129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216699822, "dur": 60, "args": { "External id": 8085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8085, "pid": 5, "tid": 7, "ts": 1716454216699822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698149, "dur": 7, "args": { "External id": 8085, "cbid": 211, "correlation": 8085 } }, { "ph": "s", "id": 8085, "pid": 76337, "tid": -914061504, "ts": 1716454216698149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216698218, "dur": 0, "args": { "External id": 8096, "cbid": 317, "correlation": 8096 } }, { "ph": "f", "id": 8096, "pid": 76337, "tid": -914061504, "ts": 1716454216698218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216698219, "dur": 0, "args": { "External id": 8097, "cbid": 203, "correlation": 8097 } }, { "ph": "f", "id": 8097, "pid": 76337, "tid": -914061504, "ts": 1716454216698219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216698220, "dur": 0, "args": { "External id": 8098, "cbid": 205, "correlation": 8098 } }, { "ph": "f", "id": 8098, "pid": 76337, "tid": -914061504, "ts": 1716454216698220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698242, "dur": 1, "args": { "External id": 8102, "cbid": 251, "correlation": 8102 } }, { "ph": "f", "id": 8102, "pid": 76337, "tid": -914061504, "ts": 1716454216698242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698244, "dur": 0, "args": { "External id": 8103, "cbid": 251, "correlation": 8103 } }, { "ph": "f", "id": 8103, "pid": 76337, "tid": -914061504, "ts": 1716454216698244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698245, "dur": 0, "args": { "External id": 8104, "cbid": 251, "correlation": 8104 } }, { "ph": "f", "id": 8104, "pid": 76337, "tid": -914061504, "ts": 1716454216698245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698246, "dur": 0, "args": { "External id": 8105, "cbid": 251, "correlation": 8105 } }, { "ph": "f", "id": 8105, "pid": 76337, "tid": -914061504, "ts": 1716454216698246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698246, "dur": 0, "args": { "External id": 8106, "cbid": 251, "correlation": 8106 } }, { "ph": "f", "id": 8106, "pid": 76337, "tid": -914061504, "ts": 1716454216698246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698247, "dur": 0, "args": { "External id": 8107, "cbid": 251, "correlation": 8107 } }, { "ph": "f", "id": 8107, "pid": 76337, "tid": -914061504, "ts": 1716454216698247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698248, "dur": 0, "args": { "External id": 8108, "cbid": 251, "correlation": 8108 } }, { "ph": "f", "id": 8108, "pid": 76337, "tid": -914061504, "ts": 1716454216698248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698248, "dur": 0, "args": { "External id": 8109, "cbid": 251, "correlation": 8109 } }, { "ph": "f", "id": 8109, "pid": 76337, "tid": -914061504, "ts": 1716454216698248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698250, "dur": 0, "args": { "External id": 8110, "cbid": 251, "correlation": 8110 } }, { "ph": "f", "id": 8110, "pid": 76337, "tid": -914061504, "ts": 1716454216698250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216699883, "dur": 122, "args": { "External id": 8111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8111, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8111, "pid": 5, "tid": 7, "ts": 1716454216699883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698252, "dur": 13, "args": { "External id": 8111, "cbid": 211, "correlation": 8111 } }, { "ph": "s", "id": 8111, "pid": 76337, "tid": -914061504, "ts": 1716454216698252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216700007, "dur": 63, "args": { "External id": 8117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8117, "pid": 5, "tid": 7, "ts": 1716454216700007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698288, "dur": 9, "args": { "External id": 8117, "cbid": 211, "correlation": 8117 } }, { "ph": "s", "id": 8117, "pid": 76337, "tid": -914061504, "ts": 1716454216698288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216700071, "dur": 163, "args": { "External id": 8128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8128, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8128, "pid": 5, "tid": 7, "ts": 1716454216700071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698399, "dur": 15, "args": { "External id": 8128, "cbid": 211, "correlation": 8128 } }, { "ph": "s", "id": 8128, "pid": 76337, "tid": -914061504, "ts": 1716454216698399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216700235, "dur": 103, "args": { "External id": 8150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8150, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8150, "pid": 5, "tid": 7, "ts": 1716454216700235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698435, "dur": 8, "args": { "External id": 8150, "cbid": 211, "correlation": 8150 } }, { "ph": "s", "id": 8150, "pid": 76337, "tid": -914061504, "ts": 1716454216698435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698539, "dur": 1, "args": { "External id": 8161, "cbid": 251, "correlation": 8161 } }, { "ph": "f", "id": 8161, "pid": 76337, "tid": -914061504, "ts": 1716454216698539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216700340, "dur": 121, "args": { "External id": 8162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8162, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8162, "pid": 5, "tid": 7, "ts": 1716454216700340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698544, "dur": 13, "args": { "External id": 8162, "cbid": 211, "correlation": 8162 } }, { "ph": "s", "id": 8162, "pid": 76337, "tid": -914061504, "ts": 1716454216698544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698628, "dur": 1, "args": { "External id": 8173, "cbid": 251, "correlation": 8173 } }, { "ph": "f", "id": 8173, "pid": 76337, "tid": -914061504, "ts": 1716454216698628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216700462, "dur": 113, "args": { "External id": 8174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8174, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8174, "pid": 5, "tid": 7, "ts": 1716454216700462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698632, "dur": 12, "args": { "External id": 8174, "cbid": 211, "correlation": 8174 } }, { "ph": "s", "id": 8174, "pid": 76337, "tid": -914061504, "ts": 1716454216698632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698710, "dur": 1, "args": { "External id": 8185, "cbid": 251, "correlation": 8185 } }, { "ph": "f", "id": 8185, "pid": 76337, "tid": -914061504, "ts": 1716454216698710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216700576, "dur": 113, "args": { "External id": 8186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8186, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8186, "pid": 5, "tid": 7, "ts": 1716454216700576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698715, "dur": 12, "args": { "External id": 8186, "cbid": 211, "correlation": 8186 } }, { "ph": "s", "id": 8186, "pid": 76337, "tid": -914061504, "ts": 1716454216698715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216700691, "dur": 5237, "args": { "External id": 8207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8207, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8207, "pid": 5, "tid": 7, "ts": 1716454216700691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698813, "dur": 15, "args": { "External id": 8207, "cbid": 211, "correlation": 8207 } }, { "ph": "s", "id": 8207, "pid": 76337, "tid": -914061504, "ts": 1716454216698813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216698941, "dur": 1, "args": { "External id": 8225, "cbid": 251, "correlation": 8225 } }, { "ph": "f", "id": 8225, "pid": 76337, "tid": -914061504, "ts": 1716454216698941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216705929, "dur": 114, "args": { "External id": 8227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8227, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8227, "pid": 5, "tid": 7, "ts": 1716454216705929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216698947, "dur": 14, "args": { "External id": 8227, "cbid": 211, "correlation": 8227 } }, { "ph": "s", "id": 8227, "pid": 76337, "tid": -914061504, "ts": 1716454216698947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216706045, "dur": 35, "args": { "External id": 8235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8235, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8235, "pid": 5, "tid": 7, "ts": 1716454216706045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699035, "dur": 13, "args": { "External id": 8235, "cbid": 211, "correlation": 8235 } }, { "ph": "s", "id": 8235, "pid": 76337, "tid": -914061504, "ts": 1716454216699035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216706081, "dur": 171, "args": { "External id": 8243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8243, "pid": 5, "tid": 7, "ts": 1716454216706081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699078, "dur": 9, "args": { "External id": 8243, "cbid": 211, "correlation": 8243 } }, { "ph": "s", "id": 8243, "pid": 76337, "tid": -914061504, "ts": 1716454216699078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216706254, "dur": 102, "args": { "External id": 8265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8265, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8265, "pid": 5, "tid": 7, "ts": 1716454216706254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699132, "dur": 10, "args": { "External id": 8265, "cbid": 211, "correlation": 8265 } }, { "ph": "s", "id": 8265, "pid": 76337, "tid": -914061504, "ts": 1716454216699132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699218, "dur": 1, "args": { "External id": 8276, "cbid": 251, "correlation": 8276 } }, { "ph": "f", "id": 8276, "pid": 76337, "tid": -914061504, "ts": 1716454216699218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216706357, "dur": 115, "args": { "External id": 8277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8277, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8277, "pid": 5, "tid": 7, "ts": 1716454216706357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699224, "dur": 13, "args": { "External id": 8277, "cbid": 211, "correlation": 8277 } }, { "ph": "s", "id": 8277, "pid": 76337, "tid": -914061504, "ts": 1716454216699224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699302, "dur": 1, "args": { "External id": 8288, "cbid": 251, "correlation": 8288 } }, { "ph": "f", "id": 8288, "pid": 76337, "tid": -914061504, "ts": 1716454216699302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699305, "dur": 0, "args": { "External id": 8289, "cbid": 251, "correlation": 8289 } }, { "ph": "f", "id": 8289, "pid": 76337, "tid": -914061504, "ts": 1716454216699305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216706473, "dur": 10, "args": { "External id": 8290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8290, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 8290, "pid": 5, "tid": 7, "ts": 1716454216706473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699307, "dur": 14, "args": { "External id": 8290, "cbid": 211, "correlation": 8290 } }, { "ph": "s", "id": 8290, "pid": 76337, "tid": -914061504, "ts": 1716454216699307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216706485, "dur": 5, "args": { "External id": 8292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8292, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 8292, "pid": 5, "tid": 7, "ts": 1716454216706485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699324, "dur": 6, "args": { "External id": 8292, "cbid": 211, "correlation": 8292 } }, { "ph": "s", "id": 8292, "pid": 76337, "tid": -914061504, "ts": 1716454216699324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699385, "dur": 1, "args": { "External id": 8303, "cbid": 251, "correlation": 8303 } }, { "ph": "f", "id": 8303, "pid": 76337, "tid": -914061504, "ts": 1716454216699385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699389, "dur": 0, "args": { "External id": 8304, "cbid": 251, "correlation": 8304 } }, { "ph": "f", "id": 8304, "pid": 76337, "tid": -914061504, "ts": 1716454216699389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216706492, "dur": 7, "args": { "External id": 8305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8305, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 8305, "pid": 5, "tid": 7, "ts": 1716454216706492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699390, "dur": 12, "args": { "External id": 8305, "cbid": 211, "correlation": 8305 } }, { "ph": "s", "id": 8305, "pid": 76337, "tid": -914061504, "ts": 1716454216699390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216706500, "dur": 4, "args": { "External id": 8307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8307, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 8307, "pid": 5, "tid": 7, "ts": 1716454216706500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699404, "dur": 5, "args": { "External id": 8307, "cbid": 211, "correlation": 8307 } }, { "ph": "s", "id": 8307, "pid": 76337, "tid": -914061504, "ts": 1716454216699404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216706505, "dur": 169, "args": { "External id": 8328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8328, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8328, "pid": 5, "tid": 7, "ts": 1716454216706505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699478, "dur": 12, "args": { "External id": 8328, "cbid": 211, "correlation": 8328 } }, { "ph": "s", "id": 8328, "pid": 76337, "tid": -914061504, "ts": 1716454216699478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699596, "dur": 1, "args": { "External id": 8346, "cbid": 251, "correlation": 8346 } }, { "ph": "f", "id": 8346, "pid": 76337, "tid": -914061504, "ts": 1716454216699596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216706675, "dur": 114, "args": { "External id": 8348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8348, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8348, "pid": 5, "tid": 7, "ts": 1716454216706675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699602, "dur": 14, "args": { "External id": 8348, "cbid": 211, "correlation": 8348 } }, { "ph": "s", "id": 8348, "pid": 76337, "tid": -914061504, "ts": 1716454216699602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216706791, "dur": 35, "args": { "External id": 8356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8356, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8356, "pid": 5, "tid": 7, "ts": 1716454216706791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699676, "dur": 12, "args": { "External id": 8356, "cbid": 211, "correlation": 8356 } }, { "ph": "s", "id": 8356, "pid": 76337, "tid": -914061504, "ts": 1716454216699676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216706828, "dur": 51, "args": { "External id": 8364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8364, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8364, "pid": 5, "tid": 7, "ts": 1716454216706828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699716, "dur": 9, "args": { "External id": 8364, "cbid": 211, "correlation": 8364 } }, { "ph": "s", "id": 8364, "pid": 76337, "tid": -914061504, "ts": 1716454216699716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216706880, "dur": 102, "args": { "External id": 8386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8386, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8386, "pid": 5, "tid": 7, "ts": 1716454216706880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699768, "dur": 10, "args": { "External id": 8386, "cbid": 211, "correlation": 8386 } }, { "ph": "s", "id": 8386, "pid": 76337, "tid": -914061504, "ts": 1716454216699768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216699859, "dur": 2, "args": { "External id": 8402, "cbid": 251, "correlation": 8402 } }, { "ph": "f", "id": 8402, "pid": 76337, "tid": -914061504, "ts": 1716454216699859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216706983, "dur": 629, "args": { "External id": 8404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8404, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8404, "pid": 5, "tid": 7, "ts": 1716454216706983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699865, "dur": 14, "args": { "External id": 8404, "cbid": 211, "correlation": 8404 } }, { "ph": "s", "id": 8404, "pid": 76337, "tid": -914061504, "ts": 1716454216699865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216707613, "dur": 260, "args": { "External id": 8412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8412, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8412, "pid": 5, "tid": 7, "ts": 1716454216707613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699953, "dur": 13, "args": { "External id": 8412, "cbid": 211, "correlation": 8412 } }, { "ph": "s", "id": 8412, "pid": 76337, "tid": -914061504, "ts": 1716454216699953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216707875, "dur": 261, "args": { "External id": 8420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8420, "pid": 5, "tid": 7, "ts": 1716454216707875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216699999, "dur": 10, "args": { "External id": 8420, "cbid": 211, "correlation": 8420 } }, { "ph": "s", "id": 8420, "pid": 76337, "tid": -914061504, "ts": 1716454216699999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700099, "dur": 2, "args": { "External id": 8436, "cbid": 251, "correlation": 8436 } }, { "ph": "f", "id": 8436, "pid": 76337, "tid": -914061504, "ts": 1716454216700099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700104, "dur": 0, "args": { "External id": 8438, "cbid": 251, "correlation": 8438 } }, { "ph": "f", "id": 8438, "pid": 76337, "tid": -914061504, "ts": 1716454216700104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216708138, "dur": 380, "args": { "External id": 8439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8439, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 8439, "pid": 5, "tid": 7, "ts": 1716454216708138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700108, "dur": 14, "args": { "External id": 8439, "cbid": 211, "correlation": 8439 } }, { "ph": "s", "id": 8439, "pid": 76337, "tid": -914061504, "ts": 1716454216700108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216708519, "dur": 51, "args": { "External id": 8447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8447, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8447, "pid": 5, "tid": 7, "ts": 1716454216708519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700152, "dur": 11, "args": { "External id": 8447, "cbid": 211, "correlation": 8447 } }, { "ph": "s", "id": 8447, "pid": 76337, "tid": -914061504, "ts": 1716454216700152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216708572, "dur": 173, "args": { "External id": 8458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8458, "pid": 5, "tid": 7, "ts": 1716454216708572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700218, "dur": 12, "args": { "External id": 8458, "cbid": 211, "correlation": 8458 } }, { "ph": "s", "id": 8458, "pid": 76337, "tid": -914061504, "ts": 1716454216700218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216700296, "dur": 0, "args": { "External id": 8470, "cbid": 317, "correlation": 8470 } }, { "ph": "f", "id": 8470, "pid": 76337, "tid": -914061504, "ts": 1716454216700296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216700297, "dur": 0, "args": { "External id": 8471, "cbid": 203, "correlation": 8471 } }, { "ph": "f", "id": 8471, "pid": 76337, "tid": -914061504, "ts": 1716454216700297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216700298, "dur": 0, "args": { "External id": 8472, "cbid": 205, "correlation": 8472 } }, { "ph": "f", "id": 8472, "pid": 76337, "tid": -914061504, "ts": 1716454216700298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700321, "dur": 1, "args": { "External id": 8476, "cbid": 251, "correlation": 8476 } }, { "ph": "f", "id": 8476, "pid": 76337, "tid": -914061504, "ts": 1716454216700321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700323, "dur": 0, "args": { "External id": 8477, "cbid": 251, "correlation": 8477 } }, { "ph": "f", "id": 8477, "pid": 76337, "tid": -914061504, "ts": 1716454216700323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700324, "dur": 0, "args": { "External id": 8478, "cbid": 251, "correlation": 8478 } }, { "ph": "f", "id": 8478, "pid": 76337, "tid": -914061504, "ts": 1716454216700324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700324, "dur": 0, "args": { "External id": 8479, "cbid": 251, "correlation": 8479 } }, { "ph": "f", "id": 8479, "pid": 76337, "tid": -914061504, "ts": 1716454216700324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700325, "dur": 0, "args": { "External id": 8480, "cbid": 251, "correlation": 8480 } }, { "ph": "f", "id": 8480, "pid": 76337, "tid": -914061504, "ts": 1716454216700325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700326, "dur": 0, "args": { "External id": 8481, "cbid": 251, "correlation": 8481 } }, { "ph": "f", "id": 8481, "pid": 76337, "tid": -914061504, "ts": 1716454216700326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700327, "dur": 0, "args": { "External id": 8482, "cbid": 251, "correlation": 8482 } }, { "ph": "f", "id": 8482, "pid": 76337, "tid": -914061504, "ts": 1716454216700327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700328, "dur": 0, "args": { "External id": 8483, "cbid": 251, "correlation": 8483 } }, { "ph": "f", "id": 8483, "pid": 76337, "tid": -914061504, "ts": 1716454216700328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216700329, "dur": 0, "args": { "External id": 8484, "cbid": 251, "correlation": 8484 } }, { "ph": "f", "id": 8484, "pid": 76337, "tid": -914061504, "ts": 1716454216700329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216708746, "dur": 125, "args": { "External id": 8485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8485, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8485, "pid": 5, "tid": 7, "ts": 1716454216708746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700331, "dur": 14, "args": { "External id": 8485, "cbid": 211, "correlation": 8485 } }, { "ph": "s", "id": 8485, "pid": 76337, "tid": -914061504, "ts": 1716454216700331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216708873, "dur": 64, "args": { "External id": 8491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8491, "pid": 5, "tid": 7, "ts": 1716454216708873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700368, "dur": 9, "args": { "External id": 8491, "cbid": 211, "correlation": 8491 } }, { "ph": "s", "id": 8491, "pid": 76337, "tid": -914061504, "ts": 1716454216700368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216708938, "dur": 51, "args": { "External id": 8499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8499, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8499, "pid": 5, "tid": 7, "ts": 1716454216708938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216700401, "dur": 9, "args": { "External id": 8499, "cbid": 211, "correlation": 8499 } }, { "ph": "s", "id": 8499, "pid": 76337, "tid": -914061504, "ts": 1716454216700401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216700477, "dur": 0, "args": { "External id": 8509, "cbid": 317, "correlation": 8509 } }, { "ph": "f", "id": 8509, "pid": 76337, "tid": -914061504, "ts": 1716454216700477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216700477, "dur": 0, "args": { "External id": 8510, "cbid": 203, "correlation": 8510 } }, { "ph": "f", "id": 8510, "pid": 76337, "tid": -914061504, "ts": 1716454216700477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216700478, "dur": 0, "args": { "External id": 8511, "cbid": 205, "correlation": 8511 } }, { "ph": "f", "id": 8511, "pid": 76337, "tid": -914061504, "ts": 1716454216700478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216708991, "dur": 42, "args": { "External id": 8515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8515, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8515, "pid": 5, "tid": 7, "ts": 1716454216708991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216703922, "dur": 31, "args": { "External id": 8515, "cbid": 211, "correlation": 8515 } }, { "ph": "s", "id": 8515, "pid": 76337, "tid": -914061504, "ts": 1716454216703922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216709034, "dur": 15, "args": { "External id": 8517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8517, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8517, "pid": 5, "tid": 7, "ts": 1716454216709034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216703957, "dur": 6, "args": { "External id": 8517, "cbid": 211, "correlation": 8517 } }, { "ph": "s", "id": 8517, "pid": 76337, "tid": -914061504, "ts": 1716454216703957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216709052, "dur": 2, "args": { "External id": 8519, "device": 5, "context": 1, "stream": 7, "correlation": 8519, "bytes": 1536, "memory bandwidth (GB/s)": 0.6153846153846154 } }, { "ph": "f", "id": 8519, "pid": 5, "tid": 7, "ts": 1716454216709052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216703982, "dur": 18, "args": { "External id": 8519, "cbid": 51, "correlation": 8519 } }, { "ph": "s", "id": 8519, "pid": 76337, "tid": -914061504, "ts": 1716454216703982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216709071, "dur": 388, "args": { "External id": 8520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8520, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8520, "pid": 5, "tid": 7, "ts": 1716454216709071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216704003, "dur": 171, "args": { "External id": 8520, "cbid": 211, "correlation": 8520 } }, { "ph": "s", "id": 8520, "pid": 76337, "tid": -914061504, "ts": 1716454216704003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216709461, "dur": 13, "args": { "External id": 8522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8522, "pid": 5, "tid": 7, "ts": 1716454216709461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216704183, "dur": 8, "args": { "External id": 8522, "cbid": 211, "correlation": 8522 } }, { "ph": "s", "id": 8522, "pid": 76337, "tid": -914061504, "ts": 1716454216704183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216709476, "dur": 18, "args": { "External id": 8528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8528, "pid": 5, "tid": 7, "ts": 1716454216709476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216704987, "dur": 13, "args": { "External id": 8528, "cbid": 211, "correlation": 8528 } }, { "ph": "s", "id": 8528, "pid": 76337, "tid": -914061504, "ts": 1716454216704987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216709495, "dur": 20, "args": { "External id": 8548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8548, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 8548, "pid": 5, "tid": 7, "ts": 1716454216709495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216705175, "dur": 15, "args": { "External id": 8548, "cbid": 211, "correlation": 8548 } }, { "ph": "s", "id": 8548, "pid": 76337, "tid": -914061504, "ts": 1716454216705175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216709516, "dur": 5, "args": { "External id": 8560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8560, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 8560, "pid": 5, "tid": 7, "ts": 1716454216709516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216705235, "dur": 10, "args": { "External id": 8560, "cbid": 211, "correlation": 8560 } }, { "ph": "s", "id": 8560, "pid": 76337, "tid": -914061504, "ts": 1716454216705235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216709523, "dur": 19, "args": { "External id": 8563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8563, "pid": 5, "tid": 7, "ts": 1716454216709523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216705265, "dur": 7, "args": { "External id": 8563, "cbid": 211, "correlation": 8563 } }, { "ph": "s", "id": 8563, "pid": 76337, "tid": -914061504, "ts": 1716454216705265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216709543, "dur": 12, "args": { "External id": 8572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8572, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8572, "pid": 5, "tid": 7, "ts": 1716454216709543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216705320, "dur": 12, "args": { "External id": 8572, "cbid": 211, "correlation": 8572 } }, { "ph": "s", "id": 8572, "pid": 76337, "tid": -914061504, "ts": 1716454216705320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216705420, "dur": 0, "args": { "External id": 8582, "cbid": 317, "correlation": 8582 } }, { "ph": "f", "id": 8582, "pid": 76337, "tid": -914061504, "ts": 1716454216705420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216705421, "dur": 1, "args": { "External id": 8583, "cbid": 203, "correlation": 8583 } }, { "ph": "f", "id": 8583, "pid": 76337, "tid": -914061504, "ts": 1716454216705421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216705423, "dur": 0, "args": { "External id": 8584, "cbid": 205, "correlation": 8584 } }, { "ph": "f", "id": 8584, "pid": 76337, "tid": -914061504, "ts": 1716454216705423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216709557, "dur": 14, "args": { "External id": 8588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8588, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8588, "pid": 5, "tid": 7, "ts": 1716454216709557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216708166, "dur": 23, "args": { "External id": 8588, "cbid": 211, "correlation": 8588 } }, { "ph": "s", "id": 8588, "pid": 76337, "tid": -914061504, "ts": 1716454216708166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216709572, "dur": 26, "args": { "External id": 8590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8590, "pid": 5, "tid": 7, "ts": 1716454216709572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216708192, "dur": 6, "args": { "External id": 8590, "cbid": 211, "correlation": 8590 } }, { "ph": "s", "id": 8590, "pid": 76337, "tid": -914061504, "ts": 1716454216708192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216709600, "dur": 4, "args": { "External id": 8592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 8592, "pid": 5, "tid": 7, "ts": 1716454216709600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216708205, "dur": 10, "args": { "External id": 8592, "cbid": 211, "correlation": 8592 } }, { "ph": "s", "id": 8592, "pid": 76337, "tid": -914061504, "ts": 1716454216708205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216708218, "dur": 0, "args": { "External id": 8593, "cbid": 51, "correlation": 8593 } }, { "ph": "s", "id": 8593, "pid": 76337, "tid": -914061504, "ts": 1716454216708218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216709615, "dur": 384, "args": { "External id": 8594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8594, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8594, "pid": 5, "tid": 7, "ts": 1716454216709615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216708235, "dur": 120, "args": { "External id": 8594, "cbid": 211, "correlation": 8594 } }, { "ph": "s", "id": 8594, "pid": 76337, "tid": -914061504, "ts": 1716454216708235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216710000, "dur": 22, "args": { "External id": 8595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8595, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8595, "pid": 5, "tid": 7, "ts": 1716454216710000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216708360, "dur": 7, "args": { "External id": 8595, "cbid": 211, "correlation": 8595 } }, { "ph": "s", "id": 8595, "pid": 76337, "tid": -914061504, "ts": 1716454216708360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216710023, "dur": 35, "args": { "External id": 8601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8601, "pid": 5, "tid": 7, "ts": 1716454216710023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709074, "dur": 13, "args": { "External id": 8601, "cbid": 211, "correlation": 8601 } }, { "ph": "s", "id": 8601, "pid": 76337, "tid": -914061504, "ts": 1716454216709074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216710060, "dur": 4, "args": { "External id": 8609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8609, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 8609, "pid": 5, "tid": 7, "ts": 1716454216710060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709149, "dur": 10, "args": { "External id": 8609, "cbid": 211, "correlation": 8609 } }, { "ph": "s", "id": 8609, "pid": 76337, "tid": -914061504, "ts": 1716454216709149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216709304, "dur": 5, "args": { "External id": 8625, "cbid": 251, "correlation": 8625 } }, { "ph": "f", "id": 8625, "pid": 76337, "tid": -914061504, "ts": 1716454216709304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216709315, "dur": 0, "args": { "External id": 8627, "cbid": 251, "correlation": 8627 } }, { "ph": "f", "id": 8627, "pid": 76337, "tid": -914061504, "ts": 1716454216709315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216710065, "dur": 13, "args": { "External id": 8628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8628, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 8628, "pid": 5, "tid": 7, "ts": 1716454216710065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709320, "dur": 17, "args": { "External id": 8628, "cbid": 211, "correlation": 8628 } }, { "ph": "s", "id": 8628, "pid": 76337, "tid": -914061504, "ts": 1716454216709320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216710080, "dur": 5, "args": { "External id": 8630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8630, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 8630, "pid": 5, "tid": 7, "ts": 1716454216710080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709342, "dur": 8, "args": { "External id": 8630, "cbid": 211, "correlation": 8630 } }, { "ph": "s", "id": 8630, "pid": 76337, "tid": -914061504, "ts": 1716454216709342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216710087, "dur": 31, "args": { "External id": 8640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8640, "pid": 5, "tid": 7, "ts": 1716454216710087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709437, "dur": 12, "args": { "External id": 8640, "cbid": 211, "correlation": 8640 } }, { "ph": "s", "id": 8640, "pid": 76337, "tid": -914061504, "ts": 1716454216709437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216710119, "dur": 34, "args": { "External id": 8660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8660, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 8660, "pid": 5, "tid": 7, "ts": 1716454216710119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709529, "dur": 12, "args": { "External id": 8660, "cbid": 211, "correlation": 8660 } }, { "ph": "s", "id": 8660, "pid": 76337, "tid": -914061504, "ts": 1716454216709529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216710155, "dur": 5, "args": { "External id": 8672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8672, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 8672, "pid": 5, "tid": 7, "ts": 1716454216710155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709564, "dur": 9, "args": { "External id": 8672, "cbid": 211, "correlation": 8672 } }, { "ph": "s", "id": 8672, "pid": 76337, "tid": -914061504, "ts": 1716454216709564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216710162, "dur": 33, "args": { "External id": 8675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8675, "pid": 5, "tid": 7, "ts": 1716454216710162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709592, "dur": 7, "args": { "External id": 8675, "cbid": 211, "correlation": 8675 } }, { "ph": "s", "id": 8675, "pid": 76337, "tid": -914061504, "ts": 1716454216709592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216710196, "dur": 23, "args": { "External id": 8684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8684, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8684, "pid": 5, "tid": 7, "ts": 1716454216710196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216709639, "dur": 10, "args": { "External id": 8684, "cbid": 211, "correlation": 8684 } }, { "ph": "s", "id": 8684, "pid": 76337, "tid": -914061504, "ts": 1716454216709639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216709747, "dur": 0, "args": { "External id": 8694, "cbid": 317, "correlation": 8694 } }, { "ph": "f", "id": 8694, "pid": 76337, "tid": -914061504, "ts": 1716454216709747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216709748, "dur": 1, "args": { "External id": 8695, "cbid": 203, "correlation": 8695 } }, { "ph": "f", "id": 8695, "pid": 76337, "tid": -914061504, "ts": 1716454216709748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216709750, "dur": 1, "args": { "External id": 8696, "cbid": 205, "correlation": 8696 } }, { "ph": "f", "id": 8696, "pid": 76337, "tid": -914061504, "ts": 1716454216709750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216712348, "dur": 26, "args": { "External id": 8700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8700, "pid": 5, "tid": 7, "ts": 1716454216712348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216712327, "dur": 21, "args": { "External id": 8700, "cbid": 211, "correlation": 8700 } }, { "ph": "s", "id": 8700, "pid": 76337, "tid": -914061504, "ts": 1716454216712327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216712375, "dur": 49, "args": { "External id": 8702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8702, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8702, "pid": 5, "tid": 7, "ts": 1716454216712375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216712351, "dur": 6, "args": { "External id": 8702, "cbid": 211, "correlation": 8702 } }, { "ph": "s", "id": 8702, "pid": 76337, "tid": -914061504, "ts": 1716454216712351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216712425, "dur": 723, "args": { "External id": 8704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8704, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8704, "pid": 5, "tid": 7, "ts": 1716454216712425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216712369, "dur": 11, "args": { "External id": 8704, "cbid": 211, "correlation": 8704 } }, { "ph": "s", "id": 8704, "pid": 76337, "tid": -914061504, "ts": 1716454216712369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216713149, "dur": 21, "args": { "External id": 8706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8706, "pid": 5, "tid": 7, "ts": 1716454216713149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216712386, "dur": 7, "args": { "External id": 8706, "cbid": 211, "correlation": 8706 } }, { "ph": "s", "id": 8706, "pid": 76337, "tid": -914061504, "ts": 1716454216712386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216713172, "dur": 35, "args": { "External id": 8712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8712, "pid": 5, "tid": 7, "ts": 1716454216713172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216713094, "dur": 14, "args": { "External id": 8712, "cbid": 211, "correlation": 8712 } }, { "ph": "s", "id": 8712, "pid": 76337, "tid": -914061504, "ts": 1716454216713094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216713186, "dur": 0, "args": { "External id": 8722, "cbid": 317, "correlation": 8722 } }, { "ph": "f", "id": 8722, "pid": 76337, "tid": -914061504, "ts": 1716454216713186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216713187, "dur": 0, "args": { "External id": 8723, "cbid": 203, "correlation": 8723 } }, { "ph": "f", "id": 8723, "pid": 76337, "tid": -914061504, "ts": 1716454216713187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216713188, "dur": 1, "args": { "External id": 8724, "cbid": 205, "correlation": 8724 } }, { "ph": "f", "id": 8724, "pid": 76337, "tid": -914061504, "ts": 1716454216713188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715052, "dur": 4, "args": { "External id": 8728, "cbid": 251, "correlation": 8728 } }, { "ph": "f", "id": 8728, "pid": 76337, "tid": -914061504, "ts": 1716454216715052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715058, "dur": 1, "args": { "External id": 8729, "cbid": 251, "correlation": 8729 } }, { "ph": "f", "id": 8729, "pid": 76337, "tid": -914061504, "ts": 1716454216715058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715060, "dur": 1, "args": { "External id": 8730, "cbid": 251, "correlation": 8730 } }, { "ph": "f", "id": 8730, "pid": 76337, "tid": -914061504, "ts": 1716454216715060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715062, "dur": 1, "args": { "External id": 8731, "cbid": 251, "correlation": 8731 } }, { "ph": "f", "id": 8731, "pid": 76337, "tid": -914061504, "ts": 1716454216715062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715064, "dur": 1, "args": { "External id": 8732, "cbid": 251, "correlation": 8732 } }, { "ph": "f", "id": 8732, "pid": 76337, "tid": -914061504, "ts": 1716454216715064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715066, "dur": 1, "args": { "External id": 8733, "cbid": 251, "correlation": 8733 } }, { "ph": "f", "id": 8733, "pid": 76337, "tid": -914061504, "ts": 1716454216715066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715068, "dur": 1, "args": { "External id": 8734, "cbid": 251, "correlation": 8734 } }, { "ph": "f", "id": 8734, "pid": 76337, "tid": -914061504, "ts": 1716454216715068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715070, "dur": 1, "args": { "External id": 8735, "cbid": 251, "correlation": 8735 } }, { "ph": "f", "id": 8735, "pid": 76337, "tid": -914061504, "ts": 1716454216715070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216715072, "dur": 0, "args": { "External id": 8736, "cbid": 251, "correlation": 8736 } }, { "ph": "f", "id": 8736, "pid": 76337, "tid": -914061504, "ts": 1716454216715072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216715098, "dur": 56, "args": { "External id": 8737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8737, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8737, "pid": 5, "tid": 7, "ts": 1716454216715098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715077, "dur": 22, "args": { "External id": 8737, "cbid": 211, "correlation": 8737 } }, { "ph": "s", "id": 8737, "pid": 76337, "tid": -914061504, "ts": 1716454216715077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216715687, "dur": 34, "args": { "External id": 8743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8743, "pid": 5, "tid": 7, "ts": 1716454216715687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715672, "dur": 14, "args": { "External id": 8743, "cbid": 211, "correlation": 8743 } }, { "ph": "s", "id": 8743, "pid": 76337, "tid": -914061504, "ts": 1716454216715672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216715733, "dur": 27, "args": { "External id": 8751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8751, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8751, "pid": 5, "tid": 7, "ts": 1716454216715733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715722, "dur": 10, "args": { "External id": 8751, "cbid": 211, "correlation": 8751 } }, { "ph": "s", "id": 8751, "pid": 76337, "tid": -914061504, "ts": 1716454216715722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216715787, "dur": 20, "args": { "External id": 8759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8759, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8759, "pid": 5, "tid": 7, "ts": 1716454216715787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715774, "dur": 12, "args": { "External id": 8759, "cbid": 211, "correlation": 8759 } }, { "ph": "s", "id": 8759, "pid": 76337, "tid": -914061504, "ts": 1716454216715774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216715934, "dur": 34, "args": { "External id": 8779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8779, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 8779, "pid": 5, "tid": 7, "ts": 1716454216715934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715920, "dur": 14, "args": { "External id": 8779, "cbid": 211, "correlation": 8779 } }, { "ph": "s", "id": 8779, "pid": 76337, "tid": -914061504, "ts": 1716454216715920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216715969, "dur": 4, "args": { "External id": 8791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8791, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 8791, "pid": 5, "tid": 7, "ts": 1716454216715969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715945, "dur": 8, "args": { "External id": 8791, "cbid": 211, "correlation": 8791 } }, { "ph": "s", "id": 8791, "pid": 76337, "tid": -914061504, "ts": 1716454216715945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216715990, "dur": 32, "args": { "External id": 8794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8794, "pid": 5, "tid": 7, "ts": 1716454216715990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216715972, "dur": 17, "args": { "External id": 8794, "cbid": 211, "correlation": 8794 } }, { "ph": "s", "id": 8794, "pid": 76337, "tid": -914061504, "ts": 1716454216715972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216716059, "dur": 0, "args": { "External id": 8805, "cbid": 317, "correlation": 8805 } }, { "ph": "f", "id": 8805, "pid": 76337, "tid": -914061504, "ts": 1716454216716059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216716060, "dur": 1, "args": { "External id": 8806, "cbid": 203, "correlation": 8806 } }, { "ph": "f", "id": 8806, "pid": 76337, "tid": -914061504, "ts": 1716454216716060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216716062, "dur": 0, "args": { "External id": 8807, "cbid": 205, "correlation": 8807 } }, { "ph": "f", "id": 8807, "pid": 76337, "tid": -914061504, "ts": 1716454216716062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216717878, "dur": 24, "args": { "External id": 8811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8811, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8811, "pid": 5, "tid": 7, "ts": 1716454216717878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216717859, "dur": 19, "args": { "External id": 8811, "cbid": 211, "correlation": 8811 } }, { "ph": "s", "id": 8811, "pid": 76337, "tid": -914061504, "ts": 1716454216717859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216718071, "dur": 122, "args": { "External id": 8813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8813, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8813, "pid": 5, "tid": 7, "ts": 1716454216718071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216717914, "dur": 158, "args": { "External id": 8813, "cbid": 211, "correlation": 8813 } }, { "ph": "s", "id": 8813, "pid": 76337, "tid": -914061504, "ts": 1716454216717914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216718194, "dur": 23, "args": { "External id": 8815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8815, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8815, "pid": 5, "tid": 7, "ts": 1716454216718194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216718079, "dur": 8, "args": { "External id": 8815, "cbid": 211, "correlation": 8815 } }, { "ph": "s", "id": 8815, "pid": 76337, "tid": -914061504, "ts": 1716454216718079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216718608, "dur": 35, "args": { "External id": 8821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8821, "pid": 5, "tid": 7, "ts": 1716454216718608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216718596, "dur": 11, "args": { "External id": 8821, "cbid": 211, "correlation": 8821 } }, { "ph": "s", "id": 8821, "pid": 76337, "tid": -914061504, "ts": 1716454216718596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216718810, "dur": 89, "args": { "External id": 8832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8832, "pid": 5, "tid": 7, "ts": 1716454216718810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216718793, "dur": 18, "args": { "External id": 8832, "cbid": 211, "correlation": 8832 } }, { "ph": "s", "id": 8832, "pid": 76337, "tid": -914061504, "ts": 1716454216718793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216718901, "dur": 40, "args": { "External id": 8854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8854, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8854, "pid": 5, "tid": 7, "ts": 1716454216718901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216718837, "dur": 11, "args": { "External id": 8854, "cbid": 211, "correlation": 8854 } }, { "ph": "s", "id": 8854, "pid": 76337, "tid": -914061504, "ts": 1716454216718837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216719006, "dur": 3, "args": { "External id": 8865, "cbid": 251, "correlation": 8865 } }, { "ph": "f", "id": 8865, "pid": 76337, "tid": -914061504, "ts": 1716454216719006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216719032, "dur": 105, "args": { "External id": 8866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8866, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8866, "pid": 5, "tid": 7, "ts": 1716454216719032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719016, "dur": 16, "args": { "External id": 8866, "cbid": 211, "correlation": 8866 } }, { "ph": "s", "id": 8866, "pid": 76337, "tid": -914061504, "ts": 1716454216719016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216719099, "dur": 1, "args": { "External id": 8877, "cbid": 251, "correlation": 8877 } }, { "ph": "f", "id": 8877, "pid": 76337, "tid": -914061504, "ts": 1716454216719099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216719139, "dur": 98, "args": { "External id": 8878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8878, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8878, "pid": 5, "tid": 7, "ts": 1716454216719139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719103, "dur": 11, "args": { "External id": 8878, "cbid": 211, "correlation": 8878 } }, { "ph": "s", "id": 8878, "pid": 76337, "tid": -914061504, "ts": 1716454216719103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216719187, "dur": 1, "args": { "External id": 8889, "cbid": 251, "correlation": 8889 } }, { "ph": "f", "id": 8889, "pid": 76337, "tid": -914061504, "ts": 1716454216719187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216719238, "dur": 96, "args": { "External id": 8890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8890, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8890, "pid": 5, "tid": 7, "ts": 1716454216719238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719192, "dur": 13, "args": { "External id": 8890, "cbid": 211, "correlation": 8890 } }, { "ph": "s", "id": 8890, "pid": 76337, "tid": -914061504, "ts": 1716454216719192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216719467, "dur": 588, "args": { "External id": 8911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8911, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 8911, "pid": 5, "tid": 7, "ts": 1716454216719467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719314, "dur": 155, "args": { "External id": 8911, "cbid": 211, "correlation": 8911 } }, { "ph": "s", "id": 8911, "pid": 76337, "tid": -914061504, "ts": 1716454216719314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216719610, "dur": 3, "args": { "External id": 8929, "cbid": 251, "correlation": 8929 } }, { "ph": "f", "id": 8929, "pid": 76337, "tid": -914061504, "ts": 1716454216719610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216720056, "dur": 105, "args": { "External id": 8931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8931, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8931, "pid": 5, "tid": 7, "ts": 1716454216720056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719619, "dur": 14, "args": { "External id": 8931, "cbid": 211, "correlation": 8931 } }, { "ph": "s", "id": 8931, "pid": 76337, "tid": -914061504, "ts": 1716454216719619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216720163, "dur": 20, "args": { "External id": 8939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8939, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8939, "pid": 5, "tid": 7, "ts": 1716454216720163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719722, "dur": 16, "args": { "External id": 8939, "cbid": 211, "correlation": 8939 } }, { "ph": "s", "id": 8939, "pid": 76337, "tid": -914061504, "ts": 1716454216719722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216720184, "dur": 94, "args": { "External id": 8947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8947, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8947, "pid": 5, "tid": 7, "ts": 1716454216720184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719770, "dur": 10, "args": { "External id": 8947, "cbid": 211, "correlation": 8947 } }, { "ph": "s", "id": 8947, "pid": 76337, "tid": -914061504, "ts": 1716454216719770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216720279, "dur": 40, "args": { "External id": 8969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8969, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 8969, "pid": 5, "tid": 7, "ts": 1716454216720279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719842, "dur": 12, "args": { "External id": 8969, "cbid": 211, "correlation": 8969 } }, { "ph": "s", "id": 8969, "pid": 76337, "tid": -914061504, "ts": 1716454216719842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216719947, "dur": 1, "args": { "External id": 8980, "cbid": 251, "correlation": 8980 } }, { "ph": "f", "id": 8980, "pid": 76337, "tid": -914061504, "ts": 1716454216719947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216720320, "dur": 89, "args": { "External id": 8981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8981, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 8981, "pid": 5, "tid": 7, "ts": 1716454216720320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216719952, "dur": 14, "args": { "External id": 8981, "cbid": 211, "correlation": 8981 } }, { "ph": "s", "id": 8981, "pid": 76337, "tid": -914061504, "ts": 1716454216719952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720055, "dur": 2, "args": { "External id": 8992, "cbid": 251, "correlation": 8992 } }, { "ph": "f", "id": 8992, "pid": 76337, "tid": -914061504, "ts": 1716454216720055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720061, "dur": 0, "args": { "External id": 8993, "cbid": 251, "correlation": 8993 } }, { "ph": "f", "id": 8993, "pid": 76337, "tid": -914061504, "ts": 1716454216720061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216720411, "dur": 12, "args": { "External id": 8994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8994, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 8994, "pid": 5, "tid": 7, "ts": 1716454216720411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720063, "dur": 15, "args": { "External id": 8994, "cbid": 211, "correlation": 8994 } }, { "ph": "s", "id": 8994, "pid": 76337, "tid": -914061504, "ts": 1716454216720063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216720430, "dur": 4, "args": { "External id": 8996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 8996, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 8996, "pid": 5, "tid": 7, "ts": 1716454216720430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720082, "dur": 72, "args": { "External id": 8996, "cbid": 211, "correlation": 8996 } }, { "ph": "s", "id": 8996, "pid": 76337, "tid": -914061504, "ts": 1716454216720082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720226, "dur": 1, "args": { "External id": 9007, "cbid": 251, "correlation": 9007 } }, { "ph": "f", "id": 9007, "pid": 76337, "tid": -914061504, "ts": 1716454216720226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720230, "dur": 0, "args": { "External id": 9008, "cbid": 251, "correlation": 9008 } }, { "ph": "f", "id": 9008, "pid": 76337, "tid": -914061504, "ts": 1716454216720230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216720436, "dur": 9, "args": { "External id": 9009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9009, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 9009, "pid": 5, "tid": 7, "ts": 1716454216720436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720232, "dur": 13, "args": { "External id": 9009, "cbid": 211, "correlation": 9009 } }, { "ph": "s", "id": 9009, "pid": 76337, "tid": -914061504, "ts": 1716454216720232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216720446, "dur": 3, "args": { "External id": 9011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9011, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 9011, "pid": 5, "tid": 7, "ts": 1716454216720446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720247, "dur": 6, "args": { "External id": 9011, "cbid": 211, "correlation": 9011 } }, { "ph": "s", "id": 9011, "pid": 76337, "tid": -914061504, "ts": 1716454216720247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216720450, "dur": 110, "args": { "External id": 9032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9032, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 9032, "pid": 5, "tid": 7, "ts": 1716454216720450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720324, "dur": 12, "args": { "External id": 9032, "cbid": 211, "correlation": 9032 } }, { "ph": "s", "id": 9032, "pid": 76337, "tid": -914061504, "ts": 1716454216720324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720437, "dur": 1, "args": { "External id": 9050, "cbid": 251, "correlation": 9050 } }, { "ph": "f", "id": 9050, "pid": 76337, "tid": -914061504, "ts": 1716454216720437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216720562, "dur": 106, "args": { "External id": 9052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9052, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9052, "pid": 5, "tid": 7, "ts": 1716454216720562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720443, "dur": 14, "args": { "External id": 9052, "cbid": 211, "correlation": 9052 } }, { "ph": "s", "id": 9052, "pid": 76337, "tid": -914061504, "ts": 1716454216720443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216720670, "dur": 20, "args": { "External id": 9060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9060, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9060, "pid": 5, "tid": 7, "ts": 1716454216720670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720518, "dur": 12, "args": { "External id": 9060, "cbid": 211, "correlation": 9060 } }, { "ph": "s", "id": 9060, "pid": 76337, "tid": -914061504, "ts": 1716454216720518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216720691, "dur": 28, "args": { "External id": 9068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9068, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9068, "pid": 5, "tid": 7, "ts": 1716454216720691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720559, "dur": 9, "args": { "External id": 9068, "cbid": 211, "correlation": 9068 } }, { "ph": "s", "id": 9068, "pid": 76337, "tid": -914061504, "ts": 1716454216720559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216720721, "dur": 40, "args": { "External id": 9090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9090, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9090, "pid": 5, "tid": 7, "ts": 1716454216720721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720611, "dur": 10, "args": { "External id": 9090, "cbid": 211, "correlation": 9090 } }, { "ph": "s", "id": 9090, "pid": 76337, "tid": -914061504, "ts": 1716454216720611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720709, "dur": 2, "args": { "External id": 9106, "cbid": 251, "correlation": 9106 } }, { "ph": "f", "id": 9106, "pid": 76337, "tid": -914061504, "ts": 1716454216720709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720715, "dur": 0, "args": { "External id": 9108, "cbid": 251, "correlation": 9108 } }, { "ph": "f", "id": 9108, "pid": 76337, "tid": -914061504, "ts": 1716454216720715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216720763, "dur": 590, "args": { "External id": 9109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9109, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 9109, "pid": 5, "tid": 7, "ts": 1716454216720763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720721, "dur": 13, "args": { "External id": 9109, "cbid": 211, "correlation": 9109 } }, { "ph": "s", "id": 9109, "pid": 76337, "tid": -914061504, "ts": 1716454216720721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216721354, "dur": 134, "args": { "External id": 9117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9117, "pid": 5, "tid": 7, "ts": 1716454216721354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720819, "dur": 15, "args": { "External id": 9117, "cbid": 211, "correlation": 9117 } }, { "ph": "s", "id": 9117, "pid": 76337, "tid": -914061504, "ts": 1716454216720819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216721489, "dur": 131, "args": { "External id": 9125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9125, "pid": 5, "tid": 7, "ts": 1716454216721489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720862, "dur": 10, "args": { "External id": 9125, "cbid": 211, "correlation": 9125 } }, { "ph": "s", "id": 9125, "pid": 76337, "tid": -914061504, "ts": 1716454216720862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216720945, "dur": 1, "args": { "External id": 9141, "cbid": 251, "correlation": 9141 } }, { "ph": "f", "id": 9141, "pid": 76337, "tid": -914061504, "ts": 1716454216720945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216721622, "dur": 327, "args": { "External id": 9143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9143, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9143, "pid": 5, "tid": 7, "ts": 1716454216721622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216720951, "dur": 13, "args": { "External id": 9143, "cbid": 211, "correlation": 9143 } }, { "ph": "s", "id": 9143, "pid": 76337, "tid": -914061504, "ts": 1716454216720951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216724405, "dur": 27, "args": { "External id": 9151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9151, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9151, "pid": 5, "tid": 7, "ts": 1716454216724405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724380, "dur": 25, "args": { "External id": 9151, "cbid": 211, "correlation": 9151 } }, { "ph": "s", "id": 9151, "pid": 76337, "tid": -914061504, "ts": 1716454216724380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216724527, "dur": 90, "args": { "External id": 9162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9162, "pid": 5, "tid": 7, "ts": 1716454216724527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724510, "dur": 16, "args": { "External id": 9162, "cbid": 211, "correlation": 9162 } }, { "ph": "s", "id": 9162, "pid": 76337, "tid": -914061504, "ts": 1716454216724510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216724639, "dur": 1, "args": { "External id": 9174, "cbid": 317, "correlation": 9174 } }, { "ph": "f", "id": 9174, "pid": 76337, "tid": -914061504, "ts": 1716454216724639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216724641, "dur": 1, "args": { "External id": 9175, "cbid": 203, "correlation": 9175 } }, { "ph": "f", "id": 9175, "pid": 76337, "tid": -914061504, "ts": 1716454216724641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216724643, "dur": 1, "args": { "External id": 9176, "cbid": 205, "correlation": 9176 } }, { "ph": "f", "id": 9176, "pid": 76337, "tid": -914061504, "ts": 1716454216724643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216724700, "dur": 24, "args": { "External id": 9180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9180, "pid": 5, "tid": 7, "ts": 1716454216724700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724684, "dur": 16, "args": { "External id": 9180, "cbid": 211, "correlation": 9180 } }, { "ph": "s", "id": 9180, "pid": 76337, "tid": -914061504, "ts": 1716454216724684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216724732, "dur": 131, "args": { "External id": 9182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9182, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9182, "pid": 5, "tid": 7, "ts": 1716454216724732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724719, "dur": 12, "args": { "External id": 9182, "cbid": 211, "correlation": 9182 } }, { "ph": "s", "id": 9182, "pid": 76337, "tid": -914061504, "ts": 1716454216724719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216724864, "dur": 24, "args": { "External id": 9184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9184, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9184, "pid": 5, "tid": 7, "ts": 1716454216724864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724739, "dur": 21, "args": { "External id": 9184, "cbid": 211, "correlation": 9184 } }, { "ph": "s", "id": 9184, "pid": 76337, "tid": -914061504, "ts": 1716454216724739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216724890, "dur": 35, "args": { "External id": 9190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9190, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9190, "pid": 5, "tid": 7, "ts": 1716454216724890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724790, "dur": 9, "args": { "External id": 9190, "cbid": 211, "correlation": 9190 } }, { "ph": "s", "id": 9190, "pid": 76337, "tid": -914061504, "ts": 1716454216724790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216724926, "dur": 27, "args": { "External id": 9198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9198, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9198, "pid": 5, "tid": 7, "ts": 1716454216724926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724824, "dur": 8, "args": { "External id": 9198, "cbid": 211, "correlation": 9198 } }, { "ph": "s", "id": 9198, "pid": 76337, "tid": -914061504, "ts": 1716454216724824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216724986, "dur": 32, "args": { "External id": 9218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9218, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 9218, "pid": 5, "tid": 7, "ts": 1716454216724986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216724962, "dur": 24, "args": { "External id": 9218, "cbid": 211, "correlation": 9218 } }, { "ph": "s", "id": 9218, "pid": 76337, "tid": -914061504, "ts": 1716454216724962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216725020, "dur": 5, "args": { "External id": 9230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9230, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 9230, "pid": 5, "tid": 7, "ts": 1716454216725020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725005, "dur": 9, "args": { "External id": 9230, "cbid": 211, "correlation": 9230 } }, { "ph": "s", "id": 9230, "pid": 76337, "tid": -914061504, "ts": 1716454216725005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216725040, "dur": 33, "args": { "External id": 9233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9233, "pid": 5, "tid": 7, "ts": 1716454216725040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725030, "dur": 8, "args": { "External id": 9233, "cbid": 211, "correlation": 9233 } }, { "ph": "s", "id": 9233, "pid": 76337, "tid": -914061504, "ts": 1716454216725030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216725109, "dur": 23, "args": { "External id": 9242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9242, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9242, "pid": 5, "tid": 7, "ts": 1716454216725109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725089, "dur": 29, "args": { "External id": 9242, "cbid": 211, "correlation": 9242 } }, { "ph": "s", "id": 9242, "pid": 76337, "tid": -914061504, "ts": 1716454216725089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216725172, "dur": 0, "args": { "External id": 9252, "cbid": 317, "correlation": 9252 } }, { "ph": "f", "id": 9252, "pid": 76337, "tid": -914061504, "ts": 1716454216725172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216725173, "dur": 0, "args": { "External id": 9253, "cbid": 203, "correlation": 9253 } }, { "ph": "f", "id": 9253, "pid": 76337, "tid": -914061504, "ts": 1716454216725173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216725173, "dur": 0, "args": { "External id": 9254, "cbid": 205, "correlation": 9254 } }, { "ph": "f", "id": 9254, "pid": 76337, "tid": -914061504, "ts": 1716454216725173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216725210, "dur": 22, "args": { "External id": 9258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9258, "pid": 5, "tid": 7, "ts": 1716454216725210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725196, "dur": 13, "args": { "External id": 9258, "cbid": 211, "correlation": 9258 } }, { "ph": "s", "id": 9258, "pid": 76337, "tid": -914061504, "ts": 1716454216725196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216725233, "dur": 48, "args": { "External id": 9260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9260, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9260, "pid": 5, "tid": 7, "ts": 1716454216725233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725212, "dur": 6, "args": { "External id": 9260, "cbid": 211, "correlation": 9260 } }, { "ph": "s", "id": 9260, "pid": 76337, "tid": -914061504, "ts": 1716454216725212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216725283, "dur": 719, "args": { "External id": 9262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9262, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9262, "pid": 5, "tid": 7, "ts": 1716454216725283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725226, "dur": 10, "args": { "External id": 9262, "cbid": 211, "correlation": 9262 } }, { "ph": "s", "id": 9262, "pid": 76337, "tid": -914061504, "ts": 1716454216725226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216726003, "dur": 22, "args": { "External id": 9264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9264, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9264, "pid": 5, "tid": 7, "ts": 1716454216726003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725240, "dur": 5, "args": { "External id": 9264, "cbid": 211, "correlation": 9264 } }, { "ph": "s", "id": 9264, "pid": 76337, "tid": -914061504, "ts": 1716454216725240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216726027, "dur": 35, "args": { "External id": 9270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9270, "pid": 5, "tid": 7, "ts": 1716454216726027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725277, "dur": 10, "args": { "External id": 9270, "cbid": 211, "correlation": 9270 } }, { "ph": "s", "id": 9270, "pid": 76337, "tid": -914061504, "ts": 1716454216725277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216726063, "dur": 4, "args": { "External id": 9278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9278, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 9278, "pid": 5, "tid": 7, "ts": 1716454216726063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725326, "dur": 10, "args": { "External id": 9278, "cbid": 211, "correlation": 9278 } }, { "ph": "s", "id": 9278, "pid": 76337, "tid": -914061504, "ts": 1716454216725326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216725441, "dur": 3, "args": { "External id": 9294, "cbid": 251, "correlation": 9294 } }, { "ph": "f", "id": 9294, "pid": 76337, "tid": -914061504, "ts": 1716454216725441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216725448, "dur": 0, "args": { "External id": 9296, "cbid": 251, "correlation": 9296 } }, { "ph": "f", "id": 9296, "pid": 76337, "tid": -914061504, "ts": 1716454216725448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216726068, "dur": 14, "args": { "External id": 9297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9297, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 9297, "pid": 5, "tid": 7, "ts": 1716454216726068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725451, "dur": 17, "args": { "External id": 9297, "cbid": 211, "correlation": 9297 } }, { "ph": "s", "id": 9297, "pid": 76337, "tid": -914061504, "ts": 1716454216725451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216726083, "dur": 5, "args": { "External id": 9299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9299, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 9299, "pid": 5, "tid": 7, "ts": 1716454216726083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725473, "dur": 8, "args": { "External id": 9299, "cbid": 211, "correlation": 9299 } }, { "ph": "s", "id": 9299, "pid": 76337, "tid": -914061504, "ts": 1716454216725473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216726090, "dur": 31, "args": { "External id": 9309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9309, "pid": 5, "tid": 7, "ts": 1716454216726090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725552, "dur": 12, "args": { "External id": 9309, "cbid": 211, "correlation": 9309 } }, { "ph": "s", "id": 9309, "pid": 76337, "tid": -914061504, "ts": 1716454216725552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216726123, "dur": 32, "args": { "External id": 9329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9329, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 9329, "pid": 5, "tid": 7, "ts": 1716454216726123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725634, "dur": 12, "args": { "External id": 9329, "cbid": 211, "correlation": 9329 } }, { "ph": "s", "id": 9329, "pid": 76337, "tid": -914061504, "ts": 1716454216725634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216726157, "dur": 4, "args": { "External id": 9341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9341, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 9341, "pid": 5, "tid": 7, "ts": 1716454216726157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725662, "dur": 8, "args": { "External id": 9341, "cbid": 211, "correlation": 9341 } }, { "ph": "s", "id": 9341, "pid": 76337, "tid": -914061504, "ts": 1716454216725662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216726162, "dur": 32, "args": { "External id": 9344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9344, "pid": 5, "tid": 7, "ts": 1716454216726162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725684, "dur": 7, "args": { "External id": 9344, "cbid": 211, "correlation": 9344 } }, { "ph": "s", "id": 9344, "pid": 76337, "tid": -914061504, "ts": 1716454216725684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216726195, "dur": 23, "args": { "External id": 9353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9353, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9353, "pid": 5, "tid": 7, "ts": 1716454216726195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725726, "dur": 10, "args": { "External id": 9353, "cbid": 211, "correlation": 9353 } }, { "ph": "s", "id": 9353, "pid": 76337, "tid": -914061504, "ts": 1716454216725726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216725806, "dur": 0, "args": { "External id": 9363, "cbid": 317, "correlation": 9363 } }, { "ph": "f", "id": 9363, "pid": 76337, "tid": -914061504, "ts": 1716454216725806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216725807, "dur": 0, "args": { "External id": 9364, "cbid": 203, "correlation": 9364 } }, { "ph": "f", "id": 9364, "pid": 76337, "tid": -914061504, "ts": 1716454216725807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216725808, "dur": 0, "args": { "External id": 9365, "cbid": 205, "correlation": 9365 } }, { "ph": "f", "id": 9365, "pid": 76337, "tid": -914061504, "ts": 1716454216725808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216726219, "dur": 24, "args": { "External id": 9369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9369, "pid": 5, "tid": 7, "ts": 1716454216726219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725821, "dur": 13, "args": { "External id": 9369, "cbid": 211, "correlation": 9369 } }, { "ph": "s", "id": 9369, "pid": 76337, "tid": -914061504, "ts": 1716454216725821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216726245, "dur": 48, "args": { "External id": 9371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9371, "pid": 5, "tid": 7, "ts": 1716454216726245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725836, "dur": 6, "args": { "External id": 9371, "cbid": 211, "correlation": 9371 } }, { "ph": "s", "id": 9371, "pid": 76337, "tid": -914061504, "ts": 1716454216725836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216726295, "dur": 709, "args": { "External id": 9373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9373, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9373, "pid": 5, "tid": 7, "ts": 1716454216726295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725848, "dur": 6, "args": { "External id": 9373, "cbid": 211, "correlation": 9373 } }, { "ph": "s", "id": 9373, "pid": 76337, "tid": -914061504, "ts": 1716454216725848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216727006, "dur": 22, "args": { "External id": 9375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9375, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9375, "pid": 5, "tid": 7, "ts": 1716454216727006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725858, "dur": 5, "args": { "External id": 9375, "cbid": 211, "correlation": 9375 } }, { "ph": "s", "id": 9375, "pid": 76337, "tid": -914061504, "ts": 1716454216725858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216727029, "dur": 35, "args": { "External id": 9381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9381, "pid": 5, "tid": 7, "ts": 1716454216727029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725886, "dur": 9, "args": { "External id": 9381, "cbid": 211, "correlation": 9381 } }, { "ph": "s", "id": 9381, "pid": 76337, "tid": -914061504, "ts": 1716454216725886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216727065, "dur": 27, "args": { "External id": 9389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9389, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9389, "pid": 5, "tid": 7, "ts": 1716454216727065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725920, "dur": 8, "args": { "External id": 9389, "cbid": 211, "correlation": 9389 } }, { "ph": "s", "id": 9389, "pid": 76337, "tid": -914061504, "ts": 1716454216725920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216727094, "dur": 20, "args": { "External id": 9397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9397, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9397, "pid": 5, "tid": 7, "ts": 1716454216727094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216725962, "dur": 10, "args": { "External id": 9397, "cbid": 211, "correlation": 9397 } }, { "ph": "s", "id": 9397, "pid": 76337, "tid": -914061504, "ts": 1716454216725962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216727115, "dur": 32, "args": { "External id": 9417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9417, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 9417, "pid": 5, "tid": 7, "ts": 1716454216727115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726057, "dur": 13, "args": { "External id": 9417, "cbid": 211, "correlation": 9417 } }, { "ph": "s", "id": 9417, "pid": 76337, "tid": -914061504, "ts": 1716454216726057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216727149, "dur": 5, "args": { "External id": 9429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9429, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 9429, "pid": 5, "tid": 7, "ts": 1716454216727149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726091, "dur": 8, "args": { "External id": 9429, "cbid": 211, "correlation": 9429 } }, { "ph": "s", "id": 9429, "pid": 76337, "tid": -914061504, "ts": 1716454216726091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216727155, "dur": 32, "args": { "External id": 9432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9432, "pid": 5, "tid": 7, "ts": 1716454216727155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726112, "dur": 7, "args": { "External id": 9432, "cbid": 211, "correlation": 9432 } }, { "ph": "s", "id": 9432, "pid": 76337, "tid": -914061504, "ts": 1716454216726112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216726180, "dur": 0, "args": { "External id": 9443, "cbid": 317, "correlation": 9443 } }, { "ph": "f", "id": 9443, "pid": 76337, "tid": -914061504, "ts": 1716454216726180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216726181, "dur": 0, "args": { "External id": 9444, "cbid": 203, "correlation": 9444 } }, { "ph": "f", "id": 9444, "pid": 76337, "tid": -914061504, "ts": 1716454216726181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216726182, "dur": 0, "args": { "External id": 9445, "cbid": 205, "correlation": 9445 } }, { "ph": "f", "id": 9445, "pid": 76337, "tid": -914061504, "ts": 1716454216726182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216727189, "dur": 23, "args": { "External id": 9449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9449, "pid": 5, "tid": 7, "ts": 1716454216727189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726195, "dur": 13, "args": { "External id": 9449, "cbid": 211, "correlation": 9449 } }, { "ph": "s", "id": 9449, "pid": 76337, "tid": -914061504, "ts": 1716454216726195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216727213, "dur": 129, "args": { "External id": 9451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9451, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9451, "pid": 5, "tid": 7, "ts": 1716454216727213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726215, "dur": 6, "args": { "External id": 9451, "cbid": 211, "correlation": 9451 } }, { "ph": "s", "id": 9451, "pid": 76337, "tid": -914061504, "ts": 1716454216726215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216727343, "dur": 22, "args": { "External id": 9453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9453, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9453, "pid": 5, "tid": 7, "ts": 1716454216727343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726225, "dur": 5, "args": { "External id": 9453, "cbid": 211, "correlation": 9453 } }, { "ph": "s", "id": 9453, "pid": 76337, "tid": -914061504, "ts": 1716454216726225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216727367, "dur": 35, "args": { "External id": 9459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9459, "pid": 5, "tid": 7, "ts": 1716454216727367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726253, "dur": 9, "args": { "External id": 9459, "cbid": 211, "correlation": 9459 } }, { "ph": "s", "id": 9459, "pid": 76337, "tid": -914061504, "ts": 1716454216726253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216727404, "dur": 88, "args": { "External id": 9470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9470, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9470, "pid": 5, "tid": 7, "ts": 1716454216727404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726365, "dur": 14, "args": { "External id": 9470, "cbid": 211, "correlation": 9470 } }, { "ph": "s", "id": 9470, "pid": 76337, "tid": -914061504, "ts": 1716454216726365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216727493, "dur": 39, "args": { "External id": 9492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9492, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9492, "pid": 5, "tid": 7, "ts": 1716454216727493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726399, "dur": 10, "args": { "External id": 9492, "cbid": 211, "correlation": 9492 } }, { "ph": "s", "id": 9492, "pid": 76337, "tid": -914061504, "ts": 1716454216726399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216726512, "dur": 2, "args": { "External id": 9503, "cbid": 251, "correlation": 9503 } }, { "ph": "f", "id": 9503, "pid": 76337, "tid": -914061504, "ts": 1716454216726512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216727534, "dur": 105, "args": { "External id": 9504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9504, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9504, "pid": 5, "tid": 7, "ts": 1716454216727534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726519, "dur": 19, "args": { "External id": 9504, "cbid": 211, "correlation": 9504 } }, { "ph": "s", "id": 9504, "pid": 76337, "tid": -914061504, "ts": 1716454216726519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216726613, "dur": 1, "args": { "External id": 9515, "cbid": 251, "correlation": 9515 } }, { "ph": "f", "id": 9515, "pid": 76337, "tid": -914061504, "ts": 1716454216726613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216727641, "dur": 98, "args": { "External id": 9516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9516, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9516, "pid": 5, "tid": 7, "ts": 1716454216727641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726618, "dur": 12, "args": { "External id": 9516, "cbid": 211, "correlation": 9516 } }, { "ph": "s", "id": 9516, "pid": 76337, "tid": -914061504, "ts": 1716454216726618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216726698, "dur": 1, "args": { "External id": 9527, "cbid": 251, "correlation": 9527 } }, { "ph": "f", "id": 9527, "pid": 76337, "tid": -914061504, "ts": 1716454216726698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216727740, "dur": 97, "args": { "External id": 9528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9528, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9528, "pid": 5, "tid": 7, "ts": 1716454216727740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726702, "dur": 13, "args": { "External id": 9528, "cbid": 211, "correlation": 9528 } }, { "ph": "s", "id": 9528, "pid": 76337, "tid": -914061504, "ts": 1716454216726702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216727838, "dur": 610, "args": { "External id": 9549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9549, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 9549, "pid": 5, "tid": 7, "ts": 1716454216727838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726805, "dur": 17, "args": { "External id": 9549, "cbid": 211, "correlation": 9549 } }, { "ph": "s", "id": 9549, "pid": 76337, "tid": -914061504, "ts": 1716454216726805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216726922, "dur": 14, "args": { "External id": 9567, "cbid": 251, "correlation": 9567 } }, { "ph": "f", "id": 9567, "pid": 76337, "tid": -914061504, "ts": 1716454216726922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216728450, "dur": 105, "args": { "External id": 9569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9569, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9569, "pid": 5, "tid": 7, "ts": 1716454216728450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216726942, "dur": 14, "args": { "External id": 9569, "cbid": 211, "correlation": 9569 } }, { "ph": "s", "id": 9569, "pid": 76337, "tid": -914061504, "ts": 1716454216726942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216728556, "dur": 19, "args": { "External id": 9577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9577, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9577, "pid": 5, "tid": 7, "ts": 1716454216728556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727026, "dur": 13, "args": { "External id": 9577, "cbid": 211, "correlation": 9577 } }, { "ph": "s", "id": 9577, "pid": 76337, "tid": -914061504, "ts": 1716454216727026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216728577, "dur": 93, "args": { "External id": 9585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9585, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9585, "pid": 5, "tid": 7, "ts": 1716454216728577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727069, "dur": 9, "args": { "External id": 9585, "cbid": 211, "correlation": 9585 } }, { "ph": "s", "id": 9585, "pid": 76337, "tid": -914061504, "ts": 1716454216727069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216728671, "dur": 37, "args": { "External id": 9607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9607, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9607, "pid": 5, "tid": 7, "ts": 1716454216728671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727123, "dur": 10, "args": { "External id": 9607, "cbid": 211, "correlation": 9607 } }, { "ph": "s", "id": 9607, "pid": 76337, "tid": -914061504, "ts": 1716454216727123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727208, "dur": 1, "args": { "External id": 9618, "cbid": 251, "correlation": 9618 } }, { "ph": "f", "id": 9618, "pid": 76337, "tid": -914061504, "ts": 1716454216727208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216728710, "dur": 95, "args": { "External id": 9619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9619, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9619, "pid": 5, "tid": 7, "ts": 1716454216728710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727213, "dur": 15, "args": { "External id": 9619, "cbid": 211, "correlation": 9619 } }, { "ph": "s", "id": 9619, "pid": 76337, "tid": -914061504, "ts": 1716454216727213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727293, "dur": 1, "args": { "External id": 9630, "cbid": 251, "correlation": 9630 } }, { "ph": "f", "id": 9630, "pid": 76337, "tid": -914061504, "ts": 1716454216727293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727297, "dur": 0, "args": { "External id": 9631, "cbid": 251, "correlation": 9631 } }, { "ph": "f", "id": 9631, "pid": 76337, "tid": -914061504, "ts": 1716454216727297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216728807, "dur": 12, "args": { "External id": 9632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9632, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 9632, "pid": 5, "tid": 7, "ts": 1716454216728807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727299, "dur": 12, "args": { "External id": 9632, "cbid": 211, "correlation": 9632 } }, { "ph": "s", "id": 9632, "pid": 76337, "tid": -914061504, "ts": 1716454216727299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216728820, "dur": 5, "args": { "External id": 9634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9634, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 9634, "pid": 5, "tid": 7, "ts": 1716454216728820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727314, "dur": 7, "args": { "External id": 9634, "cbid": 211, "correlation": 9634 } }, { "ph": "s", "id": 9634, "pid": 76337, "tid": -914061504, "ts": 1716454216727314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727388, "dur": 1, "args": { "External id": 9645, "cbid": 251, "correlation": 9645 } }, { "ph": "f", "id": 9645, "pid": 76337, "tid": -914061504, "ts": 1716454216727388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727392, "dur": 0, "args": { "External id": 9646, "cbid": 251, "correlation": 9646 } }, { "ph": "f", "id": 9646, "pid": 76337, "tid": -914061504, "ts": 1716454216727392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216728826, "dur": 8, "args": { "External id": 9647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9647, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 9647, "pid": 5, "tid": 7, "ts": 1716454216728826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727394, "dur": 12, "args": { "External id": 9647, "cbid": 211, "correlation": 9647 } }, { "ph": "s", "id": 9647, "pid": 76337, "tid": -914061504, "ts": 1716454216727394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216728835, "dur": 3, "args": { "External id": 9649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9649, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 9649, "pid": 5, "tid": 7, "ts": 1716454216728835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727408, "dur": 6, "args": { "External id": 9649, "cbid": 211, "correlation": 9649 } }, { "ph": "s", "id": 9649, "pid": 76337, "tid": -914061504, "ts": 1716454216727408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216728840, "dur": 99, "args": { "External id": 9670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9670, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 9670, "pid": 5, "tid": 7, "ts": 1716454216728840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727483, "dur": 13, "args": { "External id": 9670, "cbid": 211, "correlation": 9670 } }, { "ph": "s", "id": 9670, "pid": 76337, "tid": -914061504, "ts": 1716454216727483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727599, "dur": 1, "args": { "External id": 9688, "cbid": 251, "correlation": 9688 } }, { "ph": "f", "id": 9688, "pid": 76337, "tid": -914061504, "ts": 1716454216727599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216728941, "dur": 100, "args": { "External id": 9690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9690, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9690, "pid": 5, "tid": 7, "ts": 1716454216728941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727605, "dur": 14, "args": { "External id": 9690, "cbid": 211, "correlation": 9690 } }, { "ph": "s", "id": 9690, "pid": 76337, "tid": -914061504, "ts": 1716454216727605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216729042, "dur": 19, "args": { "External id": 9698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9698, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9698, "pid": 5, "tid": 7, "ts": 1716454216729042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727677, "dur": 13, "args": { "External id": 9698, "cbid": 211, "correlation": 9698 } }, { "ph": "s", "id": 9698, "pid": 76337, "tid": -914061504, "ts": 1716454216727677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216729063, "dur": 28, "args": { "External id": 9706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9706, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9706, "pid": 5, "tid": 7, "ts": 1716454216729063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727717, "dur": 8, "args": { "External id": 9706, "cbid": 211, "correlation": 9706 } }, { "ph": "s", "id": 9706, "pid": 76337, "tid": -914061504, "ts": 1716454216727717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216729092, "dur": 38, "args": { "External id": 9728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9728, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9728, "pid": 5, "tid": 7, "ts": 1716454216729092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727767, "dur": 10, "args": { "External id": 9728, "cbid": 211, "correlation": 9728 } }, { "ph": "s", "id": 9728, "pid": 76337, "tid": -914061504, "ts": 1716454216727767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727860, "dur": 2, "args": { "External id": 9744, "cbid": 251, "correlation": 9744 } }, { "ph": "f", "id": 9744, "pid": 76337, "tid": -914061504, "ts": 1716454216727860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216727865, "dur": 0, "args": { "External id": 9746, "cbid": 251, "correlation": 9746 } }, { "ph": "f", "id": 9746, "pid": 76337, "tid": -914061504, "ts": 1716454216727865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216729132, "dur": 589, "args": { "External id": 9747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9747, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 9747, "pid": 5, "tid": 7, "ts": 1716454216729132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727871, "dur": 14, "args": { "External id": 9747, "cbid": 211, "correlation": 9747 } }, { "ph": "s", "id": 9747, "pid": 76337, "tid": -914061504, "ts": 1716454216727871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216729722, "dur": 133, "args": { "External id": 9755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9755, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9755, "pid": 5, "tid": 7, "ts": 1716454216729722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216727967, "dur": 28, "args": { "External id": 9755, "cbid": 211, "correlation": 9755 } }, { "ph": "s", "id": 9755, "pid": 76337, "tid": -914061504, "ts": 1716454216727967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216729856, "dur": 133, "args": { "External id": 9763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9763, "pid": 5, "tid": 7, "ts": 1716454216729856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728021, "dur": 13, "args": { "External id": 9763, "cbid": 211, "correlation": 9763 } }, { "ph": "s", "id": 9763, "pid": 76337, "tid": -914061504, "ts": 1716454216728021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216728107, "dur": 2, "args": { "External id": 9779, "cbid": 251, "correlation": 9779 } }, { "ph": "f", "id": 9779, "pid": 76337, "tid": -914061504, "ts": 1716454216728107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216729991, "dur": 318, "args": { "External id": 9781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9781, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9781, "pid": 5, "tid": 7, "ts": 1716454216729991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728113, "dur": 12, "args": { "External id": 9781, "cbid": 211, "correlation": 9781 } }, { "ph": "s", "id": 9781, "pid": 76337, "tid": -914061504, "ts": 1716454216728113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216730310, "dur": 27, "args": { "External id": 9789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9789, "pid": 5, "tid": 7, "ts": 1716454216730310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728156, "dur": 10, "args": { "External id": 9789, "cbid": 211, "correlation": 9789 } }, { "ph": "s", "id": 9789, "pid": 76337, "tid": -914061504, "ts": 1716454216728156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216730337, "dur": 82, "args": { "External id": 9800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9800, "pid": 5, "tid": 7, "ts": 1716454216730337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728219, "dur": 13, "args": { "External id": 9800, "cbid": 211, "correlation": 9800 } }, { "ph": "s", "id": 9800, "pid": 76337, "tid": -914061504, "ts": 1716454216728219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216728286, "dur": 0, "args": { "External id": 9812, "cbid": 317, "correlation": 9812 } }, { "ph": "f", "id": 9812, "pid": 76337, "tid": -914061504, "ts": 1716454216728286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216728287, "dur": 0, "args": { "External id": 9813, "cbid": 203, "correlation": 9813 } }, { "ph": "f", "id": 9813, "pid": 76337, "tid": -914061504, "ts": 1716454216728287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216728288, "dur": 0, "args": { "External id": 9814, "cbid": 205, "correlation": 9814 } }, { "ph": "f", "id": 9814, "pid": 76337, "tid": -914061504, "ts": 1716454216728288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216730421, "dur": 22, "args": { "External id": 9818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9818, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9818, "pid": 5, "tid": 7, "ts": 1716454216730421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728302, "dur": 12, "args": { "External id": 9818, "cbid": 211, "correlation": 9818 } }, { "ph": "s", "id": 9818, "pid": 76337, "tid": -914061504, "ts": 1716454216728302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216730445, "dur": 121, "args": { "External id": 9820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9820, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9820, "pid": 5, "tid": 7, "ts": 1716454216730445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728321, "dur": 6, "args": { "External id": 9820, "cbid": 211, "correlation": 9820 } }, { "ph": "s", "id": 9820, "pid": 76337, "tid": -914061504, "ts": 1716454216728321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216730567, "dur": 22, "args": { "External id": 9822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9822, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9822, "pid": 5, "tid": 7, "ts": 1716454216730567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728331, "dur": 5, "args": { "External id": 9822, "cbid": 211, "correlation": 9822 } }, { "ph": "s", "id": 9822, "pid": 76337, "tid": -914061504, "ts": 1716454216728331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216730590, "dur": 33, "args": { "External id": 9828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9828, "pid": 5, "tid": 7, "ts": 1716454216730590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728358, "dur": 8, "args": { "External id": 9828, "cbid": 211, "correlation": 9828 } }, { "ph": "s", "id": 9828, "pid": 76337, "tid": -914061504, "ts": 1716454216728358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216730625, "dur": 26, "args": { "External id": 9836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9836, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9836, "pid": 5, "tid": 7, "ts": 1716454216730625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216728389, "dur": 8, "args": { "External id": 9836, "cbid": 211, "correlation": 9836 } }, { "ph": "s", "id": 9836, "pid": 76337, "tid": -914061504, "ts": 1716454216728389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216728478, "dur": 0, "args": { "External id": 9846, "cbid": 317, "correlation": 9846 } }, { "ph": "f", "id": 9846, "pid": 76337, "tid": -914061504, "ts": 1716454216728478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216728479, "dur": 0, "args": { "External id": 9847, "cbid": 203, "correlation": 9847 } }, { "ph": "f", "id": 9847, "pid": 76337, "tid": -914061504, "ts": 1716454216728479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216728480, "dur": 0, "args": { "External id": 9848, "cbid": 205, "correlation": 9848 } }, { "ph": "f", "id": 9848, "pid": 76337, "tid": -914061504, "ts": 1716454216728480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216731383, "dur": 23, "args": { "External id": 9852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9852, "pid": 5, "tid": 7, "ts": 1716454216731383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216731357, "dur": 27, "args": { "External id": 9852, "cbid": 211, "correlation": 9852 } }, { "ph": "s", "id": 9852, "pid": 76337, "tid": -914061504, "ts": 1716454216731357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216731407, "dur": 45, "args": { "External id": 9854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9854, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9854, "pid": 5, "tid": 7, "ts": 1716454216731407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216731387, "dur": 6, "args": { "External id": 9854, "cbid": 211, "correlation": 9854 } }, { "ph": "s", "id": 9854, "pid": 76337, "tid": -914061504, "ts": 1716454216731387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216731453, "dur": 236, "args": { "External id": 9856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9856, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 9856, "pid": 5, "tid": 7, "ts": 1716454216731453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216731403, "dur": 10, "args": { "External id": 9856, "cbid": 211, "correlation": 9856 } }, { "ph": "s", "id": 9856, "pid": 76337, "tid": -914061504, "ts": 1716454216731403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216731690, "dur": 7, "args": { "External id": 9858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9858, "pid": 5, "tid": 7, "ts": 1716454216731690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216731418, "dur": 7, "args": { "External id": 9858, "cbid": 211, "correlation": 9858 } }, { "ph": "s", "id": 9858, "pid": 76337, "tid": -914061504, "ts": 1716454216731418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216732158, "dur": 9, "args": { "External id": 9864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9864, "pid": 5, "tid": 7, "ts": 1716454216732158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216732145, "dur": 12, "args": { "External id": 9864, "cbid": 211, "correlation": 9864 } }, { "ph": "s", "id": 9864, "pid": 76337, "tid": -914061504, "ts": 1716454216732145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216732325, "dur": 12, "args": { "External id": 9884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9884, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 9884, "pid": 5, "tid": 7, "ts": 1716454216732325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216732310, "dur": 14, "args": { "External id": 9884, "cbid": 211, "correlation": 9884 } }, { "ph": "s", "id": 9884, "pid": 76337, "tid": -914061504, "ts": 1716454216732310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216732345, "dur": 4, "args": { "External id": 9896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9896, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 9896, "pid": 5, "tid": 7, "ts": 1716454216732345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216732336, "dur": 8, "args": { "External id": 9896, "cbid": 211, "correlation": 9896 } }, { "ph": "s", "id": 9896, "pid": 76337, "tid": -914061504, "ts": 1716454216732336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216732372, "dur": 13, "args": { "External id": 9899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9899, "pid": 5, "tid": 7, "ts": 1716454216732372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216732364, "dur": 7, "args": { "External id": 9899, "cbid": 211, "correlation": 9899 } }, { "ph": "s", "id": 9899, "pid": 76337, "tid": -914061504, "ts": 1716454216732364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216732428, "dur": 7, "args": { "External id": 9908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9908, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9908, "pid": 5, "tid": 7, "ts": 1716454216732428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216732416, "dur": 10, "args": { "External id": 9908, "cbid": 211, "correlation": 9908 } }, { "ph": "s", "id": 9908, "pid": 76337, "tid": -914061504, "ts": 1716454216732416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216732520, "dur": 0, "args": { "External id": 9918, "cbid": 317, "correlation": 9918 } }, { "ph": "f", "id": 9918, "pid": 76337, "tid": -914061504, "ts": 1716454216732520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216732521, "dur": 1, "args": { "External id": 9919, "cbid": 203, "correlation": 9919 } }, { "ph": "f", "id": 9919, "pid": 76337, "tid": -914061504, "ts": 1716454216732521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216732523, "dur": 1, "args": { "External id": 9920, "cbid": 205, "correlation": 9920 } }, { "ph": "f", "id": 9920, "pid": 76337, "tid": -914061504, "ts": 1716454216732523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216735326, "dur": 6, "args": { "External id": 9924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9924, "pid": 5, "tid": 7, "ts": 1716454216735326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216735303, "dur": 23, "args": { "External id": 9924, "cbid": 211, "correlation": 9924 } }, { "ph": "s", "id": 9924, "pid": 76337, "tid": -914061504, "ts": 1716454216735303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216735338, "dur": 84, "args": { "External id": 9926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9926, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9926, "pid": 5, "tid": 7, "ts": 1716454216735338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216735329, "dur": 7, "args": { "External id": 9926, "cbid": 211, "correlation": 9926 } }, { "ph": "s", "id": 9926, "pid": 76337, "tid": -914061504, "ts": 1716454216735329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216735425, "dur": 1, "args": { "External id": 9928, "device": 5, "context": 1, "stream": 7, "correlation": 9928, "bytes": 960, "memory bandwidth (GB/s)": 0.5 } }, { "ph": "f", "id": 9928, "pid": 5, "tid": 7, "ts": 1716454216735425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216735344, "dur": 30, "args": { "External id": 9928, "cbid": 51, "correlation": 9928 } }, { "ph": "s", "id": 9928, "pid": 76337, "tid": -914061504, "ts": 1716454216735344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216735429, "dur": 541, "args": { "External id": 9929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9929, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9929, "pid": 5, "tid": 7, "ts": 1716454216735429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216735377, "dur": 13, "args": { "External id": 9929, "cbid": 211, "correlation": 9929 } }, { "ph": "s", "id": 9929, "pid": 76337, "tid": -914061504, "ts": 1716454216735377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216735971, "dur": 11, "args": { "External id": 9931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9931, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9931, "pid": 5, "tid": 7, "ts": 1716454216735971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216735397, "dur": 7, "args": { "External id": 9931, "cbid": 211, "correlation": 9931 } }, { "ph": "s", "id": 9931, "pid": 76337, "tid": -914061504, "ts": 1716454216735397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216736169, "dur": 15, "args": { "External id": 9937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9937, "pid": 5, "tid": 7, "ts": 1716454216736169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736156, "dur": 12, "args": { "External id": 9937, "cbid": 211, "correlation": 9937 } }, { "ph": "s", "id": 9937, "pid": 76337, "tid": -914061504, "ts": 1716454216736156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216736243, "dur": 4, "args": { "External id": 9945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9945, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 9945, "pid": 5, "tid": 7, "ts": 1716454216736243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736233, "dur": 10, "args": { "External id": 9945, "cbid": 211, "correlation": 9945 } }, { "ph": "s", "id": 9945, "pid": 76337, "tid": -914061504, "ts": 1716454216736233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216736366, "dur": 4, "args": { "External id": 9961, "cbid": 251, "correlation": 9961 } }, { "ph": "f", "id": 9961, "pid": 76337, "tid": -914061504, "ts": 1716454216736366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216736376, "dur": 0, "args": { "External id": 9963, "cbid": 251, "correlation": 9963 } }, { "ph": "f", "id": 9963, "pid": 76337, "tid": -914061504, "ts": 1716454216736376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216736394, "dur": 13, "args": { "External id": 9964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9964, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9964, "pid": 5, "tid": 7, "ts": 1716454216736394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736379, "dur": 15, "args": { "External id": 9964, "cbid": 211, "correlation": 9964 } }, { "ph": "s", "id": 9964, "pid": 76337, "tid": -914061504, "ts": 1716454216736379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216736410, "dur": 5, "args": { "External id": 9966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9966, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 9966, "pid": 5, "tid": 7, "ts": 1716454216736410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736399, "dur": 9, "args": { "External id": 9966, "cbid": 211, "correlation": 9966 } }, { "ph": "s", "id": 9966, "pid": 76337, "tid": -914061504, "ts": 1716454216736399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216736504, "dur": 17, "args": { "External id": 9976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 9976, "pid": 5, "tid": 7, "ts": 1716454216736504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736492, "dur": 12, "args": { "External id": 9976, "cbid": 211, "correlation": 9976 } }, { "ph": "s", "id": 9976, "pid": 76337, "tid": -914061504, "ts": 1716454216736492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216736590, "dur": 17, "args": { "External id": 9996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 9996, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 9996, "pid": 5, "tid": 7, "ts": 1716454216736590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736578, "dur": 12, "args": { "External id": 9996, "cbid": 211, "correlation": 9996 } }, { "ph": "s", "id": 9996, "pid": 76337, "tid": -914061504, "ts": 1716454216736578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216736611, "dur": 4, "args": { "External id": 10008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10008, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 10008, "pid": 5, "tid": 7, "ts": 1716454216736611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736601, "dur": 7, "args": { "External id": 10008, "cbid": 211, "correlation": 10008 } }, { "ph": "s", "id": 10008, "pid": 76337, "tid": -914061504, "ts": 1716454216736601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216736635, "dur": 17, "args": { "External id": 10011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10011, "pid": 5, "tid": 7, "ts": 1716454216736635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736626, "dur": 8, "args": { "External id": 10011, "cbid": 211, "correlation": 10011 } }, { "ph": "s", "id": 10011, "pid": 76337, "tid": -914061504, "ts": 1716454216736626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216736683, "dur": 11, "args": { "External id": 10020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10020, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10020, "pid": 5, "tid": 7, "ts": 1716454216736683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216736672, "dur": 10, "args": { "External id": 10020, "cbid": 211, "correlation": 10020 } }, { "ph": "s", "id": 10020, "pid": 76337, "tid": -914061504, "ts": 1716454216736672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216736787, "dur": 0, "args": { "External id": 10030, "cbid": 317, "correlation": 10030 } }, { "ph": "f", "id": 10030, "pid": 76337, "tid": -914061504, "ts": 1716454216736787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216736789, "dur": 0, "args": { "External id": 10031, "cbid": 203, "correlation": 10031 } }, { "ph": "f", "id": 10031, "pid": 76337, "tid": -914061504, "ts": 1716454216736789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216736790, "dur": 0, "args": { "External id": 10032, "cbid": 205, "correlation": 10032 } }, { "ph": "f", "id": 10032, "pid": 76337, "tid": -914061504, "ts": 1716454216736790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216739475, "dur": 11, "args": { "External id": 10036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10036, "pid": 5, "tid": 7, "ts": 1716454216739475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216739451, "dur": 25, "args": { "External id": 10036, "cbid": 211, "correlation": 10036 } }, { "ph": "s", "id": 10036, "pid": 76337, "tid": -914061504, "ts": 1716454216739451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216739488, "dur": 165, "args": { "External id": 10038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10038, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10038, "pid": 5, "tid": 7, "ts": 1716454216739488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216739478, "dur": 6, "args": { "External id": 10038, "cbid": 211, "correlation": 10038 } }, { "ph": "s", "id": 10038, "pid": 76337, "tid": -914061504, "ts": 1716454216739478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216739655, "dur": 1, "args": { "External id": 10040, "device": 5, "context": 1, "stream": 7, "correlation": 10040, "bytes": 960, "memory bandwidth (GB/s)": 0.4918032786885246 } }, { "ph": "f", "id": 10040, "pid": 5, "tid": 7, "ts": 1716454216739655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216739493, "dur": 16, "args": { "External id": 10040, "cbid": 51, "correlation": 10040 } }, { "ph": "s", "id": 10040, "pid": 76337, "tid": -914061504, "ts": 1716454216739493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216739659, "dur": 668, "args": { "External id": 10041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10041, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10041, "pid": 5, "tid": 7, "ts": 1716454216739659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216739512, "dur": 11, "args": { "External id": 10041, "cbid": 211, "correlation": 10041 } }, { "ph": "s", "id": 10041, "pid": 76337, "tid": -914061504, "ts": 1716454216739512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216740329, "dur": 12, "args": { "External id": 10043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10043, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10043, "pid": 5, "tid": 7, "ts": 1716454216740329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216739528, "dur": 7, "args": { "External id": 10043, "cbid": 211, "correlation": 10043 } }, { "ph": "s", "id": 10043, "pid": 76337, "tid": -914061504, "ts": 1716454216739528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216740343, "dur": 15, "args": { "External id": 10049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10049, "pid": 5, "tid": 7, "ts": 1716454216740343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216740254, "dur": 13, "args": { "External id": 10049, "cbid": 211, "correlation": 10049 } }, { "ph": "s", "id": 10049, "pid": 76337, "tid": -914061504, "ts": 1716454216740254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216740348, "dur": 0, "args": { "External id": 10059, "cbid": 317, "correlation": 10059 } }, { "ph": "f", "id": 10059, "pid": 76337, "tid": -914061504, "ts": 1716454216740348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216740349, "dur": 0, "args": { "External id": 10060, "cbid": 203, "correlation": 10060 } }, { "ph": "f", "id": 10060, "pid": 76337, "tid": -914061504, "ts": 1716454216740349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216740350, "dur": 1, "args": { "External id": 10061, "cbid": 205, "correlation": 10061 } }, { "ph": "f", "id": 10061, "pid": 76337, "tid": -914061504, "ts": 1716454216740350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216742328, "dur": 8, "args": { "External id": 10065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10065, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10065, "pid": 5, "tid": 7, "ts": 1716454216742328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216742307, "dur": 21, "args": { "External id": 10065, "cbid": 211, "correlation": 10065 } }, { "ph": "s", "id": 10065, "pid": 76337, "tid": -914061504, "ts": 1716454216742307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216742345, "dur": 4, "args": { "External id": 10067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10067, "pid": 5, "tid": 7, "ts": 1716454216742345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216742335, "dur": 10, "args": { "External id": 10067, "cbid": 211, "correlation": 10067 } }, { "ph": "s", "id": 10067, "pid": 76337, "tid": -914061504, "ts": 1716454216742335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216742349, "dur": 0, "args": { "External id": 10068, "cbid": 51, "correlation": 10068 } }, { "ph": "s", "id": 10068, "pid": 76337, "tid": -914061504, "ts": 1716454216742349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216742360, "dur": 58, "args": { "External id": 10069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10069, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 10069, "pid": 5, "tid": 7, "ts": 1716454216742360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216742350, "dur": 8, "args": { "External id": 10069, "cbid": 211, "correlation": 10069 } }, { "ph": "s", "id": 10069, "pid": 76337, "tid": -914061504, "ts": 1716454216742350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216742924, "dur": 14, "args": { "External id": 10074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10074, "pid": 5, "tid": 7, "ts": 1716454216742924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216742913, "dur": 10, "args": { "External id": 10074, "cbid": 211, "correlation": 10074 } }, { "ph": "s", "id": 10074, "pid": 76337, "tid": -914061504, "ts": 1716454216742913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216742967, "dur": 12, "args": { "External id": 10082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10082, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10082, "pid": 5, "tid": 7, "ts": 1716454216742967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216742957, "dur": 9, "args": { "External id": 10082, "cbid": 211, "correlation": 10082 } }, { "ph": "s", "id": 10082, "pid": 76337, "tid": -914061504, "ts": 1716454216742957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216743035, "dur": 10, "args": { "External id": 10090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10090, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10090, "pid": 5, "tid": 7, "ts": 1716454216743035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216743021, "dur": 12, "args": { "External id": 10090, "cbid": 211, "correlation": 10090 } }, { "ph": "s", "id": 10090, "pid": 76337, "tid": -914061504, "ts": 1716454216743021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216743181, "dur": 18, "args": { "External id": 10110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10110, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 10110, "pid": 5, "tid": 7, "ts": 1716454216743181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216743165, "dur": 15, "args": { "External id": 10110, "cbid": 211, "correlation": 10110 } }, { "ph": "s", "id": 10110, "pid": 76337, "tid": -914061504, "ts": 1716454216743165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216743234, "dur": 4, "args": { "External id": 10122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10122, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 10122, "pid": 5, "tid": 7, "ts": 1716454216743234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216743224, "dur": 10, "args": { "External id": 10122, "cbid": 211, "correlation": 10122 } }, { "ph": "s", "id": 10122, "pid": 76337, "tid": -914061504, "ts": 1716454216743224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216743262, "dur": 17, "args": { "External id": 10125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10125, "pid": 5, "tid": 7, "ts": 1716454216743262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216743254, "dur": 7, "args": { "External id": 10125, "cbid": 211, "correlation": 10125 } }, { "ph": "s", "id": 10125, "pid": 76337, "tid": -914061504, "ts": 1716454216743254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216743326, "dur": 0, "args": { "External id": 10136, "cbid": 317, "correlation": 10136 } }, { "ph": "f", "id": 10136, "pid": 76337, "tid": -914061504, "ts": 1716454216743326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216743326, "dur": 1, "args": { "External id": 10137, "cbid": 203, "correlation": 10137 } }, { "ph": "f", "id": 10137, "pid": 76337, "tid": -914061504, "ts": 1716454216743326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216743328, "dur": 1, "args": { "External id": 10138, "cbid": 205, "correlation": 10138 } }, { "ph": "f", "id": 10138, "pid": 76337, "tid": -914061504, "ts": 1716454216743328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216745067, "dur": 11, "args": { "External id": 10142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10142, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10142, "pid": 5, "tid": 7, "ts": 1716454216745067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745048, "dur": 19, "args": { "External id": 10142, "cbid": 211, "correlation": 10142 } }, { "ph": "s", "id": 10142, "pid": 76337, "tid": -914061504, "ts": 1716454216745048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216745084, "dur": 3, "args": { "External id": 10144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10144, "pid": 5, "tid": 7, "ts": 1716454216745084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745074, "dur": 8, "args": { "External id": 10144, "cbid": 211, "correlation": 10144 } }, { "ph": "s", "id": 10144, "pid": 76337, "tid": -914061504, "ts": 1716454216745074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216745085, "dur": 0, "args": { "External id": 10145, "cbid": 51, "correlation": 10145 } }, { "ph": "s", "id": 10145, "pid": 76337, "tid": -914061504, "ts": 1716454216745085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216745095, "dur": 98, "args": { "External id": 10146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10146, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 10146, "pid": 5, "tid": 7, "ts": 1716454216745095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745086, "dur": 7, "args": { "External id": 10146, "cbid": 211, "correlation": 10146 } }, { "ph": "s", "id": 10146, "pid": 76337, "tid": -914061504, "ts": 1716454216745086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216745596, "dur": 16, "args": { "External id": 10151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10151, "pid": 5, "tid": 7, "ts": 1716454216745596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745583, "dur": 12, "args": { "External id": 10151, "cbid": 211, "correlation": 10151 } }, { "ph": "s", "id": 10151, "pid": 76337, "tid": -914061504, "ts": 1716454216745583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216745775, "dur": 44, "args": { "External id": 10162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10162, "pid": 5, "tid": 7, "ts": 1716454216745775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745758, "dur": 17, "args": { "External id": 10162, "cbid": 211, "correlation": 10162 } }, { "ph": "s", "id": 10162, "pid": 76337, "tid": -914061504, "ts": 1716454216745758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216745827, "dur": 19, "args": { "External id": 10184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10184, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10184, "pid": 5, "tid": 7, "ts": 1716454216745827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216745814, "dur": 13, "args": { "External id": 10184, "cbid": 211, "correlation": 10184 } }, { "ph": "s", "id": 10184, "pid": 76337, "tid": -914061504, "ts": 1716454216745814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216745989, "dur": 4, "args": { "External id": 10195, "cbid": 251, "correlation": 10195 } }, { "ph": "f", "id": 10195, "pid": 76337, "tid": -914061504, "ts": 1716454216745989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216746018, "dur": 90, "args": { "External id": 10196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10196, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10196, "pid": 5, "tid": 7, "ts": 1716454216746018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746002, "dur": 31, "args": { "External id": 10196, "cbid": 211, "correlation": 10196 } }, { "ph": "s", "id": 10196, "pid": 76337, "tid": -914061504, "ts": 1716454216746002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746110, "dur": 1, "args": { "External id": 10207, "cbid": 251, "correlation": 10207 } }, { "ph": "f", "id": 10207, "pid": 76337, "tid": -914061504, "ts": 1716454216746110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216746127, "dur": 86, "args": { "External id": 10208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10208, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10208, "pid": 5, "tid": 7, "ts": 1716454216746127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746114, "dur": 13, "args": { "External id": 10208, "cbid": 211, "correlation": 10208 } }, { "ph": "s", "id": 10208, "pid": 76337, "tid": -914061504, "ts": 1716454216746114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746182, "dur": 1, "args": { "External id": 10219, "cbid": 251, "correlation": 10219 } }, { "ph": "f", "id": 10219, "pid": 76337, "tid": -914061504, "ts": 1716454216746182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216746215, "dur": 85, "args": { "External id": 10220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10220, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10220, "pid": 5, "tid": 7, "ts": 1716454216746215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746186, "dur": 13, "args": { "External id": 10220, "cbid": 211, "correlation": 10220 } }, { "ph": "s", "id": 10220, "pid": 76337, "tid": -914061504, "ts": 1716454216746186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216746324, "dur": 140, "args": { "External id": 10245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10245, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10245, "pid": 5, "tid": 7, "ts": 1716454216746324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746309, "dur": 15, "args": { "External id": 10245, "cbid": 211, "correlation": 10245 } }, { "ph": "s", "id": 10245, "pid": 76337, "tid": -914061504, "ts": 1716454216746309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746447, "dur": 2, "args": { "External id": 10263, "cbid": 251, "correlation": 10263 } }, { "ph": "f", "id": 10263, "pid": 76337, "tid": -914061504, "ts": 1716454216746447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216746471, "dur": 92, "args": { "External id": 10265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10265, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10265, "pid": 5, "tid": 7, "ts": 1716454216746471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746454, "dur": 16, "args": { "External id": 10265, "cbid": 211, "correlation": 10265 } }, { "ph": "s", "id": 10265, "pid": 76337, "tid": -914061504, "ts": 1716454216746454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216746569, "dur": 10, "args": { "External id": 10273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10273, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10273, "pid": 5, "tid": 7, "ts": 1716454216746569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746553, "dur": 15, "args": { "External id": 10273, "cbid": 211, "correlation": 10273 } }, { "ph": "s", "id": 10273, "pid": 76337, "tid": -914061504, "ts": 1716454216746553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216746609, "dur": 46, "args": { "External id": 10281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10281, "pid": 5, "tid": 7, "ts": 1716454216746609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746599, "dur": 10, "args": { "External id": 10281, "cbid": 211, "correlation": 10281 } }, { "ph": "s", "id": 10281, "pid": 76337, "tid": -914061504, "ts": 1716454216746599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216746700, "dur": 17, "args": { "External id": 10303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10303, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10303, "pid": 5, "tid": 7, "ts": 1716454216746700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746686, "dur": 13, "args": { "External id": 10303, "cbid": 211, "correlation": 10303 } }, { "ph": "s", "id": 10303, "pid": 76337, "tid": -914061504, "ts": 1716454216746686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746789, "dur": 1, "args": { "External id": 10314, "cbid": 251, "correlation": 10314 } }, { "ph": "f", "id": 10314, "pid": 76337, "tid": -914061504, "ts": 1716454216746789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216746809, "dur": 85, "args": { "External id": 10315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10315, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10315, "pid": 5, "tid": 7, "ts": 1716454216746809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746794, "dur": 14, "args": { "External id": 10315, "cbid": 211, "correlation": 10315 } }, { "ph": "s", "id": 10315, "pid": 76337, "tid": -914061504, "ts": 1716454216746794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746880, "dur": 1, "args": { "External id": 10326, "cbid": 251, "correlation": 10326 } }, { "ph": "f", "id": 10326, "pid": 76337, "tid": -914061504, "ts": 1716454216746880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746885, "dur": 0, "args": { "External id": 10327, "cbid": 251, "correlation": 10327 } }, { "ph": "f", "id": 10327, "pid": 76337, "tid": -914061504, "ts": 1716454216746885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216746903, "dur": 12, "args": { "External id": 10328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10328, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10328, "pid": 5, "tid": 7, "ts": 1716454216746903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746888, "dur": 14, "args": { "External id": 10328, "cbid": 211, "correlation": 10328 } }, { "ph": "s", "id": 10328, "pid": 76337, "tid": -914061504, "ts": 1716454216746888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216746918, "dur": 5, "args": { "External id": 10330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10330, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10330, "pid": 5, "tid": 7, "ts": 1716454216746918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746906, "dur": 9, "args": { "External id": 10330, "cbid": 211, "correlation": 10330 } }, { "ph": "s", "id": 10330, "pid": 76337, "tid": -914061504, "ts": 1716454216746906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746987, "dur": 1, "args": { "External id": 10341, "cbid": 251, "correlation": 10341 } }, { "ph": "f", "id": 10341, "pid": 76337, "tid": -914061504, "ts": 1716454216746987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216746991, "dur": 0, "args": { "External id": 10342, "cbid": 251, "correlation": 10342 } }, { "ph": "f", "id": 10342, "pid": 76337, "tid": -914061504, "ts": 1716454216746991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216747008, "dur": 9, "args": { "External id": 10343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10343, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10343, "pid": 5, "tid": 7, "ts": 1716454216747008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216746993, "dur": 14, "args": { "External id": 10343, "cbid": 211, "correlation": 10343 } }, { "ph": "s", "id": 10343, "pid": 76337, "tid": -914061504, "ts": 1716454216746993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216747018, "dur": 3, "args": { "External id": 10345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10345, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10345, "pid": 5, "tid": 7, "ts": 1716454216747018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747009, "dur": 6, "args": { "External id": 10345, "cbid": 211, "correlation": 10345 } }, { "ph": "s", "id": 10345, "pid": 76337, "tid": -914061504, "ts": 1716454216747009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216747102, "dur": 57, "args": { "External id": 10370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10370, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10370, "pid": 5, "tid": 7, "ts": 1716454216747102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747089, "dur": 13, "args": { "External id": 10370, "cbid": 211, "correlation": 10370 } }, { "ph": "s", "id": 10370, "pid": 76337, "tid": -914061504, "ts": 1716454216747089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216747205, "dur": 2, "args": { "External id": 10388, "cbid": 251, "correlation": 10388 } }, { "ph": "f", "id": 10388, "pid": 76337, "tid": -914061504, "ts": 1716454216747205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216747225, "dur": 88, "args": { "External id": 10390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10390, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10390, "pid": 5, "tid": 7, "ts": 1716454216747225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747211, "dur": 14, "args": { "External id": 10390, "cbid": 211, "correlation": 10390 } }, { "ph": "s", "id": 10390, "pid": 76337, "tid": -914061504, "ts": 1716454216747211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216747314, "dur": 10, "args": { "External id": 10398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10398, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10398, "pid": 5, "tid": 7, "ts": 1716454216747314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747285, "dur": 12, "args": { "External id": 10398, "cbid": 211, "correlation": 10398 } }, { "ph": "s", "id": 10398, "pid": 76337, "tid": -914061504, "ts": 1716454216747285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216747336, "dur": 14, "args": { "External id": 10406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10406, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10406, "pid": 5, "tid": 7, "ts": 1716454216747336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747326, "dur": 9, "args": { "External id": 10406, "cbid": 211, "correlation": 10406 } }, { "ph": "s", "id": 10406, "pid": 76337, "tid": -914061504, "ts": 1716454216747326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216747388, "dur": 17, "args": { "External id": 10428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10428, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10428, "pid": 5, "tid": 7, "ts": 1716454216747388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216747377, "dur": 10, "args": { "External id": 10428, "cbid": 211, "correlation": 10428 } }, { "ph": "s", "id": 10428, "pid": 76337, "tid": -914061504, "ts": 1716454216747377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216765669, "dur": 231, "args": { "External id": 10444, "cbid": 251, "correlation": 10444 } }, { "ph": "f", "id": 10444, "pid": 76337, "tid": -914061504, "ts": 1716454216765669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216765908, "dur": 0, "args": { "External id": 10446, "cbid": 251, "correlation": 10446 } }, { "ph": "f", "id": 10446, "pid": 76337, "tid": -914061504, "ts": 1716454216765908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216765940, "dur": 487, "args": { "External id": 10447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10447, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10447, "pid": 5, "tid": 7, "ts": 1716454216765940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216765911, "dur": 30, "args": { "External id": 10447, "cbid": 211, "correlation": 10447 } }, { "ph": "s", "id": 10447, "pid": 76337, "tid": -914061504, "ts": 1716454216765911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216766429, "dur": 66, "args": { "External id": 10455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10455, "pid": 5, "tid": 7, "ts": 1716454216766429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766063, "dur": 16, "args": { "External id": 10455, "cbid": 211, "correlation": 10455 } }, { "ph": "s", "id": 10455, "pid": 76337, "tid": -914061504, "ts": 1716454216766063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216766497, "dur": 66, "args": { "External id": 10463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10463, "pid": 5, "tid": 7, "ts": 1716454216766497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766109, "dur": 11, "args": { "External id": 10463, "cbid": 211, "correlation": 10463 } }, { "ph": "s", "id": 10463, "pid": 76337, "tid": -914061504, "ts": 1716454216766109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216766207, "dur": 1, "args": { "External id": 10479, "cbid": 251, "correlation": 10479 } }, { "ph": "f", "id": 10479, "pid": 76337, "tid": -914061504, "ts": 1716454216766207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216766564, "dur": 1, "args": { "External id": 10481, "device": 5, "context": 1, "stream": 7, "correlation": 10481, "bytes": 240, "memory bandwidth (GB/s)": 0.125 } }, { "ph": "f", "id": 10481, "pid": 5, "tid": 7, "ts": 1716454216766564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216766213, "dur": 18, "args": { "External id": 10481, "cbid": 51, "correlation": 10481 } }, { "ph": "s", "id": 10481, "pid": 76337, "tid": -914061504, "ts": 1716454216766213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216766568, "dur": 280, "args": { "External id": 10482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10482, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10482, "pid": 5, "tid": 7, "ts": 1716454216766568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766232, "dur": 11, "args": { "External id": 10482, "cbid": 211, "correlation": 10482 } }, { "ph": "s", "id": 10482, "pid": 76337, "tid": -914061504, "ts": 1716454216766232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216766850, "dur": 14, "args": { "External id": 10490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10490, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10490, "pid": 5, "tid": 7, "ts": 1716454216766850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766276, "dur": 10, "args": { "External id": 10490, "cbid": 211, "correlation": 10490 } }, { "ph": "s", "id": 10490, "pid": 76337, "tid": -914061504, "ts": 1716454216766276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216766865, "dur": 38, "args": { "External id": 10501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10501, "pid": 5, "tid": 7, "ts": 1716454216766865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766353, "dur": 13, "args": { "External id": 10501, "cbid": 211, "correlation": 10501 } }, { "ph": "s", "id": 10501, "pid": 76337, "tid": -914061504, "ts": 1716454216766353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216766444, "dur": 1, "args": { "External id": 10513, "cbid": 317, "correlation": 10513 } }, { "ph": "f", "id": 10513, "pid": 76337, "tid": -914061504, "ts": 1716454216766444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216766446, "dur": 1, "args": { "External id": 10514, "cbid": 203, "correlation": 10514 } }, { "ph": "f", "id": 10514, "pid": 76337, "tid": -914061504, "ts": 1716454216766446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216766447, "dur": 1, "args": { "External id": 10515, "cbid": 205, "correlation": 10515 } }, { "ph": "f", "id": 10515, "pid": 76337, "tid": -914061504, "ts": 1716454216766447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216766904, "dur": 13, "args": { "External id": 10519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10519, "pid": 5, "tid": 7, "ts": 1716454216766904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766486, "dur": 15, "args": { "External id": 10519, "cbid": 211, "correlation": 10519 } }, { "ph": "s", "id": 10519, "pid": 76337, "tid": -914061504, "ts": 1716454216766486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216766919, "dur": 4, "args": { "External id": 10521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10521, "pid": 5, "tid": 7, "ts": 1716454216766919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766508, "dur": 7, "args": { "External id": 10521, "cbid": 211, "correlation": 10521 } }, { "ph": "s", "id": 10521, "pid": 76337, "tid": -914061504, "ts": 1716454216766508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216766519, "dur": 0, "args": { "External id": 10522, "cbid": 51, "correlation": 10522 } }, { "ph": "s", "id": 10522, "pid": 76337, "tid": -914061504, "ts": 1716454216766519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216766924, "dur": 97, "args": { "External id": 10523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10523, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 10523, "pid": 5, "tid": 7, "ts": 1716454216766924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766520, "dur": 7, "args": { "External id": 10523, "cbid": 211, "correlation": 10523 } }, { "ph": "s", "id": 10523, "pid": 76337, "tid": -914061504, "ts": 1716454216766520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216767023, "dur": 17, "args": { "External id": 10528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10528, "pid": 5, "tid": 7, "ts": 1716454216767023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766553, "dur": 9, "args": { "External id": 10528, "cbid": 211, "correlation": 10528 } }, { "ph": "s", "id": 10528, "pid": 76337, "tid": -914061504, "ts": 1716454216766553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216767041, "dur": 11, "args": { "External id": 10536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10536, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10536, "pid": 5, "tid": 7, "ts": 1716454216767041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766585, "dur": 9, "args": { "External id": 10536, "cbid": 211, "correlation": 10536 } }, { "ph": "s", "id": 10536, "pid": 76337, "tid": -914061504, "ts": 1716454216766585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216767053, "dur": 18, "args": { "External id": 10556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10556, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 10556, "pid": 5, "tid": 7, "ts": 1716454216767053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766685, "dur": 13, "args": { "External id": 10556, "cbid": 211, "correlation": 10556 } }, { "ph": "s", "id": 10556, "pid": 76337, "tid": -914061504, "ts": 1716454216766685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216767073, "dur": 5, "args": { "External id": 10568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10568, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 10568, "pid": 5, "tid": 7, "ts": 1716454216767073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766709, "dur": 7, "args": { "External id": 10568, "cbid": 211, "correlation": 10568 } }, { "ph": "s", "id": 10568, "pid": 76337, "tid": -914061504, "ts": 1716454216766709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216767079, "dur": 18, "args": { "External id": 10571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10571, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10571, "pid": 5, "tid": 7, "ts": 1716454216767079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766731, "dur": 7, "args": { "External id": 10571, "cbid": 211, "correlation": 10571 } }, { "ph": "s", "id": 10571, "pid": 76337, "tid": -914061504, "ts": 1716454216766731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216767098, "dur": 11, "args": { "External id": 10580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10580, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10580, "pid": 5, "tid": 7, "ts": 1716454216767098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766781, "dur": 11, "args": { "External id": 10580, "cbid": 211, "correlation": 10580 } }, { "ph": "s", "id": 10580, "pid": 76337, "tid": -914061504, "ts": 1716454216766781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216766853, "dur": 0, "args": { "External id": 10590, "cbid": 317, "correlation": 10590 } }, { "ph": "f", "id": 10590, "pid": 76337, "tid": -914061504, "ts": 1716454216766853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216766853, "dur": 0, "args": { "External id": 10591, "cbid": 203, "correlation": 10591 } }, { "ph": "f", "id": 10591, "pid": 76337, "tid": -914061504, "ts": 1716454216766853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216766854, "dur": 0, "args": { "External id": 10592, "cbid": 205, "correlation": 10592 } }, { "ph": "f", "id": 10592, "pid": 76337, "tid": -914061504, "ts": 1716454216766854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216767111, "dur": 11, "args": { "External id": 10596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10596, "pid": 5, "tid": 7, "ts": 1716454216767111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766870, "dur": 14, "args": { "External id": 10596, "cbid": 211, "correlation": 10596 } }, { "ph": "s", "id": 10596, "pid": 76337, "tid": -914061504, "ts": 1716454216766870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216767123, "dur": 165, "args": { "External id": 10598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10598, "pid": 5, "tid": 7, "ts": 1716454216767123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766886, "dur": 5, "args": { "External id": 10598, "cbid": 211, "correlation": 10598 } }, { "ph": "s", "id": 10598, "pid": 76337, "tid": -914061504, "ts": 1716454216766886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216767290, "dur": 1, "args": { "External id": 10600, "device": 5, "context": 1, "stream": 7, "correlation": 10600, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 10600, "pid": 5, "tid": 7, "ts": 1716454216767290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216766903, "dur": 8, "args": { "External id": 10600, "cbid": 51, "correlation": 10600 } }, { "ph": "s", "id": 10600, "pid": 76337, "tid": -914061504, "ts": 1716454216766903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216767294, "dur": 667, "args": { "External id": 10601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10601, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10601, "pid": 5, "tid": 7, "ts": 1716454216767294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766913, "dur": 9, "args": { "External id": 10601, "cbid": 211, "correlation": 10601 } }, { "ph": "s", "id": 10601, "pid": 76337, "tid": -914061504, "ts": 1716454216766913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216767962, "dur": 13, "args": { "External id": 10603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10603, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10603, "pid": 5, "tid": 7, "ts": 1716454216767962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766930, "dur": 7, "args": { "External id": 10603, "cbid": 211, "correlation": 10603 } }, { "ph": "s", "id": 10603, "pid": 76337, "tid": -914061504, "ts": 1716454216766930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216767977, "dur": 15, "args": { "External id": 10609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10609, "pid": 5, "tid": 7, "ts": 1716454216767977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216766961, "dur": 8, "args": { "External id": 10609, "cbid": 211, "correlation": 10609 } }, { "ph": "s", "id": 10609, "pid": 76337, "tid": -914061504, "ts": 1716454216766961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216767993, "dur": 3, "args": { "External id": 10617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10617, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 10617, "pid": 5, "tid": 7, "ts": 1716454216767993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767016, "dur": 11, "args": { "External id": 10617, "cbid": 211, "correlation": 10617 } }, { "ph": "s", "id": 10617, "pid": 76337, "tid": -914061504, "ts": 1716454216767016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216767102, "dur": 1, "args": { "External id": 10633, "cbid": 251, "correlation": 10633 } }, { "ph": "f", "id": 10633, "pid": 76337, "tid": -914061504, "ts": 1716454216767102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216767107, "dur": 0, "args": { "External id": 10635, "cbid": 251, "correlation": 10635 } }, { "ph": "f", "id": 10635, "pid": 76337, "tid": -914061504, "ts": 1716454216767107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216767998, "dur": 13, "args": { "External id": 10636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10636, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10636, "pid": 5, "tid": 7, "ts": 1716454216767998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767109, "dur": 13, "args": { "External id": 10636, "cbid": 211, "correlation": 10636 } }, { "ph": "s", "id": 10636, "pid": 76337, "tid": -914061504, "ts": 1716454216767109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216768013, "dur": 5, "args": { "External id": 10638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10638, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10638, "pid": 5, "tid": 7, "ts": 1716454216768013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767125, "dur": 7, "args": { "External id": 10638, "cbid": 211, "correlation": 10638 } }, { "ph": "s", "id": 10638, "pid": 76337, "tid": -914061504, "ts": 1716454216767125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216768019, "dur": 17, "args": { "External id": 10648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10648, "pid": 5, "tid": 7, "ts": 1716454216768019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767196, "dur": 13, "args": { "External id": 10648, "cbid": 211, "correlation": 10648 } }, { "ph": "s", "id": 10648, "pid": 76337, "tid": -914061504, "ts": 1716454216767196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216768038, "dur": 18, "args": { "External id": 10668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10668, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 10668, "pid": 5, "tid": 7, "ts": 1716454216768038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767263, "dur": 12, "args": { "External id": 10668, "cbid": 211, "correlation": 10668 } }, { "ph": "s", "id": 10668, "pid": 76337, "tid": -914061504, "ts": 1716454216767263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216768057, "dur": 4, "args": { "External id": 10680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10680, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 10680, "pid": 5, "tid": 7, "ts": 1716454216768057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767284, "dur": 6, "args": { "External id": 10680, "cbid": 211, "correlation": 10680 } }, { "ph": "s", "id": 10680, "pid": 76337, "tid": -914061504, "ts": 1716454216767284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216768063, "dur": 17, "args": { "External id": 10683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10683, "pid": 5, "tid": 7, "ts": 1716454216768063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767303, "dur": 6, "args": { "External id": 10683, "cbid": 211, "correlation": 10683 } }, { "ph": "s", "id": 10683, "pid": 76337, "tid": -914061504, "ts": 1716454216767303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216768081, "dur": 11, "args": { "External id": 10692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10692, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10692, "pid": 5, "tid": 7, "ts": 1716454216768081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767345, "dur": 10, "args": { "External id": 10692, "cbid": 211, "correlation": 10692 } }, { "ph": "s", "id": 10692, "pid": 76337, "tid": -914061504, "ts": 1716454216767345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216767422, "dur": 0, "args": { "External id": 10702, "cbid": 317, "correlation": 10702 } }, { "ph": "f", "id": 10702, "pid": 76337, "tid": -914061504, "ts": 1716454216767422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216767422, "dur": 0, "args": { "External id": 10703, "cbid": 203, "correlation": 10703 } }, { "ph": "f", "id": 10703, "pid": 76337, "tid": -914061504, "ts": 1716454216767422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216767423, "dur": 0, "args": { "External id": 10704, "cbid": 205, "correlation": 10704 } }, { "ph": "f", "id": 10704, "pid": 76337, "tid": -914061504, "ts": 1716454216767423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216768093, "dur": 13, "args": { "External id": 10708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10708, "pid": 5, "tid": 7, "ts": 1716454216768093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767437, "dur": 14, "args": { "External id": 10708, "cbid": 211, "correlation": 10708 } }, { "ph": "s", "id": 10708, "pid": 76337, "tid": -914061504, "ts": 1716454216767437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216768107, "dur": 165, "args": { "External id": 10710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10710, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10710, "pid": 5, "tid": 7, "ts": 1716454216768107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767453, "dur": 5, "args": { "External id": 10710, "cbid": 211, "correlation": 10710 } }, { "ph": "s", "id": 10710, "pid": 76337, "tid": -914061504, "ts": 1716454216767453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216768275, "dur": 1, "args": { "External id": 10712, "device": 5, "context": 1, "stream": 7, "correlation": 10712, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 10712, "pid": 5, "tid": 7, "ts": 1716454216768275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216767464, "dur": 6, "args": { "External id": 10712, "cbid": 51, "correlation": 10712 } }, { "ph": "s", "id": 10712, "pid": 76337, "tid": -914061504, "ts": 1716454216767464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216768278, "dur": 653, "args": { "External id": 10713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10713, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10713, "pid": 5, "tid": 7, "ts": 1716454216768278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767472, "dur": 6, "args": { "External id": 10713, "cbid": 211, "correlation": 10713 } }, { "ph": "s", "id": 10713, "pid": 76337, "tid": -914061504, "ts": 1716454216767472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216768933, "dur": 12, "args": { "External id": 10715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10715, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10715, "pid": 5, "tid": 7, "ts": 1716454216768933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767482, "dur": 5, "args": { "External id": 10715, "cbid": 211, "correlation": 10715 } }, { "ph": "s", "id": 10715, "pid": 76337, "tid": -914061504, "ts": 1716454216767482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216768946, "dur": 15, "args": { "External id": 10721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10721, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10721, "pid": 5, "tid": 7, "ts": 1716454216768946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767511, "dur": 8, "args": { "External id": 10721, "cbid": 211, "correlation": 10721 } }, { "ph": "s", "id": 10721, "pid": 76337, "tid": -914061504, "ts": 1716454216767511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216768962, "dur": 12, "args": { "External id": 10729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10729, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10729, "pid": 5, "tid": 7, "ts": 1716454216768962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767543, "dur": 8, "args": { "External id": 10729, "cbid": 211, "correlation": 10729 } }, { "ph": "s", "id": 10729, "pid": 76337, "tid": -914061504, "ts": 1716454216767543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216768975, "dur": 10, "args": { "External id": 10737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10737, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10737, "pid": 5, "tid": 7, "ts": 1716454216768975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767575, "dur": 8, "args": { "External id": 10737, "cbid": 211, "correlation": 10737 } }, { "ph": "s", "id": 10737, "pid": 76337, "tid": -914061504, "ts": 1716454216767575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216768987, "dur": 18, "args": { "External id": 10757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10757, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 10757, "pid": 5, "tid": 7, "ts": 1716454216768987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767656, "dur": 14, "args": { "External id": 10757, "cbid": 211, "correlation": 10757 } }, { "ph": "s", "id": 10757, "pid": 76337, "tid": -914061504, "ts": 1716454216767656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216769006, "dur": 4, "args": { "External id": 10769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10769, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 10769, "pid": 5, "tid": 7, "ts": 1716454216769006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767680, "dur": 6, "args": { "External id": 10769, "cbid": 211, "correlation": 10769 } }, { "ph": "s", "id": 10769, "pid": 76337, "tid": -914061504, "ts": 1716454216767680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216769011, "dur": 17, "args": { "External id": 10772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10772, "pid": 5, "tid": 7, "ts": 1716454216769011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767698, "dur": 6, "args": { "External id": 10772, "cbid": 211, "correlation": 10772 } }, { "ph": "s", "id": 10772, "pid": 76337, "tid": -914061504, "ts": 1716454216767698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216767768, "dur": 0, "args": { "External id": 10783, "cbid": 317, "correlation": 10783 } }, { "ph": "f", "id": 10783, "pid": 76337, "tid": -914061504, "ts": 1716454216767768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216767768, "dur": 0, "args": { "External id": 10784, "cbid": 203, "correlation": 10784 } }, { "ph": "f", "id": 10784, "pid": 76337, "tid": -914061504, "ts": 1716454216767768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216767769, "dur": 0, "args": { "External id": 10785, "cbid": 205, "correlation": 10785 } }, { "ph": "f", "id": 10785, "pid": 76337, "tid": -914061504, "ts": 1716454216767769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216769030, "dur": 11, "args": { "External id": 10789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10789, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10789, "pid": 5, "tid": 7, "ts": 1716454216769030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767782, "dur": 13, "args": { "External id": 10789, "cbid": 211, "correlation": 10789 } }, { "ph": "s", "id": 10789, "pid": 76337, "tid": -914061504, "ts": 1716454216767782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216769042, "dur": 4, "args": { "External id": 10791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10791, "pid": 5, "tid": 7, "ts": 1716454216769042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767800, "dur": 6, "args": { "External id": 10791, "cbid": 211, "correlation": 10791 } }, { "ph": "s", "id": 10791, "pid": 76337, "tid": -914061504, "ts": 1716454216767800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216767808, "dur": 0, "args": { "External id": 10792, "cbid": 51, "correlation": 10792 } }, { "ph": "s", "id": 10792, "pid": 76337, "tid": -914061504, "ts": 1716454216767808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216769047, "dur": 96, "args": { "External id": 10793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10793, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 10793, "pid": 5, "tid": 7, "ts": 1716454216769047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767809, "dur": 5, "args": { "External id": 10793, "cbid": 211, "correlation": 10793 } }, { "ph": "s", "id": 10793, "pid": 76337, "tid": -914061504, "ts": 1716454216767809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216769145, "dur": 15, "args": { "External id": 10798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10798, "pid": 5, "tid": 7, "ts": 1716454216769145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767837, "dur": 9, "args": { "External id": 10798, "cbid": 211, "correlation": 10798 } }, { "ph": "s", "id": 10798, "pid": 76337, "tid": -914061504, "ts": 1716454216767837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216769162, "dur": 43, "args": { "External id": 10809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10809, "pid": 5, "tid": 7, "ts": 1716454216769162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767937, "dur": 14, "args": { "External id": 10809, "cbid": 211, "correlation": 10809 } }, { "ph": "s", "id": 10809, "pid": 76337, "tid": -914061504, "ts": 1716454216767937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216769206, "dur": 19, "args": { "External id": 10831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10831, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10831, "pid": 5, "tid": 7, "ts": 1716454216769206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216767992, "dur": 10, "args": { "External id": 10831, "cbid": 211, "correlation": 10831 } }, { "ph": "s", "id": 10831, "pid": 76337, "tid": -914061504, "ts": 1716454216767992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768103, "dur": 2, "args": { "External id": 10842, "cbid": 251, "correlation": 10842 } }, { "ph": "f", "id": 10842, "pid": 76337, "tid": -914061504, "ts": 1716454216768103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769226, "dur": 91, "args": { "External id": 10843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10843, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10843, "pid": 5, "tid": 7, "ts": 1716454216769226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768108, "dur": 14, "args": { "External id": 10843, "cbid": 211, "correlation": 10843 } }, { "ph": "s", "id": 10843, "pid": 76337, "tid": -914061504, "ts": 1716454216768108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768193, "dur": 1, "args": { "External id": 10854, "cbid": 251, "correlation": 10854 } }, { "ph": "f", "id": 10854, "pid": 76337, "tid": -914061504, "ts": 1716454216768193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769319, "dur": 86, "args": { "External id": 10855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10855, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10855, "pid": 5, "tid": 7, "ts": 1716454216769319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768197, "dur": 12, "args": { "External id": 10855, "cbid": 211, "correlation": 10855 } }, { "ph": "s", "id": 10855, "pid": 76337, "tid": -914061504, "ts": 1716454216768197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768265, "dur": 1, "args": { "External id": 10866, "cbid": 251, "correlation": 10866 } }, { "ph": "f", "id": 10866, "pid": 76337, "tid": -914061504, "ts": 1716454216768265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769406, "dur": 84, "args": { "External id": 10867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10867, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10867, "pid": 5, "tid": 7, "ts": 1716454216769406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768269, "dur": 11, "args": { "External id": 10867, "cbid": 211, "correlation": 10867 } }, { "ph": "s", "id": 10867, "pid": 76337, "tid": -914061504, "ts": 1716454216768269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216769492, "dur": 140, "args": { "External id": 10892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10892, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10892, "pid": 5, "tid": 7, "ts": 1716454216769492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768356, "dur": 13, "args": { "External id": 10892, "cbid": 211, "correlation": 10892 } }, { "ph": "s", "id": 10892, "pid": 76337, "tid": -914061504, "ts": 1716454216768356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768464, "dur": 1, "args": { "External id": 10910, "cbid": 251, "correlation": 10910 } }, { "ph": "f", "id": 10910, "pid": 76337, "tid": -914061504, "ts": 1716454216768464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769633, "dur": 93, "args": { "External id": 10912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10912, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10912, "pid": 5, "tid": 7, "ts": 1716454216769633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768470, "dur": 14, "args": { "External id": 10912, "cbid": 211, "correlation": 10912 } }, { "ph": "s", "id": 10912, "pid": 76337, "tid": -914061504, "ts": 1716454216768470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216769727, "dur": 10, "args": { "External id": 10920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10920, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10920, "pid": 5, "tid": 7, "ts": 1716454216769727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768542, "dur": 13, "args": { "External id": 10920, "cbid": 211, "correlation": 10920 } }, { "ph": "s", "id": 10920, "pid": 76337, "tid": -914061504, "ts": 1716454216768542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216769738, "dur": 46, "args": { "External id": 10928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10928, "pid": 5, "tid": 7, "ts": 1716454216769738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768583, "dur": 10, "args": { "External id": 10928, "cbid": 211, "correlation": 10928 } }, { "ph": "s", "id": 10928, "pid": 76337, "tid": -914061504, "ts": 1716454216768583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216769786, "dur": 17, "args": { "External id": 10950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10950, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 10950, "pid": 5, "tid": 7, "ts": 1716454216769786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768635, "dur": 10, "args": { "External id": 10950, "cbid": 211, "correlation": 10950 } }, { "ph": "s", "id": 10950, "pid": 76337, "tid": -914061504, "ts": 1716454216768635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768727, "dur": 1, "args": { "External id": 10961, "cbid": 251, "correlation": 10961 } }, { "ph": "f", "id": 10961, "pid": 76337, "tid": -914061504, "ts": 1716454216768727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769804, "dur": 86, "args": { "External id": 10962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10962, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 10962, "pid": 5, "tid": 7, "ts": 1716454216769804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768732, "dur": 12, "args": { "External id": 10962, "cbid": 211, "correlation": 10962 } }, { "ph": "s", "id": 10962, "pid": 76337, "tid": -914061504, "ts": 1716454216768732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768804, "dur": 1, "args": { "External id": 10973, "cbid": 251, "correlation": 10973 } }, { "ph": "f", "id": 10973, "pid": 76337, "tid": -914061504, "ts": 1716454216768804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768807, "dur": 0, "args": { "External id": 10974, "cbid": 251, "correlation": 10974 } }, { "ph": "f", "id": 10974, "pid": 76337, "tid": -914061504, "ts": 1716454216768807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216769892, "dur": 12, "args": { "External id": 10975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10975, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10975, "pid": 5, "tid": 7, "ts": 1716454216769892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768809, "dur": 12, "args": { "External id": 10975, "cbid": 211, "correlation": 10975 } }, { "ph": "s", "id": 10975, "pid": 76337, "tid": -914061504, "ts": 1716454216768809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216769906, "dur": 5, "args": { "External id": 10977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10977, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10977, "pid": 5, "tid": 7, "ts": 1716454216769906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768823, "dur": 6, "args": { "External id": 10977, "cbid": 211, "correlation": 10977 } }, { "ph": "s", "id": 10977, "pid": 76337, "tid": -914061504, "ts": 1716454216768823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768882, "dur": 1, "args": { "External id": 10988, "cbid": 251, "correlation": 10988 } }, { "ph": "f", "id": 10988, "pid": 76337, "tid": -914061504, "ts": 1716454216768882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216768885, "dur": 0, "args": { "External id": 10989, "cbid": 251, "correlation": 10989 } }, { "ph": "f", "id": 10989, "pid": 76337, "tid": -914061504, "ts": 1716454216768885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216769912, "dur": 9, "args": { "External id": 10990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10990, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10990, "pid": 5, "tid": 7, "ts": 1716454216769912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768887, "dur": 11, "args": { "External id": 10990, "cbid": 211, "correlation": 10990 } }, { "ph": "s", "id": 10990, "pid": 76337, "tid": -914061504, "ts": 1716454216768887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216769922, "dur": 3, "args": { "External id": 10992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 10992, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 10992, "pid": 5, "tid": 7, "ts": 1716454216769922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768900, "dur": 5, "args": { "External id": 10992, "cbid": 211, "correlation": 10992 } }, { "ph": "s", "id": 10992, "pid": 76337, "tid": -914061504, "ts": 1716454216768900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216769927, "dur": 57, "args": { "External id": 11017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11017, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11017, "pid": 5, "tid": 7, "ts": 1716454216769927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216768987, "dur": 14, "args": { "External id": 11017, "cbid": 211, "correlation": 11017 } }, { "ph": "s", "id": 11017, "pid": 76337, "tid": -914061504, "ts": 1716454216768987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216769102, "dur": 1, "args": { "External id": 11035, "cbid": 251, "correlation": 11035 } }, { "ph": "f", "id": 11035, "pid": 76337, "tid": -914061504, "ts": 1716454216769102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216769984, "dur": 87, "args": { "External id": 11037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11037, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 11037, "pid": 5, "tid": 7, "ts": 1716454216769984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769108, "dur": 15, "args": { "External id": 11037, "cbid": 211, "correlation": 11037 } }, { "ph": "s", "id": 11037, "pid": 76337, "tid": -914061504, "ts": 1716454216769108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216770073, "dur": 9, "args": { "External id": 11045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11045, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11045, "pid": 5, "tid": 7, "ts": 1716454216770073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769181, "dur": 12, "args": { "External id": 11045, "cbid": 211, "correlation": 11045 } }, { "ph": "s", "id": 11045, "pid": 76337, "tid": -914061504, "ts": 1716454216769181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216770084, "dur": 14, "args": { "External id": 11053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11053, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11053, "pid": 5, "tid": 7, "ts": 1716454216770084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769220, "dur": 9, "args": { "External id": 11053, "cbid": 211, "correlation": 11053 } }, { "ph": "s", "id": 11053, "pid": 76337, "tid": -914061504, "ts": 1716454216769220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216770098, "dur": 17, "args": { "External id": 11075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11075, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11075, "pid": 5, "tid": 7, "ts": 1716454216770098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769272, "dur": 10, "args": { "External id": 11075, "cbid": 211, "correlation": 11075 } }, { "ph": "s", "id": 11075, "pid": 76337, "tid": -914061504, "ts": 1716454216769272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216769359, "dur": 1, "args": { "External id": 11091, "cbid": 251, "correlation": 11091 } }, { "ph": "f", "id": 11091, "pid": 76337, "tid": -914061504, "ts": 1716454216769359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216769364, "dur": 0, "args": { "External id": 11093, "cbid": 251, "correlation": 11093 } }, { "ph": "f", "id": 11093, "pid": 76337, "tid": -914061504, "ts": 1716454216769364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216770117, "dur": 494, "args": { "External id": 11094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11094, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11094, "pid": 5, "tid": 7, "ts": 1716454216770117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769366, "dur": 13, "args": { "External id": 11094, "cbid": 211, "correlation": 11094 } }, { "ph": "s", "id": 11094, "pid": 76337, "tid": -914061504, "ts": 1716454216769366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216770612, "dur": 66, "args": { "External id": 11102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11102, "pid": 5, "tid": 7, "ts": 1716454216770612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769435, "dur": 13, "args": { "External id": 11102, "cbid": 211, "correlation": 11102 } }, { "ph": "s", "id": 11102, "pid": 76337, "tid": -914061504, "ts": 1716454216769435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216770680, "dur": 66, "args": { "External id": 11110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11110, "pid": 5, "tid": 7, "ts": 1716454216770680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769467, "dur": 8, "args": { "External id": 11110, "cbid": 211, "correlation": 11110 } }, { "ph": "s", "id": 11110, "pid": 76337, "tid": -914061504, "ts": 1716454216769467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216769547, "dur": 1, "args": { "External id": 11126, "cbid": 251, "correlation": 11126 } }, { "ph": "f", "id": 11126, "pid": 76337, "tid": -914061504, "ts": 1716454216769547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216770748, "dur": 1, "args": { "External id": 11128, "device": 5, "context": 1, "stream": 7, "correlation": 11128, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 11128, "pid": 5, "tid": 7, "ts": 1716454216770748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216769552, "dur": 10, "args": { "External id": 11128, "cbid": 51, "correlation": 11128 } }, { "ph": "s", "id": 11128, "pid": 76337, "tid": -914061504, "ts": 1716454216769552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216770752, "dur": 271, "args": { "External id": 11129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11129, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 11129, "pid": 5, "tid": 7, "ts": 1716454216770752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769564, "dur": 12, "args": { "External id": 11129, "cbid": 211, "correlation": 11129 } }, { "ph": "s", "id": 11129, "pid": 76337, "tid": -914061504, "ts": 1716454216769564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216771024, "dur": 13, "args": { "External id": 11137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11137, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11137, "pid": 5, "tid": 7, "ts": 1716454216771024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769606, "dur": 10, "args": { "External id": 11137, "cbid": 211, "correlation": 11137 } }, { "ph": "s", "id": 11137, "pid": 76337, "tid": -914061504, "ts": 1716454216769606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216771039, "dur": 38, "args": { "External id": 11148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11148, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11148, "pid": 5, "tid": 7, "ts": 1716454216771039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769669, "dur": 12, "args": { "External id": 11148, "cbid": 211, "correlation": 11148 } }, { "ph": "s", "id": 11148, "pid": 76337, "tid": -914061504, "ts": 1716454216769669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216769734, "dur": 0, "args": { "External id": 11160, "cbid": 317, "correlation": 11160 } }, { "ph": "f", "id": 11160, "pid": 76337, "tid": -914061504, "ts": 1716454216769734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216769735, "dur": 0, "args": { "External id": 11161, "cbid": 203, "correlation": 11161 } }, { "ph": "f", "id": 11161, "pid": 76337, "tid": -914061504, "ts": 1716454216769735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216769736, "dur": 0, "args": { "External id": 11162, "cbid": 205, "correlation": 11162 } }, { "ph": "f", "id": 11162, "pid": 76337, "tid": -914061504, "ts": 1716454216769736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216771078, "dur": 13, "args": { "External id": 11166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11166, "pid": 5, "tid": 7, "ts": 1716454216771078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769751, "dur": 12, "args": { "External id": 11166, "cbid": 211, "correlation": 11166 } }, { "ph": "s", "id": 11166, "pid": 76337, "tid": -914061504, "ts": 1716454216769751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216771093, "dur": 4, "args": { "External id": 11168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 11168, "pid": 5, "tid": 7, "ts": 1716454216771093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769768, "dur": 9, "args": { "External id": 11168, "cbid": 211, "correlation": 11168 } }, { "ph": "s", "id": 11168, "pid": 76337, "tid": -914061504, "ts": 1716454216769768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216769780, "dur": 0, "args": { "External id": 11169, "cbid": 51, "correlation": 11169 } }, { "ph": "s", "id": 11169, "pid": 76337, "tid": -914061504, "ts": 1716454216769780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216771098, "dur": 99, "args": { "External id": 11170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11170, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 11170, "pid": 5, "tid": 7, "ts": 1716454216771098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769781, "dur": 5, "args": { "External id": 11170, "cbid": 211, "correlation": 11170 } }, { "ph": "s", "id": 11170, "pid": 76337, "tid": -914061504, "ts": 1716454216769781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216771198, "dur": 17, "args": { "External id": 11175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11175, "pid": 5, "tid": 7, "ts": 1716454216771198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769808, "dur": 9, "args": { "External id": 11175, "cbid": 211, "correlation": 11175 } }, { "ph": "s", "id": 11175, "pid": 76337, "tid": -914061504, "ts": 1716454216769808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216771216, "dur": 12, "args": { "External id": 11183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11183, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11183, "pid": 5, "tid": 7, "ts": 1716454216771216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216769839, "dur": 8, "args": { "External id": 11183, "cbid": 211, "correlation": 11183 } }, { "ph": "s", "id": 11183, "pid": 76337, "tid": -914061504, "ts": 1716454216769839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216769913, "dur": 0, "args": { "External id": 11193, "cbid": 317, "correlation": 11193 } }, { "ph": "f", "id": 11193, "pid": 76337, "tid": -914061504, "ts": 1716454216769913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216769913, "dur": 0, "args": { "External id": 11194, "cbid": 203, "correlation": 11194 } }, { "ph": "f", "id": 11194, "pid": 76337, "tid": -914061504, "ts": 1716454216769913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216769914, "dur": 0, "args": { "External id": 11195, "cbid": 205, "correlation": 11195 } }, { "ph": "f", "id": 11195, "pid": 76337, "tid": -914061504, "ts": 1716454216769914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216773089, "dur": 12, "args": { "External id": 11199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11199, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11199, "pid": 5, "tid": 7, "ts": 1716454216773089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216773058, "dur": 32, "args": { "External id": 11199, "cbid": 211, "correlation": 11199 } }, { "ph": "s", "id": 11199, "pid": 76337, "tid": -914061504, "ts": 1716454216773058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216773103, "dur": 166, "args": { "External id": 11201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11201, "pid": 5, "tid": 7, "ts": 1716454216773103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216773094, "dur": 6, "args": { "External id": 11201, "cbid": 211, "correlation": 11201 } }, { "ph": "s", "id": 11201, "pid": 76337, "tid": -914061504, "ts": 1716454216773094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216773271, "dur": 1, "args": { "External id": 11203, "device": 5, "context": 1, "stream": 7, "correlation": 11203, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 11203, "pid": 5, "tid": 7, "ts": 1716454216773271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216773109, "dur": 12, "args": { "External id": 11203, "cbid": 51, "correlation": 11203 } }, { "ph": "s", "id": 11203, "pid": 76337, "tid": -914061504, "ts": 1716454216773109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216773295, "dur": 196, "args": { "External id": 11204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11204, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 11204, "pid": 5, "tid": 7, "ts": 1716454216773295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216773123, "dur": 172, "args": { "External id": 11204, "cbid": 211, "correlation": 11204 } }, { "ph": "s", "id": 11204, "pid": 76337, "tid": -914061504, "ts": 1716454216773123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216773492, "dur": 6, "args": { "External id": 11206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11206, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11206, "pid": 5, "tid": 7, "ts": 1716454216773492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216773303, "dur": 9, "args": { "External id": 11206, "cbid": 211, "correlation": 11206 } }, { "ph": "s", "id": 11206, "pid": 76337, "tid": -914061504, "ts": 1716454216773303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216774107, "dur": 8, "args": { "External id": 11212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11212, "pid": 5, "tid": 7, "ts": 1716454216774107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216774094, "dur": 13, "args": { "External id": 11212, "cbid": 211, "correlation": 11212 } }, { "ph": "s", "id": 11212, "pid": 76337, "tid": -914061504, "ts": 1716454216774094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216774315, "dur": 11, "args": { "External id": 11232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11232, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11232, "pid": 5, "tid": 7, "ts": 1716454216774315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216774301, "dur": 14, "args": { "External id": 11232, "cbid": 211, "correlation": 11232 } }, { "ph": "s", "id": 11232, "pid": 76337, "tid": -914061504, "ts": 1716454216774301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216774336, "dur": 4, "args": { "External id": 11244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11244, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11244, "pid": 5, "tid": 7, "ts": 1716454216774336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216774327, "dur": 9, "args": { "External id": 11244, "cbid": 211, "correlation": 11244 } }, { "ph": "s", "id": 11244, "pid": 76337, "tid": -914061504, "ts": 1716454216774327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216774365, "dur": 8, "args": { "External id": 11247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11247, "pid": 5, "tid": 7, "ts": 1716454216774365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216774355, "dur": 8, "args": { "External id": 11247, "cbid": 211, "correlation": 11247 } }, { "ph": "s", "id": 11247, "pid": 76337, "tid": -914061504, "ts": 1716454216774355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216774423, "dur": 5, "args": { "External id": 11256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11256, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11256, "pid": 5, "tid": 7, "ts": 1716454216774423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216774412, "dur": 10, "args": { "External id": 11256, "cbid": 211, "correlation": 11256 } }, { "ph": "s", "id": 11256, "pid": 76337, "tid": -914061504, "ts": 1716454216774412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216774502, "dur": 0, "args": { "External id": 11266, "cbid": 317, "correlation": 11266 } }, { "ph": "f", "id": 11266, "pid": 76337, "tid": -914061504, "ts": 1716454216774502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216774503, "dur": 0, "args": { "External id": 11267, "cbid": 203, "correlation": 11267 } }, { "ph": "f", "id": 11267, "pid": 76337, "tid": -914061504, "ts": 1716454216774503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216774504, "dur": 0, "args": { "External id": 11268, "cbid": 205, "correlation": 11268 } }, { "ph": "f", "id": 11268, "pid": 76337, "tid": -914061504, "ts": 1716454216774504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216777609, "dur": 6, "args": { "External id": 11272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11272, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11272, "pid": 5, "tid": 7, "ts": 1716454216777609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216777585, "dur": 26, "args": { "External id": 11272, "cbid": 211, "correlation": 11272 } }, { "ph": "s", "id": 11272, "pid": 76337, "tid": -914061504, "ts": 1716454216777585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216777621, "dur": 164, "args": { "External id": 11274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11274, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11274, "pid": 5, "tid": 7, "ts": 1716454216777621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216777614, "dur": 6, "args": { "External id": 11274, "cbid": 211, "correlation": 11274 } }, { "ph": "s", "id": 11274, "pid": 76337, "tid": -914061504, "ts": 1716454216777614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216777788, "dur": 1, "args": { "External id": 11276, "device": 5, "context": 1, "stream": 7, "correlation": 11276, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 11276, "pid": 5, "tid": 7, "ts": 1716454216777788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216777629, "dur": 14, "args": { "External id": 11276, "cbid": 51, "correlation": 11276 } }, { "ph": "s", "id": 11276, "pid": 76337, "tid": -914061504, "ts": 1716454216777629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216777791, "dur": 273, "args": { "External id": 11277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11277, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11277, "pid": 5, "tid": 7, "ts": 1716454216777791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216777645, "dur": 12, "args": { "External id": 11277, "cbid": 211, "correlation": 11277 } }, { "ph": "s", "id": 11277, "pid": 76337, "tid": -914061504, "ts": 1716454216777645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216778066, "dur": 6, "args": { "External id": 11279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11279, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11279, "pid": 5, "tid": 7, "ts": 1716454216778066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216777663, "dur": 7, "args": { "External id": 11279, "cbid": 211, "correlation": 11279 } }, { "ph": "s", "id": 11279, "pid": 76337, "tid": -914061504, "ts": 1716454216777663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216778435, "dur": 6, "args": { "External id": 11285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11285, "pid": 5, "tid": 7, "ts": 1716454216778435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778420, "dur": 15, "args": { "External id": 11285, "cbid": 211, "correlation": 11285 } }, { "ph": "s", "id": 11285, "pid": 76337, "tid": -914061504, "ts": 1716454216778420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216778530, "dur": 3, "args": { "External id": 11293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11293, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 11293, "pid": 5, "tid": 7, "ts": 1716454216778530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778517, "dur": 12, "args": { "External id": 11293, "cbid": 211, "correlation": 11293 } }, { "ph": "s", "id": 11293, "pid": 76337, "tid": -914061504, "ts": 1716454216778517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216778668, "dur": 4, "args": { "External id": 11309, "cbid": 251, "correlation": 11309 } }, { "ph": "f", "id": 11309, "pid": 76337, "tid": -914061504, "ts": 1716454216778668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216778679, "dur": 0, "args": { "External id": 11311, "cbid": 251, "correlation": 11311 } }, { "ph": "f", "id": 11311, "pid": 76337, "tid": -914061504, "ts": 1716454216778679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216778699, "dur": 13, "args": { "External id": 11312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11312, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11312, "pid": 5, "tid": 7, "ts": 1716454216778699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778684, "dur": 16, "args": { "External id": 11312, "cbid": 211, "correlation": 11312 } }, { "ph": "s", "id": 11312, "pid": 76337, "tid": -914061504, "ts": 1716454216778684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216778715, "dur": 5, "args": { "External id": 11314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11314, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11314, "pid": 5, "tid": 7, "ts": 1716454216778715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778704, "dur": 8, "args": { "External id": 11314, "cbid": 211, "correlation": 11314 } }, { "ph": "s", "id": 11314, "pid": 76337, "tid": -914061504, "ts": 1716454216778704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216778811, "dur": 6, "args": { "External id": 11324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11324, "pid": 5, "tid": 7, "ts": 1716454216778811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778798, "dur": 13, "args": { "External id": 11324, "cbid": 211, "correlation": 11324 } }, { "ph": "s", "id": 11324, "pid": 76337, "tid": -914061504, "ts": 1716454216778798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216778904, "dur": 10, "args": { "External id": 11344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11344, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11344, "pid": 5, "tid": 7, "ts": 1716454216778904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778891, "dur": 12, "args": { "External id": 11344, "cbid": 211, "correlation": 11344 } }, { "ph": "s", "id": 11344, "pid": 76337, "tid": -914061504, "ts": 1716454216778891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216778923, "dur": 4, "args": { "External id": 11356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11356, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11356, "pid": 5, "tid": 7, "ts": 1716454216778923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778915, "dur": 8, "args": { "External id": 11356, "cbid": 211, "correlation": 11356 } }, { "ph": "s", "id": 11356, "pid": 76337, "tid": -914061504, "ts": 1716454216778915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216778950, "dur": 7, "args": { "External id": 11359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11359, "pid": 5, "tid": 7, "ts": 1716454216778950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778941, "dur": 7, "args": { "External id": 11359, "cbid": 211, "correlation": 11359 } }, { "ph": "s", "id": 11359, "pid": 76337, "tid": -914061504, "ts": 1716454216778941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216779008, "dur": 5, "args": { "External id": 11368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11368, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11368, "pid": 5, "tid": 7, "ts": 1716454216779008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216778996, "dur": 11, "args": { "External id": 11368, "cbid": 211, "correlation": 11368 } }, { "ph": "s", "id": 11368, "pid": 76337, "tid": -914061504, "ts": 1716454216778996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216779105, "dur": 0, "args": { "External id": 11378, "cbid": 317, "correlation": 11378 } }, { "ph": "f", "id": 11378, "pid": 76337, "tid": -914061504, "ts": 1716454216779105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216779106, "dur": 1, "args": { "External id": 11379, "cbid": 203, "correlation": 11379 } }, { "ph": "f", "id": 11379, "pid": 76337, "tid": -914061504, "ts": 1716454216779106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216779108, "dur": 0, "args": { "External id": 11380, "cbid": 205, "correlation": 11380 } }, { "ph": "f", "id": 11380, "pid": 76337, "tid": -914061504, "ts": 1716454216779108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216779138, "dur": 5, "args": { "External id": 11384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11384, "pid": 5, "tid": 7, "ts": 1716454216779138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779124, "dur": 14, "args": { "External id": 11384, "cbid": 211, "correlation": 11384 } }, { "ph": "s", "id": 11384, "pid": 76337, "tid": -914061504, "ts": 1716454216779124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216779148, "dur": 163, "args": { "External id": 11386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11386, "pid": 5, "tid": 7, "ts": 1716454216779148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779140, "dur": 6, "args": { "External id": 11386, "cbid": 211, "correlation": 11386 } }, { "ph": "s", "id": 11386, "pid": 76337, "tid": -914061504, "ts": 1716454216779140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216779313, "dur": 1, "args": { "External id": 11388, "device": 5, "context": 1, "stream": 7, "correlation": 11388, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 11388, "pid": 5, "tid": 7, "ts": 1716454216779313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216779152, "dur": 8, "args": { "External id": 11388, "cbid": 51, "correlation": 11388 } }, { "ph": "s", "id": 11388, "pid": 76337, "tid": -914061504, "ts": 1716454216779152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216779317, "dur": 263, "args": { "External id": 11389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11389, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11389, "pid": 5, "tid": 7, "ts": 1716454216779317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779161, "dur": 7, "args": { "External id": 11389, "cbid": 211, "correlation": 11389 } }, { "ph": "s", "id": 11389, "pid": 76337, "tid": -914061504, "ts": 1716454216779161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216779581, "dur": 6, "args": { "External id": 11391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11391, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11391, "pid": 5, "tid": 7, "ts": 1716454216779581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779171, "dur": 6, "args": { "External id": 11391, "cbid": 211, "correlation": 11391 } }, { "ph": "s", "id": 11391, "pid": 76337, "tid": -914061504, "ts": 1716454216779171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216779588, "dur": 6, "args": { "External id": 11397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11397, "pid": 5, "tid": 7, "ts": 1716454216779588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779202, "dur": 8, "args": { "External id": 11397, "cbid": 211, "correlation": 11397 } }, { "ph": "s", "id": 11397, "pid": 76337, "tid": -914061504, "ts": 1716454216779202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216779596, "dur": 5, "args": { "External id": 11405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11405, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11405, "pid": 5, "tid": 7, "ts": 1716454216779596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216779236, "dur": 9, "args": { "External id": 11405, "cbid": 211, "correlation": 11405 } }, { "ph": "s", "id": 11405, "pid": 76337, "tid": -914061504, "ts": 1716454216779236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216779280, "dur": 1, "args": { "External id": 11413, "cbid": 317, "correlation": 11413 } }, { "ph": "f", "id": 11413, "pid": 76337, "tid": -914061504, "ts": 1716454216779280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216779282, "dur": 10016, "args": { "External id": 11414, "cbid": 20, "correlation": 11414 } }, { "ph": "f", "id": 11414, "pid": 76337, "tid": -914061504, "ts": 1716454216779282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216789338, "dur": 5, "args": { "External id": 11417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11417, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11417, "pid": 5, "tid": 7, "ts": 1716454216789338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789315, "dur": 24, "args": { "External id": 11417, "cbid": 211, "correlation": 11417 } }, { "ph": "s", "id": 11417, "pid": 76337, "tid": -914061504, "ts": 1716454216789315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216789443, "dur": 10, "args": { "External id": 11437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11437, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11437, "pid": 5, "tid": 7, "ts": 1716454216789443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789430, "dur": 12, "args": { "External id": 11437, "cbid": 211, "correlation": 11437 } }, { "ph": "s", "id": 11437, "pid": 76337, "tid": -914061504, "ts": 1716454216789430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216789461, "dur": 4, "args": { "External id": 11449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11449, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11449, "pid": 5, "tid": 7, "ts": 1716454216789461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789453, "dur": 7, "args": { "External id": 11449, "cbid": 211, "correlation": 11449 } }, { "ph": "s", "id": 11449, "pid": 76337, "tid": -914061504, "ts": 1716454216789453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216789482, "dur": 6, "args": { "External id": 11452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11452, "pid": 5, "tid": 7, "ts": 1716454216789482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789474, "dur": 7, "args": { "External id": 11452, "cbid": 211, "correlation": 11452 } }, { "ph": "s", "id": 11452, "pid": 76337, "tid": -914061504, "ts": 1716454216789474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216789526, "dur": 5, "args": { "External id": 11461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11461, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11461, "pid": 5, "tid": 7, "ts": 1716454216789526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789515, "dur": 10, "args": { "External id": 11461, "cbid": 211, "correlation": 11461 } }, { "ph": "s", "id": 11461, "pid": 76337, "tid": -914061504, "ts": 1716454216789515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216789587, "dur": 0, "args": { "External id": 11471, "cbid": 317, "correlation": 11471 } }, { "ph": "f", "id": 11471, "pid": 76337, "tid": -914061504, "ts": 1716454216789587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216789588, "dur": 0, "args": { "External id": 11472, "cbid": 203, "correlation": 11472 } }, { "ph": "f", "id": 11472, "pid": 76337, "tid": -914061504, "ts": 1716454216789588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216789589, "dur": 0, "args": { "External id": 11473, "cbid": 205, "correlation": 11473 } }, { "ph": "f", "id": 11473, "pid": 76337, "tid": -914061504, "ts": 1716454216789589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216789616, "dur": 5, "args": { "External id": 11477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11477, "pid": 5, "tid": 7, "ts": 1716454216789616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789604, "dur": 12, "args": { "External id": 11477, "cbid": 211, "correlation": 11477 } }, { "ph": "s", "id": 11477, "pid": 76337, "tid": -914061504, "ts": 1716454216789604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216789626, "dur": 164, "args": { "External id": 11479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11479, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11479, "pid": 5, "tid": 7, "ts": 1716454216789626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789619, "dur": 6, "args": { "External id": 11479, "cbid": 211, "correlation": 11479 } }, { "ph": "s", "id": 11479, "pid": 76337, "tid": -914061504, "ts": 1716454216789619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216789792, "dur": 1, "args": { "External id": 11481, "device": 5, "context": 1, "stream": 7, "correlation": 11481, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 11481, "pid": 5, "tid": 7, "ts": 1716454216789792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216789631, "dur": 8, "args": { "External id": 11481, "cbid": 51, "correlation": 11481 } }, { "ph": "s", "id": 11481, "pid": 76337, "tid": -914061504, "ts": 1716454216789631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216789796, "dur": 261, "args": { "External id": 11482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11482, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11482, "pid": 5, "tid": 7, "ts": 1716454216789796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789640, "dur": 6, "args": { "External id": 11482, "cbid": 211, "correlation": 11482 } }, { "ph": "s", "id": 11482, "pid": 76337, "tid": -914061504, "ts": 1716454216789640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790058, "dur": 5, "args": { "External id": 11484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11484, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11484, "pid": 5, "tid": 7, "ts": 1716454216790058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789651, "dur": 5, "args": { "External id": 11484, "cbid": 211, "correlation": 11484 } }, { "ph": "s", "id": 11484, "pid": 76337, "tid": -914061504, "ts": 1716454216789651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216790065, "dur": 6, "args": { "External id": 11490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11490, "pid": 5, "tid": 7, "ts": 1716454216790065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789680, "dur": 10, "args": { "External id": 11490, "cbid": 211, "correlation": 11490 } }, { "ph": "s", "id": 11490, "pid": 76337, "tid": -914061504, "ts": 1716454216789680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216790072, "dur": 3, "args": { "External id": 11498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11498, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 11498, "pid": 5, "tid": 7, "ts": 1716454216790072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789726, "dur": 10, "args": { "External id": 11498, "cbid": 211, "correlation": 11498 } }, { "ph": "s", "id": 11498, "pid": 76337, "tid": -914061504, "ts": 1716454216789726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216789823, "dur": 2, "args": { "External id": 11514, "cbid": 251, "correlation": 11514 } }, { "ph": "f", "id": 11514, "pid": 76337, "tid": -914061504, "ts": 1716454216789823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216789829, "dur": 0, "args": { "External id": 11516, "cbid": 251, "correlation": 11516 } }, { "ph": "f", "id": 11516, "pid": 76337, "tid": -914061504, "ts": 1716454216789829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216790077, "dur": 11, "args": { "External id": 11517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11517, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11517, "pid": 5, "tid": 7, "ts": 1716454216790077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789831, "dur": 13, "args": { "External id": 11517, "cbid": 211, "correlation": 11517 } }, { "ph": "s", "id": 11517, "pid": 76337, "tid": -914061504, "ts": 1716454216789831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216790089, "dur": 4, "args": { "External id": 11519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11519, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11519, "pid": 5, "tid": 7, "ts": 1716454216790089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789847, "dur": 6, "args": { "External id": 11519, "cbid": 211, "correlation": 11519 } }, { "ph": "s", "id": 11519, "pid": 76337, "tid": -914061504, "ts": 1716454216789847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216790093, "dur": 6, "args": { "External id": 11529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11529, "pid": 5, "tid": 7, "ts": 1716454216790093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789910, "dur": 13, "args": { "External id": 11529, "cbid": 211, "correlation": 11529 } }, { "ph": "s", "id": 11529, "pid": 76337, "tid": -914061504, "ts": 1716454216789910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216790100, "dur": 10, "args": { "External id": 11549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11549, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11549, "pid": 5, "tid": 7, "ts": 1716454216790100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216789987, "dur": 11, "args": { "External id": 11549, "cbid": 211, "correlation": 11549 } }, { "ph": "s", "id": 11549, "pid": 76337, "tid": -914061504, "ts": 1716454216789987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216790112, "dur": 4, "args": { "External id": 11561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11561, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11561, "pid": 5, "tid": 7, "ts": 1716454216790112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790009, "dur": 6, "args": { "External id": 11561, "cbid": 211, "correlation": 11561 } }, { "ph": "s", "id": 11561, "pid": 76337, "tid": -914061504, "ts": 1716454216790009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216790117, "dur": 7, "args": { "External id": 11564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11564, "pid": 5, "tid": 7, "ts": 1716454216790117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790027, "dur": 7, "args": { "External id": 11564, "cbid": 211, "correlation": 11564 } }, { "ph": "s", "id": 11564, "pid": 76337, "tid": -914061504, "ts": 1716454216790027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216790125, "dur": 5, "args": { "External id": 11573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11573, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11573, "pid": 5, "tid": 7, "ts": 1716454216790125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790068, "dur": 10, "args": { "External id": 11573, "cbid": 211, "correlation": 11573 } }, { "ph": "s", "id": 11573, "pid": 76337, "tid": -914061504, "ts": 1716454216790068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216790149, "dur": 0, "args": { "External id": 11583, "cbid": 317, "correlation": 11583 } }, { "ph": "f", "id": 11583, "pid": 76337, "tid": -914061504, "ts": 1716454216790149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216790150, "dur": 0, "args": { "External id": 11584, "cbid": 203, "correlation": 11584 } }, { "ph": "f", "id": 11584, "pid": 76337, "tid": -914061504, "ts": 1716454216790150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216790151, "dur": 0, "args": { "External id": 11585, "cbid": 205, "correlation": 11585 } }, { "ph": "f", "id": 11585, "pid": 76337, "tid": -914061504, "ts": 1716454216790151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790178, "dur": 5, "args": { "External id": 11589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11589, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11589, "pid": 5, "tid": 7, "ts": 1716454216790178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790165, "dur": 13, "args": { "External id": 11589, "cbid": 211, "correlation": 11589 } }, { "ph": "s", "id": 11589, "pid": 76337, "tid": -914061504, "ts": 1716454216790165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790188, "dur": 163, "args": { "External id": 11591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11591, "pid": 5, "tid": 7, "ts": 1716454216790188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790180, "dur": 6, "args": { "External id": 11591, "cbid": 211, "correlation": 11591 } }, { "ph": "s", "id": 11591, "pid": 76337, "tid": -914061504, "ts": 1716454216790180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216790354, "dur": 1, "args": { "External id": 11593, "device": 5, "context": 1, "stream": 7, "correlation": 11593, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 11593, "pid": 5, "tid": 7, "ts": 1716454216790354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216790191, "dur": 7, "args": { "External id": 11593, "cbid": 51, "correlation": 11593 } }, { "ph": "s", "id": 11593, "pid": 76337, "tid": -914061504, "ts": 1716454216790191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216790357, "dur": 260, "args": { "External id": 11594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11594, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11594, "pid": 5, "tid": 7, "ts": 1716454216790357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790199, "dur": 6, "args": { "External id": 11594, "cbid": 211, "correlation": 11594 } }, { "ph": "s", "id": 11594, "pid": 76337, "tid": -914061504, "ts": 1716454216790199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790619, "dur": 6, "args": { "External id": 11596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11596, "pid": 5, "tid": 7, "ts": 1716454216790619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790209, "dur": 6, "args": { "External id": 11596, "cbid": 211, "correlation": 11596 } }, { "ph": "s", "id": 11596, "pid": 76337, "tid": -914061504, "ts": 1716454216790209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216790627, "dur": 7, "args": { "External id": 11602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11602, "pid": 5, "tid": 7, "ts": 1716454216790627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790239, "dur": 8, "args": { "External id": 11602, "cbid": 211, "correlation": 11602 } }, { "ph": "s", "id": 11602, "pid": 76337, "tid": -914061504, "ts": 1716454216790239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216790634, "dur": 5, "args": { "External id": 11610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11610, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11610, "pid": 5, "tid": 7, "ts": 1716454216790634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790272, "dur": 10, "args": { "External id": 11610, "cbid": 211, "correlation": 11610 } }, { "ph": "s", "id": 11610, "pid": 76337, "tid": -914061504, "ts": 1716454216790272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216790641, "dur": 4, "args": { "External id": 11618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11618, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11618, "pid": 5, "tid": 7, "ts": 1716454216790641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790304, "dur": 8, "args": { "External id": 11618, "cbid": 211, "correlation": 11618 } }, { "ph": "s", "id": 11618, "pid": 76337, "tid": -914061504, "ts": 1716454216790304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216790647, "dur": 10, "args": { "External id": 11638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11638, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11638, "pid": 5, "tid": 7, "ts": 1716454216790647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790470, "dur": 14, "args": { "External id": 11638, "cbid": 211, "correlation": 11638 } }, { "ph": "s", "id": 11638, "pid": 76337, "tid": -914061504, "ts": 1716454216790470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216790658, "dur": 4, "args": { "External id": 11650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11650, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11650, "pid": 5, "tid": 7, "ts": 1716454216790658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790494, "dur": 7, "args": { "External id": 11650, "cbid": 211, "correlation": 11650 } }, { "ph": "s", "id": 11650, "pid": 76337, "tid": -914061504, "ts": 1716454216790494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216790663, "dur": 7, "args": { "External id": 11653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11653, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11653, "pid": 5, "tid": 7, "ts": 1716454216790663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790513, "dur": 7, "args": { "External id": 11653, "cbid": 211, "correlation": 11653 } }, { "ph": "s", "id": 11653, "pid": 76337, "tid": -914061504, "ts": 1716454216790513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216790671, "dur": 5, "args": { "External id": 11662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11662, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11662, "pid": 5, "tid": 7, "ts": 1716454216790671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790552, "dur": 10, "args": { "External id": 11662, "cbid": 211, "correlation": 11662 } }, { "ph": "s", "id": 11662, "pid": 76337, "tid": -914061504, "ts": 1716454216790552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216790618, "dur": 0, "args": { "External id": 11672, "cbid": 317, "correlation": 11672 } }, { "ph": "f", "id": 11672, "pid": 76337, "tid": -914061504, "ts": 1716454216790618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216790618, "dur": 0, "args": { "External id": 11673, "cbid": 203, "correlation": 11673 } }, { "ph": "f", "id": 11673, "pid": 76337, "tid": -914061504, "ts": 1716454216790618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216790619, "dur": 0, "args": { "External id": 11674, "cbid": 205, "correlation": 11674 } }, { "ph": "f", "id": 11674, "pid": 76337, "tid": -914061504, "ts": 1716454216790619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790678, "dur": 5, "args": { "External id": 11678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11678, "pid": 5, "tid": 7, "ts": 1716454216790678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790643, "dur": 14, "args": { "External id": 11678, "cbid": 211, "correlation": 11678 } }, { "ph": "s", "id": 11678, "pid": 76337, "tid": -914061504, "ts": 1716454216790643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216790685, "dur": 163, "args": { "External id": 11680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11680, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11680, "pid": 5, "tid": 7, "ts": 1716454216790685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790659, "dur": 5, "args": { "External id": 11680, "cbid": 211, "correlation": 11680 } }, { "ph": "s", "id": 11680, "pid": 76337, "tid": -914061504, "ts": 1716454216790659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216790850, "dur": 1, "args": { "External id": 11682, "device": 5, "context": 1, "stream": 7, "correlation": 11682, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 11682, "pid": 5, "tid": 7, "ts": 1716454216790850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216790670, "dur": 7, "args": { "External id": 11682, "cbid": 51, "correlation": 11682 } }, { "ph": "s", "id": 11682, "pid": 76337, "tid": -914061504, "ts": 1716454216790670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216790854, "dur": 261, "args": { "External id": 11683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11683, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11683, "pid": 5, "tid": 7, "ts": 1716454216790854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790677, "dur": 6, "args": { "External id": 11683, "cbid": 211, "correlation": 11683 } }, { "ph": "s", "id": 11683, "pid": 76337, "tid": -914061504, "ts": 1716454216790677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216791116, "dur": 6, "args": { "External id": 11685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11685, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11685, "pid": 5, "tid": 7, "ts": 1716454216791116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790687, "dur": 5, "args": { "External id": 11685, "cbid": 211, "correlation": 11685 } }, { "ph": "s", "id": 11685, "pid": 76337, "tid": -914061504, "ts": 1716454216790687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216791123, "dur": 6, "args": { "External id": 11691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11691, "pid": 5, "tid": 7, "ts": 1716454216791123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790717, "dur": 9, "args": { "External id": 11691, "cbid": 211, "correlation": 11691 } }, { "ph": "s", "id": 11691, "pid": 76337, "tid": -914061504, "ts": 1716454216790717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216791130, "dur": 3, "args": { "External id": 11699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11699, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 11699, "pid": 5, "tid": 7, "ts": 1716454216791130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790763, "dur": 10, "args": { "External id": 11699, "cbid": 211, "correlation": 11699 } }, { "ph": "s", "id": 11699, "pid": 76337, "tid": -914061504, "ts": 1716454216790763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216790839, "dur": 1, "args": { "External id": 11715, "cbid": 251, "correlation": 11715 } }, { "ph": "f", "id": 11715, "pid": 76337, "tid": -914061504, "ts": 1716454216790839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216790844, "dur": 0, "args": { "External id": 11717, "cbid": 251, "correlation": 11717 } }, { "ph": "f", "id": 11717, "pid": 76337, "tid": -914061504, "ts": 1716454216790844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216791135, "dur": 10, "args": { "External id": 11718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11718, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11718, "pid": 5, "tid": 7, "ts": 1716454216791135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790846, "dur": 13, "args": { "External id": 11718, "cbid": 211, "correlation": 11718 } }, { "ph": "s", "id": 11718, "pid": 76337, "tid": -914061504, "ts": 1716454216790846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216791147, "dur": 4, "args": { "External id": 11720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11720, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11720, "pid": 5, "tid": 7, "ts": 1716454216791147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790861, "dur": 6, "args": { "External id": 11720, "cbid": 211, "correlation": 11720 } }, { "ph": "s", "id": 11720, "pid": 76337, "tid": -914061504, "ts": 1716454216790861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216791151, "dur": 6, "args": { "External id": 11730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11730, "pid": 5, "tid": 7, "ts": 1716454216791151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790920, "dur": 12, "args": { "External id": 11730, "cbid": 211, "correlation": 11730 } }, { "ph": "s", "id": 11730, "pid": 76337, "tid": -914061504, "ts": 1716454216790920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216791158, "dur": 10, "args": { "External id": 11750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11750, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11750, "pid": 5, "tid": 7, "ts": 1716454216791158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216790995, "dur": 11, "args": { "External id": 11750, "cbid": 211, "correlation": 11750 } }, { "ph": "s", "id": 11750, "pid": 76337, "tid": -914061504, "ts": 1716454216790995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216791170, "dur": 4, "args": { "External id": 11762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11762, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11762, "pid": 5, "tid": 7, "ts": 1716454216791170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791027, "dur": 7, "args": { "External id": 11762, "cbid": 211, "correlation": 11762 } }, { "ph": "s", "id": 11762, "pid": 76337, "tid": -914061504, "ts": 1716454216791027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216791175, "dur": 7, "args": { "External id": 11765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11765, "pid": 5, "tid": 7, "ts": 1716454216791175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791047, "dur": 6, "args": { "External id": 11765, "cbid": 211, "correlation": 11765 } }, { "ph": "s", "id": 11765, "pid": 76337, "tid": -914061504, "ts": 1716454216791047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216791183, "dur": 5, "args": { "External id": 11774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11774, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11774, "pid": 5, "tid": 7, "ts": 1716454216791183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791090, "dur": 10, "args": { "External id": 11774, "cbid": 211, "correlation": 11774 } }, { "ph": "s", "id": 11774, "pid": 76337, "tid": -914061504, "ts": 1716454216791090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216791154, "dur": 0, "args": { "External id": 11784, "cbid": 317, "correlation": 11784 } }, { "ph": "f", "id": 11784, "pid": 76337, "tid": -914061504, "ts": 1716454216791154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216791155, "dur": 0, "args": { "External id": 11785, "cbid": 203, "correlation": 11785 } }, { "ph": "f", "id": 11785, "pid": 76337, "tid": -914061504, "ts": 1716454216791155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216791155, "dur": 0, "args": { "External id": 11786, "cbid": 205, "correlation": 11786 } }, { "ph": "f", "id": 11786, "pid": 76337, "tid": -914061504, "ts": 1716454216791155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216791189, "dur": 5, "args": { "External id": 11790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11790, "pid": 5, "tid": 7, "ts": 1716454216791189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791169, "dur": 12, "args": { "External id": 11790, "cbid": 211, "correlation": 11790 } }, { "ph": "s", "id": 11790, "pid": 76337, "tid": -914061504, "ts": 1716454216791169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216791196, "dur": 164, "args": { "External id": 11792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11792, "pid": 5, "tid": 7, "ts": 1716454216791196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791184, "dur": 5, "args": { "External id": 11792, "cbid": 211, "correlation": 11792 } }, { "ph": "s", "id": 11792, "pid": 76337, "tid": -914061504, "ts": 1716454216791184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216791362, "dur": 1, "args": { "External id": 11794, "device": 5, "context": 1, "stream": 7, "correlation": 11794, "bytes": 240, "memory bandwidth (GB/s)": 0.14423076923076922 } }, { "ph": "f", "id": 11794, "pid": 5, "tid": 7, "ts": 1716454216791362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216791195, "dur": 7, "args": { "External id": 11794, "cbid": 51, "correlation": 11794 } }, { "ph": "s", "id": 11794, "pid": 76337, "tid": -914061504, "ts": 1716454216791195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216791366, "dur": 261, "args": { "External id": 11795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11795, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 11795, "pid": 5, "tid": 7, "ts": 1716454216791366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791202, "dur": 6, "args": { "External id": 11795, "cbid": 211, "correlation": 11795 } }, { "ph": "s", "id": 11795, "pid": 76337, "tid": -914061504, "ts": 1716454216791202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216791628, "dur": 6, "args": { "External id": 11797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11797, "pid": 5, "tid": 7, "ts": 1716454216791628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791212, "dur": 5, "args": { "External id": 11797, "cbid": 211, "correlation": 11797 } }, { "ph": "s", "id": 11797, "pid": 76337, "tid": -914061504, "ts": 1716454216791212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216791635, "dur": 6, "args": { "External id": 11803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11803, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11803, "pid": 5, "tid": 7, "ts": 1716454216791635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791241, "dur": 8, "args": { "External id": 11803, "cbid": 211, "correlation": 11803 } }, { "ph": "s", "id": 11803, "pid": 76337, "tid": -914061504, "ts": 1716454216791241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216791643, "dur": 5, "args": { "External id": 11811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11811, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11811, "pid": 5, "tid": 7, "ts": 1716454216791643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216791274, "dur": 8, "args": { "External id": 11811, "cbid": 211, "correlation": 11811 } }, { "ph": "s", "id": 11811, "pid": 76337, "tid": -914061504, "ts": 1716454216791274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216791301, "dur": 1, "args": { "External id": 11819, "cbid": 317, "correlation": 11819 } }, { "ph": "f", "id": 11819, "pid": 76337, "tid": -914061504, "ts": 1716454216791301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216791302, "dur": 1231, "args": { "External id": 11820, "cbid": 20, "correlation": 11820 } }, { "ph": "f", "id": 11820, "pid": 76337, "tid": -914061504, "ts": 1716454216791302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216792562, "dur": 5, "args": { "External id": 11823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11823, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11823, "pid": 5, "tid": 7, "ts": 1716454216792562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216792545, "dur": 16, "args": { "External id": 11823, "cbid": 211, "correlation": 11823 } }, { "ph": "s", "id": 11823, "pid": 76337, "tid": -914061504, "ts": 1716454216792545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216792736, "dur": 10, "args": { "External id": 11843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11843, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 11843, "pid": 5, "tid": 7, "ts": 1716454216792736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216792721, "dur": 14, "args": { "External id": 11843, "cbid": 211, "correlation": 11843 } }, { "ph": "s", "id": 11843, "pid": 76337, "tid": -914061504, "ts": 1716454216792721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216792754, "dur": 4, "args": { "External id": 11855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11855, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 11855, "pid": 5, "tid": 7, "ts": 1716454216792754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216792745, "dur": 7, "args": { "External id": 11855, "cbid": 211, "correlation": 11855 } }, { "ph": "s", "id": 11855, "pid": 76337, "tid": -914061504, "ts": 1716454216792745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216792773, "dur": 6, "args": { "External id": 11858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11858, "pid": 5, "tid": 7, "ts": 1716454216792773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216792765, "dur": 6, "args": { "External id": 11858, "cbid": 211, "correlation": 11858 } }, { "ph": "s", "id": 11858, "pid": 76337, "tid": -914061504, "ts": 1716454216792765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216792840, "dur": 0, "args": { "External id": 11869, "cbid": 317, "correlation": 11869 } }, { "ph": "f", "id": 11869, "pid": 76337, "tid": -914061504, "ts": 1716454216792840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216792841, "dur": 0, "args": { "External id": 11870, "cbid": 203, "correlation": 11870 } }, { "ph": "f", "id": 11870, "pid": 76337, "tid": -914061504, "ts": 1716454216792841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216792842, "dur": 0, "args": { "External id": 11871, "cbid": 205, "correlation": 11871 } }, { "ph": "f", "id": 11871, "pid": 76337, "tid": -914061504, "ts": 1716454216792842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216794800, "dur": 5, "args": { "External id": 11875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11875, "pid": 5, "tid": 7, "ts": 1716454216794800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216794774, "dur": 26, "args": { "External id": 11875, "cbid": 211, "correlation": 11875 } }, { "ph": "s", "id": 11875, "pid": 76337, "tid": -914061504, "ts": 1716454216794774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216794935, "dur": 35, "args": { "External id": 11877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11877, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 11877, "pid": 5, "tid": 7, "ts": 1716454216794935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216794819, "dur": 118, "args": { "External id": 11877, "cbid": 211, "correlation": 11877 } }, { "ph": "s", "id": 11877, "pid": 76337, "tid": -914061504, "ts": 1716454216794819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216794971, "dur": 6, "args": { "External id": 11879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 11879, "pid": 5, "tid": 7, "ts": 1716454216794971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216794944, "dur": 8, "args": { "External id": 11879, "cbid": 211, "correlation": 11879 } }, { "ph": "s", "id": 11879, "pid": 76337, "tid": -914061504, "ts": 1716454216794944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216795477, "dur": 8, "args": { "External id": 11885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11885, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11885, "pid": 5, "tid": 7, "ts": 1716454216795477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216795465, "dur": 12, "args": { "External id": 11885, "cbid": 211, "correlation": 11885 } }, { "ph": "s", "id": 11885, "pid": 76337, "tid": -914061504, "ts": 1716454216795465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216795675, "dur": 12, "args": { "External id": 11896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11896, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 11896, "pid": 5, "tid": 7, "ts": 1716454216795675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216795657, "dur": 18, "args": { "External id": 11896, "cbid": 211, "correlation": 11896 } }, { "ph": "s", "id": 11896, "pid": 76337, "tid": -914061504, "ts": 1716454216795657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216795713, "dur": 10, "args": { "External id": 11918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11918, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 11918, "pid": 5, "tid": 7, "ts": 1716454216795713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216795702, "dur": 10, "args": { "External id": 11918, "cbid": 211, "correlation": 11918 } }, { "ph": "s", "id": 11918, "pid": 76337, "tid": -914061504, "ts": 1716454216795702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216795869, "dur": 58, "args": { "External id": 11929, "cbid": 251, "correlation": 11929 } }, { "ph": "f", "id": 11929, "pid": 76337, "tid": -914061504, "ts": 1716454216795869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216795963, "dur": 1, "args": { "External id": 11930, "device": 5, "context": 1, "stream": 7, "correlation": 11930, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 11930, "pid": 5, "tid": 7, "ts": 1716454216795963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216795934, "dur": 27, "args": { "External id": 11930, "cbid": 51, "correlation": 11930 } }, { "ph": "s", "id": 11930, "pid": 76337, "tid": -914061504, "ts": 1716454216795934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216795988, "dur": 35, "args": { "External id": 11931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11931, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 11931, "pid": 5, "tid": 7, "ts": 1716454216795988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216795964, "dur": 25, "args": { "External id": 11931, "cbid": 211, "correlation": 11931 } }, { "ph": "s", "id": 11931, "pid": 76337, "tid": -914061504, "ts": 1716454216795964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216796047, "dur": 2, "args": { "External id": 11939, "cbid": 317, "correlation": 11939 } }, { "ph": "f", "id": 11939, "pid": 76337, "tid": -914061504, "ts": 1716454216796047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216796050, "dur": 12343, "args": { "External id": 11940, "cbid": 20, "correlation": 11940 } }, { "ph": "f", "id": 11940, "pid": 76337, "tid": -914061504, "ts": 1716454216796050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216808426, "dur": 1, "args": { "External id": 11946, "cbid": 251, "correlation": 11946 } }, { "ph": "f", "id": 11946, "pid": 76337, "tid": -914061504, "ts": 1716454216808426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216808443, "dur": 1, "args": { "External id": 11947, "device": 5, "context": 1, "stream": 7, "correlation": 11947, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 11947, "pid": 5, "tid": 7, "ts": 1716454216808443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216808431, "dur": 9, "args": { "External id": 11947, "cbid": 51, "correlation": 11947 } }, { "ph": "s", "id": 11947, "pid": 76337, "tid": -914061504, "ts": 1716454216808431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216808456, "dur": 36, "args": { "External id": 11948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11948, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 11948, "pid": 5, "tid": 7, "ts": 1716454216808456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216808442, "dur": 13, "args": { "External id": 11948, "cbid": 211, "correlation": 11948 } }, { "ph": "s", "id": 11948, "pid": 76337, "tid": -914061504, "ts": 1716454216808442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216808529, "dur": 1, "args": { "External id": 11959, "cbid": 251, "correlation": 11959 } }, { "ph": "f", "id": 11959, "pid": 76337, "tid": -914061504, "ts": 1716454216808529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216808543, "dur": 1, "args": { "External id": 11960, "device": 5, "context": 1, "stream": 7, "correlation": 11960, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 11960, "pid": 5, "tid": 7, "ts": 1716454216808543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216808533, "dur": 7, "args": { "External id": 11960, "cbid": 51, "correlation": 11960 } }, { "ph": "s", "id": 11960, "pid": 76337, "tid": -914061504, "ts": 1716454216808533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216808553, "dur": 35, "args": { "External id": 11961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11961, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 11961, "pid": 5, "tid": 7, "ts": 1716454216808553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216808541, "dur": 11, "args": { "External id": 11961, "cbid": 211, "correlation": 11961 } }, { "ph": "s", "id": 11961, "pid": 76337, "tid": -914061504, "ts": 1716454216808541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216808651, "dur": 1, "args": { "External id": 11976, "cbid": 317, "correlation": 11976 } }, { "ph": "f", "id": 11976, "pid": 76337, "tid": -914061504, "ts": 1716454216808651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216808653, "dur": 2117, "args": { "External id": 11977, "cbid": 20, "correlation": 11977 } }, { "ph": "f", "id": 11977, "pid": 76337, "tid": -914061504, "ts": 1716454216808653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216810812, "dur": 35, "args": { "External id": 11990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 11990, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 11990, "pid": 5, "tid": 7, "ts": 1716454216810812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216810792, "dur": 20, "args": { "External id": 11990, "cbid": 211, "correlation": 11990 } }, { "ph": "s", "id": 11990, "pid": 76337, "tid": -914061504, "ts": 1716454216810792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216810921, "dur": 1, "args": { "External id": 12005, "cbid": 317, "correlation": 12005 } }, { "ph": "f", "id": 12005, "pid": 76337, "tid": -914061504, "ts": 1716454216810921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454216810922, "dur": 826, "args": { "External id": 12006, "cbid": 20, "correlation": 12006 } }, { "ph": "f", "id": 12006, "pid": 76337, "tid": -914061504, "ts": 1716454216810922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216811785, "dur": 64, "args": { "External id": 12012, "cbid": 251, "correlation": 12012 } }, { "ph": "f", "id": 12012, "pid": 76337, "tid": -914061504, "ts": 1716454216811785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216811864, "dur": 2, "args": { "External id": 12014, "device": 5, "context": 1, "stream": 7, "correlation": 12014, "bytes": 480, "memory bandwidth (GB/s)": 0.22727272727272727 } }, { "ph": "f", "id": 12014, "pid": 5, "tid": 7, "ts": 1716454216811864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216811855, "dur": 7, "args": { "External id": 12014, "cbid": 51, "correlation": 12014 } }, { "ph": "s", "id": 12014, "pid": 76337, "tid": -914061504, "ts": 1716454216811855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216811876, "dur": 36, "args": { "External id": 12015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12015, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 12015, "pid": 5, "tid": 7, "ts": 1716454216811876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216811863, "dur": 14, "args": { "External id": 12015, "cbid": 211, "correlation": 12015 } }, { "ph": "s", "id": 12015, "pid": 76337, "tid": -914061504, "ts": 1716454216811863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216812005, "dur": 5, "args": { "External id": 12023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12023, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12023, "pid": 5, "tid": 7, "ts": 1716454216812005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216811989, "dur": 17, "args": { "External id": 12023, "cbid": 211, "correlation": 12023 } }, { "ph": "s", "id": 12023, "pid": 76337, "tid": -914061504, "ts": 1716454216811989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216812056, "dur": 12, "args": { "External id": 12031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12031, "pid": 5, "tid": 7, "ts": 1716454216812056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812045, "dur": 11, "args": { "External id": 12031, "cbid": 211, "correlation": 12031 } }, { "ph": "s", "id": 12031, "pid": 76337, "tid": -914061504, "ts": 1716454216812045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216812117, "dur": 10, "args": { "External id": 12053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12053, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 12053, "pid": 5, "tid": 7, "ts": 1716454216812117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812105, "dur": 12, "args": { "External id": 12053, "cbid": 211, "correlation": 12053 } }, { "ph": "s", "id": 12053, "pid": 76337, "tid": -914061504, "ts": 1716454216812105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812198, "dur": 1, "args": { "External id": 12064, "cbid": 251, "correlation": 12064 } }, { "ph": "f", "id": 12064, "pid": 76337, "tid": -914061504, "ts": 1716454216812198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216812214, "dur": 1, "args": { "External id": 12065, "device": 5, "context": 1, "stream": 7, "correlation": 12065, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 12065, "pid": 5, "tid": 7, "ts": 1716454216812214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216812203, "dur": 8, "args": { "External id": 12065, "cbid": 51, "correlation": 12065 } }, { "ph": "s", "id": 12065, "pid": 76337, "tid": -914061504, "ts": 1716454216812203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216812225, "dur": 36, "args": { "External id": 12066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12066, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 12066, "pid": 5, "tid": 7, "ts": 1716454216812225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812213, "dur": 12, "args": { "External id": 12066, "cbid": 211, "correlation": 12066 } }, { "ph": "s", "id": 12066, "pid": 76337, "tid": -914061504, "ts": 1716454216812213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812296, "dur": 1, "args": { "External id": 12077, "cbid": 251, "correlation": 12077 } }, { "ph": "f", "id": 12077, "pid": 76337, "tid": -914061504, "ts": 1716454216812296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812300, "dur": 0, "args": { "External id": 12078, "cbid": 251, "correlation": 12078 } }, { "ph": "f", "id": 12078, "pid": 76337, "tid": -914061504, "ts": 1716454216812300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216812317, "dur": 12, "args": { "External id": 12079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12079, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12079, "pid": 5, "tid": 7, "ts": 1716454216812317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812303, "dur": 14, "args": { "External id": 12079, "cbid": 211, "correlation": 12079 } }, { "ph": "s", "id": 12079, "pid": 76337, "tid": -914061504, "ts": 1716454216812303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216812332, "dur": 5, "args": { "External id": 12081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12081, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12081, "pid": 5, "tid": 7, "ts": 1716454216812332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812320, "dur": 10, "args": { "External id": 12081, "cbid": 211, "correlation": 12081 } }, { "ph": "s", "id": 12081, "pid": 76337, "tid": -914061504, "ts": 1716454216812320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812383, "dur": 1, "args": { "External id": 12092, "cbid": 251, "correlation": 12092 } }, { "ph": "f", "id": 12092, "pid": 76337, "tid": -914061504, "ts": 1716454216812383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812387, "dur": 0, "args": { "External id": 12093, "cbid": 251, "correlation": 12093 } }, { "ph": "f", "id": 12093, "pid": 76337, "tid": -914061504, "ts": 1716454216812387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216812401, "dur": 9, "args": { "External id": 12094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12094, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12094, "pid": 5, "tid": 7, "ts": 1716454216812401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812388, "dur": 11, "args": { "External id": 12094, "cbid": 211, "correlation": 12094 } }, { "ph": "s", "id": 12094, "pid": 76337, "tid": -914061504, "ts": 1716454216812388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216812411, "dur": 3, "args": { "External id": 12096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12096, "pid": 5, "tid": 7, "ts": 1716454216812411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812401, "dur": 6, "args": { "External id": 12096, "cbid": 211, "correlation": 12096 } }, { "ph": "s", "id": 12096, "pid": 76337, "tid": -914061504, "ts": 1716454216812401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216812493, "dur": 32, "args": { "External id": 12121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12121, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 12121, "pid": 5, "tid": 7, "ts": 1716454216812493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812480, "dur": 13, "args": { "External id": 12121, "cbid": 211, "correlation": 12121 } }, { "ph": "s", "id": 12121, "pid": 76337, "tid": -914061504, "ts": 1716454216812480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812588, "dur": 1, "args": { "External id": 12139, "cbid": 251, "correlation": 12139 } }, { "ph": "f", "id": 12139, "pid": 76337, "tid": -914061504, "ts": 1716454216812588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216812605, "dur": 1, "args": { "External id": 12141, "device": 5, "context": 1, "stream": 7, "correlation": 12141, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 12141, "pid": 5, "tid": 7, "ts": 1716454216812605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216812593, "dur": 8, "args": { "External id": 12141, "cbid": 51, "correlation": 12141 } }, { "ph": "s", "id": 12141, "pid": 76337, "tid": -914061504, "ts": 1716454216812593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216812616, "dur": 35, "args": { "External id": 12142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12142, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 12142, "pid": 5, "tid": 7, "ts": 1716454216812616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812604, "dur": 12, "args": { "External id": 12142, "cbid": 211, "correlation": 12142 } }, { "ph": "s", "id": 12142, "pid": 76337, "tid": -914061504, "ts": 1716454216812604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216812689, "dur": 4, "args": { "External id": 12150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12150, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12150, "pid": 5, "tid": 7, "ts": 1716454216812689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812676, "dur": 12, "args": { "External id": 12150, "cbid": 211, "correlation": 12150 } }, { "ph": "s", "id": 12150, "pid": 76337, "tid": -914061504, "ts": 1716454216812676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216812727, "dur": 6, "args": { "External id": 12158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12158, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12158, "pid": 5, "tid": 7, "ts": 1716454216812727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812717, "dur": 9, "args": { "External id": 12158, "cbid": 211, "correlation": 12158 } }, { "ph": "s", "id": 12158, "pid": 76337, "tid": -914061504, "ts": 1716454216812717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216812795, "dur": 9, "args": { "External id": 12180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12180, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 12180, "pid": 5, "tid": 7, "ts": 1716454216812795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812782, "dur": 12, "args": { "External id": 12180, "cbid": 211, "correlation": 12180 } }, { "ph": "s", "id": 12180, "pid": 76337, "tid": -914061504, "ts": 1716454216812782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812905, "dur": 2, "args": { "External id": 12196, "cbid": 251, "correlation": 12196 } }, { "ph": "f", "id": 12196, "pid": 76337, "tid": -914061504, "ts": 1716454216812905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216812911, "dur": 0, "args": { "External id": 12198, "cbid": 251, "correlation": 12198 } }, { "ph": "f", "id": 12198, "pid": 76337, "tid": -914061504, "ts": 1716454216812911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216812931, "dur": 195, "args": { "External id": 12199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12199, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12199, "pid": 5, "tid": 7, "ts": 1716454216812931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216812914, "dur": 17, "args": { "External id": 12199, "cbid": 211, "correlation": 12199 } }, { "ph": "s", "id": 12199, "pid": 76337, "tid": -914061504, "ts": 1716454216812914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216813127, "dur": 21, "args": { "External id": 12207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12207, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12207, "pid": 5, "tid": 7, "ts": 1716454216813127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813028, "dur": 15, "args": { "External id": 12207, "cbid": 211, "correlation": 12207 } }, { "ph": "s", "id": 12207, "pid": 76337, "tid": -914061504, "ts": 1716454216813028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216813150, "dur": 21, "args": { "External id": 12215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12215, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12215, "pid": 5, "tid": 7, "ts": 1716454216813150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813071, "dur": 10, "args": { "External id": 12215, "cbid": 211, "correlation": 12215 } }, { "ph": "s", "id": 12215, "pid": 76337, "tid": -914061504, "ts": 1716454216813071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216813156, "dur": 64, "args": { "External id": 12231, "cbid": 251, "correlation": 12231 } }, { "ph": "f", "id": 12231, "pid": 76337, "tid": -914061504, "ts": 1716454216813156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216813235, "dur": 1, "args": { "External id": 12233, "device": 5, "context": 1, "stream": 7, "correlation": 12233, "bytes": 120, "memory bandwidth (GB/s)": 0.07653061224489796 } }, { "ph": "f", "id": 12233, "pid": 5, "tid": 7, "ts": 1716454216813235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216813225, "dur": 7, "args": { "External id": 12233, "cbid": 51, "correlation": 12233 } }, { "ph": "s", "id": 12233, "pid": 76337, "tid": -914061504, "ts": 1716454216813225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216813259, "dur": 110, "args": { "External id": 12234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12234, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 12234, "pid": 5, "tid": 7, "ts": 1716454216813259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813234, "dur": 25, "args": { "External id": 12234, "cbid": 211, "correlation": 12234 } }, { "ph": "s", "id": 12234, "pid": 76337, "tid": -914061504, "ts": 1716454216813234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216813370, "dur": 5, "args": { "External id": 12242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12242, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12242, "pid": 5, "tid": 7, "ts": 1716454216813370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813296, "dur": 11, "args": { "External id": 12242, "cbid": 211, "correlation": 12242 } }, { "ph": "s", "id": 12242, "pid": 76337, "tid": -914061504, "ts": 1716454216813296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216813388, "dur": 10, "args": { "External id": 12253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12253, "pid": 5, "tid": 7, "ts": 1716454216813388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813374, "dur": 14, "args": { "External id": 12253, "cbid": 211, "correlation": 12253 } }, { "ph": "s", "id": 12253, "pid": 76337, "tid": -914061504, "ts": 1716454216813374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216813484, "dur": 0, "args": { "External id": 12265, "cbid": 317, "correlation": 12265 } }, { "ph": "f", "id": 12265, "pid": 76337, "tid": -914061504, "ts": 1716454216813484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216813486, "dur": 0, "args": { "External id": 12266, "cbid": 203, "correlation": 12266 } }, { "ph": "f", "id": 12266, "pid": 76337, "tid": -914061504, "ts": 1716454216813486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216813487, "dur": 1, "args": { "External id": 12267, "cbid": 205, "correlation": 12267 } }, { "ph": "f", "id": 12267, "pid": 76337, "tid": -914061504, "ts": 1716454216813487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216813526, "dur": 6, "args": { "External id": 12271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12271, "pid": 5, "tid": 7, "ts": 1716454216813526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813511, "dur": 14, "args": { "External id": 12271, "cbid": 211, "correlation": 12271 } }, { "ph": "s", "id": 12271, "pid": 76337, "tid": -914061504, "ts": 1716454216813511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216813547, "dur": 37, "args": { "External id": 12273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12273, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 12273, "pid": 5, "tid": 7, "ts": 1716454216813547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813537, "dur": 8, "args": { "External id": 12273, "cbid": 211, "correlation": 12273 } }, { "ph": "s", "id": 12273, "pid": 76337, "tid": -914061504, "ts": 1716454216813537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216813585, "dur": 6, "args": { "External id": 12275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12275, "pid": 5, "tid": 7, "ts": 1716454216813585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813551, "dur": 8, "args": { "External id": 12275, "cbid": 211, "correlation": 12275 } }, { "ph": "s", "id": 12275, "pid": 76337, "tid": -914061504, "ts": 1716454216813551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216813592, "dur": 8, "args": { "External id": 12281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12281, "pid": 5, "tid": 7, "ts": 1716454216813592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813582, "dur": 9, "args": { "External id": 12281, "cbid": 211, "correlation": 12281 } }, { "ph": "s", "id": 12281, "pid": 76337, "tid": -914061504, "ts": 1716454216813582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216813626, "dur": 4, "args": { "External id": 12289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12289, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12289, "pid": 5, "tid": 7, "ts": 1716454216813626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813616, "dur": 9, "args": { "External id": 12289, "cbid": 211, "correlation": 12289 } }, { "ph": "s", "id": 12289, "pid": 76337, "tid": -914061504, "ts": 1716454216813616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216813727, "dur": 11, "args": { "External id": 12309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12309, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12309, "pid": 5, "tid": 7, "ts": 1716454216813727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813713, "dur": 13, "args": { "External id": 12309, "cbid": 211, "correlation": 12309 } }, { "ph": "s", "id": 12309, "pid": 76337, "tid": -914061504, "ts": 1716454216813713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216813746, "dur": 5, "args": { "External id": 12321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12321, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 12321, "pid": 5, "tid": 7, "ts": 1716454216813746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813738, "dur": 8, "args": { "External id": 12321, "cbid": 211, "correlation": 12321 } }, { "ph": "s", "id": 12321, "pid": 76337, "tid": -914061504, "ts": 1716454216813738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216813769, "dur": 9, "args": { "External id": 12324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12324, "pid": 5, "tid": 7, "ts": 1716454216813769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813760, "dur": 8, "args": { "External id": 12324, "cbid": 211, "correlation": 12324 } }, { "ph": "s", "id": 12324, "pid": 76337, "tid": -914061504, "ts": 1716454216813760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216813823, "dur": 5, "args": { "External id": 12333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12333, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12333, "pid": 5, "tid": 7, "ts": 1716454216813823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813812, "dur": 11, "args": { "External id": 12333, "cbid": 211, "correlation": 12333 } }, { "ph": "s", "id": 12333, "pid": 76337, "tid": -914061504, "ts": 1716454216813812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216813867, "dur": 0, "args": { "External id": 12343, "cbid": 317, "correlation": 12343 } }, { "ph": "f", "id": 12343, "pid": 76337, "tid": -914061504, "ts": 1716454216813867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216813868, "dur": 0, "args": { "External id": 12344, "cbid": 203, "correlation": 12344 } }, { "ph": "f", "id": 12344, "pid": 76337, "tid": -914061504, "ts": 1716454216813868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216813869, "dur": 0, "args": { "External id": 12345, "cbid": 205, "correlation": 12345 } }, { "ph": "f", "id": 12345, "pid": 76337, "tid": -914061504, "ts": 1716454216813869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216813897, "dur": 5, "args": { "External id": 12349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12349, "pid": 5, "tid": 7, "ts": 1716454216813897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813885, "dur": 11, "args": { "External id": 12349, "cbid": 211, "correlation": 12349 } }, { "ph": "s", "id": 12349, "pid": 76337, "tid": -914061504, "ts": 1716454216813885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216813907, "dur": 163, "args": { "External id": 12351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12351, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12351, "pid": 5, "tid": 7, "ts": 1716454216813907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813899, "dur": 6, "args": { "External id": 12351, "cbid": 211, "correlation": 12351 } }, { "ph": "s", "id": 12351, "pid": 76337, "tid": -914061504, "ts": 1716454216813899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216814072, "dur": 1, "args": { "External id": 12353, "device": 5, "context": 1, "stream": 7, "correlation": 12353, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 12353, "pid": 5, "tid": 7, "ts": 1716454216814072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216813912, "dur": 8, "args": { "External id": 12353, "cbid": 51, "correlation": 12353 } }, { "ph": "s", "id": 12353, "pid": 76337, "tid": -914061504, "ts": 1716454216813912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216814076, "dur": 272, "args": { "External id": 12354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12354, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12354, "pid": 5, "tid": 7, "ts": 1716454216814076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813922, "dur": 9, "args": { "External id": 12354, "cbid": 211, "correlation": 12354 } }, { "ph": "s", "id": 12354, "pid": 76337, "tid": -914061504, "ts": 1716454216813922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216814349, "dur": 6, "args": { "External id": 12356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12356, "pid": 5, "tid": 7, "ts": 1716454216814349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813935, "dur": 5, "args": { "External id": 12356, "cbid": 211, "correlation": 12356 } }, { "ph": "s", "id": 12356, "pid": 76337, "tid": -914061504, "ts": 1716454216813935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216814357, "dur": 7, "args": { "External id": 12362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12362, "pid": 5, "tid": 7, "ts": 1716454216814357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216813965, "dur": 18, "args": { "External id": 12362, "cbid": 211, "correlation": 12362 } }, { "ph": "s", "id": 12362, "pid": 76337, "tid": -914061504, "ts": 1716454216813965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216814365, "dur": 3, "args": { "External id": 12370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12370, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 12370, "pid": 5, "tid": 7, "ts": 1716454216814365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814019, "dur": 10, "args": { "External id": 12370, "cbid": 211, "correlation": 12370 } }, { "ph": "s", "id": 12370, "pid": 76337, "tid": -914061504, "ts": 1716454216814019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216814088, "dur": 1, "args": { "External id": 12386, "cbid": 251, "correlation": 12386 } }, { "ph": "f", "id": 12386, "pid": 76337, "tid": -914061504, "ts": 1716454216814088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216814094, "dur": 0, "args": { "External id": 12388, "cbid": 251, "correlation": 12388 } }, { "ph": "f", "id": 12388, "pid": 76337, "tid": -914061504, "ts": 1716454216814094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216814369, "dur": 13, "args": { "External id": 12389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12389, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12389, "pid": 5, "tid": 7, "ts": 1716454216814369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814096, "dur": 11, "args": { "External id": 12389, "cbid": 211, "correlation": 12389 } }, { "ph": "s", "id": 12389, "pid": 76337, "tid": -914061504, "ts": 1716454216814096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216814384, "dur": 5, "args": { "External id": 12391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12391, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12391, "pid": 5, "tid": 7, "ts": 1716454216814384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814110, "dur": 6, "args": { "External id": 12391, "cbid": 211, "correlation": 12391 } }, { "ph": "s", "id": 12391, "pid": 76337, "tid": -914061504, "ts": 1716454216814110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216814390, "dur": 6, "args": { "External id": 12401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12401, "pid": 5, "tid": 7, "ts": 1716454216814390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814178, "dur": 13, "args": { "External id": 12401, "cbid": 211, "correlation": 12401 } }, { "ph": "s", "id": 12401, "pid": 76337, "tid": -914061504, "ts": 1716454216814178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216814397, "dur": 10, "args": { "External id": 12421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12421, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12421, "pid": 5, "tid": 7, "ts": 1716454216814397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814246, "dur": 11, "args": { "External id": 12421, "cbid": 211, "correlation": 12421 } }, { "ph": "s", "id": 12421, "pid": 76337, "tid": -914061504, "ts": 1716454216814246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216814408, "dur": 4, "args": { "External id": 12433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12433, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 12433, "pid": 5, "tid": 7, "ts": 1716454216814408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814266, "dur": 6, "args": { "External id": 12433, "cbid": 211, "correlation": 12433 } }, { "ph": "s", "id": 12433, "pid": 76337, "tid": -914061504, "ts": 1716454216814266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216814414, "dur": 7, "args": { "External id": 12436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12436, "pid": 5, "tid": 7, "ts": 1716454216814414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814284, "dur": 6, "args": { "External id": 12436, "cbid": 211, "correlation": 12436 } }, { "ph": "s", "id": 12436, "pid": 76337, "tid": -914061504, "ts": 1716454216814284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216814422, "dur": 5, "args": { "External id": 12445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12445, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12445, "pid": 5, "tid": 7, "ts": 1716454216814422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814325, "dur": 10, "args": { "External id": 12445, "cbid": 211, "correlation": 12445 } }, { "ph": "s", "id": 12445, "pid": 76337, "tid": -914061504, "ts": 1716454216814325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216814389, "dur": 0, "args": { "External id": 12455, "cbid": 317, "correlation": 12455 } }, { "ph": "f", "id": 12455, "pid": 76337, "tid": -914061504, "ts": 1716454216814389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216814390, "dur": 0, "args": { "External id": 12456, "cbid": 203, "correlation": 12456 } }, { "ph": "f", "id": 12456, "pid": 76337, "tid": -914061504, "ts": 1716454216814390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216814391, "dur": 0, "args": { "External id": 12457, "cbid": 205, "correlation": 12457 } }, { "ph": "f", "id": 12457, "pid": 76337, "tid": -914061504, "ts": 1716454216814391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216814428, "dur": 5, "args": { "External id": 12461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12461, "pid": 5, "tid": 7, "ts": 1716454216814428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814404, "dur": 12, "args": { "External id": 12461, "cbid": 211, "correlation": 12461 } }, { "ph": "s", "id": 12461, "pid": 76337, "tid": -914061504, "ts": 1716454216814404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216814435, "dur": 164, "args": { "External id": 12463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12463, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12463, "pid": 5, "tid": 7, "ts": 1716454216814435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814419, "dur": 6, "args": { "External id": 12463, "cbid": 211, "correlation": 12463 } }, { "ph": "s", "id": 12463, "pid": 76337, "tid": -914061504, "ts": 1716454216814419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216814601, "dur": 1, "args": { "External id": 12465, "device": 5, "context": 1, "stream": 7, "correlation": 12465, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 12465, "pid": 5, "tid": 7, "ts": 1716454216814601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216814430, "dur": 7, "args": { "External id": 12465, "cbid": 51, "correlation": 12465 } }, { "ph": "s", "id": 12465, "pid": 76337, "tid": -914061504, "ts": 1716454216814430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216814604, "dur": 262, "args": { "External id": 12466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12466, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12466, "pid": 5, "tid": 7, "ts": 1716454216814604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814438, "dur": 6, "args": { "External id": 12466, "cbid": 211, "correlation": 12466 } }, { "ph": "s", "id": 12466, "pid": 76337, "tid": -914061504, "ts": 1716454216814438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216814867, "dur": 6, "args": { "External id": 12468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12468, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12468, "pid": 5, "tid": 7, "ts": 1716454216814867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814448, "dur": 5, "args": { "External id": 12468, "cbid": 211, "correlation": 12468 } }, { "ph": "s", "id": 12468, "pid": 76337, "tid": -914061504, "ts": 1716454216814448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216814875, "dur": 6, "args": { "External id": 12474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12474, "pid": 5, "tid": 7, "ts": 1716454216814875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814476, "dur": 9, "args": { "External id": 12474, "cbid": 211, "correlation": 12474 } }, { "ph": "s", "id": 12474, "pid": 76337, "tid": -914061504, "ts": 1716454216814476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216814882, "dur": 5, "args": { "External id": 12482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12482, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12482, "pid": 5, "tid": 7, "ts": 1716454216814882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814509, "dur": 8, "args": { "External id": 12482, "cbid": 211, "correlation": 12482 } }, { "ph": "s", "id": 12482, "pid": 76337, "tid": -914061504, "ts": 1716454216814509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216814888, "dur": 5, "args": { "External id": 12490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12490, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12490, "pid": 5, "tid": 7, "ts": 1716454216814888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814539, "dur": 8, "args": { "External id": 12490, "cbid": 211, "correlation": 12490 } }, { "ph": "s", "id": 12490, "pid": 76337, "tid": -914061504, "ts": 1716454216814539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216814894, "dur": 11, "args": { "External id": 12499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12499, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12499, "pid": 5, "tid": 7, "ts": 1716454216814894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814721, "dur": 22, "args": { "External id": 12499, "cbid": 211, "correlation": 12499 } }, { "ph": "s", "id": 12499, "pid": 76337, "tid": -914061504, "ts": 1716454216814721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216814907, "dur": 12, "args": { "External id": 12519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12519, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12519, "pid": 5, "tid": 7, "ts": 1716454216814907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814813, "dur": 13, "args": { "External id": 12519, "cbid": 211, "correlation": 12519 } }, { "ph": "s", "id": 12519, "pid": 76337, "tid": -914061504, "ts": 1716454216814813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216814921, "dur": 4, "args": { "External id": 12531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12531, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12531, "pid": 5, "tid": 7, "ts": 1716454216814921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814836, "dur": 6, "args": { "External id": 12531, "cbid": 211, "correlation": 12531 } }, { "ph": "s", "id": 12531, "pid": 76337, "tid": -914061504, "ts": 1716454216814836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216814926, "dur": 10, "args": { "External id": 12534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12534, "pid": 5, "tid": 7, "ts": 1716454216814926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814855, "dur": 6, "args": { "External id": 12534, "cbid": 211, "correlation": 12534 } }, { "ph": "s", "id": 12534, "pid": 76337, "tid": -914061504, "ts": 1716454216814855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216814938, "dur": 6, "args": { "External id": 12543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12543, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12543, "pid": 5, "tid": 7, "ts": 1716454216814938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216814895, "dur": 9, "args": { "External id": 12543, "cbid": 211, "correlation": 12543 } }, { "ph": "s", "id": 12543, "pid": 76337, "tid": -914061504, "ts": 1716454216814895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216814961, "dur": 0, "args": { "External id": 12553, "cbid": 317, "correlation": 12553 } }, { "ph": "f", "id": 12553, "pid": 76337, "tid": -914061504, "ts": 1716454216814961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216814962, "dur": 0, "args": { "External id": 12554, "cbid": 203, "correlation": 12554 } }, { "ph": "f", "id": 12554, "pid": 76337, "tid": -914061504, "ts": 1716454216814962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216814963, "dur": 0, "args": { "External id": 12555, "cbid": 205, "correlation": 12555 } }, { "ph": "f", "id": 12555, "pid": 76337, "tid": -914061504, "ts": 1716454216814963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216818191, "dur": 7, "args": { "External id": 12559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12559, "pid": 5, "tid": 7, "ts": 1716454216818191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216818161, "dur": 31, "args": { "External id": 12559, "cbid": 211, "correlation": 12559 } }, { "ph": "s", "id": 12559, "pid": 76337, "tid": -914061504, "ts": 1716454216818161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216818203, "dur": 323, "args": { "External id": 12561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12561, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12561, "pid": 5, "tid": 7, "ts": 1716454216818203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216818195, "dur": 6, "args": { "External id": 12561, "cbid": 211, "correlation": 12561 } }, { "ph": "s", "id": 12561, "pid": 76337, "tid": -914061504, "ts": 1716454216818195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216818528, "dur": 1, "args": { "External id": 12563, "device": 5, "context": 1, "stream": 7, "correlation": 12563, "bytes": 240, "memory bandwidth (GB/s)": 0.125 } }, { "ph": "f", "id": 12563, "pid": 5, "tid": 7, "ts": 1716454216818528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216818209, "dur": 14, "args": { "External id": 12563, "cbid": 51, "correlation": 12563 } }, { "ph": "s", "id": 12563, "pid": 76337, "tid": -914061504, "ts": 1716454216818209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216818532, "dur": 500, "args": { "External id": 12564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12564, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12564, "pid": 5, "tid": 7, "ts": 1716454216818532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216818225, "dur": 9, "args": { "External id": 12564, "cbid": 211, "correlation": 12564 } }, { "ph": "s", "id": 12564, "pid": 76337, "tid": -914061504, "ts": 1716454216818225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216819033, "dur": 5, "args": { "External id": 12566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12566, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12566, "pid": 5, "tid": 7, "ts": 1716454216819033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216818239, "dur": 6, "args": { "External id": 12566, "cbid": 211, "correlation": 12566 } }, { "ph": "s", "id": 12566, "pid": 76337, "tid": -914061504, "ts": 1716454216818239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216819039, "dur": 6, "args": { "External id": 12572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12572, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12572, "pid": 5, "tid": 7, "ts": 1716454216819039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819002, "dur": 14, "args": { "External id": 12572, "cbid": 211, "correlation": 12572 } }, { "ph": "s", "id": 12572, "pid": 76337, "tid": -914061504, "ts": 1716454216819002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216819099, "dur": 3, "args": { "External id": 12580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12580, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 12580, "pid": 5, "tid": 7, "ts": 1716454216819099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819088, "dur": 11, "args": { "External id": 12580, "cbid": 211, "correlation": 12580 } }, { "ph": "s", "id": 12580, "pid": 76337, "tid": -914061504, "ts": 1716454216819088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216819211, "dur": 4, "args": { "External id": 12596, "cbid": 251, "correlation": 12596 } }, { "ph": "f", "id": 12596, "pid": 76337, "tid": -914061504, "ts": 1716454216819211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216819220, "dur": 0, "args": { "External id": 12598, "cbid": 251, "correlation": 12598 } }, { "ph": "f", "id": 12598, "pid": 76337, "tid": -914061504, "ts": 1716454216819220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216819238, "dur": 10, "args": { "External id": 12599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12599, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12599, "pid": 5, "tid": 7, "ts": 1716454216819238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819223, "dur": 16, "args": { "External id": 12599, "cbid": 211, "correlation": 12599 } }, { "ph": "s", "id": 12599, "pid": 76337, "tid": -914061504, "ts": 1716454216819223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216819253, "dur": 4, "args": { "External id": 12601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12601, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12601, "pid": 5, "tid": 7, "ts": 1716454216819253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819242, "dur": 9, "args": { "External id": 12601, "cbid": 211, "correlation": 12601 } }, { "ph": "s", "id": 12601, "pid": 76337, "tid": -914061504, "ts": 1716454216819242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216819340, "dur": 6, "args": { "External id": 12611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12611, "pid": 5, "tid": 7, "ts": 1716454216819340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819327, "dur": 12, "args": { "External id": 12611, "cbid": 211, "correlation": 12611 } }, { "ph": "s", "id": 12611, "pid": 76337, "tid": -914061504, "ts": 1716454216819327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216819432, "dur": 10, "args": { "External id": 12631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12631, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12631, "pid": 5, "tid": 7, "ts": 1716454216819432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819419, "dur": 12, "args": { "External id": 12631, "cbid": 211, "correlation": 12631 } }, { "ph": "s", "id": 12631, "pid": 76337, "tid": -914061504, "ts": 1716454216819419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216819451, "dur": 4, "args": { "External id": 12643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12643, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 12643, "pid": 5, "tid": 7, "ts": 1716454216819451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819443, "dur": 8, "args": { "External id": 12643, "cbid": 211, "correlation": 12643 } }, { "ph": "s", "id": 12643, "pid": 76337, "tid": -914061504, "ts": 1716454216819443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216819478, "dur": 7, "args": { "External id": 12646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12646, "pid": 5, "tid": 7, "ts": 1716454216819478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819470, "dur": 7, "args": { "External id": 12646, "cbid": 211, "correlation": 12646 } }, { "ph": "s", "id": 12646, "pid": 76337, "tid": -914061504, "ts": 1716454216819470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216819524, "dur": 5, "args": { "External id": 12655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12655, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12655, "pid": 5, "tid": 7, "ts": 1716454216819524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819513, "dur": 10, "args": { "External id": 12655, "cbid": 211, "correlation": 12655 } }, { "ph": "s", "id": 12655, "pid": 76337, "tid": -914061504, "ts": 1716454216819513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216819600, "dur": 0, "args": { "External id": 12665, "cbid": 317, "correlation": 12665 } }, { "ph": "f", "id": 12665, "pid": 76337, "tid": -914061504, "ts": 1716454216819600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216819601, "dur": 0, "args": { "External id": 12666, "cbid": 203, "correlation": 12666 } }, { "ph": "f", "id": 12666, "pid": 76337, "tid": -914061504, "ts": 1716454216819601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216819603, "dur": 0, "args": { "External id": 12667, "cbid": 205, "correlation": 12667 } }, { "ph": "f", "id": 12667, "pid": 76337, "tid": -914061504, "ts": 1716454216819603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216819632, "dur": 5, "args": { "External id": 12671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12671, "pid": 5, "tid": 7, "ts": 1716454216819632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819620, "dur": 12, "args": { "External id": 12671, "cbid": 211, "correlation": 12671 } }, { "ph": "s", "id": 12671, "pid": 76337, "tid": -914061504, "ts": 1716454216819620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216819642, "dur": 164, "args": { "External id": 12673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12673, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12673, "pid": 5, "tid": 7, "ts": 1716454216819642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819634, "dur": 6, "args": { "External id": 12673, "cbid": 211, "correlation": 12673 } }, { "ph": "s", "id": 12673, "pid": 76337, "tid": -914061504, "ts": 1716454216819634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216819809, "dur": 1, "args": { "External id": 12675, "device": 5, "context": 1, "stream": 7, "correlation": 12675, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 12675, "pid": 5, "tid": 7, "ts": 1716454216819809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216819646, "dur": 8, "args": { "External id": 12675, "cbid": 51, "correlation": 12675 } }, { "ph": "s", "id": 12675, "pid": 76337, "tid": -914061504, "ts": 1716454216819646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216819812, "dur": 261, "args": { "External id": 12676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12676, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12676, "pid": 5, "tid": 7, "ts": 1716454216819812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819656, "dur": 6, "args": { "External id": 12676, "cbid": 211, "correlation": 12676 } }, { "ph": "s", "id": 12676, "pid": 76337, "tid": -914061504, "ts": 1716454216819656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216820075, "dur": 6, "args": { "External id": 12678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12678, "pid": 5, "tid": 7, "ts": 1716454216820075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819666, "dur": 5, "args": { "External id": 12678, "cbid": 211, "correlation": 12678 } }, { "ph": "s", "id": 12678, "pid": 76337, "tid": -914061504, "ts": 1716454216819666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216820082, "dur": 6, "args": { "External id": 12684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12684, "pid": 5, "tid": 7, "ts": 1716454216820082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216819695, "dur": 9, "args": { "External id": 12684, "cbid": 211, "correlation": 12684 } }, { "ph": "s", "id": 12684, "pid": 76337, "tid": -914061504, "ts": 1716454216819695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216819756, "dur": 0, "args": { "External id": 12694, "cbid": 317, "correlation": 12694 } }, { "ph": "f", "id": 12694, "pid": 76337, "tid": -914061504, "ts": 1716454216819756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216819757, "dur": 0, "args": { "External id": 12695, "cbid": 203, "correlation": 12695 } }, { "ph": "f", "id": 12695, "pid": 76337, "tid": -914061504, "ts": 1716454216819757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216819758, "dur": 0, "args": { "External id": 12696, "cbid": 205, "correlation": 12696 } }, { "ph": "f", "id": 12696, "pid": 76337, "tid": -914061504, "ts": 1716454216819758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216821688, "dur": 8, "args": { "External id": 12700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12700, "pid": 5, "tid": 7, "ts": 1716454216821688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216821666, "dur": 21, "args": { "External id": 12700, "cbid": 211, "correlation": 12700 } }, { "ph": "s", "id": 12700, "pid": 76337, "tid": -914061504, "ts": 1716454216821666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216821712, "dur": 3, "args": { "External id": 12702, "device": 5, "context": 1, "stream": 7, "correlation": 12702, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 12702, "pid": 5, "tid": 7, "ts": 1716454216821712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216821694, "dur": 16, "args": { "External id": 12702, "cbid": 51, "correlation": 12702 } }, { "ph": "s", "id": 12702, "pid": 76337, "tid": -914061504, "ts": 1716454216821694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216821722, "dur": 97, "args": { "External id": 12703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12703, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 12703, "pid": 5, "tid": 7, "ts": 1716454216821722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216821712, "dur": 10, "args": { "External id": 12703, "cbid": 211, "correlation": 12703 } }, { "ph": "s", "id": 12703, "pid": 76337, "tid": -914061504, "ts": 1716454216821712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216821821, "dur": 5, "args": { "External id": 12705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12705, "pid": 5, "tid": 7, "ts": 1716454216821821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216821727, "dur": 6, "args": { "External id": 12705, "cbid": 211, "correlation": 12705 } }, { "ph": "s", "id": 12705, "pid": 76337, "tid": -914061504, "ts": 1716454216821727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216822262, "dur": 6, "args": { "External id": 12711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12711, "pid": 5, "tid": 7, "ts": 1716454216822262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822249, "dur": 12, "args": { "External id": 12711, "cbid": 211, "correlation": 12711 } }, { "ph": "s", "id": 12711, "pid": 76337, "tid": -914061504, "ts": 1716454216822249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216822303, "dur": 5, "args": { "External id": 12719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12719, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12719, "pid": 5, "tid": 7, "ts": 1716454216822303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822292, "dur": 9, "args": { "External id": 12719, "cbid": 211, "correlation": 12719 } }, { "ph": "s", "id": 12719, "pid": 76337, "tid": -914061504, "ts": 1716454216822292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216822355, "dur": 4, "args": { "External id": 12727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12727, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12727, "pid": 5, "tid": 7, "ts": 1716454216822355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822344, "dur": 10, "args": { "External id": 12727, "cbid": 211, "correlation": 12727 } }, { "ph": "s", "id": 12727, "pid": 76337, "tid": -914061504, "ts": 1716454216822344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216822432, "dur": 11, "args": { "External id": 12736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12736, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12736, "pid": 5, "tid": 7, "ts": 1716454216822432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822419, "dur": 13, "args": { "External id": 12736, "cbid": 211, "correlation": 12736 } }, { "ph": "s", "id": 12736, "pid": 76337, "tid": -914061504, "ts": 1716454216822419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216822528, "dur": 12, "args": { "External id": 12756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12756, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12756, "pid": 5, "tid": 7, "ts": 1716454216822528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822515, "dur": 12, "args": { "External id": 12756, "cbid": 211, "correlation": 12756 } }, { "ph": "s", "id": 12756, "pid": 76337, "tid": -914061504, "ts": 1716454216822515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216822547, "dur": 4, "args": { "External id": 12768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12768, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12768, "pid": 5, "tid": 7, "ts": 1716454216822547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822538, "dur": 8, "args": { "External id": 12768, "cbid": 211, "correlation": 12768 } }, { "ph": "s", "id": 12768, "pid": 76337, "tid": -914061504, "ts": 1716454216822538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216822568, "dur": 10, "args": { "External id": 12771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12771, "pid": 5, "tid": 7, "ts": 1716454216822568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822560, "dur": 6, "args": { "External id": 12771, "cbid": 211, "correlation": 12771 } }, { "ph": "s", "id": 12771, "pid": 76337, "tid": -914061504, "ts": 1716454216822560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216822613, "dur": 6, "args": { "External id": 12780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12780, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12780, "pid": 5, "tid": 7, "ts": 1716454216822613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822601, "dur": 11, "args": { "External id": 12780, "cbid": 211, "correlation": 12780 } }, { "ph": "s", "id": 12780, "pid": 76337, "tid": -914061504, "ts": 1716454216822601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216822679, "dur": 0, "args": { "External id": 12790, "cbid": 317, "correlation": 12790 } }, { "ph": "f", "id": 12790, "pid": 76337, "tid": -914061504, "ts": 1716454216822679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216822679, "dur": 0, "args": { "External id": 12791, "cbid": 203, "correlation": 12791 } }, { "ph": "f", "id": 12791, "pid": 76337, "tid": -914061504, "ts": 1716454216822679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216822680, "dur": 1, "args": { "External id": 12792, "cbid": 205, "correlation": 12792 } }, { "ph": "f", "id": 12792, "pid": 76337, "tid": -914061504, "ts": 1716454216822680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216822709, "dur": 7, "args": { "External id": 12796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12796, "pid": 5, "tid": 7, "ts": 1716454216822709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822696, "dur": 13, "args": { "External id": 12796, "cbid": 211, "correlation": 12796 } }, { "ph": "s", "id": 12796, "pid": 76337, "tid": -914061504, "ts": 1716454216822696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216822720, "dur": 323, "args": { "External id": 12798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12798, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12798, "pid": 5, "tid": 7, "ts": 1716454216822720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822711, "dur": 6, "args": { "External id": 12798, "cbid": 211, "correlation": 12798 } }, { "ph": "s", "id": 12798, "pid": 76337, "tid": -914061504, "ts": 1716454216822711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216823046, "dur": 1, "args": { "External id": 12800, "device": 5, "context": 1, "stream": 7, "correlation": 12800, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 12800, "pid": 5, "tid": 7, "ts": 1716454216823046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216822724, "dur": 8, "args": { "External id": 12800, "cbid": 51, "correlation": 12800 } }, { "ph": "s", "id": 12800, "pid": 76337, "tid": -914061504, "ts": 1716454216822724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216823050, "dur": 503, "args": { "External id": 12801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12801, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12801, "pid": 5, "tid": 7, "ts": 1716454216823050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822733, "dur": 7, "args": { "External id": 12801, "cbid": 211, "correlation": 12801 } }, { "ph": "s", "id": 12801, "pid": 76337, "tid": -914061504, "ts": 1716454216822733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216823554, "dur": 6, "args": { "External id": 12803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12803, "pid": 5, "tid": 7, "ts": 1716454216823554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822743, "dur": 6, "args": { "External id": 12803, "cbid": 211, "correlation": 12803 } }, { "ph": "s", "id": 12803, "pid": 76337, "tid": -914061504, "ts": 1716454216822743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216823561, "dur": 6, "args": { "External id": 12809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12809, "pid": 5, "tid": 7, "ts": 1716454216823561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822773, "dur": 10, "args": { "External id": 12809, "cbid": 211, "correlation": 12809 } }, { "ph": "s", "id": 12809, "pid": 76337, "tid": -914061504, "ts": 1716454216822773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216823569, "dur": 3, "args": { "External id": 12817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12817, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 12817, "pid": 5, "tid": 7, "ts": 1716454216823569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822819, "dur": 10, "args": { "External id": 12817, "cbid": 211, "correlation": 12817 } }, { "ph": "s", "id": 12817, "pid": 76337, "tid": -914061504, "ts": 1716454216822819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216822932, "dur": 2, "args": { "External id": 12833, "cbid": 251, "correlation": 12833 } }, { "ph": "f", "id": 12833, "pid": 76337, "tid": -914061504, "ts": 1716454216822932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216822939, "dur": 0, "args": { "External id": 12835, "cbid": 251, "correlation": 12835 } }, { "ph": "f", "id": 12835, "pid": 76337, "tid": -914061504, "ts": 1716454216822939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216823573, "dur": 13, "args": { "External id": 12836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12836, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12836, "pid": 5, "tid": 7, "ts": 1716454216823573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822941, "dur": 15, "args": { "External id": 12836, "cbid": 211, "correlation": 12836 } }, { "ph": "s", "id": 12836, "pid": 76337, "tid": -914061504, "ts": 1716454216822941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216823587, "dur": 5, "args": { "External id": 12838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12838, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12838, "pid": 5, "tid": 7, "ts": 1716454216823587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216822959, "dur": 8, "args": { "External id": 12838, "cbid": 211, "correlation": 12838 } }, { "ph": "s", "id": 12838, "pid": 76337, "tid": -914061504, "ts": 1716454216822959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216823593, "dur": 6, "args": { "External id": 12848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12848, "pid": 5, "tid": 7, "ts": 1716454216823593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823036, "dur": 13, "args": { "External id": 12848, "cbid": 211, "correlation": 12848 } }, { "ph": "s", "id": 12848, "pid": 76337, "tid": -914061504, "ts": 1716454216823036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216823600, "dur": 10, "args": { "External id": 12868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12868, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12868, "pid": 5, "tid": 7, "ts": 1716454216823600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823105, "dur": 11, "args": { "External id": 12868, "cbid": 211, "correlation": 12868 } }, { "ph": "s", "id": 12868, "pid": 76337, "tid": -914061504, "ts": 1716454216823105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216823611, "dur": 4, "args": { "External id": 12880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12880, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 12880, "pid": 5, "tid": 7, "ts": 1716454216823611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823126, "dur": 6, "args": { "External id": 12880, "cbid": 211, "correlation": 12880 } }, { "ph": "s", "id": 12880, "pid": 76337, "tid": -914061504, "ts": 1716454216823126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216823616, "dur": 7, "args": { "External id": 12883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12883, "pid": 5, "tid": 7, "ts": 1716454216823616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823144, "dur": 6, "args": { "External id": 12883, "cbid": 211, "correlation": 12883 } }, { "ph": "s", "id": 12883, "pid": 76337, "tid": -914061504, "ts": 1716454216823144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216823624, "dur": 5, "args": { "External id": 12892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12892, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12892, "pid": 5, "tid": 7, "ts": 1716454216823624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823185, "dur": 9, "args": { "External id": 12892, "cbid": 211, "correlation": 12892 } }, { "ph": "s", "id": 12892, "pid": 76337, "tid": -914061504, "ts": 1716454216823185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216823266, "dur": 0, "args": { "External id": 12902, "cbid": 317, "correlation": 12902 } }, { "ph": "f", "id": 12902, "pid": 76337, "tid": -914061504, "ts": 1716454216823266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216823267, "dur": 0, "args": { "External id": 12903, "cbid": 203, "correlation": 12903 } }, { "ph": "f", "id": 12903, "pid": 76337, "tid": -914061504, "ts": 1716454216823267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216823268, "dur": 0, "args": { "External id": 12904, "cbid": 205, "correlation": 12904 } }, { "ph": "f", "id": 12904, "pid": 76337, "tid": -914061504, "ts": 1716454216823268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216823630, "dur": 5, "args": { "External id": 12908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12908, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12908, "pid": 5, "tid": 7, "ts": 1716454216823630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823283, "dur": 13, "args": { "External id": 12908, "cbid": 211, "correlation": 12908 } }, { "ph": "s", "id": 12908, "pid": 76337, "tid": -914061504, "ts": 1716454216823283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216823637, "dur": 163, "args": { "External id": 12910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12910, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12910, "pid": 5, "tid": 7, "ts": 1716454216823637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823299, "dur": 6, "args": { "External id": 12910, "cbid": 211, "correlation": 12910 } }, { "ph": "s", "id": 12910, "pid": 76337, "tid": -914061504, "ts": 1716454216823299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216823802, "dur": 1, "args": { "External id": 12912, "device": 5, "context": 1, "stream": 7, "correlation": 12912, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 12912, "pid": 5, "tid": 7, "ts": 1716454216823802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216823310, "dur": 7, "args": { "External id": 12912, "cbid": 51, "correlation": 12912 } }, { "ph": "s", "id": 12912, "pid": 76337, "tid": -914061504, "ts": 1716454216823310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216823806, "dur": 262, "args": { "External id": 12913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12913, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 12913, "pid": 5, "tid": 7, "ts": 1716454216823806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823318, "dur": 6, "args": { "External id": 12913, "cbid": 211, "correlation": 12913 } }, { "ph": "s", "id": 12913, "pid": 76337, "tid": -914061504, "ts": 1716454216823318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216824069, "dur": 6, "args": { "External id": 12915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12915, "pid": 5, "tid": 7, "ts": 1716454216824069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823328, "dur": 5, "args": { "External id": 12915, "cbid": 211, "correlation": 12915 } }, { "ph": "s", "id": 12915, "pid": 76337, "tid": -914061504, "ts": 1716454216823328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216824076, "dur": 6, "args": { "External id": 12921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12921, "pid": 5, "tid": 7, "ts": 1716454216824076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823358, "dur": 8, "args": { "External id": 12921, "cbid": 211, "correlation": 12921 } }, { "ph": "s", "id": 12921, "pid": 76337, "tid": -914061504, "ts": 1716454216823358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216823419, "dur": 0, "args": { "External id": 12931, "cbid": 317, "correlation": 12931 } }, { "ph": "f", "id": 12931, "pid": 76337, "tid": -914061504, "ts": 1716454216823419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216823420, "dur": 0, "args": { "External id": 12932, "cbid": 203, "correlation": 12932 } }, { "ph": "f", "id": 12932, "pid": 76337, "tid": -914061504, "ts": 1716454216823420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216823420, "dur": 0, "args": { "External id": 12933, "cbid": 205, "correlation": 12933 } }, { "ph": "f", "id": 12933, "pid": 76337, "tid": -914061504, "ts": 1716454216823420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216824083, "dur": 8, "args": { "External id": 12937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12937, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12937, "pid": 5, "tid": 7, "ts": 1716454216824083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823432, "dur": 11, "args": { "External id": 12937, "cbid": 211, "correlation": 12937 } }, { "ph": "s", "id": 12937, "pid": 76337, "tid": -914061504, "ts": 1716454216823432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216824092, "dur": 3, "args": { "External id": 12939, "device": 5, "context": 1, "stream": 7, "correlation": 12939, "bytes": 4800, "memory bandwidth (GB/s)": 1.293103448275862 } }, { "ph": "f", "id": 12939, "pid": 5, "tid": 7, "ts": 1716454216824092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216823449, "dur": 10, "args": { "External id": 12939, "cbid": 51, "correlation": 12939 } }, { "ph": "s", "id": 12939, "pid": 76337, "tid": -914061504, "ts": 1716454216823449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216824097, "dur": 95, "args": { "External id": 12940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12940, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 12940, "pid": 5, "tid": 7, "ts": 1716454216824097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823460, "dur": 6, "args": { "External id": 12940, "cbid": 211, "correlation": 12940 } }, { "ph": "s", "id": 12940, "pid": 76337, "tid": -914061504, "ts": 1716454216823460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216824193, "dur": 6, "args": { "External id": 12942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12942, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12942, "pid": 5, "tid": 7, "ts": 1716454216824193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823470, "dur": 5, "args": { "External id": 12942, "cbid": 211, "correlation": 12942 } }, { "ph": "s", "id": 12942, "pid": 76337, "tid": -914061504, "ts": 1716454216823470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216824200, "dur": 6, "args": { "External id": 12948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12948, "pid": 5, "tid": 7, "ts": 1716454216824200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823497, "dur": 8, "args": { "External id": 12948, "cbid": 211, "correlation": 12948 } }, { "ph": "s", "id": 12948, "pid": 76337, "tid": -914061504, "ts": 1716454216823497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216824208, "dur": 5, "args": { "External id": 12956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12956, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12956, "pid": 5, "tid": 7, "ts": 1716454216824208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823527, "dur": 8, "args": { "External id": 12956, "cbid": 211, "correlation": 12956 } }, { "ph": "s", "id": 12956, "pid": 76337, "tid": -914061504, "ts": 1716454216823527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216824214, "dur": 4, "args": { "External id": 12964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12964, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 12964, "pid": 5, "tid": 7, "ts": 1716454216824214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823556, "dur": 9, "args": { "External id": 12964, "cbid": 211, "correlation": 12964 } }, { "ph": "s", "id": 12964, "pid": 76337, "tid": -914061504, "ts": 1716454216823556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216824220, "dur": 11, "args": { "External id": 12973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12973, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 12973, "pid": 5, "tid": 7, "ts": 1716454216824220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823603, "dur": 10, "args": { "External id": 12973, "cbid": 211, "correlation": 12973 } }, { "ph": "s", "id": 12973, "pid": 76337, "tid": -914061504, "ts": 1716454216823603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216824232, "dur": 12, "args": { "External id": 12993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 12993, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 12993, "pid": 5, "tid": 7, "ts": 1716454216824232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823674, "dur": 11, "args": { "External id": 12993, "cbid": 211, "correlation": 12993 } }, { "ph": "s", "id": 12993, "pid": 76337, "tid": -914061504, "ts": 1716454216823674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216824246, "dur": 4, "args": { "External id": 13005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13005, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13005, "pid": 5, "tid": 7, "ts": 1716454216824246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823697, "dur": 6, "args": { "External id": 13005, "cbid": 211, "correlation": 13005 } }, { "ph": "s", "id": 13005, "pid": 76337, "tid": -914061504, "ts": 1716454216823697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216824251, "dur": 10, "args": { "External id": 13008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13008, "pid": 5, "tid": 7, "ts": 1716454216824251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823715, "dur": 7, "args": { "External id": 13008, "cbid": 211, "correlation": 13008 } }, { "ph": "s", "id": 13008, "pid": 76337, "tid": -914061504, "ts": 1716454216823715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216824263, "dur": 6, "args": { "External id": 13017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13017, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13017, "pid": 5, "tid": 7, "ts": 1716454216824263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823753, "dur": 10, "args": { "External id": 13017, "cbid": 211, "correlation": 13017 } }, { "ph": "s", "id": 13017, "pid": 76337, "tid": -914061504, "ts": 1716454216823753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216823804, "dur": 0, "args": { "External id": 13027, "cbid": 317, "correlation": 13027 } }, { "ph": "f", "id": 13027, "pid": 76337, "tid": -914061504, "ts": 1716454216823804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216823805, "dur": 0, "args": { "External id": 13028, "cbid": 203, "correlation": 13028 } }, { "ph": "f", "id": 13028, "pid": 76337, "tid": -914061504, "ts": 1716454216823805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216823805, "dur": 0, "args": { "External id": 13029, "cbid": 205, "correlation": 13029 } }, { "ph": "f", "id": 13029, "pid": 76337, "tid": -914061504, "ts": 1716454216823805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216824270, "dur": 7, "args": { "External id": 13033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13033, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13033, "pid": 5, "tid": 7, "ts": 1716454216824270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823820, "dur": 12, "args": { "External id": 13033, "cbid": 211, "correlation": 13033 } }, { "ph": "s", "id": 13033, "pid": 76337, "tid": -914061504, "ts": 1716454216823820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216824278, "dur": 324, "args": { "External id": 13035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13035, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13035, "pid": 5, "tid": 7, "ts": 1716454216824278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823834, "dur": 5, "args": { "External id": 13035, "cbid": 211, "correlation": 13035 } }, { "ph": "s", "id": 13035, "pid": 76337, "tid": -914061504, "ts": 1716454216823834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216824604, "dur": 1, "args": { "External id": 13037, "device": 5, "context": 1, "stream": 7, "correlation": 13037, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 13037, "pid": 5, "tid": 7, "ts": 1716454216824604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216823844, "dur": 6, "args": { "External id": 13037, "cbid": 51, "correlation": 13037 } }, { "ph": "s", "id": 13037, "pid": 76337, "tid": -914061504, "ts": 1716454216823844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216824608, "dur": 501, "args": { "External id": 13038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13038, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13038, "pid": 5, "tid": 7, "ts": 1716454216824608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823851, "dur": 6, "args": { "External id": 13038, "cbid": 211, "correlation": 13038 } }, { "ph": "s", "id": 13038, "pid": 76337, "tid": -914061504, "ts": 1716454216823851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825110, "dur": 5, "args": { "External id": 13040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13040, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13040, "pid": 5, "tid": 7, "ts": 1716454216825110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823861, "dur": 5, "args": { "External id": 13040, "cbid": 211, "correlation": 13040 } }, { "ph": "s", "id": 13040, "pid": 76337, "tid": -914061504, "ts": 1716454216823861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216825117, "dur": 6, "args": { "External id": 13046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13046, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13046, "pid": 5, "tid": 7, "ts": 1716454216825117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823889, "dur": 8, "args": { "External id": 13046, "cbid": 211, "correlation": 13046 } }, { "ph": "s", "id": 13046, "pid": 76337, "tid": -914061504, "ts": 1716454216823889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216825125, "dur": 3, "args": { "External id": 13054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13054, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 13054, "pid": 5, "tid": 7, "ts": 1716454216825125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216823933, "dur": 9, "args": { "External id": 13054, "cbid": 211, "correlation": 13054 } }, { "ph": "s", "id": 13054, "pid": 76337, "tid": -914061504, "ts": 1716454216823933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216824028, "dur": 1, "args": { "External id": 13070, "cbid": 251, "correlation": 13070 } }, { "ph": "f", "id": 13070, "pid": 76337, "tid": -914061504, "ts": 1716454216824028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216824033, "dur": 0, "args": { "External id": 13072, "cbid": 251, "correlation": 13072 } }, { "ph": "f", "id": 13072, "pid": 76337, "tid": -914061504, "ts": 1716454216824033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216825129, "dur": 12, "args": { "External id": 13073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13073, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13073, "pid": 5, "tid": 7, "ts": 1716454216825129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824035, "dur": 13, "args": { "External id": 13073, "cbid": 211, "correlation": 13073 } }, { "ph": "s", "id": 13073, "pid": 76337, "tid": -914061504, "ts": 1716454216824035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216825143, "dur": 5, "args": { "External id": 13075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13075, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13075, "pid": 5, "tid": 7, "ts": 1716454216825143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824050, "dur": 6, "args": { "External id": 13075, "cbid": 211, "correlation": 13075 } }, { "ph": "s", "id": 13075, "pid": 76337, "tid": -914061504, "ts": 1716454216824050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216825149, "dur": 6, "args": { "External id": 13085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13085, "pid": 5, "tid": 7, "ts": 1716454216825149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824108, "dur": 12, "args": { "External id": 13085, "cbid": 211, "correlation": 13085 } }, { "ph": "s", "id": 13085, "pid": 76337, "tid": -914061504, "ts": 1716454216824108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216825157, "dur": 10, "args": { "External id": 13105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13105, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 13105, "pid": 5, "tid": 7, "ts": 1716454216825157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824173, "dur": 10, "args": { "External id": 13105, "cbid": 211, "correlation": 13105 } }, { "ph": "s", "id": 13105, "pid": 76337, "tid": -914061504, "ts": 1716454216824173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216825168, "dur": 4, "args": { "External id": 13117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13117, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 13117, "pid": 5, "tid": 7, "ts": 1716454216825168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824203, "dur": 8, "args": { "External id": 13117, "cbid": 211, "correlation": 13117 } }, { "ph": "s", "id": 13117, "pid": 76337, "tid": -914061504, "ts": 1716454216824203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216825173, "dur": 7, "args": { "External id": 13120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13120, "pid": 5, "tid": 7, "ts": 1716454216825173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824224, "dur": 7, "args": { "External id": 13120, "cbid": 211, "correlation": 13120 } }, { "ph": "s", "id": 13120, "pid": 76337, "tid": -914061504, "ts": 1716454216824224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216825181, "dur": 5, "args": { "External id": 13129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13129, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13129, "pid": 5, "tid": 7, "ts": 1716454216825181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824267, "dur": 10, "args": { "External id": 13129, "cbid": 211, "correlation": 13129 } }, { "ph": "s", "id": 13129, "pid": 76337, "tid": -914061504, "ts": 1716454216824267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216824341, "dur": 0, "args": { "External id": 13139, "cbid": 317, "correlation": 13139 } }, { "ph": "f", "id": 13139, "pid": 76337, "tid": -914061504, "ts": 1716454216824341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216824342, "dur": 0, "args": { "External id": 13140, "cbid": 203, "correlation": 13140 } }, { "ph": "f", "id": 13140, "pid": 76337, "tid": -914061504, "ts": 1716454216824342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216824343, "dur": 0, "args": { "External id": 13141, "cbid": 205, "correlation": 13141 } }, { "ph": "f", "id": 13141, "pid": 76337, "tid": -914061504, "ts": 1716454216824343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825187, "dur": 5, "args": { "External id": 13145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13145, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13145, "pid": 5, "tid": 7, "ts": 1716454216825187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824356, "dur": 13, "args": { "External id": 13145, "cbid": 211, "correlation": 13145 } }, { "ph": "s", "id": 13145, "pid": 76337, "tid": -914061504, "ts": 1716454216824356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825193, "dur": 163, "args": { "External id": 13147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13147, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13147, "pid": 5, "tid": 7, "ts": 1716454216825193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824372, "dur": 5, "args": { "External id": 13147, "cbid": 211, "correlation": 13147 } }, { "ph": "s", "id": 13147, "pid": 76337, "tid": -914061504, "ts": 1716454216824372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216825358, "dur": 1, "args": { "External id": 13149, "device": 5, "context": 1, "stream": 7, "correlation": 13149, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 13149, "pid": 5, "tid": 7, "ts": 1716454216825358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216824383, "dur": 6, "args": { "External id": 13149, "cbid": 51, "correlation": 13149 } }, { "ph": "s", "id": 13149, "pid": 76337, "tid": -914061504, "ts": 1716454216824383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216825362, "dur": 261, "args": { "External id": 13150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13150, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13150, "pid": 5, "tid": 7, "ts": 1716454216825362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824391, "dur": 6, "args": { "External id": 13150, "cbid": 211, "correlation": 13150 } }, { "ph": "s", "id": 13150, "pid": 76337, "tid": -914061504, "ts": 1716454216824391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825624, "dur": 6, "args": { "External id": 13152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13152, "pid": 5, "tid": 7, "ts": 1716454216825624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824400, "dur": 5, "args": { "External id": 13152, "cbid": 211, "correlation": 13152 } }, { "ph": "s", "id": 13152, "pid": 76337, "tid": -914061504, "ts": 1716454216824400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216825632, "dur": 6, "args": { "External id": 13158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13158, "pid": 5, "tid": 7, "ts": 1716454216825632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824429, "dur": 9, "args": { "External id": 13158, "cbid": 211, "correlation": 13158 } }, { "ph": "s", "id": 13158, "pid": 76337, "tid": -914061504, "ts": 1716454216824429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216824489, "dur": 0, "args": { "External id": 13168, "cbid": 317, "correlation": 13168 } }, { "ph": "f", "id": 13168, "pid": 76337, "tid": -914061504, "ts": 1716454216824489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216824490, "dur": 0, "args": { "External id": 13169, "cbid": 203, "correlation": 13169 } }, { "ph": "f", "id": 13169, "pid": 76337, "tid": -914061504, "ts": 1716454216824490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216824491, "dur": 0, "args": { "External id": 13170, "cbid": 205, "correlation": 13170 } }, { "ph": "f", "id": 13170, "pid": 76337, "tid": -914061504, "ts": 1716454216824491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825639, "dur": 8, "args": { "External id": 13174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13174, "pid": 5, "tid": 7, "ts": 1716454216825639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824503, "dur": 11, "args": { "External id": 13174, "cbid": 211, "correlation": 13174 } }, { "ph": "s", "id": 13174, "pid": 76337, "tid": -914061504, "ts": 1716454216824503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216825649, "dur": 3, "args": { "External id": 13176, "device": 5, "context": 1, "stream": 7, "correlation": 13176, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 13176, "pid": 5, "tid": 7, "ts": 1716454216825649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216824520, "dur": 10, "args": { "External id": 13176, "cbid": 51, "correlation": 13176 } }, { "ph": "s", "id": 13176, "pid": 76337, "tid": -914061504, "ts": 1716454216824520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216825653, "dur": 95, "args": { "External id": 13177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13177, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 13177, "pid": 5, "tid": 7, "ts": 1716454216825653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824531, "dur": 6, "args": { "External id": 13177, "cbid": 211, "correlation": 13177 } }, { "ph": "s", "id": 13177, "pid": 76337, "tid": -914061504, "ts": 1716454216824531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825749, "dur": 5, "args": { "External id": 13179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13179, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13179, "pid": 5, "tid": 7, "ts": 1716454216825749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824541, "dur": 5, "args": { "External id": 13179, "cbid": 211, "correlation": 13179 } }, { "ph": "s", "id": 13179, "pid": 76337, "tid": -914061504, "ts": 1716454216824541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216825756, "dur": 6, "args": { "External id": 13185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13185, "pid": 5, "tid": 7, "ts": 1716454216825756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824567, "dur": 9, "args": { "External id": 13185, "cbid": 211, "correlation": 13185 } }, { "ph": "s", "id": 13185, "pid": 76337, "tid": -914061504, "ts": 1716454216824567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216825764, "dur": 5, "args": { "External id": 13193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13193, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13193, "pid": 5, "tid": 7, "ts": 1716454216825764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824597, "dur": 8, "args": { "External id": 13193, "cbid": 211, "correlation": 13193 } }, { "ph": "s", "id": 13193, "pid": 76337, "tid": -914061504, "ts": 1716454216824597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216825769, "dur": 4, "args": { "External id": 13201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13201, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 13201, "pid": 5, "tid": 7, "ts": 1716454216825769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824627, "dur": 8, "args": { "External id": 13201, "cbid": 211, "correlation": 13201 } }, { "ph": "s", "id": 13201, "pid": 76337, "tid": -914061504, "ts": 1716454216824627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454216825780, "dur": 14, "args": { "External id": 13212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13212, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13212, "pid": 5, "tid": 7, "ts": 1716454216825780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824780, "dur": 100, "args": { "External id": 13212, "cbid": 211, "correlation": 13212 } }, { "ph": "s", "id": 13212, "pid": 76337, "tid": -914061504, "ts": 1716454216824780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216824946, "dur": 0, "args": { "External id": 13222, "cbid": 317, "correlation": 13222 } }, { "ph": "f", "id": 13222, "pid": 76337, "tid": -914061504, "ts": 1716454216824946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216824947, "dur": 0, "args": { "External id": 13223, "cbid": 203, "correlation": 13223 } }, { "ph": "f", "id": 13223, "pid": 76337, "tid": -914061504, "ts": 1716454216824947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216824948, "dur": 0, "args": { "External id": 13224, "cbid": 205, "correlation": 13224 } }, { "ph": "f", "id": 13224, "pid": 76337, "tid": -914061504, "ts": 1716454216824948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825796, "dur": 10, "args": { "External id": 13228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13228, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13228, "pid": 5, "tid": 7, "ts": 1716454216825796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824972, "dur": 20, "args": { "External id": 13228, "cbid": 211, "correlation": 13228 } }, { "ph": "s", "id": 13228, "pid": 76337, "tid": -914061504, "ts": 1716454216824972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216825808, "dur": 164, "args": { "External id": 13230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13230, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13230, "pid": 5, "tid": 7, "ts": 1716454216825808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216824996, "dur": 5, "args": { "External id": 13230, "cbid": 211, "correlation": 13230 } }, { "ph": "s", "id": 13230, "pid": 76337, "tid": -914061504, "ts": 1716454216824996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216825974, "dur": 1, "args": { "External id": 13232, "device": 5, "context": 1, "stream": 7, "correlation": 13232, "bytes": 960, "memory bandwidth (GB/s)": 0.5357142857142857 } }, { "ph": "f", "id": 13232, "pid": 5, "tid": 7, "ts": 1716454216825974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216825008, "dur": 7, "args": { "External id": 13232, "cbid": 51, "correlation": 13232 } }, { "ph": "s", "id": 13232, "pid": 76337, "tid": -914061504, "ts": 1716454216825008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216825978, "dur": 666, "args": { "External id": 13233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13233, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13233, "pid": 5, "tid": 7, "ts": 1716454216825978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825016, "dur": 7, "args": { "External id": 13233, "cbid": 211, "correlation": 13233 } }, { "ph": "s", "id": 13233, "pid": 76337, "tid": -914061504, "ts": 1716454216825016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216826646, "dur": 14, "args": { "External id": 13235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13235, "pid": 5, "tid": 7, "ts": 1716454216826646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825027, "dur": 5, "args": { "External id": 13235, "cbid": 211, "correlation": 13235 } }, { "ph": "s", "id": 13235, "pid": 76337, "tid": -914061504, "ts": 1716454216825027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216826661, "dur": 16, "args": { "External id": 13241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13241, "pid": 5, "tid": 7, "ts": 1716454216826661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825057, "dur": 9, "args": { "External id": 13241, "cbid": 211, "correlation": 13241 } }, { "ph": "s", "id": 13241, "pid": 76337, "tid": -914061504, "ts": 1716454216825057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216826678, "dur": 30, "args": { "External id": 13250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13250, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13250, "pid": 5, "tid": 7, "ts": 1716454216826678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825198, "dur": 14, "args": { "External id": 13250, "cbid": 211, "correlation": 13250 } }, { "ph": "s", "id": 13250, "pid": 76337, "tid": -914061504, "ts": 1716454216825198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216826710, "dur": 31, "args": { "External id": 13270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13270, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 13270, "pid": 5, "tid": 7, "ts": 1716454216826710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825282, "dur": 12, "args": { "External id": 13270, "cbid": 211, "correlation": 13270 } }, { "ph": "s", "id": 13270, "pid": 76337, "tid": -914061504, "ts": 1716454216825282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216826742, "dur": 5, "args": { "External id": 13282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13282, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13282, "pid": 5, "tid": 7, "ts": 1716454216826742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825305, "dur": 6, "args": { "External id": 13282, "cbid": 211, "correlation": 13282 } }, { "ph": "s", "id": 13282, "pid": 76337, "tid": -914061504, "ts": 1716454216825305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216826749, "dur": 31, "args": { "External id": 13285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13285, "pid": 5, "tid": 7, "ts": 1716454216826749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825323, "dur": 7, "args": { "External id": 13285, "cbid": 211, "correlation": 13285 } }, { "ph": "s", "id": 13285, "pid": 76337, "tid": -914061504, "ts": 1716454216825323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216826781, "dur": 22, "args": { "External id": 13294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13294, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13294, "pid": 5, "tid": 7, "ts": 1716454216826781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216825364, "dur": 10, "args": { "External id": 13294, "cbid": 211, "correlation": 13294 } }, { "ph": "s", "id": 13294, "pid": 76337, "tid": -914061504, "ts": 1716454216825364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216825434, "dur": 0, "args": { "External id": 13304, "cbid": 317, "correlation": 13304 } }, { "ph": "f", "id": 13304, "pid": 76337, "tid": -914061504, "ts": 1716454216825434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216825435, "dur": 0, "args": { "External id": 13305, "cbid": 203, "correlation": 13305 } }, { "ph": "f", "id": 13305, "pid": 76337, "tid": -914061504, "ts": 1716454216825435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216825436, "dur": 0, "args": { "External id": 13306, "cbid": 205, "correlation": 13306 } }, { "ph": "f", "id": 13306, "pid": 76337, "tid": -914061504, "ts": 1716454216825436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216828720, "dur": 21, "args": { "External id": 13310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13310, "pid": 5, "tid": 7, "ts": 1716454216828720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216828690, "dur": 30, "args": { "External id": 13310, "cbid": 211, "correlation": 13310 } }, { "ph": "s", "id": 13310, "pid": 76337, "tid": -914061504, "ts": 1716454216828690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216828742, "dur": 323, "args": { "External id": 13312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13312, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13312, "pid": 5, "tid": 7, "ts": 1716454216828742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216828723, "dur": 6, "args": { "External id": 13312, "cbid": 211, "correlation": 13312 } }, { "ph": "s", "id": 13312, "pid": 76337, "tid": -914061504, "ts": 1716454216828723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216829067, "dur": 1, "args": { "External id": 13314, "device": 5, "context": 1, "stream": 7, "correlation": 13314, "bytes": 960, "memory bandwidth (GB/s)": 0.4918032786885246 } }, { "ph": "f", "id": 13314, "pid": 5, "tid": 7, "ts": 1716454216829067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216828738, "dur": 11, "args": { "External id": 13314, "cbid": 51, "correlation": 13314 } }, { "ph": "s", "id": 13314, "pid": 76337, "tid": -914061504, "ts": 1716454216828738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216829071, "dur": 1249, "args": { "External id": 13315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13315, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13315, "pid": 5, "tid": 7, "ts": 1716454216829071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216828750, "dur": 8, "args": { "External id": 13315, "cbid": 211, "correlation": 13315 } }, { "ph": "s", "id": 13315, "pid": 76337, "tid": -914061504, "ts": 1716454216828750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216830322, "dur": 13, "args": { "External id": 13317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13317, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13317, "pid": 5, "tid": 7, "ts": 1716454216830322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216828763, "dur": 6, "args": { "External id": 13317, "cbid": 211, "correlation": 13317 } }, { "ph": "s", "id": 13317, "pid": 76337, "tid": -914061504, "ts": 1716454216828763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216830336, "dur": 15, "args": { "External id": 13323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13323, "pid": 5, "tid": 7, "ts": 1716454216830336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829536, "dur": 12, "args": { "External id": 13323, "cbid": 211, "correlation": 13323 } }, { "ph": "s", "id": 13323, "pid": 76337, "tid": -914061504, "ts": 1716454216829536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216830352, "dur": 3, "args": { "External id": 13331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13331, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 13331, "pid": 5, "tid": 7, "ts": 1716454216830352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829622, "dur": 10, "args": { "External id": 13331, "cbid": 211, "correlation": 13331 } }, { "ph": "s", "id": 13331, "pid": 76337, "tid": -914061504, "ts": 1716454216829622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216829737, "dur": 3, "args": { "External id": 13347, "cbid": 251, "correlation": 13347 } }, { "ph": "f", "id": 13347, "pid": 76337, "tid": -914061504, "ts": 1716454216829737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216829747, "dur": 0, "args": { "External id": 13349, "cbid": 251, "correlation": 13349 } }, { "ph": "f", "id": 13349, "pid": 76337, "tid": -914061504, "ts": 1716454216829747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216830357, "dur": 13, "args": { "External id": 13350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13350, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13350, "pid": 5, "tid": 7, "ts": 1716454216830357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829750, "dur": 15, "args": { "External id": 13350, "cbid": 211, "correlation": 13350 } }, { "ph": "s", "id": 13350, "pid": 76337, "tid": -914061504, "ts": 1716454216829750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216830372, "dur": 6, "args": { "External id": 13352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13352, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13352, "pid": 5, "tid": 7, "ts": 1716454216830372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829768, "dur": 8, "args": { "External id": 13352, "cbid": 211, "correlation": 13352 } }, { "ph": "s", "id": 13352, "pid": 76337, "tid": -914061504, "ts": 1716454216829768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216830378, "dur": 17, "args": { "External id": 13362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13362, "pid": 5, "tid": 7, "ts": 1716454216830378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829849, "dur": 12, "args": { "External id": 13362, "cbid": 211, "correlation": 13362 } }, { "ph": "s", "id": 13362, "pid": 76337, "tid": -914061504, "ts": 1716454216829849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216830397, "dur": 18, "args": { "External id": 13382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13382, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 13382, "pid": 5, "tid": 7, "ts": 1716454216830397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829939, "dur": 13, "args": { "External id": 13382, "cbid": 211, "correlation": 13382 } }, { "ph": "s", "id": 13382, "pid": 76337, "tid": -914061504, "ts": 1716454216829939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216830416, "dur": 4, "args": { "External id": 13394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13394, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 13394, "pid": 5, "tid": 7, "ts": 1716454216830416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829963, "dur": 7, "args": { "External id": 13394, "cbid": 211, "correlation": 13394 } }, { "ph": "s", "id": 13394, "pid": 76337, "tid": -914061504, "ts": 1716454216829963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216830421, "dur": 17, "args": { "External id": 13397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13397, "pid": 5, "tid": 7, "ts": 1716454216830421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216829997, "dur": 9, "args": { "External id": 13397, "cbid": 211, "correlation": 13397 } }, { "ph": "s", "id": 13397, "pid": 76337, "tid": -914061504, "ts": 1716454216829997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216830440, "dur": 11, "args": { "External id": 13406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13406, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13406, "pid": 5, "tid": 7, "ts": 1716454216830440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830043, "dur": 10, "args": { "External id": 13406, "cbid": 211, "correlation": 13406 } }, { "ph": "s", "id": 13406, "pid": 76337, "tid": -914061504, "ts": 1716454216830043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216830133, "dur": 0, "args": { "External id": 13416, "cbid": 317, "correlation": 13416 } }, { "ph": "f", "id": 13416, "pid": 76337, "tid": -914061504, "ts": 1716454216830133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216830134, "dur": 0, "args": { "External id": 13417, "cbid": 203, "correlation": 13417 } }, { "ph": "f", "id": 13417, "pid": 76337, "tid": -914061504, "ts": 1716454216830134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216830135, "dur": 1, "args": { "External id": 13418, "cbid": 205, "correlation": 13418 } }, { "ph": "f", "id": 13418, "pid": 76337, "tid": -914061504, "ts": 1716454216830135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216830452, "dur": 11, "args": { "External id": 13422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13422, "pid": 5, "tid": 7, "ts": 1716454216830452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830151, "dur": 12, "args": { "External id": 13422, "cbid": 211, "correlation": 13422 } }, { "ph": "s", "id": 13422, "pid": 76337, "tid": -914061504, "ts": 1716454216830151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216830464, "dur": 164, "args": { "External id": 13424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13424, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13424, "pid": 5, "tid": 7, "ts": 1716454216830464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830166, "dur": 6, "args": { "External id": 13424, "cbid": 211, "correlation": 13424 } }, { "ph": "s", "id": 13424, "pid": 76337, "tid": -914061504, "ts": 1716454216830166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216830631, "dur": 1, "args": { "External id": 13426, "device": 5, "context": 1, "stream": 7, "correlation": 13426, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 13426, "pid": 5, "tid": 7, "ts": 1716454216830631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216830178, "dur": 7, "args": { "External id": 13426, "cbid": 51, "correlation": 13426 } }, { "ph": "s", "id": 13426, "pid": 76337, "tid": -914061504, "ts": 1716454216830178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216830635, "dur": 653, "args": { "External id": 13427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13427, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13427, "pid": 5, "tid": 7, "ts": 1716454216830635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830187, "dur": 6, "args": { "External id": 13427, "cbid": 211, "correlation": 13427 } }, { "ph": "s", "id": 13427, "pid": 76337, "tid": -914061504, "ts": 1716454216830187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216831289, "dur": 13, "args": { "External id": 13429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13429, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13429, "pid": 5, "tid": 7, "ts": 1716454216831289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830197, "dur": 5, "args": { "External id": 13429, "cbid": 211, "correlation": 13429 } }, { "ph": "s", "id": 13429, "pid": 76337, "tid": -914061504, "ts": 1716454216830197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216831303, "dur": 15, "args": { "External id": 13435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13435, "pid": 5, "tid": 7, "ts": 1716454216831303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216830226, "dur": 9, "args": { "External id": 13435, "cbid": 211, "correlation": 13435 } }, { "ph": "s", "id": 13435, "pid": 76337, "tid": -914061504, "ts": 1716454216830226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216830286, "dur": 0, "args": { "External id": 13445, "cbid": 317, "correlation": 13445 } }, { "ph": "f", "id": 13445, "pid": 76337, "tid": -914061504, "ts": 1716454216830286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216830286, "dur": 0, "args": { "External id": 13446, "cbid": 203, "correlation": 13446 } }, { "ph": "f", "id": 13446, "pid": 76337, "tid": -914061504, "ts": 1716454216830286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216830287, "dur": 0, "args": { "External id": 13447, "cbid": 205, "correlation": 13447 } }, { "ph": "f", "id": 13447, "pid": 76337, "tid": -914061504, "ts": 1716454216830287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216832016, "dur": 21, "args": { "External id": 13451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13451, "pid": 5, "tid": 7, "ts": 1716454216832016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216831998, "dur": 19, "args": { "External id": 13451, "cbid": 211, "correlation": 13451 } }, { "ph": "s", "id": 13451, "pid": 76337, "tid": -914061504, "ts": 1716454216831998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216832038, "dur": 4, "args": { "External id": 13453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13453, "pid": 5, "tid": 7, "ts": 1716454216832038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832023, "dur": 11, "args": { "External id": 13453, "cbid": 211, "correlation": 13453 } }, { "ph": "s", "id": 13453, "pid": 76337, "tid": -914061504, "ts": 1716454216832023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216832037, "dur": 0, "args": { "External id": 13454, "cbid": 51, "correlation": 13454 } }, { "ph": "s", "id": 13454, "pid": 76337, "tid": -914061504, "ts": 1716454216832037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216832048, "dur": 177, "args": { "External id": 13455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13455, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 13455, "pid": 5, "tid": 7, "ts": 1716454216832048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832038, "dur": 8, "args": { "External id": 13455, "cbid": 211, "correlation": 13455 } }, { "ph": "s", "id": 13455, "pid": 76337, "tid": -914061504, "ts": 1716454216832038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216832495, "dur": 16, "args": { "External id": 13460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13460, "pid": 5, "tid": 7, "ts": 1716454216832495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832483, "dur": 10, "args": { "External id": 13460, "cbid": 211, "correlation": 13460 } }, { "ph": "s", "id": 13460, "pid": 76337, "tid": -914061504, "ts": 1716454216832483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216832531, "dur": 13, "args": { "External id": 13468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13468, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13468, "pid": 5, "tid": 7, "ts": 1716454216832531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832521, "dur": 10, "args": { "External id": 13468, "cbid": 211, "correlation": 13468 } }, { "ph": "s", "id": 13468, "pid": 76337, "tid": -914061504, "ts": 1716454216832521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216832581, "dur": 10, "args": { "External id": 13476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13476, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13476, "pid": 5, "tid": 7, "ts": 1716454216832581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832571, "dur": 10, "args": { "External id": 13476, "cbid": 211, "correlation": 13476 } }, { "ph": "s", "id": 13476, "pid": 76337, "tid": -914061504, "ts": 1716454216832571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216832705, "dur": 18, "args": { "External id": 13496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13496, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 13496, "pid": 5, "tid": 7, "ts": 1716454216832705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832691, "dur": 14, "args": { "External id": 13496, "cbid": 211, "correlation": 13496 } }, { "ph": "s", "id": 13496, "pid": 76337, "tid": -914061504, "ts": 1716454216832691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216832747, "dur": 4, "args": { "External id": 13508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13508, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 13508, "pid": 5, "tid": 7, "ts": 1716454216832747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832737, "dur": 9, "args": { "External id": 13508, "cbid": 211, "correlation": 13508 } }, { "ph": "s", "id": 13508, "pid": 76337, "tid": -914061504, "ts": 1716454216832737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216832769, "dur": 17, "args": { "External id": 13511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13511, "pid": 5, "tid": 7, "ts": 1716454216832769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832761, "dur": 7, "args": { "External id": 13511, "cbid": 211, "correlation": 13511 } }, { "ph": "s", "id": 13511, "pid": 76337, "tid": -914061504, "ts": 1716454216832761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216832839, "dur": 0, "args": { "External id": 13522, "cbid": 317, "correlation": 13522 } }, { "ph": "f", "id": 13522, "pid": 76337, "tid": -914061504, "ts": 1716454216832839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216832840, "dur": 0, "args": { "External id": 13523, "cbid": 203, "correlation": 13523 } }, { "ph": "f", "id": 13523, "pid": 76337, "tid": -914061504, "ts": 1716454216832840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216832841, "dur": 0, "args": { "External id": 13524, "cbid": 205, "correlation": 13524 } }, { "ph": "f", "id": 13524, "pid": 76337, "tid": -914061504, "ts": 1716454216832841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216832870, "dur": 11, "args": { "External id": 13528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13528, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13528, "pid": 5, "tid": 7, "ts": 1716454216832870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832857, "dur": 13, "args": { "External id": 13528, "cbid": 211, "correlation": 13528 } }, { "ph": "s", "id": 13528, "pid": 76337, "tid": -914061504, "ts": 1716454216832857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216832883, "dur": 3, "args": { "External id": 13530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13530, "pid": 5, "tid": 7, "ts": 1716454216832883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832874, "dur": 6, "args": { "External id": 13530, "cbid": 211, "correlation": 13530 } }, { "ph": "s", "id": 13530, "pid": 76337, "tid": -914061504, "ts": 1716454216832874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216832884, "dur": 0, "args": { "External id": 13531, "cbid": 51, "correlation": 13531 } }, { "ph": "s", "id": 13531, "pid": 76337, "tid": -914061504, "ts": 1716454216832884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216832892, "dur": 93, "args": { "External id": 13532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13532, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 13532, "pid": 5, "tid": 7, "ts": 1716454216832892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832885, "dur": 6, "args": { "External id": 13532, "cbid": 211, "correlation": 13532 } }, { "ph": "s", "id": 13532, "pid": 76337, "tid": -914061504, "ts": 1716454216832885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216832986, "dur": 15, "args": { "External id": 13537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13537, "pid": 5, "tid": 7, "ts": 1716454216832986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216832914, "dur": 9, "args": { "External id": 13537, "cbid": 211, "correlation": 13537 } }, { "ph": "s", "id": 13537, "pid": 76337, "tid": -914061504, "ts": 1716454216832914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216833088, "dur": 44, "args": { "External id": 13548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13548, "pid": 5, "tid": 7, "ts": 1716454216833088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833071, "dur": 19, "args": { "External id": 13548, "cbid": 211, "correlation": 13548 } }, { "ph": "s", "id": 13548, "pid": 76337, "tid": -914061504, "ts": 1716454216833071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216833133, "dur": 18, "args": { "External id": 13570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13570, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13570, "pid": 5, "tid": 7, "ts": 1716454216833133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833113, "dur": 10, "args": { "External id": 13570, "cbid": 211, "correlation": 13570 } }, { "ph": "s", "id": 13570, "pid": 76337, "tid": -914061504, "ts": 1716454216833113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216833285, "dur": 3, "args": { "External id": 13581, "cbid": 251, "correlation": 13581 } }, { "ph": "f", "id": 13581, "pid": 76337, "tid": -914061504, "ts": 1716454216833285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216833313, "dur": 91, "args": { "External id": 13582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13582, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13582, "pid": 5, "tid": 7, "ts": 1716454216833313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833296, "dur": 18, "args": { "External id": 13582, "cbid": 211, "correlation": 13582 } }, { "ph": "s", "id": 13582, "pid": 76337, "tid": -914061504, "ts": 1716454216833296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216833393, "dur": 1, "args": { "External id": 13593, "cbid": 251, "correlation": 13593 } }, { "ph": "f", "id": 13593, "pid": 76337, "tid": -914061504, "ts": 1716454216833393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216833412, "dur": 86, "args": { "External id": 13594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13594, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13594, "pid": 5, "tid": 7, "ts": 1716454216833412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833398, "dur": 13, "args": { "External id": 13594, "cbid": 211, "correlation": 13594 } }, { "ph": "s", "id": 13594, "pid": 76337, "tid": -914061504, "ts": 1716454216833398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216833478, "dur": 1, "args": { "External id": 13605, "cbid": 251, "correlation": 13605 } }, { "ph": "f", "id": 13605, "pid": 76337, "tid": -914061504, "ts": 1716454216833478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216833499, "dur": 85, "args": { "External id": 13606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13606, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13606, "pid": 5, "tid": 7, "ts": 1716454216833499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833482, "dur": 13, "args": { "External id": 13606, "cbid": 211, "correlation": 13606 } }, { "ph": "s", "id": 13606, "pid": 76337, "tid": -914061504, "ts": 1716454216833482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216833619, "dur": 140, "args": { "External id": 13631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13631, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13631, "pid": 5, "tid": 7, "ts": 1716454216833619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833603, "dur": 16, "args": { "External id": 13631, "cbid": 211, "correlation": 13631 } }, { "ph": "s", "id": 13631, "pid": 76337, "tid": -914061504, "ts": 1716454216833603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216833748, "dur": 2, "args": { "External id": 13649, "cbid": 251, "correlation": 13649 } }, { "ph": "f", "id": 13649, "pid": 76337, "tid": -914061504, "ts": 1716454216833748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216833770, "dur": 92, "args": { "External id": 13651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13651, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13651, "pid": 5, "tid": 7, "ts": 1716454216833770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833755, "dur": 16, "args": { "External id": 13651, "cbid": 211, "correlation": 13651 } }, { "ph": "s", "id": 13651, "pid": 76337, "tid": -914061504, "ts": 1716454216833755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216833864, "dur": 10, "args": { "External id": 13659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13659, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13659, "pid": 5, "tid": 7, "ts": 1716454216833864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833842, "dur": 13, "args": { "External id": 13659, "cbid": 211, "correlation": 13659 } }, { "ph": "s", "id": 13659, "pid": 76337, "tid": -914061504, "ts": 1716454216833842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216833895, "dur": 46, "args": { "External id": 13667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13667, "pid": 5, "tid": 7, "ts": 1716454216833895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833884, "dur": 10, "args": { "External id": 13667, "cbid": 211, "correlation": 13667 } }, { "ph": "s", "id": 13667, "pid": 76337, "tid": -914061504, "ts": 1716454216833884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216833964, "dur": 17, "args": { "External id": 13689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13689, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13689, "pid": 5, "tid": 7, "ts": 1716454216833964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216833952, "dur": 11, "args": { "External id": 13689, "cbid": 211, "correlation": 13689 } }, { "ph": "s", "id": 13689, "pid": 76337, "tid": -914061504, "ts": 1716454216833952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834061, "dur": 1, "args": { "External id": 13700, "cbid": 251, "correlation": 13700 } }, { "ph": "f", "id": 13700, "pid": 76337, "tid": -914061504, "ts": 1716454216834061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216834081, "dur": 86, "args": { "External id": 13701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13701, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13701, "pid": 5, "tid": 7, "ts": 1716454216834081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834066, "dur": 14, "args": { "External id": 13701, "cbid": 211, "correlation": 13701 } }, { "ph": "s", "id": 13701, "pid": 76337, "tid": -914061504, "ts": 1716454216834066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834160, "dur": 2, "args": { "External id": 13712, "cbid": 251, "correlation": 13712 } }, { "ph": "f", "id": 13712, "pid": 76337, "tid": -914061504, "ts": 1716454216834160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834165, "dur": 0, "args": { "External id": 13713, "cbid": 251, "correlation": 13713 } }, { "ph": "f", "id": 13713, "pid": 76337, "tid": -914061504, "ts": 1716454216834165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216834181, "dur": 12, "args": { "External id": 13714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13714, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13714, "pid": 5, "tid": 7, "ts": 1716454216834181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834167, "dur": 14, "args": { "External id": 13714, "cbid": 211, "correlation": 13714 } }, { "ph": "s", "id": 13714, "pid": 76337, "tid": -914061504, "ts": 1716454216834167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216834195, "dur": 5, "args": { "External id": 13716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13716, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13716, "pid": 5, "tid": 7, "ts": 1716454216834195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834184, "dur": 9, "args": { "External id": 13716, "cbid": 211, "correlation": 13716 } }, { "ph": "s", "id": 13716, "pid": 76337, "tid": -914061504, "ts": 1716454216834184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834246, "dur": 1, "args": { "External id": 13727, "cbid": 251, "correlation": 13727 } }, { "ph": "f", "id": 13727, "pid": 76337, "tid": -914061504, "ts": 1716454216834246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834250, "dur": 0, "args": { "External id": 13728, "cbid": 251, "correlation": 13728 } }, { "ph": "f", "id": 13728, "pid": 76337, "tid": -914061504, "ts": 1716454216834250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216834264, "dur": 8, "args": { "External id": 13729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13729, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13729, "pid": 5, "tid": 7, "ts": 1716454216834264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834251, "dur": 12, "args": { "External id": 13729, "cbid": 211, "correlation": 13729 } }, { "ph": "s", "id": 13729, "pid": 76337, "tid": -914061504, "ts": 1716454216834251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216834274, "dur": 3, "args": { "External id": 13731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13731, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13731, "pid": 5, "tid": 7, "ts": 1716454216834274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834265, "dur": 6, "args": { "External id": 13731, "cbid": 211, "correlation": 13731 } }, { "ph": "s", "id": 13731, "pid": 76337, "tid": -914061504, "ts": 1716454216834265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216834356, "dur": 57, "args": { "External id": 13756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13756, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13756, "pid": 5, "tid": 7, "ts": 1716454216834356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834343, "dur": 12, "args": { "External id": 13756, "cbid": 211, "correlation": 13756 } }, { "ph": "s", "id": 13756, "pid": 76337, "tid": -914061504, "ts": 1716454216834343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834460, "dur": 1, "args": { "External id": 13774, "cbid": 251, "correlation": 13774 } }, { "ph": "f", "id": 13774, "pid": 76337, "tid": -914061504, "ts": 1716454216834460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216834480, "dur": 87, "args": { "External id": 13776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13776, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13776, "pid": 5, "tid": 7, "ts": 1716454216834480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834466, "dur": 14, "args": { "External id": 13776, "cbid": 211, "correlation": 13776 } }, { "ph": "s", "id": 13776, "pid": 76337, "tid": -914061504, "ts": 1716454216834466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216834569, "dur": 10, "args": { "External id": 13784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13784, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13784, "pid": 5, "tid": 7, "ts": 1716454216834569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834539, "dur": 12, "args": { "External id": 13784, "cbid": 211, "correlation": 13784 } }, { "ph": "s", "id": 13784, "pid": 76337, "tid": -914061504, "ts": 1716454216834539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216834589, "dur": 13, "args": { "External id": 13792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13792, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13792, "pid": 5, "tid": 7, "ts": 1716454216834589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834579, "dur": 9, "args": { "External id": 13792, "cbid": 211, "correlation": 13792 } }, { "ph": "s", "id": 13792, "pid": 76337, "tid": -914061504, "ts": 1716454216834579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216834654, "dur": 17, "args": { "External id": 13814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13814, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13814, "pid": 5, "tid": 7, "ts": 1716454216834654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834642, "dur": 12, "args": { "External id": 13814, "cbid": 211, "correlation": 13814 } }, { "ph": "s", "id": 13814, "pid": 76337, "tid": -914061504, "ts": 1716454216834642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834747, "dur": 2, "args": { "External id": 13830, "cbid": 251, "correlation": 13830 } }, { "ph": "f", "id": 13830, "pid": 76337, "tid": -914061504, "ts": 1716454216834747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834753, "dur": 0, "args": { "External id": 13832, "cbid": 251, "correlation": 13832 } }, { "ph": "f", "id": 13832, "pid": 76337, "tid": -914061504, "ts": 1716454216834753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216834771, "dur": 495, "args": { "External id": 13833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13833, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13833, "pid": 5, "tid": 7, "ts": 1716454216834771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834756, "dur": 15, "args": { "External id": 13833, "cbid": 211, "correlation": 13833 } }, { "ph": "s", "id": 13833, "pid": 76337, "tid": -914061504, "ts": 1716454216834756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216835268, "dur": 66, "args": { "External id": 13841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13841, "pid": 5, "tid": 7, "ts": 1716454216835268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834854, "dur": 15, "args": { "External id": 13841, "cbid": 211, "correlation": 13841 } }, { "ph": "s", "id": 13841, "pid": 76337, "tid": -914061504, "ts": 1716454216834854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216835335, "dur": 68, "args": { "External id": 13849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13849, "pid": 5, "tid": 7, "ts": 1716454216835335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216834896, "dur": 11, "args": { "External id": 13849, "cbid": 211, "correlation": 13849 } }, { "ph": "s", "id": 13849, "pid": 76337, "tid": -914061504, "ts": 1716454216834896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216834989, "dur": 1, "args": { "External id": 13865, "cbid": 251, "correlation": 13865 } }, { "ph": "f", "id": 13865, "pid": 76337, "tid": -914061504, "ts": 1716454216834989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216835405, "dur": 1, "args": { "External id": 13867, "device": 5, "context": 1, "stream": 7, "correlation": 13867, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 13867, "pid": 5, "tid": 7, "ts": 1716454216835405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216834994, "dur": 15, "args": { "External id": 13867, "cbid": 51, "correlation": 13867 } }, { "ph": "s", "id": 13867, "pid": 76337, "tid": -914061504, "ts": 1716454216834994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216835409, "dur": 270, "args": { "External id": 13868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13868, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13868, "pid": 5, "tid": 7, "ts": 1716454216835409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835011, "dur": 12, "args": { "External id": 13868, "cbid": 211, "correlation": 13868 } }, { "ph": "s", "id": 13868, "pid": 76337, "tid": -914061504, "ts": 1716454216835011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216835680, "dur": 14, "args": { "External id": 13876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13876, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13876, "pid": 5, "tid": 7, "ts": 1716454216835680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835056, "dur": 11, "args": { "External id": 13876, "cbid": 211, "correlation": 13876 } }, { "ph": "s", "id": 13876, "pid": 76337, "tid": -914061504, "ts": 1716454216835056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216835695, "dur": 38, "args": { "External id": 13887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13887, "pid": 5, "tid": 7, "ts": 1716454216835695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835127, "dur": 12, "args": { "External id": 13887, "cbid": 211, "correlation": 13887 } }, { "ph": "s", "id": 13887, "pid": 76337, "tid": -914061504, "ts": 1716454216835127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216835195, "dur": 0, "args": { "External id": 13899, "cbid": 317, "correlation": 13899 } }, { "ph": "f", "id": 13899, "pid": 76337, "tid": -914061504, "ts": 1716454216835195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216835196, "dur": 0, "args": { "External id": 13900, "cbid": 203, "correlation": 13900 } }, { "ph": "f", "id": 13900, "pid": 76337, "tid": -914061504, "ts": 1716454216835196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216835197, "dur": 0, "args": { "External id": 13901, "cbid": 205, "correlation": 13901 } }, { "ph": "f", "id": 13901, "pid": 76337, "tid": -914061504, "ts": 1716454216835197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216835734, "dur": 13, "args": { "External id": 13905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13905, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13905, "pid": 5, "tid": 7, "ts": 1716454216835734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835211, "dur": 12, "args": { "External id": 13905, "cbid": 211, "correlation": 13905 } }, { "ph": "s", "id": 13905, "pid": 76337, "tid": -914061504, "ts": 1716454216835211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216835749, "dur": 4, "args": { "External id": 13907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 13907, "pid": 5, "tid": 7, "ts": 1716454216835749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835229, "dur": 6, "args": { "External id": 13907, "cbid": 211, "correlation": 13907 } }, { "ph": "s", "id": 13907, "pid": 76337, "tid": -914061504, "ts": 1716454216835229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216835238, "dur": 0, "args": { "External id": 13908, "cbid": 51, "correlation": 13908 } }, { "ph": "s", "id": 13908, "pid": 76337, "tid": -914061504, "ts": 1716454216835238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216835754, "dur": 100, "args": { "External id": 13909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13909, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 13909, "pid": 5, "tid": 7, "ts": 1716454216835754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835239, "dur": 6, "args": { "External id": 13909, "cbid": 211, "correlation": 13909 } }, { "ph": "s", "id": 13909, "pid": 76337, "tid": -914061504, "ts": 1716454216835239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216835855, "dur": 17, "args": { "External id": 13914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13914, "pid": 5, "tid": 7, "ts": 1716454216835855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835267, "dur": 9, "args": { "External id": 13914, "cbid": 211, "correlation": 13914 } }, { "ph": "s", "id": 13914, "pid": 76337, "tid": -914061504, "ts": 1716454216835267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216835873, "dur": 12, "args": { "External id": 13922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13922, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13922, "pid": 5, "tid": 7, "ts": 1716454216835873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835300, "dur": 8, "args": { "External id": 13922, "cbid": 211, "correlation": 13922 } }, { "ph": "s", "id": 13922, "pid": 76337, "tid": -914061504, "ts": 1716454216835300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216835887, "dur": 31, "args": { "External id": 13931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13931, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13931, "pid": 5, "tid": 7, "ts": 1716454216835887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835357, "dur": 13, "args": { "External id": 13931, "cbid": 211, "correlation": 13931 } }, { "ph": "s", "id": 13931, "pid": 76337, "tid": -914061504, "ts": 1716454216835357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216835918, "dur": 31, "args": { "External id": 13951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13951, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 13951, "pid": 5, "tid": 7, "ts": 1716454216835918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835450, "dur": 13, "args": { "External id": 13951, "cbid": 211, "correlation": 13951 } }, { "ph": "s", "id": 13951, "pid": 76337, "tid": -914061504, "ts": 1716454216835450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216835950, "dur": 5, "args": { "External id": 13963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13963, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13963, "pid": 5, "tid": 7, "ts": 1716454216835950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835473, "dur": 6, "args": { "External id": 13963, "cbid": 211, "correlation": 13963 } }, { "ph": "s", "id": 13963, "pid": 76337, "tid": -914061504, "ts": 1716454216835473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216835956, "dur": 31, "args": { "External id": 13966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13966, "pid": 5, "tid": 7, "ts": 1716454216835956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835492, "dur": 8, "args": { "External id": 13966, "cbid": 211, "correlation": 13966 } }, { "ph": "s", "id": 13966, "pid": 76337, "tid": -914061504, "ts": 1716454216835492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216835989, "dur": 21, "args": { "External id": 13975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13975, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13975, "pid": 5, "tid": 7, "ts": 1716454216835989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835538, "dur": 11, "args": { "External id": 13975, "cbid": 211, "correlation": 13975 } }, { "ph": "s", "id": 13975, "pid": 76337, "tid": -914061504, "ts": 1716454216835538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216835606, "dur": 0, "args": { "External id": 13985, "cbid": 317, "correlation": 13985 } }, { "ph": "f", "id": 13985, "pid": 76337, "tid": -914061504, "ts": 1716454216835606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216835607, "dur": 0, "args": { "External id": 13986, "cbid": 203, "correlation": 13986 } }, { "ph": "f", "id": 13986, "pid": 76337, "tid": -914061504, "ts": 1716454216835607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216835608, "dur": 0, "args": { "External id": 13987, "cbid": 205, "correlation": 13987 } }, { "ph": "f", "id": 13987, "pid": 76337, "tid": -914061504, "ts": 1716454216835608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216836011, "dur": 23, "args": { "External id": 13991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13991, "pid": 5, "tid": 7, "ts": 1716454216836011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835623, "dur": 12, "args": { "External id": 13991, "cbid": 211, "correlation": 13991 } }, { "ph": "s", "id": 13991, "pid": 76337, "tid": -914061504, "ts": 1716454216835623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216836036, "dur": 321, "args": { "External id": 13993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13993, "pid": 5, "tid": 7, "ts": 1716454216836036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835638, "dur": 5, "args": { "External id": 13993, "cbid": 211, "correlation": 13993 } }, { "ph": "s", "id": 13993, "pid": 76337, "tid": -914061504, "ts": 1716454216835638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216836359, "dur": 1, "args": { "External id": 13995, "device": 5, "context": 1, "stream": 7, "correlation": 13995, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 13995, "pid": 5, "tid": 7, "ts": 1716454216836359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216835652, "dur": 8, "args": { "External id": 13995, "cbid": 51, "correlation": 13995 } }, { "ph": "s", "id": 13995, "pid": 76337, "tid": -914061504, "ts": 1716454216835652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216836363, "dur": 1264, "args": { "External id": 13996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13996, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 13996, "pid": 5, "tid": 7, "ts": 1716454216836363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835662, "dur": 7, "args": { "External id": 13996, "cbid": 211, "correlation": 13996 } }, { "ph": "s", "id": 13996, "pid": 76337, "tid": -914061504, "ts": 1716454216835662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216837628, "dur": 13, "args": { "External id": 13998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 13998, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 13998, "pid": 5, "tid": 7, "ts": 1716454216837628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835675, "dur": 6, "args": { "External id": 13998, "cbid": 211, "correlation": 13998 } }, { "ph": "s", "id": 13998, "pid": 76337, "tid": -914061504, "ts": 1716454216835675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216837642, "dur": 15, "args": { "External id": 14004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14004, "pid": 5, "tid": 7, "ts": 1716454216837642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835705, "dur": 8, "args": { "External id": 14004, "cbid": 211, "correlation": 14004 } }, { "ph": "s", "id": 14004, "pid": 76337, "tid": -914061504, "ts": 1716454216835705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216837658, "dur": 3, "args": { "External id": 14012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14012, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 14012, "pid": 5, "tid": 7, "ts": 1716454216837658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835750, "dur": 10, "args": { "External id": 14012, "cbid": 211, "correlation": 14012 } }, { "ph": "s", "id": 14012, "pid": 76337, "tid": -914061504, "ts": 1716454216835750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216835817, "dur": 1, "args": { "External id": 14028, "cbid": 251, "correlation": 14028 } }, { "ph": "f", "id": 14028, "pid": 76337, "tid": -914061504, "ts": 1716454216835817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216835822, "dur": 0, "args": { "External id": 14030, "cbid": 251, "correlation": 14030 } }, { "ph": "f", "id": 14030, "pid": 76337, "tid": -914061504, "ts": 1716454216835822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216837663, "dur": 14, "args": { "External id": 14031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14031, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14031, "pid": 5, "tid": 7, "ts": 1716454216837663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835824, "dur": 11, "args": { "External id": 14031, "cbid": 211, "correlation": 14031 } }, { "ph": "s", "id": 14031, "pid": 76337, "tid": -914061504, "ts": 1716454216835824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216837678, "dur": 5, "args": { "External id": 14033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14033, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14033, "pid": 5, "tid": 7, "ts": 1716454216837678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835838, "dur": 6, "args": { "External id": 14033, "cbid": 211, "correlation": 14033 } }, { "ph": "s", "id": 14033, "pid": 76337, "tid": -914061504, "ts": 1716454216835838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216837685, "dur": 18, "args": { "External id": 14043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14043, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14043, "pid": 5, "tid": 7, "ts": 1716454216837685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835903, "dur": 12, "args": { "External id": 14043, "cbid": 211, "correlation": 14043 } }, { "ph": "s", "id": 14043, "pid": 76337, "tid": -914061504, "ts": 1716454216835903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216837704, "dur": 18, "args": { "External id": 14063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14063, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 14063, "pid": 5, "tid": 7, "ts": 1716454216837704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216835969, "dur": 19, "args": { "External id": 14063, "cbid": 211, "correlation": 14063 } }, { "ph": "s", "id": 14063, "pid": 76337, "tid": -914061504, "ts": 1716454216835969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216837723, "dur": 4, "args": { "External id": 14075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14075, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 14075, "pid": 5, "tid": 7, "ts": 1716454216837723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836011, "dur": 8, "args": { "External id": 14075, "cbid": 211, "correlation": 14075 } }, { "ph": "s", "id": 14075, "pid": 76337, "tid": -914061504, "ts": 1716454216836011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216837729, "dur": 17, "args": { "External id": 14078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14078, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14078, "pid": 5, "tid": 7, "ts": 1716454216837729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836033, "dur": 7, "args": { "External id": 14078, "cbid": 211, "correlation": 14078 } }, { "ph": "s", "id": 14078, "pid": 76337, "tid": -914061504, "ts": 1716454216836033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216837747, "dur": 12, "args": { "External id": 14087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14087, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14087, "pid": 5, "tid": 7, "ts": 1716454216837747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836077, "dur": 10, "args": { "External id": 14087, "cbid": 211, "correlation": 14087 } }, { "ph": "s", "id": 14087, "pid": 76337, "tid": -914061504, "ts": 1716454216836077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216836141, "dur": 0, "args": { "External id": 14097, "cbid": 317, "correlation": 14097 } }, { "ph": "f", "id": 14097, "pid": 76337, "tid": -914061504, "ts": 1716454216836141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216836141, "dur": 0, "args": { "External id": 14098, "cbid": 203, "correlation": 14098 } }, { "ph": "f", "id": 14098, "pid": 76337, "tid": -914061504, "ts": 1716454216836141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216836142, "dur": 0, "args": { "External id": 14099, "cbid": 205, "correlation": 14099 } }, { "ph": "f", "id": 14099, "pid": 76337, "tid": -914061504, "ts": 1716454216836142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216837760, "dur": 12, "args": { "External id": 14103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14103, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14103, "pid": 5, "tid": 7, "ts": 1716454216837760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836158, "dur": 13, "args": { "External id": 14103, "cbid": 211, "correlation": 14103 } }, { "ph": "s", "id": 14103, "pid": 76337, "tid": -914061504, "ts": 1716454216836158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216837773, "dur": 166, "args": { "External id": 14105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14105, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14105, "pid": 5, "tid": 7, "ts": 1716454216837773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836173, "dur": 5, "args": { "External id": 14105, "cbid": 211, "correlation": 14105 } }, { "ph": "s", "id": 14105, "pid": 76337, "tid": -914061504, "ts": 1716454216836173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216837941, "dur": 1, "args": { "External id": 14107, "device": 5, "context": 1, "stream": 7, "correlation": 14107, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 14107, "pid": 5, "tid": 7, "ts": 1716454216837941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216836184, "dur": 7, "args": { "External id": 14107, "cbid": 51, "correlation": 14107 } }, { "ph": "s", "id": 14107, "pid": 76337, "tid": -914061504, "ts": 1716454216836184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216837945, "dur": 653, "args": { "External id": 14108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14108, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14108, "pid": 5, "tid": 7, "ts": 1716454216837945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836192, "dur": 6, "args": { "External id": 14108, "cbid": 211, "correlation": 14108 } }, { "ph": "s", "id": 14108, "pid": 76337, "tid": -914061504, "ts": 1716454216836192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216838599, "dur": 13, "args": { "External id": 14110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14110, "pid": 5, "tid": 7, "ts": 1716454216838599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836202, "dur": 5, "args": { "External id": 14110, "cbid": 211, "correlation": 14110 } }, { "ph": "s", "id": 14110, "pid": 76337, "tid": -914061504, "ts": 1716454216836202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216838614, "dur": 15, "args": { "External id": 14116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14116, "pid": 5, "tid": 7, "ts": 1716454216838614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836231, "dur": 8, "args": { "External id": 14116, "cbid": 211, "correlation": 14116 } }, { "ph": "s", "id": 14116, "pid": 76337, "tid": -914061504, "ts": 1716454216836231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216836291, "dur": 0, "args": { "External id": 14126, "cbid": 317, "correlation": 14126 } }, { "ph": "f", "id": 14126, "pid": 76337, "tid": -914061504, "ts": 1716454216836291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216836291, "dur": 0, "args": { "External id": 14127, "cbid": 203, "correlation": 14127 } }, { "ph": "f", "id": 14127, "pid": 76337, "tid": -914061504, "ts": 1716454216836291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216836292, "dur": 0, "args": { "External id": 14128, "cbid": 205, "correlation": 14128 } }, { "ph": "f", "id": 14128, "pid": 76337, "tid": -914061504, "ts": 1716454216836292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216838630, "dur": 22, "args": { "External id": 14132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14132, "pid": 5, "tid": 7, "ts": 1716454216838630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836303, "dur": 12, "args": { "External id": 14132, "cbid": 211, "correlation": 14132 } }, { "ph": "s", "id": 14132, "pid": 76337, "tid": -914061504, "ts": 1716454216836303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216838653, "dur": 4, "args": { "External id": 14134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14134, "pid": 5, "tid": 7, "ts": 1716454216838653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836320, "dur": 6, "args": { "External id": 14134, "cbid": 211, "correlation": 14134 } }, { "ph": "s", "id": 14134, "pid": 76337, "tid": -914061504, "ts": 1716454216836320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216836328, "dur": 0, "args": { "External id": 14135, "cbid": 51, "correlation": 14135 } }, { "ph": "s", "id": 14135, "pid": 76337, "tid": -914061504, "ts": 1716454216836328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216838659, "dur": 174, "args": { "External id": 14136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14136, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14136, "pid": 5, "tid": 7, "ts": 1716454216838659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836329, "dur": 5, "args": { "External id": 14136, "cbid": 211, "correlation": 14136 } }, { "ph": "s", "id": 14136, "pid": 76337, "tid": -914061504, "ts": 1716454216836329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216838834, "dur": 16, "args": { "External id": 14141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14141, "pid": 5, "tid": 7, "ts": 1716454216838834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836353, "dur": 9, "args": { "External id": 14141, "cbid": 211, "correlation": 14141 } }, { "ph": "s", "id": 14141, "pid": 76337, "tid": -914061504, "ts": 1716454216836353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216838852, "dur": 12, "args": { "External id": 14149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14149, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14149, "pid": 5, "tid": 7, "ts": 1716454216838852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836383, "dur": 8, "args": { "External id": 14149, "cbid": 211, "correlation": 14149 } }, { "ph": "s", "id": 14149, "pid": 76337, "tid": -914061504, "ts": 1716454216836383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216838865, "dur": 10, "args": { "External id": 14157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14157, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14157, "pid": 5, "tid": 7, "ts": 1716454216838865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836412, "dur": 9, "args": { "External id": 14157, "cbid": 211, "correlation": 14157 } }, { "ph": "s", "id": 14157, "pid": 76337, "tid": -914061504, "ts": 1716454216836412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216838877, "dur": 18, "args": { "External id": 14177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14177, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 14177, "pid": 5, "tid": 7, "ts": 1716454216838877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836498, "dur": 13, "args": { "External id": 14177, "cbid": 211, "correlation": 14177 } }, { "ph": "s", "id": 14177, "pid": 76337, "tid": -914061504, "ts": 1716454216836498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216838896, "dur": 5, "args": { "External id": 14189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14189, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 14189, "pid": 5, "tid": 7, "ts": 1716454216838896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836521, "dur": 6, "args": { "External id": 14189, "cbid": 211, "correlation": 14189 } }, { "ph": "s", "id": 14189, "pid": 76337, "tid": -914061504, "ts": 1716454216836521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216838903, "dur": 17, "args": { "External id": 14192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14192, "pid": 5, "tid": 7, "ts": 1716454216838903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836539, "dur": 6, "args": { "External id": 14192, "cbid": 211, "correlation": 14192 } }, { "ph": "s", "id": 14192, "pid": 76337, "tid": -914061504, "ts": 1716454216836539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216836610, "dur": 0, "args": { "External id": 14203, "cbid": 317, "correlation": 14203 } }, { "ph": "f", "id": 14203, "pid": 76337, "tid": -914061504, "ts": 1716454216836610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216836611, "dur": 0, "args": { "External id": 14204, "cbid": 203, "correlation": 14204 } }, { "ph": "f", "id": 14204, "pid": 76337, "tid": -914061504, "ts": 1716454216836611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216836612, "dur": 0, "args": { "External id": 14205, "cbid": 205, "correlation": 14205 } }, { "ph": "f", "id": 14205, "pid": 76337, "tid": -914061504, "ts": 1716454216836612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216838921, "dur": 11, "args": { "External id": 14209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14209, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14209, "pid": 5, "tid": 7, "ts": 1716454216838921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836625, "dur": 13, "args": { "External id": 14209, "cbid": 211, "correlation": 14209 } }, { "ph": "s", "id": 14209, "pid": 76337, "tid": -914061504, "ts": 1716454216836625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216838934, "dur": 3, "args": { "External id": 14211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14211, "pid": 5, "tid": 7, "ts": 1716454216838934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836642, "dur": 6, "args": { "External id": 14211, "cbid": 211, "correlation": 14211 } }, { "ph": "s", "id": 14211, "pid": 76337, "tid": -914061504, "ts": 1716454216836642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216836651, "dur": 0, "args": { "External id": 14212, "cbid": 51, "correlation": 14212 } }, { "ph": "s", "id": 14212, "pid": 76337, "tid": -914061504, "ts": 1716454216836651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216838938, "dur": 92, "args": { "External id": 14213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14213, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14213, "pid": 5, "tid": 7, "ts": 1716454216838938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836651, "dur": 5, "args": { "External id": 14213, "cbid": 211, "correlation": 14213 } }, { "ph": "s", "id": 14213, "pid": 76337, "tid": -914061504, "ts": 1716454216836651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216839032, "dur": 15, "args": { "External id": 14218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14218, "pid": 5, "tid": 7, "ts": 1716454216839032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836679, "dur": 9, "args": { "External id": 14218, "cbid": 211, "correlation": 14218 } }, { "ph": "s", "id": 14218, "pid": 76337, "tid": -914061504, "ts": 1716454216836679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216839049, "dur": 43, "args": { "External id": 14229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14229, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14229, "pid": 5, "tid": 7, "ts": 1716454216839049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836778, "dur": 15, "args": { "External id": 14229, "cbid": 211, "correlation": 14229 } }, { "ph": "s", "id": 14229, "pid": 76337, "tid": -914061504, "ts": 1716454216836778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216839093, "dur": 19, "args": { "External id": 14251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14251, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14251, "pid": 5, "tid": 7, "ts": 1716454216839093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836812, "dur": 8, "args": { "External id": 14251, "cbid": 211, "correlation": 14251 } }, { "ph": "s", "id": 14251, "pid": 76337, "tid": -914061504, "ts": 1716454216836812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216836915, "dur": 2, "args": { "External id": 14262, "cbid": 251, "correlation": 14262 } }, { "ph": "f", "id": 14262, "pid": 76337, "tid": -914061504, "ts": 1716454216836915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839113, "dur": 90, "args": { "External id": 14263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14263, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14263, "pid": 5, "tid": 7, "ts": 1716454216839113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216836921, "dur": 14, "args": { "External id": 14263, "cbid": 211, "correlation": 14263 } }, { "ph": "s", "id": 14263, "pid": 76337, "tid": -914061504, "ts": 1716454216836921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837020, "dur": 1, "args": { "External id": 14274, "cbid": 251, "correlation": 14274 } }, { "ph": "f", "id": 14274, "pid": 76337, "tid": -914061504, "ts": 1716454216837020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839205, "dur": 85, "args": { "External id": 14275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14275, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14275, "pid": 5, "tid": 7, "ts": 1716454216839205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837024, "dur": 13, "args": { "External id": 14275, "cbid": 211, "correlation": 14275 } }, { "ph": "s", "id": 14275, "pid": 76337, "tid": -914061504, "ts": 1716454216837024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837104, "dur": 1, "args": { "External id": 14286, "cbid": 251, "correlation": 14286 } }, { "ph": "f", "id": 14286, "pid": 76337, "tid": -914061504, "ts": 1716454216837104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839292, "dur": 86, "args": { "External id": 14287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14287, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14287, "pid": 5, "tid": 7, "ts": 1716454216839292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837108, "dur": 13, "args": { "External id": 14287, "cbid": 211, "correlation": 14287 } }, { "ph": "s", "id": 14287, "pid": 76337, "tid": -914061504, "ts": 1716454216837108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216839379, "dur": 139, "args": { "External id": 14312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14312, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14312, "pid": 5, "tid": 7, "ts": 1716454216839379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837194, "dur": 14, "args": { "External id": 14312, "cbid": 211, "correlation": 14312 } }, { "ph": "s", "id": 14312, "pid": 76337, "tid": -914061504, "ts": 1716454216837194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837296, "dur": 1, "args": { "External id": 14330, "cbid": 251, "correlation": 14330 } }, { "ph": "f", "id": 14330, "pid": 76337, "tid": -914061504, "ts": 1716454216837296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839518, "dur": 92, "args": { "External id": 14332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14332, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14332, "pid": 5, "tid": 7, "ts": 1716454216839518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837302, "dur": 14, "args": { "External id": 14332, "cbid": 211, "correlation": 14332 } }, { "ph": "s", "id": 14332, "pid": 76337, "tid": -914061504, "ts": 1716454216837302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216839612, "dur": 10, "args": { "External id": 14340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14340, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14340, "pid": 5, "tid": 7, "ts": 1716454216839612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837374, "dur": 12, "args": { "External id": 14340, "cbid": 211, "correlation": 14340 } }, { "ph": "s", "id": 14340, "pid": 76337, "tid": -914061504, "ts": 1716454216837374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216839623, "dur": 46, "args": { "External id": 14348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14348, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14348, "pid": 5, "tid": 7, "ts": 1716454216839623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837415, "dur": 10, "args": { "External id": 14348, "cbid": 211, "correlation": 14348 } }, { "ph": "s", "id": 14348, "pid": 76337, "tid": -914061504, "ts": 1716454216837415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216839671, "dur": 19, "args": { "External id": 14370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14370, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14370, "pid": 5, "tid": 7, "ts": 1716454216839671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837481, "dur": 11, "args": { "External id": 14370, "cbid": 211, "correlation": 14370 } }, { "ph": "s", "id": 14370, "pid": 76337, "tid": -914061504, "ts": 1716454216837481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837570, "dur": 1, "args": { "External id": 14381, "cbid": 251, "correlation": 14381 } }, { "ph": "f", "id": 14381, "pid": 76337, "tid": -914061504, "ts": 1716454216837570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839691, "dur": 86, "args": { "External id": 14382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14382, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14382, "pid": 5, "tid": 7, "ts": 1716454216839691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837575, "dur": 13, "args": { "External id": 14382, "cbid": 211, "correlation": 14382 } }, { "ph": "s", "id": 14382, "pid": 76337, "tid": -914061504, "ts": 1716454216837575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837659, "dur": 1, "args": { "External id": 14393, "cbid": 251, "correlation": 14393 } }, { "ph": "f", "id": 14393, "pid": 76337, "tid": -914061504, "ts": 1716454216837659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837663, "dur": 0, "args": { "External id": 14394, "cbid": 251, "correlation": 14394 } }, { "ph": "f", "id": 14394, "pid": 76337, "tid": -914061504, "ts": 1716454216837663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216839778, "dur": 12, "args": { "External id": 14395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14395, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14395, "pid": 5, "tid": 7, "ts": 1716454216839778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837665, "dur": 13, "args": { "External id": 14395, "cbid": 211, "correlation": 14395 } }, { "ph": "s", "id": 14395, "pid": 76337, "tid": -914061504, "ts": 1716454216837665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216839791, "dur": 5, "args": { "External id": 14397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14397, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14397, "pid": 5, "tid": 7, "ts": 1716454216839791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837680, "dur": 6, "args": { "External id": 14397, "cbid": 211, "correlation": 14397 } }, { "ph": "s", "id": 14397, "pid": 76337, "tid": -914061504, "ts": 1716454216837680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837740, "dur": 1, "args": { "External id": 14408, "cbid": 251, "correlation": 14408 } }, { "ph": "f", "id": 14408, "pid": 76337, "tid": -914061504, "ts": 1716454216837740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837743, "dur": 0, "args": { "External id": 14409, "cbid": 251, "correlation": 14409 } }, { "ph": "f", "id": 14409, "pid": 76337, "tid": -914061504, "ts": 1716454216837743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216839798, "dur": 8, "args": { "External id": 14410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14410, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14410, "pid": 5, "tid": 7, "ts": 1716454216839798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837745, "dur": 12, "args": { "External id": 14410, "cbid": 211, "correlation": 14410 } }, { "ph": "s", "id": 14410, "pid": 76337, "tid": -914061504, "ts": 1716454216837745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216839808, "dur": 3, "args": { "External id": 14412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14412, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14412, "pid": 5, "tid": 7, "ts": 1716454216839808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837758, "dur": 6, "args": { "External id": 14412, "cbid": 211, "correlation": 14412 } }, { "ph": "s", "id": 14412, "pid": 76337, "tid": -914061504, "ts": 1716454216837758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216839812, "dur": 56, "args": { "External id": 14437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14437, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14437, "pid": 5, "tid": 7, "ts": 1716454216839812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837835, "dur": 13, "args": { "External id": 14437, "cbid": 211, "correlation": 14437 } }, { "ph": "s", "id": 14437, "pid": 76337, "tid": -914061504, "ts": 1716454216837835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216837934, "dur": 1, "args": { "External id": 14455, "cbid": 251, "correlation": 14455 } }, { "ph": "f", "id": 14455, "pid": 76337, "tid": -914061504, "ts": 1716454216837934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216839869, "dur": 87, "args": { "External id": 14457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14457, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14457, "pid": 5, "tid": 7, "ts": 1716454216839869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216837940, "dur": 14, "args": { "External id": 14457, "cbid": 211, "correlation": 14457 } }, { "ph": "s", "id": 14457, "pid": 76337, "tid": -914061504, "ts": 1716454216837940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216839957, "dur": 10, "args": { "External id": 14465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14465, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14465, "pid": 5, "tid": 7, "ts": 1716454216839957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838020, "dur": 12, "args": { "External id": 14465, "cbid": 211, "correlation": 14465 } }, { "ph": "s", "id": 14465, "pid": 76337, "tid": -914061504, "ts": 1716454216838020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216839968, "dur": 13, "args": { "External id": 14473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14473, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14473, "pid": 5, "tid": 7, "ts": 1716454216839968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838060, "dur": 9, "args": { "External id": 14473, "cbid": 211, "correlation": 14473 } }, { "ph": "s", "id": 14473, "pid": 76337, "tid": -914061504, "ts": 1716454216838060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216839981, "dur": 17, "args": { "External id": 14495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14495, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14495, "pid": 5, "tid": 7, "ts": 1716454216839981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838111, "dur": 10, "args": { "External id": 14495, "cbid": 211, "correlation": 14495 } }, { "ph": "s", "id": 14495, "pid": 76337, "tid": -914061504, "ts": 1716454216838111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216838200, "dur": 1, "args": { "External id": 14511, "cbid": 251, "correlation": 14511 } }, { "ph": "f", "id": 14511, "pid": 76337, "tid": -914061504, "ts": 1716454216838200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216838205, "dur": 0, "args": { "External id": 14513, "cbid": 251, "correlation": 14513 } }, { "ph": "f", "id": 14513, "pid": 76337, "tid": -914061504, "ts": 1716454216838205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216840000, "dur": 492, "args": { "External id": 14514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14514, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14514, "pid": 5, "tid": 7, "ts": 1716454216840000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838207, "dur": 13, "args": { "External id": 14514, "cbid": 211, "correlation": 14514 } }, { "ph": "s", "id": 14514, "pid": 76337, "tid": -914061504, "ts": 1716454216838207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216840493, "dur": 68, "args": { "External id": 14522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14522, "pid": 5, "tid": 7, "ts": 1716454216840493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838274, "dur": 12, "args": { "External id": 14522, "cbid": 211, "correlation": 14522 } }, { "ph": "s", "id": 14522, "pid": 76337, "tid": -914061504, "ts": 1716454216838274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216840562, "dur": 67, "args": { "External id": 14530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14530, "pid": 5, "tid": 7, "ts": 1716454216840562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838305, "dur": 9, "args": { "External id": 14530, "cbid": 211, "correlation": 14530 } }, { "ph": "s", "id": 14530, "pid": 76337, "tid": -914061504, "ts": 1716454216838305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216838386, "dur": 1, "args": { "External id": 14546, "cbid": 251, "correlation": 14546 } }, { "ph": "f", "id": 14546, "pid": 76337, "tid": -914061504, "ts": 1716454216838386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216840632, "dur": 1, "args": { "External id": 14548, "device": 5, "context": 1, "stream": 7, "correlation": 14548, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 14548, "pid": 5, "tid": 7, "ts": 1716454216840632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216838391, "dur": 9, "args": { "External id": 14548, "cbid": 51, "correlation": 14548 } }, { "ph": "s", "id": 14548, "pid": 76337, "tid": -914061504, "ts": 1716454216838391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216840635, "dur": 271, "args": { "External id": 14549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14549, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14549, "pid": 5, "tid": 7, "ts": 1716454216840635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838401, "dur": 12, "args": { "External id": 14549, "cbid": 211, "correlation": 14549 } }, { "ph": "s", "id": 14549, "pid": 76337, "tid": -914061504, "ts": 1716454216838401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216840908, "dur": 13, "args": { "External id": 14557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14557, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14557, "pid": 5, "tid": 7, "ts": 1716454216840908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838444, "dur": 10, "args": { "External id": 14557, "cbid": 211, "correlation": 14557 } }, { "ph": "s", "id": 14557, "pid": 76337, "tid": -914061504, "ts": 1716454216838444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216840922, "dur": 38, "args": { "External id": 14568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14568, "pid": 5, "tid": 7, "ts": 1716454216840922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838509, "dur": 13, "args": { "External id": 14568, "cbid": 211, "correlation": 14568 } }, { "ph": "s", "id": 14568, "pid": 76337, "tid": -914061504, "ts": 1716454216838509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216838574, "dur": 0, "args": { "External id": 14580, "cbid": 317, "correlation": 14580 } }, { "ph": "f", "id": 14580, "pid": 76337, "tid": -914061504, "ts": 1716454216838574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216838575, "dur": 0, "args": { "External id": 14581, "cbid": 203, "correlation": 14581 } }, { "ph": "f", "id": 14581, "pid": 76337, "tid": -914061504, "ts": 1716454216838575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216838575, "dur": 0, "args": { "External id": 14582, "cbid": 205, "correlation": 14582 } }, { "ph": "f", "id": 14582, "pid": 76337, "tid": -914061504, "ts": 1716454216838575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216840962, "dur": 14, "args": { "External id": 14586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14586, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14586, "pid": 5, "tid": 7, "ts": 1716454216840962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838590, "dur": 12, "args": { "External id": 14586, "cbid": 211, "correlation": 14586 } }, { "ph": "s", "id": 14586, "pid": 76337, "tid": -914061504, "ts": 1716454216838590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216840978, "dur": 4, "args": { "External id": 14588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14588, "pid": 5, "tid": 7, "ts": 1716454216840978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838606, "dur": 6, "args": { "External id": 14588, "cbid": 211, "correlation": 14588 } }, { "ph": "s", "id": 14588, "pid": 76337, "tid": -914061504, "ts": 1716454216838606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216838615, "dur": 0, "args": { "External id": 14589, "cbid": 51, "correlation": 14589 } }, { "ph": "s", "id": 14589, "pid": 76337, "tid": -914061504, "ts": 1716454216838615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216840983, "dur": 99, "args": { "External id": 14590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14590, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14590, "pid": 5, "tid": 7, "ts": 1716454216840983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838615, "dur": 5, "args": { "External id": 14590, "cbid": 211, "correlation": 14590 } }, { "ph": "s", "id": 14590, "pid": 76337, "tid": -914061504, "ts": 1716454216838615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216841084, "dur": 17, "args": { "External id": 14595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14595, "pid": 5, "tid": 7, "ts": 1716454216841084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838642, "dur": 8, "args": { "External id": 14595, "cbid": 211, "correlation": 14595 } }, { "ph": "s", "id": 14595, "pid": 76337, "tid": -914061504, "ts": 1716454216838642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216841102, "dur": 12, "args": { "External id": 14603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14603, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14603, "pid": 5, "tid": 7, "ts": 1716454216841102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838673, "dur": 9, "args": { "External id": 14603, "cbid": 211, "correlation": 14603 } }, { "ph": "s", "id": 14603, "pid": 76337, "tid": -914061504, "ts": 1716454216838673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216841115, "dur": 25, "args": { "External id": 14612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14612, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14612, "pid": 5, "tid": 7, "ts": 1716454216841115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838715, "dur": 10, "args": { "External id": 14612, "cbid": 211, "correlation": 14612 } }, { "ph": "s", "id": 14612, "pid": 76337, "tid": -914061504, "ts": 1716454216838715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216841141, "dur": 25, "args": { "External id": 14632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14632, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 14632, "pid": 5, "tid": 7, "ts": 1716454216841141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838787, "dur": 12, "args": { "External id": 14632, "cbid": 211, "correlation": 14632 } }, { "ph": "s", "id": 14632, "pid": 76337, "tid": -914061504, "ts": 1716454216838787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216841167, "dur": 5, "args": { "External id": 14644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14644, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14644, "pid": 5, "tid": 7, "ts": 1716454216841167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838809, "dur": 6, "args": { "External id": 14644, "cbid": 211, "correlation": 14644 } }, { "ph": "s", "id": 14644, "pid": 76337, "tid": -914061504, "ts": 1716454216838809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216841174, "dur": 26, "args": { "External id": 14647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14647, "pid": 5, "tid": 7, "ts": 1716454216841174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838827, "dur": 7, "args": { "External id": 14647, "cbid": 211, "correlation": 14647 } }, { "ph": "s", "id": 14647, "pid": 76337, "tid": -914061504, "ts": 1716454216838827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216841201, "dur": 18, "args": { "External id": 14656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14656, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14656, "pid": 5, "tid": 7, "ts": 1716454216841201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216838866, "dur": 10, "args": { "External id": 14656, "cbid": 211, "correlation": 14656 } }, { "ph": "s", "id": 14656, "pid": 76337, "tid": -914061504, "ts": 1716454216838866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216838932, "dur": 0, "args": { "External id": 14666, "cbid": 317, "correlation": 14666 } }, { "ph": "f", "id": 14666, "pid": 76337, "tid": -914061504, "ts": 1716454216838932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216838933, "dur": 0, "args": { "External id": 14667, "cbid": 203, "correlation": 14667 } }, { "ph": "f", "id": 14667, "pid": 76337, "tid": -914061504, "ts": 1716454216838933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216838934, "dur": 0, "args": { "External id": 14668, "cbid": 205, "correlation": 14668 } }, { "ph": "f", "id": 14668, "pid": 76337, "tid": -914061504, "ts": 1716454216838934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216842148, "dur": 17, "args": { "External id": 14672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14672, "pid": 5, "tid": 7, "ts": 1716454216842148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216842115, "dur": 33, "args": { "External id": 14672, "cbid": 211, "correlation": 14672 } }, { "ph": "s", "id": 14672, "pid": 76337, "tid": -914061504, "ts": 1716454216842115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216842167, "dur": 244, "args": { "External id": 14674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14674, "pid": 5, "tid": 7, "ts": 1716454216842167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216842151, "dur": 6, "args": { "External id": 14674, "cbid": 211, "correlation": 14674 } }, { "ph": "s", "id": 14674, "pid": 76337, "tid": -914061504, "ts": 1716454216842151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216842414, "dur": 1, "args": { "External id": 14676, "device": 5, "context": 1, "stream": 7, "correlation": 14676, "bytes": 960, "memory bandwidth (GB/s)": 0.4918032786885246 } }, { "ph": "f", "id": 14676, "pid": 5, "tid": 7, "ts": 1716454216842414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216842166, "dur": 14, "args": { "External id": 14676, "cbid": 51, "correlation": 14676 } }, { "ph": "s", "id": 14676, "pid": 76337, "tid": -914061504, "ts": 1716454216842166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216842418, "dur": 815, "args": { "External id": 14677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14677, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14677, "pid": 5, "tid": 7, "ts": 1716454216842418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216842182, "dur": 10, "args": { "External id": 14677, "cbid": 211, "correlation": 14677 } }, { "ph": "s", "id": 14677, "pid": 76337, "tid": -914061504, "ts": 1716454216842182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216843234, "dur": 13, "args": { "External id": 14679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14679, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14679, "pid": 5, "tid": 7, "ts": 1716454216843234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216842198, "dur": 7, "args": { "External id": 14679, "cbid": 211, "correlation": 14679 } }, { "ph": "s", "id": 14679, "pid": 76337, "tid": -914061504, "ts": 1716454216842198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216843248, "dur": 14, "args": { "External id": 14685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14685, "pid": 5, "tid": 7, "ts": 1716454216843248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216842982, "dur": 12, "args": { "External id": 14685, "cbid": 211, "correlation": 14685 } }, { "ph": "s", "id": 14685, "pid": 76337, "tid": -914061504, "ts": 1716454216842982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216843264, "dur": 3, "args": { "External id": 14693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14693, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 14693, "pid": 5, "tid": 7, "ts": 1716454216843264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843076, "dur": 11, "args": { "External id": 14693, "cbid": 211, "correlation": 14693 } }, { "ph": "s", "id": 14693, "pid": 76337, "tid": -914061504, "ts": 1716454216843076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216843190, "dur": 3, "args": { "External id": 14709, "cbid": 251, "correlation": 14709 } }, { "ph": "f", "id": 14709, "pid": 76337, "tid": -914061504, "ts": 1716454216843190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216843199, "dur": 0, "args": { "External id": 14711, "cbid": 251, "correlation": 14711 } }, { "ph": "f", "id": 14711, "pid": 76337, "tid": -914061504, "ts": 1716454216843199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216843269, "dur": 13, "args": { "External id": 14712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14712, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14712, "pid": 5, "tid": 7, "ts": 1716454216843269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843201, "dur": 14, "args": { "External id": 14712, "cbid": 211, "correlation": 14712 } }, { "ph": "s", "id": 14712, "pid": 76337, "tid": -914061504, "ts": 1716454216843201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216843283, "dur": 5, "args": { "External id": 14714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14714, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14714, "pid": 5, "tid": 7, "ts": 1716454216843283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843219, "dur": 7, "args": { "External id": 14714, "cbid": 211, "correlation": 14714 } }, { "ph": "s", "id": 14714, "pid": 76337, "tid": -914061504, "ts": 1716454216843219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216843313, "dur": 17, "args": { "External id": 14724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14724, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14724, "pid": 5, "tid": 7, "ts": 1716454216843313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843301, "dur": 12, "args": { "External id": 14724, "cbid": 211, "correlation": 14724 } }, { "ph": "s", "id": 14724, "pid": 76337, "tid": -914061504, "ts": 1716454216843301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216843406, "dur": 18, "args": { "External id": 14744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14744, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 14744, "pid": 5, "tid": 7, "ts": 1716454216843406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843394, "dur": 12, "args": { "External id": 14744, "cbid": 211, "correlation": 14744 } }, { "ph": "s", "id": 14744, "pid": 76337, "tid": -914061504, "ts": 1716454216843394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216843428, "dur": 4, "args": { "External id": 14756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14756, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 14756, "pid": 5, "tid": 7, "ts": 1716454216843428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843417, "dur": 8, "args": { "External id": 14756, "cbid": 211, "correlation": 14756 } }, { "ph": "s", "id": 14756, "pid": 76337, "tid": -914061504, "ts": 1716454216843417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216843453, "dur": 17, "args": { "External id": 14759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14759, "pid": 5, "tid": 7, "ts": 1716454216843453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843445, "dur": 8, "args": { "External id": 14759, "cbid": 211, "correlation": 14759 } }, { "ph": "s", "id": 14759, "pid": 76337, "tid": -914061504, "ts": 1716454216843445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216843500, "dur": 12, "args": { "External id": 14768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14768, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14768, "pid": 5, "tid": 7, "ts": 1716454216843500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843489, "dur": 10, "args": { "External id": 14768, "cbid": 211, "correlation": 14768 } }, { "ph": "s", "id": 14768, "pid": 76337, "tid": -914061504, "ts": 1716454216843489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216843580, "dur": 1, "args": { "External id": 14778, "cbid": 317, "correlation": 14778 } }, { "ph": "f", "id": 14778, "pid": 76337, "tid": -914061504, "ts": 1716454216843580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216843582, "dur": 0, "args": { "External id": 14779, "cbid": 203, "correlation": 14779 } }, { "ph": "f", "id": 14779, "pid": 76337, "tid": -914061504, "ts": 1716454216843582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216843583, "dur": 1, "args": { "External id": 14780, "cbid": 205, "correlation": 14780 } }, { "ph": "f", "id": 14780, "pid": 76337, "tid": -914061504, "ts": 1716454216843583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216843614, "dur": 12, "args": { "External id": 14784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14784, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14784, "pid": 5, "tid": 7, "ts": 1716454216843614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843601, "dur": 12, "args": { "External id": 14784, "cbid": 211, "correlation": 14784 } }, { "ph": "s", "id": 14784, "pid": 76337, "tid": -914061504, "ts": 1716454216843601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216843627, "dur": 165, "args": { "External id": 14786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14786, "pid": 5, "tid": 7, "ts": 1716454216843627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843616, "dur": 6, "args": { "External id": 14786, "cbid": 211, "correlation": 14786 } }, { "ph": "s", "id": 14786, "pid": 76337, "tid": -914061504, "ts": 1716454216843616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216843794, "dur": 1, "args": { "External id": 14788, "device": 5, "context": 1, "stream": 7, "correlation": 14788, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 14788, "pid": 5, "tid": 7, "ts": 1716454216843794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216843629, "dur": 7, "args": { "External id": 14788, "cbid": 51, "correlation": 14788 } }, { "ph": "s", "id": 14788, "pid": 76337, "tid": -914061504, "ts": 1716454216843629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216843798, "dur": 653, "args": { "External id": 14789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14789, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14789, "pid": 5, "tid": 7, "ts": 1716454216843798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843638, "dur": 6, "args": { "External id": 14789, "cbid": 211, "correlation": 14789 } }, { "ph": "s", "id": 14789, "pid": 76337, "tid": -914061504, "ts": 1716454216843638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216844453, "dur": 13, "args": { "External id": 14791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14791, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14791, "pid": 5, "tid": 7, "ts": 1716454216844453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843648, "dur": 6, "args": { "External id": 14791, "cbid": 211, "correlation": 14791 } }, { "ph": "s", "id": 14791, "pid": 76337, "tid": -914061504, "ts": 1716454216843648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216844467, "dur": 15, "args": { "External id": 14797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14797, "pid": 5, "tid": 7, "ts": 1716454216844467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216843679, "dur": 9, "args": { "External id": 14797, "cbid": 211, "correlation": 14797 } }, { "ph": "s", "id": 14797, "pid": 76337, "tid": -914061504, "ts": 1716454216843679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216843739, "dur": 0, "args": { "External id": 14807, "cbid": 317, "correlation": 14807 } }, { "ph": "f", "id": 14807, "pid": 76337, "tid": -914061504, "ts": 1716454216843739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216843740, "dur": 0, "args": { "External id": 14808, "cbid": 203, "correlation": 14808 } }, { "ph": "f", "id": 14808, "pid": 76337, "tid": -914061504, "ts": 1716454216843740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216843740, "dur": 0, "args": { "External id": 14809, "cbid": 205, "correlation": 14809 } }, { "ph": "f", "id": 14809, "pid": 76337, "tid": -914061504, "ts": 1716454216843740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216845551, "dur": 17, "args": { "External id": 14813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14813, "pid": 5, "tid": 7, "ts": 1716454216845551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216845531, "dur": 20, "args": { "External id": 14813, "cbid": 211, "correlation": 14813 } }, { "ph": "s", "id": 14813, "pid": 76337, "tid": -914061504, "ts": 1716454216845531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216845569, "dur": 4, "args": { "External id": 14815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14815, "pid": 5, "tid": 7, "ts": 1716454216845569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216845556, "dur": 9, "args": { "External id": 14815, "cbid": 211, "correlation": 14815 } }, { "ph": "s", "id": 14815, "pid": 76337, "tid": -914061504, "ts": 1716454216845556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216845568, "dur": 0, "args": { "External id": 14816, "cbid": 51, "correlation": 14816 } }, { "ph": "s", "id": 14816, "pid": 76337, "tid": -914061504, "ts": 1716454216845568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216845579, "dur": 133, "args": { "External id": 14817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14817, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14817, "pid": 5, "tid": 7, "ts": 1716454216845579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216845570, "dur": 8, "args": { "External id": 14817, "cbid": 211, "correlation": 14817 } }, { "ph": "s", "id": 14817, "pid": 76337, "tid": -914061504, "ts": 1716454216845570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216846046, "dur": 16, "args": { "External id": 14822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14822, "pid": 5, "tid": 7, "ts": 1716454216846046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846033, "dur": 11, "args": { "External id": 14822, "cbid": 211, "correlation": 14822 } }, { "ph": "s", "id": 14822, "pid": 76337, "tid": -914061504, "ts": 1716454216846033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216846086, "dur": 12, "args": { "External id": 14830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14830, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14830, "pid": 5, "tid": 7, "ts": 1716454216846086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846076, "dur": 9, "args": { "External id": 14830, "cbid": 211, "correlation": 14830 } }, { "ph": "s", "id": 14830, "pid": 76337, "tid": -914061504, "ts": 1716454216846076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216846138, "dur": 11, "args": { "External id": 14838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14838, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14838, "pid": 5, "tid": 7, "ts": 1716454216846138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846127, "dur": 11, "args": { "External id": 14838, "cbid": 211, "correlation": 14838 } }, { "ph": "s", "id": 14838, "pid": 76337, "tid": -914061504, "ts": 1716454216846127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216846269, "dur": 19, "args": { "External id": 14858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14858, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 14858, "pid": 5, "tid": 7, "ts": 1716454216846269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846256, "dur": 13, "args": { "External id": 14858, "cbid": 211, "correlation": 14858 } }, { "ph": "s", "id": 14858, "pid": 76337, "tid": -914061504, "ts": 1716454216846256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216846289, "dur": 4, "args": { "External id": 14870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14870, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 14870, "pid": 5, "tid": 7, "ts": 1716454216846289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846279, "dur": 8, "args": { "External id": 14870, "cbid": 211, "correlation": 14870 } }, { "ph": "s", "id": 14870, "pid": 76337, "tid": -914061504, "ts": 1716454216846279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216846310, "dur": 17, "args": { "External id": 14873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14873, "pid": 5, "tid": 7, "ts": 1716454216846310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846302, "dur": 7, "args": { "External id": 14873, "cbid": 211, "correlation": 14873 } }, { "ph": "s", "id": 14873, "pid": 76337, "tid": -914061504, "ts": 1716454216846302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216846385, "dur": 0, "args": { "External id": 14884, "cbid": 317, "correlation": 14884 } }, { "ph": "f", "id": 14884, "pid": 76337, "tid": -914061504, "ts": 1716454216846385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216846386, "dur": 0, "args": { "External id": 14885, "cbid": 203, "correlation": 14885 } }, { "ph": "f", "id": 14885, "pid": 76337, "tid": -914061504, "ts": 1716454216846386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216846387, "dur": 0, "args": { "External id": 14886, "cbid": 205, "correlation": 14886 } }, { "ph": "f", "id": 14886, "pid": 76337, "tid": -914061504, "ts": 1716454216846387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216846420, "dur": 12, "args": { "External id": 14890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14890, "pid": 5, "tid": 7, "ts": 1716454216846420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846407, "dur": 13, "args": { "External id": 14890, "cbid": 211, "correlation": 14890 } }, { "ph": "s", "id": 14890, "pid": 76337, "tid": -914061504, "ts": 1716454216846407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216846434, "dur": 3, "args": { "External id": 14892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14892, "pid": 5, "tid": 7, "ts": 1716454216846434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846425, "dur": 6, "args": { "External id": 14892, "cbid": 211, "correlation": 14892 } }, { "ph": "s", "id": 14892, "pid": 76337, "tid": -914061504, "ts": 1716454216846425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216846434, "dur": 0, "args": { "External id": 14893, "cbid": 51, "correlation": 14893 } }, { "ph": "s", "id": 14893, "pid": 76337, "tid": -914061504, "ts": 1716454216846434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216846443, "dur": 93, "args": { "External id": 14894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14894, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 14894, "pid": 5, "tid": 7, "ts": 1716454216846443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846435, "dur": 7, "args": { "External id": 14894, "cbid": 211, "correlation": 14894 } }, { "ph": "s", "id": 14894, "pid": 76337, "tid": -914061504, "ts": 1716454216846435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216846538, "dur": 15, "args": { "External id": 14899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14899, "pid": 5, "tid": 7, "ts": 1716454216846538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846465, "dur": 9, "args": { "External id": 14899, "cbid": 211, "correlation": 14899 } }, { "ph": "s", "id": 14899, "pid": 76337, "tid": -914061504, "ts": 1716454216846465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216846626, "dur": 44, "args": { "External id": 14910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14910, "pid": 5, "tid": 7, "ts": 1716454216846626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846609, "dur": 17, "args": { "External id": 14910, "cbid": 211, "correlation": 14910 } }, { "ph": "s", "id": 14910, "pid": 76337, "tid": -914061504, "ts": 1716454216846609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216846676, "dur": 19, "args": { "External id": 14932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14932, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 14932, "pid": 5, "tid": 7, "ts": 1716454216846676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846663, "dur": 12, "args": { "External id": 14932, "cbid": 211, "correlation": 14932 } }, { "ph": "s", "id": 14932, "pid": 76337, "tid": -914061504, "ts": 1716454216846663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216846812, "dur": 3, "args": { "External id": 14943, "cbid": 251, "correlation": 14943 } }, { "ph": "f", "id": 14943, "pid": 76337, "tid": -914061504, "ts": 1716454216846812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216846839, "dur": 91, "args": { "External id": 14944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14944, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14944, "pid": 5, "tid": 7, "ts": 1716454216846839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846823, "dur": 16, "args": { "External id": 14944, "cbid": 211, "correlation": 14944 } }, { "ph": "s", "id": 14944, "pid": 76337, "tid": -914061504, "ts": 1716454216846823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216846916, "dur": 1, "args": { "External id": 14955, "cbid": 251, "correlation": 14955 } }, { "ph": "f", "id": 14955, "pid": 76337, "tid": -914061504, "ts": 1716454216846916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216846935, "dur": 85, "args": { "External id": 14956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14956, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14956, "pid": 5, "tid": 7, "ts": 1716454216846935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216846920, "dur": 13, "args": { "External id": 14956, "cbid": 211, "correlation": 14956 } }, { "ph": "s", "id": 14956, "pid": 76337, "tid": -914061504, "ts": 1716454216846920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847000, "dur": 1, "args": { "External id": 14967, "cbid": 251, "correlation": 14967 } }, { "ph": "f", "id": 14967, "pid": 76337, "tid": -914061504, "ts": 1716454216847000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216847021, "dur": 86, "args": { "External id": 14968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14968, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 14968, "pid": 5, "tid": 7, "ts": 1716454216847021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847004, "dur": 12, "args": { "External id": 14968, "cbid": 211, "correlation": 14968 } }, { "ph": "s", "id": 14968, "pid": 76337, "tid": -914061504, "ts": 1716454216847004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216847134, "dur": 139, "args": { "External id": 14993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 14993, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 14993, "pid": 5, "tid": 7, "ts": 1716454216847134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847118, "dur": 16, "args": { "External id": 14993, "cbid": 211, "correlation": 14993 } }, { "ph": "s", "id": 14993, "pid": 76337, "tid": -914061504, "ts": 1716454216847118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847249, "dur": 2, "args": { "External id": 15011, "cbid": 251, "correlation": 15011 } }, { "ph": "f", "id": 15011, "pid": 76337, "tid": -914061504, "ts": 1716454216847249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216847274, "dur": 92, "args": { "External id": 15013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15013, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15013, "pid": 5, "tid": 7, "ts": 1716454216847274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847256, "dur": 16, "args": { "External id": 15013, "cbid": 211, "correlation": 15013 } }, { "ph": "s", "id": 15013, "pid": 76337, "tid": -914061504, "ts": 1716454216847256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216847368, "dur": 9, "args": { "External id": 15021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15021, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15021, "pid": 5, "tid": 7, "ts": 1716454216847368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847341, "dur": 12, "args": { "External id": 15021, "cbid": 211, "correlation": 15021 } }, { "ph": "s", "id": 15021, "pid": 76337, "tid": -914061504, "ts": 1716454216847341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216847394, "dur": 46, "args": { "External id": 15029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15029, "pid": 5, "tid": 7, "ts": 1716454216847394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847384, "dur": 10, "args": { "External id": 15029, "cbid": 211, "correlation": 15029 } }, { "ph": "s", "id": 15029, "pid": 76337, "tid": -914061504, "ts": 1716454216847384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216847449, "dur": 18, "args": { "External id": 15051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15051, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15051, "pid": 5, "tid": 7, "ts": 1716454216847449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847437, "dur": 11, "args": { "External id": 15051, "cbid": 211, "correlation": 15051 } }, { "ph": "s", "id": 15051, "pid": 76337, "tid": -914061504, "ts": 1716454216847437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847524, "dur": 1, "args": { "External id": 15062, "cbid": 251, "correlation": 15062 } }, { "ph": "f", "id": 15062, "pid": 76337, "tid": -914061504, "ts": 1716454216847524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216847543, "dur": 86, "args": { "External id": 15063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15063, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15063, "pid": 5, "tid": 7, "ts": 1716454216847543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847529, "dur": 13, "args": { "External id": 15063, "cbid": 211, "correlation": 15063 } }, { "ph": "s", "id": 15063, "pid": 76337, "tid": -914061504, "ts": 1716454216847529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847608, "dur": 1, "args": { "External id": 15074, "cbid": 251, "correlation": 15074 } }, { "ph": "f", "id": 15074, "pid": 76337, "tid": -914061504, "ts": 1716454216847608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847612, "dur": 0, "args": { "External id": 15075, "cbid": 251, "correlation": 15075 } }, { "ph": "f", "id": 15075, "pid": 76337, "tid": -914061504, "ts": 1716454216847612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216847630, "dur": 12, "args": { "External id": 15076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15076, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15076, "pid": 5, "tid": 7, "ts": 1716454216847630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847614, "dur": 14, "args": { "External id": 15076, "cbid": 211, "correlation": 15076 } }, { "ph": "s", "id": 15076, "pid": 76337, "tid": -914061504, "ts": 1716454216847614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216847644, "dur": 5, "args": { "External id": 15078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15078, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15078, "pid": 5, "tid": 7, "ts": 1716454216847644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847632, "dur": 8, "args": { "External id": 15078, "cbid": 211, "correlation": 15078 } }, { "ph": "s", "id": 15078, "pid": 76337, "tid": -914061504, "ts": 1716454216847632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847708, "dur": 1, "args": { "External id": 15089, "cbid": 251, "correlation": 15089 } }, { "ph": "f", "id": 15089, "pid": 76337, "tid": -914061504, "ts": 1716454216847708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847711, "dur": 0, "args": { "External id": 15090, "cbid": 251, "correlation": 15090 } }, { "ph": "f", "id": 15090, "pid": 76337, "tid": -914061504, "ts": 1716454216847711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216847726, "dur": 9, "args": { "External id": 15091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15091, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15091, "pid": 5, "tid": 7, "ts": 1716454216847726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847713, "dur": 13, "args": { "External id": 15091, "cbid": 211, "correlation": 15091 } }, { "ph": "s", "id": 15091, "pid": 76337, "tid": -914061504, "ts": 1716454216847713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216847736, "dur": 3, "args": { "External id": 15093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15093, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15093, "pid": 5, "tid": 7, "ts": 1716454216847736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847727, "dur": 6, "args": { "External id": 15093, "cbid": 211, "correlation": 15093 } }, { "ph": "s", "id": 15093, "pid": 76337, "tid": -914061504, "ts": 1716454216847727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216847822, "dur": 56, "args": { "External id": 15118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15118, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15118, "pid": 5, "tid": 7, "ts": 1716454216847822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847809, "dur": 13, "args": { "External id": 15118, "cbid": 211, "correlation": 15118 } }, { "ph": "s", "id": 15118, "pid": 76337, "tid": -914061504, "ts": 1716454216847809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216847924, "dur": 1, "args": { "External id": 15136, "cbid": 251, "correlation": 15136 } }, { "ph": "f", "id": 15136, "pid": 76337, "tid": -914061504, "ts": 1716454216847924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216847944, "dur": 88, "args": { "External id": 15138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15138, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15138, "pid": 5, "tid": 7, "ts": 1716454216847944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216847930, "dur": 14, "args": { "External id": 15138, "cbid": 211, "correlation": 15138 } }, { "ph": "s", "id": 15138, "pid": 76337, "tid": -914061504, "ts": 1716454216847930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216848033, "dur": 10, "args": { "External id": 15146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15146, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15146, "pid": 5, "tid": 7, "ts": 1716454216848033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848012, "dur": 13, "args": { "External id": 15146, "cbid": 211, "correlation": 15146 } }, { "ph": "s", "id": 15146, "pid": 76337, "tid": -914061504, "ts": 1716454216848012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216848063, "dur": 13, "args": { "External id": 15154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15154, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15154, "pid": 5, "tid": 7, "ts": 1716454216848063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848053, "dur": 9, "args": { "External id": 15154, "cbid": 211, "correlation": 15154 } }, { "ph": "s", "id": 15154, "pid": 76337, "tid": -914061504, "ts": 1716454216848053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216848115, "dur": 18, "args": { "External id": 15176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15176, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15176, "pid": 5, "tid": 7, "ts": 1716454216848115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848104, "dur": 10, "args": { "External id": 15176, "cbid": 211, "correlation": 15176 } }, { "ph": "s", "id": 15176, "pid": 76337, "tid": -914061504, "ts": 1716454216848104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216848201, "dur": 2, "args": { "External id": 15192, "cbid": 251, "correlation": 15192 } }, { "ph": "f", "id": 15192, "pid": 76337, "tid": -914061504, "ts": 1716454216848201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216848207, "dur": 0, "args": { "External id": 15194, "cbid": 251, "correlation": 15194 } }, { "ph": "f", "id": 15194, "pid": 76337, "tid": -914061504, "ts": 1716454216848207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216848225, "dur": 495, "args": { "External id": 15195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15195, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15195, "pid": 5, "tid": 7, "ts": 1716454216848225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848209, "dur": 16, "args": { "External id": 15195, "cbid": 211, "correlation": 15195 } }, { "ph": "s", "id": 15195, "pid": 76337, "tid": -914061504, "ts": 1716454216848209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216848721, "dur": 67, "args": { "External id": 15203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15203, "pid": 5, "tid": 7, "ts": 1716454216848721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848303, "dur": 16, "args": { "External id": 15203, "cbid": 211, "correlation": 15203 } }, { "ph": "s", "id": 15203, "pid": 76337, "tid": -914061504, "ts": 1716454216848303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216848789, "dur": 68, "args": { "External id": 15211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15211, "pid": 5, "tid": 7, "ts": 1716454216848789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848346, "dur": 10, "args": { "External id": 15211, "cbid": 211, "correlation": 15211 } }, { "ph": "s", "id": 15211, "pid": 76337, "tid": -914061504, "ts": 1716454216848346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216848429, "dur": 1, "args": { "External id": 15227, "cbid": 251, "correlation": 15227 } }, { "ph": "f", "id": 15227, "pid": 76337, "tid": -914061504, "ts": 1716454216848429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216848859, "dur": 1, "args": { "External id": 15229, "device": 5, "context": 1, "stream": 7, "correlation": 15229, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 15229, "pid": 5, "tid": 7, "ts": 1716454216848859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216848435, "dur": 16, "args": { "External id": 15229, "cbid": 51, "correlation": 15229 } }, { "ph": "s", "id": 15229, "pid": 76337, "tid": -914061504, "ts": 1716454216848435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216848863, "dur": 270, "args": { "External id": 15230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15230, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15230, "pid": 5, "tid": 7, "ts": 1716454216848863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848452, "dur": 11, "args": { "External id": 15230, "cbid": 211, "correlation": 15230 } }, { "ph": "s", "id": 15230, "pid": 76337, "tid": -914061504, "ts": 1716454216848452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216849134, "dur": 13, "args": { "External id": 15238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15238, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15238, "pid": 5, "tid": 7, "ts": 1716454216849134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848496, "dur": 10, "args": { "External id": 15238, "cbid": 211, "correlation": 15238 } }, { "ph": "s", "id": 15238, "pid": 76337, "tid": -914061504, "ts": 1716454216848496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216849149, "dur": 38, "args": { "External id": 15249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15249, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15249, "pid": 5, "tid": 7, "ts": 1716454216849149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848564, "dur": 13, "args": { "External id": 15249, "cbid": 211, "correlation": 15249 } }, { "ph": "s", "id": 15249, "pid": 76337, "tid": -914061504, "ts": 1716454216848564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216848631, "dur": 0, "args": { "External id": 15261, "cbid": 317, "correlation": 15261 } }, { "ph": "f", "id": 15261, "pid": 76337, "tid": -914061504, "ts": 1716454216848631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216848632, "dur": 0, "args": { "External id": 15262, "cbid": 203, "correlation": 15262 } }, { "ph": "f", "id": 15262, "pid": 76337, "tid": -914061504, "ts": 1716454216848632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216848633, "dur": 0, "args": { "External id": 15263, "cbid": 205, "correlation": 15263 } }, { "ph": "f", "id": 15263, "pid": 76337, "tid": -914061504, "ts": 1716454216848633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216849188, "dur": 13, "args": { "External id": 15267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15267, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15267, "pid": 5, "tid": 7, "ts": 1716454216849188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848649, "dur": 12, "args": { "External id": 15267, "cbid": 211, "correlation": 15267 } }, { "ph": "s", "id": 15267, "pid": 76337, "tid": -914061504, "ts": 1716454216848649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216849202, "dur": 4, "args": { "External id": 15269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15269, "pid": 5, "tid": 7, "ts": 1716454216849202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848667, "dur": 6, "args": { "External id": 15269, "cbid": 211, "correlation": 15269 } }, { "ph": "s", "id": 15269, "pid": 76337, "tid": -914061504, "ts": 1716454216848667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216848676, "dur": 0, "args": { "External id": 15270, "cbid": 51, "correlation": 15270 } }, { "ph": "s", "id": 15270, "pid": 76337, "tid": -914061504, "ts": 1716454216848676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216849207, "dur": 98, "args": { "External id": 15271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15271, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 15271, "pid": 5, "tid": 7, "ts": 1716454216849207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848677, "dur": 5, "args": { "External id": 15271, "cbid": 211, "correlation": 15271 } }, { "ph": "s", "id": 15271, "pid": 76337, "tid": -914061504, "ts": 1716454216848677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216849306, "dur": 17, "args": { "External id": 15276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15276, "pid": 5, "tid": 7, "ts": 1716454216849306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848705, "dur": 9, "args": { "External id": 15276, "cbid": 211, "correlation": 15276 } }, { "ph": "s", "id": 15276, "pid": 76337, "tid": -914061504, "ts": 1716454216848705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216849325, "dur": 13, "args": { "External id": 15284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15284, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15284, "pid": 5, "tid": 7, "ts": 1716454216849325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848737, "dur": 9, "args": { "External id": 15284, "cbid": 211, "correlation": 15284 } }, { "ph": "s", "id": 15284, "pid": 76337, "tid": -914061504, "ts": 1716454216848737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454216849338, "dur": 57, "args": { "External id": 15295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15295, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15295, "pid": 5, "tid": 7, "ts": 1716454216849338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216848818, "dur": 13, "args": { "External id": 15295, "cbid": 211, "correlation": 15295 } }, { "ph": "s", "id": 15295, "pid": 76337, "tid": -914061504, "ts": 1716454216848818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216848874, "dur": 0, "args": { "External id": 15305, "cbid": 317, "correlation": 15305 } }, { "ph": "f", "id": 15305, "pid": 76337, "tid": -914061504, "ts": 1716454216848874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216848875, "dur": 0, "args": { "External id": 15306, "cbid": 203, "correlation": 15306 } }, { "ph": "f", "id": 15306, "pid": 76337, "tid": -914061504, "ts": 1716454216848875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216848876, "dur": 0, "args": { "External id": 15307, "cbid": 205, "correlation": 15307 } }, { "ph": "f", "id": 15307, "pid": 76337, "tid": -914061504, "ts": 1716454216848876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216851164, "dur": 39, "args": { "External id": 15311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15311, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15311, "pid": 5, "tid": 7, "ts": 1716454216851164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851140, "dur": 24, "args": { "External id": 15311, "cbid": 211, "correlation": 15311 } }, { "ph": "s", "id": 15311, "pid": 76337, "tid": -914061504, "ts": 1716454216851140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216851204, "dur": 166, "args": { "External id": 15313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15313, "pid": 5, "tid": 7, "ts": 1716454216851204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851167, "dur": 6, "args": { "External id": 15313, "cbid": 211, "correlation": 15313 } }, { "ph": "s", "id": 15313, "pid": 76337, "tid": -914061504, "ts": 1716454216851167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216851372, "dur": 1937, "args": { "External id": 15315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15315, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15315, "pid": 5, "tid": 7, "ts": 1716454216851372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851184, "dur": 10, "args": { "External id": 15315, "cbid": 211, "correlation": 15315 } }, { "ph": "s", "id": 15315, "pid": 76337, "tid": -914061504, "ts": 1716454216851184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216853310, "dur": 39, "args": { "External id": 15317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15317, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15317, "pid": 5, "tid": 7, "ts": 1716454216853310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851202, "dur": 7, "args": { "External id": 15317, "cbid": 211, "correlation": 15317 } }, { "ph": "s", "id": 15317, "pid": 76337, "tid": -914061504, "ts": 1716454216851202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216853351, "dur": 59, "args": { "External id": 15323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15323, "pid": 5, "tid": 7, "ts": 1716454216853351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851778, "dur": 11, "args": { "External id": 15323, "cbid": 211, "correlation": 15323 } }, { "ph": "s", "id": 15323, "pid": 76337, "tid": -914061504, "ts": 1716454216851778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216853411, "dur": 85, "args": { "External id": 15332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15332, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15332, "pid": 5, "tid": 7, "ts": 1716454216853411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216851929, "dur": 17, "args": { "External id": 15332, "cbid": 211, "correlation": 15332 } }, { "ph": "s", "id": 15332, "pid": 76337, "tid": -914061504, "ts": 1716454216851929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216853497, "dur": 71, "args": { "External id": 15352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15352, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 15352, "pid": 5, "tid": 7, "ts": 1716454216853497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216852040, "dur": 14, "args": { "External id": 15352, "cbid": 211, "correlation": 15352 } }, { "ph": "s", "id": 15352, "pid": 76337, "tid": -914061504, "ts": 1716454216852040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216853570, "dur": 4, "args": { "External id": 15364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15364, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 15364, "pid": 5, "tid": 7, "ts": 1716454216853570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216852078, "dur": 8, "args": { "External id": 15364, "cbid": 211, "correlation": 15364 } }, { "ph": "s", "id": 15364, "pid": 76337, "tid": -914061504, "ts": 1716454216852078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216853575, "dur": 80, "args": { "External id": 15367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15367, "pid": 5, "tid": 7, "ts": 1716454216853575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216852108, "dur": 8, "args": { "External id": 15367, "cbid": 211, "correlation": 15367 } }, { "ph": "s", "id": 15367, "pid": 76337, "tid": -914061504, "ts": 1716454216852108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216853657, "dur": 54, "args": { "External id": 15376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15376, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15376, "pid": 5, "tid": 7, "ts": 1716454216853657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216852165, "dur": 11, "args": { "External id": 15376, "cbid": 211, "correlation": 15376 } }, { "ph": "s", "id": 15376, "pid": 76337, "tid": -914061504, "ts": 1716454216852165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216852258, "dur": 0, "args": { "External id": 15386, "cbid": 317, "correlation": 15386 } }, { "ph": "f", "id": 15386, "pid": 76337, "tid": -914061504, "ts": 1716454216852258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216852259, "dur": 1, "args": { "External id": 15387, "cbid": 203, "correlation": 15387 } }, { "ph": "f", "id": 15387, "pid": 76337, "tid": -914061504, "ts": 1716454216852259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216852260, "dur": 0, "args": { "External id": 15388, "cbid": 205, "correlation": 15388 } }, { "ph": "f", "id": 15388, "pid": 76337, "tid": -914061504, "ts": 1716454216852260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216855232, "dur": 58, "args": { "External id": 15392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15392, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15392, "pid": 5, "tid": 7, "ts": 1716454216855232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216855208, "dur": 24, "args": { "External id": 15392, "cbid": 211, "correlation": 15392 } }, { "ph": "s", "id": 15392, "pid": 76337, "tid": -914061504, "ts": 1716454216855208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216855291, "dur": 124, "args": { "External id": 15394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15394, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15394, "pid": 5, "tid": 7, "ts": 1716454216855291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216855235, "dur": 6, "args": { "External id": 15394, "cbid": 211, "correlation": 15394 } }, { "ph": "s", "id": 15394, "pid": 76337, "tid": -914061504, "ts": 1716454216855235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216855416, "dur": 1903, "args": { "External id": 15396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15396, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15396, "pid": 5, "tid": 7, "ts": 1716454216855416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216855252, "dur": 9, "args": { "External id": 15396, "cbid": 211, "correlation": 15396 } }, { "ph": "s", "id": 15396, "pid": 76337, "tid": -914061504, "ts": 1716454216855252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216857321, "dur": 20, "args": { "External id": 15398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15398, "pid": 5, "tid": 7, "ts": 1716454216857321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216855267, "dur": 7, "args": { "External id": 15398, "cbid": 211, "correlation": 15398 } }, { "ph": "s", "id": 15398, "pid": 76337, "tid": -914061504, "ts": 1716454216855267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216857342, "dur": 33, "args": { "External id": 15404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15404, "pid": 5, "tid": 7, "ts": 1716454216857342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216855953, "dur": 11, "args": { "External id": 15404, "cbid": 211, "correlation": 15404 } }, { "ph": "s", "id": 15404, "pid": 76337, "tid": -914061504, "ts": 1716454216855953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216857376, "dur": 3, "args": { "External id": 15412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15412, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 15412, "pid": 5, "tid": 7, "ts": 1716454216857376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856031, "dur": 11, "args": { "External id": 15412, "cbid": 211, "correlation": 15412 } }, { "ph": "s", "id": 15412, "pid": 76337, "tid": -914061504, "ts": 1716454216856031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216856160, "dur": 4, "args": { "External id": 15428, "cbid": 251, "correlation": 15428 } }, { "ph": "f", "id": 15428, "pid": 76337, "tid": -914061504, "ts": 1716454216856160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216856170, "dur": 0, "args": { "External id": 15430, "cbid": 251, "correlation": 15430 } }, { "ph": "f", "id": 15430, "pid": 76337, "tid": -914061504, "ts": 1716454216856170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216857381, "dur": 12, "args": { "External id": 15431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15431, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 15431, "pid": 5, "tid": 7, "ts": 1716454216857381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856174, "dur": 15, "args": { "External id": 15431, "cbid": 211, "correlation": 15431 } }, { "ph": "s", "id": 15431, "pid": 76337, "tid": -914061504, "ts": 1716454216856174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216857394, "dur": 5, "args": { "External id": 15433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15433, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 15433, "pid": 5, "tid": 7, "ts": 1716454216857394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856193, "dur": 8, "args": { "External id": 15433, "cbid": 211, "correlation": 15433 } }, { "ph": "s", "id": 15433, "pid": 76337, "tid": -914061504, "ts": 1716454216856193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216857400, "dur": 29, "args": { "External id": 15443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15443, "pid": 5, "tid": 7, "ts": 1716454216857400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856281, "dur": 12, "args": { "External id": 15443, "cbid": 211, "correlation": 15443 } }, { "ph": "s", "id": 15443, "pid": 76337, "tid": -914061504, "ts": 1716454216856281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216857430, "dur": 30, "args": { "External id": 15463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15463, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 15463, "pid": 5, "tid": 7, "ts": 1716454216857430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856372, "dur": 12, "args": { "External id": 15463, "cbid": 211, "correlation": 15463 } }, { "ph": "s", "id": 15463, "pid": 76337, "tid": -914061504, "ts": 1716454216856372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216857462, "dur": 4, "args": { "External id": 15475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15475, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 15475, "pid": 5, "tid": 7, "ts": 1716454216857462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856394, "dur": 7, "args": { "External id": 15475, "cbid": 211, "correlation": 15475 } }, { "ph": "s", "id": 15475, "pid": 76337, "tid": -914061504, "ts": 1716454216856394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216857467, "dur": 30, "args": { "External id": 15478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15478, "pid": 5, "tid": 7, "ts": 1716454216857467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856421, "dur": 7, "args": { "External id": 15478, "cbid": 211, "correlation": 15478 } }, { "ph": "s", "id": 15478, "pid": 76337, "tid": -914061504, "ts": 1716454216856421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216857499, "dur": 21, "args": { "External id": 15487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15487, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15487, "pid": 5, "tid": 7, "ts": 1716454216857499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856465, "dur": 10, "args": { "External id": 15487, "cbid": 211, "correlation": 15487 } }, { "ph": "s", "id": 15487, "pid": 76337, "tid": -914061504, "ts": 1716454216856465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216856568, "dur": 1, "args": { "External id": 15497, "cbid": 317, "correlation": 15497 } }, { "ph": "f", "id": 15497, "pid": 76337, "tid": -914061504, "ts": 1716454216856568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216856569, "dur": 1, "args": { "External id": 15498, "cbid": 203, "correlation": 15498 } }, { "ph": "f", "id": 15498, "pid": 76337, "tid": -914061504, "ts": 1716454216856569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216856571, "dur": 0, "args": { "External id": 15499, "cbid": 205, "correlation": 15499 } }, { "ph": "f", "id": 15499, "pid": 76337, "tid": -914061504, "ts": 1716454216856571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216857521, "dur": 22, "args": { "External id": 15503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15503, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15503, "pid": 5, "tid": 7, "ts": 1716454216857521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856589, "dur": 13, "args": { "External id": 15503, "cbid": 211, "correlation": 15503 } }, { "ph": "s", "id": 15503, "pid": 76337, "tid": -914061504, "ts": 1716454216856589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216857544, "dur": 45, "args": { "External id": 15505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15505, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15505, "pid": 5, "tid": 7, "ts": 1716454216857544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856605, "dur": 5, "args": { "External id": 15505, "cbid": 211, "correlation": 15505 } }, { "ph": "s", "id": 15505, "pid": 76337, "tid": -914061504, "ts": 1716454216856605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216857591, "dur": 650, "args": { "External id": 15507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15507, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15507, "pid": 5, "tid": 7, "ts": 1716454216857591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856617, "dur": 7, "args": { "External id": 15507, "cbid": 211, "correlation": 15507 } }, { "ph": "s", "id": 15507, "pid": 76337, "tid": -914061504, "ts": 1716454216856617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216858242, "dur": 21, "args": { "External id": 15509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15509, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15509, "pid": 5, "tid": 7, "ts": 1716454216858242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856629, "dur": 5, "args": { "External id": 15509, "cbid": 211, "correlation": 15509 } }, { "ph": "s", "id": 15509, "pid": 76337, "tid": -914061504, "ts": 1716454216856629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216858264, "dur": 33, "args": { "External id": 15515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15515, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15515, "pid": 5, "tid": 7, "ts": 1716454216858264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216856657, "dur": 8, "args": { "External id": 15515, "cbid": 211, "correlation": 15515 } }, { "ph": "s", "id": 15515, "pid": 76337, "tid": -914061504, "ts": 1716454216856657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216856717, "dur": 0, "args": { "External id": 15525, "cbid": 317, "correlation": 15525 } }, { "ph": "f", "id": 15525, "pid": 76337, "tid": -914061504, "ts": 1716454216856717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216856718, "dur": 0, "args": { "External id": 15526, "cbid": 203, "correlation": 15526 } }, { "ph": "f", "id": 15526, "pid": 76337, "tid": -914061504, "ts": 1716454216856718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216856719, "dur": 0, "args": { "External id": 15527, "cbid": 205, "correlation": 15527 } }, { "ph": "f", "id": 15527, "pid": 76337, "tid": -914061504, "ts": 1716454216856719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216858438, "dur": 56, "args": { "External id": 15531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15531, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15531, "pid": 5, "tid": 7, "ts": 1716454216858438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216858420, "dur": 19, "args": { "External id": 15531, "cbid": 211, "correlation": 15531 } }, { "ph": "s", "id": 15531, "pid": 76337, "tid": -914061504, "ts": 1716454216858420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216858495, "dur": 273, "args": { "External id": 15533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15533, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15533, "pid": 5, "tid": 7, "ts": 1716454216858495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216858448, "dur": 11, "args": { "External id": 15533, "cbid": 211, "correlation": 15533 } }, { "ph": "s", "id": 15533, "pid": 76337, "tid": -914061504, "ts": 1716454216858448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216858770, "dur": 22, "args": { "External id": 15535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15535, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15535, "pid": 5, "tid": 7, "ts": 1716454216858770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216858463, "dur": 6, "args": { "External id": 15535, "cbid": 211, "correlation": 15535 } }, { "ph": "s", "id": 15535, "pid": 76337, "tid": -914061504, "ts": 1716454216858463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216858924, "dur": 32, "args": { "External id": 15541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15541, "pid": 5, "tid": 7, "ts": 1716454216858924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216858912, "dur": 10, "args": { "External id": 15541, "cbid": 211, "correlation": 15541 } }, { "ph": "s", "id": 15541, "pid": 76337, "tid": -914061504, "ts": 1716454216858912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216858963, "dur": 26, "args": { "External id": 15549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15549, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15549, "pid": 5, "tid": 7, "ts": 1716454216858963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216858951, "dur": 10, "args": { "External id": 15549, "cbid": 211, "correlation": 15549 } }, { "ph": "s", "id": 15549, "pid": 76337, "tid": -914061504, "ts": 1716454216858951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216859024, "dur": 20, "args": { "External id": 15557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15557, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15557, "pid": 5, "tid": 7, "ts": 1716454216859024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859012, "dur": 11, "args": { "External id": 15557, "cbid": 211, "correlation": 15557 } }, { "ph": "s", "id": 15557, "pid": 76337, "tid": -914061504, "ts": 1716454216859012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216859149, "dur": 30, "args": { "External id": 15577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15577, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 15577, "pid": 5, "tid": 7, "ts": 1716454216859149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859135, "dur": 12, "args": { "External id": 15577, "cbid": 211, "correlation": 15577 } }, { "ph": "s", "id": 15577, "pid": 76337, "tid": -914061504, "ts": 1716454216859135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216859180, "dur": 4, "args": { "External id": 15589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15589, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 15589, "pid": 5, "tid": 7, "ts": 1716454216859180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859159, "dur": 8, "args": { "External id": 15589, "cbid": 211, "correlation": 15589 } }, { "ph": "s", "id": 15589, "pid": 76337, "tid": -914061504, "ts": 1716454216859159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216859188, "dur": 30, "args": { "External id": 15592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15592, "pid": 5, "tid": 7, "ts": 1716454216859188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859180, "dur": 7, "args": { "External id": 15592, "cbid": 211, "correlation": 15592 } }, { "ph": "s", "id": 15592, "pid": 76337, "tid": -914061504, "ts": 1716454216859180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216859245, "dur": 0, "args": { "External id": 15603, "cbid": 317, "correlation": 15603 } }, { "ph": "f", "id": 15603, "pid": 76337, "tid": -914061504, "ts": 1716454216859245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216859246, "dur": 0, "args": { "External id": 15604, "cbid": 203, "correlation": 15604 } }, { "ph": "f", "id": 15604, "pid": 76337, "tid": -914061504, "ts": 1716454216859246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216859247, "dur": 0, "args": { "External id": 15605, "cbid": 205, "correlation": 15605 } }, { "ph": "f", "id": 15605, "pid": 76337, "tid": -914061504, "ts": 1716454216859247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216859275, "dur": 21, "args": { "External id": 15609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15609, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15609, "pid": 5, "tid": 7, "ts": 1716454216859275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859262, "dur": 12, "args": { "External id": 15609, "cbid": 211, "correlation": 15609 } }, { "ph": "s", "id": 15609, "pid": 76337, "tid": -914061504, "ts": 1716454216859262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216859297, "dur": 105, "args": { "External id": 15611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15611, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15611, "pid": 5, "tid": 7, "ts": 1716454216859297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859281, "dur": 7, "args": { "External id": 15611, "cbid": 211, "correlation": 15611 } }, { "ph": "s", "id": 15611, "pid": 76337, "tid": -914061504, "ts": 1716454216859281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216859404, "dur": 21, "args": { "External id": 15613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15613, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15613, "pid": 5, "tid": 7, "ts": 1716454216859404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859292, "dur": 5, "args": { "External id": 15613, "cbid": 211, "correlation": 15613 } }, { "ph": "s", "id": 15613, "pid": 76337, "tid": -914061504, "ts": 1716454216859292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216859426, "dur": 32, "args": { "External id": 15619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15619, "pid": 5, "tid": 7, "ts": 1716454216859426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859321, "dur": 8, "args": { "External id": 15619, "cbid": 211, "correlation": 15619 } }, { "ph": "s", "id": 15619, "pid": 76337, "tid": -914061504, "ts": 1716454216859321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216859501, "dur": 81, "args": { "External id": 15630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15630, "pid": 5, "tid": 7, "ts": 1716454216859501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859483, "dur": 19, "args": { "External id": 15630, "cbid": 211, "correlation": 15630 } }, { "ph": "s", "id": 15630, "pid": 76337, "tid": -914061504, "ts": 1716454216859483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216859584, "dur": 37, "args": { "External id": 15652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15652, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15652, "pid": 5, "tid": 7, "ts": 1716454216859584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859525, "dur": 10, "args": { "External id": 15652, "cbid": 211, "correlation": 15652 } }, { "ph": "s", "id": 15652, "pid": 76337, "tid": -914061504, "ts": 1716454216859525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216859682, "dur": 3, "args": { "External id": 15663, "cbid": 251, "correlation": 15663 } }, { "ph": "f", "id": 15663, "pid": 76337, "tid": -914061504, "ts": 1716454216859682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216859710, "dur": 96, "args": { "External id": 15664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15664, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15664, "pid": 5, "tid": 7, "ts": 1716454216859710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859693, "dur": 17, "args": { "External id": 15664, "cbid": 211, "correlation": 15664 } }, { "ph": "s", "id": 15664, "pid": 76337, "tid": -914061504, "ts": 1716454216859693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216859791, "dur": 1, "args": { "External id": 15675, "cbid": 251, "correlation": 15675 } }, { "ph": "f", "id": 15675, "pid": 76337, "tid": -914061504, "ts": 1716454216859791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216859811, "dur": 89, "args": { "External id": 15676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15676, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15676, "pid": 5, "tid": 7, "ts": 1716454216859811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859795, "dur": 13, "args": { "External id": 15676, "cbid": 211, "correlation": 15676 } }, { "ph": "s", "id": 15676, "pid": 76337, "tid": -914061504, "ts": 1716454216859795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216859866, "dur": 1, "args": { "External id": 15687, "cbid": 251, "correlation": 15687 } }, { "ph": "f", "id": 15687, "pid": 76337, "tid": -914061504, "ts": 1716454216859866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216859900, "dur": 76, "args": { "External id": 15688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15688, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15688, "pid": 5, "tid": 7, "ts": 1716454216859900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859870, "dur": 12, "args": { "External id": 15688, "cbid": 211, "correlation": 15688 } }, { "ph": "s", "id": 15688, "pid": 76337, "tid": -914061504, "ts": 1716454216859870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216860010, "dur": 551, "args": { "External id": 15709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15709, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 15709, "pid": 5, "tid": 7, "ts": 1716454216860010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216859993, "dur": 17, "args": { "External id": 15709, "cbid": 211, "correlation": 15709 } }, { "ph": "s", "id": 15709, "pid": 76337, "tid": -914061504, "ts": 1716454216859993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860151, "dur": 3, "args": { "External id": 15727, "cbid": 251, "correlation": 15727 } }, { "ph": "f", "id": 15727, "pid": 76337, "tid": -914061504, "ts": 1716454216860151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216860562, "dur": 98, "args": { "External id": 15729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15729, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15729, "pid": 5, "tid": 7, "ts": 1716454216860562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860159, "dur": 15, "args": { "External id": 15729, "cbid": 211, "correlation": 15729 } }, { "ph": "s", "id": 15729, "pid": 76337, "tid": -914061504, "ts": 1716454216860159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216860661, "dur": 19, "args": { "External id": 15737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15737, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15737, "pid": 5, "tid": 7, "ts": 1716454216860661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860244, "dur": 13, "args": { "External id": 15737, "cbid": 211, "correlation": 15737 } }, { "ph": "s", "id": 15737, "pid": 76337, "tid": -914061504, "ts": 1716454216860244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216860682, "dur": 87, "args": { "External id": 15745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15745, "pid": 5, "tid": 7, "ts": 1716454216860682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860287, "dur": 9, "args": { "External id": 15745, "cbid": 211, "correlation": 15745 } }, { "ph": "s", "id": 15745, "pid": 76337, "tid": -914061504, "ts": 1716454216860287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216860770, "dur": 35, "args": { "External id": 15767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15767, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15767, "pid": 5, "tid": 7, "ts": 1716454216860770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860353, "dur": 12, "args": { "External id": 15767, "cbid": 211, "correlation": 15767 } }, { "ph": "s", "id": 15767, "pid": 76337, "tid": -914061504, "ts": 1716454216860353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860459, "dur": 1, "args": { "External id": 15778, "cbid": 251, "correlation": 15778 } }, { "ph": "f", "id": 15778, "pid": 76337, "tid": -914061504, "ts": 1716454216860459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216860806, "dur": 76, "args": { "External id": 15779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15779, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15779, "pid": 5, "tid": 7, "ts": 1716454216860806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860464, "dur": 14, "args": { "External id": 15779, "cbid": 211, "correlation": 15779 } }, { "ph": "s", "id": 15779, "pid": 76337, "tid": -914061504, "ts": 1716454216860464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860558, "dur": 1, "args": { "External id": 15790, "cbid": 251, "correlation": 15790 } }, { "ph": "f", "id": 15790, "pid": 76337, "tid": -914061504, "ts": 1716454216860558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860563, "dur": 0, "args": { "External id": 15791, "cbid": 251, "correlation": 15791 } }, { "ph": "f", "id": 15791, "pid": 76337, "tid": -914061504, "ts": 1716454216860563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216860883, "dur": 11, "args": { "External id": 15792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15792, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 15792, "pid": 5, "tid": 7, "ts": 1716454216860883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860565, "dur": 16, "args": { "External id": 15792, "cbid": 211, "correlation": 15792 } }, { "ph": "s", "id": 15792, "pid": 76337, "tid": -914061504, "ts": 1716454216860565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216860896, "dur": 5, "args": { "External id": 15794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15794, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 15794, "pid": 5, "tid": 7, "ts": 1716454216860896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860584, "dur": 9, "args": { "External id": 15794, "cbid": 211, "correlation": 15794 } }, { "ph": "s", "id": 15794, "pid": 76337, "tid": -914061504, "ts": 1716454216860584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860658, "dur": 1, "args": { "External id": 15805, "cbid": 251, "correlation": 15805 } }, { "ph": "f", "id": 15805, "pid": 76337, "tid": -914061504, "ts": 1716454216860658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860661, "dur": 0, "args": { "External id": 15806, "cbid": 251, "correlation": 15806 } }, { "ph": "f", "id": 15806, "pid": 76337, "tid": -914061504, "ts": 1716454216860661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216860902, "dur": 7, "args": { "External id": 15807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15807, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 15807, "pid": 5, "tid": 7, "ts": 1716454216860902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860663, "dur": 12, "args": { "External id": 15807, "cbid": 211, "correlation": 15807 } }, { "ph": "s", "id": 15807, "pid": 76337, "tid": -914061504, "ts": 1716454216860663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216860911, "dur": 3, "args": { "External id": 15809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15809, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 15809, "pid": 5, "tid": 7, "ts": 1716454216860911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860677, "dur": 7, "args": { "External id": 15809, "cbid": 211, "correlation": 15809 } }, { "ph": "s", "id": 15809, "pid": 76337, "tid": -914061504, "ts": 1716454216860677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216860915, "dur": 92, "args": { "External id": 15830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15830, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 15830, "pid": 5, "tid": 7, "ts": 1716454216860915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860754, "dur": 13, "args": { "External id": 15830, "cbid": 211, "correlation": 15830 } }, { "ph": "s", "id": 15830, "pid": 76337, "tid": -914061504, "ts": 1716454216860754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216860864, "dur": 1, "args": { "External id": 15848, "cbid": 251, "correlation": 15848 } }, { "ph": "f", "id": 15848, "pid": 76337, "tid": -914061504, "ts": 1716454216860864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216861009, "dur": 80, "args": { "External id": 15850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15850, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15850, "pid": 5, "tid": 7, "ts": 1716454216861009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860870, "dur": 14, "args": { "External id": 15850, "cbid": 211, "correlation": 15850 } }, { "ph": "s", "id": 15850, "pid": 76337, "tid": -914061504, "ts": 1716454216860870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216861091, "dur": 19, "args": { "External id": 15858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15858, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15858, "pid": 5, "tid": 7, "ts": 1716454216861091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860944, "dur": 13, "args": { "External id": 15858, "cbid": 211, "correlation": 15858 } }, { "ph": "s", "id": 15858, "pid": 76337, "tid": -914061504, "ts": 1716454216860944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216861111, "dur": 28, "args": { "External id": 15866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15866, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15866, "pid": 5, "tid": 7, "ts": 1716454216861111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216860994, "dur": 10, "args": { "External id": 15866, "cbid": 211, "correlation": 15866 } }, { "ph": "s", "id": 15866, "pid": 76337, "tid": -914061504, "ts": 1716454216860994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216861140, "dur": 35, "args": { "External id": 15888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15888, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15888, "pid": 5, "tid": 7, "ts": 1716454216861140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861046, "dur": 11, "args": { "External id": 15888, "cbid": 211, "correlation": 15888 } }, { "ph": "s", "id": 15888, "pid": 76337, "tid": -914061504, "ts": 1716454216861046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216861143, "dur": 2, "args": { "External id": 15904, "cbid": 251, "correlation": 15904 } }, { "ph": "f", "id": 15904, "pid": 76337, "tid": -914061504, "ts": 1716454216861143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216861149, "dur": 0, "args": { "External id": 15906, "cbid": 251, "correlation": 15906 } }, { "ph": "f", "id": 15906, "pid": 76337, "tid": -914061504, "ts": 1716454216861149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216861177, "dur": 547, "args": { "External id": 15907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15907, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 15907, "pid": 5, "tid": 7, "ts": 1716454216861177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861155, "dur": 13, "args": { "External id": 15907, "cbid": 211, "correlation": 15907 } }, { "ph": "s", "id": 15907, "pid": 76337, "tid": -914061504, "ts": 1716454216861155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216861725, "dur": 126, "args": { "External id": 15915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15915, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15915, "pid": 5, "tid": 7, "ts": 1716454216861725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861248, "dur": 14, "args": { "External id": 15915, "cbid": 211, "correlation": 15915 } }, { "ph": "s", "id": 15915, "pid": 76337, "tid": -914061504, "ts": 1716454216861248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216861852, "dur": 129, "args": { "External id": 15923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15923, "pid": 5, "tid": 7, "ts": 1716454216861852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861288, "dur": 12, "args": { "External id": 15923, "cbid": 211, "correlation": 15923 } }, { "ph": "s", "id": 15923, "pid": 76337, "tid": -914061504, "ts": 1716454216861288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216861373, "dur": 1, "args": { "External id": 15939, "cbid": 251, "correlation": 15939 } }, { "ph": "f", "id": 15939, "pid": 76337, "tid": -914061504, "ts": 1716454216861373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216861982, "dur": 307, "args": { "External id": 15941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15941, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15941, "pid": 5, "tid": 7, "ts": 1716454216861982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861378, "dur": 13, "args": { "External id": 15941, "cbid": 211, "correlation": 15941 } }, { "ph": "s", "id": 15941, "pid": 76337, "tid": -914061504, "ts": 1716454216861378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216862291, "dur": 27, "args": { "External id": 15949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15949, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15949, "pid": 5, "tid": 7, "ts": 1716454216862291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861420, "dur": 10, "args": { "External id": 15949, "cbid": 211, "correlation": 15949 } }, { "ph": "s", "id": 15949, "pid": 76337, "tid": -914061504, "ts": 1716454216861420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216862319, "dur": 82, "args": { "External id": 15960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15960, "pid": 5, "tid": 7, "ts": 1716454216862319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861488, "dur": 14, "args": { "External id": 15960, "cbid": 211, "correlation": 15960 } }, { "ph": "s", "id": 15960, "pid": 76337, "tid": -914061504, "ts": 1716454216861488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216861557, "dur": 1, "args": { "External id": 15972, "cbid": 317, "correlation": 15972 } }, { "ph": "f", "id": 15972, "pid": 76337, "tid": -914061504, "ts": 1716454216861557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216861558, "dur": 0, "args": { "External id": 15973, "cbid": 203, "correlation": 15973 } }, { "ph": "f", "id": 15973, "pid": 76337, "tid": -914061504, "ts": 1716454216861558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216861559, "dur": 0, "args": { "External id": 15974, "cbid": 205, "correlation": 15974 } }, { "ph": "f", "id": 15974, "pid": 76337, "tid": -914061504, "ts": 1716454216861559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216862402, "dur": 23, "args": { "External id": 15978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15978, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15978, "pid": 5, "tid": 7, "ts": 1716454216862402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861575, "dur": 12, "args": { "External id": 15978, "cbid": 211, "correlation": 15978 } }, { "ph": "s", "id": 15978, "pid": 76337, "tid": -914061504, "ts": 1716454216861575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216862426, "dur": 122, "args": { "External id": 15980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15980, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 15980, "pid": 5, "tid": 7, "ts": 1716454216862426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861595, "dur": 7, "args": { "External id": 15980, "cbid": 211, "correlation": 15980 } }, { "ph": "s", "id": 15980, "pid": 76337, "tid": -914061504, "ts": 1716454216861595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216862549, "dur": 22, "args": { "External id": 15982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15982, "pid": 5, "tid": 7, "ts": 1716454216862549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861606, "dur": 5, "args": { "External id": 15982, "cbid": 211, "correlation": 15982 } }, { "ph": "s", "id": 15982, "pid": 76337, "tid": -914061504, "ts": 1716454216861606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216862572, "dur": 33, "args": { "External id": 15988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15988, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15988, "pid": 5, "tid": 7, "ts": 1716454216862572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861634, "dur": 9, "args": { "External id": 15988, "cbid": 211, "correlation": 15988 } }, { "ph": "s", "id": 15988, "pid": 76337, "tid": -914061504, "ts": 1716454216861634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216862607, "dur": 27, "args": { "External id": 15996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 15996, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 15996, "pid": 5, "tid": 7, "ts": 1716454216862607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861666, "dur": 8, "args": { "External id": 15996, "cbid": 211, "correlation": 15996 } }, { "ph": "s", "id": 15996, "pid": 76337, "tid": -914061504, "ts": 1716454216861666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216862634, "dur": 54, "args": { "External id": 16005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16005, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16005, "pid": 5, "tid": 7, "ts": 1716454216862634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861724, "dur": 12, "args": { "External id": 16005, "cbid": 211, "correlation": 16005 } }, { "ph": "s", "id": 16005, "pid": 76337, "tid": -914061504, "ts": 1716454216861724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216862690, "dur": 53, "args": { "External id": 16025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16025, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16025, "pid": 5, "tid": 7, "ts": 1716454216862690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861816, "dur": 13, "args": { "External id": 16025, "cbid": 211, "correlation": 16025 } }, { "ph": "s", "id": 16025, "pid": 76337, "tid": -914061504, "ts": 1716454216861816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216862744, "dur": 5, "args": { "External id": 16037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16037, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 16037, "pid": 5, "tid": 7, "ts": 1716454216862744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861840, "dur": 7, "args": { "External id": 16037, "cbid": 211, "correlation": 16037 } }, { "ph": "s", "id": 16037, "pid": 76337, "tid": -914061504, "ts": 1716454216861840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216862750, "dur": 58, "args": { "External id": 16040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16040, "pid": 5, "tid": 7, "ts": 1716454216862750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861860, "dur": 7, "args": { "External id": 16040, "cbid": 211, "correlation": 16040 } }, { "ph": "s", "id": 16040, "pid": 76337, "tid": -914061504, "ts": 1716454216861860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216862809, "dur": 37, "args": { "External id": 16049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16049, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16049, "pid": 5, "tid": 7, "ts": 1716454216862809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216861907, "dur": 10, "args": { "External id": 16049, "cbid": 211, "correlation": 16049 } }, { "ph": "s", "id": 16049, "pid": 76337, "tid": -914061504, "ts": 1716454216861907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216861972, "dur": 9, "args": { "External id": 16059, "cbid": 317, "correlation": 16059 } }, { "ph": "f", "id": 16059, "pid": 76337, "tid": -914061504, "ts": 1716454216861972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216861982, "dur": 0, "args": { "External id": 16060, "cbid": 203, "correlation": 16060 } }, { "ph": "f", "id": 16060, "pid": 76337, "tid": -914061504, "ts": 1716454216861982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216861983, "dur": 0, "args": { "External id": 16061, "cbid": 205, "correlation": 16061 } }, { "ph": "f", "id": 16061, "pid": 76337, "tid": -914061504, "ts": 1716454216861983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216865040, "dur": 39, "args": { "External id": 16065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16065, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16065, "pid": 5, "tid": 7, "ts": 1716454216865040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865012, "dur": 28, "args": { "External id": 16065, "cbid": 211, "correlation": 16065 } }, { "ph": "s", "id": 16065, "pid": 76337, "tid": -914061504, "ts": 1716454216865012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216865080, "dur": 85, "args": { "External id": 16067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16067, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16067, "pid": 5, "tid": 7, "ts": 1716454216865080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865043, "dur": 6, "args": { "External id": 16067, "cbid": 211, "correlation": 16067 } }, { "ph": "s", "id": 16067, "pid": 76337, "tid": -914061504, "ts": 1716454216865043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216865166, "dur": 1287, "args": { "External id": 16069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16069, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16069, "pid": 5, "tid": 7, "ts": 1716454216865166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865058, "dur": 11, "args": { "External id": 16069, "cbid": 211, "correlation": 16069 } }, { "ph": "s", "id": 16069, "pid": 76337, "tid": -914061504, "ts": 1716454216865058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216866454, "dur": 20, "args": { "External id": 16071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16071, "pid": 5, "tid": 7, "ts": 1716454216866454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865074, "dur": 8, "args": { "External id": 16071, "cbid": 211, "correlation": 16071 } }, { "ph": "s", "id": 16071, "pid": 76337, "tid": -914061504, "ts": 1716454216865074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216866476, "dur": 34, "args": { "External id": 16077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16077, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16077, "pid": 5, "tid": 7, "ts": 1716454216866476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865803, "dur": 11, "args": { "External id": 16077, "cbid": 211, "correlation": 16077 } }, { "ph": "s", "id": 16077, "pid": 76337, "tid": -914061504, "ts": 1716454216865803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216866511, "dur": 3, "args": { "External id": 16085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16085, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 16085, "pid": 5, "tid": 7, "ts": 1716454216866511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216865887, "dur": 11, "args": { "External id": 16085, "cbid": 211, "correlation": 16085 } }, { "ph": "s", "id": 16085, "pid": 76337, "tid": -914061504, "ts": 1716454216865887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216866026, "dur": 3, "args": { "External id": 16101, "cbid": 251, "correlation": 16101 } }, { "ph": "f", "id": 16101, "pid": 76337, "tid": -914061504, "ts": 1716454216866026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216866035, "dur": 0, "args": { "External id": 16103, "cbid": 251, "correlation": 16103 } }, { "ph": "f", "id": 16103, "pid": 76337, "tid": -914061504, "ts": 1716454216866035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216866516, "dur": 12, "args": { "External id": 16104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16104, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 16104, "pid": 5, "tid": 7, "ts": 1716454216866516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866038, "dur": 16, "args": { "External id": 16104, "cbid": 211, "correlation": 16104 } }, { "ph": "s", "id": 16104, "pid": 76337, "tid": -914061504, "ts": 1716454216866038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216866529, "dur": 5, "args": { "External id": 16106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16106, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 16106, "pid": 5, "tid": 7, "ts": 1716454216866529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866058, "dur": 8, "args": { "External id": 16106, "cbid": 211, "correlation": 16106 } }, { "ph": "s", "id": 16106, "pid": 76337, "tid": -914061504, "ts": 1716454216866058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216866535, "dur": 30, "args": { "External id": 16116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16116, "pid": 5, "tid": 7, "ts": 1716454216866535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866147, "dur": 12, "args": { "External id": 16116, "cbid": 211, "correlation": 16116 } }, { "ph": "s", "id": 16116, "pid": 76337, "tid": -914061504, "ts": 1716454216866147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216866567, "dur": 31, "args": { "External id": 16136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16136, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16136, "pid": 5, "tid": 7, "ts": 1716454216866567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866242, "dur": 12, "args": { "External id": 16136, "cbid": 211, "correlation": 16136 } }, { "ph": "s", "id": 16136, "pid": 76337, "tid": -914061504, "ts": 1716454216866242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216866599, "dur": 4, "args": { "External id": 16148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16148, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 16148, "pid": 5, "tid": 7, "ts": 1716454216866599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866265, "dur": 9, "args": { "External id": 16148, "cbid": 211, "correlation": 16148 } }, { "ph": "s", "id": 16148, "pid": 76337, "tid": -914061504, "ts": 1716454216866265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216866605, "dur": 30, "args": { "External id": 16151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16151, "pid": 5, "tid": 7, "ts": 1716454216866605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866295, "dur": 7, "args": { "External id": 16151, "cbid": 211, "correlation": 16151 } }, { "ph": "s", "id": 16151, "pid": 76337, "tid": -914061504, "ts": 1716454216866295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216866636, "dur": 21, "args": { "External id": 16160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16160, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16160, "pid": 5, "tid": 7, "ts": 1716454216866636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866340, "dur": 10, "args": { "External id": 16160, "cbid": 211, "correlation": 16160 } }, { "ph": "s", "id": 16160, "pid": 76337, "tid": -914061504, "ts": 1716454216866340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216866448, "dur": 0, "args": { "External id": 16170, "cbid": 317, "correlation": 16170 } }, { "ph": "f", "id": 16170, "pid": 76337, "tid": -914061504, "ts": 1716454216866448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216866449, "dur": 1, "args": { "External id": 16171, "cbid": 203, "correlation": 16171 } }, { "ph": "f", "id": 16171, "pid": 76337, "tid": -914061504, "ts": 1716454216866449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216866451, "dur": 0, "args": { "External id": 16172, "cbid": 205, "correlation": 16172 } }, { "ph": "f", "id": 16172, "pid": 76337, "tid": -914061504, "ts": 1716454216866451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216866658, "dur": 23, "args": { "External id": 16176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16176, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16176, "pid": 5, "tid": 7, "ts": 1716454216866658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866469, "dur": 16, "args": { "External id": 16176, "cbid": 211, "correlation": 16176 } }, { "ph": "s", "id": 16176, "pid": 76337, "tid": -914061504, "ts": 1716454216866469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216866682, "dur": 44, "args": { "External id": 16178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16178, "pid": 5, "tid": 7, "ts": 1716454216866682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866487, "dur": 5, "args": { "External id": 16178, "cbid": 211, "correlation": 16178 } }, { "ph": "s", "id": 16178, "pid": 76337, "tid": -914061504, "ts": 1716454216866487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216866728, "dur": 647, "args": { "External id": 16180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16180, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16180, "pid": 5, "tid": 7, "ts": 1716454216866728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866500, "dur": 7, "args": { "External id": 16180, "cbid": 211, "correlation": 16180 } }, { "ph": "s", "id": 16180, "pid": 76337, "tid": -914061504, "ts": 1716454216866500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216867376, "dur": 22, "args": { "External id": 16182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16182, "pid": 5, "tid": 7, "ts": 1716454216867376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866511, "dur": 5, "args": { "External id": 16182, "cbid": 211, "correlation": 16182 } }, { "ph": "s", "id": 16182, "pid": 76337, "tid": -914061504, "ts": 1716454216866511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216867399, "dur": 33, "args": { "External id": 16188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16188, "pid": 5, "tid": 7, "ts": 1716454216867399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216866543, "dur": 9, "args": { "External id": 16188, "cbid": 211, "correlation": 16188 } }, { "ph": "s", "id": 16188, "pid": 76337, "tid": -914061504, "ts": 1716454216866543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216866607, "dur": 0, "args": { "External id": 16198, "cbid": 317, "correlation": 16198 } }, { "ph": "f", "id": 16198, "pid": 76337, "tid": -914061504, "ts": 1716454216866607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216866608, "dur": 0, "args": { "External id": 16199, "cbid": 203, "correlation": 16199 } }, { "ph": "f", "id": 16199, "pid": 76337, "tid": -914061504, "ts": 1716454216866608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216866609, "dur": 0, "args": { "External id": 16200, "cbid": 205, "correlation": 16200 } }, { "ph": "f", "id": 16200, "pid": 76337, "tid": -914061504, "ts": 1716454216866609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216868364, "dur": 38, "args": { "External id": 16204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16204, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16204, "pid": 5, "tid": 7, "ts": 1716454216868364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868344, "dur": 21, "args": { "External id": 16204, "cbid": 211, "correlation": 16204 } }, { "ph": "s", "id": 16204, "pid": 76337, "tid": -914061504, "ts": 1716454216868344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216868404, "dur": 192, "args": { "External id": 16206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16206, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16206, "pid": 5, "tid": 7, "ts": 1716454216868404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868373, "dur": 10, "args": { "External id": 16206, "cbid": 211, "correlation": 16206 } }, { "ph": "s", "id": 16206, "pid": 76337, "tid": -914061504, "ts": 1716454216868373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216868598, "dur": 22, "args": { "External id": 16208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16208, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16208, "pid": 5, "tid": 7, "ts": 1716454216868598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868387, "dur": 6, "args": { "External id": 16208, "cbid": 211, "correlation": 16208 } }, { "ph": "s", "id": 16208, "pid": 76337, "tid": -914061504, "ts": 1716454216868387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216868859, "dur": 33, "args": { "External id": 16214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16214, "pid": 5, "tid": 7, "ts": 1716454216868859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868847, "dur": 11, "args": { "External id": 16214, "cbid": 211, "correlation": 16214 } }, { "ph": "s", "id": 16214, "pid": 76337, "tid": -914061504, "ts": 1716454216868847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216868901, "dur": 27, "args": { "External id": 16222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16222, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16222, "pid": 5, "tid": 7, "ts": 1716454216868901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868891, "dur": 9, "args": { "External id": 16222, "cbid": 211, "correlation": 16222 } }, { "ph": "s", "id": 16222, "pid": 76337, "tid": -914061504, "ts": 1716454216868891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216868954, "dur": 19, "args": { "External id": 16230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16230, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16230, "pid": 5, "tid": 7, "ts": 1716454216868954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216868944, "dur": 10, "args": { "External id": 16230, "cbid": 211, "correlation": 16230 } }, { "ph": "s", "id": 16230, "pid": 76337, "tid": -914061504, "ts": 1716454216868944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216869084, "dur": 30, "args": { "External id": 16250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16250, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16250, "pid": 5, "tid": 7, "ts": 1716454216869084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869070, "dur": 14, "args": { "External id": 16250, "cbid": 211, "correlation": 16250 } }, { "ph": "s", "id": 16250, "pid": 76337, "tid": -914061504, "ts": 1716454216869070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216869118, "dur": 4, "args": { "External id": 16262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16262, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 16262, "pid": 5, "tid": 7, "ts": 1716454216869118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869107, "dur": 9, "args": { "External id": 16262, "cbid": 211, "correlation": 16262 } }, { "ph": "s", "id": 16262, "pid": 76337, "tid": -914061504, "ts": 1716454216869107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216869140, "dur": 30, "args": { "External id": 16265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16265, "pid": 5, "tid": 7, "ts": 1716454216869140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869132, "dur": 7, "args": { "External id": 16265, "cbid": 211, "correlation": 16265 } }, { "ph": "s", "id": 16265, "pid": 76337, "tid": -914061504, "ts": 1716454216869132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216869204, "dur": 0, "args": { "External id": 16276, "cbid": 317, "correlation": 16276 } }, { "ph": "f", "id": 16276, "pid": 76337, "tid": -914061504, "ts": 1716454216869204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216869205, "dur": 0, "args": { "External id": 16277, "cbid": 203, "correlation": 16277 } }, { "ph": "f", "id": 16277, "pid": 76337, "tid": -914061504, "ts": 1716454216869205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216869206, "dur": 0, "args": { "External id": 16278, "cbid": 205, "correlation": 16278 } }, { "ph": "f", "id": 16278, "pid": 76337, "tid": -914061504, "ts": 1716454216869206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216869235, "dur": 22, "args": { "External id": 16282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16282, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16282, "pid": 5, "tid": 7, "ts": 1716454216869235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869222, "dur": 12, "args": { "External id": 16282, "cbid": 211, "correlation": 16282 } }, { "ph": "s", "id": 16282, "pid": 76337, "tid": -914061504, "ts": 1716454216869222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216869258, "dur": 105, "args": { "External id": 16284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16284, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16284, "pid": 5, "tid": 7, "ts": 1716454216869258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869240, "dur": 7, "args": { "External id": 16284, "cbid": 211, "correlation": 16284 } }, { "ph": "s", "id": 16284, "pid": 76337, "tid": -914061504, "ts": 1716454216869240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216869365, "dur": 22, "args": { "External id": 16286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16286, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16286, "pid": 5, "tid": 7, "ts": 1716454216869365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869251, "dur": 5, "args": { "External id": 16286, "cbid": 211, "correlation": 16286 } }, { "ph": "s", "id": 16286, "pid": 76337, "tid": -914061504, "ts": 1716454216869251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216869388, "dur": 32, "args": { "External id": 16292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16292, "pid": 5, "tid": 7, "ts": 1716454216869388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869280, "dur": 9, "args": { "External id": 16292, "cbid": 211, "correlation": 16292 } }, { "ph": "s", "id": 16292, "pid": 76337, "tid": -914061504, "ts": 1716454216869280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216869441, "dur": 82, "args": { "External id": 16303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16303, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16303, "pid": 5, "tid": 7, "ts": 1716454216869441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869423, "dur": 18, "args": { "External id": 16303, "cbid": 211, "correlation": 16303 } }, { "ph": "s", "id": 16303, "pid": 76337, "tid": -914061504, "ts": 1716454216869423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216869523, "dur": 37, "args": { "External id": 16325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16325, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16325, "pid": 5, "tid": 7, "ts": 1716454216869523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869464, "dur": 10, "args": { "External id": 16325, "cbid": 211, "correlation": 16325 } }, { "ph": "s", "id": 16325, "pid": 76337, "tid": -914061504, "ts": 1716454216869464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216869621, "dur": 3, "args": { "External id": 16336, "cbid": 251, "correlation": 16336 } }, { "ph": "f", "id": 16336, "pid": 76337, "tid": -914061504, "ts": 1716454216869621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216869648, "dur": 95, "args": { "External id": 16337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16337, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16337, "pid": 5, "tid": 7, "ts": 1716454216869648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869631, "dur": 17, "args": { "External id": 16337, "cbid": 211, "correlation": 16337 } }, { "ph": "s", "id": 16337, "pid": 76337, "tid": -914061504, "ts": 1716454216869631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216869711, "dur": 1, "args": { "External id": 16348, "cbid": 251, "correlation": 16348 } }, { "ph": "f", "id": 16348, "pid": 76337, "tid": -914061504, "ts": 1716454216869711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216869745, "dur": 89, "args": { "External id": 16349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16349, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16349, "pid": 5, "tid": 7, "ts": 1716454216869745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869715, "dur": 12, "args": { "External id": 16349, "cbid": 211, "correlation": 16349 } }, { "ph": "s", "id": 16349, "pid": 76337, "tid": -914061504, "ts": 1716454216869715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216869780, "dur": 1, "args": { "External id": 16360, "cbid": 251, "correlation": 16360 } }, { "ph": "f", "id": 16360, "pid": 76337, "tid": -914061504, "ts": 1716454216869780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216869835, "dur": 91, "args": { "External id": 16361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16361, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16361, "pid": 5, "tid": 7, "ts": 1716454216869835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869785, "dur": 12, "args": { "External id": 16361, "cbid": 211, "correlation": 16361 } }, { "ph": "s", "id": 16361, "pid": 76337, "tid": -914061504, "ts": 1716454216869785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216869927, "dur": 557, "args": { "External id": 16382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16382, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 16382, "pid": 5, "tid": 7, "ts": 1716454216869927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216869896, "dur": 16, "args": { "External id": 16382, "cbid": 211, "correlation": 16382 } }, { "ph": "s", "id": 16382, "pid": 76337, "tid": -914061504, "ts": 1716454216869896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870047, "dur": 2, "args": { "External id": 16400, "cbid": 251, "correlation": 16400 } }, { "ph": "f", "id": 16400, "pid": 76337, "tid": -914061504, "ts": 1716454216870047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216870485, "dur": 99, "args": { "External id": 16402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16402, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16402, "pid": 5, "tid": 7, "ts": 1716454216870485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870054, "dur": 15, "args": { "External id": 16402, "cbid": 211, "correlation": 16402 } }, { "ph": "s", "id": 16402, "pid": 76337, "tid": -914061504, "ts": 1716454216870054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216870586, "dur": 20, "args": { "External id": 16410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16410, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16410, "pid": 5, "tid": 7, "ts": 1716454216870586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870139, "dur": 13, "args": { "External id": 16410, "cbid": 211, "correlation": 16410 } }, { "ph": "s", "id": 16410, "pid": 76337, "tid": -914061504, "ts": 1716454216870139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216870607, "dur": 86, "args": { "External id": 16418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16418, "pid": 5, "tid": 7, "ts": 1716454216870607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870182, "dur": 10, "args": { "External id": 16418, "cbid": 211, "correlation": 16418 } }, { "ph": "s", "id": 16418, "pid": 76337, "tid": -914061504, "ts": 1716454216870182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216870694, "dur": 35, "args": { "External id": 16440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16440, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16440, "pid": 5, "tid": 7, "ts": 1716454216870694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870237, "dur": 10, "args": { "External id": 16440, "cbid": 211, "correlation": 16440 } }, { "ph": "s", "id": 16440, "pid": 76337, "tid": -914061504, "ts": 1716454216870237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870339, "dur": 1, "args": { "External id": 16451, "cbid": 251, "correlation": 16451 } }, { "ph": "f", "id": 16451, "pid": 76337, "tid": -914061504, "ts": 1716454216870339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216870730, "dur": 90, "args": { "External id": 16452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16452, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16452, "pid": 5, "tid": 7, "ts": 1716454216870730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870344, "dur": 14, "args": { "External id": 16452, "cbid": 211, "correlation": 16452 } }, { "ph": "s", "id": 16452, "pid": 76337, "tid": -914061504, "ts": 1716454216870344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870446, "dur": 2, "args": { "External id": 16463, "cbid": 251, "correlation": 16463 } }, { "ph": "f", "id": 16463, "pid": 76337, "tid": -914061504, "ts": 1716454216870446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870451, "dur": 0, "args": { "External id": 16464, "cbid": 251, "correlation": 16464 } }, { "ph": "f", "id": 16464, "pid": 76337, "tid": -914061504, "ts": 1716454216870451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216870822, "dur": 11, "args": { "External id": 16465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16465, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 16465, "pid": 5, "tid": 7, "ts": 1716454216870822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870454, "dur": 16, "args": { "External id": 16465, "cbid": 211, "correlation": 16465 } }, { "ph": "s", "id": 16465, "pid": 76337, "tid": -914061504, "ts": 1716454216870454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216870834, "dur": 5, "args": { "External id": 16467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16467, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 16467, "pid": 5, "tid": 7, "ts": 1716454216870834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870472, "dur": 9, "args": { "External id": 16467, "cbid": 211, "correlation": 16467 } }, { "ph": "s", "id": 16467, "pid": 76337, "tid": -914061504, "ts": 1716454216870472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870535, "dur": 1, "args": { "External id": 16478, "cbid": 251, "correlation": 16478 } }, { "ph": "f", "id": 16478, "pid": 76337, "tid": -914061504, "ts": 1716454216870535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870539, "dur": 0, "args": { "External id": 16479, "cbid": 251, "correlation": 16479 } }, { "ph": "f", "id": 16479, "pid": 76337, "tid": -914061504, "ts": 1716454216870539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216870841, "dur": 7, "args": { "External id": 16480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16480, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 16480, "pid": 5, "tid": 7, "ts": 1716454216870841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870540, "dur": 12, "args": { "External id": 16480, "cbid": 211, "correlation": 16480 } }, { "ph": "s", "id": 16480, "pid": 76337, "tid": -914061504, "ts": 1716454216870540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216870849, "dur": 3, "args": { "External id": 16482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16482, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 16482, "pid": 5, "tid": 7, "ts": 1716454216870849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870554, "dur": 5, "args": { "External id": 16482, "cbid": 211, "correlation": 16482 } }, { "ph": "s", "id": 16482, "pid": 76337, "tid": -914061504, "ts": 1716454216870554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216870853, "dur": 93, "args": { "External id": 16503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16503, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 16503, "pid": 5, "tid": 7, "ts": 1716454216870853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870629, "dur": 13, "args": { "External id": 16503, "cbid": 211, "correlation": 16503 } }, { "ph": "s", "id": 16503, "pid": 76337, "tid": -914061504, "ts": 1716454216870629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216870743, "dur": 1, "args": { "External id": 16521, "cbid": 251, "correlation": 16521 } }, { "ph": "f", "id": 16521, "pid": 76337, "tid": -914061504, "ts": 1716454216870743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216870947, "dur": 80, "args": { "External id": 16523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16523, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16523, "pid": 5, "tid": 7, "ts": 1716454216870947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870749, "dur": 14, "args": { "External id": 16523, "cbid": 211, "correlation": 16523 } }, { "ph": "s", "id": 16523, "pid": 76337, "tid": -914061504, "ts": 1716454216870749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216871029, "dur": 19, "args": { "External id": 16531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16531, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16531, "pid": 5, "tid": 7, "ts": 1716454216871029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870823, "dur": 14, "args": { "External id": 16531, "cbid": 211, "correlation": 16531 } }, { "ph": "s", "id": 16531, "pid": 76337, "tid": -914061504, "ts": 1716454216870823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216871049, "dur": 27, "args": { "External id": 16539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16539, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16539, "pid": 5, "tid": 7, "ts": 1716454216871049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870865, "dur": 9, "args": { "External id": 16539, "cbid": 211, "correlation": 16539 } }, { "ph": "s", "id": 16539, "pid": 76337, "tid": -914061504, "ts": 1716454216870865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216871078, "dur": 35, "args": { "External id": 16561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16561, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16561, "pid": 5, "tid": 7, "ts": 1716454216871078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216870930, "dur": 12, "args": { "External id": 16561, "cbid": 211, "correlation": 16561 } }, { "ph": "s", "id": 16561, "pid": 76337, "tid": -914061504, "ts": 1716454216870930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216871049, "dur": 2, "args": { "External id": 16577, "cbid": 251, "correlation": 16577 } }, { "ph": "f", "id": 16577, "pid": 76337, "tid": -914061504, "ts": 1716454216871049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216871055, "dur": 0, "args": { "External id": 16579, "cbid": 251, "correlation": 16579 } }, { "ph": "f", "id": 16579, "pid": 76337, "tid": -914061504, "ts": 1716454216871055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216871114, "dur": 548, "args": { "External id": 16580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16580, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 16580, "pid": 5, "tid": 7, "ts": 1716454216871114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871061, "dur": 15, "args": { "External id": 16580, "cbid": 211, "correlation": 16580 } }, { "ph": "s", "id": 16580, "pid": 76337, "tid": -914061504, "ts": 1716454216871061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216871664, "dur": 126, "args": { "External id": 16588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16588, "pid": 5, "tid": 7, "ts": 1716454216871664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871158, "dur": 15, "args": { "External id": 16588, "cbid": 211, "correlation": 16588 } }, { "ph": "s", "id": 16588, "pid": 76337, "tid": -914061504, "ts": 1716454216871158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216871791, "dur": 130, "args": { "External id": 16596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16596, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16596, "pid": 5, "tid": 7, "ts": 1716454216871791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871201, "dur": 12, "args": { "External id": 16596, "cbid": 211, "correlation": 16596 } }, { "ph": "s", "id": 16596, "pid": 76337, "tid": -914061504, "ts": 1716454216871201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216871286, "dur": 1, "args": { "External id": 16612, "cbid": 251, "correlation": 16612 } }, { "ph": "f", "id": 16612, "pid": 76337, "tid": -914061504, "ts": 1716454216871286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216871923, "dur": 306, "args": { "External id": 16614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16614, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16614, "pid": 5, "tid": 7, "ts": 1716454216871923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871292, "dur": 12, "args": { "External id": 16614, "cbid": 211, "correlation": 16614 } }, { "ph": "s", "id": 16614, "pid": 76337, "tid": -914061504, "ts": 1716454216871292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216872230, "dur": 26, "args": { "External id": 16622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16622, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16622, "pid": 5, "tid": 7, "ts": 1716454216872230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871333, "dur": 10, "args": { "External id": 16622, "cbid": 211, "correlation": 16622 } }, { "ph": "s", "id": 16622, "pid": 76337, "tid": -914061504, "ts": 1716454216871333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216872258, "dur": 82, "args": { "External id": 16633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16633, "pid": 5, "tid": 7, "ts": 1716454216872258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871405, "dur": 12, "args": { "External id": 16633, "cbid": 211, "correlation": 16633 } }, { "ph": "s", "id": 16633, "pid": 76337, "tid": -914061504, "ts": 1716454216871405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216871474, "dur": 0, "args": { "External id": 16645, "cbid": 317, "correlation": 16645 } }, { "ph": "f", "id": 16645, "pid": 76337, "tid": -914061504, "ts": 1716454216871474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216871475, "dur": 0, "args": { "External id": 16646, "cbid": 203, "correlation": 16646 } }, { "ph": "f", "id": 16646, "pid": 76337, "tid": -914061504, "ts": 1716454216871475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216871475, "dur": 0, "args": { "External id": 16647, "cbid": 205, "correlation": 16647 } }, { "ph": "f", "id": 16647, "pid": 76337, "tid": -914061504, "ts": 1716454216871475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216872342, "dur": 24, "args": { "External id": 16651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16651, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16651, "pid": 5, "tid": 7, "ts": 1716454216872342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871491, "dur": 13, "args": { "External id": 16651, "cbid": 211, "correlation": 16651 } }, { "ph": "s", "id": 16651, "pid": 76337, "tid": -914061504, "ts": 1716454216871491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216872367, "dur": 121, "args": { "External id": 16653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16653, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16653, "pid": 5, "tid": 7, "ts": 1716454216872367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871511, "dur": 7, "args": { "External id": 16653, "cbid": 211, "correlation": 16653 } }, { "ph": "s", "id": 16653, "pid": 76337, "tid": -914061504, "ts": 1716454216871511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216872489, "dur": 21, "args": { "External id": 16655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16655, "pid": 5, "tid": 7, "ts": 1716454216872489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871522, "dur": 6, "args": { "External id": 16655, "cbid": 211, "correlation": 16655 } }, { "ph": "s", "id": 16655, "pid": 76337, "tid": -914061504, "ts": 1716454216871522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216872512, "dur": 34, "args": { "External id": 16661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16661, "pid": 5, "tid": 7, "ts": 1716454216872512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871551, "dur": 9, "args": { "External id": 16661, "cbid": 211, "correlation": 16661 } }, { "ph": "s", "id": 16661, "pid": 76337, "tid": -914061504, "ts": 1716454216871551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216872547, "dur": 27, "args": { "External id": 16669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16669, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16669, "pid": 5, "tid": 7, "ts": 1716454216872547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871582, "dur": 8, "args": { "External id": 16669, "cbid": 211, "correlation": 16669 } }, { "ph": "s", "id": 16669, "pid": 76337, "tid": -914061504, "ts": 1716454216871582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216872575, "dur": 45, "args": { "External id": 16678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16678, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16678, "pid": 5, "tid": 7, "ts": 1716454216872575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871637, "dur": 13, "args": { "External id": 16678, "cbid": 211, "correlation": 16678 } }, { "ph": "s", "id": 16678, "pid": 76337, "tid": -914061504, "ts": 1716454216871637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216872622, "dur": 43, "args": { "External id": 16698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16698, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16698, "pid": 5, "tid": 7, "ts": 1716454216872622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871732, "dur": 13, "args": { "External id": 16698, "cbid": 211, "correlation": 16698 } }, { "ph": "s", "id": 16698, "pid": 76337, "tid": -914061504, "ts": 1716454216871732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216872666, "dur": 4, "args": { "External id": 16710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16710, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 16710, "pid": 5, "tid": 7, "ts": 1716454216872666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871762, "dur": 9, "args": { "External id": 16710, "cbid": 211, "correlation": 16710 } }, { "ph": "s", "id": 16710, "pid": 76337, "tid": -914061504, "ts": 1716454216871762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216872672, "dur": 45, "args": { "External id": 16713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16713, "pid": 5, "tid": 7, "ts": 1716454216872672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871785, "dur": 7, "args": { "External id": 16713, "cbid": 211, "correlation": 16713 } }, { "ph": "s", "id": 16713, "pid": 76337, "tid": -914061504, "ts": 1716454216871785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216872718, "dur": 28, "args": { "External id": 16722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16722, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16722, "pid": 5, "tid": 7, "ts": 1716454216872718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216871832, "dur": 10, "args": { "External id": 16722, "cbid": 211, "correlation": 16722 } }, { "ph": "s", "id": 16722, "pid": 76337, "tid": -914061504, "ts": 1716454216871832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216871900, "dur": 0, "args": { "External id": 16732, "cbid": 317, "correlation": 16732 } }, { "ph": "f", "id": 16732, "pid": 76337, "tid": -914061504, "ts": 1716454216871900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216871901, "dur": 0, "args": { "External id": 16733, "cbid": 203, "correlation": 16733 } }, { "ph": "f", "id": 16733, "pid": 76337, "tid": -914061504, "ts": 1716454216871901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216871901, "dur": 0, "args": { "External id": 16734, "cbid": 205, "correlation": 16734 } }, { "ph": "f", "id": 16734, "pid": 76337, "tid": -914061504, "ts": 1716454216871901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216874954, "dur": 30, "args": { "External id": 16738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16738, "pid": 5, "tid": 7, "ts": 1716454216874954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216874926, "dur": 29, "args": { "External id": 16738, "cbid": 211, "correlation": 16738 } }, { "ph": "s", "id": 16738, "pid": 76337, "tid": -914061504, "ts": 1716454216874926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216874986, "dur": 64, "args": { "External id": 16740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16740, "pid": 5, "tid": 7, "ts": 1716454216874986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216874958, "dur": 6, "args": { "External id": 16740, "cbid": 211, "correlation": 16740 } }, { "ph": "s", "id": 16740, "pid": 76337, "tid": -914061504, "ts": 1716454216874958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216875051, "dur": 970, "args": { "External id": 16742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16742, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16742, "pid": 5, "tid": 7, "ts": 1716454216875051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216874981, "dur": 13, "args": { "External id": 16742, "cbid": 211, "correlation": 16742 } }, { "ph": "s", "id": 16742, "pid": 76337, "tid": -914061504, "ts": 1716454216874981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216876022, "dur": 21, "args": { "External id": 16744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16744, "pid": 5, "tid": 7, "ts": 1716454216876022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216874998, "dur": 6, "args": { "External id": 16744, "cbid": 211, "correlation": 16744 } }, { "ph": "s", "id": 16744, "pid": 76337, "tid": -914061504, "ts": 1716454216874998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216876044, "dur": 34, "args": { "External id": 16750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16750, "pid": 5, "tid": 7, "ts": 1716454216876044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216875726, "dur": 11, "args": { "External id": 16750, "cbid": 211, "correlation": 16750 } }, { "ph": "s", "id": 16750, "pid": 76337, "tid": -914061504, "ts": 1716454216875726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216876079, "dur": 4, "args": { "External id": 16758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 16758, "pid": 5, "tid": 7, "ts": 1716454216876079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216875808, "dur": 10, "args": { "External id": 16758, "cbid": 211, "correlation": 16758 } }, { "ph": "s", "id": 16758, "pid": 76337, "tid": -914061504, "ts": 1716454216875808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216875936, "dur": 3, "args": { "External id": 16774, "cbid": 251, "correlation": 16774 } }, { "ph": "f", "id": 16774, "pid": 76337, "tid": -914061504, "ts": 1716454216875936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216875944, "dur": 0, "args": { "External id": 16776, "cbid": 251, "correlation": 16776 } }, { "ph": "f", "id": 16776, "pid": 76337, "tid": -914061504, "ts": 1716454216875944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216876084, "dur": 12, "args": { "External id": 16777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16777, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 16777, "pid": 5, "tid": 7, "ts": 1716454216876084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216875947, "dur": 16, "args": { "External id": 16777, "cbid": 211, "correlation": 16777 } }, { "ph": "s", "id": 16777, "pid": 76337, "tid": -914061504, "ts": 1716454216875947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216876098, "dur": 5, "args": { "External id": 16779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16779, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 16779, "pid": 5, "tid": 7, "ts": 1716454216876098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216875967, "dur": 16, "args": { "External id": 16779, "cbid": 211, "correlation": 16779 } }, { "ph": "s", "id": 16779, "pid": 76337, "tid": -914061504, "ts": 1716454216875967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216876104, "dur": 30, "args": { "External id": 16789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16789, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16789, "pid": 5, "tid": 7, "ts": 1716454216876104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876063, "dur": 13, "args": { "External id": 16789, "cbid": 211, "correlation": 16789 } }, { "ph": "s", "id": 16789, "pid": 76337, "tid": -914061504, "ts": 1716454216876063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216876169, "dur": 31, "args": { "External id": 16809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16809, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16809, "pid": 5, "tid": 7, "ts": 1716454216876169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876156, "dur": 12, "args": { "External id": 16809, "cbid": 211, "correlation": 16809 } }, { "ph": "s", "id": 16809, "pid": 76337, "tid": -914061504, "ts": 1716454216876156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216876202, "dur": 4, "args": { "External id": 16821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16821, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 16821, "pid": 5, "tid": 7, "ts": 1716454216876202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876179, "dur": 7, "args": { "External id": 16821, "cbid": 211, "correlation": 16821 } }, { "ph": "s", "id": 16821, "pid": 76337, "tid": -914061504, "ts": 1716454216876179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216876216, "dur": 30, "args": { "External id": 16824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16824, "pid": 5, "tid": 7, "ts": 1716454216876216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876208, "dur": 7, "args": { "External id": 16824, "cbid": 211, "correlation": 16824 } }, { "ph": "s", "id": 16824, "pid": 76337, "tid": -914061504, "ts": 1716454216876208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216876262, "dur": 21, "args": { "External id": 16833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16833, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16833, "pid": 5, "tid": 7, "ts": 1716454216876262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876251, "dur": 10, "args": { "External id": 16833, "cbid": 211, "correlation": 16833 } }, { "ph": "s", "id": 16833, "pid": 76337, "tid": -914061504, "ts": 1716454216876251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216876354, "dur": 0, "args": { "External id": 16843, "cbid": 317, "correlation": 16843 } }, { "ph": "f", "id": 16843, "pid": 76337, "tid": -914061504, "ts": 1716454216876354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216876355, "dur": 1, "args": { "External id": 16844, "cbid": 203, "correlation": 16844 } }, { "ph": "f", "id": 16844, "pid": 76337, "tid": -914061504, "ts": 1716454216876355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216876356, "dur": 0, "args": { "External id": 16845, "cbid": 205, "correlation": 16845 } }, { "ph": "f", "id": 16845, "pid": 76337, "tid": -914061504, "ts": 1716454216876356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216876389, "dur": 22, "args": { "External id": 16849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16849, "pid": 5, "tid": 7, "ts": 1716454216876389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876375, "dur": 13, "args": { "External id": 16849, "cbid": 211, "correlation": 16849 } }, { "ph": "s", "id": 16849, "pid": 76337, "tid": -914061504, "ts": 1716454216876375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216876412, "dur": 44, "args": { "External id": 16851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16851, "pid": 5, "tid": 7, "ts": 1716454216876412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876391, "dur": 6, "args": { "External id": 16851, "cbid": 211, "correlation": 16851 } }, { "ph": "s", "id": 16851, "pid": 76337, "tid": -914061504, "ts": 1716454216876391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216876457, "dur": 647, "args": { "External id": 16853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16853, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16853, "pid": 5, "tid": 7, "ts": 1716454216876457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876405, "dur": 6, "args": { "External id": 16853, "cbid": 211, "correlation": 16853 } }, { "ph": "s", "id": 16853, "pid": 76337, "tid": -914061504, "ts": 1716454216876405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216877106, "dur": 21, "args": { "External id": 16855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16855, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16855, "pid": 5, "tid": 7, "ts": 1716454216877106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876415, "dur": 6, "args": { "External id": 16855, "cbid": 211, "correlation": 16855 } }, { "ph": "s", "id": 16855, "pid": 76337, "tid": -914061504, "ts": 1716454216876415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216877128, "dur": 33, "args": { "External id": 16861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16861, "pid": 5, "tid": 7, "ts": 1716454216877128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216876444, "dur": 10, "args": { "External id": 16861, "cbid": 211, "correlation": 16861 } }, { "ph": "s", "id": 16861, "pid": 76337, "tid": -914061504, "ts": 1716454216876444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216876504, "dur": 0, "args": { "External id": 16871, "cbid": 317, "correlation": 16871 } }, { "ph": "f", "id": 16871, "pid": 76337, "tid": -914061504, "ts": 1716454216876504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216876505, "dur": 0, "args": { "External id": 16872, "cbid": 203, "correlation": 16872 } }, { "ph": "f", "id": 16872, "pid": 76337, "tid": -914061504, "ts": 1716454216876505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216876506, "dur": 0, "args": { "External id": 16873, "cbid": 205, "correlation": 16873 } }, { "ph": "f", "id": 16873, "pid": 76337, "tid": -914061504, "ts": 1716454216876506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216878243, "dur": 30, "args": { "External id": 16877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16877, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16877, "pid": 5, "tid": 7, "ts": 1716454216878243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878224, "dur": 19, "args": { "External id": 16877, "cbid": 211, "correlation": 16877 } }, { "ph": "s", "id": 16877, "pid": 76337, "tid": -914061504, "ts": 1716454216878224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216878274, "dur": 153, "args": { "External id": 16879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16879, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16879, "pid": 5, "tid": 7, "ts": 1716454216878274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878252, "dur": 10, "args": { "External id": 16879, "cbid": 211, "correlation": 16879 } }, { "ph": "s", "id": 16879, "pid": 76337, "tid": -914061504, "ts": 1716454216878252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216878428, "dur": 21, "args": { "External id": 16881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16881, "pid": 5, "tid": 7, "ts": 1716454216878428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878266, "dur": 6, "args": { "External id": 16881, "cbid": 211, "correlation": 16881 } }, { "ph": "s", "id": 16881, "pid": 76337, "tid": -914061504, "ts": 1716454216878266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216878729, "dur": 32, "args": { "External id": 16887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16887, "pid": 5, "tid": 7, "ts": 1716454216878729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878717, "dur": 10, "args": { "External id": 16887, "cbid": 211, "correlation": 16887 } }, { "ph": "s", "id": 16887, "pid": 76337, "tid": -914061504, "ts": 1716454216878717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216878767, "dur": 27, "args": { "External id": 16895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16895, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16895, "pid": 5, "tid": 7, "ts": 1716454216878767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878756, "dur": 10, "args": { "External id": 16895, "cbid": 211, "correlation": 16895 } }, { "ph": "s", "id": 16895, "pid": 76337, "tid": -914061504, "ts": 1716454216878756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216878818, "dur": 20, "args": { "External id": 16903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16903, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16903, "pid": 5, "tid": 7, "ts": 1716454216878818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878807, "dur": 10, "args": { "External id": 16903, "cbid": 211, "correlation": 16903 } }, { "ph": "s", "id": 16903, "pid": 76337, "tid": -914061504, "ts": 1716454216878807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216878937, "dur": 31, "args": { "External id": 16923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16923, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 16923, "pid": 5, "tid": 7, "ts": 1716454216878937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878923, "dur": 13, "args": { "External id": 16923, "cbid": 211, "correlation": 16923 } }, { "ph": "s", "id": 16923, "pid": 76337, "tid": -914061504, "ts": 1716454216878923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216878969, "dur": 4, "args": { "External id": 16935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16935, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 16935, "pid": 5, "tid": 7, "ts": 1716454216878969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878947, "dur": 7, "args": { "External id": 16935, "cbid": 211, "correlation": 16935 } }, { "ph": "s", "id": 16935, "pid": 76337, "tid": -914061504, "ts": 1716454216878947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216878984, "dur": 31, "args": { "External id": 16938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16938, "pid": 5, "tid": 7, "ts": 1716454216878984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216878968, "dur": 15, "args": { "External id": 16938, "cbid": 211, "correlation": 16938 } }, { "ph": "s", "id": 16938, "pid": 76337, "tid": -914061504, "ts": 1716454216878968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216879041, "dur": 0, "args": { "External id": 16949, "cbid": 317, "correlation": 16949 } }, { "ph": "f", "id": 16949, "pid": 76337, "tid": -914061504, "ts": 1716454216879041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216879042, "dur": 0, "args": { "External id": 16950, "cbid": 203, "correlation": 16950 } }, { "ph": "f", "id": 16950, "pid": 76337, "tid": -914061504, "ts": 1716454216879042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216879043, "dur": 0, "args": { "External id": 16951, "cbid": 205, "correlation": 16951 } }, { "ph": "f", "id": 16951, "pid": 76337, "tid": -914061504, "ts": 1716454216879043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216879072, "dur": 21, "args": { "External id": 16955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16955, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16955, "pid": 5, "tid": 7, "ts": 1716454216879072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879060, "dur": 12, "args": { "External id": 16955, "cbid": 211, "correlation": 16955 } }, { "ph": "s", "id": 16955, "pid": 76337, "tid": -914061504, "ts": 1716454216879060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216879094, "dur": 106, "args": { "External id": 16957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16957, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 16957, "pid": 5, "tid": 7, "ts": 1716454216879094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879078, "dur": 7, "args": { "External id": 16957, "cbid": 211, "correlation": 16957 } }, { "ph": "s", "id": 16957, "pid": 76337, "tid": -914061504, "ts": 1716454216879078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216879201, "dur": 22, "args": { "External id": 16959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16959, "pid": 5, "tid": 7, "ts": 1716454216879201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879089, "dur": 5, "args": { "External id": 16959, "cbid": 211, "correlation": 16959 } }, { "ph": "s", "id": 16959, "pid": 76337, "tid": -914061504, "ts": 1716454216879089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216879225, "dur": 32, "args": { "External id": 16965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16965, "pid": 5, "tid": 7, "ts": 1716454216879225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879117, "dur": 9, "args": { "External id": 16965, "cbid": 211, "correlation": 16965 } }, { "ph": "s", "id": 16965, "pid": 76337, "tid": -914061504, "ts": 1716454216879117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216879294, "dur": 81, "args": { "External id": 16976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16976, "pid": 5, "tid": 7, "ts": 1716454216879294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879276, "dur": 19, "args": { "External id": 16976, "cbid": 211, "correlation": 16976 } }, { "ph": "s", "id": 16976, "pid": 76337, "tid": -914061504, "ts": 1716454216879276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216879377, "dur": 36, "args": { "External id": 16998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 16998, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 16998, "pid": 5, "tid": 7, "ts": 1716454216879377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879317, "dur": 11, "args": { "External id": 16998, "cbid": 211, "correlation": 16998 } }, { "ph": "s", "id": 16998, "pid": 76337, "tid": -914061504, "ts": 1716454216879317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216879480, "dur": 2, "args": { "External id": 17009, "cbid": 251, "correlation": 17009 } }, { "ph": "f", "id": 17009, "pid": 76337, "tid": -914061504, "ts": 1716454216879480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216879506, "dur": 97, "args": { "External id": 17010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17010, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17010, "pid": 5, "tid": 7, "ts": 1716454216879506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879490, "dur": 16, "args": { "External id": 17010, "cbid": 211, "correlation": 17010 } }, { "ph": "s", "id": 17010, "pid": 76337, "tid": -914061504, "ts": 1716454216879490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216879584, "dur": 1, "args": { "External id": 17021, "cbid": 251, "correlation": 17021 } }, { "ph": "f", "id": 17021, "pid": 76337, "tid": -914061504, "ts": 1716454216879584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216879604, "dur": 90, "args": { "External id": 17022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17022, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17022, "pid": 5, "tid": 7, "ts": 1716454216879604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879588, "dur": 13, "args": { "External id": 17022, "cbid": 211, "correlation": 17022 } }, { "ph": "s", "id": 17022, "pid": 76337, "tid": -914061504, "ts": 1716454216879588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216879658, "dur": 1, "args": { "External id": 17033, "cbid": 251, "correlation": 17033 } }, { "ph": "f", "id": 17033, "pid": 76337, "tid": -914061504, "ts": 1716454216879658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216879696, "dur": 90, "args": { "External id": 17034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17034, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17034, "pid": 5, "tid": 7, "ts": 1716454216879696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879662, "dur": 12, "args": { "External id": 17034, "cbid": 211, "correlation": 17034 } }, { "ph": "s", "id": 17034, "pid": 76337, "tid": -914061504, "ts": 1716454216879662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216879789, "dur": 559, "args": { "External id": 17055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17055, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17055, "pid": 5, "tid": 7, "ts": 1716454216879789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879772, "dur": 17, "args": { "External id": 17055, "cbid": 211, "correlation": 17055 } }, { "ph": "s", "id": 17055, "pid": 76337, "tid": -914061504, "ts": 1716454216879772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216879915, "dur": 2, "args": { "External id": 17073, "cbid": 251, "correlation": 17073 } }, { "ph": "f", "id": 17073, "pid": 76337, "tid": -914061504, "ts": 1716454216879915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216880350, "dur": 99, "args": { "External id": 17075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17075, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17075, "pid": 5, "tid": 7, "ts": 1716454216880350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216879922, "dur": 15, "args": { "External id": 17075, "cbid": 211, "correlation": 17075 } }, { "ph": "s", "id": 17075, "pid": 76337, "tid": -914061504, "ts": 1716454216879922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216880450, "dur": 20, "args": { "External id": 17083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17083, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17083, "pid": 5, "tid": 7, "ts": 1716454216880450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880017, "dur": 13, "args": { "External id": 17083, "cbid": 211, "correlation": 17083 } }, { "ph": "s", "id": 17083, "pid": 76337, "tid": -914061504, "ts": 1716454216880017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216880471, "dur": 86, "args": { "External id": 17091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17091, "pid": 5, "tid": 7, "ts": 1716454216880471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880061, "dur": 10, "args": { "External id": 17091, "cbid": 211, "correlation": 17091 } }, { "ph": "s", "id": 17091, "pid": 76337, "tid": -914061504, "ts": 1716454216880061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216880559, "dur": 35, "args": { "External id": 17113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17113, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17113, "pid": 5, "tid": 7, "ts": 1716454216880559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880115, "dur": 10, "args": { "External id": 17113, "cbid": 211, "correlation": 17113 } }, { "ph": "s", "id": 17113, "pid": 76337, "tid": -914061504, "ts": 1716454216880115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880213, "dur": 1, "args": { "External id": 17124, "cbid": 251, "correlation": 17124 } }, { "ph": "f", "id": 17124, "pid": 76337, "tid": -914061504, "ts": 1716454216880213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216880595, "dur": 91, "args": { "External id": 17125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17125, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17125, "pid": 5, "tid": 7, "ts": 1716454216880595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880218, "dur": 14, "args": { "External id": 17125, "cbid": 211, "correlation": 17125 } }, { "ph": "s", "id": 17125, "pid": 76337, "tid": -914061504, "ts": 1716454216880218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880315, "dur": 1, "args": { "External id": 17136, "cbid": 251, "correlation": 17136 } }, { "ph": "f", "id": 17136, "pid": 76337, "tid": -914061504, "ts": 1716454216880315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880320, "dur": 0, "args": { "External id": 17137, "cbid": 251, "correlation": 17137 } }, { "ph": "f", "id": 17137, "pid": 76337, "tid": -914061504, "ts": 1716454216880320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216880687, "dur": 11, "args": { "External id": 17138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17138, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 17138, "pid": 5, "tid": 7, "ts": 1716454216880687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880322, "dur": 14, "args": { "External id": 17138, "cbid": 211, "correlation": 17138 } }, { "ph": "s", "id": 17138, "pid": 76337, "tid": -914061504, "ts": 1716454216880322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216880699, "dur": 5, "args": { "External id": 17140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17140, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 17140, "pid": 5, "tid": 7, "ts": 1716454216880699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880339, "dur": 8, "args": { "External id": 17140, "cbid": 211, "correlation": 17140 } }, { "ph": "s", "id": 17140, "pid": 76337, "tid": -914061504, "ts": 1716454216880339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880401, "dur": 1, "args": { "External id": 17151, "cbid": 251, "correlation": 17151 } }, { "ph": "f", "id": 17151, "pid": 76337, "tid": -914061504, "ts": 1716454216880401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880404, "dur": 0, "args": { "External id": 17152, "cbid": 251, "correlation": 17152 } }, { "ph": "f", "id": 17152, "pid": 76337, "tid": -914061504, "ts": 1716454216880404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216880706, "dur": 7, "args": { "External id": 17153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17153, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 17153, "pid": 5, "tid": 7, "ts": 1716454216880706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880406, "dur": 13, "args": { "External id": 17153, "cbid": 211, "correlation": 17153 } }, { "ph": "s", "id": 17153, "pid": 76337, "tid": -914061504, "ts": 1716454216880406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216880714, "dur": 3, "args": { "External id": 17155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17155, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 17155, "pid": 5, "tid": 7, "ts": 1716454216880714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880420, "dur": 5, "args": { "External id": 17155, "cbid": 211, "correlation": 17155 } }, { "ph": "s", "id": 17155, "pid": 76337, "tid": -914061504, "ts": 1716454216880420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216880719, "dur": 93, "args": { "External id": 17176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17176, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17176, "pid": 5, "tid": 7, "ts": 1716454216880719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880494, "dur": 12, "args": { "External id": 17176, "cbid": 211, "correlation": 17176 } }, { "ph": "s", "id": 17176, "pid": 76337, "tid": -914061504, "ts": 1716454216880494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880593, "dur": 1, "args": { "External id": 17194, "cbid": 251, "correlation": 17194 } }, { "ph": "f", "id": 17194, "pid": 76337, "tid": -914061504, "ts": 1716454216880593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216880813, "dur": 94, "args": { "External id": 17196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17196, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17196, "pid": 5, "tid": 7, "ts": 1716454216880813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880599, "dur": 13, "args": { "External id": 17196, "cbid": 211, "correlation": 17196 } }, { "ph": "s", "id": 17196, "pid": 76337, "tid": -914061504, "ts": 1716454216880599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216880909, "dur": 19, "args": { "External id": 17204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17204, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17204, "pid": 5, "tid": 7, "ts": 1716454216880909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880670, "dur": 13, "args": { "External id": 17204, "cbid": 211, "correlation": 17204 } }, { "ph": "s", "id": 17204, "pid": 76337, "tid": -914061504, "ts": 1716454216880670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216880929, "dur": 28, "args": { "External id": 17212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17212, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17212, "pid": 5, "tid": 7, "ts": 1716454216880929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880711, "dur": 8, "args": { "External id": 17212, "cbid": 211, "correlation": 17212 } }, { "ph": "s", "id": 17212, "pid": 76337, "tid": -914061504, "ts": 1716454216880711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216880958, "dur": 35, "args": { "External id": 17234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17234, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17234, "pid": 5, "tid": 7, "ts": 1716454216880958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880763, "dur": 10, "args": { "External id": 17234, "cbid": 211, "correlation": 17234 } }, { "ph": "s", "id": 17234, "pid": 76337, "tid": -914061504, "ts": 1716454216880763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880858, "dur": 2, "args": { "External id": 17250, "cbid": 251, "correlation": 17250 } }, { "ph": "f", "id": 17250, "pid": 76337, "tid": -914061504, "ts": 1716454216880858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216880864, "dur": 0, "args": { "External id": 17252, "cbid": 251, "correlation": 17252 } }, { "ph": "f", "id": 17252, "pid": 76337, "tid": -914061504, "ts": 1716454216880864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216880995, "dur": 548, "args": { "External id": 17253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17253, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 17253, "pid": 5, "tid": 7, "ts": 1716454216880995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880869, "dur": 14, "args": { "External id": 17253, "cbid": 211, "correlation": 17253 } }, { "ph": "s", "id": 17253, "pid": 76337, "tid": -914061504, "ts": 1716454216880869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216881545, "dur": 127, "args": { "External id": 17261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17261, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17261, "pid": 5, "tid": 7, "ts": 1716454216881545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216880964, "dur": 22, "args": { "External id": 17261, "cbid": 211, "correlation": 17261 } }, { "ph": "s", "id": 17261, "pid": 76337, "tid": -914061504, "ts": 1716454216880964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216881673, "dur": 130, "args": { "External id": 17269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17269, "pid": 5, "tid": 7, "ts": 1716454216881673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881014, "dur": 11, "args": { "External id": 17269, "cbid": 211, "correlation": 17269 } }, { "ph": "s", "id": 17269, "pid": 76337, "tid": -914061504, "ts": 1716454216881014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216881097, "dur": 1, "args": { "External id": 17285, "cbid": 251, "correlation": 17285 } }, { "ph": "f", "id": 17285, "pid": 76337, "tid": -914061504, "ts": 1716454216881097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216881804, "dur": 307, "args": { "External id": 17287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17287, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17287, "pid": 5, "tid": 7, "ts": 1716454216881804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881103, "dur": 12, "args": { "External id": 17287, "cbid": 211, "correlation": 17287 } }, { "ph": "s", "id": 17287, "pid": 76337, "tid": -914061504, "ts": 1716454216881103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216882113, "dur": 27, "args": { "External id": 17295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17295, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17295, "pid": 5, "tid": 7, "ts": 1716454216882113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881145, "dur": 11, "args": { "External id": 17295, "cbid": 211, "correlation": 17295 } }, { "ph": "s", "id": 17295, "pid": 76337, "tid": -914061504, "ts": 1716454216881145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216882141, "dur": 83, "args": { "External id": 17306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17306, "pid": 5, "tid": 7, "ts": 1716454216882141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881215, "dur": 12, "args": { "External id": 17306, "cbid": 211, "correlation": 17306 } }, { "ph": "s", "id": 17306, "pid": 76337, "tid": -914061504, "ts": 1716454216881215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216881281, "dur": 0, "args": { "External id": 17318, "cbid": 317, "correlation": 17318 } }, { "ph": "f", "id": 17318, "pid": 76337, "tid": -914061504, "ts": 1716454216881281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216881282, "dur": 0, "args": { "External id": 17319, "cbid": 203, "correlation": 17319 } }, { "ph": "f", "id": 17319, "pid": 76337, "tid": -914061504, "ts": 1716454216881282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216881283, "dur": 0, "args": { "External id": 17320, "cbid": 205, "correlation": 17320 } }, { "ph": "f", "id": 17320, "pid": 76337, "tid": -914061504, "ts": 1716454216881283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216882225, "dur": 23, "args": { "External id": 17324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17324, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17324, "pid": 5, "tid": 7, "ts": 1716454216882225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881299, "dur": 13, "args": { "External id": 17324, "cbid": 211, "correlation": 17324 } }, { "ph": "s", "id": 17324, "pid": 76337, "tid": -914061504, "ts": 1716454216881299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216882249, "dur": 121, "args": { "External id": 17326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17326, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17326, "pid": 5, "tid": 7, "ts": 1716454216882249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881318, "dur": 6, "args": { "External id": 17326, "cbid": 211, "correlation": 17326 } }, { "ph": "s", "id": 17326, "pid": 76337, "tid": -914061504, "ts": 1716454216881318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216882371, "dur": 22, "args": { "External id": 17328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17328, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17328, "pid": 5, "tid": 7, "ts": 1716454216882371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881329, "dur": 5, "args": { "External id": 17328, "cbid": 211, "correlation": 17328 } }, { "ph": "s", "id": 17328, "pid": 76337, "tid": -914061504, "ts": 1716454216881329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216882394, "dur": 33, "args": { "External id": 17334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17334, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17334, "pid": 5, "tid": 7, "ts": 1716454216882394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881357, "dur": 9, "args": { "External id": 17334, "cbid": 211, "correlation": 17334 } }, { "ph": "s", "id": 17334, "pid": 76337, "tid": -914061504, "ts": 1716454216881357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216882429, "dur": 27, "args": { "External id": 17342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17342, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17342, "pid": 5, "tid": 7, "ts": 1716454216882429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881389, "dur": 8, "args": { "External id": 17342, "cbid": 211, "correlation": 17342 } }, { "ph": "s", "id": 17342, "pid": 76337, "tid": -914061504, "ts": 1716454216881389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454216882457, "dur": 102, "args": { "External id": 17353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17353, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17353, "pid": 5, "tid": 7, "ts": 1716454216882457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216881469, "dur": 13, "args": { "External id": 17353, "cbid": 211, "correlation": 17353 } }, { "ph": "s", "id": 17353, "pid": 76337, "tid": -914061504, "ts": 1716454216881469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216881525, "dur": 0, "args": { "External id": 17363, "cbid": 317, "correlation": 17363 } }, { "ph": "f", "id": 17363, "pid": 76337, "tid": -914061504, "ts": 1716454216881525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216881526, "dur": 0, "args": { "External id": 17364, "cbid": 203, "correlation": 17364 } }, { "ph": "f", "id": 17364, "pid": 76337, "tid": -914061504, "ts": 1716454216881526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216881527, "dur": 0, "args": { "External id": 17365, "cbid": 205, "correlation": 17365 } }, { "ph": "f", "id": 17365, "pid": 76337, "tid": -914061504, "ts": 1716454216881527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216884392, "dur": 75, "args": { "External id": 17369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17369, "pid": 5, "tid": 7, "ts": 1716454216884392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216884366, "dur": 26, "args": { "External id": 17369, "cbid": 211, "correlation": 17369 } }, { "ph": "s", "id": 17369, "pid": 76337, "tid": -914061504, "ts": 1716454216884366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216884468, "dur": 45, "args": { "External id": 17371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17371, "pid": 5, "tid": 7, "ts": 1716454216884468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216884394, "dur": 6, "args": { "External id": 17371, "cbid": 211, "correlation": 17371 } }, { "ph": "s", "id": 17371, "pid": 76337, "tid": -914061504, "ts": 1716454216884394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216884514, "dur": 4, "args": { "External id": 17373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17373, "pid": 5, "tid": 7, "ts": 1716454216884514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216884407, "dur": 9, "args": { "External id": 17373, "cbid": 211, "correlation": 17373 } }, { "ph": "s", "id": 17373, "pid": 76337, "tid": -914061504, "ts": 1716454216884407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216884420, "dur": 0, "args": { "External id": 17374, "cbid": 51, "correlation": 17374 } }, { "ph": "s", "id": 17374, "pid": 76337, "tid": -914061504, "ts": 1716454216884420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216884519, "dur": 2220, "args": { "External id": 17375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17375, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17375, "pid": 5, "tid": 7, "ts": 1716454216884519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216884421, "dur": 7, "args": { "External id": 17375, "cbid": 211, "correlation": 17375 } }, { "ph": "s", "id": 17375, "pid": 76337, "tid": -914061504, "ts": 1716454216884421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216886740, "dur": 114, "args": { "External id": 17380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17380, "pid": 5, "tid": 7, "ts": 1716454216886740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885099, "dur": 12, "args": { "External id": 17380, "cbid": 211, "correlation": 17380 } }, { "ph": "s", "id": 17380, "pid": 76337, "tid": -914061504, "ts": 1716454216885099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216886856, "dur": 163, "args": { "External id": 17389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17389, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17389, "pid": 5, "tid": 7, "ts": 1716454216886856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885256, "dur": 17, "args": { "External id": 17389, "cbid": 211, "correlation": 17389 } }, { "ph": "s", "id": 17389, "pid": 76337, "tid": -914061504, "ts": 1716454216885256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216887020, "dur": 128, "args": { "External id": 17409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17409, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 17409, "pid": 5, "tid": 7, "ts": 1716454216887020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885362, "dur": 14, "args": { "External id": 17409, "cbid": 211, "correlation": 17409 } }, { "ph": "s", "id": 17409, "pid": 76337, "tid": -914061504, "ts": 1716454216885362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216887150, "dur": 5, "args": { "External id": 17421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17421, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 17421, "pid": 5, "tid": 7, "ts": 1716454216887150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885387, "dur": 8, "args": { "External id": 17421, "cbid": 211, "correlation": 17421 } }, { "ph": "s", "id": 17421, "pid": 76337, "tid": -914061504, "ts": 1716454216885387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216887156, "dur": 160, "args": { "External id": 17424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17424, "pid": 5, "tid": 7, "ts": 1716454216887156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885418, "dur": 8, "args": { "External id": 17424, "cbid": 211, "correlation": 17424 } }, { "ph": "s", "id": 17424, "pid": 76337, "tid": -914061504, "ts": 1716454216885418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216887317, "dur": 102, "args": { "External id": 17433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17433, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17433, "pid": 5, "tid": 7, "ts": 1716454216887317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216885475, "dur": 11, "args": { "External id": 17433, "cbid": 211, "correlation": 17433 } }, { "ph": "s", "id": 17433, "pid": 76337, "tid": -914061504, "ts": 1716454216885475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216885546, "dur": 0, "args": { "External id": 17443, "cbid": 317, "correlation": 17443 } }, { "ph": "f", "id": 17443, "pid": 76337, "tid": -914061504, "ts": 1716454216885546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216885547, "dur": 1, "args": { "External id": 17444, "cbid": 203, "correlation": 17444 } }, { "ph": "f", "id": 17444, "pid": 76337, "tid": -914061504, "ts": 1716454216885547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216885549, "dur": 0, "args": { "External id": 17445, "cbid": 205, "correlation": 17445 } }, { "ph": "f", "id": 17445, "pid": 76337, "tid": -914061504, "ts": 1716454216885549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216888696, "dur": 111, "args": { "External id": 17449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17449, "pid": 5, "tid": 7, "ts": 1716454216888696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216888672, "dur": 24, "args": { "External id": 17449, "cbid": 211, "correlation": 17449 } }, { "ph": "s", "id": 17449, "pid": 76337, "tid": -914061504, "ts": 1716454216888672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216888808, "dur": 34, "args": { "External id": 17451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17451, "pid": 5, "tid": 7, "ts": 1716454216888808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216888699, "dur": 6, "args": { "External id": 17451, "cbid": 211, "correlation": 17451 } }, { "ph": "s", "id": 17451, "pid": 76337, "tid": -914061504, "ts": 1716454216888699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216888844, "dur": 3, "args": { "External id": 17453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17453, "pid": 5, "tid": 7, "ts": 1716454216888844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216888710, "dur": 8, "args": { "External id": 17453, "cbid": 211, "correlation": 17453 } }, { "ph": "s", "id": 17453, "pid": 76337, "tid": -914061504, "ts": 1716454216888710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216888722, "dur": 0, "args": { "External id": 17454, "cbid": 51, "correlation": 17454 } }, { "ph": "s", "id": 17454, "pid": 76337, "tid": -914061504, "ts": 1716454216888722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216888848, "dur": 2025, "args": { "External id": 17455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17455, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17455, "pid": 5, "tid": 7, "ts": 1716454216888848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216888723, "dur": 7, "args": { "External id": 17455, "cbid": 211, "correlation": 17455 } }, { "ph": "s", "id": 17455, "pid": 76337, "tid": -914061504, "ts": 1716454216888723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216890875, "dur": 59, "args": { "External id": 17460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17460, "pid": 5, "tid": 7, "ts": 1716454216890875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889460, "dur": 12, "args": { "External id": 17460, "cbid": 211, "correlation": 17460 } }, { "ph": "s", "id": 17460, "pid": 76337, "tid": -914061504, "ts": 1716454216889460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216890936, "dur": 3, "args": { "External id": 17468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17468, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17468, "pid": 5, "tid": 7, "ts": 1716454216890936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889537, "dur": 11, "args": { "External id": 17468, "cbid": 211, "correlation": 17468 } }, { "ph": "s", "id": 17468, "pid": 76337, "tid": -914061504, "ts": 1716454216889537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216889697, "dur": 4, "args": { "External id": 17484, "cbid": 251, "correlation": 17484 } }, { "ph": "f", "id": 17484, "pid": 76337, "tid": -914061504, "ts": 1716454216889697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216889707, "dur": 0, "args": { "External id": 17486, "cbid": 251, "correlation": 17486 } }, { "ph": "f", "id": 17486, "pid": 76337, "tid": -914061504, "ts": 1716454216889707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216890940, "dur": 11, "args": { "External id": 17487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17487, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 17487, "pid": 5, "tid": 7, "ts": 1716454216890940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889712, "dur": 16, "args": { "External id": 17487, "cbid": 211, "correlation": 17487 } }, { "ph": "s", "id": 17487, "pid": 76337, "tid": -914061504, "ts": 1716454216889712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216890953, "dur": 5, "args": { "External id": 17489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17489, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 17489, "pid": 5, "tid": 7, "ts": 1716454216890953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889733, "dur": 8, "args": { "External id": 17489, "cbid": 211, "correlation": 17489 } }, { "ph": "s", "id": 17489, "pid": 76337, "tid": -914061504, "ts": 1716454216889733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216890959, "dur": 54, "args": { "External id": 17499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17499, "pid": 5, "tid": 7, "ts": 1716454216890959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889827, "dur": 13, "args": { "External id": 17499, "cbid": 211, "correlation": 17499 } }, { "ph": "s", "id": 17499, "pid": 76337, "tid": -914061504, "ts": 1716454216889827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216891014, "dur": 52, "args": { "External id": 17519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17519, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 17519, "pid": 5, "tid": 7, "ts": 1716454216891014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889919, "dur": 13, "args": { "External id": 17519, "cbid": 211, "correlation": 17519 } }, { "ph": "s", "id": 17519, "pid": 76337, "tid": -914061504, "ts": 1716454216889919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216891068, "dur": 4, "args": { "External id": 17531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17531, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17531, "pid": 5, "tid": 7, "ts": 1716454216891068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889944, "dur": 8, "args": { "External id": 17531, "cbid": 211, "correlation": 17531 } }, { "ph": "s", "id": 17531, "pid": 76337, "tid": -914061504, "ts": 1716454216889944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216891073, "dur": 55, "args": { "External id": 17534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17534, "pid": 5, "tid": 7, "ts": 1716454216891073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216889971, "dur": 16, "args": { "External id": 17534, "cbid": 211, "correlation": 17534 } }, { "ph": "s", "id": 17534, "pid": 76337, "tid": -914061504, "ts": 1716454216889971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216891130, "dur": 37, "args": { "External id": 17543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17543, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17543, "pid": 5, "tid": 7, "ts": 1716454216891130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890025, "dur": 11, "args": { "External id": 17543, "cbid": 211, "correlation": 17543 } }, { "ph": "s", "id": 17543, "pid": 76337, "tid": -914061504, "ts": 1716454216890025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216890132, "dur": 0, "args": { "External id": 17553, "cbid": 317, "correlation": 17553 } }, { "ph": "f", "id": 17553, "pid": 76337, "tid": -914061504, "ts": 1716454216890132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216890133, "dur": 1, "args": { "External id": 17554, "cbid": 203, "correlation": 17554 } }, { "ph": "f", "id": 17554, "pid": 76337, "tid": -914061504, "ts": 1716454216890133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216890134, "dur": 1, "args": { "External id": 17555, "cbid": 205, "correlation": 17555 } }, { "ph": "f", "id": 17555, "pid": 76337, "tid": -914061504, "ts": 1716454216890134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216891168, "dur": 39, "args": { "External id": 17559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17559, "pid": 5, "tid": 7, "ts": 1716454216891168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890154, "dur": 13, "args": { "External id": 17559, "cbid": 211, "correlation": 17559 } }, { "ph": "s", "id": 17559, "pid": 76337, "tid": -914061504, "ts": 1716454216890154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216891209, "dur": 14, "args": { "External id": 17561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17561, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17561, "pid": 5, "tid": 7, "ts": 1716454216891209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890170, "dur": 6, "args": { "External id": 17561, "cbid": 211, "correlation": 17561 } }, { "ph": "s", "id": 17561, "pid": 76337, "tid": -914061504, "ts": 1716454216890170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216891224, "dur": 4, "args": { "External id": 17563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17563, "pid": 5, "tid": 7, "ts": 1716454216891224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890181, "dur": 6, "args": { "External id": 17563, "cbid": 211, "correlation": 17563 } }, { "ph": "s", "id": 17563, "pid": 76337, "tid": -914061504, "ts": 1716454216890181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216890190, "dur": 0, "args": { "External id": 17564, "cbid": 51, "correlation": 17564 } }, { "ph": "s", "id": 17564, "pid": 76337, "tid": -914061504, "ts": 1716454216890190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216891229, "dur": 702, "args": { "External id": 17565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17565, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17565, "pid": 5, "tid": 7, "ts": 1716454216891229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890191, "dur": 5, "args": { "External id": 17565, "cbid": 211, "correlation": 17565 } }, { "ph": "s", "id": 17565, "pid": 76337, "tid": -914061504, "ts": 1716454216890191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216891932, "dur": 60, "args": { "External id": 17570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17570, "pid": 5, "tid": 7, "ts": 1716454216891932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216890220, "dur": 9, "args": { "External id": 17570, "cbid": 211, "correlation": 17570 } }, { "ph": "s", "id": 17570, "pid": 76337, "tid": -914061504, "ts": 1716454216890220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216890280, "dur": 0, "args": { "External id": 17580, "cbid": 317, "correlation": 17580 } }, { "ph": "f", "id": 17580, "pid": 76337, "tid": -914061504, "ts": 1716454216890280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216890281, "dur": 0, "args": { "External id": 17581, "cbid": 203, "correlation": 17581 } }, { "ph": "f", "id": 17581, "pid": 76337, "tid": -914061504, "ts": 1716454216890281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216890281, "dur": 0, "args": { "External id": 17582, "cbid": 205, "correlation": 17582 } }, { "ph": "f", "id": 17582, "pid": 76337, "tid": -914061504, "ts": 1716454216890281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216891993, "dur": 4, "args": { "External id": 17586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17586, "pid": 5, "tid": 7, "ts": 1716454216891993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216891836, "dur": 19, "args": { "External id": 17586, "cbid": 211, "correlation": 17586 } }, { "ph": "s", "id": 17586, "pid": 76337, "tid": -914061504, "ts": 1716454216891836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216891858, "dur": 0, "args": { "External id": 17587, "cbid": 51, "correlation": 17587 } }, { "ph": "s", "id": 17587, "pid": 76337, "tid": -914061504, "ts": 1716454216891858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454216892027, "dur": 262, "args": { "External id": 17588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17588, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17588, "pid": 5, "tid": 7, "ts": 1716454216892027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216891873, "dur": 154, "args": { "External id": 17588, "cbid": 211, "correlation": 17588 } }, { "ph": "s", "id": 17588, "pid": 76337, "tid": -914061504, "ts": 1716454216891873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216892448, "dur": 59, "args": { "External id": 17593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17593, "pid": 5, "tid": 7, "ts": 1716454216892448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892437, "dur": 11, "args": { "External id": 17593, "cbid": 211, "correlation": 17593 } }, { "ph": "s", "id": 17593, "pid": 76337, "tid": -914061504, "ts": 1716454216892437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216892509, "dur": 50, "args": { "External id": 17601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17601, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17601, "pid": 5, "tid": 7, "ts": 1716454216892509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892477, "dur": 10, "args": { "External id": 17601, "cbid": 211, "correlation": 17601 } }, { "ph": "s", "id": 17601, "pid": 76337, "tid": -914061504, "ts": 1716454216892477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216892560, "dur": 35, "args": { "External id": 17609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17609, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17609, "pid": 5, "tid": 7, "ts": 1716454216892560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892529, "dur": 11, "args": { "External id": 17609, "cbid": 211, "correlation": 17609 } }, { "ph": "s", "id": 17609, "pid": 76337, "tid": -914061504, "ts": 1716454216892529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216892666, "dur": 52, "args": { "External id": 17629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17629, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 17629, "pid": 5, "tid": 7, "ts": 1716454216892666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892653, "dur": 13, "args": { "External id": 17629, "cbid": 211, "correlation": 17629 } }, { "ph": "s", "id": 17629, "pid": 76337, "tid": -914061504, "ts": 1716454216892653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216892720, "dur": 4, "args": { "External id": 17641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17641, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 17641, "pid": 5, "tid": 7, "ts": 1716454216892720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892689, "dur": 9, "args": { "External id": 17641, "cbid": 211, "correlation": 17641 } }, { "ph": "s", "id": 17641, "pid": 76337, "tid": -914061504, "ts": 1716454216892689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216892725, "dur": 58, "args": { "External id": 17644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17644, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17644, "pid": 5, "tid": 7, "ts": 1716454216892725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892713, "dur": 8, "args": { "External id": 17644, "cbid": 211, "correlation": 17644 } }, { "ph": "s", "id": 17644, "pid": 76337, "tid": -914061504, "ts": 1716454216892713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216892796, "dur": 0, "args": { "External id": 17655, "cbid": 317, "correlation": 17655 } }, { "ph": "f", "id": 17655, "pid": 76337, "tid": -914061504, "ts": 1716454216892796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216892797, "dur": 1, "args": { "External id": 17656, "cbid": 203, "correlation": 17656 } }, { "ph": "f", "id": 17656, "pid": 76337, "tid": -914061504, "ts": 1716454216892797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216892798, "dur": 0, "args": { "External id": 17657, "cbid": 205, "correlation": 17657 } }, { "ph": "f", "id": 17657, "pid": 76337, "tid": -914061504, "ts": 1716454216892798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892849, "dur": 3, "args": { "External id": 17661, "cbid": 251, "correlation": 17661 } }, { "ph": "f", "id": 17661, "pid": 76337, "tid": -914061504, "ts": 1716454216892849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892854, "dur": 1, "args": { "External id": 17662, "cbid": 251, "correlation": 17662 } }, { "ph": "f", "id": 17662, "pid": 76337, "tid": -914061504, "ts": 1716454216892854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892856, "dur": 1, "args": { "External id": 17663, "cbid": 251, "correlation": 17663 } }, { "ph": "f", "id": 17663, "pid": 76337, "tid": -914061504, "ts": 1716454216892856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892858, "dur": 1, "args": { "External id": 17664, "cbid": 251, "correlation": 17664 } }, { "ph": "f", "id": 17664, "pid": 76337, "tid": -914061504, "ts": 1716454216892858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892860, "dur": 1, "args": { "External id": 17665, "cbid": 251, "correlation": 17665 } }, { "ph": "f", "id": 17665, "pid": 76337, "tid": -914061504, "ts": 1716454216892860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892862, "dur": 1, "args": { "External id": 17666, "cbid": 251, "correlation": 17666 } }, { "ph": "f", "id": 17666, "pid": 76337, "tid": -914061504, "ts": 1716454216892862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892864, "dur": 1, "args": { "External id": 17667, "cbid": 251, "correlation": 17667 } }, { "ph": "f", "id": 17667, "pid": 76337, "tid": -914061504, "ts": 1716454216892864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892866, "dur": 1, "args": { "External id": 17668, "cbid": 251, "correlation": 17668 } }, { "ph": "f", "id": 17668, "pid": 76337, "tid": -914061504, "ts": 1716454216892866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216892868, "dur": 0, "args": { "External id": 17669, "cbid": 251, "correlation": 17669 } }, { "ph": "f", "id": 17669, "pid": 76337, "tid": -914061504, "ts": 1716454216892868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216892892, "dur": 116, "args": { "External id": 17670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17670, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17670, "pid": 5, "tid": 7, "ts": 1716454216892892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892874, "dur": 18, "args": { "External id": 17670, "cbid": 211, "correlation": 17670 } }, { "ph": "s", "id": 17670, "pid": 76337, "tid": -914061504, "ts": 1716454216892874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216893009, "dur": 60, "args": { "External id": 17676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17676, "pid": 5, "tid": 7, "ts": 1716454216893009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216892916, "dur": 9, "args": { "External id": 17676, "cbid": 211, "correlation": 17676 } }, { "ph": "s", "id": 17676, "pid": 76337, "tid": -914061504, "ts": 1716454216892916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216893091, "dur": 149, "args": { "External id": 17687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17687, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17687, "pid": 5, "tid": 7, "ts": 1716454216893091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893072, "dur": 18, "args": { "External id": 17687, "cbid": 211, "correlation": 17687 } }, { "ph": "s", "id": 17687, "pid": 76337, "tid": -914061504, "ts": 1716454216893072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216893242, "dur": 95, "args": { "External id": 17709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17709, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17709, "pid": 5, "tid": 7, "ts": 1716454216893242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893126, "dur": 12, "args": { "External id": 17709, "cbid": 211, "correlation": 17709 } }, { "ph": "s", "id": 17709, "pid": 76337, "tid": -914061504, "ts": 1716454216893126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216893266, "dur": 2, "args": { "External id": 17720, "cbid": 251, "correlation": 17720 } }, { "ph": "f", "id": 17720, "pid": 76337, "tid": -914061504, "ts": 1716454216893266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216893338, "dur": 111, "args": { "External id": 17721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17721, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17721, "pid": 5, "tid": 7, "ts": 1716454216893338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893273, "dur": 16, "args": { "External id": 17721, "cbid": 211, "correlation": 17721 } }, { "ph": "s", "id": 17721, "pid": 76337, "tid": -914061504, "ts": 1716454216893273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216893365, "dur": 1, "args": { "External id": 17732, "cbid": 251, "correlation": 17732 } }, { "ph": "f", "id": 17732, "pid": 76337, "tid": -914061504, "ts": 1716454216893365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216893450, "dur": 103, "args": { "External id": 17733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17733, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17733, "pid": 5, "tid": 7, "ts": 1716454216893450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893369, "dur": 12, "args": { "External id": 17733, "cbid": 211, "correlation": 17733 } }, { "ph": "s", "id": 17733, "pid": 76337, "tid": -914061504, "ts": 1716454216893369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216893448, "dur": 1, "args": { "External id": 17744, "cbid": 251, "correlation": 17744 } }, { "ph": "f", "id": 17744, "pid": 76337, "tid": -914061504, "ts": 1716454216893448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216893554, "dur": 102, "args": { "External id": 17745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17745, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17745, "pid": 5, "tid": 7, "ts": 1716454216893554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893452, "dur": 13, "args": { "External id": 17745, "cbid": 211, "correlation": 17745 } }, { "ph": "s", "id": 17745, "pid": 76337, "tid": -914061504, "ts": 1716454216893452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216893658, "dur": 4776, "args": { "External id": 17766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17766, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17766, "pid": 5, "tid": 7, "ts": 1716454216893658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893566, "dur": 16, "args": { "External id": 17766, "cbid": 211, "correlation": 17766 } }, { "ph": "s", "id": 17766, "pid": 76337, "tid": -914061504, "ts": 1716454216893566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216893711, "dur": 2, "args": { "External id": 17784, "cbid": 251, "correlation": 17784 } }, { "ph": "f", "id": 17784, "pid": 76337, "tid": -914061504, "ts": 1716454216893711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216898435, "dur": 106, "args": { "External id": 17786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17786, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17786, "pid": 5, "tid": 7, "ts": 1716454216898435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893719, "dur": 15, "args": { "External id": 17786, "cbid": 211, "correlation": 17786 } }, { "ph": "s", "id": 17786, "pid": 76337, "tid": -914061504, "ts": 1716454216893719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216898542, "dur": 35, "args": { "External id": 17794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17794, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17794, "pid": 5, "tid": 7, "ts": 1716454216898542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893803, "dur": 13, "args": { "External id": 17794, "cbid": 211, "correlation": 17794 } }, { "ph": "s", "id": 17794, "pid": 76337, "tid": -914061504, "ts": 1716454216893803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216898579, "dur": 160, "args": { "External id": 17802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17802, "pid": 5, "tid": 7, "ts": 1716454216898579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893847, "dur": 10, "args": { "External id": 17802, "cbid": 211, "correlation": 17802 } }, { "ph": "s", "id": 17802, "pid": 76337, "tid": -914061504, "ts": 1716454216893847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216898741, "dur": 93, "args": { "External id": 17824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17824, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17824, "pid": 5, "tid": 7, "ts": 1716454216898741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216893903, "dur": 10, "args": { "External id": 17824, "cbid": 211, "correlation": 17824 } }, { "ph": "s", "id": 17824, "pid": 76337, "tid": -914061504, "ts": 1716454216893903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216893995, "dur": 1, "args": { "External id": 17835, "cbid": 251, "correlation": 17835 } }, { "ph": "f", "id": 17835, "pid": 76337, "tid": -914061504, "ts": 1716454216893995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216898835, "dur": 104, "args": { "External id": 17836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17836, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17836, "pid": 5, "tid": 7, "ts": 1716454216898835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894000, "dur": 13, "args": { "External id": 17836, "cbid": 211, "correlation": 17836 } }, { "ph": "s", "id": 17836, "pid": 76337, "tid": -914061504, "ts": 1716454216894000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894099, "dur": 2, "args": { "External id": 17847, "cbid": 251, "correlation": 17847 } }, { "ph": "f", "id": 17847, "pid": 76337, "tid": -914061504, "ts": 1716454216894099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894104, "dur": 0, "args": { "External id": 17848, "cbid": 251, "correlation": 17848 } }, { "ph": "f", "id": 17848, "pid": 76337, "tid": -914061504, "ts": 1716454216894104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216898941, "dur": 10, "args": { "External id": 17849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17849, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 17849, "pid": 5, "tid": 7, "ts": 1716454216898941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894107, "dur": 16, "args": { "External id": 17849, "cbid": 211, "correlation": 17849 } }, { "ph": "s", "id": 17849, "pid": 76337, "tid": -914061504, "ts": 1716454216894107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216898952, "dur": 5, "args": { "External id": 17851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17851, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 17851, "pid": 5, "tid": 7, "ts": 1716454216898952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894126, "dur": 9, "args": { "External id": 17851, "cbid": 211, "correlation": 17851 } }, { "ph": "s", "id": 17851, "pid": 76337, "tid": -914061504, "ts": 1716454216894126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894203, "dur": 1, "args": { "External id": 17862, "cbid": 251, "correlation": 17862 } }, { "ph": "f", "id": 17862, "pid": 76337, "tid": -914061504, "ts": 1716454216894203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894206, "dur": 0, "args": { "External id": 17863, "cbid": 251, "correlation": 17863 } }, { "ph": "f", "id": 17863, "pid": 76337, "tid": -914061504, "ts": 1716454216894206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216898959, "dur": 6, "args": { "External id": 17864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17864, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 17864, "pid": 5, "tid": 7, "ts": 1716454216898959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894208, "dur": 12, "args": { "External id": 17864, "cbid": 211, "correlation": 17864 } }, { "ph": "s", "id": 17864, "pid": 76337, "tid": -914061504, "ts": 1716454216894208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216898966, "dur": 3, "args": { "External id": 17866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17866, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 17866, "pid": 5, "tid": 7, "ts": 1716454216898966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894222, "dur": 6, "args": { "External id": 17866, "cbid": 211, "correlation": 17866 } }, { "ph": "s", "id": 17866, "pid": 76337, "tid": -914061504, "ts": 1716454216894222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216898971, "dur": 154, "args": { "External id": 17887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17887, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17887, "pid": 5, "tid": 7, "ts": 1716454216898971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894299, "dur": 12, "args": { "External id": 17887, "cbid": 211, "correlation": 17887 } }, { "ph": "s", "id": 17887, "pid": 76337, "tid": -914061504, "ts": 1716454216894299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894409, "dur": 1, "args": { "External id": 17905, "cbid": 251, "correlation": 17905 } }, { "ph": "f", "id": 17905, "pid": 76337, "tid": -914061504, "ts": 1716454216894409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216899126, "dur": 105, "args": { "External id": 17907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17907, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 17907, "pid": 5, "tid": 7, "ts": 1716454216899126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894415, "dur": 14, "args": { "External id": 17907, "cbid": 211, "correlation": 17907 } }, { "ph": "s", "id": 17907, "pid": 76337, "tid": -914061504, "ts": 1716454216894415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216899233, "dur": 34, "args": { "External id": 17915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17915, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17915, "pid": 5, "tid": 7, "ts": 1716454216899233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894487, "dur": 13, "args": { "External id": 17915, "cbid": 211, "correlation": 17915 } }, { "ph": "s", "id": 17915, "pid": 76337, "tid": -914061504, "ts": 1716454216894487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216899268, "dur": 51, "args": { "External id": 17923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17923, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17923, "pid": 5, "tid": 7, "ts": 1716454216899268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894527, "dur": 9, "args": { "External id": 17923, "cbid": 211, "correlation": 17923 } }, { "ph": "s", "id": 17923, "pid": 76337, "tid": -914061504, "ts": 1716454216894527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216899320, "dur": 94, "args": { "External id": 17945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17945, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17945, "pid": 5, "tid": 7, "ts": 1716454216899320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894591, "dur": 12, "args": { "External id": 17945, "cbid": 211, "correlation": 17945 } }, { "ph": "s", "id": 17945, "pid": 76337, "tid": -914061504, "ts": 1716454216894591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894698, "dur": 2, "args": { "External id": 17961, "cbid": 251, "correlation": 17961 } }, { "ph": "f", "id": 17961, "pid": 76337, "tid": -914061504, "ts": 1716454216894698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216899415, "dur": 580, "args": { "External id": 17963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17963, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 17963, "pid": 5, "tid": 7, "ts": 1716454216899415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894705, "dur": 14, "args": { "External id": 17963, "cbid": 211, "correlation": 17963 } }, { "ph": "s", "id": 17963, "pid": 76337, "tid": -914061504, "ts": 1716454216894705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216899996, "dur": 245, "args": { "External id": 17971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17971, "pid": 5, "tid": 7, "ts": 1716454216899996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894803, "dur": 15, "args": { "External id": 17971, "cbid": 211, "correlation": 17971 } }, { "ph": "s", "id": 17971, "pid": 76337, "tid": -914061504, "ts": 1716454216894803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216900243, "dur": 253, "args": { "External id": 17979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 17979, "pid": 5, "tid": 7, "ts": 1716454216900243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894845, "dur": 11, "args": { "External id": 17979, "cbid": 211, "correlation": 17979 } }, { "ph": "s", "id": 17979, "pid": 76337, "tid": -914061504, "ts": 1716454216894845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894945, "dur": 2, "args": { "External id": 17995, "cbid": 251, "correlation": 17995 } }, { "ph": "f", "id": 17995, "pid": 76337, "tid": -914061504, "ts": 1716454216894945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216894951, "dur": 0, "args": { "External id": 17997, "cbid": 251, "correlation": 17997 } }, { "ph": "f", "id": 17997, "pid": 76337, "tid": -914061504, "ts": 1716454216894951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216900497, "dur": 357, "args": { "External id": 17998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 17998, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 17998, "pid": 5, "tid": 7, "ts": 1716454216900497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216894956, "dur": 14, "args": { "External id": 17998, "cbid": 211, "correlation": 17998 } }, { "ph": "s", "id": 17998, "pid": 76337, "tid": -914061504, "ts": 1716454216894956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216900856, "dur": 50, "args": { "External id": 18006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18006, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18006, "pid": 5, "tid": 7, "ts": 1716454216900856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895010, "dur": 11, "args": { "External id": 18006, "cbid": 211, "correlation": 18006 } }, { "ph": "s", "id": 18006, "pid": 76337, "tid": -914061504, "ts": 1716454216895010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216900907, "dur": 158, "args": { "External id": 18017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18017, "pid": 5, "tid": 7, "ts": 1716454216900907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895081, "dur": 13, "args": { "External id": 18017, "cbid": 211, "correlation": 18017 } }, { "ph": "s", "id": 18017, "pid": 76337, "tid": -914061504, "ts": 1716454216895081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216895158, "dur": 0, "args": { "External id": 18029, "cbid": 317, "correlation": 18029 } }, { "ph": "f", "id": 18029, "pid": 76337, "tid": -914061504, "ts": 1716454216895158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216895159, "dur": 0, "args": { "External id": 18030, "cbid": 203, "correlation": 18030 } }, { "ph": "f", "id": 18030, "pid": 76337, "tid": -914061504, "ts": 1716454216895159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216895159, "dur": 0, "args": { "External id": 18031, "cbid": 205, "correlation": 18031 } }, { "ph": "f", "id": 18031, "pid": 76337, "tid": -914061504, "ts": 1716454216895159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895186, "dur": 1, "args": { "External id": 18035, "cbid": 251, "correlation": 18035 } }, { "ph": "f", "id": 18035, "pid": 76337, "tid": -914061504, "ts": 1716454216895186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895189, "dur": 0, "args": { "External id": 18036, "cbid": 251, "correlation": 18036 } }, { "ph": "f", "id": 18036, "pid": 76337, "tid": -914061504, "ts": 1716454216895189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895190, "dur": 0, "args": { "External id": 18037, "cbid": 251, "correlation": 18037 } }, { "ph": "f", "id": 18037, "pid": 76337, "tid": -914061504, "ts": 1716454216895190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895191, "dur": 0, "args": { "External id": 18038, "cbid": 251, "correlation": 18038 } }, { "ph": "f", "id": 18038, "pid": 76337, "tid": -914061504, "ts": 1716454216895191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895192, "dur": 1, "args": { "External id": 18039, "cbid": 251, "correlation": 18039 } }, { "ph": "f", "id": 18039, "pid": 76337, "tid": -914061504, "ts": 1716454216895192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895194, "dur": 0, "args": { "External id": 18040, "cbid": 251, "correlation": 18040 } }, { "ph": "f", "id": 18040, "pid": 76337, "tid": -914061504, "ts": 1716454216895194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895194, "dur": 1, "args": { "External id": 18041, "cbid": 251, "correlation": 18041 } }, { "ph": "f", "id": 18041, "pid": 76337, "tid": -914061504, "ts": 1716454216895194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895196, "dur": 0, "args": { "External id": 18042, "cbid": 251, "correlation": 18042 } }, { "ph": "f", "id": 18042, "pid": 76337, "tid": -914061504, "ts": 1716454216895196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216895197, "dur": 0, "args": { "External id": 18043, "cbid": 251, "correlation": 18043 } }, { "ph": "f", "id": 18043, "pid": 76337, "tid": -914061504, "ts": 1716454216895197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216901066, "dur": 114, "args": { "External id": 18044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18044, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18044, "pid": 5, "tid": 7, "ts": 1716454216901066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895199, "dur": 13, "args": { "External id": 18044, "cbid": 211, "correlation": 18044 } }, { "ph": "s", "id": 18044, "pid": 76337, "tid": -914061504, "ts": 1716454216895199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216901181, "dur": 61, "args": { "External id": 18050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18050, "pid": 5, "tid": 7, "ts": 1716454216901181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895235, "dur": 10, "args": { "External id": 18050, "cbid": 211, "correlation": 18050 } }, { "ph": "s", "id": 18050, "pid": 76337, "tid": -914061504, "ts": 1716454216895235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216901244, "dur": 50, "args": { "External id": 18058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18058, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18058, "pid": 5, "tid": 7, "ts": 1716454216901244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895268, "dur": 8, "args": { "External id": 18058, "cbid": 211, "correlation": 18058 } }, { "ph": "s", "id": 18058, "pid": 76337, "tid": -914061504, "ts": 1716454216895268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216901294, "dur": 98, "args": { "External id": 18067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18067, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18067, "pid": 5, "tid": 7, "ts": 1716454216901294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895324, "dur": 13, "args": { "External id": 18067, "cbid": 211, "correlation": 18067 } }, { "ph": "s", "id": 18067, "pid": 76337, "tid": -914061504, "ts": 1716454216895324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216901394, "dur": 94, "args": { "External id": 18087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18087, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18087, "pid": 5, "tid": 7, "ts": 1716454216901394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895404, "dur": 11, "args": { "External id": 18087, "cbid": 211, "correlation": 18087 } }, { "ph": "s", "id": 18087, "pid": 76337, "tid": -914061504, "ts": 1716454216895404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216901490, "dur": 5, "args": { "External id": 18099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18099, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18099, "pid": 5, "tid": 7, "ts": 1716454216901490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895426, "dur": 6, "args": { "External id": 18099, "cbid": 211, "correlation": 18099 } }, { "ph": "s", "id": 18099, "pid": 76337, "tid": -914061504, "ts": 1716454216895426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216901496, "dur": 107, "args": { "External id": 18102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18102, "pid": 5, "tid": 7, "ts": 1716454216901496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895445, "dur": 7, "args": { "External id": 18102, "cbid": 211, "correlation": 18102 } }, { "ph": "s", "id": 18102, "pid": 76337, "tid": -914061504, "ts": 1716454216895445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216901605, "dur": 69, "args": { "External id": 18111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18111, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18111, "pid": 5, "tid": 7, "ts": 1716454216901605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216895490, "dur": 11, "args": { "External id": 18111, "cbid": 211, "correlation": 18111 } }, { "ph": "s", "id": 18111, "pid": 76337, "tid": -914061504, "ts": 1716454216895490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216895544, "dur": 0, "args": { "External id": 18121, "cbid": 317, "correlation": 18121 } }, { "ph": "f", "id": 18121, "pid": 76337, "tid": -914061504, "ts": 1716454216895544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216895544, "dur": 0, "args": { "External id": 18122, "cbid": 203, "correlation": 18122 } }, { "ph": "f", "id": 18122, "pid": 76337, "tid": -914061504, "ts": 1716454216895544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216895545, "dur": 0, "args": { "External id": 18123, "cbid": 205, "correlation": 18123 } }, { "ph": "f", "id": 18123, "pid": 76337, "tid": -914061504, "ts": 1716454216895545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216901675, "dur": 76, "args": { "External id": 18127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18127, "pid": 5, "tid": 7, "ts": 1716454216901675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216898727, "dur": 37, "args": { "External id": 18127, "cbid": 211, "correlation": 18127 } }, { "ph": "s", "id": 18127, "pid": 76337, "tid": -914061504, "ts": 1716454216898727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216901753, "dur": 24, "args": { "External id": 18129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18129, "pid": 5, "tid": 7, "ts": 1716454216901753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216898767, "dur": 6, "args": { "External id": 18129, "cbid": 211, "correlation": 18129 } }, { "ph": "s", "id": 18129, "pid": 76337, "tid": -914061504, "ts": 1716454216898767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216901778, "dur": 4, "args": { "External id": 18131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18131, "pid": 5, "tid": 7, "ts": 1716454216901778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216898782, "dur": 9, "args": { "External id": 18131, "cbid": 211, "correlation": 18131 } }, { "ph": "s", "id": 18131, "pid": 76337, "tid": -914061504, "ts": 1716454216898782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216898794, "dur": 0, "args": { "External id": 18132, "cbid": 51, "correlation": 18132 } }, { "ph": "s", "id": 18132, "pid": 76337, "tid": -914061504, "ts": 1716454216898794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216901783, "dur": 1371, "args": { "External id": 18133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18133, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18133, "pid": 5, "tid": 7, "ts": 1716454216901783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216898796, "dur": 8, "args": { "External id": 18133, "cbid": 211, "correlation": 18133 } }, { "ph": "s", "id": 18133, "pid": 76337, "tid": -914061504, "ts": 1716454216898796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216903156, "dur": 59, "args": { "External id": 18138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18138, "pid": 5, "tid": 7, "ts": 1716454216903156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216899566, "dur": 14, "args": { "External id": 18138, "cbid": 211, "correlation": 18138 } }, { "ph": "s", "id": 18138, "pid": 76337, "tid": -914061504, "ts": 1716454216899566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216903216, "dur": 3, "args": { "External id": 18146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18146, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18146, "pid": 5, "tid": 7, "ts": 1716454216903216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216899663, "dur": 13, "args": { "External id": 18146, "cbid": 211, "correlation": 18146 } }, { "ph": "s", "id": 18146, "pid": 76337, "tid": -914061504, "ts": 1716454216899663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216899808, "dur": 4, "args": { "External id": 18162, "cbid": 251, "correlation": 18162 } }, { "ph": "f", "id": 18162, "pid": 76337, "tid": -914061504, "ts": 1716454216899808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216899819, "dur": 0, "args": { "External id": 18164, "cbid": 251, "correlation": 18164 } }, { "ph": "f", "id": 18164, "pid": 76337, "tid": -914061504, "ts": 1716454216899819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216903221, "dur": 11, "args": { "External id": 18165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18165, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 18165, "pid": 5, "tid": 7, "ts": 1716454216903221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216899823, "dur": 16, "args": { "External id": 18165, "cbid": 211, "correlation": 18165 } }, { "ph": "s", "id": 18165, "pid": 76337, "tid": -914061504, "ts": 1716454216899823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216903233, "dur": 5, "args": { "External id": 18167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18167, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18167, "pid": 5, "tid": 7, "ts": 1716454216903233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216899844, "dur": 9, "args": { "External id": 18167, "cbid": 211, "correlation": 18167 } }, { "ph": "s", "id": 18167, "pid": 76337, "tid": -914061504, "ts": 1716454216899844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216903240, "dur": 53, "args": { "External id": 18177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18177, "pid": 5, "tid": 7, "ts": 1716454216903240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216899936, "dur": 12, "args": { "External id": 18177, "cbid": 211, "correlation": 18177 } }, { "ph": "s", "id": 18177, "pid": 76337, "tid": -914061504, "ts": 1716454216899936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216903294, "dur": 52, "args": { "External id": 18197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18197, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18197, "pid": 5, "tid": 7, "ts": 1716454216903294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900043, "dur": 12, "args": { "External id": 18197, "cbid": 211, "correlation": 18197 } }, { "ph": "s", "id": 18197, "pid": 76337, "tid": -914061504, "ts": 1716454216900043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216903348, "dur": 4, "args": { "External id": 18209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18209, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18209, "pid": 5, "tid": 7, "ts": 1716454216903348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900066, "dur": 8, "args": { "External id": 18209, "cbid": 211, "correlation": 18209 } }, { "ph": "s", "id": 18209, "pid": 76337, "tid": -914061504, "ts": 1716454216900066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216903353, "dur": 54, "args": { "External id": 18212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18212, "pid": 5, "tid": 7, "ts": 1716454216903353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900097, "dur": 8, "args": { "External id": 18212, "cbid": 211, "correlation": 18212 } }, { "ph": "s", "id": 18212, "pid": 76337, "tid": -914061504, "ts": 1716454216900097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216903409, "dur": 36, "args": { "External id": 18221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18221, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18221, "pid": 5, "tid": 7, "ts": 1716454216903409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900143, "dur": 10, "args": { "External id": 18221, "cbid": 211, "correlation": 18221 } }, { "ph": "s", "id": 18221, "pid": 76337, "tid": -914061504, "ts": 1716454216900143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216900253, "dur": 0, "args": { "External id": 18231, "cbid": 317, "correlation": 18231 } }, { "ph": "f", "id": 18231, "pid": 76337, "tid": -914061504, "ts": 1716454216900253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216900255, "dur": 1, "args": { "External id": 18232, "cbid": 203, "correlation": 18232 } }, { "ph": "f", "id": 18232, "pid": 76337, "tid": -914061504, "ts": 1716454216900255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216900256, "dur": 1, "args": { "External id": 18233, "cbid": 205, "correlation": 18233 } }, { "ph": "f", "id": 18233, "pid": 76337, "tid": -914061504, "ts": 1716454216900256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216903447, "dur": 40, "args": { "External id": 18237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18237, "pid": 5, "tid": 7, "ts": 1716454216903447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900279, "dur": 13, "args": { "External id": 18237, "cbid": 211, "correlation": 18237 } }, { "ph": "s", "id": 18237, "pid": 76337, "tid": -914061504, "ts": 1716454216900279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216903488, "dur": 15, "args": { "External id": 18239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18239, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18239, "pid": 5, "tid": 7, "ts": 1716454216903488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900295, "dur": 5, "args": { "External id": 18239, "cbid": 211, "correlation": 18239 } }, { "ph": "s", "id": 18239, "pid": 76337, "tid": -914061504, "ts": 1716454216900295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216903504, "dur": 3, "args": { "External id": 18241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18241, "pid": 5, "tid": 7, "ts": 1716454216903504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900305, "dur": 6, "args": { "External id": 18241, "cbid": 211, "correlation": 18241 } }, { "ph": "s", "id": 18241, "pid": 76337, "tid": -914061504, "ts": 1716454216900305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216900314, "dur": 0, "args": { "External id": 18242, "cbid": 51, "correlation": 18242 } }, { "ph": "s", "id": 18242, "pid": 76337, "tid": -914061504, "ts": 1716454216900314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216903508, "dur": 702, "args": { "External id": 18243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18243, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18243, "pid": 5, "tid": 7, "ts": 1716454216903508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900315, "dur": 6, "args": { "External id": 18243, "cbid": 211, "correlation": 18243 } }, { "ph": "s", "id": 18243, "pid": 76337, "tid": -914061504, "ts": 1716454216900315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216904212, "dur": 60, "args": { "External id": 18248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18248, "pid": 5, "tid": 7, "ts": 1716454216904212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216900345, "dur": 8, "args": { "External id": 18248, "cbid": 211, "correlation": 18248 } }, { "ph": "s", "id": 18248, "pid": 76337, "tid": -914061504, "ts": 1716454216900345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216900403, "dur": 0, "args": { "External id": 18258, "cbid": 317, "correlation": 18258 } }, { "ph": "f", "id": 18258, "pid": 76337, "tid": -914061504, "ts": 1716454216900403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216900404, "dur": 0, "args": { "External id": 18259, "cbid": 203, "correlation": 18259 } }, { "ph": "f", "id": 18259, "pid": 76337, "tid": -914061504, "ts": 1716454216900404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216900405, "dur": 0, "args": { "External id": 18260, "cbid": 205, "correlation": 18260 } }, { "ph": "f", "id": 18260, "pid": 76337, "tid": -914061504, "ts": 1716454216900405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216904273, "dur": 76, "args": { "External id": 18264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18264, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18264, "pid": 5, "tid": 7, "ts": 1716454216904273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902049, "dur": 20, "args": { "External id": 18264, "cbid": 211, "correlation": 18264 } }, { "ph": "s", "id": 18264, "pid": 76337, "tid": -914061504, "ts": 1716454216902049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216904373, "dur": 201, "args": { "External id": 18266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18266, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18266, "pid": 5, "tid": 7, "ts": 1716454216904373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902094, "dur": 207, "args": { "External id": 18266, "cbid": 211, "correlation": 18266 } }, { "ph": "s", "id": 18266, "pid": 76337, "tid": -914061504, "ts": 1716454216902094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216904576, "dur": 39, "args": { "External id": 18268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18268, "pid": 5, "tid": 7, "ts": 1716454216904576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902309, "dur": 8, "args": { "External id": 18268, "cbid": 211, "correlation": 18268 } }, { "ph": "s", "id": 18268, "pid": 76337, "tid": -914061504, "ts": 1716454216902309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216904616, "dur": 61, "args": { "External id": 18274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18274, "pid": 5, "tid": 7, "ts": 1716454216904616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902740, "dur": 11, "args": { "External id": 18274, "cbid": 211, "correlation": 18274 } }, { "ph": "s", "id": 18274, "pid": 76337, "tid": -914061504, "ts": 1716454216902740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216904679, "dur": 50, "args": { "External id": 18282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18282, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18282, "pid": 5, "tid": 7, "ts": 1716454216904679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902783, "dur": 8, "args": { "External id": 18282, "cbid": 211, "correlation": 18282 } }, { "ph": "s", "id": 18282, "pid": 76337, "tid": -914061504, "ts": 1716454216902783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216904730, "dur": 35, "args": { "External id": 18290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18290, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18290, "pid": 5, "tid": 7, "ts": 1716454216904730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902835, "dur": 10, "args": { "External id": 18290, "cbid": 211, "correlation": 18290 } }, { "ph": "s", "id": 18290, "pid": 76337, "tid": -914061504, "ts": 1716454216902835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216904767, "dur": 53, "args": { "External id": 18310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18310, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18310, "pid": 5, "tid": 7, "ts": 1716454216904767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216902986, "dur": 16, "args": { "External id": 18310, "cbid": 211, "correlation": 18310 } }, { "ph": "s", "id": 18310, "pid": 76337, "tid": -914061504, "ts": 1716454216902986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216904821, "dur": 4, "args": { "External id": 18322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18322, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18322, "pid": 5, "tid": 7, "ts": 1716454216904821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903012, "dur": 8, "args": { "External id": 18322, "cbid": 211, "correlation": 18322 } }, { "ph": "s", "id": 18322, "pid": 76337, "tid": -914061504, "ts": 1716454216903012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216904826, "dur": 57, "args": { "External id": 18325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18325, "pid": 5, "tid": 7, "ts": 1716454216904826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903036, "dur": 7, "args": { "External id": 18325, "cbid": 211, "correlation": 18325 } }, { "ph": "s", "id": 18325, "pid": 76337, "tid": -914061504, "ts": 1716454216903036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216903110, "dur": 0, "args": { "External id": 18336, "cbid": 317, "correlation": 18336 } }, { "ph": "f", "id": 18336, "pid": 76337, "tid": -914061504, "ts": 1716454216903110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216903110, "dur": 0, "args": { "External id": 18337, "cbid": 203, "correlation": 18337 } }, { "ph": "f", "id": 18337, "pid": 76337, "tid": -914061504, "ts": 1716454216903110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216903111, "dur": 0, "args": { "External id": 18338, "cbid": 205, "correlation": 18338 } }, { "ph": "f", "id": 18338, "pid": 76337, "tid": -914061504, "ts": 1716454216903111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903149, "dur": 2, "args": { "External id": 18342, "cbid": 251, "correlation": 18342 } }, { "ph": "f", "id": 18342, "pid": 76337, "tid": -914061504, "ts": 1716454216903149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903153, "dur": 1, "args": { "External id": 18343, "cbid": 251, "correlation": 18343 } }, { "ph": "f", "id": 18343, "pid": 76337, "tid": -914061504, "ts": 1716454216903153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903155, "dur": 1, "args": { "External id": 18344, "cbid": 251, "correlation": 18344 } }, { "ph": "f", "id": 18344, "pid": 76337, "tid": -914061504, "ts": 1716454216903155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903157, "dur": 1, "args": { "External id": 18345, "cbid": 251, "correlation": 18345 } }, { "ph": "f", "id": 18345, "pid": 76337, "tid": -914061504, "ts": 1716454216903157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903158, "dur": 1, "args": { "External id": 18346, "cbid": 251, "correlation": 18346 } }, { "ph": "f", "id": 18346, "pid": 76337, "tid": -914061504, "ts": 1716454216903158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903160, "dur": 1, "args": { "External id": 18347, "cbid": 251, "correlation": 18347 } }, { "ph": "f", "id": 18347, "pid": 76337, "tid": -914061504, "ts": 1716454216903160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903162, "dur": 1, "args": { "External id": 18348, "cbid": 251, "correlation": 18348 } }, { "ph": "f", "id": 18348, "pid": 76337, "tid": -914061504, "ts": 1716454216903162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903164, "dur": 1, "args": { "External id": 18349, "cbid": 251, "correlation": 18349 } }, { "ph": "f", "id": 18349, "pid": 76337, "tid": -914061504, "ts": 1716454216903164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903167, "dur": 0, "args": { "External id": 18350, "cbid": 251, "correlation": 18350 } }, { "ph": "f", "id": 18350, "pid": 76337, "tid": -914061504, "ts": 1716454216903167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216904885, "dur": 118, "args": { "External id": 18351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18351, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18351, "pid": 5, "tid": 7, "ts": 1716454216904885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903171, "dur": 16, "args": { "External id": 18351, "cbid": 211, "correlation": 18351 } }, { "ph": "s", "id": 18351, "pid": 76337, "tid": -914061504, "ts": 1716454216903171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216905004, "dur": 59, "args": { "External id": 18357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18357, "pid": 5, "tid": 7, "ts": 1716454216905004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903222, "dur": 11, "args": { "External id": 18357, "cbid": 211, "correlation": 18357 } }, { "ph": "s", "id": 18357, "pid": 76337, "tid": -914061504, "ts": 1716454216903222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216905064, "dur": 149, "args": { "External id": 18368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18368, "pid": 5, "tid": 7, "ts": 1716454216905064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903386, "dur": 18, "args": { "External id": 18368, "cbid": 211, "correlation": 18368 } }, { "ph": "s", "id": 18368, "pid": 76337, "tid": -914061504, "ts": 1716454216903386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216905215, "dur": 95, "args": { "External id": 18390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18390, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18390, "pid": 5, "tid": 7, "ts": 1716454216905215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903430, "dur": 10, "args": { "External id": 18390, "cbid": 211, "correlation": 18390 } }, { "ph": "s", "id": 18390, "pid": 76337, "tid": -914061504, "ts": 1716454216903430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903581, "dur": 2, "args": { "External id": 18401, "cbid": 251, "correlation": 18401 } }, { "ph": "f", "id": 18401, "pid": 76337, "tid": -914061504, "ts": 1716454216903581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216905312, "dur": 110, "args": { "External id": 18402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18402, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18402, "pid": 5, "tid": 7, "ts": 1716454216905312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903588, "dur": 15, "args": { "External id": 18402, "cbid": 211, "correlation": 18402 } }, { "ph": "s", "id": 18402, "pid": 76337, "tid": -914061504, "ts": 1716454216903588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903673, "dur": 1, "args": { "External id": 18413, "cbid": 251, "correlation": 18413 } }, { "ph": "f", "id": 18413, "pid": 76337, "tid": -914061504, "ts": 1716454216903673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216905423, "dur": 104, "args": { "External id": 18414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18414, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18414, "pid": 5, "tid": 7, "ts": 1716454216905423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903678, "dur": 13, "args": { "External id": 18414, "cbid": 211, "correlation": 18414 } }, { "ph": "s", "id": 18414, "pid": 76337, "tid": -914061504, "ts": 1716454216903678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216903764, "dur": 1, "args": { "External id": 18425, "cbid": 251, "correlation": 18425 } }, { "ph": "f", "id": 18425, "pid": 76337, "tid": -914061504, "ts": 1716454216903764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216905528, "dur": 105, "args": { "External id": 18426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18426, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18426, "pid": 5, "tid": 7, "ts": 1716454216905528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903769, "dur": 12, "args": { "External id": 18426, "cbid": 211, "correlation": 18426 } }, { "ph": "s", "id": 18426, "pid": 76337, "tid": -914061504, "ts": 1716454216903769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216905635, "dur": 4778, "args": { "External id": 18447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18447, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18447, "pid": 5, "tid": 7, "ts": 1716454216905635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216903883, "dur": 16, "args": { "External id": 18447, "cbid": 211, "correlation": 18447 } }, { "ph": "s", "id": 18447, "pid": 76337, "tid": -914061504, "ts": 1716454216903883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904031, "dur": 2, "args": { "External id": 18465, "cbid": 251, "correlation": 18465 } }, { "ph": "f", "id": 18465, "pid": 76337, "tid": -914061504, "ts": 1716454216904031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216910414, "dur": 108, "args": { "External id": 18467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18467, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18467, "pid": 5, "tid": 7, "ts": 1716454216910414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904038, "dur": 15, "args": { "External id": 18467, "cbid": 211, "correlation": 18467 } }, { "ph": "s", "id": 18467, "pid": 76337, "tid": -914061504, "ts": 1716454216904038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216910523, "dur": 35, "args": { "External id": 18475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18475, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18475, "pid": 5, "tid": 7, "ts": 1716454216910523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904125, "dur": 13, "args": { "External id": 18475, "cbid": 211, "correlation": 18475 } }, { "ph": "s", "id": 18475, "pid": 76337, "tid": -914061504, "ts": 1716454216904125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216910560, "dur": 158, "args": { "External id": 18483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18483, "pid": 5, "tid": 7, "ts": 1716454216910560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904168, "dur": 9, "args": { "External id": 18483, "cbid": 211, "correlation": 18483 } }, { "ph": "s", "id": 18483, "pid": 76337, "tid": -914061504, "ts": 1716454216904168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216910719, "dur": 93, "args": { "External id": 18505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18505, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18505, "pid": 5, "tid": 7, "ts": 1716454216910719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904237, "dur": 11, "args": { "External id": 18505, "cbid": 211, "correlation": 18505 } }, { "ph": "s", "id": 18505, "pid": 76337, "tid": -914061504, "ts": 1716454216904237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904343, "dur": 1, "args": { "External id": 18516, "cbid": 251, "correlation": 18516 } }, { "ph": "f", "id": 18516, "pid": 76337, "tid": -914061504, "ts": 1716454216904343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216910814, "dur": 105, "args": { "External id": 18517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18517, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18517, "pid": 5, "tid": 7, "ts": 1716454216910814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904348, "dur": 14, "args": { "External id": 18517, "cbid": 211, "correlation": 18517 } }, { "ph": "s", "id": 18517, "pid": 76337, "tid": -914061504, "ts": 1716454216904348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904434, "dur": 2, "args": { "External id": 18528, "cbid": 251, "correlation": 18528 } }, { "ph": "f", "id": 18528, "pid": 76337, "tid": -914061504, "ts": 1716454216904434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904439, "dur": 0, "args": { "External id": 18529, "cbid": 251, "correlation": 18529 } }, { "ph": "f", "id": 18529, "pid": 76337, "tid": -914061504, "ts": 1716454216904439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216910921, "dur": 10, "args": { "External id": 18530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18530, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 18530, "pid": 5, "tid": 7, "ts": 1716454216910921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904441, "dur": 14, "args": { "External id": 18530, "cbid": 211, "correlation": 18530 } }, { "ph": "s", "id": 18530, "pid": 76337, "tid": -914061504, "ts": 1716454216904441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216910932, "dur": 5, "args": { "External id": 18532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18532, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18532, "pid": 5, "tid": 7, "ts": 1716454216910932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904459, "dur": 8, "args": { "External id": 18532, "cbid": 211, "correlation": 18532 } }, { "ph": "s", "id": 18532, "pid": 76337, "tid": -914061504, "ts": 1716454216904459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904536, "dur": 1, "args": { "External id": 18543, "cbid": 251, "correlation": 18543 } }, { "ph": "f", "id": 18543, "pid": 76337, "tid": -914061504, "ts": 1716454216904536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904539, "dur": 0, "args": { "External id": 18544, "cbid": 251, "correlation": 18544 } }, { "ph": "f", "id": 18544, "pid": 76337, "tid": -914061504, "ts": 1716454216904539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216910939, "dur": 6, "args": { "External id": 18545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18545, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 18545, "pid": 5, "tid": 7, "ts": 1716454216910939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904541, "dur": 13, "args": { "External id": 18545, "cbid": 211, "correlation": 18545 } }, { "ph": "s", "id": 18545, "pid": 76337, "tid": -914061504, "ts": 1716454216904541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216910947, "dur": 3, "args": { "External id": 18547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18547, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18547, "pid": 5, "tid": 7, "ts": 1716454216910947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904555, "dur": 6, "args": { "External id": 18547, "cbid": 211, "correlation": 18547 } }, { "ph": "s", "id": 18547, "pid": 76337, "tid": -914061504, "ts": 1716454216904555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216910952, "dur": 156, "args": { "External id": 18568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18568, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18568, "pid": 5, "tid": 7, "ts": 1716454216910952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904632, "dur": 12, "args": { "External id": 18568, "cbid": 211, "correlation": 18568 } }, { "ph": "s", "id": 18568, "pid": 76337, "tid": -914061504, "ts": 1716454216904632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216904743, "dur": 1, "args": { "External id": 18586, "cbid": 251, "correlation": 18586 } }, { "ph": "f", "id": 18586, "pid": 76337, "tid": -914061504, "ts": 1716454216904743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216911109, "dur": 105, "args": { "External id": 18588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18588, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18588, "pid": 5, "tid": 7, "ts": 1716454216911109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904749, "dur": 14, "args": { "External id": 18588, "cbid": 211, "correlation": 18588 } }, { "ph": "s", "id": 18588, "pid": 76337, "tid": -914061504, "ts": 1716454216904749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216911216, "dur": 35, "args": { "External id": 18596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18596, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18596, "pid": 5, "tid": 7, "ts": 1716454216911216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904822, "dur": 12, "args": { "External id": 18596, "cbid": 211, "correlation": 18596 } }, { "ph": "s", "id": 18596, "pid": 76337, "tid": -914061504, "ts": 1716454216904822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216911252, "dur": 51, "args": { "External id": 18604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18604, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18604, "pid": 5, "tid": 7, "ts": 1716454216911252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904863, "dur": 9, "args": { "External id": 18604, "cbid": 211, "correlation": 18604 } }, { "ph": "s", "id": 18604, "pid": 76337, "tid": -914061504, "ts": 1716454216904863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216911304, "dur": 93, "args": { "External id": 18626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18626, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18626, "pid": 5, "tid": 7, "ts": 1716454216911304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216904916, "dur": 10, "args": { "External id": 18626, "cbid": 211, "correlation": 18626 } }, { "ph": "s", "id": 18626, "pid": 76337, "tid": -914061504, "ts": 1716454216904916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905032, "dur": 3, "args": { "External id": 18642, "cbid": 251, "correlation": 18642 } }, { "ph": "f", "id": 18642, "pid": 76337, "tid": -914061504, "ts": 1716454216905032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216911399, "dur": 579, "args": { "External id": 18644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18644, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18644, "pid": 5, "tid": 7, "ts": 1716454216911399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905040, "dur": 15, "args": { "External id": 18644, "cbid": 211, "correlation": 18644 } }, { "ph": "s", "id": 18644, "pid": 76337, "tid": -914061504, "ts": 1716454216905040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216911979, "dur": 248, "args": { "External id": 18652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18652, "pid": 5, "tid": 7, "ts": 1716454216911979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905136, "dur": 15, "args": { "External id": 18652, "cbid": 211, "correlation": 18652 } }, { "ph": "s", "id": 18652, "pid": 76337, "tid": -914061504, "ts": 1716454216905136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216912228, "dur": 254, "args": { "External id": 18660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18660, "pid": 5, "tid": 7, "ts": 1716454216912228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905179, "dur": 12, "args": { "External id": 18660, "cbid": 211, "correlation": 18660 } }, { "ph": "s", "id": 18660, "pid": 76337, "tid": -914061504, "ts": 1716454216905179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905277, "dur": 2, "args": { "External id": 18676, "cbid": 251, "correlation": 18676 } }, { "ph": "f", "id": 18676, "pid": 76337, "tid": -914061504, "ts": 1716454216905277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905282, "dur": 0, "args": { "External id": 18678, "cbid": 251, "correlation": 18678 } }, { "ph": "f", "id": 18678, "pid": 76337, "tid": -914061504, "ts": 1716454216905282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216912483, "dur": 358, "args": { "External id": 18679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18679, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 18679, "pid": 5, "tid": 7, "ts": 1716454216912483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905288, "dur": 14, "args": { "External id": 18679, "cbid": 211, "correlation": 18679 } }, { "ph": "s", "id": 18679, "pid": 76337, "tid": -914061504, "ts": 1716454216905288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216912843, "dur": 49, "args": { "External id": 18687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18687, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18687, "pid": 5, "tid": 7, "ts": 1716454216912843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905332, "dur": 11, "args": { "External id": 18687, "cbid": 211, "correlation": 18687 } }, { "ph": "s", "id": 18687, "pid": 76337, "tid": -914061504, "ts": 1716454216905332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216912894, "dur": 159, "args": { "External id": 18698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18698, "pid": 5, "tid": 7, "ts": 1716454216912894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905402, "dur": 12, "args": { "External id": 18698, "cbid": 211, "correlation": 18698 } }, { "ph": "s", "id": 18698, "pid": 76337, "tid": -914061504, "ts": 1716454216905402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216905482, "dur": 0, "args": { "External id": 18710, "cbid": 317, "correlation": 18710 } }, { "ph": "f", "id": 18710, "pid": 76337, "tid": -914061504, "ts": 1716454216905482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216905483, "dur": 0, "args": { "External id": 18711, "cbid": 203, "correlation": 18711 } }, { "ph": "f", "id": 18711, "pid": 76337, "tid": -914061504, "ts": 1716454216905483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216905483, "dur": 0, "args": { "External id": 18712, "cbid": 205, "correlation": 18712 } }, { "ph": "f", "id": 18712, "pid": 76337, "tid": -914061504, "ts": 1716454216905483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905509, "dur": 1, "args": { "External id": 18716, "cbid": 251, "correlation": 18716 } }, { "ph": "f", "id": 18716, "pid": 76337, "tid": -914061504, "ts": 1716454216905509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905511, "dur": 0, "args": { "External id": 18717, "cbid": 251, "correlation": 18717 } }, { "ph": "f", "id": 18717, "pid": 76337, "tid": -914061504, "ts": 1716454216905511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905512, "dur": 0, "args": { "External id": 18718, "cbid": 251, "correlation": 18718 } }, { "ph": "f", "id": 18718, "pid": 76337, "tid": -914061504, "ts": 1716454216905512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905513, "dur": 0, "args": { "External id": 18719, "cbid": 251, "correlation": 18719 } }, { "ph": "f", "id": 18719, "pid": 76337, "tid": -914061504, "ts": 1716454216905513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905514, "dur": 1, "args": { "External id": 18720, "cbid": 251, "correlation": 18720 } }, { "ph": "f", "id": 18720, "pid": 76337, "tid": -914061504, "ts": 1716454216905514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905515, "dur": 0, "args": { "External id": 18721, "cbid": 251, "correlation": 18721 } }, { "ph": "f", "id": 18721, "pid": 76337, "tid": -914061504, "ts": 1716454216905515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905516, "dur": 1, "args": { "External id": 18722, "cbid": 251, "correlation": 18722 } }, { "ph": "f", "id": 18722, "pid": 76337, "tid": -914061504, "ts": 1716454216905516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905517, "dur": 0, "args": { "External id": 18723, "cbid": 251, "correlation": 18723 } }, { "ph": "f", "id": 18723, "pid": 76337, "tid": -914061504, "ts": 1716454216905517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216905519, "dur": 0, "args": { "External id": 18724, "cbid": 251, "correlation": 18724 } }, { "ph": "f", "id": 18724, "pid": 76337, "tid": -914061504, "ts": 1716454216905519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216913053, "dur": 115, "args": { "External id": 18725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18725, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 18725, "pid": 5, "tid": 7, "ts": 1716454216913053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905521, "dur": 13, "args": { "External id": 18725, "cbid": 211, "correlation": 18725 } }, { "ph": "s", "id": 18725, "pid": 76337, "tid": -914061504, "ts": 1716454216905521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216913170, "dur": 59, "args": { "External id": 18731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18731, "pid": 5, "tid": 7, "ts": 1716454216913170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905557, "dur": 9, "args": { "External id": 18731, "cbid": 211, "correlation": 18731 } }, { "ph": "s", "id": 18731, "pid": 76337, "tid": -914061504, "ts": 1716454216905557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216913231, "dur": 50, "args": { "External id": 18739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18739, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18739, "pid": 5, "tid": 7, "ts": 1716454216913231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905590, "dur": 9, "args": { "External id": 18739, "cbid": 211, "correlation": 18739 } }, { "ph": "s", "id": 18739, "pid": 76337, "tid": -914061504, "ts": 1716454216905590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216913282, "dur": 98, "args": { "External id": 18748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18748, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18748, "pid": 5, "tid": 7, "ts": 1716454216913282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905648, "dur": 13, "args": { "External id": 18748, "cbid": 211, "correlation": 18748 } }, { "ph": "s", "id": 18748, "pid": 76337, "tid": -914061504, "ts": 1716454216905648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216913381, "dur": 94, "args": { "External id": 18768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18768, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18768, "pid": 5, "tid": 7, "ts": 1716454216913381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905729, "dur": 11, "args": { "External id": 18768, "cbid": 211, "correlation": 18768 } }, { "ph": "s", "id": 18768, "pid": 76337, "tid": -914061504, "ts": 1716454216905729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216913476, "dur": 4, "args": { "External id": 18780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18780, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18780, "pid": 5, "tid": 7, "ts": 1716454216913476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905751, "dur": 7, "args": { "External id": 18780, "cbid": 211, "correlation": 18780 } }, { "ph": "s", "id": 18780, "pid": 76337, "tid": -914061504, "ts": 1716454216905751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216913482, "dur": 109, "args": { "External id": 18783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18783, "pid": 5, "tid": 7, "ts": 1716454216913482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905770, "dur": 7, "args": { "External id": 18783, "cbid": 211, "correlation": 18783 } }, { "ph": "s", "id": 18783, "pid": 76337, "tid": -914061504, "ts": 1716454216905770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216913593, "dur": 69, "args": { "External id": 18792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18792, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18792, "pid": 5, "tid": 7, "ts": 1716454216913593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905815, "dur": 11, "args": { "External id": 18792, "cbid": 211, "correlation": 18792 } }, { "ph": "s", "id": 18792, "pid": 76337, "tid": -914061504, "ts": 1716454216905815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216905869, "dur": 0, "args": { "External id": 18802, "cbid": 317, "correlation": 18802 } }, { "ph": "f", "id": 18802, "pid": 76337, "tid": -914061504, "ts": 1716454216905869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216905870, "dur": 0, "args": { "External id": 18803, "cbid": 203, "correlation": 18803 } }, { "ph": "f", "id": 18803, "pid": 76337, "tid": -914061504, "ts": 1716454216905870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216905871, "dur": 0, "args": { "External id": 18804, "cbid": 205, "correlation": 18804 } }, { "ph": "f", "id": 18804, "pid": 76337, "tid": -914061504, "ts": 1716454216905871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216913663, "dur": 76, "args": { "External id": 18808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18808, "pid": 5, "tid": 7, "ts": 1716454216913663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905887, "dur": 12, "args": { "External id": 18808, "cbid": 211, "correlation": 18808 } }, { "ph": "s", "id": 18808, "pid": 76337, "tid": -914061504, "ts": 1716454216905887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216913741, "dur": 24, "args": { "External id": 18810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18810, "pid": 5, "tid": 7, "ts": 1716454216913741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905902, "dur": 6, "args": { "External id": 18810, "cbid": 211, "correlation": 18810 } }, { "ph": "s", "id": 18810, "pid": 76337, "tid": -914061504, "ts": 1716454216905902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216913766, "dur": 4, "args": { "External id": 18812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18812, "pid": 5, "tid": 7, "ts": 1716454216913766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905915, "dur": 6, "args": { "External id": 18812, "cbid": 211, "correlation": 18812 } }, { "ph": "s", "id": 18812, "pid": 76337, "tid": -914061504, "ts": 1716454216905915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216905925, "dur": 0, "args": { "External id": 18813, "cbid": 51, "correlation": 18813 } }, { "ph": "s", "id": 18813, "pid": 76337, "tid": -914061504, "ts": 1716454216905925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216913771, "dur": 1371, "args": { "External id": 18814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18814, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18814, "pid": 5, "tid": 7, "ts": 1716454216913771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905926, "dur": 6, "args": { "External id": 18814, "cbid": 211, "correlation": 18814 } }, { "ph": "s", "id": 18814, "pid": 76337, "tid": -914061504, "ts": 1716454216905926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216915144, "dur": 60, "args": { "External id": 18819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18819, "pid": 5, "tid": 7, "ts": 1716454216915144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216905955, "dur": 9, "args": { "External id": 18819, "cbid": 211, "correlation": 18819 } }, { "ph": "s", "id": 18819, "pid": 76337, "tid": -914061504, "ts": 1716454216905955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216915205, "dur": 4, "args": { "External id": 18827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18827, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18827, "pid": 5, "tid": 7, "ts": 1716454216915205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906009, "dur": 10, "args": { "External id": 18827, "cbid": 211, "correlation": 18827 } }, { "ph": "s", "id": 18827, "pid": 76337, "tid": -914061504, "ts": 1716454216906009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906093, "dur": 2, "args": { "External id": 18843, "cbid": 251, "correlation": 18843 } }, { "ph": "f", "id": 18843, "pid": 76337, "tid": -914061504, "ts": 1716454216906093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906099, "dur": 0, "args": { "External id": 18845, "cbid": 251, "correlation": 18845 } }, { "ph": "f", "id": 18845, "pid": 76337, "tid": -914061504, "ts": 1716454216906099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216915210, "dur": 11, "args": { "External id": 18846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18846, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 18846, "pid": 5, "tid": 7, "ts": 1716454216915210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906101, "dur": 15, "args": { "External id": 18846, "cbid": 211, "correlation": 18846 } }, { "ph": "s", "id": 18846, "pid": 76337, "tid": -914061504, "ts": 1716454216906101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216915222, "dur": 5, "args": { "External id": 18848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18848, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 18848, "pid": 5, "tid": 7, "ts": 1716454216915222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906120, "dur": 7, "args": { "External id": 18848, "cbid": 211, "correlation": 18848 } }, { "ph": "s", "id": 18848, "pid": 76337, "tid": -914061504, "ts": 1716454216906120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216915229, "dur": 54, "args": { "External id": 18858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18858, "pid": 5, "tid": 7, "ts": 1716454216915229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906188, "dur": 12, "args": { "External id": 18858, "cbid": 211, "correlation": 18858 } }, { "ph": "s", "id": 18858, "pid": 76337, "tid": -914061504, "ts": 1716454216906188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216915284, "dur": 52, "args": { "External id": 18878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18878, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18878, "pid": 5, "tid": 7, "ts": 1716454216915284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906254, "dur": 11, "args": { "External id": 18878, "cbid": 211, "correlation": 18878 } }, { "ph": "s", "id": 18878, "pid": 76337, "tid": -914061504, "ts": 1716454216906254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216915337, "dur": 4, "args": { "External id": 18890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18890, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18890, "pid": 5, "tid": 7, "ts": 1716454216915337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906274, "dur": 6, "args": { "External id": 18890, "cbid": 211, "correlation": 18890 } }, { "ph": "s", "id": 18890, "pid": 76337, "tid": -914061504, "ts": 1716454216906274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216915342, "dur": 57, "args": { "External id": 18893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18893, "pid": 5, "tid": 7, "ts": 1716454216915342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906292, "dur": 6, "args": { "External id": 18893, "cbid": 211, "correlation": 18893 } }, { "ph": "s", "id": 18893, "pid": 76337, "tid": -914061504, "ts": 1716454216906292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216915400, "dur": 36, "args": { "External id": 18902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18902, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18902, "pid": 5, "tid": 7, "ts": 1716454216915400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906333, "dur": 10, "args": { "External id": 18902, "cbid": 211, "correlation": 18902 } }, { "ph": "s", "id": 18902, "pid": 76337, "tid": -914061504, "ts": 1716454216906333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216906410, "dur": 0, "args": { "External id": 18912, "cbid": 317, "correlation": 18912 } }, { "ph": "f", "id": 18912, "pid": 76337, "tid": -914061504, "ts": 1716454216906410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216906411, "dur": 0, "args": { "External id": 18913, "cbid": 203, "correlation": 18913 } }, { "ph": "f", "id": 18913, "pid": 76337, "tid": -914061504, "ts": 1716454216906411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216906412, "dur": 0, "args": { "External id": 18914, "cbid": 205, "correlation": 18914 } }, { "ph": "f", "id": 18914, "pid": 76337, "tid": -914061504, "ts": 1716454216906412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216915438, "dur": 40, "args": { "External id": 18918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18918, "pid": 5, "tid": 7, "ts": 1716454216915438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906429, "dur": 13, "args": { "External id": 18918, "cbid": 211, "correlation": 18918 } }, { "ph": "s", "id": 18918, "pid": 76337, "tid": -914061504, "ts": 1716454216906429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216915479, "dur": 14, "args": { "External id": 18920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18920, "pid": 5, "tid": 7, "ts": 1716454216915479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906445, "dur": 6, "args": { "External id": 18920, "cbid": 211, "correlation": 18920 } }, { "ph": "s", "id": 18920, "pid": 76337, "tid": -914061504, "ts": 1716454216906445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216915494, "dur": 3, "args": { "External id": 18922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 18922, "pid": 5, "tid": 7, "ts": 1716454216915494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906455, "dur": 6, "args": { "External id": 18922, "cbid": 211, "correlation": 18922 } }, { "ph": "s", "id": 18922, "pid": 76337, "tid": -914061504, "ts": 1716454216906455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216906464, "dur": 0, "args": { "External id": 18923, "cbid": 51, "correlation": 18923 } }, { "ph": "s", "id": 18923, "pid": 76337, "tid": -914061504, "ts": 1716454216906464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216915499, "dur": 703, "args": { "External id": 18924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18924, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18924, "pid": 5, "tid": 7, "ts": 1716454216915499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906465, "dur": 6, "args": { "External id": 18924, "cbid": 211, "correlation": 18924 } }, { "ph": "s", "id": 18924, "pid": 76337, "tid": -914061504, "ts": 1716454216906465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216916204, "dur": 59, "args": { "External id": 18929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18929, "pid": 5, "tid": 7, "ts": 1716454216916204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906494, "dur": 9, "args": { "External id": 18929, "cbid": 211, "correlation": 18929 } }, { "ph": "s", "id": 18929, "pid": 76337, "tid": -914061504, "ts": 1716454216906494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216906553, "dur": 0, "args": { "External id": 18939, "cbid": 317, "correlation": 18939 } }, { "ph": "f", "id": 18939, "pid": 76337, "tid": -914061504, "ts": 1716454216906553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216906554, "dur": 0, "args": { "External id": 18940, "cbid": 203, "correlation": 18940 } }, { "ph": "f", "id": 18940, "pid": 76337, "tid": -914061504, "ts": 1716454216906554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216906555, "dur": 0, "args": { "External id": 18941, "cbid": 205, "correlation": 18941 } }, { "ph": "f", "id": 18941, "pid": 76337, "tid": -914061504, "ts": 1716454216906555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216916264, "dur": 75, "args": { "External id": 18945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18945, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18945, "pid": 5, "tid": 7, "ts": 1716454216916264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906566, "dur": 13, "args": { "External id": 18945, "cbid": 211, "correlation": 18945 } }, { "ph": "s", "id": 18945, "pid": 76337, "tid": -914061504, "ts": 1716454216906566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454216916341, "dur": 209, "args": { "External id": 18947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18947, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 18947, "pid": 5, "tid": 7, "ts": 1716454216916341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906588, "dur": 7, "args": { "External id": 18947, "cbid": 211, "correlation": 18947 } }, { "ph": "s", "id": 18947, "pid": 76337, "tid": -914061504, "ts": 1716454216906588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216916552, "dur": 38, "args": { "External id": 18949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18949, "pid": 5, "tid": 7, "ts": 1716454216916552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906600, "dur": 6, "args": { "External id": 18949, "cbid": 211, "correlation": 18949 } }, { "ph": "s", "id": 18949, "pid": 76337, "tid": -914061504, "ts": 1716454216906600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216916591, "dur": 60, "args": { "External id": 18955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18955, "pid": 5, "tid": 7, "ts": 1716454216916591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906628, "dur": 9, "args": { "External id": 18955, "cbid": 211, "correlation": 18955 } }, { "ph": "s", "id": 18955, "pid": 76337, "tid": -914061504, "ts": 1716454216906628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216916652, "dur": 50, "args": { "External id": 18963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18963, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18963, "pid": 5, "tid": 7, "ts": 1716454216916652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906656, "dur": 8, "args": { "External id": 18963, "cbid": 211, "correlation": 18963 } }, { "ph": "s", "id": 18963, "pid": 76337, "tid": -914061504, "ts": 1716454216906656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216916703, "dur": 35, "args": { "External id": 18971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18971, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 18971, "pid": 5, "tid": 7, "ts": 1716454216916703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906686, "dur": 8, "args": { "External id": 18971, "cbid": 211, "correlation": 18971 } }, { "ph": "s", "id": 18971, "pid": 76337, "tid": -914061504, "ts": 1716454216906686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216916739, "dur": 52, "args": { "External id": 18991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 18991, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 18991, "pid": 5, "tid": 7, "ts": 1716454216916739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906769, "dur": 14, "args": { "External id": 18991, "cbid": 211, "correlation": 18991 } }, { "ph": "s", "id": 18991, "pid": 76337, "tid": -914061504, "ts": 1716454216906769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216916793, "dur": 5, "args": { "External id": 19003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19003, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19003, "pid": 5, "tid": 7, "ts": 1716454216916793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906793, "dur": 6, "args": { "External id": 19003, "cbid": 211, "correlation": 19003 } }, { "ph": "s", "id": 19003, "pid": 76337, "tid": -914061504, "ts": 1716454216906793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216916799, "dur": 57, "args": { "External id": 19006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19006, "pid": 5, "tid": 7, "ts": 1716454216916799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906810, "dur": 7, "args": { "External id": 19006, "cbid": 211, "correlation": 19006 } }, { "ph": "s", "id": 19006, "pid": 76337, "tid": -914061504, "ts": 1716454216906810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216906882, "dur": 0, "args": { "External id": 19017, "cbid": 317, "correlation": 19017 } }, { "ph": "f", "id": 19017, "pid": 76337, "tid": -914061504, "ts": 1716454216906882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216906882, "dur": 0, "args": { "External id": 19018, "cbid": 203, "correlation": 19018 } }, { "ph": "f", "id": 19018, "pid": 76337, "tid": -914061504, "ts": 1716454216906882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216906883, "dur": 0, "args": { "External id": 19019, "cbid": 205, "correlation": 19019 } }, { "ph": "f", "id": 19019, "pid": 76337, "tid": -914061504, "ts": 1716454216906883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906906, "dur": 1, "args": { "External id": 19023, "cbid": 251, "correlation": 19023 } }, { "ph": "f", "id": 19023, "pid": 76337, "tid": -914061504, "ts": 1716454216906906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906908, "dur": 0, "args": { "External id": 19024, "cbid": 251, "correlation": 19024 } }, { "ph": "f", "id": 19024, "pid": 76337, "tid": -914061504, "ts": 1716454216906908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906909, "dur": 0, "args": { "External id": 19025, "cbid": 251, "correlation": 19025 } }, { "ph": "f", "id": 19025, "pid": 76337, "tid": -914061504, "ts": 1716454216906909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906910, "dur": 0, "args": { "External id": 19026, "cbid": 251, "correlation": 19026 } }, { "ph": "f", "id": 19026, "pid": 76337, "tid": -914061504, "ts": 1716454216906910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906910, "dur": 0, "args": { "External id": 19027, "cbid": 251, "correlation": 19027 } }, { "ph": "f", "id": 19027, "pid": 76337, "tid": -914061504, "ts": 1716454216906910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906911, "dur": 0, "args": { "External id": 19028, "cbid": 251, "correlation": 19028 } }, { "ph": "f", "id": 19028, "pid": 76337, "tid": -914061504, "ts": 1716454216906911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906912, "dur": 0, "args": { "External id": 19029, "cbid": 251, "correlation": 19029 } }, { "ph": "f", "id": 19029, "pid": 76337, "tid": -914061504, "ts": 1716454216906912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906913, "dur": 0, "args": { "External id": 19030, "cbid": 251, "correlation": 19030 } }, { "ph": "f", "id": 19030, "pid": 76337, "tid": -914061504, "ts": 1716454216906913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216906914, "dur": 0, "args": { "External id": 19031, "cbid": 251, "correlation": 19031 } }, { "ph": "f", "id": 19031, "pid": 76337, "tid": -914061504, "ts": 1716454216906914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216916858, "dur": 114, "args": { "External id": 19032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19032, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19032, "pid": 5, "tid": 7, "ts": 1716454216916858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906917, "dur": 14, "args": { "External id": 19032, "cbid": 211, "correlation": 19032 } }, { "ph": "s", "id": 19032, "pid": 76337, "tid": -914061504, "ts": 1716454216906917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216916973, "dur": 60, "args": { "External id": 19038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19038, "pid": 5, "tid": 7, "ts": 1716454216916973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216906954, "dur": 9, "args": { "External id": 19038, "cbid": 211, "correlation": 19038 } }, { "ph": "s", "id": 19038, "pid": 76337, "tid": -914061504, "ts": 1716454216906954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216917034, "dur": 150, "args": { "External id": 19049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19049, "pid": 5, "tid": 7, "ts": 1716454216917034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907063, "dur": 14, "args": { "External id": 19049, "cbid": 211, "correlation": 19049 } }, { "ph": "s", "id": 19049, "pid": 76337, "tid": -914061504, "ts": 1716454216907063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216917186, "dur": 95, "args": { "External id": 19071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19071, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19071, "pid": 5, "tid": 7, "ts": 1716454216917186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907109, "dur": 10, "args": { "External id": 19071, "cbid": 211, "correlation": 19071 } }, { "ph": "s", "id": 19071, "pid": 76337, "tid": -914061504, "ts": 1716454216907109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907212, "dur": 1, "args": { "External id": 19082, "cbid": 251, "correlation": 19082 } }, { "ph": "f", "id": 19082, "pid": 76337, "tid": -914061504, "ts": 1716454216907212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216917282, "dur": 111, "args": { "External id": 19083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19083, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19083, "pid": 5, "tid": 7, "ts": 1716454216917282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907217, "dur": 14, "args": { "External id": 19083, "cbid": 211, "correlation": 19083 } }, { "ph": "s", "id": 19083, "pid": 76337, "tid": -914061504, "ts": 1716454216907217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907286, "dur": 1, "args": { "External id": 19094, "cbid": 251, "correlation": 19094 } }, { "ph": "f", "id": 19094, "pid": 76337, "tid": -914061504, "ts": 1716454216907286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216917395, "dur": 105, "args": { "External id": 19095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19095, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19095, "pid": 5, "tid": 7, "ts": 1716454216917395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907291, "dur": 11, "args": { "External id": 19095, "cbid": 211, "correlation": 19095 } }, { "ph": "s", "id": 19095, "pid": 76337, "tid": -914061504, "ts": 1716454216907291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907353, "dur": 1, "args": { "External id": 19106, "cbid": 251, "correlation": 19106 } }, { "ph": "f", "id": 19106, "pid": 76337, "tid": -914061504, "ts": 1716454216907353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216917501, "dur": 104, "args": { "External id": 19107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19107, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19107, "pid": 5, "tid": 7, "ts": 1716454216917501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907357, "dur": 11, "args": { "External id": 19107, "cbid": 211, "correlation": 19107 } }, { "ph": "s", "id": 19107, "pid": 76337, "tid": -914061504, "ts": 1716454216907357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216917606, "dur": 4781, "args": { "External id": 19128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19128, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19128, "pid": 5, "tid": 7, "ts": 1716454216917606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907438, "dur": 13, "args": { "External id": 19128, "cbid": 211, "correlation": 19128 } }, { "ph": "s", "id": 19128, "pid": 76337, "tid": -914061504, "ts": 1716454216907438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907551, "dur": 1, "args": { "External id": 19146, "cbid": 251, "correlation": 19146 } }, { "ph": "f", "id": 19146, "pid": 76337, "tid": -914061504, "ts": 1716454216907551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216922388, "dur": 107, "args": { "External id": 19148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19148, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19148, "pid": 5, "tid": 7, "ts": 1716454216922388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907556, "dur": 14, "args": { "External id": 19148, "cbid": 211, "correlation": 19148 } }, { "ph": "s", "id": 19148, "pid": 76337, "tid": -914061504, "ts": 1716454216907556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216922497, "dur": 34, "args": { "External id": 19156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19156, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19156, "pid": 5, "tid": 7, "ts": 1716454216922497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907629, "dur": 12, "args": { "External id": 19156, "cbid": 211, "correlation": 19156 } }, { "ph": "s", "id": 19156, "pid": 76337, "tid": -914061504, "ts": 1716454216907629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216922532, "dur": 159, "args": { "External id": 19164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19164, "pid": 5, "tid": 7, "ts": 1716454216922532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907672, "dur": 9, "args": { "External id": 19164, "cbid": 211, "correlation": 19164 } }, { "ph": "s", "id": 19164, "pid": 76337, "tid": -914061504, "ts": 1716454216907672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216922693, "dur": 93, "args": { "External id": 19186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19186, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19186, "pid": 5, "tid": 7, "ts": 1716454216922693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907725, "dur": 10, "args": { "External id": 19186, "cbid": 211, "correlation": 19186 } }, { "ph": "s", "id": 19186, "pid": 76337, "tid": -914061504, "ts": 1716454216907725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907808, "dur": 1, "args": { "External id": 19197, "cbid": 251, "correlation": 19197 } }, { "ph": "f", "id": 19197, "pid": 76337, "tid": -914061504, "ts": 1716454216907808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216922787, "dur": 103, "args": { "External id": 19198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19198, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19198, "pid": 5, "tid": 7, "ts": 1716454216922787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907813, "dur": 14, "args": { "External id": 19198, "cbid": 211, "correlation": 19198 } }, { "ph": "s", "id": 19198, "pid": 76337, "tid": -914061504, "ts": 1716454216907813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907900, "dur": 1, "args": { "External id": 19209, "cbid": 251, "correlation": 19209 } }, { "ph": "f", "id": 19209, "pid": 76337, "tid": -914061504, "ts": 1716454216907900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907904, "dur": 0, "args": { "External id": 19210, "cbid": 251, "correlation": 19210 } }, { "ph": "f", "id": 19210, "pid": 76337, "tid": -914061504, "ts": 1716454216907904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216922891, "dur": 10, "args": { "External id": 19211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19211, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 19211, "pid": 5, "tid": 7, "ts": 1716454216922891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907906, "dur": 13, "args": { "External id": 19211, "cbid": 211, "correlation": 19211 } }, { "ph": "s", "id": 19211, "pid": 76337, "tid": -914061504, "ts": 1716454216907906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216922903, "dur": 5, "args": { "External id": 19213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19213, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 19213, "pid": 5, "tid": 7, "ts": 1716454216922903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907921, "dur": 6, "args": { "External id": 19213, "cbid": 211, "correlation": 19213 } }, { "ph": "s", "id": 19213, "pid": 76337, "tid": -914061504, "ts": 1716454216907921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907991, "dur": 1, "args": { "External id": 19224, "cbid": 251, "correlation": 19224 } }, { "ph": "f", "id": 19224, "pid": 76337, "tid": -914061504, "ts": 1716454216907991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216907994, "dur": 0, "args": { "External id": 19225, "cbid": 251, "correlation": 19225 } }, { "ph": "f", "id": 19225, "pid": 76337, "tid": -914061504, "ts": 1716454216907994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216922909, "dur": 6, "args": { "External id": 19226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19226, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 19226, "pid": 5, "tid": 7, "ts": 1716454216922909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216907996, "dur": 12, "args": { "External id": 19226, "cbid": 211, "correlation": 19226 } }, { "ph": "s", "id": 19226, "pid": 76337, "tid": -914061504, "ts": 1716454216907996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216922917, "dur": 4, "args": { "External id": 19228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19228, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 19228, "pid": 5, "tid": 7, "ts": 1716454216922917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908010, "dur": 5, "args": { "External id": 19228, "cbid": 211, "correlation": 19228 } }, { "ph": "s", "id": 19228, "pid": 76337, "tid": -914061504, "ts": 1716454216908010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454216922922, "dur": 156, "args": { "External id": 19249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19249, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19249, "pid": 5, "tid": 7, "ts": 1716454216922922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908085, "dur": 12, "args": { "External id": 19249, "cbid": 211, "correlation": 19249 } }, { "ph": "s", "id": 19249, "pid": 76337, "tid": -914061504, "ts": 1716454216908085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908181, "dur": 1, "args": { "External id": 19267, "cbid": 251, "correlation": 19267 } }, { "ph": "f", "id": 19267, "pid": 76337, "tid": -914061504, "ts": 1716454216908181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216923079, "dur": 106, "args": { "External id": 19269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19269, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19269, "pid": 5, "tid": 7, "ts": 1716454216923079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908188, "dur": 13, "args": { "External id": 19269, "cbid": 211, "correlation": 19269 } }, { "ph": "s", "id": 19269, "pid": 76337, "tid": -914061504, "ts": 1716454216908188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216923187, "dur": 34, "args": { "External id": 19277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19277, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19277, "pid": 5, "tid": 7, "ts": 1716454216923187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908258, "dur": 12, "args": { "External id": 19277, "cbid": 211, "correlation": 19277 } }, { "ph": "s", "id": 19277, "pid": 76337, "tid": -914061504, "ts": 1716454216908258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216923222, "dur": 50, "args": { "External id": 19285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19285, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19285, "pid": 5, "tid": 7, "ts": 1716454216923222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908298, "dur": 9, "args": { "External id": 19285, "cbid": 211, "correlation": 19285 } }, { "ph": "s", "id": 19285, "pid": 76337, "tid": -914061504, "ts": 1716454216908298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216923274, "dur": 93, "args": { "External id": 19307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19307, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19307, "pid": 5, "tid": 7, "ts": 1716454216923274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908351, "dur": 10, "args": { "External id": 19307, "cbid": 211, "correlation": 19307 } }, { "ph": "s", "id": 19307, "pid": 76337, "tid": -914061504, "ts": 1716454216908351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908448, "dur": 2, "args": { "External id": 19323, "cbid": 251, "correlation": 19323 } }, { "ph": "f", "id": 19323, "pid": 76337, "tid": -914061504, "ts": 1716454216908448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454216923368, "dur": 580, "args": { "External id": 19325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19325, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19325, "pid": 5, "tid": 7, "ts": 1716454216923368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908454, "dur": 14, "args": { "External id": 19325, "cbid": 211, "correlation": 19325 } }, { "ph": "s", "id": 19325, "pid": 76337, "tid": -914061504, "ts": 1716454216908454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216923950, "dur": 245, "args": { "External id": 19333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19333, "pid": 5, "tid": 7, "ts": 1716454216923950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908523, "dur": 13, "args": { "External id": 19333, "cbid": 211, "correlation": 19333 } }, { "ph": "s", "id": 19333, "pid": 76337, "tid": -914061504, "ts": 1716454216908523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216924196, "dur": 254, "args": { "External id": 19341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19341, "pid": 5, "tid": 7, "ts": 1716454216924196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908555, "dur": 9, "args": { "External id": 19341, "cbid": 211, "correlation": 19341 } }, { "ph": "s", "id": 19341, "pid": 76337, "tid": -914061504, "ts": 1716454216908555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908648, "dur": 1, "args": { "External id": 19357, "cbid": 251, "correlation": 19357 } }, { "ph": "f", "id": 19357, "pid": 76337, "tid": -914061504, "ts": 1716454216908648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908654, "dur": 0, "args": { "External id": 19359, "cbid": 251, "correlation": 19359 } }, { "ph": "f", "id": 19359, "pid": 76337, "tid": -914061504, "ts": 1716454216908654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454216924451, "dur": 361, "args": { "External id": 19360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19360, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19360, "pid": 5, "tid": 7, "ts": 1716454216924451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908656, "dur": 15, "args": { "External id": 19360, "cbid": 211, "correlation": 19360 } }, { "ph": "s", "id": 19360, "pid": 76337, "tid": -914061504, "ts": 1716454216908656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216924813, "dur": 49, "args": { "External id": 19368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19368, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19368, "pid": 5, "tid": 7, "ts": 1716454216924813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908702, "dur": 11, "args": { "External id": 19368, "cbid": 211, "correlation": 19368 } }, { "ph": "s", "id": 19368, "pid": 76337, "tid": -914061504, "ts": 1716454216908702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216924864, "dur": 159, "args": { "External id": 19379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19379, "pid": 5, "tid": 7, "ts": 1716454216924864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908769, "dur": 12, "args": { "External id": 19379, "cbid": 211, "correlation": 19379 } }, { "ph": "s", "id": 19379, "pid": 76337, "tid": -914061504, "ts": 1716454216908769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216908834, "dur": 0, "args": { "External id": 19391, "cbid": 317, "correlation": 19391 } }, { "ph": "f", "id": 19391, "pid": 76337, "tid": -914061504, "ts": 1716454216908834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216908835, "dur": 0, "args": { "External id": 19392, "cbid": 203, "correlation": 19392 } }, { "ph": "f", "id": 19392, "pid": 76337, "tid": -914061504, "ts": 1716454216908835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216908836, "dur": 0, "args": { "External id": 19393, "cbid": 205, "correlation": 19393 } }, { "ph": "f", "id": 19393, "pid": 76337, "tid": -914061504, "ts": 1716454216908836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908859, "dur": 1, "args": { "External id": 19397, "cbid": 251, "correlation": 19397 } }, { "ph": "f", "id": 19397, "pid": 76337, "tid": -914061504, "ts": 1716454216908859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908862, "dur": 0, "args": { "External id": 19398, "cbid": 251, "correlation": 19398 } }, { "ph": "f", "id": 19398, "pid": 76337, "tid": -914061504, "ts": 1716454216908862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908862, "dur": 0, "args": { "External id": 19399, "cbid": 251, "correlation": 19399 } }, { "ph": "f", "id": 19399, "pid": 76337, "tid": -914061504, "ts": 1716454216908862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908863, "dur": 0, "args": { "External id": 19400, "cbid": 251, "correlation": 19400 } }, { "ph": "f", "id": 19400, "pid": 76337, "tid": -914061504, "ts": 1716454216908863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908864, "dur": 0, "args": { "External id": 19401, "cbid": 251, "correlation": 19401 } }, { "ph": "f", "id": 19401, "pid": 76337, "tid": -914061504, "ts": 1716454216908864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908865, "dur": 0, "args": { "External id": 19402, "cbid": 251, "correlation": 19402 } }, { "ph": "f", "id": 19402, "pid": 76337, "tid": -914061504, "ts": 1716454216908865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908866, "dur": 0, "args": { "External id": 19403, "cbid": 251, "correlation": 19403 } }, { "ph": "f", "id": 19403, "pid": 76337, "tid": -914061504, "ts": 1716454216908866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908866, "dur": 0, "args": { "External id": 19404, "cbid": 251, "correlation": 19404 } }, { "ph": "f", "id": 19404, "pid": 76337, "tid": -914061504, "ts": 1716454216908866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216908868, "dur": 0, "args": { "External id": 19405, "cbid": 251, "correlation": 19405 } }, { "ph": "f", "id": 19405, "pid": 76337, "tid": -914061504, "ts": 1716454216908868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216925024, "dur": 114, "args": { "External id": 19406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19406, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19406, "pid": 5, "tid": 7, "ts": 1716454216925024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908870, "dur": 12, "args": { "External id": 19406, "cbid": 211, "correlation": 19406 } }, { "ph": "s", "id": 19406, "pid": 76337, "tid": -914061504, "ts": 1716454216908870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216925140, "dur": 60, "args": { "External id": 19412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19412, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19412, "pid": 5, "tid": 7, "ts": 1716454216925140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908904, "dur": 9, "args": { "External id": 19412, "cbid": 211, "correlation": 19412 } }, { "ph": "s", "id": 19412, "pid": 76337, "tid": -914061504, "ts": 1716454216908904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925201, "dur": 50, "args": { "External id": 19420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19420, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19420, "pid": 5, "tid": 7, "ts": 1716454216925201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216908937, "dur": 8, "args": { "External id": 19420, "cbid": 211, "correlation": 19420 } }, { "ph": "s", "id": 19420, "pid": 76337, "tid": -914061504, "ts": 1716454216908937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216925253, "dur": 52, "args": { "External id": 19440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19440, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 19440, "pid": 5, "tid": 7, "ts": 1716454216925253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216909048, "dur": 14, "args": { "External id": 19440, "cbid": 211, "correlation": 19440 } }, { "ph": "s", "id": 19440, "pid": 76337, "tid": -914061504, "ts": 1716454216909048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216925306, "dur": 5, "args": { "External id": 19452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19452, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19452, "pid": 5, "tid": 7, "ts": 1716454216925306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216909079, "dur": 8, "args": { "External id": 19452, "cbid": 211, "correlation": 19452 } }, { "ph": "s", "id": 19452, "pid": 76337, "tid": -914061504, "ts": 1716454216909079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216925312, "dur": 59, "args": { "External id": 19455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19455, "pid": 5, "tid": 7, "ts": 1716454216925312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216909100, "dur": 7, "args": { "External id": 19455, "cbid": 211, "correlation": 19455 } }, { "ph": "s", "id": 19455, "pid": 76337, "tid": -914061504, "ts": 1716454216909100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925372, "dur": 37, "args": { "External id": 19464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19464, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19464, "pid": 5, "tid": 7, "ts": 1716454216925372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216909147, "dur": 10, "args": { "External id": 19464, "cbid": 211, "correlation": 19464 } }, { "ph": "s", "id": 19464, "pid": 76337, "tid": -914061504, "ts": 1716454216909147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216909215, "dur": 0, "args": { "External id": 19474, "cbid": 317, "correlation": 19474 } }, { "ph": "f", "id": 19474, "pid": 76337, "tid": -914061504, "ts": 1716454216909215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216909216, "dur": 0, "args": { "External id": 19475, "cbid": 203, "correlation": 19475 } }, { "ph": "f", "id": 19475, "pid": 76337, "tid": -914061504, "ts": 1716454216909216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216909216, "dur": 0, "args": { "External id": 19476, "cbid": 205, "correlation": 19476 } }, { "ph": "f", "id": 19476, "pid": 76337, "tid": -914061504, "ts": 1716454216909216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216925411, "dur": 42, "args": { "External id": 19480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19480, "pid": 5, "tid": 7, "ts": 1716454216925411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216912093, "dur": 31, "args": { "External id": 19480, "cbid": 211, "correlation": 19480 } }, { "ph": "s", "id": 19480, "pid": 76337, "tid": -914061504, "ts": 1716454216912093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454216925454, "dur": 3, "args": { "External id": 19482, "device": 5, "context": 1, "stream": 7, "correlation": 19482, "bytes": 46080, "memory bandwidth (GB/s)": 12.310980496927598 } }, { "ph": "f", "id": 19482, "pid": 5, "tid": 7, "ts": 1716454216925454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216912128, "dur": 25, "args": { "External id": 19482, "cbid": 51, "correlation": 19482 } }, { "ph": "s", "id": 19482, "pid": 76337, "tid": -914061504, "ts": 1716454216912128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216912171, "dur": 1, "args": { "External id": 19484, "cbid": 200, "correlation": 19484 } }, { "ph": "f", "id": 19484, "pid": 76337, "tid": -914061504, "ts": 1716454216912171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216912173, "dur": 0, "args": { "External id": 19485, "cbid": 200, "correlation": 19485 } }, { "ph": "f", "id": 19485, "pid": 76337, "tid": -914061504, "ts": 1716454216912173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216912174, "dur": 0, "args": { "External id": 19486, "cbid": 200, "correlation": 19486 } }, { "ph": "f", "id": 19486, "pid": 76337, "tid": -914061504, "ts": 1716454216912174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454216912174, "dur": 0, "args": { "External id": 19487, "cbid": 200, "correlation": 19487 } }, { "ph": "f", "id": 19487, "pid": 76337, "tid": -914061504, "ts": 1716454216912174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454216912175, "dur": 223, "args": { "External id": 19488, "cbid": 15, "correlation": 19488 } }, { "ph": "f", "id": 19488, "pid": 76337, "tid": -914061504, "ts": 1716454216912175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216912400, "dur": 1, "args": { "External id": 19489, "cbid": 251, "correlation": 19489 } }, { "ph": "f", "id": 19489, "pid": 76337, "tid": -914061504, "ts": 1716454216912400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454216925473, "dur": 25, "args": { "External id": 19490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19490, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19490, "pid": 5, "tid": 7, "ts": 1716454216925473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216912413, "dur": 109, "args": { "External id": 19490, "cbid": 211, "correlation": 19490 } }, { "ph": "s", "id": 19490, "pid": 76337, "tid": -914061504, "ts": 1716454216912413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216925500, "dur": 4, "args": { "External id": 19492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19492, "pid": 5, "tid": 7, "ts": 1716454216925500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216912530, "dur": 9, "args": { "External id": 19492, "cbid": 211, "correlation": 19492 } }, { "ph": "s", "id": 19492, "pid": 76337, "tid": -914061504, "ts": 1716454216912530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216912542, "dur": 0, "args": { "External id": 19493, "cbid": 51, "correlation": 19493 } }, { "ph": "s", "id": 19493, "pid": 76337, "tid": -914061504, "ts": 1716454216912542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216925505, "dur": 190, "args": { "External id": 19494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19494, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 19494, "pid": 5, "tid": 7, "ts": 1716454216925505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216912544, "dur": 10, "args": { "External id": 19494, "cbid": 211, "correlation": 19494 } }, { "ph": "s", "id": 19494, "pid": 76337, "tid": -914061504, "ts": 1716454216912544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216925696, "dur": 6, "args": { "External id": 19495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19495, "pid": 5, "tid": 7, "ts": 1716454216925696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216912559, "dur": 6, "args": { "External id": 19495, "cbid": 211, "correlation": 19495 } }, { "ph": "s", "id": 19495, "pid": 76337, "tid": -914061504, "ts": 1716454216912559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216925704, "dur": 6, "args": { "External id": 19501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 19501, "pid": 5, "tid": 7, "ts": 1716454216925704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216913302, "dur": 13, "args": { "External id": 19501, "cbid": 211, "correlation": 19501 } }, { "ph": "s", "id": 19501, "pid": 76337, "tid": -914061504, "ts": 1716454216913302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216925712, "dur": 5, "args": { "External id": 19510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19510, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19510, "pid": 5, "tid": 7, "ts": 1716454216925712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216915904, "dur": 23, "args": { "External id": 19510, "cbid": 211, "correlation": 19510 } }, { "ph": "s", "id": 19510, "pid": 76337, "tid": -914061504, "ts": 1716454216915904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216925719, "dur": 5, "args": { "External id": 19519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19519, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19519, "pid": 5, "tid": 7, "ts": 1716454216925719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216915953, "dur": 9, "args": { "External id": 19519, "cbid": 211, "correlation": 19519 } }, { "ph": "s", "id": 19519, "pid": 76337, "tid": -914061504, "ts": 1716454216915953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454216925725, "dur": 3, "args": { "External id": 19535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19535, "pid": 5, "tid": 7, "ts": 1716454216925725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916201, "dur": 18, "args": { "External id": 19535, "cbid": 211, "correlation": 19535 } }, { "ph": "s", "id": 19535, "pid": 76337, "tid": -914061504, "ts": 1716454216916201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925729, "dur": 3, "args": { "External id": 19543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19543, "pid": 5, "tid": 7, "ts": 1716454216925729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916263, "dur": 12, "args": { "External id": 19543, "cbid": 211, "correlation": 19543 } }, { "ph": "s", "id": 19543, "pid": 76337, "tid": -914061504, "ts": 1716454216916263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925733, "dur": 3, "args": { "External id": 19551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19551, "pid": 5, "tid": 7, "ts": 1716454216925733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916305, "dur": 11, "args": { "External id": 19551, "cbid": 211, "correlation": 19551 } }, { "ph": "s", "id": 19551, "pid": 76337, "tid": -914061504, "ts": 1716454216916305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925737, "dur": 4, "args": { "External id": 19559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19559, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19559, "pid": 5, "tid": 7, "ts": 1716454216925737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916341, "dur": 11, "args": { "External id": 19559, "cbid": 211, "correlation": 19559 } }, { "ph": "s", "id": 19559, "pid": 76337, "tid": -914061504, "ts": 1716454216916341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454216925743, "dur": 4, "args": { "External id": 19571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19571, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19571, "pid": 5, "tid": 7, "ts": 1716454216925743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916425, "dur": 15, "args": { "External id": 19571, "cbid": 211, "correlation": 19571 } }, { "ph": "s", "id": 19571, "pid": 76337, "tid": -914061504, "ts": 1716454216916425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216925748, "dur": 4, "args": { "External id": 19582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19582, "pid": 5, "tid": 7, "ts": 1716454216925748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916475, "dur": 12, "args": { "External id": 19582, "cbid": 211, "correlation": 19582 } }, { "ph": "s", "id": 19582, "pid": 76337, "tid": -914061504, "ts": 1716454216916475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925753, "dur": 3, "args": { "External id": 19590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19590, "pid": 5, "tid": 7, "ts": 1716454216925753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916511, "dur": 8, "args": { "External id": 19590, "cbid": 211, "correlation": 19590 } }, { "ph": "s", "id": 19590, "pid": 76337, "tid": -914061504, "ts": 1716454216916511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925757, "dur": 5, "args": { "External id": 19598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19598, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19598, "pid": 5, "tid": 7, "ts": 1716454216925757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916542, "dur": 10, "args": { "External id": 19598, "cbid": 211, "correlation": 19598 } }, { "ph": "s", "id": 19598, "pid": 76337, "tid": -914061504, "ts": 1716454216916542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925764, "dur": 5, "args": { "External id": 19606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19606, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19606, "pid": 5, "tid": 7, "ts": 1716454216925764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916573, "dur": 12, "args": { "External id": 19606, "cbid": 211, "correlation": 19606 } }, { "ph": "s", "id": 19606, "pid": 76337, "tid": -914061504, "ts": 1716454216916573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216925770, "dur": 4, "args": { "External id": 19615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19615, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19615, "pid": 5, "tid": 7, "ts": 1716454216925770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916608, "dur": 10, "args": { "External id": 19615, "cbid": 211, "correlation": 19615 } }, { "ph": "s", "id": 19615, "pid": 76337, "tid": -914061504, "ts": 1716454216916608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454216925775, "dur": 4, "args": { "External id": 19628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19628, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19628, "pid": 5, "tid": 7, "ts": 1716454216925775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916667, "dur": 14, "args": { "External id": 19628, "cbid": 211, "correlation": 19628 } }, { "ph": "s", "id": 19628, "pid": 76337, "tid": -914061504, "ts": 1716454216916667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454216925781, "dur": 8, "args": { "External id": 19638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19638, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 19638, "pid": 5, "tid": 7, "ts": 1716454216925781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916724, "dur": 13, "args": { "External id": 19638, "cbid": 211, "correlation": 19638 } }, { "ph": "s", "id": 19638, "pid": 76337, "tid": -914061504, "ts": 1716454216916724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216916903, "dur": 3, "args": { "External id": 19655, "cbid": 251, "correlation": 19655 } }, { "ph": "f", "id": 19655, "pid": 76337, "tid": -914061504, "ts": 1716454216916903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454216925790, "dur": 12, "args": { "External id": 19657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19657, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19657, "pid": 5, "tid": 7, "ts": 1716454216925790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916916, "dur": 20, "args": { "External id": 19657, "cbid": 211, "correlation": 19657 } }, { "ph": "s", "id": 19657, "pid": 76337, "tid": -914061504, "ts": 1716454216916916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216925803, "dur": 3, "args": { "External id": 19665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19665, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19665, "pid": 5, "tid": 7, "ts": 1716454216925803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216916994, "dur": 13, "args": { "External id": 19665, "cbid": 211, "correlation": 19665 } }, { "ph": "s", "id": 19665, "pid": 76337, "tid": -914061504, "ts": 1716454216916994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216917087, "dur": 2, "args": { "External id": 19681, "cbid": 251, "correlation": 19681 } }, { "ph": "f", "id": 19681, "pid": 76337, "tid": -914061504, "ts": 1716454216917087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216917095, "dur": 0, "args": { "External id": 19683, "cbid": 251, "correlation": 19683 } }, { "ph": "f", "id": 19683, "pid": 76337, "tid": -914061504, "ts": 1716454216917095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216925808, "dur": 14, "args": { "External id": 19684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19684, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19684, "pid": 5, "tid": 7, "ts": 1716454216925808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216917098, "dur": 15, "args": { "External id": 19684, "cbid": 211, "correlation": 19684 } }, { "ph": "s", "id": 19684, "pid": 76337, "tid": -914061504, "ts": 1716454216917098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216925823, "dur": 5, "args": { "External id": 19686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19686, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19686, "pid": 5, "tid": 7, "ts": 1716454216925823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216917118, "dur": 8, "args": { "External id": 19686, "cbid": 211, "correlation": 19686 } }, { "ph": "s", "id": 19686, "pid": 76337, "tid": -914061504, "ts": 1716454216917118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216917229, "dur": 1, "args": { "External id": 19696, "cbid": 317, "correlation": 19696 } }, { "ph": "f", "id": 19696, "pid": 76337, "tid": -914061504, "ts": 1716454216917229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216917231, "dur": 1, "args": { "External id": 19697, "cbid": 203, "correlation": 19697 } }, { "ph": "f", "id": 19697, "pid": 76337, "tid": -914061504, "ts": 1716454216917231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216917233, "dur": 1, "args": { "External id": 19698, "cbid": 205, "correlation": 19698 } }, { "ph": "f", "id": 19698, "pid": 76337, "tid": -914061504, "ts": 1716454216917233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216925830, "dur": 6, "args": { "External id": 19702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19702, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19702, "pid": 5, "tid": 7, "ts": 1716454216925830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216920791, "dur": 31, "args": { "External id": 19702, "cbid": 211, "correlation": 19702 } }, { "ph": "s", "id": 19702, "pid": 76337, "tid": -914061504, "ts": 1716454216920791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216925837, "dur": 4, "args": { "External id": 19704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 19704, "pid": 5, "tid": 7, "ts": 1716454216925837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216920825, "dur": 6, "args": { "External id": 19704, "cbid": 211, "correlation": 19704 } }, { "ph": "s", "id": 19704, "pid": 76337, "tid": -914061504, "ts": 1716454216920825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216925842, "dur": 3, "args": { "External id": 19706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19706, "pid": 5, "tid": 7, "ts": 1716454216925842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216920839, "dur": 8, "args": { "External id": 19706, "cbid": 211, "correlation": 19706 } }, { "ph": "s", "id": 19706, "pid": 76337, "tid": -914061504, "ts": 1716454216920839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216920851, "dur": 0, "args": { "External id": 19707, "cbid": 51, "correlation": 19707 } }, { "ph": "s", "id": 19707, "pid": 76337, "tid": -914061504, "ts": 1716454216920851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216925847, "dur": 83, "args": { "External id": 19708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19708, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19708, "pid": 5, "tid": 7, "ts": 1716454216925847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216920852, "dur": 8, "args": { "External id": 19708, "cbid": 211, "correlation": 19708 } }, { "ph": "s", "id": 19708, "pid": 76337, "tid": -914061504, "ts": 1716454216920852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216925932, "dur": 59, "args": { "External id": 19713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19713, "pid": 5, "tid": 7, "ts": 1716454216925932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216921694, "dur": 16, "args": { "External id": 19713, "cbid": 211, "correlation": 19713 } }, { "ph": "s", "id": 19713, "pid": 76337, "tid": -914061504, "ts": 1716454216921694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216926687, "dur": 51, "args": { "External id": 19733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19733, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 19733, "pid": 5, "tid": 7, "ts": 1716454216926687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926659, "dur": 30, "args": { "External id": 19733, "cbid": 211, "correlation": 19733 } }, { "ph": "s", "id": 19733, "pid": 76337, "tid": -914061504, "ts": 1716454216926659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216926739, "dur": 4, "args": { "External id": 19745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19745, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19745, "pid": 5, "tid": 7, "ts": 1716454216926739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926714, "dur": 11, "args": { "External id": 19745, "cbid": 211, "correlation": 19745 } }, { "ph": "s", "id": 19745, "pid": 76337, "tid": -914061504, "ts": 1716454216926714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216926761, "dur": 58, "args": { "External id": 19748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19748, "pid": 5, "tid": 7, "ts": 1716454216926761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926751, "dur": 10, "args": { "External id": 19748, "cbid": 211, "correlation": 19748 } }, { "ph": "s", "id": 19748, "pid": 76337, "tid": -914061504, "ts": 1716454216926751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216926835, "dur": 36, "args": { "External id": 19757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19757, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19757, "pid": 5, "tid": 7, "ts": 1716454216926835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926820, "dur": 14, "args": { "External id": 19757, "cbid": 211, "correlation": 19757 } }, { "ph": "s", "id": 19757, "pid": 76337, "tid": -914061504, "ts": 1716454216926820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216926934, "dur": 1, "args": { "External id": 19767, "cbid": 317, "correlation": 19767 } }, { "ph": "f", "id": 19767, "pid": 76337, "tid": -914061504, "ts": 1716454216926934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216926935, "dur": 1, "args": { "External id": 19768, "cbid": 203, "correlation": 19768 } }, { "ph": "f", "id": 19768, "pid": 76337, "tid": -914061504, "ts": 1716454216926935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216926938, "dur": 1, "args": { "External id": 19769, "cbid": 205, "correlation": 19769 } }, { "ph": "f", "id": 19769, "pid": 76337, "tid": -914061504, "ts": 1716454216926938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216926989, "dur": 40, "args": { "External id": 19773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19773, "pid": 5, "tid": 7, "ts": 1716454216926989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926967, "dur": 23, "args": { "External id": 19773, "cbid": 211, "correlation": 19773 } }, { "ph": "s", "id": 19773, "pid": 76337, "tid": -914061504, "ts": 1716454216926967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216927031, "dur": 15, "args": { "External id": 19775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19775, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19775, "pid": 5, "tid": 7, "ts": 1716454216927031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216926993, "dur": 6, "args": { "External id": 19775, "cbid": 211, "correlation": 19775 } }, { "ph": "s", "id": 19775, "pid": 76337, "tid": -914061504, "ts": 1716454216926993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216927047, "dur": 3, "args": { "External id": 19777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19777, "pid": 5, "tid": 7, "ts": 1716454216927047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927009, "dur": 9, "args": { "External id": 19777, "cbid": 211, "correlation": 19777 } }, { "ph": "s", "id": 19777, "pid": 76337, "tid": -914061504, "ts": 1716454216927009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216927023, "dur": 0, "args": { "External id": 19778, "cbid": 51, "correlation": 19778 } }, { "ph": "s", "id": 19778, "pid": 76337, "tid": -914061504, "ts": 1716454216927023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216927051, "dur": 706, "args": { "External id": 19779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19779, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19779, "pid": 5, "tid": 7, "ts": 1716454216927051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927025, "dur": 8, "args": { "External id": 19779, "cbid": 211, "correlation": 19779 } }, { "ph": "s", "id": 19779, "pid": 76337, "tid": -914061504, "ts": 1716454216927025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216927759, "dur": 58, "args": { "External id": 19784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19784, "pid": 5, "tid": 7, "ts": 1716454216927759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927068, "dur": 12, "args": { "External id": 19784, "cbid": 211, "correlation": 19784 } }, { "ph": "s", "id": 19784, "pid": 76337, "tid": -914061504, "ts": 1716454216927068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216927819, "dur": 3, "args": { "External id": 19792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19792, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19792, "pid": 5, "tid": 7, "ts": 1716454216927819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927117, "dur": 9, "args": { "External id": 19792, "cbid": 211, "correlation": 19792 } }, { "ph": "s", "id": 19792, "pid": 76337, "tid": -914061504, "ts": 1716454216927117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216927266, "dur": 5, "args": { "External id": 19808, "cbid": 251, "correlation": 19808 } }, { "ph": "f", "id": 19808, "pid": 76337, "tid": -914061504, "ts": 1716454216927266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216927277, "dur": 0, "args": { "External id": 19810, "cbid": 251, "correlation": 19810 } }, { "ph": "f", "id": 19810, "pid": 76337, "tid": -914061504, "ts": 1716454216927277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454216927823, "dur": 9, "args": { "External id": 19811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19811, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 19811, "pid": 5, "tid": 7, "ts": 1716454216927823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927282, "dur": 16, "args": { "External id": 19811, "cbid": 211, "correlation": 19811 } }, { "ph": "s", "id": 19811, "pid": 76337, "tid": -914061504, "ts": 1716454216927282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454216927834, "dur": 4, "args": { "External id": 19813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19813, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 19813, "pid": 5, "tid": 7, "ts": 1716454216927834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927302, "dur": 9, "args": { "External id": 19813, "cbid": 211, "correlation": 19813 } }, { "ph": "s", "id": 19813, "pid": 76337, "tid": -914061504, "ts": 1716454216927302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216927839, "dur": 54, "args": { "External id": 19823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19823, "pid": 5, "tid": 7, "ts": 1716454216927839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927393, "dur": 12, "args": { "External id": 19823, "cbid": 211, "correlation": 19823 } }, { "ph": "s", "id": 19823, "pid": 76337, "tid": -914061504, "ts": 1716454216927393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216927895, "dur": 52, "args": { "External id": 19843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19843, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 19843, "pid": 5, "tid": 7, "ts": 1716454216927895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927462, "dur": 11, "args": { "External id": 19843, "cbid": 211, "correlation": 19843 } }, { "ph": "s", "id": 19843, "pid": 76337, "tid": -914061504, "ts": 1716454216927462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216927949, "dur": 4, "args": { "External id": 19855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19855, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19855, "pid": 5, "tid": 7, "ts": 1716454216927949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927483, "dur": 6, "args": { "External id": 19855, "cbid": 211, "correlation": 19855 } }, { "ph": "s", "id": 19855, "pid": 76337, "tid": -914061504, "ts": 1716454216927483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216927954, "dur": 56, "args": { "External id": 19858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19858, "pid": 5, "tid": 7, "ts": 1716454216927954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927501, "dur": 6, "args": { "External id": 19858, "cbid": 211, "correlation": 19858 } }, { "ph": "s", "id": 19858, "pid": 76337, "tid": -914061504, "ts": 1716454216927501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216928011, "dur": 36, "args": { "External id": 19867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19867, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19867, "pid": 5, "tid": 7, "ts": 1716454216928011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927543, "dur": 10, "args": { "External id": 19867, "cbid": 211, "correlation": 19867 } }, { "ph": "s", "id": 19867, "pid": 76337, "tid": -914061504, "ts": 1716454216927543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216927619, "dur": 0, "args": { "External id": 19877, "cbid": 317, "correlation": 19877 } }, { "ph": "f", "id": 19877, "pid": 76337, "tid": -914061504, "ts": 1716454216927619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216927619, "dur": 0, "args": { "External id": 19878, "cbid": 203, "correlation": 19878 } }, { "ph": "f", "id": 19878, "pid": 76337, "tid": -914061504, "ts": 1716454216927619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216927620, "dur": 0, "args": { "External id": 19879, "cbid": 205, "correlation": 19879 } }, { "ph": "f", "id": 19879, "pid": 76337, "tid": -914061504, "ts": 1716454216927620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216928049, "dur": 41, "args": { "External id": 19883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19883, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19883, "pid": 5, "tid": 7, "ts": 1716454216928049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927635, "dur": 12, "args": { "External id": 19883, "cbid": 211, "correlation": 19883 } }, { "ph": "s", "id": 19883, "pid": 76337, "tid": -914061504, "ts": 1716454216927635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454216928091, "dur": 14, "args": { "External id": 19885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19885, "pid": 5, "tid": 7, "ts": 1716454216928091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927649, "dur": 5, "args": { "External id": 19885, "cbid": 211, "correlation": 19885 } }, { "ph": "s", "id": 19885, "pid": 76337, "tid": -914061504, "ts": 1716454216927649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454216928107, "dur": 3, "args": { "External id": 19887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19887, "pid": 5, "tid": 7, "ts": 1716454216928107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927658, "dur": 6, "args": { "External id": 19887, "cbid": 211, "correlation": 19887 } }, { "ph": "s", "id": 19887, "pid": 76337, "tid": -914061504, "ts": 1716454216927658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454216927666, "dur": 0, "args": { "External id": 19888, "cbid": 51, "correlation": 19888 } }, { "ph": "s", "id": 19888, "pid": 76337, "tid": -914061504, "ts": 1716454216927666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454216928111, "dur": 699, "args": { "External id": 19889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19889, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 19889, "pid": 5, "tid": 7, "ts": 1716454216928111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927667, "dur": 5, "args": { "External id": 19889, "cbid": 211, "correlation": 19889 } }, { "ph": "s", "id": 19889, "pid": 76337, "tid": -914061504, "ts": 1716454216927667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216928812, "dur": 59, "args": { "External id": 19894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19894, "pid": 5, "tid": 7, "ts": 1716454216928812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927695, "dur": 9, "args": { "External id": 19894, "cbid": 211, "correlation": 19894 } }, { "ph": "s", "id": 19894, "pid": 76337, "tid": -914061504, "ts": 1716454216927695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216928873, "dur": 50, "args": { "External id": 19902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19902, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19902, "pid": 5, "tid": 7, "ts": 1716454216928873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927727, "dur": 10, "args": { "External id": 19902, "cbid": 211, "correlation": 19902 } }, { "ph": "s", "id": 19902, "pid": 76337, "tid": -914061504, "ts": 1716454216927727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454216928924, "dur": 36, "args": { "External id": 19910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19910, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19910, "pid": 5, "tid": 7, "ts": 1716454216928924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927776, "dur": 11, "args": { "External id": 19910, "cbid": 211, "correlation": 19910 } }, { "ph": "s", "id": 19910, "pid": 76337, "tid": -914061504, "ts": 1716454216927776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454216928962, "dur": 52, "args": { "External id": 19930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19930, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 19930, "pid": 5, "tid": 7, "ts": 1716454216928962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927907, "dur": 14, "args": { "External id": 19930, "cbid": 211, "correlation": 19930 } }, { "ph": "s", "id": 19930, "pid": 76337, "tid": -914061504, "ts": 1716454216927907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454216929015, "dur": 4, "args": { "External id": 19942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19942, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 19942, "pid": 5, "tid": 7, "ts": 1716454216929015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927931, "dur": 25, "args": { "External id": 19942, "cbid": 211, "correlation": 19942 } }, { "ph": "s", "id": 19942, "pid": 76337, "tid": -914061504, "ts": 1716454216927931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216929020, "dur": 56, "args": { "External id": 19945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19945, "pid": 5, "tid": 7, "ts": 1716454216929020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216927969, "dur": 16, "args": { "External id": 19945, "cbid": 211, "correlation": 19945 } }, { "ph": "s", "id": 19945, "pid": 76337, "tid": -914061504, "ts": 1716454216927969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454216928050, "dur": 0, "args": { "External id": 19956, "cbid": 317, "correlation": 19956 } }, { "ph": "f", "id": 19956, "pid": 76337, "tid": -914061504, "ts": 1716454216928050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454216928051, "dur": 0, "args": { "External id": 19957, "cbid": 203, "correlation": 19957 } }, { "ph": "f", "id": 19957, "pid": 76337, "tid": -914061504, "ts": 1716454216928051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454216928051, "dur": 0, "args": { "External id": 19958, "cbid": 205, "correlation": 19958 } }, { "ph": "f", "id": 19958, "pid": 76337, "tid": -914061504, "ts": 1716454216928051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928087, "dur": 3, "args": { "External id": 19962, "cbid": 251, "correlation": 19962 } }, { "ph": "f", "id": 19962, "pid": 76337, "tid": -914061504, "ts": 1716454216928087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928092, "dur": 1, "args": { "External id": 19963, "cbid": 251, "correlation": 19963 } }, { "ph": "f", "id": 19963, "pid": 76337, "tid": -914061504, "ts": 1716454216928092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928094, "dur": 1, "args": { "External id": 19964, "cbid": 251, "correlation": 19964 } }, { "ph": "f", "id": 19964, "pid": 76337, "tid": -914061504, "ts": 1716454216928094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928096, "dur": 1, "args": { "External id": 19965, "cbid": 251, "correlation": 19965 } }, { "ph": "f", "id": 19965, "pid": 76337, "tid": -914061504, "ts": 1716454216928096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928098, "dur": 1, "args": { "External id": 19966, "cbid": 251, "correlation": 19966 } }, { "ph": "f", "id": 19966, "pid": 76337, "tid": -914061504, "ts": 1716454216928098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928100, "dur": 1, "args": { "External id": 19967, "cbid": 251, "correlation": 19967 } }, { "ph": "f", "id": 19967, "pid": 76337, "tid": -914061504, "ts": 1716454216928100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928102, "dur": 1, "args": { "External id": 19968, "cbid": 251, "correlation": 19968 } }, { "ph": "f", "id": 19968, "pid": 76337, "tid": -914061504, "ts": 1716454216928102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928104, "dur": 1, "args": { "External id": 19969, "cbid": 251, "correlation": 19969 } }, { "ph": "f", "id": 19969, "pid": 76337, "tid": -914061504, "ts": 1716454216928104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454216928106, "dur": 0, "args": { "External id": 19970, "cbid": 251, "correlation": 19970 } }, { "ph": "f", "id": 19970, "pid": 76337, "tid": -914061504, "ts": 1716454216928106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454216929077, "dur": 116, "args": { "External id": 19971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19971, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 19971, "pid": 5, "tid": 7, "ts": 1716454216929077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216928110, "dur": 15, "args": { "External id": 19971, "cbid": 211, "correlation": 19971 } }, { "ph": "s", "id": 19971, "pid": 76337, "tid": -914061504, "ts": 1716454216928110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454216929195, "dur": 60, "args": { "External id": 19977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19977, "pid": 5, "tid": 7, "ts": 1716454216929195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216928158, "dur": 10, "args": { "External id": 19977, "cbid": 211, "correlation": 19977 } }, { "ph": "s", "id": 19977, "pid": 76337, "tid": -914061504, "ts": 1716454216928158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217311368, "dur": 609, "args": { "External id": 19986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 19986, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 19986, "pid": 5, "tid": 7, "ts": 1716454217311368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454216928317, "dur": 383057, "args": { "External id": 19986, "cbid": 211, "correlation": 19986 } }, { "ph": "s", "id": 19986, "pid": 76337, "tid": -914061504, "ts": 1716454216928317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217311979, "dur": 177, "args": { "External id": 20008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20008, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20008, "pid": 5, "tid": 7, "ts": 1716454217311979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217311544, "dur": 17, "args": { "External id": 20008, "cbid": 211, "correlation": 20008 } }, { "ph": "s", "id": 20008, "pid": 76337, "tid": -914061504, "ts": 1716454217311544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217311768, "dur": 4, "args": { "External id": 20019, "cbid": 251, "correlation": 20019 } }, { "ph": "f", "id": 20019, "pid": 76337, "tid": -914061504, "ts": 1716454217311768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217312158, "dur": 195, "args": { "External id": 20020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20020, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20020, "pid": 5, "tid": 7, "ts": 1716454217312158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217311780, "dur": 17, "args": { "External id": 20020, "cbid": 211, "correlation": 20020 } }, { "ph": "s", "id": 20020, "pid": 76337, "tid": -914061504, "ts": 1716454217311780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217311862, "dur": 1, "args": { "External id": 20031, "cbid": 251, "correlation": 20031 } }, { "ph": "f", "id": 20031, "pid": 76337, "tid": -914061504, "ts": 1716454217311862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217312354, "dur": 188, "args": { "External id": 20032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20032, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20032, "pid": 5, "tid": 7, "ts": 1716454217312354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217311866, "dur": 11, "args": { "External id": 20032, "cbid": 211, "correlation": 20032 } }, { "ph": "s", "id": 20032, "pid": 76337, "tid": -914061504, "ts": 1716454217311866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217311931, "dur": 1, "args": { "External id": 20043, "cbid": 251, "correlation": 20043 } }, { "ph": "f", "id": 20043, "pid": 76337, "tid": -914061504, "ts": 1716454217311931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217312543, "dur": 185, "args": { "External id": 20044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20044, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20044, "pid": 5, "tid": 7, "ts": 1716454217312543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217311935, "dur": 12, "args": { "External id": 20044, "cbid": 211, "correlation": 20044 } }, { "ph": "s", "id": 20044, "pid": 76337, "tid": -914061504, "ts": 1716454217311935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217312729, "dur": 18232, "args": { "External id": 20065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20065, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20065, "pid": 5, "tid": 7, "ts": 1716454217312729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312067, "dur": 16, "args": { "External id": 20065, "cbid": 211, "correlation": 20065 } }, { "ph": "s", "id": 20065, "pid": 76337, "tid": -914061504, "ts": 1716454217312067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312209, "dur": 3, "args": { "External id": 20083, "cbid": 251, "correlation": 20083 } }, { "ph": "f", "id": 20083, "pid": 76337, "tid": -914061504, "ts": 1716454217312209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217330962, "dur": 199, "args": { "External id": 20085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20085, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20085, "pid": 5, "tid": 7, "ts": 1716454217330962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312217, "dur": 14, "args": { "External id": 20085, "cbid": 211, "correlation": 20085 } }, { "ph": "s", "id": 20085, "pid": 76337, "tid": -914061504, "ts": 1716454217312217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217331163, "dur": 66, "args": { "External id": 20093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20093, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20093, "pid": 5, "tid": 7, "ts": 1716454217331163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312328, "dur": 14, "args": { "External id": 20093, "cbid": 211, "correlation": 20093 } }, { "ph": "s", "id": 20093, "pid": 76337, "tid": -914061504, "ts": 1716454217312328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217331230, "dur": 97, "args": { "External id": 20101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20101, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20101, "pid": 5, "tid": 7, "ts": 1716454217331230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312379, "dur": 10, "args": { "External id": 20101, "cbid": 211, "correlation": 20101 } }, { "ph": "s", "id": 20101, "pid": 76337, "tid": -914061504, "ts": 1716454217312379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217331328, "dur": 54, "args": { "External id": 20112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20112, "pid": 5, "tid": 7, "ts": 1716454217331328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312490, "dur": 16, "args": { "External id": 20112, "cbid": 211, "correlation": 20112 } }, { "ph": "s", "id": 20112, "pid": 76337, "tid": -914061504, "ts": 1716454217312490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217331384, "dur": 90, "args": { "External id": 20134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20134, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20134, "pid": 5, "tid": 7, "ts": 1716454217331384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312537, "dur": 10, "args": { "External id": 20134, "cbid": 211, "correlation": 20134 } }, { "ph": "s", "id": 20134, "pid": 76337, "tid": -914061504, "ts": 1716454217312537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312628, "dur": 1, "args": { "External id": 20145, "cbid": 251, "correlation": 20145 } }, { "ph": "f", "id": 20145, "pid": 76337, "tid": -914061504, "ts": 1716454217312628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217331475, "dur": 104, "args": { "External id": 20146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20146, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20146, "pid": 5, "tid": 7, "ts": 1716454217331475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312634, "dur": 13, "args": { "External id": 20146, "cbid": 211, "correlation": 20146 } }, { "ph": "s", "id": 20146, "pid": 76337, "tid": -914061504, "ts": 1716454217312634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312721, "dur": 1, "args": { "External id": 20157, "cbid": 251, "correlation": 20157 } }, { "ph": "f", "id": 20157, "pid": 76337, "tid": -914061504, "ts": 1716454217312721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312726, "dur": 0, "args": { "External id": 20158, "cbid": 251, "correlation": 20158 } }, { "ph": "f", "id": 20158, "pid": 76337, "tid": -914061504, "ts": 1716454217312726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217331580, "dur": 10, "args": { "External id": 20159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20159, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 20159, "pid": 5, "tid": 7, "ts": 1716454217331580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312728, "dur": 14, "args": { "External id": 20159, "cbid": 211, "correlation": 20159 } }, { "ph": "s", "id": 20159, "pid": 76337, "tid": -914061504, "ts": 1716454217312728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217331592, "dur": 5, "args": { "External id": 20161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20161, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 20161, "pid": 5, "tid": 7, "ts": 1716454217331592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312746, "dur": 10, "args": { "External id": 20161, "cbid": 211, "correlation": 20161 } }, { "ph": "s", "id": 20161, "pid": 76337, "tid": -914061504, "ts": 1716454217312746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312812, "dur": 1, "args": { "External id": 20172, "cbid": 251, "correlation": 20172 } }, { "ph": "f", "id": 20172, "pid": 76337, "tid": -914061504, "ts": 1716454217312812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217312815, "dur": 0, "args": { "External id": 20173, "cbid": 251, "correlation": 20173 } }, { "ph": "f", "id": 20173, "pid": 76337, "tid": -914061504, "ts": 1716454217312815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217331598, "dur": 6, "args": { "External id": 20174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20174, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 20174, "pid": 5, "tid": 7, "ts": 1716454217331598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312817, "dur": 12, "args": { "External id": 20174, "cbid": 211, "correlation": 20174 } }, { "ph": "s", "id": 20174, "pid": 76337, "tid": -914061504, "ts": 1716454217312817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217331605, "dur": 3, "args": { "External id": 20176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20176, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 20176, "pid": 5, "tid": 7, "ts": 1716454217331605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312831, "dur": 6, "args": { "External id": 20176, "cbid": 211, "correlation": 20176 } }, { "ph": "s", "id": 20176, "pid": 76337, "tid": -914061504, "ts": 1716454217312831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217331610, "dur": 152, "args": { "External id": 20197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20197, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20197, "pid": 5, "tid": 7, "ts": 1716454217331610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217312906, "dur": 12, "args": { "External id": 20197, "cbid": 211, "correlation": 20197 } }, { "ph": "s", "id": 20197, "pid": 76337, "tid": -914061504, "ts": 1716454217312906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217313017, "dur": 2, "args": { "External id": 20215, "cbid": 251, "correlation": 20215 } }, { "ph": "f", "id": 20215, "pid": 76337, "tid": -914061504, "ts": 1716454217313017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217331763, "dur": 105, "args": { "External id": 20217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20217, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20217, "pid": 5, "tid": 7, "ts": 1716454217331763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313025, "dur": 15, "args": { "External id": 20217, "cbid": 211, "correlation": 20217 } }, { "ph": "s", "id": 20217, "pid": 76337, "tid": -914061504, "ts": 1716454217313025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217331870, "dur": 35, "args": { "External id": 20225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20225, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20225, "pid": 5, "tid": 7, "ts": 1716454217331870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313099, "dur": 12, "args": { "External id": 20225, "cbid": 211, "correlation": 20225 } }, { "ph": "s", "id": 20225, "pid": 76337, "tid": -914061504, "ts": 1716454217313099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217331906, "dur": 67, "args": { "External id": 20233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20233, "pid": 5, "tid": 7, "ts": 1716454217331906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313142, "dur": 10, "args": { "External id": 20233, "cbid": 211, "correlation": 20233 } }, { "ph": "s", "id": 20233, "pid": 76337, "tid": -914061504, "ts": 1716454217313142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217331975, "dur": 92, "args": { "External id": 20255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20255, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20255, "pid": 5, "tid": 7, "ts": 1716454217331975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313194, "dur": 11, "args": { "External id": 20255, "cbid": 211, "correlation": 20255 } }, { "ph": "s", "id": 20255, "pid": 76337, "tid": -914061504, "ts": 1716454217313194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217313314, "dur": 1, "args": { "External id": 20271, "cbid": 251, "correlation": 20271 } }, { "ph": "f", "id": 20271, "pid": 76337, "tid": -914061504, "ts": 1716454217313314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217332068, "dur": 566, "args": { "External id": 20273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20273, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20273, "pid": 5, "tid": 7, "ts": 1716454217332068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313320, "dur": 14, "args": { "External id": 20273, "cbid": 211, "correlation": 20273 } }, { "ph": "s", "id": 20273, "pid": 76337, "tid": -914061504, "ts": 1716454217313320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217332635, "dur": 240, "args": { "External id": 20281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20281, "pid": 5, "tid": 7, "ts": 1716454217332635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217313413, "dur": 16, "args": { "External id": 20281, "cbid": 211, "correlation": 20281 } }, { "ph": "s", "id": 20281, "pid": 76337, "tid": -914061504, "ts": 1716454217313413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217313453, "dur": 1, "args": { "External id": 20289, "cbid": 317, "correlation": 20289 } }, { "ph": "f", "id": 20289, "pid": 76337, "tid": -914061504, "ts": 1716454217313453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454217313455, "dur": 292036, "args": { "External id": 20290, "cbid": 20, "correlation": 20290 } }, { "ph": "f", "id": 20290, "pid": 76337, "tid": -914061504, "ts": 1716454217313455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217605612, "dur": 252, "args": { "External id": 20293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20293, "pid": 5, "tid": 7, "ts": 1716454217605612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217605550, "dur": 64, "args": { "External id": 20293, "cbid": 211, "correlation": 20293 } }, { "ph": "s", "id": 20293, "pid": 76337, "tid": -914061504, "ts": 1716454217605550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217605864, "dur": 11, "args": { "External id": 20309, "cbid": 251, "correlation": 20309 } }, { "ph": "f", "id": 20309, "pid": 76337, "tid": -914061504, "ts": 1716454217605864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217605881, "dur": 0, "args": { "External id": 20311, "cbid": 251, "correlation": 20311 } }, { "ph": "f", "id": 20311, "pid": 76337, "tid": -914061504, "ts": 1716454217605881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217605916, "dur": 354, "args": { "External id": 20312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20312, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 20312, "pid": 5, "tid": 7, "ts": 1716454217605916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217605893, "dur": 24, "args": { "External id": 20312, "cbid": 211, "correlation": 20312 } }, { "ph": "s", "id": 20312, "pid": 76337, "tid": -914061504, "ts": 1716454217605893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217606272, "dur": 50, "args": { "External id": 20320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20320, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20320, "pid": 5, "tid": 7, "ts": 1716454217606272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217605971, "dur": 25, "args": { "External id": 20320, "cbid": 211, "correlation": 20320 } }, { "ph": "s", "id": 20320, "pid": 76337, "tid": -914061504, "ts": 1716454217605971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217606323, "dur": 155, "args": { "External id": 20331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20331, "pid": 5, "tid": 7, "ts": 1716454217606323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606133, "dur": 20, "args": { "External id": 20331, "cbid": 211, "correlation": 20331 } }, { "ph": "s", "id": 20331, "pid": 76337, "tid": -914061504, "ts": 1716454217606133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217606283, "dur": 1, "args": { "External id": 20343, "cbid": 317, "correlation": 20343 } }, { "ph": "f", "id": 20343, "pid": 76337, "tid": -914061504, "ts": 1716454217606283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217606284, "dur": 1, "args": { "External id": 20344, "cbid": 203, "correlation": 20344 } }, { "ph": "f", "id": 20344, "pid": 76337, "tid": -914061504, "ts": 1716454217606284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217606287, "dur": 1, "args": { "External id": 20345, "cbid": 205, "correlation": 20345 } }, { "ph": "f", "id": 20345, "pid": 76337, "tid": -914061504, "ts": 1716454217606287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606338, "dur": 2, "args": { "External id": 20349, "cbid": 251, "correlation": 20349 } }, { "ph": "f", "id": 20349, "pid": 76337, "tid": -914061504, "ts": 1716454217606338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606342, "dur": 1, "args": { "External id": 20350, "cbid": 251, "correlation": 20350 } }, { "ph": "f", "id": 20350, "pid": 76337, "tid": -914061504, "ts": 1716454217606342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606343, "dur": 1, "args": { "External id": 20351, "cbid": 251, "correlation": 20351 } }, { "ph": "f", "id": 20351, "pid": 76337, "tid": -914061504, "ts": 1716454217606343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606345, "dur": 1, "args": { "External id": 20352, "cbid": 251, "correlation": 20352 } }, { "ph": "f", "id": 20352, "pid": 76337, "tid": -914061504, "ts": 1716454217606345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606347, "dur": 1, "args": { "External id": 20353, "cbid": 251, "correlation": 20353 } }, { "ph": "f", "id": 20353, "pid": 76337, "tid": -914061504, "ts": 1716454217606347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606349, "dur": 1, "args": { "External id": 20354, "cbid": 251, "correlation": 20354 } }, { "ph": "f", "id": 20354, "pid": 76337, "tid": -914061504, "ts": 1716454217606349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606351, "dur": 1, "args": { "External id": 20355, "cbid": 251, "correlation": 20355 } }, { "ph": "f", "id": 20355, "pid": 76337, "tid": -914061504, "ts": 1716454217606351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606353, "dur": 1, "args": { "External id": 20356, "cbid": 251, "correlation": 20356 } }, { "ph": "f", "id": 20356, "pid": 76337, "tid": -914061504, "ts": 1716454217606353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217606356, "dur": 0, "args": { "External id": 20357, "cbid": 251, "correlation": 20357 } }, { "ph": "f", "id": 20357, "pid": 76337, "tid": -914061504, "ts": 1716454217606356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217606479, "dur": 115, "args": { "External id": 20358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20358, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20358, "pid": 5, "tid": 7, "ts": 1716454217606479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606359, "dur": 14, "args": { "External id": 20358, "cbid": 211, "correlation": 20358 } }, { "ph": "s", "id": 20358, "pid": 76337, "tid": -914061504, "ts": 1716454217606359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217606595, "dur": 59, "args": { "External id": 20364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20364, "pid": 5, "tid": 7, "ts": 1716454217606595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606404, "dur": 10, "args": { "External id": 20364, "cbid": 211, "correlation": 20364 } }, { "ph": "s", "id": 20364, "pid": 76337, "tid": -914061504, "ts": 1716454217606404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217606655, "dur": 50, "args": { "External id": 20372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20372, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20372, "pid": 5, "tid": 7, "ts": 1716454217606655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606440, "dur": 10, "args": { "External id": 20372, "cbid": 211, "correlation": 20372 } }, { "ph": "s", "id": 20372, "pid": 76337, "tid": -914061504, "ts": 1716454217606440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217606706, "dur": 52, "args": { "External id": 20392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20392, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 20392, "pid": 5, "tid": 7, "ts": 1716454217606706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606573, "dur": 18, "args": { "External id": 20392, "cbid": 211, "correlation": 20392 } }, { "ph": "s", "id": 20392, "pid": 76337, "tid": -914061504, "ts": 1716454217606573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217606760, "dur": 4, "args": { "External id": 20404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20404, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20404, "pid": 5, "tid": 7, "ts": 1716454217606760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606610, "dur": 11, "args": { "External id": 20404, "cbid": 211, "correlation": 20404 } }, { "ph": "s", "id": 20404, "pid": 76337, "tid": -914061504, "ts": 1716454217606610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217606765, "dur": 56, "args": { "External id": 20407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20407, "pid": 5, "tid": 7, "ts": 1716454217606765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606643, "dur": 9, "args": { "External id": 20407, "cbid": 211, "correlation": 20407 } }, { "ph": "s", "id": 20407, "pid": 76337, "tid": -914061504, "ts": 1716454217606643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217606822, "dur": 37, "args": { "External id": 20416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20416, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20416, "pid": 5, "tid": 7, "ts": 1716454217606822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606701, "dur": 12, "args": { "External id": 20416, "cbid": 211, "correlation": 20416 } }, { "ph": "s", "id": 20416, "pid": 76337, "tid": -914061504, "ts": 1716454217606701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217606761, "dur": 0, "args": { "External id": 20426, "cbid": 317, "correlation": 20426 } }, { "ph": "f", "id": 20426, "pid": 76337, "tid": -914061504, "ts": 1716454217606761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217606762, "dur": 0, "args": { "External id": 20427, "cbid": 203, "correlation": 20427 } }, { "ph": "f", "id": 20427, "pid": 76337, "tid": -914061504, "ts": 1716454217606762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217606762, "dur": 0, "args": { "External id": 20428, "cbid": 205, "correlation": 20428 } }, { "ph": "f", "id": 20428, "pid": 76337, "tid": -914061504, "ts": 1716454217606762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217606861, "dur": 41, "args": { "External id": 20432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20432, "pid": 5, "tid": 7, "ts": 1716454217606861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606802, "dur": 28, "args": { "External id": 20432, "cbid": 211, "correlation": 20432 } }, { "ph": "s", "id": 20432, "pid": 76337, "tid": -914061504, "ts": 1716454217606802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217606903, "dur": 14, "args": { "External id": 20434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20434, "pid": 5, "tid": 7, "ts": 1716454217606903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606833, "dur": 6, "args": { "External id": 20434, "cbid": 211, "correlation": 20434 } }, { "ph": "s", "id": 20434, "pid": 76337, "tid": -914061504, "ts": 1716454217606833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217606918, "dur": 3, "args": { "External id": 20436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20436, "pid": 5, "tid": 7, "ts": 1716454217606918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606859, "dur": 25, "args": { "External id": 20436, "cbid": 211, "correlation": 20436 } }, { "ph": "s", "id": 20436, "pid": 76337, "tid": -914061504, "ts": 1716454217606859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217606892, "dur": 0, "args": { "External id": 20437, "cbid": 51, "correlation": 20437 } }, { "ph": "s", "id": 20437, "pid": 76337, "tid": -914061504, "ts": 1716454217606892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217606923, "dur": 689, "args": { "External id": 20438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20438, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20438, "pid": 5, "tid": 7, "ts": 1716454217606923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606894, "dur": 17, "args": { "External id": 20438, "cbid": 211, "correlation": 20438 } }, { "ph": "s", "id": 20438, "pid": 76337, "tid": -914061504, "ts": 1716454217606894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217607613, "dur": 59, "args": { "External id": 20443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20443, "pid": 5, "tid": 7, "ts": 1716454217607613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606936, "dur": 9, "args": { "External id": 20443, "cbid": 211, "correlation": 20443 } }, { "ph": "s", "id": 20443, "pid": 76337, "tid": -914061504, "ts": 1716454217606936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217607673, "dur": 4, "args": { "External id": 20451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20451, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20451, "pid": 5, "tid": 7, "ts": 1716454217607673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217606993, "dur": 10, "args": { "External id": 20451, "cbid": 211, "correlation": 20451 } }, { "ph": "s", "id": 20451, "pid": 76337, "tid": -914061504, "ts": 1716454217606993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607091, "dur": 2, "args": { "External id": 20467, "cbid": 251, "correlation": 20467 } }, { "ph": "f", "id": 20467, "pid": 76337, "tid": -914061504, "ts": 1716454217607091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607097, "dur": 0, "args": { "External id": 20469, "cbid": 251, "correlation": 20469 } }, { "ph": "f", "id": 20469, "pid": 76337, "tid": -914061504, "ts": 1716454217607097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217607678, "dur": 11, "args": { "External id": 20470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20470, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 20470, "pid": 5, "tid": 7, "ts": 1716454217607678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607099, "dur": 14, "args": { "External id": 20470, "cbid": 211, "correlation": 20470 } }, { "ph": "s", "id": 20470, "pid": 76337, "tid": -914061504, "ts": 1716454217607099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217607691, "dur": 5, "args": { "External id": 20472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20472, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 20472, "pid": 5, "tid": 7, "ts": 1716454217607691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607119, "dur": 8, "args": { "External id": 20472, "cbid": 211, "correlation": 20472 } }, { "ph": "s", "id": 20472, "pid": 76337, "tid": -914061504, "ts": 1716454217607119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217607697, "dur": 55, "args": { "External id": 20482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20482, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20482, "pid": 5, "tid": 7, "ts": 1716454217607697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607198, "dur": 12, "args": { "External id": 20482, "cbid": 211, "correlation": 20482 } }, { "ph": "s", "id": 20482, "pid": 76337, "tid": -914061504, "ts": 1716454217607198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217607753, "dur": 51, "args": { "External id": 20502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20502, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 20502, "pid": 5, "tid": 7, "ts": 1716454217607753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607265, "dur": 11, "args": { "External id": 20502, "cbid": 211, "correlation": 20502 } }, { "ph": "s", "id": 20502, "pid": 76337, "tid": -914061504, "ts": 1716454217607265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217607805, "dur": 4, "args": { "External id": 20514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20514, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20514, "pid": 5, "tid": 7, "ts": 1716454217607805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607297, "dur": 8, "args": { "External id": 20514, "cbid": 211, "correlation": 20514 } }, { "ph": "s", "id": 20514, "pid": 76337, "tid": -914061504, "ts": 1716454217607297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217607810, "dur": 54, "args": { "External id": 20517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20517, "pid": 5, "tid": 7, "ts": 1716454217607810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607318, "dur": 6, "args": { "External id": 20517, "cbid": 211, "correlation": 20517 } }, { "ph": "s", "id": 20517, "pid": 76337, "tid": -914061504, "ts": 1716454217607318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217607866, "dur": 36, "args": { "External id": 20526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20526, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20526, "pid": 5, "tid": 7, "ts": 1716454217607866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607359, "dur": 10, "args": { "External id": 20526, "cbid": 211, "correlation": 20526 } }, { "ph": "s", "id": 20526, "pid": 76337, "tid": -914061504, "ts": 1716454217607359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217607425, "dur": 0, "args": { "External id": 20536, "cbid": 317, "correlation": 20536 } }, { "ph": "f", "id": 20536, "pid": 76337, "tid": -914061504, "ts": 1716454217607425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217607425, "dur": 0, "args": { "External id": 20537, "cbid": 203, "correlation": 20537 } }, { "ph": "f", "id": 20537, "pid": 76337, "tid": -914061504, "ts": 1716454217607425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217607426, "dur": 0, "args": { "External id": 20538, "cbid": 205, "correlation": 20538 } }, { "ph": "f", "id": 20538, "pid": 76337, "tid": -914061504, "ts": 1716454217607426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217607903, "dur": 39, "args": { "External id": 20542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20542, "pid": 5, "tid": 7, "ts": 1716454217607903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607440, "dur": 12, "args": { "External id": 20542, "cbid": 211, "correlation": 20542 } }, { "ph": "s", "id": 20542, "pid": 76337, "tid": -914061504, "ts": 1716454217607440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217607944, "dur": 14, "args": { "External id": 20544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20544, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20544, "pid": 5, "tid": 7, "ts": 1716454217607944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607455, "dur": 5, "args": { "External id": 20544, "cbid": 211, "correlation": 20544 } }, { "ph": "s", "id": 20544, "pid": 76337, "tid": -914061504, "ts": 1716454217607455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217607959, "dur": 3, "args": { "External id": 20546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20546, "pid": 5, "tid": 7, "ts": 1716454217607959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607464, "dur": 6, "args": { "External id": 20546, "cbid": 211, "correlation": 20546 } }, { "ph": "s", "id": 20546, "pid": 76337, "tid": -914061504, "ts": 1716454217607464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217607473, "dur": 0, "args": { "External id": 20547, "cbid": 51, "correlation": 20547 } }, { "ph": "s", "id": 20547, "pid": 76337, "tid": -914061504, "ts": 1716454217607473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217607964, "dur": 685, "args": { "External id": 20548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20548, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20548, "pid": 5, "tid": 7, "ts": 1716454217607964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607474, "dur": 5, "args": { "External id": 20548, "cbid": 211, "correlation": 20548 } }, { "ph": "s", "id": 20548, "pid": 76337, "tid": -914061504, "ts": 1716454217607474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217608650, "dur": 58, "args": { "External id": 20553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20553, "pid": 5, "tid": 7, "ts": 1716454217608650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607501, "dur": 9, "args": { "External id": 20553, "cbid": 211, "correlation": 20553 } }, { "ph": "s", "id": 20553, "pid": 76337, "tid": -914061504, "ts": 1716454217607501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217608710, "dur": 50, "args": { "External id": 20561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20561, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20561, "pid": 5, "tid": 7, "ts": 1716454217608710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607534, "dur": 9, "args": { "External id": 20561, "cbid": 211, "correlation": 20561 } }, { "ph": "s", "id": 20561, "pid": 76337, "tid": -914061504, "ts": 1716454217607534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217608761, "dur": 35, "args": { "External id": 20569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20569, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20569, "pid": 5, "tid": 7, "ts": 1716454217608761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607578, "dur": 11, "args": { "External id": 20569, "cbid": 211, "correlation": 20569 } }, { "ph": "s", "id": 20569, "pid": 76337, "tid": -914061504, "ts": 1716454217607578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217608797, "dur": 52, "args": { "External id": 20589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20589, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 20589, "pid": 5, "tid": 7, "ts": 1716454217608797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607664, "dur": 12, "args": { "External id": 20589, "cbid": 211, "correlation": 20589 } }, { "ph": "s", "id": 20589, "pid": 76337, "tid": -914061504, "ts": 1716454217607664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217608851, "dur": 4, "args": { "External id": 20601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20601, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 20601, "pid": 5, "tid": 7, "ts": 1716454217608851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607686, "dur": 7, "args": { "External id": 20601, "cbid": 211, "correlation": 20601 } }, { "ph": "s", "id": 20601, "pid": 76337, "tid": -914061504, "ts": 1716454217607686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217608856, "dur": 54, "args": { "External id": 20604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20604, "pid": 5, "tid": 7, "ts": 1716454217608856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607703, "dur": 6, "args": { "External id": 20604, "cbid": 211, "correlation": 20604 } }, { "ph": "s", "id": 20604, "pid": 76337, "tid": -914061504, "ts": 1716454217607703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217607762, "dur": 0, "args": { "External id": 20615, "cbid": 317, "correlation": 20615 } }, { "ph": "f", "id": 20615, "pid": 76337, "tid": -914061504, "ts": 1716454217607762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217607762, "dur": 0, "args": { "External id": 20616, "cbid": 203, "correlation": 20616 } }, { "ph": "f", "id": 20616, "pid": 76337, "tid": -914061504, "ts": 1716454217607762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217607763, "dur": 0, "args": { "External id": 20617, "cbid": 205, "correlation": 20617 } }, { "ph": "f", "id": 20617, "pid": 76337, "tid": -914061504, "ts": 1716454217607763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607784, "dur": 1, "args": { "External id": 20621, "cbid": 251, "correlation": 20621 } }, { "ph": "f", "id": 20621, "pid": 76337, "tid": -914061504, "ts": 1716454217607784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607787, "dur": 0, "args": { "External id": 20622, "cbid": 251, "correlation": 20622 } }, { "ph": "f", "id": 20622, "pid": 76337, "tid": -914061504, "ts": 1716454217607787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607787, "dur": 0, "args": { "External id": 20623, "cbid": 251, "correlation": 20623 } }, { "ph": "f", "id": 20623, "pid": 76337, "tid": -914061504, "ts": 1716454217607787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607788, "dur": 0, "args": { "External id": 20624, "cbid": 251, "correlation": 20624 } }, { "ph": "f", "id": 20624, "pid": 76337, "tid": -914061504, "ts": 1716454217607788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607789, "dur": 0, "args": { "External id": 20625, "cbid": 251, "correlation": 20625 } }, { "ph": "f", "id": 20625, "pid": 76337, "tid": -914061504, "ts": 1716454217607789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607789, "dur": 0, "args": { "External id": 20626, "cbid": 251, "correlation": 20626 } }, { "ph": "f", "id": 20626, "pid": 76337, "tid": -914061504, "ts": 1716454217607789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607790, "dur": 0, "args": { "External id": 20627, "cbid": 251, "correlation": 20627 } }, { "ph": "f", "id": 20627, "pid": 76337, "tid": -914061504, "ts": 1716454217607790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607791, "dur": 0, "args": { "External id": 20628, "cbid": 251, "correlation": 20628 } }, { "ph": "f", "id": 20628, "pid": 76337, "tid": -914061504, "ts": 1716454217607791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217607792, "dur": 0, "args": { "External id": 20629, "cbid": 251, "correlation": 20629 } }, { "ph": "f", "id": 20629, "pid": 76337, "tid": -914061504, "ts": 1716454217607792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217608911, "dur": 112, "args": { "External id": 20630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20630, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20630, "pid": 5, "tid": 7, "ts": 1716454217608911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607794, "dur": 13, "args": { "External id": 20630, "cbid": 211, "correlation": 20630 } }, { "ph": "s", "id": 20630, "pid": 76337, "tid": -914061504, "ts": 1716454217607794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217609024, "dur": 59, "args": { "External id": 20636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20636, "pid": 5, "tid": 7, "ts": 1716454217609024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607830, "dur": 9, "args": { "External id": 20636, "cbid": 211, "correlation": 20636 } }, { "ph": "s", "id": 20636, "pid": 76337, "tid": -914061504, "ts": 1716454217607830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217609084, "dur": 619, "args": { "External id": 20645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20645, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20645, "pid": 5, "tid": 7, "ts": 1716454217609084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217607950, "dur": 18, "args": { "External id": 20645, "cbid": 211, "correlation": 20645 } }, { "ph": "s", "id": 20645, "pid": 76337, "tid": -914061504, "ts": 1716454217607950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217609705, "dur": 177, "args": { "External id": 20667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20667, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20667, "pid": 5, "tid": 7, "ts": 1716454217609705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608050, "dur": 14, "args": { "External id": 20667, "cbid": 211, "correlation": 20667 } }, { "ph": "s", "id": 20667, "pid": 76337, "tid": -914061504, "ts": 1716454217608050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608178, "dur": 2, "args": { "External id": 20678, "cbid": 251, "correlation": 20678 } }, { "ph": "f", "id": 20678, "pid": 76337, "tid": -914061504, "ts": 1716454217608178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217609883, "dur": 195, "args": { "External id": 20679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20679, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20679, "pid": 5, "tid": 7, "ts": 1716454217609883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608186, "dur": 16, "args": { "External id": 20679, "cbid": 211, "correlation": 20679 } }, { "ph": "s", "id": 20679, "pid": 76337, "tid": -914061504, "ts": 1716454217608186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608262, "dur": 1, "args": { "External id": 20690, "cbid": 251, "correlation": 20690 } }, { "ph": "f", "id": 20690, "pid": 76337, "tid": -914061504, "ts": 1716454217608262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217610079, "dur": 185, "args": { "External id": 20691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20691, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20691, "pid": 5, "tid": 7, "ts": 1716454217610079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608267, "dur": 11, "args": { "External id": 20691, "cbid": 211, "correlation": 20691 } }, { "ph": "s", "id": 20691, "pid": 76337, "tid": -914061504, "ts": 1716454217608267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608331, "dur": 1, "args": { "External id": 20702, "cbid": 251, "correlation": 20702 } }, { "ph": "f", "id": 20702, "pid": 76337, "tid": -914061504, "ts": 1716454217608331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217610266, "dur": 183, "args": { "External id": 20703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20703, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20703, "pid": 5, "tid": 7, "ts": 1716454217610266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608335, "dur": 11, "args": { "External id": 20703, "cbid": 211, "correlation": 20703 } }, { "ph": "s", "id": 20703, "pid": 76337, "tid": -914061504, "ts": 1716454217608335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217610450, "dur": 18204, "args": { "External id": 20724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20724, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20724, "pid": 5, "tid": 7, "ts": 1716454217610450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608440, "dur": 18, "args": { "External id": 20724, "cbid": 211, "correlation": 20724 } }, { "ph": "s", "id": 20724, "pid": 76337, "tid": -914061504, "ts": 1716454217608440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608559, "dur": 2, "args": { "External id": 20742, "cbid": 251, "correlation": 20742 } }, { "ph": "f", "id": 20742, "pid": 76337, "tid": -914061504, "ts": 1716454217608559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217628656, "dur": 199, "args": { "External id": 20744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20744, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20744, "pid": 5, "tid": 7, "ts": 1716454217628656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608566, "dur": 13, "args": { "External id": 20744, "cbid": 211, "correlation": 20744 } }, { "ph": "s", "id": 20744, "pid": 76337, "tid": -914061504, "ts": 1716454217608566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217628855, "dur": 66, "args": { "External id": 20752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20752, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20752, "pid": 5, "tid": 7, "ts": 1716454217628855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608636, "dur": 13, "args": { "External id": 20752, "cbid": 211, "correlation": 20752 } }, { "ph": "s", "id": 20752, "pid": 76337, "tid": -914061504, "ts": 1716454217608636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217628923, "dur": 98, "args": { "External id": 20760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20760, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20760, "pid": 5, "tid": 7, "ts": 1716454217628923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608676, "dur": 9, "args": { "External id": 20760, "cbid": 211, "correlation": 20760 } }, { "ph": "s", "id": 20760, "pid": 76337, "tid": -914061504, "ts": 1716454217608676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217629022, "dur": 54, "args": { "External id": 20771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20771, "pid": 5, "tid": 7, "ts": 1716454217629022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608760, "dur": 13, "args": { "External id": 20771, "cbid": 211, "correlation": 20771 } }, { "ph": "s", "id": 20771, "pid": 76337, "tid": -914061504, "ts": 1716454217608760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217629077, "dur": 90, "args": { "External id": 20793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20793, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20793, "pid": 5, "tid": 7, "ts": 1716454217629077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608792, "dur": 8, "args": { "External id": 20793, "cbid": 211, "correlation": 20793 } }, { "ph": "s", "id": 20793, "pid": 76337, "tid": -914061504, "ts": 1716454217608792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608889, "dur": 1, "args": { "External id": 20804, "cbid": 251, "correlation": 20804 } }, { "ph": "f", "id": 20804, "pid": 76337, "tid": -914061504, "ts": 1716454217608889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217629169, "dur": 103, "args": { "External id": 20805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20805, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20805, "pid": 5, "tid": 7, "ts": 1716454217629169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217608894, "dur": 13, "args": { "External id": 20805, "cbid": 211, "correlation": 20805 } }, { "ph": "s", "id": 20805, "pid": 76337, "tid": -914061504, "ts": 1716454217608894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608986, "dur": 10, "args": { "External id": 20816, "cbid": 251, "correlation": 20816 } }, { "ph": "f", "id": 20816, "pid": 76337, "tid": -914061504, "ts": 1716454217608986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217608999, "dur": 0, "args": { "External id": 20817, "cbid": 251, "correlation": 20817 } }, { "ph": "f", "id": 20817, "pid": 76337, "tid": -914061504, "ts": 1716454217608999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217629273, "dur": 10, "args": { "External id": 20818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20818, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 20818, "pid": 5, "tid": 7, "ts": 1716454217629273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609002, "dur": 23, "args": { "External id": 20818, "cbid": 211, "correlation": 20818 } }, { "ph": "s", "id": 20818, "pid": 76337, "tid": -914061504, "ts": 1716454217609002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217629284, "dur": 5, "args": { "External id": 20820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20820, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 20820, "pid": 5, "tid": 7, "ts": 1716454217629284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609028, "dur": 9, "args": { "External id": 20820, "cbid": 211, "correlation": 20820 } }, { "ph": "s", "id": 20820, "pid": 76337, "tid": -914061504, "ts": 1716454217609028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609106, "dur": 1, "args": { "External id": 20831, "cbid": 251, "correlation": 20831 } }, { "ph": "f", "id": 20831, "pid": 76337, "tid": -914061504, "ts": 1716454217609106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609110, "dur": 0, "args": { "External id": 20832, "cbid": 251, "correlation": 20832 } }, { "ph": "f", "id": 20832, "pid": 76337, "tid": -914061504, "ts": 1716454217609110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217629291, "dur": 6, "args": { "External id": 20833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20833, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 20833, "pid": 5, "tid": 7, "ts": 1716454217629291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609111, "dur": 13, "args": { "External id": 20833, "cbid": 211, "correlation": 20833 } }, { "ph": "s", "id": 20833, "pid": 76337, "tid": -914061504, "ts": 1716454217609111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217629298, "dur": 3, "args": { "External id": 20835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20835, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 20835, "pid": 5, "tid": 7, "ts": 1716454217629298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609126, "dur": 6, "args": { "External id": 20835, "cbid": 211, "correlation": 20835 } }, { "ph": "s", "id": 20835, "pid": 76337, "tid": -914061504, "ts": 1716454217609126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217629303, "dur": 152, "args": { "External id": 20856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20856, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20856, "pid": 5, "tid": 7, "ts": 1716454217629303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609202, "dur": 13, "args": { "External id": 20856, "cbid": 211, "correlation": 20856 } }, { "ph": "s", "id": 20856, "pid": 76337, "tid": -914061504, "ts": 1716454217609202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609312, "dur": 2, "args": { "External id": 20874, "cbid": 251, "correlation": 20874 } }, { "ph": "f", "id": 20874, "pid": 76337, "tid": -914061504, "ts": 1716454217609312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217629457, "dur": 105, "args": { "External id": 20876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20876, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 20876, "pid": 5, "tid": 7, "ts": 1716454217629457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609319, "dur": 15, "args": { "External id": 20876, "cbid": 211, "correlation": 20876 } }, { "ph": "s", "id": 20876, "pid": 76337, "tid": -914061504, "ts": 1716454217609319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217629564, "dur": 34, "args": { "External id": 20884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20884, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20884, "pid": 5, "tid": 7, "ts": 1716454217629564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609392, "dur": 12, "args": { "External id": 20884, "cbid": 211, "correlation": 20884 } }, { "ph": "s", "id": 20884, "pid": 76337, "tid": -914061504, "ts": 1716454217609392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217629599, "dur": 68, "args": { "External id": 20892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20892, "pid": 5, "tid": 7, "ts": 1716454217629599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609434, "dur": 10, "args": { "External id": 20892, "cbid": 211, "correlation": 20892 } }, { "ph": "s", "id": 20892, "pid": 76337, "tid": -914061504, "ts": 1716454217609434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217629669, "dur": 90, "args": { "External id": 20914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20914, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20914, "pid": 5, "tid": 7, "ts": 1716454217629669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609499, "dur": 12, "args": { "External id": 20914, "cbid": 211, "correlation": 20914 } }, { "ph": "s", "id": 20914, "pid": 76337, "tid": -914061504, "ts": 1716454217609499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609598, "dur": 1, "args": { "External id": 20930, "cbid": 251, "correlation": 20930 } }, { "ph": "f", "id": 20930, "pid": 76337, "tid": -914061504, "ts": 1716454217609598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217629761, "dur": 563, "args": { "External id": 20932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20932, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 20932, "pid": 5, "tid": 7, "ts": 1716454217629761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609604, "dur": 13, "args": { "External id": 20932, "cbid": 211, "correlation": 20932 } }, { "ph": "s", "id": 20932, "pid": 76337, "tid": -914061504, "ts": 1716454217609604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217630325, "dur": 239, "args": { "External id": 20940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20940, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20940, "pid": 5, "tid": 7, "ts": 1716454217630325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609691, "dur": 14, "args": { "External id": 20940, "cbid": 211, "correlation": 20940 } }, { "ph": "s", "id": 20940, "pid": 76337, "tid": -914061504, "ts": 1716454217609691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217630565, "dur": 250, "args": { "External id": 20948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20948, "pid": 5, "tid": 7, "ts": 1716454217630565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609729, "dur": 8, "args": { "External id": 20948, "cbid": 211, "correlation": 20948 } }, { "ph": "s", "id": 20948, "pid": 76337, "tid": -914061504, "ts": 1716454217609729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609810, "dur": 2, "args": { "External id": 20964, "cbid": 251, "correlation": 20964 } }, { "ph": "f", "id": 20964, "pid": 76337, "tid": -914061504, "ts": 1716454217609810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217609815, "dur": 0, "args": { "External id": 20966, "cbid": 251, "correlation": 20966 } }, { "ph": "f", "id": 20966, "pid": 76337, "tid": -914061504, "ts": 1716454217609815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217630817, "dur": 354, "args": { "External id": 20967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20967, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 20967, "pid": 5, "tid": 7, "ts": 1716454217630817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609818, "dur": 14, "args": { "External id": 20967, "cbid": 211, "correlation": 20967 } }, { "ph": "s", "id": 20967, "pid": 76337, "tid": -914061504, "ts": 1716454217609818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217631172, "dur": 50, "args": { "External id": 20975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20975, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20975, "pid": 5, "tid": 7, "ts": 1716454217631172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609861, "dur": 10, "args": { "External id": 20975, "cbid": 211, "correlation": 20975 } }, { "ph": "s", "id": 20975, "pid": 76337, "tid": -914061504, "ts": 1716454217609861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217631223, "dur": 156, "args": { "External id": 20986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 20986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 20986, "pid": 5, "tid": 7, "ts": 1716454217631223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217609929, "dur": 13, "args": { "External id": 20986, "cbid": 211, "correlation": 20986 } }, { "ph": "s", "id": 20986, "pid": 76337, "tid": -914061504, "ts": 1716454217609929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217610004, "dur": 0, "args": { "External id": 20998, "cbid": 317, "correlation": 20998 } }, { "ph": "f", "id": 20998, "pid": 76337, "tid": -914061504, "ts": 1716454217610004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217610005, "dur": 0, "args": { "External id": 20999, "cbid": 203, "correlation": 20999 } }, { "ph": "f", "id": 20999, "pid": 76337, "tid": -914061504, "ts": 1716454217610005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217610005, "dur": 0, "args": { "External id": 21000, "cbid": 205, "correlation": 21000 } }, { "ph": "f", "id": 21000, "pid": 76337, "tid": -914061504, "ts": 1716454217610005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610029, "dur": 1, "args": { "External id": 21004, "cbid": 251, "correlation": 21004 } }, { "ph": "f", "id": 21004, "pid": 76337, "tid": -914061504, "ts": 1716454217610029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610031, "dur": 0, "args": { "External id": 21005, "cbid": 251, "correlation": 21005 } }, { "ph": "f", "id": 21005, "pid": 76337, "tid": -914061504, "ts": 1716454217610031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610032, "dur": 0, "args": { "External id": 21006, "cbid": 251, "correlation": 21006 } }, { "ph": "f", "id": 21006, "pid": 76337, "tid": -914061504, "ts": 1716454217610032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610032, "dur": 0, "args": { "External id": 21007, "cbid": 251, "correlation": 21007 } }, { "ph": "f", "id": 21007, "pid": 76337, "tid": -914061504, "ts": 1716454217610032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610033, "dur": 0, "args": { "External id": 21008, "cbid": 251, "correlation": 21008 } }, { "ph": "f", "id": 21008, "pid": 76337, "tid": -914061504, "ts": 1716454217610033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610034, "dur": 0, "args": { "External id": 21009, "cbid": 251, "correlation": 21009 } }, { "ph": "f", "id": 21009, "pid": 76337, "tid": -914061504, "ts": 1716454217610034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610035, "dur": 0, "args": { "External id": 21010, "cbid": 251, "correlation": 21010 } }, { "ph": "f", "id": 21010, "pid": 76337, "tid": -914061504, "ts": 1716454217610035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610035, "dur": 0, "args": { "External id": 21011, "cbid": 251, "correlation": 21011 } }, { "ph": "f", "id": 21011, "pid": 76337, "tid": -914061504, "ts": 1716454217610035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610037, "dur": 0, "args": { "External id": 21012, "cbid": 251, "correlation": 21012 } }, { "ph": "f", "id": 21012, "pid": 76337, "tid": -914061504, "ts": 1716454217610037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217631380, "dur": 114, "args": { "External id": 21013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21013, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 21013, "pid": 5, "tid": 7, "ts": 1716454217631380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610039, "dur": 13, "args": { "External id": 21013, "cbid": 211, "correlation": 21013 } }, { "ph": "s", "id": 21013, "pid": 76337, "tid": -914061504, "ts": 1716454217610039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217631496, "dur": 59, "args": { "External id": 21019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21019, "pid": 5, "tid": 7, "ts": 1716454217631496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610074, "dur": 9, "args": { "External id": 21019, "cbid": 211, "correlation": 21019 } }, { "ph": "s", "id": 21019, "pid": 76337, "tid": -914061504, "ts": 1716454217610074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217631556, "dur": 50, "args": { "External id": 21027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21027, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21027, "pid": 5, "tid": 7, "ts": 1716454217631556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610106, "dur": 9, "args": { "External id": 21027, "cbid": 211, "correlation": 21027 } }, { "ph": "s", "id": 21027, "pid": 76337, "tid": -914061504, "ts": 1716454217610106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217610184, "dur": 0, "args": { "External id": 21037, "cbid": 317, "correlation": 21037 } }, { "ph": "f", "id": 21037, "pid": 76337, "tid": -914061504, "ts": 1716454217610184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217610184, "dur": 0, "args": { "External id": 21038, "cbid": 203, "correlation": 21038 } }, { "ph": "f", "id": 21038, "pid": 76337, "tid": -914061504, "ts": 1716454217610184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217610185, "dur": 0, "args": { "External id": 21039, "cbid": 205, "correlation": 21039 } }, { "ph": "f", "id": 21039, "pid": 76337, "tid": -914061504, "ts": 1716454217610185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217631607, "dur": 40, "args": { "External id": 21043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21043, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21043, "pid": 5, "tid": 7, "ts": 1716454217631607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610202, "dur": 12, "args": { "External id": 21043, "cbid": 211, "correlation": 21043 } }, { "ph": "s", "id": 21043, "pid": 76337, "tid": -914061504, "ts": 1716454217610202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217631649, "dur": 14, "args": { "External id": 21045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21045, "pid": 5, "tid": 7, "ts": 1716454217631649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610217, "dur": 5, "args": { "External id": 21045, "cbid": 211, "correlation": 21045 } }, { "ph": "s", "id": 21045, "pid": 76337, "tid": -914061504, "ts": 1716454217610217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217631665, "dur": 1, "args": { "External id": 21047, "device": 5, "context": 1, "stream": 7, "correlation": 21047, "bytes": 1536, "memory bandwidth (GB/s)": 0.7868852459016393 } }, { "ph": "f", "id": 21047, "pid": 5, "tid": 7, "ts": 1716454217631665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217610236, "dur": 27, "args": { "External id": 21047, "cbid": 51, "correlation": 21047 } }, { "ph": "s", "id": 21047, "pid": 76337, "tid": -914061504, "ts": 1716454217610236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217631669, "dur": 355, "args": { "External id": 21048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21048, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21048, "pid": 5, "tid": 7, "ts": 1716454217631669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610265, "dur": 11, "args": { "External id": 21048, "cbid": 211, "correlation": 21048 } }, { "ph": "s", "id": 21048, "pid": 76337, "tid": -914061504, "ts": 1716454217610265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632026, "dur": 13, "args": { "External id": 21050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21050, "pid": 5, "tid": 7, "ts": 1716454217632026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610289, "dur": 8, "args": { "External id": 21050, "cbid": 211, "correlation": 21050 } }, { "ph": "s", "id": 21050, "pid": 76337, "tid": -914061504, "ts": 1716454217610289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217632040, "dur": 14, "args": { "External id": 21056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21056, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21056, "pid": 5, "tid": 7, "ts": 1716454217632040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610322, "dur": 9, "args": { "External id": 21056, "cbid": 211, "correlation": 21056 } }, { "ph": "s", "id": 21056, "pid": 76337, "tid": -914061504, "ts": 1716454217610322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217632055, "dur": 18, "args": { "External id": 21076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21076, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21076, "pid": 5, "tid": 7, "ts": 1716454217632055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610435, "dur": 14, "args": { "External id": 21076, "cbid": 211, "correlation": 21076 } }, { "ph": "s", "id": 21076, "pid": 76337, "tid": -914061504, "ts": 1716454217610435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217632075, "dur": 4, "args": { "External id": 21088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21088, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 21088, "pid": 5, "tid": 7, "ts": 1716454217632075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610468, "dur": 8, "args": { "External id": 21088, "cbid": 211, "correlation": 21088 } }, { "ph": "s", "id": 21088, "pid": 76337, "tid": -914061504, "ts": 1716454217610468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217632080, "dur": 17, "args": { "External id": 21091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21091, "pid": 5, "tid": 7, "ts": 1716454217632080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610488, "dur": 7, "args": { "External id": 21091, "cbid": 211, "correlation": 21091 } }, { "ph": "s", "id": 21091, "pid": 76337, "tid": -914061504, "ts": 1716454217610488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217632099, "dur": 12, "args": { "External id": 21100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21100, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21100, "pid": 5, "tid": 7, "ts": 1716454217632099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610530, "dur": 10, "args": { "External id": 21100, "cbid": 211, "correlation": 21100 } }, { "ph": "s", "id": 21100, "pid": 76337, "tid": -914061504, "ts": 1716454217610530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217610595, "dur": 0, "args": { "External id": 21110, "cbid": 317, "correlation": 21110 } }, { "ph": "f", "id": 21110, "pid": 76337, "tid": -914061504, "ts": 1716454217610595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217610596, "dur": 0, "args": { "External id": 21111, "cbid": 203, "correlation": 21111 } }, { "ph": "f", "id": 21111, "pid": 76337, "tid": -914061504, "ts": 1716454217610596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217610597, "dur": 0, "args": { "External id": 21112, "cbid": 205, "correlation": 21112 } }, { "ph": "f", "id": 21112, "pid": 76337, "tid": -914061504, "ts": 1716454217610597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632112, "dur": 11, "args": { "External id": 21116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21116, "pid": 5, "tid": 7, "ts": 1716454217632112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610612, "dur": 13, "args": { "External id": 21116, "cbid": 211, "correlation": 21116 } }, { "ph": "s", "id": 21116, "pid": 76337, "tid": -914061504, "ts": 1716454217610612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632124, "dur": 24, "args": { "External id": 21118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21118, "pid": 5, "tid": 7, "ts": 1716454217632124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610627, "dur": 5, "args": { "External id": 21118, "cbid": 211, "correlation": 21118 } }, { "ph": "s", "id": 21118, "pid": 76337, "tid": -914061504, "ts": 1716454217610627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217632150, "dur": 4, "args": { "External id": 21120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 21120, "pid": 5, "tid": 7, "ts": 1716454217632150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610639, "dur": 6, "args": { "External id": 21120, "cbid": 211, "correlation": 21120 } }, { "ph": "s", "id": 21120, "pid": 76337, "tid": -914061504, "ts": 1716454217610639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217610649, "dur": 0, "args": { "External id": 21121, "cbid": 51, "correlation": 21121 } }, { "ph": "s", "id": 21121, "pid": 76337, "tid": -914061504, "ts": 1716454217610649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217632155, "dur": 352, "args": { "External id": 21122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21122, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21122, "pid": 5, "tid": 7, "ts": 1716454217632155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610650, "dur": 16, "args": { "External id": 21122, "cbid": 211, "correlation": 21122 } }, { "ph": "s", "id": 21122, "pid": 76337, "tid": -914061504, "ts": 1716454217610650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632508, "dur": 20, "args": { "External id": 21123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21123, "pid": 5, "tid": 7, "ts": 1716454217632508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610670, "dur": 6, "args": { "External id": 21123, "cbid": 211, "correlation": 21123 } }, { "ph": "s", "id": 21123, "pid": 76337, "tid": -914061504, "ts": 1716454217610670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217632529, "dur": 32, "args": { "External id": 21129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21129, "pid": 5, "tid": 7, "ts": 1716454217632529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610700, "dur": 9, "args": { "External id": 21129, "cbid": 211, "correlation": 21129 } }, { "ph": "s", "id": 21129, "pid": 76337, "tid": -914061504, "ts": 1716454217610700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217632563, "dur": 3, "args": { "External id": 21137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21137, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 21137, "pid": 5, "tid": 7, "ts": 1716454217632563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610744, "dur": 9, "args": { "External id": 21137, "cbid": 211, "correlation": 21137 } }, { "ph": "s", "id": 21137, "pid": 76337, "tid": -914061504, "ts": 1716454217610744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610812, "dur": 1, "args": { "External id": 21153, "cbid": 251, "correlation": 21153 } }, { "ph": "f", "id": 21153, "pid": 76337, "tid": -914061504, "ts": 1716454217610812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217610817, "dur": 0, "args": { "External id": 21155, "cbid": 251, "correlation": 21155 } }, { "ph": "f", "id": 21155, "pid": 76337, "tid": -914061504, "ts": 1716454217610817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217632567, "dur": 12, "args": { "External id": 21156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21156, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 21156, "pid": 5, "tid": 7, "ts": 1716454217632567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610819, "dur": 11, "args": { "External id": 21156, "cbid": 211, "correlation": 21156 } }, { "ph": "s", "id": 21156, "pid": 76337, "tid": -914061504, "ts": 1716454217610819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217632581, "dur": 5, "args": { "External id": 21158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21158, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 21158, "pid": 5, "tid": 7, "ts": 1716454217632581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610833, "dur": 6, "args": { "External id": 21158, "cbid": 211, "correlation": 21158 } }, { "ph": "s", "id": 21158, "pid": 76337, "tid": -914061504, "ts": 1716454217610833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217632587, "dur": 29, "args": { "External id": 21168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21168, "pid": 5, "tid": 7, "ts": 1716454217632587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610893, "dur": 13, "args": { "External id": 21168, "cbid": 211, "correlation": 21168 } }, { "ph": "s", "id": 21168, "pid": 76337, "tid": -914061504, "ts": 1716454217610893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217632618, "dur": 31, "args": { "External id": 21188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21188, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21188, "pid": 5, "tid": 7, "ts": 1716454217632618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610959, "dur": 11, "args": { "External id": 21188, "cbid": 211, "correlation": 21188 } }, { "ph": "s", "id": 21188, "pid": 76337, "tid": -914061504, "ts": 1716454217610959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217632649, "dur": 4, "args": { "External id": 21200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21200, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 21200, "pid": 5, "tid": 7, "ts": 1716454217632649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217610988, "dur": 7, "args": { "External id": 21200, "cbid": 211, "correlation": 21200 } }, { "ph": "s", "id": 21200, "pid": 76337, "tid": -914061504, "ts": 1716454217610988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217632655, "dur": 30, "args": { "External id": 21203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21203, "pid": 5, "tid": 7, "ts": 1716454217632655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611008, "dur": 6, "args": { "External id": 21203, "cbid": 211, "correlation": 21203 } }, { "ph": "s", "id": 21203, "pid": 76337, "tid": -914061504, "ts": 1716454217611008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217632686, "dur": 20, "args": { "External id": 21212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21212, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21212, "pid": 5, "tid": 7, "ts": 1716454217632686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611049, "dur": 10, "args": { "External id": 21212, "cbid": 211, "correlation": 21212 } }, { "ph": "s", "id": 21212, "pid": 76337, "tid": -914061504, "ts": 1716454217611049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217611126, "dur": 0, "args": { "External id": 21222, "cbid": 317, "correlation": 21222 } }, { "ph": "f", "id": 21222, "pid": 76337, "tid": -914061504, "ts": 1716454217611126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217611127, "dur": 0, "args": { "External id": 21223, "cbid": 203, "correlation": 21223 } }, { "ph": "f", "id": 21223, "pid": 76337, "tid": -914061504, "ts": 1716454217611127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217611128, "dur": 0, "args": { "External id": 21224, "cbid": 205, "correlation": 21224 } }, { "ph": "f", "id": 21224, "pid": 76337, "tid": -914061504, "ts": 1716454217611128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632708, "dur": 22, "args": { "External id": 21228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21228, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21228, "pid": 5, "tid": 7, "ts": 1716454217632708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611147, "dur": 13, "args": { "External id": 21228, "cbid": 211, "correlation": 21228 } }, { "ph": "s", "id": 21228, "pid": 76337, "tid": -914061504, "ts": 1716454217611147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217632731, "dur": 43, "args": { "External id": 21230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21230, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21230, "pid": 5, "tid": 7, "ts": 1716454217632731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611162, "dur": 5, "args": { "External id": 21230, "cbid": 211, "correlation": 21230 } }, { "ph": "s", "id": 21230, "pid": 76337, "tid": -914061504, "ts": 1716454217611162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217632775, "dur": 643, "args": { "External id": 21232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21232, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21232, "pid": 5, "tid": 7, "ts": 1716454217632775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611177, "dur": 23, "args": { "External id": 21232, "cbid": 211, "correlation": 21232 } }, { "ph": "s", "id": 21232, "pid": 76337, "tid": -914061504, "ts": 1716454217611177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217633420, "dur": 22, "args": { "External id": 21234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21234, "pid": 5, "tid": 7, "ts": 1716454217633420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611205, "dur": 5, "args": { "External id": 21234, "cbid": 211, "correlation": 21234 } }, { "ph": "s", "id": 21234, "pid": 76337, "tid": -914061504, "ts": 1716454217611205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217633443, "dur": 32, "args": { "External id": 21240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21240, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21240, "pid": 5, "tid": 7, "ts": 1716454217633443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611234, "dur": 9, "args": { "External id": 21240, "cbid": 211, "correlation": 21240 } }, { "ph": "s", "id": 21240, "pid": 76337, "tid": -914061504, "ts": 1716454217611234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217611295, "dur": 0, "args": { "External id": 21250, "cbid": 317, "correlation": 21250 } }, { "ph": "f", "id": 21250, "pid": 76337, "tid": -914061504, "ts": 1716454217611295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217611296, "dur": 0, "args": { "External id": 21251, "cbid": 203, "correlation": 21251 } }, { "ph": "f", "id": 21251, "pid": 76337, "tid": -914061504, "ts": 1716454217611296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217611297, "dur": 0, "args": { "External id": 21252, "cbid": 205, "correlation": 21252 } }, { "ph": "f", "id": 21252, "pid": 76337, "tid": -914061504, "ts": 1716454217611297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611320, "dur": 1, "args": { "External id": 21256, "cbid": 251, "correlation": 21256 } }, { "ph": "f", "id": 21256, "pid": 76337, "tid": -914061504, "ts": 1716454217611320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611322, "dur": 0, "args": { "External id": 21257, "cbid": 251, "correlation": 21257 } }, { "ph": "f", "id": 21257, "pid": 76337, "tid": -914061504, "ts": 1716454217611322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611322, "dur": 0, "args": { "External id": 21258, "cbid": 251, "correlation": 21258 } }, { "ph": "f", "id": 21258, "pid": 76337, "tid": -914061504, "ts": 1716454217611322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611323, "dur": 0, "args": { "External id": 21259, "cbid": 251, "correlation": 21259 } }, { "ph": "f", "id": 21259, "pid": 76337, "tid": -914061504, "ts": 1716454217611323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611324, "dur": 0, "args": { "External id": 21260, "cbid": 251, "correlation": 21260 } }, { "ph": "f", "id": 21260, "pid": 76337, "tid": -914061504, "ts": 1716454217611324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611324, "dur": 0, "args": { "External id": 21261, "cbid": 251, "correlation": 21261 } }, { "ph": "f", "id": 21261, "pid": 76337, "tid": -914061504, "ts": 1716454217611324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611325, "dur": 0, "args": { "External id": 21262, "cbid": 251, "correlation": 21262 } }, { "ph": "f", "id": 21262, "pid": 76337, "tid": -914061504, "ts": 1716454217611325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611326, "dur": 0, "args": { "External id": 21263, "cbid": 251, "correlation": 21263 } }, { "ph": "f", "id": 21263, "pid": 76337, "tid": -914061504, "ts": 1716454217611326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611327, "dur": 0, "args": { "External id": 21264, "cbid": 251, "correlation": 21264 } }, { "ph": "f", "id": 21264, "pid": 76337, "tid": -914061504, "ts": 1716454217611327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217633477, "dur": 51, "args": { "External id": 21265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21265, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 21265, "pid": 5, "tid": 7, "ts": 1716454217633477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611329, "dur": 12, "args": { "External id": 21265, "cbid": 211, "correlation": 21265 } }, { "ph": "s", "id": 21265, "pid": 76337, "tid": -914061504, "ts": 1716454217611329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217633529, "dur": 32, "args": { "External id": 21271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21271, "pid": 5, "tid": 7, "ts": 1716454217633529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611362, "dur": 9, "args": { "External id": 21271, "cbid": 211, "correlation": 21271 } }, { "ph": "s", "id": 21271, "pid": 76337, "tid": -914061504, "ts": 1716454217611362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217633562, "dur": 27, "args": { "External id": 21279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21279, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21279, "pid": 5, "tid": 7, "ts": 1716454217633562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611391, "dur": 8, "args": { "External id": 21279, "cbid": 211, "correlation": 21279 } }, { "ph": "s", "id": 21279, "pid": 76337, "tid": -914061504, "ts": 1716454217611391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217633591, "dur": 20, "args": { "External id": 21287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21287, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21287, "pid": 5, "tid": 7, "ts": 1716454217633591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611421, "dur": 9, "args": { "External id": 21287, "cbid": 211, "correlation": 21287 } }, { "ph": "s", "id": 21287, "pid": 76337, "tid": -914061504, "ts": 1716454217611421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217633612, "dur": 29, "args": { "External id": 21307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21307, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21307, "pid": 5, "tid": 7, "ts": 1716454217633612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611503, "dur": 12, "args": { "External id": 21307, "cbid": 211, "correlation": 21307 } }, { "ph": "s", "id": 21307, "pid": 76337, "tid": -914061504, "ts": 1716454217611503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217633642, "dur": 4, "args": { "External id": 21319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21319, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 21319, "pid": 5, "tid": 7, "ts": 1716454217633642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611524, "dur": 6, "args": { "External id": 21319, "cbid": 211, "correlation": 21319 } }, { "ph": "s", "id": 21319, "pid": 76337, "tid": -914061504, "ts": 1716454217611524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217633648, "dur": 29, "args": { "External id": 21322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21322, "pid": 5, "tid": 7, "ts": 1716454217633648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611543, "dur": 6, "args": { "External id": 21322, "cbid": 211, "correlation": 21322 } }, { "ph": "s", "id": 21322, "pid": 76337, "tid": -914061504, "ts": 1716454217611543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217611611, "dur": 0, "args": { "External id": 21333, "cbid": 317, "correlation": 21333 } }, { "ph": "f", "id": 21333, "pid": 76337, "tid": -914061504, "ts": 1716454217611611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217611612, "dur": 0, "args": { "External id": 21334, "cbid": 203, "correlation": 21334 } }, { "ph": "f", "id": 21334, "pid": 76337, "tid": -914061504, "ts": 1716454217611612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217611613, "dur": 0, "args": { "External id": 21335, "cbid": 205, "correlation": 21335 } }, { "ph": "f", "id": 21335, "pid": 76337, "tid": -914061504, "ts": 1716454217611613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217633678, "dur": 21, "args": { "External id": 21339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21339, "pid": 5, "tid": 7, "ts": 1716454217633678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611635, "dur": 13, "args": { "External id": 21339, "cbid": 211, "correlation": 21339 } }, { "ph": "s", "id": 21339, "pid": 76337, "tid": -914061504, "ts": 1716454217611635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217633701, "dur": 118, "args": { "External id": 21341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21341, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21341, "pid": 5, "tid": 7, "ts": 1716454217633701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611662, "dur": 26, "args": { "External id": 21341, "cbid": 211, "correlation": 21341 } }, { "ph": "s", "id": 21341, "pid": 76337, "tid": -914061504, "ts": 1716454217611662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217633820, "dur": 21, "args": { "External id": 21343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21343, "pid": 5, "tid": 7, "ts": 1716454217633820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611692, "dur": 6, "args": { "External id": 21343, "cbid": 211, "correlation": 21343 } }, { "ph": "s", "id": 21343, "pid": 76337, "tid": -914061504, "ts": 1716454217611692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217633842, "dur": 32, "args": { "External id": 21349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21349, "pid": 5, "tid": 7, "ts": 1716454217633842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611723, "dur": 8, "args": { "External id": 21349, "cbid": 211, "correlation": 21349 } }, { "ph": "s", "id": 21349, "pid": 76337, "tid": -914061504, "ts": 1716454217611723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217633876, "dur": 177, "args": { "External id": 21358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21358, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21358, "pid": 5, "tid": 7, "ts": 1716454217633876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611809, "dur": 15, "args": { "External id": 21358, "cbid": 211, "correlation": 21358 } }, { "ph": "s", "id": 21358, "pid": 76337, "tid": -914061504, "ts": 1716454217611809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217634054, "dur": 64, "args": { "External id": 21380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21380, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21380, "pid": 5, "tid": 7, "ts": 1716454217634054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611868, "dur": 10, "args": { "External id": 21380, "cbid": 211, "correlation": 21380 } }, { "ph": "s", "id": 21380, "pid": 76337, "tid": -914061504, "ts": 1716454217611868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217611968, "dur": 1, "args": { "External id": 21391, "cbid": 251, "correlation": 21391 } }, { "ph": "f", "id": 21391, "pid": 76337, "tid": -914061504, "ts": 1716454217611968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217634119, "dur": 152, "args": { "External id": 21392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21392, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21392, "pid": 5, "tid": 7, "ts": 1716454217634119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217611982, "dur": 15, "args": { "External id": 21392, "cbid": 211, "correlation": 21392 } }, { "ph": "s", "id": 21392, "pid": 76337, "tid": -914061504, "ts": 1716454217611982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612058, "dur": 1, "args": { "External id": 21403, "cbid": 251, "correlation": 21403 } }, { "ph": "f", "id": 21403, "pid": 76337, "tid": -914061504, "ts": 1716454217612058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217634272, "dur": 143, "args": { "External id": 21404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21404, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21404, "pid": 5, "tid": 7, "ts": 1716454217634272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612063, "dur": 11, "args": { "External id": 21404, "cbid": 211, "correlation": 21404 } }, { "ph": "s", "id": 21404, "pid": 76337, "tid": -914061504, "ts": 1716454217612063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612126, "dur": 1, "args": { "External id": 21415, "cbid": 251, "correlation": 21415 } }, { "ph": "f", "id": 21415, "pid": 76337, "tid": -914061504, "ts": 1716454217612126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217634417, "dur": 147, "args": { "External id": 21416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21416, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21416, "pid": 5, "tid": 7, "ts": 1716454217634417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612130, "dur": 11, "args": { "External id": 21416, "cbid": 211, "correlation": 21416 } }, { "ph": "s", "id": 21416, "pid": 76337, "tid": -914061504, "ts": 1716454217612130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217634565, "dur": 1909, "args": { "External id": 21437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21437, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 21437, "pid": 5, "tid": 7, "ts": 1716454217634565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612213, "dur": 14, "args": { "External id": 21437, "cbid": 211, "correlation": 21437 } }, { "ph": "s", "id": 21437, "pid": 76337, "tid": -914061504, "ts": 1716454217612213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612315, "dur": 1, "args": { "External id": 21455, "cbid": 251, "correlation": 21455 } }, { "ph": "f", "id": 21455, "pid": 76337, "tid": -914061504, "ts": 1716454217612315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217636475, "dur": 146, "args": { "External id": 21457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21457, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 21457, "pid": 5, "tid": 7, "ts": 1716454217636475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612320, "dur": 13, "args": { "External id": 21457, "cbid": 211, "correlation": 21457 } }, { "ph": "s", "id": 21457, "pid": 76337, "tid": -914061504, "ts": 1716454217612320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217636622, "dur": 35, "args": { "External id": 21465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21465, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21465, "pid": 5, "tid": 7, "ts": 1716454217636622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612393, "dur": 12, "args": { "External id": 21465, "cbid": 211, "correlation": 21465 } }, { "ph": "s", "id": 21465, "pid": 76337, "tid": -914061504, "ts": 1716454217612393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217636659, "dur": 51, "args": { "External id": 21473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21473, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21473, "pid": 5, "tid": 7, "ts": 1716454217636659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612432, "dur": 9, "args": { "External id": 21473, "cbid": 211, "correlation": 21473 } }, { "ph": "s", "id": 21473, "pid": 76337, "tid": -914061504, "ts": 1716454217612432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217636711, "dur": 30, "args": { "External id": 21484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21484, "pid": 5, "tid": 7, "ts": 1716454217636711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612506, "dur": 13, "args": { "External id": 21484, "cbid": 211, "correlation": 21484 } }, { "ph": "s", "id": 21484, "pid": 76337, "tid": -914061504, "ts": 1716454217612506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217636742, "dur": 34, "args": { "External id": 21506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21506, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21506, "pid": 5, "tid": 7, "ts": 1716454217636742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612538, "dur": 8, "args": { "External id": 21506, "cbid": 211, "correlation": 21506 } }, { "ph": "s", "id": 21506, "pid": 76337, "tid": -914061504, "ts": 1716454217612538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612628, "dur": 1, "args": { "External id": 21517, "cbid": 251, "correlation": 21517 } }, { "ph": "f", "id": 21517, "pid": 76337, "tid": -914061504, "ts": 1716454217612628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217636777, "dur": 88, "args": { "External id": 21518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21518, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21518, "pid": 5, "tid": 7, "ts": 1716454217636777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612633, "dur": 13, "args": { "External id": 21518, "cbid": 211, "correlation": 21518 } }, { "ph": "s", "id": 21518, "pid": 76337, "tid": -914061504, "ts": 1716454217612633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612702, "dur": 1, "args": { "External id": 21529, "cbid": 251, "correlation": 21529 } }, { "ph": "f", "id": 21529, "pid": 76337, "tid": -914061504, "ts": 1716454217612702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612706, "dur": 0, "args": { "External id": 21530, "cbid": 251, "correlation": 21530 } }, { "ph": "f", "id": 21530, "pid": 76337, "tid": -914061504, "ts": 1716454217612706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217636865, "dur": 11, "args": { "External id": 21531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21531, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 21531, "pid": 5, "tid": 7, "ts": 1716454217636865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612707, "dur": 12, "args": { "External id": 21531, "cbid": 211, "correlation": 21531 } }, { "ph": "s", "id": 21531, "pid": 76337, "tid": -914061504, "ts": 1716454217612707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217636878, "dur": 5, "args": { "External id": 21533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21533, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 21533, "pid": 5, "tid": 7, "ts": 1716454217636878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612722, "dur": 7, "args": { "External id": 21533, "cbid": 211, "correlation": 21533 } }, { "ph": "s", "id": 21533, "pid": 76337, "tid": -914061504, "ts": 1716454217612722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612781, "dur": 1, "args": { "External id": 21544, "cbid": 251, "correlation": 21544 } }, { "ph": "f", "id": 21544, "pid": 76337, "tid": -914061504, "ts": 1716454217612781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612784, "dur": 0, "args": { "External id": 21545, "cbid": 251, "correlation": 21545 } }, { "ph": "f", "id": 21545, "pid": 76337, "tid": -914061504, "ts": 1716454217612784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217636884, "dur": 7, "args": { "External id": 21546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21546, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 21546, "pid": 5, "tid": 7, "ts": 1716454217636884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612786, "dur": 11, "args": { "External id": 21546, "cbid": 211, "correlation": 21546 } }, { "ph": "s", "id": 21546, "pid": 76337, "tid": -914061504, "ts": 1716454217612786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217636892, "dur": 4, "args": { "External id": 21548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21548, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 21548, "pid": 5, "tid": 7, "ts": 1716454217636892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612799, "dur": 5, "args": { "External id": 21548, "cbid": 211, "correlation": 21548 } }, { "ph": "s", "id": 21548, "pid": 76337, "tid": -914061504, "ts": 1716454217612799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217636897, "dur": 90, "args": { "External id": 21569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21569, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 21569, "pid": 5, "tid": 7, "ts": 1716454217636897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217612873, "dur": 12, "args": { "External id": 21569, "cbid": 211, "correlation": 21569 } }, { "ph": "s", "id": 21569, "pid": 76337, "tid": -914061504, "ts": 1716454217612873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217612994, "dur": 1, "args": { "External id": 21587, "cbid": 251, "correlation": 21587 } }, { "ph": "f", "id": 21587, "pid": 76337, "tid": -914061504, "ts": 1716454217612994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217636989, "dur": 97, "args": { "External id": 21589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21589, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21589, "pid": 5, "tid": 7, "ts": 1716454217636989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613000, "dur": 14, "args": { "External id": 21589, "cbid": 211, "correlation": 21589 } }, { "ph": "s", "id": 21589, "pid": 76337, "tid": -914061504, "ts": 1716454217613000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217637087, "dur": 19, "args": { "External id": 21597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21597, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21597, "pid": 5, "tid": 7, "ts": 1716454217637087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613074, "dur": 13, "args": { "External id": 21597, "cbid": 211, "correlation": 21597 } }, { "ph": "s", "id": 21597, "pid": 76337, "tid": -914061504, "ts": 1716454217613074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217637107, "dur": 38, "args": { "External id": 21605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21605, "pid": 5, "tid": 7, "ts": 1716454217637107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613119, "dur": 9, "args": { "External id": 21605, "cbid": 211, "correlation": 21605 } }, { "ph": "s", "id": 21605, "pid": 76337, "tid": -914061504, "ts": 1716454217613119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217637147, "dur": 34, "args": { "External id": 21627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21627, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21627, "pid": 5, "tid": 7, "ts": 1716454217637147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613171, "dur": 10, "args": { "External id": 21627, "cbid": 211, "correlation": 21627 } }, { "ph": "s", "id": 21627, "pid": 76337, "tid": -914061504, "ts": 1716454217613171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217613274, "dur": 1, "args": { "External id": 21643, "cbid": 251, "correlation": 21643 } }, { "ph": "f", "id": 21643, "pid": 76337, "tid": -914061504, "ts": 1716454217613274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217613279, "dur": 0, "args": { "External id": 21645, "cbid": 251, "correlation": 21645 } }, { "ph": "f", "id": 21645, "pid": 76337, "tid": -914061504, "ts": 1716454217613279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217637182, "dur": 531, "args": { "External id": 21646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21646, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 21646, "pid": 5, "tid": 7, "ts": 1716454217637182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613282, "dur": 13, "args": { "External id": 21646, "cbid": 211, "correlation": 21646 } }, { "ph": "s", "id": 21646, "pid": 76337, "tid": -914061504, "ts": 1716454217613282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217637715, "dur": 124, "args": { "External id": 21654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21654, "pid": 5, "tid": 7, "ts": 1716454217637715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613352, "dur": 13, "args": { "External id": 21654, "cbid": 211, "correlation": 21654 } }, { "ph": "s", "id": 21654, "pid": 76337, "tid": -914061504, "ts": 1716454217613352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217637841, "dur": 127, "args": { "External id": 21662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21662, "pid": 5, "tid": 7, "ts": 1716454217637841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613383, "dur": 9, "args": { "External id": 21662, "cbid": 211, "correlation": 21662 } }, { "ph": "s", "id": 21662, "pid": 76337, "tid": -914061504, "ts": 1716454217613383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217613464, "dur": 1, "args": { "External id": 21678, "cbid": 251, "correlation": 21678 } }, { "ph": "f", "id": 21678, "pid": 76337, "tid": -914061504, "ts": 1716454217613464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217637969, "dur": 295, "args": { "External id": 21680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21680, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21680, "pid": 5, "tid": 7, "ts": 1716454217637969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613469, "dur": 12, "args": { "External id": 21680, "cbid": 211, "correlation": 21680 } }, { "ph": "s", "id": 21680, "pid": 76337, "tid": -914061504, "ts": 1716454217613469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217638266, "dur": 27, "args": { "External id": 21688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21688, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21688, "pid": 5, "tid": 7, "ts": 1716454217638266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613512, "dur": 9, "args": { "External id": 21688, "cbid": 211, "correlation": 21688 } }, { "ph": "s", "id": 21688, "pid": 76337, "tid": -914061504, "ts": 1716454217613512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217638294, "dur": 80, "args": { "External id": 21699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21699, "pid": 5, "tid": 7, "ts": 1716454217638294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613580, "dur": 12, "args": { "External id": 21699, "cbid": 211, "correlation": 21699 } }, { "ph": "s", "id": 21699, "pid": 76337, "tid": -914061504, "ts": 1716454217613580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217613656, "dur": 0, "args": { "External id": 21711, "cbid": 317, "correlation": 21711 } }, { "ph": "f", "id": 21711, "pid": 76337, "tid": -914061504, "ts": 1716454217613656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217613657, "dur": 0, "args": { "External id": 21712, "cbid": 203, "correlation": 21712 } }, { "ph": "f", "id": 21712, "pid": 76337, "tid": -914061504, "ts": 1716454217613657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217613658, "dur": 0, "args": { "External id": 21713, "cbid": 205, "correlation": 21713 } }, { "ph": "f", "id": 21713, "pid": 76337, "tid": -914061504, "ts": 1716454217613658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217638375, "dur": 23, "args": { "External id": 21717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21717, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21717, "pid": 5, "tid": 7, "ts": 1716454217638375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613682, "dur": 13, "args": { "External id": 21717, "cbid": 211, "correlation": 21717 } }, { "ph": "s", "id": 21717, "pid": 76337, "tid": -914061504, "ts": 1716454217613682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217638399, "dur": 119, "args": { "External id": 21719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21719, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21719, "pid": 5, "tid": 7, "ts": 1716454217638399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613701, "dur": 7, "args": { "External id": 21719, "cbid": 211, "correlation": 21719 } }, { "ph": "s", "id": 21719, "pid": 76337, "tid": -914061504, "ts": 1716454217613701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217638519, "dur": 22, "args": { "External id": 21721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21721, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21721, "pid": 5, "tid": 7, "ts": 1716454217638519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613712, "dur": 5, "args": { "External id": 21721, "cbid": 211, "correlation": 21721 } }, { "ph": "s", "id": 21721, "pid": 76337, "tid": -914061504, "ts": 1716454217613712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217638543, "dur": 32, "args": { "External id": 21727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21727, "pid": 5, "tid": 7, "ts": 1716454217638543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613741, "dur": 8, "args": { "External id": 21727, "cbid": 211, "correlation": 21727 } }, { "ph": "s", "id": 21727, "pid": 76337, "tid": -914061504, "ts": 1716454217613741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217638576, "dur": 27, "args": { "External id": 21735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21735, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21735, "pid": 5, "tid": 7, "ts": 1716454217638576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613773, "dur": 8, "args": { "External id": 21735, "cbid": 211, "correlation": 21735 } }, { "ph": "s", "id": 21735, "pid": 76337, "tid": -914061504, "ts": 1716454217613773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217638604, "dur": 31, "args": { "External id": 21755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21755, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21755, "pid": 5, "tid": 7, "ts": 1716454217638604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613846, "dur": 12, "args": { "External id": 21755, "cbid": 211, "correlation": 21755 } }, { "ph": "s", "id": 21755, "pid": 76337, "tid": -914061504, "ts": 1716454217613846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217638636, "dur": 5, "args": { "External id": 21767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21767, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 21767, "pid": 5, "tid": 7, "ts": 1716454217638636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613869, "dur": 6, "args": { "External id": 21767, "cbid": 211, "correlation": 21767 } }, { "ph": "s", "id": 21767, "pid": 76337, "tid": -914061504, "ts": 1716454217613869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217638642, "dur": 31, "args": { "External id": 21770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21770, "pid": 5, "tid": 7, "ts": 1716454217638642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613886, "dur": 6, "args": { "External id": 21770, "cbid": 211, "correlation": 21770 } }, { "ph": "s", "id": 21770, "pid": 76337, "tid": -914061504, "ts": 1716454217613886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217638675, "dur": 21, "args": { "External id": 21779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21779, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21779, "pid": 5, "tid": 7, "ts": 1716454217638675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217613926, "dur": 9, "args": { "External id": 21779, "cbid": 211, "correlation": 21779 } }, { "ph": "s", "id": 21779, "pid": 76337, "tid": -914061504, "ts": 1716454217613926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217613985, "dur": 0, "args": { "External id": 21789, "cbid": 317, "correlation": 21789 } }, { "ph": "f", "id": 21789, "pid": 76337, "tid": -914061504, "ts": 1716454217613985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217613986, "dur": 0, "args": { "External id": 21790, "cbid": 203, "correlation": 21790 } }, { "ph": "f", "id": 21790, "pid": 76337, "tid": -914061504, "ts": 1716454217613986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217613987, "dur": 0, "args": { "External id": 21791, "cbid": 205, "correlation": 21791 } }, { "ph": "f", "id": 21791, "pid": 76337, "tid": -914061504, "ts": 1716454217613987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217638698, "dur": 21, "args": { "External id": 21795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21795, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21795, "pid": 5, "tid": 7, "ts": 1716454217638698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614000, "dur": 11, "args": { "External id": 21795, "cbid": 211, "correlation": 21795 } }, { "ph": "s", "id": 21795, "pid": 76337, "tid": -914061504, "ts": 1716454217614000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217638720, "dur": 44, "args": { "External id": 21797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21797, "pid": 5, "tid": 7, "ts": 1716454217638720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614014, "dur": 5, "args": { "External id": 21797, "cbid": 211, "correlation": 21797 } }, { "ph": "s", "id": 21797, "pid": 76337, "tid": -914061504, "ts": 1716454217614014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217638765, "dur": 644, "args": { "External id": 21799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21799, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21799, "pid": 5, "tid": 7, "ts": 1716454217638765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614026, "dur": 7, "args": { "External id": 21799, "cbid": 211, "correlation": 21799 } }, { "ph": "s", "id": 21799, "pid": 76337, "tid": -914061504, "ts": 1716454217614026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217639410, "dur": 21, "args": { "External id": 21801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21801, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21801, "pid": 5, "tid": 7, "ts": 1716454217639410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614037, "dur": 5, "args": { "External id": 21801, "cbid": 211, "correlation": 21801 } }, { "ph": "s", "id": 21801, "pid": 76337, "tid": -914061504, "ts": 1716454217614037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217639433, "dur": 33, "args": { "External id": 21807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21807, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21807, "pid": 5, "tid": 7, "ts": 1716454217639433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614064, "dur": 8, "args": { "External id": 21807, "cbid": 211, "correlation": 21807 } }, { "ph": "s", "id": 21807, "pid": 76337, "tid": -914061504, "ts": 1716454217614064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217639467, "dur": 3, "args": { "External id": 21815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21815, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 21815, "pid": 5, "tid": 7, "ts": 1716454217639467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614108, "dur": 9, "args": { "External id": 21815, "cbid": 211, "correlation": 21815 } }, { "ph": "s", "id": 21815, "pid": 76337, "tid": -914061504, "ts": 1716454217614108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217614172, "dur": 1, "args": { "External id": 21831, "cbid": 251, "correlation": 21831 } }, { "ph": "f", "id": 21831, "pid": 76337, "tid": -914061504, "ts": 1716454217614172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217614177, "dur": 0, "args": { "External id": 21833, "cbid": 251, "correlation": 21833 } }, { "ph": "f", "id": 21833, "pid": 76337, "tid": -914061504, "ts": 1716454217614177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217639471, "dur": 12, "args": { "External id": 21834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21834, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 21834, "pid": 5, "tid": 7, "ts": 1716454217639471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614179, "dur": 11, "args": { "External id": 21834, "cbid": 211, "correlation": 21834 } }, { "ph": "s", "id": 21834, "pid": 76337, "tid": -914061504, "ts": 1716454217614179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217639485, "dur": 5, "args": { "External id": 21836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21836, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 21836, "pid": 5, "tid": 7, "ts": 1716454217639485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614192, "dur": 5, "args": { "External id": 21836, "cbid": 211, "correlation": 21836 } }, { "ph": "s", "id": 21836, "pid": 76337, "tid": -914061504, "ts": 1716454217614192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217639491, "dur": 28, "args": { "External id": 21846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21846, "pid": 5, "tid": 7, "ts": 1716454217639491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614251, "dur": 12, "args": { "External id": 21846, "cbid": 211, "correlation": 21846 } }, { "ph": "s", "id": 21846, "pid": 76337, "tid": -914061504, "ts": 1716454217614251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217639520, "dur": 31, "args": { "External id": 21866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21866, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21866, "pid": 5, "tid": 7, "ts": 1716454217639520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614316, "dur": 11, "args": { "External id": 21866, "cbid": 211, "correlation": 21866 } }, { "ph": "s", "id": 21866, "pid": 76337, "tid": -914061504, "ts": 1716454217614316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217639553, "dur": 5, "args": { "External id": 21878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21878, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 21878, "pid": 5, "tid": 7, "ts": 1716454217639553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614337, "dur": 6, "args": { "External id": 21878, "cbid": 211, "correlation": 21878 } }, { "ph": "s", "id": 21878, "pid": 76337, "tid": -914061504, "ts": 1716454217614337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217639559, "dur": 30, "args": { "External id": 21881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21881, "pid": 5, "tid": 7, "ts": 1716454217639559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614355, "dur": 7, "args": { "External id": 21881, "cbid": 211, "correlation": 21881 } }, { "ph": "s", "id": 21881, "pid": 76337, "tid": -914061504, "ts": 1716454217614355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217639589, "dur": 20, "args": { "External id": 21890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21890, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21890, "pid": 5, "tid": 7, "ts": 1716454217639589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614395, "dur": 10, "args": { "External id": 21890, "cbid": 211, "correlation": 21890 } }, { "ph": "s", "id": 21890, "pid": 76337, "tid": -914061504, "ts": 1716454217614395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217614457, "dur": 0, "args": { "External id": 21900, "cbid": 317, "correlation": 21900 } }, { "ph": "f", "id": 21900, "pid": 76337, "tid": -914061504, "ts": 1716454217614457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217614458, "dur": 0, "args": { "External id": 21901, "cbid": 203, "correlation": 21901 } }, { "ph": "f", "id": 21901, "pid": 76337, "tid": -914061504, "ts": 1716454217614458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217614459, "dur": 0, "args": { "External id": 21902, "cbid": 205, "correlation": 21902 } }, { "ph": "f", "id": 21902, "pid": 76337, "tid": -914061504, "ts": 1716454217614459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217639611, "dur": 21, "args": { "External id": 21906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21906, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21906, "pid": 5, "tid": 7, "ts": 1716454217639611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614487, "dur": 13, "args": { "External id": 21906, "cbid": 211, "correlation": 21906 } }, { "ph": "s", "id": 21906, "pid": 76337, "tid": -914061504, "ts": 1716454217614487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217639633, "dur": 44, "args": { "External id": 21908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21908, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21908, "pid": 5, "tid": 7, "ts": 1716454217639633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614502, "dur": 5, "args": { "External id": 21908, "cbid": 211, "correlation": 21908 } }, { "ph": "s", "id": 21908, "pid": 76337, "tid": -914061504, "ts": 1716454217614502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217639678, "dur": 637, "args": { "External id": 21910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21910, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21910, "pid": 5, "tid": 7, "ts": 1716454217639678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614513, "dur": 6, "args": { "External id": 21910, "cbid": 211, "correlation": 21910 } }, { "ph": "s", "id": 21910, "pid": 76337, "tid": -914061504, "ts": 1716454217614513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217640316, "dur": 22, "args": { "External id": 21912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21912, "pid": 5, "tid": 7, "ts": 1716454217640316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614523, "dur": 5, "args": { "External id": 21912, "cbid": 211, "correlation": 21912 } }, { "ph": "s", "id": 21912, "pid": 76337, "tid": -914061504, "ts": 1716454217614523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217640339, "dur": 33, "args": { "External id": 21918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21918, "pid": 5, "tid": 7, "ts": 1716454217640339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614551, "dur": 8, "args": { "External id": 21918, "cbid": 211, "correlation": 21918 } }, { "ph": "s", "id": 21918, "pid": 76337, "tid": -914061504, "ts": 1716454217614551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217640374, "dur": 26, "args": { "External id": 21926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21926, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21926, "pid": 5, "tid": 7, "ts": 1716454217640374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614583, "dur": 8, "args": { "External id": 21926, "cbid": 211, "correlation": 21926 } }, { "ph": "s", "id": 21926, "pid": 76337, "tid": -914061504, "ts": 1716454217614583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217640401, "dur": 20, "args": { "External id": 21934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21934, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21934, "pid": 5, "tid": 7, "ts": 1716454217640401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614613, "dur": 8, "args": { "External id": 21934, "cbid": 211, "correlation": 21934 } }, { "ph": "s", "id": 21934, "pid": 76337, "tid": -914061504, "ts": 1716454217614613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217640422, "dur": 29, "args": { "External id": 21954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21954, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 21954, "pid": 5, "tid": 7, "ts": 1716454217640422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614693, "dur": 12, "args": { "External id": 21954, "cbid": 211, "correlation": 21954 } }, { "ph": "s", "id": 21954, "pid": 76337, "tid": -914061504, "ts": 1716454217614693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217640453, "dur": 4, "args": { "External id": 21966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21966, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 21966, "pid": 5, "tid": 7, "ts": 1716454217640453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614715, "dur": 6, "args": { "External id": 21966, "cbid": 211, "correlation": 21966 } }, { "ph": "s", "id": 21966, "pid": 76337, "tid": -914061504, "ts": 1716454217614715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217640459, "dur": 30, "args": { "External id": 21969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21969, "pid": 5, "tid": 7, "ts": 1716454217640459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614733, "dur": 6, "args": { "External id": 21969, "cbid": 211, "correlation": 21969 } }, { "ph": "s", "id": 21969, "pid": 76337, "tid": -914061504, "ts": 1716454217614733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217614791, "dur": 0, "args": { "External id": 21980, "cbid": 317, "correlation": 21980 } }, { "ph": "f", "id": 21980, "pid": 76337, "tid": -914061504, "ts": 1716454217614791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217614791, "dur": 0, "args": { "External id": 21981, "cbid": 203, "correlation": 21981 } }, { "ph": "f", "id": 21981, "pid": 76337, "tid": -914061504, "ts": 1716454217614791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217614792, "dur": 0, "args": { "External id": 21982, "cbid": 205, "correlation": 21982 } }, { "ph": "f", "id": 21982, "pid": 76337, "tid": -914061504, "ts": 1716454217614792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217640490, "dur": 22, "args": { "External id": 21986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21986, "pid": 5, "tid": 7, "ts": 1716454217640490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614805, "dur": 11, "args": { "External id": 21986, "cbid": 211, "correlation": 21986 } }, { "ph": "s", "id": 21986, "pid": 76337, "tid": -914061504, "ts": 1716454217614805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217640513, "dur": 114, "args": { "External id": 21988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21988, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 21988, "pid": 5, "tid": 7, "ts": 1716454217640513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614822, "dur": 6, "args": { "External id": 21988, "cbid": 211, "correlation": 21988 } }, { "ph": "s", "id": 21988, "pid": 76337, "tid": -914061504, "ts": 1716454217614822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217640628, "dur": 22, "args": { "External id": 21990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21990, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21990, "pid": 5, "tid": 7, "ts": 1716454217640628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614833, "dur": 5, "args": { "External id": 21990, "cbid": 211, "correlation": 21990 } }, { "ph": "s", "id": 21990, "pid": 76337, "tid": -914061504, "ts": 1716454217614833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217640651, "dur": 32, "args": { "External id": 21996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 21996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 21996, "pid": 5, "tid": 7, "ts": 1716454217640651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614860, "dur": 8, "args": { "External id": 21996, "cbid": 211, "correlation": 21996 } }, { "ph": "s", "id": 21996, "pid": 76337, "tid": -914061504, "ts": 1716454217614860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217640685, "dur": 166, "args": { "External id": 22005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22005, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22005, "pid": 5, "tid": 7, "ts": 1716454217640685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217614942, "dur": 14, "args": { "External id": 22005, "cbid": 211, "correlation": 22005 } }, { "ph": "s", "id": 22005, "pid": 76337, "tid": -914061504, "ts": 1716454217614942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217640852, "dur": 64, "args": { "External id": 22027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22027, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22027, "pid": 5, "tid": 7, "ts": 1716454217640852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615025, "dur": 12, "args": { "External id": 22027, "cbid": 211, "correlation": 22027 } }, { "ph": "s", "id": 22027, "pid": 76337, "tid": -914061504, "ts": 1716454217615025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615129, "dur": 1, "args": { "External id": 22038, "cbid": 251, "correlation": 22038 } }, { "ph": "f", "id": 22038, "pid": 76337, "tid": -914061504, "ts": 1716454217615129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217640917, "dur": 151, "args": { "External id": 22039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22039, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22039, "pid": 5, "tid": 7, "ts": 1716454217640917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615135, "dur": 13, "args": { "External id": 22039, "cbid": 211, "correlation": 22039 } }, { "ph": "s", "id": 22039, "pid": 76337, "tid": -914061504, "ts": 1716454217615135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615222, "dur": 1, "args": { "External id": 22050, "cbid": 251, "correlation": 22050 } }, { "ph": "f", "id": 22050, "pid": 76337, "tid": -914061504, "ts": 1716454217615222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217641070, "dur": 145, "args": { "External id": 22051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22051, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22051, "pid": 5, "tid": 7, "ts": 1716454217641070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615227, "dur": 12, "args": { "External id": 22051, "cbid": 211, "correlation": 22051 } }, { "ph": "s", "id": 22051, "pid": 76337, "tid": -914061504, "ts": 1716454217615227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615306, "dur": 1, "args": { "External id": 22062, "cbid": 251, "correlation": 22062 } }, { "ph": "f", "id": 22062, "pid": 76337, "tid": -914061504, "ts": 1716454217615306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217641216, "dur": 143, "args": { "External id": 22063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22063, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22063, "pid": 5, "tid": 7, "ts": 1716454217641216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615311, "dur": 12, "args": { "External id": 22063, "cbid": 211, "correlation": 22063 } }, { "ph": "s", "id": 22063, "pid": 76337, "tid": -914061504, "ts": 1716454217615311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217641361, "dur": 1911, "args": { "External id": 22084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22084, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 22084, "pid": 5, "tid": 7, "ts": 1716454217641361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615393, "dur": 13, "args": { "External id": 22084, "cbid": 211, "correlation": 22084 } }, { "ph": "s", "id": 22084, "pid": 76337, "tid": -914061504, "ts": 1716454217615393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615504, "dur": 1, "args": { "External id": 22102, "cbid": 251, "correlation": 22102 } }, { "ph": "f", "id": 22102, "pid": 76337, "tid": -914061504, "ts": 1716454217615504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217643273, "dur": 146, "args": { "External id": 22104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22104, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22104, "pid": 5, "tid": 7, "ts": 1716454217643273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615510, "dur": 15, "args": { "External id": 22104, "cbid": 211, "correlation": 22104 } }, { "ph": "s", "id": 22104, "pid": 76337, "tid": -914061504, "ts": 1716454217615510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217643420, "dur": 35, "args": { "External id": 22112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22112, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22112, "pid": 5, "tid": 7, "ts": 1716454217643420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615584, "dur": 12, "args": { "External id": 22112, "cbid": 211, "correlation": 22112 } }, { "ph": "s", "id": 22112, "pid": 76337, "tid": -914061504, "ts": 1716454217615584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217643457, "dur": 50, "args": { "External id": 22120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22120, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22120, "pid": 5, "tid": 7, "ts": 1716454217643457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615622, "dur": 9, "args": { "External id": 22120, "cbid": 211, "correlation": 22120 } }, { "ph": "s", "id": 22120, "pid": 76337, "tid": -914061504, "ts": 1716454217615622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217643509, "dur": 30, "args": { "External id": 22131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22131, "pid": 5, "tid": 7, "ts": 1716454217643509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615695, "dur": 12, "args": { "External id": 22131, "cbid": 211, "correlation": 22131 } }, { "ph": "s", "id": 22131, "pid": 76337, "tid": -914061504, "ts": 1716454217615695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217643540, "dur": 34, "args": { "External id": 22153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22153, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22153, "pid": 5, "tid": 7, "ts": 1716454217643540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615737, "dur": 10, "args": { "External id": 22153, "cbid": 211, "correlation": 22153 } }, { "ph": "s", "id": 22153, "pid": 76337, "tid": -914061504, "ts": 1716454217615737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615841, "dur": 1, "args": { "External id": 22164, "cbid": 251, "correlation": 22164 } }, { "ph": "f", "id": 22164, "pid": 76337, "tid": -914061504, "ts": 1716454217615841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217643576, "dur": 89, "args": { "External id": 22165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22165, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22165, "pid": 5, "tid": 7, "ts": 1716454217643576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615846, "dur": 14, "args": { "External id": 22165, "cbid": 211, "correlation": 22165 } }, { "ph": "s", "id": 22165, "pid": 76337, "tid": -914061504, "ts": 1716454217615846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615930, "dur": 1, "args": { "External id": 22176, "cbid": 251, "correlation": 22176 } }, { "ph": "f", "id": 22176, "pid": 76337, "tid": -914061504, "ts": 1716454217615930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217615934, "dur": 0, "args": { "External id": 22177, "cbid": 251, "correlation": 22177 } }, { "ph": "f", "id": 22177, "pid": 76337, "tid": -914061504, "ts": 1716454217615934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217643666, "dur": 12, "args": { "External id": 22178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22178, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 22178, "pid": 5, "tid": 7, "ts": 1716454217643666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615936, "dur": 13, "args": { "External id": 22178, "cbid": 211, "correlation": 22178 } }, { "ph": "s", "id": 22178, "pid": 76337, "tid": -914061504, "ts": 1716454217615936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217643680, "dur": 5, "args": { "External id": 22180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22180, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 22180, "pid": 5, "tid": 7, "ts": 1716454217643680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217615950, "dur": 6, "args": { "External id": 22180, "cbid": 211, "correlation": 22180 } }, { "ph": "s", "id": 22180, "pid": 76337, "tid": -914061504, "ts": 1716454217615950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616028, "dur": 1, "args": { "External id": 22191, "cbid": 251, "correlation": 22191 } }, { "ph": "f", "id": 22191, "pid": 76337, "tid": -914061504, "ts": 1716454217616028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616032, "dur": 0, "args": { "External id": 22192, "cbid": 251, "correlation": 22192 } }, { "ph": "f", "id": 22192, "pid": 76337, "tid": -914061504, "ts": 1716454217616032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217643686, "dur": 7, "args": { "External id": 22193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22193, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 22193, "pid": 5, "tid": 7, "ts": 1716454217643686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616034, "dur": 13, "args": { "External id": 22193, "cbid": 211, "correlation": 22193 } }, { "ph": "s", "id": 22193, "pid": 76337, "tid": -914061504, "ts": 1716454217616034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217643694, "dur": 4, "args": { "External id": 22195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22195, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 22195, "pid": 5, "tid": 7, "ts": 1716454217643694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616049, "dur": 6, "args": { "External id": 22195, "cbid": 211, "correlation": 22195 } }, { "ph": "s", "id": 22195, "pid": 76337, "tid": -914061504, "ts": 1716454217616049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217643699, "dur": 91, "args": { "External id": 22216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22216, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 22216, "pid": 5, "tid": 7, "ts": 1716454217643699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616124, "dur": 12, "args": { "External id": 22216, "cbid": 211, "correlation": 22216 } }, { "ph": "s", "id": 22216, "pid": 76337, "tid": -914061504, "ts": 1716454217616124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616240, "dur": 1, "args": { "External id": 22234, "cbid": 251, "correlation": 22234 } }, { "ph": "f", "id": 22234, "pid": 76337, "tid": -914061504, "ts": 1716454217616240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217643791, "dur": 97, "args": { "External id": 22236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22236, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22236, "pid": 5, "tid": 7, "ts": 1716454217643791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616246, "dur": 14, "args": { "External id": 22236, "cbid": 211, "correlation": 22236 } }, { "ph": "s", "id": 22236, "pid": 76337, "tid": -914061504, "ts": 1716454217616246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217643890, "dur": 19, "args": { "External id": 22244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22244, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22244, "pid": 5, "tid": 7, "ts": 1716454217643890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616318, "dur": 13, "args": { "External id": 22244, "cbid": 211, "correlation": 22244 } }, { "ph": "s", "id": 22244, "pid": 76337, "tid": -914061504, "ts": 1716454217616318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217643910, "dur": 37, "args": { "External id": 22252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22252, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22252, "pid": 5, "tid": 7, "ts": 1716454217643910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616360, "dur": 9, "args": { "External id": 22252, "cbid": 211, "correlation": 22252 } }, { "ph": "s", "id": 22252, "pid": 76337, "tid": -914061504, "ts": 1716454217616360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217643948, "dur": 34, "args": { "External id": 22274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22274, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22274, "pid": 5, "tid": 7, "ts": 1716454217643948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616422, "dur": 11, "args": { "External id": 22274, "cbid": 211, "correlation": 22274 } }, { "ph": "s", "id": 22274, "pid": 76337, "tid": -914061504, "ts": 1716454217616422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616515, "dur": 1, "args": { "External id": 22290, "cbid": 251, "correlation": 22290 } }, { "ph": "f", "id": 22290, "pid": 76337, "tid": -914061504, "ts": 1716454217616515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616520, "dur": 0, "args": { "External id": 22292, "cbid": 251, "correlation": 22292 } }, { "ph": "f", "id": 22292, "pid": 76337, "tid": -914061504, "ts": 1716454217616520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217643984, "dur": 532, "args": { "External id": 22293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22293, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22293, "pid": 5, "tid": 7, "ts": 1716454217643984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616523, "dur": 13, "args": { "External id": 22293, "cbid": 211, "correlation": 22293 } }, { "ph": "s", "id": 22293, "pid": 76337, "tid": -914061504, "ts": 1716454217616523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217644517, "dur": 124, "args": { "External id": 22301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22301, "pid": 5, "tid": 7, "ts": 1716454217644517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616590, "dur": 12, "args": { "External id": 22301, "cbid": 211, "correlation": 22301 } }, { "ph": "s", "id": 22301, "pid": 76337, "tid": -914061504, "ts": 1716454217616590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217644642, "dur": 126, "args": { "External id": 22309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22309, "pid": 5, "tid": 7, "ts": 1716454217644642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616621, "dur": 8, "args": { "External id": 22309, "cbid": 211, "correlation": 22309 } }, { "ph": "s", "id": 22309, "pid": 76337, "tid": -914061504, "ts": 1716454217616621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217616698, "dur": 1, "args": { "External id": 22325, "cbid": 251, "correlation": 22325 } }, { "ph": "f", "id": 22325, "pid": 76337, "tid": -914061504, "ts": 1716454217616698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217644770, "dur": 303, "args": { "External id": 22327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22327, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22327, "pid": 5, "tid": 7, "ts": 1716454217644770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616703, "dur": 12, "args": { "External id": 22327, "cbid": 211, "correlation": 22327 } }, { "ph": "s", "id": 22327, "pid": 76337, "tid": -914061504, "ts": 1716454217616703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217645074, "dur": 28, "args": { "External id": 22335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22335, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22335, "pid": 5, "tid": 7, "ts": 1716454217645074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616746, "dur": 10, "args": { "External id": 22335, "cbid": 211, "correlation": 22335 } }, { "ph": "s", "id": 22335, "pid": 76337, "tid": -914061504, "ts": 1716454217616746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217645103, "dur": 80, "args": { "External id": 22346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22346, "pid": 5, "tid": 7, "ts": 1716454217645103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616813, "dur": 12, "args": { "External id": 22346, "cbid": 211, "correlation": 22346 } }, { "ph": "s", "id": 22346, "pid": 76337, "tid": -914061504, "ts": 1716454217616813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217616877, "dur": 0, "args": { "External id": 22358, "cbid": 317, "correlation": 22358 } }, { "ph": "f", "id": 22358, "pid": 76337, "tid": -914061504, "ts": 1716454217616877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217616877, "dur": 0, "args": { "External id": 22359, "cbid": 203, "correlation": 22359 } }, { "ph": "f", "id": 22359, "pid": 76337, "tid": -914061504, "ts": 1716454217616877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217616878, "dur": 0, "args": { "External id": 22360, "cbid": 205, "correlation": 22360 } }, { "ph": "f", "id": 22360, "pid": 76337, "tid": -914061504, "ts": 1716454217616878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645184, "dur": 24, "args": { "External id": 22364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22364, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22364, "pid": 5, "tid": 7, "ts": 1716454217645184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616906, "dur": 13, "args": { "External id": 22364, "cbid": 211, "correlation": 22364 } }, { "ph": "s", "id": 22364, "pid": 76337, "tid": -914061504, "ts": 1716454217616906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217645209, "dur": 119, "args": { "External id": 22366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22366, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22366, "pid": 5, "tid": 7, "ts": 1716454217645209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616925, "dur": 6, "args": { "External id": 22366, "cbid": 211, "correlation": 22366 } }, { "ph": "s", "id": 22366, "pid": 76337, "tid": -914061504, "ts": 1716454217616925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645329, "dur": 21, "args": { "External id": 22368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22368, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22368, "pid": 5, "tid": 7, "ts": 1716454217645329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616935, "dur": 5, "args": { "External id": 22368, "cbid": 211, "correlation": 22368 } }, { "ph": "s", "id": 22368, "pid": 76337, "tid": -914061504, "ts": 1716454217616935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217645352, "dur": 33, "args": { "External id": 22374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22374, "pid": 5, "tid": 7, "ts": 1716454217645352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217616963, "dur": 9, "args": { "External id": 22374, "cbid": 211, "correlation": 22374 } }, { "ph": "s", "id": 22374, "pid": 76337, "tid": -914061504, "ts": 1716454217616963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217645386, "dur": 27, "args": { "External id": 22382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22382, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22382, "pid": 5, "tid": 7, "ts": 1716454217645386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617004, "dur": 9, "args": { "External id": 22382, "cbid": 211, "correlation": 22382 } }, { "ph": "s", "id": 22382, "pid": 76337, "tid": -914061504, "ts": 1716454217617004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217617078, "dur": 0, "args": { "External id": 22392, "cbid": 317, "correlation": 22392 } }, { "ph": "f", "id": 22392, "pid": 76337, "tid": -914061504, "ts": 1716454217617078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217617079, "dur": 0, "args": { "External id": 22393, "cbid": 203, "correlation": 22393 } }, { "ph": "f", "id": 22393, "pid": 76337, "tid": -914061504, "ts": 1716454217617079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217617080, "dur": 0, "args": { "External id": 22394, "cbid": 205, "correlation": 22394 } }, { "ph": "f", "id": 22394, "pid": 76337, "tid": -914061504, "ts": 1716454217617080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645414, "dur": 24, "args": { "External id": 22398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22398, "pid": 5, "tid": 7, "ts": 1716454217645414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617094, "dur": 12, "args": { "External id": 22398, "cbid": 211, "correlation": 22398 } }, { "ph": "s", "id": 22398, "pid": 76337, "tid": -914061504, "ts": 1716454217617094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645439, "dur": 44, "args": { "External id": 22400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22400, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22400, "pid": 5, "tid": 7, "ts": 1716454217645439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617109, "dur": 5, "args": { "External id": 22400, "cbid": 211, "correlation": 22400 } }, { "ph": "s", "id": 22400, "pid": 76337, "tid": -914061504, "ts": 1716454217617109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217645484, "dur": 232, "args": { "External id": 22402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22402, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 22402, "pid": 5, "tid": 7, "ts": 1716454217645484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617121, "dur": 7, "args": { "External id": 22402, "cbid": 211, "correlation": 22402 } }, { "ph": "s", "id": 22402, "pid": 76337, "tid": -914061504, "ts": 1716454217617121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645717, "dur": 6, "args": { "External id": 22404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22404, "pid": 5, "tid": 7, "ts": 1716454217645717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617132, "dur": 5, "args": { "External id": 22404, "cbid": 211, "correlation": 22404 } }, { "ph": "s", "id": 22404, "pid": 76337, "tid": -914061504, "ts": 1716454217617132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217645725, "dur": 9, "args": { "External id": 22410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22410, "pid": 5, "tid": 7, "ts": 1716454217645725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617158, "dur": 8, "args": { "External id": 22410, "cbid": 211, "correlation": 22410 } }, { "ph": "s", "id": 22410, "pid": 76337, "tid": -914061504, "ts": 1716454217617158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217645735, "dur": 12, "args": { "External id": 22430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22430, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 22430, "pid": 5, "tid": 7, "ts": 1716454217645735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617254, "dur": 12, "args": { "External id": 22430, "cbid": 211, "correlation": 22430 } }, { "ph": "s", "id": 22430, "pid": 76337, "tid": -914061504, "ts": 1716454217617254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217645748, "dur": 4, "args": { "External id": 22442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22442, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 22442, "pid": 5, "tid": 7, "ts": 1716454217645748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617286, "dur": 8, "args": { "External id": 22442, "cbid": 211, "correlation": 22442 } }, { "ph": "s", "id": 22442, "pid": 76337, "tid": -914061504, "ts": 1716454217617286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217645753, "dur": 12, "args": { "External id": 22445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22445, "pid": 5, "tid": 7, "ts": 1716454217645753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617307, "dur": 7, "args": { "External id": 22445, "cbid": 211, "correlation": 22445 } }, { "ph": "s", "id": 22445, "pid": 76337, "tid": -914061504, "ts": 1716454217617307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217645767, "dur": 7, "args": { "External id": 22454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22454, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22454, "pid": 5, "tid": 7, "ts": 1716454217645767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617348, "dur": 9, "args": { "External id": 22454, "cbid": 211, "correlation": 22454 } }, { "ph": "s", "id": 22454, "pid": 76337, "tid": -914061504, "ts": 1716454217617348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217617410, "dur": 0, "args": { "External id": 22464, "cbid": 317, "correlation": 22464 } }, { "ph": "f", "id": 22464, "pid": 76337, "tid": -914061504, "ts": 1716454217617410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217617411, "dur": 0, "args": { "External id": 22465, "cbid": 203, "correlation": 22465 } }, { "ph": "f", "id": 22465, "pid": 76337, "tid": -914061504, "ts": 1716454217617411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217617411, "dur": 0, "args": { "External id": 22466, "cbid": 205, "correlation": 22466 } }, { "ph": "f", "id": 22466, "pid": 76337, "tid": -914061504, "ts": 1716454217617411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645775, "dur": 5, "args": { "External id": 22470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22470, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22470, "pid": 5, "tid": 7, "ts": 1716454217645775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617434, "dur": 12, "args": { "External id": 22470, "cbid": 211, "correlation": 22470 } }, { "ph": "s", "id": 22470, "pid": 76337, "tid": -914061504, "ts": 1716454217617434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217645782, "dur": 82, "args": { "External id": 22472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22472, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22472, "pid": 5, "tid": 7, "ts": 1716454217645782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617450, "dur": 5, "args": { "External id": 22472, "cbid": 211, "correlation": 22472 } }, { "ph": "s", "id": 22472, "pid": 76337, "tid": -914061504, "ts": 1716454217617450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217645867, "dur": 1, "args": { "External id": 22474, "device": 5, "context": 1, "stream": 7, "correlation": 22474, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 22474, "pid": 5, "tid": 7, "ts": 1716454217645867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217617463, "dur": 9, "args": { "External id": 22474, "cbid": 51, "correlation": 22474 } }, { "ph": "s", "id": 22474, "pid": 76337, "tid": -914061504, "ts": 1716454217617463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217645870, "dur": 534, "args": { "External id": 22475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22475, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22475, "pid": 5, "tid": 7, "ts": 1716454217645870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617473, "dur": 9, "args": { "External id": 22475, "cbid": 211, "correlation": 22475 } }, { "ph": "s", "id": 22475, "pid": 76337, "tid": -914061504, "ts": 1716454217617473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217646406, "dur": 11, "args": { "External id": 22477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22477, "pid": 5, "tid": 7, "ts": 1716454217646406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617487, "dur": 5, "args": { "External id": 22477, "cbid": 211, "correlation": 22477 } }, { "ph": "s", "id": 22477, "pid": 76337, "tid": -914061504, "ts": 1716454217617487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217646418, "dur": 14, "args": { "External id": 22483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22483, "pid": 5, "tid": 7, "ts": 1716454217646418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617515, "dur": 8, "args": { "External id": 22483, "cbid": 211, "correlation": 22483 } }, { "ph": "s", "id": 22483, "pid": 76337, "tid": -914061504, "ts": 1716454217617515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217646434, "dur": 3, "args": { "External id": 22491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22491, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 22491, "pid": 5, "tid": 7, "ts": 1716454217646434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617560, "dur": 9, "args": { "External id": 22491, "cbid": 211, "correlation": 22491 } }, { "ph": "s", "id": 22491, "pid": 76337, "tid": -914061504, "ts": 1716454217617560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217617643, "dur": 2, "args": { "External id": 22507, "cbid": 251, "correlation": 22507 } }, { "ph": "f", "id": 22507, "pid": 76337, "tid": -914061504, "ts": 1716454217617643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217617649, "dur": 0, "args": { "External id": 22509, "cbid": 251, "correlation": 22509 } }, { "ph": "f", "id": 22509, "pid": 76337, "tid": -914061504, "ts": 1716454217617649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217646439, "dur": 13, "args": { "External id": 22510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22510, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22510, "pid": 5, "tid": 7, "ts": 1716454217646439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617652, "dur": 12, "args": { "External id": 22510, "cbid": 211, "correlation": 22510 } }, { "ph": "s", "id": 22510, "pid": 76337, "tid": -914061504, "ts": 1716454217617652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217646453, "dur": 5, "args": { "External id": 22512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22512, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22512, "pid": 5, "tid": 7, "ts": 1716454217646453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617666, "dur": 5, "args": { "External id": 22512, "cbid": 211, "correlation": 22512 } }, { "ph": "s", "id": 22512, "pid": 76337, "tid": -914061504, "ts": 1716454217617666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217646459, "dur": 16, "args": { "External id": 22522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22522, "pid": 5, "tid": 7, "ts": 1716454217646459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617724, "dur": 12, "args": { "External id": 22522, "cbid": 211, "correlation": 22522 } }, { "ph": "s", "id": 22522, "pid": 76337, "tid": -914061504, "ts": 1716454217617724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217646477, "dur": 18, "args": { "External id": 22542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22542, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 22542, "pid": 5, "tid": 7, "ts": 1716454217646477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617790, "dur": 11, "args": { "External id": 22542, "cbid": 211, "correlation": 22542 } }, { "ph": "s", "id": 22542, "pid": 76337, "tid": -914061504, "ts": 1716454217617790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217646496, "dur": 4, "args": { "External id": 22554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22554, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 22554, "pid": 5, "tid": 7, "ts": 1716454217646496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617826, "dur": 9, "args": { "External id": 22554, "cbid": 211, "correlation": 22554 } }, { "ph": "s", "id": 22554, "pid": 76337, "tid": -914061504, "ts": 1716454217617826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217646502, "dur": 17, "args": { "External id": 22557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22557, "pid": 5, "tid": 7, "ts": 1716454217646502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617847, "dur": 7, "args": { "External id": 22557, "cbid": 211, "correlation": 22557 } }, { "ph": "s", "id": 22557, "pid": 76337, "tid": -914061504, "ts": 1716454217617847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217646519, "dur": 11, "args": { "External id": 22566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22566, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22566, "pid": 5, "tid": 7, "ts": 1716454217646519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217617891, "dur": 10, "args": { "External id": 22566, "cbid": 211, "correlation": 22566 } }, { "ph": "s", "id": 22566, "pid": 76337, "tid": -914061504, "ts": 1716454217617891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217617969, "dur": 0, "args": { "External id": 22576, "cbid": 317, "correlation": 22576 } }, { "ph": "f", "id": 22576, "pid": 76337, "tid": -914061504, "ts": 1716454217617969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217617970, "dur": 0, "args": { "External id": 22577, "cbid": 203, "correlation": 22577 } }, { "ph": "f", "id": 22577, "pid": 76337, "tid": -914061504, "ts": 1716454217617970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217617970, "dur": 0, "args": { "External id": 22578, "cbid": 205, "correlation": 22578 } }, { "ph": "f", "id": 22578, "pid": 76337, "tid": -914061504, "ts": 1716454217617970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217646532, "dur": 11, "args": { "External id": 22582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22582, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22582, "pid": 5, "tid": 7, "ts": 1716454217646532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618002, "dur": 13, "args": { "External id": 22582, "cbid": 211, "correlation": 22582 } }, { "ph": "s", "id": 22582, "pid": 76337, "tid": -914061504, "ts": 1716454217618002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217646544, "dur": 160, "args": { "External id": 22584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22584, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22584, "pid": 5, "tid": 7, "ts": 1716454217646544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618018, "dur": 5, "args": { "External id": 22584, "cbid": 211, "correlation": 22584 } }, { "ph": "s", "id": 22584, "pid": 76337, "tid": -914061504, "ts": 1716454217618018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217646706, "dur": 1, "args": { "External id": 22586, "device": 5, "context": 1, "stream": 7, "correlation": 22586, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 22586, "pid": 5, "tid": 7, "ts": 1716454217646706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217618030, "dur": 7, "args": { "External id": 22586, "cbid": 51, "correlation": 22586 } }, { "ph": "s", "id": 22586, "pid": 76337, "tid": -914061504, "ts": 1716454217618030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217646710, "dur": 654, "args": { "External id": 22587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22587, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22587, "pid": 5, "tid": 7, "ts": 1716454217646710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618038, "dur": 7, "args": { "External id": 22587, "cbid": 211, "correlation": 22587 } }, { "ph": "s", "id": 22587, "pid": 76337, "tid": -914061504, "ts": 1716454217618038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217647365, "dur": 12, "args": { "External id": 22589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22589, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22589, "pid": 5, "tid": 7, "ts": 1716454217647365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618050, "dur": 5, "args": { "External id": 22589, "cbid": 211, "correlation": 22589 } }, { "ph": "s", "id": 22589, "pid": 76337, "tid": -914061504, "ts": 1716454217618050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217647378, "dur": 14, "args": { "External id": 22595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22595, "pid": 5, "tid": 7, "ts": 1716454217647378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618080, "dur": 8, "args": { "External id": 22595, "cbid": 211, "correlation": 22595 } }, { "ph": "s", "id": 22595, "pid": 76337, "tid": -914061504, "ts": 1716454217618080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217618139, "dur": 0, "args": { "External id": 22605, "cbid": 317, "correlation": 22605 } }, { "ph": "f", "id": 22605, "pid": 76337, "tid": -914061504, "ts": 1716454217618139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217618140, "dur": 0, "args": { "External id": 22606, "cbid": 203, "correlation": 22606 } }, { "ph": "f", "id": 22606, "pid": 76337, "tid": -914061504, "ts": 1716454217618140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217618141, "dur": 0, "args": { "External id": 22607, "cbid": 205, "correlation": 22607 } }, { "ph": "f", "id": 22607, "pid": 76337, "tid": -914061504, "ts": 1716454217618141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217647394, "dur": 8, "args": { "External id": 22611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22611, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22611, "pid": 5, "tid": 7, "ts": 1716454217647394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618156, "dur": 11, "args": { "External id": 22611, "cbid": 211, "correlation": 22611 } }, { "ph": "s", "id": 22611, "pid": 76337, "tid": -914061504, "ts": 1716454217618156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217647404, "dur": 4, "args": { "External id": 22613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22613, "pid": 5, "tid": 7, "ts": 1716454217647404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618173, "dur": 7, "args": { "External id": 22613, "cbid": 211, "correlation": 22613 } }, { "ph": "s", "id": 22613, "pid": 76337, "tid": -914061504, "ts": 1716454217618173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217618183, "dur": 0, "args": { "External id": 22614, "cbid": 51, "correlation": 22614 } }, { "ph": "s", "id": 22614, "pid": 76337, "tid": -914061504, "ts": 1716454217618183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217647408, "dur": 56, "args": { "External id": 22615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22615, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 22615, "pid": 5, "tid": 7, "ts": 1716454217647408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618184, "dur": 5, "args": { "External id": 22615, "cbid": 211, "correlation": 22615 } }, { "ph": "s", "id": 22615, "pid": 76337, "tid": -914061504, "ts": 1716454217618184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217647466, "dur": 14, "args": { "External id": 22620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22620, "pid": 5, "tid": 7, "ts": 1716454217647466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618221, "dur": 10, "args": { "External id": 22620, "cbid": 211, "correlation": 22620 } }, { "ph": "s", "id": 22620, "pid": 76337, "tid": -914061504, "ts": 1716454217618221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217647481, "dur": 12, "args": { "External id": 22628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22628, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22628, "pid": 5, "tid": 7, "ts": 1716454217647481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618252, "dur": 8, "args": { "External id": 22628, "cbid": 211, "correlation": 22628 } }, { "ph": "s", "id": 22628, "pid": 76337, "tid": -914061504, "ts": 1716454217618252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217647495, "dur": 10, "args": { "External id": 22636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22636, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22636, "pid": 5, "tid": 7, "ts": 1716454217647495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618281, "dur": 8, "args": { "External id": 22636, "cbid": 211, "correlation": 22636 } }, { "ph": "s", "id": 22636, "pid": 76337, "tid": -914061504, "ts": 1716454217618281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217647506, "dur": 19, "args": { "External id": 22656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22656, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 22656, "pid": 5, "tid": 7, "ts": 1716454217647506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618363, "dur": 12, "args": { "External id": 22656, "cbid": 211, "correlation": 22656 } }, { "ph": "s", "id": 22656, "pid": 76337, "tid": -914061504, "ts": 1716454217618363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217647526, "dur": 4, "args": { "External id": 22668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22668, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 22668, "pid": 5, "tid": 7, "ts": 1716454217647526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618384, "dur": 6, "args": { "External id": 22668, "cbid": 211, "correlation": 22668 } }, { "ph": "s", "id": 22668, "pid": 76337, "tid": -914061504, "ts": 1716454217618384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217647532, "dur": 17, "args": { "External id": 22671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22671, "pid": 5, "tid": 7, "ts": 1716454217647532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618402, "dur": 7, "args": { "External id": 22671, "cbid": 211, "correlation": 22671 } }, { "ph": "s", "id": 22671, "pid": 76337, "tid": -914061504, "ts": 1716454217618402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217618471, "dur": 0, "args": { "External id": 22682, "cbid": 317, "correlation": 22682 } }, { "ph": "f", "id": 22682, "pid": 76337, "tid": -914061504, "ts": 1716454217618471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217618472, "dur": 0, "args": { "External id": 22683, "cbid": 203, "correlation": 22683 } }, { "ph": "f", "id": 22683, "pid": 76337, "tid": -914061504, "ts": 1716454217618472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217618473, "dur": 0, "args": { "External id": 22684, "cbid": 205, "correlation": 22684 } }, { "ph": "f", "id": 22684, "pid": 76337, "tid": -914061504, "ts": 1716454217618473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217647550, "dur": 11, "args": { "External id": 22688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22688, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22688, "pid": 5, "tid": 7, "ts": 1716454217647550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618489, "dur": 13, "args": { "External id": 22688, "cbid": 211, "correlation": 22688 } }, { "ph": "s", "id": 22688, "pid": 76337, "tid": -914061504, "ts": 1716454217618489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217647562, "dur": 3, "args": { "External id": 22690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22690, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22690, "pid": 5, "tid": 7, "ts": 1716454217647562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618507, "dur": 6, "args": { "External id": 22690, "cbid": 211, "correlation": 22690 } }, { "ph": "s", "id": 22690, "pid": 76337, "tid": -914061504, "ts": 1716454217618507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217618517, "dur": 0, "args": { "External id": 22691, "cbid": 51, "correlation": 22691 } }, { "ph": "s", "id": 22691, "pid": 76337, "tid": -914061504, "ts": 1716454217618517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217647567, "dur": 96, "args": { "External id": 22692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22692, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 22692, "pid": 5, "tid": 7, "ts": 1716454217647567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618518, "dur": 7, "args": { "External id": 22692, "cbid": 211, "correlation": 22692 } }, { "ph": "s", "id": 22692, "pid": 76337, "tid": -914061504, "ts": 1716454217618518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217647664, "dur": 14, "args": { "External id": 22697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22697, "pid": 5, "tid": 7, "ts": 1716454217647664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618548, "dur": 8, "args": { "External id": 22697, "cbid": 211, "correlation": 22697 } }, { "ph": "s", "id": 22697, "pid": 76337, "tid": -914061504, "ts": 1716454217618548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217647679, "dur": 81, "args": { "External id": 22706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22706, "pid": 5, "tid": 7, "ts": 1716454217647679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618632, "dur": 14, "args": { "External id": 22706, "cbid": 211, "correlation": 22706 } }, { "ph": "s", "id": 22706, "pid": 76337, "tid": -914061504, "ts": 1716454217618632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217647762, "dur": 29, "args": { "External id": 22728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22728, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22728, "pid": 5, "tid": 7, "ts": 1716454217647762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618700, "dur": 12, "args": { "External id": 22728, "cbid": 211, "correlation": 22728 } }, { "ph": "s", "id": 22728, "pid": 76337, "tid": -914061504, "ts": 1716454217618700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217618804, "dur": 2, "args": { "External id": 22739, "cbid": 251, "correlation": 22739 } }, { "ph": "f", "id": 22739, "pid": 76337, "tid": -914061504, "ts": 1716454217618804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217647792, "dur": 162, "args": { "External id": 22740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22740, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22740, "pid": 5, "tid": 7, "ts": 1716454217647792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618811, "dur": 14, "args": { "External id": 22740, "cbid": 211, "correlation": 22740 } }, { "ph": "s", "id": 22740, "pid": 76337, "tid": -914061504, "ts": 1716454217618811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217618883, "dur": 1, "args": { "External id": 22751, "cbid": 251, "correlation": 22751 } }, { "ph": "f", "id": 22751, "pid": 76337, "tid": -914061504, "ts": 1716454217618883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217647956, "dur": 157, "args": { "External id": 22752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22752, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22752, "pid": 5, "tid": 7, "ts": 1716454217647956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618887, "dur": 11, "args": { "External id": 22752, "cbid": 211, "correlation": 22752 } }, { "ph": "s", "id": 22752, "pid": 76337, "tid": -914061504, "ts": 1716454217618887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217618953, "dur": 1, "args": { "External id": 22763, "cbid": 251, "correlation": 22763 } }, { "ph": "f", "id": 22763, "pid": 76337, "tid": -914061504, "ts": 1716454217618953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217648114, "dur": 156, "args": { "External id": 22764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22764, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22764, "pid": 5, "tid": 7, "ts": 1716454217648114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217618957, "dur": 11, "args": { "External id": 22764, "cbid": 211, "correlation": 22764 } }, { "ph": "s", "id": 22764, "pid": 76337, "tid": -914061504, "ts": 1716454217618957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217648272, "dur": 329, "args": { "External id": 22789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22789, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22789, "pid": 5, "tid": 7, "ts": 1716454217648272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619050, "dur": 14, "args": { "External id": 22789, "cbid": 211, "correlation": 22789 } }, { "ph": "s", "id": 22789, "pid": 76337, "tid": -914061504, "ts": 1716454217619050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619171, "dur": 1, "args": { "External id": 22807, "cbid": 251, "correlation": 22807 } }, { "ph": "f", "id": 22807, "pid": 76337, "tid": -914061504, "ts": 1716454217619171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217648602, "dur": 163, "args": { "External id": 22809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22809, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22809, "pid": 5, "tid": 7, "ts": 1716454217648602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619177, "dur": 14, "args": { "External id": 22809, "cbid": 211, "correlation": 22809 } }, { "ph": "s", "id": 22809, "pid": 76337, "tid": -914061504, "ts": 1716454217619177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217648767, "dur": 19, "args": { "External id": 22817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22817, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22817, "pid": 5, "tid": 7, "ts": 1716454217648767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619249, "dur": 12, "args": { "External id": 22817, "cbid": 211, "correlation": 22817 } }, { "ph": "s", "id": 22817, "pid": 76337, "tid": -914061504, "ts": 1716454217619249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217648787, "dur": 28, "args": { "External id": 22825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22825, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22825, "pid": 5, "tid": 7, "ts": 1716454217648787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619288, "dur": 8, "args": { "External id": 22825, "cbid": 211, "correlation": 22825 } }, { "ph": "s", "id": 22825, "pid": 76337, "tid": -914061504, "ts": 1716454217619288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217648817, "dur": 18, "args": { "External id": 22836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22836, "pid": 5, "tid": 7, "ts": 1716454217648817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619360, "dur": 12, "args": { "External id": 22836, "cbid": 211, "correlation": 22836 } }, { "ph": "s", "id": 22836, "pid": 76337, "tid": -914061504, "ts": 1716454217619360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217648836, "dur": 16, "args": { "External id": 22858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22858, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22858, "pid": 5, "tid": 7, "ts": 1716454217648836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619390, "dur": 8, "args": { "External id": 22858, "cbid": 211, "correlation": 22858 } }, { "ph": "s", "id": 22858, "pid": 76337, "tid": -914061504, "ts": 1716454217619390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619489, "dur": 2, "args": { "External id": 22869, "cbid": 251, "correlation": 22869 } }, { "ph": "f", "id": 22869, "pid": 76337, "tid": -914061504, "ts": 1716454217619489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217648853, "dur": 88, "args": { "External id": 22870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22870, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22870, "pid": 5, "tid": 7, "ts": 1716454217648853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619495, "dur": 16, "args": { "External id": 22870, "cbid": 211, "correlation": 22870 } }, { "ph": "s", "id": 22870, "pid": 76337, "tid": -914061504, "ts": 1716454217619495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619571, "dur": 1, "args": { "External id": 22881, "cbid": 251, "correlation": 22881 } }, { "ph": "f", "id": 22881, "pid": 76337, "tid": -914061504, "ts": 1716454217619571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619575, "dur": 0, "args": { "External id": 22882, "cbid": 251, "correlation": 22882 } }, { "ph": "f", "id": 22882, "pid": 76337, "tid": -914061504, "ts": 1716454217619575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217648943, "dur": 12, "args": { "External id": 22883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22883, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22883, "pid": 5, "tid": 7, "ts": 1716454217648943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619576, "dur": 12, "args": { "External id": 22883, "cbid": 211, "correlation": 22883 } }, { "ph": "s", "id": 22883, "pid": 76337, "tid": -914061504, "ts": 1716454217619576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217648956, "dur": 6, "args": { "External id": 22885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22885, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22885, "pid": 5, "tid": 7, "ts": 1716454217648956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619590, "dur": 6, "args": { "External id": 22885, "cbid": 211, "correlation": 22885 } }, { "ph": "s", "id": 22885, "pid": 76337, "tid": -914061504, "ts": 1716454217619590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619660, "dur": 1, "args": { "External id": 22896, "cbid": 251, "correlation": 22896 } }, { "ph": "f", "id": 22896, "pid": 76337, "tid": -914061504, "ts": 1716454217619660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619663, "dur": 0, "args": { "External id": 22897, "cbid": 251, "correlation": 22897 } }, { "ph": "f", "id": 22897, "pid": 76337, "tid": -914061504, "ts": 1716454217619663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217648963, "dur": 8, "args": { "External id": 22898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22898, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22898, "pid": 5, "tid": 7, "ts": 1716454217648963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619665, "dur": 13, "args": { "External id": 22898, "cbid": 211, "correlation": 22898 } }, { "ph": "s", "id": 22898, "pid": 76337, "tid": -914061504, "ts": 1716454217619665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217648973, "dur": 3, "args": { "External id": 22900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22900, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22900, "pid": 5, "tid": 7, "ts": 1716454217648973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619679, "dur": 6, "args": { "External id": 22900, "cbid": 211, "correlation": 22900 } }, { "ph": "s", "id": 22900, "pid": 76337, "tid": -914061504, "ts": 1716454217619679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217648977, "dur": 54, "args": { "External id": 22925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22925, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 22925, "pid": 5, "tid": 7, "ts": 1716454217648977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619757, "dur": 12, "args": { "External id": 22925, "cbid": 211, "correlation": 22925 } }, { "ph": "s", "id": 22925, "pid": 76337, "tid": -914061504, "ts": 1716454217619757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217619877, "dur": 2, "args": { "External id": 22943, "cbid": 251, "correlation": 22943 } }, { "ph": "f", "id": 22943, "pid": 76337, "tid": -914061504, "ts": 1716454217619877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217649033, "dur": 90, "args": { "External id": 22945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22945, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 22945, "pid": 5, "tid": 7, "ts": 1716454217649033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619883, "dur": 16, "args": { "External id": 22945, "cbid": 211, "correlation": 22945 } }, { "ph": "s", "id": 22945, "pid": 76337, "tid": -914061504, "ts": 1716454217619883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217649124, "dur": 10, "args": { "External id": 22953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22953, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22953, "pid": 5, "tid": 7, "ts": 1716454217649124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217619958, "dur": 12, "args": { "External id": 22953, "cbid": 211, "correlation": 22953 } }, { "ph": "s", "id": 22953, "pid": 76337, "tid": -914061504, "ts": 1716454217619958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217649135, "dur": 21, "args": { "External id": 22961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22961, "pid": 5, "tid": 7, "ts": 1716454217649135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620008, "dur": 10, "args": { "External id": 22961, "cbid": 211, "correlation": 22961 } }, { "ph": "s", "id": 22961, "pid": 76337, "tid": -914061504, "ts": 1716454217620008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217649157, "dur": 18, "args": { "External id": 22983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 22983, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 22983, "pid": 5, "tid": 7, "ts": 1716454217649157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620060, "dur": 10, "args": { "External id": 22983, "cbid": 211, "correlation": 22983 } }, { "ph": "s", "id": 22983, "pid": 76337, "tid": -914061504, "ts": 1716454217620060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217620151, "dur": 2, "args": { "External id": 22999, "cbid": 251, "correlation": 22999 } }, { "ph": "f", "id": 22999, "pid": 76337, "tid": -914061504, "ts": 1716454217620151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217620156, "dur": 0, "args": { "External id": 23001, "cbid": 251, "correlation": 23001 } }, { "ph": "f", "id": 23001, "pid": 76337, "tid": -914061504, "ts": 1716454217620156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217649176, "dur": 489, "args": { "External id": 23002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23002, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23002, "pid": 5, "tid": 7, "ts": 1716454217649176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620159, "dur": 19, "args": { "External id": 23002, "cbid": 211, "correlation": 23002 } }, { "ph": "s", "id": 23002, "pid": 76337, "tid": -914061504, "ts": 1716454217620159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217649666, "dur": 66, "args": { "External id": 23010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23010, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23010, "pid": 5, "tid": 7, "ts": 1716454217649666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620232, "dur": 13, "args": { "External id": 23010, "cbid": 211, "correlation": 23010 } }, { "ph": "s", "id": 23010, "pid": 76337, "tid": -914061504, "ts": 1716454217620232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217649733, "dur": 68, "args": { "External id": 23018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23018, "pid": 5, "tid": 7, "ts": 1716454217649733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620263, "dur": 8, "args": { "External id": 23018, "cbid": 211, "correlation": 23018 } }, { "ph": "s", "id": 23018, "pid": 76337, "tid": -914061504, "ts": 1716454217620263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217620355, "dur": 1, "args": { "External id": 23034, "cbid": 251, "correlation": 23034 } }, { "ph": "f", "id": 23034, "pid": 76337, "tid": -914061504, "ts": 1716454217620355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217649803, "dur": 1, "args": { "External id": 23036, "device": 5, "context": 1, "stream": 7, "correlation": 23036, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 23036, "pid": 5, "tid": 7, "ts": 1716454217649803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217620360, "dur": 12, "args": { "External id": 23036, "cbid": 51, "correlation": 23036 } }, { "ph": "s", "id": 23036, "pid": 76337, "tid": -914061504, "ts": 1716454217620360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217649806, "dur": 267, "args": { "External id": 23037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23037, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23037, "pid": 5, "tid": 7, "ts": 1716454217649806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620374, "dur": 11, "args": { "External id": 23037, "cbid": 211, "correlation": 23037 } }, { "ph": "s", "id": 23037, "pid": 76337, "tid": -914061504, "ts": 1716454217620374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217650075, "dur": 14, "args": { "External id": 23045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23045, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23045, "pid": 5, "tid": 7, "ts": 1716454217650075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620417, "dur": 11, "args": { "External id": 23045, "cbid": 211, "correlation": 23045 } }, { "ph": "s", "id": 23045, "pid": 76337, "tid": -914061504, "ts": 1716454217620417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217650090, "dur": 38, "args": { "External id": 23056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23056, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23056, "pid": 5, "tid": 7, "ts": 1716454217650090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620485, "dur": 12, "args": { "External id": 23056, "cbid": 211, "correlation": 23056 } }, { "ph": "s", "id": 23056, "pid": 76337, "tid": -914061504, "ts": 1716454217620485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217620564, "dur": 0, "args": { "External id": 23068, "cbid": 317, "correlation": 23068 } }, { "ph": "f", "id": 23068, "pid": 76337, "tid": -914061504, "ts": 1716454217620564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217620564, "dur": 0, "args": { "External id": 23069, "cbid": 203, "correlation": 23069 } }, { "ph": "f", "id": 23069, "pid": 76337, "tid": -914061504, "ts": 1716454217620564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217620565, "dur": 0, "args": { "External id": 23070, "cbid": 205, "correlation": 23070 } }, { "ph": "f", "id": 23070, "pid": 76337, "tid": -914061504, "ts": 1716454217620565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217650129, "dur": 13, "args": { "External id": 23074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23074, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23074, "pid": 5, "tid": 7, "ts": 1716454217650129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620586, "dur": 14, "args": { "External id": 23074, "cbid": 211, "correlation": 23074 } }, { "ph": "s", "id": 23074, "pid": 76337, "tid": -914061504, "ts": 1716454217620586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217650144, "dur": 4, "args": { "External id": 23076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23076, "pid": 5, "tid": 7, "ts": 1716454217650144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620604, "dur": 6, "args": { "External id": 23076, "cbid": 211, "correlation": 23076 } }, { "ph": "s", "id": 23076, "pid": 76337, "tid": -914061504, "ts": 1716454217620604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217620613, "dur": 0, "args": { "External id": 23077, "cbid": 51, "correlation": 23077 } }, { "ph": "s", "id": 23077, "pid": 76337, "tid": -914061504, "ts": 1716454217620613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217650149, "dur": 96, "args": { "External id": 23078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23078, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 23078, "pid": 5, "tid": 7, "ts": 1716454217650149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620614, "dur": 5, "args": { "External id": 23078, "cbid": 211, "correlation": 23078 } }, { "ph": "s", "id": 23078, "pid": 76337, "tid": -914061504, "ts": 1716454217620614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217650246, "dur": 16, "args": { "External id": 23083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23083, "pid": 5, "tid": 7, "ts": 1716454217650246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620642, "dur": 9, "args": { "External id": 23083, "cbid": 211, "correlation": 23083 } }, { "ph": "s", "id": 23083, "pid": 76337, "tid": -914061504, "ts": 1716454217620642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217650264, "dur": 12, "args": { "External id": 23091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23091, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23091, "pid": 5, "tid": 7, "ts": 1716454217650264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620675, "dur": 8, "args": { "External id": 23091, "cbid": 211, "correlation": 23091 } }, { "ph": "s", "id": 23091, "pid": 76337, "tid": -914061504, "ts": 1716454217620675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217650277, "dur": 17, "args": { "External id": 23111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23111, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23111, "pid": 5, "tid": 7, "ts": 1716454217650277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620748, "dur": 12, "args": { "External id": 23111, "cbid": 211, "correlation": 23111 } }, { "ph": "s", "id": 23111, "pid": 76337, "tid": -914061504, "ts": 1716454217620748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217650295, "dur": 4, "args": { "External id": 23123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23123, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 23123, "pid": 5, "tid": 7, "ts": 1716454217650295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620780, "dur": 8, "args": { "External id": 23123, "cbid": 211, "correlation": 23123 } }, { "ph": "s", "id": 23123, "pid": 76337, "tid": -914061504, "ts": 1716454217620780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217650301, "dur": 17, "args": { "External id": 23126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23126, "pid": 5, "tid": 7, "ts": 1716454217650301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620800, "dur": 7, "args": { "External id": 23126, "cbid": 211, "correlation": 23126 } }, { "ph": "s", "id": 23126, "pid": 76337, "tid": -914061504, "ts": 1716454217620800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217650319, "dur": 12, "args": { "External id": 23135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23135, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23135, "pid": 5, "tid": 7, "ts": 1716454217650319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620841, "dur": 10, "args": { "External id": 23135, "cbid": 211, "correlation": 23135 } }, { "ph": "s", "id": 23135, "pid": 76337, "tid": -914061504, "ts": 1716454217620841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217620893, "dur": 0, "args": { "External id": 23145, "cbid": 317, "correlation": 23145 } }, { "ph": "f", "id": 23145, "pid": 76337, "tid": -914061504, "ts": 1716454217620893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217620894, "dur": 0, "args": { "External id": 23146, "cbid": 203, "correlation": 23146 } }, { "ph": "f", "id": 23146, "pid": 76337, "tid": -914061504, "ts": 1716454217620894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217620894, "dur": 0, "args": { "External id": 23147, "cbid": 205, "correlation": 23147 } }, { "ph": "f", "id": 23147, "pid": 76337, "tid": -914061504, "ts": 1716454217620894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217650333, "dur": 11, "args": { "External id": 23151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23151, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23151, "pid": 5, "tid": 7, "ts": 1716454217650333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620918, "dur": 12, "args": { "External id": 23151, "cbid": 211, "correlation": 23151 } }, { "ph": "s", "id": 23151, "pid": 76337, "tid": -914061504, "ts": 1716454217620918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217650345, "dur": 161, "args": { "External id": 23153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23153, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23153, "pid": 5, "tid": 7, "ts": 1716454217650345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620933, "dur": 5, "args": { "External id": 23153, "cbid": 211, "correlation": 23153 } }, { "ph": "s", "id": 23153, "pid": 76337, "tid": -914061504, "ts": 1716454217620933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217650509, "dur": 1, "args": { "External id": 23155, "device": 5, "context": 1, "stream": 7, "correlation": 23155, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 23155, "pid": 5, "tid": 7, "ts": 1716454217650509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217620945, "dur": 8, "args": { "External id": 23155, "cbid": 51, "correlation": 23155 } }, { "ph": "s", "id": 23155, "pid": 76337, "tid": -914061504, "ts": 1716454217620945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217650512, "dur": 652, "args": { "External id": 23156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23156, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23156, "pid": 5, "tid": 7, "ts": 1716454217650512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620954, "dur": 7, "args": { "External id": 23156, "cbid": 211, "correlation": 23156 } }, { "ph": "s", "id": 23156, "pid": 76337, "tid": -914061504, "ts": 1716454217620954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217651166, "dur": 13, "args": { "External id": 23158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23158, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23158, "pid": 5, "tid": 7, "ts": 1716454217651166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217620964, "dur": 5, "args": { "External id": 23158, "cbid": 211, "correlation": 23158 } }, { "ph": "s", "id": 23158, "pid": 76337, "tid": -914061504, "ts": 1716454217620964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217651180, "dur": 14, "args": { "External id": 23164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23164, "pid": 5, "tid": 7, "ts": 1716454217651180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621013, "dur": 11, "args": { "External id": 23164, "cbid": 211, "correlation": 23164 } }, { "ph": "s", "id": 23164, "pid": 76337, "tid": -914061504, "ts": 1716454217621013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217651196, "dur": 3, "args": { "External id": 23172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23172, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 23172, "pid": 5, "tid": 7, "ts": 1716454217651196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621060, "dur": 10, "args": { "External id": 23172, "cbid": 211, "correlation": 23172 } }, { "ph": "s", "id": 23172, "pid": 76337, "tid": -914061504, "ts": 1716454217621060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217621135, "dur": 1, "args": { "External id": 23188, "cbid": 251, "correlation": 23188 } }, { "ph": "f", "id": 23188, "pid": 76337, "tid": -914061504, "ts": 1716454217621135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217621140, "dur": 0, "args": { "External id": 23190, "cbid": 251, "correlation": 23190 } }, { "ph": "f", "id": 23190, "pid": 76337, "tid": -914061504, "ts": 1716454217621140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217651200, "dur": 14, "args": { "External id": 23191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23191, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23191, "pid": 5, "tid": 7, "ts": 1716454217651200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621142, "dur": 12, "args": { "External id": 23191, "cbid": 211, "correlation": 23191 } }, { "ph": "s", "id": 23191, "pid": 76337, "tid": -914061504, "ts": 1716454217621142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217651215, "dur": 5, "args": { "External id": 23193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23193, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23193, "pid": 5, "tid": 7, "ts": 1716454217651215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621156, "dur": 6, "args": { "External id": 23193, "cbid": 211, "correlation": 23193 } }, { "ph": "s", "id": 23193, "pid": 76337, "tid": -914061504, "ts": 1716454217621156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217651222, "dur": 17, "args": { "External id": 23203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23203, "pid": 5, "tid": 7, "ts": 1716454217651222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621216, "dur": 12, "args": { "External id": 23203, "cbid": 211, "correlation": 23203 } }, { "ph": "s", "id": 23203, "pid": 76337, "tid": -914061504, "ts": 1716454217621216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217651240, "dur": 19, "args": { "External id": 23223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23223, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23223, "pid": 5, "tid": 7, "ts": 1716454217651240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621281, "dur": 11, "args": { "External id": 23223, "cbid": 211, "correlation": 23223 } }, { "ph": "s", "id": 23223, "pid": 76337, "tid": -914061504, "ts": 1716454217621281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217651260, "dur": 4, "args": { "External id": 23235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23235, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 23235, "pid": 5, "tid": 7, "ts": 1716454217651260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621303, "dur": 6, "args": { "External id": 23235, "cbid": 211, "correlation": 23235 } }, { "ph": "s", "id": 23235, "pid": 76337, "tid": -914061504, "ts": 1716454217621303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217651266, "dur": 16, "args": { "External id": 23238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23238, "pid": 5, "tid": 7, "ts": 1716454217651266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621321, "dur": 6, "args": { "External id": 23238, "cbid": 211, "correlation": 23238 } }, { "ph": "s", "id": 23238, "pid": 76337, "tid": -914061504, "ts": 1716454217621321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217651283, "dur": 10, "args": { "External id": 23247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23247, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23247, "pid": 5, "tid": 7, "ts": 1716454217651283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621361, "dur": 10, "args": { "External id": 23247, "cbid": 211, "correlation": 23247 } }, { "ph": "s", "id": 23247, "pid": 76337, "tid": -914061504, "ts": 1716454217621361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217621424, "dur": 0, "args": { "External id": 23257, "cbid": 317, "correlation": 23257 } }, { "ph": "f", "id": 23257, "pid": 76337, "tid": -914061504, "ts": 1716454217621424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217621424, "dur": 0, "args": { "External id": 23258, "cbid": 203, "correlation": 23258 } }, { "ph": "f", "id": 23258, "pid": 76337, "tid": -914061504, "ts": 1716454217621424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217621425, "dur": 0, "args": { "External id": 23259, "cbid": 205, "correlation": 23259 } }, { "ph": "f", "id": 23259, "pid": 76337, "tid": -914061504, "ts": 1716454217621425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217651295, "dur": 11, "args": { "External id": 23263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23263, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23263, "pid": 5, "tid": 7, "ts": 1716454217651295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621439, "dur": 12, "args": { "External id": 23263, "cbid": 211, "correlation": 23263 } }, { "ph": "s", "id": 23263, "pid": 76337, "tid": -914061504, "ts": 1716454217621439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217651307, "dur": 161, "args": { "External id": 23265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23265, "pid": 5, "tid": 7, "ts": 1716454217651307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621454, "dur": 5, "args": { "External id": 23265, "cbid": 211, "correlation": 23265 } }, { "ph": "s", "id": 23265, "pid": 76337, "tid": -914061504, "ts": 1716454217621454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217651470, "dur": 1, "args": { "External id": 23267, "device": 5, "context": 1, "stream": 7, "correlation": 23267, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 23267, "pid": 5, "tid": 7, "ts": 1716454217651470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217621465, "dur": 6, "args": { "External id": 23267, "cbid": 51, "correlation": 23267 } }, { "ph": "s", "id": 23267, "pid": 76337, "tid": -914061504, "ts": 1716454217621465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217651474, "dur": 637, "args": { "External id": 23268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23268, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23268, "pid": 5, "tid": 7, "ts": 1716454217651474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621473, "dur": 6, "args": { "External id": 23268, "cbid": 211, "correlation": 23268 } }, { "ph": "s", "id": 23268, "pid": 76337, "tid": -914061504, "ts": 1716454217621473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217652112, "dur": 12, "args": { "External id": 23270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23270, "pid": 5, "tid": 7, "ts": 1716454217652112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621482, "dur": 5, "args": { "External id": 23270, "cbid": 211, "correlation": 23270 } }, { "ph": "s", "id": 23270, "pid": 76337, "tid": -914061504, "ts": 1716454217621482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217652126, "dur": 14, "args": { "External id": 23276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23276, "pid": 5, "tid": 7, "ts": 1716454217652126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621522, "dur": 11, "args": { "External id": 23276, "cbid": 211, "correlation": 23276 } }, { "ph": "s", "id": 23276, "pid": 76337, "tid": -914061504, "ts": 1716454217621522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217652141, "dur": 12, "args": { "External id": 23284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23284, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23284, "pid": 5, "tid": 7, "ts": 1716454217652141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621557, "dur": 8, "args": { "External id": 23284, "cbid": 211, "correlation": 23284 } }, { "ph": "s", "id": 23284, "pid": 76337, "tid": -914061504, "ts": 1716454217621557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217652154, "dur": 10, "args": { "External id": 23292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23292, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23292, "pid": 5, "tid": 7, "ts": 1716454217652154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621587, "dur": 9, "args": { "External id": 23292, "cbid": 211, "correlation": 23292 } }, { "ph": "s", "id": 23292, "pid": 76337, "tid": -914061504, "ts": 1716454217621587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217652166, "dur": 19, "args": { "External id": 23312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23312, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23312, "pid": 5, "tid": 7, "ts": 1716454217652166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621666, "dur": 12, "args": { "External id": 23312, "cbid": 211, "correlation": 23312 } }, { "ph": "s", "id": 23312, "pid": 76337, "tid": -914061504, "ts": 1716454217621666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217652186, "dur": 4, "args": { "External id": 23324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23324, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 23324, "pid": 5, "tid": 7, "ts": 1716454217652186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621689, "dur": 6, "args": { "External id": 23324, "cbid": 211, "correlation": 23324 } }, { "ph": "s", "id": 23324, "pid": 76337, "tid": -914061504, "ts": 1716454217621689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217652192, "dur": 16, "args": { "External id": 23327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23327, "pid": 5, "tid": 7, "ts": 1716454217652192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621707, "dur": 6, "args": { "External id": 23327, "cbid": 211, "correlation": 23327 } }, { "ph": "s", "id": 23327, "pid": 76337, "tid": -914061504, "ts": 1716454217621707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217621763, "dur": 0, "args": { "External id": 23338, "cbid": 317, "correlation": 23338 } }, { "ph": "f", "id": 23338, "pid": 76337, "tid": -914061504, "ts": 1716454217621763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217621764, "dur": 0, "args": { "External id": 23339, "cbid": 203, "correlation": 23339 } }, { "ph": "f", "id": 23339, "pid": 76337, "tid": -914061504, "ts": 1716454217621764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217621765, "dur": 0, "args": { "External id": 23340, "cbid": 205, "correlation": 23340 } }, { "ph": "f", "id": 23340, "pid": 76337, "tid": -914061504, "ts": 1716454217621765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217652209, "dur": 10, "args": { "External id": 23344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23344, "pid": 5, "tid": 7, "ts": 1716454217652209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621778, "dur": 11, "args": { "External id": 23344, "cbid": 211, "correlation": 23344 } }, { "ph": "s", "id": 23344, "pid": 76337, "tid": -914061504, "ts": 1716454217621778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217652221, "dur": 4, "args": { "External id": 23346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23346, "pid": 5, "tid": 7, "ts": 1716454217652221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621794, "dur": 6, "args": { "External id": 23346, "cbid": 211, "correlation": 23346 } }, { "ph": "s", "id": 23346, "pid": 76337, "tid": -914061504, "ts": 1716454217621794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217621803, "dur": 0, "args": { "External id": 23347, "cbid": 51, "correlation": 23347 } }, { "ph": "s", "id": 23347, "pid": 76337, "tid": -914061504, "ts": 1716454217621803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217652226, "dur": 92, "args": { "External id": 23348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23348, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 23348, "pid": 5, "tid": 7, "ts": 1716454217652226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621803, "dur": 5, "args": { "External id": 23348, "cbid": 211, "correlation": 23348 } }, { "ph": "s", "id": 23348, "pid": 76337, "tid": -914061504, "ts": 1716454217621803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217652320, "dur": 15, "args": { "External id": 23353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23353, "pid": 5, "tid": 7, "ts": 1716454217652320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621831, "dur": 8, "args": { "External id": 23353, "cbid": 211, "correlation": 23353 } }, { "ph": "s", "id": 23353, "pid": 76337, "tid": -914061504, "ts": 1716454217621831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217652336, "dur": 83, "args": { "External id": 23362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23362, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23362, "pid": 5, "tid": 7, "ts": 1716454217652336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621913, "dur": 14, "args": { "External id": 23362, "cbid": 211, "correlation": 23362 } }, { "ph": "s", "id": 23362, "pid": 76337, "tid": -914061504, "ts": 1716454217621913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217652421, "dur": 30, "args": { "External id": 23384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23384, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23384, "pid": 5, "tid": 7, "ts": 1716454217652421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217621990, "dur": 13, "args": { "External id": 23384, "cbid": 211, "correlation": 23384 } }, { "ph": "s", "id": 23384, "pid": 76337, "tid": -914061504, "ts": 1716454217621990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622094, "dur": 1, "args": { "External id": 23395, "cbid": 251, "correlation": 23395 } }, { "ph": "f", "id": 23395, "pid": 76337, "tid": -914061504, "ts": 1716454217622094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217652452, "dur": 161, "args": { "External id": 23396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23396, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23396, "pid": 5, "tid": 7, "ts": 1716454217652452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622099, "dur": 15, "args": { "External id": 23396, "cbid": 211, "correlation": 23396 } }, { "ph": "s", "id": 23396, "pid": 76337, "tid": -914061504, "ts": 1716454217622099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622183, "dur": 1, "args": { "External id": 23407, "cbid": 251, "correlation": 23407 } }, { "ph": "f", "id": 23407, "pid": 76337, "tid": -914061504, "ts": 1716454217622183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217652614, "dur": 156, "args": { "External id": 23408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23408, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23408, "pid": 5, "tid": 7, "ts": 1716454217652614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622187, "dur": 12, "args": { "External id": 23408, "cbid": 211, "correlation": 23408 } }, { "ph": "s", "id": 23408, "pid": 76337, "tid": -914061504, "ts": 1716454217622187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622265, "dur": 1, "args": { "External id": 23419, "cbid": 251, "correlation": 23419 } }, { "ph": "f", "id": 23419, "pid": 76337, "tid": -914061504, "ts": 1716454217622265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217652772, "dur": 153, "args": { "External id": 23420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23420, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23420, "pid": 5, "tid": 7, "ts": 1716454217652772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622269, "dur": 13, "args": { "External id": 23420, "cbid": 211, "correlation": 23420 } }, { "ph": "s", "id": 23420, "pid": 76337, "tid": -914061504, "ts": 1716454217622269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217652926, "dur": 334, "args": { "External id": 23445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23445, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23445, "pid": 5, "tid": 7, "ts": 1716454217652926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622357, "dur": 12, "args": { "External id": 23445, "cbid": 211, "correlation": 23445 } }, { "ph": "s", "id": 23445, "pid": 76337, "tid": -914061504, "ts": 1716454217622357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622469, "dur": 1, "args": { "External id": 23463, "cbid": 251, "correlation": 23463 } }, { "ph": "f", "id": 23463, "pid": 76337, "tid": -914061504, "ts": 1716454217622469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217653261, "dur": 164, "args": { "External id": 23465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23465, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23465, "pid": 5, "tid": 7, "ts": 1716454217653261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622475, "dur": 14, "args": { "External id": 23465, "cbid": 211, "correlation": 23465 } }, { "ph": "s", "id": 23465, "pid": 76337, "tid": -914061504, "ts": 1716454217622475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217653426, "dur": 19, "args": { "External id": 23473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23473, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23473, "pid": 5, "tid": 7, "ts": 1716454217653426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622548, "dur": 12, "args": { "External id": 23473, "cbid": 211, "correlation": 23473 } }, { "ph": "s", "id": 23473, "pid": 76337, "tid": -914061504, "ts": 1716454217622548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217653446, "dur": 27, "args": { "External id": 23481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23481, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23481, "pid": 5, "tid": 7, "ts": 1716454217653446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622588, "dur": 9, "args": { "External id": 23481, "cbid": 211, "correlation": 23481 } }, { "ph": "s", "id": 23481, "pid": 76337, "tid": -914061504, "ts": 1716454217622588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217653475, "dur": 18, "args": { "External id": 23492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23492, "pid": 5, "tid": 7, "ts": 1716454217653475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622659, "dur": 12, "args": { "External id": 23492, "cbid": 211, "correlation": 23492 } }, { "ph": "s", "id": 23492, "pid": 76337, "tid": -914061504, "ts": 1716454217622659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217653494, "dur": 16, "args": { "External id": 23514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23514, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23514, "pid": 5, "tid": 7, "ts": 1716454217653494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622690, "dur": 7, "args": { "External id": 23514, "cbid": 211, "correlation": 23514 } }, { "ph": "s", "id": 23514, "pid": 76337, "tid": -914061504, "ts": 1716454217622690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622788, "dur": 1, "args": { "External id": 23525, "cbid": 251, "correlation": 23525 } }, { "ph": "f", "id": 23525, "pid": 76337, "tid": -914061504, "ts": 1716454217622788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217653511, "dur": 88, "args": { "External id": 23526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23526, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23526, "pid": 5, "tid": 7, "ts": 1716454217653511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622794, "dur": 15, "args": { "External id": 23526, "cbid": 211, "correlation": 23526 } }, { "ph": "s", "id": 23526, "pid": 76337, "tid": -914061504, "ts": 1716454217622794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622875, "dur": 1, "args": { "External id": 23537, "cbid": 251, "correlation": 23537 } }, { "ph": "f", "id": 23537, "pid": 76337, "tid": -914061504, "ts": 1716454217622875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622879, "dur": 0, "args": { "External id": 23538, "cbid": 251, "correlation": 23538 } }, { "ph": "f", "id": 23538, "pid": 76337, "tid": -914061504, "ts": 1716454217622879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217653600, "dur": 12, "args": { "External id": 23539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23539, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23539, "pid": 5, "tid": 7, "ts": 1716454217653600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622880, "dur": 12, "args": { "External id": 23539, "cbid": 211, "correlation": 23539 } }, { "ph": "s", "id": 23539, "pid": 76337, "tid": -914061504, "ts": 1716454217622880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217653614, "dur": 6, "args": { "External id": 23541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23541, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23541, "pid": 5, "tid": 7, "ts": 1716454217653614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622894, "dur": 6, "args": { "External id": 23541, "cbid": 211, "correlation": 23541 } }, { "ph": "s", "id": 23541, "pid": 76337, "tid": -914061504, "ts": 1716454217622894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622970, "dur": 1, "args": { "External id": 23552, "cbid": 251, "correlation": 23552 } }, { "ph": "f", "id": 23552, "pid": 76337, "tid": -914061504, "ts": 1716454217622970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217622981, "dur": 0, "args": { "External id": 23553, "cbid": 251, "correlation": 23553 } }, { "ph": "f", "id": 23553, "pid": 76337, "tid": -914061504, "ts": 1716454217622981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217653621, "dur": 8, "args": { "External id": 23554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23554, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23554, "pid": 5, "tid": 7, "ts": 1716454217653621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622983, "dur": 13, "args": { "External id": 23554, "cbid": 211, "correlation": 23554 } }, { "ph": "s", "id": 23554, "pid": 76337, "tid": -914061504, "ts": 1716454217622983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217653630, "dur": 3, "args": { "External id": 23556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23556, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23556, "pid": 5, "tid": 7, "ts": 1716454217653630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217622998, "dur": 5, "args": { "External id": 23556, "cbid": 211, "correlation": 23556 } }, { "ph": "s", "id": 23556, "pid": 76337, "tid": -914061504, "ts": 1716454217622998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217653635, "dur": 54, "args": { "External id": 23581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23581, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23581, "pid": 5, "tid": 7, "ts": 1716454217653635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623077, "dur": 12, "args": { "External id": 23581, "cbid": 211, "correlation": 23581 } }, { "ph": "s", "id": 23581, "pid": 76337, "tid": -914061504, "ts": 1716454217623077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217623188, "dur": 1, "args": { "External id": 23599, "cbid": 251, "correlation": 23599 } }, { "ph": "f", "id": 23599, "pid": 76337, "tid": -914061504, "ts": 1716454217623188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217653689, "dur": 89, "args": { "External id": 23601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23601, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23601, "pid": 5, "tid": 7, "ts": 1716454217653689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623194, "dur": 14, "args": { "External id": 23601, "cbid": 211, "correlation": 23601 } }, { "ph": "s", "id": 23601, "pid": 76337, "tid": -914061504, "ts": 1716454217623194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217653780, "dur": 10, "args": { "External id": 23609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23609, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23609, "pid": 5, "tid": 7, "ts": 1716454217653780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623265, "dur": 13, "args": { "External id": 23609, "cbid": 211, "correlation": 23609 } }, { "ph": "s", "id": 23609, "pid": 76337, "tid": -914061504, "ts": 1716454217623265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217653791, "dur": 22, "args": { "External id": 23617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23617, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23617, "pid": 5, "tid": 7, "ts": 1716454217653791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623307, "dur": 9, "args": { "External id": 23617, "cbid": 211, "correlation": 23617 } }, { "ph": "s", "id": 23617, "pid": 76337, "tid": -914061504, "ts": 1716454217623307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217653814, "dur": 18, "args": { "External id": 23639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23639, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23639, "pid": 5, "tid": 7, "ts": 1716454217653814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623358, "dur": 10, "args": { "External id": 23639, "cbid": 211, "correlation": 23639 } }, { "ph": "s", "id": 23639, "pid": 76337, "tid": -914061504, "ts": 1716454217623358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217623444, "dur": 1, "args": { "External id": 23655, "cbid": 251, "correlation": 23655 } }, { "ph": "f", "id": 23655, "pid": 76337, "tid": -914061504, "ts": 1716454217623444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217623449, "dur": 0, "args": { "External id": 23657, "cbid": 251, "correlation": 23657 } }, { "ph": "f", "id": 23657, "pid": 76337, "tid": -914061504, "ts": 1716454217623449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217653833, "dur": 492, "args": { "External id": 23658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23658, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23658, "pid": 5, "tid": 7, "ts": 1716454217653833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623452, "dur": 13, "args": { "External id": 23658, "cbid": 211, "correlation": 23658 } }, { "ph": "s", "id": 23658, "pid": 76337, "tid": -914061504, "ts": 1716454217623452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217654327, "dur": 65, "args": { "External id": 23666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23666, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23666, "pid": 5, "tid": 7, "ts": 1716454217654327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623518, "dur": 13, "args": { "External id": 23666, "cbid": 211, "correlation": 23666 } }, { "ph": "s", "id": 23666, "pid": 76337, "tid": -914061504, "ts": 1716454217623518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217654394, "dur": 66, "args": { "External id": 23674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23674, "pid": 5, "tid": 7, "ts": 1716454217654394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623548, "dur": 8, "args": { "External id": 23674, "cbid": 211, "correlation": 23674 } }, { "ph": "s", "id": 23674, "pid": 76337, "tid": -914061504, "ts": 1716454217623548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217623628, "dur": 1, "args": { "External id": 23690, "cbid": 251, "correlation": 23690 } }, { "ph": "f", "id": 23690, "pid": 76337, "tid": -914061504, "ts": 1716454217623628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217654463, "dur": 1, "args": { "External id": 23692, "device": 5, "context": 1, "stream": 7, "correlation": 23692, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 23692, "pid": 5, "tid": 7, "ts": 1716454217654463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217623633, "dur": 9, "args": { "External id": 23692, "cbid": 51, "correlation": 23692 } }, { "ph": "s", "id": 23692, "pid": 76337, "tid": -914061504, "ts": 1716454217623633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217654466, "dur": 268, "args": { "External id": 23693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23693, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23693, "pid": 5, "tid": 7, "ts": 1716454217654466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623643, "dur": 11, "args": { "External id": 23693, "cbid": 211, "correlation": 23693 } }, { "ph": "s", "id": 23693, "pid": 76337, "tid": -914061504, "ts": 1716454217623643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217654736, "dur": 13, "args": { "External id": 23701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23701, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23701, "pid": 5, "tid": 7, "ts": 1716454217654736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623686, "dur": 10, "args": { "External id": 23701, "cbid": 211, "correlation": 23701 } }, { "ph": "s", "id": 23701, "pid": 76337, "tid": -914061504, "ts": 1716454217623686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217654751, "dur": 38, "args": { "External id": 23712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23712, "pid": 5, "tid": 7, "ts": 1716454217654751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623755, "dur": 12, "args": { "External id": 23712, "cbid": 211, "correlation": 23712 } }, { "ph": "s", "id": 23712, "pid": 76337, "tid": -914061504, "ts": 1716454217623755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217623819, "dur": 0, "args": { "External id": 23724, "cbid": 317, "correlation": 23724 } }, { "ph": "f", "id": 23724, "pid": 76337, "tid": -914061504, "ts": 1716454217623819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217623820, "dur": 0, "args": { "External id": 23725, "cbid": 203, "correlation": 23725 } }, { "ph": "f", "id": 23725, "pid": 76337, "tid": -914061504, "ts": 1716454217623820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217623821, "dur": 0, "args": { "External id": 23726, "cbid": 205, "correlation": 23726 } }, { "ph": "f", "id": 23726, "pid": 76337, "tid": -914061504, "ts": 1716454217623821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217654790, "dur": 14, "args": { "External id": 23730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23730, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23730, "pid": 5, "tid": 7, "ts": 1716454217654790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623836, "dur": 12, "args": { "External id": 23730, "cbid": 211, "correlation": 23730 } }, { "ph": "s", "id": 23730, "pid": 76337, "tid": -914061504, "ts": 1716454217623836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217654805, "dur": 4, "args": { "External id": 23732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 23732, "pid": 5, "tid": 7, "ts": 1716454217654805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623852, "dur": 6, "args": { "External id": 23732, "cbid": 211, "correlation": 23732 } }, { "ph": "s", "id": 23732, "pid": 76337, "tid": -914061504, "ts": 1716454217623852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217623862, "dur": 0, "args": { "External id": 23733, "cbid": 51, "correlation": 23733 } }, { "ph": "s", "id": 23733, "pid": 76337, "tid": -914061504, "ts": 1716454217623862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217654810, "dur": 96, "args": { "External id": 23734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23734, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 23734, "pid": 5, "tid": 7, "ts": 1716454217654810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623863, "dur": 5, "args": { "External id": 23734, "cbid": 211, "correlation": 23734 } }, { "ph": "s", "id": 23734, "pid": 76337, "tid": -914061504, "ts": 1716454217623863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217654907, "dur": 16, "args": { "External id": 23739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23739, "pid": 5, "tid": 7, "ts": 1716454217654907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623889, "dur": 8, "args": { "External id": 23739, "cbid": 211, "correlation": 23739 } }, { "ph": "s", "id": 23739, "pid": 76337, "tid": -914061504, "ts": 1716454217623889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217654925, "dur": 12, "args": { "External id": 23747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23747, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23747, "pid": 5, "tid": 7, "ts": 1716454217654925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217623921, "dur": 8, "args": { "External id": 23747, "cbid": 211, "correlation": 23747 } }, { "ph": "s", "id": 23747, "pid": 76337, "tid": -914061504, "ts": 1716454217623921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217624000, "dur": 0, "args": { "External id": 23757, "cbid": 317, "correlation": 23757 } }, { "ph": "f", "id": 23757, "pid": 76337, "tid": -914061504, "ts": 1716454217624000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217624001, "dur": 0, "args": { "External id": 23758, "cbid": 203, "correlation": 23758 } }, { "ph": "f", "id": 23758, "pid": 76337, "tid": -914061504, "ts": 1716454217624001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217624001, "dur": 0, "args": { "External id": 23759, "cbid": 205, "correlation": 23759 } }, { "ph": "f", "id": 23759, "pid": 76337, "tid": -914061504, "ts": 1716454217624001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217654938, "dur": 12, "args": { "External id": 23763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23763, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23763, "pid": 5, "tid": 7, "ts": 1716454217654938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624016, "dur": 13, "args": { "External id": 23763, "cbid": 211, "correlation": 23763 } }, { "ph": "s", "id": 23763, "pid": 76337, "tid": -914061504, "ts": 1716454217624016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217654951, "dur": 161, "args": { "External id": 23765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23765, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23765, "pid": 5, "tid": 7, "ts": 1716454217654951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624032, "dur": 6, "args": { "External id": 23765, "cbid": 211, "correlation": 23765 } }, { "ph": "s", "id": 23765, "pid": 76337, "tid": -914061504, "ts": 1716454217624032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217655114, "dur": 1, "args": { "External id": 23767, "device": 5, "context": 1, "stream": 7, "correlation": 23767, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 23767, "pid": 5, "tid": 7, "ts": 1716454217655114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217624044, "dur": 6, "args": { "External id": 23767, "cbid": 51, "correlation": 23767 } }, { "ph": "s", "id": 23767, "pid": 76337, "tid": -914061504, "ts": 1716454217624044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217655118, "dur": 195, "args": { "External id": 23768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23768, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 23768, "pid": 5, "tid": 7, "ts": 1716454217655118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624061, "dur": 18, "args": { "External id": 23768, "cbid": 211, "correlation": 23768 } }, { "ph": "s", "id": 23768, "pid": 76337, "tid": -914061504, "ts": 1716454217624061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655314, "dur": 6, "args": { "External id": 23770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23770, "pid": 5, "tid": 7, "ts": 1716454217655314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624085, "dur": 7, "args": { "External id": 23770, "cbid": 211, "correlation": 23770 } }, { "ph": "s", "id": 23770, "pid": 76337, "tid": -914061504, "ts": 1716454217624085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217655321, "dur": 6, "args": { "External id": 23776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23776, "pid": 5, "tid": 7, "ts": 1716454217655321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624117, "dur": 9, "args": { "External id": 23776, "cbid": 211, "correlation": 23776 } }, { "ph": "s", "id": 23776, "pid": 76337, "tid": -914061504, "ts": 1716454217624117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217655329, "dur": 10, "args": { "External id": 23796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23796, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23796, "pid": 5, "tid": 7, "ts": 1716454217655329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624249, "dur": 14, "args": { "External id": 23796, "cbid": 211, "correlation": 23796 } }, { "ph": "s", "id": 23796, "pid": 76337, "tid": -914061504, "ts": 1716454217624249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217655340, "dur": 4, "args": { "External id": 23808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23808, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 23808, "pid": 5, "tid": 7, "ts": 1716454217655340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624273, "dur": 7, "args": { "External id": 23808, "cbid": 211, "correlation": 23808 } }, { "ph": "s", "id": 23808, "pid": 76337, "tid": -914061504, "ts": 1716454217624273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217655346, "dur": 8, "args": { "External id": 23811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23811, "pid": 5, "tid": 7, "ts": 1716454217655346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624293, "dur": 6, "args": { "External id": 23811, "cbid": 211, "correlation": 23811 } }, { "ph": "s", "id": 23811, "pid": 76337, "tid": -914061504, "ts": 1716454217624293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217655355, "dur": 5, "args": { "External id": 23820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23820, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23820, "pid": 5, "tid": 7, "ts": 1716454217655355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624333, "dur": 10, "args": { "External id": 23820, "cbid": 211, "correlation": 23820 } }, { "ph": "s", "id": 23820, "pid": 76337, "tid": -914061504, "ts": 1716454217624333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217624387, "dur": 0, "args": { "External id": 23830, "cbid": 317, "correlation": 23830 } }, { "ph": "f", "id": 23830, "pid": 76337, "tid": -914061504, "ts": 1716454217624387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217624388, "dur": 0, "args": { "External id": 23831, "cbid": 203, "correlation": 23831 } }, { "ph": "f", "id": 23831, "pid": 76337, "tid": -914061504, "ts": 1716454217624388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217624388, "dur": 0, "args": { "External id": 23832, "cbid": 205, "correlation": 23832 } }, { "ph": "f", "id": 23832, "pid": 76337, "tid": -914061504, "ts": 1716454217624388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655362, "dur": 5, "args": { "External id": 23836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23836, "pid": 5, "tid": 7, "ts": 1716454217655362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624403, "dur": 11, "args": { "External id": 23836, "cbid": 211, "correlation": 23836 } }, { "ph": "s", "id": 23836, "pid": 76337, "tid": -914061504, "ts": 1716454217624403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655368, "dur": 160, "args": { "External id": 23838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23838, "pid": 5, "tid": 7, "ts": 1716454217655368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624417, "dur": 5, "args": { "External id": 23838, "cbid": 211, "correlation": 23838 } }, { "ph": "s", "id": 23838, "pid": 76337, "tid": -914061504, "ts": 1716454217624417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217655530, "dur": 1, "args": { "External id": 23840, "device": 5, "context": 1, "stream": 7, "correlation": 23840, "bytes": 240, "memory bandwidth (GB/s)": 0.125 } }, { "ph": "f", "id": 23840, "pid": 5, "tid": 7, "ts": 1716454217655530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217624428, "dur": 7, "args": { "External id": 23840, "cbid": 51, "correlation": 23840 } }, { "ph": "s", "id": 23840, "pid": 76337, "tid": -914061504, "ts": 1716454217624428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217655534, "dur": 265, "args": { "External id": 23841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23841, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23841, "pid": 5, "tid": 7, "ts": 1716454217655534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624436, "dur": 6, "args": { "External id": 23841, "cbid": 211, "correlation": 23841 } }, { "ph": "s", "id": 23841, "pid": 76337, "tid": -914061504, "ts": 1716454217624436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655801, "dur": 6, "args": { "External id": 23843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23843, "pid": 5, "tid": 7, "ts": 1716454217655801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624446, "dur": 5, "args": { "External id": 23843, "cbid": 211, "correlation": 23843 } }, { "ph": "s", "id": 23843, "pid": 76337, "tid": -914061504, "ts": 1716454217624446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217655808, "dur": 6, "args": { "External id": 23849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23849, "pid": 5, "tid": 7, "ts": 1716454217655808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624474, "dur": 8, "args": { "External id": 23849, "cbid": 211, "correlation": 23849 } }, { "ph": "s", "id": 23849, "pid": 76337, "tid": -914061504, "ts": 1716454217624474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217655815, "dur": 3, "args": { "External id": 23857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23857, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 23857, "pid": 5, "tid": 7, "ts": 1716454217655815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624518, "dur": 9, "args": { "External id": 23857, "cbid": 211, "correlation": 23857 } }, { "ph": "s", "id": 23857, "pid": 76337, "tid": -914061504, "ts": 1716454217624518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217624600, "dur": 1, "args": { "External id": 23873, "cbid": 251, "correlation": 23873 } }, { "ph": "f", "id": 23873, "pid": 76337, "tid": -914061504, "ts": 1716454217624600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217624606, "dur": 0, "args": { "External id": 23875, "cbid": 251, "correlation": 23875 } }, { "ph": "f", "id": 23875, "pid": 76337, "tid": -914061504, "ts": 1716454217624606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217655820, "dur": 13, "args": { "External id": 23876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23876, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23876, "pid": 5, "tid": 7, "ts": 1716454217655820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624608, "dur": 13, "args": { "External id": 23876, "cbid": 211, "correlation": 23876 } }, { "ph": "s", "id": 23876, "pid": 76337, "tid": -914061504, "ts": 1716454217624608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217655834, "dur": 5, "args": { "External id": 23878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23878, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23878, "pid": 5, "tid": 7, "ts": 1716454217655834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624623, "dur": 6, "args": { "External id": 23878, "cbid": 211, "correlation": 23878 } }, { "ph": "s", "id": 23878, "pid": 76337, "tid": -914061504, "ts": 1716454217624623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217655841, "dur": 6, "args": { "External id": 23888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23888, "pid": 5, "tid": 7, "ts": 1716454217655841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624682, "dur": 12, "args": { "External id": 23888, "cbid": 211, "correlation": 23888 } }, { "ph": "s", "id": 23888, "pid": 76337, "tid": -914061504, "ts": 1716454217624682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217655847, "dur": 10, "args": { "External id": 23908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23908, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23908, "pid": 5, "tid": 7, "ts": 1716454217655847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624748, "dur": 11, "args": { "External id": 23908, "cbid": 211, "correlation": 23908 } }, { "ph": "s", "id": 23908, "pid": 76337, "tid": -914061504, "ts": 1716454217624748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217655858, "dur": 4, "args": { "External id": 23920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23920, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 23920, "pid": 5, "tid": 7, "ts": 1716454217655858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624769, "dur": 6, "args": { "External id": 23920, "cbid": 211, "correlation": 23920 } }, { "ph": "s", "id": 23920, "pid": 76337, "tid": -914061504, "ts": 1716454217624769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217655863, "dur": 7, "args": { "External id": 23923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23923, "pid": 5, "tid": 7, "ts": 1716454217655863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624788, "dur": 6, "args": { "External id": 23923, "cbid": 211, "correlation": 23923 } }, { "ph": "s", "id": 23923, "pid": 76337, "tid": -914061504, "ts": 1716454217624788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217655872, "dur": 4, "args": { "External id": 23932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23932, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23932, "pid": 5, "tid": 7, "ts": 1716454217655872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624827, "dur": 10, "args": { "External id": 23932, "cbid": 211, "correlation": 23932 } }, { "ph": "s", "id": 23932, "pid": 76337, "tid": -914061504, "ts": 1716454217624827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217624905, "dur": 0, "args": { "External id": 23942, "cbid": 317, "correlation": 23942 } }, { "ph": "f", "id": 23942, "pid": 76337, "tid": -914061504, "ts": 1716454217624905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217624906, "dur": 0, "args": { "External id": 23943, "cbid": 203, "correlation": 23943 } }, { "ph": "f", "id": 23943, "pid": 76337, "tid": -914061504, "ts": 1716454217624906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217624907, "dur": 0, "args": { "External id": 23944, "cbid": 205, "correlation": 23944 } }, { "ph": "f", "id": 23944, "pid": 76337, "tid": -914061504, "ts": 1716454217624907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655877, "dur": 5, "args": { "External id": 23948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23948, "pid": 5, "tid": 7, "ts": 1716454217655877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624920, "dur": 13, "args": { "External id": 23948, "cbid": 211, "correlation": 23948 } }, { "ph": "s", "id": 23948, "pid": 76337, "tid": -914061504, "ts": 1716454217624920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217655884, "dur": 160, "args": { "External id": 23950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23950, "pid": 5, "tid": 7, "ts": 1716454217655884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624936, "dur": 5, "args": { "External id": 23950, "cbid": 211, "correlation": 23950 } }, { "ph": "s", "id": 23950, "pid": 76337, "tid": -914061504, "ts": 1716454217624936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217656046, "dur": 1, "args": { "External id": 23952, "device": 5, "context": 1, "stream": 7, "correlation": 23952, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 23952, "pid": 5, "tid": 7, "ts": 1716454217656046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217624946, "dur": 6, "args": { "External id": 23952, "cbid": 51, "correlation": 23952 } }, { "ph": "s", "id": 23952, "pid": 76337, "tid": -914061504, "ts": 1716454217624946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217656050, "dur": 254, "args": { "External id": 23953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23953, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 23953, "pid": 5, "tid": 7, "ts": 1716454217656050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624954, "dur": 7, "args": { "External id": 23953, "cbid": 211, "correlation": 23953 } }, { "ph": "s", "id": 23953, "pid": 76337, "tid": -914061504, "ts": 1716454217624954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656305, "dur": 6, "args": { "External id": 23955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23955, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 23955, "pid": 5, "tid": 7, "ts": 1716454217656305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217624964, "dur": 5, "args": { "External id": 23955, "cbid": 211, "correlation": 23955 } }, { "ph": "s", "id": 23955, "pid": 76337, "tid": -914061504, "ts": 1716454217624964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217656313, "dur": 6, "args": { "External id": 23961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23961, "pid": 5, "tid": 7, "ts": 1716454217656313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625001, "dur": 10, "args": { "External id": 23961, "cbid": 211, "correlation": 23961 } }, { "ph": "s", "id": 23961, "pid": 76337, "tid": -914061504, "ts": 1716454217625001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217656320, "dur": 5, "args": { "External id": 23969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23969, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23969, "pid": 5, "tid": 7, "ts": 1716454217656320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625035, "dur": 8, "args": { "External id": 23969, "cbid": 211, "correlation": 23969 } }, { "ph": "s", "id": 23969, "pid": 76337, "tid": -914061504, "ts": 1716454217625035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217656327, "dur": 4, "args": { "External id": 23977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23977, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 23977, "pid": 5, "tid": 7, "ts": 1716454217656327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625064, "dur": 9, "args": { "External id": 23977, "cbid": 211, "correlation": 23977 } }, { "ph": "s", "id": 23977, "pid": 76337, "tid": -914061504, "ts": 1716454217625064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217656332, "dur": 9, "args": { "External id": 23997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 23997, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 23997, "pid": 5, "tid": 7, "ts": 1716454217656332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625139, "dur": 12, "args": { "External id": 23997, "cbid": 211, "correlation": 23997 } }, { "ph": "s", "id": 23997, "pid": 76337, "tid": -914061504, "ts": 1716454217625139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217656343, "dur": 4, "args": { "External id": 24009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24009, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24009, "pid": 5, "tid": 7, "ts": 1716454217656343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625161, "dur": 6, "args": { "External id": 24009, "cbid": 211, "correlation": 24009 } }, { "ph": "s", "id": 24009, "pid": 76337, "tid": -914061504, "ts": 1716454217625161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217656348, "dur": 6, "args": { "External id": 24012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24012, "pid": 5, "tid": 7, "ts": 1716454217656348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625179, "dur": 7, "args": { "External id": 24012, "cbid": 211, "correlation": 24012 } }, { "ph": "s", "id": 24012, "pid": 76337, "tid": -914061504, "ts": 1716454217625179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217656355, "dur": 4, "args": { "External id": 24021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24021, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24021, "pid": 5, "tid": 7, "ts": 1716454217656355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625218, "dur": 9, "args": { "External id": 24021, "cbid": 211, "correlation": 24021 } }, { "ph": "s", "id": 24021, "pid": 76337, "tid": -914061504, "ts": 1716454217625218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217625268, "dur": 0, "args": { "External id": 24031, "cbid": 317, "correlation": 24031 } }, { "ph": "f", "id": 24031, "pid": 76337, "tid": -914061504, "ts": 1716454217625268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217625269, "dur": 0, "args": { "External id": 24032, "cbid": 203, "correlation": 24032 } }, { "ph": "f", "id": 24032, "pid": 76337, "tid": -914061504, "ts": 1716454217625269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217625270, "dur": 0, "args": { "External id": 24033, "cbid": 205, "correlation": 24033 } }, { "ph": "f", "id": 24033, "pid": 76337, "tid": -914061504, "ts": 1716454217625270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656361, "dur": 5, "args": { "External id": 24037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24037, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24037, "pid": 5, "tid": 7, "ts": 1716454217656361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625283, "dur": 11, "args": { "External id": 24037, "cbid": 211, "correlation": 24037 } }, { "ph": "s", "id": 24037, "pid": 76337, "tid": -914061504, "ts": 1716454217625283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656367, "dur": 160, "args": { "External id": 24039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24039, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24039, "pid": 5, "tid": 7, "ts": 1716454217656367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625297, "dur": 6, "args": { "External id": 24039, "cbid": 211, "correlation": 24039 } }, { "ph": "s", "id": 24039, "pid": 76337, "tid": -914061504, "ts": 1716454217625297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217656529, "dur": 1, "args": { "External id": 24041, "device": 5, "context": 1, "stream": 7, "correlation": 24041, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 24041, "pid": 5, "tid": 7, "ts": 1716454217656529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217625308, "dur": 6, "args": { "External id": 24041, "cbid": 51, "correlation": 24041 } }, { "ph": "s", "id": 24041, "pid": 76337, "tid": -914061504, "ts": 1716454217625308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217656533, "dur": 254, "args": { "External id": 24042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24042, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24042, "pid": 5, "tid": 7, "ts": 1716454217656533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625315, "dur": 6, "args": { "External id": 24042, "cbid": 211, "correlation": 24042 } }, { "ph": "s", "id": 24042, "pid": 76337, "tid": -914061504, "ts": 1716454217625315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656788, "dur": 6, "args": { "External id": 24044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24044, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24044, "pid": 5, "tid": 7, "ts": 1716454217656788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625324, "dur": 5, "args": { "External id": 24044, "cbid": 211, "correlation": 24044 } }, { "ph": "s", "id": 24044, "pid": 76337, "tid": -914061504, "ts": 1716454217625324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217656795, "dur": 6, "args": { "External id": 24050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24050, "pid": 5, "tid": 7, "ts": 1716454217656795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625352, "dur": 8, "args": { "External id": 24050, "cbid": 211, "correlation": 24050 } }, { "ph": "s", "id": 24050, "pid": 76337, "tid": -914061504, "ts": 1716454217625352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217656802, "dur": 3, "args": { "External id": 24058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24058, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 24058, "pid": 5, "tid": 7, "ts": 1716454217656802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625395, "dur": 10, "args": { "External id": 24058, "cbid": 211, "correlation": 24058 } }, { "ph": "s", "id": 24058, "pid": 76337, "tid": -914061504, "ts": 1716454217625395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217625468, "dur": 1, "args": { "External id": 24074, "cbid": 251, "correlation": 24074 } }, { "ph": "f", "id": 24074, "pid": 76337, "tid": -914061504, "ts": 1716454217625468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217625473, "dur": 0, "args": { "External id": 24076, "cbid": 251, "correlation": 24076 } }, { "ph": "f", "id": 24076, "pid": 76337, "tid": -914061504, "ts": 1716454217625473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217656807, "dur": 10, "args": { "External id": 24077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24077, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24077, "pid": 5, "tid": 7, "ts": 1716454217656807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625475, "dur": 12, "args": { "External id": 24077, "cbid": 211, "correlation": 24077 } }, { "ph": "s", "id": 24077, "pid": 76337, "tid": -914061504, "ts": 1716454217625475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217656818, "dur": 4, "args": { "External id": 24079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24079, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24079, "pid": 5, "tid": 7, "ts": 1716454217656818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625489, "dur": 6, "args": { "External id": 24079, "cbid": 211, "correlation": 24079 } }, { "ph": "s", "id": 24079, "pid": 76337, "tid": -914061504, "ts": 1716454217625489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217656823, "dur": 6, "args": { "External id": 24089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24089, "pid": 5, "tid": 7, "ts": 1716454217656823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625547, "dur": 12, "args": { "External id": 24089, "cbid": 211, "correlation": 24089 } }, { "ph": "s", "id": 24089, "pid": 76337, "tid": -914061504, "ts": 1716454217625547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217656830, "dur": 9, "args": { "External id": 24109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24109, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24109, "pid": 5, "tid": 7, "ts": 1716454217656830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625612, "dur": 11, "args": { "External id": 24109, "cbid": 211, "correlation": 24109 } }, { "ph": "s", "id": 24109, "pid": 76337, "tid": -914061504, "ts": 1716454217625612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217656841, "dur": 4, "args": { "External id": 24121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24121, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24121, "pid": 5, "tid": 7, "ts": 1716454217656841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625633, "dur": 6, "args": { "External id": 24121, "cbid": 211, "correlation": 24121 } }, { "ph": "s", "id": 24121, "pid": 76337, "tid": -914061504, "ts": 1716454217625633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217656846, "dur": 7, "args": { "External id": 24124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24124, "pid": 5, "tid": 7, "ts": 1716454217656846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625650, "dur": 6, "args": { "External id": 24124, "cbid": 211, "correlation": 24124 } }, { "ph": "s", "id": 24124, "pid": 76337, "tid": -914061504, "ts": 1716454217625650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217656853, "dur": 4, "args": { "External id": 24133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24133, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24133, "pid": 5, "tid": 7, "ts": 1716454217656853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625691, "dur": 9, "args": { "External id": 24133, "cbid": 211, "correlation": 24133 } }, { "ph": "s", "id": 24133, "pid": 76337, "tid": -914061504, "ts": 1716454217625691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217625753, "dur": 0, "args": { "External id": 24143, "cbid": 317, "correlation": 24143 } }, { "ph": "f", "id": 24143, "pid": 76337, "tid": -914061504, "ts": 1716454217625753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217625754, "dur": 0, "args": { "External id": 24144, "cbid": 203, "correlation": 24144 } }, { "ph": "f", "id": 24144, "pid": 76337, "tid": -914061504, "ts": 1716454217625754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217625754, "dur": 0, "args": { "External id": 24145, "cbid": 205, "correlation": 24145 } }, { "ph": "f", "id": 24145, "pid": 76337, "tid": -914061504, "ts": 1716454217625754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656859, "dur": 5, "args": { "External id": 24149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24149, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24149, "pid": 5, "tid": 7, "ts": 1716454217656859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625768, "dur": 13, "args": { "External id": 24149, "cbid": 211, "correlation": 24149 } }, { "ph": "s", "id": 24149, "pid": 76337, "tid": -914061504, "ts": 1716454217625768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217656865, "dur": 159, "args": { "External id": 24151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24151, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24151, "pid": 5, "tid": 7, "ts": 1716454217656865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625783, "dur": 5, "args": { "External id": 24151, "cbid": 211, "correlation": 24151 } }, { "ph": "s", "id": 24151, "pid": 76337, "tid": -914061504, "ts": 1716454217625783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217657027, "dur": 1, "args": { "External id": 24153, "device": 5, "context": 1, "stream": 7, "correlation": 24153, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 24153, "pid": 5, "tid": 7, "ts": 1716454217657027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217625794, "dur": 7, "args": { "External id": 24153, "cbid": 51, "correlation": 24153 } }, { "ph": "s", "id": 24153, "pid": 76337, "tid": -914061504, "ts": 1716454217625794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217657031, "dur": 254, "args": { "External id": 24154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24154, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24154, "pid": 5, "tid": 7, "ts": 1716454217657031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625802, "dur": 6, "args": { "External id": 24154, "cbid": 211, "correlation": 24154 } }, { "ph": "s", "id": 24154, "pid": 76337, "tid": -914061504, "ts": 1716454217625802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657286, "dur": 6, "args": { "External id": 24156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24156, "pid": 5, "tid": 7, "ts": 1716454217657286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625811, "dur": 5, "args": { "External id": 24156, "cbid": 211, "correlation": 24156 } }, { "ph": "s", "id": 24156, "pid": 76337, "tid": -914061504, "ts": 1716454217625811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217657293, "dur": 6, "args": { "External id": 24162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24162, "pid": 5, "tid": 7, "ts": 1716454217657293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625840, "dur": 9, "args": { "External id": 24162, "cbid": 211, "correlation": 24162 } }, { "ph": "s", "id": 24162, "pid": 76337, "tid": -914061504, "ts": 1716454217625840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217657300, "dur": 5, "args": { "External id": 24170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24170, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24170, "pid": 5, "tid": 7, "ts": 1716454217657300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625874, "dur": 9, "args": { "External id": 24170, "cbid": 211, "correlation": 24170 } }, { "ph": "s", "id": 24170, "pid": 76337, "tid": -914061504, "ts": 1716454217625874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217657306, "dur": 4, "args": { "External id": 24178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24178, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24178, "pid": 5, "tid": 7, "ts": 1716454217657306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217625919, "dur": 10, "args": { "External id": 24178, "cbid": 211, "correlation": 24178 } }, { "ph": "s", "id": 24178, "pid": 76337, "tid": -914061504, "ts": 1716454217625919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217657312, "dur": 10, "args": { "External id": 24198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24198, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24198, "pid": 5, "tid": 7, "ts": 1716454217657312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626081, "dur": 15, "args": { "External id": 24198, "cbid": 211, "correlation": 24198 } }, { "ph": "s", "id": 24198, "pid": 76337, "tid": -914061504, "ts": 1716454217626081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217657323, "dur": 4, "args": { "External id": 24210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24210, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24210, "pid": 5, "tid": 7, "ts": 1716454217657323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626106, "dur": 6, "args": { "External id": 24210, "cbid": 211, "correlation": 24210 } }, { "ph": "s", "id": 24210, "pid": 76337, "tid": -914061504, "ts": 1716454217626106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217657328, "dur": 7, "args": { "External id": 24213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24213, "pid": 5, "tid": 7, "ts": 1716454217657328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626125, "dur": 6, "args": { "External id": 24213, "cbid": 211, "correlation": 24213 } }, { "ph": "s", "id": 24213, "pid": 76337, "tid": -914061504, "ts": 1716454217626125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217657336, "dur": 4, "args": { "External id": 24222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24222, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24222, "pid": 5, "tid": 7, "ts": 1716454217657336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626164, "dur": 10, "args": { "External id": 24222, "cbid": 211, "correlation": 24222 } }, { "ph": "s", "id": 24222, "pid": 76337, "tid": -914061504, "ts": 1716454217626164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217626236, "dur": 0, "args": { "External id": 24232, "cbid": 317, "correlation": 24232 } }, { "ph": "f", "id": 24232, "pid": 76337, "tid": -914061504, "ts": 1716454217626236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217626237, "dur": 0, "args": { "External id": 24233, "cbid": 203, "correlation": 24233 } }, { "ph": "f", "id": 24233, "pid": 76337, "tid": -914061504, "ts": 1716454217626237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217626237, "dur": 0, "args": { "External id": 24234, "cbid": 205, "correlation": 24234 } }, { "ph": "f", "id": 24234, "pid": 76337, "tid": -914061504, "ts": 1716454217626237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657342, "dur": 5, "args": { "External id": 24238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24238, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24238, "pid": 5, "tid": 7, "ts": 1716454217657342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626252, "dur": 12, "args": { "External id": 24238, "cbid": 211, "correlation": 24238 } }, { "ph": "s", "id": 24238, "pid": 76337, "tid": -914061504, "ts": 1716454217626252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657348, "dur": 159, "args": { "External id": 24240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24240, "pid": 5, "tid": 7, "ts": 1716454217657348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626267, "dur": 6, "args": { "External id": 24240, "cbid": 211, "correlation": 24240 } }, { "ph": "s", "id": 24240, "pid": 76337, "tid": -914061504, "ts": 1716454217626267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217657509, "dur": 1, "args": { "External id": 24242, "device": 5, "context": 1, "stream": 7, "correlation": 24242, "bytes": 240, "memory bandwidth (GB/s)": 0.1561483409238777 } }, { "ph": "f", "id": 24242, "pid": 5, "tid": 7, "ts": 1716454217657509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217626278, "dur": 7, "args": { "External id": 24242, "cbid": 51, "correlation": 24242 } }, { "ph": "s", "id": 24242, "pid": 76337, "tid": -914061504, "ts": 1716454217626278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217657513, "dur": 254, "args": { "External id": 24243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24243, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24243, "pid": 5, "tid": 7, "ts": 1716454217657513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626285, "dur": 6, "args": { "External id": 24243, "cbid": 211, "correlation": 24243 } }, { "ph": "s", "id": 24243, "pid": 76337, "tid": -914061504, "ts": 1716454217626285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657768, "dur": 6, "args": { "External id": 24245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24245, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24245, "pid": 5, "tid": 7, "ts": 1716454217657768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626296, "dur": 5, "args": { "External id": 24245, "cbid": 211, "correlation": 24245 } }, { "ph": "s", "id": 24245, "pid": 76337, "tid": -914061504, "ts": 1716454217626296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217657775, "dur": 6, "args": { "External id": 24251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24251, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24251, "pid": 5, "tid": 7, "ts": 1716454217657775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626325, "dur": 8, "args": { "External id": 24251, "cbid": 211, "correlation": 24251 } }, { "ph": "s", "id": 24251, "pid": 76337, "tid": -914061504, "ts": 1716454217626325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217657783, "dur": 3, "args": { "External id": 24259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24259, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 24259, "pid": 5, "tid": 7, "ts": 1716454217657783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626370, "dur": 9, "args": { "External id": 24259, "cbid": 211, "correlation": 24259 } }, { "ph": "s", "id": 24259, "pid": 76337, "tid": -914061504, "ts": 1716454217626370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217626449, "dur": 1, "args": { "External id": 24275, "cbid": 251, "correlation": 24275 } }, { "ph": "f", "id": 24275, "pid": 76337, "tid": -914061504, "ts": 1716454217626449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217626454, "dur": 0, "args": { "External id": 24277, "cbid": 251, "correlation": 24277 } }, { "ph": "f", "id": 24277, "pid": 76337, "tid": -914061504, "ts": 1716454217626454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217657787, "dur": 11, "args": { "External id": 24278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24278, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24278, "pid": 5, "tid": 7, "ts": 1716454217657787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626456, "dur": 13, "args": { "External id": 24278, "cbid": 211, "correlation": 24278 } }, { "ph": "s", "id": 24278, "pid": 76337, "tid": -914061504, "ts": 1716454217626456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217657799, "dur": 4, "args": { "External id": 24280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24280, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24280, "pid": 5, "tid": 7, "ts": 1716454217657799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626470, "dur": 5, "args": { "External id": 24280, "cbid": 211, "correlation": 24280 } }, { "ph": "s", "id": 24280, "pid": 76337, "tid": -914061504, "ts": 1716454217626470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217657804, "dur": 6, "args": { "External id": 24290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24290, "pid": 5, "tid": 7, "ts": 1716454217657804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626528, "dur": 12, "args": { "External id": 24290, "cbid": 211, "correlation": 24290 } }, { "ph": "s", "id": 24290, "pid": 76337, "tid": -914061504, "ts": 1716454217626528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217657811, "dur": 9, "args": { "External id": 24310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24310, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24310, "pid": 5, "tid": 7, "ts": 1716454217657811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626595, "dur": 11, "args": { "External id": 24310, "cbid": 211, "correlation": 24310 } }, { "ph": "s", "id": 24310, "pid": 76337, "tid": -914061504, "ts": 1716454217626595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217657822, "dur": 4, "args": { "External id": 24322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24322, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24322, "pid": 5, "tid": 7, "ts": 1716454217657822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626616, "dur": 6, "args": { "External id": 24322, "cbid": 211, "correlation": 24322 } }, { "ph": "s", "id": 24322, "pid": 76337, "tid": -914061504, "ts": 1716454217626616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217657827, "dur": 7, "args": { "External id": 24325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24325, "pid": 5, "tid": 7, "ts": 1716454217657827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626634, "dur": 6, "args": { "External id": 24325, "cbid": 211, "correlation": 24325 } }, { "ph": "s", "id": 24325, "pid": 76337, "tid": -914061504, "ts": 1716454217626634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217657834, "dur": 4, "args": { "External id": 24334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24334, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24334, "pid": 5, "tid": 7, "ts": 1716454217657834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626675, "dur": 10, "args": { "External id": 24334, "cbid": 211, "correlation": 24334 } }, { "ph": "s", "id": 24334, "pid": 76337, "tid": -914061504, "ts": 1716454217626675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217626751, "dur": 0, "args": { "External id": 24344, "cbid": 317, "correlation": 24344 } }, { "ph": "f", "id": 24344, "pid": 76337, "tid": -914061504, "ts": 1716454217626751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217626751, "dur": 0, "args": { "External id": 24345, "cbid": 203, "correlation": 24345 } }, { "ph": "f", "id": 24345, "pid": 76337, "tid": -914061504, "ts": 1716454217626751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217626752, "dur": 0, "args": { "External id": 24346, "cbid": 205, "correlation": 24346 } }, { "ph": "f", "id": 24346, "pid": 76337, "tid": -914061504, "ts": 1716454217626752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657840, "dur": 5, "args": { "External id": 24350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24350, "pid": 5, "tid": 7, "ts": 1716454217657840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626767, "dur": 13, "args": { "External id": 24350, "cbid": 211, "correlation": 24350 } }, { "ph": "s", "id": 24350, "pid": 76337, "tid": -914061504, "ts": 1716454217626767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217657846, "dur": 159, "args": { "External id": 24352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24352, "pid": 5, "tid": 7, "ts": 1716454217657846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626782, "dur": 5, "args": { "External id": 24352, "cbid": 211, "correlation": 24352 } }, { "ph": "s", "id": 24352, "pid": 76337, "tid": -914061504, "ts": 1716454217626782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217658008, "dur": 1, "args": { "External id": 24354, "device": 5, "context": 1, "stream": 7, "correlation": 24354, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 24354, "pid": 5, "tid": 7, "ts": 1716454217658008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217626792, "dur": 6, "args": { "External id": 24354, "cbid": 51, "correlation": 24354 } }, { "ph": "s", "id": 24354, "pid": 76337, "tid": -914061504, "ts": 1716454217626792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217658012, "dur": 253, "args": { "External id": 24355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24355, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24355, "pid": 5, "tid": 7, "ts": 1716454217658012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626800, "dur": 6, "args": { "External id": 24355, "cbid": 211, "correlation": 24355 } }, { "ph": "s", "id": 24355, "pid": 76337, "tid": -914061504, "ts": 1716454217626800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217658266, "dur": 6, "args": { "External id": 24357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24357, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24357, "pid": 5, "tid": 7, "ts": 1716454217658266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626809, "dur": 5, "args": { "External id": 24357, "cbid": 211, "correlation": 24357 } }, { "ph": "s", "id": 24357, "pid": 76337, "tid": -914061504, "ts": 1716454217626809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217658273, "dur": 6, "args": { "External id": 24363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24363, "pid": 5, "tid": 7, "ts": 1716454217658273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626838, "dur": 8, "args": { "External id": 24363, "cbid": 211, "correlation": 24363 } }, { "ph": "s", "id": 24363, "pid": 76337, "tid": -914061504, "ts": 1716454217626838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217658281, "dur": 5, "args": { "External id": 24371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24371, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24371, "pid": 5, "tid": 7, "ts": 1716454217658281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626871, "dur": 8, "args": { "External id": 24371, "cbid": 211, "correlation": 24371 } }, { "ph": "s", "id": 24371, "pid": 76337, "tid": -914061504, "ts": 1716454217626871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217658287, "dur": 4, "args": { "External id": 24379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24379, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24379, "pid": 5, "tid": 7, "ts": 1716454217658287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217626900, "dur": 8, "args": { "External id": 24379, "cbid": 211, "correlation": 24379 } }, { "ph": "s", "id": 24379, "pid": 76337, "tid": -914061504, "ts": 1716454217626900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217658293, "dur": 9, "args": { "External id": 24399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24399, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24399, "pid": 5, "tid": 7, "ts": 1716454217658293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627057, "dur": 15, "args": { "External id": 24399, "cbid": 211, "correlation": 24399 } }, { "ph": "s", "id": 24399, "pid": 76337, "tid": -914061504, "ts": 1716454217627057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217658304, "dur": 4, "args": { "External id": 24411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24411, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24411, "pid": 5, "tid": 7, "ts": 1716454217658304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627082, "dur": 7, "args": { "External id": 24411, "cbid": 211, "correlation": 24411 } }, { "ph": "s", "id": 24411, "pid": 76337, "tid": -914061504, "ts": 1716454217627082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217658309, "dur": 6, "args": { "External id": 24414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24414, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24414, "pid": 5, "tid": 7, "ts": 1716454217658309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627102, "dur": 6, "args": { "External id": 24414, "cbid": 211, "correlation": 24414 } }, { "ph": "s", "id": 24414, "pid": 76337, "tid": -914061504, "ts": 1716454217627102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217627172, "dur": 0, "args": { "External id": 24425, "cbid": 317, "correlation": 24425 } }, { "ph": "f", "id": 24425, "pid": 76337, "tid": -914061504, "ts": 1716454217627172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217627173, "dur": 0, "args": { "External id": 24426, "cbid": 203, "correlation": 24426 } }, { "ph": "f", "id": 24426, "pid": 76337, "tid": -914061504, "ts": 1716454217627173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217627173, "dur": 0, "args": { "External id": 24427, "cbid": 205, "correlation": 24427 } }, { "ph": "f", "id": 24427, "pid": 76337, "tid": -914061504, "ts": 1716454217627173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217658316, "dur": 5, "args": { "External id": 24431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24431, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24431, "pid": 5, "tid": 7, "ts": 1716454217658316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627192, "dur": 13, "args": { "External id": 24431, "cbid": 211, "correlation": 24431 } }, { "ph": "s", "id": 24431, "pid": 76337, "tid": -914061504, "ts": 1716454217627192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217658323, "dur": 37, "args": { "External id": 24433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24433, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 24433, "pid": 5, "tid": 7, "ts": 1716454217658323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627212, "dur": 24, "args": { "External id": 24433, "cbid": 211, "correlation": 24433 } }, { "ph": "s", "id": 24433, "pid": 76337, "tid": -914061504, "ts": 1716454217627212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217658361, "dur": 5, "args": { "External id": 24435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24435, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24435, "pid": 5, "tid": 7, "ts": 1716454217658361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627241, "dur": 6, "args": { "External id": 24435, "cbid": 211, "correlation": 24435 } }, { "ph": "s", "id": 24435, "pid": 76337, "tid": -914061504, "ts": 1716454217627241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217658367, "dur": 6, "args": { "External id": 24441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24441, "pid": 5, "tid": 7, "ts": 1716454217658367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627270, "dur": 9, "args": { "External id": 24441, "cbid": 211, "correlation": 24441 } }, { "ph": "s", "id": 24441, "pid": 76337, "tid": -914061504, "ts": 1716454217627270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217658374, "dur": 20, "args": { "External id": 24450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24450, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24450, "pid": 5, "tid": 7, "ts": 1716454217658374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627358, "dur": 14, "args": { "External id": 24450, "cbid": 211, "correlation": 24450 } }, { "ph": "s", "id": 24450, "pid": 76337, "tid": -914061504, "ts": 1716454217627358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217658395, "dur": 10, "args": { "External id": 24472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24472, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 24472, "pid": 5, "tid": 7, "ts": 1716454217658395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627416, "dur": 10, "args": { "External id": 24472, "cbid": 211, "correlation": 24472 } }, { "ph": "s", "id": 24472, "pid": 76337, "tid": -914061504, "ts": 1716454217627416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627525, "dur": 108, "args": { "External id": 24483, "cbid": 251, "correlation": 24483 } }, { "ph": "f", "id": 24483, "pid": 76337, "tid": -914061504, "ts": 1716454217627525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627638, "dur": 0, "args": { "External id": 24484, "cbid": 251, "correlation": 24484 } }, { "ph": "f", "id": 24484, "pid": 76337, "tid": -914061504, "ts": 1716454217627638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217658413, "dur": 50, "args": { "External id": 24485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24485, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 24485, "pid": 5, "tid": 7, "ts": 1716454217658413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627641, "dur": 21, "args": { "External id": 24485, "cbid": 211, "correlation": 24485 } }, { "ph": "s", "id": 24485, "pid": 76337, "tid": -914061504, "ts": 1716454217627641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627736, "dur": 1, "args": { "External id": 24496, "cbid": 251, "correlation": 24496 } }, { "ph": "f", "id": 24496, "pid": 76337, "tid": -914061504, "ts": 1716454217627736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627740, "dur": 0, "args": { "External id": 24497, "cbid": 251, "correlation": 24497 } }, { "ph": "f", "id": 24497, "pid": 76337, "tid": -914061504, "ts": 1716454217627740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217658465, "dur": 52, "args": { "External id": 24498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24498, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 24498, "pid": 5, "tid": 7, "ts": 1716454217658465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627742, "dur": 13, "args": { "External id": 24498, "cbid": 211, "correlation": 24498 } }, { "ph": "s", "id": 24498, "pid": 76337, "tid": -914061504, "ts": 1716454217627742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627822, "dur": 1, "args": { "External id": 24509, "cbid": 251, "correlation": 24509 } }, { "ph": "f", "id": 24509, "pid": 76337, "tid": -914061504, "ts": 1716454217627822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217627826, "dur": 0, "args": { "External id": 24510, "cbid": 251, "correlation": 24510 } }, { "ph": "f", "id": 24510, "pid": 76337, "tid": -914061504, "ts": 1716454217627826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217658518, "dur": 52, "args": { "External id": 24511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24511, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 24511, "pid": 5, "tid": 7, "ts": 1716454217658518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627828, "dur": 12, "args": { "External id": 24511, "cbid": 211, "correlation": 24511 } }, { "ph": "s", "id": 24511, "pid": 76337, "tid": -914061504, "ts": 1716454217627828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217658571, "dur": 55, "args": { "External id": 24536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24536, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24536, "pid": 5, "tid": 7, "ts": 1716454217658571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217627916, "dur": 14, "args": { "External id": 24536, "cbid": 211, "correlation": 24536 } }, { "ph": "s", "id": 24536, "pid": 76337, "tid": -914061504, "ts": 1716454217627916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628044, "dur": 1, "args": { "External id": 24554, "cbid": 251, "correlation": 24554 } }, { "ph": "f", "id": 24554, "pid": 76337, "tid": -914061504, "ts": 1716454217628044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217658627, "dur": 61, "args": { "External id": 24556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24556, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 24556, "pid": 5, "tid": 7, "ts": 1716454217658627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628050, "dur": 14, "args": { "External id": 24556, "cbid": 211, "correlation": 24556 } }, { "ph": "s", "id": 24556, "pid": 76337, "tid": -914061504, "ts": 1716454217628050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217658690, "dur": 6, "args": { "External id": 24564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24564, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24564, "pid": 5, "tid": 7, "ts": 1716454217658690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628124, "dur": 12, "args": { "External id": 24564, "cbid": 211, "correlation": 24564 } }, { "ph": "s", "id": 24564, "pid": 76337, "tid": -914061504, "ts": 1716454217628124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217658697, "dur": 7, "args": { "External id": 24572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24572, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24572, "pid": 5, "tid": 7, "ts": 1716454217658697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628162, "dur": 9, "args": { "External id": 24572, "cbid": 211, "correlation": 24572 } }, { "ph": "s", "id": 24572, "pid": 76337, "tid": -914061504, "ts": 1716454217628162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217658705, "dur": 8, "args": { "External id": 24583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24583, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24583, "pid": 5, "tid": 7, "ts": 1716454217658705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628238, "dur": 13, "args": { "External id": 24583, "cbid": 211, "correlation": 24583 } }, { "ph": "s", "id": 24583, "pid": 76337, "tid": -914061504, "ts": 1716454217628238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217658714, "dur": 10, "args": { "External id": 24605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24605, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 24605, "pid": 5, "tid": 7, "ts": 1716454217658714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628271, "dur": 8, "args": { "External id": 24605, "cbid": 211, "correlation": 24605 } }, { "ph": "s", "id": 24605, "pid": 76337, "tid": -914061504, "ts": 1716454217628271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628367, "dur": 2, "args": { "External id": 24616, "cbid": 251, "correlation": 24616 } }, { "ph": "f", "id": 24616, "pid": 76337, "tid": -914061504, "ts": 1716454217628367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217658727, "dur": 1, "args": { "External id": 24617, "device": 5, "context": 1, "stream": 7, "correlation": 24617, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 24617, "pid": 5, "tid": 7, "ts": 1716454217658727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217628373, "dur": 10, "args": { "External id": 24617, "cbid": 51, "correlation": 24617 } }, { "ph": "s", "id": 24617, "pid": 76337, "tid": -914061504, "ts": 1716454217628373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217658730, "dur": 36, "args": { "External id": 24618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24618, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 24618, "pid": 5, "tid": 7, "ts": 1716454217658730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628386, "dur": 13, "args": { "External id": 24618, "cbid": 211, "correlation": 24618 } }, { "ph": "s", "id": 24618, "pid": 76337, "tid": -914061504, "ts": 1716454217628386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628473, "dur": 1, "args": { "External id": 24629, "cbid": 251, "correlation": 24629 } }, { "ph": "f", "id": 24629, "pid": 76337, "tid": -914061504, "ts": 1716454217628473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628478, "dur": 0, "args": { "External id": 24630, "cbid": 251, "correlation": 24630 } }, { "ph": "f", "id": 24630, "pid": 76337, "tid": -914061504, "ts": 1716454217628478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217658768, "dur": 11, "args": { "External id": 24631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24631, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24631, "pid": 5, "tid": 7, "ts": 1716454217658768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628480, "dur": 14, "args": { "External id": 24631, "cbid": 211, "correlation": 24631 } }, { "ph": "s", "id": 24631, "pid": 76337, "tid": -914061504, "ts": 1716454217628480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217658780, "dur": 5, "args": { "External id": 24633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24633, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24633, "pid": 5, "tid": 7, "ts": 1716454217658780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628496, "dur": 7, "args": { "External id": 24633, "cbid": 211, "correlation": 24633 } }, { "ph": "s", "id": 24633, "pid": 76337, "tid": -914061504, "ts": 1716454217628496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628568, "dur": 1, "args": { "External id": 24644, "cbid": 251, "correlation": 24644 } }, { "ph": "f", "id": 24644, "pid": 76337, "tid": -914061504, "ts": 1716454217628568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628572, "dur": 0, "args": { "External id": 24645, "cbid": 251, "correlation": 24645 } }, { "ph": "f", "id": 24645, "pid": 76337, "tid": -914061504, "ts": 1716454217628572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217658787, "dur": 8, "args": { "External id": 24646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24646, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24646, "pid": 5, "tid": 7, "ts": 1716454217658787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628573, "dur": 13, "args": { "External id": 24646, "cbid": 211, "correlation": 24646 } }, { "ph": "s", "id": 24646, "pid": 76337, "tid": -914061504, "ts": 1716454217628573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217658797, "dur": 3, "args": { "External id": 24648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24648, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24648, "pid": 5, "tid": 7, "ts": 1716454217658797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628588, "dur": 6, "args": { "External id": 24648, "cbid": 211, "correlation": 24648 } }, { "ph": "s", "id": 24648, "pid": 76337, "tid": -914061504, "ts": 1716454217628588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217658801, "dur": 17, "args": { "External id": 24673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24673, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 24673, "pid": 5, "tid": 7, "ts": 1716454217658801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628668, "dur": 12, "args": { "External id": 24673, "cbid": 211, "correlation": 24673 } }, { "ph": "s", "id": 24673, "pid": 76337, "tid": -914061504, "ts": 1716454217628668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217628782, "dur": 2, "args": { "External id": 24691, "cbid": 251, "correlation": 24691 } }, { "ph": "f", "id": 24691, "pid": 76337, "tid": -914061504, "ts": 1716454217628782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217658820, "dur": 1, "args": { "External id": 24693, "device": 5, "context": 1, "stream": 7, "correlation": 24693, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 24693, "pid": 5, "tid": 7, "ts": 1716454217658820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217628788, "dur": 10, "args": { "External id": 24693, "cbid": 51, "correlation": 24693 } }, { "ph": "s", "id": 24693, "pid": 76337, "tid": -914061504, "ts": 1716454217628788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217658824, "dur": 36, "args": { "External id": 24694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24694, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 24694, "pid": 5, "tid": 7, "ts": 1716454217658824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628800, "dur": 13, "args": { "External id": 24694, "cbid": 211, "correlation": 24694 } }, { "ph": "s", "id": 24694, "pid": 76337, "tid": -914061504, "ts": 1716454217628800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217658861, "dur": 4, "args": { "External id": 24702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24702, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24702, "pid": 5, "tid": 7, "ts": 1716454217658861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628873, "dur": 12, "args": { "External id": 24702, "cbid": 211, "correlation": 24702 } }, { "ph": "s", "id": 24702, "pid": 76337, "tid": -914061504, "ts": 1716454217628873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217658867, "dur": 8, "args": { "External id": 24710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24710, "pid": 5, "tid": 7, "ts": 1716454217658867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628916, "dur": 9, "args": { "External id": 24710, "cbid": 211, "correlation": 24710 } }, { "ph": "s", "id": 24710, "pid": 76337, "tid": -914061504, "ts": 1716454217628916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217658876, "dur": 9, "args": { "External id": 24732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24732, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 24732, "pid": 5, "tid": 7, "ts": 1716454217658876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217628968, "dur": 18, "args": { "External id": 24732, "cbid": 211, "correlation": 24732 } }, { "ph": "s", "id": 24732, "pid": 76337, "tid": -914061504, "ts": 1716454217628968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217629071, "dur": 1, "args": { "External id": 24748, "cbid": 251, "correlation": 24748 } }, { "ph": "f", "id": 24748, "pid": 76337, "tid": -914061504, "ts": 1716454217629071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217629076, "dur": 0, "args": { "External id": 24750, "cbid": 251, "correlation": 24750 } }, { "ph": "f", "id": 24750, "pid": 76337, "tid": -914061504, "ts": 1716454217629076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217658887, "dur": 188, "args": { "External id": 24751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24751, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24751, "pid": 5, "tid": 7, "ts": 1716454217658887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629078, "dur": 14, "args": { "External id": 24751, "cbid": 211, "correlation": 24751 } }, { "ph": "s", "id": 24751, "pid": 76337, "tid": -914061504, "ts": 1716454217629078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659076, "dur": 21, "args": { "External id": 24759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24759, "pid": 5, "tid": 7, "ts": 1716454217659076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629148, "dur": 12, "args": { "External id": 24759, "cbid": 211, "correlation": 24759 } }, { "ph": "s", "id": 24759, "pid": 76337, "tid": -914061504, "ts": 1716454217629148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659098, "dur": 21, "args": { "External id": 24767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24767, "pid": 5, "tid": 7, "ts": 1716454217659098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629182, "dur": 8, "args": { "External id": 24767, "cbid": 211, "correlation": 24767 } }, { "ph": "s", "id": 24767, "pid": 76337, "tid": -914061504, "ts": 1716454217629182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217629264, "dur": 1, "args": { "External id": 24783, "cbid": 251, "correlation": 24783 } }, { "ph": "f", "id": 24783, "pid": 76337, "tid": -914061504, "ts": 1716454217629264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217659121, "dur": 1, "args": { "External id": 24785, "device": 5, "context": 1, "stream": 7, "correlation": 24785, "bytes": 120, "memory bandwidth (GB/s)": 0.07807417046193885 } }, { "ph": "f", "id": 24785, "pid": 5, "tid": 7, "ts": 1716454217659121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217629268, "dur": 9, "args": { "External id": 24785, "cbid": 51, "correlation": 24785 } }, { "ph": "s", "id": 24785, "pid": 76337, "tid": -914061504, "ts": 1716454217629268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217659125, "dur": 109, "args": { "External id": 24786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24786, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 24786, "pid": 5, "tid": 7, "ts": 1716454217659125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629279, "dur": 12, "args": { "External id": 24786, "cbid": 211, "correlation": 24786 } }, { "ph": "s", "id": 24786, "pid": 76337, "tid": -914061504, "ts": 1716454217629279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217659236, "dur": 5, "args": { "External id": 24794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24794, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24794, "pid": 5, "tid": 7, "ts": 1716454217659236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629322, "dur": 10, "args": { "External id": 24794, "cbid": 211, "correlation": 24794 } }, { "ph": "s", "id": 24794, "pid": 76337, "tid": -914061504, "ts": 1716454217629322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659242, "dur": 9, "args": { "External id": 24805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24805, "pid": 5, "tid": 7, "ts": 1716454217659242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629392, "dur": 12, "args": { "External id": 24805, "cbid": 211, "correlation": 24805 } }, { "ph": "s", "id": 24805, "pid": 76337, "tid": -914061504, "ts": 1716454217629392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217629457, "dur": 0, "args": { "External id": 24817, "cbid": 317, "correlation": 24817 } }, { "ph": "f", "id": 24817, "pid": 76337, "tid": -914061504, "ts": 1716454217629457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217629458, "dur": 0, "args": { "External id": 24818, "cbid": 203, "correlation": 24818 } }, { "ph": "f", "id": 24818, "pid": 76337, "tid": -914061504, "ts": 1716454217629458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217629459, "dur": 0, "args": { "External id": 24819, "cbid": 205, "correlation": 24819 } }, { "ph": "f", "id": 24819, "pid": 76337, "tid": -914061504, "ts": 1716454217629459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659252, "dur": 5, "args": { "External id": 24823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24823, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24823, "pid": 5, "tid": 7, "ts": 1716454217659252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629474, "dur": 13, "args": { "External id": 24823, "cbid": 211, "correlation": 24823 } }, { "ph": "s", "id": 24823, "pid": 76337, "tid": -914061504, "ts": 1716454217629474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217659259, "dur": 36, "args": { "External id": 24825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24825, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 24825, "pid": 5, "tid": 7, "ts": 1716454217659259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629495, "dur": 7, "args": { "External id": 24825, "cbid": 211, "correlation": 24825 } }, { "ph": "s", "id": 24825, "pid": 76337, "tid": -914061504, "ts": 1716454217629495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659296, "dur": 6, "args": { "External id": 24827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24827, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24827, "pid": 5, "tid": 7, "ts": 1716454217659296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629506, "dur": 5, "args": { "External id": 24827, "cbid": 211, "correlation": 24827 } }, { "ph": "s", "id": 24827, "pid": 76337, "tid": -914061504, "ts": 1716454217629506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659304, "dur": 7, "args": { "External id": 24833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24833, "pid": 5, "tid": 7, "ts": 1716454217659304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629533, "dur": 9, "args": { "External id": 24833, "cbid": 211, "correlation": 24833 } }, { "ph": "s", "id": 24833, "pid": 76337, "tid": -914061504, "ts": 1716454217629533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217659312, "dur": 5, "args": { "External id": 24841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24841, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24841, "pid": 5, "tid": 7, "ts": 1716454217659312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629565, "dur": 8, "args": { "External id": 24841, "cbid": 211, "correlation": 24841 } }, { "ph": "s", "id": 24841, "pid": 76337, "tid": -914061504, "ts": 1716454217629565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217659318, "dur": 11, "args": { "External id": 24861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24861, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24861, "pid": 5, "tid": 7, "ts": 1716454217659318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629639, "dur": 12, "args": { "External id": 24861, "cbid": 211, "correlation": 24861 } }, { "ph": "s", "id": 24861, "pid": 76337, "tid": -914061504, "ts": 1716454217629639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217659330, "dur": 4, "args": { "External id": 24873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24873, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24873, "pid": 5, "tid": 7, "ts": 1716454217659330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629661, "dur": 6, "args": { "External id": 24873, "cbid": 211, "correlation": 24873 } }, { "ph": "s", "id": 24873, "pid": 76337, "tid": -914061504, "ts": 1716454217629661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659336, "dur": 9, "args": { "External id": 24876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24876, "pid": 5, "tid": 7, "ts": 1716454217659336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629679, "dur": 7, "args": { "External id": 24876, "cbid": 211, "correlation": 24876 } }, { "ph": "s", "id": 24876, "pid": 76337, "tid": -914061504, "ts": 1716454217629679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217659345, "dur": 5, "args": { "External id": 24885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24885, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24885, "pid": 5, "tid": 7, "ts": 1716454217659345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629719, "dur": 10, "args": { "External id": 24885, "cbid": 211, "correlation": 24885 } }, { "ph": "s", "id": 24885, "pid": 76337, "tid": -914061504, "ts": 1716454217629719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217629785, "dur": 0, "args": { "External id": 24895, "cbid": 317, "correlation": 24895 } }, { "ph": "f", "id": 24895, "pid": 76337, "tid": -914061504, "ts": 1716454217629785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217629785, "dur": 0, "args": { "External id": 24896, "cbid": 203, "correlation": 24896 } }, { "ph": "f", "id": 24896, "pid": 76337, "tid": -914061504, "ts": 1716454217629785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217629786, "dur": 0, "args": { "External id": 24897, "cbid": 205, "correlation": 24897 } }, { "ph": "f", "id": 24897, "pid": 76337, "tid": -914061504, "ts": 1716454217629786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659352, "dur": 5, "args": { "External id": 24901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24901, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24901, "pid": 5, "tid": 7, "ts": 1716454217659352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629800, "dur": 13, "args": { "External id": 24901, "cbid": 211, "correlation": 24901 } }, { "ph": "s", "id": 24901, "pid": 76337, "tid": -914061504, "ts": 1716454217629800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659358, "dur": 160, "args": { "External id": 24903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24903, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24903, "pid": 5, "tid": 7, "ts": 1716454217659358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629816, "dur": 5, "args": { "External id": 24903, "cbid": 211, "correlation": 24903 } }, { "ph": "s", "id": 24903, "pid": 76337, "tid": -914061504, "ts": 1716454217629816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217659520, "dur": 1, "args": { "External id": 24905, "device": 5, "context": 1, "stream": 7, "correlation": 24905, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 24905, "pid": 5, "tid": 7, "ts": 1716454217659520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217629827, "dur": 6, "args": { "External id": 24905, "cbid": 51, "correlation": 24905 } }, { "ph": "s", "id": 24905, "pid": 76337, "tid": -914061504, "ts": 1716454217629827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217659523, "dur": 265, "args": { "External id": 24906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24906, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24906, "pid": 5, "tid": 7, "ts": 1716454217659523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629834, "dur": 6, "args": { "External id": 24906, "cbid": 211, "correlation": 24906 } }, { "ph": "s", "id": 24906, "pid": 76337, "tid": -914061504, "ts": 1716454217629834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659789, "dur": 6, "args": { "External id": 24908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24908, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 24908, "pid": 5, "tid": 7, "ts": 1716454217659789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629844, "dur": 5, "args": { "External id": 24908, "cbid": 211, "correlation": 24908 } }, { "ph": "s", "id": 24908, "pid": 76337, "tid": -914061504, "ts": 1716454217629844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659796, "dur": 6, "args": { "External id": 24914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24914, "pid": 5, "tid": 7, "ts": 1716454217659796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629872, "dur": 9, "args": { "External id": 24914, "cbid": 211, "correlation": 24914 } }, { "ph": "s", "id": 24914, "pid": 76337, "tid": -914061504, "ts": 1716454217629872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217659804, "dur": 3, "args": { "External id": 24922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24922, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 24922, "pid": 5, "tid": 7, "ts": 1716454217659804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629918, "dur": 9, "args": { "External id": 24922, "cbid": 211, "correlation": 24922 } }, { "ph": "s", "id": 24922, "pid": 76337, "tid": -914061504, "ts": 1716454217629918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217629992, "dur": 1, "args": { "External id": 24938, "cbid": 251, "correlation": 24938 } }, { "ph": "f", "id": 24938, "pid": 76337, "tid": -914061504, "ts": 1716454217629992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217629997, "dur": 0, "args": { "External id": 24940, "cbid": 251, "correlation": 24940 } }, { "ph": "f", "id": 24940, "pid": 76337, "tid": -914061504, "ts": 1716454217629997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217659808, "dur": 12, "args": { "External id": 24941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24941, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24941, "pid": 5, "tid": 7, "ts": 1716454217659808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217629999, "dur": 13, "args": { "External id": 24941, "cbid": 211, "correlation": 24941 } }, { "ph": "s", "id": 24941, "pid": 76337, "tid": -914061504, "ts": 1716454217629999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217659822, "dur": 5, "args": { "External id": 24943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24943, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 24943, "pid": 5, "tid": 7, "ts": 1716454217659822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630014, "dur": 5, "args": { "External id": 24943, "cbid": 211, "correlation": 24943 } }, { "ph": "s", "id": 24943, "pid": 76337, "tid": -914061504, "ts": 1716454217630014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659828, "dur": 6, "args": { "External id": 24953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24953, "pid": 5, "tid": 7, "ts": 1716454217659828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630073, "dur": 12, "args": { "External id": 24953, "cbid": 211, "correlation": 24953 } }, { "ph": "s", "id": 24953, "pid": 76337, "tid": -914061504, "ts": 1716454217630073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217659835, "dur": 10, "args": { "External id": 24973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24973, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 24973, "pid": 5, "tid": 7, "ts": 1716454217659835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630139, "dur": 10, "args": { "External id": 24973, "cbid": 211, "correlation": 24973 } }, { "ph": "s", "id": 24973, "pid": 76337, "tid": -914061504, "ts": 1716454217630139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217659846, "dur": 4, "args": { "External id": 24985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24985, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 24985, "pid": 5, "tid": 7, "ts": 1716454217659846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630160, "dur": 6, "args": { "External id": 24985, "cbid": 211, "correlation": 24985 } }, { "ph": "s", "id": 24985, "pid": 76337, "tid": -914061504, "ts": 1716454217630160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217659851, "dur": 7, "args": { "External id": 24988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24988, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24988, "pid": 5, "tid": 7, "ts": 1716454217659851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630178, "dur": 7, "args": { "External id": 24988, "cbid": 211, "correlation": 24988 } }, { "ph": "s", "id": 24988, "pid": 76337, "tid": -914061504, "ts": 1716454217630178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217659859, "dur": 4, "args": { "External id": 24997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 24997, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 24997, "pid": 5, "tid": 7, "ts": 1716454217659859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630218, "dur": 10, "args": { "External id": 24997, "cbid": 211, "correlation": 24997 } }, { "ph": "s", "id": 24997, "pid": 76337, "tid": -914061504, "ts": 1716454217630218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217630282, "dur": 0, "args": { "External id": 25007, "cbid": 317, "correlation": 25007 } }, { "ph": "f", "id": 25007, "pid": 76337, "tid": -914061504, "ts": 1716454217630282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217630283, "dur": 0, "args": { "External id": 25008, "cbid": 203, "correlation": 25008 } }, { "ph": "f", "id": 25008, "pid": 76337, "tid": -914061504, "ts": 1716454217630283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217630284, "dur": 0, "args": { "External id": 25009, "cbid": 205, "correlation": 25009 } }, { "ph": "f", "id": 25009, "pid": 76337, "tid": -914061504, "ts": 1716454217630284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659865, "dur": 5, "args": { "External id": 25013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25013, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25013, "pid": 5, "tid": 7, "ts": 1716454217659865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630297, "dur": 12, "args": { "External id": 25013, "cbid": 211, "correlation": 25013 } }, { "ph": "s", "id": 25013, "pid": 76337, "tid": -914061504, "ts": 1716454217630297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217659871, "dur": 160, "args": { "External id": 25015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25015, "pid": 5, "tid": 7, "ts": 1716454217659871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630311, "dur": 5, "args": { "External id": 25015, "cbid": 211, "correlation": 25015 } }, { "ph": "s", "id": 25015, "pid": 76337, "tid": -914061504, "ts": 1716454217630311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217660033, "dur": 1, "args": { "External id": 25017, "device": 5, "context": 1, "stream": 7, "correlation": 25017, "bytes": 240, "memory bandwidth (GB/s)": 0.13157894736842105 } }, { "ph": "f", "id": 25017, "pid": 5, "tid": 7, "ts": 1716454217660033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217630321, "dur": 6, "args": { "External id": 25017, "cbid": 51, "correlation": 25017 } }, { "ph": "s", "id": 25017, "pid": 76337, "tid": -914061504, "ts": 1716454217630321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217660037, "dur": 254, "args": { "External id": 25018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25018, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25018, "pid": 5, "tid": 7, "ts": 1716454217660037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630329, "dur": 7, "args": { "External id": 25018, "cbid": 211, "correlation": 25018 } }, { "ph": "s", "id": 25018, "pid": 76337, "tid": -914061504, "ts": 1716454217630329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217660293, "dur": 6, "args": { "External id": 25020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25020, "pid": 5, "tid": 7, "ts": 1716454217660293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630339, "dur": 5, "args": { "External id": 25020, "cbid": 211, "correlation": 25020 } }, { "ph": "s", "id": 25020, "pid": 76337, "tid": -914061504, "ts": 1716454217630339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217660300, "dur": 6, "args": { "External id": 25026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25026, "pid": 5, "tid": 7, "ts": 1716454217660300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630368, "dur": 8, "args": { "External id": 25026, "cbid": 211, "correlation": 25026 } }, { "ph": "s", "id": 25026, "pid": 76337, "tid": -914061504, "ts": 1716454217630368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217660308, "dur": 4, "args": { "External id": 25034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25034, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25034, "pid": 5, "tid": 7, "ts": 1716454217660308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630400, "dur": 8, "args": { "External id": 25034, "cbid": 211, "correlation": 25034 } }, { "ph": "s", "id": 25034, "pid": 76337, "tid": -914061504, "ts": 1716454217630400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217660313, "dur": 4, "args": { "External id": 25042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25042, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25042, "pid": 5, "tid": 7, "ts": 1716454217660313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630431, "dur": 9, "args": { "External id": 25042, "cbid": 211, "correlation": 25042 } }, { "ph": "s", "id": 25042, "pid": 76337, "tid": -914061504, "ts": 1716454217630431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217660319, "dur": 11, "args": { "External id": 25051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25051, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25051, "pid": 5, "tid": 7, "ts": 1716454217660319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630559, "dur": 15, "args": { "External id": 25051, "cbid": 211, "correlation": 25051 } }, { "ph": "s", "id": 25051, "pid": 76337, "tid": -914061504, "ts": 1716454217630559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217660332, "dur": 12, "args": { "External id": 25071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25071, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25071, "pid": 5, "tid": 7, "ts": 1716454217660332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630640, "dur": 13, "args": { "External id": 25071, "cbid": 211, "correlation": 25071 } }, { "ph": "s", "id": 25071, "pid": 76337, "tid": -914061504, "ts": 1716454217630640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217660345, "dur": 4, "args": { "External id": 25083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25083, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25083, "pid": 5, "tid": 7, "ts": 1716454217660345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630663, "dur": 6, "args": { "External id": 25083, "cbid": 211, "correlation": 25083 } }, { "ph": "s", "id": 25083, "pid": 76337, "tid": -914061504, "ts": 1716454217630663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217660351, "dur": 10, "args": { "External id": 25086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25086, "pid": 5, "tid": 7, "ts": 1716454217660351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630682, "dur": 7, "args": { "External id": 25086, "cbid": 211, "correlation": 25086 } }, { "ph": "s", "id": 25086, "pid": 76337, "tid": -914061504, "ts": 1716454217630682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217660362, "dur": 6, "args": { "External id": 25095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25095, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25095, "pid": 5, "tid": 7, "ts": 1716454217660362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630723, "dur": 10, "args": { "External id": 25095, "cbid": 211, "correlation": 25095 } }, { "ph": "s", "id": 25095, "pid": 76337, "tid": -914061504, "ts": 1716454217630723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217630789, "dur": 0, "args": { "External id": 25105, "cbid": 317, "correlation": 25105 } }, { "ph": "f", "id": 25105, "pid": 76337, "tid": -914061504, "ts": 1716454217630789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217630790, "dur": 0, "args": { "External id": 25106, "cbid": 203, "correlation": 25106 } }, { "ph": "f", "id": 25106, "pid": 76337, "tid": -914061504, "ts": 1716454217630790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217630791, "dur": 0, "args": { "External id": 25107, "cbid": 205, "correlation": 25107 } }, { "ph": "f", "id": 25107, "pid": 76337, "tid": -914061504, "ts": 1716454217630791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217660370, "dur": 6, "args": { "External id": 25111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25111, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25111, "pid": 5, "tid": 7, "ts": 1716454217660370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630808, "dur": 13, "args": { "External id": 25111, "cbid": 211, "correlation": 25111 } }, { "ph": "s", "id": 25111, "pid": 76337, "tid": -914061504, "ts": 1716454217630808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217660377, "dur": 315, "args": { "External id": 25113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25113, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25113, "pid": 5, "tid": 7, "ts": 1716454217660377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630823, "dur": 6, "args": { "External id": 25113, "cbid": 211, "correlation": 25113 } }, { "ph": "s", "id": 25113, "pid": 76337, "tid": -914061504, "ts": 1716454217630823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217660695, "dur": 1, "args": { "External id": 25115, "device": 5, "context": 1, "stream": 7, "correlation": 25115, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 25115, "pid": 5, "tid": 7, "ts": 1716454217660695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217630835, "dur": 7, "args": { "External id": 25115, "cbid": 51, "correlation": 25115 } }, { "ph": "s", "id": 25115, "pid": 76337, "tid": -914061504, "ts": 1716454217630835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217660698, "dur": 486, "args": { "External id": 25116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25116, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25116, "pid": 5, "tid": 7, "ts": 1716454217660698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630842, "dur": 6, "args": { "External id": 25116, "cbid": 211, "correlation": 25116 } }, { "ph": "s", "id": 25116, "pid": 76337, "tid": -914061504, "ts": 1716454217630842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661186, "dur": 5, "args": { "External id": 25118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25118, "pid": 5, "tid": 7, "ts": 1716454217661186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630853, "dur": 5, "args": { "External id": 25118, "cbid": 211, "correlation": 25118 } }, { "ph": "s", "id": 25118, "pid": 76337, "tid": -914061504, "ts": 1716454217630853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661192, "dur": 6, "args": { "External id": 25124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25124, "pid": 5, "tid": 7, "ts": 1716454217661192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630883, "dur": 9, "args": { "External id": 25124, "cbid": 211, "correlation": 25124 } }, { "ph": "s", "id": 25124, "pid": 76337, "tid": -914061504, "ts": 1716454217630883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217661200, "dur": 3, "args": { "External id": 25132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25132, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 25132, "pid": 5, "tid": 7, "ts": 1716454217661200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217630927, "dur": 9, "args": { "External id": 25132, "cbid": 211, "correlation": 25132 } }, { "ph": "s", "id": 25132, "pid": 76337, "tid": -914061504, "ts": 1716454217630927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217631011, "dur": 1, "args": { "External id": 25148, "cbid": 251, "correlation": 25148 } }, { "ph": "f", "id": 25148, "pid": 76337, "tid": -914061504, "ts": 1716454217631011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217631016, "dur": 0, "args": { "External id": 25150, "cbid": 251, "correlation": 25150 } }, { "ph": "f", "id": 25150, "pid": 76337, "tid": -914061504, "ts": 1716454217631016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217661204, "dur": 11, "args": { "External id": 25151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25151, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25151, "pid": 5, "tid": 7, "ts": 1716454217661204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631018, "dur": 13, "args": { "External id": 25151, "cbid": 211, "correlation": 25151 } }, { "ph": "s", "id": 25151, "pid": 76337, "tid": -914061504, "ts": 1716454217631018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217661216, "dur": 5, "args": { "External id": 25153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25153, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25153, "pid": 5, "tid": 7, "ts": 1716454217661216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631034, "dur": 5, "args": { "External id": 25153, "cbid": 211, "correlation": 25153 } }, { "ph": "s", "id": 25153, "pid": 76337, "tid": -914061504, "ts": 1716454217631034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661222, "dur": 6, "args": { "External id": 25163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25163, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25163, "pid": 5, "tid": 7, "ts": 1716454217661222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631092, "dur": 12, "args": { "External id": 25163, "cbid": 211, "correlation": 25163 } }, { "ph": "s", "id": 25163, "pid": 76337, "tid": -914061504, "ts": 1716454217631092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217661229, "dur": 9, "args": { "External id": 25183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25183, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25183, "pid": 5, "tid": 7, "ts": 1716454217661229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631159, "dur": 11, "args": { "External id": 25183, "cbid": 211, "correlation": 25183 } }, { "ph": "s", "id": 25183, "pid": 76337, "tid": -914061504, "ts": 1716454217631159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217661240, "dur": 3, "args": { "External id": 25195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25195, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 25195, "pid": 5, "tid": 7, "ts": 1716454217661240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631179, "dur": 6, "args": { "External id": 25195, "cbid": 211, "correlation": 25195 } }, { "ph": "s", "id": 25195, "pid": 76337, "tid": -914061504, "ts": 1716454217631179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661245, "dur": 7, "args": { "External id": 25198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25198, "pid": 5, "tid": 7, "ts": 1716454217661245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631198, "dur": 6, "args": { "External id": 25198, "cbid": 211, "correlation": 25198 } }, { "ph": "s", "id": 25198, "pid": 76337, "tid": -914061504, "ts": 1716454217631198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217661253, "dur": 4, "args": { "External id": 25207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25207, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25207, "pid": 5, "tid": 7, "ts": 1716454217661253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631237, "dur": 11, "args": { "External id": 25207, "cbid": 211, "correlation": 25207 } }, { "ph": "s", "id": 25207, "pid": 76337, "tid": -914061504, "ts": 1716454217631237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217631300, "dur": 0, "args": { "External id": 25217, "cbid": 317, "correlation": 25217 } }, { "ph": "f", "id": 25217, "pid": 76337, "tid": -914061504, "ts": 1716454217631300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217631301, "dur": 0, "args": { "External id": 25218, "cbid": 203, "correlation": 25218 } }, { "ph": "f", "id": 25218, "pid": 76337, "tid": -914061504, "ts": 1716454217631301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217631302, "dur": 0, "args": { "External id": 25219, "cbid": 205, "correlation": 25219 } }, { "ph": "f", "id": 25219, "pid": 76337, "tid": -914061504, "ts": 1716454217631302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661258, "dur": 5, "args": { "External id": 25223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25223, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25223, "pid": 5, "tid": 7, "ts": 1716454217661258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631315, "dur": 12, "args": { "External id": 25223, "cbid": 211, "correlation": 25223 } }, { "ph": "s", "id": 25223, "pid": 76337, "tid": -914061504, "ts": 1716454217631315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661265, "dur": 159, "args": { "External id": 25225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25225, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25225, "pid": 5, "tid": 7, "ts": 1716454217661265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631330, "dur": 5, "args": { "External id": 25225, "cbid": 211, "correlation": 25225 } }, { "ph": "s", "id": 25225, "pid": 76337, "tid": -914061504, "ts": 1716454217631330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217661426, "dur": 1, "args": { "External id": 25227, "device": 5, "context": 1, "stream": 7, "correlation": 25227, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 25227, "pid": 5, "tid": 7, "ts": 1716454217661426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217631341, "dur": 6, "args": { "External id": 25227, "cbid": 51, "correlation": 25227 } }, { "ph": "s", "id": 25227, "pid": 76337, "tid": -914061504, "ts": 1716454217631341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217661429, "dur": 255, "args": { "External id": 25228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25228, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25228, "pid": 5, "tid": 7, "ts": 1716454217661429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631348, "dur": 6, "args": { "External id": 25228, "cbid": 211, "correlation": 25228 } }, { "ph": "s", "id": 25228, "pid": 76337, "tid": -914061504, "ts": 1716454217631348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661685, "dur": 5, "args": { "External id": 25230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25230, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25230, "pid": 5, "tid": 7, "ts": 1716454217661685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631358, "dur": 5, "args": { "External id": 25230, "cbid": 211, "correlation": 25230 } }, { "ph": "s", "id": 25230, "pid": 76337, "tid": -914061504, "ts": 1716454217631358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661692, "dur": 6, "args": { "External id": 25236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25236, "pid": 5, "tid": 7, "ts": 1716454217661692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631386, "dur": 8, "args": { "External id": 25236, "cbid": 211, "correlation": 25236 } }, { "ph": "s", "id": 25236, "pid": 76337, "tid": -914061504, "ts": 1716454217631386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217631444, "dur": 0, "args": { "External id": 25246, "cbid": 317, "correlation": 25246 } }, { "ph": "f", "id": 25246, "pid": 76337, "tid": -914061504, "ts": 1716454217631444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217631445, "dur": 0, "args": { "External id": 25247, "cbid": 203, "correlation": 25247 } }, { "ph": "f", "id": 25247, "pid": 76337, "tid": -914061504, "ts": 1716454217631445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217631446, "dur": 0, "args": { "External id": 25248, "cbid": 205, "correlation": 25248 } }, { "ph": "f", "id": 25248, "pid": 76337, "tid": -914061504, "ts": 1716454217631446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661699, "dur": 7, "args": { "External id": 25252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25252, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25252, "pid": 5, "tid": 7, "ts": 1716454217661699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631461, "dur": 12, "args": { "External id": 25252, "cbid": 211, "correlation": 25252 } }, { "ph": "s", "id": 25252, "pid": 76337, "tid": -914061504, "ts": 1716454217631461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217661708, "dur": 3, "args": { "External id": 25254, "device": 5, "context": 1, "stream": 7, "correlation": 25254, "bytes": 4800, "memory bandwidth (GB/s)": 1.4285714285714286 } }, { "ph": "f", "id": 25254, "pid": 5, "tid": 7, "ts": 1716454217661708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217631479, "dur": 15, "args": { "External id": 25254, "cbid": 51, "correlation": 25254 } }, { "ph": "s", "id": 25254, "pid": 76337, "tid": -914061504, "ts": 1716454217631479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217661712, "dur": 97, "args": { "External id": 25255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25255, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 25255, "pid": 5, "tid": 7, "ts": 1716454217661712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631495, "dur": 7, "args": { "External id": 25255, "cbid": 211, "correlation": 25255 } }, { "ph": "s", "id": 25255, "pid": 76337, "tid": -914061504, "ts": 1716454217631495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661811, "dur": 6, "args": { "External id": 25257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25257, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25257, "pid": 5, "tid": 7, "ts": 1716454217661811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631506, "dur": 5, "args": { "External id": 25257, "cbid": 211, "correlation": 25257 } }, { "ph": "s", "id": 25257, "pid": 76337, "tid": -914061504, "ts": 1716454217631506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661817, "dur": 6, "args": { "External id": 25263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25263, "pid": 5, "tid": 7, "ts": 1716454217661817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631533, "dur": 9, "args": { "External id": 25263, "cbid": 211, "correlation": 25263 } }, { "ph": "s", "id": 25263, "pid": 76337, "tid": -914061504, "ts": 1716454217631533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217661825, "dur": 5, "args": { "External id": 25271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25271, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25271, "pid": 5, "tid": 7, "ts": 1716454217661825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631563, "dur": 8, "args": { "External id": 25271, "cbid": 211, "correlation": 25271 } }, { "ph": "s", "id": 25271, "pid": 76337, "tid": -914061504, "ts": 1716454217631563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217661831, "dur": 4, "args": { "External id": 25279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25279, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25279, "pid": 5, "tid": 7, "ts": 1716454217661831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631592, "dur": 8, "args": { "External id": 25279, "cbid": 211, "correlation": 25279 } }, { "ph": "s", "id": 25279, "pid": 76337, "tid": -914061504, "ts": 1716454217631592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217661837, "dur": 11, "args": { "External id": 25288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25288, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25288, "pid": 5, "tid": 7, "ts": 1716454217661837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631636, "dur": 10, "args": { "External id": 25288, "cbid": 211, "correlation": 25288 } }, { "ph": "s", "id": 25288, "pid": 76337, "tid": -914061504, "ts": 1716454217631636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217661849, "dur": 12, "args": { "External id": 25308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25308, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25308, "pid": 5, "tid": 7, "ts": 1716454217661849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631706, "dur": 12, "args": { "External id": 25308, "cbid": 211, "correlation": 25308 } }, { "ph": "s", "id": 25308, "pid": 76337, "tid": -914061504, "ts": 1716454217631706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217661862, "dur": 4, "args": { "External id": 25320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25320, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25320, "pid": 5, "tid": 7, "ts": 1716454217661862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631728, "dur": 6, "args": { "External id": 25320, "cbid": 211, "correlation": 25320 } }, { "ph": "s", "id": 25320, "pid": 76337, "tid": -914061504, "ts": 1716454217631728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217661868, "dur": 10, "args": { "External id": 25323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25323, "pid": 5, "tid": 7, "ts": 1716454217661868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631746, "dur": 6, "args": { "External id": 25323, "cbid": 211, "correlation": 25323 } }, { "ph": "s", "id": 25323, "pid": 76337, "tid": -914061504, "ts": 1716454217631746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217661879, "dur": 6, "args": { "External id": 25332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25332, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25332, "pid": 5, "tid": 7, "ts": 1716454217661879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631784, "dur": 10, "args": { "External id": 25332, "cbid": 211, "correlation": 25332 } }, { "ph": "s", "id": 25332, "pid": 76337, "tid": -914061504, "ts": 1716454217631784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217631854, "dur": 0, "args": { "External id": 25342, "cbid": 317, "correlation": 25342 } }, { "ph": "f", "id": 25342, "pid": 76337, "tid": -914061504, "ts": 1716454217631854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217631855, "dur": 0, "args": { "External id": 25343, "cbid": 203, "correlation": 25343 } }, { "ph": "f", "id": 25343, "pid": 76337, "tid": -914061504, "ts": 1716454217631855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217631856, "dur": 0, "args": { "External id": 25344, "cbid": 205, "correlation": 25344 } }, { "ph": "f", "id": 25344, "pid": 76337, "tid": -914061504, "ts": 1716454217631856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661886, "dur": 6, "args": { "External id": 25348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25348, "pid": 5, "tid": 7, "ts": 1716454217661886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631870, "dur": 12, "args": { "External id": 25348, "cbid": 211, "correlation": 25348 } }, { "ph": "s", "id": 25348, "pid": 76337, "tid": -914061504, "ts": 1716454217631870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217661894, "dur": 314, "args": { "External id": 25350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25350, "pid": 5, "tid": 7, "ts": 1716454217661894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631885, "dur": 5, "args": { "External id": 25350, "cbid": 211, "correlation": 25350 } }, { "ph": "s", "id": 25350, "pid": 76337, "tid": -914061504, "ts": 1716454217631885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217662210, "dur": 1, "args": { "External id": 25352, "device": 5, "context": 1, "stream": 7, "correlation": 25352, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 25352, "pid": 5, "tid": 7, "ts": 1716454217662210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217631896, "dur": 7, "args": { "External id": 25352, "cbid": 51, "correlation": 25352 } }, { "ph": "s", "id": 25352, "pid": 76337, "tid": -914061504, "ts": 1716454217631896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217662214, "dur": 488, "args": { "External id": 25353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25353, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25353, "pid": 5, "tid": 7, "ts": 1716454217662214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631904, "dur": 6, "args": { "External id": 25353, "cbid": 211, "correlation": 25353 } }, { "ph": "s", "id": 25353, "pid": 76337, "tid": -914061504, "ts": 1716454217631904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217662703, "dur": 5, "args": { "External id": 25355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25355, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25355, "pid": 5, "tid": 7, "ts": 1716454217662703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631913, "dur": 5, "args": { "External id": 25355, "cbid": 211, "correlation": 25355 } }, { "ph": "s", "id": 25355, "pid": 76337, "tid": -914061504, "ts": 1716454217631913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217662710, "dur": 6, "args": { "External id": 25361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25361, "pid": 5, "tid": 7, "ts": 1716454217662710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631942, "dur": 9, "args": { "External id": 25361, "cbid": 211, "correlation": 25361 } }, { "ph": "s", "id": 25361, "pid": 76337, "tid": -914061504, "ts": 1716454217631942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217662717, "dur": 3, "args": { "External id": 25369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25369, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 25369, "pid": 5, "tid": 7, "ts": 1716454217662717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217631995, "dur": 10, "args": { "External id": 25369, "cbid": 211, "correlation": 25369 } }, { "ph": "s", "id": 25369, "pid": 76337, "tid": -914061504, "ts": 1716454217631995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217632060, "dur": 1, "args": { "External id": 25385, "cbid": 251, "correlation": 25385 } }, { "ph": "f", "id": 25385, "pid": 76337, "tid": -914061504, "ts": 1716454217632060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217632065, "dur": 0, "args": { "External id": 25387, "cbid": 251, "correlation": 25387 } }, { "ph": "f", "id": 25387, "pid": 76337, "tid": -914061504, "ts": 1716454217632065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217662721, "dur": 13, "args": { "External id": 25388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25388, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25388, "pid": 5, "tid": 7, "ts": 1716454217662721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632067, "dur": 11, "args": { "External id": 25388, "cbid": 211, "correlation": 25388 } }, { "ph": "s", "id": 25388, "pid": 76337, "tid": -914061504, "ts": 1716454217632067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217662735, "dur": 5, "args": { "External id": 25390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25390, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25390, "pid": 5, "tid": 7, "ts": 1716454217662735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632080, "dur": 6, "args": { "External id": 25390, "cbid": 211, "correlation": 25390 } }, { "ph": "s", "id": 25390, "pid": 76337, "tid": -914061504, "ts": 1716454217632080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217662741, "dur": 6, "args": { "External id": 25400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25400, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25400, "pid": 5, "tid": 7, "ts": 1716454217662741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632137, "dur": 13, "args": { "External id": 25400, "cbid": 211, "correlation": 25400 } }, { "ph": "s", "id": 25400, "pid": 76337, "tid": -914061504, "ts": 1716454217632137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217662748, "dur": 9, "args": { "External id": 25420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25420, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25420, "pid": 5, "tid": 7, "ts": 1716454217662748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632204, "dur": 11, "args": { "External id": 25420, "cbid": 211, "correlation": 25420 } }, { "ph": "s", "id": 25420, "pid": 76337, "tid": -914061504, "ts": 1716454217632204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217662759, "dur": 4, "args": { "External id": 25432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25432, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 25432, "pid": 5, "tid": 7, "ts": 1716454217662759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632224, "dur": 6, "args": { "External id": 25432, "cbid": 211, "correlation": 25432 } }, { "ph": "s", "id": 25432, "pid": 76337, "tid": -914061504, "ts": 1716454217632224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217662764, "dur": 7, "args": { "External id": 25435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25435, "pid": 5, "tid": 7, "ts": 1716454217662764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632242, "dur": 7, "args": { "External id": 25435, "cbid": 211, "correlation": 25435 } }, { "ph": "s", "id": 25435, "pid": 76337, "tid": -914061504, "ts": 1716454217632242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217662772, "dur": 4, "args": { "External id": 25444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25444, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25444, "pid": 5, "tid": 7, "ts": 1716454217662772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632282, "dur": 10, "args": { "External id": 25444, "cbid": 211, "correlation": 25444 } }, { "ph": "s", "id": 25444, "pid": 76337, "tid": -914061504, "ts": 1716454217632282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217632360, "dur": 0, "args": { "External id": 25454, "cbid": 317, "correlation": 25454 } }, { "ph": "f", "id": 25454, "pid": 76337, "tid": -914061504, "ts": 1716454217632360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217632361, "dur": 0, "args": { "External id": 25455, "cbid": 203, "correlation": 25455 } }, { "ph": "f", "id": 25455, "pid": 76337, "tid": -914061504, "ts": 1716454217632361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217632362, "dur": 0, "args": { "External id": 25456, "cbid": 205, "correlation": 25456 } }, { "ph": "f", "id": 25456, "pid": 76337, "tid": -914061504, "ts": 1716454217632362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217662778, "dur": 5, "args": { "External id": 25460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25460, "pid": 5, "tid": 7, "ts": 1716454217662778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632375, "dur": 13, "args": { "External id": 25460, "cbid": 211, "correlation": 25460 } }, { "ph": "s", "id": 25460, "pid": 76337, "tid": -914061504, "ts": 1716454217632375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217662784, "dur": 159, "args": { "External id": 25462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25462, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25462, "pid": 5, "tid": 7, "ts": 1716454217662784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632391, "dur": 5, "args": { "External id": 25462, "cbid": 211, "correlation": 25462 } }, { "ph": "s", "id": 25462, "pid": 76337, "tid": -914061504, "ts": 1716454217632391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217662946, "dur": 1, "args": { "External id": 25464, "device": 5, "context": 1, "stream": 7, "correlation": 25464, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 25464, "pid": 5, "tid": 7, "ts": 1716454217662946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217632402, "dur": 6, "args": { "External id": 25464, "cbid": 51, "correlation": 25464 } }, { "ph": "s", "id": 25464, "pid": 76337, "tid": -914061504, "ts": 1716454217632402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217662949, "dur": 254, "args": { "External id": 25465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25465, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25465, "pid": 5, "tid": 7, "ts": 1716454217662949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632410, "dur": 6, "args": { "External id": 25465, "cbid": 211, "correlation": 25465 } }, { "ph": "s", "id": 25465, "pid": 76337, "tid": -914061504, "ts": 1716454217632410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217663204, "dur": 6, "args": { "External id": 25467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25467, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25467, "pid": 5, "tid": 7, "ts": 1716454217663204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632420, "dur": 5, "args": { "External id": 25467, "cbid": 211, "correlation": 25467 } }, { "ph": "s", "id": 25467, "pid": 76337, "tid": -914061504, "ts": 1716454217632420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217663212, "dur": 6, "args": { "External id": 25473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25473, "pid": 5, "tid": 7, "ts": 1716454217663212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632448, "dur": 9, "args": { "External id": 25473, "cbid": 211, "correlation": 25473 } }, { "ph": "s", "id": 25473, "pid": 76337, "tid": -914061504, "ts": 1716454217632448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217632508, "dur": 0, "args": { "External id": 25483, "cbid": 317, "correlation": 25483 } }, { "ph": "f", "id": 25483, "pid": 76337, "tid": -914061504, "ts": 1716454217632508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217632508, "dur": 0, "args": { "External id": 25484, "cbid": 203, "correlation": 25484 } }, { "ph": "f", "id": 25484, "pid": 76337, "tid": -914061504, "ts": 1716454217632508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217632509, "dur": 0, "args": { "External id": 25485, "cbid": 205, "correlation": 25485 } }, { "ph": "f", "id": 25485, "pid": 76337, "tid": -914061504, "ts": 1716454217632509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217663219, "dur": 7, "args": { "External id": 25489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25489, "pid": 5, "tid": 7, "ts": 1716454217663219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632521, "dur": 11, "args": { "External id": 25489, "cbid": 211, "correlation": 25489 } }, { "ph": "s", "id": 25489, "pid": 76337, "tid": -914061504, "ts": 1716454217632521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217663228, "dur": 3, "args": { "External id": 25491, "device": 5, "context": 1, "stream": 7, "correlation": 25491, "bytes": 4800, "memory bandwidth (GB/s)": 1.5151515151515151 } }, { "ph": "f", "id": 25491, "pid": 5, "tid": 7, "ts": 1716454217663228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217632537, "dur": 9, "args": { "External id": 25491, "cbid": 51, "correlation": 25491 } }, { "ph": "s", "id": 25491, "pid": 76337, "tid": -914061504, "ts": 1716454217632537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217663232, "dur": 93, "args": { "External id": 25492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25492, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 25492, "pid": 5, "tid": 7, "ts": 1716454217663232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632547, "dur": 6, "args": { "External id": 25492, "cbid": 211, "correlation": 25492 } }, { "ph": "s", "id": 25492, "pid": 76337, "tid": -914061504, "ts": 1716454217632547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217663326, "dur": 5, "args": { "External id": 25494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25494, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25494, "pid": 5, "tid": 7, "ts": 1716454217663326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632558, "dur": 5, "args": { "External id": 25494, "cbid": 211, "correlation": 25494 } }, { "ph": "s", "id": 25494, "pid": 76337, "tid": -914061504, "ts": 1716454217632558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217663333, "dur": 6, "args": { "External id": 25500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25500, "pid": 5, "tid": 7, "ts": 1716454217663333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632584, "dur": 8, "args": { "External id": 25500, "cbid": 211, "correlation": 25500 } }, { "ph": "s", "id": 25500, "pid": 76337, "tid": -914061504, "ts": 1716454217632584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217663341, "dur": 5, "args": { "External id": 25508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25508, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25508, "pid": 5, "tid": 7, "ts": 1716454217663341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632614, "dur": 8, "args": { "External id": 25508, "cbid": 211, "correlation": 25508 } }, { "ph": "s", "id": 25508, "pid": 76337, "tid": -914061504, "ts": 1716454217632614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217663347, "dur": 4, "args": { "External id": 25516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25516, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25516, "pid": 5, "tid": 7, "ts": 1716454217663347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632643, "dur": 8, "args": { "External id": 25516, "cbid": 211, "correlation": 25516 } }, { "ph": "s", "id": 25516, "pid": 76337, "tid": -914061504, "ts": 1716454217632643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217663352, "dur": 11, "args": { "External id": 25525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25525, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25525, "pid": 5, "tid": 7, "ts": 1716454217663352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632686, "dur": 11, "args": { "External id": 25525, "cbid": 211, "correlation": 25525 } }, { "ph": "s", "id": 25525, "pid": 76337, "tid": -914061504, "ts": 1716454217632686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217663365, "dur": 12, "args": { "External id": 25545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25545, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25545, "pid": 5, "tid": 7, "ts": 1716454217663365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632770, "dur": 12, "args": { "External id": 25545, "cbid": 211, "correlation": 25545 } }, { "ph": "s", "id": 25545, "pid": 76337, "tid": -914061504, "ts": 1716454217632770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217663378, "dur": 4, "args": { "External id": 25557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25557, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25557, "pid": 5, "tid": 7, "ts": 1716454217663378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632792, "dur": 6, "args": { "External id": 25557, "cbid": 211, "correlation": 25557 } }, { "ph": "s", "id": 25557, "pid": 76337, "tid": -914061504, "ts": 1716454217632792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217663383, "dur": 11, "args": { "External id": 25560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25560, "pid": 5, "tid": 7, "ts": 1716454217663383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632811, "dur": 7, "args": { "External id": 25560, "cbid": 211, "correlation": 25560 } }, { "ph": "s", "id": 25560, "pid": 76337, "tid": -914061504, "ts": 1716454217632811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217663395, "dur": 6, "args": { "External id": 25569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25569, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25569, "pid": 5, "tid": 7, "ts": 1716454217663395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632849, "dur": 10, "args": { "External id": 25569, "cbid": 211, "correlation": 25569 } }, { "ph": "s", "id": 25569, "pid": 76337, "tid": -914061504, "ts": 1716454217632849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217632902, "dur": 0, "args": { "External id": 25579, "cbid": 317, "correlation": 25579 } }, { "ph": "f", "id": 25579, "pid": 76337, "tid": -914061504, "ts": 1716454217632902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217632903, "dur": 0, "args": { "External id": 25580, "cbid": 203, "correlation": 25580 } }, { "ph": "f", "id": 25580, "pid": 76337, "tid": -914061504, "ts": 1716454217632903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217632903, "dur": 0, "args": { "External id": 25581, "cbid": 205, "correlation": 25581 } }, { "ph": "f", "id": 25581, "pid": 76337, "tid": -914061504, "ts": 1716454217632903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217663403, "dur": 6, "args": { "External id": 25585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25585, "pid": 5, "tid": 7, "ts": 1716454217663403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632917, "dur": 12, "args": { "External id": 25585, "cbid": 211, "correlation": 25585 } }, { "ph": "s", "id": 25585, "pid": 76337, "tid": -914061504, "ts": 1716454217632917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217663410, "dur": 313, "args": { "External id": 25587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25587, "pid": 5, "tid": 7, "ts": 1716454217663410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632932, "dur": 5, "args": { "External id": 25587, "cbid": 211, "correlation": 25587 } }, { "ph": "s", "id": 25587, "pid": 76337, "tid": -914061504, "ts": 1716454217632932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217663726, "dur": 1, "args": { "External id": 25589, "device": 5, "context": 1, "stream": 7, "correlation": 25589, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 25589, "pid": 5, "tid": 7, "ts": 1716454217663726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217632942, "dur": 6, "args": { "External id": 25589, "cbid": 51, "correlation": 25589 } }, { "ph": "s", "id": 25589, "pid": 76337, "tid": -914061504, "ts": 1716454217632942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217663729, "dur": 488, "args": { "External id": 25590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25590, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25590, "pid": 5, "tid": 7, "ts": 1716454217663729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632950, "dur": 6, "args": { "External id": 25590, "cbid": 211, "correlation": 25590 } }, { "ph": "s", "id": 25590, "pid": 76337, "tid": -914061504, "ts": 1716454217632950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664218, "dur": 6, "args": { "External id": 25592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25592, "pid": 5, "tid": 7, "ts": 1716454217664218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632960, "dur": 5, "args": { "External id": 25592, "cbid": 211, "correlation": 25592 } }, { "ph": "s", "id": 25592, "pid": 76337, "tid": -914061504, "ts": 1716454217632960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217664225, "dur": 6, "args": { "External id": 25598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25598, "pid": 5, "tid": 7, "ts": 1716454217664225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217632997, "dur": 10, "args": { "External id": 25598, "cbid": 211, "correlation": 25598 } }, { "ph": "s", "id": 25598, "pid": 76337, "tid": -914061504, "ts": 1716454217632997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217664233, "dur": 3, "args": { "External id": 25606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25606, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 25606, "pid": 5, "tid": 7, "ts": 1716454217664233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633042, "dur": 9, "args": { "External id": 25606, "cbid": 211, "correlation": 25606 } }, { "ph": "s", "id": 25606, "pid": 76337, "tid": -914061504, "ts": 1716454217633042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217633116, "dur": 1, "args": { "External id": 25622, "cbid": 251, "correlation": 25622 } }, { "ph": "f", "id": 25622, "pid": 76337, "tid": -914061504, "ts": 1716454217633116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217633121, "dur": 0, "args": { "External id": 25624, "cbid": 251, "correlation": 25624 } }, { "ph": "f", "id": 25624, "pid": 76337, "tid": -914061504, "ts": 1716454217633121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217664237, "dur": 12, "args": { "External id": 25625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25625, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25625, "pid": 5, "tid": 7, "ts": 1716454217664237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633123, "dur": 13, "args": { "External id": 25625, "cbid": 211, "correlation": 25625 } }, { "ph": "s", "id": 25625, "pid": 76337, "tid": -914061504, "ts": 1716454217633123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217664250, "dur": 5, "args": { "External id": 25627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25627, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25627, "pid": 5, "tid": 7, "ts": 1716454217664250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633137, "dur": 5, "args": { "External id": 25627, "cbid": 211, "correlation": 25627 } }, { "ph": "s", "id": 25627, "pid": 76337, "tid": -914061504, "ts": 1716454217633137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217664257, "dur": 6, "args": { "External id": 25637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25637, "pid": 5, "tid": 7, "ts": 1716454217664257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633195, "dur": 12, "args": { "External id": 25637, "cbid": 211, "correlation": 25637 } }, { "ph": "s", "id": 25637, "pid": 76337, "tid": -914061504, "ts": 1716454217633195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217664264, "dur": 9, "args": { "External id": 25657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25657, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25657, "pid": 5, "tid": 7, "ts": 1716454217664264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633261, "dur": 10, "args": { "External id": 25657, "cbid": 211, "correlation": 25657 } }, { "ph": "s", "id": 25657, "pid": 76337, "tid": -914061504, "ts": 1716454217633261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217664275, "dur": 3, "args": { "External id": 25669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25669, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 25669, "pid": 5, "tid": 7, "ts": 1716454217664275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633281, "dur": 6, "args": { "External id": 25669, "cbid": 211, "correlation": 25669 } }, { "ph": "s", "id": 25669, "pid": 76337, "tid": -914061504, "ts": 1716454217633281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217664279, "dur": 7, "args": { "External id": 25672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25672, "pid": 5, "tid": 7, "ts": 1716454217664279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633300, "dur": 6, "args": { "External id": 25672, "cbid": 211, "correlation": 25672 } }, { "ph": "s", "id": 25672, "pid": 76337, "tid": -914061504, "ts": 1716454217633300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217664287, "dur": 4, "args": { "External id": 25681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25681, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25681, "pid": 5, "tid": 7, "ts": 1716454217664287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633340, "dur": 10, "args": { "External id": 25681, "cbid": 211, "correlation": 25681 } }, { "ph": "s", "id": 25681, "pid": 76337, "tid": -914061504, "ts": 1716454217633340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217633416, "dur": 0, "args": { "External id": 25691, "cbid": 317, "correlation": 25691 } }, { "ph": "f", "id": 25691, "pid": 76337, "tid": -914061504, "ts": 1716454217633416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217633417, "dur": 0, "args": { "External id": 25692, "cbid": 203, "correlation": 25692 } }, { "ph": "f", "id": 25692, "pid": 76337, "tid": -914061504, "ts": 1716454217633417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217633418, "dur": 0, "args": { "External id": 25693, "cbid": 205, "correlation": 25693 } }, { "ph": "f", "id": 25693, "pid": 76337, "tid": -914061504, "ts": 1716454217633418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664293, "dur": 5, "args": { "External id": 25697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25697, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25697, "pid": 5, "tid": 7, "ts": 1716454217664293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633431, "dur": 13, "args": { "External id": 25697, "cbid": 211, "correlation": 25697 } }, { "ph": "s", "id": 25697, "pid": 76337, "tid": -914061504, "ts": 1716454217633431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664299, "dur": 159, "args": { "External id": 25699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25699, "pid": 5, "tid": 7, "ts": 1716454217664299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633447, "dur": 5, "args": { "External id": 25699, "cbid": 211, "correlation": 25699 } }, { "ph": "s", "id": 25699, "pid": 76337, "tid": -914061504, "ts": 1716454217633447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217664461, "dur": 1, "args": { "External id": 25701, "device": 5, "context": 1, "stream": 7, "correlation": 25701, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 25701, "pid": 5, "tid": 7, "ts": 1716454217664461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217633458, "dur": 6, "args": { "External id": 25701, "cbid": 51, "correlation": 25701 } }, { "ph": "s", "id": 25701, "pid": 76337, "tid": -914061504, "ts": 1716454217633458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217664464, "dur": 254, "args": { "External id": 25702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25702, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25702, "pid": 5, "tid": 7, "ts": 1716454217664464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633465, "dur": 6, "args": { "External id": 25702, "cbid": 211, "correlation": 25702 } }, { "ph": "s", "id": 25702, "pid": 76337, "tid": -914061504, "ts": 1716454217633465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664720, "dur": 6, "args": { "External id": 25704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25704, "pid": 5, "tid": 7, "ts": 1716454217664720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633475, "dur": 5, "args": { "External id": 25704, "cbid": 211, "correlation": 25704 } }, { "ph": "s", "id": 25704, "pid": 76337, "tid": -914061504, "ts": 1716454217633475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217664726, "dur": 6, "args": { "External id": 25710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25710, "pid": 5, "tid": 7, "ts": 1716454217664726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633504, "dur": 8, "args": { "External id": 25710, "cbid": 211, "correlation": 25710 } }, { "ph": "s", "id": 25710, "pid": 76337, "tid": -914061504, "ts": 1716454217633504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217633562, "dur": 0, "args": { "External id": 25720, "cbid": 317, "correlation": 25720 } }, { "ph": "f", "id": 25720, "pid": 76337, "tid": -914061504, "ts": 1716454217633562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217633563, "dur": 0, "args": { "External id": 25721, "cbid": 203, "correlation": 25721 } }, { "ph": "f", "id": 25721, "pid": 76337, "tid": -914061504, "ts": 1716454217633563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217633563, "dur": 0, "args": { "External id": 25722, "cbid": 205, "correlation": 25722 } }, { "ph": "f", "id": 25722, "pid": 76337, "tid": -914061504, "ts": 1716454217633563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664734, "dur": 8, "args": { "External id": 25726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25726, "pid": 5, "tid": 7, "ts": 1716454217664734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633575, "dur": 12, "args": { "External id": 25726, "cbid": 211, "correlation": 25726 } }, { "ph": "s", "id": 25726, "pid": 76337, "tid": -914061504, "ts": 1716454217633575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217664743, "dur": 3, "args": { "External id": 25728, "device": 5, "context": 1, "stream": 7, "correlation": 25728, "bytes": 4800, "memory bandwidth (GB/s)": 1.5004688965301656 } }, { "ph": "f", "id": 25728, "pid": 5, "tid": 7, "ts": 1716454217664743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217633592, "dur": 10, "args": { "External id": 25728, "cbid": 51, "correlation": 25728 } }, { "ph": "s", "id": 25728, "pid": 76337, "tid": -914061504, "ts": 1716454217633592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217664747, "dur": 93, "args": { "External id": 25729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25729, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 25729, "pid": 5, "tid": 7, "ts": 1716454217664747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633602, "dur": 6, "args": { "External id": 25729, "cbid": 211, "correlation": 25729 } }, { "ph": "s", "id": 25729, "pid": 76337, "tid": -914061504, "ts": 1716454217633602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664842, "dur": 5, "args": { "External id": 25731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25731, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25731, "pid": 5, "tid": 7, "ts": 1716454217664842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633612, "dur": 5, "args": { "External id": 25731, "cbid": 211, "correlation": 25731 } }, { "ph": "s", "id": 25731, "pid": 76337, "tid": -914061504, "ts": 1716454217633612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217664849, "dur": 6, "args": { "External id": 25737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25737, "pid": 5, "tid": 7, "ts": 1716454217664849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633638, "dur": 9, "args": { "External id": 25737, "cbid": 211, "correlation": 25737 } }, { "ph": "s", "id": 25737, "pid": 76337, "tid": -914061504, "ts": 1716454217633638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217664856, "dur": 5, "args": { "External id": 25745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25745, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25745, "pid": 5, "tid": 7, "ts": 1716454217664856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633668, "dur": 8, "args": { "External id": 25745, "cbid": 211, "correlation": 25745 } }, { "ph": "s", "id": 25745, "pid": 76337, "tid": -914061504, "ts": 1716454217633668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217664862, "dur": 4, "args": { "External id": 25753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25753, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 25753, "pid": 5, "tid": 7, "ts": 1716454217664862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633696, "dur": 9, "args": { "External id": 25753, "cbid": 211, "correlation": 25753 } }, { "ph": "s", "id": 25753, "pid": 76337, "tid": -914061504, "ts": 1716454217633696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454217664868, "dur": 14, "args": { "External id": 25764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25764, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25764, "pid": 5, "tid": 7, "ts": 1716454217664868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633777, "dur": 13, "args": { "External id": 25764, "cbid": 211, "correlation": 25764 } }, { "ph": "s", "id": 25764, "pid": 76337, "tid": -914061504, "ts": 1716454217633777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217633833, "dur": 0, "args": { "External id": 25774, "cbid": 317, "correlation": 25774 } }, { "ph": "f", "id": 25774, "pid": 76337, "tid": -914061504, "ts": 1716454217633833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217633834, "dur": 0, "args": { "External id": 25775, "cbid": 203, "correlation": 25775 } }, { "ph": "f", "id": 25775, "pid": 76337, "tid": -914061504, "ts": 1716454217633834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217633834, "dur": 0, "args": { "External id": 25776, "cbid": 205, "correlation": 25776 } }, { "ph": "f", "id": 25776, "pid": 76337, "tid": -914061504, "ts": 1716454217633834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664883, "dur": 9, "args": { "External id": 25780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25780, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25780, "pid": 5, "tid": 7, "ts": 1716454217664883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633848, "dur": 12, "args": { "External id": 25780, "cbid": 211, "correlation": 25780 } }, { "ph": "s", "id": 25780, "pid": 76337, "tid": -914061504, "ts": 1716454217633848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217664893, "dur": 160, "args": { "External id": 25782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25782, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25782, "pid": 5, "tid": 7, "ts": 1716454217664893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633862, "dur": 5, "args": { "External id": 25782, "cbid": 211, "correlation": 25782 } }, { "ph": "s", "id": 25782, "pid": 76337, "tid": -914061504, "ts": 1716454217633862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217665055, "dur": 1, "args": { "External id": 25784, "device": 5, "context": 1, "stream": 7, "correlation": 25784, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 25784, "pid": 5, "tid": 7, "ts": 1716454217665055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217633873, "dur": 6, "args": { "External id": 25784, "cbid": 51, "correlation": 25784 } }, { "ph": "s", "id": 25784, "pid": 76337, "tid": -914061504, "ts": 1716454217633873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217665059, "dur": 641, "args": { "External id": 25785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25785, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25785, "pid": 5, "tid": 7, "ts": 1716454217665059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633880, "dur": 6, "args": { "External id": 25785, "cbid": 211, "correlation": 25785 } }, { "ph": "s", "id": 25785, "pid": 76337, "tid": -914061504, "ts": 1716454217633880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217665701, "dur": 12, "args": { "External id": 25787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25787, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25787, "pid": 5, "tid": 7, "ts": 1716454217665701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633890, "dur": 5, "args": { "External id": 25787, "cbid": 211, "correlation": 25787 } }, { "ph": "s", "id": 25787, "pid": 76337, "tid": -914061504, "ts": 1716454217633890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217665714, "dur": 14, "args": { "External id": 25793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25793, "pid": 5, "tid": 7, "ts": 1716454217665714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217633918, "dur": 8, "args": { "External id": 25793, "cbid": 211, "correlation": 25793 } }, { "ph": "s", "id": 25793, "pid": 76337, "tid": -914061504, "ts": 1716454217633918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217665730, "dur": 29, "args": { "External id": 25802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25802, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25802, "pid": 5, "tid": 7, "ts": 1716454217665730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634050, "dur": 15, "args": { "External id": 25802, "cbid": 211, "correlation": 25802 } }, { "ph": "s", "id": 25802, "pid": 76337, "tid": -914061504, "ts": 1716454217634050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217665761, "dur": 29, "args": { "External id": 25822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25822, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25822, "pid": 5, "tid": 7, "ts": 1716454217665761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634133, "dur": 12, "args": { "External id": 25822, "cbid": 211, "correlation": 25822 } }, { "ph": "s", "id": 25822, "pid": 76337, "tid": -914061504, "ts": 1716454217634133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217665791, "dur": 4, "args": { "External id": 25834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25834, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25834, "pid": 5, "tid": 7, "ts": 1716454217665791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634164, "dur": 8, "args": { "External id": 25834, "cbid": 211, "correlation": 25834 } }, { "ph": "s", "id": 25834, "pid": 76337, "tid": -914061504, "ts": 1716454217634164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217665797, "dur": 31, "args": { "External id": 25837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25837, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25837, "pid": 5, "tid": 7, "ts": 1716454217665797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634185, "dur": 6, "args": { "External id": 25837, "cbid": 211, "correlation": 25837 } }, { "ph": "s", "id": 25837, "pid": 76337, "tid": -914061504, "ts": 1716454217634185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217665828, "dur": 21, "args": { "External id": 25846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25846, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25846, "pid": 5, "tid": 7, "ts": 1716454217665828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634227, "dur": 10, "args": { "External id": 25846, "cbid": 211, "correlation": 25846 } }, { "ph": "s", "id": 25846, "pid": 76337, "tid": -914061504, "ts": 1716454217634227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217634302, "dur": 0, "args": { "External id": 25856, "cbid": 317, "correlation": 25856 } }, { "ph": "f", "id": 25856, "pid": 76337, "tid": -914061504, "ts": 1716454217634302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217634303, "dur": 0, "args": { "External id": 25857, "cbid": 203, "correlation": 25857 } }, { "ph": "f", "id": 25857, "pid": 76337, "tid": -914061504, "ts": 1716454217634303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217634303, "dur": 0, "args": { "External id": 25858, "cbid": 205, "correlation": 25858 } }, { "ph": "f", "id": 25858, "pid": 76337, "tid": -914061504, "ts": 1716454217634303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217665851, "dur": 21, "args": { "External id": 25862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25862, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25862, "pid": 5, "tid": 7, "ts": 1716454217665851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634332, "dur": 13, "args": { "External id": 25862, "cbid": 211, "correlation": 25862 } }, { "ph": "s", "id": 25862, "pid": 76337, "tid": -914061504, "ts": 1716454217634332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217665874, "dur": 315, "args": { "External id": 25864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25864, "pid": 5, "tid": 7, "ts": 1716454217665874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634347, "dur": 5, "args": { "External id": 25864, "cbid": 211, "correlation": 25864 } }, { "ph": "s", "id": 25864, "pid": 76337, "tid": -914061504, "ts": 1716454217634347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217666191, "dur": 1, "args": { "External id": 25866, "device": 5, "context": 1, "stream": 7, "correlation": 25866, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 25866, "pid": 5, "tid": 7, "ts": 1716454217666191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217634358, "dur": 6, "args": { "External id": 25866, "cbid": 51, "correlation": 25866 } }, { "ph": "s", "id": 25866, "pid": 76337, "tid": -914061504, "ts": 1716454217634358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217666195, "dur": 1222, "args": { "External id": 25867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25867, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25867, "pid": 5, "tid": 7, "ts": 1716454217666195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634366, "dur": 7, "args": { "External id": 25867, "cbid": 211, "correlation": 25867 } }, { "ph": "s", "id": 25867, "pid": 76337, "tid": -914061504, "ts": 1716454217634366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217667419, "dur": 12, "args": { "External id": 25869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25869, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25869, "pid": 5, "tid": 7, "ts": 1716454217667419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634376, "dur": 5, "args": { "External id": 25869, "cbid": 211, "correlation": 25869 } }, { "ph": "s", "id": 25869, "pid": 76337, "tid": -914061504, "ts": 1716454217634376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217667432, "dur": 14, "args": { "External id": 25875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25875, "pid": 5, "tid": 7, "ts": 1716454217667432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634413, "dur": 11, "args": { "External id": 25875, "cbid": 211, "correlation": 25875 } }, { "ph": "s", "id": 25875, "pid": 76337, "tid": -914061504, "ts": 1716454217634413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217667448, "dur": 3, "args": { "External id": 25883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25883, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 25883, "pid": 5, "tid": 7, "ts": 1716454217667448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634462, "dur": 10, "args": { "External id": 25883, "cbid": 211, "correlation": 25883 } }, { "ph": "s", "id": 25883, "pid": 76337, "tid": -914061504, "ts": 1716454217634462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217634536, "dur": 1, "args": { "External id": 25899, "cbid": 251, "correlation": 25899 } }, { "ph": "f", "id": 25899, "pid": 76337, "tid": -914061504, "ts": 1716454217634536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217634541, "dur": 0, "args": { "External id": 25901, "cbid": 251, "correlation": 25901 } }, { "ph": "f", "id": 25901, "pid": 76337, "tid": -914061504, "ts": 1716454217634541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217667453, "dur": 13, "args": { "External id": 25902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25902, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25902, "pid": 5, "tid": 7, "ts": 1716454217667453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634543, "dur": 12, "args": { "External id": 25902, "cbid": 211, "correlation": 25902 } }, { "ph": "s", "id": 25902, "pid": 76337, "tid": -914061504, "ts": 1716454217634543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217667467, "dur": 5, "args": { "External id": 25904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25904, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25904, "pid": 5, "tid": 7, "ts": 1716454217667467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634557, "dur": 5, "args": { "External id": 25904, "cbid": 211, "correlation": 25904 } }, { "ph": "s", "id": 25904, "pid": 76337, "tid": -914061504, "ts": 1716454217634557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217667473, "dur": 17, "args": { "External id": 25914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25914, "pid": 5, "tid": 7, "ts": 1716454217667473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634615, "dur": 12, "args": { "External id": 25914, "cbid": 211, "correlation": 25914 } }, { "ph": "s", "id": 25914, "pid": 76337, "tid": -914061504, "ts": 1716454217634615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217667491, "dur": 17, "args": { "External id": 25934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25934, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 25934, "pid": 5, "tid": 7, "ts": 1716454217667491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634681, "dur": 11, "args": { "External id": 25934, "cbid": 211, "correlation": 25934 } }, { "ph": "s", "id": 25934, "pid": 76337, "tid": -914061504, "ts": 1716454217634681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217667510, "dur": 4, "args": { "External id": 25946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25946, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 25946, "pid": 5, "tid": 7, "ts": 1716454217667510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634703, "dur": 6, "args": { "External id": 25946, "cbid": 211, "correlation": 25946 } }, { "ph": "s", "id": 25946, "pid": 76337, "tid": -914061504, "ts": 1716454217634703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217667515, "dur": 16, "args": { "External id": 25949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25949, "pid": 5, "tid": 7, "ts": 1716454217667515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634720, "dur": 6, "args": { "External id": 25949, "cbid": 211, "correlation": 25949 } }, { "ph": "s", "id": 25949, "pid": 76337, "tid": -914061504, "ts": 1716454217634720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217667533, "dur": 11, "args": { "External id": 25958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25958, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25958, "pid": 5, "tid": 7, "ts": 1716454217667533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634761, "dur": 10, "args": { "External id": 25958, "cbid": 211, "correlation": 25958 } }, { "ph": "s", "id": 25958, "pid": 76337, "tid": -914061504, "ts": 1716454217634761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217634824, "dur": 0, "args": { "External id": 25968, "cbid": 317, "correlation": 25968 } }, { "ph": "f", "id": 25968, "pid": 76337, "tid": -914061504, "ts": 1716454217634824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217634825, "dur": 0, "args": { "External id": 25969, "cbid": 203, "correlation": 25969 } }, { "ph": "f", "id": 25969, "pid": 76337, "tid": -914061504, "ts": 1716454217634825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217634825, "dur": 0, "args": { "External id": 25970, "cbid": 205, "correlation": 25970 } }, { "ph": "f", "id": 25970, "pid": 76337, "tid": -914061504, "ts": 1716454217634825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217667545, "dur": 11, "args": { "External id": 25974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25974, "pid": 5, "tid": 7, "ts": 1716454217667545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634853, "dur": 13, "args": { "External id": 25974, "cbid": 211, "correlation": 25974 } }, { "ph": "s", "id": 25974, "pid": 76337, "tid": -914061504, "ts": 1716454217634853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217667557, "dur": 160, "args": { "External id": 25976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25976, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25976, "pid": 5, "tid": 7, "ts": 1716454217667557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634868, "dur": 5, "args": { "External id": 25976, "cbid": 211, "correlation": 25976 } }, { "ph": "s", "id": 25976, "pid": 76337, "tid": -914061504, "ts": 1716454217634868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217667720, "dur": 1, "args": { "External id": 25978, "device": 5, "context": 1, "stream": 7, "correlation": 25978, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 25978, "pid": 5, "tid": 7, "ts": 1716454217667720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217634879, "dur": 7, "args": { "External id": 25978, "cbid": 51, "correlation": 25978 } }, { "ph": "s", "id": 25978, "pid": 76337, "tid": -914061504, "ts": 1716454217634879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217667724, "dur": 640, "args": { "External id": 25979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25979, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 25979, "pid": 5, "tid": 7, "ts": 1716454217667724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634887, "dur": 6, "args": { "External id": 25979, "cbid": 211, "correlation": 25979 } }, { "ph": "s", "id": 25979, "pid": 76337, "tid": -914061504, "ts": 1716454217634887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217668365, "dur": 13, "args": { "External id": 25981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25981, "pid": 5, "tid": 7, "ts": 1716454217668365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634897, "dur": 5, "args": { "External id": 25981, "cbid": 211, "correlation": 25981 } }, { "ph": "s", "id": 25981, "pid": 76337, "tid": -914061504, "ts": 1716454217634897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217668379, "dur": 14, "args": { "External id": 25987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 25987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 25987, "pid": 5, "tid": 7, "ts": 1716454217668379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217634926, "dur": 9, "args": { "External id": 25987, "cbid": 211, "correlation": 25987 } }, { "ph": "s", "id": 25987, "pid": 76337, "tid": -914061504, "ts": 1716454217634926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217634994, "dur": 0, "args": { "External id": 25997, "cbid": 317, "correlation": 25997 } }, { "ph": "f", "id": 25997, "pid": 76337, "tid": -914061504, "ts": 1716454217634994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217634995, "dur": 0, "args": { "External id": 25998, "cbid": 203, "correlation": 25998 } }, { "ph": "f", "id": 25998, "pid": 76337, "tid": -914061504, "ts": 1716454217634995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217634995, "dur": 0, "args": { "External id": 25999, "cbid": 205, "correlation": 25999 } }, { "ph": "f", "id": 25999, "pid": 76337, "tid": -914061504, "ts": 1716454217634995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217668395, "dur": 21, "args": { "External id": 26003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26003, "pid": 5, "tid": 7, "ts": 1716454217668395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635019, "dur": 13, "args": { "External id": 26003, "cbid": 211, "correlation": 26003 } }, { "ph": "s", "id": 26003, "pid": 76337, "tid": -914061504, "ts": 1716454217635019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217668417, "dur": 4, "args": { "External id": 26005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26005, "pid": 5, "tid": 7, "ts": 1716454217668417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635038, "dur": 7, "args": { "External id": 26005, "cbid": 211, "correlation": 26005 } }, { "ph": "s", "id": 26005, "pid": 76337, "tid": -914061504, "ts": 1716454217635038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217635049, "dur": 0, "args": { "External id": 26006, "cbid": 51, "correlation": 26006 } }, { "ph": "s", "id": 26006, "pid": 76337, "tid": -914061504, "ts": 1716454217635049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217668422, "dur": 172, "args": { "External id": 26007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26007, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 26007, "pid": 5, "tid": 7, "ts": 1716454217668422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635050, "dur": 6, "args": { "External id": 26007, "cbid": 211, "correlation": 26007 } }, { "ph": "s", "id": 26007, "pid": 76337, "tid": -914061504, "ts": 1716454217635050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217668596, "dur": 16, "args": { "External id": 26012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26012, "pid": 5, "tid": 7, "ts": 1716454217668596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635078, "dur": 9, "args": { "External id": 26012, "cbid": 211, "correlation": 26012 } }, { "ph": "s", "id": 26012, "pid": 76337, "tid": -914061504, "ts": 1716454217635078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217668613, "dur": 12, "args": { "External id": 26020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26020, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26020, "pid": 5, "tid": 7, "ts": 1716454217668613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635109, "dur": 8, "args": { "External id": 26020, "cbid": 211, "correlation": 26020 } }, { "ph": "s", "id": 26020, "pid": 76337, "tid": -914061504, "ts": 1716454217635109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217668626, "dur": 10, "args": { "External id": 26028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26028, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26028, "pid": 5, "tid": 7, "ts": 1716454217668626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635138, "dur": 9, "args": { "External id": 26028, "cbid": 211, "correlation": 26028 } }, { "ph": "s", "id": 26028, "pid": 76337, "tid": -914061504, "ts": 1716454217635138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217668637, "dur": 18, "args": { "External id": 26048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26048, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 26048, "pid": 5, "tid": 7, "ts": 1716454217668637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635221, "dur": 12, "args": { "External id": 26048, "cbid": 211, "correlation": 26048 } }, { "ph": "s", "id": 26048, "pid": 76337, "tid": -914061504, "ts": 1716454217635221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217668657, "dur": 5, "args": { "External id": 26060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26060, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 26060, "pid": 5, "tid": 7, "ts": 1716454217668657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635243, "dur": 6, "args": { "External id": 26060, "cbid": 211, "correlation": 26060 } }, { "ph": "s", "id": 26060, "pid": 76337, "tid": -914061504, "ts": 1716454217635243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217668663, "dur": 16, "args": { "External id": 26063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26063, "pid": 5, "tid": 7, "ts": 1716454217668663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635261, "dur": 6, "args": { "External id": 26063, "cbid": 211, "correlation": 26063 } }, { "ph": "s", "id": 26063, "pid": 76337, "tid": -914061504, "ts": 1716454217635261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217635319, "dur": 0, "args": { "External id": 26074, "cbid": 317, "correlation": 26074 } }, { "ph": "f", "id": 26074, "pid": 76337, "tid": -914061504, "ts": 1716454217635319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217635320, "dur": 0, "args": { "External id": 26075, "cbid": 203, "correlation": 26075 } }, { "ph": "f", "id": 26075, "pid": 76337, "tid": -914061504, "ts": 1716454217635320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217635321, "dur": 0, "args": { "External id": 26076, "cbid": 205, "correlation": 26076 } }, { "ph": "f", "id": 26076, "pid": 76337, "tid": -914061504, "ts": 1716454217635321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217668680, "dur": 11, "args": { "External id": 26080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26080, "pid": 5, "tid": 7, "ts": 1716454217668680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635334, "dur": 12, "args": { "External id": 26080, "cbid": 211, "correlation": 26080 } }, { "ph": "s", "id": 26080, "pid": 76337, "tid": -914061504, "ts": 1716454217635334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217668692, "dur": 3, "args": { "External id": 26082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26082, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26082, "pid": 5, "tid": 7, "ts": 1716454217668692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635350, "dur": 6, "args": { "External id": 26082, "cbid": 211, "correlation": 26082 } }, { "ph": "s", "id": 26082, "pid": 76337, "tid": -914061504, "ts": 1716454217635350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217635359, "dur": 0, "args": { "External id": 26083, "cbid": 51, "correlation": 26083 } }, { "ph": "s", "id": 26083, "pid": 76337, "tid": -914061504, "ts": 1716454217635359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217668697, "dur": 89, "args": { "External id": 26084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26084, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 26084, "pid": 5, "tid": 7, "ts": 1716454217668697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635360, "dur": 5, "args": { "External id": 26084, "cbid": 211, "correlation": 26084 } }, { "ph": "s", "id": 26084, "pid": 76337, "tid": -914061504, "ts": 1716454217635360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217668787, "dur": 15, "args": { "External id": 26089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26089, "pid": 5, "tid": 7, "ts": 1716454217668787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635386, "dur": 8, "args": { "External id": 26089, "cbid": 211, "correlation": 26089 } }, { "ph": "s", "id": 26089, "pid": 76337, "tid": -914061504, "ts": 1716454217635386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217668804, "dur": 82, "args": { "External id": 26098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26098, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26098, "pid": 5, "tid": 7, "ts": 1716454217668804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635469, "dur": 15, "args": { "External id": 26098, "cbid": 211, "correlation": 26098 } }, { "ph": "s", "id": 26098, "pid": 76337, "tid": -914061504, "ts": 1716454217635469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217668887, "dur": 30, "args": { "External id": 26120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26120, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26120, "pid": 5, "tid": 7, "ts": 1716454217668887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635531, "dur": 10, "args": { "External id": 26120, "cbid": 211, "correlation": 26120 } }, { "ph": "s", "id": 26120, "pid": 76337, "tid": -914061504, "ts": 1716454217635531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217635644, "dur": 2, "args": { "External id": 26131, "cbid": 251, "correlation": 26131 } }, { "ph": "f", "id": 26131, "pid": 76337, "tid": -914061504, "ts": 1716454217635644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217668918, "dur": 161, "args": { "External id": 26132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26132, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26132, "pid": 5, "tid": 7, "ts": 1716454217668918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635650, "dur": 14, "args": { "External id": 26132, "cbid": 211, "correlation": 26132 } }, { "ph": "s", "id": 26132, "pid": 76337, "tid": -914061504, "ts": 1716454217635650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217635733, "dur": 1, "args": { "External id": 26143, "cbid": 251, "correlation": 26143 } }, { "ph": "f", "id": 26143, "pid": 76337, "tid": -914061504, "ts": 1716454217635733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217669081, "dur": 157, "args": { "External id": 26144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26144, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26144, "pid": 5, "tid": 7, "ts": 1716454217669081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635736, "dur": 12, "args": { "External id": 26144, "cbid": 211, "correlation": 26144 } }, { "ph": "s", "id": 26144, "pid": 76337, "tid": -914061504, "ts": 1716454217635736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217635802, "dur": 1, "args": { "External id": 26155, "cbid": 251, "correlation": 26155 } }, { "ph": "f", "id": 26155, "pid": 76337, "tid": -914061504, "ts": 1716454217635802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217669239, "dur": 150, "args": { "External id": 26156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26156, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26156, "pid": 5, "tid": 7, "ts": 1716454217669239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635806, "dur": 11, "args": { "External id": 26156, "cbid": 211, "correlation": 26156 } }, { "ph": "s", "id": 26156, "pid": 76337, "tid": -914061504, "ts": 1716454217635806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217669391, "dur": 329, "args": { "External id": 26181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26181, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26181, "pid": 5, "tid": 7, "ts": 1716454217669391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217635896, "dur": 14, "args": { "External id": 26181, "cbid": 211, "correlation": 26181 } }, { "ph": "s", "id": 26181, "pid": 76337, "tid": -914061504, "ts": 1716454217635896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636021, "dur": 2, "args": { "External id": 26199, "cbid": 251, "correlation": 26199 } }, { "ph": "f", "id": 26199, "pid": 76337, "tid": -914061504, "ts": 1716454217636021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217669722, "dur": 164, "args": { "External id": 26201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26201, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26201, "pid": 5, "tid": 7, "ts": 1716454217669722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636027, "dur": 14, "args": { "External id": 26201, "cbid": 211, "correlation": 26201 } }, { "ph": "s", "id": 26201, "pid": 76337, "tid": -914061504, "ts": 1716454217636027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217669887, "dur": 20, "args": { "External id": 26209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26209, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26209, "pid": 5, "tid": 7, "ts": 1716454217669887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636100, "dur": 13, "args": { "External id": 26209, "cbid": 211, "correlation": 26209 } }, { "ph": "s", "id": 26209, "pid": 76337, "tid": -914061504, "ts": 1716454217636100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217669907, "dur": 28, "args": { "External id": 26217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26217, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26217, "pid": 5, "tid": 7, "ts": 1716454217669907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636141, "dur": 8, "args": { "External id": 26217, "cbid": 211, "correlation": 26217 } }, { "ph": "s", "id": 26217, "pid": 76337, "tid": -914061504, "ts": 1716454217636141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217669936, "dur": 18, "args": { "External id": 26228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26228, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26228, "pid": 5, "tid": 7, "ts": 1716454217669936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636217, "dur": 13, "args": { "External id": 26228, "cbid": 211, "correlation": 26228 } }, { "ph": "s", "id": 26228, "pid": 76337, "tid": -914061504, "ts": 1716454217636217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217669956, "dur": 16, "args": { "External id": 26250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26250, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26250, "pid": 5, "tid": 7, "ts": 1716454217669956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636248, "dur": 8, "args": { "External id": 26250, "cbid": 211, "correlation": 26250 } }, { "ph": "s", "id": 26250, "pid": 76337, "tid": -914061504, "ts": 1716454217636248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636349, "dur": 2, "args": { "External id": 26261, "cbid": 251, "correlation": 26261 } }, { "ph": "f", "id": 26261, "pid": 76337, "tid": -914061504, "ts": 1716454217636349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217669973, "dur": 88, "args": { "External id": 26262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26262, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26262, "pid": 5, "tid": 7, "ts": 1716454217669973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636355, "dur": 15, "args": { "External id": 26262, "cbid": 211, "correlation": 26262 } }, { "ph": "s", "id": 26262, "pid": 76337, "tid": -914061504, "ts": 1716454217636355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636439, "dur": 1, "args": { "External id": 26273, "cbid": 251, "correlation": 26273 } }, { "ph": "f", "id": 26273, "pid": 76337, "tid": -914061504, "ts": 1716454217636439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636443, "dur": 0, "args": { "External id": 26274, "cbid": 251, "correlation": 26274 } }, { "ph": "f", "id": 26274, "pid": 76337, "tid": -914061504, "ts": 1716454217636443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217670062, "dur": 12, "args": { "External id": 26275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26275, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26275, "pid": 5, "tid": 7, "ts": 1716454217670062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636445, "dur": 13, "args": { "External id": 26275, "cbid": 211, "correlation": 26275 } }, { "ph": "s", "id": 26275, "pid": 76337, "tid": -914061504, "ts": 1716454217636445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217670076, "dur": 5, "args": { "External id": 26277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26277, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26277, "pid": 5, "tid": 7, "ts": 1716454217670076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636460, "dur": 7, "args": { "External id": 26277, "cbid": 211, "correlation": 26277 } }, { "ph": "s", "id": 26277, "pid": 76337, "tid": -914061504, "ts": 1716454217636460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636530, "dur": 1, "args": { "External id": 26288, "cbid": 251, "correlation": 26288 } }, { "ph": "f", "id": 26288, "pid": 76337, "tid": -914061504, "ts": 1716454217636530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636534, "dur": 0, "args": { "External id": 26289, "cbid": 251, "correlation": 26289 } }, { "ph": "f", "id": 26289, "pid": 76337, "tid": -914061504, "ts": 1716454217636534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217670083, "dur": 8, "args": { "External id": 26290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26290, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26290, "pid": 5, "tid": 7, "ts": 1716454217670083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636535, "dur": 13, "args": { "External id": 26290, "cbid": 211, "correlation": 26290 } }, { "ph": "s", "id": 26290, "pid": 76337, "tid": -914061504, "ts": 1716454217636535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217670093, "dur": 3, "args": { "External id": 26292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26292, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26292, "pid": 5, "tid": 7, "ts": 1716454217670093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636550, "dur": 6, "args": { "External id": 26292, "cbid": 211, "correlation": 26292 } }, { "ph": "s", "id": 26292, "pid": 76337, "tid": -914061504, "ts": 1716454217636550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217670097, "dur": 54, "args": { "External id": 26317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26317, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26317, "pid": 5, "tid": 7, "ts": 1716454217670097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636630, "dur": 13, "args": { "External id": 26317, "cbid": 211, "correlation": 26317 } }, { "ph": "s", "id": 26317, "pid": 76337, "tid": -914061504, "ts": 1716454217636630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217636743, "dur": 2, "args": { "External id": 26335, "cbid": 251, "correlation": 26335 } }, { "ph": "f", "id": 26335, "pid": 76337, "tid": -914061504, "ts": 1716454217636743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217670152, "dur": 90, "args": { "External id": 26337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26337, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26337, "pid": 5, "tid": 7, "ts": 1716454217670152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636749, "dur": 15, "args": { "External id": 26337, "cbid": 211, "correlation": 26337 } }, { "ph": "s", "id": 26337, "pid": 76337, "tid": -914061504, "ts": 1716454217636749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217670244, "dur": 9, "args": { "External id": 26345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26345, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26345, "pid": 5, "tid": 7, "ts": 1716454217670244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636821, "dur": 14, "args": { "External id": 26345, "cbid": 211, "correlation": 26345 } }, { "ph": "s", "id": 26345, "pid": 76337, "tid": -914061504, "ts": 1716454217636821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217670254, "dur": 21, "args": { "External id": 26353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26353, "pid": 5, "tid": 7, "ts": 1716454217670254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636865, "dur": 9, "args": { "External id": 26353, "cbid": 211, "correlation": 26353 } }, { "ph": "s", "id": 26353, "pid": 76337, "tid": -914061504, "ts": 1716454217636865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217670276, "dur": 17, "args": { "External id": 26375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26375, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26375, "pid": 5, "tid": 7, "ts": 1716454217670276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217636929, "dur": 12, "args": { "External id": 26375, "cbid": 211, "correlation": 26375 } }, { "ph": "s", "id": 26375, "pid": 76337, "tid": -914061504, "ts": 1716454217636929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217637030, "dur": 1, "args": { "External id": 26391, "cbid": 251, "correlation": 26391 } }, { "ph": "f", "id": 26391, "pid": 76337, "tid": -914061504, "ts": 1716454217637030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217637035, "dur": 0, "args": { "External id": 26393, "cbid": 251, "correlation": 26393 } }, { "ph": "f", "id": 26393, "pid": 76337, "tid": -914061504, "ts": 1716454217637035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217670295, "dur": 496, "args": { "External id": 26394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26394, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26394, "pid": 5, "tid": 7, "ts": 1716454217670295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637037, "dur": 14, "args": { "External id": 26394, "cbid": 211, "correlation": 26394 } }, { "ph": "s", "id": 26394, "pid": 76337, "tid": -914061504, "ts": 1716454217637037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217670793, "dur": 65, "args": { "External id": 26402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26402, "pid": 5, "tid": 7, "ts": 1716454217670793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637109, "dur": 12, "args": { "External id": 26402, "cbid": 211, "correlation": 26402 } }, { "ph": "s", "id": 26402, "pid": 76337, "tid": -914061504, "ts": 1716454217637109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217670859, "dur": 66, "args": { "External id": 26410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26410, "pid": 5, "tid": 7, "ts": 1716454217670859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637142, "dur": 8, "args": { "External id": 26410, "cbid": 211, "correlation": 26410 } }, { "ph": "s", "id": 26410, "pid": 76337, "tid": -914061504, "ts": 1716454217637142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217637234, "dur": 1, "args": { "External id": 26426, "cbid": 251, "correlation": 26426 } }, { "ph": "f", "id": 26426, "pid": 76337, "tid": -914061504, "ts": 1716454217637234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217670928, "dur": 1, "args": { "External id": 26428, "device": 5, "context": 1, "stream": 7, "correlation": 26428, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 26428, "pid": 5, "tid": 7, "ts": 1716454217670928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217637240, "dur": 11, "args": { "External id": 26428, "cbid": 51, "correlation": 26428 } }, { "ph": "s", "id": 26428, "pid": 76337, "tid": -914061504, "ts": 1716454217637240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217670931, "dur": 268, "args": { "External id": 26429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26429, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26429, "pid": 5, "tid": 7, "ts": 1716454217670931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637252, "dur": 12, "args": { "External id": 26429, "cbid": 211, "correlation": 26429 } }, { "ph": "s", "id": 26429, "pid": 76337, "tid": -914061504, "ts": 1716454217637252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217671201, "dur": 14, "args": { "External id": 26437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26437, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26437, "pid": 5, "tid": 7, "ts": 1716454217671201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637296, "dur": 11, "args": { "External id": 26437, "cbid": 211, "correlation": 26437 } }, { "ph": "s", "id": 26437, "pid": 76337, "tid": -914061504, "ts": 1716454217637296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217671216, "dur": 37, "args": { "External id": 26448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26448, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26448, "pid": 5, "tid": 7, "ts": 1716454217671216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637366, "dur": 12, "args": { "External id": 26448, "cbid": 211, "correlation": 26448 } }, { "ph": "s", "id": 26448, "pid": 76337, "tid": -914061504, "ts": 1716454217637366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217637432, "dur": 0, "args": { "External id": 26460, "cbid": 317, "correlation": 26460 } }, { "ph": "f", "id": 26460, "pid": 76337, "tid": -914061504, "ts": 1716454217637432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217637433, "dur": 0, "args": { "External id": 26461, "cbid": 203, "correlation": 26461 } }, { "ph": "f", "id": 26461, "pid": 76337, "tid": -914061504, "ts": 1716454217637433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217637434, "dur": 0, "args": { "External id": 26462, "cbid": 205, "correlation": 26462 } }, { "ph": "f", "id": 26462, "pid": 76337, "tid": -914061504, "ts": 1716454217637434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217671255, "dur": 12, "args": { "External id": 26466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26466, "pid": 5, "tid": 7, "ts": 1716454217671255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637449, "dur": 12, "args": { "External id": 26466, "cbid": 211, "correlation": 26466 } }, { "ph": "s", "id": 26466, "pid": 76337, "tid": -914061504, "ts": 1716454217637449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217671268, "dur": 4, "args": { "External id": 26468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26468, "pid": 5, "tid": 7, "ts": 1716454217671268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637466, "dur": 6, "args": { "External id": 26468, "cbid": 211, "correlation": 26468 } }, { "ph": "s", "id": 26468, "pid": 76337, "tid": -914061504, "ts": 1716454217637466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217637474, "dur": 0, "args": { "External id": 26469, "cbid": 51, "correlation": 26469 } }, { "ph": "s", "id": 26469, "pid": 76337, "tid": -914061504, "ts": 1716454217637474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217671273, "dur": 94, "args": { "External id": 26470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26470, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 26470, "pid": 5, "tid": 7, "ts": 1716454217671273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637475, "dur": 5, "args": { "External id": 26470, "cbid": 211, "correlation": 26470 } }, { "ph": "s", "id": 26470, "pid": 76337, "tid": -914061504, "ts": 1716454217637475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217671368, "dur": 16, "args": { "External id": 26475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26475, "pid": 5, "tid": 7, "ts": 1716454217671368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637503, "dur": 10, "args": { "External id": 26475, "cbid": 211, "correlation": 26475 } }, { "ph": "s", "id": 26475, "pid": 76337, "tid": -914061504, "ts": 1716454217637503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217671386, "dur": 12, "args": { "External id": 26483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26483, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26483, "pid": 5, "tid": 7, "ts": 1716454217671386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637535, "dur": 8, "args": { "External id": 26483, "cbid": 211, "correlation": 26483 } }, { "ph": "s", "id": 26483, "pid": 76337, "tid": -914061504, "ts": 1716454217637535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217671398, "dur": 29, "args": { "External id": 26492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26492, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26492, "pid": 5, "tid": 7, "ts": 1716454217671398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637575, "dur": 10, "args": { "External id": 26492, "cbid": 211, "correlation": 26492 } }, { "ph": "s", "id": 26492, "pid": 76337, "tid": -914061504, "ts": 1716454217637575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217671429, "dur": 31, "args": { "External id": 26512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26512, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 26512, "pid": 5, "tid": 7, "ts": 1716454217671429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637649, "dur": 11, "args": { "External id": 26512, "cbid": 211, "correlation": 26512 } }, { "ph": "s", "id": 26512, "pid": 76337, "tid": -914061504, "ts": 1716454217637649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217671461, "dur": 5, "args": { "External id": 26524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26524, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26524, "pid": 5, "tid": 7, "ts": 1716454217671461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637682, "dur": 8, "args": { "External id": 26524, "cbid": 211, "correlation": 26524 } }, { "ph": "s", "id": 26524, "pid": 76337, "tid": -914061504, "ts": 1716454217637682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217671467, "dur": 31, "args": { "External id": 26527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26527, "pid": 5, "tid": 7, "ts": 1716454217671467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637702, "dur": 7, "args": { "External id": 26527, "cbid": 211, "correlation": 26527 } }, { "ph": "s", "id": 26527, "pid": 76337, "tid": -914061504, "ts": 1716454217637702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217671499, "dur": 21, "args": { "External id": 26536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26536, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26536, "pid": 5, "tid": 7, "ts": 1716454217671499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637744, "dur": 9, "args": { "External id": 26536, "cbid": 211, "correlation": 26536 } }, { "ph": "s", "id": 26536, "pid": 76337, "tid": -914061504, "ts": 1716454217637744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217637807, "dur": 0, "args": { "External id": 26546, "cbid": 317, "correlation": 26546 } }, { "ph": "f", "id": 26546, "pid": 76337, "tid": -914061504, "ts": 1716454217637807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217637808, "dur": 0, "args": { "External id": 26547, "cbid": 203, "correlation": 26547 } }, { "ph": "f", "id": 26547, "pid": 76337, "tid": -914061504, "ts": 1716454217637808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217637809, "dur": 0, "args": { "External id": 26548, "cbid": 205, "correlation": 26548 } }, { "ph": "f", "id": 26548, "pid": 76337, "tid": -914061504, "ts": 1716454217637809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217671521, "dur": 22, "args": { "External id": 26552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26552, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26552, "pid": 5, "tid": 7, "ts": 1716454217671521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637830, "dur": 12, "args": { "External id": 26552, "cbid": 211, "correlation": 26552 } }, { "ph": "s", "id": 26552, "pid": 76337, "tid": -914061504, "ts": 1716454217637830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217671544, "dur": 315, "args": { "External id": 26554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26554, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26554, "pid": 5, "tid": 7, "ts": 1716454217671544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637844, "dur": 5, "args": { "External id": 26554, "cbid": 211, "correlation": 26554 } }, { "ph": "s", "id": 26554, "pid": 76337, "tid": -914061504, "ts": 1716454217637844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217671861, "dur": 1, "args": { "External id": 26556, "device": 5, "context": 1, "stream": 7, "correlation": 26556, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 26556, "pid": 5, "tid": 7, "ts": 1716454217671861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217637856, "dur": 6, "args": { "External id": 26556, "cbid": 51, "correlation": 26556 } }, { "ph": "s", "id": 26556, "pid": 76337, "tid": -914061504, "ts": 1716454217637856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217671865, "dur": 1238, "args": { "External id": 26557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26557, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26557, "pid": 5, "tid": 7, "ts": 1716454217671865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637864, "dur": 6, "args": { "External id": 26557, "cbid": 211, "correlation": 26557 } }, { "ph": "s", "id": 26557, "pid": 76337, "tid": -914061504, "ts": 1716454217637864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217673104, "dur": 12, "args": { "External id": 26559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26559, "pid": 5, "tid": 7, "ts": 1716454217673104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637874, "dur": 6, "args": { "External id": 26559, "cbid": 211, "correlation": 26559 } }, { "ph": "s", "id": 26559, "pid": 76337, "tid": -914061504, "ts": 1716454217637874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217673117, "dur": 15, "args": { "External id": 26565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26565, "pid": 5, "tid": 7, "ts": 1716454217673117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637911, "dur": 10, "args": { "External id": 26565, "cbid": 211, "correlation": 26565 } }, { "ph": "s", "id": 26565, "pid": 76337, "tid": -914061504, "ts": 1716454217637911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217673133, "dur": 3, "args": { "External id": 26573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26573, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 26573, "pid": 5, "tid": 7, "ts": 1716454217673133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217637958, "dur": 10, "args": { "External id": 26573, "cbid": 211, "correlation": 26573 } }, { "ph": "s", "id": 26573, "pid": 76337, "tid": -914061504, "ts": 1716454217637958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217638034, "dur": 1, "args": { "External id": 26589, "cbid": 251, "correlation": 26589 } }, { "ph": "f", "id": 26589, "pid": 76337, "tid": -914061504, "ts": 1716454217638034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217638039, "dur": 0, "args": { "External id": 26591, "cbid": 251, "correlation": 26591 } }, { "ph": "f", "id": 26591, "pid": 76337, "tid": -914061504, "ts": 1716454217638039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217673138, "dur": 12, "args": { "External id": 26592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26592, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26592, "pid": 5, "tid": 7, "ts": 1716454217673138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638041, "dur": 12, "args": { "External id": 26592, "cbid": 211, "correlation": 26592 } }, { "ph": "s", "id": 26592, "pid": 76337, "tid": -914061504, "ts": 1716454217638041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217673152, "dur": 5, "args": { "External id": 26594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26594, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26594, "pid": 5, "tid": 7, "ts": 1716454217673152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638055, "dur": 5, "args": { "External id": 26594, "cbid": 211, "correlation": 26594 } }, { "ph": "s", "id": 26594, "pid": 76337, "tid": -914061504, "ts": 1716454217638055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217673158, "dur": 16, "args": { "External id": 26604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26604, "pid": 5, "tid": 7, "ts": 1716454217673158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638113, "dur": 12, "args": { "External id": 26604, "cbid": 211, "correlation": 26604 } }, { "ph": "s", "id": 26604, "pid": 76337, "tid": -914061504, "ts": 1716454217638113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217673175, "dur": 17, "args": { "External id": 26624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26624, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 26624, "pid": 5, "tid": 7, "ts": 1716454217673175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638180, "dur": 11, "args": { "External id": 26624, "cbid": 211, "correlation": 26624 } }, { "ph": "s", "id": 26624, "pid": 76337, "tid": -914061504, "ts": 1716454217638180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217673194, "dur": 4, "args": { "External id": 26636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26636, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 26636, "pid": 5, "tid": 7, "ts": 1716454217673194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638201, "dur": 6, "args": { "External id": 26636, "cbid": 211, "correlation": 26636 } }, { "ph": "s", "id": 26636, "pid": 76337, "tid": -914061504, "ts": 1716454217638201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217673199, "dur": 16, "args": { "External id": 26639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26639, "pid": 5, "tid": 7, "ts": 1716454217673199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638220, "dur": 6, "args": { "External id": 26639, "cbid": 211, "correlation": 26639 } }, { "ph": "s", "id": 26639, "pid": 76337, "tid": -914061504, "ts": 1716454217638220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217673216, "dur": 11, "args": { "External id": 26648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26648, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26648, "pid": 5, "tid": 7, "ts": 1716454217673216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638262, "dur": 10, "args": { "External id": 26648, "cbid": 211, "correlation": 26648 } }, { "ph": "s", "id": 26648, "pid": 76337, "tid": -914061504, "ts": 1716454217638262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217638325, "dur": 0, "args": { "External id": 26658, "cbid": 317, "correlation": 26658 } }, { "ph": "f", "id": 26658, "pid": 76337, "tid": -914061504, "ts": 1716454217638325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217638326, "dur": 0, "args": { "External id": 26659, "cbid": 203, "correlation": 26659 } }, { "ph": "f", "id": 26659, "pid": 76337, "tid": -914061504, "ts": 1716454217638326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217638327, "dur": 0, "args": { "External id": 26660, "cbid": 205, "correlation": 26660 } }, { "ph": "f", "id": 26660, "pid": 76337, "tid": -914061504, "ts": 1716454217638327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217673228, "dur": 11, "args": { "External id": 26664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26664, "pid": 5, "tid": 7, "ts": 1716454217673228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638340, "dur": 13, "args": { "External id": 26664, "cbid": 211, "correlation": 26664 } }, { "ph": "s", "id": 26664, "pid": 76337, "tid": -914061504, "ts": 1716454217638340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217673240, "dur": 161, "args": { "External id": 26666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26666, "pid": 5, "tid": 7, "ts": 1716454217673240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638356, "dur": 5, "args": { "External id": 26666, "cbid": 211, "correlation": 26666 } }, { "ph": "s", "id": 26666, "pid": 76337, "tid": -914061504, "ts": 1716454217638356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217673403, "dur": 1, "args": { "External id": 26668, "device": 5, "context": 1, "stream": 7, "correlation": 26668, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 26668, "pid": 5, "tid": 7, "ts": 1716454217673403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217638367, "dur": 6, "args": { "External id": 26668, "cbid": 51, "correlation": 26668 } }, { "ph": "s", "id": 26668, "pid": 76337, "tid": -914061504, "ts": 1716454217638367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217673407, "dur": 639, "args": { "External id": 26669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26669, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26669, "pid": 5, "tid": 7, "ts": 1716454217673407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638374, "dur": 6, "args": { "External id": 26669, "cbid": 211, "correlation": 26669 } }, { "ph": "s", "id": 26669, "pid": 76337, "tid": -914061504, "ts": 1716454217638374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217674047, "dur": 12, "args": { "External id": 26671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26671, "pid": 5, "tid": 7, "ts": 1716454217674047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638384, "dur": 5, "args": { "External id": 26671, "cbid": 211, "correlation": 26671 } }, { "ph": "s", "id": 26671, "pid": 76337, "tid": -914061504, "ts": 1716454217638384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217674061, "dur": 15, "args": { "External id": 26677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26677, "pid": 5, "tid": 7, "ts": 1716454217674061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638414, "dur": 9, "args": { "External id": 26677, "cbid": 211, "correlation": 26677 } }, { "ph": "s", "id": 26677, "pid": 76337, "tid": -914061504, "ts": 1716454217638414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217638472, "dur": 0, "args": { "External id": 26687, "cbid": 317, "correlation": 26687 } }, { "ph": "f", "id": 26687, "pid": 76337, "tid": -914061504, "ts": 1716454217638472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217638473, "dur": 0, "args": { "External id": 26688, "cbid": 203, "correlation": 26688 } }, { "ph": "f", "id": 26688, "pid": 76337, "tid": -914061504, "ts": 1716454217638473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217638474, "dur": 0, "args": { "External id": 26689, "cbid": 205, "correlation": 26689 } }, { "ph": "f", "id": 26689, "pid": 76337, "tid": -914061504, "ts": 1716454217638474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217674077, "dur": 22, "args": { "External id": 26693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26693, "pid": 5, "tid": 7, "ts": 1716454217674077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638485, "dur": 11, "args": { "External id": 26693, "cbid": 211, "correlation": 26693 } }, { "ph": "s", "id": 26693, "pid": 76337, "tid": -914061504, "ts": 1716454217638485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217674100, "dur": 4, "args": { "External id": 26695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26695, "pid": 5, "tid": 7, "ts": 1716454217674100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638501, "dur": 6, "args": { "External id": 26695, "cbid": 211, "correlation": 26695 } }, { "ph": "s", "id": 26695, "pid": 76337, "tid": -914061504, "ts": 1716454217638501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217638510, "dur": 0, "args": { "External id": 26696, "cbid": 51, "correlation": 26696 } }, { "ph": "s", "id": 26696, "pid": 76337, "tid": -914061504, "ts": 1716454217638510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217674105, "dur": 167, "args": { "External id": 26697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26697, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 26697, "pid": 5, "tid": 7, "ts": 1716454217674105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638511, "dur": 5, "args": { "External id": 26697, "cbid": 211, "correlation": 26697 } }, { "ph": "s", "id": 26697, "pid": 76337, "tid": -914061504, "ts": 1716454217638511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217674274, "dur": 16, "args": { "External id": 26702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26702, "pid": 5, "tid": 7, "ts": 1716454217674274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638536, "dur": 8, "args": { "External id": 26702, "cbid": 211, "correlation": 26702 } }, { "ph": "s", "id": 26702, "pid": 76337, "tid": -914061504, "ts": 1716454217638536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217674291, "dur": 12, "args": { "External id": 26710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26710, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26710, "pid": 5, "tid": 7, "ts": 1716454217674291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638565, "dur": 8, "args": { "External id": 26710, "cbid": 211, "correlation": 26710 } }, { "ph": "s", "id": 26710, "pid": 76337, "tid": -914061504, "ts": 1716454217638565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217674304, "dur": 10, "args": { "External id": 26718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26718, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26718, "pid": 5, "tid": 7, "ts": 1716454217674304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638594, "dur": 8, "args": { "External id": 26718, "cbid": 211, "correlation": 26718 } }, { "ph": "s", "id": 26718, "pid": 76337, "tid": -914061504, "ts": 1716454217638594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217674315, "dur": 18, "args": { "External id": 26738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26738, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 26738, "pid": 5, "tid": 7, "ts": 1716454217674315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638678, "dur": 12, "args": { "External id": 26738, "cbid": 211, "correlation": 26738 } }, { "ph": "s", "id": 26738, "pid": 76337, "tid": -914061504, "ts": 1716454217638678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217674334, "dur": 4, "args": { "External id": 26750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26750, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 26750, "pid": 5, "tid": 7, "ts": 1716454217674334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638700, "dur": 6, "args": { "External id": 26750, "cbid": 211, "correlation": 26750 } }, { "ph": "s", "id": 26750, "pid": 76337, "tid": -914061504, "ts": 1716454217638700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217674339, "dur": 16, "args": { "External id": 26753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26753, "pid": 5, "tid": 7, "ts": 1716454217674339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638718, "dur": 7, "args": { "External id": 26753, "cbid": 211, "correlation": 26753 } }, { "ph": "s", "id": 26753, "pid": 76337, "tid": -914061504, "ts": 1716454217638718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217638775, "dur": 0, "args": { "External id": 26764, "cbid": 317, "correlation": 26764 } }, { "ph": "f", "id": 26764, "pid": 76337, "tid": -914061504, "ts": 1716454217638775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217638776, "dur": 0, "args": { "External id": 26765, "cbid": 203, "correlation": 26765 } }, { "ph": "f", "id": 26765, "pid": 76337, "tid": -914061504, "ts": 1716454217638776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217638777, "dur": 0, "args": { "External id": 26766, "cbid": 205, "correlation": 26766 } }, { "ph": "f", "id": 26766, "pid": 76337, "tid": -914061504, "ts": 1716454217638777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217674357, "dur": 11, "args": { "External id": 26770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26770, "pid": 5, "tid": 7, "ts": 1716454217674357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638791, "dur": 11, "args": { "External id": 26770, "cbid": 211, "correlation": 26770 } }, { "ph": "s", "id": 26770, "pid": 76337, "tid": -914061504, "ts": 1716454217638791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217674370, "dur": 3, "args": { "External id": 26772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26772, "pid": 5, "tid": 7, "ts": 1716454217674370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638806, "dur": 6, "args": { "External id": 26772, "cbid": 211, "correlation": 26772 } }, { "ph": "s", "id": 26772, "pid": 76337, "tid": -914061504, "ts": 1716454217638806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217638814, "dur": 0, "args": { "External id": 26773, "cbid": 51, "correlation": 26773 } }, { "ph": "s", "id": 26773, "pid": 76337, "tid": -914061504, "ts": 1716454217638814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217674374, "dur": 90, "args": { "External id": 26774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26774, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 26774, "pid": 5, "tid": 7, "ts": 1716454217674374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638815, "dur": 5, "args": { "External id": 26774, "cbid": 211, "correlation": 26774 } }, { "ph": "s", "id": 26774, "pid": 76337, "tid": -914061504, "ts": 1716454217638815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217674466, "dur": 16, "args": { "External id": 26779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26779, "pid": 5, "tid": 7, "ts": 1716454217674466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638842, "dur": 9, "args": { "External id": 26779, "cbid": 211, "correlation": 26779 } }, { "ph": "s", "id": 26779, "pid": 76337, "tid": -914061504, "ts": 1716454217638842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217674483, "dur": 82, "args": { "External id": 26788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26788, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26788, "pid": 5, "tid": 7, "ts": 1716454217674483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638924, "dur": 14, "args": { "External id": 26788, "cbid": 211, "correlation": 26788 } }, { "ph": "s", "id": 26788, "pid": 76337, "tid": -914061504, "ts": 1716454217638924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217674566, "dur": 30, "args": { "External id": 26810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26810, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26810, "pid": 5, "tid": 7, "ts": 1716454217674566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217638990, "dur": 11, "args": { "External id": 26810, "cbid": 211, "correlation": 26810 } }, { "ph": "s", "id": 26810, "pid": 76337, "tid": -914061504, "ts": 1716454217638990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639104, "dur": 1, "args": { "External id": 26821, "cbid": 251, "correlation": 26821 } }, { "ph": "f", "id": 26821, "pid": 76337, "tid": -914061504, "ts": 1716454217639104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217674597, "dur": 163, "args": { "External id": 26822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26822, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26822, "pid": 5, "tid": 7, "ts": 1716454217674597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639109, "dur": 14, "args": { "External id": 26822, "cbid": 211, "correlation": 26822 } }, { "ph": "s", "id": 26822, "pid": 76337, "tid": -914061504, "ts": 1716454217639109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639195, "dur": 1, "args": { "External id": 26833, "cbid": 251, "correlation": 26833 } }, { "ph": "f", "id": 26833, "pid": 76337, "tid": -914061504, "ts": 1716454217639195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217674761, "dur": 156, "args": { "External id": 26834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26834, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26834, "pid": 5, "tid": 7, "ts": 1716454217674761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639199, "dur": 13, "args": { "External id": 26834, "cbid": 211, "correlation": 26834 } }, { "ph": "s", "id": 26834, "pid": 76337, "tid": -914061504, "ts": 1716454217639199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639279, "dur": 1, "args": { "External id": 26845, "cbid": 251, "correlation": 26845 } }, { "ph": "f", "id": 26845, "pid": 76337, "tid": -914061504, "ts": 1716454217639279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217674919, "dur": 156, "args": { "External id": 26846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26846, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26846, "pid": 5, "tid": 7, "ts": 1716454217674919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639283, "dur": 12, "args": { "External id": 26846, "cbid": 211, "correlation": 26846 } }, { "ph": "s", "id": 26846, "pid": 76337, "tid": -914061504, "ts": 1716454217639283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217675076, "dur": 332, "args": { "External id": 26871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26871, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26871, "pid": 5, "tid": 7, "ts": 1716454217675076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639371, "dur": 13, "args": { "External id": 26871, "cbid": 211, "correlation": 26871 } }, { "ph": "s", "id": 26871, "pid": 76337, "tid": -914061504, "ts": 1716454217639371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639487, "dur": 1, "args": { "External id": 26889, "cbid": 251, "correlation": 26889 } }, { "ph": "f", "id": 26889, "pid": 76337, "tid": -914061504, "ts": 1716454217639487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217675409, "dur": 164, "args": { "External id": 26891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26891, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26891, "pid": 5, "tid": 7, "ts": 1716454217675409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639493, "dur": 14, "args": { "External id": 26891, "cbid": 211, "correlation": 26891 } }, { "ph": "s", "id": 26891, "pid": 76337, "tid": -914061504, "ts": 1716454217639493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217675574, "dur": 19, "args": { "External id": 26899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26899, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26899, "pid": 5, "tid": 7, "ts": 1716454217675574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639566, "dur": 13, "args": { "External id": 26899, "cbid": 211, "correlation": 26899 } }, { "ph": "s", "id": 26899, "pid": 76337, "tid": -914061504, "ts": 1716454217639566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217675595, "dur": 27, "args": { "External id": 26907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26907, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26907, "pid": 5, "tid": 7, "ts": 1716454217675595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639605, "dur": 9, "args": { "External id": 26907, "cbid": 211, "correlation": 26907 } }, { "ph": "s", "id": 26907, "pid": 76337, "tid": -914061504, "ts": 1716454217639605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217675624, "dur": 18, "args": { "External id": 26918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26918, "pid": 5, "tid": 7, "ts": 1716454217675624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639678, "dur": 13, "args": { "External id": 26918, "cbid": 211, "correlation": 26918 } }, { "ph": "s", "id": 26918, "pid": 76337, "tid": -914061504, "ts": 1716454217639678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217675643, "dur": 16, "args": { "External id": 26940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26940, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 26940, "pid": 5, "tid": 7, "ts": 1716454217675643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639709, "dur": 7, "args": { "External id": 26940, "cbid": 211, "correlation": 26940 } }, { "ph": "s", "id": 26940, "pid": 76337, "tid": -914061504, "ts": 1716454217639709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639807, "dur": 1, "args": { "External id": 26951, "cbid": 251, "correlation": 26951 } }, { "ph": "f", "id": 26951, "pid": 76337, "tid": -914061504, "ts": 1716454217639807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217675660, "dur": 88, "args": { "External id": 26952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26952, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 26952, "pid": 5, "tid": 7, "ts": 1716454217675660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639813, "dur": 14, "args": { "External id": 26952, "cbid": 211, "correlation": 26952 } }, { "ph": "s", "id": 26952, "pid": 76337, "tid": -914061504, "ts": 1716454217639813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639896, "dur": 1, "args": { "External id": 26963, "cbid": 251, "correlation": 26963 } }, { "ph": "f", "id": 26963, "pid": 76337, "tid": -914061504, "ts": 1716454217639896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639900, "dur": 0, "args": { "External id": 26964, "cbid": 251, "correlation": 26964 } }, { "ph": "f", "id": 26964, "pid": 76337, "tid": -914061504, "ts": 1716454217639900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217675750, "dur": 12, "args": { "External id": 26965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26965, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26965, "pid": 5, "tid": 7, "ts": 1716454217675750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639902, "dur": 13, "args": { "External id": 26965, "cbid": 211, "correlation": 26965 } }, { "ph": "s", "id": 26965, "pid": 76337, "tid": -914061504, "ts": 1716454217639902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217675763, "dur": 6, "args": { "External id": 26967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26967, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26967, "pid": 5, "tid": 7, "ts": 1716454217675763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217639917, "dur": 6, "args": { "External id": 26967, "cbid": 211, "correlation": 26967 } }, { "ph": "s", "id": 26967, "pid": 76337, "tid": -914061504, "ts": 1716454217639917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639996, "dur": 1, "args": { "External id": 26978, "cbid": 251, "correlation": 26978 } }, { "ph": "f", "id": 26978, "pid": 76337, "tid": -914061504, "ts": 1716454217639996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217639999, "dur": 0, "args": { "External id": 26979, "cbid": 251, "correlation": 26979 } }, { "ph": "f", "id": 26979, "pid": 76337, "tid": -914061504, "ts": 1716454217639999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217675770, "dur": 8, "args": { "External id": 26980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26980, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26980, "pid": 5, "tid": 7, "ts": 1716454217675770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640001, "dur": 13, "args": { "External id": 26980, "cbid": 211, "correlation": 26980 } }, { "ph": "s", "id": 26980, "pid": 76337, "tid": -914061504, "ts": 1716454217640001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217675780, "dur": 3, "args": { "External id": 26982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 26982, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 26982, "pid": 5, "tid": 7, "ts": 1716454217675780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640015, "dur": 5, "args": { "External id": 26982, "cbid": 211, "correlation": 26982 } }, { "ph": "s", "id": 26982, "pid": 76337, "tid": -914061504, "ts": 1716454217640015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217675784, "dur": 54, "args": { "External id": 27007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27007, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27007, "pid": 5, "tid": 7, "ts": 1716454217675784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640094, "dur": 12, "args": { "External id": 27007, "cbid": 211, "correlation": 27007 } }, { "ph": "s", "id": 27007, "pid": 76337, "tid": -914061504, "ts": 1716454217640094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217640204, "dur": 1, "args": { "External id": 27025, "cbid": 251, "correlation": 27025 } }, { "ph": "f", "id": 27025, "pid": 76337, "tid": -914061504, "ts": 1716454217640204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217675839, "dur": 89, "args": { "External id": 27027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27027, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27027, "pid": 5, "tid": 7, "ts": 1716454217675839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640210, "dur": 14, "args": { "External id": 27027, "cbid": 211, "correlation": 27027 } }, { "ph": "s", "id": 27027, "pid": 76337, "tid": -914061504, "ts": 1716454217640210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217675930, "dur": 10, "args": { "External id": 27035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27035, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27035, "pid": 5, "tid": 7, "ts": 1716454217675930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640282, "dur": 12, "args": { "External id": 27035, "cbid": 211, "correlation": 27035 } }, { "ph": "s", "id": 27035, "pid": 76337, "tid": -914061504, "ts": 1716454217640282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217675941, "dur": 21, "args": { "External id": 27043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27043, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27043, "pid": 5, "tid": 7, "ts": 1716454217675941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640323, "dur": 9, "args": { "External id": 27043, "cbid": 211, "correlation": 27043 } }, { "ph": "s", "id": 27043, "pid": 76337, "tid": -914061504, "ts": 1716454217640323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217675963, "dur": 17, "args": { "External id": 27065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27065, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27065, "pid": 5, "tid": 7, "ts": 1716454217675963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640375, "dur": 10, "args": { "External id": 27065, "cbid": 211, "correlation": 27065 } }, { "ph": "s", "id": 27065, "pid": 76337, "tid": -914061504, "ts": 1716454217640375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217640475, "dur": 1, "args": { "External id": 27081, "cbid": 251, "correlation": 27081 } }, { "ph": "f", "id": 27081, "pid": 76337, "tid": -914061504, "ts": 1716454217640475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217640480, "dur": 0, "args": { "External id": 27083, "cbid": 251, "correlation": 27083 } }, { "ph": "f", "id": 27083, "pid": 76337, "tid": -914061504, "ts": 1716454217640480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217675982, "dur": 496, "args": { "External id": 27084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27084, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27084, "pid": 5, "tid": 7, "ts": 1716454217675982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640482, "dur": 14, "args": { "External id": 27084, "cbid": 211, "correlation": 27084 } }, { "ph": "s", "id": 27084, "pid": 76337, "tid": -914061504, "ts": 1716454217640482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217676478, "dur": 64, "args": { "External id": 27092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27092, "pid": 5, "tid": 7, "ts": 1716454217676478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640550, "dur": 14, "args": { "External id": 27092, "cbid": 211, "correlation": 27092 } }, { "ph": "s", "id": 27092, "pid": 76337, "tid": -914061504, "ts": 1716454217640550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217676544, "dur": 66, "args": { "External id": 27100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27100, "pid": 5, "tid": 7, "ts": 1716454217676544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640582, "dur": 8, "args": { "External id": 27100, "cbid": 211, "correlation": 27100 } }, { "ph": "s", "id": 27100, "pid": 76337, "tid": -914061504, "ts": 1716454217640582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217640662, "dur": 1, "args": { "External id": 27116, "cbid": 251, "correlation": 27116 } }, { "ph": "f", "id": 27116, "pid": 76337, "tid": -914061504, "ts": 1716454217640662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217676612, "dur": 1, "args": { "External id": 27118, "device": 5, "context": 1, "stream": 7, "correlation": 27118, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 27118, "pid": 5, "tid": 7, "ts": 1716454217676612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217640668, "dur": 10, "args": { "External id": 27118, "cbid": 51, "correlation": 27118 } }, { "ph": "s", "id": 27118, "pid": 76337, "tid": -914061504, "ts": 1716454217640668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217676615, "dur": 268, "args": { "External id": 27119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27119, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27119, "pid": 5, "tid": 7, "ts": 1716454217676615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640679, "dur": 11, "args": { "External id": 27119, "cbid": 211, "correlation": 27119 } }, { "ph": "s", "id": 27119, "pid": 76337, "tid": -914061504, "ts": 1716454217640679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217676885, "dur": 14, "args": { "External id": 27127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27127, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27127, "pid": 5, "tid": 7, "ts": 1716454217676885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640721, "dur": 10, "args": { "External id": 27127, "cbid": 211, "correlation": 27127 } }, { "ph": "s", "id": 27127, "pid": 76337, "tid": -914061504, "ts": 1716454217640721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217676900, "dur": 37, "args": { "External id": 27138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27138, "pid": 5, "tid": 7, "ts": 1716454217676900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640790, "dur": 12, "args": { "External id": 27138, "cbid": 211, "correlation": 27138 } }, { "ph": "s", "id": 27138, "pid": 76337, "tid": -914061504, "ts": 1716454217640790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217640854, "dur": 0, "args": { "External id": 27150, "cbid": 317, "correlation": 27150 } }, { "ph": "f", "id": 27150, "pid": 76337, "tid": -914061504, "ts": 1716454217640854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217640855, "dur": 0, "args": { "External id": 27151, "cbid": 203, "correlation": 27151 } }, { "ph": "f", "id": 27151, "pid": 76337, "tid": -914061504, "ts": 1716454217640855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217640856, "dur": 0, "args": { "External id": 27152, "cbid": 205, "correlation": 27152 } }, { "ph": "f", "id": 27152, "pid": 76337, "tid": -914061504, "ts": 1716454217640856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217676938, "dur": 14, "args": { "External id": 27156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27156, "pid": 5, "tid": 7, "ts": 1716454217676938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640872, "dur": 12, "args": { "External id": 27156, "cbid": 211, "correlation": 27156 } }, { "ph": "s", "id": 27156, "pid": 76337, "tid": -914061504, "ts": 1716454217640872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217676954, "dur": 4, "args": { "External id": 27158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27158, "pid": 5, "tid": 7, "ts": 1716454217676954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640889, "dur": 6, "args": { "External id": 27158, "cbid": 211, "correlation": 27158 } }, { "ph": "s", "id": 27158, "pid": 76337, "tid": -914061504, "ts": 1716454217640889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217640897, "dur": 0, "args": { "External id": 27159, "cbid": 51, "correlation": 27159 } }, { "ph": "s", "id": 27159, "pid": 76337, "tid": -914061504, "ts": 1716454217640897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217676959, "dur": 95, "args": { "External id": 27160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27160, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27160, "pid": 5, "tid": 7, "ts": 1716454217676959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640898, "dur": 5, "args": { "External id": 27160, "cbid": 211, "correlation": 27160 } }, { "ph": "s", "id": 27160, "pid": 76337, "tid": -914061504, "ts": 1716454217640898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217677056, "dur": 16, "args": { "External id": 27165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27165, "pid": 5, "tid": 7, "ts": 1716454217677056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640926, "dur": 9, "args": { "External id": 27165, "cbid": 211, "correlation": 27165 } }, { "ph": "s", "id": 27165, "pid": 76337, "tid": -914061504, "ts": 1716454217640926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217677073, "dur": 11, "args": { "External id": 27173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27173, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27173, "pid": 5, "tid": 7, "ts": 1716454217677073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217640958, "dur": 8, "args": { "External id": 27173, "cbid": 211, "correlation": 27173 } }, { "ph": "s", "id": 27173, "pid": 76337, "tid": -914061504, "ts": 1716454217640958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217677085, "dur": 25, "args": { "External id": 27182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27182, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27182, "pid": 5, "tid": 7, "ts": 1716454217677085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641005, "dur": 11, "args": { "External id": 27182, "cbid": 211, "correlation": 27182 } }, { "ph": "s", "id": 27182, "pid": 76337, "tid": -914061504, "ts": 1716454217641005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217677112, "dur": 23, "args": { "External id": 27202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27202, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 27202, "pid": 5, "tid": 7, "ts": 1716454217677112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641090, "dur": 13, "args": { "External id": 27202, "cbid": 211, "correlation": 27202 } }, { "ph": "s", "id": 27202, "pid": 76337, "tid": -914061504, "ts": 1716454217641090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217677136, "dur": 5, "args": { "External id": 27214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27214, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27214, "pid": 5, "tid": 7, "ts": 1716454217677136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641113, "dur": 7, "args": { "External id": 27214, "cbid": 211, "correlation": 27214 } }, { "ph": "s", "id": 27214, "pid": 76337, "tid": -914061504, "ts": 1716454217641113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217677142, "dur": 24, "args": { "External id": 27217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27217, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27217, "pid": 5, "tid": 7, "ts": 1716454217677142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641133, "dur": 7, "args": { "External id": 27217, "cbid": 211, "correlation": 27217 } }, { "ph": "s", "id": 27217, "pid": 76337, "tid": -914061504, "ts": 1716454217641133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217677168, "dur": 18, "args": { "External id": 27226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27226, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27226, "pid": 5, "tid": 7, "ts": 1716454217677168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641172, "dur": 10, "args": { "External id": 27226, "cbid": 211, "correlation": 27226 } }, { "ph": "s", "id": 27226, "pid": 76337, "tid": -914061504, "ts": 1716454217641172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217641236, "dur": 0, "args": { "External id": 27236, "cbid": 317, "correlation": 27236 } }, { "ph": "f", "id": 27236, "pid": 76337, "tid": -914061504, "ts": 1716454217641236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217641237, "dur": 0, "args": { "External id": 27237, "cbid": 203, "correlation": 27237 } }, { "ph": "f", "id": 27237, "pid": 76337, "tid": -914061504, "ts": 1716454217641237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217641237, "dur": 0, "args": { "External id": 27238, "cbid": 205, "correlation": 27238 } }, { "ph": "f", "id": 27238, "pid": 76337, "tid": -914061504, "ts": 1716454217641237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217677187, "dur": 17, "args": { "External id": 27242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27242, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27242, "pid": 5, "tid": 7, "ts": 1716454217677187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641253, "dur": 12, "args": { "External id": 27242, "cbid": 211, "correlation": 27242 } }, { "ph": "s", "id": 27242, "pid": 76337, "tid": -914061504, "ts": 1716454217641253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217677206, "dur": 237, "args": { "External id": 27244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27244, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27244, "pid": 5, "tid": 7, "ts": 1716454217677206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641268, "dur": 5, "args": { "External id": 27244, "cbid": 211, "correlation": 27244 } }, { "ph": "s", "id": 27244, "pid": 76337, "tid": -914061504, "ts": 1716454217641268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217677445, "dur": 1, "args": { "External id": 27246, "device": 5, "context": 1, "stream": 7, "correlation": 27246, "bytes": 960, "memory bandwidth (GB/s)": 0.5 } }, { "ph": "f", "id": 27246, "pid": 5, "tid": 7, "ts": 1716454217677445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217641279, "dur": 8, "args": { "External id": 27246, "cbid": 51, "correlation": 27246 } }, { "ph": "s", "id": 27246, "pid": 76337, "tid": -914061504, "ts": 1716454217641279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217677449, "dur": 805, "args": { "External id": 27247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27247, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27247, "pid": 5, "tid": 7, "ts": 1716454217677449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641288, "dur": 6, "args": { "External id": 27247, "cbid": 211, "correlation": 27247 } }, { "ph": "s", "id": 27247, "pid": 76337, "tid": -914061504, "ts": 1716454217641288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217678255, "dur": 13, "args": { "External id": 27249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27249, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27249, "pid": 5, "tid": 7, "ts": 1716454217678255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641299, "dur": 5, "args": { "External id": 27249, "cbid": 211, "correlation": 27249 } }, { "ph": "s", "id": 27249, "pid": 76337, "tid": -914061504, "ts": 1716454217641299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217678270, "dur": 14, "args": { "External id": 27255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27255, "pid": 5, "tid": 7, "ts": 1716454217678270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641327, "dur": 9, "args": { "External id": 27255, "cbid": 211, "correlation": 27255 } }, { "ph": "s", "id": 27255, "pid": 76337, "tid": -914061504, "ts": 1716454217641327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217678285, "dur": 3, "args": { "External id": 27263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27263, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 27263, "pid": 5, "tid": 7, "ts": 1716454217678285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641372, "dur": 10, "args": { "External id": 27263, "cbid": 211, "correlation": 27263 } }, { "ph": "s", "id": 27263, "pid": 76337, "tid": -914061504, "ts": 1716454217641372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217641451, "dur": 1, "args": { "External id": 27279, "cbid": 251, "correlation": 27279 } }, { "ph": "f", "id": 27279, "pid": 76337, "tid": -914061504, "ts": 1716454217641451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217641455, "dur": 0, "args": { "External id": 27281, "cbid": 251, "correlation": 27281 } }, { "ph": "f", "id": 27281, "pid": 76337, "tid": -914061504, "ts": 1716454217641455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217678290, "dur": 13, "args": { "External id": 27282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27282, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27282, "pid": 5, "tid": 7, "ts": 1716454217678290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641457, "dur": 12, "args": { "External id": 27282, "cbid": 211, "correlation": 27282 } }, { "ph": "s", "id": 27282, "pid": 76337, "tid": -914061504, "ts": 1716454217641457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217678305, "dur": 5, "args": { "External id": 27284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27284, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27284, "pid": 5, "tid": 7, "ts": 1716454217678305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641471, "dur": 5, "args": { "External id": 27284, "cbid": 211, "correlation": 27284 } }, { "ph": "s", "id": 27284, "pid": 76337, "tid": -914061504, "ts": 1716454217641471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217678311, "dur": 17, "args": { "External id": 27294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27294, "pid": 5, "tid": 7, "ts": 1716454217678311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641531, "dur": 13, "args": { "External id": 27294, "cbid": 211, "correlation": 27294 } }, { "ph": "s", "id": 27294, "pid": 76337, "tid": -914061504, "ts": 1716454217641531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217678329, "dur": 18, "args": { "External id": 27314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27314, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 27314, "pid": 5, "tid": 7, "ts": 1716454217678329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641599, "dur": 11, "args": { "External id": 27314, "cbid": 211, "correlation": 27314 } }, { "ph": "s", "id": 27314, "pid": 76337, "tid": -914061504, "ts": 1716454217641599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217678349, "dur": 4, "args": { "External id": 27326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27326, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 27326, "pid": 5, "tid": 7, "ts": 1716454217678349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641619, "dur": 6, "args": { "External id": 27326, "cbid": 211, "correlation": 27326 } }, { "ph": "s", "id": 27326, "pid": 76337, "tid": -914061504, "ts": 1716454217641619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217678354, "dur": 16, "args": { "External id": 27329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27329, "pid": 5, "tid": 7, "ts": 1716454217678354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641637, "dur": 6, "args": { "External id": 27329, "cbid": 211, "correlation": 27329 } }, { "ph": "s", "id": 27329, "pid": 76337, "tid": -914061504, "ts": 1716454217641637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217678372, "dur": 11, "args": { "External id": 27338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27338, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27338, "pid": 5, "tid": 7, "ts": 1716454217678372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641678, "dur": 10, "args": { "External id": 27338, "cbid": 211, "correlation": 27338 } }, { "ph": "s", "id": 27338, "pid": 76337, "tid": -914061504, "ts": 1716454217641678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217641741, "dur": 0, "args": { "External id": 27348, "cbid": 317, "correlation": 27348 } }, { "ph": "f", "id": 27348, "pid": 76337, "tid": -914061504, "ts": 1716454217641741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217641742, "dur": 0, "args": { "External id": 27349, "cbid": 203, "correlation": 27349 } }, { "ph": "f", "id": 27349, "pid": 76337, "tid": -914061504, "ts": 1716454217641742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217641743, "dur": 0, "args": { "External id": 27350, "cbid": 205, "correlation": 27350 } }, { "ph": "f", "id": 27350, "pid": 76337, "tid": -914061504, "ts": 1716454217641743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217678384, "dur": 11, "args": { "External id": 27354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27354, "pid": 5, "tid": 7, "ts": 1716454217678384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641757, "dur": 13, "args": { "External id": 27354, "cbid": 211, "correlation": 27354 } }, { "ph": "s", "id": 27354, "pid": 76337, "tid": -914061504, "ts": 1716454217641757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217678396, "dur": 160, "args": { "External id": 27356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27356, "pid": 5, "tid": 7, "ts": 1716454217678396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641772, "dur": 5, "args": { "External id": 27356, "cbid": 211, "correlation": 27356 } }, { "ph": "s", "id": 27356, "pid": 76337, "tid": -914061504, "ts": 1716454217641772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217678559, "dur": 1, "args": { "External id": 27358, "device": 5, "context": 1, "stream": 7, "correlation": 27358, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 27358, "pid": 5, "tid": 7, "ts": 1716454217678559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217641783, "dur": 6, "args": { "External id": 27358, "cbid": 51, "correlation": 27358 } }, { "ph": "s", "id": 27358, "pid": 76337, "tid": -914061504, "ts": 1716454217641783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217678562, "dur": 640, "args": { "External id": 27359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27359, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27359, "pid": 5, "tid": 7, "ts": 1716454217678562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641790, "dur": 6, "args": { "External id": 27359, "cbid": 211, "correlation": 27359 } }, { "ph": "s", "id": 27359, "pid": 76337, "tid": -914061504, "ts": 1716454217641790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217679203, "dur": 13, "args": { "External id": 27361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27361, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27361, "pid": 5, "tid": 7, "ts": 1716454217679203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641800, "dur": 5, "args": { "External id": 27361, "cbid": 211, "correlation": 27361 } }, { "ph": "s", "id": 27361, "pid": 76337, "tid": -914061504, "ts": 1716454217641800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217679217, "dur": 15, "args": { "External id": 27367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27367, "pid": 5, "tid": 7, "ts": 1716454217679217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641829, "dur": 9, "args": { "External id": 27367, "cbid": 211, "correlation": 27367 } }, { "ph": "s", "id": 27367, "pid": 76337, "tid": -914061504, "ts": 1716454217641829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217641887, "dur": 0, "args": { "External id": 27377, "cbid": 317, "correlation": 27377 } }, { "ph": "f", "id": 27377, "pid": 76337, "tid": -914061504, "ts": 1716454217641887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217641887, "dur": 0, "args": { "External id": 27378, "cbid": 203, "correlation": 27378 } }, { "ph": "f", "id": 27378, "pid": 76337, "tid": -914061504, "ts": 1716454217641887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217641888, "dur": 0, "args": { "External id": 27379, "cbid": 205, "correlation": 27379 } }, { "ph": "f", "id": 27379, "pid": 76337, "tid": -914061504, "ts": 1716454217641888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217679234, "dur": 17, "args": { "External id": 27383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27383, "pid": 5, "tid": 7, "ts": 1716454217679234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641902, "dur": 12, "args": { "External id": 27383, "cbid": 211, "correlation": 27383 } }, { "ph": "s", "id": 27383, "pid": 76337, "tid": -914061504, "ts": 1716454217641902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217679252, "dur": 4, "args": { "External id": 27385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27385, "pid": 5, "tid": 7, "ts": 1716454217679252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641919, "dur": 7, "args": { "External id": 27385, "cbid": 211, "correlation": 27385 } }, { "ph": "s", "id": 27385, "pid": 76337, "tid": -914061504, "ts": 1716454217641919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217641928, "dur": 0, "args": { "External id": 27386, "cbid": 51, "correlation": 27386 } }, { "ph": "s", "id": 27386, "pid": 76337, "tid": -914061504, "ts": 1716454217641928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217679257, "dur": 129, "args": { "External id": 27387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27387, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27387, "pid": 5, "tid": 7, "ts": 1716454217679257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641929, "dur": 5, "args": { "External id": 27387, "cbid": 211, "correlation": 27387 } }, { "ph": "s", "id": 27387, "pid": 76337, "tid": -914061504, "ts": 1716454217641929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217679387, "dur": 15, "args": { "External id": 27392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27392, "pid": 5, "tid": 7, "ts": 1716454217679387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641955, "dur": 8, "args": { "External id": 27392, "cbid": 211, "correlation": 27392 } }, { "ph": "s", "id": 27392, "pid": 76337, "tid": -914061504, "ts": 1716454217641955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217679403, "dur": 13, "args": { "External id": 27400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27400, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27400, "pid": 5, "tid": 7, "ts": 1716454217679403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217641992, "dur": 8, "args": { "External id": 27400, "cbid": 211, "correlation": 27400 } }, { "ph": "s", "id": 27400, "pid": 76337, "tid": -914061504, "ts": 1716454217641992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217679417, "dur": 10, "args": { "External id": 27408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27408, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27408, "pid": 5, "tid": 7, "ts": 1716454217679417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642021, "dur": 8, "args": { "External id": 27408, "cbid": 211, "correlation": 27408 } }, { "ph": "s", "id": 27408, "pid": 76337, "tid": -914061504, "ts": 1716454217642021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217679429, "dur": 18, "args": { "External id": 27428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27428, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 27428, "pid": 5, "tid": 7, "ts": 1716454217679429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642104, "dur": 13, "args": { "External id": 27428, "cbid": 211, "correlation": 27428 } }, { "ph": "s", "id": 27428, "pid": 76337, "tid": -914061504, "ts": 1716454217642104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217679448, "dur": 4, "args": { "External id": 27440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27440, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 27440, "pid": 5, "tid": 7, "ts": 1716454217679448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642127, "dur": 6, "args": { "External id": 27440, "cbid": 211, "correlation": 27440 } }, { "ph": "s", "id": 27440, "pid": 76337, "tid": -914061504, "ts": 1716454217642127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217679454, "dur": 16, "args": { "External id": 27443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27443, "pid": 5, "tid": 7, "ts": 1716454217679454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642146, "dur": 6, "args": { "External id": 27443, "cbid": 211, "correlation": 27443 } }, { "ph": "s", "id": 27443, "pid": 76337, "tid": -914061504, "ts": 1716454217642146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217642203, "dur": 0, "args": { "External id": 27454, "cbid": 317, "correlation": 27454 } }, { "ph": "f", "id": 27454, "pid": 76337, "tid": -914061504, "ts": 1716454217642203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217642204, "dur": 0, "args": { "External id": 27455, "cbid": 203, "correlation": 27455 } }, { "ph": "f", "id": 27455, "pid": 76337, "tid": -914061504, "ts": 1716454217642204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217642204, "dur": 0, "args": { "External id": 27456, "cbid": 205, "correlation": 27456 } }, { "ph": "f", "id": 27456, "pid": 76337, "tid": -914061504, "ts": 1716454217642204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217679471, "dur": 11, "args": { "External id": 27460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27460, "pid": 5, "tid": 7, "ts": 1716454217679471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642217, "dur": 12, "args": { "External id": 27460, "cbid": 211, "correlation": 27460 } }, { "ph": "s", "id": 27460, "pid": 76337, "tid": -914061504, "ts": 1716454217642217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217679484, "dur": 3, "args": { "External id": 27462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27462, "pid": 5, "tid": 7, "ts": 1716454217679484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642234, "dur": 6, "args": { "External id": 27462, "cbid": 211, "correlation": 27462 } }, { "ph": "s", "id": 27462, "pid": 76337, "tid": -914061504, "ts": 1716454217642234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217642243, "dur": 0, "args": { "External id": 27463, "cbid": 51, "correlation": 27463 } }, { "ph": "s", "id": 27463, "pid": 76337, "tid": -914061504, "ts": 1716454217642243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217679489, "dur": 89, "args": { "External id": 27464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27464, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27464, "pid": 5, "tid": 7, "ts": 1716454217679489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642243, "dur": 5, "args": { "External id": 27464, "cbid": 211, "correlation": 27464 } }, { "ph": "s", "id": 27464, "pid": 76337, "tid": -914061504, "ts": 1716454217642243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217679579, "dur": 15, "args": { "External id": 27469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27469, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27469, "pid": 5, "tid": 7, "ts": 1716454217679579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642270, "dur": 9, "args": { "External id": 27469, "cbid": 211, "correlation": 27469 } }, { "ph": "s", "id": 27469, "pid": 76337, "tid": -914061504, "ts": 1716454217642270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217679595, "dur": 82, "args": { "External id": 27478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27478, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27478, "pid": 5, "tid": 7, "ts": 1716454217679595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642353, "dur": 14, "args": { "External id": 27478, "cbid": 211, "correlation": 27478 } }, { "ph": "s", "id": 27478, "pid": 76337, "tid": -914061504, "ts": 1716454217642353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217679679, "dur": 28, "args": { "External id": 27500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27500, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27500, "pid": 5, "tid": 7, "ts": 1716454217679679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642409, "dur": 10, "args": { "External id": 27500, "cbid": 211, "correlation": 27500 } }, { "ph": "s", "id": 27500, "pid": 76337, "tid": -914061504, "ts": 1716454217642409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217642525, "dur": 1, "args": { "External id": 27511, "cbid": 251, "correlation": 27511 } }, { "ph": "f", "id": 27511, "pid": 76337, "tid": -914061504, "ts": 1716454217642525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217679708, "dur": 162, "args": { "External id": 27512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27512, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27512, "pid": 5, "tid": 7, "ts": 1716454217679708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642531, "dur": 15, "args": { "External id": 27512, "cbid": 211, "correlation": 27512 } }, { "ph": "s", "id": 27512, "pid": 76337, "tid": -914061504, "ts": 1716454217642531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217642614, "dur": 1, "args": { "External id": 27523, "cbid": 251, "correlation": 27523 } }, { "ph": "f", "id": 27523, "pid": 76337, "tid": -914061504, "ts": 1716454217642614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217679871, "dur": 155, "args": { "External id": 27524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27524, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27524, "pid": 5, "tid": 7, "ts": 1716454217679871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642618, "dur": 12, "args": { "External id": 27524, "cbid": 211, "correlation": 27524 } }, { "ph": "s", "id": 27524, "pid": 76337, "tid": -914061504, "ts": 1716454217642618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217642696, "dur": 1, "args": { "External id": 27535, "cbid": 251, "correlation": 27535 } }, { "ph": "f", "id": 27535, "pid": 76337, "tid": -914061504, "ts": 1716454217642696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217680028, "dur": 154, "args": { "External id": 27536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27536, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27536, "pid": 5, "tid": 7, "ts": 1716454217680028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642700, "dur": 13, "args": { "External id": 27536, "cbid": 211, "correlation": 27536 } }, { "ph": "s", "id": 27536, "pid": 76337, "tid": -914061504, "ts": 1716454217642700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217680183, "dur": 330, "args": { "External id": 27561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27561, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27561, "pid": 5, "tid": 7, "ts": 1716454217680183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642786, "dur": 13, "args": { "External id": 27561, "cbid": 211, "correlation": 27561 } }, { "ph": "s", "id": 27561, "pid": 76337, "tid": -914061504, "ts": 1716454217642786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217642900, "dur": 1, "args": { "External id": 27579, "cbid": 251, "correlation": 27579 } }, { "ph": "f", "id": 27579, "pid": 76337, "tid": -914061504, "ts": 1716454217642900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217680515, "dur": 164, "args": { "External id": 27581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27581, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27581, "pid": 5, "tid": 7, "ts": 1716454217680515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642906, "dur": 14, "args": { "External id": 27581, "cbid": 211, "correlation": 27581 } }, { "ph": "s", "id": 27581, "pid": 76337, "tid": -914061504, "ts": 1716454217642906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217680680, "dur": 19, "args": { "External id": 27589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27589, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27589, "pid": 5, "tid": 7, "ts": 1716454217680680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217642987, "dur": 13, "args": { "External id": 27589, "cbid": 211, "correlation": 27589 } }, { "ph": "s", "id": 27589, "pid": 76337, "tid": -914061504, "ts": 1716454217642987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217680700, "dur": 27, "args": { "External id": 27597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27597, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27597, "pid": 5, "tid": 7, "ts": 1716454217680700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643027, "dur": 9, "args": { "External id": 27597, "cbid": 211, "correlation": 27597 } }, { "ph": "s", "id": 27597, "pid": 76337, "tid": -914061504, "ts": 1716454217643027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217680729, "dur": 18, "args": { "External id": 27608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27608, "pid": 5, "tid": 7, "ts": 1716454217680729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643101, "dur": 13, "args": { "External id": 27608, "cbid": 211, "correlation": 27608 } }, { "ph": "s", "id": 27608, "pid": 76337, "tid": -914061504, "ts": 1716454217643101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217680748, "dur": 16, "args": { "External id": 27630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27630, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27630, "pid": 5, "tid": 7, "ts": 1716454217680748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643132, "dur": 8, "args": { "External id": 27630, "cbid": 211, "correlation": 27630 } }, { "ph": "s", "id": 27630, "pid": 76337, "tid": -914061504, "ts": 1716454217643132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643227, "dur": 1, "args": { "External id": 27641, "cbid": 251, "correlation": 27641 } }, { "ph": "f", "id": 27641, "pid": 76337, "tid": -914061504, "ts": 1716454217643227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217680765, "dur": 88, "args": { "External id": 27642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27642, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27642, "pid": 5, "tid": 7, "ts": 1716454217680765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643232, "dur": 14, "args": { "External id": 27642, "cbid": 211, "correlation": 27642 } }, { "ph": "s", "id": 27642, "pid": 76337, "tid": -914061504, "ts": 1716454217643232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643303, "dur": 1, "args": { "External id": 27653, "cbid": 251, "correlation": 27653 } }, { "ph": "f", "id": 27653, "pid": 76337, "tid": -914061504, "ts": 1716454217643303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643307, "dur": 0, "args": { "External id": 27654, "cbid": 251, "correlation": 27654 } }, { "ph": "f", "id": 27654, "pid": 76337, "tid": -914061504, "ts": 1716454217643307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217680854, "dur": 11, "args": { "External id": 27655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27655, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27655, "pid": 5, "tid": 7, "ts": 1716454217680854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643309, "dur": 12, "args": { "External id": 27655, "cbid": 211, "correlation": 27655 } }, { "ph": "s", "id": 27655, "pid": 76337, "tid": -914061504, "ts": 1716454217643309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217680867, "dur": 5, "args": { "External id": 27657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27657, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27657, "pid": 5, "tid": 7, "ts": 1716454217680867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643323, "dur": 6, "args": { "External id": 27657, "cbid": 211, "correlation": 27657 } }, { "ph": "s", "id": 27657, "pid": 76337, "tid": -914061504, "ts": 1716454217643323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643396, "dur": 1, "args": { "External id": 27668, "cbid": 251, "correlation": 27668 } }, { "ph": "f", "id": 27668, "pid": 76337, "tid": -914061504, "ts": 1716454217643396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643401, "dur": 0, "args": { "External id": 27669, "cbid": 251, "correlation": 27669 } }, { "ph": "f", "id": 27669, "pid": 76337, "tid": -914061504, "ts": 1716454217643401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217680873, "dur": 9, "args": { "External id": 27670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27670, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27670, "pid": 5, "tid": 7, "ts": 1716454217680873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643402, "dur": 12, "args": { "External id": 27670, "cbid": 211, "correlation": 27670 } }, { "ph": "s", "id": 27670, "pid": 76337, "tid": -914061504, "ts": 1716454217643402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217680883, "dur": 3, "args": { "External id": 27672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27672, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27672, "pid": 5, "tid": 7, "ts": 1716454217680883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643416, "dur": 5, "args": { "External id": 27672, "cbid": 211, "correlation": 27672 } }, { "ph": "s", "id": 27672, "pid": 76337, "tid": -914061504, "ts": 1716454217643416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217680888, "dur": 55, "args": { "External id": 27697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27697, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27697, "pid": 5, "tid": 7, "ts": 1716454217680888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643493, "dur": 13, "args": { "External id": 27697, "cbid": 211, "correlation": 27697 } }, { "ph": "s", "id": 27697, "pid": 76337, "tid": -914061504, "ts": 1716454217643493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643604, "dur": 1, "args": { "External id": 27715, "cbid": 251, "correlation": 27715 } }, { "ph": "f", "id": 27715, "pid": 76337, "tid": -914061504, "ts": 1716454217643604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217680944, "dur": 90, "args": { "External id": 27717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27717, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27717, "pid": 5, "tid": 7, "ts": 1716454217680944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643610, "dur": 14, "args": { "External id": 27717, "cbid": 211, "correlation": 27717 } }, { "ph": "s", "id": 27717, "pid": 76337, "tid": -914061504, "ts": 1716454217643610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217681035, "dur": 10, "args": { "External id": 27725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27725, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27725, "pid": 5, "tid": 7, "ts": 1716454217681035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643683, "dur": 12, "args": { "External id": 27725, "cbid": 211, "correlation": 27725 } }, { "ph": "s", "id": 27725, "pid": 76337, "tid": -914061504, "ts": 1716454217643683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217681046, "dur": 21, "args": { "External id": 27733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27733, "pid": 5, "tid": 7, "ts": 1716454217681046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643724, "dur": 10, "args": { "External id": 27733, "cbid": 211, "correlation": 27733 } }, { "ph": "s", "id": 27733, "pid": 76337, "tid": -914061504, "ts": 1716454217643724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217681068, "dur": 17, "args": { "External id": 27755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27755, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27755, "pid": 5, "tid": 7, "ts": 1716454217681068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643776, "dur": 10, "args": { "External id": 27755, "cbid": 211, "correlation": 27755 } }, { "ph": "s", "id": 27755, "pid": 76337, "tid": -914061504, "ts": 1716454217643776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643877, "dur": 1, "args": { "External id": 27771, "cbid": 251, "correlation": 27771 } }, { "ph": "f", "id": 27771, "pid": 76337, "tid": -914061504, "ts": 1716454217643877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217643882, "dur": 0, "args": { "External id": 27773, "cbid": 251, "correlation": 27773 } }, { "ph": "f", "id": 27773, "pid": 76337, "tid": -914061504, "ts": 1716454217643882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217681087, "dur": 495, "args": { "External id": 27774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27774, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27774, "pid": 5, "tid": 7, "ts": 1716454217681087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643884, "dur": 14, "args": { "External id": 27774, "cbid": 211, "correlation": 27774 } }, { "ph": "s", "id": 27774, "pid": 76337, "tid": -914061504, "ts": 1716454217643884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217681583, "dur": 64, "args": { "External id": 27782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27782, "pid": 5, "tid": 7, "ts": 1716454217681583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643952, "dur": 12, "args": { "External id": 27782, "cbid": 211, "correlation": 27782 } }, { "ph": "s", "id": 27782, "pid": 76337, "tid": -914061504, "ts": 1716454217643952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217681649, "dur": 65, "args": { "External id": 27790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27790, "pid": 5, "tid": 7, "ts": 1716454217681649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217643990, "dur": 9, "args": { "External id": 27790, "cbid": 211, "correlation": 27790 } }, { "ph": "s", "id": 27790, "pid": 76337, "tid": -914061504, "ts": 1716454217643990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217644072, "dur": 1, "args": { "External id": 27806, "cbid": 251, "correlation": 27806 } }, { "ph": "f", "id": 27806, "pid": 76337, "tid": -914061504, "ts": 1716454217644072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454217681716, "dur": 1, "args": { "External id": 27808, "device": 5, "context": 1, "stream": 7, "correlation": 27808, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 27808, "pid": 5, "tid": 7, "ts": 1716454217681716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217644077, "dur": 10, "args": { "External id": 27808, "cbid": 51, "correlation": 27808 } }, { "ph": "s", "id": 27808, "pid": 76337, "tid": -914061504, "ts": 1716454217644077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217681720, "dur": 267, "args": { "External id": 27809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27809, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27809, "pid": 5, "tid": 7, "ts": 1716454217681720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644088, "dur": 11, "args": { "External id": 27809, "cbid": 211, "correlation": 27809 } }, { "ph": "s", "id": 27809, "pid": 76337, "tid": -914061504, "ts": 1716454217644088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217681988, "dur": 14, "args": { "External id": 27817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27817, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27817, "pid": 5, "tid": 7, "ts": 1716454217681988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644130, "dur": 11, "args": { "External id": 27817, "cbid": 211, "correlation": 27817 } }, { "ph": "s", "id": 27817, "pid": 76337, "tid": -914061504, "ts": 1716454217644130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217682003, "dur": 37, "args": { "External id": 27828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27828, "pid": 5, "tid": 7, "ts": 1716454217682003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644199, "dur": 12, "args": { "External id": 27828, "cbid": 211, "correlation": 27828 } }, { "ph": "s", "id": 27828, "pid": 76337, "tid": -914061504, "ts": 1716454217644199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217644263, "dur": 0, "args": { "External id": 27840, "cbid": 317, "correlation": 27840 } }, { "ph": "f", "id": 27840, "pid": 76337, "tid": -914061504, "ts": 1716454217644263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217644264, "dur": 0, "args": { "External id": 27841, "cbid": 203, "correlation": 27841 } }, { "ph": "f", "id": 27841, "pid": 76337, "tid": -914061504, "ts": 1716454217644264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217644265, "dur": 0, "args": { "External id": 27842, "cbid": 205, "correlation": 27842 } }, { "ph": "f", "id": 27842, "pid": 76337, "tid": -914061504, "ts": 1716454217644265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217682041, "dur": 13, "args": { "External id": 27846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27846, "pid": 5, "tid": 7, "ts": 1716454217682041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644280, "dur": 13, "args": { "External id": 27846, "cbid": 211, "correlation": 27846 } }, { "ph": "s", "id": 27846, "pid": 76337, "tid": -914061504, "ts": 1716454217644280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217682055, "dur": 4, "args": { "External id": 27848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 27848, "pid": 5, "tid": 7, "ts": 1716454217682055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644298, "dur": 6, "args": { "External id": 27848, "cbid": 211, "correlation": 27848 } }, { "ph": "s", "id": 27848, "pid": 76337, "tid": -914061504, "ts": 1716454217644298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217644306, "dur": 0, "args": { "External id": 27849, "cbid": 51, "correlation": 27849 } }, { "ph": "s", "id": 27849, "pid": 76337, "tid": -914061504, "ts": 1716454217644306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217682060, "dur": 96, "args": { "External id": 27850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27850, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27850, "pid": 5, "tid": 7, "ts": 1716454217682060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644307, "dur": 5, "args": { "External id": 27850, "cbid": 211, "correlation": 27850 } }, { "ph": "s", "id": 27850, "pid": 76337, "tid": -914061504, "ts": 1716454217644307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217682158, "dur": 16, "args": { "External id": 27855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27855, "pid": 5, "tid": 7, "ts": 1716454217682158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644333, "dur": 10, "args": { "External id": 27855, "cbid": 211, "correlation": 27855 } }, { "ph": "s", "id": 27855, "pid": 76337, "tid": -914061504, "ts": 1716454217644333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217682175, "dur": 11, "args": { "External id": 27863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27863, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27863, "pid": 5, "tid": 7, "ts": 1716454217682175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644365, "dur": 8, "args": { "External id": 27863, "cbid": 211, "correlation": 27863 } }, { "ph": "s", "id": 27863, "pid": 76337, "tid": -914061504, "ts": 1716454217644365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454217682188, "dur": 55, "args": { "External id": 27874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27874, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27874, "pid": 5, "tid": 7, "ts": 1716454217682188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644430, "dur": 12, "args": { "External id": 27874, "cbid": 211, "correlation": 27874 } }, { "ph": "s", "id": 27874, "pid": 76337, "tid": -914061504, "ts": 1716454217644430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217644485, "dur": 0, "args": { "External id": 27884, "cbid": 317, "correlation": 27884 } }, { "ph": "f", "id": 27884, "pid": 76337, "tid": -914061504, "ts": 1716454217644485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217644486, "dur": 0, "args": { "External id": 27885, "cbid": 203, "correlation": 27885 } }, { "ph": "f", "id": 27885, "pid": 76337, "tid": -914061504, "ts": 1716454217644486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217644487, "dur": 0, "args": { "External id": 27886, "cbid": 205, "correlation": 27886 } }, { "ph": "f", "id": 27886, "pid": 76337, "tid": -914061504, "ts": 1716454217644487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217682244, "dur": 38, "args": { "External id": 27890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27890, "pid": 5, "tid": 7, "ts": 1716454217682244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644504, "dur": 11, "args": { "External id": 27890, "cbid": 211, "correlation": 27890 } }, { "ph": "s", "id": 27890, "pid": 76337, "tid": -914061504, "ts": 1716454217644504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217682284, "dur": 160, "args": { "External id": 27892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27892, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27892, "pid": 5, "tid": 7, "ts": 1716454217682284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644518, "dur": 5, "args": { "External id": 27892, "cbid": 211, "correlation": 27892 } }, { "ph": "s", "id": 27892, "pid": 76337, "tid": -914061504, "ts": 1716454217644518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217682446, "dur": 1954, "args": { "External id": 27894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27894, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27894, "pid": 5, "tid": 7, "ts": 1716454217682446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644531, "dur": 8, "args": { "External id": 27894, "cbid": 211, "correlation": 27894 } }, { "ph": "s", "id": 27894, "pid": 76337, "tid": -914061504, "ts": 1716454217644531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217684401, "dur": 40, "args": { "External id": 27896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27896, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27896, "pid": 5, "tid": 7, "ts": 1716454217684401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644543, "dur": 6, "args": { "External id": 27896, "cbid": 211, "correlation": 27896 } }, { "ph": "s", "id": 27896, "pid": 76337, "tid": -914061504, "ts": 1716454217644543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217684442, "dur": 58, "args": { "External id": 27902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27902, "pid": 5, "tid": 7, "ts": 1716454217684442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644571, "dur": 8, "args": { "External id": 27902, "cbid": 211, "correlation": 27902 } }, { "ph": "s", "id": 27902, "pid": 76337, "tid": -914061504, "ts": 1716454217644571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217684502, "dur": 82, "args": { "External id": 27911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27911, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27911, "pid": 5, "tid": 7, "ts": 1716454217684502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644662, "dur": 13, "args": { "External id": 27911, "cbid": 211, "correlation": 27911 } }, { "ph": "s", "id": 27911, "pid": 76337, "tid": -914061504, "ts": 1716454217644662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217684585, "dur": 74, "args": { "External id": 27931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27931, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 27931, "pid": 5, "tid": 7, "ts": 1716454217684585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644734, "dur": 11, "args": { "External id": 27931, "cbid": 211, "correlation": 27931 } }, { "ph": "s", "id": 27931, "pid": 76337, "tid": -914061504, "ts": 1716454217644734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217684660, "dur": 5, "args": { "External id": 27943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27943, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 27943, "pid": 5, "tid": 7, "ts": 1716454217684660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644754, "dur": 7, "args": { "External id": 27943, "cbid": 211, "correlation": 27943 } }, { "ph": "s", "id": 27943, "pid": 76337, "tid": -914061504, "ts": 1716454217644754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217684666, "dur": 81, "args": { "External id": 27946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27946, "pid": 5, "tid": 7, "ts": 1716454217684666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644774, "dur": 7, "args": { "External id": 27946, "cbid": 211, "correlation": 27946 } }, { "ph": "s", "id": 27946, "pid": 76337, "tid": -914061504, "ts": 1716454217644774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217684748, "dur": 53, "args": { "External id": 27955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27955, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27955, "pid": 5, "tid": 7, "ts": 1716454217684748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644814, "dur": 10, "args": { "External id": 27955, "cbid": 211, "correlation": 27955 } }, { "ph": "s", "id": 27955, "pid": 76337, "tid": -914061504, "ts": 1716454217644814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217644878, "dur": 0, "args": { "External id": 27965, "cbid": 317, "correlation": 27965 } }, { "ph": "f", "id": 27965, "pid": 76337, "tid": -914061504, "ts": 1716454217644878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217644879, "dur": 0, "args": { "External id": 27966, "cbid": 203, "correlation": 27966 } }, { "ph": "f", "id": 27966, "pid": 76337, "tid": -914061504, "ts": 1716454217644879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217644880, "dur": 0, "args": { "External id": 27967, "cbid": 205, "correlation": 27967 } }, { "ph": "f", "id": 27967, "pid": 76337, "tid": -914061504, "ts": 1716454217644880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217684802, "dur": 58, "args": { "External id": 27971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27971, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27971, "pid": 5, "tid": 7, "ts": 1716454217684802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644896, "dur": 12, "args": { "External id": 27971, "cbid": 211, "correlation": 27971 } }, { "ph": "s", "id": 27971, "pid": 76337, "tid": -914061504, "ts": 1716454217644896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217684861, "dur": 121, "args": { "External id": 27973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27973, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27973, "pid": 5, "tid": 7, "ts": 1716454217684861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644911, "dur": 5, "args": { "External id": 27973, "cbid": 211, "correlation": 27973 } }, { "ph": "s", "id": 27973, "pid": 76337, "tid": -914061504, "ts": 1716454217644911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217684983, "dur": 1870, "args": { "External id": 27975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27975, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 27975, "pid": 5, "tid": 7, "ts": 1716454217684983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644923, "dur": 6, "args": { "External id": 27975, "cbid": 211, "correlation": 27975 } }, { "ph": "s", "id": 27975, "pid": 76337, "tid": -914061504, "ts": 1716454217644923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217686855, "dur": 21, "args": { "External id": 27977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27977, "pid": 5, "tid": 7, "ts": 1716454217686855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644932, "dur": 5, "args": { "External id": 27977, "cbid": 211, "correlation": 27977 } }, { "ph": "s", "id": 27977, "pid": 76337, "tid": -914061504, "ts": 1716454217644932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217686877, "dur": 32, "args": { "External id": 27983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 27983, "pid": 5, "tid": 7, "ts": 1716454217686877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217644961, "dur": 9, "args": { "External id": 27983, "cbid": 211, "correlation": 27983 } }, { "ph": "s", "id": 27983, "pid": 76337, "tid": -914061504, "ts": 1716454217644961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217686911, "dur": 3, "args": { "External id": 27991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 27991, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 27991, "pid": 5, "tid": 7, "ts": 1716454217686911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645014, "dur": 11, "args": { "External id": 27991, "cbid": 211, "correlation": 27991 } }, { "ph": "s", "id": 27991, "pid": 76337, "tid": -914061504, "ts": 1716454217645014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217645081, "dur": 1, "args": { "External id": 28007, "cbid": 251, "correlation": 28007 } }, { "ph": "f", "id": 28007, "pid": 76337, "tid": -914061504, "ts": 1716454217645081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217645086, "dur": 0, "args": { "External id": 28009, "cbid": 251, "correlation": 28009 } }, { "ph": "f", "id": 28009, "pid": 76337, "tid": -914061504, "ts": 1716454217645086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217686916, "dur": 12, "args": { "External id": 28010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28010, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 28010, "pid": 5, "tid": 7, "ts": 1716454217686916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645088, "dur": 11, "args": { "External id": 28010, "cbid": 211, "correlation": 28010 } }, { "ph": "s", "id": 28010, "pid": 76337, "tid": -914061504, "ts": 1716454217645088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217686929, "dur": 5, "args": { "External id": 28012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28012, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 28012, "pid": 5, "tid": 7, "ts": 1716454217686929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645101, "dur": 6, "args": { "External id": 28012, "cbid": 211, "correlation": 28012 } }, { "ph": "s", "id": 28012, "pid": 76337, "tid": -914061504, "ts": 1716454217645101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217686936, "dur": 29, "args": { "External id": 28022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28022, "pid": 5, "tid": 7, "ts": 1716454217686936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645160, "dur": 12, "args": { "External id": 28022, "cbid": 211, "correlation": 28022 } }, { "ph": "s", "id": 28022, "pid": 76337, "tid": -914061504, "ts": 1716454217645160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217686966, "dur": 31, "args": { "External id": 28042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28042, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 28042, "pid": 5, "tid": 7, "ts": 1716454217686966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645225, "dur": 11, "args": { "External id": 28042, "cbid": 211, "correlation": 28042 } }, { "ph": "s", "id": 28042, "pid": 76337, "tid": -914061504, "ts": 1716454217645225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217686998, "dur": 4, "args": { "External id": 28054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28054, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 28054, "pid": 5, "tid": 7, "ts": 1716454217686998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645247, "dur": 6, "args": { "External id": 28054, "cbid": 211, "correlation": 28054 } }, { "ph": "s", "id": 28054, "pid": 76337, "tid": -914061504, "ts": 1716454217645247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217687004, "dur": 30, "args": { "External id": 28057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28057, "pid": 5, "tid": 7, "ts": 1716454217687004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645264, "dur": 6, "args": { "External id": 28057, "cbid": 211, "correlation": 28057 } }, { "ph": "s", "id": 28057, "pid": 76337, "tid": -914061504, "ts": 1716454217645264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217687035, "dur": 20, "args": { "External id": 28066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28066, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28066, "pid": 5, "tid": 7, "ts": 1716454217687035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645305, "dur": 10, "args": { "External id": 28066, "cbid": 211, "correlation": 28066 } }, { "ph": "s", "id": 28066, "pid": 76337, "tid": -914061504, "ts": 1716454217645305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217645382, "dur": 0, "args": { "External id": 28076, "cbid": 317, "correlation": 28076 } }, { "ph": "f", "id": 28076, "pid": 76337, "tid": -914061504, "ts": 1716454217645382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217645382, "dur": 0, "args": { "External id": 28077, "cbid": 203, "correlation": 28077 } }, { "ph": "f", "id": 28077, "pid": 76337, "tid": -914061504, "ts": 1716454217645382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217645383, "dur": 0, "args": { "External id": 28078, "cbid": 205, "correlation": 28078 } }, { "ph": "f", "id": 28078, "pid": 76337, "tid": -914061504, "ts": 1716454217645383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217687057, "dur": 22, "args": { "External id": 28082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28082, "pid": 5, "tid": 7, "ts": 1716454217687057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645399, "dur": 13, "args": { "External id": 28082, "cbid": 211, "correlation": 28082 } }, { "ph": "s", "id": 28082, "pid": 76337, "tid": -914061504, "ts": 1716454217645399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217687080, "dur": 44, "args": { "External id": 28084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28084, "pid": 5, "tid": 7, "ts": 1716454217687080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645415, "dur": 5, "args": { "External id": 28084, "cbid": 211, "correlation": 28084 } }, { "ph": "s", "id": 28084, "pid": 76337, "tid": -914061504, "ts": 1716454217645415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217687125, "dur": 641, "args": { "External id": 28086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28086, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28086, "pid": 5, "tid": 7, "ts": 1716454217687125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645427, "dur": 6, "args": { "External id": 28086, "cbid": 211, "correlation": 28086 } }, { "ph": "s", "id": 28086, "pid": 76337, "tid": -914061504, "ts": 1716454217645427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217687767, "dur": 22, "args": { "External id": 28088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28088, "pid": 5, "tid": 7, "ts": 1716454217687767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645436, "dur": 5, "args": { "External id": 28088, "cbid": 211, "correlation": 28088 } }, { "ph": "s", "id": 28088, "pid": 76337, "tid": -914061504, "ts": 1716454217645436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217687791, "dur": 32, "args": { "External id": 28094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28094, "pid": 5, "tid": 7, "ts": 1716454217687791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645465, "dur": 9, "args": { "External id": 28094, "cbid": 211, "correlation": 28094 } }, { "ph": "s", "id": 28094, "pid": 76337, "tid": -914061504, "ts": 1716454217645465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217645524, "dur": 0, "args": { "External id": 28104, "cbid": 317, "correlation": 28104 } }, { "ph": "f", "id": 28104, "pid": 76337, "tid": -914061504, "ts": 1716454217645524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217645524, "dur": 0, "args": { "External id": 28105, "cbid": 203, "correlation": 28105 } }, { "ph": "f", "id": 28105, "pid": 76337, "tid": -914061504, "ts": 1716454217645524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217645525, "dur": 0, "args": { "External id": 28106, "cbid": 205, "correlation": 28106 } }, { "ph": "f", "id": 28106, "pid": 76337, "tid": -914061504, "ts": 1716454217645525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217687824, "dur": 55, "args": { "External id": 28110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28110, "pid": 5, "tid": 7, "ts": 1716454217687824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645538, "dur": 11, "args": { "External id": 28110, "cbid": 211, "correlation": 28110 } }, { "ph": "s", "id": 28110, "pid": 76337, "tid": -914061504, "ts": 1716454217645538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217687881, "dur": 266, "args": { "External id": 28112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28112, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28112, "pid": 5, "tid": 7, "ts": 1716454217687881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645556, "dur": 8, "args": { "External id": 28112, "cbid": 211, "correlation": 28112 } }, { "ph": "s", "id": 28112, "pid": 76337, "tid": -914061504, "ts": 1716454217645556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217688148, "dur": 21, "args": { "External id": 28114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28114, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28114, "pid": 5, "tid": 7, "ts": 1716454217688148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645567, "dur": 5, "args": { "External id": 28114, "cbid": 211, "correlation": 28114 } }, { "ph": "s", "id": 28114, "pid": 76337, "tid": -914061504, "ts": 1716454217645567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217688170, "dur": 32, "args": { "External id": 28120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28120, "pid": 5, "tid": 7, "ts": 1716454217688170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645593, "dur": 8, "args": { "External id": 28120, "cbid": 211, "correlation": 28120 } }, { "ph": "s", "id": 28120, "pid": 76337, "tid": -914061504, "ts": 1716454217645593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217688203, "dur": 27, "args": { "External id": 28128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28128, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28128, "pid": 5, "tid": 7, "ts": 1716454217688203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645622, "dur": 8, "args": { "External id": 28128, "cbid": 211, "correlation": 28128 } }, { "ph": "s", "id": 28128, "pid": 76337, "tid": -914061504, "ts": 1716454217645622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217688232, "dur": 20, "args": { "External id": 28136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28136, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28136, "pid": 5, "tid": 7, "ts": 1716454217688232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645650, "dur": 9, "args": { "External id": 28136, "cbid": 211, "correlation": 28136 } }, { "ph": "s", "id": 28136, "pid": 76337, "tid": -914061504, "ts": 1716454217645650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217688253, "dur": 30, "args": { "External id": 28156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28156, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 28156, "pid": 5, "tid": 7, "ts": 1716454217688253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645733, "dur": 12, "args": { "External id": 28156, "cbid": 211, "correlation": 28156 } }, { "ph": "s", "id": 28156, "pid": 76337, "tid": -914061504, "ts": 1716454217645733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217688284, "dur": 5, "args": { "External id": 28168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28168, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 28168, "pid": 5, "tid": 7, "ts": 1716454217688284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645755, "dur": 6, "args": { "External id": 28168, "cbid": 211, "correlation": 28168 } }, { "ph": "s", "id": 28168, "pid": 76337, "tid": -914061504, "ts": 1716454217645755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217688290, "dur": 31, "args": { "External id": 28171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28171, "pid": 5, "tid": 7, "ts": 1716454217688290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645773, "dur": 6, "args": { "External id": 28171, "cbid": 211, "correlation": 28171 } }, { "ph": "s", "id": 28171, "pid": 76337, "tid": -914061504, "ts": 1716454217645773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217645842, "dur": 0, "args": { "External id": 28182, "cbid": 317, "correlation": 28182 } }, { "ph": "f", "id": 28182, "pid": 76337, "tid": -914061504, "ts": 1716454217645842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217645843, "dur": 0, "args": { "External id": 28183, "cbid": 203, "correlation": 28183 } }, { "ph": "f", "id": 28183, "pid": 76337, "tid": -914061504, "ts": 1716454217645843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217645844, "dur": 0, "args": { "External id": 28184, "cbid": 205, "correlation": 28184 } }, { "ph": "f", "id": 28184, "pid": 76337, "tid": -914061504, "ts": 1716454217645844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217688322, "dur": 22, "args": { "External id": 28188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28188, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28188, "pid": 5, "tid": 7, "ts": 1716454217688322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645858, "dur": 13, "args": { "External id": 28188, "cbid": 211, "correlation": 28188 } }, { "ph": "s", "id": 28188, "pid": 76337, "tid": -914061504, "ts": 1716454217645858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217688345, "dur": 103, "args": { "External id": 28190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28190, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28190, "pid": 5, "tid": 7, "ts": 1716454217688345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645878, "dur": 6, "args": { "External id": 28190, "cbid": 211, "correlation": 28190 } }, { "ph": "s", "id": 28190, "pid": 76337, "tid": -914061504, "ts": 1716454217645878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217688450, "dur": 21, "args": { "External id": 28192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28192, "pid": 5, "tid": 7, "ts": 1716454217688450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645888, "dur": 5, "args": { "External id": 28192, "cbid": 211, "correlation": 28192 } }, { "ph": "s", "id": 28192, "pid": 76337, "tid": -914061504, "ts": 1716454217645888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217688472, "dur": 32, "args": { "External id": 28198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28198, "pid": 5, "tid": 7, "ts": 1716454217688472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217645927, "dur": 10, "args": { "External id": 28198, "cbid": 211, "correlation": 28198 } }, { "ph": "s", "id": 28198, "pid": 76337, "tid": -914061504, "ts": 1716454217645927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217688505, "dur": 192, "args": { "External id": 28207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28207, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28207, "pid": 5, "tid": 7, "ts": 1716454217688505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646020, "dur": 15, "args": { "External id": 28207, "cbid": 211, "correlation": 28207 } }, { "ph": "s", "id": 28207, "pid": 76337, "tid": -914061504, "ts": 1716454217646020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217688698, "dur": 64, "args": { "External id": 28229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28229, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28229, "pid": 5, "tid": 7, "ts": 1716454217688698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646097, "dur": 12, "args": { "External id": 28229, "cbid": 211, "correlation": 28229 } }, { "ph": "s", "id": 28229, "pid": 76337, "tid": -914061504, "ts": 1716454217646097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217646202, "dur": 1, "args": { "External id": 28240, "cbid": 251, "correlation": 28240 } }, { "ph": "f", "id": 28240, "pid": 76337, "tid": -914061504, "ts": 1716454217646202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217688763, "dur": 153, "args": { "External id": 28241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28241, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28241, "pid": 5, "tid": 7, "ts": 1716454217688763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646208, "dur": 14, "args": { "External id": 28241, "cbid": 211, "correlation": 28241 } }, { "ph": "s", "id": 28241, "pid": 76337, "tid": -914061504, "ts": 1716454217646208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217646291, "dur": 1, "args": { "External id": 28252, "cbid": 251, "correlation": 28252 } }, { "ph": "f", "id": 28252, "pid": 76337, "tid": -914061504, "ts": 1716454217646291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217688918, "dur": 146, "args": { "External id": 28253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28253, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28253, "pid": 5, "tid": 7, "ts": 1716454217688918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646295, "dur": 12, "args": { "External id": 28253, "cbid": 211, "correlation": 28253 } }, { "ph": "s", "id": 28253, "pid": 76337, "tid": -914061504, "ts": 1716454217646295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217646373, "dur": 1, "args": { "External id": 28264, "cbid": 251, "correlation": 28264 } }, { "ph": "f", "id": 28264, "pid": 76337, "tid": -914061504, "ts": 1716454217646373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217689065, "dur": 142, "args": { "External id": 28265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28265, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28265, "pid": 5, "tid": 7, "ts": 1716454217689065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646377, "dur": 13, "args": { "External id": 28265, "cbid": 211, "correlation": 28265 } }, { "ph": "s", "id": 28265, "pid": 76337, "tid": -914061504, "ts": 1716454217646377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217689209, "dur": 1907, "args": { "External id": 28286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28286, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 28286, "pid": 5, "tid": 7, "ts": 1716454217689209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646464, "dur": 14, "args": { "External id": 28286, "cbid": 211, "correlation": 28286 } }, { "ph": "s", "id": 28286, "pid": 76337, "tid": -914061504, "ts": 1716454217646464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217646583, "dur": 2, "args": { "External id": 28304, "cbid": 251, "correlation": 28304 } }, { "ph": "f", "id": 28304, "pid": 76337, "tid": -914061504, "ts": 1716454217646583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217691117, "dur": 145, "args": { "External id": 28306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28306, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 28306, "pid": 5, "tid": 7, "ts": 1716454217691117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646590, "dur": 14, "args": { "External id": 28306, "cbid": 211, "correlation": 28306 } }, { "ph": "s", "id": 28306, "pid": 76337, "tid": -914061504, "ts": 1716454217646590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217691264, "dur": 36, "args": { "External id": 28314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28314, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28314, "pid": 5, "tid": 7, "ts": 1716454217691264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646663, "dur": 12, "args": { "External id": 28314, "cbid": 211, "correlation": 28314 } }, { "ph": "s", "id": 28314, "pid": 76337, "tid": -914061504, "ts": 1716454217646663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217691301, "dur": 50, "args": { "External id": 28322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28322, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28322, "pid": 5, "tid": 7, "ts": 1716454217691301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646703, "dur": 10, "args": { "External id": 28322, "cbid": 211, "correlation": 28322 } }, { "ph": "s", "id": 28322, "pid": 76337, "tid": -914061504, "ts": 1716454217646703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217691352, "dur": 30, "args": { "External id": 28333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28333, "pid": 5, "tid": 7, "ts": 1716454217691352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646779, "dur": 12, "args": { "External id": 28333, "cbid": 211, "correlation": 28333 } }, { "ph": "s", "id": 28333, "pid": 76337, "tid": -914061504, "ts": 1716454217646779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217691384, "dur": 34, "args": { "External id": 28355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28355, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28355, "pid": 5, "tid": 7, "ts": 1716454217691384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646809, "dur": 7, "args": { "External id": 28355, "cbid": 211, "correlation": 28355 } }, { "ph": "s", "id": 28355, "pid": 76337, "tid": -914061504, "ts": 1716454217646809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217646907, "dur": 1, "args": { "External id": 28366, "cbid": 251, "correlation": 28366 } }, { "ph": "f", "id": 28366, "pid": 76337, "tid": -914061504, "ts": 1716454217646907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217691419, "dur": 89, "args": { "External id": 28367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28367, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28367, "pid": 5, "tid": 7, "ts": 1716454217691419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217646912, "dur": 14, "args": { "External id": 28367, "cbid": 211, "correlation": 28367 } }, { "ph": "s", "id": 28367, "pid": 76337, "tid": -914061504, "ts": 1716454217646912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647006, "dur": 1, "args": { "External id": 28378, "cbid": 251, "correlation": 28378 } }, { "ph": "f", "id": 28378, "pid": 76337, "tid": -914061504, "ts": 1716454217647006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647011, "dur": 0, "args": { "External id": 28379, "cbid": 251, "correlation": 28379 } }, { "ph": "f", "id": 28379, "pid": 76337, "tid": -914061504, "ts": 1716454217647011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217691509, "dur": 11, "args": { "External id": 28380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28380, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 28380, "pid": 5, "tid": 7, "ts": 1716454217691509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647012, "dur": 14, "args": { "External id": 28380, "cbid": 211, "correlation": 28380 } }, { "ph": "s", "id": 28380, "pid": 76337, "tid": -914061504, "ts": 1716454217647012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217691521, "dur": 5, "args": { "External id": 28382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28382, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 28382, "pid": 5, "tid": 7, "ts": 1716454217691521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647028, "dur": 7, "args": { "External id": 28382, "cbid": 211, "correlation": 28382 } }, { "ph": "s", "id": 28382, "pid": 76337, "tid": -914061504, "ts": 1716454217647028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647101, "dur": 1, "args": { "External id": 28393, "cbid": 251, "correlation": 28393 } }, { "ph": "f", "id": 28393, "pid": 76337, "tid": -914061504, "ts": 1716454217647101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647105, "dur": 0, "args": { "External id": 28394, "cbid": 251, "correlation": 28394 } }, { "ph": "f", "id": 28394, "pid": 76337, "tid": -914061504, "ts": 1716454217647105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217691527, "dur": 7, "args": { "External id": 28395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28395, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 28395, "pid": 5, "tid": 7, "ts": 1716454217691527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647106, "dur": 13, "args": { "External id": 28395, "cbid": 211, "correlation": 28395 } }, { "ph": "s", "id": 28395, "pid": 76337, "tid": -914061504, "ts": 1716454217647106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217691536, "dur": 4, "args": { "External id": 28397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28397, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 28397, "pid": 5, "tid": 7, "ts": 1716454217691536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647120, "dur": 5, "args": { "External id": 28397, "cbid": 211, "correlation": 28397 } }, { "ph": "s", "id": 28397, "pid": 76337, "tid": -914061504, "ts": 1716454217647120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217691541, "dur": 90, "args": { "External id": 28418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28418, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 28418, "pid": 5, "tid": 7, "ts": 1716454217691541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647196, "dur": 12, "args": { "External id": 28418, "cbid": 211, "correlation": 28418 } }, { "ph": "s", "id": 28418, "pid": 76337, "tid": -914061504, "ts": 1716454217647196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647305, "dur": 1, "args": { "External id": 28436, "cbid": 251, "correlation": 28436 } }, { "ph": "f", "id": 28436, "pid": 76337, "tid": -914061504, "ts": 1716454217647305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217691632, "dur": 96, "args": { "External id": 28438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28438, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28438, "pid": 5, "tid": 7, "ts": 1716454217691632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647310, "dur": 14, "args": { "External id": 28438, "cbid": 211, "correlation": 28438 } }, { "ph": "s", "id": 28438, "pid": 76337, "tid": -914061504, "ts": 1716454217647310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217691730, "dur": 19, "args": { "External id": 28446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28446, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28446, "pid": 5, "tid": 7, "ts": 1716454217691730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647381, "dur": 13, "args": { "External id": 28446, "cbid": 211, "correlation": 28446 } }, { "ph": "s", "id": 28446, "pid": 76337, "tid": -914061504, "ts": 1716454217647381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217691750, "dur": 38, "args": { "External id": 28454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28454, "pid": 5, "tid": 7, "ts": 1716454217691750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647423, "dur": 9, "args": { "External id": 28454, "cbid": 211, "correlation": 28454 } }, { "ph": "s", "id": 28454, "pid": 76337, "tid": -914061504, "ts": 1716454217647423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217691789, "dur": 34, "args": { "External id": 28476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28476, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28476, "pid": 5, "tid": 7, "ts": 1716454217691789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647475, "dur": 10, "args": { "External id": 28476, "cbid": 211, "correlation": 28476 } }, { "ph": "s", "id": 28476, "pid": 76337, "tid": -914061504, "ts": 1716454217647475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647577, "dur": 1, "args": { "External id": 28492, "cbid": 251, "correlation": 28492 } }, { "ph": "f", "id": 28492, "pid": 76337, "tid": -914061504, "ts": 1716454217647577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647582, "dur": 0, "args": { "External id": 28494, "cbid": 251, "correlation": 28494 } }, { "ph": "f", "id": 28494, "pid": 76337, "tid": -914061504, "ts": 1716454217647582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217691825, "dur": 531, "args": { "External id": 28495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28495, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 28495, "pid": 5, "tid": 7, "ts": 1716454217691825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647587, "dur": 14, "args": { "External id": 28495, "cbid": 211, "correlation": 28495 } }, { "ph": "s", "id": 28495, "pid": 76337, "tid": -914061504, "ts": 1716454217647587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217692357, "dur": 123, "args": { "External id": 28503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28503, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28503, "pid": 5, "tid": 7, "ts": 1716454217692357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647657, "dur": 13, "args": { "External id": 28503, "cbid": 211, "correlation": 28503 } }, { "ph": "s", "id": 28503, "pid": 76337, "tid": -914061504, "ts": 1716454217647657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217692481, "dur": 127, "args": { "External id": 28511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28511, "pid": 5, "tid": 7, "ts": 1716454217692481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647690, "dur": 8, "args": { "External id": 28511, "cbid": 211, "correlation": 28511 } }, { "ph": "s", "id": 28511, "pid": 76337, "tid": -914061504, "ts": 1716454217647690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217647780, "dur": 1, "args": { "External id": 28527, "cbid": 251, "correlation": 28527 } }, { "ph": "f", "id": 28527, "pid": 76337, "tid": -914061504, "ts": 1716454217647780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217692610, "dur": 303, "args": { "External id": 28529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28529, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28529, "pid": 5, "tid": 7, "ts": 1716454217692610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647786, "dur": 13, "args": { "External id": 28529, "cbid": 211, "correlation": 28529 } }, { "ph": "s", "id": 28529, "pid": 76337, "tid": -914061504, "ts": 1716454217647786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217692915, "dur": 28, "args": { "External id": 28537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28537, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28537, "pid": 5, "tid": 7, "ts": 1716454217692915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647830, "dur": 10, "args": { "External id": 28537, "cbid": 211, "correlation": 28537 } }, { "ph": "s", "id": 28537, "pid": 76337, "tid": -914061504, "ts": 1716454217647830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217692944, "dur": 80, "args": { "External id": 28548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28548, "pid": 5, "tid": 7, "ts": 1716454217692944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217647899, "dur": 12, "args": { "External id": 28548, "cbid": 211, "correlation": 28548 } }, { "ph": "s", "id": 28548, "pid": 76337, "tid": -914061504, "ts": 1716454217647899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217647966, "dur": 0, "args": { "External id": 28560, "cbid": 317, "correlation": 28560 } }, { "ph": "f", "id": 28560, "pid": 76337, "tid": -914061504, "ts": 1716454217647966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217647967, "dur": 0, "args": { "External id": 28561, "cbid": 203, "correlation": 28561 } }, { "ph": "f", "id": 28561, "pid": 76337, "tid": -914061504, "ts": 1716454217647967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217647968, "dur": 0, "args": { "External id": 28562, "cbid": 205, "correlation": 28562 } }, { "ph": "f", "id": 28562, "pid": 76337, "tid": -914061504, "ts": 1716454217647968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217693024, "dur": 22, "args": { "External id": 28566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28566, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28566, "pid": 5, "tid": 7, "ts": 1716454217693024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648004, "dur": 14, "args": { "External id": 28566, "cbid": 211, "correlation": 28566 } }, { "ph": "s", "id": 28566, "pid": 76337, "tid": -914061504, "ts": 1716454217648004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217693048, "dur": 118, "args": { "External id": 28568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28568, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28568, "pid": 5, "tid": 7, "ts": 1716454217693048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648026, "dur": 7, "args": { "External id": 28568, "cbid": 211, "correlation": 28568 } }, { "ph": "s", "id": 28568, "pid": 76337, "tid": -914061504, "ts": 1716454217648026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217693167, "dur": 23, "args": { "External id": 28570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28570, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28570, "pid": 5, "tid": 7, "ts": 1716454217693167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648037, "dur": 5, "args": { "External id": 28570, "cbid": 211, "correlation": 28570 } }, { "ph": "s", "id": 28570, "pid": 76337, "tid": -914061504, "ts": 1716454217648037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217693192, "dur": 32, "args": { "External id": 28576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28576, "pid": 5, "tid": 7, "ts": 1716454217693192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648065, "dur": 9, "args": { "External id": 28576, "cbid": 211, "correlation": 28576 } }, { "ph": "s", "id": 28576, "pid": 76337, "tid": -914061504, "ts": 1716454217648065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217693225, "dur": 26, "args": { "External id": 28584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28584, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28584, "pid": 5, "tid": 7, "ts": 1716454217693225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648098, "dur": 8, "args": { "External id": 28584, "cbid": 211, "correlation": 28584 } }, { "ph": "s", "id": 28584, "pid": 76337, "tid": -914061504, "ts": 1716454217648098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217693253, "dur": 54, "args": { "External id": 28593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28593, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28593, "pid": 5, "tid": 7, "ts": 1716454217693253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648140, "dur": 10, "args": { "External id": 28593, "cbid": 211, "correlation": 28593 } }, { "ph": "s", "id": 28593, "pid": 76337, "tid": -914061504, "ts": 1716454217648140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217693308, "dur": 51, "args": { "External id": 28613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28613, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 28613, "pid": 5, "tid": 7, "ts": 1716454217693308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648214, "dur": 11, "args": { "External id": 28613, "cbid": 211, "correlation": 28613 } }, { "ph": "s", "id": 28613, "pid": 76337, "tid": -914061504, "ts": 1716454217648214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217693361, "dur": 4, "args": { "External id": 28625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28625, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 28625, "pid": 5, "tid": 7, "ts": 1716454217693361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648235, "dur": 7, "args": { "External id": 28625, "cbid": 211, "correlation": 28625 } }, { "ph": "s", "id": 28625, "pid": 76337, "tid": -914061504, "ts": 1716454217648235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217693366, "dur": 55, "args": { "External id": 28628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28628, "pid": 5, "tid": 7, "ts": 1716454217693366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648254, "dur": 7, "args": { "External id": 28628, "cbid": 211, "correlation": 28628 } }, { "ph": "s", "id": 28628, "pid": 76337, "tid": -914061504, "ts": 1716454217648254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217693423, "dur": 37, "args": { "External id": 28637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28637, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28637, "pid": 5, "tid": 7, "ts": 1716454217693423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648294, "dur": 10, "args": { "External id": 28637, "cbid": 211, "correlation": 28637 } }, { "ph": "s", "id": 28637, "pid": 76337, "tid": -914061504, "ts": 1716454217648294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217648346, "dur": 0, "args": { "External id": 28647, "cbid": 317, "correlation": 28647 } }, { "ph": "f", "id": 28647, "pid": 76337, "tid": -914061504, "ts": 1716454217648346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217648347, "dur": 0, "args": { "External id": 28648, "cbid": 203, "correlation": 28648 } }, { "ph": "f", "id": 28648, "pid": 76337, "tid": -914061504, "ts": 1716454217648347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217648347, "dur": 0, "args": { "External id": 28649, "cbid": 205, "correlation": 28649 } }, { "ph": "f", "id": 28649, "pid": 76337, "tid": -914061504, "ts": 1716454217648347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217693461, "dur": 40, "args": { "External id": 28653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28653, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28653, "pid": 5, "tid": 7, "ts": 1716454217693461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648362, "dur": 11, "args": { "External id": 28653, "cbid": 211, "correlation": 28653 } }, { "ph": "s", "id": 28653, "pid": 76337, "tid": -914061504, "ts": 1716454217648362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217693502, "dur": 82, "args": { "External id": 28655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28655, "pid": 5, "tid": 7, "ts": 1716454217693502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648375, "dur": 5, "args": { "External id": 28655, "cbid": 211, "correlation": 28655 } }, { "ph": "s", "id": 28655, "pid": 76337, "tid": -914061504, "ts": 1716454217648375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217693585, "dur": 1264, "args": { "External id": 28657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28657, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28657, "pid": 5, "tid": 7, "ts": 1716454217693585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648387, "dur": 8, "args": { "External id": 28657, "cbid": 211, "correlation": 28657 } }, { "ph": "s", "id": 28657, "pid": 76337, "tid": -914061504, "ts": 1716454217648387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217694851, "dur": 22, "args": { "External id": 28659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28659, "pid": 5, "tid": 7, "ts": 1716454217694851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648398, "dur": 5, "args": { "External id": 28659, "cbid": 211, "correlation": 28659 } }, { "ph": "s", "id": 28659, "pid": 76337, "tid": -914061504, "ts": 1716454217648398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217694874, "dur": 34, "args": { "External id": 28665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28665, "pid": 5, "tid": 7, "ts": 1716454217694874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648426, "dur": 8, "args": { "External id": 28665, "cbid": 211, "correlation": 28665 } }, { "ph": "s", "id": 28665, "pid": 76337, "tid": -914061504, "ts": 1716454217648426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217694909, "dur": 3, "args": { "External id": 28673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28673, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 28673, "pid": 5, "tid": 7, "ts": 1716454217694909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648469, "dur": 9, "args": { "External id": 28673, "cbid": 211, "correlation": 28673 } }, { "ph": "s", "id": 28673, "pid": 76337, "tid": -914061504, "ts": 1716454217648469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217648549, "dur": 1, "args": { "External id": 28689, "cbid": 251, "correlation": 28689 } }, { "ph": "f", "id": 28689, "pid": 76337, "tid": -914061504, "ts": 1716454217648549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217648554, "dur": 0, "args": { "External id": 28691, "cbid": 251, "correlation": 28691 } }, { "ph": "f", "id": 28691, "pid": 76337, "tid": -914061504, "ts": 1716454217648554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217694914, "dur": 12, "args": { "External id": 28692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28692, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 28692, "pid": 5, "tid": 7, "ts": 1716454217694914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648556, "dur": 12, "args": { "External id": 28692, "cbid": 211, "correlation": 28692 } }, { "ph": "s", "id": 28692, "pid": 76337, "tid": -914061504, "ts": 1716454217648556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217694928, "dur": 5, "args": { "External id": 28694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28694, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 28694, "pid": 5, "tid": 7, "ts": 1716454217694928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648570, "dur": 5, "args": { "External id": 28694, "cbid": 211, "correlation": 28694 } }, { "ph": "s", "id": 28694, "pid": 76337, "tid": -914061504, "ts": 1716454217648570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217694934, "dur": 29, "args": { "External id": 28704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28704, "pid": 5, "tid": 7, "ts": 1716454217694934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648630, "dur": 12, "args": { "External id": 28704, "cbid": 211, "correlation": 28704 } }, { "ph": "s", "id": 28704, "pid": 76337, "tid": -914061504, "ts": 1716454217648630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217694964, "dur": 30, "args": { "External id": 28724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28724, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 28724, "pid": 5, "tid": 7, "ts": 1716454217694964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648696, "dur": 12, "args": { "External id": 28724, "cbid": 211, "correlation": 28724 } }, { "ph": "s", "id": 28724, "pid": 76337, "tid": -914061504, "ts": 1716454217648696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217694995, "dur": 5, "args": { "External id": 28736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28736, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 28736, "pid": 5, "tid": 7, "ts": 1716454217694995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648717, "dur": 6, "args": { "External id": 28736, "cbid": 211, "correlation": 28736 } }, { "ph": "s", "id": 28736, "pid": 76337, "tid": -914061504, "ts": 1716454217648717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217695001, "dur": 30, "args": { "External id": 28739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28739, "pid": 5, "tid": 7, "ts": 1716454217695001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648735, "dur": 6, "args": { "External id": 28739, "cbid": 211, "correlation": 28739 } }, { "ph": "s", "id": 28739, "pid": 76337, "tid": -914061504, "ts": 1716454217648735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217695032, "dur": 21, "args": { "External id": 28748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28748, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28748, "pid": 5, "tid": 7, "ts": 1716454217695032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648776, "dur": 10, "args": { "External id": 28748, "cbid": 211, "correlation": 28748 } }, { "ph": "s", "id": 28748, "pid": 76337, "tid": -914061504, "ts": 1716454217648776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217648851, "dur": 0, "args": { "External id": 28758, "cbid": 317, "correlation": 28758 } }, { "ph": "f", "id": 28758, "pid": 76337, "tid": -914061504, "ts": 1716454217648851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217648851, "dur": 0, "args": { "External id": 28759, "cbid": 203, "correlation": 28759 } }, { "ph": "f", "id": 28759, "pid": 76337, "tid": -914061504, "ts": 1716454217648851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217648852, "dur": 0, "args": { "External id": 28760, "cbid": 205, "correlation": 28760 } }, { "ph": "f", "id": 28760, "pid": 76337, "tid": -914061504, "ts": 1716454217648852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217695055, "dur": 22, "args": { "External id": 28764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28764, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28764, "pid": 5, "tid": 7, "ts": 1716454217695055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648867, "dur": 13, "args": { "External id": 28764, "cbid": 211, "correlation": 28764 } }, { "ph": "s", "id": 28764, "pid": 76337, "tid": -914061504, "ts": 1716454217648867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217695078, "dur": 43, "args": { "External id": 28766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28766, "pid": 5, "tid": 7, "ts": 1716454217695078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648883, "dur": 5, "args": { "External id": 28766, "cbid": 211, "correlation": 28766 } }, { "ph": "s", "id": 28766, "pid": 76337, "tid": -914061504, "ts": 1716454217648883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217695122, "dur": 635, "args": { "External id": 28768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28768, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28768, "pid": 5, "tid": 7, "ts": 1716454217695122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648894, "dur": 6, "args": { "External id": 28768, "cbid": 211, "correlation": 28768 } }, { "ph": "s", "id": 28768, "pid": 76337, "tid": -914061504, "ts": 1716454217648894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217695758, "dur": 22, "args": { "External id": 28770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28770, "pid": 5, "tid": 7, "ts": 1716454217695758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648903, "dur": 5, "args": { "External id": 28770, "cbid": 211, "correlation": 28770 } }, { "ph": "s", "id": 28770, "pid": 76337, "tid": -914061504, "ts": 1716454217648903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217695782, "dur": 32, "args": { "External id": 28776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28776, "pid": 5, "tid": 7, "ts": 1716454217695782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217648932, "dur": 8, "args": { "External id": 28776, "cbid": 211, "correlation": 28776 } }, { "ph": "s", "id": 28776, "pid": 76337, "tid": -914061504, "ts": 1716454217648932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217648999, "dur": 0, "args": { "External id": 28786, "cbid": 317, "correlation": 28786 } }, { "ph": "f", "id": 28786, "pid": 76337, "tid": -914061504, "ts": 1716454217648999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217649000, "dur": 0, "args": { "External id": 28787, "cbid": 203, "correlation": 28787 } }, { "ph": "f", "id": 28787, "pid": 76337, "tid": -914061504, "ts": 1716454217649000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217649000, "dur": 0, "args": { "External id": 28788, "cbid": 205, "correlation": 28788 } }, { "ph": "f", "id": 28788, "pid": 76337, "tid": -914061504, "ts": 1716454217649000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217695815, "dur": 37, "args": { "External id": 28792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28792, "pid": 5, "tid": 7, "ts": 1716454217695815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649013, "dur": 13, "args": { "External id": 28792, "cbid": 211, "correlation": 28792 } }, { "ph": "s", "id": 28792, "pid": 76337, "tid": -914061504, "ts": 1716454217649013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217695854, "dur": 187, "args": { "External id": 28794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28794, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28794, "pid": 5, "tid": 7, "ts": 1716454217695854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649031, "dur": 6, "args": { "External id": 28794, "cbid": 211, "correlation": 28794 } }, { "ph": "s", "id": 28794, "pid": 76337, "tid": -914061504, "ts": 1716454217649031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217696042, "dur": 22, "args": { "External id": 28796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28796, "pid": 5, "tid": 7, "ts": 1716454217696042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649041, "dur": 5, "args": { "External id": 28796, "cbid": 211, "correlation": 28796 } }, { "ph": "s", "id": 28796, "pid": 76337, "tid": -914061504, "ts": 1716454217649041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217696065, "dur": 32, "args": { "External id": 28802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28802, "pid": 5, "tid": 7, "ts": 1716454217696065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649067, "dur": 9, "args": { "External id": 28802, "cbid": 211, "correlation": 28802 } }, { "ph": "s", "id": 28802, "pid": 76337, "tid": -914061504, "ts": 1716454217649067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217696098, "dur": 27, "args": { "External id": 28810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28810, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28810, "pid": 5, "tid": 7, "ts": 1716454217696098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649096, "dur": 8, "args": { "External id": 28810, "cbid": 211, "correlation": 28810 } }, { "ph": "s", "id": 28810, "pid": 76337, "tid": -914061504, "ts": 1716454217649096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217696126, "dur": 20, "args": { "External id": 28818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28818, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28818, "pid": 5, "tid": 7, "ts": 1716454217696126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649126, "dur": 8, "args": { "External id": 28818, "cbid": 211, "correlation": 28818 } }, { "ph": "s", "id": 28818, "pid": 76337, "tid": -914061504, "ts": 1716454217649126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217696148, "dur": 30, "args": { "External id": 28838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28838, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 28838, "pid": 5, "tid": 7, "ts": 1716454217696148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649209, "dur": 12, "args": { "External id": 28838, "cbid": 211, "correlation": 28838 } }, { "ph": "s", "id": 28838, "pid": 76337, "tid": -914061504, "ts": 1716454217649209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217696179, "dur": 5, "args": { "External id": 28850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28850, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 28850, "pid": 5, "tid": 7, "ts": 1716454217696179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649231, "dur": 6, "args": { "External id": 28850, "cbid": 211, "correlation": 28850 } }, { "ph": "s", "id": 28850, "pid": 76337, "tid": -914061504, "ts": 1716454217649231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217696185, "dur": 30, "args": { "External id": 28853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28853, "pid": 5, "tid": 7, "ts": 1716454217696185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649249, "dur": 6, "args": { "External id": 28853, "cbid": 211, "correlation": 28853 } }, { "ph": "s", "id": 28853, "pid": 76337, "tid": -914061504, "ts": 1716454217649249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217649317, "dur": 0, "args": { "External id": 28864, "cbid": 317, "correlation": 28864 } }, { "ph": "f", "id": 28864, "pid": 76337, "tid": -914061504, "ts": 1716454217649317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217649318, "dur": 0, "args": { "External id": 28865, "cbid": 203, "correlation": 28865 } }, { "ph": "f", "id": 28865, "pid": 76337, "tid": -914061504, "ts": 1716454217649318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217649319, "dur": 0, "args": { "External id": 28866, "cbid": 205, "correlation": 28866 } }, { "ph": "f", "id": 28866, "pid": 76337, "tid": -914061504, "ts": 1716454217649319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217696216, "dur": 23, "args": { "External id": 28870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28870, "pid": 5, "tid": 7, "ts": 1716454217696216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649332, "dur": 13, "args": { "External id": 28870, "cbid": 211, "correlation": 28870 } }, { "ph": "s", "id": 28870, "pid": 76337, "tid": -914061504, "ts": 1716454217649332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217696240, "dur": 103, "args": { "External id": 28872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28872, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28872, "pid": 5, "tid": 7, "ts": 1716454217696240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649351, "dur": 6, "args": { "External id": 28872, "cbid": 211, "correlation": 28872 } }, { "ph": "s", "id": 28872, "pid": 76337, "tid": -914061504, "ts": 1716454217649351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217696345, "dur": 21, "args": { "External id": 28874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28874, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28874, "pid": 5, "tid": 7, "ts": 1716454217696345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649361, "dur": 5, "args": { "External id": 28874, "cbid": 211, "correlation": 28874 } }, { "ph": "s", "id": 28874, "pid": 76337, "tid": -914061504, "ts": 1716454217649361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217696367, "dur": 32, "args": { "External id": 28880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28880, "pid": 5, "tid": 7, "ts": 1716454217696367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649389, "dur": 8, "args": { "External id": 28880, "cbid": 211, "correlation": 28880 } }, { "ph": "s", "id": 28880, "pid": 76337, "tid": -914061504, "ts": 1716454217649389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217696401, "dur": 193, "args": { "External id": 28889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28889, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28889, "pid": 5, "tid": 7, "ts": 1716454217696401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649472, "dur": 14, "args": { "External id": 28889, "cbid": 211, "correlation": 28889 } }, { "ph": "s", "id": 28889, "pid": 76337, "tid": -914061504, "ts": 1716454217649472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217696595, "dur": 64, "args": { "External id": 28911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28911, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28911, "pid": 5, "tid": 7, "ts": 1716454217696595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649528, "dur": 11, "args": { "External id": 28911, "cbid": 211, "correlation": 28911 } }, { "ph": "s", "id": 28911, "pid": 76337, "tid": -914061504, "ts": 1716454217649528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217649629, "dur": 1, "args": { "External id": 28922, "cbid": 251, "correlation": 28922 } }, { "ph": "f", "id": 28922, "pid": 76337, "tid": -914061504, "ts": 1716454217649629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217696661, "dur": 152, "args": { "External id": 28923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28923, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28923, "pid": 5, "tid": 7, "ts": 1716454217696661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649635, "dur": 14, "args": { "External id": 28923, "cbid": 211, "correlation": 28923 } }, { "ph": "s", "id": 28923, "pid": 76337, "tid": -914061504, "ts": 1716454217649635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217649718, "dur": 1, "args": { "External id": 28934, "cbid": 251, "correlation": 28934 } }, { "ph": "f", "id": 28934, "pid": 76337, "tid": -914061504, "ts": 1716454217649718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217696814, "dur": 148, "args": { "External id": 28935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28935, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28935, "pid": 5, "tid": 7, "ts": 1716454217696814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649723, "dur": 12, "args": { "External id": 28935, "cbid": 211, "correlation": 28935 } }, { "ph": "s", "id": 28935, "pid": 76337, "tid": -914061504, "ts": 1716454217649723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217649808, "dur": 1, "args": { "External id": 28946, "cbid": 251, "correlation": 28946 } }, { "ph": "f", "id": 28946, "pid": 76337, "tid": -914061504, "ts": 1716454217649808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217696963, "dur": 145, "args": { "External id": 28947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28947, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 28947, "pid": 5, "tid": 7, "ts": 1716454217696963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649812, "dur": 12, "args": { "External id": 28947, "cbid": 211, "correlation": 28947 } }, { "ph": "s", "id": 28947, "pid": 76337, "tid": -914061504, "ts": 1716454217649812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217697110, "dur": 1908, "args": { "External id": 28968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28968, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 28968, "pid": 5, "tid": 7, "ts": 1716454217697110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217649895, "dur": 13, "args": { "External id": 28968, "cbid": 211, "correlation": 28968 } }, { "ph": "s", "id": 28968, "pid": 76337, "tid": -914061504, "ts": 1716454217649895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650015, "dur": 1, "args": { "External id": 28986, "cbid": 251, "correlation": 28986 } }, { "ph": "f", "id": 28986, "pid": 76337, "tid": -914061504, "ts": 1716454217650015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217699019, "dur": 145, "args": { "External id": 28988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28988, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 28988, "pid": 5, "tid": 7, "ts": 1716454217699019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650021, "dur": 14, "args": { "External id": 28988, "cbid": 211, "correlation": 28988 } }, { "ph": "s", "id": 28988, "pid": 76337, "tid": -914061504, "ts": 1716454217650021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217699166, "dur": 35, "args": { "External id": 28996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 28996, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 28996, "pid": 5, "tid": 7, "ts": 1716454217699166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650094, "dur": 12, "args": { "External id": 28996, "cbid": 211, "correlation": 28996 } }, { "ph": "s", "id": 28996, "pid": 76337, "tid": -914061504, "ts": 1716454217650094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217699202, "dur": 50, "args": { "External id": 29004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29004, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29004, "pid": 5, "tid": 7, "ts": 1716454217699202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650133, "dur": 9, "args": { "External id": 29004, "cbid": 211, "correlation": 29004 } }, { "ph": "s", "id": 29004, "pid": 76337, "tid": -914061504, "ts": 1716454217650133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217699253, "dur": 30, "args": { "External id": 29015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29015, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29015, "pid": 5, "tid": 7, "ts": 1716454217699253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650205, "dur": 12, "args": { "External id": 29015, "cbid": 211, "correlation": 29015 } }, { "ph": "s", "id": 29015, "pid": 76337, "tid": -914061504, "ts": 1716454217650205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217699285, "dur": 34, "args": { "External id": 29037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29037, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29037, "pid": 5, "tid": 7, "ts": 1716454217699285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650237, "dur": 8, "args": { "External id": 29037, "cbid": 211, "correlation": 29037 } }, { "ph": "s", "id": 29037, "pid": 76337, "tid": -914061504, "ts": 1716454217650237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650333, "dur": 1, "args": { "External id": 29048, "cbid": 251, "correlation": 29048 } }, { "ph": "f", "id": 29048, "pid": 76337, "tid": -914061504, "ts": 1716454217650333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217699320, "dur": 88, "args": { "External id": 29049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29049, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29049, "pid": 5, "tid": 7, "ts": 1716454217699320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650339, "dur": 13, "args": { "External id": 29049, "cbid": 211, "correlation": 29049 } }, { "ph": "s", "id": 29049, "pid": 76337, "tid": -914061504, "ts": 1716454217650339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650423, "dur": 1, "args": { "External id": 29060, "cbid": 251, "correlation": 29060 } }, { "ph": "f", "id": 29060, "pid": 76337, "tid": -914061504, "ts": 1716454217650423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650427, "dur": 0, "args": { "External id": 29061, "cbid": 251, "correlation": 29061 } }, { "ph": "f", "id": 29061, "pid": 76337, "tid": -914061504, "ts": 1716454217650427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217699410, "dur": 11, "args": { "External id": 29062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29062, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 29062, "pid": 5, "tid": 7, "ts": 1716454217699410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650429, "dur": 13, "args": { "External id": 29062, "cbid": 211, "correlation": 29062 } }, { "ph": "s", "id": 29062, "pid": 76337, "tid": -914061504, "ts": 1716454217650429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217699422, "dur": 5, "args": { "External id": 29064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29064, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 29064, "pid": 5, "tid": 7, "ts": 1716454217699422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650444, "dur": 6, "args": { "External id": 29064, "cbid": 211, "correlation": 29064 } }, { "ph": "s", "id": 29064, "pid": 76337, "tid": -914061504, "ts": 1716454217650444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650514, "dur": 1, "args": { "External id": 29075, "cbid": 251, "correlation": 29075 } }, { "ph": "f", "id": 29075, "pid": 76337, "tid": -914061504, "ts": 1716454217650514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650518, "dur": 0, "args": { "External id": 29076, "cbid": 251, "correlation": 29076 } }, { "ph": "f", "id": 29076, "pid": 76337, "tid": -914061504, "ts": 1716454217650518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217699428, "dur": 7, "args": { "External id": 29077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29077, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 29077, "pid": 5, "tid": 7, "ts": 1716454217699428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650520, "dur": 12, "args": { "External id": 29077, "cbid": 211, "correlation": 29077 } }, { "ph": "s", "id": 29077, "pid": 76337, "tid": -914061504, "ts": 1716454217650520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217699437, "dur": 3, "args": { "External id": 29079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29079, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 29079, "pid": 5, "tid": 7, "ts": 1716454217699437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650534, "dur": 6, "args": { "External id": 29079, "cbid": 211, "correlation": 29079 } }, { "ph": "s", "id": 29079, "pid": 76337, "tid": -914061504, "ts": 1716454217650534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217699441, "dur": 89, "args": { "External id": 29100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29100, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 29100, "pid": 5, "tid": 7, "ts": 1716454217699441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650609, "dur": 13, "args": { "External id": 29100, "cbid": 211, "correlation": 29100 } }, { "ph": "s", "id": 29100, "pid": 76337, "tid": -914061504, "ts": 1716454217650609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650717, "dur": 1, "args": { "External id": 29118, "cbid": 251, "correlation": 29118 } }, { "ph": "f", "id": 29118, "pid": 76337, "tid": -914061504, "ts": 1716454217650717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217699532, "dur": 98, "args": { "External id": 29120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29120, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29120, "pid": 5, "tid": 7, "ts": 1716454217699532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650722, "dur": 14, "args": { "External id": 29120, "cbid": 211, "correlation": 29120 } }, { "ph": "s", "id": 29120, "pid": 76337, "tid": -914061504, "ts": 1716454217650722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217699631, "dur": 19, "args": { "External id": 29128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29128, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29128, "pid": 5, "tid": 7, "ts": 1716454217699631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650793, "dur": 12, "args": { "External id": 29128, "cbid": 211, "correlation": 29128 } }, { "ph": "s", "id": 29128, "pid": 76337, "tid": -914061504, "ts": 1716454217650793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217699651, "dur": 37, "args": { "External id": 29136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29136, "pid": 5, "tid": 7, "ts": 1716454217699651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650834, "dur": 9, "args": { "External id": 29136, "cbid": 211, "correlation": 29136 } }, { "ph": "s", "id": 29136, "pid": 76337, "tid": -914061504, "ts": 1716454217650834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217699689, "dur": 34, "args": { "External id": 29158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29158, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29158, "pid": 5, "tid": 7, "ts": 1716454217699689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217650885, "dur": 10, "args": { "External id": 29158, "cbid": 211, "correlation": 29158 } }, { "ph": "s", "id": 29158, "pid": 76337, "tid": -914061504, "ts": 1716454217650885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650994, "dur": 1, "args": { "External id": 29174, "cbid": 251, "correlation": 29174 } }, { "ph": "f", "id": 29174, "pid": 76337, "tid": -914061504, "ts": 1716454217650994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217650999, "dur": 0, "args": { "External id": 29176, "cbid": 251, "correlation": 29176 } }, { "ph": "f", "id": 29176, "pid": 76337, "tid": -914061504, "ts": 1716454217650999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217699724, "dur": 533, "args": { "External id": 29177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29177, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 29177, "pid": 5, "tid": 7, "ts": 1716454217699724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651003, "dur": 14, "args": { "External id": 29177, "cbid": 211, "correlation": 29177 } }, { "ph": "s", "id": 29177, "pid": 76337, "tid": -914061504, "ts": 1716454217651003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217700259, "dur": 124, "args": { "External id": 29185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29185, "pid": 5, "tid": 7, "ts": 1716454217700259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651072, "dur": 13, "args": { "External id": 29185, "cbid": 211, "correlation": 29185 } }, { "ph": "s", "id": 29185, "pid": 76337, "tid": -914061504, "ts": 1716454217651072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217700384, "dur": 126, "args": { "External id": 29193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29193, "pid": 5, "tid": 7, "ts": 1716454217700384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651104, "dur": 8, "args": { "External id": 29193, "cbid": 211, "correlation": 29193 } }, { "ph": "s", "id": 29193, "pid": 76337, "tid": -914061504, "ts": 1716454217651104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217651195, "dur": 1, "args": { "External id": 29209, "cbid": 251, "correlation": 29209 } }, { "ph": "f", "id": 29209, "pid": 76337, "tid": -914061504, "ts": 1716454217651195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217700511, "dur": 306, "args": { "External id": 29211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29211, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29211, "pid": 5, "tid": 7, "ts": 1716454217700511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651201, "dur": 13, "args": { "External id": 29211, "cbid": 211, "correlation": 29211 } }, { "ph": "s", "id": 29211, "pid": 76337, "tid": -914061504, "ts": 1716454217651201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217700818, "dur": 27, "args": { "External id": 29219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29219, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29219, "pid": 5, "tid": 7, "ts": 1716454217700818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651245, "dur": 10, "args": { "External id": 29219, "cbid": 211, "correlation": 29219 } }, { "ph": "s", "id": 29219, "pid": 76337, "tid": -914061504, "ts": 1716454217651245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217700847, "dur": 80, "args": { "External id": 29230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29230, "pid": 5, "tid": 7, "ts": 1716454217700847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651312, "dur": 13, "args": { "External id": 29230, "cbid": 211, "correlation": 29230 } }, { "ph": "s", "id": 29230, "pid": 76337, "tid": -914061504, "ts": 1716454217651312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217651376, "dur": 0, "args": { "External id": 29242, "cbid": 317, "correlation": 29242 } }, { "ph": "f", "id": 29242, "pid": 76337, "tid": -914061504, "ts": 1716454217651376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217651377, "dur": 0, "args": { "External id": 29243, "cbid": 203, "correlation": 29243 } }, { "ph": "f", "id": 29243, "pid": 76337, "tid": -914061504, "ts": 1716454217651377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217651378, "dur": 0, "args": { "External id": 29244, "cbid": 205, "correlation": 29244 } }, { "ph": "f", "id": 29244, "pid": 76337, "tid": -914061504, "ts": 1716454217651378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217700928, "dur": 24, "args": { "External id": 29248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29248, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29248, "pid": 5, "tid": 7, "ts": 1716454217700928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651394, "dur": 11, "args": { "External id": 29248, "cbid": 211, "correlation": 29248 } }, { "ph": "s", "id": 29248, "pid": 76337, "tid": -914061504, "ts": 1716454217651394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217700953, "dur": 119, "args": { "External id": 29250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29250, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29250, "pid": 5, "tid": 7, "ts": 1716454217700953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651412, "dur": 6, "args": { "External id": 29250, "cbid": 211, "correlation": 29250 } }, { "ph": "s", "id": 29250, "pid": 76337, "tid": -914061504, "ts": 1716454217651412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217701073, "dur": 22, "args": { "External id": 29252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29252, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29252, "pid": 5, "tid": 7, "ts": 1716454217701073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651422, "dur": 6, "args": { "External id": 29252, "cbid": 211, "correlation": 29252 } }, { "ph": "s", "id": 29252, "pid": 76337, "tid": -914061504, "ts": 1716454217651422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217701097, "dur": 32, "args": { "External id": 29258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29258, "pid": 5, "tid": 7, "ts": 1716454217701097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651450, "dur": 8, "args": { "External id": 29258, "cbid": 211, "correlation": 29258 } }, { "ph": "s", "id": 29258, "pid": 76337, "tid": -914061504, "ts": 1716454217651450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217701130, "dur": 26, "args": { "External id": 29266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29266, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29266, "pid": 5, "tid": 7, "ts": 1716454217701130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651482, "dur": 8, "args": { "External id": 29266, "cbid": 211, "correlation": 29266 } }, { "ph": "s", "id": 29266, "pid": 76337, "tid": -914061504, "ts": 1716454217651482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217701158, "dur": 47, "args": { "External id": 29275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29275, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29275, "pid": 5, "tid": 7, "ts": 1716454217701158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651520, "dur": 10, "args": { "External id": 29275, "cbid": 211, "correlation": 29275 } }, { "ph": "s", "id": 29275, "pid": 76337, "tid": -914061504, "ts": 1716454217651520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217701206, "dur": 43, "args": { "External id": 29295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29295, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 29295, "pid": 5, "tid": 7, "ts": 1716454217701206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651592, "dur": 12, "args": { "External id": 29295, "cbid": 211, "correlation": 29295 } }, { "ph": "s", "id": 29295, "pid": 76337, "tid": -914061504, "ts": 1716454217651592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217701250, "dur": 5, "args": { "External id": 29307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29307, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 29307, "pid": 5, "tid": 7, "ts": 1716454217701250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651614, "dur": 6, "args": { "External id": 29307, "cbid": 211, "correlation": 29307 } }, { "ph": "s", "id": 29307, "pid": 76337, "tid": -914061504, "ts": 1716454217651614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217701256, "dur": 44, "args": { "External id": 29310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29310, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29310, "pid": 5, "tid": 7, "ts": 1716454217701256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651632, "dur": 7, "args": { "External id": 29310, "cbid": 211, "correlation": 29310 } }, { "ph": "s", "id": 29310, "pid": 76337, "tid": -914061504, "ts": 1716454217651632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217701302, "dur": 29, "args": { "External id": 29319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29319, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29319, "pid": 5, "tid": 7, "ts": 1716454217701302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651672, "dur": 9, "args": { "External id": 29319, "cbid": 211, "correlation": 29319 } }, { "ph": "s", "id": 29319, "pid": 76337, "tid": -914061504, "ts": 1716454217651672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217651724, "dur": 0, "args": { "External id": 29329, "cbid": 317, "correlation": 29329 } }, { "ph": "f", "id": 29329, "pid": 76337, "tid": -914061504, "ts": 1716454217651724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217651725, "dur": 0, "args": { "External id": 29330, "cbid": 203, "correlation": 29330 } }, { "ph": "f", "id": 29330, "pid": 76337, "tid": -914061504, "ts": 1716454217651725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217651725, "dur": 0, "args": { "External id": 29331, "cbid": 205, "correlation": 29331 } }, { "ph": "f", "id": 29331, "pid": 76337, "tid": -914061504, "ts": 1716454217651725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217701332, "dur": 30, "args": { "External id": 29335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29335, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29335, "pid": 5, "tid": 7, "ts": 1716454217701332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651740, "dur": 11, "args": { "External id": 29335, "cbid": 211, "correlation": 29335 } }, { "ph": "s", "id": 29335, "pid": 76337, "tid": -914061504, "ts": 1716454217651740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217701364, "dur": 62, "args": { "External id": 29337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29337, "pid": 5, "tid": 7, "ts": 1716454217701364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651754, "dur": 6, "args": { "External id": 29337, "cbid": 211, "correlation": 29337 } }, { "ph": "s", "id": 29337, "pid": 76337, "tid": -914061504, "ts": 1716454217651754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217701427, "dur": 955, "args": { "External id": 29339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29339, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29339, "pid": 5, "tid": 7, "ts": 1716454217701427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651766, "dur": 6, "args": { "External id": 29339, "cbid": 211, "correlation": 29339 } }, { "ph": "s", "id": 29339, "pid": 76337, "tid": -914061504, "ts": 1716454217651766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217702383, "dur": 21, "args": { "External id": 29341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29341, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29341, "pid": 5, "tid": 7, "ts": 1716454217702383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651776, "dur": 5, "args": { "External id": 29341, "cbid": 211, "correlation": 29341 } }, { "ph": "s", "id": 29341, "pid": 76337, "tid": -914061504, "ts": 1716454217651776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217702405, "dur": 32, "args": { "External id": 29347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29347, "pid": 5, "tid": 7, "ts": 1716454217702405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651803, "dur": 9, "args": { "External id": 29347, "cbid": 211, "correlation": 29347 } }, { "ph": "s", "id": 29347, "pid": 76337, "tid": -914061504, "ts": 1716454217651803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217702439, "dur": 3, "args": { "External id": 29355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29355, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 29355, "pid": 5, "tid": 7, "ts": 1716454217702439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651847, "dur": 9, "args": { "External id": 29355, "cbid": 211, "correlation": 29355 } }, { "ph": "s", "id": 29355, "pid": 76337, "tid": -914061504, "ts": 1716454217651847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217651926, "dur": 1, "args": { "External id": 29371, "cbid": 251, "correlation": 29371 } }, { "ph": "f", "id": 29371, "pid": 76337, "tid": -914061504, "ts": 1716454217651926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217651931, "dur": 0, "args": { "External id": 29373, "cbid": 251, "correlation": 29373 } }, { "ph": "f", "id": 29373, "pid": 76337, "tid": -914061504, "ts": 1716454217651931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217702443, "dur": 12, "args": { "External id": 29374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29374, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 29374, "pid": 5, "tid": 7, "ts": 1716454217702443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651933, "dur": 13, "args": { "External id": 29374, "cbid": 211, "correlation": 29374 } }, { "ph": "s", "id": 29374, "pid": 76337, "tid": -914061504, "ts": 1716454217651933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217702457, "dur": 5, "args": { "External id": 29376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29376, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 29376, "pid": 5, "tid": 7, "ts": 1716454217702457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217651948, "dur": 6, "args": { "External id": 29376, "cbid": 211, "correlation": 29376 } }, { "ph": "s", "id": 29376, "pid": 76337, "tid": -914061504, "ts": 1716454217651948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217702463, "dur": 28, "args": { "External id": 29386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29386, "pid": 5, "tid": 7, "ts": 1716454217702463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652017, "dur": 13, "args": { "External id": 29386, "cbid": 211, "correlation": 29386 } }, { "ph": "s", "id": 29386, "pid": 76337, "tid": -914061504, "ts": 1716454217652017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217702492, "dur": 31, "args": { "External id": 29406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29406, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 29406, "pid": 5, "tid": 7, "ts": 1716454217702492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652084, "dur": 11, "args": { "External id": 29406, "cbid": 211, "correlation": 29406 } }, { "ph": "s", "id": 29406, "pid": 76337, "tid": -914061504, "ts": 1716454217652084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217702524, "dur": 4, "args": { "External id": 29418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29418, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 29418, "pid": 5, "tid": 7, "ts": 1716454217702524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652105, "dur": 7, "args": { "External id": 29418, "cbid": 211, "correlation": 29418 } }, { "ph": "s", "id": 29418, "pid": 76337, "tid": -914061504, "ts": 1716454217652105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217702530, "dur": 30, "args": { "External id": 29421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29421, "pid": 5, "tid": 7, "ts": 1716454217702530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652124, "dur": 6, "args": { "External id": 29421, "cbid": 211, "correlation": 29421 } }, { "ph": "s", "id": 29421, "pid": 76337, "tid": -914061504, "ts": 1716454217652124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217702561, "dur": 21, "args": { "External id": 29430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29430, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29430, "pid": 5, "tid": 7, "ts": 1716454217702561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652164, "dur": 10, "args": { "External id": 29430, "cbid": 211, "correlation": 29430 } }, { "ph": "s", "id": 29430, "pid": 76337, "tid": -914061504, "ts": 1716454217652164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217652241, "dur": 0, "args": { "External id": 29440, "cbid": 317, "correlation": 29440 } }, { "ph": "f", "id": 29440, "pid": 76337, "tid": -914061504, "ts": 1716454217652241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217652241, "dur": 0, "args": { "External id": 29441, "cbid": 203, "correlation": 29441 } }, { "ph": "f", "id": 29441, "pid": 76337, "tid": -914061504, "ts": 1716454217652241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217652242, "dur": 0, "args": { "External id": 29442, "cbid": 205, "correlation": 29442 } }, { "ph": "f", "id": 29442, "pid": 76337, "tid": -914061504, "ts": 1716454217652242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217702583, "dur": 22, "args": { "External id": 29446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29446, "pid": 5, "tid": 7, "ts": 1716454217702583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652256, "dur": 13, "args": { "External id": 29446, "cbid": 211, "correlation": 29446 } }, { "ph": "s", "id": 29446, "pid": 76337, "tid": -914061504, "ts": 1716454217652256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217702606, "dur": 43, "args": { "External id": 29448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29448, "pid": 5, "tid": 7, "ts": 1716454217702606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652272, "dur": 5, "args": { "External id": 29448, "cbid": 211, "correlation": 29448 } }, { "ph": "s", "id": 29448, "pid": 76337, "tid": -914061504, "ts": 1716454217652272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217702651, "dur": 635, "args": { "External id": 29450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29450, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29450, "pid": 5, "tid": 7, "ts": 1716454217702651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652284, "dur": 6, "args": { "External id": 29450, "cbid": 211, "correlation": 29450 } }, { "ph": "s", "id": 29450, "pid": 76337, "tid": -914061504, "ts": 1716454217652284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217703287, "dur": 22, "args": { "External id": 29452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29452, "pid": 5, "tid": 7, "ts": 1716454217703287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652293, "dur": 5, "args": { "External id": 29452, "cbid": 211, "correlation": 29452 } }, { "ph": "s", "id": 29452, "pid": 76337, "tid": -914061504, "ts": 1716454217652293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217703311, "dur": 33, "args": { "External id": 29458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29458, "pid": 5, "tid": 7, "ts": 1716454217703311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652322, "dur": 9, "args": { "External id": 29458, "cbid": 211, "correlation": 29458 } }, { "ph": "s", "id": 29458, "pid": 76337, "tid": -914061504, "ts": 1716454217652322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217652381, "dur": 0, "args": { "External id": 29468, "cbid": 317, "correlation": 29468 } }, { "ph": "f", "id": 29468, "pid": 76337, "tid": -914061504, "ts": 1716454217652381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217652381, "dur": 0, "args": { "External id": 29469, "cbid": 203, "correlation": 29469 } }, { "ph": "f", "id": 29469, "pid": 76337, "tid": -914061504, "ts": 1716454217652381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217652382, "dur": 0, "args": { "External id": 29470, "cbid": 205, "correlation": 29470 } }, { "ph": "f", "id": 29470, "pid": 76337, "tid": -914061504, "ts": 1716454217652382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217703345, "dur": 31, "args": { "External id": 29474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29474, "pid": 5, "tid": 7, "ts": 1716454217703345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652396, "dur": 11, "args": { "External id": 29474, "cbid": 211, "correlation": 29474 } }, { "ph": "s", "id": 29474, "pid": 76337, "tid": -914061504, "ts": 1716454217652396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217703377, "dur": 149, "args": { "External id": 29476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29476, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29476, "pid": 5, "tid": 7, "ts": 1716454217703377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652413, "dur": 6, "args": { "External id": 29476, "cbid": 211, "correlation": 29476 } }, { "ph": "s", "id": 29476, "pid": 76337, "tid": -914061504, "ts": 1716454217652413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217703528, "dur": 22, "args": { "External id": 29478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29478, "pid": 5, "tid": 7, "ts": 1716454217703528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652424, "dur": 5, "args": { "External id": 29478, "cbid": 211, "correlation": 29478 } }, { "ph": "s", "id": 29478, "pid": 76337, "tid": -914061504, "ts": 1716454217652424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217703551, "dur": 32, "args": { "External id": 29484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29484, "pid": 5, "tid": 7, "ts": 1716454217703551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652450, "dur": 8, "args": { "External id": 29484, "cbid": 211, "correlation": 29484 } }, { "ph": "s", "id": 29484, "pid": 76337, "tid": -914061504, "ts": 1716454217652450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217703584, "dur": 27, "args": { "External id": 29492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29492, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29492, "pid": 5, "tid": 7, "ts": 1716454217703584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652479, "dur": 8, "args": { "External id": 29492, "cbid": 211, "correlation": 29492 } }, { "ph": "s", "id": 29492, "pid": 76337, "tid": -914061504, "ts": 1716454217652479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217703612, "dur": 20, "args": { "External id": 29500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29500, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29500, "pid": 5, "tid": 7, "ts": 1716454217703612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652507, "dur": 8, "args": { "External id": 29500, "cbid": 211, "correlation": 29500 } }, { "ph": "s", "id": 29500, "pid": 76337, "tid": -914061504, "ts": 1716454217652507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217703633, "dur": 30, "args": { "External id": 29520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29520, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 29520, "pid": 5, "tid": 7, "ts": 1716454217703633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652591, "dur": 12, "args": { "External id": 29520, "cbid": 211, "correlation": 29520 } }, { "ph": "s", "id": 29520, "pid": 76337, "tid": -914061504, "ts": 1716454217652591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217703665, "dur": 5, "args": { "External id": 29532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29532, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 29532, "pid": 5, "tid": 7, "ts": 1716454217703665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652613, "dur": 6, "args": { "External id": 29532, "cbid": 211, "correlation": 29532 } }, { "ph": "s", "id": 29532, "pid": 76337, "tid": -914061504, "ts": 1716454217652613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217703671, "dur": 31, "args": { "External id": 29535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29535, "pid": 5, "tid": 7, "ts": 1716454217703671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652631, "dur": 7, "args": { "External id": 29535, "cbid": 211, "correlation": 29535 } }, { "ph": "s", "id": 29535, "pid": 76337, "tid": -914061504, "ts": 1716454217652631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217652701, "dur": 0, "args": { "External id": 29546, "cbid": 317, "correlation": 29546 } }, { "ph": "f", "id": 29546, "pid": 76337, "tid": -914061504, "ts": 1716454217652701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217652702, "dur": 0, "args": { "External id": 29547, "cbid": 203, "correlation": 29547 } }, { "ph": "f", "id": 29547, "pid": 76337, "tid": -914061504, "ts": 1716454217652702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217652702, "dur": 0, "args": { "External id": 29548, "cbid": 205, "correlation": 29548 } }, { "ph": "f", "id": 29548, "pid": 76337, "tid": -914061504, "ts": 1716454217652702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217703703, "dur": 23, "args": { "External id": 29552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29552, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29552, "pid": 5, "tid": 7, "ts": 1716454217703703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652716, "dur": 12, "args": { "External id": 29552, "cbid": 211, "correlation": 29552 } }, { "ph": "s", "id": 29552, "pid": 76337, "tid": -914061504, "ts": 1716454217652716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217703727, "dur": 103, "args": { "External id": 29554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29554, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29554, "pid": 5, "tid": 7, "ts": 1716454217703727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652734, "dur": 6, "args": { "External id": 29554, "cbid": 211, "correlation": 29554 } }, { "ph": "s", "id": 29554, "pid": 76337, "tid": -914061504, "ts": 1716454217652734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217703832, "dur": 21, "args": { "External id": 29556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29556, "pid": 5, "tid": 7, "ts": 1716454217703832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652744, "dur": 5, "args": { "External id": 29556, "cbid": 211, "correlation": 29556 } }, { "ph": "s", "id": 29556, "pid": 76337, "tid": -914061504, "ts": 1716454217652744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217703854, "dur": 32, "args": { "External id": 29562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29562, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29562, "pid": 5, "tid": 7, "ts": 1716454217703854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652772, "dur": 8, "args": { "External id": 29562, "cbid": 211, "correlation": 29562 } }, { "ph": "s", "id": 29562, "pid": 76337, "tid": -914061504, "ts": 1716454217652772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217703887, "dur": 195, "args": { "External id": 29571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29571, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29571, "pid": 5, "tid": 7, "ts": 1716454217703887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652855, "dur": 14, "args": { "External id": 29571, "cbid": 211, "correlation": 29571 } }, { "ph": "s", "id": 29571, "pid": 76337, "tid": -914061504, "ts": 1716454217652855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217704084, "dur": 64, "args": { "External id": 29593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29593, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29593, "pid": 5, "tid": 7, "ts": 1716454217704084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217652912, "dur": 10, "args": { "External id": 29593, "cbid": 211, "correlation": 29593 } }, { "ph": "s", "id": 29593, "pid": 76337, "tid": -914061504, "ts": 1716454217652912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653022, "dur": 1, "args": { "External id": 29604, "cbid": 251, "correlation": 29604 } }, { "ph": "f", "id": 29604, "pid": 76337, "tid": -914061504, "ts": 1716454217653022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217704150, "dur": 150, "args": { "External id": 29605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29605, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29605, "pid": 5, "tid": 7, "ts": 1716454217704150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653028, "dur": 14, "args": { "External id": 29605, "cbid": 211, "correlation": 29605 } }, { "ph": "s", "id": 29605, "pid": 76337, "tid": -914061504, "ts": 1716454217653028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653114, "dur": 1, "args": { "External id": 29616, "cbid": 251, "correlation": 29616 } }, { "ph": "f", "id": 29616, "pid": 76337, "tid": -914061504, "ts": 1716454217653114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217704301, "dur": 142, "args": { "External id": 29617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29617, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29617, "pid": 5, "tid": 7, "ts": 1716454217704301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653118, "dur": 12, "args": { "External id": 29617, "cbid": 211, "correlation": 29617 } }, { "ph": "s", "id": 29617, "pid": 76337, "tid": -914061504, "ts": 1716454217653118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653196, "dur": 1, "args": { "External id": 29628, "cbid": 251, "correlation": 29628 } }, { "ph": "f", "id": 29628, "pid": 76337, "tid": -914061504, "ts": 1716454217653196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217704445, "dur": 143, "args": { "External id": 29629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29629, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29629, "pid": 5, "tid": 7, "ts": 1716454217704445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653201, "dur": 12, "args": { "External id": 29629, "cbid": 211, "correlation": 29629 } }, { "ph": "s", "id": 29629, "pid": 76337, "tid": -914061504, "ts": 1716454217653201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217704589, "dur": 1906, "args": { "External id": 29650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29650, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 29650, "pid": 5, "tid": 7, "ts": 1716454217704589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653285, "dur": 13, "args": { "External id": 29650, "cbid": 211, "correlation": 29650 } }, { "ph": "s", "id": 29650, "pid": 76337, "tid": -914061504, "ts": 1716454217653285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653397, "dur": 1, "args": { "External id": 29668, "cbid": 251, "correlation": 29668 } }, { "ph": "f", "id": 29668, "pid": 76337, "tid": -914061504, "ts": 1716454217653397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217706496, "dur": 145, "args": { "External id": 29670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29670, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 29670, "pid": 5, "tid": 7, "ts": 1716454217706496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653403, "dur": 15, "args": { "External id": 29670, "cbid": 211, "correlation": 29670 } }, { "ph": "s", "id": 29670, "pid": 76337, "tid": -914061504, "ts": 1716454217653403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217706642, "dur": 36, "args": { "External id": 29678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29678, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29678, "pid": 5, "tid": 7, "ts": 1716454217706642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653476, "dur": 12, "args": { "External id": 29678, "cbid": 211, "correlation": 29678 } }, { "ph": "s", "id": 29678, "pid": 76337, "tid": -914061504, "ts": 1716454217653476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217706679, "dur": 51, "args": { "External id": 29686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29686, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29686, "pid": 5, "tid": 7, "ts": 1716454217706679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653515, "dur": 9, "args": { "External id": 29686, "cbid": 211, "correlation": 29686 } }, { "ph": "s", "id": 29686, "pid": 76337, "tid": -914061504, "ts": 1716454217653515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217706731, "dur": 31, "args": { "External id": 29697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29697, "pid": 5, "tid": 7, "ts": 1716454217706731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653601, "dur": 14, "args": { "External id": 29697, "cbid": 211, "correlation": 29697 } }, { "ph": "s", "id": 29697, "pid": 76337, "tid": -914061504, "ts": 1716454217653601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217706763, "dur": 34, "args": { "External id": 29719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29719, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29719, "pid": 5, "tid": 7, "ts": 1716454217706763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653634, "dur": 8, "args": { "External id": 29719, "cbid": 211, "correlation": 29719 } }, { "ph": "s", "id": 29719, "pid": 76337, "tid": -914061504, "ts": 1716454217653634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653730, "dur": 1, "args": { "External id": 29730, "cbid": 251, "correlation": 29730 } }, { "ph": "f", "id": 29730, "pid": 76337, "tid": -914061504, "ts": 1716454217653730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217706799, "dur": 88, "args": { "External id": 29731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29731, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29731, "pid": 5, "tid": 7, "ts": 1716454217706799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653736, "dur": 14, "args": { "External id": 29731, "cbid": 211, "correlation": 29731 } }, { "ph": "s", "id": 29731, "pid": 76337, "tid": -914061504, "ts": 1716454217653736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653818, "dur": 1, "args": { "External id": 29742, "cbid": 251, "correlation": 29742 } }, { "ph": "f", "id": 29742, "pid": 76337, "tid": -914061504, "ts": 1716454217653818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653823, "dur": 0, "args": { "External id": 29743, "cbid": 251, "correlation": 29743 } }, { "ph": "f", "id": 29743, "pid": 76337, "tid": -914061504, "ts": 1716454217653823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217706888, "dur": 11, "args": { "External id": 29744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29744, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 29744, "pid": 5, "tid": 7, "ts": 1716454217706888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653824, "dur": 13, "args": { "External id": 29744, "cbid": 211, "correlation": 29744 } }, { "ph": "s", "id": 29744, "pid": 76337, "tid": -914061504, "ts": 1716454217653824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217706900, "dur": 5, "args": { "External id": 29746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29746, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 29746, "pid": 5, "tid": 7, "ts": 1716454217706900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653839, "dur": 6, "args": { "External id": 29746, "cbid": 211, "correlation": 29746 } }, { "ph": "s", "id": 29746, "pid": 76337, "tid": -914061504, "ts": 1716454217653839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653910, "dur": 1, "args": { "External id": 29757, "cbid": 251, "correlation": 29757 } }, { "ph": "f", "id": 29757, "pid": 76337, "tid": -914061504, "ts": 1716454217653910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217653914, "dur": 0, "args": { "External id": 29758, "cbid": 251, "correlation": 29758 } }, { "ph": "f", "id": 29758, "pid": 76337, "tid": -914061504, "ts": 1716454217653914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217706907, "dur": 7, "args": { "External id": 29759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29759, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 29759, "pid": 5, "tid": 7, "ts": 1716454217706907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653916, "dur": 13, "args": { "External id": 29759, "cbid": 211, "correlation": 29759 } }, { "ph": "s", "id": 29759, "pid": 76337, "tid": -914061504, "ts": 1716454217653916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217706915, "dur": 4, "args": { "External id": 29761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29761, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 29761, "pid": 5, "tid": 7, "ts": 1716454217706915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217653930, "dur": 6, "args": { "External id": 29761, "cbid": 211, "correlation": 29761 } }, { "ph": "s", "id": 29761, "pid": 76337, "tid": -914061504, "ts": 1716454217653930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217706920, "dur": 90, "args": { "External id": 29782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29782, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 29782, "pid": 5, "tid": 7, "ts": 1716454217706920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654014, "dur": 13, "args": { "External id": 29782, "cbid": 211, "correlation": 29782 } }, { "ph": "s", "id": 29782, "pid": 76337, "tid": -914061504, "ts": 1716454217654014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217654124, "dur": 1, "args": { "External id": 29800, "cbid": 251, "correlation": 29800 } }, { "ph": "f", "id": 29800, "pid": 76337, "tid": -914061504, "ts": 1716454217654124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217707012, "dur": 98, "args": { "External id": 29802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29802, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29802, "pid": 5, "tid": 7, "ts": 1716454217707012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654130, "dur": 13, "args": { "External id": 29802, "cbid": 211, "correlation": 29802 } }, { "ph": "s", "id": 29802, "pid": 76337, "tid": -914061504, "ts": 1716454217654130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217707111, "dur": 19, "args": { "External id": 29810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29810, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29810, "pid": 5, "tid": 7, "ts": 1716454217707111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654202, "dur": 12, "args": { "External id": 29810, "cbid": 211, "correlation": 29810 } }, { "ph": "s", "id": 29810, "pid": 76337, "tid": -914061504, "ts": 1716454217654202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217707132, "dur": 38, "args": { "External id": 29818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29818, "pid": 5, "tid": 7, "ts": 1716454217707132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654242, "dur": 10, "args": { "External id": 29818, "cbid": 211, "correlation": 29818 } }, { "ph": "s", "id": 29818, "pid": 76337, "tid": -914061504, "ts": 1716454217654242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217707171, "dur": 34, "args": { "External id": 29840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29840, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29840, "pid": 5, "tid": 7, "ts": 1716454217707171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654293, "dur": 11, "args": { "External id": 29840, "cbid": 211, "correlation": 29840 } }, { "ph": "s", "id": 29840, "pid": 76337, "tid": -914061504, "ts": 1716454217654293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217654396, "dur": 1, "args": { "External id": 29856, "cbid": 251, "correlation": 29856 } }, { "ph": "f", "id": 29856, "pid": 76337, "tid": -914061504, "ts": 1716454217654396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217654401, "dur": 0, "args": { "External id": 29858, "cbid": 251, "correlation": 29858 } }, { "ph": "f", "id": 29858, "pid": 76337, "tid": -914061504, "ts": 1716454217654401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217707206, "dur": 529, "args": { "External id": 29859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29859, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 29859, "pid": 5, "tid": 7, "ts": 1716454217707206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654405, "dur": 14, "args": { "External id": 29859, "cbid": 211, "correlation": 29859 } }, { "ph": "s", "id": 29859, "pid": 76337, "tid": -914061504, "ts": 1716454217654405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217707737, "dur": 126, "args": { "External id": 29867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29867, "pid": 5, "tid": 7, "ts": 1716454217707737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654473, "dur": 13, "args": { "External id": 29867, "cbid": 211, "correlation": 29867 } }, { "ph": "s", "id": 29867, "pid": 76337, "tid": -914061504, "ts": 1716454217654473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217707864, "dur": 129, "args": { "External id": 29875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29875, "pid": 5, "tid": 7, "ts": 1716454217707864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654504, "dur": 9, "args": { "External id": 29875, "cbid": 211, "correlation": 29875 } }, { "ph": "s", "id": 29875, "pid": 76337, "tid": -914061504, "ts": 1716454217654504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217654595, "dur": 1, "args": { "External id": 29891, "cbid": 251, "correlation": 29891 } }, { "ph": "f", "id": 29891, "pid": 76337, "tid": -914061504, "ts": 1716454217654595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217707994, "dur": 301, "args": { "External id": 29893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29893, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29893, "pid": 5, "tid": 7, "ts": 1716454217707994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654601, "dur": 14, "args": { "External id": 29893, "cbid": 211, "correlation": 29893 } }, { "ph": "s", "id": 29893, "pid": 76337, "tid": -914061504, "ts": 1716454217654601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217708296, "dur": 27, "args": { "External id": 29901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29901, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29901, "pid": 5, "tid": 7, "ts": 1716454217708296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654645, "dur": 10, "args": { "External id": 29901, "cbid": 211, "correlation": 29901 } }, { "ph": "s", "id": 29901, "pid": 76337, "tid": -914061504, "ts": 1716454217654645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217708324, "dur": 80, "args": { "External id": 29912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29912, "pid": 5, "tid": 7, "ts": 1716454217708324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654714, "dur": 12, "args": { "External id": 29912, "cbid": 211, "correlation": 29912 } }, { "ph": "s", "id": 29912, "pid": 76337, "tid": -914061504, "ts": 1716454217654714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217654778, "dur": 0, "args": { "External id": 29924, "cbid": 317, "correlation": 29924 } }, { "ph": "f", "id": 29924, "pid": 76337, "tid": -914061504, "ts": 1716454217654778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217654778, "dur": 0, "args": { "External id": 29925, "cbid": 203, "correlation": 29925 } }, { "ph": "f", "id": 29925, "pid": 76337, "tid": -914061504, "ts": 1716454217654778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217654779, "dur": 0, "args": { "External id": 29926, "cbid": 205, "correlation": 29926 } }, { "ph": "f", "id": 29926, "pid": 76337, "tid": -914061504, "ts": 1716454217654779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217708405, "dur": 21, "args": { "External id": 29930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29930, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29930, "pid": 5, "tid": 7, "ts": 1716454217708405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654794, "dur": 12, "args": { "External id": 29930, "cbid": 211, "correlation": 29930 } }, { "ph": "s", "id": 29930, "pid": 76337, "tid": -914061504, "ts": 1716454217654794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217708428, "dur": 118, "args": { "External id": 29932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29932, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29932, "pid": 5, "tid": 7, "ts": 1716454217708428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654812, "dur": 7, "args": { "External id": 29932, "cbid": 211, "correlation": 29932 } }, { "ph": "s", "id": 29932, "pid": 76337, "tid": -914061504, "ts": 1716454217654812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217708547, "dur": 24, "args": { "External id": 29934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29934, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29934, "pid": 5, "tid": 7, "ts": 1716454217708547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654824, "dur": 5, "args": { "External id": 29934, "cbid": 211, "correlation": 29934 } }, { "ph": "s", "id": 29934, "pid": 76337, "tid": -914061504, "ts": 1716454217654824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217708572, "dur": 32, "args": { "External id": 29940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29940, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29940, "pid": 5, "tid": 7, "ts": 1716454217708572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654851, "dur": 8, "args": { "External id": 29940, "cbid": 211, "correlation": 29940 } }, { "ph": "s", "id": 29940, "pid": 76337, "tid": -914061504, "ts": 1716454217654851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217708605, "dur": 27, "args": { "External id": 29948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29948, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29948, "pid": 5, "tid": 7, "ts": 1716454217708605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654882, "dur": 9, "args": { "External id": 29948, "cbid": 211, "correlation": 29948 } }, { "ph": "s", "id": 29948, "pid": 76337, "tid": -914061504, "ts": 1716454217654882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454217708633, "dur": 99, "args": { "External id": 29959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29959, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29959, "pid": 5, "tid": 7, "ts": 1716454217708633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217654949, "dur": 12, "args": { "External id": 29959, "cbid": 211, "correlation": 29959 } }, { "ph": "s", "id": 29959, "pid": 76337, "tid": -914061504, "ts": 1716454217654949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217655013, "dur": 0, "args": { "External id": 29969, "cbid": 317, "correlation": 29969 } }, { "ph": "f", "id": 29969, "pid": 76337, "tid": -914061504, "ts": 1716454217655013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217655014, "dur": 0, "args": { "External id": 29970, "cbid": 203, "correlation": 29970 } }, { "ph": "f", "id": 29970, "pid": 76337, "tid": -914061504, "ts": 1716454217655014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217655015, "dur": 0, "args": { "External id": 29971, "cbid": 205, "correlation": 29971 } }, { "ph": "f", "id": 29971, "pid": 76337, "tid": -914061504, "ts": 1716454217655015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217708734, "dur": 73, "args": { "External id": 29975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29975, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29975, "pid": 5, "tid": 7, "ts": 1716454217708734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655030, "dur": 12, "args": { "External id": 29975, "cbid": 211, "correlation": 29975 } }, { "ph": "s", "id": 29975, "pid": 76337, "tid": -914061504, "ts": 1716454217655030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217708808, "dur": 44, "args": { "External id": 29977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29977, "pid": 5, "tid": 7, "ts": 1716454217708808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655044, "dur": 5, "args": { "External id": 29977, "cbid": 211, "correlation": 29977 } }, { "ph": "s", "id": 29977, "pid": 76337, "tid": -914061504, "ts": 1716454217655044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217708853, "dur": 4, "args": { "External id": 29979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 29979, "pid": 5, "tid": 7, "ts": 1716454217708853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655055, "dur": 9, "args": { "External id": 29979, "cbid": 211, "correlation": 29979 } }, { "ph": "s", "id": 29979, "pid": 76337, "tid": -914061504, "ts": 1716454217655055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217655068, "dur": 0, "args": { "External id": 29980, "cbid": 51, "correlation": 29980 } }, { "ph": "s", "id": 29980, "pid": 76337, "tid": -914061504, "ts": 1716454217655068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217708858, "dur": 2214, "args": { "External id": 29981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29981, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 29981, "pid": 5, "tid": 7, "ts": 1716454217708858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655069, "dur": 6, "args": { "External id": 29981, "cbid": 211, "correlation": 29981 } }, { "ph": "s", "id": 29981, "pid": 76337, "tid": -914061504, "ts": 1716454217655069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217711074, "dur": 111, "args": { "External id": 29986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29986, "pid": 5, "tid": 7, "ts": 1716454217711074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655098, "dur": 9, "args": { "External id": 29986, "cbid": 211, "correlation": 29986 } }, { "ph": "s", "id": 29986, "pid": 76337, "tid": -914061504, "ts": 1716454217655098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217711186, "dur": 162, "args": { "External id": 29995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 29995, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 29995, "pid": 5, "tid": 7, "ts": 1716454217711186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655192, "dur": 13, "args": { "External id": 29995, "cbid": 211, "correlation": 29995 } }, { "ph": "s", "id": 29995, "pid": 76337, "tid": -914061504, "ts": 1716454217655192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217711350, "dur": 128, "args": { "External id": 30015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30015, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30015, "pid": 5, "tid": 7, "ts": 1716454217711350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655278, "dur": 14, "args": { "External id": 30015, "cbid": 211, "correlation": 30015 } }, { "ph": "s", "id": 30015, "pid": 76337, "tid": -914061504, "ts": 1716454217655278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217711478, "dur": 4, "args": { "External id": 30027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30027, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 30027, "pid": 5, "tid": 7, "ts": 1716454217711478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655301, "dur": 6, "args": { "External id": 30027, "cbid": 211, "correlation": 30027 } }, { "ph": "s", "id": 30027, "pid": 76337, "tid": -914061504, "ts": 1716454217655301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217711484, "dur": 156, "args": { "External id": 30030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30030, "pid": 5, "tid": 7, "ts": 1716454217711484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655320, "dur": 7, "args": { "External id": 30030, "cbid": 211, "correlation": 30030 } }, { "ph": "s", "id": 30030, "pid": 76337, "tid": -914061504, "ts": 1716454217655320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217711642, "dur": 101, "args": { "External id": 30039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30039, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30039, "pid": 5, "tid": 7, "ts": 1716454217711642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655361, "dur": 9, "args": { "External id": 30039, "cbid": 211, "correlation": 30039 } }, { "ph": "s", "id": 30039, "pid": 76337, "tid": -914061504, "ts": 1716454217655361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217655424, "dur": 0, "args": { "External id": 30049, "cbid": 317, "correlation": 30049 } }, { "ph": "f", "id": 30049, "pid": 76337, "tid": -914061504, "ts": 1716454217655424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217655425, "dur": 0, "args": { "External id": 30050, "cbid": 203, "correlation": 30050 } }, { "ph": "f", "id": 30050, "pid": 76337, "tid": -914061504, "ts": 1716454217655425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217655426, "dur": 0, "args": { "External id": 30051, "cbid": 205, "correlation": 30051 } }, { "ph": "f", "id": 30051, "pid": 76337, "tid": -914061504, "ts": 1716454217655426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217711745, "dur": 111, "args": { "External id": 30055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30055, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30055, "pid": 5, "tid": 7, "ts": 1716454217711745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655442, "dur": 14, "args": { "External id": 30055, "cbid": 211, "correlation": 30055 } }, { "ph": "s", "id": 30055, "pid": 76337, "tid": -914061504, "ts": 1716454217655442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217711857, "dur": 34, "args": { "External id": 30057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30057, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30057, "pid": 5, "tid": 7, "ts": 1716454217711857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655458, "dur": 5, "args": { "External id": 30057, "cbid": 211, "correlation": 30057 } }, { "ph": "s", "id": 30057, "pid": 76337, "tid": -914061504, "ts": 1716454217655458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217711892, "dur": 3, "args": { "External id": 30059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30059, "pid": 5, "tid": 7, "ts": 1716454217711892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655469, "dur": 6, "args": { "External id": 30059, "cbid": 211, "correlation": 30059 } }, { "ph": "s", "id": 30059, "pid": 76337, "tid": -914061504, "ts": 1716454217655469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217655478, "dur": 0, "args": { "External id": 30060, "cbid": 51, "correlation": 30060 } }, { "ph": "s", "id": 30060, "pid": 76337, "tid": -914061504, "ts": 1716454217655478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217711896, "dur": 1987, "args": { "External id": 30061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30061, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30061, "pid": 5, "tid": 7, "ts": 1716454217711896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655479, "dur": 6, "args": { "External id": 30061, "cbid": 211, "correlation": 30061 } }, { "ph": "s", "id": 30061, "pid": 76337, "tid": -914061504, "ts": 1716454217655479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217713884, "dur": 59, "args": { "External id": 30066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30066, "pid": 5, "tid": 7, "ts": 1716454217713884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655509, "dur": 8, "args": { "External id": 30066, "cbid": 211, "correlation": 30066 } }, { "ph": "s", "id": 30066, "pid": 76337, "tid": -914061504, "ts": 1716454217655509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217713945, "dur": 3, "args": { "External id": 30074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30074, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30074, "pid": 5, "tid": 7, "ts": 1716454217713945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655553, "dur": 10, "args": { "External id": 30074, "cbid": 211, "correlation": 30074 } }, { "ph": "s", "id": 30074, "pid": 76337, "tid": -914061504, "ts": 1716454217655553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217655629, "dur": 1, "args": { "External id": 30090, "cbid": 251, "correlation": 30090 } }, { "ph": "f", "id": 30090, "pid": 76337, "tid": -914061504, "ts": 1716454217655629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217655634, "dur": 0, "args": { "External id": 30092, "cbid": 251, "correlation": 30092 } }, { "ph": "f", "id": 30092, "pid": 76337, "tid": -914061504, "ts": 1716454217655634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217713949, "dur": 11, "args": { "External id": 30093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30093, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 30093, "pid": 5, "tid": 7, "ts": 1716454217713949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655636, "dur": 12, "args": { "External id": 30093, "cbid": 211, "correlation": 30093 } }, { "ph": "s", "id": 30093, "pid": 76337, "tid": -914061504, "ts": 1716454217655636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217713962, "dur": 5, "args": { "External id": 30095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30095, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 30095, "pid": 5, "tid": 7, "ts": 1716454217713962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655651, "dur": 6, "args": { "External id": 30095, "cbid": 211, "correlation": 30095 } }, { "ph": "s", "id": 30095, "pid": 76337, "tid": -914061504, "ts": 1716454217655651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217713968, "dur": 52, "args": { "External id": 30105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30105, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30105, "pid": 5, "tid": 7, "ts": 1716454217713968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655710, "dur": 12, "args": { "External id": 30105, "cbid": 211, "correlation": 30105 } }, { "ph": "s", "id": 30105, "pid": 76337, "tid": -914061504, "ts": 1716454217655710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217714022, "dur": 49, "args": { "External id": 30125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30125, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30125, "pid": 5, "tid": 7, "ts": 1716454217714022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655777, "dur": 11, "args": { "External id": 30125, "cbid": 211, "correlation": 30125 } }, { "ph": "s", "id": 30125, "pid": 76337, "tid": -914061504, "ts": 1716454217655777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217714073, "dur": 4, "args": { "External id": 30137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30137, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30137, "pid": 5, "tid": 7, "ts": 1716454217714073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655798, "dur": 6, "args": { "External id": 30137, "cbid": 211, "correlation": 30137 } }, { "ph": "s", "id": 30137, "pid": 76337, "tid": -914061504, "ts": 1716454217655798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217714078, "dur": 55, "args": { "External id": 30140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30140, "pid": 5, "tid": 7, "ts": 1716454217714078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655815, "dur": 7, "args": { "External id": 30140, "cbid": 211, "correlation": 30140 } }, { "ph": "s", "id": 30140, "pid": 76337, "tid": -914061504, "ts": 1716454217655815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217714134, "dur": 36, "args": { "External id": 30149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30149, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30149, "pid": 5, "tid": 7, "ts": 1716454217714134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655855, "dur": 10, "args": { "External id": 30149, "cbid": 211, "correlation": 30149 } }, { "ph": "s", "id": 30149, "pid": 76337, "tid": -914061504, "ts": 1716454217655855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217655919, "dur": 0, "args": { "External id": 30159, "cbid": 317, "correlation": 30159 } }, { "ph": "f", "id": 30159, "pid": 76337, "tid": -914061504, "ts": 1716454217655919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217655920, "dur": 0, "args": { "External id": 30160, "cbid": 203, "correlation": 30160 } }, { "ph": "f", "id": 30160, "pid": 76337, "tid": -914061504, "ts": 1716454217655920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217655921, "dur": 0, "args": { "External id": 30161, "cbid": 205, "correlation": 30161 } }, { "ph": "f", "id": 30161, "pid": 76337, "tid": -914061504, "ts": 1716454217655921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217714171, "dur": 41, "args": { "External id": 30165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30165, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30165, "pid": 5, "tid": 7, "ts": 1716454217714171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655939, "dur": 12, "args": { "External id": 30165, "cbid": 211, "correlation": 30165 } }, { "ph": "s", "id": 30165, "pid": 76337, "tid": -914061504, "ts": 1716454217655939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217714213, "dur": 14, "args": { "External id": 30167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30167, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30167, "pid": 5, "tid": 7, "ts": 1716454217714213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655953, "dur": 5, "args": { "External id": 30167, "cbid": 211, "correlation": 30167 } }, { "ph": "s", "id": 30167, "pid": 76337, "tid": -914061504, "ts": 1716454217655953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217714229, "dur": 3, "args": { "External id": 30169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30169, "pid": 5, "tid": 7, "ts": 1716454217714229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655963, "dur": 6, "args": { "External id": 30169, "cbid": 211, "correlation": 30169 } }, { "ph": "s", "id": 30169, "pid": 76337, "tid": -914061504, "ts": 1716454217655963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217655972, "dur": 0, "args": { "External id": 30170, "cbid": 51, "correlation": 30170 } }, { "ph": "s", "id": 30170, "pid": 76337, "tid": -914061504, "ts": 1716454217655972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217714233, "dur": 691, "args": { "External id": 30171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30171, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30171, "pid": 5, "tid": 7, "ts": 1716454217714233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217655981, "dur": 5, "args": { "External id": 30171, "cbid": 211, "correlation": 30171 } }, { "ph": "s", "id": 30171, "pid": 76337, "tid": -914061504, "ts": 1716454217655981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217714925, "dur": 60, "args": { "External id": 30176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30176, "pid": 5, "tid": 7, "ts": 1716454217714925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656010, "dur": 9, "args": { "External id": 30176, "cbid": 211, "correlation": 30176 } }, { "ph": "s", "id": 30176, "pid": 76337, "tid": -914061504, "ts": 1716454217656010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217656069, "dur": 0, "args": { "External id": 30186, "cbid": 317, "correlation": 30186 } }, { "ph": "f", "id": 30186, "pid": 76337, "tid": -914061504, "ts": 1716454217656069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217656070, "dur": 0, "args": { "External id": 30187, "cbid": 203, "correlation": 30187 } }, { "ph": "f", "id": 30187, "pid": 76337, "tid": -914061504, "ts": 1716454217656070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217656070, "dur": 0, "args": { "External id": 30188, "cbid": 205, "correlation": 30188 } }, { "ph": "f", "id": 30188, "pid": 76337, "tid": -914061504, "ts": 1716454217656070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217714987, "dur": 4, "args": { "External id": 30192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30192, "pid": 5, "tid": 7, "ts": 1716454217714987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656087, "dur": 11, "args": { "External id": 30192, "cbid": 211, "correlation": 30192 } }, { "ph": "s", "id": 30192, "pid": 76337, "tid": -914061504, "ts": 1716454217656087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217656103, "dur": 0, "args": { "External id": 30193, "cbid": 51, "correlation": 30193 } }, { "ph": "s", "id": 30193, "pid": 76337, "tid": -914061504, "ts": 1716454217656103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454217714991, "dur": 262, "args": { "External id": 30194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30194, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30194, "pid": 5, "tid": 7, "ts": 1716454217714991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656104, "dur": 6, "args": { "External id": 30194, "cbid": 211, "correlation": 30194 } }, { "ph": "s", "id": 30194, "pid": 76337, "tid": -914061504, "ts": 1716454217656104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217715255, "dur": 58, "args": { "External id": 30199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30199, "pid": 5, "tid": 7, "ts": 1716454217715255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656131, "dur": 8, "args": { "External id": 30199, "cbid": 211, "correlation": 30199 } }, { "ph": "s", "id": 30199, "pid": 76337, "tid": -914061504, "ts": 1716454217656131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217715314, "dur": 50, "args": { "External id": 30207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30207, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30207, "pid": 5, "tid": 7, "ts": 1716454217715314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656159, "dur": 8, "args": { "External id": 30207, "cbid": 211, "correlation": 30207 } }, { "ph": "s", "id": 30207, "pid": 76337, "tid": -914061504, "ts": 1716454217656159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217715366, "dur": 35, "args": { "External id": 30215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30215, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30215, "pid": 5, "tid": 7, "ts": 1716454217715366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656189, "dur": 8, "args": { "External id": 30215, "cbid": 211, "correlation": 30215 } }, { "ph": "s", "id": 30215, "pid": 76337, "tid": -914061504, "ts": 1716454217656189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217715402, "dur": 52, "args": { "External id": 30235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30235, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30235, "pid": 5, "tid": 7, "ts": 1716454217715402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656269, "dur": 12, "args": { "External id": 30235, "cbid": 211, "correlation": 30235 } }, { "ph": "s", "id": 30235, "pid": 76337, "tid": -914061504, "ts": 1716454217656269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217715456, "dur": 4, "args": { "External id": 30247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30247, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30247, "pid": 5, "tid": 7, "ts": 1716454217715456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656291, "dur": 6, "args": { "External id": 30247, "cbid": 211, "correlation": 30247 } }, { "ph": "s", "id": 30247, "pid": 76337, "tid": -914061504, "ts": 1716454217656291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217715461, "dur": 56, "args": { "External id": 30250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30250, "pid": 5, "tid": 7, "ts": 1716454217715461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656308, "dur": 7, "args": { "External id": 30250, "cbid": 211, "correlation": 30250 } }, { "ph": "s", "id": 30250, "pid": 76337, "tid": -914061504, "ts": 1716454217656308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217656377, "dur": 0, "args": { "External id": 30261, "cbid": 317, "correlation": 30261 } }, { "ph": "f", "id": 30261, "pid": 76337, "tid": -914061504, "ts": 1716454217656377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217656378, "dur": 0, "args": { "External id": 30262, "cbid": 203, "correlation": 30262 } }, { "ph": "f", "id": 30262, "pid": 76337, "tid": -914061504, "ts": 1716454217656378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217656379, "dur": 0, "args": { "External id": 30263, "cbid": 205, "correlation": 30263 } }, { "ph": "f", "id": 30263, "pid": 76337, "tid": -914061504, "ts": 1716454217656379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656411, "dur": 2, "args": { "External id": 30267, "cbid": 251, "correlation": 30267 } }, { "ph": "f", "id": 30267, "pid": 76337, "tid": -914061504, "ts": 1716454217656411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656414, "dur": 1, "args": { "External id": 30268, "cbid": 251, "correlation": 30268 } }, { "ph": "f", "id": 30268, "pid": 76337, "tid": -914061504, "ts": 1716454217656414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656416, "dur": 0, "args": { "External id": 30269, "cbid": 251, "correlation": 30269 } }, { "ph": "f", "id": 30269, "pid": 76337, "tid": -914061504, "ts": 1716454217656416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656417, "dur": 1, "args": { "External id": 30270, "cbid": 251, "correlation": 30270 } }, { "ph": "f", "id": 30270, "pid": 76337, "tid": -914061504, "ts": 1716454217656417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656419, "dur": 1, "args": { "External id": 30271, "cbid": 251, "correlation": 30271 } }, { "ph": "f", "id": 30271, "pid": 76337, "tid": -914061504, "ts": 1716454217656419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656421, "dur": 1, "args": { "External id": 30272, "cbid": 251, "correlation": 30272 } }, { "ph": "f", "id": 30272, "pid": 76337, "tid": -914061504, "ts": 1716454217656421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656423, "dur": 1, "args": { "External id": 30273, "cbid": 251, "correlation": 30273 } }, { "ph": "f", "id": 30273, "pid": 76337, "tid": -914061504, "ts": 1716454217656423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656425, "dur": 1, "args": { "External id": 30274, "cbid": 251, "correlation": 30274 } }, { "ph": "f", "id": 30274, "pid": 76337, "tid": -914061504, "ts": 1716454217656425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656427, "dur": 0, "args": { "External id": 30275, "cbid": 251, "correlation": 30275 } }, { "ph": "f", "id": 30275, "pid": 76337, "tid": -914061504, "ts": 1716454217656427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217715519, "dur": 114, "args": { "External id": 30276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30276, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30276, "pid": 5, "tid": 7, "ts": 1716454217715519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656429, "dur": 14, "args": { "External id": 30276, "cbid": 211, "correlation": 30276 } }, { "ph": "s", "id": 30276, "pid": 76337, "tid": -914061504, "ts": 1716454217656429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217715634, "dur": 59, "args": { "External id": 30282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30282, "pid": 5, "tid": 7, "ts": 1716454217715634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656476, "dur": 10, "args": { "External id": 30282, "cbid": 211, "correlation": 30282 } }, { "ph": "s", "id": 30282, "pid": 76337, "tid": -914061504, "ts": 1716454217656476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217715694, "dur": 568, "args": { "External id": 30291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30291, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30291, "pid": 5, "tid": 7, "ts": 1716454217715694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656563, "dur": 14, "args": { "External id": 30291, "cbid": 211, "correlation": 30291 } }, { "ph": "s", "id": 30291, "pid": 76337, "tid": -914061504, "ts": 1716454217656563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217716263, "dur": 179, "args": { "External id": 30313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30313, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30313, "pid": 5, "tid": 7, "ts": 1716454217716263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656631, "dur": 12, "args": { "External id": 30313, "cbid": 211, "correlation": 30313 } }, { "ph": "s", "id": 30313, "pid": 76337, "tid": -914061504, "ts": 1716454217656631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656734, "dur": 1, "args": { "External id": 30324, "cbid": 251, "correlation": 30324 } }, { "ph": "f", "id": 30324, "pid": 76337, "tid": -914061504, "ts": 1716454217656734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217716443, "dur": 194, "args": { "External id": 30325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30325, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30325, "pid": 5, "tid": 7, "ts": 1716454217716443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656739, "dur": 14, "args": { "External id": 30325, "cbid": 211, "correlation": 30325 } }, { "ph": "s", "id": 30325, "pid": 76337, "tid": -914061504, "ts": 1716454217656739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656823, "dur": 1, "args": { "External id": 30336, "cbid": 251, "correlation": 30336 } }, { "ph": "f", "id": 30336, "pid": 76337, "tid": -914061504, "ts": 1716454217656823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217716639, "dur": 187, "args": { "External id": 30337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30337, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30337, "pid": 5, "tid": 7, "ts": 1716454217716639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656827, "dur": 13, "args": { "External id": 30337, "cbid": 211, "correlation": 30337 } }, { "ph": "s", "id": 30337, "pid": 76337, "tid": -914061504, "ts": 1716454217656827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217656905, "dur": 1, "args": { "External id": 30348, "cbid": 251, "correlation": 30348 } }, { "ph": "f", "id": 30348, "pid": 76337, "tid": -914061504, "ts": 1716454217656905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217716827, "dur": 185, "args": { "External id": 30349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30349, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30349, "pid": 5, "tid": 7, "ts": 1716454217716827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217656910, "dur": 13, "args": { "External id": 30349, "cbid": 211, "correlation": 30349 } }, { "ph": "s", "id": 30349, "pid": 76337, "tid": -914061504, "ts": 1716454217656910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217717013, "dur": 18349, "args": { "External id": 30370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30370, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30370, "pid": 5, "tid": 7, "ts": 1716454217717013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657003, "dur": 14, "args": { "External id": 30370, "cbid": 211, "correlation": 30370 } }, { "ph": "s", "id": 30370, "pid": 76337, "tid": -914061504, "ts": 1716454217657003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657116, "dur": 1, "args": { "External id": 30388, "cbid": 251, "correlation": 30388 } }, { "ph": "f", "id": 30388, "pid": 76337, "tid": -914061504, "ts": 1716454217657116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217735364, "dur": 201, "args": { "External id": 30390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30390, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30390, "pid": 5, "tid": 7, "ts": 1716454217735364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657122, "dur": 14, "args": { "External id": 30390, "cbid": 211, "correlation": 30390 } }, { "ph": "s", "id": 30390, "pid": 76337, "tid": -914061504, "ts": 1716454217657122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217735567, "dur": 67, "args": { "External id": 30398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30398, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30398, "pid": 5, "tid": 7, "ts": 1716454217735567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657195, "dur": 13, "args": { "External id": 30398, "cbid": 211, "correlation": 30398 } }, { "ph": "s", "id": 30398, "pid": 76337, "tid": -914061504, "ts": 1716454217657195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217735635, "dur": 96, "args": { "External id": 30406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30406, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30406, "pid": 5, "tid": 7, "ts": 1716454217735635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657235, "dur": 8, "args": { "External id": 30406, "cbid": 211, "correlation": 30406 } }, { "ph": "s", "id": 30406, "pid": 76337, "tid": -914061504, "ts": 1716454217657235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217735732, "dur": 53, "args": { "External id": 30417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30417, "pid": 5, "tid": 7, "ts": 1716454217735732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657307, "dur": 14, "args": { "External id": 30417, "cbid": 211, "correlation": 30417 } }, { "ph": "s", "id": 30417, "pid": 76337, "tid": -914061504, "ts": 1716454217657307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217735787, "dur": 91, "args": { "External id": 30439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30439, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30439, "pid": 5, "tid": 7, "ts": 1716454217735787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657340, "dur": 8, "args": { "External id": 30439, "cbid": 211, "correlation": 30439 } }, { "ph": "s", "id": 30439, "pid": 76337, "tid": -914061504, "ts": 1716454217657340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657436, "dur": 1, "args": { "External id": 30450, "cbid": 251, "correlation": 30450 } }, { "ph": "f", "id": 30450, "pid": 76337, "tid": -914061504, "ts": 1716454217657436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217735879, "dur": 103, "args": { "External id": 30451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30451, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30451, "pid": 5, "tid": 7, "ts": 1716454217735879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657441, "dur": 14, "args": { "External id": 30451, "cbid": 211, "correlation": 30451 } }, { "ph": "s", "id": 30451, "pid": 76337, "tid": -914061504, "ts": 1716454217657441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657535, "dur": 2, "args": { "External id": 30462, "cbid": 251, "correlation": 30462 } }, { "ph": "f", "id": 30462, "pid": 76337, "tid": -914061504, "ts": 1716454217657535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657540, "dur": 0, "args": { "External id": 30463, "cbid": 251, "correlation": 30463 } }, { "ph": "f", "id": 30463, "pid": 76337, "tid": -914061504, "ts": 1716454217657540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217735983, "dur": 10, "args": { "External id": 30464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30464, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 30464, "pid": 5, "tid": 7, "ts": 1716454217735983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657542, "dur": 15, "args": { "External id": 30464, "cbid": 211, "correlation": 30464 } }, { "ph": "s", "id": 30464, "pid": 76337, "tid": -914061504, "ts": 1716454217657542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217735994, "dur": 5, "args": { "External id": 30466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30466, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 30466, "pid": 5, "tid": 7, "ts": 1716454217735994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657561, "dur": 8, "args": { "External id": 30466, "cbid": 211, "correlation": 30466 } }, { "ph": "s", "id": 30466, "pid": 76337, "tid": -914061504, "ts": 1716454217657561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657638, "dur": 1, "args": { "External id": 30477, "cbid": 251, "correlation": 30477 } }, { "ph": "f", "id": 30477, "pid": 76337, "tid": -914061504, "ts": 1716454217657638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657642, "dur": 0, "args": { "External id": 30478, "cbid": 251, "correlation": 30478 } }, { "ph": "f", "id": 30478, "pid": 76337, "tid": -914061504, "ts": 1716454217657642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217736001, "dur": 6, "args": { "External id": 30479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30479, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 30479, "pid": 5, "tid": 7, "ts": 1716454217736001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657643, "dur": 13, "args": { "External id": 30479, "cbid": 211, "correlation": 30479 } }, { "ph": "s", "id": 30479, "pid": 76337, "tid": -914061504, "ts": 1716454217657643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217736008, "dur": 3, "args": { "External id": 30481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30481, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 30481, "pid": 5, "tid": 7, "ts": 1716454217736008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657658, "dur": 6, "args": { "External id": 30481, "cbid": 211, "correlation": 30481 } }, { "ph": "s", "id": 30481, "pid": 76337, "tid": -914061504, "ts": 1716454217657658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217736013, "dur": 153, "args": { "External id": 30502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30502, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30502, "pid": 5, "tid": 7, "ts": 1716454217736013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657733, "dur": 13, "args": { "External id": 30502, "cbid": 211, "correlation": 30502 } }, { "ph": "s", "id": 30502, "pid": 76337, "tid": -914061504, "ts": 1716454217657733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217657843, "dur": 2, "args": { "External id": 30520, "cbid": 251, "correlation": 30520 } }, { "ph": "f", "id": 30520, "pid": 76337, "tid": -914061504, "ts": 1716454217657843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217736167, "dur": 106, "args": { "External id": 30522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30522, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30522, "pid": 5, "tid": 7, "ts": 1716454217736167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657849, "dur": 14, "args": { "External id": 30522, "cbid": 211, "correlation": 30522 } }, { "ph": "s", "id": 30522, "pid": 76337, "tid": -914061504, "ts": 1716454217657849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217736274, "dur": 35, "args": { "External id": 30530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30530, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30530, "pid": 5, "tid": 7, "ts": 1716454217736274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657921, "dur": 12, "args": { "External id": 30530, "cbid": 211, "correlation": 30530 } }, { "ph": "s", "id": 30530, "pid": 76337, "tid": -914061504, "ts": 1716454217657921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217736310, "dur": 68, "args": { "External id": 30538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30538, "pid": 5, "tid": 7, "ts": 1716454217736310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217657963, "dur": 9, "args": { "External id": 30538, "cbid": 211, "correlation": 30538 } }, { "ph": "s", "id": 30538, "pid": 76337, "tid": -914061504, "ts": 1716454217657963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217736380, "dur": 91, "args": { "External id": 30560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30560, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30560, "pid": 5, "tid": 7, "ts": 1716454217736380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217658024, "dur": 11, "args": { "External id": 30560, "cbid": 211, "correlation": 30560 } }, { "ph": "s", "id": 30560, "pid": 76337, "tid": -914061504, "ts": 1716454217658024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217658112, "dur": 1, "args": { "External id": 30576, "cbid": 251, "correlation": 30576 } }, { "ph": "f", "id": 30576, "pid": 76337, "tid": -914061504, "ts": 1716454217658112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217736472, "dur": 566, "args": { "External id": 30578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30578, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30578, "pid": 5, "tid": 7, "ts": 1716454217736472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217658117, "dur": 13, "args": { "External id": 30578, "cbid": 211, "correlation": 30578 } }, { "ph": "s", "id": 30578, "pid": 76337, "tid": -914061504, "ts": 1716454217658117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217737039, "dur": 242, "args": { "External id": 30586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30586, "pid": 5, "tid": 7, "ts": 1716454217737039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217658186, "dur": 12, "args": { "External id": 30586, "cbid": 211, "correlation": 30586 } }, { "ph": "s", "id": 30586, "pid": 76337, "tid": -914061504, "ts": 1716454217658186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217658218, "dur": 1, "args": { "External id": 30594, "cbid": 317, "correlation": 30594 } }, { "ph": "f", "id": 30594, "pid": 76337, "tid": -914061504, "ts": 1716454217658218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454217658220, "dur": 295342, "args": { "External id": 30595, "cbid": 20, "correlation": 30595 } }, { "ph": "f", "id": 30595, "pid": 76337, "tid": -914061504, "ts": 1716454217658220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217953625, "dur": 255, "args": { "External id": 30598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30598, "pid": 5, "tid": 7, "ts": 1716454217953625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217953590, "dur": 33, "args": { "External id": 30598, "cbid": 211, "correlation": 30598 } }, { "ph": "s", "id": 30598, "pid": 76337, "tid": -914061504, "ts": 1716454217953590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217953811, "dur": 4, "args": { "External id": 30614, "cbid": 251, "correlation": 30614 } }, { "ph": "f", "id": 30614, "pid": 76337, "tid": -914061504, "ts": 1716454217953811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217953819, "dur": 0, "args": { "External id": 30616, "cbid": 251, "correlation": 30616 } }, { "ph": "f", "id": 30616, "pid": 76337, "tid": -914061504, "ts": 1716454217953819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217953880, "dur": 359, "args": { "External id": 30617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30617, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 30617, "pid": 5, "tid": 7, "ts": 1716454217953880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217953825, "dur": 16, "args": { "External id": 30617, "cbid": 211, "correlation": 30617 } }, { "ph": "s", "id": 30617, "pid": 76337, "tid": -914061504, "ts": 1716454217953825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217954241, "dur": 50, "args": { "External id": 30625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30625, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30625, "pid": 5, "tid": 7, "ts": 1716454217954241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217953884, "dur": 12, "args": { "External id": 30625, "cbid": 211, "correlation": 30625 } }, { "ph": "s", "id": 30625, "pid": 76337, "tid": -914061504, "ts": 1716454217953884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217954292, "dur": 156, "args": { "External id": 30636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30636, "pid": 5, "tid": 7, "ts": 1716454217954292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954001, "dur": 14, "args": { "External id": 30636, "cbid": 211, "correlation": 30636 } }, { "ph": "s", "id": 30636, "pid": 76337, "tid": -914061504, "ts": 1716454217954001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217954102, "dur": 1, "args": { "External id": 30648, "cbid": 317, "correlation": 30648 } }, { "ph": "f", "id": 30648, "pid": 76337, "tid": -914061504, "ts": 1716454217954102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217954104, "dur": 1, "args": { "External id": 30649, "cbid": 203, "correlation": 30649 } }, { "ph": "f", "id": 30649, "pid": 76337, "tid": -914061504, "ts": 1716454217954104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217954106, "dur": 0, "args": { "External id": 30650, "cbid": 205, "correlation": 30650 } }, { "ph": "f", "id": 30650, "pid": 76337, "tid": -914061504, "ts": 1716454217954106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954145, "dur": 2, "args": { "External id": 30654, "cbid": 251, "correlation": 30654 } }, { "ph": "f", "id": 30654, "pid": 76337, "tid": -914061504, "ts": 1716454217954145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954149, "dur": 0, "args": { "External id": 30655, "cbid": 251, "correlation": 30655 } }, { "ph": "f", "id": 30655, "pid": 76337, "tid": -914061504, "ts": 1716454217954149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954150, "dur": 0, "args": { "External id": 30656, "cbid": 251, "correlation": 30656 } }, { "ph": "f", "id": 30656, "pid": 76337, "tid": -914061504, "ts": 1716454217954150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954151, "dur": 0, "args": { "External id": 30657, "cbid": 251, "correlation": 30657 } }, { "ph": "f", "id": 30657, "pid": 76337, "tid": -914061504, "ts": 1716454217954151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954152, "dur": 1, "args": { "External id": 30658, "cbid": 251, "correlation": 30658 } }, { "ph": "f", "id": 30658, "pid": 76337, "tid": -914061504, "ts": 1716454217954152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954154, "dur": 1, "args": { "External id": 30659, "cbid": 251, "correlation": 30659 } }, { "ph": "f", "id": 30659, "pid": 76337, "tid": -914061504, "ts": 1716454217954154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954156, "dur": 1, "args": { "External id": 30660, "cbid": 251, "correlation": 30660 } }, { "ph": "f", "id": 30660, "pid": 76337, "tid": -914061504, "ts": 1716454217954156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954158, "dur": 1, "args": { "External id": 30661, "cbid": 251, "correlation": 30661 } }, { "ph": "f", "id": 30661, "pid": 76337, "tid": -914061504, "ts": 1716454217954158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954160, "dur": 0, "args": { "External id": 30662, "cbid": 251, "correlation": 30662 } }, { "ph": "f", "id": 30662, "pid": 76337, "tid": -914061504, "ts": 1716454217954160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217954450, "dur": 115, "args": { "External id": 30663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30663, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30663, "pid": 5, "tid": 7, "ts": 1716454217954450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954163, "dur": 12, "args": { "External id": 30663, "cbid": 211, "correlation": 30663 } }, { "ph": "s", "id": 30663, "pid": 76337, "tid": -914061504, "ts": 1716454217954163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217954566, "dur": 59, "args": { "External id": 30669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30669, "pid": 5, "tid": 7, "ts": 1716454217954566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954202, "dur": 10, "args": { "External id": 30669, "cbid": 211, "correlation": 30669 } }, { "ph": "s", "id": 30669, "pid": 76337, "tid": -914061504, "ts": 1716454217954202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217954626, "dur": 50, "args": { "External id": 30677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30677, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30677, "pid": 5, "tid": 7, "ts": 1716454217954626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954235, "dur": 9, "args": { "External id": 30677, "cbid": 211, "correlation": 30677 } }, { "ph": "s", "id": 30677, "pid": 76337, "tid": -914061504, "ts": 1716454217954235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217954678, "dur": 97, "args": { "External id": 30686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30686, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30686, "pid": 5, "tid": 7, "ts": 1716454217954678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954293, "dur": 12, "args": { "External id": 30686, "cbid": 211, "correlation": 30686 } }, { "ph": "s", "id": 30686, "pid": 76337, "tid": -914061504, "ts": 1716454217954293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217954776, "dur": 91, "args": { "External id": 30706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30706, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30706, "pid": 5, "tid": 7, "ts": 1716454217954776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954410, "dur": 14, "args": { "External id": 30706, "cbid": 211, "correlation": 30706 } }, { "ph": "s", "id": 30706, "pid": 76337, "tid": -914061504, "ts": 1716454217954410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217954869, "dur": 5, "args": { "External id": 30718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30718, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 30718, "pid": 5, "tid": 7, "ts": 1716454217954869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954434, "dur": 8, "args": { "External id": 30718, "cbid": 211, "correlation": 30718 } }, { "ph": "s", "id": 30718, "pid": 76337, "tid": -914061504, "ts": 1716454217954434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217954875, "dur": 108, "args": { "External id": 30721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30721, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30721, "pid": 5, "tid": 7, "ts": 1716454217954875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954457, "dur": 8, "args": { "External id": 30721, "cbid": 211, "correlation": 30721 } }, { "ph": "s", "id": 30721, "pid": 76337, "tid": -914061504, "ts": 1716454217954457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217954984, "dur": 70, "args": { "External id": 30730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30730, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30730, "pid": 5, "tid": 7, "ts": 1716454217954984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954507, "dur": 10, "args": { "External id": 30730, "cbid": 211, "correlation": 30730 } }, { "ph": "s", "id": 30730, "pid": 76337, "tid": -914061504, "ts": 1716454217954507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217954574, "dur": 0, "args": { "External id": 30740, "cbid": 317, "correlation": 30740 } }, { "ph": "f", "id": 30740, "pid": 76337, "tid": -914061504, "ts": 1716454217954574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217954575, "dur": 0, "args": { "External id": 30741, "cbid": 203, "correlation": 30741 } }, { "ph": "f", "id": 30741, "pid": 76337, "tid": -914061504, "ts": 1716454217954575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217954576, "dur": 0, "args": { "External id": 30742, "cbid": 205, "correlation": 30742 } }, { "ph": "f", "id": 30742, "pid": 76337, "tid": -914061504, "ts": 1716454217954576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217955056, "dur": 75, "args": { "External id": 30746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30746, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30746, "pid": 5, "tid": 7, "ts": 1716454217955056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954599, "dur": 13, "args": { "External id": 30746, "cbid": 211, "correlation": 30746 } }, { "ph": "s", "id": 30746, "pid": 76337, "tid": -914061504, "ts": 1716454217954599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217955132, "dur": 24, "args": { "External id": 30748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30748, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30748, "pid": 5, "tid": 7, "ts": 1716454217955132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954615, "dur": 5, "args": { "External id": 30748, "cbid": 211, "correlation": 30748 } }, { "ph": "s", "id": 30748, "pid": 76337, "tid": -914061504, "ts": 1716454217954615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217955158, "dur": 3, "args": { "External id": 30750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30750, "pid": 5, "tid": 7, "ts": 1716454217955158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954631, "dur": 8, "args": { "External id": 30750, "cbid": 211, "correlation": 30750 } }, { "ph": "s", "id": 30750, "pid": 76337, "tid": -914061504, "ts": 1716454217954631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217954645, "dur": 0, "args": { "External id": 30751, "cbid": 51, "correlation": 30751 } }, { "ph": "s", "id": 30751, "pid": 76337, "tid": -914061504, "ts": 1716454217954645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217955163, "dur": 1349, "args": { "External id": 30752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30752, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30752, "pid": 5, "tid": 7, "ts": 1716454217955163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954647, "dur": 7, "args": { "External id": 30752, "cbid": 211, "correlation": 30752 } }, { "ph": "s", "id": 30752, "pid": 76337, "tid": -914061504, "ts": 1716454217954647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217956513, "dur": 59, "args": { "External id": 30757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30757, "pid": 5, "tid": 7, "ts": 1716454217956513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954677, "dur": 8, "args": { "External id": 30757, "cbid": 211, "correlation": 30757 } }, { "ph": "s", "id": 30757, "pid": 76337, "tid": -914061504, "ts": 1716454217954677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217956573, "dur": 4, "args": { "External id": 30765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30765, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30765, "pid": 5, "tid": 7, "ts": 1716454217956573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954722, "dur": 9, "args": { "External id": 30765, "cbid": 211, "correlation": 30765 } }, { "ph": "s", "id": 30765, "pid": 76337, "tid": -914061504, "ts": 1716454217954722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954809, "dur": 2, "args": { "External id": 30781, "cbid": 251, "correlation": 30781 } }, { "ph": "f", "id": 30781, "pid": 76337, "tid": -914061504, "ts": 1716454217954809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217954815, "dur": 0, "args": { "External id": 30783, "cbid": 251, "correlation": 30783 } }, { "ph": "f", "id": 30783, "pid": 76337, "tid": -914061504, "ts": 1716454217954815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217956578, "dur": 11, "args": { "External id": 30784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30784, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 30784, "pid": 5, "tid": 7, "ts": 1716454217956578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954818, "dur": 14, "args": { "External id": 30784, "cbid": 211, "correlation": 30784 } }, { "ph": "s", "id": 30784, "pid": 76337, "tid": -914061504, "ts": 1716454217954818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217956591, "dur": 5, "args": { "External id": 30786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30786, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 30786, "pid": 5, "tid": 7, "ts": 1716454217956591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954835, "dur": 7, "args": { "External id": 30786, "cbid": 211, "correlation": 30786 } }, { "ph": "s", "id": 30786, "pid": 76337, "tid": -914061504, "ts": 1716454217954835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217956597, "dur": 54, "args": { "External id": 30796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30796, "pid": 5, "tid": 7, "ts": 1716454217956597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954906, "dur": 12, "args": { "External id": 30796, "cbid": 211, "correlation": 30796 } }, { "ph": "s", "id": 30796, "pid": 76337, "tid": -914061504, "ts": 1716454217954906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217956653, "dur": 50, "args": { "External id": 30816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30816, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30816, "pid": 5, "tid": 7, "ts": 1716454217956653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217954985, "dur": 12, "args": { "External id": 30816, "cbid": 211, "correlation": 30816 } }, { "ph": "s", "id": 30816, "pid": 76337, "tid": -914061504, "ts": 1716454217954985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217956704, "dur": 4, "args": { "External id": 30828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30828, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30828, "pid": 5, "tid": 7, "ts": 1716454217956704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955008, "dur": 6, "args": { "External id": 30828, "cbid": 211, "correlation": 30828 } }, { "ph": "s", "id": 30828, "pid": 76337, "tid": -914061504, "ts": 1716454217955008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217956709, "dur": 54, "args": { "External id": 30831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30831, "pid": 5, "tid": 7, "ts": 1716454217956709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955026, "dur": 7, "args": { "External id": 30831, "cbid": 211, "correlation": 30831 } }, { "ph": "s", "id": 30831, "pid": 76337, "tid": -914061504, "ts": 1716454217955026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217956765, "dur": 37, "args": { "External id": 30840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30840, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30840, "pid": 5, "tid": 7, "ts": 1716454217956765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955068, "dur": 9, "args": { "External id": 30840, "cbid": 211, "correlation": 30840 } }, { "ph": "s", "id": 30840, "pid": 76337, "tid": -914061504, "ts": 1716454217955068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217955132, "dur": 0, "args": { "External id": 30850, "cbid": 317, "correlation": 30850 } }, { "ph": "f", "id": 30850, "pid": 76337, "tid": -914061504, "ts": 1716454217955132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217955133, "dur": 0, "args": { "External id": 30851, "cbid": 203, "correlation": 30851 } }, { "ph": "f", "id": 30851, "pid": 76337, "tid": -914061504, "ts": 1716454217955133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217955134, "dur": 0, "args": { "External id": 30852, "cbid": 205, "correlation": 30852 } }, { "ph": "f", "id": 30852, "pid": 76337, "tid": -914061504, "ts": 1716454217955134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217956803, "dur": 41, "args": { "External id": 30856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30856, "pid": 5, "tid": 7, "ts": 1716454217956803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955150, "dur": 13, "args": { "External id": 30856, "cbid": 211, "correlation": 30856 } }, { "ph": "s", "id": 30856, "pid": 76337, "tid": -914061504, "ts": 1716454217955150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217956845, "dur": 14, "args": { "External id": 30858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30858, "pid": 5, "tid": 7, "ts": 1716454217956845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955165, "dur": 5, "args": { "External id": 30858, "cbid": 211, "correlation": 30858 } }, { "ph": "s", "id": 30858, "pid": 76337, "tid": -914061504, "ts": 1716454217955165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217956860, "dur": 3, "args": { "External id": 30860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30860, "pid": 5, "tid": 7, "ts": 1716454217956860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955176, "dur": 5, "args": { "External id": 30860, "cbid": 211, "correlation": 30860 } }, { "ph": "s", "id": 30860, "pid": 76337, "tid": -914061504, "ts": 1716454217955176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217955184, "dur": 0, "args": { "External id": 30861, "cbid": 51, "correlation": 30861 } }, { "ph": "s", "id": 30861, "pid": 76337, "tid": -914061504, "ts": 1716454217955184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217956865, "dur": 690, "args": { "External id": 30862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30862, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30862, "pid": 5, "tid": 7, "ts": 1716454217956865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955185, "dur": 5, "args": { "External id": 30862, "cbid": 211, "correlation": 30862 } }, { "ph": "s", "id": 30862, "pid": 76337, "tid": -914061504, "ts": 1716454217955185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217957556, "dur": 58, "args": { "External id": 30867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30867, "pid": 5, "tid": 7, "ts": 1716454217957556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955213, "dur": 8, "args": { "External id": 30867, "cbid": 211, "correlation": 30867 } }, { "ph": "s", "id": 30867, "pid": 76337, "tid": -914061504, "ts": 1716454217955213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217955272, "dur": 0, "args": { "External id": 30877, "cbid": 317, "correlation": 30877 } }, { "ph": "f", "id": 30877, "pid": 76337, "tid": -914061504, "ts": 1716454217955272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217955273, "dur": 0, "args": { "External id": 30878, "cbid": 203, "correlation": 30878 } }, { "ph": "f", "id": 30878, "pid": 76337, "tid": -914061504, "ts": 1716454217955273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217955273, "dur": 0, "args": { "External id": 30879, "cbid": 205, "correlation": 30879 } }, { "ph": "f", "id": 30879, "pid": 76337, "tid": -914061504, "ts": 1716454217955273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217957616, "dur": 74, "args": { "External id": 30883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30883, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30883, "pid": 5, "tid": 7, "ts": 1716454217957616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955287, "dur": 12, "args": { "External id": 30883, "cbid": 211, "correlation": 30883 } }, { "ph": "s", "id": 30883, "pid": 76337, "tid": -914061504, "ts": 1716454217955287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217957691, "dur": 208, "args": { "External id": 30885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30885, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 30885, "pid": 5, "tid": 7, "ts": 1716454217957691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955311, "dur": 9, "args": { "External id": 30885, "cbid": 211, "correlation": 30885 } }, { "ph": "s", "id": 30885, "pid": 76337, "tid": -914061504, "ts": 1716454217955311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217957900, "dur": 40, "args": { "External id": 30887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30887, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30887, "pid": 5, "tid": 7, "ts": 1716454217957900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955327, "dur": 7, "args": { "External id": 30887, "cbid": 211, "correlation": 30887 } }, { "ph": "s", "id": 30887, "pid": 76337, "tid": -914061504, "ts": 1716454217955327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217957942, "dur": 59, "args": { "External id": 30893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30893, "pid": 5, "tid": 7, "ts": 1716454217957942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955355, "dur": 8, "args": { "External id": 30893, "cbid": 211, "correlation": 30893 } }, { "ph": "s", "id": 30893, "pid": 76337, "tid": -914061504, "ts": 1716454217955355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217958002, "dur": 50, "args": { "External id": 30901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30901, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30901, "pid": 5, "tid": 7, "ts": 1716454217958002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955384, "dur": 8, "args": { "External id": 30901, "cbid": 211, "correlation": 30901 } }, { "ph": "s", "id": 30901, "pid": 76337, "tid": -914061504, "ts": 1716454217955384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217958053, "dur": 35, "args": { "External id": 30909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30909, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30909, "pid": 5, "tid": 7, "ts": 1716454217958053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955422, "dur": 10, "args": { "External id": 30909, "cbid": 211, "correlation": 30909 } }, { "ph": "s", "id": 30909, "pid": 76337, "tid": -914061504, "ts": 1716454217955422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217958090, "dur": 52, "args": { "External id": 30929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30929, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 30929, "pid": 5, "tid": 7, "ts": 1716454217958090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955522, "dur": 13, "args": { "External id": 30929, "cbid": 211, "correlation": 30929 } }, { "ph": "s", "id": 30929, "pid": 76337, "tid": -914061504, "ts": 1716454217955522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217958143, "dur": 5, "args": { "External id": 30941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30941, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 30941, "pid": 5, "tid": 7, "ts": 1716454217958143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955554, "dur": 7, "args": { "External id": 30941, "cbid": 211, "correlation": 30941 } }, { "ph": "s", "id": 30941, "pid": 76337, "tid": -914061504, "ts": 1716454217955554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217958150, "dur": 56, "args": { "External id": 30944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30944, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30944, "pid": 5, "tid": 7, "ts": 1716454217958150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955574, "dur": 6, "args": { "External id": 30944, "cbid": 211, "correlation": 30944 } }, { "ph": "s", "id": 30944, "pid": 76337, "tid": -914061504, "ts": 1716454217955574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217955643, "dur": 0, "args": { "External id": 30955, "cbid": 317, "correlation": 30955 } }, { "ph": "f", "id": 30955, "pid": 76337, "tid": -914061504, "ts": 1716454217955643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217955644, "dur": 0, "args": { "External id": 30956, "cbid": 203, "correlation": 30956 } }, { "ph": "f", "id": 30956, "pid": 76337, "tid": -914061504, "ts": 1716454217955644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217955645, "dur": 0, "args": { "External id": 30957, "cbid": 205, "correlation": 30957 } }, { "ph": "f", "id": 30957, "pid": 76337, "tid": -914061504, "ts": 1716454217955645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955669, "dur": 1, "args": { "External id": 30961, "cbid": 251, "correlation": 30961 } }, { "ph": "f", "id": 30961, "pid": 76337, "tid": -914061504, "ts": 1716454217955669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955671, "dur": 0, "args": { "External id": 30962, "cbid": 251, "correlation": 30962 } }, { "ph": "f", "id": 30962, "pid": 76337, "tid": -914061504, "ts": 1716454217955671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955672, "dur": 0, "args": { "External id": 30963, "cbid": 251, "correlation": 30963 } }, { "ph": "f", "id": 30963, "pid": 76337, "tid": -914061504, "ts": 1716454217955672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955673, "dur": 0, "args": { "External id": 30964, "cbid": 251, "correlation": 30964 } }, { "ph": "f", "id": 30964, "pid": 76337, "tid": -914061504, "ts": 1716454217955673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955673, "dur": 0, "args": { "External id": 30965, "cbid": 251, "correlation": 30965 } }, { "ph": "f", "id": 30965, "pid": 76337, "tid": -914061504, "ts": 1716454217955673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955674, "dur": 0, "args": { "External id": 30966, "cbid": 251, "correlation": 30966 } }, { "ph": "f", "id": 30966, "pid": 76337, "tid": -914061504, "ts": 1716454217955674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955675, "dur": 0, "args": { "External id": 30967, "cbid": 251, "correlation": 30967 } }, { "ph": "f", "id": 30967, "pid": 76337, "tid": -914061504, "ts": 1716454217955675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955676, "dur": 0, "args": { "External id": 30968, "cbid": 251, "correlation": 30968 } }, { "ph": "f", "id": 30968, "pid": 76337, "tid": -914061504, "ts": 1716454217955676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217955677, "dur": 0, "args": { "External id": 30969, "cbid": 251, "correlation": 30969 } }, { "ph": "f", "id": 30969, "pid": 76337, "tid": -914061504, "ts": 1716454217955677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217958207, "dur": 113, "args": { "External id": 30970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30970, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 30970, "pid": 5, "tid": 7, "ts": 1716454217958207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955680, "dur": 13, "args": { "External id": 30970, "cbid": 211, "correlation": 30970 } }, { "ph": "s", "id": 30970, "pid": 76337, "tid": -914061504, "ts": 1716454217955680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217958322, "dur": 60, "args": { "External id": 30976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30976, "pid": 5, "tid": 7, "ts": 1716454217958322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955715, "dur": 10, "args": { "External id": 30976, "cbid": 211, "correlation": 30976 } }, { "ph": "s", "id": 30976, "pid": 76337, "tid": -914061504, "ts": 1716454217955715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217958383, "dur": 574, "args": { "External id": 30985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 30985, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 30985, "pid": 5, "tid": 7, "ts": 1716454217958383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955806, "dur": 15, "args": { "External id": 30985, "cbid": 211, "correlation": 30985 } }, { "ph": "s", "id": 30985, "pid": 76337, "tid": -914061504, "ts": 1716454217955806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217958958, "dur": 179, "args": { "External id": 31007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31007, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31007, "pid": 5, "tid": 7, "ts": 1716454217958958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217955875, "dur": 12, "args": { "External id": 31007, "cbid": 211, "correlation": 31007 } }, { "ph": "s", "id": 31007, "pid": 76337, "tid": -914061504, "ts": 1716454217955875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956010, "dur": 2, "args": { "External id": 31018, "cbid": 251, "correlation": 31018 } }, { "ph": "f", "id": 31018, "pid": 76337, "tid": -914061504, "ts": 1716454217956010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217959138, "dur": 194, "args": { "External id": 31019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31019, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31019, "pid": 5, "tid": 7, "ts": 1716454217959138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956016, "dur": 15, "args": { "External id": 31019, "cbid": 211, "correlation": 31019 } }, { "ph": "s", "id": 31019, "pid": 76337, "tid": -914061504, "ts": 1716454217956016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956105, "dur": 1, "args": { "External id": 31030, "cbid": 251, "correlation": 31030 } }, { "ph": "f", "id": 31030, "pid": 76337, "tid": -914061504, "ts": 1716454217956105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217959334, "dur": 187, "args": { "External id": 31031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31031, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31031, "pid": 5, "tid": 7, "ts": 1716454217959334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956109, "dur": 13, "args": { "External id": 31031, "cbid": 211, "correlation": 31031 } }, { "ph": "s", "id": 31031, "pid": 76337, "tid": -914061504, "ts": 1716454217956109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956187, "dur": 1, "args": { "External id": 31042, "cbid": 251, "correlation": 31042 } }, { "ph": "f", "id": 31042, "pid": 76337, "tid": -914061504, "ts": 1716454217956187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217959522, "dur": 185, "args": { "External id": 31043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31043, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31043, "pid": 5, "tid": 7, "ts": 1716454217959522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956192, "dur": 12, "args": { "External id": 31043, "cbid": 211, "correlation": 31043 } }, { "ph": "s", "id": 31043, "pid": 76337, "tid": -914061504, "ts": 1716454217956192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217959708, "dur": 18357, "args": { "External id": 31064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31064, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31064, "pid": 5, "tid": 7, "ts": 1716454217959708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956297, "dur": 15, "args": { "External id": 31064, "cbid": 211, "correlation": 31064 } }, { "ph": "s", "id": 31064, "pid": 76337, "tid": -914061504, "ts": 1716454217956297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956433, "dur": 2, "args": { "External id": 31082, "cbid": 251, "correlation": 31082 } }, { "ph": "f", "id": 31082, "pid": 76337, "tid": -914061504, "ts": 1716454217956433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217978066, "dur": 199, "args": { "External id": 31084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31084, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31084, "pid": 5, "tid": 7, "ts": 1716454217978066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956440, "dur": 14, "args": { "External id": 31084, "cbid": 211, "correlation": 31084 } }, { "ph": "s", "id": 31084, "pid": 76337, "tid": -914061504, "ts": 1716454217956440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217978267, "dur": 68, "args": { "External id": 31092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31092, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31092, "pid": 5, "tid": 7, "ts": 1716454217978267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956513, "dur": 12, "args": { "External id": 31092, "cbid": 211, "correlation": 31092 } }, { "ph": "s", "id": 31092, "pid": 76337, "tid": -914061504, "ts": 1716454217956513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217978335, "dur": 97, "args": { "External id": 31100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31100, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31100, "pid": 5, "tid": 7, "ts": 1716454217978335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956552, "dur": 8, "args": { "External id": 31100, "cbid": 211, "correlation": 31100 } }, { "ph": "s", "id": 31100, "pid": 76337, "tid": -914061504, "ts": 1716454217956552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217978434, "dur": 53, "args": { "External id": 31111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31111, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31111, "pid": 5, "tid": 7, "ts": 1716454217978434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956633, "dur": 13, "args": { "External id": 31111, "cbid": 211, "correlation": 31111 } }, { "ph": "s", "id": 31111, "pid": 76337, "tid": -914061504, "ts": 1716454217956633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217978488, "dur": 91, "args": { "External id": 31133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31133, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31133, "pid": 5, "tid": 7, "ts": 1716454217978488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956667, "dur": 8, "args": { "External id": 31133, "cbid": 211, "correlation": 31133 } }, { "ph": "s", "id": 31133, "pid": 76337, "tid": -914061504, "ts": 1716454217956667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956751, "dur": 1, "args": { "External id": 31144, "cbid": 251, "correlation": 31144 } }, { "ph": "f", "id": 31144, "pid": 76337, "tid": -914061504, "ts": 1716454217956751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217978581, "dur": 107, "args": { "External id": 31145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31145, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31145, "pid": 5, "tid": 7, "ts": 1716454217978581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956756, "dur": 13, "args": { "External id": 31145, "cbid": 211, "correlation": 31145 } }, { "ph": "s", "id": 31145, "pid": 76337, "tid": -914061504, "ts": 1716454217956756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956838, "dur": 1, "args": { "External id": 31156, "cbid": 251, "correlation": 31156 } }, { "ph": "f", "id": 31156, "pid": 76337, "tid": -914061504, "ts": 1716454217956838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956842, "dur": 0, "args": { "External id": 31157, "cbid": 251, "correlation": 31157 } }, { "ph": "f", "id": 31157, "pid": 76337, "tid": -914061504, "ts": 1716454217956842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217978689, "dur": 11, "args": { "External id": 31158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31158, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 31158, "pid": 5, "tid": 7, "ts": 1716454217978689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956844, "dur": 14, "args": { "External id": 31158, "cbid": 211, "correlation": 31158 } }, { "ph": "s", "id": 31158, "pid": 76337, "tid": -914061504, "ts": 1716454217956844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217978701, "dur": 5, "args": { "External id": 31160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31160, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31160, "pid": 5, "tid": 7, "ts": 1716454217978701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956861, "dur": 7, "args": { "External id": 31160, "cbid": 211, "correlation": 31160 } }, { "ph": "s", "id": 31160, "pid": 76337, "tid": -914061504, "ts": 1716454217956861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956938, "dur": 1, "args": { "External id": 31171, "cbid": 251, "correlation": 31171 } }, { "ph": "f", "id": 31171, "pid": 76337, "tid": -914061504, "ts": 1716454217956938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217956941, "dur": 0, "args": { "External id": 31172, "cbid": 251, "correlation": 31172 } }, { "ph": "f", "id": 31172, "pid": 76337, "tid": -914061504, "ts": 1716454217956941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217978707, "dur": 6, "args": { "External id": 31173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31173, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 31173, "pid": 5, "tid": 7, "ts": 1716454217978707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956943, "dur": 13, "args": { "External id": 31173, "cbid": 211, "correlation": 31173 } }, { "ph": "s", "id": 31173, "pid": 76337, "tid": -914061504, "ts": 1716454217956943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217978715, "dur": 3, "args": { "External id": 31175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31175, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31175, "pid": 5, "tid": 7, "ts": 1716454217978715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217956958, "dur": 6, "args": { "External id": 31175, "cbid": 211, "correlation": 31175 } }, { "ph": "s", "id": 31175, "pid": 76337, "tid": -914061504, "ts": 1716454217956958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217978720, "dur": 155, "args": { "External id": 31196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31196, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31196, "pid": 5, "tid": 7, "ts": 1716454217978720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957040, "dur": 13, "args": { "External id": 31196, "cbid": 211, "correlation": 31196 } }, { "ph": "s", "id": 31196, "pid": 76337, "tid": -914061504, "ts": 1716454217957040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957139, "dur": 2, "args": { "External id": 31214, "cbid": 251, "correlation": 31214 } }, { "ph": "f", "id": 31214, "pid": 76337, "tid": -914061504, "ts": 1716454217957139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217978876, "dur": 106, "args": { "External id": 31216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31216, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31216, "pid": 5, "tid": 7, "ts": 1716454217978876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957145, "dur": 14, "args": { "External id": 31216, "cbid": 211, "correlation": 31216 } }, { "ph": "s", "id": 31216, "pid": 76337, "tid": -914061504, "ts": 1716454217957145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217978984, "dur": 34, "args": { "External id": 31224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31224, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31224, "pid": 5, "tid": 7, "ts": 1716454217978984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957215, "dur": 13, "args": { "External id": 31224, "cbid": 211, "correlation": 31224 } }, { "ph": "s", "id": 31224, "pid": 76337, "tid": -914061504, "ts": 1716454217957215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217979019, "dur": 66, "args": { "External id": 31232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31232, "pid": 5, "tid": 7, "ts": 1716454217979019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957256, "dur": 9, "args": { "External id": 31232, "cbid": 211, "correlation": 31232 } }, { "ph": "s", "id": 31232, "pid": 76337, "tid": -914061504, "ts": 1716454217957256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217979087, "dur": 91, "args": { "External id": 31254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31254, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31254, "pid": 5, "tid": 7, "ts": 1716454217979087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957308, "dur": 10, "args": { "External id": 31254, "cbid": 211, "correlation": 31254 } }, { "ph": "s", "id": 31254, "pid": 76337, "tid": -914061504, "ts": 1716454217957308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957418, "dur": 1, "args": { "External id": 31270, "cbid": 251, "correlation": 31270 } }, { "ph": "f", "id": 31270, "pid": 76337, "tid": -914061504, "ts": 1716454217957418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217979180, "dur": 569, "args": { "External id": 31272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31272, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31272, "pid": 5, "tid": 7, "ts": 1716454217979180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957424, "dur": 13, "args": { "External id": 31272, "cbid": 211, "correlation": 31272 } }, { "ph": "s", "id": 31272, "pid": 76337, "tid": -914061504, "ts": 1716454217957424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217979750, "dur": 239, "args": { "External id": 31280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31280, "pid": 5, "tid": 7, "ts": 1716454217979750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957509, "dur": 14, "args": { "External id": 31280, "cbid": 211, "correlation": 31280 } }, { "ph": "s", "id": 31280, "pid": 76337, "tid": -914061504, "ts": 1716454217957509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217979990, "dur": 252, "args": { "External id": 31288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31288, "pid": 5, "tid": 7, "ts": 1716454217979990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957546, "dur": 8, "args": { "External id": 31288, "cbid": 211, "correlation": 31288 } }, { "ph": "s", "id": 31288, "pid": 76337, "tid": -914061504, "ts": 1716454217957546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957641, "dur": 1, "args": { "External id": 31304, "cbid": 251, "correlation": 31304 } }, { "ph": "f", "id": 31304, "pid": 76337, "tid": -914061504, "ts": 1716454217957641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957646, "dur": 0, "args": { "External id": 31306, "cbid": 251, "correlation": 31306 } }, { "ph": "f", "id": 31306, "pid": 76337, "tid": -914061504, "ts": 1716454217957646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454217980243, "dur": 358, "args": { "External id": 31307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31307, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 31307, "pid": 5, "tid": 7, "ts": 1716454217980243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957650, "dur": 14, "args": { "External id": 31307, "cbid": 211, "correlation": 31307 } }, { "ph": "s", "id": 31307, "pid": 76337, "tid": -914061504, "ts": 1716454217957650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217980603, "dur": 50, "args": { "External id": 31315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31315, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31315, "pid": 5, "tid": 7, "ts": 1716454217980603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957694, "dur": 10, "args": { "External id": 31315, "cbid": 211, "correlation": 31315 } }, { "ph": "s", "id": 31315, "pid": 76337, "tid": -914061504, "ts": 1716454217957694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217980654, "dur": 156, "args": { "External id": 31326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31326, "pid": 5, "tid": 7, "ts": 1716454217980654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957763, "dur": 12, "args": { "External id": 31326, "cbid": 211, "correlation": 31326 } }, { "ph": "s", "id": 31326, "pid": 76337, "tid": -914061504, "ts": 1716454217957763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217957827, "dur": 0, "args": { "External id": 31338, "cbid": 317, "correlation": 31338 } }, { "ph": "f", "id": 31338, "pid": 76337, "tid": -914061504, "ts": 1716454217957827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217957828, "dur": 0, "args": { "External id": 31339, "cbid": 203, "correlation": 31339 } }, { "ph": "f", "id": 31339, "pid": 76337, "tid": -914061504, "ts": 1716454217957828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217957829, "dur": 0, "args": { "External id": 31340, "cbid": 205, "correlation": 31340 } }, { "ph": "f", "id": 31340, "pid": 76337, "tid": -914061504, "ts": 1716454217957829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957853, "dur": 1, "args": { "External id": 31344, "cbid": 251, "correlation": 31344 } }, { "ph": "f", "id": 31344, "pid": 76337, "tid": -914061504, "ts": 1716454217957853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957855, "dur": 0, "args": { "External id": 31345, "cbid": 251, "correlation": 31345 } }, { "ph": "f", "id": 31345, "pid": 76337, "tid": -914061504, "ts": 1716454217957855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957856, "dur": 0, "args": { "External id": 31346, "cbid": 251, "correlation": 31346 } }, { "ph": "f", "id": 31346, "pid": 76337, "tid": -914061504, "ts": 1716454217957856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957856, "dur": 0, "args": { "External id": 31347, "cbid": 251, "correlation": 31347 } }, { "ph": "f", "id": 31347, "pid": 76337, "tid": -914061504, "ts": 1716454217957856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957857, "dur": 0, "args": { "External id": 31348, "cbid": 251, "correlation": 31348 } }, { "ph": "f", "id": 31348, "pid": 76337, "tid": -914061504, "ts": 1716454217957857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957858, "dur": 0, "args": { "External id": 31349, "cbid": 251, "correlation": 31349 } }, { "ph": "f", "id": 31349, "pid": 76337, "tid": -914061504, "ts": 1716454217957858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957859, "dur": 0, "args": { "External id": 31350, "cbid": 251, "correlation": 31350 } }, { "ph": "f", "id": 31350, "pid": 76337, "tid": -914061504, "ts": 1716454217957859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957860, "dur": 0, "args": { "External id": 31351, "cbid": 251, "correlation": 31351 } }, { "ph": "f", "id": 31351, "pid": 76337, "tid": -914061504, "ts": 1716454217957860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217957861, "dur": 0, "args": { "External id": 31352, "cbid": 251, "correlation": 31352 } }, { "ph": "f", "id": 31352, "pid": 76337, "tid": -914061504, "ts": 1716454217957861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217980812, "dur": 115, "args": { "External id": 31353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31353, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31353, "pid": 5, "tid": 7, "ts": 1716454217980812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957863, "dur": 12, "args": { "External id": 31353, "cbid": 211, "correlation": 31353 } }, { "ph": "s", "id": 31353, "pid": 76337, "tid": -914061504, "ts": 1716454217957863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217980928, "dur": 59, "args": { "External id": 31359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31359, "pid": 5, "tid": 7, "ts": 1716454217980928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957898, "dur": 10, "args": { "External id": 31359, "cbid": 211, "correlation": 31359 } }, { "ph": "s", "id": 31359, "pid": 76337, "tid": -914061504, "ts": 1716454217957898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217980988, "dur": 50, "args": { "External id": 31367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31367, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31367, "pid": 5, "tid": 7, "ts": 1716454217980988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957930, "dur": 8, "args": { "External id": 31367, "cbid": 211, "correlation": 31367 } }, { "ph": "s", "id": 31367, "pid": 76337, "tid": -914061504, "ts": 1716454217957930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217981039, "dur": 99, "args": { "External id": 31376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31376, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31376, "pid": 5, "tid": 7, "ts": 1716454217981039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217957969, "dur": 18, "args": { "External id": 31376, "cbid": 211, "correlation": 31376 } }, { "ph": "s", "id": 31376, "pid": 76337, "tid": -914061504, "ts": 1716454217957969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217981140, "dur": 92, "args": { "External id": 31396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31396, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 31396, "pid": 5, "tid": 7, "ts": 1716454217981140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958050, "dur": 12, "args": { "External id": 31396, "cbid": 211, "correlation": 31396 } }, { "ph": "s", "id": 31396, "pid": 76337, "tid": -914061504, "ts": 1716454217958050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217981234, "dur": 5, "args": { "External id": 31408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31408, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31408, "pid": 5, "tid": 7, "ts": 1716454217981234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958071, "dur": 7, "args": { "External id": 31408, "cbid": 211, "correlation": 31408 } }, { "ph": "s", "id": 31408, "pid": 76337, "tid": -914061504, "ts": 1716454217958071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217981240, "dur": 107, "args": { "External id": 31411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31411, "pid": 5, "tid": 7, "ts": 1716454217981240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958090, "dur": 7, "args": { "External id": 31411, "cbid": 211, "correlation": 31411 } }, { "ph": "s", "id": 31411, "pid": 76337, "tid": -914061504, "ts": 1716454217958090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217981348, "dur": 68, "args": { "External id": 31420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31420, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31420, "pid": 5, "tid": 7, "ts": 1716454217981348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958128, "dur": 10, "args": { "External id": 31420, "cbid": 211, "correlation": 31420 } }, { "ph": "s", "id": 31420, "pid": 76337, "tid": -914061504, "ts": 1716454217958128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217958193, "dur": 0, "args": { "External id": 31430, "cbid": 317, "correlation": 31430 } }, { "ph": "f", "id": 31430, "pid": 76337, "tid": -914061504, "ts": 1716454217958193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217958194, "dur": 0, "args": { "External id": 31431, "cbid": 203, "correlation": 31431 } }, { "ph": "f", "id": 31431, "pid": 76337, "tid": -914061504, "ts": 1716454217958194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217958195, "dur": 0, "args": { "External id": 31432, "cbid": 205, "correlation": 31432 } }, { "ph": "f", "id": 31432, "pid": 76337, "tid": -914061504, "ts": 1716454217958195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217981418, "dur": 75, "args": { "External id": 31436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31436, "pid": 5, "tid": 7, "ts": 1716454217981418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958210, "dur": 12, "args": { "External id": 31436, "cbid": 211, "correlation": 31436 } }, { "ph": "s", "id": 31436, "pid": 76337, "tid": -914061504, "ts": 1716454217958210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217981495, "dur": 24, "args": { "External id": 31438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31438, "pid": 5, "tid": 7, "ts": 1716454217981495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958225, "dur": 5, "args": { "External id": 31438, "cbid": 211, "correlation": 31438 } }, { "ph": "s", "id": 31438, "pid": 76337, "tid": -914061504, "ts": 1716454217958225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217981521, "dur": 3, "args": { "External id": 31440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 31440, "pid": 5, "tid": 7, "ts": 1716454217981521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958235, "dur": 6, "args": { "External id": 31440, "cbid": 211, "correlation": 31440 } }, { "ph": "s", "id": 31440, "pid": 76337, "tid": -914061504, "ts": 1716454217958235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217958244, "dur": 0, "args": { "External id": 31441, "cbid": 51, "correlation": 31441 } }, { "ph": "s", "id": 31441, "pid": 76337, "tid": -914061504, "ts": 1716454217958244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217981526, "dur": 1345, "args": { "External id": 31442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31442, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31442, "pid": 5, "tid": 7, "ts": 1716454217981526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958245, "dur": 6, "args": { "External id": 31442, "cbid": 211, "correlation": 31442 } }, { "ph": "s", "id": 31442, "pid": 76337, "tid": -914061504, "ts": 1716454217958245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217982872, "dur": 60, "args": { "External id": 31447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31447, "pid": 5, "tid": 7, "ts": 1716454217982872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958273, "dur": 8, "args": { "External id": 31447, "cbid": 211, "correlation": 31447 } }, { "ph": "s", "id": 31447, "pid": 76337, "tid": -914061504, "ts": 1716454217958273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217982933, "dur": 3, "args": { "External id": 31455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31455, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 31455, "pid": 5, "tid": 7, "ts": 1716454217982933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958318, "dur": 9, "args": { "External id": 31455, "cbid": 211, "correlation": 31455 } }, { "ph": "s", "id": 31455, "pid": 76337, "tid": -914061504, "ts": 1716454217958318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217958396, "dur": 1, "args": { "External id": 31471, "cbid": 251, "correlation": 31471 } }, { "ph": "f", "id": 31471, "pid": 76337, "tid": -914061504, "ts": 1716454217958396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217958402, "dur": 0, "args": { "External id": 31473, "cbid": 251, "correlation": 31473 } }, { "ph": "f", "id": 31473, "pid": 76337, "tid": -914061504, "ts": 1716454217958402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454217982937, "dur": 11, "args": { "External id": 31474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31474, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 31474, "pid": 5, "tid": 7, "ts": 1716454217982937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958403, "dur": 13, "args": { "External id": 31474, "cbid": 211, "correlation": 31474 } }, { "ph": "s", "id": 31474, "pid": 76337, "tid": -914061504, "ts": 1716454217958403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454217982950, "dur": 5, "args": { "External id": 31476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31476, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31476, "pid": 5, "tid": 7, "ts": 1716454217982950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958418, "dur": 6, "args": { "External id": 31476, "cbid": 211, "correlation": 31476 } }, { "ph": "s", "id": 31476, "pid": 76337, "tid": -914061504, "ts": 1716454217958418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217982957, "dur": 54, "args": { "External id": 31486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31486, "pid": 5, "tid": 7, "ts": 1716454217982957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958479, "dur": 12, "args": { "External id": 31486, "cbid": 211, "correlation": 31486 } }, { "ph": "s", "id": 31486, "pid": 76337, "tid": -914061504, "ts": 1716454217958479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217983011, "dur": 52, "args": { "External id": 31506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31506, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 31506, "pid": 5, "tid": 7, "ts": 1716454217983011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958546, "dur": 11, "args": { "External id": 31506, "cbid": 211, "correlation": 31506 } }, { "ph": "s", "id": 31506, "pid": 76337, "tid": -914061504, "ts": 1716454217958546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217983065, "dur": 4, "args": { "External id": 31518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31518, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 31518, "pid": 5, "tid": 7, "ts": 1716454217983065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958567, "dur": 6, "args": { "External id": 31518, "cbid": 211, "correlation": 31518 } }, { "ph": "s", "id": 31518, "pid": 76337, "tid": -914061504, "ts": 1716454217958567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217983070, "dur": 54, "args": { "External id": 31521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31521, "pid": 5, "tid": 7, "ts": 1716454217983070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958585, "dur": 6, "args": { "External id": 31521, "cbid": 211, "correlation": 31521 } }, { "ph": "s", "id": 31521, "pid": 76337, "tid": -914061504, "ts": 1716454217958585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217983125, "dur": 36, "args": { "External id": 31530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31530, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31530, "pid": 5, "tid": 7, "ts": 1716454217983125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958625, "dur": 9, "args": { "External id": 31530, "cbid": 211, "correlation": 31530 } }, { "ph": "s", "id": 31530, "pid": 76337, "tid": -914061504, "ts": 1716454217958625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217958688, "dur": 0, "args": { "External id": 31540, "cbid": 317, "correlation": 31540 } }, { "ph": "f", "id": 31540, "pid": 76337, "tid": -914061504, "ts": 1716454217958688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217958688, "dur": 0, "args": { "External id": 31541, "cbid": 203, "correlation": 31541 } }, { "ph": "f", "id": 31541, "pid": 76337, "tid": -914061504, "ts": 1716454217958688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217958689, "dur": 0, "args": { "External id": 31542, "cbid": 205, "correlation": 31542 } }, { "ph": "f", "id": 31542, "pid": 76337, "tid": -914061504, "ts": 1716454217958689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217983163, "dur": 40, "args": { "External id": 31546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31546, "pid": 5, "tid": 7, "ts": 1716454217983163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958704, "dur": 12, "args": { "External id": 31546, "cbid": 211, "correlation": 31546 } }, { "ph": "s", "id": 31546, "pid": 76337, "tid": -914061504, "ts": 1716454217958704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217983204, "dur": 14, "args": { "External id": 31548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31548, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31548, "pid": 5, "tid": 7, "ts": 1716454217983204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958718, "dur": 6, "args": { "External id": 31548, "cbid": 211, "correlation": 31548 } }, { "ph": "s", "id": 31548, "pid": 76337, "tid": -914061504, "ts": 1716454217958718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454217983220, "dur": 3, "args": { "External id": 31550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31550, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 31550, "pid": 5, "tid": 7, "ts": 1716454217983220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958729, "dur": 5, "args": { "External id": 31550, "cbid": 211, "correlation": 31550 } }, { "ph": "s", "id": 31550, "pid": 76337, "tid": -914061504, "ts": 1716454217958729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217958737, "dur": 0, "args": { "External id": 31551, "cbid": 51, "correlation": 31551 } }, { "ph": "s", "id": 31551, "pid": 76337, "tid": -914061504, "ts": 1716454217958737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454217983225, "dur": 689, "args": { "External id": 31552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31552, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31552, "pid": 5, "tid": 7, "ts": 1716454217983225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958738, "dur": 5, "args": { "External id": 31552, "cbid": 211, "correlation": 31552 } }, { "ph": "s", "id": 31552, "pid": 76337, "tid": -914061504, "ts": 1716454217958738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217983915, "dur": 59, "args": { "External id": 31557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31557, "pid": 5, "tid": 7, "ts": 1716454217983915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958765, "dur": 8, "args": { "External id": 31557, "cbid": 211, "correlation": 31557 } }, { "ph": "s", "id": 31557, "pid": 76337, "tid": -914061504, "ts": 1716454217958765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217958823, "dur": 0, "args": { "External id": 31567, "cbid": 317, "correlation": 31567 } }, { "ph": "f", "id": 31567, "pid": 76337, "tid": -914061504, "ts": 1716454217958823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217958824, "dur": 0, "args": { "External id": 31568, "cbid": 203, "correlation": 31568 } }, { "ph": "f", "id": 31568, "pid": 76337, "tid": -914061504, "ts": 1716454217958824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217958824, "dur": 0, "args": { "External id": 31569, "cbid": 205, "correlation": 31569 } }, { "ph": "f", "id": 31569, "pid": 76337, "tid": -914061504, "ts": 1716454217958824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217983975, "dur": 74, "args": { "External id": 31573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31573, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31573, "pid": 5, "tid": 7, "ts": 1716454217983975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958836, "dur": 12, "args": { "External id": 31573, "cbid": 211, "correlation": 31573 } }, { "ph": "s", "id": 31573, "pid": 76337, "tid": -914061504, "ts": 1716454217958836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454217984050, "dur": 209, "args": { "External id": 31575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31575, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31575, "pid": 5, "tid": 7, "ts": 1716454217984050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958855, "dur": 7, "args": { "External id": 31575, "cbid": 211, "correlation": 31575 } }, { "ph": "s", "id": 31575, "pid": 76337, "tid": -914061504, "ts": 1716454217958855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454217984260, "dur": 38, "args": { "External id": 31577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31577, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31577, "pid": 5, "tid": 7, "ts": 1716454217984260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958866, "dur": 5, "args": { "External id": 31577, "cbid": 211, "correlation": 31577 } }, { "ph": "s", "id": 31577, "pid": 76337, "tid": -914061504, "ts": 1716454217958866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217984300, "dur": 58, "args": { "External id": 31583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31583, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31583, "pid": 5, "tid": 7, "ts": 1716454217984300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958892, "dur": 9, "args": { "External id": 31583, "cbid": 211, "correlation": 31583 } }, { "ph": "s", "id": 31583, "pid": 76337, "tid": -914061504, "ts": 1716454217958892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217984360, "dur": 50, "args": { "External id": 31591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31591, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31591, "pid": 5, "tid": 7, "ts": 1716454217984360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958921, "dur": 7, "args": { "External id": 31591, "cbid": 211, "correlation": 31591 } }, { "ph": "s", "id": 31591, "pid": 76337, "tid": -914061504, "ts": 1716454217958921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454217984412, "dur": 35, "args": { "External id": 31599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31599, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31599, "pid": 5, "tid": 7, "ts": 1716454217984412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217958949, "dur": 9, "args": { "External id": 31599, "cbid": 211, "correlation": 31599 } }, { "ph": "s", "id": 31599, "pid": 76337, "tid": -914061504, "ts": 1716454217958949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217984448, "dur": 51, "args": { "External id": 31619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31619, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 31619, "pid": 5, "tid": 7, "ts": 1716454217984448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959039, "dur": 13, "args": { "External id": 31619, "cbid": 211, "correlation": 31619 } }, { "ph": "s", "id": 31619, "pid": 76337, "tid": -914061504, "ts": 1716454217959039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454217984500, "dur": 5, "args": { "External id": 31631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31631, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 31631, "pid": 5, "tid": 7, "ts": 1716454217984500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959062, "dur": 6, "args": { "External id": 31631, "cbid": 211, "correlation": 31631 } }, { "ph": "s", "id": 31631, "pid": 76337, "tid": -914061504, "ts": 1716454217959062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217984506, "dur": 55, "args": { "External id": 31634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31634, "pid": 5, "tid": 7, "ts": 1716454217984506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959080, "dur": 6, "args": { "External id": 31634, "cbid": 211, "correlation": 31634 } }, { "ph": "s", "id": 31634, "pid": 76337, "tid": -914061504, "ts": 1716454217959080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217959150, "dur": 0, "args": { "External id": 31645, "cbid": 317, "correlation": 31645 } }, { "ph": "f", "id": 31645, "pid": 76337, "tid": -914061504, "ts": 1716454217959150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217959151, "dur": 0, "args": { "External id": 31646, "cbid": 203, "correlation": 31646 } }, { "ph": "f", "id": 31646, "pid": 76337, "tid": -914061504, "ts": 1716454217959151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217959151, "dur": 0, "args": { "External id": 31647, "cbid": 205, "correlation": 31647 } }, { "ph": "f", "id": 31647, "pid": 76337, "tid": -914061504, "ts": 1716454217959151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959175, "dur": 1, "args": { "External id": 31651, "cbid": 251, "correlation": 31651 } }, { "ph": "f", "id": 31651, "pid": 76337, "tid": -914061504, "ts": 1716454217959175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959177, "dur": 0, "args": { "External id": 31652, "cbid": 251, "correlation": 31652 } }, { "ph": "f", "id": 31652, "pid": 76337, "tid": -914061504, "ts": 1716454217959177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959177, "dur": 0, "args": { "External id": 31653, "cbid": 251, "correlation": 31653 } }, { "ph": "f", "id": 31653, "pid": 76337, "tid": -914061504, "ts": 1716454217959177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959178, "dur": 0, "args": { "External id": 31654, "cbid": 251, "correlation": 31654 } }, { "ph": "f", "id": 31654, "pid": 76337, "tid": -914061504, "ts": 1716454217959178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959179, "dur": 0, "args": { "External id": 31655, "cbid": 251, "correlation": 31655 } }, { "ph": "f", "id": 31655, "pid": 76337, "tid": -914061504, "ts": 1716454217959179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959179, "dur": 0, "args": { "External id": 31656, "cbid": 251, "correlation": 31656 } }, { "ph": "f", "id": 31656, "pid": 76337, "tid": -914061504, "ts": 1716454217959179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959180, "dur": 0, "args": { "External id": 31657, "cbid": 251, "correlation": 31657 } }, { "ph": "f", "id": 31657, "pid": 76337, "tid": -914061504, "ts": 1716454217959180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959181, "dur": 0, "args": { "External id": 31658, "cbid": 251, "correlation": 31658 } }, { "ph": "f", "id": 31658, "pid": 76337, "tid": -914061504, "ts": 1716454217959181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959182, "dur": 0, "args": { "External id": 31659, "cbid": 251, "correlation": 31659 } }, { "ph": "f", "id": 31659, "pid": 76337, "tid": -914061504, "ts": 1716454217959182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454217984563, "dur": 114, "args": { "External id": 31660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31660, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31660, "pid": 5, "tid": 7, "ts": 1716454217984563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959184, "dur": 13, "args": { "External id": 31660, "cbid": 211, "correlation": 31660 } }, { "ph": "s", "id": 31660, "pid": 76337, "tid": -914061504, "ts": 1716454217959184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454217984679, "dur": 59, "args": { "External id": 31666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31666, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31666, "pid": 5, "tid": 7, "ts": 1716454217984679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959221, "dur": 9, "args": { "External id": 31666, "cbid": 211, "correlation": 31666 } }, { "ph": "s", "id": 31666, "pid": 76337, "tid": -914061504, "ts": 1716454217959221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454217984739, "dur": 563, "args": { "External id": 31675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31675, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31675, "pid": 5, "tid": 7, "ts": 1716454217984739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959305, "dur": 14, "args": { "External id": 31675, "cbid": 211, "correlation": 31675 } }, { "ph": "s", "id": 31675, "pid": 76337, "tid": -914061504, "ts": 1716454217959305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454217985304, "dur": 179, "args": { "External id": 31697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31697, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31697, "pid": 5, "tid": 7, "ts": 1716454217985304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959362, "dur": 10, "args": { "External id": 31697, "cbid": 211, "correlation": 31697 } }, { "ph": "s", "id": 31697, "pid": 76337, "tid": -914061504, "ts": 1716454217959362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959463, "dur": 1, "args": { "External id": 31708, "cbid": 251, "correlation": 31708 } }, { "ph": "f", "id": 31708, "pid": 76337, "tid": -914061504, "ts": 1716454217959463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217985485, "dur": 193, "args": { "External id": 31709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31709, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31709, "pid": 5, "tid": 7, "ts": 1716454217985485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959468, "dur": 15, "args": { "External id": 31709, "cbid": 211, "correlation": 31709 } }, { "ph": "s", "id": 31709, "pid": 76337, "tid": -914061504, "ts": 1716454217959468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959551, "dur": 1, "args": { "External id": 31720, "cbid": 251, "correlation": 31720 } }, { "ph": "f", "id": 31720, "pid": 76337, "tid": -914061504, "ts": 1716454217959551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217985680, "dur": 187, "args": { "External id": 31721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31721, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31721, "pid": 5, "tid": 7, "ts": 1716454217985680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959556, "dur": 12, "args": { "External id": 31721, "cbid": 211, "correlation": 31721 } }, { "ph": "s", "id": 31721, "pid": 76337, "tid": -914061504, "ts": 1716454217959556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959632, "dur": 1, "args": { "External id": 31732, "cbid": 251, "correlation": 31732 } }, { "ph": "f", "id": 31732, "pid": 76337, "tid": -914061504, "ts": 1716454217959632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454217985868, "dur": 185, "args": { "External id": 31733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31733, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31733, "pid": 5, "tid": 7, "ts": 1716454217985868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959636, "dur": 12, "args": { "External id": 31733, "cbid": 211, "correlation": 31733 } }, { "ph": "s", "id": 31733, "pid": 76337, "tid": -914061504, "ts": 1716454217959636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454217986054, "dur": 18297, "args": { "External id": 31754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31754, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31754, "pid": 5, "tid": 7, "ts": 1716454217986054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959719, "dur": 12, "args": { "External id": 31754, "cbid": 211, "correlation": 31754 } }, { "ph": "s", "id": 31754, "pid": 76337, "tid": -914061504, "ts": 1716454217959719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217959828, "dur": 1, "args": { "External id": 31772, "cbid": 251, "correlation": 31772 } }, { "ph": "f", "id": 31772, "pid": 76337, "tid": -914061504, "ts": 1716454217959828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218004353, "dur": 201, "args": { "External id": 31774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31774, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31774, "pid": 5, "tid": 7, "ts": 1716454218004353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959834, "dur": 14, "args": { "External id": 31774, "cbid": 211, "correlation": 31774 } }, { "ph": "s", "id": 31774, "pid": 76337, "tid": -914061504, "ts": 1716454217959834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218004555, "dur": 66, "args": { "External id": 31782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31782, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31782, "pid": 5, "tid": 7, "ts": 1716454218004555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959907, "dur": 12, "args": { "External id": 31782, "cbid": 211, "correlation": 31782 } }, { "ph": "s", "id": 31782, "pid": 76337, "tid": -914061504, "ts": 1716454217959907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218004622, "dur": 96, "args": { "External id": 31790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31790, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31790, "pid": 5, "tid": 7, "ts": 1716454218004622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217959946, "dur": 9, "args": { "External id": 31790, "cbid": 211, "correlation": 31790 } }, { "ph": "s", "id": 31790, "pid": 76337, "tid": -914061504, "ts": 1716454217959946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218004720, "dur": 54, "args": { "External id": 31801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31801, "pid": 5, "tid": 7, "ts": 1716454218004720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960028, "dur": 13, "args": { "External id": 31801, "cbid": 211, "correlation": 31801 } }, { "ph": "s", "id": 31801, "pid": 76337, "tid": -914061504, "ts": 1716454217960028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218004775, "dur": 91, "args": { "External id": 31823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31823, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31823, "pid": 5, "tid": 7, "ts": 1716454218004775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960059, "dur": 7, "args": { "External id": 31823, "cbid": 211, "correlation": 31823 } }, { "ph": "s", "id": 31823, "pid": 76337, "tid": -914061504, "ts": 1716454217960059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960155, "dur": 1, "args": { "External id": 31834, "cbid": 251, "correlation": 31834 } }, { "ph": "f", "id": 31834, "pid": 76337, "tid": -914061504, "ts": 1716454217960155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218004868, "dur": 104, "args": { "External id": 31835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31835, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31835, "pid": 5, "tid": 7, "ts": 1716454218004868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960160, "dur": 14, "args": { "External id": 31835, "cbid": 211, "correlation": 31835 } }, { "ph": "s", "id": 31835, "pid": 76337, "tid": -914061504, "ts": 1716454217960160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960246, "dur": 1, "args": { "External id": 31846, "cbid": 251, "correlation": 31846 } }, { "ph": "f", "id": 31846, "pid": 76337, "tid": -914061504, "ts": 1716454217960246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960250, "dur": 0, "args": { "External id": 31847, "cbid": 251, "correlation": 31847 } }, { "ph": "f", "id": 31847, "pid": 76337, "tid": -914061504, "ts": 1716454217960250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218004973, "dur": 10, "args": { "External id": 31848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31848, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 31848, "pid": 5, "tid": 7, "ts": 1716454218004973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960252, "dur": 13, "args": { "External id": 31848, "cbid": 211, "correlation": 31848 } }, { "ph": "s", "id": 31848, "pid": 76337, "tid": -914061504, "ts": 1716454217960252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218004984, "dur": 5, "args": { "External id": 31850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31850, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31850, "pid": 5, "tid": 7, "ts": 1716454218004984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960267, "dur": 6, "args": { "External id": 31850, "cbid": 211, "correlation": 31850 } }, { "ph": "s", "id": 31850, "pid": 76337, "tid": -914061504, "ts": 1716454217960267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960339, "dur": 1, "args": { "External id": 31861, "cbid": 251, "correlation": 31861 } }, { "ph": "f", "id": 31861, "pid": 76337, "tid": -914061504, "ts": 1716454217960339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960343, "dur": 0, "args": { "External id": 31862, "cbid": 251, "correlation": 31862 } }, { "ph": "f", "id": 31862, "pid": 76337, "tid": -914061504, "ts": 1716454217960343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218004990, "dur": 6, "args": { "External id": 31863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31863, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 31863, "pid": 5, "tid": 7, "ts": 1716454218004990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960345, "dur": 13, "args": { "External id": 31863, "cbid": 211, "correlation": 31863 } }, { "ph": "s", "id": 31863, "pid": 76337, "tid": -914061504, "ts": 1716454217960345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218004997, "dur": 3, "args": { "External id": 31865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31865, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 31865, "pid": 5, "tid": 7, "ts": 1716454218004997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960360, "dur": 6, "args": { "External id": 31865, "cbid": 211, "correlation": 31865 } }, { "ph": "s", "id": 31865, "pid": 76337, "tid": -914061504, "ts": 1716454217960360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218005002, "dur": 154, "args": { "External id": 31886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31886, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31886, "pid": 5, "tid": 7, "ts": 1716454218005002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960434, "dur": 12, "args": { "External id": 31886, "cbid": 211, "correlation": 31886 } }, { "ph": "s", "id": 31886, "pid": 76337, "tid": -914061504, "ts": 1716454217960434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960543, "dur": 1, "args": { "External id": 31904, "cbid": 251, "correlation": 31904 } }, { "ph": "f", "id": 31904, "pid": 76337, "tid": -914061504, "ts": 1716454217960543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218005157, "dur": 107, "args": { "External id": 31906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31906, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 31906, "pid": 5, "tid": 7, "ts": 1716454218005157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960549, "dur": 14, "args": { "External id": 31906, "cbid": 211, "correlation": 31906 } }, { "ph": "s", "id": 31906, "pid": 76337, "tid": -914061504, "ts": 1716454217960549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218005266, "dur": 34, "args": { "External id": 31914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31914, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31914, "pid": 5, "tid": 7, "ts": 1716454218005266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960621, "dur": 13, "args": { "External id": 31914, "cbid": 211, "correlation": 31914 } }, { "ph": "s", "id": 31914, "pid": 76337, "tid": -914061504, "ts": 1716454217960621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218005302, "dur": 66, "args": { "External id": 31922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31922, "pid": 5, "tid": 7, "ts": 1716454218005302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960663, "dur": 9, "args": { "External id": 31922, "cbid": 211, "correlation": 31922 } }, { "ph": "s", "id": 31922, "pid": 76337, "tid": -914061504, "ts": 1716454217960663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218005369, "dur": 91, "args": { "External id": 31944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31944, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31944, "pid": 5, "tid": 7, "ts": 1716454218005369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960714, "dur": 10, "args": { "External id": 31944, "cbid": 211, "correlation": 31944 } }, { "ph": "s", "id": 31944, "pid": 76337, "tid": -914061504, "ts": 1716454217960714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217960813, "dur": 1, "args": { "External id": 31960, "cbid": 251, "correlation": 31960 } }, { "ph": "f", "id": 31960, "pid": 76337, "tid": -914061504, "ts": 1716454217960813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218005462, "dur": 568, "args": { "External id": 31962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31962, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 31962, "pid": 5, "tid": 7, "ts": 1716454218005462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960819, "dur": 13, "args": { "External id": 31962, "cbid": 211, "correlation": 31962 } }, { "ph": "s", "id": 31962, "pid": 76337, "tid": -914061504, "ts": 1716454217960819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218006031, "dur": 242, "args": { "External id": 31970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31970, "pid": 5, "tid": 7, "ts": 1716454218006031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960886, "dur": 13, "args": { "External id": 31970, "cbid": 211, "correlation": 31970 } }, { "ph": "s", "id": 31970, "pid": 76337, "tid": -914061504, "ts": 1716454217960886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218006275, "dur": 252, "args": { "External id": 31978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31978, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 31978, "pid": 5, "tid": 7, "ts": 1716454218006275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217960918, "dur": 8, "args": { "External id": 31978, "cbid": 211, "correlation": 31978 } }, { "ph": "s", "id": 31978, "pid": 76337, "tid": -914061504, "ts": 1716454217960918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961021, "dur": 2, "args": { "External id": 31994, "cbid": 251, "correlation": 31994 } }, { "ph": "f", "id": 31994, "pid": 76337, "tid": -914061504, "ts": 1716454217961021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961027, "dur": 0, "args": { "External id": 31996, "cbid": 251, "correlation": 31996 } }, { "ph": "f", "id": 31996, "pid": 76337, "tid": -914061504, "ts": 1716454217961027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218006528, "dur": 356, "args": { "External id": 31997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 31997, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 31997, "pid": 5, "tid": 7, "ts": 1716454218006528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961030, "dur": 14, "args": { "External id": 31997, "cbid": 211, "correlation": 31997 } }, { "ph": "s", "id": 31997, "pid": 76337, "tid": -914061504, "ts": 1716454217961030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218006886, "dur": 50, "args": { "External id": 32005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32005, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32005, "pid": 5, "tid": 7, "ts": 1716454218006886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961075, "dur": 10, "args": { "External id": 32005, "cbid": 211, "correlation": 32005 } }, { "ph": "s", "id": 32005, "pid": 76337, "tid": -914061504, "ts": 1716454217961075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218006937, "dur": 156, "args": { "External id": 32016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32016, "pid": 5, "tid": 7, "ts": 1716454218006937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961144, "dur": 13, "args": { "External id": 32016, "cbid": 211, "correlation": 32016 } }, { "ph": "s", "id": 32016, "pid": 76337, "tid": -914061504, "ts": 1716454217961144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217961209, "dur": 0, "args": { "External id": 32028, "cbid": 317, "correlation": 32028 } }, { "ph": "f", "id": 32028, "pid": 76337, "tid": -914061504, "ts": 1716454217961209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217961210, "dur": 0, "args": { "External id": 32029, "cbid": 203, "correlation": 32029 } }, { "ph": "f", "id": 32029, "pid": 76337, "tid": -914061504, "ts": 1716454217961210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217961211, "dur": 0, "args": { "External id": 32030, "cbid": 205, "correlation": 32030 } }, { "ph": "f", "id": 32030, "pid": 76337, "tid": -914061504, "ts": 1716454217961211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961234, "dur": 1, "args": { "External id": 32034, "cbid": 251, "correlation": 32034 } }, { "ph": "f", "id": 32034, "pid": 76337, "tid": -914061504, "ts": 1716454217961234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961236, "dur": 0, "args": { "External id": 32035, "cbid": 251, "correlation": 32035 } }, { "ph": "f", "id": 32035, "pid": 76337, "tid": -914061504, "ts": 1716454217961236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961237, "dur": 0, "args": { "External id": 32036, "cbid": 251, "correlation": 32036 } }, { "ph": "f", "id": 32036, "pid": 76337, "tid": -914061504, "ts": 1716454217961237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961237, "dur": 0, "args": { "External id": 32037, "cbid": 251, "correlation": 32037 } }, { "ph": "f", "id": 32037, "pid": 76337, "tid": -914061504, "ts": 1716454217961237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961238, "dur": 0, "args": { "External id": 32038, "cbid": 251, "correlation": 32038 } }, { "ph": "f", "id": 32038, "pid": 76337, "tid": -914061504, "ts": 1716454217961238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961239, "dur": 0, "args": { "External id": 32039, "cbid": 251, "correlation": 32039 } }, { "ph": "f", "id": 32039, "pid": 76337, "tid": -914061504, "ts": 1716454217961239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961240, "dur": 0, "args": { "External id": 32040, "cbid": 251, "correlation": 32040 } }, { "ph": "f", "id": 32040, "pid": 76337, "tid": -914061504, "ts": 1716454217961240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961240, "dur": 0, "args": { "External id": 32041, "cbid": 251, "correlation": 32041 } }, { "ph": "f", "id": 32041, "pid": 76337, "tid": -914061504, "ts": 1716454217961240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961242, "dur": 0, "args": { "External id": 32042, "cbid": 251, "correlation": 32042 } }, { "ph": "f", "id": 32042, "pid": 76337, "tid": -914061504, "ts": 1716454217961242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218007094, "dur": 115, "args": { "External id": 32043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32043, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 32043, "pid": 5, "tid": 7, "ts": 1716454218007094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961244, "dur": 13, "args": { "External id": 32043, "cbid": 211, "correlation": 32043 } }, { "ph": "s", "id": 32043, "pid": 76337, "tid": -914061504, "ts": 1716454217961244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218007211, "dur": 59, "args": { "External id": 32049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32049, "pid": 5, "tid": 7, "ts": 1716454218007211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961279, "dur": 9, "args": { "External id": 32049, "cbid": 211, "correlation": 32049 } }, { "ph": "s", "id": 32049, "pid": 76337, "tid": -914061504, "ts": 1716454217961279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218007271, "dur": 50, "args": { "External id": 32057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32057, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32057, "pid": 5, "tid": 7, "ts": 1716454218007271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961311, "dur": 8, "args": { "External id": 32057, "cbid": 211, "correlation": 32057 } }, { "ph": "s", "id": 32057, "pid": 76337, "tid": -914061504, "ts": 1716454217961311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218007322, "dur": 53, "args": { "External id": 32077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32077, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 32077, "pid": 5, "tid": 7, "ts": 1716454218007322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961396, "dur": 13, "args": { "External id": 32077, "cbid": 211, "correlation": 32077 } }, { "ph": "s", "id": 32077, "pid": 76337, "tid": -914061504, "ts": 1716454217961396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218007376, "dur": 5, "args": { "External id": 32089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32089, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32089, "pid": 5, "tid": 7, "ts": 1716454218007376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961418, "dur": 6, "args": { "External id": 32089, "cbid": 211, "correlation": 32089 } }, { "ph": "s", "id": 32089, "pid": 76337, "tid": -914061504, "ts": 1716454217961418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218007382, "dur": 57, "args": { "External id": 32092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32092, "pid": 5, "tid": 7, "ts": 1716454218007382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961436, "dur": 7, "args": { "External id": 32092, "cbid": 211, "correlation": 32092 } }, { "ph": "s", "id": 32092, "pid": 76337, "tid": -914061504, "ts": 1716454217961436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218007440, "dur": 37, "args": { "External id": 32101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32101, "pid": 5, "tid": 7, "ts": 1716454218007440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961481, "dur": 10, "args": { "External id": 32101, "cbid": 211, "correlation": 32101 } }, { "ph": "s", "id": 32101, "pid": 76337, "tid": -914061504, "ts": 1716454217961481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454217961557, "dur": 0, "args": { "External id": 32111, "cbid": 317, "correlation": 32111 } }, { "ph": "f", "id": 32111, "pid": 76337, "tid": -914061504, "ts": 1716454217961557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454217961558, "dur": 0, "args": { "External id": 32112, "cbid": 203, "correlation": 32112 } }, { "ph": "f", "id": 32112, "pid": 76337, "tid": -914061504, "ts": 1716454217961558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454217961559, "dur": 0, "args": { "External id": 32113, "cbid": 205, "correlation": 32113 } }, { "ph": "f", "id": 32113, "pid": 76337, "tid": -914061504, "ts": 1716454217961559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218007478, "dur": 41, "args": { "External id": 32117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32117, "pid": 5, "tid": 7, "ts": 1716454218007478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961576, "dur": 13, "args": { "External id": 32117, "cbid": 211, "correlation": 32117 } }, { "ph": "s", "id": 32117, "pid": 76337, "tid": -914061504, "ts": 1716454217961576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218007520, "dur": 3, "args": { "External id": 32119, "device": 5, "context": 1, "stream": 7, "correlation": 32119, "bytes": 46080, "memory bandwidth (GB/s)": 12.413793103448276 } }, { "ph": "f", "id": 32119, "pid": 5, "tid": 7, "ts": 1716454218007520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217961592, "dur": 24, "args": { "External id": 32119, "cbid": 51, "correlation": 32119 } }, { "ph": "s", "id": 32119, "pid": 76337, "tid": -914061504, "ts": 1716454217961592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454217961622, "dur": 1, "args": { "External id": 32121, "cbid": 200, "correlation": 32121 } }, { "ph": "f", "id": 32121, "pid": 76337, "tid": -914061504, "ts": 1716454217961622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454217961624, "dur": 0, "args": { "External id": 32122, "cbid": 200, "correlation": 32122 } }, { "ph": "f", "id": 32122, "pid": 76337, "tid": -914061504, "ts": 1716454217961624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454217961625, "dur": 0, "args": { "External id": 32123, "cbid": 200, "correlation": 32123 } }, { "ph": "f", "id": 32123, "pid": 76337, "tid": -914061504, "ts": 1716454217961625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454217961626, "dur": 0, "args": { "External id": 32124, "cbid": 200, "correlation": 32124 } }, { "ph": "f", "id": 32124, "pid": 76337, "tid": -914061504, "ts": 1716454217961626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454217961627, "dur": 15, "args": { "External id": 32125, "cbid": 15, "correlation": 32125 } }, { "ph": "f", "id": 32125, "pid": 76337, "tid": -914061504, "ts": 1716454217961627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454217961643, "dur": 1, "args": { "External id": 32126, "cbid": 251, "correlation": 32126 } }, { "ph": "f", "id": 32126, "pid": 76337, "tid": -914061504, "ts": 1716454217961643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454218007524, "dur": 25, "args": { "External id": 32127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32127, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32127, "pid": 5, "tid": 7, "ts": 1716454218007524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961651, "dur": 11, "args": { "External id": 32127, "cbid": 211, "correlation": 32127 } }, { "ph": "s", "id": 32127, "pid": 76337, "tid": -914061504, "ts": 1716454217961651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218007550, "dur": 4, "args": { "External id": 32129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32129, "pid": 5, "tid": 7, "ts": 1716454218007550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961668, "dur": 6, "args": { "External id": 32129, "cbid": 211, "correlation": 32129 } }, { "ph": "s", "id": 32129, "pid": 76337, "tid": -914061504, "ts": 1716454217961668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217961678, "dur": 0, "args": { "External id": 32130, "cbid": 51, "correlation": 32130 } }, { "ph": "s", "id": 32130, "pid": 76337, "tid": -914061504, "ts": 1716454217961678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218007556, "dur": 188, "args": { "External id": 32131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32131, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32131, "pid": 5, "tid": 7, "ts": 1716454218007556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961679, "dur": 8, "args": { "External id": 32131, "cbid": 211, "correlation": 32131 } }, { "ph": "s", "id": 32131, "pid": 76337, "tid": -914061504, "ts": 1716454217961679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218007745, "dur": 6, "args": { "External id": 32132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32132, "pid": 5, "tid": 7, "ts": 1716454218007745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961690, "dur": 6, "args": { "External id": 32132, "cbid": 211, "correlation": 32132 } }, { "ph": "s", "id": 32132, "pid": 76337, "tid": -914061504, "ts": 1716454217961690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218007753, "dur": 5, "args": { "External id": 32138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 32138, "pid": 5, "tid": 7, "ts": 1716454218007753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217961721, "dur": 9, "args": { "External id": 32138, "cbid": 211, "correlation": 32138 } }, { "ph": "s", "id": 32138, "pid": 76337, "tid": -914061504, "ts": 1716454217961721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218007759, "dur": 3, "args": { "External id": 32146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32146, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32146, "pid": 5, "tid": 7, "ts": 1716454218007759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217963489, "dur": 15, "args": { "External id": 32146, "cbid": 211, "correlation": 32146 } }, { "ph": "s", "id": 32146, "pid": 76337, "tid": -914061504, "ts": 1716454217963489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218007763, "dur": 3, "args": { "External id": 32154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32154, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32154, "pid": 5, "tid": 7, "ts": 1716454218007763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217963530, "dur": 10, "args": { "External id": 32154, "cbid": 211, "correlation": 32154 } }, { "ph": "s", "id": 32154, "pid": 76337, "tid": -914061504, "ts": 1716454217963530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218007767, "dur": 3, "args": { "External id": 32162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32162, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32162, "pid": 5, "tid": 7, "ts": 1716454218007767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217963559, "dur": 8, "args": { "External id": 32162, "cbid": 211, "correlation": 32162 } }, { "ph": "s", "id": 32162, "pid": 76337, "tid": -914061504, "ts": 1716454217963559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218007778, "dur": 3, "args": { "External id": 32170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32170, "pid": 5, "tid": 7, "ts": 1716454218007778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217963675, "dur": 163, "args": { "External id": 32170, "cbid": 211, "correlation": 32170 } }, { "ph": "s", "id": 32170, "pid": 76337, "tid": -914061504, "ts": 1716454217963675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceCount", "pid": 76337, "tid": -914061504, "ts": 1716454217963903, "dur": 0, "args": { "External id": 32182, "cbid": 3, "correlation": 32182 } }, { "ph": "f", "id": 32182, "pid": 76337, "tid": -914061504, "ts": 1716454217963903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454217963905, "dur": 79, "args": { "External id": 32186, "cbid": 15, "correlation": 32186 } }, { "ph": "f", "id": 32186, "pid": 76337, "tid": -914061504, "ts": 1716454217963905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454218007793, "dur": 3, "args": { "External id": 32196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32196, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32196, "pid": 5, "tid": 7, "ts": 1716454218007793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454217963997, "dur": 56, "args": { "External id": 32196, "cbid": 211, "correlation": 32196 } }, { "ph": "s", "id": 32196, "pid": 76337, "tid": -914061504, "ts": 1716454217963997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454217964054, "dur": 0, "args": { "External id": 32197, "cbid": 11, "correlation": 32197 } }, { "ph": "f", "id": 32197, "pid": 76337, "tid": -914061504, "ts": 1716454217964054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454217964054, "dur": 0, "args": { "External id": 32198, "cbid": 11, "correlation": 32198 } }, { "ph": "f", "id": 32198, "pid": 76337, "tid": -914061504, "ts": 1716454217964054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218007800, "dur": 3, "args": { "External id": 32201, "device": 5, "context": 1, "stream": 7, "correlation": 32201, "bytes": 4, "memory bandwidth (GB/s)": 0.0010869565217391304 } }, { "ph": "f", "id": 32201, "pid": 5, "tid": 7, "ts": 1716454218007800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454217964056, "dur": 43762, "args": { "External id": 32201, "cbid": 41, "correlation": 32201 } }, { "ph": "s", "id": 32201, "pid": 76337, "tid": -914061504, "ts": 1716454217964056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218007820, "dur": 11, "args": { "External id": 32202, "cbid": 131, "correlation": 32202 } }, { "ph": "f", "id": 32202, "pid": 76337, "tid": -914061504, "ts": 1716454218007820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454218007913, "dur": 2, "args": { "External id": 32226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32226, "pid": 5, "tid": 7, "ts": 1716454218007913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218007869, "dur": 45, "args": { "External id": 32226, "cbid": 211, "correlation": 32226 } }, { "ph": "s", "id": 32226, "pid": 76337, "tid": -914061504, "ts": 1716454218007869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218007914, "dur": 0, "args": { "External id": 32227, "cbid": 11, "correlation": 32227 } }, { "ph": "f", "id": 32227, "pid": 76337, "tid": -914061504, "ts": 1716454218007914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218007915, "dur": 0, "args": { "External id": 32228, "cbid": 11, "correlation": 32228 } }, { "ph": "f", "id": 32228, "pid": 76337, "tid": -914061504, "ts": 1716454218007915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218007916, "dur": 0, "args": { "External id": 32230, "cbid": 200, "correlation": 32230 } }, { "ph": "f", "id": 32230, "pid": 76337, "tid": -914061504, "ts": 1716454218007916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454218007972, "dur": 3, "args": { "External id": 32232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32232, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32232, "pid": 5, "tid": 7, "ts": 1716454218007972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218007918, "dur": 62, "args": { "External id": 32232, "cbid": 211, "correlation": 32232 } }, { "ph": "s", "id": 32232, "pid": 76337, "tid": -914061504, "ts": 1716454218007918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218007981, "dur": 0, "args": { "External id": 32233, "cbid": 11, "correlation": 32233 } }, { "ph": "f", "id": 32233, "pid": 76337, "tid": -914061504, "ts": 1716454218007981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218007982, "dur": 0, "args": { "External id": 32234, "cbid": 11, "correlation": 32234 } }, { "ph": "f", "id": 32234, "pid": 76337, "tid": -914061504, "ts": 1716454218007982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218008122, "dur": 2, "args": { "External id": 32243, "device": 5, "context": 1, "stream": 7, "correlation": 32243, "bytes": 8, "memory bandwidth (GB/s)": 0.0035714285714285713 } }, { "ph": "f", "id": 32243, "pid": 5, "tid": 7, "ts": 1716454218008122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218008095, "dur": 37, "args": { "External id": 32243, "cbid": 41, "correlation": 32243 } }, { "ph": "s", "id": 32243, "pid": 76337, "tid": -914061504, "ts": 1716454218008095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218008133, "dur": 4, "args": { "External id": 32244, "cbid": 131, "correlation": 32244 } }, { "ph": "f", "id": 32244, "pid": 76337, "tid": -914061504, "ts": 1716454218008133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218008387, "dur": 3, "args": { "External id": 32252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32252, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32252, "pid": 5, "tid": 7, "ts": 1716454218008387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218008370, "dur": 16, "args": { "External id": 32252, "cbid": 211, "correlation": 32252 } }, { "ph": "s", "id": 32252, "pid": 76337, "tid": -914061504, "ts": 1716454218008370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218008415, "dur": 4, "args": { "External id": 32261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32261, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32261, "pid": 5, "tid": 7, "ts": 1716454218008415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218008406, "dur": 8, "args": { "External id": 32261, "cbid": 211, "correlation": 32261 } }, { "ph": "s", "id": 32261, "pid": 76337, "tid": -914061504, "ts": 1716454218008406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218008445, "dur": 3, "args": { "External id": 32269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32269, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32269, "pid": 5, "tid": 7, "ts": 1716454218008445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218008435, "dur": 9, "args": { "External id": 32269, "cbid": 211, "correlation": 32269 } }, { "ph": "s", "id": 32269, "pid": 76337, "tid": -914061504, "ts": 1716454218008435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218008733, "dur": 1, "args": { "External id": 32279, "device": 5, "context": 1, "stream": 7, "correlation": 32279, "bytes": 4, "memory bandwidth (GB/s)": 0.0021551724137931034 } }, { "ph": "f", "id": 32279, "pid": 5, "tid": 7, "ts": 1716454218008733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218008705, "dur": 26, "args": { "External id": 32279, "cbid": 41, "correlation": 32279 } }, { "ph": "s", "id": 32279, "pid": 76337, "tid": -914061504, "ts": 1716454218008705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218008732, "dur": 9, "args": { "External id": 32280, "cbid": 131, "correlation": 32280 } }, { "ph": "f", "id": 32280, "pid": 76337, "tid": -914061504, "ts": 1716454218008732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009014, "dur": 3, "args": { "External id": 32288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32288, "pid": 5, "tid": 7, "ts": 1716454218009014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218008995, "dur": 19, "args": { "External id": 32288, "cbid": 211, "correlation": 32288 } }, { "ph": "s", "id": 32288, "pid": 76337, "tid": -914061504, "ts": 1716454218008995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218009137, "dur": 3, "args": { "External id": 32297, "device": 5, "context": 1, "stream": 7, "correlation": 32297, "bytes": 4, "memory bandwidth (GB/s)": 0.0011467889908256881 } }, { "ph": "f", "id": 32297, "pid": 5, "tid": 7, "ts": 1716454218009137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009111, "dur": 26, "args": { "External id": 32297, "cbid": 41, "correlation": 32297 } }, { "ph": "s", "id": 32297, "pid": 76337, "tid": -914061504, "ts": 1716454218009111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218009212, "dur": 1, "args": { "External id": 32308, "device": 5, "context": 1, "stream": 7, "correlation": 32308, "bytes": 4, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 32308, "pid": 5, "tid": 7, "ts": 1716454218009212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009184, "dur": 26, "args": { "External id": 32308, "cbid": 41, "correlation": 32308 } }, { "ph": "s", "id": 32308, "pid": 76337, "tid": -914061504, "ts": 1716454218009184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009211, "dur": 8, "args": { "External id": 32309, "cbid": 131, "correlation": 32309 } }, { "ph": "f", "id": 32309, "pid": 76337, "tid": -914061504, "ts": 1716454218009211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009268, "dur": 3, "args": { "External id": 32317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32317, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32317, "pid": 5, "tid": 7, "ts": 1716454218009268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009254, "dur": 14, "args": { "External id": 32317, "cbid": 211, "correlation": 32317 } }, { "ph": "s", "id": 32317, "pid": 76337, "tid": -914061504, "ts": 1716454218009254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009298, "dur": 3, "args": { "External id": 32327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32327, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32327, "pid": 5, "tid": 7, "ts": 1716454218009298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009289, "dur": 8, "args": { "External id": 32327, "cbid": 211, "correlation": 32327 } }, { "ph": "s", "id": 32327, "pid": 76337, "tid": -914061504, "ts": 1716454218009289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009321, "dur": 3, "args": { "External id": 32336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32336, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32336, "pid": 5, "tid": 7, "ts": 1716454218009321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009312, "dur": 7, "args": { "External id": 32336, "cbid": 211, "correlation": 32336 } }, { "ph": "s", "id": 32336, "pid": 76337, "tid": -914061504, "ts": 1716454218009312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009422, "dur": 3, "args": { "External id": 32344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32344, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32344, "pid": 5, "tid": 7, "ts": 1716454218009422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009360, "dur": 62, "args": { "External id": 32344, "cbid": 211, "correlation": 32344 } }, { "ph": "s", "id": 32344, "pid": 76337, "tid": -914061504, "ts": 1716454218009360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218009495, "dur": 3, "args": { "External id": 32352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32352, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32352, "pid": 5, "tid": 7, "ts": 1716454218009495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009483, "dur": 12, "args": { "External id": 32352, "cbid": 211, "correlation": 32352 } }, { "ph": "s", "id": 32352, "pid": 76337, "tid": -914061504, "ts": 1716454218009483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218009555, "dur": 2, "args": { "External id": 32360, "device": 5, "context": 1, "stream": 7, "correlation": 32360, "bytes": 8, "memory bandwidth (GB/s)": 0.00390625 } }, { "ph": "f", "id": 32360, "pid": 5, "tid": 7, "ts": 1716454218009555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009538, "dur": 27, "args": { "External id": 32360, "cbid": 41, "correlation": 32360 } }, { "ph": "s", "id": 32360, "pid": 76337, "tid": -914061504, "ts": 1716454218009538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009566, "dur": 3, "args": { "External id": 32361, "cbid": 131, "correlation": 32361 } }, { "ph": "f", "id": 32361, "pid": 76337, "tid": -914061504, "ts": 1716454218009566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218009648, "dur": 1, "args": { "External id": 32371, "device": 5, "context": 1, "stream": 7, "correlation": 32371, "bytes": 42, "memory bandwidth (GB/s)": 0.027925531914893616 } }, { "ph": "f", "id": 32371, "pid": 5, "tid": 7, "ts": 1716454218009648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009634, "dur": 12, "args": { "External id": 32371, "cbid": 41, "correlation": 32371 } }, { "ph": "s", "id": 32371, "pid": 76337, "tid": -914061504, "ts": 1716454218009634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009646, "dur": 8, "args": { "External id": 32372, "cbid": 131, "correlation": 32372 } }, { "ph": "f", "id": 32372, "pid": 76337, "tid": -914061504, "ts": 1716454218009646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218009696, "dur": 1, "args": { "External id": 32381, "device": 5, "context": 1, "stream": 7, "correlation": 32381, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 32381, "pid": 5, "tid": 7, "ts": 1716454218009696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009686, "dur": 8, "args": { "External id": 32381, "cbid": 41, "correlation": 32381 } }, { "ph": "s", "id": 32381, "pid": 76337, "tid": -914061504, "ts": 1716454218009686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009695, "dur": 8, "args": { "External id": 32382, "cbid": 131, "correlation": 32382 } }, { "ph": "f", "id": 32382, "pid": 76337, "tid": -914061504, "ts": 1716454218009695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218009760, "dur": 3, "args": { "External id": 32389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32389, "pid": 5, "tid": 7, "ts": 1716454218009760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009744, "dur": 16, "args": { "External id": 32389, "cbid": 211, "correlation": 32389 } }, { "ph": "s", "id": 32389, "pid": 76337, "tid": -914061504, "ts": 1716454218009744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454218009787, "dur": 3, "args": { "External id": 32409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32409, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32409, "pid": 5, "tid": 7, "ts": 1716454218009787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009778, "dur": 8, "args": { "External id": 32409, "cbid": 211, "correlation": 32409 } }, { "ph": "s", "id": 32409, "pid": 76337, "tid": -914061504, "ts": 1716454218009778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009787, "dur": 0, "args": { "External id": 32410, "cbid": 11, "correlation": 32410 } }, { "ph": "f", "id": 32410, "pid": 76337, "tid": -914061504, "ts": 1716454218009787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009787, "dur": 0, "args": { "External id": 32411, "cbid": 11, "correlation": 32411 } }, { "ph": "f", "id": 32411, "pid": 76337, "tid": -914061504, "ts": 1716454218009787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218009813, "dur": 1, "args": { "External id": 32414, "device": 5, "context": 1, "stream": 7, "correlation": 32414, "bytes": 4, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 32414, "pid": 5, "tid": 7, "ts": 1716454218009813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009789, "dur": 33, "args": { "External id": 32414, "cbid": 41, "correlation": 32414 } }, { "ph": "s", "id": 32414, "pid": 76337, "tid": -914061504, "ts": 1716454218009789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009823, "dur": 3, "args": { "External id": 32415, "cbid": 131, "correlation": 32415 } }, { "ph": "f", "id": 32415, "pid": 76337, "tid": -914061504, "ts": 1716454218009823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454218009858, "dur": 2, "args": { "External id": 32439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32439, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32439, "pid": 5, "tid": 7, "ts": 1716454218009858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009848, "dur": 9, "args": { "External id": 32439, "cbid": 211, "correlation": 32439 } }, { "ph": "s", "id": 32439, "pid": 76337, "tid": -914061504, "ts": 1716454218009848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009858, "dur": 0, "args": { "External id": 32440, "cbid": 11, "correlation": 32440 } }, { "ph": "f", "id": 32440, "pid": 76337, "tid": -914061504, "ts": 1716454218009858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009858, "dur": 0, "args": { "External id": 32441, "cbid": 11, "correlation": 32441 } }, { "ph": "f", "id": 32441, "pid": 76337, "tid": -914061504, "ts": 1716454218009858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218009859, "dur": 0, "args": { "External id": 32443, "cbid": 200, "correlation": 32443 } }, { "ph": "f", "id": 32443, "pid": 76337, "tid": -914061504, "ts": 1716454218009859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454218009868, "dur": 3, "args": { "External id": 32445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32445, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32445, "pid": 5, "tid": 7, "ts": 1716454218009868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218009861, "dur": 7, "args": { "External id": 32445, "cbid": 211, "correlation": 32445 } }, { "ph": "s", "id": 32445, "pid": 76337, "tid": -914061504, "ts": 1716454218009861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009868, "dur": 0, "args": { "External id": 32446, "cbid": 11, "correlation": 32446 } }, { "ph": "f", "id": 32446, "pid": 76337, "tid": -914061504, "ts": 1716454218009868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218009869, "dur": 0, "args": { "External id": 32447, "cbid": 11, "correlation": 32447 } }, { "ph": "f", "id": 32447, "pid": 76337, "tid": -914061504, "ts": 1716454218009869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218009902, "dur": 2, "args": { "External id": 32454, "device": 5, "context": 1, "stream": 7, "correlation": 32454, "bytes": 8, "memory bandwidth (GB/s)": 0.0038461538461538464 } }, { "ph": "f", "id": 32454, "pid": 5, "tid": 7, "ts": 1716454218009902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009888, "dur": 23, "args": { "External id": 32454, "cbid": 41, "correlation": 32454 } }, { "ph": "s", "id": 32454, "pid": 76337, "tid": -914061504, "ts": 1716454218009888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009912, "dur": 3, "args": { "External id": 32455, "cbid": 131, "correlation": 32455 } }, { "ph": "f", "id": 32455, "pid": 76337, "tid": -914061504, "ts": 1716454218009912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218009963, "dur": 1, "args": { "External id": 32465, "device": 5, "context": 1, "stream": 7, "correlation": 32465, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 32465, "pid": 5, "tid": 7, "ts": 1716454218009963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218009950, "dur": 10, "args": { "External id": 32465, "cbid": 41, "correlation": 32465 } }, { "ph": "s", "id": 32465, "pid": 76337, "tid": -914061504, "ts": 1716454218009950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218009961, "dur": 8, "args": { "External id": 32466, "cbid": 131, "correlation": 32466 } }, { "ph": "f", "id": 32466, "pid": 76337, "tid": -914061504, "ts": 1716454218009961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218010283, "dur": 4, "args": { "External id": 32473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32473, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32473, "pid": 5, "tid": 7, "ts": 1716454218010283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010034, "dur": 251, "args": { "External id": 32473, "cbid": 211, "correlation": 32473 } }, { "ph": "s", "id": 32473, "pid": 76337, "tid": -914061504, "ts": 1716454218010034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010451, "dur": 3, "args": { "External id": 32482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32482, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32482, "pid": 5, "tid": 7, "ts": 1716454218010451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010395, "dur": 57, "args": { "External id": 32482, "cbid": 211, "correlation": 32482 } }, { "ph": "s", "id": 32482, "pid": 76337, "tid": -914061504, "ts": 1716454218010395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010492, "dur": 3, "args": { "External id": 32490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32490, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32490, "pid": 5, "tid": 7, "ts": 1716454218010492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010481, "dur": 10, "args": { "External id": 32490, "cbid": 211, "correlation": 32490 } }, { "ph": "s", "id": 32490, "pid": 76337, "tid": -914061504, "ts": 1716454218010481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010585, "dur": 3, "args": { "External id": 32498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32498, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32498, "pid": 5, "tid": 7, "ts": 1716454218010585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010528, "dur": 58, "args": { "External id": 32498, "cbid": 211, "correlation": 32498 } }, { "ph": "s", "id": 32498, "pid": 76337, "tid": -914061504, "ts": 1716454218010528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010687, "dur": 3, "args": { "External id": 32506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32506, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32506, "pid": 5, "tid": 7, "ts": 1716454218010687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010628, "dur": 59, "args": { "External id": 32506, "cbid": 211, "correlation": 32506 } }, { "ph": "s", "id": 32506, "pid": 76337, "tid": -914061504, "ts": 1716454218010628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010724, "dur": 3, "args": { "External id": 32514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32514, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32514, "pid": 5, "tid": 7, "ts": 1716454218010724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010713, "dur": 10, "args": { "External id": 32514, "cbid": 211, "correlation": 32514 } }, { "ph": "s", "id": 32514, "pid": 76337, "tid": -914061504, "ts": 1716454218010713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010791, "dur": 3, "args": { "External id": 32522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32522, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32522, "pid": 5, "tid": 7, "ts": 1716454218010791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010742, "dur": 49, "args": { "External id": 32522, "cbid": 211, "correlation": 32522 } }, { "ph": "s", "id": 32522, "pid": 76337, "tid": -914061504, "ts": 1716454218010742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218010823, "dur": 6, "args": { "External id": 32530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32530, "pid": 5, "tid": 7, "ts": 1716454218010823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010813, "dur": 10, "args": { "External id": 32530, "cbid": 211, "correlation": 32530 } }, { "ph": "s", "id": 32530, "pid": 76337, "tid": -914061504, "ts": 1716454218010813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218010844, "dur": 5, "args": { "External id": 32538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32538, "pid": 5, "tid": 7, "ts": 1716454218010844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010835, "dur": 7, "args": { "External id": 32538, "cbid": 211, "correlation": 32538 } }, { "ph": "s", "id": 32538, "pid": 76337, "tid": -914061504, "ts": 1716454218010835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010864, "dur": 3, "args": { "External id": 32546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32546, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32546, "pid": 5, "tid": 7, "ts": 1716454218010864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010856, "dur": 7, "args": { "External id": 32546, "cbid": 211, "correlation": 32546 } }, { "ph": "s", "id": 32546, "pid": 76337, "tid": -914061504, "ts": 1716454218010856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218010990, "dur": 3, "args": { "External id": 32554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32554, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32554, "pid": 5, "tid": 7, "ts": 1716454218010990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218010925, "dur": 65, "args": { "External id": 32554, "cbid": 211, "correlation": 32554 } }, { "ph": "s", "id": 32554, "pid": 76337, "tid": -914061504, "ts": 1716454218010925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218011027, "dur": 5, "args": { "External id": 32562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32562, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32562, "pid": 5, "tid": 7, "ts": 1716454218011027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011016, "dur": 10, "args": { "External id": 32562, "cbid": 211, "correlation": 32562 } }, { "ph": "s", "id": 32562, "pid": 76337, "tid": -914061504, "ts": 1716454218011016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218011051, "dur": 4, "args": { "External id": 32570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32570, "pid": 5, "tid": 7, "ts": 1716454218011051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011042, "dur": 8, "args": { "External id": 32570, "cbid": 211, "correlation": 32570 } }, { "ph": "s", "id": 32570, "pid": 76337, "tid": -914061504, "ts": 1716454218011042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011072, "dur": 3, "args": { "External id": 32578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32578, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 32578, "pid": 5, "tid": 7, "ts": 1716454218011072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011063, "dur": 7, "args": { "External id": 32578, "cbid": 211, "correlation": 32578 } }, { "ph": "s", "id": 32578, "pid": 76337, "tid": -914061504, "ts": 1716454218011063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218011452, "dur": 5, "args": { "External id": 32587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32587, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32587, "pid": 5, "tid": 7, "ts": 1716454218011452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011434, "dur": 19, "args": { "External id": 32587, "cbid": 211, "correlation": 32587 } }, { "ph": "s", "id": 32587, "pid": 76337, "tid": -914061504, "ts": 1716454218011434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218011489, "dur": 5, "args": { "External id": 32596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32596, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32596, "pid": 5, "tid": 7, "ts": 1716454218011489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011479, "dur": 9, "args": { "External id": 32596, "cbid": 211, "correlation": 32596 } }, { "ph": "s", "id": 32596, "pid": 76337, "tid": -914061504, "ts": 1716454218011479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454218011630, "dur": 3, "args": { "External id": 32612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32612, "pid": 5, "tid": 7, "ts": 1716454218011630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011615, "dur": 16, "args": { "External id": 32612, "cbid": 211, "correlation": 32612 } }, { "ph": "s", "id": 32612, "pid": 76337, "tid": -914061504, "ts": 1716454218011615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011667, "dur": 3, "args": { "External id": 32620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32620, "pid": 5, "tid": 7, "ts": 1716454218011667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011657, "dur": 9, "args": { "External id": 32620, "cbid": 211, "correlation": 32620 } }, { "ph": "s", "id": 32620, "pid": 76337, "tid": -914061504, "ts": 1716454218011657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011699, "dur": 3, "args": { "External id": 32628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32628, "pid": 5, "tid": 7, "ts": 1716454218011699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011689, "dur": 9, "args": { "External id": 32628, "cbid": 211, "correlation": 32628 } }, { "ph": "s", "id": 32628, "pid": 76337, "tid": -914061504, "ts": 1716454218011689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011731, "dur": 4, "args": { "External id": 32636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32636, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32636, "pid": 5, "tid": 7, "ts": 1716454218011731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011721, "dur": 9, "args": { "External id": 32636, "cbid": 211, "correlation": 32636 } }, { "ph": "s", "id": 32636, "pid": 76337, "tid": -914061504, "ts": 1716454218011721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454218011789, "dur": 4, "args": { "External id": 32648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32648, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32648, "pid": 5, "tid": 7, "ts": 1716454218011789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011776, "dur": 13, "args": { "External id": 32648, "cbid": 211, "correlation": 32648 } }, { "ph": "s", "id": 32648, "pid": 76337, "tid": -914061504, "ts": 1716454218011776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218011836, "dur": 4, "args": { "External id": 32659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32659, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32659, "pid": 5, "tid": 7, "ts": 1716454218011836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011824, "dur": 12, "args": { "External id": 32659, "cbid": 211, "correlation": 32659 } }, { "ph": "s", "id": 32659, "pid": 76337, "tid": -914061504, "ts": 1716454218011824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011867, "dur": 3, "args": { "External id": 32667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32667, "pid": 5, "tid": 7, "ts": 1716454218011867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011857, "dur": 9, "args": { "External id": 32667, "cbid": 211, "correlation": 32667 } }, { "ph": "s", "id": 32667, "pid": 76337, "tid": -914061504, "ts": 1716454218011857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011898, "dur": 5, "args": { "External id": 32675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32675, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32675, "pid": 5, "tid": 7, "ts": 1716454218011898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011888, "dur": 9, "args": { "External id": 32675, "cbid": 211, "correlation": 32675 } }, { "ph": "s", "id": 32675, "pid": 76337, "tid": -914061504, "ts": 1716454218011888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218011927, "dur": 5, "args": { "External id": 32683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32683, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32683, "pid": 5, "tid": 7, "ts": 1716454218011927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011918, "dur": 9, "args": { "External id": 32683, "cbid": 211, "correlation": 32683 } }, { "ph": "s", "id": 32683, "pid": 76337, "tid": -914061504, "ts": 1716454218011918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218011959, "dur": 4, "args": { "External id": 32692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32692, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32692, "pid": 5, "tid": 7, "ts": 1716454218011959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218011948, "dur": 10, "args": { "External id": 32692, "cbid": 211, "correlation": 32692 } }, { "ph": "s", "id": 32692, "pid": 76337, "tid": -914061504, "ts": 1716454218011948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218012032, "dur": 4, "args": { "External id": 32705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32705, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32705, "pid": 5, "tid": 7, "ts": 1716454218012032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012018, "dur": 15, "args": { "External id": 32705, "cbid": 211, "correlation": 32705 } }, { "ph": "s", "id": 32705, "pid": 76337, "tid": -914061504, "ts": 1716454218012018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454218012075, "dur": 8, "args": { "External id": 32715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32715, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 32715, "pid": 5, "tid": 7, "ts": 1716454218012075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012063, "dur": 12, "args": { "External id": 32715, "cbid": 211, "correlation": 32715 } }, { "ph": "s", "id": 32715, "pid": 76337, "tid": -914061504, "ts": 1716454218012063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218012185, "dur": 3, "args": { "External id": 32732, "cbid": 251, "correlation": 32732 } }, { "ph": "f", "id": 32732, "pid": 76337, "tid": -914061504, "ts": 1716454218012185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454218012219, "dur": 11, "args": { "External id": 32734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32734, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32734, "pid": 5, "tid": 7, "ts": 1716454218012219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012197, "dur": 23, "args": { "External id": 32734, "cbid": 211, "correlation": 32734 } }, { "ph": "s", "id": 32734, "pid": 76337, "tid": -914061504, "ts": 1716454218012197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218012277, "dur": 4, "args": { "External id": 32742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32742, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32742, "pid": 5, "tid": 7, "ts": 1716454218012277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012265, "dur": 11, "args": { "External id": 32742, "cbid": 211, "correlation": 32742 } }, { "ph": "s", "id": 32742, "pid": 76337, "tid": -914061504, "ts": 1716454218012265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218012332, "dur": 2, "args": { "External id": 32758, "cbid": 251, "correlation": 32758 } }, { "ph": "f", "id": 32758, "pid": 76337, "tid": -914061504, "ts": 1716454218012332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218012338, "dur": 0, "args": { "External id": 32760, "cbid": 251, "correlation": 32760 } }, { "ph": "f", "id": 32760, "pid": 76337, "tid": -914061504, "ts": 1716454218012338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218012353, "dur": 13, "args": { "External id": 32761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32761, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32761, "pid": 5, "tid": 7, "ts": 1716454218012353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012340, "dur": 14, "args": { "External id": 32761, "cbid": 211, "correlation": 32761 } }, { "ph": "s", "id": 32761, "pid": 76337, "tid": -914061504, "ts": 1716454218012340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218012368, "dur": 5, "args": { "External id": 32763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32763, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32763, "pid": 5, "tid": 7, "ts": 1716454218012368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012358, "dur": 9, "args": { "External id": 32763, "cbid": 211, "correlation": 32763 } }, { "ph": "s", "id": 32763, "pid": 76337, "tid": -914061504, "ts": 1716454218012358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218012454, "dur": 1, "args": { "External id": 32773, "cbid": 317, "correlation": 32773 } }, { "ph": "f", "id": 32773, "pid": 76337, "tid": -914061504, "ts": 1716454218012454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218012456, "dur": 1, "args": { "External id": 32774, "cbid": 203, "correlation": 32774 } }, { "ph": "f", "id": 32774, "pid": 76337, "tid": -914061504, "ts": 1716454218012456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218012458, "dur": 1, "args": { "External id": 32775, "cbid": 205, "correlation": 32775 } }, { "ph": "f", "id": 32775, "pid": 76337, "tid": -914061504, "ts": 1716454218012458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218012503, "dur": 6, "args": { "External id": 32779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32779, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32779, "pid": 5, "tid": 7, "ts": 1716454218012503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012489, "dur": 14, "args": { "External id": 32779, "cbid": 211, "correlation": 32779 } }, { "ph": "s", "id": 32779, "pid": 76337, "tid": -914061504, "ts": 1716454218012489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218012515, "dur": 4, "args": { "External id": 32781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32781, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 32781, "pid": 5, "tid": 7, "ts": 1716454218012515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012507, "dur": 7, "args": { "External id": 32781, "cbid": 211, "correlation": 32781 } }, { "ph": "s", "id": 32781, "pid": 76337, "tid": -914061504, "ts": 1716454218012507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218012531, "dur": 3, "args": { "External id": 32783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32783, "pid": 5, "tid": 7, "ts": 1716454218012531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012523, "dur": 7, "args": { "External id": 32783, "cbid": 211, "correlation": 32783 } }, { "ph": "s", "id": 32783, "pid": 76337, "tid": -914061504, "ts": 1716454218012523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218012534, "dur": 0, "args": { "External id": 32784, "cbid": 51, "correlation": 32784 } }, { "ph": "s", "id": 32784, "pid": 76337, "tid": -914061504, "ts": 1716454218012534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218012545, "dur": 83, "args": { "External id": 32785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32785, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32785, "pid": 5, "tid": 7, "ts": 1716454218012545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012536, "dur": 7, "args": { "External id": 32785, "cbid": 211, "correlation": 32785 } }, { "ph": "s", "id": 32785, "pid": 76337, "tid": -914061504, "ts": 1716454218012536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218012629, "dur": 60, "args": { "External id": 32790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32790, "pid": 5, "tid": 7, "ts": 1716454218012629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218012572, "dur": 9, "args": { "External id": 32790, "cbid": 211, "correlation": 32790 } }, { "ph": "s", "id": 32790, "pid": 76337, "tid": -914061504, "ts": 1716454218012572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218014337, "dur": 50, "args": { "External id": 32810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32810, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 32810, "pid": 5, "tid": 7, "ts": 1716454218014337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014321, "dur": 16, "args": { "External id": 32810, "cbid": 211, "correlation": 32810 } }, { "ph": "s", "id": 32810, "pid": 76337, "tid": -914061504, "ts": 1716454218014321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218014389, "dur": 4, "args": { "External id": 32822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32822, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32822, "pid": 5, "tid": 7, "ts": 1716454218014389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014349, "dur": 8, "args": { "External id": 32822, "cbid": 211, "correlation": 32822 } }, { "ph": "s", "id": 32822, "pid": 76337, "tid": -914061504, "ts": 1716454218014349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218014395, "dur": 56, "args": { "External id": 32825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32825, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32825, "pid": 5, "tid": 7, "ts": 1716454218014395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014373, "dur": 7, "args": { "External id": 32825, "cbid": 211, "correlation": 32825 } }, { "ph": "s", "id": 32825, "pid": 76337, "tid": -914061504, "ts": 1716454218014373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218014452, "dur": 36, "args": { "External id": 32834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32834, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32834, "pid": 5, "tid": 7, "ts": 1716454218014452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014418, "dur": 10, "args": { "External id": 32834, "cbid": 211, "correlation": 32834 } }, { "ph": "s", "id": 32834, "pid": 76337, "tid": -914061504, "ts": 1716454218014418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218014475, "dur": 0, "args": { "External id": 32844, "cbid": 317, "correlation": 32844 } }, { "ph": "f", "id": 32844, "pid": 76337, "tid": -914061504, "ts": 1716454218014475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218014476, "dur": 0, "args": { "External id": 32845, "cbid": 203, "correlation": 32845 } }, { "ph": "f", "id": 32845, "pid": 76337, "tid": -914061504, "ts": 1716454218014476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218014477, "dur": 0, "args": { "External id": 32846, "cbid": 205, "correlation": 32846 } }, { "ph": "f", "id": 32846, "pid": 76337, "tid": -914061504, "ts": 1716454218014477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218014507, "dur": 40, "args": { "External id": 32850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32850, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32850, "pid": 5, "tid": 7, "ts": 1716454218014507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014494, "dur": 12, "args": { "External id": 32850, "cbid": 211, "correlation": 32850 } }, { "ph": "s", "id": 32850, "pid": 76337, "tid": -914061504, "ts": 1716454218014494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218014549, "dur": 14, "args": { "External id": 32852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32852, "pid": 5, "tid": 7, "ts": 1716454218014549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014509, "dur": 6, "args": { "External id": 32852, "cbid": 211, "correlation": 32852 } }, { "ph": "s", "id": 32852, "pid": 76337, "tid": -914061504, "ts": 1716454218014509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218014564, "dur": 3, "args": { "External id": 32854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32854, "pid": 5, "tid": 7, "ts": 1716454218014564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014520, "dur": 6, "args": { "External id": 32854, "cbid": 211, "correlation": 32854 } }, { "ph": "s", "id": 32854, "pid": 76337, "tid": -914061504, "ts": 1716454218014520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218014530, "dur": 0, "args": { "External id": 32855, "cbid": 51, "correlation": 32855 } }, { "ph": "s", "id": 32855, "pid": 76337, "tid": -914061504, "ts": 1716454218014530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218014568, "dur": 694, "args": { "External id": 32856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32856, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32856, "pid": 5, "tid": 7, "ts": 1716454218014568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014531, "dur": 6, "args": { "External id": 32856, "cbid": 211, "correlation": 32856 } }, { "ph": "s", "id": 32856, "pid": 76337, "tid": -914061504, "ts": 1716454218014531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218015264, "dur": 59, "args": { "External id": 32861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32861, "pid": 5, "tid": 7, "ts": 1716454218015264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014560, "dur": 9, "args": { "External id": 32861, "cbid": 211, "correlation": 32861 } }, { "ph": "s", "id": 32861, "pid": 76337, "tid": -914061504, "ts": 1716454218014560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218015324, "dur": 3, "args": { "External id": 32869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32869, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32869, "pid": 5, "tid": 7, "ts": 1716454218015324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014604, "dur": 9, "args": { "External id": 32869, "cbid": 211, "correlation": 32869 } }, { "ph": "s", "id": 32869, "pid": 76337, "tid": -914061504, "ts": 1716454218014604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218014670, "dur": 2, "args": { "External id": 32885, "cbid": 251, "correlation": 32885 } }, { "ph": "f", "id": 32885, "pid": 76337, "tid": -914061504, "ts": 1716454218014670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218014676, "dur": 0, "args": { "External id": 32887, "cbid": 251, "correlation": 32887 } }, { "ph": "f", "id": 32887, "pid": 76337, "tid": -914061504, "ts": 1716454218014676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218015329, "dur": 9, "args": { "External id": 32888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32888, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 32888, "pid": 5, "tid": 7, "ts": 1716454218015329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014678, "dur": 12, "args": { "External id": 32888, "cbid": 211, "correlation": 32888 } }, { "ph": "s", "id": 32888, "pid": 76337, "tid": -914061504, "ts": 1716454218014678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218015339, "dur": 4, "args": { "External id": 32890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32890, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 32890, "pid": 5, "tid": 7, "ts": 1716454218015339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014692, "dur": 6, "args": { "External id": 32890, "cbid": 211, "correlation": 32890 } }, { "ph": "s", "id": 32890, "pid": 76337, "tid": -914061504, "ts": 1716454218014692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218015344, "dur": 53, "args": { "External id": 32900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32900, "pid": 5, "tid": 7, "ts": 1716454218015344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014751, "dur": 12, "args": { "External id": 32900, "cbid": 211, "correlation": 32900 } }, { "ph": "s", "id": 32900, "pid": 76337, "tid": -914061504, "ts": 1716454218014751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218015399, "dur": 51, "args": { "External id": 32920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32920, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 32920, "pid": 5, "tid": 7, "ts": 1716454218015399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014816, "dur": 11, "args": { "External id": 32920, "cbid": 211, "correlation": 32920 } }, { "ph": "s", "id": 32920, "pid": 76337, "tid": -914061504, "ts": 1716454218014816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218015452, "dur": 4, "args": { "External id": 32932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32932, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32932, "pid": 5, "tid": 7, "ts": 1716454218015452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014838, "dur": 6, "args": { "External id": 32932, "cbid": 211, "correlation": 32932 } }, { "ph": "s", "id": 32932, "pid": 76337, "tid": -914061504, "ts": 1716454218014838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218015457, "dur": 55, "args": { "External id": 32935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32935, "pid": 5, "tid": 7, "ts": 1716454218015457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014857, "dur": 6, "args": { "External id": 32935, "cbid": 211, "correlation": 32935 } }, { "ph": "s", "id": 32935, "pid": 76337, "tid": -914061504, "ts": 1716454218014857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218015513, "dur": 37, "args": { "External id": 32944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32944, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32944, "pid": 5, "tid": 7, "ts": 1716454218015513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014897, "dur": 9, "args": { "External id": 32944, "cbid": 211, "correlation": 32944 } }, { "ph": "s", "id": 32944, "pid": 76337, "tid": -914061504, "ts": 1716454218014897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218014966, "dur": 0, "args": { "External id": 32954, "cbid": 317, "correlation": 32954 } }, { "ph": "f", "id": 32954, "pid": 76337, "tid": -914061504, "ts": 1716454218014966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218014967, "dur": 0, "args": { "External id": 32955, "cbid": 203, "correlation": 32955 } }, { "ph": "f", "id": 32955, "pid": 76337, "tid": -914061504, "ts": 1716454218014967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218014968, "dur": 0, "args": { "External id": 32956, "cbid": 205, "correlation": 32956 } }, { "ph": "f", "id": 32956, "pid": 76337, "tid": -914061504, "ts": 1716454218014968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218015551, "dur": 41, "args": { "External id": 32960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32960, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32960, "pid": 5, "tid": 7, "ts": 1716454218015551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218014990, "dur": 12, "args": { "External id": 32960, "cbid": 211, "correlation": 32960 } }, { "ph": "s", "id": 32960, "pid": 76337, "tid": -914061504, "ts": 1716454218014990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218015593, "dur": 14, "args": { "External id": 32962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32962, "pid": 5, "tid": 7, "ts": 1716454218015593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015005, "dur": 5, "args": { "External id": 32962, "cbid": 211, "correlation": 32962 } }, { "ph": "s", "id": 32962, "pid": 76337, "tid": -914061504, "ts": 1716454218015005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218015609, "dur": 3, "args": { "External id": 32964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 32964, "pid": 5, "tid": 7, "ts": 1716454218015609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015014, "dur": 5, "args": { "External id": 32964, "cbid": 211, "correlation": 32964 } }, { "ph": "s", "id": 32964, "pid": 76337, "tid": -914061504, "ts": 1716454218015014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218015023, "dur": 0, "args": { "External id": 32965, "cbid": 51, "correlation": 32965 } }, { "ph": "s", "id": 32965, "pid": 76337, "tid": -914061504, "ts": 1716454218015023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218015613, "dur": 687, "args": { "External id": 32966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32966, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 32966, "pid": 5, "tid": 7, "ts": 1716454218015613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015023, "dur": 5, "args": { "External id": 32966, "cbid": 211, "correlation": 32966 } }, { "ph": "s", "id": 32966, "pid": 76337, "tid": -914061504, "ts": 1716454218015023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218016302, "dur": 59, "args": { "External id": 32971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32971, "pid": 5, "tid": 7, "ts": 1716454218016302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015052, "dur": 8, "args": { "External id": 32971, "cbid": 211, "correlation": 32971 } }, { "ph": "s", "id": 32971, "pid": 76337, "tid": -914061504, "ts": 1716454218015052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218016362, "dur": 50, "args": { "External id": 32979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32979, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32979, "pid": 5, "tid": 7, "ts": 1716454218016362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015085, "dur": 9, "args": { "External id": 32979, "cbid": 211, "correlation": 32979 } }, { "ph": "s", "id": 32979, "pid": 76337, "tid": -914061504, "ts": 1716454218015085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218016413, "dur": 35, "args": { "External id": 32987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 32987, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 32987, "pid": 5, "tid": 7, "ts": 1716454218016413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015118, "dur": 9, "args": { "External id": 32987, "cbid": 211, "correlation": 32987 } }, { "ph": "s", "id": 32987, "pid": 76337, "tid": -914061504, "ts": 1716454218015118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218016449, "dur": 51, "args": { "External id": 33007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33007, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 33007, "pid": 5, "tid": 7, "ts": 1716454218016449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015200, "dur": 12, "args": { "External id": 33007, "cbid": 211, "correlation": 33007 } }, { "ph": "s", "id": 33007, "pid": 76337, "tid": -914061504, "ts": 1716454218015200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218016501, "dur": 4, "args": { "External id": 33019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33019, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33019, "pid": 5, "tid": 7, "ts": 1716454218016501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015222, "dur": 6, "args": { "External id": 33019, "cbid": 211, "correlation": 33019 } }, { "ph": "s", "id": 33019, "pid": 76337, "tid": -914061504, "ts": 1716454218015222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218016506, "dur": 55, "args": { "External id": 33022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33022, "pid": 5, "tid": 7, "ts": 1716454218016506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015239, "dur": 7, "args": { "External id": 33022, "cbid": 211, "correlation": 33022 } }, { "ph": "s", "id": 33022, "pid": 76337, "tid": -914061504, "ts": 1716454218015239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218015298, "dur": 0, "args": { "External id": 33033, "cbid": 317, "correlation": 33033 } }, { "ph": "f", "id": 33033, "pid": 76337, "tid": -914061504, "ts": 1716454218015298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218015298, "dur": 0, "args": { "External id": 33034, "cbid": 203, "correlation": 33034 } }, { "ph": "f", "id": 33034, "pid": 76337, "tid": -914061504, "ts": 1716454218015298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218015299, "dur": 0, "args": { "External id": 33035, "cbid": 205, "correlation": 33035 } }, { "ph": "f", "id": 33035, "pid": 76337, "tid": -914061504, "ts": 1716454218015299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015333, "dur": 2, "args": { "External id": 33039, "cbid": 251, "correlation": 33039 } }, { "ph": "f", "id": 33039, "pid": 76337, "tid": -914061504, "ts": 1716454218015333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015337, "dur": 1, "args": { "External id": 33040, "cbid": 251, "correlation": 33040 } }, { "ph": "f", "id": 33040, "pid": 76337, "tid": -914061504, "ts": 1716454218015337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015339, "dur": 1, "args": { "External id": 33041, "cbid": 251, "correlation": 33041 } }, { "ph": "f", "id": 33041, "pid": 76337, "tid": -914061504, "ts": 1716454218015339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015341, "dur": 1, "args": { "External id": 33042, "cbid": 251, "correlation": 33042 } }, { "ph": "f", "id": 33042, "pid": 76337, "tid": -914061504, "ts": 1716454218015341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015343, "dur": 1, "args": { "External id": 33043, "cbid": 251, "correlation": 33043 } }, { "ph": "f", "id": 33043, "pid": 76337, "tid": -914061504, "ts": 1716454218015343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015345, "dur": 1, "args": { "External id": 33044, "cbid": 251, "correlation": 33044 } }, { "ph": "f", "id": 33044, "pid": 76337, "tid": -914061504, "ts": 1716454218015345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015347, "dur": 1, "args": { "External id": 33045, "cbid": 251, "correlation": 33045 } }, { "ph": "f", "id": 33045, "pid": 76337, "tid": -914061504, "ts": 1716454218015347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015349, "dur": 1, "args": { "External id": 33046, "cbid": 251, "correlation": 33046 } }, { "ph": "f", "id": 33046, "pid": 76337, "tid": -914061504, "ts": 1716454218015349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015352, "dur": 0, "args": { "External id": 33047, "cbid": 251, "correlation": 33047 } }, { "ph": "f", "id": 33047, "pid": 76337, "tid": -914061504, "ts": 1716454218015352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218016563, "dur": 113, "args": { "External id": 33048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33048, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33048, "pid": 5, "tid": 7, "ts": 1716454218016563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015356, "dur": 13, "args": { "External id": 33048, "cbid": 211, "correlation": 33048 } }, { "ph": "s", "id": 33048, "pid": 76337, "tid": -914061504, "ts": 1716454218015356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218016677, "dur": 60, "args": { "External id": 33054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33054, "pid": 5, "tid": 7, "ts": 1716454218016677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015391, "dur": 9, "args": { "External id": 33054, "cbid": 211, "correlation": 33054 } }, { "ph": "s", "id": 33054, "pid": 76337, "tid": -914061504, "ts": 1716454218015391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218016738, "dur": 619, "args": { "External id": 33063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33063, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33063, "pid": 5, "tid": 7, "ts": 1716454218016738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015505, "dur": 15, "args": { "External id": 33063, "cbid": 211, "correlation": 33063 } }, { "ph": "s", "id": 33063, "pid": 76337, "tid": -914061504, "ts": 1716454218015505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218017358, "dur": 179, "args": { "External id": 33085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33085, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33085, "pid": 5, "tid": 7, "ts": 1716454218017358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015575, "dur": 11, "args": { "External id": 33085, "cbid": 211, "correlation": 33085 } }, { "ph": "s", "id": 33085, "pid": 76337, "tid": -914061504, "ts": 1716454218015575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015690, "dur": 2, "args": { "External id": 33096, "cbid": 251, "correlation": 33096 } }, { "ph": "f", "id": 33096, "pid": 76337, "tid": -914061504, "ts": 1716454218015690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218017539, "dur": 196, "args": { "External id": 33097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33097, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33097, "pid": 5, "tid": 7, "ts": 1716454218017539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015697, "dur": 14, "args": { "External id": 33097, "cbid": 211, "correlation": 33097 } }, { "ph": "s", "id": 33097, "pid": 76337, "tid": -914061504, "ts": 1716454218015697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015768, "dur": 1, "args": { "External id": 33108, "cbid": 251, "correlation": 33108 } }, { "ph": "f", "id": 33108, "pid": 76337, "tid": -914061504, "ts": 1716454218015768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218017736, "dur": 190, "args": { "External id": 33109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33109, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33109, "pid": 5, "tid": 7, "ts": 1716454218017736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015772, "dur": 11, "args": { "External id": 33109, "cbid": 211, "correlation": 33109 } }, { "ph": "s", "id": 33109, "pid": 76337, "tid": -914061504, "ts": 1716454218015772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218015836, "dur": 1, "args": { "External id": 33120, "cbid": 251, "correlation": 33120 } }, { "ph": "f", "id": 33120, "pid": 76337, "tid": -914061504, "ts": 1716454218015836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218017927, "dur": 184, "args": { "External id": 33121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33121, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33121, "pid": 5, "tid": 7, "ts": 1716454218017927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015840, "dur": 11, "args": { "External id": 33121, "cbid": 211, "correlation": 33121 } }, { "ph": "s", "id": 33121, "pid": 76337, "tid": -914061504, "ts": 1716454218015840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218018113, "dur": 18418, "args": { "External id": 33142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33142, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33142, "pid": 5, "tid": 7, "ts": 1716454218018113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218015944, "dur": 20, "args": { "External id": 33142, "cbid": 211, "correlation": 33142 } }, { "ph": "s", "id": 33142, "pid": 76337, "tid": -914061504, "ts": 1716454218015944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016070, "dur": 2, "args": { "External id": 33160, "cbid": 251, "correlation": 33160 } }, { "ph": "f", "id": 33160, "pid": 76337, "tid": -914061504, "ts": 1716454218016070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218036532, "dur": 198, "args": { "External id": 33162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33162, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33162, "pid": 5, "tid": 7, "ts": 1716454218036532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016076, "dur": 14, "args": { "External id": 33162, "cbid": 211, "correlation": 33162 } }, { "ph": "s", "id": 33162, "pid": 76337, "tid": -914061504, "ts": 1716454218016076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218036732, "dur": 66, "args": { "External id": 33170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33170, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33170, "pid": 5, "tid": 7, "ts": 1716454218036732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016149, "dur": 13, "args": { "External id": 33170, "cbid": 211, "correlation": 33170 } }, { "ph": "s", "id": 33170, "pid": 76337, "tid": -914061504, "ts": 1716454218016149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218036799, "dur": 97, "args": { "External id": 33178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33178, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33178, "pid": 5, "tid": 7, "ts": 1716454218036799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016188, "dur": 9, "args": { "External id": 33178, "cbid": 211, "correlation": 33178 } }, { "ph": "s", "id": 33178, "pid": 76337, "tid": -914061504, "ts": 1716454218016188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218036897, "dur": 53, "args": { "External id": 33189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33189, "pid": 5, "tid": 7, "ts": 1716454218036897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016273, "dur": 14, "args": { "External id": 33189, "cbid": 211, "correlation": 33189 } }, { "ph": "s", "id": 33189, "pid": 76337, "tid": -914061504, "ts": 1716454218016273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218036951, "dur": 91, "args": { "External id": 33211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33211, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33211, "pid": 5, "tid": 7, "ts": 1716454218036951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016306, "dur": 8, "args": { "External id": 33211, "cbid": 211, "correlation": 33211 } }, { "ph": "s", "id": 33211, "pid": 76337, "tid": -914061504, "ts": 1716454218016306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016389, "dur": 1, "args": { "External id": 33222, "cbid": 251, "correlation": 33222 } }, { "ph": "f", "id": 33222, "pid": 76337, "tid": -914061504, "ts": 1716454218016389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218037044, "dur": 103, "args": { "External id": 33223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33223, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33223, "pid": 5, "tid": 7, "ts": 1716454218037044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016394, "dur": 13, "args": { "External id": 33223, "cbid": 211, "correlation": 33223 } }, { "ph": "s", "id": 33223, "pid": 76337, "tid": -914061504, "ts": 1716454218016394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016475, "dur": 1, "args": { "External id": 33234, "cbid": 251, "correlation": 33234 } }, { "ph": "f", "id": 33234, "pid": 76337, "tid": -914061504, "ts": 1716454218016475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016479, "dur": 0, "args": { "External id": 33235, "cbid": 251, "correlation": 33235 } }, { "ph": "f", "id": 33235, "pid": 76337, "tid": -914061504, "ts": 1716454218016479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218037148, "dur": 10, "args": { "External id": 33236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33236, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 33236, "pid": 5, "tid": 7, "ts": 1716454218037148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016481, "dur": 13, "args": { "External id": 33236, "cbid": 211, "correlation": 33236 } }, { "ph": "s", "id": 33236, "pid": 76337, "tid": -914061504, "ts": 1716454218016481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218037160, "dur": 5, "args": { "External id": 33238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33238, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 33238, "pid": 5, "tid": 7, "ts": 1716454218037160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016498, "dur": 7, "args": { "External id": 33238, "cbid": 211, "correlation": 33238 } }, { "ph": "s", "id": 33238, "pid": 76337, "tid": -914061504, "ts": 1716454218016498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016560, "dur": 1, "args": { "External id": 33249, "cbid": 251, "correlation": 33249 } }, { "ph": "f", "id": 33249, "pid": 76337, "tid": -914061504, "ts": 1716454218016560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016564, "dur": 0, "args": { "External id": 33250, "cbid": 251, "correlation": 33250 } }, { "ph": "f", "id": 33250, "pid": 76337, "tid": -914061504, "ts": 1716454218016564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218037166, "dur": 6, "args": { "External id": 33251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33251, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 33251, "pid": 5, "tid": 7, "ts": 1716454218037166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016565, "dur": 12, "args": { "External id": 33251, "cbid": 211, "correlation": 33251 } }, { "ph": "s", "id": 33251, "pid": 76337, "tid": -914061504, "ts": 1716454218016565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218037174, "dur": 3, "args": { "External id": 33253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33253, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 33253, "pid": 5, "tid": 7, "ts": 1716454218037174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016579, "dur": 6, "args": { "External id": 33253, "cbid": 211, "correlation": 33253 } }, { "ph": "s", "id": 33253, "pid": 76337, "tid": -914061504, "ts": 1716454218016579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218037178, "dur": 154, "args": { "External id": 33274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33274, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33274, "pid": 5, "tid": 7, "ts": 1716454218037178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016653, "dur": 12, "args": { "External id": 33274, "cbid": 211, "correlation": 33274 } }, { "ph": "s", "id": 33274, "pid": 76337, "tid": -914061504, "ts": 1716454218016653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218016750, "dur": 2, "args": { "External id": 33292, "cbid": 251, "correlation": 33292 } }, { "ph": "f", "id": 33292, "pid": 76337, "tid": -914061504, "ts": 1716454218016750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218037334, "dur": 106, "args": { "External id": 33294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33294, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33294, "pid": 5, "tid": 7, "ts": 1716454218037334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016756, "dur": 14, "args": { "External id": 33294, "cbid": 211, "correlation": 33294 } }, { "ph": "s", "id": 33294, "pid": 76337, "tid": -914061504, "ts": 1716454218016756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218037442, "dur": 34, "args": { "External id": 33302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33302, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33302, "pid": 5, "tid": 7, "ts": 1716454218037442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016828, "dur": 12, "args": { "External id": 33302, "cbid": 211, "correlation": 33302 } }, { "ph": "s", "id": 33302, "pid": 76337, "tid": -914061504, "ts": 1716454218016828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218037477, "dur": 68, "args": { "External id": 33310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33310, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33310, "pid": 5, "tid": 7, "ts": 1716454218037477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016869, "dur": 9, "args": { "External id": 33310, "cbid": 211, "correlation": 33310 } }, { "ph": "s", "id": 33310, "pid": 76337, "tid": -914061504, "ts": 1716454218016869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218037546, "dur": 91, "args": { "External id": 33332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33332, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33332, "pid": 5, "tid": 7, "ts": 1716454218037546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218016921, "dur": 10, "args": { "External id": 33332, "cbid": 211, "correlation": 33332 } }, { "ph": "s", "id": 33332, "pid": 76337, "tid": -914061504, "ts": 1716454218016921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017022, "dur": 1, "args": { "External id": 33348, "cbid": 251, "correlation": 33348 } }, { "ph": "f", "id": 33348, "pid": 76337, "tid": -914061504, "ts": 1716454218017022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218037639, "dur": 565, "args": { "External id": 33350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33350, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33350, "pid": 5, "tid": 7, "ts": 1716454218037639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017028, "dur": 13, "args": { "External id": 33350, "cbid": 211, "correlation": 33350 } }, { "ph": "s", "id": 33350, "pid": 76337, "tid": -914061504, "ts": 1716454218017028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218038205, "dur": 241, "args": { "External id": 33358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33358, "pid": 5, "tid": 7, "ts": 1716454218038205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017107, "dur": 15, "args": { "External id": 33358, "cbid": 211, "correlation": 33358 } }, { "ph": "s", "id": 33358, "pid": 76337, "tid": -914061504, "ts": 1716454218017107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218038447, "dur": 251, "args": { "External id": 33366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33366, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33366, "pid": 5, "tid": 7, "ts": 1716454218038447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017142, "dur": 8, "args": { "External id": 33366, "cbid": 211, "correlation": 33366 } }, { "ph": "s", "id": 33366, "pid": 76337, "tid": -914061504, "ts": 1716454218017142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017226, "dur": 2, "args": { "External id": 33382, "cbid": 251, "correlation": 33382 } }, { "ph": "f", "id": 33382, "pid": 76337, "tid": -914061504, "ts": 1716454218017226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017232, "dur": 0, "args": { "External id": 33384, "cbid": 251, "correlation": 33384 } }, { "ph": "f", "id": 33384, "pid": 76337, "tid": -914061504, "ts": 1716454218017232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218038700, "dur": 358, "args": { "External id": 33385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33385, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 33385, "pid": 5, "tid": 7, "ts": 1716454218038700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017237, "dur": 14, "args": { "External id": 33385, "cbid": 211, "correlation": 33385 } }, { "ph": "s", "id": 33385, "pid": 76337, "tid": -914061504, "ts": 1716454218017237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218039059, "dur": 50, "args": { "External id": 33393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33393, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33393, "pid": 5, "tid": 7, "ts": 1716454218039059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017280, "dur": 10, "args": { "External id": 33393, "cbid": 211, "correlation": 33393 } }, { "ph": "s", "id": 33393, "pid": 76337, "tid": -914061504, "ts": 1716454218017280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218039110, "dur": 156, "args": { "External id": 33404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33404, "pid": 5, "tid": 7, "ts": 1716454218039110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017352, "dur": 12, "args": { "External id": 33404, "cbid": 211, "correlation": 33404 } }, { "ph": "s", "id": 33404, "pid": 76337, "tid": -914061504, "ts": 1716454218017352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218017417, "dur": 0, "args": { "External id": 33416, "cbid": 317, "correlation": 33416 } }, { "ph": "f", "id": 33416, "pid": 76337, "tid": -914061504, "ts": 1716454218017417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218017418, "dur": 0, "args": { "External id": 33417, "cbid": 203, "correlation": 33417 } }, { "ph": "f", "id": 33417, "pid": 76337, "tid": -914061504, "ts": 1716454218017418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218017419, "dur": 0, "args": { "External id": 33418, "cbid": 205, "correlation": 33418 } }, { "ph": "f", "id": 33418, "pid": 76337, "tid": -914061504, "ts": 1716454218017419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017443, "dur": 1, "args": { "External id": 33422, "cbid": 251, "correlation": 33422 } }, { "ph": "f", "id": 33422, "pid": 76337, "tid": -914061504, "ts": 1716454218017443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017445, "dur": 0, "args": { "External id": 33423, "cbid": 251, "correlation": 33423 } }, { "ph": "f", "id": 33423, "pid": 76337, "tid": -914061504, "ts": 1716454218017445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017445, "dur": 0, "args": { "External id": 33424, "cbid": 251, "correlation": 33424 } }, { "ph": "f", "id": 33424, "pid": 76337, "tid": -914061504, "ts": 1716454218017445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017446, "dur": 0, "args": { "External id": 33425, "cbid": 251, "correlation": 33425 } }, { "ph": "f", "id": 33425, "pid": 76337, "tid": -914061504, "ts": 1716454218017446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017447, "dur": 0, "args": { "External id": 33426, "cbid": 251, "correlation": 33426 } }, { "ph": "f", "id": 33426, "pid": 76337, "tid": -914061504, "ts": 1716454218017447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017448, "dur": 0, "args": { "External id": 33427, "cbid": 251, "correlation": 33427 } }, { "ph": "f", "id": 33427, "pid": 76337, "tid": -914061504, "ts": 1716454218017448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017449, "dur": 0, "args": { "External id": 33428, "cbid": 251, "correlation": 33428 } }, { "ph": "f", "id": 33428, "pid": 76337, "tid": -914061504, "ts": 1716454218017449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017450, "dur": 0, "args": { "External id": 33429, "cbid": 251, "correlation": 33429 } }, { "ph": "f", "id": 33429, "pid": 76337, "tid": -914061504, "ts": 1716454218017450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017451, "dur": 0, "args": { "External id": 33430, "cbid": 251, "correlation": 33430 } }, { "ph": "f", "id": 33430, "pid": 76337, "tid": -914061504, "ts": 1716454218017451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218039267, "dur": 114, "args": { "External id": 33431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33431, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33431, "pid": 5, "tid": 7, "ts": 1716454218039267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017453, "dur": 12, "args": { "External id": 33431, "cbid": 211, "correlation": 33431 } }, { "ph": "s", "id": 33431, "pid": 76337, "tid": -914061504, "ts": 1716454218017453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218039383, "dur": 59, "args": { "External id": 33437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33437, "pid": 5, "tid": 7, "ts": 1716454218039383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017488, "dur": 9, "args": { "External id": 33437, "cbid": 211, "correlation": 33437 } }, { "ph": "s", "id": 33437, "pid": 76337, "tid": -914061504, "ts": 1716454218017488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218039443, "dur": 50, "args": { "External id": 33445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33445, "pid": 5, "tid": 7, "ts": 1716454218039443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017521, "dur": 9, "args": { "External id": 33445, "cbid": 211, "correlation": 33445 } }, { "ph": "s", "id": 33445, "pid": 76337, "tid": -914061504, "ts": 1716454218017521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218039494, "dur": 51, "args": { "External id": 33465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33465, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 33465, "pid": 5, "tid": 7, "ts": 1716454218039494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017596, "dur": 11, "args": { "External id": 33465, "cbid": 211, "correlation": 33465 } }, { "ph": "s", "id": 33465, "pid": 76337, "tid": -914061504, "ts": 1716454218017596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218039547, "dur": 5, "args": { "External id": 33477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33477, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33477, "pid": 5, "tid": 7, "ts": 1716454218039547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017616, "dur": 7, "args": { "External id": 33477, "cbid": 211, "correlation": 33477 } }, { "ph": "s", "id": 33477, "pid": 76337, "tid": -914061504, "ts": 1716454218017616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218039553, "dur": 55, "args": { "External id": 33480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33480, "pid": 5, "tid": 7, "ts": 1716454218039553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017635, "dur": 7, "args": { "External id": 33480, "cbid": 211, "correlation": 33480 } }, { "ph": "s", "id": 33480, "pid": 76337, "tid": -914061504, "ts": 1716454218017635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218039610, "dur": 37, "args": { "External id": 33489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33489, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33489, "pid": 5, "tid": 7, "ts": 1716454218039610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017674, "dur": 10, "args": { "External id": 33489, "cbid": 211, "correlation": 33489 } }, { "ph": "s", "id": 33489, "pid": 76337, "tid": -914061504, "ts": 1716454218017674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218017725, "dur": 0, "args": { "External id": 33499, "cbid": 317, "correlation": 33499 } }, { "ph": "f", "id": 33499, "pid": 76337, "tid": -914061504, "ts": 1716454218017725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218017726, "dur": 0, "args": { "External id": 33500, "cbid": 203, "correlation": 33500 } }, { "ph": "f", "id": 33500, "pid": 76337, "tid": -914061504, "ts": 1716454218017726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218017727, "dur": 0, "args": { "External id": 33501, "cbid": 205, "correlation": 33501 } }, { "ph": "f", "id": 33501, "pid": 76337, "tid": -914061504, "ts": 1716454218017727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218039648, "dur": 41, "args": { "External id": 33505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33505, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33505, "pid": 5, "tid": 7, "ts": 1716454218039648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017742, "dur": 11, "args": { "External id": 33505, "cbid": 211, "correlation": 33505 } }, { "ph": "s", "id": 33505, "pid": 76337, "tid": -914061504, "ts": 1716454218017742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218039691, "dur": 14, "args": { "External id": 33507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33507, "pid": 5, "tid": 7, "ts": 1716454218039691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017756, "dur": 5, "args": { "External id": 33507, "cbid": 211, "correlation": 33507 } }, { "ph": "s", "id": 33507, "pid": 76337, "tid": -914061504, "ts": 1716454218017756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218039706, "dur": 4, "args": { "External id": 33509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33509, "pid": 5, "tid": 7, "ts": 1716454218039706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017766, "dur": 6, "args": { "External id": 33509, "cbid": 211, "correlation": 33509 } }, { "ph": "s", "id": 33509, "pid": 76337, "tid": -914061504, "ts": 1716454218017766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218017775, "dur": 0, "args": { "External id": 33510, "cbid": 51, "correlation": 33510 } }, { "ph": "s", "id": 33510, "pid": 76337, "tid": -914061504, "ts": 1716454218017775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218039710, "dur": 696, "args": { "External id": 33511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33511, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33511, "pid": 5, "tid": 7, "ts": 1716454218039710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017775, "dur": 5, "args": { "External id": 33511, "cbid": 211, "correlation": 33511 } }, { "ph": "s", "id": 33511, "pid": 76337, "tid": -914061504, "ts": 1716454218017775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218040408, "dur": 59, "args": { "External id": 33516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33516, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33516, "pid": 5, "tid": 7, "ts": 1716454218040408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017803, "dur": 8, "args": { "External id": 33516, "cbid": 211, "correlation": 33516 } }, { "ph": "s", "id": 33516, "pid": 76337, "tid": -914061504, "ts": 1716454218017803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218040468, "dur": 3, "args": { "External id": 33524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33524, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33524, "pid": 5, "tid": 7, "ts": 1716454218040468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017846, "dur": 9, "args": { "External id": 33524, "cbid": 211, "correlation": 33524 } }, { "ph": "s", "id": 33524, "pid": 76337, "tid": -914061504, "ts": 1716454218017846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017913, "dur": 1, "args": { "External id": 33540, "cbid": 251, "correlation": 33540 } }, { "ph": "f", "id": 33540, "pid": 76337, "tid": -914061504, "ts": 1716454218017913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218017918, "dur": 0, "args": { "External id": 33542, "cbid": 251, "correlation": 33542 } }, { "ph": "f", "id": 33542, "pid": 76337, "tid": -914061504, "ts": 1716454218017918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218040473, "dur": 11, "args": { "External id": 33543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33543, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 33543, "pid": 5, "tid": 7, "ts": 1716454218040473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017920, "dur": 11, "args": { "External id": 33543, "cbid": 211, "correlation": 33543 } }, { "ph": "s", "id": 33543, "pid": 76337, "tid": -914061504, "ts": 1716454218017920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218040485, "dur": 5, "args": { "External id": 33545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33545, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 33545, "pid": 5, "tid": 7, "ts": 1716454218040485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218017933, "dur": 6, "args": { "External id": 33545, "cbid": 211, "correlation": 33545 } }, { "ph": "s", "id": 33545, "pid": 76337, "tid": -914061504, "ts": 1716454218017933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218040491, "dur": 54, "args": { "External id": 33555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33555, "pid": 5, "tid": 7, "ts": 1716454218040491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018000, "dur": 12, "args": { "External id": 33555, "cbid": 211, "correlation": 33555 } }, { "ph": "s", "id": 33555, "pid": 76337, "tid": -914061504, "ts": 1716454218018000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218040547, "dur": 50, "args": { "External id": 33575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33575, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 33575, "pid": 5, "tid": 7, "ts": 1716454218040547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018067, "dur": 11, "args": { "External id": 33575, "cbid": 211, "correlation": 33575 } }, { "ph": "s", "id": 33575, "pid": 76337, "tid": -914061504, "ts": 1716454218018067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218040598, "dur": 4, "args": { "External id": 33587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33587, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33587, "pid": 5, "tid": 7, "ts": 1716454218040598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018087, "dur": 7, "args": { "External id": 33587, "cbid": 211, "correlation": 33587 } }, { "ph": "s", "id": 33587, "pid": 76337, "tid": -914061504, "ts": 1716454218018087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218040604, "dur": 55, "args": { "External id": 33590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33590, "pid": 5, "tid": 7, "ts": 1716454218040604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018107, "dur": 6, "args": { "External id": 33590, "cbid": 211, "correlation": 33590 } }, { "ph": "s", "id": 33590, "pid": 76337, "tid": -914061504, "ts": 1716454218018107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218040660, "dur": 36, "args": { "External id": 33599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33599, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33599, "pid": 5, "tid": 7, "ts": 1716454218040660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018147, "dur": 10, "args": { "External id": 33599, "cbid": 211, "correlation": 33599 } }, { "ph": "s", "id": 33599, "pid": 76337, "tid": -914061504, "ts": 1716454218018147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218018210, "dur": 0, "args": { "External id": 33609, "cbid": 317, "correlation": 33609 } }, { "ph": "f", "id": 33609, "pid": 76337, "tid": -914061504, "ts": 1716454218018210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218018211, "dur": 0, "args": { "External id": 33610, "cbid": 203, "correlation": 33610 } }, { "ph": "f", "id": 33610, "pid": 76337, "tid": -914061504, "ts": 1716454218018211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218018212, "dur": 0, "args": { "External id": 33611, "cbid": 205, "correlation": 33611 } }, { "ph": "f", "id": 33611, "pid": 76337, "tid": -914061504, "ts": 1716454218018212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218040698, "dur": 40, "args": { "External id": 33615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33615, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33615, "pid": 5, "tid": 7, "ts": 1716454218040698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018226, "dur": 12, "args": { "External id": 33615, "cbid": 211, "correlation": 33615 } }, { "ph": "s", "id": 33615, "pid": 76337, "tid": -914061504, "ts": 1716454218018226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218040739, "dur": 14, "args": { "External id": 33617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33617, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33617, "pid": 5, "tid": 7, "ts": 1716454218040739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018240, "dur": 5, "args": { "External id": 33617, "cbid": 211, "correlation": 33617 } }, { "ph": "s", "id": 33617, "pid": 76337, "tid": -914061504, "ts": 1716454218018240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218040754, "dur": 3, "args": { "External id": 33619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33619, "pid": 5, "tid": 7, "ts": 1716454218040754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018249, "dur": 6, "args": { "External id": 33619, "cbid": 211, "correlation": 33619 } }, { "ph": "s", "id": 33619, "pid": 76337, "tid": -914061504, "ts": 1716454218018249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218018258, "dur": 0, "args": { "External id": 33620, "cbid": 51, "correlation": 33620 } }, { "ph": "s", "id": 33620, "pid": 76337, "tid": -914061504, "ts": 1716454218018258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218040759, "dur": 689, "args": { "External id": 33621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33621, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33621, "pid": 5, "tid": 7, "ts": 1716454218040759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018259, "dur": 5, "args": { "External id": 33621, "cbid": 211, "correlation": 33621 } }, { "ph": "s", "id": 33621, "pid": 76337, "tid": -914061504, "ts": 1716454218018259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218041449, "dur": 59, "args": { "External id": 33626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33626, "pid": 5, "tid": 7, "ts": 1716454218041449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018286, "dur": 8, "args": { "External id": 33626, "cbid": 211, "correlation": 33626 } }, { "ph": "s", "id": 33626, "pid": 76337, "tid": -914061504, "ts": 1716454218018286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218041509, "dur": 50, "args": { "External id": 33634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33634, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33634, "pid": 5, "tid": 7, "ts": 1716454218041509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018318, "dur": 8, "args": { "External id": 33634, "cbid": 211, "correlation": 33634 } }, { "ph": "s", "id": 33634, "pid": 76337, "tid": -914061504, "ts": 1716454218018318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218041560, "dur": 35, "args": { "External id": 33642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33642, "pid": 5, "tid": 7, "ts": 1716454218041560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018346, "dur": 9, "args": { "External id": 33642, "cbid": 211, "correlation": 33642 } }, { "ph": "s", "id": 33642, "pid": 76337, "tid": -914061504, "ts": 1716454218018346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218041597, "dur": 51, "args": { "External id": 33662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33662, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 33662, "pid": 5, "tid": 7, "ts": 1716454218041597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018425, "dur": 12, "args": { "External id": 33662, "cbid": 211, "correlation": 33662 } }, { "ph": "s", "id": 33662, "pid": 76337, "tid": -914061504, "ts": 1716454218018425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218041650, "dur": 4, "args": { "External id": 33674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33674, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 33674, "pid": 5, "tid": 7, "ts": 1716454218041650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018447, "dur": 6, "args": { "External id": 33674, "cbid": 211, "correlation": 33674 } }, { "ph": "s", "id": 33674, "pid": 76337, "tid": -914061504, "ts": 1716454218018447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218041655, "dur": 54, "args": { "External id": 33677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33677, "pid": 5, "tid": 7, "ts": 1716454218041655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018465, "dur": 6, "args": { "External id": 33677, "cbid": 211, "correlation": 33677 } }, { "ph": "s", "id": 33677, "pid": 76337, "tid": -914061504, "ts": 1716454218018465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218018522, "dur": 0, "args": { "External id": 33688, "cbid": 317, "correlation": 33688 } }, { "ph": "f", "id": 33688, "pid": 76337, "tid": -914061504, "ts": 1716454218018522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218018522, "dur": 0, "args": { "External id": 33689, "cbid": 203, "correlation": 33689 } }, { "ph": "f", "id": 33689, "pid": 76337, "tid": -914061504, "ts": 1716454218018522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218018523, "dur": 0, "args": { "External id": 33690, "cbid": 205, "correlation": 33690 } }, { "ph": "f", "id": 33690, "pid": 76337, "tid": -914061504, "ts": 1716454218018523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018545, "dur": 1, "args": { "External id": 33694, "cbid": 251, "correlation": 33694 } }, { "ph": "f", "id": 33694, "pid": 76337, "tid": -914061504, "ts": 1716454218018545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018546, "dur": 0, "args": { "External id": 33695, "cbid": 251, "correlation": 33695 } }, { "ph": "f", "id": 33695, "pid": 76337, "tid": -914061504, "ts": 1716454218018546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018547, "dur": 0, "args": { "External id": 33696, "cbid": 251, "correlation": 33696 } }, { "ph": "f", "id": 33696, "pid": 76337, "tid": -914061504, "ts": 1716454218018547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018548, "dur": 0, "args": { "External id": 33697, "cbid": 251, "correlation": 33697 } }, { "ph": "f", "id": 33697, "pid": 76337, "tid": -914061504, "ts": 1716454218018548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018549, "dur": 0, "args": { "External id": 33698, "cbid": 251, "correlation": 33698 } }, { "ph": "f", "id": 33698, "pid": 76337, "tid": -914061504, "ts": 1716454218018549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018550, "dur": 0, "args": { "External id": 33699, "cbid": 251, "correlation": 33699 } }, { "ph": "f", "id": 33699, "pid": 76337, "tid": -914061504, "ts": 1716454218018550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018550, "dur": 0, "args": { "External id": 33700, "cbid": 251, "correlation": 33700 } }, { "ph": "f", "id": 33700, "pid": 76337, "tid": -914061504, "ts": 1716454218018550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018551, "dur": 0, "args": { "External id": 33701, "cbid": 251, "correlation": 33701 } }, { "ph": "f", "id": 33701, "pid": 76337, "tid": -914061504, "ts": 1716454218018551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018552, "dur": 0, "args": { "External id": 33702, "cbid": 251, "correlation": 33702 } }, { "ph": "f", "id": 33702, "pid": 76337, "tid": -914061504, "ts": 1716454218018552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218041710, "dur": 110, "args": { "External id": 33703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33703, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33703, "pid": 5, "tid": 7, "ts": 1716454218041710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018554, "dur": 13, "args": { "External id": 33703, "cbid": 211, "correlation": 33703 } }, { "ph": "s", "id": 33703, "pid": 76337, "tid": -914061504, "ts": 1716454218018554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218041821, "dur": 60, "args": { "External id": 33709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33709, "pid": 5, "tid": 7, "ts": 1716454218041821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018590, "dur": 9, "args": { "External id": 33709, "cbid": 211, "correlation": 33709 } }, { "ph": "s", "id": 33709, "pid": 76337, "tid": -914061504, "ts": 1716454218018590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218041882, "dur": 654, "args": { "External id": 33718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33718, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33718, "pid": 5, "tid": 7, "ts": 1716454218041882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018674, "dur": 13, "args": { "External id": 33718, "cbid": 211, "correlation": 33718 } }, { "ph": "s", "id": 33718, "pid": 76337, "tid": -914061504, "ts": 1716454218018674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218042538, "dur": 179, "args": { "External id": 33740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33740, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33740, "pid": 5, "tid": 7, "ts": 1716454218042538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018730, "dur": 10, "args": { "External id": 33740, "cbid": 211, "correlation": 33740 } }, { "ph": "s", "id": 33740, "pid": 76337, "tid": -914061504, "ts": 1716454218018730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018815, "dur": 1, "args": { "External id": 33751, "cbid": 251, "correlation": 33751 } }, { "ph": "f", "id": 33751, "pid": 76337, "tid": -914061504, "ts": 1716454218018815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218042718, "dur": 193, "args": { "External id": 33752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33752, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33752, "pid": 5, "tid": 7, "ts": 1716454218042718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018820, "dur": 14, "args": { "External id": 33752, "cbid": 211, "correlation": 33752 } }, { "ph": "s", "id": 33752, "pid": 76337, "tid": -914061504, "ts": 1716454218018820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018888, "dur": 1, "args": { "External id": 33763, "cbid": 251, "correlation": 33763 } }, { "ph": "f", "id": 33763, "pid": 76337, "tid": -914061504, "ts": 1716454218018888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218042912, "dur": 186, "args": { "External id": 33764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33764, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33764, "pid": 5, "tid": 7, "ts": 1716454218042912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018892, "dur": 11, "args": { "External id": 33764, "cbid": 211, "correlation": 33764 } }, { "ph": "s", "id": 33764, "pid": 76337, "tid": -914061504, "ts": 1716454218018892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218018955, "dur": 1, "args": { "External id": 33775, "cbid": 251, "correlation": 33775 } }, { "ph": "f", "id": 33775, "pid": 76337, "tid": -914061504, "ts": 1716454218018955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218043100, "dur": 186, "args": { "External id": 33776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33776, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33776, "pid": 5, "tid": 7, "ts": 1716454218043100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218018959, "dur": 11, "args": { "External id": 33776, "cbid": 211, "correlation": 33776 } }, { "ph": "s", "id": 33776, "pid": 76337, "tid": -914061504, "ts": 1716454218018959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218043287, "dur": 18374, "args": { "External id": 33797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33797, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33797, "pid": 5, "tid": 7, "ts": 1716454218043287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019049, "dur": 12, "args": { "External id": 33797, "cbid": 211, "correlation": 33797 } }, { "ph": "s", "id": 33797, "pid": 76337, "tid": -914061504, "ts": 1716454218019049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019146, "dur": 1, "args": { "External id": 33815, "cbid": 251, "correlation": 33815 } }, { "ph": "f", "id": 33815, "pid": 76337, "tid": -914061504, "ts": 1716454218019146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218061663, "dur": 203, "args": { "External id": 33817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33817, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33817, "pid": 5, "tid": 7, "ts": 1716454218061663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019152, "dur": 13, "args": { "External id": 33817, "cbid": 211, "correlation": 33817 } }, { "ph": "s", "id": 33817, "pid": 76337, "tid": -914061504, "ts": 1716454218019152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218061867, "dur": 66, "args": { "External id": 33825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33825, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33825, "pid": 5, "tid": 7, "ts": 1716454218061867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019221, "dur": 12, "args": { "External id": 33825, "cbid": 211, "correlation": 33825 } }, { "ph": "s", "id": 33825, "pid": 76337, "tid": -914061504, "ts": 1716454218019221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218061934, "dur": 97, "args": { "External id": 33833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33833, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33833, "pid": 5, "tid": 7, "ts": 1716454218061934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019261, "dur": 8, "args": { "External id": 33833, "cbid": 211, "correlation": 33833 } }, { "ph": "s", "id": 33833, "pid": 76337, "tid": -914061504, "ts": 1716454218019261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218062033, "dur": 53, "args": { "External id": 33844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33844, "pid": 5, "tid": 7, "ts": 1716454218062033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019332, "dur": 13, "args": { "External id": 33844, "cbid": 211, "correlation": 33844 } }, { "ph": "s", "id": 33844, "pid": 76337, "tid": -914061504, "ts": 1716454218019332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218062086, "dur": 91, "args": { "External id": 33866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33866, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33866, "pid": 5, "tid": 7, "ts": 1716454218062086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019363, "dur": 8, "args": { "External id": 33866, "cbid": 211, "correlation": 33866 } }, { "ph": "s", "id": 33866, "pid": 76337, "tid": -914061504, "ts": 1716454218019363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019447, "dur": 1, "args": { "External id": 33877, "cbid": 251, "correlation": 33877 } }, { "ph": "f", "id": 33877, "pid": 76337, "tid": -914061504, "ts": 1716454218019447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218062179, "dur": 105, "args": { "External id": 33878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33878, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 33878, "pid": 5, "tid": 7, "ts": 1716454218062179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019452, "dur": 12, "args": { "External id": 33878, "cbid": 211, "correlation": 33878 } }, { "ph": "s", "id": 33878, "pid": 76337, "tid": -914061504, "ts": 1716454218019452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019522, "dur": 1, "args": { "External id": 33889, "cbid": 251, "correlation": 33889 } }, { "ph": "f", "id": 33889, "pid": 76337, "tid": -914061504, "ts": 1716454218019522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019526, "dur": 0, "args": { "External id": 33890, "cbid": 251, "correlation": 33890 } }, { "ph": "f", "id": 33890, "pid": 76337, "tid": -914061504, "ts": 1716454218019526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218062285, "dur": 10, "args": { "External id": 33891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33891, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 33891, "pid": 5, "tid": 7, "ts": 1716454218062285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019528, "dur": 12, "args": { "External id": 33891, "cbid": 211, "correlation": 33891 } }, { "ph": "s", "id": 33891, "pid": 76337, "tid": -914061504, "ts": 1716454218019528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218062297, "dur": 5, "args": { "External id": 33893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33893, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 33893, "pid": 5, "tid": 7, "ts": 1716454218062297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019542, "dur": 6, "args": { "External id": 33893, "cbid": 211, "correlation": 33893 } }, { "ph": "s", "id": 33893, "pid": 76337, "tid": -914061504, "ts": 1716454218019542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019603, "dur": 1, "args": { "External id": 33904, "cbid": 251, "correlation": 33904 } }, { "ph": "f", "id": 33904, "pid": 76337, "tid": -914061504, "ts": 1716454218019603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019607, "dur": 0, "args": { "External id": 33905, "cbid": 251, "correlation": 33905 } }, { "ph": "f", "id": 33905, "pid": 76337, "tid": -914061504, "ts": 1716454218019607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218062303, "dur": 6, "args": { "External id": 33906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33906, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 33906, "pid": 5, "tid": 7, "ts": 1716454218062303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019608, "dur": 11, "args": { "External id": 33906, "cbid": 211, "correlation": 33906 } }, { "ph": "s", "id": 33906, "pid": 76337, "tid": -914061504, "ts": 1716454218019608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218062310, "dur": 3, "args": { "External id": 33908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33908, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 33908, "pid": 5, "tid": 7, "ts": 1716454218062310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019621, "dur": 5, "args": { "External id": 33908, "cbid": 211, "correlation": 33908 } }, { "ph": "s", "id": 33908, "pid": 76337, "tid": -914061504, "ts": 1716454218019621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218062315, "dur": 152, "args": { "External id": 33929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33929, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33929, "pid": 5, "tid": 7, "ts": 1716454218062315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019694, "dur": 12, "args": { "External id": 33929, "cbid": 211, "correlation": 33929 } }, { "ph": "s", "id": 33929, "pid": 76337, "tid": -914061504, "ts": 1716454218019694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218019790, "dur": 1, "args": { "External id": 33947, "cbid": 251, "correlation": 33947 } }, { "ph": "f", "id": 33947, "pid": 76337, "tid": -914061504, "ts": 1716454218019790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218062468, "dur": 105, "args": { "External id": 33949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33949, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 33949, "pid": 5, "tid": 7, "ts": 1716454218062468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019796, "dur": 13, "args": { "External id": 33949, "cbid": 211, "correlation": 33949 } }, { "ph": "s", "id": 33949, "pid": 76337, "tid": -914061504, "ts": 1716454218019796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218062574, "dur": 35, "args": { "External id": 33957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33957, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33957, "pid": 5, "tid": 7, "ts": 1716454218062574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019866, "dur": 12, "args": { "External id": 33957, "cbid": 211, "correlation": 33957 } }, { "ph": "s", "id": 33957, "pid": 76337, "tid": -914061504, "ts": 1716454218019866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218062611, "dur": 68, "args": { "External id": 33965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33965, "pid": 5, "tid": 7, "ts": 1716454218062611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019905, "dur": 10, "args": { "External id": 33965, "cbid": 211, "correlation": 33965 } }, { "ph": "s", "id": 33965, "pid": 76337, "tid": -914061504, "ts": 1716454218019905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218062680, "dur": 91, "args": { "External id": 33987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 33987, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 33987, "pid": 5, "tid": 7, "ts": 1716454218062680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218019957, "dur": 10, "args": { "External id": 33987, "cbid": 211, "correlation": 33987 } }, { "ph": "s", "id": 33987, "pid": 76337, "tid": -914061504, "ts": 1716454218019957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020053, "dur": 1, "args": { "External id": 34003, "cbid": 251, "correlation": 34003 } }, { "ph": "f", "id": 34003, "pid": 76337, "tid": -914061504, "ts": 1716454218020053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218062772, "dur": 570, "args": { "External id": 34005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34005, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34005, "pid": 5, "tid": 7, "ts": 1716454218062772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020058, "dur": 14, "args": { "External id": 34005, "cbid": 211, "correlation": 34005 } }, { "ph": "s", "id": 34005, "pid": 76337, "tid": -914061504, "ts": 1716454218020058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218063344, "dur": 243, "args": { "External id": 34013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34013, "pid": 5, "tid": 7, "ts": 1716454218063344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020125, "dur": 12, "args": { "External id": 34013, "cbid": 211, "correlation": 34013 } }, { "ph": "s", "id": 34013, "pid": 76337, "tid": -914061504, "ts": 1716454218020125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218063588, "dur": 249, "args": { "External id": 34021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34021, "pid": 5, "tid": 7, "ts": 1716454218063588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020154, "dur": 8, "args": { "External id": 34021, "cbid": 211, "correlation": 34021 } }, { "ph": "s", "id": 34021, "pid": 76337, "tid": -914061504, "ts": 1716454218020154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020236, "dur": 1, "args": { "External id": 34037, "cbid": 251, "correlation": 34037 } }, { "ph": "f", "id": 34037, "pid": 76337, "tid": -914061504, "ts": 1716454218020236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020241, "dur": 0, "args": { "External id": 34039, "cbid": 251, "correlation": 34039 } }, { "ph": "f", "id": 34039, "pid": 76337, "tid": -914061504, "ts": 1716454218020241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218063838, "dur": 357, "args": { "External id": 34040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34040, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 34040, "pid": 5, "tid": 7, "ts": 1716454218063838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020244, "dur": 13, "args": { "External id": 34040, "cbid": 211, "correlation": 34040 } }, { "ph": "s", "id": 34040, "pid": 76337, "tid": -914061504, "ts": 1716454218020244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218064197, "dur": 50, "args": { "External id": 34048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34048, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34048, "pid": 5, "tid": 7, "ts": 1716454218064197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020286, "dur": 10, "args": { "External id": 34048, "cbid": 211, "correlation": 34048 } }, { "ph": "s", "id": 34048, "pid": 76337, "tid": -914061504, "ts": 1716454218020286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218064248, "dur": 156, "args": { "External id": 34059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34059, "pid": 5, "tid": 7, "ts": 1716454218064248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020353, "dur": 12, "args": { "External id": 34059, "cbid": 211, "correlation": 34059 } }, { "ph": "s", "id": 34059, "pid": 76337, "tid": -914061504, "ts": 1716454218020353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218020416, "dur": 0, "args": { "External id": 34071, "cbid": 317, "correlation": 34071 } }, { "ph": "f", "id": 34071, "pid": 76337, "tid": -914061504, "ts": 1716454218020416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218020417, "dur": 0, "args": { "External id": 34072, "cbid": 203, "correlation": 34072 } }, { "ph": "f", "id": 34072, "pid": 76337, "tid": -914061504, "ts": 1716454218020417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218020418, "dur": 0, "args": { "External id": 34073, "cbid": 205, "correlation": 34073 } }, { "ph": "f", "id": 34073, "pid": 76337, "tid": -914061504, "ts": 1716454218020418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020441, "dur": 1, "args": { "External id": 34077, "cbid": 251, "correlation": 34077 } }, { "ph": "f", "id": 34077, "pid": 76337, "tid": -914061504, "ts": 1716454218020441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020443, "dur": 0, "args": { "External id": 34078, "cbid": 251, "correlation": 34078 } }, { "ph": "f", "id": 34078, "pid": 76337, "tid": -914061504, "ts": 1716454218020443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020443, "dur": 0, "args": { "External id": 34079, "cbid": 251, "correlation": 34079 } }, { "ph": "f", "id": 34079, "pid": 76337, "tid": -914061504, "ts": 1716454218020443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020444, "dur": 0, "args": { "External id": 34080, "cbid": 251, "correlation": 34080 } }, { "ph": "f", "id": 34080, "pid": 76337, "tid": -914061504, "ts": 1716454218020444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020445, "dur": 0, "args": { "External id": 34081, "cbid": 251, "correlation": 34081 } }, { "ph": "f", "id": 34081, "pid": 76337, "tid": -914061504, "ts": 1716454218020445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020445, "dur": 0, "args": { "External id": 34082, "cbid": 251, "correlation": 34082 } }, { "ph": "f", "id": 34082, "pid": 76337, "tid": -914061504, "ts": 1716454218020445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020446, "dur": 0, "args": { "External id": 34083, "cbid": 251, "correlation": 34083 } }, { "ph": "f", "id": 34083, "pid": 76337, "tid": -914061504, "ts": 1716454218020446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020447, "dur": 0, "args": { "External id": 34084, "cbid": 251, "correlation": 34084 } }, { "ph": "f", "id": 34084, "pid": 76337, "tid": -914061504, "ts": 1716454218020447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218020448, "dur": 0, "args": { "External id": 34085, "cbid": 251, "correlation": 34085 } }, { "ph": "f", "id": 34085, "pid": 76337, "tid": -914061504, "ts": 1716454218020448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218064406, "dur": 113, "args": { "External id": 34086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34086, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 34086, "pid": 5, "tid": 7, "ts": 1716454218064406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020450, "dur": 12, "args": { "External id": 34086, "cbid": 211, "correlation": 34086 } }, { "ph": "s", "id": 34086, "pid": 76337, "tid": -914061504, "ts": 1716454218020450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218064520, "dur": 59, "args": { "External id": 34092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34092, "pid": 5, "tid": 7, "ts": 1716454218064520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020485, "dur": 9, "args": { "External id": 34092, "cbid": 211, "correlation": 34092 } }, { "ph": "s", "id": 34092, "pid": 76337, "tid": -914061504, "ts": 1716454218020485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218064581, "dur": 49, "args": { "External id": 34100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34100, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34100, "pid": 5, "tid": 7, "ts": 1716454218064581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020516, "dur": 8, "args": { "External id": 34100, "cbid": 211, "correlation": 34100 } }, { "ph": "s", "id": 34100, "pid": 76337, "tid": -914061504, "ts": 1716454218020516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218020589, "dur": 0, "args": { "External id": 34110, "cbid": 317, "correlation": 34110 } }, { "ph": "f", "id": 34110, "pid": 76337, "tid": -914061504, "ts": 1716454218020589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218020590, "dur": 0, "args": { "External id": 34111, "cbid": 203, "correlation": 34111 } }, { "ph": "f", "id": 34111, "pid": 76337, "tid": -914061504, "ts": 1716454218020590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218020591, "dur": 0, "args": { "External id": 34112, "cbid": 205, "correlation": 34112 } }, { "ph": "f", "id": 34112, "pid": 76337, "tid": -914061504, "ts": 1716454218020591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218064631, "dur": 42, "args": { "External id": 34116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34116, "pid": 5, "tid": 7, "ts": 1716454218064631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020608, "dur": 12, "args": { "External id": 34116, "cbid": 211, "correlation": 34116 } }, { "ph": "s", "id": 34116, "pid": 76337, "tid": -914061504, "ts": 1716454218020608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218064674, "dur": 14, "args": { "External id": 34118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34118, "pid": 5, "tid": 7, "ts": 1716454218064674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020623, "dur": 6, "args": { "External id": 34118, "cbid": 211, "correlation": 34118 } }, { "ph": "s", "id": 34118, "pid": 76337, "tid": -914061504, "ts": 1716454218020623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218064690, "dur": 1, "args": { "External id": 34120, "device": 5, "context": 1, "stream": 7, "correlation": 34120, "bytes": 1536, "memory bandwidth (GB/s)": 0.7868852459016393 } }, { "ph": "f", "id": 34120, "pid": 5, "tid": 7, "ts": 1716454218064690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218020641, "dur": 17, "args": { "External id": 34120, "cbid": 51, "correlation": 34120 } }, { "ph": "s", "id": 34120, "pid": 76337, "tid": -914061504, "ts": 1716454218020641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218064694, "dur": 359, "args": { "External id": 34121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34121, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34121, "pid": 5, "tid": 7, "ts": 1716454218064694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020660, "dur": 9, "args": { "External id": 34121, "cbid": 211, "correlation": 34121 } }, { "ph": "s", "id": 34121, "pid": 76337, "tid": -914061504, "ts": 1716454218020660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065054, "dur": 13, "args": { "External id": 34123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34123, "pid": 5, "tid": 7, "ts": 1716454218065054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020678, "dur": 7, "args": { "External id": 34123, "cbid": 211, "correlation": 34123 } }, { "ph": "s", "id": 34123, "pid": 76337, "tid": -914061504, "ts": 1716454218020678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218065069, "dur": 15, "args": { "External id": 34129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34129, "pid": 5, "tid": 7, "ts": 1716454218065069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020708, "dur": 8, "args": { "External id": 34129, "cbid": 211, "correlation": 34129 } }, { "ph": "s", "id": 34129, "pid": 76337, "tid": -914061504, "ts": 1716454218020708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218065085, "dur": 19, "args": { "External id": 34149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34149, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 34149, "pid": 5, "tid": 7, "ts": 1716454218065085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020800, "dur": 13, "args": { "External id": 34149, "cbid": 211, "correlation": 34149 } }, { "ph": "s", "id": 34149, "pid": 76337, "tid": -914061504, "ts": 1716454218020800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218065105, "dur": 5, "args": { "External id": 34161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34161, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 34161, "pid": 5, "tid": 7, "ts": 1716454218065105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020822, "dur": 6, "args": { "External id": 34161, "cbid": 211, "correlation": 34161 } }, { "ph": "s", "id": 34161, "pid": 76337, "tid": -914061504, "ts": 1716454218020822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218065112, "dur": 17, "args": { "External id": 34164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34164, "pid": 5, "tid": 7, "ts": 1716454218065112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020840, "dur": 7, "args": { "External id": 34164, "cbid": 211, "correlation": 34164 } }, { "ph": "s", "id": 34164, "pid": 76337, "tid": -914061504, "ts": 1716454218020840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218065130, "dur": 11, "args": { "External id": 34173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34173, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34173, "pid": 5, "tid": 7, "ts": 1716454218065130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020879, "dur": 9, "args": { "External id": 34173, "cbid": 211, "correlation": 34173 } }, { "ph": "s", "id": 34173, "pid": 76337, "tid": -914061504, "ts": 1716454218020879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218020935, "dur": 0, "args": { "External id": 34183, "cbid": 317, "correlation": 34183 } }, { "ph": "f", "id": 34183, "pid": 76337, "tid": -914061504, "ts": 1716454218020935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218020936, "dur": 0, "args": { "External id": 34184, "cbid": 203, "correlation": 34184 } }, { "ph": "f", "id": 34184, "pid": 76337, "tid": -914061504, "ts": 1716454218020936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218020936, "dur": 0, "args": { "External id": 34185, "cbid": 205, "correlation": 34185 } }, { "ph": "f", "id": 34185, "pid": 76337, "tid": -914061504, "ts": 1716454218020936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065143, "dur": 11, "args": { "External id": 34189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34189, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34189, "pid": 5, "tid": 7, "ts": 1716454218065143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020951, "dur": 12, "args": { "External id": 34189, "cbid": 211, "correlation": 34189 } }, { "ph": "s", "id": 34189, "pid": 76337, "tid": -914061504, "ts": 1716454218020951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065155, "dur": 24, "args": { "External id": 34191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34191, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34191, "pid": 5, "tid": 7, "ts": 1716454218065155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020966, "dur": 5, "args": { "External id": 34191, "cbid": 211, "correlation": 34191 } }, { "ph": "s", "id": 34191, "pid": 76337, "tid": -914061504, "ts": 1716454218020966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218065180, "dur": 4, "args": { "External id": 34193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 34193, "pid": 5, "tid": 7, "ts": 1716454218065180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020987, "dur": 6, "args": { "External id": 34193, "cbid": 211, "correlation": 34193 } }, { "ph": "s", "id": 34193, "pid": 76337, "tid": -914061504, "ts": 1716454218020987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218020997, "dur": 0, "args": { "External id": 34194, "cbid": 51, "correlation": 34194 } }, { "ph": "s", "id": 34194, "pid": 76337, "tid": -914061504, "ts": 1716454218020997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218065185, "dur": 354, "args": { "External id": 34195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34195, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34195, "pid": 5, "tid": 7, "ts": 1716454218065185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218020998, "dur": 7, "args": { "External id": 34195, "cbid": 211, "correlation": 34195 } }, { "ph": "s", "id": 34195, "pid": 76337, "tid": -914061504, "ts": 1716454218020998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065540, "dur": 19, "args": { "External id": 34196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34196, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34196, "pid": 5, "tid": 7, "ts": 1716454218065540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021008, "dur": 6, "args": { "External id": 34196, "cbid": 211, "correlation": 34196 } }, { "ph": "s", "id": 34196, "pid": 76337, "tid": -914061504, "ts": 1716454218021008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218065560, "dur": 33, "args": { "External id": 34202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34202, "pid": 5, "tid": 7, "ts": 1716454218065560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021037, "dur": 8, "args": { "External id": 34202, "cbid": 211, "correlation": 34202 } }, { "ph": "s", "id": 34202, "pid": 76337, "tid": -914061504, "ts": 1716454218021037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218065594, "dur": 3, "args": { "External id": 34210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34210, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 34210, "pid": 5, "tid": 7, "ts": 1716454218065594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021080, "dur": 9, "args": { "External id": 34210, "cbid": 211, "correlation": 34210 } }, { "ph": "s", "id": 34210, "pid": 76337, "tid": -914061504, "ts": 1716454218021080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021147, "dur": 1, "args": { "External id": 34226, "cbid": 251, "correlation": 34226 } }, { "ph": "f", "id": 34226, "pid": 76337, "tid": -914061504, "ts": 1716454218021147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021153, "dur": 0, "args": { "External id": 34228, "cbid": 251, "correlation": 34228 } }, { "ph": "f", "id": 34228, "pid": 76337, "tid": -914061504, "ts": 1716454218021153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218065599, "dur": 12, "args": { "External id": 34229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34229, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 34229, "pid": 5, "tid": 7, "ts": 1716454218065599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021154, "dur": 11, "args": { "External id": 34229, "cbid": 211, "correlation": 34229 } }, { "ph": "s", "id": 34229, "pid": 76337, "tid": -914061504, "ts": 1716454218021154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218065612, "dur": 5, "args": { "External id": 34231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34231, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 34231, "pid": 5, "tid": 7, "ts": 1716454218065612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021168, "dur": 6, "args": { "External id": 34231, "cbid": 211, "correlation": 34231 } }, { "ph": "s", "id": 34231, "pid": 76337, "tid": -914061504, "ts": 1716454218021168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218065619, "dur": 30, "args": { "External id": 34241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34241, "pid": 5, "tid": 7, "ts": 1716454218065619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021225, "dur": 12, "args": { "External id": 34241, "cbid": 211, "correlation": 34241 } }, { "ph": "s", "id": 34241, "pid": 76337, "tid": -914061504, "ts": 1716454218021225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218065650, "dur": 31, "args": { "External id": 34261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34261, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 34261, "pid": 5, "tid": 7, "ts": 1716454218065650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021293, "dur": 10, "args": { "External id": 34261, "cbid": 211, "correlation": 34261 } }, { "ph": "s", "id": 34261, "pid": 76337, "tid": -914061504, "ts": 1716454218021293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218065682, "dur": 4, "args": { "External id": 34273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34273, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 34273, "pid": 5, "tid": 7, "ts": 1716454218065682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021313, "dur": 6, "args": { "External id": 34273, "cbid": 211, "correlation": 34273 } }, { "ph": "s", "id": 34273, "pid": 76337, "tid": -914061504, "ts": 1716454218021313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218065688, "dur": 30, "args": { "External id": 34276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34276, "pid": 5, "tid": 7, "ts": 1716454218065688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021332, "dur": 6, "args": { "External id": 34276, "cbid": 211, "correlation": 34276 } }, { "ph": "s", "id": 34276, "pid": 76337, "tid": -914061504, "ts": 1716454218021332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218065718, "dur": 20, "args": { "External id": 34285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34285, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34285, "pid": 5, "tid": 7, "ts": 1716454218065718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021373, "dur": 10, "args": { "External id": 34285, "cbid": 211, "correlation": 34285 } }, { "ph": "s", "id": 34285, "pid": 76337, "tid": -914061504, "ts": 1716454218021373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218021437, "dur": 0, "args": { "External id": 34295, "cbid": 317, "correlation": 34295 } }, { "ph": "f", "id": 34295, "pid": 76337, "tid": -914061504, "ts": 1716454218021437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218021438, "dur": 0, "args": { "External id": 34296, "cbid": 203, "correlation": 34296 } }, { "ph": "f", "id": 34296, "pid": 76337, "tid": -914061504, "ts": 1716454218021438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218021439, "dur": 0, "args": { "External id": 34297, "cbid": 205, "correlation": 34297 } }, { "ph": "f", "id": 34297, "pid": 76337, "tid": -914061504, "ts": 1716454218021439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065740, "dur": 23, "args": { "External id": 34301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34301, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34301, "pid": 5, "tid": 7, "ts": 1716454218065740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021455, "dur": 12, "args": { "External id": 34301, "cbid": 211, "correlation": 34301 } }, { "ph": "s", "id": 34301, "pid": 76337, "tid": -914061504, "ts": 1716454218021455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218065764, "dur": 43, "args": { "External id": 34303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34303, "pid": 5, "tid": 7, "ts": 1716454218065764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021470, "dur": 5, "args": { "External id": 34303, "cbid": 211, "correlation": 34303 } }, { "ph": "s", "id": 34303, "pid": 76337, "tid": -914061504, "ts": 1716454218021470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218065809, "dur": 651, "args": { "External id": 34305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34305, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34305, "pid": 5, "tid": 7, "ts": 1716454218065809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021484, "dur": 9, "args": { "External id": 34305, "cbid": 211, "correlation": 34305 } }, { "ph": "s", "id": 34305, "pid": 76337, "tid": -914061504, "ts": 1716454218021484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218066461, "dur": 20, "args": { "External id": 34307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34307, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34307, "pid": 5, "tid": 7, "ts": 1716454218066461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021496, "dur": 5, "args": { "External id": 34307, "cbid": 211, "correlation": 34307 } }, { "ph": "s", "id": 34307, "pid": 76337, "tid": -914061504, "ts": 1716454218021496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218066483, "dur": 33, "args": { "External id": 34313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34313, "pid": 5, "tid": 7, "ts": 1716454218066483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021524, "dur": 9, "args": { "External id": 34313, "cbid": 211, "correlation": 34313 } }, { "ph": "s", "id": 34313, "pid": 76337, "tid": -914061504, "ts": 1716454218021524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218021583, "dur": 0, "args": { "External id": 34323, "cbid": 317, "correlation": 34323 } }, { "ph": "f", "id": 34323, "pid": 76337, "tid": -914061504, "ts": 1716454218021583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218021584, "dur": 0, "args": { "External id": 34324, "cbid": 203, "correlation": 34324 } }, { "ph": "f", "id": 34324, "pid": 76337, "tid": -914061504, "ts": 1716454218021584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218021585, "dur": 0, "args": { "External id": 34325, "cbid": 205, "correlation": 34325 } }, { "ph": "f", "id": 34325, "pid": 76337, "tid": -914061504, "ts": 1716454218021585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021606, "dur": 1, "args": { "External id": 34329, "cbid": 251, "correlation": 34329 } }, { "ph": "f", "id": 34329, "pid": 76337, "tid": -914061504, "ts": 1716454218021606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021608, "dur": 0, "args": { "External id": 34330, "cbid": 251, "correlation": 34330 } }, { "ph": "f", "id": 34330, "pid": 76337, "tid": -914061504, "ts": 1716454218021608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021609, "dur": 0, "args": { "External id": 34331, "cbid": 251, "correlation": 34331 } }, { "ph": "f", "id": 34331, "pid": 76337, "tid": -914061504, "ts": 1716454218021609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021609, "dur": 0, "args": { "External id": 34332, "cbid": 251, "correlation": 34332 } }, { "ph": "f", "id": 34332, "pid": 76337, "tid": -914061504, "ts": 1716454218021609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021610, "dur": 0, "args": { "External id": 34333, "cbid": 251, "correlation": 34333 } }, { "ph": "f", "id": 34333, "pid": 76337, "tid": -914061504, "ts": 1716454218021610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021611, "dur": 0, "args": { "External id": 34334, "cbid": 251, "correlation": 34334 } }, { "ph": "f", "id": 34334, "pid": 76337, "tid": -914061504, "ts": 1716454218021611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021612, "dur": 0, "args": { "External id": 34335, "cbid": 251, "correlation": 34335 } }, { "ph": "f", "id": 34335, "pid": 76337, "tid": -914061504, "ts": 1716454218021612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021612, "dur": 0, "args": { "External id": 34336, "cbid": 251, "correlation": 34336 } }, { "ph": "f", "id": 34336, "pid": 76337, "tid": -914061504, "ts": 1716454218021612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218021614, "dur": 0, "args": { "External id": 34337, "cbid": 251, "correlation": 34337 } }, { "ph": "f", "id": 34337, "pid": 76337, "tid": -914061504, "ts": 1716454218021614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218066516, "dur": 50, "args": { "External id": 34338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34338, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 34338, "pid": 5, "tid": 7, "ts": 1716454218066516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021616, "dur": 12, "args": { "External id": 34338, "cbid": 211, "correlation": 34338 } }, { "ph": "s", "id": 34338, "pid": 76337, "tid": -914061504, "ts": 1716454218021616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218066568, "dur": 32, "args": { "External id": 34344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34344, "pid": 5, "tid": 7, "ts": 1716454218066568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021648, "dur": 8, "args": { "External id": 34344, "cbid": 211, "correlation": 34344 } }, { "ph": "s", "id": 34344, "pid": 76337, "tid": -914061504, "ts": 1716454218021648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218066601, "dur": 27, "args": { "External id": 34352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34352, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34352, "pid": 5, "tid": 7, "ts": 1716454218066601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021677, "dur": 8, "args": { "External id": 34352, "cbid": 211, "correlation": 34352 } }, { "ph": "s", "id": 34352, "pid": 76337, "tid": -914061504, "ts": 1716454218021677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218066629, "dur": 20, "args": { "External id": 34360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34360, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34360, "pid": 5, "tid": 7, "ts": 1716454218066629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021707, "dur": 8, "args": { "External id": 34360, "cbid": 211, "correlation": 34360 } }, { "ph": "s", "id": 34360, "pid": 76337, "tid": -914061504, "ts": 1716454218021707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218066651, "dur": 30, "args": { "External id": 34380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34380, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 34380, "pid": 5, "tid": 7, "ts": 1716454218066651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021789, "dur": 13, "args": { "External id": 34380, "cbid": 211, "correlation": 34380 } }, { "ph": "s", "id": 34380, "pid": 76337, "tid": -914061504, "ts": 1716454218021789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218066682, "dur": 4, "args": { "External id": 34392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34392, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 34392, "pid": 5, "tid": 7, "ts": 1716454218066682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021811, "dur": 6, "args": { "External id": 34392, "cbid": 211, "correlation": 34392 } }, { "ph": "s", "id": 34392, "pid": 76337, "tid": -914061504, "ts": 1716454218021811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218066687, "dur": 29, "args": { "External id": 34395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34395, "pid": 5, "tid": 7, "ts": 1716454218066687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021829, "dur": 7, "args": { "External id": 34395, "cbid": 211, "correlation": 34395 } }, { "ph": "s", "id": 34395, "pid": 76337, "tid": -914061504, "ts": 1716454218021829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218021887, "dur": 0, "args": { "External id": 34406, "cbid": 317, "correlation": 34406 } }, { "ph": "f", "id": 34406, "pid": 76337, "tid": -914061504, "ts": 1716454218021887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218021888, "dur": 0, "args": { "External id": 34407, "cbid": 203, "correlation": 34407 } }, { "ph": "f", "id": 34407, "pid": 76337, "tid": -914061504, "ts": 1716454218021888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218021888, "dur": 0, "args": { "External id": 34408, "cbid": 205, "correlation": 34408 } }, { "ph": "f", "id": 34408, "pid": 76337, "tid": -914061504, "ts": 1716454218021888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218066717, "dur": 22, "args": { "External id": 34412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34412, "pid": 5, "tid": 7, "ts": 1716454218066717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021905, "dur": 12, "args": { "External id": 34412, "cbid": 211, "correlation": 34412 } }, { "ph": "s", "id": 34412, "pid": 76337, "tid": -914061504, "ts": 1716454218021905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218066741, "dur": 120, "args": { "External id": 34414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34414, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34414, "pid": 5, "tid": 7, "ts": 1716454218066741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021925, "dur": 8, "args": { "External id": 34414, "cbid": 211, "correlation": 34414 } }, { "ph": "s", "id": 34414, "pid": 76337, "tid": -914061504, "ts": 1716454218021925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218066862, "dur": 22, "args": { "External id": 34416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34416, "pid": 5, "tid": 7, "ts": 1716454218066862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021937, "dur": 5, "args": { "External id": 34416, "cbid": 211, "correlation": 34416 } }, { "ph": "s", "id": 34416, "pid": 76337, "tid": -914061504, "ts": 1716454218021937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218066885, "dur": 32, "args": { "External id": 34422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34422, "pid": 5, "tid": 7, "ts": 1716454218066885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218021965, "dur": 17, "args": { "External id": 34422, "cbid": 211, "correlation": 34422 } }, { "ph": "s", "id": 34422, "pid": 76337, "tid": -914061504, "ts": 1716454218021965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218066918, "dur": 202, "args": { "External id": 34431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34431, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34431, "pid": 5, "tid": 7, "ts": 1716454218066918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022057, "dur": 15, "args": { "External id": 34431, "cbid": 211, "correlation": 34431 } }, { "ph": "s", "id": 34431, "pid": 76337, "tid": -914061504, "ts": 1716454218022057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218067121, "dur": 65, "args": { "External id": 34453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34453, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34453, "pid": 5, "tid": 7, "ts": 1716454218067121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022113, "dur": 10, "args": { "External id": 34453, "cbid": 211, "correlation": 34453 } }, { "ph": "s", "id": 34453, "pid": 76337, "tid": -914061504, "ts": 1716454218022113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022202, "dur": 1, "args": { "External id": 34464, "cbid": 251, "correlation": 34464 } }, { "ph": "f", "id": 34464, "pid": 76337, "tid": -914061504, "ts": 1716454218022202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218067188, "dur": 153, "args": { "External id": 34465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34465, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34465, "pid": 5, "tid": 7, "ts": 1716454218067188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022207, "dur": 14, "args": { "External id": 34465, "cbid": 211, "correlation": 34465 } }, { "ph": "s", "id": 34465, "pid": 76337, "tid": -914061504, "ts": 1716454218022207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022278, "dur": 1, "args": { "External id": 34476, "cbid": 251, "correlation": 34476 } }, { "ph": "f", "id": 34476, "pid": 76337, "tid": -914061504, "ts": 1716454218022278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218067342, "dur": 146, "args": { "External id": 34477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34477, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34477, "pid": 5, "tid": 7, "ts": 1716454218067342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022281, "dur": 11, "args": { "External id": 34477, "cbid": 211, "correlation": 34477 } }, { "ph": "s", "id": 34477, "pid": 76337, "tid": -914061504, "ts": 1716454218022281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022346, "dur": 1, "args": { "External id": 34488, "cbid": 251, "correlation": 34488 } }, { "ph": "f", "id": 34488, "pid": 76337, "tid": -914061504, "ts": 1716454218022346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218067490, "dur": 143, "args": { "External id": 34489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34489, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34489, "pid": 5, "tid": 7, "ts": 1716454218067490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022350, "dur": 11, "args": { "External id": 34489, "cbid": 211, "correlation": 34489 } }, { "ph": "s", "id": 34489, "pid": 76337, "tid": -914061504, "ts": 1716454218022350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218067634, "dur": 1926, "args": { "External id": 34510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34510, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 34510, "pid": 5, "tid": 7, "ts": 1716454218067634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022432, "dur": 13, "args": { "External id": 34510, "cbid": 211, "correlation": 34510 } }, { "ph": "s", "id": 34510, "pid": 76337, "tid": -914061504, "ts": 1716454218022432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022533, "dur": 1, "args": { "External id": 34528, "cbid": 251, "correlation": 34528 } }, { "ph": "f", "id": 34528, "pid": 76337, "tid": -914061504, "ts": 1716454218022533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218069562, "dur": 146, "args": { "External id": 34530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34530, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 34530, "pid": 5, "tid": 7, "ts": 1716454218069562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022539, "dur": 13, "args": { "External id": 34530, "cbid": 211, "correlation": 34530 } }, { "ph": "s", "id": 34530, "pid": 76337, "tid": -914061504, "ts": 1716454218022539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218069709, "dur": 35, "args": { "External id": 34538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34538, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34538, "pid": 5, "tid": 7, "ts": 1716454218069709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022609, "dur": 12, "args": { "External id": 34538, "cbid": 211, "correlation": 34538 } }, { "ph": "s", "id": 34538, "pid": 76337, "tid": -914061504, "ts": 1716454218022609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218069745, "dur": 51, "args": { "External id": 34546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34546, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34546, "pid": 5, "tid": 7, "ts": 1716454218069745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022648, "dur": 9, "args": { "External id": 34546, "cbid": 211, "correlation": 34546 } }, { "ph": "s", "id": 34546, "pid": 76337, "tid": -914061504, "ts": 1716454218022648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218069797, "dur": 30, "args": { "External id": 34557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34557, "pid": 5, "tid": 7, "ts": 1716454218069797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022720, "dur": 13, "args": { "External id": 34557, "cbid": 211, "correlation": 34557 } }, { "ph": "s", "id": 34557, "pid": 76337, "tid": -914061504, "ts": 1716454218022720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218069829, "dur": 34, "args": { "External id": 34579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34579, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34579, "pid": 5, "tid": 7, "ts": 1716454218069829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022752, "dur": 8, "args": { "External id": 34579, "cbid": 211, "correlation": 34579 } }, { "ph": "s", "id": 34579, "pid": 76337, "tid": -914061504, "ts": 1716454218022752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022836, "dur": 1, "args": { "External id": 34590, "cbid": 251, "correlation": 34590 } }, { "ph": "f", "id": 34590, "pid": 76337, "tid": -914061504, "ts": 1716454218022836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218069864, "dur": 91, "args": { "External id": 34591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34591, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34591, "pid": 5, "tid": 7, "ts": 1716454218069864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022842, "dur": 13, "args": { "External id": 34591, "cbid": 211, "correlation": 34591 } }, { "ph": "s", "id": 34591, "pid": 76337, "tid": -914061504, "ts": 1716454218022842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022911, "dur": 1, "args": { "External id": 34602, "cbid": 251, "correlation": 34602 } }, { "ph": "f", "id": 34602, "pid": 76337, "tid": -914061504, "ts": 1716454218022911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022915, "dur": 0, "args": { "External id": 34603, "cbid": 251, "correlation": 34603 } }, { "ph": "f", "id": 34603, "pid": 76337, "tid": -914061504, "ts": 1716454218022915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218069957, "dur": 11, "args": { "External id": 34604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34604, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 34604, "pid": 5, "tid": 7, "ts": 1716454218069957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022916, "dur": 11, "args": { "External id": 34604, "cbid": 211, "correlation": 34604 } }, { "ph": "s", "id": 34604, "pid": 76337, "tid": -914061504, "ts": 1716454218022916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218069969, "dur": 5, "args": { "External id": 34606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34606, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 34606, "pid": 5, "tid": 7, "ts": 1716454218069969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218022932, "dur": 7, "args": { "External id": 34606, "cbid": 211, "correlation": 34606 } }, { "ph": "s", "id": 34606, "pid": 76337, "tid": -914061504, "ts": 1716454218022932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218022998, "dur": 1, "args": { "External id": 34617, "cbid": 251, "correlation": 34617 } }, { "ph": "f", "id": 34617, "pid": 76337, "tid": -914061504, "ts": 1716454218022998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218023002, "dur": 0, "args": { "External id": 34618, "cbid": 251, "correlation": 34618 } }, { "ph": "f", "id": 34618, "pid": 76337, "tid": -914061504, "ts": 1716454218023002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218069975, "dur": 7, "args": { "External id": 34619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34619, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 34619, "pid": 5, "tid": 7, "ts": 1716454218069975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023003, "dur": 13, "args": { "External id": 34619, "cbid": 211, "correlation": 34619 } }, { "ph": "s", "id": 34619, "pid": 76337, "tid": -914061504, "ts": 1716454218023003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218069984, "dur": 3, "args": { "External id": 34621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34621, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 34621, "pid": 5, "tid": 7, "ts": 1716454218069984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023018, "dur": 5, "args": { "External id": 34621, "cbid": 211, "correlation": 34621 } }, { "ph": "s", "id": 34621, "pid": 76337, "tid": -914061504, "ts": 1716454218023018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218069988, "dur": 91, "args": { "External id": 34642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34642, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 34642, "pid": 5, "tid": 7, "ts": 1716454218069988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023091, "dur": 12, "args": { "External id": 34642, "cbid": 211, "correlation": 34642 } }, { "ph": "s", "id": 34642, "pid": 76337, "tid": -914061504, "ts": 1716454218023091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218023188, "dur": 1, "args": { "External id": 34660, "cbid": 251, "correlation": 34660 } }, { "ph": "f", "id": 34660, "pid": 76337, "tid": -914061504, "ts": 1716454218023188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218070080, "dur": 97, "args": { "External id": 34662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34662, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34662, "pid": 5, "tid": 7, "ts": 1716454218070080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023193, "dur": 13, "args": { "External id": 34662, "cbid": 211, "correlation": 34662 } }, { "ph": "s", "id": 34662, "pid": 76337, "tid": -914061504, "ts": 1716454218023193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218070179, "dur": 19, "args": { "External id": 34670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34670, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34670, "pid": 5, "tid": 7, "ts": 1716454218070179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023261, "dur": 13, "args": { "External id": 34670, "cbid": 211, "correlation": 34670 } }, { "ph": "s", "id": 34670, "pid": 76337, "tid": -914061504, "ts": 1716454218023261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218070199, "dur": 36, "args": { "External id": 34678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34678, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34678, "pid": 5, "tid": 7, "ts": 1716454218070199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023303, "dur": 9, "args": { "External id": 34678, "cbid": 211, "correlation": 34678 } }, { "ph": "s", "id": 34678, "pid": 76337, "tid": -914061504, "ts": 1716454218023303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218070237, "dur": 35, "args": { "External id": 34700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34700, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34700, "pid": 5, "tid": 7, "ts": 1716454218070237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023355, "dur": 10, "args": { "External id": 34700, "cbid": 211, "correlation": 34700 } }, { "ph": "s", "id": 34700, "pid": 76337, "tid": -914061504, "ts": 1716454218023355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218023445, "dur": 1, "args": { "External id": 34716, "cbid": 251, "correlation": 34716 } }, { "ph": "f", "id": 34716, "pid": 76337, "tid": -914061504, "ts": 1716454218023445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218023450, "dur": 0, "args": { "External id": 34718, "cbid": 251, "correlation": 34718 } }, { "ph": "f", "id": 34718, "pid": 76337, "tid": -914061504, "ts": 1716454218023450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218070273, "dur": 537, "args": { "External id": 34719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34719, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 34719, "pid": 5, "tid": 7, "ts": 1716454218070273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023453, "dur": 13, "args": { "External id": 34719, "cbid": 211, "correlation": 34719 } }, { "ph": "s", "id": 34719, "pid": 76337, "tid": -914061504, "ts": 1716454218023453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218070811, "dur": 127, "args": { "External id": 34727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34727, "pid": 5, "tid": 7, "ts": 1716454218070811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023517, "dur": 13, "args": { "External id": 34727, "cbid": 211, "correlation": 34727 } }, { "ph": "s", "id": 34727, "pid": 76337, "tid": -914061504, "ts": 1716454218023517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218070939, "dur": 128, "args": { "External id": 34735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34735, "pid": 5, "tid": 7, "ts": 1716454218070939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023549, "dur": 8, "args": { "External id": 34735, "cbid": 211, "correlation": 34735 } }, { "ph": "s", "id": 34735, "pid": 76337, "tid": -914061504, "ts": 1716454218023549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218023628, "dur": 1, "args": { "External id": 34751, "cbid": 251, "correlation": 34751 } }, { "ph": "f", "id": 34751, "pid": 76337, "tid": -914061504, "ts": 1716454218023628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218071068, "dur": 304, "args": { "External id": 34753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34753, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34753, "pid": 5, "tid": 7, "ts": 1716454218071068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023634, "dur": 12, "args": { "External id": 34753, "cbid": 211, "correlation": 34753 } }, { "ph": "s", "id": 34753, "pid": 76337, "tid": -914061504, "ts": 1716454218023634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218071373, "dur": 27, "args": { "External id": 34761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34761, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34761, "pid": 5, "tid": 7, "ts": 1716454218071373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023675, "dur": 10, "args": { "External id": 34761, "cbid": 211, "correlation": 34761 } }, { "ph": "s", "id": 34761, "pid": 76337, "tid": -914061504, "ts": 1716454218023675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218071402, "dur": 81, "args": { "External id": 34772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34772, "pid": 5, "tid": 7, "ts": 1716454218071402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023744, "dur": 12, "args": { "External id": 34772, "cbid": 211, "correlation": 34772 } }, { "ph": "s", "id": 34772, "pid": 76337, "tid": -914061504, "ts": 1716454218023744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218023807, "dur": 0, "args": { "External id": 34784, "cbid": 317, "correlation": 34784 } }, { "ph": "f", "id": 34784, "pid": 76337, "tid": -914061504, "ts": 1716454218023807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218023808, "dur": 0, "args": { "External id": 34785, "cbid": 203, "correlation": 34785 } }, { "ph": "f", "id": 34785, "pid": 76337, "tid": -914061504, "ts": 1716454218023808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218023809, "dur": 0, "args": { "External id": 34786, "cbid": 205, "correlation": 34786 } }, { "ph": "f", "id": 34786, "pid": 76337, "tid": -914061504, "ts": 1716454218023809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218071484, "dur": 23, "args": { "External id": 34790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34790, "pid": 5, "tid": 7, "ts": 1716454218071484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023823, "dur": 11, "args": { "External id": 34790, "cbid": 211, "correlation": 34790 } }, { "ph": "s", "id": 34790, "pid": 76337, "tid": -914061504, "ts": 1716454218023823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218071508, "dur": 118, "args": { "External id": 34792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34792, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34792, "pid": 5, "tid": 7, "ts": 1716454218071508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023842, "dur": 7, "args": { "External id": 34792, "cbid": 211, "correlation": 34792 } }, { "ph": "s", "id": 34792, "pid": 76337, "tid": -914061504, "ts": 1716454218023842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218071627, "dur": 22, "args": { "External id": 34794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34794, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34794, "pid": 5, "tid": 7, "ts": 1716454218071627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023853, "dur": 5, "args": { "External id": 34794, "cbid": 211, "correlation": 34794 } }, { "ph": "s", "id": 34794, "pid": 76337, "tid": -914061504, "ts": 1716454218023853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218071651, "dur": 33, "args": { "External id": 34800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34800, "pid": 5, "tid": 7, "ts": 1716454218071651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023880, "dur": 8, "args": { "External id": 34800, "cbid": 211, "correlation": 34800 } }, { "ph": "s", "id": 34800, "pid": 76337, "tid": -914061504, "ts": 1716454218023880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218071685, "dur": 27, "args": { "External id": 34808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34808, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34808, "pid": 5, "tid": 7, "ts": 1716454218071685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023911, "dur": 8, "args": { "External id": 34808, "cbid": 211, "correlation": 34808 } }, { "ph": "s", "id": 34808, "pid": 76337, "tid": -914061504, "ts": 1716454218023911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218071712, "dur": 31, "args": { "External id": 34828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34828, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 34828, "pid": 5, "tid": 7, "ts": 1716454218071712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218023991, "dur": 13, "args": { "External id": 34828, "cbid": 211, "correlation": 34828 } }, { "ph": "s", "id": 34828, "pid": 76337, "tid": -914061504, "ts": 1716454218023991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218071744, "dur": 5, "args": { "External id": 34840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34840, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 34840, "pid": 5, "tid": 7, "ts": 1716454218071744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024014, "dur": 6, "args": { "External id": 34840, "cbid": 211, "correlation": 34840 } }, { "ph": "s", "id": 34840, "pid": 76337, "tid": -914061504, "ts": 1716454218024014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218071750, "dur": 32, "args": { "External id": 34843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34843, "pid": 5, "tid": 7, "ts": 1716454218071750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024031, "dur": 6, "args": { "External id": 34843, "cbid": 211, "correlation": 34843 } }, { "ph": "s", "id": 34843, "pid": 76337, "tid": -914061504, "ts": 1716454218024031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218071783, "dur": 21, "args": { "External id": 34852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34852, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34852, "pid": 5, "tid": 7, "ts": 1716454218071783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024071, "dur": 9, "args": { "External id": 34852, "cbid": 211, "correlation": 34852 } }, { "ph": "s", "id": 34852, "pid": 76337, "tid": -914061504, "ts": 1716454218024071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218024121, "dur": 0, "args": { "External id": 34862, "cbid": 317, "correlation": 34862 } }, { "ph": "f", "id": 34862, "pid": 76337, "tid": -914061504, "ts": 1716454218024121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218024122, "dur": 0, "args": { "External id": 34863, "cbid": 203, "correlation": 34863 } }, { "ph": "f", "id": 34863, "pid": 76337, "tid": -914061504, "ts": 1716454218024122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218024123, "dur": 0, "args": { "External id": 34864, "cbid": 205, "correlation": 34864 } }, { "ph": "f", "id": 34864, "pid": 76337, "tid": -914061504, "ts": 1716454218024123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218071806, "dur": 23, "args": { "External id": 34868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34868, "pid": 5, "tid": 7, "ts": 1716454218071806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024136, "dur": 11, "args": { "External id": 34868, "cbid": 211, "correlation": 34868 } }, { "ph": "s", "id": 34868, "pid": 76337, "tid": -914061504, "ts": 1716454218024136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218071830, "dur": 44, "args": { "External id": 34870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34870, "pid": 5, "tid": 7, "ts": 1716454218071830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024150, "dur": 6, "args": { "External id": 34870, "cbid": 211, "correlation": 34870 } }, { "ph": "s", "id": 34870, "pid": 76337, "tid": -914061504, "ts": 1716454218024150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218071876, "dur": 650, "args": { "External id": 34872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34872, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34872, "pid": 5, "tid": 7, "ts": 1716454218071876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024162, "dur": 6, "args": { "External id": 34872, "cbid": 211, "correlation": 34872 } }, { "ph": "s", "id": 34872, "pid": 76337, "tid": -914061504, "ts": 1716454218024162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218072527, "dur": 22, "args": { "External id": 34874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34874, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34874, "pid": 5, "tid": 7, "ts": 1716454218072527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024172, "dur": 5, "args": { "External id": 34874, "cbid": 211, "correlation": 34874 } }, { "ph": "s", "id": 34874, "pid": 76337, "tid": -914061504, "ts": 1716454218024172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218072550, "dur": 32, "args": { "External id": 34880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34880, "pid": 5, "tid": 7, "ts": 1716454218072550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024198, "dur": 9, "args": { "External id": 34880, "cbid": 211, "correlation": 34880 } }, { "ph": "s", "id": 34880, "pid": 76337, "tid": -914061504, "ts": 1716454218024198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218072584, "dur": 3, "args": { "External id": 34888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34888, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 34888, "pid": 5, "tid": 7, "ts": 1716454218072584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024242, "dur": 9, "args": { "External id": 34888, "cbid": 211, "correlation": 34888 } }, { "ph": "s", "id": 34888, "pid": 76337, "tid": -914061504, "ts": 1716454218024242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218024306, "dur": 1, "args": { "External id": 34904, "cbid": 251, "correlation": 34904 } }, { "ph": "f", "id": 34904, "pid": 76337, "tid": -914061504, "ts": 1716454218024306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218024311, "dur": 0, "args": { "External id": 34906, "cbid": 251, "correlation": 34906 } }, { "ph": "f", "id": 34906, "pid": 76337, "tid": -914061504, "ts": 1716454218024311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218072589, "dur": 12, "args": { "External id": 34907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34907, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 34907, "pid": 5, "tid": 7, "ts": 1716454218072589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024313, "dur": 11, "args": { "External id": 34907, "cbid": 211, "correlation": 34907 } }, { "ph": "s", "id": 34907, "pid": 76337, "tid": -914061504, "ts": 1716454218024313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218072602, "dur": 5, "args": { "External id": 34909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34909, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 34909, "pid": 5, "tid": 7, "ts": 1716454218072602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024326, "dur": 6, "args": { "External id": 34909, "cbid": 211, "correlation": 34909 } }, { "ph": "s", "id": 34909, "pid": 76337, "tid": -914061504, "ts": 1716454218024326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218072609, "dur": 29, "args": { "External id": 34919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34919, "pid": 5, "tid": 7, "ts": 1716454218072609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024384, "dur": 12, "args": { "External id": 34919, "cbid": 211, "correlation": 34919 } }, { "ph": "s", "id": 34919, "pid": 76337, "tid": -914061504, "ts": 1716454218024384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218072639, "dur": 31, "args": { "External id": 34939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34939, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 34939, "pid": 5, "tid": 7, "ts": 1716454218072639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024450, "dur": 10, "args": { "External id": 34939, "cbid": 211, "correlation": 34939 } }, { "ph": "s", "id": 34939, "pid": 76337, "tid": -914061504, "ts": 1716454218024450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218072671, "dur": 4, "args": { "External id": 34951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34951, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 34951, "pid": 5, "tid": 7, "ts": 1716454218072671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024470, "dur": 7, "args": { "External id": 34951, "cbid": 211, "correlation": 34951 } }, { "ph": "s", "id": 34951, "pid": 76337, "tid": -914061504, "ts": 1716454218024470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218072676, "dur": 29, "args": { "External id": 34954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34954, "pid": 5, "tid": 7, "ts": 1716454218072676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024489, "dur": 6, "args": { "External id": 34954, "cbid": 211, "correlation": 34954 } }, { "ph": "s", "id": 34954, "pid": 76337, "tid": -914061504, "ts": 1716454218024489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218072706, "dur": 20, "args": { "External id": 34963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34963, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34963, "pid": 5, "tid": 7, "ts": 1716454218072706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024529, "dur": 10, "args": { "External id": 34963, "cbid": 211, "correlation": 34963 } }, { "ph": "s", "id": 34963, "pid": 76337, "tid": -914061504, "ts": 1716454218024529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218024591, "dur": 0, "args": { "External id": 34973, "cbid": 317, "correlation": 34973 } }, { "ph": "f", "id": 34973, "pid": 76337, "tid": -914061504, "ts": 1716454218024591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218024592, "dur": 0, "args": { "External id": 34974, "cbid": 203, "correlation": 34974 } }, { "ph": "f", "id": 34974, "pid": 76337, "tid": -914061504, "ts": 1716454218024592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218024592, "dur": 0, "args": { "External id": 34975, "cbid": 205, "correlation": 34975 } }, { "ph": "f", "id": 34975, "pid": 76337, "tid": -914061504, "ts": 1716454218024592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218072728, "dur": 24, "args": { "External id": 34979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34979, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34979, "pid": 5, "tid": 7, "ts": 1716454218072728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024606, "dur": 12, "args": { "External id": 34979, "cbid": 211, "correlation": 34979 } }, { "ph": "s", "id": 34979, "pid": 76337, "tid": -914061504, "ts": 1716454218024606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218072752, "dur": 44, "args": { "External id": 34981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34981, "pid": 5, "tid": 7, "ts": 1716454218072752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024620, "dur": 5, "args": { "External id": 34981, "cbid": 211, "correlation": 34981 } }, { "ph": "s", "id": 34981, "pid": 76337, "tid": -914061504, "ts": 1716454218024620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218072797, "dur": 641, "args": { "External id": 34983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34983, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 34983, "pid": 5, "tid": 7, "ts": 1716454218072797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024632, "dur": 6, "args": { "External id": 34983, "cbid": 211, "correlation": 34983 } }, { "ph": "s", "id": 34983, "pid": 76337, "tid": -914061504, "ts": 1716454218024632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218073440, "dur": 22, "args": { "External id": 34985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34985, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34985, "pid": 5, "tid": 7, "ts": 1716454218073440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024641, "dur": 5, "args": { "External id": 34985, "cbid": 211, "correlation": 34985 } }, { "ph": "s", "id": 34985, "pid": 76337, "tid": -914061504, "ts": 1716454218024641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218073463, "dur": 33, "args": { "External id": 34991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34991, "pid": 5, "tid": 7, "ts": 1716454218073463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024668, "dur": 8, "args": { "External id": 34991, "cbid": 211, "correlation": 34991 } }, { "ph": "s", "id": 34991, "pid": 76337, "tid": -914061504, "ts": 1716454218024668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218073497, "dur": 27, "args": { "External id": 34999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 34999, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 34999, "pid": 5, "tid": 7, "ts": 1716454218073497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024701, "dur": 8, "args": { "External id": 34999, "cbid": 211, "correlation": 34999 } }, { "ph": "s", "id": 34999, "pid": 76337, "tid": -914061504, "ts": 1716454218024701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218073524, "dur": 20, "args": { "External id": 35007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35007, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35007, "pid": 5, "tid": 7, "ts": 1716454218073524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024729, "dur": 8, "args": { "External id": 35007, "cbid": 211, "correlation": 35007 } }, { "ph": "s", "id": 35007, "pid": 76337, "tid": -914061504, "ts": 1716454218024729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218073545, "dur": 29, "args": { "External id": 35027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35027, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 35027, "pid": 5, "tid": 7, "ts": 1716454218073545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024808, "dur": 12, "args": { "External id": 35027, "cbid": 211, "correlation": 35027 } }, { "ph": "s", "id": 35027, "pid": 76337, "tid": -914061504, "ts": 1716454218024808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218073576, "dur": 4, "args": { "External id": 35039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35039, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 35039, "pid": 5, "tid": 7, "ts": 1716454218073576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024830, "dur": 6, "args": { "External id": 35039, "cbid": 211, "correlation": 35039 } }, { "ph": "s", "id": 35039, "pid": 76337, "tid": -914061504, "ts": 1716454218024830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218073581, "dur": 30, "args": { "External id": 35042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35042, "pid": 5, "tid": 7, "ts": 1716454218073581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024848, "dur": 6, "args": { "External id": 35042, "cbid": 211, "correlation": 35042 } }, { "ph": "s", "id": 35042, "pid": 76337, "tid": -914061504, "ts": 1716454218024848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218024905, "dur": 0, "args": { "External id": 35053, "cbid": 317, "correlation": 35053 } }, { "ph": "f", "id": 35053, "pid": 76337, "tid": -914061504, "ts": 1716454218024905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218024906, "dur": 0, "args": { "External id": 35054, "cbid": 203, "correlation": 35054 } }, { "ph": "f", "id": 35054, "pid": 76337, "tid": -914061504, "ts": 1716454218024906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218024907, "dur": 0, "args": { "External id": 35055, "cbid": 205, "correlation": 35055 } }, { "ph": "f", "id": 35055, "pid": 76337, "tid": -914061504, "ts": 1716454218024907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218073612, "dur": 22, "args": { "External id": 35059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35059, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35059, "pid": 5, "tid": 7, "ts": 1716454218073612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024919, "dur": 12, "args": { "External id": 35059, "cbid": 211, "correlation": 35059 } }, { "ph": "s", "id": 35059, "pid": 76337, "tid": -914061504, "ts": 1716454218024919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218073635, "dur": 115, "args": { "External id": 35061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35061, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35061, "pid": 5, "tid": 7, "ts": 1716454218073635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024937, "dur": 7, "args": { "External id": 35061, "cbid": 211, "correlation": 35061 } }, { "ph": "s", "id": 35061, "pid": 76337, "tid": -914061504, "ts": 1716454218024937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218073751, "dur": 20, "args": { "External id": 35063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35063, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35063, "pid": 5, "tid": 7, "ts": 1716454218073751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024948, "dur": 5, "args": { "External id": 35063, "cbid": 211, "correlation": 35063 } }, { "ph": "s", "id": 35063, "pid": 76337, "tid": -914061504, "ts": 1716454218024948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218073773, "dur": 32, "args": { "External id": 35069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35069, "pid": 5, "tid": 7, "ts": 1716454218073773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218024982, "dur": 9, "args": { "External id": 35069, "cbid": 211, "correlation": 35069 } }, { "ph": "s", "id": 35069, "pid": 76337, "tid": -914061504, "ts": 1716454218024982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218073806, "dur": 204, "args": { "External id": 35078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35078, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35078, "pid": 5, "tid": 7, "ts": 1716454218073806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025065, "dur": 14, "args": { "External id": 35078, "cbid": 211, "correlation": 35078 } }, { "ph": "s", "id": 35078, "pid": 76337, "tid": -914061504, "ts": 1716454218025065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218074012, "dur": 64, "args": { "External id": 35100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35100, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35100, "pid": 5, "tid": 7, "ts": 1716454218074012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025123, "dur": 10, "args": { "External id": 35100, "cbid": 211, "correlation": 35100 } }, { "ph": "s", "id": 35100, "pid": 76337, "tid": -914061504, "ts": 1716454218025123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025208, "dur": 1, "args": { "External id": 35111, "cbid": 251, "correlation": 35111 } }, { "ph": "f", "id": 35111, "pid": 76337, "tid": -914061504, "ts": 1716454218025208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218074077, "dur": 154, "args": { "External id": 35112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35112, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35112, "pid": 5, "tid": 7, "ts": 1716454218074077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025213, "dur": 14, "args": { "External id": 35112, "cbid": 211, "correlation": 35112 } }, { "ph": "s", "id": 35112, "pid": 76337, "tid": -914061504, "ts": 1716454218025213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025283, "dur": 1, "args": { "External id": 35123, "cbid": 251, "correlation": 35123 } }, { "ph": "f", "id": 35123, "pid": 76337, "tid": -914061504, "ts": 1716454218025283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218074232, "dur": 145, "args": { "External id": 35124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35124, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35124, "pid": 5, "tid": 7, "ts": 1716454218074232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025286, "dur": 11, "args": { "External id": 35124, "cbid": 211, "correlation": 35124 } }, { "ph": "s", "id": 35124, "pid": 76337, "tid": -914061504, "ts": 1716454218025286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025351, "dur": 1, "args": { "External id": 35135, "cbid": 251, "correlation": 35135 } }, { "ph": "f", "id": 35135, "pid": 76337, "tid": -914061504, "ts": 1716454218025351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218074379, "dur": 144, "args": { "External id": 35136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35136, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35136, "pid": 5, "tid": 7, "ts": 1716454218074379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025355, "dur": 11, "args": { "External id": 35136, "cbid": 211, "correlation": 35136 } }, { "ph": "s", "id": 35136, "pid": 76337, "tid": -914061504, "ts": 1716454218025355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218074524, "dur": 1916, "args": { "External id": 35157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35157, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 35157, "pid": 5, "tid": 7, "ts": 1716454218074524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025434, "dur": 12, "args": { "External id": 35157, "cbid": 211, "correlation": 35157 } }, { "ph": "s", "id": 35157, "pid": 76337, "tid": -914061504, "ts": 1716454218025434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025529, "dur": 1, "args": { "External id": 35175, "cbid": 251, "correlation": 35175 } }, { "ph": "f", "id": 35175, "pid": 76337, "tid": -914061504, "ts": 1716454218025529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218076441, "dur": 145, "args": { "External id": 35177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35177, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 35177, "pid": 5, "tid": 7, "ts": 1716454218076441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025535, "dur": 14, "args": { "External id": 35177, "cbid": 211, "correlation": 35177 } }, { "ph": "s", "id": 35177, "pid": 76337, "tid": -914061504, "ts": 1716454218025535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218076587, "dur": 35, "args": { "External id": 35185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35185, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35185, "pid": 5, "tid": 7, "ts": 1716454218076587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025605, "dur": 12, "args": { "External id": 35185, "cbid": 211, "correlation": 35185 } }, { "ph": "s", "id": 35185, "pid": 76337, "tid": -914061504, "ts": 1716454218025605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218076624, "dur": 51, "args": { "External id": 35193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35193, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35193, "pid": 5, "tid": 7, "ts": 1716454218076624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025644, "dur": 9, "args": { "External id": 35193, "cbid": 211, "correlation": 35193 } }, { "ph": "s", "id": 35193, "pid": 76337, "tid": -914061504, "ts": 1716454218025644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218076676, "dur": 30, "args": { "External id": 35204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35204, "pid": 5, "tid": 7, "ts": 1716454218076676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025715, "dur": 12, "args": { "External id": 35204, "cbid": 211, "correlation": 35204 } }, { "ph": "s", "id": 35204, "pid": 76337, "tid": -914061504, "ts": 1716454218025715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218076707, "dur": 35, "args": { "External id": 35226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35226, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35226, "pid": 5, "tid": 7, "ts": 1716454218076707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025745, "dur": 7, "args": { "External id": 35226, "cbid": 211, "correlation": 35226 } }, { "ph": "s", "id": 35226, "pid": 76337, "tid": -914061504, "ts": 1716454218025745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025829, "dur": 1, "args": { "External id": 35237, "cbid": 251, "correlation": 35237 } }, { "ph": "f", "id": 35237, "pid": 76337, "tid": -914061504, "ts": 1716454218025829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218076743, "dur": 88, "args": { "External id": 35238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35238, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35238, "pid": 5, "tid": 7, "ts": 1716454218076743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025834, "dur": 13, "args": { "External id": 35238, "cbid": 211, "correlation": 35238 } }, { "ph": "s", "id": 35238, "pid": 76337, "tid": -914061504, "ts": 1716454218025834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025902, "dur": 1, "args": { "External id": 35249, "cbid": 251, "correlation": 35249 } }, { "ph": "f", "id": 35249, "pid": 76337, "tid": -914061504, "ts": 1716454218025902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025906, "dur": 0, "args": { "External id": 35250, "cbid": 251, "correlation": 35250 } }, { "ph": "f", "id": 35250, "pid": 76337, "tid": -914061504, "ts": 1716454218025906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218076833, "dur": 11, "args": { "External id": 35251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35251, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 35251, "pid": 5, "tid": 7, "ts": 1716454218076833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025907, "dur": 12, "args": { "External id": 35251, "cbid": 211, "correlation": 35251 } }, { "ph": "s", "id": 35251, "pid": 76337, "tid": -914061504, "ts": 1716454218025907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218076845, "dur": 5, "args": { "External id": 35253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35253, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 35253, "pid": 5, "tid": 7, "ts": 1716454218076845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025921, "dur": 6, "args": { "External id": 35253, "cbid": 211, "correlation": 35253 } }, { "ph": "s", "id": 35253, "pid": 76337, "tid": -914061504, "ts": 1716454218025921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025985, "dur": 1, "args": { "External id": 35264, "cbid": 251, "correlation": 35264 } }, { "ph": "f", "id": 35264, "pid": 76337, "tid": -914061504, "ts": 1716454218025985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218025988, "dur": 0, "args": { "External id": 35265, "cbid": 251, "correlation": 35265 } }, { "ph": "f", "id": 35265, "pid": 76337, "tid": -914061504, "ts": 1716454218025988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218076851, "dur": 7, "args": { "External id": 35266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35266, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 35266, "pid": 5, "tid": 7, "ts": 1716454218076851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218025990, "dur": 12, "args": { "External id": 35266, "cbid": 211, "correlation": 35266 } }, { "ph": "s", "id": 35266, "pid": 76337, "tid": -914061504, "ts": 1716454218025990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218076859, "dur": 3, "args": { "External id": 35268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35268, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 35268, "pid": 5, "tid": 7, "ts": 1716454218076859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026004, "dur": 6, "args": { "External id": 35268, "cbid": 211, "correlation": 35268 } }, { "ph": "s", "id": 35268, "pid": 76337, "tid": -914061504, "ts": 1716454218026004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218076864, "dur": 91, "args": { "External id": 35289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35289, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 35289, "pid": 5, "tid": 7, "ts": 1716454218076864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026079, "dur": 12, "args": { "External id": 35289, "cbid": 211, "correlation": 35289 } }, { "ph": "s", "id": 35289, "pid": 76337, "tid": -914061504, "ts": 1716454218026079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218026175, "dur": 1, "args": { "External id": 35307, "cbid": 251, "correlation": 35307 } }, { "ph": "f", "id": 35307, "pid": 76337, "tid": -914061504, "ts": 1716454218026175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218076956, "dur": 96, "args": { "External id": 35309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35309, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35309, "pid": 5, "tid": 7, "ts": 1716454218076956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026182, "dur": 14, "args": { "External id": 35309, "cbid": 211, "correlation": 35309 } }, { "ph": "s", "id": 35309, "pid": 76337, "tid": -914061504, "ts": 1716454218026182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218077053, "dur": 19, "args": { "External id": 35317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35317, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35317, "pid": 5, "tid": 7, "ts": 1716454218077053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026251, "dur": 12, "args": { "External id": 35317, "cbid": 211, "correlation": 35317 } }, { "ph": "s", "id": 35317, "pid": 76337, "tid": -914061504, "ts": 1716454218026251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218077073, "dur": 36, "args": { "External id": 35325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35325, "pid": 5, "tid": 7, "ts": 1716454218077073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026293, "dur": 9, "args": { "External id": 35325, "cbid": 211, "correlation": 35325 } }, { "ph": "s", "id": 35325, "pid": 76337, "tid": -914061504, "ts": 1716454218026293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218077110, "dur": 34, "args": { "External id": 35347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35347, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35347, "pid": 5, "tid": 7, "ts": 1716454218077110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026344, "dur": 10, "args": { "External id": 35347, "cbid": 211, "correlation": 35347 } }, { "ph": "s", "id": 35347, "pid": 76337, "tid": -914061504, "ts": 1716454218026344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218026432, "dur": 1, "args": { "External id": 35363, "cbid": 251, "correlation": 35363 } }, { "ph": "f", "id": 35363, "pid": 76337, "tid": -914061504, "ts": 1716454218026432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218026437, "dur": 0, "args": { "External id": 35365, "cbid": 251, "correlation": 35365 } }, { "ph": "f", "id": 35365, "pid": 76337, "tid": -914061504, "ts": 1716454218026437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218077146, "dur": 531, "args": { "External id": 35366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35366, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 35366, "pid": 5, "tid": 7, "ts": 1716454218077146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026440, "dur": 13, "args": { "External id": 35366, "cbid": 211, "correlation": 35366 } }, { "ph": "s", "id": 35366, "pid": 76337, "tid": -914061504, "ts": 1716454218026440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218077678, "dur": 122, "args": { "External id": 35374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35374, "pid": 5, "tid": 7, "ts": 1716454218077678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026506, "dur": 12, "args": { "External id": 35374, "cbid": 211, "correlation": 35374 } }, { "ph": "s", "id": 35374, "pid": 76337, "tid": -914061504, "ts": 1716454218026506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218077802, "dur": 128, "args": { "External id": 35382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35382, "pid": 5, "tid": 7, "ts": 1716454218077802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026535, "dur": 8, "args": { "External id": 35382, "cbid": 211, "correlation": 35382 } }, { "ph": "s", "id": 35382, "pid": 76337, "tid": -914061504, "ts": 1716454218026535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218026613, "dur": 1, "args": { "External id": 35398, "cbid": 251, "correlation": 35398 } }, { "ph": "f", "id": 35398, "pid": 76337, "tid": -914061504, "ts": 1716454218026613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218077931, "dur": 302, "args": { "External id": 35400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35400, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35400, "pid": 5, "tid": 7, "ts": 1716454218077931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026618, "dur": 12, "args": { "External id": 35400, "cbid": 211, "correlation": 35400 } }, { "ph": "s", "id": 35400, "pid": 76337, "tid": -914061504, "ts": 1716454218026618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218078235, "dur": 27, "args": { "External id": 35408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35408, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35408, "pid": 5, "tid": 7, "ts": 1716454218078235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026659, "dur": 11, "args": { "External id": 35408, "cbid": 211, "correlation": 35408 } }, { "ph": "s", "id": 35408, "pid": 76337, "tid": -914061504, "ts": 1716454218026659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218078263, "dur": 81, "args": { "External id": 35419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35419, "pid": 5, "tid": 7, "ts": 1716454218078263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026727, "dur": 12, "args": { "External id": 35419, "cbid": 211, "correlation": 35419 } }, { "ph": "s", "id": 35419, "pid": 76337, "tid": -914061504, "ts": 1716454218026727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218026793, "dur": 0, "args": { "External id": 35431, "cbid": 317, "correlation": 35431 } }, { "ph": "f", "id": 35431, "pid": 76337, "tid": -914061504, "ts": 1716454218026793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218026793, "dur": 0, "args": { "External id": 35432, "cbid": 203, "correlation": 35432 } }, { "ph": "f", "id": 35432, "pid": 76337, "tid": -914061504, "ts": 1716454218026793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218026794, "dur": 0, "args": { "External id": 35433, "cbid": 205, "correlation": 35433 } }, { "ph": "f", "id": 35433, "pid": 76337, "tid": -914061504, "ts": 1716454218026794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078345, "dur": 23, "args": { "External id": 35437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35437, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35437, "pid": 5, "tid": 7, "ts": 1716454218078345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026809, "dur": 13, "args": { "External id": 35437, "cbid": 211, "correlation": 35437 } }, { "ph": "s", "id": 35437, "pid": 76337, "tid": -914061504, "ts": 1716454218026809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218078369, "dur": 119, "args": { "External id": 35439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35439, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35439, "pid": 5, "tid": 7, "ts": 1716454218078369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026828, "dur": 6, "args": { "External id": 35439, "cbid": 211, "correlation": 35439 } }, { "ph": "s", "id": 35439, "pid": 76337, "tid": -914061504, "ts": 1716454218026828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078490, "dur": 22, "args": { "External id": 35441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35441, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35441, "pid": 5, "tid": 7, "ts": 1716454218078490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026838, "dur": 5, "args": { "External id": 35441, "cbid": 211, "correlation": 35441 } }, { "ph": "s", "id": 35441, "pid": 76337, "tid": -914061504, "ts": 1716454218026838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218078513, "dur": 32, "args": { "External id": 35447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35447, "pid": 5, "tid": 7, "ts": 1716454218078513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026866, "dur": 10, "args": { "External id": 35447, "cbid": 211, "correlation": 35447 } }, { "ph": "s", "id": 35447, "pid": 76337, "tid": -914061504, "ts": 1716454218026866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218078547, "dur": 27, "args": { "External id": 35455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35455, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35455, "pid": 5, "tid": 7, "ts": 1716454218078547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026898, "dur": 8, "args": { "External id": 35455, "cbid": 211, "correlation": 35455 } }, { "ph": "s", "id": 35455, "pid": 76337, "tid": -914061504, "ts": 1716454218026898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218026970, "dur": 0, "args": { "External id": 35465, "cbid": 317, "correlation": 35465 } }, { "ph": "f", "id": 35465, "pid": 76337, "tid": -914061504, "ts": 1716454218026970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218026970, "dur": 0, "args": { "External id": 35466, "cbid": 203, "correlation": 35466 } }, { "ph": "f", "id": 35466, "pid": 76337, "tid": -914061504, "ts": 1716454218026970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218026971, "dur": 0, "args": { "External id": 35467, "cbid": 205, "correlation": 35467 } }, { "ph": "f", "id": 35467, "pid": 76337, "tid": -914061504, "ts": 1716454218026971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078575, "dur": 22, "args": { "External id": 35471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35471, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35471, "pid": 5, "tid": 7, "ts": 1716454218078575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218026993, "dur": 13, "args": { "External id": 35471, "cbid": 211, "correlation": 35471 } }, { "ph": "s", "id": 35471, "pid": 76337, "tid": -914061504, "ts": 1716454218026993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078599, "dur": 44, "args": { "External id": 35473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35473, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35473, "pid": 5, "tid": 7, "ts": 1716454218078599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027008, "dur": 5, "args": { "External id": 35473, "cbid": 211, "correlation": 35473 } }, { "ph": "s", "id": 35473, "pid": 76337, "tid": -914061504, "ts": 1716454218027008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218078644, "dur": 232, "args": { "External id": 35475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35475, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 35475, "pid": 5, "tid": 7, "ts": 1716454218078644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027020, "dur": 7, "args": { "External id": 35475, "cbid": 211, "correlation": 35475 } }, { "ph": "s", "id": 35475, "pid": 76337, "tid": -914061504, "ts": 1716454218027020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078877, "dur": 6, "args": { "External id": 35477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35477, "pid": 5, "tid": 7, "ts": 1716454218078877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027031, "dur": 5, "args": { "External id": 35477, "cbid": 211, "correlation": 35477 } }, { "ph": "s", "id": 35477, "pid": 76337, "tid": -914061504, "ts": 1716454218027031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218078885, "dur": 9, "args": { "External id": 35483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35483, "pid": 5, "tid": 7, "ts": 1716454218078885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027057, "dur": 8, "args": { "External id": 35483, "cbid": 211, "correlation": 35483 } }, { "ph": "s", "id": 35483, "pid": 76337, "tid": -914061504, "ts": 1716454218027057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218078895, "dur": 12, "args": { "External id": 35503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35503, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 35503, "pid": 5, "tid": 7, "ts": 1716454218078895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027149, "dur": 12, "args": { "External id": 35503, "cbid": 211, "correlation": 35503 } }, { "ph": "s", "id": 35503, "pid": 76337, "tid": -914061504, "ts": 1716454218027149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218078908, "dur": 5, "args": { "External id": 35515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35515, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 35515, "pid": 5, "tid": 7, "ts": 1716454218078908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027171, "dur": 6, "args": { "External id": 35515, "cbid": 211, "correlation": 35515 } }, { "ph": "s", "id": 35515, "pid": 76337, "tid": -914061504, "ts": 1716454218027171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218078914, "dur": 12, "args": { "External id": 35518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35518, "pid": 5, "tid": 7, "ts": 1716454218078914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027190, "dur": 7, "args": { "External id": 35518, "cbid": 211, "correlation": 35518 } }, { "ph": "s", "id": 35518, "pid": 76337, "tid": -914061504, "ts": 1716454218027190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218078927, "dur": 7, "args": { "External id": 35527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35527, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35527, "pid": 5, "tid": 7, "ts": 1716454218078927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027228, "dur": 10, "args": { "External id": 35527, "cbid": 211, "correlation": 35527 } }, { "ph": "s", "id": 35527, "pid": 76337, "tid": -914061504, "ts": 1716454218027228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218027280, "dur": 0, "args": { "External id": 35537, "cbid": 317, "correlation": 35537 } }, { "ph": "f", "id": 35537, "pid": 76337, "tid": -914061504, "ts": 1716454218027280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218027281, "dur": 0, "args": { "External id": 35538, "cbid": 203, "correlation": 35538 } }, { "ph": "f", "id": 35538, "pid": 76337, "tid": -914061504, "ts": 1716454218027281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218027282, "dur": 0, "args": { "External id": 35539, "cbid": 205, "correlation": 35539 } }, { "ph": "f", "id": 35539, "pid": 76337, "tid": -914061504, "ts": 1716454218027282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078935, "dur": 5, "args": { "External id": 35543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35543, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35543, "pid": 5, "tid": 7, "ts": 1716454218078935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027298, "dur": 12, "args": { "External id": 35543, "cbid": 211, "correlation": 35543 } }, { "ph": "s", "id": 35543, "pid": 76337, "tid": -914061504, "ts": 1716454218027298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218078941, "dur": 82, "args": { "External id": 35545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35545, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35545, "pid": 5, "tid": 7, "ts": 1716454218078941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027312, "dur": 5, "args": { "External id": 35545, "cbid": 211, "correlation": 35545 } }, { "ph": "s", "id": 35545, "pid": 76337, "tid": -914061504, "ts": 1716454218027312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218079026, "dur": 1, "args": { "External id": 35547, "device": 5, "context": 1, "stream": 7, "correlation": 35547, "bytes": 960, "memory bandwidth (GB/s)": 0.5084745762711864 } }, { "ph": "f", "id": 35547, "pid": 5, "tid": 7, "ts": 1716454218079026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218027325, "dur": 9, "args": { "External id": 35547, "cbid": 51, "correlation": 35547 } }, { "ph": "s", "id": 35547, "pid": 76337, "tid": -914061504, "ts": 1716454218027325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218079029, "dur": 533, "args": { "External id": 35548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35548, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35548, "pid": 5, "tid": 7, "ts": 1716454218079029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027335, "dur": 8, "args": { "External id": 35548, "cbid": 211, "correlation": 35548 } }, { "ph": "s", "id": 35548, "pid": 76337, "tid": -914061504, "ts": 1716454218027335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218079564, "dur": 11, "args": { "External id": 35550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35550, "pid": 5, "tid": 7, "ts": 1716454218079564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027348, "dur": 5, "args": { "External id": 35550, "cbid": 211, "correlation": 35550 } }, { "ph": "s", "id": 35550, "pid": 76337, "tid": -914061504, "ts": 1716454218027348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218079577, "dur": 14, "args": { "External id": 35556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35556, "pid": 5, "tid": 7, "ts": 1716454218079577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027375, "dur": 8, "args": { "External id": 35556, "cbid": 211, "correlation": 35556 } }, { "ph": "s", "id": 35556, "pid": 76337, "tid": -914061504, "ts": 1716454218027375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218079592, "dur": 3, "args": { "External id": 35564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35564, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 35564, "pid": 5, "tid": 7, "ts": 1716454218079592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027418, "dur": 9, "args": { "External id": 35564, "cbid": 211, "correlation": 35564 } }, { "ph": "s", "id": 35564, "pid": 76337, "tid": -914061504, "ts": 1716454218027418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218027483, "dur": 1, "args": { "External id": 35580, "cbid": 251, "correlation": 35580 } }, { "ph": "f", "id": 35580, "pid": 76337, "tid": -914061504, "ts": 1716454218027483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218027489, "dur": 0, "args": { "External id": 35582, "cbid": 251, "correlation": 35582 } }, { "ph": "f", "id": 35582, "pid": 76337, "tid": -914061504, "ts": 1716454218027489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218079596, "dur": 13, "args": { "External id": 35583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35583, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35583, "pid": 5, "tid": 7, "ts": 1716454218079596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027491, "dur": 11, "args": { "External id": 35583, "cbid": 211, "correlation": 35583 } }, { "ph": "s", "id": 35583, "pid": 76337, "tid": -914061504, "ts": 1716454218027491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218079611, "dur": 5, "args": { "External id": 35585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35585, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35585, "pid": 5, "tid": 7, "ts": 1716454218079611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027504, "dur": 5, "args": { "External id": 35585, "cbid": 211, "correlation": 35585 } }, { "ph": "s", "id": 35585, "pid": 76337, "tid": -914061504, "ts": 1716454218027504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218079617, "dur": 16, "args": { "External id": 35595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35595, "pid": 5, "tid": 7, "ts": 1716454218079617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027561, "dur": 12, "args": { "External id": 35595, "cbid": 211, "correlation": 35595 } }, { "ph": "s", "id": 35595, "pid": 76337, "tid": -914061504, "ts": 1716454218027561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218079635, "dur": 17, "args": { "External id": 35615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35615, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 35615, "pid": 5, "tid": 7, "ts": 1716454218079635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027627, "dur": 11, "args": { "External id": 35615, "cbid": 211, "correlation": 35615 } }, { "ph": "s", "id": 35615, "pid": 76337, "tid": -914061504, "ts": 1716454218027627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218079653, "dur": 5, "args": { "External id": 35627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35627, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 35627, "pid": 5, "tid": 7, "ts": 1716454218079653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027648, "dur": 6, "args": { "External id": 35627, "cbid": 211, "correlation": 35627 } }, { "ph": "s", "id": 35627, "pid": 76337, "tid": -914061504, "ts": 1716454218027648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218079659, "dur": 16, "args": { "External id": 35630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35630, "pid": 5, "tid": 7, "ts": 1716454218079659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027667, "dur": 6, "args": { "External id": 35630, "cbid": 211, "correlation": 35630 } }, { "ph": "s", "id": 35630, "pid": 76337, "tid": -914061504, "ts": 1716454218027667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218079677, "dur": 11, "args": { "External id": 35639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35639, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35639, "pid": 5, "tid": 7, "ts": 1716454218079677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027708, "dur": 10, "args": { "External id": 35639, "cbid": 211, "correlation": 35639 } }, { "ph": "s", "id": 35639, "pid": 76337, "tid": -914061504, "ts": 1716454218027708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218027770, "dur": 0, "args": { "External id": 35649, "cbid": 317, "correlation": 35649 } }, { "ph": "f", "id": 35649, "pid": 76337, "tid": -914061504, "ts": 1716454218027770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218027771, "dur": 0, "args": { "External id": 35650, "cbid": 203, "correlation": 35650 } }, { "ph": "f", "id": 35650, "pid": 76337, "tid": -914061504, "ts": 1716454218027771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218027772, "dur": 0, "args": { "External id": 35651, "cbid": 205, "correlation": 35651 } }, { "ph": "f", "id": 35651, "pid": 76337, "tid": -914061504, "ts": 1716454218027772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218079689, "dur": 11, "args": { "External id": 35655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35655, "pid": 5, "tid": 7, "ts": 1716454218079689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027789, "dur": 12, "args": { "External id": 35655, "cbid": 211, "correlation": 35655 } }, { "ph": "s", "id": 35655, "pid": 76337, "tid": -914061504, "ts": 1716454218027789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218079701, "dur": 160, "args": { "External id": 35657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35657, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35657, "pid": 5, "tid": 7, "ts": 1716454218079701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027804, "dur": 6, "args": { "External id": 35657, "cbid": 211, "correlation": 35657 } }, { "ph": "s", "id": 35657, "pid": 76337, "tid": -914061504, "ts": 1716454218027804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218079863, "dur": 1, "args": { "External id": 35659, "device": 5, "context": 1, "stream": 7, "correlation": 35659, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 35659, "pid": 5, "tid": 7, "ts": 1716454218079863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218027816, "dur": 7, "args": { "External id": 35659, "cbid": 51, "correlation": 35659 } }, { "ph": "s", "id": 35659, "pid": 76337, "tid": -914061504, "ts": 1716454218027816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218079866, "dur": 654, "args": { "External id": 35660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35660, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35660, "pid": 5, "tid": 7, "ts": 1716454218079866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027825, "dur": 6, "args": { "External id": 35660, "cbid": 211, "correlation": 35660 } }, { "ph": "s", "id": 35660, "pid": 76337, "tid": -914061504, "ts": 1716454218027825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218080522, "dur": 12, "args": { "External id": 35662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35662, "pid": 5, "tid": 7, "ts": 1716454218080522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027835, "dur": 5, "args": { "External id": 35662, "cbid": 211, "correlation": 35662 } }, { "ph": "s", "id": 35662, "pid": 76337, "tid": -914061504, "ts": 1716454218027835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218080535, "dur": 16, "args": { "External id": 35668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35668, "pid": 5, "tid": 7, "ts": 1716454218080535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027864, "dur": 8, "args": { "External id": 35668, "cbid": 211, "correlation": 35668 } }, { "ph": "s", "id": 35668, "pid": 76337, "tid": -914061504, "ts": 1716454218027864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218027922, "dur": 0, "args": { "External id": 35678, "cbid": 317, "correlation": 35678 } }, { "ph": "f", "id": 35678, "pid": 76337, "tid": -914061504, "ts": 1716454218027922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218027922, "dur": 0, "args": { "External id": 35679, "cbid": 203, "correlation": 35679 } }, { "ph": "f", "id": 35679, "pid": 76337, "tid": -914061504, "ts": 1716454218027922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218027923, "dur": 0, "args": { "External id": 35680, "cbid": 205, "correlation": 35680 } }, { "ph": "f", "id": 35680, "pid": 76337, "tid": -914061504, "ts": 1716454218027923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218080552, "dur": 8, "args": { "External id": 35684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35684, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35684, "pid": 5, "tid": 7, "ts": 1716454218080552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027937, "dur": 12, "args": { "External id": 35684, "cbid": 211, "correlation": 35684 } }, { "ph": "s", "id": 35684, "pid": 76337, "tid": -914061504, "ts": 1716454218027937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218080562, "dur": 3, "args": { "External id": 35686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35686, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 35686, "pid": 5, "tid": 7, "ts": 1716454218080562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027954, "dur": 7, "args": { "External id": 35686, "cbid": 211, "correlation": 35686 } }, { "ph": "s", "id": 35686, "pid": 76337, "tid": -914061504, "ts": 1716454218027954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218027964, "dur": 0, "args": { "External id": 35687, "cbid": 51, "correlation": 35687 } }, { "ph": "s", "id": 35687, "pid": 76337, "tid": -914061504, "ts": 1716454218027964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218080566, "dur": 56, "args": { "External id": 35688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35688, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 35688, "pid": 5, "tid": 7, "ts": 1716454218080566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218027965, "dur": 5, "args": { "External id": 35688, "cbid": 211, "correlation": 35688 } }, { "ph": "s", "id": 35688, "pid": 76337, "tid": -914061504, "ts": 1716454218027965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218080623, "dur": 13, "args": { "External id": 35693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35693, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35693, "pid": 5, "tid": 7, "ts": 1716454218080623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028000, "dur": 10, "args": { "External id": 35693, "cbid": 211, "correlation": 35693 } }, { "ph": "s", "id": 35693, "pid": 76337, "tid": -914061504, "ts": 1716454218028000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218080638, "dur": 11, "args": { "External id": 35701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35701, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35701, "pid": 5, "tid": 7, "ts": 1716454218080638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028030, "dur": 8, "args": { "External id": 35701, "cbid": 211, "correlation": 35701 } }, { "ph": "s", "id": 35701, "pid": 76337, "tid": -914061504, "ts": 1716454218028030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218080651, "dur": 10, "args": { "External id": 35709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35709, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35709, "pid": 5, "tid": 7, "ts": 1716454218080651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028059, "dur": 9, "args": { "External id": 35709, "cbid": 211, "correlation": 35709 } }, { "ph": "s", "id": 35709, "pid": 76337, "tid": -914061504, "ts": 1716454218028059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218080662, "dur": 18, "args": { "External id": 35729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35729, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 35729, "pid": 5, "tid": 7, "ts": 1716454218080662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028140, "dur": 12, "args": { "External id": 35729, "cbid": 211, "correlation": 35729 } }, { "ph": "s", "id": 35729, "pid": 76337, "tid": -914061504, "ts": 1716454218028140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218080681, "dur": 4, "args": { "External id": 35741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35741, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 35741, "pid": 5, "tid": 7, "ts": 1716454218080681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028162, "dur": 6, "args": { "External id": 35741, "cbid": 211, "correlation": 35741 } }, { "ph": "s", "id": 35741, "pid": 76337, "tid": -914061504, "ts": 1716454218028162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218080687, "dur": 17, "args": { "External id": 35744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35744, "pid": 5, "tid": 7, "ts": 1716454218080687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028181, "dur": 7, "args": { "External id": 35744, "cbid": 211, "correlation": 35744 } }, { "ph": "s", "id": 35744, "pid": 76337, "tid": -914061504, "ts": 1716454218028181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218028238, "dur": 0, "args": { "External id": 35755, "cbid": 317, "correlation": 35755 } }, { "ph": "f", "id": 35755, "pid": 76337, "tid": -914061504, "ts": 1716454218028238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218028239, "dur": 0, "args": { "External id": 35756, "cbid": 203, "correlation": 35756 } }, { "ph": "f", "id": 35756, "pid": 76337, "tid": -914061504, "ts": 1716454218028239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218028240, "dur": 0, "args": { "External id": 35757, "cbid": 205, "correlation": 35757 } }, { "ph": "f", "id": 35757, "pid": 76337, "tid": -914061504, "ts": 1716454218028240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218080705, "dur": 11, "args": { "External id": 35761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35761, "pid": 5, "tid": 7, "ts": 1716454218080705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028256, "dur": 12, "args": { "External id": 35761, "cbid": 211, "correlation": 35761 } }, { "ph": "s", "id": 35761, "pid": 76337, "tid": -914061504, "ts": 1716454218028256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218080717, "dur": 3, "args": { "External id": 35763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 35763, "pid": 5, "tid": 7, "ts": 1716454218080717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028273, "dur": 6, "args": { "External id": 35763, "cbid": 211, "correlation": 35763 } }, { "ph": "s", "id": 35763, "pid": 76337, "tid": -914061504, "ts": 1716454218028273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218028282, "dur": 0, "args": { "External id": 35764, "cbid": 51, "correlation": 35764 } }, { "ph": "s", "id": 35764, "pid": 76337, "tid": -914061504, "ts": 1716454218028282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218080722, "dur": 94, "args": { "External id": 35765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35765, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 35765, "pid": 5, "tid": 7, "ts": 1716454218080722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028283, "dur": 5, "args": { "External id": 35765, "cbid": 211, "correlation": 35765 } }, { "ph": "s", "id": 35765, "pid": 76337, "tid": -914061504, "ts": 1716454218028283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218080817, "dur": 15, "args": { "External id": 35770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35770, "pid": 5, "tid": 7, "ts": 1716454218080817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028310, "dur": 8, "args": { "External id": 35770, "cbid": 211, "correlation": 35770 } }, { "ph": "s", "id": 35770, "pid": 76337, "tid": -914061504, "ts": 1716454218028310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218080834, "dur": 83, "args": { "External id": 35779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35779, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35779, "pid": 5, "tid": 7, "ts": 1716454218080834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028391, "dur": 14, "args": { "External id": 35779, "cbid": 211, "correlation": 35779 } }, { "ph": "s", "id": 35779, "pid": 76337, "tid": -914061504, "ts": 1716454218028391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218080918, "dur": 30, "args": { "External id": 35801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35801, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35801, "pid": 5, "tid": 7, "ts": 1716454218080918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028449, "dur": 10, "args": { "External id": 35801, "cbid": 211, "correlation": 35801 } }, { "ph": "s", "id": 35801, "pid": 76337, "tid": -914061504, "ts": 1716454218028449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218028538, "dur": 1, "args": { "External id": 35812, "cbid": 251, "correlation": 35812 } }, { "ph": "f", "id": 35812, "pid": 76337, "tid": -914061504, "ts": 1716454218028538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218080950, "dur": 160, "args": { "External id": 35813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35813, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35813, "pid": 5, "tid": 7, "ts": 1716454218080950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028544, "dur": 13, "args": { "External id": 35813, "cbid": 211, "correlation": 35813 } }, { "ph": "s", "id": 35813, "pid": 76337, "tid": -914061504, "ts": 1716454218028544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218028614, "dur": 1, "args": { "External id": 35824, "cbid": 251, "correlation": 35824 } }, { "ph": "f", "id": 35824, "pid": 76337, "tid": -914061504, "ts": 1716454218028614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218081111, "dur": 155, "args": { "External id": 35825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35825, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35825, "pid": 5, "tid": 7, "ts": 1716454218081111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028618, "dur": 11, "args": { "External id": 35825, "cbid": 211, "correlation": 35825 } }, { "ph": "s", "id": 35825, "pid": 76337, "tid": -914061504, "ts": 1716454218028618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218028681, "dur": 1, "args": { "External id": 35836, "cbid": 251, "correlation": 35836 } }, { "ph": "f", "id": 35836, "pid": 76337, "tid": -914061504, "ts": 1716454218028681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218081268, "dur": 155, "args": { "External id": 35837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35837, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35837, "pid": 5, "tid": 7, "ts": 1716454218081268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028685, "dur": 12, "args": { "External id": 35837, "cbid": 211, "correlation": 35837 } }, { "ph": "s", "id": 35837, "pid": 76337, "tid": -914061504, "ts": 1716454218028685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218081425, "dur": 330, "args": { "External id": 35862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35862, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35862, "pid": 5, "tid": 7, "ts": 1716454218081425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028771, "dur": 13, "args": { "External id": 35862, "cbid": 211, "correlation": 35862 } }, { "ph": "s", "id": 35862, "pid": 76337, "tid": -914061504, "ts": 1716454218028771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218028873, "dur": 1, "args": { "External id": 35880, "cbid": 251, "correlation": 35880 } }, { "ph": "f", "id": 35880, "pid": 76337, "tid": -914061504, "ts": 1716454218028873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218081756, "dur": 162, "args": { "External id": 35882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35882, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35882, "pid": 5, "tid": 7, "ts": 1716454218081756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028879, "dur": 13, "args": { "External id": 35882, "cbid": 211, "correlation": 35882 } }, { "ph": "s", "id": 35882, "pid": 76337, "tid": -914061504, "ts": 1716454218028879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218081919, "dur": 19, "args": { "External id": 35890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35890, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35890, "pid": 5, "tid": 7, "ts": 1716454218081919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028948, "dur": 12, "args": { "External id": 35890, "cbid": 211, "correlation": 35890 } }, { "ph": "s", "id": 35890, "pid": 76337, "tid": -914061504, "ts": 1716454218028948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218081939, "dur": 28, "args": { "External id": 35898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35898, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35898, "pid": 5, "tid": 7, "ts": 1716454218081939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218028996, "dur": 10, "args": { "External id": 35898, "cbid": 211, "correlation": 35898 } }, { "ph": "s", "id": 35898, "pid": 76337, "tid": -914061504, "ts": 1716454218028996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218081969, "dur": 18, "args": { "External id": 35909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35909, "pid": 5, "tid": 7, "ts": 1716454218081969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029071, "dur": 13, "args": { "External id": 35909, "cbid": 211, "correlation": 35909 } }, { "ph": "s", "id": 35909, "pid": 76337, "tid": -914061504, "ts": 1716454218029071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218081988, "dur": 16, "args": { "External id": 35931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35931, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 35931, "pid": 5, "tid": 7, "ts": 1716454218081988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029102, "dur": 8, "args": { "External id": 35931, "cbid": 211, "correlation": 35931 } }, { "ph": "s", "id": 35931, "pid": 76337, "tid": -914061504, "ts": 1716454218029102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029187, "dur": 1, "args": { "External id": 35942, "cbid": 251, "correlation": 35942 } }, { "ph": "f", "id": 35942, "pid": 76337, "tid": -914061504, "ts": 1716454218029187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218082005, "dur": 88, "args": { "External id": 35943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35943, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 35943, "pid": 5, "tid": 7, "ts": 1716454218082005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029193, "dur": 15, "args": { "External id": 35943, "cbid": 211, "correlation": 35943 } }, { "ph": "s", "id": 35943, "pid": 76337, "tid": -914061504, "ts": 1716454218029193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029264, "dur": 1, "args": { "External id": 35954, "cbid": 251, "correlation": 35954 } }, { "ph": "f", "id": 35954, "pid": 76337, "tid": -914061504, "ts": 1716454218029264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029268, "dur": 0, "args": { "External id": 35955, "cbid": 251, "correlation": 35955 } }, { "ph": "f", "id": 35955, "pid": 76337, "tid": -914061504, "ts": 1716454218029268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218082094, "dur": 12, "args": { "External id": 35956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35956, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35956, "pid": 5, "tid": 7, "ts": 1716454218082094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029270, "dur": 12, "args": { "External id": 35956, "cbid": 211, "correlation": 35956 } }, { "ph": "s", "id": 35956, "pid": 76337, "tid": -914061504, "ts": 1716454218029270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218082108, "dur": 5, "args": { "External id": 35958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35958, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35958, "pid": 5, "tid": 7, "ts": 1716454218082108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029284, "dur": 6, "args": { "External id": 35958, "cbid": 211, "correlation": 35958 } }, { "ph": "s", "id": 35958, "pid": 76337, "tid": -914061504, "ts": 1716454218029284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029342, "dur": 1, "args": { "External id": 35969, "cbid": 251, "correlation": 35969 } }, { "ph": "f", "id": 35969, "pid": 76337, "tid": -914061504, "ts": 1716454218029342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029345, "dur": 0, "args": { "External id": 35970, "cbid": 251, "correlation": 35970 } }, { "ph": "f", "id": 35970, "pid": 76337, "tid": -914061504, "ts": 1716454218029345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218082114, "dur": 8, "args": { "External id": 35971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35971, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35971, "pid": 5, "tid": 7, "ts": 1716454218082114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029347, "dur": 12, "args": { "External id": 35971, "cbid": 211, "correlation": 35971 } }, { "ph": "s", "id": 35971, "pid": 76337, "tid": -914061504, "ts": 1716454218029347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218082123, "dur": 3, "args": { "External id": 35973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35973, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35973, "pid": 5, "tid": 7, "ts": 1716454218082123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029361, "dur": 6, "args": { "External id": 35973, "cbid": 211, "correlation": 35973 } }, { "ph": "s", "id": 35973, "pid": 76337, "tid": -914061504, "ts": 1716454218029361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218082128, "dur": 55, "args": { "External id": 35998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 35998, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 35998, "pid": 5, "tid": 7, "ts": 1716454218082128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029437, "dur": 12, "args": { "External id": 35998, "cbid": 211, "correlation": 35998 } }, { "ph": "s", "id": 35998, "pid": 76337, "tid": -914061504, "ts": 1716454218029437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029536, "dur": 2, "args": { "External id": 36016, "cbid": 251, "correlation": 36016 } }, { "ph": "f", "id": 36016, "pid": 76337, "tid": -914061504, "ts": 1716454218029536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218082184, "dur": 89, "args": { "External id": 36018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36018, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36018, "pid": 5, "tid": 7, "ts": 1716454218082184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029542, "dur": 15, "args": { "External id": 36018, "cbid": 211, "correlation": 36018 } }, { "ph": "s", "id": 36018, "pid": 76337, "tid": -914061504, "ts": 1716454218029542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218082275, "dur": 9, "args": { "External id": 36026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36026, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36026, "pid": 5, "tid": 7, "ts": 1716454218082275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029613, "dur": 13, "args": { "External id": 36026, "cbid": 211, "correlation": 36026 } }, { "ph": "s", "id": 36026, "pid": 76337, "tid": -914061504, "ts": 1716454218029613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218082286, "dur": 21, "args": { "External id": 36034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36034, "pid": 5, "tid": 7, "ts": 1716454218082286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029655, "dur": 9, "args": { "External id": 36034, "cbid": 211, "correlation": 36034 } }, { "ph": "s", "id": 36034, "pid": 76337, "tid": -914061504, "ts": 1716454218029655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218082308, "dur": 17, "args": { "External id": 36056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36056, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36056, "pid": 5, "tid": 7, "ts": 1716454218082308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029707, "dur": 10, "args": { "External id": 36056, "cbid": 211, "correlation": 36056 } }, { "ph": "s", "id": 36056, "pid": 76337, "tid": -914061504, "ts": 1716454218029707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029797, "dur": 2, "args": { "External id": 36072, "cbid": 251, "correlation": 36072 } }, { "ph": "f", "id": 36072, "pid": 76337, "tid": -914061504, "ts": 1716454218029797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029803, "dur": 0, "args": { "External id": 36074, "cbid": 251, "correlation": 36074 } }, { "ph": "f", "id": 36074, "pid": 76337, "tid": -914061504, "ts": 1716454218029803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218082326, "dur": 489, "args": { "External id": 36075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36075, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36075, "pid": 5, "tid": 7, "ts": 1716454218082326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029805, "dur": 15, "args": { "External id": 36075, "cbid": 211, "correlation": 36075 } }, { "ph": "s", "id": 36075, "pid": 76337, "tid": -914061504, "ts": 1716454218029805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218082817, "dur": 64, "args": { "External id": 36083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36083, "pid": 5, "tid": 7, "ts": 1716454218082817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029873, "dur": 13, "args": { "External id": 36083, "cbid": 211, "correlation": 36083 } }, { "ph": "s", "id": 36083, "pid": 76337, "tid": -914061504, "ts": 1716454218029873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218082882, "dur": 69, "args": { "External id": 36091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36091, "pid": 5, "tid": 7, "ts": 1716454218082882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218029904, "dur": 8, "args": { "External id": 36091, "cbid": 211, "correlation": 36091 } }, { "ph": "s", "id": 36091, "pid": 76337, "tid": -914061504, "ts": 1716454218029904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218029993, "dur": 1, "args": { "External id": 36107, "cbid": 251, "correlation": 36107 } }, { "ph": "f", "id": 36107, "pid": 76337, "tid": -914061504, "ts": 1716454218029993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218082953, "dur": 1, "args": { "External id": 36109, "device": 5, "context": 1, "stream": 7, "correlation": 36109, "bytes": 240, "memory bandwidth (GB/s)": 0.12931034482758622 } }, { "ph": "f", "id": 36109, "pid": 5, "tid": 7, "ts": 1716454218082953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218029999, "dur": 11, "args": { "External id": 36109, "cbid": 51, "correlation": 36109 } }, { "ph": "s", "id": 36109, "pid": 76337, "tid": -914061504, "ts": 1716454218029999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218082957, "dur": 267, "args": { "External id": 36110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36110, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36110, "pid": 5, "tid": 7, "ts": 1716454218082957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030011, "dur": 11, "args": { "External id": 36110, "cbid": 211, "correlation": 36110 } }, { "ph": "s", "id": 36110, "pid": 76337, "tid": -914061504, "ts": 1716454218030011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218083225, "dur": 14, "args": { "External id": 36118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36118, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36118, "pid": 5, "tid": 7, "ts": 1716454218083225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030055, "dur": 10, "args": { "External id": 36118, "cbid": 211, "correlation": 36118 } }, { "ph": "s", "id": 36118, "pid": 76337, "tid": -914061504, "ts": 1716454218030055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218083240, "dur": 37, "args": { "External id": 36129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36129, "pid": 5, "tid": 7, "ts": 1716454218083240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030122, "dur": 12, "args": { "External id": 36129, "cbid": 211, "correlation": 36129 } }, { "ph": "s", "id": 36129, "pid": 76337, "tid": -914061504, "ts": 1716454218030122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218030187, "dur": 0, "args": { "External id": 36141, "cbid": 317, "correlation": 36141 } }, { "ph": "f", "id": 36141, "pid": 76337, "tid": -914061504, "ts": 1716454218030187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218030188, "dur": 0, "args": { "External id": 36142, "cbid": 203, "correlation": 36142 } }, { "ph": "f", "id": 36142, "pid": 76337, "tid": -914061504, "ts": 1716454218030188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218030189, "dur": 0, "args": { "External id": 36143, "cbid": 205, "correlation": 36143 } }, { "ph": "f", "id": 36143, "pid": 76337, "tid": -914061504, "ts": 1716454218030189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218083278, "dur": 13, "args": { "External id": 36147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36147, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36147, "pid": 5, "tid": 7, "ts": 1716454218083278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030205, "dur": 12, "args": { "External id": 36147, "cbid": 211, "correlation": 36147 } }, { "ph": "s", "id": 36147, "pid": 76337, "tid": -914061504, "ts": 1716454218030205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218083292, "dur": 4, "args": { "External id": 36149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36149, "pid": 5, "tid": 7, "ts": 1716454218083292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030222, "dur": 6, "args": { "External id": 36149, "cbid": 211, "correlation": 36149 } }, { "ph": "s", "id": 36149, "pid": 76337, "tid": -914061504, "ts": 1716454218030222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218030230, "dur": 0, "args": { "External id": 36150, "cbid": 51, "correlation": 36150 } }, { "ph": "s", "id": 36150, "pid": 76337, "tid": -914061504, "ts": 1716454218030230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218083298, "dur": 95, "args": { "External id": 36151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36151, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 36151, "pid": 5, "tid": 7, "ts": 1716454218083298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030231, "dur": 5, "args": { "External id": 36151, "cbid": 211, "correlation": 36151 } }, { "ph": "s", "id": 36151, "pid": 76337, "tid": -914061504, "ts": 1716454218030231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218083394, "dur": 16, "args": { "External id": 36156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36156, "pid": 5, "tid": 7, "ts": 1716454218083394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030258, "dur": 9, "args": { "External id": 36156, "cbid": 211, "correlation": 36156 } }, { "ph": "s", "id": 36156, "pid": 76337, "tid": -914061504, "ts": 1716454218030258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218083411, "dur": 11, "args": { "External id": 36164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36164, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36164, "pid": 5, "tid": 7, "ts": 1716454218083411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030289, "dur": 9, "args": { "External id": 36164, "cbid": 211, "correlation": 36164 } }, { "ph": "s", "id": 36164, "pid": 76337, "tid": -914061504, "ts": 1716454218030289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218083424, "dur": 18, "args": { "External id": 36184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36184, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 36184, "pid": 5, "tid": 7, "ts": 1716454218083424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030362, "dur": 12, "args": { "External id": 36184, "cbid": 211, "correlation": 36184 } }, { "ph": "s", "id": 36184, "pid": 76337, "tid": -914061504, "ts": 1716454218030362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218083443, "dur": 4, "args": { "External id": 36196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36196, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 36196, "pid": 5, "tid": 7, "ts": 1716454218083443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030383, "dur": 6, "args": { "External id": 36196, "cbid": 211, "correlation": 36196 } }, { "ph": "s", "id": 36196, "pid": 76337, "tid": -914061504, "ts": 1716454218030383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218083449, "dur": 18, "args": { "External id": 36199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36199, "pid": 5, "tid": 7, "ts": 1716454218083449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030401, "dur": 7, "args": { "External id": 36199, "cbid": 211, "correlation": 36199 } }, { "ph": "s", "id": 36199, "pid": 76337, "tid": -914061504, "ts": 1716454218030401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218083468, "dur": 12, "args": { "External id": 36208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36208, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36208, "pid": 5, "tid": 7, "ts": 1716454218083468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030440, "dur": 10, "args": { "External id": 36208, "cbid": 211, "correlation": 36208 } }, { "ph": "s", "id": 36208, "pid": 76337, "tid": -914061504, "ts": 1716454218030440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218030491, "dur": 0, "args": { "External id": 36218, "cbid": 317, "correlation": 36218 } }, { "ph": "f", "id": 36218, "pid": 76337, "tid": -914061504, "ts": 1716454218030491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218030492, "dur": 0, "args": { "External id": 36219, "cbid": 203, "correlation": 36219 } }, { "ph": "f", "id": 36219, "pid": 76337, "tid": -914061504, "ts": 1716454218030492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218030493, "dur": 0, "args": { "External id": 36220, "cbid": 205, "correlation": 36220 } }, { "ph": "f", "id": 36220, "pid": 76337, "tid": -914061504, "ts": 1716454218030493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218083481, "dur": 11, "args": { "External id": 36224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36224, "pid": 5, "tid": 7, "ts": 1716454218083481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030506, "dur": 12, "args": { "External id": 36224, "cbid": 211, "correlation": 36224 } }, { "ph": "s", "id": 36224, "pid": 76337, "tid": -914061504, "ts": 1716454218030506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218083493, "dur": 159, "args": { "External id": 36226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36226, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36226, "pid": 5, "tid": 7, "ts": 1716454218083493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030521, "dur": 5, "args": { "External id": 36226, "cbid": 211, "correlation": 36226 } }, { "ph": "s", "id": 36226, "pid": 76337, "tid": -914061504, "ts": 1716454218030521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218083654, "dur": 1, "args": { "External id": 36228, "device": 5, "context": 1, "stream": 7, "correlation": 36228, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 36228, "pid": 5, "tid": 7, "ts": 1716454218083654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218030532, "dur": 7, "args": { "External id": 36228, "cbid": 51, "correlation": 36228 } }, { "ph": "s", "id": 36228, "pid": 76337, "tid": -914061504, "ts": 1716454218030532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218083658, "dur": 653, "args": { "External id": 36229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36229, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36229, "pid": 5, "tid": 7, "ts": 1716454218083658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030540, "dur": 6, "args": { "External id": 36229, "cbid": 211, "correlation": 36229 } }, { "ph": "s", "id": 36229, "pid": 76337, "tid": -914061504, "ts": 1716454218030540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218084312, "dur": 14, "args": { "External id": 36231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36231, "pid": 5, "tid": 7, "ts": 1716454218084312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030551, "dur": 5, "args": { "External id": 36231, "cbid": 211, "correlation": 36231 } }, { "ph": "s", "id": 36231, "pid": 76337, "tid": -914061504, "ts": 1716454218030551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218084327, "dur": 14, "args": { "External id": 36237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36237, "pid": 5, "tid": 7, "ts": 1716454218084327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030580, "dur": 8, "args": { "External id": 36237, "cbid": 211, "correlation": 36237 } }, { "ph": "s", "id": 36237, "pid": 76337, "tid": -914061504, "ts": 1716454218030580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218084342, "dur": 3, "args": { "External id": 36245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36245, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 36245, "pid": 5, "tid": 7, "ts": 1716454218084342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030623, "dur": 9, "args": { "External id": 36245, "cbid": 211, "correlation": 36245 } }, { "ph": "s", "id": 36245, "pid": 76337, "tid": -914061504, "ts": 1716454218030623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218030688, "dur": 1, "args": { "External id": 36261, "cbid": 251, "correlation": 36261 } }, { "ph": "f", "id": 36261, "pid": 76337, "tid": -914061504, "ts": 1716454218030688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218030692, "dur": 0, "args": { "External id": 36263, "cbid": 251, "correlation": 36263 } }, { "ph": "f", "id": 36263, "pid": 76337, "tid": -914061504, "ts": 1716454218030692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218084347, "dur": 13, "args": { "External id": 36264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36264, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36264, "pid": 5, "tid": 7, "ts": 1716454218084347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030694, "dur": 11, "args": { "External id": 36264, "cbid": 211, "correlation": 36264 } }, { "ph": "s", "id": 36264, "pid": 76337, "tid": -914061504, "ts": 1716454218030694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218084361, "dur": 5, "args": { "External id": 36266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36266, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36266, "pid": 5, "tid": 7, "ts": 1716454218084361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030707, "dur": 5, "args": { "External id": 36266, "cbid": 211, "correlation": 36266 } }, { "ph": "s", "id": 36266, "pid": 76337, "tid": -914061504, "ts": 1716454218030707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218084368, "dur": 16, "args": { "External id": 36276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36276, "pid": 5, "tid": 7, "ts": 1716454218084368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030764, "dur": 13, "args": { "External id": 36276, "cbid": 211, "correlation": 36276 } }, { "ph": "s", "id": 36276, "pid": 76337, "tid": -914061504, "ts": 1716454218030764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218084385, "dur": 17, "args": { "External id": 36296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36296, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 36296, "pid": 5, "tid": 7, "ts": 1716454218084385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030831, "dur": 11, "args": { "External id": 36296, "cbid": 211, "correlation": 36296 } }, { "ph": "s", "id": 36296, "pid": 76337, "tid": -914061504, "ts": 1716454218030831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218084404, "dur": 4, "args": { "External id": 36308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36308, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 36308, "pid": 5, "tid": 7, "ts": 1716454218084404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030852, "dur": 6, "args": { "External id": 36308, "cbid": 211, "correlation": 36308 } }, { "ph": "s", "id": 36308, "pid": 76337, "tid": -914061504, "ts": 1716454218030852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218084409, "dur": 16, "args": { "External id": 36311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36311, "pid": 5, "tid": 7, "ts": 1716454218084409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030870, "dur": 7, "args": { "External id": 36311, "cbid": 211, "correlation": 36311 } }, { "ph": "s", "id": 36311, "pid": 76337, "tid": -914061504, "ts": 1716454218030870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218084426, "dur": 10, "args": { "External id": 36320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36320, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36320, "pid": 5, "tid": 7, "ts": 1716454218084426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030910, "dur": 9, "args": { "External id": 36320, "cbid": 211, "correlation": 36320 } }, { "ph": "s", "id": 36320, "pid": 76337, "tid": -914061504, "ts": 1716454218030910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218030972, "dur": 0, "args": { "External id": 36330, "cbid": 317, "correlation": 36330 } }, { "ph": "f", "id": 36330, "pid": 76337, "tid": -914061504, "ts": 1716454218030972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218030972, "dur": 10, "args": { "External id": 36331, "cbid": 203, "correlation": 36331 } }, { "ph": "f", "id": 36331, "pid": 76337, "tid": -914061504, "ts": 1716454218030972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218030984, "dur": 0, "args": { "External id": 36332, "cbid": 205, "correlation": 36332 } }, { "ph": "f", "id": 36332, "pid": 76337, "tid": -914061504, "ts": 1716454218030984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218084438, "dur": 11, "args": { "External id": 36336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36336, "pid": 5, "tid": 7, "ts": 1716454218084438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218030998, "dur": 13, "args": { "External id": 36336, "cbid": 211, "correlation": 36336 } }, { "ph": "s", "id": 36336, "pid": 76337, "tid": -914061504, "ts": 1716454218030998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218084450, "dur": 158, "args": { "External id": 36338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36338, "pid": 5, "tid": 7, "ts": 1716454218084450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031013, "dur": 5, "args": { "External id": 36338, "cbid": 211, "correlation": 36338 } }, { "ph": "s", "id": 36338, "pid": 76337, "tid": -914061504, "ts": 1716454218031013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218084611, "dur": 1, "args": { "External id": 36340, "device": 5, "context": 1, "stream": 7, "correlation": 36340, "bytes": 960, "memory bandwidth (GB/s)": 0.600375234521576 } }, { "ph": "f", "id": 36340, "pid": 5, "tid": 7, "ts": 1716454218084611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218031024, "dur": 7, "args": { "External id": 36340, "cbid": 51, "correlation": 36340 } }, { "ph": "s", "id": 36340, "pid": 76337, "tid": -914061504, "ts": 1716454218031024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218084614, "dur": 637, "args": { "External id": 36341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36341, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36341, "pid": 5, "tid": 7, "ts": 1716454218084614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031032, "dur": 6, "args": { "External id": 36341, "cbid": 211, "correlation": 36341 } }, { "ph": "s", "id": 36341, "pid": 76337, "tid": -914061504, "ts": 1716454218031032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218085252, "dur": 12, "args": { "External id": 36343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36343, "pid": 5, "tid": 7, "ts": 1716454218085252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031042, "dur": 5, "args": { "External id": 36343, "cbid": 211, "correlation": 36343 } }, { "ph": "s", "id": 36343, "pid": 76337, "tid": -914061504, "ts": 1716454218031042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218085265, "dur": 14, "args": { "External id": 36349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36349, "pid": 5, "tid": 7, "ts": 1716454218085265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031070, "dur": 8, "args": { "External id": 36349, "cbid": 211, "correlation": 36349 } }, { "ph": "s", "id": 36349, "pid": 76337, "tid": -914061504, "ts": 1716454218031070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218085281, "dur": 11, "args": { "External id": 36357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36357, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36357, "pid": 5, "tid": 7, "ts": 1716454218085281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031103, "dur": 8, "args": { "External id": 36357, "cbid": 211, "correlation": 36357 } }, { "ph": "s", "id": 36357, "pid": 76337, "tid": -914061504, "ts": 1716454218031103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218085294, "dur": 10, "args": { "External id": 36365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36365, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36365, "pid": 5, "tid": 7, "ts": 1716454218085294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031131, "dur": 9, "args": { "External id": 36365, "cbid": 211, "correlation": 36365 } }, { "ph": "s", "id": 36365, "pid": 76337, "tid": -914061504, "ts": 1716454218031131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218085305, "dur": 17, "args": { "External id": 36385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36385, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 36385, "pid": 5, "tid": 7, "ts": 1716454218085305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031209, "dur": 12, "args": { "External id": 36385, "cbid": 211, "correlation": 36385 } }, { "ph": "s", "id": 36385, "pid": 76337, "tid": -914061504, "ts": 1716454218031209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218085323, "dur": 4, "args": { "External id": 36397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36397, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 36397, "pid": 5, "tid": 7, "ts": 1716454218085323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031231, "dur": 6, "args": { "External id": 36397, "cbid": 211, "correlation": 36397 } }, { "ph": "s", "id": 36397, "pid": 76337, "tid": -914061504, "ts": 1716454218031231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218085329, "dur": 16, "args": { "External id": 36400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36400, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36400, "pid": 5, "tid": 7, "ts": 1716454218085329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031250, "dur": 6, "args": { "External id": 36400, "cbid": 211, "correlation": 36400 } }, { "ph": "s", "id": 36400, "pid": 76337, "tid": -914061504, "ts": 1716454218031250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218031307, "dur": 0, "args": { "External id": 36411, "cbid": 317, "correlation": 36411 } }, { "ph": "f", "id": 36411, "pid": 76337, "tid": -914061504, "ts": 1716454218031307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218031307, "dur": 0, "args": { "External id": 36412, "cbid": 203, "correlation": 36412 } }, { "ph": "f", "id": 36412, "pid": 76337, "tid": -914061504, "ts": 1716454218031307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218031308, "dur": 0, "args": { "External id": 36413, "cbid": 205, "correlation": 36413 } }, { "ph": "f", "id": 36413, "pid": 76337, "tid": -914061504, "ts": 1716454218031308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218085346, "dur": 11, "args": { "External id": 36417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36417, "pid": 5, "tid": 7, "ts": 1716454218085346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031321, "dur": 12, "args": { "External id": 36417, "cbid": 211, "correlation": 36417 } }, { "ph": "s", "id": 36417, "pid": 76337, "tid": -914061504, "ts": 1716454218031321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218085358, "dur": 4, "args": { "External id": 36419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36419, "pid": 5, "tid": 7, "ts": 1716454218085358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031337, "dur": 5, "args": { "External id": 36419, "cbid": 211, "correlation": 36419 } }, { "ph": "s", "id": 36419, "pid": 76337, "tid": -914061504, "ts": 1716454218031337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218031344, "dur": 0, "args": { "External id": 36420, "cbid": 51, "correlation": 36420 } }, { "ph": "s", "id": 36420, "pid": 76337, "tid": -914061504, "ts": 1716454218031344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218085363, "dur": 93, "args": { "External id": 36421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36421, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 36421, "pid": 5, "tid": 7, "ts": 1716454218085363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031345, "dur": 5, "args": { "External id": 36421, "cbid": 211, "correlation": 36421 } }, { "ph": "s", "id": 36421, "pid": 76337, "tid": -914061504, "ts": 1716454218031345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218085458, "dur": 15, "args": { "External id": 36426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36426, "pid": 5, "tid": 7, "ts": 1716454218085458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031373, "dur": 8, "args": { "External id": 36426, "cbid": 211, "correlation": 36426 } }, { "ph": "s", "id": 36426, "pid": 76337, "tid": -914061504, "ts": 1716454218031373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218085475, "dur": 82, "args": { "External id": 36435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36435, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36435, "pid": 5, "tid": 7, "ts": 1716454218085475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031453, "dur": 15, "args": { "External id": 36435, "cbid": 211, "correlation": 36435 } }, { "ph": "s", "id": 36435, "pid": 76337, "tid": -914061504, "ts": 1716454218031453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218085558, "dur": 30, "args": { "External id": 36457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36457, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36457, "pid": 5, "tid": 7, "ts": 1716454218085558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031511, "dur": 10, "args": { "External id": 36457, "cbid": 211, "correlation": 36457 } }, { "ph": "s", "id": 36457, "pid": 76337, "tid": -914061504, "ts": 1716454218031511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218031599, "dur": 1, "args": { "External id": 36468, "cbid": 251, "correlation": 36468 } }, { "ph": "f", "id": 36468, "pid": 76337, "tid": -914061504, "ts": 1716454218031599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218085589, "dur": 159, "args": { "External id": 36469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36469, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36469, "pid": 5, "tid": 7, "ts": 1716454218085589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031605, "dur": 13, "args": { "External id": 36469, "cbid": 211, "correlation": 36469 } }, { "ph": "s", "id": 36469, "pid": 76337, "tid": -914061504, "ts": 1716454218031605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218031674, "dur": 1, "args": { "External id": 36480, "cbid": 251, "correlation": 36480 } }, { "ph": "f", "id": 36480, "pid": 76337, "tid": -914061504, "ts": 1716454218031674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218085749, "dur": 156, "args": { "External id": 36481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36481, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36481, "pid": 5, "tid": 7, "ts": 1716454218085749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031677, "dur": 12, "args": { "External id": 36481, "cbid": 211, "correlation": 36481 } }, { "ph": "s", "id": 36481, "pid": 76337, "tid": -914061504, "ts": 1716454218031677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218031743, "dur": 1, "args": { "External id": 36492, "cbid": 251, "correlation": 36492 } }, { "ph": "f", "id": 36492, "pid": 76337, "tid": -914061504, "ts": 1716454218031743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218085906, "dur": 156, "args": { "External id": 36493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36493, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36493, "pid": 5, "tid": 7, "ts": 1716454218085906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031746, "dur": 11, "args": { "External id": 36493, "cbid": 211, "correlation": 36493 } }, { "ph": "s", "id": 36493, "pid": 76337, "tid": -914061504, "ts": 1716454218031746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218086063, "dur": 331, "args": { "External id": 36518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36518, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36518, "pid": 5, "tid": 7, "ts": 1716454218086063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031827, "dur": 13, "args": { "External id": 36518, "cbid": 211, "correlation": 36518 } }, { "ph": "s", "id": 36518, "pid": 76337, "tid": -914061504, "ts": 1716454218031827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218031926, "dur": 1, "args": { "External id": 36536, "cbid": 251, "correlation": 36536 } }, { "ph": "f", "id": 36536, "pid": 76337, "tid": -914061504, "ts": 1716454218031926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218086396, "dur": 163, "args": { "External id": 36538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36538, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36538, "pid": 5, "tid": 7, "ts": 1716454218086396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218031932, "dur": 13, "args": { "External id": 36538, "cbid": 211, "correlation": 36538 } }, { "ph": "s", "id": 36538, "pid": 76337, "tid": -914061504, "ts": 1716454218031932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218086560, "dur": 19, "args": { "External id": 36546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36546, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36546, "pid": 5, "tid": 7, "ts": 1716454218086560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032010, "dur": 12, "args": { "External id": 36546, "cbid": 211, "correlation": 36546 } }, { "ph": "s", "id": 36546, "pid": 76337, "tid": -914061504, "ts": 1716454218032010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218086581, "dur": 27, "args": { "External id": 36554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36554, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36554, "pid": 5, "tid": 7, "ts": 1716454218086581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032049, "dur": 9, "args": { "External id": 36554, "cbid": 211, "correlation": 36554 } }, { "ph": "s", "id": 36554, "pid": 76337, "tid": -914061504, "ts": 1716454218032049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218086610, "dur": 18, "args": { "External id": 36565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36565, "pid": 5, "tid": 7, "ts": 1716454218086610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032119, "dur": 12, "args": { "External id": 36565, "cbid": 211, "correlation": 36565 } }, { "ph": "s", "id": 36565, "pid": 76337, "tid": -914061504, "ts": 1716454218032119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218086629, "dur": 15, "args": { "External id": 36587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36587, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36587, "pid": 5, "tid": 7, "ts": 1716454218086629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032150, "dur": 7, "args": { "External id": 36587, "cbid": 211, "correlation": 36587 } }, { "ph": "s", "id": 36587, "pid": 76337, "tid": -914061504, "ts": 1716454218032150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032235, "dur": 1, "args": { "External id": 36598, "cbid": 251, "correlation": 36598 } }, { "ph": "f", "id": 36598, "pid": 76337, "tid": -914061504, "ts": 1716454218032235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218086645, "dur": 86, "args": { "External id": 36599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36599, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36599, "pid": 5, "tid": 7, "ts": 1716454218086645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032240, "dur": 13, "args": { "External id": 36599, "cbid": 211, "correlation": 36599 } }, { "ph": "s", "id": 36599, "pid": 76337, "tid": -914061504, "ts": 1716454218032240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032309, "dur": 1, "args": { "External id": 36610, "cbid": 251, "correlation": 36610 } }, { "ph": "f", "id": 36610, "pid": 76337, "tid": -914061504, "ts": 1716454218032309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032312, "dur": 0, "args": { "External id": 36611, "cbid": 251, "correlation": 36611 } }, { "ph": "f", "id": 36611, "pid": 76337, "tid": -914061504, "ts": 1716454218032312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218086733, "dur": 12, "args": { "External id": 36612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36612, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36612, "pid": 5, "tid": 7, "ts": 1716454218086733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032314, "dur": 12, "args": { "External id": 36612, "cbid": 211, "correlation": 36612 } }, { "ph": "s", "id": 36612, "pid": 76337, "tid": -914061504, "ts": 1716454218032314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218086746, "dur": 5, "args": { "External id": 36614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36614, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36614, "pid": 5, "tid": 7, "ts": 1716454218086746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032328, "dur": 6, "args": { "External id": 36614, "cbid": 211, "correlation": 36614 } }, { "ph": "s", "id": 36614, "pid": 76337, "tid": -914061504, "ts": 1716454218032328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032386, "dur": 1, "args": { "External id": 36625, "cbid": 251, "correlation": 36625 } }, { "ph": "f", "id": 36625, "pid": 76337, "tid": -914061504, "ts": 1716454218032386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032389, "dur": 0, "args": { "External id": 36626, "cbid": 251, "correlation": 36626 } }, { "ph": "f", "id": 36626, "pid": 76337, "tid": -914061504, "ts": 1716454218032389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218086752, "dur": 8, "args": { "External id": 36627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36627, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36627, "pid": 5, "tid": 7, "ts": 1716454218086752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032390, "dur": 11, "args": { "External id": 36627, "cbid": 211, "correlation": 36627 } }, { "ph": "s", "id": 36627, "pid": 76337, "tid": -914061504, "ts": 1716454218032390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218086762, "dur": 3, "args": { "External id": 36629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36629, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36629, "pid": 5, "tid": 7, "ts": 1716454218086762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032403, "dur": 5, "args": { "External id": 36629, "cbid": 211, "correlation": 36629 } }, { "ph": "s", "id": 36629, "pid": 76337, "tid": -914061504, "ts": 1716454218032403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218086766, "dur": 54, "args": { "External id": 36654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36654, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36654, "pid": 5, "tid": 7, "ts": 1716454218086766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032480, "dur": 13, "args": { "External id": 36654, "cbid": 211, "correlation": 36654 } }, { "ph": "s", "id": 36654, "pid": 76337, "tid": -914061504, "ts": 1716454218032480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032579, "dur": 1, "args": { "External id": 36672, "cbid": 251, "correlation": 36672 } }, { "ph": "f", "id": 36672, "pid": 76337, "tid": -914061504, "ts": 1716454218032579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218086821, "dur": 89, "args": { "External id": 36674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36674, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36674, "pid": 5, "tid": 7, "ts": 1716454218086821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032585, "dur": 14, "args": { "External id": 36674, "cbid": 211, "correlation": 36674 } }, { "ph": "s", "id": 36674, "pid": 76337, "tid": -914061504, "ts": 1716454218032585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218086911, "dur": 10, "args": { "External id": 36682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36682, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36682, "pid": 5, "tid": 7, "ts": 1716454218086911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032656, "dur": 12, "args": { "External id": 36682, "cbid": 211, "correlation": 36682 } }, { "ph": "s", "id": 36682, "pid": 76337, "tid": -914061504, "ts": 1716454218032656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218086922, "dur": 21, "args": { "External id": 36690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36690, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36690, "pid": 5, "tid": 7, "ts": 1716454218086922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032696, "dur": 10, "args": { "External id": 36690, "cbid": 211, "correlation": 36690 } }, { "ph": "s", "id": 36690, "pid": 76337, "tid": -914061504, "ts": 1716454218032696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218086944, "dur": 17, "args": { "External id": 36712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36712, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36712, "pid": 5, "tid": 7, "ts": 1716454218086944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032747, "dur": 10, "args": { "External id": 36712, "cbid": 211, "correlation": 36712 } }, { "ph": "s", "id": 36712, "pid": 76337, "tid": -914061504, "ts": 1716454218032747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032835, "dur": 1, "args": { "External id": 36728, "cbid": 251, "correlation": 36728 } }, { "ph": "f", "id": 36728, "pid": 76337, "tid": -914061504, "ts": 1716454218032835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218032840, "dur": 0, "args": { "External id": 36730, "cbid": 251, "correlation": 36730 } }, { "ph": "f", "id": 36730, "pid": 76337, "tid": -914061504, "ts": 1716454218032840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218086963, "dur": 488, "args": { "External id": 36731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36731, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36731, "pid": 5, "tid": 7, "ts": 1716454218086963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032841, "dur": 13, "args": { "External id": 36731, "cbid": 211, "correlation": 36731 } }, { "ph": "s", "id": 36731, "pid": 76337, "tid": -914061504, "ts": 1716454218032841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218087452, "dur": 64, "args": { "External id": 36739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36739, "pid": 5, "tid": 7, "ts": 1716454218087452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032906, "dur": 13, "args": { "External id": 36739, "cbid": 211, "correlation": 36739 } }, { "ph": "s", "id": 36739, "pid": 76337, "tid": -914061504, "ts": 1716454218032906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218087517, "dur": 67, "args": { "External id": 36747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36747, "pid": 5, "tid": 7, "ts": 1716454218087517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218032936, "dur": 8, "args": { "External id": 36747, "cbid": 211, "correlation": 36747 } }, { "ph": "s", "id": 36747, "pid": 76337, "tid": -914061504, "ts": 1716454218032936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218033023, "dur": 1, "args": { "External id": 36763, "cbid": 251, "correlation": 36763 } }, { "ph": "f", "id": 36763, "pid": 76337, "tid": -914061504, "ts": 1716454218033023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218087586, "dur": 1, "args": { "External id": 36765, "device": 5, "context": 1, "stream": 7, "correlation": 36765, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 36765, "pid": 5, "tid": 7, "ts": 1716454218087586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218033029, "dur": 10, "args": { "External id": 36765, "cbid": 51, "correlation": 36765 } }, { "ph": "s", "id": 36765, "pid": 76337, "tid": -914061504, "ts": 1716454218033029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218087590, "dur": 267, "args": { "External id": 36766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36766, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36766, "pid": 5, "tid": 7, "ts": 1716454218087590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033040, "dur": 11, "args": { "External id": 36766, "cbid": 211, "correlation": 36766 } }, { "ph": "s", "id": 36766, "pid": 76337, "tid": -914061504, "ts": 1716454218033040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218087858, "dur": 14, "args": { "External id": 36774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36774, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36774, "pid": 5, "tid": 7, "ts": 1716454218087858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033083, "dur": 10, "args": { "External id": 36774, "cbid": 211, "correlation": 36774 } }, { "ph": "s", "id": 36774, "pid": 76337, "tid": -914061504, "ts": 1716454218033083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218087873, "dur": 38, "args": { "External id": 36785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36785, "pid": 5, "tid": 7, "ts": 1716454218087873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033151, "dur": 12, "args": { "External id": 36785, "cbid": 211, "correlation": 36785 } }, { "ph": "s", "id": 36785, "pid": 76337, "tid": -914061504, "ts": 1716454218033151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218033215, "dur": 0, "args": { "External id": 36797, "cbid": 317, "correlation": 36797 } }, { "ph": "f", "id": 36797, "pid": 76337, "tid": -914061504, "ts": 1716454218033215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218033216, "dur": 0, "args": { "External id": 36798, "cbid": 203, "correlation": 36798 } }, { "ph": "f", "id": 36798, "pid": 76337, "tid": -914061504, "ts": 1716454218033216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218033217, "dur": 0, "args": { "External id": 36799, "cbid": 205, "correlation": 36799 } }, { "ph": "f", "id": 36799, "pid": 76337, "tid": -914061504, "ts": 1716454218033217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218087912, "dur": 13, "args": { "External id": 36803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36803, "pid": 5, "tid": 7, "ts": 1716454218087912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033231, "dur": 12, "args": { "External id": 36803, "cbid": 211, "correlation": 36803 } }, { "ph": "s", "id": 36803, "pid": 76337, "tid": -914061504, "ts": 1716454218033231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218087927, "dur": 4, "args": { "External id": 36805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 36805, "pid": 5, "tid": 7, "ts": 1716454218087927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033248, "dur": 6, "args": { "External id": 36805, "cbid": 211, "correlation": 36805 } }, { "ph": "s", "id": 36805, "pid": 76337, "tid": -914061504, "ts": 1716454218033248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218033258, "dur": 0, "args": { "External id": 36806, "cbid": 51, "correlation": 36806 } }, { "ph": "s", "id": 36806, "pid": 76337, "tid": -914061504, "ts": 1716454218033258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218087932, "dur": 94, "args": { "External id": 36807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36807, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 36807, "pid": 5, "tid": 7, "ts": 1716454218087932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033258, "dur": 5, "args": { "External id": 36807, "cbid": 211, "correlation": 36807 } }, { "ph": "s", "id": 36807, "pid": 76337, "tid": -914061504, "ts": 1716454218033258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088027, "dur": 16, "args": { "External id": 36812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36812, "pid": 5, "tid": 7, "ts": 1716454218088027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033285, "dur": 9, "args": { "External id": 36812, "cbid": 211, "correlation": 36812 } }, { "ph": "s", "id": 36812, "pid": 76337, "tid": -914061504, "ts": 1716454218033285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218088044, "dur": 11, "args": { "External id": 36820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36820, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36820, "pid": 5, "tid": 7, "ts": 1716454218088044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033316, "dur": 8, "args": { "External id": 36820, "cbid": 211, "correlation": 36820 } }, { "ph": "s", "id": 36820, "pid": 76337, "tid": -914061504, "ts": 1716454218033316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218033386, "dur": 0, "args": { "External id": 36830, "cbid": 317, "correlation": 36830 } }, { "ph": "f", "id": 36830, "pid": 76337, "tid": -914061504, "ts": 1716454218033386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218033387, "dur": 0, "args": { "External id": 36831, "cbid": 203, "correlation": 36831 } }, { "ph": "f", "id": 36831, "pid": 76337, "tid": -914061504, "ts": 1716454218033387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218033388, "dur": 0, "args": { "External id": 36832, "cbid": 205, "correlation": 36832 } }, { "ph": "f", "id": 36832, "pid": 76337, "tid": -914061504, "ts": 1716454218033388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088057, "dur": 11, "args": { "External id": 36836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36836, "pid": 5, "tid": 7, "ts": 1716454218088057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033403, "dur": 12, "args": { "External id": 36836, "cbid": 211, "correlation": 36836 } }, { "ph": "s", "id": 36836, "pid": 76337, "tid": -914061504, "ts": 1716454218033403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088068, "dur": 161, "args": { "External id": 36838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36838, "pid": 5, "tid": 7, "ts": 1716454218088068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033418, "dur": 5, "args": { "External id": 36838, "cbid": 211, "correlation": 36838 } }, { "ph": "s", "id": 36838, "pid": 76337, "tid": -914061504, "ts": 1716454218033418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218088231, "dur": 1, "args": { "External id": 36840, "device": 5, "context": 1, "stream": 7, "correlation": 36840, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 36840, "pid": 5, "tid": 7, "ts": 1716454218088231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218033429, "dur": 6, "args": { "External id": 36840, "cbid": 51, "correlation": 36840 } }, { "ph": "s", "id": 36840, "pid": 76337, "tid": -914061504, "ts": 1716454218033429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218088235, "dur": 196, "args": { "External id": 36841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36841, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 36841, "pid": 5, "tid": 7, "ts": 1716454218088235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033437, "dur": 8, "args": { "External id": 36841, "cbid": 211, "correlation": 36841 } }, { "ph": "s", "id": 36841, "pid": 76337, "tid": -914061504, "ts": 1716454218033437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088432, "dur": 6, "args": { "External id": 36843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36843, "pid": 5, "tid": 7, "ts": 1716454218088432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033449, "dur": 5, "args": { "External id": 36843, "cbid": 211, "correlation": 36843 } }, { "ph": "s", "id": 36843, "pid": 76337, "tid": -914061504, "ts": 1716454218033449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088439, "dur": 6, "args": { "External id": 36849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36849, "pid": 5, "tid": 7, "ts": 1716454218088439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033477, "dur": 8, "args": { "External id": 36849, "cbid": 211, "correlation": 36849 } }, { "ph": "s", "id": 36849, "pid": 76337, "tid": -914061504, "ts": 1716454218033477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218088446, "dur": 11, "args": { "External id": 36869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36869, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 36869, "pid": 5, "tid": 7, "ts": 1716454218088446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033568, "dur": 12, "args": { "External id": 36869, "cbid": 211, "correlation": 36869 } }, { "ph": "s", "id": 36869, "pid": 76337, "tid": -914061504, "ts": 1716454218033568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218088458, "dur": 4, "args": { "External id": 36881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36881, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 36881, "pid": 5, "tid": 7, "ts": 1716454218088458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033590, "dur": 7, "args": { "External id": 36881, "cbid": 211, "correlation": 36881 } }, { "ph": "s", "id": 36881, "pid": 76337, "tid": -914061504, "ts": 1716454218033590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088464, "dur": 9, "args": { "External id": 36884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36884, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36884, "pid": 5, "tid": 7, "ts": 1716454218088464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033609, "dur": 7, "args": { "External id": 36884, "cbid": 211, "correlation": 36884 } }, { "ph": "s", "id": 36884, "pid": 76337, "tid": -914061504, "ts": 1716454218033609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218088474, "dur": 5, "args": { "External id": 36893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36893, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36893, "pid": 5, "tid": 7, "ts": 1716454218088474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033649, "dur": 10, "args": { "External id": 36893, "cbid": 211, "correlation": 36893 } }, { "ph": "s", "id": 36893, "pid": 76337, "tid": -914061504, "ts": 1716454218033649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218033701, "dur": 0, "args": { "External id": 36903, "cbid": 317, "correlation": 36903 } }, { "ph": "f", "id": 36903, "pid": 76337, "tid": -914061504, "ts": 1716454218033701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218033702, "dur": 0, "args": { "External id": 36904, "cbid": 203, "correlation": 36904 } }, { "ph": "f", "id": 36904, "pid": 76337, "tid": -914061504, "ts": 1716454218033702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218033702, "dur": 0, "args": { "External id": 36905, "cbid": 205, "correlation": 36905 } }, { "ph": "f", "id": 36905, "pid": 76337, "tid": -914061504, "ts": 1716454218033702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088480, "dur": 5, "args": { "External id": 36909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36909, "pid": 5, "tid": 7, "ts": 1716454218088480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033718, "dur": 11, "args": { "External id": 36909, "cbid": 211, "correlation": 36909 } }, { "ph": "s", "id": 36909, "pid": 76337, "tid": -914061504, "ts": 1716454218033718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088486, "dur": 160, "args": { "External id": 36911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36911, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36911, "pid": 5, "tid": 7, "ts": 1716454218088486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033732, "dur": 5, "args": { "External id": 36911, "cbid": 211, "correlation": 36911 } }, { "ph": "s", "id": 36911, "pid": 76337, "tid": -914061504, "ts": 1716454218033732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218088648, "dur": 1, "args": { "External id": 36913, "device": 5, "context": 1, "stream": 7, "correlation": 36913, "bytes": 240, "memory bandwidth (GB/s)": 0.1271186440677966 } }, { "ph": "f", "id": 36913, "pid": 5, "tid": 7, "ts": 1716454218088648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218033742, "dur": 6, "args": { "External id": 36913, "cbid": 51, "correlation": 36913 } }, { "ph": "s", "id": 36913, "pid": 76337, "tid": -914061504, "ts": 1716454218033742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218088652, "dur": 263, "args": { "External id": 36914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36914, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36914, "pid": 5, "tid": 7, "ts": 1716454218088652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033750, "dur": 6, "args": { "External id": 36914, "cbid": 211, "correlation": 36914 } }, { "ph": "s", "id": 36914, "pid": 76337, "tid": -914061504, "ts": 1716454218033750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088917, "dur": 6, "args": { "External id": 36916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36916, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 36916, "pid": 5, "tid": 7, "ts": 1716454218088917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033760, "dur": 6, "args": { "External id": 36916, "cbid": 211, "correlation": 36916 } }, { "ph": "s", "id": 36916, "pid": 76337, "tid": -914061504, "ts": 1716454218033760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088924, "dur": 6, "args": { "External id": 36922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36922, "pid": 5, "tid": 7, "ts": 1716454218088924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033789, "dur": 8, "args": { "External id": 36922, "cbid": 211, "correlation": 36922 } }, { "ph": "s", "id": 36922, "pid": 76337, "tid": -914061504, "ts": 1716454218033789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218088932, "dur": 3, "args": { "External id": 36930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36930, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 36930, "pid": 5, "tid": 7, "ts": 1716454218088932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033832, "dur": 9, "args": { "External id": 36930, "cbid": 211, "correlation": 36930 } }, { "ph": "s", "id": 36930, "pid": 76337, "tid": -914061504, "ts": 1716454218033832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218033896, "dur": 1, "args": { "External id": 36946, "cbid": 251, "correlation": 36946 } }, { "ph": "f", "id": 36946, "pid": 76337, "tid": -914061504, "ts": 1716454218033896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218033902, "dur": 0, "args": { "External id": 36948, "cbid": 251, "correlation": 36948 } }, { "ph": "f", "id": 36948, "pid": 76337, "tid": -914061504, "ts": 1716454218033902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218088936, "dur": 13, "args": { "External id": 36949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36949, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36949, "pid": 5, "tid": 7, "ts": 1716454218088936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033903, "dur": 11, "args": { "External id": 36949, "cbid": 211, "correlation": 36949 } }, { "ph": "s", "id": 36949, "pid": 76337, "tid": -914061504, "ts": 1716454218033903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218088950, "dur": 5, "args": { "External id": 36951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36951, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 36951, "pid": 5, "tid": 7, "ts": 1716454218088950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033916, "dur": 6, "args": { "External id": 36951, "cbid": 211, "correlation": 36951 } }, { "ph": "s", "id": 36951, "pid": 76337, "tid": -914061504, "ts": 1716454218033916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088956, "dur": 5, "args": { "External id": 36961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36961, "pid": 5, "tid": 7, "ts": 1716454218088956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218033981, "dur": 12, "args": { "External id": 36961, "cbid": 211, "correlation": 36961 } }, { "ph": "s", "id": 36961, "pid": 76337, "tid": -914061504, "ts": 1716454218033981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218088963, "dur": 10, "args": { "External id": 36981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 36981, "pid": 5, "tid": 7, "ts": 1716454218088963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034049, "dur": 11, "args": { "External id": 36981, "cbid": 211, "correlation": 36981 } }, { "ph": "s", "id": 36981, "pid": 76337, "tid": -914061504, "ts": 1716454218034049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218088974, "dur": 4, "args": { "External id": 36993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 36993, "pid": 5, "tid": 7, "ts": 1716454218088974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034070, "dur": 6, "args": { "External id": 36993, "cbid": 211, "correlation": 36993 } }, { "ph": "s", "id": 36993, "pid": 76337, "tid": -914061504, "ts": 1716454218034070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218088979, "dur": 7, "args": { "External id": 36996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 36996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 36996, "pid": 5, "tid": 7, "ts": 1716454218088979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034089, "dur": 6, "args": { "External id": 36996, "cbid": 211, "correlation": 36996 } }, { "ph": "s", "id": 36996, "pid": 76337, "tid": -914061504, "ts": 1716454218034089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218088987, "dur": 4, "args": { "External id": 37005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37005, "pid": 5, "tid": 7, "ts": 1716454218088987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034129, "dur": 10, "args": { "External id": 37005, "cbid": 211, "correlation": 37005 } }, { "ph": "s", "id": 37005, "pid": 76337, "tid": -914061504, "ts": 1716454218034129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218034192, "dur": 0, "args": { "External id": 37015, "cbid": 317, "correlation": 37015 } }, { "ph": "f", "id": 37015, "pid": 76337, "tid": -914061504, "ts": 1716454218034192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218034193, "dur": 0, "args": { "External id": 37016, "cbid": 203, "correlation": 37016 } }, { "ph": "f", "id": 37016, "pid": 76337, "tid": -914061504, "ts": 1716454218034193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218034194, "dur": 0, "args": { "External id": 37017, "cbid": 205, "correlation": 37017 } }, { "ph": "f", "id": 37017, "pid": 76337, "tid": -914061504, "ts": 1716454218034194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088993, "dur": 5, "args": { "External id": 37021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37021, "pid": 5, "tid": 7, "ts": 1716454218088993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034208, "dur": 12, "args": { "External id": 37021, "cbid": 211, "correlation": 37021 } }, { "ph": "s", "id": 37021, "pid": 76337, "tid": -914061504, "ts": 1716454218034208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218088999, "dur": 157, "args": { "External id": 37023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37023, "pid": 5, "tid": 7, "ts": 1716454218088999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034222, "dur": 5, "args": { "External id": 37023, "cbid": 211, "correlation": 37023 } }, { "ph": "s", "id": 37023, "pid": 76337, "tid": -914061504, "ts": 1716454218034222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218089159, "dur": 1, "args": { "External id": 37025, "device": 5, "context": 1, "stream": 7, "correlation": 37025, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37025, "pid": 5, "tid": 7, "ts": 1716454218089159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218034232, "dur": 7, "args": { "External id": 37025, "cbid": 51, "correlation": 37025 } }, { "ph": "s", "id": 37025, "pid": 76337, "tid": -914061504, "ts": 1716454218034232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218089162, "dur": 253, "args": { "External id": 37026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37026, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37026, "pid": 5, "tid": 7, "ts": 1716454218089162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034240, "dur": 6, "args": { "External id": 37026, "cbid": 211, "correlation": 37026 } }, { "ph": "s", "id": 37026, "pid": 76337, "tid": -914061504, "ts": 1716454218034240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089417, "dur": 5, "args": { "External id": 37028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37028, "pid": 5, "tid": 7, "ts": 1716454218089417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034249, "dur": 5, "args": { "External id": 37028, "cbid": 211, "correlation": 37028 } }, { "ph": "s", "id": 37028, "pid": 76337, "tid": -914061504, "ts": 1716454218034249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218089423, "dur": 6, "args": { "External id": 37034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37034, "pid": 5, "tid": 7, "ts": 1716454218089423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034277, "dur": 8, "args": { "External id": 37034, "cbid": 211, "correlation": 37034 } }, { "ph": "s", "id": 37034, "pid": 76337, "tid": -914061504, "ts": 1716454218034277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218089431, "dur": 5, "args": { "External id": 37042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37042, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37042, "pid": 5, "tid": 7, "ts": 1716454218089431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034310, "dur": 8, "args": { "External id": 37042, "cbid": 211, "correlation": 37042 } }, { "ph": "s", "id": 37042, "pid": 76337, "tid": -914061504, "ts": 1716454218034310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218089437, "dur": 4, "args": { "External id": 37050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37050, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37050, "pid": 5, "tid": 7, "ts": 1716454218089437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034339, "dur": 9, "args": { "External id": 37050, "cbid": 211, "correlation": 37050 } }, { "ph": "s", "id": 37050, "pid": 76337, "tid": -914061504, "ts": 1716454218034339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218089442, "dur": 9, "args": { "External id": 37070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37070, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37070, "pid": 5, "tid": 7, "ts": 1716454218089442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034413, "dur": 12, "args": { "External id": 37070, "cbid": 211, "correlation": 37070 } }, { "ph": "s", "id": 37070, "pid": 76337, "tid": -914061504, "ts": 1716454218034413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218089453, "dur": 3, "args": { "External id": 37082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37082, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37082, "pid": 5, "tid": 7, "ts": 1716454218089453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034435, "dur": 7, "args": { "External id": 37082, "cbid": 211, "correlation": 37082 } }, { "ph": "s", "id": 37082, "pid": 76337, "tid": -914061504, "ts": 1716454218034435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218089457, "dur": 6, "args": { "External id": 37085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37085, "pid": 5, "tid": 7, "ts": 1716454218089457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034453, "dur": 8, "args": { "External id": 37085, "cbid": 211, "correlation": 37085 } }, { "ph": "s", "id": 37085, "pid": 76337, "tid": -914061504, "ts": 1716454218034453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218089465, "dur": 4, "args": { "External id": 37094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37094, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37094, "pid": 5, "tid": 7, "ts": 1716454218089465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034493, "dur": 10, "args": { "External id": 37094, "cbid": 211, "correlation": 37094 } }, { "ph": "s", "id": 37094, "pid": 76337, "tid": -914061504, "ts": 1716454218034493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218034544, "dur": 0, "args": { "External id": 37104, "cbid": 317, "correlation": 37104 } }, { "ph": "f", "id": 37104, "pid": 76337, "tid": -914061504, "ts": 1716454218034544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218034545, "dur": 0, "args": { "External id": 37105, "cbid": 203, "correlation": 37105 } }, { "ph": "f", "id": 37105, "pid": 76337, "tid": -914061504, "ts": 1716454218034545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218034546, "dur": 0, "args": { "External id": 37106, "cbid": 205, "correlation": 37106 } }, { "ph": "f", "id": 37106, "pid": 76337, "tid": -914061504, "ts": 1716454218034546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089471, "dur": 5, "args": { "External id": 37110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37110, "pid": 5, "tid": 7, "ts": 1716454218089471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034559, "dur": 12, "args": { "External id": 37110, "cbid": 211, "correlation": 37110 } }, { "ph": "s", "id": 37110, "pid": 76337, "tid": -914061504, "ts": 1716454218034559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089477, "dur": 158, "args": { "External id": 37112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37112, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37112, "pid": 5, "tid": 7, "ts": 1716454218089477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034573, "dur": 5, "args": { "External id": 37112, "cbid": 211, "correlation": 37112 } }, { "ph": "s", "id": 37112, "pid": 76337, "tid": -914061504, "ts": 1716454218034573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218089638, "dur": 1, "args": { "External id": 37114, "device": 5, "context": 1, "stream": 7, "correlation": 37114, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37114, "pid": 5, "tid": 7, "ts": 1716454218089638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218034584, "dur": 6, "args": { "External id": 37114, "cbid": 51, "correlation": 37114 } }, { "ph": "s", "id": 37114, "pid": 76337, "tid": -914061504, "ts": 1716454218034584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218089641, "dur": 252, "args": { "External id": 37115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37115, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37115, "pid": 5, "tid": 7, "ts": 1716454218089641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034591, "dur": 6, "args": { "External id": 37115, "cbid": 211, "correlation": 37115 } }, { "ph": "s", "id": 37115, "pid": 76337, "tid": -914061504, "ts": 1716454218034591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089895, "dur": 5, "args": { "External id": 37117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37117, "pid": 5, "tid": 7, "ts": 1716454218089895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034601, "dur": 6, "args": { "External id": 37117, "cbid": 211, "correlation": 37117 } }, { "ph": "s", "id": 37117, "pid": 76337, "tid": -914061504, "ts": 1716454218034601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218089901, "dur": 6, "args": { "External id": 37123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37123, "pid": 5, "tid": 7, "ts": 1716454218089901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034629, "dur": 9, "args": { "External id": 37123, "cbid": 211, "correlation": 37123 } }, { "ph": "s", "id": 37123, "pid": 76337, "tid": -914061504, "ts": 1716454218034629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218089909, "dur": 3, "args": { "External id": 37131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37131, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 37131, "pid": 5, "tid": 7, "ts": 1716454218089909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034673, "dur": 10, "args": { "External id": 37131, "cbid": 211, "correlation": 37131 } }, { "ph": "s", "id": 37131, "pid": 76337, "tid": -914061504, "ts": 1716454218034673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218034736, "dur": 1, "args": { "External id": 37147, "cbid": 251, "correlation": 37147 } }, { "ph": "f", "id": 37147, "pid": 76337, "tid": -914061504, "ts": 1716454218034736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218034741, "dur": 0, "args": { "External id": 37149, "cbid": 251, "correlation": 37149 } }, { "ph": "f", "id": 37149, "pid": 76337, "tid": -914061504, "ts": 1716454218034741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218089913, "dur": 10, "args": { "External id": 37150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37150, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37150, "pid": 5, "tid": 7, "ts": 1716454218089913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034743, "dur": 11, "args": { "External id": 37150, "cbid": 211, "correlation": 37150 } }, { "ph": "s", "id": 37150, "pid": 76337, "tid": -914061504, "ts": 1716454218034743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218089924, "dur": 3, "args": { "External id": 37152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37152, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37152, "pid": 5, "tid": 7, "ts": 1716454218089924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034756, "dur": 5, "args": { "External id": 37152, "cbid": 211, "correlation": 37152 } }, { "ph": "s", "id": 37152, "pid": 76337, "tid": -914061504, "ts": 1716454218034756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218089929, "dur": 5, "args": { "External id": 37162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37162, "pid": 5, "tid": 7, "ts": 1716454218089929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034811, "dur": 13, "args": { "External id": 37162, "cbid": 211, "correlation": 37162 } }, { "ph": "s", "id": 37162, "pid": 76337, "tid": -914061504, "ts": 1716454218034811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218089935, "dur": 9, "args": { "External id": 37182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37182, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37182, "pid": 5, "tid": 7, "ts": 1716454218089935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034879, "dur": 10, "args": { "External id": 37182, "cbid": 211, "correlation": 37182 } }, { "ph": "s", "id": 37182, "pid": 76337, "tid": -914061504, "ts": 1716454218034879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218089946, "dur": 4, "args": { "External id": 37194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37194, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37194, "pid": 5, "tid": 7, "ts": 1716454218089946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034899, "dur": 6, "args": { "External id": 37194, "cbid": 211, "correlation": 37194 } }, { "ph": "s", "id": 37194, "pid": 76337, "tid": -914061504, "ts": 1716454218034899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218089951, "dur": 7, "args": { "External id": 37197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37197, "pid": 5, "tid": 7, "ts": 1716454218089951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034917, "dur": 6, "args": { "External id": 37197, "cbid": 211, "correlation": 37197 } }, { "ph": "s", "id": 37197, "pid": 76337, "tid": -914061504, "ts": 1716454218034917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218089959, "dur": 4, "args": { "External id": 37206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37206, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37206, "pid": 5, "tid": 7, "ts": 1716454218089959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218034958, "dur": 10, "args": { "External id": 37206, "cbid": 211, "correlation": 37206 } }, { "ph": "s", "id": 37206, "pid": 76337, "tid": -914061504, "ts": 1716454218034958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218035029, "dur": 0, "args": { "External id": 37216, "cbid": 317, "correlation": 37216 } }, { "ph": "f", "id": 37216, "pid": 76337, "tid": -914061504, "ts": 1716454218035029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218035030, "dur": 0, "args": { "External id": 37217, "cbid": 203, "correlation": 37217 } }, { "ph": "f", "id": 37217, "pid": 76337, "tid": -914061504, "ts": 1716454218035030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218035031, "dur": 0, "args": { "External id": 37218, "cbid": 205, "correlation": 37218 } }, { "ph": "f", "id": 37218, "pid": 76337, "tid": -914061504, "ts": 1716454218035031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089964, "dur": 5, "args": { "External id": 37222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37222, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37222, "pid": 5, "tid": 7, "ts": 1716454218089964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035045, "dur": 12, "args": { "External id": 37222, "cbid": 211, "correlation": 37222 } }, { "ph": "s", "id": 37222, "pid": 76337, "tid": -914061504, "ts": 1716454218035045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218089971, "dur": 159, "args": { "External id": 37224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37224, "pid": 5, "tid": 7, "ts": 1716454218089971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035060, "dur": 5, "args": { "External id": 37224, "cbid": 211, "correlation": 37224 } }, { "ph": "s", "id": 37224, "pid": 76337, "tid": -914061504, "ts": 1716454218035060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218090132, "dur": 1, "args": { "External id": 37226, "device": 5, "context": 1, "stream": 7, "correlation": 37226, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37226, "pid": 5, "tid": 7, "ts": 1716454218090132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218035070, "dur": 6, "args": { "External id": 37226, "cbid": 51, "correlation": 37226 } }, { "ph": "s", "id": 37226, "pid": 76337, "tid": -914061504, "ts": 1716454218035070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218090136, "dur": 253, "args": { "External id": 37227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37227, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37227, "pid": 5, "tid": 7, "ts": 1716454218090136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035077, "dur": 7, "args": { "External id": 37227, "cbid": 211, "correlation": 37227 } }, { "ph": "s", "id": 37227, "pid": 76337, "tid": -914061504, "ts": 1716454218035077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090390, "dur": 6, "args": { "External id": 37229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37229, "pid": 5, "tid": 7, "ts": 1716454218090390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035089, "dur": 5, "args": { "External id": 37229, "cbid": 211, "correlation": 37229 } }, { "ph": "s", "id": 37229, "pid": 76337, "tid": -914061504, "ts": 1716454218035089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218090397, "dur": 6, "args": { "External id": 37235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37235, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37235, "pid": 5, "tid": 7, "ts": 1716454218090397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035117, "dur": 8, "args": { "External id": 37235, "cbid": 211, "correlation": 37235 } }, { "ph": "s", "id": 37235, "pid": 76337, "tid": -914061504, "ts": 1716454218035117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218090404, "dur": 5, "args": { "External id": 37243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37243, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37243, "pid": 5, "tid": 7, "ts": 1716454218090404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035150, "dur": 9, "args": { "External id": 37243, "cbid": 211, "correlation": 37243 } }, { "ph": "s", "id": 37243, "pid": 76337, "tid": -914061504, "ts": 1716454218035150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218090410, "dur": 4, "args": { "External id": 37251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37251, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37251, "pid": 5, "tid": 7, "ts": 1716454218090410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035179, "dur": 9, "args": { "External id": 37251, "cbid": 211, "correlation": 37251 } }, { "ph": "s", "id": 37251, "pid": 76337, "tid": -914061504, "ts": 1716454218035179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218090416, "dur": 9, "args": { "External id": 37271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37271, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37271, "pid": 5, "tid": 7, "ts": 1716454218090416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035281, "dur": 13, "args": { "External id": 37271, "cbid": 211, "correlation": 37271 } }, { "ph": "s", "id": 37271, "pid": 76337, "tid": -914061504, "ts": 1716454218035281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218090426, "dur": 4, "args": { "External id": 37283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37283, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37283, "pid": 5, "tid": 7, "ts": 1716454218090426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035305, "dur": 6, "args": { "External id": 37283, "cbid": 211, "correlation": 37283 } }, { "ph": "s", "id": 37283, "pid": 76337, "tid": -914061504, "ts": 1716454218035305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218090431, "dur": 6, "args": { "External id": 37286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37286, "pid": 5, "tid": 7, "ts": 1716454218090431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035323, "dur": 7, "args": { "External id": 37286, "cbid": 211, "correlation": 37286 } }, { "ph": "s", "id": 37286, "pid": 76337, "tid": -914061504, "ts": 1716454218035323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218090439, "dur": 5, "args": { "External id": 37295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37295, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37295, "pid": 5, "tid": 7, "ts": 1716454218090439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035362, "dur": 9, "args": { "External id": 37295, "cbid": 211, "correlation": 37295 } }, { "ph": "s", "id": 37295, "pid": 76337, "tid": -914061504, "ts": 1716454218035362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218035415, "dur": 0, "args": { "External id": 37305, "cbid": 317, "correlation": 37305 } }, { "ph": "f", "id": 37305, "pid": 76337, "tid": -914061504, "ts": 1716454218035415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218035416, "dur": 0, "args": { "External id": 37306, "cbid": 203, "correlation": 37306 } }, { "ph": "f", "id": 37306, "pid": 76337, "tid": -914061504, "ts": 1716454218035416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218035417, "dur": 0, "args": { "External id": 37307, "cbid": 205, "correlation": 37307 } }, { "ph": "f", "id": 37307, "pid": 76337, "tid": -914061504, "ts": 1716454218035417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090445, "dur": 5, "args": { "External id": 37311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37311, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37311, "pid": 5, "tid": 7, "ts": 1716454218090445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035430, "dur": 11, "args": { "External id": 37311, "cbid": 211, "correlation": 37311 } }, { "ph": "s", "id": 37311, "pid": 76337, "tid": -914061504, "ts": 1716454218035430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090451, "dur": 158, "args": { "External id": 37313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37313, "pid": 5, "tid": 7, "ts": 1716454218090451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035444, "dur": 5, "args": { "External id": 37313, "cbid": 211, "correlation": 37313 } }, { "ph": "s", "id": 37313, "pid": 76337, "tid": -914061504, "ts": 1716454218035444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218090611, "dur": 1, "args": { "External id": 37315, "device": 5, "context": 1, "stream": 7, "correlation": 37315, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37315, "pid": 5, "tid": 7, "ts": 1716454218090611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218035454, "dur": 7, "args": { "External id": 37315, "cbid": 51, "correlation": 37315 } }, { "ph": "s", "id": 37315, "pid": 76337, "tid": -914061504, "ts": 1716454218035454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218090615, "dur": 253, "args": { "External id": 37316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37316, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37316, "pid": 5, "tid": 7, "ts": 1716454218090615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035462, "dur": 6, "args": { "External id": 37316, "cbid": 211, "correlation": 37316 } }, { "ph": "s", "id": 37316, "pid": 76337, "tid": -914061504, "ts": 1716454218035462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090869, "dur": 5, "args": { "External id": 37318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37318, "pid": 5, "tid": 7, "ts": 1716454218090869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035472, "dur": 5, "args": { "External id": 37318, "cbid": 211, "correlation": 37318 } }, { "ph": "s", "id": 37318, "pid": 76337, "tid": -914061504, "ts": 1716454218035472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218090876, "dur": 6, "args": { "External id": 37324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37324, "pid": 5, "tid": 7, "ts": 1716454218090876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035500, "dur": 9, "args": { "External id": 37324, "cbid": 211, "correlation": 37324 } }, { "ph": "s", "id": 37324, "pid": 76337, "tid": -914061504, "ts": 1716454218035500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218090883, "dur": 3, "args": { "External id": 37332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37332, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 37332, "pid": 5, "tid": 7, "ts": 1716454218090883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035545, "dur": 9, "args": { "External id": 37332, "cbid": 211, "correlation": 37332 } }, { "ph": "s", "id": 37332, "pid": 76337, "tid": -914061504, "ts": 1716454218035545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218035606, "dur": 1, "args": { "External id": 37348, "cbid": 251, "correlation": 37348 } }, { "ph": "f", "id": 37348, "pid": 76337, "tid": -914061504, "ts": 1716454218035606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218035611, "dur": 0, "args": { "External id": 37350, "cbid": 251, "correlation": 37350 } }, { "ph": "f", "id": 37350, "pid": 76337, "tid": -914061504, "ts": 1716454218035611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218090888, "dur": 11, "args": { "External id": 37351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37351, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37351, "pid": 5, "tid": 7, "ts": 1716454218090888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035613, "dur": 11, "args": { "External id": 37351, "cbid": 211, "correlation": 37351 } }, { "ph": "s", "id": 37351, "pid": 76337, "tid": -914061504, "ts": 1716454218035613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218090900, "dur": 3, "args": { "External id": 37353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37353, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37353, "pid": 5, "tid": 7, "ts": 1716454218090900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035627, "dur": 5, "args": { "External id": 37353, "cbid": 211, "correlation": 37353 } }, { "ph": "s", "id": 37353, "pid": 76337, "tid": -914061504, "ts": 1716454218035627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218090905, "dur": 5, "args": { "External id": 37363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37363, "pid": 5, "tid": 7, "ts": 1716454218090905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035683, "dur": 12, "args": { "External id": 37363, "cbid": 211, "correlation": 37363 } }, { "ph": "s", "id": 37363, "pid": 76337, "tid": -914061504, "ts": 1716454218035683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218090912, "dur": 9, "args": { "External id": 37383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37383, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37383, "pid": 5, "tid": 7, "ts": 1716454218090912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035749, "dur": 11, "args": { "External id": 37383, "cbid": 211, "correlation": 37383 } }, { "ph": "s", "id": 37383, "pid": 76337, "tid": -914061504, "ts": 1716454218035749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218090922, "dur": 3, "args": { "External id": 37395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37395, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37395, "pid": 5, "tid": 7, "ts": 1716454218090922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035769, "dur": 7, "args": { "External id": 37395, "cbid": 211, "correlation": 37395 } }, { "ph": "s", "id": 37395, "pid": 76337, "tid": -914061504, "ts": 1716454218035769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218090927, "dur": 7, "args": { "External id": 37398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37398, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37398, "pid": 5, "tid": 7, "ts": 1716454218090927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035788, "dur": 6, "args": { "External id": 37398, "cbid": 211, "correlation": 37398 } }, { "ph": "s", "id": 37398, "pid": 76337, "tid": -914061504, "ts": 1716454218035788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218090935, "dur": 4, "args": { "External id": 37407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37407, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37407, "pid": 5, "tid": 7, "ts": 1716454218090935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035829, "dur": 10, "args": { "External id": 37407, "cbid": 211, "correlation": 37407 } }, { "ph": "s", "id": 37407, "pid": 76337, "tid": -914061504, "ts": 1716454218035829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218035892, "dur": 0, "args": { "External id": 37417, "cbid": 317, "correlation": 37417 } }, { "ph": "f", "id": 37417, "pid": 76337, "tid": -914061504, "ts": 1716454218035892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218035893, "dur": 0, "args": { "External id": 37418, "cbid": 203, "correlation": 37418 } }, { "ph": "f", "id": 37418, "pid": 76337, "tid": -914061504, "ts": 1716454218035893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218035894, "dur": 0, "args": { "External id": 37419, "cbid": 205, "correlation": 37419 } }, { "ph": "f", "id": 37419, "pid": 76337, "tid": -914061504, "ts": 1716454218035894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090941, "dur": 5, "args": { "External id": 37423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37423, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37423, "pid": 5, "tid": 7, "ts": 1716454218090941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035907, "dur": 12, "args": { "External id": 37423, "cbid": 211, "correlation": 37423 } }, { "ph": "s", "id": 37423, "pid": 76337, "tid": -914061504, "ts": 1716454218035907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218090947, "dur": 159, "args": { "External id": 37425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37425, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37425, "pid": 5, "tid": 7, "ts": 1716454218090947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035921, "dur": 5, "args": { "External id": 37425, "cbid": 211, "correlation": 37425 } }, { "ph": "s", "id": 37425, "pid": 76337, "tid": -914061504, "ts": 1716454218035921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218091108, "dur": 1, "args": { "External id": 37427, "device": 5, "context": 1, "stream": 7, "correlation": 37427, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37427, "pid": 5, "tid": 7, "ts": 1716454218091108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218035932, "dur": 6, "args": { "External id": 37427, "cbid": 51, "correlation": 37427 } }, { "ph": "s", "id": 37427, "pid": 76337, "tid": -914061504, "ts": 1716454218035932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218091112, "dur": 252, "args": { "External id": 37428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37428, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37428, "pid": 5, "tid": 7, "ts": 1716454218091112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035940, "dur": 6, "args": { "External id": 37428, "cbid": 211, "correlation": 37428 } }, { "ph": "s", "id": 37428, "pid": 76337, "tid": -914061504, "ts": 1716454218035940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218091365, "dur": 5, "args": { "External id": 37430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37430, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37430, "pid": 5, "tid": 7, "ts": 1716454218091365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035950, "dur": 5, "args": { "External id": 37430, "cbid": 211, "correlation": 37430 } }, { "ph": "s", "id": 37430, "pid": 76337, "tid": -914061504, "ts": 1716454218035950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218091372, "dur": 6, "args": { "External id": 37436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37436, "pid": 5, "tid": 7, "ts": 1716454218091372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218035987, "dur": 9, "args": { "External id": 37436, "cbid": 211, "correlation": 37436 } }, { "ph": "s", "id": 37436, "pid": 76337, "tid": -914061504, "ts": 1716454218035987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218091379, "dur": 5, "args": { "External id": 37444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37444, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37444, "pid": 5, "tid": 7, "ts": 1716454218091379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036020, "dur": 8, "args": { "External id": 37444, "cbid": 211, "correlation": 37444 } }, { "ph": "s", "id": 37444, "pid": 76337, "tid": -914061504, "ts": 1716454218036020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218091385, "dur": 4, "args": { "External id": 37452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37452, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37452, "pid": 5, "tid": 7, "ts": 1716454218091385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036050, "dur": 8, "args": { "External id": 37452, "cbid": 211, "correlation": 37452 } }, { "ph": "s", "id": 37452, "pid": 76337, "tid": -914061504, "ts": 1716454218036050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218091391, "dur": 9, "args": { "External id": 37472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37472, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37472, "pid": 5, "tid": 7, "ts": 1716454218091391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036191, "dur": 13, "args": { "External id": 37472, "cbid": 211, "correlation": 37472 } }, { "ph": "s", "id": 37472, "pid": 76337, "tid": -914061504, "ts": 1716454218036191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218091401, "dur": 3, "args": { "External id": 37484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37484, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37484, "pid": 5, "tid": 7, "ts": 1716454218091401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036215, "dur": 6, "args": { "External id": 37484, "cbid": 211, "correlation": 37484 } }, { "ph": "s", "id": 37484, "pid": 76337, "tid": -914061504, "ts": 1716454218036215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218091406, "dur": 6, "args": { "External id": 37487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37487, "pid": 5, "tid": 7, "ts": 1716454218091406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036233, "dur": 6, "args": { "External id": 37487, "cbid": 211, "correlation": 37487 } }, { "ph": "s", "id": 37487, "pid": 76337, "tid": -914061504, "ts": 1716454218036233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218036291, "dur": 0, "args": { "External id": 37498, "cbid": 317, "correlation": 37498 } }, { "ph": "f", "id": 37498, "pid": 76337, "tid": -914061504, "ts": 1716454218036291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218036292, "dur": 0, "args": { "External id": 37499, "cbid": 203, "correlation": 37499 } }, { "ph": "f", "id": 37499, "pid": 76337, "tid": -914061504, "ts": 1716454218036292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218036293, "dur": 0, "args": { "External id": 37500, "cbid": 205, "correlation": 37500 } }, { "ph": "f", "id": 37500, "pid": 76337, "tid": -914061504, "ts": 1716454218036293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218091414, "dur": 5, "args": { "External id": 37504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37504, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37504, "pid": 5, "tid": 7, "ts": 1716454218091414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036311, "dur": 12, "args": { "External id": 37504, "cbid": 211, "correlation": 37504 } }, { "ph": "s", "id": 37504, "pid": 76337, "tid": -914061504, "ts": 1716454218036311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218091420, "dur": 36, "args": { "External id": 37506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37506, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 37506, "pid": 5, "tid": 7, "ts": 1716454218091420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036330, "dur": 9, "args": { "External id": 37506, "cbid": 211, "correlation": 37506 } }, { "ph": "s", "id": 37506, "pid": 76337, "tid": -914061504, "ts": 1716454218036330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218091457, "dur": 5, "args": { "External id": 37508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37508, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37508, "pid": 5, "tid": 7, "ts": 1716454218091457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036343, "dur": 6, "args": { "External id": 37508, "cbid": 211, "correlation": 37508 } }, { "ph": "s", "id": 37508, "pid": 76337, "tid": -914061504, "ts": 1716454218036343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218091464, "dur": 6, "args": { "External id": 37514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37514, "pid": 5, "tid": 7, "ts": 1716454218091464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036371, "dur": 8, "args": { "External id": 37514, "cbid": 211, "correlation": 37514 } }, { "ph": "s", "id": 37514, "pid": 76337, "tid": -914061504, "ts": 1716454218036371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218091471, "dur": 20, "args": { "External id": 37523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37523, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37523, "pid": 5, "tid": 7, "ts": 1716454218091471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036454, "dur": 14, "args": { "External id": 37523, "cbid": 211, "correlation": 37523 } }, { "ph": "s", "id": 37523, "pid": 76337, "tid": -914061504, "ts": 1716454218036454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218091492, "dur": 11, "args": { "External id": 37545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37545, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 37545, "pid": 5, "tid": 7, "ts": 1716454218091492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036512, "dur": 10, "args": { "External id": 37545, "cbid": 211, "correlation": 37545 } }, { "ph": "s", "id": 37545, "pid": 76337, "tid": -914061504, "ts": 1716454218036512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036601, "dur": 2, "args": { "External id": 37556, "cbid": 251, "correlation": 37556 } }, { "ph": "f", "id": 37556, "pid": 76337, "tid": -914061504, "ts": 1716454218036601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036606, "dur": 0, "args": { "External id": 37557, "cbid": 251, "correlation": 37557 } }, { "ph": "f", "id": 37557, "pid": 76337, "tid": -914061504, "ts": 1716454218036606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218091504, "dur": 52, "args": { "External id": 37558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37558, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 37558, "pid": 5, "tid": 7, "ts": 1716454218091504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036609, "dur": 14, "args": { "External id": 37558, "cbid": 211, "correlation": 37558 } }, { "ph": "s", "id": 37558, "pid": 76337, "tid": -914061504, "ts": 1716454218036609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036681, "dur": 1, "args": { "External id": 37569, "cbid": 251, "correlation": 37569 } }, { "ph": "f", "id": 37569, "pid": 76337, "tid": -914061504, "ts": 1716454218036681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036685, "dur": 0, "args": { "External id": 37570, "cbid": 251, "correlation": 37570 } }, { "ph": "f", "id": 37570, "pid": 76337, "tid": -914061504, "ts": 1716454218036685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218091557, "dur": 51, "args": { "External id": 37571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37571, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 37571, "pid": 5, "tid": 7, "ts": 1716454218091557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036687, "dur": 12, "args": { "External id": 37571, "cbid": 211, "correlation": 37571 } }, { "ph": "s", "id": 37571, "pid": 76337, "tid": -914061504, "ts": 1716454218036687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036751, "dur": 1, "args": { "External id": 37582, "cbid": 251, "correlation": 37582 } }, { "ph": "f", "id": 37582, "pid": 76337, "tid": -914061504, "ts": 1716454218036751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036755, "dur": 0, "args": { "External id": 37583, "cbid": 251, "correlation": 37583 } }, { "ph": "f", "id": 37583, "pid": 76337, "tid": -914061504, "ts": 1716454218036755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218091610, "dur": 52, "args": { "External id": 37584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37584, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 37584, "pid": 5, "tid": 7, "ts": 1716454218091610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036756, "dur": 11, "args": { "External id": 37584, "cbid": 211, "correlation": 37584 } }, { "ph": "s", "id": 37584, "pid": 76337, "tid": -914061504, "ts": 1716454218036756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218091663, "dur": 55, "args": { "External id": 37609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37609, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37609, "pid": 5, "tid": 7, "ts": 1716454218091663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036840, "dur": 13, "args": { "External id": 37609, "cbid": 211, "correlation": 37609 } }, { "ph": "s", "id": 37609, "pid": 76337, "tid": -914061504, "ts": 1716454218036840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218036939, "dur": 1, "args": { "External id": 37627, "cbid": 251, "correlation": 37627 } }, { "ph": "f", "id": 37627, "pid": 76337, "tid": -914061504, "ts": 1716454218036939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218091720, "dur": 62, "args": { "External id": 37629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37629, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 37629, "pid": 5, "tid": 7, "ts": 1716454218091720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218036945, "dur": 13, "args": { "External id": 37629, "cbid": 211, "correlation": 37629 } }, { "ph": "s", "id": 37629, "pid": 76337, "tid": -914061504, "ts": 1716454218036945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218091784, "dur": 6, "args": { "External id": 37637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37637, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37637, "pid": 5, "tid": 7, "ts": 1716454218091784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037024, "dur": 13, "args": { "External id": 37637, "cbid": 211, "correlation": 37637 } }, { "ph": "s", "id": 37637, "pid": 76337, "tid": -914061504, "ts": 1716454218037024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218091791, "dur": 7, "args": { "External id": 37645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37645, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37645, "pid": 5, "tid": 7, "ts": 1716454218091791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037063, "dur": 9, "args": { "External id": 37645, "cbid": 211, "correlation": 37645 } }, { "ph": "s", "id": 37645, "pid": 76337, "tid": -914061504, "ts": 1716454218037063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218091799, "dur": 7, "args": { "External id": 37656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37656, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37656, "pid": 5, "tid": 7, "ts": 1716454218091799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037136, "dur": 13, "args": { "External id": 37656, "cbid": 211, "correlation": 37656 } }, { "ph": "s", "id": 37656, "pid": 76337, "tid": -914061504, "ts": 1716454218037136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218091808, "dur": 8, "args": { "External id": 37678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37678, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 37678, "pid": 5, "tid": 7, "ts": 1716454218091808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037169, "dur": 7, "args": { "External id": 37678, "cbid": 211, "correlation": 37678 } }, { "ph": "s", "id": 37678, "pid": 76337, "tid": -914061504, "ts": 1716454218037169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037255, "dur": 2, "args": { "External id": 37689, "cbid": 251, "correlation": 37689 } }, { "ph": "f", "id": 37689, "pid": 76337, "tid": -914061504, "ts": 1716454218037255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218091818, "dur": 1, "args": { "External id": 37690, "device": 5, "context": 1, "stream": 7, "correlation": 37690, "bytes": 480, "memory bandwidth (GB/s)": 0.2542372881355932 } }, { "ph": "f", "id": 37690, "pid": 5, "tid": 7, "ts": 1716454218091818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218037261, "dur": 10, "args": { "External id": 37690, "cbid": 51, "correlation": 37690 } }, { "ph": "s", "id": 37690, "pid": 76337, "tid": -914061504, "ts": 1716454218037261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218091822, "dur": 35, "args": { "External id": 37691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37691, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 37691, "pid": 5, "tid": 7, "ts": 1716454218091822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037273, "dur": 12, "args": { "External id": 37691, "cbid": 211, "correlation": 37691 } }, { "ph": "s", "id": 37691, "pid": 76337, "tid": -914061504, "ts": 1716454218037273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037344, "dur": 1, "args": { "External id": 37702, "cbid": 251, "correlation": 37702 } }, { "ph": "f", "id": 37702, "pid": 76337, "tid": -914061504, "ts": 1716454218037344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037347, "dur": 0, "args": { "External id": 37703, "cbid": 251, "correlation": 37703 } }, { "ph": "f", "id": 37703, "pid": 76337, "tid": -914061504, "ts": 1716454218037347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218091859, "dur": 11, "args": { "External id": 37704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37704, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37704, "pid": 5, "tid": 7, "ts": 1716454218091859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037349, "dur": 12, "args": { "External id": 37704, "cbid": 211, "correlation": 37704 } }, { "ph": "s", "id": 37704, "pid": 76337, "tid": -914061504, "ts": 1716454218037349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218091871, "dur": 5, "args": { "External id": 37706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37706, "pid": 5, "tid": 7, "ts": 1716454218091871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037362, "dur": 6, "args": { "External id": 37706, "cbid": 211, "correlation": 37706 } }, { "ph": "s", "id": 37706, "pid": 76337, "tid": -914061504, "ts": 1716454218037362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037419, "dur": 1, "args": { "External id": 37717, "cbid": 251, "correlation": 37717 } }, { "ph": "f", "id": 37717, "pid": 76337, "tid": -914061504, "ts": 1716454218037419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037422, "dur": 0, "args": { "External id": 37718, "cbid": 251, "correlation": 37718 } }, { "ph": "f", "id": 37718, "pid": 76337, "tid": -914061504, "ts": 1716454218037422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218091877, "dur": 8, "args": { "External id": 37719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37719, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37719, "pid": 5, "tid": 7, "ts": 1716454218091877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037424, "dur": 12, "args": { "External id": 37719, "cbid": 211, "correlation": 37719 } }, { "ph": "s", "id": 37719, "pid": 76337, "tid": -914061504, "ts": 1716454218037424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218091887, "dur": 3, "args": { "External id": 37721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37721, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37721, "pid": 5, "tid": 7, "ts": 1716454218091887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037437, "dur": 5, "args": { "External id": 37721, "cbid": 211, "correlation": 37721 } }, { "ph": "s", "id": 37721, "pid": 76337, "tid": -914061504, "ts": 1716454218037437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218091891, "dur": 20, "args": { "External id": 37746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37746, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 37746, "pid": 5, "tid": 7, "ts": 1716454218091891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037514, "dur": 12, "args": { "External id": 37746, "cbid": 211, "correlation": 37746 } }, { "ph": "s", "id": 37746, "pid": 76337, "tid": -914061504, "ts": 1716454218037514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037614, "dur": 2, "args": { "External id": 37764, "cbid": 251, "correlation": 37764 } }, { "ph": "f", "id": 37764, "pid": 76337, "tid": -914061504, "ts": 1716454218037614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218091913, "dur": 1, "args": { "External id": 37766, "device": 5, "context": 1, "stream": 7, "correlation": 37766, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 37766, "pid": 5, "tid": 7, "ts": 1716454218091913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218037620, "dur": 11, "args": { "External id": 37766, "cbid": 51, "correlation": 37766 } }, { "ph": "s", "id": 37766, "pid": 76337, "tid": -914061504, "ts": 1716454218037620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218091917, "dur": 35, "args": { "External id": 37767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37767, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 37767, "pid": 5, "tid": 7, "ts": 1716454218091917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037632, "dur": 13, "args": { "External id": 37767, "cbid": 211, "correlation": 37767 } }, { "ph": "s", "id": 37767, "pid": 76337, "tid": -914061504, "ts": 1716454218037632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218091953, "dur": 4, "args": { "External id": 37775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37775, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37775, "pid": 5, "tid": 7, "ts": 1716454218091953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037703, "dur": 12, "args": { "External id": 37775, "cbid": 211, "correlation": 37775 } }, { "ph": "s", "id": 37775, "pid": 76337, "tid": -914061504, "ts": 1716454218037703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218091959, "dur": 8, "args": { "External id": 37783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37783, "pid": 5, "tid": 7, "ts": 1716454218091959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037744, "dur": 10, "args": { "External id": 37783, "cbid": 211, "correlation": 37783 } }, { "ph": "s", "id": 37783, "pid": 76337, "tid": -914061504, "ts": 1716454218037744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218091968, "dur": 8, "args": { "External id": 37805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37805, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 37805, "pid": 5, "tid": 7, "ts": 1716454218091968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037795, "dur": 11, "args": { "External id": 37805, "cbid": 211, "correlation": 37805 } }, { "ph": "s", "id": 37805, "pid": 76337, "tid": -914061504, "ts": 1716454218037795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037886, "dur": 1, "args": { "External id": 37821, "cbid": 251, "correlation": 37821 } }, { "ph": "f", "id": 37821, "pid": 76337, "tid": -914061504, "ts": 1716454218037886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218037891, "dur": 0, "args": { "External id": 37823, "cbid": 251, "correlation": 37823 } }, { "ph": "f", "id": 37823, "pid": 76337, "tid": -914061504, "ts": 1716454218037891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218091978, "dur": 188, "args": { "External id": 37824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37824, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37824, "pid": 5, "tid": 7, "ts": 1716454218091978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037893, "dur": 13, "args": { "External id": 37824, "cbid": 211, "correlation": 37824 } }, { "ph": "s", "id": 37824, "pid": 76337, "tid": -914061504, "ts": 1716454218037893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092167, "dur": 20, "args": { "External id": 37832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37832, "pid": 5, "tid": 7, "ts": 1716454218092167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037960, "dur": 21, "args": { "External id": 37832, "cbid": 211, "correlation": 37832 } }, { "ph": "s", "id": 37832, "pid": 76337, "tid": -914061504, "ts": 1716454218037960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092189, "dur": 22, "args": { "External id": 37840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37840, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37840, "pid": 5, "tid": 7, "ts": 1716454218092189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218037999, "dur": 9, "args": { "External id": 37840, "cbid": 211, "correlation": 37840 } }, { "ph": "s", "id": 37840, "pid": 76337, "tid": -914061504, "ts": 1716454218037999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218038083, "dur": 1, "args": { "External id": 37856, "cbid": 251, "correlation": 37856 } }, { "ph": "f", "id": 37856, "pid": 76337, "tid": -914061504, "ts": 1716454218038083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218092213, "dur": 1, "args": { "External id": 37858, "device": 5, "context": 1, "stream": 7, "correlation": 37858, "bytes": 120, "memory bandwidth (GB/s)": 0.0797872340425532 } }, { "ph": "f", "id": 37858, "pid": 5, "tid": 7, "ts": 1716454218092213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218038088, "dur": 9, "args": { "External id": 37858, "cbid": 51, "correlation": 37858 } }, { "ph": "s", "id": 37858, "pid": 76337, "tid": -914061504, "ts": 1716454218038088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218092216, "dur": 108, "args": { "External id": 37859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37859, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 37859, "pid": 5, "tid": 7, "ts": 1716454218092216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038098, "dur": 12, "args": { "External id": 37859, "cbid": 211, "correlation": 37859 } }, { "ph": "s", "id": 37859, "pid": 76337, "tid": -914061504, "ts": 1716454218038098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218092325, "dur": 5, "args": { "External id": 37867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37867, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37867, "pid": 5, "tid": 7, "ts": 1716454218092325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038141, "dur": 10, "args": { "External id": 37867, "cbid": 211, "correlation": 37867 } }, { "ph": "s", "id": 37867, "pid": 76337, "tid": -914061504, "ts": 1716454218038141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092331, "dur": 10, "args": { "External id": 37878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37878, "pid": 5, "tid": 7, "ts": 1716454218092331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038209, "dur": 12, "args": { "External id": 37878, "cbid": 211, "correlation": 37878 } }, { "ph": "s", "id": 37878, "pid": 76337, "tid": -914061504, "ts": 1716454218038209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218038274, "dur": 0, "args": { "External id": 37890, "cbid": 317, "correlation": 37890 } }, { "ph": "f", "id": 37890, "pid": 76337, "tid": -914061504, "ts": 1716454218038274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218038275, "dur": 0, "args": { "External id": 37891, "cbid": 203, "correlation": 37891 } }, { "ph": "f", "id": 37891, "pid": 76337, "tid": -914061504, "ts": 1716454218038275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218038275, "dur": 0, "args": { "External id": 37892, "cbid": 205, "correlation": 37892 } }, { "ph": "f", "id": 37892, "pid": 76337, "tid": -914061504, "ts": 1716454218038275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092342, "dur": 5, "args": { "External id": 37896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37896, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37896, "pid": 5, "tid": 7, "ts": 1716454218092342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038291, "dur": 12, "args": { "External id": 37896, "cbid": 211, "correlation": 37896 } }, { "ph": "s", "id": 37896, "pid": 76337, "tid": -914061504, "ts": 1716454218038291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218092349, "dur": 36, "args": { "External id": 37898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37898, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 37898, "pid": 5, "tid": 7, "ts": 1716454218092349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038309, "dur": 7, "args": { "External id": 37898, "cbid": 211, "correlation": 37898 } }, { "ph": "s", "id": 37898, "pid": 76337, "tid": -914061504, "ts": 1716454218038309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092386, "dur": 6, "args": { "External id": 37900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37900, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37900, "pid": 5, "tid": 7, "ts": 1716454218092386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038320, "dur": 5, "args": { "External id": 37900, "cbid": 211, "correlation": 37900 } }, { "ph": "s", "id": 37900, "pid": 76337, "tid": -914061504, "ts": 1716454218038320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092393, "dur": 7, "args": { "External id": 37906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37906, "pid": 5, "tid": 7, "ts": 1716454218092393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038347, "dur": 8, "args": { "External id": 37906, "cbid": 211, "correlation": 37906 } }, { "ph": "s", "id": 37906, "pid": 76337, "tid": -914061504, "ts": 1716454218038347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218092402, "dur": 5, "args": { "External id": 37914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37914, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37914, "pid": 5, "tid": 7, "ts": 1716454218092402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038379, "dur": 8, "args": { "External id": 37914, "cbid": 211, "correlation": 37914 } }, { "ph": "s", "id": 37914, "pid": 76337, "tid": -914061504, "ts": 1716454218038379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218092408, "dur": 10, "args": { "External id": 37934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37934, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 37934, "pid": 5, "tid": 7, "ts": 1716454218092408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038453, "dur": 11, "args": { "External id": 37934, "cbid": 211, "correlation": 37934 } }, { "ph": "s", "id": 37934, "pid": 76337, "tid": -914061504, "ts": 1716454218038453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218092419, "dur": 4, "args": { "External id": 37946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37946, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 37946, "pid": 5, "tid": 7, "ts": 1716454218092419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038474, "dur": 6, "args": { "External id": 37946, "cbid": 211, "correlation": 37946 } }, { "ph": "s", "id": 37946, "pid": 76337, "tid": -914061504, "ts": 1716454218038474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092425, "dur": 8, "args": { "External id": 37949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37949, "pid": 5, "tid": 7, "ts": 1716454218092425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038491, "dur": 7, "args": { "External id": 37949, "cbid": 211, "correlation": 37949 } }, { "ph": "s", "id": 37949, "pid": 76337, "tid": -914061504, "ts": 1716454218038491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218092434, "dur": 5, "args": { "External id": 37958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37958, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37958, "pid": 5, "tid": 7, "ts": 1716454218092434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038531, "dur": 10, "args": { "External id": 37958, "cbid": 211, "correlation": 37958 } }, { "ph": "s", "id": 37958, "pid": 76337, "tid": -914061504, "ts": 1716454218038531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218038583, "dur": 0, "args": { "External id": 37968, "cbid": 317, "correlation": 37968 } }, { "ph": "f", "id": 37968, "pid": 76337, "tid": -914061504, "ts": 1716454218038583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218038583, "dur": 0, "args": { "External id": 37969, "cbid": 203, "correlation": 37969 } }, { "ph": "f", "id": 37969, "pid": 76337, "tid": -914061504, "ts": 1716454218038583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218038584, "dur": 0, "args": { "External id": 37970, "cbid": 205, "correlation": 37970 } }, { "ph": "f", "id": 37970, "pid": 76337, "tid": -914061504, "ts": 1716454218038584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092440, "dur": 5, "args": { "External id": 37974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37974, "pid": 5, "tid": 7, "ts": 1716454218092440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038597, "dur": 11, "args": { "External id": 37974, "cbid": 211, "correlation": 37974 } }, { "ph": "s", "id": 37974, "pid": 76337, "tid": -914061504, "ts": 1716454218038597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092446, "dur": 157, "args": { "External id": 37976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37976, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37976, "pid": 5, "tid": 7, "ts": 1716454218092446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038611, "dur": 5, "args": { "External id": 37976, "cbid": 211, "correlation": 37976 } }, { "ph": "s", "id": 37976, "pid": 76337, "tid": -914061504, "ts": 1716454218038611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218092606, "dur": 1, "args": { "External id": 37978, "device": 5, "context": 1, "stream": 7, "correlation": 37978, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 37978, "pid": 5, "tid": 7, "ts": 1716454218092606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218038622, "dur": 6, "args": { "External id": 37978, "cbid": 51, "correlation": 37978 } }, { "ph": "s", "id": 37978, "pid": 76337, "tid": -914061504, "ts": 1716454218038622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218092610, "dur": 263, "args": { "External id": 37979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37979, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 37979, "pid": 5, "tid": 7, "ts": 1716454218092610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038629, "dur": 6, "args": { "External id": 37979, "cbid": 211, "correlation": 37979 } }, { "ph": "s", "id": 37979, "pid": 76337, "tid": -914061504, "ts": 1716454218038629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092874, "dur": 5, "args": { "External id": 37981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 37981, "pid": 5, "tid": 7, "ts": 1716454218092874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038639, "dur": 5, "args": { "External id": 37981, "cbid": 211, "correlation": 37981 } }, { "ph": "s", "id": 37981, "pid": 76337, "tid": -914061504, "ts": 1716454218038639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092881, "dur": 6, "args": { "External id": 37987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 37987, "pid": 5, "tid": 7, "ts": 1716454218092881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038667, "dur": 9, "args": { "External id": 37987, "cbid": 211, "correlation": 37987 } }, { "ph": "s", "id": 37987, "pid": 76337, "tid": -914061504, "ts": 1716454218038667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218092888, "dur": 3, "args": { "External id": 37995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 37995, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 37995, "pid": 5, "tid": 7, "ts": 1716454218092888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038710, "dur": 9, "args": { "External id": 37995, "cbid": 211, "correlation": 37995 } }, { "ph": "s", "id": 37995, "pid": 76337, "tid": -914061504, "ts": 1716454218038710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218038775, "dur": 1, "args": { "External id": 38011, "cbid": 251, "correlation": 38011 } }, { "ph": "f", "id": 38011, "pid": 76337, "tid": -914061504, "ts": 1716454218038775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218038780, "dur": 0, "args": { "External id": 38013, "cbid": 251, "correlation": 38013 } }, { "ph": "f", "id": 38013, "pid": 76337, "tid": -914061504, "ts": 1716454218038780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218092893, "dur": 12, "args": { "External id": 38014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38014, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38014, "pid": 5, "tid": 7, "ts": 1716454218092893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038782, "dur": 12, "args": { "External id": 38014, "cbid": 211, "correlation": 38014 } }, { "ph": "s", "id": 38014, "pid": 76337, "tid": -914061504, "ts": 1716454218038782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218092906, "dur": 5, "args": { "External id": 38016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38016, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38016, "pid": 5, "tid": 7, "ts": 1716454218092906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038795, "dur": 5, "args": { "External id": 38016, "cbid": 211, "correlation": 38016 } }, { "ph": "s", "id": 38016, "pid": 76337, "tid": -914061504, "ts": 1716454218038795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092912, "dur": 5, "args": { "External id": 38026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38026, "pid": 5, "tid": 7, "ts": 1716454218092912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038853, "dur": 12, "args": { "External id": 38026, "cbid": 211, "correlation": 38026 } }, { "ph": "s", "id": 38026, "pid": 76337, "tid": -914061504, "ts": 1716454218038853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218092919, "dur": 10, "args": { "External id": 38046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38046, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38046, "pid": 5, "tid": 7, "ts": 1716454218092919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038919, "dur": 11, "args": { "External id": 38046, "cbid": 211, "correlation": 38046 } }, { "ph": "s", "id": 38046, "pid": 76337, "tid": -914061504, "ts": 1716454218038919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218092930, "dur": 4, "args": { "External id": 38058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38058, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 38058, "pid": 5, "tid": 7, "ts": 1716454218092930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038939, "dur": 6, "args": { "External id": 38058, "cbid": 211, "correlation": 38058 } }, { "ph": "s", "id": 38058, "pid": 76337, "tid": -914061504, "ts": 1716454218038939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218092935, "dur": 6, "args": { "External id": 38061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38061, "pid": 5, "tid": 7, "ts": 1716454218092935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218038958, "dur": 7, "args": { "External id": 38061, "cbid": 211, "correlation": 38061 } }, { "ph": "s", "id": 38061, "pid": 76337, "tid": -914061504, "ts": 1716454218038958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218092943, "dur": 4, "args": { "External id": 38070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38070, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38070, "pid": 5, "tid": 7, "ts": 1716454218092943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039007, "dur": 11, "args": { "External id": 38070, "cbid": 211, "correlation": 38070 } }, { "ph": "s", "id": 38070, "pid": 76337, "tid": -914061504, "ts": 1716454218039007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218039071, "dur": 0, "args": { "External id": 38080, "cbid": 317, "correlation": 38080 } }, { "ph": "f", "id": 38080, "pid": 76337, "tid": -914061504, "ts": 1716454218039071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218039072, "dur": 0, "args": { "External id": 38081, "cbid": 203, "correlation": 38081 } }, { "ph": "f", "id": 38081, "pid": 76337, "tid": -914061504, "ts": 1716454218039072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218039073, "dur": 0, "args": { "External id": 38082, "cbid": 205, "correlation": 38082 } }, { "ph": "f", "id": 38082, "pid": 76337, "tid": -914061504, "ts": 1716454218039073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092949, "dur": 5, "args": { "External id": 38086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38086, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38086, "pid": 5, "tid": 7, "ts": 1716454218092949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039087, "dur": 12, "args": { "External id": 38086, "cbid": 211, "correlation": 38086 } }, { "ph": "s", "id": 38086, "pid": 76337, "tid": -914061504, "ts": 1716454218039087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218092955, "dur": 158, "args": { "External id": 38088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38088, "pid": 5, "tid": 7, "ts": 1716454218092955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039101, "dur": 5, "args": { "External id": 38088, "cbid": 211, "correlation": 38088 } }, { "ph": "s", "id": 38088, "pid": 76337, "tid": -914061504, "ts": 1716454218039101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218093115, "dur": 1, "args": { "External id": 38090, "device": 5, "context": 1, "stream": 7, "correlation": 38090, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 38090, "pid": 5, "tid": 7, "ts": 1716454218093115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218039111, "dur": 6, "args": { "External id": 38090, "cbid": 51, "correlation": 38090 } }, { "ph": "s", "id": 38090, "pid": 76337, "tid": -914061504, "ts": 1716454218039111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218093119, "dur": 252, "args": { "External id": 38091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38091, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38091, "pid": 5, "tid": 7, "ts": 1716454218093119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039119, "dur": 6, "args": { "External id": 38091, "cbid": 211, "correlation": 38091 } }, { "ph": "s", "id": 38091, "pid": 76337, "tid": -914061504, "ts": 1716454218039119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218093372, "dur": 6, "args": { "External id": 38093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38093, "pid": 5, "tid": 7, "ts": 1716454218093372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039129, "dur": 5, "args": { "External id": 38093, "cbid": 211, "correlation": 38093 } }, { "ph": "s", "id": 38093, "pid": 76337, "tid": -914061504, "ts": 1716454218039129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218093380, "dur": 6, "args": { "External id": 38099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38099, "pid": 5, "tid": 7, "ts": 1716454218093380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039158, "dur": 8, "args": { "External id": 38099, "cbid": 211, "correlation": 38099 } }, { "ph": "s", "id": 38099, "pid": 76337, "tid": -914061504, "ts": 1716454218039158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218093387, "dur": 5, "args": { "External id": 38107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38107, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38107, "pid": 5, "tid": 7, "ts": 1716454218093387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039191, "dur": 8, "args": { "External id": 38107, "cbid": 211, "correlation": 38107 } }, { "ph": "s", "id": 38107, "pid": 76337, "tid": -914061504, "ts": 1716454218039191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218093393, "dur": 4, "args": { "External id": 38115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38115, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38115, "pid": 5, "tid": 7, "ts": 1716454218093393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039219, "dur": 9, "args": { "External id": 38115, "cbid": 211, "correlation": 38115 } }, { "ph": "s", "id": 38115, "pid": 76337, "tid": -914061504, "ts": 1716454218039219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218093398, "dur": 11, "args": { "External id": 38124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38124, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38124, "pid": 5, "tid": 7, "ts": 1716454218093398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039308, "dur": 13, "args": { "External id": 38124, "cbid": 211, "correlation": 38124 } }, { "ph": "s", "id": 38124, "pid": 76337, "tid": -914061504, "ts": 1716454218039308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218093411, "dur": 12, "args": { "External id": 38144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38144, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38144, "pid": 5, "tid": 7, "ts": 1716454218093411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039376, "dur": 11, "args": { "External id": 38144, "cbid": 211, "correlation": 38144 } }, { "ph": "s", "id": 38144, "pid": 76337, "tid": -914061504, "ts": 1716454218039376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218093424, "dur": 4, "args": { "External id": 38156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38156, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38156, "pid": 5, "tid": 7, "ts": 1716454218093424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039398, "dur": 6, "args": { "External id": 38156, "cbid": 211, "correlation": 38156 } }, { "ph": "s", "id": 38156, "pid": 76337, "tid": -914061504, "ts": 1716454218039398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218093429, "dur": 10, "args": { "External id": 38159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38159, "pid": 5, "tid": 7, "ts": 1716454218093429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039416, "dur": 6, "args": { "External id": 38159, "cbid": 211, "correlation": 38159 } }, { "ph": "s", "id": 38159, "pid": 76337, "tid": -914061504, "ts": 1716454218039416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218093440, "dur": 6, "args": { "External id": 38168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38168, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38168, "pid": 5, "tid": 7, "ts": 1716454218093440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039454, "dur": 9, "args": { "External id": 38168, "cbid": 211, "correlation": 38168 } }, { "ph": "s", "id": 38168, "pid": 76337, "tid": -914061504, "ts": 1716454218039454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218039506, "dur": 0, "args": { "External id": 38178, "cbid": 317, "correlation": 38178 } }, { "ph": "f", "id": 38178, "pid": 76337, "tid": -914061504, "ts": 1716454218039506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218039507, "dur": 0, "args": { "External id": 38179, "cbid": 203, "correlation": 38179 } }, { "ph": "f", "id": 38179, "pid": 76337, "tid": -914061504, "ts": 1716454218039507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218039508, "dur": 0, "args": { "External id": 38180, "cbid": 205, "correlation": 38180 } }, { "ph": "f", "id": 38180, "pid": 76337, "tid": -914061504, "ts": 1716454218039508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218093447, "dur": 6, "args": { "External id": 38184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38184, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38184, "pid": 5, "tid": 7, "ts": 1716454218093447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039524, "dur": 11, "args": { "External id": 38184, "cbid": 211, "correlation": 38184 } }, { "ph": "s", "id": 38184, "pid": 76337, "tid": -914061504, "ts": 1716454218039524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218093455, "dur": 312, "args": { "External id": 38186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38186, "pid": 5, "tid": 7, "ts": 1716454218093455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039538, "dur": 5, "args": { "External id": 38186, "cbid": 211, "correlation": 38186 } }, { "ph": "s", "id": 38186, "pid": 76337, "tid": -914061504, "ts": 1716454218039538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218093769, "dur": 1, "args": { "External id": 38188, "device": 5, "context": 1, "stream": 7, "correlation": 38188, "bytes": 240, "memory bandwidth (GB/s)": 0.1271186440677966 } }, { "ph": "f", "id": 38188, "pid": 5, "tid": 7, "ts": 1716454218093769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218039549, "dur": 7, "args": { "External id": 38188, "cbid": 51, "correlation": 38188 } }, { "ph": "s", "id": 38188, "pid": 76337, "tid": -914061504, "ts": 1716454218039549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218093773, "dur": 484, "args": { "External id": 38189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38189, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38189, "pid": 5, "tid": 7, "ts": 1716454218093773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039557, "dur": 7, "args": { "External id": 38189, "cbid": 211, "correlation": 38189 } }, { "ph": "s", "id": 38189, "pid": 76337, "tid": -914061504, "ts": 1716454218039557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094258, "dur": 5, "args": { "External id": 38191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38191, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38191, "pid": 5, "tid": 7, "ts": 1716454218094258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039568, "dur": 5, "args": { "External id": 38191, "cbid": 211, "correlation": 38191 } }, { "ph": "s", "id": 38191, "pid": 76337, "tid": -914061504, "ts": 1716454218039568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094265, "dur": 6, "args": { "External id": 38197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38197, "pid": 5, "tid": 7, "ts": 1716454218094265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039596, "dur": 8, "args": { "External id": 38197, "cbid": 211, "correlation": 38197 } }, { "ph": "s", "id": 38197, "pid": 76337, "tid": -914061504, "ts": 1716454218039596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218094272, "dur": 3, "args": { "External id": 38205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38205, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 38205, "pid": 5, "tid": 7, "ts": 1716454218094272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039639, "dur": 9, "args": { "External id": 38205, "cbid": 211, "correlation": 38205 } }, { "ph": "s", "id": 38205, "pid": 76337, "tid": -914061504, "ts": 1716454218039639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218039704, "dur": 1, "args": { "External id": 38221, "cbid": 251, "correlation": 38221 } }, { "ph": "f", "id": 38221, "pid": 76337, "tid": -914061504, "ts": 1716454218039704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218039709, "dur": 0, "args": { "External id": 38223, "cbid": 251, "correlation": 38223 } }, { "ph": "f", "id": 38223, "pid": 76337, "tid": -914061504, "ts": 1716454218039709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218094276, "dur": 10, "args": { "External id": 38224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38224, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38224, "pid": 5, "tid": 7, "ts": 1716454218094276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039711, "dur": 12, "args": { "External id": 38224, "cbid": 211, "correlation": 38224 } }, { "ph": "s", "id": 38224, "pid": 76337, "tid": -914061504, "ts": 1716454218039711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218094288, "dur": 5, "args": { "External id": 38226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38226, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38226, "pid": 5, "tid": 7, "ts": 1716454218094288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039725, "dur": 5, "args": { "External id": 38226, "cbid": 211, "correlation": 38226 } }, { "ph": "s", "id": 38226, "pid": 76337, "tid": -914061504, "ts": 1716454218039725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094294, "dur": 5, "args": { "External id": 38236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38236, "pid": 5, "tid": 7, "ts": 1716454218094294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039782, "dur": 12, "args": { "External id": 38236, "cbid": 211, "correlation": 38236 } }, { "ph": "s", "id": 38236, "pid": 76337, "tid": -914061504, "ts": 1716454218039782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218094301, "dur": 9, "args": { "External id": 38256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38256, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38256, "pid": 5, "tid": 7, "ts": 1716454218094301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039848, "dur": 11, "args": { "External id": 38256, "cbid": 211, "correlation": 38256 } }, { "ph": "s", "id": 38256, "pid": 76337, "tid": -914061504, "ts": 1716454218039848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218094311, "dur": 3, "args": { "External id": 38268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38268, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 38268, "pid": 5, "tid": 7, "ts": 1716454218094311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039869, "dur": 6, "args": { "External id": 38268, "cbid": 211, "correlation": 38268 } }, { "ph": "s", "id": 38268, "pid": 76337, "tid": -914061504, "ts": 1716454218039869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094316, "dur": 7, "args": { "External id": 38271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38271, "pid": 5, "tid": 7, "ts": 1716454218094316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039887, "dur": 6, "args": { "External id": 38271, "cbid": 211, "correlation": 38271 } }, { "ph": "s", "id": 38271, "pid": 76337, "tid": -914061504, "ts": 1716454218039887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218094324, "dur": 4, "args": { "External id": 38280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38280, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38280, "pid": 5, "tid": 7, "ts": 1716454218094324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218039926, "dur": 10, "args": { "External id": 38280, "cbid": 211, "correlation": 38280 } }, { "ph": "s", "id": 38280, "pid": 76337, "tid": -914061504, "ts": 1716454218039926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218039997, "dur": 0, "args": { "External id": 38290, "cbid": 317, "correlation": 38290 } }, { "ph": "f", "id": 38290, "pid": 76337, "tid": -914061504, "ts": 1716454218039997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218039998, "dur": 0, "args": { "External id": 38291, "cbid": 203, "correlation": 38291 } }, { "ph": "f", "id": 38291, "pid": 76337, "tid": -914061504, "ts": 1716454218039998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218039999, "dur": 0, "args": { "External id": 38292, "cbid": 205, "correlation": 38292 } }, { "ph": "f", "id": 38292, "pid": 76337, "tid": -914061504, "ts": 1716454218039999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094330, "dur": 5, "args": { "External id": 38296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38296, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38296, "pid": 5, "tid": 7, "ts": 1716454218094330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040013, "dur": 12, "args": { "External id": 38296, "cbid": 211, "correlation": 38296 } }, { "ph": "s", "id": 38296, "pid": 76337, "tid": -914061504, "ts": 1716454218040013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094336, "dur": 157, "args": { "External id": 38298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38298, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38298, "pid": 5, "tid": 7, "ts": 1716454218094336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040027, "dur": 5, "args": { "External id": 38298, "cbid": 211, "correlation": 38298 } }, { "ph": "s", "id": 38298, "pid": 76337, "tid": -914061504, "ts": 1716454218040027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218094495, "dur": 1, "args": { "External id": 38300, "device": 5, "context": 1, "stream": 7, "correlation": 38300, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 38300, "pid": 5, "tid": 7, "ts": 1716454218094495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218040038, "dur": 7, "args": { "External id": 38300, "cbid": 51, "correlation": 38300 } }, { "ph": "s", "id": 38300, "pid": 76337, "tid": -914061504, "ts": 1716454218040038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218094499, "dur": 252, "args": { "External id": 38301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38301, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38301, "pid": 5, "tid": 7, "ts": 1716454218094499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040046, "dur": 6, "args": { "External id": 38301, "cbid": 211, "correlation": 38301 } }, { "ph": "s", "id": 38301, "pid": 76337, "tid": -914061504, "ts": 1716454218040046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094752, "dur": 5, "args": { "External id": 38303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38303, "pid": 5, "tid": 7, "ts": 1716454218094752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040056, "dur": 5, "args": { "External id": 38303, "cbid": 211, "correlation": 38303 } }, { "ph": "s", "id": 38303, "pid": 76337, "tid": -914061504, "ts": 1716454218040056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094759, "dur": 6, "args": { "External id": 38309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38309, "pid": 5, "tid": 7, "ts": 1716454218094759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040084, "dur": 9, "args": { "External id": 38309, "cbid": 211, "correlation": 38309 } }, { "ph": "s", "id": 38309, "pid": 76337, "tid": -914061504, "ts": 1716454218040084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218040143, "dur": 0, "args": { "External id": 38319, "cbid": 317, "correlation": 38319 } }, { "ph": "f", "id": 38319, "pid": 76337, "tid": -914061504, "ts": 1716454218040143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218040144, "dur": 0, "args": { "External id": 38320, "cbid": 203, "correlation": 38320 } }, { "ph": "f", "id": 38320, "pid": 76337, "tid": -914061504, "ts": 1716454218040144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218040145, "dur": 0, "args": { "External id": 38321, "cbid": 205, "correlation": 38321 } }, { "ph": "f", "id": 38321, "pid": 76337, "tid": -914061504, "ts": 1716454218040145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094767, "dur": 8, "args": { "External id": 38325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38325, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38325, "pid": 5, "tid": 7, "ts": 1716454218094767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040160, "dur": 11, "args": { "External id": 38325, "cbid": 211, "correlation": 38325 } }, { "ph": "s", "id": 38325, "pid": 76337, "tid": -914061504, "ts": 1716454218040160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218094776, "dur": 3, "args": { "External id": 38327, "device": 5, "context": 1, "stream": 7, "correlation": 38327, "bytes": 4800, "memory bandwidth (GB/s)": 1.3043478260869565 } }, { "ph": "f", "id": 38327, "pid": 5, "tid": 7, "ts": 1716454218094776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218040178, "dur": 15, "args": { "External id": 38327, "cbid": 51, "correlation": 38327 } }, { "ph": "s", "id": 38327, "pid": 76337, "tid": -914061504, "ts": 1716454218040178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218094780, "dur": 97, "args": { "External id": 38328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38328, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 38328, "pid": 5, "tid": 7, "ts": 1716454218094780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040194, "dur": 7, "args": { "External id": 38328, "cbid": 211, "correlation": 38328 } }, { "ph": "s", "id": 38328, "pid": 76337, "tid": -914061504, "ts": 1716454218040194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094878, "dur": 6, "args": { "External id": 38330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38330, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38330, "pid": 5, "tid": 7, "ts": 1716454218094878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040205, "dur": 5, "args": { "External id": 38330, "cbid": 211, "correlation": 38330 } }, { "ph": "s", "id": 38330, "pid": 76337, "tid": -914061504, "ts": 1716454218040205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094885, "dur": 6, "args": { "External id": 38336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38336, "pid": 5, "tid": 7, "ts": 1716454218094885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040232, "dur": 8, "args": { "External id": 38336, "cbid": 211, "correlation": 38336 } }, { "ph": "s", "id": 38336, "pid": 76337, "tid": -914061504, "ts": 1716454218040232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218094892, "dur": 5, "args": { "External id": 38344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38344, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38344, "pid": 5, "tid": 7, "ts": 1716454218094892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040261, "dur": 7, "args": { "External id": 38344, "cbid": 211, "correlation": 38344 } }, { "ph": "s", "id": 38344, "pid": 76337, "tid": -914061504, "ts": 1716454218040261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218094898, "dur": 4, "args": { "External id": 38352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38352, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38352, "pid": 5, "tid": 7, "ts": 1716454218094898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040289, "dur": 8, "args": { "External id": 38352, "cbid": 211, "correlation": 38352 } }, { "ph": "s", "id": 38352, "pid": 76337, "tid": -914061504, "ts": 1716454218040289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218094904, "dur": 11, "args": { "External id": 38361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38361, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38361, "pid": 5, "tid": 7, "ts": 1716454218094904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040333, "dur": 10, "args": { "External id": 38361, "cbid": 211, "correlation": 38361 } }, { "ph": "s", "id": 38361, "pid": 76337, "tid": -914061504, "ts": 1716454218040333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218094916, "dur": 12, "args": { "External id": 38381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38381, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38381, "pid": 5, "tid": 7, "ts": 1716454218094916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040402, "dur": 12, "args": { "External id": 38381, "cbid": 211, "correlation": 38381 } }, { "ph": "s", "id": 38381, "pid": 76337, "tid": -914061504, "ts": 1716454218040402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218094929, "dur": 4, "args": { "External id": 38393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38393, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38393, "pid": 5, "tid": 7, "ts": 1716454218094929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040424, "dur": 6, "args": { "External id": 38393, "cbid": 211, "correlation": 38393 } }, { "ph": "s", "id": 38393, "pid": 76337, "tid": -914061504, "ts": 1716454218040424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218094934, "dur": 11, "args": { "External id": 38396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38396, "pid": 5, "tid": 7, "ts": 1716454218094934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040441, "dur": 6, "args": { "External id": 38396, "cbid": 211, "correlation": 38396 } }, { "ph": "s", "id": 38396, "pid": 76337, "tid": -914061504, "ts": 1716454218040441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218094946, "dur": 6, "args": { "External id": 38405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38405, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38405, "pid": 5, "tid": 7, "ts": 1716454218094946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040479, "dur": 10, "args": { "External id": 38405, "cbid": 211, "correlation": 38405 } }, { "ph": "s", "id": 38405, "pid": 76337, "tid": -914061504, "ts": 1716454218040479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218040531, "dur": 0, "args": { "External id": 38415, "cbid": 317, "correlation": 38415 } }, { "ph": "f", "id": 38415, "pid": 76337, "tid": -914061504, "ts": 1716454218040531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218040532, "dur": 0, "args": { "External id": 38416, "cbid": 203, "correlation": 38416 } }, { "ph": "f", "id": 38416, "pid": 76337, "tid": -914061504, "ts": 1716454218040532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218040532, "dur": 0, "args": { "External id": 38417, "cbid": 205, "correlation": 38417 } }, { "ph": "f", "id": 38417, "pid": 76337, "tid": -914061504, "ts": 1716454218040532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094953, "dur": 6, "args": { "External id": 38421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38421, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38421, "pid": 5, "tid": 7, "ts": 1716454218094953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040546, "dur": 11, "args": { "External id": 38421, "cbid": 211, "correlation": 38421 } }, { "ph": "s", "id": 38421, "pid": 76337, "tid": -914061504, "ts": 1716454218040546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218094961, "dur": 313, "args": { "External id": 38423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38423, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38423, "pid": 5, "tid": 7, "ts": 1716454218094961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040559, "dur": 6, "args": { "External id": 38423, "cbid": 211, "correlation": 38423 } }, { "ph": "s", "id": 38423, "pid": 76337, "tid": -914061504, "ts": 1716454218040559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218095276, "dur": 1, "args": { "External id": 38425, "device": 5, "context": 1, "stream": 7, "correlation": 38425, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 38425, "pid": 5, "tid": 7, "ts": 1716454218095276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218040571, "dur": 6, "args": { "External id": 38425, "cbid": 51, "correlation": 38425 } }, { "ph": "s", "id": 38425, "pid": 76337, "tid": -914061504, "ts": 1716454218040571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218095280, "dur": 486, "args": { "External id": 38426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38426, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38426, "pid": 5, "tid": 7, "ts": 1716454218095280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040578, "dur": 6, "args": { "External id": 38426, "cbid": 211, "correlation": 38426 } }, { "ph": "s", "id": 38426, "pid": 76337, "tid": -914061504, "ts": 1716454218040578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218095767, "dur": 6, "args": { "External id": 38428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38428, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38428, "pid": 5, "tid": 7, "ts": 1716454218095767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040588, "dur": 5, "args": { "External id": 38428, "cbid": 211, "correlation": 38428 } }, { "ph": "s", "id": 38428, "pid": 76337, "tid": -914061504, "ts": 1716454218040588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218095774, "dur": 6, "args": { "External id": 38434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38434, "pid": 5, "tid": 7, "ts": 1716454218095774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040616, "dur": 10, "args": { "External id": 38434, "cbid": 211, "correlation": 38434 } }, { "ph": "s", "id": 38434, "pid": 76337, "tid": -914061504, "ts": 1716454218040616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218095781, "dur": 3, "args": { "External id": 38442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38442, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 38442, "pid": 5, "tid": 7, "ts": 1716454218095781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040660, "dur": 9, "args": { "External id": 38442, "cbid": 211, "correlation": 38442 } }, { "ph": "s", "id": 38442, "pid": 76337, "tid": -914061504, "ts": 1716454218040660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218040724, "dur": 1, "args": { "External id": 38458, "cbid": 251, "correlation": 38458 } }, { "ph": "f", "id": 38458, "pid": 76337, "tid": -914061504, "ts": 1716454218040724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218040729, "dur": 0, "args": { "External id": 38460, "cbid": 251, "correlation": 38460 } }, { "ph": "f", "id": 38460, "pid": 76337, "tid": -914061504, "ts": 1716454218040729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218095786, "dur": 12, "args": { "External id": 38461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38461, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38461, "pid": 5, "tid": 7, "ts": 1716454218095786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040731, "dur": 11, "args": { "External id": 38461, "cbid": 211, "correlation": 38461 } }, { "ph": "s", "id": 38461, "pid": 76337, "tid": -914061504, "ts": 1716454218040731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218095799, "dur": 5, "args": { "External id": 38463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38463, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38463, "pid": 5, "tid": 7, "ts": 1716454218095799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040744, "dur": 5, "args": { "External id": 38463, "cbid": 211, "correlation": 38463 } }, { "ph": "s", "id": 38463, "pid": 76337, "tid": -914061504, "ts": 1716454218040744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218095806, "dur": 6, "args": { "External id": 38473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38473, "pid": 5, "tid": 7, "ts": 1716454218095806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040801, "dur": 12, "args": { "External id": 38473, "cbid": 211, "correlation": 38473 } }, { "ph": "s", "id": 38473, "pid": 76337, "tid": -914061504, "ts": 1716454218040801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218095813, "dur": 9, "args": { "External id": 38493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38493, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38493, "pid": 5, "tid": 7, "ts": 1716454218095813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040867, "dur": 11, "args": { "External id": 38493, "cbid": 211, "correlation": 38493 } }, { "ph": "s", "id": 38493, "pid": 76337, "tid": -914061504, "ts": 1716454218040867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218095823, "dur": 4, "args": { "External id": 38505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38505, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 38505, "pid": 5, "tid": 7, "ts": 1716454218095823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040889, "dur": 6, "args": { "External id": 38505, "cbid": 211, "correlation": 38505 } }, { "ph": "s", "id": 38505, "pid": 76337, "tid": -914061504, "ts": 1716454218040889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218095828, "dur": 6, "args": { "External id": 38508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38508, "pid": 5, "tid": 7, "ts": 1716454218095828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040907, "dur": 6, "args": { "External id": 38508, "cbid": 211, "correlation": 38508 } }, { "ph": "s", "id": 38508, "pid": 76337, "tid": -914061504, "ts": 1716454218040907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218095836, "dur": 4, "args": { "External id": 38517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38517, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38517, "pid": 5, "tid": 7, "ts": 1716454218095836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218040948, "dur": 10, "args": { "External id": 38517, "cbid": 211, "correlation": 38517 } }, { "ph": "s", "id": 38517, "pid": 76337, "tid": -914061504, "ts": 1716454218040948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218041019, "dur": 0, "args": { "External id": 38527, "cbid": 317, "correlation": 38527 } }, { "ph": "f", "id": 38527, "pid": 76337, "tid": -914061504, "ts": 1716454218041019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218041020, "dur": 0, "args": { "External id": 38528, "cbid": 203, "correlation": 38528 } }, { "ph": "f", "id": 38528, "pid": 76337, "tid": -914061504, "ts": 1716454218041020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218041020, "dur": 0, "args": { "External id": 38529, "cbid": 205, "correlation": 38529 } }, { "ph": "f", "id": 38529, "pid": 76337, "tid": -914061504, "ts": 1716454218041020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218095842, "dur": 5, "args": { "External id": 38533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38533, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38533, "pid": 5, "tid": 7, "ts": 1716454218095842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041034, "dur": 13, "args": { "External id": 38533, "cbid": 211, "correlation": 38533 } }, { "ph": "s", "id": 38533, "pid": 76337, "tid": -914061504, "ts": 1716454218041034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218095848, "dur": 157, "args": { "External id": 38535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38535, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38535, "pid": 5, "tid": 7, "ts": 1716454218095848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041050, "dur": 6, "args": { "External id": 38535, "cbid": 211, "correlation": 38535 } }, { "ph": "s", "id": 38535, "pid": 76337, "tid": -914061504, "ts": 1716454218041050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218096008, "dur": 1, "args": { "External id": 38537, "device": 5, "context": 1, "stream": 7, "correlation": 38537, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 38537, "pid": 5, "tid": 7, "ts": 1716454218096008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218041061, "dur": 6, "args": { "External id": 38537, "cbid": 51, "correlation": 38537 } }, { "ph": "s", "id": 38537, "pid": 76337, "tid": -914061504, "ts": 1716454218041061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218096011, "dur": 252, "args": { "External id": 38538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38538, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38538, "pid": 5, "tid": 7, "ts": 1716454218096011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041068, "dur": 6, "args": { "External id": 38538, "cbid": 211, "correlation": 38538 } }, { "ph": "s", "id": 38538, "pid": 76337, "tid": -914061504, "ts": 1716454218041068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218096265, "dur": 6, "args": { "External id": 38540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38540, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38540, "pid": 5, "tid": 7, "ts": 1716454218096265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041078, "dur": 5, "args": { "External id": 38540, "cbid": 211, "correlation": 38540 } }, { "ph": "s", "id": 38540, "pid": 76337, "tid": -914061504, "ts": 1716454218041078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218096271, "dur": 6, "args": { "External id": 38546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38546, "pid": 5, "tid": 7, "ts": 1716454218096271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041107, "dur": 9, "args": { "External id": 38546, "cbid": 211, "correlation": 38546 } }, { "ph": "s", "id": 38546, "pid": 76337, "tid": -914061504, "ts": 1716454218041107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218041165, "dur": 0, "args": { "External id": 38556, "cbid": 317, "correlation": 38556 } }, { "ph": "f", "id": 38556, "pid": 76337, "tid": -914061504, "ts": 1716454218041165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218041166, "dur": 0, "args": { "External id": 38557, "cbid": 203, "correlation": 38557 } }, { "ph": "f", "id": 38557, "pid": 76337, "tid": -914061504, "ts": 1716454218041166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218041167, "dur": 0, "args": { "External id": 38558, "cbid": 205, "correlation": 38558 } }, { "ph": "f", "id": 38558, "pid": 76337, "tid": -914061504, "ts": 1716454218041167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218096279, "dur": 7, "args": { "External id": 38562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38562, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38562, "pid": 5, "tid": 7, "ts": 1716454218096279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041178, "dur": 11, "args": { "External id": 38562, "cbid": 211, "correlation": 38562 } }, { "ph": "s", "id": 38562, "pid": 76337, "tid": -914061504, "ts": 1716454218041178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218096287, "dur": 3, "args": { "External id": 38564, "device": 5, "context": 1, "stream": 7, "correlation": 38564, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 38564, "pid": 5, "tid": 7, "ts": 1716454218096287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218041194, "dur": 10, "args": { "External id": 38564, "cbid": 51, "correlation": 38564 } }, { "ph": "s", "id": 38564, "pid": 76337, "tid": -914061504, "ts": 1716454218041194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218096291, "dur": 95, "args": { "External id": 38565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38565, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 38565, "pid": 5, "tid": 7, "ts": 1716454218096291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041206, "dur": 6, "args": { "External id": 38565, "cbid": 211, "correlation": 38565 } }, { "ph": "s", "id": 38565, "pid": 76337, "tid": -914061504, "ts": 1716454218041206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218096388, "dur": 5, "args": { "External id": 38567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38567, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38567, "pid": 5, "tid": 7, "ts": 1716454218096388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041215, "dur": 5, "args": { "External id": 38567, "cbid": 211, "correlation": 38567 } }, { "ph": "s", "id": 38567, "pid": 76337, "tid": -914061504, "ts": 1716454218041215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218096395, "dur": 6, "args": { "External id": 38573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38573, "pid": 5, "tid": 7, "ts": 1716454218096395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041241, "dur": 9, "args": { "External id": 38573, "cbid": 211, "correlation": 38573 } }, { "ph": "s", "id": 38573, "pid": 76337, "tid": -914061504, "ts": 1716454218041241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218096402, "dur": 5, "args": { "External id": 38581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38581, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38581, "pid": 5, "tid": 7, "ts": 1716454218096402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041271, "dur": 7, "args": { "External id": 38581, "cbid": 211, "correlation": 38581 } }, { "ph": "s", "id": 38581, "pid": 76337, "tid": -914061504, "ts": 1716454218041271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218096408, "dur": 5, "args": { "External id": 38589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38589, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38589, "pid": 5, "tid": 7, "ts": 1716454218096408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041298, "dur": 9, "args": { "External id": 38589, "cbid": 211, "correlation": 38589 } }, { "ph": "s", "id": 38589, "pid": 76337, "tid": -914061504, "ts": 1716454218041298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218096414, "dur": 11, "args": { "External id": 38598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38598, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38598, "pid": 5, "tid": 7, "ts": 1716454218096414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041342, "dur": 11, "args": { "External id": 38598, "cbid": 211, "correlation": 38598 } }, { "ph": "s", "id": 38598, "pid": 76337, "tid": -914061504, "ts": 1716454218041342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218096426, "dur": 12, "args": { "External id": 38618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38618, "pid": 5, "tid": 7, "ts": 1716454218096426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041413, "dur": 11, "args": { "External id": 38618, "cbid": 211, "correlation": 38618 } }, { "ph": "s", "id": 38618, "pid": 76337, "tid": -914061504, "ts": 1716454218041413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218096439, "dur": 4, "args": { "External id": 38630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38630, "pid": 5, "tid": 7, "ts": 1716454218096439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041433, "dur": 6, "args": { "External id": 38630, "cbid": 211, "correlation": 38630 } }, { "ph": "s", "id": 38630, "pid": 76337, "tid": -914061504, "ts": 1716454218041433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218096445, "dur": 11, "args": { "External id": 38633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38633, "pid": 5, "tid": 7, "ts": 1716454218096445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041451, "dur": 7, "args": { "External id": 38633, "cbid": 211, "correlation": 38633 } }, { "ph": "s", "id": 38633, "pid": 76337, "tid": -914061504, "ts": 1716454218041451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218096457, "dur": 6, "args": { "External id": 38642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38642, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38642, "pid": 5, "tid": 7, "ts": 1716454218096457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041490, "dur": 9, "args": { "External id": 38642, "cbid": 211, "correlation": 38642 } }, { "ph": "s", "id": 38642, "pid": 76337, "tid": -914061504, "ts": 1716454218041490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218041541, "dur": 0, "args": { "External id": 38652, "cbid": 317, "correlation": 38652 } }, { "ph": "f", "id": 38652, "pid": 76337, "tid": -914061504, "ts": 1716454218041541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218041542, "dur": 0, "args": { "External id": 38653, "cbid": 203, "correlation": 38653 } }, { "ph": "f", "id": 38653, "pid": 76337, "tid": -914061504, "ts": 1716454218041542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218041542, "dur": 0, "args": { "External id": 38654, "cbid": 205, "correlation": 38654 } }, { "ph": "f", "id": 38654, "pid": 76337, "tid": -914061504, "ts": 1716454218041542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218096464, "dur": 6, "args": { "External id": 38658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38658, "pid": 5, "tid": 7, "ts": 1716454218096464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041556, "dur": 12, "args": { "External id": 38658, "cbid": 211, "correlation": 38658 } }, { "ph": "s", "id": 38658, "pid": 76337, "tid": -914061504, "ts": 1716454218041556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218096472, "dur": 312, "args": { "External id": 38660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38660, "pid": 5, "tid": 7, "ts": 1716454218096472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041570, "dur": 5, "args": { "External id": 38660, "cbid": 211, "correlation": 38660 } }, { "ph": "s", "id": 38660, "pid": 76337, "tid": -914061504, "ts": 1716454218041570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218096786, "dur": 1, "args": { "External id": 38662, "device": 5, "context": 1, "stream": 7, "correlation": 38662, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 38662, "pid": 5, "tid": 7, "ts": 1716454218096786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218041580, "dur": 6, "args": { "External id": 38662, "cbid": 51, "correlation": 38662 } }, { "ph": "s", "id": 38662, "pid": 76337, "tid": -914061504, "ts": 1716454218041580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218096789, "dur": 486, "args": { "External id": 38663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38663, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38663, "pid": 5, "tid": 7, "ts": 1716454218096789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041588, "dur": 6, "args": { "External id": 38663, "cbid": 211, "correlation": 38663 } }, { "ph": "s", "id": 38663, "pid": 76337, "tid": -914061504, "ts": 1716454218041588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097277, "dur": 5, "args": { "External id": 38665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38665, "pid": 5, "tid": 7, "ts": 1716454218097277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041597, "dur": 5, "args": { "External id": 38665, "cbid": 211, "correlation": 38665 } }, { "ph": "s", "id": 38665, "pid": 76337, "tid": -914061504, "ts": 1716454218041597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218097283, "dur": 6, "args": { "External id": 38671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38671, "pid": 5, "tid": 7, "ts": 1716454218097283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041626, "dur": 8, "args": { "External id": 38671, "cbid": 211, "correlation": 38671 } }, { "ph": "s", "id": 38671, "pid": 76337, "tid": -914061504, "ts": 1716454218041626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218097291, "dur": 3, "args": { "External id": 38679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38679, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 38679, "pid": 5, "tid": 7, "ts": 1716454218097291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041669, "dur": 10, "args": { "External id": 38679, "cbid": 211, "correlation": 38679 } }, { "ph": "s", "id": 38679, "pid": 76337, "tid": -914061504, "ts": 1716454218041669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218041731, "dur": 1, "args": { "External id": 38695, "cbid": 251, "correlation": 38695 } }, { "ph": "f", "id": 38695, "pid": 76337, "tid": -914061504, "ts": 1716454218041731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218041737, "dur": 0, "args": { "External id": 38697, "cbid": 251, "correlation": 38697 } }, { "ph": "f", "id": 38697, "pid": 76337, "tid": -914061504, "ts": 1716454218041737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218097295, "dur": 13, "args": { "External id": 38698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38698, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38698, "pid": 5, "tid": 7, "ts": 1716454218097295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041739, "dur": 11, "args": { "External id": 38698, "cbid": 211, "correlation": 38698 } }, { "ph": "s", "id": 38698, "pid": 76337, "tid": -914061504, "ts": 1716454218041739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218097309, "dur": 5, "args": { "External id": 38700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38700, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38700, "pid": 5, "tid": 7, "ts": 1716454218097309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041751, "dur": 5, "args": { "External id": 38700, "cbid": 211, "correlation": 38700 } }, { "ph": "s", "id": 38700, "pid": 76337, "tid": -914061504, "ts": 1716454218041751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218097315, "dur": 5, "args": { "External id": 38710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38710, "pid": 5, "tid": 7, "ts": 1716454218097315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041807, "dur": 12, "args": { "External id": 38710, "cbid": 211, "correlation": 38710 } }, { "ph": "s", "id": 38710, "pid": 76337, "tid": -914061504, "ts": 1716454218041807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218097322, "dur": 9, "args": { "External id": 38730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38730, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38730, "pid": 5, "tid": 7, "ts": 1716454218097322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041873, "dur": 11, "args": { "External id": 38730, "cbid": 211, "correlation": 38730 } }, { "ph": "s", "id": 38730, "pid": 76337, "tid": -914061504, "ts": 1716454218041873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218097333, "dur": 3, "args": { "External id": 38742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38742, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 38742, "pid": 5, "tid": 7, "ts": 1716454218097333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041894, "dur": 6, "args": { "External id": 38742, "cbid": 211, "correlation": 38742 } }, { "ph": "s", "id": 38742, "pid": 76337, "tid": -914061504, "ts": 1716454218041894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218097337, "dur": 6, "args": { "External id": 38745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38745, "pid": 5, "tid": 7, "ts": 1716454218097337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041912, "dur": 6, "args": { "External id": 38745, "cbid": 211, "correlation": 38745 } }, { "ph": "s", "id": 38745, "pid": 76337, "tid": -914061504, "ts": 1716454218041912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218097345, "dur": 4, "args": { "External id": 38754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38754, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38754, "pid": 5, "tid": 7, "ts": 1716454218097345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218041952, "dur": 10, "args": { "External id": 38754, "cbid": 211, "correlation": 38754 } }, { "ph": "s", "id": 38754, "pid": 76337, "tid": -914061504, "ts": 1716454218041952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218042023, "dur": 0, "args": { "External id": 38764, "cbid": 317, "correlation": 38764 } }, { "ph": "f", "id": 38764, "pid": 76337, "tid": -914061504, "ts": 1716454218042023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218042024, "dur": 0, "args": { "External id": 38765, "cbid": 203, "correlation": 38765 } }, { "ph": "f", "id": 38765, "pid": 76337, "tid": -914061504, "ts": 1716454218042024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218042025, "dur": 0, "args": { "External id": 38766, "cbid": 205, "correlation": 38766 } }, { "ph": "f", "id": 38766, "pid": 76337, "tid": -914061504, "ts": 1716454218042025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097351, "dur": 5, "args": { "External id": 38770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38770, "pid": 5, "tid": 7, "ts": 1716454218097351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042038, "dur": 13, "args": { "External id": 38770, "cbid": 211, "correlation": 38770 } }, { "ph": "s", "id": 38770, "pid": 76337, "tid": -914061504, "ts": 1716454218042038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097357, "dur": 158, "args": { "External id": 38772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38772, "pid": 5, "tid": 7, "ts": 1716454218097357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042054, "dur": 5, "args": { "External id": 38772, "cbid": 211, "correlation": 38772 } }, { "ph": "s", "id": 38772, "pid": 76337, "tid": -914061504, "ts": 1716454218042054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218097517, "dur": 1, "args": { "External id": 38774, "device": 5, "context": 1, "stream": 7, "correlation": 38774, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 38774, "pid": 5, "tid": 7, "ts": 1716454218097517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218042065, "dur": 6, "args": { "External id": 38774, "cbid": 51, "correlation": 38774 } }, { "ph": "s", "id": 38774, "pid": 76337, "tid": -914061504, "ts": 1716454218042065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218097521, "dur": 252, "args": { "External id": 38775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38775, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38775, "pid": 5, "tid": 7, "ts": 1716454218097521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042072, "dur": 6, "args": { "External id": 38775, "cbid": 211, "correlation": 38775 } }, { "ph": "s", "id": 38775, "pid": 76337, "tid": -914061504, "ts": 1716454218042072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097774, "dur": 6, "args": { "External id": 38777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38777, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38777, "pid": 5, "tid": 7, "ts": 1716454218097774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042082, "dur": 5, "args": { "External id": 38777, "cbid": 211, "correlation": 38777 } }, { "ph": "s", "id": 38777, "pid": 76337, "tid": -914061504, "ts": 1716454218042082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218097781, "dur": 6, "args": { "External id": 38783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38783, "pid": 5, "tid": 7, "ts": 1716454218097781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042111, "dur": 8, "args": { "External id": 38783, "cbid": 211, "correlation": 38783 } }, { "ph": "s", "id": 38783, "pid": 76337, "tid": -914061504, "ts": 1716454218042111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218042170, "dur": 0, "args": { "External id": 38793, "cbid": 317, "correlation": 38793 } }, { "ph": "f", "id": 38793, "pid": 76337, "tid": -914061504, "ts": 1716454218042170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218042170, "dur": 0, "args": { "External id": 38794, "cbid": 203, "correlation": 38794 } }, { "ph": "f", "id": 38794, "pid": 76337, "tid": -914061504, "ts": 1716454218042170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218042171, "dur": 0, "args": { "External id": 38795, "cbid": 205, "correlation": 38795 } }, { "ph": "f", "id": 38795, "pid": 76337, "tid": -914061504, "ts": 1716454218042171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097789, "dur": 8, "args": { "External id": 38799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38799, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38799, "pid": 5, "tid": 7, "ts": 1716454218097789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042183, "dur": 11, "args": { "External id": 38799, "cbid": 211, "correlation": 38799 } }, { "ph": "s", "id": 38799, "pid": 76337, "tid": -914061504, "ts": 1716454218042183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218097798, "dur": 3, "args": { "External id": 38801, "device": 5, "context": 1, "stream": 7, "correlation": 38801, "bytes": 4800, "memory bandwidth (GB/s)": 1.3157894736842106 } }, { "ph": "f", "id": 38801, "pid": 5, "tid": 7, "ts": 1716454218097798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218042199, "dur": 10, "args": { "External id": 38801, "cbid": 51, "correlation": 38801 } }, { "ph": "s", "id": 38801, "pid": 76337, "tid": -914061504, "ts": 1716454218042199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218097802, "dur": 94, "args": { "External id": 38802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38802, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 38802, "pid": 5, "tid": 7, "ts": 1716454218097802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042210, "dur": 6, "args": { "External id": 38802, "cbid": 211, "correlation": 38802 } }, { "ph": "s", "id": 38802, "pid": 76337, "tid": -914061504, "ts": 1716454218042210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097898, "dur": 5, "args": { "External id": 38804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38804, "pid": 5, "tid": 7, "ts": 1716454218097898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042220, "dur": 5, "args": { "External id": 38804, "cbid": 211, "correlation": 38804 } }, { "ph": "s", "id": 38804, "pid": 76337, "tid": -914061504, "ts": 1716454218042220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218097905, "dur": 6, "args": { "External id": 38810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38810, "pid": 5, "tid": 7, "ts": 1716454218097905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042246, "dur": 9, "args": { "External id": 38810, "cbid": 211, "correlation": 38810 } }, { "ph": "s", "id": 38810, "pid": 76337, "tid": -914061504, "ts": 1716454218042246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218097912, "dur": 5, "args": { "External id": 38818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38818, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38818, "pid": 5, "tid": 7, "ts": 1716454218097912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042276, "dur": 8, "args": { "External id": 38818, "cbid": 211, "correlation": 38818 } }, { "ph": "s", "id": 38818, "pid": 76337, "tid": -914061504, "ts": 1716454218042276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218097918, "dur": 4, "args": { "External id": 38826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38826, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 38826, "pid": 5, "tid": 7, "ts": 1716454218097918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042305, "dur": 8, "args": { "External id": 38826, "cbid": 211, "correlation": 38826 } }, { "ph": "s", "id": 38826, "pid": 76337, "tid": -914061504, "ts": 1716454218042305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218097924, "dur": 14, "args": { "External id": 38837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38837, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38837, "pid": 5, "tid": 7, "ts": 1716454218097924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042385, "dur": 14, "args": { "External id": 38837, "cbid": 211, "correlation": 38837 } }, { "ph": "s", "id": 38837, "pid": 76337, "tid": -914061504, "ts": 1716454218042385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218042441, "dur": 0, "args": { "External id": 38847, "cbid": 317, "correlation": 38847 } }, { "ph": "f", "id": 38847, "pid": 76337, "tid": -914061504, "ts": 1716454218042441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218042442, "dur": 0, "args": { "External id": 38848, "cbid": 203, "correlation": 38848 } }, { "ph": "f", "id": 38848, "pid": 76337, "tid": -914061504, "ts": 1716454218042442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218042443, "dur": 0, "args": { "External id": 38849, "cbid": 205, "correlation": 38849 } }, { "ph": "f", "id": 38849, "pid": 76337, "tid": -914061504, "ts": 1716454218042443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097939, "dur": 8, "args": { "External id": 38853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38853, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38853, "pid": 5, "tid": 7, "ts": 1716454218097939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042456, "dur": 11, "args": { "External id": 38853, "cbid": 211, "correlation": 38853 } }, { "ph": "s", "id": 38853, "pid": 76337, "tid": -914061504, "ts": 1716454218042456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218097948, "dur": 158, "args": { "External id": 38855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38855, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38855, "pid": 5, "tid": 7, "ts": 1716454218097948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042470, "dur": 5, "args": { "External id": 38855, "cbid": 211, "correlation": 38855 } }, { "ph": "s", "id": 38855, "pid": 76337, "tid": -914061504, "ts": 1716454218042470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218098109, "dur": 1, "args": { "External id": 38857, "device": 5, "context": 1, "stream": 7, "correlation": 38857, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 38857, "pid": 5, "tid": 7, "ts": 1716454218098109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218042480, "dur": 6, "args": { "External id": 38857, "cbid": 51, "correlation": 38857 } }, { "ph": "s", "id": 38857, "pid": 76337, "tid": -914061504, "ts": 1716454218042480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218098112, "dur": 639, "args": { "External id": 38858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38858, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38858, "pid": 5, "tid": 7, "ts": 1716454218098112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042488, "dur": 6, "args": { "External id": 38858, "cbid": 211, "correlation": 38858 } }, { "ph": "s", "id": 38858, "pid": 76337, "tid": -914061504, "ts": 1716454218042488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218098752, "dur": 13, "args": { "External id": 38860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38860, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38860, "pid": 5, "tid": 7, "ts": 1716454218098752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042497, "dur": 5, "args": { "External id": 38860, "cbid": 211, "correlation": 38860 } }, { "ph": "s", "id": 38860, "pid": 76337, "tid": -914061504, "ts": 1716454218042497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218098766, "dur": 15, "args": { "External id": 38866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38866, "pid": 5, "tid": 7, "ts": 1716454218098766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042526, "dur": 8, "args": { "External id": 38866, "cbid": 211, "correlation": 38866 } }, { "ph": "s", "id": 38866, "pid": 76337, "tid": -914061504, "ts": 1716454218042526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218098782, "dur": 30, "args": { "External id": 38875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38875, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38875, "pid": 5, "tid": 7, "ts": 1716454218098782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042615, "dur": 13, "args": { "External id": 38875, "cbid": 211, "correlation": 38875 } }, { "ph": "s", "id": 38875, "pid": 76337, "tid": -914061504, "ts": 1716454218042615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218098813, "dur": 30, "args": { "External id": 38895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38895, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 38895, "pid": 5, "tid": 7, "ts": 1716454218098813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042683, "dur": 11, "args": { "External id": 38895, "cbid": 211, "correlation": 38895 } }, { "ph": "s", "id": 38895, "pid": 76337, "tid": -914061504, "ts": 1716454218042683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218098844, "dur": 4, "args": { "External id": 38907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38907, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38907, "pid": 5, "tid": 7, "ts": 1716454218098844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042703, "dur": 6, "args": { "External id": 38907, "cbid": 211, "correlation": 38907 } }, { "ph": "s", "id": 38907, "pid": 76337, "tid": -914061504, "ts": 1716454218042703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218098850, "dur": 29, "args": { "External id": 38910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38910, "pid": 5, "tid": 7, "ts": 1716454218098850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042722, "dur": 7, "args": { "External id": 38910, "cbid": 211, "correlation": 38910 } }, { "ph": "s", "id": 38910, "pid": 76337, "tid": -914061504, "ts": 1716454218042722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218098881, "dur": 20, "args": { "External id": 38919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38919, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38919, "pid": 5, "tid": 7, "ts": 1716454218098881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042760, "dur": 10, "args": { "External id": 38919, "cbid": 211, "correlation": 38919 } }, { "ph": "s", "id": 38919, "pid": 76337, "tid": -914061504, "ts": 1716454218042760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218042811, "dur": 0, "args": { "External id": 38929, "cbid": 317, "correlation": 38929 } }, { "ph": "f", "id": 38929, "pid": 76337, "tid": -914061504, "ts": 1716454218042811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218042812, "dur": 0, "args": { "External id": 38930, "cbid": 203, "correlation": 38930 } }, { "ph": "f", "id": 38930, "pid": 76337, "tid": -914061504, "ts": 1716454218042812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218042813, "dur": 0, "args": { "External id": 38931, "cbid": 205, "correlation": 38931 } }, { "ph": "f", "id": 38931, "pid": 76337, "tid": -914061504, "ts": 1716454218042813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218098902, "dur": 21, "args": { "External id": 38935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38935, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38935, "pid": 5, "tid": 7, "ts": 1716454218098902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042828, "dur": 12, "args": { "External id": 38935, "cbid": 211, "correlation": 38935 } }, { "ph": "s", "id": 38935, "pid": 76337, "tid": -914061504, "ts": 1716454218042828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218098925, "dur": 312, "args": { "External id": 38937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38937, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38937, "pid": 5, "tid": 7, "ts": 1716454218098925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042842, "dur": 5, "args": { "External id": 38937, "cbid": 211, "correlation": 38937 } }, { "ph": "s", "id": 38937, "pid": 76337, "tid": -914061504, "ts": 1716454218042842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218099239, "dur": 1, "args": { "External id": 38939, "device": 5, "context": 1, "stream": 7, "correlation": 38939, "bytes": 960, "memory bandwidth (GB/s)": 0.5 } }, { "ph": "f", "id": 38939, "pid": 5, "tid": 7, "ts": 1716454218099239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218042853, "dur": 6, "args": { "External id": 38939, "cbid": 51, "correlation": 38939 } }, { "ph": "s", "id": 38939, "pid": 76337, "tid": -914061504, "ts": 1716454218042853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218099243, "dur": 1220, "args": { "External id": 38940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38940, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38940, "pid": 5, "tid": 7, "ts": 1716454218099243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042860, "dur": 6, "args": { "External id": 38940, "cbid": 211, "correlation": 38940 } }, { "ph": "s", "id": 38940, "pid": 76337, "tid": -914061504, "ts": 1716454218042860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218100464, "dur": 12, "args": { "External id": 38942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38942, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38942, "pid": 5, "tid": 7, "ts": 1716454218100464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042870, "dur": 5, "args": { "External id": 38942, "cbid": 211, "correlation": 38942 } }, { "ph": "s", "id": 38942, "pid": 76337, "tid": -914061504, "ts": 1716454218042870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218100478, "dur": 15, "args": { "External id": 38948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38948, "pid": 5, "tid": 7, "ts": 1716454218100478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042898, "dur": 8, "args": { "External id": 38948, "cbid": 211, "correlation": 38948 } }, { "ph": "s", "id": 38948, "pid": 76337, "tid": -914061504, "ts": 1716454218042898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218100494, "dur": 3, "args": { "External id": 38956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38956, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 38956, "pid": 5, "tid": 7, "ts": 1716454218100494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218042943, "dur": 9, "args": { "External id": 38956, "cbid": 211, "correlation": 38956 } }, { "ph": "s", "id": 38956, "pid": 76337, "tid": -914061504, "ts": 1716454218042943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218043014, "dur": 1, "args": { "External id": 38972, "cbid": 251, "correlation": 38972 } }, { "ph": "f", "id": 38972, "pid": 76337, "tid": -914061504, "ts": 1716454218043014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218043020, "dur": 0, "args": { "External id": 38974, "cbid": 251, "correlation": 38974 } }, { "ph": "f", "id": 38974, "pid": 76337, "tid": -914061504, "ts": 1716454218043020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218100499, "dur": 13, "args": { "External id": 38975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38975, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38975, "pid": 5, "tid": 7, "ts": 1716454218100499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043022, "dur": 12, "args": { "External id": 38975, "cbid": 211, "correlation": 38975 } }, { "ph": "s", "id": 38975, "pid": 76337, "tid": -914061504, "ts": 1716454218043022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218100513, "dur": 5, "args": { "External id": 38977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38977, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 38977, "pid": 5, "tid": 7, "ts": 1716454218100513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043036, "dur": 6, "args": { "External id": 38977, "cbid": 211, "correlation": 38977 } }, { "ph": "s", "id": 38977, "pid": 76337, "tid": -914061504, "ts": 1716454218043036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218100519, "dur": 17, "args": { "External id": 38987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 38987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 38987, "pid": 5, "tid": 7, "ts": 1716454218100519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043094, "dur": 12, "args": { "External id": 38987, "cbid": 211, "correlation": 38987 } }, { "ph": "s", "id": 38987, "pid": 76337, "tid": -914061504, "ts": 1716454218043094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218100537, "dur": 17, "args": { "External id": 39007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39007, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 39007, "pid": 5, "tid": 7, "ts": 1716454218100537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043160, "dur": 11, "args": { "External id": 39007, "cbid": 211, "correlation": 39007 } }, { "ph": "s", "id": 39007, "pid": 76337, "tid": -914061504, "ts": 1716454218043160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218100556, "dur": 4, "args": { "External id": 39019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39019, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 39019, "pid": 5, "tid": 7, "ts": 1716454218100556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043180, "dur": 6, "args": { "External id": 39019, "cbid": 211, "correlation": 39019 } }, { "ph": "s", "id": 39019, "pid": 76337, "tid": -914061504, "ts": 1716454218043180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218100561, "dur": 16, "args": { "External id": 39022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39022, "pid": 5, "tid": 7, "ts": 1716454218100561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043198, "dur": 6, "args": { "External id": 39022, "cbid": 211, "correlation": 39022 } }, { "ph": "s", "id": 39022, "pid": 76337, "tid": -914061504, "ts": 1716454218043198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218100578, "dur": 11, "args": { "External id": 39031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39031, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39031, "pid": 5, "tid": 7, "ts": 1716454218100578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043239, "dur": 10, "args": { "External id": 39031, "cbid": 211, "correlation": 39031 } }, { "ph": "s", "id": 39031, "pid": 76337, "tid": -914061504, "ts": 1716454218043239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218043301, "dur": 0, "args": { "External id": 39041, "cbid": 317, "correlation": 39041 } }, { "ph": "f", "id": 39041, "pid": 76337, "tid": -914061504, "ts": 1716454218043301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218043302, "dur": 0, "args": { "External id": 39042, "cbid": 203, "correlation": 39042 } }, { "ph": "f", "id": 39042, "pid": 76337, "tid": -914061504, "ts": 1716454218043302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218043303, "dur": 0, "args": { "External id": 39043, "cbid": 205, "correlation": 39043 } }, { "ph": "f", "id": 39043, "pid": 76337, "tid": -914061504, "ts": 1716454218043303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218100590, "dur": 11, "args": { "External id": 39047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39047, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39047, "pid": 5, "tid": 7, "ts": 1716454218100590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043317, "dur": 12, "args": { "External id": 39047, "cbid": 211, "correlation": 39047 } }, { "ph": "s", "id": 39047, "pid": 76337, "tid": -914061504, "ts": 1716454218043317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218100602, "dur": 159, "args": { "External id": 39049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39049, "pid": 5, "tid": 7, "ts": 1716454218100602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043331, "dur": 5, "args": { "External id": 39049, "cbid": 211, "correlation": 39049 } }, { "ph": "s", "id": 39049, "pid": 76337, "tid": -914061504, "ts": 1716454218043331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218100763, "dur": 1, "args": { "External id": 39051, "device": 5, "context": 1, "stream": 7, "correlation": 39051, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 39051, "pid": 5, "tid": 7, "ts": 1716454218100763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218043342, "dur": 6, "args": { "External id": 39051, "cbid": 51, "correlation": 39051 } }, { "ph": "s", "id": 39051, "pid": 76337, "tid": -914061504, "ts": 1716454218043342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218100767, "dur": 637, "args": { "External id": 39052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39052, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39052, "pid": 5, "tid": 7, "ts": 1716454218100767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043349, "dur": 7, "args": { "External id": 39052, "cbid": 211, "correlation": 39052 } }, { "ph": "s", "id": 39052, "pid": 76337, "tid": -914061504, "ts": 1716454218043349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218101405, "dur": 12, "args": { "External id": 39054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39054, "pid": 5, "tid": 7, "ts": 1716454218101405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043360, "dur": 5, "args": { "External id": 39054, "cbid": 211, "correlation": 39054 } }, { "ph": "s", "id": 39054, "pid": 76337, "tid": -914061504, "ts": 1716454218043360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218101418, "dur": 14, "args": { "External id": 39060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39060, "pid": 5, "tid": 7, "ts": 1716454218101418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043388, "dur": 9, "args": { "External id": 39060, "cbid": 211, "correlation": 39060 } }, { "ph": "s", "id": 39060, "pid": 76337, "tid": -914061504, "ts": 1716454218043388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218043446, "dur": 0, "args": { "External id": 39070, "cbid": 317, "correlation": 39070 } }, { "ph": "f", "id": 39070, "pid": 76337, "tid": -914061504, "ts": 1716454218043446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218043447, "dur": 0, "args": { "External id": 39071, "cbid": 203, "correlation": 39071 } }, { "ph": "f", "id": 39071, "pid": 76337, "tid": -914061504, "ts": 1716454218043447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218043448, "dur": 0, "args": { "External id": 39072, "cbid": 205, "correlation": 39072 } }, { "ph": "f", "id": 39072, "pid": 76337, "tid": -914061504, "ts": 1716454218043448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218101434, "dur": 21, "args": { "External id": 39076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39076, "pid": 5, "tid": 7, "ts": 1716454218101434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043461, "dur": 12, "args": { "External id": 39076, "cbid": 211, "correlation": 39076 } }, { "ph": "s", "id": 39076, "pid": 76337, "tid": -914061504, "ts": 1716454218043461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218101456, "dur": 4, "args": { "External id": 39078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39078, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39078, "pid": 5, "tid": 7, "ts": 1716454218101456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043479, "dur": 6, "args": { "External id": 39078, "cbid": 211, "correlation": 39078 } }, { "ph": "s", "id": 39078, "pid": 76337, "tid": -914061504, "ts": 1716454218043479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218043488, "dur": 0, "args": { "External id": 39079, "cbid": 51, "correlation": 39079 } }, { "ph": "s", "id": 39079, "pid": 76337, "tid": -914061504, "ts": 1716454218043488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218101461, "dur": 171, "args": { "External id": 39080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39080, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 39080, "pid": 5, "tid": 7, "ts": 1716454218101461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043489, "dur": 6, "args": { "External id": 39080, "cbid": 211, "correlation": 39080 } }, { "ph": "s", "id": 39080, "pid": 76337, "tid": -914061504, "ts": 1716454218043489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218101633, "dur": 15, "args": { "External id": 39085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39085, "pid": 5, "tid": 7, "ts": 1716454218101633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043516, "dur": 8, "args": { "External id": 39085, "cbid": 211, "correlation": 39085 } }, { "ph": "s", "id": 39085, "pid": 76337, "tid": -914061504, "ts": 1716454218043516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218101650, "dur": 12, "args": { "External id": 39093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39093, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39093, "pid": 5, "tid": 7, "ts": 1716454218101650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043544, "dur": 8, "args": { "External id": 39093, "cbid": 211, "correlation": 39093 } }, { "ph": "s", "id": 39093, "pid": 76337, "tid": -914061504, "ts": 1716454218043544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218101663, "dur": 10, "args": { "External id": 39101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39101, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39101, "pid": 5, "tid": 7, "ts": 1716454218101663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043573, "dur": 8, "args": { "External id": 39101, "cbid": 211, "correlation": 39101 } }, { "ph": "s", "id": 39101, "pid": 76337, "tid": -914061504, "ts": 1716454218043573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218101674, "dur": 18, "args": { "External id": 39121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39121, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 39121, "pid": 5, "tid": 7, "ts": 1716454218101674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043655, "dur": 13, "args": { "External id": 39121, "cbid": 211, "correlation": 39121 } }, { "ph": "s", "id": 39121, "pid": 76337, "tid": -914061504, "ts": 1716454218043655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218101693, "dur": 4, "args": { "External id": 39133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39133, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 39133, "pid": 5, "tid": 7, "ts": 1716454218101693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043678, "dur": 6, "args": { "External id": 39133, "cbid": 211, "correlation": 39133 } }, { "ph": "s", "id": 39133, "pid": 76337, "tid": -914061504, "ts": 1716454218043678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218101699, "dur": 16, "args": { "External id": 39136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39136, "pid": 5, "tid": 7, "ts": 1716454218101699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043696, "dur": 6, "args": { "External id": 39136, "cbid": 211, "correlation": 39136 } }, { "ph": "s", "id": 39136, "pid": 76337, "tid": -914061504, "ts": 1716454218043696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218043752, "dur": 0, "args": { "External id": 39147, "cbid": 317, "correlation": 39147 } }, { "ph": "f", "id": 39147, "pid": 76337, "tid": -914061504, "ts": 1716454218043752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218043753, "dur": 0, "args": { "External id": 39148, "cbid": 203, "correlation": 39148 } }, { "ph": "f", "id": 39148, "pid": 76337, "tid": -914061504, "ts": 1716454218043753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218043754, "dur": 0, "args": { "External id": 39149, "cbid": 205, "correlation": 39149 } }, { "ph": "f", "id": 39149, "pid": 76337, "tid": -914061504, "ts": 1716454218043754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218101716, "dur": 11, "args": { "External id": 39153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39153, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39153, "pid": 5, "tid": 7, "ts": 1716454218101716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043768, "dur": 12, "args": { "External id": 39153, "cbid": 211, "correlation": 39153 } }, { "ph": "s", "id": 39153, "pid": 76337, "tid": -914061504, "ts": 1716454218043768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218101729, "dur": 3, "args": { "External id": 39155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39155, "pid": 5, "tid": 7, "ts": 1716454218101729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043784, "dur": 6, "args": { "External id": 39155, "cbid": 211, "correlation": 39155 } }, { "ph": "s", "id": 39155, "pid": 76337, "tid": -914061504, "ts": 1716454218043784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218043794, "dur": 0, "args": { "External id": 39156, "cbid": 51, "correlation": 39156 } }, { "ph": "s", "id": 39156, "pid": 76337, "tid": -914061504, "ts": 1716454218043794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218101733, "dur": 88, "args": { "External id": 39157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39157, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 39157, "pid": 5, "tid": 7, "ts": 1716454218101733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043795, "dur": 5, "args": { "External id": 39157, "cbid": 211, "correlation": 39157 } }, { "ph": "s", "id": 39157, "pid": 76337, "tid": -914061504, "ts": 1716454218043795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218101822, "dur": 15, "args": { "External id": 39162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39162, "pid": 5, "tid": 7, "ts": 1716454218101822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043821, "dur": 8, "args": { "External id": 39162, "cbid": 211, "correlation": 39162 } }, { "ph": "s", "id": 39162, "pid": 76337, "tid": -914061504, "ts": 1716454218043821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218101839, "dur": 81, "args": { "External id": 39171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39171, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39171, "pid": 5, "tid": 7, "ts": 1716454218101839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043904, "dur": 14, "args": { "External id": 39171, "cbid": 211, "correlation": 39171 } }, { "ph": "s", "id": 39171, "pid": 76337, "tid": -914061504, "ts": 1716454218043904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218101921, "dur": 30, "args": { "External id": 39193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39193, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39193, "pid": 5, "tid": 7, "ts": 1716454218101921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218043962, "dur": 10, "args": { "External id": 39193, "cbid": 211, "correlation": 39193 } }, { "ph": "s", "id": 39193, "pid": 76337, "tid": -914061504, "ts": 1716454218043962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044064, "dur": 2, "args": { "External id": 39204, "cbid": 251, "correlation": 39204 } }, { "ph": "f", "id": 39204, "pid": 76337, "tid": -914061504, "ts": 1716454218044064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218101952, "dur": 140, "args": { "External id": 39205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39205, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39205, "pid": 5, "tid": 7, "ts": 1716454218101952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044070, "dur": 14, "args": { "External id": 39205, "cbid": 211, "correlation": 39205 } }, { "ph": "s", "id": 39205, "pid": 76337, "tid": -914061504, "ts": 1716454218044070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044141, "dur": 1, "args": { "External id": 39216, "cbid": 251, "correlation": 39216 } }, { "ph": "f", "id": 39216, "pid": 76337, "tid": -914061504, "ts": 1716454218044141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218102094, "dur": 153, "args": { "External id": 39217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39217, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39217, "pid": 5, "tid": 7, "ts": 1716454218102094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044145, "dur": 12, "args": { "External id": 39217, "cbid": 211, "correlation": 39217 } }, { "ph": "s", "id": 39217, "pid": 76337, "tid": -914061504, "ts": 1716454218044145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044211, "dur": 1, "args": { "External id": 39228, "cbid": 251, "correlation": 39228 } }, { "ph": "f", "id": 39228, "pid": 76337, "tid": -914061504, "ts": 1716454218044211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218102248, "dur": 154, "args": { "External id": 39229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39229, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39229, "pid": 5, "tid": 7, "ts": 1716454218102248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044215, "dur": 11, "args": { "External id": 39229, "cbid": 211, "correlation": 39229 } }, { "ph": "s", "id": 39229, "pid": 76337, "tid": -914061504, "ts": 1716454218044215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218102404, "dur": 330, "args": { "External id": 39254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39254, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39254, "pid": 5, "tid": 7, "ts": 1716454218102404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044300, "dur": 13, "args": { "External id": 39254, "cbid": 211, "correlation": 39254 } }, { "ph": "s", "id": 39254, "pid": 76337, "tid": -914061504, "ts": 1716454218044300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044401, "dur": 1, "args": { "External id": 39272, "cbid": 251, "correlation": 39272 } }, { "ph": "f", "id": 39272, "pid": 76337, "tid": -914061504, "ts": 1716454218044401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218102736, "dur": 144, "args": { "External id": 39274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39274, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39274, "pid": 5, "tid": 7, "ts": 1716454218102736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044407, "dur": 13, "args": { "External id": 39274, "cbid": 211, "correlation": 39274 } }, { "ph": "s", "id": 39274, "pid": 76337, "tid": -914061504, "ts": 1716454218044407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218102881, "dur": 19, "args": { "External id": 39282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39282, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39282, "pid": 5, "tid": 7, "ts": 1716454218102881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044477, "dur": 12, "args": { "External id": 39282, "cbid": 211, "correlation": 39282 } }, { "ph": "s", "id": 39282, "pid": 76337, "tid": -914061504, "ts": 1716454218044477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218102902, "dur": 28, "args": { "External id": 39290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39290, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39290, "pid": 5, "tid": 7, "ts": 1716454218102902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044515, "dur": 9, "args": { "External id": 39290, "cbid": 211, "correlation": 39290 } }, { "ph": "s", "id": 39290, "pid": 76337, "tid": -914061504, "ts": 1716454218044515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218102932, "dur": 18, "args": { "External id": 39301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39301, "pid": 5, "tid": 7, "ts": 1716454218102932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044591, "dur": 12, "args": { "External id": 39301, "cbid": 211, "correlation": 39301 } }, { "ph": "s", "id": 39301, "pid": 76337, "tid": -914061504, "ts": 1716454218044591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218102951, "dur": 16, "args": { "External id": 39323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39323, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39323, "pid": 5, "tid": 7, "ts": 1716454218102951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044621, "dur": 8, "args": { "External id": 39323, "cbid": 211, "correlation": 39323 } }, { "ph": "s", "id": 39323, "pid": 76337, "tid": -914061504, "ts": 1716454218044621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044707, "dur": 1, "args": { "External id": 39334, "cbid": 251, "correlation": 39334 } }, { "ph": "f", "id": 39334, "pid": 76337, "tid": -914061504, "ts": 1716454218044707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218102968, "dur": 88, "args": { "External id": 39335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39335, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39335, "pid": 5, "tid": 7, "ts": 1716454218102968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044712, "dur": 13, "args": { "External id": 39335, "cbid": 211, "correlation": 39335 } }, { "ph": "s", "id": 39335, "pid": 76337, "tid": -914061504, "ts": 1716454218044712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044783, "dur": 1, "args": { "External id": 39346, "cbid": 251, "correlation": 39346 } }, { "ph": "f", "id": 39346, "pid": 76337, "tid": -914061504, "ts": 1716454218044783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044787, "dur": 0, "args": { "External id": 39347, "cbid": 251, "correlation": 39347 } }, { "ph": "f", "id": 39347, "pid": 76337, "tid": -914061504, "ts": 1716454218044787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218103057, "dur": 12, "args": { "External id": 39348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39348, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39348, "pid": 5, "tid": 7, "ts": 1716454218103057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044789, "dur": 12, "args": { "External id": 39348, "cbid": 211, "correlation": 39348 } }, { "ph": "s", "id": 39348, "pid": 76337, "tid": -914061504, "ts": 1716454218044789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218103070, "dur": 5, "args": { "External id": 39350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39350, "pid": 5, "tid": 7, "ts": 1716454218103070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044803, "dur": 6, "args": { "External id": 39350, "cbid": 211, "correlation": 39350 } }, { "ph": "s", "id": 39350, "pid": 76337, "tid": -914061504, "ts": 1716454218044803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044860, "dur": 1, "args": { "External id": 39361, "cbid": 251, "correlation": 39361 } }, { "ph": "f", "id": 39361, "pid": 76337, "tid": -914061504, "ts": 1716454218044860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218044863, "dur": 0, "args": { "External id": 39362, "cbid": 251, "correlation": 39362 } }, { "ph": "f", "id": 39362, "pid": 76337, "tid": -914061504, "ts": 1716454218044863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218103077, "dur": 8, "args": { "External id": 39363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39363, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39363, "pid": 5, "tid": 7, "ts": 1716454218103077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044865, "dur": 12, "args": { "External id": 39363, "cbid": 211, "correlation": 39363 } }, { "ph": "s", "id": 39363, "pid": 76337, "tid": -914061504, "ts": 1716454218044865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218103087, "dur": 3, "args": { "External id": 39365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39365, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39365, "pid": 5, "tid": 7, "ts": 1716454218103087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044878, "dur": 5, "args": { "External id": 39365, "cbid": 211, "correlation": 39365 } }, { "ph": "s", "id": 39365, "pid": 76337, "tid": -914061504, "ts": 1716454218044878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218103091, "dur": 54, "args": { "External id": 39390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39390, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39390, "pid": 5, "tid": 7, "ts": 1716454218103091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218044955, "dur": 12, "args": { "External id": 39390, "cbid": 211, "correlation": 39390 } }, { "ph": "s", "id": 39390, "pid": 76337, "tid": -914061504, "ts": 1716454218044955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218045062, "dur": 2, "args": { "External id": 39408, "cbid": 251, "correlation": 39408 } }, { "ph": "f", "id": 39408, "pid": 76337, "tid": -914061504, "ts": 1716454218045062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218103146, "dur": 89, "args": { "External id": 39410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39410, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39410, "pid": 5, "tid": 7, "ts": 1716454218103146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045068, "dur": 14, "args": { "External id": 39410, "cbid": 211, "correlation": 39410 } }, { "ph": "s", "id": 39410, "pid": 76337, "tid": -914061504, "ts": 1716454218045068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218103236, "dur": 9, "args": { "External id": 39418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39418, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39418, "pid": 5, "tid": 7, "ts": 1716454218103236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045138, "dur": 12, "args": { "External id": 39418, "cbid": 211, "correlation": 39418 } }, { "ph": "s", "id": 39418, "pid": 76337, "tid": -914061504, "ts": 1716454218045138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218103247, "dur": 21, "args": { "External id": 39426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39426, "pid": 5, "tid": 7, "ts": 1716454218103247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045181, "dur": 9, "args": { "External id": 39426, "cbid": 211, "correlation": 39426 } }, { "ph": "s", "id": 39426, "pid": 76337, "tid": -914061504, "ts": 1716454218045181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218103270, "dur": 17, "args": { "External id": 39448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39448, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39448, "pid": 5, "tid": 7, "ts": 1716454218103270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045232, "dur": 10, "args": { "External id": 39448, "cbid": 211, "correlation": 39448 } }, { "ph": "s", "id": 39448, "pid": 76337, "tid": -914061504, "ts": 1716454218045232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218045320, "dur": 1, "args": { "External id": 39464, "cbid": 251, "correlation": 39464 } }, { "ph": "f", "id": 39464, "pid": 76337, "tid": -914061504, "ts": 1716454218045320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218045325, "dur": 0, "args": { "External id": 39466, "cbid": 251, "correlation": 39466 } }, { "ph": "f", "id": 39466, "pid": 76337, "tid": -914061504, "ts": 1716454218045325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218103288, "dur": 491, "args": { "External id": 39467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39467, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39467, "pid": 5, "tid": 7, "ts": 1716454218103288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045327, "dur": 13, "args": { "External id": 39467, "cbid": 211, "correlation": 39467 } }, { "ph": "s", "id": 39467, "pid": 76337, "tid": -914061504, "ts": 1716454218045327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218103781, "dur": 64, "args": { "External id": 39475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39475, "pid": 5, "tid": 7, "ts": 1716454218103781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045394, "dur": 12, "args": { "External id": 39475, "cbid": 211, "correlation": 39475 } }, { "ph": "s", "id": 39475, "pid": 76337, "tid": -914061504, "ts": 1716454218045394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218103846, "dur": 69, "args": { "External id": 39483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39483, "pid": 5, "tid": 7, "ts": 1716454218103846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045425, "dur": 8, "args": { "External id": 39483, "cbid": 211, "correlation": 39483 } }, { "ph": "s", "id": 39483, "pid": 76337, "tid": -914061504, "ts": 1716454218045425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218045506, "dur": 1, "args": { "External id": 39499, "cbid": 251, "correlation": 39499 } }, { "ph": "f", "id": 39499, "pid": 76337, "tid": -914061504, "ts": 1716454218045506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218103917, "dur": 1, "args": { "External id": 39501, "device": 5, "context": 1, "stream": 7, "correlation": 39501, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 39501, "pid": 5, "tid": 7, "ts": 1716454218103917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218045511, "dur": 9, "args": { "External id": 39501, "cbid": 51, "correlation": 39501 } }, { "ph": "s", "id": 39501, "pid": 76337, "tid": -914061504, "ts": 1716454218045511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218103921, "dur": 264, "args": { "External id": 39502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39502, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39502, "pid": 5, "tid": 7, "ts": 1716454218103921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045522, "dur": 11, "args": { "External id": 39502, "cbid": 211, "correlation": 39502 } }, { "ph": "s", "id": 39502, "pid": 76337, "tid": -914061504, "ts": 1716454218045522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218104186, "dur": 14, "args": { "External id": 39510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39510, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39510, "pid": 5, "tid": 7, "ts": 1716454218104186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045565, "dur": 10, "args": { "External id": 39510, "cbid": 211, "correlation": 39510 } }, { "ph": "s", "id": 39510, "pid": 76337, "tid": -914061504, "ts": 1716454218045565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218104201, "dur": 37, "args": { "External id": 39521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39521, "pid": 5, "tid": 7, "ts": 1716454218104201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045633, "dur": 12, "args": { "External id": 39521, "cbid": 211, "correlation": 39521 } }, { "ph": "s", "id": 39521, "pid": 76337, "tid": -914061504, "ts": 1716454218045633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218045696, "dur": 0, "args": { "External id": 39533, "cbid": 317, "correlation": 39533 } }, { "ph": "f", "id": 39533, "pid": 76337, "tid": -914061504, "ts": 1716454218045696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218045697, "dur": 0, "args": { "External id": 39534, "cbid": 203, "correlation": 39534 } }, { "ph": "f", "id": 39534, "pid": 76337, "tid": -914061504, "ts": 1716454218045697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218045698, "dur": 0, "args": { "External id": 39535, "cbid": 205, "correlation": 39535 } }, { "ph": "f", "id": 39535, "pid": 76337, "tid": -914061504, "ts": 1716454218045698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218104239, "dur": 14, "args": { "External id": 39539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39539, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39539, "pid": 5, "tid": 7, "ts": 1716454218104239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045712, "dur": 12, "args": { "External id": 39539, "cbid": 211, "correlation": 39539 } }, { "ph": "s", "id": 39539, "pid": 76337, "tid": -914061504, "ts": 1716454218045712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218104254, "dur": 4, "args": { "External id": 39541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39541, "pid": 5, "tid": 7, "ts": 1716454218104254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045729, "dur": 6, "args": { "External id": 39541, "cbid": 211, "correlation": 39541 } }, { "ph": "s", "id": 39541, "pid": 76337, "tid": -914061504, "ts": 1716454218045729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218045739, "dur": 0, "args": { "External id": 39542, "cbid": 51, "correlation": 39542 } }, { "ph": "s", "id": 39542, "pid": 76337, "tid": -914061504, "ts": 1716454218045739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218104259, "dur": 93, "args": { "External id": 39543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39543, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 39543, "pid": 5, "tid": 7, "ts": 1716454218104259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045739, "dur": 5, "args": { "External id": 39543, "cbid": 211, "correlation": 39543 } }, { "ph": "s", "id": 39543, "pid": 76337, "tid": -914061504, "ts": 1716454218045739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218104354, "dur": 16, "args": { "External id": 39548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39548, "pid": 5, "tid": 7, "ts": 1716454218104354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045766, "dur": 8, "args": { "External id": 39548, "cbid": 211, "correlation": 39548 } }, { "ph": "s", "id": 39548, "pid": 76337, "tid": -914061504, "ts": 1716454218045766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218104371, "dur": 11, "args": { "External id": 39556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39556, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39556, "pid": 5, "tid": 7, "ts": 1716454218104371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045798, "dur": 8, "args": { "External id": 39556, "cbid": 211, "correlation": 39556 } }, { "ph": "s", "id": 39556, "pid": 76337, "tid": -914061504, "ts": 1716454218045798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218104384, "dur": 29, "args": { "External id": 39565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39565, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39565, "pid": 5, "tid": 7, "ts": 1716454218104384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045837, "dur": 10, "args": { "External id": 39565, "cbid": 211, "correlation": 39565 } }, { "ph": "s", "id": 39565, "pid": 76337, "tid": -914061504, "ts": 1716454218045837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218104414, "dur": 30, "args": { "External id": 39585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39585, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 39585, "pid": 5, "tid": 7, "ts": 1716454218104414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045908, "dur": 12, "args": { "External id": 39585, "cbid": 211, "correlation": 39585 } }, { "ph": "s", "id": 39585, "pid": 76337, "tid": -914061504, "ts": 1716454218045908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218104446, "dur": 5, "args": { "External id": 39597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39597, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39597, "pid": 5, "tid": 7, "ts": 1716454218104446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045930, "dur": 6, "args": { "External id": 39597, "cbid": 211, "correlation": 39597 } }, { "ph": "s", "id": 39597, "pid": 76337, "tid": -914061504, "ts": 1716454218045930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218104452, "dur": 30, "args": { "External id": 39600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39600, "pid": 5, "tid": 7, "ts": 1716454218104452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045948, "dur": 6, "args": { "External id": 39600, "cbid": 211, "correlation": 39600 } }, { "ph": "s", "id": 39600, "pid": 76337, "tid": -914061504, "ts": 1716454218045948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218104484, "dur": 22, "args": { "External id": 39609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39609, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39609, "pid": 5, "tid": 7, "ts": 1716454218104484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218045995, "dur": 10, "args": { "External id": 39609, "cbid": 211, "correlation": 39609 } }, { "ph": "s", "id": 39609, "pid": 76337, "tid": -914061504, "ts": 1716454218045995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218046048, "dur": 0, "args": { "External id": 39619, "cbid": 317, "correlation": 39619 } }, { "ph": "f", "id": 39619, "pid": 76337, "tid": -914061504, "ts": 1716454218046048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218046049, "dur": 0, "args": { "External id": 39620, "cbid": 203, "correlation": 39620 } }, { "ph": "f", "id": 39620, "pid": 76337, "tid": -914061504, "ts": 1716454218046049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218046050, "dur": 0, "args": { "External id": 39621, "cbid": 205, "correlation": 39621 } }, { "ph": "f", "id": 39621, "pid": 76337, "tid": -914061504, "ts": 1716454218046050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218104507, "dur": 22, "args": { "External id": 39625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39625, "pid": 5, "tid": 7, "ts": 1716454218104507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046063, "dur": 11, "args": { "External id": 39625, "cbid": 211, "correlation": 39625 } }, { "ph": "s", "id": 39625, "pid": 76337, "tid": -914061504, "ts": 1716454218046063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218104530, "dur": 313, "args": { "External id": 39627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39627, "pid": 5, "tid": 7, "ts": 1716454218104530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046077, "dur": 5, "args": { "External id": 39627, "cbid": 211, "correlation": 39627 } }, { "ph": "s", "id": 39627, "pid": 76337, "tid": -914061504, "ts": 1716454218046077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218104846, "dur": 1, "args": { "External id": 39629, "device": 5, "context": 1, "stream": 7, "correlation": 39629, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 39629, "pid": 5, "tid": 7, "ts": 1716454218104846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218046089, "dur": 7, "args": { "External id": 39629, "cbid": 51, "correlation": 39629 } }, { "ph": "s", "id": 39629, "pid": 76337, "tid": -914061504, "ts": 1716454218046089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218104849, "dur": 1232, "args": { "External id": 39630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39630, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39630, "pid": 5, "tid": 7, "ts": 1716454218104849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046096, "dur": 6, "args": { "External id": 39630, "cbid": 211, "correlation": 39630 } }, { "ph": "s", "id": 39630, "pid": 76337, "tid": -914061504, "ts": 1716454218046096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218106083, "dur": 13, "args": { "External id": 39632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39632, "pid": 5, "tid": 7, "ts": 1716454218106083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046107, "dur": 5, "args": { "External id": 39632, "cbid": 211, "correlation": 39632 } }, { "ph": "s", "id": 39632, "pid": 76337, "tid": -914061504, "ts": 1716454218046107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218106097, "dur": 15, "args": { "External id": 39638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39638, "pid": 5, "tid": 7, "ts": 1716454218106097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046134, "dur": 8, "args": { "External id": 39638, "cbid": 211, "correlation": 39638 } }, { "ph": "s", "id": 39638, "pid": 76337, "tid": -914061504, "ts": 1716454218046134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218106113, "dur": 3, "args": { "External id": 39646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39646, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 39646, "pid": 5, "tid": 7, "ts": 1716454218106113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046179, "dur": 9, "args": { "External id": 39646, "cbid": 211, "correlation": 39646 } }, { "ph": "s", "id": 39646, "pid": 76337, "tid": -914061504, "ts": 1716454218046179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218046244, "dur": 1, "args": { "External id": 39662, "cbid": 251, "correlation": 39662 } }, { "ph": "f", "id": 39662, "pid": 76337, "tid": -914061504, "ts": 1716454218046244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218046249, "dur": 0, "args": { "External id": 39664, "cbid": 251, "correlation": 39664 } }, { "ph": "f", "id": 39664, "pid": 76337, "tid": -914061504, "ts": 1716454218046249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218106118, "dur": 13, "args": { "External id": 39665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39665, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39665, "pid": 5, "tid": 7, "ts": 1716454218106118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046251, "dur": 11, "args": { "External id": 39665, "cbid": 211, "correlation": 39665 } }, { "ph": "s", "id": 39665, "pid": 76337, "tid": -914061504, "ts": 1716454218046251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218106133, "dur": 5, "args": { "External id": 39667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39667, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39667, "pid": 5, "tid": 7, "ts": 1716454218106133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046265, "dur": 5, "args": { "External id": 39667, "cbid": 211, "correlation": 39667 } }, { "ph": "s", "id": 39667, "pid": 76337, "tid": -914061504, "ts": 1716454218046265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218106139, "dur": 17, "args": { "External id": 39677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39677, "pid": 5, "tid": 7, "ts": 1716454218106139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046321, "dur": 12, "args": { "External id": 39677, "cbid": 211, "correlation": 39677 } }, { "ph": "s", "id": 39677, "pid": 76337, "tid": -914061504, "ts": 1716454218046321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218106157, "dur": 17, "args": { "External id": 39697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39697, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 39697, "pid": 5, "tid": 7, "ts": 1716454218106157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046388, "dur": 11, "args": { "External id": 39697, "cbid": 211, "correlation": 39697 } }, { "ph": "s", "id": 39697, "pid": 76337, "tid": -914061504, "ts": 1716454218046388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218106175, "dur": 4, "args": { "External id": 39709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39709, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 39709, "pid": 5, "tid": 7, "ts": 1716454218106175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046408, "dur": 6, "args": { "External id": 39709, "cbid": 211, "correlation": 39709 } }, { "ph": "s", "id": 39709, "pid": 76337, "tid": -914061504, "ts": 1716454218046408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218106181, "dur": 16, "args": { "External id": 39712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39712, "pid": 5, "tid": 7, "ts": 1716454218106181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046426, "dur": 6, "args": { "External id": 39712, "cbid": 211, "correlation": 39712 } }, { "ph": "s", "id": 39712, "pid": 76337, "tid": -914061504, "ts": 1716454218046426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218106198, "dur": 11, "args": { "External id": 39721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39721, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39721, "pid": 5, "tid": 7, "ts": 1716454218106198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046466, "dur": 10, "args": { "External id": 39721, "cbid": 211, "correlation": 39721 } }, { "ph": "s", "id": 39721, "pid": 76337, "tid": -914061504, "ts": 1716454218046466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218046528, "dur": 0, "args": { "External id": 39731, "cbid": 317, "correlation": 39731 } }, { "ph": "f", "id": 39731, "pid": 76337, "tid": -914061504, "ts": 1716454218046528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218046529, "dur": 0, "args": { "External id": 39732, "cbid": 203, "correlation": 39732 } }, { "ph": "f", "id": 39732, "pid": 76337, "tid": -914061504, "ts": 1716454218046529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218046530, "dur": 0, "args": { "External id": 39733, "cbid": 205, "correlation": 39733 } }, { "ph": "f", "id": 39733, "pid": 76337, "tid": -914061504, "ts": 1716454218046530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218106210, "dur": 11, "args": { "External id": 39737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39737, "pid": 5, "tid": 7, "ts": 1716454218106210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046543, "dur": 12, "args": { "External id": 39737, "cbid": 211, "correlation": 39737 } }, { "ph": "s", "id": 39737, "pid": 76337, "tid": -914061504, "ts": 1716454218046543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218106222, "dur": 159, "args": { "External id": 39739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39739, "pid": 5, "tid": 7, "ts": 1716454218106222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046557, "dur": 5, "args": { "External id": 39739, "cbid": 211, "correlation": 39739 } }, { "ph": "s", "id": 39739, "pid": 76337, "tid": -914061504, "ts": 1716454218046557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218106384, "dur": 1, "args": { "External id": 39741, "device": 5, "context": 1, "stream": 7, "correlation": 39741, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 39741, "pid": 5, "tid": 7, "ts": 1716454218106384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218046569, "dur": 6, "args": { "External id": 39741, "cbid": 51, "correlation": 39741 } }, { "ph": "s", "id": 39741, "pid": 76337, "tid": -914061504, "ts": 1716454218046569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218106388, "dur": 636, "args": { "External id": 39742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39742, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39742, "pid": 5, "tid": 7, "ts": 1716454218106388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046576, "dur": 6, "args": { "External id": 39742, "cbid": 211, "correlation": 39742 } }, { "ph": "s", "id": 39742, "pid": 76337, "tid": -914061504, "ts": 1716454218046576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218107025, "dur": 12, "args": { "External id": 39744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39744, "pid": 5, "tid": 7, "ts": 1716454218107025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046585, "dur": 5, "args": { "External id": 39744, "cbid": 211, "correlation": 39744 } }, { "ph": "s", "id": 39744, "pid": 76337, "tid": -914061504, "ts": 1716454218046585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218107038, "dur": 14, "args": { "External id": 39750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39750, "pid": 5, "tid": 7, "ts": 1716454218107038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046613, "dur": 9, "args": { "External id": 39750, "cbid": 211, "correlation": 39750 } }, { "ph": "s", "id": 39750, "pid": 76337, "tid": -914061504, "ts": 1716454218046613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218046671, "dur": 0, "args": { "External id": 39760, "cbid": 317, "correlation": 39760 } }, { "ph": "f", "id": 39760, "pid": 76337, "tid": -914061504, "ts": 1716454218046671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218046672, "dur": 0, "args": { "External id": 39761, "cbid": 203, "correlation": 39761 } }, { "ph": "f", "id": 39761, "pid": 76337, "tid": -914061504, "ts": 1716454218046672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218046673, "dur": 0, "args": { "External id": 39762, "cbid": 205, "correlation": 39762 } }, { "ph": "f", "id": 39762, "pid": 76337, "tid": -914061504, "ts": 1716454218046673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218107054, "dur": 22, "args": { "External id": 39766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39766, "pid": 5, "tid": 7, "ts": 1716454218107054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046685, "dur": 11, "args": { "External id": 39766, "cbid": 211, "correlation": 39766 } }, { "ph": "s", "id": 39766, "pid": 76337, "tid": -914061504, "ts": 1716454218046685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218107077, "dur": 4, "args": { "External id": 39768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39768, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39768, "pid": 5, "tid": 7, "ts": 1716454218107077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046700, "dur": 6, "args": { "External id": 39768, "cbid": 211, "correlation": 39768 } }, { "ph": "s", "id": 39768, "pid": 76337, "tid": -914061504, "ts": 1716454218046700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218046709, "dur": 0, "args": { "External id": 39769, "cbid": 51, "correlation": 39769 } }, { "ph": "s", "id": 39769, "pid": 76337, "tid": -914061504, "ts": 1716454218046709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218107082, "dur": 166, "args": { "External id": 39770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39770, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 39770, "pid": 5, "tid": 7, "ts": 1716454218107082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046710, "dur": 5, "args": { "External id": 39770, "cbid": 211, "correlation": 39770 } }, { "ph": "s", "id": 39770, "pid": 76337, "tid": -914061504, "ts": 1716454218046710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218107249, "dur": 16, "args": { "External id": 39775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39775, "pid": 5, "tid": 7, "ts": 1716454218107249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046735, "dur": 9, "args": { "External id": 39775, "cbid": 211, "correlation": 39775 } }, { "ph": "s", "id": 39775, "pid": 76337, "tid": -914061504, "ts": 1716454218046735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218107266, "dur": 13, "args": { "External id": 39783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39783, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39783, "pid": 5, "tid": 7, "ts": 1716454218107266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046763, "dur": 8, "args": { "External id": 39783, "cbid": 211, "correlation": 39783 } }, { "ph": "s", "id": 39783, "pid": 76337, "tid": -914061504, "ts": 1716454218046763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218107281, "dur": 10, "args": { "External id": 39791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39791, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39791, "pid": 5, "tid": 7, "ts": 1716454218107281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046791, "dur": 8, "args": { "External id": 39791, "cbid": 211, "correlation": 39791 } }, { "ph": "s", "id": 39791, "pid": 76337, "tid": -914061504, "ts": 1716454218046791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218107292, "dur": 18, "args": { "External id": 39811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39811, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 39811, "pid": 5, "tid": 7, "ts": 1716454218107292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046874, "dur": 13, "args": { "External id": 39811, "cbid": 211, "correlation": 39811 } }, { "ph": "s", "id": 39811, "pid": 76337, "tid": -914061504, "ts": 1716454218046874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218107311, "dur": 4, "args": { "External id": 39823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39823, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 39823, "pid": 5, "tid": 7, "ts": 1716454218107311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046896, "dur": 6, "args": { "External id": 39823, "cbid": 211, "correlation": 39823 } }, { "ph": "s", "id": 39823, "pid": 76337, "tid": -914061504, "ts": 1716454218046896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218107317, "dur": 16, "args": { "External id": 39826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39826, "pid": 5, "tid": 7, "ts": 1716454218107317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046914, "dur": 7, "args": { "External id": 39826, "cbid": 211, "correlation": 39826 } }, { "ph": "s", "id": 39826, "pid": 76337, "tid": -914061504, "ts": 1716454218046914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218046971, "dur": 0, "args": { "External id": 39837, "cbid": 317, "correlation": 39837 } }, { "ph": "f", "id": 39837, "pid": 76337, "tid": -914061504, "ts": 1716454218046971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218046972, "dur": 0, "args": { "External id": 39838, "cbid": 203, "correlation": 39838 } }, { "ph": "f", "id": 39838, "pid": 76337, "tid": -914061504, "ts": 1716454218046972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218046972, "dur": 8, "args": { "External id": 39839, "cbid": 205, "correlation": 39839 } }, { "ph": "f", "id": 39839, "pid": 76337, "tid": -914061504, "ts": 1716454218046972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218107334, "dur": 12, "args": { "External id": 39843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39843, "pid": 5, "tid": 7, "ts": 1716454218107334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218046993, "dur": 12, "args": { "External id": 39843, "cbid": 211, "correlation": 39843 } }, { "ph": "s", "id": 39843, "pid": 76337, "tid": -914061504, "ts": 1716454218046993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218107347, "dur": 3, "args": { "External id": 39845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 39845, "pid": 5, "tid": 7, "ts": 1716454218107347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047010, "dur": 6, "args": { "External id": 39845, "cbid": 211, "correlation": 39845 } }, { "ph": "s", "id": 39845, "pid": 76337, "tid": -914061504, "ts": 1716454218047010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218047018, "dur": 0, "args": { "External id": 39846, "cbid": 51, "correlation": 39846 } }, { "ph": "s", "id": 39846, "pid": 76337, "tid": -914061504, "ts": 1716454218047018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218107352, "dur": 89, "args": { "External id": 39847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39847, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 39847, "pid": 5, "tid": 7, "ts": 1716454218107352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047019, "dur": 5, "args": { "External id": 39847, "cbid": 211, "correlation": 39847 } }, { "ph": "s", "id": 39847, "pid": 76337, "tid": -914061504, "ts": 1716454218047019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218107442, "dur": 15, "args": { "External id": 39852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39852, "pid": 5, "tid": 7, "ts": 1716454218107442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047046, "dur": 9, "args": { "External id": 39852, "cbid": 211, "correlation": 39852 } }, { "ph": "s", "id": 39852, "pid": 76337, "tid": -914061504, "ts": 1716454218047046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218107459, "dur": 82, "args": { "External id": 39861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39861, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39861, "pid": 5, "tid": 7, "ts": 1716454218107459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047129, "dur": 14, "args": { "External id": 39861, "cbid": 211, "correlation": 39861 } }, { "ph": "s", "id": 39861, "pid": 76337, "tid": -914061504, "ts": 1716454218047129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218107542, "dur": 29, "args": { "External id": 39883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39883, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39883, "pid": 5, "tid": 7, "ts": 1716454218107542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047186, "dur": 10, "args": { "External id": 39883, "cbid": 211, "correlation": 39883 } }, { "ph": "s", "id": 39883, "pid": 76337, "tid": -914061504, "ts": 1716454218047186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218047278, "dur": 1, "args": { "External id": 39894, "cbid": 251, "correlation": 39894 } }, { "ph": "f", "id": 39894, "pid": 76337, "tid": -914061504, "ts": 1716454218047278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218107573, "dur": 161, "args": { "External id": 39895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39895, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39895, "pid": 5, "tid": 7, "ts": 1716454218107573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047283, "dur": 13, "args": { "External id": 39895, "cbid": 211, "correlation": 39895 } }, { "ph": "s", "id": 39895, "pid": 76337, "tid": -914061504, "ts": 1716454218047283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218047353, "dur": 1, "args": { "External id": 39906, "cbid": 251, "correlation": 39906 } }, { "ph": "f", "id": 39906, "pid": 76337, "tid": -914061504, "ts": 1716454218047353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218107735, "dur": 155, "args": { "External id": 39907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39907, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39907, "pid": 5, "tid": 7, "ts": 1716454218107735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047358, "dur": 11, "args": { "External id": 39907, "cbid": 211, "correlation": 39907 } }, { "ph": "s", "id": 39907, "pid": 76337, "tid": -914061504, "ts": 1716454218047358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218047423, "dur": 1, "args": { "External id": 39918, "cbid": 251, "correlation": 39918 } }, { "ph": "f", "id": 39918, "pid": 76337, "tid": -914061504, "ts": 1716454218047423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218107891, "dur": 155, "args": { "External id": 39919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39919, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39919, "pid": 5, "tid": 7, "ts": 1716454218107891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047427, "dur": 11, "args": { "External id": 39919, "cbid": 211, "correlation": 39919 } }, { "ph": "s", "id": 39919, "pid": 76337, "tid": -914061504, "ts": 1716454218047427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218108048, "dur": 326, "args": { "External id": 39944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39944, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39944, "pid": 5, "tid": 7, "ts": 1716454218108048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047511, "dur": 13, "args": { "External id": 39944, "cbid": 211, "correlation": 39944 } }, { "ph": "s", "id": 39944, "pid": 76337, "tid": -914061504, "ts": 1716454218047511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218047611, "dur": 1, "args": { "External id": 39962, "cbid": 251, "correlation": 39962 } }, { "ph": "f", "id": 39962, "pid": 76337, "tid": -914061504, "ts": 1716454218047611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218108376, "dur": 162, "args": { "External id": 39964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39964, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 39964, "pid": 5, "tid": 7, "ts": 1716454218108376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047617, "dur": 13, "args": { "External id": 39964, "cbid": 211, "correlation": 39964 } }, { "ph": "s", "id": 39964, "pid": 76337, "tid": -914061504, "ts": 1716454218047617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218108539, "dur": 19, "args": { "External id": 39972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39972, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39972, "pid": 5, "tid": 7, "ts": 1716454218108539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047688, "dur": 12, "args": { "External id": 39972, "cbid": 211, "correlation": 39972 } }, { "ph": "s", "id": 39972, "pid": 76337, "tid": -914061504, "ts": 1716454218047688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218108560, "dur": 28, "args": { "External id": 39980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39980, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39980, "pid": 5, "tid": 7, "ts": 1716454218108560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047726, "dur": 9, "args": { "External id": 39980, "cbid": 211, "correlation": 39980 } }, { "ph": "s", "id": 39980, "pid": 76337, "tid": -914061504, "ts": 1716454218047726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218108588, "dur": 18, "args": { "External id": 39991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 39991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 39991, "pid": 5, "tid": 7, "ts": 1716454218108588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047800, "dur": 12, "args": { "External id": 39991, "cbid": 211, "correlation": 39991 } }, { "ph": "s", "id": 39991, "pid": 76337, "tid": -914061504, "ts": 1716454218047800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218108608, "dur": 16, "args": { "External id": 40013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40013, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40013, "pid": 5, "tid": 7, "ts": 1716454218108608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047831, "dur": 7, "args": { "External id": 40013, "cbid": 211, "correlation": 40013 } }, { "ph": "s", "id": 40013, "pid": 76337, "tid": -914061504, "ts": 1716454218047831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218047917, "dur": 1, "args": { "External id": 40024, "cbid": 251, "correlation": 40024 } }, { "ph": "f", "id": 40024, "pid": 76337, "tid": -914061504, "ts": 1716454218047917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218108625, "dur": 87, "args": { "External id": 40025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40025, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40025, "pid": 5, "tid": 7, "ts": 1716454218108625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218047922, "dur": 13, "args": { "External id": 40025, "cbid": 211, "correlation": 40025 } }, { "ph": "s", "id": 40025, "pid": 76337, "tid": -914061504, "ts": 1716454218047922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048001, "dur": 1, "args": { "External id": 40036, "cbid": 251, "correlation": 40036 } }, { "ph": "f", "id": 40036, "pid": 76337, "tid": -914061504, "ts": 1716454218048001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048005, "dur": 0, "args": { "External id": 40037, "cbid": 251, "correlation": 40037 } }, { "ph": "f", "id": 40037, "pid": 76337, "tid": -914061504, "ts": 1716454218048005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218108713, "dur": 12, "args": { "External id": 40038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40038, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40038, "pid": 5, "tid": 7, "ts": 1716454218108713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048006, "dur": 12, "args": { "External id": 40038, "cbid": 211, "correlation": 40038 } }, { "ph": "s", "id": 40038, "pid": 76337, "tid": -914061504, "ts": 1716454218048006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218108726, "dur": 5, "args": { "External id": 40040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40040, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40040, "pid": 5, "tid": 7, "ts": 1716454218108726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048020, "dur": 6, "args": { "External id": 40040, "cbid": 211, "correlation": 40040 } }, { "ph": "s", "id": 40040, "pid": 76337, "tid": -914061504, "ts": 1716454218048020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048078, "dur": 1, "args": { "External id": 40051, "cbid": 251, "correlation": 40051 } }, { "ph": "f", "id": 40051, "pid": 76337, "tid": -914061504, "ts": 1716454218048078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048081, "dur": 0, "args": { "External id": 40052, "cbid": 251, "correlation": 40052 } }, { "ph": "f", "id": 40052, "pid": 76337, "tid": -914061504, "ts": 1716454218048081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218108733, "dur": 8, "args": { "External id": 40053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40053, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40053, "pid": 5, "tid": 7, "ts": 1716454218108733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048083, "dur": 12, "args": { "External id": 40053, "cbid": 211, "correlation": 40053 } }, { "ph": "s", "id": 40053, "pid": 76337, "tid": -914061504, "ts": 1716454218048083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218108742, "dur": 3, "args": { "External id": 40055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40055, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40055, "pid": 5, "tid": 7, "ts": 1716454218108742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048097, "dur": 5, "args": { "External id": 40055, "cbid": 211, "correlation": 40055 } }, { "ph": "s", "id": 40055, "pid": 76337, "tid": -914061504, "ts": 1716454218048097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218108746, "dur": 54, "args": { "External id": 40080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40080, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40080, "pid": 5, "tid": 7, "ts": 1716454218108746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048173, "dur": 12, "args": { "External id": 40080, "cbid": 211, "correlation": 40080 } }, { "ph": "s", "id": 40080, "pid": 76337, "tid": -914061504, "ts": 1716454218048173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048273, "dur": 1, "args": { "External id": 40098, "cbid": 251, "correlation": 40098 } }, { "ph": "f", "id": 40098, "pid": 76337, "tid": -914061504, "ts": 1716454218048273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218108802, "dur": 89, "args": { "External id": 40100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40100, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40100, "pid": 5, "tid": 7, "ts": 1716454218108802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048279, "dur": 14, "args": { "External id": 40100, "cbid": 211, "correlation": 40100 } }, { "ph": "s", "id": 40100, "pid": 76337, "tid": -914061504, "ts": 1716454218048279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218108892, "dur": 10, "args": { "External id": 40108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40108, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40108, "pid": 5, "tid": 7, "ts": 1716454218108892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048348, "dur": 12, "args": { "External id": 40108, "cbid": 211, "correlation": 40108 } }, { "ph": "s", "id": 40108, "pid": 76337, "tid": -914061504, "ts": 1716454218048348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218108903, "dur": 20, "args": { "External id": 40116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40116, "pid": 5, "tid": 7, "ts": 1716454218108903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048390, "dur": 9, "args": { "External id": 40116, "cbid": 211, "correlation": 40116 } }, { "ph": "s", "id": 40116, "pid": 76337, "tid": -914061504, "ts": 1716454218048390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218108925, "dur": 17, "args": { "External id": 40138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40138, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40138, "pid": 5, "tid": 7, "ts": 1716454218108925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048442, "dur": 10, "args": { "External id": 40138, "cbid": 211, "correlation": 40138 } }, { "ph": "s", "id": 40138, "pid": 76337, "tid": -914061504, "ts": 1716454218048442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048527, "dur": 1, "args": { "External id": 40154, "cbid": 251, "correlation": 40154 } }, { "ph": "f", "id": 40154, "pid": 76337, "tid": -914061504, "ts": 1716454218048527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048532, "dur": 0, "args": { "External id": 40156, "cbid": 251, "correlation": 40156 } }, { "ph": "f", "id": 40156, "pid": 76337, "tid": -914061504, "ts": 1716454218048532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218108944, "dur": 491, "args": { "External id": 40157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40157, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40157, "pid": 5, "tid": 7, "ts": 1716454218108944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048534, "dur": 13, "args": { "External id": 40157, "cbid": 211, "correlation": 40157 } }, { "ph": "s", "id": 40157, "pid": 76337, "tid": -914061504, "ts": 1716454218048534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218109436, "dur": 65, "args": { "External id": 40165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40165, "pid": 5, "tid": 7, "ts": 1716454218109436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048600, "dur": 12, "args": { "External id": 40165, "cbid": 211, "correlation": 40165 } }, { "ph": "s", "id": 40165, "pid": 76337, "tid": -914061504, "ts": 1716454218048600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218109503, "dur": 66, "args": { "External id": 40173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40173, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40173, "pid": 5, "tid": 7, "ts": 1716454218109503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048630, "dur": 9, "args": { "External id": 40173, "cbid": 211, "correlation": 40173 } }, { "ph": "s", "id": 40173, "pid": 76337, "tid": -914061504, "ts": 1716454218048630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218048712, "dur": 1, "args": { "External id": 40189, "cbid": 251, "correlation": 40189 } }, { "ph": "f", "id": 40189, "pid": 76337, "tid": -914061504, "ts": 1716454218048712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218109571, "dur": 1, "args": { "External id": 40191, "device": 5, "context": 1, "stream": 7, "correlation": 40191, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 40191, "pid": 5, "tid": 7, "ts": 1716454218109571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218048717, "dur": 9, "args": { "External id": 40191, "cbid": 51, "correlation": 40191 } }, { "ph": "s", "id": 40191, "pid": 76337, "tid": -914061504, "ts": 1716454218048717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218109575, "dur": 263, "args": { "External id": 40192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40192, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40192, "pid": 5, "tid": 7, "ts": 1716454218109575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048727, "dur": 11, "args": { "External id": 40192, "cbid": 211, "correlation": 40192 } }, { "ph": "s", "id": 40192, "pid": 76337, "tid": -914061504, "ts": 1716454218048727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218109839, "dur": 14, "args": { "External id": 40200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40200, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40200, "pid": 5, "tid": 7, "ts": 1716454218109839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048769, "dur": 10, "args": { "External id": 40200, "cbid": 211, "correlation": 40200 } }, { "ph": "s", "id": 40200, "pid": 76337, "tid": -914061504, "ts": 1716454218048769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218109854, "dur": 37, "args": { "External id": 40211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40211, "pid": 5, "tid": 7, "ts": 1716454218109854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048837, "dur": 13, "args": { "External id": 40211, "cbid": 211, "correlation": 40211 } }, { "ph": "s", "id": 40211, "pid": 76337, "tid": -914061504, "ts": 1716454218048837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218048947, "dur": 0, "args": { "External id": 40223, "cbid": 317, "correlation": 40223 } }, { "ph": "f", "id": 40223, "pid": 76337, "tid": -914061504, "ts": 1716454218048947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218048948, "dur": 0, "args": { "External id": 40224, "cbid": 203, "correlation": 40224 } }, { "ph": "f", "id": 40224, "pid": 76337, "tid": -914061504, "ts": 1716454218048948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218048949, "dur": 0, "args": { "External id": 40225, "cbid": 205, "correlation": 40225 } }, { "ph": "f", "id": 40225, "pid": 76337, "tid": -914061504, "ts": 1716454218048949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218109893, "dur": 13, "args": { "External id": 40229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40229, "pid": 5, "tid": 7, "ts": 1716454218109893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048965, "dur": 20, "args": { "External id": 40229, "cbid": 211, "correlation": 40229 } }, { "ph": "s", "id": 40229, "pid": 76337, "tid": -914061504, "ts": 1716454218048965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218109907, "dur": 4, "args": { "External id": 40231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40231, "pid": 5, "tid": 7, "ts": 1716454218109907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218048990, "dur": 6, "args": { "External id": 40231, "cbid": 211, "correlation": 40231 } }, { "ph": "s", "id": 40231, "pid": 76337, "tid": -914061504, "ts": 1716454218048990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218048999, "dur": 0, "args": { "External id": 40232, "cbid": 51, "correlation": 40232 } }, { "ph": "s", "id": 40232, "pid": 76337, "tid": -914061504, "ts": 1716454218048999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218109912, "dur": 95, "args": { "External id": 40233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40233, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 40233, "pid": 5, "tid": 7, "ts": 1716454218109912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049000, "dur": 6, "args": { "External id": 40233, "cbid": 211, "correlation": 40233 } }, { "ph": "s", "id": 40233, "pid": 76337, "tid": -914061504, "ts": 1716454218049000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218110008, "dur": 15, "args": { "External id": 40238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40238, "pid": 5, "tid": 7, "ts": 1716454218110008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049029, "dur": 9, "args": { "External id": 40238, "cbid": 211, "correlation": 40238 } }, { "ph": "s", "id": 40238, "pid": 76337, "tid": -914061504, "ts": 1716454218049029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218110025, "dur": 12, "args": { "External id": 40246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40246, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40246, "pid": 5, "tid": 7, "ts": 1716454218110025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049062, "dur": 8, "args": { "External id": 40246, "cbid": 211, "correlation": 40246 } }, { "ph": "s", "id": 40246, "pid": 76337, "tid": -914061504, "ts": 1716454218049062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218110038, "dur": 25, "args": { "External id": 40255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40255, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40255, "pid": 5, "tid": 7, "ts": 1716454218110038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049101, "dur": 11, "args": { "External id": 40255, "cbid": 211, "correlation": 40255 } }, { "ph": "s", "id": 40255, "pid": 76337, "tid": -914061504, "ts": 1716454218049101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218110065, "dur": 24, "args": { "External id": 40275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40275, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 40275, "pid": 5, "tid": 7, "ts": 1716454218110065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049174, "dur": 12, "args": { "External id": 40275, "cbid": 211, "correlation": 40275 } }, { "ph": "s", "id": 40275, "pid": 76337, "tid": -914061504, "ts": 1716454218049174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218110090, "dur": 5, "args": { "External id": 40287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40287, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 40287, "pid": 5, "tid": 7, "ts": 1716454218110090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049196, "dur": 6, "args": { "External id": 40287, "cbid": 211, "correlation": 40287 } }, { "ph": "s", "id": 40287, "pid": 76337, "tid": -914061504, "ts": 1716454218049196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218110096, "dur": 24, "args": { "External id": 40290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40290, "pid": 5, "tid": 7, "ts": 1716454218110096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049215, "dur": 7, "args": { "External id": 40290, "cbid": 211, "correlation": 40290 } }, { "ph": "s", "id": 40290, "pid": 76337, "tid": -914061504, "ts": 1716454218049215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218110122, "dur": 17, "args": { "External id": 40299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40299, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40299, "pid": 5, "tid": 7, "ts": 1716454218110122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049254, "dur": 10, "args": { "External id": 40299, "cbid": 211, "correlation": 40299 } }, { "ph": "s", "id": 40299, "pid": 76337, "tid": -914061504, "ts": 1716454218049254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218049306, "dur": 0, "args": { "External id": 40309, "cbid": 317, "correlation": 40309 } }, { "ph": "f", "id": 40309, "pid": 76337, "tid": -914061504, "ts": 1716454218049306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218049307, "dur": 0, "args": { "External id": 40310, "cbid": 203, "correlation": 40310 } }, { "ph": "f", "id": 40310, "pid": 76337, "tid": -914061504, "ts": 1716454218049307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218049307, "dur": 0, "args": { "External id": 40311, "cbid": 205, "correlation": 40311 } }, { "ph": "f", "id": 40311, "pid": 76337, "tid": -914061504, "ts": 1716454218049307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218110140, "dur": 18, "args": { "External id": 40315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40315, "pid": 5, "tid": 7, "ts": 1716454218110140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049323, "dur": 11, "args": { "External id": 40315, "cbid": 211, "correlation": 40315 } }, { "ph": "s", "id": 40315, "pid": 76337, "tid": -914061504, "ts": 1716454218049323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218110159, "dur": 237, "args": { "External id": 40317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40317, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40317, "pid": 5, "tid": 7, "ts": 1716454218110159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049337, "dur": 5, "args": { "External id": 40317, "cbid": 211, "correlation": 40317 } }, { "ph": "s", "id": 40317, "pid": 76337, "tid": -914061504, "ts": 1716454218049337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218110398, "dur": 1, "args": { "External id": 40319, "device": 5, "context": 1, "stream": 7, "correlation": 40319, "bytes": 960, "memory bandwidth (GB/s)": 0.5084745762711864 } }, { "ph": "f", "id": 40319, "pid": 5, "tid": 7, "ts": 1716454218110398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218049349, "dur": 7, "args": { "External id": 40319, "cbid": 51, "correlation": 40319 } }, { "ph": "s", "id": 40319, "pid": 76337, "tid": -914061504, "ts": 1716454218049349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218110402, "dur": 805, "args": { "External id": 40320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40320, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40320, "pid": 5, "tid": 7, "ts": 1716454218110402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049357, "dur": 8, "args": { "External id": 40320, "cbid": 211, "correlation": 40320 } }, { "ph": "s", "id": 40320, "pid": 76337, "tid": -914061504, "ts": 1716454218049357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218111208, "dur": 13, "args": { "External id": 40322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40322, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40322, "pid": 5, "tid": 7, "ts": 1716454218111208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049370, "dur": 6, "args": { "External id": 40322, "cbid": 211, "correlation": 40322 } }, { "ph": "s", "id": 40322, "pid": 76337, "tid": -914061504, "ts": 1716454218049370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218111222, "dur": 14, "args": { "External id": 40328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40328, "pid": 5, "tid": 7, "ts": 1716454218111222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049399, "dur": 9, "args": { "External id": 40328, "cbid": 211, "correlation": 40328 } }, { "ph": "s", "id": 40328, "pid": 76337, "tid": -914061504, "ts": 1716454218049399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218111238, "dur": 3, "args": { "External id": 40336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40336, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 40336, "pid": 5, "tid": 7, "ts": 1716454218111238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049443, "dur": 9, "args": { "External id": 40336, "cbid": 211, "correlation": 40336 } }, { "ph": "s", "id": 40336, "pid": 76337, "tid": -914061504, "ts": 1716454218049443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218049510, "dur": 1, "args": { "External id": 40352, "cbid": 251, "correlation": 40352 } }, { "ph": "f", "id": 40352, "pid": 76337, "tid": -914061504, "ts": 1716454218049510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218049515, "dur": 0, "args": { "External id": 40354, "cbid": 251, "correlation": 40354 } }, { "ph": "f", "id": 40354, "pid": 76337, "tid": -914061504, "ts": 1716454218049515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218111242, "dur": 13, "args": { "External id": 40355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40355, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40355, "pid": 5, "tid": 7, "ts": 1716454218111242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049517, "dur": 12, "args": { "External id": 40355, "cbid": 211, "correlation": 40355 } }, { "ph": "s", "id": 40355, "pid": 76337, "tid": -914061504, "ts": 1716454218049517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218111257, "dur": 5, "args": { "External id": 40357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40357, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40357, "pid": 5, "tid": 7, "ts": 1716454218111257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049531, "dur": 5, "args": { "External id": 40357, "cbid": 211, "correlation": 40357 } }, { "ph": "s", "id": 40357, "pid": 76337, "tid": -914061504, "ts": 1716454218049531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218111263, "dur": 16, "args": { "External id": 40367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40367, "pid": 5, "tid": 7, "ts": 1716454218111263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049589, "dur": 11, "args": { "External id": 40367, "cbid": 211, "correlation": 40367 } }, { "ph": "s", "id": 40367, "pid": 76337, "tid": -914061504, "ts": 1716454218049589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218111281, "dur": 17, "args": { "External id": 40387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40387, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 40387, "pid": 5, "tid": 7, "ts": 1716454218111281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049655, "dur": 11, "args": { "External id": 40387, "cbid": 211, "correlation": 40387 } }, { "ph": "s", "id": 40387, "pid": 76337, "tid": -914061504, "ts": 1716454218049655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218111299, "dur": 4, "args": { "External id": 40399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40399, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 40399, "pid": 5, "tid": 7, "ts": 1716454218111299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049676, "dur": 7, "args": { "External id": 40399, "cbid": 211, "correlation": 40399 } }, { "ph": "s", "id": 40399, "pid": 76337, "tid": -914061504, "ts": 1716454218049676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218111304, "dur": 16, "args": { "External id": 40402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40402, "pid": 5, "tid": 7, "ts": 1716454218111304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049694, "dur": 6, "args": { "External id": 40402, "cbid": 211, "correlation": 40402 } }, { "ph": "s", "id": 40402, "pid": 76337, "tid": -914061504, "ts": 1716454218049694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218111321, "dur": 10, "args": { "External id": 40411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40411, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40411, "pid": 5, "tid": 7, "ts": 1716454218111321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049736, "dur": 10, "args": { "External id": 40411, "cbid": 211, "correlation": 40411 } }, { "ph": "s", "id": 40411, "pid": 76337, "tid": -914061504, "ts": 1716454218049736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218049798, "dur": 0, "args": { "External id": 40421, "cbid": 317, "correlation": 40421 } }, { "ph": "f", "id": 40421, "pid": 76337, "tid": -914061504, "ts": 1716454218049798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218049799, "dur": 0, "args": { "External id": 40422, "cbid": 203, "correlation": 40422 } }, { "ph": "f", "id": 40422, "pid": 76337, "tid": -914061504, "ts": 1716454218049799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218049799, "dur": 0, "args": { "External id": 40423, "cbid": 205, "correlation": 40423 } }, { "ph": "f", "id": 40423, "pid": 76337, "tid": -914061504, "ts": 1716454218049799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218111333, "dur": 12, "args": { "External id": 40427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40427, "pid": 5, "tid": 7, "ts": 1716454218111333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049813, "dur": 12, "args": { "External id": 40427, "cbid": 211, "correlation": 40427 } }, { "ph": "s", "id": 40427, "pid": 76337, "tid": -914061504, "ts": 1716454218049813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218111346, "dur": 157, "args": { "External id": 40429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40429, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40429, "pid": 5, "tid": 7, "ts": 1716454218111346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049827, "dur": 6, "args": { "External id": 40429, "cbid": 211, "correlation": 40429 } }, { "ph": "s", "id": 40429, "pid": 76337, "tid": -914061504, "ts": 1716454218049827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218111506, "dur": 1, "args": { "External id": 40431, "device": 5, "context": 1, "stream": 7, "correlation": 40431, "bytes": 960, "memory bandwidth (GB/s)": 0.5996252342286071 } }, { "ph": "f", "id": 40431, "pid": 5, "tid": 7, "ts": 1716454218111506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218049839, "dur": 6, "args": { "External id": 40431, "cbid": 51, "correlation": 40431 } }, { "ph": "s", "id": 40431, "pid": 76337, "tid": -914061504, "ts": 1716454218049839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218111509, "dur": 636, "args": { "External id": 40432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40432, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40432, "pid": 5, "tid": 7, "ts": 1716454218111509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049847, "dur": 6, "args": { "External id": 40432, "cbid": 211, "correlation": 40432 } }, { "ph": "s", "id": 40432, "pid": 76337, "tid": -914061504, "ts": 1716454218049847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218112147, "dur": 11, "args": { "External id": 40434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40434, "pid": 5, "tid": 7, "ts": 1716454218112147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049856, "dur": 5, "args": { "External id": 40434, "cbid": 211, "correlation": 40434 } }, { "ph": "s", "id": 40434, "pid": 76337, "tid": -914061504, "ts": 1716454218049856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218112159, "dur": 14, "args": { "External id": 40440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40440, "pid": 5, "tid": 7, "ts": 1716454218112159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049884, "dur": 9, "args": { "External id": 40440, "cbid": 211, "correlation": 40440 } }, { "ph": "s", "id": 40440, "pid": 76337, "tid": -914061504, "ts": 1716454218049884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218049943, "dur": 0, "args": { "External id": 40450, "cbid": 317, "correlation": 40450 } }, { "ph": "f", "id": 40450, "pid": 76337, "tid": -914061504, "ts": 1716454218049943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218049944, "dur": 0, "args": { "External id": 40451, "cbid": 203, "correlation": 40451 } }, { "ph": "f", "id": 40451, "pid": 76337, "tid": -914061504, "ts": 1716454218049944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218049945, "dur": 0, "args": { "External id": 40452, "cbid": 205, "correlation": 40452 } }, { "ph": "f", "id": 40452, "pid": 76337, "tid": -914061504, "ts": 1716454218049945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218112175, "dur": 16, "args": { "External id": 40456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40456, "pid": 5, "tid": 7, "ts": 1716454218112175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049959, "dur": 12, "args": { "External id": 40456, "cbid": 211, "correlation": 40456 } }, { "ph": "s", "id": 40456, "pid": 76337, "tid": -914061504, "ts": 1716454218049959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218112192, "dur": 4, "args": { "External id": 40458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40458, "pid": 5, "tid": 7, "ts": 1716454218112192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049983, "dur": 6, "args": { "External id": 40458, "cbid": 211, "correlation": 40458 } }, { "ph": "s", "id": 40458, "pid": 76337, "tid": -914061504, "ts": 1716454218049983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218049992, "dur": 0, "args": { "External id": 40459, "cbid": 51, "correlation": 40459 } }, { "ph": "s", "id": 40459, "pid": 76337, "tid": -914061504, "ts": 1716454218049992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218112197, "dur": 127, "args": { "External id": 40460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40460, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 40460, "pid": 5, "tid": 7, "ts": 1716454218112197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218049993, "dur": 5, "args": { "External id": 40460, "cbid": 211, "correlation": 40460 } }, { "ph": "s", "id": 40460, "pid": 76337, "tid": -914061504, "ts": 1716454218049993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218112326, "dur": 15, "args": { "External id": 40465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40465, "pid": 5, "tid": 7, "ts": 1716454218112326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050020, "dur": 9, "args": { "External id": 40465, "cbid": 211, "correlation": 40465 } }, { "ph": "s", "id": 40465, "pid": 76337, "tid": -914061504, "ts": 1716454218050020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218112343, "dur": 11, "args": { "External id": 40473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40473, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40473, "pid": 5, "tid": 7, "ts": 1716454218112343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050050, "dur": 9, "args": { "External id": 40473, "cbid": 211, "correlation": 40473 } }, { "ph": "s", "id": 40473, "pid": 76337, "tid": -914061504, "ts": 1716454218050050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218112355, "dur": 10, "args": { "External id": 40481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40481, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40481, "pid": 5, "tid": 7, "ts": 1716454218112355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050079, "dur": 8, "args": { "External id": 40481, "cbid": 211, "correlation": 40481 } }, { "ph": "s", "id": 40481, "pid": 76337, "tid": -914061504, "ts": 1716454218050079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218112366, "dur": 18, "args": { "External id": 40501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40501, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 40501, "pid": 5, "tid": 7, "ts": 1716454218112366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050163, "dur": 12, "args": { "External id": 40501, "cbid": 211, "correlation": 40501 } }, { "ph": "s", "id": 40501, "pid": 76337, "tid": -914061504, "ts": 1716454218050163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218112386, "dur": 4, "args": { "External id": 40513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40513, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 40513, "pid": 5, "tid": 7, "ts": 1716454218112386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050186, "dur": 6, "args": { "External id": 40513, "cbid": 211, "correlation": 40513 } }, { "ph": "s", "id": 40513, "pid": 76337, "tid": -914061504, "ts": 1716454218050186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218112392, "dur": 17, "args": { "External id": 40516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40516, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40516, "pid": 5, "tid": 7, "ts": 1716454218112392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050204, "dur": 7, "args": { "External id": 40516, "cbid": 211, "correlation": 40516 } }, { "ph": "s", "id": 40516, "pid": 76337, "tid": -914061504, "ts": 1716454218050204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218050261, "dur": 0, "args": { "External id": 40527, "cbid": 317, "correlation": 40527 } }, { "ph": "f", "id": 40527, "pid": 76337, "tid": -914061504, "ts": 1716454218050261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218050262, "dur": 0, "args": { "External id": 40528, "cbid": 203, "correlation": 40528 } }, { "ph": "f", "id": 40528, "pid": 76337, "tid": -914061504, "ts": 1716454218050262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218050263, "dur": 0, "args": { "External id": 40529, "cbid": 205, "correlation": 40529 } }, { "ph": "f", "id": 40529, "pid": 76337, "tid": -914061504, "ts": 1716454218050263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218112410, "dur": 11, "args": { "External id": 40533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40533, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40533, "pid": 5, "tid": 7, "ts": 1716454218112410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050276, "dur": 12, "args": { "External id": 40533, "cbid": 211, "correlation": 40533 } }, { "ph": "s", "id": 40533, "pid": 76337, "tid": -914061504, "ts": 1716454218050276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218112422, "dur": 3, "args": { "External id": 40535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40535, "pid": 5, "tid": 7, "ts": 1716454218112422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050292, "dur": 6, "args": { "External id": 40535, "cbid": 211, "correlation": 40535 } }, { "ph": "s", "id": 40535, "pid": 76337, "tid": -914061504, "ts": 1716454218050292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218050300, "dur": 0, "args": { "External id": 40536, "cbid": 51, "correlation": 40536 } }, { "ph": "s", "id": 40536, "pid": 76337, "tid": -914061504, "ts": 1716454218050300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218112426, "dur": 88, "args": { "External id": 40537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40537, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 40537, "pid": 5, "tid": 7, "ts": 1716454218112426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050301, "dur": 5, "args": { "External id": 40537, "cbid": 211, "correlation": 40537 } }, { "ph": "s", "id": 40537, "pid": 76337, "tid": -914061504, "ts": 1716454218050301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218112515, "dur": 15, "args": { "External id": 40542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40542, "pid": 5, "tid": 7, "ts": 1716454218112515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050328, "dur": 8, "args": { "External id": 40542, "cbid": 211, "correlation": 40542 } }, { "ph": "s", "id": 40542, "pid": 76337, "tid": -914061504, "ts": 1716454218050328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218112532, "dur": 82, "args": { "External id": 40551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40551, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40551, "pid": 5, "tid": 7, "ts": 1716454218112532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050409, "dur": 15, "args": { "External id": 40551, "cbid": 211, "correlation": 40551 } }, { "ph": "s", "id": 40551, "pid": 76337, "tid": -914061504, "ts": 1716454218050409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218112616, "dur": 30, "args": { "External id": 40573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40573, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40573, "pid": 5, "tid": 7, "ts": 1716454218112616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050467, "dur": 10, "args": { "External id": 40573, "cbid": 211, "correlation": 40573 } }, { "ph": "s", "id": 40573, "pid": 76337, "tid": -914061504, "ts": 1716454218050467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218050556, "dur": 1, "args": { "External id": 40584, "cbid": 251, "correlation": 40584 } }, { "ph": "f", "id": 40584, "pid": 76337, "tid": -914061504, "ts": 1716454218050556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218112647, "dur": 159, "args": { "External id": 40585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40585, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40585, "pid": 5, "tid": 7, "ts": 1716454218112647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050562, "dur": 14, "args": { "External id": 40585, "cbid": 211, "correlation": 40585 } }, { "ph": "s", "id": 40585, "pid": 76337, "tid": -914061504, "ts": 1716454218050562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218050633, "dur": 1, "args": { "External id": 40596, "cbid": 251, "correlation": 40596 } }, { "ph": "f", "id": 40596, "pid": 76337, "tid": -914061504, "ts": 1716454218050633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218112807, "dur": 155, "args": { "External id": 40597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40597, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40597, "pid": 5, "tid": 7, "ts": 1716454218112807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050637, "dur": 11, "args": { "External id": 40597, "cbid": 211, "correlation": 40597 } }, { "ph": "s", "id": 40597, "pid": 76337, "tid": -914061504, "ts": 1716454218050637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218050702, "dur": 1, "args": { "External id": 40608, "cbid": 251, "correlation": 40608 } }, { "ph": "f", "id": 40608, "pid": 76337, "tid": -914061504, "ts": 1716454218050702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218112963, "dur": 155, "args": { "External id": 40609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40609, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40609, "pid": 5, "tid": 7, "ts": 1716454218112963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050706, "dur": 11, "args": { "External id": 40609, "cbid": 211, "correlation": 40609 } }, { "ph": "s", "id": 40609, "pid": 76337, "tid": -914061504, "ts": 1716454218050706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218113120, "dur": 330, "args": { "External id": 40634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40634, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40634, "pid": 5, "tid": 7, "ts": 1716454218113120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050790, "dur": 13, "args": { "External id": 40634, "cbid": 211, "correlation": 40634 } }, { "ph": "s", "id": 40634, "pid": 76337, "tid": -914061504, "ts": 1716454218050790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218050890, "dur": 1, "args": { "External id": 40652, "cbid": 251, "correlation": 40652 } }, { "ph": "f", "id": 40652, "pid": 76337, "tid": -914061504, "ts": 1716454218050890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218113451, "dur": 161, "args": { "External id": 40654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40654, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40654, "pid": 5, "tid": 7, "ts": 1716454218113451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050896, "dur": 13, "args": { "External id": 40654, "cbid": 211, "correlation": 40654 } }, { "ph": "s", "id": 40654, "pid": 76337, "tid": -914061504, "ts": 1716454218050896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218113614, "dur": 19, "args": { "External id": 40662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40662, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40662, "pid": 5, "tid": 7, "ts": 1716454218113614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218050966, "dur": 20, "args": { "External id": 40662, "cbid": 211, "correlation": 40662 } }, { "ph": "s", "id": 40662, "pid": 76337, "tid": -914061504, "ts": 1716454218050966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218113635, "dur": 28, "args": { "External id": 40670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40670, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40670, "pid": 5, "tid": 7, "ts": 1716454218113635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051013, "dur": 9, "args": { "External id": 40670, "cbid": 211, "correlation": 40670 } }, { "ph": "s", "id": 40670, "pid": 76337, "tid": -914061504, "ts": 1716454218051013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218113663, "dur": 18, "args": { "External id": 40681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40681, "pid": 5, "tid": 7, "ts": 1716454218113663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051087, "dur": 12, "args": { "External id": 40681, "cbid": 211, "correlation": 40681 } }, { "ph": "s", "id": 40681, "pid": 76337, "tid": -914061504, "ts": 1716454218051087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218113683, "dur": 15, "args": { "External id": 40703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40703, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40703, "pid": 5, "tid": 7, "ts": 1716454218113683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051119, "dur": 7, "args": { "External id": 40703, "cbid": 211, "correlation": 40703 } }, { "ph": "s", "id": 40703, "pid": 76337, "tid": -914061504, "ts": 1716454218051119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051203, "dur": 1, "args": { "External id": 40714, "cbid": 251, "correlation": 40714 } }, { "ph": "f", "id": 40714, "pid": 76337, "tid": -914061504, "ts": 1716454218051203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218113699, "dur": 87, "args": { "External id": 40715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40715, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40715, "pid": 5, "tid": 7, "ts": 1716454218113699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051208, "dur": 13, "args": { "External id": 40715, "cbid": 211, "correlation": 40715 } }, { "ph": "s", "id": 40715, "pid": 76337, "tid": -914061504, "ts": 1716454218051208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051279, "dur": 1, "args": { "External id": 40726, "cbid": 251, "correlation": 40726 } }, { "ph": "f", "id": 40726, "pid": 76337, "tid": -914061504, "ts": 1716454218051279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051282, "dur": 0, "args": { "External id": 40727, "cbid": 251, "correlation": 40727 } }, { "ph": "f", "id": 40727, "pid": 76337, "tid": -914061504, "ts": 1716454218051282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218113788, "dur": 12, "args": { "External id": 40728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40728, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40728, "pid": 5, "tid": 7, "ts": 1716454218113788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051284, "dur": 12, "args": { "External id": 40728, "cbid": 211, "correlation": 40728 } }, { "ph": "s", "id": 40728, "pid": 76337, "tid": -914061504, "ts": 1716454218051284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218113801, "dur": 5, "args": { "External id": 40730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40730, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40730, "pid": 5, "tid": 7, "ts": 1716454218113801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051298, "dur": 6, "args": { "External id": 40730, "cbid": 211, "correlation": 40730 } }, { "ph": "s", "id": 40730, "pid": 76337, "tid": -914061504, "ts": 1716454218051298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051355, "dur": 1, "args": { "External id": 40741, "cbid": 251, "correlation": 40741 } }, { "ph": "f", "id": 40741, "pid": 76337, "tid": -914061504, "ts": 1716454218051355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051358, "dur": 0, "args": { "External id": 40742, "cbid": 251, "correlation": 40742 } }, { "ph": "f", "id": 40742, "pid": 76337, "tid": -914061504, "ts": 1716454218051358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218113808, "dur": 8, "args": { "External id": 40743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40743, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40743, "pid": 5, "tid": 7, "ts": 1716454218113808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051359, "dur": 11, "args": { "External id": 40743, "cbid": 211, "correlation": 40743 } }, { "ph": "s", "id": 40743, "pid": 76337, "tid": -914061504, "ts": 1716454218051359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218113817, "dur": 3, "args": { "External id": 40745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40745, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40745, "pid": 5, "tid": 7, "ts": 1716454218113817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051372, "dur": 5, "args": { "External id": 40745, "cbid": 211, "correlation": 40745 } }, { "ph": "s", "id": 40745, "pid": 76337, "tid": -914061504, "ts": 1716454218051372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218113821, "dur": 54, "args": { "External id": 40770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40770, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40770, "pid": 5, "tid": 7, "ts": 1716454218113821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051448, "dur": 13, "args": { "External id": 40770, "cbid": 211, "correlation": 40770 } }, { "ph": "s", "id": 40770, "pid": 76337, "tid": -914061504, "ts": 1716454218051448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051546, "dur": 1, "args": { "External id": 40788, "cbid": 251, "correlation": 40788 } }, { "ph": "f", "id": 40788, "pid": 76337, "tid": -914061504, "ts": 1716454218051546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218113877, "dur": 89, "args": { "External id": 40790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40790, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40790, "pid": 5, "tid": 7, "ts": 1716454218113877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051551, "dur": 14, "args": { "External id": 40790, "cbid": 211, "correlation": 40790 } }, { "ph": "s", "id": 40790, "pid": 76337, "tid": -914061504, "ts": 1716454218051551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218113967, "dur": 10, "args": { "External id": 40798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40798, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40798, "pid": 5, "tid": 7, "ts": 1716454218113967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051622, "dur": 12, "args": { "External id": 40798, "cbid": 211, "correlation": 40798 } }, { "ph": "s", "id": 40798, "pid": 76337, "tid": -914061504, "ts": 1716454218051622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218113978, "dur": 21, "args": { "External id": 40806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40806, "pid": 5, "tid": 7, "ts": 1716454218113978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051663, "dur": 9, "args": { "External id": 40806, "cbid": 211, "correlation": 40806 } }, { "ph": "s", "id": 40806, "pid": 76337, "tid": -914061504, "ts": 1716454218051663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218114001, "dur": 17, "args": { "External id": 40828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40828, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40828, "pid": 5, "tid": 7, "ts": 1716454218114001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051713, "dur": 10, "args": { "External id": 40828, "cbid": 211, "correlation": 40828 } }, { "ph": "s", "id": 40828, "pid": 76337, "tid": -914061504, "ts": 1716454218051713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051800, "dur": 1, "args": { "External id": 40844, "cbid": 251, "correlation": 40844 } }, { "ph": "f", "id": 40844, "pid": 76337, "tid": -914061504, "ts": 1716454218051800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051804, "dur": 0, "args": { "External id": 40846, "cbid": 251, "correlation": 40846 } }, { "ph": "f", "id": 40846, "pid": 76337, "tid": -914061504, "ts": 1716454218051804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218114019, "dur": 492, "args": { "External id": 40847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40847, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40847, "pid": 5, "tid": 7, "ts": 1716454218114019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051806, "dur": 13, "args": { "External id": 40847, "cbid": 211, "correlation": 40847 } }, { "ph": "s", "id": 40847, "pid": 76337, "tid": -914061504, "ts": 1716454218051806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218114513, "dur": 64, "args": { "External id": 40855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40855, "pid": 5, "tid": 7, "ts": 1716454218114513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051872, "dur": 13, "args": { "External id": 40855, "cbid": 211, "correlation": 40855 } }, { "ph": "s", "id": 40855, "pid": 76337, "tid": -914061504, "ts": 1716454218051872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218114578, "dur": 65, "args": { "External id": 40863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40863, "pid": 5, "tid": 7, "ts": 1716454218114578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218051902, "dur": 8, "args": { "External id": 40863, "cbid": 211, "correlation": 40863 } }, { "ph": "s", "id": 40863, "pid": 76337, "tid": -914061504, "ts": 1716454218051902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218051990, "dur": 1, "args": { "External id": 40879, "cbid": 251, "correlation": 40879 } }, { "ph": "f", "id": 40879, "pid": 76337, "tid": -914061504, "ts": 1716454218051990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218114646, "dur": 1, "args": { "External id": 40881, "device": 5, "context": 1, "stream": 7, "correlation": 40881, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 40881, "pid": 5, "tid": 7, "ts": 1716454218114646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218051995, "dur": 9, "args": { "External id": 40881, "cbid": 51, "correlation": 40881 } }, { "ph": "s", "id": 40881, "pid": 76337, "tid": -914061504, "ts": 1716454218051995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218114649, "dur": 264, "args": { "External id": 40882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40882, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40882, "pid": 5, "tid": 7, "ts": 1716454218114649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052006, "dur": 12, "args": { "External id": 40882, "cbid": 211, "correlation": 40882 } }, { "ph": "s", "id": 40882, "pid": 76337, "tid": -914061504, "ts": 1716454218052006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218114915, "dur": 13, "args": { "External id": 40890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40890, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40890, "pid": 5, "tid": 7, "ts": 1716454218114915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052050, "dur": 10, "args": { "External id": 40890, "cbid": 211, "correlation": 40890 } }, { "ph": "s", "id": 40890, "pid": 76337, "tid": -914061504, "ts": 1716454218052050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218114930, "dur": 37, "args": { "External id": 40901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40901, "pid": 5, "tid": 7, "ts": 1716454218114930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052118, "dur": 13, "args": { "External id": 40901, "cbid": 211, "correlation": 40901 } }, { "ph": "s", "id": 40901, "pid": 76337, "tid": -914061504, "ts": 1716454218052118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218052182, "dur": 0, "args": { "External id": 40913, "cbid": 317, "correlation": 40913 } }, { "ph": "f", "id": 40913, "pid": 76337, "tid": -914061504, "ts": 1716454218052182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218052183, "dur": 0, "args": { "External id": 40914, "cbid": 203, "correlation": 40914 } }, { "ph": "f", "id": 40914, "pid": 76337, "tid": -914061504, "ts": 1716454218052183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218052184, "dur": 0, "args": { "External id": 40915, "cbid": 205, "correlation": 40915 } }, { "ph": "f", "id": 40915, "pid": 76337, "tid": -914061504, "ts": 1716454218052184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218114968, "dur": 12, "args": { "External id": 40919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40919, "pid": 5, "tid": 7, "ts": 1716454218114968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052200, "dur": 12, "args": { "External id": 40919, "cbid": 211, "correlation": 40919 } }, { "ph": "s", "id": 40919, "pid": 76337, "tid": -914061504, "ts": 1716454218052200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218114982, "dur": 4, "args": { "External id": 40921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 40921, "pid": 5, "tid": 7, "ts": 1716454218114982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052216, "dur": 6, "args": { "External id": 40921, "cbid": 211, "correlation": 40921 } }, { "ph": "s", "id": 40921, "pid": 76337, "tid": -914061504, "ts": 1716454218052216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218052224, "dur": 0, "args": { "External id": 40922, "cbid": 51, "correlation": 40922 } }, { "ph": "s", "id": 40922, "pid": 76337, "tid": -914061504, "ts": 1716454218052224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218114987, "dur": 94, "args": { "External id": 40923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40923, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 40923, "pid": 5, "tid": 7, "ts": 1716454218114987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052225, "dur": 5, "args": { "External id": 40923, "cbid": 211, "correlation": 40923 } }, { "ph": "s", "id": 40923, "pid": 76337, "tid": -914061504, "ts": 1716454218052225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218115082, "dur": 15, "args": { "External id": 40928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40928, "pid": 5, "tid": 7, "ts": 1716454218115082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052252, "dur": 9, "args": { "External id": 40928, "cbid": 211, "correlation": 40928 } }, { "ph": "s", "id": 40928, "pid": 76337, "tid": -914061504, "ts": 1716454218052252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218115099, "dur": 12, "args": { "External id": 40936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40936, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40936, "pid": 5, "tid": 7, "ts": 1716454218115099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052284, "dur": 9, "args": { "External id": 40936, "cbid": 211, "correlation": 40936 } }, { "ph": "s", "id": 40936, "pid": 76337, "tid": -914061504, "ts": 1716454218052284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218115112, "dur": 56, "args": { "External id": 40947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40947, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40947, "pid": 5, "tid": 7, "ts": 1716454218115112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052350, "dur": 13, "args": { "External id": 40947, "cbid": 211, "correlation": 40947 } }, { "ph": "s", "id": 40947, "pid": 76337, "tid": -914061504, "ts": 1716454218052350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218052407, "dur": 0, "args": { "External id": 40957, "cbid": 317, "correlation": 40957 } }, { "ph": "f", "id": 40957, "pid": 76337, "tid": -914061504, "ts": 1716454218052407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218052407, "dur": 0, "args": { "External id": 40958, "cbid": 203, "correlation": 40958 } }, { "ph": "f", "id": 40958, "pid": 76337, "tid": -914061504, "ts": 1716454218052407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218052408, "dur": 0, "args": { "External id": 40959, "cbid": 205, "correlation": 40959 } }, { "ph": "f", "id": 40959, "pid": 76337, "tid": -914061504, "ts": 1716454218052408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218115169, "dur": 39, "args": { "External id": 40963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40963, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40963, "pid": 5, "tid": 7, "ts": 1716454218115169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052424, "dur": 12, "args": { "External id": 40963, "cbid": 211, "correlation": 40963 } }, { "ph": "s", "id": 40963, "pid": 76337, "tid": -914061504, "ts": 1716454218052424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218115209, "dur": 159, "args": { "External id": 40965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40965, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40965, "pid": 5, "tid": 7, "ts": 1716454218115209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052439, "dur": 6, "args": { "External id": 40965, "cbid": 211, "correlation": 40965 } }, { "ph": "s", "id": 40965, "pid": 76337, "tid": -914061504, "ts": 1716454218052439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218115369, "dur": 1958, "args": { "External id": 40967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40967, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 40967, "pid": 5, "tid": 7, "ts": 1716454218115369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052452, "dur": 8, "args": { "External id": 40967, "cbid": 211, "correlation": 40967 } }, { "ph": "s", "id": 40967, "pid": 76337, "tid": -914061504, "ts": 1716454218052452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218117329, "dur": 40, "args": { "External id": 40969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40969, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40969, "pid": 5, "tid": 7, "ts": 1716454218117329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052464, "dur": 5, "args": { "External id": 40969, "cbid": 211, "correlation": 40969 } }, { "ph": "s", "id": 40969, "pid": 76337, "tid": -914061504, "ts": 1716454218052464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218117370, "dur": 59, "args": { "External id": 40975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40975, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40975, "pid": 5, "tid": 7, "ts": 1716454218117370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052492, "dur": 8, "args": { "External id": 40975, "cbid": 211, "correlation": 40975 } }, { "ph": "s", "id": 40975, "pid": 76337, "tid": -914061504, "ts": 1716454218052492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218117429, "dur": 86, "args": { "External id": 40984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 40984, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 40984, "pid": 5, "tid": 7, "ts": 1716454218117429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052582, "dur": 13, "args": { "External id": 40984, "cbid": 211, "correlation": 40984 } }, { "ph": "s", "id": 40984, "pid": 76337, "tid": -914061504, "ts": 1716454218052582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218117516, "dur": 72, "args": { "External id": 41004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41004, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41004, "pid": 5, "tid": 7, "ts": 1716454218117516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052651, "dur": 12, "args": { "External id": 41004, "cbid": 211, "correlation": 41004 } }, { "ph": "s", "id": 41004, "pid": 76337, "tid": -914061504, "ts": 1716454218052651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218117590, "dur": 5, "args": { "External id": 41016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41016, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 41016, "pid": 5, "tid": 7, "ts": 1716454218117590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052673, "dur": 6, "args": { "External id": 41016, "cbid": 211, "correlation": 41016 } }, { "ph": "s", "id": 41016, "pid": 76337, "tid": -914061504, "ts": 1716454218052673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218117596, "dur": 82, "args": { "External id": 41019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41019, "pid": 5, "tid": 7, "ts": 1716454218117596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052691, "dur": 7, "args": { "External id": 41019, "cbid": 211, "correlation": 41019 } }, { "ph": "s", "id": 41019, "pid": 76337, "tid": -914061504, "ts": 1716454218052691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218117679, "dur": 53, "args": { "External id": 41028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41028, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41028, "pid": 5, "tid": 7, "ts": 1716454218117679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052731, "dur": 9, "args": { "External id": 41028, "cbid": 211, "correlation": 41028 } }, { "ph": "s", "id": 41028, "pid": 76337, "tid": -914061504, "ts": 1716454218052731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218052782, "dur": 0, "args": { "External id": 41038, "cbid": 317, "correlation": 41038 } }, { "ph": "f", "id": 41038, "pid": 76337, "tid": -914061504, "ts": 1716454218052782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218052783, "dur": 0, "args": { "External id": 41039, "cbid": 203, "correlation": 41039 } }, { "ph": "f", "id": 41039, "pid": 76337, "tid": -914061504, "ts": 1716454218052783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218052784, "dur": 0, "args": { "External id": 41040, "cbid": 205, "correlation": 41040 } }, { "ph": "f", "id": 41040, "pid": 76337, "tid": -914061504, "ts": 1716454218052784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218117733, "dur": 56, "args": { "External id": 41044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41044, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41044, "pid": 5, "tid": 7, "ts": 1716454218117733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052800, "dur": 12, "args": { "External id": 41044, "cbid": 211, "correlation": 41044 } }, { "ph": "s", "id": 41044, "pid": 76337, "tid": -914061504, "ts": 1716454218052800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218117790, "dur": 120, "args": { "External id": 41046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41046, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41046, "pid": 5, "tid": 7, "ts": 1716454218117790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052815, "dur": 5, "args": { "External id": 41046, "cbid": 211, "correlation": 41046 } }, { "ph": "s", "id": 41046, "pid": 76337, "tid": -914061504, "ts": 1716454218052815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218117912, "dur": 1856, "args": { "External id": 41048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41048, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41048, "pid": 5, "tid": 7, "ts": 1716454218117912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052826, "dur": 6, "args": { "External id": 41048, "cbid": 211, "correlation": 41048 } }, { "ph": "s", "id": 41048, "pid": 76337, "tid": -914061504, "ts": 1716454218052826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218119769, "dur": 19, "args": { "External id": 41050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41050, "pid": 5, "tid": 7, "ts": 1716454218119769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052835, "dur": 5, "args": { "External id": 41050, "cbid": 211, "correlation": 41050 } }, { "ph": "s", "id": 41050, "pid": 76337, "tid": -914061504, "ts": 1716454218052835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218119789, "dur": 32, "args": { "External id": 41056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41056, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41056, "pid": 5, "tid": 7, "ts": 1716454218119789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052864, "dur": 8, "args": { "External id": 41056, "cbid": 211, "correlation": 41056 } }, { "ph": "s", "id": 41056, "pid": 76337, "tid": -914061504, "ts": 1716454218052864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218119822, "dur": 3, "args": { "External id": 41064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41064, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 41064, "pid": 5, "tid": 7, "ts": 1716454218119822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052906, "dur": 11, "args": { "External id": 41064, "cbid": 211, "correlation": 41064 } }, { "ph": "s", "id": 41064, "pid": 76337, "tid": -914061504, "ts": 1716454218052906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218052985, "dur": 1, "args": { "External id": 41080, "cbid": 251, "correlation": 41080 } }, { "ph": "f", "id": 41080, "pid": 76337, "tid": -914061504, "ts": 1716454218052985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218052990, "dur": 0, "args": { "External id": 41082, "cbid": 251, "correlation": 41082 } }, { "ph": "f", "id": 41082, "pid": 76337, "tid": -914061504, "ts": 1716454218052990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218119827, "dur": 12, "args": { "External id": 41083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41083, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 41083, "pid": 5, "tid": 7, "ts": 1716454218119827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218052992, "dur": 12, "args": { "External id": 41083, "cbid": 211, "correlation": 41083 } }, { "ph": "s", "id": 41083, "pid": 76337, "tid": -914061504, "ts": 1716454218052992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218119841, "dur": 5, "args": { "External id": 41085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41085, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 41085, "pid": 5, "tid": 7, "ts": 1716454218119841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053006, "dur": 6, "args": { "External id": 41085, "cbid": 211, "correlation": 41085 } }, { "ph": "s", "id": 41085, "pid": 76337, "tid": -914061504, "ts": 1716454218053006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218119847, "dur": 29, "args": { "External id": 41095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41095, "pid": 5, "tid": 7, "ts": 1716454218119847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053066, "dur": 12, "args": { "External id": 41095, "cbid": 211, "correlation": 41095 } }, { "ph": "s", "id": 41095, "pid": 76337, "tid": -914061504, "ts": 1716454218053066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218119877, "dur": 30, "args": { "External id": 41115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41115, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41115, "pid": 5, "tid": 7, "ts": 1716454218119877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053132, "dur": 11, "args": { "External id": 41115, "cbid": 211, "correlation": 41115 } }, { "ph": "s", "id": 41115, "pid": 76337, "tid": -914061504, "ts": 1716454218053132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218119909, "dur": 4, "args": { "External id": 41127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41127, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 41127, "pid": 5, "tid": 7, "ts": 1716454218119909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053153, "dur": 6, "args": { "External id": 41127, "cbid": 211, "correlation": 41127 } }, { "ph": "s", "id": 41127, "pid": 76337, "tid": -914061504, "ts": 1716454218053153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218119914, "dur": 30, "args": { "External id": 41130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41130, "pid": 5, "tid": 7, "ts": 1716454218119914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053172, "dur": 7, "args": { "External id": 41130, "cbid": 211, "correlation": 41130 } }, { "ph": "s", "id": 41130, "pid": 76337, "tid": -914061504, "ts": 1716454218053172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218119946, "dur": 20, "args": { "External id": 41139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41139, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41139, "pid": 5, "tid": 7, "ts": 1716454218119946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053211, "dur": 10, "args": { "External id": 41139, "cbid": 211, "correlation": 41139 } }, { "ph": "s", "id": 41139, "pid": 76337, "tid": -914061504, "ts": 1716454218053211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218053275, "dur": 0, "args": { "External id": 41149, "cbid": 317, "correlation": 41149 } }, { "ph": "f", "id": 41149, "pid": 76337, "tid": -914061504, "ts": 1716454218053275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218053276, "dur": 0, "args": { "External id": 41150, "cbid": 203, "correlation": 41150 } }, { "ph": "f", "id": 41150, "pid": 76337, "tid": -914061504, "ts": 1716454218053276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218053276, "dur": 0, "args": { "External id": 41151, "cbid": 205, "correlation": 41151 } }, { "ph": "f", "id": 41151, "pid": 76337, "tid": -914061504, "ts": 1716454218053276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218119967, "dur": 22, "args": { "External id": 41155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41155, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41155, "pid": 5, "tid": 7, "ts": 1716454218119967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053291, "dur": 12, "args": { "External id": 41155, "cbid": 211, "correlation": 41155 } }, { "ph": "s", "id": 41155, "pid": 76337, "tid": -914061504, "ts": 1716454218053291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218119990, "dur": 43, "args": { "External id": 41157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41157, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41157, "pid": 5, "tid": 7, "ts": 1716454218119990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053306, "dur": 5, "args": { "External id": 41157, "cbid": 211, "correlation": 41157 } }, { "ph": "s", "id": 41157, "pid": 76337, "tid": -914061504, "ts": 1716454218053306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218120034, "dur": 635, "args": { "External id": 41159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41159, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41159, "pid": 5, "tid": 7, "ts": 1716454218120034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053318, "dur": 6, "args": { "External id": 41159, "cbid": 211, "correlation": 41159 } }, { "ph": "s", "id": 41159, "pid": 76337, "tid": -914061504, "ts": 1716454218053318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218120670, "dur": 21, "args": { "External id": 41161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41161, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41161, "pid": 5, "tid": 7, "ts": 1716454218120670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053328, "dur": 5, "args": { "External id": 41161, "cbid": 211, "correlation": 41161 } }, { "ph": "s", "id": 41161, "pid": 76337, "tid": -914061504, "ts": 1716454218053328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218120692, "dur": 32, "args": { "External id": 41167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41167, "pid": 5, "tid": 7, "ts": 1716454218120692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053355, "dur": 8, "args": { "External id": 41167, "cbid": 211, "correlation": 41167 } }, { "ph": "s", "id": 41167, "pid": 76337, "tid": -914061504, "ts": 1716454218053355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218053412, "dur": 0, "args": { "External id": 41177, "cbid": 317, "correlation": 41177 } }, { "ph": "f", "id": 41177, "pid": 76337, "tid": -914061504, "ts": 1716454218053412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218053413, "dur": 0, "args": { "External id": 41178, "cbid": 203, "correlation": 41178 } }, { "ph": "f", "id": 41178, "pid": 76337, "tid": -914061504, "ts": 1716454218053413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218053414, "dur": 0, "args": { "External id": 41179, "cbid": 205, "correlation": 41179 } }, { "ph": "f", "id": 41179, "pid": 76337, "tid": -914061504, "ts": 1716454218053414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218120726, "dur": 57, "args": { "External id": 41183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41183, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41183, "pid": 5, "tid": 7, "ts": 1716454218120726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053427, "dur": 12, "args": { "External id": 41183, "cbid": 211, "correlation": 41183 } }, { "ph": "s", "id": 41183, "pid": 76337, "tid": -914061504, "ts": 1716454218053427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218120784, "dur": 263, "args": { "External id": 41185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41185, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41185, "pid": 5, "tid": 7, "ts": 1716454218120784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053445, "dur": 8, "args": { "External id": 41185, "cbid": 211, "correlation": 41185 } }, { "ph": "s", "id": 41185, "pid": 76337, "tid": -914061504, "ts": 1716454218053445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218121048, "dur": 21, "args": { "External id": 41187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41187, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41187, "pid": 5, "tid": 7, "ts": 1716454218121048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053456, "dur": 5, "args": { "External id": 41187, "cbid": 211, "correlation": 41187 } }, { "ph": "s", "id": 41187, "pid": 76337, "tid": -914061504, "ts": 1716454218053456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218121070, "dur": 32, "args": { "External id": 41193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41193, "pid": 5, "tid": 7, "ts": 1716454218121070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053483, "dur": 8, "args": { "External id": 41193, "cbid": 211, "correlation": 41193 } }, { "ph": "s", "id": 41193, "pid": 76337, "tid": -914061504, "ts": 1716454218053483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218121103, "dur": 27, "args": { "External id": 41201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41201, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41201, "pid": 5, "tid": 7, "ts": 1716454218121103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053511, "dur": 8, "args": { "External id": 41201, "cbid": 211, "correlation": 41201 } }, { "ph": "s", "id": 41201, "pid": 76337, "tid": -914061504, "ts": 1716454218053511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218121132, "dur": 20, "args": { "External id": 41209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41209, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41209, "pid": 5, "tid": 7, "ts": 1716454218121132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053541, "dur": 8, "args": { "External id": 41209, "cbid": 211, "correlation": 41209 } }, { "ph": "s", "id": 41209, "pid": 76337, "tid": -914061504, "ts": 1716454218053541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218121153, "dur": 30, "args": { "External id": 41229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41229, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41229, "pid": 5, "tid": 7, "ts": 1716454218121153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053623, "dur": 13, "args": { "External id": 41229, "cbid": 211, "correlation": 41229 } }, { "ph": "s", "id": 41229, "pid": 76337, "tid": -914061504, "ts": 1716454218053623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218121184, "dur": 4, "args": { "External id": 41241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41241, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 41241, "pid": 5, "tid": 7, "ts": 1716454218121184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053645, "dur": 6, "args": { "External id": 41241, "cbid": 211, "correlation": 41241 } }, { "ph": "s", "id": 41241, "pid": 76337, "tid": -914061504, "ts": 1716454218053645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218121190, "dur": 30, "args": { "External id": 41244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41244, "pid": 5, "tid": 7, "ts": 1716454218121190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053663, "dur": 6, "args": { "External id": 41244, "cbid": 211, "correlation": 41244 } }, { "ph": "s", "id": 41244, "pid": 76337, "tid": -914061504, "ts": 1716454218053663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218053719, "dur": 0, "args": { "External id": 41255, "cbid": 317, "correlation": 41255 } }, { "ph": "f", "id": 41255, "pid": 76337, "tid": -914061504, "ts": 1716454218053719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218053720, "dur": 0, "args": { "External id": 41256, "cbid": 203, "correlation": 41256 } }, { "ph": "f", "id": 41256, "pid": 76337, "tid": -914061504, "ts": 1716454218053720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218053721, "dur": 0, "args": { "External id": 41257, "cbid": 205, "correlation": 41257 } }, { "ph": "f", "id": 41257, "pid": 76337, "tid": -914061504, "ts": 1716454218053721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218121221, "dur": 21, "args": { "External id": 41261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41261, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41261, "pid": 5, "tid": 7, "ts": 1716454218121221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053735, "dur": 12, "args": { "External id": 41261, "cbid": 211, "correlation": 41261 } }, { "ph": "s", "id": 41261, "pid": 76337, "tid": -914061504, "ts": 1716454218053735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218121244, "dur": 103, "args": { "External id": 41263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41263, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41263, "pid": 5, "tid": 7, "ts": 1716454218121244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053752, "dur": 6, "args": { "External id": 41263, "cbid": 211, "correlation": 41263 } }, { "ph": "s", "id": 41263, "pid": 76337, "tid": -914061504, "ts": 1716454218053752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218121348, "dur": 22, "args": { "External id": 41265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41265, "pid": 5, "tid": 7, "ts": 1716454218121348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053762, "dur": 5, "args": { "External id": 41265, "cbid": 211, "correlation": 41265 } }, { "ph": "s", "id": 41265, "pid": 76337, "tid": -914061504, "ts": 1716454218053762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218121372, "dur": 31, "args": { "External id": 41271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41271, "pid": 5, "tid": 7, "ts": 1716454218121372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053790, "dur": 8, "args": { "External id": 41271, "cbid": 211, "correlation": 41271 } }, { "ph": "s", "id": 41271, "pid": 76337, "tid": -914061504, "ts": 1716454218053790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218121404, "dur": 181, "args": { "External id": 41280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41280, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41280, "pid": 5, "tid": 7, "ts": 1716454218121404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053872, "dur": 15, "args": { "External id": 41280, "cbid": 211, "correlation": 41280 } }, { "ph": "s", "id": 41280, "pid": 76337, "tid": -914061504, "ts": 1716454218053872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218121586, "dur": 63, "args": { "External id": 41302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41302, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41302, "pid": 5, "tid": 7, "ts": 1716454218121586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218053929, "dur": 10, "args": { "External id": 41302, "cbid": 211, "correlation": 41302 } }, { "ph": "s", "id": 41302, "pid": 76337, "tid": -914061504, "ts": 1716454218053929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054026, "dur": 1, "args": { "External id": 41313, "cbid": 251, "correlation": 41313 } }, { "ph": "f", "id": 41313, "pid": 76337, "tid": -914061504, "ts": 1716454218054026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218121651, "dur": 151, "args": { "External id": 41314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41314, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41314, "pid": 5, "tid": 7, "ts": 1716454218121651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054032, "dur": 14, "args": { "External id": 41314, "cbid": 211, "correlation": 41314 } }, { "ph": "s", "id": 41314, "pid": 76337, "tid": -914061504, "ts": 1716454218054032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054103, "dur": 1, "args": { "External id": 41325, "cbid": 251, "correlation": 41325 } }, { "ph": "f", "id": 41325, "pid": 76337, "tid": -914061504, "ts": 1716454218054103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218121803, "dur": 141, "args": { "External id": 41326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41326, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41326, "pid": 5, "tid": 7, "ts": 1716454218121803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054107, "dur": 11, "args": { "External id": 41326, "cbid": 211, "correlation": 41326 } }, { "ph": "s", "id": 41326, "pid": 76337, "tid": -914061504, "ts": 1716454218054107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054172, "dur": 1, "args": { "External id": 41337, "cbid": 251, "correlation": 41337 } }, { "ph": "f", "id": 41337, "pid": 76337, "tid": -914061504, "ts": 1716454218054172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218121945, "dur": 143, "args": { "External id": 41338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41338, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41338, "pid": 5, "tid": 7, "ts": 1716454218121945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054175, "dur": 11, "args": { "External id": 41338, "cbid": 211, "correlation": 41338 } }, { "ph": "s", "id": 41338, "pid": 76337, "tid": -914061504, "ts": 1716454218054175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218122089, "dur": 1890, "args": { "External id": 41359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41359, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 41359, "pid": 5, "tid": 7, "ts": 1716454218122089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054258, "dur": 13, "args": { "External id": 41359, "cbid": 211, "correlation": 41359 } }, { "ph": "s", "id": 41359, "pid": 76337, "tid": -914061504, "ts": 1716454218054258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054358, "dur": 2, "args": { "External id": 41377, "cbid": 251, "correlation": 41377 } }, { "ph": "f", "id": 41377, "pid": 76337, "tid": -914061504, "ts": 1716454218054358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218123981, "dur": 145, "args": { "External id": 41379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41379, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 41379, "pid": 5, "tid": 7, "ts": 1716454218123981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054364, "dur": 13, "args": { "External id": 41379, "cbid": 211, "correlation": 41379 } }, { "ph": "s", "id": 41379, "pid": 76337, "tid": -914061504, "ts": 1716454218054364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218124127, "dur": 35, "args": { "External id": 41387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41387, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41387, "pid": 5, "tid": 7, "ts": 1716454218124127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054434, "dur": 12, "args": { "External id": 41387, "cbid": 211, "correlation": 41387 } }, { "ph": "s", "id": 41387, "pid": 76337, "tid": -914061504, "ts": 1716454218054434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218124163, "dur": 51, "args": { "External id": 41395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41395, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41395, "pid": 5, "tid": 7, "ts": 1716454218124163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054472, "dur": 8, "args": { "External id": 41395, "cbid": 211, "correlation": 41395 } }, { "ph": "s", "id": 41395, "pid": 76337, "tid": -914061504, "ts": 1716454218054472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218124215, "dur": 30, "args": { "External id": 41406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41406, "pid": 5, "tid": 7, "ts": 1716454218124215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054544, "dur": 13, "args": { "External id": 41406, "cbid": 211, "correlation": 41406 } }, { "ph": "s", "id": 41406, "pid": 76337, "tid": -914061504, "ts": 1716454218054544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218124246, "dur": 34, "args": { "External id": 41428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41428, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41428, "pid": 5, "tid": 7, "ts": 1716454218124246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054576, "dur": 7, "args": { "External id": 41428, "cbid": 211, "correlation": 41428 } }, { "ph": "s", "id": 41428, "pid": 76337, "tid": -914061504, "ts": 1716454218054576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054659, "dur": 1, "args": { "External id": 41439, "cbid": 251, "correlation": 41439 } }, { "ph": "f", "id": 41439, "pid": 76337, "tid": -914061504, "ts": 1716454218054659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218124282, "dur": 89, "args": { "External id": 41440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41440, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41440, "pid": 5, "tid": 7, "ts": 1716454218124282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054665, "dur": 14, "args": { "External id": 41440, "cbid": 211, "correlation": 41440 } }, { "ph": "s", "id": 41440, "pid": 76337, "tid": -914061504, "ts": 1716454218054665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054735, "dur": 1, "args": { "External id": 41451, "cbid": 251, "correlation": 41451 } }, { "ph": "f", "id": 41451, "pid": 76337, "tid": -914061504, "ts": 1716454218054735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054738, "dur": 0, "args": { "External id": 41452, "cbid": 251, "correlation": 41452 } }, { "ph": "f", "id": 41452, "pid": 76337, "tid": -914061504, "ts": 1716454218054738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218124372, "dur": 11, "args": { "External id": 41453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41453, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 41453, "pid": 5, "tid": 7, "ts": 1716454218124372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054740, "dur": 12, "args": { "External id": 41453, "cbid": 211, "correlation": 41453 } }, { "ph": "s", "id": 41453, "pid": 76337, "tid": -914061504, "ts": 1716454218054740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218124383, "dur": 5, "args": { "External id": 41455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41455, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 41455, "pid": 5, "tid": 7, "ts": 1716454218124383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054753, "dur": 6, "args": { "External id": 41455, "cbid": 211, "correlation": 41455 } }, { "ph": "s", "id": 41455, "pid": 76337, "tid": -914061504, "ts": 1716454218054753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054811, "dur": 1, "args": { "External id": 41466, "cbid": 251, "correlation": 41466 } }, { "ph": "f", "id": 41466, "pid": 76337, "tid": -914061504, "ts": 1716454218054811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218054814, "dur": 0, "args": { "External id": 41467, "cbid": 251, "correlation": 41467 } }, { "ph": "f", "id": 41467, "pid": 76337, "tid": -914061504, "ts": 1716454218054814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218124390, "dur": 7, "args": { "External id": 41468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41468, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 41468, "pid": 5, "tid": 7, "ts": 1716454218124390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054815, "dur": 11, "args": { "External id": 41468, "cbid": 211, "correlation": 41468 } }, { "ph": "s", "id": 41468, "pid": 76337, "tid": -914061504, "ts": 1716454218054815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218124398, "dur": 3, "args": { "External id": 41470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41470, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 41470, "pid": 5, "tid": 7, "ts": 1716454218124398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054828, "dur": 6, "args": { "External id": 41470, "cbid": 211, "correlation": 41470 } }, { "ph": "s", "id": 41470, "pid": 76337, "tid": -914061504, "ts": 1716454218054828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218124402, "dur": 90, "args": { "External id": 41491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41491, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 41491, "pid": 5, "tid": 7, "ts": 1716454218124402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218054901, "dur": 12, "args": { "External id": 41491, "cbid": 211, "correlation": 41491 } }, { "ph": "s", "id": 41491, "pid": 76337, "tid": -914061504, "ts": 1716454218054901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218055004, "dur": 1, "args": { "External id": 41509, "cbid": 251, "correlation": 41509 } }, { "ph": "f", "id": 41509, "pid": 76337, "tid": -914061504, "ts": 1716454218055004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218124494, "dur": 96, "args": { "External id": 41511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41511, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41511, "pid": 5, "tid": 7, "ts": 1716454218124494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055010, "dur": 13, "args": { "External id": 41511, "cbid": 211, "correlation": 41511 } }, { "ph": "s", "id": 41511, "pid": 76337, "tid": -914061504, "ts": 1716454218055010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218124591, "dur": 19, "args": { "External id": 41519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41519, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41519, "pid": 5, "tid": 7, "ts": 1716454218124591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055080, "dur": 12, "args": { "External id": 41519, "cbid": 211, "correlation": 41519 } }, { "ph": "s", "id": 41519, "pid": 76337, "tid": -914061504, "ts": 1716454218055080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218124611, "dur": 37, "args": { "External id": 41527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41527, "pid": 5, "tid": 7, "ts": 1716454218124611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055121, "dur": 9, "args": { "External id": 41527, "cbid": 211, "correlation": 41527 } }, { "ph": "s", "id": 41527, "pid": 76337, "tid": -914061504, "ts": 1716454218055121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218124649, "dur": 34, "args": { "External id": 41549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41549, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41549, "pid": 5, "tid": 7, "ts": 1716454218124649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055172, "dur": 10, "args": { "External id": 41549, "cbid": 211, "correlation": 41549 } }, { "ph": "s", "id": 41549, "pid": 76337, "tid": -914061504, "ts": 1716454218055172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218055263, "dur": 1, "args": { "External id": 41565, "cbid": 251, "correlation": 41565 } }, { "ph": "f", "id": 41565, "pid": 76337, "tid": -914061504, "ts": 1716454218055263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218055268, "dur": 0, "args": { "External id": 41567, "cbid": 251, "correlation": 41567 } }, { "ph": "f", "id": 41567, "pid": 76337, "tid": -914061504, "ts": 1716454218055268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218124685, "dur": 529, "args": { "External id": 41568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41568, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 41568, "pid": 5, "tid": 7, "ts": 1716454218124685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055273, "dur": 13, "args": { "External id": 41568, "cbid": 211, "correlation": 41568 } }, { "ph": "s", "id": 41568, "pid": 76337, "tid": -914061504, "ts": 1716454218055273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218125215, "dur": 123, "args": { "External id": 41576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41576, "pid": 5, "tid": 7, "ts": 1716454218125215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055338, "dur": 13, "args": { "External id": 41576, "cbid": 211, "correlation": 41576 } }, { "ph": "s", "id": 41576, "pid": 76337, "tid": -914061504, "ts": 1716454218055338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218125340, "dur": 127, "args": { "External id": 41584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41584, "pid": 5, "tid": 7, "ts": 1716454218125340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055369, "dur": 8, "args": { "External id": 41584, "cbid": 211, "correlation": 41584 } }, { "ph": "s", "id": 41584, "pid": 76337, "tid": -914061504, "ts": 1716454218055369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218055445, "dur": 1, "args": { "External id": 41600, "cbid": 251, "correlation": 41600 } }, { "ph": "f", "id": 41600, "pid": 76337, "tid": -914061504, "ts": 1716454218055445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218125468, "dur": 302, "args": { "External id": 41602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41602, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41602, "pid": 5, "tid": 7, "ts": 1716454218125468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055451, "dur": 13, "args": { "External id": 41602, "cbid": 211, "correlation": 41602 } }, { "ph": "s", "id": 41602, "pid": 76337, "tid": -914061504, "ts": 1716454218055451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218125771, "dur": 28, "args": { "External id": 41610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41610, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41610, "pid": 5, "tid": 7, "ts": 1716454218125771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055492, "dur": 10, "args": { "External id": 41610, "cbid": 211, "correlation": 41610 } }, { "ph": "s", "id": 41610, "pid": 76337, "tid": -914061504, "ts": 1716454218055492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218125800, "dur": 79, "args": { "External id": 41621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41621, "pid": 5, "tid": 7, "ts": 1716454218125800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055560, "dur": 13, "args": { "External id": 41621, "cbid": 211, "correlation": 41621 } }, { "ph": "s", "id": 41621, "pid": 76337, "tid": -914061504, "ts": 1716454218055560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218055624, "dur": 0, "args": { "External id": 41633, "cbid": 317, "correlation": 41633 } }, { "ph": "f", "id": 41633, "pid": 76337, "tid": -914061504, "ts": 1716454218055624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218055625, "dur": 0, "args": { "External id": 41634, "cbid": 203, "correlation": 41634 } }, { "ph": "f", "id": 41634, "pid": 76337, "tid": -914061504, "ts": 1716454218055625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218055625, "dur": 0, "args": { "External id": 41635, "cbid": 205, "correlation": 41635 } }, { "ph": "f", "id": 41635, "pid": 76337, "tid": -914061504, "ts": 1716454218055625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218125880, "dur": 22, "args": { "External id": 41639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41639, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41639, "pid": 5, "tid": 7, "ts": 1716454218125880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055640, "dur": 12, "args": { "External id": 41639, "cbid": 211, "correlation": 41639 } }, { "ph": "s", "id": 41639, "pid": 76337, "tid": -914061504, "ts": 1716454218055640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218125903, "dur": 118, "args": { "External id": 41641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41641, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41641, "pid": 5, "tid": 7, "ts": 1716454218125903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055658, "dur": 6, "args": { "External id": 41641, "cbid": 211, "correlation": 41641 } }, { "ph": "s", "id": 41641, "pid": 76337, "tid": -914061504, "ts": 1716454218055658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218126023, "dur": 23, "args": { "External id": 41643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41643, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41643, "pid": 5, "tid": 7, "ts": 1716454218126023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055668, "dur": 6, "args": { "External id": 41643, "cbid": 211, "correlation": 41643 } }, { "ph": "s", "id": 41643, "pid": 76337, "tid": -914061504, "ts": 1716454218055668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218126047, "dur": 32, "args": { "External id": 41649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41649, "pid": 5, "tid": 7, "ts": 1716454218126047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055696, "dur": 8, "args": { "External id": 41649, "cbid": 211, "correlation": 41649 } }, { "ph": "s", "id": 41649, "pid": 76337, "tid": -914061504, "ts": 1716454218055696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218126080, "dur": 27, "args": { "External id": 41657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41657, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41657, "pid": 5, "tid": 7, "ts": 1716454218126080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055728, "dur": 8, "args": { "External id": 41657, "cbid": 211, "correlation": 41657 } }, { "ph": "s", "id": 41657, "pid": 76337, "tid": -914061504, "ts": 1716454218055728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218126108, "dur": 52, "args": { "External id": 41666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41666, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41666, "pid": 5, "tid": 7, "ts": 1716454218126108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055765, "dur": 10, "args": { "External id": 41666, "cbid": 211, "correlation": 41666 } }, { "ph": "s", "id": 41666, "pid": 76337, "tid": -914061504, "ts": 1716454218055765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218126162, "dur": 50, "args": { "External id": 41686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41686, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41686, "pid": 5, "tid": 7, "ts": 1716454218126162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055837, "dur": 12, "args": { "External id": 41686, "cbid": 211, "correlation": 41686 } }, { "ph": "s", "id": 41686, "pid": 76337, "tid": -914061504, "ts": 1716454218055837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218126213, "dur": 4, "args": { "External id": 41698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41698, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 41698, "pid": 5, "tid": 7, "ts": 1716454218126213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055859, "dur": 6, "args": { "External id": 41698, "cbid": 211, "correlation": 41698 } }, { "ph": "s", "id": 41698, "pid": 76337, "tid": -914061504, "ts": 1716454218055859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218126219, "dur": 56, "args": { "External id": 41701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41701, "pid": 5, "tid": 7, "ts": 1716454218126219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055877, "dur": 7, "args": { "External id": 41701, "cbid": 211, "correlation": 41701 } }, { "ph": "s", "id": 41701, "pid": 76337, "tid": -914061504, "ts": 1716454218055877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218126276, "dur": 37, "args": { "External id": 41710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41710, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41710, "pid": 5, "tid": 7, "ts": 1716454218126276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055917, "dur": 10, "args": { "External id": 41710, "cbid": 211, "correlation": 41710 } }, { "ph": "s", "id": 41710, "pid": 76337, "tid": -914061504, "ts": 1716454218055917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218055969, "dur": 0, "args": { "External id": 41720, "cbid": 317, "correlation": 41720 } }, { "ph": "f", "id": 41720, "pid": 76337, "tid": -914061504, "ts": 1716454218055969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218055970, "dur": 0, "args": { "External id": 41721, "cbid": 203, "correlation": 41721 } }, { "ph": "f", "id": 41721, "pid": 76337, "tid": -914061504, "ts": 1716454218055970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218055971, "dur": 0, "args": { "External id": 41722, "cbid": 205, "correlation": 41722 } }, { "ph": "f", "id": 41722, "pid": 76337, "tid": -914061504, "ts": 1716454218055971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218126315, "dur": 39, "args": { "External id": 41726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41726, "pid": 5, "tid": 7, "ts": 1716454218126315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218055994, "dur": 12, "args": { "External id": 41726, "cbid": 211, "correlation": 41726 } }, { "ph": "s", "id": 41726, "pid": 76337, "tid": -914061504, "ts": 1716454218055994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218126355, "dur": 81, "args": { "External id": 41728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41728, "pid": 5, "tid": 7, "ts": 1716454218126355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056009, "dur": 5, "args": { "External id": 41728, "cbid": 211, "correlation": 41728 } }, { "ph": "s", "id": 41728, "pid": 76337, "tid": -914061504, "ts": 1716454218056009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218126438, "dur": 1257, "args": { "External id": 41730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41730, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41730, "pid": 5, "tid": 7, "ts": 1716454218126438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056020, "dur": 6, "args": { "External id": 41730, "cbid": 211, "correlation": 41730 } }, { "ph": "s", "id": 41730, "pid": 76337, "tid": -914061504, "ts": 1716454218056020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218127697, "dur": 21, "args": { "External id": 41732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41732, "pid": 5, "tid": 7, "ts": 1716454218127697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056029, "dur": 5, "args": { "External id": 41732, "cbid": 211, "correlation": 41732 } }, { "ph": "s", "id": 41732, "pid": 76337, "tid": -914061504, "ts": 1716454218056029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218127719, "dur": 32, "args": { "External id": 41738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41738, "pid": 5, "tid": 7, "ts": 1716454218127719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056058, "dur": 8, "args": { "External id": 41738, "cbid": 211, "correlation": 41738 } }, { "ph": "s", "id": 41738, "pid": 76337, "tid": -914061504, "ts": 1716454218056058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218127753, "dur": 3, "args": { "External id": 41746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41746, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 41746, "pid": 5, "tid": 7, "ts": 1716454218127753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056101, "dur": 10, "args": { "External id": 41746, "cbid": 211, "correlation": 41746 } }, { "ph": "s", "id": 41746, "pid": 76337, "tid": -914061504, "ts": 1716454218056101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218056168, "dur": 1, "args": { "External id": 41762, "cbid": 251, "correlation": 41762 } }, { "ph": "f", "id": 41762, "pid": 76337, "tid": -914061504, "ts": 1716454218056168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218056173, "dur": 0, "args": { "External id": 41764, "cbid": 251, "correlation": 41764 } }, { "ph": "f", "id": 41764, "pid": 76337, "tid": -914061504, "ts": 1716454218056173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218127757, "dur": 12, "args": { "External id": 41765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41765, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 41765, "pid": 5, "tid": 7, "ts": 1716454218127757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056175, "dur": 11, "args": { "External id": 41765, "cbid": 211, "correlation": 41765 } }, { "ph": "s", "id": 41765, "pid": 76337, "tid": -914061504, "ts": 1716454218056175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218127770, "dur": 5, "args": { "External id": 41767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41767, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 41767, "pid": 5, "tid": 7, "ts": 1716454218127770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056187, "dur": 6, "args": { "External id": 41767, "cbid": 211, "correlation": 41767 } }, { "ph": "s", "id": 41767, "pid": 76337, "tid": -914061504, "ts": 1716454218056187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218127776, "dur": 29, "args": { "External id": 41777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41777, "pid": 5, "tid": 7, "ts": 1716454218127776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056244, "dur": 13, "args": { "External id": 41777, "cbid": 211, "correlation": 41777 } }, { "ph": "s", "id": 41777, "pid": 76337, "tid": -914061504, "ts": 1716454218056244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218127807, "dur": 30, "args": { "External id": 41797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41797, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41797, "pid": 5, "tid": 7, "ts": 1716454218127807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056311, "dur": 10, "args": { "External id": 41797, "cbid": 211, "correlation": 41797 } }, { "ph": "s", "id": 41797, "pid": 76337, "tid": -914061504, "ts": 1716454218056311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218127838, "dur": 4, "args": { "External id": 41809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41809, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 41809, "pid": 5, "tid": 7, "ts": 1716454218127838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056331, "dur": 6, "args": { "External id": 41809, "cbid": 211, "correlation": 41809 } }, { "ph": "s", "id": 41809, "pid": 76337, "tid": -914061504, "ts": 1716454218056331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218127843, "dur": 29, "args": { "External id": 41812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41812, "pid": 5, "tid": 7, "ts": 1716454218127843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056350, "dur": 7, "args": { "External id": 41812, "cbid": 211, "correlation": 41812 } }, { "ph": "s", "id": 41812, "pid": 76337, "tid": -914061504, "ts": 1716454218056350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218127873, "dur": 21, "args": { "External id": 41821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41821, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41821, "pid": 5, "tid": 7, "ts": 1716454218127873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056390, "dur": 9, "args": { "External id": 41821, "cbid": 211, "correlation": 41821 } }, { "ph": "s", "id": 41821, "pid": 76337, "tid": -914061504, "ts": 1716454218056390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218056452, "dur": 0, "args": { "External id": 41831, "cbid": 317, "correlation": 41831 } }, { "ph": "f", "id": 41831, "pid": 76337, "tid": -914061504, "ts": 1716454218056452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218056453, "dur": 0, "args": { "External id": 41832, "cbid": 203, "correlation": 41832 } }, { "ph": "f", "id": 41832, "pid": 76337, "tid": -914061504, "ts": 1716454218056453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218056454, "dur": 0, "args": { "External id": 41833, "cbid": 205, "correlation": 41833 } }, { "ph": "f", "id": 41833, "pid": 76337, "tid": -914061504, "ts": 1716454218056454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218127895, "dur": 23, "args": { "External id": 41837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41837, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41837, "pid": 5, "tid": 7, "ts": 1716454218127895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056468, "dur": 12, "args": { "External id": 41837, "cbid": 211, "correlation": 41837 } }, { "ph": "s", "id": 41837, "pid": 76337, "tid": -914061504, "ts": 1716454218056468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218127919, "dur": 43, "args": { "External id": 41839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41839, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41839, "pid": 5, "tid": 7, "ts": 1716454218127919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056483, "dur": 5, "args": { "External id": 41839, "cbid": 211, "correlation": 41839 } }, { "ph": "s", "id": 41839, "pid": 76337, "tid": -914061504, "ts": 1716454218056483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218127963, "dur": 632, "args": { "External id": 41841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41841, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41841, "pid": 5, "tid": 7, "ts": 1716454218127963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056494, "dur": 6, "args": { "External id": 41841, "cbid": 211, "correlation": 41841 } }, { "ph": "s", "id": 41841, "pid": 76337, "tid": -914061504, "ts": 1716454218056494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218128597, "dur": 22, "args": { "External id": 41843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41843, "pid": 5, "tid": 7, "ts": 1716454218128597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056503, "dur": 5, "args": { "External id": 41843, "cbid": 211, "correlation": 41843 } }, { "ph": "s", "id": 41843, "pid": 76337, "tid": -914061504, "ts": 1716454218056503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218128620, "dur": 32, "args": { "External id": 41849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41849, "pid": 5, "tid": 7, "ts": 1716454218128620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056531, "dur": 8, "args": { "External id": 41849, "cbid": 211, "correlation": 41849 } }, { "ph": "s", "id": 41849, "pid": 76337, "tid": -914061504, "ts": 1716454218056531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218056588, "dur": 0, "args": { "External id": 41859, "cbid": 317, "correlation": 41859 } }, { "ph": "f", "id": 41859, "pid": 76337, "tid": -914061504, "ts": 1716454218056588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218056589, "dur": 0, "args": { "External id": 41860, "cbid": 203, "correlation": 41860 } }, { "ph": "f", "id": 41860, "pid": 76337, "tid": -914061504, "ts": 1716454218056589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218056590, "dur": 0, "args": { "External id": 41861, "cbid": 205, "correlation": 41861 } }, { "ph": "f", "id": 41861, "pid": 76337, "tid": -914061504, "ts": 1716454218056590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218128653, "dur": 39, "args": { "External id": 41865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41865, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41865, "pid": 5, "tid": 7, "ts": 1716454218128653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056602, "dur": 12, "args": { "External id": 41865, "cbid": 211, "correlation": 41865 } }, { "ph": "s", "id": 41865, "pid": 76337, "tid": -914061504, "ts": 1716454218056602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218128693, "dur": 184, "args": { "External id": 41867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41867, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41867, "pid": 5, "tid": 7, "ts": 1716454218128693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056620, "dur": 6, "args": { "External id": 41867, "cbid": 211, "correlation": 41867 } }, { "ph": "s", "id": 41867, "pid": 76337, "tid": -914061504, "ts": 1716454218056620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218128878, "dur": 21, "args": { "External id": 41869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41869, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41869, "pid": 5, "tid": 7, "ts": 1716454218128878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056629, "dur": 5, "args": { "External id": 41869, "cbid": 211, "correlation": 41869 } }, { "ph": "s", "id": 41869, "pid": 76337, "tid": -914061504, "ts": 1716454218056629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218128901, "dur": 32, "args": { "External id": 41875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41875, "pid": 5, "tid": 7, "ts": 1716454218128901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056654, "dur": 8, "args": { "External id": 41875, "cbid": 211, "correlation": 41875 } }, { "ph": "s", "id": 41875, "pid": 76337, "tid": -914061504, "ts": 1716454218056654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218128934, "dur": 27, "args": { "External id": 41883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41883, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41883, "pid": 5, "tid": 7, "ts": 1716454218128934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056683, "dur": 8, "args": { "External id": 41883, "cbid": 211, "correlation": 41883 } }, { "ph": "s", "id": 41883, "pid": 76337, "tid": -914061504, "ts": 1716454218056683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218128962, "dur": 20, "args": { "External id": 41891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41891, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41891, "pid": 5, "tid": 7, "ts": 1716454218128962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056712, "dur": 9, "args": { "External id": 41891, "cbid": 211, "correlation": 41891 } }, { "ph": "s", "id": 41891, "pid": 76337, "tid": -914061504, "ts": 1716454218056712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218128983, "dur": 30, "args": { "External id": 41911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41911, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 41911, "pid": 5, "tid": 7, "ts": 1716454218128983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056794, "dur": 12, "args": { "External id": 41911, "cbid": 211, "correlation": 41911 } }, { "ph": "s", "id": 41911, "pid": 76337, "tid": -914061504, "ts": 1716454218056794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218129014, "dur": 4, "args": { "External id": 41923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41923, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 41923, "pid": 5, "tid": 7, "ts": 1716454218129014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056816, "dur": 7, "args": { "External id": 41923, "cbid": 211, "correlation": 41923 } }, { "ph": "s", "id": 41923, "pid": 76337, "tid": -914061504, "ts": 1716454218056816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218129019, "dur": 31, "args": { "External id": 41926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41926, "pid": 5, "tid": 7, "ts": 1716454218129019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056834, "dur": 6, "args": { "External id": 41926, "cbid": 211, "correlation": 41926 } }, { "ph": "s", "id": 41926, "pid": 76337, "tid": -914061504, "ts": 1716454218056834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218056891, "dur": 0, "args": { "External id": 41937, "cbid": 317, "correlation": 41937 } }, { "ph": "f", "id": 41937, "pid": 76337, "tid": -914061504, "ts": 1716454218056891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218056892, "dur": 0, "args": { "External id": 41938, "cbid": 203, "correlation": 41938 } }, { "ph": "f", "id": 41938, "pid": 76337, "tid": -914061504, "ts": 1716454218056892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218056893, "dur": 0, "args": { "External id": 41939, "cbid": 205, "correlation": 41939 } }, { "ph": "f", "id": 41939, "pid": 76337, "tid": -914061504, "ts": 1716454218056893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218129051, "dur": 21, "args": { "External id": 41943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41943, "pid": 5, "tid": 7, "ts": 1716454218129051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056905, "dur": 12, "args": { "External id": 41943, "cbid": 211, "correlation": 41943 } }, { "ph": "s", "id": 41943, "pid": 76337, "tid": -914061504, "ts": 1716454218056905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218129073, "dur": 101, "args": { "External id": 41945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41945, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41945, "pid": 5, "tid": 7, "ts": 1716454218129073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056923, "dur": 7, "args": { "External id": 41945, "cbid": 211, "correlation": 41945 } }, { "ph": "s", "id": 41945, "pid": 76337, "tid": -914061504, "ts": 1716454218056923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218129176, "dur": 22, "args": { "External id": 41947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41947, "pid": 5, "tid": 7, "ts": 1716454218129176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056933, "dur": 5, "args": { "External id": 41947, "cbid": 211, "correlation": 41947 } }, { "ph": "s", "id": 41947, "pid": 76337, "tid": -914061504, "ts": 1716454218056933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218129200, "dur": 33, "args": { "External id": 41953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41953, "pid": 5, "tid": 7, "ts": 1716454218129200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218056960, "dur": 8, "args": { "External id": 41953, "cbid": 211, "correlation": 41953 } }, { "ph": "s", "id": 41953, "pid": 76337, "tid": -914061504, "ts": 1716454218056960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218129233, "dur": 167, "args": { "External id": 41962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41962, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41962, "pid": 5, "tid": 7, "ts": 1716454218129233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062148, "dur": 65, "args": { "External id": 41962, "cbid": 211, "correlation": 41962 } }, { "ph": "s", "id": 41962, "pid": 76337, "tid": -914061504, "ts": 1716454218062148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218129402, "dur": 63, "args": { "External id": 41984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41984, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 41984, "pid": 5, "tid": 7, "ts": 1716454218129402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062299, "dur": 13, "args": { "External id": 41984, "cbid": 211, "correlation": 41984 } }, { "ph": "s", "id": 41984, "pid": 76337, "tid": -914061504, "ts": 1716454218062299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218062474, "dur": 6, "args": { "External id": 41995, "cbid": 251, "correlation": 41995 } }, { "ph": "f", "id": 41995, "pid": 76337, "tid": -914061504, "ts": 1716454218062474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218129466, "dur": 151, "args": { "External id": 41996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 41996, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 41996, "pid": 5, "tid": 7, "ts": 1716454218129466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062488, "dur": 16, "args": { "External id": 41996, "cbid": 211, "correlation": 41996 } }, { "ph": "s", "id": 41996, "pid": 76337, "tid": -914061504, "ts": 1716454218062488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218062567, "dur": 1, "args": { "External id": 42007, "cbid": 251, "correlation": 42007 } }, { "ph": "f", "id": 42007, "pid": 76337, "tid": -914061504, "ts": 1716454218062567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218129619, "dur": 142, "args": { "External id": 42008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42008, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42008, "pid": 5, "tid": 7, "ts": 1716454218129619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062571, "dur": 11, "args": { "External id": 42008, "cbid": 211, "correlation": 42008 } }, { "ph": "s", "id": 42008, "pid": 76337, "tid": -914061504, "ts": 1716454218062571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218062637, "dur": 1, "args": { "External id": 42019, "cbid": 251, "correlation": 42019 } }, { "ph": "f", "id": 42019, "pid": 76337, "tid": -914061504, "ts": 1716454218062637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218129762, "dur": 142, "args": { "External id": 42020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42020, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42020, "pid": 5, "tid": 7, "ts": 1716454218129762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062641, "dur": 11, "args": { "External id": 42020, "cbid": 211, "correlation": 42020 } }, { "ph": "s", "id": 42020, "pid": 76337, "tid": -914061504, "ts": 1716454218062641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218129905, "dur": 1892, "args": { "External id": 42041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42041, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 42041, "pid": 5, "tid": 7, "ts": 1716454218129905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062755, "dur": 20, "args": { "External id": 42041, "cbid": 211, "correlation": 42041 } }, { "ph": "s", "id": 42041, "pid": 76337, "tid": -914061504, "ts": 1716454218062755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218062900, "dur": 2, "args": { "External id": 42059, "cbid": 251, "correlation": 42059 } }, { "ph": "f", "id": 42059, "pid": 76337, "tid": -914061504, "ts": 1716454218062900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218131798, "dur": 145, "args": { "External id": 42061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42061, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 42061, "pid": 5, "tid": 7, "ts": 1716454218131798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218062908, "dur": 15, "args": { "External id": 42061, "cbid": 211, "correlation": 42061 } }, { "ph": "s", "id": 42061, "pid": 76337, "tid": -914061504, "ts": 1716454218062908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218131945, "dur": 36, "args": { "External id": 42069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42069, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42069, "pid": 5, "tid": 7, "ts": 1716454218131945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063032, "dur": 20, "args": { "External id": 42069, "cbid": 211, "correlation": 42069 } }, { "ph": "s", "id": 42069, "pid": 76337, "tid": -914061504, "ts": 1716454218063032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218131981, "dur": 51, "args": { "External id": 42077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42077, "pid": 5, "tid": 7, "ts": 1716454218131981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063089, "dur": 10, "args": { "External id": 42077, "cbid": 211, "correlation": 42077 } }, { "ph": "s", "id": 42077, "pid": 76337, "tid": -914061504, "ts": 1716454218063089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218132034, "dur": 29, "args": { "External id": 42088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42088, "pid": 5, "tid": 7, "ts": 1716454218132034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063193, "dur": 17, "args": { "External id": 42088, "cbid": 211, "correlation": 42088 } }, { "ph": "s", "id": 42088, "pid": 76337, "tid": -914061504, "ts": 1716454218063193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218132064, "dur": 34, "args": { "External id": 42110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42110, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42110, "pid": 5, "tid": 7, "ts": 1716454218132064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063230, "dur": 8, "args": { "External id": 42110, "cbid": 211, "correlation": 42110 } }, { "ph": "s", "id": 42110, "pid": 76337, "tid": -914061504, "ts": 1716454218063230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063317, "dur": 1, "args": { "External id": 42121, "cbid": 251, "correlation": 42121 } }, { "ph": "f", "id": 42121, "pid": 76337, "tid": -914061504, "ts": 1716454218063317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218132099, "dur": 89, "args": { "External id": 42122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42122, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42122, "pid": 5, "tid": 7, "ts": 1716454218132099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063322, "dur": 12, "args": { "External id": 42122, "cbid": 211, "correlation": 42122 } }, { "ph": "s", "id": 42122, "pid": 76337, "tid": -914061504, "ts": 1716454218063322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063408, "dur": 1, "args": { "External id": 42133, "cbid": 251, "correlation": 42133 } }, { "ph": "f", "id": 42133, "pid": 76337, "tid": -914061504, "ts": 1716454218063408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063413, "dur": 0, "args": { "External id": 42134, "cbid": 251, "correlation": 42134 } }, { "ph": "f", "id": 42134, "pid": 76337, "tid": -914061504, "ts": 1716454218063413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218132190, "dur": 12, "args": { "External id": 42135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42135, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 42135, "pid": 5, "tid": 7, "ts": 1716454218132190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063416, "dur": 14, "args": { "External id": 42135, "cbid": 211, "correlation": 42135 } }, { "ph": "s", "id": 42135, "pid": 76337, "tid": -914061504, "ts": 1716454218063416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218132203, "dur": 5, "args": { "External id": 42137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42137, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 42137, "pid": 5, "tid": 7, "ts": 1716454218132203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063433, "dur": 11, "args": { "External id": 42137, "cbid": 211, "correlation": 42137 } }, { "ph": "s", "id": 42137, "pid": 76337, "tid": -914061504, "ts": 1716454218063433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063497, "dur": 1, "args": { "External id": 42148, "cbid": 251, "correlation": 42148 } }, { "ph": "f", "id": 42148, "pid": 76337, "tid": -914061504, "ts": 1716454218063497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063500, "dur": 0, "args": { "External id": 42149, "cbid": 251, "correlation": 42149 } }, { "ph": "f", "id": 42149, "pid": 76337, "tid": -914061504, "ts": 1716454218063500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218132209, "dur": 7, "args": { "External id": 42150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42150, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 42150, "pid": 5, "tid": 7, "ts": 1716454218132209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063502, "dur": 12, "args": { "External id": 42150, "cbid": 211, "correlation": 42150 } }, { "ph": "s", "id": 42150, "pid": 76337, "tid": -914061504, "ts": 1716454218063502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218132217, "dur": 3, "args": { "External id": 42152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42152, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 42152, "pid": 5, "tid": 7, "ts": 1716454218132217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063515, "dur": 5, "args": { "External id": 42152, "cbid": 211, "correlation": 42152 } }, { "ph": "s", "id": 42152, "pid": 76337, "tid": -914061504, "ts": 1716454218063515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218132221, "dur": 89, "args": { "External id": 42173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42173, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 42173, "pid": 5, "tid": 7, "ts": 1716454218132221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063588, "dur": 13, "args": { "External id": 42173, "cbid": 211, "correlation": 42173 } }, { "ph": "s", "id": 42173, "pid": 76337, "tid": -914061504, "ts": 1716454218063588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063686, "dur": 2, "args": { "External id": 42191, "cbid": 251, "correlation": 42191 } }, { "ph": "f", "id": 42191, "pid": 76337, "tid": -914061504, "ts": 1716454218063686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218132311, "dur": 98, "args": { "External id": 42193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42193, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42193, "pid": 5, "tid": 7, "ts": 1716454218132311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063693, "dur": 14, "args": { "External id": 42193, "cbid": 211, "correlation": 42193 } }, { "ph": "s", "id": 42193, "pid": 76337, "tid": -914061504, "ts": 1716454218063693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218132410, "dur": 19, "args": { "External id": 42201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42201, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42201, "pid": 5, "tid": 7, "ts": 1716454218132410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063766, "dur": 12, "args": { "External id": 42201, "cbid": 211, "correlation": 42201 } }, { "ph": "s", "id": 42201, "pid": 76337, "tid": -914061504, "ts": 1716454218063766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218132431, "dur": 38, "args": { "External id": 42209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42209, "pid": 5, "tid": 7, "ts": 1716454218132431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063810, "dur": 11, "args": { "External id": 42209, "cbid": 211, "correlation": 42209 } }, { "ph": "s", "id": 42209, "pid": 76337, "tid": -914061504, "ts": 1716454218063810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218132471, "dur": 33, "args": { "External id": 42231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42231, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42231, "pid": 5, "tid": 7, "ts": 1716454218132471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063864, "dur": 10, "args": { "External id": 42231, "cbid": 211, "correlation": 42231 } }, { "ph": "s", "id": 42231, "pid": 76337, "tid": -914061504, "ts": 1716454218063864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063959, "dur": 1, "args": { "External id": 42247, "cbid": 251, "correlation": 42247 } }, { "ph": "f", "id": 42247, "pid": 76337, "tid": -914061504, "ts": 1716454218063959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218063964, "dur": 0, "args": { "External id": 42249, "cbid": 251, "correlation": 42249 } }, { "ph": "f", "id": 42249, "pid": 76337, "tid": -914061504, "ts": 1716454218063964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218132506, "dur": 527, "args": { "External id": 42250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42250, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 42250, "pid": 5, "tid": 7, "ts": 1716454218132506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218063970, "dur": 21, "args": { "External id": 42250, "cbid": 211, "correlation": 42250 } }, { "ph": "s", "id": 42250, "pid": 76337, "tid": -914061504, "ts": 1716454218063970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218133034, "dur": 123, "args": { "External id": 42258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42258, "pid": 5, "tid": 7, "ts": 1716454218133034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064072, "dur": 14, "args": { "External id": 42258, "cbid": 211, "correlation": 42258 } }, { "ph": "s", "id": 42258, "pid": 76337, "tid": -914061504, "ts": 1716454218064072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218133158, "dur": 127, "args": { "External id": 42266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42266, "pid": 5, "tid": 7, "ts": 1716454218133158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064113, "dur": 11, "args": { "External id": 42266, "cbid": 211, "correlation": 42266 } }, { "ph": "s", "id": 42266, "pid": 76337, "tid": -914061504, "ts": 1716454218064113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218064198, "dur": 1, "args": { "External id": 42282, "cbid": 251, "correlation": 42282 } }, { "ph": "f", "id": 42282, "pid": 76337, "tid": -914061504, "ts": 1716454218064198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218133287, "dur": 302, "args": { "External id": 42284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42284, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42284, "pid": 5, "tid": 7, "ts": 1716454218133287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064204, "dur": 13, "args": { "External id": 42284, "cbid": 211, "correlation": 42284 } }, { "ph": "s", "id": 42284, "pid": 76337, "tid": -914061504, "ts": 1716454218064204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218133590, "dur": 27, "args": { "External id": 42292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42292, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42292, "pid": 5, "tid": 7, "ts": 1716454218133590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064247, "dur": 10, "args": { "External id": 42292, "cbid": 211, "correlation": 42292 } }, { "ph": "s", "id": 42292, "pid": 76337, "tid": -914061504, "ts": 1716454218064247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218133618, "dur": 80, "args": { "External id": 42303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42303, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42303, "pid": 5, "tid": 7, "ts": 1716454218133618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064319, "dur": 13, "args": { "External id": 42303, "cbid": 211, "correlation": 42303 } }, { "ph": "s", "id": 42303, "pid": 76337, "tid": -914061504, "ts": 1716454218064319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218064421, "dur": 1, "args": { "External id": 42315, "cbid": 317, "correlation": 42315 } }, { "ph": "f", "id": 42315, "pid": 76337, "tid": -914061504, "ts": 1716454218064421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218064423, "dur": 1, "args": { "External id": 42316, "cbid": 203, "correlation": 42316 } }, { "ph": "f", "id": 42316, "pid": 76337, "tid": -914061504, "ts": 1716454218064423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218064426, "dur": 1, "args": { "External id": 42317, "cbid": 205, "correlation": 42317 } }, { "ph": "f", "id": 42317, "pid": 76337, "tid": -914061504, "ts": 1716454218064426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218133699, "dur": 24, "args": { "External id": 42321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42321, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42321, "pid": 5, "tid": 7, "ts": 1716454218133699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064471, "dur": 15, "args": { "External id": 42321, "cbid": 211, "correlation": 42321 } }, { "ph": "s", "id": 42321, "pid": 76337, "tid": -914061504, "ts": 1716454218064471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218133725, "dur": 117, "args": { "External id": 42323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42323, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42323, "pid": 5, "tid": 7, "ts": 1716454218133725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064507, "dur": 11, "args": { "External id": 42323, "cbid": 211, "correlation": 42323 } }, { "ph": "s", "id": 42323, "pid": 76337, "tid": -914061504, "ts": 1716454218064507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218133843, "dur": 23, "args": { "External id": 42325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42325, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42325, "pid": 5, "tid": 7, "ts": 1716454218133843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064526, "dur": 8, "args": { "External id": 42325, "cbid": 211, "correlation": 42325 } }, { "ph": "s", "id": 42325, "pid": 76337, "tid": -914061504, "ts": 1716454218064526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218133868, "dur": 32, "args": { "External id": 42331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42331, "pid": 5, "tid": 7, "ts": 1716454218133868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064562, "dur": 9, "args": { "External id": 42331, "cbid": 211, "correlation": 42331 } }, { "ph": "s", "id": 42331, "pid": 76337, "tid": -914061504, "ts": 1716454218064562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218133901, "dur": 26, "args": { "External id": 42339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42339, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42339, "pid": 5, "tid": 7, "ts": 1716454218133901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064593, "dur": 8, "args": { "External id": 42339, "cbid": 211, "correlation": 42339 } }, { "ph": "s", "id": 42339, "pid": 76337, "tid": -914061504, "ts": 1716454218064593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218133928, "dur": 44, "args": { "External id": 42348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42348, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42348, "pid": 5, "tid": 7, "ts": 1716454218133928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064640, "dur": 13, "args": { "External id": 42348, "cbid": 211, "correlation": 42348 } }, { "ph": "s", "id": 42348, "pid": 76337, "tid": -914061504, "ts": 1716454218064640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218133973, "dur": 41, "args": { "External id": 42368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42368, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 42368, "pid": 5, "tid": 7, "ts": 1716454218133973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064742, "dur": 14, "args": { "External id": 42368, "cbid": 211, "correlation": 42368 } }, { "ph": "s", "id": 42368, "pid": 76337, "tid": -914061504, "ts": 1716454218064742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218134015, "dur": 5, "args": { "External id": 42380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42380, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 42380, "pid": 5, "tid": 7, "ts": 1716454218134015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064768, "dur": 7, "args": { "External id": 42380, "cbid": 211, "correlation": 42380 } }, { "ph": "s", "id": 42380, "pid": 76337, "tid": -914061504, "ts": 1716454218064768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218134022, "dur": 43, "args": { "External id": 42383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42383, "pid": 5, "tid": 7, "ts": 1716454218134022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064792, "dur": 7, "args": { "External id": 42383, "cbid": 211, "correlation": 42383 } }, { "ph": "s", "id": 42383, "pid": 76337, "tid": -914061504, "ts": 1716454218064792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218134066, "dur": 30, "args": { "External id": 42392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42392, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42392, "pid": 5, "tid": 7, "ts": 1716454218134066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064845, "dur": 12, "args": { "External id": 42392, "cbid": 211, "correlation": 42392 } }, { "ph": "s", "id": 42392, "pid": 76337, "tid": -914061504, "ts": 1716454218064845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218064902, "dur": 0, "args": { "External id": 42402, "cbid": 317, "correlation": 42402 } }, { "ph": "f", "id": 42402, "pid": 76337, "tid": -914061504, "ts": 1716454218064902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218064903, "dur": 0, "args": { "External id": 42403, "cbid": 203, "correlation": 42403 } }, { "ph": "f", "id": 42403, "pid": 76337, "tid": -914061504, "ts": 1716454218064903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218064903, "dur": 0, "args": { "External id": 42404, "cbid": 205, "correlation": 42404 } }, { "ph": "f", "id": 42404, "pid": 76337, "tid": -914061504, "ts": 1716454218064903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218134098, "dur": 31, "args": { "External id": 42408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42408, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42408, "pid": 5, "tid": 7, "ts": 1716454218134098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064919, "dur": 12, "args": { "External id": 42408, "cbid": 211, "correlation": 42408 } }, { "ph": "s", "id": 42408, "pid": 76337, "tid": -914061504, "ts": 1716454218064919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218134130, "dur": 62, "args": { "External id": 42410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42410, "pid": 5, "tid": 7, "ts": 1716454218134130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064934, "dur": 5, "args": { "External id": 42410, "cbid": 211, "correlation": 42410 } }, { "ph": "s", "id": 42410, "pid": 76337, "tid": -914061504, "ts": 1716454218064934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218134193, "dur": 948, "args": { "External id": 42412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42412, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42412, "pid": 5, "tid": 7, "ts": 1716454218134193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064946, "dur": 12, "args": { "External id": 42412, "cbid": 211, "correlation": 42412 } }, { "ph": "s", "id": 42412, "pid": 76337, "tid": -914061504, "ts": 1716454218064946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218135143, "dur": 21, "args": { "External id": 42414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42414, "pid": 5, "tid": 7, "ts": 1716454218135143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064962, "dur": 5, "args": { "External id": 42414, "cbid": 211, "correlation": 42414 } }, { "ph": "s", "id": 42414, "pid": 76337, "tid": -914061504, "ts": 1716454218064962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218135165, "dur": 33, "args": { "External id": 42420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42420, "pid": 5, "tid": 7, "ts": 1716454218135165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218064999, "dur": 9, "args": { "External id": 42420, "cbid": 211, "correlation": 42420 } }, { "ph": "s", "id": 42420, "pid": 76337, "tid": -914061504, "ts": 1716454218064999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218135199, "dur": 3, "args": { "External id": 42428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42428, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 42428, "pid": 5, "tid": 7, "ts": 1716454218135199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065045, "dur": 9, "args": { "External id": 42428, "cbid": 211, "correlation": 42428 } }, { "ph": "s", "id": 42428, "pid": 76337, "tid": -914061504, "ts": 1716454218065045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218065113, "dur": 1, "args": { "External id": 42444, "cbid": 251, "correlation": 42444 } }, { "ph": "f", "id": 42444, "pid": 76337, "tid": -914061504, "ts": 1716454218065113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218065119, "dur": 0, "args": { "External id": 42446, "cbid": 251, "correlation": 42446 } }, { "ph": "f", "id": 42446, "pid": 76337, "tid": -914061504, "ts": 1716454218065119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218135204, "dur": 12, "args": { "External id": 42447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42447, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 42447, "pid": 5, "tid": 7, "ts": 1716454218135204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065120, "dur": 11, "args": { "External id": 42447, "cbid": 211, "correlation": 42447 } }, { "ph": "s", "id": 42447, "pid": 76337, "tid": -914061504, "ts": 1716454218065120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218135217, "dur": 5, "args": { "External id": 42449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42449, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 42449, "pid": 5, "tid": 7, "ts": 1716454218135217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065135, "dur": 7, "args": { "External id": 42449, "cbid": 211, "correlation": 42449 } }, { "ph": "s", "id": 42449, "pid": 76337, "tid": -914061504, "ts": 1716454218065135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218135223, "dur": 29, "args": { "External id": 42459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42459, "pid": 5, "tid": 7, "ts": 1716454218135223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065204, "dur": 12, "args": { "External id": 42459, "cbid": 211, "correlation": 42459 } }, { "ph": "s", "id": 42459, "pid": 76337, "tid": -914061504, "ts": 1716454218065204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218135254, "dur": 31, "args": { "External id": 42479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42479, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 42479, "pid": 5, "tid": 7, "ts": 1716454218135254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065269, "dur": 11, "args": { "External id": 42479, "cbid": 211, "correlation": 42479 } }, { "ph": "s", "id": 42479, "pid": 76337, "tid": -914061504, "ts": 1716454218065269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218135286, "dur": 4, "args": { "External id": 42491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42491, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 42491, "pid": 5, "tid": 7, "ts": 1716454218135286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065294, "dur": 6, "args": { "External id": 42491, "cbid": 211, "correlation": 42491 } }, { "ph": "s", "id": 42491, "pid": 76337, "tid": -914061504, "ts": 1716454218065294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218135291, "dur": 31, "args": { "External id": 42494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42494, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42494, "pid": 5, "tid": 7, "ts": 1716454218135291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065313, "dur": 7, "args": { "External id": 42494, "cbid": 211, "correlation": 42494 } }, { "ph": "s", "id": 42494, "pid": 76337, "tid": -914061504, "ts": 1716454218065313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218135324, "dur": 20, "args": { "External id": 42503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42503, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42503, "pid": 5, "tid": 7, "ts": 1716454218135324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065354, "dur": 9, "args": { "External id": 42503, "cbid": 211, "correlation": 42503 } }, { "ph": "s", "id": 42503, "pid": 76337, "tid": -914061504, "ts": 1716454218065354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218065418, "dur": 0, "args": { "External id": 42513, "cbid": 317, "correlation": 42513 } }, { "ph": "f", "id": 42513, "pid": 76337, "tid": -914061504, "ts": 1716454218065418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218065419, "dur": 0, "args": { "External id": 42514, "cbid": 203, "correlation": 42514 } }, { "ph": "f", "id": 42514, "pid": 76337, "tid": -914061504, "ts": 1716454218065419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218065420, "dur": 0, "args": { "External id": 42515, "cbid": 205, "correlation": 42515 } }, { "ph": "f", "id": 42515, "pid": 76337, "tid": -914061504, "ts": 1716454218065420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218135345, "dur": 22, "args": { "External id": 42519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42519, "pid": 5, "tid": 7, "ts": 1716454218135345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065436, "dur": 13, "args": { "External id": 42519, "cbid": 211, "correlation": 42519 } }, { "ph": "s", "id": 42519, "pid": 76337, "tid": -914061504, "ts": 1716454218065436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218135368, "dur": 44, "args": { "External id": 42521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42521, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42521, "pid": 5, "tid": 7, "ts": 1716454218135368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065452, "dur": 5, "args": { "External id": 42521, "cbid": 211, "correlation": 42521 } }, { "ph": "s", "id": 42521, "pid": 76337, "tid": -914061504, "ts": 1716454218065452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218135413, "dur": 632, "args": { "External id": 42523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42523, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42523, "pid": 5, "tid": 7, "ts": 1716454218135413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065464, "dur": 6, "args": { "External id": 42523, "cbid": 211, "correlation": 42523 } }, { "ph": "s", "id": 42523, "pid": 76337, "tid": -914061504, "ts": 1716454218065464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218136046, "dur": 21, "args": { "External id": 42525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42525, "pid": 5, "tid": 7, "ts": 1716454218136046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065474, "dur": 5, "args": { "External id": 42525, "cbid": 211, "correlation": 42525 } }, { "ph": "s", "id": 42525, "pid": 76337, "tid": -914061504, "ts": 1716454218065474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218136068, "dur": 32, "args": { "External id": 42531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42531, "pid": 5, "tid": 7, "ts": 1716454218136068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065502, "dur": 8, "args": { "External id": 42531, "cbid": 211, "correlation": 42531 } }, { "ph": "s", "id": 42531, "pid": 76337, "tid": -914061504, "ts": 1716454218065502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218065560, "dur": 0, "args": { "External id": 42541, "cbid": 317, "correlation": 42541 } }, { "ph": "f", "id": 42541, "pid": 76337, "tid": -914061504, "ts": 1716454218065560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218065561, "dur": 0, "args": { "External id": 42542, "cbid": 203, "correlation": 42542 } }, { "ph": "f", "id": 42542, "pid": 76337, "tid": -914061504, "ts": 1716454218065561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218065561, "dur": 0, "args": { "External id": 42543, "cbid": 205, "correlation": 42543 } }, { "ph": "f", "id": 42543, "pid": 76337, "tid": -914061504, "ts": 1716454218065561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218136102, "dur": 31, "args": { "External id": 42547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42547, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42547, "pid": 5, "tid": 7, "ts": 1716454218136102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065576, "dur": 14, "args": { "External id": 42547, "cbid": 211, "correlation": 42547 } }, { "ph": "s", "id": 42547, "pid": 76337, "tid": -914061504, "ts": 1716454218065576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218136134, "dur": 151, "args": { "External id": 42549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42549, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42549, "pid": 5, "tid": 7, "ts": 1716454218136134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065597, "dur": 7, "args": { "External id": 42549, "cbid": 211, "correlation": 42549 } }, { "ph": "s", "id": 42549, "pid": 76337, "tid": -914061504, "ts": 1716454218065597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218136286, "dur": 23, "args": { "External id": 42551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42551, "pid": 5, "tid": 7, "ts": 1716454218136286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065608, "dur": 5, "args": { "External id": 42551, "cbid": 211, "correlation": 42551 } }, { "ph": "s", "id": 42551, "pid": 76337, "tid": -914061504, "ts": 1716454218065608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218136310, "dur": 32, "args": { "External id": 42557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42557, "pid": 5, "tid": 7, "ts": 1716454218136310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065634, "dur": 8, "args": { "External id": 42557, "cbid": 211, "correlation": 42557 } }, { "ph": "s", "id": 42557, "pid": 76337, "tid": -914061504, "ts": 1716454218065634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218136344, "dur": 27, "args": { "External id": 42565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42565, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42565, "pid": 5, "tid": 7, "ts": 1716454218136344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065664, "dur": 7, "args": { "External id": 42565, "cbid": 211, "correlation": 42565 } }, { "ph": "s", "id": 42565, "pid": 76337, "tid": -914061504, "ts": 1716454218065664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218136372, "dur": 20, "args": { "External id": 42573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42573, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42573, "pid": 5, "tid": 7, "ts": 1716454218136372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065691, "dur": 8, "args": { "External id": 42573, "cbid": 211, "correlation": 42573 } }, { "ph": "s", "id": 42573, "pid": 76337, "tid": -914061504, "ts": 1716454218065691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218136393, "dur": 30, "args": { "External id": 42593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42593, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 42593, "pid": 5, "tid": 7, "ts": 1716454218136393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065777, "dur": 12, "args": { "External id": 42593, "cbid": 211, "correlation": 42593 } }, { "ph": "s", "id": 42593, "pid": 76337, "tid": -914061504, "ts": 1716454218065777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218136425, "dur": 4, "args": { "External id": 42605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42605, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 42605, "pid": 5, "tid": 7, "ts": 1716454218136425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065799, "dur": 6, "args": { "External id": 42605, "cbid": 211, "correlation": 42605 } }, { "ph": "s", "id": 42605, "pid": 76337, "tid": -914061504, "ts": 1716454218065799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218136430, "dur": 30, "args": { "External id": 42608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42608, "pid": 5, "tid": 7, "ts": 1716454218136430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065817, "dur": 6, "args": { "External id": 42608, "cbid": 211, "correlation": 42608 } }, { "ph": "s", "id": 42608, "pid": 76337, "tid": -914061504, "ts": 1716454218065817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218065877, "dur": 0, "args": { "External id": 42619, "cbid": 317, "correlation": 42619 } }, { "ph": "f", "id": 42619, "pid": 76337, "tid": -914061504, "ts": 1716454218065877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218065878, "dur": 0, "args": { "External id": 42620, "cbid": 203, "correlation": 42620 } }, { "ph": "f", "id": 42620, "pid": 76337, "tid": -914061504, "ts": 1716454218065878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218065879, "dur": 0, "args": { "External id": 42621, "cbid": 205, "correlation": 42621 } }, { "ph": "f", "id": 42621, "pid": 76337, "tid": -914061504, "ts": 1716454218065879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218136461, "dur": 22, "args": { "External id": 42625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42625, "pid": 5, "tid": 7, "ts": 1716454218136461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065892, "dur": 12, "args": { "External id": 42625, "cbid": 211, "correlation": 42625 } }, { "ph": "s", "id": 42625, "pid": 76337, "tid": -914061504, "ts": 1716454218065892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218136484, "dur": 102, "args": { "External id": 42627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42627, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42627, "pid": 5, "tid": 7, "ts": 1716454218136484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065910, "dur": 6, "args": { "External id": 42627, "cbid": 211, "correlation": 42627 } }, { "ph": "s", "id": 42627, "pid": 76337, "tid": -914061504, "ts": 1716454218065910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218136588, "dur": 23, "args": { "External id": 42629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42629, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42629, "pid": 5, "tid": 7, "ts": 1716454218136588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065920, "dur": 5, "args": { "External id": 42629, "cbid": 211, "correlation": 42629 } }, { "ph": "s", "id": 42629, "pid": 76337, "tid": -914061504, "ts": 1716454218065920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218136612, "dur": 32, "args": { "External id": 42635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42635, "pid": 5, "tid": 7, "ts": 1716454218136612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218065948, "dur": 8, "args": { "External id": 42635, "cbid": 211, "correlation": 42635 } }, { "ph": "s", "id": 42635, "pid": 76337, "tid": -914061504, "ts": 1716454218065948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218136645, "dur": 178, "args": { "External id": 42644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42644, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42644, "pid": 5, "tid": 7, "ts": 1716454218136645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066039, "dur": 14, "args": { "External id": 42644, "cbid": 211, "correlation": 42644 } }, { "ph": "s", "id": 42644, "pid": 76337, "tid": -914061504, "ts": 1716454218066039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218136824, "dur": 63, "args": { "External id": 42666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42666, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42666, "pid": 5, "tid": 7, "ts": 1716454218136824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066098, "dur": 10, "args": { "External id": 42666, "cbid": 211, "correlation": 42666 } }, { "ph": "s", "id": 42666, "pid": 76337, "tid": -914061504, "ts": 1716454218066098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066186, "dur": 1, "args": { "External id": 42677, "cbid": 251, "correlation": 42677 } }, { "ph": "f", "id": 42677, "pid": 76337, "tid": -914061504, "ts": 1716454218066186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218136888, "dur": 151, "args": { "External id": 42678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42678, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42678, "pid": 5, "tid": 7, "ts": 1716454218136888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066192, "dur": 15, "args": { "External id": 42678, "cbid": 211, "correlation": 42678 } }, { "ph": "s", "id": 42678, "pid": 76337, "tid": -914061504, "ts": 1716454218066192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066266, "dur": 1, "args": { "External id": 42689, "cbid": 251, "correlation": 42689 } }, { "ph": "f", "id": 42689, "pid": 76337, "tid": -914061504, "ts": 1716454218066266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218137040, "dur": 141, "args": { "External id": 42690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42690, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42690, "pid": 5, "tid": 7, "ts": 1716454218137040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066270, "dur": 11, "args": { "External id": 42690, "cbid": 211, "correlation": 42690 } }, { "ph": "s", "id": 42690, "pid": 76337, "tid": -914061504, "ts": 1716454218066270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066336, "dur": 1, "args": { "External id": 42701, "cbid": 251, "correlation": 42701 } }, { "ph": "f", "id": 42701, "pid": 76337, "tid": -914061504, "ts": 1716454218066336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218137183, "dur": 144, "args": { "External id": 42702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42702, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42702, "pid": 5, "tid": 7, "ts": 1716454218137183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066340, "dur": 11, "args": { "External id": 42702, "cbid": 211, "correlation": 42702 } }, { "ph": "s", "id": 42702, "pid": 76337, "tid": -914061504, "ts": 1716454218066340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218137329, "dur": 1902, "args": { "External id": 42723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42723, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 42723, "pid": 5, "tid": 7, "ts": 1716454218137329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066419, "dur": 12, "args": { "External id": 42723, "cbid": 211, "correlation": 42723 } }, { "ph": "s", "id": 42723, "pid": 76337, "tid": -914061504, "ts": 1716454218066419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066516, "dur": 1, "args": { "External id": 42741, "cbid": 251, "correlation": 42741 } }, { "ph": "f", "id": 42741, "pid": 76337, "tid": -914061504, "ts": 1716454218066516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218139232, "dur": 147, "args": { "External id": 42743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42743, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 42743, "pid": 5, "tid": 7, "ts": 1716454218139232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066522, "dur": 13, "args": { "External id": 42743, "cbid": 211, "correlation": 42743 } }, { "ph": "s", "id": 42743, "pid": 76337, "tid": -914061504, "ts": 1716454218066522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218139381, "dur": 35, "args": { "External id": 42751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42751, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42751, "pid": 5, "tid": 7, "ts": 1716454218139381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066591, "dur": 12, "args": { "External id": 42751, "cbid": 211, "correlation": 42751 } }, { "ph": "s", "id": 42751, "pid": 76337, "tid": -914061504, "ts": 1716454218066591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218139417, "dur": 51, "args": { "External id": 42759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42759, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42759, "pid": 5, "tid": 7, "ts": 1716454218139417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066630, "dur": 11, "args": { "External id": 42759, "cbid": 211, "correlation": 42759 } }, { "ph": "s", "id": 42759, "pid": 76337, "tid": -914061504, "ts": 1716454218066630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218139470, "dur": 30, "args": { "External id": 42770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42770, "pid": 5, "tid": 7, "ts": 1716454218139470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066705, "dur": 13, "args": { "External id": 42770, "cbid": 211, "correlation": 42770 } }, { "ph": "s", "id": 42770, "pid": 76337, "tid": -914061504, "ts": 1716454218066705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218139501, "dur": 34, "args": { "External id": 42792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42792, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42792, "pid": 5, "tid": 7, "ts": 1716454218139501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066737, "dur": 8, "args": { "External id": 42792, "cbid": 211, "correlation": 42792 } }, { "ph": "s", "id": 42792, "pid": 76337, "tid": -914061504, "ts": 1716454218066737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066822, "dur": 1, "args": { "External id": 42803, "cbid": 251, "correlation": 42803 } }, { "ph": "f", "id": 42803, "pid": 76337, "tid": -914061504, "ts": 1716454218066822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218139536, "dur": 89, "args": { "External id": 42804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42804, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42804, "pid": 5, "tid": 7, "ts": 1716454218139536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066828, "dur": 14, "args": { "External id": 42804, "cbid": 211, "correlation": 42804 } }, { "ph": "s", "id": 42804, "pid": 76337, "tid": -914061504, "ts": 1716454218066828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066897, "dur": 1, "args": { "External id": 42815, "cbid": 251, "correlation": 42815 } }, { "ph": "f", "id": 42815, "pid": 76337, "tid": -914061504, "ts": 1716454218066897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066901, "dur": 0, "args": { "External id": 42816, "cbid": 251, "correlation": 42816 } }, { "ph": "f", "id": 42816, "pid": 76337, "tid": -914061504, "ts": 1716454218066901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218139627, "dur": 10, "args": { "External id": 42817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42817, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 42817, "pid": 5, "tid": 7, "ts": 1716454218139627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066902, "dur": 12, "args": { "External id": 42817, "cbid": 211, "correlation": 42817 } }, { "ph": "s", "id": 42817, "pid": 76337, "tid": -914061504, "ts": 1716454218066902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218139638, "dur": 5, "args": { "External id": 42819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42819, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 42819, "pid": 5, "tid": 7, "ts": 1716454218139638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066916, "dur": 6, "args": { "External id": 42819, "cbid": 211, "correlation": 42819 } }, { "ph": "s", "id": 42819, "pid": 76337, "tid": -914061504, "ts": 1716454218066916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066972, "dur": 10, "args": { "External id": 42830, "cbid": 251, "correlation": 42830 } }, { "ph": "f", "id": 42830, "pid": 76337, "tid": -914061504, "ts": 1716454218066972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218066985, "dur": 0, "args": { "External id": 42831, "cbid": 251, "correlation": 42831 } }, { "ph": "f", "id": 42831, "pid": 76337, "tid": -914061504, "ts": 1716454218066985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218139644, "dur": 7, "args": { "External id": 42832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42832, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 42832, "pid": 5, "tid": 7, "ts": 1716454218139644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218066987, "dur": 13, "args": { "External id": 42832, "cbid": 211, "correlation": 42832 } }, { "ph": "s", "id": 42832, "pid": 76337, "tid": -914061504, "ts": 1716454218066987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218139652, "dur": 3, "args": { "External id": 42834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42834, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 42834, "pid": 5, "tid": 7, "ts": 1716454218139652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067002, "dur": 8, "args": { "External id": 42834, "cbid": 211, "correlation": 42834 } }, { "ph": "s", "id": 42834, "pid": 76337, "tid": -914061504, "ts": 1716454218067002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218139656, "dur": 91, "args": { "External id": 42855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42855, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 42855, "pid": 5, "tid": 7, "ts": 1716454218139656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067079, "dur": 13, "args": { "External id": 42855, "cbid": 211, "correlation": 42855 } }, { "ph": "s", "id": 42855, "pid": 76337, "tid": -914061504, "ts": 1716454218067079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218067177, "dur": 1, "args": { "External id": 42873, "cbid": 251, "correlation": 42873 } }, { "ph": "f", "id": 42873, "pid": 76337, "tid": -914061504, "ts": 1716454218067177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218139748, "dur": 96, "args": { "External id": 42875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42875, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42875, "pid": 5, "tid": 7, "ts": 1716454218139748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067183, "dur": 13, "args": { "External id": 42875, "cbid": 211, "correlation": 42875 } }, { "ph": "s", "id": 42875, "pid": 76337, "tid": -914061504, "ts": 1716454218067183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218139846, "dur": 19, "args": { "External id": 42883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42883, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42883, "pid": 5, "tid": 7, "ts": 1716454218139846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067252, "dur": 12, "args": { "External id": 42883, "cbid": 211, "correlation": 42883 } }, { "ph": "s", "id": 42883, "pid": 76337, "tid": -914061504, "ts": 1716454218067252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218139866, "dur": 38, "args": { "External id": 42891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42891, "pid": 5, "tid": 7, "ts": 1716454218139866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067293, "dur": 10, "args": { "External id": 42891, "cbid": 211, "correlation": 42891 } }, { "ph": "s", "id": 42891, "pid": 76337, "tid": -914061504, "ts": 1716454218067293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218139905, "dur": 34, "args": { "External id": 42913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42913, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42913, "pid": 5, "tid": 7, "ts": 1716454218139905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067346, "dur": 14, "args": { "External id": 42913, "cbid": 211, "correlation": 42913 } }, { "ph": "s", "id": 42913, "pid": 76337, "tid": -914061504, "ts": 1716454218067346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218067439, "dur": 1, "args": { "External id": 42929, "cbid": 251, "correlation": 42929 } }, { "ph": "f", "id": 42929, "pid": 76337, "tid": -914061504, "ts": 1716454218067439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218067444, "dur": 0, "args": { "External id": 42931, "cbid": 251, "correlation": 42931 } }, { "ph": "f", "id": 42931, "pid": 76337, "tid": -914061504, "ts": 1716454218067444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218139940, "dur": 533, "args": { "External id": 42932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42932, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 42932, "pid": 5, "tid": 7, "ts": 1716454218139940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067447, "dur": 14, "args": { "External id": 42932, "cbid": 211, "correlation": 42932 } }, { "ph": "s", "id": 42932, "pid": 76337, "tid": -914061504, "ts": 1716454218067447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218140475, "dur": 123, "args": { "External id": 42940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42940, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42940, "pid": 5, "tid": 7, "ts": 1716454218140475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067514, "dur": 13, "args": { "External id": 42940, "cbid": 211, "correlation": 42940 } }, { "ph": "s", "id": 42940, "pid": 76337, "tid": -914061504, "ts": 1716454218067514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218140599, "dur": 131, "args": { "External id": 42948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42948, "pid": 5, "tid": 7, "ts": 1716454218140599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067544, "dur": 8, "args": { "External id": 42948, "cbid": 211, "correlation": 42948 } }, { "ph": "s", "id": 42948, "pid": 76337, "tid": -914061504, "ts": 1716454218067544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218067620, "dur": 1, "args": { "External id": 42964, "cbid": 251, "correlation": 42964 } }, { "ph": "f", "id": 42964, "pid": 76337, "tid": -914061504, "ts": 1716454218067620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218140732, "dur": 303, "args": { "External id": 42966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42966, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 42966, "pid": 5, "tid": 7, "ts": 1716454218140732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067625, "dur": 12, "args": { "External id": 42966, "cbid": 211, "correlation": 42966 } }, { "ph": "s", "id": 42966, "pid": 76337, "tid": -914061504, "ts": 1716454218067625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218141037, "dur": 27, "args": { "External id": 42974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42974, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42974, "pid": 5, "tid": 7, "ts": 1716454218141037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067668, "dur": 9, "args": { "External id": 42974, "cbid": 211, "correlation": 42974 } }, { "ph": "s", "id": 42974, "pid": 76337, "tid": -914061504, "ts": 1716454218067668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218141065, "dur": 81, "args": { "External id": 42985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 42985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 42985, "pid": 5, "tid": 7, "ts": 1716454218141065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067734, "dur": 12, "args": { "External id": 42985, "cbid": 211, "correlation": 42985 } }, { "ph": "s", "id": 42985, "pid": 76337, "tid": -914061504, "ts": 1716454218067734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218067800, "dur": 0, "args": { "External id": 42997, "cbid": 317, "correlation": 42997 } }, { "ph": "f", "id": 42997, "pid": 76337, "tid": -914061504, "ts": 1716454218067800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218067801, "dur": 0, "args": { "External id": 42998, "cbid": 203, "correlation": 42998 } }, { "ph": "f", "id": 42998, "pid": 76337, "tid": -914061504, "ts": 1716454218067801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218067802, "dur": 0, "args": { "External id": 42999, "cbid": 205, "correlation": 42999 } }, { "ph": "f", "id": 42999, "pid": 76337, "tid": -914061504, "ts": 1716454218067802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218141146, "dur": 23, "args": { "External id": 43003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43003, "pid": 5, "tid": 7, "ts": 1716454218141146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067817, "dur": 12, "args": { "External id": 43003, "cbid": 211, "correlation": 43003 } }, { "ph": "s", "id": 43003, "pid": 76337, "tid": -914061504, "ts": 1716454218067817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218141171, "dur": 117, "args": { "External id": 43005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43005, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43005, "pid": 5, "tid": 7, "ts": 1716454218141171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067835, "dur": 6, "args": { "External id": 43005, "cbid": 211, "correlation": 43005 } }, { "ph": "s", "id": 43005, "pid": 76337, "tid": -914061504, "ts": 1716454218067835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218141289, "dur": 24, "args": { "External id": 43007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43007, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43007, "pid": 5, "tid": 7, "ts": 1716454218141289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067846, "dur": 5, "args": { "External id": 43007, "cbid": 211, "correlation": 43007 } }, { "ph": "s", "id": 43007, "pid": 76337, "tid": -914061504, "ts": 1716454218067846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218141314, "dur": 32, "args": { "External id": 43013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43013, "pid": 5, "tid": 7, "ts": 1716454218141314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067874, "dur": 8, "args": { "External id": 43013, "cbid": 211, "correlation": 43013 } }, { "ph": "s", "id": 43013, "pid": 76337, "tid": -914061504, "ts": 1716454218067874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218141348, "dur": 27, "args": { "External id": 43021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43021, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43021, "pid": 5, "tid": 7, "ts": 1716454218141348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067905, "dur": 9, "args": { "External id": 43021, "cbid": 211, "correlation": 43021 } }, { "ph": "s", "id": 43021, "pid": 76337, "tid": -914061504, "ts": 1716454218067905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218141376, "dur": 100, "args": { "External id": 43032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43032, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43032, "pid": 5, "tid": 7, "ts": 1716454218141376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218067991, "dur": 15, "args": { "External id": 43032, "cbid": 211, "correlation": 43032 } }, { "ph": "s", "id": 43032, "pid": 76337, "tid": -914061504, "ts": 1716454218067991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218068050, "dur": 0, "args": { "External id": 43042, "cbid": 317, "correlation": 43042 } }, { "ph": "f", "id": 43042, "pid": 76337, "tid": -914061504, "ts": 1716454218068050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218068051, "dur": 0, "args": { "External id": 43043, "cbid": 203, "correlation": 43043 } }, { "ph": "f", "id": 43043, "pid": 76337, "tid": -914061504, "ts": 1716454218068051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218068052, "dur": 0, "args": { "External id": 43044, "cbid": 205, "correlation": 43044 } }, { "ph": "f", "id": 43044, "pid": 76337, "tid": -914061504, "ts": 1716454218068052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218141477, "dur": 74, "args": { "External id": 43048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43048, "pid": 5, "tid": 7, "ts": 1716454218141477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068071, "dur": 13, "args": { "External id": 43048, "cbid": 211, "correlation": 43048 } }, { "ph": "s", "id": 43048, "pid": 76337, "tid": -914061504, "ts": 1716454218068071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218141552, "dur": 43, "args": { "External id": 43050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43050, "pid": 5, "tid": 7, "ts": 1716454218141552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068086, "dur": 5, "args": { "External id": 43050, "cbid": 211, "correlation": 43050 } }, { "ph": "s", "id": 43050, "pid": 76337, "tid": -914061504, "ts": 1716454218068086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218141597, "dur": 4, "args": { "External id": 43052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43052, "pid": 5, "tid": 7, "ts": 1716454218141597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068098, "dur": 9, "args": { "External id": 43052, "cbid": 211, "correlation": 43052 } }, { "ph": "s", "id": 43052, "pid": 76337, "tid": -914061504, "ts": 1716454218068098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218068113, "dur": 0, "args": { "External id": 43053, "cbid": 51, "correlation": 43053 } }, { "ph": "s", "id": 43053, "pid": 76337, "tid": -914061504, "ts": 1716454218068113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218141602, "dur": 2214, "args": { "External id": 43054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43054, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43054, "pid": 5, "tid": 7, "ts": 1716454218141602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068114, "dur": 8, "args": { "External id": 43054, "cbid": 211, "correlation": 43054 } }, { "ph": "s", "id": 43054, "pid": 76337, "tid": -914061504, "ts": 1716454218068114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218143818, "dur": 112, "args": { "External id": 43059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43059, "pid": 5, "tid": 7, "ts": 1716454218143818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068146, "dur": 8, "args": { "External id": 43059, "cbid": 211, "correlation": 43059 } }, { "ph": "s", "id": 43059, "pid": 76337, "tid": -914061504, "ts": 1716454218068146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218143931, "dur": 164, "args": { "External id": 43068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43068, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43068, "pid": 5, "tid": 7, "ts": 1716454218143931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068251, "dur": 13, "args": { "External id": 43068, "cbid": 211, "correlation": 43068 } }, { "ph": "s", "id": 43068, "pid": 76337, "tid": -914061504, "ts": 1716454218068251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218144096, "dur": 127, "args": { "External id": 43088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43088, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43088, "pid": 5, "tid": 7, "ts": 1716454218144096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068322, "dur": 12, "args": { "External id": 43088, "cbid": 211, "correlation": 43088 } }, { "ph": "s", "id": 43088, "pid": 76337, "tid": -914061504, "ts": 1716454218068322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218144224, "dur": 5, "args": { "External id": 43100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43100, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 43100, "pid": 5, "tid": 7, "ts": 1716454218144224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068344, "dur": 6, "args": { "External id": 43100, "cbid": 211, "correlation": 43100 } }, { "ph": "s", "id": 43100, "pid": 76337, "tid": -914061504, "ts": 1716454218068344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218144230, "dur": 159, "args": { "External id": 43103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43103, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43103, "pid": 5, "tid": 7, "ts": 1716454218144230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068363, "dur": 6, "args": { "External id": 43103, "cbid": 211, "correlation": 43103 } }, { "ph": "s", "id": 43103, "pid": 76337, "tid": -914061504, "ts": 1716454218068363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218144390, "dur": 101, "args": { "External id": 43112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43112, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43112, "pid": 5, "tid": 7, "ts": 1716454218144390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068403, "dur": 10, "args": { "External id": 43112, "cbid": 211, "correlation": 43112 } }, { "ph": "s", "id": 43112, "pid": 76337, "tid": -914061504, "ts": 1716454218068403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218068460, "dur": 0, "args": { "External id": 43122, "cbid": 317, "correlation": 43122 } }, { "ph": "f", "id": 43122, "pid": 76337, "tid": -914061504, "ts": 1716454218068460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218068461, "dur": 0, "args": { "External id": 43123, "cbid": 203, "correlation": 43123 } }, { "ph": "f", "id": 43123, "pid": 76337, "tid": -914061504, "ts": 1716454218068461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218068461, "dur": 0, "args": { "External id": 43124, "cbid": 205, "correlation": 43124 } }, { "ph": "f", "id": 43124, "pid": 76337, "tid": -914061504, "ts": 1716454218068461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218144493, "dur": 111, "args": { "External id": 43128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43128, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43128, "pid": 5, "tid": 7, "ts": 1716454218144493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068480, "dur": 13, "args": { "External id": 43128, "cbid": 211, "correlation": 43128 } }, { "ph": "s", "id": 43128, "pid": 76337, "tid": -914061504, "ts": 1716454218068480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218144605, "dur": 33, "args": { "External id": 43130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43130, "pid": 5, "tid": 7, "ts": 1716454218144605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068495, "dur": 5, "args": { "External id": 43130, "cbid": 211, "correlation": 43130 } }, { "ph": "s", "id": 43130, "pid": 76337, "tid": -914061504, "ts": 1716454218068495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218144639, "dur": 3, "args": { "External id": 43132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43132, "pid": 5, "tid": 7, "ts": 1716454218144639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068506, "dur": 6, "args": { "External id": 43132, "cbid": 211, "correlation": 43132 } }, { "ph": "s", "id": 43132, "pid": 76337, "tid": -914061504, "ts": 1716454218068506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218068516, "dur": 0, "args": { "External id": 43133, "cbid": 51, "correlation": 43133 } }, { "ph": "s", "id": 43133, "pid": 76337, "tid": -914061504, "ts": 1716454218068516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218144644, "dur": 1978, "args": { "External id": 43134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43134, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43134, "pid": 5, "tid": 7, "ts": 1716454218144644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068517, "dur": 6, "args": { "External id": 43134, "cbid": 211, "correlation": 43134 } }, { "ph": "s", "id": 43134, "pid": 76337, "tid": -914061504, "ts": 1716454218068517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218146624, "dur": 59, "args": { "External id": 43139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43139, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43139, "pid": 5, "tid": 7, "ts": 1716454218146624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068547, "dur": 8, "args": { "External id": 43139, "cbid": 211, "correlation": 43139 } }, { "ph": "s", "id": 43139, "pid": 76337, "tid": -914061504, "ts": 1716454218068547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218146684, "dur": 3, "args": { "External id": 43147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43147, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43147, "pid": 5, "tid": 7, "ts": 1716454218146684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068591, "dur": 10, "args": { "External id": 43147, "cbid": 211, "correlation": 43147 } }, { "ph": "s", "id": 43147, "pid": 76337, "tid": -914061504, "ts": 1716454218068591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218068658, "dur": 1, "args": { "External id": 43163, "cbid": 251, "correlation": 43163 } }, { "ph": "f", "id": 43163, "pid": 76337, "tid": -914061504, "ts": 1716454218068658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218068663, "dur": 0, "args": { "External id": 43165, "cbid": 251, "correlation": 43165 } }, { "ph": "f", "id": 43165, "pid": 76337, "tid": -914061504, "ts": 1716454218068663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218146689, "dur": 11, "args": { "External id": 43166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43166, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 43166, "pid": 5, "tid": 7, "ts": 1716454218146689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068665, "dur": 11, "args": { "External id": 43166, "cbid": 211, "correlation": 43166 } }, { "ph": "s", "id": 43166, "pid": 76337, "tid": -914061504, "ts": 1716454218068665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218146701, "dur": 5, "args": { "External id": 43168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43168, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 43168, "pid": 5, "tid": 7, "ts": 1716454218146701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068678, "dur": 5, "args": { "External id": 43168, "cbid": 211, "correlation": 43168 } }, { "ph": "s", "id": 43168, "pid": 76337, "tid": -914061504, "ts": 1716454218068678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218146707, "dur": 53, "args": { "External id": 43178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43178, "pid": 5, "tid": 7, "ts": 1716454218146707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068737, "dur": 12, "args": { "External id": 43178, "cbid": 211, "correlation": 43178 } }, { "ph": "s", "id": 43178, "pid": 76337, "tid": -914061504, "ts": 1716454218068737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218146761, "dur": 50, "args": { "External id": 43198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43198, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43198, "pid": 5, "tid": 7, "ts": 1716454218146761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068807, "dur": 11, "args": { "External id": 43198, "cbid": 211, "correlation": 43198 } }, { "ph": "s", "id": 43198, "pid": 76337, "tid": -914061504, "ts": 1716454218068807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218146813, "dur": 4, "args": { "External id": 43210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43210, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43210, "pid": 5, "tid": 7, "ts": 1716454218146813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068828, "dur": 6, "args": { "External id": 43210, "cbid": 211, "correlation": 43210 } }, { "ph": "s", "id": 43210, "pid": 76337, "tid": -914061504, "ts": 1716454218068828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218146818, "dur": 55, "args": { "External id": 43213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43213, "pid": 5, "tid": 7, "ts": 1716454218146818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068847, "dur": 7, "args": { "External id": 43213, "cbid": 211, "correlation": 43213 } }, { "ph": "s", "id": 43213, "pid": 76337, "tid": -914061504, "ts": 1716454218068847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218146874, "dur": 36, "args": { "External id": 43222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43222, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43222, "pid": 5, "tid": 7, "ts": 1716454218146874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068888, "dur": 10, "args": { "External id": 43222, "cbid": 211, "correlation": 43222 } }, { "ph": "s", "id": 43222, "pid": 76337, "tid": -914061504, "ts": 1716454218068888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218068951, "dur": 0, "args": { "External id": 43232, "cbid": 317, "correlation": 43232 } }, { "ph": "f", "id": 43232, "pid": 76337, "tid": -914061504, "ts": 1716454218068951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218068952, "dur": 0, "args": { "External id": 43233, "cbid": 203, "correlation": 43233 } }, { "ph": "f", "id": 43233, "pid": 76337, "tid": -914061504, "ts": 1716454218068952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218068952, "dur": 0, "args": { "External id": 43234, "cbid": 205, "correlation": 43234 } }, { "ph": "f", "id": 43234, "pid": 76337, "tid": -914061504, "ts": 1716454218068952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218146912, "dur": 39, "args": { "External id": 43238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43238, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43238, "pid": 5, "tid": 7, "ts": 1716454218146912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068971, "dur": 21, "args": { "External id": 43238, "cbid": 211, "correlation": 43238 } }, { "ph": "s", "id": 43238, "pid": 76337, "tid": -914061504, "ts": 1716454218068971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218146952, "dur": 14, "args": { "External id": 43240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43240, "pid": 5, "tid": 7, "ts": 1716454218146952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218068995, "dur": 5, "args": { "External id": 43240, "cbid": 211, "correlation": 43240 } }, { "ph": "s", "id": 43240, "pid": 76337, "tid": -914061504, "ts": 1716454218068995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218146968, "dur": 3, "args": { "External id": 43242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43242, "pid": 5, "tid": 7, "ts": 1716454218146968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069005, "dur": 5, "args": { "External id": 43242, "cbid": 211, "correlation": 43242 } }, { "ph": "s", "id": 43242, "pid": 76337, "tid": -914061504, "ts": 1716454218069005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218069014, "dur": 0, "args": { "External id": 43243, "cbid": 51, "correlation": 43243 } }, { "ph": "s", "id": 43243, "pid": 76337, "tid": -914061504, "ts": 1716454218069014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218146973, "dur": 687, "args": { "External id": 43244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43244, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43244, "pid": 5, "tid": 7, "ts": 1716454218146973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069014, "dur": 6, "args": { "External id": 43244, "cbid": 211, "correlation": 43244 } }, { "ph": "s", "id": 43244, "pid": 76337, "tid": -914061504, "ts": 1716454218069014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218147661, "dur": 59, "args": { "External id": 43249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43249, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43249, "pid": 5, "tid": 7, "ts": 1716454218147661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069043, "dur": 12, "args": { "External id": 43249, "cbid": 211, "correlation": 43249 } }, { "ph": "s", "id": 43249, "pid": 76337, "tid": -914061504, "ts": 1716454218069043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218069104, "dur": 0, "args": { "External id": 43259, "cbid": 317, "correlation": 43259 } }, { "ph": "f", "id": 43259, "pid": 76337, "tid": -914061504, "ts": 1716454218069104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218069105, "dur": 0, "args": { "External id": 43260, "cbid": 203, "correlation": 43260 } }, { "ph": "f", "id": 43260, "pid": 76337, "tid": -914061504, "ts": 1716454218069105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218069106, "dur": 0, "args": { "External id": 43261, "cbid": 205, "correlation": 43261 } }, { "ph": "f", "id": 43261, "pid": 76337, "tid": -914061504, "ts": 1716454218069106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218147721, "dur": 3, "args": { "External id": 43265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43265, "pid": 5, "tid": 7, "ts": 1716454218147721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069126, "dur": 11, "args": { "External id": 43265, "cbid": 211, "correlation": 43265 } }, { "ph": "s", "id": 43265, "pid": 76337, "tid": -914061504, "ts": 1716454218069126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218069142, "dur": 0, "args": { "External id": 43266, "cbid": 51, "correlation": 43266 } }, { "ph": "s", "id": 43266, "pid": 76337, "tid": -914061504, "ts": 1716454218069142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454218147725, "dur": 261, "args": { "External id": 43267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43267, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43267, "pid": 5, "tid": 7, "ts": 1716454218147725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069143, "dur": 6, "args": { "External id": 43267, "cbid": 211, "correlation": 43267 } }, { "ph": "s", "id": 43267, "pid": 76337, "tid": -914061504, "ts": 1716454218069143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218147987, "dur": 58, "args": { "External id": 43272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43272, "pid": 5, "tid": 7, "ts": 1716454218147987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069170, "dur": 9, "args": { "External id": 43272, "cbid": 211, "correlation": 43272 } }, { "ph": "s", "id": 43272, "pid": 76337, "tid": -914061504, "ts": 1716454218069170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218148047, "dur": 50, "args": { "External id": 43280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43280, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43280, "pid": 5, "tid": 7, "ts": 1716454218148047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069199, "dur": 8, "args": { "External id": 43280, "cbid": 211, "correlation": 43280 } }, { "ph": "s", "id": 43280, "pid": 76337, "tid": -914061504, "ts": 1716454218069199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218148098, "dur": 35, "args": { "External id": 43288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43288, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43288, "pid": 5, "tid": 7, "ts": 1716454218148098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069229, "dur": 9, "args": { "External id": 43288, "cbid": 211, "correlation": 43288 } }, { "ph": "s", "id": 43288, "pid": 76337, "tid": -914061504, "ts": 1716454218069229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218148134, "dur": 52, "args": { "External id": 43308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43308, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43308, "pid": 5, "tid": 7, "ts": 1716454218148134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069311, "dur": 12, "args": { "External id": 43308, "cbid": 211, "correlation": 43308 } }, { "ph": "s", "id": 43308, "pid": 76337, "tid": -914061504, "ts": 1716454218069311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218148188, "dur": 4, "args": { "External id": 43320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43320, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43320, "pid": 5, "tid": 7, "ts": 1716454218148188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069335, "dur": 9, "args": { "External id": 43320, "cbid": 211, "correlation": 43320 } }, { "ph": "s", "id": 43320, "pid": 76337, "tid": -914061504, "ts": 1716454218069335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218148193, "dur": 55, "args": { "External id": 43323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43323, "pid": 5, "tid": 7, "ts": 1716454218148193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069356, "dur": 7, "args": { "External id": 43323, "cbid": 211, "correlation": 43323 } }, { "ph": "s", "id": 43323, "pid": 76337, "tid": -914061504, "ts": 1716454218069356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218069413, "dur": 0, "args": { "External id": 43334, "cbid": 317, "correlation": 43334 } }, { "ph": "f", "id": 43334, "pid": 76337, "tid": -914061504, "ts": 1716454218069413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218069414, "dur": 0, "args": { "External id": 43335, "cbid": 203, "correlation": 43335 } }, { "ph": "f", "id": 43335, "pid": 76337, "tid": -914061504, "ts": 1716454218069414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218069415, "dur": 0, "args": { "External id": 43336, "cbid": 205, "correlation": 43336 } }, { "ph": "f", "id": 43336, "pid": 76337, "tid": -914061504, "ts": 1716454218069415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069449, "dur": 3, "args": { "External id": 43340, "cbid": 251, "correlation": 43340 } }, { "ph": "f", "id": 43340, "pid": 76337, "tid": -914061504, "ts": 1716454218069449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069454, "dur": 1, "args": { "External id": 43341, "cbid": 251, "correlation": 43341 } }, { "ph": "f", "id": 43341, "pid": 76337, "tid": -914061504, "ts": 1716454218069454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069455, "dur": 1, "args": { "External id": 43342, "cbid": 251, "correlation": 43342 } }, { "ph": "f", "id": 43342, "pid": 76337, "tid": -914061504, "ts": 1716454218069455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069457, "dur": 1, "args": { "External id": 43343, "cbid": 251, "correlation": 43343 } }, { "ph": "f", "id": 43343, "pid": 76337, "tid": -914061504, "ts": 1716454218069457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069459, "dur": 1, "args": { "External id": 43344, "cbid": 251, "correlation": 43344 } }, { "ph": "f", "id": 43344, "pid": 76337, "tid": -914061504, "ts": 1716454218069459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069461, "dur": 1, "args": { "External id": 43345, "cbid": 251, "correlation": 43345 } }, { "ph": "f", "id": 43345, "pid": 76337, "tid": -914061504, "ts": 1716454218069461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069463, "dur": 1, "args": { "External id": 43346, "cbid": 251, "correlation": 43346 } }, { "ph": "f", "id": 43346, "pid": 76337, "tid": -914061504, "ts": 1716454218069463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069465, "dur": 1, "args": { "External id": 43347, "cbid": 251, "correlation": 43347 } }, { "ph": "f", "id": 43347, "pid": 76337, "tid": -914061504, "ts": 1716454218069465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069467, "dur": 0, "args": { "External id": 43348, "cbid": 251, "correlation": 43348 } }, { "ph": "f", "id": 43348, "pid": 76337, "tid": -914061504, "ts": 1716454218069467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218148249, "dur": 114, "args": { "External id": 43349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43349, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 43349, "pid": 5, "tid": 7, "ts": 1716454218148249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069470, "dur": 13, "args": { "External id": 43349, "cbid": 211, "correlation": 43349 } }, { "ph": "s", "id": 43349, "pid": 76337, "tid": -914061504, "ts": 1716454218069470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218148364, "dur": 58, "args": { "External id": 43355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43355, "pid": 5, "tid": 7, "ts": 1716454218148364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069507, "dur": 9, "args": { "External id": 43355, "cbid": 211, "correlation": 43355 } }, { "ph": "s", "id": 43355, "pid": 76337, "tid": -914061504, "ts": 1716454218069507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218148423, "dur": 575, "args": { "External id": 43364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43364, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43364, "pid": 5, "tid": 7, "ts": 1716454218148423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069591, "dur": 14, "args": { "External id": 43364, "cbid": 211, "correlation": 43364 } }, { "ph": "s", "id": 43364, "pid": 76337, "tid": -914061504, "ts": 1716454218069591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218149000, "dur": 177, "args": { "External id": 43386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43386, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43386, "pid": 5, "tid": 7, "ts": 1716454218149000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069649, "dur": 10, "args": { "External id": 43386, "cbid": 211, "correlation": 43386 } }, { "ph": "s", "id": 43386, "pid": 76337, "tid": -914061504, "ts": 1716454218069649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069738, "dur": 1, "args": { "External id": 43397, "cbid": 251, "correlation": 43397 } }, { "ph": "f", "id": 43397, "pid": 76337, "tid": -914061504, "ts": 1716454218069738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218149178, "dur": 195, "args": { "External id": 43398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43398, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43398, "pid": 5, "tid": 7, "ts": 1716454218149178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069743, "dur": 17, "args": { "External id": 43398, "cbid": 211, "correlation": 43398 } }, { "ph": "s", "id": 43398, "pid": 76337, "tid": -914061504, "ts": 1716454218069743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069817, "dur": 1, "args": { "External id": 43409, "cbid": 251, "correlation": 43409 } }, { "ph": "f", "id": 43409, "pid": 76337, "tid": -914061504, "ts": 1716454218069817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218149374, "dur": 182, "args": { "External id": 43410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43410, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43410, "pid": 5, "tid": 7, "ts": 1716454218149374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069821, "dur": 12, "args": { "External id": 43410, "cbid": 211, "correlation": 43410 } }, { "ph": "s", "id": 43410, "pid": 76337, "tid": -914061504, "ts": 1716454218069821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218069885, "dur": 1, "args": { "External id": 43421, "cbid": 251, "correlation": 43421 } }, { "ph": "f", "id": 43421, "pid": 76337, "tid": -914061504, "ts": 1716454218069885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218149557, "dur": 182, "args": { "External id": 43422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43422, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43422, "pid": 5, "tid": 7, "ts": 1716454218149557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069889, "dur": 12, "args": { "External id": 43422, "cbid": 211, "correlation": 43422 } }, { "ph": "s", "id": 43422, "pid": 76337, "tid": -914061504, "ts": 1716454218069889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218149741, "dur": 18285, "args": { "External id": 43443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43443, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 43443, "pid": 5, "tid": 7, "ts": 1716454218149741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218069971, "dur": 23, "args": { "External id": 43443, "cbid": 211, "correlation": 43443 } }, { "ph": "s", "id": 43443, "pid": 76337, "tid": -914061504, "ts": 1716454218069971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070081, "dur": 1, "args": { "External id": 43461, "cbid": 251, "correlation": 43461 } }, { "ph": "f", "id": 43461, "pid": 76337, "tid": -914061504, "ts": 1716454218070081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218168028, "dur": 203, "args": { "External id": 43463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43463, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43463, "pid": 5, "tid": 7, "ts": 1716454218168028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070087, "dur": 13, "args": { "External id": 43463, "cbid": 211, "correlation": 43463 } }, { "ph": "s", "id": 43463, "pid": 76337, "tid": -914061504, "ts": 1716454218070087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218168232, "dur": 66, "args": { "External id": 43471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43471, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43471, "pid": 5, "tid": 7, "ts": 1716454218168232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070157, "dur": 12, "args": { "External id": 43471, "cbid": 211, "correlation": 43471 } }, { "ph": "s", "id": 43471, "pid": 76337, "tid": -914061504, "ts": 1716454218070157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218168300, "dur": 97, "args": { "External id": 43479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43479, "pid": 5, "tid": 7, "ts": 1716454218168300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070199, "dur": 9, "args": { "External id": 43479, "cbid": 211, "correlation": 43479 } }, { "ph": "s", "id": 43479, "pid": 76337, "tid": -914061504, "ts": 1716454218070199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218168398, "dur": 55, "args": { "External id": 43490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43490, "pid": 5, "tid": 7, "ts": 1716454218168398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070271, "dur": 12, "args": { "External id": 43490, "cbid": 211, "correlation": 43490 } }, { "ph": "s", "id": 43490, "pid": 76337, "tid": -914061504, "ts": 1716454218070271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218168454, "dur": 91, "args": { "External id": 43512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43512, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43512, "pid": 5, "tid": 7, "ts": 1716454218168454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070302, "dur": 8, "args": { "External id": 43512, "cbid": 211, "correlation": 43512 } }, { "ph": "s", "id": 43512, "pid": 76337, "tid": -914061504, "ts": 1716454218070302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070386, "dur": 1, "args": { "External id": 43523, "cbid": 251, "correlation": 43523 } }, { "ph": "f", "id": 43523, "pid": 76337, "tid": -914061504, "ts": 1716454218070386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218168546, "dur": 106, "args": { "External id": 43524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43524, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43524, "pid": 5, "tid": 7, "ts": 1716454218168546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070391, "dur": 12, "args": { "External id": 43524, "cbid": 211, "correlation": 43524 } }, { "ph": "s", "id": 43524, "pid": 76337, "tid": -914061504, "ts": 1716454218070391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070468, "dur": 1, "args": { "External id": 43535, "cbid": 251, "correlation": 43535 } }, { "ph": "f", "id": 43535, "pid": 76337, "tid": -914061504, "ts": 1716454218070468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070472, "dur": 0, "args": { "External id": 43536, "cbid": 251, "correlation": 43536 } }, { "ph": "f", "id": 43536, "pid": 76337, "tid": -914061504, "ts": 1716454218070472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218168653, "dur": 10, "args": { "External id": 43537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43537, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 43537, "pid": 5, "tid": 7, "ts": 1716454218168653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070474, "dur": 14, "args": { "External id": 43537, "cbid": 211, "correlation": 43537 } }, { "ph": "s", "id": 43537, "pid": 76337, "tid": -914061504, "ts": 1716454218070474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218168665, "dur": 5, "args": { "External id": 43539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43539, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 43539, "pid": 5, "tid": 7, "ts": 1716454218168665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070492, "dur": 8, "args": { "External id": 43539, "cbid": 211, "correlation": 43539 } }, { "ph": "s", "id": 43539, "pid": 76337, "tid": -914061504, "ts": 1716454218070492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070555, "dur": 1, "args": { "External id": 43550, "cbid": 251, "correlation": 43550 } }, { "ph": "f", "id": 43550, "pid": 76337, "tid": -914061504, "ts": 1716454218070555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070558, "dur": 3, "args": { "External id": 43551, "cbid": 251, "correlation": 43551 } }, { "ph": "f", "id": 43551, "pid": 76337, "tid": -914061504, "ts": 1716454218070558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218168672, "dur": 6, "args": { "External id": 43552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43552, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 43552, "pid": 5, "tid": 7, "ts": 1716454218168672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070563, "dur": 12, "args": { "External id": 43552, "cbid": 211, "correlation": 43552 } }, { "ph": "s", "id": 43552, "pid": 76337, "tid": -914061504, "ts": 1716454218070563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218168679, "dur": 4, "args": { "External id": 43554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43554, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 43554, "pid": 5, "tid": 7, "ts": 1716454218168679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070577, "dur": 5, "args": { "External id": 43554, "cbid": 211, "correlation": 43554 } }, { "ph": "s", "id": 43554, "pid": 76337, "tid": -914061504, "ts": 1716454218070577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218168684, "dur": 153, "args": { "External id": 43575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43575, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 43575, "pid": 5, "tid": 7, "ts": 1716454218168684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070650, "dur": 12, "args": { "External id": 43575, "cbid": 211, "correlation": 43575 } }, { "ph": "s", "id": 43575, "pid": 76337, "tid": -914061504, "ts": 1716454218070650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218070747, "dur": 2, "args": { "External id": 43593, "cbid": 251, "correlation": 43593 } }, { "ph": "f", "id": 43593, "pid": 76337, "tid": -914061504, "ts": 1716454218070747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218168838, "dur": 105, "args": { "External id": 43595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43595, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 43595, "pid": 5, "tid": 7, "ts": 1716454218168838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070754, "dur": 14, "args": { "External id": 43595, "cbid": 211, "correlation": 43595 } }, { "ph": "s", "id": 43595, "pid": 76337, "tid": -914061504, "ts": 1716454218070754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218168944, "dur": 35, "args": { "External id": 43603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43603, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43603, "pid": 5, "tid": 7, "ts": 1716454218168944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070823, "dur": 12, "args": { "External id": 43603, "cbid": 211, "correlation": 43603 } }, { "ph": "s", "id": 43603, "pid": 76337, "tid": -914061504, "ts": 1716454218070823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218168981, "dur": 67, "args": { "External id": 43611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43611, "pid": 5, "tid": 7, "ts": 1716454218168981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070864, "dur": 10, "args": { "External id": 43611, "cbid": 211, "correlation": 43611 } }, { "ph": "s", "id": 43611, "pid": 76337, "tid": -914061504, "ts": 1716454218070864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218169049, "dur": 91, "args": { "External id": 43633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43633, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43633, "pid": 5, "tid": 7, "ts": 1716454218169049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218070918, "dur": 11, "args": { "External id": 43633, "cbid": 211, "correlation": 43633 } }, { "ph": "s", "id": 43633, "pid": 76337, "tid": -914061504, "ts": 1716454218070918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071013, "dur": 1, "args": { "External id": 43649, "cbid": 251, "correlation": 43649 } }, { "ph": "f", "id": 43649, "pid": 76337, "tid": -914061504, "ts": 1716454218071013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218169141, "dur": 566, "args": { "External id": 43651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43651, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43651, "pid": 5, "tid": 7, "ts": 1716454218169141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071019, "dur": 14, "args": { "External id": 43651, "cbid": 211, "correlation": 43651 } }, { "ph": "s", "id": 43651, "pid": 76337, "tid": -914061504, "ts": 1716454218071019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218169708, "dur": 239, "args": { "External id": 43659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43659, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43659, "pid": 5, "tid": 7, "ts": 1716454218169708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071086, "dur": 12, "args": { "External id": 43659, "cbid": 211, "correlation": 43659 } }, { "ph": "s", "id": 43659, "pid": 76337, "tid": -914061504, "ts": 1716454218071086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218169949, "dur": 249, "args": { "External id": 43667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43667, "pid": 5, "tid": 7, "ts": 1716454218169949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071116, "dur": 8, "args": { "External id": 43667, "cbid": 211, "correlation": 43667 } }, { "ph": "s", "id": 43667, "pid": 76337, "tid": -914061504, "ts": 1716454218071116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071199, "dur": 1, "args": { "External id": 43683, "cbid": 251, "correlation": 43683 } }, { "ph": "f", "id": 43683, "pid": 76337, "tid": -914061504, "ts": 1716454218071199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071204, "dur": 0, "args": { "External id": 43685, "cbid": 251, "correlation": 43685 } }, { "ph": "f", "id": 43685, "pid": 76337, "tid": -914061504, "ts": 1716454218071204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218170199, "dur": 355, "args": { "External id": 43686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43686, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 43686, "pid": 5, "tid": 7, "ts": 1716454218170199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071207, "dur": 13, "args": { "External id": 43686, "cbid": 211, "correlation": 43686 } }, { "ph": "s", "id": 43686, "pid": 76337, "tid": -914061504, "ts": 1716454218071207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218170555, "dur": 50, "args": { "External id": 43694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43694, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43694, "pid": 5, "tid": 7, "ts": 1716454218170555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071249, "dur": 10, "args": { "External id": 43694, "cbid": 211, "correlation": 43694 } }, { "ph": "s", "id": 43694, "pid": 76337, "tid": -914061504, "ts": 1716454218071249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218170607, "dur": 157, "args": { "External id": 43705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43705, "pid": 5, "tid": 7, "ts": 1716454218170607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071319, "dur": 13, "args": { "External id": 43705, "cbid": 211, "correlation": 43705 } }, { "ph": "s", "id": 43705, "pid": 76337, "tid": -914061504, "ts": 1716454218071319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218071385, "dur": 0, "args": { "External id": 43717, "cbid": 317, "correlation": 43717 } }, { "ph": "f", "id": 43717, "pid": 76337, "tid": -914061504, "ts": 1716454218071385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218071386, "dur": 0, "args": { "External id": 43718, "cbid": 203, "correlation": 43718 } }, { "ph": "f", "id": 43718, "pid": 76337, "tid": -914061504, "ts": 1716454218071386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218071387, "dur": 0, "args": { "External id": 43719, "cbid": 205, "correlation": 43719 } }, { "ph": "f", "id": 43719, "pid": 76337, "tid": -914061504, "ts": 1716454218071387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071446, "dur": 1, "args": { "External id": 43723, "cbid": 251, "correlation": 43723 } }, { "ph": "f", "id": 43723, "pid": 76337, "tid": -914061504, "ts": 1716454218071446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071448, "dur": 0, "args": { "External id": 43724, "cbid": 251, "correlation": 43724 } }, { "ph": "f", "id": 43724, "pid": 76337, "tid": -914061504, "ts": 1716454218071448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071449, "dur": 0, "args": { "External id": 43725, "cbid": 251, "correlation": 43725 } }, { "ph": "f", "id": 43725, "pid": 76337, "tid": -914061504, "ts": 1716454218071449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071450, "dur": 0, "args": { "External id": 43726, "cbid": 251, "correlation": 43726 } }, { "ph": "f", "id": 43726, "pid": 76337, "tid": -914061504, "ts": 1716454218071450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071451, "dur": 0, "args": { "External id": 43727, "cbid": 251, "correlation": 43727 } }, { "ph": "f", "id": 43727, "pid": 76337, "tid": -914061504, "ts": 1716454218071451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071452, "dur": 0, "args": { "External id": 43728, "cbid": 251, "correlation": 43728 } }, { "ph": "f", "id": 43728, "pid": 76337, "tid": -914061504, "ts": 1716454218071452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071453, "dur": 0, "args": { "External id": 43729, "cbid": 251, "correlation": 43729 } }, { "ph": "f", "id": 43729, "pid": 76337, "tid": -914061504, "ts": 1716454218071453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071454, "dur": 0, "args": { "External id": 43730, "cbid": 251, "correlation": 43730 } }, { "ph": "f", "id": 43730, "pid": 76337, "tid": -914061504, "ts": 1716454218071454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071455, "dur": 0, "args": { "External id": 43731, "cbid": 251, "correlation": 43731 } }, { "ph": "f", "id": 43731, "pid": 76337, "tid": -914061504, "ts": 1716454218071455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218170765, "dur": 113, "args": { "External id": 43732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43732, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 43732, "pid": 5, "tid": 7, "ts": 1716454218170765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071457, "dur": 13, "args": { "External id": 43732, "cbid": 211, "correlation": 43732 } }, { "ph": "s", "id": 43732, "pid": 76337, "tid": -914061504, "ts": 1716454218071457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218170879, "dur": 59, "args": { "External id": 43738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43738, "pid": 5, "tid": 7, "ts": 1716454218170879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071495, "dur": 9, "args": { "External id": 43738, "cbid": 211, "correlation": 43738 } }, { "ph": "s", "id": 43738, "pid": 76337, "tid": -914061504, "ts": 1716454218071495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218170940, "dur": 50, "args": { "External id": 43746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43746, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43746, "pid": 5, "tid": 7, "ts": 1716454218170940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071527, "dur": 8, "args": { "External id": 43746, "cbid": 211, "correlation": 43746 } }, { "ph": "s", "id": 43746, "pid": 76337, "tid": -914061504, "ts": 1716454218071527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218170991, "dur": 98, "args": { "External id": 43755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43755, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43755, "pid": 5, "tid": 7, "ts": 1716454218170991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071566, "dur": 10, "args": { "External id": 43755, "cbid": 211, "correlation": 43755 } }, { "ph": "s", "id": 43755, "pid": 76337, "tid": -914061504, "ts": 1716454218071566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218171090, "dur": 92, "args": { "External id": 43775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43775, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43775, "pid": 5, "tid": 7, "ts": 1716454218171090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071638, "dur": 13, "args": { "External id": 43775, "cbid": 211, "correlation": 43775 } }, { "ph": "s", "id": 43775, "pid": 76337, "tid": -914061504, "ts": 1716454218071638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218171183, "dur": 4, "args": { "External id": 43787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43787, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 43787, "pid": 5, "tid": 7, "ts": 1716454218171183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071664, "dur": 7, "args": { "External id": 43787, "cbid": 211, "correlation": 43787 } }, { "ph": "s", "id": 43787, "pid": 76337, "tid": -914061504, "ts": 1716454218071664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218171189, "dur": 107, "args": { "External id": 43790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43790, "pid": 5, "tid": 7, "ts": 1716454218171189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071683, "dur": 7, "args": { "External id": 43790, "cbid": 211, "correlation": 43790 } }, { "ph": "s", "id": 43790, "pid": 76337, "tid": -914061504, "ts": 1716454218071683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218171297, "dur": 69, "args": { "External id": 43799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43799, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43799, "pid": 5, "tid": 7, "ts": 1716454218171297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071723, "dur": 9, "args": { "External id": 43799, "cbid": 211, "correlation": 43799 } }, { "ph": "s", "id": 43799, "pid": 76337, "tid": -914061504, "ts": 1716454218071723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218071774, "dur": 0, "args": { "External id": 43809, "cbid": 317, "correlation": 43809 } }, { "ph": "f", "id": 43809, "pid": 76337, "tid": -914061504, "ts": 1716454218071774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218071775, "dur": 0, "args": { "External id": 43810, "cbid": 203, "correlation": 43810 } }, { "ph": "f", "id": 43810, "pid": 76337, "tid": -914061504, "ts": 1716454218071775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218071776, "dur": 0, "args": { "External id": 43811, "cbid": 205, "correlation": 43811 } }, { "ph": "f", "id": 43811, "pid": 76337, "tid": -914061504, "ts": 1716454218071776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218171368, "dur": 76, "args": { "External id": 43815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43815, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43815, "pid": 5, "tid": 7, "ts": 1716454218171368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071794, "dur": 12, "args": { "External id": 43815, "cbid": 211, "correlation": 43815 } }, { "ph": "s", "id": 43815, "pid": 76337, "tid": -914061504, "ts": 1716454218071794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218171445, "dur": 24, "args": { "External id": 43817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43817, "pid": 5, "tid": 7, "ts": 1716454218171445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071808, "dur": 6, "args": { "External id": 43817, "cbid": 211, "correlation": 43817 } }, { "ph": "s", "id": 43817, "pid": 76337, "tid": -914061504, "ts": 1716454218071808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218171470, "dur": 3, "args": { "External id": 43819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43819, "pid": 5, "tid": 7, "ts": 1716454218171470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071819, "dur": 6, "args": { "External id": 43819, "cbid": 211, "correlation": 43819 } }, { "ph": "s", "id": 43819, "pid": 76337, "tid": -914061504, "ts": 1716454218071819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218071828, "dur": 0, "args": { "External id": 43820, "cbid": 51, "correlation": 43820 } }, { "ph": "s", "id": 43820, "pid": 76337, "tid": -914061504, "ts": 1716454218071828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218171475, "dur": 1352, "args": { "External id": 43821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43821, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43821, "pid": 5, "tid": 7, "ts": 1716454218171475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071828, "dur": 6, "args": { "External id": 43821, "cbid": 211, "correlation": 43821 } }, { "ph": "s", "id": 43821, "pid": 76337, "tid": -914061504, "ts": 1716454218071828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218172829, "dur": 59, "args": { "External id": 43826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43826, "pid": 5, "tid": 7, "ts": 1716454218172829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071856, "dur": 9, "args": { "External id": 43826, "cbid": 211, "correlation": 43826 } }, { "ph": "s", "id": 43826, "pid": 76337, "tid": -914061504, "ts": 1716454218071856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218172889, "dur": 3, "args": { "External id": 43834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43834, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43834, "pid": 5, "tid": 7, "ts": 1716454218172889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071900, "dur": 10, "args": { "External id": 43834, "cbid": 211, "correlation": 43834 } }, { "ph": "s", "id": 43834, "pid": 76337, "tid": -914061504, "ts": 1716454218071900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071971, "dur": 1, "args": { "External id": 43850, "cbid": 251, "correlation": 43850 } }, { "ph": "f", "id": 43850, "pid": 76337, "tid": -914061504, "ts": 1716454218071971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218071984, "dur": 0, "args": { "External id": 43852, "cbid": 251, "correlation": 43852 } }, { "ph": "f", "id": 43852, "pid": 76337, "tid": -914061504, "ts": 1716454218071984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218172894, "dur": 11, "args": { "External id": 43853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43853, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 43853, "pid": 5, "tid": 7, "ts": 1716454218172894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218071986, "dur": 12, "args": { "External id": 43853, "cbid": 211, "correlation": 43853 } }, { "ph": "s", "id": 43853, "pid": 76337, "tid": -914061504, "ts": 1716454218071986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218172906, "dur": 5, "args": { "External id": 43855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43855, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 43855, "pid": 5, "tid": 7, "ts": 1716454218172906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072000, "dur": 5, "args": { "External id": 43855, "cbid": 211, "correlation": 43855 } }, { "ph": "s", "id": 43855, "pid": 76337, "tid": -914061504, "ts": 1716454218072000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218172913, "dur": 54, "args": { "External id": 43865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43865, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43865, "pid": 5, "tid": 7, "ts": 1716454218172913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072059, "dur": 13, "args": { "External id": 43865, "cbid": 211, "correlation": 43865 } }, { "ph": "s", "id": 43865, "pid": 76337, "tid": -914061504, "ts": 1716454218072059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218172968, "dur": 51, "args": { "External id": 43885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43885, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43885, "pid": 5, "tid": 7, "ts": 1716454218172968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072127, "dur": 11, "args": { "External id": 43885, "cbid": 211, "correlation": 43885 } }, { "ph": "s", "id": 43885, "pid": 76337, "tid": -914061504, "ts": 1716454218072127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218173021, "dur": 4, "args": { "External id": 43897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43897, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43897, "pid": 5, "tid": 7, "ts": 1716454218173021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072148, "dur": 6, "args": { "External id": 43897, "cbid": 211, "correlation": 43897 } }, { "ph": "s", "id": 43897, "pid": 76337, "tid": -914061504, "ts": 1716454218072148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218173026, "dur": 53, "args": { "External id": 43900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43900, "pid": 5, "tid": 7, "ts": 1716454218173026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072166, "dur": 7, "args": { "External id": 43900, "cbid": 211, "correlation": 43900 } }, { "ph": "s", "id": 43900, "pid": 76337, "tid": -914061504, "ts": 1716454218072166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218173080, "dur": 36, "args": { "External id": 43909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43909, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43909, "pid": 5, "tid": 7, "ts": 1716454218173080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072208, "dur": 10, "args": { "External id": 43909, "cbid": 211, "correlation": 43909 } }, { "ph": "s", "id": 43909, "pid": 76337, "tid": -914061504, "ts": 1716454218072208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218072274, "dur": 0, "args": { "External id": 43919, "cbid": 317, "correlation": 43919 } }, { "ph": "f", "id": 43919, "pid": 76337, "tid": -914061504, "ts": 1716454218072274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218072275, "dur": 0, "args": { "External id": 43920, "cbid": 203, "correlation": 43920 } }, { "ph": "f", "id": 43920, "pid": 76337, "tid": -914061504, "ts": 1716454218072275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218072276, "dur": 0, "args": { "External id": 43921, "cbid": 205, "correlation": 43921 } }, { "ph": "f", "id": 43921, "pid": 76337, "tid": -914061504, "ts": 1716454218072276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218173118, "dur": 41, "args": { "External id": 43925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43925, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43925, "pid": 5, "tid": 7, "ts": 1716454218173118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072290, "dur": 12, "args": { "External id": 43925, "cbid": 211, "correlation": 43925 } }, { "ph": "s", "id": 43925, "pid": 76337, "tid": -914061504, "ts": 1716454218072290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218173160, "dur": 14, "args": { "External id": 43927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43927, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43927, "pid": 5, "tid": 7, "ts": 1716454218173160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072305, "dur": 5, "args": { "External id": 43927, "cbid": 211, "correlation": 43927 } }, { "ph": "s", "id": 43927, "pid": 76337, "tid": -914061504, "ts": 1716454218072305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218173176, "dur": 3, "args": { "External id": 43929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 43929, "pid": 5, "tid": 7, "ts": 1716454218173176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072314, "dur": 5, "args": { "External id": 43929, "cbid": 211, "correlation": 43929 } }, { "ph": "s", "id": 43929, "pid": 76337, "tid": -914061504, "ts": 1716454218072314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218072322, "dur": 0, "args": { "External id": 43930, "cbid": 51, "correlation": 43930 } }, { "ph": "s", "id": 43930, "pid": 76337, "tid": -914061504, "ts": 1716454218072322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218173180, "dur": 689, "args": { "External id": 43931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43931, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43931, "pid": 5, "tid": 7, "ts": 1716454218173180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072323, "dur": 6, "args": { "External id": 43931, "cbid": 211, "correlation": 43931 } }, { "ph": "s", "id": 43931, "pid": 76337, "tid": -914061504, "ts": 1716454218072323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218173870, "dur": 59, "args": { "External id": 43936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43936, "pid": 5, "tid": 7, "ts": 1716454218173870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072351, "dur": 8, "args": { "External id": 43936, "cbid": 211, "correlation": 43936 } }, { "ph": "s", "id": 43936, "pid": 76337, "tid": -914061504, "ts": 1716454218072351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218072408, "dur": 0, "args": { "External id": 43946, "cbid": 317, "correlation": 43946 } }, { "ph": "f", "id": 43946, "pid": 76337, "tid": -914061504, "ts": 1716454218072408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218072409, "dur": 0, "args": { "External id": 43947, "cbid": 203, "correlation": 43947 } }, { "ph": "f", "id": 43947, "pid": 76337, "tid": -914061504, "ts": 1716454218072409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218072410, "dur": 0, "args": { "External id": 43948, "cbid": 205, "correlation": 43948 } }, { "ph": "f", "id": 43948, "pid": 76337, "tid": -914061504, "ts": 1716454218072410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218173931, "dur": 75, "args": { "External id": 43952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43952, "pid": 5, "tid": 7, "ts": 1716454218173931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072423, "dur": 13, "args": { "External id": 43952, "cbid": 211, "correlation": 43952 } }, { "ph": "s", "id": 43952, "pid": 76337, "tid": -914061504, "ts": 1716454218072423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218174007, "dur": 207, "args": { "External id": 43954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43954, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 43954, "pid": 5, "tid": 7, "ts": 1716454218174007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072443, "dur": 8, "args": { "External id": 43954, "cbid": 211, "correlation": 43954 } }, { "ph": "s", "id": 43954, "pid": 76337, "tid": -914061504, "ts": 1716454218072443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218174216, "dur": 39, "args": { "External id": 43956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43956, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43956, "pid": 5, "tid": 7, "ts": 1716454218174216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072456, "dur": 6, "args": { "External id": 43956, "cbid": 211, "correlation": 43956 } }, { "ph": "s", "id": 43956, "pid": 76337, "tid": -914061504, "ts": 1716454218072456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218174256, "dur": 58, "args": { "External id": 43962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43962, "pid": 5, "tid": 7, "ts": 1716454218174256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072483, "dur": 8, "args": { "External id": 43962, "cbid": 211, "correlation": 43962 } }, { "ph": "s", "id": 43962, "pid": 76337, "tid": -914061504, "ts": 1716454218072483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218174316, "dur": 50, "args": { "External id": 43970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43970, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43970, "pid": 5, "tid": 7, "ts": 1716454218174316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072512, "dur": 8, "args": { "External id": 43970, "cbid": 211, "correlation": 43970 } }, { "ph": "s", "id": 43970, "pid": 76337, "tid": -914061504, "ts": 1716454218072512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218174367, "dur": 35, "args": { "External id": 43978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43978, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 43978, "pid": 5, "tid": 7, "ts": 1716454218174367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072544, "dur": 8, "args": { "External id": 43978, "cbid": 211, "correlation": 43978 } }, { "ph": "s", "id": 43978, "pid": 76337, "tid": -914061504, "ts": 1716454218072544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218174403, "dur": 51, "args": { "External id": 43998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 43998, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 43998, "pid": 5, "tid": 7, "ts": 1716454218174403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072627, "dur": 13, "args": { "External id": 43998, "cbid": 211, "correlation": 43998 } }, { "ph": "s", "id": 43998, "pid": 76337, "tid": -914061504, "ts": 1716454218072627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218174456, "dur": 4, "args": { "External id": 44010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44010, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44010, "pid": 5, "tid": 7, "ts": 1716454218174456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072650, "dur": 6, "args": { "External id": 44010, "cbid": 211, "correlation": 44010 } }, { "ph": "s", "id": 44010, "pid": 76337, "tid": -914061504, "ts": 1716454218072650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218174461, "dur": 54, "args": { "External id": 44013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44013, "pid": 5, "tid": 7, "ts": 1716454218174461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072667, "dur": 7, "args": { "External id": 44013, "cbid": 211, "correlation": 44013 } }, { "ph": "s", "id": 44013, "pid": 76337, "tid": -914061504, "ts": 1716454218072667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218072725, "dur": 0, "args": { "External id": 44024, "cbid": 317, "correlation": 44024 } }, { "ph": "f", "id": 44024, "pid": 76337, "tid": -914061504, "ts": 1716454218072725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218072726, "dur": 0, "args": { "External id": 44025, "cbid": 203, "correlation": 44025 } }, { "ph": "f", "id": 44025, "pid": 76337, "tid": -914061504, "ts": 1716454218072726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218072727, "dur": 0, "args": { "External id": 44026, "cbid": 205, "correlation": 44026 } }, { "ph": "f", "id": 44026, "pid": 76337, "tid": -914061504, "ts": 1716454218072727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072749, "dur": 1, "args": { "External id": 44030, "cbid": 251, "correlation": 44030 } }, { "ph": "f", "id": 44030, "pid": 76337, "tid": -914061504, "ts": 1716454218072749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072751, "dur": 0, "args": { "External id": 44031, "cbid": 251, "correlation": 44031 } }, { "ph": "f", "id": 44031, "pid": 76337, "tid": -914061504, "ts": 1716454218072751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072752, "dur": 0, "args": { "External id": 44032, "cbid": 251, "correlation": 44032 } }, { "ph": "f", "id": 44032, "pid": 76337, "tid": -914061504, "ts": 1716454218072752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072753, "dur": 0, "args": { "External id": 44033, "cbid": 251, "correlation": 44033 } }, { "ph": "f", "id": 44033, "pid": 76337, "tid": -914061504, "ts": 1716454218072753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072753, "dur": 0, "args": { "External id": 44034, "cbid": 251, "correlation": 44034 } }, { "ph": "f", "id": 44034, "pid": 76337, "tid": -914061504, "ts": 1716454218072753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072754, "dur": 0, "args": { "External id": 44035, "cbid": 251, "correlation": 44035 } }, { "ph": "f", "id": 44035, "pid": 76337, "tid": -914061504, "ts": 1716454218072754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072755, "dur": 0, "args": { "External id": 44036, "cbid": 251, "correlation": 44036 } }, { "ph": "f", "id": 44036, "pid": 76337, "tid": -914061504, "ts": 1716454218072755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072756, "dur": 0, "args": { "External id": 44037, "cbid": 251, "correlation": 44037 } }, { "ph": "f", "id": 44037, "pid": 76337, "tid": -914061504, "ts": 1716454218072756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218072757, "dur": 0, "args": { "External id": 44038, "cbid": 251, "correlation": 44038 } }, { "ph": "f", "id": 44038, "pid": 76337, "tid": -914061504, "ts": 1716454218072757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218174517, "dur": 111, "args": { "External id": 44039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44039, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44039, "pid": 5, "tid": 7, "ts": 1716454218174517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072759, "dur": 12, "args": { "External id": 44039, "cbid": 211, "correlation": 44039 } }, { "ph": "s", "id": 44039, "pid": 76337, "tid": -914061504, "ts": 1716454218072759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218174630, "dur": 59, "args": { "External id": 44045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44045, "pid": 5, "tid": 7, "ts": 1716454218174630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072795, "dur": 8, "args": { "External id": 44045, "cbid": 211, "correlation": 44045 } }, { "ph": "s", "id": 44045, "pid": 76337, "tid": -914061504, "ts": 1716454218072795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218174690, "dur": 572, "args": { "External id": 44054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44054, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44054, "pid": 5, "tid": 7, "ts": 1716454218174690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072878, "dur": 15, "args": { "External id": 44054, "cbid": 211, "correlation": 44054 } }, { "ph": "s", "id": 44054, "pid": 76337, "tid": -914061504, "ts": 1716454218072878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218175263, "dur": 179, "args": { "External id": 44076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44076, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44076, "pid": 5, "tid": 7, "ts": 1716454218175263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218072936, "dur": 11, "args": { "External id": 44076, "cbid": 211, "correlation": 44076 } }, { "ph": "s", "id": 44076, "pid": 76337, "tid": -914061504, "ts": 1716454218072936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073033, "dur": 1, "args": { "External id": 44087, "cbid": 251, "correlation": 44087 } }, { "ph": "f", "id": 44087, "pid": 76337, "tid": -914061504, "ts": 1716454218073033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218175443, "dur": 194, "args": { "External id": 44088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44088, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44088, "pid": 5, "tid": 7, "ts": 1716454218175443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073038, "dur": 14, "args": { "External id": 44088, "cbid": 211, "correlation": 44088 } }, { "ph": "s", "id": 44088, "pid": 76337, "tid": -914061504, "ts": 1716454218073038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073107, "dur": 1, "args": { "External id": 44099, "cbid": 251, "correlation": 44099 } }, { "ph": "f", "id": 44099, "pid": 76337, "tid": -914061504, "ts": 1716454218073107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218175639, "dur": 186, "args": { "External id": 44100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44100, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44100, "pid": 5, "tid": 7, "ts": 1716454218175639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073110, "dur": 11, "args": { "External id": 44100, "cbid": 211, "correlation": 44100 } }, { "ph": "s", "id": 44100, "pid": 76337, "tid": -914061504, "ts": 1716454218073110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073173, "dur": 1, "args": { "External id": 44111, "cbid": 251, "correlation": 44111 } }, { "ph": "f", "id": 44111, "pid": 76337, "tid": -914061504, "ts": 1716454218073173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218175826, "dur": 185, "args": { "External id": 44112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44112, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44112, "pid": 5, "tid": 7, "ts": 1716454218175826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073178, "dur": 11, "args": { "External id": 44112, "cbid": 211, "correlation": 44112 } }, { "ph": "s", "id": 44112, "pid": 76337, "tid": -914061504, "ts": 1716454218073178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218176012, "dur": 18665, "args": { "External id": 44133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44133, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44133, "pid": 5, "tid": 7, "ts": 1716454218176012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073261, "dur": 13, "args": { "External id": 44133, "cbid": 211, "correlation": 44133 } }, { "ph": "s", "id": 44133, "pid": 76337, "tid": -914061504, "ts": 1716454218073261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073359, "dur": 1, "args": { "External id": 44151, "cbid": 251, "correlation": 44151 } }, { "ph": "f", "id": 44151, "pid": 76337, "tid": -914061504, "ts": 1716454218073359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218194678, "dur": 203, "args": { "External id": 44153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44153, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44153, "pid": 5, "tid": 7, "ts": 1716454218194678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073365, "dur": 13, "args": { "External id": 44153, "cbid": 211, "correlation": 44153 } }, { "ph": "s", "id": 44153, "pid": 76337, "tid": -914061504, "ts": 1716454218073365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218194882, "dur": 67, "args": { "External id": 44161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44161, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44161, "pid": 5, "tid": 7, "ts": 1716454218194882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073435, "dur": 12, "args": { "External id": 44161, "cbid": 211, "correlation": 44161 } }, { "ph": "s", "id": 44161, "pid": 76337, "tid": -914061504, "ts": 1716454218073435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218194950, "dur": 96, "args": { "External id": 44169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44169, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44169, "pid": 5, "tid": 7, "ts": 1716454218194950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073473, "dur": 8, "args": { "External id": 44169, "cbid": 211, "correlation": 44169 } }, { "ph": "s", "id": 44169, "pid": 76337, "tid": -914061504, "ts": 1716454218073473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218195048, "dur": 55, "args": { "External id": 44180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44180, "pid": 5, "tid": 7, "ts": 1716454218195048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073544, "dur": 12, "args": { "External id": 44180, "cbid": 211, "correlation": 44180 } }, { "ph": "s", "id": 44180, "pid": 76337, "tid": -914061504, "ts": 1716454218073544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218195104, "dur": 93, "args": { "External id": 44202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44202, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44202, "pid": 5, "tid": 7, "ts": 1716454218195104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073575, "dur": 8, "args": { "External id": 44202, "cbid": 211, "correlation": 44202 } }, { "ph": "s", "id": 44202, "pid": 76337, "tid": -914061504, "ts": 1716454218073575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073662, "dur": 1, "args": { "External id": 44213, "cbid": 251, "correlation": 44213 } }, { "ph": "f", "id": 44213, "pid": 76337, "tid": -914061504, "ts": 1716454218073662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218195199, "dur": 106, "args": { "External id": 44214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44214, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44214, "pid": 5, "tid": 7, "ts": 1716454218195199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073668, "dur": 14, "args": { "External id": 44214, "cbid": 211, "correlation": 44214 } }, { "ph": "s", "id": 44214, "pid": 76337, "tid": -914061504, "ts": 1716454218073668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073739, "dur": 1, "args": { "External id": 44225, "cbid": 251, "correlation": 44225 } }, { "ph": "f", "id": 44225, "pid": 76337, "tid": -914061504, "ts": 1716454218073739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073743, "dur": 0, "args": { "External id": 44226, "cbid": 251, "correlation": 44226 } }, { "ph": "f", "id": 44226, "pid": 76337, "tid": -914061504, "ts": 1716454218073743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218195306, "dur": 11, "args": { "External id": 44227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44227, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 44227, "pid": 5, "tid": 7, "ts": 1716454218195306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073744, "dur": 12, "args": { "External id": 44227, "cbid": 211, "correlation": 44227 } }, { "ph": "s", "id": 44227, "pid": 76337, "tid": -914061504, "ts": 1716454218073744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218195318, "dur": 5, "args": { "External id": 44229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44229, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44229, "pid": 5, "tid": 7, "ts": 1716454218195318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073759, "dur": 6, "args": { "External id": 44229, "cbid": 211, "correlation": 44229 } }, { "ph": "s", "id": 44229, "pid": 76337, "tid": -914061504, "ts": 1716454218073759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073819, "dur": 1, "args": { "External id": 44240, "cbid": 251, "correlation": 44240 } }, { "ph": "f", "id": 44240, "pid": 76337, "tid": -914061504, "ts": 1716454218073819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218073822, "dur": 0, "args": { "External id": 44241, "cbid": 251, "correlation": 44241 } }, { "ph": "f", "id": 44241, "pid": 76337, "tid": -914061504, "ts": 1716454218073822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218195324, "dur": 6, "args": { "External id": 44242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44242, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 44242, "pid": 5, "tid": 7, "ts": 1716454218195324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073824, "dur": 12, "args": { "External id": 44242, "cbid": 211, "correlation": 44242 } }, { "ph": "s", "id": 44242, "pid": 76337, "tid": -914061504, "ts": 1716454218073824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218195332, "dur": 4, "args": { "External id": 44244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44244, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44244, "pid": 5, "tid": 7, "ts": 1716454218195332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073838, "dur": 6, "args": { "External id": 44244, "cbid": 211, "correlation": 44244 } }, { "ph": "s", "id": 44244, "pid": 76337, "tid": -914061504, "ts": 1716454218073838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218195337, "dur": 157, "args": { "External id": 44265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44265, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44265, "pid": 5, "tid": 7, "ts": 1716454218195337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218073912, "dur": 12, "args": { "External id": 44265, "cbid": 211, "correlation": 44265 } }, { "ph": "s", "id": 44265, "pid": 76337, "tid": -914061504, "ts": 1716454218073912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074019, "dur": 1, "args": { "External id": 44283, "cbid": 251, "correlation": 44283 } }, { "ph": "f", "id": 44283, "pid": 76337, "tid": -914061504, "ts": 1716454218074019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218195496, "dur": 107, "args": { "External id": 44285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44285, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44285, "pid": 5, "tid": 7, "ts": 1716454218195496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074025, "dur": 14, "args": { "External id": 44285, "cbid": 211, "correlation": 44285 } }, { "ph": "s", "id": 44285, "pid": 76337, "tid": -914061504, "ts": 1716454218074025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218195604, "dur": 34, "args": { "External id": 44293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44293, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44293, "pid": 5, "tid": 7, "ts": 1716454218195604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074096, "dur": 12, "args": { "External id": 44293, "cbid": 211, "correlation": 44293 } }, { "ph": "s", "id": 44293, "pid": 76337, "tid": -914061504, "ts": 1716454218074096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218195640, "dur": 66, "args": { "External id": 44301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44301, "pid": 5, "tid": 7, "ts": 1716454218195640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074137, "dur": 9, "args": { "External id": 44301, "cbid": 211, "correlation": 44301 } }, { "ph": "s", "id": 44301, "pid": 76337, "tid": -914061504, "ts": 1716454218074137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218195708, "dur": 93, "args": { "External id": 44323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44323, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44323, "pid": 5, "tid": 7, "ts": 1716454218195708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074188, "dur": 10, "args": { "External id": 44323, "cbid": 211, "correlation": 44323 } }, { "ph": "s", "id": 44323, "pid": 76337, "tid": -914061504, "ts": 1716454218074188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074275, "dur": 1, "args": { "External id": 44339, "cbid": 251, "correlation": 44339 } }, { "ph": "f", "id": 44339, "pid": 76337, "tid": -914061504, "ts": 1716454218074275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218195802, "dur": 579, "args": { "External id": 44341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44341, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44341, "pid": 5, "tid": 7, "ts": 1716454218195802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074281, "dur": 12, "args": { "External id": 44341, "cbid": 211, "correlation": 44341 } }, { "ph": "s", "id": 44341, "pid": 76337, "tid": -914061504, "ts": 1716454218074281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218196382, "dur": 244, "args": { "External id": 44349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44349, "pid": 5, "tid": 7, "ts": 1716454218196382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074345, "dur": 13, "args": { "External id": 44349, "cbid": 211, "correlation": 44349 } }, { "ph": "s", "id": 44349, "pid": 76337, "tid": -914061504, "ts": 1716454218074345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218196628, "dur": 251, "args": { "External id": 44357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44357, "pid": 5, "tid": 7, "ts": 1716454218196628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074380, "dur": 8, "args": { "External id": 44357, "cbid": 211, "correlation": 44357 } }, { "ph": "s", "id": 44357, "pid": 76337, "tid": -914061504, "ts": 1716454218074380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074463, "dur": 1, "args": { "External id": 44373, "cbid": 251, "correlation": 44373 } }, { "ph": "f", "id": 44373, "pid": 76337, "tid": -914061504, "ts": 1716454218074463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074468, "dur": 0, "args": { "External id": 44375, "cbid": 251, "correlation": 44375 } }, { "ph": "f", "id": 44375, "pid": 76337, "tid": -914061504, "ts": 1716454218074468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218196880, "dur": 362, "args": { "External id": 44376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44376, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 44376, "pid": 5, "tid": 7, "ts": 1716454218196880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074471, "dur": 13, "args": { "External id": 44376, "cbid": 211, "correlation": 44376 } }, { "ph": "s", "id": 44376, "pid": 76337, "tid": -914061504, "ts": 1716454218074471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218197243, "dur": 50, "args": { "External id": 44384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44384, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44384, "pid": 5, "tid": 7, "ts": 1716454218197243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074513, "dur": 11, "args": { "External id": 44384, "cbid": 211, "correlation": 44384 } }, { "ph": "s", "id": 44384, "pid": 76337, "tid": -914061504, "ts": 1716454218074513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218197294, "dur": 160, "args": { "External id": 44395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44395, "pid": 5, "tid": 7, "ts": 1716454218197294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074581, "dur": 12, "args": { "External id": 44395, "cbid": 211, "correlation": 44395 } }, { "ph": "s", "id": 44395, "pid": 76337, "tid": -914061504, "ts": 1716454218074581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218074646, "dur": 0, "args": { "External id": 44407, "cbid": 317, "correlation": 44407 } }, { "ph": "f", "id": 44407, "pid": 76337, "tid": -914061504, "ts": 1716454218074646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218074646, "dur": 0, "args": { "External id": 44408, "cbid": 203, "correlation": 44408 } }, { "ph": "f", "id": 44408, "pid": 76337, "tid": -914061504, "ts": 1716454218074646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218074647, "dur": 0, "args": { "External id": 44409, "cbid": 205, "correlation": 44409 } }, { "ph": "f", "id": 44409, "pid": 76337, "tid": -914061504, "ts": 1716454218074647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074670, "dur": 1, "args": { "External id": 44413, "cbid": 251, "correlation": 44413 } }, { "ph": "f", "id": 44413, "pid": 76337, "tid": -914061504, "ts": 1716454218074670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074672, "dur": 0, "args": { "External id": 44414, "cbid": 251, "correlation": 44414 } }, { "ph": "f", "id": 44414, "pid": 76337, "tid": -914061504, "ts": 1716454218074672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074673, "dur": 0, "args": { "External id": 44415, "cbid": 251, "correlation": 44415 } }, { "ph": "f", "id": 44415, "pid": 76337, "tid": -914061504, "ts": 1716454218074673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074674, "dur": 0, "args": { "External id": 44416, "cbid": 251, "correlation": 44416 } }, { "ph": "f", "id": 44416, "pid": 76337, "tid": -914061504, "ts": 1716454218074674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074675, "dur": 0, "args": { "External id": 44417, "cbid": 251, "correlation": 44417 } }, { "ph": "f", "id": 44417, "pid": 76337, "tid": -914061504, "ts": 1716454218074675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074676, "dur": 0, "args": { "External id": 44418, "cbid": 251, "correlation": 44418 } }, { "ph": "f", "id": 44418, "pid": 76337, "tid": -914061504, "ts": 1716454218074676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074677, "dur": 0, "args": { "External id": 44419, "cbid": 251, "correlation": 44419 } }, { "ph": "f", "id": 44419, "pid": 76337, "tid": -914061504, "ts": 1716454218074677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074677, "dur": 0, "args": { "External id": 44420, "cbid": 251, "correlation": 44420 } }, { "ph": "f", "id": 44420, "pid": 76337, "tid": -914061504, "ts": 1716454218074677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218074679, "dur": 0, "args": { "External id": 44421, "cbid": 251, "correlation": 44421 } }, { "ph": "f", "id": 44421, "pid": 76337, "tid": -914061504, "ts": 1716454218074679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218197455, "dur": 114, "args": { "External id": 44422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44422, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44422, "pid": 5, "tid": 7, "ts": 1716454218197455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074681, "dur": 12, "args": { "External id": 44422, "cbid": 211, "correlation": 44422 } }, { "ph": "s", "id": 44422, "pid": 76337, "tid": -914061504, "ts": 1716454218074681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218197571, "dur": 60, "args": { "External id": 44428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44428, "pid": 5, "tid": 7, "ts": 1716454218197571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074715, "dur": 9, "args": { "External id": 44428, "cbid": 211, "correlation": 44428 } }, { "ph": "s", "id": 44428, "pid": 76337, "tid": -914061504, "ts": 1716454218074715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218197632, "dur": 50, "args": { "External id": 44436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44436, "pid": 5, "tid": 7, "ts": 1716454218197632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074750, "dur": 9, "args": { "External id": 44436, "cbid": 211, "correlation": 44436 } }, { "ph": "s", "id": 44436, "pid": 76337, "tid": -914061504, "ts": 1716454218074750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218197683, "dur": 98, "args": { "External id": 44445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44445, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44445, "pid": 5, "tid": 7, "ts": 1716454218197683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074790, "dur": 10, "args": { "External id": 44445, "cbid": 211, "correlation": 44445 } }, { "ph": "s", "id": 44445, "pid": 76337, "tid": -914061504, "ts": 1716454218074790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218197782, "dur": 92, "args": { "External id": 44465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44465, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 44465, "pid": 5, "tid": 7, "ts": 1716454218197782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074861, "dur": 11, "args": { "External id": 44465, "cbid": 211, "correlation": 44465 } }, { "ph": "s", "id": 44465, "pid": 76337, "tid": -914061504, "ts": 1716454218074861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218197876, "dur": 5, "args": { "External id": 44477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44477, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44477, "pid": 5, "tid": 7, "ts": 1716454218197876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074883, "dur": 7, "args": { "External id": 44477, "cbid": 211, "correlation": 44477 } }, { "ph": "s", "id": 44477, "pid": 76337, "tid": -914061504, "ts": 1716454218074883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218197882, "dur": 108, "args": { "External id": 44480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44480, "pid": 5, "tid": 7, "ts": 1716454218197882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074901, "dur": 7, "args": { "External id": 44480, "cbid": 211, "correlation": 44480 } }, { "ph": "s", "id": 44480, "pid": 76337, "tid": -914061504, "ts": 1716454218074901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218197991, "dur": 69, "args": { "External id": 44489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44489, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44489, "pid": 5, "tid": 7, "ts": 1716454218197991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218074941, "dur": 9, "args": { "External id": 44489, "cbid": 211, "correlation": 44489 } }, { "ph": "s", "id": 44489, "pid": 76337, "tid": -914061504, "ts": 1716454218074941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218075001, "dur": 0, "args": { "External id": 44499, "cbid": 317, "correlation": 44499 } }, { "ph": "f", "id": 44499, "pid": 76337, "tid": -914061504, "ts": 1716454218075001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218075002, "dur": 0, "args": { "External id": 44500, "cbid": 203, "correlation": 44500 } }, { "ph": "f", "id": 44500, "pid": 76337, "tid": -914061504, "ts": 1716454218075002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218075003, "dur": 0, "args": { "External id": 44501, "cbid": 205, "correlation": 44501 } }, { "ph": "f", "id": 44501, "pid": 76337, "tid": -914061504, "ts": 1716454218075003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218198061, "dur": 76, "args": { "External id": 44505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44505, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44505, "pid": 5, "tid": 7, "ts": 1716454218198061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075017, "dur": 15, "args": { "External id": 44505, "cbid": 211, "correlation": 44505 } }, { "ph": "s", "id": 44505, "pid": 76337, "tid": -914061504, "ts": 1716454218075017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218198138, "dur": 25, "args": { "External id": 44507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44507, "pid": 5, "tid": 7, "ts": 1716454218198138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075034, "dur": 6, "args": { "External id": 44507, "cbid": 211, "correlation": 44507 } }, { "ph": "s", "id": 44507, "pid": 76337, "tid": -914061504, "ts": 1716454218075034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218198164, "dur": 4, "args": { "External id": 44509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44509, "pid": 5, "tid": 7, "ts": 1716454218198164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075045, "dur": 5, "args": { "External id": 44509, "cbid": 211, "correlation": 44509 } }, { "ph": "s", "id": 44509, "pid": 76337, "tid": -914061504, "ts": 1716454218075045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218075053, "dur": 0, "args": { "External id": 44510, "cbid": 51, "correlation": 44510 } }, { "ph": "s", "id": 44510, "pid": 76337, "tid": -914061504, "ts": 1716454218075053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218198169, "dur": 1370, "args": { "External id": 44511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44511, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44511, "pid": 5, "tid": 7, "ts": 1716454218198169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075054, "dur": 5, "args": { "External id": 44511, "cbid": 211, "correlation": 44511 } }, { "ph": "s", "id": 44511, "pid": 76337, "tid": -914061504, "ts": 1716454218075054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218199541, "dur": 59, "args": { "External id": 44516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44516, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44516, "pid": 5, "tid": 7, "ts": 1716454218199541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075081, "dur": 8, "args": { "External id": 44516, "cbid": 211, "correlation": 44516 } }, { "ph": "s", "id": 44516, "pid": 76337, "tid": -914061504, "ts": 1716454218075081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218199602, "dur": 3, "args": { "External id": 44524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44524, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44524, "pid": 5, "tid": 7, "ts": 1716454218199602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075125, "dur": 10, "args": { "External id": 44524, "cbid": 211, "correlation": 44524 } }, { "ph": "s", "id": 44524, "pid": 76337, "tid": -914061504, "ts": 1716454218075125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075192, "dur": 2, "args": { "External id": 44540, "cbid": 251, "correlation": 44540 } }, { "ph": "f", "id": 44540, "pid": 76337, "tid": -914061504, "ts": 1716454218075192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075198, "dur": 0, "args": { "External id": 44542, "cbid": 251, "correlation": 44542 } }, { "ph": "f", "id": 44542, "pid": 76337, "tid": -914061504, "ts": 1716454218075198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218199606, "dur": 11, "args": { "External id": 44543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44543, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 44543, "pid": 5, "tid": 7, "ts": 1716454218199606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075200, "dur": 11, "args": { "External id": 44543, "cbid": 211, "correlation": 44543 } }, { "ph": "s", "id": 44543, "pid": 76337, "tid": -914061504, "ts": 1716454218075200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218199619, "dur": 5, "args": { "External id": 44545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44545, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44545, "pid": 5, "tid": 7, "ts": 1716454218199619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075213, "dur": 5, "args": { "External id": 44545, "cbid": 211, "correlation": 44545 } }, { "ph": "s", "id": 44545, "pid": 76337, "tid": -914061504, "ts": 1716454218075213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218199625, "dur": 55, "args": { "External id": 44555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44555, "pid": 5, "tid": 7, "ts": 1716454218199625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075271, "dur": 12, "args": { "External id": 44555, "cbid": 211, "correlation": 44555 } }, { "ph": "s", "id": 44555, "pid": 76337, "tid": -914061504, "ts": 1716454218075271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218199681, "dur": 53, "args": { "External id": 44575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44575, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 44575, "pid": 5, "tid": 7, "ts": 1716454218199681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075340, "dur": 12, "args": { "External id": 44575, "cbid": 211, "correlation": 44575 } }, { "ph": "s", "id": 44575, "pid": 76337, "tid": -914061504, "ts": 1716454218075340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218199736, "dur": 4, "args": { "External id": 44587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44587, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44587, "pid": 5, "tid": 7, "ts": 1716454218199736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075361, "dur": 6, "args": { "External id": 44587, "cbid": 211, "correlation": 44587 } }, { "ph": "s", "id": 44587, "pid": 76337, "tid": -914061504, "ts": 1716454218075361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218199741, "dur": 55, "args": { "External id": 44590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44590, "pid": 5, "tid": 7, "ts": 1716454218199741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075380, "dur": 6, "args": { "External id": 44590, "cbid": 211, "correlation": 44590 } }, { "ph": "s", "id": 44590, "pid": 76337, "tid": -914061504, "ts": 1716454218075380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218199797, "dur": 37, "args": { "External id": 44599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44599, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44599, "pid": 5, "tid": 7, "ts": 1716454218199797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075422, "dur": 9, "args": { "External id": 44599, "cbid": 211, "correlation": 44599 } }, { "ph": "s", "id": 44599, "pid": 76337, "tid": -914061504, "ts": 1716454218075422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218075484, "dur": 0, "args": { "External id": 44609, "cbid": 317, "correlation": 44609 } }, { "ph": "f", "id": 44609, "pid": 76337, "tid": -914061504, "ts": 1716454218075484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218075485, "dur": 0, "args": { "External id": 44610, "cbid": 203, "correlation": 44610 } }, { "ph": "f", "id": 44610, "pid": 76337, "tid": -914061504, "ts": 1716454218075485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218075485, "dur": 0, "args": { "External id": 44611, "cbid": 205, "correlation": 44611 } }, { "ph": "f", "id": 44611, "pid": 76337, "tid": -914061504, "ts": 1716454218075485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218199835, "dur": 40, "args": { "External id": 44615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44615, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44615, "pid": 5, "tid": 7, "ts": 1716454218199835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075501, "dur": 13, "args": { "External id": 44615, "cbid": 211, "correlation": 44615 } }, { "ph": "s", "id": 44615, "pid": 76337, "tid": -914061504, "ts": 1716454218075501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218199877, "dur": 14, "args": { "External id": 44617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44617, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44617, "pid": 5, "tid": 7, "ts": 1716454218199877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075516, "dur": 5, "args": { "External id": 44617, "cbid": 211, "correlation": 44617 } }, { "ph": "s", "id": 44617, "pid": 76337, "tid": -914061504, "ts": 1716454218075516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218199892, "dur": 3, "args": { "External id": 44619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44619, "pid": 5, "tid": 7, "ts": 1716454218199892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075525, "dur": 5, "args": { "External id": 44619, "cbid": 211, "correlation": 44619 } }, { "ph": "s", "id": 44619, "pid": 76337, "tid": -914061504, "ts": 1716454218075525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218075533, "dur": 0, "args": { "External id": 44620, "cbid": 51, "correlation": 44620 } }, { "ph": "s", "id": 44620, "pid": 76337, "tid": -914061504, "ts": 1716454218075533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218199897, "dur": 701, "args": { "External id": 44621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44621, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44621, "pid": 5, "tid": 7, "ts": 1716454218199897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075534, "dur": 5, "args": { "External id": 44621, "cbid": 211, "correlation": 44621 } }, { "ph": "s", "id": 44621, "pid": 76337, "tid": -914061504, "ts": 1716454218075534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218200599, "dur": 60, "args": { "External id": 44626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44626, "pid": 5, "tid": 7, "ts": 1716454218200599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075561, "dur": 8, "args": { "External id": 44626, "cbid": 211, "correlation": 44626 } }, { "ph": "s", "id": 44626, "pid": 76337, "tid": -914061504, "ts": 1716454218075561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218075621, "dur": 0, "args": { "External id": 44636, "cbid": 317, "correlation": 44636 } }, { "ph": "f", "id": 44636, "pid": 76337, "tid": -914061504, "ts": 1716454218075621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218075622, "dur": 0, "args": { "External id": 44637, "cbid": 203, "correlation": 44637 } }, { "ph": "f", "id": 44637, "pid": 76337, "tid": -914061504, "ts": 1716454218075622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218075623, "dur": 0, "args": { "External id": 44638, "cbid": 205, "correlation": 44638 } }, { "ph": "f", "id": 44638, "pid": 76337, "tid": -914061504, "ts": 1716454218075623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218200660, "dur": 75, "args": { "External id": 44642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44642, "pid": 5, "tid": 7, "ts": 1716454218200660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075635, "dur": 12, "args": { "External id": 44642, "cbid": 211, "correlation": 44642 } }, { "ph": "s", "id": 44642, "pid": 76337, "tid": -914061504, "ts": 1716454218075635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218200737, "dur": 209, "args": { "External id": 44644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44644, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44644, "pid": 5, "tid": 7, "ts": 1716454218200737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075653, "dur": 7, "args": { "External id": 44644, "cbid": 211, "correlation": 44644 } }, { "ph": "s", "id": 44644, "pid": 76337, "tid": -914061504, "ts": 1716454218075653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218200947, "dur": 39, "args": { "External id": 44646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44646, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44646, "pid": 5, "tid": 7, "ts": 1716454218200947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075664, "dur": 5, "args": { "External id": 44646, "cbid": 211, "correlation": 44646 } }, { "ph": "s", "id": 44646, "pid": 76337, "tid": -914061504, "ts": 1716454218075664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218200987, "dur": 59, "args": { "External id": 44652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44652, "pid": 5, "tid": 7, "ts": 1716454218200987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075691, "dur": 8, "args": { "External id": 44652, "cbid": 211, "correlation": 44652 } }, { "ph": "s", "id": 44652, "pid": 76337, "tid": -914061504, "ts": 1716454218075691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218201048, "dur": 50, "args": { "External id": 44660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44660, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44660, "pid": 5, "tid": 7, "ts": 1716454218201048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075719, "dur": 8, "args": { "External id": 44660, "cbid": 211, "correlation": 44660 } }, { "ph": "s", "id": 44660, "pid": 76337, "tid": -914061504, "ts": 1716454218075719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218201099, "dur": 35, "args": { "External id": 44668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44668, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44668, "pid": 5, "tid": 7, "ts": 1716454218201099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075747, "dur": 8, "args": { "External id": 44668, "cbid": 211, "correlation": 44668 } }, { "ph": "s", "id": 44668, "pid": 76337, "tid": -914061504, "ts": 1716454218075747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218201135, "dur": 53, "args": { "External id": 44688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44688, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 44688, "pid": 5, "tid": 7, "ts": 1716454218201135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075828, "dur": 12, "args": { "External id": 44688, "cbid": 211, "correlation": 44688 } }, { "ph": "s", "id": 44688, "pid": 76337, "tid": -914061504, "ts": 1716454218075828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218201189, "dur": 4, "args": { "External id": 44700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44700, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 44700, "pid": 5, "tid": 7, "ts": 1716454218201189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075850, "dur": 6, "args": { "External id": 44700, "cbid": 211, "correlation": 44700 } }, { "ph": "s", "id": 44700, "pid": 76337, "tid": -914061504, "ts": 1716454218075850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218201195, "dur": 56, "args": { "External id": 44703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44703, "pid": 5, "tid": 7, "ts": 1716454218201195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075870, "dur": 7, "args": { "External id": 44703, "cbid": 211, "correlation": 44703 } }, { "ph": "s", "id": 44703, "pid": 76337, "tid": -914061504, "ts": 1716454218075870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218075928, "dur": 0, "args": { "External id": 44714, "cbid": 317, "correlation": 44714 } }, { "ph": "f", "id": 44714, "pid": 76337, "tid": -914061504, "ts": 1716454218075928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218075929, "dur": 0, "args": { "External id": 44715, "cbid": 203, "correlation": 44715 } }, { "ph": "f", "id": 44715, "pid": 76337, "tid": -914061504, "ts": 1716454218075929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218075930, "dur": 0, "args": { "External id": 44716, "cbid": 205, "correlation": 44716 } }, { "ph": "f", "id": 44716, "pid": 76337, "tid": -914061504, "ts": 1716454218075930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075953, "dur": 1, "args": { "External id": 44720, "cbid": 251, "correlation": 44720 } }, { "ph": "f", "id": 44720, "pid": 76337, "tid": -914061504, "ts": 1716454218075953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075955, "dur": 0, "args": { "External id": 44721, "cbid": 251, "correlation": 44721 } }, { "ph": "f", "id": 44721, "pid": 76337, "tid": -914061504, "ts": 1716454218075955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075955, "dur": 0, "args": { "External id": 44722, "cbid": 251, "correlation": 44722 } }, { "ph": "f", "id": 44722, "pid": 76337, "tid": -914061504, "ts": 1716454218075955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075956, "dur": 0, "args": { "External id": 44723, "cbid": 251, "correlation": 44723 } }, { "ph": "f", "id": 44723, "pid": 76337, "tid": -914061504, "ts": 1716454218075956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075957, "dur": 0, "args": { "External id": 44724, "cbid": 251, "correlation": 44724 } }, { "ph": "f", "id": 44724, "pid": 76337, "tid": -914061504, "ts": 1716454218075957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075958, "dur": 0, "args": { "External id": 44725, "cbid": 251, "correlation": 44725 } }, { "ph": "f", "id": 44725, "pid": 76337, "tid": -914061504, "ts": 1716454218075958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075958, "dur": 0, "args": { "External id": 44726, "cbid": 251, "correlation": 44726 } }, { "ph": "f", "id": 44726, "pid": 76337, "tid": -914061504, "ts": 1716454218075958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075959, "dur": 0, "args": { "External id": 44727, "cbid": 251, "correlation": 44727 } }, { "ph": "f", "id": 44727, "pid": 76337, "tid": -914061504, "ts": 1716454218075959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218075960, "dur": 0, "args": { "External id": 44728, "cbid": 251, "correlation": 44728 } }, { "ph": "f", "id": 44728, "pid": 76337, "tid": -914061504, "ts": 1716454218075960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218201252, "dur": 115, "args": { "External id": 44729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44729, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44729, "pid": 5, "tid": 7, "ts": 1716454218201252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218075962, "dur": 20, "args": { "External id": 44729, "cbid": 211, "correlation": 44729 } }, { "ph": "s", "id": 44729, "pid": 76337, "tid": -914061504, "ts": 1716454218075962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218201369, "dur": 60, "args": { "External id": 44735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44735, "pid": 5, "tid": 7, "ts": 1716454218201369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076006, "dur": 9, "args": { "External id": 44735, "cbid": 211, "correlation": 44735 } }, { "ph": "s", "id": 44735, "pid": 76337, "tid": -914061504, "ts": 1716454218076006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218201430, "dur": 503, "args": { "External id": 44744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44744, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44744, "pid": 5, "tid": 7, "ts": 1716454218201430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076090, "dur": 14, "args": { "External id": 44744, "cbid": 211, "correlation": 44744 } }, { "ph": "s", "id": 44744, "pid": 76337, "tid": -914061504, "ts": 1716454218076090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218201933, "dur": 182, "args": { "External id": 44766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44766, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44766, "pid": 5, "tid": 7, "ts": 1716454218201933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076148, "dur": 10, "args": { "External id": 44766, "cbid": 211, "correlation": 44766 } }, { "ph": "s", "id": 44766, "pid": 76337, "tid": -914061504, "ts": 1716454218076148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076234, "dur": 1, "args": { "External id": 44777, "cbid": 251, "correlation": 44777 } }, { "ph": "f", "id": 44777, "pid": 76337, "tid": -914061504, "ts": 1716454218076234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218202117, "dur": 199, "args": { "External id": 44778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44778, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44778, "pid": 5, "tid": 7, "ts": 1716454218202117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076239, "dur": 16, "args": { "External id": 44778, "cbid": 211, "correlation": 44778 } }, { "ph": "s", "id": 44778, "pid": 76337, "tid": -914061504, "ts": 1716454218076239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076311, "dur": 1, "args": { "External id": 44789, "cbid": 251, "correlation": 44789 } }, { "ph": "f", "id": 44789, "pid": 76337, "tid": -914061504, "ts": 1716454218076311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218202318, "dur": 192, "args": { "External id": 44790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44790, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44790, "pid": 5, "tid": 7, "ts": 1716454218202318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076315, "dur": 11, "args": { "External id": 44790, "cbid": 211, "correlation": 44790 } }, { "ph": "s", "id": 44790, "pid": 76337, "tid": -914061504, "ts": 1716454218076315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076377, "dur": 1, "args": { "External id": 44801, "cbid": 251, "correlation": 44801 } }, { "ph": "f", "id": 44801, "pid": 76337, "tid": -914061504, "ts": 1716454218076377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218202511, "dur": 189, "args": { "External id": 44802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44802, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44802, "pid": 5, "tid": 7, "ts": 1716454218202511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076381, "dur": 11, "args": { "External id": 44802, "cbid": 211, "correlation": 44802 } }, { "ph": "s", "id": 44802, "pid": 76337, "tid": -914061504, "ts": 1716454218076381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218202702, "dur": 18750, "args": { "External id": 44823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44823, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44823, "pid": 5, "tid": 7, "ts": 1716454218202702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076461, "dur": 12, "args": { "External id": 44823, "cbid": 211, "correlation": 44823 } }, { "ph": "s", "id": 44823, "pid": 76337, "tid": -914061504, "ts": 1716454218076461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076557, "dur": 1, "args": { "External id": 44841, "cbid": 251, "correlation": 44841 } }, { "ph": "f", "id": 44841, "pid": 76337, "tid": -914061504, "ts": 1716454218076557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218221453, "dur": 206, "args": { "External id": 44843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44843, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44843, "pid": 5, "tid": 7, "ts": 1716454218221453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076563, "dur": 13, "args": { "External id": 44843, "cbid": 211, "correlation": 44843 } }, { "ph": "s", "id": 44843, "pid": 76337, "tid": -914061504, "ts": 1716454218076563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218221660, "dur": 66, "args": { "External id": 44851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44851, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44851, "pid": 5, "tid": 7, "ts": 1716454218221660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076633, "dur": 12, "args": { "External id": 44851, "cbid": 211, "correlation": 44851 } }, { "ph": "s", "id": 44851, "pid": 76337, "tid": -914061504, "ts": 1716454218076633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218221727, "dur": 97, "args": { "External id": 44859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44859, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44859, "pid": 5, "tid": 7, "ts": 1716454218221727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076675, "dur": 9, "args": { "External id": 44859, "cbid": 211, "correlation": 44859 } }, { "ph": "s", "id": 44859, "pid": 76337, "tid": -914061504, "ts": 1716454218076675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218221825, "dur": 54, "args": { "External id": 44870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44870, "pid": 5, "tid": 7, "ts": 1716454218221825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076745, "dur": 14, "args": { "External id": 44870, "cbid": 211, "correlation": 44870 } }, { "ph": "s", "id": 44870, "pid": 76337, "tid": -914061504, "ts": 1716454218076745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218221881, "dur": 93, "args": { "External id": 44892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44892, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44892, "pid": 5, "tid": 7, "ts": 1716454218221881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076776, "dur": 8, "args": { "External id": 44892, "cbid": 211, "correlation": 44892 } }, { "ph": "s", "id": 44892, "pid": 76337, "tid": -914061504, "ts": 1716454218076776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076860, "dur": 1, "args": { "External id": 44903, "cbid": 251, "correlation": 44903 } }, { "ph": "f", "id": 44903, "pid": 76337, "tid": -914061504, "ts": 1716454218076860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218221975, "dur": 107, "args": { "External id": 44904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44904, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 44904, "pid": 5, "tid": 7, "ts": 1716454218221975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076866, "dur": 12, "args": { "External id": 44904, "cbid": 211, "correlation": 44904 } }, { "ph": "s", "id": 44904, "pid": 76337, "tid": -914061504, "ts": 1716454218076866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076937, "dur": 1, "args": { "External id": 44915, "cbid": 251, "correlation": 44915 } }, { "ph": "f", "id": 44915, "pid": 76337, "tid": -914061504, "ts": 1716454218076937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218076940, "dur": 0, "args": { "External id": 44916, "cbid": 251, "correlation": 44916 } }, { "ph": "f", "id": 44916, "pid": 76337, "tid": -914061504, "ts": 1716454218076940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218222084, "dur": 10, "args": { "External id": 44917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44917, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 44917, "pid": 5, "tid": 7, "ts": 1716454218222084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076942, "dur": 12, "args": { "External id": 44917, "cbid": 211, "correlation": 44917 } }, { "ph": "s", "id": 44917, "pid": 76337, "tid": -914061504, "ts": 1716454218076942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218222095, "dur": 5, "args": { "External id": 44919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44919, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44919, "pid": 5, "tid": 7, "ts": 1716454218222095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218076956, "dur": 7, "args": { "External id": 44919, "cbid": 211, "correlation": 44919 } }, { "ph": "s", "id": 44919, "pid": 76337, "tid": -914061504, "ts": 1716454218076956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077033, "dur": 1, "args": { "External id": 44930, "cbid": 251, "correlation": 44930 } }, { "ph": "f", "id": 44930, "pid": 76337, "tid": -914061504, "ts": 1716454218077033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077037, "dur": 0, "args": { "External id": 44931, "cbid": 251, "correlation": 44931 } }, { "ph": "f", "id": 44931, "pid": 76337, "tid": -914061504, "ts": 1716454218077037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218222102, "dur": 6, "args": { "External id": 44932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44932, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 44932, "pid": 5, "tid": 7, "ts": 1716454218222102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077038, "dur": 15, "args": { "External id": 44932, "cbid": 211, "correlation": 44932 } }, { "ph": "s", "id": 44932, "pid": 76337, "tid": -914061504, "ts": 1716454218077038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218222110, "dur": 4, "args": { "External id": 44934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 44934, "pid": 5, "tid": 7, "ts": 1716454218222110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077055, "dur": 6, "args": { "External id": 44934, "cbid": 211, "correlation": 44934 } }, { "ph": "s", "id": 44934, "pid": 76337, "tid": -914061504, "ts": 1716454218077055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218222114, "dur": 158, "args": { "External id": 44955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44955, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44955, "pid": 5, "tid": 7, "ts": 1716454218222114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077131, "dur": 12, "args": { "External id": 44955, "cbid": 211, "correlation": 44955 } }, { "ph": "s", "id": 44955, "pid": 76337, "tid": -914061504, "ts": 1716454218077131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077227, "dur": 1, "args": { "External id": 44973, "cbid": 251, "correlation": 44973 } }, { "ph": "f", "id": 44973, "pid": 76337, "tid": -914061504, "ts": 1716454218077227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218222273, "dur": 106, "args": { "External id": 44975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44975, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 44975, "pid": 5, "tid": 7, "ts": 1716454218222273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077233, "dur": 13, "args": { "External id": 44975, "cbid": 211, "correlation": 44975 } }, { "ph": "s", "id": 44975, "pid": 76337, "tid": -914061504, "ts": 1716454218077233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218222381, "dur": 35, "args": { "External id": 44983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44983, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44983, "pid": 5, "tid": 7, "ts": 1716454218222381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077302, "dur": 12, "args": { "External id": 44983, "cbid": 211, "correlation": 44983 } }, { "ph": "s", "id": 44983, "pid": 76337, "tid": -914061504, "ts": 1716454218077302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218222417, "dur": 68, "args": { "External id": 44991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 44991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 44991, "pid": 5, "tid": 7, "ts": 1716454218222417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077343, "dur": 9, "args": { "External id": 44991, "cbid": 211, "correlation": 44991 } }, { "ph": "s", "id": 44991, "pid": 76337, "tid": -914061504, "ts": 1716454218077343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218222487, "dur": 93, "args": { "External id": 45013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45013, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45013, "pid": 5, "tid": 7, "ts": 1716454218222487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077398, "dur": 10, "args": { "External id": 45013, "cbid": 211, "correlation": 45013 } }, { "ph": "s", "id": 45013, "pid": 76337, "tid": -914061504, "ts": 1716454218077398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077485, "dur": 1, "args": { "External id": 45029, "cbid": 251, "correlation": 45029 } }, { "ph": "f", "id": 45029, "pid": 76337, "tid": -914061504, "ts": 1716454218077485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218222581, "dur": 577, "args": { "External id": 45031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45031, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 45031, "pid": 5, "tid": 7, "ts": 1716454218222581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077491, "dur": 13, "args": { "External id": 45031, "cbid": 211, "correlation": 45031 } }, { "ph": "s", "id": 45031, "pid": 76337, "tid": -914061504, "ts": 1716454218077491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218223160, "dur": 244, "args": { "External id": 45039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45039, "pid": 5, "tid": 7, "ts": 1716454218223160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077555, "dur": 13, "args": { "External id": 45039, "cbid": 211, "correlation": 45039 } }, { "ph": "s", "id": 45039, "pid": 76337, "tid": -914061504, "ts": 1716454218077555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218223405, "dur": 254, "args": { "External id": 45047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45047, "pid": 5, "tid": 7, "ts": 1716454218223405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077585, "dur": 9, "args": { "External id": 45047, "cbid": 211, "correlation": 45047 } }, { "ph": "s", "id": 45047, "pid": 76337, "tid": -914061504, "ts": 1716454218077585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077668, "dur": 1, "args": { "External id": 45063, "cbid": 251, "correlation": 45063 } }, { "ph": "f", "id": 45063, "pid": 76337, "tid": -914061504, "ts": 1716454218077668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077674, "dur": 0, "args": { "External id": 45065, "cbid": 251, "correlation": 45065 } }, { "ph": "f", "id": 45065, "pid": 76337, "tid": -914061504, "ts": 1716454218077674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218223660, "dur": 359, "args": { "External id": 45066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45066, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45066, "pid": 5, "tid": 7, "ts": 1716454218223660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077676, "dur": 12, "args": { "External id": 45066, "cbid": 211, "correlation": 45066 } }, { "ph": "s", "id": 45066, "pid": 76337, "tid": -914061504, "ts": 1716454218077676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224020, "dur": 50, "args": { "External id": 45074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45074, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45074, "pid": 5, "tid": 7, "ts": 1716454218224020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077718, "dur": 10, "args": { "External id": 45074, "cbid": 211, "correlation": 45074 } }, { "ph": "s", "id": 45074, "pid": 76337, "tid": -914061504, "ts": 1716454218077718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218224072, "dur": 160, "args": { "External id": 45085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45085, "pid": 5, "tid": 7, "ts": 1716454218224072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077783, "dur": 36, "args": { "External id": 45085, "cbid": 211, "correlation": 45085 } }, { "ph": "s", "id": 45085, "pid": 76337, "tid": -914061504, "ts": 1716454218077783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218077873, "dur": 0, "args": { "External id": 45097, "cbid": 317, "correlation": 45097 } }, { "ph": "f", "id": 45097, "pid": 76337, "tid": -914061504, "ts": 1716454218077873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218077874, "dur": 0, "args": { "External id": 45098, "cbid": 203, "correlation": 45098 } }, { "ph": "f", "id": 45098, "pid": 76337, "tid": -914061504, "ts": 1716454218077874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218077875, "dur": 0, "args": { "External id": 45099, "cbid": 205, "correlation": 45099 } }, { "ph": "f", "id": 45099, "pid": 76337, "tid": -914061504, "ts": 1716454218077875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077897, "dur": 1, "args": { "External id": 45103, "cbid": 251, "correlation": 45103 } }, { "ph": "f", "id": 45103, "pid": 76337, "tid": -914061504, "ts": 1716454218077897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077899, "dur": 0, "args": { "External id": 45104, "cbid": 251, "correlation": 45104 } }, { "ph": "f", "id": 45104, "pid": 76337, "tid": -914061504, "ts": 1716454218077899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077900, "dur": 0, "args": { "External id": 45105, "cbid": 251, "correlation": 45105 } }, { "ph": "f", "id": 45105, "pid": 76337, "tid": -914061504, "ts": 1716454218077900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077901, "dur": 0, "args": { "External id": 45106, "cbid": 251, "correlation": 45106 } }, { "ph": "f", "id": 45106, "pid": 76337, "tid": -914061504, "ts": 1716454218077901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077901, "dur": 0, "args": { "External id": 45107, "cbid": 251, "correlation": 45107 } }, { "ph": "f", "id": 45107, "pid": 76337, "tid": -914061504, "ts": 1716454218077901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077902, "dur": 0, "args": { "External id": 45108, "cbid": 251, "correlation": 45108 } }, { "ph": "f", "id": 45108, "pid": 76337, "tid": -914061504, "ts": 1716454218077902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077903, "dur": 0, "args": { "External id": 45109, "cbid": 251, "correlation": 45109 } }, { "ph": "f", "id": 45109, "pid": 76337, "tid": -914061504, "ts": 1716454218077903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077904, "dur": 0, "args": { "External id": 45110, "cbid": 251, "correlation": 45110 } }, { "ph": "f", "id": 45110, "pid": 76337, "tid": -914061504, "ts": 1716454218077904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218077905, "dur": 0, "args": { "External id": 45111, "cbid": 251, "correlation": 45111 } }, { "ph": "f", "id": 45111, "pid": 76337, "tid": -914061504, "ts": 1716454218077905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218224233, "dur": 116, "args": { "External id": 45112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45112, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 45112, "pid": 5, "tid": 7, "ts": 1716454218224233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077907, "dur": 33, "args": { "External id": 45112, "cbid": 211, "correlation": 45112 } }, { "ph": "s", "id": 45112, "pid": 76337, "tid": -914061504, "ts": 1716454218077907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218224350, "dur": 61, "args": { "External id": 45118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45118, "pid": 5, "tid": 7, "ts": 1716454218224350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218077964, "dur": 279, "args": { "External id": 45118, "cbid": 211, "correlation": 45118 } }, { "ph": "s", "id": 45118, "pid": 76337, "tid": -914061504, "ts": 1716454218077964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224412, "dur": 50, "args": { "External id": 45126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45126, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45126, "pid": 5, "tid": 7, "ts": 1716454218224412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078267, "dur": 9, "args": { "External id": 45126, "cbid": 211, "correlation": 45126 } }, { "ph": "s", "id": 45126, "pid": 76337, "tid": -914061504, "ts": 1716454218078267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218224464, "dur": 53, "args": { "External id": 45146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45146, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 45146, "pid": 5, "tid": 7, "ts": 1716454218224464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078349, "dur": 12, "args": { "External id": 45146, "cbid": 211, "correlation": 45146 } }, { "ph": "s", "id": 45146, "pid": 76337, "tid": -914061504, "ts": 1716454218078349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218224518, "dur": 5, "args": { "External id": 45158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45158, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45158, "pid": 5, "tid": 7, "ts": 1716454218224518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078371, "dur": 6, "args": { "External id": 45158, "cbid": 211, "correlation": 45158 } }, { "ph": "s", "id": 45158, "pid": 76337, "tid": -914061504, "ts": 1716454218078371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218224524, "dur": 56, "args": { "External id": 45161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45161, "pid": 5, "tid": 7, "ts": 1716454218224524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078389, "dur": 111, "args": { "External id": 45161, "cbid": 211, "correlation": 45161 } }, { "ph": "s", "id": 45161, "pid": 76337, "tid": -914061504, "ts": 1716454218078389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224581, "dur": 39, "args": { "External id": 45170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45170, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45170, "pid": 5, "tid": 7, "ts": 1716454218224581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078539, "dur": 11, "args": { "External id": 45170, "cbid": 211, "correlation": 45170 } }, { "ph": "s", "id": 45170, "pid": 76337, "tid": -914061504, "ts": 1716454218078539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218078594, "dur": 0, "args": { "External id": 45180, "cbid": 317, "correlation": 45180 } }, { "ph": "f", "id": 45180, "pid": 76337, "tid": -914061504, "ts": 1716454218078594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218078595, "dur": 0, "args": { "External id": 45181, "cbid": 203, "correlation": 45181 } }, { "ph": "f", "id": 45181, "pid": 76337, "tid": -914061504, "ts": 1716454218078595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218078596, "dur": 0, "args": { "External id": 45182, "cbid": 205, "correlation": 45182 } }, { "ph": "f", "id": 45182, "pid": 76337, "tid": -914061504, "ts": 1716454218078596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218224621, "dur": 42, "args": { "External id": 45186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45186, "pid": 5, "tid": 7, "ts": 1716454218224621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078613, "dur": 12, "args": { "External id": 45186, "cbid": 211, "correlation": 45186 } }, { "ph": "s", "id": 45186, "pid": 76337, "tid": -914061504, "ts": 1716454218078613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218224664, "dur": 3, "args": { "External id": 45188, "device": 5, "context": 1, "stream": 7, "correlation": 45188, "bytes": 46080, "memory bandwidth (GB/s)": 11.707317073170731 } }, { "ph": "f", "id": 45188, "pid": 5, "tid": 7, "ts": 1716454218224664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218078628, "dur": 39, "args": { "External id": 45188, "cbid": 51, "correlation": 45188 } }, { "ph": "s", "id": 45188, "pid": 76337, "tid": -914061504, "ts": 1716454218078628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218078674, "dur": 2, "args": { "External id": 45190, "cbid": 200, "correlation": 45190 } }, { "ph": "f", "id": 45190, "pid": 76337, "tid": -914061504, "ts": 1716454218078674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218078676, "dur": 0, "args": { "External id": 45191, "cbid": 200, "correlation": 45191 } }, { "ph": "f", "id": 45191, "pid": 76337, "tid": -914061504, "ts": 1716454218078676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218078677, "dur": 0, "args": { "External id": 45192, "cbid": 200, "correlation": 45192 } }, { "ph": "f", "id": 45192, "pid": 76337, "tid": -914061504, "ts": 1716454218078677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218078678, "dur": 0, "args": { "External id": 45193, "cbid": 200, "correlation": 45193 } }, { "ph": "f", "id": 45193, "pid": 76337, "tid": -914061504, "ts": 1716454218078678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454218078678, "dur": 3, "args": { "External id": 45194, "cbid": 15, "correlation": 45194 } }, { "ph": "f", "id": 45194, "pid": 76337, "tid": -914061504, "ts": 1716454218078678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218078682, "dur": 1, "args": { "External id": 45195, "cbid": 251, "correlation": 45195 } }, { "ph": "f", "id": 45195, "pid": 76337, "tid": -914061504, "ts": 1716454218078682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454218224669, "dur": 23, "args": { "External id": 45196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45196, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45196, "pid": 5, "tid": 7, "ts": 1716454218224669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078686, "dur": 11, "args": { "External id": 45196, "cbid": 211, "correlation": 45196 } }, { "ph": "s", "id": 45196, "pid": 76337, "tid": -914061504, "ts": 1716454218078686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218224694, "dur": 4, "args": { "External id": 45198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45198, "pid": 5, "tid": 7, "ts": 1716454218224694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078702, "dur": 6, "args": { "External id": 45198, "cbid": 211, "correlation": 45198 } }, { "ph": "s", "id": 45198, "pid": 76337, "tid": -914061504, "ts": 1716454218078702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218078712, "dur": 0, "args": { "External id": 45199, "cbid": 51, "correlation": 45199 } }, { "ph": "s", "id": 45199, "pid": 76337, "tid": -914061504, "ts": 1716454218078712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218224699, "dur": 189, "args": { "External id": 45200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45200, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45200, "pid": 5, "tid": 7, "ts": 1716454218224699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078713, "dur": 171, "args": { "External id": 45200, "cbid": 211, "correlation": 45200 } }, { "ph": "s", "id": 45200, "pid": 76337, "tid": -914061504, "ts": 1716454218078713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218224889, "dur": 6, "args": { "External id": 45201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45201, "pid": 5, "tid": 7, "ts": 1716454218224889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078888, "dur": 6, "args": { "External id": 45201, "cbid": 211, "correlation": 45201 } }, { "ph": "s", "id": 45201, "pid": 76337, "tid": -914061504, "ts": 1716454218078888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218224897, "dur": 5, "args": { "External id": 45207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45207, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 45207, "pid": 5, "tid": 7, "ts": 1716454218224897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218078918, "dur": 9, "args": { "External id": 45207, "cbid": 211, "correlation": 45207 } }, { "ph": "s", "id": 45207, "pid": 76337, "tid": -914061504, "ts": 1716454218078918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224903, "dur": 3, "args": { "External id": 45215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45215, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45215, "pid": 5, "tid": 7, "ts": 1716454218224903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080663, "dur": 18, "args": { "External id": 45215, "cbid": 211, "correlation": 45215 } }, { "ph": "s", "id": 45215, "pid": 76337, "tid": -914061504, "ts": 1716454218080663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224907, "dur": 3, "args": { "External id": 45223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45223, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45223, "pid": 5, "tid": 7, "ts": 1716454218224907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080713, "dur": 11, "args": { "External id": 45223, "cbid": 211, "correlation": 45223 } }, { "ph": "s", "id": 45223, "pid": 76337, "tid": -914061504, "ts": 1716454218080713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224911, "dur": 3, "args": { "External id": 45231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45231, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45231, "pid": 5, "tid": 7, "ts": 1716454218224911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080746, "dur": 9, "args": { "External id": 45231, "cbid": 211, "correlation": 45231 } }, { "ph": "s", "id": 45231, "pid": 76337, "tid": -914061504, "ts": 1716454218080746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224915, "dur": 3, "args": { "External id": 45240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45240, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45240, "pid": 5, "tid": 7, "ts": 1716454218224915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080926, "dur": 15, "args": { "External id": 45240, "cbid": 211, "correlation": 45240 } }, { "ph": "s", "id": 45240, "pid": 76337, "tid": -914061504, "ts": 1716454218080926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224919, "dur": 3, "args": { "External id": 45249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45249, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45249, "pid": 5, "tid": 7, "ts": 1716454218224919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080955, "dur": 7, "args": { "External id": 45249, "cbid": 211, "correlation": 45249 } }, { "ph": "s", "id": 45249, "pid": 76337, "tid": -914061504, "ts": 1716454218080955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218224923, "dur": 3, "args": { "External id": 45257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45257, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45257, "pid": 5, "tid": 7, "ts": 1716454218224923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218080991, "dur": 9, "args": { "External id": 45257, "cbid": 211, "correlation": 45257 } }, { "ph": "s", "id": 45257, "pid": 76337, "tid": -914061504, "ts": 1716454218080991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218224929, "dur": 1, "args": { "External id": 45267, "device": 5, "context": 1, "stream": 7, "correlation": 45267, "bytes": 4, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 45267, "pid": 5, "tid": 7, "ts": 1716454218224929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218081241, "dur": 40, "args": { "External id": 45267, "cbid": 41, "correlation": 45267 } }, { "ph": "s", "id": 45267, "pid": 76337, "tid": -914061504, "ts": 1716454218081241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218081283, "dur": 143662, "args": { "External id": 45268, "cbid": 131, "correlation": 45268 } }, { "ph": "f", "id": 45268, "pid": 76337, "tid": -914061504, "ts": 1716454218081283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225099, "dur": 3, "args": { "External id": 45276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45276, "pid": 5, "tid": 7, "ts": 1716454218225099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225069, "dur": 30, "args": { "External id": 45276, "cbid": 211, "correlation": 45276 } }, { "ph": "s", "id": 45276, "pid": 76337, "tid": -914061504, "ts": 1716454218225069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218225218, "dur": 3, "args": { "External id": 45285, "device": 5, "context": 1, "stream": 7, "correlation": 45285, "bytes": 4, "memory bandwidth (GB/s)": 0.0011904761904761906 } }, { "ph": "f", "id": 45285, "pid": 5, "tid": 7, "ts": 1716454218225218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218225186, "dur": 32, "args": { "External id": 45285, "cbid": 41, "correlation": 45285 } }, { "ph": "s", "id": 45285, "pid": 76337, "tid": -914061504, "ts": 1716454218225186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218225287, "dur": 1, "args": { "External id": 45296, "device": 5, "context": 1, "stream": 7, "correlation": 45296, "bytes": 4, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 45296, "pid": 5, "tid": 7, "ts": 1716454218225287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218225272, "dur": 13, "args": { "External id": 45296, "cbid": 41, "correlation": 45296 } }, { "ph": "s", "id": 45296, "pid": 76337, "tid": -914061504, "ts": 1716454218225272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218225286, "dur": 8, "args": { "External id": 45297, "cbid": 131, "correlation": 45297 } }, { "ph": "f", "id": 45297, "pid": 76337, "tid": -914061504, "ts": 1716454218225286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218225359, "dur": 1, "args": { "External id": 45306, "device": 5, "context": 1, "stream": 7, "correlation": 45306, "bytes": 2, "memory bandwidth (GB/s)": 0.0010245901639344263 } }, { "ph": "f", "id": 45306, "pid": 5, "tid": 7, "ts": 1716454218225359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218225325, "dur": 32, "args": { "External id": 45306, "cbid": 41, "correlation": 45306 } }, { "ph": "s", "id": 45306, "pid": 76337, "tid": -914061504, "ts": 1716454218225325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218225358, "dur": 8, "args": { "External id": 45307, "cbid": 131, "correlation": 45307 } }, { "ph": "f", "id": 45307, "pid": 76337, "tid": -914061504, "ts": 1716454218225358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225413, "dur": 3, "args": { "External id": 45315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45315, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45315, "pid": 5, "tid": 7, "ts": 1716454218225413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225398, "dur": 14, "args": { "External id": 45315, "cbid": 211, "correlation": 45315 } }, { "ph": "s", "id": 45315, "pid": 76337, "tid": -914061504, "ts": 1716454218225398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225443, "dur": 3, "args": { "External id": 45325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45325, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45325, "pid": 5, "tid": 7, "ts": 1716454218225443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225434, "dur": 8, "args": { "External id": 45325, "cbid": 211, "correlation": 45325 } }, { "ph": "s", "id": 45325, "pid": 76337, "tid": -914061504, "ts": 1716454218225434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225467, "dur": 3, "args": { "External id": 45334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45334, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45334, "pid": 5, "tid": 7, "ts": 1716454218225467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225457, "dur": 8, "args": { "External id": 45334, "cbid": 211, "correlation": 45334 } }, { "ph": "s", "id": 45334, "pid": 76337, "tid": -914061504, "ts": 1716454218225457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225488, "dur": 3, "args": { "External id": 45342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45342, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45342, "pid": 5, "tid": 7, "ts": 1716454218225488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225480, "dur": 7, "args": { "External id": 45342, "cbid": 211, "correlation": 45342 } }, { "ph": "s", "id": 45342, "pid": 76337, "tid": -914061504, "ts": 1716454218225480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218225531, "dur": 6, "args": { "External id": 45352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45352, "pid": 5, "tid": 7, "ts": 1716454218225531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225519, "dur": 10, "args": { "External id": 45352, "cbid": 211, "correlation": 45352 } }, { "ph": "s", "id": 45352, "pid": 76337, "tid": -914061504, "ts": 1716454218225519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225565, "dur": 3, "args": { "External id": 45360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45360, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45360, "pid": 5, "tid": 7, "ts": 1716454218225565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225555, "dur": 10, "args": { "External id": 45360, "cbid": 211, "correlation": 45360 } }, { "ph": "s", "id": 45360, "pid": 76337, "tid": -914061504, "ts": 1716454218225555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225594, "dur": 3, "args": { "External id": 45369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45369, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45369, "pid": 5, "tid": 7, "ts": 1716454218225594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225585, "dur": 8, "args": { "External id": 45369, "cbid": 211, "correlation": 45369 } }, { "ph": "s", "id": 45369, "pid": 76337, "tid": -914061504, "ts": 1716454218225585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225616, "dur": 3, "args": { "External id": 45378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45378, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45378, "pid": 5, "tid": 7, "ts": 1716454218225616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225608, "dur": 7, "args": { "External id": 45378, "cbid": 211, "correlation": 45378 } }, { "ph": "s", "id": 45378, "pid": 76337, "tid": -914061504, "ts": 1716454218225608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225910, "dur": 3, "args": { "External id": 45386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45386, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45386, "pid": 5, "tid": 7, "ts": 1716454218225910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225894, "dur": 15, "args": { "External id": 45386, "cbid": 211, "correlation": 45386 } }, { "ph": "s", "id": 45386, "pid": 76337, "tid": -914061504, "ts": 1716454218225894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218225934, "dur": 3, "args": { "External id": 45394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45394, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45394, "pid": 5, "tid": 7, "ts": 1716454218225934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218225925, "dur": 8, "args": { "External id": 45394, "cbid": 211, "correlation": 45394 } }, { "ph": "s", "id": 45394, "pid": 76337, "tid": -914061504, "ts": 1716454218225925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218226004, "dur": 1, "args": { "External id": 45404, "device": 5, "context": 1, "stream": 7, "correlation": 45404, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 45404, "pid": 5, "tid": 7, "ts": 1716454218226004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218225969, "dur": 33, "args": { "External id": 45404, "cbid": 41, "correlation": 45404 } }, { "ph": "s", "id": 45404, "pid": 76337, "tid": -914061504, "ts": 1716454218225969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218226003, "dur": 8, "args": { "External id": 45405, "cbid": 131, "correlation": 45405 } }, { "ph": "f", "id": 45405, "pid": 76337, "tid": -914061504, "ts": 1716454218226003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226096, "dur": 2, "args": { "External id": 45413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45413, "pid": 5, "tid": 7, "ts": 1716454218226096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226081, "dur": 15, "args": { "External id": 45413, "cbid": 211, "correlation": 45413 } }, { "ph": "s", "id": 45413, "pid": 76337, "tid": -914061504, "ts": 1716454218226081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218226167, "dur": 3, "args": { "External id": 45422, "device": 5, "context": 1, "stream": 7, "correlation": 45422, "bytes": 8, "memory bandwidth (GB/s)": 0.002631578947368421 } }, { "ph": "f", "id": 45422, "pid": 5, "tid": 7, "ts": 1716454218226167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218226149, "dur": 18, "args": { "External id": 45422, "cbid": 41, "correlation": 45422 } }, { "ph": "s", "id": 45422, "pid": 76337, "tid": -914061504, "ts": 1716454218226149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218226358, "dur": 4, "args": { "External id": 45432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45432, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45432, "pid": 5, "tid": 7, "ts": 1716454218226358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226225, "dur": 134, "args": { "External id": 45432, "cbid": 211, "correlation": 45432 } }, { "ph": "s", "id": 45432, "pid": 76337, "tid": -914061504, "ts": 1716454218226225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218226423, "dur": 1, "args": { "External id": 45442, "device": 5, "context": 1, "stream": 7, "correlation": 45442, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 45442, "pid": 5, "tid": 7, "ts": 1716454218226423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218226407, "dur": 14, "args": { "External id": 45442, "cbid": 41, "correlation": 45442 } }, { "ph": "s", "id": 45442, "pid": 76337, "tid": -914061504, "ts": 1716454218226407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218226421, "dur": 8, "args": { "External id": 45443, "cbid": 131, "correlation": 45443 } }, { "ph": "f", "id": 45443, "pid": 76337, "tid": -914061504, "ts": 1716454218226421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218226486, "dur": 3, "args": { "External id": 45450, "device": 5, "context": 1, "stream": 7, "correlation": 45450, "bytes": 98304, "memory bandwidth (GB/s)": 30.11764705882353 } }, { "ph": "f", "id": 45450, "pid": 5, "tid": 7, "ts": 1716454218226486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218226465, "dur": 20, "args": { "External id": 45450, "cbid": 41, "correlation": 45450 } }, { "ph": "s", "id": 45450, "pid": 76337, "tid": -914061504, "ts": 1716454218226465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218226533, "dur": 1, "args": { "External id": 45461, "device": 5, "context": 1, "stream": 7, "correlation": 45461, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 45461, "pid": 5, "tid": 7, "ts": 1716454218226533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218226520, "dur": 10, "args": { "External id": 45461, "cbid": 41, "correlation": 45461 } }, { "ph": "s", "id": 45461, "pid": 76337, "tid": -914061504, "ts": 1716454218226520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218226530, "dur": 10, "args": { "External id": 45462, "cbid": 131, "correlation": 45462 } }, { "ph": "f", "id": 45462, "pid": 76337, "tid": -914061504, "ts": 1716454218226530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226582, "dur": 3, "args": { "External id": 45470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45470, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45470, "pid": 5, "tid": 7, "ts": 1716454218226582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226569, "dur": 13, "args": { "External id": 45470, "cbid": 211, "correlation": 45470 } }, { "ph": "s", "id": 45470, "pid": 76337, "tid": -914061504, "ts": 1716454218226569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226611, "dur": 3, "args": { "External id": 45480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45480, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45480, "pid": 5, "tid": 7, "ts": 1716454218226611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226602, "dur": 8, "args": { "External id": 45480, "cbid": 211, "correlation": 45480 } }, { "ph": "s", "id": 45480, "pid": 76337, "tid": -914061504, "ts": 1716454218226602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226634, "dur": 3, "args": { "External id": 45489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45489, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45489, "pid": 5, "tid": 7, "ts": 1716454218226634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226625, "dur": 8, "args": { "External id": 45489, "cbid": 211, "correlation": 45489 } }, { "ph": "s", "id": 45489, "pid": 76337, "tid": -914061504, "ts": 1716454218226625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218226846, "dur": 5, "args": { "External id": 45497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45497, "pid": 5, "tid": 7, "ts": 1716454218226846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226830, "dur": 16, "args": { "External id": 45497, "cbid": 211, "correlation": 45497 } }, { "ph": "s", "id": 45497, "pid": 76337, "tid": -914061504, "ts": 1716454218226830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226893, "dur": 3, "args": { "External id": 45506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45506, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45506, "pid": 5, "tid": 7, "ts": 1716454218226893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226882, "dur": 9, "args": { "External id": 45506, "cbid": 211, "correlation": 45506 } }, { "ph": "s", "id": 45506, "pid": 76337, "tid": -914061504, "ts": 1716454218226882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226920, "dur": 3, "args": { "External id": 45515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45515, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45515, "pid": 5, "tid": 7, "ts": 1716454218226920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226911, "dur": 8, "args": { "External id": 45515, "cbid": 211, "correlation": 45515 } }, { "ph": "s", "id": 45515, "pid": 76337, "tid": -914061504, "ts": 1716454218226911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218226989, "dur": 3, "args": { "External id": 45523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45523, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45523, "pid": 5, "tid": 7, "ts": 1716454218226989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218226970, "dur": 18, "args": { "External id": 45523, "cbid": 211, "correlation": 45523 } }, { "ph": "s", "id": 45523, "pid": 76337, "tid": -914061504, "ts": 1716454218226970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218227058, "dur": 2, "args": { "External id": 45531, "device": 5, "context": 1, "stream": 7, "correlation": 45531, "bytes": 8, "memory bandwidth (GB/s)": 0.0030120481927710845 } }, { "ph": "f", "id": 45531, "pid": 5, "tid": 7, "ts": 1716454218227058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227036, "dur": 33, "args": { "External id": 45531, "cbid": 41, "correlation": 45531 } }, { "ph": "s", "id": 45531, "pid": 76337, "tid": -914061504, "ts": 1716454218227036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227069, "dur": 4, "args": { "External id": 45532, "cbid": 131, "correlation": 45532 } }, { "ph": "f", "id": 45532, "pid": 76337, "tid": -914061504, "ts": 1716454218227069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218227132, "dur": 1, "args": { "External id": 45542, "device": 5, "context": 1, "stream": 7, "correlation": 45542, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 45542, "pid": 5, "tid": 7, "ts": 1716454218227132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227119, "dur": 11, "args": { "External id": 45542, "cbid": 41, "correlation": 45542 } }, { "ph": "s", "id": 45542, "pid": 76337, "tid": -914061504, "ts": 1716454218227119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227131, "dur": 7, "args": { "External id": 45543, "cbid": 131, "correlation": 45543 } }, { "ph": "f", "id": 45543, "pid": 76337, "tid": -914061504, "ts": 1716454218227131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218227195, "dur": 1, "args": { "External id": 45552, "device": 5, "context": 1, "stream": 7, "correlation": 45552, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 45552, "pid": 5, "tid": 7, "ts": 1716454218227195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227170, "dur": 23, "args": { "External id": 45552, "cbid": 41, "correlation": 45552 } }, { "ph": "s", "id": 45552, "pid": 76337, "tid": -914061504, "ts": 1716454218227170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227194, "dur": 8, "args": { "External id": 45553, "cbid": 131, "correlation": 45553 } }, { "ph": "f", "id": 45553, "pid": 76337, "tid": -914061504, "ts": 1716454218227194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227269, "dur": 4, "args": { "External id": 45560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45560, "pid": 5, "tid": 7, "ts": 1716454218227269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227253, "dur": 17, "args": { "External id": 45560, "cbid": 211, "correlation": 45560 } }, { "ph": "s", "id": 45560, "pid": 76337, "tid": -914061504, "ts": 1716454218227253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454218227316, "dur": 4, "args": { "External id": 45580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45580, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45580, "pid": 5, "tid": 7, "ts": 1716454218227316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227295, "dur": 21, "args": { "External id": 45580, "cbid": 211, "correlation": 45580 } }, { "ph": "s", "id": 45580, "pid": 76337, "tid": -914061504, "ts": 1716454218227295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227317, "dur": 0, "args": { "External id": 45581, "cbid": 11, "correlation": 45581 } }, { "ph": "f", "id": 45581, "pid": 76337, "tid": -914061504, "ts": 1716454218227317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227318, "dur": 0, "args": { "External id": 45582, "cbid": 11, "correlation": 45582 } }, { "ph": "f", "id": 45582, "pid": 76337, "tid": -914061504, "ts": 1716454218227318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218227334, "dur": 1, "args": { "External id": 45585, "device": 5, "context": 1, "stream": 7, "correlation": 45585, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 45585, "pid": 5, "tid": 7, "ts": 1716454218227334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227319, "dur": 23, "args": { "External id": 45585, "cbid": 41, "correlation": 45585 } }, { "ph": "s", "id": 45585, "pid": 76337, "tid": -914061504, "ts": 1716454218227319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227343, "dur": 3, "args": { "External id": 45586, "cbid": 131, "correlation": 45586 } }, { "ph": "f", "id": 45586, "pid": 76337, "tid": -914061504, "ts": 1716454218227343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454218227379, "dur": 3, "args": { "External id": 45610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45610, "pid": 5, "tid": 7, "ts": 1716454218227379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227368, "dur": 11, "args": { "External id": 45610, "cbid": 211, "correlation": 45610 } }, { "ph": "s", "id": 45610, "pid": 76337, "tid": -914061504, "ts": 1716454218227368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227379, "dur": 0, "args": { "External id": 45611, "cbid": 11, "correlation": 45611 } }, { "ph": "f", "id": 45611, "pid": 76337, "tid": -914061504, "ts": 1716454218227379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227380, "dur": 0, "args": { "External id": 45612, "cbid": 11, "correlation": 45612 } }, { "ph": "f", "id": 45612, "pid": 76337, "tid": -914061504, "ts": 1716454218227380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218227381, "dur": 1, "args": { "External id": 45614, "cbid": 200, "correlation": 45614 } }, { "ph": "f", "id": 45614, "pid": 76337, "tid": -914061504, "ts": 1716454218227381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454218227391, "dur": 4, "args": { "External id": 45616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45616, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45616, "pid": 5, "tid": 7, "ts": 1716454218227391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227384, "dur": 7, "args": { "External id": 45616, "cbid": 211, "correlation": 45616 } }, { "ph": "s", "id": 45616, "pid": 76337, "tid": -914061504, "ts": 1716454218227384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227392, "dur": 0, "args": { "External id": 45617, "cbid": 11, "correlation": 45617 } }, { "ph": "f", "id": 45617, "pid": 76337, "tid": -914061504, "ts": 1716454218227392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454218227392, "dur": 0, "args": { "External id": 45618, "cbid": 11, "correlation": 45618 } }, { "ph": "f", "id": 45618, "pid": 76337, "tid": -914061504, "ts": 1716454218227392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454218227432, "dur": 1, "args": { "External id": 45625, "device": 5, "context": 1, "stream": 7, "correlation": 45625, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 45625, "pid": 5, "tid": 7, "ts": 1716454218227432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227419, "dur": 22, "args": { "External id": 45625, "cbid": 41, "correlation": 45625 } }, { "ph": "s", "id": 45625, "pid": 76337, "tid": -914061504, "ts": 1716454218227419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227441, "dur": 4, "args": { "External id": 45626, "cbid": 131, "correlation": 45626 } }, { "ph": "f", "id": 45626, "pid": 76337, "tid": -914061504, "ts": 1716454218227441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218227493, "dur": 1, "args": { "External id": 45636, "device": 5, "context": 1, "stream": 7, "correlation": 45636, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 45636, "pid": 5, "tid": 7, "ts": 1716454218227493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218227480, "dur": 9, "args": { "External id": 45636, "cbid": 41, "correlation": 45636 } }, { "ph": "s", "id": 45636, "pid": 76337, "tid": -914061504, "ts": 1716454218227480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218227491, "dur": 9, "args": { "External id": 45637, "cbid": 131, "correlation": 45637 } }, { "ph": "f", "id": 45637, "pid": 76337, "tid": -914061504, "ts": 1716454218227491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227561, "dur": 5, "args": { "External id": 45644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45644, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45644, "pid": 5, "tid": 7, "ts": 1716454218227561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227545, "dur": 16, "args": { "External id": 45644, "cbid": 211, "correlation": 45644 } }, { "ph": "s", "id": 45644, "pid": 76337, "tid": -914061504, "ts": 1716454218227545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227630, "dur": 3, "args": { "External id": 45653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45653, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45653, "pid": 5, "tid": 7, "ts": 1716454218227630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227617, "dur": 13, "args": { "External id": 45653, "cbid": 211, "correlation": 45653 } }, { "ph": "s", "id": 45653, "pid": 76337, "tid": -914061504, "ts": 1716454218227617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227662, "dur": 3, "args": { "External id": 45661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45661, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45661, "pid": 5, "tid": 7, "ts": 1716454218227662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227652, "dur": 9, "args": { "External id": 45661, "cbid": 211, "correlation": 45661 } }, { "ph": "s", "id": 45661, "pid": 76337, "tid": -914061504, "ts": 1716454218227652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227698, "dur": 4, "args": { "External id": 45669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45669, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45669, "pid": 5, "tid": 7, "ts": 1716454218227698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227686, "dur": 11, "args": { "External id": 45669, "cbid": 211, "correlation": 45669 } }, { "ph": "s", "id": 45669, "pid": 76337, "tid": -914061504, "ts": 1716454218227686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227728, "dur": 4, "args": { "External id": 45677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45677, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45677, "pid": 5, "tid": 7, "ts": 1716454218227728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227717, "dur": 10, "args": { "External id": 45677, "cbid": 211, "correlation": 45677 } }, { "ph": "s", "id": 45677, "pid": 76337, "tid": -914061504, "ts": 1716454218227717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227754, "dur": 3, "args": { "External id": 45685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45685, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45685, "pid": 5, "tid": 7, "ts": 1716454218227754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227745, "dur": 8, "args": { "External id": 45685, "cbid": 211, "correlation": 45685 } }, { "ph": "s", "id": 45685, "pid": 76337, "tid": -914061504, "ts": 1716454218227745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227779, "dur": 3, "args": { "External id": 45693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45693, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45693, "pid": 5, "tid": 7, "ts": 1716454218227779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227771, "dur": 8, "args": { "External id": 45693, "cbid": 211, "correlation": 45693 } }, { "ph": "s", "id": 45693, "pid": 76337, "tid": -914061504, "ts": 1716454218227771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227800, "dur": 4, "args": { "External id": 45701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45701, "pid": 5, "tid": 7, "ts": 1716454218227800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227792, "dur": 7, "args": { "External id": 45701, "cbid": 211, "correlation": 45701 } }, { "ph": "s", "id": 45701, "pid": 76337, "tid": -914061504, "ts": 1716454218227792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227818, "dur": 5, "args": { "External id": 45709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45709, "pid": 5, "tid": 7, "ts": 1716454218227818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227810, "dur": 7, "args": { "External id": 45709, "cbid": 211, "correlation": 45709 } }, { "ph": "s", "id": 45709, "pid": 76337, "tid": -914061504, "ts": 1716454218227810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227837, "dur": 3, "args": { "External id": 45717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45717, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45717, "pid": 5, "tid": 7, "ts": 1716454218227837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227830, "dur": 6, "args": { "External id": 45717, "cbid": 211, "correlation": 45717 } }, { "ph": "s", "id": 45717, "pid": 76337, "tid": -914061504, "ts": 1716454218227830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227894, "dur": 4, "args": { "External id": 45725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45725, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45725, "pid": 5, "tid": 7, "ts": 1716454218227894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227883, "dur": 10, "args": { "External id": 45725, "cbid": 211, "correlation": 45725 } }, { "ph": "s", "id": 45725, "pid": 76337, "tid": -914061504, "ts": 1716454218227883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227921, "dur": 4, "args": { "External id": 45733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45733, "pid": 5, "tid": 7, "ts": 1716454218227921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227912, "dur": 9, "args": { "External id": 45733, "cbid": 211, "correlation": 45733 } }, { "ph": "s", "id": 45733, "pid": 76337, "tid": -914061504, "ts": 1716454218227912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218227944, "dur": 4, "args": { "External id": 45741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45741, "pid": 5, "tid": 7, "ts": 1716454218227944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227936, "dur": 7, "args": { "External id": 45741, "cbid": 211, "correlation": 45741 } }, { "ph": "s", "id": 45741, "pid": 76337, "tid": -914061504, "ts": 1716454218227936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218227963, "dur": 3, "args": { "External id": 45749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45749, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 45749, "pid": 5, "tid": 7, "ts": 1716454218227963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218227955, "dur": 7, "args": { "External id": 45749, "cbid": 211, "correlation": 45749 } }, { "ph": "s", "id": 45749, "pid": 76337, "tid": -914061504, "ts": 1716454218227955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218228350, "dur": 5, "args": { "External id": 45758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45758, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45758, "pid": 5, "tid": 7, "ts": 1716454218228350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228333, "dur": 18, "args": { "External id": 45758, "cbid": 211, "correlation": 45758 } }, { "ph": "s", "id": 45758, "pid": 76337, "tid": -914061504, "ts": 1716454218228333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218228386, "dur": 5, "args": { "External id": 45767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45767, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45767, "pid": 5, "tid": 7, "ts": 1716454218228386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228376, "dur": 9, "args": { "External id": 45767, "cbid": 211, "correlation": 45767 } }, { "ph": "s", "id": 45767, "pid": 76337, "tid": -914061504, "ts": 1716454218228376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454218228513, "dur": 3, "args": { "External id": 45783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45783, "pid": 5, "tid": 7, "ts": 1716454218228513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228499, "dur": 14, "args": { "External id": 45783, "cbid": 211, "correlation": 45783 } }, { "ph": "s", "id": 45783, "pid": 76337, "tid": -914061504, "ts": 1716454218228499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228548, "dur": 3, "args": { "External id": 45791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45791, "pid": 5, "tid": 7, "ts": 1716454218228548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228539, "dur": 9, "args": { "External id": 45791, "cbid": 211, "correlation": 45791 } }, { "ph": "s", "id": 45791, "pid": 76337, "tid": -914061504, "ts": 1716454218228539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228579, "dur": 3, "args": { "External id": 45799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45799, "pid": 5, "tid": 7, "ts": 1716454218228579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228570, "dur": 11, "args": { "External id": 45799, "cbid": 211, "correlation": 45799 } }, { "ph": "s", "id": 45799, "pid": 76337, "tid": -914061504, "ts": 1716454218228570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228614, "dur": 4, "args": { "External id": 45807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45807, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45807, "pid": 5, "tid": 7, "ts": 1716454218228614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228604, "dur": 9, "args": { "External id": 45807, "cbid": 211, "correlation": 45807 } }, { "ph": "s", "id": 45807, "pid": 76337, "tid": -914061504, "ts": 1716454218228604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454218228673, "dur": 4, "args": { "External id": 45819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45819, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45819, "pid": 5, "tid": 7, "ts": 1716454218228673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228660, "dur": 13, "args": { "External id": 45819, "cbid": 211, "correlation": 45819 } }, { "ph": "s", "id": 45819, "pid": 76337, "tid": -914061504, "ts": 1716454218228660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218228720, "dur": 4, "args": { "External id": 45830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45830, "pid": 5, "tid": 7, "ts": 1716454218228720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228708, "dur": 12, "args": { "External id": 45830, "cbid": 211, "correlation": 45830 } }, { "ph": "s", "id": 45830, "pid": 76337, "tid": -914061504, "ts": 1716454218228708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228751, "dur": 3, "args": { "External id": 45838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45838, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45838, "pid": 5, "tid": 7, "ts": 1716454218228751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228742, "dur": 8, "args": { "External id": 45838, "cbid": 211, "correlation": 45838 } }, { "ph": "s", "id": 45838, "pid": 76337, "tid": -914061504, "ts": 1716454218228742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228784, "dur": 5, "args": { "External id": 45846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45846, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45846, "pid": 5, "tid": 7, "ts": 1716454218228784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228774, "dur": 11, "args": { "External id": 45846, "cbid": 211, "correlation": 45846 } }, { "ph": "s", "id": 45846, "pid": 76337, "tid": -914061504, "ts": 1716454218228774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218228817, "dur": 5, "args": { "External id": 45854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45854, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45854, "pid": 5, "tid": 7, "ts": 1716454218228817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228803, "dur": 13, "args": { "External id": 45854, "cbid": 211, "correlation": 45854 } }, { "ph": "s", "id": 45854, "pid": 76337, "tid": -914061504, "ts": 1716454218228803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218228847, "dur": 4, "args": { "External id": 45863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45863, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45863, "pid": 5, "tid": 7, "ts": 1716454218228847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228837, "dur": 10, "args": { "External id": 45863, "cbid": 211, "correlation": 45863 } }, { "ph": "s", "id": 45863, "pid": 76337, "tid": -914061504, "ts": 1716454218228837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218228911, "dur": 5, "args": { "External id": 45876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45876, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45876, "pid": 5, "tid": 7, "ts": 1716454218228911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228898, "dur": 13, "args": { "External id": 45876, "cbid": 211, "correlation": 45876 } }, { "ph": "s", "id": 45876, "pid": 76337, "tid": -914061504, "ts": 1716454218228898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454218228954, "dur": 8, "args": { "External id": 45886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45886, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 45886, "pid": 5, "tid": 7, "ts": 1716454218228954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218228942, "dur": 12, "args": { "External id": 45886, "cbid": 211, "correlation": 45886 } }, { "ph": "s", "id": 45886, "pid": 76337, "tid": -914061504, "ts": 1716454218228942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218229080, "dur": 4, "args": { "External id": 45903, "cbid": 251, "correlation": 45903 } }, { "ph": "f", "id": 45903, "pid": 76337, "tid": -914061504, "ts": 1716454218229080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454218229107, "dur": 12, "args": { "External id": 45905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45905, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 45905, "pid": 5, "tid": 7, "ts": 1716454218229107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229092, "dur": 16, "args": { "External id": 45905, "cbid": 211, "correlation": 45905 } }, { "ph": "s", "id": 45905, "pid": 76337, "tid": -914061504, "ts": 1716454218229092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218229162, "dur": 4, "args": { "External id": 45913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45913, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45913, "pid": 5, "tid": 7, "ts": 1716454218229162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229151, "dur": 11, "args": { "External id": 45913, "cbid": 211, "correlation": 45913 } }, { "ph": "s", "id": 45913, "pid": 76337, "tid": -914061504, "ts": 1716454218229151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218229217, "dur": 1, "args": { "External id": 45929, "cbid": 251, "correlation": 45929 } }, { "ph": "f", "id": 45929, "pid": 76337, "tid": -914061504, "ts": 1716454218229217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218229223, "dur": 0, "args": { "External id": 45931, "cbid": 251, "correlation": 45931 } }, { "ph": "f", "id": 45931, "pid": 76337, "tid": -914061504, "ts": 1716454218229223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218229240, "dur": 13, "args": { "External id": 45932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45932, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 45932, "pid": 5, "tid": 7, "ts": 1716454218229240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229226, "dur": 14, "args": { "External id": 45932, "cbid": 211, "correlation": 45932 } }, { "ph": "s", "id": 45932, "pid": 76337, "tid": -914061504, "ts": 1716454218229226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218229254, "dur": 5, "args": { "External id": 45934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 45934, "pid": 5, "tid": 7, "ts": 1716454218229254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229244, "dur": 9, "args": { "External id": 45934, "cbid": 211, "correlation": 45934 } }, { "ph": "s", "id": 45934, "pid": 76337, "tid": -914061504, "ts": 1716454218229244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218229343, "dur": 1, "args": { "External id": 45944, "cbid": 317, "correlation": 45944 } }, { "ph": "f", "id": 45944, "pid": 76337, "tid": -914061504, "ts": 1716454218229343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218229345, "dur": 3, "args": { "External id": 45945, "cbid": 203, "correlation": 45945 } }, { "ph": "f", "id": 45945, "pid": 76337, "tid": -914061504, "ts": 1716454218229345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218229349, "dur": 1, "args": { "External id": 45946, "cbid": 205, "correlation": 45946 } }, { "ph": "f", "id": 45946, "pid": 76337, "tid": -914061504, "ts": 1716454218229349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218229399, "dur": 7, "args": { "External id": 45950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45950, "pid": 5, "tid": 7, "ts": 1716454218229399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229384, "dur": 14, "args": { "External id": 45950, "cbid": 211, "correlation": 45950 } }, { "ph": "s", "id": 45950, "pid": 76337, "tid": -914061504, "ts": 1716454218229384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218229410, "dur": 4, "args": { "External id": 45952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 45952, "pid": 5, "tid": 7, "ts": 1716454218229410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229402, "dur": 6, "args": { "External id": 45952, "cbid": 211, "correlation": 45952 } }, { "ph": "s", "id": 45952, "pid": 76337, "tid": -914061504, "ts": 1716454218229402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218229427, "dur": 3, "args": { "External id": 45954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45954, "pid": 5, "tid": 7, "ts": 1716454218229427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229418, "dur": 8, "args": { "External id": 45954, "cbid": 211, "correlation": 45954 } }, { "ph": "s", "id": 45954, "pid": 76337, "tid": -914061504, "ts": 1716454218229418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218229432, "dur": 0, "args": { "External id": 45955, "cbid": 51, "correlation": 45955 } }, { "ph": "s", "id": 45955, "pid": 76337, "tid": -914061504, "ts": 1716454218229432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218229441, "dur": 85, "args": { "External id": 45956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45956, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 45956, "pid": 5, "tid": 7, "ts": 1716454218229441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229433, "dur": 6, "args": { "External id": 45956, "cbid": 211, "correlation": 45956 } }, { "ph": "s", "id": 45956, "pid": 76337, "tid": -914061504, "ts": 1716454218229433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218229528, "dur": 60, "args": { "External id": 45961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45961, "pid": 5, "tid": 7, "ts": 1716454218229528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218229467, "dur": 10, "args": { "External id": 45961, "cbid": 211, "correlation": 45961 } }, { "ph": "s", "id": 45961, "pid": 76337, "tid": -914061504, "ts": 1716454218229467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218231232, "dur": 52, "args": { "External id": 45981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 45981, "pid": 5, "tid": 7, "ts": 1716454218231232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231217, "dur": 16, "args": { "External id": 45981, "cbid": 211, "correlation": 45981 } }, { "ph": "s", "id": 45981, "pid": 76337, "tid": -914061504, "ts": 1716454218231217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218231286, "dur": 4, "args": { "External id": 45993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 45993, "pid": 5, "tid": 7, "ts": 1716454218231286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231245, "dur": 8, "args": { "External id": 45993, "cbid": 211, "correlation": 45993 } }, { "ph": "s", "id": 45993, "pid": 76337, "tid": -914061504, "ts": 1716454218231245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218231292, "dur": 59, "args": { "External id": 45996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 45996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 45996, "pid": 5, "tid": 7, "ts": 1716454218231292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231268, "dur": 8, "args": { "External id": 45996, "cbid": 211, "correlation": 45996 } }, { "ph": "s", "id": 45996, "pid": 76337, "tid": -914061504, "ts": 1716454218231268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218231352, "dur": 36, "args": { "External id": 46005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46005, "pid": 5, "tid": 7, "ts": 1716454218231352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231313, "dur": 10, "args": { "External id": 46005, "cbid": 211, "correlation": 46005 } }, { "ph": "s", "id": 46005, "pid": 76337, "tid": -914061504, "ts": 1716454218231313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218231375, "dur": 0, "args": { "External id": 46015, "cbid": 317, "correlation": 46015 } }, { "ph": "f", "id": 46015, "pid": 76337, "tid": -914061504, "ts": 1716454218231375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218231376, "dur": 0, "args": { "External id": 46016, "cbid": 203, "correlation": 46016 } }, { "ph": "f", "id": 46016, "pid": 76337, "tid": -914061504, "ts": 1716454218231376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218231377, "dur": 0, "args": { "External id": 46017, "cbid": 205, "correlation": 46017 } }, { "ph": "f", "id": 46017, "pid": 76337, "tid": -914061504, "ts": 1716454218231377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218231409, "dur": 40, "args": { "External id": 46021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46021, "pid": 5, "tid": 7, "ts": 1716454218231409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231395, "dur": 13, "args": { "External id": 46021, "cbid": 211, "correlation": 46021 } }, { "ph": "s", "id": 46021, "pid": 76337, "tid": -914061504, "ts": 1716454218231395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218231449, "dur": 15, "args": { "External id": 46023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46023, "pid": 5, "tid": 7, "ts": 1716454218231449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231411, "dur": 6, "args": { "External id": 46023, "cbid": 211, "correlation": 46023 } }, { "ph": "s", "id": 46023, "pid": 76337, "tid": -914061504, "ts": 1716454218231411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218231465, "dur": 3, "args": { "External id": 46025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46025, "pid": 5, "tid": 7, "ts": 1716454218231465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231422, "dur": 6, "args": { "External id": 46025, "cbid": 211, "correlation": 46025 } }, { "ph": "s", "id": 46025, "pid": 76337, "tid": -914061504, "ts": 1716454218231422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218231432, "dur": 0, "args": { "External id": 46026, "cbid": 51, "correlation": 46026 } }, { "ph": "s", "id": 46026, "pid": 76337, "tid": -914061504, "ts": 1716454218231432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218231470, "dur": 707, "args": { "External id": 46027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46027, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46027, "pid": 5, "tid": 7, "ts": 1716454218231470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231433, "dur": 6, "args": { "External id": 46027, "cbid": 211, "correlation": 46027 } }, { "ph": "s", "id": 46027, "pid": 76337, "tid": -914061504, "ts": 1716454218231433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218232179, "dur": 59, "args": { "External id": 46032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46032, "pid": 5, "tid": 7, "ts": 1716454218232179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231464, "dur": 9, "args": { "External id": 46032, "cbid": 211, "correlation": 46032 } }, { "ph": "s", "id": 46032, "pid": 76337, "tid": -914061504, "ts": 1716454218231464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218232239, "dur": 3, "args": { "External id": 46040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46040, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46040, "pid": 5, "tid": 7, "ts": 1716454218232239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231507, "dur": 10, "args": { "External id": 46040, "cbid": 211, "correlation": 46040 } }, { "ph": "s", "id": 46040, "pid": 76337, "tid": -914061504, "ts": 1716454218231507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218231573, "dur": 2, "args": { "External id": 46056, "cbid": 251, "correlation": 46056 } }, { "ph": "f", "id": 46056, "pid": 76337, "tid": -914061504, "ts": 1716454218231573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218231579, "dur": 0, "args": { "External id": 46058, "cbid": 251, "correlation": 46058 } }, { "ph": "f", "id": 46058, "pid": 76337, "tid": -914061504, "ts": 1716454218231579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218232244, "dur": 9, "args": { "External id": 46059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46059, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 46059, "pid": 5, "tid": 7, "ts": 1716454218232244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231580, "dur": 11, "args": { "External id": 46059, "cbid": 211, "correlation": 46059 } }, { "ph": "s", "id": 46059, "pid": 76337, "tid": -914061504, "ts": 1716454218231580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218232255, "dur": 4, "args": { "External id": 46061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46061, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 46061, "pid": 5, "tid": 7, "ts": 1716454218232255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231594, "dur": 6, "args": { "External id": 46061, "cbid": 211, "correlation": 46061 } }, { "ph": "s", "id": 46061, "pid": 76337, "tid": -914061504, "ts": 1716454218231594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218232260, "dur": 54, "args": { "External id": 46071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46071, "pid": 5, "tid": 7, "ts": 1716454218232260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231654, "dur": 16, "args": { "External id": 46071, "cbid": 211, "correlation": 46071 } }, { "ph": "s", "id": 46071, "pid": 76337, "tid": -914061504, "ts": 1716454218231654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218232315, "dur": 52, "args": { "External id": 46091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46091, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 46091, "pid": 5, "tid": 7, "ts": 1716454218232315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231726, "dur": 11, "args": { "External id": 46091, "cbid": 211, "correlation": 46091 } }, { "ph": "s", "id": 46091, "pid": 76337, "tid": -914061504, "ts": 1716454218231726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218232369, "dur": 4, "args": { "External id": 46103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46103, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46103, "pid": 5, "tid": 7, "ts": 1716454218232369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231747, "dur": 6, "args": { "External id": 46103, "cbid": 211, "correlation": 46103 } }, { "ph": "s", "id": 46103, "pid": 76337, "tid": -914061504, "ts": 1716454218231747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218232374, "dur": 54, "args": { "External id": 46106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46106, "pid": 5, "tid": 7, "ts": 1716454218232374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231766, "dur": 7, "args": { "External id": 46106, "cbid": 211, "correlation": 46106 } }, { "ph": "s", "id": 46106, "pid": 76337, "tid": -914061504, "ts": 1716454218231766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218232430, "dur": 37, "args": { "External id": 46115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46115, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46115, "pid": 5, "tid": 7, "ts": 1716454218232430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231807, "dur": 10, "args": { "External id": 46115, "cbid": 211, "correlation": 46115 } }, { "ph": "s", "id": 46115, "pid": 76337, "tid": -914061504, "ts": 1716454218231807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218231876, "dur": 0, "args": { "External id": 46125, "cbid": 317, "correlation": 46125 } }, { "ph": "f", "id": 46125, "pid": 76337, "tid": -914061504, "ts": 1716454218231876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218231877, "dur": 0, "args": { "External id": 46126, "cbid": 203, "correlation": 46126 } }, { "ph": "f", "id": 46126, "pid": 76337, "tid": -914061504, "ts": 1716454218231877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218231878, "dur": 0, "args": { "External id": 46127, "cbid": 205, "correlation": 46127 } }, { "ph": "f", "id": 46127, "pid": 76337, "tid": -914061504, "ts": 1716454218231878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218232468, "dur": 40, "args": { "External id": 46131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46131, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46131, "pid": 5, "tid": 7, "ts": 1716454218232468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231892, "dur": 12, "args": { "External id": 46131, "cbid": 211, "correlation": 46131 } }, { "ph": "s", "id": 46131, "pid": 76337, "tid": -914061504, "ts": 1716454218231892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218232509, "dur": 14, "args": { "External id": 46133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46133, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46133, "pid": 5, "tid": 7, "ts": 1716454218232509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231906, "dur": 5, "args": { "External id": 46133, "cbid": 211, "correlation": 46133 } }, { "ph": "s", "id": 46133, "pid": 76337, "tid": -914061504, "ts": 1716454218231906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218232525, "dur": 3, "args": { "External id": 46135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46135, "pid": 5, "tid": 7, "ts": 1716454218232525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231915, "dur": 5, "args": { "External id": 46135, "cbid": 211, "correlation": 46135 } }, { "ph": "s", "id": 46135, "pid": 76337, "tid": -914061504, "ts": 1716454218231915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218231924, "dur": 0, "args": { "External id": 46136, "cbid": 51, "correlation": 46136 } }, { "ph": "s", "id": 46136, "pid": 76337, "tid": -914061504, "ts": 1716454218231924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218232529, "dur": 699, "args": { "External id": 46137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46137, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46137, "pid": 5, "tid": 7, "ts": 1716454218232529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231925, "dur": 8, "args": { "External id": 46137, "cbid": 211, "correlation": 46137 } }, { "ph": "s", "id": 46137, "pid": 76337, "tid": -914061504, "ts": 1716454218231925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218233229, "dur": 60, "args": { "External id": 46142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46142, "pid": 5, "tid": 7, "ts": 1716454218233229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231955, "dur": 9, "args": { "External id": 46142, "cbid": 211, "correlation": 46142 } }, { "ph": "s", "id": 46142, "pid": 76337, "tid": -914061504, "ts": 1716454218231955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218233290, "dur": 50, "args": { "External id": 46150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46150, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46150, "pid": 5, "tid": 7, "ts": 1716454218233290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218231996, "dur": 9, "args": { "External id": 46150, "cbid": 211, "correlation": 46150 } }, { "ph": "s", "id": 46150, "pid": 76337, "tid": -914061504, "ts": 1716454218231996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218233342, "dur": 35, "args": { "External id": 46158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46158, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46158, "pid": 5, "tid": 7, "ts": 1716454218233342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232028, "dur": 10, "args": { "External id": 46158, "cbid": 211, "correlation": 46158 } }, { "ph": "s", "id": 46158, "pid": 76337, "tid": -914061504, "ts": 1716454218232028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218233378, "dur": 51, "args": { "External id": 46178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46178, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 46178, "pid": 5, "tid": 7, "ts": 1716454218233378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232111, "dur": 12, "args": { "External id": 46178, "cbid": 211, "correlation": 46178 } }, { "ph": "s", "id": 46178, "pid": 76337, "tid": -914061504, "ts": 1716454218232111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218233431, "dur": 4, "args": { "External id": 46190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46190, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46190, "pid": 5, "tid": 7, "ts": 1716454218233431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232133, "dur": 6, "args": { "External id": 46190, "cbid": 211, "correlation": 46190 } }, { "ph": "s", "id": 46190, "pid": 76337, "tid": -914061504, "ts": 1716454218232133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218233436, "dur": 55, "args": { "External id": 46193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46193, "pid": 5, "tid": 7, "ts": 1716454218233436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232151, "dur": 6, "args": { "External id": 46193, "cbid": 211, "correlation": 46193 } }, { "ph": "s", "id": 46193, "pid": 76337, "tid": -914061504, "ts": 1716454218232151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218232207, "dur": 0, "args": { "External id": 46204, "cbid": 317, "correlation": 46204 } }, { "ph": "f", "id": 46204, "pid": 76337, "tid": -914061504, "ts": 1716454218232207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218232208, "dur": 0, "args": { "External id": 46205, "cbid": 203, "correlation": 46205 } }, { "ph": "f", "id": 46205, "pid": 76337, "tid": -914061504, "ts": 1716454218232208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218232209, "dur": 0, "args": { "External id": 46206, "cbid": 205, "correlation": 46206 } }, { "ph": "f", "id": 46206, "pid": 76337, "tid": -914061504, "ts": 1716454218232209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232242, "dur": 2, "args": { "External id": 46210, "cbid": 251, "correlation": 46210 } }, { "ph": "f", "id": 46210, "pid": 76337, "tid": -914061504, "ts": 1716454218232242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232245, "dur": 1, "args": { "External id": 46211, "cbid": 251, "correlation": 46211 } }, { "ph": "f", "id": 46211, "pid": 76337, "tid": -914061504, "ts": 1716454218232245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232247, "dur": 1, "args": { "External id": 46212, "cbid": 251, "correlation": 46212 } }, { "ph": "f", "id": 46212, "pid": 76337, "tid": -914061504, "ts": 1716454218232247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232250, "dur": 4, "args": { "External id": 46213, "cbid": 251, "correlation": 46213 } }, { "ph": "f", "id": 46213, "pid": 76337, "tid": -914061504, "ts": 1716454218232250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232254, "dur": 0, "args": { "External id": 46214, "cbid": 251, "correlation": 46214 } }, { "ph": "f", "id": 46214, "pid": 76337, "tid": -914061504, "ts": 1716454218232254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232256, "dur": 1, "args": { "External id": 46215, "cbid": 251, "correlation": 46215 } }, { "ph": "f", "id": 46215, "pid": 76337, "tid": -914061504, "ts": 1716454218232256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232257, "dur": 1, "args": { "External id": 46216, "cbid": 251, "correlation": 46216 } }, { "ph": "f", "id": 46216, "pid": 76337, "tid": -914061504, "ts": 1716454218232257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232259, "dur": 1, "args": { "External id": 46217, "cbid": 251, "correlation": 46217 } }, { "ph": "f", "id": 46217, "pid": 76337, "tid": -914061504, "ts": 1716454218232259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232262, "dur": 0, "args": { "External id": 46218, "cbid": 251, "correlation": 46218 } }, { "ph": "f", "id": 46218, "pid": 76337, "tid": -914061504, "ts": 1716454218232262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218233492, "dur": 115, "args": { "External id": 46219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46219, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46219, "pid": 5, "tid": 7, "ts": 1716454218233492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232265, "dur": 15, "args": { "External id": 46219, "cbid": 211, "correlation": 46219 } }, { "ph": "s", "id": 46219, "pid": 76337, "tid": -914061504, "ts": 1716454218232265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218233608, "dur": 60, "args": { "External id": 46225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46225, "pid": 5, "tid": 7, "ts": 1716454218233608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232304, "dur": 9, "args": { "External id": 46225, "cbid": 211, "correlation": 46225 } }, { "ph": "s", "id": 46225, "pid": 76337, "tid": -914061504, "ts": 1716454218232304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218233670, "dur": 584, "args": { "External id": 46234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46234, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46234, "pid": 5, "tid": 7, "ts": 1716454218233670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232395, "dur": 15, "args": { "External id": 46234, "cbid": 211, "correlation": 46234 } }, { "ph": "s", "id": 46234, "pid": 76337, "tid": -914061504, "ts": 1716454218232395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218234256, "dur": 183, "args": { "External id": 46256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46256, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46256, "pid": 5, "tid": 7, "ts": 1716454218234256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232468, "dur": 12, "args": { "External id": 46256, "cbid": 211, "correlation": 46256 } }, { "ph": "s", "id": 46256, "pid": 76337, "tid": -914061504, "ts": 1716454218232468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232584, "dur": 2, "args": { "External id": 46267, "cbid": 251, "correlation": 46267 } }, { "ph": "f", "id": 46267, "pid": 76337, "tid": -914061504, "ts": 1716454218232584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218234440, "dur": 198, "args": { "External id": 46268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46268, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46268, "pid": 5, "tid": 7, "ts": 1716454218234440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232591, "dur": 14, "args": { "External id": 46268, "cbid": 211, "correlation": 46268 } }, { "ph": "s", "id": 46268, "pid": 76337, "tid": -914061504, "ts": 1716454218232591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232663, "dur": 1, "args": { "External id": 46279, "cbid": 251, "correlation": 46279 } }, { "ph": "f", "id": 46279, "pid": 76337, "tid": -914061504, "ts": 1716454218232663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218234639, "dur": 189, "args": { "External id": 46280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46280, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46280, "pid": 5, "tid": 7, "ts": 1716454218234639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232667, "dur": 11, "args": { "External id": 46280, "cbid": 211, "correlation": 46280 } }, { "ph": "s", "id": 46280, "pid": 76337, "tid": -914061504, "ts": 1716454218232667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232734, "dur": 1, "args": { "External id": 46291, "cbid": 251, "correlation": 46291 } }, { "ph": "f", "id": 46291, "pid": 76337, "tid": -914061504, "ts": 1716454218232734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218234830, "dur": 189, "args": { "External id": 46292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46292, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46292, "pid": 5, "tid": 7, "ts": 1716454218234830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232738, "dur": 12, "args": { "External id": 46292, "cbid": 211, "correlation": 46292 } }, { "ph": "s", "id": 46292, "pid": 76337, "tid": -914061504, "ts": 1716454218232738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218235020, "dur": 18796, "args": { "External id": 46313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46313, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46313, "pid": 5, "tid": 7, "ts": 1716454218235020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232846, "dur": 14, "args": { "External id": 46313, "cbid": 211, "correlation": 46313 } }, { "ph": "s", "id": 46313, "pid": 76337, "tid": -914061504, "ts": 1716454218232846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218232956, "dur": 2, "args": { "External id": 46331, "cbid": 251, "correlation": 46331 } }, { "ph": "f", "id": 46331, "pid": 76337, "tid": -914061504, "ts": 1716454218232956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218253818, "dur": 203, "args": { "External id": 46333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46333, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46333, "pid": 5, "tid": 7, "ts": 1716454218253818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218232963, "dur": 20, "args": { "External id": 46333, "cbid": 211, "correlation": 46333 } }, { "ph": "s", "id": 46333, "pid": 76337, "tid": -914061504, "ts": 1716454218232963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218254022, "dur": 66, "args": { "External id": 46341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46341, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46341, "pid": 5, "tid": 7, "ts": 1716454218254022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233044, "dur": 13, "args": { "External id": 46341, "cbid": 211, "correlation": 46341 } }, { "ph": "s", "id": 46341, "pid": 76337, "tid": -914061504, "ts": 1716454218233044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218254090, "dur": 97, "args": { "External id": 46349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46349, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46349, "pid": 5, "tid": 7, "ts": 1716454218254090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233083, "dur": 9, "args": { "External id": 46349, "cbid": 211, "correlation": 46349 } }, { "ph": "s", "id": 46349, "pid": 76337, "tid": -914061504, "ts": 1716454218233083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218254187, "dur": 55, "args": { "External id": 46360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46360, "pid": 5, "tid": 7, "ts": 1716454218254187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233167, "dur": 14, "args": { "External id": 46360, "cbid": 211, "correlation": 46360 } }, { "ph": "s", "id": 46360, "pid": 76337, "tid": -914061504, "ts": 1716454218233167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218254244, "dur": 93, "args": { "External id": 46382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46382, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46382, "pid": 5, "tid": 7, "ts": 1716454218254244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233204, "dur": 8, "args": { "External id": 46382, "cbid": 211, "correlation": 46382 } }, { "ph": "s", "id": 46382, "pid": 76337, "tid": -914061504, "ts": 1716454218233204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233288, "dur": 1, "args": { "External id": 46393, "cbid": 251, "correlation": 46393 } }, { "ph": "f", "id": 46393, "pid": 76337, "tid": -914061504, "ts": 1716454218233288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218254338, "dur": 106, "args": { "External id": 46394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46394, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46394, "pid": 5, "tid": 7, "ts": 1716454218254338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233294, "dur": 13, "args": { "External id": 46394, "cbid": 211, "correlation": 46394 } }, { "ph": "s", "id": 46394, "pid": 76337, "tid": -914061504, "ts": 1716454218233294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233378, "dur": 1, "args": { "External id": 46405, "cbid": 251, "correlation": 46405 } }, { "ph": "f", "id": 46405, "pid": 76337, "tid": -914061504, "ts": 1716454218233378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233382, "dur": 0, "args": { "External id": 46406, "cbid": 251, "correlation": 46406 } }, { "ph": "f", "id": 46406, "pid": 76337, "tid": -914061504, "ts": 1716454218233382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218254445, "dur": 10, "args": { "External id": 46407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46407, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 46407, "pid": 5, "tid": 7, "ts": 1716454218254445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233384, "dur": 14, "args": { "External id": 46407, "cbid": 211, "correlation": 46407 } }, { "ph": "s", "id": 46407, "pid": 76337, "tid": -914061504, "ts": 1716454218233384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218254457, "dur": 5, "args": { "External id": 46409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46409, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 46409, "pid": 5, "tid": 7, "ts": 1716454218254457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233401, "dur": 7, "args": { "External id": 46409, "cbid": 211, "correlation": 46409 } }, { "ph": "s", "id": 46409, "pid": 76337, "tid": -914061504, "ts": 1716454218233401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233462, "dur": 1, "args": { "External id": 46420, "cbid": 251, "correlation": 46420 } }, { "ph": "f", "id": 46420, "pid": 76337, "tid": -914061504, "ts": 1716454218233462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233465, "dur": 0, "args": { "External id": 46421, "cbid": 251, "correlation": 46421 } }, { "ph": "f", "id": 46421, "pid": 76337, "tid": -914061504, "ts": 1716454218233465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218254463, "dur": 6, "args": { "External id": 46422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46422, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 46422, "pid": 5, "tid": 7, "ts": 1716454218254463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233467, "dur": 11, "args": { "External id": 46422, "cbid": 211, "correlation": 46422 } }, { "ph": "s", "id": 46422, "pid": 76337, "tid": -914061504, "ts": 1716454218233467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218254470, "dur": 3, "args": { "External id": 46424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46424, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 46424, "pid": 5, "tid": 7, "ts": 1716454218254470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233480, "dur": 5, "args": { "External id": 46424, "cbid": 211, "correlation": 46424 } }, { "ph": "s", "id": 46424, "pid": 76337, "tid": -914061504, "ts": 1716454218233480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218254475, "dur": 155, "args": { "External id": 46445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46445, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46445, "pid": 5, "tid": 7, "ts": 1716454218254475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233555, "dur": 13, "args": { "External id": 46445, "cbid": 211, "correlation": 46445 } }, { "ph": "s", "id": 46445, "pid": 76337, "tid": -914061504, "ts": 1716454218233555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233654, "dur": 2, "args": { "External id": 46463, "cbid": 251, "correlation": 46463 } }, { "ph": "f", "id": 46463, "pid": 76337, "tid": -914061504, "ts": 1716454218233654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218254631, "dur": 109, "args": { "External id": 46465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46465, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46465, "pid": 5, "tid": 7, "ts": 1716454218254631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233661, "dur": 14, "args": { "External id": 46465, "cbid": 211, "correlation": 46465 } }, { "ph": "s", "id": 46465, "pid": 76337, "tid": -914061504, "ts": 1716454218233661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218254742, "dur": 35, "args": { "External id": 46473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46473, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46473, "pid": 5, "tid": 7, "ts": 1716454218254742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233732, "dur": 12, "args": { "External id": 46473, "cbid": 211, "correlation": 46473 } }, { "ph": "s", "id": 46473, "pid": 76337, "tid": -914061504, "ts": 1716454218233732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218254778, "dur": 67, "args": { "External id": 46481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46481, "pid": 5, "tid": 7, "ts": 1716454218254778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233774, "dur": 9, "args": { "External id": 46481, "cbid": 211, "correlation": 46481 } }, { "ph": "s", "id": 46481, "pid": 76337, "tid": -914061504, "ts": 1716454218233774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218254846, "dur": 92, "args": { "External id": 46503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46503, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46503, "pid": 5, "tid": 7, "ts": 1716454218254846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233826, "dur": 10, "args": { "External id": 46503, "cbid": 211, "correlation": 46503 } }, { "ph": "s", "id": 46503, "pid": 76337, "tid": -914061504, "ts": 1716454218233826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218233913, "dur": 1, "args": { "External id": 46519, "cbid": 251, "correlation": 46519 } }, { "ph": "f", "id": 46519, "pid": 76337, "tid": -914061504, "ts": 1716454218233913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218254940, "dur": 566, "args": { "External id": 46521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46521, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46521, "pid": 5, "tid": 7, "ts": 1716454218254940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218233919, "dur": 16, "args": { "External id": 46521, "cbid": 211, "correlation": 46521 } }, { "ph": "s", "id": 46521, "pid": 76337, "tid": -914061504, "ts": 1716454218233919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218255507, "dur": 239, "args": { "External id": 46529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46529, "pid": 5, "tid": 7, "ts": 1716454218255507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234007, "dur": 15, "args": { "External id": 46529, "cbid": 211, "correlation": 46529 } }, { "ph": "s", "id": 46529, "pid": 76337, "tid": -914061504, "ts": 1716454218234007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218255748, "dur": 246, "args": { "External id": 46537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46537, "pid": 5, "tid": 7, "ts": 1716454218255748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234043, "dur": 9, "args": { "External id": 46537, "cbid": 211, "correlation": 46537 } }, { "ph": "s", "id": 46537, "pid": 76337, "tid": -914061504, "ts": 1716454218234043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234127, "dur": 2, "args": { "External id": 46553, "cbid": 251, "correlation": 46553 } }, { "ph": "f", "id": 46553, "pid": 76337, "tid": -914061504, "ts": 1716454218234127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234133, "dur": 0, "args": { "External id": 46555, "cbid": 251, "correlation": 46555 } }, { "ph": "f", "id": 46555, "pid": 76337, "tid": -914061504, "ts": 1716454218234133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218255996, "dur": 352, "args": { "External id": 46556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46556, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 46556, "pid": 5, "tid": 7, "ts": 1716454218255996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234139, "dur": 13, "args": { "External id": 46556, "cbid": 211, "correlation": 46556 } }, { "ph": "s", "id": 46556, "pid": 76337, "tid": -914061504, "ts": 1716454218234139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218256348, "dur": 51, "args": { "External id": 46564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46564, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46564, "pid": 5, "tid": 7, "ts": 1716454218256348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234182, "dur": 10, "args": { "External id": 46564, "cbid": 211, "correlation": 46564 } }, { "ph": "s", "id": 46564, "pid": 76337, "tid": -914061504, "ts": 1716454218234182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218256401, "dur": 151, "args": { "External id": 46575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46575, "pid": 5, "tid": 7, "ts": 1716454218256401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234253, "dur": 12, "args": { "External id": 46575, "cbid": 211, "correlation": 46575 } }, { "ph": "s", "id": 46575, "pid": 76337, "tid": -914061504, "ts": 1716454218234253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218234317, "dur": 0, "args": { "External id": 46587, "cbid": 317, "correlation": 46587 } }, { "ph": "f", "id": 46587, "pid": 76337, "tid": -914061504, "ts": 1716454218234317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218234318, "dur": 0, "args": { "External id": 46588, "cbid": 203, "correlation": 46588 } }, { "ph": "f", "id": 46588, "pid": 76337, "tid": -914061504, "ts": 1716454218234318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218234319, "dur": 0, "args": { "External id": 46589, "cbid": 205, "correlation": 46589 } }, { "ph": "f", "id": 46589, "pid": 76337, "tid": -914061504, "ts": 1716454218234319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234343, "dur": 1, "args": { "External id": 46593, "cbid": 251, "correlation": 46593 } }, { "ph": "f", "id": 46593, "pid": 76337, "tid": -914061504, "ts": 1716454218234343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234345, "dur": 0, "args": { "External id": 46594, "cbid": 251, "correlation": 46594 } }, { "ph": "f", "id": 46594, "pid": 76337, "tid": -914061504, "ts": 1716454218234345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234346, "dur": 0, "args": { "External id": 46595, "cbid": 251, "correlation": 46595 } }, { "ph": "f", "id": 46595, "pid": 76337, "tid": -914061504, "ts": 1716454218234346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234347, "dur": 0, "args": { "External id": 46596, "cbid": 251, "correlation": 46596 } }, { "ph": "f", "id": 46596, "pid": 76337, "tid": -914061504, "ts": 1716454218234347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234347, "dur": 0, "args": { "External id": 46597, "cbid": 251, "correlation": 46597 } }, { "ph": "f", "id": 46597, "pid": 76337, "tid": -914061504, "ts": 1716454218234347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234348, "dur": 3, "args": { "External id": 46598, "cbid": 251, "correlation": 46598 } }, { "ph": "f", "id": 46598, "pid": 76337, "tid": -914061504, "ts": 1716454218234348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234352, "dur": 0, "args": { "External id": 46599, "cbid": 251, "correlation": 46599 } }, { "ph": "f", "id": 46599, "pid": 76337, "tid": -914061504, "ts": 1716454218234352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234353, "dur": 0, "args": { "External id": 46600, "cbid": 251, "correlation": 46600 } }, { "ph": "f", "id": 46600, "pid": 76337, "tid": -914061504, "ts": 1716454218234353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234354, "dur": 0, "args": { "External id": 46601, "cbid": 251, "correlation": 46601 } }, { "ph": "f", "id": 46601, "pid": 76337, "tid": -914061504, "ts": 1716454218234354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218256553, "dur": 112, "args": { "External id": 46602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46602, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46602, "pid": 5, "tid": 7, "ts": 1716454218256553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234357, "dur": 13, "args": { "External id": 46602, "cbid": 211, "correlation": 46602 } }, { "ph": "s", "id": 46602, "pid": 76337, "tid": -914061504, "ts": 1716454218234357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218256667, "dur": 58, "args": { "External id": 46608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46608, "pid": 5, "tid": 7, "ts": 1716454218256667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234393, "dur": 9, "args": { "External id": 46608, "cbid": 211, "correlation": 46608 } }, { "ph": "s", "id": 46608, "pid": 76337, "tid": -914061504, "ts": 1716454218234393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218256726, "dur": 51, "args": { "External id": 46616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46616, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46616, "pid": 5, "tid": 7, "ts": 1716454218256726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234425, "dur": 8, "args": { "External id": 46616, "cbid": 211, "correlation": 46616 } }, { "ph": "s", "id": 46616, "pid": 76337, "tid": -914061504, "ts": 1716454218234425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218256778, "dur": 50, "args": { "External id": 46636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46636, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 46636, "pid": 5, "tid": 7, "ts": 1716454218256778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234500, "dur": 11, "args": { "External id": 46636, "cbid": 211, "correlation": 46636 } }, { "ph": "s", "id": 46636, "pid": 76337, "tid": -914061504, "ts": 1716454218234500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218256829, "dur": 4, "args": { "External id": 46648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46648, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46648, "pid": 5, "tid": 7, "ts": 1716454218256829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234521, "dur": 7, "args": { "External id": 46648, "cbid": 211, "correlation": 46648 } }, { "ph": "s", "id": 46648, "pid": 76337, "tid": -914061504, "ts": 1716454218234521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218256835, "dur": 54, "args": { "External id": 46651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46651, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46651, "pid": 5, "tid": 7, "ts": 1716454218256835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234539, "dur": 7, "args": { "External id": 46651, "cbid": 211, "correlation": 46651 } }, { "ph": "s", "id": 46651, "pid": 76337, "tid": -914061504, "ts": 1716454218234539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218256891, "dur": 36, "args": { "External id": 46660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46660, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46660, "pid": 5, "tid": 7, "ts": 1716454218256891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234577, "dur": 9, "args": { "External id": 46660, "cbid": 211, "correlation": 46660 } }, { "ph": "s", "id": 46660, "pid": 76337, "tid": -914061504, "ts": 1716454218234577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218234631, "dur": 0, "args": { "External id": 46670, "cbid": 317, "correlation": 46670 } }, { "ph": "f", "id": 46670, "pid": 76337, "tid": -914061504, "ts": 1716454218234631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218234632, "dur": 0, "args": { "External id": 46671, "cbid": 203, "correlation": 46671 } }, { "ph": "f", "id": 46671, "pid": 76337, "tid": -914061504, "ts": 1716454218234632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218234633, "dur": 0, "args": { "External id": 46672, "cbid": 205, "correlation": 46672 } }, { "ph": "f", "id": 46672, "pid": 76337, "tid": -914061504, "ts": 1716454218234633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218256928, "dur": 40, "args": { "External id": 46676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46676, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46676, "pid": 5, "tid": 7, "ts": 1716454218256928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234648, "dur": 12, "args": { "External id": 46676, "cbid": 211, "correlation": 46676 } }, { "ph": "s", "id": 46676, "pid": 76337, "tid": -914061504, "ts": 1716454218234648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218256970, "dur": 14, "args": { "External id": 46678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46678, "pid": 5, "tid": 7, "ts": 1716454218256970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234663, "dur": 5, "args": { "External id": 46678, "cbid": 211, "correlation": 46678 } }, { "ph": "s", "id": 46678, "pid": 76337, "tid": -914061504, "ts": 1716454218234663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218256985, "dur": 4, "args": { "External id": 46680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46680, "pid": 5, "tid": 7, "ts": 1716454218256985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234672, "dur": 6, "args": { "External id": 46680, "cbid": 211, "correlation": 46680 } }, { "ph": "s", "id": 46680, "pid": 76337, "tid": -914061504, "ts": 1716454218234672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218234681, "dur": 0, "args": { "External id": 46681, "cbid": 51, "correlation": 46681 } }, { "ph": "s", "id": 46681, "pid": 76337, "tid": -914061504, "ts": 1716454218234681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218256990, "dur": 682, "args": { "External id": 46682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46682, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46682, "pid": 5, "tid": 7, "ts": 1716454218256990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234682, "dur": 5, "args": { "External id": 46682, "cbid": 211, "correlation": 46682 } }, { "ph": "s", "id": 46682, "pid": 76337, "tid": -914061504, "ts": 1716454218234682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218257673, "dur": 57, "args": { "External id": 46687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46687, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46687, "pid": 5, "tid": 7, "ts": 1716454218257673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234710, "dur": 8, "args": { "External id": 46687, "cbid": 211, "correlation": 46687 } }, { "ph": "s", "id": 46687, "pid": 76337, "tid": -914061504, "ts": 1716454218234710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218257731, "dur": 4, "args": { "External id": 46695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46695, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46695, "pid": 5, "tid": 7, "ts": 1716454218257731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234753, "dur": 10, "args": { "External id": 46695, "cbid": 211, "correlation": 46695 } }, { "ph": "s", "id": 46695, "pid": 76337, "tid": -914061504, "ts": 1716454218234753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234818, "dur": 1, "args": { "External id": 46711, "cbid": 251, "correlation": 46711 } }, { "ph": "f", "id": 46711, "pid": 76337, "tid": -914061504, "ts": 1716454218234818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218234823, "dur": 0, "args": { "External id": 46713, "cbid": 251, "correlation": 46713 } }, { "ph": "f", "id": 46713, "pid": 76337, "tid": -914061504, "ts": 1716454218234823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218257736, "dur": 10, "args": { "External id": 46714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46714, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 46714, "pid": 5, "tid": 7, "ts": 1716454218257736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234825, "dur": 11, "args": { "External id": 46714, "cbid": 211, "correlation": 46714 } }, { "ph": "s", "id": 46714, "pid": 76337, "tid": -914061504, "ts": 1716454218234825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218257748, "dur": 5, "args": { "External id": 46716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46716, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 46716, "pid": 5, "tid": 7, "ts": 1716454218257748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234838, "dur": 5, "args": { "External id": 46716, "cbid": 211, "correlation": 46716 } }, { "ph": "s", "id": 46716, "pid": 76337, "tid": -914061504, "ts": 1716454218234838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218257755, "dur": 52, "args": { "External id": 46726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46726, "pid": 5, "tid": 7, "ts": 1716454218257755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234896, "dur": 12, "args": { "External id": 46726, "cbid": 211, "correlation": 46726 } }, { "ph": "s", "id": 46726, "pid": 76337, "tid": -914061504, "ts": 1716454218234896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218257807, "dur": 49, "args": { "External id": 46746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46746, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 46746, "pid": 5, "tid": 7, "ts": 1716454218257807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234967, "dur": 18, "args": { "External id": 46746, "cbid": 211, "correlation": 46746 } }, { "ph": "s", "id": 46746, "pid": 76337, "tid": -914061504, "ts": 1716454218234967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218257858, "dur": 4, "args": { "External id": 46758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46758, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46758, "pid": 5, "tid": 7, "ts": 1716454218257858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218234995, "dur": 7, "args": { "External id": 46758, "cbid": 211, "correlation": 46758 } }, { "ph": "s", "id": 46758, "pid": 76337, "tid": -914061504, "ts": 1716454218234995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218257863, "dur": 54, "args": { "External id": 46761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46761, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46761, "pid": 5, "tid": 7, "ts": 1716454218257863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235016, "dur": 7, "args": { "External id": 46761, "cbid": 211, "correlation": 46761 } }, { "ph": "s", "id": 46761, "pid": 76337, "tid": -914061504, "ts": 1716454218235016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218257918, "dur": 36, "args": { "External id": 46770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46770, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46770, "pid": 5, "tid": 7, "ts": 1716454218257918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235057, "dur": 10, "args": { "External id": 46770, "cbid": 211, "correlation": 46770 } }, { "ph": "s", "id": 46770, "pid": 76337, "tid": -914061504, "ts": 1716454218235057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218235119, "dur": 0, "args": { "External id": 46780, "cbid": 317, "correlation": 46780 } }, { "ph": "f", "id": 46780, "pid": 76337, "tid": -914061504, "ts": 1716454218235119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218235120, "dur": 0, "args": { "External id": 46781, "cbid": 203, "correlation": 46781 } }, { "ph": "f", "id": 46781, "pid": 76337, "tid": -914061504, "ts": 1716454218235120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218235121, "dur": 0, "args": { "External id": 46782, "cbid": 205, "correlation": 46782 } }, { "ph": "f", "id": 46782, "pid": 76337, "tid": -914061504, "ts": 1716454218235121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218257955, "dur": 40, "args": { "External id": 46786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46786, "pid": 5, "tid": 7, "ts": 1716454218257955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235135, "dur": 12, "args": { "External id": 46786, "cbid": 211, "correlation": 46786 } }, { "ph": "s", "id": 46786, "pid": 76337, "tid": -914061504, "ts": 1716454218235135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218257997, "dur": 14, "args": { "External id": 46788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46788, "pid": 5, "tid": 7, "ts": 1716454218257997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235150, "dur": 5, "args": { "External id": 46788, "cbid": 211, "correlation": 46788 } }, { "ph": "s", "id": 46788, "pid": 76337, "tid": -914061504, "ts": 1716454218235150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218258012, "dur": 3, "args": { "External id": 46790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46790, "pid": 5, "tid": 7, "ts": 1716454218258012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235159, "dur": 6, "args": { "External id": 46790, "cbid": 211, "correlation": 46790 } }, { "ph": "s", "id": 46790, "pid": 76337, "tid": -914061504, "ts": 1716454218235159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218235168, "dur": 0, "args": { "External id": 46791, "cbid": 51, "correlation": 46791 } }, { "ph": "s", "id": 46791, "pid": 76337, "tid": -914061504, "ts": 1716454218235168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218258016, "dur": 673, "args": { "External id": 46792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46792, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46792, "pid": 5, "tid": 7, "ts": 1716454218258016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235169, "dur": 5, "args": { "External id": 46792, "cbid": 211, "correlation": 46792 } }, { "ph": "s", "id": 46792, "pid": 76337, "tid": -914061504, "ts": 1716454218235169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218258691, "dur": 57, "args": { "External id": 46797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46797, "pid": 5, "tid": 7, "ts": 1716454218258691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235197, "dur": 11, "args": { "External id": 46797, "cbid": 211, "correlation": 46797 } }, { "ph": "s", "id": 46797, "pid": 76337, "tid": -914061504, "ts": 1716454218235197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218258749, "dur": 51, "args": { "External id": 46805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46805, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46805, "pid": 5, "tid": 7, "ts": 1716454218258749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235232, "dur": 9, "args": { "External id": 46805, "cbid": 211, "correlation": 46805 } }, { "ph": "s", "id": 46805, "pid": 76337, "tid": -914061504, "ts": 1716454218235232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218258801, "dur": 35, "args": { "External id": 46813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46813, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46813, "pid": 5, "tid": 7, "ts": 1716454218258801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235261, "dur": 8, "args": { "External id": 46813, "cbid": 211, "correlation": 46813 } }, { "ph": "s", "id": 46813, "pid": 76337, "tid": -914061504, "ts": 1716454218235261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218258838, "dur": 50, "args": { "External id": 46833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46833, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 46833, "pid": 5, "tid": 7, "ts": 1716454218258838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235342, "dur": 13, "args": { "External id": 46833, "cbid": 211, "correlation": 46833 } }, { "ph": "s", "id": 46833, "pid": 76337, "tid": -914061504, "ts": 1716454218235342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218258890, "dur": 4, "args": { "External id": 46845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46845, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 46845, "pid": 5, "tid": 7, "ts": 1716454218258890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235364, "dur": 6, "args": { "External id": 46845, "cbid": 211, "correlation": 46845 } }, { "ph": "s", "id": 46845, "pid": 76337, "tid": -914061504, "ts": 1716454218235364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218258895, "dur": 54, "args": { "External id": 46848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46848, "pid": 5, "tid": 7, "ts": 1716454218258895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235381, "dur": 7, "args": { "External id": 46848, "cbid": 211, "correlation": 46848 } }, { "ph": "s", "id": 46848, "pid": 76337, "tid": -914061504, "ts": 1716454218235381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218235438, "dur": 0, "args": { "External id": 46859, "cbid": 317, "correlation": 46859 } }, { "ph": "f", "id": 46859, "pid": 76337, "tid": -914061504, "ts": 1716454218235438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218235439, "dur": 0, "args": { "External id": 46860, "cbid": 203, "correlation": 46860 } }, { "ph": "f", "id": 46860, "pid": 76337, "tid": -914061504, "ts": 1716454218235439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218235440, "dur": 0, "args": { "External id": 46861, "cbid": 205, "correlation": 46861 } }, { "ph": "f", "id": 46861, "pid": 76337, "tid": -914061504, "ts": 1716454218235440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235461, "dur": 1, "args": { "External id": 46865, "cbid": 251, "correlation": 46865 } }, { "ph": "f", "id": 46865, "pid": 76337, "tid": -914061504, "ts": 1716454218235461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235463, "dur": 0, "args": { "External id": 46866, "cbid": 251, "correlation": 46866 } }, { "ph": "f", "id": 46866, "pid": 76337, "tid": -914061504, "ts": 1716454218235463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235464, "dur": 0, "args": { "External id": 46867, "cbid": 251, "correlation": 46867 } }, { "ph": "f", "id": 46867, "pid": 76337, "tid": -914061504, "ts": 1716454218235464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235465, "dur": 0, "args": { "External id": 46868, "cbid": 251, "correlation": 46868 } }, { "ph": "f", "id": 46868, "pid": 76337, "tid": -914061504, "ts": 1716454218235465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235466, "dur": 0, "args": { "External id": 46869, "cbid": 251, "correlation": 46869 } }, { "ph": "f", "id": 46869, "pid": 76337, "tid": -914061504, "ts": 1716454218235466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235466, "dur": 0, "args": { "External id": 46870, "cbid": 251, "correlation": 46870 } }, { "ph": "f", "id": 46870, "pid": 76337, "tid": -914061504, "ts": 1716454218235466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235467, "dur": 0, "args": { "External id": 46871, "cbid": 251, "correlation": 46871 } }, { "ph": "f", "id": 46871, "pid": 76337, "tid": -914061504, "ts": 1716454218235467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235468, "dur": 0, "args": { "External id": 46872, "cbid": 251, "correlation": 46872 } }, { "ph": "f", "id": 46872, "pid": 76337, "tid": -914061504, "ts": 1716454218235468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235469, "dur": 0, "args": { "External id": 46873, "cbid": 251, "correlation": 46873 } }, { "ph": "f", "id": 46873, "pid": 76337, "tid": -914061504, "ts": 1716454218235469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218258950, "dur": 108, "args": { "External id": 46874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46874, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46874, "pid": 5, "tid": 7, "ts": 1716454218258950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235471, "dur": 15, "args": { "External id": 46874, "cbid": 211, "correlation": 46874 } }, { "ph": "s", "id": 46874, "pid": 76337, "tid": -914061504, "ts": 1716454218235471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218259059, "dur": 58, "args": { "External id": 46880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46880, "pid": 5, "tid": 7, "ts": 1716454218259059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235509, "dur": 9, "args": { "External id": 46880, "cbid": 211, "correlation": 46880 } }, { "ph": "s", "id": 46880, "pid": 76337, "tid": -914061504, "ts": 1716454218235509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218259118, "dur": 643, "args": { "External id": 46889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46889, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46889, "pid": 5, "tid": 7, "ts": 1716454218259118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235593, "dur": 15, "args": { "External id": 46889, "cbid": 211, "correlation": 46889 } }, { "ph": "s", "id": 46889, "pid": 76337, "tid": -914061504, "ts": 1716454218235593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218259763, "dur": 173, "args": { "External id": 46911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46911, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46911, "pid": 5, "tid": 7, "ts": 1716454218259763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235651, "dur": 10, "args": { "External id": 46911, "cbid": 211, "correlation": 46911 } }, { "ph": "s", "id": 46911, "pid": 76337, "tid": -914061504, "ts": 1716454218235651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235736, "dur": 1, "args": { "External id": 46922, "cbid": 251, "correlation": 46922 } }, { "ph": "f", "id": 46922, "pid": 76337, "tid": -914061504, "ts": 1716454218235736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218259937, "dur": 188, "args": { "External id": 46923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46923, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46923, "pid": 5, "tid": 7, "ts": 1716454218259937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235741, "dur": 13, "args": { "External id": 46923, "cbid": 211, "correlation": 46923 } }, { "ph": "s", "id": 46923, "pid": 76337, "tid": -914061504, "ts": 1716454218235741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235808, "dur": 1, "args": { "External id": 46934, "cbid": 251, "correlation": 46934 } }, { "ph": "f", "id": 46934, "pid": 76337, "tid": -914061504, "ts": 1716454218235808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218260126, "dur": 180, "args": { "External id": 46935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46935, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46935, "pid": 5, "tid": 7, "ts": 1716454218260126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235812, "dur": 12, "args": { "External id": 46935, "cbid": 211, "correlation": 46935 } }, { "ph": "s", "id": 46935, "pid": 76337, "tid": -914061504, "ts": 1716454218235812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218235875, "dur": 1, "args": { "External id": 46946, "cbid": 251, "correlation": 46946 } }, { "ph": "f", "id": 46946, "pid": 76337, "tid": -914061504, "ts": 1716454218235875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218260307, "dur": 179, "args": { "External id": 46947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46947, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46947, "pid": 5, "tid": 7, "ts": 1716454218260307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235879, "dur": 14, "args": { "External id": 46947, "cbid": 211, "correlation": 46947 } }, { "ph": "s", "id": 46947, "pid": 76337, "tid": -914061504, "ts": 1716454218235879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218260487, "dur": 17734, "args": { "External id": 46968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46968, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 46968, "pid": 5, "tid": 7, "ts": 1716454218260487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218235963, "dur": 21, "args": { "External id": 46968, "cbid": 211, "correlation": 46968 } }, { "ph": "s", "id": 46968, "pid": 76337, "tid": -914061504, "ts": 1716454218235963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236073, "dur": 1, "args": { "External id": 46986, "cbid": 251, "correlation": 46986 } }, { "ph": "f", "id": 46986, "pid": 76337, "tid": -914061504, "ts": 1716454218236073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218278222, "dur": 196, "args": { "External id": 46988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46988, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 46988, "pid": 5, "tid": 7, "ts": 1716454218278222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236078, "dur": 13, "args": { "External id": 46988, "cbid": 211, "correlation": 46988 } }, { "ph": "s", "id": 46988, "pid": 76337, "tid": -914061504, "ts": 1716454218236078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218278420, "dur": 66, "args": { "External id": 46996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 46996, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 46996, "pid": 5, "tid": 7, "ts": 1716454218278420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236150, "dur": 12, "args": { "External id": 46996, "cbid": 211, "correlation": 46996 } }, { "ph": "s", "id": 46996, "pid": 76337, "tid": -914061504, "ts": 1716454218236150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218278487, "dur": 98, "args": { "External id": 47004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47004, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47004, "pid": 5, "tid": 7, "ts": 1716454218278487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236189, "dur": 9, "args": { "External id": 47004, "cbid": 211, "correlation": 47004 } }, { "ph": "s", "id": 47004, "pid": 76337, "tid": -914061504, "ts": 1716454218236189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218278586, "dur": 53, "args": { "External id": 47015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47015, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47015, "pid": 5, "tid": 7, "ts": 1716454218278586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236261, "dur": 12, "args": { "External id": 47015, "cbid": 211, "correlation": 47015 } }, { "ph": "s", "id": 47015, "pid": 76337, "tid": -914061504, "ts": 1716454218236261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218278641, "dur": 88, "args": { "External id": 47037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47037, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47037, "pid": 5, "tid": 7, "ts": 1716454218278641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236296, "dur": 8, "args": { "External id": 47037, "cbid": 211, "correlation": 47037 } }, { "ph": "s", "id": 47037, "pid": 76337, "tid": -914061504, "ts": 1716454218236296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236380, "dur": 1, "args": { "External id": 47048, "cbid": 251, "correlation": 47048 } }, { "ph": "f", "id": 47048, "pid": 76337, "tid": -914061504, "ts": 1716454218236380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218278730, "dur": 100, "args": { "External id": 47049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47049, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47049, "pid": 5, "tid": 7, "ts": 1716454218278730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236385, "dur": 13, "args": { "External id": 47049, "cbid": 211, "correlation": 47049 } }, { "ph": "s", "id": 47049, "pid": 76337, "tid": -914061504, "ts": 1716454218236385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236456, "dur": 1, "args": { "External id": 47060, "cbid": 251, "correlation": 47060 } }, { "ph": "f", "id": 47060, "pid": 76337, "tid": -914061504, "ts": 1716454218236456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236460, "dur": 0, "args": { "External id": 47061, "cbid": 251, "correlation": 47061 } }, { "ph": "f", "id": 47061, "pid": 76337, "tid": -914061504, "ts": 1716454218236460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218278832, "dur": 10, "args": { "External id": 47062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47062, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 47062, "pid": 5, "tid": 7, "ts": 1716454218278832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236461, "dur": 12, "args": { "External id": 47062, "cbid": 211, "correlation": 47062 } }, { "ph": "s", "id": 47062, "pid": 76337, "tid": -914061504, "ts": 1716454218236461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218278843, "dur": 5, "args": { "External id": 47064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47064, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 47064, "pid": 5, "tid": 7, "ts": 1716454218278843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236475, "dur": 6, "args": { "External id": 47064, "cbid": 211, "correlation": 47064 } }, { "ph": "s", "id": 47064, "pid": 76337, "tid": -914061504, "ts": 1716454218236475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236535, "dur": 1, "args": { "External id": 47075, "cbid": 251, "correlation": 47075 } }, { "ph": "f", "id": 47075, "pid": 76337, "tid": -914061504, "ts": 1716454218236535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236538, "dur": 0, "args": { "External id": 47076, "cbid": 251, "correlation": 47076 } }, { "ph": "f", "id": 47076, "pid": 76337, "tid": -914061504, "ts": 1716454218236538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218278850, "dur": 6, "args": { "External id": 47077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47077, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 47077, "pid": 5, "tid": 7, "ts": 1716454218278850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236540, "dur": 12, "args": { "External id": 47077, "cbid": 211, "correlation": 47077 } }, { "ph": "s", "id": 47077, "pid": 76337, "tid": -914061504, "ts": 1716454218236540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218278857, "dur": 3, "args": { "External id": 47079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47079, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 47079, "pid": 5, "tid": 7, "ts": 1716454218278857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236554, "dur": 5, "args": { "External id": 47079, "cbid": 211, "correlation": 47079 } }, { "ph": "s", "id": 47079, "pid": 76337, "tid": -914061504, "ts": 1716454218236554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218278862, "dur": 148, "args": { "External id": 47100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47100, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47100, "pid": 5, "tid": 7, "ts": 1716454218278862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236626, "dur": 14, "args": { "External id": 47100, "cbid": 211, "correlation": 47100 } }, { "ph": "s", "id": 47100, "pid": 76337, "tid": -914061504, "ts": 1716454218236626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236726, "dur": 1, "args": { "External id": 47118, "cbid": 251, "correlation": 47118 } }, { "ph": "f", "id": 47118, "pid": 76337, "tid": -914061504, "ts": 1716454218236726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218279011, "dur": 104, "args": { "External id": 47120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47120, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47120, "pid": 5, "tid": 7, "ts": 1716454218279011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236731, "dur": 13, "args": { "External id": 47120, "cbid": 211, "correlation": 47120 } }, { "ph": "s", "id": 47120, "pid": 76337, "tid": -914061504, "ts": 1716454218236731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218279116, "dur": 35, "args": { "External id": 47128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47128, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47128, "pid": 5, "tid": 7, "ts": 1716454218279116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236801, "dur": 11, "args": { "External id": 47128, "cbid": 211, "correlation": 47128 } }, { "ph": "s", "id": 47128, "pid": 76337, "tid": -914061504, "ts": 1716454218236801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218279152, "dur": 67, "args": { "External id": 47136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47136, "pid": 5, "tid": 7, "ts": 1716454218279152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236842, "dur": 9, "args": { "External id": 47136, "cbid": 211, "correlation": 47136 } }, { "ph": "s", "id": 47136, "pid": 76337, "tid": -914061504, "ts": 1716454218236842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218279221, "dur": 88, "args": { "External id": 47158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47158, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47158, "pid": 5, "tid": 7, "ts": 1716454218279221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236893, "dur": 9, "args": { "External id": 47158, "cbid": 211, "correlation": 47158 } }, { "ph": "s", "id": 47158, "pid": 76337, "tid": -914061504, "ts": 1716454218236893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218236985, "dur": 1, "args": { "External id": 47174, "cbid": 251, "correlation": 47174 } }, { "ph": "f", "id": 47174, "pid": 76337, "tid": -914061504, "ts": 1716454218236985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218279310, "dur": 556, "args": { "External id": 47176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47176, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47176, "pid": 5, "tid": 7, "ts": 1716454218279310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218236991, "dur": 14, "args": { "External id": 47176, "cbid": 211, "correlation": 47176 } }, { "ph": "s", "id": 47176, "pid": 76337, "tid": -914061504, "ts": 1716454218236991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218279867, "dur": 239, "args": { "External id": 47184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47184, "pid": 5, "tid": 7, "ts": 1716454218279867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237062, "dur": 13, "args": { "External id": 47184, "cbid": 211, "correlation": 47184 } }, { "ph": "s", "id": 47184, "pid": 76337, "tid": -914061504, "ts": 1716454218237062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218280108, "dur": 247, "args": { "External id": 47192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47192, "pid": 5, "tid": 7, "ts": 1716454218280108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237092, "dur": 9, "args": { "External id": 47192, "cbid": 211, "correlation": 47192 } }, { "ph": "s", "id": 47192, "pid": 76337, "tid": -914061504, "ts": 1716454218237092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237173, "dur": 1, "args": { "External id": 47208, "cbid": 251, "correlation": 47208 } }, { "ph": "f", "id": 47208, "pid": 76337, "tid": -914061504, "ts": 1716454218237173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237178, "dur": 0, "args": { "External id": 47210, "cbid": 251, "correlation": 47210 } }, { "ph": "f", "id": 47210, "pid": 76337, "tid": -914061504, "ts": 1716454218237178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218280356, "dur": 353, "args": { "External id": 47211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47211, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 47211, "pid": 5, "tid": 7, "ts": 1716454218280356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237181, "dur": 13, "args": { "External id": 47211, "cbid": 211, "correlation": 47211 } }, { "ph": "s", "id": 47211, "pid": 76337, "tid": -914061504, "ts": 1716454218237181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218280711, "dur": 50, "args": { "External id": 47219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47219, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47219, "pid": 5, "tid": 7, "ts": 1716454218280711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237223, "dur": 10, "args": { "External id": 47219, "cbid": 211, "correlation": 47219 } }, { "ph": "s", "id": 47219, "pid": 76337, "tid": -914061504, "ts": 1716454218237223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218280763, "dur": 151, "args": { "External id": 47230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47230, "pid": 5, "tid": 7, "ts": 1716454218280763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237290, "dur": 13, "args": { "External id": 47230, "cbid": 211, "correlation": 47230 } }, { "ph": "s", "id": 47230, "pid": 76337, "tid": -914061504, "ts": 1716454218237290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218237354, "dur": 0, "args": { "External id": 47242, "cbid": 317, "correlation": 47242 } }, { "ph": "f", "id": 47242, "pid": 76337, "tid": -914061504, "ts": 1716454218237354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218237355, "dur": 0, "args": { "External id": 47243, "cbid": 203, "correlation": 47243 } }, { "ph": "f", "id": 47243, "pid": 76337, "tid": -914061504, "ts": 1716454218237355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218237356, "dur": 0, "args": { "External id": 47244, "cbid": 205, "correlation": 47244 } }, { "ph": "f", "id": 47244, "pid": 76337, "tid": -914061504, "ts": 1716454218237356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237379, "dur": 1, "args": { "External id": 47248, "cbid": 251, "correlation": 47248 } }, { "ph": "f", "id": 47248, "pid": 76337, "tid": -914061504, "ts": 1716454218237379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237381, "dur": 0, "args": { "External id": 47249, "cbid": 251, "correlation": 47249 } }, { "ph": "f", "id": 47249, "pid": 76337, "tid": -914061504, "ts": 1716454218237381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237382, "dur": 0, "args": { "External id": 47250, "cbid": 251, "correlation": 47250 } }, { "ph": "f", "id": 47250, "pid": 76337, "tid": -914061504, "ts": 1716454218237382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237383, "dur": 0, "args": { "External id": 47251, "cbid": 251, "correlation": 47251 } }, { "ph": "f", "id": 47251, "pid": 76337, "tid": -914061504, "ts": 1716454218237383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237383, "dur": 0, "args": { "External id": 47252, "cbid": 251, "correlation": 47252 } }, { "ph": "f", "id": 47252, "pid": 76337, "tid": -914061504, "ts": 1716454218237383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237384, "dur": 0, "args": { "External id": 47253, "cbid": 251, "correlation": 47253 } }, { "ph": "f", "id": 47253, "pid": 76337, "tid": -914061504, "ts": 1716454218237384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237384, "dur": 0, "args": { "External id": 47254, "cbid": 251, "correlation": 47254 } }, { "ph": "f", "id": 47254, "pid": 76337, "tid": -914061504, "ts": 1716454218237384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237385, "dur": 0, "args": { "External id": 47255, "cbid": 251, "correlation": 47255 } }, { "ph": "f", "id": 47255, "pid": 76337, "tid": -914061504, "ts": 1716454218237385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218237387, "dur": 0, "args": { "External id": 47256, "cbid": 251, "correlation": 47256 } }, { "ph": "f", "id": 47256, "pid": 76337, "tid": -914061504, "ts": 1716454218237387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218280915, "dur": 109, "args": { "External id": 47257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47257, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47257, "pid": 5, "tid": 7, "ts": 1716454218280915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237389, "dur": 16, "args": { "External id": 47257, "cbid": 211, "correlation": 47257 } }, { "ph": "s", "id": 47257, "pid": 76337, "tid": -914061504, "ts": 1716454218237389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218281026, "dur": 59, "args": { "External id": 47263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47263, "pid": 5, "tid": 7, "ts": 1716454218281026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237427, "dur": 9, "args": { "External id": 47263, "cbid": 211, "correlation": 47263 } }, { "ph": "s", "id": 47263, "pid": 76337, "tid": -914061504, "ts": 1716454218237427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218281086, "dur": 50, "args": { "External id": 47271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47271, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47271, "pid": 5, "tid": 7, "ts": 1716454218281086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237460, "dur": 8, "args": { "External id": 47271, "cbid": 211, "correlation": 47271 } }, { "ph": "s", "id": 47271, "pid": 76337, "tid": -914061504, "ts": 1716454218237460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218237533, "dur": 0, "args": { "External id": 47281, "cbid": 317, "correlation": 47281 } }, { "ph": "f", "id": 47281, "pid": 76337, "tid": -914061504, "ts": 1716454218237533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218237534, "dur": 0, "args": { "External id": 47282, "cbid": 203, "correlation": 47282 } }, { "ph": "f", "id": 47282, "pid": 76337, "tid": -914061504, "ts": 1716454218237534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218237535, "dur": 0, "args": { "External id": 47283, "cbid": 205, "correlation": 47283 } }, { "ph": "f", "id": 47283, "pid": 76337, "tid": -914061504, "ts": 1716454218237535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218281137, "dur": 40, "args": { "External id": 47287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47287, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47287, "pid": 5, "tid": 7, "ts": 1716454218281137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237552, "dur": 13, "args": { "External id": 47287, "cbid": 211, "correlation": 47287 } }, { "ph": "s", "id": 47287, "pid": 76337, "tid": -914061504, "ts": 1716454218237552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218281178, "dur": 13, "args": { "External id": 47289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47289, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47289, "pid": 5, "tid": 7, "ts": 1716454218281178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237567, "dur": 5, "args": { "External id": 47289, "cbid": 211, "correlation": 47289 } }, { "ph": "s", "id": 47289, "pid": 76337, "tid": -914061504, "ts": 1716454218237567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218281194, "dur": 1, "args": { "External id": 47291, "device": 5, "context": 1, "stream": 7, "correlation": 47291, "bytes": 1536, "memory bandwidth (GB/s)": 0.9230769230769231 } }, { "ph": "f", "id": 47291, "pid": 5, "tid": 7, "ts": 1716454218281194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218237585, "dur": 16, "args": { "External id": 47291, "cbid": 51, "correlation": 47291 } }, { "ph": "s", "id": 47291, "pid": 76337, "tid": -914061504, "ts": 1716454218237585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218281197, "dur": 347, "args": { "External id": 47292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47292, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47292, "pid": 5, "tid": 7, "ts": 1716454218281197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237603, "dur": 10, "args": { "External id": 47292, "cbid": 211, "correlation": 47292 } }, { "ph": "s", "id": 47292, "pid": 76337, "tid": -914061504, "ts": 1716454218237603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218281546, "dur": 13, "args": { "External id": 47294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47294, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47294, "pid": 5, "tid": 7, "ts": 1716454218281546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237621, "dur": 7, "args": { "External id": 47294, "cbid": 211, "correlation": 47294 } }, { "ph": "s", "id": 47294, "pid": 76337, "tid": -914061504, "ts": 1716454218237621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218281560, "dur": 14, "args": { "External id": 47300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47300, "pid": 5, "tid": 7, "ts": 1716454218281560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237650, "dur": 9, "args": { "External id": 47300, "cbid": 211, "correlation": 47300 } }, { "ph": "s", "id": 47300, "pid": 76337, "tid": -914061504, "ts": 1716454218237650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218281575, "dur": 18, "args": { "External id": 47320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47320, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 47320, "pid": 5, "tid": 7, "ts": 1716454218281575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237743, "dur": 15, "args": { "External id": 47320, "cbid": 211, "correlation": 47320 } }, { "ph": "s", "id": 47320, "pid": 76337, "tid": -914061504, "ts": 1716454218237743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218281595, "dur": 5, "args": { "External id": 47332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47332, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 47332, "pid": 5, "tid": 7, "ts": 1716454218281595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237770, "dur": 6, "args": { "External id": 47332, "cbid": 211, "correlation": 47332 } }, { "ph": "s", "id": 47332, "pid": 76337, "tid": -914061504, "ts": 1716454218237770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218281601, "dur": 18, "args": { "External id": 47335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47335, "pid": 5, "tid": 7, "ts": 1716454218281601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237788, "dur": 6, "args": { "External id": 47335, "cbid": 211, "correlation": 47335 } }, { "ph": "s", "id": 47335, "pid": 76337, "tid": -914061504, "ts": 1716454218237788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218281620, "dur": 11, "args": { "External id": 47344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47344, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47344, "pid": 5, "tid": 7, "ts": 1716454218281620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237828, "dur": 9, "args": { "External id": 47344, "cbid": 211, "correlation": 47344 } }, { "ph": "s", "id": 47344, "pid": 76337, "tid": -914061504, "ts": 1716454218237828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218237883, "dur": 0, "args": { "External id": 47354, "cbid": 317, "correlation": 47354 } }, { "ph": "f", "id": 47354, "pid": 76337, "tid": -914061504, "ts": 1716454218237883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218237883, "dur": 0, "args": { "External id": 47355, "cbid": 203, "correlation": 47355 } }, { "ph": "f", "id": 47355, "pid": 76337, "tid": -914061504, "ts": 1716454218237883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218237884, "dur": 0, "args": { "External id": 47356, "cbid": 205, "correlation": 47356 } }, { "ph": "f", "id": 47356, "pid": 76337, "tid": -914061504, "ts": 1716454218237884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218281632, "dur": 11, "args": { "External id": 47360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47360, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47360, "pid": 5, "tid": 7, "ts": 1716454218281632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237899, "dur": 11, "args": { "External id": 47360, "cbid": 211, "correlation": 47360 } }, { "ph": "s", "id": 47360, "pid": 76337, "tid": -914061504, "ts": 1716454218237899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218281644, "dur": 23, "args": { "External id": 47362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47362, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47362, "pid": 5, "tid": 7, "ts": 1716454218281644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237913, "dur": 6, "args": { "External id": 47362, "cbid": 211, "correlation": 47362 } }, { "ph": "s", "id": 47362, "pid": 76337, "tid": -914061504, "ts": 1716454218237913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218281669, "dur": 3, "args": { "External id": 47364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 47364, "pid": 5, "tid": 7, "ts": 1716454218281669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237925, "dur": 6, "args": { "External id": 47364, "cbid": 211, "correlation": 47364 } }, { "ph": "s", "id": 47364, "pid": 76337, "tid": -914061504, "ts": 1716454218237925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218237934, "dur": 0, "args": { "External id": 47365, "cbid": 51, "correlation": 47365 } }, { "ph": "s", "id": 47365, "pid": 76337, "tid": -914061504, "ts": 1716454218237934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218281674, "dur": 345, "args": { "External id": 47366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47366, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47366, "pid": 5, "tid": 7, "ts": 1716454218281674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237935, "dur": 7, "args": { "External id": 47366, "cbid": 211, "correlation": 47366 } }, { "ph": "s", "id": 47366, "pid": 76337, "tid": -914061504, "ts": 1716454218237935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218282020, "dur": 19, "args": { "External id": 47367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47367, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47367, "pid": 5, "tid": 7, "ts": 1716454218282020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237945, "dur": 5, "args": { "External id": 47367, "cbid": 211, "correlation": 47367 } }, { "ph": "s", "id": 47367, "pid": 76337, "tid": -914061504, "ts": 1716454218237945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218282041, "dur": 31, "args": { "External id": 47373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47373, "pid": 5, "tid": 7, "ts": 1716454218282041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218237982, "dur": 9, "args": { "External id": 47373, "cbid": 211, "correlation": 47373 } }, { "ph": "s", "id": 47373, "pid": 76337, "tid": -914061504, "ts": 1716454218237982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218282073, "dur": 3, "args": { "External id": 47381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47381, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 47381, "pid": 5, "tid": 7, "ts": 1716454218282073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238026, "dur": 12, "args": { "External id": 47381, "cbid": 211, "correlation": 47381 } }, { "ph": "s", "id": 47381, "pid": 76337, "tid": -914061504, "ts": 1716454218238026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238096, "dur": 1, "args": { "External id": 47397, "cbid": 251, "correlation": 47397 } }, { "ph": "f", "id": 47397, "pid": 76337, "tid": -914061504, "ts": 1716454218238096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238102, "dur": 0, "args": { "External id": 47399, "cbid": 251, "correlation": 47399 } }, { "ph": "f", "id": 47399, "pid": 76337, "tid": -914061504, "ts": 1716454218238102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218282078, "dur": 12, "args": { "External id": 47400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47400, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 47400, "pid": 5, "tid": 7, "ts": 1716454218282078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238103, "dur": 12, "args": { "External id": 47400, "cbid": 211, "correlation": 47400 } }, { "ph": "s", "id": 47400, "pid": 76337, "tid": -914061504, "ts": 1716454218238103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218282091, "dur": 5, "args": { "External id": 47402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47402, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 47402, "pid": 5, "tid": 7, "ts": 1716454218282091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238117, "dur": 5, "args": { "External id": 47402, "cbid": 211, "correlation": 47402 } }, { "ph": "s", "id": 47402, "pid": 76337, "tid": -914061504, "ts": 1716454218238117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218282097, "dur": 29, "args": { "External id": 47412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47412, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47412, "pid": 5, "tid": 7, "ts": 1716454218282097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238175, "dur": 13, "args": { "External id": 47412, "cbid": 211, "correlation": 47412 } }, { "ph": "s", "id": 47412, "pid": 76337, "tid": -914061504, "ts": 1716454218238175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218282128, "dur": 30, "args": { "External id": 47432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47432, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 47432, "pid": 5, "tid": 7, "ts": 1716454218282128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238244, "dur": 11, "args": { "External id": 47432, "cbid": 211, "correlation": 47432 } }, { "ph": "s", "id": 47432, "pid": 76337, "tid": -914061504, "ts": 1716454218238244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218282159, "dur": 4, "args": { "External id": 47444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47444, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 47444, "pid": 5, "tid": 7, "ts": 1716454218282159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238265, "dur": 6, "args": { "External id": 47444, "cbid": 211, "correlation": 47444 } }, { "ph": "s", "id": 47444, "pid": 76337, "tid": -914061504, "ts": 1716454218238265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218282164, "dur": 30, "args": { "External id": 47447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47447, "pid": 5, "tid": 7, "ts": 1716454218282164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238283, "dur": 7, "args": { "External id": 47447, "cbid": 211, "correlation": 47447 } }, { "ph": "s", "id": 47447, "pid": 76337, "tid": -914061504, "ts": 1716454218238283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218282195, "dur": 20, "args": { "External id": 47456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47456, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47456, "pid": 5, "tid": 7, "ts": 1716454218282195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238327, "dur": 10, "args": { "External id": 47456, "cbid": 211, "correlation": 47456 } }, { "ph": "s", "id": 47456, "pid": 76337, "tid": -914061504, "ts": 1716454218238327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218238392, "dur": 0, "args": { "External id": 47466, "cbid": 317, "correlation": 47466 } }, { "ph": "f", "id": 47466, "pid": 76337, "tid": -914061504, "ts": 1716454218238392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218238393, "dur": 0, "args": { "External id": 47467, "cbid": 203, "correlation": 47467 } }, { "ph": "f", "id": 47467, "pid": 76337, "tid": -914061504, "ts": 1716454218238393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218238394, "dur": 0, "args": { "External id": 47468, "cbid": 205, "correlation": 47468 } }, { "ph": "f", "id": 47468, "pid": 76337, "tid": -914061504, "ts": 1716454218238394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218282216, "dur": 22, "args": { "External id": 47472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47472, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47472, "pid": 5, "tid": 7, "ts": 1716454218282216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238412, "dur": 12, "args": { "External id": 47472, "cbid": 211, "correlation": 47472 } }, { "ph": "s", "id": 47472, "pid": 76337, "tid": -914061504, "ts": 1716454218238412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218282240, "dur": 43, "args": { "External id": 47474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47474, "pid": 5, "tid": 7, "ts": 1716454218282240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238426, "dur": 5, "args": { "External id": 47474, "cbid": 211, "correlation": 47474 } }, { "ph": "s", "id": 47474, "pid": 76337, "tid": -914061504, "ts": 1716454218238426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218282284, "dur": 632, "args": { "External id": 47476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47476, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47476, "pid": 5, "tid": 7, "ts": 1716454218282284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238440, "dur": 9, "args": { "External id": 47476, "cbid": 211, "correlation": 47476 } }, { "ph": "s", "id": 47476, "pid": 76337, "tid": -914061504, "ts": 1716454218238440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218282917, "dur": 21, "args": { "External id": 47478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47478, "pid": 5, "tid": 7, "ts": 1716454218282917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238453, "dur": 5, "args": { "External id": 47478, "cbid": 211, "correlation": 47478 } }, { "ph": "s", "id": 47478, "pid": 76337, "tid": -914061504, "ts": 1716454218238453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218282939, "dur": 32, "args": { "External id": 47484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47484, "pid": 5, "tid": 7, "ts": 1716454218282939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238480, "dur": 9, "args": { "External id": 47484, "cbid": 211, "correlation": 47484 } }, { "ph": "s", "id": 47484, "pid": 76337, "tid": -914061504, "ts": 1716454218238480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218238539, "dur": 0, "args": { "External id": 47494, "cbid": 317, "correlation": 47494 } }, { "ph": "f", "id": 47494, "pid": 76337, "tid": -914061504, "ts": 1716454218238539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218238540, "dur": 0, "args": { "External id": 47495, "cbid": 203, "correlation": 47495 } }, { "ph": "f", "id": 47495, "pid": 76337, "tid": -914061504, "ts": 1716454218238540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218238541, "dur": 0, "args": { "External id": 47496, "cbid": 205, "correlation": 47496 } }, { "ph": "f", "id": 47496, "pid": 76337, "tid": -914061504, "ts": 1716454218238541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238563, "dur": 1, "args": { "External id": 47500, "cbid": 251, "correlation": 47500 } }, { "ph": "f", "id": 47500, "pid": 76337, "tid": -914061504, "ts": 1716454218238563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238565, "dur": 0, "args": { "External id": 47501, "cbid": 251, "correlation": 47501 } }, { "ph": "f", "id": 47501, "pid": 76337, "tid": -914061504, "ts": 1716454218238565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238566, "dur": 0, "args": { "External id": 47502, "cbid": 251, "correlation": 47502 } }, { "ph": "f", "id": 47502, "pid": 76337, "tid": -914061504, "ts": 1716454218238566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238567, "dur": 0, "args": { "External id": 47503, "cbid": 251, "correlation": 47503 } }, { "ph": "f", "id": 47503, "pid": 76337, "tid": -914061504, "ts": 1716454218238567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238567, "dur": 0, "args": { "External id": 47504, "cbid": 251, "correlation": 47504 } }, { "ph": "f", "id": 47504, "pid": 76337, "tid": -914061504, "ts": 1716454218238567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238568, "dur": 0, "args": { "External id": 47505, "cbid": 251, "correlation": 47505 } }, { "ph": "f", "id": 47505, "pid": 76337, "tid": -914061504, "ts": 1716454218238568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238569, "dur": 0, "args": { "External id": 47506, "cbid": 251, "correlation": 47506 } }, { "ph": "f", "id": 47506, "pid": 76337, "tid": -914061504, "ts": 1716454218238569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238570, "dur": 0, "args": { "External id": 47507, "cbid": 251, "correlation": 47507 } }, { "ph": "f", "id": 47507, "pid": 76337, "tid": -914061504, "ts": 1716454218238570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218238571, "dur": 0, "args": { "External id": 47508, "cbid": 251, "correlation": 47508 } }, { "ph": "f", "id": 47508, "pid": 76337, "tid": -914061504, "ts": 1716454218238571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218282972, "dur": 50, "args": { "External id": 47509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47509, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47509, "pid": 5, "tid": 7, "ts": 1716454218282972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238573, "dur": 12, "args": { "External id": 47509, "cbid": 211, "correlation": 47509 } }, { "ph": "s", "id": 47509, "pid": 76337, "tid": -914061504, "ts": 1716454218238573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218283023, "dur": 31, "args": { "External id": 47515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47515, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47515, "pid": 5, "tid": 7, "ts": 1716454218283023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238606, "dur": 8, "args": { "External id": 47515, "cbid": 211, "correlation": 47515 } }, { "ph": "s", "id": 47515, "pid": 76337, "tid": -914061504, "ts": 1716454218238606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218283056, "dur": 27, "args": { "External id": 47523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47523, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47523, "pid": 5, "tid": 7, "ts": 1716454218283056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238638, "dur": 8, "args": { "External id": 47523, "cbid": 211, "correlation": 47523 } }, { "ph": "s", "id": 47523, "pid": 76337, "tid": -914061504, "ts": 1716454218238638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218283085, "dur": 19, "args": { "External id": 47531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47531, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47531, "pid": 5, "tid": 7, "ts": 1716454218283085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238668, "dur": 8, "args": { "External id": 47531, "cbid": 211, "correlation": 47531 } }, { "ph": "s", "id": 47531, "pid": 76337, "tid": -914061504, "ts": 1716454218238668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218283105, "dur": 29, "args": { "External id": 47551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47551, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 47551, "pid": 5, "tid": 7, "ts": 1716454218283105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238749, "dur": 13, "args": { "External id": 47551, "cbid": 211, "correlation": 47551 } }, { "ph": "s", "id": 47551, "pid": 76337, "tid": -914061504, "ts": 1716454218238749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218283136, "dur": 4, "args": { "External id": 47563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47563, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 47563, "pid": 5, "tid": 7, "ts": 1716454218283136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238772, "dur": 6, "args": { "External id": 47563, "cbid": 211, "correlation": 47563 } }, { "ph": "s", "id": 47563, "pid": 76337, "tid": -914061504, "ts": 1716454218238772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218283141, "dur": 29, "args": { "External id": 47566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47566, "pid": 5, "tid": 7, "ts": 1716454218283141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238790, "dur": 6, "args": { "External id": 47566, "cbid": 211, "correlation": 47566 } }, { "ph": "s", "id": 47566, "pid": 76337, "tid": -914061504, "ts": 1716454218238790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218238848, "dur": 0, "args": { "External id": 47577, "cbid": 317, "correlation": 47577 } }, { "ph": "f", "id": 47577, "pid": 76337, "tid": -914061504, "ts": 1716454218238848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218238849, "dur": 0, "args": { "External id": 47578, "cbid": 203, "correlation": 47578 } }, { "ph": "f", "id": 47578, "pid": 76337, "tid": -914061504, "ts": 1716454218238849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218238849, "dur": 0, "args": { "External id": 47579, "cbid": 205, "correlation": 47579 } }, { "ph": "f", "id": 47579, "pid": 76337, "tid": -914061504, "ts": 1716454218238849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218283172, "dur": 22, "args": { "External id": 47583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47583, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47583, "pid": 5, "tid": 7, "ts": 1716454218283172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238865, "dur": 12, "args": { "External id": 47583, "cbid": 211, "correlation": 47583 } }, { "ph": "s", "id": 47583, "pid": 76337, "tid": -914061504, "ts": 1716454218238865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218283195, "dur": 115, "args": { "External id": 47585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47585, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47585, "pid": 5, "tid": 7, "ts": 1716454218283195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238885, "dur": 8, "args": { "External id": 47585, "cbid": 211, "correlation": 47585 } }, { "ph": "s", "id": 47585, "pid": 76337, "tid": -914061504, "ts": 1716454218238885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218283311, "dur": 22, "args": { "External id": 47587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47587, "pid": 5, "tid": 7, "ts": 1716454218283311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238896, "dur": 5, "args": { "External id": 47587, "cbid": 211, "correlation": 47587 } }, { "ph": "s", "id": 47587, "pid": 76337, "tid": -914061504, "ts": 1716454218238896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218283334, "dur": 31, "args": { "External id": 47593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47593, "pid": 5, "tid": 7, "ts": 1716454218283334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218238928, "dur": 9, "args": { "External id": 47593, "cbid": 211, "correlation": 47593 } }, { "ph": "s", "id": 47593, "pid": 76337, "tid": -914061504, "ts": 1716454218238928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218283366, "dur": 163, "args": { "External id": 47602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47602, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47602, "pid": 5, "tid": 7, "ts": 1716454218283366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239020, "dur": 15, "args": { "External id": 47602, "cbid": 211, "correlation": 47602 } }, { "ph": "s", "id": 47602, "pid": 76337, "tid": -914061504, "ts": 1716454218239020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218283531, "dur": 61, "args": { "External id": 47624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47624, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47624, "pid": 5, "tid": 7, "ts": 1716454218283531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239078, "dur": 10, "args": { "External id": 47624, "cbid": 211, "correlation": 47624 } }, { "ph": "s", "id": 47624, "pid": 76337, "tid": -914061504, "ts": 1716454218239078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239166, "dur": 1, "args": { "External id": 47635, "cbid": 251, "correlation": 47635 } }, { "ph": "f", "id": 47635, "pid": 76337, "tid": -914061504, "ts": 1716454218239166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218283594, "dur": 149, "args": { "External id": 47636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47636, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47636, "pid": 5, "tid": 7, "ts": 1716454218283594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239172, "dur": 14, "args": { "External id": 47636, "cbid": 211, "correlation": 47636 } }, { "ph": "s", "id": 47636, "pid": 76337, "tid": -914061504, "ts": 1716454218239172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239243, "dur": 1, "args": { "External id": 47647, "cbid": 251, "correlation": 47647 } }, { "ph": "f", "id": 47647, "pid": 76337, "tid": -914061504, "ts": 1716454218239243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218283744, "dur": 142, "args": { "External id": 47648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47648, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47648, "pid": 5, "tid": 7, "ts": 1716454218283744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239247, "dur": 11, "args": { "External id": 47648, "cbid": 211, "correlation": 47648 } }, { "ph": "s", "id": 47648, "pid": 76337, "tid": -914061504, "ts": 1716454218239247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239311, "dur": 1, "args": { "External id": 47659, "cbid": 251, "correlation": 47659 } }, { "ph": "f", "id": 47659, "pid": 76337, "tid": -914061504, "ts": 1716454218239311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218283888, "dur": 142, "args": { "External id": 47660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47660, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47660, "pid": 5, "tid": 7, "ts": 1716454218283888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239315, "dur": 11, "args": { "External id": 47660, "cbid": 211, "correlation": 47660 } }, { "ph": "s", "id": 47660, "pid": 76337, "tid": -914061504, "ts": 1716454218239315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218284031, "dur": 1853, "args": { "External id": 47681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47681, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47681, "pid": 5, "tid": 7, "ts": 1716454218284031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239401, "dur": 14, "args": { "External id": 47681, "cbid": 211, "correlation": 47681 } }, { "ph": "s", "id": 47681, "pid": 76337, "tid": -914061504, "ts": 1716454218239401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239503, "dur": 1, "args": { "External id": 47699, "cbid": 251, "correlation": 47699 } }, { "ph": "f", "id": 47699, "pid": 76337, "tid": -914061504, "ts": 1716454218239503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218285885, "dur": 142, "args": { "External id": 47701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47701, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 47701, "pid": 5, "tid": 7, "ts": 1716454218285885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239509, "dur": 13, "args": { "External id": 47701, "cbid": 211, "correlation": 47701 } }, { "ph": "s", "id": 47701, "pid": 76337, "tid": -914061504, "ts": 1716454218239509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218286029, "dur": 35, "args": { "External id": 47709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47709, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47709, "pid": 5, "tid": 7, "ts": 1716454218286029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239579, "dur": 12, "args": { "External id": 47709, "cbid": 211, "correlation": 47709 } }, { "ph": "s", "id": 47709, "pid": 76337, "tid": -914061504, "ts": 1716454218239579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218286065, "dur": 51, "args": { "External id": 47717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47717, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47717, "pid": 5, "tid": 7, "ts": 1716454218286065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239618, "dur": 8, "args": { "External id": 47717, "cbid": 211, "correlation": 47717 } }, { "ph": "s", "id": 47717, "pid": 76337, "tid": -914061504, "ts": 1716454218239618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218286118, "dur": 29, "args": { "External id": 47728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47728, "pid": 5, "tid": 7, "ts": 1716454218286118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239690, "dur": 13, "args": { "External id": 47728, "cbid": 211, "correlation": 47728 } }, { "ph": "s", "id": 47728, "pid": 76337, "tid": -914061504, "ts": 1716454218239690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218286149, "dur": 33, "args": { "External id": 47750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47750, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47750, "pid": 5, "tid": 7, "ts": 1716454218286149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239725, "dur": 7, "args": { "External id": 47750, "cbid": 211, "correlation": 47750 } }, { "ph": "s", "id": 47750, "pid": 76337, "tid": -914061504, "ts": 1716454218239725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239810, "dur": 1, "args": { "External id": 47761, "cbid": 251, "correlation": 47761 } }, { "ph": "f", "id": 47761, "pid": 76337, "tid": -914061504, "ts": 1716454218239810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218286183, "dur": 75, "args": { "External id": 47762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47762, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47762, "pid": 5, "tid": 7, "ts": 1716454218286183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239815, "dur": 13, "args": { "External id": 47762, "cbid": 211, "correlation": 47762 } }, { "ph": "s", "id": 47762, "pid": 76337, "tid": -914061504, "ts": 1716454218239815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239883, "dur": 1, "args": { "External id": 47773, "cbid": 251, "correlation": 47773 } }, { "ph": "f", "id": 47773, "pid": 76337, "tid": -914061504, "ts": 1716454218239883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239887, "dur": 0, "args": { "External id": 47774, "cbid": 251, "correlation": 47774 } }, { "ph": "f", "id": 47774, "pid": 76337, "tid": -914061504, "ts": 1716454218239887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218286260, "dur": 11, "args": { "External id": 47775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47775, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 47775, "pid": 5, "tid": 7, "ts": 1716454218286260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239889, "dur": 12, "args": { "External id": 47775, "cbid": 211, "correlation": 47775 } }, { "ph": "s", "id": 47775, "pid": 76337, "tid": -914061504, "ts": 1716454218239889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218286272, "dur": 5, "args": { "External id": 47777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47777, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 47777, "pid": 5, "tid": 7, "ts": 1716454218286272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239904, "dur": 8, "args": { "External id": 47777, "cbid": 211, "correlation": 47777 } }, { "ph": "s", "id": 47777, "pid": 76337, "tid": -914061504, "ts": 1716454218239904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239963, "dur": 1, "args": { "External id": 47788, "cbid": 251, "correlation": 47788 } }, { "ph": "f", "id": 47788, "pid": 76337, "tid": -914061504, "ts": 1716454218239963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218239967, "dur": 0, "args": { "External id": 47789, "cbid": 251, "correlation": 47789 } }, { "ph": "f", "id": 47789, "pid": 76337, "tid": -914061504, "ts": 1716454218239967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218286278, "dur": 7, "args": { "External id": 47790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47790, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 47790, "pid": 5, "tid": 7, "ts": 1716454218286278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239969, "dur": 19, "args": { "External id": 47790, "cbid": 211, "correlation": 47790 } }, { "ph": "s", "id": 47790, "pid": 76337, "tid": -914061504, "ts": 1716454218239969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218286286, "dur": 4, "args": { "External id": 47792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47792, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 47792, "pid": 5, "tid": 7, "ts": 1716454218286286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218239989, "dur": 5, "args": { "External id": 47792, "cbid": 211, "correlation": 47792 } }, { "ph": "s", "id": 47792, "pid": 76337, "tid": -914061504, "ts": 1716454218239989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218286291, "dur": 88, "args": { "External id": 47813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47813, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 47813, "pid": 5, "tid": 7, "ts": 1716454218286291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240063, "dur": 13, "args": { "External id": 47813, "cbid": 211, "correlation": 47813 } }, { "ph": "s", "id": 47813, "pid": 76337, "tid": -914061504, "ts": 1716454218240063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218240164, "dur": 1, "args": { "External id": 47831, "cbid": 251, "correlation": 47831 } }, { "ph": "f", "id": 47831, "pid": 76337, "tid": -914061504, "ts": 1716454218240164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218286380, "dur": 93, "args": { "External id": 47833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47833, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47833, "pid": 5, "tid": 7, "ts": 1716454218286380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240170, "dur": 14, "args": { "External id": 47833, "cbid": 211, "correlation": 47833 } }, { "ph": "s", "id": 47833, "pid": 76337, "tid": -914061504, "ts": 1716454218240170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218286475, "dur": 19, "args": { "External id": 47841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47841, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47841, "pid": 5, "tid": 7, "ts": 1716454218286475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240239, "dur": 12, "args": { "External id": 47841, "cbid": 211, "correlation": 47841 } }, { "ph": "s", "id": 47841, "pid": 76337, "tid": -914061504, "ts": 1716454218240239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218286495, "dur": 36, "args": { "External id": 47849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47849, "pid": 5, "tid": 7, "ts": 1716454218286495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240282, "dur": 9, "args": { "External id": 47849, "cbid": 211, "correlation": 47849 } }, { "ph": "s", "id": 47849, "pid": 76337, "tid": -914061504, "ts": 1716454218240282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218286533, "dur": 33, "args": { "External id": 47871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47871, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47871, "pid": 5, "tid": 7, "ts": 1716454218286533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240333, "dur": 10, "args": { "External id": 47871, "cbid": 211, "correlation": 47871 } }, { "ph": "s", "id": 47871, "pid": 76337, "tid": -914061504, "ts": 1716454218240333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218240422, "dur": 1, "args": { "External id": 47887, "cbid": 251, "correlation": 47887 } }, { "ph": "f", "id": 47887, "pid": 76337, "tid": -914061504, "ts": 1716454218240422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218240426, "dur": 0, "args": { "External id": 47889, "cbid": 251, "correlation": 47889 } }, { "ph": "f", "id": 47889, "pid": 76337, "tid": -914061504, "ts": 1716454218240426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218286568, "dur": 522, "args": { "External id": 47890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47890, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 47890, "pid": 5, "tid": 7, "ts": 1716454218286568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240430, "dur": 14, "args": { "External id": 47890, "cbid": 211, "correlation": 47890 } }, { "ph": "s", "id": 47890, "pid": 76337, "tid": -914061504, "ts": 1716454218240430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218287092, "dur": 122, "args": { "External id": 47898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47898, "pid": 5, "tid": 7, "ts": 1716454218287092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240496, "dur": 15, "args": { "External id": 47898, "cbid": 211, "correlation": 47898 } }, { "ph": "s", "id": 47898, "pid": 76337, "tid": -914061504, "ts": 1716454218240496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218287215, "dur": 126, "args": { "External id": 47906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47906, "pid": 5, "tid": 7, "ts": 1716454218287215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240529, "dur": 9, "args": { "External id": 47906, "cbid": 211, "correlation": 47906 } }, { "ph": "s", "id": 47906, "pid": 76337, "tid": -914061504, "ts": 1716454218240529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218240610, "dur": 1, "args": { "External id": 47922, "cbid": 251, "correlation": 47922 } }, { "ph": "f", "id": 47922, "pid": 76337, "tid": -914061504, "ts": 1716454218240610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218287342, "dur": 291, "args": { "External id": 47924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47924, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47924, "pid": 5, "tid": 7, "ts": 1716454218287342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240616, "dur": 12, "args": { "External id": 47924, "cbid": 211, "correlation": 47924 } }, { "ph": "s", "id": 47924, "pid": 76337, "tid": -914061504, "ts": 1716454218240616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218287634, "dur": 27, "args": { "External id": 47932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47932, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47932, "pid": 5, "tid": 7, "ts": 1716454218287634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240659, "dur": 9, "args": { "External id": 47932, "cbid": 211, "correlation": 47932 } }, { "ph": "s", "id": 47932, "pid": 76337, "tid": -914061504, "ts": 1716454218240659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218287662, "dur": 78, "args": { "External id": 47943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47943, "pid": 5, "tid": 7, "ts": 1716454218287662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240726, "dur": 12, "args": { "External id": 47943, "cbid": 211, "correlation": 47943 } }, { "ph": "s", "id": 47943, "pid": 76337, "tid": -914061504, "ts": 1716454218240726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218240790, "dur": 0, "args": { "External id": 47955, "cbid": 317, "correlation": 47955 } }, { "ph": "f", "id": 47955, "pid": 76337, "tid": -914061504, "ts": 1716454218240790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218240790, "dur": 0, "args": { "External id": 47956, "cbid": 203, "correlation": 47956 } }, { "ph": "f", "id": 47956, "pid": 76337, "tid": -914061504, "ts": 1716454218240790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218240791, "dur": 0, "args": { "External id": 47957, "cbid": 205, "correlation": 47957 } }, { "ph": "f", "id": 47957, "pid": 76337, "tid": -914061504, "ts": 1716454218240791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218287742, "dur": 22, "args": { "External id": 47961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47961, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47961, "pid": 5, "tid": 7, "ts": 1716454218287742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240806, "dur": 12, "args": { "External id": 47961, "cbid": 211, "correlation": 47961 } }, { "ph": "s", "id": 47961, "pid": 76337, "tid": -914061504, "ts": 1716454218240806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218287765, "dur": 114, "args": { "External id": 47963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47963, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 47963, "pid": 5, "tid": 7, "ts": 1716454218287765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240826, "dur": 6, "args": { "External id": 47963, "cbid": 211, "correlation": 47963 } }, { "ph": "s", "id": 47963, "pid": 76337, "tid": -914061504, "ts": 1716454218240826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218287881, "dur": 22, "args": { "External id": 47965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47965, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47965, "pid": 5, "tid": 7, "ts": 1716454218287881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240836, "dur": 8, "args": { "External id": 47965, "cbid": 211, "correlation": 47965 } }, { "ph": "s", "id": 47965, "pid": 76337, "tid": -914061504, "ts": 1716454218240836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218287904, "dur": 31, "args": { "External id": 47971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47971, "pid": 5, "tid": 7, "ts": 1716454218287904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240867, "dur": 9, "args": { "External id": 47971, "cbid": 211, "correlation": 47971 } }, { "ph": "s", "id": 47971, "pid": 76337, "tid": -914061504, "ts": 1716454218240867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218287937, "dur": 27, "args": { "External id": 47979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47979, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 47979, "pid": 5, "tid": 7, "ts": 1716454218287937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240899, "dur": 8, "args": { "External id": 47979, "cbid": 211, "correlation": 47979 } }, { "ph": "s", "id": 47979, "pid": 76337, "tid": -914061504, "ts": 1716454218240899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218287965, "dur": 29, "args": { "External id": 47999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 47999, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 47999, "pid": 5, "tid": 7, "ts": 1716454218287965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218240971, "dur": 19, "args": { "External id": 47999, "cbid": 211, "correlation": 47999 } }, { "ph": "s", "id": 47999, "pid": 76337, "tid": -914061504, "ts": 1716454218240971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218287996, "dur": 5, "args": { "External id": 48011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48011, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 48011, "pid": 5, "tid": 7, "ts": 1716454218287996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241001, "dur": 6, "args": { "External id": 48011, "cbid": 211, "correlation": 48011 } }, { "ph": "s", "id": 48011, "pid": 76337, "tid": -914061504, "ts": 1716454218241001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218288001, "dur": 30, "args": { "External id": 48014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48014, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48014, "pid": 5, "tid": 7, "ts": 1716454218288001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241018, "dur": 7, "args": { "External id": 48014, "cbid": 211, "correlation": 48014 } }, { "ph": "s", "id": 48014, "pid": 76337, "tid": -914061504, "ts": 1716454218241018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218288033, "dur": 22, "args": { "External id": 48023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48023, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48023, "pid": 5, "tid": 7, "ts": 1716454218288033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241058, "dur": 10, "args": { "External id": 48023, "cbid": 211, "correlation": 48023 } }, { "ph": "s", "id": 48023, "pid": 76337, "tid": -914061504, "ts": 1716454218241058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218241110, "dur": 0, "args": { "External id": 48033, "cbid": 317, "correlation": 48033 } }, { "ph": "f", "id": 48033, "pid": 76337, "tid": -914061504, "ts": 1716454218241110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218241111, "dur": 0, "args": { "External id": 48034, "cbid": 203, "correlation": 48034 } }, { "ph": "f", "id": 48034, "pid": 76337, "tid": -914061504, "ts": 1716454218241111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218241112, "dur": 0, "args": { "External id": 48035, "cbid": 205, "correlation": 48035 } }, { "ph": "f", "id": 48035, "pid": 76337, "tid": -914061504, "ts": 1716454218241112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218288056, "dur": 21, "args": { "External id": 48039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48039, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48039, "pid": 5, "tid": 7, "ts": 1716454218288056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241125, "dur": 14, "args": { "External id": 48039, "cbid": 211, "correlation": 48039 } }, { "ph": "s", "id": 48039, "pid": 76337, "tid": -914061504, "ts": 1716454218241125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218288078, "dur": 42, "args": { "External id": 48041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48041, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48041, "pid": 5, "tid": 7, "ts": 1716454218288078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241143, "dur": 5, "args": { "External id": 48041, "cbid": 211, "correlation": 48041 } }, { "ph": "s", "id": 48041, "pid": 76337, "tid": -914061504, "ts": 1716454218241143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218288121, "dur": 633, "args": { "External id": 48043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48043, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48043, "pid": 5, "tid": 7, "ts": 1716454218288121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241155, "dur": 6, "args": { "External id": 48043, "cbid": 211, "correlation": 48043 } }, { "ph": "s", "id": 48043, "pid": 76337, "tid": -914061504, "ts": 1716454218241155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218288755, "dur": 20, "args": { "External id": 48045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48045, "pid": 5, "tid": 7, "ts": 1716454218288755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241164, "dur": 5, "args": { "External id": 48045, "cbid": 211, "correlation": 48045 } }, { "ph": "s", "id": 48045, "pid": 76337, "tid": -914061504, "ts": 1716454218241164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218288776, "dur": 32, "args": { "External id": 48051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48051, "pid": 5, "tid": 7, "ts": 1716454218288776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241192, "dur": 9, "args": { "External id": 48051, "cbid": 211, "correlation": 48051 } }, { "ph": "s", "id": 48051, "pid": 76337, "tid": -914061504, "ts": 1716454218241192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218288810, "dur": 3, "args": { "External id": 48059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48059, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 48059, "pid": 5, "tid": 7, "ts": 1716454218288810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241235, "dur": 10, "args": { "External id": 48059, "cbid": 211, "correlation": 48059 } }, { "ph": "s", "id": 48059, "pid": 76337, "tid": -914061504, "ts": 1716454218241235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218241300, "dur": 1, "args": { "External id": 48075, "cbid": 251, "correlation": 48075 } }, { "ph": "f", "id": 48075, "pid": 76337, "tid": -914061504, "ts": 1716454218241300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218241305, "dur": 0, "args": { "External id": 48077, "cbid": 251, "correlation": 48077 } }, { "ph": "f", "id": 48077, "pid": 76337, "tid": -914061504, "ts": 1716454218241305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218288815, "dur": 12, "args": { "External id": 48078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48078, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 48078, "pid": 5, "tid": 7, "ts": 1716454218288815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241307, "dur": 11, "args": { "External id": 48078, "cbid": 211, "correlation": 48078 } }, { "ph": "s", "id": 48078, "pid": 76337, "tid": -914061504, "ts": 1716454218241307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218288828, "dur": 5, "args": { "External id": 48080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48080, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 48080, "pid": 5, "tid": 7, "ts": 1716454218288828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241320, "dur": 5, "args": { "External id": 48080, "cbid": 211, "correlation": 48080 } }, { "ph": "s", "id": 48080, "pid": 76337, "tid": -914061504, "ts": 1716454218241320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218288834, "dur": 29, "args": { "External id": 48090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48090, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48090, "pid": 5, "tid": 7, "ts": 1716454218288834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241377, "dur": 12, "args": { "External id": 48090, "cbid": 211, "correlation": 48090 } }, { "ph": "s", "id": 48090, "pid": 76337, "tid": -914061504, "ts": 1716454218241377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218288865, "dur": 29, "args": { "External id": 48110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48110, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 48110, "pid": 5, "tid": 7, "ts": 1716454218288865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241446, "dur": 11, "args": { "External id": 48110, "cbid": 211, "correlation": 48110 } }, { "ph": "s", "id": 48110, "pid": 76337, "tid": -914061504, "ts": 1716454218241446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218288895, "dur": 4, "args": { "External id": 48122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48122, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 48122, "pid": 5, "tid": 7, "ts": 1716454218288895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241466, "dur": 6, "args": { "External id": 48122, "cbid": 211, "correlation": 48122 } }, { "ph": "s", "id": 48122, "pid": 76337, "tid": -914061504, "ts": 1716454218241466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218288900, "dur": 30, "args": { "External id": 48125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48125, "pid": 5, "tid": 7, "ts": 1716454218288900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241488, "dur": 7, "args": { "External id": 48125, "cbid": 211, "correlation": 48125 } }, { "ph": "s", "id": 48125, "pid": 76337, "tid": -914061504, "ts": 1716454218241488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218288931, "dur": 20, "args": { "External id": 48134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48134, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48134, "pid": 5, "tid": 7, "ts": 1716454218288931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241529, "dur": 9, "args": { "External id": 48134, "cbid": 211, "correlation": 48134 } }, { "ph": "s", "id": 48134, "pid": 76337, "tid": -914061504, "ts": 1716454218241529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218241591, "dur": 0, "args": { "External id": 48144, "cbid": 317, "correlation": 48144 } }, { "ph": "f", "id": 48144, "pid": 76337, "tid": -914061504, "ts": 1716454218241591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218241592, "dur": 0, "args": { "External id": 48145, "cbid": 203, "correlation": 48145 } }, { "ph": "f", "id": 48145, "pid": 76337, "tid": -914061504, "ts": 1716454218241592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218241593, "dur": 0, "args": { "External id": 48146, "cbid": 205, "correlation": 48146 } }, { "ph": "f", "id": 48146, "pid": 76337, "tid": -914061504, "ts": 1716454218241593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218288952, "dur": 22, "args": { "External id": 48150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48150, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48150, "pid": 5, "tid": 7, "ts": 1716454218288952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241607, "dur": 12, "args": { "External id": 48150, "cbid": 211, "correlation": 48150 } }, { "ph": "s", "id": 48150, "pid": 76337, "tid": -914061504, "ts": 1716454218241607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218288975, "dur": 42, "args": { "External id": 48152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48152, "pid": 5, "tid": 7, "ts": 1716454218288975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241621, "dur": 5, "args": { "External id": 48152, "cbid": 211, "correlation": 48152 } }, { "ph": "s", "id": 48152, "pid": 76337, "tid": -914061504, "ts": 1716454218241621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218289019, "dur": 627, "args": { "External id": 48154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48154, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48154, "pid": 5, "tid": 7, "ts": 1716454218289019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241632, "dur": 6, "args": { "External id": 48154, "cbid": 211, "correlation": 48154 } }, { "ph": "s", "id": 48154, "pid": 76337, "tid": -914061504, "ts": 1716454218241632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218289647, "dur": 23, "args": { "External id": 48156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48156, "pid": 5, "tid": 7, "ts": 1716454218289647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241642, "dur": 5, "args": { "External id": 48156, "cbid": 211, "correlation": 48156 } }, { "ph": "s", "id": 48156, "pid": 76337, "tid": -914061504, "ts": 1716454218241642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218289671, "dur": 31, "args": { "External id": 48162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48162, "pid": 5, "tid": 7, "ts": 1716454218289671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241670, "dur": 8, "args": { "External id": 48162, "cbid": 211, "correlation": 48162 } }, { "ph": "s", "id": 48162, "pid": 76337, "tid": -914061504, "ts": 1716454218241670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218289704, "dur": 27, "args": { "External id": 48170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48170, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48170, "pid": 5, "tid": 7, "ts": 1716454218289704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241706, "dur": 9, "args": { "External id": 48170, "cbid": 211, "correlation": 48170 } }, { "ph": "s", "id": 48170, "pid": 76337, "tid": -914061504, "ts": 1716454218241706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218289732, "dur": 20, "args": { "External id": 48178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48178, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48178, "pid": 5, "tid": 7, "ts": 1716454218289732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241736, "dur": 8, "args": { "External id": 48178, "cbid": 211, "correlation": 48178 } }, { "ph": "s", "id": 48178, "pid": 76337, "tid": -914061504, "ts": 1716454218241736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218289753, "dur": 29, "args": { "External id": 48198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48198, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 48198, "pid": 5, "tid": 7, "ts": 1716454218289753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241815, "dur": 12, "args": { "External id": 48198, "cbid": 211, "correlation": 48198 } }, { "ph": "s", "id": 48198, "pid": 76337, "tid": -914061504, "ts": 1716454218241815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218289783, "dur": 4, "args": { "External id": 48210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48210, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 48210, "pid": 5, "tid": 7, "ts": 1716454218289783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241837, "dur": 6, "args": { "External id": 48210, "cbid": 211, "correlation": 48210 } }, { "ph": "s", "id": 48210, "pid": 76337, "tid": -914061504, "ts": 1716454218241837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218289788, "dur": 29, "args": { "External id": 48213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48213, "pid": 5, "tid": 7, "ts": 1716454218289788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241855, "dur": 6, "args": { "External id": 48213, "cbid": 211, "correlation": 48213 } }, { "ph": "s", "id": 48213, "pid": 76337, "tid": -914061504, "ts": 1716454218241855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218241912, "dur": 0, "args": { "External id": 48224, "cbid": 317, "correlation": 48224 } }, { "ph": "f", "id": 48224, "pid": 76337, "tid": -914061504, "ts": 1716454218241912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218241913, "dur": 0, "args": { "External id": 48225, "cbid": 203, "correlation": 48225 } }, { "ph": "f", "id": 48225, "pid": 76337, "tid": -914061504, "ts": 1716454218241913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218241914, "dur": 0, "args": { "External id": 48226, "cbid": 205, "correlation": 48226 } }, { "ph": "f", "id": 48226, "pid": 76337, "tid": -914061504, "ts": 1716454218241914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218289818, "dur": 22, "args": { "External id": 48230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48230, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48230, "pid": 5, "tid": 7, "ts": 1716454218289818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241927, "dur": 12, "args": { "External id": 48230, "cbid": 211, "correlation": 48230 } }, { "ph": "s", "id": 48230, "pid": 76337, "tid": -914061504, "ts": 1716454218241927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218289841, "dur": 112, "args": { "External id": 48232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48232, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48232, "pid": 5, "tid": 7, "ts": 1716454218289841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241945, "dur": 6, "args": { "External id": 48232, "cbid": 211, "correlation": 48232 } }, { "ph": "s", "id": 48232, "pid": 76337, "tid": -914061504, "ts": 1716454218241945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218289954, "dur": 21, "args": { "External id": 48234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48234, "pid": 5, "tid": 7, "ts": 1716454218289954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241954, "dur": 8, "args": { "External id": 48234, "cbid": 211, "correlation": 48234 } }, { "ph": "s", "id": 48234, "pid": 76337, "tid": -914061504, "ts": 1716454218241954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218289976, "dur": 32, "args": { "External id": 48240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48240, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48240, "pid": 5, "tid": 7, "ts": 1716454218289976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218241993, "dur": 10, "args": { "External id": 48240, "cbid": 211, "correlation": 48240 } }, { "ph": "s", "id": 48240, "pid": 76337, "tid": -914061504, "ts": 1716454218241993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218290009, "dur": 179, "args": { "External id": 48249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48249, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48249, "pid": 5, "tid": 7, "ts": 1716454218290009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242077, "dur": 14, "args": { "External id": 48249, "cbid": 211, "correlation": 48249 } }, { "ph": "s", "id": 48249, "pid": 76337, "tid": -914061504, "ts": 1716454218242077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218290190, "dur": 62, "args": { "External id": 48271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48271, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48271, "pid": 5, "tid": 7, "ts": 1716454218290190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242135, "dur": 10, "args": { "External id": 48271, "cbid": 211, "correlation": 48271 } }, { "ph": "s", "id": 48271, "pid": 76337, "tid": -914061504, "ts": 1716454218242135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242222, "dur": 1, "args": { "External id": 48282, "cbid": 251, "correlation": 48282 } }, { "ph": "f", "id": 48282, "pid": 76337, "tid": -914061504, "ts": 1716454218242222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218290253, "dur": 147, "args": { "External id": 48283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48283, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48283, "pid": 5, "tid": 7, "ts": 1716454218290253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242228, "dur": 13, "args": { "External id": 48283, "cbid": 211, "correlation": 48283 } }, { "ph": "s", "id": 48283, "pid": 76337, "tid": -914061504, "ts": 1716454218242228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242298, "dur": 1, "args": { "External id": 48294, "cbid": 251, "correlation": 48294 } }, { "ph": "f", "id": 48294, "pid": 76337, "tid": -914061504, "ts": 1716454218242298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218290401, "dur": 138, "args": { "External id": 48295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48295, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48295, "pid": 5, "tid": 7, "ts": 1716454218290401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242302, "dur": 11, "args": { "External id": 48295, "cbid": 211, "correlation": 48295 } }, { "ph": "s", "id": 48295, "pid": 76337, "tid": -914061504, "ts": 1716454218242302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242368, "dur": 1, "args": { "External id": 48306, "cbid": 251, "correlation": 48306 } }, { "ph": "f", "id": 48306, "pid": 76337, "tid": -914061504, "ts": 1716454218242368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218290541, "dur": 142, "args": { "External id": 48307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48307, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48307, "pid": 5, "tid": 7, "ts": 1716454218290541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242372, "dur": 12, "args": { "External id": 48307, "cbid": 211, "correlation": 48307 } }, { "ph": "s", "id": 48307, "pid": 76337, "tid": -914061504, "ts": 1716454218242372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218290684, "dur": 1854, "args": { "External id": 48328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48328, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 48328, "pid": 5, "tid": 7, "ts": 1716454218290684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242455, "dur": 13, "args": { "External id": 48328, "cbid": 211, "correlation": 48328 } }, { "ph": "s", "id": 48328, "pid": 76337, "tid": -914061504, "ts": 1716454218242455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242553, "dur": 1, "args": { "External id": 48346, "cbid": 251, "correlation": 48346 } }, { "ph": "f", "id": 48346, "pid": 76337, "tid": -914061504, "ts": 1716454218242553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218292539, "dur": 143, "args": { "External id": 48348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48348, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 48348, "pid": 5, "tid": 7, "ts": 1716454218292539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242559, "dur": 13, "args": { "External id": 48348, "cbid": 211, "correlation": 48348 } }, { "ph": "s", "id": 48348, "pid": 76337, "tid": -914061504, "ts": 1716454218242559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218292683, "dur": 35, "args": { "External id": 48356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48356, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48356, "pid": 5, "tid": 7, "ts": 1716454218292683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242629, "dur": 13, "args": { "External id": 48356, "cbid": 211, "correlation": 48356 } }, { "ph": "s", "id": 48356, "pid": 76337, "tid": -914061504, "ts": 1716454218242629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218292720, "dur": 50, "args": { "External id": 48364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48364, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48364, "pid": 5, "tid": 7, "ts": 1716454218292720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242668, "dur": 8, "args": { "External id": 48364, "cbid": 211, "correlation": 48364 } }, { "ph": "s", "id": 48364, "pid": 76337, "tid": -914061504, "ts": 1716454218242668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218292771, "dur": 30, "args": { "External id": 48375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48375, "pid": 5, "tid": 7, "ts": 1716454218292771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242739, "dur": 12, "args": { "External id": 48375, "cbid": 211, "correlation": 48375 } }, { "ph": "s", "id": 48375, "pid": 76337, "tid": -914061504, "ts": 1716454218242739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218292803, "dur": 33, "args": { "External id": 48397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48397, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48397, "pid": 5, "tid": 7, "ts": 1716454218292803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242773, "dur": 7, "args": { "External id": 48397, "cbid": 211, "correlation": 48397 } }, { "ph": "s", "id": 48397, "pid": 76337, "tid": -914061504, "ts": 1716454218242773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242858, "dur": 1, "args": { "External id": 48408, "cbid": 251, "correlation": 48408 } }, { "ph": "f", "id": 48408, "pid": 76337, "tid": -914061504, "ts": 1716454218242858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218292837, "dur": 86, "args": { "External id": 48409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48409, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48409, "pid": 5, "tid": 7, "ts": 1716454218292837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242863, "dur": 13, "args": { "External id": 48409, "cbid": 211, "correlation": 48409 } }, { "ph": "s", "id": 48409, "pid": 76337, "tid": -914061504, "ts": 1716454218242863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242930, "dur": 1, "args": { "External id": 48420, "cbid": 251, "correlation": 48420 } }, { "ph": "f", "id": 48420, "pid": 76337, "tid": -914061504, "ts": 1716454218242930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218242934, "dur": 0, "args": { "External id": 48421, "cbid": 251, "correlation": 48421 } }, { "ph": "f", "id": 48421, "pid": 76337, "tid": -914061504, "ts": 1716454218242934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218292925, "dur": 11, "args": { "External id": 48422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48422, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 48422, "pid": 5, "tid": 7, "ts": 1716454218292925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242936, "dur": 12, "args": { "External id": 48422, "cbid": 211, "correlation": 48422 } }, { "ph": "s", "id": 48422, "pid": 76337, "tid": -914061504, "ts": 1716454218242936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218292938, "dur": 5, "args": { "External id": 48424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48424, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 48424, "pid": 5, "tid": 7, "ts": 1716454218292938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218242950, "dur": 6, "args": { "External id": 48424, "cbid": 211, "correlation": 48424 } }, { "ph": "s", "id": 48424, "pid": 76337, "tid": -914061504, "ts": 1716454218242950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243017, "dur": 1, "args": { "External id": 48435, "cbid": 251, "correlation": 48435 } }, { "ph": "f", "id": 48435, "pid": 76337, "tid": -914061504, "ts": 1716454218243017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243020, "dur": 0, "args": { "External id": 48436, "cbid": 251, "correlation": 48436 } }, { "ph": "f", "id": 48436, "pid": 76337, "tid": -914061504, "ts": 1716454218243020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218292944, "dur": 7, "args": { "External id": 48437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48437, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 48437, "pid": 5, "tid": 7, "ts": 1716454218292944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243022, "dur": 12, "args": { "External id": 48437, "cbid": 211, "correlation": 48437 } }, { "ph": "s", "id": 48437, "pid": 76337, "tid": -914061504, "ts": 1716454218243022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218292952, "dur": 3, "args": { "External id": 48439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48439, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 48439, "pid": 5, "tid": 7, "ts": 1716454218292952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243035, "dur": 5, "args": { "External id": 48439, "cbid": 211, "correlation": 48439 } }, { "ph": "s", "id": 48439, "pid": 76337, "tid": -914061504, "ts": 1716454218243035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218292957, "dur": 88, "args": { "External id": 48460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48460, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 48460, "pid": 5, "tid": 7, "ts": 1716454218292957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243110, "dur": 12, "args": { "External id": 48460, "cbid": 211, "correlation": 48460 } }, { "ph": "s", "id": 48460, "pid": 76337, "tid": -914061504, "ts": 1716454218243110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243211, "dur": 1, "args": { "External id": 48478, "cbid": 251, "correlation": 48478 } }, { "ph": "f", "id": 48478, "pid": 76337, "tid": -914061504, "ts": 1716454218243211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218293046, "dur": 95, "args": { "External id": 48480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48480, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48480, "pid": 5, "tid": 7, "ts": 1716454218293046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243218, "dur": 13, "args": { "External id": 48480, "cbid": 211, "correlation": 48480 } }, { "ph": "s", "id": 48480, "pid": 76337, "tid": -914061504, "ts": 1716454218243218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218293143, "dur": 19, "args": { "External id": 48488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48488, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48488, "pid": 5, "tid": 7, "ts": 1716454218293143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243286, "dur": 12, "args": { "External id": 48488, "cbid": 211, "correlation": 48488 } }, { "ph": "s", "id": 48488, "pid": 76337, "tid": -914061504, "ts": 1716454218243286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218293162, "dur": 36, "args": { "External id": 48496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48496, "pid": 5, "tid": 7, "ts": 1716454218293162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243327, "dur": 9, "args": { "External id": 48496, "cbid": 211, "correlation": 48496 } }, { "ph": "s", "id": 48496, "pid": 76337, "tid": -914061504, "ts": 1716454218243327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218293200, "dur": 34, "args": { "External id": 48518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48518, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48518, "pid": 5, "tid": 7, "ts": 1716454218293200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243380, "dur": 10, "args": { "External id": 48518, "cbid": 211, "correlation": 48518 } }, { "ph": "s", "id": 48518, "pid": 76337, "tid": -914061504, "ts": 1716454218243380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243468, "dur": 1, "args": { "External id": 48534, "cbid": 251, "correlation": 48534 } }, { "ph": "f", "id": 48534, "pid": 76337, "tid": -914061504, "ts": 1716454218243468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243473, "dur": 0, "args": { "External id": 48536, "cbid": 251, "correlation": 48536 } }, { "ph": "f", "id": 48536, "pid": 76337, "tid": -914061504, "ts": 1716454218243473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218293235, "dur": 522, "args": { "External id": 48537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48537, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 48537, "pid": 5, "tid": 7, "ts": 1716454218293235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243477, "dur": 13, "args": { "External id": 48537, "cbid": 211, "correlation": 48537 } }, { "ph": "s", "id": 48537, "pid": 76337, "tid": -914061504, "ts": 1716454218243477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218293758, "dur": 122, "args": { "External id": 48545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48545, "pid": 5, "tid": 7, "ts": 1716454218293758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243545, "dur": 13, "args": { "External id": 48545, "cbid": 211, "correlation": 48545 } }, { "ph": "s", "id": 48545, "pid": 76337, "tid": -914061504, "ts": 1716454218243545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218293882, "dur": 126, "args": { "External id": 48553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48553, "pid": 5, "tid": 7, "ts": 1716454218293882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243575, "dur": 9, "args": { "External id": 48553, "cbid": 211, "correlation": 48553 } }, { "ph": "s", "id": 48553, "pid": 76337, "tid": -914061504, "ts": 1716454218243575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218243654, "dur": 1, "args": { "External id": 48569, "cbid": 251, "correlation": 48569 } }, { "ph": "f", "id": 48569, "pid": 76337, "tid": -914061504, "ts": 1716454218243654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218294009, "dur": 296, "args": { "External id": 48571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48571, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48571, "pid": 5, "tid": 7, "ts": 1716454218294009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243659, "dur": 12, "args": { "External id": 48571, "cbid": 211, "correlation": 48571 } }, { "ph": "s", "id": 48571, "pid": 76337, "tid": -914061504, "ts": 1716454218243659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218294306, "dur": 28, "args": { "External id": 48579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48579, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48579, "pid": 5, "tid": 7, "ts": 1716454218294306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243701, "dur": 10, "args": { "External id": 48579, "cbid": 211, "correlation": 48579 } }, { "ph": "s", "id": 48579, "pid": 76337, "tid": -914061504, "ts": 1716454218243701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218294335, "dur": 78, "args": { "External id": 48590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48590, "pid": 5, "tid": 7, "ts": 1716454218294335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243768, "dur": 12, "args": { "External id": 48590, "cbid": 211, "correlation": 48590 } }, { "ph": "s", "id": 48590, "pid": 76337, "tid": -914061504, "ts": 1716454218243768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218243831, "dur": 0, "args": { "External id": 48602, "cbid": 317, "correlation": 48602 } }, { "ph": "f", "id": 48602, "pid": 76337, "tid": -914061504, "ts": 1716454218243831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218243832, "dur": 0, "args": { "External id": 48603, "cbid": 203, "correlation": 48603 } }, { "ph": "f", "id": 48603, "pid": 76337, "tid": -914061504, "ts": 1716454218243832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218243833, "dur": 0, "args": { "External id": 48604, "cbid": 205, "correlation": 48604 } }, { "ph": "f", "id": 48604, "pid": 76337, "tid": -914061504, "ts": 1716454218243833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294414, "dur": 23, "args": { "External id": 48608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48608, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48608, "pid": 5, "tid": 7, "ts": 1716454218294414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243848, "dur": 12, "args": { "External id": 48608, "cbid": 211, "correlation": 48608 } }, { "ph": "s", "id": 48608, "pid": 76337, "tid": -914061504, "ts": 1716454218243848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218294439, "dur": 115, "args": { "External id": 48610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48610, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48610, "pid": 5, "tid": 7, "ts": 1716454218294439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243867, "dur": 6, "args": { "External id": 48610, "cbid": 211, "correlation": 48610 } }, { "ph": "s", "id": 48610, "pid": 76337, "tid": -914061504, "ts": 1716454218243867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294555, "dur": 21, "args": { "External id": 48612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48612, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48612, "pid": 5, "tid": 7, "ts": 1716454218294555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243877, "dur": 8, "args": { "External id": 48612, "cbid": 211, "correlation": 48612 } }, { "ph": "s", "id": 48612, "pid": 76337, "tid": -914061504, "ts": 1716454218243877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218294578, "dur": 32, "args": { "External id": 48618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48618, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48618, "pid": 5, "tid": 7, "ts": 1716454218294578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243908, "dur": 9, "args": { "External id": 48618, "cbid": 211, "correlation": 48618 } }, { "ph": "s", "id": 48618, "pid": 76337, "tid": -914061504, "ts": 1716454218243908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218294611, "dur": 27, "args": { "External id": 48626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48626, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48626, "pid": 5, "tid": 7, "ts": 1716454218294611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218243941, "dur": 9, "args": { "External id": 48626, "cbid": 211, "correlation": 48626 } }, { "ph": "s", "id": 48626, "pid": 76337, "tid": -914061504, "ts": 1716454218243941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218244021, "dur": 0, "args": { "External id": 48636, "cbid": 317, "correlation": 48636 } }, { "ph": "f", "id": 48636, "pid": 76337, "tid": -914061504, "ts": 1716454218244021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218244022, "dur": 0, "args": { "External id": 48637, "cbid": 203, "correlation": 48637 } }, { "ph": "f", "id": 48637, "pid": 76337, "tid": -914061504, "ts": 1716454218244022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218244023, "dur": 0, "args": { "External id": 48638, "cbid": 205, "correlation": 48638 } }, { "ph": "f", "id": 48638, "pid": 76337, "tid": -914061504, "ts": 1716454218244023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294640, "dur": 22, "args": { "External id": 48642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48642, "pid": 5, "tid": 7, "ts": 1716454218294640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244041, "dur": 14, "args": { "External id": 48642, "cbid": 211, "correlation": 48642 } }, { "ph": "s", "id": 48642, "pid": 76337, "tid": -914061504, "ts": 1716454218244041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294663, "dur": 43, "args": { "External id": 48644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48644, "pid": 5, "tid": 7, "ts": 1716454218294663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244057, "dur": 6, "args": { "External id": 48644, "cbid": 211, "correlation": 48644 } }, { "ph": "s", "id": 48644, "pid": 76337, "tid": -914061504, "ts": 1716454218244057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218294707, "dur": 227, "args": { "External id": 48646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48646, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 48646, "pid": 5, "tid": 7, "ts": 1716454218294707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244069, "dur": 7, "args": { "External id": 48646, "cbid": 211, "correlation": 48646 } }, { "ph": "s", "id": 48646, "pid": 76337, "tid": -914061504, "ts": 1716454218244069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294936, "dur": 6, "args": { "External id": 48648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48648, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48648, "pid": 5, "tid": 7, "ts": 1716454218294936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244080, "dur": 5, "args": { "External id": 48648, "cbid": 211, "correlation": 48648 } }, { "ph": "s", "id": 48648, "pid": 76337, "tid": -914061504, "ts": 1716454218244080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218294943, "dur": 9, "args": { "External id": 48654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48654, "pid": 5, "tid": 7, "ts": 1716454218294943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244108, "dur": 8, "args": { "External id": 48654, "cbid": 211, "correlation": 48654 } }, { "ph": "s", "id": 48654, "pid": 76337, "tid": -914061504, "ts": 1716454218244108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218294954, "dur": 12, "args": { "External id": 48674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48674, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 48674, "pid": 5, "tid": 7, "ts": 1716454218294954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244200, "dur": 13, "args": { "External id": 48674, "cbid": 211, "correlation": 48674 } }, { "ph": "s", "id": 48674, "pid": 76337, "tid": -914061504, "ts": 1716454218244200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218294967, "dur": 4, "args": { "External id": 48686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48686, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 48686, "pid": 5, "tid": 7, "ts": 1716454218294967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244226, "dur": 7, "args": { "External id": 48686, "cbid": 211, "correlation": 48686 } }, { "ph": "s", "id": 48686, "pid": 76337, "tid": -914061504, "ts": 1716454218244226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218294972, "dur": 11, "args": { "External id": 48689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48689, "pid": 5, "tid": 7, "ts": 1716454218294972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244244, "dur": 7, "args": { "External id": 48689, "cbid": 211, "correlation": 48689 } }, { "ph": "s", "id": 48689, "pid": 76337, "tid": -914061504, "ts": 1716454218244244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218294984, "dur": 7, "args": { "External id": 48698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48698, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48698, "pid": 5, "tid": 7, "ts": 1716454218294984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244285, "dur": 9, "args": { "External id": 48698, "cbid": 211, "correlation": 48698 } }, { "ph": "s", "id": 48698, "pid": 76337, "tid": -914061504, "ts": 1716454218244285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218244337, "dur": 0, "args": { "External id": 48708, "cbid": 317, "correlation": 48708 } }, { "ph": "f", "id": 48708, "pid": 76337, "tid": -914061504, "ts": 1716454218244337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218244337, "dur": 0, "args": { "External id": 48709, "cbid": 203, "correlation": 48709 } }, { "ph": "f", "id": 48709, "pid": 76337, "tid": -914061504, "ts": 1716454218244337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218244338, "dur": 0, "args": { "External id": 48710, "cbid": 205, "correlation": 48710 } }, { "ph": "f", "id": 48710, "pid": 76337, "tid": -914061504, "ts": 1716454218244338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294992, "dur": 5, "args": { "External id": 48714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48714, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48714, "pid": 5, "tid": 7, "ts": 1716454218294992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244354, "dur": 12, "args": { "External id": 48714, "cbid": 211, "correlation": 48714 } }, { "ph": "s", "id": 48714, "pid": 76337, "tid": -914061504, "ts": 1716454218244354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218294999, "dur": 80, "args": { "External id": 48716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48716, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48716, "pid": 5, "tid": 7, "ts": 1716454218294999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244369, "dur": 5, "args": { "External id": 48716, "cbid": 211, "correlation": 48716 } }, { "ph": "s", "id": 48716, "pid": 76337, "tid": -914061504, "ts": 1716454218244369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218295081, "dur": 1, "args": { "External id": 48718, "device": 5, "context": 1, "stream": 7, "correlation": 48718, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 48718, "pid": 5, "tid": 7, "ts": 1716454218295081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218244382, "dur": 8, "args": { "External id": 48718, "cbid": 51, "correlation": 48718 } }, { "ph": "s", "id": 48718, "pid": 76337, "tid": -914061504, "ts": 1716454218244382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218295084, "dur": 530, "args": { "External id": 48719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48719, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48719, "pid": 5, "tid": 7, "ts": 1716454218295084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244392, "dur": 8, "args": { "External id": 48719, "cbid": 211, "correlation": 48719 } }, { "ph": "s", "id": 48719, "pid": 76337, "tid": -914061504, "ts": 1716454218244392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218295615, "dur": 12, "args": { "External id": 48721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48721, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48721, "pid": 5, "tid": 7, "ts": 1716454218295615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244403, "dur": 5, "args": { "External id": 48721, "cbid": 211, "correlation": 48721 } }, { "ph": "s", "id": 48721, "pid": 76337, "tid": -914061504, "ts": 1716454218244403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218295629, "dur": 14, "args": { "External id": 48727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48727, "pid": 5, "tid": 7, "ts": 1716454218295629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244431, "dur": 8, "args": { "External id": 48727, "cbid": 211, "correlation": 48727 } }, { "ph": "s", "id": 48727, "pid": 76337, "tid": -914061504, "ts": 1716454218244431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218295644, "dur": 3, "args": { "External id": 48735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48735, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 48735, "pid": 5, "tid": 7, "ts": 1716454218295644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244475, "dur": 9, "args": { "External id": 48735, "cbid": 211, "correlation": 48735 } }, { "ph": "s", "id": 48735, "pid": 76337, "tid": -914061504, "ts": 1716454218244475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218244543, "dur": 1, "args": { "External id": 48751, "cbid": 251, "correlation": 48751 } }, { "ph": "f", "id": 48751, "pid": 76337, "tid": -914061504, "ts": 1716454218244543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218244548, "dur": 0, "args": { "External id": 48753, "cbid": 251, "correlation": 48753 } }, { "ph": "f", "id": 48753, "pid": 76337, "tid": -914061504, "ts": 1716454218244548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218295649, "dur": 13, "args": { "External id": 48754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48754, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48754, "pid": 5, "tid": 7, "ts": 1716454218295649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244550, "dur": 11, "args": { "External id": 48754, "cbid": 211, "correlation": 48754 } }, { "ph": "s", "id": 48754, "pid": 76337, "tid": -914061504, "ts": 1716454218244550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218295663, "dur": 5, "args": { "External id": 48756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48756, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48756, "pid": 5, "tid": 7, "ts": 1716454218295663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244563, "dur": 5, "args": { "External id": 48756, "cbid": 211, "correlation": 48756 } }, { "ph": "s", "id": 48756, "pid": 76337, "tid": -914061504, "ts": 1716454218244563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218295670, "dur": 16, "args": { "External id": 48766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48766, "pid": 5, "tid": 7, "ts": 1716454218295670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244620, "dur": 12, "args": { "External id": 48766, "cbid": 211, "correlation": 48766 } }, { "ph": "s", "id": 48766, "pid": 76337, "tid": -914061504, "ts": 1716454218244620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218295687, "dur": 17, "args": { "External id": 48786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48786, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 48786, "pid": 5, "tid": 7, "ts": 1716454218295687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244686, "dur": 11, "args": { "External id": 48786, "cbid": 211, "correlation": 48786 } }, { "ph": "s", "id": 48786, "pid": 76337, "tid": -914061504, "ts": 1716454218244686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218295706, "dur": 5, "args": { "External id": 48798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48798, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 48798, "pid": 5, "tid": 7, "ts": 1716454218295706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244707, "dur": 6, "args": { "External id": 48798, "cbid": 211, "correlation": 48798 } }, { "ph": "s", "id": 48798, "pid": 76337, "tid": -914061504, "ts": 1716454218244707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218295712, "dur": 16, "args": { "External id": 48801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48801, "pid": 5, "tid": 7, "ts": 1716454218295712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244725, "dur": 6, "args": { "External id": 48801, "cbid": 211, "correlation": 48801 } }, { "ph": "s", "id": 48801, "pid": 76337, "tid": -914061504, "ts": 1716454218244725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218295729, "dur": 11, "args": { "External id": 48810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48810, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48810, "pid": 5, "tid": 7, "ts": 1716454218295729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244765, "dur": 12, "args": { "External id": 48810, "cbid": 211, "correlation": 48810 } }, { "ph": "s", "id": 48810, "pid": 76337, "tid": -914061504, "ts": 1716454218244765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218244831, "dur": 0, "args": { "External id": 48820, "cbid": 317, "correlation": 48820 } }, { "ph": "f", "id": 48820, "pid": 76337, "tid": -914061504, "ts": 1716454218244831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218244832, "dur": 0, "args": { "External id": 48821, "cbid": 203, "correlation": 48821 } }, { "ph": "f", "id": 48821, "pid": 76337, "tid": -914061504, "ts": 1716454218244832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218244832, "dur": 0, "args": { "External id": 48822, "cbid": 205, "correlation": 48822 } }, { "ph": "f", "id": 48822, "pid": 76337, "tid": -914061504, "ts": 1716454218244832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218295741, "dur": 11, "args": { "External id": 48826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48826, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48826, "pid": 5, "tid": 7, "ts": 1716454218295741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244853, "dur": 12, "args": { "External id": 48826, "cbid": 211, "correlation": 48826 } }, { "ph": "s", "id": 48826, "pid": 76337, "tid": -914061504, "ts": 1716454218244853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218295753, "dur": 158, "args": { "External id": 48828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48828, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48828, "pid": 5, "tid": 7, "ts": 1716454218295753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244868, "dur": 5, "args": { "External id": 48828, "cbid": 211, "correlation": 48828 } }, { "ph": "s", "id": 48828, "pid": 76337, "tid": -914061504, "ts": 1716454218244868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218295913, "dur": 1, "args": { "External id": 48830, "device": 5, "context": 1, "stream": 7, "correlation": 48830, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 48830, "pid": 5, "tid": 7, "ts": 1716454218295913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218244880, "dur": 7, "args": { "External id": 48830, "cbid": 51, "correlation": 48830 } }, { "ph": "s", "id": 48830, "pid": 76337, "tid": -914061504, "ts": 1716454218244880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218295917, "dur": 650, "args": { "External id": 48831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48831, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48831, "pid": 5, "tid": 7, "ts": 1716454218295917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244888, "dur": 7, "args": { "External id": 48831, "cbid": 211, "correlation": 48831 } }, { "ph": "s", "id": 48831, "pid": 76337, "tid": -914061504, "ts": 1716454218244888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218296569, "dur": 13, "args": { "External id": 48833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48833, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48833, "pid": 5, "tid": 7, "ts": 1716454218296569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244900, "dur": 5, "args": { "External id": 48833, "cbid": 211, "correlation": 48833 } }, { "ph": "s", "id": 48833, "pid": 76337, "tid": -914061504, "ts": 1716454218244900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218296583, "dur": 14, "args": { "External id": 48839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48839, "pid": 5, "tid": 7, "ts": 1716454218296583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218244929, "dur": 8, "args": { "External id": 48839, "cbid": 211, "correlation": 48839 } }, { "ph": "s", "id": 48839, "pid": 76337, "tid": -914061504, "ts": 1716454218244929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218244996, "dur": 0, "args": { "External id": 48849, "cbid": 317, "correlation": 48849 } }, { "ph": "f", "id": 48849, "pid": 76337, "tid": -914061504, "ts": 1716454218244996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218244997, "dur": 0, "args": { "External id": 48850, "cbid": 203, "correlation": 48850 } }, { "ph": "f", "id": 48850, "pid": 76337, "tid": -914061504, "ts": 1716454218244997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218244998, "dur": 0, "args": { "External id": 48851, "cbid": 205, "correlation": 48851 } }, { "ph": "f", "id": 48851, "pid": 76337, "tid": -914061504, "ts": 1716454218244998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218296598, "dur": 8, "args": { "External id": 48855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48855, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48855, "pid": 5, "tid": 7, "ts": 1716454218296598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245019, "dur": 12, "args": { "External id": 48855, "cbid": 211, "correlation": 48855 } }, { "ph": "s", "id": 48855, "pid": 76337, "tid": -914061504, "ts": 1716454218245019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218296608, "dur": 3, "args": { "External id": 48857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 48857, "pid": 5, "tid": 7, "ts": 1716454218296608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245037, "dur": 6, "args": { "External id": 48857, "cbid": 211, "correlation": 48857 } }, { "ph": "s", "id": 48857, "pid": 76337, "tid": -914061504, "ts": 1716454218245037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218245047, "dur": 0, "args": { "External id": 48858, "cbid": 51, "correlation": 48858 } }, { "ph": "s", "id": 48858, "pid": 76337, "tid": -914061504, "ts": 1716454218245047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218296612, "dur": 55, "args": { "External id": 48859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48859, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 48859, "pid": 5, "tid": 7, "ts": 1716454218296612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245048, "dur": 6, "args": { "External id": 48859, "cbid": 211, "correlation": 48859 } }, { "ph": "s", "id": 48859, "pid": 76337, "tid": -914061504, "ts": 1716454218245048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218296668, "dur": 13, "args": { "External id": 48864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48864, "pid": 5, "tid": 7, "ts": 1716454218296668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245074, "dur": 8, "args": { "External id": 48864, "cbid": 211, "correlation": 48864 } }, { "ph": "s", "id": 48864, "pid": 76337, "tid": -914061504, "ts": 1716454218245074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218296683, "dur": 12, "args": { "External id": 48872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48872, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48872, "pid": 5, "tid": 7, "ts": 1716454218296683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245106, "dur": 8, "args": { "External id": 48872, "cbid": 211, "correlation": 48872 } }, { "ph": "s", "id": 48872, "pid": 76337, "tid": -914061504, "ts": 1716454218245106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218296696, "dur": 10, "args": { "External id": 48880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48880, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48880, "pid": 5, "tid": 7, "ts": 1716454218296696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245136, "dur": 8, "args": { "External id": 48880, "cbid": 211, "correlation": 48880 } }, { "ph": "s", "id": 48880, "pid": 76337, "tid": -914061504, "ts": 1716454218245136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218296708, "dur": 18, "args": { "External id": 48900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48900, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 48900, "pid": 5, "tid": 7, "ts": 1716454218296708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245216, "dur": 13, "args": { "External id": 48900, "cbid": 211, "correlation": 48900 } }, { "ph": "s", "id": 48900, "pid": 76337, "tid": -914061504, "ts": 1716454218245216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218296727, "dur": 4, "args": { "External id": 48912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48912, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 48912, "pid": 5, "tid": 7, "ts": 1716454218296727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245238, "dur": 6, "args": { "External id": 48912, "cbid": 211, "correlation": 48912 } }, { "ph": "s", "id": 48912, "pid": 76337, "tid": -914061504, "ts": 1716454218245238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218296733, "dur": 18, "args": { "External id": 48915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48915, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48915, "pid": 5, "tid": 7, "ts": 1716454218296733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245255, "dur": 7, "args": { "External id": 48915, "cbid": 211, "correlation": 48915 } }, { "ph": "s", "id": 48915, "pid": 76337, "tid": -914061504, "ts": 1716454218245255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218245312, "dur": 0, "args": { "External id": 48926, "cbid": 317, "correlation": 48926 } }, { "ph": "f", "id": 48926, "pid": 76337, "tid": -914061504, "ts": 1716454218245312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218245313, "dur": 0, "args": { "External id": 48927, "cbid": 203, "correlation": 48927 } }, { "ph": "f", "id": 48927, "pid": 76337, "tid": -914061504, "ts": 1716454218245313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218245314, "dur": 0, "args": { "External id": 48928, "cbid": 205, "correlation": 48928 } }, { "ph": "f", "id": 48928, "pid": 76337, "tid": -914061504, "ts": 1716454218245314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218296752, "dur": 12, "args": { "External id": 48932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48932, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48932, "pid": 5, "tid": 7, "ts": 1716454218296752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245333, "dur": 12, "args": { "External id": 48932, "cbid": 211, "correlation": 48932 } }, { "ph": "s", "id": 48932, "pid": 76337, "tid": -914061504, "ts": 1716454218245333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218296766, "dur": 3, "args": { "External id": 48934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 48934, "pid": 5, "tid": 7, "ts": 1716454218296766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245349, "dur": 6, "args": { "External id": 48934, "cbid": 211, "correlation": 48934 } }, { "ph": "s", "id": 48934, "pid": 76337, "tid": -914061504, "ts": 1716454218245349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218245359, "dur": 0, "args": { "External id": 48935, "cbid": 51, "correlation": 48935 } }, { "ph": "s", "id": 48935, "pid": 76337, "tid": -914061504, "ts": 1716454218245359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218296770, "dur": 95, "args": { "External id": 48936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48936, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 48936, "pid": 5, "tid": 7, "ts": 1716454218296770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245360, "dur": 7, "args": { "External id": 48936, "cbid": 211, "correlation": 48936 } }, { "ph": "s", "id": 48936, "pid": 76337, "tid": -914061504, "ts": 1716454218245360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218296866, "dur": 15, "args": { "External id": 48941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48941, "pid": 5, "tid": 7, "ts": 1716454218296866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245390, "dur": 9, "args": { "External id": 48941, "cbid": 211, "correlation": 48941 } }, { "ph": "s", "id": 48941, "pid": 76337, "tid": -914061504, "ts": 1716454218245390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218296882, "dur": 82, "args": { "External id": 48950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48950, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48950, "pid": 5, "tid": 7, "ts": 1716454218296882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245472, "dur": 14, "args": { "External id": 48950, "cbid": 211, "correlation": 48950 } }, { "ph": "s", "id": 48950, "pid": 76337, "tid": -914061504, "ts": 1716454218245472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218296966, "dur": 29, "args": { "External id": 48972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48972, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 48972, "pid": 5, "tid": 7, "ts": 1716454218296966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245528, "dur": 10, "args": { "External id": 48972, "cbid": 211, "correlation": 48972 } }, { "ph": "s", "id": 48972, "pid": 76337, "tid": -914061504, "ts": 1716454218245528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218245616, "dur": 1, "args": { "External id": 48983, "cbid": 251, "correlation": 48983 } }, { "ph": "f", "id": 48983, "pid": 76337, "tid": -914061504, "ts": 1716454218245616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218296996, "dur": 137, "args": { "External id": 48984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48984, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48984, "pid": 5, "tid": 7, "ts": 1716454218296996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245622, "dur": 14, "args": { "External id": 48984, "cbid": 211, "correlation": 48984 } }, { "ph": "s", "id": 48984, "pid": 76337, "tid": -914061504, "ts": 1716454218245622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218245693, "dur": 1, "args": { "External id": 48995, "cbid": 251, "correlation": 48995 } }, { "ph": "f", "id": 48995, "pid": 76337, "tid": -914061504, "ts": 1716454218245693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218297135, "dur": 154, "args": { "External id": 48996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 48996, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 48996, "pid": 5, "tid": 7, "ts": 1716454218297135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245697, "dur": 11, "args": { "External id": 48996, "cbid": 211, "correlation": 48996 } }, { "ph": "s", "id": 48996, "pid": 76337, "tid": -914061504, "ts": 1716454218245697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218245761, "dur": 1, "args": { "External id": 49007, "cbid": 251, "correlation": 49007 } }, { "ph": "f", "id": 49007, "pid": 76337, "tid": -914061504, "ts": 1716454218245761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218297290, "dur": 154, "args": { "External id": 49008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49008, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49008, "pid": 5, "tid": 7, "ts": 1716454218297290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245765, "dur": 11, "args": { "External id": 49008, "cbid": 211, "correlation": 49008 } }, { "ph": "s", "id": 49008, "pid": 76337, "tid": -914061504, "ts": 1716454218245765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218297446, "dur": 330, "args": { "External id": 49033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49033, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49033, "pid": 5, "tid": 7, "ts": 1716454218297446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245853, "dur": 15, "args": { "External id": 49033, "cbid": 211, "correlation": 49033 } }, { "ph": "s", "id": 49033, "pid": 76337, "tid": -914061504, "ts": 1716454218245853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218245957, "dur": 1, "args": { "External id": 49051, "cbid": 251, "correlation": 49051 } }, { "ph": "f", "id": 49051, "pid": 76337, "tid": -914061504, "ts": 1716454218245957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218297777, "dur": 161, "args": { "External id": 49053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49053, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49053, "pid": 5, "tid": 7, "ts": 1716454218297777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218245962, "dur": 22, "args": { "External id": 49053, "cbid": 211, "correlation": 49053 } }, { "ph": "s", "id": 49053, "pid": 76337, "tid": -914061504, "ts": 1716454218245962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218297939, "dur": 19, "args": { "External id": 49061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49061, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49061, "pid": 5, "tid": 7, "ts": 1716454218297939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246043, "dur": 13, "args": { "External id": 49061, "cbid": 211, "correlation": 49061 } }, { "ph": "s", "id": 49061, "pid": 76337, "tid": -914061504, "ts": 1716454218246043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218297960, "dur": 27, "args": { "External id": 49069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49069, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49069, "pid": 5, "tid": 7, "ts": 1716454218297960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246083, "dur": 9, "args": { "External id": 49069, "cbid": 211, "correlation": 49069 } }, { "ph": "s", "id": 49069, "pid": 76337, "tid": -914061504, "ts": 1716454218246083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218297988, "dur": 18, "args": { "External id": 49080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49080, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49080, "pid": 5, "tid": 7, "ts": 1716454218297988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246155, "dur": 12, "args": { "External id": 49080, "cbid": 211, "correlation": 49080 } }, { "ph": "s", "id": 49080, "pid": 76337, "tid": -914061504, "ts": 1716454218246155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218298006, "dur": 15, "args": { "External id": 49102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49102, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49102, "pid": 5, "tid": 7, "ts": 1716454218298006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246189, "dur": 8, "args": { "External id": 49102, "cbid": 211, "correlation": 49102 } }, { "ph": "s", "id": 49102, "pid": 76337, "tid": -914061504, "ts": 1716454218246189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246275, "dur": 1, "args": { "External id": 49113, "cbid": 251, "correlation": 49113 } }, { "ph": "f", "id": 49113, "pid": 76337, "tid": -914061504, "ts": 1716454218246275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218298023, "dur": 86, "args": { "External id": 49114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49114, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49114, "pid": 5, "tid": 7, "ts": 1716454218298023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246281, "dur": 14, "args": { "External id": 49114, "cbid": 211, "correlation": 49114 } }, { "ph": "s", "id": 49114, "pid": 76337, "tid": -914061504, "ts": 1716454218246281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246351, "dur": 1, "args": { "External id": 49125, "cbid": 251, "correlation": 49125 } }, { "ph": "f", "id": 49125, "pid": 76337, "tid": -914061504, "ts": 1716454218246351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246355, "dur": 0, "args": { "External id": 49126, "cbid": 251, "correlation": 49126 } }, { "ph": "f", "id": 49126, "pid": 76337, "tid": -914061504, "ts": 1716454218246355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218298110, "dur": 13, "args": { "External id": 49127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49127, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49127, "pid": 5, "tid": 7, "ts": 1716454218298110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246357, "dur": 12, "args": { "External id": 49127, "cbid": 211, "correlation": 49127 } }, { "ph": "s", "id": 49127, "pid": 76337, "tid": -914061504, "ts": 1716454218246357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218298124, "dur": 5, "args": { "External id": 49129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49129, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49129, "pid": 5, "tid": 7, "ts": 1716454218298124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246371, "dur": 6, "args": { "External id": 49129, "cbid": 211, "correlation": 49129 } }, { "ph": "s", "id": 49129, "pid": 76337, "tid": -914061504, "ts": 1716454218246371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246428, "dur": 1, "args": { "External id": 49140, "cbid": 251, "correlation": 49140 } }, { "ph": "f", "id": 49140, "pid": 76337, "tid": -914061504, "ts": 1716454218246428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246432, "dur": 0, "args": { "External id": 49141, "cbid": 251, "correlation": 49141 } }, { "ph": "f", "id": 49141, "pid": 76337, "tid": -914061504, "ts": 1716454218246432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218298131, "dur": 8, "args": { "External id": 49142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49142, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49142, "pid": 5, "tid": 7, "ts": 1716454218298131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246433, "dur": 11, "args": { "External id": 49142, "cbid": 211, "correlation": 49142 } }, { "ph": "s", "id": 49142, "pid": 76337, "tid": -914061504, "ts": 1716454218246433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218298141, "dur": 3, "args": { "External id": 49144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49144, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49144, "pid": 5, "tid": 7, "ts": 1716454218298141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246446, "dur": 5, "args": { "External id": 49144, "cbid": 211, "correlation": 49144 } }, { "ph": "s", "id": 49144, "pid": 76337, "tid": -914061504, "ts": 1716454218246446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218298145, "dur": 53, "args": { "External id": 49169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49169, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49169, "pid": 5, "tid": 7, "ts": 1716454218298145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246523, "dur": 15, "args": { "External id": 49169, "cbid": 211, "correlation": 49169 } }, { "ph": "s", "id": 49169, "pid": 76337, "tid": -914061504, "ts": 1716454218246523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246627, "dur": 2, "args": { "External id": 49187, "cbid": 251, "correlation": 49187 } }, { "ph": "f", "id": 49187, "pid": 76337, "tid": -914061504, "ts": 1716454218246627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218298199, "dur": 89, "args": { "External id": 49189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49189, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49189, "pid": 5, "tid": 7, "ts": 1716454218298199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246634, "dur": 17, "args": { "External id": 49189, "cbid": 211, "correlation": 49189 } }, { "ph": "s", "id": 49189, "pid": 76337, "tid": -914061504, "ts": 1716454218246634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218298289, "dur": 10, "args": { "External id": 49197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49197, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49197, "pid": 5, "tid": 7, "ts": 1716454218298289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246707, "dur": 12, "args": { "External id": 49197, "cbid": 211, "correlation": 49197 } }, { "ph": "s", "id": 49197, "pid": 76337, "tid": -914061504, "ts": 1716454218246707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218298300, "dur": 20, "args": { "External id": 49205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49205, "pid": 5, "tid": 7, "ts": 1716454218298300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246749, "dur": 9, "args": { "External id": 49205, "cbid": 211, "correlation": 49205 } }, { "ph": "s", "id": 49205, "pid": 76337, "tid": -914061504, "ts": 1716454218246749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218298322, "dur": 17, "args": { "External id": 49227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49227, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49227, "pid": 5, "tid": 7, "ts": 1716454218298322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246801, "dur": 10, "args": { "External id": 49227, "cbid": 211, "correlation": 49227 } }, { "ph": "s", "id": 49227, "pid": 76337, "tid": -914061504, "ts": 1716454218246801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246890, "dur": 2, "args": { "External id": 49243, "cbid": 251, "correlation": 49243 } }, { "ph": "f", "id": 49243, "pid": 76337, "tid": -914061504, "ts": 1716454218246890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218246896, "dur": 0, "args": { "External id": 49245, "cbid": 251, "correlation": 49245 } }, { "ph": "f", "id": 49245, "pid": 76337, "tid": -914061504, "ts": 1716454218246896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218298341, "dur": 490, "args": { "External id": 49246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49246, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49246, "pid": 5, "tid": 7, "ts": 1716454218298341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246899, "dur": 18, "args": { "External id": 49246, "cbid": 211, "correlation": 49246 } }, { "ph": "s", "id": 49246, "pid": 76337, "tid": -914061504, "ts": 1716454218246899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218298832, "dur": 65, "args": { "External id": 49254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49254, "pid": 5, "tid": 7, "ts": 1716454218298832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218246969, "dur": 22, "args": { "External id": 49254, "cbid": 211, "correlation": 49254 } }, { "ph": "s", "id": 49254, "pid": 76337, "tid": -914061504, "ts": 1716454218246969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218298899, "dur": 67, "args": { "External id": 49262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49262, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49262, "pid": 5, "tid": 7, "ts": 1716454218298899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247010, "dur": 9, "args": { "External id": 49262, "cbid": 211, "correlation": 49262 } }, { "ph": "s", "id": 49262, "pid": 76337, "tid": -914061504, "ts": 1716454218247010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218247093, "dur": 1, "args": { "External id": 49278, "cbid": 251, "correlation": 49278 } }, { "ph": "f", "id": 49278, "pid": 76337, "tid": -914061504, "ts": 1716454218247093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218298968, "dur": 1, "args": { "External id": 49280, "device": 5, "context": 1, "stream": 7, "correlation": 49280, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 49280, "pid": 5, "tid": 7, "ts": 1716454218298968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218247099, "dur": 11, "args": { "External id": 49280, "cbid": 51, "correlation": 49280 } }, { "ph": "s", "id": 49280, "pid": 76337, "tid": -914061504, "ts": 1716454218247099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218298972, "dur": 265, "args": { "External id": 49281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49281, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49281, "pid": 5, "tid": 7, "ts": 1716454218298972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247112, "dur": 11, "args": { "External id": 49281, "cbid": 211, "correlation": 49281 } }, { "ph": "s", "id": 49281, "pid": 76337, "tid": -914061504, "ts": 1716454218247112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218299239, "dur": 14, "args": { "External id": 49289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49289, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49289, "pid": 5, "tid": 7, "ts": 1716454218299239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247154, "dur": 10, "args": { "External id": 49289, "cbid": 211, "correlation": 49289 } }, { "ph": "s", "id": 49289, "pid": 76337, "tid": -914061504, "ts": 1716454218247154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218299254, "dur": 36, "args": { "External id": 49300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49300, "pid": 5, "tid": 7, "ts": 1716454218299254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247220, "dur": 12, "args": { "External id": 49300, "cbid": 211, "correlation": 49300 } }, { "ph": "s", "id": 49300, "pid": 76337, "tid": -914061504, "ts": 1716454218247220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218247285, "dur": 0, "args": { "External id": 49312, "cbid": 317, "correlation": 49312 } }, { "ph": "f", "id": 49312, "pid": 76337, "tid": -914061504, "ts": 1716454218247285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218247285, "dur": 0, "args": { "External id": 49313, "cbid": 203, "correlation": 49313 } }, { "ph": "f", "id": 49313, "pid": 76337, "tid": -914061504, "ts": 1716454218247285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218247286, "dur": 0, "args": { "External id": 49314, "cbid": 205, "correlation": 49314 } }, { "ph": "f", "id": 49314, "pid": 76337, "tid": -914061504, "ts": 1716454218247286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218299291, "dur": 13, "args": { "External id": 49318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49318, "pid": 5, "tid": 7, "ts": 1716454218299291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247302, "dur": 12, "args": { "External id": 49318, "cbid": 211, "correlation": 49318 } }, { "ph": "s", "id": 49318, "pid": 76337, "tid": -914061504, "ts": 1716454218247302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218299305, "dur": 4, "args": { "External id": 49320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49320, "pid": 5, "tid": 7, "ts": 1716454218299305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247322, "dur": 6, "args": { "External id": 49320, "cbid": 211, "correlation": 49320 } }, { "ph": "s", "id": 49320, "pid": 76337, "tid": -914061504, "ts": 1716454218247322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218247331, "dur": 0, "args": { "External id": 49321, "cbid": 51, "correlation": 49321 } }, { "ph": "s", "id": 49321, "pid": 76337, "tid": -914061504, "ts": 1716454218247331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218299310, "dur": 94, "args": { "External id": 49322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49322, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 49322, "pid": 5, "tid": 7, "ts": 1716454218299310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247332, "dur": 5, "args": { "External id": 49322, "cbid": 211, "correlation": 49322 } }, { "ph": "s", "id": 49322, "pid": 76337, "tid": -914061504, "ts": 1716454218247332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218299405, "dur": 17, "args": { "External id": 49327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49327, "pid": 5, "tid": 7, "ts": 1716454218299405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247360, "dur": 8, "args": { "External id": 49327, "cbid": 211, "correlation": 49327 } }, { "ph": "s", "id": 49327, "pid": 76337, "tid": -914061504, "ts": 1716454218247360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218299423, "dur": 12, "args": { "External id": 49335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49335, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49335, "pid": 5, "tid": 7, "ts": 1716454218299423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247391, "dur": 9, "args": { "External id": 49335, "cbid": 211, "correlation": 49335 } }, { "ph": "s", "id": 49335, "pid": 76337, "tid": -914061504, "ts": 1716454218247391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218299436, "dur": 18, "args": { "External id": 49355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49355, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 49355, "pid": 5, "tid": 7, "ts": 1716454218299436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247465, "dur": 11, "args": { "External id": 49355, "cbid": 211, "correlation": 49355 } }, { "ph": "s", "id": 49355, "pid": 76337, "tid": -914061504, "ts": 1716454218247465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218299455, "dur": 4, "args": { "External id": 49367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49367, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 49367, "pid": 5, "tid": 7, "ts": 1716454218299455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247486, "dur": 6, "args": { "External id": 49367, "cbid": 211, "correlation": 49367 } }, { "ph": "s", "id": 49367, "pid": 76337, "tid": -914061504, "ts": 1716454218247486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218299461, "dur": 18, "args": { "External id": 49370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49370, "pid": 5, "tid": 7, "ts": 1716454218299461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247504, "dur": 7, "args": { "External id": 49370, "cbid": 211, "correlation": 49370 } }, { "ph": "s", "id": 49370, "pid": 76337, "tid": -914061504, "ts": 1716454218247504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218299481, "dur": 11, "args": { "External id": 49379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49379, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49379, "pid": 5, "tid": 7, "ts": 1716454218299481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247542, "dur": 10, "args": { "External id": 49379, "cbid": 211, "correlation": 49379 } }, { "ph": "s", "id": 49379, "pid": 76337, "tid": -914061504, "ts": 1716454218247542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218247596, "dur": 0, "args": { "External id": 49389, "cbid": 317, "correlation": 49389 } }, { "ph": "f", "id": 49389, "pid": 76337, "tid": -914061504, "ts": 1716454218247596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218247597, "dur": 0, "args": { "External id": 49390, "cbid": 203, "correlation": 49390 } }, { "ph": "f", "id": 49390, "pid": 76337, "tid": -914061504, "ts": 1716454218247597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218247598, "dur": 0, "args": { "External id": 49391, "cbid": 205, "correlation": 49391 } }, { "ph": "f", "id": 49391, "pid": 76337, "tid": -914061504, "ts": 1716454218247598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218299493, "dur": 11, "args": { "External id": 49395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49395, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49395, "pid": 5, "tid": 7, "ts": 1716454218299493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247612, "dur": 12, "args": { "External id": 49395, "cbid": 211, "correlation": 49395 } }, { "ph": "s", "id": 49395, "pid": 76337, "tid": -914061504, "ts": 1716454218247612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218299505, "dur": 158, "args": { "External id": 49397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49397, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49397, "pid": 5, "tid": 7, "ts": 1716454218299505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247626, "dur": 5, "args": { "External id": 49397, "cbid": 211, "correlation": 49397 } }, { "ph": "s", "id": 49397, "pid": 76337, "tid": -914061504, "ts": 1716454218247626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218299665, "dur": 1, "args": { "External id": 49399, "device": 5, "context": 1, "stream": 7, "correlation": 49399, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 49399, "pid": 5, "tid": 7, "ts": 1716454218299665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218247638, "dur": 7, "args": { "External id": 49399, "cbid": 51, "correlation": 49399 } }, { "ph": "s", "id": 49399, "pid": 76337, "tid": -914061504, "ts": 1716454218247638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218299669, "dur": 651, "args": { "External id": 49400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49400, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49400, "pid": 5, "tid": 7, "ts": 1716454218299669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247645, "dur": 6, "args": { "External id": 49400, "cbid": 211, "correlation": 49400 } }, { "ph": "s", "id": 49400, "pid": 76337, "tid": -914061504, "ts": 1716454218247645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218300321, "dur": 14, "args": { "External id": 49402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49402, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49402, "pid": 5, "tid": 7, "ts": 1716454218300321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247656, "dur": 6, "args": { "External id": 49402, "cbid": 211, "correlation": 49402 } }, { "ph": "s", "id": 49402, "pid": 76337, "tid": -914061504, "ts": 1716454218247656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218300337, "dur": 14, "args": { "External id": 49408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49408, "pid": 5, "tid": 7, "ts": 1716454218300337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247685, "dur": 8, "args": { "External id": 49408, "cbid": 211, "correlation": 49408 } }, { "ph": "s", "id": 49408, "pid": 76337, "tid": -914061504, "ts": 1716454218247685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218300352, "dur": 4, "args": { "External id": 49416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49416, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 49416, "pid": 5, "tid": 7, "ts": 1716454218300352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247729, "dur": 9, "args": { "External id": 49416, "cbid": 211, "correlation": 49416 } }, { "ph": "s", "id": 49416, "pid": 76337, "tid": -914061504, "ts": 1716454218247729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218247796, "dur": 1, "args": { "External id": 49432, "cbid": 251, "correlation": 49432 } }, { "ph": "f", "id": 49432, "pid": 76337, "tid": -914061504, "ts": 1716454218247796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218247801, "dur": 0, "args": { "External id": 49434, "cbid": 251, "correlation": 49434 } }, { "ph": "f", "id": 49434, "pid": 76337, "tid": -914061504, "ts": 1716454218247801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218300357, "dur": 13, "args": { "External id": 49435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49435, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49435, "pid": 5, "tid": 7, "ts": 1716454218300357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247803, "dur": 11, "args": { "External id": 49435, "cbid": 211, "correlation": 49435 } }, { "ph": "s", "id": 49435, "pid": 76337, "tid": -914061504, "ts": 1716454218247803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218300371, "dur": 5, "args": { "External id": 49437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49437, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49437, "pid": 5, "tid": 7, "ts": 1716454218300371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247817, "dur": 6, "args": { "External id": 49437, "cbid": 211, "correlation": 49437 } }, { "ph": "s", "id": 49437, "pid": 76337, "tid": -914061504, "ts": 1716454218247817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218300377, "dur": 16, "args": { "External id": 49447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49447, "pid": 5, "tid": 7, "ts": 1716454218300377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247875, "dur": 15, "args": { "External id": 49447, "cbid": 211, "correlation": 49447 } }, { "ph": "s", "id": 49447, "pid": 76337, "tid": -914061504, "ts": 1716454218247875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218300395, "dur": 17, "args": { "External id": 49467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49467, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 49467, "pid": 5, "tid": 7, "ts": 1716454218300395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247944, "dur": 11, "args": { "External id": 49467, "cbid": 211, "correlation": 49467 } }, { "ph": "s", "id": 49467, "pid": 76337, "tid": -914061504, "ts": 1716454218247944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218300413, "dur": 4, "args": { "External id": 49479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49479, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 49479, "pid": 5, "tid": 7, "ts": 1716454218300413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247965, "dur": 6, "args": { "External id": 49479, "cbid": 211, "correlation": 49479 } }, { "ph": "s", "id": 49479, "pid": 76337, "tid": -914061504, "ts": 1716454218247965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218300418, "dur": 16, "args": { "External id": 49482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49482, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49482, "pid": 5, "tid": 7, "ts": 1716454218300418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218247992, "dur": 7, "args": { "External id": 49482, "cbid": 211, "correlation": 49482 } }, { "ph": "s", "id": 49482, "pid": 76337, "tid": -914061504, "ts": 1716454218247992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218300435, "dur": 10, "args": { "External id": 49491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49491, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49491, "pid": 5, "tid": 7, "ts": 1716454218300435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248035, "dur": 10, "args": { "External id": 49491, "cbid": 211, "correlation": 49491 } }, { "ph": "s", "id": 49491, "pid": 76337, "tid": -914061504, "ts": 1716454218248035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218248096, "dur": 0, "args": { "External id": 49501, "cbid": 317, "correlation": 49501 } }, { "ph": "f", "id": 49501, "pid": 76337, "tid": -914061504, "ts": 1716454218248096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218248097, "dur": 0, "args": { "External id": 49502, "cbid": 203, "correlation": 49502 } }, { "ph": "f", "id": 49502, "pid": 76337, "tid": -914061504, "ts": 1716454218248097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218248098, "dur": 0, "args": { "External id": 49503, "cbid": 205, "correlation": 49503 } }, { "ph": "f", "id": 49503, "pid": 76337, "tid": -914061504, "ts": 1716454218248098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218300447, "dur": 11, "args": { "External id": 49507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49507, "pid": 5, "tid": 7, "ts": 1716454218300447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248111, "dur": 12, "args": { "External id": 49507, "cbid": 211, "correlation": 49507 } }, { "ph": "s", "id": 49507, "pid": 76337, "tid": -914061504, "ts": 1716454218248111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218300459, "dur": 157, "args": { "External id": 49509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49509, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49509, "pid": 5, "tid": 7, "ts": 1716454218300459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248126, "dur": 6, "args": { "External id": 49509, "cbid": 211, "correlation": 49509 } }, { "ph": "s", "id": 49509, "pid": 76337, "tid": -914061504, "ts": 1716454218248126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218300619, "dur": 1, "args": { "External id": 49511, "device": 5, "context": 1, "stream": 7, "correlation": 49511, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 49511, "pid": 5, "tid": 7, "ts": 1716454218300619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218248137, "dur": 8, "args": { "External id": 49511, "cbid": 51, "correlation": 49511 } }, { "ph": "s", "id": 49511, "pid": 76337, "tid": -914061504, "ts": 1716454218248137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218300622, "dur": 637, "args": { "External id": 49512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49512, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49512, "pid": 5, "tid": 7, "ts": 1716454218300622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248146, "dur": 6, "args": { "External id": 49512, "cbid": 211, "correlation": 49512 } }, { "ph": "s", "id": 49512, "pid": 76337, "tid": -914061504, "ts": 1716454218248146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218301260, "dur": 12, "args": { "External id": 49514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49514, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49514, "pid": 5, "tid": 7, "ts": 1716454218301260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248156, "dur": 8, "args": { "External id": 49514, "cbid": 211, "correlation": 49514 } }, { "ph": "s", "id": 49514, "pid": 76337, "tid": -914061504, "ts": 1716454218248156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218301274, "dur": 15, "args": { "External id": 49520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49520, "pid": 5, "tid": 7, "ts": 1716454218301274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248188, "dur": 9, "args": { "External id": 49520, "cbid": 211, "correlation": 49520 } }, { "ph": "s", "id": 49520, "pid": 76337, "tid": -914061504, "ts": 1716454218248188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218301290, "dur": 11, "args": { "External id": 49528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49528, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49528, "pid": 5, "tid": 7, "ts": 1716454218301290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248220, "dur": 8, "args": { "External id": 49528, "cbid": 211, "correlation": 49528 } }, { "ph": "s", "id": 49528, "pid": 76337, "tid": -914061504, "ts": 1716454218248220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218301303, "dur": 9, "args": { "External id": 49536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49536, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49536, "pid": 5, "tid": 7, "ts": 1716454218301303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248250, "dur": 8, "args": { "External id": 49536, "cbid": 211, "correlation": 49536 } }, { "ph": "s", "id": 49536, "pid": 76337, "tid": -914061504, "ts": 1716454218248250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218301313, "dur": 17, "args": { "External id": 49556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49556, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 49556, "pid": 5, "tid": 7, "ts": 1716454218301313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248329, "dur": 13, "args": { "External id": 49556, "cbid": 211, "correlation": 49556 } }, { "ph": "s", "id": 49556, "pid": 76337, "tid": -914061504, "ts": 1716454218248329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218301332, "dur": 4, "args": { "External id": 49568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49568, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 49568, "pid": 5, "tid": 7, "ts": 1716454218301332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248352, "dur": 6, "args": { "External id": 49568, "cbid": 211, "correlation": 49568 } }, { "ph": "s", "id": 49568, "pid": 76337, "tid": -914061504, "ts": 1716454218248352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218301337, "dur": 17, "args": { "External id": 49571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49571, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49571, "pid": 5, "tid": 7, "ts": 1716454218301337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248369, "dur": 6, "args": { "External id": 49571, "cbid": 211, "correlation": 49571 } }, { "ph": "s", "id": 49571, "pid": 76337, "tid": -914061504, "ts": 1716454218248369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218248426, "dur": 0, "args": { "External id": 49582, "cbid": 317, "correlation": 49582 } }, { "ph": "f", "id": 49582, "pid": 76337, "tid": -914061504, "ts": 1716454218248426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218248426, "dur": 0, "args": { "External id": 49583, "cbid": 203, "correlation": 49583 } }, { "ph": "f", "id": 49583, "pid": 76337, "tid": -914061504, "ts": 1716454218248426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218248427, "dur": 0, "args": { "External id": 49584, "cbid": 205, "correlation": 49584 } }, { "ph": "f", "id": 49584, "pid": 76337, "tid": -914061504, "ts": 1716454218248427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218301355, "dur": 11, "args": { "External id": 49588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49588, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49588, "pid": 5, "tid": 7, "ts": 1716454218301355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248445, "dur": 12, "args": { "External id": 49588, "cbid": 211, "correlation": 49588 } }, { "ph": "s", "id": 49588, "pid": 76337, "tid": -914061504, "ts": 1716454218248445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218301367, "dur": 4, "args": { "External id": 49590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49590, "pid": 5, "tid": 7, "ts": 1716454218301367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248461, "dur": 5, "args": { "External id": 49590, "cbid": 211, "correlation": 49590 } }, { "ph": "s", "id": 49590, "pid": 76337, "tid": -914061504, "ts": 1716454218248461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218248470, "dur": 0, "args": { "External id": 49591, "cbid": 51, "correlation": 49591 } }, { "ph": "s", "id": 49591, "pid": 76337, "tid": -914061504, "ts": 1716454218248470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218301372, "dur": 91, "args": { "External id": 49592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49592, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 49592, "pid": 5, "tid": 7, "ts": 1716454218301372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248470, "dur": 5, "args": { "External id": 49592, "cbid": 211, "correlation": 49592 } }, { "ph": "s", "id": 49592, "pid": 76337, "tid": -914061504, "ts": 1716454218248470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218301464, "dur": 15, "args": { "External id": 49597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49597, "pid": 5, "tid": 7, "ts": 1716454218301464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248498, "dur": 8, "args": { "External id": 49597, "cbid": 211, "correlation": 49597 } }, { "ph": "s", "id": 49597, "pid": 76337, "tid": -914061504, "ts": 1716454218248498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218301480, "dur": 81, "args": { "External id": 49606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49606, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49606, "pid": 5, "tid": 7, "ts": 1716454218301480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248581, "dur": 15, "args": { "External id": 49606, "cbid": 211, "correlation": 49606 } }, { "ph": "s", "id": 49606, "pid": 76337, "tid": -914061504, "ts": 1716454218248581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218301563, "dur": 30, "args": { "External id": 49628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49628, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49628, "pid": 5, "tid": 7, "ts": 1716454218301563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248638, "dur": 11, "args": { "External id": 49628, "cbid": 211, "correlation": 49628 } }, { "ph": "s", "id": 49628, "pid": 76337, "tid": -914061504, "ts": 1716454218248638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218248728, "dur": 1, "args": { "External id": 49639, "cbid": 251, "correlation": 49639 } }, { "ph": "f", "id": 49639, "pid": 76337, "tid": -914061504, "ts": 1716454218248728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218301594, "dur": 139, "args": { "External id": 49640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49640, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49640, "pid": 5, "tid": 7, "ts": 1716454218301594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248734, "dur": 13, "args": { "External id": 49640, "cbid": 211, "correlation": 49640 } }, { "ph": "s", "id": 49640, "pid": 76337, "tid": -914061504, "ts": 1716454218248734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218248802, "dur": 1, "args": { "External id": 49651, "cbid": 251, "correlation": 49651 } }, { "ph": "f", "id": 49651, "pid": 76337, "tid": -914061504, "ts": 1716454218248802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218301735, "dur": 153, "args": { "External id": 49652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49652, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49652, "pid": 5, "tid": 7, "ts": 1716454218301735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248807, "dur": 11, "args": { "External id": 49652, "cbid": 211, "correlation": 49652 } }, { "ph": "s", "id": 49652, "pid": 76337, "tid": -914061504, "ts": 1716454218248807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218248874, "dur": 1, "args": { "External id": 49663, "cbid": 251, "correlation": 49663 } }, { "ph": "f", "id": 49663, "pid": 76337, "tid": -914061504, "ts": 1716454218248874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218301889, "dur": 135, "args": { "External id": 49664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49664, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49664, "pid": 5, "tid": 7, "ts": 1716454218301889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248878, "dur": 11, "args": { "External id": 49664, "cbid": 211, "correlation": 49664 } }, { "ph": "s", "id": 49664, "pid": 76337, "tid": -914061504, "ts": 1716454218248878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218302026, "dur": 328, "args": { "External id": 49689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49689, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49689, "pid": 5, "tid": 7, "ts": 1716454218302026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218248963, "dur": 21, "args": { "External id": 49689, "cbid": 211, "correlation": 49689 } }, { "ph": "s", "id": 49689, "pid": 76337, "tid": -914061504, "ts": 1716454218248963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249074, "dur": 1, "args": { "External id": 49707, "cbid": 251, "correlation": 49707 } }, { "ph": "f", "id": 49707, "pid": 76337, "tid": -914061504, "ts": 1716454218249074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218302355, "dur": 164, "args": { "External id": 49709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49709, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49709, "pid": 5, "tid": 7, "ts": 1716454218302355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249080, "dur": 13, "args": { "External id": 49709, "cbid": 211, "correlation": 49709 } }, { "ph": "s", "id": 49709, "pid": 76337, "tid": -914061504, "ts": 1716454218249080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218302521, "dur": 19, "args": { "External id": 49717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49717, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49717, "pid": 5, "tid": 7, "ts": 1716454218302521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249150, "dur": 12, "args": { "External id": 49717, "cbid": 211, "correlation": 49717 } }, { "ph": "s", "id": 49717, "pid": 76337, "tid": -914061504, "ts": 1716454218249150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218302541, "dur": 28, "args": { "External id": 49725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49725, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49725, "pid": 5, "tid": 7, "ts": 1716454218302541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249189, "dur": 8, "args": { "External id": 49725, "cbid": 211, "correlation": 49725 } }, { "ph": "s", "id": 49725, "pid": 76337, "tid": -914061504, "ts": 1716454218249189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218302570, "dur": 18, "args": { "External id": 49736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49736, "pid": 5, "tid": 7, "ts": 1716454218302570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249264, "dur": 13, "args": { "External id": 49736, "cbid": 211, "correlation": 49736 } }, { "ph": "s", "id": 49736, "pid": 76337, "tid": -914061504, "ts": 1716454218249264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218302590, "dur": 15, "args": { "External id": 49758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49758, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49758, "pid": 5, "tid": 7, "ts": 1716454218302590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249297, "dur": 7, "args": { "External id": 49758, "cbid": 211, "correlation": 49758 } }, { "ph": "s", "id": 49758, "pid": 76337, "tid": -914061504, "ts": 1716454218249297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249381, "dur": 1, "args": { "External id": 49769, "cbid": 251, "correlation": 49769 } }, { "ph": "f", "id": 49769, "pid": 76337, "tid": -914061504, "ts": 1716454218249381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218302606, "dur": 85, "args": { "External id": 49770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49770, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49770, "pid": 5, "tid": 7, "ts": 1716454218302606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249387, "dur": 14, "args": { "External id": 49770, "cbid": 211, "correlation": 49770 } }, { "ph": "s", "id": 49770, "pid": 76337, "tid": -914061504, "ts": 1716454218249387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249457, "dur": 1, "args": { "External id": 49781, "cbid": 251, "correlation": 49781 } }, { "ph": "f", "id": 49781, "pid": 76337, "tid": -914061504, "ts": 1716454218249457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249461, "dur": 0, "args": { "External id": 49782, "cbid": 251, "correlation": 49782 } }, { "ph": "f", "id": 49782, "pid": 76337, "tid": -914061504, "ts": 1716454218249461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218302693, "dur": 12, "args": { "External id": 49783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49783, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49783, "pid": 5, "tid": 7, "ts": 1716454218302693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249463, "dur": 11, "args": { "External id": 49783, "cbid": 211, "correlation": 49783 } }, { "ph": "s", "id": 49783, "pid": 76337, "tid": -914061504, "ts": 1716454218249463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218302706, "dur": 6, "args": { "External id": 49785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49785, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49785, "pid": 5, "tid": 7, "ts": 1716454218302706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249476, "dur": 6, "args": { "External id": 49785, "cbid": 211, "correlation": 49785 } }, { "ph": "s", "id": 49785, "pid": 76337, "tid": -914061504, "ts": 1716454218249476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249533, "dur": 1, "args": { "External id": 49796, "cbid": 251, "correlation": 49796 } }, { "ph": "f", "id": 49796, "pid": 76337, "tid": -914061504, "ts": 1716454218249533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249537, "dur": 0, "args": { "External id": 49797, "cbid": 251, "correlation": 49797 } }, { "ph": "f", "id": 49797, "pid": 76337, "tid": -914061504, "ts": 1716454218249537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218302713, "dur": 9, "args": { "External id": 49798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49798, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49798, "pid": 5, "tid": 7, "ts": 1716454218302713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249538, "dur": 12, "args": { "External id": 49798, "cbid": 211, "correlation": 49798 } }, { "ph": "s", "id": 49798, "pid": 76337, "tid": -914061504, "ts": 1716454218249538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218302723, "dur": 3, "args": { "External id": 49800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49800, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49800, "pid": 5, "tid": 7, "ts": 1716454218302723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249551, "dur": 6, "args": { "External id": 49800, "cbid": 211, "correlation": 49800 } }, { "ph": "s", "id": 49800, "pid": 76337, "tid": -914061504, "ts": 1716454218249551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218302728, "dur": 53, "args": { "External id": 49825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49825, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49825, "pid": 5, "tid": 7, "ts": 1716454218302728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249631, "dur": 12, "args": { "External id": 49825, "cbid": 211, "correlation": 49825 } }, { "ph": "s", "id": 49825, "pid": 76337, "tid": -914061504, "ts": 1716454218249631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249730, "dur": 1, "args": { "External id": 49843, "cbid": 251, "correlation": 49843 } }, { "ph": "f", "id": 49843, "pid": 76337, "tid": -914061504, "ts": 1716454218249730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218302782, "dur": 88, "args": { "External id": 49845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49845, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49845, "pid": 5, "tid": 7, "ts": 1716454218302782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249736, "dur": 14, "args": { "External id": 49845, "cbid": 211, "correlation": 49845 } }, { "ph": "s", "id": 49845, "pid": 76337, "tid": -914061504, "ts": 1716454218249736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218302872, "dur": 9, "args": { "External id": 49853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49853, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49853, "pid": 5, "tid": 7, "ts": 1716454218302872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249805, "dur": 12, "args": { "External id": 49853, "cbid": 211, "correlation": 49853 } }, { "ph": "s", "id": 49853, "pid": 76337, "tid": -914061504, "ts": 1716454218249805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218302883, "dur": 20, "args": { "External id": 49861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49861, "pid": 5, "tid": 7, "ts": 1716454218302883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249847, "dur": 9, "args": { "External id": 49861, "cbid": 211, "correlation": 49861 } }, { "ph": "s", "id": 49861, "pid": 76337, "tid": -914061504, "ts": 1716454218249847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218302904, "dur": 17, "args": { "External id": 49883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49883, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49883, "pid": 5, "tid": 7, "ts": 1716454218302904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218249898, "dur": 10, "args": { "External id": 49883, "cbid": 211, "correlation": 49883 } }, { "ph": "s", "id": 49883, "pid": 76337, "tid": -914061504, "ts": 1716454218249898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218249996, "dur": 1, "args": { "External id": 49899, "cbid": 251, "correlation": 49899 } }, { "ph": "f", "id": 49899, "pid": 76337, "tid": -914061504, "ts": 1716454218249996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218250001, "dur": 0, "args": { "External id": 49901, "cbid": 251, "correlation": 49901 } }, { "ph": "f", "id": 49901, "pid": 76337, "tid": -914061504, "ts": 1716454218250001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218302923, "dur": 486, "args": { "External id": 49902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49902, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 49902, "pid": 5, "tid": 7, "ts": 1716454218302923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250004, "dur": 14, "args": { "External id": 49902, "cbid": 211, "correlation": 49902 } }, { "ph": "s", "id": 49902, "pid": 76337, "tid": -914061504, "ts": 1716454218250004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218303411, "dur": 65, "args": { "External id": 49910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49910, "pid": 5, "tid": 7, "ts": 1716454218303411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250072, "dur": 13, "args": { "External id": 49910, "cbid": 211, "correlation": 49910 } }, { "ph": "s", "id": 49910, "pid": 76337, "tid": -914061504, "ts": 1716454218250072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218303477, "dur": 66, "args": { "External id": 49918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49918, "pid": 5, "tid": 7, "ts": 1716454218303477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250104, "dur": 8, "args": { "External id": 49918, "cbid": 211, "correlation": 49918 } }, { "ph": "s", "id": 49918, "pid": 76337, "tid": -914061504, "ts": 1716454218250104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218250184, "dur": 1, "args": { "External id": 49934, "cbid": 251, "correlation": 49934 } }, { "ph": "f", "id": 49934, "pid": 76337, "tid": -914061504, "ts": 1716454218250184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218303545, "dur": 1, "args": { "External id": 49936, "device": 5, "context": 1, "stream": 7, "correlation": 49936, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 49936, "pid": 5, "tid": 7, "ts": 1716454218303545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218250189, "dur": 9, "args": { "External id": 49936, "cbid": 51, "correlation": 49936 } }, { "ph": "s", "id": 49936, "pid": 76337, "tid": -914061504, "ts": 1716454218250189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218303549, "dur": 266, "args": { "External id": 49937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49937, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49937, "pid": 5, "tid": 7, "ts": 1716454218303549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250201, "dur": 11, "args": { "External id": 49937, "cbid": 211, "correlation": 49937 } }, { "ph": "s", "id": 49937, "pid": 76337, "tid": -914061504, "ts": 1716454218250201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218303816, "dur": 14, "args": { "External id": 49945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49945, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49945, "pid": 5, "tid": 7, "ts": 1716454218303816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250243, "dur": 10, "args": { "External id": 49945, "cbid": 211, "correlation": 49945 } }, { "ph": "s", "id": 49945, "pid": 76337, "tid": -914061504, "ts": 1716454218250243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218303831, "dur": 36, "args": { "External id": 49956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49956, "pid": 5, "tid": 7, "ts": 1716454218303831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250312, "dur": 13, "args": { "External id": 49956, "cbid": 211, "correlation": 49956 } }, { "ph": "s", "id": 49956, "pid": 76337, "tid": -914061504, "ts": 1716454218250312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218250379, "dur": 0, "args": { "External id": 49968, "cbid": 317, "correlation": 49968 } }, { "ph": "f", "id": 49968, "pid": 76337, "tid": -914061504, "ts": 1716454218250379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218250380, "dur": 0, "args": { "External id": 49969, "cbid": 203, "correlation": 49969 } }, { "ph": "f", "id": 49969, "pid": 76337, "tid": -914061504, "ts": 1716454218250380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218250380, "dur": 0, "args": { "External id": 49970, "cbid": 205, "correlation": 49970 } }, { "ph": "f", "id": 49970, "pid": 76337, "tid": -914061504, "ts": 1716454218250380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218303868, "dur": 13, "args": { "External id": 49974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49974, "pid": 5, "tid": 7, "ts": 1716454218303868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250396, "dur": 12, "args": { "External id": 49974, "cbid": 211, "correlation": 49974 } }, { "ph": "s", "id": 49974, "pid": 76337, "tid": -914061504, "ts": 1716454218250396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218303882, "dur": 4, "args": { "External id": 49976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 49976, "pid": 5, "tid": 7, "ts": 1716454218303882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250412, "dur": 6, "args": { "External id": 49976, "cbid": 211, "correlation": 49976 } }, { "ph": "s", "id": 49976, "pid": 76337, "tid": -914061504, "ts": 1716454218250412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218250421, "dur": 0, "args": { "External id": 49977, "cbid": 51, "correlation": 49977 } }, { "ph": "s", "id": 49977, "pid": 76337, "tid": -914061504, "ts": 1716454218250421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218303887, "dur": 96, "args": { "External id": 49978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49978, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 49978, "pid": 5, "tid": 7, "ts": 1716454218303887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250422, "dur": 5, "args": { "External id": 49978, "cbid": 211, "correlation": 49978 } }, { "ph": "s", "id": 49978, "pid": 76337, "tid": -914061504, "ts": 1716454218250422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218303985, "dur": 16, "args": { "External id": 49983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49983, "pid": 5, "tid": 7, "ts": 1716454218303985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250449, "dur": 8, "args": { "External id": 49983, "cbid": 211, "correlation": 49983 } }, { "ph": "s", "id": 49983, "pid": 76337, "tid": -914061504, "ts": 1716454218250449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218304002, "dur": 11, "args": { "External id": 49991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 49991, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 49991, "pid": 5, "tid": 7, "ts": 1716454218304002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250480, "dur": 9, "args": { "External id": 49991, "cbid": 211, "correlation": 49991 } }, { "ph": "s", "id": 49991, "pid": 76337, "tid": -914061504, "ts": 1716454218250480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218250552, "dur": 0, "args": { "External id": 50001, "cbid": 317, "correlation": 50001 } }, { "ph": "f", "id": 50001, "pid": 76337, "tid": -914061504, "ts": 1716454218250552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218250553, "dur": 0, "args": { "External id": 50002, "cbid": 203, "correlation": 50002 } }, { "ph": "f", "id": 50002, "pid": 76337, "tid": -914061504, "ts": 1716454218250553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218250554, "dur": 0, "args": { "External id": 50003, "cbid": 205, "correlation": 50003 } }, { "ph": "f", "id": 50003, "pid": 76337, "tid": -914061504, "ts": 1716454218250554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304015, "dur": 11, "args": { "External id": 50007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50007, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50007, "pid": 5, "tid": 7, "ts": 1716454218304015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250569, "dur": 12, "args": { "External id": 50007, "cbid": 211, "correlation": 50007 } }, { "ph": "s", "id": 50007, "pid": 76337, "tid": -914061504, "ts": 1716454218250569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304028, "dur": 157, "args": { "External id": 50009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50009, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50009, "pid": 5, "tid": 7, "ts": 1716454218304028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250583, "dur": 5, "args": { "External id": 50009, "cbid": 211, "correlation": 50009 } }, { "ph": "s", "id": 50009, "pid": 76337, "tid": -914061504, "ts": 1716454218250583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218304187, "dur": 1, "args": { "External id": 50011, "device": 5, "context": 1, "stream": 7, "correlation": 50011, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 50011, "pid": 5, "tid": 7, "ts": 1716454218304187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218250595, "dur": 6, "args": { "External id": 50011, "cbid": 51, "correlation": 50011 } }, { "ph": "s", "id": 50011, "pid": 76337, "tid": -914061504, "ts": 1716454218250595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218304191, "dur": 195, "args": { "External id": 50012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50012, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 50012, "pid": 5, "tid": 7, "ts": 1716454218304191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250602, "dur": 7, "args": { "External id": 50012, "cbid": 211, "correlation": 50012 } }, { "ph": "s", "id": 50012, "pid": 76337, "tid": -914061504, "ts": 1716454218250602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304387, "dur": 6, "args": { "External id": 50014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50014, "pid": 5, "tid": 7, "ts": 1716454218304387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250614, "dur": 5, "args": { "External id": 50014, "cbid": 211, "correlation": 50014 } }, { "ph": "s", "id": 50014, "pid": 76337, "tid": -914061504, "ts": 1716454218250614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218304395, "dur": 6, "args": { "External id": 50020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50020, "pid": 5, "tid": 7, "ts": 1716454218304395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250642, "dur": 12, "args": { "External id": 50020, "cbid": 211, "correlation": 50020 } }, { "ph": "s", "id": 50020, "pid": 76337, "tid": -914061504, "ts": 1716454218250642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218304402, "dur": 11, "args": { "External id": 50040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50040, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50040, "pid": 5, "tid": 7, "ts": 1716454218304402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250738, "dur": 13, "args": { "External id": 50040, "cbid": 211, "correlation": 50040 } }, { "ph": "s", "id": 50040, "pid": 76337, "tid": -914061504, "ts": 1716454218250738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218304414, "dur": 4, "args": { "External id": 50052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50052, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50052, "pid": 5, "tid": 7, "ts": 1716454218304414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250761, "dur": 6, "args": { "External id": 50052, "cbid": 211, "correlation": 50052 } }, { "ph": "s", "id": 50052, "pid": 76337, "tid": -914061504, "ts": 1716454218250761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218304420, "dur": 9, "args": { "External id": 50055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50055, "pid": 5, "tid": 7, "ts": 1716454218304420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250780, "dur": 7, "args": { "External id": 50055, "cbid": 211, "correlation": 50055 } }, { "ph": "s", "id": 50055, "pid": 76337, "tid": -914061504, "ts": 1716454218250780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218304430, "dur": 5, "args": { "External id": 50064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50064, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50064, "pid": 5, "tid": 7, "ts": 1716454218304430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250820, "dur": 10, "args": { "External id": 50064, "cbid": 211, "correlation": 50064 } }, { "ph": "s", "id": 50064, "pid": 76337, "tid": -914061504, "ts": 1716454218250820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218250872, "dur": 0, "args": { "External id": 50074, "cbid": 317, "correlation": 50074 } }, { "ph": "f", "id": 50074, "pid": 76337, "tid": -914061504, "ts": 1716454218250872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218250873, "dur": 0, "args": { "External id": 50075, "cbid": 203, "correlation": 50075 } }, { "ph": "f", "id": 50075, "pid": 76337, "tid": -914061504, "ts": 1716454218250873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218250874, "dur": 0, "args": { "External id": 50076, "cbid": 205, "correlation": 50076 } }, { "ph": "f", "id": 50076, "pid": 76337, "tid": -914061504, "ts": 1716454218250874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304436, "dur": 5, "args": { "External id": 50080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50080, "pid": 5, "tid": 7, "ts": 1716454218304436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250893, "dur": 13, "args": { "External id": 50080, "cbid": 211, "correlation": 50080 } }, { "ph": "s", "id": 50080, "pid": 76337, "tid": -914061504, "ts": 1716454218250893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304442, "dur": 158, "args": { "External id": 50082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50082, "pid": 5, "tid": 7, "ts": 1716454218304442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250908, "dur": 5, "args": { "External id": 50082, "cbid": 211, "correlation": 50082 } }, { "ph": "s", "id": 50082, "pid": 76337, "tid": -914061504, "ts": 1716454218250908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218304602, "dur": 1, "args": { "External id": 50084, "device": 5, "context": 1, "stream": 7, "correlation": 50084, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 50084, "pid": 5, "tid": 7, "ts": 1716454218304602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218250919, "dur": 6, "args": { "External id": 50084, "cbid": 51, "correlation": 50084 } }, { "ph": "s", "id": 50084, "pid": 76337, "tid": -914061504, "ts": 1716454218250919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218304606, "dur": 262, "args": { "External id": 50085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50085, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50085, "pid": 5, "tid": 7, "ts": 1716454218304606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250926, "dur": 6, "args": { "External id": 50085, "cbid": 211, "correlation": 50085 } }, { "ph": "s", "id": 50085, "pid": 76337, "tid": -914061504, "ts": 1716454218250926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304869, "dur": 6, "args": { "External id": 50087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50087, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50087, "pid": 5, "tid": 7, "ts": 1716454218304869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250939, "dur": 5, "args": { "External id": 50087, "cbid": 211, "correlation": 50087 } }, { "ph": "s", "id": 50087, "pid": 76337, "tid": -914061504, "ts": 1716454218250939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218304877, "dur": 6, "args": { "External id": 50093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50093, "pid": 5, "tid": 7, "ts": 1716454218304877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218250969, "dur": 16, "args": { "External id": 50093, "cbid": 211, "correlation": 50093 } }, { "ph": "s", "id": 50093, "pid": 76337, "tid": -914061504, "ts": 1716454218250969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218304884, "dur": 3, "args": { "External id": 50101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 50101, "pid": 5, "tid": 7, "ts": 1716454218304884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251021, "dur": 10, "args": { "External id": 50101, "cbid": 211, "correlation": 50101 } }, { "ph": "s", "id": 50101, "pid": 76337, "tid": -914061504, "ts": 1716454218251021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218251087, "dur": 1, "args": { "External id": 50117, "cbid": 251, "correlation": 50117 } }, { "ph": "f", "id": 50117, "pid": 76337, "tid": -914061504, "ts": 1716454218251087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218251092, "dur": 0, "args": { "External id": 50119, "cbid": 251, "correlation": 50119 } }, { "ph": "f", "id": 50119, "pid": 76337, "tid": -914061504, "ts": 1716454218251092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218304889, "dur": 13, "args": { "External id": 50120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50120, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50120, "pid": 5, "tid": 7, "ts": 1716454218304889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251094, "dur": 12, "args": { "External id": 50120, "cbid": 211, "correlation": 50120 } }, { "ph": "s", "id": 50120, "pid": 76337, "tid": -914061504, "ts": 1716454218251094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218304903, "dur": 5, "args": { "External id": 50122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50122, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50122, "pid": 5, "tid": 7, "ts": 1716454218304903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251108, "dur": 5, "args": { "External id": 50122, "cbid": 211, "correlation": 50122 } }, { "ph": "s", "id": 50122, "pid": 76337, "tid": -914061504, "ts": 1716454218251108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218304909, "dur": 5, "args": { "External id": 50132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50132, "pid": 5, "tid": 7, "ts": 1716454218304909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251165, "dur": 12, "args": { "External id": 50132, "cbid": 211, "correlation": 50132 } }, { "ph": "s", "id": 50132, "pid": 76337, "tid": -914061504, "ts": 1716454218251165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218304916, "dur": 9, "args": { "External id": 50152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50152, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50152, "pid": 5, "tid": 7, "ts": 1716454218304916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251232, "dur": 10, "args": { "External id": 50152, "cbid": 211, "correlation": 50152 } }, { "ph": "s", "id": 50152, "pid": 76337, "tid": -914061504, "ts": 1716454218251232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218304926, "dur": 3, "args": { "External id": 50164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50164, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50164, "pid": 5, "tid": 7, "ts": 1716454218304926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251255, "dur": 7, "args": { "External id": 50164, "cbid": 211, "correlation": 50164 } }, { "ph": "s", "id": 50164, "pid": 76337, "tid": -914061504, "ts": 1716454218251255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218304931, "dur": 8, "args": { "External id": 50167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50167, "pid": 5, "tid": 7, "ts": 1716454218304931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251275, "dur": 6, "args": { "External id": 50167, "cbid": 211, "correlation": 50167 } }, { "ph": "s", "id": 50167, "pid": 76337, "tid": -914061504, "ts": 1716454218251275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218304941, "dur": 4, "args": { "External id": 50176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50176, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50176, "pid": 5, "tid": 7, "ts": 1716454218304941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251315, "dur": 10, "args": { "External id": 50176, "cbid": 211, "correlation": 50176 } }, { "ph": "s", "id": 50176, "pid": 76337, "tid": -914061504, "ts": 1716454218251315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218251378, "dur": 0, "args": { "External id": 50186, "cbid": 317, "correlation": 50186 } }, { "ph": "f", "id": 50186, "pid": 76337, "tid": -914061504, "ts": 1716454218251378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218251379, "dur": 0, "args": { "External id": 50187, "cbid": 203, "correlation": 50187 } }, { "ph": "f", "id": 50187, "pid": 76337, "tid": -914061504, "ts": 1716454218251379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218251380, "dur": 0, "args": { "External id": 50188, "cbid": 205, "correlation": 50188 } }, { "ph": "f", "id": 50188, "pid": 76337, "tid": -914061504, "ts": 1716454218251380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304946, "dur": 5, "args": { "External id": 50192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50192, "pid": 5, "tid": 7, "ts": 1716454218304946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251394, "dur": 12, "args": { "External id": 50192, "cbid": 211, "correlation": 50192 } }, { "ph": "s", "id": 50192, "pid": 76337, "tid": -914061504, "ts": 1716454218251394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218304952, "dur": 157, "args": { "External id": 50194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50194, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50194, "pid": 5, "tid": 7, "ts": 1716454218304952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251408, "dur": 5, "args": { "External id": 50194, "cbid": 211, "correlation": 50194 } }, { "ph": "s", "id": 50194, "pid": 76337, "tid": -914061504, "ts": 1716454218251408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218305112, "dur": 1, "args": { "External id": 50196, "device": 5, "context": 1, "stream": 7, "correlation": 50196, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 50196, "pid": 5, "tid": 7, "ts": 1716454218305112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218251419, "dur": 6, "args": { "External id": 50196, "cbid": 51, "correlation": 50196 } }, { "ph": "s", "id": 50196, "pid": 76337, "tid": -914061504, "ts": 1716454218251419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218305115, "dur": 252, "args": { "External id": 50197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50197, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50197, "pid": 5, "tid": 7, "ts": 1716454218305115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251427, "dur": 6, "args": { "External id": 50197, "cbid": 211, "correlation": 50197 } }, { "ph": "s", "id": 50197, "pid": 76337, "tid": -914061504, "ts": 1716454218251427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305368, "dur": 6, "args": { "External id": 50199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50199, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50199, "pid": 5, "tid": 7, "ts": 1716454218305368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251436, "dur": 5, "args": { "External id": 50199, "cbid": 211, "correlation": 50199 } }, { "ph": "s", "id": 50199, "pid": 76337, "tid": -914061504, "ts": 1716454218251436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218305375, "dur": 6, "args": { "External id": 50205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50205, "pid": 5, "tid": 7, "ts": 1716454218305375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251464, "dur": 9, "args": { "External id": 50205, "cbid": 211, "correlation": 50205 } }, { "ph": "s", "id": 50205, "pid": 76337, "tid": -914061504, "ts": 1716454218251464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218305383, "dur": 5, "args": { "External id": 50213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50213, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50213, "pid": 5, "tid": 7, "ts": 1716454218305383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251497, "dur": 8, "args": { "External id": 50213, "cbid": 211, "correlation": 50213 } }, { "ph": "s", "id": 50213, "pid": 76337, "tid": -914061504, "ts": 1716454218251497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218305389, "dur": 4, "args": { "External id": 50221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50221, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50221, "pid": 5, "tid": 7, "ts": 1716454218305389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251529, "dur": 9, "args": { "External id": 50221, "cbid": 211, "correlation": 50221 } }, { "ph": "s", "id": 50221, "pid": 76337, "tid": -914061504, "ts": 1716454218251529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218305394, "dur": 9, "args": { "External id": 50241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50241, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50241, "pid": 5, "tid": 7, "ts": 1716454218305394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251604, "dur": 11, "args": { "External id": 50241, "cbid": 211, "correlation": 50241 } }, { "ph": "s", "id": 50241, "pid": 76337, "tid": -914061504, "ts": 1716454218251604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218305405, "dur": 4, "args": { "External id": 50253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50253, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50253, "pid": 5, "tid": 7, "ts": 1716454218305405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251625, "dur": 6, "args": { "External id": 50253, "cbid": 211, "correlation": 50253 } }, { "ph": "s", "id": 50253, "pid": 76337, "tid": -914061504, "ts": 1716454218251625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218305409, "dur": 6, "args": { "External id": 50256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50256, "pid": 5, "tid": 7, "ts": 1716454218305409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251643, "dur": 6, "args": { "External id": 50256, "cbid": 211, "correlation": 50256 } }, { "ph": "s", "id": 50256, "pid": 76337, "tid": -914061504, "ts": 1716454218251643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218305417, "dur": 4, "args": { "External id": 50265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50265, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50265, "pid": 5, "tid": 7, "ts": 1716454218305417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251681, "dur": 10, "args": { "External id": 50265, "cbid": 211, "correlation": 50265 } }, { "ph": "s", "id": 50265, "pid": 76337, "tid": -914061504, "ts": 1716454218251681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218251732, "dur": 0, "args": { "External id": 50275, "cbid": 317, "correlation": 50275 } }, { "ph": "f", "id": 50275, "pid": 76337, "tid": -914061504, "ts": 1716454218251732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218251733, "dur": 0, "args": { "External id": 50276, "cbid": 203, "correlation": 50276 } }, { "ph": "f", "id": 50276, "pid": 76337, "tid": -914061504, "ts": 1716454218251733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218251733, "dur": 0, "args": { "External id": 50277, "cbid": 205, "correlation": 50277 } }, { "ph": "f", "id": 50277, "pid": 76337, "tid": -914061504, "ts": 1716454218251733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305423, "dur": 5, "args": { "External id": 50281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50281, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50281, "pid": 5, "tid": 7, "ts": 1716454218305423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251747, "dur": 12, "args": { "External id": 50281, "cbid": 211, "correlation": 50281 } }, { "ph": "s", "id": 50281, "pid": 76337, "tid": -914061504, "ts": 1716454218251747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305429, "dur": 158, "args": { "External id": 50283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50283, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50283, "pid": 5, "tid": 7, "ts": 1716454218305429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251761, "dur": 5, "args": { "External id": 50283, "cbid": 211, "correlation": 50283 } }, { "ph": "s", "id": 50283, "pid": 76337, "tid": -914061504, "ts": 1716454218251761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218305589, "dur": 1, "args": { "External id": 50285, "device": 5, "context": 1, "stream": 7, "correlation": 50285, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 50285, "pid": 5, "tid": 7, "ts": 1716454218305589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218251771, "dur": 6, "args": { "External id": 50285, "cbid": 51, "correlation": 50285 } }, { "ph": "s", "id": 50285, "pid": 76337, "tid": -914061504, "ts": 1716454218251771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218305593, "dur": 251, "args": { "External id": 50286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50286, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50286, "pid": 5, "tid": 7, "ts": 1716454218305593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251779, "dur": 8, "args": { "External id": 50286, "cbid": 211, "correlation": 50286 } }, { "ph": "s", "id": 50286, "pid": 76337, "tid": -914061504, "ts": 1716454218251779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305845, "dur": 6, "args": { "External id": 50288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50288, "pid": 5, "tid": 7, "ts": 1716454218305845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251791, "dur": 6, "args": { "External id": 50288, "cbid": 211, "correlation": 50288 } }, { "ph": "s", "id": 50288, "pid": 76337, "tid": -914061504, "ts": 1716454218251791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218305852, "dur": 6, "args": { "External id": 50294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50294, "pid": 5, "tid": 7, "ts": 1716454218305852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251819, "dur": 8, "args": { "External id": 50294, "cbid": 211, "correlation": 50294 } }, { "ph": "s", "id": 50294, "pid": 76337, "tid": -914061504, "ts": 1716454218251819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218305859, "dur": 3, "args": { "External id": 50302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50302, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 50302, "pid": 5, "tid": 7, "ts": 1716454218305859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251864, "dur": 9, "args": { "External id": 50302, "cbid": 211, "correlation": 50302 } }, { "ph": "s", "id": 50302, "pid": 76337, "tid": -914061504, "ts": 1716454218251864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218251927, "dur": 1, "args": { "External id": 50318, "cbid": 251, "correlation": 50318 } }, { "ph": "f", "id": 50318, "pid": 76337, "tid": -914061504, "ts": 1716454218251927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218251933, "dur": 0, "args": { "External id": 50320, "cbid": 251, "correlation": 50320 } }, { "ph": "f", "id": 50320, "pid": 76337, "tid": -914061504, "ts": 1716454218251933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218305864, "dur": 10, "args": { "External id": 50321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50321, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50321, "pid": 5, "tid": 7, "ts": 1716454218305864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251935, "dur": 11, "args": { "External id": 50321, "cbid": 211, "correlation": 50321 } }, { "ph": "s", "id": 50321, "pid": 76337, "tid": -914061504, "ts": 1716454218251935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218305875, "dur": 3, "args": { "External id": 50323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50323, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50323, "pid": 5, "tid": 7, "ts": 1716454218305875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218251948, "dur": 5, "args": { "External id": 50323, "cbid": 211, "correlation": 50323 } }, { "ph": "s", "id": 50323, "pid": 76337, "tid": -914061504, "ts": 1716454218251948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218305880, "dur": 5, "args": { "External id": 50333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50333, "pid": 5, "tid": 7, "ts": 1716454218305880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252013, "dur": 12, "args": { "External id": 50333, "cbid": 211, "correlation": 50333 } }, { "ph": "s", "id": 50333, "pid": 76337, "tid": -914061504, "ts": 1716454218252013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218305886, "dur": 9, "args": { "External id": 50353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50353, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50353, "pid": 5, "tid": 7, "ts": 1716454218305886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252079, "dur": 10, "args": { "External id": 50353, "cbid": 211, "correlation": 50353 } }, { "ph": "s", "id": 50353, "pid": 76337, "tid": -914061504, "ts": 1716454218252079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218305897, "dur": 4, "args": { "External id": 50365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50365, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50365, "pid": 5, "tid": 7, "ts": 1716454218305897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252103, "dur": 6, "args": { "External id": 50365, "cbid": 211, "correlation": 50365 } }, { "ph": "s", "id": 50365, "pid": 76337, "tid": -914061504, "ts": 1716454218252103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218305902, "dur": 6, "args": { "External id": 50368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50368, "pid": 5, "tid": 7, "ts": 1716454218305902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252122, "dur": 6, "args": { "External id": 50368, "cbid": 211, "correlation": 50368 } }, { "ph": "s", "id": 50368, "pid": 76337, "tid": -914061504, "ts": 1716454218252122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218305910, "dur": 4, "args": { "External id": 50377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50377, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50377, "pid": 5, "tid": 7, "ts": 1716454218305910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252162, "dur": 9, "args": { "External id": 50377, "cbid": 211, "correlation": 50377 } }, { "ph": "s", "id": 50377, "pid": 76337, "tid": -914061504, "ts": 1716454218252162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218252224, "dur": 0, "args": { "External id": 50387, "cbid": 317, "correlation": 50387 } }, { "ph": "f", "id": 50387, "pid": 76337, "tid": -914061504, "ts": 1716454218252224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218252225, "dur": 0, "args": { "External id": 50388, "cbid": 203, "correlation": 50388 } }, { "ph": "f", "id": 50388, "pid": 76337, "tid": -914061504, "ts": 1716454218252225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218252226, "dur": 0, "args": { "External id": 50389, "cbid": 205, "correlation": 50389 } }, { "ph": "f", "id": 50389, "pid": 76337, "tid": -914061504, "ts": 1716454218252226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305916, "dur": 5, "args": { "External id": 50393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50393, "pid": 5, "tid": 7, "ts": 1716454218305916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252240, "dur": 12, "args": { "External id": 50393, "cbid": 211, "correlation": 50393 } }, { "ph": "s", "id": 50393, "pid": 76337, "tid": -914061504, "ts": 1716454218252240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218305922, "dur": 157, "args": { "External id": 50395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50395, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50395, "pid": 5, "tid": 7, "ts": 1716454218305922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252255, "dur": 6, "args": { "External id": 50395, "cbid": 211, "correlation": 50395 } }, { "ph": "s", "id": 50395, "pid": 76337, "tid": -914061504, "ts": 1716454218252255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218306081, "dur": 1, "args": { "External id": 50397, "device": 5, "context": 1, "stream": 7, "correlation": 50397, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 50397, "pid": 5, "tid": 7, "ts": 1716454218306081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218252266, "dur": 6, "args": { "External id": 50397, "cbid": 51, "correlation": 50397 } }, { "ph": "s", "id": 50397, "pid": 76337, "tid": -914061504, "ts": 1716454218252266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218306084, "dur": 251, "args": { "External id": 50398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50398, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50398, "pid": 5, "tid": 7, "ts": 1716454218306084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252273, "dur": 6, "args": { "External id": 50398, "cbid": 211, "correlation": 50398 } }, { "ph": "s", "id": 50398, "pid": 76337, "tid": -914061504, "ts": 1716454218252273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306337, "dur": 5, "args": { "External id": 50400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50400, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50400, "pid": 5, "tid": 7, "ts": 1716454218306337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252283, "dur": 5, "args": { "External id": 50400, "cbid": 211, "correlation": 50400 } }, { "ph": "s", "id": 50400, "pid": 76337, "tid": -914061504, "ts": 1716454218252283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218306344, "dur": 6, "args": { "External id": 50406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50406, "pid": 5, "tid": 7, "ts": 1716454218306344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252313, "dur": 8, "args": { "External id": 50406, "cbid": 211, "correlation": 50406 } }, { "ph": "s", "id": 50406, "pid": 76337, "tid": -914061504, "ts": 1716454218252313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218306351, "dur": 5, "args": { "External id": 50414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50414, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50414, "pid": 5, "tid": 7, "ts": 1716454218306351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252344, "dur": 8, "args": { "External id": 50414, "cbid": 211, "correlation": 50414 } }, { "ph": "s", "id": 50414, "pid": 76337, "tid": -914061504, "ts": 1716454218252344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218306357, "dur": 4, "args": { "External id": 50422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50422, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50422, "pid": 5, "tid": 7, "ts": 1716454218306357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252377, "dur": 8, "args": { "External id": 50422, "cbid": 211, "correlation": 50422 } }, { "ph": "s", "id": 50422, "pid": 76337, "tid": -914061504, "ts": 1716454218252377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218306363, "dur": 9, "args": { "External id": 50442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50442, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50442, "pid": 5, "tid": 7, "ts": 1716454218306363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252481, "dur": 12, "args": { "External id": 50442, "cbid": 211, "correlation": 50442 } }, { "ph": "s", "id": 50442, "pid": 76337, "tid": -914061504, "ts": 1716454218252481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218306373, "dur": 3, "args": { "External id": 50454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50454, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50454, "pid": 5, "tid": 7, "ts": 1716454218306373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252504, "dur": 6, "args": { "External id": 50454, "cbid": 211, "correlation": 50454 } }, { "ph": "s", "id": 50454, "pid": 76337, "tid": -914061504, "ts": 1716454218252504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218306378, "dur": 6, "args": { "External id": 50457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50457, "pid": 5, "tid": 7, "ts": 1716454218306378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252522, "dur": 6, "args": { "External id": 50457, "cbid": 211, "correlation": 50457 } }, { "ph": "s", "id": 50457, "pid": 76337, "tid": -914061504, "ts": 1716454218252522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218306386, "dur": 4, "args": { "External id": 50466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50466, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50466, "pid": 5, "tid": 7, "ts": 1716454218306386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252559, "dur": 10, "args": { "External id": 50466, "cbid": 211, "correlation": 50466 } }, { "ph": "s", "id": 50466, "pid": 76337, "tid": -914061504, "ts": 1716454218252559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218252612, "dur": 0, "args": { "External id": 50476, "cbid": 317, "correlation": 50476 } }, { "ph": "f", "id": 50476, "pid": 76337, "tid": -914061504, "ts": 1716454218252612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218252613, "dur": 0, "args": { "External id": 50477, "cbid": 203, "correlation": 50477 } }, { "ph": "f", "id": 50477, "pid": 76337, "tid": -914061504, "ts": 1716454218252613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218252614, "dur": 0, "args": { "External id": 50478, "cbid": 205, "correlation": 50478 } }, { "ph": "f", "id": 50478, "pid": 76337, "tid": -914061504, "ts": 1716454218252614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306391, "dur": 5, "args": { "External id": 50482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50482, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50482, "pid": 5, "tid": 7, "ts": 1716454218306391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252627, "dur": 11, "args": { "External id": 50482, "cbid": 211, "correlation": 50482 } }, { "ph": "s", "id": 50482, "pid": 76337, "tid": -914061504, "ts": 1716454218252627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306398, "dur": 157, "args": { "External id": 50484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50484, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50484, "pid": 5, "tid": 7, "ts": 1716454218306398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252641, "dur": 5, "args": { "External id": 50484, "cbid": 211, "correlation": 50484 } }, { "ph": "s", "id": 50484, "pid": 76337, "tid": -914061504, "ts": 1716454218252641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218306557, "dur": 1, "args": { "External id": 50486, "device": 5, "context": 1, "stream": 7, "correlation": 50486, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 50486, "pid": 5, "tid": 7, "ts": 1716454218306557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218252651, "dur": 9, "args": { "External id": 50486, "cbid": 51, "correlation": 50486 } }, { "ph": "s", "id": 50486, "pid": 76337, "tid": -914061504, "ts": 1716454218252651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218306561, "dur": 251, "args": { "External id": 50487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50487, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50487, "pid": 5, "tid": 7, "ts": 1716454218306561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252662, "dur": 6, "args": { "External id": 50487, "cbid": 211, "correlation": 50487 } }, { "ph": "s", "id": 50487, "pid": 76337, "tid": -914061504, "ts": 1716454218252662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306813, "dur": 6, "args": { "External id": 50489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50489, "pid": 5, "tid": 7, "ts": 1716454218306813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252672, "dur": 6, "args": { "External id": 50489, "cbid": 211, "correlation": 50489 } }, { "ph": "s", "id": 50489, "pid": 76337, "tid": -914061504, "ts": 1716454218252672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218306820, "dur": 6, "args": { "External id": 50495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50495, "pid": 5, "tid": 7, "ts": 1716454218306820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252700, "dur": 8, "args": { "External id": 50495, "cbid": 211, "correlation": 50495 } }, { "ph": "s", "id": 50495, "pid": 76337, "tid": -914061504, "ts": 1716454218252700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218306828, "dur": 3, "args": { "External id": 50503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50503, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 50503, "pid": 5, "tid": 7, "ts": 1716454218306828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252744, "dur": 9, "args": { "External id": 50503, "cbid": 211, "correlation": 50503 } }, { "ph": "s", "id": 50503, "pid": 76337, "tid": -914061504, "ts": 1716454218252744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218252805, "dur": 1, "args": { "External id": 50519, "cbid": 251, "correlation": 50519 } }, { "ph": "f", "id": 50519, "pid": 76337, "tid": -914061504, "ts": 1716454218252805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218252810, "dur": 0, "args": { "External id": 50521, "cbid": 251, "correlation": 50521 } }, { "ph": "f", "id": 50521, "pid": 76337, "tid": -914061504, "ts": 1716454218252810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218306832, "dur": 10, "args": { "External id": 50522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50522, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50522, "pid": 5, "tid": 7, "ts": 1716454218306832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252812, "dur": 11, "args": { "External id": 50522, "cbid": 211, "correlation": 50522 } }, { "ph": "s", "id": 50522, "pid": 76337, "tid": -914061504, "ts": 1716454218252812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218306844, "dur": 3, "args": { "External id": 50524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50524, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50524, "pid": 5, "tid": 7, "ts": 1716454218306844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252825, "dur": 5, "args": { "External id": 50524, "cbid": 211, "correlation": 50524 } }, { "ph": "s", "id": 50524, "pid": 76337, "tid": -914061504, "ts": 1716454218252825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218306848, "dur": 5, "args": { "External id": 50534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50534, "pid": 5, "tid": 7, "ts": 1716454218306848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252882, "dur": 13, "args": { "External id": 50534, "cbid": 211, "correlation": 50534 } }, { "ph": "s", "id": 50534, "pid": 76337, "tid": -914061504, "ts": 1716454218252882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218306855, "dur": 9, "args": { "External id": 50554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50554, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50554, "pid": 5, "tid": 7, "ts": 1716454218306855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252948, "dur": 13, "args": { "External id": 50554, "cbid": 211, "correlation": 50554 } }, { "ph": "s", "id": 50554, "pid": 76337, "tid": -914061504, "ts": 1716454218252948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218306866, "dur": 4, "args": { "External id": 50566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50566, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50566, "pid": 5, "tid": 7, "ts": 1716454218306866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252971, "dur": 15, "args": { "External id": 50566, "cbid": 211, "correlation": 50566 } }, { "ph": "s", "id": 50566, "pid": 76337, "tid": -914061504, "ts": 1716454218252971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218306871, "dur": 6, "args": { "External id": 50569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50569, "pid": 5, "tid": 7, "ts": 1716454218306871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218252999, "dur": 7, "args": { "External id": 50569, "cbid": 211, "correlation": 50569 } }, { "ph": "s", "id": 50569, "pid": 76337, "tid": -914061504, "ts": 1716454218252999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218306878, "dur": 4, "args": { "External id": 50578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50578, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50578, "pid": 5, "tid": 7, "ts": 1716454218306878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253042, "dur": 10, "args": { "External id": 50578, "cbid": 211, "correlation": 50578 } }, { "ph": "s", "id": 50578, "pid": 76337, "tid": -914061504, "ts": 1716454218253042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218253105, "dur": 0, "args": { "External id": 50588, "cbid": 317, "correlation": 50588 } }, { "ph": "f", "id": 50588, "pid": 76337, "tid": -914061504, "ts": 1716454218253105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218253106, "dur": 0, "args": { "External id": 50589, "cbid": 203, "correlation": 50589 } }, { "ph": "f", "id": 50589, "pid": 76337, "tid": -914061504, "ts": 1716454218253106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218253107, "dur": 0, "args": { "External id": 50590, "cbid": 205, "correlation": 50590 } }, { "ph": "f", "id": 50590, "pid": 76337, "tid": -914061504, "ts": 1716454218253107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306884, "dur": 5, "args": { "External id": 50594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50594, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50594, "pid": 5, "tid": 7, "ts": 1716454218306884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253120, "dur": 12, "args": { "External id": 50594, "cbid": 211, "correlation": 50594 } }, { "ph": "s", "id": 50594, "pid": 76337, "tid": -914061504, "ts": 1716454218253120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218306890, "dur": 157, "args": { "External id": 50596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50596, "pid": 5, "tid": 7, "ts": 1716454218306890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253135, "dur": 6, "args": { "External id": 50596, "cbid": 211, "correlation": 50596 } }, { "ph": "s", "id": 50596, "pid": 76337, "tid": -914061504, "ts": 1716454218253135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218307049, "dur": 1, "args": { "External id": 50598, "device": 5, "context": 1, "stream": 7, "correlation": 50598, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 50598, "pid": 5, "tid": 7, "ts": 1716454218307049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218253146, "dur": 6, "args": { "External id": 50598, "cbid": 51, "correlation": 50598 } }, { "ph": "s", "id": 50598, "pid": 76337, "tid": -914061504, "ts": 1716454218253146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218307052, "dur": 250, "args": { "External id": 50599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50599, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50599, "pid": 5, "tid": 7, "ts": 1716454218307052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253153, "dur": 6, "args": { "External id": 50599, "cbid": 211, "correlation": 50599 } }, { "ph": "s", "id": 50599, "pid": 76337, "tid": -914061504, "ts": 1716454218253153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218307304, "dur": 6, "args": { "External id": 50601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50601, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50601, "pid": 5, "tid": 7, "ts": 1716454218307304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253163, "dur": 5, "args": { "External id": 50601, "cbid": 211, "correlation": 50601 } }, { "ph": "s", "id": 50601, "pid": 76337, "tid": -914061504, "ts": 1716454218253163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218307311, "dur": 6, "args": { "External id": 50607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50607, "pid": 5, "tid": 7, "ts": 1716454218307311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253191, "dur": 9, "args": { "External id": 50607, "cbid": 211, "correlation": 50607 } }, { "ph": "s", "id": 50607, "pid": 76337, "tid": -914061504, "ts": 1716454218253191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218307318, "dur": 5, "args": { "External id": 50615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50615, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50615, "pid": 5, "tid": 7, "ts": 1716454218307318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253223, "dur": 11, "args": { "External id": 50615, "cbid": 211, "correlation": 50615 } }, { "ph": "s", "id": 50615, "pid": 76337, "tid": -914061504, "ts": 1716454218253223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218307325, "dur": 4, "args": { "External id": 50623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50623, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50623, "pid": 5, "tid": 7, "ts": 1716454218307325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253256, "dur": 9, "args": { "External id": 50623, "cbid": 211, "correlation": 50623 } }, { "ph": "s", "id": 50623, "pid": 76337, "tid": -914061504, "ts": 1716454218253256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218307330, "dur": 9, "args": { "External id": 50643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50643, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 50643, "pid": 5, "tid": 7, "ts": 1716454218307330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253400, "dur": 14, "args": { "External id": 50643, "cbid": 211, "correlation": 50643 } }, { "ph": "s", "id": 50643, "pid": 76337, "tid": -914061504, "ts": 1716454218253400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218307341, "dur": 4, "args": { "External id": 50655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50655, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 50655, "pid": 5, "tid": 7, "ts": 1716454218307341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253423, "dur": 7, "args": { "External id": 50655, "cbid": 211, "correlation": 50655 } }, { "ph": "s", "id": 50655, "pid": 76337, "tid": -914061504, "ts": 1716454218253423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218307345, "dur": 6, "args": { "External id": 50658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50658, "pid": 5, "tid": 7, "ts": 1716454218307345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253442, "dur": 7, "args": { "External id": 50658, "cbid": 211, "correlation": 50658 } }, { "ph": "s", "id": 50658, "pid": 76337, "tid": -914061504, "ts": 1716454218253442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218253502, "dur": 0, "args": { "External id": 50669, "cbid": 317, "correlation": 50669 } }, { "ph": "f", "id": 50669, "pid": 76337, "tid": -914061504, "ts": 1716454218253502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218253503, "dur": 0, "args": { "External id": 50670, "cbid": 203, "correlation": 50670 } }, { "ph": "f", "id": 50670, "pid": 76337, "tid": -914061504, "ts": 1716454218253503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218253504, "dur": 0, "args": { "External id": 50671, "cbid": 205, "correlation": 50671 } }, { "ph": "f", "id": 50671, "pid": 76337, "tid": -914061504, "ts": 1716454218253504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218307353, "dur": 5, "args": { "External id": 50675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50675, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50675, "pid": 5, "tid": 7, "ts": 1716454218307353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253520, "dur": 12, "args": { "External id": 50675, "cbid": 211, "correlation": 50675 } }, { "ph": "s", "id": 50675, "pid": 76337, "tid": -914061504, "ts": 1716454218253520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218307359, "dur": 36, "args": { "External id": 50677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50677, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 50677, "pid": 5, "tid": 7, "ts": 1716454218307359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253539, "dur": 10, "args": { "External id": 50677, "cbid": 211, "correlation": 50677 } }, { "ph": "s", "id": 50677, "pid": 76337, "tid": -914061504, "ts": 1716454218253539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218307396, "dur": 5, "args": { "External id": 50679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50679, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50679, "pid": 5, "tid": 7, "ts": 1716454218307396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253554, "dur": 5, "args": { "External id": 50679, "cbid": 211, "correlation": 50679 } }, { "ph": "s", "id": 50679, "pid": 76337, "tid": -914061504, "ts": 1716454218253554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218307402, "dur": 6, "args": { "External id": 50685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50685, "pid": 5, "tid": 7, "ts": 1716454218307402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253580, "dur": 11, "args": { "External id": 50685, "cbid": 211, "correlation": 50685 } }, { "ph": "s", "id": 50685, "pid": 76337, "tid": -914061504, "ts": 1716454218253580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218307409, "dur": 19, "args": { "External id": 50694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50694, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50694, "pid": 5, "tid": 7, "ts": 1716454218307409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253666, "dur": 14, "args": { "External id": 50694, "cbid": 211, "correlation": 50694 } }, { "ph": "s", "id": 50694, "pid": 76337, "tid": -914061504, "ts": 1716454218253666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218307430, "dur": 10, "args": { "External id": 50716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50716, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 50716, "pid": 5, "tid": 7, "ts": 1716454218307430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253723, "dur": 10, "args": { "External id": 50716, "cbid": 211, "correlation": 50716 } }, { "ph": "s", "id": 50716, "pid": 76337, "tid": -914061504, "ts": 1716454218253723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253813, "dur": 2, "args": { "External id": 50727, "cbid": 251, "correlation": 50727 } }, { "ph": "f", "id": 50727, "pid": 76337, "tid": -914061504, "ts": 1716454218253813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253818, "dur": 0, "args": { "External id": 50728, "cbid": 251, "correlation": 50728 } }, { "ph": "f", "id": 50728, "pid": 76337, "tid": -914061504, "ts": 1716454218253818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218307441, "dur": 52, "args": { "External id": 50729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50729, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 50729, "pid": 5, "tid": 7, "ts": 1716454218307441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253821, "dur": 14, "args": { "External id": 50729, "cbid": 211, "correlation": 50729 } }, { "ph": "s", "id": 50729, "pid": 76337, "tid": -914061504, "ts": 1716454218253821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253892, "dur": 1, "args": { "External id": 50740, "cbid": 251, "correlation": 50740 } }, { "ph": "f", "id": 50740, "pid": 76337, "tid": -914061504, "ts": 1716454218253892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253896, "dur": 0, "args": { "External id": 50741, "cbid": 251, "correlation": 50741 } }, { "ph": "f", "id": 50741, "pid": 76337, "tid": -914061504, "ts": 1716454218253896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218307495, "dur": 52, "args": { "External id": 50742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50742, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 50742, "pid": 5, "tid": 7, "ts": 1716454218307495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253898, "dur": 12, "args": { "External id": 50742, "cbid": 211, "correlation": 50742 } }, { "ph": "s", "id": 50742, "pid": 76337, "tid": -914061504, "ts": 1716454218253898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253965, "dur": 1, "args": { "External id": 50753, "cbid": 251, "correlation": 50753 } }, { "ph": "f", "id": 50753, "pid": 76337, "tid": -914061504, "ts": 1716454218253965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218253968, "dur": 0, "args": { "External id": 50754, "cbid": 251, "correlation": 50754 } }, { "ph": "f", "id": 50754, "pid": 76337, "tid": -914061504, "ts": 1716454218253968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218307548, "dur": 52, "args": { "External id": 50755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50755, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 50755, "pid": 5, "tid": 7, "ts": 1716454218307548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218253970, "dur": 21, "args": { "External id": 50755, "cbid": 211, "correlation": 50755 } }, { "ph": "s", "id": 50755, "pid": 76337, "tid": -914061504, "ts": 1716454218253970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218307601, "dur": 55, "args": { "External id": 50780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50780, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50780, "pid": 5, "tid": 7, "ts": 1716454218307601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254068, "dur": 14, "args": { "External id": 50780, "cbid": 211, "correlation": 50780 } }, { "ph": "s", "id": 50780, "pid": 76337, "tid": -914061504, "ts": 1716454218254068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254169, "dur": 1, "args": { "External id": 50798, "cbid": 251, "correlation": 50798 } }, { "ph": "f", "id": 50798, "pid": 76337, "tid": -914061504, "ts": 1716454218254169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218307657, "dur": 62, "args": { "External id": 50800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50800, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 50800, "pid": 5, "tid": 7, "ts": 1716454218307657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254175, "dur": 13, "args": { "External id": 50800, "cbid": 211, "correlation": 50800 } }, { "ph": "s", "id": 50800, "pid": 76337, "tid": -914061504, "ts": 1716454218254175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218307721, "dur": 6, "args": { "External id": 50808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50808, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50808, "pid": 5, "tid": 7, "ts": 1716454218307721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254245, "dur": 12, "args": { "External id": 50808, "cbid": 211, "correlation": 50808 } }, { "ph": "s", "id": 50808, "pid": 76337, "tid": -914061504, "ts": 1716454218254245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218307728, "dur": 7, "args": { "External id": 50816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50816, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 50816, "pid": 5, "tid": 7, "ts": 1716454218307728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254284, "dur": 9, "args": { "External id": 50816, "cbid": 211, "correlation": 50816 } }, { "ph": "s", "id": 50816, "pid": 76337, "tid": -914061504, "ts": 1716454218254284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218307736, "dur": 7, "args": { "External id": 50827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50827, "pid": 5, "tid": 7, "ts": 1716454218307736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254358, "dur": 13, "args": { "External id": 50827, "cbid": 211, "correlation": 50827 } }, { "ph": "s", "id": 50827, "pid": 76337, "tid": -914061504, "ts": 1716454218254358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218307744, "dur": 8, "args": { "External id": 50849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50849, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 50849, "pid": 5, "tid": 7, "ts": 1716454218307744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254394, "dur": 8, "args": { "External id": 50849, "cbid": 211, "correlation": 50849 } }, { "ph": "s", "id": 50849, "pid": 76337, "tid": -914061504, "ts": 1716454218254394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254481, "dur": 2, "args": { "External id": 50860, "cbid": 251, "correlation": 50860 } }, { "ph": "f", "id": 50860, "pid": 76337, "tid": -914061504, "ts": 1716454218254481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218307754, "dur": 1, "args": { "External id": 50861, "device": 5, "context": 1, "stream": 7, "correlation": 50861, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 50861, "pid": 5, "tid": 7, "ts": 1716454218307754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218254487, "dur": 10, "args": { "External id": 50861, "cbid": 51, "correlation": 50861 } }, { "ph": "s", "id": 50861, "pid": 76337, "tid": -914061504, "ts": 1716454218254487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218307758, "dur": 36, "args": { "External id": 50862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50862, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 50862, "pid": 5, "tid": 7, "ts": 1716454218307758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254499, "dur": 13, "args": { "External id": 50862, "cbid": 211, "correlation": 50862 } }, { "ph": "s", "id": 50862, "pid": 76337, "tid": -914061504, "ts": 1716454218254499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254571, "dur": 1, "args": { "External id": 50873, "cbid": 251, "correlation": 50873 } }, { "ph": "f", "id": 50873, "pid": 76337, "tid": -914061504, "ts": 1716454218254571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254575, "dur": 0, "args": { "External id": 50874, "cbid": 251, "correlation": 50874 } }, { "ph": "f", "id": 50874, "pid": 76337, "tid": -914061504, "ts": 1716454218254575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218307795, "dur": 12, "args": { "External id": 50875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50875, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50875, "pid": 5, "tid": 7, "ts": 1716454218307795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254576, "dur": 13, "args": { "External id": 50875, "cbid": 211, "correlation": 50875 } }, { "ph": "s", "id": 50875, "pid": 76337, "tid": -914061504, "ts": 1716454218254576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218307808, "dur": 5, "args": { "External id": 50877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50877, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50877, "pid": 5, "tid": 7, "ts": 1716454218307808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254591, "dur": 6, "args": { "External id": 50877, "cbid": 211, "correlation": 50877 } }, { "ph": "s", "id": 50877, "pid": 76337, "tid": -914061504, "ts": 1716454218254591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254649, "dur": 1, "args": { "External id": 50888, "cbid": 251, "correlation": 50888 } }, { "ph": "f", "id": 50888, "pid": 76337, "tid": -914061504, "ts": 1716454218254649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254652, "dur": 0, "args": { "External id": 50889, "cbid": 251, "correlation": 50889 } }, { "ph": "f", "id": 50889, "pid": 76337, "tid": -914061504, "ts": 1716454218254652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218307815, "dur": 8, "args": { "External id": 50890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50890, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50890, "pid": 5, "tid": 7, "ts": 1716454218307815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254654, "dur": 11, "args": { "External id": 50890, "cbid": 211, "correlation": 50890 } }, { "ph": "s", "id": 50890, "pid": 76337, "tid": -914061504, "ts": 1716454218254654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218307824, "dur": 4, "args": { "External id": 50892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50892, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50892, "pid": 5, "tid": 7, "ts": 1716454218307824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254667, "dur": 5, "args": { "External id": 50892, "cbid": 211, "correlation": 50892 } }, { "ph": "s", "id": 50892, "pid": 76337, "tid": -914061504, "ts": 1716454218254667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218307829, "dur": 19, "args": { "External id": 50917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50917, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 50917, "pid": 5, "tid": 7, "ts": 1716454218307829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254748, "dur": 13, "args": { "External id": 50917, "cbid": 211, "correlation": 50917 } }, { "ph": "s", "id": 50917, "pid": 76337, "tid": -914061504, "ts": 1716454218254748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218254850, "dur": 2, "args": { "External id": 50935, "cbid": 251, "correlation": 50935 } }, { "ph": "f", "id": 50935, "pid": 76337, "tid": -914061504, "ts": 1716454218254850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218307850, "dur": 1, "args": { "External id": 50937, "device": 5, "context": 1, "stream": 7, "correlation": 50937, "bytes": 480, "memory bandwidth (GB/s)": 0.3127035830618892 } }, { "ph": "f", "id": 50937, "pid": 5, "tid": 7, "ts": 1716454218307850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218254856, "dur": 11, "args": { "External id": 50937, "cbid": 51, "correlation": 50937 } }, { "ph": "s", "id": 50937, "pid": 76337, "tid": -914061504, "ts": 1716454218254856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218307854, "dur": 36, "args": { "External id": 50938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50938, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 50938, "pid": 5, "tid": 7, "ts": 1716454218307854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254869, "dur": 13, "args": { "External id": 50938, "cbid": 211, "correlation": 50938 } }, { "ph": "s", "id": 50938, "pid": 76337, "tid": -914061504, "ts": 1716454218254869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218307891, "dur": 4, "args": { "External id": 50946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50946, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50946, "pid": 5, "tid": 7, "ts": 1716454218307891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254939, "dur": 12, "args": { "External id": 50946, "cbid": 211, "correlation": 50946 } }, { "ph": "s", "id": 50946, "pid": 76337, "tid": -914061504, "ts": 1716454218254939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218307896, "dur": 8, "args": { "External id": 50954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 50954, "pid": 5, "tid": 7, "ts": 1716454218307896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218254990, "dur": 10, "args": { "External id": 50954, "cbid": 211, "correlation": 50954 } }, { "ph": "s", "id": 50954, "pid": 76337, "tid": -914061504, "ts": 1716454218254990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218307905, "dur": 8, "args": { "External id": 50976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50976, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 50976, "pid": 5, "tid": 7, "ts": 1716454218307905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255044, "dur": 10, "args": { "External id": 50976, "cbid": 211, "correlation": 50976 } }, { "ph": "s", "id": 50976, "pid": 76337, "tid": -914061504, "ts": 1716454218255044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218255136, "dur": 1, "args": { "External id": 50992, "cbid": 251, "correlation": 50992 } }, { "ph": "f", "id": 50992, "pid": 76337, "tid": -914061504, "ts": 1716454218255136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218255144, "dur": 0, "args": { "External id": 50994, "cbid": 251, "correlation": 50994 } }, { "ph": "f", "id": 50994, "pid": 76337, "tid": -914061504, "ts": 1716454218255144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218307915, "dur": 187, "args": { "External id": 50995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 50995, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 50995, "pid": 5, "tid": 7, "ts": 1716454218307915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255146, "dur": 13, "args": { "External id": 50995, "cbid": 211, "correlation": 50995 } }, { "ph": "s", "id": 50995, "pid": 76337, "tid": -914061504, "ts": 1716454218255146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308103, "dur": 20, "args": { "External id": 51003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51003, "pid": 5, "tid": 7, "ts": 1716454218308103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255214, "dur": 13, "args": { "External id": 51003, "cbid": 211, "correlation": 51003 } }, { "ph": "s", "id": 51003, "pid": 76337, "tid": -914061504, "ts": 1716454218255214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308124, "dur": 22, "args": { "External id": 51011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51011, "pid": 5, "tid": 7, "ts": 1716454218308124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255246, "dur": 8, "args": { "External id": 51011, "cbid": 211, "correlation": 51011 } }, { "ph": "s", "id": 51011, "pid": 76337, "tid": -914061504, "ts": 1716454218255246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218255328, "dur": 1, "args": { "External id": 51027, "cbid": 251, "correlation": 51027 } }, { "ph": "f", "id": 51027, "pid": 76337, "tid": -914061504, "ts": 1716454218255328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218308148, "dur": 1, "args": { "External id": 51029, "device": 5, "context": 1, "stream": 7, "correlation": 51029, "bytes": 120, "memory bandwidth (GB/s)": 0.0797872340425532 } }, { "ph": "f", "id": 51029, "pid": 5, "tid": 7, "ts": 1716454218308148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218255333, "dur": 8, "args": { "External id": 51029, "cbid": 51, "correlation": 51029 } }, { "ph": "s", "id": 51029, "pid": 76337, "tid": -914061504, "ts": 1716454218255333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218308151, "dur": 108, "args": { "External id": 51030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51030, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 51030, "pid": 5, "tid": 7, "ts": 1716454218308151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255343, "dur": 11, "args": { "External id": 51030, "cbid": 211, "correlation": 51030 } }, { "ph": "s", "id": 51030, "pid": 76337, "tid": -914061504, "ts": 1716454218255343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218308261, "dur": 5, "args": { "External id": 51038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51038, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51038, "pid": 5, "tid": 7, "ts": 1716454218308261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255385, "dur": 10, "args": { "External id": 51038, "cbid": 211, "correlation": 51038 } }, { "ph": "s", "id": 51038, "pid": 76337, "tid": -914061504, "ts": 1716454218255385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308268, "dur": 9, "args": { "External id": 51049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51049, "pid": 5, "tid": 7, "ts": 1716454218308268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255453, "dur": 12, "args": { "External id": 51049, "cbid": 211, "correlation": 51049 } }, { "ph": "s", "id": 51049, "pid": 76337, "tid": -914061504, "ts": 1716454218255453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218255519, "dur": 0, "args": { "External id": 51061, "cbid": 317, "correlation": 51061 } }, { "ph": "f", "id": 51061, "pid": 76337, "tid": -914061504, "ts": 1716454218255519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218255520, "dur": 0, "args": { "External id": 51062, "cbid": 203, "correlation": 51062 } }, { "ph": "f", "id": 51062, "pid": 76337, "tid": -914061504, "ts": 1716454218255520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218255521, "dur": 0, "args": { "External id": 51063, "cbid": 205, "correlation": 51063 } }, { "ph": "f", "id": 51063, "pid": 76337, "tid": -914061504, "ts": 1716454218255521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308278, "dur": 5, "args": { "External id": 51067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51067, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51067, "pid": 5, "tid": 7, "ts": 1716454218308278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255537, "dur": 15, "args": { "External id": 51067, "cbid": 211, "correlation": 51067 } }, { "ph": "s", "id": 51067, "pid": 76337, "tid": -914061504, "ts": 1716454218255537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218308285, "dur": 36, "args": { "External id": 51069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51069, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 51069, "pid": 5, "tid": 7, "ts": 1716454218308285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255559, "dur": 8, "args": { "External id": 51069, "cbid": 211, "correlation": 51069 } }, { "ph": "s", "id": 51069, "pid": 76337, "tid": -914061504, "ts": 1716454218255559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308322, "dur": 5, "args": { "External id": 51071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51071, "pid": 5, "tid": 7, "ts": 1716454218308322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255571, "dur": 6, "args": { "External id": 51071, "cbid": 211, "correlation": 51071 } }, { "ph": "s", "id": 51071, "pid": 76337, "tid": -914061504, "ts": 1716454218255571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308329, "dur": 7, "args": { "External id": 51077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51077, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51077, "pid": 5, "tid": 7, "ts": 1716454218308329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255599, "dur": 9, "args": { "External id": 51077, "cbid": 211, "correlation": 51077 } }, { "ph": "s", "id": 51077, "pid": 76337, "tid": -914061504, "ts": 1716454218255599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218308337, "dur": 5, "args": { "External id": 51085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51085, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51085, "pid": 5, "tid": 7, "ts": 1716454218308337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255632, "dur": 8, "args": { "External id": 51085, "cbid": 211, "correlation": 51085 } }, { "ph": "s", "id": 51085, "pid": 76337, "tid": -914061504, "ts": 1716454218255632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218308343, "dur": 10, "args": { "External id": 51105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51105, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51105, "pid": 5, "tid": 7, "ts": 1716454218308343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255704, "dur": 12, "args": { "External id": 51105, "cbid": 211, "correlation": 51105 } }, { "ph": "s", "id": 51105, "pid": 76337, "tid": -914061504, "ts": 1716454218255704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218308355, "dur": 4, "args": { "External id": 51117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51117, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 51117, "pid": 5, "tid": 7, "ts": 1716454218308355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255725, "dur": 7, "args": { "External id": 51117, "cbid": 211, "correlation": 51117 } }, { "ph": "s", "id": 51117, "pid": 76337, "tid": -914061504, "ts": 1716454218255725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308361, "dur": 8, "args": { "External id": 51120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51120, "pid": 5, "tid": 7, "ts": 1716454218308361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255743, "dur": 6, "args": { "External id": 51120, "cbid": 211, "correlation": 51120 } }, { "ph": "s", "id": 51120, "pid": 76337, "tid": -914061504, "ts": 1716454218255743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218308370, "dur": 5, "args": { "External id": 51129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51129, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51129, "pid": 5, "tid": 7, "ts": 1716454218308370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255782, "dur": 10, "args": { "External id": 51129, "cbid": 211, "correlation": 51129 } }, { "ph": "s", "id": 51129, "pid": 76337, "tid": -914061504, "ts": 1716454218255782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218255838, "dur": 0, "args": { "External id": 51139, "cbid": 317, "correlation": 51139 } }, { "ph": "f", "id": 51139, "pid": 76337, "tid": -914061504, "ts": 1716454218255838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218255838, "dur": 0, "args": { "External id": 51140, "cbid": 203, "correlation": 51140 } }, { "ph": "f", "id": 51140, "pid": 76337, "tid": -914061504, "ts": 1716454218255838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218255839, "dur": 0, "args": { "External id": 51141, "cbid": 205, "correlation": 51141 } }, { "ph": "f", "id": 51141, "pid": 76337, "tid": -914061504, "ts": 1716454218255839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308377, "dur": 5, "args": { "External id": 51145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51145, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51145, "pid": 5, "tid": 7, "ts": 1716454218308377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255852, "dur": 11, "args": { "External id": 51145, "cbid": 211, "correlation": 51145 } }, { "ph": "s", "id": 51145, "pid": 76337, "tid": -914061504, "ts": 1716454218255852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308383, "dur": 157, "args": { "External id": 51147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51147, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51147, "pid": 5, "tid": 7, "ts": 1716454218308383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255866, "dur": 5, "args": { "External id": 51147, "cbid": 211, "correlation": 51147 } }, { "ph": "s", "id": 51147, "pid": 76337, "tid": -914061504, "ts": 1716454218255866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218308542, "dur": 1, "args": { "External id": 51149, "device": 5, "context": 1, "stream": 7, "correlation": 51149, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 51149, "pid": 5, "tid": 7, "ts": 1716454218308542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218255877, "dur": 7, "args": { "External id": 51149, "cbid": 51, "correlation": 51149 } }, { "ph": "s", "id": 51149, "pid": 76337, "tid": -914061504, "ts": 1716454218255877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218308546, "dur": 262, "args": { "External id": 51150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51150, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51150, "pid": 5, "tid": 7, "ts": 1716454218308546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255885, "dur": 6, "args": { "External id": 51150, "cbid": 211, "correlation": 51150 } }, { "ph": "s", "id": 51150, "pid": 76337, "tid": -914061504, "ts": 1716454218255885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308809, "dur": 6, "args": { "External id": 51152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51152, "pid": 5, "tid": 7, "ts": 1716454218308809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255895, "dur": 5, "args": { "External id": 51152, "cbid": 211, "correlation": 51152 } }, { "ph": "s", "id": 51152, "pid": 76337, "tid": -914061504, "ts": 1716454218255895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308816, "dur": 6, "args": { "External id": 51158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51158, "pid": 5, "tid": 7, "ts": 1716454218308816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255922, "dur": 8, "args": { "External id": 51158, "cbid": 211, "correlation": 51158 } }, { "ph": "s", "id": 51158, "pid": 76337, "tid": -914061504, "ts": 1716454218255922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218308824, "dur": 3, "args": { "External id": 51166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51166, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 51166, "pid": 5, "tid": 7, "ts": 1716454218308824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218255966, "dur": 17, "args": { "External id": 51166, "cbid": 211, "correlation": 51166 } }, { "ph": "s", "id": 51166, "pid": 76337, "tid": -914061504, "ts": 1716454218255966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218256039, "dur": 1, "args": { "External id": 51182, "cbid": 251, "correlation": 51182 } }, { "ph": "f", "id": 51182, "pid": 76337, "tid": -914061504, "ts": 1716454218256039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218256044, "dur": 0, "args": { "External id": 51184, "cbid": 251, "correlation": 51184 } }, { "ph": "f", "id": 51184, "pid": 76337, "tid": -914061504, "ts": 1716454218256044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218308828, "dur": 13, "args": { "External id": 51185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51185, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51185, "pid": 5, "tid": 7, "ts": 1716454218308828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256046, "dur": 11, "args": { "External id": 51185, "cbid": 211, "correlation": 51185 } }, { "ph": "s", "id": 51185, "pid": 76337, "tid": -914061504, "ts": 1716454218256046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218308842, "dur": 5, "args": { "External id": 51187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51187, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51187, "pid": 5, "tid": 7, "ts": 1716454218308842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256059, "dur": 6, "args": { "External id": 51187, "cbid": 211, "correlation": 51187 } }, { "ph": "s", "id": 51187, "pid": 76337, "tid": -914061504, "ts": 1716454218256059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308848, "dur": 5, "args": { "External id": 51197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51197, "pid": 5, "tid": 7, "ts": 1716454218308848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256120, "dur": 12, "args": { "External id": 51197, "cbid": 211, "correlation": 51197 } }, { "ph": "s", "id": 51197, "pid": 76337, "tid": -914061504, "ts": 1716454218256120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218308855, "dur": 9, "args": { "External id": 51217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51217, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51217, "pid": 5, "tid": 7, "ts": 1716454218308855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256187, "dur": 11, "args": { "External id": 51217, "cbid": 211, "correlation": 51217 } }, { "ph": "s", "id": 51217, "pid": 76337, "tid": -914061504, "ts": 1716454218256187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218308865, "dur": 3, "args": { "External id": 51229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51229, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 51229, "pid": 5, "tid": 7, "ts": 1716454218308865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256207, "dur": 6, "args": { "External id": 51229, "cbid": 211, "correlation": 51229 } }, { "ph": "s", "id": 51229, "pid": 76337, "tid": -914061504, "ts": 1716454218256207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218308870, "dur": 6, "args": { "External id": 51232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51232, "pid": 5, "tid": 7, "ts": 1716454218308870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256226, "dur": 6, "args": { "External id": 51232, "cbid": 211, "correlation": 51232 } }, { "ph": "s", "id": 51232, "pid": 76337, "tid": -914061504, "ts": 1716454218256226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218308878, "dur": 4, "args": { "External id": 51241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51241, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51241, "pid": 5, "tid": 7, "ts": 1716454218308878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256266, "dur": 10, "args": { "External id": 51241, "cbid": 211, "correlation": 51241 } }, { "ph": "s", "id": 51241, "pid": 76337, "tid": -914061504, "ts": 1716454218256266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218256329, "dur": 0, "args": { "External id": 51251, "cbid": 317, "correlation": 51251 } }, { "ph": "f", "id": 51251, "pid": 76337, "tid": -914061504, "ts": 1716454218256329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218256330, "dur": 0, "args": { "External id": 51252, "cbid": 203, "correlation": 51252 } }, { "ph": "f", "id": 51252, "pid": 76337, "tid": -914061504, "ts": 1716454218256330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218256330, "dur": 0, "args": { "External id": 51253, "cbid": 205, "correlation": 51253 } }, { "ph": "f", "id": 51253, "pid": 76337, "tid": -914061504, "ts": 1716454218256330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308884, "dur": 4, "args": { "External id": 51257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51257, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51257, "pid": 5, "tid": 7, "ts": 1716454218308884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256345, "dur": 12, "args": { "External id": 51257, "cbid": 211, "correlation": 51257 } }, { "ph": "s", "id": 51257, "pid": 76337, "tid": -914061504, "ts": 1716454218256345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218308889, "dur": 157, "args": { "External id": 51259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51259, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51259, "pid": 5, "tid": 7, "ts": 1716454218308889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256359, "dur": 5, "args": { "External id": 51259, "cbid": 211, "correlation": 51259 } }, { "ph": "s", "id": 51259, "pid": 76337, "tid": -914061504, "ts": 1716454218256359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218309048, "dur": 1, "args": { "External id": 51261, "device": 5, "context": 1, "stream": 7, "correlation": 51261, "bytes": 240, "memory bandwidth (GB/s)": 0.14714898835070508 } }, { "ph": "f", "id": 51261, "pid": 5, "tid": 7, "ts": 1716454218309048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218256370, "dur": 6, "args": { "External id": 51261, "cbid": 51, "correlation": 51261 } }, { "ph": "s", "id": 51261, "pid": 76337, "tid": -914061504, "ts": 1716454218256370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218309052, "dur": 251, "args": { "External id": 51262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51262, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51262, "pid": 5, "tid": 7, "ts": 1716454218309052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256377, "dur": 9, "args": { "External id": 51262, "cbid": 211, "correlation": 51262 } }, { "ph": "s", "id": 51262, "pid": 76337, "tid": -914061504, "ts": 1716454218256377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218309304, "dur": 6, "args": { "External id": 51264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51264, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51264, "pid": 5, "tid": 7, "ts": 1716454218309304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256391, "dur": 6, "args": { "External id": 51264, "cbid": 211, "correlation": 51264 } }, { "ph": "s", "id": 51264, "pid": 76337, "tid": -914061504, "ts": 1716454218256391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218309311, "dur": 6, "args": { "External id": 51270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51270, "pid": 5, "tid": 7, "ts": 1716454218309311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256420, "dur": 8, "args": { "External id": 51270, "cbid": 211, "correlation": 51270 } }, { "ph": "s", "id": 51270, "pid": 76337, "tid": -914061504, "ts": 1716454218256420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218309319, "dur": 4, "args": { "External id": 51278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51278, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51278, "pid": 5, "tid": 7, "ts": 1716454218309319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256453, "dur": 8, "args": { "External id": 51278, "cbid": 211, "correlation": 51278 } }, { "ph": "s", "id": 51278, "pid": 76337, "tid": -914061504, "ts": 1716454218256453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218309324, "dur": 4, "args": { "External id": 51286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51286, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51286, "pid": 5, "tid": 7, "ts": 1716454218309324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256482, "dur": 9, "args": { "External id": 51286, "cbid": 211, "correlation": 51286 } }, { "ph": "s", "id": 51286, "pid": 76337, "tid": -914061504, "ts": 1716454218256482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218309330, "dur": 11, "args": { "External id": 51295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51295, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51295, "pid": 5, "tid": 7, "ts": 1716454218309330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256572, "dur": 13, "args": { "External id": 51295, "cbid": 211, "correlation": 51295 } }, { "ph": "s", "id": 51295, "pid": 76337, "tid": -914061504, "ts": 1716454218256572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218309343, "dur": 12, "args": { "External id": 51315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51315, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51315, "pid": 5, "tid": 7, "ts": 1716454218309343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256640, "dur": 11, "args": { "External id": 51315, "cbid": 211, "correlation": 51315 } }, { "ph": "s", "id": 51315, "pid": 76337, "tid": -914061504, "ts": 1716454218256640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218309356, "dur": 4, "args": { "External id": 51327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51327, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51327, "pid": 5, "tid": 7, "ts": 1716454218309356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256662, "dur": 6, "args": { "External id": 51327, "cbid": 211, "correlation": 51327 } }, { "ph": "s", "id": 51327, "pid": 76337, "tid": -914061504, "ts": 1716454218256662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218309361, "dur": 10, "args": { "External id": 51330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51330, "pid": 5, "tid": 7, "ts": 1716454218309361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256680, "dur": 10, "args": { "External id": 51330, "cbid": 211, "correlation": 51330 } }, { "ph": "s", "id": 51330, "pid": 76337, "tid": -914061504, "ts": 1716454218256680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218309371, "dur": 6, "args": { "External id": 51339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51339, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51339, "pid": 5, "tid": 7, "ts": 1716454218309371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256724, "dur": 10, "args": { "External id": 51339, "cbid": 211, "correlation": 51339 } }, { "ph": "s", "id": 51339, "pid": 76337, "tid": -914061504, "ts": 1716454218256724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218256776, "dur": 0, "args": { "External id": 51349, "cbid": 317, "correlation": 51349 } }, { "ph": "f", "id": 51349, "pid": 76337, "tid": -914061504, "ts": 1716454218256776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218256777, "dur": 0, "args": { "External id": 51350, "cbid": 203, "correlation": 51350 } }, { "ph": "f", "id": 51350, "pid": 76337, "tid": -914061504, "ts": 1716454218256777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218256778, "dur": 0, "args": { "External id": 51351, "cbid": 205, "correlation": 51351 } }, { "ph": "f", "id": 51351, "pid": 76337, "tid": -914061504, "ts": 1716454218256778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218309379, "dur": 6, "args": { "External id": 51355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51355, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51355, "pid": 5, "tid": 7, "ts": 1716454218309379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256793, "dur": 11, "args": { "External id": 51355, "cbid": 211, "correlation": 51355 } }, { "ph": "s", "id": 51355, "pid": 76337, "tid": -914061504, "ts": 1716454218256793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218309386, "dur": 311, "args": { "External id": 51357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51357, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51357, "pid": 5, "tid": 7, "ts": 1716454218309386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256807, "dur": 6, "args": { "External id": 51357, "cbid": 211, "correlation": 51357 } }, { "ph": "s", "id": 51357, "pid": 76337, "tid": -914061504, "ts": 1716454218256807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218309700, "dur": 1, "args": { "External id": 51359, "device": 5, "context": 1, "stream": 7, "correlation": 51359, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 51359, "pid": 5, "tid": 7, "ts": 1716454218309700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218256819, "dur": 8, "args": { "External id": 51359, "cbid": 51, "correlation": 51359 } }, { "ph": "s", "id": 51359, "pid": 76337, "tid": -914061504, "ts": 1716454218256819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218309703, "dur": 483, "args": { "External id": 51360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51360, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51360, "pid": 5, "tid": 7, "ts": 1716454218309703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256827, "dur": 6, "args": { "External id": 51360, "cbid": 211, "correlation": 51360 } }, { "ph": "s", "id": 51360, "pid": 76337, "tid": -914061504, "ts": 1716454218256827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310188, "dur": 6, "args": { "External id": 51362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51362, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51362, "pid": 5, "tid": 7, "ts": 1716454218310188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256837, "dur": 5, "args": { "External id": 51362, "cbid": 211, "correlation": 51362 } }, { "ph": "s", "id": 51362, "pid": 76337, "tid": -914061504, "ts": 1716454218256837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310194, "dur": 6, "args": { "External id": 51368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51368, "pid": 5, "tid": 7, "ts": 1716454218310194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256867, "dur": 8, "args": { "External id": 51368, "cbid": 211, "correlation": 51368 } }, { "ph": "s", "id": 51368, "pid": 76337, "tid": -914061504, "ts": 1716454218256867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218310202, "dur": 3, "args": { "External id": 51376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51376, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 51376, "pid": 5, "tid": 7, "ts": 1716454218310202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256910, "dur": 10, "args": { "External id": 51376, "cbid": 211, "correlation": 51376 } }, { "ph": "s", "id": 51376, "pid": 76337, "tid": -914061504, "ts": 1716454218256910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218256981, "dur": 1, "args": { "External id": 51392, "cbid": 251, "correlation": 51392 } }, { "ph": "f", "id": 51392, "pid": 76337, "tid": -914061504, "ts": 1716454218256981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218256987, "dur": 0, "args": { "External id": 51394, "cbid": 251, "correlation": 51394 } }, { "ph": "f", "id": 51394, "pid": 76337, "tid": -914061504, "ts": 1716454218256987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218310206, "dur": 11, "args": { "External id": 51395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51395, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51395, "pid": 5, "tid": 7, "ts": 1716454218310206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218256989, "dur": 14, "args": { "External id": 51395, "cbid": 211, "correlation": 51395 } }, { "ph": "s", "id": 51395, "pid": 76337, "tid": -914061504, "ts": 1716454218256989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218310218, "dur": 4, "args": { "External id": 51397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51397, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51397, "pid": 5, "tid": 7, "ts": 1716454218310218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257005, "dur": 6, "args": { "External id": 51397, "cbid": 211, "correlation": 51397 } }, { "ph": "s", "id": 51397, "pid": 76337, "tid": -914061504, "ts": 1716454218257005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310224, "dur": 6, "args": { "External id": 51407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51407, "pid": 5, "tid": 7, "ts": 1716454218310224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257064, "dur": 12, "args": { "External id": 51407, "cbid": 211, "correlation": 51407 } }, { "ph": "s", "id": 51407, "pid": 76337, "tid": -914061504, "ts": 1716454218257064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218310231, "dur": 9, "args": { "External id": 51427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51427, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51427, "pid": 5, "tid": 7, "ts": 1716454218310231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257130, "dur": 11, "args": { "External id": 51427, "cbid": 211, "correlation": 51427 } }, { "ph": "s", "id": 51427, "pid": 76337, "tid": -914061504, "ts": 1716454218257130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218310241, "dur": 3, "args": { "External id": 51439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51439, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 51439, "pid": 5, "tid": 7, "ts": 1716454218310241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257150, "dur": 6, "args": { "External id": 51439, "cbid": 211, "correlation": 51439 } }, { "ph": "s", "id": 51439, "pid": 76337, "tid": -914061504, "ts": 1716454218257150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310246, "dur": 6, "args": { "External id": 51442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51442, "pid": 5, "tid": 7, "ts": 1716454218310246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257169, "dur": 6, "args": { "External id": 51442, "cbid": 211, "correlation": 51442 } }, { "ph": "s", "id": 51442, "pid": 76337, "tid": -914061504, "ts": 1716454218257169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218310254, "dur": 4, "args": { "External id": 51451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51451, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51451, "pid": 5, "tid": 7, "ts": 1716454218310254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257209, "dur": 10, "args": { "External id": 51451, "cbid": 211, "correlation": 51451 } }, { "ph": "s", "id": 51451, "pid": 76337, "tid": -914061504, "ts": 1716454218257209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218257271, "dur": 0, "args": { "External id": 51461, "cbid": 317, "correlation": 51461 } }, { "ph": "f", "id": 51461, "pid": 76337, "tid": -914061504, "ts": 1716454218257271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218257272, "dur": 0, "args": { "External id": 51462, "cbid": 203, "correlation": 51462 } }, { "ph": "f", "id": 51462, "pid": 76337, "tid": -914061504, "ts": 1716454218257272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218257273, "dur": 0, "args": { "External id": 51463, "cbid": 205, "correlation": 51463 } }, { "ph": "f", "id": 51463, "pid": 76337, "tid": -914061504, "ts": 1716454218257273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310260, "dur": 5, "args": { "External id": 51467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51467, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51467, "pid": 5, "tid": 7, "ts": 1716454218310260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257287, "dur": 15, "args": { "External id": 51467, "cbid": 211, "correlation": 51467 } }, { "ph": "s", "id": 51467, "pid": 76337, "tid": -914061504, "ts": 1716454218257287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310266, "dur": 157, "args": { "External id": 51469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51469, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51469, "pid": 5, "tid": 7, "ts": 1716454218310266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257305, "dur": 5, "args": { "External id": 51469, "cbid": 211, "correlation": 51469 } }, { "ph": "s", "id": 51469, "pid": 76337, "tid": -914061504, "ts": 1716454218257305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218310425, "dur": 1, "args": { "External id": 51471, "device": 5, "context": 1, "stream": 7, "correlation": 51471, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 51471, "pid": 5, "tid": 7, "ts": 1716454218310425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218257315, "dur": 6, "args": { "External id": 51471, "cbid": 51, "correlation": 51471 } }, { "ph": "s", "id": 51471, "pid": 76337, "tid": -914061504, "ts": 1716454218257315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218310429, "dur": 251, "args": { "External id": 51472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51472, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51472, "pid": 5, "tid": 7, "ts": 1716454218310429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257323, "dur": 7, "args": { "External id": 51472, "cbid": 211, "correlation": 51472 } }, { "ph": "s", "id": 51472, "pid": 76337, "tid": -914061504, "ts": 1716454218257323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310681, "dur": 5, "args": { "External id": 51474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51474, "pid": 5, "tid": 7, "ts": 1716454218310681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257334, "dur": 5, "args": { "External id": 51474, "cbid": 211, "correlation": 51474 } }, { "ph": "s", "id": 51474, "pid": 76337, "tid": -914061504, "ts": 1716454218257334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310687, "dur": 6, "args": { "External id": 51480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51480, "pid": 5, "tid": 7, "ts": 1716454218310687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257362, "dur": 8, "args": { "External id": 51480, "cbid": 211, "correlation": 51480 } }, { "ph": "s", "id": 51480, "pid": 76337, "tid": -914061504, "ts": 1716454218257362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218257420, "dur": 0, "args": { "External id": 51490, "cbid": 317, "correlation": 51490 } }, { "ph": "f", "id": 51490, "pid": 76337, "tid": -914061504, "ts": 1716454218257420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218257421, "dur": 0, "args": { "External id": 51491, "cbid": 203, "correlation": 51491 } }, { "ph": "f", "id": 51491, "pid": 76337, "tid": -914061504, "ts": 1716454218257421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218257422, "dur": 0, "args": { "External id": 51492, "cbid": 205, "correlation": 51492 } }, { "ph": "f", "id": 51492, "pid": 76337, "tid": -914061504, "ts": 1716454218257422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310695, "dur": 8, "args": { "External id": 51496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51496, "pid": 5, "tid": 7, "ts": 1716454218310695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257437, "dur": 12, "args": { "External id": 51496, "cbid": 211, "correlation": 51496 } }, { "ph": "s", "id": 51496, "pid": 76337, "tid": -914061504, "ts": 1716454218257437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218310704, "dur": 3, "args": { "External id": 51498, "device": 5, "context": 1, "stream": 7, "correlation": 51498, "bytes": 4800, "memory bandwidth (GB/s)": 1.5463917525773196 } }, { "ph": "f", "id": 51498, "pid": 5, "tid": 7, "ts": 1716454218310704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218257455, "dur": 14, "args": { "External id": 51498, "cbid": 51, "correlation": 51498 } }, { "ph": "s", "id": 51498, "pid": 76337, "tid": -914061504, "ts": 1716454218257455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218310708, "dur": 93, "args": { "External id": 51499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51499, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 51499, "pid": 5, "tid": 7, "ts": 1716454218310708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257471, "dur": 7, "args": { "External id": 51499, "cbid": 211, "correlation": 51499 } }, { "ph": "s", "id": 51499, "pid": 76337, "tid": -914061504, "ts": 1716454218257471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310803, "dur": 6, "args": { "External id": 51501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51501, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51501, "pid": 5, "tid": 7, "ts": 1716454218310803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257481, "dur": 5, "args": { "External id": 51501, "cbid": 211, "correlation": 51501 } }, { "ph": "s", "id": 51501, "pid": 76337, "tid": -914061504, "ts": 1716454218257481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310810, "dur": 6, "args": { "External id": 51507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51507, "pid": 5, "tid": 7, "ts": 1716454218310810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257508, "dur": 8, "args": { "External id": 51507, "cbid": 211, "correlation": 51507 } }, { "ph": "s", "id": 51507, "pid": 76337, "tid": -914061504, "ts": 1716454218257508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218310817, "dur": 5, "args": { "External id": 51515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51515, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51515, "pid": 5, "tid": 7, "ts": 1716454218310817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257537, "dur": 8, "args": { "External id": 51515, "cbid": 211, "correlation": 51515 } }, { "ph": "s", "id": 51515, "pid": 76337, "tid": -914061504, "ts": 1716454218257537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218310823, "dur": 4, "args": { "External id": 51523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51523, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51523, "pid": 5, "tid": 7, "ts": 1716454218310823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257569, "dur": 8, "args": { "External id": 51523, "cbid": 211, "correlation": 51523 } }, { "ph": "s", "id": 51523, "pid": 76337, "tid": -914061504, "ts": 1716454218257569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218310829, "dur": 11, "args": { "External id": 51532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51532, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51532, "pid": 5, "tid": 7, "ts": 1716454218310829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257614, "dur": 10, "args": { "External id": 51532, "cbid": 211, "correlation": 51532 } }, { "ph": "s", "id": 51532, "pid": 76337, "tid": -914061504, "ts": 1716454218257614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218310841, "dur": 12, "args": { "External id": 51552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51552, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51552, "pid": 5, "tid": 7, "ts": 1716454218310841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257683, "dur": 12, "args": { "External id": 51552, "cbid": 211, "correlation": 51552 } }, { "ph": "s", "id": 51552, "pid": 76337, "tid": -914061504, "ts": 1716454218257683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218310854, "dur": 4, "args": { "External id": 51564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51564, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51564, "pid": 5, "tid": 7, "ts": 1716454218310854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257704, "dur": 6, "args": { "External id": 51564, "cbid": 211, "correlation": 51564 } }, { "ph": "s", "id": 51564, "pid": 76337, "tid": -914061504, "ts": 1716454218257704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218310859, "dur": 10, "args": { "External id": 51567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51567, "pid": 5, "tid": 7, "ts": 1716454218310859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257722, "dur": 7, "args": { "External id": 51567, "cbid": 211, "correlation": 51567 } }, { "ph": "s", "id": 51567, "pid": 76337, "tid": -914061504, "ts": 1716454218257722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218310870, "dur": 6, "args": { "External id": 51576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51576, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51576, "pid": 5, "tid": 7, "ts": 1716454218310870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257761, "dur": 10, "args": { "External id": 51576, "cbid": 211, "correlation": 51576 } }, { "ph": "s", "id": 51576, "pid": 76337, "tid": -914061504, "ts": 1716454218257761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218257812, "dur": 0, "args": { "External id": 51586, "cbid": 317, "correlation": 51586 } }, { "ph": "f", "id": 51586, "pid": 76337, "tid": -914061504, "ts": 1716454218257812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218257813, "dur": 0, "args": { "External id": 51587, "cbid": 203, "correlation": 51587 } }, { "ph": "f", "id": 51587, "pid": 76337, "tid": -914061504, "ts": 1716454218257813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218257813, "dur": 0, "args": { "External id": 51588, "cbid": 205, "correlation": 51588 } }, { "ph": "f", "id": 51588, "pid": 76337, "tid": -914061504, "ts": 1716454218257813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310877, "dur": 6, "args": { "External id": 51592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51592, "pid": 5, "tid": 7, "ts": 1716454218310877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257827, "dur": 11, "args": { "External id": 51592, "cbid": 211, "correlation": 51592 } }, { "ph": "s", "id": 51592, "pid": 76337, "tid": -914061504, "ts": 1716454218257827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218310885, "dur": 309, "args": { "External id": 51594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51594, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51594, "pid": 5, "tid": 7, "ts": 1716454218310885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257844, "dur": 6, "args": { "External id": 51594, "cbid": 211, "correlation": 51594 } }, { "ph": "s", "id": 51594, "pid": 76337, "tid": -914061504, "ts": 1716454218257844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218311196, "dur": 1, "args": { "External id": 51596, "device": 5, "context": 1, "stream": 7, "correlation": 51596, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 51596, "pid": 5, "tid": 7, "ts": 1716454218311196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218257856, "dur": 6, "args": { "External id": 51596, "cbid": 51, "correlation": 51596 } }, { "ph": "s", "id": 51596, "pid": 76337, "tid": -914061504, "ts": 1716454218257856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218311200, "dur": 485, "args": { "External id": 51597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51597, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51597, "pid": 5, "tid": 7, "ts": 1716454218311200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257863, "dur": 6, "args": { "External id": 51597, "cbid": 211, "correlation": 51597 } }, { "ph": "s", "id": 51597, "pid": 76337, "tid": -914061504, "ts": 1716454218257863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218311687, "dur": 5, "args": { "External id": 51599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51599, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51599, "pid": 5, "tid": 7, "ts": 1716454218311687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257874, "dur": 5, "args": { "External id": 51599, "cbid": 211, "correlation": 51599 } }, { "ph": "s", "id": 51599, "pid": 76337, "tid": -914061504, "ts": 1716454218257874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218311693, "dur": 6, "args": { "External id": 51605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51605, "pid": 5, "tid": 7, "ts": 1716454218311693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257903, "dur": 8, "args": { "External id": 51605, "cbid": 211, "correlation": 51605 } }, { "ph": "s", "id": 51605, "pid": 76337, "tid": -914061504, "ts": 1716454218257903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218311701, "dur": 3, "args": { "External id": 51613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51613, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 51613, "pid": 5, "tid": 7, "ts": 1716454218311701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218257946, "dur": 9, "args": { "External id": 51613, "cbid": 211, "correlation": 51613 } }, { "ph": "s", "id": 51613, "pid": 76337, "tid": -914061504, "ts": 1716454218257946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218258016, "dur": 1, "args": { "External id": 51629, "cbid": 251, "correlation": 51629 } }, { "ph": "f", "id": 51629, "pid": 76337, "tid": -914061504, "ts": 1716454218258016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218258022, "dur": 0, "args": { "External id": 51631, "cbid": 251, "correlation": 51631 } }, { "ph": "f", "id": 51631, "pid": 76337, "tid": -914061504, "ts": 1716454218258022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218311705, "dur": 12, "args": { "External id": 51632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51632, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51632, "pid": 5, "tid": 7, "ts": 1716454218311705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258023, "dur": 12, "args": { "External id": 51632, "cbid": 211, "correlation": 51632 } }, { "ph": "s", "id": 51632, "pid": 76337, "tid": -914061504, "ts": 1716454218258023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218311719, "dur": 5, "args": { "External id": 51634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51634, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51634, "pid": 5, "tid": 7, "ts": 1716454218311719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258037, "dur": 5, "args": { "External id": 51634, "cbid": 211, "correlation": 51634 } }, { "ph": "s", "id": 51634, "pid": 76337, "tid": -914061504, "ts": 1716454218258037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218311725, "dur": 6, "args": { "External id": 51644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51644, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51644, "pid": 5, "tid": 7, "ts": 1716454218311725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258093, "dur": 13, "args": { "External id": 51644, "cbid": 211, "correlation": 51644 } }, { "ph": "s", "id": 51644, "pid": 76337, "tid": -914061504, "ts": 1716454218258093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218311732, "dur": 9, "args": { "External id": 51664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51664, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51664, "pid": 5, "tid": 7, "ts": 1716454218311732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258163, "dur": 11, "args": { "External id": 51664, "cbid": 211, "correlation": 51664 } }, { "ph": "s", "id": 51664, "pid": 76337, "tid": -914061504, "ts": 1716454218258163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218311743, "dur": 3, "args": { "External id": 51676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51676, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 51676, "pid": 5, "tid": 7, "ts": 1716454218311743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258184, "dur": 6, "args": { "External id": 51676, "cbid": 211, "correlation": 51676 } }, { "ph": "s", "id": 51676, "pid": 76337, "tid": -914061504, "ts": 1716454218258184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218311747, "dur": 6, "args": { "External id": 51679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51679, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51679, "pid": 5, "tid": 7, "ts": 1716454218311747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258203, "dur": 6, "args": { "External id": 51679, "cbid": 211, "correlation": 51679 } }, { "ph": "s", "id": 51679, "pid": 76337, "tid": -914061504, "ts": 1716454218258203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218311755, "dur": 4, "args": { "External id": 51688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51688, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51688, "pid": 5, "tid": 7, "ts": 1716454218311755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258243, "dur": 10, "args": { "External id": 51688, "cbid": 211, "correlation": 51688 } }, { "ph": "s", "id": 51688, "pid": 76337, "tid": -914061504, "ts": 1716454218258243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218258307, "dur": 0, "args": { "External id": 51698, "cbid": 317, "correlation": 51698 } }, { "ph": "f", "id": 51698, "pid": 76337, "tid": -914061504, "ts": 1716454218258307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218258308, "dur": 0, "args": { "External id": 51699, "cbid": 203, "correlation": 51699 } }, { "ph": "f", "id": 51699, "pid": 76337, "tid": -914061504, "ts": 1716454218258308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218258309, "dur": 0, "args": { "External id": 51700, "cbid": 205, "correlation": 51700 } }, { "ph": "f", "id": 51700, "pid": 76337, "tid": -914061504, "ts": 1716454218258309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218311761, "dur": 5, "args": { "External id": 51704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51704, "pid": 5, "tid": 7, "ts": 1716454218311761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258322, "dur": 12, "args": { "External id": 51704, "cbid": 211, "correlation": 51704 } }, { "ph": "s", "id": 51704, "pid": 76337, "tid": -914061504, "ts": 1716454218258322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218311767, "dur": 157, "args": { "External id": 51706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51706, "pid": 5, "tid": 7, "ts": 1716454218311767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258336, "dur": 5, "args": { "External id": 51706, "cbid": 211, "correlation": 51706 } }, { "ph": "s", "id": 51706, "pid": 76337, "tid": -914061504, "ts": 1716454218258336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218311926, "dur": 1, "args": { "External id": 51708, "device": 5, "context": 1, "stream": 7, "correlation": 51708, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 51708, "pid": 5, "tid": 7, "ts": 1716454218311926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218258347, "dur": 6, "args": { "External id": 51708, "cbid": 51, "correlation": 51708 } }, { "ph": "s", "id": 51708, "pid": 76337, "tid": -914061504, "ts": 1716454218258347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218311929, "dur": 252, "args": { "External id": 51709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51709, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51709, "pid": 5, "tid": 7, "ts": 1716454218311929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258354, "dur": 7, "args": { "External id": 51709, "cbid": 211, "correlation": 51709 } }, { "ph": "s", "id": 51709, "pid": 76337, "tid": -914061504, "ts": 1716454218258354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218312182, "dur": 6, "args": { "External id": 51711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51711, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51711, "pid": 5, "tid": 7, "ts": 1716454218312182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258364, "dur": 5, "args": { "External id": 51711, "cbid": 211, "correlation": 51711 } }, { "ph": "s", "id": 51711, "pid": 76337, "tid": -914061504, "ts": 1716454218258364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218312189, "dur": 6, "args": { "External id": 51717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51717, "pid": 5, "tid": 7, "ts": 1716454218312189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258392, "dur": 8, "args": { "External id": 51717, "cbid": 211, "correlation": 51717 } }, { "ph": "s", "id": 51717, "pid": 76337, "tid": -914061504, "ts": 1716454218258392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218258454, "dur": 0, "args": { "External id": 51727, "cbid": 317, "correlation": 51727 } }, { "ph": "f", "id": 51727, "pid": 76337, "tid": -914061504, "ts": 1716454218258454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218258455, "dur": 0, "args": { "External id": 51728, "cbid": 203, "correlation": 51728 } }, { "ph": "f", "id": 51728, "pid": 76337, "tid": -914061504, "ts": 1716454218258455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218258455, "dur": 0, "args": { "External id": 51729, "cbid": 205, "correlation": 51729 } }, { "ph": "f", "id": 51729, "pid": 76337, "tid": -914061504, "ts": 1716454218258455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218312197, "dur": 8, "args": { "External id": 51733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51733, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51733, "pid": 5, "tid": 7, "ts": 1716454218312197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258468, "dur": 12, "args": { "External id": 51733, "cbid": 211, "correlation": 51733 } }, { "ph": "s", "id": 51733, "pid": 76337, "tid": -914061504, "ts": 1716454218258468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218312206, "dur": 3, "args": { "External id": 51735, "device": 5, "context": 1, "stream": 7, "correlation": 51735, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 51735, "pid": 5, "tid": 7, "ts": 1716454218312206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218258485, "dur": 10, "args": { "External id": 51735, "cbid": 51, "correlation": 51735 } }, { "ph": "s", "id": 51735, "pid": 76337, "tid": -914061504, "ts": 1716454218258485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218312210, "dur": 92, "args": { "External id": 51736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51736, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 51736, "pid": 5, "tid": 7, "ts": 1716454218312210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258495, "dur": 6, "args": { "External id": 51736, "cbid": 211, "correlation": 51736 } }, { "ph": "s", "id": 51736, "pid": 76337, "tid": -914061504, "ts": 1716454218258495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218312303, "dur": 6, "args": { "External id": 51738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51738, "pid": 5, "tid": 7, "ts": 1716454218312303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258505, "dur": 5, "args": { "External id": 51738, "cbid": 211, "correlation": 51738 } }, { "ph": "s", "id": 51738, "pid": 76337, "tid": -914061504, "ts": 1716454218258505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218312310, "dur": 6, "args": { "External id": 51744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51744, "pid": 5, "tid": 7, "ts": 1716454218312310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258532, "dur": 9, "args": { "External id": 51744, "cbid": 211, "correlation": 51744 } }, { "ph": "s", "id": 51744, "pid": 76337, "tid": -914061504, "ts": 1716454218258532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218312317, "dur": 5, "args": { "External id": 51752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51752, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51752, "pid": 5, "tid": 7, "ts": 1716454218312317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258561, "dur": 9, "args": { "External id": 51752, "cbid": 211, "correlation": 51752 } }, { "ph": "s", "id": 51752, "pid": 76337, "tid": -914061504, "ts": 1716454218258561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218312323, "dur": 4, "args": { "External id": 51760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51760, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51760, "pid": 5, "tid": 7, "ts": 1716454218312323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258590, "dur": 8, "args": { "External id": 51760, "cbid": 211, "correlation": 51760 } }, { "ph": "s", "id": 51760, "pid": 76337, "tid": -914061504, "ts": 1716454218258590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218312329, "dur": 11, "args": { "External id": 51769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51769, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51769, "pid": 5, "tid": 7, "ts": 1716454218312329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258634, "dur": 10, "args": { "External id": 51769, "cbid": 211, "correlation": 51769 } }, { "ph": "s", "id": 51769, "pid": 76337, "tid": -914061504, "ts": 1716454218258634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218312341, "dur": 12, "args": { "External id": 51789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51789, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51789, "pid": 5, "tid": 7, "ts": 1716454218312341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258706, "dur": 12, "args": { "External id": 51789, "cbid": 211, "correlation": 51789 } }, { "ph": "s", "id": 51789, "pid": 76337, "tid": -914061504, "ts": 1716454218258706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218312354, "dur": 4, "args": { "External id": 51801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51801, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51801, "pid": 5, "tid": 7, "ts": 1716454218312354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258729, "dur": 6, "args": { "External id": 51801, "cbid": 211, "correlation": 51801 } }, { "ph": "s", "id": 51801, "pid": 76337, "tid": -914061504, "ts": 1716454218258729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218312359, "dur": 10, "args": { "External id": 51804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51804, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51804, "pid": 5, "tid": 7, "ts": 1716454218312359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258747, "dur": 7, "args": { "External id": 51804, "cbid": 211, "correlation": 51804 } }, { "ph": "s", "id": 51804, "pid": 76337, "tid": -914061504, "ts": 1716454218258747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218312371, "dur": 6, "args": { "External id": 51813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51813, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51813, "pid": 5, "tid": 7, "ts": 1716454218312371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258786, "dur": 10, "args": { "External id": 51813, "cbid": 211, "correlation": 51813 } }, { "ph": "s", "id": 51813, "pid": 76337, "tid": -914061504, "ts": 1716454218258786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218258839, "dur": 0, "args": { "External id": 51823, "cbid": 317, "correlation": 51823 } }, { "ph": "f", "id": 51823, "pid": 76337, "tid": -914061504, "ts": 1716454218258839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218258840, "dur": 0, "args": { "External id": 51824, "cbid": 203, "correlation": 51824 } }, { "ph": "f", "id": 51824, "pid": 76337, "tid": -914061504, "ts": 1716454218258840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218258841, "dur": 0, "args": { "External id": 51825, "cbid": 205, "correlation": 51825 } }, { "ph": "f", "id": 51825, "pid": 76337, "tid": -914061504, "ts": 1716454218258841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218312378, "dur": 6, "args": { "External id": 51829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51829, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51829, "pid": 5, "tid": 7, "ts": 1716454218312378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258854, "dur": 12, "args": { "External id": 51829, "cbid": 211, "correlation": 51829 } }, { "ph": "s", "id": 51829, "pid": 76337, "tid": -914061504, "ts": 1716454218258854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218312386, "dur": 309, "args": { "External id": 51831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51831, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51831, "pid": 5, "tid": 7, "ts": 1716454218312386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258868, "dur": 5, "args": { "External id": 51831, "cbid": 211, "correlation": 51831 } }, { "ph": "s", "id": 51831, "pid": 76337, "tid": -914061504, "ts": 1716454218258868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218312697, "dur": 1, "args": { "External id": 51833, "device": 5, "context": 1, "stream": 7, "correlation": 51833, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 51833, "pid": 5, "tid": 7, "ts": 1716454218312697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218258879, "dur": 7, "args": { "External id": 51833, "cbid": 51, "correlation": 51833 } }, { "ph": "s", "id": 51833, "pid": 76337, "tid": -914061504, "ts": 1716454218258879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218312700, "dur": 484, "args": { "External id": 51834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51834, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51834, "pid": 5, "tid": 7, "ts": 1716454218312700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258886, "dur": 6, "args": { "External id": 51834, "cbid": 211, "correlation": 51834 } }, { "ph": "s", "id": 51834, "pid": 76337, "tid": -914061504, "ts": 1716454218258886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313186, "dur": 6, "args": { "External id": 51836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51836, "pid": 5, "tid": 7, "ts": 1716454218313186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258897, "dur": 5, "args": { "External id": 51836, "cbid": 211, "correlation": 51836 } }, { "ph": "s", "id": 51836, "pid": 76337, "tid": -914061504, "ts": 1716454218258897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218313193, "dur": 6, "args": { "External id": 51842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51842, "pid": 5, "tid": 7, "ts": 1716454218313193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258925, "dur": 8, "args": { "External id": 51842, "cbid": 211, "correlation": 51842 } }, { "ph": "s", "id": 51842, "pid": 76337, "tid": -914061504, "ts": 1716454218258925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218313200, "dur": 3, "args": { "External id": 51850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51850, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 51850, "pid": 5, "tid": 7, "ts": 1716454218313200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218258971, "dur": 19, "args": { "External id": 51850, "cbid": 211, "correlation": 51850 } }, { "ph": "s", "id": 51850, "pid": 76337, "tid": -914061504, "ts": 1716454218258971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218259046, "dur": 1, "args": { "External id": 51866, "cbid": 251, "correlation": 51866 } }, { "ph": "f", "id": 51866, "pid": 76337, "tid": -914061504, "ts": 1716454218259046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218259052, "dur": 0, "args": { "External id": 51868, "cbid": 251, "correlation": 51868 } }, { "ph": "f", "id": 51868, "pid": 76337, "tid": -914061504, "ts": 1716454218259052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218313204, "dur": 13, "args": { "External id": 51869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51869, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51869, "pid": 5, "tid": 7, "ts": 1716454218313204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259054, "dur": 11, "args": { "External id": 51869, "cbid": 211, "correlation": 51869 } }, { "ph": "s", "id": 51869, "pid": 76337, "tid": -914061504, "ts": 1716454218259054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218313218, "dur": 5, "args": { "External id": 51871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51871, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51871, "pid": 5, "tid": 7, "ts": 1716454218313218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259067, "dur": 5, "args": { "External id": 51871, "cbid": 211, "correlation": 51871 } }, { "ph": "s", "id": 51871, "pid": 76337, "tid": -914061504, "ts": 1716454218259067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218313225, "dur": 6, "args": { "External id": 51881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51881, "pid": 5, "tid": 7, "ts": 1716454218313225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259124, "dur": 12, "args": { "External id": 51881, "cbid": 211, "correlation": 51881 } }, { "ph": "s", "id": 51881, "pid": 76337, "tid": -914061504, "ts": 1716454218259124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218313232, "dur": 9, "args": { "External id": 51901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51901, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 51901, "pid": 5, "tid": 7, "ts": 1716454218313232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259191, "dur": 12, "args": { "External id": 51901, "cbid": 211, "correlation": 51901 } }, { "ph": "s", "id": 51901, "pid": 76337, "tid": -914061504, "ts": 1716454218259191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218313242, "dur": 3, "args": { "External id": 51913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51913, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 51913, "pid": 5, "tid": 7, "ts": 1716454218313242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259212, "dur": 6, "args": { "External id": 51913, "cbid": 211, "correlation": 51913 } }, { "ph": "s", "id": 51913, "pid": 76337, "tid": -914061504, "ts": 1716454218259212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218313247, "dur": 7, "args": { "External id": 51916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51916, "pid": 5, "tid": 7, "ts": 1716454218313247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259230, "dur": 6, "args": { "External id": 51916, "cbid": 211, "correlation": 51916 } }, { "ph": "s", "id": 51916, "pid": 76337, "tid": -914061504, "ts": 1716454218259230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218313255, "dur": 4, "args": { "External id": 51925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51925, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51925, "pid": 5, "tid": 7, "ts": 1716454218313255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259274, "dur": 10, "args": { "External id": 51925, "cbid": 211, "correlation": 51925 } }, { "ph": "s", "id": 51925, "pid": 76337, "tid": -914061504, "ts": 1716454218259274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218259338, "dur": 0, "args": { "External id": 51935, "cbid": 317, "correlation": 51935 } }, { "ph": "f", "id": 51935, "pid": 76337, "tid": -914061504, "ts": 1716454218259338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218259338, "dur": 0, "args": { "External id": 51936, "cbid": 203, "correlation": 51936 } }, { "ph": "f", "id": 51936, "pid": 76337, "tid": -914061504, "ts": 1716454218259338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218259339, "dur": 0, "args": { "External id": 51937, "cbid": 205, "correlation": 51937 } }, { "ph": "f", "id": 51937, "pid": 76337, "tid": -914061504, "ts": 1716454218259339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313261, "dur": 5, "args": { "External id": 51941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51941, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51941, "pid": 5, "tid": 7, "ts": 1716454218313261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259353, "dur": 13, "args": { "External id": 51941, "cbid": 211, "correlation": 51941 } }, { "ph": "s", "id": 51941, "pid": 76337, "tid": -914061504, "ts": 1716454218259353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313267, "dur": 157, "args": { "External id": 51943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51943, "pid": 5, "tid": 7, "ts": 1716454218313267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259368, "dur": 5, "args": { "External id": 51943, "cbid": 211, "correlation": 51943 } }, { "ph": "s", "id": 51943, "pid": 76337, "tid": -914061504, "ts": 1716454218259368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218313426, "dur": 1, "args": { "External id": 51945, "device": 5, "context": 1, "stream": 7, "correlation": 51945, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 51945, "pid": 5, "tid": 7, "ts": 1716454218313426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218259378, "dur": 6, "args": { "External id": 51945, "cbid": 51, "correlation": 51945 } }, { "ph": "s", "id": 51945, "pid": 76337, "tid": -914061504, "ts": 1716454218259378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218313430, "dur": 251, "args": { "External id": 51946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51946, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 51946, "pid": 5, "tid": 7, "ts": 1716454218313430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259386, "dur": 6, "args": { "External id": 51946, "cbid": 211, "correlation": 51946 } }, { "ph": "s", "id": 51946, "pid": 76337, "tid": -914061504, "ts": 1716454218259386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313682, "dur": 6, "args": { "External id": 51948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51948, "pid": 5, "tid": 7, "ts": 1716454218313682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259395, "dur": 5, "args": { "External id": 51948, "cbid": 211, "correlation": 51948 } }, { "ph": "s", "id": 51948, "pid": 76337, "tid": -914061504, "ts": 1716454218259395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218313689, "dur": 6, "args": { "External id": 51954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51954, "pid": 5, "tid": 7, "ts": 1716454218313689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259424, "dur": 8, "args": { "External id": 51954, "cbid": 211, "correlation": 51954 } }, { "ph": "s", "id": 51954, "pid": 76337, "tid": -914061504, "ts": 1716454218259424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218259483, "dur": 0, "args": { "External id": 51964, "cbid": 317, "correlation": 51964 } }, { "ph": "f", "id": 51964, "pid": 76337, "tid": -914061504, "ts": 1716454218259483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218259484, "dur": 0, "args": { "External id": 51965, "cbid": 203, "correlation": 51965 } }, { "ph": "f", "id": 51965, "pid": 76337, "tid": -914061504, "ts": 1716454218259484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218259484, "dur": 0, "args": { "External id": 51966, "cbid": 205, "correlation": 51966 } }, { "ph": "f", "id": 51966, "pid": 76337, "tid": -914061504, "ts": 1716454218259484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313696, "dur": 7, "args": { "External id": 51970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51970, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51970, "pid": 5, "tid": 7, "ts": 1716454218313696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259496, "dur": 11, "args": { "External id": 51970, "cbid": 211, "correlation": 51970 } }, { "ph": "s", "id": 51970, "pid": 76337, "tid": -914061504, "ts": 1716454218259496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218313705, "dur": 3, "args": { "External id": 51972, "device": 5, "context": 1, "stream": 7, "correlation": 51972, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 51972, "pid": 5, "tid": 7, "ts": 1716454218313705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218259512, "dur": 9, "args": { "External id": 51972, "cbid": 51, "correlation": 51972 } }, { "ph": "s", "id": 51972, "pid": 76337, "tid": -914061504, "ts": 1716454218259512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218313709, "dur": 92, "args": { "External id": 51973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51973, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 51973, "pid": 5, "tid": 7, "ts": 1716454218313709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259523, "dur": 6, "args": { "External id": 51973, "cbid": 211, "correlation": 51973 } }, { "ph": "s", "id": 51973, "pid": 76337, "tid": -914061504, "ts": 1716454218259523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313802, "dur": 5, "args": { "External id": 51975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51975, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 51975, "pid": 5, "tid": 7, "ts": 1716454218313802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259532, "dur": 5, "args": { "External id": 51975, "cbid": 211, "correlation": 51975 } }, { "ph": "s", "id": 51975, "pid": 76337, "tid": -914061504, "ts": 1716454218259532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218313809, "dur": 6, "args": { "External id": 51981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51981, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51981, "pid": 5, "tid": 7, "ts": 1716454218313809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259562, "dur": 10, "args": { "External id": 51981, "cbid": 211, "correlation": 51981 } }, { "ph": "s", "id": 51981, "pid": 76337, "tid": -914061504, "ts": 1716454218259562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218313817, "dur": 5, "args": { "External id": 51989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51989, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51989, "pid": 5, "tid": 7, "ts": 1716454218313817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259592, "dur": 8, "args": { "External id": 51989, "cbid": 211, "correlation": 51989 } }, { "ph": "s", "id": 51989, "pid": 76337, "tid": -914061504, "ts": 1716454218259592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218313823, "dur": 4, "args": { "External id": 51997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 51997, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 51997, "pid": 5, "tid": 7, "ts": 1716454218313823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259621, "dur": 8, "args": { "External id": 51997, "cbid": 211, "correlation": 51997 } }, { "ph": "s", "id": 51997, "pid": 76337, "tid": -914061504, "ts": 1716454218259621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218313828, "dur": 14, "args": { "External id": 52008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52008, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52008, "pid": 5, "tid": 7, "ts": 1716454218313828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259702, "dur": 14, "args": { "External id": 52008, "cbid": 211, "correlation": 52008 } }, { "ph": "s", "id": 52008, "pid": 76337, "tid": -914061504, "ts": 1716454218259702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218259759, "dur": 0, "args": { "External id": 52018, "cbid": 317, "correlation": 52018 } }, { "ph": "f", "id": 52018, "pid": 76337, "tid": -914061504, "ts": 1716454218259759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218259760, "dur": 0, "args": { "External id": 52019, "cbid": 203, "correlation": 52019 } }, { "ph": "f", "id": 52019, "pid": 76337, "tid": -914061504, "ts": 1716454218259760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218259760, "dur": 0, "args": { "External id": 52020, "cbid": 205, "correlation": 52020 } }, { "ph": "f", "id": 52020, "pid": 76337, "tid": -914061504, "ts": 1716454218259760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313844, "dur": 8, "args": { "External id": 52024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52024, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52024, "pid": 5, "tid": 7, "ts": 1716454218313844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259775, "dur": 11, "args": { "External id": 52024, "cbid": 211, "correlation": 52024 } }, { "ph": "s", "id": 52024, "pid": 76337, "tid": -914061504, "ts": 1716454218259775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218313854, "dur": 158, "args": { "External id": 52026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52026, "pid": 5, "tid": 7, "ts": 1716454218313854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259789, "dur": 5, "args": { "External id": 52026, "cbid": 211, "correlation": 52026 } }, { "ph": "s", "id": 52026, "pid": 76337, "tid": -914061504, "ts": 1716454218259789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218314014, "dur": 1, "args": { "External id": 52028, "device": 5, "context": 1, "stream": 7, "correlation": 52028, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 52028, "pid": 5, "tid": 7, "ts": 1716454218314014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218259800, "dur": 6, "args": { "External id": 52028, "cbid": 51, "correlation": 52028 } }, { "ph": "s", "id": 52028, "pid": 76337, "tid": -914061504, "ts": 1716454218259800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218314018, "dur": 640, "args": { "External id": 52029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52029, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52029, "pid": 5, "tid": 7, "ts": 1716454218314018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259807, "dur": 6, "args": { "External id": 52029, "cbid": 211, "correlation": 52029 } }, { "ph": "s", "id": 52029, "pid": 76337, "tid": -914061504, "ts": 1716454218259807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218314659, "dur": 12, "args": { "External id": 52031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52031, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52031, "pid": 5, "tid": 7, "ts": 1716454218314659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259817, "dur": 6, "args": { "External id": 52031, "cbid": 211, "correlation": 52031 } }, { "ph": "s", "id": 52031, "pid": 76337, "tid": -914061504, "ts": 1716454218259817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218314672, "dur": 15, "args": { "External id": 52037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52037, "pid": 5, "tid": 7, "ts": 1716454218314672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259846, "dur": 8, "args": { "External id": 52037, "cbid": 211, "correlation": 52037 } }, { "ph": "s", "id": 52037, "pid": 76337, "tid": -914061504, "ts": 1716454218259846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218314688, "dur": 29, "args": { "External id": 52046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52046, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52046, "pid": 5, "tid": 7, "ts": 1716454218314688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218259939, "dur": 13, "args": { "External id": 52046, "cbid": 211, "correlation": 52046 } }, { "ph": "s", "id": 52046, "pid": 76337, "tid": -914061504, "ts": 1716454218259939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218314718, "dur": 30, "args": { "External id": 52066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52066, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52066, "pid": 5, "tid": 7, "ts": 1716454218314718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260016, "dur": 12, "args": { "External id": 52066, "cbid": 211, "correlation": 52066 } }, { "ph": "s", "id": 52066, "pid": 76337, "tid": -914061504, "ts": 1716454218260016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218314749, "dur": 4, "args": { "External id": 52078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52078, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52078, "pid": 5, "tid": 7, "ts": 1716454218314749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260039, "dur": 6, "args": { "External id": 52078, "cbid": 211, "correlation": 52078 } }, { "ph": "s", "id": 52078, "pid": 76337, "tid": -914061504, "ts": 1716454218260039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218314755, "dur": 29, "args": { "External id": 52081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52081, "pid": 5, "tid": 7, "ts": 1716454218314755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260057, "dur": 7, "args": { "External id": 52081, "cbid": 211, "correlation": 52081 } }, { "ph": "s", "id": 52081, "pid": 76337, "tid": -914061504, "ts": 1716454218260057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218314785, "dur": 20, "args": { "External id": 52090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52090, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52090, "pid": 5, "tid": 7, "ts": 1716454218314785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260097, "dur": 9, "args": { "External id": 52090, "cbid": 211, "correlation": 52090 } }, { "ph": "s", "id": 52090, "pid": 76337, "tid": -914061504, "ts": 1716454218260097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218260149, "dur": 0, "args": { "External id": 52100, "cbid": 317, "correlation": 52100 } }, { "ph": "f", "id": 52100, "pid": 76337, "tid": -914061504, "ts": 1716454218260149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218260150, "dur": 0, "args": { "External id": 52101, "cbid": 203, "correlation": 52101 } }, { "ph": "f", "id": 52101, "pid": 76337, "tid": -914061504, "ts": 1716454218260150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218260151, "dur": 0, "args": { "External id": 52102, "cbid": 205, "correlation": 52102 } }, { "ph": "f", "id": 52102, "pid": 76337, "tid": -914061504, "ts": 1716454218260151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218314807, "dur": 22, "args": { "External id": 52106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52106, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52106, "pid": 5, "tid": 7, "ts": 1716454218314807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260169, "dur": 12, "args": { "External id": 52106, "cbid": 211, "correlation": 52106 } }, { "ph": "s", "id": 52106, "pid": 76337, "tid": -914061504, "ts": 1716454218260169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218314830, "dur": 317, "args": { "External id": 52108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52108, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52108, "pid": 5, "tid": 7, "ts": 1716454218314830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260183, "dur": 6, "args": { "External id": 52108, "cbid": 211, "correlation": 52108 } }, { "ph": "s", "id": 52108, "pid": 76337, "tid": -914061504, "ts": 1716454218260183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218315149, "dur": 1, "args": { "External id": 52110, "device": 5, "context": 1, "stream": 7, "correlation": 52110, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 52110, "pid": 5, "tid": 7, "ts": 1716454218315149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218260195, "dur": 8, "args": { "External id": 52110, "cbid": 51, "correlation": 52110 } }, { "ph": "s", "id": 52110, "pid": 76337, "tid": -914061504, "ts": 1716454218260195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218315153, "dur": 1229, "args": { "External id": 52111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52111, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52111, "pid": 5, "tid": 7, "ts": 1716454218315153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260204, "dur": 6, "args": { "External id": 52111, "cbid": 211, "correlation": 52111 } }, { "ph": "s", "id": 52111, "pid": 76337, "tid": -914061504, "ts": 1716454218260204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218316384, "dur": 13, "args": { "External id": 52113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52113, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52113, "pid": 5, "tid": 7, "ts": 1716454218316384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260214, "dur": 5, "args": { "External id": 52113, "cbid": 211, "correlation": 52113 } }, { "ph": "s", "id": 52113, "pid": 76337, "tid": -914061504, "ts": 1716454218260214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218316398, "dur": 15, "args": { "External id": 52119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52119, "pid": 5, "tid": 7, "ts": 1716454218316398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260243, "dur": 8, "args": { "External id": 52119, "cbid": 211, "correlation": 52119 } }, { "ph": "s", "id": 52119, "pid": 76337, "tid": -914061504, "ts": 1716454218260243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218316415, "dur": 3, "args": { "External id": 52127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52127, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 52127, "pid": 5, "tid": 7, "ts": 1716454218316415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260286, "dur": 10, "args": { "External id": 52127, "cbid": 211, "correlation": 52127 } }, { "ph": "s", "id": 52127, "pid": 76337, "tid": -914061504, "ts": 1716454218260286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218260351, "dur": 1, "args": { "External id": 52143, "cbid": 251, "correlation": 52143 } }, { "ph": "f", "id": 52143, "pid": 76337, "tid": -914061504, "ts": 1716454218260351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218260356, "dur": 0, "args": { "External id": 52145, "cbid": 251, "correlation": 52145 } }, { "ph": "f", "id": 52145, "pid": 76337, "tid": -914061504, "ts": 1716454218260356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218316419, "dur": 12, "args": { "External id": 52146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52146, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52146, "pid": 5, "tid": 7, "ts": 1716454218316419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260358, "dur": 11, "args": { "External id": 52146, "cbid": 211, "correlation": 52146 } }, { "ph": "s", "id": 52146, "pid": 76337, "tid": -914061504, "ts": 1716454218260358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218316433, "dur": 5, "args": { "External id": 52148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52148, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52148, "pid": 5, "tid": 7, "ts": 1716454218316433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260371, "dur": 6, "args": { "External id": 52148, "cbid": 211, "correlation": 52148 } }, { "ph": "s", "id": 52148, "pid": 76337, "tid": -914061504, "ts": 1716454218260371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218316439, "dur": 17, "args": { "External id": 52158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52158, "pid": 5, "tid": 7, "ts": 1716454218316439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260427, "dur": 12, "args": { "External id": 52158, "cbid": 211, "correlation": 52158 } }, { "ph": "s", "id": 52158, "pid": 76337, "tid": -914061504, "ts": 1716454218260427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218316457, "dur": 17, "args": { "External id": 52178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52178, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52178, "pid": 5, "tid": 7, "ts": 1716454218316457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260493, "dur": 14, "args": { "External id": 52178, "cbid": 211, "correlation": 52178 } }, { "ph": "s", "id": 52178, "pid": 76337, "tid": -914061504, "ts": 1716454218260493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218316476, "dur": 4, "args": { "External id": 52190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52190, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 52190, "pid": 5, "tid": 7, "ts": 1716454218316476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260517, "dur": 6, "args": { "External id": 52190, "cbid": 211, "correlation": 52190 } }, { "ph": "s", "id": 52190, "pid": 76337, "tid": -914061504, "ts": 1716454218260517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218316481, "dur": 16, "args": { "External id": 52193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52193, "pid": 5, "tid": 7, "ts": 1716454218316481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260535, "dur": 6, "args": { "External id": 52193, "cbid": 211, "correlation": 52193 } }, { "ph": "s", "id": 52193, "pid": 76337, "tid": -914061504, "ts": 1716454218260535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218316499, "dur": 11, "args": { "External id": 52202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52202, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52202, "pid": 5, "tid": 7, "ts": 1716454218316499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260576, "dur": 9, "args": { "External id": 52202, "cbid": 211, "correlation": 52202 } }, { "ph": "s", "id": 52202, "pid": 76337, "tid": -914061504, "ts": 1716454218260576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218260638, "dur": 0, "args": { "External id": 52212, "cbid": 317, "correlation": 52212 } }, { "ph": "f", "id": 52212, "pid": 76337, "tid": -914061504, "ts": 1716454218260638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218260639, "dur": 0, "args": { "External id": 52213, "cbid": 203, "correlation": 52213 } }, { "ph": "f", "id": 52213, "pid": 76337, "tid": -914061504, "ts": 1716454218260639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218260640, "dur": 0, "args": { "External id": 52214, "cbid": 205, "correlation": 52214 } }, { "ph": "f", "id": 52214, "pid": 76337, "tid": -914061504, "ts": 1716454218260640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218316511, "dur": 12, "args": { "External id": 52218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52218, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52218, "pid": 5, "tid": 7, "ts": 1716454218316511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260654, "dur": 12, "args": { "External id": 52218, "cbid": 211, "correlation": 52218 } }, { "ph": "s", "id": 52218, "pid": 76337, "tid": -914061504, "ts": 1716454218260654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218316524, "dur": 160, "args": { "External id": 52220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52220, "pid": 5, "tid": 7, "ts": 1716454218316524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260668, "dur": 5, "args": { "External id": 52220, "cbid": 211, "correlation": 52220 } }, { "ph": "s", "id": 52220, "pid": 76337, "tid": -914061504, "ts": 1716454218260668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218316687, "dur": 1, "args": { "External id": 52222, "device": 5, "context": 1, "stream": 7, "correlation": 52222, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 52222, "pid": 5, "tid": 7, "ts": 1716454218316687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218260679, "dur": 6, "args": { "External id": 52222, "cbid": 51, "correlation": 52222 } }, { "ph": "s", "id": 52222, "pid": 76337, "tid": -914061504, "ts": 1716454218260679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218316690, "dur": 644, "args": { "External id": 52223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52223, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52223, "pid": 5, "tid": 7, "ts": 1716454218316690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260686, "dur": 6, "args": { "External id": 52223, "cbid": 211, "correlation": 52223 } }, { "ph": "s", "id": 52223, "pid": 76337, "tid": -914061504, "ts": 1716454218260686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218317335, "dur": 12, "args": { "External id": 52225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52225, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52225, "pid": 5, "tid": 7, "ts": 1716454218317335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260696, "dur": 5, "args": { "External id": 52225, "cbid": 211, "correlation": 52225 } }, { "ph": "s", "id": 52225, "pid": 76337, "tid": -914061504, "ts": 1716454218260696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218317349, "dur": 14, "args": { "External id": 52231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52231, "pid": 5, "tid": 7, "ts": 1716454218317349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260724, "dur": 8, "args": { "External id": 52231, "cbid": 211, "correlation": 52231 } }, { "ph": "s", "id": 52231, "pid": 76337, "tid": -914061504, "ts": 1716454218260724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218260783, "dur": 3, "args": { "External id": 52241, "cbid": 317, "correlation": 52241 } }, { "ph": "f", "id": 52241, "pid": 76337, "tid": -914061504, "ts": 1716454218260783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218260786, "dur": 0, "args": { "External id": 52242, "cbid": 203, "correlation": 52242 } }, { "ph": "f", "id": 52242, "pid": 76337, "tid": -914061504, "ts": 1716454218260786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218260787, "dur": 0, "args": { "External id": 52243, "cbid": 205, "correlation": 52243 } }, { "ph": "f", "id": 52243, "pid": 76337, "tid": -914061504, "ts": 1716454218260787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218317365, "dur": 21, "args": { "External id": 52247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52247, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52247, "pid": 5, "tid": 7, "ts": 1716454218317365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260801, "dur": 12, "args": { "External id": 52247, "cbid": 211, "correlation": 52247 } }, { "ph": "s", "id": 52247, "pid": 76337, "tid": -914061504, "ts": 1716454218260801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218317387, "dur": 4, "args": { "External id": 52249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52249, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52249, "pid": 5, "tid": 7, "ts": 1716454218317387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260818, "dur": 6, "args": { "External id": 52249, "cbid": 211, "correlation": 52249 } }, { "ph": "s", "id": 52249, "pid": 76337, "tid": -914061504, "ts": 1716454218260818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218260828, "dur": 0, "args": { "External id": 52250, "cbid": 51, "correlation": 52250 } }, { "ph": "s", "id": 52250, "pid": 76337, "tid": -914061504, "ts": 1716454218260828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218317392, "dur": 173, "args": { "External id": 52251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52251, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 52251, "pid": 5, "tid": 7, "ts": 1716454218317392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260829, "dur": 5, "args": { "External id": 52251, "cbid": 211, "correlation": 52251 } }, { "ph": "s", "id": 52251, "pid": 76337, "tid": -914061504, "ts": 1716454218260829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218317566, "dur": 16, "args": { "External id": 52256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52256, "pid": 5, "tid": 7, "ts": 1716454218317566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260854, "dur": 8, "args": { "External id": 52256, "cbid": 211, "correlation": 52256 } }, { "ph": "s", "id": 52256, "pid": 76337, "tid": -914061504, "ts": 1716454218260854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218317583, "dur": 12, "args": { "External id": 52264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52264, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52264, "pid": 5, "tid": 7, "ts": 1716454218317583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260883, "dur": 8, "args": { "External id": 52264, "cbid": 211, "correlation": 52264 } }, { "ph": "s", "id": 52264, "pid": 76337, "tid": -914061504, "ts": 1716454218260883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218317596, "dur": 10, "args": { "External id": 52272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52272, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52272, "pid": 5, "tid": 7, "ts": 1716454218317596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218260911, "dur": 9, "args": { "External id": 52272, "cbid": 211, "correlation": 52272 } }, { "ph": "s", "id": 52272, "pid": 76337, "tid": -914061504, "ts": 1716454218260911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218317608, "dur": 18, "args": { "External id": 52292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52292, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52292, "pid": 5, "tid": 7, "ts": 1716454218317608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261001, "dur": 12, "args": { "External id": 52292, "cbid": 211, "correlation": 52292 } }, { "ph": "s", "id": 52292, "pid": 76337, "tid": -914061504, "ts": 1716454218261001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218317627, "dur": 4, "args": { "External id": 52304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52304, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 52304, "pid": 5, "tid": 7, "ts": 1716454218317627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261024, "dur": 7, "args": { "External id": 52304, "cbid": 211, "correlation": 52304 } }, { "ph": "s", "id": 52304, "pid": 76337, "tid": -914061504, "ts": 1716454218261024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218317633, "dur": 17, "args": { "External id": 52307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52307, "pid": 5, "tid": 7, "ts": 1716454218317633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261042, "dur": 9, "args": { "External id": 52307, "cbid": 211, "correlation": 52307 } }, { "ph": "s", "id": 52307, "pid": 76337, "tid": -914061504, "ts": 1716454218261042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218261104, "dur": 0, "args": { "External id": 52318, "cbid": 317, "correlation": 52318 } }, { "ph": "f", "id": 52318, "pid": 76337, "tid": -914061504, "ts": 1716454218261104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218261105, "dur": 0, "args": { "External id": 52319, "cbid": 203, "correlation": 52319 } }, { "ph": "f", "id": 52319, "pid": 76337, "tid": -914061504, "ts": 1716454218261105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218261106, "dur": 0, "args": { "External id": 52320, "cbid": 205, "correlation": 52320 } }, { "ph": "f", "id": 52320, "pid": 76337, "tid": -914061504, "ts": 1716454218261106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218317651, "dur": 11, "args": { "External id": 52324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52324, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52324, "pid": 5, "tid": 7, "ts": 1716454218317651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261119, "dur": 12, "args": { "External id": 52324, "cbid": 211, "correlation": 52324 } }, { "ph": "s", "id": 52324, "pid": 76337, "tid": -914061504, "ts": 1716454218261119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218317664, "dur": 3, "args": { "External id": 52326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52326, "pid": 5, "tid": 7, "ts": 1716454218317664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261136, "dur": 5, "args": { "External id": 52326, "cbid": 211, "correlation": 52326 } }, { "ph": "s", "id": 52326, "pid": 76337, "tid": -914061504, "ts": 1716454218261136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218261144, "dur": 0, "args": { "External id": 52327, "cbid": 51, "correlation": 52327 } }, { "ph": "s", "id": 52327, "pid": 76337, "tid": -914061504, "ts": 1716454218261144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218317668, "dur": 90, "args": { "External id": 52328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52328, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 52328, "pid": 5, "tid": 7, "ts": 1716454218317668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261144, "dur": 5, "args": { "External id": 52328, "cbid": 211, "correlation": 52328 } }, { "ph": "s", "id": 52328, "pid": 76337, "tid": -914061504, "ts": 1716454218261144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218317759, "dur": 15, "args": { "External id": 52333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52333, "pid": 5, "tid": 7, "ts": 1716454218317759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261171, "dur": 8, "args": { "External id": 52333, "cbid": 211, "correlation": 52333 } }, { "ph": "s", "id": 52333, "pid": 76337, "tid": -914061504, "ts": 1716454218261171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218317776, "dur": 82, "args": { "External id": 52342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52342, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52342, "pid": 5, "tid": 7, "ts": 1716454218317776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261252, "dur": 14, "args": { "External id": 52342, "cbid": 211, "correlation": 52342 } }, { "ph": "s", "id": 52342, "pid": 76337, "tid": -914061504, "ts": 1716454218261252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218317859, "dur": 30, "args": { "External id": 52364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52364, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52364, "pid": 5, "tid": 7, "ts": 1716454218317859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261315, "dur": 10, "args": { "External id": 52364, "cbid": 211, "correlation": 52364 } }, { "ph": "s", "id": 52364, "pid": 76337, "tid": -914061504, "ts": 1716454218261315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218261408, "dur": 1, "args": { "External id": 52375, "cbid": 251, "correlation": 52375 } }, { "ph": "f", "id": 52375, "pid": 76337, "tid": -914061504, "ts": 1716454218261408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218317890, "dur": 161, "args": { "External id": 52376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52376, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52376, "pid": 5, "tid": 7, "ts": 1716454218317890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261414, "dur": 16, "args": { "External id": 52376, "cbid": 211, "correlation": 52376 } }, { "ph": "s", "id": 52376, "pid": 76337, "tid": -914061504, "ts": 1716454218261414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218261488, "dur": 1, "args": { "External id": 52387, "cbid": 251, "correlation": 52387 } }, { "ph": "f", "id": 52387, "pid": 76337, "tid": -914061504, "ts": 1716454218261488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218318053, "dur": 156, "args": { "External id": 52388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52388, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52388, "pid": 5, "tid": 7, "ts": 1716454218318053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261493, "dur": 13, "args": { "External id": 52388, "cbid": 211, "correlation": 52388 } }, { "ph": "s", "id": 52388, "pid": 76337, "tid": -914061504, "ts": 1716454218261493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218261560, "dur": 1, "args": { "External id": 52399, "cbid": 251, "correlation": 52399 } }, { "ph": "f", "id": 52399, "pid": 76337, "tid": -914061504, "ts": 1716454218261560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218318210, "dur": 135, "args": { "External id": 52400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52400, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52400, "pid": 5, "tid": 7, "ts": 1716454218318210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261564, "dur": 11, "args": { "External id": 52400, "cbid": 211, "correlation": 52400 } }, { "ph": "s", "id": 52400, "pid": 76337, "tid": -914061504, "ts": 1716454218261564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218318346, "dur": 331, "args": { "External id": 52425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52425, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52425, "pid": 5, "tid": 7, "ts": 1716454218318346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261652, "dur": 13, "args": { "External id": 52425, "cbid": 211, "correlation": 52425 } }, { "ph": "s", "id": 52425, "pid": 76337, "tid": -914061504, "ts": 1716454218261652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218261755, "dur": 1, "args": { "External id": 52443, "cbid": 251, "correlation": 52443 } }, { "ph": "f", "id": 52443, "pid": 76337, "tid": -914061504, "ts": 1716454218261755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218318679, "dur": 166, "args": { "External id": 52445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52445, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52445, "pid": 5, "tid": 7, "ts": 1716454218318679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261761, "dur": 13, "args": { "External id": 52445, "cbid": 211, "correlation": 52445 } }, { "ph": "s", "id": 52445, "pid": 76337, "tid": -914061504, "ts": 1716454218261761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218318846, "dur": 19, "args": { "External id": 52453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52453, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52453, "pid": 5, "tid": 7, "ts": 1716454218318846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261831, "dur": 12, "args": { "External id": 52453, "cbid": 211, "correlation": 52453 } }, { "ph": "s", "id": 52453, "pid": 76337, "tid": -914061504, "ts": 1716454218261831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218318866, "dur": 27, "args": { "External id": 52461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52461, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52461, "pid": 5, "tid": 7, "ts": 1716454218318866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261874, "dur": 9, "args": { "External id": 52461, "cbid": 211, "correlation": 52461 } }, { "ph": "s", "id": 52461, "pid": 76337, "tid": -914061504, "ts": 1716454218261874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218318894, "dur": 18, "args": { "External id": 52472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52472, "pid": 5, "tid": 7, "ts": 1716454218318894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261950, "dur": 13, "args": { "External id": 52472, "cbid": 211, "correlation": 52472 } }, { "ph": "s", "id": 52472, "pid": 76337, "tid": -914061504, "ts": 1716454218261950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218318914, "dur": 16, "args": { "External id": 52494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52494, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52494, "pid": 5, "tid": 7, "ts": 1716454218318914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218261990, "dur": 9, "args": { "External id": 52494, "cbid": 211, "correlation": 52494 } }, { "ph": "s", "id": 52494, "pid": 76337, "tid": -914061504, "ts": 1716454218261990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262078, "dur": 2, "args": { "External id": 52505, "cbid": 251, "correlation": 52505 } }, { "ph": "f", "id": 52505, "pid": 76337, "tid": -914061504, "ts": 1716454218262078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218318931, "dur": 88, "args": { "External id": 52506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52506, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52506, "pid": 5, "tid": 7, "ts": 1716454218318931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262083, "dur": 13, "args": { "External id": 52506, "cbid": 211, "correlation": 52506 } }, { "ph": "s", "id": 52506, "pid": 76337, "tid": -914061504, "ts": 1716454218262083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262154, "dur": 1, "args": { "External id": 52517, "cbid": 251, "correlation": 52517 } }, { "ph": "f", "id": 52517, "pid": 76337, "tid": -914061504, "ts": 1716454218262154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262157, "dur": 0, "args": { "External id": 52518, "cbid": 251, "correlation": 52518 } }, { "ph": "f", "id": 52518, "pid": 76337, "tid": -914061504, "ts": 1716454218262157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218319021, "dur": 12, "args": { "External id": 52519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52519, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52519, "pid": 5, "tid": 7, "ts": 1716454218319021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262159, "dur": 12, "args": { "External id": 52519, "cbid": 211, "correlation": 52519 } }, { "ph": "s", "id": 52519, "pid": 76337, "tid": -914061504, "ts": 1716454218262159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218319034, "dur": 5, "args": { "External id": 52521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52521, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52521, "pid": 5, "tid": 7, "ts": 1716454218319034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262172, "dur": 6, "args": { "External id": 52521, "cbid": 211, "correlation": 52521 } }, { "ph": "s", "id": 52521, "pid": 76337, "tid": -914061504, "ts": 1716454218262172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262230, "dur": 4, "args": { "External id": 52532, "cbid": 251, "correlation": 52532 } }, { "ph": "f", "id": 52532, "pid": 76337, "tid": -914061504, "ts": 1716454218262230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262237, "dur": 0, "args": { "External id": 52533, "cbid": 251, "correlation": 52533 } }, { "ph": "f", "id": 52533, "pid": 76337, "tid": -914061504, "ts": 1716454218262237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218319041, "dur": 8, "args": { "External id": 52534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52534, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52534, "pid": 5, "tid": 7, "ts": 1716454218319041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262238, "dur": 12, "args": { "External id": 52534, "cbid": 211, "correlation": 52534 } }, { "ph": "s", "id": 52534, "pid": 76337, "tid": -914061504, "ts": 1716454218262238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218319050, "dur": 3, "args": { "External id": 52536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52536, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52536, "pid": 5, "tid": 7, "ts": 1716454218319050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262251, "dur": 5, "args": { "External id": 52536, "cbid": 211, "correlation": 52536 } }, { "ph": "s", "id": 52536, "pid": 76337, "tid": -914061504, "ts": 1716454218262251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218319055, "dur": 55, "args": { "External id": 52561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52561, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52561, "pid": 5, "tid": 7, "ts": 1716454218319055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262328, "dur": 13, "args": { "External id": 52561, "cbid": 211, "correlation": 52561 } }, { "ph": "s", "id": 52561, "pid": 76337, "tid": -914061504, "ts": 1716454218262328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262428, "dur": 2, "args": { "External id": 52579, "cbid": 251, "correlation": 52579 } }, { "ph": "f", "id": 52579, "pid": 76337, "tid": -914061504, "ts": 1716454218262428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218319111, "dur": 90, "args": { "External id": 52581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52581, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52581, "pid": 5, "tid": 7, "ts": 1716454218319111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262435, "dur": 15, "args": { "External id": 52581, "cbid": 211, "correlation": 52581 } }, { "ph": "s", "id": 52581, "pid": 76337, "tid": -914061504, "ts": 1716454218262435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218319202, "dur": 10, "args": { "External id": 52589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52589, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52589, "pid": 5, "tid": 7, "ts": 1716454218319202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262505, "dur": 12, "args": { "External id": 52589, "cbid": 211, "correlation": 52589 } }, { "ph": "s", "id": 52589, "pid": 76337, "tid": -914061504, "ts": 1716454218262505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218319213, "dur": 21, "args": { "External id": 52597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52597, "pid": 5, "tid": 7, "ts": 1716454218319213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262547, "dur": 9, "args": { "External id": 52597, "cbid": 211, "correlation": 52597 } }, { "ph": "s", "id": 52597, "pid": 76337, "tid": -914061504, "ts": 1716454218262547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218319235, "dur": 18, "args": { "External id": 52619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52619, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52619, "pid": 5, "tid": 7, "ts": 1716454218319235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262601, "dur": 11, "args": { "External id": 52619, "cbid": 211, "correlation": 52619 } }, { "ph": "s", "id": 52619, "pid": 76337, "tid": -914061504, "ts": 1716454218262601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262691, "dur": 1, "args": { "External id": 52635, "cbid": 251, "correlation": 52635 } }, { "ph": "f", "id": 52635, "pid": 76337, "tid": -914061504, "ts": 1716454218262691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262696, "dur": 0, "args": { "External id": 52637, "cbid": 251, "correlation": 52637 } }, { "ph": "f", "id": 52637, "pid": 76337, "tid": -914061504, "ts": 1716454218262696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218319255, "dur": 493, "args": { "External id": 52638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52638, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52638, "pid": 5, "tid": 7, "ts": 1716454218319255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262698, "dur": 14, "args": { "External id": 52638, "cbid": 211, "correlation": 52638 } }, { "ph": "s", "id": 52638, "pid": 76337, "tid": -914061504, "ts": 1716454218262698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218319749, "dur": 67, "args": { "External id": 52646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52646, "pid": 5, "tid": 7, "ts": 1716454218319749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262766, "dur": 12, "args": { "External id": 52646, "cbid": 211, "correlation": 52646 } }, { "ph": "s", "id": 52646, "pid": 76337, "tid": -914061504, "ts": 1716454218262766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218319817, "dur": 66, "args": { "External id": 52654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52654, "pid": 5, "tid": 7, "ts": 1716454218319817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262798, "dur": 9, "args": { "External id": 52654, "cbid": 211, "correlation": 52654 } }, { "ph": "s", "id": 52654, "pid": 76337, "tid": -914061504, "ts": 1716454218262798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218262879, "dur": 1, "args": { "External id": 52670, "cbid": 251, "correlation": 52670 } }, { "ph": "f", "id": 52670, "pid": 76337, "tid": -914061504, "ts": 1716454218262879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218319885, "dur": 1, "args": { "External id": 52672, "device": 5, "context": 1, "stream": 7, "correlation": 52672, "bytes": 240, "memory bandwidth (GB/s)": 0.1563517915309446 } }, { "ph": "f", "id": 52672, "pid": 5, "tid": 7, "ts": 1716454218319885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218262884, "dur": 10, "args": { "External id": 52672, "cbid": 51, "correlation": 52672 } }, { "ph": "s", "id": 52672, "pid": 76337, "tid": -914061504, "ts": 1716454218262884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218319888, "dur": 269, "args": { "External id": 52673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52673, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52673, "pid": 5, "tid": 7, "ts": 1716454218319888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262895, "dur": 11, "args": { "External id": 52673, "cbid": 211, "correlation": 52673 } }, { "ph": "s", "id": 52673, "pid": 76337, "tid": -914061504, "ts": 1716454218262895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218320159, "dur": 15, "args": { "External id": 52681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52681, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52681, "pid": 5, "tid": 7, "ts": 1716454218320159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218262937, "dur": 10, "args": { "External id": 52681, "cbid": 211, "correlation": 52681 } }, { "ph": "s", "id": 52681, "pid": 76337, "tid": -914061504, "ts": 1716454218262937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218320175, "dur": 37, "args": { "External id": 52692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52692, "pid": 5, "tid": 7, "ts": 1716454218320175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263017, "dur": 14, "args": { "External id": 52692, "cbid": 211, "correlation": 52692 } }, { "ph": "s", "id": 52692, "pid": 76337, "tid": -914061504, "ts": 1716454218263017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218263083, "dur": 0, "args": { "External id": 52704, "cbid": 317, "correlation": 52704 } }, { "ph": "f", "id": 52704, "pid": 76337, "tid": -914061504, "ts": 1716454218263083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218263084, "dur": 0, "args": { "External id": 52705, "cbid": 203, "correlation": 52705 } }, { "ph": "f", "id": 52705, "pid": 76337, "tid": -914061504, "ts": 1716454218263084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218263085, "dur": 0, "args": { "External id": 52706, "cbid": 205, "correlation": 52706 } }, { "ph": "f", "id": 52706, "pid": 76337, "tid": -914061504, "ts": 1716454218263085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218320214, "dur": 14, "args": { "External id": 52710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52710, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52710, "pid": 5, "tid": 7, "ts": 1716454218320214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263099, "dur": 12, "args": { "External id": 52710, "cbid": 211, "correlation": 52710 } }, { "ph": "s", "id": 52710, "pid": 76337, "tid": -914061504, "ts": 1716454218263099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218320229, "dur": 4, "args": { "External id": 52712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52712, "pid": 5, "tid": 7, "ts": 1716454218320229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263116, "dur": 6, "args": { "External id": 52712, "cbid": 211, "correlation": 52712 } }, { "ph": "s", "id": 52712, "pid": 76337, "tid": -914061504, "ts": 1716454218263116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218263125, "dur": 0, "args": { "External id": 52713, "cbid": 51, "correlation": 52713 } }, { "ph": "s", "id": 52713, "pid": 76337, "tid": -914061504, "ts": 1716454218263125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218320234, "dur": 96, "args": { "External id": 52714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52714, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 52714, "pid": 5, "tid": 7, "ts": 1716454218320234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263125, "dur": 6, "args": { "External id": 52714, "cbid": 211, "correlation": 52714 } }, { "ph": "s", "id": 52714, "pid": 76337, "tid": -914061504, "ts": 1716454218263125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218320332, "dur": 16, "args": { "External id": 52719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52719, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52719, "pid": 5, "tid": 7, "ts": 1716454218320332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263153, "dur": 9, "args": { "External id": 52719, "cbid": 211, "correlation": 52719 } }, { "ph": "s", "id": 52719, "pid": 76337, "tid": -914061504, "ts": 1716454218263153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218320350, "dur": 11, "args": { "External id": 52727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52727, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52727, "pid": 5, "tid": 7, "ts": 1716454218320350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263185, "dur": 8, "args": { "External id": 52727, "cbid": 211, "correlation": 52727 } }, { "ph": "s", "id": 52727, "pid": 76337, "tid": -914061504, "ts": 1716454218263185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218320362, "dur": 29, "args": { "External id": 52736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52736, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52736, "pid": 5, "tid": 7, "ts": 1716454218320362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263224, "dur": 11, "args": { "External id": 52736, "cbid": 211, "correlation": 52736 } }, { "ph": "s", "id": 52736, "pid": 76337, "tid": -914061504, "ts": 1716454218263224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218320393, "dur": 30, "args": { "External id": 52756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52756, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52756, "pid": 5, "tid": 7, "ts": 1716454218320393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263299, "dur": 12, "args": { "External id": 52756, "cbid": 211, "correlation": 52756 } }, { "ph": "s", "id": 52756, "pid": 76337, "tid": -914061504, "ts": 1716454218263299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218320425, "dur": 5, "args": { "External id": 52768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52768, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52768, "pid": 5, "tid": 7, "ts": 1716454218320425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263321, "dur": 6, "args": { "External id": 52768, "cbid": 211, "correlation": 52768 } }, { "ph": "s", "id": 52768, "pid": 76337, "tid": -914061504, "ts": 1716454218263321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218320431, "dur": 31, "args": { "External id": 52771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52771, "pid": 5, "tid": 7, "ts": 1716454218320431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263339, "dur": 7, "args": { "External id": 52771, "cbid": 211, "correlation": 52771 } }, { "ph": "s", "id": 52771, "pid": 76337, "tid": -914061504, "ts": 1716454218263339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218320463, "dur": 21, "args": { "External id": 52780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52780, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52780, "pid": 5, "tid": 7, "ts": 1716454218320463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263379, "dur": 9, "args": { "External id": 52780, "cbid": 211, "correlation": 52780 } }, { "ph": "s", "id": 52780, "pid": 76337, "tid": -914061504, "ts": 1716454218263379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218263432, "dur": 0, "args": { "External id": 52790, "cbid": 317, "correlation": 52790 } }, { "ph": "f", "id": 52790, "pid": 76337, "tid": -914061504, "ts": 1716454218263432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218263433, "dur": 0, "args": { "External id": 52791, "cbid": 203, "correlation": 52791 } }, { "ph": "f", "id": 52791, "pid": 76337, "tid": -914061504, "ts": 1716454218263433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218263433, "dur": 0, "args": { "External id": 52792, "cbid": 205, "correlation": 52792 } }, { "ph": "f", "id": 52792, "pid": 76337, "tid": -914061504, "ts": 1716454218263433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218320485, "dur": 23, "args": { "External id": 52796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52796, "pid": 5, "tid": 7, "ts": 1716454218320485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263447, "dur": 11, "args": { "External id": 52796, "cbid": 211, "correlation": 52796 } }, { "ph": "s", "id": 52796, "pid": 76337, "tid": -914061504, "ts": 1716454218263447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218320509, "dur": 317, "args": { "External id": 52798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52798, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52798, "pid": 5, "tid": 7, "ts": 1716454218320509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263461, "dur": 5, "args": { "External id": 52798, "cbid": 211, "correlation": 52798 } }, { "ph": "s", "id": 52798, "pid": 76337, "tid": -914061504, "ts": 1716454218263461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218320828, "dur": 1, "args": { "External id": 52800, "device": 5, "context": 1, "stream": 7, "correlation": 52800, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 52800, "pid": 5, "tid": 7, "ts": 1716454218320828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218263473, "dur": 6, "args": { "External id": 52800, "cbid": 51, "correlation": 52800 } }, { "ph": "s", "id": 52800, "pid": 76337, "tid": -914061504, "ts": 1716454218263473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218320832, "dur": 1250, "args": { "External id": 52801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52801, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52801, "pid": 5, "tid": 7, "ts": 1716454218320832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263480, "dur": 6, "args": { "External id": 52801, "cbid": 211, "correlation": 52801 } }, { "ph": "s", "id": 52801, "pid": 76337, "tid": -914061504, "ts": 1716454218263480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218322083, "dur": 13, "args": { "External id": 52803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52803, "pid": 5, "tid": 7, "ts": 1716454218322083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263490, "dur": 5, "args": { "External id": 52803, "cbid": 211, "correlation": 52803 } }, { "ph": "s", "id": 52803, "pid": 76337, "tid": -914061504, "ts": 1716454218263490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218322098, "dur": 15, "args": { "External id": 52809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52809, "pid": 5, "tid": 7, "ts": 1716454218322098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263518, "dur": 9, "args": { "External id": 52809, "cbid": 211, "correlation": 52809 } }, { "ph": "s", "id": 52809, "pid": 76337, "tid": -914061504, "ts": 1716454218263518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218322114, "dur": 3, "args": { "External id": 52817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52817, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 52817, "pid": 5, "tid": 7, "ts": 1716454218322114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263565, "dur": 10, "args": { "External id": 52817, "cbid": 211, "correlation": 52817 } }, { "ph": "s", "id": 52817, "pid": 76337, "tid": -914061504, "ts": 1716454218263565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218263631, "dur": 1, "args": { "External id": 52833, "cbid": 251, "correlation": 52833 } }, { "ph": "f", "id": 52833, "pid": 76337, "tid": -914061504, "ts": 1716454218263631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218263637, "dur": 0, "args": { "External id": 52835, "cbid": 251, "correlation": 52835 } }, { "ph": "f", "id": 52835, "pid": 76337, "tid": -914061504, "ts": 1716454218263637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218322118, "dur": 13, "args": { "External id": 52836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52836, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52836, "pid": 5, "tid": 7, "ts": 1716454218322118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263639, "dur": 12, "args": { "External id": 52836, "cbid": 211, "correlation": 52836 } }, { "ph": "s", "id": 52836, "pid": 76337, "tid": -914061504, "ts": 1716454218263639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218322133, "dur": 5, "args": { "External id": 52838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52838, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52838, "pid": 5, "tid": 7, "ts": 1716454218322133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263652, "dur": 5, "args": { "External id": 52838, "cbid": 211, "correlation": 52838 } }, { "ph": "s", "id": 52838, "pid": 76337, "tid": -914061504, "ts": 1716454218263652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218322139, "dur": 16, "args": { "External id": 52848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52848, "pid": 5, "tid": 7, "ts": 1716454218322139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263710, "dur": 12, "args": { "External id": 52848, "cbid": 211, "correlation": 52848 } }, { "ph": "s", "id": 52848, "pid": 76337, "tid": -914061504, "ts": 1716454218263710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218322156, "dur": 19, "args": { "External id": 52868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52868, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52868, "pid": 5, "tid": 7, "ts": 1716454218322156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263777, "dur": 11, "args": { "External id": 52868, "cbid": 211, "correlation": 52868 } }, { "ph": "s", "id": 52868, "pid": 76337, "tid": -914061504, "ts": 1716454218263777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218322177, "dur": 4, "args": { "External id": 52880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52880, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 52880, "pid": 5, "tid": 7, "ts": 1716454218322177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263797, "dur": 6, "args": { "External id": 52880, "cbid": 211, "correlation": 52880 } }, { "ph": "s", "id": 52880, "pid": 76337, "tid": -914061504, "ts": 1716454218263797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218322182, "dur": 16, "args": { "External id": 52883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52883, "pid": 5, "tid": 7, "ts": 1716454218322182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263815, "dur": 7, "args": { "External id": 52883, "cbid": 211, "correlation": 52883 } }, { "ph": "s", "id": 52883, "pid": 76337, "tid": -914061504, "ts": 1716454218263815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218322200, "dur": 11, "args": { "External id": 52892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52892, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52892, "pid": 5, "tid": 7, "ts": 1716454218322200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263859, "dur": 10, "args": { "External id": 52892, "cbid": 211, "correlation": 52892 } }, { "ph": "s", "id": 52892, "pid": 76337, "tid": -914061504, "ts": 1716454218263859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218263921, "dur": 0, "args": { "External id": 52902, "cbid": 317, "correlation": 52902 } }, { "ph": "f", "id": 52902, "pid": 76337, "tid": -914061504, "ts": 1716454218263921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218263922, "dur": 0, "args": { "External id": 52903, "cbid": 203, "correlation": 52903 } }, { "ph": "f", "id": 52903, "pid": 76337, "tid": -914061504, "ts": 1716454218263922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218263923, "dur": 0, "args": { "External id": 52904, "cbid": 205, "correlation": 52904 } }, { "ph": "f", "id": 52904, "pid": 76337, "tid": -914061504, "ts": 1716454218263923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218322212, "dur": 12, "args": { "External id": 52908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52908, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52908, "pid": 5, "tid": 7, "ts": 1716454218322212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263937, "dur": 12, "args": { "External id": 52908, "cbid": 211, "correlation": 52908 } }, { "ph": "s", "id": 52908, "pid": 76337, "tid": -914061504, "ts": 1716454218263937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218322225, "dur": 162, "args": { "External id": 52910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52910, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52910, "pid": 5, "tid": 7, "ts": 1716454218322225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263952, "dur": 5, "args": { "External id": 52910, "cbid": 211, "correlation": 52910 } }, { "ph": "s", "id": 52910, "pid": 76337, "tid": -914061504, "ts": 1716454218263952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218322389, "dur": 1, "args": { "External id": 52912, "device": 5, "context": 1, "stream": 7, "correlation": 52912, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 52912, "pid": 5, "tid": 7, "ts": 1716454218322389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218263962, "dur": 7, "args": { "External id": 52912, "cbid": 51, "correlation": 52912 } }, { "ph": "s", "id": 52912, "pid": 76337, "tid": -914061504, "ts": 1716454218263962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218322393, "dur": 644, "args": { "External id": 52913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52913, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 52913, "pid": 5, "tid": 7, "ts": 1716454218322393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263970, "dur": 13, "args": { "External id": 52913, "cbid": 211, "correlation": 52913 } }, { "ph": "s", "id": 52913, "pid": 76337, "tid": -914061504, "ts": 1716454218263970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218323039, "dur": 12, "args": { "External id": 52915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52915, "pid": 5, "tid": 7, "ts": 1716454218323039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218263988, "dur": 6, "args": { "External id": 52915, "cbid": 211, "correlation": 52915 } }, { "ph": "s", "id": 52915, "pid": 76337, "tid": -914061504, "ts": 1716454218263988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218323052, "dur": 15, "args": { "External id": 52921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52921, "pid": 5, "tid": 7, "ts": 1716454218323052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264017, "dur": 8, "args": { "External id": 52921, "cbid": 211, "correlation": 52921 } }, { "ph": "s", "id": 52921, "pid": 76337, "tid": -914061504, "ts": 1716454218264017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218264076, "dur": 0, "args": { "External id": 52931, "cbid": 317, "correlation": 52931 } }, { "ph": "f", "id": 52931, "pid": 76337, "tid": -914061504, "ts": 1716454218264076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218264076, "dur": 0, "args": { "External id": 52932, "cbid": 203, "correlation": 52932 } }, { "ph": "f", "id": 52932, "pid": 76337, "tid": -914061504, "ts": 1716454218264076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218264077, "dur": 0, "args": { "External id": 52933, "cbid": 205, "correlation": 52933 } }, { "ph": "f", "id": 52933, "pid": 76337, "tid": -914061504, "ts": 1716454218264077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218323068, "dur": 21, "args": { "External id": 52937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52937, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52937, "pid": 5, "tid": 7, "ts": 1716454218323068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264089, "dur": 12, "args": { "External id": 52937, "cbid": 211, "correlation": 52937 } }, { "ph": "s", "id": 52937, "pid": 76337, "tid": -914061504, "ts": 1716454218264089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218323090, "dur": 4, "args": { "External id": 52939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52939, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 52939, "pid": 5, "tid": 7, "ts": 1716454218323090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264105, "dur": 6, "args": { "External id": 52939, "cbid": 211, "correlation": 52939 } }, { "ph": "s", "id": 52939, "pid": 76337, "tid": -914061504, "ts": 1716454218264105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218264114, "dur": 0, "args": { "External id": 52940, "cbid": 51, "correlation": 52940 } }, { "ph": "s", "id": 52940, "pid": 76337, "tid": -914061504, "ts": 1716454218264114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218323095, "dur": 169, "args": { "External id": 52941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52941, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 52941, "pid": 5, "tid": 7, "ts": 1716454218323095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264115, "dur": 5, "args": { "External id": 52941, "cbid": 211, "correlation": 52941 } }, { "ph": "s", "id": 52941, "pid": 76337, "tid": -914061504, "ts": 1716454218264115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218323266, "dur": 15, "args": { "External id": 52946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52946, "pid": 5, "tid": 7, "ts": 1716454218323266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264140, "dur": 11, "args": { "External id": 52946, "cbid": 211, "correlation": 52946 } }, { "ph": "s", "id": 52946, "pid": 76337, "tid": -914061504, "ts": 1716454218264140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218323282, "dur": 12, "args": { "External id": 52954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52954, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52954, "pid": 5, "tid": 7, "ts": 1716454218323282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264172, "dur": 9, "args": { "External id": 52954, "cbid": 211, "correlation": 52954 } }, { "ph": "s", "id": 52954, "pid": 76337, "tid": -914061504, "ts": 1716454218264172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218323296, "dur": 11, "args": { "External id": 52962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52962, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52962, "pid": 5, "tid": 7, "ts": 1716454218323296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264201, "dur": 8, "args": { "External id": 52962, "cbid": 211, "correlation": 52962 } }, { "ph": "s", "id": 52962, "pid": 76337, "tid": -914061504, "ts": 1716454218264201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218323308, "dur": 18, "args": { "External id": 52982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52982, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 52982, "pid": 5, "tid": 7, "ts": 1716454218323308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264283, "dur": 13, "args": { "External id": 52982, "cbid": 211, "correlation": 52982 } }, { "ph": "s", "id": 52982, "pid": 76337, "tid": -914061504, "ts": 1716454218264283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218323327, "dur": 4, "args": { "External id": 52994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52994, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 52994, "pid": 5, "tid": 7, "ts": 1716454218323327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264305, "dur": 6, "args": { "External id": 52994, "cbid": 211, "correlation": 52994 } }, { "ph": "s", "id": 52994, "pid": 76337, "tid": -914061504, "ts": 1716454218264305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218323333, "dur": 16, "args": { "External id": 52997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 52997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 52997, "pid": 5, "tid": 7, "ts": 1716454218323333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264322, "dur": 6, "args": { "External id": 52997, "cbid": 211, "correlation": 52997 } }, { "ph": "s", "id": 52997, "pid": 76337, "tid": -914061504, "ts": 1716454218264322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218264380, "dur": 0, "args": { "External id": 53008, "cbid": 317, "correlation": 53008 } }, { "ph": "f", "id": 53008, "pid": 76337, "tid": -914061504, "ts": 1716454218264380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218264381, "dur": 0, "args": { "External id": 53009, "cbid": 203, "correlation": 53009 } }, { "ph": "f", "id": 53009, "pid": 76337, "tid": -914061504, "ts": 1716454218264381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218264381, "dur": 0, "args": { "External id": 53010, "cbid": 205, "correlation": 53010 } }, { "ph": "f", "id": 53010, "pid": 76337, "tid": -914061504, "ts": 1716454218264381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218323351, "dur": 12, "args": { "External id": 53014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53014, "pid": 5, "tid": 7, "ts": 1716454218323351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264394, "dur": 12, "args": { "External id": 53014, "cbid": 211, "correlation": 53014 } }, { "ph": "s", "id": 53014, "pid": 76337, "tid": -914061504, "ts": 1716454218264394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218323364, "dur": 4, "args": { "External id": 53016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53016, "pid": 5, "tid": 7, "ts": 1716454218323364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264411, "dur": 8, "args": { "External id": 53016, "cbid": 211, "correlation": 53016 } }, { "ph": "s", "id": 53016, "pid": 76337, "tid": -914061504, "ts": 1716454218264411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218264422, "dur": 0, "args": { "External id": 53017, "cbid": 51, "correlation": 53017 } }, { "ph": "s", "id": 53017, "pid": 76337, "tid": -914061504, "ts": 1716454218264422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218323369, "dur": 89, "args": { "External id": 53018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53018, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 53018, "pid": 5, "tid": 7, "ts": 1716454218323369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264423, "dur": 5, "args": { "External id": 53018, "cbid": 211, "correlation": 53018 } }, { "ph": "s", "id": 53018, "pid": 76337, "tid": -914061504, "ts": 1716454218264423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218323459, "dur": 15, "args": { "External id": 53023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53023, "pid": 5, "tid": 7, "ts": 1716454218323459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264450, "dur": 9, "args": { "External id": 53023, "cbid": 211, "correlation": 53023 } }, { "ph": "s", "id": 53023, "pid": 76337, "tid": -914061504, "ts": 1716454218264450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218323476, "dur": 83, "args": { "External id": 53032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53032, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53032, "pid": 5, "tid": 7, "ts": 1716454218323476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264531, "dur": 13, "args": { "External id": 53032, "cbid": 211, "correlation": 53032 } }, { "ph": "s", "id": 53032, "pid": 76337, "tid": -914061504, "ts": 1716454218264531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218323560, "dur": 30, "args": { "External id": 53054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53054, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53054, "pid": 5, "tid": 7, "ts": 1716454218323560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264587, "dur": 10, "args": { "External id": 53054, "cbid": 211, "correlation": 53054 } }, { "ph": "s", "id": 53054, "pid": 76337, "tid": -914061504, "ts": 1716454218264587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218264675, "dur": 1, "args": { "External id": 53065, "cbid": 251, "correlation": 53065 } }, { "ph": "f", "id": 53065, "pid": 76337, "tid": -914061504, "ts": 1716454218264675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218323592, "dur": 162, "args": { "External id": 53066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53066, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53066, "pid": 5, "tid": 7, "ts": 1716454218323592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264680, "dur": 13, "args": { "External id": 53066, "cbid": 211, "correlation": 53066 } }, { "ph": "s", "id": 53066, "pid": 76337, "tid": -914061504, "ts": 1716454218264680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218264751, "dur": 1, "args": { "External id": 53077, "cbid": 251, "correlation": 53077 } }, { "ph": "f", "id": 53077, "pid": 76337, "tid": -914061504, "ts": 1716454218264751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218323755, "dur": 159, "args": { "External id": 53078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53078, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53078, "pid": 5, "tid": 7, "ts": 1716454218323755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264755, "dur": 11, "args": { "External id": 53078, "cbid": 211, "correlation": 53078 } }, { "ph": "s", "id": 53078, "pid": 76337, "tid": -914061504, "ts": 1716454218264755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218264821, "dur": 3, "args": { "External id": 53089, "cbid": 251, "correlation": 53089 } }, { "ph": "f", "id": 53089, "pid": 76337, "tid": -914061504, "ts": 1716454218264821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218323915, "dur": 158, "args": { "External id": 53090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53090, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53090, "pid": 5, "tid": 7, "ts": 1716454218323915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264827, "dur": 11, "args": { "External id": 53090, "cbid": 211, "correlation": 53090 } }, { "ph": "s", "id": 53090, "pid": 76337, "tid": -914061504, "ts": 1716454218264827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218324075, "dur": 333, "args": { "External id": 53115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53115, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53115, "pid": 5, "tid": 7, "ts": 1716454218324075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218264911, "dur": 14, "args": { "External id": 53115, "cbid": 211, "correlation": 53115 } }, { "ph": "s", "id": 53115, "pid": 76337, "tid": -914061504, "ts": 1716454218264911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265019, "dur": 1, "args": { "External id": 53133, "cbid": 251, "correlation": 53133 } }, { "ph": "f", "id": 53133, "pid": 76337, "tid": -914061504, "ts": 1716454218265019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218324409, "dur": 166, "args": { "External id": 53135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53135, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53135, "pid": 5, "tid": 7, "ts": 1716454218324409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265025, "dur": 14, "args": { "External id": 53135, "cbid": 211, "correlation": 53135 } }, { "ph": "s", "id": 53135, "pid": 76337, "tid": -914061504, "ts": 1716454218265025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218324576, "dur": 20, "args": { "External id": 53143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53143, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53143, "pid": 5, "tid": 7, "ts": 1716454218324576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265097, "dur": 12, "args": { "External id": 53143, "cbid": 211, "correlation": 53143 } }, { "ph": "s", "id": 53143, "pid": 76337, "tid": -914061504, "ts": 1716454218265097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218324597, "dur": 27, "args": { "External id": 53151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53151, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53151, "pid": 5, "tid": 7, "ts": 1716454218324597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265136, "dur": 8, "args": { "External id": 53151, "cbid": 211, "correlation": 53151 } }, { "ph": "s", "id": 53151, "pid": 76337, "tid": -914061504, "ts": 1716454218265136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218324626, "dur": 18, "args": { "External id": 53162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53162, "pid": 5, "tid": 7, "ts": 1716454218324626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265206, "dur": 13, "args": { "External id": 53162, "cbid": 211, "correlation": 53162 } }, { "ph": "s", "id": 53162, "pid": 76337, "tid": -914061504, "ts": 1716454218265206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218324645, "dur": 16, "args": { "External id": 53184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53184, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53184, "pid": 5, "tid": 7, "ts": 1716454218324645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265240, "dur": 7, "args": { "External id": 53184, "cbid": 211, "correlation": 53184 } }, { "ph": "s", "id": 53184, "pid": 76337, "tid": -914061504, "ts": 1716454218265240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265326, "dur": 1, "args": { "External id": 53195, "cbid": 251, "correlation": 53195 } }, { "ph": "f", "id": 53195, "pid": 76337, "tid": -914061504, "ts": 1716454218265326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218324663, "dur": 88, "args": { "External id": 53196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53196, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53196, "pid": 5, "tid": 7, "ts": 1716454218324663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265332, "dur": 13, "args": { "External id": 53196, "cbid": 211, "correlation": 53196 } }, { "ph": "s", "id": 53196, "pid": 76337, "tid": -914061504, "ts": 1716454218265332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265401, "dur": 1, "args": { "External id": 53207, "cbid": 251, "correlation": 53207 } }, { "ph": "f", "id": 53207, "pid": 76337, "tid": -914061504, "ts": 1716454218265401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265405, "dur": 0, "args": { "External id": 53208, "cbid": 251, "correlation": 53208 } }, { "ph": "f", "id": 53208, "pid": 76337, "tid": -914061504, "ts": 1716454218265405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218324752, "dur": 13, "args": { "External id": 53209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53209, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53209, "pid": 5, "tid": 7, "ts": 1716454218324752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265406, "dur": 12, "args": { "External id": 53209, "cbid": 211, "correlation": 53209 } }, { "ph": "s", "id": 53209, "pid": 76337, "tid": -914061504, "ts": 1716454218265406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218324766, "dur": 5, "args": { "External id": 53211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53211, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53211, "pid": 5, "tid": 7, "ts": 1716454218324766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265420, "dur": 6, "args": { "External id": 53211, "cbid": 211, "correlation": 53211 } }, { "ph": "s", "id": 53211, "pid": 76337, "tid": -914061504, "ts": 1716454218265420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265477, "dur": 1, "args": { "External id": 53222, "cbid": 251, "correlation": 53222 } }, { "ph": "f", "id": 53222, "pid": 76337, "tid": -914061504, "ts": 1716454218265477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265480, "dur": 0, "args": { "External id": 53223, "cbid": 251, "correlation": 53223 } }, { "ph": "f", "id": 53223, "pid": 76337, "tid": -914061504, "ts": 1716454218265480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218324772, "dur": 8, "args": { "External id": 53224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53224, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53224, "pid": 5, "tid": 7, "ts": 1716454218324772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265482, "dur": 11, "args": { "External id": 53224, "cbid": 211, "correlation": 53224 } }, { "ph": "s", "id": 53224, "pid": 76337, "tid": -914061504, "ts": 1716454218265482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218324782, "dur": 3, "args": { "External id": 53226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53226, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53226, "pid": 5, "tid": 7, "ts": 1716454218324782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265494, "dur": 6, "args": { "External id": 53226, "cbid": 211, "correlation": 53226 } }, { "ph": "s", "id": 53226, "pid": 76337, "tid": -914061504, "ts": 1716454218265494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218324787, "dur": 55, "args": { "External id": 53251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53251, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53251, "pid": 5, "tid": 7, "ts": 1716454218324787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265573, "dur": 13, "args": { "External id": 53251, "cbid": 211, "correlation": 53251 } }, { "ph": "s", "id": 53251, "pid": 76337, "tid": -914061504, "ts": 1716454218265573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265672, "dur": 1, "args": { "External id": 53269, "cbid": 251, "correlation": 53269 } }, { "ph": "f", "id": 53269, "pid": 76337, "tid": -914061504, "ts": 1716454218265672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218324843, "dur": 91, "args": { "External id": 53271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53271, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53271, "pid": 5, "tid": 7, "ts": 1716454218324843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265678, "dur": 14, "args": { "External id": 53271, "cbid": 211, "correlation": 53271 } }, { "ph": "s", "id": 53271, "pid": 76337, "tid": -914061504, "ts": 1716454218265678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218324935, "dur": 9, "args": { "External id": 53279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53279, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53279, "pid": 5, "tid": 7, "ts": 1716454218324935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265747, "dur": 12, "args": { "External id": 53279, "cbid": 211, "correlation": 53279 } }, { "ph": "s", "id": 53279, "pid": 76337, "tid": -914061504, "ts": 1716454218265747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218324945, "dur": 20, "args": { "External id": 53287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53287, "pid": 5, "tid": 7, "ts": 1716454218324945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265789, "dur": 10, "args": { "External id": 53287, "cbid": 211, "correlation": 53287 } }, { "ph": "s", "id": 53287, "pid": 76337, "tid": -914061504, "ts": 1716454218265789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218324967, "dur": 17, "args": { "External id": 53309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53309, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53309, "pid": 5, "tid": 7, "ts": 1716454218324967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265839, "dur": 10, "args": { "External id": 53309, "cbid": 211, "correlation": 53309 } }, { "ph": "s", "id": 53309, "pid": 76337, "tid": -914061504, "ts": 1716454218265839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265930, "dur": 1, "args": { "External id": 53325, "cbid": 251, "correlation": 53325 } }, { "ph": "f", "id": 53325, "pid": 76337, "tid": -914061504, "ts": 1716454218265930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218265935, "dur": 0, "args": { "External id": 53327, "cbid": 251, "correlation": 53327 } }, { "ph": "f", "id": 53327, "pid": 76337, "tid": -914061504, "ts": 1716454218265935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218324986, "dur": 496, "args": { "External id": 53328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53328, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53328, "pid": 5, "tid": 7, "ts": 1716454218324986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218265937, "dur": 13, "args": { "External id": 53328, "cbid": 211, "correlation": 53328 } }, { "ph": "s", "id": 53328, "pid": 76337, "tid": -914061504, "ts": 1716454218265937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218325483, "dur": 66, "args": { "External id": 53336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53336, "pid": 5, "tid": 7, "ts": 1716454218325483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266011, "dur": 13, "args": { "External id": 53336, "cbid": 211, "correlation": 53336 } }, { "ph": "s", "id": 53336, "pid": 76337, "tid": -914061504, "ts": 1716454218266011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218325550, "dur": 66, "args": { "External id": 53344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53344, "pid": 5, "tid": 7, "ts": 1716454218325550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266042, "dur": 8, "args": { "External id": 53344, "cbid": 211, "correlation": 53344 } }, { "ph": "s", "id": 53344, "pid": 76337, "tid": -914061504, "ts": 1716454218266042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218266123, "dur": 1, "args": { "External id": 53360, "cbid": 251, "correlation": 53360 } }, { "ph": "f", "id": 53360, "pid": 76337, "tid": -914061504, "ts": 1716454218266123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218325618, "dur": 1, "args": { "External id": 53362, "device": 5, "context": 1, "stream": 7, "correlation": 53362, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 53362, "pid": 5, "tid": 7, "ts": 1716454218325618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218266128, "dur": 9, "args": { "External id": 53362, "cbid": 51, "correlation": 53362 } }, { "ph": "s", "id": 53362, "pid": 76337, "tid": -914061504, "ts": 1716454218266128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218325622, "dur": 269, "args": { "External id": 53363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53363, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53363, "pid": 5, "tid": 7, "ts": 1716454218325622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266139, "dur": 12, "args": { "External id": 53363, "cbid": 211, "correlation": 53363 } }, { "ph": "s", "id": 53363, "pid": 76337, "tid": -914061504, "ts": 1716454218266139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218325892, "dur": 13, "args": { "External id": 53371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53371, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53371, "pid": 5, "tid": 7, "ts": 1716454218325892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266181, "dur": 10, "args": { "External id": 53371, "cbid": 211, "correlation": 53371 } }, { "ph": "s", "id": 53371, "pid": 76337, "tid": -914061504, "ts": 1716454218266181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218325907, "dur": 37, "args": { "External id": 53382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53382, "pid": 5, "tid": 7, "ts": 1716454218325907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266249, "dur": 12, "args": { "External id": 53382, "cbid": 211, "correlation": 53382 } }, { "ph": "s", "id": 53382, "pid": 76337, "tid": -914061504, "ts": 1716454218266249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218266313, "dur": 0, "args": { "External id": 53394, "cbid": 317, "correlation": 53394 } }, { "ph": "f", "id": 53394, "pid": 76337, "tid": -914061504, "ts": 1716454218266313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218266314, "dur": 0, "args": { "External id": 53395, "cbid": 203, "correlation": 53395 } }, { "ph": "f", "id": 53395, "pid": 76337, "tid": -914061504, "ts": 1716454218266314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218266315, "dur": 3, "args": { "External id": 53396, "cbid": 205, "correlation": 53396 } }, { "ph": "f", "id": 53396, "pid": 76337, "tid": -914061504, "ts": 1716454218266315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218325945, "dur": 15, "args": { "External id": 53400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53400, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53400, "pid": 5, "tid": 7, "ts": 1716454218325945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266334, "dur": 12, "args": { "External id": 53400, "cbid": 211, "correlation": 53400 } }, { "ph": "s", "id": 53400, "pid": 76337, "tid": -914061504, "ts": 1716454218266334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218325962, "dur": 4, "args": { "External id": 53402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53402, "pid": 5, "tid": 7, "ts": 1716454218325962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266350, "dur": 6, "args": { "External id": 53402, "cbid": 211, "correlation": 53402 } }, { "ph": "s", "id": 53402, "pid": 76337, "tid": -914061504, "ts": 1716454218266350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218266359, "dur": 0, "args": { "External id": 53403, "cbid": 51, "correlation": 53403 } }, { "ph": "s", "id": 53403, "pid": 76337, "tid": -914061504, "ts": 1716454218266359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218325967, "dur": 96, "args": { "External id": 53404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53404, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 53404, "pid": 5, "tid": 7, "ts": 1716454218325967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266359, "dur": 5, "args": { "External id": 53404, "cbid": 211, "correlation": 53404 } }, { "ph": "s", "id": 53404, "pid": 76337, "tid": -914061504, "ts": 1716454218266359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218326065, "dur": 16, "args": { "External id": 53409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53409, "pid": 5, "tid": 7, "ts": 1716454218326065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266387, "dur": 8, "args": { "External id": 53409, "cbid": 211, "correlation": 53409 } }, { "ph": "s", "id": 53409, "pid": 76337, "tid": -914061504, "ts": 1716454218266387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218326082, "dur": 12, "args": { "External id": 53417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53417, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53417, "pid": 5, "tid": 7, "ts": 1716454218326082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266419, "dur": 9, "args": { "External id": 53417, "cbid": 211, "correlation": 53417 } }, { "ph": "s", "id": 53417, "pid": 76337, "tid": -914061504, "ts": 1716454218266419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218326095, "dur": 25, "args": { "External id": 53426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53426, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53426, "pid": 5, "tid": 7, "ts": 1716454218326095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266457, "dur": 10, "args": { "External id": 53426, "cbid": 211, "correlation": 53426 } }, { "ph": "s", "id": 53426, "pid": 76337, "tid": -914061504, "ts": 1716454218266457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218326122, "dur": 24, "args": { "External id": 53446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53446, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 53446, "pid": 5, "tid": 7, "ts": 1716454218326122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266529, "dur": 11, "args": { "External id": 53446, "cbid": 211, "correlation": 53446 } }, { "ph": "s", "id": 53446, "pid": 76337, "tid": -914061504, "ts": 1716454218266529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218326147, "dur": 5, "args": { "External id": 53458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53458, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 53458, "pid": 5, "tid": 7, "ts": 1716454218326147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266550, "dur": 6, "args": { "External id": 53458, "cbid": 211, "correlation": 53458 } }, { "ph": "s", "id": 53458, "pid": 76337, "tid": -914061504, "ts": 1716454218266550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218326153, "dur": 24, "args": { "External id": 53461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53461, "pid": 5, "tid": 7, "ts": 1716454218326153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266568, "dur": 10, "args": { "External id": 53461, "cbid": 211, "correlation": 53461 } }, { "ph": "s", "id": 53461, "pid": 76337, "tid": -914061504, "ts": 1716454218266568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218326178, "dur": 17, "args": { "External id": 53470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53470, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53470, "pid": 5, "tid": 7, "ts": 1716454218326178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266612, "dur": 10, "args": { "External id": 53470, "cbid": 211, "correlation": 53470 } }, { "ph": "s", "id": 53470, "pid": 76337, "tid": -914061504, "ts": 1716454218266612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218266664, "dur": 0, "args": { "External id": 53480, "cbid": 317, "correlation": 53480 } }, { "ph": "f", "id": 53480, "pid": 76337, "tid": -914061504, "ts": 1716454218266664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218266665, "dur": 0, "args": { "External id": 53481, "cbid": 203, "correlation": 53481 } }, { "ph": "f", "id": 53481, "pid": 76337, "tid": -914061504, "ts": 1716454218266665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218266666, "dur": 0, "args": { "External id": 53482, "cbid": 205, "correlation": 53482 } }, { "ph": "f", "id": 53482, "pid": 76337, "tid": -914061504, "ts": 1716454218266666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218326196, "dur": 17, "args": { "External id": 53486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53486, "pid": 5, "tid": 7, "ts": 1716454218326196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266681, "dur": 12, "args": { "External id": 53486, "cbid": 211, "correlation": 53486 } }, { "ph": "s", "id": 53486, "pid": 76337, "tid": -914061504, "ts": 1716454218266681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218326215, "dur": 239, "args": { "External id": 53488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53488, "pid": 5, "tid": 7, "ts": 1716454218326215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266695, "dur": 5, "args": { "External id": 53488, "cbid": 211, "correlation": 53488 } }, { "ph": "s", "id": 53488, "pid": 76337, "tid": -914061504, "ts": 1716454218266695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218326456, "dur": 1, "args": { "External id": 53490, "device": 5, "context": 1, "stream": 7, "correlation": 53490, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 53490, "pid": 5, "tid": 7, "ts": 1716454218326456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218266707, "dur": 8, "args": { "External id": 53490, "cbid": 51, "correlation": 53490 } }, { "ph": "s", "id": 53490, "pid": 76337, "tid": -914061504, "ts": 1716454218266707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218326460, "dur": 806, "args": { "External id": 53491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53491, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53491, "pid": 5, "tid": 7, "ts": 1716454218326460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266716, "dur": 6, "args": { "External id": 53491, "cbid": 211, "correlation": 53491 } }, { "ph": "s", "id": 53491, "pid": 76337, "tid": -914061504, "ts": 1716454218266716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218327267, "dur": 14, "args": { "External id": 53493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53493, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53493, "pid": 5, "tid": 7, "ts": 1716454218327267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266726, "dur": 5, "args": { "External id": 53493, "cbid": 211, "correlation": 53493 } }, { "ph": "s", "id": 53493, "pid": 76337, "tid": -914061504, "ts": 1716454218266726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218327282, "dur": 14, "args": { "External id": 53499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53499, "pid": 5, "tid": 7, "ts": 1716454218327282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266755, "dur": 8, "args": { "External id": 53499, "cbid": 211, "correlation": 53499 } }, { "ph": "s", "id": 53499, "pid": 76337, "tid": -914061504, "ts": 1716454218266755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218327298, "dur": 4, "args": { "External id": 53507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 53507, "pid": 5, "tid": 7, "ts": 1716454218327298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266799, "dur": 9, "args": { "External id": 53507, "cbid": 211, "correlation": 53507 } }, { "ph": "s", "id": 53507, "pid": 76337, "tid": -914061504, "ts": 1716454218266799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218266864, "dur": 1, "args": { "External id": 53523, "cbid": 251, "correlation": 53523 } }, { "ph": "f", "id": 53523, "pid": 76337, "tid": -914061504, "ts": 1716454218266864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218266869, "dur": 0, "args": { "External id": 53525, "cbid": 251, "correlation": 53525 } }, { "ph": "f", "id": 53525, "pid": 76337, "tid": -914061504, "ts": 1716454218266869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218327303, "dur": 13, "args": { "External id": 53526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53526, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53526, "pid": 5, "tid": 7, "ts": 1716454218327303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266871, "dur": 14, "args": { "External id": 53526, "cbid": 211, "correlation": 53526 } }, { "ph": "s", "id": 53526, "pid": 76337, "tid": -914061504, "ts": 1716454218266871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218327318, "dur": 5, "args": { "External id": 53528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53528, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53528, "pid": 5, "tid": 7, "ts": 1716454218327318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266887, "dur": 6, "args": { "External id": 53528, "cbid": 211, "correlation": 53528 } }, { "ph": "s", "id": 53528, "pid": 76337, "tid": -914061504, "ts": 1716454218266887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218327324, "dur": 17, "args": { "External id": 53538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53538, "pid": 5, "tid": 7, "ts": 1716454218327324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218266945, "dur": 12, "args": { "External id": 53538, "cbid": 211, "correlation": 53538 } }, { "ph": "s", "id": 53538, "pid": 76337, "tid": -914061504, "ts": 1716454218266945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218327342, "dur": 18, "args": { "External id": 53558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53558, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 53558, "pid": 5, "tid": 7, "ts": 1716454218327342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267026, "dur": 11, "args": { "External id": 53558, "cbid": 211, "correlation": 53558 } }, { "ph": "s", "id": 53558, "pid": 76337, "tid": -914061504, "ts": 1716454218267026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218327361, "dur": 4, "args": { "External id": 53570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53570, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 53570, "pid": 5, "tid": 7, "ts": 1716454218327361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267047, "dur": 6, "args": { "External id": 53570, "cbid": 211, "correlation": 53570 } }, { "ph": "s", "id": 53570, "pid": 76337, "tid": -914061504, "ts": 1716454218267047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218327367, "dur": 16, "args": { "External id": 53573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53573, "pid": 5, "tid": 7, "ts": 1716454218327367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267066, "dur": 7, "args": { "External id": 53573, "cbid": 211, "correlation": 53573 } }, { "ph": "s", "id": 53573, "pid": 76337, "tid": -914061504, "ts": 1716454218267066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218327384, "dur": 11, "args": { "External id": 53582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53582, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53582, "pid": 5, "tid": 7, "ts": 1716454218327384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267107, "dur": 10, "args": { "External id": 53582, "cbid": 211, "correlation": 53582 } }, { "ph": "s", "id": 53582, "pid": 76337, "tid": -914061504, "ts": 1716454218267107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218267171, "dur": 0, "args": { "External id": 53592, "cbid": 317, "correlation": 53592 } }, { "ph": "f", "id": 53592, "pid": 76337, "tid": -914061504, "ts": 1716454218267171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218267172, "dur": 0, "args": { "External id": 53593, "cbid": 203, "correlation": 53593 } }, { "ph": "f", "id": 53593, "pid": 76337, "tid": -914061504, "ts": 1716454218267172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218267172, "dur": 0, "args": { "External id": 53594, "cbid": 205, "correlation": 53594 } }, { "ph": "f", "id": 53594, "pid": 76337, "tid": -914061504, "ts": 1716454218267172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218327396, "dur": 11, "args": { "External id": 53598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53598, "pid": 5, "tid": 7, "ts": 1716454218327396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267186, "dur": 14, "args": { "External id": 53598, "cbid": 211, "correlation": 53598 } }, { "ph": "s", "id": 53598, "pid": 76337, "tid": -914061504, "ts": 1716454218267186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218327408, "dur": 161, "args": { "External id": 53600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53600, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53600, "pid": 5, "tid": 7, "ts": 1716454218327408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267203, "dur": 5, "args": { "External id": 53600, "cbid": 211, "correlation": 53600 } }, { "ph": "s", "id": 53600, "pid": 76337, "tid": -914061504, "ts": 1716454218267203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218327572, "dur": 1, "args": { "External id": 53602, "device": 5, "context": 1, "stream": 7, "correlation": 53602, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 53602, "pid": 5, "tid": 7, "ts": 1716454218327572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218267214, "dur": 7, "args": { "External id": 53602, "cbid": 51, "correlation": 53602 } }, { "ph": "s", "id": 53602, "pid": 76337, "tid": -914061504, "ts": 1716454218267214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218327575, "dur": 645, "args": { "External id": 53603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53603, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53603, "pid": 5, "tid": 7, "ts": 1716454218327575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267222, "dur": 6, "args": { "External id": 53603, "cbid": 211, "correlation": 53603 } }, { "ph": "s", "id": 53603, "pid": 76337, "tid": -914061504, "ts": 1716454218267222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218328222, "dur": 13, "args": { "External id": 53605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53605, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53605, "pid": 5, "tid": 7, "ts": 1716454218328222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267232, "dur": 5, "args": { "External id": 53605, "cbid": 211, "correlation": 53605 } }, { "ph": "s", "id": 53605, "pid": 76337, "tid": -914061504, "ts": 1716454218267232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218328236, "dur": 15, "args": { "External id": 53611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53611, "pid": 5, "tid": 7, "ts": 1716454218328236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267260, "dur": 9, "args": { "External id": 53611, "cbid": 211, "correlation": 53611 } }, { "ph": "s", "id": 53611, "pid": 76337, "tid": -914061504, "ts": 1716454218267260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218267320, "dur": 0, "args": { "External id": 53621, "cbid": 317, "correlation": 53621 } }, { "ph": "f", "id": 53621, "pid": 76337, "tid": -914061504, "ts": 1716454218267320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218267321, "dur": 0, "args": { "External id": 53622, "cbid": 203, "correlation": 53622 } }, { "ph": "f", "id": 53622, "pid": 76337, "tid": -914061504, "ts": 1716454218267321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218267321, "dur": 0, "args": { "External id": 53623, "cbid": 205, "correlation": 53623 } }, { "ph": "f", "id": 53623, "pid": 76337, "tid": -914061504, "ts": 1716454218267321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218328252, "dur": 17, "args": { "External id": 53627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53627, "pid": 5, "tid": 7, "ts": 1716454218328252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267335, "dur": 11, "args": { "External id": 53627, "cbid": 211, "correlation": 53627 } }, { "ph": "s", "id": 53627, "pid": 76337, "tid": -914061504, "ts": 1716454218267335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218328270, "dur": 4, "args": { "External id": 53629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53629, "pid": 5, "tid": 7, "ts": 1716454218328270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267351, "dur": 6, "args": { "External id": 53629, "cbid": 211, "correlation": 53629 } }, { "ph": "s", "id": 53629, "pid": 76337, "tid": -914061504, "ts": 1716454218267351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218267359, "dur": 0, "args": { "External id": 53630, "cbid": 51, "correlation": 53630 } }, { "ph": "s", "id": 53630, "pid": 76337, "tid": -914061504, "ts": 1716454218267359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218328276, "dur": 132, "args": { "External id": 53631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53631, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 53631, "pid": 5, "tid": 7, "ts": 1716454218328276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267360, "dur": 5, "args": { "External id": 53631, "cbid": 211, "correlation": 53631 } }, { "ph": "s", "id": 53631, "pid": 76337, "tid": -914061504, "ts": 1716454218267360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218328409, "dur": 15, "args": { "External id": 53636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53636, "pid": 5, "tid": 7, "ts": 1716454218328409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267386, "dur": 8, "args": { "External id": 53636, "cbid": 211, "correlation": 53636 } }, { "ph": "s", "id": 53636, "pid": 76337, "tid": -914061504, "ts": 1716454218267386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218328425, "dur": 12, "args": { "External id": 53644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53644, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53644, "pid": 5, "tid": 7, "ts": 1716454218328425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267414, "dur": 9, "args": { "External id": 53644, "cbid": 211, "correlation": 53644 } }, { "ph": "s", "id": 53644, "pid": 76337, "tid": -914061504, "ts": 1716454218267414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218328438, "dur": 10, "args": { "External id": 53652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53652, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53652, "pid": 5, "tid": 7, "ts": 1716454218328438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267443, "dur": 11, "args": { "External id": 53652, "cbid": 211, "correlation": 53652 } }, { "ph": "s", "id": 53652, "pid": 76337, "tid": -914061504, "ts": 1716454218267443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218328450, "dur": 19, "args": { "External id": 53672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53672, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 53672, "pid": 5, "tid": 7, "ts": 1716454218328450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267531, "dur": 12, "args": { "External id": 53672, "cbid": 211, "correlation": 53672 } }, { "ph": "s", "id": 53672, "pid": 76337, "tid": -914061504, "ts": 1716454218267531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218328469, "dur": 4, "args": { "External id": 53684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53684, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 53684, "pid": 5, "tid": 7, "ts": 1716454218328469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267553, "dur": 6, "args": { "External id": 53684, "cbid": 211, "correlation": 53684 } }, { "ph": "s", "id": 53684, "pid": 76337, "tid": -914061504, "ts": 1716454218267553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218328475, "dur": 17, "args": { "External id": 53687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53687, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53687, "pid": 5, "tid": 7, "ts": 1716454218328475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267571, "dur": 7, "args": { "External id": 53687, "cbid": 211, "correlation": 53687 } }, { "ph": "s", "id": 53687, "pid": 76337, "tid": -914061504, "ts": 1716454218267571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218267629, "dur": 0, "args": { "External id": 53698, "cbid": 317, "correlation": 53698 } }, { "ph": "f", "id": 53698, "pid": 76337, "tid": -914061504, "ts": 1716454218267629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218267630, "dur": 0, "args": { "External id": 53699, "cbid": 203, "correlation": 53699 } }, { "ph": "f", "id": 53699, "pid": 76337, "tid": -914061504, "ts": 1716454218267630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218267631, "dur": 0, "args": { "External id": 53700, "cbid": 205, "correlation": 53700 } }, { "ph": "f", "id": 53700, "pid": 76337, "tid": -914061504, "ts": 1716454218267631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218328493, "dur": 11, "args": { "External id": 53704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53704, "pid": 5, "tid": 7, "ts": 1716454218328493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267644, "dur": 11, "args": { "External id": 53704, "cbid": 211, "correlation": 53704 } }, { "ph": "s", "id": 53704, "pid": 76337, "tid": -914061504, "ts": 1716454218267644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218328506, "dur": 3, "args": { "External id": 53706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53706, "pid": 5, "tid": 7, "ts": 1716454218328506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267659, "dur": 6, "args": { "External id": 53706, "cbid": 211, "correlation": 53706 } }, { "ph": "s", "id": 53706, "pid": 76337, "tid": -914061504, "ts": 1716454218267659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218267668, "dur": 0, "args": { "External id": 53707, "cbid": 51, "correlation": 53707 } }, { "ph": "s", "id": 53707, "pid": 76337, "tid": -914061504, "ts": 1716454218267668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218328510, "dur": 91, "args": { "External id": 53708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53708, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 53708, "pid": 5, "tid": 7, "ts": 1716454218328510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267668, "dur": 5, "args": { "External id": 53708, "cbid": 211, "correlation": 53708 } }, { "ph": "s", "id": 53708, "pid": 76337, "tid": -914061504, "ts": 1716454218267668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218328602, "dur": 16, "args": { "External id": 53713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53713, "pid": 5, "tid": 7, "ts": 1716454218328602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267695, "dur": 8, "args": { "External id": 53713, "cbid": 211, "correlation": 53713 } }, { "ph": "s", "id": 53713, "pid": 76337, "tid": -914061504, "ts": 1716454218267695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218328620, "dur": 84, "args": { "External id": 53722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53722, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53722, "pid": 5, "tid": 7, "ts": 1716454218328620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267775, "dur": 18, "args": { "External id": 53722, "cbid": 211, "correlation": 53722 } }, { "ph": "s", "id": 53722, "pid": 76337, "tid": -914061504, "ts": 1716454218267775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218328705, "dur": 29, "args": { "External id": 53744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53744, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53744, "pid": 5, "tid": 7, "ts": 1716454218328705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267836, "dur": 10, "args": { "External id": 53744, "cbid": 211, "correlation": 53744 } }, { "ph": "s", "id": 53744, "pid": 76337, "tid": -914061504, "ts": 1716454218267836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218267927, "dur": 1, "args": { "External id": 53755, "cbid": 251, "correlation": 53755 } }, { "ph": "f", "id": 53755, "pid": 76337, "tid": -914061504, "ts": 1716454218267927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218328736, "dur": 167, "args": { "External id": 53756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53756, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53756, "pid": 5, "tid": 7, "ts": 1716454218328736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218267933, "dur": 13, "args": { "External id": 53756, "cbid": 211, "correlation": 53756 } }, { "ph": "s", "id": 53756, "pid": 76337, "tid": -914061504, "ts": 1716454218267933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268011, "dur": 1, "args": { "External id": 53767, "cbid": 251, "correlation": 53767 } }, { "ph": "f", "id": 53767, "pid": 76337, "tid": -914061504, "ts": 1716454218268011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218328904, "dur": 156, "args": { "External id": 53768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53768, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53768, "pid": 5, "tid": 7, "ts": 1716454218328904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268015, "dur": 12, "args": { "External id": 53768, "cbid": 211, "correlation": 53768 } }, { "ph": "s", "id": 53768, "pid": 76337, "tid": -914061504, "ts": 1716454218268015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268082, "dur": 1, "args": { "External id": 53779, "cbid": 251, "correlation": 53779 } }, { "ph": "f", "id": 53779, "pid": 76337, "tid": -914061504, "ts": 1716454218268082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218329061, "dur": 158, "args": { "External id": 53780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53780, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53780, "pid": 5, "tid": 7, "ts": 1716454218329061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268086, "dur": 11, "args": { "External id": 53780, "cbid": 211, "correlation": 53780 } }, { "ph": "s", "id": 53780, "pid": 76337, "tid": -914061504, "ts": 1716454218268086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218329220, "dur": 338, "args": { "External id": 53805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53805, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53805, "pid": 5, "tid": 7, "ts": 1716454218329220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268173, "dur": 12, "args": { "External id": 53805, "cbid": 211, "correlation": 53805 } }, { "ph": "s", "id": 53805, "pid": 76337, "tid": -914061504, "ts": 1716454218268173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268274, "dur": 1, "args": { "External id": 53823, "cbid": 251, "correlation": 53823 } }, { "ph": "f", "id": 53823, "pid": 76337, "tid": -914061504, "ts": 1716454218268274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218329559, "dur": 163, "args": { "External id": 53825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53825, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53825, "pid": 5, "tid": 7, "ts": 1716454218329559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268280, "dur": 13, "args": { "External id": 53825, "cbid": 211, "correlation": 53825 } }, { "ph": "s", "id": 53825, "pid": 76337, "tid": -914061504, "ts": 1716454218268280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218329723, "dur": 19, "args": { "External id": 53833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53833, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53833, "pid": 5, "tid": 7, "ts": 1716454218329723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268350, "dur": 12, "args": { "External id": 53833, "cbid": 211, "correlation": 53833 } }, { "ph": "s", "id": 53833, "pid": 76337, "tid": -914061504, "ts": 1716454218268350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218329743, "dur": 28, "args": { "External id": 53841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53841, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53841, "pid": 5, "tid": 7, "ts": 1716454218329743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268388, "dur": 8, "args": { "External id": 53841, "cbid": 211, "correlation": 53841 } }, { "ph": "s", "id": 53841, "pid": 76337, "tid": -914061504, "ts": 1716454218268388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218329772, "dur": 19, "args": { "External id": 53852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53852, "pid": 5, "tid": 7, "ts": 1716454218329772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268460, "dur": 13, "args": { "External id": 53852, "cbid": 211, "correlation": 53852 } }, { "ph": "s", "id": 53852, "pid": 76337, "tid": -914061504, "ts": 1716454218268460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218329792, "dur": 16, "args": { "External id": 53874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53874, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53874, "pid": 5, "tid": 7, "ts": 1716454218329792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268491, "dur": 7, "args": { "External id": 53874, "cbid": 211, "correlation": 53874 } }, { "ph": "s", "id": 53874, "pid": 76337, "tid": -914061504, "ts": 1716454218268491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268580, "dur": 1, "args": { "External id": 53885, "cbid": 251, "correlation": 53885 } }, { "ph": "f", "id": 53885, "pid": 76337, "tid": -914061504, "ts": 1716454218268580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218329810, "dur": 89, "args": { "External id": 53886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53886, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53886, "pid": 5, "tid": 7, "ts": 1716454218329810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268586, "dur": 13, "args": { "External id": 53886, "cbid": 211, "correlation": 53886 } }, { "ph": "s", "id": 53886, "pid": 76337, "tid": -914061504, "ts": 1716454218268586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268656, "dur": 1, "args": { "External id": 53897, "cbid": 251, "correlation": 53897 } }, { "ph": "f", "id": 53897, "pid": 76337, "tid": -914061504, "ts": 1716454218268656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268660, "dur": 0, "args": { "External id": 53898, "cbid": 251, "correlation": 53898 } }, { "ph": "f", "id": 53898, "pid": 76337, "tid": -914061504, "ts": 1716454218268660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218329900, "dur": 11, "args": { "External id": 53899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53899, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53899, "pid": 5, "tid": 7, "ts": 1716454218329900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268661, "dur": 12, "args": { "External id": 53899, "cbid": 211, "correlation": 53899 } }, { "ph": "s", "id": 53899, "pid": 76337, "tid": -914061504, "ts": 1716454218268661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218329913, "dur": 5, "args": { "External id": 53901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53901, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53901, "pid": 5, "tid": 7, "ts": 1716454218329913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268675, "dur": 6, "args": { "External id": 53901, "cbid": 211, "correlation": 53901 } }, { "ph": "s", "id": 53901, "pid": 76337, "tid": -914061504, "ts": 1716454218268675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268733, "dur": 1, "args": { "External id": 53912, "cbid": 251, "correlation": 53912 } }, { "ph": "f", "id": 53912, "pid": 76337, "tid": -914061504, "ts": 1716454218268733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268736, "dur": 0, "args": { "External id": 53913, "cbid": 251, "correlation": 53913 } }, { "ph": "f", "id": 53913, "pid": 76337, "tid": -914061504, "ts": 1716454218268736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218329919, "dur": 8, "args": { "External id": 53914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53914, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53914, "pid": 5, "tid": 7, "ts": 1716454218329919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268738, "dur": 11, "args": { "External id": 53914, "cbid": 211, "correlation": 53914 } }, { "ph": "s", "id": 53914, "pid": 76337, "tid": -914061504, "ts": 1716454218268738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218329929, "dur": 3, "args": { "External id": 53916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53916, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53916, "pid": 5, "tid": 7, "ts": 1716454218329929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268751, "dur": 6, "args": { "External id": 53916, "cbid": 211, "correlation": 53916 } }, { "ph": "s", "id": 53916, "pid": 76337, "tid": -914061504, "ts": 1716454218268751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218329933, "dur": 55, "args": { "External id": 53941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53941, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 53941, "pid": 5, "tid": 7, "ts": 1716454218329933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268827, "dur": 13, "args": { "External id": 53941, "cbid": 211, "correlation": 53941 } }, { "ph": "s", "id": 53941, "pid": 76337, "tid": -914061504, "ts": 1716454218268827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218268928, "dur": 1, "args": { "External id": 53959, "cbid": 251, "correlation": 53959 } }, { "ph": "f", "id": 53959, "pid": 76337, "tid": -914061504, "ts": 1716454218268928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218329989, "dur": 92, "args": { "External id": 53961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53961, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 53961, "pid": 5, "tid": 7, "ts": 1716454218329989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218268934, "dur": 14, "args": { "External id": 53961, "cbid": 211, "correlation": 53961 } }, { "ph": "s", "id": 53961, "pid": 76337, "tid": -914061504, "ts": 1716454218268934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218330082, "dur": 9, "args": { "External id": 53969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53969, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53969, "pid": 5, "tid": 7, "ts": 1716454218330082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269015, "dur": 12, "args": { "External id": 53969, "cbid": 211, "correlation": 53969 } }, { "ph": "s", "id": 53969, "pid": 76337, "tid": -914061504, "ts": 1716454218269015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218330093, "dur": 22, "args": { "External id": 53977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53977, "pid": 5, "tid": 7, "ts": 1716454218330093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269057, "dur": 10, "args": { "External id": 53977, "cbid": 211, "correlation": 53977 } }, { "ph": "s", "id": 53977, "pid": 76337, "tid": -914061504, "ts": 1716454218269057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218330116, "dur": 18, "args": { "External id": 53999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 53999, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 53999, "pid": 5, "tid": 7, "ts": 1716454218330116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269109, "dur": 10, "args": { "External id": 53999, "cbid": 211, "correlation": 53999 } }, { "ph": "s", "id": 53999, "pid": 76337, "tid": -914061504, "ts": 1716454218269109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218269196, "dur": 1, "args": { "External id": 54015, "cbid": 251, "correlation": 54015 } }, { "ph": "f", "id": 54015, "pid": 76337, "tid": -914061504, "ts": 1716454218269196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218269201, "dur": 0, "args": { "External id": 54017, "cbid": 251, "correlation": 54017 } }, { "ph": "f", "id": 54017, "pid": 76337, "tid": -914061504, "ts": 1716454218269201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218330135, "dur": 497, "args": { "External id": 54018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54018, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54018, "pid": 5, "tid": 7, "ts": 1716454218330135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269203, "dur": 13, "args": { "External id": 54018, "cbid": 211, "correlation": 54018 } }, { "ph": "s", "id": 54018, "pid": 76337, "tid": -914061504, "ts": 1716454218269203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218330633, "dur": 67, "args": { "External id": 54026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54026, "pid": 5, "tid": 7, "ts": 1716454218330633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269269, "dur": 12, "args": { "External id": 54026, "cbid": 211, "correlation": 54026 } }, { "ph": "s", "id": 54026, "pid": 76337, "tid": -914061504, "ts": 1716454218269269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218330701, "dur": 65, "args": { "External id": 54034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54034, "pid": 5, "tid": 7, "ts": 1716454218330701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269303, "dur": 10, "args": { "External id": 54034, "cbid": 211, "correlation": 54034 } }, { "ph": "s", "id": 54034, "pid": 76337, "tid": -914061504, "ts": 1716454218269303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218269385, "dur": 1, "args": { "External id": 54050, "cbid": 251, "correlation": 54050 } }, { "ph": "f", "id": 54050, "pid": 76337, "tid": -914061504, "ts": 1716454218269385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218330769, "dur": 1, "args": { "External id": 54052, "device": 5, "context": 1, "stream": 7, "correlation": 54052, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 54052, "pid": 5, "tid": 7, "ts": 1716454218330769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218269390, "dur": 9, "args": { "External id": 54052, "cbid": 51, "correlation": 54052 } }, { "ph": "s", "id": 54052, "pid": 76337, "tid": -914061504, "ts": 1716454218269390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218330773, "dur": 268, "args": { "External id": 54053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54053, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 54053, "pid": 5, "tid": 7, "ts": 1716454218330773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269400, "dur": 11, "args": { "External id": 54053, "cbid": 211, "correlation": 54053 } }, { "ph": "s", "id": 54053, "pid": 76337, "tid": -914061504, "ts": 1716454218269400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218331042, "dur": 13, "args": { "External id": 54061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54061, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54061, "pid": 5, "tid": 7, "ts": 1716454218331042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269444, "dur": 10, "args": { "External id": 54061, "cbid": 211, "correlation": 54061 } }, { "ph": "s", "id": 54061, "pid": 76337, "tid": -914061504, "ts": 1716454218269444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218331057, "dur": 37, "args": { "External id": 54072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54072, "pid": 5, "tid": 7, "ts": 1716454218331057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269512, "dur": 13, "args": { "External id": 54072, "cbid": 211, "correlation": 54072 } }, { "ph": "s", "id": 54072, "pid": 76337, "tid": -914061504, "ts": 1716454218269512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218269577, "dur": 0, "args": { "External id": 54084, "cbid": 317, "correlation": 54084 } }, { "ph": "f", "id": 54084, "pid": 76337, "tid": -914061504, "ts": 1716454218269577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218269578, "dur": 0, "args": { "External id": 54085, "cbid": 203, "correlation": 54085 } }, { "ph": "f", "id": 54085, "pid": 76337, "tid": -914061504, "ts": 1716454218269578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218269579, "dur": 0, "args": { "External id": 54086, "cbid": 205, "correlation": 54086 } }, { "ph": "f", "id": 54086, "pid": 76337, "tid": -914061504, "ts": 1716454218269579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218331095, "dur": 13, "args": { "External id": 54090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54090, "pid": 5, "tid": 7, "ts": 1716454218331095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269594, "dur": 12, "args": { "External id": 54090, "cbid": 211, "correlation": 54090 } }, { "ph": "s", "id": 54090, "pid": 76337, "tid": -914061504, "ts": 1716454218269594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218331109, "dur": 4, "args": { "External id": 54092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 54092, "pid": 5, "tid": 7, "ts": 1716454218331109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269610, "dur": 6, "args": { "External id": 54092, "cbid": 211, "correlation": 54092 } }, { "ph": "s", "id": 54092, "pid": 76337, "tid": -914061504, "ts": 1716454218269610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218269619, "dur": 0, "args": { "External id": 54093, "cbid": 51, "correlation": 54093 } }, { "ph": "s", "id": 54093, "pid": 76337, "tid": -914061504, "ts": 1716454218269619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218331114, "dur": 97, "args": { "External id": 54094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54094, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 54094, "pid": 5, "tid": 7, "ts": 1716454218331114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269620, "dur": 5, "args": { "External id": 54094, "cbid": 211, "correlation": 54094 } }, { "ph": "s", "id": 54094, "pid": 76337, "tid": -914061504, "ts": 1716454218269620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218331213, "dur": 16, "args": { "External id": 54099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54099, "pid": 5, "tid": 7, "ts": 1716454218331213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269647, "dur": 11, "args": { "External id": 54099, "cbid": 211, "correlation": 54099 } }, { "ph": "s", "id": 54099, "pid": 76337, "tid": -914061504, "ts": 1716454218269647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218331230, "dur": 12, "args": { "External id": 54107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54107, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54107, "pid": 5, "tid": 7, "ts": 1716454218331230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269682, "dur": 8, "args": { "External id": 54107, "cbid": 211, "correlation": 54107 } }, { "ph": "s", "id": 54107, "pid": 76337, "tid": -914061504, "ts": 1716454218269682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218331243, "dur": 55, "args": { "External id": 54118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54118, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54118, "pid": 5, "tid": 7, "ts": 1716454218331243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269745, "dur": 12, "args": { "External id": 54118, "cbid": 211, "correlation": 54118 } }, { "ph": "s", "id": 54118, "pid": 76337, "tid": -914061504, "ts": 1716454218269745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218269801, "dur": 0, "args": { "External id": 54128, "cbid": 317, "correlation": 54128 } }, { "ph": "f", "id": 54128, "pid": 76337, "tid": -914061504, "ts": 1716454218269801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218269801, "dur": 0, "args": { "External id": 54129, "cbid": 203, "correlation": 54129 } }, { "ph": "f", "id": 54129, "pid": 76337, "tid": -914061504, "ts": 1716454218269801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218269802, "dur": 0, "args": { "External id": 54130, "cbid": 205, "correlation": 54130 } }, { "ph": "f", "id": 54130, "pid": 76337, "tid": -914061504, "ts": 1716454218269802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218331300, "dur": 39, "args": { "External id": 54134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54134, "pid": 5, "tid": 7, "ts": 1716454218331300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269818, "dur": 11, "args": { "External id": 54134, "cbid": 211, "correlation": 54134 } }, { "ph": "s", "id": 54134, "pid": 76337, "tid": -914061504, "ts": 1716454218269818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218331340, "dur": 162, "args": { "External id": 54136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54136, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54136, "pid": 5, "tid": 7, "ts": 1716454218331340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269832, "dur": 5, "args": { "External id": 54136, "cbid": 211, "correlation": 54136 } }, { "ph": "s", "id": 54136, "pid": 76337, "tid": -914061504, "ts": 1716454218269832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218331503, "dur": 1962, "args": { "External id": 54138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54138, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54138, "pid": 5, "tid": 7, "ts": 1716454218331503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269845, "dur": 8, "args": { "External id": 54138, "cbid": 211, "correlation": 54138 } }, { "ph": "s", "id": 54138, "pid": 76337, "tid": -914061504, "ts": 1716454218269845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218333466, "dur": 40, "args": { "External id": 54140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54140, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54140, "pid": 5, "tid": 7, "ts": 1716454218333466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269857, "dur": 5, "args": { "External id": 54140, "cbid": 211, "correlation": 54140 } }, { "ph": "s", "id": 54140, "pid": 76337, "tid": -914061504, "ts": 1716454218269857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218333508, "dur": 58, "args": { "External id": 54146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54146, "pid": 5, "tid": 7, "ts": 1716454218333508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269884, "dur": 9, "args": { "External id": 54146, "cbid": 211, "correlation": 54146 } }, { "ph": "s", "id": 54146, "pid": 76337, "tid": -914061504, "ts": 1716454218269884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218333567, "dur": 85, "args": { "External id": 54155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54155, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54155, "pid": 5, "tid": 7, "ts": 1716454218333567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218269983, "dur": 14, "args": { "External id": 54155, "cbid": 211, "correlation": 54155 } }, { "ph": "s", "id": 54155, "pid": 76337, "tid": -914061504, "ts": 1716454218269983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218333654, "dur": 70, "args": { "External id": 54175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54175, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 54175, "pid": 5, "tid": 7, "ts": 1716454218333654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270061, "dur": 12, "args": { "External id": 54175, "cbid": 211, "correlation": 54175 } }, { "ph": "s", "id": 54175, "pid": 76337, "tid": -914061504, "ts": 1716454218270061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218333725, "dur": 5, "args": { "External id": 54187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54187, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 54187, "pid": 5, "tid": 7, "ts": 1716454218333725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270083, "dur": 6, "args": { "External id": 54187, "cbid": 211, "correlation": 54187 } }, { "ph": "s", "id": 54187, "pid": 76337, "tid": -914061504, "ts": 1716454218270083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218333731, "dur": 82, "args": { "External id": 54190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54190, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54190, "pid": 5, "tid": 7, "ts": 1716454218333731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270102, "dur": 7, "args": { "External id": 54190, "cbid": 211, "correlation": 54190 } }, { "ph": "s", "id": 54190, "pid": 76337, "tid": -914061504, "ts": 1716454218270102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218333814, "dur": 53, "args": { "External id": 54199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54199, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54199, "pid": 5, "tid": 7, "ts": 1716454218333814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270143, "dur": 11, "args": { "External id": 54199, "cbid": 211, "correlation": 54199 } }, { "ph": "s", "id": 54199, "pid": 76337, "tid": -914061504, "ts": 1716454218270143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218270195, "dur": 0, "args": { "External id": 54209, "cbid": 317, "correlation": 54209 } }, { "ph": "f", "id": 54209, "pid": 76337, "tid": -914061504, "ts": 1716454218270195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218270196, "dur": 0, "args": { "External id": 54210, "cbid": 203, "correlation": 54210 } }, { "ph": "f", "id": 54210, "pid": 76337, "tid": -914061504, "ts": 1716454218270196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218270197, "dur": 0, "args": { "External id": 54211, "cbid": 205, "correlation": 54211 } }, { "ph": "f", "id": 54211, "pid": 76337, "tid": -914061504, "ts": 1716454218270197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218333869, "dur": 57, "args": { "External id": 54215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54215, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54215, "pid": 5, "tid": 7, "ts": 1716454218333869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270213, "dur": 11, "args": { "External id": 54215, "cbid": 211, "correlation": 54215 } }, { "ph": "s", "id": 54215, "pid": 76337, "tid": -914061504, "ts": 1716454218270213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218333927, "dur": 120, "args": { "External id": 54217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54217, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54217, "pid": 5, "tid": 7, "ts": 1716454218333927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270227, "dur": 5, "args": { "External id": 54217, "cbid": 211, "correlation": 54217 } }, { "ph": "s", "id": 54217, "pid": 76337, "tid": -914061504, "ts": 1716454218270227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218334049, "dur": 1880, "args": { "External id": 54219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54219, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54219, "pid": 5, "tid": 7, "ts": 1716454218334049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270238, "dur": 6, "args": { "External id": 54219, "cbid": 211, "correlation": 54219 } }, { "ph": "s", "id": 54219, "pid": 76337, "tid": -914061504, "ts": 1716454218270238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218335930, "dur": 20, "args": { "External id": 54221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54221, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54221, "pid": 5, "tid": 7, "ts": 1716454218335930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270248, "dur": 5, "args": { "External id": 54221, "cbid": 211, "correlation": 54221 } }, { "ph": "s", "id": 54221, "pid": 76337, "tid": -914061504, "ts": 1716454218270248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218335951, "dur": 32, "args": { "External id": 54227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54227, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54227, "pid": 5, "tid": 7, "ts": 1716454218335951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270276, "dur": 11, "args": { "External id": 54227, "cbid": 211, "correlation": 54227 } }, { "ph": "s", "id": 54227, "pid": 76337, "tid": -914061504, "ts": 1716454218270276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218335985, "dur": 3, "args": { "External id": 54235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54235, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 54235, "pid": 5, "tid": 7, "ts": 1716454218335985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270324, "dur": 10, "args": { "External id": 54235, "cbid": 211, "correlation": 54235 } }, { "ph": "s", "id": 54235, "pid": 76337, "tid": -914061504, "ts": 1716454218270324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218270390, "dur": 1, "args": { "External id": 54251, "cbid": 251, "correlation": 54251 } }, { "ph": "f", "id": 54251, "pid": 76337, "tid": -914061504, "ts": 1716454218270390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218270396, "dur": 0, "args": { "External id": 54253, "cbid": 251, "correlation": 54253 } }, { "ph": "f", "id": 54253, "pid": 76337, "tid": -914061504, "ts": 1716454218270396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218335990, "dur": 12, "args": { "External id": 54254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54254, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 54254, "pid": 5, "tid": 7, "ts": 1716454218335990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270397, "dur": 12, "args": { "External id": 54254, "cbid": 211, "correlation": 54254 } }, { "ph": "s", "id": 54254, "pid": 76337, "tid": -914061504, "ts": 1716454218270397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218336003, "dur": 5, "args": { "External id": 54256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54256, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 54256, "pid": 5, "tid": 7, "ts": 1716454218336003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270411, "dur": 6, "args": { "External id": 54256, "cbid": 211, "correlation": 54256 } }, { "ph": "s", "id": 54256, "pid": 76337, "tid": -914061504, "ts": 1716454218270411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218336010, "dur": 29, "args": { "External id": 54266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54266, "pid": 5, "tid": 7, "ts": 1716454218336010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270470, "dur": 12, "args": { "External id": 54266, "cbid": 211, "correlation": 54266 } }, { "ph": "s", "id": 54266, "pid": 76337, "tid": -914061504, "ts": 1716454218270470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218336040, "dur": 30, "args": { "External id": 54286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54286, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 54286, "pid": 5, "tid": 7, "ts": 1716454218336040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270536, "dur": 11, "args": { "External id": 54286, "cbid": 211, "correlation": 54286 } }, { "ph": "s", "id": 54286, "pid": 76337, "tid": -914061504, "ts": 1716454218270536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218336072, "dur": 4, "args": { "External id": 54298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54298, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 54298, "pid": 5, "tid": 7, "ts": 1716454218336072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270557, "dur": 6, "args": { "External id": 54298, "cbid": 211, "correlation": 54298 } }, { "ph": "s", "id": 54298, "pid": 76337, "tid": -914061504, "ts": 1716454218270557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218336077, "dur": 30, "args": { "External id": 54301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54301, "pid": 5, "tid": 7, "ts": 1716454218336077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270575, "dur": 10, "args": { "External id": 54301, "cbid": 211, "correlation": 54301 } }, { "ph": "s", "id": 54301, "pid": 76337, "tid": -914061504, "ts": 1716454218270575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218336108, "dur": 20, "args": { "External id": 54310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54310, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54310, "pid": 5, "tid": 7, "ts": 1716454218336108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270619, "dur": 10, "args": { "External id": 54310, "cbid": 211, "correlation": 54310 } }, { "ph": "s", "id": 54310, "pid": 76337, "tid": -914061504, "ts": 1716454218270619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218270682, "dur": 0, "args": { "External id": 54320, "cbid": 317, "correlation": 54320 } }, { "ph": "f", "id": 54320, "pid": 76337, "tid": -914061504, "ts": 1716454218270682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218270683, "dur": 0, "args": { "External id": 54321, "cbid": 203, "correlation": 54321 } }, { "ph": "f", "id": 54321, "pid": 76337, "tid": -914061504, "ts": 1716454218270683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218270684, "dur": 0, "args": { "External id": 54322, "cbid": 205, "correlation": 54322 } }, { "ph": "f", "id": 54322, "pid": 76337, "tid": -914061504, "ts": 1716454218270684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218336130, "dur": 22, "args": { "External id": 54326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54326, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54326, "pid": 5, "tid": 7, "ts": 1716454218336130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270699, "dur": 12, "args": { "External id": 54326, "cbid": 211, "correlation": 54326 } }, { "ph": "s", "id": 54326, "pid": 76337, "tid": -914061504, "ts": 1716454218270699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218336153, "dur": 43, "args": { "External id": 54328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54328, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54328, "pid": 5, "tid": 7, "ts": 1716454218336153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270714, "dur": 5, "args": { "External id": 54328, "cbid": 211, "correlation": 54328 } }, { "ph": "s", "id": 54328, "pid": 76337, "tid": -914061504, "ts": 1716454218270714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218336198, "dur": 642, "args": { "External id": 54330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54330, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54330, "pid": 5, "tid": 7, "ts": 1716454218336198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270726, "dur": 6, "args": { "External id": 54330, "cbid": 211, "correlation": 54330 } }, { "ph": "s", "id": 54330, "pid": 76337, "tid": -914061504, "ts": 1716454218270726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218336841, "dur": 21, "args": { "External id": 54332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54332, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54332, "pid": 5, "tid": 7, "ts": 1716454218336841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270735, "dur": 5, "args": { "External id": 54332, "cbid": 211, "correlation": 54332 } }, { "ph": "s", "id": 54332, "pid": 76337, "tid": -914061504, "ts": 1716454218270735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218336864, "dur": 33, "args": { "External id": 54338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54338, "pid": 5, "tid": 7, "ts": 1716454218336864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270763, "dur": 8, "args": { "External id": 54338, "cbid": 211, "correlation": 54338 } }, { "ph": "s", "id": 54338, "pid": 76337, "tid": -914061504, "ts": 1716454218270763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218270821, "dur": 0, "args": { "External id": 54348, "cbid": 317, "correlation": 54348 } }, { "ph": "f", "id": 54348, "pid": 76337, "tid": -914061504, "ts": 1716454218270821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218270821, "dur": 0, "args": { "External id": 54349, "cbid": 203, "correlation": 54349 } }, { "ph": "f", "id": 54349, "pid": 76337, "tid": -914061504, "ts": 1716454218270821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218270822, "dur": 0, "args": { "External id": 54350, "cbid": 205, "correlation": 54350 } }, { "ph": "f", "id": 54350, "pid": 76337, "tid": -914061504, "ts": 1716454218270822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218336898, "dur": 56, "args": { "External id": 54354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54354, "pid": 5, "tid": 7, "ts": 1716454218336898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270840, "dur": 12, "args": { "External id": 54354, "cbid": 211, "correlation": 54354 } }, { "ph": "s", "id": 54354, "pid": 76337, "tid": -914061504, "ts": 1716454218270840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218336956, "dur": 269, "args": { "External id": 54356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54356, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54356, "pid": 5, "tid": 7, "ts": 1716454218336956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270860, "dur": 7, "args": { "External id": 54356, "cbid": 211, "correlation": 54356 } }, { "ph": "s", "id": 54356, "pid": 76337, "tid": -914061504, "ts": 1716454218270860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218337226, "dur": 21, "args": { "External id": 54358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54358, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54358, "pid": 5, "tid": 7, "ts": 1716454218337226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270872, "dur": 6, "args": { "External id": 54358, "cbid": 211, "correlation": 54358 } }, { "ph": "s", "id": 54358, "pid": 76337, "tid": -914061504, "ts": 1716454218270872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218337248, "dur": 32, "args": { "External id": 54364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54364, "pid": 5, "tid": 7, "ts": 1716454218337248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270900, "dur": 8, "args": { "External id": 54364, "cbid": 211, "correlation": 54364 } }, { "ph": "s", "id": 54364, "pid": 76337, "tid": -914061504, "ts": 1716454218270900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218337281, "dur": 27, "args": { "External id": 54372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54372, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54372, "pid": 5, "tid": 7, "ts": 1716454218337281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270930, "dur": 8, "args": { "External id": 54372, "cbid": 211, "correlation": 54372 } }, { "ph": "s", "id": 54372, "pid": 76337, "tid": -914061504, "ts": 1716454218270930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218337309, "dur": 20, "args": { "External id": 54380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54380, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54380, "pid": 5, "tid": 7, "ts": 1716454218337309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218270958, "dur": 8, "args": { "External id": 54380, "cbid": 211, "correlation": 54380 } }, { "ph": "s", "id": 54380, "pid": 76337, "tid": -914061504, "ts": 1716454218270958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218337331, "dur": 30, "args": { "External id": 54400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54400, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 54400, "pid": 5, "tid": 7, "ts": 1716454218337331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271049, "dur": 13, "args": { "External id": 54400, "cbid": 211, "correlation": 54400 } }, { "ph": "s", "id": 54400, "pid": 76337, "tid": -914061504, "ts": 1716454218271049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218337362, "dur": 4, "args": { "External id": 54412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54412, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 54412, "pid": 5, "tid": 7, "ts": 1716454218337362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271072, "dur": 6, "args": { "External id": 54412, "cbid": 211, "correlation": 54412 } }, { "ph": "s", "id": 54412, "pid": 76337, "tid": -914061504, "ts": 1716454218271072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218337367, "dur": 31, "args": { "External id": 54415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54415, "pid": 5, "tid": 7, "ts": 1716454218337367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271090, "dur": 6, "args": { "External id": 54415, "cbid": 211, "correlation": 54415 } }, { "ph": "s", "id": 54415, "pid": 76337, "tid": -914061504, "ts": 1716454218271090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218271148, "dur": 0, "args": { "External id": 54426, "cbid": 317, "correlation": 54426 } }, { "ph": "f", "id": 54426, "pid": 76337, "tid": -914061504, "ts": 1716454218271148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218271149, "dur": 0, "args": { "External id": 54427, "cbid": 203, "correlation": 54427 } }, { "ph": "f", "id": 54427, "pid": 76337, "tid": -914061504, "ts": 1716454218271149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218271149, "dur": 0, "args": { "External id": 54428, "cbid": 205, "correlation": 54428 } }, { "ph": "f", "id": 54428, "pid": 76337, "tid": -914061504, "ts": 1716454218271149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218337399, "dur": 22, "args": { "External id": 54432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54432, "pid": 5, "tid": 7, "ts": 1716454218337399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271163, "dur": 14, "args": { "External id": 54432, "cbid": 211, "correlation": 54432 } }, { "ph": "s", "id": 54432, "pid": 76337, "tid": -914061504, "ts": 1716454218271163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218337423, "dur": 104, "args": { "External id": 54434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54434, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54434, "pid": 5, "tid": 7, "ts": 1716454218337423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271184, "dur": 7, "args": { "External id": 54434, "cbid": 211, "correlation": 54434 } }, { "ph": "s", "id": 54434, "pid": 76337, "tid": -914061504, "ts": 1716454218271184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218337528, "dur": 22, "args": { "External id": 54436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54436, "pid": 5, "tid": 7, "ts": 1716454218337528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271195, "dur": 5, "args": { "External id": 54436, "cbid": 211, "correlation": 54436 } }, { "ph": "s", "id": 54436, "pid": 76337, "tid": -914061504, "ts": 1716454218271195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218337551, "dur": 33, "args": { "External id": 54442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54442, "pid": 5, "tid": 7, "ts": 1716454218337551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271223, "dur": 8, "args": { "External id": 54442, "cbid": 211, "correlation": 54442 } }, { "ph": "s", "id": 54442, "pid": 76337, "tid": -914061504, "ts": 1716454218271223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218337585, "dur": 178, "args": { "External id": 54451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54451, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54451, "pid": 5, "tid": 7, "ts": 1716454218337585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271305, "dur": 14, "args": { "External id": 54451, "cbid": 211, "correlation": 54451 } }, { "ph": "s", "id": 54451, "pid": 76337, "tid": -914061504, "ts": 1716454218271305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218337764, "dur": 64, "args": { "External id": 54473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54473, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54473, "pid": 5, "tid": 7, "ts": 1716454218337764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271361, "dur": 10, "args": { "External id": 54473, "cbid": 211, "correlation": 54473 } }, { "ph": "s", "id": 54473, "pid": 76337, "tid": -914061504, "ts": 1716454218271361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218271452, "dur": 1, "args": { "External id": 54484, "cbid": 251, "correlation": 54484 } }, { "ph": "f", "id": 54484, "pid": 76337, "tid": -914061504, "ts": 1716454218271452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218337830, "dur": 153, "args": { "External id": 54485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54485, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54485, "pid": 5, "tid": 7, "ts": 1716454218337830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271457, "dur": 13, "args": { "External id": 54485, "cbid": 211, "correlation": 54485 } }, { "ph": "s", "id": 54485, "pid": 76337, "tid": -914061504, "ts": 1716454218271457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218271527, "dur": 1, "args": { "External id": 54496, "cbid": 251, "correlation": 54496 } }, { "ph": "f", "id": 54496, "pid": 76337, "tid": -914061504, "ts": 1716454218271527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218337984, "dur": 146, "args": { "External id": 54497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54497, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54497, "pid": 5, "tid": 7, "ts": 1716454218337984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271531, "dur": 11, "args": { "External id": 54497, "cbid": 211, "correlation": 54497 } }, { "ph": "s", "id": 54497, "pid": 76337, "tid": -914061504, "ts": 1716454218271531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218271600, "dur": 1, "args": { "External id": 54508, "cbid": 251, "correlation": 54508 } }, { "ph": "f", "id": 54508, "pid": 76337, "tid": -914061504, "ts": 1716454218271600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218338132, "dur": 144, "args": { "External id": 54509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54509, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54509, "pid": 5, "tid": 7, "ts": 1716454218338132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271604, "dur": 12, "args": { "External id": 54509, "cbid": 211, "correlation": 54509 } }, { "ph": "s", "id": 54509, "pid": 76337, "tid": -914061504, "ts": 1716454218271604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218338277, "dur": 1921, "args": { "External id": 54530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54530, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 54530, "pid": 5, "tid": 7, "ts": 1716454218338277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271688, "dur": 13, "args": { "External id": 54530, "cbid": 211, "correlation": 54530 } }, { "ph": "s", "id": 54530, "pid": 76337, "tid": -914061504, "ts": 1716454218271688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218271787, "dur": 1, "args": { "External id": 54548, "cbid": 251, "correlation": 54548 } }, { "ph": "f", "id": 54548, "pid": 76337, "tid": -914061504, "ts": 1716454218271787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218340200, "dur": 148, "args": { "External id": 54550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54550, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 54550, "pid": 5, "tid": 7, "ts": 1716454218340200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271792, "dur": 13, "args": { "External id": 54550, "cbid": 211, "correlation": 54550 } }, { "ph": "s", "id": 54550, "pid": 76337, "tid": -914061504, "ts": 1716454218271792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218340349, "dur": 35, "args": { "External id": 54558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54558, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54558, "pid": 5, "tid": 7, "ts": 1716454218340349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271863, "dur": 12, "args": { "External id": 54558, "cbid": 211, "correlation": 54558 } }, { "ph": "s", "id": 54558, "pid": 76337, "tid": -914061504, "ts": 1716454218271863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218340385, "dur": 50, "args": { "External id": 54566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54566, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54566, "pid": 5, "tid": 7, "ts": 1716454218340385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271903, "dur": 8, "args": { "External id": 54566, "cbid": 211, "correlation": 54566 } }, { "ph": "s", "id": 54566, "pid": 76337, "tid": -914061504, "ts": 1716454218271903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218340437, "dur": 30, "args": { "External id": 54577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54577, "pid": 5, "tid": 7, "ts": 1716454218340437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218271982, "dur": 14, "args": { "External id": 54577, "cbid": 211, "correlation": 54577 } }, { "ph": "s", "id": 54577, "pid": 76337, "tid": -914061504, "ts": 1716454218271982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218340468, "dur": 34, "args": { "External id": 54599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54599, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54599, "pid": 5, "tid": 7, "ts": 1716454218340468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272019, "dur": 8, "args": { "External id": 54599, "cbid": 211, "correlation": 54599 } }, { "ph": "s", "id": 54599, "pid": 76337, "tid": -914061504, "ts": 1716454218272019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272106, "dur": 1, "args": { "External id": 54610, "cbid": 251, "correlation": 54610 } }, { "ph": "f", "id": 54610, "pid": 76337, "tid": -914061504, "ts": 1716454218272106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218340503, "dur": 90, "args": { "External id": 54611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54611, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54611, "pid": 5, "tid": 7, "ts": 1716454218340503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272111, "dur": 12, "args": { "External id": 54611, "cbid": 211, "correlation": 54611 } }, { "ph": "s", "id": 54611, "pid": 76337, "tid": -914061504, "ts": 1716454218272111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272179, "dur": 1, "args": { "External id": 54622, "cbid": 251, "correlation": 54622 } }, { "ph": "f", "id": 54622, "pid": 76337, "tid": -914061504, "ts": 1716454218272179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272183, "dur": 0, "args": { "External id": 54623, "cbid": 251, "correlation": 54623 } }, { "ph": "f", "id": 54623, "pid": 76337, "tid": -914061504, "ts": 1716454218272183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218340595, "dur": 11, "args": { "External id": 54624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54624, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 54624, "pid": 5, "tid": 7, "ts": 1716454218340595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272185, "dur": 12, "args": { "External id": 54624, "cbid": 211, "correlation": 54624 } }, { "ph": "s", "id": 54624, "pid": 76337, "tid": -914061504, "ts": 1716454218272185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218340607, "dur": 5, "args": { "External id": 54626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54626, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 54626, "pid": 5, "tid": 7, "ts": 1716454218340607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272199, "dur": 6, "args": { "External id": 54626, "cbid": 211, "correlation": 54626 } }, { "ph": "s", "id": 54626, "pid": 76337, "tid": -914061504, "ts": 1716454218272199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272256, "dur": 1, "args": { "External id": 54637, "cbid": 251, "correlation": 54637 } }, { "ph": "f", "id": 54637, "pid": 76337, "tid": -914061504, "ts": 1716454218272256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272259, "dur": 0, "args": { "External id": 54638, "cbid": 251, "correlation": 54638 } }, { "ph": "f", "id": 54638, "pid": 76337, "tid": -914061504, "ts": 1716454218272259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218340614, "dur": 7, "args": { "External id": 54639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54639, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 54639, "pid": 5, "tid": 7, "ts": 1716454218340614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272261, "dur": 11, "args": { "External id": 54639, "cbid": 211, "correlation": 54639 } }, { "ph": "s", "id": 54639, "pid": 76337, "tid": -914061504, "ts": 1716454218272261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218340623, "dur": 4, "args": { "External id": 54641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54641, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 54641, "pid": 5, "tid": 7, "ts": 1716454218340623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272273, "dur": 5, "args": { "External id": 54641, "cbid": 211, "correlation": 54641 } }, { "ph": "s", "id": 54641, "pid": 76337, "tid": -914061504, "ts": 1716454218272273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218340628, "dur": 89, "args": { "External id": 54662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54662, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 54662, "pid": 5, "tid": 7, "ts": 1716454218340628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272349, "dur": 14, "args": { "External id": 54662, "cbid": 211, "correlation": 54662 } }, { "ph": "s", "id": 54662, "pid": 76337, "tid": -914061504, "ts": 1716454218272349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272447, "dur": 1, "args": { "External id": 54680, "cbid": 251, "correlation": 54680 } }, { "ph": "f", "id": 54680, "pid": 76337, "tid": -914061504, "ts": 1716454218272447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218340718, "dur": 97, "args": { "External id": 54682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54682, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54682, "pid": 5, "tid": 7, "ts": 1716454218340718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272453, "dur": 14, "args": { "External id": 54682, "cbid": 211, "correlation": 54682 } }, { "ph": "s", "id": 54682, "pid": 76337, "tid": -914061504, "ts": 1716454218272453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218340816, "dur": 19, "args": { "External id": 54690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54690, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54690, "pid": 5, "tid": 7, "ts": 1716454218340816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272521, "dur": 12, "args": { "External id": 54690, "cbid": 211, "correlation": 54690 } }, { "ph": "s", "id": 54690, "pid": 76337, "tid": -914061504, "ts": 1716454218272521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218340837, "dur": 37, "args": { "External id": 54698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54698, "pid": 5, "tid": 7, "ts": 1716454218340837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272562, "dur": 10, "args": { "External id": 54698, "cbid": 211, "correlation": 54698 } }, { "ph": "s", "id": 54698, "pid": 76337, "tid": -914061504, "ts": 1716454218272562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218340875, "dur": 34, "args": { "External id": 54720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54720, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54720, "pid": 5, "tid": 7, "ts": 1716454218340875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272613, "dur": 10, "args": { "External id": 54720, "cbid": 211, "correlation": 54720 } }, { "ph": "s", "id": 54720, "pid": 76337, "tid": -914061504, "ts": 1716454218272613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272702, "dur": 4, "args": { "External id": 54736, "cbid": 251, "correlation": 54736 } }, { "ph": "f", "id": 54736, "pid": 76337, "tid": -914061504, "ts": 1716454218272702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272710, "dur": 0, "args": { "External id": 54738, "cbid": 251, "correlation": 54738 } }, { "ph": "f", "id": 54738, "pid": 76337, "tid": -914061504, "ts": 1716454218272710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218340911, "dur": 536, "args": { "External id": 54739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54739, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 54739, "pid": 5, "tid": 7, "ts": 1716454218340911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272714, "dur": 14, "args": { "External id": 54739, "cbid": 211, "correlation": 54739 } }, { "ph": "s", "id": 54739, "pid": 76337, "tid": -914061504, "ts": 1716454218272714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218341448, "dur": 124, "args": { "External id": 54747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54747, "pid": 5, "tid": 7, "ts": 1716454218341448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272781, "dur": 12, "args": { "External id": 54747, "cbid": 211, "correlation": 54747 } }, { "ph": "s", "id": 54747, "pid": 76337, "tid": -914061504, "ts": 1716454218272781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218341574, "dur": 129, "args": { "External id": 54755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54755, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54755, "pid": 5, "tid": 7, "ts": 1716454218341574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272812, "dur": 8, "args": { "External id": 54755, "cbid": 211, "correlation": 54755 } }, { "ph": "s", "id": 54755, "pid": 76337, "tid": -914061504, "ts": 1716454218272812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218272890, "dur": 1, "args": { "External id": 54771, "cbid": 251, "correlation": 54771 } }, { "ph": "f", "id": 54771, "pid": 76337, "tid": -914061504, "ts": 1716454218272890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218341704, "dur": 303, "args": { "External id": 54773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54773, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54773, "pid": 5, "tid": 7, "ts": 1716454218341704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272895, "dur": 12, "args": { "External id": 54773, "cbid": 211, "correlation": 54773 } }, { "ph": "s", "id": 54773, "pid": 76337, "tid": -914061504, "ts": 1716454218272895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218342008, "dur": 27, "args": { "External id": 54781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54781, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54781, "pid": 5, "tid": 7, "ts": 1716454218342008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218272938, "dur": 10, "args": { "External id": 54781, "cbid": 211, "correlation": 54781 } }, { "ph": "s", "id": 54781, "pid": 76337, "tid": -914061504, "ts": 1716454218272938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218342037, "dur": 81, "args": { "External id": 54792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54792, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54792, "pid": 5, "tid": 7, "ts": 1716454218342037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273013, "dur": 13, "args": { "External id": 54792, "cbid": 211, "correlation": 54792 } }, { "ph": "s", "id": 54792, "pid": 76337, "tid": -914061504, "ts": 1716454218273013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218273077, "dur": 0, "args": { "External id": 54804, "cbid": 317, "correlation": 54804 } }, { "ph": "f", "id": 54804, "pid": 76337, "tid": -914061504, "ts": 1716454218273077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218273078, "dur": 0, "args": { "External id": 54805, "cbid": 203, "correlation": 54805 } }, { "ph": "f", "id": 54805, "pid": 76337, "tid": -914061504, "ts": 1716454218273078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218273079, "dur": 0, "args": { "External id": 54806, "cbid": 205, "correlation": 54806 } }, { "ph": "f", "id": 54806, "pid": 76337, "tid": -914061504, "ts": 1716454218273079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218342119, "dur": 23, "args": { "External id": 54810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54810, "pid": 5, "tid": 7, "ts": 1716454218342119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273094, "dur": 15, "args": { "External id": 54810, "cbid": 211, "correlation": 54810 } }, { "ph": "s", "id": 54810, "pid": 76337, "tid": -914061504, "ts": 1716454218273094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218342143, "dur": 119, "args": { "External id": 54812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54812, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54812, "pid": 5, "tid": 7, "ts": 1716454218342143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273115, "dur": 7, "args": { "External id": 54812, "cbid": 211, "correlation": 54812 } }, { "ph": "s", "id": 54812, "pid": 76337, "tid": -914061504, "ts": 1716454218273115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218342263, "dur": 24, "args": { "External id": 54814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54814, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54814, "pid": 5, "tid": 7, "ts": 1716454218342263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273125, "dur": 5, "args": { "External id": 54814, "cbid": 211, "correlation": 54814 } }, { "ph": "s", "id": 54814, "pid": 76337, "tid": -914061504, "ts": 1716454218273125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218342289, "dur": 33, "args": { "External id": 54820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54820, "pid": 5, "tid": 7, "ts": 1716454218342289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273154, "dur": 8, "args": { "External id": 54820, "cbid": 211, "correlation": 54820 } }, { "ph": "s", "id": 54820, "pid": 76337, "tid": -914061504, "ts": 1716454218273154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218342323, "dur": 26, "args": { "External id": 54828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54828, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54828, "pid": 5, "tid": 7, "ts": 1716454218342323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273185, "dur": 9, "args": { "External id": 54828, "cbid": 211, "correlation": 54828 } }, { "ph": "s", "id": 54828, "pid": 76337, "tid": -914061504, "ts": 1716454218273185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218342351, "dur": 53, "args": { "External id": 54837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54837, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54837, "pid": 5, "tid": 7, "ts": 1716454218342351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273223, "dur": 10, "args": { "External id": 54837, "cbid": 211, "correlation": 54837 } }, { "ph": "s", "id": 54837, "pid": 76337, "tid": -914061504, "ts": 1716454218273223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218342405, "dur": 53, "args": { "External id": 54857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54857, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 54857, "pid": 5, "tid": 7, "ts": 1716454218342405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273294, "dur": 12, "args": { "External id": 54857, "cbid": 211, "correlation": 54857 } }, { "ph": "s", "id": 54857, "pid": 76337, "tid": -914061504, "ts": 1716454218273294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218342459, "dur": 5, "args": { "External id": 54869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54869, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 54869, "pid": 5, "tid": 7, "ts": 1716454218342459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273316, "dur": 6, "args": { "External id": 54869, "cbid": 211, "correlation": 54869 } }, { "ph": "s", "id": 54869, "pid": 76337, "tid": -914061504, "ts": 1716454218273316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218342465, "dur": 57, "args": { "External id": 54872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54872, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54872, "pid": 5, "tid": 7, "ts": 1716454218342465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273334, "dur": 7, "args": { "External id": 54872, "cbid": 211, "correlation": 54872 } }, { "ph": "s", "id": 54872, "pid": 76337, "tid": -914061504, "ts": 1716454218273334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218342523, "dur": 37, "args": { "External id": 54881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54881, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54881, "pid": 5, "tid": 7, "ts": 1716454218342523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273377, "dur": 10, "args": { "External id": 54881, "cbid": 211, "correlation": 54881 } }, { "ph": "s", "id": 54881, "pid": 76337, "tid": -914061504, "ts": 1716454218273377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218273431, "dur": 0, "args": { "External id": 54891, "cbid": 317, "correlation": 54891 } }, { "ph": "f", "id": 54891, "pid": 76337, "tid": -914061504, "ts": 1716454218273431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218273432, "dur": 0, "args": { "External id": 54892, "cbid": 203, "correlation": 54892 } }, { "ph": "f", "id": 54892, "pid": 76337, "tid": -914061504, "ts": 1716454218273432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218273432, "dur": 0, "args": { "External id": 54893, "cbid": 205, "correlation": 54893 } }, { "ph": "f", "id": 54893, "pid": 76337, "tid": -914061504, "ts": 1716454218273432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218342561, "dur": 39, "args": { "External id": 54897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54897, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54897, "pid": 5, "tid": 7, "ts": 1716454218342561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273447, "dur": 12, "args": { "External id": 54897, "cbid": 211, "correlation": 54897 } }, { "ph": "s", "id": 54897, "pid": 76337, "tid": -914061504, "ts": 1716454218273447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218342602, "dur": 83, "args": { "External id": 54899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54899, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54899, "pid": 5, "tid": 7, "ts": 1716454218342602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273462, "dur": 5, "args": { "External id": 54899, "cbid": 211, "correlation": 54899 } }, { "ph": "s", "id": 54899, "pid": 76337, "tid": -914061504, "ts": 1716454218273462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218342686, "dur": 1270, "args": { "External id": 54901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54901, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 54901, "pid": 5, "tid": 7, "ts": 1716454218342686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273473, "dur": 6, "args": { "External id": 54901, "cbid": 211, "correlation": 54901 } }, { "ph": "s", "id": 54901, "pid": 76337, "tid": -914061504, "ts": 1716454218273473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218343957, "dur": 21, "args": { "External id": 54903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54903, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54903, "pid": 5, "tid": 7, "ts": 1716454218343957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273483, "dur": 5, "args": { "External id": 54903, "cbid": 211, "correlation": 54903 } }, { "ph": "s", "id": 54903, "pid": 76337, "tid": -914061504, "ts": 1716454218273483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218343980, "dur": 33, "args": { "External id": 54909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54909, "pid": 5, "tid": 7, "ts": 1716454218343980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273511, "dur": 8, "args": { "External id": 54909, "cbid": 211, "correlation": 54909 } }, { "ph": "s", "id": 54909, "pid": 76337, "tid": -914061504, "ts": 1716454218273511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218344014, "dur": 3, "args": { "External id": 54917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54917, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 54917, "pid": 5, "tid": 7, "ts": 1716454218344014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273553, "dur": 10, "args": { "External id": 54917, "cbid": 211, "correlation": 54917 } }, { "ph": "s", "id": 54917, "pid": 76337, "tid": -914061504, "ts": 1716454218273553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218273619, "dur": 1, "args": { "External id": 54933, "cbid": 251, "correlation": 54933 } }, { "ph": "f", "id": 54933, "pid": 76337, "tid": -914061504, "ts": 1716454218273619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218273624, "dur": 0, "args": { "External id": 54935, "cbid": 251, "correlation": 54935 } }, { "ph": "f", "id": 54935, "pid": 76337, "tid": -914061504, "ts": 1716454218273624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218344019, "dur": 12, "args": { "External id": 54936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54936, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 54936, "pid": 5, "tid": 7, "ts": 1716454218344019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273626, "dur": 11, "args": { "External id": 54936, "cbid": 211, "correlation": 54936 } }, { "ph": "s", "id": 54936, "pid": 76337, "tid": -914061504, "ts": 1716454218273626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218344032, "dur": 5, "args": { "External id": 54938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54938, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 54938, "pid": 5, "tid": 7, "ts": 1716454218344032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273639, "dur": 8, "args": { "External id": 54938, "cbid": 211, "correlation": 54938 } }, { "ph": "s", "id": 54938, "pid": 76337, "tid": -914061504, "ts": 1716454218273639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218344038, "dur": 28, "args": { "External id": 54948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54948, "pid": 5, "tid": 7, "ts": 1716454218344038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273700, "dur": 12, "args": { "External id": 54948, "cbid": 211, "correlation": 54948 } }, { "ph": "s", "id": 54948, "pid": 76337, "tid": -914061504, "ts": 1716454218273700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218344068, "dur": 30, "args": { "External id": 54968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54968, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 54968, "pid": 5, "tid": 7, "ts": 1716454218344068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273766, "dur": 11, "args": { "External id": 54968, "cbid": 211, "correlation": 54968 } }, { "ph": "s", "id": 54968, "pid": 76337, "tid": -914061504, "ts": 1716454218273766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218344099, "dur": 4, "args": { "External id": 54980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54980, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 54980, "pid": 5, "tid": 7, "ts": 1716454218344099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273786, "dur": 6, "args": { "External id": 54980, "cbid": 211, "correlation": 54980 } }, { "ph": "s", "id": 54980, "pid": 76337, "tid": -914061504, "ts": 1716454218273786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218344104, "dur": 30, "args": { "External id": 54983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54983, "pid": 5, "tid": 7, "ts": 1716454218344104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273805, "dur": 6, "args": { "External id": 54983, "cbid": 211, "correlation": 54983 } }, { "ph": "s", "id": 54983, "pid": 76337, "tid": -914061504, "ts": 1716454218273805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218344135, "dur": 20, "args": { "External id": 54992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 54992, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 54992, "pid": 5, "tid": 7, "ts": 1716454218344135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273846, "dur": 10, "args": { "External id": 54992, "cbid": 211, "correlation": 54992 } }, { "ph": "s", "id": 54992, "pid": 76337, "tid": -914061504, "ts": 1716454218273846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218273908, "dur": 0, "args": { "External id": 55002, "cbid": 317, "correlation": 55002 } }, { "ph": "f", "id": 55002, "pid": 76337, "tid": -914061504, "ts": 1716454218273908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218273909, "dur": 0, "args": { "External id": 55003, "cbid": 203, "correlation": 55003 } }, { "ph": "f", "id": 55003, "pid": 76337, "tid": -914061504, "ts": 1716454218273909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218273910, "dur": 0, "args": { "External id": 55004, "cbid": 205, "correlation": 55004 } }, { "ph": "f", "id": 55004, "pid": 76337, "tid": -914061504, "ts": 1716454218273910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218344156, "dur": 23, "args": { "External id": 55008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55008, "pid": 5, "tid": 7, "ts": 1716454218344156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273924, "dur": 12, "args": { "External id": 55008, "cbid": 211, "correlation": 55008 } }, { "ph": "s", "id": 55008, "pid": 76337, "tid": -914061504, "ts": 1716454218273924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218344180, "dur": 44, "args": { "External id": 55010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55010, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55010, "pid": 5, "tid": 7, "ts": 1716454218344180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273939, "dur": 8, "args": { "External id": 55010, "cbid": 211, "correlation": 55010 } }, { "ph": "s", "id": 55010, "pid": 76337, "tid": -914061504, "ts": 1716454218273939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218344226, "dur": 643, "args": { "External id": 55012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55012, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55012, "pid": 5, "tid": 7, "ts": 1716454218344226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273953, "dur": 6, "args": { "External id": 55012, "cbid": 211, "correlation": 55012 } }, { "ph": "s", "id": 55012, "pid": 76337, "tid": -914061504, "ts": 1716454218273953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218344870, "dur": 22, "args": { "External id": 55014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55014, "pid": 5, "tid": 7, "ts": 1716454218344870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273963, "dur": 5, "args": { "External id": 55014, "cbid": 211, "correlation": 55014 } }, { "ph": "s", "id": 55014, "pid": 76337, "tid": -914061504, "ts": 1716454218273963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218344893, "dur": 33, "args": { "External id": 55020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55020, "pid": 5, "tid": 7, "ts": 1716454218344893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218273999, "dur": 9, "args": { "External id": 55020, "cbid": 211, "correlation": 55020 } }, { "ph": "s", "id": 55020, "pid": 76337, "tid": -914061504, "ts": 1716454218273999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218274059, "dur": 0, "args": { "External id": 55030, "cbid": 317, "correlation": 55030 } }, { "ph": "f", "id": 55030, "pid": 76337, "tid": -914061504, "ts": 1716454218274059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218274060, "dur": 0, "args": { "External id": 55031, "cbid": 203, "correlation": 55031 } }, { "ph": "f", "id": 55031, "pid": 76337, "tid": -914061504, "ts": 1716454218274060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218274061, "dur": 0, "args": { "External id": 55032, "cbid": 205, "correlation": 55032 } }, { "ph": "f", "id": 55032, "pid": 76337, "tid": -914061504, "ts": 1716454218274061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218344927, "dur": 38, "args": { "External id": 55036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55036, "pid": 5, "tid": 7, "ts": 1716454218344927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274073, "dur": 12, "args": { "External id": 55036, "cbid": 211, "correlation": 55036 } }, { "ph": "s", "id": 55036, "pid": 76337, "tid": -914061504, "ts": 1716454218274073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218344967, "dur": 189, "args": { "External id": 55038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55038, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55038, "pid": 5, "tid": 7, "ts": 1716454218344967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274092, "dur": 6, "args": { "External id": 55038, "cbid": 211, "correlation": 55038 } }, { "ph": "s", "id": 55038, "pid": 76337, "tid": -914061504, "ts": 1716454218274092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218345157, "dur": 21, "args": { "External id": 55040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55040, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55040, "pid": 5, "tid": 7, "ts": 1716454218345157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274101, "dur": 5, "args": { "External id": 55040, "cbid": 211, "correlation": 55040 } }, { "ph": "s", "id": 55040, "pid": 76337, "tid": -914061504, "ts": 1716454218274101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218345180, "dur": 32, "args": { "External id": 55046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55046, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55046, "pid": 5, "tid": 7, "ts": 1716454218345180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274126, "dur": 9, "args": { "External id": 55046, "cbid": 211, "correlation": 55046 } }, { "ph": "s", "id": 55046, "pid": 76337, "tid": -914061504, "ts": 1716454218274126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218345213, "dur": 27, "args": { "External id": 55054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55054, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55054, "pid": 5, "tid": 7, "ts": 1716454218345213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274155, "dur": 8, "args": { "External id": 55054, "cbid": 211, "correlation": 55054 } }, { "ph": "s", "id": 55054, "pid": 76337, "tid": -914061504, "ts": 1716454218274155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218345242, "dur": 20, "args": { "External id": 55062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55062, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55062, "pid": 5, "tid": 7, "ts": 1716454218345242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274184, "dur": 8, "args": { "External id": 55062, "cbid": 211, "correlation": 55062 } }, { "ph": "s", "id": 55062, "pid": 76337, "tid": -914061504, "ts": 1716454218274184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218345263, "dur": 30, "args": { "External id": 55082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55082, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 55082, "pid": 5, "tid": 7, "ts": 1716454218345263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274268, "dur": 13, "args": { "External id": 55082, "cbid": 211, "correlation": 55082 } }, { "ph": "s", "id": 55082, "pid": 76337, "tid": -914061504, "ts": 1716454218274268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218345294, "dur": 4, "args": { "External id": 55094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55094, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 55094, "pid": 5, "tid": 7, "ts": 1716454218345294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274292, "dur": 6, "args": { "External id": 55094, "cbid": 211, "correlation": 55094 } }, { "ph": "s", "id": 55094, "pid": 76337, "tid": -914061504, "ts": 1716454218274292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218345299, "dur": 30, "args": { "External id": 55097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55097, "pid": 5, "tid": 7, "ts": 1716454218345299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274310, "dur": 6, "args": { "External id": 55097, "cbid": 211, "correlation": 55097 } }, { "ph": "s", "id": 55097, "pid": 76337, "tid": -914061504, "ts": 1716454218274310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218274367, "dur": 0, "args": { "External id": 55108, "cbid": 317, "correlation": 55108 } }, { "ph": "f", "id": 55108, "pid": 76337, "tid": -914061504, "ts": 1716454218274367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218274368, "dur": 0, "args": { "External id": 55109, "cbid": 203, "correlation": 55109 } }, { "ph": "f", "id": 55109, "pid": 76337, "tid": -914061504, "ts": 1716454218274368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218274369, "dur": 0, "args": { "External id": 55110, "cbid": 205, "correlation": 55110 } }, { "ph": "f", "id": 55110, "pid": 76337, "tid": -914061504, "ts": 1716454218274369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218345331, "dur": 22, "args": { "External id": 55114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55114, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55114, "pid": 5, "tid": 7, "ts": 1716454218345331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274381, "dur": 12, "args": { "External id": 55114, "cbid": 211, "correlation": 55114 } }, { "ph": "s", "id": 55114, "pid": 76337, "tid": -914061504, "ts": 1716454218274381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218345354, "dur": 104, "args": { "External id": 55116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55116, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55116, "pid": 5, "tid": 7, "ts": 1716454218345354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274400, "dur": 6, "args": { "External id": 55116, "cbid": 211, "correlation": 55116 } }, { "ph": "s", "id": 55116, "pid": 76337, "tid": -914061504, "ts": 1716454218274400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218345459, "dur": 22, "args": { "External id": 55118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55118, "pid": 5, "tid": 7, "ts": 1716454218345459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274409, "dur": 5, "args": { "External id": 55118, "cbid": 211, "correlation": 55118 } }, { "ph": "s", "id": 55118, "pid": 76337, "tid": -914061504, "ts": 1716454218274409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218345482, "dur": 32, "args": { "External id": 55124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55124, "pid": 5, "tid": 7, "ts": 1716454218345482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274436, "dur": 9, "args": { "External id": 55124, "cbid": 211, "correlation": 55124 } }, { "ph": "s", "id": 55124, "pid": 76337, "tid": -914061504, "ts": 1716454218274436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218345516, "dur": 184, "args": { "External id": 55133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55133, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55133, "pid": 5, "tid": 7, "ts": 1716454218345516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274517, "dur": 14, "args": { "External id": 55133, "cbid": 211, "correlation": 55133 } }, { "ph": "s", "id": 55133, "pid": 76337, "tid": -914061504, "ts": 1716454218274517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218345702, "dur": 64, "args": { "External id": 55155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55155, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55155, "pid": 5, "tid": 7, "ts": 1716454218345702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274577, "dur": 11, "args": { "External id": 55155, "cbid": 211, "correlation": 55155 } }, { "ph": "s", "id": 55155, "pid": 76337, "tid": -914061504, "ts": 1716454218274577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218274666, "dur": 1, "args": { "External id": 55166, "cbid": 251, "correlation": 55166 } }, { "ph": "f", "id": 55166, "pid": 76337, "tid": -914061504, "ts": 1716454218274666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218345767, "dur": 151, "args": { "External id": 55167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55167, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55167, "pid": 5, "tid": 7, "ts": 1716454218345767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274672, "dur": 13, "args": { "External id": 55167, "cbid": 211, "correlation": 55167 } }, { "ph": "s", "id": 55167, "pid": 76337, "tid": -914061504, "ts": 1716454218274672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218274741, "dur": 1, "args": { "External id": 55178, "cbid": 251, "correlation": 55178 } }, { "ph": "f", "id": 55178, "pid": 76337, "tid": -914061504, "ts": 1716454218274741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218345920, "dur": 146, "args": { "External id": 55179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55179, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55179, "pid": 5, "tid": 7, "ts": 1716454218345920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274745, "dur": 11, "args": { "External id": 55179, "cbid": 211, "correlation": 55179 } }, { "ph": "s", "id": 55179, "pid": 76337, "tid": -914061504, "ts": 1716454218274745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218274810, "dur": 1, "args": { "External id": 55190, "cbid": 251, "correlation": 55190 } }, { "ph": "f", "id": 55190, "pid": 76337, "tid": -914061504, "ts": 1716454218274810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218346067, "dur": 144, "args": { "External id": 55191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55191, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55191, "pid": 5, "tid": 7, "ts": 1716454218346067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274815, "dur": 11, "args": { "External id": 55191, "cbid": 211, "correlation": 55191 } }, { "ph": "s", "id": 55191, "pid": 76337, "tid": -914061504, "ts": 1716454218274815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218346212, "dur": 1923, "args": { "External id": 55212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55212, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 55212, "pid": 5, "tid": 7, "ts": 1716454218346212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218274893, "dur": 13, "args": { "External id": 55212, "cbid": 211, "correlation": 55212 } }, { "ph": "s", "id": 55212, "pid": 76337, "tid": -914061504, "ts": 1716454218274893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275004, "dur": 1, "args": { "External id": 55230, "cbid": 251, "correlation": 55230 } }, { "ph": "f", "id": 55230, "pid": 76337, "tid": -914061504, "ts": 1716454218275004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218348137, "dur": 148, "args": { "External id": 55232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55232, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 55232, "pid": 5, "tid": 7, "ts": 1716454218348137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275009, "dur": 15, "args": { "External id": 55232, "cbid": 211, "correlation": 55232 } }, { "ph": "s", "id": 55232, "pid": 76337, "tid": -914061504, "ts": 1716454218275009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218348286, "dur": 36, "args": { "External id": 55240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55240, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55240, "pid": 5, "tid": 7, "ts": 1716454218348286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275082, "dur": 12, "args": { "External id": 55240, "cbid": 211, "correlation": 55240 } }, { "ph": "s", "id": 55240, "pid": 76337, "tid": -914061504, "ts": 1716454218275082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218348323, "dur": 51, "args": { "External id": 55248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55248, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55248, "pid": 5, "tid": 7, "ts": 1716454218348323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275120, "dur": 9, "args": { "External id": 55248, "cbid": 211, "correlation": 55248 } }, { "ph": "s", "id": 55248, "pid": 76337, "tid": -914061504, "ts": 1716454218275120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218348375, "dur": 30, "args": { "External id": 55259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55259, "pid": 5, "tid": 7, "ts": 1716454218348375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275191, "dur": 12, "args": { "External id": 55259, "cbid": 211, "correlation": 55259 } }, { "ph": "s", "id": 55259, "pid": 76337, "tid": -914061504, "ts": 1716454218275191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218348406, "dur": 34, "args": { "External id": 55281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55281, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55281, "pid": 5, "tid": 7, "ts": 1716454218348406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275222, "dur": 8, "args": { "External id": 55281, "cbid": 211, "correlation": 55281 } }, { "ph": "s", "id": 55281, "pid": 76337, "tid": -914061504, "ts": 1716454218275222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275307, "dur": 1, "args": { "External id": 55292, "cbid": 251, "correlation": 55292 } }, { "ph": "f", "id": 55292, "pid": 76337, "tid": -914061504, "ts": 1716454218275307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218348442, "dur": 90, "args": { "External id": 55293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55293, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55293, "pid": 5, "tid": 7, "ts": 1716454218348442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275313, "dur": 12, "args": { "External id": 55293, "cbid": 211, "correlation": 55293 } }, { "ph": "s", "id": 55293, "pid": 76337, "tid": -914061504, "ts": 1716454218275313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275385, "dur": 1, "args": { "External id": 55304, "cbid": 251, "correlation": 55304 } }, { "ph": "f", "id": 55304, "pid": 76337, "tid": -914061504, "ts": 1716454218275385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275389, "dur": 0, "args": { "External id": 55305, "cbid": 251, "correlation": 55305 } }, { "ph": "f", "id": 55305, "pid": 76337, "tid": -914061504, "ts": 1716454218275389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218348533, "dur": 11, "args": { "External id": 55306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55306, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 55306, "pid": 5, "tid": 7, "ts": 1716454218348533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275390, "dur": 12, "args": { "External id": 55306, "cbid": 211, "correlation": 55306 } }, { "ph": "s", "id": 55306, "pid": 76337, "tid": -914061504, "ts": 1716454218275390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218348545, "dur": 5, "args": { "External id": 55308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55308, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 55308, "pid": 5, "tid": 7, "ts": 1716454218348545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275404, "dur": 6, "args": { "External id": 55308, "cbid": 211, "correlation": 55308 } }, { "ph": "s", "id": 55308, "pid": 76337, "tid": -914061504, "ts": 1716454218275404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275461, "dur": 1, "args": { "External id": 55319, "cbid": 251, "correlation": 55319 } }, { "ph": "f", "id": 55319, "pid": 76337, "tid": -914061504, "ts": 1716454218275461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275464, "dur": 0, "args": { "External id": 55320, "cbid": 251, "correlation": 55320 } }, { "ph": "f", "id": 55320, "pid": 76337, "tid": -914061504, "ts": 1716454218275464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218348552, "dur": 7, "args": { "External id": 55321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55321, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 55321, "pid": 5, "tid": 7, "ts": 1716454218348552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275466, "dur": 11, "args": { "External id": 55321, "cbid": 211, "correlation": 55321 } }, { "ph": "s", "id": 55321, "pid": 76337, "tid": -914061504, "ts": 1716454218275466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218348560, "dur": 4, "args": { "External id": 55323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55323, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 55323, "pid": 5, "tid": 7, "ts": 1716454218348560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275479, "dur": 6, "args": { "External id": 55323, "cbid": 211, "correlation": 55323 } }, { "ph": "s", "id": 55323, "pid": 76337, "tid": -914061504, "ts": 1716454218275479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218348565, "dur": 91, "args": { "External id": 55344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55344, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 55344, "pid": 5, "tid": 7, "ts": 1716454218348565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275553, "dur": 12, "args": { "External id": 55344, "cbid": 211, "correlation": 55344 } }, { "ph": "s", "id": 55344, "pid": 76337, "tid": -914061504, "ts": 1716454218275553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275649, "dur": 1, "args": { "External id": 55362, "cbid": 251, "correlation": 55362 } }, { "ph": "f", "id": 55362, "pid": 76337, "tid": -914061504, "ts": 1716454218275649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218348657, "dur": 100, "args": { "External id": 55364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55364, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55364, "pid": 5, "tid": 7, "ts": 1716454218348657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275655, "dur": 13, "args": { "External id": 55364, "cbid": 211, "correlation": 55364 } }, { "ph": "s", "id": 55364, "pid": 76337, "tid": -914061504, "ts": 1716454218275655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218348759, "dur": 20, "args": { "External id": 55372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55372, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55372, "pid": 5, "tid": 7, "ts": 1716454218348759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275724, "dur": 14, "args": { "External id": 55372, "cbid": 211, "correlation": 55372 } }, { "ph": "s", "id": 55372, "pid": 76337, "tid": -914061504, "ts": 1716454218275724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218348779, "dur": 38, "args": { "External id": 55380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55380, "pid": 5, "tid": 7, "ts": 1716454218348779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275770, "dur": 10, "args": { "External id": 55380, "cbid": 211, "correlation": 55380 } }, { "ph": "s", "id": 55380, "pid": 76337, "tid": -914061504, "ts": 1716454218275770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218348819, "dur": 35, "args": { "External id": 55402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55402, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55402, "pid": 5, "tid": 7, "ts": 1716454218348819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275821, "dur": 10, "args": { "External id": 55402, "cbid": 211, "correlation": 55402 } }, { "ph": "s", "id": 55402, "pid": 76337, "tid": -914061504, "ts": 1716454218275821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275910, "dur": 1, "args": { "External id": 55418, "cbid": 251, "correlation": 55418 } }, { "ph": "f", "id": 55418, "pid": 76337, "tid": -914061504, "ts": 1716454218275910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218275915, "dur": 0, "args": { "External id": 55420, "cbid": 251, "correlation": 55420 } }, { "ph": "f", "id": 55420, "pid": 76337, "tid": -914061504, "ts": 1716454218275915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218348855, "dur": 534, "args": { "External id": 55421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55421, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 55421, "pid": 5, "tid": 7, "ts": 1716454218348855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275918, "dur": 13, "args": { "External id": 55421, "cbid": 211, "correlation": 55421 } }, { "ph": "s", "id": 55421, "pid": 76337, "tid": -914061504, "ts": 1716454218275918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218349390, "dur": 126, "args": { "External id": 55429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55429, "pid": 5, "tid": 7, "ts": 1716454218349390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218275991, "dur": 13, "args": { "External id": 55429, "cbid": 211, "correlation": 55429 } }, { "ph": "s", "id": 55429, "pid": 76337, "tid": -914061504, "ts": 1716454218275991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218349517, "dur": 129, "args": { "External id": 55437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55437, "pid": 5, "tid": 7, "ts": 1716454218349517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276023, "dur": 9, "args": { "External id": 55437, "cbid": 211, "correlation": 55437 } }, { "ph": "s", "id": 55437, "pid": 76337, "tid": -914061504, "ts": 1716454218276023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218276105, "dur": 1, "args": { "External id": 55453, "cbid": 251, "correlation": 55453 } }, { "ph": "f", "id": 55453, "pid": 76337, "tid": -914061504, "ts": 1716454218276105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218349647, "dur": 303, "args": { "External id": 55455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55455, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55455, "pid": 5, "tid": 7, "ts": 1716454218349647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276111, "dur": 12, "args": { "External id": 55455, "cbid": 211, "correlation": 55455 } }, { "ph": "s", "id": 55455, "pid": 76337, "tid": -914061504, "ts": 1716454218276111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218349951, "dur": 27, "args": { "External id": 55463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55463, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55463, "pid": 5, "tid": 7, "ts": 1716454218349951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276153, "dur": 10, "args": { "External id": 55463, "cbid": 211, "correlation": 55463 } }, { "ph": "s", "id": 55463, "pid": 76337, "tid": -914061504, "ts": 1716454218276153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218349979, "dur": 80, "args": { "External id": 55474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55474, "pid": 5, "tid": 7, "ts": 1716454218349979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276221, "dur": 12, "args": { "External id": 55474, "cbid": 211, "correlation": 55474 } }, { "ph": "s", "id": 55474, "pid": 76337, "tid": -914061504, "ts": 1716454218276221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218276286, "dur": 0, "args": { "External id": 55486, "cbid": 317, "correlation": 55486 } }, { "ph": "f", "id": 55486, "pid": 76337, "tid": -914061504, "ts": 1716454218276286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218276287, "dur": 0, "args": { "External id": 55487, "cbid": 203, "correlation": 55487 } }, { "ph": "f", "id": 55487, "pid": 76337, "tid": -914061504, "ts": 1716454218276287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218276288, "dur": 0, "args": { "External id": 55488, "cbid": 205, "correlation": 55488 } }, { "ph": "f", "id": 55488, "pid": 76337, "tid": -914061504, "ts": 1716454218276288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218350061, "dur": 24, "args": { "External id": 55492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55492, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55492, "pid": 5, "tid": 7, "ts": 1716454218350061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276303, "dur": 12, "args": { "External id": 55492, "cbid": 211, "correlation": 55492 } }, { "ph": "s", "id": 55492, "pid": 76337, "tid": -914061504, "ts": 1716454218276303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218350086, "dur": 120, "args": { "External id": 55494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55494, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55494, "pid": 5, "tid": 7, "ts": 1716454218350086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276322, "dur": 7, "args": { "External id": 55494, "cbid": 211, "correlation": 55494 } }, { "ph": "s", "id": 55494, "pid": 76337, "tid": -914061504, "ts": 1716454218276322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218350207, "dur": 23, "args": { "External id": 55496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55496, "pid": 5, "tid": 7, "ts": 1716454218350207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276332, "dur": 5, "args": { "External id": 55496, "cbid": 211, "correlation": 55496 } }, { "ph": "s", "id": 55496, "pid": 76337, "tid": -914061504, "ts": 1716454218276332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218350232, "dur": 32, "args": { "External id": 55502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55502, "pid": 5, "tid": 7, "ts": 1716454218350232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276360, "dur": 8, "args": { "External id": 55502, "cbid": 211, "correlation": 55502 } }, { "ph": "s", "id": 55502, "pid": 76337, "tid": -914061504, "ts": 1716454218276360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218350265, "dur": 27, "args": { "External id": 55510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55510, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55510, "pid": 5, "tid": 7, "ts": 1716454218350265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276391, "dur": 9, "args": { "External id": 55510, "cbid": 211, "correlation": 55510 } }, { "ph": "s", "id": 55510, "pid": 76337, "tid": -914061504, "ts": 1716454218276391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218350293, "dur": 46, "args": { "External id": 55519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55519, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55519, "pid": 5, "tid": 7, "ts": 1716454218350293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276433, "dur": 10, "args": { "External id": 55519, "cbid": 211, "correlation": 55519 } }, { "ph": "s", "id": 55519, "pid": 76337, "tid": -914061504, "ts": 1716454218276433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218350340, "dur": 43, "args": { "External id": 55539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55539, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 55539, "pid": 5, "tid": 7, "ts": 1716454218350340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276505, "dur": 12, "args": { "External id": 55539, "cbid": 211, "correlation": 55539 } }, { "ph": "s", "id": 55539, "pid": 76337, "tid": -914061504, "ts": 1716454218276505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218350385, "dur": 5, "args": { "External id": 55551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55551, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 55551, "pid": 5, "tid": 7, "ts": 1716454218350385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276527, "dur": 6, "args": { "External id": 55551, "cbid": 211, "correlation": 55551 } }, { "ph": "s", "id": 55551, "pid": 76337, "tid": -914061504, "ts": 1716454218276527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218350391, "dur": 43, "args": { "External id": 55554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55554, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55554, "pid": 5, "tid": 7, "ts": 1716454218350391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276546, "dur": 7, "args": { "External id": 55554, "cbid": 211, "correlation": 55554 } }, { "ph": "s", "id": 55554, "pid": 76337, "tid": -914061504, "ts": 1716454218276546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218350435, "dur": 28, "args": { "External id": 55563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55563, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55563, "pid": 5, "tid": 7, "ts": 1716454218350435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276585, "dur": 9, "args": { "External id": 55563, "cbid": 211, "correlation": 55563 } }, { "ph": "s", "id": 55563, "pid": 76337, "tid": -914061504, "ts": 1716454218276585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218276636, "dur": 0, "args": { "External id": 55573, "cbid": 317, "correlation": 55573 } }, { "ph": "f", "id": 55573, "pid": 76337, "tid": -914061504, "ts": 1716454218276636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218276637, "dur": 0, "args": { "External id": 55574, "cbid": 203, "correlation": 55574 } }, { "ph": "f", "id": 55574, "pid": 76337, "tid": -914061504, "ts": 1716454218276637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218276638, "dur": 0, "args": { "External id": 55575, "cbid": 205, "correlation": 55575 } }, { "ph": "f", "id": 55575, "pid": 76337, "tid": -914061504, "ts": 1716454218276638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218350465, "dur": 32, "args": { "External id": 55579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55579, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55579, "pid": 5, "tid": 7, "ts": 1716454218350465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276652, "dur": 11, "args": { "External id": 55579, "cbid": 211, "correlation": 55579 } }, { "ph": "s", "id": 55579, "pid": 76337, "tid": -914061504, "ts": 1716454218276652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218350499, "dur": 63, "args": { "External id": 55581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55581, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55581, "pid": 5, "tid": 7, "ts": 1716454218350499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276666, "dur": 6, "args": { "External id": 55581, "cbid": 211, "correlation": 55581 } }, { "ph": "s", "id": 55581, "pid": 76337, "tid": -914061504, "ts": 1716454218276666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218350563, "dur": 963, "args": { "External id": 55583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55583, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55583, "pid": 5, "tid": 7, "ts": 1716454218350563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276678, "dur": 9, "args": { "External id": 55583, "cbid": 211, "correlation": 55583 } }, { "ph": "s", "id": 55583, "pid": 76337, "tid": -914061504, "ts": 1716454218276678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218351527, "dur": 20, "args": { "External id": 55585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55585, "pid": 5, "tid": 7, "ts": 1716454218351527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276691, "dur": 5, "args": { "External id": 55585, "cbid": 211, "correlation": 55585 } }, { "ph": "s", "id": 55585, "pid": 76337, "tid": -914061504, "ts": 1716454218276691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218351549, "dur": 33, "args": { "External id": 55591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55591, "pid": 5, "tid": 7, "ts": 1716454218351549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276719, "dur": 8, "args": { "External id": 55591, "cbid": 211, "correlation": 55591 } }, { "ph": "s", "id": 55591, "pid": 76337, "tid": -914061504, "ts": 1716454218276719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218351583, "dur": 3, "args": { "External id": 55599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55599, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 55599, "pid": 5, "tid": 7, "ts": 1716454218351583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276762, "dur": 10, "args": { "External id": 55599, "cbid": 211, "correlation": 55599 } }, { "ph": "s", "id": 55599, "pid": 76337, "tid": -914061504, "ts": 1716454218276762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218276829, "dur": 1, "args": { "External id": 55615, "cbid": 251, "correlation": 55615 } }, { "ph": "f", "id": 55615, "pid": 76337, "tid": -914061504, "ts": 1716454218276829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218276834, "dur": 0, "args": { "External id": 55617, "cbid": 251, "correlation": 55617 } }, { "ph": "f", "id": 55617, "pid": 76337, "tid": -914061504, "ts": 1716454218276834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218351587, "dur": 12, "args": { "External id": 55618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55618, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 55618, "pid": 5, "tid": 7, "ts": 1716454218351587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276836, "dur": 11, "args": { "External id": 55618, "cbid": 211, "correlation": 55618 } }, { "ph": "s", "id": 55618, "pid": 76337, "tid": -914061504, "ts": 1716454218276836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218351601, "dur": 5, "args": { "External id": 55620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55620, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 55620, "pid": 5, "tid": 7, "ts": 1716454218351601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276849, "dur": 5, "args": { "External id": 55620, "cbid": 211, "correlation": 55620 } }, { "ph": "s", "id": 55620, "pid": 76337, "tid": -914061504, "ts": 1716454218276849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218351607, "dur": 29, "args": { "External id": 55630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55630, "pid": 5, "tid": 7, "ts": 1716454218351607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276906, "dur": 12, "args": { "External id": 55630, "cbid": 211, "correlation": 55630 } }, { "ph": "s", "id": 55630, "pid": 76337, "tid": -914061504, "ts": 1716454218276906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218351638, "dur": 31, "args": { "External id": 55650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55650, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 55650, "pid": 5, "tid": 7, "ts": 1716454218351638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218276972, "dur": 19, "args": { "External id": 55650, "cbid": 211, "correlation": 55650 } }, { "ph": "s", "id": 55650, "pid": 76337, "tid": -914061504, "ts": 1716454218276972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218351670, "dur": 4, "args": { "External id": 55662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55662, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 55662, "pid": 5, "tid": 7, "ts": 1716454218351670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277004, "dur": 7, "args": { "External id": 55662, "cbid": 211, "correlation": 55662 } }, { "ph": "s", "id": 55662, "pid": 76337, "tid": -914061504, "ts": 1716454218277004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218351675, "dur": 30, "args": { "External id": 55665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55665, "pid": 5, "tid": 7, "ts": 1716454218351675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277023, "dur": 7, "args": { "External id": 55665, "cbid": 211, "correlation": 55665 } }, { "ph": "s", "id": 55665, "pid": 76337, "tid": -914061504, "ts": 1716454218277023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218351707, "dur": 20, "args": { "External id": 55674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55674, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55674, "pid": 5, "tid": 7, "ts": 1716454218351707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277065, "dur": 10, "args": { "External id": 55674, "cbid": 211, "correlation": 55674 } }, { "ph": "s", "id": 55674, "pid": 76337, "tid": -914061504, "ts": 1716454218277065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218277128, "dur": 0, "args": { "External id": 55684, "cbid": 317, "correlation": 55684 } }, { "ph": "f", "id": 55684, "pid": 76337, "tid": -914061504, "ts": 1716454218277128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218277129, "dur": 0, "args": { "External id": 55685, "cbid": 203, "correlation": 55685 } }, { "ph": "f", "id": 55685, "pid": 76337, "tid": -914061504, "ts": 1716454218277129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218277130, "dur": 0, "args": { "External id": 55686, "cbid": 205, "correlation": 55686 } }, { "ph": "f", "id": 55686, "pid": 76337, "tid": -914061504, "ts": 1716454218277130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218351728, "dur": 22, "args": { "External id": 55690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55690, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55690, "pid": 5, "tid": 7, "ts": 1716454218351728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277145, "dur": 12, "args": { "External id": 55690, "cbid": 211, "correlation": 55690 } }, { "ph": "s", "id": 55690, "pid": 76337, "tid": -914061504, "ts": 1716454218277145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218351752, "dur": 44, "args": { "External id": 55692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55692, "pid": 5, "tid": 7, "ts": 1716454218351752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277159, "dur": 5, "args": { "External id": 55692, "cbid": 211, "correlation": 55692 } }, { "ph": "s", "id": 55692, "pid": 76337, "tid": -914061504, "ts": 1716454218277159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218351797, "dur": 641, "args": { "External id": 55694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55694, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55694, "pid": 5, "tid": 7, "ts": 1716454218351797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277171, "dur": 6, "args": { "External id": 55694, "cbid": 211, "correlation": 55694 } }, { "ph": "s", "id": 55694, "pid": 76337, "tid": -914061504, "ts": 1716454218277171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218352440, "dur": 21, "args": { "External id": 55696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55696, "pid": 5, "tid": 7, "ts": 1716454218352440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277180, "dur": 5, "args": { "External id": 55696, "cbid": 211, "correlation": 55696 } }, { "ph": "s", "id": 55696, "pid": 76337, "tid": -914061504, "ts": 1716454218277180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218352462, "dur": 33, "args": { "External id": 55702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55702, "pid": 5, "tid": 7, "ts": 1716454218352462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277209, "dur": 8, "args": { "External id": 55702, "cbid": 211, "correlation": 55702 } }, { "ph": "s", "id": 55702, "pid": 76337, "tid": -914061504, "ts": 1716454218277209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218277267, "dur": 0, "args": { "External id": 55712, "cbid": 317, "correlation": 55712 } }, { "ph": "f", "id": 55712, "pid": 76337, "tid": -914061504, "ts": 1716454218277267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218277267, "dur": 0, "args": { "External id": 55713, "cbid": 203, "correlation": 55713 } }, { "ph": "f", "id": 55713, "pid": 76337, "tid": -914061504, "ts": 1716454218277267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218277268, "dur": 0, "args": { "External id": 55714, "cbid": 205, "correlation": 55714 } }, { "ph": "f", "id": 55714, "pid": 76337, "tid": -914061504, "ts": 1716454218277268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218352496, "dur": 30, "args": { "External id": 55718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55718, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55718, "pid": 5, "tid": 7, "ts": 1716454218352496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277283, "dur": 15, "args": { "External id": 55718, "cbid": 211, "correlation": 55718 } }, { "ph": "s", "id": 55718, "pid": 76337, "tid": -914061504, "ts": 1716454218277283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218352527, "dur": 151, "args": { "External id": 55720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55720, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55720, "pid": 5, "tid": 7, "ts": 1716454218352527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277304, "dur": 6, "args": { "External id": 55720, "cbid": 211, "correlation": 55720 } }, { "ph": "s", "id": 55720, "pid": 76337, "tid": -914061504, "ts": 1716454218277304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218352679, "dur": 22, "args": { "External id": 55722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55722, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55722, "pid": 5, "tid": 7, "ts": 1716454218352679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277314, "dur": 5, "args": { "External id": 55722, "cbid": 211, "correlation": 55722 } }, { "ph": "s", "id": 55722, "pid": 76337, "tid": -914061504, "ts": 1716454218277314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218352703, "dur": 32, "args": { "External id": 55728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55728, "pid": 5, "tid": 7, "ts": 1716454218352703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277340, "dur": 9, "args": { "External id": 55728, "cbid": 211, "correlation": 55728 } }, { "ph": "s", "id": 55728, "pid": 76337, "tid": -914061504, "ts": 1716454218277340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218352736, "dur": 27, "args": { "External id": 55736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55736, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55736, "pid": 5, "tid": 7, "ts": 1716454218352736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277369, "dur": 8, "args": { "External id": 55736, "cbid": 211, "correlation": 55736 } }, { "ph": "s", "id": 55736, "pid": 76337, "tid": -914061504, "ts": 1716454218277369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218352765, "dur": 19, "args": { "External id": 55744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55744, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55744, "pid": 5, "tid": 7, "ts": 1716454218352765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277398, "dur": 8, "args": { "External id": 55744, "cbid": 211, "correlation": 55744 } }, { "ph": "s", "id": 55744, "pid": 76337, "tid": -914061504, "ts": 1716454218277398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218352786, "dur": 30, "args": { "External id": 55764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55764, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 55764, "pid": 5, "tid": 7, "ts": 1716454218352786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277481, "dur": 13, "args": { "External id": 55764, "cbid": 211, "correlation": 55764 } }, { "ph": "s", "id": 55764, "pid": 76337, "tid": -914061504, "ts": 1716454218277481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218352817, "dur": 4, "args": { "External id": 55776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55776, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 55776, "pid": 5, "tid": 7, "ts": 1716454218352817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277504, "dur": 6, "args": { "External id": 55776, "cbid": 211, "correlation": 55776 } }, { "ph": "s", "id": 55776, "pid": 76337, "tid": -914061504, "ts": 1716454218277504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218352823, "dur": 31, "args": { "External id": 55779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55779, "pid": 5, "tid": 7, "ts": 1716454218352823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277521, "dur": 6, "args": { "External id": 55779, "cbid": 211, "correlation": 55779 } }, { "ph": "s", "id": 55779, "pid": 76337, "tid": -914061504, "ts": 1716454218277521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218277581, "dur": 0, "args": { "External id": 55790, "cbid": 317, "correlation": 55790 } }, { "ph": "f", "id": 55790, "pid": 76337, "tid": -914061504, "ts": 1716454218277581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218277582, "dur": 0, "args": { "External id": 55791, "cbid": 203, "correlation": 55791 } }, { "ph": "f", "id": 55791, "pid": 76337, "tid": -914061504, "ts": 1716454218277582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218277583, "dur": 0, "args": { "External id": 55792, "cbid": 205, "correlation": 55792 } }, { "ph": "f", "id": 55792, "pid": 76337, "tid": -914061504, "ts": 1716454218277583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218352855, "dur": 22, "args": { "External id": 55796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55796, "pid": 5, "tid": 7, "ts": 1716454218352855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277595, "dur": 12, "args": { "External id": 55796, "cbid": 211, "correlation": 55796 } }, { "ph": "s", "id": 55796, "pid": 76337, "tid": -914061504, "ts": 1716454218277595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218352878, "dur": 104, "args": { "External id": 55798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55798, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55798, "pid": 5, "tid": 7, "ts": 1716454218352878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277614, "dur": 6, "args": { "External id": 55798, "cbid": 211, "correlation": 55798 } }, { "ph": "s", "id": 55798, "pid": 76337, "tid": -914061504, "ts": 1716454218277614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218352983, "dur": 23, "args": { "External id": 55800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55800, "pid": 5, "tid": 7, "ts": 1716454218352983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277623, "dur": 5, "args": { "External id": 55800, "cbid": 211, "correlation": 55800 } }, { "ph": "s", "id": 55800, "pid": 76337, "tid": -914061504, "ts": 1716454218277623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218353008, "dur": 32, "args": { "External id": 55806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55806, "pid": 5, "tid": 7, "ts": 1716454218353008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277650, "dur": 8, "args": { "External id": 55806, "cbid": 211, "correlation": 55806 } }, { "ph": "s", "id": 55806, "pid": 76337, "tid": -914061504, "ts": 1716454218277650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218353041, "dur": 194, "args": { "External id": 55815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55815, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55815, "pid": 5, "tid": 7, "ts": 1716454218353041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277733, "dur": 14, "args": { "External id": 55815, "cbid": 211, "correlation": 55815 } }, { "ph": "s", "id": 55815, "pid": 76337, "tid": -914061504, "ts": 1716454218277733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218353236, "dur": 64, "args": { "External id": 55837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55837, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55837, "pid": 5, "tid": 7, "ts": 1716454218353236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277789, "dur": 10, "args": { "External id": 55837, "cbid": 211, "correlation": 55837 } }, { "ph": "s", "id": 55837, "pid": 76337, "tid": -914061504, "ts": 1716454218277789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218277876, "dur": 2, "args": { "External id": 55848, "cbid": 251, "correlation": 55848 } }, { "ph": "f", "id": 55848, "pid": 76337, "tid": -914061504, "ts": 1716454218277876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218353302, "dur": 151, "args": { "External id": 55849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55849, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55849, "pid": 5, "tid": 7, "ts": 1716454218353302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277882, "dur": 13, "args": { "External id": 55849, "cbid": 211, "correlation": 55849 } }, { "ph": "s", "id": 55849, "pid": 76337, "tid": -914061504, "ts": 1716454218277882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218277956, "dur": 1, "args": { "External id": 55860, "cbid": 251, "correlation": 55860 } }, { "ph": "f", "id": 55860, "pid": 76337, "tid": -914061504, "ts": 1716454218277956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218353454, "dur": 141, "args": { "External id": 55861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55861, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55861, "pid": 5, "tid": 7, "ts": 1716454218353454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218277960, "dur": 12, "args": { "External id": 55861, "cbid": 211, "correlation": 55861 } }, { "ph": "s", "id": 55861, "pid": 76337, "tid": -914061504, "ts": 1716454218277960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278036, "dur": 1, "args": { "External id": 55872, "cbid": 251, "correlation": 55872 } }, { "ph": "f", "id": 55872, "pid": 76337, "tid": -914061504, "ts": 1716454218278036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218353597, "dur": 142, "args": { "External id": 55873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55873, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55873, "pid": 5, "tid": 7, "ts": 1716454218353597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278041, "dur": 12, "args": { "External id": 55873, "cbid": 211, "correlation": 55873 } }, { "ph": "s", "id": 55873, "pid": 76337, "tid": -914061504, "ts": 1716454218278041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218353740, "dur": 1934, "args": { "External id": 55894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55894, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 55894, "pid": 5, "tid": 7, "ts": 1716454218353740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278123, "dur": 13, "args": { "External id": 55894, "cbid": 211, "correlation": 55894 } }, { "ph": "s", "id": 55894, "pid": 76337, "tid": -914061504, "ts": 1716454218278123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278221, "dur": 1, "args": { "External id": 55912, "cbid": 251, "correlation": 55912 } }, { "ph": "f", "id": 55912, "pid": 76337, "tid": -914061504, "ts": 1716454218278221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218355675, "dur": 149, "args": { "External id": 55914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55914, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 55914, "pid": 5, "tid": 7, "ts": 1716454218355675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278227, "dur": 13, "args": { "External id": 55914, "cbid": 211, "correlation": 55914 } }, { "ph": "s", "id": 55914, "pid": 76337, "tid": -914061504, "ts": 1716454218278227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218355825, "dur": 35, "args": { "External id": 55922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55922, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55922, "pid": 5, "tid": 7, "ts": 1716454218355825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278297, "dur": 12, "args": { "External id": 55922, "cbid": 211, "correlation": 55922 } }, { "ph": "s", "id": 55922, "pid": 76337, "tid": -914061504, "ts": 1716454218278297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218355862, "dur": 50, "args": { "External id": 55930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55930, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55930, "pid": 5, "tid": 7, "ts": 1716454218355862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278335, "dur": 8, "args": { "External id": 55930, "cbid": 211, "correlation": 55930 } }, { "ph": "s", "id": 55930, "pid": 76337, "tid": -914061504, "ts": 1716454218278335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218355913, "dur": 31, "args": { "External id": 55941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55941, "pid": 5, "tid": 7, "ts": 1716454218355913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278411, "dur": 13, "args": { "External id": 55941, "cbid": 211, "correlation": 55941 } }, { "ph": "s", "id": 55941, "pid": 76337, "tid": -914061504, "ts": 1716454218278411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218355946, "dur": 34, "args": { "External id": 55963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55963, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 55963, "pid": 5, "tid": 7, "ts": 1716454218355946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278441, "dur": 8, "args": { "External id": 55963, "cbid": 211, "correlation": 55963 } }, { "ph": "s", "id": 55963, "pid": 76337, "tid": -914061504, "ts": 1716454218278441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278527, "dur": 1, "args": { "External id": 55974, "cbid": 251, "correlation": 55974 } }, { "ph": "f", "id": 55974, "pid": 76337, "tid": -914061504, "ts": 1716454218278527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218355981, "dur": 90, "args": { "External id": 55975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55975, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 55975, "pid": 5, "tid": 7, "ts": 1716454218355981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278532, "dur": 12, "args": { "External id": 55975, "cbid": 211, "correlation": 55975 } }, { "ph": "s", "id": 55975, "pid": 76337, "tid": -914061504, "ts": 1716454218278532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278600, "dur": 1, "args": { "External id": 55986, "cbid": 251, "correlation": 55986 } }, { "ph": "f", "id": 55986, "pid": 76337, "tid": -914061504, "ts": 1716454218278600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278604, "dur": 0, "args": { "External id": 55987, "cbid": 251, "correlation": 55987 } }, { "ph": "f", "id": 55987, "pid": 76337, "tid": -914061504, "ts": 1716454218278604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218356073, "dur": 11, "args": { "External id": 55988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55988, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 55988, "pid": 5, "tid": 7, "ts": 1716454218356073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278606, "dur": 12, "args": { "External id": 55988, "cbid": 211, "correlation": 55988 } }, { "ph": "s", "id": 55988, "pid": 76337, "tid": -914061504, "ts": 1716454218278606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218356085, "dur": 6, "args": { "External id": 55990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 55990, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 55990, "pid": 5, "tid": 7, "ts": 1716454218356085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278620, "dur": 6, "args": { "External id": 55990, "cbid": 211, "correlation": 55990 } }, { "ph": "s", "id": 55990, "pid": 76337, "tid": -914061504, "ts": 1716454218278620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278677, "dur": 1, "args": { "External id": 56001, "cbid": 251, "correlation": 56001 } }, { "ph": "f", "id": 56001, "pid": 76337, "tid": -914061504, "ts": 1716454218278677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278681, "dur": 0, "args": { "External id": 56002, "cbid": 251, "correlation": 56002 } }, { "ph": "f", "id": 56002, "pid": 76337, "tid": -914061504, "ts": 1716454218278681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218356092, "dur": 7, "args": { "External id": 56003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56003, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 56003, "pid": 5, "tid": 7, "ts": 1716454218356092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278682, "dur": 12, "args": { "External id": 56003, "cbid": 211, "correlation": 56003 } }, { "ph": "s", "id": 56003, "pid": 76337, "tid": -914061504, "ts": 1716454218278682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218356100, "dur": 4, "args": { "External id": 56005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56005, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 56005, "pid": 5, "tid": 7, "ts": 1716454218356100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278695, "dur": 8, "args": { "External id": 56005, "cbid": 211, "correlation": 56005 } }, { "ph": "s", "id": 56005, "pid": 76337, "tid": -914061504, "ts": 1716454218278695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218356105, "dur": 92, "args": { "External id": 56026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56026, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56026, "pid": 5, "tid": 7, "ts": 1716454218356105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278772, "dur": 13, "args": { "External id": 56026, "cbid": 211, "correlation": 56026 } }, { "ph": "s", "id": 56026, "pid": 76337, "tid": -914061504, "ts": 1716454218278772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218278868, "dur": 1, "args": { "External id": 56044, "cbid": 251, "correlation": 56044 } }, { "ph": "f", "id": 56044, "pid": 76337, "tid": -914061504, "ts": 1716454218278868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218356198, "dur": 99, "args": { "External id": 56046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56046, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56046, "pid": 5, "tid": 7, "ts": 1716454218356198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278875, "dur": 13, "args": { "External id": 56046, "cbid": 211, "correlation": 56046 } }, { "ph": "s", "id": 56046, "pid": 76337, "tid": -914061504, "ts": 1716454218278875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218356299, "dur": 19, "args": { "External id": 56054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56054, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56054, "pid": 5, "tid": 7, "ts": 1716454218356299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278942, "dur": 12, "args": { "External id": 56054, "cbid": 211, "correlation": 56054 } }, { "ph": "s", "id": 56054, "pid": 76337, "tid": -914061504, "ts": 1716454218278942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218356319, "dur": 37, "args": { "External id": 56062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56062, "pid": 5, "tid": 7, "ts": 1716454218356319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218278991, "dur": 10, "args": { "External id": 56062, "cbid": 211, "correlation": 56062 } }, { "ph": "s", "id": 56062, "pid": 76337, "tid": -914061504, "ts": 1716454218278991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218356357, "dur": 34, "args": { "External id": 56084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56084, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56084, "pid": 5, "tid": 7, "ts": 1716454218356357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279044, "dur": 13, "args": { "External id": 56084, "cbid": 211, "correlation": 56084 } }, { "ph": "s", "id": 56084, "pid": 76337, "tid": -914061504, "ts": 1716454218279044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218279137, "dur": 1, "args": { "External id": 56100, "cbid": 251, "correlation": 56100 } }, { "ph": "f", "id": 56100, "pid": 76337, "tid": -914061504, "ts": 1716454218279137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218279142, "dur": 0, "args": { "External id": 56102, "cbid": 251, "correlation": 56102 } }, { "ph": "f", "id": 56102, "pid": 76337, "tid": -914061504, "ts": 1716454218279142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218356393, "dur": 544, "args": { "External id": 56103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56103, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 56103, "pid": 5, "tid": 7, "ts": 1716454218356393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279145, "dur": 13, "args": { "External id": 56103, "cbid": 211, "correlation": 56103 } }, { "ph": "s", "id": 56103, "pid": 76337, "tid": -914061504, "ts": 1716454218279145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218356938, "dur": 126, "args": { "External id": 56111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56111, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56111, "pid": 5, "tid": 7, "ts": 1716454218356938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279211, "dur": 12, "args": { "External id": 56111, "cbid": 211, "correlation": 56111 } }, { "ph": "s", "id": 56111, "pid": 76337, "tid": -914061504, "ts": 1716454218279211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218357065, "dur": 131, "args": { "External id": 56119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56119, "pid": 5, "tid": 7, "ts": 1716454218357065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279241, "dur": 8, "args": { "External id": 56119, "cbid": 211, "correlation": 56119 } }, { "ph": "s", "id": 56119, "pid": 76337, "tid": -914061504, "ts": 1716454218279241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218279317, "dur": 1, "args": { "External id": 56135, "cbid": 251, "correlation": 56135 } }, { "ph": "f", "id": 56135, "pid": 76337, "tid": -914061504, "ts": 1716454218279317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218357197, "dur": 305, "args": { "External id": 56137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56137, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56137, "pid": 5, "tid": 7, "ts": 1716454218357197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279323, "dur": 13, "args": { "External id": 56137, "cbid": 211, "correlation": 56137 } }, { "ph": "s", "id": 56137, "pid": 76337, "tid": -914061504, "ts": 1716454218279323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218357504, "dur": 28, "args": { "External id": 56145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56145, "pid": 5, "tid": 7, "ts": 1716454218357504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279365, "dur": 9, "args": { "External id": 56145, "cbid": 211, "correlation": 56145 } }, { "ph": "s", "id": 56145, "pid": 76337, "tid": -914061504, "ts": 1716454218279365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218357533, "dur": 81, "args": { "External id": 56156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56156, "pid": 5, "tid": 7, "ts": 1716454218357533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279432, "dur": 13, "args": { "External id": 56156, "cbid": 211, "correlation": 56156 } }, { "ph": "s", "id": 56156, "pid": 76337, "tid": -914061504, "ts": 1716454218279432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218279500, "dur": 0, "args": { "External id": 56168, "cbid": 317, "correlation": 56168 } }, { "ph": "f", "id": 56168, "pid": 76337, "tid": -914061504, "ts": 1716454218279500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218279501, "dur": 0, "args": { "External id": 56169, "cbid": 203, "correlation": 56169 } }, { "ph": "f", "id": 56169, "pid": 76337, "tid": -914061504, "ts": 1716454218279501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218279502, "dur": 0, "args": { "External id": 56170, "cbid": 205, "correlation": 56170 } }, { "ph": "f", "id": 56170, "pid": 76337, "tid": -914061504, "ts": 1716454218279502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218357615, "dur": 23, "args": { "External id": 56174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56174, "pid": 5, "tid": 7, "ts": 1716454218357615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279517, "dur": 12, "args": { "External id": 56174, "cbid": 211, "correlation": 56174 } }, { "ph": "s", "id": 56174, "pid": 76337, "tid": -914061504, "ts": 1716454218279517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218357640, "dur": 121, "args": { "External id": 56176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56176, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56176, "pid": 5, "tid": 7, "ts": 1716454218357640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279535, "dur": 7, "args": { "External id": 56176, "cbid": 211, "correlation": 56176 } }, { "ph": "s", "id": 56176, "pid": 76337, "tid": -914061504, "ts": 1716454218279535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218357762, "dur": 24, "args": { "External id": 56178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56178, "pid": 5, "tid": 7, "ts": 1716454218357762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279546, "dur": 5, "args": { "External id": 56178, "cbid": 211, "correlation": 56178 } }, { "ph": "s", "id": 56178, "pid": 76337, "tid": -914061504, "ts": 1716454218279546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218357787, "dur": 33, "args": { "External id": 56184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56184, "pid": 5, "tid": 7, "ts": 1716454218357787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279573, "dur": 8, "args": { "External id": 56184, "cbid": 211, "correlation": 56184 } }, { "ph": "s", "id": 56184, "pid": 76337, "tid": -914061504, "ts": 1716454218279573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218357821, "dur": 26, "args": { "External id": 56192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56192, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56192, "pid": 5, "tid": 7, "ts": 1716454218357821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279605, "dur": 8, "args": { "External id": 56192, "cbid": 211, "correlation": 56192 } }, { "ph": "s", "id": 56192, "pid": 76337, "tid": -914061504, "ts": 1716454218279605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454218357849, "dur": 102, "args": { "External id": 56203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56203, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56203, "pid": 5, "tid": 7, "ts": 1716454218357849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279666, "dur": 12, "args": { "External id": 56203, "cbid": 211, "correlation": 56203 } }, { "ph": "s", "id": 56203, "pid": 76337, "tid": -914061504, "ts": 1716454218279666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218279721, "dur": 0, "args": { "External id": 56213, "cbid": 317, "correlation": 56213 } }, { "ph": "f", "id": 56213, "pid": 76337, "tid": -914061504, "ts": 1716454218279721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218279722, "dur": 0, "args": { "External id": 56214, "cbid": 203, "correlation": 56214 } }, { "ph": "f", "id": 56214, "pid": 76337, "tid": -914061504, "ts": 1716454218279722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218279723, "dur": 0, "args": { "External id": 56215, "cbid": 205, "correlation": 56215 } }, { "ph": "f", "id": 56215, "pid": 76337, "tid": -914061504, "ts": 1716454218279723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218357952, "dur": 75, "args": { "External id": 56219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56219, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56219, "pid": 5, "tid": 7, "ts": 1716454218357952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279736, "dur": 11, "args": { "External id": 56219, "cbid": 211, "correlation": 56219 } }, { "ph": "s", "id": 56219, "pid": 76337, "tid": -914061504, "ts": 1716454218279736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218358029, "dur": 44, "args": { "External id": 56221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56221, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56221, "pid": 5, "tid": 7, "ts": 1716454218358029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279751, "dur": 5, "args": { "External id": 56221, "cbid": 211, "correlation": 56221 } }, { "ph": "s", "id": 56221, "pid": 76337, "tid": -914061504, "ts": 1716454218279751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218358074, "dur": 4, "args": { "External id": 56223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56223, "pid": 5, "tid": 7, "ts": 1716454218358074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279761, "dur": 9, "args": { "External id": 56223, "cbid": 211, "correlation": 56223 } }, { "ph": "s", "id": 56223, "pid": 76337, "tid": -914061504, "ts": 1716454218279761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218279774, "dur": 0, "args": { "External id": 56224, "cbid": 51, "correlation": 56224 } }, { "ph": "s", "id": 56224, "pid": 76337, "tid": -914061504, "ts": 1716454218279774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218358079, "dur": 2224, "args": { "External id": 56225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56225, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56225, "pid": 5, "tid": 7, "ts": 1716454218358079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279775, "dur": 6, "args": { "External id": 56225, "cbid": 211, "correlation": 56225 } }, { "ph": "s", "id": 56225, "pid": 76337, "tid": -914061504, "ts": 1716454218279775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218360305, "dur": 113, "args": { "External id": 56230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56230, "pid": 5, "tid": 7, "ts": 1716454218360305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279803, "dur": 9, "args": { "External id": 56230, "cbid": 211, "correlation": 56230 } }, { "ph": "s", "id": 56230, "pid": 76337, "tid": -914061504, "ts": 1716454218279803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218360419, "dur": 165, "args": { "External id": 56239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56239, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56239, "pid": 5, "tid": 7, "ts": 1716454218360419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279896, "dur": 13, "args": { "External id": 56239, "cbid": 211, "correlation": 56239 } }, { "ph": "s", "id": 56239, "pid": 76337, "tid": -914061504, "ts": 1716454218279896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218360585, "dur": 130, "args": { "External id": 56259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56259, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 56259, "pid": 5, "tid": 7, "ts": 1716454218360585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279967, "dur": 19, "args": { "External id": 56259, "cbid": 211, "correlation": 56259 } }, { "ph": "s", "id": 56259, "pid": 76337, "tid": -914061504, "ts": 1716454218279967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218360717, "dur": 4, "args": { "External id": 56271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56271, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 56271, "pid": 5, "tid": 7, "ts": 1716454218360717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218279996, "dur": 7, "args": { "External id": 56271, "cbid": 211, "correlation": 56271 } }, { "ph": "s", "id": 56271, "pid": 76337, "tid": -914061504, "ts": 1716454218279996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218360722, "dur": 159, "args": { "External id": 56274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56274, "pid": 5, "tid": 7, "ts": 1716454218360722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280015, "dur": 7, "args": { "External id": 56274, "cbid": 211, "correlation": 56274 } }, { "ph": "s", "id": 56274, "pid": 76337, "tid": -914061504, "ts": 1716454218280015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218360883, "dur": 102, "args": { "External id": 56283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56283, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56283, "pid": 5, "tid": 7, "ts": 1716454218360883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280056, "dur": 9, "args": { "External id": 56283, "cbid": 211, "correlation": 56283 } }, { "ph": "s", "id": 56283, "pid": 76337, "tid": -914061504, "ts": 1716454218280056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218280108, "dur": 0, "args": { "External id": 56293, "cbid": 317, "correlation": 56293 } }, { "ph": "f", "id": 56293, "pid": 76337, "tid": -914061504, "ts": 1716454218280108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218280109, "dur": 3, "args": { "External id": 56294, "cbid": 203, "correlation": 56294 } }, { "ph": "f", "id": 56294, "pid": 76337, "tid": -914061504, "ts": 1716454218280109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218280113, "dur": 0, "args": { "External id": 56295, "cbid": 205, "correlation": 56295 } }, { "ph": "f", "id": 56295, "pid": 76337, "tid": -914061504, "ts": 1716454218280113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218360986, "dur": 111, "args": { "External id": 56299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56299, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56299, "pid": 5, "tid": 7, "ts": 1716454218360986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280129, "dur": 12, "args": { "External id": 56299, "cbid": 211, "correlation": 56299 } }, { "ph": "s", "id": 56299, "pid": 76337, "tid": -914061504, "ts": 1716454218280129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218361098, "dur": 33, "args": { "External id": 56301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56301, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56301, "pid": 5, "tid": 7, "ts": 1716454218361098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280144, "dur": 5, "args": { "External id": 56301, "cbid": 211, "correlation": 56301 } }, { "ph": "s", "id": 56301, "pid": 76337, "tid": -914061504, "ts": 1716454218280144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218361133, "dur": 3, "args": { "External id": 56303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56303, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56303, "pid": 5, "tid": 7, "ts": 1716454218361133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280153, "dur": 6, "args": { "External id": 56303, "cbid": 211, "correlation": 56303 } }, { "ph": "s", "id": 56303, "pid": 76337, "tid": -914061504, "ts": 1716454218280153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218280162, "dur": 0, "args": { "External id": 56304, "cbid": 51, "correlation": 56304 } }, { "ph": "s", "id": 56304, "pid": 76337, "tid": -914061504, "ts": 1716454218280162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218361138, "dur": 2010, "args": { "External id": 56305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56305, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56305, "pid": 5, "tid": 7, "ts": 1716454218361138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280163, "dur": 6, "args": { "External id": 56305, "cbid": 211, "correlation": 56305 } }, { "ph": "s", "id": 56305, "pid": 76337, "tid": -914061504, "ts": 1716454218280163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218363150, "dur": 59, "args": { "External id": 56310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56310, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56310, "pid": 5, "tid": 7, "ts": 1716454218363150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280192, "dur": 8, "args": { "External id": 56310, "cbid": 211, "correlation": 56310 } }, { "ph": "s", "id": 56310, "pid": 76337, "tid": -914061504, "ts": 1716454218280192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218363210, "dur": 3, "args": { "External id": 56318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56318, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56318, "pid": 5, "tid": 7, "ts": 1716454218363210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280236, "dur": 9, "args": { "External id": 56318, "cbid": 211, "correlation": 56318 } }, { "ph": "s", "id": 56318, "pid": 76337, "tid": -914061504, "ts": 1716454218280236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218280303, "dur": 1, "args": { "External id": 56334, "cbid": 251, "correlation": 56334 } }, { "ph": "f", "id": 56334, "pid": 76337, "tid": -914061504, "ts": 1716454218280303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218280309, "dur": 0, "args": { "External id": 56336, "cbid": 251, "correlation": 56336 } }, { "ph": "f", "id": 56336, "pid": 76337, "tid": -914061504, "ts": 1716454218280309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218363214, "dur": 11, "args": { "External id": 56337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56337, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 56337, "pid": 5, "tid": 7, "ts": 1716454218363214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280310, "dur": 11, "args": { "External id": 56337, "cbid": 211, "correlation": 56337 } }, { "ph": "s", "id": 56337, "pid": 76337, "tid": -914061504, "ts": 1716454218280310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218363227, "dur": 5, "args": { "External id": 56339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56339, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 56339, "pid": 5, "tid": 7, "ts": 1716454218363227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280324, "dur": 5, "args": { "External id": 56339, "cbid": 211, "correlation": 56339 } }, { "ph": "s", "id": 56339, "pid": 76337, "tid": -914061504, "ts": 1716454218280324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218363233, "dur": 53, "args": { "External id": 56349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56349, "pid": 5, "tid": 7, "ts": 1716454218363233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280382, "dur": 12, "args": { "External id": 56349, "cbid": 211, "correlation": 56349 } }, { "ph": "s", "id": 56349, "pid": 76337, "tid": -914061504, "ts": 1716454218280382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218363288, "dur": 51, "args": { "External id": 56369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56369, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 56369, "pid": 5, "tid": 7, "ts": 1716454218363288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280453, "dur": 11, "args": { "External id": 56369, "cbid": 211, "correlation": 56369 } }, { "ph": "s", "id": 56369, "pid": 76337, "tid": -914061504, "ts": 1716454218280453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218363340, "dur": 4, "args": { "External id": 56381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56381, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56381, "pid": 5, "tid": 7, "ts": 1716454218363340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280474, "dur": 6, "args": { "External id": 56381, "cbid": 211, "correlation": 56381 } }, { "ph": "s", "id": 56381, "pid": 76337, "tid": -914061504, "ts": 1716454218280474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218363346, "dur": 55, "args": { "External id": 56384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56384, "pid": 5, "tid": 7, "ts": 1716454218363346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280493, "dur": 7, "args": { "External id": 56384, "cbid": 211, "correlation": 56384 } }, { "ph": "s", "id": 56384, "pid": 76337, "tid": -914061504, "ts": 1716454218280493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218363402, "dur": 37, "args": { "External id": 56393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56393, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56393, "pid": 5, "tid": 7, "ts": 1716454218363402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280534, "dur": 10, "args": { "External id": 56393, "cbid": 211, "correlation": 56393 } }, { "ph": "s", "id": 56393, "pid": 76337, "tid": -914061504, "ts": 1716454218280534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218280597, "dur": 0, "args": { "External id": 56403, "cbid": 317, "correlation": 56403 } }, { "ph": "f", "id": 56403, "pid": 76337, "tid": -914061504, "ts": 1716454218280597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218280598, "dur": 0, "args": { "External id": 56404, "cbid": 203, "correlation": 56404 } }, { "ph": "f", "id": 56404, "pid": 76337, "tid": -914061504, "ts": 1716454218280598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218280599, "dur": 0, "args": { "External id": 56405, "cbid": 205, "correlation": 56405 } }, { "ph": "f", "id": 56405, "pid": 76337, "tid": -914061504, "ts": 1716454218280599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218363440, "dur": 41, "args": { "External id": 56409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56409, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56409, "pid": 5, "tid": 7, "ts": 1716454218363440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280616, "dur": 12, "args": { "External id": 56409, "cbid": 211, "correlation": 56409 } }, { "ph": "s", "id": 56409, "pid": 76337, "tid": -914061504, "ts": 1716454218280616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218363482, "dur": 14, "args": { "External id": 56411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56411, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56411, "pid": 5, "tid": 7, "ts": 1716454218363482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280630, "dur": 5, "args": { "External id": 56411, "cbid": 211, "correlation": 56411 } }, { "ph": "s", "id": 56411, "pid": 76337, "tid": -914061504, "ts": 1716454218280630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218363498, "dur": 3, "args": { "External id": 56413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56413, "pid": 5, "tid": 7, "ts": 1716454218363498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280640, "dur": 6, "args": { "External id": 56413, "cbid": 211, "correlation": 56413 } }, { "ph": "s", "id": 56413, "pid": 76337, "tid": -914061504, "ts": 1716454218280640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218280648, "dur": 0, "args": { "External id": 56414, "cbid": 51, "correlation": 56414 } }, { "ph": "s", "id": 56414, "pid": 76337, "tid": -914061504, "ts": 1716454218280648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218363503, "dur": 699, "args": { "External id": 56415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56415, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56415, "pid": 5, "tid": 7, "ts": 1716454218363503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280649, "dur": 6, "args": { "External id": 56415, "cbid": 211, "correlation": 56415 } }, { "ph": "s", "id": 56415, "pid": 76337, "tid": -914061504, "ts": 1716454218280649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218364203, "dur": 60, "args": { "External id": 56420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56420, "pid": 5, "tid": 7, "ts": 1716454218364203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280677, "dur": 11, "args": { "External id": 56420, "cbid": 211, "correlation": 56420 } }, { "ph": "s", "id": 56420, "pid": 76337, "tid": -914061504, "ts": 1716454218280677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218280738, "dur": 0, "args": { "External id": 56430, "cbid": 317, "correlation": 56430 } }, { "ph": "f", "id": 56430, "pid": 76337, "tid": -914061504, "ts": 1716454218280738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218280739, "dur": 0, "args": { "External id": 56431, "cbid": 203, "correlation": 56431 } }, { "ph": "f", "id": 56431, "pid": 76337, "tid": -914061504, "ts": 1716454218280739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218280740, "dur": 0, "args": { "External id": 56432, "cbid": 205, "correlation": 56432 } }, { "ph": "f", "id": 56432, "pid": 76337, "tid": -914061504, "ts": 1716454218280740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218364264, "dur": 3, "args": { "External id": 56436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56436, "pid": 5, "tid": 7, "ts": 1716454218364264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280755, "dur": 13, "args": { "External id": 56436, "cbid": 211, "correlation": 56436 } }, { "ph": "s", "id": 56436, "pid": 76337, "tid": -914061504, "ts": 1716454218280755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218280772, "dur": 0, "args": { "External id": 56437, "cbid": 51, "correlation": 56437 } }, { "ph": "s", "id": 56437, "pid": 76337, "tid": -914061504, "ts": 1716454218280772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454218364269, "dur": 265, "args": { "External id": 56438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56438, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56438, "pid": 5, "tid": 7, "ts": 1716454218364269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280773, "dur": 7, "args": { "External id": 56438, "cbid": 211, "correlation": 56438 } }, { "ph": "s", "id": 56438, "pid": 76337, "tid": -914061504, "ts": 1716454218280773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218364535, "dur": 59, "args": { "External id": 56443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56443, "pid": 5, "tid": 7, "ts": 1716454218364535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280801, "dur": 9, "args": { "External id": 56443, "cbid": 211, "correlation": 56443 } }, { "ph": "s", "id": 56443, "pid": 76337, "tid": -914061504, "ts": 1716454218280801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218364596, "dur": 50, "args": { "External id": 56451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56451, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56451, "pid": 5, "tid": 7, "ts": 1716454218364596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280830, "dur": 8, "args": { "External id": 56451, "cbid": 211, "correlation": 56451 } }, { "ph": "s", "id": 56451, "pid": 76337, "tid": -914061504, "ts": 1716454218280830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218364647, "dur": 35, "args": { "External id": 56459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56459, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56459, "pid": 5, "tid": 7, "ts": 1716454218364647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280858, "dur": 8, "args": { "External id": 56459, "cbid": 211, "correlation": 56459 } }, { "ph": "s", "id": 56459, "pid": 76337, "tid": -914061504, "ts": 1716454218280858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218364683, "dur": 53, "args": { "External id": 56479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56479, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 56479, "pid": 5, "tid": 7, "ts": 1716454218364683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280940, "dur": 12, "args": { "External id": 56479, "cbid": 211, "correlation": 56479 } }, { "ph": "s", "id": 56479, "pid": 76337, "tid": -914061504, "ts": 1716454218280940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218364738, "dur": 4, "args": { "External id": 56491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56491, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56491, "pid": 5, "tid": 7, "ts": 1716454218364738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280961, "dur": 9, "args": { "External id": 56491, "cbid": 211, "correlation": 56491 } }, { "ph": "s", "id": 56491, "pid": 76337, "tid": -914061504, "ts": 1716454218280961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218364743, "dur": 55, "args": { "External id": 56494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56494, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56494, "pid": 5, "tid": 7, "ts": 1716454218364743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218280990, "dur": 8, "args": { "External id": 56494, "cbid": 211, "correlation": 56494 } }, { "ph": "s", "id": 56494, "pid": 76337, "tid": -914061504, "ts": 1716454218280990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218281049, "dur": 0, "args": { "External id": 56505, "cbid": 317, "correlation": 56505 } }, { "ph": "f", "id": 56505, "pid": 76337, "tid": -914061504, "ts": 1716454218281049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218281049, "dur": 0, "args": { "External id": 56506, "cbid": 203, "correlation": 56506 } }, { "ph": "f", "id": 56506, "pid": 76337, "tid": -914061504, "ts": 1716454218281049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218281050, "dur": 0, "args": { "External id": 56507, "cbid": 205, "correlation": 56507 } }, { "ph": "f", "id": 56507, "pid": 76337, "tid": -914061504, "ts": 1716454218281050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281081, "dur": 2, "args": { "External id": 56511, "cbid": 251, "correlation": 56511 } }, { "ph": "f", "id": 56511, "pid": 76337, "tid": -914061504, "ts": 1716454218281081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281084, "dur": 1, "args": { "External id": 56512, "cbid": 251, "correlation": 56512 } }, { "ph": "f", "id": 56512, "pid": 76337, "tid": -914061504, "ts": 1716454218281084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281085, "dur": 0, "args": { "External id": 56513, "cbid": 251, "correlation": 56513 } }, { "ph": "f", "id": 56513, "pid": 76337, "tid": -914061504, "ts": 1716454218281085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281087, "dur": 1, "args": { "External id": 56514, "cbid": 251, "correlation": 56514 } }, { "ph": "f", "id": 56514, "pid": 76337, "tid": -914061504, "ts": 1716454218281087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281088, "dur": 1, "args": { "External id": 56515, "cbid": 251, "correlation": 56515 } }, { "ph": "f", "id": 56515, "pid": 76337, "tid": -914061504, "ts": 1716454218281088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281090, "dur": 1, "args": { "External id": 56516, "cbid": 251, "correlation": 56516 } }, { "ph": "f", "id": 56516, "pid": 76337, "tid": -914061504, "ts": 1716454218281090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281092, "dur": 1, "args": { "External id": 56517, "cbid": 251, "correlation": 56517 } }, { "ph": "f", "id": 56517, "pid": 76337, "tid": -914061504, "ts": 1716454218281092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281093, "dur": 1, "args": { "External id": 56518, "cbid": 251, "correlation": 56518 } }, { "ph": "f", "id": 56518, "pid": 76337, "tid": -914061504, "ts": 1716454218281093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281096, "dur": 0, "args": { "External id": 56519, "cbid": 251, "correlation": 56519 } }, { "ph": "f", "id": 56519, "pid": 76337, "tid": -914061504, "ts": 1716454218281096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218364800, "dur": 115, "args": { "External id": 56520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56520, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56520, "pid": 5, "tid": 7, "ts": 1716454218364800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281098, "dur": 13, "args": { "External id": 56520, "cbid": 211, "correlation": 56520 } }, { "ph": "s", "id": 56520, "pid": 76337, "tid": -914061504, "ts": 1716454218281098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218364916, "dur": 60, "args": { "External id": 56526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56526, "pid": 5, "tid": 7, "ts": 1716454218364916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281134, "dur": 9, "args": { "External id": 56526, "cbid": 211, "correlation": 56526 } }, { "ph": "s", "id": 56526, "pid": 76337, "tid": -914061504, "ts": 1716454218281134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218364977, "dur": 567, "args": { "External id": 56535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56535, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56535, "pid": 5, "tid": 7, "ts": 1716454218364977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281217, "dur": 14, "args": { "External id": 56535, "cbid": 211, "correlation": 56535 } }, { "ph": "s", "id": 56535, "pid": 76337, "tid": -914061504, "ts": 1716454218281217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218365545, "dur": 182, "args": { "External id": 56557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56557, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56557, "pid": 5, "tid": 7, "ts": 1716454218365545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281274, "dur": 11, "args": { "External id": 56557, "cbid": 211, "correlation": 56557 } }, { "ph": "s", "id": 56557, "pid": 76337, "tid": -914061504, "ts": 1716454218281274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281363, "dur": 1, "args": { "External id": 56568, "cbid": 251, "correlation": 56568 } }, { "ph": "f", "id": 56568, "pid": 76337, "tid": -914061504, "ts": 1716454218281363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218365728, "dur": 197, "args": { "External id": 56569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56569, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56569, "pid": 5, "tid": 7, "ts": 1716454218365728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281368, "dur": 16, "args": { "External id": 56569, "cbid": 211, "correlation": 56569 } }, { "ph": "s", "id": 56569, "pid": 76337, "tid": -914061504, "ts": 1716454218281368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281442, "dur": 1, "args": { "External id": 56580, "cbid": 251, "correlation": 56580 } }, { "ph": "f", "id": 56580, "pid": 76337, "tid": -914061504, "ts": 1716454218281442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218365926, "dur": 191, "args": { "External id": 56581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56581, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56581, "pid": 5, "tid": 7, "ts": 1716454218365926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281446, "dur": 12, "args": { "External id": 56581, "cbid": 211, "correlation": 56581 } }, { "ph": "s", "id": 56581, "pid": 76337, "tid": -914061504, "ts": 1716454218281446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281509, "dur": 1, "args": { "External id": 56592, "cbid": 251, "correlation": 56592 } }, { "ph": "f", "id": 56592, "pid": 76337, "tid": -914061504, "ts": 1716454218281509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218366119, "dur": 187, "args": { "External id": 56593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56593, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56593, "pid": 5, "tid": 7, "ts": 1716454218366119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281513, "dur": 12, "args": { "External id": 56593, "cbid": 211, "correlation": 56593 } }, { "ph": "s", "id": 56593, "pid": 76337, "tid": -914061504, "ts": 1716454218281513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218366307, "dur": 18602, "args": { "External id": 56614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56614, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56614, "pid": 5, "tid": 7, "ts": 1716454218366307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281596, "dur": 14, "args": { "External id": 56614, "cbid": 211, "correlation": 56614 } }, { "ph": "s", "id": 56614, "pid": 76337, "tid": -914061504, "ts": 1716454218281596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218281695, "dur": 1, "args": { "External id": 56632, "cbid": 251, "correlation": 56632 } }, { "ph": "f", "id": 56632, "pid": 76337, "tid": -914061504, "ts": 1716454218281695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218384911, "dur": 206, "args": { "External id": 56634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56634, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56634, "pid": 5, "tid": 7, "ts": 1716454218384911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281700, "dur": 14, "args": { "External id": 56634, "cbid": 211, "correlation": 56634 } }, { "ph": "s", "id": 56634, "pid": 76337, "tid": -914061504, "ts": 1716454218281700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218385118, "dur": 67, "args": { "External id": 56642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56642, "pid": 5, "tid": 7, "ts": 1716454218385118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281770, "dur": 12, "args": { "External id": 56642, "cbid": 211, "correlation": 56642 } }, { "ph": "s", "id": 56642, "pid": 76337, "tid": -914061504, "ts": 1716454218281770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218385186, "dur": 97, "args": { "External id": 56650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56650, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56650, "pid": 5, "tid": 7, "ts": 1716454218385186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281813, "dur": 9, "args": { "External id": 56650, "cbid": 211, "correlation": 56650 } }, { "ph": "s", "id": 56650, "pid": 76337, "tid": -914061504, "ts": 1716454218281813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218385284, "dur": 54, "args": { "External id": 56661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56661, "pid": 5, "tid": 7, "ts": 1716454218385284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281885, "dur": 12, "args": { "External id": 56661, "cbid": 211, "correlation": 56661 } }, { "ph": "s", "id": 56661, "pid": 76337, "tid": -914061504, "ts": 1716454218281885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218385340, "dur": 92, "args": { "External id": 56683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56683, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56683, "pid": 5, "tid": 7, "ts": 1716454218385340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218281916, "dur": 7, "args": { "External id": 56683, "cbid": 211, "correlation": 56683 } }, { "ph": "s", "id": 56683, "pid": 76337, "tid": -914061504, "ts": 1716454218281916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282007, "dur": 1, "args": { "External id": 56694, "cbid": 251, "correlation": 56694 } }, { "ph": "f", "id": 56694, "pid": 76337, "tid": -914061504, "ts": 1716454218282007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218385433, "dur": 105, "args": { "External id": 56695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56695, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56695, "pid": 5, "tid": 7, "ts": 1716454218385433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282012, "dur": 13, "args": { "External id": 56695, "cbid": 211, "correlation": 56695 } }, { "ph": "s", "id": 56695, "pid": 76337, "tid": -914061504, "ts": 1716454218282012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282089, "dur": 1, "args": { "External id": 56706, "cbid": 251, "correlation": 56706 } }, { "ph": "f", "id": 56706, "pid": 76337, "tid": -914061504, "ts": 1716454218282089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282093, "dur": 0, "args": { "External id": 56707, "cbid": 251, "correlation": 56707 } }, { "ph": "f", "id": 56707, "pid": 76337, "tid": -914061504, "ts": 1716454218282093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218385539, "dur": 10, "args": { "External id": 56708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56708, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 56708, "pid": 5, "tid": 7, "ts": 1716454218385539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282095, "dur": 13, "args": { "External id": 56708, "cbid": 211, "correlation": 56708 } }, { "ph": "s", "id": 56708, "pid": 76337, "tid": -914061504, "ts": 1716454218282095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218385550, "dur": 5, "args": { "External id": 56710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56710, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 56710, "pid": 5, "tid": 7, "ts": 1716454218385550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282112, "dur": 7, "args": { "External id": 56710, "cbid": 211, "correlation": 56710 } }, { "ph": "s", "id": 56710, "pid": 76337, "tid": -914061504, "ts": 1716454218282112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282173, "dur": 1, "args": { "External id": 56721, "cbid": 251, "correlation": 56721 } }, { "ph": "f", "id": 56721, "pid": 76337, "tid": -914061504, "ts": 1716454218282173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282177, "dur": 0, "args": { "External id": 56722, "cbid": 251, "correlation": 56722 } }, { "ph": "f", "id": 56722, "pid": 76337, "tid": -914061504, "ts": 1716454218282177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218385557, "dur": 6, "args": { "External id": 56723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56723, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 56723, "pid": 5, "tid": 7, "ts": 1716454218385557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282178, "dur": 15, "args": { "External id": 56723, "cbid": 211, "correlation": 56723 } }, { "ph": "s", "id": 56723, "pid": 76337, "tid": -914061504, "ts": 1716454218282178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218385564, "dur": 3, "args": { "External id": 56725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56725, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 56725, "pid": 5, "tid": 7, "ts": 1716454218385564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282195, "dur": 6, "args": { "External id": 56725, "cbid": 211, "correlation": 56725 } }, { "ph": "s", "id": 56725, "pid": 76337, "tid": -914061504, "ts": 1716454218282195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218385569, "dur": 155, "args": { "External id": 56746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56746, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56746, "pid": 5, "tid": 7, "ts": 1716454218385569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282268, "dur": 12, "args": { "External id": 56746, "cbid": 211, "correlation": 56746 } }, { "ph": "s", "id": 56746, "pid": 76337, "tid": -914061504, "ts": 1716454218282268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282363, "dur": 2, "args": { "External id": 56764, "cbid": 251, "correlation": 56764 } }, { "ph": "f", "id": 56764, "pid": 76337, "tid": -914061504, "ts": 1716454218282363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218385725, "dur": 106, "args": { "External id": 56766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56766, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56766, "pid": 5, "tid": 7, "ts": 1716454218385725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282369, "dur": 13, "args": { "External id": 56766, "cbid": 211, "correlation": 56766 } }, { "ph": "s", "id": 56766, "pid": 76337, "tid": -914061504, "ts": 1716454218282369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218385833, "dur": 34, "args": { "External id": 56774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56774, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56774, "pid": 5, "tid": 7, "ts": 1716454218385833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282439, "dur": 12, "args": { "External id": 56774, "cbid": 211, "correlation": 56774 } }, { "ph": "s", "id": 56774, "pid": 76337, "tid": -914061504, "ts": 1716454218282439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218385869, "dur": 67, "args": { "External id": 56782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56782, "pid": 5, "tid": 7, "ts": 1716454218385869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282480, "dur": 9, "args": { "External id": 56782, "cbid": 211, "correlation": 56782 } }, { "ph": "s", "id": 56782, "pid": 76337, "tid": -914061504, "ts": 1716454218282480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218385937, "dur": 91, "args": { "External id": 56804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56804, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56804, "pid": 5, "tid": 7, "ts": 1716454218385937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282535, "dur": 10, "args": { "External id": 56804, "cbid": 211, "correlation": 56804 } }, { "ph": "s", "id": 56804, "pid": 76337, "tid": -914061504, "ts": 1716454218282535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282622, "dur": 1, "args": { "External id": 56820, "cbid": 251, "correlation": 56820 } }, { "ph": "f", "id": 56820, "pid": 76337, "tid": -914061504, "ts": 1716454218282622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218386030, "dur": 575, "args": { "External id": 56822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56822, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56822, "pid": 5, "tid": 7, "ts": 1716454218386030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282628, "dur": 12, "args": { "External id": 56822, "cbid": 211, "correlation": 56822 } }, { "ph": "s", "id": 56822, "pid": 76337, "tid": -914061504, "ts": 1716454218282628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218386606, "dur": 243, "args": { "External id": 56830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56830, "pid": 5, "tid": 7, "ts": 1716454218386606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282693, "dur": 12, "args": { "External id": 56830, "cbid": 211, "correlation": 56830 } }, { "ph": "s", "id": 56830, "pid": 76337, "tid": -914061504, "ts": 1716454218282693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218386850, "dur": 252, "args": { "External id": 56838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56838, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56838, "pid": 5, "tid": 7, "ts": 1716454218386850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282724, "dur": 9, "args": { "External id": 56838, "cbid": 211, "correlation": 56838 } }, { "ph": "s", "id": 56838, "pid": 76337, "tid": -914061504, "ts": 1716454218282724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282806, "dur": 1, "args": { "External id": 56854, "cbid": 251, "correlation": 56854 } }, { "ph": "f", "id": 56854, "pid": 76337, "tid": -914061504, "ts": 1716454218282806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218282811, "dur": 0, "args": { "External id": 56856, "cbid": 251, "correlation": 56856 } }, { "ph": "f", "id": 56856, "pid": 76337, "tid": -914061504, "ts": 1716454218282811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218387104, "dur": 357, "args": { "External id": 56857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56857, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 56857, "pid": 5, "tid": 7, "ts": 1716454218387104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282814, "dur": 12, "args": { "External id": 56857, "cbid": 211, "correlation": 56857 } }, { "ph": "s", "id": 56857, "pid": 76337, "tid": -914061504, "ts": 1716454218282814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218387462, "dur": 50, "args": { "External id": 56865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56865, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56865, "pid": 5, "tid": 7, "ts": 1716454218387462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282856, "dur": 10, "args": { "External id": 56865, "cbid": 211, "correlation": 56865 } }, { "ph": "s", "id": 56865, "pid": 76337, "tid": -914061504, "ts": 1716454218282856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218387513, "dur": 158, "args": { "External id": 56876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56876, "pid": 5, "tid": 7, "ts": 1716454218387513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218282926, "dur": 13, "args": { "External id": 56876, "cbid": 211, "correlation": 56876 } }, { "ph": "s", "id": 56876, "pid": 76337, "tid": -914061504, "ts": 1716454218282926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218283001, "dur": 0, "args": { "External id": 56888, "cbid": 317, "correlation": 56888 } }, { "ph": "f", "id": 56888, "pid": 76337, "tid": -914061504, "ts": 1716454218283001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218283002, "dur": 0, "args": { "External id": 56889, "cbid": 203, "correlation": 56889 } }, { "ph": "f", "id": 56889, "pid": 76337, "tid": -914061504, "ts": 1716454218283002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218283003, "dur": 0, "args": { "External id": 56890, "cbid": 205, "correlation": 56890 } }, { "ph": "f", "id": 56890, "pid": 76337, "tid": -914061504, "ts": 1716454218283003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283027, "dur": 1, "args": { "External id": 56894, "cbid": 251, "correlation": 56894 } }, { "ph": "f", "id": 56894, "pid": 76337, "tid": -914061504, "ts": 1716454218283027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283029, "dur": 0, "args": { "External id": 56895, "cbid": 251, "correlation": 56895 } }, { "ph": "f", "id": 56895, "pid": 76337, "tid": -914061504, "ts": 1716454218283029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283029, "dur": 0, "args": { "External id": 56896, "cbid": 251, "correlation": 56896 } }, { "ph": "f", "id": 56896, "pid": 76337, "tid": -914061504, "ts": 1716454218283029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283030, "dur": 0, "args": { "External id": 56897, "cbid": 251, "correlation": 56897 } }, { "ph": "f", "id": 56897, "pid": 76337, "tid": -914061504, "ts": 1716454218283030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283031, "dur": 0, "args": { "External id": 56898, "cbid": 251, "correlation": 56898 } }, { "ph": "f", "id": 56898, "pid": 76337, "tid": -914061504, "ts": 1716454218283031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283032, "dur": 0, "args": { "External id": 56899, "cbid": 251, "correlation": 56899 } }, { "ph": "f", "id": 56899, "pid": 76337, "tid": -914061504, "ts": 1716454218283032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283033, "dur": 0, "args": { "External id": 56900, "cbid": 251, "correlation": 56900 } }, { "ph": "f", "id": 56900, "pid": 76337, "tid": -914061504, "ts": 1716454218283033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283034, "dur": 0, "args": { "External id": 56901, "cbid": 251, "correlation": 56901 } }, { "ph": "f", "id": 56901, "pid": 76337, "tid": -914061504, "ts": 1716454218283034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283035, "dur": 0, "args": { "External id": 56902, "cbid": 251, "correlation": 56902 } }, { "ph": "f", "id": 56902, "pid": 76337, "tid": -914061504, "ts": 1716454218283035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218387672, "dur": 114, "args": { "External id": 56903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56903, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 56903, "pid": 5, "tid": 7, "ts": 1716454218387672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283037, "dur": 14, "args": { "External id": 56903, "cbid": 211, "correlation": 56903 } }, { "ph": "s", "id": 56903, "pid": 76337, "tid": -914061504, "ts": 1716454218283037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218387788, "dur": 60, "args": { "External id": 56909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56909, "pid": 5, "tid": 7, "ts": 1716454218387788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283073, "dur": 9, "args": { "External id": 56909, "cbid": 211, "correlation": 56909 } }, { "ph": "s", "id": 56909, "pid": 76337, "tid": -914061504, "ts": 1716454218283073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218387849, "dur": 50, "args": { "External id": 56917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56917, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56917, "pid": 5, "tid": 7, "ts": 1716454218387849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283106, "dur": 8, "args": { "External id": 56917, "cbid": 211, "correlation": 56917 } }, { "ph": "s", "id": 56917, "pid": 76337, "tid": -914061504, "ts": 1716454218283106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218387900, "dur": 98, "args": { "External id": 56926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56926, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56926, "pid": 5, "tid": 7, "ts": 1716454218387900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283144, "dur": 11, "args": { "External id": 56926, "cbid": 211, "correlation": 56926 } }, { "ph": "s", "id": 56926, "pid": 76337, "tid": -914061504, "ts": 1716454218283144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218387999, "dur": 93, "args": { "External id": 56946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56946, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 56946, "pid": 5, "tid": 7, "ts": 1716454218387999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283218, "dur": 11, "args": { "External id": 56946, "cbid": 211, "correlation": 56946 } }, { "ph": "s", "id": 56946, "pid": 76337, "tid": -914061504, "ts": 1716454218283218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218388093, "dur": 5, "args": { "External id": 56958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56958, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 56958, "pid": 5, "tid": 7, "ts": 1716454218388093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283243, "dur": 7, "args": { "External id": 56958, "cbid": 211, "correlation": 56958 } }, { "ph": "s", "id": 56958, "pid": 76337, "tid": -914061504, "ts": 1716454218283243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218388099, "dur": 109, "args": { "External id": 56961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56961, "pid": 5, "tid": 7, "ts": 1716454218388099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283262, "dur": 6, "args": { "External id": 56961, "cbid": 211, "correlation": 56961 } }, { "ph": "s", "id": 56961, "pid": 76337, "tid": -914061504, "ts": 1716454218283262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218388210, "dur": 69, "args": { "External id": 56970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56970, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56970, "pid": 5, "tid": 7, "ts": 1716454218388210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283301, "dur": 10, "args": { "External id": 56970, "cbid": 211, "correlation": 56970 } }, { "ph": "s", "id": 56970, "pid": 76337, "tid": -914061504, "ts": 1716454218283301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218283354, "dur": 0, "args": { "External id": 56980, "cbid": 317, "correlation": 56980 } }, { "ph": "f", "id": 56980, "pid": 76337, "tid": -914061504, "ts": 1716454218283354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218283355, "dur": 0, "args": { "External id": 56981, "cbid": 203, "correlation": 56981 } }, { "ph": "f", "id": 56981, "pid": 76337, "tid": -914061504, "ts": 1716454218283355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218283356, "dur": 0, "args": { "External id": 56982, "cbid": 205, "correlation": 56982 } }, { "ph": "f", "id": 56982, "pid": 76337, "tid": -914061504, "ts": 1716454218283356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218388280, "dur": 76, "args": { "External id": 56986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56986, "pid": 5, "tid": 7, "ts": 1716454218388280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283372, "dur": 12, "args": { "External id": 56986, "cbid": 211, "correlation": 56986 } }, { "ph": "s", "id": 56986, "pid": 76337, "tid": -914061504, "ts": 1716454218283372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218388357, "dur": 24, "args": { "External id": 56988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56988, "pid": 5, "tid": 7, "ts": 1716454218388357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283386, "dur": 5, "args": { "External id": 56988, "cbid": 211, "correlation": 56988 } }, { "ph": "s", "id": 56988, "pid": 76337, "tid": -914061504, "ts": 1716454218283386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218388382, "dur": 4, "args": { "External id": 56990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 56990, "pid": 5, "tid": 7, "ts": 1716454218388382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283396, "dur": 6, "args": { "External id": 56990, "cbid": 211, "correlation": 56990 } }, { "ph": "s", "id": 56990, "pid": 76337, "tid": -914061504, "ts": 1716454218283396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218283404, "dur": 0, "args": { "External id": 56991, "cbid": 51, "correlation": 56991 } }, { "ph": "s", "id": 56991, "pid": 76337, "tid": -914061504, "ts": 1716454218283404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218388388, "dur": 1358, "args": { "External id": 56992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56992, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 56992, "pid": 5, "tid": 7, "ts": 1716454218388388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283405, "dur": 5, "args": { "External id": 56992, "cbid": 211, "correlation": 56992 } }, { "ph": "s", "id": 56992, "pid": 76337, "tid": -914061504, "ts": 1716454218283405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218389747, "dur": 59, "args": { "External id": 56997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 56997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 56997, "pid": 5, "tid": 7, "ts": 1716454218389747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283433, "dur": 8, "args": { "External id": 56997, "cbid": 211, "correlation": 56997 } }, { "ph": "s", "id": 56997, "pid": 76337, "tid": -914061504, "ts": 1716454218283433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218389807, "dur": 3, "args": { "External id": 57005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57005, "pid": 5, "tid": 7, "ts": 1716454218389807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283476, "dur": 9, "args": { "External id": 57005, "cbid": 211, "correlation": 57005 } }, { "ph": "s", "id": 57005, "pid": 76337, "tid": -914061504, "ts": 1716454218283476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283544, "dur": 1, "args": { "External id": 57021, "cbid": 251, "correlation": 57021 } }, { "ph": "f", "id": 57021, "pid": 76337, "tid": -914061504, "ts": 1716454218283544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218283549, "dur": 0, "args": { "External id": 57023, "cbid": 251, "correlation": 57023 } }, { "ph": "f", "id": 57023, "pid": 76337, "tid": -914061504, "ts": 1716454218283549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218389811, "dur": 11, "args": { "External id": 57024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57024, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 57024, "pid": 5, "tid": 7, "ts": 1716454218389811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283551, "dur": 12, "args": { "External id": 57024, "cbid": 211, "correlation": 57024 } }, { "ph": "s", "id": 57024, "pid": 76337, "tid": -914061504, "ts": 1716454218283551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218389824, "dur": 5, "args": { "External id": 57026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 57026, "pid": 5, "tid": 7, "ts": 1716454218389824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283564, "dur": 5, "args": { "External id": 57026, "cbid": 211, "correlation": 57026 } }, { "ph": "s", "id": 57026, "pid": 76337, "tid": -914061504, "ts": 1716454218283564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218389830, "dur": 55, "args": { "External id": 57036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57036, "pid": 5, "tid": 7, "ts": 1716454218389830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283621, "dur": 13, "args": { "External id": 57036, "cbid": 211, "correlation": 57036 } }, { "ph": "s", "id": 57036, "pid": 76337, "tid": -914061504, "ts": 1716454218283621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218389886, "dur": 51, "args": { "External id": 57056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57056, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 57056, "pid": 5, "tid": 7, "ts": 1716454218389886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283687, "dur": 11, "args": { "External id": 57056, "cbid": 211, "correlation": 57056 } }, { "ph": "s", "id": 57056, "pid": 76337, "tid": -914061504, "ts": 1716454218283687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218389939, "dur": 4, "args": { "External id": 57068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57068, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57068, "pid": 5, "tid": 7, "ts": 1716454218389939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283708, "dur": 6, "args": { "External id": 57068, "cbid": 211, "correlation": 57068 } }, { "ph": "s", "id": 57068, "pid": 76337, "tid": -914061504, "ts": 1716454218283708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218389944, "dur": 54, "args": { "External id": 57071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57071, "pid": 5, "tid": 7, "ts": 1716454218389944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283726, "dur": 6, "args": { "External id": 57071, "cbid": 211, "correlation": 57071 } }, { "ph": "s", "id": 57071, "pid": 76337, "tid": -914061504, "ts": 1716454218283726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218389999, "dur": 36, "args": { "External id": 57080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57080, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57080, "pid": 5, "tid": 7, "ts": 1716454218389999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283767, "dur": 10, "args": { "External id": 57080, "cbid": 211, "correlation": 57080 } }, { "ph": "s", "id": 57080, "pid": 76337, "tid": -914061504, "ts": 1716454218283767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218283832, "dur": 0, "args": { "External id": 57090, "cbid": 317, "correlation": 57090 } }, { "ph": "f", "id": 57090, "pid": 76337, "tid": -914061504, "ts": 1716454218283832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218283833, "dur": 0, "args": { "External id": 57091, "cbid": 203, "correlation": 57091 } }, { "ph": "f", "id": 57091, "pid": 76337, "tid": -914061504, "ts": 1716454218283833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218283834, "dur": 0, "args": { "External id": 57092, "cbid": 205, "correlation": 57092 } }, { "ph": "f", "id": 57092, "pid": 76337, "tid": -914061504, "ts": 1716454218283834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218390037, "dur": 41, "args": { "External id": 57096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57096, "pid": 5, "tid": 7, "ts": 1716454218390037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283849, "dur": 13, "args": { "External id": 57096, "cbid": 211, "correlation": 57096 } }, { "ph": "s", "id": 57096, "pid": 76337, "tid": -914061504, "ts": 1716454218283849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218390080, "dur": 15, "args": { "External id": 57098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57098, "pid": 5, "tid": 7, "ts": 1716454218390080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283864, "dur": 5, "args": { "External id": 57098, "cbid": 211, "correlation": 57098 } }, { "ph": "s", "id": 57098, "pid": 76337, "tid": -914061504, "ts": 1716454218283864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218390095, "dur": 3, "args": { "External id": 57100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57100, "pid": 5, "tid": 7, "ts": 1716454218390095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283874, "dur": 5, "args": { "External id": 57100, "cbid": 211, "correlation": 57100 } }, { "ph": "s", "id": 57100, "pid": 76337, "tid": -914061504, "ts": 1716454218283874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218283882, "dur": 0, "args": { "External id": 57101, "cbid": 51, "correlation": 57101 } }, { "ph": "s", "id": 57101, "pid": 76337, "tid": -914061504, "ts": 1716454218283882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218390100, "dur": 695, "args": { "External id": 57102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57102, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57102, "pid": 5, "tid": 7, "ts": 1716454218390100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283882, "dur": 5, "args": { "External id": 57102, "cbid": 211, "correlation": 57102 } }, { "ph": "s", "id": 57102, "pid": 76337, "tid": -914061504, "ts": 1716454218283882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218390797, "dur": 60, "args": { "External id": 57107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57107, "pid": 5, "tid": 7, "ts": 1716454218390797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283911, "dur": 8, "args": { "External id": 57107, "cbid": 211, "correlation": 57107 } }, { "ph": "s", "id": 57107, "pid": 76337, "tid": -914061504, "ts": 1716454218283911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218283968, "dur": 0, "args": { "External id": 57117, "cbid": 317, "correlation": 57117 } }, { "ph": "f", "id": 57117, "pid": 76337, "tid": -914061504, "ts": 1716454218283968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218283969, "dur": 0, "args": { "External id": 57118, "cbid": 203, "correlation": 57118 } }, { "ph": "f", "id": 57118, "pid": 76337, "tid": -914061504, "ts": 1716454218283969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218283970, "dur": 0, "args": { "External id": 57119, "cbid": 205, "correlation": 57119 } }, { "ph": "f", "id": 57119, "pid": 76337, "tid": -914061504, "ts": 1716454218283970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218390858, "dur": 75, "args": { "External id": 57123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57123, "pid": 5, "tid": 7, "ts": 1716454218390858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218283990, "dur": 12, "args": { "External id": 57123, "cbid": 211, "correlation": 57123 } }, { "ph": "s", "id": 57123, "pid": 76337, "tid": -914061504, "ts": 1716454218283990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218390934, "dur": 208, "args": { "External id": 57125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57125, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57125, "pid": 5, "tid": 7, "ts": 1716454218390934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284009, "dur": 9, "args": { "External id": 57125, "cbid": 211, "correlation": 57125 } }, { "ph": "s", "id": 57125, "pid": 76337, "tid": -914061504, "ts": 1716454218284009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218391143, "dur": 37, "args": { "External id": 57127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57127, "pid": 5, "tid": 7, "ts": 1716454218391143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284022, "dur": 5, "args": { "External id": 57127, "cbid": 211, "correlation": 57127 } }, { "ph": "s", "id": 57127, "pid": 76337, "tid": -914061504, "ts": 1716454218284022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218391182, "dur": 60, "args": { "External id": 57133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57133, "pid": 5, "tid": 7, "ts": 1716454218391182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284049, "dur": 9, "args": { "External id": 57133, "cbid": 211, "correlation": 57133 } }, { "ph": "s", "id": 57133, "pid": 76337, "tid": -914061504, "ts": 1716454218284049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218391243, "dur": 50, "args": { "External id": 57141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57141, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57141, "pid": 5, "tid": 7, "ts": 1716454218391243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284079, "dur": 8, "args": { "External id": 57141, "cbid": 211, "correlation": 57141 } }, { "ph": "s", "id": 57141, "pid": 76337, "tid": -914061504, "ts": 1716454218284079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218391294, "dur": 35, "args": { "External id": 57149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57149, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57149, "pid": 5, "tid": 7, "ts": 1716454218391294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284110, "dur": 9, "args": { "External id": 57149, "cbid": 211, "correlation": 57149 } }, { "ph": "s", "id": 57149, "pid": 76337, "tid": -914061504, "ts": 1716454218284110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218391330, "dur": 52, "args": { "External id": 57169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57169, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 57169, "pid": 5, "tid": 7, "ts": 1716454218391330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284193, "dur": 12, "args": { "External id": 57169, "cbid": 211, "correlation": 57169 } }, { "ph": "s", "id": 57169, "pid": 76337, "tid": -914061504, "ts": 1716454218284193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218391384, "dur": 4, "args": { "External id": 57181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57181, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57181, "pid": 5, "tid": 7, "ts": 1716454218391384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284215, "dur": 6, "args": { "External id": 57181, "cbid": 211, "correlation": 57181 } }, { "ph": "s", "id": 57181, "pid": 76337, "tid": -914061504, "ts": 1716454218284215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218391390, "dur": 55, "args": { "External id": 57184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57184, "pid": 5, "tid": 7, "ts": 1716454218391390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284233, "dur": 6, "args": { "External id": 57184, "cbid": 211, "correlation": 57184 } }, { "ph": "s", "id": 57184, "pid": 76337, "tid": -914061504, "ts": 1716454218284233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218284291, "dur": 0, "args": { "External id": 57195, "cbid": 317, "correlation": 57195 } }, { "ph": "f", "id": 57195, "pid": 76337, "tid": -914061504, "ts": 1716454218284291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218284291, "dur": 0, "args": { "External id": 57196, "cbid": 203, "correlation": 57196 } }, { "ph": "f", "id": 57196, "pid": 76337, "tid": -914061504, "ts": 1716454218284291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218284292, "dur": 0, "args": { "External id": 57197, "cbid": 205, "correlation": 57197 } }, { "ph": "f", "id": 57197, "pid": 76337, "tid": -914061504, "ts": 1716454218284292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284314, "dur": 1, "args": { "External id": 57201, "cbid": 251, "correlation": 57201 } }, { "ph": "f", "id": 57201, "pid": 76337, "tid": -914061504, "ts": 1716454218284314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284316, "dur": 0, "args": { "External id": 57202, "cbid": 251, "correlation": 57202 } }, { "ph": "f", "id": 57202, "pid": 76337, "tid": -914061504, "ts": 1716454218284316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284317, "dur": 0, "args": { "External id": 57203, "cbid": 251, "correlation": 57203 } }, { "ph": "f", "id": 57203, "pid": 76337, "tid": -914061504, "ts": 1716454218284317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284318, "dur": 0, "args": { "External id": 57204, "cbid": 251, "correlation": 57204 } }, { "ph": "f", "id": 57204, "pid": 76337, "tid": -914061504, "ts": 1716454218284318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284319, "dur": 0, "args": { "External id": 57205, "cbid": 251, "correlation": 57205 } }, { "ph": "f", "id": 57205, "pid": 76337, "tid": -914061504, "ts": 1716454218284319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284319, "dur": 0, "args": { "External id": 57206, "cbid": 251, "correlation": 57206 } }, { "ph": "f", "id": 57206, "pid": 76337, "tid": -914061504, "ts": 1716454218284319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284320, "dur": 0, "args": { "External id": 57207, "cbid": 251, "correlation": 57207 } }, { "ph": "f", "id": 57207, "pid": 76337, "tid": -914061504, "ts": 1716454218284320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284321, "dur": 0, "args": { "External id": 57208, "cbid": 251, "correlation": 57208 } }, { "ph": "f", "id": 57208, "pid": 76337, "tid": -914061504, "ts": 1716454218284321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284322, "dur": 0, "args": { "External id": 57209, "cbid": 251, "correlation": 57209 } }, { "ph": "f", "id": 57209, "pid": 76337, "tid": -914061504, "ts": 1716454218284322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218391446, "dur": 115, "args": { "External id": 57210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57210, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57210, "pid": 5, "tid": 7, "ts": 1716454218391446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284324, "dur": 13, "args": { "External id": 57210, "cbid": 211, "correlation": 57210 } }, { "ph": "s", "id": 57210, "pid": 76337, "tid": -914061504, "ts": 1716454218284324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218391562, "dur": 59, "args": { "External id": 57216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57216, "pid": 5, "tid": 7, "ts": 1716454218391562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284359, "dur": 9, "args": { "External id": 57216, "cbid": 211, "correlation": 57216 } }, { "ph": "s", "id": 57216, "pid": 76337, "tid": -914061504, "ts": 1716454218284359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218391623, "dur": 604, "args": { "External id": 57225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57225, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57225, "pid": 5, "tid": 7, "ts": 1716454218391623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284444, "dur": 14, "args": { "External id": 57225, "cbid": 211, "correlation": 57225 } }, { "ph": "s", "id": 57225, "pid": 76337, "tid": -914061504, "ts": 1716454218284444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218392228, "dur": 180, "args": { "External id": 57247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57247, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57247, "pid": 5, "tid": 7, "ts": 1716454218392228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284502, "dur": 10, "args": { "External id": 57247, "cbid": 211, "correlation": 57247 } }, { "ph": "s", "id": 57247, "pid": 76337, "tid": -914061504, "ts": 1716454218284502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284587, "dur": 1, "args": { "External id": 57258, "cbid": 251, "correlation": 57258 } }, { "ph": "f", "id": 57258, "pid": 76337, "tid": -914061504, "ts": 1716454218284587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218392409, "dur": 194, "args": { "External id": 57259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57259, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57259, "pid": 5, "tid": 7, "ts": 1716454218392409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284593, "dur": 13, "args": { "External id": 57259, "cbid": 211, "correlation": 57259 } }, { "ph": "s", "id": 57259, "pid": 76337, "tid": -914061504, "ts": 1716454218284593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284661, "dur": 1, "args": { "External id": 57270, "cbid": 251, "correlation": 57270 } }, { "ph": "f", "id": 57270, "pid": 76337, "tid": -914061504, "ts": 1716454218284661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218392604, "dur": 189, "args": { "External id": 57271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57271, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57271, "pid": 5, "tid": 7, "ts": 1716454218392604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284665, "dur": 11, "args": { "External id": 57271, "cbid": 211, "correlation": 57271 } }, { "ph": "s", "id": 57271, "pid": 76337, "tid": -914061504, "ts": 1716454218284665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284728, "dur": 1, "args": { "External id": 57282, "cbid": 251, "correlation": 57282 } }, { "ph": "f", "id": 57282, "pid": 76337, "tid": -914061504, "ts": 1716454218284728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218392795, "dur": 189, "args": { "External id": 57283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57283, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57283, "pid": 5, "tid": 7, "ts": 1716454218392795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284733, "dur": 11, "args": { "External id": 57283, "cbid": 211, "correlation": 57283 } }, { "ph": "s", "id": 57283, "pid": 76337, "tid": -914061504, "ts": 1716454218284733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218392986, "dur": 18553, "args": { "External id": 57304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57304, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57304, "pid": 5, "tid": 7, "ts": 1716454218392986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284816, "dur": 12, "args": { "External id": 57304, "cbid": 211, "correlation": 57304 } }, { "ph": "s", "id": 57304, "pid": 76337, "tid": -914061504, "ts": 1716454218284816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218284912, "dur": 1, "args": { "External id": 57322, "cbid": 251, "correlation": 57322 } }, { "ph": "f", "id": 57322, "pid": 76337, "tid": -914061504, "ts": 1716454218284912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218411540, "dur": 200, "args": { "External id": 57324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57324, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57324, "pid": 5, "tid": 7, "ts": 1716454218411540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284918, "dur": 14, "args": { "External id": 57324, "cbid": 211, "correlation": 57324 } }, { "ph": "s", "id": 57324, "pid": 76337, "tid": -914061504, "ts": 1716454218284918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218411742, "dur": 66, "args": { "External id": 57332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57332, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57332, "pid": 5, "tid": 7, "ts": 1716454218411742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218284996, "dur": 13, "args": { "External id": 57332, "cbid": 211, "correlation": 57332 } }, { "ph": "s", "id": 57332, "pid": 76337, "tid": -914061504, "ts": 1716454218284996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218411809, "dur": 97, "args": { "External id": 57340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57340, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57340, "pid": 5, "tid": 7, "ts": 1716454218411809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218285037, "dur": 9, "args": { "External id": 57340, "cbid": 211, "correlation": 57340 } }, { "ph": "s", "id": 57340, "pid": 76337, "tid": -914061504, "ts": 1716454218285037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218411908, "dur": 53, "args": { "External id": 57351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57351, "pid": 5, "tid": 7, "ts": 1716454218411908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218285110, "dur": 12, "args": { "External id": 57351, "cbid": 211, "correlation": 57351 } }, { "ph": "s", "id": 57351, "pid": 76337, "tid": -914061504, "ts": 1716454218285110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218411962, "dur": 92, "args": { "External id": 57373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57373, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57373, "pid": 5, "tid": 7, "ts": 1716454218411962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218285142, "dur": 751, "args": { "External id": 57373, "cbid": 211, "correlation": 57373 } }, { "ph": "s", "id": 57373, "pid": 76337, "tid": -914061504, "ts": 1716454218285142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218285972, "dur": 7, "args": { "External id": 57384, "cbid": 251, "correlation": 57384 } }, { "ph": "f", "id": 57384, "pid": 76337, "tid": -914061504, "ts": 1716454218285972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218412056, "dur": 104, "args": { "External id": 57385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57385, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57385, "pid": 5, "tid": 7, "ts": 1716454218412056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218285984, "dur": 55, "args": { "External id": 57385, "cbid": 211, "correlation": 57385 } }, { "ph": "s", "id": 57385, "pid": 76337, "tid": -914061504, "ts": 1716454218285984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286101, "dur": 1, "args": { "External id": 57396, "cbid": 251, "correlation": 57396 } }, { "ph": "f", "id": 57396, "pid": 76337, "tid": -914061504, "ts": 1716454218286101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286105, "dur": 0, "args": { "External id": 57397, "cbid": 251, "correlation": 57397 } }, { "ph": "f", "id": 57397, "pid": 76337, "tid": -914061504, "ts": 1716454218286105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218412161, "dur": 10, "args": { "External id": 57398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57398, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 57398, "pid": 5, "tid": 7, "ts": 1716454218412161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286106, "dur": 12, "args": { "External id": 57398, "cbid": 211, "correlation": 57398 } }, { "ph": "s", "id": 57398, "pid": 76337, "tid": -914061504, "ts": 1716454218286106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218412172, "dur": 5, "args": { "External id": 57400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57400, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 57400, "pid": 5, "tid": 7, "ts": 1716454218412172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286121, "dur": 6, "args": { "External id": 57400, "cbid": 211, "correlation": 57400 } }, { "ph": "s", "id": 57400, "pid": 76337, "tid": -914061504, "ts": 1716454218286121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286181, "dur": 1, "args": { "External id": 57411, "cbid": 251, "correlation": 57411 } }, { "ph": "f", "id": 57411, "pid": 76337, "tid": -914061504, "ts": 1716454218286181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286185, "dur": 0, "args": { "External id": 57412, "cbid": 251, "correlation": 57412 } }, { "ph": "f", "id": 57412, "pid": 76337, "tid": -914061504, "ts": 1716454218286185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218412178, "dur": 6, "args": { "External id": 57413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57413, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 57413, "pid": 5, "tid": 7, "ts": 1716454218412178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286186, "dur": 12, "args": { "External id": 57413, "cbid": 211, "correlation": 57413 } }, { "ph": "s", "id": 57413, "pid": 76337, "tid": -914061504, "ts": 1716454218286186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218412186, "dur": 3, "args": { "External id": 57415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57415, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 57415, "pid": 5, "tid": 7, "ts": 1716454218412186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286200, "dur": 6, "args": { "External id": 57415, "cbid": 211, "correlation": 57415 } }, { "ph": "s", "id": 57415, "pid": 76337, "tid": -914061504, "ts": 1716454218286200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218412191, "dur": 155, "args": { "External id": 57436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57436, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57436, "pid": 5, "tid": 7, "ts": 1716454218412191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286273, "dur": 13, "args": { "External id": 57436, "cbid": 211, "correlation": 57436 } }, { "ph": "s", "id": 57436, "pid": 76337, "tid": -914061504, "ts": 1716454218286273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286372, "dur": 1, "args": { "External id": 57454, "cbid": 251, "correlation": 57454 } }, { "ph": "f", "id": 57454, "pid": 76337, "tid": -914061504, "ts": 1716454218286372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218412347, "dur": 107, "args": { "External id": 57456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57456, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57456, "pid": 5, "tid": 7, "ts": 1716454218412347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286378, "dur": 13, "args": { "External id": 57456, "cbid": 211, "correlation": 57456 } }, { "ph": "s", "id": 57456, "pid": 76337, "tid": -914061504, "ts": 1716454218286378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218412455, "dur": 34, "args": { "External id": 57464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57464, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57464, "pid": 5, "tid": 7, "ts": 1716454218412455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286448, "dur": 12, "args": { "External id": 57464, "cbid": 211, "correlation": 57464 } }, { "ph": "s", "id": 57464, "pid": 76337, "tid": -914061504, "ts": 1716454218286448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218412491, "dur": 66, "args": { "External id": 57472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57472, "pid": 5, "tid": 7, "ts": 1716454218412491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286488, "dur": 9, "args": { "External id": 57472, "cbid": 211, "correlation": 57472 } }, { "ph": "s", "id": 57472, "pid": 76337, "tid": -914061504, "ts": 1716454218286488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218412558, "dur": 92, "args": { "External id": 57494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57494, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57494, "pid": 5, "tid": 7, "ts": 1716454218412558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286539, "dur": 10, "args": { "External id": 57494, "cbid": 211, "correlation": 57494 } }, { "ph": "s", "id": 57494, "pid": 76337, "tid": -914061504, "ts": 1716454218286539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286624, "dur": 1, "args": { "External id": 57510, "cbid": 251, "correlation": 57510 } }, { "ph": "f", "id": 57510, "pid": 76337, "tid": -914061504, "ts": 1716454218286624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218412651, "dur": 570, "args": { "External id": 57512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57512, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57512, "pid": 5, "tid": 7, "ts": 1716454218412651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286630, "dur": 12, "args": { "External id": 57512, "cbid": 211, "correlation": 57512 } }, { "ph": "s", "id": 57512, "pid": 76337, "tid": -914061504, "ts": 1716454218286630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218413223, "dur": 242, "args": { "External id": 57520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57520, "pid": 5, "tid": 7, "ts": 1716454218413223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286694, "dur": 13, "args": { "External id": 57520, "cbid": 211, "correlation": 57520 } }, { "ph": "s", "id": 57520, "pid": 76337, "tid": -914061504, "ts": 1716454218286694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218413466, "dur": 251, "args": { "External id": 57528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57528, "pid": 5, "tid": 7, "ts": 1716454218413466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286725, "dur": 12, "args": { "External id": 57528, "cbid": 211, "correlation": 57528 } }, { "ph": "s", "id": 57528, "pid": 76337, "tid": -914061504, "ts": 1716454218286725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286811, "dur": 1, "args": { "External id": 57544, "cbid": 251, "correlation": 57544 } }, { "ph": "f", "id": 57544, "pid": 76337, "tid": -914061504, "ts": 1716454218286811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218286816, "dur": 0, "args": { "External id": 57546, "cbid": 251, "correlation": 57546 } }, { "ph": "f", "id": 57546, "pid": 76337, "tid": -914061504, "ts": 1716454218286816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218413718, "dur": 359, "args": { "External id": 57547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57547, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 57547, "pid": 5, "tid": 7, "ts": 1716454218413718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286819, "dur": 13, "args": { "External id": 57547, "cbid": 211, "correlation": 57547 } }, { "ph": "s", "id": 57547, "pid": 76337, "tid": -914061504, "ts": 1716454218286819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218414079, "dur": 50, "args": { "External id": 57555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57555, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57555, "pid": 5, "tid": 7, "ts": 1716454218414079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286861, "dur": 10, "args": { "External id": 57555, "cbid": 211, "correlation": 57555 } }, { "ph": "s", "id": 57555, "pid": 76337, "tid": -914061504, "ts": 1716454218286861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218414130, "dur": 158, "args": { "External id": 57566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57566, "pid": 5, "tid": 7, "ts": 1716454218414130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218286928, "dur": 3156, "args": { "External id": 57566, "cbid": 211, "correlation": 57566 } }, { "ph": "s", "id": 57566, "pid": 76337, "tid": -914061504, "ts": 1716454218286928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218290216, "dur": 0, "args": { "External id": 57578, "cbid": 317, "correlation": 57578 } }, { "ph": "f", "id": 57578, "pid": 76337, "tid": -914061504, "ts": 1716454218290216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218290218, "dur": 1, "args": { "External id": 57579, "cbid": 203, "correlation": 57579 } }, { "ph": "f", "id": 57579, "pid": 76337, "tid": -914061504, "ts": 1716454218290218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218290219, "dur": 1, "args": { "External id": 57580, "cbid": 205, "correlation": 57580 } }, { "ph": "f", "id": 57580, "pid": 76337, "tid": -914061504, "ts": 1716454218290219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290268, "dur": 2, "args": { "External id": 57584, "cbid": 251, "correlation": 57584 } }, { "ph": "f", "id": 57584, "pid": 76337, "tid": -914061504, "ts": 1716454218290268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290272, "dur": 0, "args": { "External id": 57585, "cbid": 251, "correlation": 57585 } }, { "ph": "f", "id": 57585, "pid": 76337, "tid": -914061504, "ts": 1716454218290272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290273, "dur": 0, "args": { "External id": 57586, "cbid": 251, "correlation": 57586 } }, { "ph": "f", "id": 57586, "pid": 76337, "tid": -914061504, "ts": 1716454218290273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290274, "dur": 0, "args": { "External id": 57587, "cbid": 251, "correlation": 57587 } }, { "ph": "f", "id": 57587, "pid": 76337, "tid": -914061504, "ts": 1716454218290274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290275, "dur": 1, "args": { "External id": 57588, "cbid": 251, "correlation": 57588 } }, { "ph": "f", "id": 57588, "pid": 76337, "tid": -914061504, "ts": 1716454218290275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290277, "dur": 1, "args": { "External id": 57589, "cbid": 251, "correlation": 57589 } }, { "ph": "f", "id": 57589, "pid": 76337, "tid": -914061504, "ts": 1716454218290277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290278, "dur": 0, "args": { "External id": 57590, "cbid": 251, "correlation": 57590 } }, { "ph": "f", "id": 57590, "pid": 76337, "tid": -914061504, "ts": 1716454218290278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290280, "dur": 1, "args": { "External id": 57591, "cbid": 251, "correlation": 57591 } }, { "ph": "f", "id": 57591, "pid": 76337, "tid": -914061504, "ts": 1716454218290280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290282, "dur": 0, "args": { "External id": 57592, "cbid": 251, "correlation": 57592 } }, { "ph": "f", "id": 57592, "pid": 76337, "tid": -914061504, "ts": 1716454218290282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218414289, "dur": 115, "args": { "External id": 57593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57593, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57593, "pid": 5, "tid": 7, "ts": 1716454218414289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290285, "dur": 21, "args": { "External id": 57593, "cbid": 211, "correlation": 57593 } }, { "ph": "s", "id": 57593, "pid": 76337, "tid": -914061504, "ts": 1716454218290285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218414405, "dur": 59, "args": { "External id": 57599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57599, "pid": 5, "tid": 7, "ts": 1716454218414405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290346, "dur": 12, "args": { "External id": 57599, "cbid": 211, "correlation": 57599 } }, { "ph": "s", "id": 57599, "pid": 76337, "tid": -914061504, "ts": 1716454218290346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218414466, "dur": 49, "args": { "External id": 57607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57607, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57607, "pid": 5, "tid": 7, "ts": 1716454218414466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290389, "dur": 12, "args": { "External id": 57607, "cbid": 211, "correlation": 57607 } }, { "ph": "s", "id": 57607, "pid": 76337, "tid": -914061504, "ts": 1716454218290389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218414517, "dur": 98, "args": { "External id": 57616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57616, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57616, "pid": 5, "tid": 7, "ts": 1716454218414517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290448, "dur": 13, "args": { "External id": 57616, "cbid": 211, "correlation": 57616 } }, { "ph": "s", "id": 57616, "pid": 76337, "tid": -914061504, "ts": 1716454218290448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218414616, "dur": 91, "args": { "External id": 57636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57636, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 57636, "pid": 5, "tid": 7, "ts": 1716454218414616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290557, "dur": 13, "args": { "External id": 57636, "cbid": 211, "correlation": 57636 } }, { "ph": "s", "id": 57636, "pid": 76337, "tid": -914061504, "ts": 1716454218290557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218414709, "dur": 5, "args": { "External id": 57648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57648, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 57648, "pid": 5, "tid": 7, "ts": 1716454218414709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290585, "dur": 9, "args": { "External id": 57648, "cbid": 211, "correlation": 57648 } }, { "ph": "s", "id": 57648, "pid": 76337, "tid": -914061504, "ts": 1716454218290585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218414715, "dur": 108, "args": { "External id": 57651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57651, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57651, "pid": 5, "tid": 7, "ts": 1716454218414715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290609, "dur": 8, "args": { "External id": 57651, "cbid": 211, "correlation": 57651 } }, { "ph": "s", "id": 57651, "pid": 76337, "tid": -914061504, "ts": 1716454218290609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218414824, "dur": 69, "args": { "External id": 57660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57660, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57660, "pid": 5, "tid": 7, "ts": 1716454218414824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290659, "dur": 12, "args": { "External id": 57660, "cbid": 211, "correlation": 57660 } }, { "ph": "s", "id": 57660, "pid": 76337, "tid": -914061504, "ts": 1716454218290659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218290714, "dur": 0, "args": { "External id": 57670, "cbid": 317, "correlation": 57670 } }, { "ph": "f", "id": 57670, "pid": 76337, "tid": -914061504, "ts": 1716454218290714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218290715, "dur": 0, "args": { "External id": 57671, "cbid": 203, "correlation": 57671 } }, { "ph": "f", "id": 57671, "pid": 76337, "tid": -914061504, "ts": 1716454218290715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218290715, "dur": 0, "args": { "External id": 57672, "cbid": 205, "correlation": 57672 } }, { "ph": "f", "id": 57672, "pid": 76337, "tid": -914061504, "ts": 1716454218290715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218414894, "dur": 76, "args": { "External id": 57676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57676, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57676, "pid": 5, "tid": 7, "ts": 1716454218414894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290739, "dur": 14, "args": { "External id": 57676, "cbid": 211, "correlation": 57676 } }, { "ph": "s", "id": 57676, "pid": 76337, "tid": -914061504, "ts": 1716454218290739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218414972, "dur": 24, "args": { "External id": 57678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57678, "pid": 5, "tid": 7, "ts": 1716454218414972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290756, "dur": 5, "args": { "External id": 57678, "cbid": 211, "correlation": 57678 } }, { "ph": "s", "id": 57678, "pid": 76337, "tid": -914061504, "ts": 1716454218290756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218414997, "dur": 4, "args": { "External id": 57680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57680, "pid": 5, "tid": 7, "ts": 1716454218414997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290771, "dur": 11, "args": { "External id": 57680, "cbid": 211, "correlation": 57680 } }, { "ph": "s", "id": 57680, "pid": 76337, "tid": -914061504, "ts": 1716454218290771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218290787, "dur": 0, "args": { "External id": 57681, "cbid": 51, "correlation": 57681 } }, { "ph": "s", "id": 57681, "pid": 76337, "tid": -914061504, "ts": 1716454218290787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218415002, "dur": 1357, "args": { "External id": 57682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57682, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57682, "pid": 5, "tid": 7, "ts": 1716454218415002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290789, "dur": 7, "args": { "External id": 57682, "cbid": 211, "correlation": 57682 } }, { "ph": "s", "id": 57682, "pid": 76337, "tid": -914061504, "ts": 1716454218290789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218416360, "dur": 58, "args": { "External id": 57687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57687, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57687, "pid": 5, "tid": 7, "ts": 1716454218416360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290818, "dur": 9, "args": { "External id": 57687, "cbid": 211, "correlation": 57687 } }, { "ph": "s", "id": 57687, "pid": 76337, "tid": -914061504, "ts": 1716454218290818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218416420, "dur": 4, "args": { "External id": 57695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57695, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57695, "pid": 5, "tid": 7, "ts": 1716454218416420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290862, "dur": 9, "args": { "External id": 57695, "cbid": 211, "correlation": 57695 } }, { "ph": "s", "id": 57695, "pid": 76337, "tid": -914061504, "ts": 1716454218290862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290960, "dur": 2, "args": { "External id": 57711, "cbid": 251, "correlation": 57711 } }, { "ph": "f", "id": 57711, "pid": 76337, "tid": -914061504, "ts": 1716454218290960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218290966, "dur": 0, "args": { "External id": 57713, "cbid": 251, "correlation": 57713 } }, { "ph": "f", "id": 57713, "pid": 76337, "tid": -914061504, "ts": 1716454218290966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218416425, "dur": 11, "args": { "External id": 57714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57714, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 57714, "pid": 5, "tid": 7, "ts": 1716454218416425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290968, "dur": 23, "args": { "External id": 57714, "cbid": 211, "correlation": 57714 } }, { "ph": "s", "id": 57714, "pid": 76337, "tid": -914061504, "ts": 1716454218290968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218416438, "dur": 5, "args": { "External id": 57716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57716, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 57716, "pid": 5, "tid": 7, "ts": 1716454218416438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218290994, "dur": 8, "args": { "External id": 57716, "cbid": 211, "correlation": 57716 } }, { "ph": "s", "id": 57716, "pid": 76337, "tid": -914061504, "ts": 1716454218290994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218416444, "dur": 55, "args": { "External id": 57726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57726, "pid": 5, "tid": 7, "ts": 1716454218416444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291069, "dur": 13, "args": { "External id": 57726, "cbid": 211, "correlation": 57726 } }, { "ph": "s", "id": 57726, "pid": 76337, "tid": -914061504, "ts": 1716454218291069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218416500, "dur": 53, "args": { "External id": 57746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57746, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 57746, "pid": 5, "tid": 7, "ts": 1716454218416500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291140, "dur": 11, "args": { "External id": 57746, "cbid": 211, "correlation": 57746 } }, { "ph": "s", "id": 57746, "pid": 76337, "tid": -914061504, "ts": 1716454218291140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218416554, "dur": 4, "args": { "External id": 57758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57758, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57758, "pid": 5, "tid": 7, "ts": 1716454218416554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291161, "dur": 6, "args": { "External id": 57758, "cbid": 211, "correlation": 57758 } }, { "ph": "s", "id": 57758, "pid": 76337, "tid": -914061504, "ts": 1716454218291161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218416560, "dur": 56, "args": { "External id": 57761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57761, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57761, "pid": 5, "tid": 7, "ts": 1716454218416560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291184, "dur": 6, "args": { "External id": 57761, "cbid": 211, "correlation": 57761 } }, { "ph": "s", "id": 57761, "pid": 76337, "tid": -914061504, "ts": 1716454218291184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218416616, "dur": 36, "args": { "External id": 57770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57770, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57770, "pid": 5, "tid": 7, "ts": 1716454218416616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291225, "dur": 11, "args": { "External id": 57770, "cbid": 211, "correlation": 57770 } }, { "ph": "s", "id": 57770, "pid": 76337, "tid": -914061504, "ts": 1716454218291225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218291294, "dur": 0, "args": { "External id": 57780, "cbid": 317, "correlation": 57780 } }, { "ph": "f", "id": 57780, "pid": 76337, "tid": -914061504, "ts": 1716454218291294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218291295, "dur": 0, "args": { "External id": 57781, "cbid": 203, "correlation": 57781 } }, { "ph": "f", "id": 57781, "pid": 76337, "tid": -914061504, "ts": 1716454218291295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218291296, "dur": 0, "args": { "External id": 57782, "cbid": 205, "correlation": 57782 } }, { "ph": "f", "id": 57782, "pid": 76337, "tid": -914061504, "ts": 1716454218291296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218416654, "dur": 40, "args": { "External id": 57786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57786, "pid": 5, "tid": 7, "ts": 1716454218416654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291313, "dur": 12, "args": { "External id": 57786, "cbid": 211, "correlation": 57786 } }, { "ph": "s", "id": 57786, "pid": 76337, "tid": -914061504, "ts": 1716454218291313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218416695, "dur": 15, "args": { "External id": 57788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57788, "pid": 5, "tid": 7, "ts": 1716454218416695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291327, "dur": 6, "args": { "External id": 57788, "cbid": 211, "correlation": 57788 } }, { "ph": "s", "id": 57788, "pid": 76337, "tid": -914061504, "ts": 1716454218291327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218416711, "dur": 3, "args": { "External id": 57790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57790, "pid": 5, "tid": 7, "ts": 1716454218416711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291338, "dur": 5, "args": { "External id": 57790, "cbid": 211, "correlation": 57790 } }, { "ph": "s", "id": 57790, "pid": 76337, "tid": -914061504, "ts": 1716454218291338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218291346, "dur": 0, "args": { "External id": 57791, "cbid": 51, "correlation": 57791 } }, { "ph": "s", "id": 57791, "pid": 76337, "tid": -914061504, "ts": 1716454218291346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218416715, "dur": 694, "args": { "External id": 57792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57792, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57792, "pid": 5, "tid": 7, "ts": 1716454218416715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291347, "dur": 5, "args": { "External id": 57792, "cbid": 211, "correlation": 57792 } }, { "ph": "s", "id": 57792, "pid": 76337, "tid": -914061504, "ts": 1716454218291347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218417411, "dur": 59, "args": { "External id": 57797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57797, "pid": 5, "tid": 7, "ts": 1716454218417411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291375, "dur": 9, "args": { "External id": 57797, "cbid": 211, "correlation": 57797 } }, { "ph": "s", "id": 57797, "pid": 76337, "tid": -914061504, "ts": 1716454218291375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218291437, "dur": 0, "args": { "External id": 57807, "cbid": 317, "correlation": 57807 } }, { "ph": "f", "id": 57807, "pid": 76337, "tid": -914061504, "ts": 1716454218291437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218291437, "dur": 0, "args": { "External id": 57808, "cbid": 203, "correlation": 57808 } }, { "ph": "f", "id": 57808, "pid": 76337, "tid": -914061504, "ts": 1716454218291437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218291438, "dur": 0, "args": { "External id": 57809, "cbid": 205, "correlation": 57809 } }, { "ph": "f", "id": 57809, "pid": 76337, "tid": -914061504, "ts": 1716454218291438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218417471, "dur": 76, "args": { "External id": 57813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57813, "pid": 5, "tid": 7, "ts": 1716454218417471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291452, "dur": 12, "args": { "External id": 57813, "cbid": 211, "correlation": 57813 } }, { "ph": "s", "id": 57813, "pid": 76337, "tid": -914061504, "ts": 1716454218291452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454218417549, "dur": 209, "args": { "External id": 57815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57815, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57815, "pid": 5, "tid": 7, "ts": 1716454218417549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291475, "dur": 9, "args": { "External id": 57815, "cbid": 211, "correlation": 57815 } }, { "ph": "s", "id": 57815, "pid": 76337, "tid": -914061504, "ts": 1716454218291475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218417759, "dur": 39, "args": { "External id": 57817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57817, "pid": 5, "tid": 7, "ts": 1716454218417759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291490, "dur": 6, "args": { "External id": 57817, "cbid": 211, "correlation": 57817 } }, { "ph": "s", "id": 57817, "pid": 76337, "tid": -914061504, "ts": 1716454218291490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218417800, "dur": 59, "args": { "External id": 57823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57823, "pid": 5, "tid": 7, "ts": 1716454218417800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291517, "dur": 8, "args": { "External id": 57823, "cbid": 211, "correlation": 57823 } }, { "ph": "s", "id": 57823, "pid": 76337, "tid": -914061504, "ts": 1716454218291517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218417861, "dur": 50, "args": { "External id": 57831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57831, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57831, "pid": 5, "tid": 7, "ts": 1716454218417861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291546, "dur": 8, "args": { "External id": 57831, "cbid": 211, "correlation": 57831 } }, { "ph": "s", "id": 57831, "pid": 76337, "tid": -914061504, "ts": 1716454218291546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218417912, "dur": 35, "args": { "External id": 57839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57839, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57839, "pid": 5, "tid": 7, "ts": 1716454218417912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291584, "dur": 15, "args": { "External id": 57839, "cbid": 211, "correlation": 57839 } }, { "ph": "s", "id": 57839, "pid": 76337, "tid": -914061504, "ts": 1716454218291584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218417948, "dur": 52, "args": { "External id": 57859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57859, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 57859, "pid": 5, "tid": 7, "ts": 1716454218417948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291677, "dur": 13, "args": { "External id": 57859, "cbid": 211, "correlation": 57859 } }, { "ph": "s", "id": 57859, "pid": 76337, "tid": -914061504, "ts": 1716454218291677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218418002, "dur": 4, "args": { "External id": 57871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57871, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 57871, "pid": 5, "tid": 7, "ts": 1716454218418002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291701, "dur": 6, "args": { "External id": 57871, "cbid": 211, "correlation": 57871 } }, { "ph": "s", "id": 57871, "pid": 76337, "tid": -914061504, "ts": 1716454218291701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218418007, "dur": 56, "args": { "External id": 57874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57874, "pid": 5, "tid": 7, "ts": 1716454218418007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291721, "dur": 7, "args": { "External id": 57874, "cbid": 211, "correlation": 57874 } }, { "ph": "s", "id": 57874, "pid": 76337, "tid": -914061504, "ts": 1716454218291721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218291779, "dur": 0, "args": { "External id": 57885, "cbid": 317, "correlation": 57885 } }, { "ph": "f", "id": 57885, "pid": 76337, "tid": -914061504, "ts": 1716454218291779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218291780, "dur": 0, "args": { "External id": 57886, "cbid": 203, "correlation": 57886 } }, { "ph": "f", "id": 57886, "pid": 76337, "tid": -914061504, "ts": 1716454218291780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218291781, "dur": 0, "args": { "External id": 57887, "cbid": 205, "correlation": 57887 } }, { "ph": "f", "id": 57887, "pid": 76337, "tid": -914061504, "ts": 1716454218291781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291802, "dur": 1, "args": { "External id": 57891, "cbid": 251, "correlation": 57891 } }, { "ph": "f", "id": 57891, "pid": 76337, "tid": -914061504, "ts": 1716454218291802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291804, "dur": 0, "args": { "External id": 57892, "cbid": 251, "correlation": 57892 } }, { "ph": "f", "id": 57892, "pid": 76337, "tid": -914061504, "ts": 1716454218291804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291805, "dur": 0, "args": { "External id": 57893, "cbid": 251, "correlation": 57893 } }, { "ph": "f", "id": 57893, "pid": 76337, "tid": -914061504, "ts": 1716454218291805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291806, "dur": 0, "args": { "External id": 57894, "cbid": 251, "correlation": 57894 } }, { "ph": "f", "id": 57894, "pid": 76337, "tid": -914061504, "ts": 1716454218291806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291806, "dur": 0, "args": { "External id": 57895, "cbid": 251, "correlation": 57895 } }, { "ph": "f", "id": 57895, "pid": 76337, "tid": -914061504, "ts": 1716454218291806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291807, "dur": 0, "args": { "External id": 57896, "cbid": 251, "correlation": 57896 } }, { "ph": "f", "id": 57896, "pid": 76337, "tid": -914061504, "ts": 1716454218291807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291808, "dur": 0, "args": { "External id": 57897, "cbid": 251, "correlation": 57897 } }, { "ph": "f", "id": 57897, "pid": 76337, "tid": -914061504, "ts": 1716454218291808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291809, "dur": 0, "args": { "External id": 57898, "cbid": 251, "correlation": 57898 } }, { "ph": "f", "id": 57898, "pid": 76337, "tid": -914061504, "ts": 1716454218291809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218291810, "dur": 0, "args": { "External id": 57899, "cbid": 251, "correlation": 57899 } }, { "ph": "f", "id": 57899, "pid": 76337, "tid": -914061504, "ts": 1716454218291810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218418064, "dur": 115, "args": { "External id": 57900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57900, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57900, "pid": 5, "tid": 7, "ts": 1716454218418064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291812, "dur": 12, "args": { "External id": 57900, "cbid": 211, "correlation": 57900 } }, { "ph": "s", "id": 57900, "pid": 76337, "tid": -914061504, "ts": 1716454218291812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218418180, "dur": 59, "args": { "External id": 57906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57906, "pid": 5, "tid": 7, "ts": 1716454218418180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291847, "dur": 9, "args": { "External id": 57906, "cbid": 211, "correlation": 57906 } }, { "ph": "s", "id": 57906, "pid": 76337, "tid": -914061504, "ts": 1716454218291847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218418240, "dur": 618, "args": { "External id": 57915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57915, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57915, "pid": 5, "tid": 7, "ts": 1716454218418240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218291947, "dur": 16, "args": { "External id": 57915, "cbid": 211, "correlation": 57915 } }, { "ph": "s", "id": 57915, "pid": 76337, "tid": -914061504, "ts": 1716454218291947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218418860, "dur": 180, "args": { "External id": 57937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57937, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 57937, "pid": 5, "tid": 7, "ts": 1716454218418860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292026, "dur": 47, "args": { "External id": 57937, "cbid": 211, "correlation": 57937 } }, { "ph": "s", "id": 57937, "pid": 76337, "tid": -914061504, "ts": 1716454218292026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292178, "dur": 2, "args": { "External id": 57948, "cbid": 251, "correlation": 57948 } }, { "ph": "f", "id": 57948, "pid": 76337, "tid": -914061504, "ts": 1716454218292178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218419041, "dur": 197, "args": { "External id": 57949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57949, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57949, "pid": 5, "tid": 7, "ts": 1716454218419041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292184, "dur": 17, "args": { "External id": 57949, "cbid": 211, "correlation": 57949 } }, { "ph": "s", "id": 57949, "pid": 76337, "tid": -914061504, "ts": 1716454218292184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292259, "dur": 1, "args": { "External id": 57960, "cbid": 251, "correlation": 57960 } }, { "ph": "f", "id": 57960, "pid": 76337, "tid": -914061504, "ts": 1716454218292259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218419240, "dur": 187, "args": { "External id": 57961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57961, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57961, "pid": 5, "tid": 7, "ts": 1716454218419240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292263, "dur": 12, "args": { "External id": 57961, "cbid": 211, "correlation": 57961 } }, { "ph": "s", "id": 57961, "pid": 76337, "tid": -914061504, "ts": 1716454218292263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292326, "dur": 1, "args": { "External id": 57972, "cbid": 251, "correlation": 57972 } }, { "ph": "f", "id": 57972, "pid": 76337, "tid": -914061504, "ts": 1716454218292326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218419428, "dur": 188, "args": { "External id": 57973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57973, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 57973, "pid": 5, "tid": 7, "ts": 1716454218419428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292330, "dur": 11, "args": { "External id": 57973, "cbid": 211, "correlation": 57973 } }, { "ph": "s", "id": 57973, "pid": 76337, "tid": -914061504, "ts": 1716454218292330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218419617, "dur": 18515, "args": { "External id": 57994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 57994, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 57994, "pid": 5, "tid": 7, "ts": 1716454218419617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292434, "dur": 15, "args": { "External id": 57994, "cbid": 211, "correlation": 57994 } }, { "ph": "s", "id": 57994, "pid": 76337, "tid": -914061504, "ts": 1716454218292434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292554, "dur": 2, "args": { "External id": 58012, "cbid": 251, "correlation": 58012 } }, { "ph": "f", "id": 58012, "pid": 76337, "tid": -914061504, "ts": 1716454218292554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218438133, "dur": 201, "args": { "External id": 58014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58014, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 58014, "pid": 5, "tid": 7, "ts": 1716454218438133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292560, "dur": 13, "args": { "External id": 58014, "cbid": 211, "correlation": 58014 } }, { "ph": "s", "id": 58014, "pid": 76337, "tid": -914061504, "ts": 1716454218292560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218438336, "dur": 66, "args": { "External id": 58022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58022, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58022, "pid": 5, "tid": 7, "ts": 1716454218438336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292630, "dur": 12, "args": { "External id": 58022, "cbid": 211, "correlation": 58022 } }, { "ph": "s", "id": 58022, "pid": 76337, "tid": -914061504, "ts": 1716454218292630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218438404, "dur": 97, "args": { "External id": 58030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58030, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58030, "pid": 5, "tid": 7, "ts": 1716454218438404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292674, "dur": 10, "args": { "External id": 58030, "cbid": 211, "correlation": 58030 } }, { "ph": "s", "id": 58030, "pid": 76337, "tid": -914061504, "ts": 1716454218292674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218438502, "dur": 54, "args": { "External id": 58041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58041, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58041, "pid": 5, "tid": 7, "ts": 1716454218438502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292765, "dur": 13, "args": { "External id": 58041, "cbid": 211, "correlation": 58041 } }, { "ph": "s", "id": 58041, "pid": 76337, "tid": -914061504, "ts": 1716454218292765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218438557, "dur": 91, "args": { "External id": 58063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58063, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58063, "pid": 5, "tid": 7, "ts": 1716454218438557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292800, "dur": 9, "args": { "External id": 58063, "cbid": 211, "correlation": 58063 } }, { "ph": "s", "id": 58063, "pid": 76337, "tid": -914061504, "ts": 1716454218292800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292885, "dur": 1, "args": { "External id": 58074, "cbid": 251, "correlation": 58074 } }, { "ph": "f", "id": 58074, "pid": 76337, "tid": -914061504, "ts": 1716454218292885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218438650, "dur": 104, "args": { "External id": 58075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58075, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 58075, "pid": 5, "tid": 7, "ts": 1716454218438650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292890, "dur": 13, "args": { "External id": 58075, "cbid": 211, "correlation": 58075 } }, { "ph": "s", "id": 58075, "pid": 76337, "tid": -914061504, "ts": 1716454218292890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292970, "dur": 1, "args": { "External id": 58086, "cbid": 251, "correlation": 58086 } }, { "ph": "f", "id": 58086, "pid": 76337, "tid": -914061504, "ts": 1716454218292970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218292982, "dur": 0, "args": { "External id": 58087, "cbid": 251, "correlation": 58087 } }, { "ph": "f", "id": 58087, "pid": 76337, "tid": -914061504, "ts": 1716454218292982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218438755, "dur": 10, "args": { "External id": 58088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58088, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58088, "pid": 5, "tid": 7, "ts": 1716454218438755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218292984, "dur": 13, "args": { "External id": 58088, "cbid": 211, "correlation": 58088 } }, { "ph": "s", "id": 58088, "pid": 76337, "tid": -914061504, "ts": 1716454218292984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218438767, "dur": 5, "args": { "External id": 58090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58090, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 58090, "pid": 5, "tid": 7, "ts": 1716454218438767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293000, "dur": 11, "args": { "External id": 58090, "cbid": 211, "correlation": 58090 } }, { "ph": "s", "id": 58090, "pid": 76337, "tid": -914061504, "ts": 1716454218293000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293069, "dur": 1, "args": { "External id": 58101, "cbid": 251, "correlation": 58101 } }, { "ph": "f", "id": 58101, "pid": 76337, "tid": -914061504, "ts": 1716454218293069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293072, "dur": 0, "args": { "External id": 58102, "cbid": 251, "correlation": 58102 } }, { "ph": "f", "id": 58102, "pid": 76337, "tid": -914061504, "ts": 1716454218293072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454218438773, "dur": 6, "args": { "External id": 58103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58103, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58103, "pid": 5, "tid": 7, "ts": 1716454218438773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293074, "dur": 14, "args": { "External id": 58103, "cbid": 211, "correlation": 58103 } }, { "ph": "s", "id": 58103, "pid": 76337, "tid": -914061504, "ts": 1716454218293074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454218438780, "dur": 3, "args": { "External id": 58105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58105, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 58105, "pid": 5, "tid": 7, "ts": 1716454218438780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293089, "dur": 6, "args": { "External id": 58105, "cbid": 211, "correlation": 58105 } }, { "ph": "s", "id": 58105, "pid": 76337, "tid": -914061504, "ts": 1716454218293089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454218438785, "dur": 155, "args": { "External id": 58126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58126, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 58126, "pid": 5, "tid": 7, "ts": 1716454218438785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293164, "dur": 12, "args": { "External id": 58126, "cbid": 211, "correlation": 58126 } }, { "ph": "s", "id": 58126, "pid": 76337, "tid": -914061504, "ts": 1716454218293164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293259, "dur": 2, "args": { "External id": 58144, "cbid": 251, "correlation": 58144 } }, { "ph": "f", "id": 58144, "pid": 76337, "tid": -914061504, "ts": 1716454218293259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218438942, "dur": 106, "args": { "External id": 58146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58146, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 58146, "pid": 5, "tid": 7, "ts": 1716454218438942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293266, "dur": 13, "args": { "External id": 58146, "cbid": 211, "correlation": 58146 } }, { "ph": "s", "id": 58146, "pid": 76337, "tid": -914061504, "ts": 1716454218293266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218439049, "dur": 35, "args": { "External id": 58154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58154, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58154, "pid": 5, "tid": 7, "ts": 1716454218439049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293336, "dur": 12, "args": { "External id": 58154, "cbid": 211, "correlation": 58154 } }, { "ph": "s", "id": 58154, "pid": 76337, "tid": -914061504, "ts": 1716454218293336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218439085, "dur": 67, "args": { "External id": 58162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58162, "pid": 5, "tid": 7, "ts": 1716454218439085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293376, "dur": 9, "args": { "External id": 58162, "cbid": 211, "correlation": 58162 } }, { "ph": "s", "id": 58162, "pid": 76337, "tid": -914061504, "ts": 1716454218293376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218439154, "dur": 92, "args": { "External id": 58184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58184, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58184, "pid": 5, "tid": 7, "ts": 1716454218439154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293431, "dur": 11, "args": { "External id": 58184, "cbid": 211, "correlation": 58184 } }, { "ph": "s", "id": 58184, "pid": 76337, "tid": -914061504, "ts": 1716454218293431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293524, "dur": 1, "args": { "External id": 58200, "cbid": 251, "correlation": 58200 } }, { "ph": "f", "id": 58200, "pid": 76337, "tid": -914061504, "ts": 1716454218293524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454218439247, "dur": 571, "args": { "External id": 58202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58202, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 58202, "pid": 5, "tid": 7, "ts": 1716454218439247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293530, "dur": 12, "args": { "External id": 58202, "cbid": 211, "correlation": 58202 } }, { "ph": "s", "id": 58202, "pid": 76337, "tid": -914061504, "ts": 1716454218293530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218439819, "dur": 241, "args": { "External id": 58210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58210, "pid": 5, "tid": 7, "ts": 1716454218439819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293612, "dur": 14, "args": { "External id": 58210, "cbid": 211, "correlation": 58210 } }, { "ph": "s", "id": 58210, "pid": 76337, "tid": -914061504, "ts": 1716454218293612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218440061, "dur": 253, "args": { "External id": 58218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58218, "pid": 5, "tid": 7, "ts": 1716454218440061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293650, "dur": 10, "args": { "External id": 58218, "cbid": 211, "correlation": 58218 } }, { "ph": "s", "id": 58218, "pid": 76337, "tid": -914061504, "ts": 1716454218293650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293735, "dur": 2, "args": { "External id": 58234, "cbid": 251, "correlation": 58234 } }, { "ph": "f", "id": 58234, "pid": 76337, "tid": -914061504, "ts": 1716454218293735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293740, "dur": 0, "args": { "External id": 58236, "cbid": 251, "correlation": 58236 } }, { "ph": "f", "id": 58236, "pid": 76337, "tid": -914061504, "ts": 1716454218293740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454218440315, "dur": 358, "args": { "External id": 58237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58237, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58237, "pid": 5, "tid": 7, "ts": 1716454218440315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293744, "dur": 13, "args": { "External id": 58237, "cbid": 211, "correlation": 58237 } }, { "ph": "s", "id": 58237, "pid": 76337, "tid": -914061504, "ts": 1716454218293744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218440675, "dur": 50, "args": { "External id": 58245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58245, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58245, "pid": 5, "tid": 7, "ts": 1716454218440675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293787, "dur": 10, "args": { "External id": 58245, "cbid": 211, "correlation": 58245 } }, { "ph": "s", "id": 58245, "pid": 76337, "tid": -914061504, "ts": 1716454218293787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218440726, "dur": 156, "args": { "External id": 58256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58256, "pid": 5, "tid": 7, "ts": 1716454218440726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293854, "dur": 39, "args": { "External id": 58256, "cbid": 211, "correlation": 58256 } }, { "ph": "s", "id": 58256, "pid": 76337, "tid": -914061504, "ts": 1716454218293854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218293947, "dur": 0, "args": { "External id": 58268, "cbid": 317, "correlation": 58268 } }, { "ph": "f", "id": 58268, "pid": 76337, "tid": -914061504, "ts": 1716454218293947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218293948, "dur": 0, "args": { "External id": 58269, "cbid": 203, "correlation": 58269 } }, { "ph": "f", "id": 58269, "pid": 76337, "tid": -914061504, "ts": 1716454218293948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218293949, "dur": 0, "args": { "External id": 58270, "cbid": 205, "correlation": 58270 } }, { "ph": "f", "id": 58270, "pid": 76337, "tid": -914061504, "ts": 1716454218293949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293981, "dur": 1, "args": { "External id": 58274, "cbid": 251, "correlation": 58274 } }, { "ph": "f", "id": 58274, "pid": 76337, "tid": -914061504, "ts": 1716454218293981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293983, "dur": 0, "args": { "External id": 58275, "cbid": 251, "correlation": 58275 } }, { "ph": "f", "id": 58275, "pid": 76337, "tid": -914061504, "ts": 1716454218293983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293984, "dur": 0, "args": { "External id": 58276, "cbid": 251, "correlation": 58276 } }, { "ph": "f", "id": 58276, "pid": 76337, "tid": -914061504, "ts": 1716454218293984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293985, "dur": 0, "args": { "External id": 58277, "cbid": 251, "correlation": 58277 } }, { "ph": "f", "id": 58277, "pid": 76337, "tid": -914061504, "ts": 1716454218293985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293985, "dur": 0, "args": { "External id": 58278, "cbid": 251, "correlation": 58278 } }, { "ph": "f", "id": 58278, "pid": 76337, "tid": -914061504, "ts": 1716454218293985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293986, "dur": 0, "args": { "External id": 58279, "cbid": 251, "correlation": 58279 } }, { "ph": "f", "id": 58279, "pid": 76337, "tid": -914061504, "ts": 1716454218293986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293987, "dur": 0, "args": { "External id": 58280, "cbid": 251, "correlation": 58280 } }, { "ph": "f", "id": 58280, "pid": 76337, "tid": -914061504, "ts": 1716454218293987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293988, "dur": 0, "args": { "External id": 58281, "cbid": 251, "correlation": 58281 } }, { "ph": "f", "id": 58281, "pid": 76337, "tid": -914061504, "ts": 1716454218293988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218293989, "dur": 0, "args": { "External id": 58282, "cbid": 251, "correlation": 58282 } }, { "ph": "f", "id": 58282, "pid": 76337, "tid": -914061504, "ts": 1716454218293989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454218440884, "dur": 113, "args": { "External id": 58283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58283, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 58283, "pid": 5, "tid": 7, "ts": 1716454218440884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218293992, "dur": 26, "args": { "External id": 58283, "cbid": 211, "correlation": 58283 } }, { "ph": "s", "id": 58283, "pid": 76337, "tid": -914061504, "ts": 1716454218293992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218440998, "dur": 59, "args": { "External id": 58289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58289, "pid": 5, "tid": 7, "ts": 1716454218440998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294042, "dur": 272, "args": { "External id": 58289, "cbid": 211, "correlation": 58289 } }, { "ph": "s", "id": 58289, "pid": 76337, "tid": -914061504, "ts": 1716454218294042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441059, "dur": 50, "args": { "External id": 58297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58297, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58297, "pid": 5, "tid": 7, "ts": 1716454218441059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294338, "dur": 9, "args": { "External id": 58297, "cbid": 211, "correlation": 58297 } }, { "ph": "s", "id": 58297, "pid": 76337, "tid": -914061504, "ts": 1716454218294338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454218441110, "dur": 52, "args": { "External id": 58317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58317, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 58317, "pid": 5, "tid": 7, "ts": 1716454218441110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294423, "dur": 12, "args": { "External id": 58317, "cbid": 211, "correlation": 58317 } }, { "ph": "s", "id": 58317, "pid": 76337, "tid": -914061504, "ts": 1716454218294423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454218441163, "dur": 4, "args": { "External id": 58329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58329, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 58329, "pid": 5, "tid": 7, "ts": 1716454218441163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294446, "dur": 7, "args": { "External id": 58329, "cbid": 211, "correlation": 58329 } }, { "ph": "s", "id": 58329, "pid": 76337, "tid": -914061504, "ts": 1716454218294446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218441168, "dur": 56, "args": { "External id": 58332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58332, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58332, "pid": 5, "tid": 7, "ts": 1716454218441168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294464, "dur": 101, "args": { "External id": 58332, "cbid": 211, "correlation": 58332 } }, { "ph": "s", "id": 58332, "pid": 76337, "tid": -914061504, "ts": 1716454218294464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441226, "dur": 37, "args": { "External id": 58341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58341, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58341, "pid": 5, "tid": 7, "ts": 1716454218441226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294604, "dur": 11, "args": { "External id": 58341, "cbid": 211, "correlation": 58341 } }, { "ph": "s", "id": 58341, "pid": 76337, "tid": -914061504, "ts": 1716454218294604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454218294661, "dur": 0, "args": { "External id": 58351, "cbid": 317, "correlation": 58351 } }, { "ph": "f", "id": 58351, "pid": 76337, "tid": -914061504, "ts": 1716454218294661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454218294662, "dur": 0, "args": { "External id": 58352, "cbid": 203, "correlation": 58352 } }, { "ph": "f", "id": 58352, "pid": 76337, "tid": -914061504, "ts": 1716454218294662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454218294663, "dur": 0, "args": { "External id": 58353, "cbid": 205, "correlation": 58353 } }, { "ph": "f", "id": 58353, "pid": 76337, "tid": -914061504, "ts": 1716454218294663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218441264, "dur": 41, "args": { "External id": 58357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58357, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58357, "pid": 5, "tid": 7, "ts": 1716454218441264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294683, "dur": 12, "args": { "External id": 58357, "cbid": 211, "correlation": 58357 } }, { "ph": "s", "id": 58357, "pid": 76337, "tid": -914061504, "ts": 1716454218294683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454218441306, "dur": 3, "args": { "External id": 58359, "device": 5, "context": 1, "stream": 7, "correlation": 58359, "bytes": 46080, "memory bandwidth (GB/s)": 12.417138237671786 } }, { "ph": "f", "id": 58359, "pid": 5, "tid": 7, "ts": 1716454218441306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218294699, "dur": 24, "args": { "External id": 58359, "cbid": 51, "correlation": 58359 } }, { "ph": "s", "id": 58359, "pid": 76337, "tid": -914061504, "ts": 1716454218294699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218294728, "dur": 1, "args": { "External id": 58361, "cbid": 200, "correlation": 58361 } }, { "ph": "f", "id": 58361, "pid": 76337, "tid": -914061504, "ts": 1716454218294728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218294730, "dur": 0, "args": { "External id": 58362, "cbid": 200, "correlation": 58362 } }, { "ph": "f", "id": 58362, "pid": 76337, "tid": -914061504, "ts": 1716454218294730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218294731, "dur": 0, "args": { "External id": 58363, "cbid": 200, "correlation": 58363 } }, { "ph": "f", "id": 58363, "pid": 76337, "tid": -914061504, "ts": 1716454218294731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454218294731, "dur": 0, "args": { "External id": 58364, "cbid": 200, "correlation": 58364 } }, { "ph": "f", "id": 58364, "pid": 76337, "tid": -914061504, "ts": 1716454218294731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454218294732, "dur": 4, "args": { "External id": 58365, "cbid": 15, "correlation": 58365 } }, { "ph": "f", "id": 58365, "pid": 76337, "tid": -914061504, "ts": 1716454218294732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454218294736, "dur": 1, "args": { "External id": 58366, "cbid": 251, "correlation": 58366 } }, { "ph": "f", "id": 58366, "pid": 76337, "tid": -914061504, "ts": 1716454218294736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454218441311, "dur": 24, "args": { "External id": 58367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58367, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58367, "pid": 5, "tid": 7, "ts": 1716454218441311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294740, "dur": 8, "args": { "External id": 58367, "cbid": 211, "correlation": 58367 } }, { "ph": "s", "id": 58367, "pid": 76337, "tid": -914061504, "ts": 1716454218294740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454218441336, "dur": 4, "args": { "External id": 58369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 58369, "pid": 5, "tid": 7, "ts": 1716454218441336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294754, "dur": 7, "args": { "External id": 58369, "cbid": 211, "correlation": 58369 } }, { "ph": "s", "id": 58369, "pid": 76337, "tid": -914061504, "ts": 1716454218294754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218294765, "dur": 0, "args": { "External id": 58370, "cbid": 51, "correlation": 58370 } }, { "ph": "s", "id": 58370, "pid": 76337, "tid": -914061504, "ts": 1716454218294765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454218441341, "dur": 187, "args": { "External id": 58371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58371, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58371, "pid": 5, "tid": 7, "ts": 1716454218441341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294766, "dur": 179, "args": { "External id": 58371, "cbid": 211, "correlation": 58371 } }, { "ph": "s", "id": 58371, "pid": 76337, "tid": -914061504, "ts": 1716454218294766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454218441530, "dur": 7, "args": { "External id": 58372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58372, "pid": 5, "tid": 7, "ts": 1716454218441530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294949, "dur": 6, "args": { "External id": 58372, "cbid": 211, "correlation": 58372 } }, { "ph": "s", "id": 58372, "pid": 76337, "tid": -914061504, "ts": 1716454218294949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454218441538, "dur": 5, "args": { "External id": 58378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58378, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 58378, "pid": 5, "tid": 7, "ts": 1716454218441538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218294988, "dur": 10, "args": { "External id": 58378, "cbid": 211, "correlation": 58378 } }, { "ph": "s", "id": 58378, "pid": 76337, "tid": -914061504, "ts": 1716454218294988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441544, "dur": 3, "args": { "External id": 58386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58386, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58386, "pid": 5, "tid": 7, "ts": 1716454218441544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218296720, "dur": 15, "args": { "External id": 58386, "cbid": 211, "correlation": 58386 } }, { "ph": "s", "id": 58386, "pid": 76337, "tid": -914061504, "ts": 1716454218296720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441548, "dur": 3, "args": { "External id": 58394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58394, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58394, "pid": 5, "tid": 7, "ts": 1716454218441548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218296760, "dur": 13, "args": { "External id": 58394, "cbid": 211, "correlation": 58394 } }, { "ph": "s", "id": 58394, "pid": 76337, "tid": -914061504, "ts": 1716454218296760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441553, "dur": 3, "args": { "External id": 58402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58402, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58402, "pid": 5, "tid": 7, "ts": 1716454218441553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218296791, "dur": 8, "args": { "External id": 58402, "cbid": 211, "correlation": 58402 } }, { "ph": "s", "id": 58402, "pid": 76337, "tid": -914061504, "ts": 1716454218296791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441556, "dur": 3, "args": { "External id": 58411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58411, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58411, "pid": 5, "tid": 7, "ts": 1716454218441556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218296972, "dur": 23, "args": { "External id": 58411, "cbid": 211, "correlation": 58411 } }, { "ph": "s", "id": 58411, "pid": 76337, "tid": -914061504, "ts": 1716454218296972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441561, "dur": 3, "args": { "External id": 58420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58420, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58420, "pid": 5, "tid": 7, "ts": 1716454218441561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218297012, "dur": 8, "args": { "External id": 58420, "cbid": 211, "correlation": 58420 } }, { "ph": "s", "id": 58420, "pid": 76337, "tid": -914061504, "ts": 1716454218297012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441565, "dur": 3, "args": { "External id": 58428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58428, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58428, "pid": 5, "tid": 7, "ts": 1716454218441565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218297038, "dur": 8, "args": { "External id": 58428, "cbid": 211, "correlation": 58428 } }, { "ph": "s", "id": 58428, "pid": 76337, "tid": -914061504, "ts": 1716454218297038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441569, "dur": 3, "args": { "External id": 58436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58436, "pid": 5, "tid": 7, "ts": 1716454218441569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218297293, "dur": 15, "args": { "External id": 58436, "cbid": 211, "correlation": 58436 } }, { "ph": "s", "id": 58436, "pid": 76337, "tid": -914061504, "ts": 1716454218297293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441574, "dur": 3, "args": { "External id": 58444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58444, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58444, "pid": 5, "tid": 7, "ts": 1716454218441574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218297324, "dur": 8, "args": { "External id": 58444, "cbid": 211, "correlation": 58444 } }, { "ph": "s", "id": 58444, "pid": 76337, "tid": -914061504, "ts": 1716454218297324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218441579, "dur": 1, "args": { "External id": 58454, "device": 5, "context": 1, "stream": 7, "correlation": 58454, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 58454, "pid": 5, "tid": 7, "ts": 1716454218441579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218297390, "dur": 40, "args": { "External id": 58454, "cbid": 41, "correlation": 58454 } }, { "ph": "s", "id": 58454, "pid": 76337, "tid": -914061504, "ts": 1716454218297390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218297432, "dur": 144164, "args": { "External id": 58455, "cbid": 131, "correlation": 58455 } }, { "ph": "f", "id": 58455, "pid": 76337, "tid": -914061504, "ts": 1716454218297432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454218441726, "dur": 3, "args": { "External id": 58463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58463, "pid": 5, "tid": 7, "ts": 1716454218441726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218441707, "dur": 22, "args": { "External id": 58463, "cbid": 211, "correlation": 58463 } }, { "ph": "s", "id": 58463, "pid": 76337, "tid": -914061504, "ts": 1716454218441707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218441813, "dur": 3, "args": { "External id": 58472, "device": 5, "context": 1, "stream": 7, "correlation": 58472, "bytes": 8, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 58472, "pid": 5, "tid": 7, "ts": 1716454218441813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218441787, "dur": 26, "args": { "External id": 58472, "cbid": 41, "correlation": 58472 } }, { "ph": "s", "id": 58472, "pid": 76337, "tid": -914061504, "ts": 1716454218441787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454218441897, "dur": 4, "args": { "External id": 58482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58482, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58482, "pid": 5, "tid": 7, "ts": 1716454218441897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454218441881, "dur": 17, "args": { "External id": 58482, "cbid": 211, "correlation": 58482 } }, { "ph": "s", "id": 58482, "pid": 76337, "tid": -914061504, "ts": 1716454218441881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454218441999, "dur": 1, "args": { "External id": 58492, "device": 5, "context": 1, "stream": 7, "correlation": 58492, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 58492, "pid": 5, "tid": 7, "ts": 1716454218441999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218441939, "dur": 57, "args": { "External id": 58492, "cbid": 41, "correlation": 58492 } }, { "ph": "s", "id": 58492, "pid": 76337, "tid": -914061504, "ts": 1716454218441939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454218441997, "dur": 9, "args": { "External id": 58493, "cbid": 131, "correlation": 58493 } }, { "ph": "f", "id": 58493, "pid": 76337, "tid": -914061504, "ts": 1716454218441997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218442096, "dur": 3, "args": { "External id": 58500, "device": 5, "context": 1, "stream": 7, "correlation": 58500, "bytes": 98304, "memory bandwidth (GB/s)": 30.415841584158414 } }, { "ph": "f", "id": 58500, "pid": 5, "tid": 7, "ts": 1716454218442096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218442047, "dur": 48, "args": { "External id": 58500, "cbid": 41, "correlation": 58500 } }, { "ph": "s", "id": 58500, "pid": 76337, "tid": -914061504, "ts": 1716454218442047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454218442414, "dur": 3, "args": { "External id": 58519, "device": 5, "context": 1, "stream": 7, "correlation": 58519, "bytes": 16, "memory bandwidth (GB/s)": 0.005320917858330562 } }, { "ph": "f", "id": 58519, "pid": 5, "tid": 7, "ts": 1716454218442414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454218442382, "dur": 31, "args": { "External id": 58519, "cbid": 41, "correlation": 58519 } }, { "ph": "s", "id": 58519, "pid": 76337, "tid": -914061504, "ts": 1716454218442382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceCount", "pid": 76337, "tid": -914061504, "ts": 1716454218447342, "dur": 1, "args": { "External id": 58522, "cbid": 3, "correlation": 58522 } }, { "ph": "f", "id": 58522, "pid": 76337, "tid": -914061504, "ts": 1716454218447342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454218447345, "dur": 1116947, "args": { "External id": 58523, "cbid": 4, "correlation": 58523 } }, { "ph": "f", "id": 58523, "pid": 76337, "tid": -914061504, "ts": 1716454218447345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219564295, "dur": 72066, "args": { "External id": 58524, "cbid": 4, "correlation": 58524 } }, { "ph": "f", "id": 58524, "pid": 76337, "tid": -914061504, "ts": 1716454219564295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219636363, "dur": 71462, "args": { "External id": 58525, "cbid": 4, "correlation": 58525 } }, { "ph": "f", "id": 58525, "pid": 76337, "tid": -914061504, "ts": 1716454219636363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219707827, "dur": 36547, "args": { "External id": 58526, "cbid": 4, "correlation": 58526 } }, { "ph": "f", "id": 58526, "pid": 76337, "tid": -914061504, "ts": 1716454219707827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219744375, "dur": 3188, "args": { "External id": 58527, "cbid": 4, "correlation": 58527 } }, { "ph": "f", "id": 58527, "pid": 76337, "tid": -914061504, "ts": 1716454219744375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219747564, "dur": 1792, "args": { "External id": 58528, "cbid": 4, "correlation": 58528 } }, { "ph": "f", "id": 58528, "pid": 76337, "tid": -914061504, "ts": 1716454219747564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaGetDeviceProperties", "pid": 76337, "tid": -914061504, "ts": 1716454219749357, "dur": 5312, "args": { "External id": 58529, "cbid": 4, "correlation": 58529 } }, { "ph": "f", "id": 58529, "pid": 76337, "tid": -914061504, "ts": 1716454219749357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073963, "dur": 0, "args": { "External id": 58532, "cbid": 200, "correlation": 58532 } }, { "ph": "f", "id": 58532, "pid": 76337, "tid": -914061504, "ts": 1716454222073963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "dur": 0, "args": { "External id": 58533, "cbid": 200, "correlation": 58533 } }, { "ph": "f", "id": 58533, "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "dur": 0, "args": { "External id": 58534, "cbid": 200, "correlation": 58534 } }, { "ph": "f", "id": 58534, "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "dur": 0, "args": { "External id": 58535, "cbid": 200, "correlation": 58535 } }, { "ph": "f", "id": 58535, "pid": 76337, "tid": -914061504, "ts": 1716454222073964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073965, "dur": 0, "args": { "External id": 58536, "cbid": 200, "correlation": 58536 } }, { "ph": "f", "id": 58536, "pid": 76337, "tid": -914061504, "ts": 1716454222073965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073965, "dur": 0, "args": { "External id": 58537, "cbid": 200, "correlation": 58537 } }, { "ph": "f", "id": 58537, "pid": 76337, "tid": -914061504, "ts": 1716454222073965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "dur": 0, "args": { "External id": 58538, "cbid": 200, "correlation": 58538 } }, { "ph": "f", "id": 58538, "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "dur": 0, "args": { "External id": 58539, "cbid": 200, "correlation": 58539 } }, { "ph": "f", "id": 58539, "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "dur": 0, "args": { "External id": 58540, "cbid": 200, "correlation": 58540 } }, { "ph": "f", "id": 58540, "pid": 76337, "tid": -914061504, "ts": 1716454222073966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222073967, "dur": 0, "args": { "External id": 58541, "cbid": 200, "correlation": 58541 } }, { "ph": "f", "id": 58541, "pid": 76337, "tid": -914061504, "ts": 1716454222073967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFree", "pid": 76337, "tid": -914061504, "ts": 1716454222073972, "dur": 12, "args": { "External id": 58542, "cbid": 22, "correlation": 58542 } }, { "ph": "f", "id": 58542, "pid": 76337, "tid": -914061504, "ts": 1716454222073972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074016, "dur": 100, "args": { "External id": 58544, "cbid": 20, "correlation": 58544 } }, { "ph": "f", "id": 58544, "pid": 76337, "tid": -914061504, "ts": 1716454222074016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074132, "dur": 9, "args": { "External id": 58553, "cbid": 20, "correlation": 58553 } }, { "ph": "f", "id": 58553, "pid": 76337, "tid": -914061504, "ts": 1716454222074132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074149, "dur": 759, "args": { "External id": 58562, "cbid": 20, "correlation": 58562 } }, { "ph": "f", "id": 58562, "pid": 76337, "tid": -914061504, "ts": 1716454222074149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFree", "pid": 76337, "tid": -914061504, "ts": 1716454222074928, "dur": 0, "args": { "External id": 58565, "cbid": 22, "correlation": 58565 } }, { "ph": "f", "id": 58565, "pid": 76337, "tid": -914061504, "ts": 1716454222074928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074933, "dur": 9, "args": { "External id": 58567, "cbid": 20, "correlation": 58567 } }, { "ph": "f", "id": 58567, "pid": 76337, "tid": -914061504, "ts": 1716454222074933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074949, "dur": 11, "args": { "External id": 58576, "cbid": 20, "correlation": 58576 } }, { "ph": "f", "id": 58576, "pid": 76337, "tid": -914061504, "ts": 1716454222074949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222074968, "dur": 766, "args": { "External id": 58585, "cbid": 20, "correlation": 58585 } }, { "ph": "f", "id": 58585, "pid": 76337, "tid": -914061504, "ts": 1716454222074968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamCreateWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222075753, "dur": 69713, "args": { "External id": 58588, "cbid": 198, "correlation": 58588 } }, { "ph": "f", "id": 58588, "pid": 76337, "tid": -914061504, "ts": 1716454222075753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFree", "pid": 76337, "tid": -914061504, "ts": 1716454222145492, "dur": 4, "args": { "External id": 58590, "cbid": 22, "correlation": 58590 } }, { "ph": "f", "id": 58590, "pid": 76337, "tid": -914061504, "ts": 1716454222145492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222145520, "dur": 27, "args": { "External id": 58592, "cbid": 20, "correlation": 58592 } }, { "ph": "f", "id": 58592, "pid": 76337, "tid": -914061504, "ts": 1716454222145520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222145558, "dur": 8, "args": { "External id": 58601, "cbid": 20, "correlation": 58601 } }, { "ph": "f", "id": 58601, "pid": 76337, "tid": -914061504, "ts": 1716454222145558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222145571, "dur": 742, "args": { "External id": 58610, "cbid": 20, "correlation": 58610 } }, { "ph": "f", "id": 58610, "pid": 76337, "tid": -914061504, "ts": 1716454222145571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamCreateWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222146324, "dur": 16, "args": { "External id": 58613, "cbid": 198, "correlation": 58613 } }, { "ph": "f", "id": 58613, "pid": 76337, "tid": -914061504, "ts": 1716454222146324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFree", "pid": 76337, "tid": -914061504, "ts": 1716454222146342, "dur": 0, "args": { "External id": 58615, "cbid": 22, "correlation": 58615 } }, { "ph": "f", "id": 58615, "pid": 76337, "tid": -914061504, "ts": 1716454222146342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222146346, "dur": 8, "args": { "External id": 58617, "cbid": 20, "correlation": 58617 } }, { "ph": "f", "id": 58617, "pid": 76337, "tid": -914061504, "ts": 1716454222146346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222146360, "dur": 6, "args": { "External id": 58626, "cbid": 20, "correlation": 58626 } }, { "ph": "f", "id": 58626, "pid": 76337, "tid": -914061504, "ts": 1716454222146360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454222146371, "dur": 530, "args": { "External id": 58635, "cbid": 20, "correlation": 58635 } }, { "ph": "f", "id": 58635, "pid": 76337, "tid": -914061504, "ts": 1716454222146371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamCreateWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222146911, "dur": 464, "args": { "External id": 58638, "cbid": 198, "correlation": 58638 } }, { "ph": "f", "id": 58638, "pid": 76337, "tid": -914061504, "ts": 1716454222146911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFree", "pid": 76337, "tid": -914061504, "ts": 1716454222147398, "dur": 0, "args": { "External id": 58640, "cbid": 22, "correlation": 58640 } }, { "ph": "f", "id": 58640, "pid": 76337, "tid": -914061504, "ts": 1716454222147398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222147548, "dur": 3, "args": { "External id": 58643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58643, "pid": 5, "tid": 7, "ts": 1716454222147548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222147472, "dur": 79, "args": { "External id": 58643, "cbid": 211, "correlation": 58643 } }, { "ph": "s", "id": 58643, "pid": 76337, "tid": -914061504, "ts": 1716454222147472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454222147589, "dur": 6, "args": { "External id": 58645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58645, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 58645, "pid": 5, "tid": 7, "ts": 1716454222147589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222147554, "dur": 37, "args": { "External id": 58645, "cbid": 211, "correlation": 58645 } }, { "ph": "s", "id": 58645, "pid": 76337, "tid": -914061504, "ts": 1716454222147554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454222147617, "dur": 3, "args": { "External id": 58647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58647, "pid": 5, "tid": 7, "ts": 1716454222147617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222147593, "dur": 23, "args": { "External id": 58647, "cbid": 211, "correlation": 58647 } }, { "ph": "s", "id": 58647, "pid": 76337, "tid": -914061504, "ts": 1716454222147593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222257417, "dur": 3, "args": { "External id": 58655, "device": 5, "context": 1, "stream": 7, "correlation": 58655, "bytes": 8, "memory bandwidth (GB/s)": 0.0024271844660194173 } }, { "ph": "f", "id": 58655, "pid": 5, "tid": 7, "ts": 1716454222257417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222147778, "dur": 109645, "args": { "External id": 58655, "cbid": 41, "correlation": 58655 } }, { "ph": "s", "id": 58655, "pid": 76337, "tid": -914061504, "ts": 1716454222147778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222257644, "dur": 3, "args": { "External id": 58669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58669, "pid": 5, "tid": 7, "ts": 1716454222257644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222257620, "dur": 29, "args": { "External id": 58669, "cbid": 211, "correlation": 58669 } }, { "ph": "s", "id": 58669, "pid": 76337, "tid": -914061504, "ts": 1716454222257620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222257673, "dur": 2, "args": { "External id": 58683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58683, "pid": 5, "tid": 7, "ts": 1716454222257673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222257663, "dur": 9, "args": { "External id": 58683, "cbid": 211, "correlation": 58683 } }, { "ph": "s", "id": 58683, "pid": 76337, "tid": -914061504, "ts": 1716454222257663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222257961, "dur": 5, "args": { "External id": 58690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58690, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58690, "pid": 5, "tid": 7, "ts": 1716454222257961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222257772, "dur": 192, "args": { "External id": 58690, "cbid": 211, "correlation": 58690 } }, { "ph": "s", "id": 58690, "pid": 76337, "tid": -914061504, "ts": 1716454222257772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222258053, "dur": 5, "args": { "External id": 58693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58693, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58693, "pid": 5, "tid": 7, "ts": 1716454222258053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222257971, "dur": 82, "args": { "External id": 58693, "cbid": 211, "correlation": 58693 } }, { "ph": "s", "id": 58693, "pid": 76337, "tid": -914061504, "ts": 1716454222257971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454222258103, "dur": 3, "args": { "External id": 58695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58695, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58695, "pid": 5, "tid": 7, "ts": 1716454222258103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258065, "dur": 39, "args": { "External id": 58695, "cbid": 211, "correlation": 58695 } }, { "ph": "s", "id": 58695, "pid": 76337, "tid": -914061504, "ts": 1716454222258065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222258159, "dur": 3, "args": { "External id": 58698, "device": 5, "context": 1, "stream": 7, "correlation": 58698, "bytes": 8, "memory bandwidth (GB/s)": 0.0024271844660194173 } }, { "ph": "f", "id": 58698, "pid": 5, "tid": 7, "ts": 1716454222258159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222258141, "dur": 18, "args": { "External id": 58698, "cbid": 41, "correlation": 58698 } }, { "ph": "s", "id": 58698, "pid": 76337, "tid": -914061504, "ts": 1716454222258141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222258357, "dur": 4, "args": { "External id": 58714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58714, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58714, "pid": 5, "tid": 7, "ts": 1716454222258357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258259, "dur": 101, "args": { "External id": 58714, "cbid": 211, "correlation": 58714 } }, { "ph": "s", "id": 58714, "pid": 76337, "tid": -914061504, "ts": 1716454222258259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222258403, "dur": 3, "args": { "External id": 58719, "device": 5, "context": 1, "stream": 7, "correlation": 58719, "bytes": 1, "memory bandwidth (GB/s)": 0.0002920560747663551 } }, { "ph": "f", "id": 58719, "pid": 5, "tid": 7, "ts": 1716454222258403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222258368, "dur": 33, "args": { "External id": 58719, "cbid": 41, "correlation": 58719 } }, { "ph": "s", "id": 58719, "pid": 76337, "tid": -914061504, "ts": 1716454222258368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222258442, "dur": 1, "args": { "External id": 58725, "device": 5, "context": 1, "stream": 7, "correlation": 58725, "bytes": 1, "memory bandwidth (GB/s)": 0.0005580357142857143 } }, { "ph": "f", "id": 58725, "pid": 5, "tid": 7, "ts": 1716454222258442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222258417, "dur": 36, "args": { "External id": 58725, "cbid": 41, "correlation": 58725 } }, { "ph": "s", "id": 58725, "pid": 76337, "tid": -914061504, "ts": 1716454222258417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222258454, "dur": 10, "args": { "External id": 58726, "cbid": 131, "correlation": 58726 } }, { "ph": "f", "id": 58726, "pid": 76337, "tid": -914061504, "ts": 1716454222258454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222258582, "dur": 4, "args": { "External id": 58734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58734, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58734, "pid": 5, "tid": 7, "ts": 1716454222258582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258567, "dur": 16, "args": { "External id": 58734, "cbid": 211, "correlation": 58734 } }, { "ph": "s", "id": 58734, "pid": 76337, "tid": -914061504, "ts": 1716454222258567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222258626, "dur": 3, "args": { "External id": 58744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58744, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58744, "pid": 5, "tid": 7, "ts": 1716454222258626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258616, "dur": 9, "args": { "External id": 58744, "cbid": 211, "correlation": 58744 } }, { "ph": "s", "id": 58744, "pid": 76337, "tid": -914061504, "ts": 1716454222258616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222258662, "dur": 3, "args": { "External id": 58753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58753, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58753, "pid": 5, "tid": 7, "ts": 1716454222258662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258652, "dur": 9, "args": { "External id": 58753, "cbid": 211, "correlation": 58753 } }, { "ph": "s", "id": 58753, "pid": 76337, "tid": -914061504, "ts": 1716454222258652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222259328, "dur": 9, "args": { "External id": 58763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58763, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58763, "pid": 5, "tid": 7, "ts": 1716454222259328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222258823, "dur": 513, "args": { "External id": 58763, "cbid": 211, "correlation": 58763 } }, { "ph": "s", "id": 58763, "pid": 76337, "tid": -914061504, "ts": 1716454222258823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222259387, "dur": 3, "args": { "External id": 58771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58771, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58771, "pid": 5, "tid": 7, "ts": 1716454222259387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222259376, "dur": 11, "args": { "External id": 58771, "cbid": 211, "correlation": 58771 } }, { "ph": "s", "id": 58771, "pid": 76337, "tid": -914061504, "ts": 1716454222259376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222259576, "dur": 8, "args": { "External id": 58781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58781, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58781, "pid": 5, "tid": 7, "ts": 1716454222259576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222259432, "dur": 147, "args": { "External id": 58781, "cbid": 211, "correlation": 58781 } }, { "ph": "s", "id": 58781, "pid": 76337, "tid": -914061504, "ts": 1716454222259432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222260432, "dur": 7, "args": { "External id": 58789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58789, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58789, "pid": 5, "tid": 7, "ts": 1716454222260432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222259615, "dur": 818, "args": { "External id": 58789, "cbid": 211, "correlation": 58789 } }, { "ph": "s", "id": 58789, "pid": 76337, "tid": -914061504, "ts": 1716454222259615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222260474, "dur": 3, "args": { "External id": 58798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58798, "pid": 5, "tid": 7, "ts": 1716454222260474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222260463, "dur": 10, "args": { "External id": 58798, "cbid": 211, "correlation": 58798 } }, { "ph": "s", "id": 58798, "pid": 76337, "tid": -914061504, "ts": 1716454222260463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222260501, "dur": 5, "args": { "External id": 58807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58807, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58807, "pid": 5, "tid": 7, "ts": 1716454222260501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222260491, "dur": 8, "args": { "External id": 58807, "cbid": 211, "correlation": 58807 } }, { "ph": "s", "id": 58807, "pid": 76337, "tid": -914061504, "ts": 1716454222260491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222260555, "dur": 9, "args": { "External id": 58817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58817, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58817, "pid": 5, "tid": 7, "ts": 1716454222260555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222260543, "dur": 13, "args": { "External id": 58817, "cbid": 211, "correlation": 58817 } }, { "ph": "s", "id": 58817, "pid": 76337, "tid": -914061504, "ts": 1716454222260543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222260985, "dur": 3, "args": { "External id": 58826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58826, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58826, "pid": 5, "tid": 7, "ts": 1716454222260985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222260960, "dur": 25, "args": { "External id": 58826, "cbid": 211, "correlation": 58826 } }, { "ph": "s", "id": 58826, "pid": 76337, "tid": -914061504, "ts": 1716454222260960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261020, "dur": 3, "args": { "External id": 58834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58834, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58834, "pid": 5, "tid": 7, "ts": 1716454222261020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261007, "dur": 13, "args": { "External id": 58834, "cbid": 211, "correlation": 58834 } }, { "ph": "s", "id": 58834, "pid": 76337, "tid": -914061504, "ts": 1716454222261007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222261103, "dur": 1, "args": { "External id": 58844, "device": 5, "context": 1, "stream": 7, "correlation": 58844, "bytes": 8, "memory bandwidth (GB/s)": 0.005 } }, { "ph": "f", "id": 58844, "pid": 5, "tid": 7, "ts": 1716454222261103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261082, "dur": 19, "args": { "External id": 58844, "cbid": 41, "correlation": 58844 } }, { "ph": "s", "id": 58844, "pid": 76337, "tid": -914061504, "ts": 1716454222261082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222261102, "dur": 9, "args": { "External id": 58845, "cbid": 131, "correlation": 58845 } }, { "ph": "f", "id": 58845, "pid": 76337, "tid": -914061504, "ts": 1716454222261102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261227, "dur": 3, "args": { "External id": 58853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 58853, "pid": 5, "tid": 7, "ts": 1716454222261227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261211, "dur": 20, "args": { "External id": 58853, "cbid": 211, "correlation": 58853 } }, { "ph": "s", "id": 58853, "pid": 76337, "tid": -914061504, "ts": 1716454222261211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222261320, "dur": 3, "args": { "External id": 58862, "device": 5, "context": 1, "stream": 7, "correlation": 58862, "bytes": 8, "memory bandwidth (GB/s)": 0.0024271844660194173 } }, { "ph": "f", "id": 58862, "pid": 5, "tid": 7, "ts": 1716454222261320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261300, "dur": 19, "args": { "External id": 58862, "cbid": 41, "correlation": 58862 } }, { "ph": "s", "id": 58862, "pid": 76337, "tid": -914061504, "ts": 1716454222261300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222261412, "dur": 4, "args": { "External id": 58872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58872, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 58872, "pid": 5, "tid": 7, "ts": 1716454222261412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261395, "dur": 19, "args": { "External id": 58872, "cbid": 211, "correlation": 58872 } }, { "ph": "s", "id": 58872, "pid": 76337, "tid": -914061504, "ts": 1716454222261395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222261468, "dur": 1, "args": { "External id": 58882, "device": 5, "context": 1, "stream": 7, "correlation": 58882, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 58882, "pid": 5, "tid": 7, "ts": 1716454222261468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261453, "dur": 13, "args": { "External id": 58882, "cbid": 41, "correlation": 58882 } }, { "ph": "s", "id": 58882, "pid": 76337, "tid": -914061504, "ts": 1716454222261453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222261467, "dur": 8, "args": { "External id": 58883, "cbid": 131, "correlation": 58883 } }, { "ph": "f", "id": 58883, "pid": 76337, "tid": -914061504, "ts": 1716454222261467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222261545, "dur": 3, "args": { "External id": 58890, "device": 5, "context": 1, "stream": 7, "correlation": 58890, "bytes": 98304, "memory bandwidth (GB/s)": 28.444444444444443 } }, { "ph": "f", "id": 58890, "pid": 5, "tid": 7, "ts": 1716454222261545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261514, "dur": 31, "args": { "External id": 58890, "cbid": 41, "correlation": 58890 } }, { "ph": "s", "id": 58890, "pid": 76337, "tid": -914061504, "ts": 1716454222261514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222261596, "dur": 1, "args": { "External id": 58901, "device": 5, "context": 1, "stream": 7, "correlation": 58901, "bytes": 2, "memory bandwidth (GB/s)": 0.00125 } }, { "ph": "f", "id": 58901, "pid": 5, "tid": 7, "ts": 1716454222261596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261584, "dur": 10, "args": { "External id": 58901, "cbid": 41, "correlation": 58901 } }, { "ph": "s", "id": 58901, "pid": 76337, "tid": -914061504, "ts": 1716454222261584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222261594, "dur": 8, "args": { "External id": 58902, "cbid": 131, "correlation": 58902 } }, { "ph": "f", "id": 58902, "pid": 76337, "tid": -914061504, "ts": 1716454222261594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261657, "dur": 3, "args": { "External id": 58910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58910, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58910, "pid": 5, "tid": 7, "ts": 1716454222261657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261643, "dur": 15, "args": { "External id": 58910, "cbid": 211, "correlation": 58910 } }, { "ph": "s", "id": 58910, "pid": 76337, "tid": -914061504, "ts": 1716454222261643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261688, "dur": 3, "args": { "External id": 58920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58920, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58920, "pid": 5, "tid": 7, "ts": 1716454222261688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261678, "dur": 9, "args": { "External id": 58920, "cbid": 211, "correlation": 58920 } }, { "ph": "s", "id": 58920, "pid": 76337, "tid": -914061504, "ts": 1716454222261678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261710, "dur": 3, "args": { "External id": 58929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58929, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58929, "pid": 5, "tid": 7, "ts": 1716454222261710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261701, "dur": 10, "args": { "External id": 58929, "cbid": 211, "correlation": 58929 } }, { "ph": "s", "id": 58929, "pid": 76337, "tid": -914061504, "ts": 1716454222261701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222261786, "dur": 6, "args": { "External id": 58937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58937, "pid": 5, "tid": 7, "ts": 1716454222261786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261773, "dur": 14, "args": { "External id": 58937, "cbid": 211, "correlation": 58937 } }, { "ph": "s", "id": 58937, "pid": 76337, "tid": -914061504, "ts": 1716454222261773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261827, "dur": 3, "args": { "External id": 58946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58946, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58946, "pid": 5, "tid": 7, "ts": 1716454222261827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261817, "dur": 9, "args": { "External id": 58946, "cbid": 211, "correlation": 58946 } }, { "ph": "s", "id": 58946, "pid": 76337, "tid": -914061504, "ts": 1716454222261817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261850, "dur": 3, "args": { "External id": 58955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58955, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58955, "pid": 5, "tid": 7, "ts": 1716454222261850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261841, "dur": 7, "args": { "External id": 58955, "cbid": 211, "correlation": 58955 } }, { "ph": "s", "id": 58955, "pid": 76337, "tid": -914061504, "ts": 1716454222261841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222261916, "dur": 3, "args": { "External id": 58963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 58963, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 58963, "pid": 5, "tid": 7, "ts": 1716454222261916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222261905, "dur": 10, "args": { "External id": 58963, "cbid": 211, "correlation": 58963 } }, { "ph": "s", "id": 58963, "pid": 76337, "tid": -914061504, "ts": 1716454222261905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222261991, "dur": 1, "args": { "External id": 58971, "device": 5, "context": 1, "stream": 7, "correlation": 58971, "bytes": 8, "memory bandwidth (GB/s)": 0.00423728813559322 } }, { "ph": "f", "id": 58971, "pid": 5, "tid": 7, "ts": 1716454222261991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222261965, "dur": 36, "args": { "External id": 58971, "cbid": 41, "correlation": 58971 } }, { "ph": "s", "id": 58971, "pid": 76337, "tid": -914061504, "ts": 1716454222261965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262002, "dur": 4, "args": { "External id": 58972, "cbid": 131, "correlation": 58972 } }, { "ph": "f", "id": 58972, "pid": 76337, "tid": -914061504, "ts": 1716454222262002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222262065, "dur": 1, "args": { "External id": 58982, "device": 5, "context": 1, "stream": 7, "correlation": 58982, "bytes": 42, "memory bandwidth (GB/s)": 0.026802807913209957 } }, { "ph": "f", "id": 58982, "pid": 5, "tid": 7, "ts": 1716454222262065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222262052, "dur": 10, "args": { "External id": 58982, "cbid": 41, "correlation": 58982 } }, { "ph": "s", "id": 58982, "pid": 76337, "tid": -914061504, "ts": 1716454222262052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262063, "dur": 8, "args": { "External id": 58983, "cbid": 131, "correlation": 58983 } }, { "ph": "f", "id": 58983, "pid": 76337, "tid": -914061504, "ts": 1716454222262063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222262121, "dur": 1, "args": { "External id": 58992, "device": 5, "context": 1, "stream": 7, "correlation": 58992, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 58992, "pid": 5, "tid": 7, "ts": 1716454222262121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222262110, "dur": 8, "args": { "External id": 58992, "cbid": 41, "correlation": 58992 } }, { "ph": "s", "id": 58992, "pid": 76337, "tid": -914061504, "ts": 1716454222262110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262119, "dur": 8, "args": { "External id": 58993, "cbid": 131, "correlation": 58993 } }, { "ph": "f", "id": 58993, "pid": 76337, "tid": -914061504, "ts": 1716454222262119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262197, "dur": 4, "args": { "External id": 59000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59000, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59000, "pid": 5, "tid": 7, "ts": 1716454222262197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262178, "dur": 21, "args": { "External id": 59000, "cbid": 211, "correlation": 59000 } }, { "ph": "s", "id": 59000, "pid": 76337, "tid": -914061504, "ts": 1716454222262178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454222262237, "dur": 4, "args": { "External id": 59020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59020, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59020, "pid": 5, "tid": 7, "ts": 1716454222262237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262226, "dur": 12, "args": { "External id": 59020, "cbid": 211, "correlation": 59020 } }, { "ph": "s", "id": 59020, "pid": 76337, "tid": -914061504, "ts": 1716454222262226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262238, "dur": 0, "args": { "External id": 59021, "cbid": 11, "correlation": 59021 } }, { "ph": "f", "id": 59021, "pid": 76337, "tid": -914061504, "ts": 1716454222262238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262239, "dur": 0, "args": { "External id": 59022, "cbid": 11, "correlation": 59022 } }, { "ph": "f", "id": 59022, "pid": 76337, "tid": -914061504, "ts": 1716454222262239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222262254, "dur": 1, "args": { "External id": 59025, "device": 5, "context": 1, "stream": 7, "correlation": 59025, "bytes": 4, "memory bandwidth (GB/s)": 0.0022727272727272726 } }, { "ph": "f", "id": 59025, "pid": 5, "tid": 7, "ts": 1716454222262254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222262240, "dur": 23, "args": { "External id": 59025, "cbid": 41, "correlation": 59025 } }, { "ph": "s", "id": 59025, "pid": 76337, "tid": -914061504, "ts": 1716454222262240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262264, "dur": 3, "args": { "External id": 59026, "cbid": 131, "correlation": 59026 } }, { "ph": "f", "id": 59026, "pid": 76337, "tid": -914061504, "ts": 1716454222262264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222262292, "dur": 3, "args": { "External id": 59050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59050, "pid": 5, "tid": 7, "ts": 1716454222262292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262283, "dur": 9, "args": { "External id": 59050, "cbid": 211, "correlation": 59050 } }, { "ph": "s", "id": 59050, "pid": 76337, "tid": -914061504, "ts": 1716454222262283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262293, "dur": 0, "args": { "External id": 59051, "cbid": 11, "correlation": 59051 } }, { "ph": "f", "id": 59051, "pid": 76337, "tid": -914061504, "ts": 1716454222262293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262293, "dur": 0, "args": { "External id": 59052, "cbid": 11, "correlation": 59052 } }, { "ph": "f", "id": 59052, "pid": 76337, "tid": -914061504, "ts": 1716454222262293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222262295, "dur": 1, "args": { "External id": 59054, "cbid": 200, "correlation": 59054 } }, { "ph": "f", "id": 59054, "pid": 76337, "tid": -914061504, "ts": 1716454222262295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454222262306, "dur": 4, "args": { "External id": 59056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59056, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59056, "pid": 5, "tid": 7, "ts": 1716454222262306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262298, "dur": 9, "args": { "External id": 59056, "cbid": 211, "correlation": 59056 } }, { "ph": "s", "id": 59056, "pid": 76337, "tid": -914061504, "ts": 1716454222262298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262307, "dur": 0, "args": { "External id": 59057, "cbid": 11, "correlation": 59057 } }, { "ph": "f", "id": 59057, "pid": 76337, "tid": -914061504, "ts": 1716454222262307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222262308, "dur": 0, "args": { "External id": 59058, "cbid": 11, "correlation": 59058 } }, { "ph": "f", "id": 59058, "pid": 76337, "tid": -914061504, "ts": 1716454222262308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222262345, "dur": 1, "args": { "External id": 59065, "device": 5, "context": 1, "stream": 7, "correlation": 59065, "bytes": 8, "memory bandwidth (GB/s)": 0.004629629629629629 } }, { "ph": "f", "id": 59065, "pid": 5, "tid": 7, "ts": 1716454222262345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222262333, "dur": 20, "args": { "External id": 59065, "cbid": 41, "correlation": 59065 } }, { "ph": "s", "id": 59065, "pid": 76337, "tid": -914061504, "ts": 1716454222262333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262354, "dur": 3, "args": { "External id": 59066, "cbid": 131, "correlation": 59066 } }, { "ph": "f", "id": 59066, "pid": 76337, "tid": -914061504, "ts": 1716454222262354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222262405, "dur": 1, "args": { "External id": 59076, "device": 5, "context": 1, "stream": 7, "correlation": 59076, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 59076, "pid": 5, "tid": 7, "ts": 1716454222262405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222262392, "dur": 10, "args": { "External id": 59076, "cbid": 41, "correlation": 59076 } }, { "ph": "s", "id": 59076, "pid": 76337, "tid": -914061504, "ts": 1716454222262392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222262404, "dur": 8, "args": { "External id": 59077, "cbid": 131, "correlation": 59077 } }, { "ph": "f", "id": 59077, "pid": 76337, "tid": -914061504, "ts": 1716454222262404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262479, "dur": 6, "args": { "External id": 59084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59084, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59084, "pid": 5, "tid": 7, "ts": 1716454222262479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262463, "dur": 16, "args": { "External id": 59084, "cbid": 211, "correlation": 59084 } }, { "ph": "s", "id": 59084, "pid": 76337, "tid": -914061504, "ts": 1716454222262463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262549, "dur": 3, "args": { "External id": 59093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59093, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59093, "pid": 5, "tid": 7, "ts": 1716454222262549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262536, "dur": 13, "args": { "External id": 59093, "cbid": 211, "correlation": 59093 } }, { "ph": "s", "id": 59093, "pid": 76337, "tid": -914061504, "ts": 1716454222262536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262586, "dur": 3, "args": { "External id": 59101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59101, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59101, "pid": 5, "tid": 7, "ts": 1716454222262586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262575, "dur": 10, "args": { "External id": 59101, "cbid": 211, "correlation": 59101 } }, { "ph": "s", "id": 59101, "pid": 76337, "tid": -914061504, "ts": 1716454222262575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262617, "dur": 4, "args": { "External id": 59109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59109, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59109, "pid": 5, "tid": 7, "ts": 1716454222262617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262607, "dur": 11, "args": { "External id": 59109, "cbid": 211, "correlation": 59109 } }, { "ph": "s", "id": 59109, "pid": 76337, "tid": -914061504, "ts": 1716454222262607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262647, "dur": 5, "args": { "External id": 59117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59117, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59117, "pid": 5, "tid": 7, "ts": 1716454222262647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262637, "dur": 9, "args": { "External id": 59117, "cbid": 211, "correlation": 59117 } }, { "ph": "s", "id": 59117, "pid": 76337, "tid": -914061504, "ts": 1716454222262637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262672, "dur": 3, "args": { "External id": 59125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59125, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59125, "pid": 5, "tid": 7, "ts": 1716454222262672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262663, "dur": 8, "args": { "External id": 59125, "cbid": 211, "correlation": 59125 } }, { "ph": "s", "id": 59125, "pid": 76337, "tid": -914061504, "ts": 1716454222262663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262699, "dur": 4, "args": { "External id": 59133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59133, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59133, "pid": 5, "tid": 7, "ts": 1716454222262699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262688, "dur": 10, "args": { "External id": 59133, "cbid": 211, "correlation": 59133 } }, { "ph": "s", "id": 59133, "pid": 76337, "tid": -914061504, "ts": 1716454222262688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262720, "dur": 4, "args": { "External id": 59141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59141, "pid": 5, "tid": 7, "ts": 1716454222262720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262712, "dur": 7, "args": { "External id": 59141, "cbid": 211, "correlation": 59141 } }, { "ph": "s", "id": 59141, "pid": 76337, "tid": -914061504, "ts": 1716454222262712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262739, "dur": 5, "args": { "External id": 59149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59149, "pid": 5, "tid": 7, "ts": 1716454222262739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262731, "dur": 9, "args": { "External id": 59149, "cbid": 211, "correlation": 59149 } }, { "ph": "s", "id": 59149, "pid": 76337, "tid": -914061504, "ts": 1716454222262731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262762, "dur": 3, "args": { "External id": 59157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59157, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59157, "pid": 5, "tid": 7, "ts": 1716454222262762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262754, "dur": 7, "args": { "External id": 59157, "cbid": 211, "correlation": 59157 } }, { "ph": "s", "id": 59157, "pid": 76337, "tid": -914061504, "ts": 1716454222262754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262819, "dur": 3, "args": { "External id": 59165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59165, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59165, "pid": 5, "tid": 7, "ts": 1716454222262819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262808, "dur": 10, "args": { "External id": 59165, "cbid": 211, "correlation": 59165 } }, { "ph": "s", "id": 59165, "pid": 76337, "tid": -914061504, "ts": 1716454222262808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262844, "dur": 5, "args": { "External id": 59173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59173, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59173, "pid": 5, "tid": 7, "ts": 1716454222262844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262835, "dur": 8, "args": { "External id": 59173, "cbid": 211, "correlation": 59173 } }, { "ph": "s", "id": 59173, "pid": 76337, "tid": -914061504, "ts": 1716454222262835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222262867, "dur": 5, "args": { "External id": 59181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59181, "pid": 5, "tid": 7, "ts": 1716454222262867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262858, "dur": 8, "args": { "External id": 59181, "cbid": 211, "correlation": 59181 } }, { "ph": "s", "id": 59181, "pid": 76337, "tid": -914061504, "ts": 1716454222262858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222262885, "dur": 3, "args": { "External id": 59189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59189, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59189, "pid": 5, "tid": 7, "ts": 1716454222262885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222262877, "dur": 6, "args": { "External id": 59189, "cbid": 211, "correlation": 59189 } }, { "ph": "s", "id": 59189, "pid": 76337, "tid": -914061504, "ts": 1716454222262877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222263284, "dur": 6, "args": { "External id": 59198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59198, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59198, "pid": 5, "tid": 7, "ts": 1716454222263284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263267, "dur": 17, "args": { "External id": 59198, "cbid": 211, "correlation": 59198 } }, { "ph": "s", "id": 59198, "pid": 76337, "tid": -914061504, "ts": 1716454222263267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222263321, "dur": 6, "args": { "External id": 59207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59207, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59207, "pid": 5, "tid": 7, "ts": 1716454222263321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263310, "dur": 9, "args": { "External id": 59207, "cbid": 211, "correlation": 59207 } }, { "ph": "s", "id": 59207, "pid": 76337, "tid": -914061504, "ts": 1716454222263310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222263459, "dur": 3, "args": { "External id": 59223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59223, "pid": 5, "tid": 7, "ts": 1716454222263459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263443, "dur": 16, "args": { "External id": 59223, "cbid": 211, "correlation": 59223 } }, { "ph": "s", "id": 59223, "pid": 76337, "tid": -914061504, "ts": 1716454222263443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263492, "dur": 3, "args": { "External id": 59231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59231, "pid": 5, "tid": 7, "ts": 1716454222263492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263483, "dur": 8, "args": { "External id": 59231, "cbid": 211, "correlation": 59231 } }, { "ph": "s", "id": 59231, "pid": 76337, "tid": -914061504, "ts": 1716454222263483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263525, "dur": 3, "args": { "External id": 59239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59239, "pid": 5, "tid": 7, "ts": 1716454222263525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263514, "dur": 9, "args": { "External id": 59239, "cbid": 211, "correlation": 59239 } }, { "ph": "s", "id": 59239, "pid": 76337, "tid": -914061504, "ts": 1716454222263514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263555, "dur": 4, "args": { "External id": 59247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59247, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59247, "pid": 5, "tid": 7, "ts": 1716454222263555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263546, "dur": 8, "args": { "External id": 59247, "cbid": 211, "correlation": 59247 } }, { "ph": "s", "id": 59247, "pid": 76337, "tid": -914061504, "ts": 1716454222263546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222263612, "dur": 4, "args": { "External id": 59259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59259, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59259, "pid": 5, "tid": 7, "ts": 1716454222263612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263599, "dur": 13, "args": { "External id": 59259, "cbid": 211, "correlation": 59259 } }, { "ph": "s", "id": 59259, "pid": 76337, "tid": -914061504, "ts": 1716454222263599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222263657, "dur": 4, "args": { "External id": 59270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59270, "pid": 5, "tid": 7, "ts": 1716454222263657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263646, "dur": 11, "args": { "External id": 59270, "cbid": 211, "correlation": 59270 } }, { "ph": "s", "id": 59270, "pid": 76337, "tid": -914061504, "ts": 1716454222263646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263688, "dur": 3, "args": { "External id": 59278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59278, "pid": 5, "tid": 7, "ts": 1716454222263688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263678, "dur": 9, "args": { "External id": 59278, "cbid": 211, "correlation": 59278 } }, { "ph": "s", "id": 59278, "pid": 76337, "tid": -914061504, "ts": 1716454222263678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263724, "dur": 6, "args": { "External id": 59286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59286, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59286, "pid": 5, "tid": 7, "ts": 1716454222263724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263714, "dur": 12, "args": { "External id": 59286, "cbid": 211, "correlation": 59286 } }, { "ph": "s", "id": 59286, "pid": 76337, "tid": -914061504, "ts": 1716454222263714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222263756, "dur": 5, "args": { "External id": 59294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59294, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59294, "pid": 5, "tid": 7, "ts": 1716454222263756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263745, "dur": 10, "args": { "External id": 59294, "cbid": 211, "correlation": 59294 } }, { "ph": "s", "id": 59294, "pid": 76337, "tid": -914061504, "ts": 1716454222263745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222263787, "dur": 4, "args": { "External id": 59303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59303, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59303, "pid": 5, "tid": 7, "ts": 1716454222263787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263776, "dur": 10, "args": { "External id": 59303, "cbid": 211, "correlation": 59303 } }, { "ph": "s", "id": 59303, "pid": 76337, "tid": -914061504, "ts": 1716454222263776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222263848, "dur": 5, "args": { "External id": 59316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59316, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59316, "pid": 5, "tid": 7, "ts": 1716454222263848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263835, "dur": 12, "args": { "External id": 59316, "cbid": 211, "correlation": 59316 } }, { "ph": "s", "id": 59316, "pid": 76337, "tid": -914061504, "ts": 1716454222263835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222263888, "dur": 5, "args": { "External id": 59326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59326, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59326, "pid": 5, "tid": 7, "ts": 1716454222263888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222263877, "dur": 10, "args": { "External id": 59326, "cbid": 211, "correlation": 59326 } }, { "ph": "s", "id": 59326, "pid": 76337, "tid": -914061504, "ts": 1716454222263877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222264034, "dur": 5, "args": { "External id": 59343, "cbid": 251, "correlation": 59343 } }, { "ph": "f", "id": 59343, "pid": 76337, "tid": -914061504, "ts": 1716454222264034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454222264064, "dur": 12, "args": { "External id": 59345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59345, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59345, "pid": 5, "tid": 7, "ts": 1716454222264064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264048, "dur": 17, "args": { "External id": 59345, "cbid": 211, "correlation": 59345 } }, { "ph": "s", "id": 59345, "pid": 76337, "tid": -914061504, "ts": 1716454222264048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222264130, "dur": 4, "args": { "External id": 59353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59353, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59353, "pid": 5, "tid": 7, "ts": 1716454222264130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264117, "dur": 12, "args": { "External id": 59353, "cbid": 211, "correlation": 59353 } }, { "ph": "s", "id": 59353, "pid": 76337, "tid": -914061504, "ts": 1716454222264117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222264191, "dur": 2, "args": { "External id": 59369, "cbid": 251, "correlation": 59369 } }, { "ph": "f", "id": 59369, "pid": 76337, "tid": -914061504, "ts": 1716454222264191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222264197, "dur": 0, "args": { "External id": 59371, "cbid": 251, "correlation": 59371 } }, { "ph": "f", "id": 59371, "pid": 76337, "tid": -914061504, "ts": 1716454222264197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222264214, "dur": 14, "args": { "External id": 59372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59372, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59372, "pid": 5, "tid": 7, "ts": 1716454222264214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264200, "dur": 14, "args": { "External id": 59372, "cbid": 211, "correlation": 59372 } }, { "ph": "s", "id": 59372, "pid": 76337, "tid": -914061504, "ts": 1716454222264200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222264230, "dur": 5, "args": { "External id": 59374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59374, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59374, "pid": 5, "tid": 7, "ts": 1716454222264230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264219, "dur": 9, "args": { "External id": 59374, "cbid": 211, "correlation": 59374 } }, { "ph": "s", "id": 59374, "pid": 76337, "tid": -914061504, "ts": 1716454222264219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222264337, "dur": 1, "args": { "External id": 59384, "cbid": 317, "correlation": 59384 } }, { "ph": "f", "id": 59384, "pid": 76337, "tid": -914061504, "ts": 1716454222264337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222264339, "dur": 1, "args": { "External id": 59385, "cbid": 203, "correlation": 59385 } }, { "ph": "f", "id": 59385, "pid": 76337, "tid": -914061504, "ts": 1716454222264339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222264341, "dur": 1, "args": { "External id": 59386, "cbid": 205, "correlation": 59386 } }, { "ph": "f", "id": 59386, "pid": 76337, "tid": -914061504, "ts": 1716454222264341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222264401, "dur": 7, "args": { "External id": 59390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59390, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59390, "pid": 5, "tid": 7, "ts": 1716454222264401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264384, "dur": 16, "args": { "External id": 59390, "cbid": 211, "correlation": 59390 } }, { "ph": "s", "id": 59390, "pid": 76337, "tid": -914061504, "ts": 1716454222264384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222264412, "dur": 4, "args": { "External id": 59392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59392, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 59392, "pid": 5, "tid": 7, "ts": 1716454222264412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264404, "dur": 6, "args": { "External id": 59392, "cbid": 211, "correlation": 59392 } }, { "ph": "s", "id": 59392, "pid": 76337, "tid": -914061504, "ts": 1716454222264404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222264431, "dur": 4, "args": { "External id": 59394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59394, "pid": 5, "tid": 7, "ts": 1716454222264431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264422, "dur": 8, "args": { "External id": 59394, "cbid": 211, "correlation": 59394 } }, { "ph": "s", "id": 59394, "pid": 76337, "tid": -914061504, "ts": 1716454222264422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222264438, "dur": 0, "args": { "External id": 59395, "cbid": 51, "correlation": 59395 } }, { "ph": "s", "id": 59395, "pid": 76337, "tid": -914061504, "ts": 1716454222264438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222264449, "dur": 93, "args": { "External id": 59396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59396, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59396, "pid": 5, "tid": 7, "ts": 1716454222264449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264439, "dur": 8, "args": { "External id": 59396, "cbid": 211, "correlation": 59396 } }, { "ph": "s", "id": 59396, "pid": 76337, "tid": -914061504, "ts": 1716454222264439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222264544, "dur": 64, "args": { "External id": 59401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59401, "pid": 5, "tid": 7, "ts": 1716454222264544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222264477, "dur": 10, "args": { "External id": 59401, "cbid": 211, "correlation": 59401 } }, { "ph": "s", "id": 59401, "pid": 76337, "tid": -914061504, "ts": 1716454222264477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222266376, "dur": 55, "args": { "External id": 59421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59421, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 59421, "pid": 5, "tid": 7, "ts": 1716454222266376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266358, "dur": 18, "args": { "External id": 59421, "cbid": 211, "correlation": 59421 } }, { "ph": "s", "id": 59421, "pid": 76337, "tid": -914061504, "ts": 1716454222266358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222266433, "dur": 5, "args": { "External id": 59433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59433, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59433, "pid": 5, "tid": 7, "ts": 1716454222266433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266392, "dur": 9, "args": { "External id": 59433, "cbid": 211, "correlation": 59433 } }, { "ph": "s", "id": 59433, "pid": 76337, "tid": -914061504, "ts": 1716454222266392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222266439, "dur": 61, "args": { "External id": 59436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59436, "pid": 5, "tid": 7, "ts": 1716454222266439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266417, "dur": 8, "args": { "External id": 59436, "cbid": 211, "correlation": 59436 } }, { "ph": "s", "id": 59436, "pid": 76337, "tid": -914061504, "ts": 1716454222266417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222266501, "dur": 38, "args": { "External id": 59445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59445, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59445, "pid": 5, "tid": 7, "ts": 1716454222266501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266463, "dur": 10, "args": { "External id": 59445, "cbid": 211, "correlation": 59445 } }, { "ph": "s", "id": 59445, "pid": 76337, "tid": -914061504, "ts": 1716454222266463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222266522, "dur": 0, "args": { "External id": 59455, "cbid": 317, "correlation": 59455 } }, { "ph": "f", "id": 59455, "pid": 76337, "tid": -914061504, "ts": 1716454222266522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222266523, "dur": 0, "args": { "External id": 59456, "cbid": 203, "correlation": 59456 } }, { "ph": "f", "id": 59456, "pid": 76337, "tid": -914061504, "ts": 1716454222266523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222266523, "dur": 0, "args": { "External id": 59457, "cbid": 205, "correlation": 59457 } }, { "ph": "f", "id": 59457, "pid": 76337, "tid": -914061504, "ts": 1716454222266523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222266555, "dur": 42, "args": { "External id": 59461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59461, "pid": 5, "tid": 7, "ts": 1716454222266555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266542, "dur": 12, "args": { "External id": 59461, "cbid": 211, "correlation": 59461 } }, { "ph": "s", "id": 59461, "pid": 76337, "tid": -914061504, "ts": 1716454222266542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222266598, "dur": 15, "args": { "External id": 59463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59463, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59463, "pid": 5, "tid": 7, "ts": 1716454222266598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266557, "dur": 6, "args": { "External id": 59463, "cbid": 211, "correlation": 59463 } }, { "ph": "s", "id": 59463, "pid": 76337, "tid": -914061504, "ts": 1716454222266557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222266615, "dur": 3, "args": { "External id": 59465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59465, "pid": 5, "tid": 7, "ts": 1716454222266615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266569, "dur": 6, "args": { "External id": 59465, "cbid": 211, "correlation": 59465 } }, { "ph": "s", "id": 59465, "pid": 76337, "tid": -914061504, "ts": 1716454222266569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222266579, "dur": 0, "args": { "External id": 59466, "cbid": 51, "correlation": 59466 } }, { "ph": "s", "id": 59466, "pid": 76337, "tid": -914061504, "ts": 1716454222266579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222266620, "dur": 770, "args": { "External id": 59467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59467, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59467, "pid": 5, "tid": 7, "ts": 1716454222266620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266581, "dur": 6, "args": { "External id": 59467, "cbid": 211, "correlation": 59467 } }, { "ph": "s", "id": 59467, "pid": 76337, "tid": -914061504, "ts": 1716454222266581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222267391, "dur": 62, "args": { "External id": 59472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59472, "pid": 5, "tid": 7, "ts": 1716454222267391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266610, "dur": 9, "args": { "External id": 59472, "cbid": 211, "correlation": 59472 } }, { "ph": "s", "id": 59472, "pid": 76337, "tid": -914061504, "ts": 1716454222266610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222267455, "dur": 4, "args": { "External id": 59480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59480, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59480, "pid": 5, "tid": 7, "ts": 1716454222267455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266656, "dur": 9, "args": { "External id": 59480, "cbid": 211, "correlation": 59480 } }, { "ph": "s", "id": 59480, "pid": 76337, "tid": -914061504, "ts": 1716454222266656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222266727, "dur": 1, "args": { "External id": 59496, "cbid": 251, "correlation": 59496 } }, { "ph": "f", "id": 59496, "pid": 76337, "tid": -914061504, "ts": 1716454222266727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222266733, "dur": 0, "args": { "External id": 59498, "cbid": 251, "correlation": 59498 } }, { "ph": "f", "id": 59498, "pid": 76337, "tid": -914061504, "ts": 1716454222266733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222267460, "dur": 10, "args": { "External id": 59499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59499, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 59499, "pid": 5, "tid": 7, "ts": 1716454222267460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266735, "dur": 12, "args": { "External id": 59499, "cbid": 211, "correlation": 59499 } }, { "ph": "s", "id": 59499, "pid": 76337, "tid": -914061504, "ts": 1716454222266735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222267471, "dur": 5, "args": { "External id": 59501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59501, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 59501, "pid": 5, "tid": 7, "ts": 1716454222267471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266749, "dur": 6, "args": { "External id": 59501, "cbid": 211, "correlation": 59501 } }, { "ph": "s", "id": 59501, "pid": 76337, "tid": -914061504, "ts": 1716454222266749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222267477, "dur": 56, "args": { "External id": 59511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59511, "pid": 5, "tid": 7, "ts": 1716454222267477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266810, "dur": 12, "args": { "External id": 59511, "cbid": 211, "correlation": 59511 } }, { "ph": "s", "id": 59511, "pid": 76337, "tid": -914061504, "ts": 1716454222266810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222267535, "dur": 55, "args": { "External id": 59531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59531, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 59531, "pid": 5, "tid": 7, "ts": 1716454222267535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266878, "dur": 11, "args": { "External id": 59531, "cbid": 211, "correlation": 59531 } }, { "ph": "s", "id": 59531, "pid": 76337, "tid": -914061504, "ts": 1716454222266878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222267592, "dur": 4, "args": { "External id": 59543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59543, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59543, "pid": 5, "tid": 7, "ts": 1716454222267592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266899, "dur": 6, "args": { "External id": 59543, "cbid": 211, "correlation": 59543 } }, { "ph": "s", "id": 59543, "pid": 76337, "tid": -914061504, "ts": 1716454222266899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222267597, "dur": 59, "args": { "External id": 59546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59546, "pid": 5, "tid": 7, "ts": 1716454222267597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266918, "dur": 7, "args": { "External id": 59546, "cbid": 211, "correlation": 59546 } }, { "ph": "s", "id": 59546, "pid": 76337, "tid": -914061504, "ts": 1716454222266918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222267657, "dur": 38, "args": { "External id": 59555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59555, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59555, "pid": 5, "tid": 7, "ts": 1716454222267657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222266958, "dur": 10, "args": { "External id": 59555, "cbid": 211, "correlation": 59555 } }, { "ph": "s", "id": 59555, "pid": 76337, "tid": -914061504, "ts": 1716454222266958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222267040, "dur": 0, "args": { "External id": 59565, "cbid": 317, "correlation": 59565 } }, { "ph": "f", "id": 59565, "pid": 76337, "tid": -914061504, "ts": 1716454222267040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222267041, "dur": 0, "args": { "External id": 59566, "cbid": 203, "correlation": 59566 } }, { "ph": "f", "id": 59566, "pid": 76337, "tid": -914061504, "ts": 1716454222267041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222267042, "dur": 0, "args": { "External id": 59567, "cbid": 205, "correlation": 59567 } }, { "ph": "f", "id": 59567, "pid": 76337, "tid": -914061504, "ts": 1716454222267042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222267697, "dur": 41, "args": { "External id": 59571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59571, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59571, "pid": 5, "tid": 7, "ts": 1716454222267697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267057, "dur": 13, "args": { "External id": 59571, "cbid": 211, "correlation": 59571 } }, { "ph": "s", "id": 59571, "pid": 76337, "tid": -914061504, "ts": 1716454222267057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222267739, "dur": 16, "args": { "External id": 59573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59573, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59573, "pid": 5, "tid": 7, "ts": 1716454222267739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267072, "dur": 5, "args": { "External id": 59573, "cbid": 211, "correlation": 59573 } }, { "ph": "s", "id": 59573, "pid": 76337, "tid": -914061504, "ts": 1716454222267072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222267756, "dur": 4, "args": { "External id": 59575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59575, "pid": 5, "tid": 7, "ts": 1716454222267756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267081, "dur": 5, "args": { "External id": 59575, "cbid": 211, "correlation": 59575 } }, { "ph": "s", "id": 59575, "pid": 76337, "tid": -914061504, "ts": 1716454222267081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222267090, "dur": 0, "args": { "External id": 59576, "cbid": 51, "correlation": 59576 } }, { "ph": "s", "id": 59576, "pid": 76337, "tid": -914061504, "ts": 1716454222267090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222267761, "dur": 764, "args": { "External id": 59577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59577, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59577, "pid": 5, "tid": 7, "ts": 1716454222267761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267090, "dur": 5, "args": { "External id": 59577, "cbid": 211, "correlation": 59577 } }, { "ph": "s", "id": 59577, "pid": 76337, "tid": -914061504, "ts": 1716454222267090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222268526, "dur": 63, "args": { "External id": 59582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59582, "pid": 5, "tid": 7, "ts": 1716454222268526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267119, "dur": 9, "args": { "External id": 59582, "cbid": 211, "correlation": 59582 } }, { "ph": "s", "id": 59582, "pid": 76337, "tid": -914061504, "ts": 1716454222267119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222268591, "dur": 51, "args": { "External id": 59590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59590, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59590, "pid": 5, "tid": 7, "ts": 1716454222268591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267153, "dur": 9, "args": { "External id": 59590, "cbid": 211, "correlation": 59590 } }, { "ph": "s", "id": 59590, "pid": 76337, "tid": -914061504, "ts": 1716454222267153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222268643, "dur": 37, "args": { "External id": 59598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59598, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59598, "pid": 5, "tid": 7, "ts": 1716454222268643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267185, "dur": 10, "args": { "External id": 59598, "cbid": 211, "correlation": 59598 } }, { "ph": "s", "id": 59598, "pid": 76337, "tid": -914061504, "ts": 1716454222267185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222268681, "dur": 55, "args": { "External id": 59618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 59618, "pid": 5, "tid": 7, "ts": 1716454222268681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267268, "dur": 12, "args": { "External id": 59618, "cbid": 211, "correlation": 59618 } }, { "ph": "s", "id": 59618, "pid": 76337, "tid": -914061504, "ts": 1716454222267268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222268738, "dur": 4, "args": { "External id": 59630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 59630, "pid": 5, "tid": 7, "ts": 1716454222268738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267292, "dur": 7, "args": { "External id": 59630, "cbid": 211, "correlation": 59630 } }, { "ph": "s", "id": 59630, "pid": 76337, "tid": -914061504, "ts": 1716454222267292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222268743, "dur": 58, "args": { "External id": 59633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59633, "pid": 5, "tid": 7, "ts": 1716454222268743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267312, "dur": 7, "args": { "External id": 59633, "cbid": 211, "correlation": 59633 } }, { "ph": "s", "id": 59633, "pid": 76337, "tid": -914061504, "ts": 1716454222267312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222267370, "dur": 0, "args": { "External id": 59644, "cbid": 317, "correlation": 59644 } }, { "ph": "f", "id": 59644, "pid": 76337, "tid": -914061504, "ts": 1716454222267370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222267371, "dur": 0, "args": { "External id": 59645, "cbid": 203, "correlation": 59645 } }, { "ph": "f", "id": 59645, "pid": 76337, "tid": -914061504, "ts": 1716454222267371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222267371, "dur": 0, "args": { "External id": 59646, "cbid": 205, "correlation": 59646 } }, { "ph": "f", "id": 59646, "pid": 76337, "tid": -914061504, "ts": 1716454222267371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267408, "dur": 3, "args": { "External id": 59650, "cbid": 251, "correlation": 59650 } }, { "ph": "f", "id": 59650, "pid": 76337, "tid": -914061504, "ts": 1716454222267408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267412, "dur": 1, "args": { "External id": 59651, "cbid": 251, "correlation": 59651 } }, { "ph": "f", "id": 59651, "pid": 76337, "tid": -914061504, "ts": 1716454222267412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267414, "dur": 2, "args": { "External id": 59652, "cbid": 251, "correlation": 59652 } }, { "ph": "f", "id": 59652, "pid": 76337, "tid": -914061504, "ts": 1716454222267414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267417, "dur": 1, "args": { "External id": 59653, "cbid": 251, "correlation": 59653 } }, { "ph": "f", "id": 59653, "pid": 76337, "tid": -914061504, "ts": 1716454222267417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267418, "dur": 1, "args": { "External id": 59654, "cbid": 251, "correlation": 59654 } }, { "ph": "f", "id": 59654, "pid": 76337, "tid": -914061504, "ts": 1716454222267418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267420, "dur": 1, "args": { "External id": 59655, "cbid": 251, "correlation": 59655 } }, { "ph": "f", "id": 59655, "pid": 76337, "tid": -914061504, "ts": 1716454222267420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267422, "dur": 1, "args": { "External id": 59656, "cbid": 251, "correlation": 59656 } }, { "ph": "f", "id": 59656, "pid": 76337, "tid": -914061504, "ts": 1716454222267422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267424, "dur": 1, "args": { "External id": 59657, "cbid": 251, "correlation": 59657 } }, { "ph": "f", "id": 59657, "pid": 76337, "tid": -914061504, "ts": 1716454222267424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267426, "dur": 0, "args": { "External id": 59658, "cbid": 251, "correlation": 59658 } }, { "ph": "f", "id": 59658, "pid": 76337, "tid": -914061504, "ts": 1716454222267426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222268802, "dur": 126, "args": { "External id": 59659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59659, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 59659, "pid": 5, "tid": 7, "ts": 1716454222268802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267431, "dur": 14, "args": { "External id": 59659, "cbid": 211, "correlation": 59659 } }, { "ph": "s", "id": 59659, "pid": 76337, "tid": -914061504, "ts": 1716454222267431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222268930, "dur": 63, "args": { "External id": 59665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59665, "pid": 5, "tid": 7, "ts": 1716454222268930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267468, "dur": 9, "args": { "External id": 59665, "cbid": 211, "correlation": 59665 } }, { "ph": "s", "id": 59665, "pid": 76337, "tid": -914061504, "ts": 1716454222267468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222268994, "dur": 656, "args": { "External id": 59674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59674, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59674, "pid": 5, "tid": 7, "ts": 1716454222268994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267569, "dur": 15, "args": { "External id": 59674, "cbid": 211, "correlation": 59674 } }, { "ph": "s", "id": 59674, "pid": 76337, "tid": -914061504, "ts": 1716454222267569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222269652, "dur": 200, "args": { "External id": 59696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59696, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59696, "pid": 5, "tid": 7, "ts": 1716454222269652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267642, "dur": 13, "args": { "External id": 59696, "cbid": 211, "correlation": 59696 } }, { "ph": "s", "id": 59696, "pid": 76337, "tid": -914061504, "ts": 1716454222267642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267761, "dur": 2, "args": { "External id": 59707, "cbid": 251, "correlation": 59707 } }, { "ph": "f", "id": 59707, "pid": 76337, "tid": -914061504, "ts": 1716454222267761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222269853, "dur": 213, "args": { "External id": 59708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59708, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59708, "pid": 5, "tid": 7, "ts": 1716454222269853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267769, "dur": 14, "args": { "External id": 59708, "cbid": 211, "correlation": 59708 } }, { "ph": "s", "id": 59708, "pid": 76337, "tid": -914061504, "ts": 1716454222267769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267843, "dur": 1, "args": { "External id": 59719, "cbid": 251, "correlation": 59719 } }, { "ph": "f", "id": 59719, "pid": 76337, "tid": -914061504, "ts": 1716454222267843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222270067, "dur": 203, "args": { "External id": 59720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59720, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59720, "pid": 5, "tid": 7, "ts": 1716454222270067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267847, "dur": 12, "args": { "External id": 59720, "cbid": 211, "correlation": 59720 } }, { "ph": "s", "id": 59720, "pid": 76337, "tid": -914061504, "ts": 1716454222267847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222267913, "dur": 1, "args": { "External id": 59731, "cbid": 251, "correlation": 59731 } }, { "ph": "f", "id": 59731, "pid": 76337, "tid": -914061504, "ts": 1716454222267913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222270272, "dur": 203, "args": { "External id": 59732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59732, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59732, "pid": 5, "tid": 7, "ts": 1716454222270272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222267917, "dur": 11, "args": { "External id": 59732, "cbid": 211, "correlation": 59732 } }, { "ph": "s", "id": 59732, "pid": 76337, "tid": -914061504, "ts": 1716454222267917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222270477, "dur": 20585, "args": { "External id": 59753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59753, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 59753, "pid": 5, "tid": 7, "ts": 1716454222270477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268035, "dur": 17, "args": { "External id": 59753, "cbid": 211, "correlation": 59753 } }, { "ph": "s", "id": 59753, "pid": 76337, "tid": -914061504, "ts": 1716454222268035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268154, "dur": 2, "args": { "External id": 59771, "cbid": 251, "correlation": 59771 } }, { "ph": "f", "id": 59771, "pid": 76337, "tid": -914061504, "ts": 1716454222268154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222291063, "dur": 219, "args": { "External id": 59773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59773, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59773, "pid": 5, "tid": 7, "ts": 1716454222291063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268160, "dur": 14, "args": { "External id": 59773, "cbid": 211, "correlation": 59773 } }, { "ph": "s", "id": 59773, "pid": 76337, "tid": -914061504, "ts": 1716454222268160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222291284, "dur": 66, "args": { "External id": 59781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59781, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59781, "pid": 5, "tid": 7, "ts": 1716454222291284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268233, "dur": 15, "args": { "External id": 59781, "cbid": 211, "correlation": 59781 } }, { "ph": "s", "id": 59781, "pid": 76337, "tid": -914061504, "ts": 1716454222268233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222291352, "dur": 97, "args": { "External id": 59789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59789, "pid": 5, "tid": 7, "ts": 1716454222291352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268278, "dur": 9, "args": { "External id": 59789, "cbid": 211, "correlation": 59789 } }, { "ph": "s", "id": 59789, "pid": 76337, "tid": -914061504, "ts": 1716454222268278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222291449, "dur": 56, "args": { "External id": 59800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59800, "pid": 5, "tid": 7, "ts": 1716454222291449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268364, "dur": 14, "args": { "External id": 59800, "cbid": 211, "correlation": 59800 } }, { "ph": "s", "id": 59800, "pid": 76337, "tid": -914061504, "ts": 1716454222268364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222291507, "dur": 101, "args": { "External id": 59822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59822, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59822, "pid": 5, "tid": 7, "ts": 1716454222291507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268397, "dur": 9, "args": { "External id": 59822, "cbid": 211, "correlation": 59822 } }, { "ph": "s", "id": 59822, "pid": 76337, "tid": -914061504, "ts": 1716454222268397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268483, "dur": 1, "args": { "External id": 59833, "cbid": 251, "correlation": 59833 } }, { "ph": "f", "id": 59833, "pid": 76337, "tid": -914061504, "ts": 1716454222268483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222291610, "dur": 113, "args": { "External id": 59834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59834, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59834, "pid": 5, "tid": 7, "ts": 1716454222291610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268488, "dur": 12, "args": { "External id": 59834, "cbid": 211, "correlation": 59834 } }, { "ph": "s", "id": 59834, "pid": 76337, "tid": -914061504, "ts": 1716454222268488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268571, "dur": 1, "args": { "External id": 59845, "cbid": 251, "correlation": 59845 } }, { "ph": "f", "id": 59845, "pid": 76337, "tid": -914061504, "ts": 1716454222268571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268575, "dur": 0, "args": { "External id": 59846, "cbid": 251, "correlation": 59846 } }, { "ph": "f", "id": 59846, "pid": 76337, "tid": -914061504, "ts": 1716454222268575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222291724, "dur": 11, "args": { "External id": 59847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59847, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59847, "pid": 5, "tid": 7, "ts": 1716454222291724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268577, "dur": 14, "args": { "External id": 59847, "cbid": 211, "correlation": 59847 } }, { "ph": "s", "id": 59847, "pid": 76337, "tid": -914061504, "ts": 1716454222268577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222291736, "dur": 5, "args": { "External id": 59849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59849, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 59849, "pid": 5, "tid": 7, "ts": 1716454222291736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268595, "dur": 7, "args": { "External id": 59849, "cbid": 211, "correlation": 59849 } }, { "ph": "s", "id": 59849, "pid": 76337, "tid": -914061504, "ts": 1716454222268595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268661, "dur": 1, "args": { "External id": 59860, "cbid": 251, "correlation": 59860 } }, { "ph": "f", "id": 59860, "pid": 76337, "tid": -914061504, "ts": 1716454222268661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268664, "dur": 0, "args": { "External id": 59861, "cbid": 251, "correlation": 59861 } }, { "ph": "f", "id": 59861, "pid": 76337, "tid": -914061504, "ts": 1716454222268664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222291743, "dur": 7, "args": { "External id": 59862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59862, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 59862, "pid": 5, "tid": 7, "ts": 1716454222291743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268666, "dur": 12, "args": { "External id": 59862, "cbid": 211, "correlation": 59862 } }, { "ph": "s", "id": 59862, "pid": 76337, "tid": -914061504, "ts": 1716454222268666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222291751, "dur": 4, "args": { "External id": 59864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59864, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 59864, "pid": 5, "tid": 7, "ts": 1716454222291751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268679, "dur": 6, "args": { "External id": 59864, "cbid": 211, "correlation": 59864 } }, { "ph": "s", "id": 59864, "pid": 76337, "tid": -914061504, "ts": 1716454222268679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222291756, "dur": 170, "args": { "External id": 59885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59885, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 59885, "pid": 5, "tid": 7, "ts": 1716454222291756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268755, "dur": 12, "args": { "External id": 59885, "cbid": 211, "correlation": 59885 } }, { "ph": "s", "id": 59885, "pid": 76337, "tid": -914061504, "ts": 1716454222268755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222268852, "dur": 2, "args": { "External id": 59903, "cbid": 251, "correlation": 59903 } }, { "ph": "f", "id": 59903, "pid": 76337, "tid": -914061504, "ts": 1716454222268852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222291927, "dur": 113, "args": { "External id": 59905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59905, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 59905, "pid": 5, "tid": 7, "ts": 1716454222291927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268859, "dur": 14, "args": { "External id": 59905, "cbid": 211, "correlation": 59905 } }, { "ph": "s", "id": 59905, "pid": 76337, "tid": -914061504, "ts": 1716454222268859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222292042, "dur": 35, "args": { "External id": 59913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59913, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59913, "pid": 5, "tid": 7, "ts": 1716454222292042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268930, "dur": 12, "args": { "External id": 59913, "cbid": 211, "correlation": 59913 } }, { "ph": "s", "id": 59913, "pid": 76337, "tid": -914061504, "ts": 1716454222268930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222292079, "dur": 72, "args": { "External id": 59921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59921, "pid": 5, "tid": 7, "ts": 1716454222292079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222268970, "dur": 18, "args": { "External id": 59921, "cbid": 211, "correlation": 59921 } }, { "ph": "s", "id": 59921, "pid": 76337, "tid": -914061504, "ts": 1716454222268970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222292152, "dur": 101, "args": { "External id": 59943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59943, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59943, "pid": 5, "tid": 7, "ts": 1716454222292152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269036, "dur": 11, "args": { "External id": 59943, "cbid": 211, "correlation": 59943 } }, { "ph": "s", "id": 59943, "pid": 76337, "tid": -914061504, "ts": 1716454222269036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269128, "dur": 1, "args": { "External id": 59959, "cbid": 251, "correlation": 59959 } }, { "ph": "f", "id": 59959, "pid": 76337, "tid": -914061504, "ts": 1716454222269128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222292255, "dur": 630, "args": { "External id": 59961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59961, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 59961, "pid": 5, "tid": 7, "ts": 1716454222292255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269134, "dur": 13, "args": { "External id": 59961, "cbid": 211, "correlation": 59961 } }, { "ph": "s", "id": 59961, "pid": 76337, "tid": -914061504, "ts": 1716454222269134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222292887, "dur": 262, "args": { "External id": 59969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59969, "pid": 5, "tid": 7, "ts": 1716454222292887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269215, "dur": 14, "args": { "External id": 59969, "cbid": 211, "correlation": 59969 } }, { "ph": "s", "id": 59969, "pid": 76337, "tid": -914061504, "ts": 1716454222269215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222293150, "dur": 260, "args": { "External id": 59977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 59977, "pid": 5, "tid": 7, "ts": 1716454222293150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269250, "dur": 10, "args": { "External id": 59977, "cbid": 211, "correlation": 59977 } }, { "ph": "s", "id": 59977, "pid": 76337, "tid": -914061504, "ts": 1716454222269250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269335, "dur": 2, "args": { "External id": 59993, "cbid": 251, "correlation": 59993 } }, { "ph": "f", "id": 59993, "pid": 76337, "tid": -914061504, "ts": 1716454222269335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269341, "dur": 0, "args": { "External id": 59995, "cbid": 251, "correlation": 59995 } }, { "ph": "f", "id": 59995, "pid": 76337, "tid": -914061504, "ts": 1716454222269341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222293412, "dur": 378, "args": { "External id": 59996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 59996, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 59996, "pid": 5, "tid": 7, "ts": 1716454222293412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269346, "dur": 13, "args": { "External id": 59996, "cbid": 211, "correlation": 59996 } }, { "ph": "s", "id": 59996, "pid": 76337, "tid": -914061504, "ts": 1716454222269346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222293791, "dur": 50, "args": { "External id": 60004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60004, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60004, "pid": 5, "tid": 7, "ts": 1716454222293791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269389, "dur": 10, "args": { "External id": 60004, "cbid": 211, "correlation": 60004 } }, { "ph": "s", "id": 60004, "pid": 76337, "tid": -914061504, "ts": 1716454222269389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222293843, "dur": 173, "args": { "External id": 60015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60015, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60015, "pid": 5, "tid": 7, "ts": 1716454222293843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269464, "dur": 13, "args": { "External id": 60015, "cbid": 211, "correlation": 60015 } }, { "ph": "s", "id": 60015, "pid": 76337, "tid": -914061504, "ts": 1716454222269464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222269530, "dur": 0, "args": { "External id": 60027, "cbid": 317, "correlation": 60027 } }, { "ph": "f", "id": 60027, "pid": 76337, "tid": -914061504, "ts": 1716454222269530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222269531, "dur": 0, "args": { "External id": 60028, "cbid": 203, "correlation": 60028 } }, { "ph": "f", "id": 60028, "pid": 76337, "tid": -914061504, "ts": 1716454222269531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222269532, "dur": 0, "args": { "External id": 60029, "cbid": 205, "correlation": 60029 } }, { "ph": "f", "id": 60029, "pid": 76337, "tid": -914061504, "ts": 1716454222269532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269556, "dur": 1, "args": { "External id": 60033, "cbid": 251, "correlation": 60033 } }, { "ph": "f", "id": 60033, "pid": 76337, "tid": -914061504, "ts": 1716454222269556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269558, "dur": 0, "args": { "External id": 60034, "cbid": 251, "correlation": 60034 } }, { "ph": "f", "id": 60034, "pid": 76337, "tid": -914061504, "ts": 1716454222269558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269559, "dur": 0, "args": { "External id": 60035, "cbid": 251, "correlation": 60035 } }, { "ph": "f", "id": 60035, "pid": 76337, "tid": -914061504, "ts": 1716454222269559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269559, "dur": 0, "args": { "External id": 60036, "cbid": 251, "correlation": 60036 } }, { "ph": "f", "id": 60036, "pid": 76337, "tid": -914061504, "ts": 1716454222269559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269560, "dur": 0, "args": { "External id": 60037, "cbid": 251, "correlation": 60037 } }, { "ph": "f", "id": 60037, "pid": 76337, "tid": -914061504, "ts": 1716454222269560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269561, "dur": 0, "args": { "External id": 60038, "cbid": 251, "correlation": 60038 } }, { "ph": "f", "id": 60038, "pid": 76337, "tid": -914061504, "ts": 1716454222269561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269562, "dur": 0, "args": { "External id": 60039, "cbid": 251, "correlation": 60039 } }, { "ph": "f", "id": 60039, "pid": 76337, "tid": -914061504, "ts": 1716454222269562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269563, "dur": 0, "args": { "External id": 60040, "cbid": 251, "correlation": 60040 } }, { "ph": "f", "id": 60040, "pid": 76337, "tid": -914061504, "ts": 1716454222269563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222269564, "dur": 0, "args": { "External id": 60041, "cbid": 251, "correlation": 60041 } }, { "ph": "f", "id": 60041, "pid": 76337, "tid": -914061504, "ts": 1716454222269564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222294017, "dur": 121, "args": { "External id": 60042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60042, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60042, "pid": 5, "tid": 7, "ts": 1716454222294017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269566, "dur": 12, "args": { "External id": 60042, "cbid": 211, "correlation": 60042 } }, { "ph": "s", "id": 60042, "pid": 76337, "tid": -914061504, "ts": 1716454222269566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222294140, "dur": 63, "args": { "External id": 60048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60048, "pid": 5, "tid": 7, "ts": 1716454222294140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269601, "dur": 9, "args": { "External id": 60048, "cbid": 211, "correlation": 60048 } }, { "ph": "s", "id": 60048, "pid": 76337, "tid": -914061504, "ts": 1716454222269601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222294205, "dur": 51, "args": { "External id": 60056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60056, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60056, "pid": 5, "tid": 7, "ts": 1716454222294205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269634, "dur": 8, "args": { "External id": 60056, "cbid": 211, "correlation": 60056 } }, { "ph": "s", "id": 60056, "pid": 76337, "tid": -914061504, "ts": 1716454222269634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222294257, "dur": 55, "args": { "External id": 60076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60076, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60076, "pid": 5, "tid": 7, "ts": 1716454222294257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269706, "dur": 11, "args": { "External id": 60076, "cbid": 211, "correlation": 60076 } }, { "ph": "s", "id": 60076, "pid": 76337, "tid": -914061504, "ts": 1716454222269706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222294313, "dur": 5, "args": { "External id": 60088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60088, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60088, "pid": 5, "tid": 7, "ts": 1716454222294313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269727, "dur": 10, "args": { "External id": 60088, "cbid": 211, "correlation": 60088 } }, { "ph": "s", "id": 60088, "pid": 76337, "tid": -914061504, "ts": 1716454222269727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222294319, "dur": 60, "args": { "External id": 60091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60091, "pid": 5, "tid": 7, "ts": 1716454222294319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269749, "dur": 7, "args": { "External id": 60091, "cbid": 211, "correlation": 60091 } }, { "ph": "s", "id": 60091, "pid": 76337, "tid": -914061504, "ts": 1716454222269749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222294380, "dur": 38, "args": { "External id": 60100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60100, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60100, "pid": 5, "tid": 7, "ts": 1716454222294380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269791, "dur": 10, "args": { "External id": 60100, "cbid": 211, "correlation": 60100 } }, { "ph": "s", "id": 60100, "pid": 76337, "tid": -914061504, "ts": 1716454222269791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222269843, "dur": 0, "args": { "External id": 60110, "cbid": 317, "correlation": 60110 } }, { "ph": "f", "id": 60110, "pid": 76337, "tid": -914061504, "ts": 1716454222269843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222269844, "dur": 0, "args": { "External id": 60111, "cbid": 203, "correlation": 60111 } }, { "ph": "f", "id": 60111, "pid": 76337, "tid": -914061504, "ts": 1716454222269844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222269845, "dur": 0, "args": { "External id": 60112, "cbid": 205, "correlation": 60112 } }, { "ph": "f", "id": 60112, "pid": 76337, "tid": -914061504, "ts": 1716454222269845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222294419, "dur": 42, "args": { "External id": 60116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60116, "pid": 5, "tid": 7, "ts": 1716454222294419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269860, "dur": 12, "args": { "External id": 60116, "cbid": 211, "correlation": 60116 } }, { "ph": "s", "id": 60116, "pid": 76337, "tid": -914061504, "ts": 1716454222269860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222294463, "dur": 16, "args": { "External id": 60118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60118, "pid": 5, "tid": 7, "ts": 1716454222294463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269874, "dur": 5, "args": { "External id": 60118, "cbid": 211, "correlation": 60118 } }, { "ph": "s", "id": 60118, "pid": 76337, "tid": -914061504, "ts": 1716454222269874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222294480, "dur": 4, "args": { "External id": 60120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60120, "pid": 5, "tid": 7, "ts": 1716454222294480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269884, "dur": 6, "args": { "External id": 60120, "cbid": 211, "correlation": 60120 } }, { "ph": "s", "id": 60120, "pid": 76337, "tid": -914061504, "ts": 1716454222269884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222269893, "dur": 0, "args": { "External id": 60121, "cbid": 51, "correlation": 60121 } }, { "ph": "s", "id": 60121, "pid": 76337, "tid": -914061504, "ts": 1716454222269893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222294485, "dur": 769, "args": { "External id": 60122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60122, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60122, "pid": 5, "tid": 7, "ts": 1716454222294485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269893, "dur": 5, "args": { "External id": 60122, "cbid": 211, "correlation": 60122 } }, { "ph": "s", "id": 60122, "pid": 76337, "tid": -914061504, "ts": 1716454222269893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222295256, "dur": 63, "args": { "External id": 60127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60127, "pid": 5, "tid": 7, "ts": 1716454222295256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269921, "dur": 8, "args": { "External id": 60127, "cbid": 211, "correlation": 60127 } }, { "ph": "s", "id": 60127, "pid": 76337, "tid": -914061504, "ts": 1716454222269921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222295320, "dur": 4, "args": { "External id": 60135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60135, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60135, "pid": 5, "tid": 7, "ts": 1716454222295320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222269964, "dur": 17, "args": { "External id": 60135, "cbid": 211, "correlation": 60135 } }, { "ph": "s", "id": 60135, "pid": 76337, "tid": -914061504, "ts": 1716454222269964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270038, "dur": 1, "args": { "External id": 60151, "cbid": 251, "correlation": 60151 } }, { "ph": "f", "id": 60151, "pid": 76337, "tid": -914061504, "ts": 1716454222270038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270047, "dur": 0, "args": { "External id": 60153, "cbid": 251, "correlation": 60153 } }, { "ph": "f", "id": 60153, "pid": 76337, "tid": -914061504, "ts": 1716454222270047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222295325, "dur": 12, "args": { "External id": 60154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60154, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 60154, "pid": 5, "tid": 7, "ts": 1716454222295325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270048, "dur": 12, "args": { "External id": 60154, "cbid": 211, "correlation": 60154 } }, { "ph": "s", "id": 60154, "pid": 76337, "tid": -914061504, "ts": 1716454222270048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222295339, "dur": 6, "args": { "External id": 60156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60156, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 60156, "pid": 5, "tid": 7, "ts": 1716454222295339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270062, "dur": 5, "args": { "External id": 60156, "cbid": 211, "correlation": 60156 } }, { "ph": "s", "id": 60156, "pid": 76337, "tid": -914061504, "ts": 1716454222270062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222295346, "dur": 56, "args": { "External id": 60166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60166, "pid": 5, "tid": 7, "ts": 1716454222295346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270120, "dur": 12, "args": { "External id": 60166, "cbid": 211, "correlation": 60166 } }, { "ph": "s", "id": 60166, "pid": 76337, "tid": -914061504, "ts": 1716454222270120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222295403, "dur": 54, "args": { "External id": 60186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60186, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60186, "pid": 5, "tid": 7, "ts": 1716454222295403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270185, "dur": 11, "args": { "External id": 60186, "cbid": 211, "correlation": 60186 } }, { "ph": "s", "id": 60186, "pid": 76337, "tid": -914061504, "ts": 1716454222270185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222295458, "dur": 4, "args": { "External id": 60198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60198, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60198, "pid": 5, "tid": 7, "ts": 1716454222295458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270206, "dur": 6, "args": { "External id": 60198, "cbid": 211, "correlation": 60198 } }, { "ph": "s", "id": 60198, "pid": 76337, "tid": -914061504, "ts": 1716454222270206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222295464, "dur": 60, "args": { "External id": 60201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60201, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60201, "pid": 5, "tid": 7, "ts": 1716454222295464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270224, "dur": 6, "args": { "External id": 60201, "cbid": 211, "correlation": 60201 } }, { "ph": "s", "id": 60201, "pid": 76337, "tid": -914061504, "ts": 1716454222270224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222295525, "dur": 38, "args": { "External id": 60210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60210, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60210, "pid": 5, "tid": 7, "ts": 1716454222295525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270265, "dur": 10, "args": { "External id": 60210, "cbid": 211, "correlation": 60210 } }, { "ph": "s", "id": 60210, "pid": 76337, "tid": -914061504, "ts": 1716454222270265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222270329, "dur": 0, "args": { "External id": 60220, "cbid": 317, "correlation": 60220 } }, { "ph": "f", "id": 60220, "pid": 76337, "tid": -914061504, "ts": 1716454222270329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222270330, "dur": 0, "args": { "External id": 60221, "cbid": 203, "correlation": 60221 } }, { "ph": "f", "id": 60221, "pid": 76337, "tid": -914061504, "ts": 1716454222270330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222270330, "dur": 0, "args": { "External id": 60222, "cbid": 205, "correlation": 60222 } }, { "ph": "f", "id": 60222, "pid": 76337, "tid": -914061504, "ts": 1716454222270330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222295564, "dur": 40, "args": { "External id": 60226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60226, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60226, "pid": 5, "tid": 7, "ts": 1716454222295564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270347, "dur": 12, "args": { "External id": 60226, "cbid": 211, "correlation": 60226 } }, { "ph": "s", "id": 60226, "pid": 76337, "tid": -914061504, "ts": 1716454222270347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222295606, "dur": 16, "args": { "External id": 60228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60228, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60228, "pid": 5, "tid": 7, "ts": 1716454222295606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270362, "dur": 6, "args": { "External id": 60228, "cbid": 211, "correlation": 60228 } }, { "ph": "s", "id": 60228, "pid": 76337, "tid": -914061504, "ts": 1716454222270362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222295623, "dur": 4, "args": { "External id": 60230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60230, "pid": 5, "tid": 7, "ts": 1716454222295623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270372, "dur": 5, "args": { "External id": 60230, "cbid": 211, "correlation": 60230 } }, { "ph": "s", "id": 60230, "pid": 76337, "tid": -914061504, "ts": 1716454222270372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222270381, "dur": 0, "args": { "External id": 60231, "cbid": 51, "correlation": 60231 } }, { "ph": "s", "id": 60231, "pid": 76337, "tid": -914061504, "ts": 1716454222270381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222295628, "dur": 764, "args": { "External id": 60232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60232, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60232, "pid": 5, "tid": 7, "ts": 1716454222295628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270381, "dur": 5, "args": { "External id": 60232, "cbid": 211, "correlation": 60232 } }, { "ph": "s", "id": 60232, "pid": 76337, "tid": -914061504, "ts": 1716454222270381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222296393, "dur": 62, "args": { "External id": 60237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60237, "pid": 5, "tid": 7, "ts": 1716454222296393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270409, "dur": 8, "args": { "External id": 60237, "cbid": 211, "correlation": 60237 } }, { "ph": "s", "id": 60237, "pid": 76337, "tid": -914061504, "ts": 1716454222270409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222296457, "dur": 50, "args": { "External id": 60245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60245, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60245, "pid": 5, "tid": 7, "ts": 1716454222296457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270441, "dur": 8, "args": { "External id": 60245, "cbid": 211, "correlation": 60245 } }, { "ph": "s", "id": 60245, "pid": 76337, "tid": -914061504, "ts": 1716454222270441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222296508, "dur": 36, "args": { "External id": 60253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60253, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60253, "pid": 5, "tid": 7, "ts": 1716454222296508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270470, "dur": 8, "args": { "External id": 60253, "cbid": 211, "correlation": 60253 } }, { "ph": "s", "id": 60253, "pid": 76337, "tid": -914061504, "ts": 1716454222270470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222296545, "dur": 55, "args": { "External id": 60273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60273, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60273, "pid": 5, "tid": 7, "ts": 1716454222296545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270550, "dur": 12, "args": { "External id": 60273, "cbid": 211, "correlation": 60273 } }, { "ph": "s", "id": 60273, "pid": 76337, "tid": -914061504, "ts": 1716454222270550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222296602, "dur": 4, "args": { "External id": 60285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60285, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60285, "pid": 5, "tid": 7, "ts": 1716454222296602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270572, "dur": 6, "args": { "External id": 60285, "cbid": 211, "correlation": 60285 } }, { "ph": "s", "id": 60285, "pid": 76337, "tid": -914061504, "ts": 1716454222270572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222296608, "dur": 58, "args": { "External id": 60288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60288, "pid": 5, "tid": 7, "ts": 1716454222296608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270592, "dur": 7, "args": { "External id": 60288, "cbid": 211, "correlation": 60288 } }, { "ph": "s", "id": 60288, "pid": 76337, "tid": -914061504, "ts": 1716454222270592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222270652, "dur": 0, "args": { "External id": 60299, "cbid": 317, "correlation": 60299 } }, { "ph": "f", "id": 60299, "pid": 76337, "tid": -914061504, "ts": 1716454222270652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222270653, "dur": 0, "args": { "External id": 60300, "cbid": 203, "correlation": 60300 } }, { "ph": "f", "id": 60300, "pid": 76337, "tid": -914061504, "ts": 1716454222270653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222270653, "dur": 0, "args": { "External id": 60301, "cbid": 205, "correlation": 60301 } }, { "ph": "f", "id": 60301, "pid": 76337, "tid": -914061504, "ts": 1716454222270653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270676, "dur": 1, "args": { "External id": 60305, "cbid": 251, "correlation": 60305 } }, { "ph": "f", "id": 60305, "pid": 76337, "tid": -914061504, "ts": 1716454222270676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270678, "dur": 0, "args": { "External id": 60306, "cbid": 251, "correlation": 60306 } }, { "ph": "f", "id": 60306, "pid": 76337, "tid": -914061504, "ts": 1716454222270678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270679, "dur": 0, "args": { "External id": 60307, "cbid": 251, "correlation": 60307 } }, { "ph": "f", "id": 60307, "pid": 76337, "tid": -914061504, "ts": 1716454222270679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270680, "dur": 0, "args": { "External id": 60308, "cbid": 251, "correlation": 60308 } }, { "ph": "f", "id": 60308, "pid": 76337, "tid": -914061504, "ts": 1716454222270680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270680, "dur": 0, "args": { "External id": 60309, "cbid": 251, "correlation": 60309 } }, { "ph": "f", "id": 60309, "pid": 76337, "tid": -914061504, "ts": 1716454222270680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270681, "dur": 0, "args": { "External id": 60310, "cbid": 251, "correlation": 60310 } }, { "ph": "f", "id": 60310, "pid": 76337, "tid": -914061504, "ts": 1716454222270681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270682, "dur": 0, "args": { "External id": 60311, "cbid": 251, "correlation": 60311 } }, { "ph": "f", "id": 60311, "pid": 76337, "tid": -914061504, "ts": 1716454222270682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270683, "dur": 0, "args": { "External id": 60312, "cbid": 251, "correlation": 60312 } }, { "ph": "f", "id": 60312, "pid": 76337, "tid": -914061504, "ts": 1716454222270683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270684, "dur": 0, "args": { "External id": 60313, "cbid": 251, "correlation": 60313 } }, { "ph": "f", "id": 60313, "pid": 76337, "tid": -914061504, "ts": 1716454222270684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222296667, "dur": 121, "args": { "External id": 60314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60314, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60314, "pid": 5, "tid": 7, "ts": 1716454222296667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270686, "dur": 13, "args": { "External id": 60314, "cbid": 211, "correlation": 60314 } }, { "ph": "s", "id": 60314, "pid": 76337, "tid": -914061504, "ts": 1716454222270686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222296790, "dur": 63, "args": { "External id": 60320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60320, "pid": 5, "tid": 7, "ts": 1716454222296790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270722, "dur": 9, "args": { "External id": 60320, "cbid": 211, "correlation": 60320 } }, { "ph": "s", "id": 60320, "pid": 76337, "tid": -914061504, "ts": 1716454222270722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222296855, "dur": 675, "args": { "External id": 60329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60329, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60329, "pid": 5, "tid": 7, "ts": 1716454222296855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270806, "dur": 14, "args": { "External id": 60329, "cbid": 211, "correlation": 60329 } }, { "ph": "s", "id": 60329, "pid": 76337, "tid": -914061504, "ts": 1716454222270806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222297531, "dur": 199, "args": { "External id": 60351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60351, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60351, "pid": 5, "tid": 7, "ts": 1716454222297531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270862, "dur": 10, "args": { "External id": 60351, "cbid": 211, "correlation": 60351 } }, { "ph": "s", "id": 60351, "pid": 76337, "tid": -914061504, "ts": 1716454222270862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222270948, "dur": 1, "args": { "External id": 60362, "cbid": 251, "correlation": 60362 } }, { "ph": "f", "id": 60362, "pid": 76337, "tid": -914061504, "ts": 1716454222270948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222297731, "dur": 217, "args": { "External id": 60363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60363, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60363, "pid": 5, "tid": 7, "ts": 1716454222297731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222270953, "dur": 16, "args": { "External id": 60363, "cbid": 211, "correlation": 60363 } }, { "ph": "s", "id": 60363, "pid": 76337, "tid": -914061504, "ts": 1716454222270953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271034, "dur": 1, "args": { "External id": 60374, "cbid": 251, "correlation": 60374 } }, { "ph": "f", "id": 60374, "pid": 76337, "tid": -914061504, "ts": 1716454222271034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222297950, "dur": 202, "args": { "External id": 60375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60375, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60375, "pid": 5, "tid": 7, "ts": 1716454222297950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271038, "dur": 12, "args": { "External id": 60375, "cbid": 211, "correlation": 60375 } }, { "ph": "s", "id": 60375, "pid": 76337, "tid": -914061504, "ts": 1716454222271038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271102, "dur": 1, "args": { "External id": 60386, "cbid": 251, "correlation": 60386 } }, { "ph": "f", "id": 60386, "pid": 76337, "tid": -914061504, "ts": 1716454222271102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222298153, "dur": 202, "args": { "External id": 60387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60387, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60387, "pid": 5, "tid": 7, "ts": 1716454222298153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271106, "dur": 11, "args": { "External id": 60387, "cbid": 211, "correlation": 60387 } }, { "ph": "s", "id": 60387, "pid": 76337, "tid": -914061504, "ts": 1716454222271106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222298356, "dur": 20584, "args": { "External id": 60408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60408, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60408, "pid": 5, "tid": 7, "ts": 1716454222298356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271186, "dur": 12, "args": { "External id": 60408, "cbid": 211, "correlation": 60408 } }, { "ph": "s", "id": 60408, "pid": 76337, "tid": -914061504, "ts": 1716454222271186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271283, "dur": 1, "args": { "External id": 60426, "cbid": 251, "correlation": 60426 } }, { "ph": "f", "id": 60426, "pid": 76337, "tid": -914061504, "ts": 1716454222271283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222318942, "dur": 218, "args": { "External id": 60428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60428, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60428, "pid": 5, "tid": 7, "ts": 1716454222318942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271288, "dur": 14, "args": { "External id": 60428, "cbid": 211, "correlation": 60428 } }, { "ph": "s", "id": 60428, "pid": 76337, "tid": -914061504, "ts": 1716454222271288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222319161, "dur": 66, "args": { "External id": 60436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60436, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60436, "pid": 5, "tid": 7, "ts": 1716454222319161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271358, "dur": 12, "args": { "External id": 60436, "cbid": 211, "correlation": 60436 } }, { "ph": "s", "id": 60436, "pid": 76337, "tid": -914061504, "ts": 1716454222271358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222319228, "dur": 99, "args": { "External id": 60444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60444, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60444, "pid": 5, "tid": 7, "ts": 1716454222319228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271400, "dur": 10, "args": { "External id": 60444, "cbid": 211, "correlation": 60444 } }, { "ph": "s", "id": 60444, "pid": 76337, "tid": -914061504, "ts": 1716454222271400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222319328, "dur": 56, "args": { "External id": 60455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60455, "pid": 5, "tid": 7, "ts": 1716454222319328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271473, "dur": 13, "args": { "External id": 60455, "cbid": 211, "correlation": 60455 } }, { "ph": "s", "id": 60455, "pid": 76337, "tid": -914061504, "ts": 1716454222271473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222319386, "dur": 101, "args": { "External id": 60477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60477, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60477, "pid": 5, "tid": 7, "ts": 1716454222319386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271504, "dur": 9, "args": { "External id": 60477, "cbid": 211, "correlation": 60477 } }, { "ph": "s", "id": 60477, "pid": 76337, "tid": -914061504, "ts": 1716454222271504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271588, "dur": 1, "args": { "External id": 60488, "cbid": 251, "correlation": 60488 } }, { "ph": "f", "id": 60488, "pid": 76337, "tid": -914061504, "ts": 1716454222271588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222319488, "dur": 111, "args": { "External id": 60489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60489, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60489, "pid": 5, "tid": 7, "ts": 1716454222319488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271593, "dur": 12, "args": { "External id": 60489, "cbid": 211, "correlation": 60489 } }, { "ph": "s", "id": 60489, "pid": 76337, "tid": -914061504, "ts": 1716454222271593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271664, "dur": 1, "args": { "External id": 60500, "cbid": 251, "correlation": 60500 } }, { "ph": "f", "id": 60500, "pid": 76337, "tid": -914061504, "ts": 1716454222271664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271668, "dur": 0, "args": { "External id": 60501, "cbid": 251, "correlation": 60501 } }, { "ph": "f", "id": 60501, "pid": 76337, "tid": -914061504, "ts": 1716454222271668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222319600, "dur": 10, "args": { "External id": 60502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60502, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 60502, "pid": 5, "tid": 7, "ts": 1716454222319600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271670, "dur": 12, "args": { "External id": 60502, "cbid": 211, "correlation": 60502 } }, { "ph": "s", "id": 60502, "pid": 76337, "tid": -914061504, "ts": 1716454222271670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222319612, "dur": 6, "args": { "External id": 60504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60504, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 60504, "pid": 5, "tid": 7, "ts": 1716454222319612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271683, "dur": 6, "args": { "External id": 60504, "cbid": 211, "correlation": 60504 } }, { "ph": "s", "id": 60504, "pid": 76337, "tid": -914061504, "ts": 1716454222271683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271745, "dur": 1, "args": { "External id": 60515, "cbid": 251, "correlation": 60515 } }, { "ph": "f", "id": 60515, "pid": 76337, "tid": -914061504, "ts": 1716454222271745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271748, "dur": 0, "args": { "External id": 60516, "cbid": 251, "correlation": 60516 } }, { "ph": "f", "id": 60516, "pid": 76337, "tid": -914061504, "ts": 1716454222271748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222319619, "dur": 7, "args": { "External id": 60517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60517, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 60517, "pid": 5, "tid": 7, "ts": 1716454222319619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271749, "dur": 14, "args": { "External id": 60517, "cbid": 211, "correlation": 60517 } }, { "ph": "s", "id": 60517, "pid": 76337, "tid": -914061504, "ts": 1716454222271749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222319628, "dur": 4, "args": { "External id": 60519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60519, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 60519, "pid": 5, "tid": 7, "ts": 1716454222319628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271765, "dur": 6, "args": { "External id": 60519, "cbid": 211, "correlation": 60519 } }, { "ph": "s", "id": 60519, "pid": 76337, "tid": -914061504, "ts": 1716454222271765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222319633, "dur": 169, "args": { "External id": 60540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60540, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60540, "pid": 5, "tid": 7, "ts": 1716454222319633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271840, "dur": 12, "args": { "External id": 60540, "cbid": 211, "correlation": 60540 } }, { "ph": "s", "id": 60540, "pid": 76337, "tid": -914061504, "ts": 1716454222271840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222271935, "dur": 1, "args": { "External id": 60558, "cbid": 251, "correlation": 60558 } }, { "ph": "f", "id": 60558, "pid": 76337, "tid": -914061504, "ts": 1716454222271935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222319804, "dur": 116, "args": { "External id": 60560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60560, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60560, "pid": 5, "tid": 7, "ts": 1716454222319804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222271941, "dur": 13, "args": { "External id": 60560, "cbid": 211, "correlation": 60560 } }, { "ph": "s", "id": 60560, "pid": 76337, "tid": -914061504, "ts": 1716454222271941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222319921, "dur": 35, "args": { "External id": 60568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60568, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60568, "pid": 5, "tid": 7, "ts": 1716454222319921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272017, "dur": 12, "args": { "External id": 60568, "cbid": 211, "correlation": 60568 } }, { "ph": "s", "id": 60568, "pid": 76337, "tid": -914061504, "ts": 1716454222272017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222319957, "dur": 70, "args": { "External id": 60576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60576, "pid": 5, "tid": 7, "ts": 1716454222319957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272059, "dur": 9, "args": { "External id": 60576, "cbid": 211, "correlation": 60576 } }, { "ph": "s", "id": 60576, "pid": 76337, "tid": -914061504, "ts": 1716454222272059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222320029, "dur": 101, "args": { "External id": 60598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60598, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60598, "pid": 5, "tid": 7, "ts": 1716454222320029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272113, "dur": 10, "args": { "External id": 60598, "cbid": 211, "correlation": 60598 } }, { "ph": "s", "id": 60598, "pid": 76337, "tid": -914061504, "ts": 1716454222272113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272201, "dur": 1, "args": { "External id": 60614, "cbid": 251, "correlation": 60614 } }, { "ph": "f", "id": 60614, "pid": 76337, "tid": -914061504, "ts": 1716454222272201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222320132, "dur": 629, "args": { "External id": 60616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60616, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60616, "pid": 5, "tid": 7, "ts": 1716454222320132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272206, "dur": 12, "args": { "External id": 60616, "cbid": 211, "correlation": 60616 } }, { "ph": "s", "id": 60616, "pid": 76337, "tid": -914061504, "ts": 1716454222272206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222320762, "dur": 262, "args": { "External id": 60624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60624, "pid": 5, "tid": 7, "ts": 1716454222320762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272271, "dur": 12, "args": { "External id": 60624, "cbid": 211, "correlation": 60624 } }, { "ph": "s", "id": 60624, "pid": 76337, "tid": -914061504, "ts": 1716454222272271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222321025, "dur": 260, "args": { "External id": 60632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60632, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60632, "pid": 5, "tid": 7, "ts": 1716454222321025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272302, "dur": 8, "args": { "External id": 60632, "cbid": 211, "correlation": 60632 } }, { "ph": "s", "id": 60632, "pid": 76337, "tid": -914061504, "ts": 1716454222272302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272382, "dur": 1, "args": { "External id": 60648, "cbid": 251, "correlation": 60648 } }, { "ph": "f", "id": 60648, "pid": 76337, "tid": -914061504, "ts": 1716454222272382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272388, "dur": 0, "args": { "External id": 60650, "cbid": 251, "correlation": 60650 } }, { "ph": "f", "id": 60650, "pid": 76337, "tid": -914061504, "ts": 1716454222272388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222321287, "dur": 379, "args": { "External id": 60651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60651, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 60651, "pid": 5, "tid": 7, "ts": 1716454222321287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272390, "dur": 13, "args": { "External id": 60651, "cbid": 211, "correlation": 60651 } }, { "ph": "s", "id": 60651, "pid": 76337, "tid": -914061504, "ts": 1716454222272390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222321667, "dur": 50, "args": { "External id": 60659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60659, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60659, "pid": 5, "tid": 7, "ts": 1716454222321667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272432, "dur": 10, "args": { "External id": 60659, "cbid": 211, "correlation": 60659 } }, { "ph": "s", "id": 60659, "pid": 76337, "tid": -914061504, "ts": 1716454222272432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222321719, "dur": 171, "args": { "External id": 60670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60670, "pid": 5, "tid": 7, "ts": 1716454222321719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272500, "dur": 15, "args": { "External id": 60670, "cbid": 211, "correlation": 60670 } }, { "ph": "s", "id": 60670, "pid": 76337, "tid": -914061504, "ts": 1716454222272500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222272566, "dur": 0, "args": { "External id": 60682, "cbid": 317, "correlation": 60682 } }, { "ph": "f", "id": 60682, "pid": 76337, "tid": -914061504, "ts": 1716454222272566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222272567, "dur": 0, "args": { "External id": 60683, "cbid": 203, "correlation": 60683 } }, { "ph": "f", "id": 60683, "pid": 76337, "tid": -914061504, "ts": 1716454222272567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222272568, "dur": 0, "args": { "External id": 60684, "cbid": 205, "correlation": 60684 } }, { "ph": "f", "id": 60684, "pid": 76337, "tid": -914061504, "ts": 1716454222272568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272590, "dur": 1, "args": { "External id": 60688, "cbid": 251, "correlation": 60688 } }, { "ph": "f", "id": 60688, "pid": 76337, "tid": -914061504, "ts": 1716454222272590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272592, "dur": 0, "args": { "External id": 60689, "cbid": 251, "correlation": 60689 } }, { "ph": "f", "id": 60689, "pid": 76337, "tid": -914061504, "ts": 1716454222272592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272592, "dur": 0, "args": { "External id": 60690, "cbid": 251, "correlation": 60690 } }, { "ph": "f", "id": 60690, "pid": 76337, "tid": -914061504, "ts": 1716454222272592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272593, "dur": 0, "args": { "External id": 60691, "cbid": 251, "correlation": 60691 } }, { "ph": "f", "id": 60691, "pid": 76337, "tid": -914061504, "ts": 1716454222272593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272594, "dur": 0, "args": { "External id": 60692, "cbid": 251, "correlation": 60692 } }, { "ph": "f", "id": 60692, "pid": 76337, "tid": -914061504, "ts": 1716454222272594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272594, "dur": 0, "args": { "External id": 60693, "cbid": 251, "correlation": 60693 } }, { "ph": "f", "id": 60693, "pid": 76337, "tid": -914061504, "ts": 1716454222272594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272595, "dur": 0, "args": { "External id": 60694, "cbid": 251, "correlation": 60694 } }, { "ph": "f", "id": 60694, "pid": 76337, "tid": -914061504, "ts": 1716454222272595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272596, "dur": 0, "args": { "External id": 60695, "cbid": 251, "correlation": 60695 } }, { "ph": "f", "id": 60695, "pid": 76337, "tid": -914061504, "ts": 1716454222272596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222272597, "dur": 0, "args": { "External id": 60696, "cbid": 251, "correlation": 60696 } }, { "ph": "f", "id": 60696, "pid": 76337, "tid": -914061504, "ts": 1716454222272597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222321892, "dur": 122, "args": { "External id": 60697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60697, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60697, "pid": 5, "tid": 7, "ts": 1716454222321892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272599, "dur": 13, "args": { "External id": 60697, "cbid": 211, "correlation": 60697 } }, { "ph": "s", "id": 60697, "pid": 76337, "tid": -914061504, "ts": 1716454222272599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222322016, "dur": 63, "args": { "External id": 60703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60703, "pid": 5, "tid": 7, "ts": 1716454222322016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272634, "dur": 8, "args": { "External id": 60703, "cbid": 211, "correlation": 60703 } }, { "ph": "s", "id": 60703, "pid": 76337, "tid": -914061504, "ts": 1716454222272634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222322080, "dur": 50, "args": { "External id": 60711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60711, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60711, "pid": 5, "tid": 7, "ts": 1716454222322080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272667, "dur": 9, "args": { "External id": 60711, "cbid": 211, "correlation": 60711 } }, { "ph": "s", "id": 60711, "pid": 76337, "tid": -914061504, "ts": 1716454222272667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222272747, "dur": 0, "args": { "External id": 60721, "cbid": 317, "correlation": 60721 } }, { "ph": "f", "id": 60721, "pid": 76337, "tid": -914061504, "ts": 1716454222272747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222272748, "dur": 0, "args": { "External id": 60722, "cbid": 203, "correlation": 60722 } }, { "ph": "f", "id": 60722, "pid": 76337, "tid": -914061504, "ts": 1716454222272748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222272748, "dur": 0, "args": { "External id": 60723, "cbid": 205, "correlation": 60723 } }, { "ph": "f", "id": 60723, "pid": 76337, "tid": -914061504, "ts": 1716454222272748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222322131, "dur": 42, "args": { "External id": 60727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60727, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60727, "pid": 5, "tid": 7, "ts": 1716454222322131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272767, "dur": 13, "args": { "External id": 60727, "cbid": 211, "correlation": 60727 } }, { "ph": "s", "id": 60727, "pid": 76337, "tid": -914061504, "ts": 1716454222272767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222322175, "dur": 16, "args": { "External id": 60729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60729, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60729, "pid": 5, "tid": 7, "ts": 1716454222322175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272782, "dur": 5, "args": { "External id": 60729, "cbid": 211, "correlation": 60729 } }, { "ph": "s", "id": 60729, "pid": 76337, "tid": -914061504, "ts": 1716454222272782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222322193, "dur": 1, "args": { "External id": 60731, "device": 5, "context": 1, "stream": 7, "correlation": 60731, "bytes": 1536, "memory bandwidth (GB/s)": 0.8571428571428571 } }, { "ph": "f", "id": 60731, "pid": 5, "tid": 7, "ts": 1716454222322193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222272801, "dur": 16, "args": { "External id": 60731, "cbid": 51, "correlation": 60731 } }, { "ph": "s", "id": 60731, "pid": 76337, "tid": -914061504, "ts": 1716454222272801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222322197, "dur": 395, "args": { "External id": 60732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60732, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60732, "pid": 5, "tid": 7, "ts": 1716454222322197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272844, "dur": 30, "args": { "External id": 60732, "cbid": 211, "correlation": 60732 } }, { "ph": "s", "id": 60732, "pid": 76337, "tid": -914061504, "ts": 1716454222272844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222322594, "dur": 13, "args": { "External id": 60734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60734, "pid": 5, "tid": 7, "ts": 1716454222322594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272883, "dur": 8, "args": { "External id": 60734, "cbid": 211, "correlation": 60734 } }, { "ph": "s", "id": 60734, "pid": 76337, "tid": -914061504, "ts": 1716454222272883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222322609, "dur": 16, "args": { "External id": 60740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60740, "pid": 5, "tid": 7, "ts": 1716454222322609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222272918, "dur": 9, "args": { "External id": 60740, "cbid": 211, "correlation": 60740 } }, { "ph": "s", "id": 60740, "pid": 76337, "tid": -914061504, "ts": 1716454222272918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222322627, "dur": 20, "args": { "External id": 60760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60760, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60760, "pid": 5, "tid": 7, "ts": 1716454222322627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273021, "dur": 13, "args": { "External id": 60760, "cbid": 211, "correlation": 60760 } }, { "ph": "s", "id": 60760, "pid": 76337, "tid": -914061504, "ts": 1716454222273021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222322648, "dur": 5, "args": { "External id": 60772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60772, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60772, "pid": 5, "tid": 7, "ts": 1716454222322648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273043, "dur": 7, "args": { "External id": 60772, "cbid": 211, "correlation": 60772 } }, { "ph": "s", "id": 60772, "pid": 76337, "tid": -914061504, "ts": 1716454222273043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222322654, "dur": 19, "args": { "External id": 60775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60775, "pid": 5, "tid": 7, "ts": 1716454222322654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273063, "dur": 7, "args": { "External id": 60775, "cbid": 211, "correlation": 60775 } }, { "ph": "s", "id": 60775, "pid": 76337, "tid": -914061504, "ts": 1716454222273063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222322675, "dur": 12, "args": { "External id": 60784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60784, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60784, "pid": 5, "tid": 7, "ts": 1716454222322675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273102, "dur": 10, "args": { "External id": 60784, "cbid": 211, "correlation": 60784 } }, { "ph": "s", "id": 60784, "pid": 76337, "tid": -914061504, "ts": 1716454222273102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222273157, "dur": 0, "args": { "External id": 60794, "cbid": 317, "correlation": 60794 } }, { "ph": "f", "id": 60794, "pid": 76337, "tid": -914061504, "ts": 1716454222273157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222273158, "dur": 0, "args": { "External id": 60795, "cbid": 203, "correlation": 60795 } }, { "ph": "f", "id": 60795, "pid": 76337, "tid": -914061504, "ts": 1716454222273158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222273159, "dur": 0, "args": { "External id": 60796, "cbid": 205, "correlation": 60796 } }, { "ph": "f", "id": 60796, "pid": 76337, "tid": -914061504, "ts": 1716454222273159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222322689, "dur": 12, "args": { "External id": 60800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60800, "pid": 5, "tid": 7, "ts": 1716454222322689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273178, "dur": 12, "args": { "External id": 60800, "cbid": 211, "correlation": 60800 } }, { "ph": "s", "id": 60800, "pid": 76337, "tid": -914061504, "ts": 1716454222273178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222322703, "dur": 26, "args": { "External id": 60802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60802, "pid": 5, "tid": 7, "ts": 1716454222322703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273192, "dur": 5, "args": { "External id": 60802, "cbid": 211, "correlation": 60802 } }, { "ph": "s", "id": 60802, "pid": 76337, "tid": -914061504, "ts": 1716454222273192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222322731, "dur": 4, "args": { "External id": 60804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60804, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 60804, "pid": 5, "tid": 7, "ts": 1716454222322731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273209, "dur": 8, "args": { "External id": 60804, "cbid": 211, "correlation": 60804 } }, { "ph": "s", "id": 60804, "pid": 76337, "tid": -914061504, "ts": 1716454222273209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222273220, "dur": 0, "args": { "External id": 60805, "cbid": 51, "correlation": 60805 } }, { "ph": "s", "id": 60805, "pid": 76337, "tid": -914061504, "ts": 1716454222273220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222322736, "dur": 387, "args": { "External id": 60806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60806, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60806, "pid": 5, "tid": 7, "ts": 1716454222322736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273221, "dur": 8, "args": { "External id": 60806, "cbid": 211, "correlation": 60806 } }, { "ph": "s", "id": 60806, "pid": 76337, "tid": -914061504, "ts": 1716454222273221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222323124, "dur": 21, "args": { "External id": 60807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60807, "pid": 5, "tid": 7, "ts": 1716454222323124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273232, "dur": 5, "args": { "External id": 60807, "cbid": 211, "correlation": 60807 } }, { "ph": "s", "id": 60807, "pid": 76337, "tid": -914061504, "ts": 1716454222273232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222323147, "dur": 35, "args": { "External id": 60813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60813, "pid": 5, "tid": 7, "ts": 1716454222323147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273259, "dur": 9, "args": { "External id": 60813, "cbid": 211, "correlation": 60813 } }, { "ph": "s", "id": 60813, "pid": 76337, "tid": -914061504, "ts": 1716454222273259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222323184, "dur": 4, "args": { "External id": 60821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60821, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 60821, "pid": 5, "tid": 7, "ts": 1716454222323184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273303, "dur": 9, "args": { "External id": 60821, "cbid": 211, "correlation": 60821 } }, { "ph": "s", "id": 60821, "pid": 76337, "tid": -914061504, "ts": 1716454222273303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273369, "dur": 1, "args": { "External id": 60837, "cbid": 251, "correlation": 60837 } }, { "ph": "f", "id": 60837, "pid": 76337, "tid": -914061504, "ts": 1716454222273369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273374, "dur": 0, "args": { "External id": 60839, "cbid": 251, "correlation": 60839 } }, { "ph": "f", "id": 60839, "pid": 76337, "tid": -914061504, "ts": 1716454222273374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222323189, "dur": 13, "args": { "External id": 60840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60840, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 60840, "pid": 5, "tid": 7, "ts": 1716454222323189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273376, "dur": 12, "args": { "External id": 60840, "cbid": 211, "correlation": 60840 } }, { "ph": "s", "id": 60840, "pid": 76337, "tid": -914061504, "ts": 1716454222273376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222323203, "dur": 5, "args": { "External id": 60842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60842, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 60842, "pid": 5, "tid": 7, "ts": 1716454222323203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273390, "dur": 6, "args": { "External id": 60842, "cbid": 211, "correlation": 60842 } }, { "ph": "s", "id": 60842, "pid": 76337, "tid": -914061504, "ts": 1716454222273390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222323210, "dur": 31, "args": { "External id": 60852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60852, "pid": 5, "tid": 7, "ts": 1716454222323210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273448, "dur": 13, "args": { "External id": 60852, "cbid": 211, "correlation": 60852 } }, { "ph": "s", "id": 60852, "pid": 76337, "tid": -914061504, "ts": 1716454222273448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222323242, "dur": 32, "args": { "External id": 60872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60872, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60872, "pid": 5, "tid": 7, "ts": 1716454222323242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273516, "dur": 15, "args": { "External id": 60872, "cbid": 211, "correlation": 60872 } }, { "ph": "s", "id": 60872, "pid": 76337, "tid": -914061504, "ts": 1716454222273516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222323276, "dur": 4, "args": { "External id": 60884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60884, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 60884, "pid": 5, "tid": 7, "ts": 1716454222323276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273541, "dur": 7, "args": { "External id": 60884, "cbid": 211, "correlation": 60884 } }, { "ph": "s", "id": 60884, "pid": 76337, "tid": -914061504, "ts": 1716454222273541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222323282, "dur": 32, "args": { "External id": 60887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60887, "pid": 5, "tid": 7, "ts": 1716454222323282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273560, "dur": 6, "args": { "External id": 60887, "cbid": 211, "correlation": 60887 } }, { "ph": "s", "id": 60887, "pid": 76337, "tid": -914061504, "ts": 1716454222273560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222323315, "dur": 22, "args": { "External id": 60896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60896, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60896, "pid": 5, "tid": 7, "ts": 1716454222323315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273601, "dur": 9, "args": { "External id": 60896, "cbid": 211, "correlation": 60896 } }, { "ph": "s", "id": 60896, "pid": 76337, "tid": -914061504, "ts": 1716454222273601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222273663, "dur": 0, "args": { "External id": 60906, "cbid": 317, "correlation": 60906 } }, { "ph": "f", "id": 60906, "pid": 76337, "tid": -914061504, "ts": 1716454222273663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222273664, "dur": 0, "args": { "External id": 60907, "cbid": 203, "correlation": 60907 } }, { "ph": "f", "id": 60907, "pid": 76337, "tid": -914061504, "ts": 1716454222273664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222273665, "dur": 0, "args": { "External id": 60908, "cbid": 205, "correlation": 60908 } }, { "ph": "f", "id": 60908, "pid": 76337, "tid": -914061504, "ts": 1716454222273665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222323338, "dur": 22, "args": { "External id": 60912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60912, "pid": 5, "tid": 7, "ts": 1716454222323338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273687, "dur": 12, "args": { "External id": 60912, "cbid": 211, "correlation": 60912 } }, { "ph": "s", "id": 60912, "pid": 76337, "tid": -914061504, "ts": 1716454222273687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222323362, "dur": 49, "args": { "External id": 60914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60914, "pid": 5, "tid": 7, "ts": 1716454222323362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273702, "dur": 5, "args": { "External id": 60914, "cbid": 211, "correlation": 60914 } }, { "ph": "s", "id": 60914, "pid": 76337, "tid": -914061504, "ts": 1716454222273702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222323412, "dur": 718, "args": { "External id": 60916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60916, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 60916, "pid": 5, "tid": 7, "ts": 1716454222323412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273721, "dur": 13, "args": { "External id": 60916, "cbid": 211, "correlation": 60916 } }, { "ph": "s", "id": 60916, "pid": 76337, "tid": -914061504, "ts": 1716454222273721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222324131, "dur": 22, "args": { "External id": 60918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60918, "pid": 5, "tid": 7, "ts": 1716454222324131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273738, "dur": 5, "args": { "External id": 60918, "cbid": 211, "correlation": 60918 } }, { "ph": "s", "id": 60918, "pid": 76337, "tid": -914061504, "ts": 1716454222273738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222324155, "dur": 34, "args": { "External id": 60924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60924, "pid": 5, "tid": 7, "ts": 1716454222324155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273766, "dur": 8, "args": { "External id": 60924, "cbid": 211, "correlation": 60924 } }, { "ph": "s", "id": 60924, "pid": 76337, "tid": -914061504, "ts": 1716454222273766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222273824, "dur": 0, "args": { "External id": 60934, "cbid": 317, "correlation": 60934 } }, { "ph": "f", "id": 60934, "pid": 76337, "tid": -914061504, "ts": 1716454222273824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222273825, "dur": 0, "args": { "External id": 60935, "cbid": 203, "correlation": 60935 } }, { "ph": "f", "id": 60935, "pid": 76337, "tid": -914061504, "ts": 1716454222273825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222273825, "dur": 0, "args": { "External id": 60936, "cbid": 205, "correlation": 60936 } }, { "ph": "f", "id": 60936, "pid": 76337, "tid": -914061504, "ts": 1716454222273825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273852, "dur": 1, "args": { "External id": 60940, "cbid": 251, "correlation": 60940 } }, { "ph": "f", "id": 60940, "pid": 76337, "tid": -914061504, "ts": 1716454222273852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273854, "dur": 0, "args": { "External id": 60941, "cbid": 251, "correlation": 60941 } }, { "ph": "f", "id": 60941, "pid": 76337, "tid": -914061504, "ts": 1716454222273854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273854, "dur": 0, "args": { "External id": 60942, "cbid": 251, "correlation": 60942 } }, { "ph": "f", "id": 60942, "pid": 76337, "tid": -914061504, "ts": 1716454222273854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273855, "dur": 0, "args": { "External id": 60943, "cbid": 251, "correlation": 60943 } }, { "ph": "f", "id": 60943, "pid": 76337, "tid": -914061504, "ts": 1716454222273855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273856, "dur": 0, "args": { "External id": 60944, "cbid": 251, "correlation": 60944 } }, { "ph": "f", "id": 60944, "pid": 76337, "tid": -914061504, "ts": 1716454222273856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273856, "dur": 0, "args": { "External id": 60945, "cbid": 251, "correlation": 60945 } }, { "ph": "f", "id": 60945, "pid": 76337, "tid": -914061504, "ts": 1716454222273856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273857, "dur": 0, "args": { "External id": 60946, "cbid": 251, "correlation": 60946 } }, { "ph": "f", "id": 60946, "pid": 76337, "tid": -914061504, "ts": 1716454222273857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273858, "dur": 0, "args": { "External id": 60947, "cbid": 251, "correlation": 60947 } }, { "ph": "f", "id": 60947, "pid": 76337, "tid": -914061504, "ts": 1716454222273858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222273859, "dur": 0, "args": { "External id": 60948, "cbid": 251, "correlation": 60948 } }, { "ph": "f", "id": 60948, "pid": 76337, "tid": -914061504, "ts": 1716454222273859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222324190, "dur": 55, "args": { "External id": 60949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60949, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 60949, "pid": 5, "tid": 7, "ts": 1716454222324190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273861, "dur": 12, "args": { "External id": 60949, "cbid": 211, "correlation": 60949 } }, { "ph": "s", "id": 60949, "pid": 76337, "tid": -914061504, "ts": 1716454222273861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222324247, "dur": 35, "args": { "External id": 60955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60955, "pid": 5, "tid": 7, "ts": 1716454222324247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273895, "dur": 8, "args": { "External id": 60955, "cbid": 211, "correlation": 60955 } }, { "ph": "s", "id": 60955, "pid": 76337, "tid": -914061504, "ts": 1716454222273895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222324283, "dur": 27, "args": { "External id": 60963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60963, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60963, "pid": 5, "tid": 7, "ts": 1716454222324283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273924, "dur": 8, "args": { "External id": 60963, "cbid": 211, "correlation": 60963 } }, { "ph": "s", "id": 60963, "pid": 76337, "tid": -914061504, "ts": 1716454222273924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222324312, "dur": 20, "args": { "External id": 60971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60971, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 60971, "pid": 5, "tid": 7, "ts": 1716454222324312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222273954, "dur": 9, "args": { "External id": 60971, "cbid": 211, "correlation": 60971 } }, { "ph": "s", "id": 60971, "pid": 76337, "tid": -914061504, "ts": 1716454222273954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222324333, "dur": 32, "args": { "External id": 60991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 60991, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 60991, "pid": 5, "tid": 7, "ts": 1716454222324333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274047, "dur": 13, "args": { "External id": 60991, "cbid": 211, "correlation": 60991 } }, { "ph": "s", "id": 60991, "pid": 76337, "tid": -914061504, "ts": 1716454222274047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222324366, "dur": 4, "args": { "External id": 61003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61003, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 61003, "pid": 5, "tid": 7, "ts": 1716454222324366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274069, "dur": 6, "args": { "External id": 61003, "cbid": 211, "correlation": 61003 } }, { "ph": "s", "id": 61003, "pid": 76337, "tid": -914061504, "ts": 1716454222274069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222324372, "dur": 32, "args": { "External id": 61006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61006, "pid": 5, "tid": 7, "ts": 1716454222324372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274087, "dur": 7, "args": { "External id": 61006, "cbid": 211, "correlation": 61006 } }, { "ph": "s", "id": 61006, "pid": 76337, "tid": -914061504, "ts": 1716454222274087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222274149, "dur": 0, "args": { "External id": 61017, "cbid": 317, "correlation": 61017 } }, { "ph": "f", "id": 61017, "pid": 76337, "tid": -914061504, "ts": 1716454222274149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222274150, "dur": 0, "args": { "External id": 61018, "cbid": 203, "correlation": 61018 } }, { "ph": "f", "id": 61018, "pid": 76337, "tid": -914061504, "ts": 1716454222274150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222274151, "dur": 0, "args": { "External id": 61019, "cbid": 205, "correlation": 61019 } }, { "ph": "f", "id": 61019, "pid": 76337, "tid": -914061504, "ts": 1716454222274151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222324405, "dur": 23, "args": { "External id": 61023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61023, "pid": 5, "tid": 7, "ts": 1716454222324405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274167, "dur": 12, "args": { "External id": 61023, "cbid": 211, "correlation": 61023 } }, { "ph": "s", "id": 61023, "pid": 76337, "tid": -914061504, "ts": 1716454222274167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222324429, "dur": 130, "args": { "External id": 61025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61025, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61025, "pid": 5, "tid": 7, "ts": 1716454222324429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274188, "dur": 10, "args": { "External id": 61025, "cbid": 211, "correlation": 61025 } }, { "ph": "s", "id": 61025, "pid": 76337, "tid": -914061504, "ts": 1716454222274188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222324561, "dur": 22, "args": { "External id": 61027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61027, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61027, "pid": 5, "tid": 7, "ts": 1716454222324561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274201, "dur": 5, "args": { "External id": 61027, "cbid": 211, "correlation": 61027 } }, { "ph": "s", "id": 61027, "pid": 76337, "tid": -914061504, "ts": 1716454222274201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222324584, "dur": 35, "args": { "External id": 61033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61033, "pid": 5, "tid": 7, "ts": 1716454222324584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274229, "dur": 9, "args": { "External id": 61033, "cbid": 211, "correlation": 61033 } }, { "ph": "s", "id": 61033, "pid": 76337, "tid": -914061504, "ts": 1716454222274229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222324620, "dur": 207, "args": { "External id": 61042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61042, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61042, "pid": 5, "tid": 7, "ts": 1716454222324620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274313, "dur": 14, "args": { "External id": 61042, "cbid": 211, "correlation": 61042 } }, { "ph": "s", "id": 61042, "pid": 76337, "tid": -914061504, "ts": 1716454222274313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222324828, "dur": 71, "args": { "External id": 61064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61064, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61064, "pid": 5, "tid": 7, "ts": 1716454222324828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274373, "dur": 10, "args": { "External id": 61064, "cbid": 211, "correlation": 61064 } }, { "ph": "s", "id": 61064, "pid": 76337, "tid": -914061504, "ts": 1716454222274373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222274465, "dur": 1, "args": { "External id": 61075, "cbid": 251, "correlation": 61075 } }, { "ph": "f", "id": 61075, "pid": 76337, "tid": -914061504, "ts": 1716454222274465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222324901, "dur": 164, "args": { "External id": 61076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61076, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61076, "pid": 5, "tid": 7, "ts": 1716454222324901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274470, "dur": 13, "args": { "External id": 61076, "cbid": 211, "correlation": 61076 } }, { "ph": "s", "id": 61076, "pid": 76337, "tid": -914061504, "ts": 1716454222274470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222274545, "dur": 1, "args": { "External id": 61087, "cbid": 251, "correlation": 61087 } }, { "ph": "f", "id": 61087, "pid": 76337, "tid": -914061504, "ts": 1716454222274545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222325066, "dur": 155, "args": { "External id": 61088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61088, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61088, "pid": 5, "tid": 7, "ts": 1716454222325066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274549, "dur": 12, "args": { "External id": 61088, "cbid": 211, "correlation": 61088 } }, { "ph": "s", "id": 61088, "pid": 76337, "tid": -914061504, "ts": 1716454222274549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222274613, "dur": 1, "args": { "External id": 61099, "cbid": 251, "correlation": 61099 } }, { "ph": "f", "id": 61099, "pid": 76337, "tid": -914061504, "ts": 1716454222274613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222325223, "dur": 155, "args": { "External id": 61100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61100, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61100, "pid": 5, "tid": 7, "ts": 1716454222325223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274617, "dur": 12, "args": { "External id": 61100, "cbid": 211, "correlation": 61100 } }, { "ph": "s", "id": 61100, "pid": 76337, "tid": -914061504, "ts": 1716454222274617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222325379, "dur": 2148, "args": { "External id": 61121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61121, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 61121, "pid": 5, "tid": 7, "ts": 1716454222325379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274700, "dur": 13, "args": { "External id": 61121, "cbid": 211, "correlation": 61121 } }, { "ph": "s", "id": 61121, "pid": 76337, "tid": -914061504, "ts": 1716454222274700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222274800, "dur": 1, "args": { "External id": 61139, "cbid": 251, "correlation": 61139 } }, { "ph": "f", "id": 61139, "pid": 76337, "tid": -914061504, "ts": 1716454222274800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222327529, "dur": 158, "args": { "External id": 61141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61141, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 61141, "pid": 5, "tid": 7, "ts": 1716454222327529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274806, "dur": 13, "args": { "External id": 61141, "cbid": 211, "correlation": 61141 } }, { "ph": "s", "id": 61141, "pid": 76337, "tid": -914061504, "ts": 1716454222274806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222327688, "dur": 36, "args": { "External id": 61149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61149, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61149, "pid": 5, "tid": 7, "ts": 1716454222327688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274875, "dur": 12, "args": { "External id": 61149, "cbid": 211, "correlation": 61149 } }, { "ph": "s", "id": 61149, "pid": 76337, "tid": -914061504, "ts": 1716454222274875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222327725, "dur": 51, "args": { "External id": 61157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61157, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61157, "pid": 5, "tid": 7, "ts": 1716454222327725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274915, "dur": 11, "args": { "External id": 61157, "cbid": 211, "correlation": 61157 } }, { "ph": "s", "id": 61157, "pid": 76337, "tid": -914061504, "ts": 1716454222274915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222327778, "dur": 33, "args": { "External id": 61168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61168, "pid": 5, "tid": 7, "ts": 1716454222327778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222274999, "dur": 14, "args": { "External id": 61168, "cbid": 211, "correlation": 61168 } }, { "ph": "s", "id": 61168, "pid": 76337, "tid": -914061504, "ts": 1716454222274999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222327812, "dur": 37, "args": { "External id": 61190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61190, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61190, "pid": 5, "tid": 7, "ts": 1716454222327812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275032, "dur": 7, "args": { "External id": 61190, "cbid": 211, "correlation": 61190 } }, { "ph": "s", "id": 61190, "pid": 76337, "tid": -914061504, "ts": 1716454222275032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275117, "dur": 1, "args": { "External id": 61201, "cbid": 251, "correlation": 61201 } }, { "ph": "f", "id": 61201, "pid": 76337, "tid": -914061504, "ts": 1716454222275117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222327851, "dur": 98, "args": { "External id": 61202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61202, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61202, "pid": 5, "tid": 7, "ts": 1716454222327851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275122, "dur": 14, "args": { "External id": 61202, "cbid": 211, "correlation": 61202 } }, { "ph": "s", "id": 61202, "pid": 76337, "tid": -914061504, "ts": 1716454222275122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275193, "dur": 1, "args": { "External id": 61213, "cbid": 251, "correlation": 61213 } }, { "ph": "f", "id": 61213, "pid": 76337, "tid": -914061504, "ts": 1716454222275193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275197, "dur": 0, "args": { "External id": 61214, "cbid": 251, "correlation": 61214 } }, { "ph": "f", "id": 61214, "pid": 76337, "tid": -914061504, "ts": 1716454222275197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222327950, "dur": 12, "args": { "External id": 61215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61215, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 61215, "pid": 5, "tid": 7, "ts": 1716454222327950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275198, "dur": 12, "args": { "External id": 61215, "cbid": 211, "correlation": 61215 } }, { "ph": "s", "id": 61215, "pid": 76337, "tid": -914061504, "ts": 1716454222275198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222327964, "dur": 5, "args": { "External id": 61217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61217, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 61217, "pid": 5, "tid": 7, "ts": 1716454222327964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275214, "dur": 11, "args": { "External id": 61217, "cbid": 211, "correlation": 61217 } }, { "ph": "s", "id": 61217, "pid": 76337, "tid": -914061504, "ts": 1716454222275214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275279, "dur": 1, "args": { "External id": 61228, "cbid": 251, "correlation": 61228 } }, { "ph": "f", "id": 61228, "pid": 76337, "tid": -914061504, "ts": 1716454222275279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275282, "dur": 0, "args": { "External id": 61229, "cbid": 251, "correlation": 61229 } }, { "ph": "f", "id": 61229, "pid": 76337, "tid": -914061504, "ts": 1716454222275282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222327970, "dur": 8, "args": { "External id": 61230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61230, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 61230, "pid": 5, "tid": 7, "ts": 1716454222327970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275284, "dur": 12, "args": { "External id": 61230, "cbid": 211, "correlation": 61230 } }, { "ph": "s", "id": 61230, "pid": 76337, "tid": -914061504, "ts": 1716454222275284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222327979, "dur": 3, "args": { "External id": 61232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61232, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 61232, "pid": 5, "tid": 7, "ts": 1716454222327979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275297, "dur": 8, "args": { "External id": 61232, "cbid": 211, "correlation": 61232 } }, { "ph": "s", "id": 61232, "pid": 76337, "tid": -914061504, "ts": 1716454222275297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222327984, "dur": 100, "args": { "External id": 61253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61253, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 61253, "pid": 5, "tid": 7, "ts": 1716454222327984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275374, "dur": 12, "args": { "External id": 61253, "cbid": 211, "correlation": 61253 } }, { "ph": "s", "id": 61253, "pid": 76337, "tid": -914061504, "ts": 1716454222275374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275469, "dur": 1, "args": { "External id": 61271, "cbid": 251, "correlation": 61271 } }, { "ph": "f", "id": 61271, "pid": 76337, "tid": -914061504, "ts": 1716454222275469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222328085, "dur": 109, "args": { "External id": 61273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61273, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61273, "pid": 5, "tid": 7, "ts": 1716454222328085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275474, "dur": 13, "args": { "External id": 61273, "cbid": 211, "correlation": 61273 } }, { "ph": "s", "id": 61273, "pid": 76337, "tid": -914061504, "ts": 1716454222275474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222328195, "dur": 19, "args": { "External id": 61281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61281, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61281, "pid": 5, "tid": 7, "ts": 1716454222328195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275543, "dur": 12, "args": { "External id": 61281, "cbid": 211, "correlation": 61281 } }, { "ph": "s", "id": 61281, "pid": 76337, "tid": -914061504, "ts": 1716454222275543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222328216, "dur": 39, "args": { "External id": 61289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61289, "pid": 5, "tid": 7, "ts": 1716454222328216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275585, "dur": 9, "args": { "External id": 61289, "cbid": 211, "correlation": 61289 } }, { "ph": "s", "id": 61289, "pid": 76337, "tid": -914061504, "ts": 1716454222275585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222328256, "dur": 38, "args": { "External id": 61311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61311, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61311, "pid": 5, "tid": 7, "ts": 1716454222328256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275636, "dur": 13, "args": { "External id": 61311, "cbid": 211, "correlation": 61311 } }, { "ph": "s", "id": 61311, "pid": 76337, "tid": -914061504, "ts": 1716454222275636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275729, "dur": 1, "args": { "External id": 61327, "cbid": 251, "correlation": 61327 } }, { "ph": "f", "id": 61327, "pid": 76337, "tid": -914061504, "ts": 1716454222275729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275734, "dur": 0, "args": { "External id": 61329, "cbid": 251, "correlation": 61329 } }, { "ph": "f", "id": 61329, "pid": 76337, "tid": -914061504, "ts": 1716454222275734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222328296, "dur": 589, "args": { "External id": 61330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61330, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 61330, "pid": 5, "tid": 7, "ts": 1716454222328296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275738, "dur": 13, "args": { "External id": 61330, "cbid": 211, "correlation": 61330 } }, { "ph": "s", "id": 61330, "pid": 76337, "tid": -914061504, "ts": 1716454222275738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222328886, "dur": 134, "args": { "External id": 61338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61338, "pid": 5, "tid": 7, "ts": 1716454222328886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275804, "dur": 12, "args": { "External id": 61338, "cbid": 211, "correlation": 61338 } }, { "ph": "s", "id": 61338, "pid": 76337, "tid": -914061504, "ts": 1716454222275804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222329022, "dur": 131, "args": { "External id": 61346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61346, "pid": 5, "tid": 7, "ts": 1716454222329022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275835, "dur": 8, "args": { "External id": 61346, "cbid": 211, "correlation": 61346 } }, { "ph": "s", "id": 61346, "pid": 76337, "tid": -914061504, "ts": 1716454222275835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222275913, "dur": 1, "args": { "External id": 61362, "cbid": 251, "correlation": 61362 } }, { "ph": "f", "id": 61362, "pid": 76337, "tid": -914061504, "ts": 1716454222275913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222329154, "dur": 331, "args": { "External id": 61364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61364, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61364, "pid": 5, "tid": 7, "ts": 1716454222329154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275918, "dur": 12, "args": { "External id": 61364, "cbid": 211, "correlation": 61364 } }, { "ph": "s", "id": 61364, "pid": 76337, "tid": -914061504, "ts": 1716454222275918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222329486, "dur": 28, "args": { "External id": 61372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61372, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61372, "pid": 5, "tid": 7, "ts": 1716454222329486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222275961, "dur": 10, "args": { "External id": 61372, "cbid": 211, "correlation": 61372 } }, { "ph": "s", "id": 61372, "pid": 76337, "tid": -914061504, "ts": 1716454222275961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222329516, "dur": 89, "args": { "External id": 61383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61383, "pid": 5, "tid": 7, "ts": 1716454222329516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276041, "dur": 14, "args": { "External id": 61383, "cbid": 211, "correlation": 61383 } }, { "ph": "s", "id": 61383, "pid": 76337, "tid": -914061504, "ts": 1716454222276041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222276109, "dur": 0, "args": { "External id": 61395, "cbid": 317, "correlation": 61395 } }, { "ph": "f", "id": 61395, "pid": 76337, "tid": -914061504, "ts": 1716454222276109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222276110, "dur": 0, "args": { "External id": 61396, "cbid": 203, "correlation": 61396 } }, { "ph": "f", "id": 61396, "pid": 76337, "tid": -914061504, "ts": 1716454222276110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222276111, "dur": 0, "args": { "External id": 61397, "cbid": 205, "correlation": 61397 } }, { "ph": "f", "id": 61397, "pid": 76337, "tid": -914061504, "ts": 1716454222276111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222329606, "dur": 23, "args": { "External id": 61401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61401, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61401, "pid": 5, "tid": 7, "ts": 1716454222329606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276126, "dur": 12, "args": { "External id": 61401, "cbid": 211, "correlation": 61401 } }, { "ph": "s", "id": 61401, "pid": 76337, "tid": -914061504, "ts": 1716454222276126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222329631, "dur": 131, "args": { "External id": 61403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61403, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61403, "pid": 5, "tid": 7, "ts": 1716454222329631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276145, "dur": 7, "args": { "External id": 61403, "cbid": 211, "correlation": 61403 } }, { "ph": "s", "id": 61403, "pid": 76337, "tid": -914061504, "ts": 1716454222276145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222329764, "dur": 22, "args": { "External id": 61405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61405, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61405, "pid": 5, "tid": 7, "ts": 1716454222329764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276156, "dur": 5, "args": { "External id": 61405, "cbid": 211, "correlation": 61405 } }, { "ph": "s", "id": 61405, "pid": 76337, "tid": -914061504, "ts": 1716454222276156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222329788, "dur": 35, "args": { "External id": 61411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61411, "pid": 5, "tid": 7, "ts": 1716454222329788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276184, "dur": 8, "args": { "External id": 61411, "cbid": 211, "correlation": 61411 } }, { "ph": "s", "id": 61411, "pid": 76337, "tid": -914061504, "ts": 1716454222276184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222329824, "dur": 27, "args": { "External id": 61419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61419, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61419, "pid": 5, "tid": 7, "ts": 1716454222329824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276216, "dur": 8, "args": { "External id": 61419, "cbid": 211, "correlation": 61419 } }, { "ph": "s", "id": 61419, "pid": 76337, "tid": -914061504, "ts": 1716454222276216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222329853, "dur": 32, "args": { "External id": 61439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61439, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 61439, "pid": 5, "tid": 7, "ts": 1716454222329853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276289, "dur": 12, "args": { "External id": 61439, "cbid": 211, "correlation": 61439 } }, { "ph": "s", "id": 61439, "pid": 76337, "tid": -914061504, "ts": 1716454222276289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222329887, "dur": 5, "args": { "External id": 61451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61451, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 61451, "pid": 5, "tid": 7, "ts": 1716454222329887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276312, "dur": 6, "args": { "External id": 61451, "cbid": 211, "correlation": 61451 } }, { "ph": "s", "id": 61451, "pid": 76337, "tid": -914061504, "ts": 1716454222276312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222329893, "dur": 33, "args": { "External id": 61454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61454, "pid": 5, "tid": 7, "ts": 1716454222329893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276329, "dur": 10, "args": { "External id": 61454, "cbid": 211, "correlation": 61454 } }, { "ph": "s", "id": 61454, "pid": 76337, "tid": -914061504, "ts": 1716454222276329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222329927, "dur": 22, "args": { "External id": 61463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61463, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61463, "pid": 5, "tid": 7, "ts": 1716454222329927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276373, "dur": 10, "args": { "External id": 61463, "cbid": 211, "correlation": 61463 } }, { "ph": "s", "id": 61463, "pid": 76337, "tid": -914061504, "ts": 1716454222276373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222276425, "dur": 0, "args": { "External id": 61473, "cbid": 317, "correlation": 61473 } }, { "ph": "f", "id": 61473, "pid": 76337, "tid": -914061504, "ts": 1716454222276425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222276426, "dur": 0, "args": { "External id": 61474, "cbid": 203, "correlation": 61474 } }, { "ph": "f", "id": 61474, "pid": 76337, "tid": -914061504, "ts": 1716454222276426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222276426, "dur": 0, "args": { "External id": 61475, "cbid": 205, "correlation": 61475 } }, { "ph": "f", "id": 61475, "pid": 76337, "tid": -914061504, "ts": 1716454222276426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222329951, "dur": 24, "args": { "External id": 61479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61479, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61479, "pid": 5, "tid": 7, "ts": 1716454222329951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276440, "dur": 12, "args": { "External id": 61479, "cbid": 211, "correlation": 61479 } }, { "ph": "s", "id": 61479, "pid": 76337, "tid": -914061504, "ts": 1716454222276440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222329976, "dur": 48, "args": { "External id": 61481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61481, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61481, "pid": 5, "tid": 7, "ts": 1716454222329976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276454, "dur": 5, "args": { "External id": 61481, "cbid": 211, "correlation": 61481 } }, { "ph": "s", "id": 61481, "pid": 76337, "tid": -914061504, "ts": 1716454222276454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222330026, "dur": 717, "args": { "External id": 61483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61483, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61483, "pid": 5, "tid": 7, "ts": 1716454222330026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276466, "dur": 6, "args": { "External id": 61483, "cbid": 211, "correlation": 61483 } }, { "ph": "s", "id": 61483, "pid": 76337, "tid": -914061504, "ts": 1716454222276466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222330744, "dur": 23, "args": { "External id": 61485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61485, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61485, "pid": 5, "tid": 7, "ts": 1716454222330744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276475, "dur": 5, "args": { "External id": 61485, "cbid": 211, "correlation": 61485 } }, { "ph": "s", "id": 61485, "pid": 76337, "tid": -914061504, "ts": 1716454222276475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222330768, "dur": 35, "args": { "External id": 61491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61491, "pid": 5, "tid": 7, "ts": 1716454222330768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276502, "dur": 9, "args": { "External id": 61491, "cbid": 211, "correlation": 61491 } }, { "ph": "s", "id": 61491, "pid": 76337, "tid": -914061504, "ts": 1716454222276502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222330805, "dur": 4, "args": { "External id": 61499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61499, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 61499, "pid": 5, "tid": 7, "ts": 1716454222330805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276546, "dur": 10, "args": { "External id": 61499, "cbid": 211, "correlation": 61499 } }, { "ph": "s", "id": 61499, "pid": 76337, "tid": -914061504, "ts": 1716454222276546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222276612, "dur": 1, "args": { "External id": 61515, "cbid": 251, "correlation": 61515 } }, { "ph": "f", "id": 61515, "pid": 76337, "tid": -914061504, "ts": 1716454222276612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222276617, "dur": 0, "args": { "External id": 61517, "cbid": 251, "correlation": 61517 } }, { "ph": "f", "id": 61517, "pid": 76337, "tid": -914061504, "ts": 1716454222276617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222330810, "dur": 13, "args": { "External id": 61518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61518, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 61518, "pid": 5, "tid": 7, "ts": 1716454222330810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276619, "dur": 12, "args": { "External id": 61518, "cbid": 211, "correlation": 61518 } }, { "ph": "s", "id": 61518, "pid": 76337, "tid": -914061504, "ts": 1716454222276619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222330824, "dur": 6, "args": { "External id": 61520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61520, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 61520, "pid": 5, "tid": 7, "ts": 1716454222330824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276636, "dur": 5, "args": { "External id": 61520, "cbid": 211, "correlation": 61520 } }, { "ph": "s", "id": 61520, "pid": 76337, "tid": -914061504, "ts": 1716454222276636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222330831, "dur": 31, "args": { "External id": 61530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61530, "pid": 5, "tid": 7, "ts": 1716454222330831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276693, "dur": 12, "args": { "External id": 61530, "cbid": 211, "correlation": 61530 } }, { "ph": "s", "id": 61530, "pid": 76337, "tid": -914061504, "ts": 1716454222276693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222330863, "dur": 32, "args": { "External id": 61550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61550, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 61550, "pid": 5, "tid": 7, "ts": 1716454222330863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276759, "dur": 10, "args": { "External id": 61550, "cbid": 211, "correlation": 61550 } }, { "ph": "s", "id": 61550, "pid": 76337, "tid": -914061504, "ts": 1716454222276759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222330897, "dur": 4, "args": { "External id": 61562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61562, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 61562, "pid": 5, "tid": 7, "ts": 1716454222330897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276779, "dur": 7, "args": { "External id": 61562, "cbid": 211, "correlation": 61562 } }, { "ph": "s", "id": 61562, "pid": 76337, "tid": -914061504, "ts": 1716454222276779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222330902, "dur": 31, "args": { "External id": 61565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61565, "pid": 5, "tid": 7, "ts": 1716454222330902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276798, "dur": 7, "args": { "External id": 61565, "cbid": 211, "correlation": 61565 } }, { "ph": "s", "id": 61565, "pid": 76337, "tid": -914061504, "ts": 1716454222276798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222330935, "dur": 22, "args": { "External id": 61574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61574, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61574, "pid": 5, "tid": 7, "ts": 1716454222330935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276839, "dur": 10, "args": { "External id": 61574, "cbid": 211, "correlation": 61574 } }, { "ph": "s", "id": 61574, "pid": 76337, "tid": -914061504, "ts": 1716454222276839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222276901, "dur": 0, "args": { "External id": 61584, "cbid": 317, "correlation": 61584 } }, { "ph": "f", "id": 61584, "pid": 76337, "tid": -914061504, "ts": 1716454222276901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222276902, "dur": 0, "args": { "External id": 61585, "cbid": 203, "correlation": 61585 } }, { "ph": "f", "id": 61585, "pid": 76337, "tid": -914061504, "ts": 1716454222276902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222276903, "dur": 0, "args": { "External id": 61586, "cbid": 205, "correlation": 61586 } }, { "ph": "f", "id": 61586, "pid": 76337, "tid": -914061504, "ts": 1716454222276903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222330958, "dur": 23, "args": { "External id": 61590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61590, "pid": 5, "tid": 7, "ts": 1716454222330958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276916, "dur": 12, "args": { "External id": 61590, "cbid": 211, "correlation": 61590 } }, { "ph": "s", "id": 61590, "pid": 76337, "tid": -914061504, "ts": 1716454222276916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222330982, "dur": 48, "args": { "External id": 61592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61592, "pid": 5, "tid": 7, "ts": 1716454222330982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276934, "dur": 6, "args": { "External id": 61592, "cbid": 211, "correlation": 61592 } }, { "ph": "s", "id": 61592, "pid": 76337, "tid": -914061504, "ts": 1716454222276934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222331032, "dur": 709, "args": { "External id": 61594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61594, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61594, "pid": 5, "tid": 7, "ts": 1716454222331032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276946, "dur": 6, "args": { "External id": 61594, "cbid": 211, "correlation": 61594 } }, { "ph": "s", "id": 61594, "pid": 76337, "tid": -914061504, "ts": 1716454222276946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222331743, "dur": 22, "args": { "External id": 61596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61596, "pid": 5, "tid": 7, "ts": 1716454222331743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276955, "dur": 5, "args": { "External id": 61596, "cbid": 211, "correlation": 61596 } }, { "ph": "s", "id": 61596, "pid": 76337, "tid": -914061504, "ts": 1716454222276955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222331766, "dur": 36, "args": { "External id": 61602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61602, "pid": 5, "tid": 7, "ts": 1716454222331766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222276992, "dur": 10, "args": { "External id": 61602, "cbid": 211, "correlation": 61602 } }, { "ph": "s", "id": 61602, "pid": 76337, "tid": -914061504, "ts": 1716454222276992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222331803, "dur": 27, "args": { "External id": 61610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61610, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61610, "pid": 5, "tid": 7, "ts": 1716454222331803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277026, "dur": 8, "args": { "External id": 61610, "cbid": 211, "correlation": 61610 } }, { "ph": "s", "id": 61610, "pid": 76337, "tid": -914061504, "ts": 1716454222277026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222331831, "dur": 20, "args": { "External id": 61618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61618, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61618, "pid": 5, "tid": 7, "ts": 1716454222331831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277055, "dur": 8, "args": { "External id": 61618, "cbid": 211, "correlation": 61618 } }, { "ph": "s", "id": 61618, "pid": 76337, "tid": -914061504, "ts": 1716454222277055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222331853, "dur": 33, "args": { "External id": 61638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61638, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 61638, "pid": 5, "tid": 7, "ts": 1716454222331853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277134, "dur": 13, "args": { "External id": 61638, "cbid": 211, "correlation": 61638 } }, { "ph": "s", "id": 61638, "pid": 76337, "tid": -914061504, "ts": 1716454222277134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222331887, "dur": 4, "args": { "External id": 61650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61650, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 61650, "pid": 5, "tid": 7, "ts": 1716454222331887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277157, "dur": 6, "args": { "External id": 61650, "cbid": 211, "correlation": 61650 } }, { "ph": "s", "id": 61650, "pid": 76337, "tid": -914061504, "ts": 1716454222277157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222331893, "dur": 31, "args": { "External id": 61653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61653, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61653, "pid": 5, "tid": 7, "ts": 1716454222331893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277174, "dur": 6, "args": { "External id": 61653, "cbid": 211, "correlation": 61653 } }, { "ph": "s", "id": 61653, "pid": 76337, "tid": -914061504, "ts": 1716454222277174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222277236, "dur": 0, "args": { "External id": 61664, "cbid": 317, "correlation": 61664 } }, { "ph": "f", "id": 61664, "pid": 76337, "tid": -914061504, "ts": 1716454222277236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222277236, "dur": 0, "args": { "External id": 61665, "cbid": 203, "correlation": 61665 } }, { "ph": "f", "id": 61665, "pid": 76337, "tid": -914061504, "ts": 1716454222277236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222277237, "dur": 0, "args": { "External id": 61666, "cbid": 205, "correlation": 61666 } }, { "ph": "f", "id": 61666, "pid": 76337, "tid": -914061504, "ts": 1716454222277237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222331925, "dur": 23, "args": { "External id": 61670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61670, "pid": 5, "tid": 7, "ts": 1716454222331925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277250, "dur": 12, "args": { "External id": 61670, "cbid": 211, "correlation": 61670 } }, { "ph": "s", "id": 61670, "pid": 76337, "tid": -914061504, "ts": 1716454222277250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222331949, "dur": 127, "args": { "External id": 61672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61672, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61672, "pid": 5, "tid": 7, "ts": 1716454222331949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277268, "dur": 6, "args": { "External id": 61672, "cbid": 211, "correlation": 61672 } }, { "ph": "s", "id": 61672, "pid": 76337, "tid": -914061504, "ts": 1716454222277268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222332078, "dur": 22, "args": { "External id": 61674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61674, "pid": 5, "tid": 7, "ts": 1716454222332078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277278, "dur": 5, "args": { "External id": 61674, "cbid": 211, "correlation": 61674 } }, { "ph": "s", "id": 61674, "pid": 76337, "tid": -914061504, "ts": 1716454222277278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222332101, "dur": 35, "args": { "External id": 61680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61680, "pid": 5, "tid": 7, "ts": 1716454222332101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277306, "dur": 8, "args": { "External id": 61680, "cbid": 211, "correlation": 61680 } }, { "ph": "s", "id": 61680, "pid": 76337, "tid": -914061504, "ts": 1716454222277306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222332137, "dur": 200, "args": { "External id": 61689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61689, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61689, "pid": 5, "tid": 7, "ts": 1716454222332137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277387, "dur": 14, "args": { "External id": 61689, "cbid": 211, "correlation": 61689 } }, { "ph": "s", "id": 61689, "pid": 76337, "tid": -914061504, "ts": 1716454222277387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222332338, "dur": 71, "args": { "External id": 61711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61711, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61711, "pid": 5, "tid": 7, "ts": 1716454222332338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277446, "dur": 11, "args": { "External id": 61711, "cbid": 211, "correlation": 61711 } }, { "ph": "s", "id": 61711, "pid": 76337, "tid": -914061504, "ts": 1716454222277446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222277534, "dur": 1, "args": { "External id": 61722, "cbid": 251, "correlation": 61722 } }, { "ph": "f", "id": 61722, "pid": 76337, "tid": -914061504, "ts": 1716454222277534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222332410, "dur": 166, "args": { "External id": 61723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61723, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61723, "pid": 5, "tid": 7, "ts": 1716454222332410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277540, "dur": 13, "args": { "External id": 61723, "cbid": 211, "correlation": 61723 } }, { "ph": "s", "id": 61723, "pid": 76337, "tid": -914061504, "ts": 1716454222277540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222277613, "dur": 1, "args": { "External id": 61734, "cbid": 251, "correlation": 61734 } }, { "ph": "f", "id": 61734, "pid": 76337, "tid": -914061504, "ts": 1716454222277613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222332577, "dur": 158, "args": { "External id": 61735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61735, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61735, "pid": 5, "tid": 7, "ts": 1716454222332577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277618, "dur": 12, "args": { "External id": 61735, "cbid": 211, "correlation": 61735 } }, { "ph": "s", "id": 61735, "pid": 76337, "tid": -914061504, "ts": 1716454222277618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222277682, "dur": 1, "args": { "External id": 61746, "cbid": 251, "correlation": 61746 } }, { "ph": "f", "id": 61746, "pid": 76337, "tid": -914061504, "ts": 1716454222277682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222332736, "dur": 156, "args": { "External id": 61747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61747, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61747, "pid": 5, "tid": 7, "ts": 1716454222332736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277686, "dur": 10, "args": { "External id": 61747, "cbid": 211, "correlation": 61747 } }, { "ph": "s", "id": 61747, "pid": 76337, "tid": -914061504, "ts": 1716454222277686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222332894, "dur": 2145, "args": { "External id": 61768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61768, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 61768, "pid": 5, "tid": 7, "ts": 1716454222332894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277765, "dur": 13, "args": { "External id": 61768, "cbid": 211, "correlation": 61768 } }, { "ph": "s", "id": 61768, "pid": 76337, "tid": -914061504, "ts": 1716454222277765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222277863, "dur": 1, "args": { "External id": 61786, "cbid": 251, "correlation": 61786 } }, { "ph": "f", "id": 61786, "pid": 76337, "tid": -914061504, "ts": 1716454222277863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222335040, "dur": 159, "args": { "External id": 61788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61788, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 61788, "pid": 5, "tid": 7, "ts": 1716454222335040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277868, "dur": 14, "args": { "External id": 61788, "cbid": 211, "correlation": 61788 } }, { "ph": "s", "id": 61788, "pid": 76337, "tid": -914061504, "ts": 1716454222277868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222335201, "dur": 36, "args": { "External id": 61796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61796, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61796, "pid": 5, "tid": 7, "ts": 1716454222335201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277939, "dur": 12, "args": { "External id": 61796, "cbid": 211, "correlation": 61796 } }, { "ph": "s", "id": 61796, "pid": 76337, "tid": -914061504, "ts": 1716454222277939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222335238, "dur": 51, "args": { "External id": 61804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61804, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61804, "pid": 5, "tid": 7, "ts": 1716454222335238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222277987, "dur": 12, "args": { "External id": 61804, "cbid": 211, "correlation": 61804 } }, { "ph": "s", "id": 61804, "pid": 76337, "tid": -914061504, "ts": 1716454222277987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222335290, "dur": 32, "args": { "External id": 61815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61815, "pid": 5, "tid": 7, "ts": 1716454222335290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278064, "dur": 13, "args": { "External id": 61815, "cbid": 211, "correlation": 61815 } }, { "ph": "s", "id": 61815, "pid": 76337, "tid": -914061504, "ts": 1716454222278064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222335324, "dur": 37, "args": { "External id": 61837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61837, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61837, "pid": 5, "tid": 7, "ts": 1716454222335324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278096, "dur": 8, "args": { "External id": 61837, "cbid": 211, "correlation": 61837 } }, { "ph": "s", "id": 61837, "pid": 76337, "tid": -914061504, "ts": 1716454222278096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278180, "dur": 1, "args": { "External id": 61848, "cbid": 251, "correlation": 61848 } }, { "ph": "f", "id": 61848, "pid": 76337, "tid": -914061504, "ts": 1716454222278180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222335363, "dur": 98, "args": { "External id": 61849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61849, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61849, "pid": 5, "tid": 7, "ts": 1716454222335363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278186, "dur": 13, "args": { "External id": 61849, "cbid": 211, "correlation": 61849 } }, { "ph": "s", "id": 61849, "pid": 76337, "tid": -914061504, "ts": 1716454222278186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278254, "dur": 1, "args": { "External id": 61860, "cbid": 251, "correlation": 61860 } }, { "ph": "f", "id": 61860, "pid": 76337, "tid": -914061504, "ts": 1716454222278254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278257, "dur": 0, "args": { "External id": 61861, "cbid": 251, "correlation": 61861 } }, { "ph": "f", "id": 61861, "pid": 76337, "tid": -914061504, "ts": 1716454222278257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222335462, "dur": 12, "args": { "External id": 61862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61862, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 61862, "pid": 5, "tid": 7, "ts": 1716454222335462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278259, "dur": 12, "args": { "External id": 61862, "cbid": 211, "correlation": 61862 } }, { "ph": "s", "id": 61862, "pid": 76337, "tid": -914061504, "ts": 1716454222278259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222335476, "dur": 5, "args": { "External id": 61864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61864, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 61864, "pid": 5, "tid": 7, "ts": 1716454222335476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278273, "dur": 6, "args": { "External id": 61864, "cbid": 211, "correlation": 61864 } }, { "ph": "s", "id": 61864, "pid": 76337, "tid": -914061504, "ts": 1716454222278273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278329, "dur": 1, "args": { "External id": 61875, "cbid": 251, "correlation": 61875 } }, { "ph": "f", "id": 61875, "pid": 76337, "tid": -914061504, "ts": 1716454222278329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278332, "dur": 0, "args": { "External id": 61876, "cbid": 251, "correlation": 61876 } }, { "ph": "f", "id": 61876, "pid": 76337, "tid": -914061504, "ts": 1716454222278332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222335483, "dur": 7, "args": { "External id": 61877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61877, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 61877, "pid": 5, "tid": 7, "ts": 1716454222335483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278334, "dur": 15, "args": { "External id": 61877, "cbid": 211, "correlation": 61877 } }, { "ph": "s", "id": 61877, "pid": 76337, "tid": -914061504, "ts": 1716454222278334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222335492, "dur": 3, "args": { "External id": 61879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61879, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 61879, "pid": 5, "tid": 7, "ts": 1716454222335492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278350, "dur": 6, "args": { "External id": 61879, "cbid": 211, "correlation": 61879 } }, { "ph": "s", "id": 61879, "pid": 76337, "tid": -914061504, "ts": 1716454222278350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222335497, "dur": 99, "args": { "External id": 61900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61900, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 61900, "pid": 5, "tid": 7, "ts": 1716454222335497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278424, "dur": 12, "args": { "External id": 61900, "cbid": 211, "correlation": 61900 } }, { "ph": "s", "id": 61900, "pid": 76337, "tid": -914061504, "ts": 1716454222278424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278521, "dur": 1, "args": { "External id": 61918, "cbid": 251, "correlation": 61918 } }, { "ph": "f", "id": 61918, "pid": 76337, "tid": -914061504, "ts": 1716454222278521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222335598, "dur": 104, "args": { "External id": 61920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61920, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 61920, "pid": 5, "tid": 7, "ts": 1716454222335598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278527, "dur": 13, "args": { "External id": 61920, "cbid": 211, "correlation": 61920 } }, { "ph": "s", "id": 61920, "pid": 76337, "tid": -914061504, "ts": 1716454222278527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222335703, "dur": 19, "args": { "External id": 61928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61928, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61928, "pid": 5, "tid": 7, "ts": 1716454222335703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278595, "dur": 12, "args": { "External id": 61928, "cbid": 211, "correlation": 61928 } }, { "ph": "s", "id": 61928, "pid": 76337, "tid": -914061504, "ts": 1716454222278595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222335724, "dur": 39, "args": { "External id": 61936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61936, "pid": 5, "tid": 7, "ts": 1716454222335724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278637, "dur": 9, "args": { "External id": 61936, "cbid": 211, "correlation": 61936 } }, { "ph": "s", "id": 61936, "pid": 76337, "tid": -914061504, "ts": 1716454222278637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222335765, "dur": 37, "args": { "External id": 61958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61958, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61958, "pid": 5, "tid": 7, "ts": 1716454222335765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278687, "dur": 13, "args": { "External id": 61958, "cbid": 211, "correlation": 61958 } }, { "ph": "s", "id": 61958, "pid": 76337, "tid": -914061504, "ts": 1716454222278687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278780, "dur": 1, "args": { "External id": 61974, "cbid": 251, "correlation": 61974 } }, { "ph": "f", "id": 61974, "pid": 76337, "tid": -914061504, "ts": 1716454222278780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278785, "dur": 0, "args": { "External id": 61976, "cbid": 251, "correlation": 61976 } }, { "ph": "f", "id": 61976, "pid": 76337, "tid": -914061504, "ts": 1716454222278785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222335803, "dur": 587, "args": { "External id": 61977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61977, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 61977, "pid": 5, "tid": 7, "ts": 1716454222335803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278789, "dur": 13, "args": { "External id": 61977, "cbid": 211, "correlation": 61977 } }, { "ph": "s", "id": 61977, "pid": 76337, "tid": -914061504, "ts": 1716454222278789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222336391, "dur": 133, "args": { "External id": 61985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61985, "pid": 5, "tid": 7, "ts": 1716454222336391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278854, "dur": 13, "args": { "External id": 61985, "cbid": 211, "correlation": 61985 } }, { "ph": "s", "id": 61985, "pid": 76337, "tid": -914061504, "ts": 1716454222278854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222336526, "dur": 134, "args": { "External id": 61993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 61993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 61993, "pid": 5, "tid": 7, "ts": 1716454222336526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278885, "dur": 8, "args": { "External id": 61993, "cbid": 211, "correlation": 61993 } }, { "ph": "s", "id": 61993, "pid": 76337, "tid": -914061504, "ts": 1716454222278885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222278962, "dur": 1, "args": { "External id": 62009, "cbid": 251, "correlation": 62009 } }, { "ph": "f", "id": 62009, "pid": 76337, "tid": -914061504, "ts": 1716454222278962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222336662, "dur": 332, "args": { "External id": 62011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62011, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62011, "pid": 5, "tid": 7, "ts": 1716454222336662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222278967, "dur": 21, "args": { "External id": 62011, "cbid": 211, "correlation": 62011 } }, { "ph": "s", "id": 62011, "pid": 76337, "tid": -914061504, "ts": 1716454222278967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222336995, "dur": 28, "args": { "External id": 62019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62019, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62019, "pid": 5, "tid": 7, "ts": 1716454222336995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279019, "dur": 10, "args": { "External id": 62019, "cbid": 211, "correlation": 62019 } }, { "ph": "s", "id": 62019, "pid": 76337, "tid": -914061504, "ts": 1716454222279019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222337024, "dur": 89, "args": { "External id": 62030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62030, "pid": 5, "tid": 7, "ts": 1716454222337024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279087, "dur": 16, "args": { "External id": 62030, "cbid": 211, "correlation": 62030 } }, { "ph": "s", "id": 62030, "pid": 76337, "tid": -914061504, "ts": 1716454222279087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222279155, "dur": 0, "args": { "External id": 62042, "cbid": 317, "correlation": 62042 } }, { "ph": "f", "id": 62042, "pid": 76337, "tid": -914061504, "ts": 1716454222279155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222279156, "dur": 0, "args": { "External id": 62043, "cbid": 203, "correlation": 62043 } }, { "ph": "f", "id": 62043, "pid": 76337, "tid": -914061504, "ts": 1716454222279156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222279156, "dur": 0, "args": { "External id": 62044, "cbid": 205, "correlation": 62044 } }, { "ph": "f", "id": 62044, "pid": 76337, "tid": -914061504, "ts": 1716454222279156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337115, "dur": 24, "args": { "External id": 62048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62048, "pid": 5, "tid": 7, "ts": 1716454222337115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279171, "dur": 12, "args": { "External id": 62048, "cbid": 211, "correlation": 62048 } }, { "ph": "s", "id": 62048, "pid": 76337, "tid": -914061504, "ts": 1716454222279171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222337140, "dur": 131, "args": { "External id": 62050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62050, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62050, "pid": 5, "tid": 7, "ts": 1716454222337140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279189, "dur": 6, "args": { "External id": 62050, "cbid": 211, "correlation": 62050 } }, { "ph": "s", "id": 62050, "pid": 76337, "tid": -914061504, "ts": 1716454222279189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337273, "dur": 24, "args": { "External id": 62052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62052, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62052, "pid": 5, "tid": 7, "ts": 1716454222337273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279199, "dur": 6, "args": { "External id": 62052, "cbid": 211, "correlation": 62052 } }, { "ph": "s", "id": 62052, "pid": 76337, "tid": -914061504, "ts": 1716454222279199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222337298, "dur": 34, "args": { "External id": 62058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62058, "pid": 5, "tid": 7, "ts": 1716454222337298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279227, "dur": 8, "args": { "External id": 62058, "cbid": 211, "correlation": 62058 } }, { "ph": "s", "id": 62058, "pid": 76337, "tid": -914061504, "ts": 1716454222279227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222337334, "dur": 27, "args": { "External id": 62066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62066, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62066, "pid": 5, "tid": 7, "ts": 1716454222337334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279260, "dur": 8, "args": { "External id": 62066, "cbid": 211, "correlation": 62066 } }, { "ph": "s", "id": 62066, "pid": 76337, "tid": -914061504, "ts": 1716454222279260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222279337, "dur": 0, "args": { "External id": 62076, "cbid": 317, "correlation": 62076 } }, { "ph": "f", "id": 62076, "pid": 76337, "tid": -914061504, "ts": 1716454222279337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222279337, "dur": 0, "args": { "External id": 62077, "cbid": 203, "correlation": 62077 } }, { "ph": "f", "id": 62077, "pid": 76337, "tid": -914061504, "ts": 1716454222279337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222279338, "dur": 0, "args": { "External id": 62078, "cbid": 205, "correlation": 62078 } }, { "ph": "f", "id": 62078, "pid": 76337, "tid": -914061504, "ts": 1716454222279338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337363, "dur": 25, "args": { "External id": 62082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62082, "pid": 5, "tid": 7, "ts": 1716454222337363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279356, "dur": 13, "args": { "External id": 62082, "cbid": 211, "correlation": 62082 } }, { "ph": "s", "id": 62082, "pid": 76337, "tid": -914061504, "ts": 1716454222279356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337389, "dur": 48, "args": { "External id": 62084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62084, "pid": 5, "tid": 7, "ts": 1716454222337389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279372, "dur": 5, "args": { "External id": 62084, "cbid": 211, "correlation": 62084 } }, { "ph": "s", "id": 62084, "pid": 76337, "tid": -914061504, "ts": 1716454222279372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222337438, "dur": 248, "args": { "External id": 62086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62086, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 62086, "pid": 5, "tid": 7, "ts": 1716454222337438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279387, "dur": 7, "args": { "External id": 62086, "cbid": 211, "correlation": 62086 } }, { "ph": "s", "id": 62086, "pid": 76337, "tid": -914061504, "ts": 1716454222279387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337688, "dur": 7, "args": { "External id": 62088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62088, "pid": 5, "tid": 7, "ts": 1716454222337688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279398, "dur": 5, "args": { "External id": 62088, "cbid": 211, "correlation": 62088 } }, { "ph": "s", "id": 62088, "pid": 76337, "tid": -914061504, "ts": 1716454222279398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222337696, "dur": 10, "args": { "External id": 62094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62094, "pid": 5, "tid": 7, "ts": 1716454222337696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279426, "dur": 8, "args": { "External id": 62094, "cbid": 211, "correlation": 62094 } }, { "ph": "s", "id": 62094, "pid": 76337, "tid": -914061504, "ts": 1716454222279426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222337708, "dur": 13, "args": { "External id": 62114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62114, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62114, "pid": 5, "tid": 7, "ts": 1716454222337708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279517, "dur": 13, "args": { "External id": 62114, "cbid": 211, "correlation": 62114 } }, { "ph": "s", "id": 62114, "pid": 76337, "tid": -914061504, "ts": 1716454222279517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222337722, "dur": 5, "args": { "External id": 62126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62126, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 62126, "pid": 5, "tid": 7, "ts": 1716454222337722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279540, "dur": 6, "args": { "External id": 62126, "cbid": 211, "correlation": 62126 } }, { "ph": "s", "id": 62126, "pid": 76337, "tid": -914061504, "ts": 1716454222279540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222337728, "dur": 14, "args": { "External id": 62129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62129, "pid": 5, "tid": 7, "ts": 1716454222337728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279558, "dur": 6, "args": { "External id": 62129, "cbid": 211, "correlation": 62129 } }, { "ph": "s", "id": 62129, "pid": 76337, "tid": -914061504, "ts": 1716454222279558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222337743, "dur": 7, "args": { "External id": 62138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62138, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62138, "pid": 5, "tid": 7, "ts": 1716454222337743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279597, "dur": 10, "args": { "External id": 62138, "cbid": 211, "correlation": 62138 } }, { "ph": "s", "id": 62138, "pid": 76337, "tid": -914061504, "ts": 1716454222279597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222279649, "dur": 0, "args": { "External id": 62148, "cbid": 317, "correlation": 62148 } }, { "ph": "f", "id": 62148, "pid": 76337, "tid": -914061504, "ts": 1716454222279649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222279650, "dur": 0, "args": { "External id": 62149, "cbid": 203, "correlation": 62149 } }, { "ph": "f", "id": 62149, "pid": 76337, "tid": -914061504, "ts": 1716454222279650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222279651, "dur": 0, "args": { "External id": 62150, "cbid": 205, "correlation": 62150 } }, { "ph": "f", "id": 62150, "pid": 76337, "tid": -914061504, "ts": 1716454222279651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337752, "dur": 6, "args": { "External id": 62154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62154, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62154, "pid": 5, "tid": 7, "ts": 1716454222337752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279679, "dur": 12, "args": { "External id": 62154, "cbid": 211, "correlation": 62154 } }, { "ph": "s", "id": 62154, "pid": 76337, "tid": -914061504, "ts": 1716454222279679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222337760, "dur": 91, "args": { "External id": 62156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62156, "pid": 5, "tid": 7, "ts": 1716454222337760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279694, "dur": 5, "args": { "External id": 62156, "cbid": 211, "correlation": 62156 } }, { "ph": "s", "id": 62156, "pid": 76337, "tid": -914061504, "ts": 1716454222279694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222337853, "dur": 1, "args": { "External id": 62158, "device": 5, "context": 1, "stream": 7, "correlation": 62158, "bytes": 960, "memory bandwidth (GB/s)": 0.5660377358490566 } }, { "ph": "f", "id": 62158, "pid": 5, "tid": 7, "ts": 1716454222337853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222279712, "dur": 9, "args": { "External id": 62158, "cbid": 51, "correlation": 62158 } }, { "ph": "s", "id": 62158, "pid": 76337, "tid": -914061504, "ts": 1716454222279712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222337857, "dur": 571, "args": { "External id": 62159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62159, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62159, "pid": 5, "tid": 7, "ts": 1716454222337857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279723, "dur": 11, "args": { "External id": 62159, "cbid": 211, "correlation": 62159 } }, { "ph": "s", "id": 62159, "pid": 76337, "tid": -914061504, "ts": 1716454222279723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222338430, "dur": 13, "args": { "External id": 62161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62161, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62161, "pid": 5, "tid": 7, "ts": 1716454222338430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279738, "dur": 5, "args": { "External id": 62161, "cbid": 211, "correlation": 62161 } }, { "ph": "s", "id": 62161, "pid": 76337, "tid": -914061504, "ts": 1716454222279738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222338444, "dur": 16, "args": { "External id": 62167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62167, "pid": 5, "tid": 7, "ts": 1716454222338444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279766, "dur": 8, "args": { "External id": 62167, "cbid": 211, "correlation": 62167 } }, { "ph": "s", "id": 62167, "pid": 76337, "tid": -914061504, "ts": 1716454222279766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222338461, "dur": 4, "args": { "External id": 62175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62175, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 62175, "pid": 5, "tid": 7, "ts": 1716454222338461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279810, "dur": 10, "args": { "External id": 62175, "cbid": 211, "correlation": 62175 } }, { "ph": "s", "id": 62175, "pid": 76337, "tid": -914061504, "ts": 1716454222279810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222279875, "dur": 1, "args": { "External id": 62191, "cbid": 251, "correlation": 62191 } }, { "ph": "f", "id": 62191, "pid": 76337, "tid": -914061504, "ts": 1716454222279875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222279880, "dur": 0, "args": { "External id": 62193, "cbid": 251, "correlation": 62193 } }, { "ph": "f", "id": 62193, "pid": 76337, "tid": -914061504, "ts": 1716454222279880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222338466, "dur": 14, "args": { "External id": 62194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62194, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62194, "pid": 5, "tid": 7, "ts": 1716454222338466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279882, "dur": 12, "args": { "External id": 62194, "cbid": 211, "correlation": 62194 } }, { "ph": "s", "id": 62194, "pid": 76337, "tid": -914061504, "ts": 1716454222279882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222338482, "dur": 6, "args": { "External id": 62196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62196, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62196, "pid": 5, "tid": 7, "ts": 1716454222338482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279896, "dur": 5, "args": { "External id": 62196, "cbid": 211, "correlation": 62196 } }, { "ph": "s", "id": 62196, "pid": 76337, "tid": -914061504, "ts": 1716454222279896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222338488, "dur": 17, "args": { "External id": 62206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62206, "pid": 5, "tid": 7, "ts": 1716454222338488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222279953, "dur": 12, "args": { "External id": 62206, "cbid": 211, "correlation": 62206 } }, { "ph": "s", "id": 62206, "pid": 76337, "tid": -914061504, "ts": 1716454222279953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222338507, "dur": 19, "args": { "External id": 62226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62226, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62226, "pid": 5, "tid": 7, "ts": 1716454222338507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280027, "dur": 12, "args": { "External id": 62226, "cbid": 211, "correlation": 62226 } }, { "ph": "s", "id": 62226, "pid": 76337, "tid": -914061504, "ts": 1716454222280027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222338528, "dur": 5, "args": { "External id": 62238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62238, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 62238, "pid": 5, "tid": 7, "ts": 1716454222338528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280053, "dur": 6, "args": { "External id": 62238, "cbid": 211, "correlation": 62238 } }, { "ph": "s", "id": 62238, "pid": 76337, "tid": -914061504, "ts": 1716454222280053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222338534, "dur": 19, "args": { "External id": 62241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62241, "pid": 5, "tid": 7, "ts": 1716454222338534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280071, "dur": 6, "args": { "External id": 62241, "cbid": 211, "correlation": 62241 } }, { "ph": "s", "id": 62241, "pid": 76337, "tid": -914061504, "ts": 1716454222280071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222338554, "dur": 12, "args": { "External id": 62250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62250, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62250, "pid": 5, "tid": 7, "ts": 1716454222338554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280112, "dur": 10, "args": { "External id": 62250, "cbid": 211, "correlation": 62250 } }, { "ph": "s", "id": 62250, "pid": 76337, "tid": -914061504, "ts": 1716454222280112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222280174, "dur": 0, "args": { "External id": 62260, "cbid": 317, "correlation": 62260 } }, { "ph": "f", "id": 62260, "pid": 76337, "tid": -914061504, "ts": 1716454222280174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222280175, "dur": 0, "args": { "External id": 62261, "cbid": 203, "correlation": 62261 } }, { "ph": "f", "id": 62261, "pid": 76337, "tid": -914061504, "ts": 1716454222280175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222280176, "dur": 0, "args": { "External id": 62262, "cbid": 205, "correlation": 62262 } }, { "ph": "f", "id": 62262, "pid": 76337, "tid": -914061504, "ts": 1716454222280176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222338568, "dur": 12, "args": { "External id": 62266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62266, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62266, "pid": 5, "tid": 7, "ts": 1716454222338568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280200, "dur": 12, "args": { "External id": 62266, "cbid": 211, "correlation": 62266 } }, { "ph": "s", "id": 62266, "pid": 76337, "tid": -914061504, "ts": 1716454222280200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222338581, "dur": 178, "args": { "External id": 62268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62268, "pid": 5, "tid": 7, "ts": 1716454222338581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280215, "dur": 5, "args": { "External id": 62268, "cbid": 211, "correlation": 62268 } }, { "ph": "s", "id": 62268, "pid": 76337, "tid": -914061504, "ts": 1716454222280215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222338761, "dur": 1, "args": { "External id": 62270, "device": 5, "context": 1, "stream": 7, "correlation": 62270, "bytes": 960, "memory bandwidth (GB/s)": 0.5555555555555556 } }, { "ph": "f", "id": 62270, "pid": 5, "tid": 7, "ts": 1716454222338761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222280229, "dur": 7, "args": { "External id": 62270, "cbid": 51, "correlation": 62270 } }, { "ph": "s", "id": 62270, "pid": 76337, "tid": -914061504, "ts": 1716454222280229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222338765, "dur": 729, "args": { "External id": 62271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62271, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62271, "pid": 5, "tid": 7, "ts": 1716454222338765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280237, "dur": 7, "args": { "External id": 62271, "cbid": 211, "correlation": 62271 } }, { "ph": "s", "id": 62271, "pid": 76337, "tid": -914061504, "ts": 1716454222280237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222339496, "dur": 13, "args": { "External id": 62273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62273, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62273, "pid": 5, "tid": 7, "ts": 1716454222339496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280249, "dur": 5, "args": { "External id": 62273, "cbid": 211, "correlation": 62273 } }, { "ph": "s", "id": 62273, "pid": 76337, "tid": -914061504, "ts": 1716454222280249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222339510, "dur": 16, "args": { "External id": 62279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62279, "pid": 5, "tid": 7, "ts": 1716454222339510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280277, "dur": 8, "args": { "External id": 62279, "cbid": 211, "correlation": 62279 } }, { "ph": "s", "id": 62279, "pid": 76337, "tid": -914061504, "ts": 1716454222280277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222280335, "dur": 0, "args": { "External id": 62289, "cbid": 317, "correlation": 62289 } }, { "ph": "f", "id": 62289, "pid": 76337, "tid": -914061504, "ts": 1716454222280335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222280336, "dur": 0, "args": { "External id": 62290, "cbid": 203, "correlation": 62290 } }, { "ph": "f", "id": 62290, "pid": 76337, "tid": -914061504, "ts": 1716454222280336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222280337, "dur": 0, "args": { "External id": 62291, "cbid": 205, "correlation": 62291 } }, { "ph": "f", "id": 62291, "pid": 76337, "tid": -914061504, "ts": 1716454222280337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222339528, "dur": 9, "args": { "External id": 62295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62295, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62295, "pid": 5, "tid": 7, "ts": 1716454222339528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280358, "dur": 12, "args": { "External id": 62295, "cbid": 211, "correlation": 62295 } }, { "ph": "s", "id": 62295, "pid": 76337, "tid": -914061504, "ts": 1716454222280358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222339539, "dur": 4, "args": { "External id": 62297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62297, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62297, "pid": 5, "tid": 7, "ts": 1716454222339539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280375, "dur": 6, "args": { "External id": 62297, "cbid": 211, "correlation": 62297 } }, { "ph": "s", "id": 62297, "pid": 76337, "tid": -914061504, "ts": 1716454222280375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222280385, "dur": 0, "args": { "External id": 62298, "cbid": 51, "correlation": 62298 } }, { "ph": "s", "id": 62298, "pid": 76337, "tid": -914061504, "ts": 1716454222280385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222339544, "dur": 62, "args": { "External id": 62299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62299, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 62299, "pid": 5, "tid": 7, "ts": 1716454222339544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280385, "dur": 5, "args": { "External id": 62299, "cbid": 211, "correlation": 62299 } }, { "ph": "s", "id": 62299, "pid": 76337, "tid": -914061504, "ts": 1716454222280385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222339608, "dur": 16, "args": { "External id": 62304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62304, "pid": 5, "tid": 7, "ts": 1716454222339608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280412, "dur": 8, "args": { "External id": 62304, "cbid": 211, "correlation": 62304 } }, { "ph": "s", "id": 62304, "pid": 76337, "tid": -914061504, "ts": 1716454222280412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222339625, "dur": 13, "args": { "External id": 62312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62312, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62312, "pid": 5, "tid": 7, "ts": 1716454222339625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280441, "dur": 8, "args": { "External id": 62312, "cbid": 211, "correlation": 62312 } }, { "ph": "s", "id": 62312, "pid": 76337, "tid": -914061504, "ts": 1716454222280441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222339639, "dur": 11, "args": { "External id": 62320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62320, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62320, "pid": 5, "tid": 7, "ts": 1716454222339639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280471, "dur": 8, "args": { "External id": 62320, "cbid": 211, "correlation": 62320 } }, { "ph": "s", "id": 62320, "pid": 76337, "tid": -914061504, "ts": 1716454222280471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222339652, "dur": 20, "args": { "External id": 62340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62340, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62340, "pid": 5, "tid": 7, "ts": 1716454222339652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280551, "dur": 13, "args": { "External id": 62340, "cbid": 211, "correlation": 62340 } }, { "ph": "s", "id": 62340, "pid": 76337, "tid": -914061504, "ts": 1716454222280551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222339673, "dur": 5, "args": { "External id": 62352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62352, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 62352, "pid": 5, "tid": 7, "ts": 1716454222339673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280573, "dur": 6, "args": { "External id": 62352, "cbid": 211, "correlation": 62352 } }, { "ph": "s", "id": 62352, "pid": 76337, "tid": -914061504, "ts": 1716454222280573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222339679, "dur": 19, "args": { "External id": 62355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62355, "pid": 5, "tid": 7, "ts": 1716454222339679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280591, "dur": 6, "args": { "External id": 62355, "cbid": 211, "correlation": 62355 } }, { "ph": "s", "id": 62355, "pid": 76337, "tid": -914061504, "ts": 1716454222280591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222280652, "dur": 0, "args": { "External id": 62366, "cbid": 317, "correlation": 62366 } }, { "ph": "f", "id": 62366, "pid": 76337, "tid": -914061504, "ts": 1716454222280652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222280653, "dur": 0, "args": { "External id": 62367, "cbid": 203, "correlation": 62367 } }, { "ph": "f", "id": 62367, "pid": 76337, "tid": -914061504, "ts": 1716454222280653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222280653, "dur": 0, "args": { "External id": 62368, "cbid": 205, "correlation": 62368 } }, { "ph": "f", "id": 62368, "pid": 76337, "tid": -914061504, "ts": 1716454222280653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222339700, "dur": 11, "args": { "External id": 62372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62372, "pid": 5, "tid": 7, "ts": 1716454222339700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280673, "dur": 12, "args": { "External id": 62372, "cbid": 211, "correlation": 62372 } }, { "ph": "s", "id": 62372, "pid": 76337, "tid": -914061504, "ts": 1716454222280673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222339713, "dur": 4, "args": { "External id": 62374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62374, "pid": 5, "tid": 7, "ts": 1716454222339713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280694, "dur": 6, "args": { "External id": 62374, "cbid": 211, "correlation": 62374 } }, { "ph": "s", "id": 62374, "pid": 76337, "tid": -914061504, "ts": 1716454222280694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222280703, "dur": 0, "args": { "External id": 62375, "cbid": 51, "correlation": 62375 } }, { "ph": "s", "id": 62375, "pid": 76337, "tid": -914061504, "ts": 1716454222280703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222339718, "dur": 107, "args": { "External id": 62376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62376, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 62376, "pid": 5, "tid": 7, "ts": 1716454222339718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280704, "dur": 5, "args": { "External id": 62376, "cbid": 211, "correlation": 62376 } }, { "ph": "s", "id": 62376, "pid": 76337, "tid": -914061504, "ts": 1716454222280704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222339826, "dur": 17, "args": { "External id": 62381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62381, "pid": 5, "tid": 7, "ts": 1716454222339826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280733, "dur": 8, "args": { "External id": 62381, "cbid": 211, "correlation": 62381 } }, { "ph": "s", "id": 62381, "pid": 76337, "tid": -914061504, "ts": 1716454222280733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222339844, "dur": 92, "args": { "External id": 62390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62390, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62390, "pid": 5, "tid": 7, "ts": 1716454222339844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280816, "dur": 15, "args": { "External id": 62390, "cbid": 211, "correlation": 62390 } }, { "ph": "s", "id": 62390, "pid": 76337, "tid": -914061504, "ts": 1716454222280816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222339937, "dur": 33, "args": { "External id": 62412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62412, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62412, "pid": 5, "tid": 7, "ts": 1716454222339937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280873, "dur": 10, "args": { "External id": 62412, "cbid": 211, "correlation": 62412 } }, { "ph": "s", "id": 62412, "pid": 76337, "tid": -914061504, "ts": 1716454222280873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222280965, "dur": 1, "args": { "External id": 62423, "cbid": 251, "correlation": 62423 } }, { "ph": "f", "id": 62423, "pid": 76337, "tid": -914061504, "ts": 1716454222280965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222339971, "dur": 154, "args": { "External id": 62424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62424, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62424, "pid": 5, "tid": 7, "ts": 1716454222339971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222280970, "dur": 23, "args": { "External id": 62424, "cbid": 211, "correlation": 62424 } }, { "ph": "s", "id": 62424, "pid": 76337, "tid": -914061504, "ts": 1716454222280970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281056, "dur": 1, "args": { "External id": 62435, "cbid": 251, "correlation": 62435 } }, { "ph": "f", "id": 62435, "pid": 76337, "tid": -914061504, "ts": 1716454222281056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222340126, "dur": 173, "args": { "External id": 62436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62436, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62436, "pid": 5, "tid": 7, "ts": 1716454222340126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281060, "dur": 11, "args": { "External id": 62436, "cbid": 211, "correlation": 62436 } }, { "ph": "s", "id": 62436, "pid": 76337, "tid": -914061504, "ts": 1716454222281060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281125, "dur": 1, "args": { "External id": 62447, "cbid": 251, "correlation": 62447 } }, { "ph": "f", "id": 62447, "pid": 76337, "tid": -914061504, "ts": 1716454222281125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222340301, "dur": 147, "args": { "External id": 62448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62448, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62448, "pid": 5, "tid": 7, "ts": 1716454222340301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281129, "dur": 11, "args": { "External id": 62448, "cbid": 211, "correlation": 62448 } }, { "ph": "s", "id": 62448, "pid": 76337, "tid": -914061504, "ts": 1716454222281129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222340449, "dur": 364, "args": { "External id": 62473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62473, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62473, "pid": 5, "tid": 7, "ts": 1716454222340449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281214, "dur": 13, "args": { "External id": 62473, "cbid": 211, "correlation": 62473 } }, { "ph": "s", "id": 62473, "pid": 76337, "tid": -914061504, "ts": 1716454222281214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281316, "dur": 1, "args": { "External id": 62491, "cbid": 251, "correlation": 62491 } }, { "ph": "f", "id": 62491, "pid": 76337, "tid": -914061504, "ts": 1716454222281316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222340815, "dur": 183, "args": { "External id": 62493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62493, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62493, "pid": 5, "tid": 7, "ts": 1716454222340815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281322, "dur": 13, "args": { "External id": 62493, "cbid": 211, "correlation": 62493 } }, { "ph": "s", "id": 62493, "pid": 76337, "tid": -914061504, "ts": 1716454222281322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222340999, "dur": 20, "args": { "External id": 62501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62501, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62501, "pid": 5, "tid": 7, "ts": 1716454222340999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281391, "dur": 13, "args": { "External id": 62501, "cbid": 211, "correlation": 62501 } }, { "ph": "s", "id": 62501, "pid": 76337, "tid": -914061504, "ts": 1716454222281391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222341021, "dur": 28, "args": { "External id": 62509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62509, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62509, "pid": 5, "tid": 7, "ts": 1716454222341021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281431, "dur": 11, "args": { "External id": 62509, "cbid": 211, "correlation": 62509 } }, { "ph": "s", "id": 62509, "pid": 76337, "tid": -914061504, "ts": 1716454222281431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222341049, "dur": 20, "args": { "External id": 62520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62520, "pid": 5, "tid": 7, "ts": 1716454222341049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281508, "dur": 13, "args": { "External id": 62520, "cbid": 211, "correlation": 62520 } }, { "ph": "s", "id": 62520, "pid": 76337, "tid": -914061504, "ts": 1716454222281508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222341071, "dur": 18, "args": { "External id": 62542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62542, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62542, "pid": 5, "tid": 7, "ts": 1716454222341071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281539, "dur": 7, "args": { "External id": 62542, "cbid": 211, "correlation": 62542 } }, { "ph": "s", "id": 62542, "pid": 76337, "tid": -914061504, "ts": 1716454222281539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281625, "dur": 1, "args": { "External id": 62553, "cbid": 251, "correlation": 62553 } }, { "ph": "f", "id": 62553, "pid": 76337, "tid": -914061504, "ts": 1716454222281625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222341090, "dur": 97, "args": { "External id": 62554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62554, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62554, "pid": 5, "tid": 7, "ts": 1716454222341090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281631, "dur": 14, "args": { "External id": 62554, "cbid": 211, "correlation": 62554 } }, { "ph": "s", "id": 62554, "pid": 76337, "tid": -914061504, "ts": 1716454222281631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281702, "dur": 1, "args": { "External id": 62565, "cbid": 251, "correlation": 62565 } }, { "ph": "f", "id": 62565, "pid": 76337, "tid": -914061504, "ts": 1716454222281702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281706, "dur": 0, "args": { "External id": 62566, "cbid": 251, "correlation": 62566 } }, { "ph": "f", "id": 62566, "pid": 76337, "tid": -914061504, "ts": 1716454222281706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222341189, "dur": 13, "args": { "External id": 62567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62567, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62567, "pid": 5, "tid": 7, "ts": 1716454222341189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281707, "dur": 12, "args": { "External id": 62567, "cbid": 211, "correlation": 62567 } }, { "ph": "s", "id": 62567, "pid": 76337, "tid": -914061504, "ts": 1716454222281707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222341204, "dur": 6, "args": { "External id": 62569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62569, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62569, "pid": 5, "tid": 7, "ts": 1716454222341204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281721, "dur": 6, "args": { "External id": 62569, "cbid": 211, "correlation": 62569 } }, { "ph": "s", "id": 62569, "pid": 76337, "tid": -914061504, "ts": 1716454222281721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281779, "dur": 1, "args": { "External id": 62580, "cbid": 251, "correlation": 62580 } }, { "ph": "f", "id": 62580, "pid": 76337, "tid": -914061504, "ts": 1716454222281779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281783, "dur": 0, "args": { "External id": 62581, "cbid": 251, "correlation": 62581 } }, { "ph": "f", "id": 62581, "pid": 76337, "tid": -914061504, "ts": 1716454222281783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222341211, "dur": 9, "args": { "External id": 62582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62582, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62582, "pid": 5, "tid": 7, "ts": 1716454222341211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281784, "dur": 14, "args": { "External id": 62582, "cbid": 211, "correlation": 62582 } }, { "ph": "s", "id": 62582, "pid": 76337, "tid": -914061504, "ts": 1716454222281784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222341222, "dur": 3, "args": { "External id": 62584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62584, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62584, "pid": 5, "tid": 7, "ts": 1716454222341222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281800, "dur": 6, "args": { "External id": 62584, "cbid": 211, "correlation": 62584 } }, { "ph": "s", "id": 62584, "pid": 76337, "tid": -914061504, "ts": 1716454222281800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222341227, "dur": 59, "args": { "External id": 62609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62609, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62609, "pid": 5, "tid": 7, "ts": 1716454222341227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281878, "dur": 13, "args": { "External id": 62609, "cbid": 211, "correlation": 62609 } }, { "ph": "s", "id": 62609, "pid": 76337, "tid": -914061504, "ts": 1716454222281878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222281985, "dur": 2, "args": { "External id": 62627, "cbid": 251, "correlation": 62627 } }, { "ph": "f", "id": 62627, "pid": 76337, "tid": -914061504, "ts": 1716454222281985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222341287, "dur": 99, "args": { "External id": 62629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62629, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62629, "pid": 5, "tid": 7, "ts": 1716454222341287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222281992, "dur": 14, "args": { "External id": 62629, "cbid": 211, "correlation": 62629 } }, { "ph": "s", "id": 62629, "pid": 76337, "tid": -914061504, "ts": 1716454222281992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222341388, "dur": 10, "args": { "External id": 62637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62637, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62637, "pid": 5, "tid": 7, "ts": 1716454222341388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282064, "dur": 12, "args": { "External id": 62637, "cbid": 211, "correlation": 62637 } }, { "ph": "s", "id": 62637, "pid": 76337, "tid": -914061504, "ts": 1716454222282064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222341399, "dur": 24, "args": { "External id": 62645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62645, "pid": 5, "tid": 7, "ts": 1716454222341399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282105, "dur": 9, "args": { "External id": 62645, "cbid": 211, "correlation": 62645 } }, { "ph": "s", "id": 62645, "pid": 76337, "tid": -914061504, "ts": 1716454222282105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222341425, "dur": 20, "args": { "External id": 62667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62667, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62667, "pid": 5, "tid": 7, "ts": 1716454222341425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282161, "dur": 10, "args": { "External id": 62667, "cbid": 211, "correlation": 62667 } }, { "ph": "s", "id": 62667, "pid": 76337, "tid": -914061504, "ts": 1716454222282161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222282250, "dur": 2, "args": { "External id": 62683, "cbid": 251, "correlation": 62683 } }, { "ph": "f", "id": 62683, "pid": 76337, "tid": -914061504, "ts": 1716454222282250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222282257, "dur": 0, "args": { "External id": 62685, "cbid": 251, "correlation": 62685 } }, { "ph": "f", "id": 62685, "pid": 76337, "tid": -914061504, "ts": 1716454222282257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222341446, "dur": 525, "args": { "External id": 62686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62686, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62686, "pid": 5, "tid": 7, "ts": 1716454222341446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282259, "dur": 15, "args": { "External id": 62686, "cbid": 211, "correlation": 62686 } }, { "ph": "s", "id": 62686, "pid": 76337, "tid": -914061504, "ts": 1716454222282259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222341972, "dur": 71, "args": { "External id": 62694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62694, "pid": 5, "tid": 7, "ts": 1716454222341972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282327, "dur": 12, "args": { "External id": 62694, "cbid": 211, "correlation": 62694 } }, { "ph": "s", "id": 62694, "pid": 76337, "tid": -914061504, "ts": 1716454222282327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222342044, "dur": 70, "args": { "External id": 62702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62702, "pid": 5, "tid": 7, "ts": 1716454222342044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282357, "dur": 9, "args": { "External id": 62702, "cbid": 211, "correlation": 62702 } }, { "ph": "s", "id": 62702, "pid": 76337, "tid": -914061504, "ts": 1716454222282357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222282437, "dur": 1, "args": { "External id": 62718, "cbid": 251, "correlation": 62718 } }, { "ph": "f", "id": 62718, "pid": 76337, "tid": -914061504, "ts": 1716454222282437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222342116, "dur": 1, "args": { "External id": 62720, "device": 5, "context": 1, "stream": 7, "correlation": 62720, "bytes": 240, "memory bandwidth (GB/s)": 0.14714898835070508 } }, { "ph": "f", "id": 62720, "pid": 5, "tid": 7, "ts": 1716454222342116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222282443, "dur": 11, "args": { "External id": 62720, "cbid": 51, "correlation": 62720 } }, { "ph": "s", "id": 62720, "pid": 76337, "tid": -914061504, "ts": 1716454222282443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222342120, "dur": 289, "args": { "External id": 62721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62721, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62721, "pid": 5, "tid": 7, "ts": 1716454222342120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282455, "dur": 12, "args": { "External id": 62721, "cbid": 211, "correlation": 62721 } }, { "ph": "s", "id": 62721, "pid": 76337, "tid": -914061504, "ts": 1716454222282455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222342411, "dur": 15, "args": { "External id": 62729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62729, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62729, "pid": 5, "tid": 7, "ts": 1716454222342411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282498, "dur": 10, "args": { "External id": 62729, "cbid": 211, "correlation": 62729 } }, { "ph": "s", "id": 62729, "pid": 76337, "tid": -914061504, "ts": 1716454222282498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222342427, "dur": 41, "args": { "External id": 62740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62740, "pid": 5, "tid": 7, "ts": 1716454222342427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282568, "dur": 14, "args": { "External id": 62740, "cbid": 211, "correlation": 62740 } }, { "ph": "s", "id": 62740, "pid": 76337, "tid": -914061504, "ts": 1716454222282568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222282635, "dur": 0, "args": { "External id": 62752, "cbid": 317, "correlation": 62752 } }, { "ph": "f", "id": 62752, "pid": 76337, "tid": -914061504, "ts": 1716454222282635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222282636, "dur": 0, "args": { "External id": 62753, "cbid": 203, "correlation": 62753 } }, { "ph": "f", "id": 62753, "pid": 76337, "tid": -914061504, "ts": 1716454222282636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222282637, "dur": 0, "args": { "External id": 62754, "cbid": 205, "correlation": 62754 } }, { "ph": "f", "id": 62754, "pid": 76337, "tid": -914061504, "ts": 1716454222282637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222342470, "dur": 15, "args": { "External id": 62758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62758, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62758, "pid": 5, "tid": 7, "ts": 1716454222342470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282652, "dur": 12, "args": { "External id": 62758, "cbid": 211, "correlation": 62758 } }, { "ph": "s", "id": 62758, "pid": 76337, "tid": -914061504, "ts": 1716454222282652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222342486, "dur": 4, "args": { "External id": 62760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 62760, "pid": 5, "tid": 7, "ts": 1716454222342486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282668, "dur": 6, "args": { "External id": 62760, "cbid": 211, "correlation": 62760 } }, { "ph": "s", "id": 62760, "pid": 76337, "tid": -914061504, "ts": 1716454222282668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222282677, "dur": 0, "args": { "External id": 62761, "cbid": 51, "correlation": 62761 } }, { "ph": "s", "id": 62761, "pid": 76337, "tid": -914061504, "ts": 1716454222282677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222342491, "dur": 106, "args": { "External id": 62762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62762, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 62762, "pid": 5, "tid": 7, "ts": 1716454222342491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282677, "dur": 5, "args": { "External id": 62762, "cbid": 211, "correlation": 62762 } }, { "ph": "s", "id": 62762, "pid": 76337, "tid": -914061504, "ts": 1716454222282677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222342599, "dur": 17, "args": { "External id": 62767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62767, "pid": 5, "tid": 7, "ts": 1716454222342599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282705, "dur": 9, "args": { "External id": 62767, "cbid": 211, "correlation": 62767 } }, { "ph": "s", "id": 62767, "pid": 76337, "tid": -914061504, "ts": 1716454222282705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222342617, "dur": 12, "args": { "External id": 62775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62775, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62775, "pid": 5, "tid": 7, "ts": 1716454222342617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282736, "dur": 9, "args": { "External id": 62775, "cbid": 211, "correlation": 62775 } }, { "ph": "s", "id": 62775, "pid": 76337, "tid": -914061504, "ts": 1716454222282736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222342630, "dur": 19, "args": { "External id": 62795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62795, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62795, "pid": 5, "tid": 7, "ts": 1716454222342630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282808, "dur": 12, "args": { "External id": 62795, "cbid": 211, "correlation": 62795 } }, { "ph": "s", "id": 62795, "pid": 76337, "tid": -914061504, "ts": 1716454222282808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222342651, "dur": 5, "args": { "External id": 62807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62807, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 62807, "pid": 5, "tid": 7, "ts": 1716454222342651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282834, "dur": 7, "args": { "External id": 62807, "cbid": 211, "correlation": 62807 } }, { "ph": "s", "id": 62807, "pid": 76337, "tid": -914061504, "ts": 1716454222282834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222342657, "dur": 19, "args": { "External id": 62810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62810, "pid": 5, "tid": 7, "ts": 1716454222342657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282852, "dur": 7, "args": { "External id": 62810, "cbid": 211, "correlation": 62810 } }, { "ph": "s", "id": 62810, "pid": 76337, "tid": -914061504, "ts": 1716454222282852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222342678, "dur": 12, "args": { "External id": 62819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62819, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62819, "pid": 5, "tid": 7, "ts": 1716454222342678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282891, "dur": 10, "args": { "External id": 62819, "cbid": 211, "correlation": 62819 } }, { "ph": "s", "id": 62819, "pid": 76337, "tid": -914061504, "ts": 1716454222282891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222282942, "dur": 0, "args": { "External id": 62829, "cbid": 317, "correlation": 62829 } }, { "ph": "f", "id": 62829, "pid": 76337, "tid": -914061504, "ts": 1716454222282942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222282943, "dur": 0, "args": { "External id": 62830, "cbid": 203, "correlation": 62830 } }, { "ph": "f", "id": 62830, "pid": 76337, "tid": -914061504, "ts": 1716454222282943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222282944, "dur": 0, "args": { "External id": 62831, "cbid": 205, "correlation": 62831 } }, { "ph": "f", "id": 62831, "pid": 76337, "tid": -914061504, "ts": 1716454222282944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222342691, "dur": 12, "args": { "External id": 62835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62835, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62835, "pid": 5, "tid": 7, "ts": 1716454222342691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282957, "dur": 12, "args": { "External id": 62835, "cbid": 211, "correlation": 62835 } }, { "ph": "s", "id": 62835, "pid": 76337, "tid": -914061504, "ts": 1716454222282957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222342705, "dur": 177, "args": { "External id": 62837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62837, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62837, "pid": 5, "tid": 7, "ts": 1716454222342705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222282972, "dur": 14, "args": { "External id": 62837, "cbid": 211, "correlation": 62837 } }, { "ph": "s", "id": 62837, "pid": 76337, "tid": -914061504, "ts": 1716454222282972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222342884, "dur": 1, "args": { "External id": 62839, "device": 5, "context": 1, "stream": 7, "correlation": 62839, "bytes": 960, "memory bandwidth (GB/s)": 0.5555555555555556 } }, { "ph": "f", "id": 62839, "pid": 5, "tid": 7, "ts": 1716454222342884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222282993, "dur": 6, "args": { "External id": 62839, "cbid": 51, "correlation": 62839 } }, { "ph": "s", "id": 62839, "pid": 76337, "tid": -914061504, "ts": 1716454222282993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222342888, "dur": 727, "args": { "External id": 62840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62840, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62840, "pid": 5, "tid": 7, "ts": 1716454222342888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283000, "dur": 7, "args": { "External id": 62840, "cbid": 211, "correlation": 62840 } }, { "ph": "s", "id": 62840, "pid": 76337, "tid": -914061504, "ts": 1716454222283000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222343617, "dur": 14, "args": { "External id": 62842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62842, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62842, "pid": 5, "tid": 7, "ts": 1716454222343617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283011, "dur": 5, "args": { "External id": 62842, "cbid": 211, "correlation": 62842 } }, { "ph": "s", "id": 62842, "pid": 76337, "tid": -914061504, "ts": 1716454222283011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222343633, "dur": 16, "args": { "External id": 62848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62848, "pid": 5, "tid": 7, "ts": 1716454222343633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283041, "dur": 8, "args": { "External id": 62848, "cbid": 211, "correlation": 62848 } }, { "ph": "s", "id": 62848, "pid": 76337, "tid": -914061504, "ts": 1716454222283041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222343650, "dur": 4, "args": { "External id": 62856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62856, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 62856, "pid": 5, "tid": 7, "ts": 1716454222343650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283085, "dur": 9, "args": { "External id": 62856, "cbid": 211, "correlation": 62856 } }, { "ph": "s", "id": 62856, "pid": 76337, "tid": -914061504, "ts": 1716454222283085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222283154, "dur": 1, "args": { "External id": 62872, "cbid": 251, "correlation": 62872 } }, { "ph": "f", "id": 62872, "pid": 76337, "tid": -914061504, "ts": 1716454222283154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222283159, "dur": 0, "args": { "External id": 62874, "cbid": 251, "correlation": 62874 } }, { "ph": "f", "id": 62874, "pid": 76337, "tid": -914061504, "ts": 1716454222283159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222343655, "dur": 14, "args": { "External id": 62875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62875, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62875, "pid": 5, "tid": 7, "ts": 1716454222343655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283161, "dur": 11, "args": { "External id": 62875, "cbid": 211, "correlation": 62875 } }, { "ph": "s", "id": 62875, "pid": 76337, "tid": -914061504, "ts": 1716454222283161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222343671, "dur": 6, "args": { "External id": 62877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62877, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62877, "pid": 5, "tid": 7, "ts": 1716454222343671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283174, "dur": 6, "args": { "External id": 62877, "cbid": 211, "correlation": 62877 } }, { "ph": "s", "id": 62877, "pid": 76337, "tid": -914061504, "ts": 1716454222283174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222343678, "dur": 19, "args": { "External id": 62887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62887, "pid": 5, "tid": 7, "ts": 1716454222343678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283233, "dur": 12, "args": { "External id": 62887, "cbid": 211, "correlation": 62887 } }, { "ph": "s", "id": 62887, "pid": 76337, "tid": -914061504, "ts": 1716454222283233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222343699, "dur": 20, "args": { "External id": 62907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62907, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62907, "pid": 5, "tid": 7, "ts": 1716454222343699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283298, "dur": 11, "args": { "External id": 62907, "cbid": 211, "correlation": 62907 } }, { "ph": "s", "id": 62907, "pid": 76337, "tid": -914061504, "ts": 1716454222283298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222343720, "dur": 4, "args": { "External id": 62919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62919, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 62919, "pid": 5, "tid": 7, "ts": 1716454222343720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283319, "dur": 6, "args": { "External id": 62919, "cbid": 211, "correlation": 62919 } }, { "ph": "s", "id": 62919, "pid": 76337, "tid": -914061504, "ts": 1716454222283319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222343726, "dur": 18, "args": { "External id": 62922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62922, "pid": 5, "tid": 7, "ts": 1716454222343726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283338, "dur": 6, "args": { "External id": 62922, "cbid": 211, "correlation": 62922 } }, { "ph": "s", "id": 62922, "pid": 76337, "tid": -914061504, "ts": 1716454222283338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222343745, "dur": 12, "args": { "External id": 62931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62931, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62931, "pid": 5, "tid": 7, "ts": 1716454222343745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283378, "dur": 10, "args": { "External id": 62931, "cbid": 211, "correlation": 62931 } }, { "ph": "s", "id": 62931, "pid": 76337, "tid": -914061504, "ts": 1716454222283378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222283443, "dur": 0, "args": { "External id": 62941, "cbid": 317, "correlation": 62941 } }, { "ph": "f", "id": 62941, "pid": 76337, "tid": -914061504, "ts": 1716454222283443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222283444, "dur": 0, "args": { "External id": 62942, "cbid": 203, "correlation": 62942 } }, { "ph": "f", "id": 62942, "pid": 76337, "tid": -914061504, "ts": 1716454222283444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222283445, "dur": 0, "args": { "External id": 62943, "cbid": 205, "correlation": 62943 } }, { "ph": "f", "id": 62943, "pid": 76337, "tid": -914061504, "ts": 1716454222283445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222343758, "dur": 12, "args": { "External id": 62947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62947, "pid": 5, "tid": 7, "ts": 1716454222343758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283459, "dur": 12, "args": { "External id": 62947, "cbid": 211, "correlation": 62947 } }, { "ph": "s", "id": 62947, "pid": 76337, "tid": -914061504, "ts": 1716454222283459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222343771, "dur": 178, "args": { "External id": 62949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62949, "pid": 5, "tid": 7, "ts": 1716454222343771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283474, "dur": 5, "args": { "External id": 62949, "cbid": 211, "correlation": 62949 } }, { "ph": "s", "id": 62949, "pid": 76337, "tid": -914061504, "ts": 1716454222283474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222343951, "dur": 1, "args": { "External id": 62951, "device": 5, "context": 1, "stream": 7, "correlation": 62951, "bytes": 960, "memory bandwidth (GB/s)": 0.5555555555555556 } }, { "ph": "f", "id": 62951, "pid": 5, "tid": 7, "ts": 1716454222343951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222283485, "dur": 7, "args": { "External id": 62951, "cbid": 51, "correlation": 62951 } }, { "ph": "s", "id": 62951, "pid": 76337, "tid": -914061504, "ts": 1716454222283485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222343955, "dur": 712, "args": { "External id": 62952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62952, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 62952, "pid": 5, "tid": 7, "ts": 1716454222343955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283493, "dur": 6, "args": { "External id": 62952, "cbid": 211, "correlation": 62952 } }, { "ph": "s", "id": 62952, "pid": 76337, "tid": -914061504, "ts": 1716454222283493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222344669, "dur": 13, "args": { "External id": 62954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62954, "pid": 5, "tid": 7, "ts": 1716454222344669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283502, "dur": 5, "args": { "External id": 62954, "cbid": 211, "correlation": 62954 } }, { "ph": "s", "id": 62954, "pid": 76337, "tid": -914061504, "ts": 1716454222283502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222344683, "dur": 16, "args": { "External id": 62960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62960, "pid": 5, "tid": 7, "ts": 1716454222344683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283530, "dur": 9, "args": { "External id": 62960, "cbid": 211, "correlation": 62960 } }, { "ph": "s", "id": 62960, "pid": 76337, "tid": -914061504, "ts": 1716454222283530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222344700, "dur": 13, "args": { "External id": 62968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62968, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62968, "pid": 5, "tid": 7, "ts": 1716454222344700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283563, "dur": 8, "args": { "External id": 62968, "cbid": 211, "correlation": 62968 } }, { "ph": "s", "id": 62968, "pid": 76337, "tid": -914061504, "ts": 1716454222283563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222344715, "dur": 11, "args": { "External id": 62976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62976, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 62976, "pid": 5, "tid": 7, "ts": 1716454222344715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283592, "dur": 9, "args": { "External id": 62976, "cbid": 211, "correlation": 62976 } }, { "ph": "s", "id": 62976, "pid": 76337, "tid": -914061504, "ts": 1716454222283592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222344727, "dur": 19, "args": { "External id": 62996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 62996, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 62996, "pid": 5, "tid": 7, "ts": 1716454222344727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283670, "dur": 12, "args": { "External id": 62996, "cbid": 211, "correlation": 62996 } }, { "ph": "s", "id": 62996, "pid": 76337, "tid": -914061504, "ts": 1716454222283670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222344747, "dur": 4, "args": { "External id": 63008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63008, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63008, "pid": 5, "tid": 7, "ts": 1716454222344747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283695, "dur": 7, "args": { "External id": 63008, "cbid": 211, "correlation": 63008 } }, { "ph": "s", "id": 63008, "pid": 76337, "tid": -914061504, "ts": 1716454222283695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222344753, "dur": 17, "args": { "External id": 63011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63011, "pid": 5, "tid": 7, "ts": 1716454222344753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283714, "dur": 6, "args": { "External id": 63011, "cbid": 211, "correlation": 63011 } }, { "ph": "s", "id": 63011, "pid": 76337, "tid": -914061504, "ts": 1716454222283714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222283771, "dur": 0, "args": { "External id": 63022, "cbid": 317, "correlation": 63022 } }, { "ph": "f", "id": 63022, "pid": 76337, "tid": -914061504, "ts": 1716454222283771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222283772, "dur": 0, "args": { "External id": 63023, "cbid": 203, "correlation": 63023 } }, { "ph": "f", "id": 63023, "pid": 76337, "tid": -914061504, "ts": 1716454222283772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222283772, "dur": 0, "args": { "External id": 63024, "cbid": 205, "correlation": 63024 } }, { "ph": "f", "id": 63024, "pid": 76337, "tid": -914061504, "ts": 1716454222283772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222344772, "dur": 11, "args": { "External id": 63028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63028, "pid": 5, "tid": 7, "ts": 1716454222344772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283785, "dur": 12, "args": { "External id": 63028, "cbid": 211, "correlation": 63028 } }, { "ph": "s", "id": 63028, "pid": 76337, "tid": -914061504, "ts": 1716454222283785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222344784, "dur": 4, "args": { "External id": 63030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 63030, "pid": 5, "tid": 7, "ts": 1716454222344784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283801, "dur": 6, "args": { "External id": 63030, "cbid": 211, "correlation": 63030 } }, { "ph": "s", "id": 63030, "pid": 76337, "tid": -914061504, "ts": 1716454222283801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222283810, "dur": 0, "args": { "External id": 63031, "cbid": 51, "correlation": 63031 } }, { "ph": "s", "id": 63031, "pid": 76337, "tid": -914061504, "ts": 1716454222283810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222344789, "dur": 103, "args": { "External id": 63032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63032, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 63032, "pid": 5, "tid": 7, "ts": 1716454222344789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283811, "dur": 5, "args": { "External id": 63032, "cbid": 211, "correlation": 63032 } }, { "ph": "s", "id": 63032, "pid": 76337, "tid": -914061504, "ts": 1716454222283811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222344894, "dur": 17, "args": { "External id": 63037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63037, "pid": 5, "tid": 7, "ts": 1716454222344894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283837, "dur": 8, "args": { "External id": 63037, "cbid": 211, "correlation": 63037 } }, { "ph": "s", "id": 63037, "pid": 76337, "tid": -914061504, "ts": 1716454222283837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222344912, "dur": 91, "args": { "External id": 63046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63046, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63046, "pid": 5, "tid": 7, "ts": 1716454222344912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283919, "dur": 14, "args": { "External id": 63046, "cbid": 211, "correlation": 63046 } }, { "ph": "s", "id": 63046, "pid": 76337, "tid": -914061504, "ts": 1716454222283919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222345004, "dur": 32, "args": { "External id": 63068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63068, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63068, "pid": 5, "tid": 7, "ts": 1716454222345004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222283990, "dur": 11, "args": { "External id": 63068, "cbid": 211, "correlation": 63068 } }, { "ph": "s", "id": 63068, "pid": 76337, "tid": -914061504, "ts": 1716454222283990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284083, "dur": 1, "args": { "External id": 63079, "cbid": 251, "correlation": 63079 } }, { "ph": "f", "id": 63079, "pid": 76337, "tid": -914061504, "ts": 1716454222284083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222345038, "dur": 180, "args": { "External id": 63080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63080, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63080, "pid": 5, "tid": 7, "ts": 1716454222345038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284089, "dur": 13, "args": { "External id": 63080, "cbid": 211, "correlation": 63080 } }, { "ph": "s", "id": 63080, "pid": 76337, "tid": -914061504, "ts": 1716454222284089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284159, "dur": 1, "args": { "External id": 63091, "cbid": 251, "correlation": 63091 } }, { "ph": "f", "id": 63091, "pid": 76337, "tid": -914061504, "ts": 1716454222284159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222345219, "dur": 176, "args": { "External id": 63092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63092, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63092, "pid": 5, "tid": 7, "ts": 1716454222345219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284163, "dur": 12, "args": { "External id": 63092, "cbid": 211, "correlation": 63092 } }, { "ph": "s", "id": 63092, "pid": 76337, "tid": -914061504, "ts": 1716454222284163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284227, "dur": 1, "args": { "External id": 63103, "cbid": 251, "correlation": 63103 } }, { "ph": "f", "id": 63103, "pid": 76337, "tid": -914061504, "ts": 1716454222284227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222345397, "dur": 172, "args": { "External id": 63104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63104, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63104, "pid": 5, "tid": 7, "ts": 1716454222345397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284231, "dur": 11, "args": { "External id": 63104, "cbid": 211, "correlation": 63104 } }, { "ph": "s", "id": 63104, "pid": 76337, "tid": -914061504, "ts": 1716454222284231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222345570, "dur": 361, "args": { "External id": 63129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63129, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63129, "pid": 5, "tid": 7, "ts": 1716454222345570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284313, "dur": 13, "args": { "External id": 63129, "cbid": 211, "correlation": 63129 } }, { "ph": "s", "id": 63129, "pid": 76337, "tid": -914061504, "ts": 1716454222284313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284411, "dur": 1, "args": { "External id": 63147, "cbid": 251, "correlation": 63147 } }, { "ph": "f", "id": 63147, "pid": 76337, "tid": -914061504, "ts": 1716454222284411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222345933, "dur": 184, "args": { "External id": 63149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63149, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63149, "pid": 5, "tid": 7, "ts": 1716454222345933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284417, "dur": 14, "args": { "External id": 63149, "cbid": 211, "correlation": 63149 } }, { "ph": "s", "id": 63149, "pid": 76337, "tid": -914061504, "ts": 1716454222284417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222346119, "dur": 19, "args": { "External id": 63157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63157, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63157, "pid": 5, "tid": 7, "ts": 1716454222346119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284491, "dur": 12, "args": { "External id": 63157, "cbid": 211, "correlation": 63157 } }, { "ph": "s", "id": 63157, "pid": 76337, "tid": -914061504, "ts": 1716454222284491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222346139, "dur": 28, "args": { "External id": 63165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63165, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63165, "pid": 5, "tid": 7, "ts": 1716454222346139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284530, "dur": 8, "args": { "External id": 63165, "cbid": 211, "correlation": 63165 } }, { "ph": "s", "id": 63165, "pid": 76337, "tid": -914061504, "ts": 1716454222284530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222346169, "dur": 19, "args": { "External id": 63176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63176, "pid": 5, "tid": 7, "ts": 1716454222346169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284602, "dur": 12, "args": { "External id": 63176, "cbid": 211, "correlation": 63176 } }, { "ph": "s", "id": 63176, "pid": 76337, "tid": -914061504, "ts": 1716454222284602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222346189, "dur": 17, "args": { "External id": 63198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63198, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63198, "pid": 5, "tid": 7, "ts": 1716454222346189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284633, "dur": 8, "args": { "External id": 63198, "cbid": 211, "correlation": 63198 } }, { "ph": "s", "id": 63198, "pid": 76337, "tid": -914061504, "ts": 1716454222284633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284717, "dur": 1, "args": { "External id": 63209, "cbid": 251, "correlation": 63209 } }, { "ph": "f", "id": 63209, "pid": 76337, "tid": -914061504, "ts": 1716454222284717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222346208, "dur": 96, "args": { "External id": 63210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63210, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 63210, "pid": 5, "tid": 7, "ts": 1716454222346208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284722, "dur": 13, "args": { "External id": 63210, "cbid": 211, "correlation": 63210 } }, { "ph": "s", "id": 63210, "pid": 76337, "tid": -914061504, "ts": 1716454222284722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284792, "dur": 1, "args": { "External id": 63221, "cbid": 251, "correlation": 63221 } }, { "ph": "f", "id": 63221, "pid": 76337, "tid": -914061504, "ts": 1716454222284792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284795, "dur": 0, "args": { "External id": 63222, "cbid": 251, "correlation": 63222 } }, { "ph": "f", "id": 63222, "pid": 76337, "tid": -914061504, "ts": 1716454222284795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222346306, "dur": 13, "args": { "External id": 63223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63223, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63223, "pid": 5, "tid": 7, "ts": 1716454222346306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284797, "dur": 11, "args": { "External id": 63223, "cbid": 211, "correlation": 63223 } }, { "ph": "s", "id": 63223, "pid": 76337, "tid": -914061504, "ts": 1716454222284797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222346321, "dur": 6, "args": { "External id": 63225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63225, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63225, "pid": 5, "tid": 7, "ts": 1716454222346321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284810, "dur": 9, "args": { "External id": 63225, "cbid": 211, "correlation": 63225 } }, { "ph": "s", "id": 63225, "pid": 76337, "tid": -914061504, "ts": 1716454222284810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284871, "dur": 1, "args": { "External id": 63236, "cbid": 251, "correlation": 63236 } }, { "ph": "f", "id": 63236, "pid": 76337, "tid": -914061504, "ts": 1716454222284871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222284874, "dur": 0, "args": { "External id": 63237, "cbid": 251, "correlation": 63237 } }, { "ph": "f", "id": 63237, "pid": 76337, "tid": -914061504, "ts": 1716454222284874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222346328, "dur": 9, "args": { "External id": 63238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63238, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63238, "pid": 5, "tid": 7, "ts": 1716454222346328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284875, "dur": 12, "args": { "External id": 63238, "cbid": 211, "correlation": 63238 } }, { "ph": "s", "id": 63238, "pid": 76337, "tid": -914061504, "ts": 1716454222284875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222346339, "dur": 4, "args": { "External id": 63240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63240, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63240, "pid": 5, "tid": 7, "ts": 1716454222346339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284889, "dur": 5, "args": { "External id": 63240, "cbid": 211, "correlation": 63240 } }, { "ph": "s", "id": 63240, "pid": 76337, "tid": -914061504, "ts": 1716454222284889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222346344, "dur": 60, "args": { "External id": 63265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63265, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63265, "pid": 5, "tid": 7, "ts": 1716454222346344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222284966, "dur": 21, "args": { "External id": 63265, "cbid": 211, "correlation": 63265 } }, { "ph": "s", "id": 63265, "pid": 76337, "tid": -914061504, "ts": 1716454222284966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222285074, "dur": 1, "args": { "External id": 63283, "cbid": 251, "correlation": 63283 } }, { "ph": "f", "id": 63283, "pid": 76337, "tid": -914061504, "ts": 1716454222285074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222346406, "dur": 99, "args": { "External id": 63285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63285, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 63285, "pid": 5, "tid": 7, "ts": 1716454222346406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285080, "dur": 13, "args": { "External id": 63285, "cbid": 211, "correlation": 63285 } }, { "ph": "s", "id": 63285, "pid": 76337, "tid": -914061504, "ts": 1716454222285080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222346507, "dur": 10, "args": { "External id": 63293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63293, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63293, "pid": 5, "tid": 7, "ts": 1716454222346507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285149, "dur": 13, "args": { "External id": 63293, "cbid": 211, "correlation": 63293 } }, { "ph": "s", "id": 63293, "pid": 76337, "tid": -914061504, "ts": 1716454222285149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222346519, "dur": 23, "args": { "External id": 63301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63301, "pid": 5, "tid": 7, "ts": 1716454222346519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285191, "dur": 12, "args": { "External id": 63301, "cbid": 211, "correlation": 63301 } }, { "ph": "s", "id": 63301, "pid": 76337, "tid": -914061504, "ts": 1716454222285191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222346543, "dur": 19, "args": { "External id": 63323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63323, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63323, "pid": 5, "tid": 7, "ts": 1716454222346543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285245, "dur": 11, "args": { "External id": 63323, "cbid": 211, "correlation": 63323 } }, { "ph": "s", "id": 63323, "pid": 76337, "tid": -914061504, "ts": 1716454222285245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222285333, "dur": 1, "args": { "External id": 63339, "cbid": 251, "correlation": 63339 } }, { "ph": "f", "id": 63339, "pid": 76337, "tid": -914061504, "ts": 1716454222285333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222285338, "dur": 0, "args": { "External id": 63341, "cbid": 251, "correlation": 63341 } }, { "ph": "f", "id": 63341, "pid": 76337, "tid": -914061504, "ts": 1716454222285338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222346563, "dur": 525, "args": { "External id": 63342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63342, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63342, "pid": 5, "tid": 7, "ts": 1716454222346563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285340, "dur": 12, "args": { "External id": 63342, "cbid": 211, "correlation": 63342 } }, { "ph": "s", "id": 63342, "pid": 76337, "tid": -914061504, "ts": 1716454222285340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222347089, "dur": 70, "args": { "External id": 63350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63350, "pid": 5, "tid": 7, "ts": 1716454222347089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285405, "dur": 13, "args": { "External id": 63350, "cbid": 211, "correlation": 63350 } }, { "ph": "s", "id": 63350, "pid": 76337, "tid": -914061504, "ts": 1716454222285405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222347161, "dur": 70, "args": { "External id": 63358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63358, "pid": 5, "tid": 7, "ts": 1716454222347161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285435, "dur": 8, "args": { "External id": 63358, "cbid": 211, "correlation": 63358 } }, { "ph": "s", "id": 63358, "pid": 76337, "tid": -914061504, "ts": 1716454222285435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222285514, "dur": 1, "args": { "External id": 63374, "cbid": 251, "correlation": 63374 } }, { "ph": "f", "id": 63374, "pid": 76337, "tid": -914061504, "ts": 1716454222285514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222347233, "dur": 1, "args": { "External id": 63376, "device": 5, "context": 1, "stream": 7, "correlation": 63376, "bytes": 240, "memory bandwidth (GB/s)": 0.14990630855715179 } }, { "ph": "f", "id": 63376, "pid": 5, "tid": 7, "ts": 1716454222347233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222285519, "dur": 9, "args": { "External id": 63376, "cbid": 51, "correlation": 63376 } }, { "ph": "s", "id": 63376, "pid": 76337, "tid": -914061504, "ts": 1716454222285519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222347237, "dur": 294, "args": { "External id": 63377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63377, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 63377, "pid": 5, "tid": 7, "ts": 1716454222347237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285530, "dur": 14, "args": { "External id": 63377, "cbid": 211, "correlation": 63377 } }, { "ph": "s", "id": 63377, "pid": 76337, "tid": -914061504, "ts": 1716454222285530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222347532, "dur": 14, "args": { "External id": 63385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63385, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63385, "pid": 5, "tid": 7, "ts": 1716454222347532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285576, "dur": 10, "args": { "External id": 63385, "cbid": 211, "correlation": 63385 } }, { "ph": "s", "id": 63385, "pid": 76337, "tid": -914061504, "ts": 1716454222285576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222347547, "dur": 41, "args": { "External id": 63396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63396, "pid": 5, "tid": 7, "ts": 1716454222347547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285645, "dur": 12, "args": { "External id": 63396, "cbid": 211, "correlation": 63396 } }, { "ph": "s", "id": 63396, "pid": 76337, "tid": -914061504, "ts": 1716454222285645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222285708, "dur": 0, "args": { "External id": 63408, "cbid": 317, "correlation": 63408 } }, { "ph": "f", "id": 63408, "pid": 76337, "tid": -914061504, "ts": 1716454222285708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222285709, "dur": 0, "args": { "External id": 63409, "cbid": 203, "correlation": 63409 } }, { "ph": "f", "id": 63409, "pid": 76337, "tid": -914061504, "ts": 1716454222285709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222285710, "dur": 0, "args": { "External id": 63410, "cbid": 205, "correlation": 63410 } }, { "ph": "f", "id": 63410, "pid": 76337, "tid": -914061504, "ts": 1716454222285710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222347589, "dur": 14, "args": { "External id": 63414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63414, "pid": 5, "tid": 7, "ts": 1716454222347589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285726, "dur": 12, "args": { "External id": 63414, "cbid": 211, "correlation": 63414 } }, { "ph": "s", "id": 63414, "pid": 76337, "tid": -914061504, "ts": 1716454222285726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222347605, "dur": 4, "args": { "External id": 63416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 63416, "pid": 5, "tid": 7, "ts": 1716454222347605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285742, "dur": 6, "args": { "External id": 63416, "cbid": 211, "correlation": 63416 } }, { "ph": "s", "id": 63416, "pid": 76337, "tid": -914061504, "ts": 1716454222285742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222285751, "dur": 0, "args": { "External id": 63417, "cbid": 51, "correlation": 63417 } }, { "ph": "s", "id": 63417, "pid": 76337, "tid": -914061504, "ts": 1716454222285751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222347610, "dur": 107, "args": { "External id": 63418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63418, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 63418, "pid": 5, "tid": 7, "ts": 1716454222347610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285751, "dur": 5, "args": { "External id": 63418, "cbid": 211, "correlation": 63418 } }, { "ph": "s", "id": 63418, "pid": 76337, "tid": -914061504, "ts": 1716454222285751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222347718, "dur": 19, "args": { "External id": 63423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63423, "pid": 5, "tid": 7, "ts": 1716454222347718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285779, "dur": 9, "args": { "External id": 63423, "cbid": 211, "correlation": 63423 } }, { "ph": "s", "id": 63423, "pid": 76337, "tid": -914061504, "ts": 1716454222285779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222347738, "dur": 13, "args": { "External id": 63431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63431, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63431, "pid": 5, "tid": 7, "ts": 1716454222347738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285810, "dur": 8, "args": { "External id": 63431, "cbid": 211, "correlation": 63431 } }, { "ph": "s", "id": 63431, "pid": 76337, "tid": -914061504, "ts": 1716454222285810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222285879, "dur": 0, "args": { "External id": 63441, "cbid": 317, "correlation": 63441 } }, { "ph": "f", "id": 63441, "pid": 76337, "tid": -914061504, "ts": 1716454222285879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222285879, "dur": 0, "args": { "External id": 63442, "cbid": 203, "correlation": 63442 } }, { "ph": "f", "id": 63442, "pid": 76337, "tid": -914061504, "ts": 1716454222285879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222285880, "dur": 0, "args": { "External id": 63443, "cbid": 205, "correlation": 63443 } }, { "ph": "f", "id": 63443, "pid": 76337, "tid": -914061504, "ts": 1716454222285880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222347753, "dur": 12, "args": { "External id": 63447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63447, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63447, "pid": 5, "tid": 7, "ts": 1716454222347753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285904, "dur": 14, "args": { "External id": 63447, "cbid": 211, "correlation": 63447 } }, { "ph": "s", "id": 63447, "pid": 76337, "tid": -914061504, "ts": 1716454222285904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222347766, "dur": 177, "args": { "External id": 63449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63449, "pid": 5, "tid": 7, "ts": 1716454222347766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285920, "dur": 6, "args": { "External id": 63449, "cbid": 211, "correlation": 63449 } }, { "ph": "s", "id": 63449, "pid": 76337, "tid": -914061504, "ts": 1716454222285920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222347945, "dur": 1, "args": { "External id": 63451, "device": 5, "context": 1, "stream": 7, "correlation": 63451, "bytes": 960, "memory bandwidth (GB/s)": 0.5454545454545454 } }, { "ph": "f", "id": 63451, "pid": 5, "tid": 7, "ts": 1716454222347945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222285933, "dur": 7, "args": { "External id": 63451, "cbid": 51, "correlation": 63451 } }, { "ph": "s", "id": 63451, "pid": 76337, "tid": -914061504, "ts": 1716454222285933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222347949, "dur": 218, "args": { "External id": 63452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63452, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 63452, "pid": 5, "tid": 7, "ts": 1716454222347949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285942, "dur": 8, "args": { "External id": 63452, "cbid": 211, "correlation": 63452 } }, { "ph": "s", "id": 63452, "pid": 76337, "tid": -914061504, "ts": 1716454222285942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348169, "dur": 7, "args": { "External id": 63454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63454, "pid": 5, "tid": 7, "ts": 1716454222348169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285954, "dur": 5, "args": { "External id": 63454, "cbid": 211, "correlation": 63454 } }, { "ph": "s", "id": 63454, "pid": 76337, "tid": -914061504, "ts": 1716454222285954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222348176, "dur": 7, "args": { "External id": 63460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63460, "pid": 5, "tid": 7, "ts": 1716454222348176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222285991, "dur": 9, "args": { "External id": 63460, "cbid": 211, "correlation": 63460 } }, { "ph": "s", "id": 63460, "pid": 76337, "tid": -914061504, "ts": 1716454222285991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222348185, "dur": 12, "args": { "External id": 63480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63480, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63480, "pid": 5, "tid": 7, "ts": 1716454222348185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286085, "dur": 12, "args": { "External id": 63480, "cbid": 211, "correlation": 63480 } }, { "ph": "s", "id": 63480, "pid": 76337, "tid": -914061504, "ts": 1716454222286085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222348198, "dur": 5, "args": { "External id": 63492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63492, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63492, "pid": 5, "tid": 7, "ts": 1716454222348198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286108, "dur": 7, "args": { "External id": 63492, "cbid": 211, "correlation": 63492 } }, { "ph": "s", "id": 63492, "pid": 76337, "tid": -914061504, "ts": 1716454222286108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222348204, "dur": 9, "args": { "External id": 63495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63495, "pid": 5, "tid": 7, "ts": 1716454222348204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286127, "dur": 7, "args": { "External id": 63495, "cbid": 211, "correlation": 63495 } }, { "ph": "s", "id": 63495, "pid": 76337, "tid": -914061504, "ts": 1716454222286127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222348215, "dur": 6, "args": { "External id": 63504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63504, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63504, "pid": 5, "tid": 7, "ts": 1716454222348215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286167, "dur": 10, "args": { "External id": 63504, "cbid": 211, "correlation": 63504 } }, { "ph": "s", "id": 63504, "pid": 76337, "tid": -914061504, "ts": 1716454222286167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222286223, "dur": 0, "args": { "External id": 63514, "cbid": 317, "correlation": 63514 } }, { "ph": "f", "id": 63514, "pid": 76337, "tid": -914061504, "ts": 1716454222286223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222286224, "dur": 0, "args": { "External id": 63515, "cbid": 203, "correlation": 63515 } }, { "ph": "f", "id": 63515, "pid": 76337, "tid": -914061504, "ts": 1716454222286224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222286225, "dur": 0, "args": { "External id": 63516, "cbid": 205, "correlation": 63516 } }, { "ph": "f", "id": 63516, "pid": 76337, "tid": -914061504, "ts": 1716454222286225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348222, "dur": 6, "args": { "External id": 63520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63520, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63520, "pid": 5, "tid": 7, "ts": 1716454222348222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286242, "dur": 12, "args": { "External id": 63520, "cbid": 211, "correlation": 63520 } }, { "ph": "s", "id": 63520, "pid": 76337, "tid": -914061504, "ts": 1716454222286242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348229, "dur": 176, "args": { "External id": 63522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63522, "pid": 5, "tid": 7, "ts": 1716454222348229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286256, "dur": 5, "args": { "External id": 63522, "cbid": 211, "correlation": 63522 } }, { "ph": "s", "id": 63522, "pid": 76337, "tid": -914061504, "ts": 1716454222286256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222348408, "dur": 1, "args": { "External id": 63524, "device": 5, "context": 1, "stream": 7, "correlation": 63524, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 63524, "pid": 5, "tid": 7, "ts": 1716454222348408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222286267, "dur": 6, "args": { "External id": 63524, "cbid": 51, "correlation": 63524 } }, { "ph": "s", "id": 63524, "pid": 76337, "tid": -914061504, "ts": 1716454222286267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222348412, "dur": 296, "args": { "External id": 63525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63525, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63525, "pid": 5, "tid": 7, "ts": 1716454222348412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286275, "dur": 6, "args": { "External id": 63525, "cbid": 211, "correlation": 63525 } }, { "ph": "s", "id": 63525, "pid": 76337, "tid": -914061504, "ts": 1716454222286275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348709, "dur": 6, "args": { "External id": 63527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63527, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63527, "pid": 5, "tid": 7, "ts": 1716454222348709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286285, "dur": 5, "args": { "External id": 63527, "cbid": 211, "correlation": 63527 } }, { "ph": "s", "id": 63527, "pid": 76337, "tid": -914061504, "ts": 1716454222286285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222348717, "dur": 7, "args": { "External id": 63533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63533, "pid": 5, "tid": 7, "ts": 1716454222348717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286313, "dur": 8, "args": { "External id": 63533, "cbid": 211, "correlation": 63533 } }, { "ph": "s", "id": 63533, "pid": 76337, "tid": -914061504, "ts": 1716454222286313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222348725, "dur": 3, "args": { "External id": 63541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63541, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 63541, "pid": 5, "tid": 7, "ts": 1716454222348725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286358, "dur": 9, "args": { "External id": 63541, "cbid": 211, "correlation": 63541 } }, { "ph": "s", "id": 63541, "pid": 76337, "tid": -914061504, "ts": 1716454222286358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222286423, "dur": 1, "args": { "External id": 63557, "cbid": 251, "correlation": 63557 } }, { "ph": "f", "id": 63557, "pid": 76337, "tid": -914061504, "ts": 1716454222286423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222286428, "dur": 0, "args": { "External id": 63559, "cbid": 251, "correlation": 63559 } }, { "ph": "f", "id": 63559, "pid": 76337, "tid": -914061504, "ts": 1716454222286428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222348730, "dur": 14, "args": { "External id": 63560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63560, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63560, "pid": 5, "tid": 7, "ts": 1716454222348730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286430, "dur": 11, "args": { "External id": 63560, "cbid": 211, "correlation": 63560 } }, { "ph": "s", "id": 63560, "pid": 76337, "tid": -914061504, "ts": 1716454222286430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222348745, "dur": 5, "args": { "External id": 63562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63562, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63562, "pid": 5, "tid": 7, "ts": 1716454222348745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286442, "dur": 5, "args": { "External id": 63562, "cbid": 211, "correlation": 63562 } }, { "ph": "s", "id": 63562, "pid": 76337, "tid": -914061504, "ts": 1716454222286442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222348752, "dur": 6, "args": { "External id": 63572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63572, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63572, "pid": 5, "tid": 7, "ts": 1716454222348752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286499, "dur": 15, "args": { "External id": 63572, "cbid": 211, "correlation": 63572 } }, { "ph": "s", "id": 63572, "pid": 76337, "tid": -914061504, "ts": 1716454222286499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222348760, "dur": 11, "args": { "External id": 63592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63592, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63592, "pid": 5, "tid": 7, "ts": 1716454222348760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286571, "dur": 11, "args": { "External id": 63592, "cbid": 211, "correlation": 63592 } }, { "ph": "s", "id": 63592, "pid": 76337, "tid": -914061504, "ts": 1716454222286571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222348772, "dur": 4, "args": { "External id": 63604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63604, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63604, "pid": 5, "tid": 7, "ts": 1716454222348772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286592, "dur": 6, "args": { "External id": 63604, "cbid": 211, "correlation": 63604 } }, { "ph": "s", "id": 63604, "pid": 76337, "tid": -914061504, "ts": 1716454222286592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222348777, "dur": 8, "args": { "External id": 63607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63607, "pid": 5, "tid": 7, "ts": 1716454222348777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286609, "dur": 6, "args": { "External id": 63607, "cbid": 211, "correlation": 63607 } }, { "ph": "s", "id": 63607, "pid": 76337, "tid": -914061504, "ts": 1716454222286609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222348786, "dur": 5, "args": { "External id": 63616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63616, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63616, "pid": 5, "tid": 7, "ts": 1716454222348786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286650, "dur": 10, "args": { "External id": 63616, "cbid": 211, "correlation": 63616 } }, { "ph": "s", "id": 63616, "pid": 76337, "tid": -914061504, "ts": 1716454222286650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222286713, "dur": 0, "args": { "External id": 63626, "cbid": 317, "correlation": 63626 } }, { "ph": "f", "id": 63626, "pid": 76337, "tid": -914061504, "ts": 1716454222286713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222286714, "dur": 0, "args": { "External id": 63627, "cbid": 203, "correlation": 63627 } }, { "ph": "f", "id": 63627, "pid": 76337, "tid": -914061504, "ts": 1716454222286714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222286714, "dur": 0, "args": { "External id": 63628, "cbid": 205, "correlation": 63628 } }, { "ph": "f", "id": 63628, "pid": 76337, "tid": -914061504, "ts": 1716454222286714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348793, "dur": 6, "args": { "External id": 63632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63632, "pid": 5, "tid": 7, "ts": 1716454222348793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286728, "dur": 12, "args": { "External id": 63632, "cbid": 211, "correlation": 63632 } }, { "ph": "s", "id": 63632, "pid": 76337, "tid": -914061504, "ts": 1716454222286728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222348800, "dur": 177, "args": { "External id": 63634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63634, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63634, "pid": 5, "tid": 7, "ts": 1716454222348800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286743, "dur": 5, "args": { "External id": 63634, "cbid": 211, "correlation": 63634 } }, { "ph": "s", "id": 63634, "pid": 76337, "tid": -914061504, "ts": 1716454222286743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222348979, "dur": 1, "args": { "External id": 63636, "device": 5, "context": 1, "stream": 7, "correlation": 63636, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 63636, "pid": 5, "tid": 7, "ts": 1716454222348979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222286754, "dur": 6, "args": { "External id": 63636, "cbid": 51, "correlation": 63636 } }, { "ph": "s", "id": 63636, "pid": 76337, "tid": -914061504, "ts": 1716454222286754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222348983, "dur": 285, "args": { "External id": 63637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63637, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63637, "pid": 5, "tid": 7, "ts": 1716454222348983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286761, "dur": 6, "args": { "External id": 63637, "cbid": 211, "correlation": 63637 } }, { "ph": "s", "id": 63637, "pid": 76337, "tid": -914061504, "ts": 1716454222286761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349269, "dur": 6, "args": { "External id": 63639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63639, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63639, "pid": 5, "tid": 7, "ts": 1716454222349269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286770, "dur": 9, "args": { "External id": 63639, "cbid": 211, "correlation": 63639 } }, { "ph": "s", "id": 63639, "pid": 76337, "tid": -914061504, "ts": 1716454222286770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222349277, "dur": 7, "args": { "External id": 63645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63645, "pid": 5, "tid": 7, "ts": 1716454222349277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286802, "dur": 9, "args": { "External id": 63645, "cbid": 211, "correlation": 63645 } }, { "ph": "s", "id": 63645, "pid": 76337, "tid": -914061504, "ts": 1716454222286802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222349285, "dur": 5, "args": { "External id": 63653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63653, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63653, "pid": 5, "tid": 7, "ts": 1716454222349285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286835, "dur": 9, "args": { "External id": 63653, "cbid": 211, "correlation": 63653 } }, { "ph": "s", "id": 63653, "pid": 76337, "tid": -914061504, "ts": 1716454222286835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222349292, "dur": 5, "args": { "External id": 63661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63661, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63661, "pid": 5, "tid": 7, "ts": 1716454222349292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286865, "dur": 8, "args": { "External id": 63661, "cbid": 211, "correlation": 63661 } }, { "ph": "s", "id": 63661, "pid": 76337, "tid": -914061504, "ts": 1716454222286865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222349298, "dur": 10, "args": { "External id": 63681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63681, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63681, "pid": 5, "tid": 7, "ts": 1716454222349298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286938, "dur": 12, "args": { "External id": 63681, "cbid": 211, "correlation": 63681 } }, { "ph": "s", "id": 63681, "pid": 76337, "tid": -914061504, "ts": 1716454222286938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222349310, "dur": 4, "args": { "External id": 63693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63693, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63693, "pid": 5, "tid": 7, "ts": 1716454222349310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286960, "dur": 6, "args": { "External id": 63693, "cbid": 211, "correlation": 63693 } }, { "ph": "s", "id": 63693, "pid": 76337, "tid": -914061504, "ts": 1716454222286960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222349315, "dur": 7, "args": { "External id": 63696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63696, "pid": 5, "tid": 7, "ts": 1716454222349315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222286986, "dur": 9, "args": { "External id": 63696, "cbid": 211, "correlation": 63696 } }, { "ph": "s", "id": 63696, "pid": 76337, "tid": -914061504, "ts": 1716454222286986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222349323, "dur": 5, "args": { "External id": 63705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63705, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63705, "pid": 5, "tid": 7, "ts": 1716454222349323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287027, "dur": 9, "args": { "External id": 63705, "cbid": 211, "correlation": 63705 } }, { "ph": "s", "id": 63705, "pid": 76337, "tid": -914061504, "ts": 1716454222287027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222287082, "dur": 0, "args": { "External id": 63715, "cbid": 317, "correlation": 63715 } }, { "ph": "f", "id": 63715, "pid": 76337, "tid": -914061504, "ts": 1716454222287082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222287083, "dur": 0, "args": { "External id": 63716, "cbid": 203, "correlation": 63716 } }, { "ph": "f", "id": 63716, "pid": 76337, "tid": -914061504, "ts": 1716454222287083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222287084, "dur": 0, "args": { "External id": 63717, "cbid": 205, "correlation": 63717 } }, { "ph": "f", "id": 63717, "pid": 76337, "tid": -914061504, "ts": 1716454222287084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349330, "dur": 5, "args": { "External id": 63721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63721, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63721, "pid": 5, "tid": 7, "ts": 1716454222349330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287097, "dur": 12, "args": { "External id": 63721, "cbid": 211, "correlation": 63721 } }, { "ph": "s", "id": 63721, "pid": 76337, "tid": -914061504, "ts": 1716454222287097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349337, "dur": 177, "args": { "External id": 63723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63723, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63723, "pid": 5, "tid": 7, "ts": 1716454222349337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287112, "dur": 5, "args": { "External id": 63723, "cbid": 211, "correlation": 63723 } }, { "ph": "s", "id": 63723, "pid": 76337, "tid": -914061504, "ts": 1716454222287112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222349515, "dur": 1, "args": { "External id": 63725, "device": 5, "context": 1, "stream": 7, "correlation": 63725, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 63725, "pid": 5, "tid": 7, "ts": 1716454222349515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222287122, "dur": 6, "args": { "External id": 63725, "cbid": 51, "correlation": 63725 } }, { "ph": "s", "id": 63725, "pid": 76337, "tid": -914061504, "ts": 1716454222287122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222349519, "dur": 284, "args": { "External id": 63726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63726, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63726, "pid": 5, "tid": 7, "ts": 1716454222349519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287130, "dur": 6, "args": { "External id": 63726, "cbid": 211, "correlation": 63726 } }, { "ph": "s", "id": 63726, "pid": 76337, "tid": -914061504, "ts": 1716454222287130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349805, "dur": 6, "args": { "External id": 63728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63728, "pid": 5, "tid": 7, "ts": 1716454222349805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287140, "dur": 5, "args": { "External id": 63728, "cbid": 211, "correlation": 63728 } }, { "ph": "s", "id": 63728, "pid": 76337, "tid": -914061504, "ts": 1716454222287140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222349813, "dur": 7, "args": { "External id": 63734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63734, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63734, "pid": 5, "tid": 7, "ts": 1716454222349813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287168, "dur": 9, "args": { "External id": 63734, "cbid": 211, "correlation": 63734 } }, { "ph": "s", "id": 63734, "pid": 76337, "tid": -914061504, "ts": 1716454222287168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222349821, "dur": 3, "args": { "External id": 63742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63742, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 63742, "pid": 5, "tid": 7, "ts": 1716454222349821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287212, "dur": 10, "args": { "External id": 63742, "cbid": 211, "correlation": 63742 } }, { "ph": "s", "id": 63742, "pid": 76337, "tid": -914061504, "ts": 1716454222287212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222287274, "dur": 1, "args": { "External id": 63758, "cbid": 251, "correlation": 63758 } }, { "ph": "f", "id": 63758, "pid": 76337, "tid": -914061504, "ts": 1716454222287274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222287279, "dur": 0, "args": { "External id": 63760, "cbid": 251, "correlation": 63760 } }, { "ph": "f", "id": 63760, "pid": 76337, "tid": -914061504, "ts": 1716454222287279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222349825, "dur": 11, "args": { "External id": 63761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63761, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63761, "pid": 5, "tid": 7, "ts": 1716454222349825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287281, "dur": 11, "args": { "External id": 63761, "cbid": 211, "correlation": 63761 } }, { "ph": "s", "id": 63761, "pid": 76337, "tid": -914061504, "ts": 1716454222287281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222349837, "dur": 4, "args": { "External id": 63763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63763, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63763, "pid": 5, "tid": 7, "ts": 1716454222349837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287294, "dur": 5, "args": { "External id": 63763, "cbid": 211, "correlation": 63763 } }, { "ph": "s", "id": 63763, "pid": 76337, "tid": -914061504, "ts": 1716454222287294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222349843, "dur": 6, "args": { "External id": 63773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63773, "pid": 5, "tid": 7, "ts": 1716454222349843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287354, "dur": 13, "args": { "External id": 63773, "cbid": 211, "correlation": 63773 } }, { "ph": "s", "id": 63773, "pid": 76337, "tid": -914061504, "ts": 1716454222287354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222349850, "dur": 11, "args": { "External id": 63793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63793, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63793, "pid": 5, "tid": 7, "ts": 1716454222349850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287421, "dur": 11, "args": { "External id": 63793, "cbid": 211, "correlation": 63793 } }, { "ph": "s", "id": 63793, "pid": 76337, "tid": -914061504, "ts": 1716454222287421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222349862, "dur": 4, "args": { "External id": 63805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63805, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63805, "pid": 5, "tid": 7, "ts": 1716454222349862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287441, "dur": 6, "args": { "External id": 63805, "cbid": 211, "correlation": 63805 } }, { "ph": "s", "id": 63805, "pid": 76337, "tid": -914061504, "ts": 1716454222287441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222349868, "dur": 7, "args": { "External id": 63808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63808, "pid": 5, "tid": 7, "ts": 1716454222349868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287460, "dur": 6, "args": { "External id": 63808, "cbid": 211, "correlation": 63808 } }, { "ph": "s", "id": 63808, "pid": 76337, "tid": -914061504, "ts": 1716454222287460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222349876, "dur": 5, "args": { "External id": 63817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63817, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63817, "pid": 5, "tid": 7, "ts": 1716454222349876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287499, "dur": 10, "args": { "External id": 63817, "cbid": 211, "correlation": 63817 } }, { "ph": "s", "id": 63817, "pid": 76337, "tid": -914061504, "ts": 1716454222287499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222287562, "dur": 0, "args": { "External id": 63827, "cbid": 317, "correlation": 63827 } }, { "ph": "f", "id": 63827, "pid": 76337, "tid": -914061504, "ts": 1716454222287562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222287563, "dur": 0, "args": { "External id": 63828, "cbid": 203, "correlation": 63828 } }, { "ph": "f", "id": 63828, "pid": 76337, "tid": -914061504, "ts": 1716454222287563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222287563, "dur": 0, "args": { "External id": 63829, "cbid": 205, "correlation": 63829 } }, { "ph": "f", "id": 63829, "pid": 76337, "tid": -914061504, "ts": 1716454222287563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349883, "dur": 6, "args": { "External id": 63833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63833, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63833, "pid": 5, "tid": 7, "ts": 1716454222349883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287577, "dur": 12, "args": { "External id": 63833, "cbid": 211, "correlation": 63833 } }, { "ph": "s", "id": 63833, "pid": 76337, "tid": -914061504, "ts": 1716454222287577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222349890, "dur": 177, "args": { "External id": 63835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63835, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63835, "pid": 5, "tid": 7, "ts": 1716454222349890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287591, "dur": 5, "args": { "External id": 63835, "cbid": 211, "correlation": 63835 } }, { "ph": "s", "id": 63835, "pid": 76337, "tid": -914061504, "ts": 1716454222287591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222350069, "dur": 1, "args": { "External id": 63837, "device": 5, "context": 1, "stream": 7, "correlation": 63837, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 63837, "pid": 5, "tid": 7, "ts": 1716454222350069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222287601, "dur": 6, "args": { "External id": 63837, "cbid": 51, "correlation": 63837 } }, { "ph": "s", "id": 63837, "pid": 76337, "tid": -914061504, "ts": 1716454222287601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222350073, "dur": 285, "args": { "External id": 63838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63838, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63838, "pid": 5, "tid": 7, "ts": 1716454222350073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287609, "dur": 10, "args": { "External id": 63838, "cbid": 211, "correlation": 63838 } }, { "ph": "s", "id": 63838, "pid": 76337, "tid": -914061504, "ts": 1716454222287609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350360, "dur": 6, "args": { "External id": 63840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63840, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63840, "pid": 5, "tid": 7, "ts": 1716454222350360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287622, "dur": 5, "args": { "External id": 63840, "cbid": 211, "correlation": 63840 } }, { "ph": "s", "id": 63840, "pid": 76337, "tid": -914061504, "ts": 1716454222287622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222350368, "dur": 7, "args": { "External id": 63846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63846, "pid": 5, "tid": 7, "ts": 1716454222350368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287651, "dur": 8, "args": { "External id": 63846, "cbid": 211, "correlation": 63846 } }, { "ph": "s", "id": 63846, "pid": 76337, "tid": -914061504, "ts": 1716454222287651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222350375, "dur": 5, "args": { "External id": 63854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63854, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63854, "pid": 5, "tid": 7, "ts": 1716454222350375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287684, "dur": 8, "args": { "External id": 63854, "cbid": 211, "correlation": 63854 } }, { "ph": "s", "id": 63854, "pid": 76337, "tid": -914061504, "ts": 1716454222287684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222350382, "dur": 5, "args": { "External id": 63862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63862, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63862, "pid": 5, "tid": 7, "ts": 1716454222350382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287713, "dur": 9, "args": { "External id": 63862, "cbid": 211, "correlation": 63862 } }, { "ph": "s", "id": 63862, "pid": 76337, "tid": -914061504, "ts": 1716454222287713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222350388, "dur": 11, "args": { "External id": 63882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63882, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63882, "pid": 5, "tid": 7, "ts": 1716454222350388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287817, "dur": 14, "args": { "External id": 63882, "cbid": 211, "correlation": 63882 } }, { "ph": "s", "id": 63882, "pid": 76337, "tid": -914061504, "ts": 1716454222287817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222350401, "dur": 4, "args": { "External id": 63894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63894, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 63894, "pid": 5, "tid": 7, "ts": 1716454222350401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287841, "dur": 6, "args": { "External id": 63894, "cbid": 211, "correlation": 63894 } }, { "ph": "s", "id": 63894, "pid": 76337, "tid": -914061504, "ts": 1716454222287841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222350406, "dur": 7, "args": { "External id": 63897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63897, "pid": 5, "tid": 7, "ts": 1716454222350406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287859, "dur": 7, "args": { "External id": 63897, "cbid": 211, "correlation": 63897 } }, { "ph": "s", "id": 63897, "pid": 76337, "tid": -914061504, "ts": 1716454222287859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222350414, "dur": 5, "args": { "External id": 63906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63906, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63906, "pid": 5, "tid": 7, "ts": 1716454222350414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287897, "dur": 13, "args": { "External id": 63906, "cbid": 211, "correlation": 63906 } }, { "ph": "s", "id": 63906, "pid": 76337, "tid": -914061504, "ts": 1716454222287897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222287953, "dur": 0, "args": { "External id": 63916, "cbid": 317, "correlation": 63916 } }, { "ph": "f", "id": 63916, "pid": 76337, "tid": -914061504, "ts": 1716454222287953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222287954, "dur": 0, "args": { "External id": 63917, "cbid": 203, "correlation": 63917 } }, { "ph": "f", "id": 63917, "pid": 76337, "tid": -914061504, "ts": 1716454222287954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222287955, "dur": 0, "args": { "External id": 63918, "cbid": 205, "correlation": 63918 } }, { "ph": "f", "id": 63918, "pid": 76337, "tid": -914061504, "ts": 1716454222287955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350420, "dur": 6, "args": { "External id": 63922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63922, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63922, "pid": 5, "tid": 7, "ts": 1716454222350420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222287968, "dur": 118, "args": { "External id": 63922, "cbid": 211, "correlation": 63922 } }, { "ph": "s", "id": 63922, "pid": 76337, "tid": -914061504, "ts": 1716454222287968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350427, "dur": 177, "args": { "External id": 63924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63924, "pid": 5, "tid": 7, "ts": 1716454222350427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288090, "dur": 6, "args": { "External id": 63924, "cbid": 211, "correlation": 63924 } }, { "ph": "s", "id": 63924, "pid": 76337, "tid": -914061504, "ts": 1716454222288090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222350607, "dur": 1, "args": { "External id": 63926, "device": 5, "context": 1, "stream": 7, "correlation": 63926, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 63926, "pid": 5, "tid": 7, "ts": 1716454222350607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222288102, "dur": 6, "args": { "External id": 63926, "cbid": 51, "correlation": 63926 } }, { "ph": "s", "id": 63926, "pid": 76337, "tid": -914061504, "ts": 1716454222288102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222350610, "dur": 284, "args": { "External id": 63927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63927, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63927, "pid": 5, "tid": 7, "ts": 1716454222350610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288109, "dur": 6, "args": { "External id": 63927, "cbid": 211, "correlation": 63927 } }, { "ph": "s", "id": 63927, "pid": 76337, "tid": -914061504, "ts": 1716454222288109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350895, "dur": 6, "args": { "External id": 63929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63929, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 63929, "pid": 5, "tid": 7, "ts": 1716454222350895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288119, "dur": 5, "args": { "External id": 63929, "cbid": 211, "correlation": 63929 } }, { "ph": "s", "id": 63929, "pid": 76337, "tid": -914061504, "ts": 1716454222288119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222350903, "dur": 7, "args": { "External id": 63935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63935, "pid": 5, "tid": 7, "ts": 1716454222350903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288151, "dur": 8, "args": { "External id": 63935, "cbid": 211, "correlation": 63935 } }, { "ph": "s", "id": 63935, "pid": 76337, "tid": -914061504, "ts": 1716454222288151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222350911, "dur": 3, "args": { "External id": 63943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63943, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 63943, "pid": 5, "tid": 7, "ts": 1716454222350911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288198, "dur": 9, "args": { "External id": 63943, "cbid": 211, "correlation": 63943 } }, { "ph": "s", "id": 63943, "pid": 76337, "tid": -914061504, "ts": 1716454222288198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222288260, "dur": 1, "args": { "External id": 63959, "cbid": 251, "correlation": 63959 } }, { "ph": "f", "id": 63959, "pid": 76337, "tid": -914061504, "ts": 1716454222288260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222288266, "dur": 0, "args": { "External id": 63961, "cbid": 251, "correlation": 63961 } }, { "ph": "f", "id": 63961, "pid": 76337, "tid": -914061504, "ts": 1716454222288266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222350916, "dur": 12, "args": { "External id": 63962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63962, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63962, "pid": 5, "tid": 7, "ts": 1716454222350916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288268, "dur": 11, "args": { "External id": 63962, "cbid": 211, "correlation": 63962 } }, { "ph": "s", "id": 63962, "pid": 76337, "tid": -914061504, "ts": 1716454222288268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222350929, "dur": 4, "args": { "External id": 63964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63964, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 63964, "pid": 5, "tid": 7, "ts": 1716454222350929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288281, "dur": 6, "args": { "External id": 63964, "cbid": 211, "correlation": 63964 } }, { "ph": "s", "id": 63964, "pid": 76337, "tid": -914061504, "ts": 1716454222288281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222350934, "dur": 6, "args": { "External id": 63974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63974, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 63974, "pid": 5, "tid": 7, "ts": 1716454222350934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288344, "dur": 12, "args": { "External id": 63974, "cbid": 211, "correlation": 63974 } }, { "ph": "s", "id": 63974, "pid": 76337, "tid": -914061504, "ts": 1716454222288344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222350942, "dur": 10, "args": { "External id": 63994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 63994, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 63994, "pid": 5, "tid": 7, "ts": 1716454222350942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288411, "dur": 11, "args": { "External id": 63994, "cbid": 211, "correlation": 63994 } }, { "ph": "s", "id": 63994, "pid": 76337, "tid": -914061504, "ts": 1716454222288411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222350953, "dur": 4, "args": { "External id": 64006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64006, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 64006, "pid": 5, "tid": 7, "ts": 1716454222350953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288432, "dur": 6, "args": { "External id": 64006, "cbid": 211, "correlation": 64006 } }, { "ph": "s", "id": 64006, "pid": 76337, "tid": -914061504, "ts": 1716454222288432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222350959, "dur": 7, "args": { "External id": 64009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64009, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64009, "pid": 5, "tid": 7, "ts": 1716454222350959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288451, "dur": 6, "args": { "External id": 64009, "cbid": 211, "correlation": 64009 } }, { "ph": "s", "id": 64009, "pid": 76337, "tid": -914061504, "ts": 1716454222288451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222350967, "dur": 5, "args": { "External id": 64018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64018, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64018, "pid": 5, "tid": 7, "ts": 1716454222350967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288491, "dur": 10, "args": { "External id": 64018, "cbid": 211, "correlation": 64018 } }, { "ph": "s", "id": 64018, "pid": 76337, "tid": -914061504, "ts": 1716454222288491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222288555, "dur": 0, "args": { "External id": 64028, "cbid": 317, "correlation": 64028 } }, { "ph": "f", "id": 64028, "pid": 76337, "tid": -914061504, "ts": 1716454222288555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222288556, "dur": 0, "args": { "External id": 64029, "cbid": 203, "correlation": 64029 } }, { "ph": "f", "id": 64029, "pid": 76337, "tid": -914061504, "ts": 1716454222288556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222288557, "dur": 0, "args": { "External id": 64030, "cbid": 205, "correlation": 64030 } }, { "ph": "f", "id": 64030, "pid": 76337, "tid": -914061504, "ts": 1716454222288557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350974, "dur": 6, "args": { "External id": 64034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64034, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64034, "pid": 5, "tid": 7, "ts": 1716454222350974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288570, "dur": 12, "args": { "External id": 64034, "cbid": 211, "correlation": 64034 } }, { "ph": "s", "id": 64034, "pid": 76337, "tid": -914061504, "ts": 1716454222288570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222350981, "dur": 176, "args": { "External id": 64036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64036, "pid": 5, "tid": 7, "ts": 1716454222350981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288585, "dur": 5, "args": { "External id": 64036, "cbid": 211, "correlation": 64036 } }, { "ph": "s", "id": 64036, "pid": 76337, "tid": -914061504, "ts": 1716454222288585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222351159, "dur": 1, "args": { "External id": 64038, "device": 5, "context": 1, "stream": 7, "correlation": 64038, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 64038, "pid": 5, "tid": 7, "ts": 1716454222351159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222288596, "dur": 10, "args": { "External id": 64038, "cbid": 51, "correlation": 64038 } }, { "ph": "s", "id": 64038, "pid": 76337, "tid": -914061504, "ts": 1716454222288596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222351163, "dur": 284, "args": { "External id": 64039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64039, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64039, "pid": 5, "tid": 7, "ts": 1716454222351163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288607, "dur": 6, "args": { "External id": 64039, "cbid": 211, "correlation": 64039 } }, { "ph": "s", "id": 64039, "pid": 76337, "tid": -914061504, "ts": 1716454222288607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222351449, "dur": 7, "args": { "External id": 64041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64041, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64041, "pid": 5, "tid": 7, "ts": 1716454222351449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288618, "dur": 5, "args": { "External id": 64041, "cbid": 211, "correlation": 64041 } }, { "ph": "s", "id": 64041, "pid": 76337, "tid": -914061504, "ts": 1716454222288618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222351457, "dur": 7, "args": { "External id": 64047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64047, "pid": 5, "tid": 7, "ts": 1716454222351457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288646, "dur": 9, "args": { "External id": 64047, "cbid": 211, "correlation": 64047 } }, { "ph": "s", "id": 64047, "pid": 76337, "tid": -914061504, "ts": 1716454222288646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222351465, "dur": 5, "args": { "External id": 64055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64055, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64055, "pid": 5, "tid": 7, "ts": 1716454222351465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288679, "dur": 8, "args": { "External id": 64055, "cbid": 211, "correlation": 64055 } }, { "ph": "s", "id": 64055, "pid": 76337, "tid": -914061504, "ts": 1716454222288679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222351472, "dur": 5, "args": { "External id": 64063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64063, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64063, "pid": 5, "tid": 7, "ts": 1716454222351472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288708, "dur": 8, "args": { "External id": 64063, "cbid": 211, "correlation": 64063 } }, { "ph": "s", "id": 64063, "pid": 76337, "tid": -914061504, "ts": 1716454222288708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222351478, "dur": 11, "args": { "External id": 64083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64083, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64083, "pid": 5, "tid": 7, "ts": 1716454222351478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288852, "dur": 14, "args": { "External id": 64083, "cbid": 211, "correlation": 64083 } }, { "ph": "s", "id": 64083, "pid": 76337, "tid": -914061504, "ts": 1716454222288852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222351490, "dur": 4, "args": { "External id": 64095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64095, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 64095, "pid": 5, "tid": 7, "ts": 1716454222351490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288876, "dur": 6, "args": { "External id": 64095, "cbid": 211, "correlation": 64095 } }, { "ph": "s", "id": 64095, "pid": 76337, "tid": -914061504, "ts": 1716454222288876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222351496, "dur": 7, "args": { "External id": 64098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64098, "pid": 5, "tid": 7, "ts": 1716454222351496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288894, "dur": 7, "args": { "External id": 64098, "cbid": 211, "correlation": 64098 } }, { "ph": "s", "id": 64098, "pid": 76337, "tid": -914061504, "ts": 1716454222288894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222288956, "dur": 0, "args": { "External id": 64109, "cbid": 317, "correlation": 64109 } }, { "ph": "f", "id": 64109, "pid": 76337, "tid": -914061504, "ts": 1716454222288956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222288957, "dur": 0, "args": { "External id": 64110, "cbid": 203, "correlation": 64110 } }, { "ph": "f", "id": 64110, "pid": 76337, "tid": -914061504, "ts": 1716454222288957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222288958, "dur": 0, "args": { "External id": 64111, "cbid": 205, "correlation": 64111 } }, { "ph": "f", "id": 64111, "pid": 76337, "tid": -914061504, "ts": 1716454222288958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222351504, "dur": 5, "args": { "External id": 64115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64115, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64115, "pid": 5, "tid": 7, "ts": 1716454222351504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222288989, "dur": 13, "args": { "External id": 64115, "cbid": 211, "correlation": 64115 } }, { "ph": "s", "id": 64115, "pid": 76337, "tid": -914061504, "ts": 1716454222288989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222351511, "dur": 41, "args": { "External id": 64117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64117, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 64117, "pid": 5, "tid": 7, "ts": 1716454222351511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289030, "dur": 11, "args": { "External id": 64117, "cbid": 211, "correlation": 64117 } }, { "ph": "s", "id": 64117, "pid": 76337, "tid": -914061504, "ts": 1716454222289030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222351554, "dur": 6, "args": { "External id": 64119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64119, "pid": 5, "tid": 7, "ts": 1716454222351554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289046, "dur": 5, "args": { "External id": 64119, "cbid": 211, "correlation": 64119 } }, { "ph": "s", "id": 64119, "pid": 76337, "tid": -914061504, "ts": 1716454222289046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222351561, "dur": 7, "args": { "External id": 64125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64125, "pid": 5, "tid": 7, "ts": 1716454222351561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289075, "dur": 8, "args": { "External id": 64125, "cbid": 211, "correlation": 64125 } }, { "ph": "s", "id": 64125, "pid": 76337, "tid": -914061504, "ts": 1716454222289075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222351569, "dur": 22, "args": { "External id": 64134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64134, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64134, "pid": 5, "tid": 7, "ts": 1716454222351569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289163, "dur": 15, "args": { "External id": 64134, "cbid": 211, "correlation": 64134 } }, { "ph": "s", "id": 64134, "pid": 76337, "tid": -914061504, "ts": 1716454222289163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222351592, "dur": 12, "args": { "External id": 64156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64156, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 64156, "pid": 5, "tid": 7, "ts": 1716454222351592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289221, "dur": 10, "args": { "External id": 64156, "cbid": 211, "correlation": 64156 } }, { "ph": "s", "id": 64156, "pid": 76337, "tid": -914061504, "ts": 1716454222289221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289313, "dur": 2, "args": { "External id": 64167, "cbid": 251, "correlation": 64167 } }, { "ph": "f", "id": 64167, "pid": 76337, "tid": -914061504, "ts": 1716454222289313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289318, "dur": 0, "args": { "External id": 64168, "cbid": 251, "correlation": 64168 } }, { "ph": "f", "id": 64168, "pid": 76337, "tid": -914061504, "ts": 1716454222289318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222351605, "dur": 60, "args": { "External id": 64169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64169, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 64169, "pid": 5, "tid": 7, "ts": 1716454222351605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289320, "dur": 14, "args": { "External id": 64169, "cbid": 211, "correlation": 64169 } }, { "ph": "s", "id": 64169, "pid": 76337, "tid": -914061504, "ts": 1716454222289320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289395, "dur": 1, "args": { "External id": 64180, "cbid": 251, "correlation": 64180 } }, { "ph": "f", "id": 64180, "pid": 76337, "tid": -914061504, "ts": 1716454222289395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289399, "dur": 0, "args": { "External id": 64181, "cbid": 251, "correlation": 64181 } }, { "ph": "f", "id": 64181, "pid": 76337, "tid": -914061504, "ts": 1716454222289399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222351666, "dur": 57, "args": { "External id": 64182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64182, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 64182, "pid": 5, "tid": 7, "ts": 1716454222351666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289400, "dur": 12, "args": { "External id": 64182, "cbid": 211, "correlation": 64182 } }, { "ph": "s", "id": 64182, "pid": 76337, "tid": -914061504, "ts": 1716454222289400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289466, "dur": 1, "args": { "External id": 64193, "cbid": 251, "correlation": 64193 } }, { "ph": "f", "id": 64193, "pid": 76337, "tid": -914061504, "ts": 1716454222289466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289470, "dur": 0, "args": { "External id": 64194, "cbid": 251, "correlation": 64194 } }, { "ph": "f", "id": 64194, "pid": 76337, "tid": -914061504, "ts": 1716454222289470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222351725, "dur": 58, "args": { "External id": 64195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64195, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 64195, "pid": 5, "tid": 7, "ts": 1716454222351725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289472, "dur": 11, "args": { "External id": 64195, "cbid": 211, "correlation": 64195 } }, { "ph": "s", "id": 64195, "pid": 76337, "tid": -914061504, "ts": 1716454222289472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222351785, "dur": 61, "args": { "External id": 64220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64220, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64220, "pid": 5, "tid": 7, "ts": 1716454222351785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289554, "dur": 13, "args": { "External id": 64220, "cbid": 211, "correlation": 64220 } }, { "ph": "s", "id": 64220, "pid": 76337, "tid": -914061504, "ts": 1716454222289554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289655, "dur": 1, "args": { "External id": 64238, "cbid": 251, "correlation": 64238 } }, { "ph": "f", "id": 64238, "pid": 76337, "tid": -914061504, "ts": 1716454222289655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222351847, "dur": 69, "args": { "External id": 64240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64240, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 64240, "pid": 5, "tid": 7, "ts": 1716454222351847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289661, "dur": 14, "args": { "External id": 64240, "cbid": 211, "correlation": 64240 } }, { "ph": "s", "id": 64240, "pid": 76337, "tid": -914061504, "ts": 1716454222289661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222351918, "dur": 7, "args": { "External id": 64248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64248, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64248, "pid": 5, "tid": 7, "ts": 1716454222351918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289731, "dur": 12, "args": { "External id": 64248, "cbid": 211, "correlation": 64248 } }, { "ph": "s", "id": 64248, "pid": 76337, "tid": -914061504, "ts": 1716454222289731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222351926, "dur": 8, "args": { "External id": 64256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64256, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64256, "pid": 5, "tid": 7, "ts": 1716454222351926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289769, "dur": 12, "args": { "External id": 64256, "cbid": 211, "correlation": 64256 } }, { "ph": "s", "id": 64256, "pid": 76337, "tid": -914061504, "ts": 1716454222289769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222351935, "dur": 8, "args": { "External id": 64267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64267, "pid": 5, "tid": 7, "ts": 1716454222351935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289846, "dur": 13, "args": { "External id": 64267, "cbid": 211, "correlation": 64267 } }, { "ph": "s", "id": 64267, "pid": 76337, "tid": -914061504, "ts": 1716454222289846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222351945, "dur": 9, "args": { "External id": 64289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64289, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 64289, "pid": 5, "tid": 7, "ts": 1716454222351945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289879, "dur": 8, "args": { "External id": 64289, "cbid": 211, "correlation": 64289 } }, { "ph": "s", "id": 64289, "pid": 76337, "tid": -914061504, "ts": 1716454222289879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222289966, "dur": 2, "args": { "External id": 64300, "cbid": 251, "correlation": 64300 } }, { "ph": "f", "id": 64300, "pid": 76337, "tid": -914061504, "ts": 1716454222289966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222351956, "dur": 1, "args": { "External id": 64301, "device": 5, "context": 1, "stream": 7, "correlation": 64301, "bytes": 480, "memory bandwidth (GB/s)": 0.29411764705882354 } }, { "ph": "f", "id": 64301, "pid": 5, "tid": 7, "ts": 1716454222351956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222289972, "dur": 18, "args": { "External id": 64301, "cbid": 51, "correlation": 64301 } }, { "ph": "s", "id": 64301, "pid": 76337, "tid": -914061504, "ts": 1716454222289972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222351960, "dur": 39, "args": { "External id": 64302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64302, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 64302, "pid": 5, "tid": 7, "ts": 1716454222351960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222289992, "dur": 16, "args": { "External id": 64302, "cbid": 211, "correlation": 64302 } }, { "ph": "s", "id": 64302, "pid": 76337, "tid": -914061504, "ts": 1716454222289992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290069, "dur": 1, "args": { "External id": 64313, "cbid": 251, "correlation": 64313 } }, { "ph": "f", "id": 64313, "pid": 76337, "tid": -914061504, "ts": 1716454222290069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290073, "dur": 0, "args": { "External id": 64314, "cbid": 251, "correlation": 64314 } }, { "ph": "f", "id": 64314, "pid": 76337, "tid": -914061504, "ts": 1716454222290073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222352001, "dur": 12, "args": { "External id": 64315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64315, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64315, "pid": 5, "tid": 7, "ts": 1716454222352001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290075, "dur": 13, "args": { "External id": 64315, "cbid": 211, "correlation": 64315 } }, { "ph": "s", "id": 64315, "pid": 76337, "tid": -914061504, "ts": 1716454222290075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222352015, "dur": 6, "args": { "External id": 64317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64317, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64317, "pid": 5, "tid": 7, "ts": 1716454222352015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290090, "dur": 6, "args": { "External id": 64317, "cbid": 211, "correlation": 64317 } }, { "ph": "s", "id": 64317, "pid": 76337, "tid": -914061504, "ts": 1716454222290090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290146, "dur": 1, "args": { "External id": 64328, "cbid": 251, "correlation": 64328 } }, { "ph": "f", "id": 64328, "pid": 76337, "tid": -914061504, "ts": 1716454222290146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290150, "dur": 0, "args": { "External id": 64329, "cbid": 251, "correlation": 64329 } }, { "ph": "f", "id": 64329, "pid": 76337, "tid": -914061504, "ts": 1716454222290150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222352022, "dur": 9, "args": { "External id": 64330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64330, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64330, "pid": 5, "tid": 7, "ts": 1716454222352022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290151, "dur": 14, "args": { "External id": 64330, "cbid": 211, "correlation": 64330 } }, { "ph": "s", "id": 64330, "pid": 76337, "tid": -914061504, "ts": 1716454222290151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222352032, "dur": 4, "args": { "External id": 64332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64332, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64332, "pid": 5, "tid": 7, "ts": 1716454222352032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290167, "dur": 5, "args": { "External id": 64332, "cbid": 211, "correlation": 64332 } }, { "ph": "s", "id": 64332, "pid": 76337, "tid": -914061504, "ts": 1716454222290167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222352037, "dur": 22, "args": { "External id": 64357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64357, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 64357, "pid": 5, "tid": 7, "ts": 1716454222352037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290246, "dur": 13, "args": { "External id": 64357, "cbid": 211, "correlation": 64357 } }, { "ph": "s", "id": 64357, "pid": 76337, "tid": -914061504, "ts": 1716454222290246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290351, "dur": 2, "args": { "External id": 64375, "cbid": 251, "correlation": 64375 } }, { "ph": "f", "id": 64375, "pid": 76337, "tid": -914061504, "ts": 1716454222290351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222352061, "dur": 1, "args": { "External id": 64377, "device": 5, "context": 1, "stream": 7, "correlation": 64377, "bytes": 480, "memory bandwidth (GB/s)": 0.29411764705882354 } }, { "ph": "f", "id": 64377, "pid": 5, "tid": 7, "ts": 1716454222352061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222290357, "dur": 10, "args": { "External id": 64377, "cbid": 51, "correlation": 64377 } }, { "ph": "s", "id": 64377, "pid": 76337, "tid": -914061504, "ts": 1716454222290357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222352065, "dur": 40, "args": { "External id": 64378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64378, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 64378, "pid": 5, "tid": 7, "ts": 1716454222352065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290369, "dur": 13, "args": { "External id": 64378, "cbid": 211, "correlation": 64378 } }, { "ph": "s", "id": 64378, "pid": 76337, "tid": -914061504, "ts": 1716454222290369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222352107, "dur": 5, "args": { "External id": 64386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64386, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64386, "pid": 5, "tid": 7, "ts": 1716454222352107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290440, "dur": 12, "args": { "External id": 64386, "cbid": 211, "correlation": 64386 } }, { "ph": "s", "id": 64386, "pid": 76337, "tid": -914061504, "ts": 1716454222290440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352113, "dur": 9, "args": { "External id": 64394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64394, "pid": 5, "tid": 7, "ts": 1716454222352113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290482, "dur": 9, "args": { "External id": 64394, "cbid": 211, "correlation": 64394 } }, { "ph": "s", "id": 64394, "pid": 76337, "tid": -914061504, "ts": 1716454222290482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222352124, "dur": 9, "args": { "External id": 64416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64416, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 64416, "pid": 5, "tid": 7, "ts": 1716454222352124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290536, "dur": 11, "args": { "External id": 64416, "cbid": 211, "correlation": 64416 } }, { "ph": "s", "id": 64416, "pid": 76337, "tid": -914061504, "ts": 1716454222290536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290629, "dur": 1, "args": { "External id": 64432, "cbid": 251, "correlation": 64432 } }, { "ph": "f", "id": 64432, "pid": 76337, "tid": -914061504, "ts": 1716454222290629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290634, "dur": 0, "args": { "External id": 64434, "cbid": 251, "correlation": 64434 } }, { "ph": "f", "id": 64434, "pid": 76337, "tid": -914061504, "ts": 1716454222290634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222352134, "dur": 199, "args": { "External id": 64435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64435, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64435, "pid": 5, "tid": 7, "ts": 1716454222352134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290636, "dur": 13, "args": { "External id": 64435, "cbid": 211, "correlation": 64435 } }, { "ph": "s", "id": 64435, "pid": 76337, "tid": -914061504, "ts": 1716454222290636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352335, "dur": 23, "args": { "External id": 64443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64443, "pid": 5, "tid": 7, "ts": 1716454222352335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290704, "dur": 13, "args": { "External id": 64443, "cbid": 211, "correlation": 64443 } }, { "ph": "s", "id": 64443, "pid": 76337, "tid": -914061504, "ts": 1716454222290704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352359, "dur": 23, "args": { "External id": 64451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64451, "pid": 5, "tid": 7, "ts": 1716454222352359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290736, "dur": 8, "args": { "External id": 64451, "cbid": 211, "correlation": 64451 } }, { "ph": "s", "id": 64451, "pid": 76337, "tid": -914061504, "ts": 1716454222290736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222290818, "dur": 1, "args": { "External id": 64467, "cbid": 251, "correlation": 64467 } }, { "ph": "f", "id": 64467, "pid": 76337, "tid": -914061504, "ts": 1716454222290818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222352384, "dur": 1, "args": { "External id": 64469, "device": 5, "context": 1, "stream": 7, "correlation": 64469, "bytes": 120, "memory bandwidth (GB/s)": 0.075 } }, { "ph": "f", "id": 64469, "pid": 5, "tid": 7, "ts": 1716454222352384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222290824, "dur": 9, "args": { "External id": 64469, "cbid": 51, "correlation": 64469 } }, { "ph": "s", "id": 64469, "pid": 76337, "tid": -914061504, "ts": 1716454222290824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222352388, "dur": 119, "args": { "External id": 64470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64470, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 64470, "pid": 5, "tid": 7, "ts": 1716454222352388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290834, "dur": 16, "args": { "External id": 64470, "cbid": 211, "correlation": 64470 } }, { "ph": "s", "id": 64470, "pid": 76337, "tid": -914061504, "ts": 1716454222290834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222352509, "dur": 6, "args": { "External id": 64478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64478, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64478, "pid": 5, "tid": 7, "ts": 1716454222352509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290881, "dur": 13, "args": { "External id": 64478, "cbid": 211, "correlation": 64478 } }, { "ph": "s", "id": 64478, "pid": 76337, "tid": -914061504, "ts": 1716454222290881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352516, "dur": 11, "args": { "External id": 64489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64489, "pid": 5, "tid": 7, "ts": 1716454222352516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222290954, "dur": 13, "args": { "External id": 64489, "cbid": 211, "correlation": 64489 } }, { "ph": "s", "id": 64489, "pid": 76337, "tid": -914061504, "ts": 1716454222290954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222291028, "dur": 0, "args": { "External id": 64501, "cbid": 317, "correlation": 64501 } }, { "ph": "f", "id": 64501, "pid": 76337, "tid": -914061504, "ts": 1716454222291028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222291029, "dur": 0, "args": { "External id": 64502, "cbid": 203, "correlation": 64502 } }, { "ph": "f", "id": 64502, "pid": 76337, "tid": -914061504, "ts": 1716454222291029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222291030, "dur": 0, "args": { "External id": 64503, "cbid": 205, "correlation": 64503 } }, { "ph": "f", "id": 64503, "pid": 76337, "tid": -914061504, "ts": 1716454222291030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222352528, "dur": 6, "args": { "External id": 64507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64507, "pid": 5, "tid": 7, "ts": 1716454222352528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291045, "dur": 13, "args": { "External id": 64507, "cbid": 211, "correlation": 64507 } }, { "ph": "s", "id": 64507, "pid": 76337, "tid": -914061504, "ts": 1716454222291045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222352535, "dur": 41, "args": { "External id": 64509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64509, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 64509, "pid": 5, "tid": 7, "ts": 1716454222352535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291065, "dur": 7, "args": { "External id": 64509, "cbid": 211, "correlation": 64509 } }, { "ph": "s", "id": 64509, "pid": 76337, "tid": -914061504, "ts": 1716454222291065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222352577, "dur": 6, "args": { "External id": 64511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64511, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64511, "pid": 5, "tid": 7, "ts": 1716454222352577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291075, "dur": 5, "args": { "External id": 64511, "cbid": 211, "correlation": 64511 } }, { "ph": "s", "id": 64511, "pid": 76337, "tid": -914061504, "ts": 1716454222291075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352585, "dur": 8, "args": { "External id": 64517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64517, "pid": 5, "tid": 7, "ts": 1716454222352585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291103, "dur": 9, "args": { "External id": 64517, "cbid": 211, "correlation": 64517 } }, { "ph": "s", "id": 64517, "pid": 76337, "tid": -914061504, "ts": 1716454222291103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222352594, "dur": 5, "args": { "External id": 64525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64525, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64525, "pid": 5, "tid": 7, "ts": 1716454222352594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291135, "dur": 8, "args": { "External id": 64525, "cbid": 211, "correlation": 64525 } }, { "ph": "s", "id": 64525, "pid": 76337, "tid": -914061504, "ts": 1716454222291135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222352601, "dur": 11, "args": { "External id": 64545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64545, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64545, "pid": 5, "tid": 7, "ts": 1716454222352601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291207, "dur": 12, "args": { "External id": 64545, "cbid": 211, "correlation": 64545 } }, { "ph": "s", "id": 64545, "pid": 76337, "tid": -914061504, "ts": 1716454222291207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222352614, "dur": 5, "args": { "External id": 64557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64557, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 64557, "pid": 5, "tid": 7, "ts": 1716454222352614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291233, "dur": 7, "args": { "External id": 64557, "cbid": 211, "correlation": 64557 } }, { "ph": "s", "id": 64557, "pid": 76337, "tid": -914061504, "ts": 1716454222291233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222352620, "dur": 9, "args": { "External id": 64560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64560, "pid": 5, "tid": 7, "ts": 1716454222352620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291251, "dur": 6, "args": { "External id": 64560, "cbid": 211, "correlation": 64560 } }, { "ph": "s", "id": 64560, "pid": 76337, "tid": -914061504, "ts": 1716454222291251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222352630, "dur": 6, "args": { "External id": 64569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64569, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64569, "pid": 5, "tid": 7, "ts": 1716454222352630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291291, "dur": 10, "args": { "External id": 64569, "cbid": 211, "correlation": 64569 } }, { "ph": "s", "id": 64569, "pid": 76337, "tid": -914061504, "ts": 1716454222291291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222291343, "dur": 0, "args": { "External id": 64579, "cbid": 317, "correlation": 64579 } }, { "ph": "f", "id": 64579, "pid": 76337, "tid": -914061504, "ts": 1716454222291343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222291344, "dur": 0, "args": { "External id": 64580, "cbid": 203, "correlation": 64580 } }, { "ph": "f", "id": 64580, "pid": 76337, "tid": -914061504, "ts": 1716454222291344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222291344, "dur": 0, "args": { "External id": 64581, "cbid": 205, "correlation": 64581 } }, { "ph": "f", "id": 64581, "pid": 76337, "tid": -914061504, "ts": 1716454222291344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222352637, "dur": 6, "args": { "External id": 64585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64585, "pid": 5, "tid": 7, "ts": 1716454222352637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291358, "dur": 11, "args": { "External id": 64585, "cbid": 211, "correlation": 64585 } }, { "ph": "s", "id": 64585, "pid": 76337, "tid": -914061504, "ts": 1716454222291358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222352644, "dur": 177, "args": { "External id": 64587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64587, "pid": 5, "tid": 7, "ts": 1716454222352644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291372, "dur": 5, "args": { "External id": 64587, "cbid": 211, "correlation": 64587 } }, { "ph": "s", "id": 64587, "pid": 76337, "tid": -914061504, "ts": 1716454222291372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222352823, "dur": 1, "args": { "External id": 64589, "device": 5, "context": 1, "stream": 7, "correlation": 64589, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 64589, "pid": 5, "tid": 7, "ts": 1716454222352823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222291382, "dur": 8, "args": { "External id": 64589, "cbid": 51, "correlation": 64589 } }, { "ph": "s", "id": 64589, "pid": 76337, "tid": -914061504, "ts": 1716454222291382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222352827, "dur": 295, "args": { "External id": 64590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64590, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64590, "pid": 5, "tid": 7, "ts": 1716454222352827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291391, "dur": 6, "args": { "External id": 64590, "cbid": 211, "correlation": 64590 } }, { "ph": "s", "id": 64590, "pid": 76337, "tid": -914061504, "ts": 1716454222291391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353124, "dur": 6, "args": { "External id": 64592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64592, "pid": 5, "tid": 7, "ts": 1716454222353124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291401, "dur": 5, "args": { "External id": 64592, "cbid": 211, "correlation": 64592 } }, { "ph": "s", "id": 64592, "pid": 76337, "tid": -914061504, "ts": 1716454222291401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222353131, "dur": 7, "args": { "External id": 64598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64598, "pid": 5, "tid": 7, "ts": 1716454222353131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291429, "dur": 9, "args": { "External id": 64598, "cbid": 211, "correlation": 64598 } }, { "ph": "s", "id": 64598, "pid": 76337, "tid": -914061504, "ts": 1716454222291429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222353139, "dur": 3, "args": { "External id": 64606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64606, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 64606, "pid": 5, "tid": 7, "ts": 1716454222353139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291473, "dur": 9, "args": { "External id": 64606, "cbid": 211, "correlation": 64606 } }, { "ph": "s", "id": 64606, "pid": 76337, "tid": -914061504, "ts": 1716454222291473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222291543, "dur": 1, "args": { "External id": 64622, "cbid": 251, "correlation": 64622 } }, { "ph": "f", "id": 64622, "pid": 76337, "tid": -914061504, "ts": 1716454222291543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222291548, "dur": 0, "args": { "External id": 64624, "cbid": 251, "correlation": 64624 } }, { "ph": "f", "id": 64624, "pid": 76337, "tid": -914061504, "ts": 1716454222291548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222353144, "dur": 14, "args": { "External id": 64625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64625, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64625, "pid": 5, "tid": 7, "ts": 1716454222353144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291550, "dur": 11, "args": { "External id": 64625, "cbid": 211, "correlation": 64625 } }, { "ph": "s", "id": 64625, "pid": 76337, "tid": -914061504, "ts": 1716454222291550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222353159, "dur": 5, "args": { "External id": 64627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64627, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64627, "pid": 5, "tid": 7, "ts": 1716454222353159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291563, "dur": 5, "args": { "External id": 64627, "cbid": 211, "correlation": 64627 } }, { "ph": "s", "id": 64627, "pid": 76337, "tid": -914061504, "ts": 1716454222291563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222353166, "dur": 6, "args": { "External id": 64637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64637, "pid": 5, "tid": 7, "ts": 1716454222353166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291622, "dur": 12, "args": { "External id": 64637, "cbid": 211, "correlation": 64637 } }, { "ph": "s", "id": 64637, "pid": 76337, "tid": -914061504, "ts": 1716454222291622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222353173, "dur": 11, "args": { "External id": 64657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64657, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64657, "pid": 5, "tid": 7, "ts": 1716454222353173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291687, "dur": 11, "args": { "External id": 64657, "cbid": 211, "correlation": 64657 } }, { "ph": "s", "id": 64657, "pid": 76337, "tid": -914061504, "ts": 1716454222291687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222353186, "dur": 4, "args": { "External id": 64669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64669, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 64669, "pid": 5, "tid": 7, "ts": 1716454222353186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291709, "dur": 6, "args": { "External id": 64669, "cbid": 211, "correlation": 64669 } }, { "ph": "s", "id": 64669, "pid": 76337, "tid": -914061504, "ts": 1716454222291709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222353191, "dur": 8, "args": { "External id": 64672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64672, "pid": 5, "tid": 7, "ts": 1716454222353191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291728, "dur": 7, "args": { "External id": 64672, "cbid": 211, "correlation": 64672 } }, { "ph": "s", "id": 64672, "pid": 76337, "tid": -914061504, "ts": 1716454222291728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222353200, "dur": 5, "args": { "External id": 64681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64681, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64681, "pid": 5, "tid": 7, "ts": 1716454222353200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291769, "dur": 10, "args": { "External id": 64681, "cbid": 211, "correlation": 64681 } }, { "ph": "s", "id": 64681, "pid": 76337, "tid": -914061504, "ts": 1716454222291769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222291836, "dur": 0, "args": { "External id": 64691, "cbid": 317, "correlation": 64691 } }, { "ph": "f", "id": 64691, "pid": 76337, "tid": -914061504, "ts": 1716454222291836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222291837, "dur": 0, "args": { "External id": 64692, "cbid": 203, "correlation": 64692 } }, { "ph": "f", "id": 64692, "pid": 76337, "tid": -914061504, "ts": 1716454222291837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222291838, "dur": 0, "args": { "External id": 64693, "cbid": 205, "correlation": 64693 } }, { "ph": "f", "id": 64693, "pid": 76337, "tid": -914061504, "ts": 1716454222291838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353206, "dur": 5, "args": { "External id": 64697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64697, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64697, "pid": 5, "tid": 7, "ts": 1716454222353206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291851, "dur": 12, "args": { "External id": 64697, "cbid": 211, "correlation": 64697 } }, { "ph": "s", "id": 64697, "pid": 76337, "tid": -914061504, "ts": 1716454222291851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353213, "dur": 177, "args": { "External id": 64699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64699, "pid": 5, "tid": 7, "ts": 1716454222353213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291866, "dur": 5, "args": { "External id": 64699, "cbid": 211, "correlation": 64699 } }, { "ph": "s", "id": 64699, "pid": 76337, "tid": -914061504, "ts": 1716454222291866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222353393, "dur": 1, "args": { "External id": 64701, "device": 5, "context": 1, "stream": 7, "correlation": 64701, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 64701, "pid": 5, "tid": 7, "ts": 1716454222353393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222291877, "dur": 6, "args": { "External id": 64701, "cbid": 51, "correlation": 64701 } }, { "ph": "s", "id": 64701, "pid": 76337, "tid": -914061504, "ts": 1716454222291877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222353397, "dur": 284, "args": { "External id": 64702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64702, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64702, "pid": 5, "tid": 7, "ts": 1716454222353397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291884, "dur": 6, "args": { "External id": 64702, "cbid": 211, "correlation": 64702 } }, { "ph": "s", "id": 64702, "pid": 76337, "tid": -914061504, "ts": 1716454222291884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353682, "dur": 7, "args": { "External id": 64704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64704, "pid": 5, "tid": 7, "ts": 1716454222353682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291894, "dur": 5, "args": { "External id": 64704, "cbid": 211, "correlation": 64704 } }, { "ph": "s", "id": 64704, "pid": 76337, "tid": -914061504, "ts": 1716454222291894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222353690, "dur": 7, "args": { "External id": 64710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64710, "pid": 5, "tid": 7, "ts": 1716454222353690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291922, "dur": 9, "args": { "External id": 64710, "cbid": 211, "correlation": 64710 } }, { "ph": "s", "id": 64710, "pid": 76337, "tid": -914061504, "ts": 1716454222291922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222353699, "dur": 5, "args": { "External id": 64718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64718, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64718, "pid": 5, "tid": 7, "ts": 1716454222353699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291956, "dur": 8, "args": { "External id": 64718, "cbid": 211, "correlation": 64718 } }, { "ph": "s", "id": 64718, "pid": 76337, "tid": -914061504, "ts": 1716454222291956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222353705, "dur": 5, "args": { "External id": 64726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64726, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64726, "pid": 5, "tid": 7, "ts": 1716454222353705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222291996, "dur": 9, "args": { "External id": 64726, "cbid": 211, "correlation": 64726 } }, { "ph": "s", "id": 64726, "pid": 76337, "tid": -914061504, "ts": 1716454222291996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222353712, "dur": 12, "args": { "External id": 64735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64735, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64735, "pid": 5, "tid": 7, "ts": 1716454222353712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292086, "dur": 14, "args": { "External id": 64735, "cbid": 211, "correlation": 64735 } }, { "ph": "s", "id": 64735, "pid": 76337, "tid": -914061504, "ts": 1716454222292086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222353725, "dur": 14, "args": { "External id": 64755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64755, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64755, "pid": 5, "tid": 7, "ts": 1716454222353725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292158, "dur": 11, "args": { "External id": 64755, "cbid": 211, "correlation": 64755 } }, { "ph": "s", "id": 64755, "pid": 76337, "tid": -914061504, "ts": 1716454222292158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222353740, "dur": 4, "args": { "External id": 64767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64767, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64767, "pid": 5, "tid": 7, "ts": 1716454222353740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292179, "dur": 6, "args": { "External id": 64767, "cbid": 211, "correlation": 64767 } }, { "ph": "s", "id": 64767, "pid": 76337, "tid": -914061504, "ts": 1716454222292179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222353746, "dur": 11, "args": { "External id": 64770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64770, "pid": 5, "tid": 7, "ts": 1716454222353746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292197, "dur": 7, "args": { "External id": 64770, "cbid": 211, "correlation": 64770 } }, { "ph": "s", "id": 64770, "pid": 76337, "tid": -914061504, "ts": 1716454222292197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222353758, "dur": 7, "args": { "External id": 64779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64779, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64779, "pid": 5, "tid": 7, "ts": 1716454222353758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292237, "dur": 9, "args": { "External id": 64779, "cbid": 211, "correlation": 64779 } }, { "ph": "s", "id": 64779, "pid": 76337, "tid": -914061504, "ts": 1716454222292237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222292291, "dur": 0, "args": { "External id": 64789, "cbid": 317, "correlation": 64789 } }, { "ph": "f", "id": 64789, "pid": 76337, "tid": -914061504, "ts": 1716454222292291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222292292, "dur": 0, "args": { "External id": 64790, "cbid": 203, "correlation": 64790 } }, { "ph": "f", "id": 64790, "pid": 76337, "tid": -914061504, "ts": 1716454222292292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222292293, "dur": 0, "args": { "External id": 64791, "cbid": 205, "correlation": 64791 } }, { "ph": "f", "id": 64791, "pid": 76337, "tid": -914061504, "ts": 1716454222292293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353766, "dur": 7, "args": { "External id": 64795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64795, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64795, "pid": 5, "tid": 7, "ts": 1716454222353766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292316, "dur": 13, "args": { "External id": 64795, "cbid": 211, "correlation": 64795 } }, { "ph": "s", "id": 64795, "pid": 76337, "tid": -914061504, "ts": 1716454222292316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222353774, "dur": 349, "args": { "External id": 64797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64797, "pid": 5, "tid": 7, "ts": 1716454222353774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292331, "dur": 5, "args": { "External id": 64797, "cbid": 211, "correlation": 64797 } }, { "ph": "s", "id": 64797, "pid": 76337, "tid": -914061504, "ts": 1716454222292331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222354125, "dur": 1, "args": { "External id": 64799, "device": 5, "context": 1, "stream": 7, "correlation": 64799, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 64799, "pid": 5, "tid": 7, "ts": 1716454222354125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222292342, "dur": 7, "args": { "External id": 64799, "cbid": 51, "correlation": 64799 } }, { "ph": "s", "id": 64799, "pid": 76337, "tid": -914061504, "ts": 1716454222292342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222354129, "dur": 545, "args": { "External id": 64800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64800, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64800, "pid": 5, "tid": 7, "ts": 1716454222354129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292350, "dur": 6, "args": { "External id": 64800, "cbid": 211, "correlation": 64800 } }, { "ph": "s", "id": 64800, "pid": 76337, "tid": -914061504, "ts": 1716454222292350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222354675, "dur": 6, "args": { "External id": 64802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64802, "pid": 5, "tid": 7, "ts": 1716454222354675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292360, "dur": 5, "args": { "External id": 64802, "cbid": 211, "correlation": 64802 } }, { "ph": "s", "id": 64802, "pid": 76337, "tid": -914061504, "ts": 1716454222292360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222354683, "dur": 7, "args": { "External id": 64808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64808, "pid": 5, "tid": 7, "ts": 1716454222354683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292390, "dur": 10, "args": { "External id": 64808, "cbid": 211, "correlation": 64808 } }, { "ph": "s", "id": 64808, "pid": 76337, "tid": -914061504, "ts": 1716454222292390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222354691, "dur": 3, "args": { "External id": 64816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64816, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 64816, "pid": 5, "tid": 7, "ts": 1716454222354691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292436, "dur": 9, "args": { "External id": 64816, "cbid": 211, "correlation": 64816 } }, { "ph": "s", "id": 64816, "pid": 76337, "tid": -914061504, "ts": 1716454222292436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222292499, "dur": 1, "args": { "External id": 64832, "cbid": 251, "correlation": 64832 } }, { "ph": "f", "id": 64832, "pid": 76337, "tid": -914061504, "ts": 1716454222292499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222292504, "dur": 0, "args": { "External id": 64834, "cbid": 251, "correlation": 64834 } }, { "ph": "f", "id": 64834, "pid": 76337, "tid": -914061504, "ts": 1716454222292504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222354695, "dur": 12, "args": { "External id": 64835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64835, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64835, "pid": 5, "tid": 7, "ts": 1716454222354695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292506, "dur": 11, "args": { "External id": 64835, "cbid": 211, "correlation": 64835 } }, { "ph": "s", "id": 64835, "pid": 76337, "tid": -914061504, "ts": 1716454222292506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222354709, "dur": 5, "args": { "External id": 64837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64837, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64837, "pid": 5, "tid": 7, "ts": 1716454222354709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292518, "dur": 6, "args": { "External id": 64837, "cbid": 211, "correlation": 64837 } }, { "ph": "s", "id": 64837, "pid": 76337, "tid": -914061504, "ts": 1716454222292518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222354715, "dur": 7, "args": { "External id": 64847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64847, "pid": 5, "tid": 7, "ts": 1716454222354715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292575, "dur": 12, "args": { "External id": 64847, "cbid": 211, "correlation": 64847 } }, { "ph": "s", "id": 64847, "pid": 76337, "tid": -914061504, "ts": 1716454222292575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222354724, "dur": 11, "args": { "External id": 64867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64867, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64867, "pid": 5, "tid": 7, "ts": 1716454222354724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292641, "dur": 12, "args": { "External id": 64867, "cbid": 211, "correlation": 64867 } }, { "ph": "s", "id": 64867, "pid": 76337, "tid": -914061504, "ts": 1716454222292641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222354736, "dur": 4, "args": { "External id": 64879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64879, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 64879, "pid": 5, "tid": 7, "ts": 1716454222354736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292663, "dur": 6, "args": { "External id": 64879, "cbid": 211, "correlation": 64879 } }, { "ph": "s", "id": 64879, "pid": 76337, "tid": -914061504, "ts": 1716454222292663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222354741, "dur": 7, "args": { "External id": 64882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64882, "pid": 5, "tid": 7, "ts": 1716454222354741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292680, "dur": 9, "args": { "External id": 64882, "cbid": 211, "correlation": 64882 } }, { "ph": "s", "id": 64882, "pid": 76337, "tid": -914061504, "ts": 1716454222292680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222354750, "dur": 5, "args": { "External id": 64891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64891, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64891, "pid": 5, "tid": 7, "ts": 1716454222354750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292725, "dur": 10, "args": { "External id": 64891, "cbid": 211, "correlation": 64891 } }, { "ph": "s", "id": 64891, "pid": 76337, "tid": -914061504, "ts": 1716454222292725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222292788, "dur": 0, "args": { "External id": 64901, "cbid": 317, "correlation": 64901 } }, { "ph": "f", "id": 64901, "pid": 76337, "tid": -914061504, "ts": 1716454222292788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222292789, "dur": 0, "args": { "External id": 64902, "cbid": 203, "correlation": 64902 } }, { "ph": "f", "id": 64902, "pid": 76337, "tid": -914061504, "ts": 1716454222292789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222292789, "dur": 0, "args": { "External id": 64903, "cbid": 205, "correlation": 64903 } }, { "ph": "f", "id": 64903, "pid": 76337, "tid": -914061504, "ts": 1716454222292789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222354756, "dur": 6, "args": { "External id": 64907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64907, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64907, "pid": 5, "tid": 7, "ts": 1716454222354756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292803, "dur": 12, "args": { "External id": 64907, "cbid": 211, "correlation": 64907 } }, { "ph": "s", "id": 64907, "pid": 76337, "tid": -914061504, "ts": 1716454222292803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222354763, "dur": 177, "args": { "External id": 64909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64909, "pid": 5, "tid": 7, "ts": 1716454222354763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292817, "dur": 5, "args": { "External id": 64909, "cbid": 211, "correlation": 64909 } }, { "ph": "s", "id": 64909, "pid": 76337, "tid": -914061504, "ts": 1716454222292817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222354942, "dur": 1, "args": { "External id": 64911, "device": 5, "context": 1, "stream": 7, "correlation": 64911, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 64911, "pid": 5, "tid": 7, "ts": 1716454222354942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222292829, "dur": 7, "args": { "External id": 64911, "cbid": 51, "correlation": 64911 } }, { "ph": "s", "id": 64911, "pid": 76337, "tid": -914061504, "ts": 1716454222292829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222354946, "dur": 285, "args": { "External id": 64912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64912, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 64912, "pid": 5, "tid": 7, "ts": 1716454222354946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292837, "dur": 6, "args": { "External id": 64912, "cbid": 211, "correlation": 64912 } }, { "ph": "s", "id": 64912, "pid": 76337, "tid": -914061504, "ts": 1716454222292837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222355232, "dur": 6, "args": { "External id": 64914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64914, "pid": 5, "tid": 7, "ts": 1716454222355232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292846, "dur": 5, "args": { "External id": 64914, "cbid": 211, "correlation": 64914 } }, { "ph": "s", "id": 64914, "pid": 76337, "tid": -914061504, "ts": 1716454222292846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222355239, "dur": 7, "args": { "External id": 64920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64920, "pid": 5, "tid": 7, "ts": 1716454222355239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292874, "dur": 9, "args": { "External id": 64920, "cbid": 211, "correlation": 64920 } }, { "ph": "s", "id": 64920, "pid": 76337, "tid": -914061504, "ts": 1716454222292874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222292934, "dur": 0, "args": { "External id": 64930, "cbid": 317, "correlation": 64930 } }, { "ph": "f", "id": 64930, "pid": 76337, "tid": -914061504, "ts": 1716454222292934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222292934, "dur": 0, "args": { "External id": 64931, "cbid": 203, "correlation": 64931 } }, { "ph": "f", "id": 64931, "pid": 76337, "tid": -914061504, "ts": 1716454222292934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222292935, "dur": 0, "args": { "External id": 64932, "cbid": 205, "correlation": 64932 } }, { "ph": "f", "id": 64932, "pid": 76337, "tid": -914061504, "ts": 1716454222292935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222355248, "dur": 8, "args": { "External id": 64936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64936, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64936, "pid": 5, "tid": 7, "ts": 1716454222355248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292955, "dur": 12, "args": { "External id": 64936, "cbid": 211, "correlation": 64936 } }, { "ph": "s", "id": 64936, "pid": 76337, "tid": -914061504, "ts": 1716454222292955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222355257, "dur": 3, "args": { "External id": 64938, "device": 5, "context": 1, "stream": 7, "correlation": 64938, "bytes": 4800, "memory bandwidth (GB/s)": 1.3761467889908257 } }, { "ph": "f", "id": 64938, "pid": 5, "tid": 7, "ts": 1716454222355257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222292972, "dur": 23, "args": { "External id": 64938, "cbid": 51, "correlation": 64938 } }, { "ph": "s", "id": 64938, "pid": 76337, "tid": -914061504, "ts": 1716454222292972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222355262, "dur": 103, "args": { "External id": 64939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64939, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 64939, "pid": 5, "tid": 7, "ts": 1716454222355262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222292996, "dur": 7, "args": { "External id": 64939, "cbid": 211, "correlation": 64939 } }, { "ph": "s", "id": 64939, "pid": 76337, "tid": -914061504, "ts": 1716454222292996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222355366, "dur": 6, "args": { "External id": 64941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64941, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64941, "pid": 5, "tid": 7, "ts": 1716454222355366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293008, "dur": 7, "args": { "External id": 64941, "cbid": 211, "correlation": 64941 } }, { "ph": "s", "id": 64941, "pid": 76337, "tid": -914061504, "ts": 1716454222293008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222355374, "dur": 7, "args": { "External id": 64947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64947, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64947, "pid": 5, "tid": 7, "ts": 1716454222355374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293038, "dur": 9, "args": { "External id": 64947, "cbid": 211, "correlation": 64947 } }, { "ph": "s", "id": 64947, "pid": 76337, "tid": -914061504, "ts": 1716454222293038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222355382, "dur": 5, "args": { "External id": 64955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64955, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64955, "pid": 5, "tid": 7, "ts": 1716454222355382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293069, "dur": 8, "args": { "External id": 64955, "cbid": 211, "correlation": 64955 } }, { "ph": "s", "id": 64955, "pid": 76337, "tid": -914061504, "ts": 1716454222293069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222355388, "dur": 5, "args": { "External id": 64963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64963, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 64963, "pid": 5, "tid": 7, "ts": 1716454222355388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293099, "dur": 8, "args": { "External id": 64963, "cbid": 211, "correlation": 64963 } }, { "ph": "s", "id": 64963, "pid": 76337, "tid": -914061504, "ts": 1716454222293099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222355395, "dur": 12, "args": { "External id": 64972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64972, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 64972, "pid": 5, "tid": 7, "ts": 1716454222355395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293142, "dur": 11, "args": { "External id": 64972, "cbid": 211, "correlation": 64972 } }, { "ph": "s", "id": 64972, "pid": 76337, "tid": -914061504, "ts": 1716454222293142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222355408, "dur": 14, "args": { "External id": 64992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 64992, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 64992, "pid": 5, "tid": 7, "ts": 1716454222355408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293213, "dur": 11, "args": { "External id": 64992, "cbid": 211, "correlation": 64992 } }, { "ph": "s", "id": 64992, "pid": 76337, "tid": -914061504, "ts": 1716454222293213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222355423, "dur": 4, "args": { "External id": 65004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65004, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65004, "pid": 5, "tid": 7, "ts": 1716454222355423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293234, "dur": 6, "args": { "External id": 65004, "cbid": 211, "correlation": 65004 } }, { "ph": "s", "id": 65004, "pid": 76337, "tid": -914061504, "ts": 1716454222293234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222355429, "dur": 12, "args": { "External id": 65007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65007, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65007, "pid": 5, "tid": 7, "ts": 1716454222355429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293252, "dur": 6, "args": { "External id": 65007, "cbid": 211, "correlation": 65007 } }, { "ph": "s", "id": 65007, "pid": 76337, "tid": -914061504, "ts": 1716454222293252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222355443, "dur": 7, "args": { "External id": 65016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65016, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65016, "pid": 5, "tid": 7, "ts": 1716454222355443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293293, "dur": 10, "args": { "External id": 65016, "cbid": 211, "correlation": 65016 } }, { "ph": "s", "id": 65016, "pid": 76337, "tid": -914061504, "ts": 1716454222293293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222293346, "dur": 0, "args": { "External id": 65026, "cbid": 317, "correlation": 65026 } }, { "ph": "f", "id": 65026, "pid": 76337, "tid": -914061504, "ts": 1716454222293346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222293347, "dur": 0, "args": { "External id": 65027, "cbid": 203, "correlation": 65027 } }, { "ph": "f", "id": 65027, "pid": 76337, "tid": -914061504, "ts": 1716454222293347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222293347, "dur": 0, "args": { "External id": 65028, "cbid": 205, "correlation": 65028 } }, { "ph": "f", "id": 65028, "pid": 76337, "tid": -914061504, "ts": 1716454222293347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222355451, "dur": 7, "args": { "External id": 65032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65032, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65032, "pid": 5, "tid": 7, "ts": 1716454222355451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293361, "dur": 11, "args": { "External id": 65032, "cbid": 211, "correlation": 65032 } }, { "ph": "s", "id": 65032, "pid": 76337, "tid": -914061504, "ts": 1716454222293361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222355459, "dur": 349, "args": { "External id": 65034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65034, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65034, "pid": 5, "tid": 7, "ts": 1716454222355459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293375, "dur": 5, "args": { "External id": 65034, "cbid": 211, "correlation": 65034 } }, { "ph": "s", "id": 65034, "pid": 76337, "tid": -914061504, "ts": 1716454222293375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222355811, "dur": 1, "args": { "External id": 65036, "device": 5, "context": 1, "stream": 7, "correlation": 65036, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 65036, "pid": 5, "tid": 7, "ts": 1716454222355811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222293386, "dur": 6, "args": { "External id": 65036, "cbid": 51, "correlation": 65036 } }, { "ph": "s", "id": 65036, "pid": 76337, "tid": -914061504, "ts": 1716454222293386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222355815, "dur": 546, "args": { "External id": 65037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65037, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65037, "pid": 5, "tid": 7, "ts": 1716454222355815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293393, "dur": 6, "args": { "External id": 65037, "cbid": 211, "correlation": 65037 } }, { "ph": "s", "id": 65037, "pid": 76337, "tid": -914061504, "ts": 1716454222293393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222356362, "dur": 6, "args": { "External id": 65039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65039, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65039, "pid": 5, "tid": 7, "ts": 1716454222356362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293403, "dur": 5, "args": { "External id": 65039, "cbid": 211, "correlation": 65039 } }, { "ph": "s", "id": 65039, "pid": 76337, "tid": -914061504, "ts": 1716454222293403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222356369, "dur": 7, "args": { "External id": 65045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65045, "pid": 5, "tid": 7, "ts": 1716454222356369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293431, "dur": 9, "args": { "External id": 65045, "cbid": 211, "correlation": 65045 } }, { "ph": "s", "id": 65045, "pid": 76337, "tid": -914061504, "ts": 1716454222293431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222356378, "dur": 3, "args": { "External id": 65053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65053, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 65053, "pid": 5, "tid": 7, "ts": 1716454222356378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293474, "dur": 9, "args": { "External id": 65053, "cbid": 211, "correlation": 65053 } }, { "ph": "s", "id": 65053, "pid": 76337, "tid": -914061504, "ts": 1716454222293474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222293535, "dur": 1, "args": { "External id": 65069, "cbid": 251, "correlation": 65069 } }, { "ph": "f", "id": 65069, "pid": 76337, "tid": -914061504, "ts": 1716454222293535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222293541, "dur": 0, "args": { "External id": 65071, "cbid": 251, "correlation": 65071 } }, { "ph": "f", "id": 65071, "pid": 76337, "tid": -914061504, "ts": 1716454222293541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222356382, "dur": 14, "args": { "External id": 65072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65072, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65072, "pid": 5, "tid": 7, "ts": 1716454222356382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293543, "dur": 11, "args": { "External id": 65072, "cbid": 211, "correlation": 65072 } }, { "ph": "s", "id": 65072, "pid": 76337, "tid": -914061504, "ts": 1716454222293543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222356397, "dur": 5, "args": { "External id": 65074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65074, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65074, "pid": 5, "tid": 7, "ts": 1716454222356397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293555, "dur": 8, "args": { "External id": 65074, "cbid": 211, "correlation": 65074 } }, { "ph": "s", "id": 65074, "pid": 76337, "tid": -914061504, "ts": 1716454222293555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222356404, "dur": 7, "args": { "External id": 65084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65084, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65084, "pid": 5, "tid": 7, "ts": 1716454222356404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293616, "dur": 12, "args": { "External id": 65084, "cbid": 211, "correlation": 65084 } }, { "ph": "s", "id": 65084, "pid": 76337, "tid": -914061504, "ts": 1716454222293616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222356412, "dur": 11, "args": { "External id": 65104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65104, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65104, "pid": 5, "tid": 7, "ts": 1716454222356412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293681, "dur": 11, "args": { "External id": 65104, "cbid": 211, "correlation": 65104 } }, { "ph": "s", "id": 65104, "pid": 76337, "tid": -914061504, "ts": 1716454222293681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222356424, "dur": 4, "args": { "External id": 65116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65116, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 65116, "pid": 5, "tid": 7, "ts": 1716454222356424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293702, "dur": 6, "args": { "External id": 65116, "cbid": 211, "correlation": 65116 } }, { "ph": "s", "id": 65116, "pid": 76337, "tid": -914061504, "ts": 1716454222293702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222356430, "dur": 7, "args": { "External id": 65119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65119, "pid": 5, "tid": 7, "ts": 1716454222356430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293720, "dur": 6, "args": { "External id": 65119, "cbid": 211, "correlation": 65119 } }, { "ph": "s", "id": 65119, "pid": 76337, "tid": -914061504, "ts": 1716454222293720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222356438, "dur": 5, "args": { "External id": 65128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65128, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65128, "pid": 5, "tid": 7, "ts": 1716454222356438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293759, "dur": 9, "args": { "External id": 65128, "cbid": 211, "correlation": 65128 } }, { "ph": "s", "id": 65128, "pid": 76337, "tid": -914061504, "ts": 1716454222293759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222293821, "dur": 0, "args": { "External id": 65138, "cbid": 317, "correlation": 65138 } }, { "ph": "f", "id": 65138, "pid": 76337, "tid": -914061504, "ts": 1716454222293821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222293821, "dur": 0, "args": { "External id": 65139, "cbid": 203, "correlation": 65139 } }, { "ph": "f", "id": 65139, "pid": 76337, "tid": -914061504, "ts": 1716454222293821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222293822, "dur": 0, "args": { "External id": 65140, "cbid": 205, "correlation": 65140 } }, { "ph": "f", "id": 65140, "pid": 76337, "tid": -914061504, "ts": 1716454222293822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222356445, "dur": 6, "args": { "External id": 65144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65144, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65144, "pid": 5, "tid": 7, "ts": 1716454222356445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293836, "dur": 12, "args": { "External id": 65144, "cbid": 211, "correlation": 65144 } }, { "ph": "s", "id": 65144, "pid": 76337, "tid": -914061504, "ts": 1716454222293836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222356452, "dur": 176, "args": { "External id": 65146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65146, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65146, "pid": 5, "tid": 7, "ts": 1716454222356452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293850, "dur": 8, "args": { "External id": 65146, "cbid": 211, "correlation": 65146 } }, { "ph": "s", "id": 65146, "pid": 76337, "tid": -914061504, "ts": 1716454222293850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222356630, "dur": 1, "args": { "External id": 65148, "device": 5, "context": 1, "stream": 7, "correlation": 65148, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 65148, "pid": 5, "tid": 7, "ts": 1716454222356630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222293864, "dur": 6, "args": { "External id": 65148, "cbid": 51, "correlation": 65148 } }, { "ph": "s", "id": 65148, "pid": 76337, "tid": -914061504, "ts": 1716454222293864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222356634, "dur": 286, "args": { "External id": 65149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65149, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65149, "pid": 5, "tid": 7, "ts": 1716454222356634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293871, "dur": 6, "args": { "External id": 65149, "cbid": 211, "correlation": 65149 } }, { "ph": "s", "id": 65149, "pid": 76337, "tid": -914061504, "ts": 1716454222293871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222356921, "dur": 6, "args": { "External id": 65151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65151, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65151, "pid": 5, "tid": 7, "ts": 1716454222356921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293881, "dur": 5, "args": { "External id": 65151, "cbid": 211, "correlation": 65151 } }, { "ph": "s", "id": 65151, "pid": 76337, "tid": -914061504, "ts": 1716454222293881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222356929, "dur": 7, "args": { "External id": 65157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65157, "pid": 5, "tid": 7, "ts": 1716454222356929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293910, "dur": 8, "args": { "External id": 65157, "cbid": 211, "correlation": 65157 } }, { "ph": "s", "id": 65157, "pid": 76337, "tid": -914061504, "ts": 1716454222293910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222293968, "dur": 0, "args": { "External id": 65167, "cbid": 317, "correlation": 65167 } }, { "ph": "f", "id": 65167, "pid": 76337, "tid": -914061504, "ts": 1716454222293968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222293969, "dur": 0, "args": { "External id": 65168, "cbid": 203, "correlation": 65168 } }, { "ph": "f", "id": 65168, "pid": 76337, "tid": -914061504, "ts": 1716454222293969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222293970, "dur": 0, "args": { "External id": 65169, "cbid": 205, "correlation": 65169 } }, { "ph": "f", "id": 65169, "pid": 76337, "tid": -914061504, "ts": 1716454222293970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222356937, "dur": 8, "args": { "External id": 65173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65173, "pid": 5, "tid": 7, "ts": 1716454222356937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222293991, "dur": 12, "args": { "External id": 65173, "cbid": 211, "correlation": 65173 } }, { "ph": "s", "id": 65173, "pid": 76337, "tid": -914061504, "ts": 1716454222293991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222356947, "dur": 3, "args": { "External id": 65175, "device": 5, "context": 1, "stream": 7, "correlation": 65175, "bytes": 4800, "memory bandwidth (GB/s)": 1.3888888888888888 } }, { "ph": "f", "id": 65175, "pid": 5, "tid": 7, "ts": 1716454222356947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222294009, "dur": 11, "args": { "External id": 65175, "cbid": 51, "correlation": 65175 } }, { "ph": "s", "id": 65175, "pid": 76337, "tid": -914061504, "ts": 1716454222294009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222356951, "dur": 100, "args": { "External id": 65176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65176, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 65176, "pid": 5, "tid": 7, "ts": 1716454222356951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294020, "dur": 6, "args": { "External id": 65176, "cbid": 211, "correlation": 65176 } }, { "ph": "s", "id": 65176, "pid": 76337, "tid": -914061504, "ts": 1716454222294020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222357053, "dur": 6, "args": { "External id": 65178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65178, "pid": 5, "tid": 7, "ts": 1716454222357053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294030, "dur": 5, "args": { "External id": 65178, "cbid": 211, "correlation": 65178 } }, { "ph": "s", "id": 65178, "pid": 76337, "tid": -914061504, "ts": 1716454222294030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222357061, "dur": 7, "args": { "External id": 65184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65184, "pid": 5, "tid": 7, "ts": 1716454222357061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294057, "dur": 9, "args": { "External id": 65184, "cbid": 211, "correlation": 65184 } }, { "ph": "s", "id": 65184, "pid": 76337, "tid": -914061504, "ts": 1716454222294057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222357069, "dur": 5, "args": { "External id": 65192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65192, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65192, "pid": 5, "tid": 7, "ts": 1716454222357069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294087, "dur": 8, "args": { "External id": 65192, "cbid": 211, "correlation": 65192 } }, { "ph": "s", "id": 65192, "pid": 76337, "tid": -914061504, "ts": 1716454222294087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222357076, "dur": 5, "args": { "External id": 65200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65200, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65200, "pid": 5, "tid": 7, "ts": 1716454222357076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294115, "dur": 11, "args": { "External id": 65200, "cbid": 211, "correlation": 65200 } }, { "ph": "s", "id": 65200, "pid": 76337, "tid": -914061504, "ts": 1716454222294115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222357082, "dur": 12, "args": { "External id": 65209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65209, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65209, "pid": 5, "tid": 7, "ts": 1716454222357082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294162, "dur": 11, "args": { "External id": 65209, "cbid": 211, "correlation": 65209 } }, { "ph": "s", "id": 65209, "pid": 76337, "tid": -914061504, "ts": 1716454222294162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222357095, "dur": 13, "args": { "External id": 65229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65229, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65229, "pid": 5, "tid": 7, "ts": 1716454222357095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294231, "dur": 12, "args": { "External id": 65229, "cbid": 211, "correlation": 65229 } }, { "ph": "s", "id": 65229, "pid": 76337, "tid": -914061504, "ts": 1716454222294231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222357110, "dur": 4, "args": { "External id": 65241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65241, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65241, "pid": 5, "tid": 7, "ts": 1716454222357110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294253, "dur": 6, "args": { "External id": 65241, "cbid": 211, "correlation": 65241 } }, { "ph": "s", "id": 65241, "pid": 76337, "tid": -914061504, "ts": 1716454222294253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222357115, "dur": 12, "args": { "External id": 65244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65244, "pid": 5, "tid": 7, "ts": 1716454222357115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294271, "dur": 7, "args": { "External id": 65244, "cbid": 211, "correlation": 65244 } }, { "ph": "s", "id": 65244, "pid": 76337, "tid": -914061504, "ts": 1716454222294271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222357128, "dur": 7, "args": { "External id": 65253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65253, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65253, "pid": 5, "tid": 7, "ts": 1716454222357128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294309, "dur": 9, "args": { "External id": 65253, "cbid": 211, "correlation": 65253 } }, { "ph": "s", "id": 65253, "pid": 76337, "tid": -914061504, "ts": 1716454222294309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222294361, "dur": 0, "args": { "External id": 65263, "cbid": 317, "correlation": 65263 } }, { "ph": "f", "id": 65263, "pid": 76337, "tid": -914061504, "ts": 1716454222294361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222294362, "dur": 0, "args": { "External id": 65264, "cbid": 203, "correlation": 65264 } }, { "ph": "f", "id": 65264, "pid": 76337, "tid": -914061504, "ts": 1716454222294362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222294362, "dur": 0, "args": { "External id": 65265, "cbid": 205, "correlation": 65265 } }, { "ph": "f", "id": 65265, "pid": 76337, "tid": -914061504, "ts": 1716454222294362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222357137, "dur": 7, "args": { "External id": 65269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65269, "pid": 5, "tid": 7, "ts": 1716454222357137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294397, "dur": 12, "args": { "External id": 65269, "cbid": 211, "correlation": 65269 } }, { "ph": "s", "id": 65269, "pid": 76337, "tid": -914061504, "ts": 1716454222294397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222357145, "dur": 348, "args": { "External id": 65271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65271, "pid": 5, "tid": 7, "ts": 1716454222357145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294412, "dur": 6, "args": { "External id": 65271, "cbid": 211, "correlation": 65271 } }, { "ph": "s", "id": 65271, "pid": 76337, "tid": -914061504, "ts": 1716454222294412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222357495, "dur": 1, "args": { "External id": 65273, "device": 5, "context": 1, "stream": 7, "correlation": 65273, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 65273, "pid": 5, "tid": 7, "ts": 1716454222357495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222294423, "dur": 6, "args": { "External id": 65273, "cbid": 51, "correlation": 65273 } }, { "ph": "s", "id": 65273, "pid": 76337, "tid": -914061504, "ts": 1716454222294423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222357499, "dur": 546, "args": { "External id": 65274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65274, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65274, "pid": 5, "tid": 7, "ts": 1716454222357499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294431, "dur": 7, "args": { "External id": 65274, "cbid": 211, "correlation": 65274 } }, { "ph": "s", "id": 65274, "pid": 76337, "tid": -914061504, "ts": 1716454222294431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358047, "dur": 6, "args": { "External id": 65276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65276, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65276, "pid": 5, "tid": 7, "ts": 1716454222358047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294442, "dur": 5, "args": { "External id": 65276, "cbid": 211, "correlation": 65276 } }, { "ph": "s", "id": 65276, "pid": 76337, "tid": -914061504, "ts": 1716454222294442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222358054, "dur": 7, "args": { "External id": 65282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65282, "pid": 5, "tid": 7, "ts": 1716454222358054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294471, "dur": 8, "args": { "External id": 65282, "cbid": 211, "correlation": 65282 } }, { "ph": "s", "id": 65282, "pid": 76337, "tid": -914061504, "ts": 1716454222294471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222358062, "dur": 3, "args": { "External id": 65290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65290, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 65290, "pid": 5, "tid": 7, "ts": 1716454222358062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294515, "dur": 9, "args": { "External id": 65290, "cbid": 211, "correlation": 65290 } }, { "ph": "s", "id": 65290, "pid": 76337, "tid": -914061504, "ts": 1716454222294515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222294578, "dur": 1, "args": { "External id": 65306, "cbid": 251, "correlation": 65306 } }, { "ph": "f", "id": 65306, "pid": 76337, "tid": -914061504, "ts": 1716454222294578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222294583, "dur": 0, "args": { "External id": 65308, "cbid": 251, "correlation": 65308 } }, { "ph": "f", "id": 65308, "pid": 76337, "tid": -914061504, "ts": 1716454222294583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222358067, "dur": 13, "args": { "External id": 65309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65309, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65309, "pid": 5, "tid": 7, "ts": 1716454222358067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294585, "dur": 12, "args": { "External id": 65309, "cbid": 211, "correlation": 65309 } }, { "ph": "s", "id": 65309, "pid": 76337, "tid": -914061504, "ts": 1716454222294585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222358082, "dur": 5, "args": { "External id": 65311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65311, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65311, "pid": 5, "tid": 7, "ts": 1716454222358082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294598, "dur": 5, "args": { "External id": 65311, "cbid": 211, "correlation": 65311 } }, { "ph": "s", "id": 65311, "pid": 76337, "tid": -914061504, "ts": 1716454222294598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222358089, "dur": 6, "args": { "External id": 65321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65321, "pid": 5, "tid": 7, "ts": 1716454222358089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294656, "dur": 12, "args": { "External id": 65321, "cbid": 211, "correlation": 65321 } }, { "ph": "s", "id": 65321, "pid": 76337, "tid": -914061504, "ts": 1716454222294656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222358097, "dur": 10, "args": { "External id": 65341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65341, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65341, "pid": 5, "tid": 7, "ts": 1716454222358097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294721, "dur": 14, "args": { "External id": 65341, "cbid": 211, "correlation": 65341 } }, { "ph": "s", "id": 65341, "pid": 76337, "tid": -914061504, "ts": 1716454222294721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222358109, "dur": 4, "args": { "External id": 65353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65353, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 65353, "pid": 5, "tid": 7, "ts": 1716454222358109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294745, "dur": 7, "args": { "External id": 65353, "cbid": 211, "correlation": 65353 } }, { "ph": "s", "id": 65353, "pid": 76337, "tid": -914061504, "ts": 1716454222294745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222358114, "dur": 8, "args": { "External id": 65356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65356, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65356, "pid": 5, "tid": 7, "ts": 1716454222358114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294764, "dur": 7, "args": { "External id": 65356, "cbid": 211, "correlation": 65356 } }, { "ph": "s", "id": 65356, "pid": 76337, "tid": -914061504, "ts": 1716454222294764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222358123, "dur": 5, "args": { "External id": 65365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65365, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65365, "pid": 5, "tid": 7, "ts": 1716454222358123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294805, "dur": 10, "args": { "External id": 65365, "cbid": 211, "correlation": 65365 } }, { "ph": "s", "id": 65365, "pid": 76337, "tid": -914061504, "ts": 1716454222294805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222294868, "dur": 0, "args": { "External id": 65375, "cbid": 317, "correlation": 65375 } }, { "ph": "f", "id": 65375, "pid": 76337, "tid": -914061504, "ts": 1716454222294868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222294868, "dur": 0, "args": { "External id": 65376, "cbid": 203, "correlation": 65376 } }, { "ph": "f", "id": 65376, "pid": 76337, "tid": -914061504, "ts": 1716454222294868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222294869, "dur": 0, "args": { "External id": 65377, "cbid": 205, "correlation": 65377 } }, { "ph": "f", "id": 65377, "pid": 76337, "tid": -914061504, "ts": 1716454222294869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358130, "dur": 6, "args": { "External id": 65381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65381, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65381, "pid": 5, "tid": 7, "ts": 1716454222358130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294883, "dur": 12, "args": { "External id": 65381, "cbid": 211, "correlation": 65381 } }, { "ph": "s", "id": 65381, "pid": 76337, "tid": -914061504, "ts": 1716454222294883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358137, "dur": 177, "args": { "External id": 65383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65383, "pid": 5, "tid": 7, "ts": 1716454222358137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294897, "dur": 5, "args": { "External id": 65383, "cbid": 211, "correlation": 65383 } }, { "ph": "s", "id": 65383, "pid": 76337, "tid": -914061504, "ts": 1716454222294897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222358316, "dur": 1, "args": { "External id": 65385, "device": 5, "context": 1, "stream": 7, "correlation": 65385, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 65385, "pid": 5, "tid": 7, "ts": 1716454222358316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222294908, "dur": 6, "args": { "External id": 65385, "cbid": 51, "correlation": 65385 } }, { "ph": "s", "id": 65385, "pid": 76337, "tid": -914061504, "ts": 1716454222294908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222358320, "dur": 285, "args": { "External id": 65386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65386, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65386, "pid": 5, "tid": 7, "ts": 1716454222358320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294916, "dur": 6, "args": { "External id": 65386, "cbid": 211, "correlation": 65386 } }, { "ph": "s", "id": 65386, "pid": 76337, "tid": -914061504, "ts": 1716454222294916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358606, "dur": 6, "args": { "External id": 65388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65388, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65388, "pid": 5, "tid": 7, "ts": 1716454222358606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294925, "dur": 5, "args": { "External id": 65388, "cbid": 211, "correlation": 65388 } }, { "ph": "s", "id": 65388, "pid": 76337, "tid": -914061504, "ts": 1716454222294925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222358614, "dur": 7, "args": { "External id": 65394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65394, "pid": 5, "tid": 7, "ts": 1716454222358614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222294953, "dur": 9, "args": { "External id": 65394, "cbid": 211, "correlation": 65394 } }, { "ph": "s", "id": 65394, "pid": 76337, "tid": -914061504, "ts": 1716454222294953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222295023, "dur": 0, "args": { "External id": 65404, "cbid": 317, "correlation": 65404 } }, { "ph": "f", "id": 65404, "pid": 76337, "tid": -914061504, "ts": 1716454222295023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222295024, "dur": 0, "args": { "External id": 65405, "cbid": 203, "correlation": 65405 } }, { "ph": "f", "id": 65405, "pid": 76337, "tid": -914061504, "ts": 1716454222295024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222295024, "dur": 0, "args": { "External id": 65406, "cbid": 205, "correlation": 65406 } }, { "ph": "f", "id": 65406, "pid": 76337, "tid": -914061504, "ts": 1716454222295024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358622, "dur": 9, "args": { "External id": 65410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65410, "pid": 5, "tid": 7, "ts": 1716454222358622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295036, "dur": 12, "args": { "External id": 65410, "cbid": 211, "correlation": 65410 } }, { "ph": "s", "id": 65410, "pid": 76337, "tid": -914061504, "ts": 1716454222295036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222358632, "dur": 3, "args": { "External id": 65412, "device": 5, "context": 1, "stream": 7, "correlation": 65412, "bytes": 4800, "memory bandwidth (GB/s)": 1.4018691588785046 } }, { "ph": "f", "id": 65412, "pid": 5, "tid": 7, "ts": 1716454222358632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222295054, "dur": 10, "args": { "External id": 65412, "cbid": 51, "correlation": 65412 } }, { "ph": "s", "id": 65412, "pid": 76337, "tid": -914061504, "ts": 1716454222295054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222358636, "dur": 101, "args": { "External id": 65413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65413, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 65413, "pid": 5, "tid": 7, "ts": 1716454222358636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295065, "dur": 6, "args": { "External id": 65413, "cbid": 211, "correlation": 65413 } }, { "ph": "s", "id": 65413, "pid": 76337, "tid": -914061504, "ts": 1716454222295065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358739, "dur": 6, "args": { "External id": 65415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65415, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65415, "pid": 5, "tid": 7, "ts": 1716454222358739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295074, "dur": 5, "args": { "External id": 65415, "cbid": 211, "correlation": 65415 } }, { "ph": "s", "id": 65415, "pid": 76337, "tid": -914061504, "ts": 1716454222295074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222358746, "dur": 7, "args": { "External id": 65421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65421, "pid": 5, "tid": 7, "ts": 1716454222358746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295101, "dur": 10, "args": { "External id": 65421, "cbid": 211, "correlation": 65421 } }, { "ph": "s", "id": 65421, "pid": 76337, "tid": -914061504, "ts": 1716454222295101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222358754, "dur": 5, "args": { "External id": 65429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65429, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65429, "pid": 5, "tid": 7, "ts": 1716454222358754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295131, "dur": 8, "args": { "External id": 65429, "cbid": 211, "correlation": 65429 } }, { "ph": "s", "id": 65429, "pid": 76337, "tid": -914061504, "ts": 1716454222295131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222358761, "dur": 5, "args": { "External id": 65437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65437, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 65437, "pid": 5, "tid": 7, "ts": 1716454222358761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295161, "dur": 8, "args": { "External id": 65437, "cbid": 211, "correlation": 65437 } }, { "ph": "s", "id": 65437, "pid": 76337, "tid": -914061504, "ts": 1716454222295161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222358767, "dur": 16, "args": { "External id": 65448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65448, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65448, "pid": 5, "tid": 7, "ts": 1716454222358767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295240, "dur": 17, "args": { "External id": 65448, "cbid": 211, "correlation": 65448 } }, { "ph": "s", "id": 65448, "pid": 76337, "tid": -914061504, "ts": 1716454222295240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222295300, "dur": 0, "args": { "External id": 65458, "cbid": 317, "correlation": 65458 } }, { "ph": "f", "id": 65458, "pid": 76337, "tid": -914061504, "ts": 1716454222295300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222295301, "dur": 0, "args": { "External id": 65459, "cbid": 203, "correlation": 65459 } }, { "ph": "f", "id": 65459, "pid": 76337, "tid": -914061504, "ts": 1716454222295301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222295301, "dur": 0, "args": { "External id": 65460, "cbid": 205, "correlation": 65460 } }, { "ph": "f", "id": 65460, "pid": 76337, "tid": -914061504, "ts": 1716454222295301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358784, "dur": 9, "args": { "External id": 65464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65464, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65464, "pid": 5, "tid": 7, "ts": 1716454222358784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295316, "dur": 11, "args": { "External id": 65464, "cbid": 211, "correlation": 65464 } }, { "ph": "s", "id": 65464, "pid": 76337, "tid": -914061504, "ts": 1716454222295316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222358795, "dur": 177, "args": { "External id": 65466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65466, "pid": 5, "tid": 7, "ts": 1716454222358795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295330, "dur": 8, "args": { "External id": 65466, "cbid": 211, "correlation": 65466 } }, { "ph": "s", "id": 65466, "pid": 76337, "tid": -914061504, "ts": 1716454222295330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222358974, "dur": 2, "args": { "External id": 65468, "device": 5, "context": 1, "stream": 7, "correlation": 65468, "bytes": 960, "memory bandwidth (GB/s)": 0.47619047619047616 } }, { "ph": "f", "id": 65468, "pid": 5, "tid": 7, "ts": 1716454222358974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222295343, "dur": 6, "args": { "External id": 65468, "cbid": 51, "correlation": 65468 } }, { "ph": "s", "id": 65468, "pid": 76337, "tid": -914061504, "ts": 1716454222295343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222358979, "dur": 712, "args": { "External id": 65469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65469, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65469, "pid": 5, "tid": 7, "ts": 1716454222358979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295351, "dur": 7, "args": { "External id": 65469, "cbid": 211, "correlation": 65469 } }, { "ph": "s", "id": 65469, "pid": 76337, "tid": -914061504, "ts": 1716454222295351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222359692, "dur": 14, "args": { "External id": 65471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65471, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65471, "pid": 5, "tid": 7, "ts": 1716454222359692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295362, "dur": 5, "args": { "External id": 65471, "cbid": 211, "correlation": 65471 } }, { "ph": "s", "id": 65471, "pid": 76337, "tid": -914061504, "ts": 1716454222295362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222359707, "dur": 16, "args": { "External id": 65477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65477, "pid": 5, "tid": 7, "ts": 1716454222359707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295390, "dur": 8, "args": { "External id": 65477, "cbid": 211, "correlation": 65477 } }, { "ph": "s", "id": 65477, "pid": 76337, "tid": -914061504, "ts": 1716454222295390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222359724, "dur": 31, "args": { "External id": 65486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65486, "pid": 5, "tid": 7, "ts": 1716454222359724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295482, "dur": 13, "args": { "External id": 65486, "cbid": 211, "correlation": 65486 } }, { "ph": "s", "id": 65486, "pid": 76337, "tid": -914061504, "ts": 1716454222295482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222359756, "dur": 33, "args": { "External id": 65506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65506, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65506, "pid": 5, "tid": 7, "ts": 1716454222359756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295551, "dur": 11, "args": { "External id": 65506, "cbid": 211, "correlation": 65506 } }, { "ph": "s", "id": 65506, "pid": 76337, "tid": -914061504, "ts": 1716454222295551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222359791, "dur": 5, "args": { "External id": 65518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65518, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65518, "pid": 5, "tid": 7, "ts": 1716454222359791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295572, "dur": 6, "args": { "External id": 65518, "cbid": 211, "correlation": 65518 } }, { "ph": "s", "id": 65518, "pid": 76337, "tid": -914061504, "ts": 1716454222295572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222359797, "dur": 32, "args": { "External id": 65521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65521, "pid": 5, "tid": 7, "ts": 1716454222359797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295590, "dur": 6, "args": { "External id": 65521, "cbid": 211, "correlation": 65521 } }, { "ph": "s", "id": 65521, "pid": 76337, "tid": -914061504, "ts": 1716454222295590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222359831, "dur": 20, "args": { "External id": 65530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65530, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65530, "pid": 5, "tid": 7, "ts": 1716454222359831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295628, "dur": 12, "args": { "External id": 65530, "cbid": 211, "correlation": 65530 } }, { "ph": "s", "id": 65530, "pid": 76337, "tid": -914061504, "ts": 1716454222295628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222295684, "dur": 0, "args": { "External id": 65540, "cbid": 317, "correlation": 65540 } }, { "ph": "f", "id": 65540, "pid": 76337, "tid": -914061504, "ts": 1716454222295684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222295684, "dur": 0, "args": { "External id": 65541, "cbid": 203, "correlation": 65541 } }, { "ph": "f", "id": 65541, "pid": 76337, "tid": -914061504, "ts": 1716454222295684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222295685, "dur": 0, "args": { "External id": 65542, "cbid": 205, "correlation": 65542 } }, { "ph": "f", "id": 65542, "pid": 76337, "tid": -914061504, "ts": 1716454222295685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222359853, "dur": 24, "args": { "External id": 65546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65546, "pid": 5, "tid": 7, "ts": 1716454222359853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295703, "dur": 11, "args": { "External id": 65546, "cbid": 211, "correlation": 65546 } }, { "ph": "s", "id": 65546, "pid": 76337, "tid": -914061504, "ts": 1716454222295703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222359877, "dur": 349, "args": { "External id": 65548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65548, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65548, "pid": 5, "tid": 7, "ts": 1716454222359877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295717, "dur": 6, "args": { "External id": 65548, "cbid": 211, "correlation": 65548 } }, { "ph": "s", "id": 65548, "pid": 76337, "tid": -914061504, "ts": 1716454222295717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222360229, "dur": 1, "args": { "External id": 65550, "device": 5, "context": 1, "stream": 7, "correlation": 65550, "bytes": 960, "memory bandwidth (GB/s)": 0.5660377358490566 } }, { "ph": "f", "id": 65550, "pid": 5, "tid": 7, "ts": 1716454222360229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222295729, "dur": 6, "args": { "External id": 65550, "cbid": 51, "correlation": 65550 } }, { "ph": "s", "id": 65550, "pid": 76337, "tid": -914061504, "ts": 1716454222295729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222360233, "dur": 1361, "args": { "External id": 65551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65551, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65551, "pid": 5, "tid": 7, "ts": 1716454222360233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295736, "dur": 6, "args": { "External id": 65551, "cbid": 211, "correlation": 65551 } }, { "ph": "s", "id": 65551, "pid": 76337, "tid": -914061504, "ts": 1716454222295736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222361595, "dur": 13, "args": { "External id": 65553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65553, "pid": 5, "tid": 7, "ts": 1716454222361595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295746, "dur": 5, "args": { "External id": 65553, "cbid": 211, "correlation": 65553 } }, { "ph": "s", "id": 65553, "pid": 76337, "tid": -914061504, "ts": 1716454222295746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222361610, "dur": 16, "args": { "External id": 65559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65559, "pid": 5, "tid": 7, "ts": 1716454222361610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295774, "dur": 9, "args": { "External id": 65559, "cbid": 211, "correlation": 65559 } }, { "ph": "s", "id": 65559, "pid": 76337, "tid": -914061504, "ts": 1716454222295774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222361627, "dur": 4, "args": { "External id": 65567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65567, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 65567, "pid": 5, "tid": 7, "ts": 1716454222361627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295818, "dur": 9, "args": { "External id": 65567, "cbid": 211, "correlation": 65567 } }, { "ph": "s", "id": 65567, "pid": 76337, "tid": -914061504, "ts": 1716454222295818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222295882, "dur": 1, "args": { "External id": 65583, "cbid": 251, "correlation": 65583 } }, { "ph": "f", "id": 65583, "pid": 76337, "tid": -914061504, "ts": 1716454222295882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222295888, "dur": 0, "args": { "External id": 65585, "cbid": 251, "correlation": 65585 } }, { "ph": "f", "id": 65585, "pid": 76337, "tid": -914061504, "ts": 1716454222295888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222361632, "dur": 13, "args": { "External id": 65586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65586, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65586, "pid": 5, "tid": 7, "ts": 1716454222361632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295890, "dur": 11, "args": { "External id": 65586, "cbid": 211, "correlation": 65586 } }, { "ph": "s", "id": 65586, "pid": 76337, "tid": -914061504, "ts": 1716454222295890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222361647, "dur": 5, "args": { "External id": 65588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65588, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65588, "pid": 5, "tid": 7, "ts": 1716454222361647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295903, "dur": 5, "args": { "External id": 65588, "cbid": 211, "correlation": 65588 } }, { "ph": "s", "id": 65588, "pid": 76337, "tid": -914061504, "ts": 1716454222295903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222361653, "dur": 18, "args": { "External id": 65598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65598, "pid": 5, "tid": 7, "ts": 1716454222361653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222295964, "dur": 20, "args": { "External id": 65598, "cbid": 211, "correlation": 65598 } }, { "ph": "s", "id": 65598, "pid": 76337, "tid": -914061504, "ts": 1716454222295964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222361673, "dur": 19, "args": { "External id": 65618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65618, "pid": 5, "tid": 7, "ts": 1716454222361673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296039, "dur": 11, "args": { "External id": 65618, "cbid": 211, "correlation": 65618 } }, { "ph": "s", "id": 65618, "pid": 76337, "tid": -914061504, "ts": 1716454222296039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222361694, "dur": 4, "args": { "External id": 65630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 65630, "pid": 5, "tid": 7, "ts": 1716454222361694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296060, "dur": 6, "args": { "External id": 65630, "cbid": 211, "correlation": 65630 } }, { "ph": "s", "id": 65630, "pid": 76337, "tid": -914061504, "ts": 1716454222296060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222361699, "dur": 18, "args": { "External id": 65633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65633, "pid": 5, "tid": 7, "ts": 1716454222361699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296078, "dur": 6, "args": { "External id": 65633, "cbid": 211, "correlation": 65633 } }, { "ph": "s", "id": 65633, "pid": 76337, "tid": -914061504, "ts": 1716454222296078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222361719, "dur": 13, "args": { "External id": 65642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65642, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65642, "pid": 5, "tid": 7, "ts": 1716454222361719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296118, "dur": 9, "args": { "External id": 65642, "cbid": 211, "correlation": 65642 } }, { "ph": "s", "id": 65642, "pid": 76337, "tid": -914061504, "ts": 1716454222296118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222296181, "dur": 0, "args": { "External id": 65652, "cbid": 317, "correlation": 65652 } }, { "ph": "f", "id": 65652, "pid": 76337, "tid": -914061504, "ts": 1716454222296181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222296182, "dur": 0, "args": { "External id": 65653, "cbid": 203, "correlation": 65653 } }, { "ph": "f", "id": 65653, "pid": 76337, "tid": -914061504, "ts": 1716454222296182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222296182, "dur": 0, "args": { "External id": 65654, "cbid": 205, "correlation": 65654 } }, { "ph": "f", "id": 65654, "pid": 76337, "tid": -914061504, "ts": 1716454222296182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222361733, "dur": 12, "args": { "External id": 65658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65658, "pid": 5, "tid": 7, "ts": 1716454222361733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296196, "dur": 12, "args": { "External id": 65658, "cbid": 211, "correlation": 65658 } }, { "ph": "s", "id": 65658, "pid": 76337, "tid": -914061504, "ts": 1716454222296196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222361746, "dur": 177, "args": { "External id": 65660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65660, "pid": 5, "tid": 7, "ts": 1716454222361746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296210, "dur": 5, "args": { "External id": 65660, "cbid": 211, "correlation": 65660 } }, { "ph": "s", "id": 65660, "pid": 76337, "tid": -914061504, "ts": 1716454222296210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222361926, "dur": 1, "args": { "External id": 65662, "device": 5, "context": 1, "stream": 7, "correlation": 65662, "bytes": 960, "memory bandwidth (GB/s)": 0.5660377358490566 } }, { "ph": "f", "id": 65662, "pid": 5, "tid": 7, "ts": 1716454222361926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222296221, "dur": 10, "args": { "External id": 65662, "cbid": 51, "correlation": 65662 } }, { "ph": "s", "id": 65662, "pid": 76337, "tid": -914061504, "ts": 1716454222296221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222361930, "dur": 711, "args": { "External id": 65663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65663, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65663, "pid": 5, "tid": 7, "ts": 1716454222361930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296232, "dur": 6, "args": { "External id": 65663, "cbid": 211, "correlation": 65663 } }, { "ph": "s", "id": 65663, "pid": 76337, "tid": -914061504, "ts": 1716454222296232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222362642, "dur": 12, "args": { "External id": 65665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65665, "pid": 5, "tid": 7, "ts": 1716454222362642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296242, "dur": 6, "args": { "External id": 65665, "cbid": 211, "correlation": 65665 } }, { "ph": "s", "id": 65665, "pid": 76337, "tid": -914061504, "ts": 1716454222296242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222362656, "dur": 16, "args": { "External id": 65671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65671, "pid": 5, "tid": 7, "ts": 1716454222362656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296271, "dur": 9, "args": { "External id": 65671, "cbid": 211, "correlation": 65671 } }, { "ph": "s", "id": 65671, "pid": 76337, "tid": -914061504, "ts": 1716454222296271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222296330, "dur": 0, "args": { "External id": 65681, "cbid": 317, "correlation": 65681 } }, { "ph": "f", "id": 65681, "pid": 76337, "tid": -914061504, "ts": 1716454222296330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222296331, "dur": 0, "args": { "External id": 65682, "cbid": 203, "correlation": 65682 } }, { "ph": "f", "id": 65682, "pid": 76337, "tid": -914061504, "ts": 1716454222296331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222296331, "dur": 0, "args": { "External id": 65683, "cbid": 205, "correlation": 65683 } }, { "ph": "f", "id": 65683, "pid": 76337, "tid": -914061504, "ts": 1716454222296331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222362674, "dur": 23, "args": { "External id": 65687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65687, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65687, "pid": 5, "tid": 7, "ts": 1716454222362674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296356, "dur": 12, "args": { "External id": 65687, "cbid": 211, "correlation": 65687 } }, { "ph": "s", "id": 65687, "pid": 76337, "tid": -914061504, "ts": 1716454222296356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222362698, "dur": 4, "args": { "External id": 65689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 65689, "pid": 5, "tid": 7, "ts": 1716454222362698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296377, "dur": 6, "args": { "External id": 65689, "cbid": 211, "correlation": 65689 } }, { "ph": "s", "id": 65689, "pid": 76337, "tid": -914061504, "ts": 1716454222296377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222296387, "dur": 0, "args": { "External id": 65690, "cbid": 51, "correlation": 65690 } }, { "ph": "s", "id": 65690, "pid": 76337, "tid": -914061504, "ts": 1716454222296387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222362703, "dur": 193, "args": { "External id": 65691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65691, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 65691, "pid": 5, "tid": 7, "ts": 1716454222362703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296388, "dur": 7, "args": { "External id": 65691, "cbid": 211, "correlation": 65691 } }, { "ph": "s", "id": 65691, "pid": 76337, "tid": -914061504, "ts": 1716454222296388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222362898, "dur": 17, "args": { "External id": 65696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65696, "pid": 5, "tid": 7, "ts": 1716454222362898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296415, "dur": 9, "args": { "External id": 65696, "cbid": 211, "correlation": 65696 } }, { "ph": "s", "id": 65696, "pid": 76337, "tid": -914061504, "ts": 1716454222296415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222362917, "dur": 13, "args": { "External id": 65704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65704, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65704, "pid": 5, "tid": 7, "ts": 1716454222362917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296445, "dur": 8, "args": { "External id": 65704, "cbid": 211, "correlation": 65704 } }, { "ph": "s", "id": 65704, "pid": 76337, "tid": -914061504, "ts": 1716454222296445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222362931, "dur": 11, "args": { "External id": 65712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65712, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65712, "pid": 5, "tid": 7, "ts": 1716454222362931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296474, "dur": 8, "args": { "External id": 65712, "cbid": 211, "correlation": 65712 } }, { "ph": "s", "id": 65712, "pid": 76337, "tid": -914061504, "ts": 1716454222296474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222362944, "dur": 21, "args": { "External id": 65732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65732, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 65732, "pid": 5, "tid": 7, "ts": 1716454222362944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296561, "dur": 12, "args": { "External id": 65732, "cbid": 211, "correlation": 65732 } }, { "ph": "s", "id": 65732, "pid": 76337, "tid": -914061504, "ts": 1716454222296561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222362966, "dur": 5, "args": { "External id": 65744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65744, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 65744, "pid": 5, "tid": 7, "ts": 1716454222362966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296583, "dur": 6, "args": { "External id": 65744, "cbid": 211, "correlation": 65744 } }, { "ph": "s", "id": 65744, "pid": 76337, "tid": -914061504, "ts": 1716454222296583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222362972, "dur": 18, "args": { "External id": 65747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65747, "pid": 5, "tid": 7, "ts": 1716454222362972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296601, "dur": 6, "args": { "External id": 65747, "cbid": 211, "correlation": 65747 } }, { "ph": "s", "id": 65747, "pid": 76337, "tid": -914061504, "ts": 1716454222296601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222296659, "dur": 0, "args": { "External id": 65758, "cbid": 317, "correlation": 65758 } }, { "ph": "f", "id": 65758, "pid": 76337, "tid": -914061504, "ts": 1716454222296659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222296660, "dur": 0, "args": { "External id": 65759, "cbid": 203, "correlation": 65759 } }, { "ph": "f", "id": 65759, "pid": 76337, "tid": -914061504, "ts": 1716454222296660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222296660, "dur": 0, "args": { "External id": 65760, "cbid": 205, "correlation": 65760 } }, { "ph": "f", "id": 65760, "pid": 76337, "tid": -914061504, "ts": 1716454222296660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222362992, "dur": 13, "args": { "External id": 65764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65764, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65764, "pid": 5, "tid": 7, "ts": 1716454222362992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296673, "dur": 11, "args": { "External id": 65764, "cbid": 211, "correlation": 65764 } }, { "ph": "s", "id": 65764, "pid": 76337, "tid": -914061504, "ts": 1716454222296673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222363006, "dur": 4, "args": { "External id": 65766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 65766, "pid": 5, "tid": 7, "ts": 1716454222363006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296689, "dur": 6, "args": { "External id": 65766, "cbid": 211, "correlation": 65766 } }, { "ph": "s", "id": 65766, "pid": 76337, "tid": -914061504, "ts": 1716454222296689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222296697, "dur": 0, "args": { "External id": 65767, "cbid": 51, "correlation": 65767 } }, { "ph": "s", "id": 65767, "pid": 76337, "tid": -914061504, "ts": 1716454222296697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222363011, "dur": 100, "args": { "External id": 65768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65768, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 65768, "pid": 5, "tid": 7, "ts": 1716454222363011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296698, "dur": 6, "args": { "External id": 65768, "cbid": 211, "correlation": 65768 } }, { "ph": "s", "id": 65768, "pid": 76337, "tid": -914061504, "ts": 1716454222296698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222363112, "dur": 17, "args": { "External id": 65773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65773, "pid": 5, "tid": 7, "ts": 1716454222363112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296726, "dur": 8, "args": { "External id": 65773, "cbid": 211, "correlation": 65773 } }, { "ph": "s", "id": 65773, "pid": 76337, "tid": -914061504, "ts": 1716454222296726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222363131, "dur": 93, "args": { "External id": 65782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65782, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65782, "pid": 5, "tid": 7, "ts": 1716454222363131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296808, "dur": 15, "args": { "External id": 65782, "cbid": 211, "correlation": 65782 } }, { "ph": "s", "id": 65782, "pid": 76337, "tid": -914061504, "ts": 1716454222296808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222363225, "dur": 32, "args": { "External id": 65804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65804, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65804, "pid": 5, "tid": 7, "ts": 1716454222363225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296871, "dur": 11, "args": { "External id": 65804, "cbid": 211, "correlation": 65804 } }, { "ph": "s", "id": 65804, "pid": 76337, "tid": -914061504, "ts": 1716454222296871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222296966, "dur": 1, "args": { "External id": 65815, "cbid": 251, "correlation": 65815 } }, { "ph": "f", "id": 65815, "pid": 76337, "tid": -914061504, "ts": 1716454222296966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222363258, "dur": 180, "args": { "External id": 65816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65816, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65816, "pid": 5, "tid": 7, "ts": 1716454222363258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222296972, "dur": 21, "args": { "External id": 65816, "cbid": 211, "correlation": 65816 } }, { "ph": "s", "id": 65816, "pid": 76337, "tid": -914061504, "ts": 1716454222296972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297053, "dur": 1, "args": { "External id": 65827, "cbid": 251, "correlation": 65827 } }, { "ph": "f", "id": 65827, "pid": 76337, "tid": -914061504, "ts": 1716454222297053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222363440, "dur": 173, "args": { "External id": 65828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65828, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65828, "pid": 5, "tid": 7, "ts": 1716454222363440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297057, "dur": 11, "args": { "External id": 65828, "cbid": 211, "correlation": 65828 } }, { "ph": "s", "id": 65828, "pid": 76337, "tid": -914061504, "ts": 1716454222297057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297122, "dur": 1, "args": { "External id": 65839, "cbid": 251, "correlation": 65839 } }, { "ph": "f", "id": 65839, "pid": 76337, "tid": -914061504, "ts": 1716454222297122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222363614, "dur": 170, "args": { "External id": 65840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65840, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65840, "pid": 5, "tid": 7, "ts": 1716454222363614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297126, "dur": 11, "args": { "External id": 65840, "cbid": 211, "correlation": 65840 } }, { "ph": "s", "id": 65840, "pid": 76337, "tid": -914061504, "ts": 1716454222297126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222363785, "dur": 361, "args": { "External id": 65865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65865, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65865, "pid": 5, "tid": 7, "ts": 1716454222363785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297212, "dur": 13, "args": { "External id": 65865, "cbid": 211, "correlation": 65865 } }, { "ph": "s", "id": 65865, "pid": 76337, "tid": -914061504, "ts": 1716454222297212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297318, "dur": 1, "args": { "External id": 65883, "cbid": 251, "correlation": 65883 } }, { "ph": "f", "id": 65883, "pid": 76337, "tid": -914061504, "ts": 1716454222297318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222364148, "dur": 182, "args": { "External id": 65885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65885, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65885, "pid": 5, "tid": 7, "ts": 1716454222364148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297324, "dur": 13, "args": { "External id": 65885, "cbid": 211, "correlation": 65885 } }, { "ph": "s", "id": 65885, "pid": 76337, "tid": -914061504, "ts": 1716454222297324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222364330, "dur": 19, "args": { "External id": 65893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65893, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65893, "pid": 5, "tid": 7, "ts": 1716454222364330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297408, "dur": 14, "args": { "External id": 65893, "cbid": 211, "correlation": 65893 } }, { "ph": "s", "id": 65893, "pid": 76337, "tid": -914061504, "ts": 1716454222297408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222364351, "dur": 28, "args": { "External id": 65901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65901, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65901, "pid": 5, "tid": 7, "ts": 1716454222364351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297450, "dur": 9, "args": { "External id": 65901, "cbid": 211, "correlation": 65901 } }, { "ph": "s", "id": 65901, "pid": 76337, "tid": -914061504, "ts": 1716454222297450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222364381, "dur": 19, "args": { "External id": 65912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65912, "pid": 5, "tid": 7, "ts": 1716454222364381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297526, "dur": 12, "args": { "External id": 65912, "cbid": 211, "correlation": 65912 } }, { "ph": "s", "id": 65912, "pid": 76337, "tid": -914061504, "ts": 1716454222297526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222364401, "dur": 17, "args": { "External id": 65934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65934, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 65934, "pid": 5, "tid": 7, "ts": 1716454222364401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297557, "dur": 8, "args": { "External id": 65934, "cbid": 211, "correlation": 65934 } }, { "ph": "s", "id": 65934, "pid": 76337, "tid": -914061504, "ts": 1716454222297557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297644, "dur": 1, "args": { "External id": 65945, "cbid": 251, "correlation": 65945 } }, { "ph": "f", "id": 65945, "pid": 76337, "tid": -914061504, "ts": 1716454222297644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222364420, "dur": 97, "args": { "External id": 65946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65946, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 65946, "pid": 5, "tid": 7, "ts": 1716454222364420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297649, "dur": 13, "args": { "External id": 65946, "cbid": 211, "correlation": 65946 } }, { "ph": "s", "id": 65946, "pid": 76337, "tid": -914061504, "ts": 1716454222297649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297722, "dur": 1, "args": { "External id": 65957, "cbid": 251, "correlation": 65957 } }, { "ph": "f", "id": 65957, "pid": 76337, "tid": -914061504, "ts": 1716454222297722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297726, "dur": 0, "args": { "External id": 65958, "cbid": 251, "correlation": 65958 } }, { "ph": "f", "id": 65958, "pid": 76337, "tid": -914061504, "ts": 1716454222297726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222364518, "dur": 12, "args": { "External id": 65959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65959, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65959, "pid": 5, "tid": 7, "ts": 1716454222364518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297728, "dur": 12, "args": { "External id": 65959, "cbid": 211, "correlation": 65959 } }, { "ph": "s", "id": 65959, "pid": 76337, "tid": -914061504, "ts": 1716454222297728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222364532, "dur": 6, "args": { "External id": 65961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65961, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65961, "pid": 5, "tid": 7, "ts": 1716454222364532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297742, "dur": 6, "args": { "External id": 65961, "cbid": 211, "correlation": 65961 } }, { "ph": "s", "id": 65961, "pid": 76337, "tid": -914061504, "ts": 1716454222297742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297800, "dur": 1, "args": { "External id": 65972, "cbid": 251, "correlation": 65972 } }, { "ph": "f", "id": 65972, "pid": 76337, "tid": -914061504, "ts": 1716454222297800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222297803, "dur": 0, "args": { "External id": 65973, "cbid": 251, "correlation": 65973 } }, { "ph": "f", "id": 65973, "pid": 76337, "tid": -914061504, "ts": 1716454222297803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222364540, "dur": 9, "args": { "External id": 65974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65974, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65974, "pid": 5, "tid": 7, "ts": 1716454222364540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297805, "dur": 12, "args": { "External id": 65974, "cbid": 211, "correlation": 65974 } }, { "ph": "s", "id": 65974, "pid": 76337, "tid": -914061504, "ts": 1716454222297805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222364550, "dur": 4, "args": { "External id": 65976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 65976, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 65976, "pid": 5, "tid": 7, "ts": 1716454222364550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297818, "dur": 6, "args": { "External id": 65976, "cbid": 211, "correlation": 65976 } }, { "ph": "s", "id": 65976, "pid": 76337, "tid": -914061504, "ts": 1716454222297818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222364555, "dur": 60, "args": { "External id": 66001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66001, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66001, "pid": 5, "tid": 7, "ts": 1716454222364555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222297895, "dur": 13, "args": { "External id": 66001, "cbid": 211, "correlation": 66001 } }, { "ph": "s", "id": 66001, "pid": 76337, "tid": -914061504, "ts": 1716454222297895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222298001, "dur": 3, "args": { "External id": 66019, "cbid": 251, "correlation": 66019 } }, { "ph": "f", "id": 66019, "pid": 76337, "tid": -914061504, "ts": 1716454222298001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222364616, "dur": 99, "args": { "External id": 66021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66021, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66021, "pid": 5, "tid": 7, "ts": 1716454222364616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298008, "dur": 14, "args": { "External id": 66021, "cbid": 211, "correlation": 66021 } }, { "ph": "s", "id": 66021, "pid": 76337, "tid": -914061504, "ts": 1716454222298008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222364716, "dur": 10, "args": { "External id": 66029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66029, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66029, "pid": 5, "tid": 7, "ts": 1716454222364716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298083, "dur": 12, "args": { "External id": 66029, "cbid": 211, "correlation": 66029 } }, { "ph": "s", "id": 66029, "pid": 76337, "tid": -914061504, "ts": 1716454222298083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222364728, "dur": 21, "args": { "External id": 66037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66037, "pid": 5, "tid": 7, "ts": 1716454222364728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298125, "dur": 9, "args": { "External id": 66037, "cbid": 211, "correlation": 66037 } }, { "ph": "s", "id": 66037, "pid": 76337, "tid": -914061504, "ts": 1716454222298125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222364750, "dur": 19, "args": { "External id": 66059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66059, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66059, "pid": 5, "tid": 7, "ts": 1716454222364750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298177, "dur": 10, "args": { "External id": 66059, "cbid": 211, "correlation": 66059 } }, { "ph": "s", "id": 66059, "pid": 76337, "tid": -914061504, "ts": 1716454222298177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222298265, "dur": 1, "args": { "External id": 66075, "cbid": 251, "correlation": 66075 } }, { "ph": "f", "id": 66075, "pid": 76337, "tid": -914061504, "ts": 1716454222298265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222298269, "dur": 0, "args": { "External id": 66077, "cbid": 251, "correlation": 66077 } }, { "ph": "f", "id": 66077, "pid": 76337, "tid": -914061504, "ts": 1716454222298269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222364771, "dur": 527, "args": { "External id": 66078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66078, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66078, "pid": 5, "tid": 7, "ts": 1716454222364771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298271, "dur": 13, "args": { "External id": 66078, "cbid": 211, "correlation": 66078 } }, { "ph": "s", "id": 66078, "pid": 76337, "tid": -914061504, "ts": 1716454222298271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222365300, "dur": 71, "args": { "External id": 66086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66086, "pid": 5, "tid": 7, "ts": 1716454222365300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298339, "dur": 13, "args": { "External id": 66086, "cbid": 211, "correlation": 66086 } }, { "ph": "s", "id": 66086, "pid": 76337, "tid": -914061504, "ts": 1716454222298339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222365373, "dur": 69, "args": { "External id": 66094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66094, "pid": 5, "tid": 7, "ts": 1716454222365373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298371, "dur": 8, "args": { "External id": 66094, "cbid": 211, "correlation": 66094 } }, { "ph": "s", "id": 66094, "pid": 76337, "tid": -914061504, "ts": 1716454222298371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222298455, "dur": 1, "args": { "External id": 66110, "cbid": 251, "correlation": 66110 } }, { "ph": "f", "id": 66110, "pid": 76337, "tid": -914061504, "ts": 1716454222298455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222365444, "dur": 1, "args": { "External id": 66112, "device": 5, "context": 1, "stream": 7, "correlation": 66112, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 66112, "pid": 5, "tid": 7, "ts": 1716454222365444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222298460, "dur": 10, "args": { "External id": 66112, "cbid": 51, "correlation": 66112 } }, { "ph": "s", "id": 66112, "pid": 76337, "tid": -914061504, "ts": 1716454222298460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222365447, "dur": 292, "args": { "External id": 66113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66113, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66113, "pid": 5, "tid": 7, "ts": 1716454222365447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298471, "dur": 11, "args": { "External id": 66113, "cbid": 211, "correlation": 66113 } }, { "ph": "s", "id": 66113, "pid": 76337, "tid": -914061504, "ts": 1716454222298471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222365741, "dur": 14, "args": { "External id": 66121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66121, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66121, "pid": 5, "tid": 7, "ts": 1716454222365741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298514, "dur": 10, "args": { "External id": 66121, "cbid": 211, "correlation": 66121 } }, { "ph": "s", "id": 66121, "pid": 76337, "tid": -914061504, "ts": 1716454222298514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222365757, "dur": 42, "args": { "External id": 66132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66132, "pid": 5, "tid": 7, "ts": 1716454222365757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298583, "dur": 12, "args": { "External id": 66132, "cbid": 211, "correlation": 66132 } }, { "ph": "s", "id": 66132, "pid": 76337, "tid": -914061504, "ts": 1716454222298583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222298647, "dur": 0, "args": { "External id": 66144, "cbid": 317, "correlation": 66144 } }, { "ph": "f", "id": 66144, "pid": 76337, "tid": -914061504, "ts": 1716454222298647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222298648, "dur": 0, "args": { "External id": 66145, "cbid": 203, "correlation": 66145 } }, { "ph": "f", "id": 66145, "pid": 76337, "tid": -914061504, "ts": 1716454222298648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222298649, "dur": 0, "args": { "External id": 66146, "cbid": 205, "correlation": 66146 } }, { "ph": "f", "id": 66146, "pid": 76337, "tid": -914061504, "ts": 1716454222298649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222365800, "dur": 14, "args": { "External id": 66150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66150, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66150, "pid": 5, "tid": 7, "ts": 1716454222365800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298664, "dur": 12, "args": { "External id": 66150, "cbid": 211, "correlation": 66150 } }, { "ph": "s", "id": 66150, "pid": 76337, "tid": -914061504, "ts": 1716454222298664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222365815, "dur": 4, "args": { "External id": 66152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66152, "pid": 5, "tid": 7, "ts": 1716454222365815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298681, "dur": 6, "args": { "External id": 66152, "cbid": 211, "correlation": 66152 } }, { "ph": "s", "id": 66152, "pid": 76337, "tid": -914061504, "ts": 1716454222298681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222298690, "dur": 0, "args": { "External id": 66153, "cbid": 51, "correlation": 66153 } }, { "ph": "s", "id": 66153, "pid": 76337, "tid": -914061504, "ts": 1716454222298690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222365820, "dur": 106, "args": { "External id": 66154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66154, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 66154, "pid": 5, "tid": 7, "ts": 1716454222365820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298690, "dur": 5, "args": { "External id": 66154, "cbid": 211, "correlation": 66154 } }, { "ph": "s", "id": 66154, "pid": 76337, "tid": -914061504, "ts": 1716454222298690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222365928, "dur": 18, "args": { "External id": 66159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66159, "pid": 5, "tid": 7, "ts": 1716454222365928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298717, "dur": 8, "args": { "External id": 66159, "cbid": 211, "correlation": 66159 } }, { "ph": "s", "id": 66159, "pid": 76337, "tid": -914061504, "ts": 1716454222298717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222365947, "dur": 12, "args": { "External id": 66167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66167, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66167, "pid": 5, "tid": 7, "ts": 1716454222365947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298749, "dur": 11, "args": { "External id": 66167, "cbid": 211, "correlation": 66167 } }, { "ph": "s", "id": 66167, "pid": 76337, "tid": -914061504, "ts": 1716454222298749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222365960, "dur": 33, "args": { "External id": 66176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66176, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66176, "pid": 5, "tid": 7, "ts": 1716454222365960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298791, "dur": 10, "args": { "External id": 66176, "cbid": 211, "correlation": 66176 } }, { "ph": "s", "id": 66176, "pid": 76337, "tid": -914061504, "ts": 1716454222298791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222365994, "dur": 33, "args": { "External id": 66196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66196, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 66196, "pid": 5, "tid": 7, "ts": 1716454222365994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298862, "dur": 11, "args": { "External id": 66196, "cbid": 211, "correlation": 66196 } }, { "ph": "s", "id": 66196, "pid": 76337, "tid": -914061504, "ts": 1716454222298862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222366029, "dur": 6, "args": { "External id": 66208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66208, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66208, "pid": 5, "tid": 7, "ts": 1716454222366029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298883, "dur": 6, "args": { "External id": 66208, "cbid": 211, "correlation": 66208 } }, { "ph": "s", "id": 66208, "pid": 76337, "tid": -914061504, "ts": 1716454222298883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222366036, "dur": 33, "args": { "External id": 66211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66211, "pid": 5, "tid": 7, "ts": 1716454222366036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298902, "dur": 6, "args": { "External id": 66211, "cbid": 211, "correlation": 66211 } }, { "ph": "s", "id": 66211, "pid": 76337, "tid": -914061504, "ts": 1716454222298902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222366070, "dur": 22, "args": { "External id": 66220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66220, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66220, "pid": 5, "tid": 7, "ts": 1716454222366070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222298940, "dur": 10, "args": { "External id": 66220, "cbid": 211, "correlation": 66220 } }, { "ph": "s", "id": 66220, "pid": 76337, "tid": -914061504, "ts": 1716454222298940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222299001, "dur": 0, "args": { "External id": 66230, "cbid": 317, "correlation": 66230 } }, { "ph": "f", "id": 66230, "pid": 76337, "tid": -914061504, "ts": 1716454222299001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222299001, "dur": 0, "args": { "External id": 66231, "cbid": 203, "correlation": 66231 } }, { "ph": "f", "id": 66231, "pid": 76337, "tid": -914061504, "ts": 1716454222299001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222299002, "dur": 0, "args": { "External id": 66232, "cbid": 205, "correlation": 66232 } }, { "ph": "f", "id": 66232, "pid": 76337, "tid": -914061504, "ts": 1716454222299002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222366094, "dur": 23, "args": { "External id": 66236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66236, "pid": 5, "tid": 7, "ts": 1716454222366094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299017, "dur": 12, "args": { "External id": 66236, "cbid": 211, "correlation": 66236 } }, { "ph": "s", "id": 66236, "pid": 76337, "tid": -914061504, "ts": 1716454222299017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222366118, "dur": 350, "args": { "External id": 66238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66238, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66238, "pid": 5, "tid": 7, "ts": 1716454222366118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299031, "dur": 8, "args": { "External id": 66238, "cbid": 211, "correlation": 66238 } }, { "ph": "s", "id": 66238, "pid": 76337, "tid": -914061504, "ts": 1716454222299031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222366470, "dur": 1, "args": { "External id": 66240, "device": 5, "context": 1, "stream": 7, "correlation": 66240, "bytes": 960, "memory bandwidth (GB/s)": 0.5084745762711864 } }, { "ph": "f", "id": 66240, "pid": 5, "tid": 7, "ts": 1716454222366470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222299046, "dur": 6, "args": { "External id": 66240, "cbid": 51, "correlation": 66240 } }, { "ph": "s", "id": 66240, "pid": 76337, "tid": -914061504, "ts": 1716454222299046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222366474, "dur": 1379, "args": { "External id": 66241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66241, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66241, "pid": 5, "tid": 7, "ts": 1716454222366474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299053, "dur": 7, "args": { "External id": 66241, "cbid": 211, "correlation": 66241 } }, { "ph": "s", "id": 66241, "pid": 76337, "tid": -914061504, "ts": 1716454222299053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222367854, "dur": 14, "args": { "External id": 66243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66243, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66243, "pid": 5, "tid": 7, "ts": 1716454222367854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299065, "dur": 5, "args": { "External id": 66243, "cbid": 211, "correlation": 66243 } }, { "ph": "s", "id": 66243, "pid": 76337, "tid": -914061504, "ts": 1716454222299065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222367869, "dur": 16, "args": { "External id": 66249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66249, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66249, "pid": 5, "tid": 7, "ts": 1716454222367869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299092, "dur": 8, "args": { "External id": 66249, "cbid": 211, "correlation": 66249 } }, { "ph": "s", "id": 66249, "pid": 76337, "tid": -914061504, "ts": 1716454222299092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222367887, "dur": 4, "args": { "External id": 66257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66257, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 66257, "pid": 5, "tid": 7, "ts": 1716454222367887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299137, "dur": 9, "args": { "External id": 66257, "cbid": 211, "correlation": 66257 } }, { "ph": "s", "id": 66257, "pid": 76337, "tid": -914061504, "ts": 1716454222299137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222299204, "dur": 1, "args": { "External id": 66273, "cbid": 251, "correlation": 66273 } }, { "ph": "f", "id": 66273, "pid": 76337, "tid": -914061504, "ts": 1716454222299204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222299209, "dur": 0, "args": { "External id": 66275, "cbid": 251, "correlation": 66275 } }, { "ph": "f", "id": 66275, "pid": 76337, "tid": -914061504, "ts": 1716454222299209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222367892, "dur": 14, "args": { "External id": 66276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66276, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66276, "pid": 5, "tid": 7, "ts": 1716454222367892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299212, "dur": 11, "args": { "External id": 66276, "cbid": 211, "correlation": 66276 } }, { "ph": "s", "id": 66276, "pid": 76337, "tid": -914061504, "ts": 1716454222299212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222367908, "dur": 6, "args": { "External id": 66278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66278, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66278, "pid": 5, "tid": 7, "ts": 1716454222367908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299225, "dur": 5, "args": { "External id": 66278, "cbid": 211, "correlation": 66278 } }, { "ph": "s", "id": 66278, "pid": 76337, "tid": -914061504, "ts": 1716454222299225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222367915, "dur": 17, "args": { "External id": 66288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66288, "pid": 5, "tid": 7, "ts": 1716454222367915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299282, "dur": 12, "args": { "External id": 66288, "cbid": 211, "correlation": 66288 } }, { "ph": "s", "id": 66288, "pid": 76337, "tid": -914061504, "ts": 1716454222299282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222367934, "dur": 19, "args": { "External id": 66308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66308, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 66308, "pid": 5, "tid": 7, "ts": 1716454222367934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299351, "dur": 11, "args": { "External id": 66308, "cbid": 211, "correlation": 66308 } }, { "ph": "s", "id": 66308, "pid": 76337, "tid": -914061504, "ts": 1716454222299351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222367954, "dur": 4, "args": { "External id": 66320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66320, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 66320, "pid": 5, "tid": 7, "ts": 1716454222367954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299372, "dur": 7, "args": { "External id": 66320, "cbid": 211, "correlation": 66320 } }, { "ph": "s", "id": 66320, "pid": 76337, "tid": -914061504, "ts": 1716454222299372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222367960, "dur": 18, "args": { "External id": 66323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66323, "pid": 5, "tid": 7, "ts": 1716454222367960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299391, "dur": 6, "args": { "External id": 66323, "cbid": 211, "correlation": 66323 } }, { "ph": "s", "id": 66323, "pid": 76337, "tid": -914061504, "ts": 1716454222299391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222367979, "dur": 12, "args": { "External id": 66332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66332, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66332, "pid": 5, "tid": 7, "ts": 1716454222367979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299431, "dur": 10, "args": { "External id": 66332, "cbid": 211, "correlation": 66332 } }, { "ph": "s", "id": 66332, "pid": 76337, "tid": -914061504, "ts": 1716454222299431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222299493, "dur": 0, "args": { "External id": 66342, "cbid": 317, "correlation": 66342 } }, { "ph": "f", "id": 66342, "pid": 76337, "tid": -914061504, "ts": 1716454222299493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222299494, "dur": 0, "args": { "External id": 66343, "cbid": 203, "correlation": 66343 } }, { "ph": "f", "id": 66343, "pid": 76337, "tid": -914061504, "ts": 1716454222299494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222299495, "dur": 0, "args": { "External id": 66344, "cbid": 205, "correlation": 66344 } }, { "ph": "f", "id": 66344, "pid": 76337, "tid": -914061504, "ts": 1716454222299495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222367993, "dur": 12, "args": { "External id": 66348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66348, "pid": 5, "tid": 7, "ts": 1716454222367993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299509, "dur": 12, "args": { "External id": 66348, "cbid": 211, "correlation": 66348 } }, { "ph": "s", "id": 66348, "pid": 76337, "tid": -914061504, "ts": 1716454222299509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222368006, "dur": 177, "args": { "External id": 66350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66350, "pid": 5, "tid": 7, "ts": 1716454222368006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299523, "dur": 5, "args": { "External id": 66350, "cbid": 211, "correlation": 66350 } }, { "ph": "s", "id": 66350, "pid": 76337, "tid": -914061504, "ts": 1716454222299523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222368186, "dur": 1, "args": { "External id": 66352, "device": 5, "context": 1, "stream": 7, "correlation": 66352, "bytes": 960, "memory bandwidth (GB/s)": 0.5555555555555556 } }, { "ph": "f", "id": 66352, "pid": 5, "tid": 7, "ts": 1716454222368186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222299533, "dur": 7, "args": { "External id": 66352, "cbid": 51, "correlation": 66352 } }, { "ph": "s", "id": 66352, "pid": 76337, "tid": -914061504, "ts": 1716454222299533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222368189, "dur": 710, "args": { "External id": 66353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66353, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66353, "pid": 5, "tid": 7, "ts": 1716454222368189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299542, "dur": 6, "args": { "External id": 66353, "cbid": 211, "correlation": 66353 } }, { "ph": "s", "id": 66353, "pid": 76337, "tid": -914061504, "ts": 1716454222299542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222368901, "dur": 13, "args": { "External id": 66355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66355, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66355, "pid": 5, "tid": 7, "ts": 1716454222368901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299552, "dur": 5, "args": { "External id": 66355, "cbid": 211, "correlation": 66355 } }, { "ph": "s", "id": 66355, "pid": 76337, "tid": -914061504, "ts": 1716454222299552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222368915, "dur": 17, "args": { "External id": 66361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66361, "pid": 5, "tid": 7, "ts": 1716454222368915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299580, "dur": 9, "args": { "External id": 66361, "cbid": 211, "correlation": 66361 } }, { "ph": "s", "id": 66361, "pid": 76337, "tid": -914061504, "ts": 1716454222299580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222299642, "dur": 0, "args": { "External id": 66371, "cbid": 317, "correlation": 66371 } }, { "ph": "f", "id": 66371, "pid": 76337, "tid": -914061504, "ts": 1716454222299642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222299643, "dur": 0, "args": { "External id": 66372, "cbid": 203, "correlation": 66372 } }, { "ph": "f", "id": 66372, "pid": 76337, "tid": -914061504, "ts": 1716454222299643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222299644, "dur": 0, "args": { "External id": 66373, "cbid": 205, "correlation": 66373 } }, { "ph": "f", "id": 66373, "pid": 76337, "tid": -914061504, "ts": 1716454222299644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222368933, "dur": 21, "args": { "External id": 66377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66377, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66377, "pid": 5, "tid": 7, "ts": 1716454222368933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299655, "dur": 12, "args": { "External id": 66377, "cbid": 211, "correlation": 66377 } }, { "ph": "s", "id": 66377, "pid": 76337, "tid": -914061504, "ts": 1716454222299655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222368956, "dur": 4, "args": { "External id": 66379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66379, "pid": 5, "tid": 7, "ts": 1716454222368956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299671, "dur": 6, "args": { "External id": 66379, "cbid": 211, "correlation": 66379 } }, { "ph": "s", "id": 66379, "pid": 76337, "tid": -914061504, "ts": 1716454222299671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222299679, "dur": 0, "args": { "External id": 66380, "cbid": 51, "correlation": 66380 } }, { "ph": "s", "id": 66380, "pid": 76337, "tid": -914061504, "ts": 1716454222299679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222368962, "dur": 190, "args": { "External id": 66381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66381, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 66381, "pid": 5, "tid": 7, "ts": 1716454222368962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299680, "dur": 5, "args": { "External id": 66381, "cbid": 211, "correlation": 66381 } }, { "ph": "s", "id": 66381, "pid": 76337, "tid": -914061504, "ts": 1716454222299680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222369153, "dur": 17, "args": { "External id": 66386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66386, "pid": 5, "tid": 7, "ts": 1716454222369153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299706, "dur": 8, "args": { "External id": 66386, "cbid": 211, "correlation": 66386 } }, { "ph": "s", "id": 66386, "pid": 76337, "tid": -914061504, "ts": 1716454222299706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222369172, "dur": 14, "args": { "External id": 66394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66394, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66394, "pid": 5, "tid": 7, "ts": 1716454222369172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299734, "dur": 8, "args": { "External id": 66394, "cbid": 211, "correlation": 66394 } }, { "ph": "s", "id": 66394, "pid": 76337, "tid": -914061504, "ts": 1716454222299734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222369187, "dur": 11, "args": { "External id": 66402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66402, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66402, "pid": 5, "tid": 7, "ts": 1716454222369187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299763, "dur": 8, "args": { "External id": 66402, "cbid": 211, "correlation": 66402 } }, { "ph": "s", "id": 66402, "pid": 76337, "tid": -914061504, "ts": 1716454222299763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222369199, "dur": 20, "args": { "External id": 66422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66422, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 66422, "pid": 5, "tid": 7, "ts": 1716454222369199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299844, "dur": 13, "args": { "External id": 66422, "cbid": 211, "correlation": 66422 } }, { "ph": "s", "id": 66422, "pid": 76337, "tid": -914061504, "ts": 1716454222299844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222369221, "dur": 5, "args": { "External id": 66434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66434, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 66434, "pid": 5, "tid": 7, "ts": 1716454222369221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299867, "dur": 9, "args": { "External id": 66434, "cbid": 211, "correlation": 66434 } }, { "ph": "s", "id": 66434, "pid": 76337, "tid": -914061504, "ts": 1716454222299867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222369227, "dur": 18, "args": { "External id": 66437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66437, "pid": 5, "tid": 7, "ts": 1716454222369227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299887, "dur": 7, "args": { "External id": 66437, "cbid": 211, "correlation": 66437 } }, { "ph": "s", "id": 66437, "pid": 76337, "tid": -914061504, "ts": 1716454222299887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222299957, "dur": 0, "args": { "External id": 66448, "cbid": 317, "correlation": 66448 } }, { "ph": "f", "id": 66448, "pid": 76337, "tid": -914061504, "ts": 1716454222299957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222299958, "dur": 0, "args": { "External id": 66449, "cbid": 203, "correlation": 66449 } }, { "ph": "f", "id": 66449, "pid": 76337, "tid": -914061504, "ts": 1716454222299958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222299959, "dur": 0, "args": { "External id": 66450, "cbid": 205, "correlation": 66450 } }, { "ph": "f", "id": 66450, "pid": 76337, "tid": -914061504, "ts": 1716454222299959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222369246, "dur": 11, "args": { "External id": 66454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66454, "pid": 5, "tid": 7, "ts": 1716454222369246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222299985, "dur": 17, "args": { "External id": 66454, "cbid": 211, "correlation": 66454 } }, { "ph": "s", "id": 66454, "pid": 76337, "tid": -914061504, "ts": 1716454222299985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222369259, "dur": 4, "args": { "External id": 66456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66456, "pid": 5, "tid": 7, "ts": 1716454222369259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300008, "dur": 10, "args": { "External id": 66456, "cbid": 211, "correlation": 66456 } }, { "ph": "s", "id": 66456, "pid": 76337, "tid": -914061504, "ts": 1716454222300008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222300022, "dur": 0, "args": { "External id": 66457, "cbid": 51, "correlation": 66457 } }, { "ph": "s", "id": 66457, "pid": 76337, "tid": -914061504, "ts": 1716454222300022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222369264, "dur": 100, "args": { "External id": 66458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66458, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 66458, "pid": 5, "tid": 7, "ts": 1716454222369264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300023, "dur": 7, "args": { "External id": 66458, "cbid": 211, "correlation": 66458 } }, { "ph": "s", "id": 66458, "pid": 76337, "tid": -914061504, "ts": 1716454222300023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222369365, "dur": 18, "args": { "External id": 66463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66463, "pid": 5, "tid": 7, "ts": 1716454222369365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300060, "dur": 15, "args": { "External id": 66463, "cbid": 211, "correlation": 66463 } }, { "ph": "s", "id": 66463, "pid": 76337, "tid": -914061504, "ts": 1716454222300060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222369384, "dur": 92, "args": { "External id": 66472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66472, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66472, "pid": 5, "tid": 7, "ts": 1716454222369384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300215, "dur": 17, "args": { "External id": 66472, "cbid": 211, "correlation": 66472 } }, { "ph": "s", "id": 66472, "pid": 76337, "tid": -914061504, "ts": 1716454222300215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222369478, "dur": 33, "args": { "External id": 66494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66494, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66494, "pid": 5, "tid": 7, "ts": 1716454222369478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300281, "dur": 10, "args": { "External id": 66494, "cbid": 211, "correlation": 66494 } }, { "ph": "s", "id": 66494, "pid": 76337, "tid": -914061504, "ts": 1716454222300281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222300375, "dur": 5, "args": { "External id": 66505, "cbid": 251, "correlation": 66505 } }, { "ph": "f", "id": 66505, "pid": 76337, "tid": -914061504, "ts": 1716454222300375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222369512, "dur": 178, "args": { "External id": 66506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66506, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66506, "pid": 5, "tid": 7, "ts": 1716454222369512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300384, "dur": 13, "args": { "External id": 66506, "cbid": 211, "correlation": 66506 } }, { "ph": "s", "id": 66506, "pid": 76337, "tid": -914061504, "ts": 1716454222300384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222300455, "dur": 1, "args": { "External id": 66517, "cbid": 251, "correlation": 66517 } }, { "ph": "f", "id": 66517, "pid": 76337, "tid": -914061504, "ts": 1716454222300455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222369691, "dur": 175, "args": { "External id": 66518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66518, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66518, "pid": 5, "tid": 7, "ts": 1716454222369691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300459, "dur": 12, "args": { "External id": 66518, "cbid": 211, "correlation": 66518 } }, { "ph": "s", "id": 66518, "pid": 76337, "tid": -914061504, "ts": 1716454222300459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222300524, "dur": 1, "args": { "External id": 66529, "cbid": 251, "correlation": 66529 } }, { "ph": "f", "id": 66529, "pid": 76337, "tid": -914061504, "ts": 1716454222300524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222369867, "dur": 171, "args": { "External id": 66530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66530, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66530, "pid": 5, "tid": 7, "ts": 1716454222369867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300528, "dur": 11, "args": { "External id": 66530, "cbid": 211, "correlation": 66530 } }, { "ph": "s", "id": 66530, "pid": 76337, "tid": -914061504, "ts": 1716454222300528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222370039, "dur": 364, "args": { "External id": 66555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66555, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66555, "pid": 5, "tid": 7, "ts": 1716454222370039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300628, "dur": 13, "args": { "External id": 66555, "cbid": 211, "correlation": 66555 } }, { "ph": "s", "id": 66555, "pid": 76337, "tid": -914061504, "ts": 1716454222300628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222300735, "dur": 1, "args": { "External id": 66573, "cbid": 251, "correlation": 66573 } }, { "ph": "f", "id": 66573, "pid": 76337, "tid": -914061504, "ts": 1716454222300735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222370405, "dur": 180, "args": { "External id": 66575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66575, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66575, "pid": 5, "tid": 7, "ts": 1716454222370405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300740, "dur": 13, "args": { "External id": 66575, "cbid": 211, "correlation": 66575 } }, { "ph": "s", "id": 66575, "pid": 76337, "tid": -914061504, "ts": 1716454222300740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222370587, "dur": 20, "args": { "External id": 66583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66583, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66583, "pid": 5, "tid": 7, "ts": 1716454222370587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300813, "dur": 15, "args": { "External id": 66583, "cbid": 211, "correlation": 66583 } }, { "ph": "s", "id": 66583, "pid": 76337, "tid": -914061504, "ts": 1716454222300813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222370608, "dur": 28, "args": { "External id": 66591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66591, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66591, "pid": 5, "tid": 7, "ts": 1716454222370608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300857, "dur": 9, "args": { "External id": 66591, "cbid": 211, "correlation": 66591 } }, { "ph": "s", "id": 66591, "pid": 76337, "tid": -914061504, "ts": 1716454222300857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222370637, "dur": 20, "args": { "External id": 66602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66602, "pid": 5, "tid": 7, "ts": 1716454222370637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300931, "dur": 13, "args": { "External id": 66602, "cbid": 211, "correlation": 66602 } }, { "ph": "s", "id": 66602, "pid": 76337, "tid": -914061504, "ts": 1716454222300931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222370659, "dur": 18, "args": { "External id": 66624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66624, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66624, "pid": 5, "tid": 7, "ts": 1716454222370659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222300963, "dur": 7, "args": { "External id": 66624, "cbid": 211, "correlation": 66624 } }, { "ph": "s", "id": 66624, "pid": 76337, "tid": -914061504, "ts": 1716454222300963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301058, "dur": 2, "args": { "External id": 66635, "cbid": 251, "correlation": 66635 } }, { "ph": "f", "id": 66635, "pid": 76337, "tid": -914061504, "ts": 1716454222301058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222370678, "dur": 98, "args": { "External id": 66636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66636, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66636, "pid": 5, "tid": 7, "ts": 1716454222370678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301063, "dur": 14, "args": { "External id": 66636, "cbid": 211, "correlation": 66636 } }, { "ph": "s", "id": 66636, "pid": 76337, "tid": -914061504, "ts": 1716454222301063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301134, "dur": 1, "args": { "External id": 66647, "cbid": 251, "correlation": 66647 } }, { "ph": "f", "id": 66647, "pid": 76337, "tid": -914061504, "ts": 1716454222301134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301138, "dur": 0, "args": { "External id": 66648, "cbid": 251, "correlation": 66648 } }, { "ph": "f", "id": 66648, "pid": 76337, "tid": -914061504, "ts": 1716454222301138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222370778, "dur": 13, "args": { "External id": 66649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66649, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66649, "pid": 5, "tid": 7, "ts": 1716454222370778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301140, "dur": 13, "args": { "External id": 66649, "cbid": 211, "correlation": 66649 } }, { "ph": "s", "id": 66649, "pid": 76337, "tid": -914061504, "ts": 1716454222301140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222370792, "dur": 6, "args": { "External id": 66651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66651, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66651, "pid": 5, "tid": 7, "ts": 1716454222370792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301154, "dur": 6, "args": { "External id": 66651, "cbid": 211, "correlation": 66651 } }, { "ph": "s", "id": 66651, "pid": 76337, "tid": -914061504, "ts": 1716454222301154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301214, "dur": 1, "args": { "External id": 66662, "cbid": 251, "correlation": 66662 } }, { "ph": "f", "id": 66662, "pid": 76337, "tid": -914061504, "ts": 1716454222301214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301217, "dur": 0, "args": { "External id": 66663, "cbid": 251, "correlation": 66663 } }, { "ph": "f", "id": 66663, "pid": 76337, "tid": -914061504, "ts": 1716454222301217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222370800, "dur": 9, "args": { "External id": 66664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66664, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66664, "pid": 5, "tid": 7, "ts": 1716454222370800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301219, "dur": 11, "args": { "External id": 66664, "cbid": 211, "correlation": 66664 } }, { "ph": "s", "id": 66664, "pid": 76337, "tid": -914061504, "ts": 1716454222301219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222370811, "dur": 4, "args": { "External id": 66666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66666, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66666, "pid": 5, "tid": 7, "ts": 1716454222370811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301232, "dur": 5, "args": { "External id": 66666, "cbid": 211, "correlation": 66666 } }, { "ph": "s", "id": 66666, "pid": 76337, "tid": -914061504, "ts": 1716454222301232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222370816, "dur": 60, "args": { "External id": 66691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66691, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66691, "pid": 5, "tid": 7, "ts": 1716454222370816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301309, "dur": 12, "args": { "External id": 66691, "cbid": 211, "correlation": 66691 } }, { "ph": "s", "id": 66691, "pid": 76337, "tid": -914061504, "ts": 1716454222301309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301406, "dur": 1, "args": { "External id": 66709, "cbid": 251, "correlation": 66709 } }, { "ph": "f", "id": 66709, "pid": 76337, "tid": -914061504, "ts": 1716454222301406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222370877, "dur": 100, "args": { "External id": 66711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66711, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66711, "pid": 5, "tid": 7, "ts": 1716454222370877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301413, "dur": 14, "args": { "External id": 66711, "cbid": 211, "correlation": 66711 } }, { "ph": "s", "id": 66711, "pid": 76337, "tid": -914061504, "ts": 1716454222301413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222370978, "dur": 10, "args": { "External id": 66719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66719, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66719, "pid": 5, "tid": 7, "ts": 1716454222370978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301483, "dur": 12, "args": { "External id": 66719, "cbid": 211, "correlation": 66719 } }, { "ph": "s", "id": 66719, "pid": 76337, "tid": -914061504, "ts": 1716454222301483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222370989, "dur": 22, "args": { "External id": 66727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66727, "pid": 5, "tid": 7, "ts": 1716454222370989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301525, "dur": 9, "args": { "External id": 66727, "cbid": 211, "correlation": 66727 } }, { "ph": "s", "id": 66727, "pid": 76337, "tid": -914061504, "ts": 1716454222301525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222371013, "dur": 20, "args": { "External id": 66749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66749, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66749, "pid": 5, "tid": 7, "ts": 1716454222371013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301580, "dur": 10, "args": { "External id": 66749, "cbid": 211, "correlation": 66749 } }, { "ph": "s", "id": 66749, "pid": 76337, "tid": -914061504, "ts": 1716454222301580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301667, "dur": 1, "args": { "External id": 66765, "cbid": 251, "correlation": 66765 } }, { "ph": "f", "id": 66765, "pid": 76337, "tid": -914061504, "ts": 1716454222301667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301672, "dur": 0, "args": { "External id": 66767, "cbid": 251, "correlation": 66767 } }, { "ph": "f", "id": 66767, "pid": 76337, "tid": -914061504, "ts": 1716454222301672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222371034, "dur": 531, "args": { "External id": 66768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66768, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66768, "pid": 5, "tid": 7, "ts": 1716454222371034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301674, "dur": 13, "args": { "External id": 66768, "cbid": 211, "correlation": 66768 } }, { "ph": "s", "id": 66768, "pid": 76337, "tid": -914061504, "ts": 1716454222301674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222371566, "dur": 71, "args": { "External id": 66776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66776, "pid": 5, "tid": 7, "ts": 1716454222371566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301742, "dur": 13, "args": { "External id": 66776, "cbid": 211, "correlation": 66776 } }, { "ph": "s", "id": 66776, "pid": 76337, "tid": -914061504, "ts": 1716454222301742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222371639, "dur": 68, "args": { "External id": 66784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66784, "pid": 5, "tid": 7, "ts": 1716454222371639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301774, "dur": 8, "args": { "External id": 66784, "cbid": 211, "correlation": 66784 } }, { "ph": "s", "id": 66784, "pid": 76337, "tid": -914061504, "ts": 1716454222301774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222301855, "dur": 1, "args": { "External id": 66800, "cbid": 251, "correlation": 66800 } }, { "ph": "f", "id": 66800, "pid": 76337, "tid": -914061504, "ts": 1716454222301855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222371709, "dur": 1, "args": { "External id": 66802, "device": 5, "context": 1, "stream": 7, "correlation": 66802, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 66802, "pid": 5, "tid": 7, "ts": 1716454222371709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222301860, "dur": 10, "args": { "External id": 66802, "cbid": 51, "correlation": 66802 } }, { "ph": "s", "id": 66802, "pid": 76337, "tid": -914061504, "ts": 1716454222301860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222371712, "dur": 290, "args": { "External id": 66803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66803, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66803, "pid": 5, "tid": 7, "ts": 1716454222371712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301872, "dur": 11, "args": { "External id": 66803, "cbid": 211, "correlation": 66803 } }, { "ph": "s", "id": 66803, "pid": 76337, "tid": -914061504, "ts": 1716454222301872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222372004, "dur": 15, "args": { "External id": 66811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66811, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66811, "pid": 5, "tid": 7, "ts": 1716454222372004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301914, "dur": 13, "args": { "External id": 66811, "cbid": 211, "correlation": 66811 } }, { "ph": "s", "id": 66811, "pid": 76337, "tid": -914061504, "ts": 1716454222301914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222372020, "dur": 42, "args": { "External id": 66822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66822, "pid": 5, "tid": 7, "ts": 1716454222372020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222301995, "dur": 14, "args": { "External id": 66822, "cbid": 211, "correlation": 66822 } }, { "ph": "s", "id": 66822, "pid": 76337, "tid": -914061504, "ts": 1716454222301995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222302064, "dur": 0, "args": { "External id": 66834, "cbid": 317, "correlation": 66834 } }, { "ph": "f", "id": 66834, "pid": 76337, "tid": -914061504, "ts": 1716454222302064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222302065, "dur": 0, "args": { "External id": 66835, "cbid": 203, "correlation": 66835 } }, { "ph": "f", "id": 66835, "pid": 76337, "tid": -914061504, "ts": 1716454222302065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222302065, "dur": 0, "args": { "External id": 66836, "cbid": 205, "correlation": 66836 } }, { "ph": "f", "id": 66836, "pid": 76337, "tid": -914061504, "ts": 1716454222302065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222372064, "dur": 13, "args": { "External id": 66840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66840, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66840, "pid": 5, "tid": 7, "ts": 1716454222372064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302096, "dur": 13, "args": { "External id": 66840, "cbid": 211, "correlation": 66840 } }, { "ph": "s", "id": 66840, "pid": 76337, "tid": -914061504, "ts": 1716454222302096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222372078, "dur": 4, "args": { "External id": 66842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 66842, "pid": 5, "tid": 7, "ts": 1716454222372078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302114, "dur": 6, "args": { "External id": 66842, "cbid": 211, "correlation": 66842 } }, { "ph": "s", "id": 66842, "pid": 76337, "tid": -914061504, "ts": 1716454222302114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222302123, "dur": 0, "args": { "External id": 66843, "cbid": 51, "correlation": 66843 } }, { "ph": "s", "id": 66843, "pid": 76337, "tid": -914061504, "ts": 1716454222302123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222372084, "dur": 108, "args": { "External id": 66844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66844, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 66844, "pid": 5, "tid": 7, "ts": 1716454222372084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302124, "dur": 6, "args": { "External id": 66844, "cbid": 211, "correlation": 66844 } }, { "ph": "s", "id": 66844, "pid": 76337, "tid": -914061504, "ts": 1716454222302124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222372193, "dur": 17, "args": { "External id": 66849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66849, "pid": 5, "tid": 7, "ts": 1716454222372193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302153, "dur": 9, "args": { "External id": 66849, "cbid": 211, "correlation": 66849 } }, { "ph": "s", "id": 66849, "pid": 76337, "tid": -914061504, "ts": 1716454222302153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222372212, "dur": 12, "args": { "External id": 66857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66857, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66857, "pid": 5, "tid": 7, "ts": 1716454222372212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302186, "dur": 8, "args": { "External id": 66857, "cbid": 211, "correlation": 66857 } }, { "ph": "s", "id": 66857, "pid": 76337, "tid": -914061504, "ts": 1716454222302186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222372226, "dur": 28, "args": { "External id": 66866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66866, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66866, "pid": 5, "tid": 7, "ts": 1716454222372226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302227, "dur": 10, "args": { "External id": 66866, "cbid": 211, "correlation": 66866 } }, { "ph": "s", "id": 66866, "pid": 76337, "tid": -914061504, "ts": 1716454222302227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222372255, "dur": 26, "args": { "External id": 66886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66886, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 66886, "pid": 5, "tid": 7, "ts": 1716454222372255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302304, "dur": 12, "args": { "External id": 66886, "cbid": 211, "correlation": 66886 } }, { "ph": "s", "id": 66886, "pid": 76337, "tid": -914061504, "ts": 1716454222302304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222372283, "dur": 5, "args": { "External id": 66898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66898, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 66898, "pid": 5, "tid": 7, "ts": 1716454222372283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302326, "dur": 7, "args": { "External id": 66898, "cbid": 211, "correlation": 66898 } }, { "ph": "s", "id": 66898, "pid": 76337, "tid": -914061504, "ts": 1716454222302326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222372290, "dur": 26, "args": { "External id": 66901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66901, "pid": 5, "tid": 7, "ts": 1716454222372290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302345, "dur": 7, "args": { "External id": 66901, "cbid": 211, "correlation": 66901 } }, { "ph": "s", "id": 66901, "pid": 76337, "tid": -914061504, "ts": 1716454222302345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222372317, "dur": 18, "args": { "External id": 66910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66910, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66910, "pid": 5, "tid": 7, "ts": 1716454222372317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302385, "dur": 10, "args": { "External id": 66910, "cbid": 211, "correlation": 66910 } }, { "ph": "s", "id": 66910, "pid": 76337, "tid": -914061504, "ts": 1716454222302385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222302438, "dur": 0, "args": { "External id": 66920, "cbid": 317, "correlation": 66920 } }, { "ph": "f", "id": 66920, "pid": 76337, "tid": -914061504, "ts": 1716454222302438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222302439, "dur": 0, "args": { "External id": 66921, "cbid": 203, "correlation": 66921 } }, { "ph": "f", "id": 66921, "pid": 76337, "tid": -914061504, "ts": 1716454222302439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222302440, "dur": 0, "args": { "External id": 66922, "cbid": 205, "correlation": 66922 } }, { "ph": "f", "id": 66922, "pid": 76337, "tid": -914061504, "ts": 1716454222302440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222372336, "dur": 18, "args": { "External id": 66926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66926, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66926, "pid": 5, "tid": 7, "ts": 1716454222372336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302459, "dur": 12, "args": { "External id": 66926, "cbid": 211, "correlation": 66926 } }, { "ph": "s", "id": 66926, "pid": 76337, "tid": -914061504, "ts": 1716454222302459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222372356, "dur": 262, "args": { "External id": 66928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66928, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66928, "pid": 5, "tid": 7, "ts": 1716454222372356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302473, "dur": 6, "args": { "External id": 66928, "cbid": 211, "correlation": 66928 } }, { "ph": "s", "id": 66928, "pid": 76337, "tid": -914061504, "ts": 1716454222302473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222372620, "dur": 1, "args": { "External id": 66930, "device": 5, "context": 1, "stream": 7, "correlation": 66930, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 66930, "pid": 5, "tid": 7, "ts": 1716454222372620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222302486, "dur": 7, "args": { "External id": 66930, "cbid": 51, "correlation": 66930 } }, { "ph": "s", "id": 66930, "pid": 76337, "tid": -914061504, "ts": 1716454222302486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222372624, "dur": 889, "args": { "External id": 66931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66931, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66931, "pid": 5, "tid": 7, "ts": 1716454222372624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302495, "dur": 6, "args": { "External id": 66931, "cbid": 211, "correlation": 66931 } }, { "ph": "s", "id": 66931, "pid": 76337, "tid": -914061504, "ts": 1716454222302495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222373515, "dur": 15, "args": { "External id": 66933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66933, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66933, "pid": 5, "tid": 7, "ts": 1716454222373515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302505, "dur": 5, "args": { "External id": 66933, "cbid": 211, "correlation": 66933 } }, { "ph": "s", "id": 66933, "pid": 76337, "tid": -914061504, "ts": 1716454222302505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222373531, "dur": 16, "args": { "External id": 66939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66939, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66939, "pid": 5, "tid": 7, "ts": 1716454222373531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302535, "dur": 10, "args": { "External id": 66939, "cbid": 211, "correlation": 66939 } }, { "ph": "s", "id": 66939, "pid": 76337, "tid": -914061504, "ts": 1716454222302535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222373548, "dur": 4, "args": { "External id": 66947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66947, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 66947, "pid": 5, "tid": 7, "ts": 1716454222373548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302580, "dur": 10, "args": { "External id": 66947, "cbid": 211, "correlation": 66947 } }, { "ph": "s", "id": 66947, "pid": 76337, "tid": -914061504, "ts": 1716454222302580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222302649, "dur": 1, "args": { "External id": 66963, "cbid": 251, "correlation": 66963 } }, { "ph": "f", "id": 66963, "pid": 76337, "tid": -914061504, "ts": 1716454222302649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222302654, "dur": 0, "args": { "External id": 66965, "cbid": 251, "correlation": 66965 } }, { "ph": "f", "id": 66965, "pid": 76337, "tid": -914061504, "ts": 1716454222302654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222373553, "dur": 13, "args": { "External id": 66966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66966, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66966, "pid": 5, "tid": 7, "ts": 1716454222373553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302656, "dur": 11, "args": { "External id": 66966, "cbid": 211, "correlation": 66966 } }, { "ph": "s", "id": 66966, "pid": 76337, "tid": -914061504, "ts": 1716454222302656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222373568, "dur": 6, "args": { "External id": 66968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66968, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 66968, "pid": 5, "tid": 7, "ts": 1716454222373568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302669, "dur": 6, "args": { "External id": 66968, "cbid": 211, "correlation": 66968 } }, { "ph": "s", "id": 66968, "pid": 76337, "tid": -914061504, "ts": 1716454222302669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222373575, "dur": 18, "args": { "External id": 66978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66978, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 66978, "pid": 5, "tid": 7, "ts": 1716454222373575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302729, "dur": 13, "args": { "External id": 66978, "cbid": 211, "correlation": 66978 } }, { "ph": "s", "id": 66978, "pid": 76337, "tid": -914061504, "ts": 1716454222302729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222373595, "dur": 20, "args": { "External id": 66998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 66998, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 66998, "pid": 5, "tid": 7, "ts": 1716454222373595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302795, "dur": 11, "args": { "External id": 66998, "cbid": 211, "correlation": 66998 } }, { "ph": "s", "id": 66998, "pid": 76337, "tid": -914061504, "ts": 1716454222302795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222373616, "dur": 4, "args": { "External id": 67010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67010, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 67010, "pid": 5, "tid": 7, "ts": 1716454222373616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302816, "dur": 6, "args": { "External id": 67010, "cbid": 211, "correlation": 67010 } }, { "ph": "s", "id": 67010, "pid": 76337, "tid": -914061504, "ts": 1716454222302816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222373622, "dur": 18, "args": { "External id": 67013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67013, "pid": 5, "tid": 7, "ts": 1716454222373622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302834, "dur": 9, "args": { "External id": 67013, "cbid": 211, "correlation": 67013 } }, { "ph": "s", "id": 67013, "pid": 76337, "tid": -914061504, "ts": 1716454222302834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222373641, "dur": 11, "args": { "External id": 67022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67022, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67022, "pid": 5, "tid": 7, "ts": 1716454222373641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302878, "dur": 10, "args": { "External id": 67022, "cbid": 211, "correlation": 67022 } }, { "ph": "s", "id": 67022, "pid": 76337, "tid": -914061504, "ts": 1716454222302878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222302940, "dur": 0, "args": { "External id": 67032, "cbid": 317, "correlation": 67032 } }, { "ph": "f", "id": 67032, "pid": 76337, "tid": -914061504, "ts": 1716454222302940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222302940, "dur": 0, "args": { "External id": 67033, "cbid": 203, "correlation": 67033 } }, { "ph": "f", "id": 67033, "pid": 76337, "tid": -914061504, "ts": 1716454222302940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222302941, "dur": 0, "args": { "External id": 67034, "cbid": 205, "correlation": 67034 } }, { "ph": "f", "id": 67034, "pid": 76337, "tid": -914061504, "ts": 1716454222302941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222373654, "dur": 13, "args": { "External id": 67038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67038, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67038, "pid": 5, "tid": 7, "ts": 1716454222373654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302955, "dur": 12, "args": { "External id": 67038, "cbid": 211, "correlation": 67038 } }, { "ph": "s", "id": 67038, "pid": 76337, "tid": -914061504, "ts": 1716454222302955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222373668, "dur": 178, "args": { "External id": 67040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67040, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67040, "pid": 5, "tid": 7, "ts": 1716454222373668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302970, "dur": 14, "args": { "External id": 67040, "cbid": 211, "correlation": 67040 } }, { "ph": "s", "id": 67040, "pid": 76337, "tid": -914061504, "ts": 1716454222302970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222373849, "dur": 1, "args": { "External id": 67042, "device": 5, "context": 1, "stream": 7, "correlation": 67042, "bytes": 960, "memory bandwidth (GB/s)": 0.5660377358490566 } }, { "ph": "f", "id": 67042, "pid": 5, "tid": 7, "ts": 1716454222373849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222302989, "dur": 7, "args": { "External id": 67042, "cbid": 51, "correlation": 67042 } }, { "ph": "s", "id": 67042, "pid": 76337, "tid": -914061504, "ts": 1716454222302989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222373853, "dur": 712, "args": { "External id": 67043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67043, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67043, "pid": 5, "tid": 7, "ts": 1716454222373853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222302997, "dur": 6, "args": { "External id": 67043, "cbid": 211, "correlation": 67043 } }, { "ph": "s", "id": 67043, "pid": 76337, "tid": -914061504, "ts": 1716454222302997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222374566, "dur": 12, "args": { "External id": 67045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67045, "pid": 5, "tid": 7, "ts": 1716454222374566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303008, "dur": 6, "args": { "External id": 67045, "cbid": 211, "correlation": 67045 } }, { "ph": "s", "id": 67045, "pid": 76337, "tid": -914061504, "ts": 1716454222303008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222374579, "dur": 16, "args": { "External id": 67051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67051, "pid": 5, "tid": 7, "ts": 1716454222374579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303037, "dur": 8, "args": { "External id": 67051, "cbid": 211, "correlation": 67051 } }, { "ph": "s", "id": 67051, "pid": 76337, "tid": -914061504, "ts": 1716454222303037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222303096, "dur": 0, "args": { "External id": 67061, "cbid": 317, "correlation": 67061 } }, { "ph": "f", "id": 67061, "pid": 76337, "tid": -914061504, "ts": 1716454222303096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222303097, "dur": 0, "args": { "External id": 67062, "cbid": 203, "correlation": 67062 } }, { "ph": "f", "id": 67062, "pid": 76337, "tid": -914061504, "ts": 1716454222303097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222303098, "dur": 0, "args": { "External id": 67063, "cbid": 205, "correlation": 67063 } }, { "ph": "f", "id": 67063, "pid": 76337, "tid": -914061504, "ts": 1716454222303098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222374597, "dur": 18, "args": { "External id": 67067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67067, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67067, "pid": 5, "tid": 7, "ts": 1716454222374597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303118, "dur": 12, "args": { "External id": 67067, "cbid": 211, "correlation": 67067 } }, { "ph": "s", "id": 67067, "pid": 76337, "tid": -914061504, "ts": 1716454222303118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222374616, "dur": 4, "args": { "External id": 67069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67069, "pid": 5, "tid": 7, "ts": 1716454222374616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303135, "dur": 6, "args": { "External id": 67069, "cbid": 211, "correlation": 67069 } }, { "ph": "s", "id": 67069, "pid": 76337, "tid": -914061504, "ts": 1716454222303135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222303144, "dur": 0, "args": { "External id": 67070, "cbid": 51, "correlation": 67070 } }, { "ph": "s", "id": 67070, "pid": 76337, "tid": -914061504, "ts": 1716454222303144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222374622, "dur": 145, "args": { "External id": 67071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67071, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 67071, "pid": 5, "tid": 7, "ts": 1716454222374622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303144, "dur": 6, "args": { "External id": 67071, "cbid": 211, "correlation": 67071 } }, { "ph": "s", "id": 67071, "pid": 76337, "tid": -914061504, "ts": 1716454222303144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222374768, "dur": 17, "args": { "External id": 67076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67076, "pid": 5, "tid": 7, "ts": 1716454222374768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303173, "dur": 8, "args": { "External id": 67076, "cbid": 211, "correlation": 67076 } }, { "ph": "s", "id": 67076, "pid": 76337, "tid": -914061504, "ts": 1716454222303173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222374787, "dur": 12, "args": { "External id": 67084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67084, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67084, "pid": 5, "tid": 7, "ts": 1716454222374787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303201, "dur": 8, "args": { "External id": 67084, "cbid": 211, "correlation": 67084 } }, { "ph": "s", "id": 67084, "pid": 76337, "tid": -914061504, "ts": 1716454222303201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222374800, "dur": 11, "args": { "External id": 67092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67092, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67092, "pid": 5, "tid": 7, "ts": 1716454222374800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303230, "dur": 8, "args": { "External id": 67092, "cbid": 211, "correlation": 67092 } }, { "ph": "s", "id": 67092, "pid": 76337, "tid": -914061504, "ts": 1716454222303230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222374813, "dur": 20, "args": { "External id": 67112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67112, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 67112, "pid": 5, "tid": 7, "ts": 1716454222374813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303311, "dur": 13, "args": { "External id": 67112, "cbid": 211, "correlation": 67112 } }, { "ph": "s", "id": 67112, "pid": 76337, "tid": -914061504, "ts": 1716454222303311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222374834, "dur": 4, "args": { "External id": 67124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67124, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 67124, "pid": 5, "tid": 7, "ts": 1716454222374834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303334, "dur": 6, "args": { "External id": 67124, "cbid": 211, "correlation": 67124 } }, { "ph": "s", "id": 67124, "pid": 76337, "tid": -914061504, "ts": 1716454222303334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222374840, "dur": 18, "args": { "External id": 67127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67127, "pid": 5, "tid": 7, "ts": 1716454222374840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303351, "dur": 7, "args": { "External id": 67127, "cbid": 211, "correlation": 67127 } }, { "ph": "s", "id": 67127, "pid": 76337, "tid": -914061504, "ts": 1716454222303351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222303409, "dur": 0, "args": { "External id": 67138, "cbid": 317, "correlation": 67138 } }, { "ph": "f", "id": 67138, "pid": 76337, "tid": -914061504, "ts": 1716454222303409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222303409, "dur": 0, "args": { "External id": 67139, "cbid": 203, "correlation": 67139 } }, { "ph": "f", "id": 67139, "pid": 76337, "tid": -914061504, "ts": 1716454222303409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222303410, "dur": 0, "args": { "External id": 67140, "cbid": 205, "correlation": 67140 } }, { "ph": "f", "id": 67140, "pid": 76337, "tid": -914061504, "ts": 1716454222303410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222374859, "dur": 12, "args": { "External id": 67144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67144, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67144, "pid": 5, "tid": 7, "ts": 1716454222374859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303423, "dur": 15, "args": { "External id": 67144, "cbid": 211, "correlation": 67144 } }, { "ph": "s", "id": 67144, "pid": 76337, "tid": -914061504, "ts": 1716454222303423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222374873, "dur": 4, "args": { "External id": 67146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67146, "pid": 5, "tid": 7, "ts": 1716454222374873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303443, "dur": 6, "args": { "External id": 67146, "cbid": 211, "correlation": 67146 } }, { "ph": "s", "id": 67146, "pid": 76337, "tid": -914061504, "ts": 1716454222303443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222303451, "dur": 0, "args": { "External id": 67147, "cbid": 51, "correlation": 67147 } }, { "ph": "s", "id": 67147, "pid": 76337, "tid": -914061504, "ts": 1716454222303451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222374878, "dur": 100, "args": { "External id": 67148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67148, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 67148, "pid": 5, "tid": 7, "ts": 1716454222374878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303452, "dur": 5, "args": { "External id": 67148, "cbid": 211, "correlation": 67148 } }, { "ph": "s", "id": 67148, "pid": 76337, "tid": -914061504, "ts": 1716454222303452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222374980, "dur": 17, "args": { "External id": 67153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67153, "pid": 5, "tid": 7, "ts": 1716454222374980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303479, "dur": 9, "args": { "External id": 67153, "cbid": 211, "correlation": 67153 } }, { "ph": "s", "id": 67153, "pid": 76337, "tid": -914061504, "ts": 1716454222303479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222374998, "dur": 91, "args": { "External id": 67162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67162, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67162, "pid": 5, "tid": 7, "ts": 1716454222374998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303559, "dur": 14, "args": { "External id": 67162, "cbid": 211, "correlation": 67162 } }, { "ph": "s", "id": 67162, "pid": 76337, "tid": -914061504, "ts": 1716454222303559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222375091, "dur": 32, "args": { "External id": 67184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67184, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67184, "pid": 5, "tid": 7, "ts": 1716454222375091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303616, "dur": 10, "args": { "External id": 67184, "cbid": 211, "correlation": 67184 } }, { "ph": "s", "id": 67184, "pid": 76337, "tid": -914061504, "ts": 1716454222303616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222303706, "dur": 1, "args": { "External id": 67195, "cbid": 251, "correlation": 67195 } }, { "ph": "f", "id": 67195, "pid": 76337, "tid": -914061504, "ts": 1716454222303706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222375124, "dur": 182, "args": { "External id": 67196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67196, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67196, "pid": 5, "tid": 7, "ts": 1716454222375124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303712, "dur": 13, "args": { "External id": 67196, "cbid": 211, "correlation": 67196 } }, { "ph": "s", "id": 67196, "pid": 76337, "tid": -914061504, "ts": 1716454222303712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222303782, "dur": 1, "args": { "External id": 67207, "cbid": 251, "correlation": 67207 } }, { "ph": "f", "id": 67207, "pid": 76337, "tid": -914061504, "ts": 1716454222303782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222375307, "dur": 175, "args": { "External id": 67208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67208, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67208, "pid": 5, "tid": 7, "ts": 1716454222375307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303786, "dur": 12, "args": { "External id": 67208, "cbid": 211, "correlation": 67208 } }, { "ph": "s", "id": 67208, "pid": 76337, "tid": -914061504, "ts": 1716454222303786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222303853, "dur": 1, "args": { "External id": 67219, "cbid": 251, "correlation": 67219 } }, { "ph": "f", "id": 67219, "pid": 76337, "tid": -914061504, "ts": 1716454222303853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222375484, "dur": 173, "args": { "External id": 67220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67220, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67220, "pid": 5, "tid": 7, "ts": 1716454222375484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303857, "dur": 11, "args": { "External id": 67220, "cbid": 211, "correlation": 67220 } }, { "ph": "s", "id": 67220, "pid": 76337, "tid": -914061504, "ts": 1716454222303857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222375659, "dur": 361, "args": { "External id": 67245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67245, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67245, "pid": 5, "tid": 7, "ts": 1716454222375659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222303941, "dur": 13, "args": { "External id": 67245, "cbid": 211, "correlation": 67245 } }, { "ph": "s", "id": 67245, "pid": 76337, "tid": -914061504, "ts": 1716454222303941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304050, "dur": 1, "args": { "External id": 67263, "cbid": 251, "correlation": 67263 } }, { "ph": "f", "id": 67263, "pid": 76337, "tid": -914061504, "ts": 1716454222304050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222376021, "dur": 183, "args": { "External id": 67265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67265, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67265, "pid": 5, "tid": 7, "ts": 1716454222376021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304056, "dur": 13, "args": { "External id": 67265, "cbid": 211, "correlation": 67265 } }, { "ph": "s", "id": 67265, "pid": 76337, "tid": -914061504, "ts": 1716454222304056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222376205, "dur": 20, "args": { "External id": 67273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67273, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67273, "pid": 5, "tid": 7, "ts": 1716454222376205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304127, "dur": 12, "args": { "External id": 67273, "cbid": 211, "correlation": 67273 } }, { "ph": "s", "id": 67273, "pid": 76337, "tid": -914061504, "ts": 1716454222304127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222376226, "dur": 28, "args": { "External id": 67281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67281, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67281, "pid": 5, "tid": 7, "ts": 1716454222376226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304165, "dur": 8, "args": { "External id": 67281, "cbid": 211, "correlation": 67281 } }, { "ph": "s", "id": 67281, "pid": 76337, "tid": -914061504, "ts": 1716454222304165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222376255, "dur": 20, "args": { "External id": 67292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67292, "pid": 5, "tid": 7, "ts": 1716454222376255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304238, "dur": 15, "args": { "External id": 67292, "cbid": 211, "correlation": 67292 } }, { "ph": "s", "id": 67292, "pid": 76337, "tid": -914061504, "ts": 1716454222304238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222376276, "dur": 18, "args": { "External id": 67314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67314, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67314, "pid": 5, "tid": 7, "ts": 1716454222376276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304274, "dur": 8, "args": { "External id": 67314, "cbid": 211, "correlation": 67314 } }, { "ph": "s", "id": 67314, "pid": 76337, "tid": -914061504, "ts": 1716454222304274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304361, "dur": 2, "args": { "External id": 67325, "cbid": 251, "correlation": 67325 } }, { "ph": "f", "id": 67325, "pid": 76337, "tid": -914061504, "ts": 1716454222304361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222376295, "dur": 97, "args": { "External id": 67326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67326, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67326, "pid": 5, "tid": 7, "ts": 1716454222376295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304366, "dur": 14, "args": { "External id": 67326, "cbid": 211, "correlation": 67326 } }, { "ph": "s", "id": 67326, "pid": 76337, "tid": -914061504, "ts": 1716454222304366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304435, "dur": 1, "args": { "External id": 67337, "cbid": 251, "correlation": 67337 } }, { "ph": "f", "id": 67337, "pid": 76337, "tid": -914061504, "ts": 1716454222304435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304438, "dur": 0, "args": { "External id": 67338, "cbid": 251, "correlation": 67338 } }, { "ph": "f", "id": 67338, "pid": 76337, "tid": -914061504, "ts": 1716454222304438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222376394, "dur": 13, "args": { "External id": 67339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67339, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67339, "pid": 5, "tid": 7, "ts": 1716454222376394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304440, "dur": 12, "args": { "External id": 67339, "cbid": 211, "correlation": 67339 } }, { "ph": "s", "id": 67339, "pid": 76337, "tid": -914061504, "ts": 1716454222304440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222376408, "dur": 6, "args": { "External id": 67341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67341, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67341, "pid": 5, "tid": 7, "ts": 1716454222376408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304453, "dur": 6, "args": { "External id": 67341, "cbid": 211, "correlation": 67341 } }, { "ph": "s", "id": 67341, "pid": 76337, "tid": -914061504, "ts": 1716454222304453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304512, "dur": 1, "args": { "External id": 67352, "cbid": 251, "correlation": 67352 } }, { "ph": "f", "id": 67352, "pid": 76337, "tid": -914061504, "ts": 1716454222304512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304515, "dur": 0, "args": { "External id": 67353, "cbid": 251, "correlation": 67353 } }, { "ph": "f", "id": 67353, "pid": 76337, "tid": -914061504, "ts": 1716454222304515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222376416, "dur": 9, "args": { "External id": 67354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67354, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67354, "pid": 5, "tid": 7, "ts": 1716454222376416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304517, "dur": 12, "args": { "External id": 67354, "cbid": 211, "correlation": 67354 } }, { "ph": "s", "id": 67354, "pid": 76337, "tid": -914061504, "ts": 1716454222304517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222376426, "dur": 4, "args": { "External id": 67356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67356, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67356, "pid": 5, "tid": 7, "ts": 1716454222376426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304531, "dur": 5, "args": { "External id": 67356, "cbid": 211, "correlation": 67356 } }, { "ph": "s", "id": 67356, "pid": 76337, "tid": -914061504, "ts": 1716454222304531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222376431, "dur": 60, "args": { "External id": 67381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67381, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67381, "pid": 5, "tid": 7, "ts": 1716454222376431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304610, "dur": 12, "args": { "External id": 67381, "cbid": 211, "correlation": 67381 } }, { "ph": "s", "id": 67381, "pid": 76337, "tid": -914061504, "ts": 1716454222304610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304709, "dur": 1, "args": { "External id": 67399, "cbid": 251, "correlation": 67399 } }, { "ph": "f", "id": 67399, "pid": 76337, "tid": -914061504, "ts": 1716454222304709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222376493, "dur": 99, "args": { "External id": 67401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67401, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67401, "pid": 5, "tid": 7, "ts": 1716454222376493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304715, "dur": 13, "args": { "External id": 67401, "cbid": 211, "correlation": 67401 } }, { "ph": "s", "id": 67401, "pid": 76337, "tid": -914061504, "ts": 1716454222304715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222376594, "dur": 10, "args": { "External id": 67409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67409, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67409, "pid": 5, "tid": 7, "ts": 1716454222376594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304783, "dur": 13, "args": { "External id": 67409, "cbid": 211, "correlation": 67409 } }, { "ph": "s", "id": 67409, "pid": 76337, "tid": -914061504, "ts": 1716454222304783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222376605, "dur": 22, "args": { "External id": 67417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67417, "pid": 5, "tid": 7, "ts": 1716454222376605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304825, "dur": 9, "args": { "External id": 67417, "cbid": 211, "correlation": 67417 } }, { "ph": "s", "id": 67417, "pid": 76337, "tid": -914061504, "ts": 1716454222304825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222376629, "dur": 19, "args": { "External id": 67439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67439, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67439, "pid": 5, "tid": 7, "ts": 1716454222376629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304876, "dur": 10, "args": { "External id": 67439, "cbid": 211, "correlation": 67439 } }, { "ph": "s", "id": 67439, "pid": 76337, "tid": -914061504, "ts": 1716454222304876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304968, "dur": 1, "args": { "External id": 67455, "cbid": 251, "correlation": 67455 } }, { "ph": "f", "id": 67455, "pid": 76337, "tid": -914061504, "ts": 1716454222304968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222304972, "dur": 8, "args": { "External id": 67457, "cbid": 251, "correlation": 67457 } }, { "ph": "f", "id": 67457, "pid": 76337, "tid": -914061504, "ts": 1716454222304972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222376649, "dur": 527, "args": { "External id": 67458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67458, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67458, "pid": 5, "tid": 7, "ts": 1716454222376649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222304982, "dur": 14, "args": { "External id": 67458, "cbid": 211, "correlation": 67458 } }, { "ph": "s", "id": 67458, "pid": 76337, "tid": -914061504, "ts": 1716454222304982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222377178, "dur": 70, "args": { "External id": 67466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67466, "pid": 5, "tid": 7, "ts": 1716454222377178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305050, "dur": 13, "args": { "External id": 67466, "cbid": 211, "correlation": 67466 } }, { "ph": "s", "id": 67466, "pid": 76337, "tid": -914061504, "ts": 1716454222305050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222377250, "dur": 68, "args": { "External id": 67474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67474, "pid": 5, "tid": 7, "ts": 1716454222377250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305080, "dur": 8, "args": { "External id": 67474, "cbid": 211, "correlation": 67474 } }, { "ph": "s", "id": 67474, "pid": 76337, "tid": -914061504, "ts": 1716454222305080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222305160, "dur": 1, "args": { "External id": 67490, "cbid": 251, "correlation": 67490 } }, { "ph": "f", "id": 67490, "pid": 76337, "tid": -914061504, "ts": 1716454222305160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222377320, "dur": 1, "args": { "External id": 67492, "device": 5, "context": 1, "stream": 7, "correlation": 67492, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 67492, "pid": 5, "tid": 7, "ts": 1716454222377320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222305165, "dur": 9, "args": { "External id": 67492, "cbid": 51, "correlation": 67492 } }, { "ph": "s", "id": 67492, "pid": 76337, "tid": -914061504, "ts": 1716454222305165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222377324, "dur": 291, "args": { "External id": 67493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67493, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67493, "pid": 5, "tid": 7, "ts": 1716454222377324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305176, "dur": 11, "args": { "External id": 67493, "cbid": 211, "correlation": 67493 } }, { "ph": "s", "id": 67493, "pid": 76337, "tid": -914061504, "ts": 1716454222305176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222377617, "dur": 14, "args": { "External id": 67501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67501, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67501, "pid": 5, "tid": 7, "ts": 1716454222377617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305219, "dur": 10, "args": { "External id": 67501, "cbid": 211, "correlation": 67501 } }, { "ph": "s", "id": 67501, "pid": 76337, "tid": -914061504, "ts": 1716454222305219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222377632, "dur": 41, "args": { "External id": 67512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67512, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67512, "pid": 5, "tid": 7, "ts": 1716454222377632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305287, "dur": 12, "args": { "External id": 67512, "cbid": 211, "correlation": 67512 } }, { "ph": "s", "id": 67512, "pid": 76337, "tid": -914061504, "ts": 1716454222305287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222305354, "dur": 0, "args": { "External id": 67524, "cbid": 317, "correlation": 67524 } }, { "ph": "f", "id": 67524, "pid": 76337, "tid": -914061504, "ts": 1716454222305354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222305355, "dur": 0, "args": { "External id": 67525, "cbid": 203, "correlation": 67525 } }, { "ph": "f", "id": 67525, "pid": 76337, "tid": -914061504, "ts": 1716454222305355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222305356, "dur": 0, "args": { "External id": 67526, "cbid": 205, "correlation": 67526 } }, { "ph": "f", "id": 67526, "pid": 76337, "tid": -914061504, "ts": 1716454222305356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222377675, "dur": 13, "args": { "External id": 67530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67530, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67530, "pid": 5, "tid": 7, "ts": 1716454222377675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305381, "dur": 13, "args": { "External id": 67530, "cbid": 211, "correlation": 67530 } }, { "ph": "s", "id": 67530, "pid": 76337, "tid": -914061504, "ts": 1716454222305381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222377690, "dur": 4, "args": { "External id": 67532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67532, "pid": 5, "tid": 7, "ts": 1716454222377690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305399, "dur": 6, "args": { "External id": 67532, "cbid": 211, "correlation": 67532 } }, { "ph": "s", "id": 67532, "pid": 76337, "tid": -914061504, "ts": 1716454222305399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222305409, "dur": 0, "args": { "External id": 67533, "cbid": 51, "correlation": 67533 } }, { "ph": "s", "id": 67533, "pid": 76337, "tid": -914061504, "ts": 1716454222305409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222377695, "dur": 107, "args": { "External id": 67534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67534, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 67534, "pid": 5, "tid": 7, "ts": 1716454222377695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305410, "dur": 5, "args": { "External id": 67534, "cbid": 211, "correlation": 67534 } }, { "ph": "s", "id": 67534, "pid": 76337, "tid": -914061504, "ts": 1716454222305410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222377803, "dur": 18, "args": { "External id": 67539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67539, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67539, "pid": 5, "tid": 7, "ts": 1716454222377803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305437, "dur": 8, "args": { "External id": 67539, "cbid": 211, "correlation": 67539 } }, { "ph": "s", "id": 67539, "pid": 76337, "tid": -914061504, "ts": 1716454222305437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222377823, "dur": 12, "args": { "External id": 67547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67547, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67547, "pid": 5, "tid": 7, "ts": 1716454222377823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305470, "dur": 8, "args": { "External id": 67547, "cbid": 211, "correlation": 67547 } }, { "ph": "s", "id": 67547, "pid": 76337, "tid": -914061504, "ts": 1716454222305470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222377836, "dur": 61, "args": { "External id": 67558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67558, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67558, "pid": 5, "tid": 7, "ts": 1716454222377836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305537, "dur": 12, "args": { "External id": 67558, "cbid": 211, "correlation": 67558 } }, { "ph": "s", "id": 67558, "pid": 76337, "tid": -914061504, "ts": 1716454222305537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222305593, "dur": 0, "args": { "External id": 67568, "cbid": 317, "correlation": 67568 } }, { "ph": "f", "id": 67568, "pid": 76337, "tid": -914061504, "ts": 1716454222305593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222305594, "dur": 0, "args": { "External id": 67569, "cbid": 203, "correlation": 67569 } }, { "ph": "f", "id": 67569, "pid": 76337, "tid": -914061504, "ts": 1716454222305594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222305594, "dur": 0, "args": { "External id": 67570, "cbid": 205, "correlation": 67570 } }, { "ph": "f", "id": 67570, "pid": 76337, "tid": -914061504, "ts": 1716454222305594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222377898, "dur": 39, "args": { "External id": 67574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67574, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67574, "pid": 5, "tid": 7, "ts": 1716454222377898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305616, "dur": 12, "args": { "External id": 67574, "cbid": 211, "correlation": 67574 } }, { "ph": "s", "id": 67574, "pid": 76337, "tid": -914061504, "ts": 1716454222305616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222377938, "dur": 178, "args": { "External id": 67576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67576, "pid": 5, "tid": 7, "ts": 1716454222377938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305631, "dur": 5, "args": { "External id": 67576, "cbid": 211, "correlation": 67576 } }, { "ph": "s", "id": 67576, "pid": 76337, "tid": -914061504, "ts": 1716454222305631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222378117, "dur": 2105, "args": { "External id": 67578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67578, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67578, "pid": 5, "tid": 7, "ts": 1716454222378117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305643, "dur": 8, "args": { "External id": 67578, "cbid": 211, "correlation": 67578 } }, { "ph": "s", "id": 67578, "pid": 76337, "tid": -914061504, "ts": 1716454222305643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222380224, "dur": 40, "args": { "External id": 67580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67580, "pid": 5, "tid": 7, "ts": 1716454222380224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305655, "dur": 7, "args": { "External id": 67580, "cbid": 211, "correlation": 67580 } }, { "ph": "s", "id": 67580, "pid": 76337, "tid": -914061504, "ts": 1716454222305655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222380265, "dur": 64, "args": { "External id": 67586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67586, "pid": 5, "tid": 7, "ts": 1716454222380265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305686, "dur": 8, "args": { "External id": 67586, "cbid": 211, "correlation": 67586 } }, { "ph": "s", "id": 67586, "pid": 76337, "tid": -914061504, "ts": 1716454222305686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222380330, "dur": 89, "args": { "External id": 67595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67595, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67595, "pid": 5, "tid": 7, "ts": 1716454222380330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305777, "dur": 13, "args": { "External id": 67595, "cbid": 211, "correlation": 67595 } }, { "ph": "s", "id": 67595, "pid": 76337, "tid": -914061504, "ts": 1716454222305777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222380421, "dur": 78, "args": { "External id": 67615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67615, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 67615, "pid": 5, "tid": 7, "ts": 1716454222380421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305848, "dur": 11, "args": { "External id": 67615, "cbid": 211, "correlation": 67615 } }, { "ph": "s", "id": 67615, "pid": 76337, "tid": -914061504, "ts": 1716454222305848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222380500, "dur": 5, "args": { "External id": 67627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67627, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 67627, "pid": 5, "tid": 7, "ts": 1716454222380500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305868, "dur": 6, "args": { "External id": 67627, "cbid": 211, "correlation": 67627 } }, { "ph": "s", "id": 67627, "pid": 76337, "tid": -914061504, "ts": 1716454222305868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222380507, "dur": 87, "args": { "External id": 67630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67630, "pid": 5, "tid": 7, "ts": 1716454222380507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305887, "dur": 6, "args": { "External id": 67630, "cbid": 211, "correlation": 67630 } }, { "ph": "s", "id": 67630, "pid": 76337, "tid": -914061504, "ts": 1716454222305887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222380595, "dur": 55, "args": { "External id": 67639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67639, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67639, "pid": 5, "tid": 7, "ts": 1716454222380595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222305926, "dur": 10, "args": { "External id": 67639, "cbid": 211, "correlation": 67639 } }, { "ph": "s", "id": 67639, "pid": 76337, "tid": -914061504, "ts": 1716454222305926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222305985, "dur": 0, "args": { "External id": 67649, "cbid": 317, "correlation": 67649 } }, { "ph": "f", "id": 67649, "pid": 76337, "tid": -914061504, "ts": 1716454222305985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222305986, "dur": 0, "args": { "External id": 67650, "cbid": 203, "correlation": 67650 } }, { "ph": "f", "id": 67650, "pid": 76337, "tid": -914061504, "ts": 1716454222305986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222305987, "dur": 0, "args": { "External id": 67651, "cbid": 205, "correlation": 67651 } }, { "ph": "f", "id": 67651, "pid": 76337, "tid": -914061504, "ts": 1716454222305987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222380651, "dur": 58, "args": { "External id": 67655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67655, "pid": 5, "tid": 7, "ts": 1716454222380651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306007, "dur": 13, "args": { "External id": 67655, "cbid": 211, "correlation": 67655 } }, { "ph": "s", "id": 67655, "pid": 76337, "tid": -914061504, "ts": 1716454222306007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222380710, "dur": 133, "args": { "External id": 67657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67657, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67657, "pid": 5, "tid": 7, "ts": 1716454222380710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306022, "dur": 5, "args": { "External id": 67657, "cbid": 211, "correlation": 67657 } }, { "ph": "s", "id": 67657, "pid": 76337, "tid": -914061504, "ts": 1716454222306022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222380845, "dur": 2078, "args": { "External id": 67659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67659, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67659, "pid": 5, "tid": 7, "ts": 1716454222380845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306034, "dur": 7, "args": { "External id": 67659, "cbid": 211, "correlation": 67659 } }, { "ph": "s", "id": 67659, "pid": 76337, "tid": -914061504, "ts": 1716454222306034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222382924, "dur": 22, "args": { "External id": 67661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67661, "pid": 5, "tid": 7, "ts": 1716454222382924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306045, "dur": 5, "args": { "External id": 67661, "cbid": 211, "correlation": 67661 } }, { "ph": "s", "id": 67661, "pid": 76337, "tid": -914061504, "ts": 1716454222306045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222382947, "dur": 36, "args": { "External id": 67667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67667, "pid": 5, "tid": 7, "ts": 1716454222382947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306073, "dur": 8, "args": { "External id": 67667, "cbid": 211, "correlation": 67667 } }, { "ph": "s", "id": 67667, "pid": 76337, "tid": -914061504, "ts": 1716454222306073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222382984, "dur": 4, "args": { "External id": 67675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67675, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 67675, "pid": 5, "tid": 7, "ts": 1716454222382984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306117, "dur": 9, "args": { "External id": 67675, "cbid": 211, "correlation": 67675 } }, { "ph": "s", "id": 67675, "pid": 76337, "tid": -914061504, "ts": 1716454222306117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222306182, "dur": 1, "args": { "External id": 67691, "cbid": 251, "correlation": 67691 } }, { "ph": "f", "id": 67691, "pid": 76337, "tid": -914061504, "ts": 1716454222306182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222306188, "dur": 0, "args": { "External id": 67693, "cbid": 251, "correlation": 67693 } }, { "ph": "f", "id": 67693, "pid": 76337, "tid": -914061504, "ts": 1716454222306188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222382989, "dur": 13, "args": { "External id": 67694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67694, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 67694, "pid": 5, "tid": 7, "ts": 1716454222382989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306190, "dur": 11, "args": { "External id": 67694, "cbid": 211, "correlation": 67694 } }, { "ph": "s", "id": 67694, "pid": 76337, "tid": -914061504, "ts": 1716454222306190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222383004, "dur": 5, "args": { "External id": 67696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67696, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 67696, "pid": 5, "tid": 7, "ts": 1716454222383004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306204, "dur": 5, "args": { "External id": 67696, "cbid": 211, "correlation": 67696 } }, { "ph": "s", "id": 67696, "pid": 76337, "tid": -914061504, "ts": 1716454222306204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222383011, "dur": 32, "args": { "External id": 67706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67706, "pid": 5, "tid": 7, "ts": 1716454222383011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306261, "dur": 12, "args": { "External id": 67706, "cbid": 211, "correlation": 67706 } }, { "ph": "s", "id": 67706, "pid": 76337, "tid": -914061504, "ts": 1716454222306261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222383044, "dur": 34, "args": { "External id": 67726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67726, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 67726, "pid": 5, "tid": 7, "ts": 1716454222383044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306330, "dur": 11, "args": { "External id": 67726, "cbid": 211, "correlation": 67726 } }, { "ph": "s", "id": 67726, "pid": 76337, "tid": -914061504, "ts": 1716454222306330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222383079, "dur": 4, "args": { "External id": 67738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67738, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 67738, "pid": 5, "tid": 7, "ts": 1716454222383079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306352, "dur": 6, "args": { "External id": 67738, "cbid": 211, "correlation": 67738 } }, { "ph": "s", "id": 67738, "pid": 76337, "tid": -914061504, "ts": 1716454222306352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222383085, "dur": 31, "args": { "External id": 67741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67741, "pid": 5, "tid": 7, "ts": 1716454222383085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306370, "dur": 7, "args": { "External id": 67741, "cbid": 211, "correlation": 67741 } }, { "ph": "s", "id": 67741, "pid": 76337, "tid": -914061504, "ts": 1716454222306370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222383118, "dur": 21, "args": { "External id": 67750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67750, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67750, "pid": 5, "tid": 7, "ts": 1716454222383118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306411, "dur": 10, "args": { "External id": 67750, "cbid": 211, "correlation": 67750 } }, { "ph": "s", "id": 67750, "pid": 76337, "tid": -914061504, "ts": 1716454222306411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222306474, "dur": 0, "args": { "External id": 67760, "cbid": 317, "correlation": 67760 } }, { "ph": "f", "id": 67760, "pid": 76337, "tid": -914061504, "ts": 1716454222306474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222306475, "dur": 0, "args": { "External id": 67761, "cbid": 203, "correlation": 67761 } }, { "ph": "f", "id": 67761, "pid": 76337, "tid": -914061504, "ts": 1716454222306475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222306475, "dur": 0, "args": { "External id": 67762, "cbid": 205, "correlation": 67762 } }, { "ph": "f", "id": 67762, "pid": 76337, "tid": -914061504, "ts": 1716454222306475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222383140, "dur": 23, "args": { "External id": 67766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67766, "pid": 5, "tid": 7, "ts": 1716454222383140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306491, "dur": 13, "args": { "External id": 67766, "cbid": 211, "correlation": 67766 } }, { "ph": "s", "id": 67766, "pid": 76337, "tid": -914061504, "ts": 1716454222306491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222383165, "dur": 48, "args": { "External id": 67768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67768, "pid": 5, "tid": 7, "ts": 1716454222383165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306507, "dur": 5, "args": { "External id": 67768, "cbid": 211, "correlation": 67768 } }, { "ph": "s", "id": 67768, "pid": 76337, "tid": -914061504, "ts": 1716454222306507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222383214, "dur": 709, "args": { "External id": 67770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67770, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67770, "pid": 5, "tid": 7, "ts": 1716454222383214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306518, "dur": 6, "args": { "External id": 67770, "cbid": 211, "correlation": 67770 } }, { "ph": "s", "id": 67770, "pid": 76337, "tid": -914061504, "ts": 1716454222306518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222383925, "dur": 24, "args": { "External id": 67772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67772, "pid": 5, "tid": 7, "ts": 1716454222383925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306528, "dur": 5, "args": { "External id": 67772, "cbid": 211, "correlation": 67772 } }, { "ph": "s", "id": 67772, "pid": 76337, "tid": -914061504, "ts": 1716454222306528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222383950, "dur": 34, "args": { "External id": 67778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67778, "pid": 5, "tid": 7, "ts": 1716454222383950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306556, "dur": 8, "args": { "External id": 67778, "cbid": 211, "correlation": 67778 } }, { "ph": "s", "id": 67778, "pid": 76337, "tid": -914061504, "ts": 1716454222306556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222306618, "dur": 0, "args": { "External id": 67788, "cbid": 317, "correlation": 67788 } }, { "ph": "f", "id": 67788, "pid": 76337, "tid": -914061504, "ts": 1716454222306618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222306619, "dur": 0, "args": { "External id": 67789, "cbid": 203, "correlation": 67789 } }, { "ph": "f", "id": 67789, "pid": 76337, "tid": -914061504, "ts": 1716454222306619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222306619, "dur": 0, "args": { "External id": 67790, "cbid": 205, "correlation": 67790 } }, { "ph": "f", "id": 67790, "pid": 76337, "tid": -914061504, "ts": 1716454222306619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222383986, "dur": 58, "args": { "External id": 67794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67794, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67794, "pid": 5, "tid": 7, "ts": 1716454222383986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306632, "dur": 12, "args": { "External id": 67794, "cbid": 211, "correlation": 67794 } }, { "ph": "s", "id": 67794, "pid": 76337, "tid": -914061504, "ts": 1716454222306632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222384045, "dur": 293, "args": { "External id": 67796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67796, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67796, "pid": 5, "tid": 7, "ts": 1716454222384045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306650, "dur": 8, "args": { "External id": 67796, "cbid": 211, "correlation": 67796 } }, { "ph": "s", "id": 67796, "pid": 76337, "tid": -914061504, "ts": 1716454222306650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222384339, "dur": 22, "args": { "External id": 67798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67798, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67798, "pid": 5, "tid": 7, "ts": 1716454222384339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306662, "dur": 5, "args": { "External id": 67798, "cbid": 211, "correlation": 67798 } }, { "ph": "s", "id": 67798, "pid": 76337, "tid": -914061504, "ts": 1716454222306662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222384363, "dur": 34, "args": { "External id": 67804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67804, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67804, "pid": 5, "tid": 7, "ts": 1716454222384363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306687, "dur": 8, "args": { "External id": 67804, "cbid": 211, "correlation": 67804 } }, { "ph": "s", "id": 67804, "pid": 76337, "tid": -914061504, "ts": 1716454222306687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222384399, "dur": 27, "args": { "External id": 67812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67812, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67812, "pid": 5, "tid": 7, "ts": 1716454222384399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306716, "dur": 8, "args": { "External id": 67812, "cbid": 211, "correlation": 67812 } }, { "ph": "s", "id": 67812, "pid": 76337, "tid": -914061504, "ts": 1716454222306716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222384428, "dur": 20, "args": { "External id": 67820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67820, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67820, "pid": 5, "tid": 7, "ts": 1716454222384428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306745, "dur": 9, "args": { "External id": 67820, "cbid": 211, "correlation": 67820 } }, { "ph": "s", "id": 67820, "pid": 76337, "tid": -914061504, "ts": 1716454222306745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222384449, "dur": 32, "args": { "External id": 67840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67840, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 67840, "pid": 5, "tid": 7, "ts": 1716454222384449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306826, "dur": 12, "args": { "External id": 67840, "cbid": 211, "correlation": 67840 } }, { "ph": "s", "id": 67840, "pid": 76337, "tid": -914061504, "ts": 1716454222306826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222384483, "dur": 5, "args": { "External id": 67852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67852, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 67852, "pid": 5, "tid": 7, "ts": 1716454222384483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306848, "dur": 9, "args": { "External id": 67852, "cbid": 211, "correlation": 67852 } }, { "ph": "s", "id": 67852, "pid": 76337, "tid": -914061504, "ts": 1716454222306848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222384489, "dur": 33, "args": { "External id": 67855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67855, "pid": 5, "tid": 7, "ts": 1716454222384489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306870, "dur": 7, "args": { "External id": 67855, "cbid": 211, "correlation": 67855 } }, { "ph": "s", "id": 67855, "pid": 76337, "tid": -914061504, "ts": 1716454222306870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222306928, "dur": 0, "args": { "External id": 67866, "cbid": 317, "correlation": 67866 } }, { "ph": "f", "id": 67866, "pid": 76337, "tid": -914061504, "ts": 1716454222306928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222306928, "dur": 0, "args": { "External id": 67867, "cbid": 203, "correlation": 67867 } }, { "ph": "f", "id": 67867, "pid": 76337, "tid": -914061504, "ts": 1716454222306928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222306929, "dur": 0, "args": { "External id": 67868, "cbid": 205, "correlation": 67868 } }, { "ph": "f", "id": 67868, "pid": 76337, "tid": -914061504, "ts": 1716454222306929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222384523, "dur": 23, "args": { "External id": 67872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67872, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67872, "pid": 5, "tid": 7, "ts": 1716454222384523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306943, "dur": 12, "args": { "External id": 67872, "cbid": 211, "correlation": 67872 } }, { "ph": "s", "id": 67872, "pid": 76337, "tid": -914061504, "ts": 1716454222306943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222384548, "dur": 116, "args": { "External id": 67874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67874, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67874, "pid": 5, "tid": 7, "ts": 1716454222384548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306962, "dur": 6, "args": { "External id": 67874, "cbid": 211, "correlation": 67874 } }, { "ph": "s", "id": 67874, "pid": 76337, "tid": -914061504, "ts": 1716454222306962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222384665, "dur": 23, "args": { "External id": 67876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67876, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67876, "pid": 5, "tid": 7, "ts": 1716454222384665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222306972, "dur": 13, "args": { "External id": 67876, "cbid": 211, "correlation": 67876 } }, { "ph": "s", "id": 67876, "pid": 76337, "tid": -914061504, "ts": 1716454222306972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222384689, "dur": 34, "args": { "External id": 67882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67882, "pid": 5, "tid": 7, "ts": 1716454222384689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307008, "dur": 10, "args": { "External id": 67882, "cbid": 211, "correlation": 67882 } }, { "ph": "s", "id": 67882, "pid": 76337, "tid": -914061504, "ts": 1716454222307008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222384725, "dur": 211, "args": { "External id": 67891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67891, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67891, "pid": 5, "tid": 7, "ts": 1716454222384725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307091, "dur": 15, "args": { "External id": 67891, "cbid": 211, "correlation": 67891 } }, { "ph": "s", "id": 67891, "pid": 76337, "tid": -914061504, "ts": 1716454222307091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222384937, "dur": 71, "args": { "External id": 67913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67913, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67913, "pid": 5, "tid": 7, "ts": 1716454222384937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307149, "dur": 10, "args": { "External id": 67913, "cbid": 211, "correlation": 67913 } }, { "ph": "s", "id": 67913, "pid": 76337, "tid": -914061504, "ts": 1716454222307149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307243, "dur": 2, "args": { "External id": 67924, "cbid": 251, "correlation": 67924 } }, { "ph": "f", "id": 67924, "pid": 76337, "tid": -914061504, "ts": 1716454222307243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222385009, "dur": 164, "args": { "External id": 67925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67925, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67925, "pid": 5, "tid": 7, "ts": 1716454222385009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307248, "dur": 13, "args": { "External id": 67925, "cbid": 211, "correlation": 67925 } }, { "ph": "s", "id": 67925, "pid": 76337, "tid": -914061504, "ts": 1716454222307248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307320, "dur": 1, "args": { "External id": 67936, "cbid": 251, "correlation": 67936 } }, { "ph": "f", "id": 67936, "pid": 76337, "tid": -914061504, "ts": 1716454222307320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222385175, "dur": 157, "args": { "External id": 67937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67937, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67937, "pid": 5, "tid": 7, "ts": 1716454222385175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307325, "dur": 11, "args": { "External id": 67937, "cbid": 211, "correlation": 67937 } }, { "ph": "s", "id": 67937, "pid": 76337, "tid": -914061504, "ts": 1716454222307325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307389, "dur": 1, "args": { "External id": 67948, "cbid": 251, "correlation": 67948 } }, { "ph": "f", "id": 67948, "pid": 76337, "tid": -914061504, "ts": 1716454222307389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222385333, "dur": 158, "args": { "External id": 67949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67949, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 67949, "pid": 5, "tid": 7, "ts": 1716454222385333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307393, "dur": 11, "args": { "External id": 67949, "cbid": 211, "correlation": 67949 } }, { "ph": "s", "id": 67949, "pid": 76337, "tid": -914061504, "ts": 1716454222307393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222385493, "dur": 2146, "args": { "External id": 67970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67970, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 67970, "pid": 5, "tid": 7, "ts": 1716454222385493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307478, "dur": 13, "args": { "External id": 67970, "cbid": 211, "correlation": 67970 } }, { "ph": "s", "id": 67970, "pid": 76337, "tid": -914061504, "ts": 1716454222307478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307575, "dur": 2, "args": { "External id": 67988, "cbid": 251, "correlation": 67988 } }, { "ph": "f", "id": 67988, "pid": 76337, "tid": -914061504, "ts": 1716454222307575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222387640, "dur": 160, "args": { "External id": 67990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67990, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 67990, "pid": 5, "tid": 7, "ts": 1716454222387640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307581, "dur": 13, "args": { "External id": 67990, "cbid": 211, "correlation": 67990 } }, { "ph": "s", "id": 67990, "pid": 76337, "tid": -914061504, "ts": 1716454222307581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222387802, "dur": 36, "args": { "External id": 67998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 67998, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 67998, "pid": 5, "tid": 7, "ts": 1716454222387802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307651, "dur": 15, "args": { "External id": 67998, "cbid": 211, "correlation": 67998 } }, { "ph": "s", "id": 67998, "pid": 76337, "tid": -914061504, "ts": 1716454222307651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222387840, "dur": 52, "args": { "External id": 68006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68006, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68006, "pid": 5, "tid": 7, "ts": 1716454222387840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307694, "dur": 9, "args": { "External id": 68006, "cbid": 211, "correlation": 68006 } }, { "ph": "s", "id": 68006, "pid": 76337, "tid": -914061504, "ts": 1716454222307694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222387893, "dur": 32, "args": { "External id": 68017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68017, "pid": 5, "tid": 7, "ts": 1716454222387893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307766, "dur": 12, "args": { "External id": 68017, "cbid": 211, "correlation": 68017 } }, { "ph": "s", "id": 68017, "pid": 76337, "tid": -914061504, "ts": 1716454222307766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222387926, "dur": 37, "args": { "External id": 68039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68039, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68039, "pid": 5, "tid": 7, "ts": 1716454222387926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307797, "dur": 8, "args": { "External id": 68039, "cbid": 211, "correlation": 68039 } }, { "ph": "s", "id": 68039, "pid": 76337, "tid": -914061504, "ts": 1716454222307797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307882, "dur": 1, "args": { "External id": 68050, "cbid": 251, "correlation": 68050 } }, { "ph": "f", "id": 68050, "pid": 76337, "tid": -914061504, "ts": 1716454222307882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222387964, "dur": 99, "args": { "External id": 68051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68051, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68051, "pid": 5, "tid": 7, "ts": 1716454222387964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307887, "dur": 13, "args": { "External id": 68051, "cbid": 211, "correlation": 68051 } }, { "ph": "s", "id": 68051, "pid": 76337, "tid": -914061504, "ts": 1716454222307887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307958, "dur": 1, "args": { "External id": 68062, "cbid": 251, "correlation": 68062 } }, { "ph": "f", "id": 68062, "pid": 76337, "tid": -914061504, "ts": 1716454222307958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222307961, "dur": 0, "args": { "External id": 68063, "cbid": 251, "correlation": 68063 } }, { "ph": "f", "id": 68063, "pid": 76337, "tid": -914061504, "ts": 1716454222307961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222388064, "dur": 11, "args": { "External id": 68064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68064, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 68064, "pid": 5, "tid": 7, "ts": 1716454222388064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307963, "dur": 21, "args": { "External id": 68064, "cbid": 211, "correlation": 68064 } }, { "ph": "s", "id": 68064, "pid": 76337, "tid": -914061504, "ts": 1716454222307963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222388077, "dur": 5, "args": { "External id": 68066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68066, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68066, "pid": 5, "tid": 7, "ts": 1716454222388077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222307986, "dur": 6, "args": { "External id": 68066, "cbid": 211, "correlation": 68066 } }, { "ph": "s", "id": 68066, "pid": 76337, "tid": -914061504, "ts": 1716454222307986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308048, "dur": 1, "args": { "External id": 68077, "cbid": 251, "correlation": 68077 } }, { "ph": "f", "id": 68077, "pid": 76337, "tid": -914061504, "ts": 1716454222308048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308052, "dur": 0, "args": { "External id": 68078, "cbid": 251, "correlation": 68078 } }, { "ph": "f", "id": 68078, "pid": 76337, "tid": -914061504, "ts": 1716454222308052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222388084, "dur": 8, "args": { "External id": 68079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68079, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 68079, "pid": 5, "tid": 7, "ts": 1716454222388084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308054, "dur": 12, "args": { "External id": 68079, "cbid": 211, "correlation": 68079 } }, { "ph": "s", "id": 68079, "pid": 76337, "tid": -914061504, "ts": 1716454222308054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222388093, "dur": 3, "args": { "External id": 68081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68081, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68081, "pid": 5, "tid": 7, "ts": 1716454222388093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308068, "dur": 5, "args": { "External id": 68081, "cbid": 211, "correlation": 68081 } }, { "ph": "s", "id": 68081, "pid": 76337, "tid": -914061504, "ts": 1716454222308068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222388098, "dur": 100, "args": { "External id": 68102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68102, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 68102, "pid": 5, "tid": 7, "ts": 1716454222388098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308141, "dur": 12, "args": { "External id": 68102, "cbid": 211, "correlation": 68102 } }, { "ph": "s", "id": 68102, "pid": 76337, "tid": -914061504, "ts": 1716454222308141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308250, "dur": 1, "args": { "External id": 68120, "cbid": 251, "correlation": 68120 } }, { "ph": "f", "id": 68120, "pid": 76337, "tid": -914061504, "ts": 1716454222308250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222388199, "dur": 107, "args": { "External id": 68122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68122, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68122, "pid": 5, "tid": 7, "ts": 1716454222388199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308257, "dur": 15, "args": { "External id": 68122, "cbid": 211, "correlation": 68122 } }, { "ph": "s", "id": 68122, "pid": 76337, "tid": -914061504, "ts": 1716454222308257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222388307, "dur": 20, "args": { "External id": 68130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68130, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68130, "pid": 5, "tid": 7, "ts": 1716454222388307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308329, "dur": 12, "args": { "External id": 68130, "cbid": 211, "correlation": 68130 } }, { "ph": "s", "id": 68130, "pid": 76337, "tid": -914061504, "ts": 1716454222308329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222388328, "dur": 39, "args": { "External id": 68138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68138, "pid": 5, "tid": 7, "ts": 1716454222388328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308371, "dur": 9, "args": { "External id": 68138, "cbid": 211, "correlation": 68138 } }, { "ph": "s", "id": 68138, "pid": 76337, "tid": -914061504, "ts": 1716454222308371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222388368, "dur": 37, "args": { "External id": 68160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68160, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68160, "pid": 5, "tid": 7, "ts": 1716454222388368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308426, "dur": 10, "args": { "External id": 68160, "cbid": 211, "correlation": 68160 } }, { "ph": "s", "id": 68160, "pid": 76337, "tid": -914061504, "ts": 1716454222308426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308516, "dur": 1, "args": { "External id": 68176, "cbid": 251, "correlation": 68176 } }, { "ph": "f", "id": 68176, "pid": 76337, "tid": -914061504, "ts": 1716454222308516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308522, "dur": 0, "args": { "External id": 68178, "cbid": 251, "correlation": 68178 } }, { "ph": "f", "id": 68178, "pid": 76337, "tid": -914061504, "ts": 1716454222308522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222388407, "dur": 586, "args": { "External id": 68179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68179, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 68179, "pid": 5, "tid": 7, "ts": 1716454222388407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308527, "dur": 13, "args": { "External id": 68179, "cbid": 211, "correlation": 68179 } }, { "ph": "s", "id": 68179, "pid": 76337, "tid": -914061504, "ts": 1716454222308527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222388995, "dur": 133, "args": { "External id": 68187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68187, "pid": 5, "tid": 7, "ts": 1716454222388995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308593, "dur": 13, "args": { "External id": 68187, "cbid": 211, "correlation": 68187 } }, { "ph": "s", "id": 68187, "pid": 76337, "tid": -914061504, "ts": 1716454222308593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222389130, "dur": 134, "args": { "External id": 68195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68195, "pid": 5, "tid": 7, "ts": 1716454222389130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308624, "dur": 8, "args": { "External id": 68195, "cbid": 211, "correlation": 68195 } }, { "ph": "s", "id": 68195, "pid": 76337, "tid": -914061504, "ts": 1716454222308624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222308702, "dur": 1, "args": { "External id": 68211, "cbid": 251, "correlation": 68211 } }, { "ph": "f", "id": 68211, "pid": 76337, "tid": -914061504, "ts": 1716454222308702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222389266, "dur": 332, "args": { "External id": 68213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68213, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68213, "pid": 5, "tid": 7, "ts": 1716454222389266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308708, "dur": 12, "args": { "External id": 68213, "cbid": 211, "correlation": 68213 } }, { "ph": "s", "id": 68213, "pid": 76337, "tid": -914061504, "ts": 1716454222308708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222389599, "dur": 27, "args": { "External id": 68221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68221, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68221, "pid": 5, "tid": 7, "ts": 1716454222389599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308750, "dur": 10, "args": { "External id": 68221, "cbid": 211, "correlation": 68221 } }, { "ph": "s", "id": 68221, "pid": 76337, "tid": -914061504, "ts": 1716454222308750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222389628, "dur": 90, "args": { "External id": 68232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68232, "pid": 5, "tid": 7, "ts": 1716454222389628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308820, "dur": 13, "args": { "External id": 68232, "cbid": 211, "correlation": 68232 } }, { "ph": "s", "id": 68232, "pid": 76337, "tid": -914061504, "ts": 1716454222308820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222308885, "dur": 0, "args": { "External id": 68244, "cbid": 317, "correlation": 68244 } }, { "ph": "f", "id": 68244, "pid": 76337, "tid": -914061504, "ts": 1716454222308885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222308885, "dur": 0, "args": { "External id": 68245, "cbid": 203, "correlation": 68245 } }, { "ph": "f", "id": 68245, "pid": 76337, "tid": -914061504, "ts": 1716454222308885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222308886, "dur": 0, "args": { "External id": 68246, "cbid": 205, "correlation": 68246 } }, { "ph": "f", "id": 68246, "pid": 76337, "tid": -914061504, "ts": 1716454222308886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222389720, "dur": 24, "args": { "External id": 68250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68250, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68250, "pid": 5, "tid": 7, "ts": 1716454222389720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308902, "dur": 12, "args": { "External id": 68250, "cbid": 211, "correlation": 68250 } }, { "ph": "s", "id": 68250, "pid": 76337, "tid": -914061504, "ts": 1716454222308902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222389745, "dur": 133, "args": { "External id": 68252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68252, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68252, "pid": 5, "tid": 7, "ts": 1716454222389745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308920, "dur": 6, "args": { "External id": 68252, "cbid": 211, "correlation": 68252 } }, { "ph": "s", "id": 68252, "pid": 76337, "tid": -914061504, "ts": 1716454222308920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222389879, "dur": 23, "args": { "External id": 68254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68254, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68254, "pid": 5, "tid": 7, "ts": 1716454222389879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308930, "dur": 5, "args": { "External id": 68254, "cbid": 211, "correlation": 68254 } }, { "ph": "s", "id": 68254, "pid": 76337, "tid": -914061504, "ts": 1716454222308930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222389904, "dur": 35, "args": { "External id": 68260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68260, "pid": 5, "tid": 7, "ts": 1716454222389904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222308958, "dur": 8, "args": { "External id": 68260, "cbid": 211, "correlation": 68260 } }, { "ph": "s", "id": 68260, "pid": 76337, "tid": -914061504, "ts": 1716454222308958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222389940, "dur": 27, "args": { "External id": 68268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68268, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68268, "pid": 5, "tid": 7, "ts": 1716454222389940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309000, "dur": 9, "args": { "External id": 68268, "cbid": 211, "correlation": 68268 } }, { "ph": "s", "id": 68268, "pid": 76337, "tid": -914061504, "ts": 1716454222309000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222389969, "dur": 57, "args": { "External id": 68277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68277, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68277, "pid": 5, "tid": 7, "ts": 1716454222389969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309038, "dur": 11, "args": { "External id": 68277, "cbid": 211, "correlation": 68277 } }, { "ph": "s", "id": 68277, "pid": 76337, "tid": -914061504, "ts": 1716454222309038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222390028, "dur": 58, "args": { "External id": 68297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68297, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 68297, "pid": 5, "tid": 7, "ts": 1716454222390028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309114, "dur": 12, "args": { "External id": 68297, "cbid": 211, "correlation": 68297 } }, { "ph": "s", "id": 68297, "pid": 76337, "tid": -914061504, "ts": 1716454222309114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222390087, "dur": 5, "args": { "External id": 68309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68309, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68309, "pid": 5, "tid": 7, "ts": 1716454222390087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309136, "dur": 6, "args": { "External id": 68309, "cbid": 211, "correlation": 68309 } }, { "ph": "s", "id": 68309, "pid": 76337, "tid": -914061504, "ts": 1716454222309136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222390093, "dur": 60, "args": { "External id": 68312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68312, "pid": 5, "tid": 7, "ts": 1716454222390093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309155, "dur": 7, "args": { "External id": 68312, "cbid": 211, "correlation": 68312 } }, { "ph": "s", "id": 68312, "pid": 76337, "tid": -914061504, "ts": 1716454222309155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222390154, "dur": 38, "args": { "External id": 68321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68321, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68321, "pid": 5, "tid": 7, "ts": 1716454222390154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309194, "dur": 10, "args": { "External id": 68321, "cbid": 211, "correlation": 68321 } }, { "ph": "s", "id": 68321, "pid": 76337, "tid": -914061504, "ts": 1716454222309194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222309245, "dur": 0, "args": { "External id": 68331, "cbid": 317, "correlation": 68331 } }, { "ph": "f", "id": 68331, "pid": 76337, "tid": -914061504, "ts": 1716454222309245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222309246, "dur": 0, "args": { "External id": 68332, "cbid": 203, "correlation": 68332 } }, { "ph": "f", "id": 68332, "pid": 76337, "tid": -914061504, "ts": 1716454222309246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222309247, "dur": 0, "args": { "External id": 68333, "cbid": 205, "correlation": 68333 } }, { "ph": "f", "id": 68333, "pid": 76337, "tid": -914061504, "ts": 1716454222309247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222390194, "dur": 40, "args": { "External id": 68337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68337, "pid": 5, "tid": 7, "ts": 1716454222390194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309262, "dur": 11, "args": { "External id": 68337, "cbid": 211, "correlation": 68337 } }, { "ph": "s", "id": 68337, "pid": 76337, "tid": -914061504, "ts": 1716454222309262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222390236, "dur": 91, "args": { "External id": 68339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68339, "pid": 5, "tid": 7, "ts": 1716454222390236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309276, "dur": 5, "args": { "External id": 68339, "cbid": 211, "correlation": 68339 } }, { "ph": "s", "id": 68339, "pid": 76337, "tid": -914061504, "ts": 1716454222309276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222390329, "dur": 1404, "args": { "External id": 68341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68341, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68341, "pid": 5, "tid": 7, "ts": 1716454222390329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309287, "dur": 6, "args": { "External id": 68341, "cbid": 211, "correlation": 68341 } }, { "ph": "s", "id": 68341, "pid": 76337, "tid": -914061504, "ts": 1716454222309287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222391734, "dur": 22, "args": { "External id": 68343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68343, "pid": 5, "tid": 7, "ts": 1716454222391734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309297, "dur": 6, "args": { "External id": 68343, "cbid": 211, "correlation": 68343 } }, { "ph": "s", "id": 68343, "pid": 76337, "tid": -914061504, "ts": 1716454222309297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222391758, "dur": 35, "args": { "External id": 68349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68349, "pid": 5, "tid": 7, "ts": 1716454222391758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309325, "dur": 8, "args": { "External id": 68349, "cbid": 211, "correlation": 68349 } }, { "ph": "s", "id": 68349, "pid": 76337, "tid": -914061504, "ts": 1716454222309325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222391795, "dur": 4, "args": { "External id": 68357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68357, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 68357, "pid": 5, "tid": 7, "ts": 1716454222391795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309368, "dur": 12, "args": { "External id": 68357, "cbid": 211, "correlation": 68357 } }, { "ph": "s", "id": 68357, "pid": 76337, "tid": -914061504, "ts": 1716454222309368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222309436, "dur": 1, "args": { "External id": 68373, "cbid": 251, "correlation": 68373 } }, { "ph": "f", "id": 68373, "pid": 76337, "tid": -914061504, "ts": 1716454222309436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222309441, "dur": 0, "args": { "External id": 68375, "cbid": 251, "correlation": 68375 } }, { "ph": "f", "id": 68375, "pid": 76337, "tid": -914061504, "ts": 1716454222309441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222391800, "dur": 13, "args": { "External id": 68376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68376, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 68376, "pid": 5, "tid": 7, "ts": 1716454222391800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309443, "dur": 11, "args": { "External id": 68376, "cbid": 211, "correlation": 68376 } }, { "ph": "s", "id": 68376, "pid": 76337, "tid": -914061504, "ts": 1716454222309443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222391815, "dur": 5, "args": { "External id": 68378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68378, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68378, "pid": 5, "tid": 7, "ts": 1716454222391815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309456, "dur": 6, "args": { "External id": 68378, "cbid": 211, "correlation": 68378 } }, { "ph": "s", "id": 68378, "pid": 76337, "tid": -914061504, "ts": 1716454222309456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222391822, "dur": 31, "args": { "External id": 68388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68388, "pid": 5, "tid": 7, "ts": 1716454222391822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309514, "dur": 13, "args": { "External id": 68388, "cbid": 211, "correlation": 68388 } }, { "ph": "s", "id": 68388, "pid": 76337, "tid": -914061504, "ts": 1716454222309514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222391854, "dur": 33, "args": { "External id": 68408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68408, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 68408, "pid": 5, "tid": 7, "ts": 1716454222391854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309581, "dur": 11, "args": { "External id": 68408, "cbid": 211, "correlation": 68408 } }, { "ph": "s", "id": 68408, "pid": 76337, "tid": -914061504, "ts": 1716454222309581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222391889, "dur": 4, "args": { "External id": 68420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68420, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 68420, "pid": 5, "tid": 7, "ts": 1716454222391889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309601, "dur": 6, "args": { "External id": 68420, "cbid": 211, "correlation": 68420 } }, { "ph": "s", "id": 68420, "pid": 76337, "tid": -914061504, "ts": 1716454222309601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222391894, "dur": 31, "args": { "External id": 68423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68423, "pid": 5, "tid": 7, "ts": 1716454222391894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309618, "dur": 6, "args": { "External id": 68423, "cbid": 211, "correlation": 68423 } }, { "ph": "s", "id": 68423, "pid": 76337, "tid": -914061504, "ts": 1716454222309618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222391927, "dur": 21, "args": { "External id": 68432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68432, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68432, "pid": 5, "tid": 7, "ts": 1716454222391927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309661, "dur": 10, "args": { "External id": 68432, "cbid": 211, "correlation": 68432 } }, { "ph": "s", "id": 68432, "pid": 76337, "tid": -914061504, "ts": 1716454222309661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222309724, "dur": 0, "args": { "External id": 68442, "cbid": 317, "correlation": 68442 } }, { "ph": "f", "id": 68442, "pid": 76337, "tid": -914061504, "ts": 1716454222309724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222309725, "dur": 0, "args": { "External id": 68443, "cbid": 203, "correlation": 68443 } }, { "ph": "f", "id": 68443, "pid": 76337, "tid": -914061504, "ts": 1716454222309725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222309726, "dur": 0, "args": { "External id": 68444, "cbid": 205, "correlation": 68444 } }, { "ph": "f", "id": 68444, "pid": 76337, "tid": -914061504, "ts": 1716454222309726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222391949, "dur": 22, "args": { "External id": 68448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68448, "pid": 5, "tid": 7, "ts": 1716454222391949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309740, "dur": 12, "args": { "External id": 68448, "cbid": 211, "correlation": 68448 } }, { "ph": "s", "id": 68448, "pid": 76337, "tid": -914061504, "ts": 1716454222309740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222391973, "dur": 48, "args": { "External id": 68450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68450, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68450, "pid": 5, "tid": 7, "ts": 1716454222391973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309755, "dur": 5, "args": { "External id": 68450, "cbid": 211, "correlation": 68450 } }, { "ph": "s", "id": 68450, "pid": 76337, "tid": -914061504, "ts": 1716454222309755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222392023, "dur": 710, "args": { "External id": 68452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68452, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68452, "pid": 5, "tid": 7, "ts": 1716454222392023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309766, "dur": 6, "args": { "External id": 68452, "cbid": 211, "correlation": 68452 } }, { "ph": "s", "id": 68452, "pid": 76337, "tid": -914061504, "ts": 1716454222309766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222392734, "dur": 23, "args": { "External id": 68454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68454, "pid": 5, "tid": 7, "ts": 1716454222392734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309775, "dur": 5, "args": { "External id": 68454, "cbid": 211, "correlation": 68454 } }, { "ph": "s", "id": 68454, "pid": 76337, "tid": -914061504, "ts": 1716454222309775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222392758, "dur": 34, "args": { "External id": 68460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68460, "pid": 5, "tid": 7, "ts": 1716454222392758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309804, "dur": 8, "args": { "External id": 68460, "cbid": 211, "correlation": 68460 } }, { "ph": "s", "id": 68460, "pid": 76337, "tid": -914061504, "ts": 1716454222309804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222309863, "dur": 0, "args": { "External id": 68470, "cbid": 317, "correlation": 68470 } }, { "ph": "f", "id": 68470, "pid": 76337, "tid": -914061504, "ts": 1716454222309863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222309863, "dur": 0, "args": { "External id": 68471, "cbid": 203, "correlation": 68471 } }, { "ph": "f", "id": 68471, "pid": 76337, "tid": -914061504, "ts": 1716454222309863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222309864, "dur": 0, "args": { "External id": 68472, "cbid": 205, "correlation": 68472 } }, { "ph": "f", "id": 68472, "pid": 76337, "tid": -914061504, "ts": 1716454222309864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222392794, "dur": 40, "args": { "External id": 68476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68476, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68476, "pid": 5, "tid": 7, "ts": 1716454222392794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309877, "dur": 11, "args": { "External id": 68476, "cbid": 211, "correlation": 68476 } }, { "ph": "s", "id": 68476, "pid": 76337, "tid": -914061504, "ts": 1716454222309877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222392835, "dur": 209, "args": { "External id": 68478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68478, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68478, "pid": 5, "tid": 7, "ts": 1716454222392835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309894, "dur": 6, "args": { "External id": 68478, "cbid": 211, "correlation": 68478 } }, { "ph": "s", "id": 68478, "pid": 76337, "tid": -914061504, "ts": 1716454222309894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222393045, "dur": 23, "args": { "External id": 68480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68480, "pid": 5, "tid": 7, "ts": 1716454222393045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309903, "dur": 6, "args": { "External id": 68480, "cbid": 211, "correlation": 68480 } }, { "ph": "s", "id": 68480, "pid": 76337, "tid": -914061504, "ts": 1716454222309903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222393069, "dur": 34, "args": { "External id": 68486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68486, "pid": 5, "tid": 7, "ts": 1716454222393069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309929, "dur": 8, "args": { "External id": 68486, "cbid": 211, "correlation": 68486 } }, { "ph": "s", "id": 68486, "pid": 76337, "tid": -914061504, "ts": 1716454222309929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222393105, "dur": 27, "args": { "External id": 68494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68494, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68494, "pid": 5, "tid": 7, "ts": 1716454222393105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309962, "dur": 8, "args": { "External id": 68494, "cbid": 211, "correlation": 68494 } }, { "ph": "s", "id": 68494, "pid": 76337, "tid": -914061504, "ts": 1716454222309962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222393133, "dur": 20, "args": { "External id": 68502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68502, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68502, "pid": 5, "tid": 7, "ts": 1716454222393133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222309999, "dur": 9, "args": { "External id": 68502, "cbid": 211, "correlation": 68502 } }, { "ph": "s", "id": 68502, "pid": 76337, "tid": -914061504, "ts": 1716454222309999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222393155, "dur": 32, "args": { "External id": 68522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68522, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 68522, "pid": 5, "tid": 7, "ts": 1716454222393155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310082, "dur": 12, "args": { "External id": 68522, "cbid": 211, "correlation": 68522 } }, { "ph": "s", "id": 68522, "pid": 76337, "tid": -914061504, "ts": 1716454222310082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222393189, "dur": 4, "args": { "External id": 68534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68534, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 68534, "pid": 5, "tid": 7, "ts": 1716454222393189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310104, "dur": 6, "args": { "External id": 68534, "cbid": 211, "correlation": 68534 } }, { "ph": "s", "id": 68534, "pid": 76337, "tid": -914061504, "ts": 1716454222310104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222393195, "dur": 32, "args": { "External id": 68537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68537, "pid": 5, "tid": 7, "ts": 1716454222393195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310123, "dur": 6, "args": { "External id": 68537, "cbid": 211, "correlation": 68537 } }, { "ph": "s", "id": 68537, "pid": 76337, "tid": -914061504, "ts": 1716454222310123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222310180, "dur": 0, "args": { "External id": 68548, "cbid": 317, "correlation": 68548 } }, { "ph": "f", "id": 68548, "pid": 76337, "tid": -914061504, "ts": 1716454222310180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222310181, "dur": 0, "args": { "External id": 68549, "cbid": 203, "correlation": 68549 } }, { "ph": "f", "id": 68549, "pid": 76337, "tid": -914061504, "ts": 1716454222310181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222310182, "dur": 0, "args": { "External id": 68550, "cbid": 205, "correlation": 68550 } }, { "ph": "f", "id": 68550, "pid": 76337, "tid": -914061504, "ts": 1716454222310182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222393228, "dur": 22, "args": { "External id": 68554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68554, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68554, "pid": 5, "tid": 7, "ts": 1716454222393228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310194, "dur": 11, "args": { "External id": 68554, "cbid": 211, "correlation": 68554 } }, { "ph": "s", "id": 68554, "pid": 76337, "tid": -914061504, "ts": 1716454222310194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222393251, "dur": 116, "args": { "External id": 68556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68556, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68556, "pid": 5, "tid": 7, "ts": 1716454222393251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310211, "dur": 6, "args": { "External id": 68556, "cbid": 211, "correlation": 68556 } }, { "ph": "s", "id": 68556, "pid": 76337, "tid": -914061504, "ts": 1716454222310211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222393369, "dur": 23, "args": { "External id": 68558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68558, "pid": 5, "tid": 7, "ts": 1716454222393369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310221, "dur": 8, "args": { "External id": 68558, "cbid": 211, "correlation": 68558 } }, { "ph": "s", "id": 68558, "pid": 76337, "tid": -914061504, "ts": 1716454222310221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222393394, "dur": 34, "args": { "External id": 68564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68564, "pid": 5, "tid": 7, "ts": 1716454222393394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310252, "dur": 9, "args": { "External id": 68564, "cbid": 211, "correlation": 68564 } }, { "ph": "s", "id": 68564, "pid": 76337, "tid": -914061504, "ts": 1716454222310252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222393429, "dur": 186, "args": { "External id": 68573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68573, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68573, "pid": 5, "tid": 7, "ts": 1716454222393429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310335, "dur": 14, "args": { "External id": 68573, "cbid": 211, "correlation": 68573 } }, { "ph": "s", "id": 68573, "pid": 76337, "tid": -914061504, "ts": 1716454222310335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222393617, "dur": 71, "args": { "External id": 68595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68595, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68595, "pid": 5, "tid": 7, "ts": 1716454222393617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310392, "dur": 10, "args": { "External id": 68595, "cbid": 211, "correlation": 68595 } }, { "ph": "s", "id": 68595, "pid": 76337, "tid": -914061504, "ts": 1716454222310392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222310481, "dur": 1, "args": { "External id": 68606, "cbid": 251, "correlation": 68606 } }, { "ph": "f", "id": 68606, "pid": 76337, "tid": -914061504, "ts": 1716454222310481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222393689, "dur": 165, "args": { "External id": 68607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68607, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68607, "pid": 5, "tid": 7, "ts": 1716454222393689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310487, "dur": 13, "args": { "External id": 68607, "cbid": 211, "correlation": 68607 } }, { "ph": "s", "id": 68607, "pid": 76337, "tid": -914061504, "ts": 1716454222310487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222310557, "dur": 1, "args": { "External id": 68618, "cbid": 251, "correlation": 68618 } }, { "ph": "f", "id": 68618, "pid": 76337, "tid": -914061504, "ts": 1716454222310557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222393855, "dur": 158, "args": { "External id": 68619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68619, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68619, "pid": 5, "tid": 7, "ts": 1716454222393855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310561, "dur": 11, "args": { "External id": 68619, "cbid": 211, "correlation": 68619 } }, { "ph": "s", "id": 68619, "pid": 76337, "tid": -914061504, "ts": 1716454222310561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222310626, "dur": 1, "args": { "External id": 68630, "cbid": 251, "correlation": 68630 } }, { "ph": "f", "id": 68630, "pid": 76337, "tid": -914061504, "ts": 1716454222310626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222394015, "dur": 156, "args": { "External id": 68631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68631, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68631, "pid": 5, "tid": 7, "ts": 1716454222394015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310630, "dur": 14, "args": { "External id": 68631, "cbid": 211, "correlation": 68631 } }, { "ph": "s", "id": 68631, "pid": 76337, "tid": -914061504, "ts": 1716454222310630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222394172, "dur": 2143, "args": { "External id": 68652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68652, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 68652, "pid": 5, "tid": 7, "ts": 1716454222394172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310713, "dur": 12, "args": { "External id": 68652, "cbid": 211, "correlation": 68652 } }, { "ph": "s", "id": 68652, "pid": 76337, "tid": -914061504, "ts": 1716454222310713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222310810, "dur": 1, "args": { "External id": 68670, "cbid": 251, "correlation": 68670 } }, { "ph": "f", "id": 68670, "pid": 76337, "tid": -914061504, "ts": 1716454222310810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222396317, "dur": 160, "args": { "External id": 68672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68672, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 68672, "pid": 5, "tid": 7, "ts": 1716454222396317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310815, "dur": 14, "args": { "External id": 68672, "cbid": 211, "correlation": 68672 } }, { "ph": "s", "id": 68672, "pid": 76337, "tid": -914061504, "ts": 1716454222310815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222396479, "dur": 35, "args": { "External id": 68680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68680, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68680, "pid": 5, "tid": 7, "ts": 1716454222396479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310885, "dur": 12, "args": { "External id": 68680, "cbid": 211, "correlation": 68680 } }, { "ph": "s", "id": 68680, "pid": 76337, "tid": -914061504, "ts": 1716454222310885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222396515, "dur": 51, "args": { "External id": 68688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68688, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68688, "pid": 5, "tid": 7, "ts": 1716454222396515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222310923, "dur": 9, "args": { "External id": 68688, "cbid": 211, "correlation": 68688 } }, { "ph": "s", "id": 68688, "pid": 76337, "tid": -914061504, "ts": 1716454222310923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222396568, "dur": 31, "args": { "External id": 68699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68699, "pid": 5, "tid": 7, "ts": 1716454222396568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311003, "dur": 13, "args": { "External id": 68699, "cbid": 211, "correlation": 68699 } }, { "ph": "s", "id": 68699, "pid": 76337, "tid": -914061504, "ts": 1716454222311003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222396600, "dur": 38, "args": { "External id": 68721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68721, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68721, "pid": 5, "tid": 7, "ts": 1716454222396600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311037, "dur": 9, "args": { "External id": 68721, "cbid": 211, "correlation": 68721 } }, { "ph": "s", "id": 68721, "pid": 76337, "tid": -914061504, "ts": 1716454222311037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311123, "dur": 1, "args": { "External id": 68732, "cbid": 251, "correlation": 68732 } }, { "ph": "f", "id": 68732, "pid": 76337, "tid": -914061504, "ts": 1716454222311123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222396640, "dur": 99, "args": { "External id": 68733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68733, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68733, "pid": 5, "tid": 7, "ts": 1716454222396640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311128, "dur": 13, "args": { "External id": 68733, "cbid": 211, "correlation": 68733 } }, { "ph": "s", "id": 68733, "pid": 76337, "tid": -914061504, "ts": 1716454222311128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311195, "dur": 1, "args": { "External id": 68744, "cbid": 251, "correlation": 68744 } }, { "ph": "f", "id": 68744, "pid": 76337, "tid": -914061504, "ts": 1716454222311195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311199, "dur": 0, "args": { "External id": 68745, "cbid": 251, "correlation": 68745 } }, { "ph": "f", "id": 68745, "pid": 76337, "tid": -914061504, "ts": 1716454222311199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222396740, "dur": 12, "args": { "External id": 68746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68746, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 68746, "pid": 5, "tid": 7, "ts": 1716454222396740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311201, "dur": 13, "args": { "External id": 68746, "cbid": 211, "correlation": 68746 } }, { "ph": "s", "id": 68746, "pid": 76337, "tid": -914061504, "ts": 1716454222311201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222396753, "dur": 5, "args": { "External id": 68748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68748, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68748, "pid": 5, "tid": 7, "ts": 1716454222396753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311215, "dur": 6, "args": { "External id": 68748, "cbid": 211, "correlation": 68748 } }, { "ph": "s", "id": 68748, "pid": 76337, "tid": -914061504, "ts": 1716454222311215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311273, "dur": 1, "args": { "External id": 68759, "cbid": 251, "correlation": 68759 } }, { "ph": "f", "id": 68759, "pid": 76337, "tid": -914061504, "ts": 1716454222311273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311276, "dur": 0, "args": { "External id": 68760, "cbid": 251, "correlation": 68760 } }, { "ph": "f", "id": 68760, "pid": 76337, "tid": -914061504, "ts": 1716454222311276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222396760, "dur": 8, "args": { "External id": 68761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68761, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 68761, "pid": 5, "tid": 7, "ts": 1716454222396760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311277, "dur": 11, "args": { "External id": 68761, "cbid": 211, "correlation": 68761 } }, { "ph": "s", "id": 68761, "pid": 76337, "tid": -914061504, "ts": 1716454222311277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222396769, "dur": 3, "args": { "External id": 68763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68763, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 68763, "pid": 5, "tid": 7, "ts": 1716454222396769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311290, "dur": 6, "args": { "External id": 68763, "cbid": 211, "correlation": 68763 } }, { "ph": "s", "id": 68763, "pid": 76337, "tid": -914061504, "ts": 1716454222311290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222396774, "dur": 99, "args": { "External id": 68784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68784, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 68784, "pid": 5, "tid": 7, "ts": 1716454222396774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311363, "dur": 27, "args": { "External id": 68784, "cbid": 211, "correlation": 68784 } }, { "ph": "s", "id": 68784, "pid": 76337, "tid": -914061504, "ts": 1716454222311363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311476, "dur": 1, "args": { "External id": 68802, "cbid": 251, "correlation": 68802 } }, { "ph": "f", "id": 68802, "pid": 76337, "tid": -914061504, "ts": 1716454222311476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222396875, "dur": 103, "args": { "External id": 68804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68804, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68804, "pid": 5, "tid": 7, "ts": 1716454222396875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311482, "dur": 14, "args": { "External id": 68804, "cbid": 211, "correlation": 68804 } }, { "ph": "s", "id": 68804, "pid": 76337, "tid": -914061504, "ts": 1716454222311482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222396980, "dur": 20, "args": { "External id": 68812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68812, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68812, "pid": 5, "tid": 7, "ts": 1716454222396980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311552, "dur": 12, "args": { "External id": 68812, "cbid": 211, "correlation": 68812 } }, { "ph": "s", "id": 68812, "pid": 76337, "tid": -914061504, "ts": 1716454222311552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222397001, "dur": 39, "args": { "External id": 68820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68820, "pid": 5, "tid": 7, "ts": 1716454222397001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311593, "dur": 10, "args": { "External id": 68820, "cbid": 211, "correlation": 68820 } }, { "ph": "s", "id": 68820, "pid": 76337, "tid": -914061504, "ts": 1716454222311593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222397042, "dur": 38, "args": { "External id": 68842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68842, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68842, "pid": 5, "tid": 7, "ts": 1716454222397042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311644, "dur": 11, "args": { "External id": 68842, "cbid": 211, "correlation": 68842 } }, { "ph": "s", "id": 68842, "pid": 76337, "tid": -914061504, "ts": 1716454222311644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311734, "dur": 1, "args": { "External id": 68858, "cbid": 251, "correlation": 68858 } }, { "ph": "f", "id": 68858, "pid": 76337, "tid": -914061504, "ts": 1716454222311734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311739, "dur": 0, "args": { "External id": 68860, "cbid": 251, "correlation": 68860 } }, { "ph": "f", "id": 68860, "pid": 76337, "tid": -914061504, "ts": 1716454222311739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222397081, "dur": 588, "args": { "External id": 68861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68861, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 68861, "pid": 5, "tid": 7, "ts": 1716454222397081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311743, "dur": 13, "args": { "External id": 68861, "cbid": 211, "correlation": 68861 } }, { "ph": "s", "id": 68861, "pid": 76337, "tid": -914061504, "ts": 1716454222311743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222397671, "dur": 134, "args": { "External id": 68869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68869, "pid": 5, "tid": 7, "ts": 1716454222397671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311811, "dur": 13, "args": { "External id": 68869, "cbid": 211, "correlation": 68869 } }, { "ph": "s", "id": 68869, "pid": 76337, "tid": -914061504, "ts": 1716454222311811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222397806, "dur": 133, "args": { "External id": 68877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68877, "pid": 5, "tid": 7, "ts": 1716454222397806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311842, "dur": 8, "args": { "External id": 68877, "cbid": 211, "correlation": 68877 } }, { "ph": "s", "id": 68877, "pid": 76337, "tid": -914061504, "ts": 1716454222311842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222311919, "dur": 1, "args": { "External id": 68893, "cbid": 251, "correlation": 68893 } }, { "ph": "f", "id": 68893, "pid": 76337, "tid": -914061504, "ts": 1716454222311919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222397941, "dur": 331, "args": { "External id": 68895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68895, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68895, "pid": 5, "tid": 7, "ts": 1716454222397941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311925, "dur": 13, "args": { "External id": 68895, "cbid": 211, "correlation": 68895 } }, { "ph": "s", "id": 68895, "pid": 76337, "tid": -914061504, "ts": 1716454222311925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222398274, "dur": 28, "args": { "External id": 68903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68903, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68903, "pid": 5, "tid": 7, "ts": 1716454222398274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222311967, "dur": 19, "args": { "External id": 68903, "cbid": 211, "correlation": 68903 } }, { "ph": "s", "id": 68903, "pid": 76337, "tid": -914061504, "ts": 1716454222311967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222398303, "dur": 90, "args": { "External id": 68914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68914, "pid": 5, "tid": 7, "ts": 1716454222398303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312044, "dur": 12, "args": { "External id": 68914, "cbid": 211, "correlation": 68914 } }, { "ph": "s", "id": 68914, "pid": 76337, "tid": -914061504, "ts": 1716454222312044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222312109, "dur": 0, "args": { "External id": 68926, "cbid": 317, "correlation": 68926 } }, { "ph": "f", "id": 68926, "pid": 76337, "tid": -914061504, "ts": 1716454222312109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222312110, "dur": 0, "args": { "External id": 68927, "cbid": 203, "correlation": 68927 } }, { "ph": "f", "id": 68927, "pid": 76337, "tid": -914061504, "ts": 1716454222312110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222312111, "dur": 0, "args": { "External id": 68928, "cbid": 205, "correlation": 68928 } }, { "ph": "f", "id": 68928, "pid": 76337, "tid": -914061504, "ts": 1716454222312111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222398394, "dur": 26, "args": { "External id": 68932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68932, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68932, "pid": 5, "tid": 7, "ts": 1716454222398394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312126, "dur": 13, "args": { "External id": 68932, "cbid": 211, "correlation": 68932 } }, { "ph": "s", "id": 68932, "pid": 76337, "tid": -914061504, "ts": 1716454222312126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222398422, "dur": 132, "args": { "External id": 68934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68934, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 68934, "pid": 5, "tid": 7, "ts": 1716454222398422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312145, "dur": 6, "args": { "External id": 68934, "cbid": 211, "correlation": 68934 } }, { "ph": "s", "id": 68934, "pid": 76337, "tid": -914061504, "ts": 1716454222312145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222398556, "dur": 24, "args": { "External id": 68936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68936, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68936, "pid": 5, "tid": 7, "ts": 1716454222398556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312158, "dur": 6, "args": { "External id": 68936, "cbid": 211, "correlation": 68936 } }, { "ph": "s", "id": 68936, "pid": 76337, "tid": -914061504, "ts": 1716454222312158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222398581, "dur": 36, "args": { "External id": 68942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68942, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68942, "pid": 5, "tid": 7, "ts": 1716454222398581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312186, "dur": 10, "args": { "External id": 68942, "cbid": 211, "correlation": 68942 } }, { "ph": "s", "id": 68942, "pid": 76337, "tid": -914061504, "ts": 1716454222312186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222398618, "dur": 27, "args": { "External id": 68950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68950, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68950, "pid": 5, "tid": 7, "ts": 1716454222398618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312219, "dur": 8, "args": { "External id": 68950, "cbid": 211, "correlation": 68950 } }, { "ph": "s", "id": 68950, "pid": 76337, "tid": -914061504, "ts": 1716454222312219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222398646, "dur": 45, "args": { "External id": 68959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68959, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68959, "pid": 5, "tid": 7, "ts": 1716454222398646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312258, "dur": 11, "args": { "External id": 68959, "cbid": 211, "correlation": 68959 } }, { "ph": "s", "id": 68959, "pid": 76337, "tid": -914061504, "ts": 1716454222312258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222398692, "dur": 44, "args": { "External id": 68979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68979, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 68979, "pid": 5, "tid": 7, "ts": 1716454222398692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312330, "dur": 11, "args": { "External id": 68979, "cbid": 211, "correlation": 68979 } }, { "ph": "s", "id": 68979, "pid": 76337, "tid": -914061504, "ts": 1716454222312330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222398738, "dur": 5, "args": { "External id": 68991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68991, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 68991, "pid": 5, "tid": 7, "ts": 1716454222398738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312352, "dur": 7, "args": { "External id": 68991, "cbid": 211, "correlation": 68991 } }, { "ph": "s", "id": 68991, "pid": 76337, "tid": -914061504, "ts": 1716454222312352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222398745, "dur": 47, "args": { "External id": 68994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 68994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 68994, "pid": 5, "tid": 7, "ts": 1716454222398745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312370, "dur": 6, "args": { "External id": 68994, "cbid": 211, "correlation": 68994 } }, { "ph": "s", "id": 68994, "pid": 76337, "tid": -914061504, "ts": 1716454222312370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222398793, "dur": 30, "args": { "External id": 69003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69003, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69003, "pid": 5, "tid": 7, "ts": 1716454222398793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312409, "dur": 13, "args": { "External id": 69003, "cbid": 211, "correlation": 69003 } }, { "ph": "s", "id": 69003, "pid": 76337, "tid": -914061504, "ts": 1716454222312409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222312464, "dur": 0, "args": { "External id": 69013, "cbid": 317, "correlation": 69013 } }, { "ph": "f", "id": 69013, "pid": 76337, "tid": -914061504, "ts": 1716454222312464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222312465, "dur": 0, "args": { "External id": 69014, "cbid": 203, "correlation": 69014 } }, { "ph": "f", "id": 69014, "pid": 76337, "tid": -914061504, "ts": 1716454222312465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222312466, "dur": 0, "args": { "External id": 69015, "cbid": 205, "correlation": 69015 } }, { "ph": "f", "id": 69015, "pid": 76337, "tid": -914061504, "ts": 1716454222312466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222398824, "dur": 30, "args": { "External id": 69019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69019, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69019, "pid": 5, "tid": 7, "ts": 1716454222398824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312482, "dur": 12, "args": { "External id": 69019, "cbid": 211, "correlation": 69019 } }, { "ph": "s", "id": 69019, "pid": 76337, "tid": -914061504, "ts": 1716454222312482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222398856, "dur": 69, "args": { "External id": 69021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69021, "pid": 5, "tid": 7, "ts": 1716454222398856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312496, "dur": 5, "args": { "External id": 69021, "cbid": 211, "correlation": 69021 } }, { "ph": "s", "id": 69021, "pid": 76337, "tid": -914061504, "ts": 1716454222312496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222398926, "dur": 1065, "args": { "External id": 69023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69023, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69023, "pid": 5, "tid": 7, "ts": 1716454222398926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312507, "dur": 6, "args": { "External id": 69023, "cbid": 211, "correlation": 69023 } }, { "ph": "s", "id": 69023, "pid": 76337, "tid": -914061504, "ts": 1716454222312507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222399992, "dur": 22, "args": { "External id": 69025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69025, "pid": 5, "tid": 7, "ts": 1716454222399992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312516, "dur": 5, "args": { "External id": 69025, "cbid": 211, "correlation": 69025 } }, { "ph": "s", "id": 69025, "pid": 76337, "tid": -914061504, "ts": 1716454222312516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222400016, "dur": 35, "args": { "External id": 69031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69031, "pid": 5, "tid": 7, "ts": 1716454222400016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312545, "dur": 8, "args": { "External id": 69031, "cbid": 211, "correlation": 69031 } }, { "ph": "s", "id": 69031, "pid": 76337, "tid": -914061504, "ts": 1716454222312545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222400052, "dur": 4, "args": { "External id": 69039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69039, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69039, "pid": 5, "tid": 7, "ts": 1716454222400052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312589, "dur": 9, "args": { "External id": 69039, "cbid": 211, "correlation": 69039 } }, { "ph": "s", "id": 69039, "pid": 76337, "tid": -914061504, "ts": 1716454222312589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222312653, "dur": 1, "args": { "External id": 69055, "cbid": 251, "correlation": 69055 } }, { "ph": "f", "id": 69055, "pid": 76337, "tid": -914061504, "ts": 1716454222312653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222312658, "dur": 0, "args": { "External id": 69057, "cbid": 251, "correlation": 69057 } }, { "ph": "f", "id": 69057, "pid": 76337, "tid": -914061504, "ts": 1716454222312658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222400057, "dur": 13, "args": { "External id": 69058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69058, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 69058, "pid": 5, "tid": 7, "ts": 1716454222400057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312660, "dur": 11, "args": { "External id": 69058, "cbid": 211, "correlation": 69058 } }, { "ph": "s", "id": 69058, "pid": 76337, "tid": -914061504, "ts": 1716454222312660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222400071, "dur": 5, "args": { "External id": 69060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69060, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 69060, "pid": 5, "tid": 7, "ts": 1716454222400071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312673, "dur": 5, "args": { "External id": 69060, "cbid": 211, "correlation": 69060 } }, { "ph": "s", "id": 69060, "pid": 76337, "tid": -914061504, "ts": 1716454222312673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222400078, "dur": 31, "args": { "External id": 69070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69070, "pid": 5, "tid": 7, "ts": 1716454222400078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312730, "dur": 15, "args": { "External id": 69070, "cbid": 211, "correlation": 69070 } }, { "ph": "s", "id": 69070, "pid": 76337, "tid": -914061504, "ts": 1716454222312730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222400111, "dur": 33, "args": { "External id": 69090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69090, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 69090, "pid": 5, "tid": 7, "ts": 1716454222400111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312798, "dur": 11, "args": { "External id": 69090, "cbid": 211, "correlation": 69090 } }, { "ph": "s", "id": 69090, "pid": 76337, "tid": -914061504, "ts": 1716454222312798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222400145, "dur": 4, "args": { "External id": 69102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69102, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 69102, "pid": 5, "tid": 7, "ts": 1716454222400145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312819, "dur": 6, "args": { "External id": 69102, "cbid": 211, "correlation": 69102 } }, { "ph": "s", "id": 69102, "pid": 76337, "tid": -914061504, "ts": 1716454222312819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222400151, "dur": 32, "args": { "External id": 69105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69105, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69105, "pid": 5, "tid": 7, "ts": 1716454222400151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312837, "dur": 7, "args": { "External id": 69105, "cbid": 211, "correlation": 69105 } }, { "ph": "s", "id": 69105, "pid": 76337, "tid": -914061504, "ts": 1716454222312837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222400184, "dur": 22, "args": { "External id": 69114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69114, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69114, "pid": 5, "tid": 7, "ts": 1716454222400184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312878, "dur": 10, "args": { "External id": 69114, "cbid": 211, "correlation": 69114 } }, { "ph": "s", "id": 69114, "pid": 76337, "tid": -914061504, "ts": 1716454222312878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222312939, "dur": 0, "args": { "External id": 69124, "cbid": 317, "correlation": 69124 } }, { "ph": "f", "id": 69124, "pid": 76337, "tid": -914061504, "ts": 1716454222312939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222312940, "dur": 0, "args": { "External id": 69125, "cbid": 203, "correlation": 69125 } }, { "ph": "f", "id": 69125, "pid": 76337, "tid": -914061504, "ts": 1716454222312940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222312940, "dur": 0, "args": { "External id": 69126, "cbid": 205, "correlation": 69126 } }, { "ph": "f", "id": 69126, "pid": 76337, "tid": -914061504, "ts": 1716454222312940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222400207, "dur": 24, "args": { "External id": 69130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69130, "pid": 5, "tid": 7, "ts": 1716454222400207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312954, "dur": 12, "args": { "External id": 69130, "cbid": 211, "correlation": 69130 } }, { "ph": "s", "id": 69130, "pid": 76337, "tid": -914061504, "ts": 1716454222312954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222400233, "dur": 48, "args": { "External id": 69132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69132, "pid": 5, "tid": 7, "ts": 1716454222400233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312969, "dur": 13, "args": { "External id": 69132, "cbid": 211, "correlation": 69132 } }, { "ph": "s", "id": 69132, "pid": 76337, "tid": -914061504, "ts": 1716454222312969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222400282, "dur": 709, "args": { "External id": 69134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69134, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69134, "pid": 5, "tid": 7, "ts": 1716454222400282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312988, "dur": 6, "args": { "External id": 69134, "cbid": 211, "correlation": 69134 } }, { "ph": "s", "id": 69134, "pid": 76337, "tid": -914061504, "ts": 1716454222312988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222400993, "dur": 22, "args": { "External id": 69136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69136, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69136, "pid": 5, "tid": 7, "ts": 1716454222400993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222312998, "dur": 8, "args": { "External id": 69136, "cbid": 211, "correlation": 69136 } }, { "ph": "s", "id": 69136, "pid": 76337, "tid": -914061504, "ts": 1716454222312998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222401017, "dur": 35, "args": { "External id": 69142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69142, "pid": 5, "tid": 7, "ts": 1716454222401017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313029, "dur": 9, "args": { "External id": 69142, "cbid": 211, "correlation": 69142 } }, { "ph": "s", "id": 69142, "pid": 76337, "tid": -914061504, "ts": 1716454222313029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222313088, "dur": 0, "args": { "External id": 69152, "cbid": 317, "correlation": 69152 } }, { "ph": "f", "id": 69152, "pid": 76337, "tid": -914061504, "ts": 1716454222313088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222313089, "dur": 0, "args": { "External id": 69153, "cbid": 203, "correlation": 69153 } }, { "ph": "f", "id": 69153, "pid": 76337, "tid": -914061504, "ts": 1716454222313089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222313090, "dur": 0, "args": { "External id": 69154, "cbid": 205, "correlation": 69154 } }, { "ph": "f", "id": 69154, "pid": 76337, "tid": -914061504, "ts": 1716454222313090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222401053, "dur": 30, "args": { "External id": 69158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69158, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69158, "pid": 5, "tid": 7, "ts": 1716454222401053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313111, "dur": 12, "args": { "External id": 69158, "cbid": 211, "correlation": 69158 } }, { "ph": "s", "id": 69158, "pid": 76337, "tid": -914061504, "ts": 1716454222313111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222401085, "dur": 167, "args": { "External id": 69160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69160, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69160, "pid": 5, "tid": 7, "ts": 1716454222401085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313134, "dur": 6, "args": { "External id": 69160, "cbid": 211, "correlation": 69160 } }, { "ph": "s", "id": 69160, "pid": 76337, "tid": -914061504, "ts": 1716454222313134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222401254, "dur": 24, "args": { "External id": 69162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69162, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69162, "pid": 5, "tid": 7, "ts": 1716454222401254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313145, "dur": 5, "args": { "External id": 69162, "cbid": 211, "correlation": 69162 } }, { "ph": "s", "id": 69162, "pid": 76337, "tid": -914061504, "ts": 1716454222313145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222401279, "dur": 34, "args": { "External id": 69168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69168, "pid": 5, "tid": 7, "ts": 1716454222401279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313170, "dur": 9, "args": { "External id": 69168, "cbid": 211, "correlation": 69168 } }, { "ph": "s", "id": 69168, "pid": 76337, "tid": -914061504, "ts": 1716454222313170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222401315, "dur": 27, "args": { "External id": 69176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69176, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69176, "pid": 5, "tid": 7, "ts": 1716454222401315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313200, "dur": 8, "args": { "External id": 69176, "cbid": 211, "correlation": 69176 } }, { "ph": "s", "id": 69176, "pid": 76337, "tid": -914061504, "ts": 1716454222313200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222401344, "dur": 20, "args": { "External id": 69184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69184, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69184, "pid": 5, "tid": 7, "ts": 1716454222401344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313228, "dur": 8, "args": { "External id": 69184, "cbid": 211, "correlation": 69184 } }, { "ph": "s", "id": 69184, "pid": 76337, "tid": -914061504, "ts": 1716454222313228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222401366, "dur": 33, "args": { "External id": 69204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69204, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 69204, "pid": 5, "tid": 7, "ts": 1716454222401366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313309, "dur": 15, "args": { "External id": 69204, "cbid": 211, "correlation": 69204 } }, { "ph": "s", "id": 69204, "pid": 76337, "tid": -914061504, "ts": 1716454222313309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222401400, "dur": 5, "args": { "External id": 69216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69216, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 69216, "pid": 5, "tid": 7, "ts": 1716454222401400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313335, "dur": 6, "args": { "External id": 69216, "cbid": 211, "correlation": 69216 } }, { "ph": "s", "id": 69216, "pid": 76337, "tid": -914061504, "ts": 1716454222313335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222401406, "dur": 33, "args": { "External id": 69219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69219, "pid": 5, "tid": 7, "ts": 1716454222401406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313352, "dur": 6, "args": { "External id": 69219, "cbid": 211, "correlation": 69219 } }, { "ph": "s", "id": 69219, "pid": 76337, "tid": -914061504, "ts": 1716454222313352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222313410, "dur": 0, "args": { "External id": 69230, "cbid": 317, "correlation": 69230 } }, { "ph": "f", "id": 69230, "pid": 76337, "tid": -914061504, "ts": 1716454222313410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222313410, "dur": 0, "args": { "External id": 69231, "cbid": 203, "correlation": 69231 } }, { "ph": "f", "id": 69231, "pid": 76337, "tid": -914061504, "ts": 1716454222313410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222313411, "dur": 0, "args": { "External id": 69232, "cbid": 205, "correlation": 69232 } }, { "ph": "f", "id": 69232, "pid": 76337, "tid": -914061504, "ts": 1716454222313411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222401441, "dur": 22, "args": { "External id": 69236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69236, "pid": 5, "tid": 7, "ts": 1716454222401441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313424, "dur": 12, "args": { "External id": 69236, "cbid": 211, "correlation": 69236 } }, { "ph": "s", "id": 69236, "pid": 76337, "tid": -914061504, "ts": 1716454222313424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222401464, "dur": 116, "args": { "External id": 69238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69238, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69238, "pid": 5, "tid": 7, "ts": 1716454222401464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313442, "dur": 6, "args": { "External id": 69238, "cbid": 211, "correlation": 69238 } }, { "ph": "s", "id": 69238, "pid": 76337, "tid": -914061504, "ts": 1716454222313442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222401581, "dur": 25, "args": { "External id": 69240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69240, "pid": 5, "tid": 7, "ts": 1716454222401581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313451, "dur": 5, "args": { "External id": 69240, "cbid": 211, "correlation": 69240 } }, { "ph": "s", "id": 69240, "pid": 76337, "tid": -914061504, "ts": 1716454222313451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222401608, "dur": 35, "args": { "External id": 69246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69246, "pid": 5, "tid": 7, "ts": 1716454222401608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313478, "dur": 8, "args": { "External id": 69246, "cbid": 211, "correlation": 69246 } }, { "ph": "s", "id": 69246, "pid": 76337, "tid": -914061504, "ts": 1716454222313478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222401644, "dur": 206, "args": { "External id": 69255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69255, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69255, "pid": 5, "tid": 7, "ts": 1716454222401644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313558, "dur": 14, "args": { "External id": 69255, "cbid": 211, "correlation": 69255 } }, { "ph": "s", "id": 69255, "pid": 76337, "tid": -914061504, "ts": 1716454222313558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222401851, "dur": 71, "args": { "External id": 69277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69277, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69277, "pid": 5, "tid": 7, "ts": 1716454222401851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313618, "dur": 11, "args": { "External id": 69277, "cbid": 211, "correlation": 69277 } }, { "ph": "s", "id": 69277, "pid": 76337, "tid": -914061504, "ts": 1716454222313618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222313707, "dur": 1, "args": { "External id": 69288, "cbid": 251, "correlation": 69288 } }, { "ph": "f", "id": 69288, "pid": 76337, "tid": -914061504, "ts": 1716454222313707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222401924, "dur": 164, "args": { "External id": 69289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69289, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69289, "pid": 5, "tid": 7, "ts": 1716454222401924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313712, "dur": 13, "args": { "External id": 69289, "cbid": 211, "correlation": 69289 } }, { "ph": "s", "id": 69289, "pid": 76337, "tid": -914061504, "ts": 1716454222313712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222313782, "dur": 1, "args": { "External id": 69300, "cbid": 251, "correlation": 69300 } }, { "ph": "f", "id": 69300, "pid": 76337, "tid": -914061504, "ts": 1716454222313782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222402089, "dur": 155, "args": { "External id": 69301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69301, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69301, "pid": 5, "tid": 7, "ts": 1716454222402089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313786, "dur": 11, "args": { "External id": 69301, "cbid": 211, "correlation": 69301 } }, { "ph": "s", "id": 69301, "pid": 76337, "tid": -914061504, "ts": 1716454222313786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222313850, "dur": 1, "args": { "External id": 69312, "cbid": 251, "correlation": 69312 } }, { "ph": "f", "id": 69312, "pid": 76337, "tid": -914061504, "ts": 1716454222313850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222402246, "dur": 154, "args": { "External id": 69313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69313, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69313, "pid": 5, "tid": 7, "ts": 1716454222402246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313854, "dur": 12, "args": { "External id": 69313, "cbid": 211, "correlation": 69313 } }, { "ph": "s", "id": 69313, "pid": 76337, "tid": -914061504, "ts": 1716454222313854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222402402, "dur": 2143, "args": { "External id": 69334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69334, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 69334, "pid": 5, "tid": 7, "ts": 1716454222402402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222313934, "dur": 12, "args": { "External id": 69334, "cbid": 211, "correlation": 69334 } }, { "ph": "s", "id": 69334, "pid": 76337, "tid": -914061504, "ts": 1716454222313934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314040, "dur": 4, "args": { "External id": 69352, "cbid": 251, "correlation": 69352 } }, { "ph": "f", "id": 69352, "pid": 76337, "tid": -914061504, "ts": 1716454222314040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222404546, "dur": 160, "args": { "External id": 69354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69354, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 69354, "pid": 5, "tid": 7, "ts": 1716454222404546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314049, "dur": 14, "args": { "External id": 69354, "cbid": 211, "correlation": 69354 } }, { "ph": "s", "id": 69354, "pid": 76337, "tid": -914061504, "ts": 1716454222314049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222404707, "dur": 35, "args": { "External id": 69362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69362, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69362, "pid": 5, "tid": 7, "ts": 1716454222404707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314119, "dur": 12, "args": { "External id": 69362, "cbid": 211, "correlation": 69362 } }, { "ph": "s", "id": 69362, "pid": 76337, "tid": -914061504, "ts": 1716454222314119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222404744, "dur": 51, "args": { "External id": 69370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69370, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69370, "pid": 5, "tid": 7, "ts": 1716454222404744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314159, "dur": 8, "args": { "External id": 69370, "cbid": 211, "correlation": 69370 } }, { "ph": "s", "id": 69370, "pid": 76337, "tid": -914061504, "ts": 1716454222314159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222404796, "dur": 31, "args": { "External id": 69381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69381, "pid": 5, "tid": 7, "ts": 1716454222404796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314230, "dur": 12, "args": { "External id": 69381, "cbid": 211, "correlation": 69381 } }, { "ph": "s", "id": 69381, "pid": 76337, "tid": -914061504, "ts": 1716454222314230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222404829, "dur": 37, "args": { "External id": 69403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69403, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69403, "pid": 5, "tid": 7, "ts": 1716454222404829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314260, "dur": 8, "args": { "External id": 69403, "cbid": 211, "correlation": 69403 } }, { "ph": "s", "id": 69403, "pid": 76337, "tid": -914061504, "ts": 1716454222314260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314344, "dur": 1, "args": { "External id": 69414, "cbid": 251, "correlation": 69414 } }, { "ph": "f", "id": 69414, "pid": 76337, "tid": -914061504, "ts": 1716454222314344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222404868, "dur": 96, "args": { "External id": 69415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69415, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69415, "pid": 5, "tid": 7, "ts": 1716454222404868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314350, "dur": 13, "args": { "External id": 69415, "cbid": 211, "correlation": 69415 } }, { "ph": "s", "id": 69415, "pid": 76337, "tid": -914061504, "ts": 1716454222314350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314417, "dur": 1, "args": { "External id": 69426, "cbid": 251, "correlation": 69426 } }, { "ph": "f", "id": 69426, "pid": 76337, "tid": -914061504, "ts": 1716454222314417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314420, "dur": 0, "args": { "External id": 69427, "cbid": 251, "correlation": 69427 } }, { "ph": "f", "id": 69427, "pid": 76337, "tid": -914061504, "ts": 1716454222314420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222404965, "dur": 12, "args": { "External id": 69428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69428, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 69428, "pid": 5, "tid": 7, "ts": 1716454222404965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314422, "dur": 15, "args": { "External id": 69428, "cbid": 211, "correlation": 69428 } }, { "ph": "s", "id": 69428, "pid": 76337, "tid": -914061504, "ts": 1716454222314422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222404979, "dur": 5, "args": { "External id": 69430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69430, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 69430, "pid": 5, "tid": 7, "ts": 1716454222404979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314439, "dur": 6, "args": { "External id": 69430, "cbid": 211, "correlation": 69430 } }, { "ph": "s", "id": 69430, "pid": 76337, "tid": -914061504, "ts": 1716454222314439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314496, "dur": 1, "args": { "External id": 69441, "cbid": 251, "correlation": 69441 } }, { "ph": "f", "id": 69441, "pid": 76337, "tid": -914061504, "ts": 1716454222314496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314500, "dur": 0, "args": { "External id": 69442, "cbid": 251, "correlation": 69442 } }, { "ph": "f", "id": 69442, "pid": 76337, "tid": -914061504, "ts": 1716454222314500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222404986, "dur": 8, "args": { "External id": 69443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69443, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 69443, "pid": 5, "tid": 7, "ts": 1716454222404986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314501, "dur": 12, "args": { "External id": 69443, "cbid": 211, "correlation": 69443 } }, { "ph": "s", "id": 69443, "pid": 76337, "tid": -914061504, "ts": 1716454222314501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222404995, "dur": 3, "args": { "External id": 69445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69445, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 69445, "pid": 5, "tid": 7, "ts": 1716454222404995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314515, "dur": 5, "args": { "External id": 69445, "cbid": 211, "correlation": 69445 } }, { "ph": "s", "id": 69445, "pid": 76337, "tid": -914061504, "ts": 1716454222314515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222405000, "dur": 100, "args": { "External id": 69466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69466, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 69466, "pid": 5, "tid": 7, "ts": 1716454222405000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314600, "dur": 13, "args": { "External id": 69466, "cbid": 211, "correlation": 69466 } }, { "ph": "s", "id": 69466, "pid": 76337, "tid": -914061504, "ts": 1716454222314600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314699, "dur": 1, "args": { "External id": 69484, "cbid": 251, "correlation": 69484 } }, { "ph": "f", "id": 69484, "pid": 76337, "tid": -914061504, "ts": 1716454222314699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222405101, "dur": 105, "args": { "External id": 69486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69486, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69486, "pid": 5, "tid": 7, "ts": 1716454222405101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314704, "dur": 13, "args": { "External id": 69486, "cbid": 211, "correlation": 69486 } }, { "ph": "s", "id": 69486, "pid": 76337, "tid": -914061504, "ts": 1716454222314704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222405208, "dur": 19, "args": { "External id": 69494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69494, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69494, "pid": 5, "tid": 7, "ts": 1716454222405208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314773, "dur": 12, "args": { "External id": 69494, "cbid": 211, "correlation": 69494 } }, { "ph": "s", "id": 69494, "pid": 76337, "tid": -914061504, "ts": 1716454222314773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222405228, "dur": 40, "args": { "External id": 69502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69502, "pid": 5, "tid": 7, "ts": 1716454222405228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314818, "dur": 10, "args": { "External id": 69502, "cbid": 211, "correlation": 69502 } }, { "ph": "s", "id": 69502, "pid": 76337, "tid": -914061504, "ts": 1716454222314818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222405270, "dur": 37, "args": { "External id": 69524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69524, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69524, "pid": 5, "tid": 7, "ts": 1716454222405270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314871, "dur": 10, "args": { "External id": 69524, "cbid": 211, "correlation": 69524 } }, { "ph": "s", "id": 69524, "pid": 76337, "tid": -914061504, "ts": 1716454222314871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314959, "dur": 1, "args": { "External id": 69540, "cbid": 251, "correlation": 69540 } }, { "ph": "f", "id": 69540, "pid": 76337, "tid": -914061504, "ts": 1716454222314959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222314964, "dur": 0, "args": { "External id": 69542, "cbid": 251, "correlation": 69542 } }, { "ph": "f", "id": 69542, "pid": 76337, "tid": -914061504, "ts": 1716454222314964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222405308, "dur": 587, "args": { "External id": 69543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69543, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 69543, "pid": 5, "tid": 7, "ts": 1716454222405308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222314968, "dur": 20, "args": { "External id": 69543, "cbid": 211, "correlation": 69543 } }, { "ph": "s", "id": 69543, "pid": 76337, "tid": -914061504, "ts": 1716454222314968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222405897, "dur": 135, "args": { "External id": 69551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69551, "pid": 5, "tid": 7, "ts": 1716454222405897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315041, "dur": 13, "args": { "External id": 69551, "cbid": 211, "correlation": 69551 } }, { "ph": "s", "id": 69551, "pid": 76337, "tid": -914061504, "ts": 1716454222315041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222406033, "dur": 134, "args": { "External id": 69559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69559, "pid": 5, "tid": 7, "ts": 1716454222406033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315070, "dur": 8, "args": { "External id": 69559, "cbid": 211, "correlation": 69559 } }, { "ph": "s", "id": 69559, "pid": 76337, "tid": -914061504, "ts": 1716454222315070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222315147, "dur": 1, "args": { "External id": 69575, "cbid": 251, "correlation": 69575 } }, { "ph": "f", "id": 69575, "pid": 76337, "tid": -914061504, "ts": 1716454222315147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222406169, "dur": 333, "args": { "External id": 69577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69577, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69577, "pid": 5, "tid": 7, "ts": 1716454222406169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315153, "dur": 16, "args": { "External id": 69577, "cbid": 211, "correlation": 69577 } }, { "ph": "s", "id": 69577, "pid": 76337, "tid": -914061504, "ts": 1716454222315153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222406504, "dur": 28, "args": { "External id": 69585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69585, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69585, "pid": 5, "tid": 7, "ts": 1716454222406504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315199, "dur": 10, "args": { "External id": 69585, "cbid": 211, "correlation": 69585 } }, { "ph": "s", "id": 69585, "pid": 76337, "tid": -914061504, "ts": 1716454222315199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222406533, "dur": 90, "args": { "External id": 69596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69596, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69596, "pid": 5, "tid": 7, "ts": 1716454222406533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315265, "dur": 13, "args": { "External id": 69596, "cbid": 211, "correlation": 69596 } }, { "ph": "s", "id": 69596, "pid": 76337, "tid": -914061504, "ts": 1716454222315265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222315330, "dur": 0, "args": { "External id": 69608, "cbid": 317, "correlation": 69608 } }, { "ph": "f", "id": 69608, "pid": 76337, "tid": -914061504, "ts": 1716454222315330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222315331, "dur": 0, "args": { "External id": 69609, "cbid": 203, "correlation": 69609 } }, { "ph": "f", "id": 69609, "pid": 76337, "tid": -914061504, "ts": 1716454222315331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222315332, "dur": 0, "args": { "External id": 69610, "cbid": 205, "correlation": 69610 } }, { "ph": "f", "id": 69610, "pid": 76337, "tid": -914061504, "ts": 1716454222315332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222406624, "dur": 26, "args": { "External id": 69614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69614, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69614, "pid": 5, "tid": 7, "ts": 1716454222406624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315347, "dur": 12, "args": { "External id": 69614, "cbid": 211, "correlation": 69614 } }, { "ph": "s", "id": 69614, "pid": 76337, "tid": -914061504, "ts": 1716454222315347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222406651, "dur": 131, "args": { "External id": 69616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69616, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69616, "pid": 5, "tid": 7, "ts": 1716454222406651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315366, "dur": 6, "args": { "External id": 69616, "cbid": 211, "correlation": 69616 } }, { "ph": "s", "id": 69616, "pid": 76337, "tid": -914061504, "ts": 1716454222315366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222406784, "dur": 25, "args": { "External id": 69618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69618, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69618, "pid": 5, "tid": 7, "ts": 1716454222406784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315376, "dur": 5, "args": { "External id": 69618, "cbid": 211, "correlation": 69618 } }, { "ph": "s", "id": 69618, "pid": 76337, "tid": -914061504, "ts": 1716454222315376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222406811, "dur": 35, "args": { "External id": 69624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69624, "pid": 5, "tid": 7, "ts": 1716454222406811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315404, "dur": 8, "args": { "External id": 69624, "cbid": 211, "correlation": 69624 } }, { "ph": "s", "id": 69624, "pid": 76337, "tid": -914061504, "ts": 1716454222315404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222406847, "dur": 28, "args": { "External id": 69632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69632, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69632, "pid": 5, "tid": 7, "ts": 1716454222406847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315435, "dur": 8, "args": { "External id": 69632, "cbid": 211, "correlation": 69632 } }, { "ph": "s", "id": 69632, "pid": 76337, "tid": -914061504, "ts": 1716454222315435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222406876, "dur": 108, "args": { "External id": 69643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69643, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69643, "pid": 5, "tid": 7, "ts": 1716454222406876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315497, "dur": 16, "args": { "External id": 69643, "cbid": 211, "correlation": 69643 } }, { "ph": "s", "id": 69643, "pid": 76337, "tid": -914061504, "ts": 1716454222315497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222315556, "dur": 0, "args": { "External id": 69653, "cbid": 317, "correlation": 69653 } }, { "ph": "f", "id": 69653, "pid": 76337, "tid": -914061504, "ts": 1716454222315556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222315557, "dur": 0, "args": { "External id": 69654, "cbid": 203, "correlation": 69654 } }, { "ph": "f", "id": 69654, "pid": 76337, "tid": -914061504, "ts": 1716454222315557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222315558, "dur": 0, "args": { "External id": 69655, "cbid": 205, "correlation": 69655 } }, { "ph": "f", "id": 69655, "pid": 76337, "tid": -914061504, "ts": 1716454222315558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222406986, "dur": 77, "args": { "External id": 69659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69659, "pid": 5, "tid": 7, "ts": 1716454222406986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315574, "dur": 11, "args": { "External id": 69659, "cbid": 211, "correlation": 69659 } }, { "ph": "s", "id": 69659, "pid": 76337, "tid": -914061504, "ts": 1716454222315574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222407065, "dur": 48, "args": { "External id": 69661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69661, "pid": 5, "tid": 7, "ts": 1716454222407065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315588, "dur": 5, "args": { "External id": 69661, "cbid": 211, "correlation": 69661 } }, { "ph": "s", "id": 69661, "pid": 76337, "tid": -914061504, "ts": 1716454222315588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222407114, "dur": 4, "args": { "External id": 69663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69663, "pid": 5, "tid": 7, "ts": 1716454222407114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315599, "dur": 6, "args": { "External id": 69663, "cbid": 211, "correlation": 69663 } }, { "ph": "s", "id": 69663, "pid": 76337, "tid": -914061504, "ts": 1716454222315599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222315608, "dur": 0, "args": { "External id": 69664, "cbid": 51, "correlation": 69664 } }, { "ph": "s", "id": 69664, "pid": 76337, "tid": -914061504, "ts": 1716454222315608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222407120, "dur": 2401, "args": { "External id": 69665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69665, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69665, "pid": 5, "tid": 7, "ts": 1716454222407120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315609, "dur": 5, "args": { "External id": 69665, "cbid": 211, "correlation": 69665 } }, { "ph": "s", "id": 69665, "pid": 76337, "tid": -914061504, "ts": 1716454222315609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222409523, "dur": 119, "args": { "External id": 69670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69670, "pid": 5, "tid": 7, "ts": 1716454222409523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315638, "dur": 9, "args": { "External id": 69670, "cbid": 211, "correlation": 69670 } }, { "ph": "s", "id": 69670, "pid": 76337, "tid": -914061504, "ts": 1716454222315638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222409643, "dur": 173, "args": { "External id": 69679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69679, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69679, "pid": 5, "tid": 7, "ts": 1716454222409643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315731, "dur": 14, "args": { "External id": 69679, "cbid": 211, "correlation": 69679 } }, { "ph": "s", "id": 69679, "pid": 76337, "tid": -914061504, "ts": 1716454222315731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222409818, "dur": 136, "args": { "External id": 69699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69699, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 69699, "pid": 5, "tid": 7, "ts": 1716454222409818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315801, "dur": 12, "args": { "External id": 69699, "cbid": 211, "correlation": 69699 } }, { "ph": "s", "id": 69699, "pid": 76337, "tid": -914061504, "ts": 1716454222315801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222409956, "dur": 5, "args": { "External id": 69711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69711, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 69711, "pid": 5, "tid": 7, "ts": 1716454222409956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315823, "dur": 9, "args": { "External id": 69711, "cbid": 211, "correlation": 69711 } }, { "ph": "s", "id": 69711, "pid": 76337, "tid": -914061504, "ts": 1716454222315823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222409962, "dur": 171, "args": { "External id": 69714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69714, "pid": 5, "tid": 7, "ts": 1716454222409962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315845, "dur": 7, "args": { "External id": 69714, "cbid": 211, "correlation": 69714 } }, { "ph": "s", "id": 69714, "pid": 76337, "tid": -914061504, "ts": 1716454222315845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222410134, "dur": 104, "args": { "External id": 69723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69723, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69723, "pid": 5, "tid": 7, "ts": 1716454222410134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315887, "dur": 9, "args": { "External id": 69723, "cbid": 211, "correlation": 69723 } }, { "ph": "s", "id": 69723, "pid": 76337, "tid": -914061504, "ts": 1716454222315887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222315939, "dur": 0, "args": { "External id": 69733, "cbid": 317, "correlation": 69733 } }, { "ph": "f", "id": 69733, "pid": 76337, "tid": -914061504, "ts": 1716454222315939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222315940, "dur": 0, "args": { "External id": 69734, "cbid": 203, "correlation": 69734 } }, { "ph": "f", "id": 69734, "pid": 76337, "tid": -914061504, "ts": 1716454222315940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222315941, "dur": 0, "args": { "External id": 69735, "cbid": 205, "correlation": 69735 } }, { "ph": "f", "id": 69735, "pid": 76337, "tid": -914061504, "ts": 1716454222315941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222410239, "dur": 115, "args": { "External id": 69739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69739, "pid": 5, "tid": 7, "ts": 1716454222410239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315956, "dur": 11, "args": { "External id": 69739, "cbid": 211, "correlation": 69739 } }, { "ph": "s", "id": 69739, "pid": 76337, "tid": -914061504, "ts": 1716454222315956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222410356, "dur": 36, "args": { "External id": 69741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69741, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69741, "pid": 5, "tid": 7, "ts": 1716454222410356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315970, "dur": 15, "args": { "External id": 69741, "cbid": 211, "correlation": 69741 } }, { "ph": "s", "id": 69741, "pid": 76337, "tid": -914061504, "ts": 1716454222315970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222410393, "dur": 4, "args": { "External id": 69743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69743, "pid": 5, "tid": 7, "ts": 1716454222410393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222315990, "dur": 6, "args": { "External id": 69743, "cbid": 211, "correlation": 69743 } }, { "ph": "s", "id": 69743, "pid": 76337, "tid": -914061504, "ts": 1716454222315990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222316000, "dur": 0, "args": { "External id": 69744, "cbid": 51, "correlation": 69744 } }, { "ph": "s", "id": 69744, "pid": 76337, "tid": -914061504, "ts": 1716454222316000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222410399, "dur": 2205, "args": { "External id": 69745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69745, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69745, "pid": 5, "tid": 7, "ts": 1716454222410399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316001, "dur": 5, "args": { "External id": 69745, "cbid": 211, "correlation": 69745 } }, { "ph": "s", "id": 69745, "pid": 76337, "tid": -914061504, "ts": 1716454222316001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222412606, "dur": 63, "args": { "External id": 69750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69750, "pid": 5, "tid": 7, "ts": 1716454222412606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316029, "dur": 9, "args": { "External id": 69750, "cbid": 211, "correlation": 69750 } }, { "ph": "s", "id": 69750, "pid": 76337, "tid": -914061504, "ts": 1716454222316029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222412670, "dur": 4, "args": { "External id": 69758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69758, "pid": 5, "tid": 7, "ts": 1716454222412670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316073, "dur": 9, "args": { "External id": 69758, "cbid": 211, "correlation": 69758 } }, { "ph": "s", "id": 69758, "pid": 76337, "tid": -914061504, "ts": 1716454222316073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316138, "dur": 1, "args": { "External id": 69774, "cbid": 251, "correlation": 69774 } }, { "ph": "f", "id": 69774, "pid": 76337, "tid": -914061504, "ts": 1716454222316138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316144, "dur": 0, "args": { "External id": 69776, "cbid": 251, "correlation": 69776 } }, { "ph": "f", "id": 69776, "pid": 76337, "tid": -914061504, "ts": 1716454222316144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222412675, "dur": 12, "args": { "External id": 69777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69777, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 69777, "pid": 5, "tid": 7, "ts": 1716454222412675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316146, "dur": 15, "args": { "External id": 69777, "cbid": 211, "correlation": 69777 } }, { "ph": "s", "id": 69777, "pid": 76337, "tid": -914061504, "ts": 1716454222316146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222412689, "dur": 5, "args": { "External id": 69779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69779, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 69779, "pid": 5, "tid": 7, "ts": 1716454222412689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316163, "dur": 6, "args": { "External id": 69779, "cbid": 211, "correlation": 69779 } }, { "ph": "s", "id": 69779, "pid": 76337, "tid": -914061504, "ts": 1716454222316163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222412695, "dur": 57, "args": { "External id": 69789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69789, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69789, "pid": 5, "tid": 7, "ts": 1716454222412695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316221, "dur": 12, "args": { "External id": 69789, "cbid": 211, "correlation": 69789 } }, { "ph": "s", "id": 69789, "pid": 76337, "tid": -914061504, "ts": 1716454222316221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222412754, "dur": 55, "args": { "External id": 69809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69809, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 69809, "pid": 5, "tid": 7, "ts": 1716454222412754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316287, "dur": 10, "args": { "External id": 69809, "cbid": 211, "correlation": 69809 } }, { "ph": "s", "id": 69809, "pid": 76337, "tid": -914061504, "ts": 1716454222316287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222412810, "dur": 4, "args": { "External id": 69821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69821, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69821, "pid": 5, "tid": 7, "ts": 1716454222412810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316307, "dur": 6, "args": { "External id": 69821, "cbid": 211, "correlation": 69821 } }, { "ph": "s", "id": 69821, "pid": 76337, "tid": -914061504, "ts": 1716454222316307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222412816, "dur": 59, "args": { "External id": 69824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69824, "pid": 5, "tid": 7, "ts": 1716454222412816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316326, "dur": 7, "args": { "External id": 69824, "cbid": 211, "correlation": 69824 } }, { "ph": "s", "id": 69824, "pid": 76337, "tid": -914061504, "ts": 1716454222316326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222412877, "dur": 38, "args": { "External id": 69833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69833, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69833, "pid": 5, "tid": 7, "ts": 1716454222412877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316366, "dur": 10, "args": { "External id": 69833, "cbid": 211, "correlation": 69833 } }, { "ph": "s", "id": 69833, "pid": 76337, "tid": -914061504, "ts": 1716454222316366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222316429, "dur": 0, "args": { "External id": 69843, "cbid": 317, "correlation": 69843 } }, { "ph": "f", "id": 69843, "pid": 76337, "tid": -914061504, "ts": 1716454222316429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222316430, "dur": 0, "args": { "External id": 69844, "cbid": 203, "correlation": 69844 } }, { "ph": "f", "id": 69844, "pid": 76337, "tid": -914061504, "ts": 1716454222316430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222316431, "dur": 0, "args": { "External id": 69845, "cbid": 205, "correlation": 69845 } }, { "ph": "f", "id": 69845, "pid": 76337, "tid": -914061504, "ts": 1716454222316431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222412916, "dur": 41, "args": { "External id": 69849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69849, "pid": 5, "tid": 7, "ts": 1716454222412916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316451, "dur": 12, "args": { "External id": 69849, "cbid": 211, "correlation": 69849 } }, { "ph": "s", "id": 69849, "pid": 76337, "tid": -914061504, "ts": 1716454222316451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222412958, "dur": 15, "args": { "External id": 69851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69851, "pid": 5, "tid": 7, "ts": 1716454222412958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316466, "dur": 5, "args": { "External id": 69851, "cbid": 211, "correlation": 69851 } }, { "ph": "s", "id": 69851, "pid": 76337, "tid": -914061504, "ts": 1716454222316466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222412975, "dur": 4, "args": { "External id": 69853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69853, "pid": 5, "tid": 7, "ts": 1716454222412975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316476, "dur": 5, "args": { "External id": 69853, "cbid": 211, "correlation": 69853 } }, { "ph": "s", "id": 69853, "pid": 76337, "tid": -914061504, "ts": 1716454222316476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222316484, "dur": 0, "args": { "External id": 69854, "cbid": 51, "correlation": 69854 } }, { "ph": "s", "id": 69854, "pid": 76337, "tid": -914061504, "ts": 1716454222316484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222412980, "dur": 765, "args": { "External id": 69855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69855, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69855, "pid": 5, "tid": 7, "ts": 1716454222412980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316485, "dur": 5, "args": { "External id": 69855, "cbid": 211, "correlation": 69855 } }, { "ph": "s", "id": 69855, "pid": 76337, "tid": -914061504, "ts": 1716454222316485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222413747, "dur": 63, "args": { "External id": 69860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69860, "pid": 5, "tid": 7, "ts": 1716454222413747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316514, "dur": 8, "args": { "External id": 69860, "cbid": 211, "correlation": 69860 } }, { "ph": "s", "id": 69860, "pid": 76337, "tid": -914061504, "ts": 1716454222316514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222316570, "dur": 0, "args": { "External id": 69870, "cbid": 317, "correlation": 69870 } }, { "ph": "f", "id": 69870, "pid": 76337, "tid": -914061504, "ts": 1716454222316570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222316571, "dur": 0, "args": { "External id": 69871, "cbid": 203, "correlation": 69871 } }, { "ph": "f", "id": 69871, "pid": 76337, "tid": -914061504, "ts": 1716454222316571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222316572, "dur": 0, "args": { "External id": 69872, "cbid": 205, "correlation": 69872 } }, { "ph": "f", "id": 69872, "pid": 76337, "tid": -914061504, "ts": 1716454222316572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222413811, "dur": 4, "args": { "External id": 69876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69876, "pid": 5, "tid": 7, "ts": 1716454222413811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316587, "dur": 11, "args": { "External id": 69876, "cbid": 211, "correlation": 69876 } }, { "ph": "s", "id": 69876, "pid": 76337, "tid": -914061504, "ts": 1716454222316587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222316603, "dur": 0, "args": { "External id": 69877, "cbid": 51, "correlation": 69877 } }, { "ph": "s", "id": 69877, "pid": 76337, "tid": -914061504, "ts": 1716454222316603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454222413816, "dur": 289, "args": { "External id": 69878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69878, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 69878, "pid": 5, "tid": 7, "ts": 1716454222413816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316604, "dur": 7, "args": { "External id": 69878, "cbid": 211, "correlation": 69878 } }, { "ph": "s", "id": 69878, "pid": 76337, "tid": -914061504, "ts": 1716454222316604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222414107, "dur": 63, "args": { "External id": 69883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69883, "pid": 5, "tid": 7, "ts": 1716454222414107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316631, "dur": 8, "args": { "External id": 69883, "cbid": 211, "correlation": 69883 } }, { "ph": "s", "id": 69883, "pid": 76337, "tid": -914061504, "ts": 1716454222316631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222414171, "dur": 50, "args": { "External id": 69891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69891, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69891, "pid": 5, "tid": 7, "ts": 1716454222414171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316659, "dur": 8, "args": { "External id": 69891, "cbid": 211, "correlation": 69891 } }, { "ph": "s", "id": 69891, "pid": 76337, "tid": -914061504, "ts": 1716454222316659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222414223, "dur": 36, "args": { "External id": 69899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69899, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69899, "pid": 5, "tid": 7, "ts": 1716454222414223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316688, "dur": 8, "args": { "External id": 69899, "cbid": 211, "correlation": 69899 } }, { "ph": "s", "id": 69899, "pid": 76337, "tid": -914061504, "ts": 1716454222316688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222414260, "dur": 55, "args": { "External id": 69919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69919, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 69919, "pid": 5, "tid": 7, "ts": 1716454222414260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316772, "dur": 12, "args": { "External id": 69919, "cbid": 211, "correlation": 69919 } }, { "ph": "s", "id": 69919, "pid": 76337, "tid": -914061504, "ts": 1716454222316772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222414316, "dur": 4, "args": { "External id": 69931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69931, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 69931, "pid": 5, "tid": 7, "ts": 1716454222414316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316795, "dur": 7, "args": { "External id": 69931, "cbid": 211, "correlation": 69931 } }, { "ph": "s", "id": 69931, "pid": 76337, "tid": -914061504, "ts": 1716454222316795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222414322, "dur": 58, "args": { "External id": 69934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69934, "pid": 5, "tid": 7, "ts": 1716454222414322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316812, "dur": 6, "args": { "External id": 69934, "cbid": 211, "correlation": 69934 } }, { "ph": "s", "id": 69934, "pid": 76337, "tid": -914061504, "ts": 1716454222316812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222316869, "dur": 0, "args": { "External id": 69945, "cbid": 317, "correlation": 69945 } }, { "ph": "f", "id": 69945, "pid": 76337, "tid": -914061504, "ts": 1716454222316869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222316870, "dur": 0, "args": { "External id": 69946, "cbid": 203, "correlation": 69946 } }, { "ph": "f", "id": 69946, "pid": 76337, "tid": -914061504, "ts": 1716454222316870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222316871, "dur": 0, "args": { "External id": 69947, "cbid": 205, "correlation": 69947 } }, { "ph": "f", "id": 69947, "pid": 76337, "tid": -914061504, "ts": 1716454222316871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316901, "dur": 2, "args": { "External id": 69951, "cbid": 251, "correlation": 69951 } }, { "ph": "f", "id": 69951, "pid": 76337, "tid": -914061504, "ts": 1716454222316901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316904, "dur": 1, "args": { "External id": 69952, "cbid": 251, "correlation": 69952 } }, { "ph": "f", "id": 69952, "pid": 76337, "tid": -914061504, "ts": 1716454222316904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316906, "dur": 1, "args": { "External id": 69953, "cbid": 251, "correlation": 69953 } }, { "ph": "f", "id": 69953, "pid": 76337, "tid": -914061504, "ts": 1716454222316906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316908, "dur": 1, "args": { "External id": 69954, "cbid": 251, "correlation": 69954 } }, { "ph": "f", "id": 69954, "pid": 76337, "tid": -914061504, "ts": 1716454222316908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316909, "dur": 1, "args": { "External id": 69955, "cbid": 251, "correlation": 69955 } }, { "ph": "f", "id": 69955, "pid": 76337, "tid": -914061504, "ts": 1716454222316909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316911, "dur": 1, "args": { "External id": 69956, "cbid": 251, "correlation": 69956 } }, { "ph": "f", "id": 69956, "pid": 76337, "tid": -914061504, "ts": 1716454222316911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316912, "dur": 0, "args": { "External id": 69957, "cbid": 251, "correlation": 69957 } }, { "ph": "f", "id": 69957, "pid": 76337, "tid": -914061504, "ts": 1716454222316912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316914, "dur": 1, "args": { "External id": 69958, "cbid": 251, "correlation": 69958 } }, { "ph": "f", "id": 69958, "pid": 76337, "tid": -914061504, "ts": 1716454222316914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222316916, "dur": 0, "args": { "External id": 69959, "cbid": 251, "correlation": 69959 } }, { "ph": "f", "id": 69959, "pid": 76337, "tid": -914061504, "ts": 1716454222316916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222414381, "dur": 123, "args": { "External id": 69960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69960, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 69960, "pid": 5, "tid": 7, "ts": 1716454222414381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316919, "dur": 12, "args": { "External id": 69960, "cbid": 211, "correlation": 69960 } }, { "ph": "s", "id": 69960, "pid": 76337, "tid": -914061504, "ts": 1716454222316919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222414506, "dur": 63, "args": { "External id": 69966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69966, "pid": 5, "tid": 7, "ts": 1716454222414506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222316954, "dur": 9, "args": { "External id": 69966, "cbid": 211, "correlation": 69966 } }, { "ph": "s", "id": 69966, "pid": 76337, "tid": -914061504, "ts": 1716454222316954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222414570, "dur": 628, "args": { "External id": 69975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69975, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69975, "pid": 5, "tid": 7, "ts": 1716454222414570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222317048, "dur": 14, "args": { "External id": 69975, "cbid": 211, "correlation": 69975 } }, { "ph": "s", "id": 69975, "pid": 76337, "tid": -914061504, "ts": 1716454222317048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222415199, "dur": 199, "args": { "External id": 69997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 69997, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 69997, "pid": 5, "tid": 7, "ts": 1716454222415199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222317109, "dur": 11, "args": { "External id": 69997, "cbid": 211, "correlation": 69997 } }, { "ph": "s", "id": 69997, "pid": 76337, "tid": -914061504, "ts": 1716454222317109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222317200, "dur": 1, "args": { "External id": 70008, "cbid": 251, "correlation": 70008 } }, { "ph": "f", "id": 70008, "pid": 76337, "tid": -914061504, "ts": 1716454222317200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222415400, "dur": 211, "args": { "External id": 70009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70009, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70009, "pid": 5, "tid": 7, "ts": 1716454222415400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222317205, "dur": 13, "args": { "External id": 70009, "cbid": 211, "correlation": 70009 } }, { "ph": "s", "id": 70009, "pid": 76337, "tid": -914061504, "ts": 1716454222317205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222317274, "dur": 1, "args": { "External id": 70020, "cbid": 251, "correlation": 70020 } }, { "ph": "f", "id": 70020, "pid": 76337, "tid": -914061504, "ts": 1716454222317274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222415612, "dur": 203, "args": { "External id": 70021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70021, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70021, "pid": 5, "tid": 7, "ts": 1716454222415612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222317278, "dur": 11, "args": { "External id": 70021, "cbid": 211, "correlation": 70021 } }, { "ph": "s", "id": 70021, "pid": 76337, "tid": -914061504, "ts": 1716454222317278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222317340, "dur": 1, "args": { "External id": 70032, "cbid": 251, "correlation": 70032 } }, { "ph": "f", "id": 70032, "pid": 76337, "tid": -914061504, "ts": 1716454222317340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222415816, "dur": 201, "args": { "External id": 70033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70033, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70033, "pid": 5, "tid": 7, "ts": 1716454222415816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222317344, "dur": 1608, "args": { "External id": 70033, "cbid": 211, "correlation": 70033 } }, { "ph": "s", "id": 70033, "pid": 76337, "tid": -914061504, "ts": 1716454222317344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222416019, "dur": 20503, "args": { "External id": 70054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70054, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70054, "pid": 5, "tid": 7, "ts": 1716454222416019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319028, "dur": 143, "args": { "External id": 70054, "cbid": 211, "correlation": 70054 } }, { "ph": "s", "id": 70054, "pid": 76337, "tid": -914061504, "ts": 1716454222319028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319261, "dur": 1, "args": { "External id": 70072, "cbid": 251, "correlation": 70072 } }, { "ph": "f", "id": 70072, "pid": 76337, "tid": -914061504, "ts": 1716454222319261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222436524, "dur": 217, "args": { "External id": 70074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70074, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70074, "pid": 5, "tid": 7, "ts": 1716454222436524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319267, "dur": 14, "args": { "External id": 70074, "cbid": 211, "correlation": 70074 } }, { "ph": "s", "id": 70074, "pid": 76337, "tid": -914061504, "ts": 1716454222319267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222436742, "dur": 67, "args": { "External id": 70082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70082, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70082, "pid": 5, "tid": 7, "ts": 1716454222436742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319337, "dur": 12, "args": { "External id": 70082, "cbid": 211, "correlation": 70082 } }, { "ph": "s", "id": 70082, "pid": 76337, "tid": -914061504, "ts": 1716454222319337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222436810, "dur": 97, "args": { "External id": 70090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70090, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70090, "pid": 5, "tid": 7, "ts": 1716454222436810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319376, "dur": 18, "args": { "External id": 70090, "cbid": 211, "correlation": 70090 } }, { "ph": "s", "id": 70090, "pid": 76337, "tid": -914061504, "ts": 1716454222319376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222436909, "dur": 57, "args": { "External id": 70101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70101, "pid": 5, "tid": 7, "ts": 1716454222436909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319458, "dur": 40, "args": { "External id": 70101, "cbid": 211, "correlation": 70101 } }, { "ph": "s", "id": 70101, "pid": 76337, "tid": -914061504, "ts": 1716454222319458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222436967, "dur": 101, "args": { "External id": 70123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70123, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70123, "pid": 5, "tid": 7, "ts": 1716454222436967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319517, "dur": 91, "args": { "External id": 70123, "cbid": 211, "correlation": 70123 } }, { "ph": "s", "id": 70123, "pid": 76337, "tid": -914061504, "ts": 1716454222319517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319684, "dur": 1, "args": { "External id": 70134, "cbid": 251, "correlation": 70134 } }, { "ph": "f", "id": 70134, "pid": 76337, "tid": -914061504, "ts": 1716454222319684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222437069, "dur": 115, "args": { "External id": 70135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70135, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70135, "pid": 5, "tid": 7, "ts": 1716454222437069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319689, "dur": 13, "args": { "External id": 70135, "cbid": 211, "correlation": 70135 } }, { "ph": "s", "id": 70135, "pid": 76337, "tid": -914061504, "ts": 1716454222319689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319766, "dur": 1, "args": { "External id": 70146, "cbid": 251, "correlation": 70146 } }, { "ph": "f", "id": 70146, "pid": 76337, "tid": -914061504, "ts": 1716454222319766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319770, "dur": 0, "args": { "External id": 70147, "cbid": 251, "correlation": 70147 } }, { "ph": "f", "id": 70147, "pid": 76337, "tid": -914061504, "ts": 1716454222319770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222437185, "dur": 11, "args": { "External id": 70148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70148, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 70148, "pid": 5, "tid": 7, "ts": 1716454222437185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319771, "dur": 14, "args": { "External id": 70148, "cbid": 211, "correlation": 70148 } }, { "ph": "s", "id": 70148, "pid": 76337, "tid": -914061504, "ts": 1716454222319771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222437198, "dur": 6, "args": { "External id": 70150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70150, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70150, "pid": 5, "tid": 7, "ts": 1716454222437198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319789, "dur": 8, "args": { "External id": 70150, "cbid": 211, "correlation": 70150 } }, { "ph": "s", "id": 70150, "pid": 76337, "tid": -914061504, "ts": 1716454222319789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319852, "dur": 1, "args": { "External id": 70161, "cbid": 251, "correlation": 70161 } }, { "ph": "f", "id": 70161, "pid": 76337, "tid": -914061504, "ts": 1716454222319852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222319855, "dur": 0, "args": { "External id": 70162, "cbid": 251, "correlation": 70162 } }, { "ph": "f", "id": 70162, "pid": 76337, "tid": -914061504, "ts": 1716454222319855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222437205, "dur": 6, "args": { "External id": 70163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70163, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 70163, "pid": 5, "tid": 7, "ts": 1716454222437205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319857, "dur": 11, "args": { "External id": 70163, "cbid": 211, "correlation": 70163 } }, { "ph": "s", "id": 70163, "pid": 76337, "tid": -914061504, "ts": 1716454222319857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222437213, "dur": 4, "args": { "External id": 70165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70165, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70165, "pid": 5, "tid": 7, "ts": 1716454222437213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319870, "dur": 6, "args": { "External id": 70165, "cbid": 211, "correlation": 70165 } }, { "ph": "s", "id": 70165, "pid": 76337, "tid": -914061504, "ts": 1716454222319870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222437218, "dur": 170, "args": { "External id": 70186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70186, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70186, "pid": 5, "tid": 7, "ts": 1716454222437218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222319957, "dur": 14, "args": { "External id": 70186, "cbid": 211, "correlation": 70186 } }, { "ph": "s", "id": 70186, "pid": 76337, "tid": -914061504, "ts": 1716454222319957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222320066, "dur": 2, "args": { "External id": 70204, "cbid": 251, "correlation": 70204 } }, { "ph": "f", "id": 70204, "pid": 76337, "tid": -914061504, "ts": 1716454222320066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222437389, "dur": 114, "args": { "External id": 70206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70206, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70206, "pid": 5, "tid": 7, "ts": 1716454222437389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222320072, "dur": 14, "args": { "External id": 70206, "cbid": 211, "correlation": 70206 } }, { "ph": "s", "id": 70206, "pid": 76337, "tid": -914061504, "ts": 1716454222320072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222437505, "dur": 35, "args": { "External id": 70214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70214, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70214, "pid": 5, "tid": 7, "ts": 1716454222437505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222320143, "dur": 15, "args": { "External id": 70214, "cbid": 211, "correlation": 70214 } }, { "ph": "s", "id": 70214, "pid": 76337, "tid": -914061504, "ts": 1716454222320143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222437541, "dur": 69, "args": { "External id": 70222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70222, "pid": 5, "tid": 7, "ts": 1716454222437541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222320189, "dur": 10, "args": { "External id": 70222, "cbid": 211, "correlation": 70222 } }, { "ph": "s", "id": 70222, "pid": 76337, "tid": -914061504, "ts": 1716454222320189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222437612, "dur": 101, "args": { "External id": 70244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70244, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70244, "pid": 5, "tid": 7, "ts": 1716454222437612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222320240, "dur": 530, "args": { "External id": 70244, "cbid": 211, "correlation": 70244 } }, { "ph": "s", "id": 70244, "pid": 76337, "tid": -914061504, "ts": 1716454222320240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222320849, "dur": 1, "args": { "External id": 70260, "cbid": 251, "correlation": 70260 } }, { "ph": "f", "id": 70260, "pid": 76337, "tid": -914061504, "ts": 1716454222320849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222437715, "dur": 627, "args": { "External id": 70262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70262, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70262, "pid": 5, "tid": 7, "ts": 1716454222437715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222320855, "dur": 181, "args": { "External id": 70262, "cbid": 211, "correlation": 70262 } }, { "ph": "s", "id": 70262, "pid": 76337, "tid": -914061504, "ts": 1716454222320855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222438344, "dur": 260, "args": { "External id": 70270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70270, "pid": 5, "tid": 7, "ts": 1716454222438344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222321091, "dur": 205, "args": { "External id": 70270, "cbid": 211, "correlation": 70270 } }, { "ph": "s", "id": 70270, "pid": 76337, "tid": -914061504, "ts": 1716454222321091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222438605, "dur": 261, "args": { "External id": 70278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70278, "pid": 5, "tid": 7, "ts": 1716454222438605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222321316, "dur": 358, "args": { "External id": 70278, "cbid": 211, "correlation": 70278 } }, { "ph": "s", "id": 70278, "pid": 76337, "tid": -914061504, "ts": 1716454222321316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222321753, "dur": 1, "args": { "External id": 70294, "cbid": 251, "correlation": 70294 } }, { "ph": "f", "id": 70294, "pid": 76337, "tid": -914061504, "ts": 1716454222321753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222321758, "dur": 0, "args": { "External id": 70296, "cbid": 251, "correlation": 70296 } }, { "ph": "f", "id": 70296, "pid": 76337, "tid": -914061504, "ts": 1716454222321758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222438867, "dur": 374, "args": { "External id": 70297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70297, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 70297, "pid": 5, "tid": 7, "ts": 1716454222438867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222321761, "dur": 13, "args": { "External id": 70297, "cbid": 211, "correlation": 70297 } }, { "ph": "s", "id": 70297, "pid": 76337, "tid": -914061504, "ts": 1716454222321761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222439243, "dur": 51, "args": { "External id": 70305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70305, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70305, "pid": 5, "tid": 7, "ts": 1716454222439243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222321804, "dur": 96, "args": { "External id": 70305, "cbid": 211, "correlation": 70305 } }, { "ph": "s", "id": 70305, "pid": 76337, "tid": -914061504, "ts": 1716454222321804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222439295, "dur": 173, "args": { "External id": 70316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70316, "pid": 5, "tid": 7, "ts": 1716454222439295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222321958, "dur": 67, "args": { "External id": 70316, "cbid": 211, "correlation": 70316 } }, { "ph": "s", "id": 70316, "pid": 76337, "tid": -914061504, "ts": 1716454222321958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222322079, "dur": 0, "args": { "External id": 70328, "cbid": 317, "correlation": 70328 } }, { "ph": "f", "id": 70328, "pid": 76337, "tid": -914061504, "ts": 1716454222322079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222322080, "dur": 0, "args": { "External id": 70329, "cbid": 203, "correlation": 70329 } }, { "ph": "f", "id": 70329, "pid": 76337, "tid": -914061504, "ts": 1716454222322080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222322081, "dur": 0, "args": { "External id": 70330, "cbid": 205, "correlation": 70330 } }, { "ph": "f", "id": 70330, "pid": 76337, "tid": -914061504, "ts": 1716454222322081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322105, "dur": 1, "args": { "External id": 70334, "cbid": 251, "correlation": 70334 } }, { "ph": "f", "id": 70334, "pid": 76337, "tid": -914061504, "ts": 1716454222322105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322107, "dur": 0, "args": { "External id": 70335, "cbid": 251, "correlation": 70335 } }, { "ph": "f", "id": 70335, "pid": 76337, "tid": -914061504, "ts": 1716454222322107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322107, "dur": 0, "args": { "External id": 70336, "cbid": 251, "correlation": 70336 } }, { "ph": "f", "id": 70336, "pid": 76337, "tid": -914061504, "ts": 1716454222322107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322109, "dur": 0, "args": { "External id": 70337, "cbid": 251, "correlation": 70337 } }, { "ph": "f", "id": 70337, "pid": 76337, "tid": -914061504, "ts": 1716454222322109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322109, "dur": 1, "args": { "External id": 70338, "cbid": 251, "correlation": 70338 } }, { "ph": "f", "id": 70338, "pid": 76337, "tid": -914061504, "ts": 1716454222322109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322111, "dur": 0, "args": { "External id": 70339, "cbid": 251, "correlation": 70339 } }, { "ph": "f", "id": 70339, "pid": 76337, "tid": -914061504, "ts": 1716454222322111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322111, "dur": 0, "args": { "External id": 70340, "cbid": 251, "correlation": 70340 } }, { "ph": "f", "id": 70340, "pid": 76337, "tid": -914061504, "ts": 1716454222322111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322112, "dur": 0, "args": { "External id": 70341, "cbid": 251, "correlation": 70341 } }, { "ph": "f", "id": 70341, "pid": 76337, "tid": -914061504, "ts": 1716454222322112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322114, "dur": 0, "args": { "External id": 70342, "cbid": 251, "correlation": 70342 } }, { "ph": "f", "id": 70342, "pid": 76337, "tid": -914061504, "ts": 1716454222322114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222439470, "dur": 125, "args": { "External id": 70343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70343, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70343, "pid": 5, "tid": 7, "ts": 1716454222439470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322116, "dur": 13, "args": { "External id": 70343, "cbid": 211, "correlation": 70343 } }, { "ph": "s", "id": 70343, "pid": 76337, "tid": -914061504, "ts": 1716454222322116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222439596, "dur": 63, "args": { "External id": 70349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70349, "pid": 5, "tid": 7, "ts": 1716454222439596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322152, "dur": 9, "args": { "External id": 70349, "cbid": 211, "correlation": 70349 } }, { "ph": "s", "id": 70349, "pid": 76337, "tid": -914061504, "ts": 1716454222322152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222439661, "dur": 50, "args": { "External id": 70357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70357, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70357, "pid": 5, "tid": 7, "ts": 1716454222439661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322185, "dur": 9, "args": { "External id": 70357, "cbid": 211, "correlation": 70357 } }, { "ph": "s", "id": 70357, "pid": 76337, "tid": -914061504, "ts": 1716454222322185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222439713, "dur": 105, "args": { "External id": 70366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70366, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70366, "pid": 5, "tid": 7, "ts": 1716454222439713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322224, "dur": 13, "args": { "External id": 70366, "cbid": 211, "correlation": 70366 } }, { "ph": "s", "id": 70366, "pid": 76337, "tid": -914061504, "ts": 1716454222322224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222439819, "dur": 100, "args": { "External id": 70386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70386, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 70386, "pid": 5, "tid": 7, "ts": 1716454222439819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322300, "dur": 12, "args": { "External id": 70386, "cbid": 211, "correlation": 70386 } }, { "ph": "s", "id": 70386, "pid": 76337, "tid": -914061504, "ts": 1716454222322300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222439920, "dur": 5, "args": { "External id": 70398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70398, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70398, "pid": 5, "tid": 7, "ts": 1716454222439920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322323, "dur": 278, "args": { "External id": 70398, "cbid": 211, "correlation": 70398 } }, { "ph": "s", "id": 70398, "pid": 76337, "tid": -914061504, "ts": 1716454222322323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222439927, "dur": 114, "args": { "External id": 70401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70401, "pid": 5, "tid": 7, "ts": 1716454222439927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322613, "dur": 7, "args": { "External id": 70401, "cbid": 211, "correlation": 70401 } }, { "ph": "s", "id": 70401, "pid": 76337, "tid": -914061504, "ts": 1716454222322613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222440042, "dur": 71, "args": { "External id": 70410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70410, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70410, "pid": 5, "tid": 7, "ts": 1716454222440042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322653, "dur": 11, "args": { "External id": 70410, "cbid": 211, "correlation": 70410 } }, { "ph": "s", "id": 70410, "pid": 76337, "tid": -914061504, "ts": 1716454222322653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222322706, "dur": 0, "args": { "External id": 70420, "cbid": 317, "correlation": 70420 } }, { "ph": "f", "id": 70420, "pid": 76337, "tid": -914061504, "ts": 1716454222322706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222322707, "dur": 0, "args": { "External id": 70421, "cbid": 203, "correlation": 70421 } }, { "ph": "f", "id": 70421, "pid": 76337, "tid": -914061504, "ts": 1716454222322707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222322707, "dur": 0, "args": { "External id": 70422, "cbid": 205, "correlation": 70422 } }, { "ph": "f", "id": 70422, "pid": 76337, "tid": -914061504, "ts": 1716454222322707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222440114, "dur": 78, "args": { "External id": 70426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70426, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70426, "pid": 5, "tid": 7, "ts": 1716454222440114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322724, "dur": 12, "args": { "External id": 70426, "cbid": 211, "correlation": 70426 } }, { "ph": "s", "id": 70426, "pid": 76337, "tid": -914061504, "ts": 1716454222322724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222440194, "dur": 27, "args": { "External id": 70428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70428, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70428, "pid": 5, "tid": 7, "ts": 1716454222440194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322738, "dur": 5, "args": { "External id": 70428, "cbid": 211, "correlation": 70428 } }, { "ph": "s", "id": 70428, "pid": 76337, "tid": -914061504, "ts": 1716454222322738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222440222, "dur": 4, "args": { "External id": 70430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 70430, "pid": 5, "tid": 7, "ts": 1716454222440222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322748, "dur": 6, "args": { "External id": 70430, "cbid": 211, "correlation": 70430 } }, { "ph": "s", "id": 70430, "pid": 76337, "tid": -914061504, "ts": 1716454222322748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222322758, "dur": 0, "args": { "External id": 70431, "cbid": 51, "correlation": 70431 } }, { "ph": "s", "id": 70431, "pid": 76337, "tid": -914061504, "ts": 1716454222322758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222440227, "dur": 1493, "args": { "External id": 70432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70432, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70432, "pid": 5, "tid": 7, "ts": 1716454222440227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322758, "dur": 8, "args": { "External id": 70432, "cbid": 211, "correlation": 70432 } }, { "ph": "s", "id": 70432, "pid": 76337, "tid": -914061504, "ts": 1716454222322758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222441722, "dur": 63, "args": { "External id": 70437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70437, "pid": 5, "tid": 7, "ts": 1716454222441722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322789, "dur": 9, "args": { "External id": 70437, "cbid": 211, "correlation": 70437 } }, { "ph": "s", "id": 70437, "pid": 76337, "tid": -914061504, "ts": 1716454222322789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222441786, "dur": 4, "args": { "External id": 70445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70445, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 70445, "pid": 5, "tid": 7, "ts": 1716454222441786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322833, "dur": 10, "args": { "External id": 70445, "cbid": 211, "correlation": 70445 } }, { "ph": "s", "id": 70445, "pid": 76337, "tid": -914061504, "ts": 1716454222322833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322898, "dur": 1, "args": { "External id": 70461, "cbid": 251, "correlation": 70461 } }, { "ph": "f", "id": 70461, "pid": 76337, "tid": -914061504, "ts": 1716454222322898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222322904, "dur": 0, "args": { "External id": 70463, "cbid": 251, "correlation": 70463 } }, { "ph": "f", "id": 70463, "pid": 76337, "tid": -914061504, "ts": 1716454222322904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222441791, "dur": 12, "args": { "External id": 70464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70464, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 70464, "pid": 5, "tid": 7, "ts": 1716454222441791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322905, "dur": 11, "args": { "External id": 70464, "cbid": 211, "correlation": 70464 } }, { "ph": "s", "id": 70464, "pid": 76337, "tid": -914061504, "ts": 1716454222322905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222441804, "dur": 5, "args": { "External id": 70466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70466, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70466, "pid": 5, "tid": 7, "ts": 1716454222441804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222322919, "dur": 213, "args": { "External id": 70466, "cbid": 211, "correlation": 70466 } }, { "ph": "s", "id": 70466, "pid": 76337, "tid": -914061504, "ts": 1716454222322919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222441811, "dur": 57, "args": { "External id": 70476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70476, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70476, "pid": 5, "tid": 7, "ts": 1716454222441811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323186, "dur": 14, "args": { "External id": 70476, "cbid": 211, "correlation": 70476 } }, { "ph": "s", "id": 70476, "pid": 76337, "tid": -914061504, "ts": 1716454222323186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222441870, "dur": 54, "args": { "External id": 70496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70496, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 70496, "pid": 5, "tid": 7, "ts": 1716454222441870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323254, "dur": 11, "args": { "External id": 70496, "cbid": 211, "correlation": 70496 } }, { "ph": "s", "id": 70496, "pid": 76337, "tid": -914061504, "ts": 1716454222323254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222441926, "dur": 4, "args": { "External id": 70508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70508, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 70508, "pid": 5, "tid": 7, "ts": 1716454222441926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323277, "dur": 7, "args": { "External id": 70508, "cbid": 211, "correlation": 70508 } }, { "ph": "s", "id": 70508, "pid": 76337, "tid": -914061504, "ts": 1716454222323277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222441931, "dur": 58, "args": { "External id": 70511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70511, "pid": 5, "tid": 7, "ts": 1716454222441931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323297, "dur": 7, "args": { "External id": 70511, "cbid": 211, "correlation": 70511 } }, { "ph": "s", "id": 70511, "pid": 76337, "tid": -914061504, "ts": 1716454222323297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222441990, "dur": 37, "args": { "External id": 70520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70520, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70520, "pid": 5, "tid": 7, "ts": 1716454222441990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323338, "dur": 10, "args": { "External id": 70520, "cbid": 211, "correlation": 70520 } }, { "ph": "s", "id": 70520, "pid": 76337, "tid": -914061504, "ts": 1716454222323338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222323402, "dur": 0, "args": { "External id": 70530, "cbid": 317, "correlation": 70530 } }, { "ph": "f", "id": 70530, "pid": 76337, "tid": -914061504, "ts": 1716454222323402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222323402, "dur": 0, "args": { "External id": 70531, "cbid": 203, "correlation": 70531 } }, { "ph": "f", "id": 70531, "pid": 76337, "tid": -914061504, "ts": 1716454222323402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222323403, "dur": 0, "args": { "External id": 70532, "cbid": 205, "correlation": 70532 } }, { "ph": "f", "id": 70532, "pid": 76337, "tid": -914061504, "ts": 1716454222323403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222442029, "dur": 40, "args": { "External id": 70536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70536, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70536, "pid": 5, "tid": 7, "ts": 1716454222442029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323418, "dur": 12, "args": { "External id": 70536, "cbid": 211, "correlation": 70536 } }, { "ph": "s", "id": 70536, "pid": 76337, "tid": -914061504, "ts": 1716454222323418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222442071, "dur": 16, "args": { "External id": 70538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70538, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70538, "pid": 5, "tid": 7, "ts": 1716454222442071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323433, "dur": 6, "args": { "External id": 70538, "cbid": 211, "correlation": 70538 } }, { "ph": "s", "id": 70538, "pid": 76337, "tid": -914061504, "ts": 1716454222323433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222442088, "dur": 3, "args": { "External id": 70540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 70540, "pid": 5, "tid": 7, "ts": 1716454222442088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323443, "dur": 6, "args": { "External id": 70540, "cbid": 211, "correlation": 70540 } }, { "ph": "s", "id": 70540, "pid": 76337, "tid": -914061504, "ts": 1716454222323443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222323451, "dur": 0, "args": { "External id": 70541, "cbid": 51, "correlation": 70541 } }, { "ph": "s", "id": 70541, "pid": 76337, "tid": -914061504, "ts": 1716454222323451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222442093, "dur": 765, "args": { "External id": 70542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70542, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70542, "pid": 5, "tid": 7, "ts": 1716454222442093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323452, "dur": 5, "args": { "External id": 70542, "cbid": 211, "correlation": 70542 } }, { "ph": "s", "id": 70542, "pid": 76337, "tid": -914061504, "ts": 1716454222323452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222442859, "dur": 62, "args": { "External id": 70547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70547, "pid": 5, "tid": 7, "ts": 1716454222442859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323480, "dur": 8, "args": { "External id": 70547, "cbid": 211, "correlation": 70547 } }, { "ph": "s", "id": 70547, "pid": 76337, "tid": -914061504, "ts": 1716454222323480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222323536, "dur": 0, "args": { "External id": 70557, "cbid": 317, "correlation": 70557 } }, { "ph": "f", "id": 70557, "pid": 76337, "tid": -914061504, "ts": 1716454222323536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222323537, "dur": 0, "args": { "External id": 70558, "cbid": 203, "correlation": 70558 } }, { "ph": "f", "id": 70558, "pid": 76337, "tid": -914061504, "ts": 1716454222323537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222323537, "dur": 0, "args": { "External id": 70559, "cbid": 205, "correlation": 70559 } }, { "ph": "f", "id": 70559, "pid": 76337, "tid": -914061504, "ts": 1716454222323537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222442923, "dur": 75, "args": { "External id": 70563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70563, "pid": 5, "tid": 7, "ts": 1716454222442923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323551, "dur": 12, "args": { "External id": 70563, "cbid": 211, "correlation": 70563 } }, { "ph": "s", "id": 70563, "pid": 76337, "tid": -914061504, "ts": 1716454222323551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222442999, "dur": 222, "args": { "External id": 70565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70565, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70565, "pid": 5, "tid": 7, "ts": 1716454222442999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323570, "dur": 11, "args": { "External id": 70565, "cbid": 211, "correlation": 70565 } }, { "ph": "s", "id": 70565, "pid": 76337, "tid": -914061504, "ts": 1716454222323570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222443223, "dur": 41, "args": { "External id": 70567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70567, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70567, "pid": 5, "tid": 7, "ts": 1716454222443223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222323586, "dur": 553, "args": { "External id": 70567, "cbid": 211, "correlation": 70567 } }, { "ph": "s", "id": 70567, "pid": 76337, "tid": -914061504, "ts": 1716454222323586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222443265, "dur": 63, "args": { "External id": 70573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70573, "pid": 5, "tid": 7, "ts": 1716454222443265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324161, "dur": 9, "args": { "External id": 70573, "cbid": 211, "correlation": 70573 } }, { "ph": "s", "id": 70573, "pid": 76337, "tid": -914061504, "ts": 1716454222324161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222443330, "dur": 50, "args": { "External id": 70581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70581, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70581, "pid": 5, "tid": 7, "ts": 1716454222443330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324193, "dur": 8, "args": { "External id": 70581, "cbid": 211, "correlation": 70581 } }, { "ph": "s", "id": 70581, "pid": 76337, "tid": -914061504, "ts": 1716454222324193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222443381, "dur": 36, "args": { "External id": 70589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70589, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70589, "pid": 5, "tid": 7, "ts": 1716454222443381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324221, "dur": 33, "args": { "External id": 70589, "cbid": 211, "correlation": 70589 } }, { "ph": "s", "id": 70589, "pid": 76337, "tid": -914061504, "ts": 1716454222324221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222443418, "dur": 54, "args": { "External id": 70609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70609, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 70609, "pid": 5, "tid": 7, "ts": 1716454222443418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324329, "dur": 12, "args": { "External id": 70609, "cbid": 211, "correlation": 70609 } }, { "ph": "s", "id": 70609, "pid": 76337, "tid": -914061504, "ts": 1716454222324329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222443474, "dur": 5, "args": { "External id": 70621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70621, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 70621, "pid": 5, "tid": 7, "ts": 1716454222443474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324351, "dur": 6, "args": { "External id": 70621, "cbid": 211, "correlation": 70621 } }, { "ph": "s", "id": 70621, "pid": 76337, "tid": -914061504, "ts": 1716454222324351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222443480, "dur": 58, "args": { "External id": 70624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70624, "pid": 5, "tid": 7, "ts": 1716454222443480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324369, "dur": 6, "args": { "External id": 70624, "cbid": 211, "correlation": 70624 } }, { "ph": "s", "id": 70624, "pid": 76337, "tid": -914061504, "ts": 1716454222324369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222324426, "dur": 0, "args": { "External id": 70635, "cbid": 317, "correlation": 70635 } }, { "ph": "f", "id": 70635, "pid": 76337, "tid": -914061504, "ts": 1716454222324426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222324427, "dur": 3, "args": { "External id": 70636, "cbid": 203, "correlation": 70636 } }, { "ph": "f", "id": 70636, "pid": 76337, "tid": -914061504, "ts": 1716454222324427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222324431, "dur": 0, "args": { "External id": 70637, "cbid": 205, "correlation": 70637 } }, { "ph": "f", "id": 70637, "pid": 76337, "tid": -914061504, "ts": 1716454222324431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324453, "dur": 1, "args": { "External id": 70641, "cbid": 251, "correlation": 70641 } }, { "ph": "f", "id": 70641, "pid": 76337, "tid": -914061504, "ts": 1716454222324453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324455, "dur": 0, "args": { "External id": 70642, "cbid": 251, "correlation": 70642 } }, { "ph": "f", "id": 70642, "pid": 76337, "tid": -914061504, "ts": 1716454222324455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324456, "dur": 0, "args": { "External id": 70643, "cbid": 251, "correlation": 70643 } }, { "ph": "f", "id": 70643, "pid": 76337, "tid": -914061504, "ts": 1716454222324456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324457, "dur": 0, "args": { "External id": 70644, "cbid": 251, "correlation": 70644 } }, { "ph": "f", "id": 70644, "pid": 76337, "tid": -914061504, "ts": 1716454222324457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324458, "dur": 0, "args": { "External id": 70645, "cbid": 251, "correlation": 70645 } }, { "ph": "f", "id": 70645, "pid": 76337, "tid": -914061504, "ts": 1716454222324458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324459, "dur": 0, "args": { "External id": 70646, "cbid": 251, "correlation": 70646 } }, { "ph": "f", "id": 70646, "pid": 76337, "tid": -914061504, "ts": 1716454222324459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324460, "dur": 0, "args": { "External id": 70647, "cbid": 251, "correlation": 70647 } }, { "ph": "f", "id": 70647, "pid": 76337, "tid": -914061504, "ts": 1716454222324460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324461, "dur": 0, "args": { "External id": 70648, "cbid": 251, "correlation": 70648 } }, { "ph": "f", "id": 70648, "pid": 76337, "tid": -914061504, "ts": 1716454222324461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324462, "dur": 0, "args": { "External id": 70649, "cbid": 251, "correlation": 70649 } }, { "ph": "f", "id": 70649, "pid": 76337, "tid": -914061504, "ts": 1716454222324462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222443539, "dur": 122, "args": { "External id": 70650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70650, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70650, "pid": 5, "tid": 7, "ts": 1716454222443539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324464, "dur": 13, "args": { "External id": 70650, "cbid": 211, "correlation": 70650 } }, { "ph": "s", "id": 70650, "pid": 76337, "tid": -914061504, "ts": 1716454222324464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222443663, "dur": 63, "args": { "External id": 70656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70656, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70656, "pid": 5, "tid": 7, "ts": 1716454222443663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324500, "dur": 9, "args": { "External id": 70656, "cbid": 211, "correlation": 70656 } }, { "ph": "s", "id": 70656, "pid": 76337, "tid": -914061504, "ts": 1716454222324500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222443727, "dur": 567, "args": { "External id": 70665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70665, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70665, "pid": 5, "tid": 7, "ts": 1716454222443727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324583, "dur": 14, "args": { "External id": 70665, "cbid": 211, "correlation": 70665 } }, { "ph": "s", "id": 70665, "pid": 76337, "tid": -914061504, "ts": 1716454222324583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222444296, "dur": 199, "args": { "External id": 70687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70687, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70687, "pid": 5, "tid": 7, "ts": 1716454222444296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324640, "dur": 10, "args": { "External id": 70687, "cbid": 211, "correlation": 70687 } }, { "ph": "s", "id": 70687, "pid": 76337, "tid": -914061504, "ts": 1716454222324640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324728, "dur": 1, "args": { "External id": 70698, "cbid": 251, "correlation": 70698 } }, { "ph": "f", "id": 70698, "pid": 76337, "tid": -914061504, "ts": 1716454222324728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222444496, "dur": 208, "args": { "External id": 70699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70699, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70699, "pid": 5, "tid": 7, "ts": 1716454222444496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324734, "dur": 13, "args": { "External id": 70699, "cbid": 211, "correlation": 70699 } }, { "ph": "s", "id": 70699, "pid": 76337, "tid": -914061504, "ts": 1716454222324734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324802, "dur": 1, "args": { "External id": 70710, "cbid": 251, "correlation": 70710 } }, { "ph": "f", "id": 70710, "pid": 76337, "tid": -914061504, "ts": 1716454222324802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222444705, "dur": 204, "args": { "External id": 70711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70711, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70711, "pid": 5, "tid": 7, "ts": 1716454222444705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324806, "dur": 11, "args": { "External id": 70711, "cbid": 211, "correlation": 70711 } }, { "ph": "s", "id": 70711, "pid": 76337, "tid": -914061504, "ts": 1716454222324806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222324874, "dur": 1, "args": { "External id": 70722, "cbid": 251, "correlation": 70722 } }, { "ph": "f", "id": 70722, "pid": 76337, "tid": -914061504, "ts": 1716454222324874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222444911, "dur": 201, "args": { "External id": 70723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70723, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70723, "pid": 5, "tid": 7, "ts": 1716454222444911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324878, "dur": 12, "args": { "External id": 70723, "cbid": 211, "correlation": 70723 } }, { "ph": "s", "id": 70723, "pid": 76337, "tid": -914061504, "ts": 1716454222324878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222445113, "dur": 19126, "args": { "External id": 70744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70744, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70744, "pid": 5, "tid": 7, "ts": 1716454222445113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222324959, "dur": 12, "args": { "External id": 70744, "cbid": 211, "correlation": 70744 } }, { "ph": "s", "id": 70744, "pid": 76337, "tid": -914061504, "ts": 1716454222324959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222325067, "dur": 1, "args": { "External id": 70762, "cbid": 251, "correlation": 70762 } }, { "ph": "f", "id": 70762, "pid": 76337, "tid": -914061504, "ts": 1716454222325067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222464241, "dur": 205, "args": { "External id": 70764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70764, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70764, "pid": 5, "tid": 7, "ts": 1716454222464241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222325073, "dur": 13, "args": { "External id": 70764, "cbid": 211, "correlation": 70764 } }, { "ph": "s", "id": 70764, "pid": 76337, "tid": -914061504, "ts": 1716454222325073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222464447, "dur": 67, "args": { "External id": 70772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70772, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70772, "pid": 5, "tid": 7, "ts": 1716454222464447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222325142, "dur": 12, "args": { "External id": 70772, "cbid": 211, "correlation": 70772 } }, { "ph": "s", "id": 70772, "pid": 76337, "tid": -914061504, "ts": 1716454222325142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222464516, "dur": 98, "args": { "External id": 70780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70780, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70780, "pid": 5, "tid": 7, "ts": 1716454222464516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222325180, "dur": 50, "args": { "External id": 70780, "cbid": 211, "correlation": 70780 } }, { "ph": "s", "id": 70780, "pid": 76337, "tid": -914061504, "ts": 1716454222325180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222464615, "dur": 55, "args": { "External id": 70791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70791, "pid": 5, "tid": 7, "ts": 1716454222464615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222325295, "dur": 96, "args": { "External id": 70791, "cbid": 211, "correlation": 70791 } }, { "ph": "s", "id": 70791, "pid": 76337, "tid": -914061504, "ts": 1716454222325295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222464671, "dur": 94, "args": { "External id": 70813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70813, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70813, "pid": 5, "tid": 7, "ts": 1716454222464671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222325410, "dur": 2127, "args": { "External id": 70813, "cbid": 211, "correlation": 70813 } }, { "ph": "s", "id": 70813, "pid": 76337, "tid": -914061504, "ts": 1716454222325410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222327616, "dur": 1, "args": { "External id": 70824, "cbid": 251, "correlation": 70824 } }, { "ph": "f", "id": 70824, "pid": 76337, "tid": -914061504, "ts": 1716454222327616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222464767, "dur": 104, "args": { "External id": 70825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70825, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70825, "pid": 5, "tid": 7, "ts": 1716454222464767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327621, "dur": 76, "args": { "External id": 70825, "cbid": 211, "correlation": 70825 } }, { "ph": "s", "id": 70825, "pid": 76337, "tid": -914061504, "ts": 1716454222327621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222327756, "dur": 1, "args": { "External id": 70836, "cbid": 251, "correlation": 70836 } }, { "ph": "f", "id": 70836, "pid": 76337, "tid": -914061504, "ts": 1716454222327756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222327759, "dur": 0, "args": { "External id": 70837, "cbid": 251, "correlation": 70837 } }, { "ph": "f", "id": 70837, "pid": 76337, "tid": -914061504, "ts": 1716454222327759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222464873, "dur": 10, "args": { "External id": 70838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70838, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 70838, "pid": 5, "tid": 7, "ts": 1716454222464873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327761, "dur": 12, "args": { "External id": 70838, "cbid": 211, "correlation": 70838 } }, { "ph": "s", "id": 70838, "pid": 76337, "tid": -914061504, "ts": 1716454222327761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222464884, "dur": 5, "args": { "External id": 70840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70840, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70840, "pid": 5, "tid": 7, "ts": 1716454222464884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327775, "dur": 10, "args": { "External id": 70840, "cbid": 211, "correlation": 70840 } }, { "ph": "s", "id": 70840, "pid": 76337, "tid": -914061504, "ts": 1716454222327775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222327841, "dur": 1, "args": { "External id": 70851, "cbid": 251, "correlation": 70851 } }, { "ph": "f", "id": 70851, "pid": 76337, "tid": -914061504, "ts": 1716454222327841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222327844, "dur": 0, "args": { "External id": 70852, "cbid": 251, "correlation": 70852 } }, { "ph": "f", "id": 70852, "pid": 76337, "tid": -914061504, "ts": 1716454222327844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222464891, "dur": 6, "args": { "External id": 70853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70853, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 70853, "pid": 5, "tid": 7, "ts": 1716454222464891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327846, "dur": 12, "args": { "External id": 70853, "cbid": 211, "correlation": 70853 } }, { "ph": "s", "id": 70853, "pid": 76337, "tid": -914061504, "ts": 1716454222327846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222464898, "dur": 4, "args": { "External id": 70855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70855, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 70855, "pid": 5, "tid": 7, "ts": 1716454222464898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327859, "dur": 6, "args": { "External id": 70855, "cbid": 211, "correlation": 70855 } }, { "ph": "s", "id": 70855, "pid": 76337, "tid": -914061504, "ts": 1716454222327859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222464903, "dur": 159, "args": { "External id": 70876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70876, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70876, "pid": 5, "tid": 7, "ts": 1716454222464903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222327956, "dur": 14, "args": { "External id": 70876, "cbid": 211, "correlation": 70876 } }, { "ph": "s", "id": 70876, "pid": 76337, "tid": -914061504, "ts": 1716454222327956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328064, "dur": 1, "args": { "External id": 70894, "cbid": 251, "correlation": 70894 } }, { "ph": "f", "id": 70894, "pid": 76337, "tid": -914061504, "ts": 1716454222328064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222465064, "dur": 110, "args": { "External id": 70896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70896, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 70896, "pid": 5, "tid": 7, "ts": 1716454222465064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328071, "dur": 14, "args": { "External id": 70896, "cbid": 211, "correlation": 70896 } }, { "ph": "s", "id": 70896, "pid": 76337, "tid": -914061504, "ts": 1716454222328071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222465175, "dur": 35, "args": { "External id": 70904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70904, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70904, "pid": 5, "tid": 7, "ts": 1716454222465175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328140, "dur": 13, "args": { "External id": 70904, "cbid": 211, "correlation": 70904 } }, { "ph": "s", "id": 70904, "pid": 76337, "tid": -914061504, "ts": 1716454222328140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222465211, "dur": 67, "args": { "External id": 70912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70912, "pid": 5, "tid": 7, "ts": 1716454222465211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328181, "dur": 9, "args": { "External id": 70912, "cbid": 211, "correlation": 70912 } }, { "ph": "s", "id": 70912, "pid": 76337, "tid": -914061504, "ts": 1716454222328181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222465279, "dur": 94, "args": { "External id": 70934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70934, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70934, "pid": 5, "tid": 7, "ts": 1716454222465279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328233, "dur": 10, "args": { "External id": 70934, "cbid": 211, "correlation": 70934 } }, { "ph": "s", "id": 70934, "pid": 76337, "tid": -914061504, "ts": 1716454222328233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328322, "dur": 1, "args": { "External id": 70950, "cbid": 251, "correlation": 70950 } }, { "ph": "f", "id": 70950, "pid": 76337, "tid": -914061504, "ts": 1716454222328322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222465374, "dur": 583, "args": { "External id": 70952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70952, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 70952, "pid": 5, "tid": 7, "ts": 1716454222465374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328327, "dur": 13, "args": { "External id": 70952, "cbid": 211, "correlation": 70952 } }, { "ph": "s", "id": 70952, "pid": 76337, "tid": -914061504, "ts": 1716454222328327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222465959, "dur": 250, "args": { "External id": 70960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70960, "pid": 5, "tid": 7, "ts": 1716454222465959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328393, "dur": 13, "args": { "External id": 70960, "cbid": 211, "correlation": 70960 } }, { "ph": "s", "id": 70960, "pid": 76337, "tid": -914061504, "ts": 1716454222328393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222466210, "dur": 254, "args": { "External id": 70968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70968, "pid": 5, "tid": 7, "ts": 1716454222466210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328423, "dur": 8, "args": { "External id": 70968, "cbid": 211, "correlation": 70968 } }, { "ph": "s", "id": 70968, "pid": 76337, "tid": -914061504, "ts": 1716454222328423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328504, "dur": 1, "args": { "External id": 70984, "cbid": 251, "correlation": 70984 } }, { "ph": "f", "id": 70984, "pid": 76337, "tid": -914061504, "ts": 1716454222328504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328509, "dur": 0, "args": { "External id": 70986, "cbid": 251, "correlation": 70986 } }, { "ph": "f", "id": 70986, "pid": 76337, "tid": -914061504, "ts": 1716454222328509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222466465, "dur": 366, "args": { "External id": 70987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70987, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 70987, "pid": 5, "tid": 7, "ts": 1716454222466465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328511, "dur": 13, "args": { "External id": 70987, "cbid": 211, "correlation": 70987 } }, { "ph": "s", "id": 70987, "pid": 76337, "tid": -914061504, "ts": 1716454222328511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222466833, "dur": 50, "args": { "External id": 70995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 70995, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 70995, "pid": 5, "tid": 7, "ts": 1716454222466833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328554, "dur": 10, "args": { "External id": 70995, "cbid": 211, "correlation": 70995 } }, { "ph": "s", "id": 70995, "pid": 76337, "tid": -914061504, "ts": 1716454222328554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222466884, "dur": 160, "args": { "External id": 71006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71006, "pid": 5, "tid": 7, "ts": 1716454222466884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328621, "dur": 275, "args": { "External id": 71006, "cbid": 211, "correlation": 71006 } }, { "ph": "s", "id": 71006, "pid": 76337, "tid": -914061504, "ts": 1716454222328621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222328949, "dur": 0, "args": { "External id": 71018, "cbid": 317, "correlation": 71018 } }, { "ph": "f", "id": 71018, "pid": 76337, "tid": -914061504, "ts": 1716454222328949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222328950, "dur": 0, "args": { "External id": 71019, "cbid": 203, "correlation": 71019 } }, { "ph": "f", "id": 71019, "pid": 76337, "tid": -914061504, "ts": 1716454222328950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222328951, "dur": 3, "args": { "External id": 71020, "cbid": 205, "correlation": 71020 } }, { "ph": "f", "id": 71020, "pid": 76337, "tid": -914061504, "ts": 1716454222328951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328984, "dur": 1, "args": { "External id": 71024, "cbid": 251, "correlation": 71024 } }, { "ph": "f", "id": 71024, "pid": 76337, "tid": -914061504, "ts": 1716454222328984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328985, "dur": 0, "args": { "External id": 71025, "cbid": 251, "correlation": 71025 } }, { "ph": "f", "id": 71025, "pid": 76337, "tid": -914061504, "ts": 1716454222328985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328986, "dur": 0, "args": { "External id": 71026, "cbid": 251, "correlation": 71026 } }, { "ph": "f", "id": 71026, "pid": 76337, "tid": -914061504, "ts": 1716454222328986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328987, "dur": 0, "args": { "External id": 71027, "cbid": 251, "correlation": 71027 } }, { "ph": "f", "id": 71027, "pid": 76337, "tid": -914061504, "ts": 1716454222328987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328988, "dur": 0, "args": { "External id": 71028, "cbid": 251, "correlation": 71028 } }, { "ph": "f", "id": 71028, "pid": 76337, "tid": -914061504, "ts": 1716454222328988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328989, "dur": 0, "args": { "External id": 71029, "cbid": 251, "correlation": 71029 } }, { "ph": "f", "id": 71029, "pid": 76337, "tid": -914061504, "ts": 1716454222328989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328990, "dur": 0, "args": { "External id": 71030, "cbid": 251, "correlation": 71030 } }, { "ph": "f", "id": 71030, "pid": 76337, "tid": -914061504, "ts": 1716454222328990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328990, "dur": 0, "args": { "External id": 71031, "cbid": 251, "correlation": 71031 } }, { "ph": "f", "id": 71031, "pid": 76337, "tid": -914061504, "ts": 1716454222328990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222328992, "dur": 0, "args": { "External id": 71032, "cbid": 251, "correlation": 71032 } }, { "ph": "f", "id": 71032, "pid": 76337, "tid": -914061504, "ts": 1716454222328992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222467045, "dur": 116, "args": { "External id": 71033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71033, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71033, "pid": 5, "tid": 7, "ts": 1716454222467045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222328994, "dur": 37, "args": { "External id": 71033, "cbid": 211, "correlation": 71033 } }, { "ph": "s", "id": 71033, "pid": 76337, "tid": -914061504, "ts": 1716454222328994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222467163, "dur": 60, "args": { "External id": 71039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71039, "pid": 5, "tid": 7, "ts": 1716454222467163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329055, "dur": 107, "args": { "External id": 71039, "cbid": 211, "correlation": 71039 } }, { "ph": "s", "id": 71039, "pid": 76337, "tid": -914061504, "ts": 1716454222329055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222467224, "dur": 50, "args": { "External id": 71047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71047, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71047, "pid": 5, "tid": 7, "ts": 1716454222467224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329185, "dur": 309, "args": { "External id": 71047, "cbid": 211, "correlation": 71047 } }, { "ph": "s", "id": 71047, "pid": 76337, "tid": -914061504, "ts": 1716454222329185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222467275, "dur": 99, "args": { "External id": 71056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71056, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71056, "pid": 5, "tid": 7, "ts": 1716454222467275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329524, "dur": 10, "args": { "External id": 71056, "cbid": 211, "correlation": 71056 } }, { "ph": "s", "id": 71056, "pid": 76337, "tid": -914061504, "ts": 1716454222329524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222467376, "dur": 93, "args": { "External id": 71076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71076, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 71076, "pid": 5, "tid": 7, "ts": 1716454222467376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329596, "dur": 19, "args": { "External id": 71076, "cbid": 211, "correlation": 71076 } }, { "ph": "s", "id": 71076, "pid": 76337, "tid": -914061504, "ts": 1716454222329596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222467470, "dur": 5, "args": { "External id": 71088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71088, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 71088, "pid": 5, "tid": 7, "ts": 1716454222467470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329625, "dur": 12, "args": { "External id": 71088, "cbid": 211, "correlation": 71088 } }, { "ph": "s", "id": 71088, "pid": 76337, "tid": -914061504, "ts": 1716454222329625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222467476, "dur": 110, "args": { "External id": 71091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71091, "pid": 5, "tid": 7, "ts": 1716454222467476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329649, "dur": 121, "args": { "External id": 71091, "cbid": 211, "correlation": 71091 } }, { "ph": "s", "id": 71091, "pid": 76337, "tid": -914061504, "ts": 1716454222329649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222467587, "dur": 70, "args": { "External id": 71100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71100, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71100, "pid": 5, "tid": 7, "ts": 1716454222467587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329807, "dur": 11, "args": { "External id": 71100, "cbid": 211, "correlation": 71100 } }, { "ph": "s", "id": 71100, "pid": 76337, "tid": -914061504, "ts": 1716454222329807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222329861, "dur": 0, "args": { "External id": 71110, "cbid": 317, "correlation": 71110 } }, { "ph": "f", "id": 71110, "pid": 76337, "tid": -914061504, "ts": 1716454222329861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222329861, "dur": 0, "args": { "External id": 71111, "cbid": 203, "correlation": 71111 } }, { "ph": "f", "id": 71111, "pid": 76337, "tid": -914061504, "ts": 1716454222329861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222329862, "dur": 0, "args": { "External id": 71112, "cbid": 205, "correlation": 71112 } }, { "ph": "f", "id": 71112, "pid": 76337, "tid": -914061504, "ts": 1716454222329862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222467659, "dur": 77, "args": { "External id": 71116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71116, "pid": 5, "tid": 7, "ts": 1716454222467659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329876, "dur": 12, "args": { "External id": 71116, "cbid": 211, "correlation": 71116 } }, { "ph": "s", "id": 71116, "pid": 76337, "tid": -914061504, "ts": 1716454222329876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222467737, "dur": 25, "args": { "External id": 71118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71118, "pid": 5, "tid": 7, "ts": 1716454222467737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329891, "dur": 5, "args": { "External id": 71118, "cbid": 211, "correlation": 71118 } }, { "ph": "s", "id": 71118, "pid": 76337, "tid": -914061504, "ts": 1716454222329891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222467763, "dur": 4, "args": { "External id": 71120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71120, "pid": 5, "tid": 7, "ts": 1716454222467763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329901, "dur": 6, "args": { "External id": 71120, "cbid": 211, "correlation": 71120 } }, { "ph": "s", "id": 71120, "pid": 76337, "tid": -914061504, "ts": 1716454222329901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222329909, "dur": 0, "args": { "External id": 71121, "cbid": 51, "correlation": 71121 } }, { "ph": "s", "id": 71121, "pid": 76337, "tid": -914061504, "ts": 1716454222329909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222467768, "dur": 1385, "args": { "External id": 71122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71122, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71122, "pid": 5, "tid": 7, "ts": 1716454222467768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329910, "dur": 5, "args": { "External id": 71122, "cbid": 211, "correlation": 71122 } }, { "ph": "s", "id": 71122, "pid": 76337, "tid": -914061504, "ts": 1716454222329910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222469154, "dur": 60, "args": { "External id": 71127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71127, "pid": 5, "tid": 7, "ts": 1716454222469154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329937, "dur": 10, "args": { "External id": 71127, "cbid": 211, "correlation": 71127 } }, { "ph": "s", "id": 71127, "pid": 76337, "tid": -914061504, "ts": 1716454222329937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222469215, "dur": 4, "args": { "External id": 71135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71135, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71135, "pid": 5, "tid": 7, "ts": 1716454222469215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222329989, "dur": 11, "args": { "External id": 71135, "cbid": 211, "correlation": 71135 } }, { "ph": "s", "id": 71135, "pid": 76337, "tid": -914061504, "ts": 1716454222329989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222330057, "dur": 1, "args": { "External id": 71151, "cbid": 251, "correlation": 71151 } }, { "ph": "f", "id": 71151, "pid": 76337, "tid": -914061504, "ts": 1716454222330057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222330063, "dur": 0, "args": { "External id": 71153, "cbid": 251, "correlation": 71153 } }, { "ph": "f", "id": 71153, "pid": 76337, "tid": -914061504, "ts": 1716454222330063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222469221, "dur": 11, "args": { "External id": 71154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71154, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 71154, "pid": 5, "tid": 7, "ts": 1716454222469221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330065, "dur": 11, "args": { "External id": 71154, "cbid": 211, "correlation": 71154 } }, { "ph": "s", "id": 71154, "pid": 76337, "tid": -914061504, "ts": 1716454222330065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222469233, "dur": 5, "args": { "External id": 71156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71156, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 71156, "pid": 5, "tid": 7, "ts": 1716454222469233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330078, "dur": 8, "args": { "External id": 71156, "cbid": 211, "correlation": 71156 } }, { "ph": "s", "id": 71156, "pid": 76337, "tid": -914061504, "ts": 1716454222330078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222469240, "dur": 55, "args": { "External id": 71166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71166, "pid": 5, "tid": 7, "ts": 1716454222469240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330139, "dur": 614, "args": { "External id": 71166, "cbid": 211, "correlation": 71166 } }, { "ph": "s", "id": 71166, "pid": 76337, "tid": -914061504, "ts": 1716454222330139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222469296, "dur": 54, "args": { "External id": 71186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71186, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 71186, "pid": 5, "tid": 7, "ts": 1716454222469296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330809, "dur": 11, "args": { "External id": 71186, "cbid": 211, "correlation": 71186 } }, { "ph": "s", "id": 71186, "pid": 76337, "tid": -914061504, "ts": 1716454222330809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222469351, "dur": 4, "args": { "External id": 71198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71198, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71198, "pid": 5, "tid": 7, "ts": 1716454222469351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330830, "dur": 6, "args": { "External id": 71198, "cbid": 211, "correlation": 71198 } }, { "ph": "s", "id": 71198, "pid": 76337, "tid": -914061504, "ts": 1716454222330830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222469356, "dur": 55, "args": { "External id": 71201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71201, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71201, "pid": 5, "tid": 7, "ts": 1716454222469356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330848, "dur": 7, "args": { "External id": 71201, "cbid": 211, "correlation": 71201 } }, { "ph": "s", "id": 71201, "pid": 76337, "tid": -914061504, "ts": 1716454222330848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222469413, "dur": 36, "args": { "External id": 71210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71210, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71210, "pid": 5, "tid": 7, "ts": 1716454222469413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330889, "dur": 10, "args": { "External id": 71210, "cbid": 211, "correlation": 71210 } }, { "ph": "s", "id": 71210, "pid": 76337, "tid": -914061504, "ts": 1716454222330889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222330951, "dur": 0, "args": { "External id": 71220, "cbid": 317, "correlation": 71220 } }, { "ph": "f", "id": 71220, "pid": 76337, "tid": -914061504, "ts": 1716454222330951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222330952, "dur": 0, "args": { "External id": 71221, "cbid": 203, "correlation": 71221 } }, { "ph": "f", "id": 71221, "pid": 76337, "tid": -914061504, "ts": 1716454222330952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222330953, "dur": 0, "args": { "External id": 71222, "cbid": 205, "correlation": 71222 } }, { "ph": "f", "id": 71222, "pid": 76337, "tid": -914061504, "ts": 1716454222330953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222469451, "dur": 40, "args": { "External id": 71226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71226, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71226, "pid": 5, "tid": 7, "ts": 1716454222469451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330967, "dur": 20, "args": { "External id": 71226, "cbid": 211, "correlation": 71226 } }, { "ph": "s", "id": 71226, "pid": 76337, "tid": -914061504, "ts": 1716454222330967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222469492, "dur": 14, "args": { "External id": 71228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71228, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71228, "pid": 5, "tid": 7, "ts": 1716454222469492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222330989, "dur": 8, "args": { "External id": 71228, "cbid": 211, "correlation": 71228 } }, { "ph": "s", "id": 71228, "pid": 76337, "tid": -914061504, "ts": 1716454222330989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222469508, "dur": 3, "args": { "External id": 71230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71230, "pid": 5, "tid": 7, "ts": 1716454222469508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331001, "dur": 6, "args": { "External id": 71230, "cbid": 211, "correlation": 71230 } }, { "ph": "s", "id": 71230, "pid": 76337, "tid": -914061504, "ts": 1716454222331001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222331010, "dur": 0, "args": { "External id": 71231, "cbid": 51, "correlation": 71231 } }, { "ph": "s", "id": 71231, "pid": 76337, "tid": -914061504, "ts": 1716454222331010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222469512, "dur": 708, "args": { "External id": 71232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71232, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71232, "pid": 5, "tid": 7, "ts": 1716454222469512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331011, "dur": 5, "args": { "External id": 71232, "cbid": 211, "correlation": 71232 } }, { "ph": "s", "id": 71232, "pid": 76337, "tid": -914061504, "ts": 1716454222331011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222470222, "dur": 60, "args": { "External id": 71237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71237, "pid": 5, "tid": 7, "ts": 1716454222470222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331039, "dur": 8, "args": { "External id": 71237, "cbid": 211, "correlation": 71237 } }, { "ph": "s", "id": 71237, "pid": 76337, "tid": -914061504, "ts": 1716454222331039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222331098, "dur": 0, "args": { "External id": 71247, "cbid": 317, "correlation": 71247 } }, { "ph": "f", "id": 71247, "pid": 76337, "tid": -914061504, "ts": 1716454222331098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222331099, "dur": 0, "args": { "External id": 71248, "cbid": 203, "correlation": 71248 } }, { "ph": "f", "id": 71248, "pid": 76337, "tid": -914061504, "ts": 1716454222331099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222331100, "dur": 0, "args": { "External id": 71249, "cbid": 205, "correlation": 71249 } }, { "ph": "f", "id": 71249, "pid": 76337, "tid": -914061504, "ts": 1716454222331100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222470282, "dur": 76, "args": { "External id": 71253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71253, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71253, "pid": 5, "tid": 7, "ts": 1716454222470282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331112, "dur": 12, "args": { "External id": 71253, "cbid": 211, "correlation": 71253 } }, { "ph": "s", "id": 71253, "pid": 76337, "tid": -914061504, "ts": 1716454222331112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222470360, "dur": 211, "args": { "External id": 71255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71255, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71255, "pid": 5, "tid": 7, "ts": 1716454222470360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331130, "dur": 7, "args": { "External id": 71255, "cbid": 211, "correlation": 71255 } }, { "ph": "s", "id": 71255, "pid": 76337, "tid": -914061504, "ts": 1716454222331130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222470572, "dur": 39, "args": { "External id": 71257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71257, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71257, "pid": 5, "tid": 7, "ts": 1716454222470572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331141, "dur": 6, "args": { "External id": 71257, "cbid": 211, "correlation": 71257 } }, { "ph": "s", "id": 71257, "pid": 76337, "tid": -914061504, "ts": 1716454222331141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222470613, "dur": 60, "args": { "External id": 71263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71263, "pid": 5, "tid": 7, "ts": 1716454222470613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331168, "dur": 582, "args": { "External id": 71263, "cbid": 211, "correlation": 71263 } }, { "ph": "s", "id": 71263, "pid": 76337, "tid": -914061504, "ts": 1716454222331168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222470674, "dur": 50, "args": { "External id": 71271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71271, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71271, "pid": 5, "tid": 7, "ts": 1716454222470674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331771, "dur": 8, "args": { "External id": 71271, "cbid": 211, "correlation": 71271 } }, { "ph": "s", "id": 71271, "pid": 76337, "tid": -914061504, "ts": 1716454222331771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222470725, "dur": 35, "args": { "External id": 71279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71279, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71279, "pid": 5, "tid": 7, "ts": 1716454222470725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331799, "dur": 11, "args": { "External id": 71279, "cbid": 211, "correlation": 71279 } }, { "ph": "s", "id": 71279, "pid": 76337, "tid": -914061504, "ts": 1716454222331799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222470761, "dur": 53, "args": { "External id": 71299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71299, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 71299, "pid": 5, "tid": 7, "ts": 1716454222470761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331888, "dur": 13, "args": { "External id": 71299, "cbid": 211, "correlation": 71299 } }, { "ph": "s", "id": 71299, "pid": 76337, "tid": -914061504, "ts": 1716454222331888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222470816, "dur": 4, "args": { "External id": 71311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71311, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71311, "pid": 5, "tid": 7, "ts": 1716454222470816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331910, "dur": 6, "args": { "External id": 71311, "cbid": 211, "correlation": 71311 } }, { "ph": "s", "id": 71311, "pid": 76337, "tid": -914061504, "ts": 1716454222331910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222470822, "dur": 55, "args": { "External id": 71314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71314, "pid": 5, "tid": 7, "ts": 1716454222470822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222331928, "dur": 6, "args": { "External id": 71314, "cbid": 211, "correlation": 71314 } }, { "ph": "s", "id": 71314, "pid": 76337, "tid": -914061504, "ts": 1716454222331928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222331994, "dur": 0, "args": { "External id": 71325, "cbid": 317, "correlation": 71325 } }, { "ph": "f", "id": 71325, "pid": 76337, "tid": -914061504, "ts": 1716454222331994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222331995, "dur": 0, "args": { "External id": 71326, "cbid": 203, "correlation": 71326 } }, { "ph": "f", "id": 71326, "pid": 76337, "tid": -914061504, "ts": 1716454222331995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222331996, "dur": 0, "args": { "External id": 71327, "cbid": 205, "correlation": 71327 } }, { "ph": "f", "id": 71327, "pid": 76337, "tid": -914061504, "ts": 1716454222331996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332018, "dur": 1, "args": { "External id": 71331, "cbid": 251, "correlation": 71331 } }, { "ph": "f", "id": 71331, "pid": 76337, "tid": -914061504, "ts": 1716454222332018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332020, "dur": 0, "args": { "External id": 71332, "cbid": 251, "correlation": 71332 } }, { "ph": "f", "id": 71332, "pid": 76337, "tid": -914061504, "ts": 1716454222332020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332021, "dur": 0, "args": { "External id": 71333, "cbid": 251, "correlation": 71333 } }, { "ph": "f", "id": 71333, "pid": 76337, "tid": -914061504, "ts": 1716454222332021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332021, "dur": 0, "args": { "External id": 71334, "cbid": 251, "correlation": 71334 } }, { "ph": "f", "id": 71334, "pid": 76337, "tid": -914061504, "ts": 1716454222332021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332022, "dur": 0, "args": { "External id": 71335, "cbid": 251, "correlation": 71335 } }, { "ph": "f", "id": 71335, "pid": 76337, "tid": -914061504, "ts": 1716454222332022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332023, "dur": 0, "args": { "External id": 71336, "cbid": 251, "correlation": 71336 } }, { "ph": "f", "id": 71336, "pid": 76337, "tid": -914061504, "ts": 1716454222332023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332024, "dur": 0, "args": { "External id": 71337, "cbid": 251, "correlation": 71337 } }, { "ph": "f", "id": 71337, "pid": 76337, "tid": -914061504, "ts": 1716454222332024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332024, "dur": 0, "args": { "External id": 71338, "cbid": 251, "correlation": 71338 } }, { "ph": "f", "id": 71338, "pid": 76337, "tid": -914061504, "ts": 1716454222332024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332026, "dur": 0, "args": { "External id": 71339, "cbid": 251, "correlation": 71339 } }, { "ph": "f", "id": 71339, "pid": 76337, "tid": -914061504, "ts": 1716454222332026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222470879, "dur": 116, "args": { "External id": 71340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71340, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71340, "pid": 5, "tid": 7, "ts": 1716454222470879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332028, "dur": 13, "args": { "External id": 71340, "cbid": 211, "correlation": 71340 } }, { "ph": "s", "id": 71340, "pid": 76337, "tid": -914061504, "ts": 1716454222332028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222470996, "dur": 60, "args": { "External id": 71346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71346, "pid": 5, "tid": 7, "ts": 1716454222470996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332064, "dur": 9, "args": { "External id": 71346, "cbid": 211, "correlation": 71346 } }, { "ph": "s", "id": 71346, "pid": 76337, "tid": -914061504, "ts": 1716454222332064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222471057, "dur": 611, "args": { "External id": 71355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71355, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71355, "pid": 5, "tid": 7, "ts": 1716454222471057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332147, "dur": 14, "args": { "External id": 71355, "cbid": 211, "correlation": 71355 } }, { "ph": "s", "id": 71355, "pid": 76337, "tid": -914061504, "ts": 1716454222332147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222471669, "dur": 184, "args": { "External id": 71377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71377, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71377, "pid": 5, "tid": 7, "ts": 1716454222471669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332208, "dur": 11, "args": { "External id": 71377, "cbid": 211, "correlation": 71377 } }, { "ph": "s", "id": 71377, "pid": 76337, "tid": -914061504, "ts": 1716454222332208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332294, "dur": 1, "args": { "External id": 71388, "cbid": 251, "correlation": 71388 } }, { "ph": "f", "id": 71388, "pid": 76337, "tid": -914061504, "ts": 1716454222332294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222471855, "dur": 196, "args": { "External id": 71389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71389, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71389, "pid": 5, "tid": 7, "ts": 1716454222471855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332299, "dur": 13, "args": { "External id": 71389, "cbid": 211, "correlation": 71389 } }, { "ph": "s", "id": 71389, "pid": 76337, "tid": -914061504, "ts": 1716454222332299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332365, "dur": 1, "args": { "External id": 71400, "cbid": 251, "correlation": 71400 } }, { "ph": "f", "id": 71400, "pid": 76337, "tid": -914061504, "ts": 1716454222332365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222472052, "dur": 194, "args": { "External id": 71401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71401, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71401, "pid": 5, "tid": 7, "ts": 1716454222472052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332369, "dur": 12, "args": { "External id": 71401, "cbid": 211, "correlation": 71401 } }, { "ph": "s", "id": 71401, "pid": 76337, "tid": -914061504, "ts": 1716454222332369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332431, "dur": 1, "args": { "External id": 71412, "cbid": 251, "correlation": 71412 } }, { "ph": "f", "id": 71412, "pid": 76337, "tid": -914061504, "ts": 1716454222332431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222472247, "dur": 190, "args": { "External id": 71413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71413, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71413, "pid": 5, "tid": 7, "ts": 1716454222472247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332435, "dur": 12, "args": { "External id": 71413, "cbid": 211, "correlation": 71413 } }, { "ph": "s", "id": 71413, "pid": 76337, "tid": -914061504, "ts": 1716454222332435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222472438, "dur": 18948, "args": { "External id": 71434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71434, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71434, "pid": 5, "tid": 7, "ts": 1716454222472438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332514, "dur": 12, "args": { "External id": 71434, "cbid": 211, "correlation": 71434 } }, { "ph": "s", "id": 71434, "pid": 76337, "tid": -914061504, "ts": 1716454222332514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222332613, "dur": 1, "args": { "External id": 71452, "cbid": 251, "correlation": 71452 } }, { "ph": "f", "id": 71452, "pid": 76337, "tid": -914061504, "ts": 1716454222332613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222491387, "dur": 204, "args": { "External id": 71454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71454, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71454, "pid": 5, "tid": 7, "ts": 1716454222491387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332619, "dur": 24, "args": { "External id": 71454, "cbid": 211, "correlation": 71454 } }, { "ph": "s", "id": 71454, "pid": 76337, "tid": -914061504, "ts": 1716454222332619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222491593, "dur": 66, "args": { "External id": 71462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71462, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71462, "pid": 5, "tid": 7, "ts": 1716454222491593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332703, "dur": 42, "args": { "External id": 71462, "cbid": 211, "correlation": 71462 } }, { "ph": "s", "id": 71462, "pid": 76337, "tid": -914061504, "ts": 1716454222332703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222491660, "dur": 97, "args": { "External id": 71470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71470, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71470, "pid": 5, "tid": 7, "ts": 1716454222491660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332773, "dur": 129, "args": { "External id": 71470, "cbid": 211, "correlation": 71470 } }, { "ph": "s", "id": 71470, "pid": 76337, "tid": -914061504, "ts": 1716454222332773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222491759, "dur": 55, "args": { "External id": 71481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71481, "pid": 5, "tid": 7, "ts": 1716454222491759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222332965, "dur": 2086, "args": { "External id": 71481, "cbid": 211, "correlation": 71481 } }, { "ph": "s", "id": 71481, "pid": 76337, "tid": -914061504, "ts": 1716454222332965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222491815, "dur": 94, "args": { "External id": 71503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71503, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71503, "pid": 5, "tid": 7, "ts": 1716454222491815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335070, "dur": 138, "args": { "External id": 71503, "cbid": 211, "correlation": 71503 } }, { "ph": "s", "id": 71503, "pid": 76337, "tid": -914061504, "ts": 1716454222335070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335285, "dur": 1, "args": { "External id": 71514, "cbid": 251, "correlation": 71514 } }, { "ph": "f", "id": 71514, "pid": 76337, "tid": -914061504, "ts": 1716454222335285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222491911, "dur": 105, "args": { "External id": 71515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71515, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71515, "pid": 5, "tid": 7, "ts": 1716454222491911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335290, "dur": 14, "args": { "External id": 71515, "cbid": 211, "correlation": 71515 } }, { "ph": "s", "id": 71515, "pid": 76337, "tid": -914061504, "ts": 1716454222335290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335365, "dur": 1, "args": { "External id": 71526, "cbid": 251, "correlation": 71526 } }, { "ph": "f", "id": 71526, "pid": 76337, "tid": -914061504, "ts": 1716454222335365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335368, "dur": 0, "args": { "External id": 71527, "cbid": 251, "correlation": 71527 } }, { "ph": "f", "id": 71527, "pid": 76337, "tid": -914061504, "ts": 1716454222335368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222492017, "dur": 10, "args": { "External id": 71528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71528, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71528, "pid": 5, "tid": 7, "ts": 1716454222492017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335370, "dur": 12, "args": { "External id": 71528, "cbid": 211, "correlation": 71528 } }, { "ph": "s", "id": 71528, "pid": 76337, "tid": -914061504, "ts": 1716454222335370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222492029, "dur": 5, "args": { "External id": 71530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71530, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 71530, "pid": 5, "tid": 7, "ts": 1716454222492029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335384, "dur": 6, "args": { "External id": 71530, "cbid": 211, "correlation": 71530 } }, { "ph": "s", "id": 71530, "pid": 76337, "tid": -914061504, "ts": 1716454222335384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335446, "dur": 1, "args": { "External id": 71541, "cbid": 251, "correlation": 71541 } }, { "ph": "f", "id": 71541, "pid": 76337, "tid": -914061504, "ts": 1716454222335446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335449, "dur": 0, "args": { "External id": 71542, "cbid": 251, "correlation": 71542 } }, { "ph": "f", "id": 71542, "pid": 76337, "tid": -914061504, "ts": 1716454222335449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222492035, "dur": 6, "args": { "External id": 71543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71543, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71543, "pid": 5, "tid": 7, "ts": 1716454222492035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335451, "dur": 12, "args": { "External id": 71543, "cbid": 211, "correlation": 71543 } }, { "ph": "s", "id": 71543, "pid": 76337, "tid": -914061504, "ts": 1716454222335451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222492043, "dur": 4, "args": { "External id": 71545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71545, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 71545, "pid": 5, "tid": 7, "ts": 1716454222492043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335465, "dur": 6, "args": { "External id": 71545, "cbid": 211, "correlation": 71545 } }, { "ph": "s", "id": 71545, "pid": 76337, "tid": -914061504, "ts": 1716454222335465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222492048, "dur": 156, "args": { "External id": 71566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71566, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71566, "pid": 5, "tid": 7, "ts": 1716454222492048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335538, "dur": 13, "args": { "External id": 71566, "cbid": 211, "correlation": 71566 } }, { "ph": "s", "id": 71566, "pid": 76337, "tid": -914061504, "ts": 1716454222335538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335634, "dur": 1, "args": { "External id": 71584, "cbid": 251, "correlation": 71584 } }, { "ph": "f", "id": 71584, "pid": 76337, "tid": -914061504, "ts": 1716454222335634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222492205, "dur": 108, "args": { "External id": 71586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71586, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71586, "pid": 5, "tid": 7, "ts": 1716454222492205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335641, "dur": 13, "args": { "External id": 71586, "cbid": 211, "correlation": 71586 } }, { "ph": "s", "id": 71586, "pid": 76337, "tid": -914061504, "ts": 1716454222335641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222492314, "dur": 35, "args": { "External id": 71594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71594, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71594, "pid": 5, "tid": 7, "ts": 1716454222492314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335709, "dur": 15, "args": { "External id": 71594, "cbid": 211, "correlation": 71594 } }, { "ph": "s", "id": 71594, "pid": 76337, "tid": -914061504, "ts": 1716454222335709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222492350, "dur": 67, "args": { "External id": 71602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71602, "pid": 5, "tid": 7, "ts": 1716454222492350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335754, "dur": 10, "args": { "External id": 71602, "cbid": 211, "correlation": 71602 } }, { "ph": "s", "id": 71602, "pid": 76337, "tid": -914061504, "ts": 1716454222335754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222492418, "dur": 94, "args": { "External id": 71624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71624, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71624, "pid": 5, "tid": 7, "ts": 1716454222492418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335806, "dur": 10, "args": { "External id": 71624, "cbid": 211, "correlation": 71624 } }, { "ph": "s", "id": 71624, "pid": 76337, "tid": -914061504, "ts": 1716454222335806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222335894, "dur": 1, "args": { "External id": 71640, "cbid": 251, "correlation": 71640 } }, { "ph": "f", "id": 71640, "pid": 76337, "tid": -914061504, "ts": 1716454222335894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222492513, "dur": 586, "args": { "External id": 71642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71642, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 71642, "pid": 5, "tid": 7, "ts": 1716454222492513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335900, "dur": 13, "args": { "External id": 71642, "cbid": 211, "correlation": 71642 } }, { "ph": "s", "id": 71642, "pid": 76337, "tid": -914061504, "ts": 1716454222335900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222493101, "dur": 244, "args": { "External id": 71650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71650, "pid": 5, "tid": 7, "ts": 1716454222493101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222335965, "dur": 21, "args": { "External id": 71650, "cbid": 211, "correlation": 71650 } }, { "ph": "s", "id": 71650, "pid": 76337, "tid": -914061504, "ts": 1716454222335965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222493346, "dur": 253, "args": { "External id": 71658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71658, "pid": 5, "tid": 7, "ts": 1716454222493346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336005, "dur": 9, "args": { "External id": 71658, "cbid": 211, "correlation": 71658 } }, { "ph": "s", "id": 71658, "pid": 76337, "tid": -914061504, "ts": 1716454222336005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336089, "dur": 4, "args": { "External id": 71674, "cbid": 251, "correlation": 71674 } }, { "ph": "f", "id": 71674, "pid": 76337, "tid": -914061504, "ts": 1716454222336089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336097, "dur": 0, "args": { "External id": 71676, "cbid": 251, "correlation": 71676 } }, { "ph": "f", "id": 71676, "pid": 76337, "tid": -914061504, "ts": 1716454222336097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222493601, "dur": 364, "args": { "External id": 71677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71677, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 71677, "pid": 5, "tid": 7, "ts": 1716454222493601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336100, "dur": 13, "args": { "External id": 71677, "cbid": 211, "correlation": 71677 } }, { "ph": "s", "id": 71677, "pid": 76337, "tid": -914061504, "ts": 1716454222336100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222493966, "dur": 50, "args": { "External id": 71685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71685, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71685, "pid": 5, "tid": 7, "ts": 1716454222493966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336143, "dur": 257, "args": { "External id": 71685, "cbid": 211, "correlation": 71685 } }, { "ph": "s", "id": 71685, "pid": 76337, "tid": -914061504, "ts": 1716454222336143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222494017, "dur": 160, "args": { "External id": 71696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71696, "pid": 5, "tid": 7, "ts": 1716454222494017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336458, "dur": 77, "args": { "External id": 71696, "cbid": 211, "correlation": 71696 } }, { "ph": "s", "id": 71696, "pid": 76337, "tid": -914061504, "ts": 1716454222336458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222336589, "dur": 0, "args": { "External id": 71708, "cbid": 317, "correlation": 71708 } }, { "ph": "f", "id": 71708, "pid": 76337, "tid": -914061504, "ts": 1716454222336589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222336590, "dur": 0, "args": { "External id": 71709, "cbid": 203, "correlation": 71709 } }, { "ph": "f", "id": 71709, "pid": 76337, "tid": -914061504, "ts": 1716454222336590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222336591, "dur": 0, "args": { "External id": 71710, "cbid": 205, "correlation": 71710 } }, { "ph": "f", "id": 71710, "pid": 76337, "tid": -914061504, "ts": 1716454222336591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336626, "dur": 1, "args": { "External id": 71714, "cbid": 251, "correlation": 71714 } }, { "ph": "f", "id": 71714, "pid": 76337, "tid": -914061504, "ts": 1716454222336626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336628, "dur": 0, "args": { "External id": 71715, "cbid": 251, "correlation": 71715 } }, { "ph": "f", "id": 71715, "pid": 76337, "tid": -914061504, "ts": 1716454222336628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336630, "dur": 0, "args": { "External id": 71716, "cbid": 251, "correlation": 71716 } }, { "ph": "f", "id": 71716, "pid": 76337, "tid": -914061504, "ts": 1716454222336630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336631, "dur": 0, "args": { "External id": 71717, "cbid": 251, "correlation": 71717 } }, { "ph": "f", "id": 71717, "pid": 76337, "tid": -914061504, "ts": 1716454222336631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336632, "dur": 0, "args": { "External id": 71718, "cbid": 251, "correlation": 71718 } }, { "ph": "f", "id": 71718, "pid": 76337, "tid": -914061504, "ts": 1716454222336632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336632, "dur": 0, "args": { "External id": 71719, "cbid": 251, "correlation": 71719 } }, { "ph": "f", "id": 71719, "pid": 76337, "tid": -914061504, "ts": 1716454222336632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336633, "dur": 0, "args": { "External id": 71720, "cbid": 251, "correlation": 71720 } }, { "ph": "f", "id": 71720, "pid": 76337, "tid": -914061504, "ts": 1716454222336633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336634, "dur": 0, "args": { "External id": 71721, "cbid": 251, "correlation": 71721 } }, { "ph": "f", "id": 71721, "pid": 76337, "tid": -914061504, "ts": 1716454222336634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222336635, "dur": 0, "args": { "External id": 71722, "cbid": 251, "correlation": 71722 } }, { "ph": "f", "id": 71722, "pid": 76337, "tid": -914061504, "ts": 1716454222336635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222494179, "dur": 116, "args": { "External id": 71723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71723, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 71723, "pid": 5, "tid": 7, "ts": 1716454222494179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336637, "dur": 34, "args": { "External id": 71723, "cbid": 211, "correlation": 71723 } }, { "ph": "s", "id": 71723, "pid": 76337, "tid": -914061504, "ts": 1716454222336637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222494296, "dur": 61, "args": { "External id": 71729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71729, "pid": 5, "tid": 7, "ts": 1716454222494296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222336696, "dur": 307, "args": { "External id": 71729, "cbid": 211, "correlation": 71729 } }, { "ph": "s", "id": 71729, "pid": 76337, "tid": -914061504, "ts": 1716454222336696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494358, "dur": 50, "args": { "External id": 71737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71737, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71737, "pid": 5, "tid": 7, "ts": 1716454222494358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337029, "dur": 9, "args": { "External id": 71737, "cbid": 211, "correlation": 71737 } }, { "ph": "s", "id": 71737, "pid": 76337, "tid": -914061504, "ts": 1716454222337029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222494409, "dur": 52, "args": { "External id": 71757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71757, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 71757, "pid": 5, "tid": 7, "ts": 1716454222494409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337111, "dur": 12, "args": { "External id": 71757, "cbid": 211, "correlation": 71757 } }, { "ph": "s", "id": 71757, "pid": 76337, "tid": -914061504, "ts": 1716454222337111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222494463, "dur": 5, "args": { "External id": 71769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71769, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71769, "pid": 5, "tid": 7, "ts": 1716454222494463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337134, "dur": 13, "args": { "External id": 71769, "cbid": 211, "correlation": 71769 } }, { "ph": "s", "id": 71769, "pid": 76337, "tid": -914061504, "ts": 1716454222337134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222494469, "dur": 57, "args": { "External id": 71772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71772, "pid": 5, "tid": 7, "ts": 1716454222494469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337159, "dur": 120, "args": { "External id": 71772, "cbid": 211, "correlation": 71772 } }, { "ph": "s", "id": 71772, "pid": 76337, "tid": -914061504, "ts": 1716454222337159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494527, "dur": 38, "args": { "External id": 71781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71781, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71781, "pid": 5, "tid": 7, "ts": 1716454222494527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337319, "dur": 10, "args": { "External id": 71781, "cbid": 211, "correlation": 71781 } }, { "ph": "s", "id": 71781, "pid": 76337, "tid": -914061504, "ts": 1716454222337319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222337374, "dur": 0, "args": { "External id": 71791, "cbid": 317, "correlation": 71791 } }, { "ph": "f", "id": 71791, "pid": 76337, "tid": -914061504, "ts": 1716454222337374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222337375, "dur": 0, "args": { "External id": 71792, "cbid": 203, "correlation": 71792 } }, { "ph": "f", "id": 71792, "pid": 76337, "tid": -914061504, "ts": 1716454222337375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222337376, "dur": 0, "args": { "External id": 71793, "cbid": 205, "correlation": 71793 } }, { "ph": "f", "id": 71793, "pid": 76337, "tid": -914061504, "ts": 1716454222337376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222494567, "dur": 42, "args": { "External id": 71797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71797, "pid": 5, "tid": 7, "ts": 1716454222494567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337392, "dur": 12, "args": { "External id": 71797, "cbid": 211, "correlation": 71797 } }, { "ph": "s", "id": 71797, "pid": 76337, "tid": -914061504, "ts": 1716454222337392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222494610, "dur": 3, "args": { "External id": 71799, "device": 5, "context": 1, "stream": 7, "correlation": 71799, "bytes": 46080, "memory bandwidth (GB/s)": 12.203389830508474 } }, { "ph": "f", "id": 71799, "pid": 5, "tid": 7, "ts": 1716454222494610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222337407, "dur": 18, "args": { "External id": 71799, "cbid": 51, "correlation": 71799 } }, { "ph": "s", "id": 71799, "pid": 76337, "tid": -914061504, "ts": 1716454222337407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222337431, "dur": 1, "args": { "External id": 71801, "cbid": 200, "correlation": 71801 } }, { "ph": "f", "id": 71801, "pid": 76337, "tid": -914061504, "ts": 1716454222337431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222337433, "dur": 0, "args": { "External id": 71802, "cbid": 200, "correlation": 71802 } }, { "ph": "f", "id": 71802, "pid": 76337, "tid": -914061504, "ts": 1716454222337433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222337434, "dur": 0, "args": { "External id": 71803, "cbid": 200, "correlation": 71803 } }, { "ph": "f", "id": 71803, "pid": 76337, "tid": -914061504, "ts": 1716454222337434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222337434, "dur": 0, "args": { "External id": 71804, "cbid": 200, "correlation": 71804 } }, { "ph": "f", "id": 71804, "pid": 76337, "tid": -914061504, "ts": 1716454222337434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454222337435, "dur": 4, "args": { "External id": 71805, "cbid": 15, "correlation": 71805 } }, { "ph": "f", "id": 71805, "pid": 76337, "tid": -914061504, "ts": 1716454222337435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222337440, "dur": 1, "args": { "External id": 71806, "cbid": 251, "correlation": 71806 } }, { "ph": "f", "id": 71806, "pid": 76337, "tid": -914061504, "ts": 1716454222337440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454222494615, "dur": 23, "args": { "External id": 71807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71807, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71807, "pid": 5, "tid": 7, "ts": 1716454222494615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337443, "dur": 9, "args": { "External id": 71807, "cbid": 211, "correlation": 71807 } }, { "ph": "s", "id": 71807, "pid": 76337, "tid": -914061504, "ts": 1716454222337443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222494639, "dur": 4, "args": { "External id": 71809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 71809, "pid": 5, "tid": 7, "ts": 1716454222494639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337458, "dur": 7, "args": { "External id": 71809, "cbid": 211, "correlation": 71809 } }, { "ph": "s", "id": 71809, "pid": 76337, "tid": -914061504, "ts": 1716454222337458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222337469, "dur": 0, "args": { "External id": 71810, "cbid": 51, "correlation": 71810 } }, { "ph": "s", "id": 71810, "pid": 76337, "tid": -914061504, "ts": 1716454222337469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222494645, "dur": 193, "args": { "External id": 71811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71811, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71811, "pid": 5, "tid": 7, "ts": 1716454222494645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337470, "dur": 225, "args": { "External id": 71811, "cbid": 211, "correlation": 71811 } }, { "ph": "s", "id": 71811, "pid": 76337, "tid": -914061504, "ts": 1716454222337470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222494839, "dur": 6, "args": { "External id": 71812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71812, "pid": 5, "tid": 7, "ts": 1716454222494839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337699, "dur": 9, "args": { "External id": 71812, "cbid": 211, "correlation": 71812 } }, { "ph": "s", "id": 71812, "pid": 76337, "tid": -914061504, "ts": 1716454222337699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222494847, "dur": 5, "args": { "External id": 71818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 71818, "pid": 5, "tid": 7, "ts": 1716454222494847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222337732, "dur": 10, "args": { "External id": 71818, "cbid": 211, "correlation": 71818 } }, { "ph": "s", "id": 71818, "pid": 76337, "tid": -914061504, "ts": 1716454222337732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494853, "dur": 3, "args": { "External id": 71826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71826, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71826, "pid": 5, "tid": 7, "ts": 1716454222494853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339486, "dur": 16, "args": { "External id": 71826, "cbid": 211, "correlation": 71826 } }, { "ph": "s", "id": 71826, "pid": 76337, "tid": -914061504, "ts": 1716454222339486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494857, "dur": 3, "args": { "External id": 71834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71834, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71834, "pid": 5, "tid": 7, "ts": 1716454222494857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339527, "dur": 11, "args": { "External id": 71834, "cbid": 211, "correlation": 71834 } }, { "ph": "s", "id": 71834, "pid": 76337, "tid": -914061504, "ts": 1716454222339527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494861, "dur": 3, "args": { "External id": 71842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71842, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71842, "pid": 5, "tid": 7, "ts": 1716454222494861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339554, "dur": 8, "args": { "External id": 71842, "cbid": 211, "correlation": 71842 } }, { "ph": "s", "id": 71842, "pid": 76337, "tid": -914061504, "ts": 1716454222339554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494865, "dur": 3, "args": { "External id": 71851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71851, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71851, "pid": 5, "tid": 7, "ts": 1716454222494865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339724, "dur": 14, "args": { "External id": 71851, "cbid": 211, "correlation": 71851 } }, { "ph": "s", "id": 71851, "pid": 76337, "tid": -914061504, "ts": 1716454222339724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494869, "dur": 3, "args": { "External id": 71860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71860, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71860, "pid": 5, "tid": 7, "ts": 1716454222494869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339754, "dur": 7, "args": { "External id": 71860, "cbid": 211, "correlation": 71860 } }, { "ph": "s", "id": 71860, "pid": 76337, "tid": -914061504, "ts": 1716454222339754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494874, "dur": 3, "args": { "External id": 71868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71868, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71868, "pid": 5, "tid": 7, "ts": 1716454222494874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222339777, "dur": 8, "args": { "External id": 71868, "cbid": 211, "correlation": 71868 } }, { "ph": "s", "id": 71868, "pid": 76337, "tid": -914061504, "ts": 1716454222339777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494879, "dur": 3, "args": { "External id": 71876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71876, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71876, "pid": 5, "tid": 7, "ts": 1716454222494879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222340044, "dur": 16, "args": { "External id": 71876, "cbid": 211, "correlation": 71876 } }, { "ph": "s", "id": 71876, "pid": 76337, "tid": -914061504, "ts": 1716454222340044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222494883, "dur": 3, "args": { "External id": 71884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71884, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 71884, "pid": 5, "tid": 7, "ts": 1716454222494883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222340079, "dur": 8, "args": { "External id": 71884, "cbid": 211, "correlation": 71884 } }, { "ph": "s", "id": 71884, "pid": 76337, "tid": -914061504, "ts": 1716454222340079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222494888, "dur": 1, "args": { "External id": 71894, "device": 5, "context": 1, "stream": 7, "correlation": 71894, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 71894, "pid": 5, "tid": 7, "ts": 1716454222494888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222340144, "dur": 54, "args": { "External id": 71894, "cbid": 41, "correlation": 71894 } }, { "ph": "s", "id": 71894, "pid": 76337, "tid": -914061504, "ts": 1716454222340144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222340200, "dur": 154704, "args": { "External id": 71895, "cbid": 131, "correlation": 71895 } }, { "ph": "f", "id": 71895, "pid": 76337, "tid": -914061504, "ts": 1716454222340200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222495065, "dur": 3, "args": { "External id": 71903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 71903, "pid": 5, "tid": 7, "ts": 1716454222495065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495043, "dur": 24, "args": { "External id": 71903, "cbid": 211, "correlation": 71903 } }, { "ph": "s", "id": 71903, "pid": 76337, "tid": -914061504, "ts": 1716454222495043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495153, "dur": 3, "args": { "External id": 71912, "device": 5, "context": 1, "stream": 7, "correlation": 71912, "bytes": 8, "memory bandwidth (GB/s)": 0.0023816612086930635 } }, { "ph": "f", "id": 71912, "pid": 5, "tid": 7, "ts": 1716454222495153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495126, "dur": 28, "args": { "External id": 71912, "cbid": 41, "correlation": 71912 } }, { "ph": "s", "id": 71912, "pid": 76337, "tid": -914061504, "ts": 1716454222495126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222495238, "dur": 4, "args": { "External id": 71922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71922, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 71922, "pid": 5, "tid": 7, "ts": 1716454222495238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495223, "dur": 16, "args": { "External id": 71922, "cbid": 211, "correlation": 71922 } }, { "ph": "s", "id": 71922, "pid": 76337, "tid": -914061504, "ts": 1716454222495223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495301, "dur": 1, "args": { "External id": 71932, "device": 5, "context": 1, "stream": 7, "correlation": 71932, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 71932, "pid": 5, "tid": 7, "ts": 1716454222495301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495285, "dur": 14, "args": { "External id": 71932, "cbid": 41, "correlation": 71932 } }, { "ph": "s", "id": 71932, "pid": 76337, "tid": -914061504, "ts": 1716454222495285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222495300, "dur": 8, "args": { "External id": 71933, "cbid": 131, "correlation": 71933 } }, { "ph": "f", "id": 71933, "pid": 76337, "tid": -914061504, "ts": 1716454222495300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495384, "dur": 3, "args": { "External id": 71940, "device": 5, "context": 1, "stream": 7, "correlation": 71940, "bytes": 98304, "memory bandwidth (GB/s)": 30.11764705882353 } }, { "ph": "f", "id": 71940, "pid": 5, "tid": 7, "ts": 1716454222495384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495347, "dur": 37, "args": { "External id": 71940, "cbid": 41, "correlation": 71940 } }, { "ph": "s", "id": 71940, "pid": 76337, "tid": -914061504, "ts": 1716454222495347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495477, "dur": 3, "args": { "External id": 71959, "device": 5, "context": 1, "stream": 7, "correlation": 71959, "bytes": 16, "memory bandwidth (GB/s)": 0.005263157894736842 } }, { "ph": "f", "id": 71959, "pid": 5, "tid": 7, "ts": 1716454222495477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495458, "dur": 18, "args": { "External id": 71959, "cbid": 41, "correlation": 71959 } }, { "ph": "s", "id": 71959, "pid": 76337, "tid": -914061504, "ts": 1716454222495458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222495516, "dur": 3, "args": { "External id": 71965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 71965, "pid": 5, "tid": 7, "ts": 1716454222495516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495504, "dur": 11, "args": { "External id": 71965, "cbid": 211, "correlation": 71965 } }, { "ph": "s", "id": 71965, "pid": 76337, "tid": -914061504, "ts": 1716454222495504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454222495530, "dur": 6, "args": { "External id": 71967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71967, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 71967, "pid": 5, "tid": 7, "ts": 1716454222495530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495519, "dur": 9, "args": { "External id": 71967, "cbid": 211, "correlation": 71967 } }, { "ph": "s", "id": 71967, "pid": 76337, "tid": -914061504, "ts": 1716454222495519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454222495538, "dur": 3, "args": { "External id": 71969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 71969, "pid": 5, "tid": 7, "ts": 1716454222495538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495530, "dur": 6, "args": { "External id": 71969, "cbid": 211, "correlation": 71969 } }, { "ph": "s", "id": 71969, "pid": 76337, "tid": -914061504, "ts": 1716454222495530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495571, "dur": 2, "args": { "External id": 71977, "device": 5, "context": 1, "stream": 7, "correlation": 71977, "bytes": 8, "memory bandwidth (GB/s)": 0.002717391304347826 } }, { "ph": "f", "id": 71977, "pid": 5, "tid": 7, "ts": 1716454222495571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495557, "dur": 13, "args": { "External id": 71977, "cbid": 41, "correlation": 71977 } }, { "ph": "s", "id": 71977, "pid": 76337, "tid": -914061504, "ts": 1716454222495557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222495616, "dur": 3, "args": { "External id": 71991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 71991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 71991, "pid": 5, "tid": 7, "ts": 1716454222495616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495606, "dur": 11, "args": { "External id": 71991, "cbid": 211, "correlation": 71991 } }, { "ph": "s", "id": 71991, "pid": 76337, "tid": -914061504, "ts": 1716454222495606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222495636, "dur": 2, "args": { "External id": 72005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72005, "pid": 5, "tid": 7, "ts": 1716454222495636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495628, "dur": 7, "args": { "External id": 72005, "cbid": 211, "correlation": 72005 } }, { "ph": "s", "id": 72005, "pid": 76337, "tid": -914061504, "ts": 1716454222495628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222495668, "dur": 6, "args": { "External id": 72012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72012, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72012, "pid": 5, "tid": 7, "ts": 1716454222495668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495658, "dur": 11, "args": { "External id": 72012, "cbid": 211, "correlation": 72012 } }, { "ph": "s", "id": 72012, "pid": 76337, "tid": -914061504, "ts": 1716454222495658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222495678, "dur": 6, "args": { "External id": 72015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72015, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72015, "pid": 5, "tid": 7, "ts": 1716454222495678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495671, "dur": 6, "args": { "External id": 72015, "cbid": 211, "correlation": 72015 } }, { "ph": "s", "id": 72015, "pid": 76337, "tid": -914061504, "ts": 1716454222495671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454222495687, "dur": 3, "args": { "External id": 72017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72017, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72017, "pid": 5, "tid": 7, "ts": 1716454222495687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495679, "dur": 6, "args": { "External id": 72017, "cbid": 211, "correlation": 72017 } }, { "ph": "s", "id": 72017, "pid": 76337, "tid": -914061504, "ts": 1716454222495679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495707, "dur": 2, "args": { "External id": 72020, "device": 5, "context": 1, "stream": 7, "correlation": 72020, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 72020, "pid": 5, "tid": 7, "ts": 1716454222495707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495694, "dur": 14, "args": { "External id": 72020, "cbid": 41, "correlation": 72020 } }, { "ph": "s", "id": 72020, "pid": 76337, "tid": -914061504, "ts": 1716454222495694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222495761, "dur": 4, "args": { "External id": 72036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72036, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72036, "pid": 5, "tid": 7, "ts": 1716454222495761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495748, "dur": 12, "args": { "External id": 72036, "cbid": 211, "correlation": 72036 } }, { "ph": "s", "id": 72036, "pid": 76337, "tid": -914061504, "ts": 1716454222495748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222495781, "dur": 3, "args": { "External id": 72041, "device": 5, "context": 1, "stream": 7, "correlation": 72041, "bytes": 1, "memory bandwidth (GB/s)": 0.00031259768677711783 } }, { "ph": "f", "id": 72041, "pid": 5, "tid": 7, "ts": 1716454222495781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495765, "dur": 14, "args": { "External id": 72041, "cbid": 41, "correlation": 72041 } }, { "ph": "s", "id": 72041, "pid": 76337, "tid": -914061504, "ts": 1716454222495765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222495809, "dur": 1, "args": { "External id": 72047, "device": 5, "context": 1, "stream": 7, "correlation": 72047, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 72047, "pid": 5, "tid": 7, "ts": 1716454222495809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222495790, "dur": 28, "args": { "External id": 72047, "cbid": 41, "correlation": 72047 } }, { "ph": "s", "id": 72047, "pid": 76337, "tid": -914061504, "ts": 1716454222495790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222495818, "dur": 4, "args": { "External id": 72048, "cbid": 131, "correlation": 72048 } }, { "ph": "f", "id": 72048, "pid": 76337, "tid": -914061504, "ts": 1716454222495818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222495868, "dur": 3, "args": { "External id": 72056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72056, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72056, "pid": 5, "tid": 7, "ts": 1716454222495868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495855, "dur": 12, "args": { "External id": 72056, "cbid": 211, "correlation": 72056 } }, { "ph": "s", "id": 72056, "pid": 76337, "tid": -914061504, "ts": 1716454222495855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222495897, "dur": 3, "args": { "External id": 72066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72066, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72066, "pid": 5, "tid": 7, "ts": 1716454222495897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495888, "dur": 9, "args": { "External id": 72066, "cbid": 211, "correlation": 72066 } }, { "ph": "s", "id": 72066, "pid": 76337, "tid": -914061504, "ts": 1716454222495888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222495921, "dur": 3, "args": { "External id": 72075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72075, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72075, "pid": 5, "tid": 7, "ts": 1716454222495921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222495912, "dur": 7, "args": { "External id": 72075, "cbid": 211, "correlation": 72075 } }, { "ph": "s", "id": 72075, "pid": 76337, "tid": -914061504, "ts": 1716454222495912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222496038, "dur": 12, "args": { "External id": 72085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72085, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72085, "pid": 5, "tid": 7, "ts": 1716454222496038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496024, "dur": 15, "args": { "External id": 72085, "cbid": 211, "correlation": 72085 } }, { "ph": "s", "id": 72085, "pid": 76337, "tid": -914061504, "ts": 1716454222496024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222496076, "dur": 3, "args": { "External id": 72093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72093, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72093, "pid": 5, "tid": 7, "ts": 1716454222496076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496067, "dur": 11, "args": { "External id": 72093, "cbid": 211, "correlation": 72093 } }, { "ph": "s", "id": 72093, "pid": 76337, "tid": -914061504, "ts": 1716454222496067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222496124, "dur": 12, "args": { "External id": 72103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72103, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72103, "pid": 5, "tid": 7, "ts": 1716454222496124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496113, "dur": 11, "args": { "External id": 72103, "cbid": 211, "correlation": 72103 } }, { "ph": "s", "id": 72103, "pid": 76337, "tid": -914061504, "ts": 1716454222496113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222496154, "dur": 10, "args": { "External id": 72111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72111, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72111, "pid": 5, "tid": 7, "ts": 1716454222496154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496145, "dur": 9, "args": { "External id": 72111, "cbid": 211, "correlation": 72111 } }, { "ph": "s", "id": 72111, "pid": 76337, "tid": -914061504, "ts": 1716454222496145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222496181, "dur": 3, "args": { "External id": 72120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72120, "pid": 5, "tid": 7, "ts": 1716454222496181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496172, "dur": 8, "args": { "External id": 72120, "cbid": 211, "correlation": 72120 } }, { "ph": "s", "id": 72120, "pid": 76337, "tid": -914061504, "ts": 1716454222496172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222496206, "dur": 5, "args": { "External id": 72129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72129, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72129, "pid": 5, "tid": 7, "ts": 1716454222496206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496197, "dur": 8, "args": { "External id": 72129, "cbid": 211, "correlation": 72129 } }, { "ph": "s", "id": 72129, "pid": 76337, "tid": -914061504, "ts": 1716454222496197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222496241, "dur": 8, "args": { "External id": 72139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72139, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72139, "pid": 5, "tid": 7, "ts": 1716454222496241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496231, "dur": 10, "args": { "External id": 72139, "cbid": 211, "correlation": 72139 } }, { "ph": "s", "id": 72139, "pid": 76337, "tid": -914061504, "ts": 1716454222496231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222496540, "dur": 3, "args": { "External id": 72148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72148, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72148, "pid": 5, "tid": 7, "ts": 1716454222496540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496525, "dur": 15, "args": { "External id": 72148, "cbid": 211, "correlation": 72148 } }, { "ph": "s", "id": 72148, "pid": 76337, "tid": -914061504, "ts": 1716454222496525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222496566, "dur": 3, "args": { "External id": 72156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72156, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72156, "pid": 5, "tid": 7, "ts": 1716454222496566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496557, "dur": 8, "args": { "External id": 72156, "cbid": 211, "correlation": 72156 } }, { "ph": "s", "id": 72156, "pid": 76337, "tid": -914061504, "ts": 1716454222496557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222496620, "dur": 1, "args": { "External id": 72166, "device": 5, "context": 1, "stream": 7, "correlation": 72166, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 72166, "pid": 5, "tid": 7, "ts": 1716454222496620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222496605, "dur": 13, "args": { "External id": 72166, "cbid": 41, "correlation": 72166 } }, { "ph": "s", "id": 72166, "pid": 76337, "tid": -914061504, "ts": 1716454222496605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222496619, "dur": 8, "args": { "External id": 72167, "cbid": 131, "correlation": 72167 } }, { "ph": "f", "id": 72167, "pid": 76337, "tid": -914061504, "ts": 1716454222496619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222496709, "dur": 2, "args": { "External id": 72175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72175, "pid": 5, "tid": 7, "ts": 1716454222496709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496694, "dur": 15, "args": { "External id": 72175, "cbid": 211, "correlation": 72175 } }, { "ph": "s", "id": 72175, "pid": 76337, "tid": -914061504, "ts": 1716454222496694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222496780, "dur": 3, "args": { "External id": 72184, "device": 5, "context": 1, "stream": 7, "correlation": 72184, "bytes": 8, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 72184, "pid": 5, "tid": 7, "ts": 1716454222496780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222496762, "dur": 17, "args": { "External id": 72184, "cbid": 41, "correlation": 72184 } }, { "ph": "s", "id": 72184, "pid": 76337, "tid": -914061504, "ts": 1716454222496762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222496850, "dur": 3, "args": { "External id": 72194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72194, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72194, "pid": 5, "tid": 7, "ts": 1716454222496850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222496836, "dur": 13, "args": { "External id": 72194, "cbid": 211, "correlation": 72194 } }, { "ph": "s", "id": 72194, "pid": 76337, "tid": -914061504, "ts": 1716454222496836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224868486, "dur": 51, "args": { "External id": 217151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217151, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 217151, "pid": 5, "tid": 7, "ts": 1716454224868486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758742, "dur": 25, "args": { "External id": 217151, "cbid": 211, "correlation": 217151 } }, { "ph": "s", "id": 217151, "pid": 76337, "tid": -914061504, "ts": 1716454224758742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224868538, "dur": 4, "args": { "External id": 217163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217163, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217163, "pid": 5, "tid": 7, "ts": 1716454224868538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758785, "dur": 7, "args": { "External id": 217163, "cbid": 211, "correlation": 217163 } }, { "ph": "s", "id": 217163, "pid": 76337, "tid": -914061504, "ts": 1716454224758785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224868544, "dur": 56, "args": { "External id": 217166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217166, "pid": 5, "tid": 7, "ts": 1716454224868544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758811, "dur": 8, "args": { "External id": 217166, "cbid": 211, "correlation": 217166 } }, { "ph": "s", "id": 217166, "pid": 76337, "tid": -914061504, "ts": 1716454224758811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224758913, "dur": 0, "args": { "External id": 217177, "cbid": 317, "correlation": 217177 } }, { "ph": "f", "id": 217177, "pid": 76337, "tid": -914061504, "ts": 1716454224758913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224758915, "dur": 1, "args": { "External id": 217178, "cbid": 203, "correlation": 217178 } }, { "ph": "f", "id": 217178, "pid": 76337, "tid": -914061504, "ts": 1716454224758915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224758916, "dur": 0, "args": { "External id": 217179, "cbid": 205, "correlation": 217179 } }, { "ph": "f", "id": 217179, "pid": 76337, "tid": -914061504, "ts": 1716454224758916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224758993, "dur": 4, "args": { "External id": 217183, "cbid": 251, "correlation": 217183 } }, { "ph": "f", "id": 217183, "pid": 76337, "tid": -914061504, "ts": 1716454224758993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224758998, "dur": 1, "args": { "External id": 217184, "cbid": 251, "correlation": 217184 } }, { "ph": "f", "id": 217184, "pid": 76337, "tid": -914061504, "ts": 1716454224758998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759000, "dur": 1, "args": { "External id": 217185, "cbid": 251, "correlation": 217185 } }, { "ph": "f", "id": 217185, "pid": 76337, "tid": -914061504, "ts": 1716454224759000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759001, "dur": 1, "args": { "External id": 217186, "cbid": 251, "correlation": 217186 } }, { "ph": "f", "id": 217186, "pid": 76337, "tid": -914061504, "ts": 1716454224759001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759003, "dur": 1, "args": { "External id": 217187, "cbid": 251, "correlation": 217187 } }, { "ph": "f", "id": 217187, "pid": 76337, "tid": -914061504, "ts": 1716454224759003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759005, "dur": 1, "args": { "External id": 217188, "cbid": 251, "correlation": 217188 } }, { "ph": "f", "id": 217188, "pid": 76337, "tid": -914061504, "ts": 1716454224759005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759007, "dur": 0, "args": { "External id": 217189, "cbid": 251, "correlation": 217189 } }, { "ph": "f", "id": 217189, "pid": 76337, "tid": -914061504, "ts": 1716454224759007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759008, "dur": 0, "args": { "External id": 217190, "cbid": 251, "correlation": 217190 } }, { "ph": "f", "id": 217190, "pid": 76337, "tid": -914061504, "ts": 1716454224759008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759010, "dur": 0, "args": { "External id": 217191, "cbid": 251, "correlation": 217191 } }, { "ph": "f", "id": 217191, "pid": 76337, "tid": -914061504, "ts": 1716454224759010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224868601, "dur": 112, "args": { "External id": 217192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217192, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217192, "pid": 5, "tid": 7, "ts": 1716454224868601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759014, "dur": 15, "args": { "External id": 217192, "cbid": 211, "correlation": 217192 } }, { "ph": "s", "id": 217192, "pid": 76337, "tid": -914061504, "ts": 1716454224759014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224868715, "dur": 60, "args": { "External id": 217198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217198, "pid": 5, "tid": 7, "ts": 1716454224868715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759083, "dur": 11, "args": { "External id": 217198, "cbid": 211, "correlation": 217198 } }, { "ph": "s", "id": 217198, "pid": 76337, "tid": -914061504, "ts": 1716454224759083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224868776, "dur": 462, "args": { "External id": 217207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217207, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217207, "pid": 5, "tid": 7, "ts": 1716454224868776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759214, "dur": 16, "args": { "External id": 217207, "cbid": 211, "correlation": 217207 } }, { "ph": "s", "id": 217207, "pid": 76337, "tid": -914061504, "ts": 1716454224759214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224869239, "dur": 182, "args": { "External id": 217229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217229, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217229, "pid": 5, "tid": 7, "ts": 1716454224869239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759295, "dur": 11, "args": { "External id": 217229, "cbid": 211, "correlation": 217229 } }, { "ph": "s", "id": 217229, "pid": 76337, "tid": -914061504, "ts": 1716454224759295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759439, "dur": 2, "args": { "External id": 217240, "cbid": 251, "correlation": 217240 } }, { "ph": "f", "id": 217240, "pid": 76337, "tid": -914061504, "ts": 1716454224759439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224869423, "dur": 197, "args": { "External id": 217241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217241, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217241, "pid": 5, "tid": 7, "ts": 1716454224869423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759446, "dur": 13, "args": { "External id": 217241, "cbid": 211, "correlation": 217241 } }, { "ph": "s", "id": 217241, "pid": 76337, "tid": -914061504, "ts": 1716454224759446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759520, "dur": 1, "args": { "External id": 217252, "cbid": 251, "correlation": 217252 } }, { "ph": "f", "id": 217252, "pid": 76337, "tid": -914061504, "ts": 1716454224759520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224869621, "dur": 188, "args": { "External id": 217253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217253, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217253, "pid": 5, "tid": 7, "ts": 1716454224869621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759523, "dur": 12, "args": { "External id": 217253, "cbid": 211, "correlation": 217253 } }, { "ph": "s", "id": 217253, "pid": 76337, "tid": -914061504, "ts": 1716454224759523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759589, "dur": 1, "args": { "External id": 217264, "cbid": 251, "correlation": 217264 } }, { "ph": "f", "id": 217264, "pid": 76337, "tid": -914061504, "ts": 1716454224759589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224869811, "dur": 189, "args": { "External id": 217265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217265, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217265, "pid": 5, "tid": 7, "ts": 1716454224869811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759593, "dur": 11, "args": { "External id": 217265, "cbid": 211, "correlation": 217265 } }, { "ph": "s", "id": 217265, "pid": 76337, "tid": -914061504, "ts": 1716454224759593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224870001, "dur": 18734, "args": { "External id": 217286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217286, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217286, "pid": 5, "tid": 7, "ts": 1716454224870001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759706, "dur": 16, "args": { "External id": 217286, "cbid": 211, "correlation": 217286 } }, { "ph": "s", "id": 217286, "pid": 76337, "tid": -914061504, "ts": 1716454224759706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224759840, "dur": 2, "args": { "External id": 217304, "cbid": 251, "correlation": 217304 } }, { "ph": "f", "id": 217304, "pid": 76337, "tid": -914061504, "ts": 1716454224759840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224888737, "dur": 202, "args": { "External id": 217306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217306, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217306, "pid": 5, "tid": 7, "ts": 1716454224888737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759846, "dur": 14, "args": { "External id": 217306, "cbid": 211, "correlation": 217306 } }, { "ph": "s", "id": 217306, "pid": 76337, "tid": -914061504, "ts": 1716454224759846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224888940, "dur": 66, "args": { "External id": 217314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217314, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217314, "pid": 5, "tid": 7, "ts": 1716454224888940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224759949, "dur": 15, "args": { "External id": 217314, "cbid": 211, "correlation": 217314 } }, { "ph": "s", "id": 217314, "pid": 76337, "tid": -914061504, "ts": 1716454224759949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224889007, "dur": 96, "args": { "External id": 217322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217322, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217322, "pid": 5, "tid": 7, "ts": 1716454224889007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224760010, "dur": 10, "args": { "External id": 217322, "cbid": 211, "correlation": 217322 } }, { "ph": "s", "id": 217322, "pid": 76337, "tid": -914061504, "ts": 1716454224760010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224889105, "dur": 55, "args": { "External id": 217333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217333, "pid": 5, "tid": 7, "ts": 1716454224889105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224760116, "dur": 15, "args": { "External id": 217333, "cbid": 211, "correlation": 217333 } }, { "ph": "s", "id": 217333, "pid": 76337, "tid": -914061504, "ts": 1716454224760116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224889160, "dur": 92, "args": { "External id": 217355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217355, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217355, "pid": 5, "tid": 7, "ts": 1716454224889160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224760153, "dur": 1134, "args": { "External id": 217355, "cbid": 211, "correlation": 217355 } }, { "ph": "s", "id": 217355, "pid": 76337, "tid": -914061504, "ts": 1716454224760153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761371, "dur": 1, "args": { "External id": 217366, "cbid": 251, "correlation": 217366 } }, { "ph": "f", "id": 217366, "pid": 76337, "tid": -914061504, "ts": 1716454224761371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224889254, "dur": 103, "args": { "External id": 217367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217367, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217367, "pid": 5, "tid": 7, "ts": 1716454224889254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761377, "dur": 59, "args": { "External id": 217367, "cbid": 211, "correlation": 217367 } }, { "ph": "s", "id": 217367, "pid": 76337, "tid": -914061504, "ts": 1716454224761377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761509, "dur": 2, "args": { "External id": 217378, "cbid": 251, "correlation": 217378 } }, { "ph": "f", "id": 217378, "pid": 76337, "tid": -914061504, "ts": 1716454224761509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761515, "dur": 0, "args": { "External id": 217379, "cbid": 251, "correlation": 217379 } }, { "ph": "f", "id": 217379, "pid": 76337, "tid": -914061504, "ts": 1716454224761515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224889359, "dur": 10, "args": { "External id": 217380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217380, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 217380, "pid": 5, "tid": 7, "ts": 1716454224889359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761517, "dur": 14, "args": { "External id": 217380, "cbid": 211, "correlation": 217380 } }, { "ph": "s", "id": 217380, "pid": 76337, "tid": -914061504, "ts": 1716454224761517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224889370, "dur": 5, "args": { "External id": 217382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217382, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 217382, "pid": 5, "tid": 7, "ts": 1716454224889370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761534, "dur": 9, "args": { "External id": 217382, "cbid": 211, "correlation": 217382 } }, { "ph": "s", "id": 217382, "pid": 76337, "tid": -914061504, "ts": 1716454224761534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761599, "dur": 1, "args": { "External id": 217393, "cbid": 251, "correlation": 217393 } }, { "ph": "f", "id": 217393, "pid": 76337, "tid": -914061504, "ts": 1716454224761599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761603, "dur": 0, "args": { "External id": 217394, "cbid": 251, "correlation": 217394 } }, { "ph": "f", "id": 217394, "pid": 76337, "tid": -914061504, "ts": 1716454224761603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224889377, "dur": 6, "args": { "External id": 217395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217395, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 217395, "pid": 5, "tid": 7, "ts": 1716454224889377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761604, "dur": 12, "args": { "External id": 217395, "cbid": 211, "correlation": 217395 } }, { "ph": "s", "id": 217395, "pid": 76337, "tid": -914061504, "ts": 1716454224761604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224889384, "dur": 3, "args": { "External id": 217397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217397, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 217397, "pid": 5, "tid": 7, "ts": 1716454224889384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761618, "dur": 5, "args": { "External id": 217397, "cbid": 211, "correlation": 217397 } }, { "ph": "s", "id": 217397, "pid": 76337, "tid": -914061504, "ts": 1716454224761618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224889389, "dur": 157, "args": { "External id": 217418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217418, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217418, "pid": 5, "tid": 7, "ts": 1716454224889389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761694, "dur": 12, "args": { "External id": 217418, "cbid": 211, "correlation": 217418 } }, { "ph": "s", "id": 217418, "pid": 76337, "tid": -914061504, "ts": 1716454224761694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224761793, "dur": 2, "args": { "External id": 217436, "cbid": 251, "correlation": 217436 } }, { "ph": "f", "id": 217436, "pid": 76337, "tid": -914061504, "ts": 1716454224761793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224889548, "dur": 109, "args": { "External id": 217438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217438, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217438, "pid": 5, "tid": 7, "ts": 1716454224889548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761799, "dur": 14, "args": { "External id": 217438, "cbid": 211, "correlation": 217438 } }, { "ph": "s", "id": 217438, "pid": 76337, "tid": -914061504, "ts": 1716454224761799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224889658, "dur": 35, "args": { "External id": 217446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217446, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217446, "pid": 5, "tid": 7, "ts": 1716454224889658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761869, "dur": 12, "args": { "External id": 217446, "cbid": 211, "correlation": 217446 } }, { "ph": "s", "id": 217446, "pid": 76337, "tid": -914061504, "ts": 1716454224761869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224889693, "dur": 66, "args": { "External id": 217454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217454, "pid": 5, "tid": 7, "ts": 1716454224889693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761910, "dur": 10, "args": { "External id": 217454, "cbid": 211, "correlation": 217454 } }, { "ph": "s", "id": 217454, "pid": 76337, "tid": -914061504, "ts": 1716454224761910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224889761, "dur": 93, "args": { "External id": 217476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217476, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217476, "pid": 5, "tid": 7, "ts": 1716454224889761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224761963, "dur": 19, "args": { "External id": 217476, "cbid": 211, "correlation": 217476 } }, { "ph": "s", "id": 217476, "pid": 76337, "tid": -914061504, "ts": 1716454224761963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762071, "dur": 1, "args": { "External id": 217492, "cbid": 251, "correlation": 217492 } }, { "ph": "f", "id": 217492, "pid": 76337, "tid": -914061504, "ts": 1716454224762071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224889855, "dur": 577, "args": { "External id": 217494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217494, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217494, "pid": 5, "tid": 7, "ts": 1716454224889855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762077, "dur": 13, "args": { "External id": 217494, "cbid": 211, "correlation": 217494 } }, { "ph": "s", "id": 217494, "pid": 76337, "tid": -914061504, "ts": 1716454224762077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224890433, "dur": 246, "args": { "External id": 217502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217502, "pid": 5, "tid": 7, "ts": 1716454224890433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762166, "dur": 14, "args": { "External id": 217502, "cbid": 211, "correlation": 217502 } }, { "ph": "s", "id": 217502, "pid": 76337, "tid": -914061504, "ts": 1716454224762166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224890680, "dur": 254, "args": { "External id": 217510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217510, "pid": 5, "tid": 7, "ts": 1716454224890680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762208, "dur": 11, "args": { "External id": 217510, "cbid": 211, "correlation": 217510 } }, { "ph": "s", "id": 217510, "pid": 76337, "tid": -914061504, "ts": 1716454224762208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762294, "dur": 2, "args": { "External id": 217526, "cbid": 251, "correlation": 217526 } }, { "ph": "f", "id": 217526, "pid": 76337, "tid": -914061504, "ts": 1716454224762294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762300, "dur": 0, "args": { "External id": 217528, "cbid": 251, "correlation": 217528 } }, { "ph": "f", "id": 217528, "pid": 76337, "tid": -914061504, "ts": 1716454224762300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224890935, "dur": 364, "args": { "External id": 217529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217529, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 217529, "pid": 5, "tid": 7, "ts": 1716454224890935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762305, "dur": 13, "args": { "External id": 217529, "cbid": 211, "correlation": 217529 } }, { "ph": "s", "id": 217529, "pid": 76337, "tid": -914061504, "ts": 1716454224762305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224891300, "dur": 50, "args": { "External id": 217537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217537, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217537, "pid": 5, "tid": 7, "ts": 1716454224891300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762347, "dur": 10, "args": { "External id": 217537, "cbid": 211, "correlation": 217537 } }, { "ph": "s", "id": 217537, "pid": 76337, "tid": -914061504, "ts": 1716454224762347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224891351, "dur": 159, "args": { "External id": 217548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217548, "pid": 5, "tid": 7, "ts": 1716454224891351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762422, "dur": 133, "args": { "External id": 217548, "cbid": 211, "correlation": 217548 } }, { "ph": "s", "id": 217548, "pid": 76337, "tid": -914061504, "ts": 1716454224762422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224762629, "dur": 1, "args": { "External id": 217560, "cbid": 317, "correlation": 217560 } }, { "ph": "f", "id": 217560, "pid": 76337, "tid": -914061504, "ts": 1716454224762629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224762631, "dur": 1, "args": { "External id": 217561, "cbid": 203, "correlation": 217561 } }, { "ph": "f", "id": 217561, "pid": 76337, "tid": -914061504, "ts": 1716454224762631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224762633, "dur": 1, "args": { "External id": 217562, "cbid": 205, "correlation": 217562 } }, { "ph": "f", "id": 217562, "pid": 76337, "tid": -914061504, "ts": 1716454224762633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762922, "dur": 2, "args": { "External id": 217566, "cbid": 251, "correlation": 217566 } }, { "ph": "f", "id": 217566, "pid": 76337, "tid": -914061504, "ts": 1716454224762922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762925, "dur": 1, "args": { "External id": 217567, "cbid": 251, "correlation": 217567 } }, { "ph": "f", "id": 217567, "pid": 76337, "tid": -914061504, "ts": 1716454224762925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762927, "dur": 1, "args": { "External id": 217568, "cbid": 251, "correlation": 217568 } }, { "ph": "f", "id": 217568, "pid": 76337, "tid": -914061504, "ts": 1716454224762927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762929, "dur": 1, "args": { "External id": 217569, "cbid": 251, "correlation": 217569 } }, { "ph": "f", "id": 217569, "pid": 76337, "tid": -914061504, "ts": 1716454224762929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762931, "dur": 1, "args": { "External id": 217570, "cbid": 251, "correlation": 217570 } }, { "ph": "f", "id": 217570, "pid": 76337, "tid": -914061504, "ts": 1716454224762931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762933, "dur": 1, "args": { "External id": 217571, "cbid": 251, "correlation": 217571 } }, { "ph": "f", "id": 217571, "pid": 76337, "tid": -914061504, "ts": 1716454224762933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762935, "dur": 0, "args": { "External id": 217572, "cbid": 251, "correlation": 217572 } }, { "ph": "f", "id": 217572, "pid": 76337, "tid": -914061504, "ts": 1716454224762935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762936, "dur": 1, "args": { "External id": 217573, "cbid": 251, "correlation": 217573 } }, { "ph": "f", "id": 217573, "pid": 76337, "tid": -914061504, "ts": 1716454224762936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224762938, "dur": 0, "args": { "External id": 217574, "cbid": 251, "correlation": 217574 } }, { "ph": "f", "id": 217574, "pid": 76337, "tid": -914061504, "ts": 1716454224762938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224891511, "dur": 117, "args": { "External id": 217575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217575, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217575, "pid": 5, "tid": 7, "ts": 1716454224891511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762940, "dur": 13, "args": { "External id": 217575, "cbid": 211, "correlation": 217575 } }, { "ph": "s", "id": 217575, "pid": 76337, "tid": -914061504, "ts": 1716454224762940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224891629, "dur": 61, "args": { "External id": 217581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217581, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217581, "pid": 5, "tid": 7, "ts": 1716454224891629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224762986, "dur": 9, "args": { "External id": 217581, "cbid": 211, "correlation": 217581 } }, { "ph": "s", "id": 217581, "pid": 76337, "tid": -914061504, "ts": 1716454224762986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224891691, "dur": 50, "args": { "External id": 217589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217589, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217589, "pid": 5, "tid": 7, "ts": 1716454224891691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763019, "dur": 98, "args": { "External id": 217589, "cbid": 211, "correlation": 217589 } }, { "ph": "s", "id": 217589, "pid": 76337, "tid": -914061504, "ts": 1716454224763019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224891742, "dur": 99, "args": { "External id": 217598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217598, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217598, "pid": 5, "tid": 7, "ts": 1716454224891742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763161, "dur": 12, "args": { "External id": 217598, "cbid": 211, "correlation": 217598 } }, { "ph": "s", "id": 217598, "pid": 76337, "tid": -914061504, "ts": 1716454224763161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224891842, "dur": 93, "args": { "External id": 217618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 217618, "pid": 5, "tid": 7, "ts": 1716454224891842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763259, "dur": 13, "args": { "External id": 217618, "cbid": 211, "correlation": 217618 } }, { "ph": "s", "id": 217618, "pid": 76337, "tid": -914061504, "ts": 1716454224763259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224891937, "dur": 5, "args": { "External id": 217630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 217630, "pid": 5, "tid": 7, "ts": 1716454224891937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763283, "dur": 7, "args": { "External id": 217630, "cbid": 211, "correlation": 217630 } }, { "ph": "s", "id": 217630, "pid": 76337, "tid": -914061504, "ts": 1716454224763283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224891943, "dur": 107, "args": { "External id": 217633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217633, "pid": 5, "tid": 7, "ts": 1716454224891943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763305, "dur": 70, "args": { "External id": 217633, "cbid": 211, "correlation": 217633 } }, { "ph": "s", "id": 217633, "pid": 76337, "tid": -914061504, "ts": 1716454224763305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224892051, "dur": 69, "args": { "External id": 217642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217642, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217642, "pid": 5, "tid": 7, "ts": 1716454224892051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763422, "dur": 12, "args": { "External id": 217642, "cbid": 211, "correlation": 217642 } }, { "ph": "s", "id": 217642, "pid": 76337, "tid": -914061504, "ts": 1716454224763422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224763479, "dur": 0, "args": { "External id": 217652, "cbid": 317, "correlation": 217652 } }, { "ph": "f", "id": 217652, "pid": 76337, "tid": -914061504, "ts": 1716454224763479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224763480, "dur": 0, "args": { "External id": 217653, "cbid": 203, "correlation": 217653 } }, { "ph": "f", "id": 217653, "pid": 76337, "tid": -914061504, "ts": 1716454224763480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224763481, "dur": 0, "args": { "External id": 217654, "cbid": 205, "correlation": 217654 } }, { "ph": "f", "id": 217654, "pid": 76337, "tid": -914061504, "ts": 1716454224763481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224892121, "dur": 78, "args": { "External id": 217658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217658, "pid": 5, "tid": 7, "ts": 1716454224892121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763523, "dur": 15, "args": { "External id": 217658, "cbid": 211, "correlation": 217658 } }, { "ph": "s", "id": 217658, "pid": 76337, "tid": -914061504, "ts": 1716454224763523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224892201, "dur": 25, "args": { "External id": 217660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217660, "pid": 5, "tid": 7, "ts": 1716454224892201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763540, "dur": 6, "args": { "External id": 217660, "cbid": 211, "correlation": 217660 } }, { "ph": "s", "id": 217660, "pid": 76337, "tid": -914061504, "ts": 1716454224763540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224892227, "dur": 4, "args": { "External id": 217662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217662, "pid": 5, "tid": 7, "ts": 1716454224892227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763558, "dur": 8, "args": { "External id": 217662, "cbid": 211, "correlation": 217662 } }, { "ph": "s", "id": 217662, "pid": 76337, "tid": -914061504, "ts": 1716454224763558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224763573, "dur": 0, "args": { "External id": 217663, "cbid": 51, "correlation": 217663 } }, { "ph": "s", "id": 217663, "pid": 76337, "tid": -914061504, "ts": 1716454224763573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224892231, "dur": 1377, "args": { "External id": 217664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217664, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217664, "pid": 5, "tid": 7, "ts": 1716454224892231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763574, "dur": 7, "args": { "External id": 217664, "cbid": 211, "correlation": 217664 } }, { "ph": "s", "id": 217664, "pid": 76337, "tid": -914061504, "ts": 1716454224763574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224893609, "dur": 60, "args": { "External id": 217669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217669, "pid": 5, "tid": 7, "ts": 1716454224893609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763604, "dur": 8, "args": { "External id": 217669, "cbid": 211, "correlation": 217669 } }, { "ph": "s", "id": 217669, "pid": 76337, "tid": -914061504, "ts": 1716454224763604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224893671, "dur": 3, "args": { "External id": 217677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217677, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217677, "pid": 5, "tid": 7, "ts": 1716454224893671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763649, "dur": 9, "args": { "External id": 217677, "cbid": 211, "correlation": 217677 } }, { "ph": "s", "id": 217677, "pid": 76337, "tid": -914061504, "ts": 1716454224763649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224763721, "dur": 2, "args": { "External id": 217693, "cbid": 251, "correlation": 217693 } }, { "ph": "f", "id": 217693, "pid": 76337, "tid": -914061504, "ts": 1716454224763721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224763727, "dur": 0, "args": { "External id": 217695, "cbid": 251, "correlation": 217695 } }, { "ph": "f", "id": 217695, "pid": 76337, "tid": -914061504, "ts": 1716454224763727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224893676, "dur": 11, "args": { "External id": 217696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217696, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 217696, "pid": 5, "tid": 7, "ts": 1716454224893676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763729, "dur": 13, "args": { "External id": 217696, "cbid": 211, "correlation": 217696 } }, { "ph": "s", "id": 217696, "pid": 76337, "tid": -914061504, "ts": 1716454224763729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224893688, "dur": 5, "args": { "External id": 217698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217698, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 217698, "pid": 5, "tid": 7, "ts": 1716454224893688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763747, "dur": 6, "args": { "External id": 217698, "cbid": 211, "correlation": 217698 } }, { "ph": "s", "id": 217698, "pid": 76337, "tid": -914061504, "ts": 1716454224763747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224893695, "dur": 55, "args": { "External id": 217708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217708, "pid": 5, "tid": 7, "ts": 1716454224893695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224763816, "dur": 468, "args": { "External id": 217708, "cbid": 211, "correlation": 217708 } }, { "ph": "s", "id": 217708, "pid": 76337, "tid": -914061504, "ts": 1716454224763816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224893751, "dur": 52, "args": { "External id": 217728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217728, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 217728, "pid": 5, "tid": 7, "ts": 1716454224893751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764339, "dur": 11, "args": { "External id": 217728, "cbid": 211, "correlation": 217728 } }, { "ph": "s", "id": 217728, "pid": 76337, "tid": -914061504, "ts": 1716454224764339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224893804, "dur": 4, "args": { "External id": 217740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217740, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217740, "pid": 5, "tid": 7, "ts": 1716454224893804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764360, "dur": 7, "args": { "External id": 217740, "cbid": 211, "correlation": 217740 } }, { "ph": "s", "id": 217740, "pid": 76337, "tid": -914061504, "ts": 1716454224764360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224893809, "dur": 56, "args": { "External id": 217743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217743, "pid": 5, "tid": 7, "ts": 1716454224893809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764379, "dur": 7, "args": { "External id": 217743, "cbid": 211, "correlation": 217743 } }, { "ph": "s", "id": 217743, "pid": 76337, "tid": -914061504, "ts": 1716454224764379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224893866, "dur": 37, "args": { "External id": 217752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217752, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217752, "pid": 5, "tid": 7, "ts": 1716454224893866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764421, "dur": 9, "args": { "External id": 217752, "cbid": 211, "correlation": 217752 } }, { "ph": "s", "id": 217752, "pid": 76337, "tid": -914061504, "ts": 1716454224764421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224764484, "dur": 0, "args": { "External id": 217762, "cbid": 317, "correlation": 217762 } }, { "ph": "f", "id": 217762, "pid": 76337, "tid": -914061504, "ts": 1716454224764484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224764485, "dur": 0, "args": { "External id": 217763, "cbid": 203, "correlation": 217763 } }, { "ph": "f", "id": 217763, "pid": 76337, "tid": -914061504, "ts": 1716454224764485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224764486, "dur": 0, "args": { "External id": 217764, "cbid": 205, "correlation": 217764 } }, { "ph": "f", "id": 217764, "pid": 76337, "tid": -914061504, "ts": 1716454224764486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224893904, "dur": 40, "args": { "External id": 217768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217768, "pid": 5, "tid": 7, "ts": 1716454224893904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764512, "dur": 13, "args": { "External id": 217768, "cbid": 211, "correlation": 217768 } }, { "ph": "s", "id": 217768, "pid": 76337, "tid": -914061504, "ts": 1716454224764512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224893946, "dur": 14, "args": { "External id": 217770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217770, "pid": 5, "tid": 7, "ts": 1716454224893946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764527, "dur": 5, "args": { "External id": 217770, "cbid": 211, "correlation": 217770 } }, { "ph": "s", "id": 217770, "pid": 76337, "tid": -914061504, "ts": 1716454224764527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224893961, "dur": 3, "args": { "External id": 217772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217772, "pid": 5, "tid": 7, "ts": 1716454224893961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764538, "dur": 5, "args": { "External id": 217772, "cbid": 211, "correlation": 217772 } }, { "ph": "s", "id": 217772, "pid": 76337, "tid": -914061504, "ts": 1716454224764538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224764546, "dur": 0, "args": { "External id": 217773, "cbid": 51, "correlation": 217773 } }, { "ph": "s", "id": 217773, "pid": 76337, "tid": -914061504, "ts": 1716454224764546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224893966, "dur": 699, "args": { "External id": 217774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217774, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217774, "pid": 5, "tid": 7, "ts": 1716454224893966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764547, "dur": 5, "args": { "External id": 217774, "cbid": 211, "correlation": 217774 } }, { "ph": "s", "id": 217774, "pid": 76337, "tid": -914061504, "ts": 1716454224764547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224894667, "dur": 59, "args": { "External id": 217779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217779, "pid": 5, "tid": 7, "ts": 1716454224894667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764575, "dur": 8, "args": { "External id": 217779, "cbid": 211, "correlation": 217779 } }, { "ph": "s", "id": 217779, "pid": 76337, "tid": -914061504, "ts": 1716454224764575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224764633, "dur": 0, "args": { "External id": 217789, "cbid": 317, "correlation": 217789 } }, { "ph": "f", "id": 217789, "pid": 76337, "tid": -914061504, "ts": 1716454224764633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224764634, "dur": 0, "args": { "External id": 217790, "cbid": 203, "correlation": 217790 } }, { "ph": "f", "id": 217790, "pid": 76337, "tid": -914061504, "ts": 1716454224764634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224764634, "dur": 0, "args": { "External id": 217791, "cbid": 205, "correlation": 217791 } }, { "ph": "f", "id": 217791, "pid": 76337, "tid": -914061504, "ts": 1716454224764634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224894727, "dur": 75, "args": { "External id": 217795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217795, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217795, "pid": 5, "tid": 7, "ts": 1716454224894727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764655, "dur": 12, "args": { "External id": 217795, "cbid": 211, "correlation": 217795 } }, { "ph": "s", "id": 217795, "pid": 76337, "tid": -914061504, "ts": 1716454224764655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224894803, "dur": 210, "args": { "External id": 217797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217797, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217797, "pid": 5, "tid": 7, "ts": 1716454224894803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764681, "dur": 8, "args": { "External id": 217797, "cbid": 211, "correlation": 217797 } }, { "ph": "s", "id": 217797, "pid": 76337, "tid": -914061504, "ts": 1716454224764681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224895015, "dur": 38, "args": { "External id": 217799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217799, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217799, "pid": 5, "tid": 7, "ts": 1716454224895015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764696, "dur": 6, "args": { "External id": 217799, "cbid": 211, "correlation": 217799 } }, { "ph": "s", "id": 217799, "pid": 76337, "tid": -914061504, "ts": 1716454224764696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224895054, "dur": 60, "args": { "External id": 217805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217805, "pid": 5, "tid": 7, "ts": 1716454224895054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224764724, "dur": 482, "args": { "External id": 217805, "cbid": 211, "correlation": 217805 } }, { "ph": "s", "id": 217805, "pid": 76337, "tid": -914061504, "ts": 1716454224764724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224895116, "dur": 50, "args": { "External id": 217813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217813, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217813, "pid": 5, "tid": 7, "ts": 1716454224895116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765229, "dur": 9, "args": { "External id": 217813, "cbid": 211, "correlation": 217813 } }, { "ph": "s", "id": 217813, "pid": 76337, "tid": -914061504, "ts": 1716454224765229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224895167, "dur": 36, "args": { "External id": 217821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217821, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217821, "pid": 5, "tid": 7, "ts": 1716454224895167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765259, "dur": 9, "args": { "External id": 217821, "cbid": 211, "correlation": 217821 } }, { "ph": "s", "id": 217821, "pid": 76337, "tid": -914061504, "ts": 1716454224765259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224895204, "dur": 53, "args": { "External id": 217841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217841, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 217841, "pid": 5, "tid": 7, "ts": 1716454224895204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765346, "dur": 12, "args": { "External id": 217841, "cbid": 211, "correlation": 217841 } }, { "ph": "s", "id": 217841, "pid": 76337, "tid": -914061504, "ts": 1716454224765346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224895259, "dur": 4, "args": { "External id": 217853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217853, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217853, "pid": 5, "tid": 7, "ts": 1716454224895259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765367, "dur": 6, "args": { "External id": 217853, "cbid": 211, "correlation": 217853 } }, { "ph": "s", "id": 217853, "pid": 76337, "tid": -914061504, "ts": 1716454224765367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224895264, "dur": 56, "args": { "External id": 217856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217856, "pid": 5, "tid": 7, "ts": 1716454224895264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765386, "dur": 7, "args": { "External id": 217856, "cbid": 211, "correlation": 217856 } }, { "ph": "s", "id": 217856, "pid": 76337, "tid": -914061504, "ts": 1716454224765386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224765443, "dur": 0, "args": { "External id": 217867, "cbid": 317, "correlation": 217867 } }, { "ph": "f", "id": 217867, "pid": 76337, "tid": -914061504, "ts": 1716454224765443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224765444, "dur": 0, "args": { "External id": 217868, "cbid": 203, "correlation": 217868 } }, { "ph": "f", "id": 217868, "pid": 76337, "tid": -914061504, "ts": 1716454224765444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224765444, "dur": 0, "args": { "External id": 217869, "cbid": 205, "correlation": 217869 } }, { "ph": "f", "id": 217869, "pid": 76337, "tid": -914061504, "ts": 1716454224765444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765467, "dur": 1, "args": { "External id": 217873, "cbid": 251, "correlation": 217873 } }, { "ph": "f", "id": 217873, "pid": 76337, "tid": -914061504, "ts": 1716454224765467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765470, "dur": 0, "args": { "External id": 217874, "cbid": 251, "correlation": 217874 } }, { "ph": "f", "id": 217874, "pid": 76337, "tid": -914061504, "ts": 1716454224765470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765471, "dur": 0, "args": { "External id": 217875, "cbid": 251, "correlation": 217875 } }, { "ph": "f", "id": 217875, "pid": 76337, "tid": -914061504, "ts": 1716454224765471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765472, "dur": 0, "args": { "External id": 217876, "cbid": 251, "correlation": 217876 } }, { "ph": "f", "id": 217876, "pid": 76337, "tid": -914061504, "ts": 1716454224765472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765473, "dur": 0, "args": { "External id": 217877, "cbid": 251, "correlation": 217877 } }, { "ph": "f", "id": 217877, "pid": 76337, "tid": -914061504, "ts": 1716454224765473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765474, "dur": 0, "args": { "External id": 217878, "cbid": 251, "correlation": 217878 } }, { "ph": "f", "id": 217878, "pid": 76337, "tid": -914061504, "ts": 1716454224765474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765475, "dur": 0, "args": { "External id": 217879, "cbid": 251, "correlation": 217879 } }, { "ph": "f", "id": 217879, "pid": 76337, "tid": -914061504, "ts": 1716454224765475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765476, "dur": 0, "args": { "External id": 217880, "cbid": 251, "correlation": 217880 } }, { "ph": "f", "id": 217880, "pid": 76337, "tid": -914061504, "ts": 1716454224765476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765477, "dur": 0, "args": { "External id": 217881, "cbid": 251, "correlation": 217881 } }, { "ph": "f", "id": 217881, "pid": 76337, "tid": -914061504, "ts": 1716454224765477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224895322, "dur": 113, "args": { "External id": 217882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217882, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217882, "pid": 5, "tid": 7, "ts": 1716454224895322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765479, "dur": 12, "args": { "External id": 217882, "cbid": 211, "correlation": 217882 } }, { "ph": "s", "id": 217882, "pid": 76337, "tid": -914061504, "ts": 1716454224765479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224895436, "dur": 60, "args": { "External id": 217888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217888, "pid": 5, "tid": 7, "ts": 1716454224895436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765514, "dur": 9, "args": { "External id": 217888, "cbid": 211, "correlation": 217888 } }, { "ph": "s", "id": 217888, "pid": 76337, "tid": -914061504, "ts": 1716454224765514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224895498, "dur": 440, "args": { "External id": 217897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217897, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217897, "pid": 5, "tid": 7, "ts": 1716454224895498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765599, "dur": 14, "args": { "External id": 217897, "cbid": 211, "correlation": 217897 } }, { "ph": "s", "id": 217897, "pid": 76337, "tid": -914061504, "ts": 1716454224765599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224895940, "dur": 182, "args": { "External id": 217919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217919, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217919, "pid": 5, "tid": 7, "ts": 1716454224895940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765657, "dur": 10, "args": { "External id": 217919, "cbid": 211, "correlation": 217919 } }, { "ph": "s", "id": 217919, "pid": 76337, "tid": -914061504, "ts": 1716454224765657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765747, "dur": 1, "args": { "External id": 217930, "cbid": 251, "correlation": 217930 } }, { "ph": "f", "id": 217930, "pid": 76337, "tid": -914061504, "ts": 1716454224765747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224896123, "dur": 196, "args": { "External id": 217931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217931, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217931, "pid": 5, "tid": 7, "ts": 1716454224896123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765752, "dur": 14, "args": { "External id": 217931, "cbid": 211, "correlation": 217931 } }, { "ph": "s", "id": 217931, "pid": 76337, "tid": -914061504, "ts": 1716454224765752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765821, "dur": 1, "args": { "External id": 217942, "cbid": 251, "correlation": 217942 } }, { "ph": "f", "id": 217942, "pid": 76337, "tid": -914061504, "ts": 1716454224765821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224896321, "dur": 186, "args": { "External id": 217943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217943, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217943, "pid": 5, "tid": 7, "ts": 1716454224896321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765826, "dur": 11, "args": { "External id": 217943, "cbid": 211, "correlation": 217943 } }, { "ph": "s", "id": 217943, "pid": 76337, "tid": -914061504, "ts": 1716454224765826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224765889, "dur": 1, "args": { "External id": 217954, "cbid": 251, "correlation": 217954 } }, { "ph": "f", "id": 217954, "pid": 76337, "tid": -914061504, "ts": 1716454224765889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224896509, "dur": 190, "args": { "External id": 217955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217955, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217955, "pid": 5, "tid": 7, "ts": 1716454224896509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765893, "dur": 11, "args": { "External id": 217955, "cbid": 211, "correlation": 217955 } }, { "ph": "s", "id": 217955, "pid": 76337, "tid": -914061504, "ts": 1716454224765893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224896700, "dur": 18763, "args": { "External id": 217976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217976, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 217976, "pid": 5, "tid": 7, "ts": 1716454224896700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224765982, "dur": 13, "args": { "External id": 217976, "cbid": 211, "correlation": 217976 } }, { "ph": "s", "id": 217976, "pid": 76337, "tid": -914061504, "ts": 1716454224765982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224766082, "dur": 1, "args": { "External id": 217994, "cbid": 251, "correlation": 217994 } }, { "ph": "f", "id": 217994, "pid": 76337, "tid": -914061504, "ts": 1716454224766082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224915465, "dur": 202, "args": { "External id": 217996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217996, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217996, "pid": 5, "tid": 7, "ts": 1716454224915465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224766088, "dur": 13, "args": { "External id": 217996, "cbid": 211, "correlation": 217996 } }, { "ph": "s", "id": 217996, "pid": 76337, "tid": -914061504, "ts": 1716454224766088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224915669, "dur": 67, "args": { "External id": 218004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218004, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218004, "pid": 5, "tid": 7, "ts": 1716454224915669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224766157, "dur": 12, "args": { "External id": 218004, "cbid": 211, "correlation": 218004 } }, { "ph": "s", "id": 218004, "pid": 76337, "tid": -914061504, "ts": 1716454224766157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224915737, "dur": 97, "args": { "External id": 218012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218012, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218012, "pid": 5, "tid": 7, "ts": 1716454224915737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224766196, "dur": 100, "args": { "External id": 218012, "cbid": 211, "correlation": 218012 } }, { "ph": "s", "id": 218012, "pid": 76337, "tid": -914061504, "ts": 1716454224766196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224915835, "dur": 53, "args": { "External id": 218023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218023, "pid": 5, "tid": 7, "ts": 1716454224915835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224766361, "dur": 1911, "args": { "External id": 218023, "cbid": 211, "correlation": 218023 } }, { "ph": "s", "id": 218023, "pid": 76337, "tid": -914061504, "ts": 1716454224766361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224915889, "dur": 93, "args": { "External id": 218045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218045, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218045, "pid": 5, "tid": 7, "ts": 1716454224915889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768294, "dur": 125, "args": { "External id": 218045, "cbid": 211, "correlation": 218045 } }, { "ph": "s", "id": 218045, "pid": 76337, "tid": -914061504, "ts": 1716454224768294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768499, "dur": 1, "args": { "External id": 218056, "cbid": 251, "correlation": 218056 } }, { "ph": "f", "id": 218056, "pid": 76337, "tid": -914061504, "ts": 1716454224768499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224915983, "dur": 104, "args": { "External id": 218057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218057, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 218057, "pid": 5, "tid": 7, "ts": 1716454224915983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768504, "dur": 14, "args": { "External id": 218057, "cbid": 211, "correlation": 218057 } }, { "ph": "s", "id": 218057, "pid": 76337, "tid": -914061504, "ts": 1716454224768504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768577, "dur": 1, "args": { "External id": 218068, "cbid": 251, "correlation": 218068 } }, { "ph": "f", "id": 218068, "pid": 76337, "tid": -914061504, "ts": 1716454224768577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768582, "dur": 0, "args": { "External id": 218069, "cbid": 251, "correlation": 218069 } }, { "ph": "f", "id": 218069, "pid": 76337, "tid": -914061504, "ts": 1716454224768582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224916088, "dur": 10, "args": { "External id": 218070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218070, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218070, "pid": 5, "tid": 7, "ts": 1716454224916088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768584, "dur": 13, "args": { "External id": 218070, "cbid": 211, "correlation": 218070 } }, { "ph": "s", "id": 218070, "pid": 76337, "tid": -914061504, "ts": 1716454224768584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224916100, "dur": 5, "args": { "External id": 218072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218072, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 218072, "pid": 5, "tid": 7, "ts": 1716454224916100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768598, "dur": 6, "args": { "External id": 218072, "cbid": 211, "correlation": 218072 } }, { "ph": "s", "id": 218072, "pid": 76337, "tid": -914061504, "ts": 1716454224768598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768659, "dur": 1, "args": { "External id": 218083, "cbid": 251, "correlation": 218083 } }, { "ph": "f", "id": 218083, "pid": 76337, "tid": -914061504, "ts": 1716454224768659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768662, "dur": 0, "args": { "External id": 218084, "cbid": 251, "correlation": 218084 } }, { "ph": "f", "id": 218084, "pid": 76337, "tid": -914061504, "ts": 1716454224768662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224916106, "dur": 6, "args": { "External id": 218085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218085, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218085, "pid": 5, "tid": 7, "ts": 1716454224916106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768664, "dur": 12, "args": { "External id": 218085, "cbid": 211, "correlation": 218085 } }, { "ph": "s", "id": 218085, "pid": 76337, "tid": -914061504, "ts": 1716454224768664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224916114, "dur": 3, "args": { "External id": 218087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218087, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 218087, "pid": 5, "tid": 7, "ts": 1716454224916114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768677, "dur": 6, "args": { "External id": 218087, "cbid": 211, "correlation": 218087 } }, { "ph": "s", "id": 218087, "pid": 76337, "tid": -914061504, "ts": 1716454224768677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224916118, "dur": 156, "args": { "External id": 218108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218108, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 218108, "pid": 5, "tid": 7, "ts": 1716454224916118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768752, "dur": 12, "args": { "External id": 218108, "cbid": 211, "correlation": 218108 } }, { "ph": "s", "id": 218108, "pid": 76337, "tid": -914061504, "ts": 1716454224768752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224768847, "dur": 1, "args": { "External id": 218126, "cbid": 251, "correlation": 218126 } }, { "ph": "f", "id": 218126, "pid": 76337, "tid": -914061504, "ts": 1716454224768847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224916276, "dur": 106, "args": { "External id": 218128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218128, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 218128, "pid": 5, "tid": 7, "ts": 1716454224916276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768853, "dur": 13, "args": { "External id": 218128, "cbid": 211, "correlation": 218128 } }, { "ph": "s", "id": 218128, "pid": 76337, "tid": -914061504, "ts": 1716454224768853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224916383, "dur": 34, "args": { "External id": 218136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218136, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218136, "pid": 5, "tid": 7, "ts": 1716454224916383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768922, "dur": 12, "args": { "External id": 218136, "cbid": 211, "correlation": 218136 } }, { "ph": "s", "id": 218136, "pid": 76337, "tid": -914061504, "ts": 1716454224768922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224916419, "dur": 68, "args": { "External id": 218144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218144, "pid": 5, "tid": 7, "ts": 1716454224916419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224768964, "dur": 17, "args": { "External id": 218144, "cbid": 211, "correlation": 218144 } }, { "ph": "s", "id": 218144, "pid": 76337, "tid": -914061504, "ts": 1716454224768964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224916488, "dur": 92, "args": { "External id": 218166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218166, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218166, "pid": 5, "tid": 7, "ts": 1716454224916488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769025, "dur": 10, "args": { "External id": 218166, "cbid": 211, "correlation": 218166 } }, { "ph": "s", "id": 218166, "pid": 76337, "tid": -914061504, "ts": 1716454224769025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769112, "dur": 1, "args": { "External id": 218182, "cbid": 251, "correlation": 218182 } }, { "ph": "f", "id": 218182, "pid": 76337, "tid": -914061504, "ts": 1716454224769112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224916582, "dur": 584, "args": { "External id": 218184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218184, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 218184, "pid": 5, "tid": 7, "ts": 1716454224916582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769118, "dur": 12, "args": { "External id": 218184, "cbid": 211, "correlation": 218184 } }, { "ph": "s", "id": 218184, "pid": 76337, "tid": -914061504, "ts": 1716454224769118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224917167, "dur": 246, "args": { "External id": 218192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218192, "pid": 5, "tid": 7, "ts": 1716454224917167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769186, "dur": 13, "args": { "External id": 218192, "cbid": 211, "correlation": 218192 } }, { "ph": "s", "id": 218192, "pid": 76337, "tid": -914061504, "ts": 1716454224769186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224917414, "dur": 253, "args": { "External id": 218200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218200, "pid": 5, "tid": 7, "ts": 1716454224917414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769220, "dur": 8, "args": { "External id": 218200, "cbid": 211, "correlation": 218200 } }, { "ph": "s", "id": 218200, "pid": 76337, "tid": -914061504, "ts": 1716454224769220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769303, "dur": 2, "args": { "External id": 218216, "cbid": 251, "correlation": 218216 } }, { "ph": "f", "id": 218216, "pid": 76337, "tid": -914061504, "ts": 1716454224769303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769308, "dur": 0, "args": { "External id": 218218, "cbid": 251, "correlation": 218218 } }, { "ph": "f", "id": 218218, "pid": 76337, "tid": -914061504, "ts": 1716454224769308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224917669, "dur": 360, "args": { "External id": 218219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218219, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218219, "pid": 5, "tid": 7, "ts": 1716454224917669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769311, "dur": 14, "args": { "External id": 218219, "cbid": 211, "correlation": 218219 } }, { "ph": "s", "id": 218219, "pid": 76337, "tid": -914061504, "ts": 1716454224769311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918030, "dur": 50, "args": { "External id": 218227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218227, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218227, "pid": 5, "tid": 7, "ts": 1716454224918030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769356, "dur": 175, "args": { "External id": 218227, "cbid": 211, "correlation": 218227 } }, { "ph": "s", "id": 218227, "pid": 76337, "tid": -914061504, "ts": 1716454224769356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224918081, "dur": 159, "args": { "External id": 218238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218238, "pid": 5, "tid": 7, "ts": 1716454224918081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769590, "dur": 67, "args": { "External id": 218238, "cbid": 211, "correlation": 218238 } }, { "ph": "s", "id": 218238, "pid": 76337, "tid": -914061504, "ts": 1716454224769590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224769710, "dur": 0, "args": { "External id": 218250, "cbid": 317, "correlation": 218250 } }, { "ph": "f", "id": 218250, "pid": 76337, "tid": -914061504, "ts": 1716454224769710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224769711, "dur": 0, "args": { "External id": 218251, "cbid": 203, "correlation": 218251 } }, { "ph": "f", "id": 218251, "pid": 76337, "tid": -914061504, "ts": 1716454224769711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224769711, "dur": 0, "args": { "External id": 218252, "cbid": 205, "correlation": 218252 } }, { "ph": "f", "id": 218252, "pid": 76337, "tid": -914061504, "ts": 1716454224769711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769737, "dur": 1, "args": { "External id": 218256, "cbid": 251, "correlation": 218256 } }, { "ph": "f", "id": 218256, "pid": 76337, "tid": -914061504, "ts": 1716454224769737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769739, "dur": 0, "args": { "External id": 218257, "cbid": 251, "correlation": 218257 } }, { "ph": "f", "id": 218257, "pid": 76337, "tid": -914061504, "ts": 1716454224769739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769740, "dur": 0, "args": { "External id": 218258, "cbid": 251, "correlation": 218258 } }, { "ph": "f", "id": 218258, "pid": 76337, "tid": -914061504, "ts": 1716454224769740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769740, "dur": 0, "args": { "External id": 218259, "cbid": 251, "correlation": 218259 } }, { "ph": "f", "id": 218259, "pid": 76337, "tid": -914061504, "ts": 1716454224769740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769741, "dur": 0, "args": { "External id": 218260, "cbid": 251, "correlation": 218260 } }, { "ph": "f", "id": 218260, "pid": 76337, "tid": -914061504, "ts": 1716454224769741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769742, "dur": 0, "args": { "External id": 218261, "cbid": 251, "correlation": 218261 } }, { "ph": "f", "id": 218261, "pid": 76337, "tid": -914061504, "ts": 1716454224769742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769742, "dur": 0, "args": { "External id": 218262, "cbid": 251, "correlation": 218262 } }, { "ph": "f", "id": 218262, "pid": 76337, "tid": -914061504, "ts": 1716454224769742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769743, "dur": 0, "args": { "External id": 218263, "cbid": 251, "correlation": 218263 } }, { "ph": "f", "id": 218263, "pid": 76337, "tid": -914061504, "ts": 1716454224769743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224769744, "dur": 0, "args": { "External id": 218264, "cbid": 251, "correlation": 218264 } }, { "ph": "f", "id": 218264, "pid": 76337, "tid": -914061504, "ts": 1716454224769744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224918241, "dur": 117, "args": { "External id": 218265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218265, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 218265, "pid": 5, "tid": 7, "ts": 1716454224918241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769746, "dur": 38, "args": { "External id": 218265, "cbid": 211, "correlation": 218265 } }, { "ph": "s", "id": 218265, "pid": 76337, "tid": -914061504, "ts": 1716454224769746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224918360, "dur": 59, "args": { "External id": 218271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218271, "pid": 5, "tid": 7, "ts": 1716454224918360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224769807, "dur": 285, "args": { "External id": 218271, "cbid": 211, "correlation": 218271 } }, { "ph": "s", "id": 218271, "pid": 76337, "tid": -914061504, "ts": 1716454224769807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918421, "dur": 49, "args": { "External id": 218279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218279, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218279, "pid": 5, "tid": 7, "ts": 1716454224918421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770116, "dur": 10, "args": { "External id": 218279, "cbid": 211, "correlation": 218279 } }, { "ph": "s", "id": 218279, "pid": 76337, "tid": -914061504, "ts": 1716454224770116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224918471, "dur": 52, "args": { "External id": 218299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218299, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 218299, "pid": 5, "tid": 7, "ts": 1716454224918471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770202, "dur": 11, "args": { "External id": 218299, "cbid": 211, "correlation": 218299 } }, { "ph": "s", "id": 218299, "pid": 76337, "tid": -914061504, "ts": 1716454224770202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224918524, "dur": 4, "args": { "External id": 218311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218311, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 218311, "pid": 5, "tid": 7, "ts": 1716454224918524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770223, "dur": 7, "args": { "External id": 218311, "cbid": 211, "correlation": 218311 } }, { "ph": "s", "id": 218311, "pid": 76337, "tid": -914061504, "ts": 1716454224770223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224918530, "dur": 57, "args": { "External id": 218314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218314, "pid": 5, "tid": 7, "ts": 1716454224918530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770242, "dur": 107, "args": { "External id": 218314, "cbid": 211, "correlation": 218314 } }, { "ph": "s", "id": 218314, "pid": 76337, "tid": -914061504, "ts": 1716454224770242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918588, "dur": 37, "args": { "External id": 218323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218323, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218323, "pid": 5, "tid": 7, "ts": 1716454224918588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770388, "dur": 11, "args": { "External id": 218323, "cbid": 211, "correlation": 218323 } }, { "ph": "s", "id": 218323, "pid": 76337, "tid": -914061504, "ts": 1716454224770388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224770443, "dur": 0, "args": { "External id": 218333, "cbid": 317, "correlation": 218333 } }, { "ph": "f", "id": 218333, "pid": 76337, "tid": -914061504, "ts": 1716454224770443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224770444, "dur": 0, "args": { "External id": 218334, "cbid": 203, "correlation": 218334 } }, { "ph": "f", "id": 218334, "pid": 76337, "tid": -914061504, "ts": 1716454224770444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224770445, "dur": 0, "args": { "External id": 218335, "cbid": 205, "correlation": 218335 } }, { "ph": "f", "id": 218335, "pid": 76337, "tid": -914061504, "ts": 1716454224770445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224918626, "dur": 42, "args": { "External id": 218339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218339, "pid": 5, "tid": 7, "ts": 1716454224918626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770461, "dur": 12, "args": { "External id": 218339, "cbid": 211, "correlation": 218339 } }, { "ph": "s", "id": 218339, "pid": 76337, "tid": -914061504, "ts": 1716454224770461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224918669, "dur": 3, "args": { "External id": 218341, "device": 5, "context": 1, "stream": 7, "correlation": 218341, "bytes": 46080, "memory bandwidth (GB/s)": 12.413793103448276 } }, { "ph": "f", "id": 218341, "pid": 5, "tid": 7, "ts": 1716454224918669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224770476, "dur": 26, "args": { "External id": 218341, "cbid": 51, "correlation": 218341 } }, { "ph": "s", "id": 218341, "pid": 76337, "tid": -914061504, "ts": 1716454224770476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224770507, "dur": 2, "args": { "External id": 218343, "cbid": 200, "correlation": 218343 } }, { "ph": "f", "id": 218343, "pid": 76337, "tid": -914061504, "ts": 1716454224770507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224770510, "dur": 0, "args": { "External id": 218344, "cbid": 200, "correlation": 218344 } }, { "ph": "f", "id": 218344, "pid": 76337, "tid": -914061504, "ts": 1716454224770510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224770511, "dur": 0, "args": { "External id": 218345, "cbid": 200, "correlation": 218345 } }, { "ph": "f", "id": 218345, "pid": 76337, "tid": -914061504, "ts": 1716454224770511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224770511, "dur": 0, "args": { "External id": 218346, "cbid": 200, "correlation": 218346 } }, { "ph": "f", "id": 218346, "pid": 76337, "tid": -914061504, "ts": 1716454224770511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454224770512, "dur": 4, "args": { "External id": 218347, "cbid": 15, "correlation": 218347 } }, { "ph": "f", "id": 218347, "pid": 76337, "tid": -914061504, "ts": 1716454224770512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224770517, "dur": 1, "args": { "External id": 218348, "cbid": 251, "correlation": 218348 } }, { "ph": "f", "id": 218348, "pid": 76337, "tid": -914061504, "ts": 1716454224770517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454224918674, "dur": 23, "args": { "External id": 218349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218349, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218349, "pid": 5, "tid": 7, "ts": 1716454224918674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770520, "dur": 9, "args": { "External id": 218349, "cbid": 211, "correlation": 218349 } }, { "ph": "s", "id": 218349, "pid": 76337, "tid": -914061504, "ts": 1716454224770520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224918698, "dur": 4, "args": { "External id": 218351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 218351, "pid": 5, "tid": 7, "ts": 1716454224918698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770536, "dur": 5, "args": { "External id": 218351, "cbid": 211, "correlation": 218351 } }, { "ph": "s", "id": 218351, "pid": 76337, "tid": -914061504, "ts": 1716454224770536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224770546, "dur": 0, "args": { "External id": 218352, "cbid": 51, "correlation": 218352 } }, { "ph": "s", "id": 218352, "pid": 76337, "tid": -914061504, "ts": 1716454224770546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224918704, "dur": 190, "args": { "External id": 218353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218353, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218353, "pid": 5, "tid": 7, "ts": 1716454224918704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770547, "dur": 197, "args": { "External id": 218353, "cbid": 211, "correlation": 218353 } }, { "ph": "s", "id": 218353, "pid": 76337, "tid": -914061504, "ts": 1716454224770547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224918896, "dur": 6, "args": { "External id": 218354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218354, "pid": 5, "tid": 7, "ts": 1716454224918896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770747, "dur": 5, "args": { "External id": 218354, "cbid": 211, "correlation": 218354 } }, { "ph": "s", "id": 218354, "pid": 76337, "tid": -914061504, "ts": 1716454224770747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224918904, "dur": 5, "args": { "External id": 218360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 218360, "pid": 5, "tid": 7, "ts": 1716454224918904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224770777, "dur": 9, "args": { "External id": 218360, "cbid": 211, "correlation": 218360 } }, { "ph": "s", "id": 218360, "pid": 76337, "tid": -914061504, "ts": 1716454224770777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918910, "dur": 3, "args": { "External id": 218368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218368, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218368, "pid": 5, "tid": 7, "ts": 1716454224918910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772616, "dur": 15, "args": { "External id": 218368, "cbid": 211, "correlation": 218368 } }, { "ph": "s", "id": 218368, "pid": 76337, "tid": -914061504, "ts": 1716454224772616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918914, "dur": 3, "args": { "External id": 218376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218376, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218376, "pid": 5, "tid": 7, "ts": 1716454224918914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772659, "dur": 10, "args": { "External id": 218376, "cbid": 211, "correlation": 218376 } }, { "ph": "s", "id": 218376, "pid": 76337, "tid": -914061504, "ts": 1716454224772659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918919, "dur": 3, "args": { "External id": 218384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218384, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218384, "pid": 5, "tid": 7, "ts": 1716454224918919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772685, "dur": 8, "args": { "External id": 218384, "cbid": 211, "correlation": 218384 } }, { "ph": "s", "id": 218384, "pid": 76337, "tid": -914061504, "ts": 1716454224772685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918922, "dur": 3, "args": { "External id": 218393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218393, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218393, "pid": 5, "tid": 7, "ts": 1716454224918922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772859, "dur": 14, "args": { "External id": 218393, "cbid": 211, "correlation": 218393 } }, { "ph": "s", "id": 218393, "pid": 76337, "tid": -914061504, "ts": 1716454224772859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918927, "dur": 3, "args": { "External id": 218402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218402, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218402, "pid": 5, "tid": 7, "ts": 1716454224918927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772888, "dur": 8, "args": { "External id": 218402, "cbid": 211, "correlation": 218402 } }, { "ph": "s", "id": 218402, "pid": 76337, "tid": -914061504, "ts": 1716454224772888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918931, "dur": 3, "args": { "External id": 218410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218410, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218410, "pid": 5, "tid": 7, "ts": 1716454224918931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224772914, "dur": 8, "args": { "External id": 218410, "cbid": 211, "correlation": 218410 } }, { "ph": "s", "id": 218410, "pid": 76337, "tid": -914061504, "ts": 1716454224772914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918936, "dur": 3, "args": { "External id": 218418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218418, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218418, "pid": 5, "tid": 7, "ts": 1716454224918936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224773180, "dur": 16, "args": { "External id": 218418, "cbid": 211, "correlation": 218418 } }, { "ph": "s", "id": 218418, "pid": 76337, "tid": -914061504, "ts": 1716454224773180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224918940, "dur": 3, "args": { "External id": 218426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218426, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218426, "pid": 5, "tid": 7, "ts": 1716454224918940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224773212, "dur": 8, "args": { "External id": 218426, "cbid": 211, "correlation": 218426 } }, { "ph": "s", "id": 218426, "pid": 76337, "tid": -914061504, "ts": 1716454224773212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224918945, "dur": 1, "args": { "External id": 218436, "device": 5, "context": 1, "stream": 7, "correlation": 218436, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 218436, "pid": 5, "tid": 7, "ts": 1716454224918945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224773278, "dur": 37, "args": { "External id": 218436, "cbid": 41, "correlation": 218436 } }, { "ph": "s", "id": 218436, "pid": 76337, "tid": -914061504, "ts": 1716454224773278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224773316, "dur": 145646, "args": { "External id": 218437, "cbid": 131, "correlation": 218437 } }, { "ph": "f", "id": 218437, "pid": 76337, "tid": -914061504, "ts": 1716454224773316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224919185, "dur": 3, "args": { "External id": 218445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218445, "pid": 5, "tid": 7, "ts": 1716454224919185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919154, "dur": 33, "args": { "External id": 218445, "cbid": 211, "correlation": 218445 } }, { "ph": "s", "id": 218445, "pid": 76337, "tid": -914061504, "ts": 1716454224919154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919292, "dur": 3, "args": { "External id": 218454, "device": 5, "context": 1, "stream": 7, "correlation": 218454, "bytes": 8, "memory bandwidth (GB/s)": 0.0021929824561403508 } }, { "ph": "f", "id": 218454, "pid": 5, "tid": 7, "ts": 1716454224919292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919254, "dur": 39, "args": { "External id": 218454, "cbid": 41, "correlation": 218454 } }, { "ph": "s", "id": 218454, "pid": 76337, "tid": -914061504, "ts": 1716454224919254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224919400, "dur": 4, "args": { "External id": 218464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218464, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218464, "pid": 5, "tid": 7, "ts": 1716454224919400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919384, "dur": 17, "args": { "External id": 218464, "cbid": 211, "correlation": 218464 } }, { "ph": "s", "id": 218464, "pid": 76337, "tid": -914061504, "ts": 1716454224919384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919483, "dur": 1, "args": { "External id": 218474, "device": 5, "context": 1, "stream": 7, "correlation": 218474, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 218474, "pid": 5, "tid": 7, "ts": 1716454224919483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919464, "dur": 17, "args": { "External id": 218474, "cbid": 41, "correlation": 218474 } }, { "ph": "s", "id": 218474, "pid": 76337, "tid": -914061504, "ts": 1716454224919464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224919482, "dur": 8, "args": { "External id": 218475, "cbid": 131, "correlation": 218475 } }, { "ph": "f", "id": 218475, "pid": 76337, "tid": -914061504, "ts": 1716454224919482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919568, "dur": 3, "args": { "External id": 218482, "device": 5, "context": 1, "stream": 7, "correlation": 218482, "bytes": 98304, "memory bandwidth (GB/s)": 30.415841584158414 } }, { "ph": "f", "id": 218482, "pid": 5, "tid": 7, "ts": 1716454224919568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919537, "dur": 30, "args": { "External id": 218482, "cbid": 41, "correlation": 218482 } }, { "ph": "s", "id": 218482, "pid": 76337, "tid": -914061504, "ts": 1716454224919537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919661, "dur": 3, "args": { "External id": 218501, "device": 5, "context": 1, "stream": 7, "correlation": 218501, "bytes": 16, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 218501, "pid": 5, "tid": 7, "ts": 1716454224919661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919642, "dur": 17, "args": { "External id": 218501, "cbid": 41, "correlation": 218501 } }, { "ph": "s", "id": 218501, "pid": 76337, "tid": -914061504, "ts": 1716454224919642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224919699, "dur": 3, "args": { "External id": 218507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218507, "pid": 5, "tid": 7, "ts": 1716454224919699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919688, "dur": 11, "args": { "External id": 218507, "cbid": 211, "correlation": 218507 } }, { "ph": "s", "id": 218507, "pid": 76337, "tid": -914061504, "ts": 1716454224919688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454224919714, "dur": 6, "args": { "External id": 218509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218509, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 218509, "pid": 5, "tid": 7, "ts": 1716454224919714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919704, "dur": 9, "args": { "External id": 218509, "cbid": 211, "correlation": 218509 } }, { "ph": "s", "id": 218509, "pid": 76337, "tid": -914061504, "ts": 1716454224919704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454224919722, "dur": 3, "args": { "External id": 218511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218511, "pid": 5, "tid": 7, "ts": 1716454224919722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919714, "dur": 6, "args": { "External id": 218511, "cbid": 211, "correlation": 218511 } }, { "ph": "s", "id": 218511, "pid": 76337, "tid": -914061504, "ts": 1716454224919714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919756, "dur": 2, "args": { "External id": 218519, "device": 5, "context": 1, "stream": 7, "correlation": 218519, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 218519, "pid": 5, "tid": 7, "ts": 1716454224919756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919742, "dur": 13, "args": { "External id": 218519, "cbid": 41, "correlation": 218519 } }, { "ph": "s", "id": 218519, "pid": 76337, "tid": -914061504, "ts": 1716454224919742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224919811, "dur": 3, "args": { "External id": 218533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218533, "pid": 5, "tid": 7, "ts": 1716454224919811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919799, "dur": 14, "args": { "External id": 218533, "cbid": 211, "correlation": 218533 } }, { "ph": "s", "id": 218533, "pid": 76337, "tid": -914061504, "ts": 1716454224919799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224919832, "dur": 2, "args": { "External id": 218547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218547, "pid": 5, "tid": 7, "ts": 1716454224919832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919824, "dur": 7, "args": { "External id": 218547, "cbid": 211, "correlation": 218547 } }, { "ph": "s", "id": 218547, "pid": 76337, "tid": -914061504, "ts": 1716454224919824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224919869, "dur": 6, "args": { "External id": 218554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218554, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218554, "pid": 5, "tid": 7, "ts": 1716454224919869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919858, "dur": 12, "args": { "External id": 218554, "cbid": 211, "correlation": 218554 } }, { "ph": "s", "id": 218554, "pid": 76337, "tid": -914061504, "ts": 1716454224919858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224919882, "dur": 6, "args": { "External id": 218557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218557, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218557, "pid": 5, "tid": 7, "ts": 1716454224919882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919873, "dur": 7, "args": { "External id": 218557, "cbid": 211, "correlation": 218557 } }, { "ph": "s", "id": 218557, "pid": 76337, "tid": -914061504, "ts": 1716454224919873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454224919890, "dur": 3, "args": { "External id": 218559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218559, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218559, "pid": 5, "tid": 7, "ts": 1716454224919890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919882, "dur": 7, "args": { "External id": 218559, "cbid": 211, "correlation": 218559 } }, { "ph": "s", "id": 218559, "pid": 76337, "tid": -914061504, "ts": 1716454224919882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919910, "dur": 2, "args": { "External id": 218562, "device": 5, "context": 1, "stream": 7, "correlation": 218562, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 218562, "pid": 5, "tid": 7, "ts": 1716454224919910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919898, "dur": 12, "args": { "External id": 218562, "cbid": 41, "correlation": 218562 } }, { "ph": "s", "id": 218562, "pid": 76337, "tid": -914061504, "ts": 1716454224919898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224919962, "dur": 4, "args": { "External id": 218578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218578, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218578, "pid": 5, "tid": 7, "ts": 1716454224919962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224919949, "dur": 14, "args": { "External id": 218578, "cbid": 211, "correlation": 218578 } }, { "ph": "s", "id": 218578, "pid": 76337, "tid": -914061504, "ts": 1716454224919949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224919993, "dur": 3, "args": { "External id": 218583, "device": 5, "context": 1, "stream": 7, "correlation": 218583, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 218583, "pid": 5, "tid": 7, "ts": 1716454224919993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224919968, "dur": 25, "args": { "External id": 218583, "cbid": 41, "correlation": 218583 } }, { "ph": "s", "id": 218583, "pid": 76337, "tid": -914061504, "ts": 1716454224919968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224920021, "dur": 1, "args": { "External id": 218589, "device": 5, "context": 1, "stream": 7, "correlation": 218589, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 218589, "pid": 5, "tid": 7, "ts": 1716454224920021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224920003, "dur": 27, "args": { "External id": 218589, "cbid": 41, "correlation": 218589 } }, { "ph": "s", "id": 218589, "pid": 76337, "tid": -914061504, "ts": 1716454224920003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224920031, "dur": 3, "args": { "External id": 218590, "cbid": 131, "correlation": 218590 } }, { "ph": "f", "id": 218590, "pid": 76337, "tid": -914061504, "ts": 1716454224920031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920086, "dur": 3, "args": { "External id": 218598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218598, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218598, "pid": 5, "tid": 7, "ts": 1716454224920086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920072, "dur": 14, "args": { "External id": 218598, "cbid": 211, "correlation": 218598 } }, { "ph": "s", "id": 218598, "pid": 76337, "tid": -914061504, "ts": 1716454224920072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920117, "dur": 3, "args": { "External id": 218608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218608, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218608, "pid": 5, "tid": 7, "ts": 1716454224920117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920108, "dur": 9, "args": { "External id": 218608, "cbid": 211, "correlation": 218608 } }, { "ph": "s", "id": 218608, "pid": 76337, "tid": -914061504, "ts": 1716454224920108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920143, "dur": 3, "args": { "External id": 218617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218617, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218617, "pid": 5, "tid": 7, "ts": 1716454224920143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920134, "dur": 8, "args": { "External id": 218617, "cbid": 211, "correlation": 218617 } }, { "ph": "s", "id": 218617, "pid": 76337, "tid": -914061504, "ts": 1716454224920134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224920267, "dur": 12, "args": { "External id": 218627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218627, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218627, "pid": 5, "tid": 7, "ts": 1716454224920267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920251, "dur": 16, "args": { "External id": 218627, "cbid": 211, "correlation": 218627 } }, { "ph": "s", "id": 218627, "pid": 76337, "tid": -914061504, "ts": 1716454224920251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920306, "dur": 3, "args": { "External id": 218635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218635, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218635, "pid": 5, "tid": 7, "ts": 1716454224920306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920297, "dur": 8, "args": { "External id": 218635, "cbid": 211, "correlation": 218635 } }, { "ph": "s", "id": 218635, "pid": 76337, "tid": -914061504, "ts": 1716454224920297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224920353, "dur": 11, "args": { "External id": 218645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218645, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218645, "pid": 5, "tid": 7, "ts": 1716454224920353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920342, "dur": 12, "args": { "External id": 218645, "cbid": 211, "correlation": 218645 } }, { "ph": "s", "id": 218645, "pid": 76337, "tid": -914061504, "ts": 1716454224920342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224920387, "dur": 11, "args": { "External id": 218653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218653, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218653, "pid": 5, "tid": 7, "ts": 1716454224920387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920377, "dur": 9, "args": { "External id": 218653, "cbid": 211, "correlation": 218653 } }, { "ph": "s", "id": 218653, "pid": 76337, "tid": -914061504, "ts": 1716454224920377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920414, "dur": 3, "args": { "External id": 218662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218662, "pid": 5, "tid": 7, "ts": 1716454224920414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920404, "dur": 10, "args": { "External id": 218662, "cbid": 211, "correlation": 218662 } }, { "ph": "s", "id": 218662, "pid": 76337, "tid": -914061504, "ts": 1716454224920404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224920439, "dur": 5, "args": { "External id": 218671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218671, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218671, "pid": 5, "tid": 7, "ts": 1716454224920439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920430, "dur": 8, "args": { "External id": 218671, "cbid": 211, "correlation": 218671 } }, { "ph": "s", "id": 218671, "pid": 76337, "tid": -914061504, "ts": 1716454224920430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224920480, "dur": 8, "args": { "External id": 218681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218681, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218681, "pid": 5, "tid": 7, "ts": 1716454224920480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920469, "dur": 11, "args": { "External id": 218681, "cbid": 211, "correlation": 218681 } }, { "ph": "s", "id": 218681, "pid": 76337, "tid": -914061504, "ts": 1716454224920469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920823, "dur": 3, "args": { "External id": 218690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218690, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218690, "pid": 5, "tid": 7, "ts": 1716454224920823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920807, "dur": 16, "args": { "External id": 218690, "cbid": 211, "correlation": 218690 } }, { "ph": "s", "id": 218690, "pid": 76337, "tid": -914061504, "ts": 1716454224920807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224920861, "dur": 3, "args": { "External id": 218698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218698, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218698, "pid": 5, "tid": 7, "ts": 1716454224920861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920847, "dur": 14, "args": { "External id": 218698, "cbid": 211, "correlation": 218698 } }, { "ph": "s", "id": 218698, "pid": 76337, "tid": -914061504, "ts": 1716454224920847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224920913, "dur": 1, "args": { "External id": 218708, "device": 5, "context": 1, "stream": 7, "correlation": 218708, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 218708, "pid": 5, "tid": 7, "ts": 1716454224920913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224920898, "dur": 13, "args": { "External id": 218708, "cbid": 41, "correlation": 218708 } }, { "ph": "s", "id": 218708, "pid": 76337, "tid": -914061504, "ts": 1716454224920898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224920912, "dur": 8, "args": { "External id": 218709, "cbid": 131, "correlation": 218709 } }, { "ph": "f", "id": 218709, "pid": 76337, "tid": -914061504, "ts": 1716454224920912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921014, "dur": 2, "args": { "External id": 218717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218717, "pid": 5, "tid": 7, "ts": 1716454224921014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224920999, "dur": 14, "args": { "External id": 218717, "cbid": 211, "correlation": 218717 } }, { "ph": "s", "id": 218717, "pid": 76337, "tid": -914061504, "ts": 1716454224920999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921087, "dur": 3, "args": { "External id": 218726, "device": 5, "context": 1, "stream": 7, "correlation": 218726, "bytes": 8, "memory bandwidth (GB/s)": 0.002577319587628866 } }, { "ph": "f", "id": 218726, "pid": 5, "tid": 7, "ts": 1716454224921087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921069, "dur": 18, "args": { "External id": 218726, "cbid": 41, "correlation": 218726 } }, { "ph": "s", "id": 218726, "pid": 76337, "tid": -914061504, "ts": 1716454224921069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224921162, "dur": 3, "args": { "External id": 218736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218736, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 218736, "pid": 5, "tid": 7, "ts": 1716454224921162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921148, "dur": 13, "args": { "External id": 218736, "cbid": 211, "correlation": 218736 } }, { "ph": "s", "id": 218736, "pid": 76337, "tid": -914061504, "ts": 1716454224921148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921215, "dur": 1, "args": { "External id": 218746, "device": 5, "context": 1, "stream": 7, "correlation": 218746, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 218746, "pid": 5, "tid": 7, "ts": 1716454224921215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921201, "dur": 12, "args": { "External id": 218746, "cbid": 41, "correlation": 218746 } }, { "ph": "s", "id": 218746, "pid": 76337, "tid": -914061504, "ts": 1716454224921201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921214, "dur": 8, "args": { "External id": 218747, "cbid": 131, "correlation": 218747 } }, { "ph": "f", "id": 218747, "pid": 76337, "tid": -914061504, "ts": 1716454224921214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921280, "dur": 3, "args": { "External id": 218754, "device": 5, "context": 1, "stream": 7, "correlation": 218754, "bytes": 98304, "memory bandwidth (GB/s)": 31.670103092783506 } }, { "ph": "f", "id": 218754, "pid": 5, "tid": 7, "ts": 1716454224921280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921257, "dur": 23, "args": { "External id": 218754, "cbid": 41, "correlation": 218754 } }, { "ph": "s", "id": 218754, "pid": 76337, "tid": -914061504, "ts": 1716454224921257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921332, "dur": 1, "args": { "External id": 218765, "device": 5, "context": 1, "stream": 7, "correlation": 218765, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 218765, "pid": 5, "tid": 7, "ts": 1716454224921332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921317, "dur": 13, "args": { "External id": 218765, "cbid": 41, "correlation": 218765 } }, { "ph": "s", "id": 218765, "pid": 76337, "tid": -914061504, "ts": 1716454224921317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921331, "dur": 8, "args": { "External id": 218766, "cbid": 131, "correlation": 218766 } }, { "ph": "f", "id": 218766, "pid": 76337, "tid": -914061504, "ts": 1716454224921331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921382, "dur": 3, "args": { "External id": 218774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218774, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218774, "pid": 5, "tid": 7, "ts": 1716454224921382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921369, "dur": 13, "args": { "External id": 218774, "cbid": 211, "correlation": 218774 } }, { "ph": "s", "id": 218774, "pid": 76337, "tid": -914061504, "ts": 1716454224921369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921412, "dur": 3, "args": { "External id": 218784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218784, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218784, "pid": 5, "tid": 7, "ts": 1716454224921412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921403, "dur": 8, "args": { "External id": 218784, "cbid": 211, "correlation": 218784 } }, { "ph": "s", "id": 218784, "pid": 76337, "tid": -914061504, "ts": 1716454224921403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921434, "dur": 3, "args": { "External id": 218793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218793, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218793, "pid": 5, "tid": 7, "ts": 1716454224921434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921425, "dur": 7, "args": { "External id": 218793, "cbid": 211, "correlation": 218793 } }, { "ph": "s", "id": 218793, "pid": 76337, "tid": -914061504, "ts": 1716454224921425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224921504, "dur": 6, "args": { "External id": 218801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218801, "pid": 5, "tid": 7, "ts": 1716454224921504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921489, "dur": 15, "args": { "External id": 218801, "cbid": 211, "correlation": 218801 } }, { "ph": "s", "id": 218801, "pid": 76337, "tid": -914061504, "ts": 1716454224921489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921543, "dur": 3, "args": { "External id": 218810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218810, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218810, "pid": 5, "tid": 7, "ts": 1716454224921543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921533, "dur": 10, "args": { "External id": 218810, "cbid": 211, "correlation": 218810 } }, { "ph": "s", "id": 218810, "pid": 76337, "tid": -914061504, "ts": 1716454224921533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921567, "dur": 3, "args": { "External id": 218819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218819, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218819, "pid": 5, "tid": 7, "ts": 1716454224921567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921558, "dur": 7, "args": { "External id": 218819, "cbid": 211, "correlation": 218819 } }, { "ph": "s", "id": 218819, "pid": 76337, "tid": -914061504, "ts": 1716454224921558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224921629, "dur": 3, "args": { "External id": 218827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218827, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 218827, "pid": 5, "tid": 7, "ts": 1716454224921629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921618, "dur": 10, "args": { "External id": 218827, "cbid": 211, "correlation": 218827 } }, { "ph": "s", "id": 218827, "pid": 76337, "tid": -914061504, "ts": 1716454224921618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224921687, "dur": 1, "args": { "External id": 218835, "device": 5, "context": 1, "stream": 7, "correlation": 218835, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 218835, "pid": 5, "tid": 7, "ts": 1716454224921687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921671, "dur": 26, "args": { "External id": 218835, "cbid": 41, "correlation": 218835 } }, { "ph": "s", "id": 218835, "pid": 76337, "tid": -914061504, "ts": 1716454224921671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921697, "dur": 3, "args": { "External id": 218836, "cbid": 131, "correlation": 218836 } }, { "ph": "f", "id": 218836, "pid": 76337, "tid": -914061504, "ts": 1716454224921697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921758, "dur": 1, "args": { "External id": 218846, "device": 5, "context": 1, "stream": 7, "correlation": 218846, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 218846, "pid": 5, "tid": 7, "ts": 1716454224921758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921746, "dur": 10, "args": { "External id": 218846, "cbid": 41, "correlation": 218846 } }, { "ph": "s", "id": 218846, "pid": 76337, "tid": -914061504, "ts": 1716454224921746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921756, "dur": 8, "args": { "External id": 218847, "cbid": 131, "correlation": 218847 } }, { "ph": "f", "id": 218847, "pid": 76337, "tid": -914061504, "ts": 1716454224921756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224921814, "dur": 1, "args": { "External id": 218856, "device": 5, "context": 1, "stream": 7, "correlation": 218856, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 218856, "pid": 5, "tid": 7, "ts": 1716454224921814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921804, "dur": 8, "args": { "External id": 218856, "cbid": 41, "correlation": 218856 } }, { "ph": "s", "id": 218856, "pid": 76337, "tid": -914061504, "ts": 1716454224921804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921813, "dur": 8, "args": { "External id": 218857, "cbid": 131, "correlation": 218857 } }, { "ph": "f", "id": 218857, "pid": 76337, "tid": -914061504, "ts": 1716454224921813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224921886, "dur": 4, "args": { "External id": 218864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218864, "pid": 5, "tid": 7, "ts": 1716454224921886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921870, "dur": 18, "args": { "External id": 218864, "cbid": 211, "correlation": 218864 } }, { "ph": "s", "id": 218864, "pid": 76337, "tid": -914061504, "ts": 1716454224921870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454224921924, "dur": 4, "args": { "External id": 218884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218884, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218884, "pid": 5, "tid": 7, "ts": 1716454224921924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921912, "dur": 12, "args": { "External id": 218884, "cbid": 211, "correlation": 218884 } }, { "ph": "s", "id": 218884, "pid": 76337, "tid": -914061504, "ts": 1716454224921912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224921926, "dur": 0, "args": { "External id": 218885, "cbid": 11, "correlation": 218885 } }, { "ph": "f", "id": 218885, "pid": 76337, "tid": -914061504, "ts": 1716454224921926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224921926, "dur": 0, "args": { "External id": 218886, "cbid": 11, "correlation": 218886 } }, { "ph": "f", "id": 218886, "pid": 76337, "tid": -914061504, "ts": 1716454224921926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224921941, "dur": 1, "args": { "External id": 218889, "device": 5, "context": 1, "stream": 7, "correlation": 218889, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 218889, "pid": 5, "tid": 7, "ts": 1716454224921941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224921928, "dur": 22, "args": { "External id": 218889, "cbid": 41, "correlation": 218889 } }, { "ph": "s", "id": 218889, "pid": 76337, "tid": -914061504, "ts": 1716454224921928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224921951, "dur": 3, "args": { "External id": 218890, "cbid": 131, "correlation": 218890 } }, { "ph": "f", "id": 218890, "pid": 76337, "tid": -914061504, "ts": 1716454224921951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224921988, "dur": 3, "args": { "External id": 218914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218914, "pid": 5, "tid": 7, "ts": 1716454224921988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921969, "dur": 19, "args": { "External id": 218914, "cbid": 211, "correlation": 218914 } }, { "ph": "s", "id": 218914, "pid": 76337, "tid": -914061504, "ts": 1716454224921969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224921989, "dur": 0, "args": { "External id": 218915, "cbid": 11, "correlation": 218915 } }, { "ph": "f", "id": 218915, "pid": 76337, "tid": -914061504, "ts": 1716454224921989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224921989, "dur": 0, "args": { "External id": 218916, "cbid": 11, "correlation": 218916 } }, { "ph": "f", "id": 218916, "pid": 76337, "tid": -914061504, "ts": 1716454224921989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224921991, "dur": 1, "args": { "External id": 218918, "cbid": 200, "correlation": 218918 } }, { "ph": "f", "id": 218918, "pid": 76337, "tid": -914061504, "ts": 1716454224921991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454224922001, "dur": 4, "args": { "External id": 218920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218920, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218920, "pid": 5, "tid": 7, "ts": 1716454224922001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224921994, "dur": 8, "args": { "External id": 218920, "cbid": 211, "correlation": 218920 } }, { "ph": "s", "id": 218920, "pid": 76337, "tid": -914061504, "ts": 1716454224921994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224922003, "dur": 0, "args": { "External id": 218921, "cbid": 11, "correlation": 218921 } }, { "ph": "f", "id": 218921, "pid": 76337, "tid": -914061504, "ts": 1716454224922003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224922003, "dur": 0, "args": { "External id": 218922, "cbid": 11, "correlation": 218922 } }, { "ph": "f", "id": 218922, "pid": 76337, "tid": -914061504, "ts": 1716454224922003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224922042, "dur": 1, "args": { "External id": 218929, "device": 5, "context": 1, "stream": 7, "correlation": 218929, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 218929, "pid": 5, "tid": 7, "ts": 1716454224922042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224922030, "dur": 20, "args": { "External id": 218929, "cbid": 41, "correlation": 218929 } }, { "ph": "s", "id": 218929, "pid": 76337, "tid": -914061504, "ts": 1716454224922030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224922051, "dur": 3, "args": { "External id": 218930, "cbid": 131, "correlation": 218930 } }, { "ph": "f", "id": 218930, "pid": 76337, "tid": -914061504, "ts": 1716454224922051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224922101, "dur": 1, "args": { "External id": 218940, "device": 5, "context": 1, "stream": 7, "correlation": 218940, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 218940, "pid": 5, "tid": 7, "ts": 1716454224922101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224922090, "dur": 9, "args": { "External id": 218940, "cbid": 41, "correlation": 218940 } }, { "ph": "s", "id": 218940, "pid": 76337, "tid": -914061504, "ts": 1716454224922090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224922099, "dur": 8, "args": { "External id": 218941, "cbid": 131, "correlation": 218941 } }, { "ph": "f", "id": 218941, "pid": 76337, "tid": -914061504, "ts": 1716454224922099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224922177, "dur": 5, "args": { "External id": 218948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218948, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218948, "pid": 5, "tid": 7, "ts": 1716454224922177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922159, "dur": 18, "args": { "External id": 218948, "cbid": 211, "correlation": 218948 } }, { "ph": "s", "id": 218948, "pid": 76337, "tid": -914061504, "ts": 1716454224922159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922247, "dur": 3, "args": { "External id": 218957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218957, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218957, "pid": 5, "tid": 7, "ts": 1716454224922247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922234, "dur": 13, "args": { "External id": 218957, "cbid": 211, "correlation": 218957 } }, { "ph": "s", "id": 218957, "pid": 76337, "tid": -914061504, "ts": 1716454224922234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922284, "dur": 3, "args": { "External id": 218965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218965, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218965, "pid": 5, "tid": 7, "ts": 1716454224922284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922274, "dur": 9, "args": { "External id": 218965, "cbid": 211, "correlation": 218965 } }, { "ph": "s", "id": 218965, "pid": 76337, "tid": -914061504, "ts": 1716454224922274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922323, "dur": 4, "args": { "External id": 218973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218973, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218973, "pid": 5, "tid": 7, "ts": 1716454224922323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922306, "dur": 17, "args": { "External id": 218973, "cbid": 211, "correlation": 218973 } }, { "ph": "s", "id": 218973, "pid": 76337, "tid": -914061504, "ts": 1716454224922306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922353, "dur": 4, "args": { "External id": 218981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218981, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218981, "pid": 5, "tid": 7, "ts": 1716454224922353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922343, "dur": 9, "args": { "External id": 218981, "cbid": 211, "correlation": 218981 } }, { "ph": "s", "id": 218981, "pid": 76337, "tid": -914061504, "ts": 1716454224922343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922381, "dur": 3, "args": { "External id": 218989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218989, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218989, "pid": 5, "tid": 7, "ts": 1716454224922381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922372, "dur": 8, "args": { "External id": 218989, "cbid": 211, "correlation": 218989 } }, { "ph": "s", "id": 218989, "pid": 76337, "tid": -914061504, "ts": 1716454224922372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922406, "dur": 4, "args": { "External id": 218997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 218997, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 218997, "pid": 5, "tid": 7, "ts": 1716454224922406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922396, "dur": 9, "args": { "External id": 218997, "cbid": 211, "correlation": 218997 } }, { "ph": "s", "id": 218997, "pid": 76337, "tid": -914061504, "ts": 1716454224922396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224922428, "dur": 4, "args": { "External id": 219005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219005, "pid": 5, "tid": 7, "ts": 1716454224922428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922419, "dur": 7, "args": { "External id": 219005, "cbid": 211, "correlation": 219005 } }, { "ph": "s", "id": 219005, "pid": 76337, "tid": -914061504, "ts": 1716454224922419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224922446, "dur": 4, "args": { "External id": 219013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219013, "pid": 5, "tid": 7, "ts": 1716454224922446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922438, "dur": 7, "args": { "External id": 219013, "cbid": 211, "correlation": 219013 } }, { "ph": "s", "id": 219013, "pid": 76337, "tid": -914061504, "ts": 1716454224922438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922465, "dur": 3, "args": { "External id": 219021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219021, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219021, "pid": 5, "tid": 7, "ts": 1716454224922465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922458, "dur": 7, "args": { "External id": 219021, "cbid": 211, "correlation": 219021 } }, { "ph": "s", "id": 219021, "pid": 76337, "tid": -914061504, "ts": 1716454224922458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922524, "dur": 3, "args": { "External id": 219029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219029, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219029, "pid": 5, "tid": 7, "ts": 1716454224922524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922513, "dur": 11, "args": { "External id": 219029, "cbid": 211, "correlation": 219029 } }, { "ph": "s", "id": 219029, "pid": 76337, "tid": -914061504, "ts": 1716454224922513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224922550, "dur": 4, "args": { "External id": 219037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219037, "pid": 5, "tid": 7, "ts": 1716454224922550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922540, "dur": 9, "args": { "External id": 219037, "cbid": 211, "correlation": 219037 } }, { "ph": "s", "id": 219037, "pid": 76337, "tid": -914061504, "ts": 1716454224922540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224922573, "dur": 4, "args": { "External id": 219045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219045, "pid": 5, "tid": 7, "ts": 1716454224922573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922564, "dur": 8, "args": { "External id": 219045, "cbid": 211, "correlation": 219045 } }, { "ph": "s", "id": 219045, "pid": 76337, "tid": -914061504, "ts": 1716454224922564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224922591, "dur": 3, "args": { "External id": 219053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219053, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219053, "pid": 5, "tid": 7, "ts": 1716454224922591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224922583, "dur": 7, "args": { "External id": 219053, "cbid": 211, "correlation": 219053 } }, { "ph": "s", "id": 219053, "pid": 76337, "tid": -914061504, "ts": 1716454224922583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224923021, "dur": 5, "args": { "External id": 219062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219062, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219062, "pid": 5, "tid": 7, "ts": 1716454224923021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923005, "dur": 17, "args": { "External id": 219062, "cbid": 211, "correlation": 219062 } }, { "ph": "s", "id": 219062, "pid": 76337, "tid": -914061504, "ts": 1716454224923005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224923060, "dur": 5, "args": { "External id": 219071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219071, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219071, "pid": 5, "tid": 7, "ts": 1716454224923060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923049, "dur": 10, "args": { "External id": 219071, "cbid": 211, "correlation": 219071 } }, { "ph": "s", "id": 219071, "pid": 76337, "tid": -914061504, "ts": 1716454224923049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224923190, "dur": 3, "args": { "External id": 219087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219087, "pid": 5, "tid": 7, "ts": 1716454224923190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923174, "dur": 18, "args": { "External id": 219087, "cbid": 211, "correlation": 219087 } }, { "ph": "s", "id": 219087, "pid": 76337, "tid": -914061504, "ts": 1716454224923174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923226, "dur": 3, "args": { "External id": 219095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219095, "pid": 5, "tid": 7, "ts": 1716454224923226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923217, "dur": 9, "args": { "External id": 219095, "cbid": 211, "correlation": 219095 } }, { "ph": "s", "id": 219095, "pid": 76337, "tid": -914061504, "ts": 1716454224923217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923264, "dur": 3, "args": { "External id": 219103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219103, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219103, "pid": 5, "tid": 7, "ts": 1716454224923264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923251, "dur": 11, "args": { "External id": 219103, "cbid": 211, "correlation": 219103 } }, { "ph": "s", "id": 219103, "pid": 76337, "tid": -914061504, "ts": 1716454224923251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923300, "dur": 4, "args": { "External id": 219111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219111, "pid": 5, "tid": 7, "ts": 1716454224923300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923287, "dur": 13, "args": { "External id": 219111, "cbid": 211, "correlation": 219111 } }, { "ph": "s", "id": 219111, "pid": 76337, "tid": -914061504, "ts": 1716454224923287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224923357, "dur": 4, "args": { "External id": 219123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219123, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219123, "pid": 5, "tid": 7, "ts": 1716454224923357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923344, "dur": 13, "args": { "External id": 219123, "cbid": 211, "correlation": 219123 } }, { "ph": "s", "id": 219123, "pid": 76337, "tid": -914061504, "ts": 1716454224923344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224923403, "dur": 4, "args": { "External id": 219134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219134, "pid": 5, "tid": 7, "ts": 1716454224923403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923391, "dur": 11, "args": { "External id": 219134, "cbid": 211, "correlation": 219134 } }, { "ph": "s", "id": 219134, "pid": 76337, "tid": -914061504, "ts": 1716454224923391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923433, "dur": 3, "args": { "External id": 219142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219142, "pid": 5, "tid": 7, "ts": 1716454224923433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923424, "dur": 9, "args": { "External id": 219142, "cbid": 211, "correlation": 219142 } }, { "ph": "s", "id": 219142, "pid": 76337, "tid": -914061504, "ts": 1716454224923424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923467, "dur": 5, "args": { "External id": 219150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219150, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219150, "pid": 5, "tid": 7, "ts": 1716454224923467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923457, "dur": 10, "args": { "External id": 219150, "cbid": 211, "correlation": 219150 } }, { "ph": "s", "id": 219150, "pid": 76337, "tid": -914061504, "ts": 1716454224923457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923495, "dur": 5, "args": { "External id": 219158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219158, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219158, "pid": 5, "tid": 7, "ts": 1716454224923495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923486, "dur": 9, "args": { "External id": 219158, "cbid": 211, "correlation": 219158 } }, { "ph": "s", "id": 219158, "pid": 76337, "tid": -914061504, "ts": 1716454224923486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224923525, "dur": 4, "args": { "External id": 219167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219167, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219167, "pid": 5, "tid": 7, "ts": 1716454224923525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923515, "dur": 9, "args": { "External id": 219167, "cbid": 211, "correlation": 219167 } }, { "ph": "s", "id": 219167, "pid": 76337, "tid": -914061504, "ts": 1716454224923515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224923586, "dur": 4, "args": { "External id": 219180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219180, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219180, "pid": 5, "tid": 7, "ts": 1716454224923586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923573, "dur": 14, "args": { "External id": 219180, "cbid": 211, "correlation": 219180 } }, { "ph": "s", "id": 219180, "pid": 76337, "tid": -914061504, "ts": 1716454224923573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224923627, "dur": 5, "args": { "External id": 219190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219190, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219190, "pid": 5, "tid": 7, "ts": 1716454224923627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923615, "dur": 11, "args": { "External id": 219190, "cbid": 211, "correlation": 219190 } }, { "ph": "s", "id": 219190, "pid": 76337, "tid": -914061504, "ts": 1716454224923615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224923754, "dur": 5, "args": { "External id": 219207, "cbid": 251, "correlation": 219207 } }, { "ph": "f", "id": 219207, "pid": 76337, "tid": -914061504, "ts": 1716454224923754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454224923784, "dur": 11, "args": { "External id": 219209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219209, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219209, "pid": 5, "tid": 7, "ts": 1716454224923784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923769, "dur": 16, "args": { "External id": 219209, "cbid": 211, "correlation": 219209 } }, { "ph": "s", "id": 219209, "pid": 76337, "tid": -914061504, "ts": 1716454224923769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224923850, "dur": 3, "args": { "External id": 219217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219217, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219217, "pid": 5, "tid": 7, "ts": 1716454224923850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923837, "dur": 14, "args": { "External id": 219217, "cbid": 211, "correlation": 219217 } }, { "ph": "s", "id": 219217, "pid": 76337, "tid": -914061504, "ts": 1716454224923837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224923911, "dur": 2, "args": { "External id": 219233, "cbid": 251, "correlation": 219233 } }, { "ph": "f", "id": 219233, "pid": 76337, "tid": -914061504, "ts": 1716454224923911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224923917, "dur": 0, "args": { "External id": 219235, "cbid": 251, "correlation": 219235 } }, { "ph": "f", "id": 219235, "pid": 76337, "tid": -914061504, "ts": 1716454224923917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224923934, "dur": 13, "args": { "External id": 219236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219236, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219236, "pid": 5, "tid": 7, "ts": 1716454224923934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923920, "dur": 14, "args": { "External id": 219236, "cbid": 211, "correlation": 219236 } }, { "ph": "s", "id": 219236, "pid": 76337, "tid": -914061504, "ts": 1716454224923920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224923949, "dur": 5, "args": { "External id": 219238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219238, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219238, "pid": 5, "tid": 7, "ts": 1716454224923949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224923940, "dur": 8, "args": { "External id": 219238, "cbid": 211, "correlation": 219238 } }, { "ph": "s", "id": 219238, "pid": 76337, "tid": -914061504, "ts": 1716454224923940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224924060, "dur": 1, "args": { "External id": 219248, "cbid": 317, "correlation": 219248 } }, { "ph": "f", "id": 219248, "pid": 76337, "tid": -914061504, "ts": 1716454224924060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224924062, "dur": 1, "args": { "External id": 219249, "cbid": 203, "correlation": 219249 } }, { "ph": "f", "id": 219249, "pid": 76337, "tid": -914061504, "ts": 1716454224924062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224924064, "dur": 1, "args": { "External id": 219250, "cbid": 205, "correlation": 219250 } }, { "ph": "f", "id": 219250, "pid": 76337, "tid": -914061504, "ts": 1716454224924064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224924121, "dur": 7, "args": { "External id": 219254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219254, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219254, "pid": 5, "tid": 7, "ts": 1716454224924121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224924105, "dur": 15, "args": { "External id": 219254, "cbid": 211, "correlation": 219254 } }, { "ph": "s", "id": 219254, "pid": 76337, "tid": -914061504, "ts": 1716454224924105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224924131, "dur": 4, "args": { "External id": 219256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219256, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 219256, "pid": 5, "tid": 7, "ts": 1716454224924131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224924124, "dur": 6, "args": { "External id": 219256, "cbid": 211, "correlation": 219256 } }, { "ph": "s", "id": 219256, "pid": 76337, "tid": -914061504, "ts": 1716454224924124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224924152, "dur": 4, "args": { "External id": 219258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219258, "pid": 5, "tid": 7, "ts": 1716454224924152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224924142, "dur": 8, "args": { "External id": 219258, "cbid": 211, "correlation": 219258 } }, { "ph": "s", "id": 219258, "pid": 76337, "tid": -914061504, "ts": 1716454224924142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224924157, "dur": 0, "args": { "External id": 219259, "cbid": 51, "correlation": 219259 } }, { "ph": "s", "id": 219259, "pid": 76337, "tid": -914061504, "ts": 1716454224924157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224924168, "dur": 84, "args": { "External id": 219260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219260, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219260, "pid": 5, "tid": 7, "ts": 1716454224924168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224924158, "dur": 8, "args": { "External id": 219260, "cbid": 211, "correlation": 219260 } }, { "ph": "s", "id": 219260, "pid": 76337, "tid": -914061504, "ts": 1716454224924158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224924254, "dur": 60, "args": { "External id": 219265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219265, "pid": 5, "tid": 7, "ts": 1716454224924254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224924198, "dur": 10, "args": { "External id": 219265, "cbid": 211, "correlation": 219265 } }, { "ph": "s", "id": 219265, "pid": 76337, "tid": -914061504, "ts": 1716454224924198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224926047, "dur": 50, "args": { "External id": 219285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219285, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 219285, "pid": 5, "tid": 7, "ts": 1716454224926047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926028, "dur": 20, "args": { "External id": 219285, "cbid": 211, "correlation": 219285 } }, { "ph": "s", "id": 219285, "pid": 76337, "tid": -914061504, "ts": 1716454224926028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224926099, "dur": 5, "args": { "External id": 219297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219297, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219297, "pid": 5, "tid": 7, "ts": 1716454224926099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926062, "dur": 10, "args": { "External id": 219297, "cbid": 211, "correlation": 219297 } }, { "ph": "s", "id": 219297, "pid": 76337, "tid": -914061504, "ts": 1716454224926062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224926105, "dur": 57, "args": { "External id": 219300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219300, "pid": 5, "tid": 7, "ts": 1716454224926105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926092, "dur": 11, "args": { "External id": 219300, "cbid": 211, "correlation": 219300 } }, { "ph": "s", "id": 219300, "pid": 76337, "tid": -914061504, "ts": 1716454224926092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224926163, "dur": 37, "args": { "External id": 219309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219309, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219309, "pid": 5, "tid": 7, "ts": 1716454224926163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926141, "dur": 10, "args": { "External id": 219309, "cbid": 211, "correlation": 219309 } }, { "ph": "s", "id": 219309, "pid": 76337, "tid": -914061504, "ts": 1716454224926141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224926199, "dur": 0, "args": { "External id": 219319, "cbid": 317, "correlation": 219319 } }, { "ph": "f", "id": 219319, "pid": 76337, "tid": -914061504, "ts": 1716454224926199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224926200, "dur": 0, "args": { "External id": 219320, "cbid": 203, "correlation": 219320 } }, { "ph": "f", "id": 219320, "pid": 76337, "tid": -914061504, "ts": 1716454224926200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224926201, "dur": 0, "args": { "External id": 219321, "cbid": 205, "correlation": 219321 } }, { "ph": "f", "id": 219321, "pid": 76337, "tid": -914061504, "ts": 1716454224926201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224926234, "dur": 40, "args": { "External id": 219325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219325, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219325, "pid": 5, "tid": 7, "ts": 1716454224926234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926220, "dur": 13, "args": { "External id": 219325, "cbid": 211, "correlation": 219325 } }, { "ph": "s", "id": 219325, "pid": 76337, "tid": -914061504, "ts": 1716454224926220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224926275, "dur": 14, "args": { "External id": 219327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219327, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219327, "pid": 5, "tid": 7, "ts": 1716454224926275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926236, "dur": 6, "args": { "External id": 219327, "cbid": 211, "correlation": 219327 } }, { "ph": "s", "id": 219327, "pid": 76337, "tid": -914061504, "ts": 1716454224926236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224926291, "dur": 3, "args": { "External id": 219329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219329, "pid": 5, "tid": 7, "ts": 1716454224926291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926248, "dur": 8, "args": { "External id": 219329, "cbid": 211, "correlation": 219329 } }, { "ph": "s", "id": 219329, "pid": 76337, "tid": -914061504, "ts": 1716454224926248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224926260, "dur": 0, "args": { "External id": 219330, "cbid": 51, "correlation": 219330 } }, { "ph": "s", "id": 219330, "pid": 76337, "tid": -914061504, "ts": 1716454224926260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224926296, "dur": 707, "args": { "External id": 219331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219331, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219331, "pid": 5, "tid": 7, "ts": 1716454224926296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926261, "dur": 7, "args": { "External id": 219331, "cbid": 211, "correlation": 219331 } }, { "ph": "s", "id": 219331, "pid": 76337, "tid": -914061504, "ts": 1716454224926261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224927004, "dur": 59, "args": { "External id": 219336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219336, "pid": 5, "tid": 7, "ts": 1716454224927004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926292, "dur": 8, "args": { "External id": 219336, "cbid": 211, "correlation": 219336 } }, { "ph": "s", "id": 219336, "pid": 76337, "tid": -914061504, "ts": 1716454224926292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224927064, "dur": 4, "args": { "External id": 219344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219344, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219344, "pid": 5, "tid": 7, "ts": 1716454224927064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926336, "dur": 9, "args": { "External id": 219344, "cbid": 211, "correlation": 219344 } }, { "ph": "s", "id": 219344, "pid": 76337, "tid": -914061504, "ts": 1716454224926336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224926401, "dur": 1, "args": { "External id": 219360, "cbid": 251, "correlation": 219360 } }, { "ph": "f", "id": 219360, "pid": 76337, "tid": -914061504, "ts": 1716454224926401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224926407, "dur": 0, "args": { "External id": 219362, "cbid": 251, "correlation": 219362 } }, { "ph": "f", "id": 219362, "pid": 76337, "tid": -914061504, "ts": 1716454224926407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224927069, "dur": 9, "args": { "External id": 219363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219363, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 219363, "pid": 5, "tid": 7, "ts": 1716454224927069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926409, "dur": 12, "args": { "External id": 219363, "cbid": 211, "correlation": 219363 } }, { "ph": "s", "id": 219363, "pid": 76337, "tid": -914061504, "ts": 1716454224926409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224927079, "dur": 4, "args": { "External id": 219365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219365, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 219365, "pid": 5, "tid": 7, "ts": 1716454224927079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926422, "dur": 8, "args": { "External id": 219365, "cbid": 211, "correlation": 219365 } }, { "ph": "s", "id": 219365, "pid": 76337, "tid": -914061504, "ts": 1716454224926422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224927084, "dur": 55, "args": { "External id": 219375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219375, "pid": 5, "tid": 7, "ts": 1716454224927084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926485, "dur": 12, "args": { "External id": 219375, "cbid": 211, "correlation": 219375 } }, { "ph": "s", "id": 219375, "pid": 76337, "tid": -914061504, "ts": 1716454224926485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224927140, "dur": 52, "args": { "External id": 219395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219395, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 219395, "pid": 5, "tid": 7, "ts": 1716454224927140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926551, "dur": 10, "args": { "External id": 219395, "cbid": 211, "correlation": 219395 } }, { "ph": "s", "id": 219395, "pid": 76337, "tid": -914061504, "ts": 1716454224926551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224927194, "dur": 4, "args": { "External id": 219407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219407, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219407, "pid": 5, "tid": 7, "ts": 1716454224927194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926571, "dur": 7, "args": { "External id": 219407, "cbid": 211, "correlation": 219407 } }, { "ph": "s", "id": 219407, "pid": 76337, "tid": -914061504, "ts": 1716454224926571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224927199, "dur": 56, "args": { "External id": 219410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219410, "pid": 5, "tid": 7, "ts": 1716454224927199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926591, "dur": 6, "args": { "External id": 219410, "cbid": 211, "correlation": 219410 } }, { "ph": "s", "id": 219410, "pid": 76337, "tid": -914061504, "ts": 1716454224926591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224927256, "dur": 37, "args": { "External id": 219419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219419, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219419, "pid": 5, "tid": 7, "ts": 1716454224927256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926631, "dur": 9, "args": { "External id": 219419, "cbid": 211, "correlation": 219419 } }, { "ph": "s", "id": 219419, "pid": 76337, "tid": -914061504, "ts": 1716454224926631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224926699, "dur": 0, "args": { "External id": 219429, "cbid": 317, "correlation": 219429 } }, { "ph": "f", "id": 219429, "pid": 76337, "tid": -914061504, "ts": 1716454224926699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224926700, "dur": 0, "args": { "External id": 219430, "cbid": 203, "correlation": 219430 } }, { "ph": "f", "id": 219430, "pid": 76337, "tid": -914061504, "ts": 1716454224926700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224926701, "dur": 0, "args": { "External id": 219431, "cbid": 205, "correlation": 219431 } }, { "ph": "f", "id": 219431, "pid": 76337, "tid": -914061504, "ts": 1716454224926701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224927294, "dur": 40, "args": { "External id": 219435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219435, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219435, "pid": 5, "tid": 7, "ts": 1716454224927294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926715, "dur": 12, "args": { "External id": 219435, "cbid": 211, "correlation": 219435 } }, { "ph": "s", "id": 219435, "pid": 76337, "tid": -914061504, "ts": 1716454224926715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224927336, "dur": 14, "args": { "External id": 219437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219437, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219437, "pid": 5, "tid": 7, "ts": 1716454224927336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926730, "dur": 5, "args": { "External id": 219437, "cbid": 211, "correlation": 219437 } }, { "ph": "s", "id": 219437, "pid": 76337, "tid": -914061504, "ts": 1716454224926730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224927351, "dur": 3, "args": { "External id": 219439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219439, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219439, "pid": 5, "tid": 7, "ts": 1716454224927351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926739, "dur": 6, "args": { "External id": 219439, "cbid": 211, "correlation": 219439 } }, { "ph": "s", "id": 219439, "pid": 76337, "tid": -914061504, "ts": 1716454224926739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224926748, "dur": 0, "args": { "External id": 219440, "cbid": 51, "correlation": 219440 } }, { "ph": "s", "id": 219440, "pid": 76337, "tid": -914061504, "ts": 1716454224926748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224927356, "dur": 698, "args": { "External id": 219441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219441, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219441, "pid": 5, "tid": 7, "ts": 1716454224927356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926749, "dur": 5, "args": { "External id": 219441, "cbid": 211, "correlation": 219441 } }, { "ph": "s", "id": 219441, "pid": 76337, "tid": -914061504, "ts": 1716454224926749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224928056, "dur": 60, "args": { "External id": 219446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219446, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219446, "pid": 5, "tid": 7, "ts": 1716454224928056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926777, "dur": 8, "args": { "External id": 219446, "cbid": 211, "correlation": 219446 } }, { "ph": "s", "id": 219446, "pid": 76337, "tid": -914061504, "ts": 1716454224926777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224928117, "dur": 51, "args": { "External id": 219454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219454, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219454, "pid": 5, "tid": 7, "ts": 1716454224928117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926809, "dur": 8, "args": { "External id": 219454, "cbid": 211, "correlation": 219454 } }, { "ph": "s", "id": 219454, "pid": 76337, "tid": -914061504, "ts": 1716454224926809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224928169, "dur": 35, "args": { "External id": 219462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219462, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219462, "pid": 5, "tid": 7, "ts": 1716454224928169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926840, "dur": 9, "args": { "External id": 219462, "cbid": 211, "correlation": 219462 } }, { "ph": "s", "id": 219462, "pid": 76337, "tid": -914061504, "ts": 1716454224926840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224928205, "dur": 52, "args": { "External id": 219482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219482, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 219482, "pid": 5, "tid": 7, "ts": 1716454224928205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926921, "dur": 13, "args": { "External id": 219482, "cbid": 211, "correlation": 219482 } }, { "ph": "s", "id": 219482, "pid": 76337, "tid": -914061504, "ts": 1716454224926921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224928259, "dur": 4, "args": { "External id": 219494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219494, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219494, "pid": 5, "tid": 7, "ts": 1716454224928259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926944, "dur": 6, "args": { "External id": 219494, "cbid": 211, "correlation": 219494 } }, { "ph": "s", "id": 219494, "pid": 76337, "tid": -914061504, "ts": 1716454224926944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224928264, "dur": 57, "args": { "External id": 219497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219497, "pid": 5, "tid": 7, "ts": 1716454224928264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224926962, "dur": 6, "args": { "External id": 219497, "cbid": 211, "correlation": 219497 } }, { "ph": "s", "id": 219497, "pid": 76337, "tid": -914061504, "ts": 1716454224926962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224927029, "dur": 0, "args": { "External id": 219508, "cbid": 317, "correlation": 219508 } }, { "ph": "f", "id": 219508, "pid": 76337, "tid": -914061504, "ts": 1716454224927029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224927030, "dur": 0, "args": { "External id": 219509, "cbid": 203, "correlation": 219509 } }, { "ph": "f", "id": 219509, "pid": 76337, "tid": -914061504, "ts": 1716454224927030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224927030, "dur": 0, "args": { "External id": 219510, "cbid": 205, "correlation": 219510 } }, { "ph": "f", "id": 219510, "pid": 76337, "tid": -914061504, "ts": 1716454224927030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927068, "dur": 3, "args": { "External id": 219514, "cbid": 251, "correlation": 219514 } }, { "ph": "f", "id": 219514, "pid": 76337, "tid": -914061504, "ts": 1716454224927068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927072, "dur": 1, "args": { "External id": 219515, "cbid": 251, "correlation": 219515 } }, { "ph": "f", "id": 219515, "pid": 76337, "tid": -914061504, "ts": 1716454224927072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927074, "dur": 1, "args": { "External id": 219516, "cbid": 251, "correlation": 219516 } }, { "ph": "f", "id": 219516, "pid": 76337, "tid": -914061504, "ts": 1716454224927074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927076, "dur": 1, "args": { "External id": 219517, "cbid": 251, "correlation": 219517 } }, { "ph": "f", "id": 219517, "pid": 76337, "tid": -914061504, "ts": 1716454224927076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927078, "dur": 1, "args": { "External id": 219518, "cbid": 251, "correlation": 219518 } }, { "ph": "f", "id": 219518, "pid": 76337, "tid": -914061504, "ts": 1716454224927078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927080, "dur": 1, "args": { "External id": 219519, "cbid": 251, "correlation": 219519 } }, { "ph": "f", "id": 219519, "pid": 76337, "tid": -914061504, "ts": 1716454224927080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927082, "dur": 1, "args": { "External id": 219520, "cbid": 251, "correlation": 219520 } }, { "ph": "f", "id": 219520, "pid": 76337, "tid": -914061504, "ts": 1716454224927082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927084, "dur": 1, "args": { "External id": 219521, "cbid": 251, "correlation": 219521 } }, { "ph": "f", "id": 219521, "pid": 76337, "tid": -914061504, "ts": 1716454224927084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927087, "dur": 0, "args": { "External id": 219522, "cbid": 251, "correlation": 219522 } }, { "ph": "f", "id": 219522, "pid": 76337, "tid": -914061504, "ts": 1716454224927087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224928322, "dur": 115, "args": { "External id": 219523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219523, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 219523, "pid": 5, "tid": 7, "ts": 1716454224928322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927091, "dur": 14, "args": { "External id": 219523, "cbid": 211, "correlation": 219523 } }, { "ph": "s", "id": 219523, "pid": 76337, "tid": -914061504, "ts": 1716454224927091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224928439, "dur": 60, "args": { "External id": 219529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219529, "pid": 5, "tid": 7, "ts": 1716454224928439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927129, "dur": 8, "args": { "External id": 219529, "cbid": 211, "correlation": 219529 } }, { "ph": "s", "id": 219529, "pid": 76337, "tid": -914061504, "ts": 1716454224927129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224928500, "dur": 498, "args": { "External id": 219538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219538, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219538, "pid": 5, "tid": 7, "ts": 1716454224928500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927224, "dur": 16, "args": { "External id": 219538, "cbid": 211, "correlation": 219538 } }, { "ph": "s", "id": 219538, "pid": 76337, "tid": -914061504, "ts": 1716454224927224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224928999, "dur": 182, "args": { "External id": 219560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219560, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219560, "pid": 5, "tid": 7, "ts": 1716454224928999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927302, "dur": 13, "args": { "External id": 219560, "cbid": 211, "correlation": 219560 } }, { "ph": "s", "id": 219560, "pid": 76337, "tid": -914061504, "ts": 1716454224927302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927420, "dur": 2, "args": { "External id": 219571, "cbid": 251, "correlation": 219571 } }, { "ph": "f", "id": 219571, "pid": 76337, "tid": -914061504, "ts": 1716454224927420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224929183, "dur": 199, "args": { "External id": 219572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219572, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219572, "pid": 5, "tid": 7, "ts": 1716454224929183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927426, "dur": 15, "args": { "External id": 219572, "cbid": 211, "correlation": 219572 } }, { "ph": "s", "id": 219572, "pid": 76337, "tid": -914061504, "ts": 1716454224927426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927501, "dur": 1, "args": { "External id": 219583, "cbid": 251, "correlation": 219583 } }, { "ph": "f", "id": 219583, "pid": 76337, "tid": -914061504, "ts": 1716454224927501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224929383, "dur": 191, "args": { "External id": 219584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219584, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219584, "pid": 5, "tid": 7, "ts": 1716454224929383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927505, "dur": 12, "args": { "External id": 219584, "cbid": 211, "correlation": 219584 } }, { "ph": "s", "id": 219584, "pid": 76337, "tid": -914061504, "ts": 1716454224927505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927572, "dur": 1, "args": { "External id": 219595, "cbid": 251, "correlation": 219595 } }, { "ph": "f", "id": 219595, "pid": 76337, "tid": -914061504, "ts": 1716454224927572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224929575, "dur": 190, "args": { "External id": 219596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219596, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219596, "pid": 5, "tid": 7, "ts": 1716454224929575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927576, "dur": 11, "args": { "External id": 219596, "cbid": 211, "correlation": 219596 } }, { "ph": "s", "id": 219596, "pid": 76337, "tid": -914061504, "ts": 1716454224927576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224929766, "dur": 18784, "args": { "External id": 219617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219617, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 219617, "pid": 5, "tid": 7, "ts": 1716454224929766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927684, "dur": 14, "args": { "External id": 219617, "cbid": 211, "correlation": 219617 } }, { "ph": "s", "id": 219617, "pid": 76337, "tid": -914061504, "ts": 1716454224927684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224927798, "dur": 2, "args": { "External id": 219635, "cbid": 251, "correlation": 219635 } }, { "ph": "f", "id": 219635, "pid": 76337, "tid": -914061504, "ts": 1716454224927798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224948552, "dur": 205, "args": { "External id": 219637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219637, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219637, "pid": 5, "tid": 7, "ts": 1716454224948552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927805, "dur": 13, "args": { "External id": 219637, "cbid": 211, "correlation": 219637 } }, { "ph": "s", "id": 219637, "pid": 76337, "tid": -914061504, "ts": 1716454224927805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224948758, "dur": 66, "args": { "External id": 219645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219645, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219645, "pid": 5, "tid": 7, "ts": 1716454224948758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927877, "dur": 12, "args": { "External id": 219645, "cbid": 211, "correlation": 219645 } }, { "ph": "s", "id": 219645, "pid": 76337, "tid": -914061504, "ts": 1716454224927877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224948826, "dur": 97, "args": { "External id": 219653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219653, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219653, "pid": 5, "tid": 7, "ts": 1716454224948826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224927917, "dur": 9, "args": { "External id": 219653, "cbid": 211, "correlation": 219653 } }, { "ph": "s", "id": 219653, "pid": 76337, "tid": -914061504, "ts": 1716454224927917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224948924, "dur": 54, "args": { "External id": 219664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219664, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219664, "pid": 5, "tid": 7, "ts": 1716454224948924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928011, "dur": 15, "args": { "External id": 219664, "cbid": 211, "correlation": 219664 } }, { "ph": "s", "id": 219664, "pid": 76337, "tid": -914061504, "ts": 1716454224928011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224948979, "dur": 93, "args": { "External id": 219686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219686, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219686, "pid": 5, "tid": 7, "ts": 1716454224948979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928046, "dur": 8, "args": { "External id": 219686, "cbid": 211, "correlation": 219686 } }, { "ph": "s", "id": 219686, "pid": 76337, "tid": -914061504, "ts": 1716454224928046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928130, "dur": 1, "args": { "External id": 219697, "cbid": 251, "correlation": 219697 } }, { "ph": "f", "id": 219697, "pid": 76337, "tid": -914061504, "ts": 1716454224928130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224949073, "dur": 103, "args": { "External id": 219698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219698, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219698, "pid": 5, "tid": 7, "ts": 1716454224949073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928135, "dur": 13, "args": { "External id": 219698, "cbid": 211, "correlation": 219698 } }, { "ph": "s", "id": 219698, "pid": 76337, "tid": -914061504, "ts": 1716454224928135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928218, "dur": 1, "args": { "External id": 219709, "cbid": 251, "correlation": 219709 } }, { "ph": "f", "id": 219709, "pid": 76337, "tid": -914061504, "ts": 1716454224928218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928222, "dur": 0, "args": { "External id": 219710, "cbid": 251, "correlation": 219710 } }, { "ph": "f", "id": 219710, "pid": 76337, "tid": -914061504, "ts": 1716454224928222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224949177, "dur": 10, "args": { "External id": 219711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219711, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219711, "pid": 5, "tid": 7, "ts": 1716454224949177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928224, "dur": 13, "args": { "External id": 219711, "cbid": 211, "correlation": 219711 } }, { "ph": "s", "id": 219711, "pid": 76337, "tid": -914061504, "ts": 1716454224928224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224949188, "dur": 5, "args": { "External id": 219713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219713, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 219713, "pid": 5, "tid": 7, "ts": 1716454224949188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928241, "dur": 8, "args": { "External id": 219713, "cbid": 211, "correlation": 219713 } }, { "ph": "s", "id": 219713, "pid": 76337, "tid": -914061504, "ts": 1716454224928241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928305, "dur": 1, "args": { "External id": 219724, "cbid": 251, "correlation": 219724 } }, { "ph": "f", "id": 219724, "pid": 76337, "tid": -914061504, "ts": 1716454224928305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928308, "dur": 0, "args": { "External id": 219725, "cbid": 251, "correlation": 219725 } }, { "ph": "f", "id": 219725, "pid": 76337, "tid": -914061504, "ts": 1716454224928308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224949194, "dur": 6, "args": { "External id": 219726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219726, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 219726, "pid": 5, "tid": 7, "ts": 1716454224949194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928310, "dur": 13, "args": { "External id": 219726, "cbid": 211, "correlation": 219726 } }, { "ph": "s", "id": 219726, "pid": 76337, "tid": -914061504, "ts": 1716454224928310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224949202, "dur": 4, "args": { "External id": 219728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219728, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 219728, "pid": 5, "tid": 7, "ts": 1716454224949202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928324, "dur": 5, "args": { "External id": 219728, "cbid": 211, "correlation": 219728 } }, { "ph": "s", "id": 219728, "pid": 76337, "tid": -914061504, "ts": 1716454224928324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224949207, "dur": 155, "args": { "External id": 219749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219749, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 219749, "pid": 5, "tid": 7, "ts": 1716454224949207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928400, "dur": 12, "args": { "External id": 219749, "cbid": 211, "correlation": 219749 } }, { "ph": "s", "id": 219749, "pid": 76337, "tid": -914061504, "ts": 1716454224928400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928495, "dur": 2, "args": { "External id": 219767, "cbid": 251, "correlation": 219767 } }, { "ph": "f", "id": 219767, "pid": 76337, "tid": -914061504, "ts": 1716454224928495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224949363, "dur": 107, "args": { "External id": 219769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219769, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 219769, "pid": 5, "tid": 7, "ts": 1716454224949363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928502, "dur": 14, "args": { "External id": 219769, "cbid": 211, "correlation": 219769 } }, { "ph": "s", "id": 219769, "pid": 76337, "tid": -914061504, "ts": 1716454224928502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224949471, "dur": 34, "args": { "External id": 219777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219777, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219777, "pid": 5, "tid": 7, "ts": 1716454224949471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928573, "dur": 12, "args": { "External id": 219777, "cbid": 211, "correlation": 219777 } }, { "ph": "s", "id": 219777, "pid": 76337, "tid": -914061504, "ts": 1716454224928573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224949507, "dur": 68, "args": { "External id": 219785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219785, "pid": 5, "tid": 7, "ts": 1716454224949507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928614, "dur": 9, "args": { "External id": 219785, "cbid": 211, "correlation": 219785 } }, { "ph": "s", "id": 219785, "pid": 76337, "tid": -914061504, "ts": 1716454224928614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224949576, "dur": 93, "args": { "External id": 219807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219807, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219807, "pid": 5, "tid": 7, "ts": 1716454224949576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928666, "dur": 10, "args": { "External id": 219807, "cbid": 211, "correlation": 219807 } }, { "ph": "s", "id": 219807, "pid": 76337, "tid": -914061504, "ts": 1716454224928666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928756, "dur": 1, "args": { "External id": 219823, "cbid": 251, "correlation": 219823 } }, { "ph": "f", "id": 219823, "pid": 76337, "tid": -914061504, "ts": 1716454224928756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224949670, "dur": 584, "args": { "External id": 219825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219825, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219825, "pid": 5, "tid": 7, "ts": 1716454224949670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928761, "dur": 12, "args": { "External id": 219825, "cbid": 211, "correlation": 219825 } }, { "ph": "s", "id": 219825, "pid": 76337, "tid": -914061504, "ts": 1716454224928761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224950255, "dur": 245, "args": { "External id": 219833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219833, "pid": 5, "tid": 7, "ts": 1716454224950255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928843, "dur": 19, "args": { "External id": 219833, "cbid": 211, "correlation": 219833 } }, { "ph": "s", "id": 219833, "pid": 76337, "tid": -914061504, "ts": 1716454224928843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224950501, "dur": 249, "args": { "External id": 219841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219841, "pid": 5, "tid": 7, "ts": 1716454224950501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928885, "dur": 10, "args": { "External id": 219841, "cbid": 211, "correlation": 219841 } }, { "ph": "s", "id": 219841, "pid": 76337, "tid": -914061504, "ts": 1716454224928885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928971, "dur": 1, "args": { "External id": 219857, "cbid": 251, "correlation": 219857 } }, { "ph": "f", "id": 219857, "pid": 76337, "tid": -914061504, "ts": 1716454224928971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224928983, "dur": 0, "args": { "External id": 219859, "cbid": 251, "correlation": 219859 } }, { "ph": "f", "id": 219859, "pid": 76337, "tid": -914061504, "ts": 1716454224928983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224950752, "dur": 359, "args": { "External id": 219860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219860, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 219860, "pid": 5, "tid": 7, "ts": 1716454224950752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224928988, "dur": 14, "args": { "External id": 219860, "cbid": 211, "correlation": 219860 } }, { "ph": "s", "id": 219860, "pid": 76337, "tid": -914061504, "ts": 1716454224928988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224951112, "dur": 50, "args": { "External id": 219868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219868, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219868, "pid": 5, "tid": 7, "ts": 1716454224951112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929032, "dur": 11, "args": { "External id": 219868, "cbid": 211, "correlation": 219868 } }, { "ph": "s", "id": 219868, "pid": 76337, "tid": -914061504, "ts": 1716454224929032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224951164, "dur": 157, "args": { "External id": 219879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219879, "pid": 5, "tid": 7, "ts": 1716454224951164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929105, "dur": 12, "args": { "External id": 219879, "cbid": 211, "correlation": 219879 } }, { "ph": "s", "id": 219879, "pid": 76337, "tid": -914061504, "ts": 1716454224929105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224929169, "dur": 0, "args": { "External id": 219891, "cbid": 317, "correlation": 219891 } }, { "ph": "f", "id": 219891, "pid": 76337, "tid": -914061504, "ts": 1716454224929169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224929170, "dur": 0, "args": { "External id": 219892, "cbid": 203, "correlation": 219892 } }, { "ph": "f", "id": 219892, "pid": 76337, "tid": -914061504, "ts": 1716454224929170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224929171, "dur": 0, "args": { "External id": 219893, "cbid": 205, "correlation": 219893 } }, { "ph": "f", "id": 219893, "pid": 76337, "tid": -914061504, "ts": 1716454224929171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929198, "dur": 1, "args": { "External id": 219897, "cbid": 251, "correlation": 219897 } }, { "ph": "f", "id": 219897, "pid": 76337, "tid": -914061504, "ts": 1716454224929198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929200, "dur": 0, "args": { "External id": 219898, "cbid": 251, "correlation": 219898 } }, { "ph": "f", "id": 219898, "pid": 76337, "tid": -914061504, "ts": 1716454224929200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929200, "dur": 0, "args": { "External id": 219899, "cbid": 251, "correlation": 219899 } }, { "ph": "f", "id": 219899, "pid": 76337, "tid": -914061504, "ts": 1716454224929200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929201, "dur": 0, "args": { "External id": 219900, "cbid": 251, "correlation": 219900 } }, { "ph": "f", "id": 219900, "pid": 76337, "tid": -914061504, "ts": 1716454224929201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929202, "dur": 0, "args": { "External id": 219901, "cbid": 251, "correlation": 219901 } }, { "ph": "f", "id": 219901, "pid": 76337, "tid": -914061504, "ts": 1716454224929202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929203, "dur": 0, "args": { "External id": 219902, "cbid": 251, "correlation": 219902 } }, { "ph": "f", "id": 219902, "pid": 76337, "tid": -914061504, "ts": 1716454224929203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929204, "dur": 0, "args": { "External id": 219903, "cbid": 251, "correlation": 219903 } }, { "ph": "f", "id": 219903, "pid": 76337, "tid": -914061504, "ts": 1716454224929204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929205, "dur": 0, "args": { "External id": 219904, "cbid": 251, "correlation": 219904 } }, { "ph": "f", "id": 219904, "pid": 76337, "tid": -914061504, "ts": 1716454224929205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929206, "dur": 0, "args": { "External id": 219905, "cbid": 251, "correlation": 219905 } }, { "ph": "f", "id": 219905, "pid": 76337, "tid": -914061504, "ts": 1716454224929206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224951323, "dur": 115, "args": { "External id": 219906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219906, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 219906, "pid": 5, "tid": 7, "ts": 1716454224951323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929208, "dur": 12, "args": { "External id": 219906, "cbid": 211, "correlation": 219906 } }, { "ph": "s", "id": 219906, "pid": 76337, "tid": -914061504, "ts": 1716454224929208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224951439, "dur": 60, "args": { "External id": 219912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219912, "pid": 5, "tid": 7, "ts": 1716454224951439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929243, "dur": 9, "args": { "External id": 219912, "cbid": 211, "correlation": 219912 } }, { "ph": "s", "id": 219912, "pid": 76337, "tid": -914061504, "ts": 1716454224929243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224951500, "dur": 50, "args": { "External id": 219920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219920, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219920, "pid": 5, "tid": 7, "ts": 1716454224951500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929275, "dur": 8, "args": { "External id": 219920, "cbid": 211, "correlation": 219920 } }, { "ph": "s", "id": 219920, "pid": 76337, "tid": -914061504, "ts": 1716454224929275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224951551, "dur": 53, "args": { "External id": 219940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219940, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 219940, "pid": 5, "tid": 7, "ts": 1716454224951551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929349, "dur": 11, "args": { "External id": 219940, "cbid": 211, "correlation": 219940 } }, { "ph": "s", "id": 219940, "pid": 76337, "tid": -914061504, "ts": 1716454224929349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224951606, "dur": 5, "args": { "External id": 219952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219952, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219952, "pid": 5, "tid": 7, "ts": 1716454224951606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929370, "dur": 6, "args": { "External id": 219952, "cbid": 211, "correlation": 219952 } }, { "ph": "s", "id": 219952, "pid": 76337, "tid": -914061504, "ts": 1716454224929370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224951612, "dur": 57, "args": { "External id": 219955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219955, "pid": 5, "tid": 7, "ts": 1716454224951612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929390, "dur": 6, "args": { "External id": 219955, "cbid": 211, "correlation": 219955 } }, { "ph": "s", "id": 219955, "pid": 76337, "tid": -914061504, "ts": 1716454224929390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224951670, "dur": 37, "args": { "External id": 219964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219964, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219964, "pid": 5, "tid": 7, "ts": 1716454224951670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929428, "dur": 11, "args": { "External id": 219964, "cbid": 211, "correlation": 219964 } }, { "ph": "s", "id": 219964, "pid": 76337, "tid": -914061504, "ts": 1716454224929428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224929481, "dur": 0, "args": { "External id": 219974, "cbid": 317, "correlation": 219974 } }, { "ph": "f", "id": 219974, "pid": 76337, "tid": -914061504, "ts": 1716454224929481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224929482, "dur": 0, "args": { "External id": 219975, "cbid": 203, "correlation": 219975 } }, { "ph": "f", "id": 219975, "pid": 76337, "tid": -914061504, "ts": 1716454224929482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224929482, "dur": 0, "args": { "External id": 219976, "cbid": 205, "correlation": 219976 } }, { "ph": "f", "id": 219976, "pid": 76337, "tid": -914061504, "ts": 1716454224929482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224951708, "dur": 41, "args": { "External id": 219980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219980, "pid": 5, "tid": 7, "ts": 1716454224951708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929500, "dur": 12, "args": { "External id": 219980, "cbid": 211, "correlation": 219980 } }, { "ph": "s", "id": 219980, "pid": 76337, "tid": -914061504, "ts": 1716454224929500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224951751, "dur": 14, "args": { "External id": 219982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219982, "pid": 5, "tid": 7, "ts": 1716454224951751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929514, "dur": 7, "args": { "External id": 219982, "cbid": 211, "correlation": 219982 } }, { "ph": "s", "id": 219982, "pid": 76337, "tid": -914061504, "ts": 1716454224929514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224951767, "dur": 4, "args": { "External id": 219984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219984, "pid": 5, "tid": 7, "ts": 1716454224951767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929525, "dur": 5, "args": { "External id": 219984, "cbid": 211, "correlation": 219984 } }, { "ph": "s", "id": 219984, "pid": 76337, "tid": -914061504, "ts": 1716454224929525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224929533, "dur": 0, "args": { "External id": 219985, "cbid": 51, "correlation": 219985 } }, { "ph": "s", "id": 219985, "pid": 76337, "tid": -914061504, "ts": 1716454224929533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224951772, "dur": 705, "args": { "External id": 219986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219986, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 219986, "pid": 5, "tid": 7, "ts": 1716454224951772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929534, "dur": 6, "args": { "External id": 219986, "cbid": 211, "correlation": 219986 } }, { "ph": "s", "id": 219986, "pid": 76337, "tid": -914061504, "ts": 1716454224929534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224952478, "dur": 59, "args": { "External id": 219991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 219991, "pid": 5, "tid": 7, "ts": 1716454224952478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929562, "dur": 8, "args": { "External id": 219991, "cbid": 211, "correlation": 219991 } }, { "ph": "s", "id": 219991, "pid": 76337, "tid": -914061504, "ts": 1716454224929562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224952538, "dur": 4, "args": { "External id": 219999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 219999, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 219999, "pid": 5, "tid": 7, "ts": 1716454224952538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929605, "dur": 9, "args": { "External id": 219999, "cbid": 211, "correlation": 219999 } }, { "ph": "s", "id": 219999, "pid": 76337, "tid": -914061504, "ts": 1716454224929605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929669, "dur": 1, "args": { "External id": 220015, "cbid": 251, "correlation": 220015 } }, { "ph": "f", "id": 220015, "pid": 76337, "tid": -914061504, "ts": 1716454224929669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224929674, "dur": 0, "args": { "External id": 220017, "cbid": 251, "correlation": 220017 } }, { "ph": "f", "id": 220017, "pid": 76337, "tid": -914061504, "ts": 1716454224929674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224952543, "dur": 12, "args": { "External id": 220018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220018, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 220018, "pid": 5, "tid": 7, "ts": 1716454224952543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929676, "dur": 13, "args": { "External id": 220018, "cbid": 211, "correlation": 220018 } }, { "ph": "s", "id": 220018, "pid": 76337, "tid": -914061504, "ts": 1716454224929676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224952556, "dur": 5, "args": { "External id": 220020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220020, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 220020, "pid": 5, "tid": 7, "ts": 1716454224952556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929691, "dur": 5, "args": { "External id": 220020, "cbid": 211, "correlation": 220020 } }, { "ph": "s", "id": 220020, "pid": 76337, "tid": -914061504, "ts": 1716454224929691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224952563, "dur": 55, "args": { "External id": 220030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220030, "pid": 5, "tid": 7, "ts": 1716454224952563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929748, "dur": 12, "args": { "External id": 220030, "cbid": 211, "correlation": 220030 } }, { "ph": "s", "id": 220030, "pid": 76337, "tid": -914061504, "ts": 1716454224929748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224952619, "dur": 51, "args": { "External id": 220050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220050, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 220050, "pid": 5, "tid": 7, "ts": 1716454224952619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929813, "dur": 10, "args": { "External id": 220050, "cbid": 211, "correlation": 220050 } }, { "ph": "s", "id": 220050, "pid": 76337, "tid": -914061504, "ts": 1716454224929813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224952671, "dur": 4, "args": { "External id": 220062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220062, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 220062, "pid": 5, "tid": 7, "ts": 1716454224952671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929834, "dur": 6, "args": { "External id": 220062, "cbid": 211, "correlation": 220062 } }, { "ph": "s", "id": 220062, "pid": 76337, "tid": -914061504, "ts": 1716454224929834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224952676, "dur": 55, "args": { "External id": 220065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220065, "pid": 5, "tid": 7, "ts": 1716454224952676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929852, "dur": 6, "args": { "External id": 220065, "cbid": 211, "correlation": 220065 } }, { "ph": "s", "id": 220065, "pid": 76337, "tid": -914061504, "ts": 1716454224929852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224952732, "dur": 37, "args": { "External id": 220074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220074, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220074, "pid": 5, "tid": 7, "ts": 1716454224952732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929893, "dur": 9, "args": { "External id": 220074, "cbid": 211, "correlation": 220074 } }, { "ph": "s", "id": 220074, "pid": 76337, "tid": -914061504, "ts": 1716454224929893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224929954, "dur": 0, "args": { "External id": 220084, "cbid": 317, "correlation": 220084 } }, { "ph": "f", "id": 220084, "pid": 76337, "tid": -914061504, "ts": 1716454224929954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224929955, "dur": 0, "args": { "External id": 220085, "cbid": 203, "correlation": 220085 } }, { "ph": "f", "id": 220085, "pid": 76337, "tid": -914061504, "ts": 1716454224929955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224929956, "dur": 0, "args": { "External id": 220086, "cbid": 205, "correlation": 220086 } }, { "ph": "f", "id": 220086, "pid": 76337, "tid": -914061504, "ts": 1716454224929956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224952771, "dur": 39, "args": { "External id": 220090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220090, "pid": 5, "tid": 7, "ts": 1716454224952771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929970, "dur": 22, "args": { "External id": 220090, "cbid": 211, "correlation": 220090 } }, { "ph": "s", "id": 220090, "pid": 76337, "tid": -914061504, "ts": 1716454224929970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224952811, "dur": 14, "args": { "External id": 220092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220092, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220092, "pid": 5, "tid": 7, "ts": 1716454224952811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224929994, "dur": 5, "args": { "External id": 220092, "cbid": 211, "correlation": 220092 } }, { "ph": "s", "id": 220092, "pid": 76337, "tid": -914061504, "ts": 1716454224929994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224952827, "dur": 3, "args": { "External id": 220094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 220094, "pid": 5, "tid": 7, "ts": 1716454224952827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930003, "dur": 6, "args": { "External id": 220094, "cbid": 211, "correlation": 220094 } }, { "ph": "s", "id": 220094, "pid": 76337, "tid": -914061504, "ts": 1716454224930003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224930013, "dur": 0, "args": { "External id": 220095, "cbid": 51, "correlation": 220095 } }, { "ph": "s", "id": 220095, "pid": 76337, "tid": -914061504, "ts": 1716454224930013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224952832, "dur": 699, "args": { "External id": 220096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220096, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220096, "pid": 5, "tid": 7, "ts": 1716454224952832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930013, "dur": 5, "args": { "External id": 220096, "cbid": 211, "correlation": 220096 } }, { "ph": "s", "id": 220096, "pid": 76337, "tid": -914061504, "ts": 1716454224930013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224953532, "dur": 60, "args": { "External id": 220101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220101, "pid": 5, "tid": 7, "ts": 1716454224953532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930041, "dur": 8, "args": { "External id": 220101, "cbid": 211, "correlation": 220101 } }, { "ph": "s", "id": 220101, "pid": 76337, "tid": -914061504, "ts": 1716454224930041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224953594, "dur": 50, "args": { "External id": 220109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220109, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220109, "pid": 5, "tid": 7, "ts": 1716454224953594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930073, "dur": 8, "args": { "External id": 220109, "cbid": 211, "correlation": 220109 } }, { "ph": "s", "id": 220109, "pid": 76337, "tid": -914061504, "ts": 1716454224930073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224953646, "dur": 35, "args": { "External id": 220117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220117, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220117, "pid": 5, "tid": 7, "ts": 1716454224953646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930103, "dur": 8, "args": { "External id": 220117, "cbid": 211, "correlation": 220117 } }, { "ph": "s", "id": 220117, "pid": 76337, "tid": -914061504, "ts": 1716454224930103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224953682, "dur": 51, "args": { "External id": 220137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220137, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 220137, "pid": 5, "tid": 7, "ts": 1716454224953682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930182, "dur": 12, "args": { "External id": 220137, "cbid": 211, "correlation": 220137 } }, { "ph": "s", "id": 220137, "pid": 76337, "tid": -914061504, "ts": 1716454224930182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224953734, "dur": 4, "args": { "External id": 220149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220149, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 220149, "pid": 5, "tid": 7, "ts": 1716454224953734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930204, "dur": 6, "args": { "External id": 220149, "cbid": 211, "correlation": 220149 } }, { "ph": "s", "id": 220149, "pid": 76337, "tid": -914061504, "ts": 1716454224930204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224953739, "dur": 55, "args": { "External id": 220152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220152, "pid": 5, "tid": 7, "ts": 1716454224953739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930222, "dur": 6, "args": { "External id": 220152, "cbid": 211, "correlation": 220152 } }, { "ph": "s", "id": 220152, "pid": 76337, "tid": -914061504, "ts": 1716454224930222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224930278, "dur": 0, "args": { "External id": 220163, "cbid": 317, "correlation": 220163 } }, { "ph": "f", "id": 220163, "pid": 76337, "tid": -914061504, "ts": 1716454224930278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224930279, "dur": 0, "args": { "External id": 220164, "cbid": 203, "correlation": 220164 } }, { "ph": "f", "id": 220164, "pid": 76337, "tid": -914061504, "ts": 1716454224930279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224930280, "dur": 0, "args": { "External id": 220165, "cbid": 205, "correlation": 220165 } }, { "ph": "f", "id": 220165, "pid": 76337, "tid": -914061504, "ts": 1716454224930280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930303, "dur": 1, "args": { "External id": 220169, "cbid": 251, "correlation": 220169 } }, { "ph": "f", "id": 220169, "pid": 76337, "tid": -914061504, "ts": 1716454224930303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930305, "dur": 0, "args": { "External id": 220170, "cbid": 251, "correlation": 220170 } }, { "ph": "f", "id": 220170, "pid": 76337, "tid": -914061504, "ts": 1716454224930305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930306, "dur": 0, "args": { "External id": 220171, "cbid": 251, "correlation": 220171 } }, { "ph": "f", "id": 220171, "pid": 76337, "tid": -914061504, "ts": 1716454224930306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930307, "dur": 0, "args": { "External id": 220172, "cbid": 251, "correlation": 220172 } }, { "ph": "f", "id": 220172, "pid": 76337, "tid": -914061504, "ts": 1716454224930307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930307, "dur": 0, "args": { "External id": 220173, "cbid": 251, "correlation": 220173 } }, { "ph": "f", "id": 220173, "pid": 76337, "tid": -914061504, "ts": 1716454224930307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930308, "dur": 0, "args": { "External id": 220174, "cbid": 251, "correlation": 220174 } }, { "ph": "f", "id": 220174, "pid": 76337, "tid": -914061504, "ts": 1716454224930308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930309, "dur": 0, "args": { "External id": 220175, "cbid": 251, "correlation": 220175 } }, { "ph": "f", "id": 220175, "pid": 76337, "tid": -914061504, "ts": 1716454224930309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930309, "dur": 0, "args": { "External id": 220176, "cbid": 251, "correlation": 220176 } }, { "ph": "f", "id": 220176, "pid": 76337, "tid": -914061504, "ts": 1716454224930309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930311, "dur": 0, "args": { "External id": 220177, "cbid": 251, "correlation": 220177 } }, { "ph": "f", "id": 220177, "pid": 76337, "tid": -914061504, "ts": 1716454224930311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224953796, "dur": 111, "args": { "External id": 220178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220178, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220178, "pid": 5, "tid": 7, "ts": 1716454224953796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930313, "dur": 12, "args": { "External id": 220178, "cbid": 211, "correlation": 220178 } }, { "ph": "s", "id": 220178, "pid": 76337, "tid": -914061504, "ts": 1716454224930313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224953908, "dur": 59, "args": { "External id": 220184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220184, "pid": 5, "tid": 7, "ts": 1716454224953908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930348, "dur": 9, "args": { "External id": 220184, "cbid": 211, "correlation": 220184 } }, { "ph": "s", "id": 220184, "pid": 76337, "tid": -914061504, "ts": 1716454224930348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224953968, "dur": 466, "args": { "External id": 220193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220193, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220193, "pid": 5, "tid": 7, "ts": 1716454224953968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930430, "dur": 15, "args": { "External id": 220193, "cbid": 211, "correlation": 220193 } }, { "ph": "s", "id": 220193, "pid": 76337, "tid": -914061504, "ts": 1716454224930430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224954435, "dur": 182, "args": { "External id": 220215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220215, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220215, "pid": 5, "tid": 7, "ts": 1716454224954435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930487, "dur": 10, "args": { "External id": 220215, "cbid": 211, "correlation": 220215 } }, { "ph": "s", "id": 220215, "pid": 76337, "tid": -914061504, "ts": 1716454224930487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930572, "dur": 1, "args": { "External id": 220226, "cbid": 251, "correlation": 220226 } }, { "ph": "f", "id": 220226, "pid": 76337, "tid": -914061504, "ts": 1716454224930572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224954618, "dur": 200, "args": { "External id": 220227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220227, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220227, "pid": 5, "tid": 7, "ts": 1716454224954618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930577, "dur": 13, "args": { "External id": 220227, "cbid": 211, "correlation": 220227 } }, { "ph": "s", "id": 220227, "pid": 76337, "tid": -914061504, "ts": 1716454224930577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930644, "dur": 1, "args": { "External id": 220238, "cbid": 251, "correlation": 220238 } }, { "ph": "f", "id": 220238, "pid": 76337, "tid": -914061504, "ts": 1716454224930644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224954819, "dur": 188, "args": { "External id": 220239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220239, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220239, "pid": 5, "tid": 7, "ts": 1716454224954819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930649, "dur": 11, "args": { "External id": 220239, "cbid": 211, "correlation": 220239 } }, { "ph": "s", "id": 220239, "pid": 76337, "tid": -914061504, "ts": 1716454224930649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930712, "dur": 1, "args": { "External id": 220250, "cbid": 251, "correlation": 220250 } }, { "ph": "f", "id": 220250, "pid": 76337, "tid": -914061504, "ts": 1716454224930712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224955009, "dur": 187, "args": { "External id": 220251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220251, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220251, "pid": 5, "tid": 7, "ts": 1716454224955009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930716, "dur": 11, "args": { "External id": 220251, "cbid": 211, "correlation": 220251 } }, { "ph": "s", "id": 220251, "pid": 76337, "tid": -914061504, "ts": 1716454224930716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224955197, "dur": 18840, "args": { "External id": 220272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220272, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220272, "pid": 5, "tid": 7, "ts": 1716454224955197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930796, "dur": 12, "args": { "External id": 220272, "cbid": 211, "correlation": 220272 } }, { "ph": "s", "id": 220272, "pid": 76337, "tid": -914061504, "ts": 1716454224930796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224930895, "dur": 1, "args": { "External id": 220290, "cbid": 251, "correlation": 220290 } }, { "ph": "f", "id": 220290, "pid": 76337, "tid": -914061504, "ts": 1716454224930895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224974038, "dur": 205, "args": { "External id": 220292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220292, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220292, "pid": 5, "tid": 7, "ts": 1716454224974038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930900, "dur": 12, "args": { "External id": 220292, "cbid": 211, "correlation": 220292 } }, { "ph": "s", "id": 220292, "pid": 76337, "tid": -914061504, "ts": 1716454224930900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224974244, "dur": 66, "args": { "External id": 220300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220300, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220300, "pid": 5, "tid": 7, "ts": 1716454224974244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224930969, "dur": 20, "args": { "External id": 220300, "cbid": 211, "correlation": 220300 } }, { "ph": "s", "id": 220300, "pid": 76337, "tid": -914061504, "ts": 1716454224930969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224974312, "dur": 97, "args": { "External id": 220308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220308, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220308, "pid": 5, "tid": 7, "ts": 1716454224974312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931018, "dur": 9, "args": { "External id": 220308, "cbid": 211, "correlation": 220308 } }, { "ph": "s", "id": 220308, "pid": 76337, "tid": -914061504, "ts": 1716454224931018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224974410, "dur": 54, "args": { "External id": 220319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220319, "pid": 5, "tid": 7, "ts": 1716454224974410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931090, "dur": 12, "args": { "External id": 220319, "cbid": 211, "correlation": 220319 } }, { "ph": "s", "id": 220319, "pid": 76337, "tid": -914061504, "ts": 1716454224931090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224974465, "dur": 93, "args": { "External id": 220341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220341, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220341, "pid": 5, "tid": 7, "ts": 1716454224974465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931121, "dur": 7, "args": { "External id": 220341, "cbid": 211, "correlation": 220341 } }, { "ph": "s", "id": 220341, "pid": 76337, "tid": -914061504, "ts": 1716454224931121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931204, "dur": 1, "args": { "External id": 220352, "cbid": 251, "correlation": 220352 } }, { "ph": "f", "id": 220352, "pid": 76337, "tid": -914061504, "ts": 1716454224931204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224974559, "dur": 102, "args": { "External id": 220353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220353, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220353, "pid": 5, "tid": 7, "ts": 1716454224974559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931210, "dur": 12, "args": { "External id": 220353, "cbid": 211, "correlation": 220353 } }, { "ph": "s", "id": 220353, "pid": 76337, "tid": -914061504, "ts": 1716454224931210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931280, "dur": 1, "args": { "External id": 220364, "cbid": 251, "correlation": 220364 } }, { "ph": "f", "id": 220364, "pid": 76337, "tid": -914061504, "ts": 1716454224931280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931284, "dur": 0, "args": { "External id": 220365, "cbid": 251, "correlation": 220365 } }, { "ph": "f", "id": 220365, "pid": 76337, "tid": -914061504, "ts": 1716454224931284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224974662, "dur": 10, "args": { "External id": 220366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220366, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 220366, "pid": 5, "tid": 7, "ts": 1716454224974662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931286, "dur": 12, "args": { "External id": 220366, "cbid": 211, "correlation": 220366 } }, { "ph": "s", "id": 220366, "pid": 76337, "tid": -914061504, "ts": 1716454224931286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224974674, "dur": 5, "args": { "External id": 220368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220368, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 220368, "pid": 5, "tid": 7, "ts": 1716454224974674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931300, "dur": 5, "args": { "External id": 220368, "cbid": 211, "correlation": 220368 } }, { "ph": "s", "id": 220368, "pid": 76337, "tid": -914061504, "ts": 1716454224931300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931360, "dur": 1, "args": { "External id": 220379, "cbid": 251, "correlation": 220379 } }, { "ph": "f", "id": 220379, "pid": 76337, "tid": -914061504, "ts": 1716454224931360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931363, "dur": 0, "args": { "External id": 220380, "cbid": 251, "correlation": 220380 } }, { "ph": "f", "id": 220380, "pid": 76337, "tid": -914061504, "ts": 1716454224931363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224974680, "dur": 6, "args": { "External id": 220381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220381, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 220381, "pid": 5, "tid": 7, "ts": 1716454224974680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931365, "dur": 13, "args": { "External id": 220381, "cbid": 211, "correlation": 220381 } }, { "ph": "s", "id": 220381, "pid": 76337, "tid": -914061504, "ts": 1716454224931365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224974688, "dur": 3, "args": { "External id": 220383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220383, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 220383, "pid": 5, "tid": 7, "ts": 1716454224974688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931379, "dur": 5, "args": { "External id": 220383, "cbid": 211, "correlation": 220383 } }, { "ph": "s", "id": 220383, "pid": 76337, "tid": -914061504, "ts": 1716454224931379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224974693, "dur": 158, "args": { "External id": 220404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220404, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220404, "pid": 5, "tid": 7, "ts": 1716454224974693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931452, "dur": 12, "args": { "External id": 220404, "cbid": 211, "correlation": 220404 } }, { "ph": "s", "id": 220404, "pid": 76337, "tid": -914061504, "ts": 1716454224931452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931549, "dur": 1, "args": { "External id": 220422, "cbid": 251, "correlation": 220422 } }, { "ph": "f", "id": 220422, "pid": 76337, "tid": -914061504, "ts": 1716454224931549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224974852, "dur": 107, "args": { "External id": 220424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220424, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220424, "pid": 5, "tid": 7, "ts": 1716454224974852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931555, "dur": 12, "args": { "External id": 220424, "cbid": 211, "correlation": 220424 } }, { "ph": "s", "id": 220424, "pid": 76337, "tid": -914061504, "ts": 1716454224931555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224974960, "dur": 35, "args": { "External id": 220432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220432, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220432, "pid": 5, "tid": 7, "ts": 1716454224974960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931623, "dur": 12, "args": { "External id": 220432, "cbid": 211, "correlation": 220432 } }, { "ph": "s", "id": 220432, "pid": 76337, "tid": -914061504, "ts": 1716454224931623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224974996, "dur": 67, "args": { "External id": 220440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220440, "pid": 5, "tid": 7, "ts": 1716454224974996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931662, "dur": 9, "args": { "External id": 220440, "cbid": 211, "correlation": 220440 } }, { "ph": "s", "id": 220440, "pid": 76337, "tid": -914061504, "ts": 1716454224931662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224975064, "dur": 93, "args": { "External id": 220462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220462, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220462, "pid": 5, "tid": 7, "ts": 1716454224975064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931714, "dur": 10, "args": { "External id": 220462, "cbid": 211, "correlation": 220462 } }, { "ph": "s", "id": 220462, "pid": 76337, "tid": -914061504, "ts": 1716454224931714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931803, "dur": 1, "args": { "External id": 220478, "cbid": 251, "correlation": 220478 } }, { "ph": "f", "id": 220478, "pid": 76337, "tid": -914061504, "ts": 1716454224931803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224975158, "dur": 576, "args": { "External id": 220480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220480, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220480, "pid": 5, "tid": 7, "ts": 1716454224975158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931809, "dur": 12, "args": { "External id": 220480, "cbid": 211, "correlation": 220480 } }, { "ph": "s", "id": 220480, "pid": 76337, "tid": -914061504, "ts": 1716454224931809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224975736, "dur": 245, "args": { "External id": 220488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220488, "pid": 5, "tid": 7, "ts": 1716454224975736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931873, "dur": 12, "args": { "External id": 220488, "cbid": 211, "correlation": 220488 } }, { "ph": "s", "id": 220488, "pid": 76337, "tid": -914061504, "ts": 1716454224931873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224975982, "dur": 254, "args": { "External id": 220496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220496, "pid": 5, "tid": 7, "ts": 1716454224975982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224931903, "dur": 8, "args": { "External id": 220496, "cbid": 211, "correlation": 220496 } }, { "ph": "s", "id": 220496, "pid": 76337, "tid": -914061504, "ts": 1716454224931903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931994, "dur": 1, "args": { "External id": 220512, "cbid": 251, "correlation": 220512 } }, { "ph": "f", "id": 220512, "pid": 76337, "tid": -914061504, "ts": 1716454224931994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224931999, "dur": 0, "args": { "External id": 220514, "cbid": 251, "correlation": 220514 } }, { "ph": "f", "id": 220514, "pid": 76337, "tid": -914061504, "ts": 1716454224931999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224976237, "dur": 360, "args": { "External id": 220515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220515, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 220515, "pid": 5, "tid": 7, "ts": 1716454224976237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932002, "dur": 13, "args": { "External id": 220515, "cbid": 211, "correlation": 220515 } }, { "ph": "s", "id": 220515, "pid": 76337, "tid": -914061504, "ts": 1716454224932002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224976599, "dur": 51, "args": { "External id": 220523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220523, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220523, "pid": 5, "tid": 7, "ts": 1716454224976599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932046, "dur": 10, "args": { "External id": 220523, "cbid": 211, "correlation": 220523 } }, { "ph": "s", "id": 220523, "pid": 76337, "tid": -914061504, "ts": 1716454224932046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224976651, "dur": 159, "args": { "External id": 220534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220534, "pid": 5, "tid": 7, "ts": 1716454224976651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932114, "dur": 12, "args": { "External id": 220534, "cbid": 211, "correlation": 220534 } }, { "ph": "s", "id": 220534, "pid": 76337, "tid": -914061504, "ts": 1716454224932114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224932178, "dur": 0, "args": { "External id": 220546, "cbid": 317, "correlation": 220546 } }, { "ph": "f", "id": 220546, "pid": 76337, "tid": -914061504, "ts": 1716454224932178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224932179, "dur": 0, "args": { "External id": 220547, "cbid": 203, "correlation": 220547 } }, { "ph": "f", "id": 220547, "pid": 76337, "tid": -914061504, "ts": 1716454224932179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224932180, "dur": 0, "args": { "External id": 220548, "cbid": 205, "correlation": 220548 } }, { "ph": "f", "id": 220548, "pid": 76337, "tid": -914061504, "ts": 1716454224932180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932205, "dur": 1, "args": { "External id": 220552, "cbid": 251, "correlation": 220552 } }, { "ph": "f", "id": 220552, "pid": 76337, "tid": -914061504, "ts": 1716454224932205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932207, "dur": 0, "args": { "External id": 220553, "cbid": 251, "correlation": 220553 } }, { "ph": "f", "id": 220553, "pid": 76337, "tid": -914061504, "ts": 1716454224932207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932207, "dur": 0, "args": { "External id": 220554, "cbid": 251, "correlation": 220554 } }, { "ph": "f", "id": 220554, "pid": 76337, "tid": -914061504, "ts": 1716454224932207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932208, "dur": 0, "args": { "External id": 220555, "cbid": 251, "correlation": 220555 } }, { "ph": "f", "id": 220555, "pid": 76337, "tid": -914061504, "ts": 1716454224932208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932209, "dur": 0, "args": { "External id": 220556, "cbid": 251, "correlation": 220556 } }, { "ph": "f", "id": 220556, "pid": 76337, "tid": -914061504, "ts": 1716454224932209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932209, "dur": 0, "args": { "External id": 220557, "cbid": 251, "correlation": 220557 } }, { "ph": "f", "id": 220557, "pid": 76337, "tid": -914061504, "ts": 1716454224932209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932210, "dur": 0, "args": { "External id": 220558, "cbid": 251, "correlation": 220558 } }, { "ph": "f", "id": 220558, "pid": 76337, "tid": -914061504, "ts": 1716454224932210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932211, "dur": 0, "args": { "External id": 220559, "cbid": 251, "correlation": 220559 } }, { "ph": "f", "id": 220559, "pid": 76337, "tid": -914061504, "ts": 1716454224932211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932212, "dur": 0, "args": { "External id": 220560, "cbid": 251, "correlation": 220560 } }, { "ph": "f", "id": 220560, "pid": 76337, "tid": -914061504, "ts": 1716454224932212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224976811, "dur": 115, "args": { "External id": 220561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220561, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220561, "pid": 5, "tid": 7, "ts": 1716454224976811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932214, "dur": 12, "args": { "External id": 220561, "cbid": 211, "correlation": 220561 } }, { "ph": "s", "id": 220561, "pid": 76337, "tid": -914061504, "ts": 1716454224932214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224976928, "dur": 60, "args": { "External id": 220567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220567, "pid": 5, "tid": 7, "ts": 1716454224976928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932249, "dur": 9, "args": { "External id": 220567, "cbid": 211, "correlation": 220567 } }, { "ph": "s", "id": 220567, "pid": 76337, "tid": -914061504, "ts": 1716454224932249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224976989, "dur": 49, "args": { "External id": 220575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220575, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220575, "pid": 5, "tid": 7, "ts": 1716454224976989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932280, "dur": 9, "args": { "External id": 220575, "cbid": 211, "correlation": 220575 } }, { "ph": "s", "id": 220575, "pid": 76337, "tid": -914061504, "ts": 1716454224932280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224932355, "dur": 0, "args": { "External id": 220585, "cbid": 317, "correlation": 220585 } }, { "ph": "f", "id": 220585, "pid": 76337, "tid": -914061504, "ts": 1716454224932355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224932356, "dur": 0, "args": { "External id": 220586, "cbid": 203, "correlation": 220586 } }, { "ph": "f", "id": 220586, "pid": 76337, "tid": -914061504, "ts": 1716454224932356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224932356, "dur": 0, "args": { "External id": 220587, "cbid": 205, "correlation": 220587 } }, { "ph": "f", "id": 220587, "pid": 76337, "tid": -914061504, "ts": 1716454224932356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977040, "dur": 41, "args": { "External id": 220591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220591, "pid": 5, "tid": 7, "ts": 1716454224977040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932375, "dur": 12, "args": { "External id": 220591, "cbid": 211, "correlation": 220591 } }, { "ph": "s", "id": 220591, "pid": 76337, "tid": -914061504, "ts": 1716454224932375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977082, "dur": 14, "args": { "External id": 220593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220593, "pid": 5, "tid": 7, "ts": 1716454224977082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932390, "dur": 6, "args": { "External id": 220593, "cbid": 211, "correlation": 220593 } }, { "ph": "s", "id": 220593, "pid": 76337, "tid": -914061504, "ts": 1716454224932390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224977098, "dur": 1, "args": { "External id": 220595, "device": 5, "context": 1, "stream": 7, "correlation": 220595, "bytes": 1536, "memory bandwidth (GB/s)": 0.8888888888888888 } }, { "ph": "f", "id": 220595, "pid": 5, "tid": 7, "ts": 1716454224977098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224932410, "dur": 17, "args": { "External id": 220595, "cbid": 51, "correlation": 220595 } }, { "ph": "s", "id": 220595, "pid": 76337, "tid": -914061504, "ts": 1716454224932410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224977102, "dur": 365, "args": { "External id": 220596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220596, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220596, "pid": 5, "tid": 7, "ts": 1716454224977102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932429, "dur": 10, "args": { "External id": 220596, "cbid": 211, "correlation": 220596 } }, { "ph": "s", "id": 220596, "pid": 76337, "tid": -914061504, "ts": 1716454224932429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977469, "dur": 14, "args": { "External id": 220598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220598, "pid": 5, "tid": 7, "ts": 1716454224977469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932447, "dur": 7, "args": { "External id": 220598, "cbid": 211, "correlation": 220598 } }, { "ph": "s", "id": 220598, "pid": 76337, "tid": -914061504, "ts": 1716454224932447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224977484, "dur": 15, "args": { "External id": 220604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220604, "pid": 5, "tid": 7, "ts": 1716454224977484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932477, "dur": 9, "args": { "External id": 220604, "cbid": 211, "correlation": 220604 } }, { "ph": "s", "id": 220604, "pid": 76337, "tid": -914061504, "ts": 1716454224932477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224977500, "dur": 19, "args": { "External id": 220624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220624, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 220624, "pid": 5, "tid": 7, "ts": 1716454224977500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932570, "dur": 14, "args": { "External id": 220624, "cbid": 211, "correlation": 220624 } }, { "ph": "s", "id": 220624, "pid": 76337, "tid": -914061504, "ts": 1716454224932570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224977520, "dur": 4, "args": { "External id": 220636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220636, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 220636, "pid": 5, "tid": 7, "ts": 1716454224977520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932594, "dur": 6, "args": { "External id": 220636, "cbid": 211, "correlation": 220636 } }, { "ph": "s", "id": 220636, "pid": 76337, "tid": -914061504, "ts": 1716454224932594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224977526, "dur": 17, "args": { "External id": 220639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220639, "pid": 5, "tid": 7, "ts": 1716454224977526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932611, "dur": 6, "args": { "External id": 220639, "cbid": 211, "correlation": 220639 } }, { "ph": "s", "id": 220639, "pid": 76337, "tid": -914061504, "ts": 1716454224932611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224977545, "dur": 11, "args": { "External id": 220648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220648, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220648, "pid": 5, "tid": 7, "ts": 1716454224977545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932650, "dur": 9, "args": { "External id": 220648, "cbid": 211, "correlation": 220648 } }, { "ph": "s", "id": 220648, "pid": 76337, "tid": -914061504, "ts": 1716454224932650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224932704, "dur": 0, "args": { "External id": 220658, "cbid": 317, "correlation": 220658 } }, { "ph": "f", "id": 220658, "pid": 76337, "tid": -914061504, "ts": 1716454224932704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224932705, "dur": 0, "args": { "External id": 220659, "cbid": 203, "correlation": 220659 } }, { "ph": "f", "id": 220659, "pid": 76337, "tid": -914061504, "ts": 1716454224932705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224932706, "dur": 0, "args": { "External id": 220660, "cbid": 205, "correlation": 220660 } }, { "ph": "f", "id": 220660, "pid": 76337, "tid": -914061504, "ts": 1716454224932706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977558, "dur": 12, "args": { "External id": 220664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220664, "pid": 5, "tid": 7, "ts": 1716454224977558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932721, "dur": 11, "args": { "External id": 220664, "cbid": 211, "correlation": 220664 } }, { "ph": "s", "id": 220664, "pid": 76337, "tid": -914061504, "ts": 1716454224932721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977570, "dur": 24, "args": { "External id": 220666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220666, "pid": 5, "tid": 7, "ts": 1716454224977570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932734, "dur": 6, "args": { "External id": 220666, "cbid": 211, "correlation": 220666 } }, { "ph": "s", "id": 220666, "pid": 76337, "tid": -914061504, "ts": 1716454224932734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224977596, "dur": 4, "args": { "External id": 220668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 220668, "pid": 5, "tid": 7, "ts": 1716454224977596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932746, "dur": 5, "args": { "External id": 220668, "cbid": 211, "correlation": 220668 } }, { "ph": "s", "id": 220668, "pid": 76337, "tid": -914061504, "ts": 1716454224932746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224932755, "dur": 0, "args": { "External id": 220669, "cbid": 51, "correlation": 220669 } }, { "ph": "s", "id": 220669, "pid": 76337, "tid": -914061504, "ts": 1716454224932755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224977601, "dur": 358, "args": { "External id": 220670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220670, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220670, "pid": 5, "tid": 7, "ts": 1716454224977601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932756, "dur": 7, "args": { "External id": 220670, "cbid": 211, "correlation": 220670 } }, { "ph": "s", "id": 220670, "pid": 76337, "tid": -914061504, "ts": 1716454224932756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224977961, "dur": 21, "args": { "External id": 220671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220671, "pid": 5, "tid": 7, "ts": 1716454224977961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932765, "dur": 6, "args": { "External id": 220671, "cbid": 211, "correlation": 220671 } }, { "ph": "s", "id": 220671, "pid": 76337, "tid": -914061504, "ts": 1716454224932765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224977982, "dur": 32, "args": { "External id": 220677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220677, "pid": 5, "tid": 7, "ts": 1716454224977982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932794, "dur": 8, "args": { "External id": 220677, "cbid": 211, "correlation": 220677 } }, { "ph": "s", "id": 220677, "pid": 76337, "tid": -914061504, "ts": 1716454224932794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224978016, "dur": 3, "args": { "External id": 220685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220685, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 220685, "pid": 5, "tid": 7, "ts": 1716454224978016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932836, "dur": 9, "args": { "External id": 220685, "cbid": 211, "correlation": 220685 } }, { "ph": "s", "id": 220685, "pid": 76337, "tid": -914061504, "ts": 1716454224932836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932902, "dur": 1, "args": { "External id": 220701, "cbid": 251, "correlation": 220701 } }, { "ph": "f", "id": 220701, "pid": 76337, "tid": -914061504, "ts": 1716454224932902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224932908, "dur": 0, "args": { "External id": 220703, "cbid": 251, "correlation": 220703 } }, { "ph": "f", "id": 220703, "pid": 76337, "tid": -914061504, "ts": 1716454224932908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224978021, "dur": 12, "args": { "External id": 220704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220704, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 220704, "pid": 5, "tid": 7, "ts": 1716454224978021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932909, "dur": 11, "args": { "External id": 220704, "cbid": 211, "correlation": 220704 } }, { "ph": "s", "id": 220704, "pid": 76337, "tid": -914061504, "ts": 1716454224932909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224978034, "dur": 5, "args": { "External id": 220706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 220706, "pid": 5, "tid": 7, "ts": 1716454224978034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932922, "dur": 5, "args": { "External id": 220706, "cbid": 211, "correlation": 220706 } }, { "ph": "s", "id": 220706, "pid": 76337, "tid": -914061504, "ts": 1716454224932922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224978041, "dur": 29, "args": { "External id": 220716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220716, "pid": 5, "tid": 7, "ts": 1716454224978041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224932989, "dur": 13, "args": { "External id": 220716, "cbid": 211, "correlation": 220716 } }, { "ph": "s", "id": 220716, "pid": 76337, "tid": -914061504, "ts": 1716454224932989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224978072, "dur": 31, "args": { "External id": 220736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220736, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 220736, "pid": 5, "tid": 7, "ts": 1716454224978072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933057, "dur": 11, "args": { "External id": 220736, "cbid": 211, "correlation": 220736 } }, { "ph": "s", "id": 220736, "pid": 76337, "tid": -914061504, "ts": 1716454224933057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224978104, "dur": 4, "args": { "External id": 220748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220748, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 220748, "pid": 5, "tid": 7, "ts": 1716454224978104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933079, "dur": 6, "args": { "External id": 220748, "cbid": 211, "correlation": 220748 } }, { "ph": "s", "id": 220748, "pid": 76337, "tid": -914061504, "ts": 1716454224933079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224978109, "dur": 30, "args": { "External id": 220751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220751, "pid": 5, "tid": 7, "ts": 1716454224978109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933098, "dur": 7, "args": { "External id": 220751, "cbid": 211, "correlation": 220751 } }, { "ph": "s", "id": 220751, "pid": 76337, "tid": -914061504, "ts": 1716454224933098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224978140, "dur": 21, "args": { "External id": 220760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220760, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220760, "pid": 5, "tid": 7, "ts": 1716454224978140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933141, "dur": 9, "args": { "External id": 220760, "cbid": 211, "correlation": 220760 } }, { "ph": "s", "id": 220760, "pid": 76337, "tid": -914061504, "ts": 1716454224933141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224933203, "dur": 0, "args": { "External id": 220770, "cbid": 317, "correlation": 220770 } }, { "ph": "f", "id": 220770, "pid": 76337, "tid": -914061504, "ts": 1716454224933203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224933204, "dur": 0, "args": { "External id": 220771, "cbid": 203, "correlation": 220771 } }, { "ph": "f", "id": 220771, "pid": 76337, "tid": -914061504, "ts": 1716454224933204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224933205, "dur": 0, "args": { "External id": 220772, "cbid": 205, "correlation": 220772 } }, { "ph": "f", "id": 220772, "pid": 76337, "tid": -914061504, "ts": 1716454224933205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224978163, "dur": 23, "args": { "External id": 220776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220776, "pid": 5, "tid": 7, "ts": 1716454224978163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933222, "dur": 12, "args": { "External id": 220776, "cbid": 211, "correlation": 220776 } }, { "ph": "s", "id": 220776, "pid": 76337, "tid": -914061504, "ts": 1716454224933222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224978187, "dur": 44, "args": { "External id": 220778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220778, "pid": 5, "tid": 7, "ts": 1716454224978187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933237, "dur": 6, "args": { "External id": 220778, "cbid": 211, "correlation": 220778 } }, { "ph": "s", "id": 220778, "pid": 76337, "tid": -914061504, "ts": 1716454224933237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224978233, "dur": 657, "args": { "External id": 220780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220780, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220780, "pid": 5, "tid": 7, "ts": 1716454224978233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933251, "dur": 9, "args": { "External id": 220780, "cbid": 211, "correlation": 220780 } }, { "ph": "s", "id": 220780, "pid": 76337, "tid": -914061504, "ts": 1716454224933251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224978892, "dur": 21, "args": { "External id": 220782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220782, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220782, "pid": 5, "tid": 7, "ts": 1716454224978892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933264, "dur": 5, "args": { "External id": 220782, "cbid": 211, "correlation": 220782 } }, { "ph": "s", "id": 220782, "pid": 76337, "tid": -914061504, "ts": 1716454224933264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224978914, "dur": 33, "args": { "External id": 220788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220788, "pid": 5, "tid": 7, "ts": 1716454224978914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933292, "dur": 8, "args": { "External id": 220788, "cbid": 211, "correlation": 220788 } }, { "ph": "s", "id": 220788, "pid": 76337, "tid": -914061504, "ts": 1716454224933292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224933350, "dur": 0, "args": { "External id": 220798, "cbid": 317, "correlation": 220798 } }, { "ph": "f", "id": 220798, "pid": 76337, "tid": -914061504, "ts": 1716454224933350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224933351, "dur": 0, "args": { "External id": 220799, "cbid": 203, "correlation": 220799 } }, { "ph": "f", "id": 220799, "pid": 76337, "tid": -914061504, "ts": 1716454224933351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224933352, "dur": 0, "args": { "External id": 220800, "cbid": 205, "correlation": 220800 } }, { "ph": "f", "id": 220800, "pid": 76337, "tid": -914061504, "ts": 1716454224933352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933374, "dur": 1, "args": { "External id": 220804, "cbid": 251, "correlation": 220804 } }, { "ph": "f", "id": 220804, "pid": 76337, "tid": -914061504, "ts": 1716454224933374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933376, "dur": 0, "args": { "External id": 220805, "cbid": 251, "correlation": 220805 } }, { "ph": "f", "id": 220805, "pid": 76337, "tid": -914061504, "ts": 1716454224933376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933376, "dur": 0, "args": { "External id": 220806, "cbid": 251, "correlation": 220806 } }, { "ph": "f", "id": 220806, "pid": 76337, "tid": -914061504, "ts": 1716454224933376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933377, "dur": 0, "args": { "External id": 220807, "cbid": 251, "correlation": 220807 } }, { "ph": "f", "id": 220807, "pid": 76337, "tid": -914061504, "ts": 1716454224933377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933378, "dur": 0, "args": { "External id": 220808, "cbid": 251, "correlation": 220808 } }, { "ph": "f", "id": 220808, "pid": 76337, "tid": -914061504, "ts": 1716454224933378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933379, "dur": 0, "args": { "External id": 220809, "cbid": 251, "correlation": 220809 } }, { "ph": "f", "id": 220809, "pid": 76337, "tid": -914061504, "ts": 1716454224933379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933379, "dur": 0, "args": { "External id": 220810, "cbid": 251, "correlation": 220810 } }, { "ph": "f", "id": 220810, "pid": 76337, "tid": -914061504, "ts": 1716454224933379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933380, "dur": 0, "args": { "External id": 220811, "cbid": 251, "correlation": 220811 } }, { "ph": "f", "id": 220811, "pid": 76337, "tid": -914061504, "ts": 1716454224933380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933381, "dur": 0, "args": { "External id": 220812, "cbid": 251, "correlation": 220812 } }, { "ph": "f", "id": 220812, "pid": 76337, "tid": -914061504, "ts": 1716454224933381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224978949, "dur": 53, "args": { "External id": 220813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220813, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220813, "pid": 5, "tid": 7, "ts": 1716454224978949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933383, "dur": 12, "args": { "External id": 220813, "cbid": 211, "correlation": 220813 } }, { "ph": "s", "id": 220813, "pid": 76337, "tid": -914061504, "ts": 1716454224933383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224979003, "dur": 33, "args": { "External id": 220819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220819, "pid": 5, "tid": 7, "ts": 1716454224979003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933416, "dur": 10, "args": { "External id": 220819, "cbid": 211, "correlation": 220819 } }, { "ph": "s", "id": 220819, "pid": 76337, "tid": -914061504, "ts": 1716454224933416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224979037, "dur": 27, "args": { "External id": 220827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220827, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220827, "pid": 5, "tid": 7, "ts": 1716454224979037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933447, "dur": 8, "args": { "External id": 220827, "cbid": 211, "correlation": 220827 } }, { "ph": "s", "id": 220827, "pid": 76337, "tid": -914061504, "ts": 1716454224933447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224979066, "dur": 19, "args": { "External id": 220835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220835, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220835, "pid": 5, "tid": 7, "ts": 1716454224979066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933476, "dur": 9, "args": { "External id": 220835, "cbid": 211, "correlation": 220835 } }, { "ph": "s", "id": 220835, "pid": 76337, "tid": -914061504, "ts": 1716454224933476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224979086, "dur": 30, "args": { "External id": 220855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220855, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 220855, "pid": 5, "tid": 7, "ts": 1716454224979086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933559, "dur": 12, "args": { "External id": 220855, "cbid": 211, "correlation": 220855 } }, { "ph": "s", "id": 220855, "pid": 76337, "tid": -914061504, "ts": 1716454224933559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224979118, "dur": 4, "args": { "External id": 220867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220867, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 220867, "pid": 5, "tid": 7, "ts": 1716454224979118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933580, "dur": 7, "args": { "External id": 220867, "cbid": 211, "correlation": 220867 } }, { "ph": "s", "id": 220867, "pid": 76337, "tid": -914061504, "ts": 1716454224933580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224979123, "dur": 31, "args": { "External id": 220870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220870, "pid": 5, "tid": 7, "ts": 1716454224979123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933599, "dur": 6, "args": { "External id": 220870, "cbid": 211, "correlation": 220870 } }, { "ph": "s", "id": 220870, "pid": 76337, "tid": -914061504, "ts": 1716454224933599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224933656, "dur": 0, "args": { "External id": 220881, "cbid": 317, "correlation": 220881 } }, { "ph": "f", "id": 220881, "pid": 76337, "tid": -914061504, "ts": 1716454224933656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224933657, "dur": 0, "args": { "External id": 220882, "cbid": 203, "correlation": 220882 } }, { "ph": "f", "id": 220882, "pid": 76337, "tid": -914061504, "ts": 1716454224933657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224933657, "dur": 0, "args": { "External id": 220883, "cbid": 205, "correlation": 220883 } }, { "ph": "f", "id": 220883, "pid": 76337, "tid": -914061504, "ts": 1716454224933657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224979155, "dur": 22, "args": { "External id": 220887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220887, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220887, "pid": 5, "tid": 7, "ts": 1716454224979155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933672, "dur": 11, "args": { "External id": 220887, "cbid": 211, "correlation": 220887 } }, { "ph": "s", "id": 220887, "pid": 76337, "tid": -914061504, "ts": 1716454224933672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224979178, "dur": 122, "args": { "External id": 220889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220889, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220889, "pid": 5, "tid": 7, "ts": 1716454224979178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933692, "dur": 9, "args": { "External id": 220889, "cbid": 211, "correlation": 220889 } }, { "ph": "s", "id": 220889, "pid": 76337, "tid": -914061504, "ts": 1716454224933692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224979301, "dur": 24, "args": { "External id": 220891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220891, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220891, "pid": 5, "tid": 7, "ts": 1716454224979301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933704, "dur": 5, "args": { "External id": 220891, "cbid": 211, "correlation": 220891 } }, { "ph": "s", "id": 220891, "pid": 76337, "tid": -914061504, "ts": 1716454224933704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224979326, "dur": 33, "args": { "External id": 220897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220897, "pid": 5, "tid": 7, "ts": 1716454224979326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933732, "dur": 9, "args": { "External id": 220897, "cbid": 211, "correlation": 220897 } }, { "ph": "s", "id": 220897, "pid": 76337, "tid": -914061504, "ts": 1716454224933732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224979361, "dur": 177, "args": { "External id": 220906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220906, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220906, "pid": 5, "tid": 7, "ts": 1716454224979361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933814, "dur": 14, "args": { "External id": 220906, "cbid": 211, "correlation": 220906 } }, { "ph": "s", "id": 220906, "pid": 76337, "tid": -914061504, "ts": 1716454224933814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224979539, "dur": 65, "args": { "External id": 220928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220928, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 220928, "pid": 5, "tid": 7, "ts": 1716454224979539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933870, "dur": 10, "args": { "External id": 220928, "cbid": 211, "correlation": 220928 } }, { "ph": "s", "id": 220928, "pid": 76337, "tid": -914061504, "ts": 1716454224933870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224933962, "dur": 1, "args": { "External id": 220939, "cbid": 251, "correlation": 220939 } }, { "ph": "f", "id": 220939, "pid": 76337, "tid": -914061504, "ts": 1716454224933962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224979606, "dur": 156, "args": { "External id": 220940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220940, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220940, "pid": 5, "tid": 7, "ts": 1716454224979606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224933967, "dur": 21, "args": { "External id": 220940, "cbid": 211, "correlation": 220940 } }, { "ph": "s", "id": 220940, "pid": 76337, "tid": -914061504, "ts": 1716454224933967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934046, "dur": 1, "args": { "External id": 220951, "cbid": 251, "correlation": 220951 } }, { "ph": "f", "id": 220951, "pid": 76337, "tid": -914061504, "ts": 1716454224934046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224979763, "dur": 147, "args": { "External id": 220952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220952, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220952, "pid": 5, "tid": 7, "ts": 1716454224979763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934050, "dur": 11, "args": { "External id": 220952, "cbid": 211, "correlation": 220952 } }, { "ph": "s", "id": 220952, "pid": 76337, "tid": -914061504, "ts": 1716454224934050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934116, "dur": 1, "args": { "External id": 220963, "cbid": 251, "correlation": 220963 } }, { "ph": "f", "id": 220963, "pid": 76337, "tid": -914061504, "ts": 1716454224934116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224979912, "dur": 145, "args": { "External id": 220964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220964, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 220964, "pid": 5, "tid": 7, "ts": 1716454224979912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934120, "dur": 10, "args": { "External id": 220964, "cbid": 211, "correlation": 220964 } }, { "ph": "s", "id": 220964, "pid": 76337, "tid": -914061504, "ts": 1716454224934120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224980058, "dur": 1955, "args": { "External id": 220985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 220985, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 220985, "pid": 5, "tid": 7, "ts": 1716454224980058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934203, "dur": 14, "args": { "External id": 220985, "cbid": 211, "correlation": 220985 } }, { "ph": "s", "id": 220985, "pid": 76337, "tid": -914061504, "ts": 1716454224934203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934303, "dur": 1, "args": { "External id": 221003, "cbid": 251, "correlation": 221003 } }, { "ph": "f", "id": 221003, "pid": 76337, "tid": -914061504, "ts": 1716454224934303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224982015, "dur": 147, "args": { "External id": 221005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221005, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 221005, "pid": 5, "tid": 7, "ts": 1716454224982015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934309, "dur": 13, "args": { "External id": 221005, "cbid": 211, "correlation": 221005 } }, { "ph": "s", "id": 221005, "pid": 76337, "tid": -914061504, "ts": 1716454224934309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224982163, "dur": 35, "args": { "External id": 221013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221013, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221013, "pid": 5, "tid": 7, "ts": 1716454224982163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934381, "dur": 13, "args": { "External id": 221013, "cbid": 211, "correlation": 221013 } }, { "ph": "s", "id": 221013, "pid": 76337, "tid": -914061504, "ts": 1716454224934381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224982200, "dur": 50, "args": { "External id": 221021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221021, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221021, "pid": 5, "tid": 7, "ts": 1716454224982200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934420, "dur": 9, "args": { "External id": 221021, "cbid": 211, "correlation": 221021 } }, { "ph": "s", "id": 221021, "pid": 76337, "tid": -914061504, "ts": 1716454224934420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224982252, "dur": 30, "args": { "External id": 221032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221032, "pid": 5, "tid": 7, "ts": 1716454224982252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934494, "dur": 12, "args": { "External id": 221032, "cbid": 211, "correlation": 221032 } }, { "ph": "s", "id": 221032, "pid": 76337, "tid": -914061504, "ts": 1716454224934494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224982283, "dur": 34, "args": { "External id": 221054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221054, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221054, "pid": 5, "tid": 7, "ts": 1716454224982283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934524, "dur": 7, "args": { "External id": 221054, "cbid": 211, "correlation": 221054 } }, { "ph": "s", "id": 221054, "pid": 76337, "tid": -914061504, "ts": 1716454224934524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934609, "dur": 1, "args": { "External id": 221065, "cbid": 251, "correlation": 221065 } }, { "ph": "f", "id": 221065, "pid": 76337, "tid": -914061504, "ts": 1716454224934609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224982318, "dur": 88, "args": { "External id": 221066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221066, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221066, "pid": 5, "tid": 7, "ts": 1716454224982318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934614, "dur": 12, "args": { "External id": 221066, "cbid": 211, "correlation": 221066 } }, { "ph": "s", "id": 221066, "pid": 76337, "tid": -914061504, "ts": 1716454224934614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934684, "dur": 1, "args": { "External id": 221077, "cbid": 251, "correlation": 221077 } }, { "ph": "f", "id": 221077, "pid": 76337, "tid": -914061504, "ts": 1716454224934684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934688, "dur": 0, "args": { "External id": 221078, "cbid": 251, "correlation": 221078 } }, { "ph": "f", "id": 221078, "pid": 76337, "tid": -914061504, "ts": 1716454224934688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224982408, "dur": 11, "args": { "External id": 221079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221079, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 221079, "pid": 5, "tid": 7, "ts": 1716454224982408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934690, "dur": 11, "args": { "External id": 221079, "cbid": 211, "correlation": 221079 } }, { "ph": "s", "id": 221079, "pid": 76337, "tid": -914061504, "ts": 1716454224934690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224982420, "dur": 5, "args": { "External id": 221081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221081, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 221081, "pid": 5, "tid": 7, "ts": 1716454224982420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934703, "dur": 8, "args": { "External id": 221081, "cbid": 211, "correlation": 221081 } }, { "ph": "s", "id": 221081, "pid": 76337, "tid": -914061504, "ts": 1716454224934703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934762, "dur": 1, "args": { "External id": 221092, "cbid": 251, "correlation": 221092 } }, { "ph": "f", "id": 221092, "pid": 76337, "tid": -914061504, "ts": 1716454224934762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934765, "dur": 0, "args": { "External id": 221093, "cbid": 251, "correlation": 221093 } }, { "ph": "f", "id": 221093, "pid": 76337, "tid": -914061504, "ts": 1716454224934765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224982426, "dur": 7, "args": { "External id": 221094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221094, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 221094, "pid": 5, "tid": 7, "ts": 1716454224982426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934767, "dur": 11, "args": { "External id": 221094, "cbid": 211, "correlation": 221094 } }, { "ph": "s", "id": 221094, "pid": 76337, "tid": -914061504, "ts": 1716454224934767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224982434, "dur": 3, "args": { "External id": 221096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 221096, "pid": 5, "tid": 7, "ts": 1716454224982434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934780, "dur": 6, "args": { "External id": 221096, "cbid": 211, "correlation": 221096 } }, { "ph": "s", "id": 221096, "pid": 76337, "tid": -914061504, "ts": 1716454224934780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224982439, "dur": 92, "args": { "External id": 221117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221117, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 221117, "pid": 5, "tid": 7, "ts": 1716454224982439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934854, "dur": 12, "args": { "External id": 221117, "cbid": 211, "correlation": 221117 } }, { "ph": "s", "id": 221117, "pid": 76337, "tid": -914061504, "ts": 1716454224934854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224934950, "dur": 1, "args": { "External id": 221135, "cbid": 251, "correlation": 221135 } }, { "ph": "f", "id": 221135, "pid": 76337, "tid": -914061504, "ts": 1716454224934950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224982532, "dur": 98, "args": { "External id": 221137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221137, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221137, "pid": 5, "tid": 7, "ts": 1716454224982532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224934956, "dur": 13, "args": { "External id": 221137, "cbid": 211, "correlation": 221137 } }, { "ph": "s", "id": 221137, "pid": 76337, "tid": -914061504, "ts": 1716454224934956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224982631, "dur": 19, "args": { "External id": 221145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221145, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221145, "pid": 5, "tid": 7, "ts": 1716454224982631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935032, "dur": 13, "args": { "External id": 221145, "cbid": 211, "correlation": 221145 } }, { "ph": "s", "id": 221145, "pid": 76337, "tid": -914061504, "ts": 1716454224935032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224982652, "dur": 37, "args": { "External id": 221153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221153, "pid": 5, "tid": 7, "ts": 1716454224982652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935075, "dur": 9, "args": { "External id": 221153, "cbid": 211, "correlation": 221153 } }, { "ph": "s", "id": 221153, "pid": 76337, "tid": -914061504, "ts": 1716454224935075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224982690, "dur": 35, "args": { "External id": 221175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221175, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221175, "pid": 5, "tid": 7, "ts": 1716454224982690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935127, "dur": 11, "args": { "External id": 221175, "cbid": 211, "correlation": 221175 } }, { "ph": "s", "id": 221175, "pid": 76337, "tid": -914061504, "ts": 1716454224935127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224935216, "dur": 1, "args": { "External id": 221191, "cbid": 251, "correlation": 221191 } }, { "ph": "f", "id": 221191, "pid": 76337, "tid": -914061504, "ts": 1716454224935216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224935221, "dur": 0, "args": { "External id": 221193, "cbid": 251, "correlation": 221193 } }, { "ph": "f", "id": 221193, "pid": 76337, "tid": -914061504, "ts": 1716454224935221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224982726, "dur": 536, "args": { "External id": 221194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221194, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 221194, "pid": 5, "tid": 7, "ts": 1716454224982726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935224, "dur": 13, "args": { "External id": 221194, "cbid": 211, "correlation": 221194 } }, { "ph": "s", "id": 221194, "pid": 76337, "tid": -914061504, "ts": 1716454224935224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224983264, "dur": 124, "args": { "External id": 221202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221202, "pid": 5, "tid": 7, "ts": 1716454224983264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935291, "dur": 12, "args": { "External id": 221202, "cbid": 211, "correlation": 221202 } }, { "ph": "s", "id": 221202, "pid": 76337, "tid": -914061504, "ts": 1716454224935291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224983389, "dur": 126, "args": { "External id": 221210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221210, "pid": 5, "tid": 7, "ts": 1716454224983389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935322, "dur": 8, "args": { "External id": 221210, "cbid": 211, "correlation": 221210 } }, { "ph": "s", "id": 221210, "pid": 76337, "tid": -914061504, "ts": 1716454224935322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224935400, "dur": 1, "args": { "External id": 221226, "cbid": 251, "correlation": 221226 } }, { "ph": "f", "id": 221226, "pid": 76337, "tid": -914061504, "ts": 1716454224935400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224983517, "dur": 300, "args": { "External id": 221228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221228, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221228, "pid": 5, "tid": 7, "ts": 1716454224983517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935406, "dur": 12, "args": { "External id": 221228, "cbid": 211, "correlation": 221228 } }, { "ph": "s", "id": 221228, "pid": 76337, "tid": -914061504, "ts": 1716454224935406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224983818, "dur": 27, "args": { "External id": 221236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221236, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221236, "pid": 5, "tid": 7, "ts": 1716454224983818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935447, "dur": 9, "args": { "External id": 221236, "cbid": 211, "correlation": 221236 } }, { "ph": "s", "id": 221236, "pid": 76337, "tid": -914061504, "ts": 1716454224935447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224983846, "dur": 80, "args": { "External id": 221247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221247, "pid": 5, "tid": 7, "ts": 1716454224983846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935515, "dur": 12, "args": { "External id": 221247, "cbid": 211, "correlation": 221247 } }, { "ph": "s", "id": 221247, "pid": 76337, "tid": -914061504, "ts": 1716454224935515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224935579, "dur": 0, "args": { "External id": 221259, "cbid": 317, "correlation": 221259 } }, { "ph": "f", "id": 221259, "pid": 76337, "tid": -914061504, "ts": 1716454224935579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224935580, "dur": 0, "args": { "External id": 221260, "cbid": 203, "correlation": 221260 } }, { "ph": "f", "id": 221260, "pid": 76337, "tid": -914061504, "ts": 1716454224935580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224935581, "dur": 0, "args": { "External id": 221261, "cbid": 205, "correlation": 221261 } }, { "ph": "f", "id": 221261, "pid": 76337, "tid": -914061504, "ts": 1716454224935581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224983928, "dur": 22, "args": { "External id": 221265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221265, "pid": 5, "tid": 7, "ts": 1716454224983928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935599, "dur": 12, "args": { "External id": 221265, "cbid": 211, "correlation": 221265 } }, { "ph": "s", "id": 221265, "pid": 76337, "tid": -914061504, "ts": 1716454224935599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224983951, "dur": 118, "args": { "External id": 221267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221267, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221267, "pid": 5, "tid": 7, "ts": 1716454224983951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935617, "dur": 6, "args": { "External id": 221267, "cbid": 211, "correlation": 221267 } }, { "ph": "s", "id": 221267, "pid": 76337, "tid": -914061504, "ts": 1716454224935617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224984070, "dur": 23, "args": { "External id": 221269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221269, "pid": 5, "tid": 7, "ts": 1716454224984070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935627, "dur": 6, "args": { "External id": 221269, "cbid": 211, "correlation": 221269 } }, { "ph": "s", "id": 221269, "pid": 76337, "tid": -914061504, "ts": 1716454224935627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224984095, "dur": 33, "args": { "External id": 221275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221275, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221275, "pid": 5, "tid": 7, "ts": 1716454224984095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935656, "dur": 8, "args": { "External id": 221275, "cbid": 211, "correlation": 221275 } }, { "ph": "s", "id": 221275, "pid": 76337, "tid": -914061504, "ts": 1716454224935656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224984129, "dur": 27, "args": { "External id": 221283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221283, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221283, "pid": 5, "tid": 7, "ts": 1716454224984129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935688, "dur": 8, "args": { "External id": 221283, "cbid": 211, "correlation": 221283 } }, { "ph": "s", "id": 221283, "pid": 76337, "tid": -914061504, "ts": 1716454224935688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224984157, "dur": 31, "args": { "External id": 221303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221303, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 221303, "pid": 5, "tid": 7, "ts": 1716454224984157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935760, "dur": 11, "args": { "External id": 221303, "cbid": 211, "correlation": 221303 } }, { "ph": "s", "id": 221303, "pid": 76337, "tid": -914061504, "ts": 1716454224935760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224984189, "dur": 4, "args": { "External id": 221315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221315, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 221315, "pid": 5, "tid": 7, "ts": 1716454224984189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935781, "dur": 7, "args": { "External id": 221315, "cbid": 211, "correlation": 221315 } }, { "ph": "s", "id": 221315, "pid": 76337, "tid": -914061504, "ts": 1716454224935781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224984195, "dur": 30, "args": { "External id": 221318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221318, "pid": 5, "tid": 7, "ts": 1716454224984195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935799, "dur": 6, "args": { "External id": 221318, "cbid": 211, "correlation": 221318 } }, { "ph": "s", "id": 221318, "pid": 76337, "tid": -914061504, "ts": 1716454224935799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224984226, "dur": 21, "args": { "External id": 221327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221327, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221327, "pid": 5, "tid": 7, "ts": 1716454224984226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935839, "dur": 9, "args": { "External id": 221327, "cbid": 211, "correlation": 221327 } }, { "ph": "s", "id": 221327, "pid": 76337, "tid": -914061504, "ts": 1716454224935839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224935890, "dur": 0, "args": { "External id": 221337, "cbid": 317, "correlation": 221337 } }, { "ph": "f", "id": 221337, "pid": 76337, "tid": -914061504, "ts": 1716454224935890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224935891, "dur": 0, "args": { "External id": 221338, "cbid": 203, "correlation": 221338 } }, { "ph": "f", "id": 221338, "pid": 76337, "tid": -914061504, "ts": 1716454224935891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224935891, "dur": 0, "args": { "External id": 221339, "cbid": 205, "correlation": 221339 } }, { "ph": "f", "id": 221339, "pid": 76337, "tid": -914061504, "ts": 1716454224935891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224984248, "dur": 22, "args": { "External id": 221343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221343, "pid": 5, "tid": 7, "ts": 1716454224984248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935904, "dur": 11, "args": { "External id": 221343, "cbid": 211, "correlation": 221343 } }, { "ph": "s", "id": 221343, "pid": 76337, "tid": -914061504, "ts": 1716454224935904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224984272, "dur": 43, "args": { "External id": 221345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221345, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221345, "pid": 5, "tid": 7, "ts": 1716454224984272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935917, "dur": 6, "args": { "External id": 221345, "cbid": 211, "correlation": 221345 } }, { "ph": "s", "id": 221345, "pid": 76337, "tid": -914061504, "ts": 1716454224935917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224984317, "dur": 649, "args": { "External id": 221347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221347, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221347, "pid": 5, "tid": 7, "ts": 1716454224984317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935929, "dur": 6, "args": { "External id": 221347, "cbid": 211, "correlation": 221347 } }, { "ph": "s", "id": 221347, "pid": 76337, "tid": -914061504, "ts": 1716454224935929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224984967, "dur": 22, "args": { "External id": 221349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221349, "pid": 5, "tid": 7, "ts": 1716454224984967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935939, "dur": 5, "args": { "External id": 221349, "cbid": 211, "correlation": 221349 } }, { "ph": "s", "id": 221349, "pid": 76337, "tid": -914061504, "ts": 1716454224935939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224984990, "dur": 32, "args": { "External id": 221355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221355, "pid": 5, "tid": 7, "ts": 1716454224984990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224935966, "dur": 23, "args": { "External id": 221355, "cbid": 211, "correlation": 221355 } }, { "ph": "s", "id": 221355, "pid": 76337, "tid": -914061504, "ts": 1716454224935966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224985024, "dur": 3, "args": { "External id": 221363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221363, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 221363, "pid": 5, "tid": 7, "ts": 1716454224985024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936025, "dur": 10, "args": { "External id": 221363, "cbid": 211, "correlation": 221363 } }, { "ph": "s", "id": 221363, "pid": 76337, "tid": -914061504, "ts": 1716454224936025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224936091, "dur": 1, "args": { "External id": 221379, "cbid": 251, "correlation": 221379 } }, { "ph": "f", "id": 221379, "pid": 76337, "tid": -914061504, "ts": 1716454224936091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224936096, "dur": 0, "args": { "External id": 221381, "cbid": 251, "correlation": 221381 } }, { "ph": "f", "id": 221381, "pid": 76337, "tid": -914061504, "ts": 1716454224936096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224985028, "dur": 12, "args": { "External id": 221382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221382, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 221382, "pid": 5, "tid": 7, "ts": 1716454224985028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936098, "dur": 12, "args": { "External id": 221382, "cbid": 211, "correlation": 221382 } }, { "ph": "s", "id": 221382, "pid": 76337, "tid": -914061504, "ts": 1716454224936098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224985042, "dur": 5, "args": { "External id": 221384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221384, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 221384, "pid": 5, "tid": 7, "ts": 1716454224985042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936111, "dur": 5, "args": { "External id": 221384, "cbid": 211, "correlation": 221384 } }, { "ph": "s", "id": 221384, "pid": 76337, "tid": -914061504, "ts": 1716454224936111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224985048, "dur": 29, "args": { "External id": 221394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221394, "pid": 5, "tid": 7, "ts": 1716454224985048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936170, "dur": 12, "args": { "External id": 221394, "cbid": 211, "correlation": 221394 } }, { "ph": "s", "id": 221394, "pid": 76337, "tid": -914061504, "ts": 1716454224936170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224985078, "dur": 31, "args": { "External id": 221414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221414, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 221414, "pid": 5, "tid": 7, "ts": 1716454224985078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936235, "dur": 11, "args": { "External id": 221414, "cbid": 211, "correlation": 221414 } }, { "ph": "s", "id": 221414, "pid": 76337, "tid": -914061504, "ts": 1716454224936235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224985111, "dur": 4, "args": { "External id": 221426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221426, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 221426, "pid": 5, "tid": 7, "ts": 1716454224985111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936256, "dur": 6, "args": { "External id": 221426, "cbid": 211, "correlation": 221426 } }, { "ph": "s", "id": 221426, "pid": 76337, "tid": -914061504, "ts": 1716454224936256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224985116, "dur": 29, "args": { "External id": 221429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221429, "pid": 5, "tid": 7, "ts": 1716454224985116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936274, "dur": 6, "args": { "External id": 221429, "cbid": 211, "correlation": 221429 } }, { "ph": "s", "id": 221429, "pid": 76337, "tid": -914061504, "ts": 1716454224936274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224985147, "dur": 20, "args": { "External id": 221438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221438, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221438, "pid": 5, "tid": 7, "ts": 1716454224985147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936314, "dur": 9, "args": { "External id": 221438, "cbid": 211, "correlation": 221438 } }, { "ph": "s", "id": 221438, "pid": 76337, "tid": -914061504, "ts": 1716454224936314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224936375, "dur": 0, "args": { "External id": 221448, "cbid": 317, "correlation": 221448 } }, { "ph": "f", "id": 221448, "pid": 76337, "tid": -914061504, "ts": 1716454224936375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224936376, "dur": 0, "args": { "External id": 221449, "cbid": 203, "correlation": 221449 } }, { "ph": "f", "id": 221449, "pid": 76337, "tid": -914061504, "ts": 1716454224936376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224936377, "dur": 0, "args": { "External id": 221450, "cbid": 205, "correlation": 221450 } }, { "ph": "f", "id": 221450, "pid": 76337, "tid": -914061504, "ts": 1716454224936377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224985168, "dur": 23, "args": { "External id": 221454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221454, "pid": 5, "tid": 7, "ts": 1716454224985168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936391, "dur": 11, "args": { "External id": 221454, "cbid": 211, "correlation": 221454 } }, { "ph": "s", "id": 221454, "pid": 76337, "tid": -914061504, "ts": 1716454224936391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224985192, "dur": 44, "args": { "External id": 221456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221456, "pid": 5, "tid": 7, "ts": 1716454224985192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936405, "dur": 5, "args": { "External id": 221456, "cbid": 211, "correlation": 221456 } }, { "ph": "s", "id": 221456, "pid": 76337, "tid": -914061504, "ts": 1716454224936405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224985237, "dur": 638, "args": { "External id": 221458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221458, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221458, "pid": 5, "tid": 7, "ts": 1716454224985237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936416, "dur": 7, "args": { "External id": 221458, "cbid": 211, "correlation": 221458 } }, { "ph": "s", "id": 221458, "pid": 76337, "tid": -914061504, "ts": 1716454224936416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224985876, "dur": 21, "args": { "External id": 221460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221460, "pid": 5, "tid": 7, "ts": 1716454224985876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936426, "dur": 5, "args": { "External id": 221460, "cbid": 211, "correlation": 221460 } }, { "ph": "s", "id": 221460, "pid": 76337, "tid": -914061504, "ts": 1716454224936426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224985898, "dur": 33, "args": { "External id": 221466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221466, "pid": 5, "tid": 7, "ts": 1716454224985898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936454, "dur": 8, "args": { "External id": 221466, "cbid": 211, "correlation": 221466 } }, { "ph": "s", "id": 221466, "pid": 76337, "tid": -914061504, "ts": 1716454224936454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224985932, "dur": 27, "args": { "External id": 221474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221474, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221474, "pid": 5, "tid": 7, "ts": 1716454224985932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936486, "dur": 9, "args": { "External id": 221474, "cbid": 211, "correlation": 221474 } }, { "ph": "s", "id": 221474, "pid": 76337, "tid": -914061504, "ts": 1716454224936486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224985960, "dur": 20, "args": { "External id": 221482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221482, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221482, "pid": 5, "tid": 7, "ts": 1716454224985960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936514, "dur": 8, "args": { "External id": 221482, "cbid": 211, "correlation": 221482 } }, { "ph": "s", "id": 221482, "pid": 76337, "tid": -914061504, "ts": 1716454224936514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224985982, "dur": 29, "args": { "External id": 221502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221502, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 221502, "pid": 5, "tid": 7, "ts": 1716454224985982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936592, "dur": 12, "args": { "External id": 221502, "cbid": 211, "correlation": 221502 } }, { "ph": "s", "id": 221502, "pid": 76337, "tid": -914061504, "ts": 1716454224936592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224986012, "dur": 4, "args": { "External id": 221514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221514, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 221514, "pid": 5, "tid": 7, "ts": 1716454224986012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936613, "dur": 6, "args": { "External id": 221514, "cbid": 211, "correlation": 221514 } }, { "ph": "s", "id": 221514, "pid": 76337, "tid": -914061504, "ts": 1716454224936613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224986017, "dur": 29, "args": { "External id": 221517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221517, "pid": 5, "tid": 7, "ts": 1716454224986017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936631, "dur": 6, "args": { "External id": 221517, "cbid": 211, "correlation": 221517 } }, { "ph": "s", "id": 221517, "pid": 76337, "tid": -914061504, "ts": 1716454224936631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224936688, "dur": 0, "args": { "External id": 221528, "cbid": 317, "correlation": 221528 } }, { "ph": "f", "id": 221528, "pid": 76337, "tid": -914061504, "ts": 1716454224936688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224936688, "dur": 0, "args": { "External id": 221529, "cbid": 203, "correlation": 221529 } }, { "ph": "f", "id": 221529, "pid": 76337, "tid": -914061504, "ts": 1716454224936688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224936689, "dur": 0, "args": { "External id": 221530, "cbid": 205, "correlation": 221530 } }, { "ph": "f", "id": 221530, "pid": 76337, "tid": -914061504, "ts": 1716454224936689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224986048, "dur": 22, "args": { "External id": 221534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221534, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221534, "pid": 5, "tid": 7, "ts": 1716454224986048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936703, "dur": 12, "args": { "External id": 221534, "cbid": 211, "correlation": 221534 } }, { "ph": "s", "id": 221534, "pid": 76337, "tid": -914061504, "ts": 1716454224936703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224986071, "dur": 114, "args": { "External id": 221536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221536, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221536, "pid": 5, "tid": 7, "ts": 1716454224986071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936721, "dur": 6, "args": { "External id": 221536, "cbid": 211, "correlation": 221536 } }, { "ph": "s", "id": 221536, "pid": 76337, "tid": -914061504, "ts": 1716454224936721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224986186, "dur": 21, "args": { "External id": 221538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221538, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221538, "pid": 5, "tid": 7, "ts": 1716454224986186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936730, "dur": 5, "args": { "External id": 221538, "cbid": 211, "correlation": 221538 } }, { "ph": "s", "id": 221538, "pid": 76337, "tid": -914061504, "ts": 1716454224936730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224986208, "dur": 32, "args": { "External id": 221544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221544, "pid": 5, "tid": 7, "ts": 1716454224986208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936758, "dur": 8, "args": { "External id": 221544, "cbid": 211, "correlation": 221544 } }, { "ph": "s", "id": 221544, "pid": 76337, "tid": -914061504, "ts": 1716454224936758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224986241, "dur": 187, "args": { "External id": 221553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221553, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221553, "pid": 5, "tid": 7, "ts": 1716454224986241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936839, "dur": 14, "args": { "External id": 221553, "cbid": 211, "correlation": 221553 } }, { "ph": "s", "id": 221553, "pid": 76337, "tid": -914061504, "ts": 1716454224936839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224986429, "dur": 64, "args": { "External id": 221575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221575, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221575, "pid": 5, "tid": 7, "ts": 1716454224986429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936896, "dur": 10, "args": { "External id": 221575, "cbid": 211, "correlation": 221575 } }, { "ph": "s", "id": 221575, "pid": 76337, "tid": -914061504, "ts": 1716454224936896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224936989, "dur": 1, "args": { "External id": 221586, "cbid": 251, "correlation": 221586 } }, { "ph": "f", "id": 221586, "pid": 76337, "tid": -914061504, "ts": 1716454224936989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224986495, "dur": 151, "args": { "External id": 221587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221587, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221587, "pid": 5, "tid": 7, "ts": 1716454224986495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224936995, "dur": 13, "args": { "External id": 221587, "cbid": 211, "correlation": 221587 } }, { "ph": "s", "id": 221587, "pid": 76337, "tid": -914061504, "ts": 1716454224936995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937066, "dur": 1, "args": { "External id": 221598, "cbid": 251, "correlation": 221598 } }, { "ph": "f", "id": 221598, "pid": 76337, "tid": -914061504, "ts": 1716454224937066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224986647, "dur": 147, "args": { "External id": 221599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221599, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221599, "pid": 5, "tid": 7, "ts": 1716454224986647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937070, "dur": 11, "args": { "External id": 221599, "cbid": 211, "correlation": 221599 } }, { "ph": "s", "id": 221599, "pid": 76337, "tid": -914061504, "ts": 1716454224937070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937134, "dur": 1, "args": { "External id": 221610, "cbid": 251, "correlation": 221610 } }, { "ph": "f", "id": 221610, "pid": 76337, "tid": -914061504, "ts": 1716454224937134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224986796, "dur": 143, "args": { "External id": 221611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221611, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221611, "pid": 5, "tid": 7, "ts": 1716454224986796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937138, "dur": 13, "args": { "External id": 221611, "cbid": 211, "correlation": 221611 } }, { "ph": "s", "id": 221611, "pid": 76337, "tid": -914061504, "ts": 1716454224937138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224986940, "dur": 1909, "args": { "External id": 221632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221632, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 221632, "pid": 5, "tid": 7, "ts": 1716454224986940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937219, "dur": 12, "args": { "External id": 221632, "cbid": 211, "correlation": 221632 } }, { "ph": "s", "id": 221632, "pid": 76337, "tid": -914061504, "ts": 1716454224937219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937318, "dur": 1, "args": { "External id": 221650, "cbid": 251, "correlation": 221650 } }, { "ph": "f", "id": 221650, "pid": 76337, "tid": -914061504, "ts": 1716454224937318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224988851, "dur": 146, "args": { "External id": 221652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221652, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 221652, "pid": 5, "tid": 7, "ts": 1716454224988851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937323, "dur": 13, "args": { "External id": 221652, "cbid": 211, "correlation": 221652 } }, { "ph": "s", "id": 221652, "pid": 76337, "tid": -914061504, "ts": 1716454224937323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224988998, "dur": 35, "args": { "External id": 221660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221660, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221660, "pid": 5, "tid": 7, "ts": 1716454224988998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937394, "dur": 13, "args": { "External id": 221660, "cbid": 211, "correlation": 221660 } }, { "ph": "s", "id": 221660, "pid": 76337, "tid": -914061504, "ts": 1716454224937394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224989034, "dur": 50, "args": { "External id": 221668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221668, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221668, "pid": 5, "tid": 7, "ts": 1716454224989034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937434, "dur": 8, "args": { "External id": 221668, "cbid": 211, "correlation": 221668 } }, { "ph": "s", "id": 221668, "pid": 76337, "tid": -914061504, "ts": 1716454224937434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224989085, "dur": 30, "args": { "External id": 221679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221679, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221679, "pid": 5, "tid": 7, "ts": 1716454224989085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937506, "dur": 12, "args": { "External id": 221679, "cbid": 211, "correlation": 221679 } }, { "ph": "s", "id": 221679, "pid": 76337, "tid": -914061504, "ts": 1716454224937506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224989117, "dur": 34, "args": { "External id": 221701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221701, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221701, "pid": 5, "tid": 7, "ts": 1716454224989117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937536, "dur": 7, "args": { "External id": 221701, "cbid": 211, "correlation": 221701 } }, { "ph": "s", "id": 221701, "pid": 76337, "tid": -914061504, "ts": 1716454224937536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937621, "dur": 1, "args": { "External id": 221712, "cbid": 251, "correlation": 221712 } }, { "ph": "f", "id": 221712, "pid": 76337, "tid": -914061504, "ts": 1716454224937621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224989152, "dur": 87, "args": { "External id": 221713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221713, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221713, "pid": 5, "tid": 7, "ts": 1716454224989152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937626, "dur": 12, "args": { "External id": 221713, "cbid": 211, "correlation": 221713 } }, { "ph": "s", "id": 221713, "pid": 76337, "tid": -914061504, "ts": 1716454224937626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937694, "dur": 1, "args": { "External id": 221724, "cbid": 251, "correlation": 221724 } }, { "ph": "f", "id": 221724, "pid": 76337, "tid": -914061504, "ts": 1716454224937694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937698, "dur": 0, "args": { "External id": 221725, "cbid": 251, "correlation": 221725 } }, { "ph": "f", "id": 221725, "pid": 76337, "tid": -914061504, "ts": 1716454224937698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224989241, "dur": 11, "args": { "External id": 221726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221726, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 221726, "pid": 5, "tid": 7, "ts": 1716454224989241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937699, "dur": 13, "args": { "External id": 221726, "cbid": 211, "correlation": 221726 } }, { "ph": "s", "id": 221726, "pid": 76337, "tid": -914061504, "ts": 1716454224937699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224989253, "dur": 5, "args": { "External id": 221728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221728, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 221728, "pid": 5, "tid": 7, "ts": 1716454224989253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937714, "dur": 5, "args": { "External id": 221728, "cbid": 211, "correlation": 221728 } }, { "ph": "s", "id": 221728, "pid": 76337, "tid": -914061504, "ts": 1716454224937714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937771, "dur": 1, "args": { "External id": 221739, "cbid": 251, "correlation": 221739 } }, { "ph": "f", "id": 221739, "pid": 76337, "tid": -914061504, "ts": 1716454224937771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937774, "dur": 0, "args": { "External id": 221740, "cbid": 251, "correlation": 221740 } }, { "ph": "f", "id": 221740, "pid": 76337, "tid": -914061504, "ts": 1716454224937774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224989259, "dur": 7, "args": { "External id": 221741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221741, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 221741, "pid": 5, "tid": 7, "ts": 1716454224989259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937776, "dur": 11, "args": { "External id": 221741, "cbid": 211, "correlation": 221741 } }, { "ph": "s", "id": 221741, "pid": 76337, "tid": -914061504, "ts": 1716454224937776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224989267, "dur": 3, "args": { "External id": 221743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221743, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 221743, "pid": 5, "tid": 7, "ts": 1716454224989267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937788, "dur": 5, "args": { "External id": 221743, "cbid": 211, "correlation": 221743 } }, { "ph": "s", "id": 221743, "pid": 76337, "tid": -914061504, "ts": 1716454224937788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224989271, "dur": 90, "args": { "External id": 221764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221764, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 221764, "pid": 5, "tid": 7, "ts": 1716454224989271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937862, "dur": 12, "args": { "External id": 221764, "cbid": 211, "correlation": 221764 } }, { "ph": "s", "id": 221764, "pid": 76337, "tid": -914061504, "ts": 1716454224937862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224937958, "dur": 1, "args": { "External id": 221782, "cbid": 251, "correlation": 221782 } }, { "ph": "f", "id": 221782, "pid": 76337, "tid": -914061504, "ts": 1716454224937958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224989363, "dur": 96, "args": { "External id": 221784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221784, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221784, "pid": 5, "tid": 7, "ts": 1716454224989363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224937963, "dur": 22, "args": { "External id": 221784, "cbid": 211, "correlation": 221784 } }, { "ph": "s", "id": 221784, "pid": 76337, "tid": -914061504, "ts": 1716454224937963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224989460, "dur": 19, "args": { "External id": 221792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221792, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221792, "pid": 5, "tid": 7, "ts": 1716454224989460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938042, "dur": 13, "args": { "External id": 221792, "cbid": 211, "correlation": 221792 } }, { "ph": "s", "id": 221792, "pid": 76337, "tid": -914061504, "ts": 1716454224938042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224989480, "dur": 37, "args": { "External id": 221800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221800, "pid": 5, "tid": 7, "ts": 1716454224989480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938085, "dur": 10, "args": { "External id": 221800, "cbid": 211, "correlation": 221800 } }, { "ph": "s", "id": 221800, "pid": 76337, "tid": -914061504, "ts": 1716454224938085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224989518, "dur": 34, "args": { "External id": 221822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221822, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221822, "pid": 5, "tid": 7, "ts": 1716454224989518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938136, "dur": 11, "args": { "External id": 221822, "cbid": 211, "correlation": 221822 } }, { "ph": "s", "id": 221822, "pid": 76337, "tid": -914061504, "ts": 1716454224938136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224938224, "dur": 1, "args": { "External id": 221838, "cbid": 251, "correlation": 221838 } }, { "ph": "f", "id": 221838, "pid": 76337, "tid": -914061504, "ts": 1716454224938224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224938229, "dur": 0, "args": { "External id": 221840, "cbid": 251, "correlation": 221840 } }, { "ph": "f", "id": 221840, "pid": 76337, "tid": -914061504, "ts": 1716454224938229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224989553, "dur": 531, "args": { "External id": 221841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221841, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 221841, "pid": 5, "tid": 7, "ts": 1716454224989553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938233, "dur": 13, "args": { "External id": 221841, "cbid": 211, "correlation": 221841 } }, { "ph": "s", "id": 221841, "pid": 76337, "tid": -914061504, "ts": 1716454224938233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224990086, "dur": 124, "args": { "External id": 221849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221849, "pid": 5, "tid": 7, "ts": 1716454224990086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938297, "dur": 13, "args": { "External id": 221849, "cbid": 211, "correlation": 221849 } }, { "ph": "s", "id": 221849, "pid": 76337, "tid": -914061504, "ts": 1716454224938297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224990211, "dur": 128, "args": { "External id": 221857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221857, "pid": 5, "tid": 7, "ts": 1716454224990211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938327, "dur": 8, "args": { "External id": 221857, "cbid": 211, "correlation": 221857 } }, { "ph": "s", "id": 221857, "pid": 76337, "tid": -914061504, "ts": 1716454224938327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224938402, "dur": 1, "args": { "External id": 221873, "cbid": 251, "correlation": 221873 } }, { "ph": "f", "id": 221873, "pid": 76337, "tid": -914061504, "ts": 1716454224938402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224990340, "dur": 304, "args": { "External id": 221875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221875, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221875, "pid": 5, "tid": 7, "ts": 1716454224990340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938408, "dur": 12, "args": { "External id": 221875, "cbid": 211, "correlation": 221875 } }, { "ph": "s", "id": 221875, "pid": 76337, "tid": -914061504, "ts": 1716454224938408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224990645, "dur": 27, "args": { "External id": 221883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221883, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221883, "pid": 5, "tid": 7, "ts": 1716454224990645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938450, "dur": 9, "args": { "External id": 221883, "cbid": 211, "correlation": 221883 } }, { "ph": "s", "id": 221883, "pid": 76337, "tid": -914061504, "ts": 1716454224938450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224990674, "dur": 80, "args": { "External id": 221894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221894, "pid": 5, "tid": 7, "ts": 1716454224990674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938517, "dur": 12, "args": { "External id": 221894, "cbid": 211, "correlation": 221894 } }, { "ph": "s", "id": 221894, "pid": 76337, "tid": -914061504, "ts": 1716454224938517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224938580, "dur": 0, "args": { "External id": 221906, "cbid": 317, "correlation": 221906 } }, { "ph": "f", "id": 221906, "pid": 76337, "tid": -914061504, "ts": 1716454224938580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224938581, "dur": 0, "args": { "External id": 221907, "cbid": 203, "correlation": 221907 } }, { "ph": "f", "id": 221907, "pid": 76337, "tid": -914061504, "ts": 1716454224938581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224938582, "dur": 0, "args": { "External id": 221908, "cbid": 205, "correlation": 221908 } }, { "ph": "f", "id": 221908, "pid": 76337, "tid": -914061504, "ts": 1716454224938582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224990756, "dur": 22, "args": { "External id": 221912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221912, "pid": 5, "tid": 7, "ts": 1716454224990756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938598, "dur": 12, "args": { "External id": 221912, "cbid": 211, "correlation": 221912 } }, { "ph": "s", "id": 221912, "pid": 76337, "tid": -914061504, "ts": 1716454224938598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224990779, "dur": 118, "args": { "External id": 221914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221914, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 221914, "pid": 5, "tid": 7, "ts": 1716454224990779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938616, "dur": 7, "args": { "External id": 221914, "cbid": 211, "correlation": 221914 } }, { "ph": "s", "id": 221914, "pid": 76337, "tid": -914061504, "ts": 1716454224938616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224990898, "dur": 23, "args": { "External id": 221916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221916, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221916, "pid": 5, "tid": 7, "ts": 1716454224990898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938626, "dur": 5, "args": { "External id": 221916, "cbid": 211, "correlation": 221916 } }, { "ph": "s", "id": 221916, "pid": 76337, "tid": -914061504, "ts": 1716454224938626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224990923, "dur": 33, "args": { "External id": 221922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221922, "pid": 5, "tid": 7, "ts": 1716454224990923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938654, "dur": 8, "args": { "External id": 221922, "cbid": 211, "correlation": 221922 } }, { "ph": "s", "id": 221922, "pid": 76337, "tid": -914061504, "ts": 1716454224938654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224990957, "dur": 27, "args": { "External id": 221930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221930, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221930, "pid": 5, "tid": 7, "ts": 1716454224990957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938685, "dur": 8, "args": { "External id": 221930, "cbid": 211, "correlation": 221930 } }, { "ph": "s", "id": 221930, "pid": 76337, "tid": -914061504, "ts": 1716454224938685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224938756, "dur": 0, "args": { "External id": 221940, "cbid": 317, "correlation": 221940 } }, { "ph": "f", "id": 221940, "pid": 76337, "tid": -914061504, "ts": 1716454224938756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224938757, "dur": 0, "args": { "External id": 221941, "cbid": 203, "correlation": 221941 } }, { "ph": "f", "id": 221941, "pid": 76337, "tid": -914061504, "ts": 1716454224938757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224938758, "dur": 0, "args": { "External id": 221942, "cbid": 205, "correlation": 221942 } }, { "ph": "f", "id": 221942, "pid": 76337, "tid": -914061504, "ts": 1716454224938758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224990985, "dur": 24, "args": { "External id": 221946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221946, "pid": 5, "tid": 7, "ts": 1716454224990985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938771, "dur": 12, "args": { "External id": 221946, "cbid": 211, "correlation": 221946 } }, { "ph": "s", "id": 221946, "pid": 76337, "tid": -914061504, "ts": 1716454224938771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224991010, "dur": 43, "args": { "External id": 221948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221948, "pid": 5, "tid": 7, "ts": 1716454224991010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938786, "dur": 5, "args": { "External id": 221948, "cbid": 211, "correlation": 221948 } }, { "ph": "s", "id": 221948, "pid": 76337, "tid": -914061504, "ts": 1716454224938786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224991055, "dur": 233, "args": { "External id": 221950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221950, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 221950, "pid": 5, "tid": 7, "ts": 1716454224991055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938798, "dur": 7, "args": { "External id": 221950, "cbid": 211, "correlation": 221950 } }, { "ph": "s", "id": 221950, "pid": 76337, "tid": -914061504, "ts": 1716454224938798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224991290, "dur": 6, "args": { "External id": 221952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221952, "pid": 5, "tid": 7, "ts": 1716454224991290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938808, "dur": 5, "args": { "External id": 221952, "cbid": 211, "correlation": 221952 } }, { "ph": "s", "id": 221952, "pid": 76337, "tid": -914061504, "ts": 1716454224938808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224991297, "dur": 9, "args": { "External id": 221958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221958, "pid": 5, "tid": 7, "ts": 1716454224991297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938834, "dur": 10, "args": { "External id": 221958, "cbid": 211, "correlation": 221958 } }, { "ph": "s", "id": 221958, "pid": 76337, "tid": -914061504, "ts": 1716454224938834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224991307, "dur": 12, "args": { "External id": 221978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221978, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 221978, "pid": 5, "tid": 7, "ts": 1716454224991307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938926, "dur": 12, "args": { "External id": 221978, "cbid": 211, "correlation": 221978 } }, { "ph": "s", "id": 221978, "pid": 76337, "tid": -914061504, "ts": 1716454224938926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224991320, "dur": 4, "args": { "External id": 221990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221990, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 221990, "pid": 5, "tid": 7, "ts": 1716454224991320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938948, "dur": 6, "args": { "External id": 221990, "cbid": 211, "correlation": 221990 } }, { "ph": "s", "id": 221990, "pid": 76337, "tid": -914061504, "ts": 1716454224938948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224991325, "dur": 12, "args": { "External id": 221993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 221993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 221993, "pid": 5, "tid": 7, "ts": 1716454224991325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224938967, "dur": 15, "args": { "External id": 221993, "cbid": 211, "correlation": 221993 } }, { "ph": "s", "id": 221993, "pid": 76337, "tid": -914061504, "ts": 1716454224938967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224991339, "dur": 7, "args": { "External id": 222002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222002, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222002, "pid": 5, "tid": 7, "ts": 1716454224991339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939015, "dur": 10, "args": { "External id": 222002, "cbid": 211, "correlation": 222002 } }, { "ph": "s", "id": 222002, "pid": 76337, "tid": -914061504, "ts": 1716454224939015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224939069, "dur": 0, "args": { "External id": 222012, "cbid": 317, "correlation": 222012 } }, { "ph": "f", "id": 222012, "pid": 76337, "tid": -914061504, "ts": 1716454224939069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224939069, "dur": 0, "args": { "External id": 222013, "cbid": 203, "correlation": 222013 } }, { "ph": "f", "id": 222013, "pid": 76337, "tid": -914061504, "ts": 1716454224939069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224939070, "dur": 0, "args": { "External id": 222014, "cbid": 205, "correlation": 222014 } }, { "ph": "f", "id": 222014, "pid": 76337, "tid": -914061504, "ts": 1716454224939070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224991347, "dur": 6, "args": { "External id": 222018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222018, "pid": 5, "tid": 7, "ts": 1716454224991347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939085, "dur": 11, "args": { "External id": 222018, "cbid": 211, "correlation": 222018 } }, { "ph": "s", "id": 222018, "pid": 76337, "tid": -914061504, "ts": 1716454224939085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224991354, "dur": 81, "args": { "External id": 222020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222020, "pid": 5, "tid": 7, "ts": 1716454224991354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939100, "dur": 5, "args": { "External id": 222020, "cbid": 211, "correlation": 222020 } }, { "ph": "s", "id": 222020, "pid": 76337, "tid": -914061504, "ts": 1716454224939100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224991437, "dur": 1, "args": { "External id": 222022, "device": 5, "context": 1, "stream": 7, "correlation": 222022, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 222022, "pid": 5, "tid": 7, "ts": 1716454224991437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224939113, "dur": 9, "args": { "External id": 222022, "cbid": 51, "correlation": 222022 } }, { "ph": "s", "id": 222022, "pid": 76337, "tid": -914061504, "ts": 1716454224939113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224991441, "dur": 537, "args": { "External id": 222023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222023, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222023, "pid": 5, "tid": 7, "ts": 1716454224991441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939123, "dur": 8, "args": { "External id": 222023, "cbid": 211, "correlation": 222023 } }, { "ph": "s", "id": 222023, "pid": 76337, "tid": -914061504, "ts": 1716454224939123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224991979, "dur": 12, "args": { "External id": 222025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222025, "pid": 5, "tid": 7, "ts": 1716454224991979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939135, "dur": 6, "args": { "External id": 222025, "cbid": 211, "correlation": 222025 } }, { "ph": "s", "id": 222025, "pid": 76337, "tid": -914061504, "ts": 1716454224939135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224991992, "dur": 14, "args": { "External id": 222031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222031, "pid": 5, "tid": 7, "ts": 1716454224991992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939164, "dur": 8, "args": { "External id": 222031, "cbid": 211, "correlation": 222031 } }, { "ph": "s", "id": 222031, "pid": 76337, "tid": -914061504, "ts": 1716454224939164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224992008, "dur": 4, "args": { "External id": 222039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222039, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 222039, "pid": 5, "tid": 7, "ts": 1716454224992008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939207, "dur": 9, "args": { "External id": 222039, "cbid": 211, "correlation": 222039 } }, { "ph": "s", "id": 222039, "pid": 76337, "tid": -914061504, "ts": 1716454224939207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224939271, "dur": 1, "args": { "External id": 222055, "cbid": 251, "correlation": 222055 } }, { "ph": "f", "id": 222055, "pid": 76337, "tid": -914061504, "ts": 1716454224939271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224939276, "dur": 0, "args": { "External id": 222057, "cbid": 251, "correlation": 222057 } }, { "ph": "f", "id": 222057, "pid": 76337, "tid": -914061504, "ts": 1716454224939276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224992013, "dur": 14, "args": { "External id": 222058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222058, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222058, "pid": 5, "tid": 7, "ts": 1716454224992013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939278, "dur": 11, "args": { "External id": 222058, "cbid": 211, "correlation": 222058 } }, { "ph": "s", "id": 222058, "pid": 76337, "tid": -914061504, "ts": 1716454224939278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224992028, "dur": 5, "args": { "External id": 222060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222060, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222060, "pid": 5, "tid": 7, "ts": 1716454224992028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939291, "dur": 6, "args": { "External id": 222060, "cbid": 211, "correlation": 222060 } }, { "ph": "s", "id": 222060, "pid": 76337, "tid": -914061504, "ts": 1716454224939291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224992034, "dur": 17, "args": { "External id": 222070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222070, "pid": 5, "tid": 7, "ts": 1716454224992034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939349, "dur": 12, "args": { "External id": 222070, "cbid": 211, "correlation": 222070 } }, { "ph": "s", "id": 222070, "pid": 76337, "tid": -914061504, "ts": 1716454224939349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224992052, "dur": 17, "args": { "External id": 222090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222090, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 222090, "pid": 5, "tid": 7, "ts": 1716454224992052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939414, "dur": 10, "args": { "External id": 222090, "cbid": 211, "correlation": 222090 } }, { "ph": "s", "id": 222090, "pid": 76337, "tid": -914061504, "ts": 1716454224939414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224992071, "dur": 4, "args": { "External id": 222102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222102, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 222102, "pid": 5, "tid": 7, "ts": 1716454224992071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939441, "dur": 7, "args": { "External id": 222102, "cbid": 211, "correlation": 222102 } }, { "ph": "s", "id": 222102, "pid": 76337, "tid": -914061504, "ts": 1716454224939441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224992077, "dur": 16, "args": { "External id": 222105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222105, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222105, "pid": 5, "tid": 7, "ts": 1716454224992077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939461, "dur": 7, "args": { "External id": 222105, "cbid": 211, "correlation": 222105 } }, { "ph": "s", "id": 222105, "pid": 76337, "tid": -914061504, "ts": 1716454224939461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224992094, "dur": 11, "args": { "External id": 222114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222114, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222114, "pid": 5, "tid": 7, "ts": 1716454224992094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939501, "dur": 9, "args": { "External id": 222114, "cbid": 211, "correlation": 222114 } }, { "ph": "s", "id": 222114, "pid": 76337, "tid": -914061504, "ts": 1716454224939501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224939562, "dur": 0, "args": { "External id": 222124, "cbid": 317, "correlation": 222124 } }, { "ph": "f", "id": 222124, "pid": 76337, "tid": -914061504, "ts": 1716454224939562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224939563, "dur": 0, "args": { "External id": 222125, "cbid": 203, "correlation": 222125 } }, { "ph": "f", "id": 222125, "pid": 76337, "tid": -914061504, "ts": 1716454224939563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224939564, "dur": 0, "args": { "External id": 222126, "cbid": 205, "correlation": 222126 } }, { "ph": "f", "id": 222126, "pid": 76337, "tid": -914061504, "ts": 1716454224939564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224992106, "dur": 11, "args": { "External id": 222130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222130, "pid": 5, "tid": 7, "ts": 1716454224992106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939580, "dur": 12, "args": { "External id": 222130, "cbid": 211, "correlation": 222130 } }, { "ph": "s", "id": 222130, "pid": 76337, "tid": -914061504, "ts": 1716454224939580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224992119, "dur": 159, "args": { "External id": 222132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222132, "pid": 5, "tid": 7, "ts": 1716454224992119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939595, "dur": 6, "args": { "External id": 222132, "cbid": 211, "correlation": 222132 } }, { "ph": "s", "id": 222132, "pid": 76337, "tid": -914061504, "ts": 1716454224939595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224992280, "dur": 1, "args": { "External id": 222134, "device": 5, "context": 1, "stream": 7, "correlation": 222134, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 222134, "pid": 5, "tid": 7, "ts": 1716454224992280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224939607, "dur": 6, "args": { "External id": 222134, "cbid": 51, "correlation": 222134 } }, { "ph": "s", "id": 222134, "pid": 76337, "tid": -914061504, "ts": 1716454224939607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224992284, "dur": 656, "args": { "External id": 222135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222135, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222135, "pid": 5, "tid": 7, "ts": 1716454224992284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939615, "dur": 6, "args": { "External id": 222135, "cbid": 211, "correlation": 222135 } }, { "ph": "s", "id": 222135, "pid": 76337, "tid": -914061504, "ts": 1716454224939615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224992941, "dur": 13, "args": { "External id": 222137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222137, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222137, "pid": 5, "tid": 7, "ts": 1716454224992941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939625, "dur": 5, "args": { "External id": 222137, "cbid": 211, "correlation": 222137 } }, { "ph": "s", "id": 222137, "pid": 76337, "tid": -914061504, "ts": 1716454224939625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224992955, "dur": 14, "args": { "External id": 222143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222143, "pid": 5, "tid": 7, "ts": 1716454224992955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939654, "dur": 8, "args": { "External id": 222143, "cbid": 211, "correlation": 222143 } }, { "ph": "s", "id": 222143, "pid": 76337, "tid": -914061504, "ts": 1716454224939654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224939711, "dur": 0, "args": { "External id": 222153, "cbid": 317, "correlation": 222153 } }, { "ph": "f", "id": 222153, "pid": 76337, "tid": -914061504, "ts": 1716454224939711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224939712, "dur": 0, "args": { "External id": 222154, "cbid": 203, "correlation": 222154 } }, { "ph": "f", "id": 222154, "pid": 76337, "tid": -914061504, "ts": 1716454224939712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224939713, "dur": 0, "args": { "External id": 222155, "cbid": 205, "correlation": 222155 } }, { "ph": "f", "id": 222155, "pid": 76337, "tid": -914061504, "ts": 1716454224939713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224992971, "dur": 8, "args": { "External id": 222159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222159, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222159, "pid": 5, "tid": 7, "ts": 1716454224992971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939727, "dur": 11, "args": { "External id": 222159, "cbid": 211, "correlation": 222159 } }, { "ph": "s", "id": 222159, "pid": 76337, "tid": -914061504, "ts": 1716454224939727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224992980, "dur": 3, "args": { "External id": 222161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222161, "pid": 5, "tid": 7, "ts": 1716454224992980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939744, "dur": 7, "args": { "External id": 222161, "cbid": 211, "correlation": 222161 } }, { "ph": "s", "id": 222161, "pid": 76337, "tid": -914061504, "ts": 1716454224939744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224939755, "dur": 0, "args": { "External id": 222162, "cbid": 51, "correlation": 222162 } }, { "ph": "s", "id": 222162, "pid": 76337, "tid": -914061504, "ts": 1716454224939755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224992985, "dur": 56, "args": { "External id": 222163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222163, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 222163, "pid": 5, "tid": 7, "ts": 1716454224992985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939756, "dur": 5, "args": { "External id": 222163, "cbid": 211, "correlation": 222163 } }, { "ph": "s", "id": 222163, "pid": 76337, "tid": -914061504, "ts": 1716454224939756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224993042, "dur": 14, "args": { "External id": 222168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222168, "pid": 5, "tid": 7, "ts": 1716454224993042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939781, "dur": 8, "args": { "External id": 222168, "cbid": 211, "correlation": 222168 } }, { "ph": "s", "id": 222168, "pid": 76337, "tid": -914061504, "ts": 1716454224939781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224993057, "dur": 12, "args": { "External id": 222176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222176, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222176, "pid": 5, "tid": 7, "ts": 1716454224993057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939810, "dur": 8, "args": { "External id": 222176, "cbid": 211, "correlation": 222176 } }, { "ph": "s", "id": 222176, "pid": 76337, "tid": -914061504, "ts": 1716454224939810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224993070, "dur": 10, "args": { "External id": 222184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222184, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222184, "pid": 5, "tid": 7, "ts": 1716454224993070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939838, "dur": 9, "args": { "External id": 222184, "cbid": 211, "correlation": 222184 } }, { "ph": "s", "id": 222184, "pid": 76337, "tid": -914061504, "ts": 1716454224939838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224993082, "dur": 18, "args": { "External id": 222204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222204, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 222204, "pid": 5, "tid": 7, "ts": 1716454224993082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939919, "dur": 12, "args": { "External id": 222204, "cbid": 211, "correlation": 222204 } }, { "ph": "s", "id": 222204, "pid": 76337, "tid": -914061504, "ts": 1716454224939919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224993101, "dur": 4, "args": { "External id": 222216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222216, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 222216, "pid": 5, "tid": 7, "ts": 1716454224993101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939940, "dur": 6, "args": { "External id": 222216, "cbid": 211, "correlation": 222216 } }, { "ph": "s", "id": 222216, "pid": 76337, "tid": -914061504, "ts": 1716454224939940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224993106, "dur": 17, "args": { "External id": 222219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222219, "pid": 5, "tid": 7, "ts": 1716454224993106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224939958, "dur": 7, "args": { "External id": 222219, "cbid": 211, "correlation": 222219 } }, { "ph": "s", "id": 222219, "pid": 76337, "tid": -914061504, "ts": 1716454224939958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224940024, "dur": 0, "args": { "External id": 222230, "cbid": 317, "correlation": 222230 } }, { "ph": "f", "id": 222230, "pid": 76337, "tid": -914061504, "ts": 1716454224940024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224940024, "dur": 0, "args": { "External id": 222231, "cbid": 203, "correlation": 222231 } }, { "ph": "f", "id": 222231, "pid": 76337, "tid": -914061504, "ts": 1716454224940024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224940025, "dur": 0, "args": { "External id": 222232, "cbid": 205, "correlation": 222232 } }, { "ph": "f", "id": 222232, "pid": 76337, "tid": -914061504, "ts": 1716454224940025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224993124, "dur": 12, "args": { "External id": 222236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222236, "pid": 5, "tid": 7, "ts": 1716454224993124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940040, "dur": 14, "args": { "External id": 222236, "cbid": 211, "correlation": 222236 } }, { "ph": "s", "id": 222236, "pid": 76337, "tid": -914061504, "ts": 1716454224940040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224993138, "dur": 3, "args": { "External id": 222238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222238, "pid": 5, "tid": 7, "ts": 1716454224993138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940059, "dur": 6, "args": { "External id": 222238, "cbid": 211, "correlation": 222238 } }, { "ph": "s", "id": 222238, "pid": 76337, "tid": -914061504, "ts": 1716454224940059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224940067, "dur": 0, "args": { "External id": 222239, "cbid": 51, "correlation": 222239 } }, { "ph": "s", "id": 222239, "pid": 76337, "tid": -914061504, "ts": 1716454224940067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224993142, "dur": 95, "args": { "External id": 222240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222240, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 222240, "pid": 5, "tid": 7, "ts": 1716454224993142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940068, "dur": 5, "args": { "External id": 222240, "cbid": 211, "correlation": 222240 } }, { "ph": "s", "id": 222240, "pid": 76337, "tid": -914061504, "ts": 1716454224940068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224993238, "dur": 15, "args": { "External id": 222245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222245, "pid": 5, "tid": 7, "ts": 1716454224993238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940096, "dur": 9, "args": { "External id": 222245, "cbid": 211, "correlation": 222245 } }, { "ph": "s", "id": 222245, "pid": 76337, "tid": -914061504, "ts": 1716454224940096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224993255, "dur": 81, "args": { "External id": 222254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222254, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222254, "pid": 5, "tid": 7, "ts": 1716454224993255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940178, "dur": 14, "args": { "External id": 222254, "cbid": 211, "correlation": 222254 } }, { "ph": "s", "id": 222254, "pid": 76337, "tid": -914061504, "ts": 1716454224940178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224993337, "dur": 30, "args": { "External id": 222276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222276, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222276, "pid": 5, "tid": 7, "ts": 1716454224993337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940234, "dur": 10, "args": { "External id": 222276, "cbid": 211, "correlation": 222276 } }, { "ph": "s", "id": 222276, "pid": 76337, "tid": -914061504, "ts": 1716454224940234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224940325, "dur": 1, "args": { "External id": 222287, "cbid": 251, "correlation": 222287 } }, { "ph": "f", "id": 222287, "pid": 76337, "tid": -914061504, "ts": 1716454224940325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224993368, "dur": 161, "args": { "External id": 222288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222288, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222288, "pid": 5, "tid": 7, "ts": 1716454224993368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940330, "dur": 13, "args": { "External id": 222288, "cbid": 211, "correlation": 222288 } }, { "ph": "s", "id": 222288, "pid": 76337, "tid": -914061504, "ts": 1716454224940330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224940401, "dur": 1, "args": { "External id": 222299, "cbid": 251, "correlation": 222299 } }, { "ph": "f", "id": 222299, "pid": 76337, "tid": -914061504, "ts": 1716454224940401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224993531, "dur": 155, "args": { "External id": 222300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222300, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222300, "pid": 5, "tid": 7, "ts": 1716454224993531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940405, "dur": 11, "args": { "External id": 222300, "cbid": 211, "correlation": 222300 } }, { "ph": "s", "id": 222300, "pid": 76337, "tid": -914061504, "ts": 1716454224940405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224940469, "dur": 1, "args": { "External id": 222311, "cbid": 251, "correlation": 222311 } }, { "ph": "f", "id": 222311, "pid": 76337, "tid": -914061504, "ts": 1716454224940469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224993687, "dur": 156, "args": { "External id": 222312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222312, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222312, "pid": 5, "tid": 7, "ts": 1716454224993687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940473, "dur": 11, "args": { "External id": 222312, "cbid": 211, "correlation": 222312 } }, { "ph": "s", "id": 222312, "pid": 76337, "tid": -914061504, "ts": 1716454224940473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224993844, "dur": 333, "args": { "External id": 222337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222337, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222337, "pid": 5, "tid": 7, "ts": 1716454224993844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940560, "dur": 13, "args": { "External id": 222337, "cbid": 211, "correlation": 222337 } }, { "ph": "s", "id": 222337, "pid": 76337, "tid": -914061504, "ts": 1716454224940560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224940661, "dur": 1, "args": { "External id": 222355, "cbid": 251, "correlation": 222355 } }, { "ph": "f", "id": 222355, "pid": 76337, "tid": -914061504, "ts": 1716454224940661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224994177, "dur": 166, "args": { "External id": 222357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222357, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222357, "pid": 5, "tid": 7, "ts": 1716454224994177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940667, "dur": 13, "args": { "External id": 222357, "cbid": 211, "correlation": 222357 } }, { "ph": "s", "id": 222357, "pid": 76337, "tid": -914061504, "ts": 1716454224940667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224994345, "dur": 19, "args": { "External id": 222365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222365, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222365, "pid": 5, "tid": 7, "ts": 1716454224994345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940736, "dur": 12, "args": { "External id": 222365, "cbid": 211, "correlation": 222365 } }, { "ph": "s", "id": 222365, "pid": 76337, "tid": -914061504, "ts": 1716454224940736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224994365, "dur": 27, "args": { "External id": 222373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222373, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222373, "pid": 5, "tid": 7, "ts": 1716454224994365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940775, "dur": 8, "args": { "External id": 222373, "cbid": 211, "correlation": 222373 } }, { "ph": "s", "id": 222373, "pid": 76337, "tid": -914061504, "ts": 1716454224940775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224994394, "dur": 17, "args": { "External id": 222384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222384, "pid": 5, "tid": 7, "ts": 1716454224994394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940846, "dur": 12, "args": { "External id": 222384, "cbid": 211, "correlation": 222384 } }, { "ph": "s", "id": 222384, "pid": 76337, "tid": -914061504, "ts": 1716454224940846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224994413, "dur": 16, "args": { "External id": 222406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222406, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222406, "pid": 5, "tid": 7, "ts": 1716454224994413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940876, "dur": 7, "args": { "External id": 222406, "cbid": 211, "correlation": 222406 } }, { "ph": "s", "id": 222406, "pid": 76337, "tid": -914061504, "ts": 1716454224940876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224940959, "dur": 1, "args": { "External id": 222417, "cbid": 251, "correlation": 222417 } }, { "ph": "f", "id": 222417, "pid": 76337, "tid": -914061504, "ts": 1716454224940959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224994430, "dur": 89, "args": { "External id": 222418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222418, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222418, "pid": 5, "tid": 7, "ts": 1716454224994430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224940966, "dur": 22, "args": { "External id": 222418, "cbid": 211, "correlation": 222418 } }, { "ph": "s", "id": 222418, "pid": 76337, "tid": -914061504, "ts": 1716454224940966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941047, "dur": 1, "args": { "External id": 222429, "cbid": 251, "correlation": 222429 } }, { "ph": "f", "id": 222429, "pid": 76337, "tid": -914061504, "ts": 1716454224941047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941051, "dur": 0, "args": { "External id": 222430, "cbid": 251, "correlation": 222430 } }, { "ph": "f", "id": 222430, "pid": 76337, "tid": -914061504, "ts": 1716454224941051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224994520, "dur": 12, "args": { "External id": 222431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222431, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222431, "pid": 5, "tid": 7, "ts": 1716454224994520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941052, "dur": 13, "args": { "External id": 222431, "cbid": 211, "correlation": 222431 } }, { "ph": "s", "id": 222431, "pid": 76337, "tid": -914061504, "ts": 1716454224941052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224994533, "dur": 5, "args": { "External id": 222433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222433, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222433, "pid": 5, "tid": 7, "ts": 1716454224994533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941067, "dur": 5, "args": { "External id": 222433, "cbid": 211, "correlation": 222433 } }, { "ph": "s", "id": 222433, "pid": 76337, "tid": -914061504, "ts": 1716454224941067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941124, "dur": 1, "args": { "External id": 222444, "cbid": 251, "correlation": 222444 } }, { "ph": "f", "id": 222444, "pid": 76337, "tid": -914061504, "ts": 1716454224941124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941128, "dur": 0, "args": { "External id": 222445, "cbid": 251, "correlation": 222445 } }, { "ph": "f", "id": 222445, "pid": 76337, "tid": -914061504, "ts": 1716454224941128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224994539, "dur": 8, "args": { "External id": 222446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222446, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222446, "pid": 5, "tid": 7, "ts": 1716454224994539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941129, "dur": 12, "args": { "External id": 222446, "cbid": 211, "correlation": 222446 } }, { "ph": "s", "id": 222446, "pid": 76337, "tid": -914061504, "ts": 1716454224941129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224994549, "dur": 3, "args": { "External id": 222448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222448, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222448, "pid": 5, "tid": 7, "ts": 1716454224994549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941143, "dur": 6, "args": { "External id": 222448, "cbid": 211, "correlation": 222448 } }, { "ph": "s", "id": 222448, "pid": 76337, "tid": -914061504, "ts": 1716454224941143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224994553, "dur": 53, "args": { "External id": 222473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222473, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222473, "pid": 5, "tid": 7, "ts": 1716454224994553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941220, "dur": 12, "args": { "External id": 222473, "cbid": 211, "correlation": 222473 } }, { "ph": "s", "id": 222473, "pid": 76337, "tid": -914061504, "ts": 1716454224941220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941320, "dur": 2, "args": { "External id": 222491, "cbid": 251, "correlation": 222491 } }, { "ph": "f", "id": 222491, "pid": 76337, "tid": -914061504, "ts": 1716454224941320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224994607, "dur": 89, "args": { "External id": 222493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222493, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222493, "pid": 5, "tid": 7, "ts": 1716454224994607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941327, "dur": 14, "args": { "External id": 222493, "cbid": 211, "correlation": 222493 } }, { "ph": "s", "id": 222493, "pid": 76337, "tid": -914061504, "ts": 1716454224941327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224994698, "dur": 10, "args": { "External id": 222501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222501, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222501, "pid": 5, "tid": 7, "ts": 1716454224994698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941398, "dur": 12, "args": { "External id": 222501, "cbid": 211, "correlation": 222501 } }, { "ph": "s", "id": 222501, "pid": 76337, "tid": -914061504, "ts": 1716454224941398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224994709, "dur": 20, "args": { "External id": 222509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222509, "pid": 5, "tid": 7, "ts": 1716454224994709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941439, "dur": 10, "args": { "External id": 222509, "cbid": 211, "correlation": 222509 } }, { "ph": "s", "id": 222509, "pid": 76337, "tid": -914061504, "ts": 1716454224941439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224994730, "dur": 17, "args": { "External id": 222531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222531, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222531, "pid": 5, "tid": 7, "ts": 1716454224994730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941490, "dur": 10, "args": { "External id": 222531, "cbid": 211, "correlation": 222531 } }, { "ph": "s", "id": 222531, "pid": 76337, "tid": -914061504, "ts": 1716454224941490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941580, "dur": 2, "args": { "External id": 222547, "cbid": 251, "correlation": 222547 } }, { "ph": "f", "id": 222547, "pid": 76337, "tid": -914061504, "ts": 1716454224941580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941586, "dur": 0, "args": { "External id": 222549, "cbid": 251, "correlation": 222549 } }, { "ph": "f", "id": 222549, "pid": 76337, "tid": -914061504, "ts": 1716454224941586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224994749, "dur": 490, "args": { "External id": 222550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222550, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222550, "pid": 5, "tid": 7, "ts": 1716454224994749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941589, "dur": 15, "args": { "External id": 222550, "cbid": 211, "correlation": 222550 } }, { "ph": "s", "id": 222550, "pid": 76337, "tid": -914061504, "ts": 1716454224941589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224995240, "dur": 65, "args": { "External id": 222558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222558, "pid": 5, "tid": 7, "ts": 1716454224995240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941655, "dur": 12, "args": { "External id": 222558, "cbid": 211, "correlation": 222558 } }, { "ph": "s", "id": 222558, "pid": 76337, "tid": -914061504, "ts": 1716454224941655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224995306, "dur": 68, "args": { "External id": 222566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222566, "pid": 5, "tid": 7, "ts": 1716454224995306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941685, "dur": 9, "args": { "External id": 222566, "cbid": 211, "correlation": 222566 } }, { "ph": "s", "id": 222566, "pid": 76337, "tid": -914061504, "ts": 1716454224941685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224941765, "dur": 1, "args": { "External id": 222582, "cbid": 251, "correlation": 222582 } }, { "ph": "f", "id": 222582, "pid": 76337, "tid": -914061504, "ts": 1716454224941765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224995376, "dur": 1, "args": { "External id": 222584, "device": 5, "context": 1, "stream": 7, "correlation": 222584, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 222584, "pid": 5, "tid": 7, "ts": 1716454224995376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224941771, "dur": 12, "args": { "External id": 222584, "cbid": 51, "correlation": 222584 } }, { "ph": "s", "id": 222584, "pid": 76337, "tid": -914061504, "ts": 1716454224941771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224995380, "dur": 268, "args": { "External id": 222585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222585, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222585, "pid": 5, "tid": 7, "ts": 1716454224995380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941784, "dur": 12, "args": { "External id": 222585, "cbid": 211, "correlation": 222585 } }, { "ph": "s", "id": 222585, "pid": 76337, "tid": -914061504, "ts": 1716454224941784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224995649, "dur": 15, "args": { "External id": 222593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222593, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222593, "pid": 5, "tid": 7, "ts": 1716454224995649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941826, "dur": 10, "args": { "External id": 222593, "cbid": 211, "correlation": 222593 } }, { "ph": "s", "id": 222593, "pid": 76337, "tid": -914061504, "ts": 1716454224941826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224995665, "dur": 37, "args": { "External id": 222604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222604, "pid": 5, "tid": 7, "ts": 1716454224995665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941893, "dur": 12, "args": { "External id": 222604, "cbid": 211, "correlation": 222604 } }, { "ph": "s", "id": 222604, "pid": 76337, "tid": -914061504, "ts": 1716454224941893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224941958, "dur": 0, "args": { "External id": 222616, "cbid": 317, "correlation": 222616 } }, { "ph": "f", "id": 222616, "pid": 76337, "tid": -914061504, "ts": 1716454224941958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224941959, "dur": 0, "args": { "External id": 222617, "cbid": 203, "correlation": 222617 } }, { "ph": "f", "id": 222617, "pid": 76337, "tid": -914061504, "ts": 1716454224941959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224941960, "dur": 0, "args": { "External id": 222618, "cbid": 205, "correlation": 222618 } }, { "ph": "f", "id": 222618, "pid": 76337, "tid": -914061504, "ts": 1716454224941960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224995703, "dur": 14, "args": { "External id": 222622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222622, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222622, "pid": 5, "tid": 7, "ts": 1716454224995703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224941982, "dur": 13, "args": { "External id": 222622, "cbid": 211, "correlation": 222622 } }, { "ph": "s", "id": 222622, "pid": 76337, "tid": -914061504, "ts": 1716454224941982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224995718, "dur": 4, "args": { "External id": 222624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222624, "pid": 5, "tid": 7, "ts": 1716454224995718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942000, "dur": 6, "args": { "External id": 222624, "cbid": 211, "correlation": 222624 } }, { "ph": "s", "id": 222624, "pid": 76337, "tid": -914061504, "ts": 1716454224942000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224942009, "dur": 0, "args": { "External id": 222625, "cbid": 51, "correlation": 222625 } }, { "ph": "s", "id": 222625, "pid": 76337, "tid": -914061504, "ts": 1716454224942009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224995723, "dur": 96, "args": { "External id": 222626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222626, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 222626, "pid": 5, "tid": 7, "ts": 1716454224995723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942010, "dur": 5, "args": { "External id": 222626, "cbid": 211, "correlation": 222626 } }, { "ph": "s", "id": 222626, "pid": 76337, "tid": -914061504, "ts": 1716454224942010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224995820, "dur": 16, "args": { "External id": 222631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222631, "pid": 5, "tid": 7, "ts": 1716454224995820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942037, "dur": 9, "args": { "External id": 222631, "cbid": 211, "correlation": 222631 } }, { "ph": "s", "id": 222631, "pid": 76337, "tid": -914061504, "ts": 1716454224942037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224995837, "dur": 11, "args": { "External id": 222639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222639, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222639, "pid": 5, "tid": 7, "ts": 1716454224995837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942070, "dur": 8, "args": { "External id": 222639, "cbid": 211, "correlation": 222639 } }, { "ph": "s", "id": 222639, "pid": 76337, "tid": -914061504, "ts": 1716454224942070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224995849, "dur": 18, "args": { "External id": 222659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222659, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 222659, "pid": 5, "tid": 7, "ts": 1716454224995849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942142, "dur": 12, "args": { "External id": 222659, "cbid": 211, "correlation": 222659 } }, { "ph": "s", "id": 222659, "pid": 76337, "tid": -914061504, "ts": 1716454224942142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224995869, "dur": 5, "args": { "External id": 222671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222671, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 222671, "pid": 5, "tid": 7, "ts": 1716454224995869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942164, "dur": 6, "args": { "External id": 222671, "cbid": 211, "correlation": 222671 } }, { "ph": "s", "id": 222671, "pid": 76337, "tid": -914061504, "ts": 1716454224942164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224995875, "dur": 18, "args": { "External id": 222674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222674, "pid": 5, "tid": 7, "ts": 1716454224995875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942181, "dur": 6, "args": { "External id": 222674, "cbid": 211, "correlation": 222674 } }, { "ph": "s", "id": 222674, "pid": 76337, "tid": -914061504, "ts": 1716454224942181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224995894, "dur": 11, "args": { "External id": 222683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222683, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222683, "pid": 5, "tid": 7, "ts": 1716454224995894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942220, "dur": 9, "args": { "External id": 222683, "cbid": 211, "correlation": 222683 } }, { "ph": "s", "id": 222683, "pid": 76337, "tid": -914061504, "ts": 1716454224942220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224942270, "dur": 0, "args": { "External id": 222693, "cbid": 317, "correlation": 222693 } }, { "ph": "f", "id": 222693, "pid": 76337, "tid": -914061504, "ts": 1716454224942270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224942271, "dur": 0, "args": { "External id": 222694, "cbid": 203, "correlation": 222694 } }, { "ph": "f", "id": 222694, "pid": 76337, "tid": -914061504, "ts": 1716454224942271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224942271, "dur": 0, "args": { "External id": 222695, "cbid": 205, "correlation": 222695 } }, { "ph": "f", "id": 222695, "pid": 76337, "tid": -914061504, "ts": 1716454224942271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224995907, "dur": 11, "args": { "External id": 222699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222699, "pid": 5, "tid": 7, "ts": 1716454224995907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942284, "dur": 13, "args": { "External id": 222699, "cbid": 211, "correlation": 222699 } }, { "ph": "s", "id": 222699, "pid": 76337, "tid": -914061504, "ts": 1716454224942284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224995919, "dur": 160, "args": { "External id": 222701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222701, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222701, "pid": 5, "tid": 7, "ts": 1716454224995919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942300, "dur": 5, "args": { "External id": 222701, "cbid": 211, "correlation": 222701 } }, { "ph": "s", "id": 222701, "pid": 76337, "tid": -914061504, "ts": 1716454224942300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224996081, "dur": 1, "args": { "External id": 222703, "device": 5, "context": 1, "stream": 7, "correlation": 222703, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 222703, "pid": 5, "tid": 7, "ts": 1716454224996081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224942310, "dur": 6, "args": { "External id": 222703, "cbid": 51, "correlation": 222703 } }, { "ph": "s", "id": 222703, "pid": 76337, "tid": -914061504, "ts": 1716454224942310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224996085, "dur": 655, "args": { "External id": 222704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222704, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222704, "pid": 5, "tid": 7, "ts": 1716454224996085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942318, "dur": 6, "args": { "External id": 222704, "cbid": 211, "correlation": 222704 } }, { "ph": "s", "id": 222704, "pid": 76337, "tid": -914061504, "ts": 1716454224942318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224996742, "dur": 13, "args": { "External id": 222706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222706, "pid": 5, "tid": 7, "ts": 1716454224996742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942328, "dur": 5, "args": { "External id": 222706, "cbid": 211, "correlation": 222706 } }, { "ph": "s", "id": 222706, "pid": 76337, "tid": -914061504, "ts": 1716454224942328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224996756, "dur": 15, "args": { "External id": 222712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222712, "pid": 5, "tid": 7, "ts": 1716454224996756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942356, "dur": 8, "args": { "External id": 222712, "cbid": 211, "correlation": 222712 } }, { "ph": "s", "id": 222712, "pid": 76337, "tid": -914061504, "ts": 1716454224942356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224996773, "dur": 3, "args": { "External id": 222720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222720, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 222720, "pid": 5, "tid": 7, "ts": 1716454224996773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942399, "dur": 9, "args": { "External id": 222720, "cbid": 211, "correlation": 222720 } }, { "ph": "s", "id": 222720, "pid": 76337, "tid": -914061504, "ts": 1716454224942399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224942463, "dur": 1, "args": { "External id": 222736, "cbid": 251, "correlation": 222736 } }, { "ph": "f", "id": 222736, "pid": 76337, "tid": -914061504, "ts": 1716454224942463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224942468, "dur": 0, "args": { "External id": 222738, "cbid": 251, "correlation": 222738 } }, { "ph": "f", "id": 222738, "pid": 76337, "tid": -914061504, "ts": 1716454224942468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224996777, "dur": 13, "args": { "External id": 222739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222739, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222739, "pid": 5, "tid": 7, "ts": 1716454224996777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942470, "dur": 11, "args": { "External id": 222739, "cbid": 211, "correlation": 222739 } }, { "ph": "s", "id": 222739, "pid": 76337, "tid": -914061504, "ts": 1716454224942470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224996792, "dur": 5, "args": { "External id": 222741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222741, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222741, "pid": 5, "tid": 7, "ts": 1716454224996792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942483, "dur": 5, "args": { "External id": 222741, "cbid": 211, "correlation": 222741 } }, { "ph": "s", "id": 222741, "pid": 76337, "tid": -914061504, "ts": 1716454224942483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224996798, "dur": 17, "args": { "External id": 222751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222751, "pid": 5, "tid": 7, "ts": 1716454224996798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942541, "dur": 12, "args": { "External id": 222751, "cbid": 211, "correlation": 222751 } }, { "ph": "s", "id": 222751, "pid": 76337, "tid": -914061504, "ts": 1716454224942541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224996816, "dur": 18, "args": { "External id": 222771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222771, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 222771, "pid": 5, "tid": 7, "ts": 1716454224996816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942606, "dur": 10, "args": { "External id": 222771, "cbid": 211, "correlation": 222771 } }, { "ph": "s", "id": 222771, "pid": 76337, "tid": -914061504, "ts": 1716454224942606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224996835, "dur": 4, "args": { "External id": 222783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222783, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 222783, "pid": 5, "tid": 7, "ts": 1716454224996835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942626, "dur": 6, "args": { "External id": 222783, "cbid": 211, "correlation": 222783 } }, { "ph": "s", "id": 222783, "pid": 76337, "tid": -914061504, "ts": 1716454224942626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224996840, "dur": 16, "args": { "External id": 222786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222786, "pid": 5, "tid": 7, "ts": 1716454224996840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942645, "dur": 6, "args": { "External id": 222786, "cbid": 211, "correlation": 222786 } }, { "ph": "s", "id": 222786, "pid": 76337, "tid": -914061504, "ts": 1716454224942645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224996858, "dur": 11, "args": { "External id": 222795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222795, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222795, "pid": 5, "tid": 7, "ts": 1716454224996858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942684, "dur": 10, "args": { "External id": 222795, "cbid": 211, "correlation": 222795 } }, { "ph": "s", "id": 222795, "pid": 76337, "tid": -914061504, "ts": 1716454224942684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224942745, "dur": 0, "args": { "External id": 222805, "cbid": 317, "correlation": 222805 } }, { "ph": "f", "id": 222805, "pid": 76337, "tid": -914061504, "ts": 1716454224942745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224942746, "dur": 0, "args": { "External id": 222806, "cbid": 203, "correlation": 222806 } }, { "ph": "f", "id": 222806, "pid": 76337, "tid": -914061504, "ts": 1716454224942746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224942746, "dur": 0, "args": { "External id": 222807, "cbid": 205, "correlation": 222807 } }, { "ph": "f", "id": 222807, "pid": 76337, "tid": -914061504, "ts": 1716454224942746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224996870, "dur": 10, "args": { "External id": 222811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222811, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222811, "pid": 5, "tid": 7, "ts": 1716454224996870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942760, "dur": 11, "args": { "External id": 222811, "cbid": 211, "correlation": 222811 } }, { "ph": "s", "id": 222811, "pid": 76337, "tid": -914061504, "ts": 1716454224942760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224996881, "dur": 159, "args": { "External id": 222813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222813, "pid": 5, "tid": 7, "ts": 1716454224996881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942774, "dur": 5, "args": { "External id": 222813, "cbid": 211, "correlation": 222813 } }, { "ph": "s", "id": 222813, "pid": 76337, "tid": -914061504, "ts": 1716454224942774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224997043, "dur": 1, "args": { "External id": 222815, "device": 5, "context": 1, "stream": 7, "correlation": 222815, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 222815, "pid": 5, "tid": 7, "ts": 1716454224997043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224942785, "dur": 8, "args": { "External id": 222815, "cbid": 51, "correlation": 222815 } }, { "ph": "s", "id": 222815, "pid": 76337, "tid": -914061504, "ts": 1716454224942785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224997047, "dur": 641, "args": { "External id": 222816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222816, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222816, "pid": 5, "tid": 7, "ts": 1716454224997047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942795, "dur": 7, "args": { "External id": 222816, "cbid": 211, "correlation": 222816 } }, { "ph": "s", "id": 222816, "pid": 76337, "tid": -914061504, "ts": 1716454224942795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224997689, "dur": 12, "args": { "External id": 222818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222818, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222818, "pid": 5, "tid": 7, "ts": 1716454224997689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942805, "dur": 5, "args": { "External id": 222818, "cbid": 211, "correlation": 222818 } }, { "ph": "s", "id": 222818, "pid": 76337, "tid": -914061504, "ts": 1716454224942805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224997703, "dur": 14, "args": { "External id": 222824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222824, "pid": 5, "tid": 7, "ts": 1716454224997703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942833, "dur": 9, "args": { "External id": 222824, "cbid": 211, "correlation": 222824 } }, { "ph": "s", "id": 222824, "pid": 76337, "tid": -914061504, "ts": 1716454224942833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224997718, "dur": 12, "args": { "External id": 222832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222832, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222832, "pid": 5, "tid": 7, "ts": 1716454224997718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942865, "dur": 8, "args": { "External id": 222832, "cbid": 211, "correlation": 222832 } }, { "ph": "s", "id": 222832, "pid": 76337, "tid": -914061504, "ts": 1716454224942865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224997732, "dur": 10, "args": { "External id": 222840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222840, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222840, "pid": 5, "tid": 7, "ts": 1716454224997732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942895, "dur": 8, "args": { "External id": 222840, "cbid": 211, "correlation": 222840 } }, { "ph": "s", "id": 222840, "pid": 76337, "tid": -914061504, "ts": 1716454224942895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224997743, "dur": 17, "args": { "External id": 222860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222860, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 222860, "pid": 5, "tid": 7, "ts": 1716454224997743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224942972, "dur": 20, "args": { "External id": 222860, "cbid": 211, "correlation": 222860 } }, { "ph": "s", "id": 222860, "pid": 76337, "tid": -914061504, "ts": 1716454224942972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224997762, "dur": 4, "args": { "External id": 222872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222872, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 222872, "pid": 5, "tid": 7, "ts": 1716454224997762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943003, "dur": 6, "args": { "External id": 222872, "cbid": 211, "correlation": 222872 } }, { "ph": "s", "id": 222872, "pid": 76337, "tid": -914061504, "ts": 1716454224943003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224997767, "dur": 16, "args": { "External id": 222875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222875, "pid": 5, "tid": 7, "ts": 1716454224997767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943021, "dur": 6, "args": { "External id": 222875, "cbid": 211, "correlation": 222875 } }, { "ph": "s", "id": 222875, "pid": 76337, "tid": -914061504, "ts": 1716454224943021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224943079, "dur": 0, "args": { "External id": 222886, "cbid": 317, "correlation": 222886 } }, { "ph": "f", "id": 222886, "pid": 76337, "tid": -914061504, "ts": 1716454224943079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224943080, "dur": 0, "args": { "External id": 222887, "cbid": 203, "correlation": 222887 } }, { "ph": "f", "id": 222887, "pid": 76337, "tid": -914061504, "ts": 1716454224943080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224943081, "dur": 0, "args": { "External id": 222888, "cbid": 205, "correlation": 222888 } }, { "ph": "f", "id": 222888, "pid": 76337, "tid": -914061504, "ts": 1716454224943081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224997785, "dur": 11, "args": { "External id": 222892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222892, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222892, "pid": 5, "tid": 7, "ts": 1716454224997785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943095, "dur": 13, "args": { "External id": 222892, "cbid": 211, "correlation": 222892 } }, { "ph": "s", "id": 222892, "pid": 76337, "tid": -914061504, "ts": 1716454224943095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224997798, "dur": 4, "args": { "External id": 222894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 222894, "pid": 5, "tid": 7, "ts": 1716454224997798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943112, "dur": 6, "args": { "External id": 222894, "cbid": 211, "correlation": 222894 } }, { "ph": "s", "id": 222894, "pid": 76337, "tid": -914061504, "ts": 1716454224943112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224943120, "dur": 0, "args": { "External id": 222895, "cbid": 51, "correlation": 222895 } }, { "ph": "s", "id": 222895, "pid": 76337, "tid": -914061504, "ts": 1716454224943120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224997803, "dur": 94, "args": { "External id": 222896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222896, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 222896, "pid": 5, "tid": 7, "ts": 1716454224997803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943121, "dur": 5, "args": { "External id": 222896, "cbid": 211, "correlation": 222896 } }, { "ph": "s", "id": 222896, "pid": 76337, "tid": -914061504, "ts": 1716454224943121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224997898, "dur": 15, "args": { "External id": 222901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222901, "pid": 5, "tid": 7, "ts": 1716454224997898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943148, "dur": 8, "args": { "External id": 222901, "cbid": 211, "correlation": 222901 } }, { "ph": "s", "id": 222901, "pid": 76337, "tid": -914061504, "ts": 1716454224943148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224997914, "dur": 82, "args": { "External id": 222910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222910, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222910, "pid": 5, "tid": 7, "ts": 1716454224997914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943228, "dur": 14, "args": { "External id": 222910, "cbid": 211, "correlation": 222910 } }, { "ph": "s", "id": 222910, "pid": 76337, "tid": -914061504, "ts": 1716454224943228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224997997, "dur": 30, "args": { "External id": 222932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222932, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 222932, "pid": 5, "tid": 7, "ts": 1716454224997997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943283, "dur": 10, "args": { "External id": 222932, "cbid": 211, "correlation": 222932 } }, { "ph": "s", "id": 222932, "pid": 76337, "tid": -914061504, "ts": 1716454224943283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224943370, "dur": 1, "args": { "External id": 222943, "cbid": 251, "correlation": 222943 } }, { "ph": "f", "id": 222943, "pid": 76337, "tid": -914061504, "ts": 1716454224943370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224998029, "dur": 162, "args": { "External id": 222944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222944, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222944, "pid": 5, "tid": 7, "ts": 1716454224998029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943376, "dur": 13, "args": { "External id": 222944, "cbid": 211, "correlation": 222944 } }, { "ph": "s", "id": 222944, "pid": 76337, "tid": -914061504, "ts": 1716454224943376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224943446, "dur": 1, "args": { "External id": 222955, "cbid": 251, "correlation": 222955 } }, { "ph": "f", "id": 222955, "pid": 76337, "tid": -914061504, "ts": 1716454224943446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224998192, "dur": 155, "args": { "External id": 222956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222956, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222956, "pid": 5, "tid": 7, "ts": 1716454224998192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943450, "dur": 11, "args": { "External id": 222956, "cbid": 211, "correlation": 222956 } }, { "ph": "s", "id": 222956, "pid": 76337, "tid": -914061504, "ts": 1716454224943450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224943515, "dur": 1, "args": { "External id": 222967, "cbid": 251, "correlation": 222967 } }, { "ph": "f", "id": 222967, "pid": 76337, "tid": -914061504, "ts": 1716454224943515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224998349, "dur": 135, "args": { "External id": 222968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222968, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222968, "pid": 5, "tid": 7, "ts": 1716454224998349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943519, "dur": 11, "args": { "External id": 222968, "cbid": 211, "correlation": 222968 } }, { "ph": "s", "id": 222968, "pid": 76337, "tid": -914061504, "ts": 1716454224943519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224998485, "dur": 333, "args": { "External id": 222993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 222993, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 222993, "pid": 5, "tid": 7, "ts": 1716454224998485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943603, "dur": 12, "args": { "External id": 222993, "cbid": 211, "correlation": 222993 } }, { "ph": "s", "id": 222993, "pid": 76337, "tid": -914061504, "ts": 1716454224943603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224943700, "dur": 1, "args": { "External id": 223011, "cbid": 251, "correlation": 223011 } }, { "ph": "f", "id": 223011, "pid": 76337, "tid": -914061504, "ts": 1716454224943700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224998819, "dur": 164, "args": { "External id": 223013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223013, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223013, "pid": 5, "tid": 7, "ts": 1716454224998819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943707, "dur": 13, "args": { "External id": 223013, "cbid": 211, "correlation": 223013 } }, { "ph": "s", "id": 223013, "pid": 76337, "tid": -914061504, "ts": 1716454224943707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224998985, "dur": 19, "args": { "External id": 223021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223021, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223021, "pid": 5, "tid": 7, "ts": 1716454224998985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943775, "dur": 12, "args": { "External id": 223021, "cbid": 211, "correlation": 223021 } }, { "ph": "s", "id": 223021, "pid": 76337, "tid": -914061504, "ts": 1716454224943775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224999005, "dur": 28, "args": { "External id": 223029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223029, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223029, "pid": 5, "tid": 7, "ts": 1716454224999005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943814, "dur": 8, "args": { "External id": 223029, "cbid": 211, "correlation": 223029 } }, { "ph": "s", "id": 223029, "pid": 76337, "tid": -914061504, "ts": 1716454224943814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224999034, "dur": 18, "args": { "External id": 223040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223040, "pid": 5, "tid": 7, "ts": 1716454224999034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943884, "dur": 12, "args": { "External id": 223040, "cbid": 211, "correlation": 223040 } }, { "ph": "s", "id": 223040, "pid": 76337, "tid": -914061504, "ts": 1716454224943884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224999053, "dur": 16, "args": { "External id": 223062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223062, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223062, "pid": 5, "tid": 7, "ts": 1716454224999053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224943916, "dur": 7, "args": { "External id": 223062, "cbid": 211, "correlation": 223062 } }, { "ph": "s", "id": 223062, "pid": 76337, "tid": -914061504, "ts": 1716454224943916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944011, "dur": 1, "args": { "External id": 223073, "cbid": 251, "correlation": 223073 } }, { "ph": "f", "id": 223073, "pid": 76337, "tid": -914061504, "ts": 1716454224944011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224999070, "dur": 88, "args": { "External id": 223074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223074, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 223074, "pid": 5, "tid": 7, "ts": 1716454224999070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944017, "dur": 13, "args": { "External id": 223074, "cbid": 211, "correlation": 223074 } }, { "ph": "s", "id": 223074, "pid": 76337, "tid": -914061504, "ts": 1716454224944017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944085, "dur": 1, "args": { "External id": 223085, "cbid": 251, "correlation": 223085 } }, { "ph": "f", "id": 223085, "pid": 76337, "tid": -914061504, "ts": 1716454224944085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944088, "dur": 0, "args": { "External id": 223086, "cbid": 251, "correlation": 223086 } }, { "ph": "f", "id": 223086, "pid": 76337, "tid": -914061504, "ts": 1716454224944088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224999159, "dur": 12, "args": { "External id": 223087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223087, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223087, "pid": 5, "tid": 7, "ts": 1716454224999159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944090, "dur": 12, "args": { "External id": 223087, "cbid": 211, "correlation": 223087 } }, { "ph": "s", "id": 223087, "pid": 76337, "tid": -914061504, "ts": 1716454224944090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224999172, "dur": 5, "args": { "External id": 223089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223089, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223089, "pid": 5, "tid": 7, "ts": 1716454224999172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944104, "dur": 6, "args": { "External id": 223089, "cbid": 211, "correlation": 223089 } }, { "ph": "s", "id": 223089, "pid": 76337, "tid": -914061504, "ts": 1716454224944104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944163, "dur": 1, "args": { "External id": 223100, "cbid": 251, "correlation": 223100 } }, { "ph": "f", "id": 223100, "pid": 76337, "tid": -914061504, "ts": 1716454224944163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944166, "dur": 0, "args": { "External id": 223101, "cbid": 251, "correlation": 223101 } }, { "ph": "f", "id": 223101, "pid": 76337, "tid": -914061504, "ts": 1716454224944166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224999178, "dur": 8, "args": { "External id": 223102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223102, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223102, "pid": 5, "tid": 7, "ts": 1716454224999178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944168, "dur": 11, "args": { "External id": 223102, "cbid": 211, "correlation": 223102 } }, { "ph": "s", "id": 223102, "pid": 76337, "tid": -914061504, "ts": 1716454224944168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224999188, "dur": 3, "args": { "External id": 223104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223104, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223104, "pid": 5, "tid": 7, "ts": 1716454224999188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944180, "dur": 5, "args": { "External id": 223104, "cbid": 211, "correlation": 223104 } }, { "ph": "s", "id": 223104, "pid": 76337, "tid": -914061504, "ts": 1716454224944180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224999193, "dur": 55, "args": { "External id": 223129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223129, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223129, "pid": 5, "tid": 7, "ts": 1716454224999193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944255, "dur": 13, "args": { "External id": 223129, "cbid": 211, "correlation": 223129 } }, { "ph": "s", "id": 223129, "pid": 76337, "tid": -914061504, "ts": 1716454224944255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944352, "dur": 1, "args": { "External id": 223147, "cbid": 251, "correlation": 223147 } }, { "ph": "f", "id": 223147, "pid": 76337, "tid": -914061504, "ts": 1716454224944352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224999249, "dur": 89, "args": { "External id": 223149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223149, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 223149, "pid": 5, "tid": 7, "ts": 1716454224999249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944358, "dur": 15, "args": { "External id": 223149, "cbid": 211, "correlation": 223149 } }, { "ph": "s", "id": 223149, "pid": 76337, "tid": -914061504, "ts": 1716454224944358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224999340, "dur": 10, "args": { "External id": 223157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223157, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223157, "pid": 5, "tid": 7, "ts": 1716454224999340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944429, "dur": 12, "args": { "External id": 223157, "cbid": 211, "correlation": 223157 } }, { "ph": "s", "id": 223157, "pid": 76337, "tid": -914061504, "ts": 1716454224944429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224999351, "dur": 22, "args": { "External id": 223165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223165, "pid": 5, "tid": 7, "ts": 1716454224999351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944470, "dur": 9, "args": { "External id": 223165, "cbid": 211, "correlation": 223165 } }, { "ph": "s", "id": 223165, "pid": 76337, "tid": -914061504, "ts": 1716454224944470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224999374, "dur": 17, "args": { "External id": 223187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223187, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223187, "pid": 5, "tid": 7, "ts": 1716454224999374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944522, "dur": 10, "args": { "External id": 223187, "cbid": 211, "correlation": 223187 } }, { "ph": "s", "id": 223187, "pid": 76337, "tid": -914061504, "ts": 1716454224944522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944609, "dur": 1, "args": { "External id": 223203, "cbid": 251, "correlation": 223203 } }, { "ph": "f", "id": 223203, "pid": 76337, "tid": -914061504, "ts": 1716454224944609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944614, "dur": 0, "args": { "External id": 223205, "cbid": 251, "correlation": 223205 } }, { "ph": "f", "id": 223205, "pid": 76337, "tid": -914061504, "ts": 1716454224944614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224999393, "dur": 491, "args": { "External id": 223206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223206, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223206, "pid": 5, "tid": 7, "ts": 1716454224999393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944616, "dur": 12, "args": { "External id": 223206, "cbid": 211, "correlation": 223206 } }, { "ph": "s", "id": 223206, "pid": 76337, "tid": -914061504, "ts": 1716454224944616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224999886, "dur": 66, "args": { "External id": 223214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223214, "pid": 5, "tid": 7, "ts": 1716454224999886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944680, "dur": 12, "args": { "External id": 223214, "cbid": 211, "correlation": 223214 } }, { "ph": "s", "id": 223214, "pid": 76337, "tid": -914061504, "ts": 1716454224944680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224999953, "dur": 66, "args": { "External id": 223222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223222, "pid": 5, "tid": 7, "ts": 1716454224999953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944709, "dur": 9, "args": { "External id": 223222, "cbid": 211, "correlation": 223222 } }, { "ph": "s", "id": 223222, "pid": 76337, "tid": -914061504, "ts": 1716454224944709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224944789, "dur": 1, "args": { "External id": 223238, "cbid": 251, "correlation": 223238 } }, { "ph": "f", "id": 223238, "pid": 76337, "tid": -914061504, "ts": 1716454224944789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225000021, "dur": 1, "args": { "External id": 223240, "device": 5, "context": 1, "stream": 7, "correlation": 223240, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 223240, "pid": 5, "tid": 7, "ts": 1716454225000021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224944794, "dur": 9, "args": { "External id": 223240, "cbid": 51, "correlation": 223240 } }, { "ph": "s", "id": 223240, "pid": 76337, "tid": -914061504, "ts": 1716454224944794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225000024, "dur": 268, "args": { "External id": 223241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223241, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 223241, "pid": 5, "tid": 7, "ts": 1716454225000024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944805, "dur": 12, "args": { "External id": 223241, "cbid": 211, "correlation": 223241 } }, { "ph": "s", "id": 223241, "pid": 76337, "tid": -914061504, "ts": 1716454224944805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225000294, "dur": 13, "args": { "External id": 223249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223249, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223249, "pid": 5, "tid": 7, "ts": 1716454225000294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944847, "dur": 10, "args": { "External id": 223249, "cbid": 211, "correlation": 223249 } }, { "ph": "s", "id": 223249, "pid": 76337, "tid": -914061504, "ts": 1716454224944847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225000308, "dur": 37, "args": { "External id": 223260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223260, "pid": 5, "tid": 7, "ts": 1716454225000308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224944914, "dur": 12, "args": { "External id": 223260, "cbid": 211, "correlation": 223260 } }, { "ph": "s", "id": 223260, "pid": 76337, "tid": -914061504, "ts": 1716454224944914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224944986, "dur": 0, "args": { "External id": 223272, "cbid": 317, "correlation": 223272 } }, { "ph": "f", "id": 223272, "pid": 76337, "tid": -914061504, "ts": 1716454224944986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224944987, "dur": 0, "args": { "External id": 223273, "cbid": 203, "correlation": 223273 } }, { "ph": "f", "id": 223273, "pid": 76337, "tid": -914061504, "ts": 1716454224944987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224944988, "dur": 0, "args": { "External id": 223274, "cbid": 205, "correlation": 223274 } }, { "ph": "f", "id": 223274, "pid": 76337, "tid": -914061504, "ts": 1716454224944988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000347, "dur": 13, "args": { "External id": 223278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223278, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223278, "pid": 5, "tid": 7, "ts": 1716454225000347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945004, "dur": 13, "args": { "External id": 223278, "cbid": 211, "correlation": 223278 } }, { "ph": "s", "id": 223278, "pid": 76337, "tid": -914061504, "ts": 1716454224945004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225000361, "dur": 4, "args": { "External id": 223280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 223280, "pid": 5, "tid": 7, "ts": 1716454225000361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945022, "dur": 7, "args": { "External id": 223280, "cbid": 211, "correlation": 223280 } }, { "ph": "s", "id": 223280, "pid": 76337, "tid": -914061504, "ts": 1716454224945022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224945031, "dur": 0, "args": { "External id": 223281, "cbid": 51, "correlation": 223281 } }, { "ph": "s", "id": 223281, "pid": 76337, "tid": -914061504, "ts": 1716454224945031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225000366, "dur": 96, "args": { "External id": 223282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223282, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 223282, "pid": 5, "tid": 7, "ts": 1716454225000366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945032, "dur": 5, "args": { "External id": 223282, "cbid": 211, "correlation": 223282 } }, { "ph": "s", "id": 223282, "pid": 76337, "tid": -914061504, "ts": 1716454224945032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225000464, "dur": 16, "args": { "External id": 223287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223287, "pid": 5, "tid": 7, "ts": 1716454225000464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945059, "dur": 9, "args": { "External id": 223287, "cbid": 211, "correlation": 223287 } }, { "ph": "s", "id": 223287, "pid": 76337, "tid": -914061504, "ts": 1716454224945059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225000481, "dur": 12, "args": { "External id": 223295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223295, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223295, "pid": 5, "tid": 7, "ts": 1716454225000481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945091, "dur": 8, "args": { "External id": 223295, "cbid": 211, "correlation": 223295 } }, { "ph": "s", "id": 223295, "pid": 76337, "tid": -914061504, "ts": 1716454224945091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224945160, "dur": 0, "args": { "External id": 223305, "cbid": 317, "correlation": 223305 } }, { "ph": "f", "id": 223305, "pid": 76337, "tid": -914061504, "ts": 1716454224945160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224945161, "dur": 0, "args": { "External id": 223306, "cbid": 203, "correlation": 223306 } }, { "ph": "f", "id": 223306, "pid": 76337, "tid": -914061504, "ts": 1716454224945161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224945162, "dur": 0, "args": { "External id": 223307, "cbid": 205, "correlation": 223307 } }, { "ph": "f", "id": 223307, "pid": 76337, "tid": -914061504, "ts": 1716454224945162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000494, "dur": 12, "args": { "External id": 223311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223311, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223311, "pid": 5, "tid": 7, "ts": 1716454225000494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945176, "dur": 12, "args": { "External id": 223311, "cbid": 211, "correlation": 223311 } }, { "ph": "s", "id": 223311, "pid": 76337, "tid": -914061504, "ts": 1716454224945176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000508, "dur": 160, "args": { "External id": 223313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223313, "pid": 5, "tid": 7, "ts": 1716454225000508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945191, "dur": 5, "args": { "External id": 223313, "cbid": 211, "correlation": 223313 } }, { "ph": "s", "id": 223313, "pid": 76337, "tid": -914061504, "ts": 1716454224945191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225000669, "dur": 1, "args": { "External id": 223315, "device": 5, "context": 1, "stream": 7, "correlation": 223315, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 223315, "pid": 5, "tid": 7, "ts": 1716454225000669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224945202, "dur": 7, "args": { "External id": 223315, "cbid": 51, "correlation": 223315 } }, { "ph": "s", "id": 223315, "pid": 76337, "tid": -914061504, "ts": 1716454224945202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225000673, "dur": 197, "args": { "External id": 223316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223316, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 223316, "pid": 5, "tid": 7, "ts": 1716454225000673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945210, "dur": 8, "args": { "External id": 223316, "cbid": 211, "correlation": 223316 } }, { "ph": "s", "id": 223316, "pid": 76337, "tid": -914061504, "ts": 1716454224945210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000872, "dur": 6, "args": { "External id": 223318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223318, "pid": 5, "tid": 7, "ts": 1716454225000872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945223, "dur": 5, "args": { "External id": 223318, "cbid": 211, "correlation": 223318 } }, { "ph": "s", "id": 223318, "pid": 76337, "tid": -914061504, "ts": 1716454224945223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225000879, "dur": 6, "args": { "External id": 223324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223324, "pid": 5, "tid": 7, "ts": 1716454225000879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945250, "dur": 10, "args": { "External id": 223324, "cbid": 211, "correlation": 223324 } }, { "ph": "s", "id": 223324, "pid": 76337, "tid": -914061504, "ts": 1716454224945250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225000886, "dur": 11, "args": { "External id": 223344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223344, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223344, "pid": 5, "tid": 7, "ts": 1716454225000886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945344, "dur": 12, "args": { "External id": 223344, "cbid": 211, "correlation": 223344 } }, { "ph": "s", "id": 223344, "pid": 76337, "tid": -914061504, "ts": 1716454224945344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225000898, "dur": 4, "args": { "External id": 223356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223356, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223356, "pid": 5, "tid": 7, "ts": 1716454225000898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945368, "dur": 6, "args": { "External id": 223356, "cbid": 211, "correlation": 223356 } }, { "ph": "s", "id": 223356, "pid": 76337, "tid": -914061504, "ts": 1716454224945368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225000904, "dur": 8, "args": { "External id": 223359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223359, "pid": 5, "tid": 7, "ts": 1716454225000904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945386, "dur": 6, "args": { "External id": 223359, "cbid": 211, "correlation": 223359 } }, { "ph": "s", "id": 223359, "pid": 76337, "tid": -914061504, "ts": 1716454224945386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225000913, "dur": 5, "args": { "External id": 223368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223368, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223368, "pid": 5, "tid": 7, "ts": 1716454225000913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945425, "dur": 9, "args": { "External id": 223368, "cbid": 211, "correlation": 223368 } }, { "ph": "s", "id": 223368, "pid": 76337, "tid": -914061504, "ts": 1716454224945425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224945477, "dur": 0, "args": { "External id": 223378, "cbid": 317, "correlation": 223378 } }, { "ph": "f", "id": 223378, "pid": 76337, "tid": -914061504, "ts": 1716454224945477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224945478, "dur": 0, "args": { "External id": 223379, "cbid": 203, "correlation": 223379 } }, { "ph": "f", "id": 223379, "pid": 76337, "tid": -914061504, "ts": 1716454224945478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224945479, "dur": 0, "args": { "External id": 223380, "cbid": 205, "correlation": 223380 } }, { "ph": "f", "id": 223380, "pid": 76337, "tid": -914061504, "ts": 1716454224945479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000920, "dur": 5, "args": { "External id": 223384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223384, "pid": 5, "tid": 7, "ts": 1716454225000920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945493, "dur": 11, "args": { "External id": 223384, "cbid": 211, "correlation": 223384 } }, { "ph": "s", "id": 223384, "pid": 76337, "tid": -914061504, "ts": 1716454224945493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225000926, "dur": 159, "args": { "External id": 223386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223386, "pid": 5, "tid": 7, "ts": 1716454225000926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945507, "dur": 5, "args": { "External id": 223386, "cbid": 211, "correlation": 223386 } }, { "ph": "s", "id": 223386, "pid": 76337, "tid": -914061504, "ts": 1716454224945507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225001087, "dur": 1, "args": { "External id": 223388, "device": 5, "context": 1, "stream": 7, "correlation": 223388, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 223388, "pid": 5, "tid": 7, "ts": 1716454225001087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224945519, "dur": 6, "args": { "External id": 223388, "cbid": 51, "correlation": 223388 } }, { "ph": "s", "id": 223388, "pid": 76337, "tid": -914061504, "ts": 1716454224945519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225001091, "dur": 266, "args": { "External id": 223389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223389, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223389, "pid": 5, "tid": 7, "ts": 1716454225001091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945526, "dur": 6, "args": { "External id": 223389, "cbid": 211, "correlation": 223389 } }, { "ph": "s", "id": 223389, "pid": 76337, "tid": -914061504, "ts": 1716454224945526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001358, "dur": 6, "args": { "External id": 223391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223391, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223391, "pid": 5, "tid": 7, "ts": 1716454225001358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945536, "dur": 6, "args": { "External id": 223391, "cbid": 211, "correlation": 223391 } }, { "ph": "s", "id": 223391, "pid": 76337, "tid": -914061504, "ts": 1716454224945536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225001365, "dur": 6, "args": { "External id": 223397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223397, "pid": 5, "tid": 7, "ts": 1716454225001365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945566, "dur": 8, "args": { "External id": 223397, "cbid": 211, "correlation": 223397 } }, { "ph": "s", "id": 223397, "pid": 76337, "tid": -914061504, "ts": 1716454224945566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225001372, "dur": 3, "args": { "External id": 223405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223405, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 223405, "pid": 5, "tid": 7, "ts": 1716454225001372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945607, "dur": 10, "args": { "External id": 223405, "cbid": 211, "correlation": 223405 } }, { "ph": "s", "id": 223405, "pid": 76337, "tid": -914061504, "ts": 1716454224945607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224945673, "dur": 1, "args": { "External id": 223421, "cbid": 251, "correlation": 223421 } }, { "ph": "f", "id": 223421, "pid": 76337, "tid": -914061504, "ts": 1716454224945673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224945678, "dur": 0, "args": { "External id": 223423, "cbid": 251, "correlation": 223423 } }, { "ph": "f", "id": 223423, "pid": 76337, "tid": -914061504, "ts": 1716454224945678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225001377, "dur": 13, "args": { "External id": 223424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223424, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223424, "pid": 5, "tid": 7, "ts": 1716454225001377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945680, "dur": 11, "args": { "External id": 223424, "cbid": 211, "correlation": 223424 } }, { "ph": "s", "id": 223424, "pid": 76337, "tid": -914061504, "ts": 1716454224945680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225001391, "dur": 5, "args": { "External id": 223426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223426, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223426, "pid": 5, "tid": 7, "ts": 1716454225001391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945693, "dur": 5, "args": { "External id": 223426, "cbid": 211, "correlation": 223426 } }, { "ph": "s", "id": 223426, "pid": 76337, "tid": -914061504, "ts": 1716454224945693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225001397, "dur": 6, "args": { "External id": 223436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223436, "pid": 5, "tid": 7, "ts": 1716454225001397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945751, "dur": 13, "args": { "External id": 223436, "cbid": 211, "correlation": 223436 } }, { "ph": "s", "id": 223436, "pid": 76337, "tid": -914061504, "ts": 1716454224945751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225001404, "dur": 9, "args": { "External id": 223456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223456, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223456, "pid": 5, "tid": 7, "ts": 1716454225001404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945818, "dur": 11, "args": { "External id": 223456, "cbid": 211, "correlation": 223456 } }, { "ph": "s", "id": 223456, "pid": 76337, "tid": -914061504, "ts": 1716454224945818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225001415, "dur": 4, "args": { "External id": 223468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223468, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223468, "pid": 5, "tid": 7, "ts": 1716454225001415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945839, "dur": 6, "args": { "External id": 223468, "cbid": 211, "correlation": 223468 } }, { "ph": "s", "id": 223468, "pid": 76337, "tid": -914061504, "ts": 1716454224945839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225001420, "dur": 7, "args": { "External id": 223471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223471, "pid": 5, "tid": 7, "ts": 1716454225001420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945857, "dur": 8, "args": { "External id": 223471, "cbid": 211, "correlation": 223471 } }, { "ph": "s", "id": 223471, "pid": 76337, "tid": -914061504, "ts": 1716454224945857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225001428, "dur": 4, "args": { "External id": 223480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223480, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223480, "pid": 5, "tid": 7, "ts": 1716454225001428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945898, "dur": 9, "args": { "External id": 223480, "cbid": 211, "correlation": 223480 } }, { "ph": "s", "id": 223480, "pid": 76337, "tid": -914061504, "ts": 1716454224945898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224945960, "dur": 0, "args": { "External id": 223490, "cbid": 317, "correlation": 223490 } }, { "ph": "f", "id": 223490, "pid": 76337, "tid": -914061504, "ts": 1716454224945960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224945961, "dur": 0, "args": { "External id": 223491, "cbid": 203, "correlation": 223491 } }, { "ph": "f", "id": 223491, "pid": 76337, "tid": -914061504, "ts": 1716454224945961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224945961, "dur": 0, "args": { "External id": 223492, "cbid": 205, "correlation": 223492 } }, { "ph": "f", "id": 223492, "pid": 76337, "tid": -914061504, "ts": 1716454224945961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001434, "dur": 5, "args": { "External id": 223496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223496, "pid": 5, "tid": 7, "ts": 1716454225001434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945983, "dur": 12, "args": { "External id": 223496, "cbid": 211, "correlation": 223496 } }, { "ph": "s", "id": 223496, "pid": 76337, "tid": -914061504, "ts": 1716454224945983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001440, "dur": 158, "args": { "External id": 223498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223498, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223498, "pid": 5, "tid": 7, "ts": 1716454225001440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224945998, "dur": 5, "args": { "External id": 223498, "cbid": 211, "correlation": 223498 } }, { "ph": "s", "id": 223498, "pid": 76337, "tid": -914061504, "ts": 1716454224945998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225001600, "dur": 1, "args": { "External id": 223500, "device": 5, "context": 1, "stream": 7, "correlation": 223500, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 223500, "pid": 5, "tid": 7, "ts": 1716454225001600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224946009, "dur": 7, "args": { "External id": 223500, "cbid": 51, "correlation": 223500 } }, { "ph": "s", "id": 223500, "pid": 76337, "tid": -914061504, "ts": 1716454224946009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225001604, "dur": 255, "args": { "External id": 223501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223501, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223501, "pid": 5, "tid": 7, "ts": 1716454225001604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946017, "dur": 6, "args": { "External id": 223501, "cbid": 211, "correlation": 223501 } }, { "ph": "s", "id": 223501, "pid": 76337, "tid": -914061504, "ts": 1716454224946017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001860, "dur": 6, "args": { "External id": 223503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223503, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223503, "pid": 5, "tid": 7, "ts": 1716454225001860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946027, "dur": 5, "args": { "External id": 223503, "cbid": 211, "correlation": 223503 } }, { "ph": "s", "id": 223503, "pid": 76337, "tid": -914061504, "ts": 1716454224946027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225001867, "dur": 6, "args": { "External id": 223509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223509, "pid": 5, "tid": 7, "ts": 1716454225001867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946055, "dur": 9, "args": { "External id": 223509, "cbid": 211, "correlation": 223509 } }, { "ph": "s", "id": 223509, "pid": 76337, "tid": -914061504, "ts": 1716454224946055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225001875, "dur": 5, "args": { "External id": 223517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223517, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223517, "pid": 5, "tid": 7, "ts": 1716454225001875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946087, "dur": 8, "args": { "External id": 223517, "cbid": 211, "correlation": 223517 } }, { "ph": "s", "id": 223517, "pid": 76337, "tid": -914061504, "ts": 1716454224946087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225001881, "dur": 4, "args": { "External id": 223525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223525, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223525, "pid": 5, "tid": 7, "ts": 1716454225001881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946117, "dur": 9, "args": { "External id": 223525, "cbid": 211, "correlation": 223525 } }, { "ph": "s", "id": 223525, "pid": 76337, "tid": -914061504, "ts": 1716454224946117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225001887, "dur": 9, "args": { "External id": 223545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223545, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223545, "pid": 5, "tid": 7, "ts": 1716454225001887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946191, "dur": 11, "args": { "External id": 223545, "cbid": 211, "correlation": 223545 } }, { "ph": "s", "id": 223545, "pid": 76337, "tid": -914061504, "ts": 1716454224946191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225001897, "dur": 4, "args": { "External id": 223557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223557, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223557, "pid": 5, "tid": 7, "ts": 1716454225001897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946212, "dur": 6, "args": { "External id": 223557, "cbid": 211, "correlation": 223557 } }, { "ph": "s", "id": 223557, "pid": 76337, "tid": -914061504, "ts": 1716454224946212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225001902, "dur": 6, "args": { "External id": 223560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223560, "pid": 5, "tid": 7, "ts": 1716454225001902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946230, "dur": 6, "args": { "External id": 223560, "cbid": 211, "correlation": 223560 } }, { "ph": "s", "id": 223560, "pid": 76337, "tid": -914061504, "ts": 1716454224946230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225001910, "dur": 4, "args": { "External id": 223569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223569, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223569, "pid": 5, "tid": 7, "ts": 1716454225001910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946267, "dur": 9, "args": { "External id": 223569, "cbid": 211, "correlation": 223569 } }, { "ph": "s", "id": 223569, "pid": 76337, "tid": -914061504, "ts": 1716454224946267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224946317, "dur": 0, "args": { "External id": 223579, "cbid": 317, "correlation": 223579 } }, { "ph": "f", "id": 223579, "pid": 76337, "tid": -914061504, "ts": 1716454224946317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224946318, "dur": 0, "args": { "External id": 223580, "cbid": 203, "correlation": 223580 } }, { "ph": "f", "id": 223580, "pid": 76337, "tid": -914061504, "ts": 1716454224946318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224946318, "dur": 0, "args": { "External id": 223581, "cbid": 205, "correlation": 223581 } }, { "ph": "f", "id": 223581, "pid": 76337, "tid": -914061504, "ts": 1716454224946318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001916, "dur": 5, "args": { "External id": 223585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223585, "pid": 5, "tid": 7, "ts": 1716454225001916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946332, "dur": 11, "args": { "External id": 223585, "cbid": 211, "correlation": 223585 } }, { "ph": "s", "id": 223585, "pid": 76337, "tid": -914061504, "ts": 1716454224946332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225001922, "dur": 160, "args": { "External id": 223587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223587, "pid": 5, "tid": 7, "ts": 1716454225001922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946346, "dur": 5, "args": { "External id": 223587, "cbid": 211, "correlation": 223587 } }, { "ph": "s", "id": 223587, "pid": 76337, "tid": -914061504, "ts": 1716454224946346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225002084, "dur": 1, "args": { "External id": 223589, "device": 5, "context": 1, "stream": 7, "correlation": 223589, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 223589, "pid": 5, "tid": 7, "ts": 1716454225002084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224946356, "dur": 6, "args": { "External id": 223589, "cbid": 51, "correlation": 223589 } }, { "ph": "s", "id": 223589, "pid": 76337, "tid": -914061504, "ts": 1716454224946356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225002088, "dur": 254, "args": { "External id": 223590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223590, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223590, "pid": 5, "tid": 7, "ts": 1716454225002088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946363, "dur": 6, "args": { "External id": 223590, "cbid": 211, "correlation": 223590 } }, { "ph": "s", "id": 223590, "pid": 76337, "tid": -914061504, "ts": 1716454224946363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002343, "dur": 6, "args": { "External id": 223592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223592, "pid": 5, "tid": 7, "ts": 1716454225002343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946373, "dur": 6, "args": { "External id": 223592, "cbid": 211, "correlation": 223592 } }, { "ph": "s", "id": 223592, "pid": 76337, "tid": -914061504, "ts": 1716454224946373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225002350, "dur": 6, "args": { "External id": 223598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223598, "pid": 5, "tid": 7, "ts": 1716454225002350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946402, "dur": 8, "args": { "External id": 223598, "cbid": 211, "correlation": 223598 } }, { "ph": "s", "id": 223598, "pid": 76337, "tid": -914061504, "ts": 1716454224946402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225002357, "dur": 3, "args": { "External id": 223606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223606, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 223606, "pid": 5, "tid": 7, "ts": 1716454225002357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946445, "dur": 8, "args": { "External id": 223606, "cbid": 211, "correlation": 223606 } }, { "ph": "s", "id": 223606, "pid": 76337, "tid": -914061504, "ts": 1716454224946445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224946507, "dur": 1, "args": { "External id": 223622, "cbid": 251, "correlation": 223622 } }, { "ph": "f", "id": 223622, "pid": 76337, "tid": -914061504, "ts": 1716454224946507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224946512, "dur": 0, "args": { "External id": 223624, "cbid": 251, "correlation": 223624 } }, { "ph": "f", "id": 223624, "pid": 76337, "tid": -914061504, "ts": 1716454224946512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225002362, "dur": 9, "args": { "External id": 223625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223625, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223625, "pid": 5, "tid": 7, "ts": 1716454225002362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946514, "dur": 11, "args": { "External id": 223625, "cbid": 211, "correlation": 223625 } }, { "ph": "s", "id": 223625, "pid": 76337, "tid": -914061504, "ts": 1716454224946514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225002373, "dur": 3, "args": { "External id": 223627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223627, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223627, "pid": 5, "tid": 7, "ts": 1716454225002373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946527, "dur": 5, "args": { "External id": 223627, "cbid": 211, "correlation": 223627 } }, { "ph": "s", "id": 223627, "pid": 76337, "tid": -914061504, "ts": 1716454224946527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225002377, "dur": 6, "args": { "External id": 223637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223637, "pid": 5, "tid": 7, "ts": 1716454225002377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946582, "dur": 12, "args": { "External id": 223637, "cbid": 211, "correlation": 223637 } }, { "ph": "s", "id": 223637, "pid": 76337, "tid": -914061504, "ts": 1716454224946582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225002384, "dur": 9, "args": { "External id": 223657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223657, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223657, "pid": 5, "tid": 7, "ts": 1716454225002384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946646, "dur": 10, "args": { "External id": 223657, "cbid": 211, "correlation": 223657 } }, { "ph": "s", "id": 223657, "pid": 76337, "tid": -914061504, "ts": 1716454224946646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225002395, "dur": 4, "args": { "External id": 223669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223669, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223669, "pid": 5, "tid": 7, "ts": 1716454225002395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946666, "dur": 6, "args": { "External id": 223669, "cbid": 211, "correlation": 223669 } }, { "ph": "s", "id": 223669, "pid": 76337, "tid": -914061504, "ts": 1716454224946666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225002400, "dur": 6, "args": { "External id": 223672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223672, "pid": 5, "tid": 7, "ts": 1716454225002400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946685, "dur": 7, "args": { "External id": 223672, "cbid": 211, "correlation": 223672 } }, { "ph": "s", "id": 223672, "pid": 76337, "tid": -914061504, "ts": 1716454224946685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225002407, "dur": 4, "args": { "External id": 223681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223681, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223681, "pid": 5, "tid": 7, "ts": 1716454225002407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946726, "dur": 9, "args": { "External id": 223681, "cbid": 211, "correlation": 223681 } }, { "ph": "s", "id": 223681, "pid": 76337, "tid": -914061504, "ts": 1716454224946726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224946786, "dur": 0, "args": { "External id": 223691, "cbid": 317, "correlation": 223691 } }, { "ph": "f", "id": 223691, "pid": 76337, "tid": -914061504, "ts": 1716454224946786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224946787, "dur": 0, "args": { "External id": 223692, "cbid": 203, "correlation": 223692 } }, { "ph": "f", "id": 223692, "pid": 76337, "tid": -914061504, "ts": 1716454224946787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224946788, "dur": 0, "args": { "External id": 223693, "cbid": 205, "correlation": 223693 } }, { "ph": "f", "id": 223693, "pid": 76337, "tid": -914061504, "ts": 1716454224946788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002413, "dur": 5, "args": { "External id": 223697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223697, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223697, "pid": 5, "tid": 7, "ts": 1716454225002413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946802, "dur": 12, "args": { "External id": 223697, "cbid": 211, "correlation": 223697 } }, { "ph": "s", "id": 223697, "pid": 76337, "tid": -914061504, "ts": 1716454224946802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002419, "dur": 160, "args": { "External id": 223699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223699, "pid": 5, "tid": 7, "ts": 1716454225002419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946816, "dur": 5, "args": { "External id": 223699, "cbid": 211, "correlation": 223699 } }, { "ph": "s", "id": 223699, "pid": 76337, "tid": -914061504, "ts": 1716454224946816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225002581, "dur": 1, "args": { "External id": 223701, "device": 5, "context": 1, "stream": 7, "correlation": 223701, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 223701, "pid": 5, "tid": 7, "ts": 1716454225002581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224946827, "dur": 6, "args": { "External id": 223701, "cbid": 51, "correlation": 223701 } }, { "ph": "s", "id": 223701, "pid": 76337, "tid": -914061504, "ts": 1716454224946827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225002585, "dur": 255, "args": { "External id": 223702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223702, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223702, "pid": 5, "tid": 7, "ts": 1716454225002585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946834, "dur": 6, "args": { "External id": 223702, "cbid": 211, "correlation": 223702 } }, { "ph": "s", "id": 223702, "pid": 76337, "tid": -914061504, "ts": 1716454224946834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002841, "dur": 6, "args": { "External id": 223704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223704, "pid": 5, "tid": 7, "ts": 1716454225002841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946843, "dur": 5, "args": { "External id": 223704, "cbid": 211, "correlation": 223704 } }, { "ph": "s", "id": 223704, "pid": 76337, "tid": -914061504, "ts": 1716454224946843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225002848, "dur": 6, "args": { "External id": 223710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223710, "pid": 5, "tid": 7, "ts": 1716454225002848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946871, "dur": 9, "args": { "External id": 223710, "cbid": 211, "correlation": 223710 } }, { "ph": "s", "id": 223710, "pid": 76337, "tid": -914061504, "ts": 1716454224946871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225002855, "dur": 5, "args": { "External id": 223718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223718, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223718, "pid": 5, "tid": 7, "ts": 1716454225002855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946903, "dur": 8, "args": { "External id": 223718, "cbid": 211, "correlation": 223718 } }, { "ph": "s", "id": 223718, "pid": 76337, "tid": -914061504, "ts": 1716454224946903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225002861, "dur": 4, "args": { "External id": 223726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223726, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223726, "pid": 5, "tid": 7, "ts": 1716454225002861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224946933, "dur": 9, "args": { "External id": 223726, "cbid": 211, "correlation": 223726 } }, { "ph": "s", "id": 223726, "pid": 76337, "tid": -914061504, "ts": 1716454224946933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225002867, "dur": 9, "args": { "External id": 223746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223746, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223746, "pid": 5, "tid": 7, "ts": 1716454225002867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947046, "dur": 12, "args": { "External id": 223746, "cbid": 211, "correlation": 223746 } }, { "ph": "s", "id": 223746, "pid": 76337, "tid": -914061504, "ts": 1716454224947046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225002878, "dur": 4, "args": { "External id": 223758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223758, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223758, "pid": 5, "tid": 7, "ts": 1716454225002878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947068, "dur": 6, "args": { "External id": 223758, "cbid": 211, "correlation": 223758 } }, { "ph": "s", "id": 223758, "pid": 76337, "tid": -914061504, "ts": 1716454224947068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225002883, "dur": 7, "args": { "External id": 223761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223761, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223761, "pid": 5, "tid": 7, "ts": 1716454225002883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947086, "dur": 7, "args": { "External id": 223761, "cbid": 211, "correlation": 223761 } }, { "ph": "s", "id": 223761, "pid": 76337, "tid": -914061504, "ts": 1716454224947086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225002891, "dur": 4, "args": { "External id": 223770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223770, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223770, "pid": 5, "tid": 7, "ts": 1716454225002891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947124, "dur": 9, "args": { "External id": 223770, "cbid": 211, "correlation": 223770 } }, { "ph": "s", "id": 223770, "pid": 76337, "tid": -914061504, "ts": 1716454224947124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224947176, "dur": 0, "args": { "External id": 223780, "cbid": 317, "correlation": 223780 } }, { "ph": "f", "id": 223780, "pid": 76337, "tid": -914061504, "ts": 1716454224947176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224947177, "dur": 0, "args": { "External id": 223781, "cbid": 203, "correlation": 223781 } }, { "ph": "f", "id": 223781, "pid": 76337, "tid": -914061504, "ts": 1716454224947177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224947177, "dur": 0, "args": { "External id": 223782, "cbid": 205, "correlation": 223782 } }, { "ph": "f", "id": 223782, "pid": 76337, "tid": -914061504, "ts": 1716454224947177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002897, "dur": 5, "args": { "External id": 223786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223786, "pid": 5, "tid": 7, "ts": 1716454225002897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947190, "dur": 11, "args": { "External id": 223786, "cbid": 211, "correlation": 223786 } }, { "ph": "s", "id": 223786, "pid": 76337, "tid": -914061504, "ts": 1716454224947190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225002903, "dur": 159, "args": { "External id": 223788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223788, "pid": 5, "tid": 7, "ts": 1716454225002903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947204, "dur": 5, "args": { "External id": 223788, "cbid": 211, "correlation": 223788 } }, { "ph": "s", "id": 223788, "pid": 76337, "tid": -914061504, "ts": 1716454224947204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225003064, "dur": 1, "args": { "External id": 223790, "device": 5, "context": 1, "stream": 7, "correlation": 223790, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 223790, "pid": 5, "tid": 7, "ts": 1716454225003064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224947214, "dur": 6, "args": { "External id": 223790, "cbid": 51, "correlation": 223790 } }, { "ph": "s", "id": 223790, "pid": 76337, "tid": -914061504, "ts": 1716454224947214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225003068, "dur": 254, "args": { "External id": 223791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223791, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223791, "pid": 5, "tid": 7, "ts": 1716454225003068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947221, "dur": 6, "args": { "External id": 223791, "cbid": 211, "correlation": 223791 } }, { "ph": "s", "id": 223791, "pid": 76337, "tid": -914061504, "ts": 1716454224947221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003323, "dur": 5, "args": { "External id": 223793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223793, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223793, "pid": 5, "tid": 7, "ts": 1716454225003323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947233, "dur": 6, "args": { "External id": 223793, "cbid": 211, "correlation": 223793 } }, { "ph": "s", "id": 223793, "pid": 76337, "tid": -914061504, "ts": 1716454224947233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003330, "dur": 6, "args": { "External id": 223799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223799, "pid": 5, "tid": 7, "ts": 1716454225003330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947261, "dur": 8, "args": { "External id": 223799, "cbid": 211, "correlation": 223799 } }, { "ph": "s", "id": 223799, "pid": 76337, "tid": -914061504, "ts": 1716454224947261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225003337, "dur": 3, "args": { "External id": 223807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223807, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 223807, "pid": 5, "tid": 7, "ts": 1716454225003337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947304, "dur": 9, "args": { "External id": 223807, "cbid": 211, "correlation": 223807 } }, { "ph": "s", "id": 223807, "pid": 76337, "tid": -914061504, "ts": 1716454224947304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224947365, "dur": 1, "args": { "External id": 223823, "cbid": 251, "correlation": 223823 } }, { "ph": "f", "id": 223823, "pid": 76337, "tid": -914061504, "ts": 1716454224947365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224947371, "dur": 0, "args": { "External id": 223825, "cbid": 251, "correlation": 223825 } }, { "ph": "f", "id": 223825, "pid": 76337, "tid": -914061504, "ts": 1716454224947371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225003342, "dur": 10, "args": { "External id": 223826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223826, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223826, "pid": 5, "tid": 7, "ts": 1716454225003342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947373, "dur": 11, "args": { "External id": 223826, "cbid": 211, "correlation": 223826 } }, { "ph": "s", "id": 223826, "pid": 76337, "tid": -914061504, "ts": 1716454224947373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225003353, "dur": 3, "args": { "External id": 223828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223828, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223828, "pid": 5, "tid": 7, "ts": 1716454225003353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947386, "dur": 6, "args": { "External id": 223828, "cbid": 211, "correlation": 223828 } }, { "ph": "s", "id": 223828, "pid": 76337, "tid": -914061504, "ts": 1716454224947386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003358, "dur": 6, "args": { "External id": 223838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223838, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223838, "pid": 5, "tid": 7, "ts": 1716454225003358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947442, "dur": 12, "args": { "External id": 223838, "cbid": 211, "correlation": 223838 } }, { "ph": "s", "id": 223838, "pid": 76337, "tid": -914061504, "ts": 1716454224947442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225003365, "dur": 9, "args": { "External id": 223858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223858, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223858, "pid": 5, "tid": 7, "ts": 1716454225003365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947508, "dur": 10, "args": { "External id": 223858, "cbid": 211, "correlation": 223858 } }, { "ph": "s", "id": 223858, "pid": 76337, "tid": -914061504, "ts": 1716454224947508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225003376, "dur": 4, "args": { "External id": 223870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223870, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223870, "pid": 5, "tid": 7, "ts": 1716454225003376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947529, "dur": 6, "args": { "External id": 223870, "cbid": 211, "correlation": 223870 } }, { "ph": "s", "id": 223870, "pid": 76337, "tid": -914061504, "ts": 1716454224947529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003381, "dur": 6, "args": { "External id": 223873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223873, "pid": 5, "tid": 7, "ts": 1716454225003381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947547, "dur": 7, "args": { "External id": 223873, "cbid": 211, "correlation": 223873 } }, { "ph": "s", "id": 223873, "pid": 76337, "tid": -914061504, "ts": 1716454224947547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225003388, "dur": 4, "args": { "External id": 223882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223882, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223882, "pid": 5, "tid": 7, "ts": 1716454225003388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947587, "dur": 9, "args": { "External id": 223882, "cbid": 211, "correlation": 223882 } }, { "ph": "s", "id": 223882, "pid": 76337, "tid": -914061504, "ts": 1716454224947587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224947650, "dur": 0, "args": { "External id": 223892, "cbid": 317, "correlation": 223892 } }, { "ph": "f", "id": 223892, "pid": 76337, "tid": -914061504, "ts": 1716454224947650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224947651, "dur": 0, "args": { "External id": 223893, "cbid": 203, "correlation": 223893 } }, { "ph": "f", "id": 223893, "pid": 76337, "tid": -914061504, "ts": 1716454224947651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224947652, "dur": 0, "args": { "External id": 223894, "cbid": 205, "correlation": 223894 } }, { "ph": "f", "id": 223894, "pid": 76337, "tid": -914061504, "ts": 1716454224947652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003394, "dur": 5, "args": { "External id": 223898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223898, "pid": 5, "tid": 7, "ts": 1716454225003394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947665, "dur": 13, "args": { "External id": 223898, "cbid": 211, "correlation": 223898 } }, { "ph": "s", "id": 223898, "pid": 76337, "tid": -914061504, "ts": 1716454224947665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003400, "dur": 159, "args": { "External id": 223900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223900, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223900, "pid": 5, "tid": 7, "ts": 1716454225003400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947681, "dur": 5, "args": { "External id": 223900, "cbid": 211, "correlation": 223900 } }, { "ph": "s", "id": 223900, "pid": 76337, "tid": -914061504, "ts": 1716454224947681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225003562, "dur": 1, "args": { "External id": 223902, "device": 5, "context": 1, "stream": 7, "correlation": 223902, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 223902, "pid": 5, "tid": 7, "ts": 1716454225003562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224947691, "dur": 6, "args": { "External id": 223902, "cbid": 51, "correlation": 223902 } }, { "ph": "s", "id": 223902, "pid": 76337, "tid": -914061504, "ts": 1716454224947691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225003565, "dur": 254, "args": { "External id": 223903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223903, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 223903, "pid": 5, "tid": 7, "ts": 1716454225003565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947699, "dur": 6, "args": { "External id": 223903, "cbid": 211, "correlation": 223903 } }, { "ph": "s", "id": 223903, "pid": 76337, "tid": -914061504, "ts": 1716454224947699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003821, "dur": 6, "args": { "External id": 223905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223905, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223905, "pid": 5, "tid": 7, "ts": 1716454225003821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947708, "dur": 5, "args": { "External id": 223905, "cbid": 211, "correlation": 223905 } }, { "ph": "s", "id": 223905, "pid": 76337, "tid": -914061504, "ts": 1716454224947708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003828, "dur": 6, "args": { "External id": 223911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223911, "pid": 5, "tid": 7, "ts": 1716454225003828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947736, "dur": 8, "args": { "External id": 223911, "cbid": 211, "correlation": 223911 } }, { "ph": "s", "id": 223911, "pid": 76337, "tid": -914061504, "ts": 1716454224947736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225003835, "dur": 5, "args": { "External id": 223919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223919, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223919, "pid": 5, "tid": 7, "ts": 1716454225003835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947768, "dur": 9, "args": { "External id": 223919, "cbid": 211, "correlation": 223919 } }, { "ph": "s", "id": 223919, "pid": 76337, "tid": -914061504, "ts": 1716454224947768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225003841, "dur": 4, "args": { "External id": 223927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223927, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223927, "pid": 5, "tid": 7, "ts": 1716454225003841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947798, "dur": 9, "args": { "External id": 223927, "cbid": 211, "correlation": 223927 } }, { "ph": "s", "id": 223927, "pid": 76337, "tid": -914061504, "ts": 1716454224947798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225003847, "dur": 9, "args": { "External id": 223947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223947, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 223947, "pid": 5, "tid": 7, "ts": 1716454225003847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947938, "dur": 14, "args": { "External id": 223947, "cbid": 211, "correlation": 223947 } }, { "ph": "s", "id": 223947, "pid": 76337, "tid": -914061504, "ts": 1716454224947938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225003858, "dur": 4, "args": { "External id": 223959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223959, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 223959, "pid": 5, "tid": 7, "ts": 1716454225003858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947962, "dur": 6, "args": { "External id": 223959, "cbid": 211, "correlation": 223959 } }, { "ph": "s", "id": 223959, "pid": 76337, "tid": -914061504, "ts": 1716454224947962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003863, "dur": 6, "args": { "External id": 223962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223962, "pid": 5, "tid": 7, "ts": 1716454225003863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224947990, "dur": 7, "args": { "External id": 223962, "cbid": 211, "correlation": 223962 } }, { "ph": "s", "id": 223962, "pid": 76337, "tid": -914061504, "ts": 1716454224947990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224948050, "dur": 0, "args": { "External id": 223973, "cbid": 317, "correlation": 223973 } }, { "ph": "f", "id": 223973, "pid": 76337, "tid": -914061504, "ts": 1716454224948050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224948051, "dur": 0, "args": { "External id": 223974, "cbid": 203, "correlation": 223974 } }, { "ph": "f", "id": 223974, "pid": 76337, "tid": -914061504, "ts": 1716454224948051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224948051, "dur": 0, "args": { "External id": 223975, "cbid": 205, "correlation": 223975 } }, { "ph": "f", "id": 223975, "pid": 76337, "tid": -914061504, "ts": 1716454224948051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003871, "dur": 5, "args": { "External id": 223979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223979, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223979, "pid": 5, "tid": 7, "ts": 1716454225003871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948068, "dur": 12, "args": { "External id": 223979, "cbid": 211, "correlation": 223979 } }, { "ph": "s", "id": 223979, "pid": 76337, "tid": -914061504, "ts": 1716454224948068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225003877, "dur": 36, "args": { "External id": 223981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223981, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 223981, "pid": 5, "tid": 7, "ts": 1716454225003877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948087, "dur": 8, "args": { "External id": 223981, "cbid": 211, "correlation": 223981 } }, { "ph": "s", "id": 223981, "pid": 76337, "tid": -914061504, "ts": 1716454224948087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225003915, "dur": 5, "args": { "External id": 223983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223983, "pid": 5, "tid": 7, "ts": 1716454225003915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948099, "dur": 5, "args": { "External id": 223983, "cbid": 211, "correlation": 223983 } }, { "ph": "s", "id": 223983, "pid": 76337, "tid": -914061504, "ts": 1716454224948099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225003921, "dur": 6, "args": { "External id": 223989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 223989, "pid": 5, "tid": 7, "ts": 1716454225003921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948126, "dur": 8, "args": { "External id": 223989, "cbid": 211, "correlation": 223989 } }, { "ph": "s", "id": 223989, "pid": 76337, "tid": -914061504, "ts": 1716454224948126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225003928, "dur": 20, "args": { "External id": 223998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 223998, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 223998, "pid": 5, "tid": 7, "ts": 1716454225003928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948209, "dur": 14, "args": { "External id": 223998, "cbid": 211, "correlation": 223998 } }, { "ph": "s", "id": 223998, "pid": 76337, "tid": -914061504, "ts": 1716454224948209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225003949, "dur": 10, "args": { "External id": 224020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224020, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 224020, "pid": 5, "tid": 7, "ts": 1716454225003949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948266, "dur": 11, "args": { "External id": 224020, "cbid": 211, "correlation": 224020 } }, { "ph": "s", "id": 224020, "pid": 76337, "tid": -914061504, "ts": 1716454224948266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948355, "dur": 2, "args": { "External id": 224031, "cbid": 251, "correlation": 224031 } }, { "ph": "f", "id": 224031, "pid": 76337, "tid": -914061504, "ts": 1716454224948355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948360, "dur": 0, "args": { "External id": 224032, "cbid": 251, "correlation": 224032 } }, { "ph": "f", "id": 224032, "pid": 76337, "tid": -914061504, "ts": 1716454224948360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225003961, "dur": 53, "args": { "External id": 224033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224033, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 224033, "pid": 5, "tid": 7, "ts": 1716454225003961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948363, "dur": 15, "args": { "External id": 224033, "cbid": 211, "correlation": 224033 } }, { "ph": "s", "id": 224033, "pid": 76337, "tid": -914061504, "ts": 1716454224948363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948435, "dur": 1, "args": { "External id": 224044, "cbid": 251, "correlation": 224044 } }, { "ph": "f", "id": 224044, "pid": 76337, "tid": -914061504, "ts": 1716454224948435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948438, "dur": 0, "args": { "External id": 224045, "cbid": 251, "correlation": 224045 } }, { "ph": "f", "id": 224045, "pid": 76337, "tid": -914061504, "ts": 1716454224948438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225004015, "dur": 51, "args": { "External id": 224046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224046, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 224046, "pid": 5, "tid": 7, "ts": 1716454225004015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948440, "dur": 12, "args": { "External id": 224046, "cbid": 211, "correlation": 224046 } }, { "ph": "s", "id": 224046, "pid": 76337, "tid": -914061504, "ts": 1716454224948440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948506, "dur": 1, "args": { "External id": 224057, "cbid": 251, "correlation": 224057 } }, { "ph": "f", "id": 224057, "pid": 76337, "tid": -914061504, "ts": 1716454224948506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948510, "dur": 0, "args": { "External id": 224058, "cbid": 251, "correlation": 224058 } }, { "ph": "f", "id": 224058, "pid": 76337, "tid": -914061504, "ts": 1716454224948510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225004067, "dur": 52, "args": { "External id": 224059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224059, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 224059, "pid": 5, "tid": 7, "ts": 1716454225004067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948512, "dur": 12, "args": { "External id": 224059, "cbid": 211, "correlation": 224059 } }, { "ph": "s", "id": 224059, "pid": 76337, "tid": -914061504, "ts": 1716454224948512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225004121, "dur": 56, "args": { "External id": 224084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224084, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224084, "pid": 5, "tid": 7, "ts": 1716454225004121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948597, "dur": 13, "args": { "External id": 224084, "cbid": 211, "correlation": 224084 } }, { "ph": "s", "id": 224084, "pid": 76337, "tid": -914061504, "ts": 1716454224948597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224948697, "dur": 1, "args": { "External id": 224102, "cbid": 251, "correlation": 224102 } }, { "ph": "f", "id": 224102, "pid": 76337, "tid": -914061504, "ts": 1716454224948697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225004178, "dur": 61, "args": { "External id": 224104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224104, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 224104, "pid": 5, "tid": 7, "ts": 1716454225004178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948702, "dur": 13, "args": { "External id": 224104, "cbid": 211, "correlation": 224104 } }, { "ph": "s", "id": 224104, "pid": 76337, "tid": -914061504, "ts": 1716454224948702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004241, "dur": 6, "args": { "External id": 224112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224112, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224112, "pid": 5, "tid": 7, "ts": 1716454225004241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948772, "dur": 13, "args": { "External id": 224112, "cbid": 211, "correlation": 224112 } }, { "ph": "s", "id": 224112, "pid": 76337, "tid": -914061504, "ts": 1716454224948772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004248, "dur": 7, "args": { "External id": 224120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224120, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224120, "pid": 5, "tid": 7, "ts": 1716454225004248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948811, "dur": 9, "args": { "External id": 224120, "cbid": 211, "correlation": 224120 } }, { "ph": "s", "id": 224120, "pid": 76337, "tid": -914061504, "ts": 1716454224948811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004257, "dur": 8, "args": { "External id": 224131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224131, "pid": 5, "tid": 7, "ts": 1716454225004257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948885, "dur": 13, "args": { "External id": 224131, "cbid": 211, "correlation": 224131 } }, { "ph": "s", "id": 224131, "pid": 76337, "tid": -914061504, "ts": 1716454224948885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225004266, "dur": 9, "args": { "External id": 224153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224153, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 224153, "pid": 5, "tid": 7, "ts": 1716454225004266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224948917, "dur": 9, "args": { "External id": 224153, "cbid": 211, "correlation": 224153 } }, { "ph": "s", "id": 224153, "pid": 76337, "tid": -914061504, "ts": 1716454224948917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949011, "dur": 3, "args": { "External id": 224164, "cbid": 251, "correlation": 224164 } }, { "ph": "f", "id": 224164, "pid": 76337, "tid": -914061504, "ts": 1716454224949011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225004277, "dur": 1, "args": { "External id": 224165, "device": 5, "context": 1, "stream": 7, "correlation": 224165, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 224165, "pid": 5, "tid": 7, "ts": 1716454225004277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224949017, "dur": 11, "args": { "External id": 224165, "cbid": 51, "correlation": 224165 } }, { "ph": "s", "id": 224165, "pid": 76337, "tid": -914061504, "ts": 1716454224949017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225004280, "dur": 36, "args": { "External id": 224166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224166, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 224166, "pid": 5, "tid": 7, "ts": 1716454225004280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949031, "dur": 13, "args": { "External id": 224166, "cbid": 211, "correlation": 224166 } }, { "ph": "s", "id": 224166, "pid": 76337, "tid": -914061504, "ts": 1716454224949031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949104, "dur": 1, "args": { "External id": 224177, "cbid": 251, "correlation": 224177 } }, { "ph": "f", "id": 224177, "pid": 76337, "tid": -914061504, "ts": 1716454224949104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949108, "dur": 0, "args": { "External id": 224178, "cbid": 251, "correlation": 224178 } }, { "ph": "f", "id": 224178, "pid": 76337, "tid": -914061504, "ts": 1716454224949108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225004317, "dur": 12, "args": { "External id": 224179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224179, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224179, "pid": 5, "tid": 7, "ts": 1716454225004317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949109, "dur": 12, "args": { "External id": 224179, "cbid": 211, "correlation": 224179 } }, { "ph": "s", "id": 224179, "pid": 76337, "tid": -914061504, "ts": 1716454224949109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225004330, "dur": 5, "args": { "External id": 224181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224181, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224181, "pid": 5, "tid": 7, "ts": 1716454225004330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949123, "dur": 7, "args": { "External id": 224181, "cbid": 211, "correlation": 224181 } }, { "ph": "s", "id": 224181, "pid": 76337, "tid": -914061504, "ts": 1716454224949123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949182, "dur": 1, "args": { "External id": 224192, "cbid": 251, "correlation": 224192 } }, { "ph": "f", "id": 224192, "pid": 76337, "tid": -914061504, "ts": 1716454224949182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949185, "dur": 0, "args": { "External id": 224193, "cbid": 251, "correlation": 224193 } }, { "ph": "f", "id": 224193, "pid": 76337, "tid": -914061504, "ts": 1716454224949185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225004336, "dur": 8, "args": { "External id": 224194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224194, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224194, "pid": 5, "tid": 7, "ts": 1716454225004336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949186, "dur": 11, "args": { "External id": 224194, "cbid": 211, "correlation": 224194 } }, { "ph": "s", "id": 224194, "pid": 76337, "tid": -914061504, "ts": 1716454224949186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225004345, "dur": 4, "args": { "External id": 224196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224196, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224196, "pid": 5, "tid": 7, "ts": 1716454225004345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949199, "dur": 6, "args": { "External id": 224196, "cbid": 211, "correlation": 224196 } }, { "ph": "s", "id": 224196, "pid": 76337, "tid": -914061504, "ts": 1716454224949199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225004350, "dur": 19, "args": { "External id": 224221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224221, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 224221, "pid": 5, "tid": 7, "ts": 1716454225004350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949277, "dur": 13, "args": { "External id": 224221, "cbid": 211, "correlation": 224221 } }, { "ph": "s", "id": 224221, "pid": 76337, "tid": -914061504, "ts": 1716454224949277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949377, "dur": 2, "args": { "External id": 224239, "cbid": 251, "correlation": 224239 } }, { "ph": "f", "id": 224239, "pid": 76337, "tid": -914061504, "ts": 1716454224949377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225004372, "dur": 1, "args": { "External id": 224241, "device": 5, "context": 1, "stream": 7, "correlation": 224241, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 224241, "pid": 5, "tid": 7, "ts": 1716454225004372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224949384, "dur": 10, "args": { "External id": 224241, "cbid": 51, "correlation": 224241 } }, { "ph": "s", "id": 224241, "pid": 76337, "tid": -914061504, "ts": 1716454224949384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225004375, "dur": 36, "args": { "External id": 224242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224242, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 224242, "pid": 5, "tid": 7, "ts": 1716454225004375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949395, "dur": 13, "args": { "External id": 224242, "cbid": 211, "correlation": 224242 } }, { "ph": "s", "id": 224242, "pid": 76337, "tid": -914061504, "ts": 1716454224949395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004413, "dur": 4, "args": { "External id": 224250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224250, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224250, "pid": 5, "tid": 7, "ts": 1716454225004413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949466, "dur": 12, "args": { "External id": 224250, "cbid": 211, "correlation": 224250 } }, { "ph": "s", "id": 224250, "pid": 76337, "tid": -914061504, "ts": 1716454224949466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004418, "dur": 8, "args": { "External id": 224258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224258, "pid": 5, "tid": 7, "ts": 1716454225004418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949508, "dur": 10, "args": { "External id": 224258, "cbid": 211, "correlation": 224258 } }, { "ph": "s", "id": 224258, "pid": 76337, "tid": -914061504, "ts": 1716454224949508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225004428, "dur": 8, "args": { "External id": 224280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224280, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 224280, "pid": 5, "tid": 7, "ts": 1716454225004428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949561, "dur": 10, "args": { "External id": 224280, "cbid": 211, "correlation": 224280 } }, { "ph": "s", "id": 224280, "pid": 76337, "tid": -914061504, "ts": 1716454224949561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949654, "dur": 1, "args": { "External id": 224296, "cbid": 251, "correlation": 224296 } }, { "ph": "f", "id": 224296, "pid": 76337, "tid": -914061504, "ts": 1716454224949654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949659, "dur": 0, "args": { "External id": 224298, "cbid": 251, "correlation": 224298 } }, { "ph": "f", "id": 224298, "pid": 76337, "tid": -914061504, "ts": 1716454224949659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225004437, "dur": 188, "args": { "External id": 224299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224299, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224299, "pid": 5, "tid": 7, "ts": 1716454225004437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949661, "dur": 13, "args": { "External id": 224299, "cbid": 211, "correlation": 224299 } }, { "ph": "s", "id": 224299, "pid": 76337, "tid": -914061504, "ts": 1716454224949661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004627, "dur": 21, "args": { "External id": 224307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224307, "pid": 5, "tid": 7, "ts": 1716454225004627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949728, "dur": 14, "args": { "External id": 224307, "cbid": 211, "correlation": 224307 } }, { "ph": "s", "id": 224307, "pid": 76337, "tid": -914061504, "ts": 1716454224949728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004649, "dur": 21, "args": { "External id": 224315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224315, "pid": 5, "tid": 7, "ts": 1716454225004649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949760, "dur": 8, "args": { "External id": 224315, "cbid": 211, "correlation": 224315 } }, { "ph": "s", "id": 224315, "pid": 76337, "tid": -914061504, "ts": 1716454224949760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224949842, "dur": 1, "args": { "External id": 224331, "cbid": 251, "correlation": 224331 } }, { "ph": "f", "id": 224331, "pid": 76337, "tid": -914061504, "ts": 1716454224949842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225004673, "dur": 1, "args": { "External id": 224333, "device": 5, "context": 1, "stream": 7, "correlation": 224333, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 224333, "pid": 5, "tid": 7, "ts": 1716454225004673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224949847, "dur": 9, "args": { "External id": 224333, "cbid": 51, "correlation": 224333 } }, { "ph": "s", "id": 224333, "pid": 76337, "tid": -914061504, "ts": 1716454224949847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225004676, "dur": 109, "args": { "External id": 224334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224334, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 224334, "pid": 5, "tid": 7, "ts": 1716454225004676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949857, "dur": 12, "args": { "External id": 224334, "cbid": 211, "correlation": 224334 } }, { "ph": "s", "id": 224334, "pid": 76337, "tid": -914061504, "ts": 1716454224949857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004787, "dur": 5, "args": { "External id": 224342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224342, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224342, "pid": 5, "tid": 7, "ts": 1716454225004787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949901, "dur": 9, "args": { "External id": 224342, "cbid": 211, "correlation": 224342 } }, { "ph": "s", "id": 224342, "pid": 76337, "tid": -914061504, "ts": 1716454224949901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004793, "dur": 9, "args": { "External id": 224353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224353, "pid": 5, "tid": 7, "ts": 1716454225004793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224949969, "dur": 22, "args": { "External id": 224353, "cbid": 211, "correlation": 224353 } }, { "ph": "s", "id": 224353, "pid": 76337, "tid": -914061504, "ts": 1716454224949969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224950046, "dur": 0, "args": { "External id": 224365, "cbid": 317, "correlation": 224365 } }, { "ph": "f", "id": 224365, "pid": 76337, "tid": -914061504, "ts": 1716454224950046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224950046, "dur": 0, "args": { "External id": 224366, "cbid": 203, "correlation": 224366 } }, { "ph": "f", "id": 224366, "pid": 76337, "tid": -914061504, "ts": 1716454224950046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224950047, "dur": 0, "args": { "External id": 224367, "cbid": 205, "correlation": 224367 } }, { "ph": "f", "id": 224367, "pid": 76337, "tid": -914061504, "ts": 1716454224950047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225004804, "dur": 5, "args": { "External id": 224371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224371, "pid": 5, "tid": 7, "ts": 1716454225004804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950063, "dur": 12, "args": { "External id": 224371, "cbid": 211, "correlation": 224371 } }, { "ph": "s", "id": 224371, "pid": 76337, "tid": -914061504, "ts": 1716454224950063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225004811, "dur": 37, "args": { "External id": 224373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224373, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 224373, "pid": 5, "tid": 7, "ts": 1716454225004811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950082, "dur": 7, "args": { "External id": 224373, "cbid": 211, "correlation": 224373 } }, { "ph": "s", "id": 224373, "pid": 76337, "tid": -914061504, "ts": 1716454224950082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225004849, "dur": 6, "args": { "External id": 224375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224375, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224375, "pid": 5, "tid": 7, "ts": 1716454225004849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950093, "dur": 5, "args": { "External id": 224375, "cbid": 211, "correlation": 224375 } }, { "ph": "s", "id": 224375, "pid": 76337, "tid": -914061504, "ts": 1716454224950093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004856, "dur": 7, "args": { "External id": 224381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224381, "pid": 5, "tid": 7, "ts": 1716454225004856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950120, "dur": 9, "args": { "External id": 224381, "cbid": 211, "correlation": 224381 } }, { "ph": "s", "id": 224381, "pid": 76337, "tid": -914061504, "ts": 1716454224950120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004864, "dur": 5, "args": { "External id": 224389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224389, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224389, "pid": 5, "tid": 7, "ts": 1716454225004864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950152, "dur": 9, "args": { "External id": 224389, "cbid": 211, "correlation": 224389 } }, { "ph": "s", "id": 224389, "pid": 76337, "tid": -914061504, "ts": 1716454224950152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225004870, "dur": 11, "args": { "External id": 224409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224409, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224409, "pid": 5, "tid": 7, "ts": 1716454225004870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950225, "dur": 12, "args": { "External id": 224409, "cbid": 211, "correlation": 224409 } }, { "ph": "s", "id": 224409, "pid": 76337, "tid": -914061504, "ts": 1716454224950225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225004882, "dur": 4, "args": { "External id": 224421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224421, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 224421, "pid": 5, "tid": 7, "ts": 1716454225004882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950247, "dur": 6, "args": { "External id": 224421, "cbid": 211, "correlation": 224421 } }, { "ph": "s", "id": 224421, "pid": 76337, "tid": -914061504, "ts": 1716454224950247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225004888, "dur": 8, "args": { "External id": 224424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224424, "pid": 5, "tid": 7, "ts": 1716454225004888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950265, "dur": 7, "args": { "External id": 224424, "cbid": 211, "correlation": 224424 } }, { "ph": "s", "id": 224424, "pid": 76337, "tid": -914061504, "ts": 1716454224950265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225004897, "dur": 5, "args": { "External id": 224433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224433, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224433, "pid": 5, "tid": 7, "ts": 1716454225004897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950305, "dur": 10, "args": { "External id": 224433, "cbid": 211, "correlation": 224433 } }, { "ph": "s", "id": 224433, "pid": 76337, "tid": -914061504, "ts": 1716454224950305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224950355, "dur": 0, "args": { "External id": 224443, "cbid": 317, "correlation": 224443 } }, { "ph": "f", "id": 224443, "pid": 76337, "tid": -914061504, "ts": 1716454224950355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224950356, "dur": 0, "args": { "External id": 224444, "cbid": 203, "correlation": 224444 } }, { "ph": "f", "id": 224444, "pid": 76337, "tid": -914061504, "ts": 1716454224950356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224950357, "dur": 0, "args": { "External id": 224445, "cbid": 205, "correlation": 224445 } }, { "ph": "f", "id": 224445, "pid": 76337, "tid": -914061504, "ts": 1716454224950357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225004904, "dur": 5, "args": { "External id": 224449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224449, "pid": 5, "tid": 7, "ts": 1716454225004904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950370, "dur": 12, "args": { "External id": 224449, "cbid": 211, "correlation": 224449 } }, { "ph": "s", "id": 224449, "pid": 76337, "tid": -914061504, "ts": 1716454224950370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225004910, "dur": 159, "args": { "External id": 224451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224451, "pid": 5, "tid": 7, "ts": 1716454225004910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950384, "dur": 6, "args": { "External id": 224451, "cbid": 211, "correlation": 224451 } }, { "ph": "s", "id": 224451, "pid": 76337, "tid": -914061504, "ts": 1716454224950384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225005072, "dur": 1, "args": { "External id": 224453, "device": 5, "context": 1, "stream": 7, "correlation": 224453, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 224453, "pid": 5, "tid": 7, "ts": 1716454225005072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224950396, "dur": 6, "args": { "External id": 224453, "cbid": 51, "correlation": 224453 } }, { "ph": "s", "id": 224453, "pid": 76337, "tid": -914061504, "ts": 1716454224950396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225005075, "dur": 265, "args": { "External id": 224454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224454, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224454, "pid": 5, "tid": 7, "ts": 1716454225005075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950403, "dur": 6, "args": { "External id": 224454, "cbid": 211, "correlation": 224454 } }, { "ph": "s", "id": 224454, "pid": 76337, "tid": -914061504, "ts": 1716454224950403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005342, "dur": 5, "args": { "External id": 224456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224456, "pid": 5, "tid": 7, "ts": 1716454225005342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950413, "dur": 5, "args": { "External id": 224456, "cbid": 211, "correlation": 224456 } }, { "ph": "s", "id": 224456, "pid": 76337, "tid": -914061504, "ts": 1716454224950413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225005349, "dur": 6, "args": { "External id": 224462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224462, "pid": 5, "tid": 7, "ts": 1716454225005349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950441, "dur": 8, "args": { "External id": 224462, "cbid": 211, "correlation": 224462 } }, { "ph": "s", "id": 224462, "pid": 76337, "tid": -914061504, "ts": 1716454224950441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225005356, "dur": 3, "args": { "External id": 224470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224470, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 224470, "pid": 5, "tid": 7, "ts": 1716454225005356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950484, "dur": 10, "args": { "External id": 224470, "cbid": 211, "correlation": 224470 } }, { "ph": "s", "id": 224470, "pid": 76337, "tid": -914061504, "ts": 1716454224950484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224950551, "dur": 1, "args": { "External id": 224486, "cbid": 251, "correlation": 224486 } }, { "ph": "f", "id": 224486, "pid": 76337, "tid": -914061504, "ts": 1716454224950551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224950556, "dur": 0, "args": { "External id": 224488, "cbid": 251, "correlation": 224488 } }, { "ph": "f", "id": 224488, "pid": 76337, "tid": -914061504, "ts": 1716454224950556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225005360, "dur": 12, "args": { "External id": 224489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224489, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224489, "pid": 5, "tid": 7, "ts": 1716454225005360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950558, "dur": 11, "args": { "External id": 224489, "cbid": 211, "correlation": 224489 } }, { "ph": "s", "id": 224489, "pid": 76337, "tid": -914061504, "ts": 1716454224950558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225005374, "dur": 5, "args": { "External id": 224491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224491, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224491, "pid": 5, "tid": 7, "ts": 1716454225005374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950571, "dur": 5, "args": { "External id": 224491, "cbid": 211, "correlation": 224491 } }, { "ph": "s", "id": 224491, "pid": 76337, "tid": -914061504, "ts": 1716454224950571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225005380, "dur": 6, "args": { "External id": 224501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224501, "pid": 5, "tid": 7, "ts": 1716454225005380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950629, "dur": 13, "args": { "External id": 224501, "cbid": 211, "correlation": 224501 } }, { "ph": "s", "id": 224501, "pid": 76337, "tid": -914061504, "ts": 1716454224950629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225005387, "dur": 10, "args": { "External id": 224521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224521, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224521, "pid": 5, "tid": 7, "ts": 1716454225005387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950696, "dur": 11, "args": { "External id": 224521, "cbid": 211, "correlation": 224521 } }, { "ph": "s", "id": 224521, "pid": 76337, "tid": -914061504, "ts": 1716454224950696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225005399, "dur": 4, "args": { "External id": 224533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224533, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 224533, "pid": 5, "tid": 7, "ts": 1716454225005399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950717, "dur": 6, "args": { "External id": 224533, "cbid": 211, "correlation": 224533 } }, { "ph": "s", "id": 224533, "pid": 76337, "tid": -914061504, "ts": 1716454224950717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225005404, "dur": 7, "args": { "External id": 224536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224536, "pid": 5, "tid": 7, "ts": 1716454225005404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950736, "dur": 7, "args": { "External id": 224536, "cbid": 211, "correlation": 224536 } }, { "ph": "s", "id": 224536, "pid": 76337, "tid": -914061504, "ts": 1716454224950736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225005412, "dur": 4, "args": { "External id": 224545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224545, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224545, "pid": 5, "tid": 7, "ts": 1716454225005412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950775, "dur": 10, "args": { "External id": 224545, "cbid": 211, "correlation": 224545 } }, { "ph": "s", "id": 224545, "pid": 76337, "tid": -914061504, "ts": 1716454224950775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224950838, "dur": 0, "args": { "External id": 224555, "cbid": 317, "correlation": 224555 } }, { "ph": "f", "id": 224555, "pid": 76337, "tid": -914061504, "ts": 1716454224950838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224950839, "dur": 0, "args": { "External id": 224556, "cbid": 203, "correlation": 224556 } }, { "ph": "f", "id": 224556, "pid": 76337, "tid": -914061504, "ts": 1716454224950839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224950840, "dur": 0, "args": { "External id": 224557, "cbid": 205, "correlation": 224557 } }, { "ph": "f", "id": 224557, "pid": 76337, "tid": -914061504, "ts": 1716454224950840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005418, "dur": 5, "args": { "External id": 224561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224561, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224561, "pid": 5, "tid": 7, "ts": 1716454225005418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950853, "dur": 12, "args": { "External id": 224561, "cbid": 211, "correlation": 224561 } }, { "ph": "s", "id": 224561, "pid": 76337, "tid": -914061504, "ts": 1716454224950853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005424, "dur": 159, "args": { "External id": 224563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224563, "pid": 5, "tid": 7, "ts": 1716454225005424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950867, "dur": 5, "args": { "External id": 224563, "cbid": 211, "correlation": 224563 } }, { "ph": "s", "id": 224563, "pid": 76337, "tid": -914061504, "ts": 1716454224950867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225005585, "dur": 1, "args": { "External id": 224565, "device": 5, "context": 1, "stream": 7, "correlation": 224565, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 224565, "pid": 5, "tid": 7, "ts": 1716454225005585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224950878, "dur": 6, "args": { "External id": 224565, "cbid": 51, "correlation": 224565 } }, { "ph": "s", "id": 224565, "pid": 76337, "tid": -914061504, "ts": 1716454224950878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225005589, "dur": 254, "args": { "External id": 224566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224566, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224566, "pid": 5, "tid": 7, "ts": 1716454225005589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950885, "dur": 7, "args": { "External id": 224566, "cbid": 211, "correlation": 224566 } }, { "ph": "s", "id": 224566, "pid": 76337, "tid": -914061504, "ts": 1716454224950885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005844, "dur": 6, "args": { "External id": 224568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224568, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224568, "pid": 5, "tid": 7, "ts": 1716454225005844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950896, "dur": 5, "args": { "External id": 224568, "cbid": 211, "correlation": 224568 } }, { "ph": "s", "id": 224568, "pid": 76337, "tid": -914061504, "ts": 1716454224950896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225005852, "dur": 6, "args": { "External id": 224574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224574, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224574, "pid": 5, "tid": 7, "ts": 1716454225005852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950924, "dur": 8, "args": { "External id": 224574, "cbid": 211, "correlation": 224574 } }, { "ph": "s", "id": 224574, "pid": 76337, "tid": -914061504, "ts": 1716454224950924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225005859, "dur": 5, "args": { "External id": 224582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224582, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224582, "pid": 5, "tid": 7, "ts": 1716454225005859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950957, "dur": 8, "args": { "External id": 224582, "cbid": 211, "correlation": 224582 } }, { "ph": "s", "id": 224582, "pid": 76337, "tid": -914061504, "ts": 1716454224950957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225005865, "dur": 5, "args": { "External id": 224590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224590, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224590, "pid": 5, "tid": 7, "ts": 1716454225005865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224950994, "dur": 9, "args": { "External id": 224590, "cbid": 211, "correlation": 224590 } }, { "ph": "s", "id": 224590, "pid": 76337, "tid": -914061504, "ts": 1716454224950994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225005871, "dur": 11, "args": { "External id": 224599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224599, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224599, "pid": 5, "tid": 7, "ts": 1716454225005871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951082, "dur": 14, "args": { "External id": 224599, "cbid": 211, "correlation": 224599 } }, { "ph": "s", "id": 224599, "pid": 76337, "tid": -914061504, "ts": 1716454224951082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225005884, "dur": 12, "args": { "External id": 224619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224619, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224619, "pid": 5, "tid": 7, "ts": 1716454225005884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951150, "dur": 10, "args": { "External id": 224619, "cbid": 211, "correlation": 224619 } }, { "ph": "s", "id": 224619, "pid": 76337, "tid": -914061504, "ts": 1716454224951150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225005897, "dur": 4, "args": { "External id": 224631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224631, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224631, "pid": 5, "tid": 7, "ts": 1716454225005897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951171, "dur": 6, "args": { "External id": 224631, "cbid": 211, "correlation": 224631 } }, { "ph": "s", "id": 224631, "pid": 76337, "tid": -914061504, "ts": 1716454224951171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225005902, "dur": 10, "args": { "External id": 224634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224634, "pid": 5, "tid": 7, "ts": 1716454225005902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951190, "dur": 6, "args": { "External id": 224634, "cbid": 211, "correlation": 224634 } }, { "ph": "s", "id": 224634, "pid": 76337, "tid": -914061504, "ts": 1716454224951190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225005913, "dur": 6, "args": { "External id": 224643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224643, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224643, "pid": 5, "tid": 7, "ts": 1716454225005913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951227, "dur": 10, "args": { "External id": 224643, "cbid": 211, "correlation": 224643 } }, { "ph": "s", "id": 224643, "pid": 76337, "tid": -914061504, "ts": 1716454224951227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224951280, "dur": 0, "args": { "External id": 224653, "cbid": 317, "correlation": 224653 } }, { "ph": "f", "id": 224653, "pid": 76337, "tid": -914061504, "ts": 1716454224951280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224951280, "dur": 0, "args": { "External id": 224654, "cbid": 203, "correlation": 224654 } }, { "ph": "f", "id": 224654, "pid": 76337, "tid": -914061504, "ts": 1716454224951280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224951281, "dur": 0, "args": { "External id": 224655, "cbid": 205, "correlation": 224655 } }, { "ph": "f", "id": 224655, "pid": 76337, "tid": -914061504, "ts": 1716454224951281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005921, "dur": 6, "args": { "External id": 224659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224659, "pid": 5, "tid": 7, "ts": 1716454225005921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951297, "dur": 11, "args": { "External id": 224659, "cbid": 211, "correlation": 224659 } }, { "ph": "s", "id": 224659, "pid": 76337, "tid": -914061504, "ts": 1716454224951297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225005928, "dur": 313, "args": { "External id": 224661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224661, "pid": 5, "tid": 7, "ts": 1716454225005928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951311, "dur": 5, "args": { "External id": 224661, "cbid": 211, "correlation": 224661 } }, { "ph": "s", "id": 224661, "pid": 76337, "tid": -914061504, "ts": 1716454224951311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225006243, "dur": 1, "args": { "External id": 224663, "device": 5, "context": 1, "stream": 7, "correlation": 224663, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 224663, "pid": 5, "tid": 7, "ts": 1716454225006243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224951322, "dur": 7, "args": { "External id": 224663, "cbid": 51, "correlation": 224663 } }, { "ph": "s", "id": 224663, "pid": 76337, "tid": -914061504, "ts": 1716454224951322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225006247, "dur": 486, "args": { "External id": 224664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224664, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224664, "pid": 5, "tid": 7, "ts": 1716454225006247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951330, "dur": 7, "args": { "External id": 224664, "cbid": 211, "correlation": 224664 } }, { "ph": "s", "id": 224664, "pid": 76337, "tid": -914061504, "ts": 1716454224951330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225006734, "dur": 6, "args": { "External id": 224666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224666, "pid": 5, "tid": 7, "ts": 1716454225006734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951341, "dur": 5, "args": { "External id": 224666, "cbid": 211, "correlation": 224666 } }, { "ph": "s", "id": 224666, "pid": 76337, "tid": -914061504, "ts": 1716454224951341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225006741, "dur": 6, "args": { "External id": 224672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224672, "pid": 5, "tid": 7, "ts": 1716454225006741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951369, "dur": 9, "args": { "External id": 224672, "cbid": 211, "correlation": 224672 } }, { "ph": "s", "id": 224672, "pid": 76337, "tid": -914061504, "ts": 1716454224951369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225006749, "dur": 3, "args": { "External id": 224680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224680, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 224680, "pid": 5, "tid": 7, "ts": 1716454225006749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951413, "dur": 9, "args": { "External id": 224680, "cbid": 211, "correlation": 224680 } }, { "ph": "s", "id": 224680, "pid": 76337, "tid": -914061504, "ts": 1716454224951413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224951475, "dur": 1, "args": { "External id": 224696, "cbid": 251, "correlation": 224696 } }, { "ph": "f", "id": 224696, "pid": 76337, "tid": -914061504, "ts": 1716454224951475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224951481, "dur": 0, "args": { "External id": 224698, "cbid": 251, "correlation": 224698 } }, { "ph": "f", "id": 224698, "pid": 76337, "tid": -914061504, "ts": 1716454224951481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225006753, "dur": 11, "args": { "External id": 224699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224699, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224699, "pid": 5, "tid": 7, "ts": 1716454225006753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951483, "dur": 11, "args": { "External id": 224699, "cbid": 211, "correlation": 224699 } }, { "ph": "s", "id": 224699, "pid": 76337, "tid": -914061504, "ts": 1716454224951483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225006765, "dur": 4, "args": { "External id": 224701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224701, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224701, "pid": 5, "tid": 7, "ts": 1716454225006765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951496, "dur": 5, "args": { "External id": 224701, "cbid": 211, "correlation": 224701 } }, { "ph": "s", "id": 224701, "pid": 76337, "tid": -914061504, "ts": 1716454224951496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225006771, "dur": 6, "args": { "External id": 224711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224711, "pid": 5, "tid": 7, "ts": 1716454225006771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951552, "dur": 12, "args": { "External id": 224711, "cbid": 211, "correlation": 224711 } }, { "ph": "s", "id": 224711, "pid": 76337, "tid": -914061504, "ts": 1716454224951552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225006778, "dur": 9, "args": { "External id": 224731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224731, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224731, "pid": 5, "tid": 7, "ts": 1716454225006778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951618, "dur": 11, "args": { "External id": 224731, "cbid": 211, "correlation": 224731 } }, { "ph": "s", "id": 224731, "pid": 76337, "tid": -914061504, "ts": 1716454224951618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225006788, "dur": 4, "args": { "External id": 224743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224743, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 224743, "pid": 5, "tid": 7, "ts": 1716454225006788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951640, "dur": 6, "args": { "External id": 224743, "cbid": 211, "correlation": 224743 } }, { "ph": "s", "id": 224743, "pid": 76337, "tid": -914061504, "ts": 1716454224951640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225006793, "dur": 7, "args": { "External id": 224746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224746, "pid": 5, "tid": 7, "ts": 1716454225006793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951658, "dur": 6, "args": { "External id": 224746, "cbid": 211, "correlation": 224746 } }, { "ph": "s", "id": 224746, "pid": 76337, "tid": -914061504, "ts": 1716454224951658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225006802, "dur": 4, "args": { "External id": 224755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224755, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224755, "pid": 5, "tid": 7, "ts": 1716454225006802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951697, "dur": 9, "args": { "External id": 224755, "cbid": 211, "correlation": 224755 } }, { "ph": "s", "id": 224755, "pid": 76337, "tid": -914061504, "ts": 1716454224951697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224951758, "dur": 0, "args": { "External id": 224765, "cbid": 317, "correlation": 224765 } }, { "ph": "f", "id": 224765, "pid": 76337, "tid": -914061504, "ts": 1716454224951758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224951759, "dur": 0, "args": { "External id": 224766, "cbid": 203, "correlation": 224766 } }, { "ph": "f", "id": 224766, "pid": 76337, "tid": -914061504, "ts": 1716454224951759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224951760, "dur": 0, "args": { "External id": 224767, "cbid": 205, "correlation": 224767 } }, { "ph": "f", "id": 224767, "pid": 76337, "tid": -914061504, "ts": 1716454224951760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225006807, "dur": 5, "args": { "External id": 224771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224771, "pid": 5, "tid": 7, "ts": 1716454225006807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951774, "dur": 12, "args": { "External id": 224771, "cbid": 211, "correlation": 224771 } }, { "ph": "s", "id": 224771, "pid": 76337, "tid": -914061504, "ts": 1716454224951774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225006814, "dur": 159, "args": { "External id": 224773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224773, "pid": 5, "tid": 7, "ts": 1716454225006814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951788, "dur": 6, "args": { "External id": 224773, "cbid": 211, "correlation": 224773 } }, { "ph": "s", "id": 224773, "pid": 76337, "tid": -914061504, "ts": 1716454224951788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225006975, "dur": 1, "args": { "External id": 224775, "device": 5, "context": 1, "stream": 7, "correlation": 224775, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 224775, "pid": 5, "tid": 7, "ts": 1716454225006975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224951799, "dur": 6, "args": { "External id": 224775, "cbid": 51, "correlation": 224775 } }, { "ph": "s", "id": 224775, "pid": 76337, "tid": -914061504, "ts": 1716454224951799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225006979, "dur": 254, "args": { "External id": 224776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224776, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224776, "pid": 5, "tid": 7, "ts": 1716454225006979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951807, "dur": 6, "args": { "External id": 224776, "cbid": 211, "correlation": 224776 } }, { "ph": "s", "id": 224776, "pid": 76337, "tid": -914061504, "ts": 1716454224951807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225007234, "dur": 6, "args": { "External id": 224778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224778, "pid": 5, "tid": 7, "ts": 1716454225007234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951816, "dur": 5, "args": { "External id": 224778, "cbid": 211, "correlation": 224778 } }, { "ph": "s", "id": 224778, "pid": 76337, "tid": -914061504, "ts": 1716454224951816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225007241, "dur": 6, "args": { "External id": 224784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224784, "pid": 5, "tid": 7, "ts": 1716454225007241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951845, "dur": 8, "args": { "External id": 224784, "cbid": 211, "correlation": 224784 } }, { "ph": "s", "id": 224784, "pid": 76337, "tid": -914061504, "ts": 1716454224951845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224951904, "dur": 0, "args": { "External id": 224794, "cbid": 317, "correlation": 224794 } }, { "ph": "f", "id": 224794, "pid": 76337, "tid": -914061504, "ts": 1716454224951904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224951904, "dur": 0, "args": { "External id": 224795, "cbid": 203, "correlation": 224795 } }, { "ph": "f", "id": 224795, "pid": 76337, "tid": -914061504, "ts": 1716454224951904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224951905, "dur": 0, "args": { "External id": 224796, "cbid": 205, "correlation": 224796 } }, { "ph": "f", "id": 224796, "pid": 76337, "tid": -914061504, "ts": 1716454224951905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225007248, "dur": 8, "args": { "External id": 224800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224800, "pid": 5, "tid": 7, "ts": 1716454225007248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951919, "dur": 11, "args": { "External id": 224800, "cbid": 211, "correlation": 224800 } }, { "ph": "s", "id": 224800, "pid": 76337, "tid": -914061504, "ts": 1716454224951919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225007258, "dur": 3, "args": { "External id": 224802, "device": 5, "context": 1, "stream": 7, "correlation": 224802, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 224802, "pid": 5, "tid": 7, "ts": 1716454225007258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224951937, "dur": 14, "args": { "External id": 224802, "cbid": 51, "correlation": 224802 } }, { "ph": "s", "id": 224802, "pid": 76337, "tid": -914061504, "ts": 1716454224951937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225007262, "dur": 94, "args": { "External id": 224803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224803, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 224803, "pid": 5, "tid": 7, "ts": 1716454225007262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951952, "dur": 7, "args": { "External id": 224803, "cbid": 211, "correlation": 224803 } }, { "ph": "s", "id": 224803, "pid": 76337, "tid": -914061504, "ts": 1716454224951952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225007357, "dur": 6, "args": { "External id": 224805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224805, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224805, "pid": 5, "tid": 7, "ts": 1716454225007357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951962, "dur": 5, "args": { "External id": 224805, "cbid": 211, "correlation": 224805 } }, { "ph": "s", "id": 224805, "pid": 76337, "tid": -914061504, "ts": 1716454224951962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225007364, "dur": 6, "args": { "External id": 224811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224811, "pid": 5, "tid": 7, "ts": 1716454225007364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224951997, "dur": 9, "args": { "External id": 224811, "cbid": 211, "correlation": 224811 } }, { "ph": "s", "id": 224811, "pid": 76337, "tid": -914061504, "ts": 1716454224951997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225007371, "dur": 5, "args": { "External id": 224819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224819, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224819, "pid": 5, "tid": 7, "ts": 1716454225007371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952028, "dur": 8, "args": { "External id": 224819, "cbid": 211, "correlation": 224819 } }, { "ph": "s", "id": 224819, "pid": 76337, "tid": -914061504, "ts": 1716454224952028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225007377, "dur": 4, "args": { "External id": 224827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224827, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224827, "pid": 5, "tid": 7, "ts": 1716454225007377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952058, "dur": 8, "args": { "External id": 224827, "cbid": 211, "correlation": 224827 } }, { "ph": "s", "id": 224827, "pid": 76337, "tid": -914061504, "ts": 1716454224952058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225007383, "dur": 11, "args": { "External id": 224836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224836, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224836, "pid": 5, "tid": 7, "ts": 1716454225007383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952101, "dur": 11, "args": { "External id": 224836, "cbid": 211, "correlation": 224836 } }, { "ph": "s", "id": 224836, "pid": 76337, "tid": -914061504, "ts": 1716454224952101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225007395, "dur": 12, "args": { "External id": 224856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224856, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224856, "pid": 5, "tid": 7, "ts": 1716454225007395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952171, "dur": 11, "args": { "External id": 224856, "cbid": 211, "correlation": 224856 } }, { "ph": "s", "id": 224856, "pid": 76337, "tid": -914061504, "ts": 1716454224952171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225007409, "dur": 4, "args": { "External id": 224868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224868, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224868, "pid": 5, "tid": 7, "ts": 1716454225007409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952193, "dur": 6, "args": { "External id": 224868, "cbid": 211, "correlation": 224868 } }, { "ph": "s", "id": 224868, "pid": 76337, "tid": -914061504, "ts": 1716454224952193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225007414, "dur": 11, "args": { "External id": 224871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224871, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224871, "pid": 5, "tid": 7, "ts": 1716454225007414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952211, "dur": 7, "args": { "External id": 224871, "cbid": 211, "correlation": 224871 } }, { "ph": "s", "id": 224871, "pid": 76337, "tid": -914061504, "ts": 1716454224952211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225007426, "dur": 6, "args": { "External id": 224880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224880, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224880, "pid": 5, "tid": 7, "ts": 1716454225007426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952249, "dur": 9, "args": { "External id": 224880, "cbid": 211, "correlation": 224880 } }, { "ph": "s", "id": 224880, "pid": 76337, "tid": -914061504, "ts": 1716454224952249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224952301, "dur": 0, "args": { "External id": 224890, "cbid": 317, "correlation": 224890 } }, { "ph": "f", "id": 224890, "pid": 76337, "tid": -914061504, "ts": 1716454224952301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224952302, "dur": 0, "args": { "External id": 224891, "cbid": 203, "correlation": 224891 } }, { "ph": "f", "id": 224891, "pid": 76337, "tid": -914061504, "ts": 1716454224952302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224952303, "dur": 0, "args": { "External id": 224892, "cbid": 205, "correlation": 224892 } }, { "ph": "f", "id": 224892, "pid": 76337, "tid": -914061504, "ts": 1716454224952303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225007434, "dur": 7, "args": { "External id": 224896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224896, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224896, "pid": 5, "tid": 7, "ts": 1716454225007434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952317, "dur": 11, "args": { "External id": 224896, "cbid": 211, "correlation": 224896 } }, { "ph": "s", "id": 224896, "pid": 76337, "tid": -914061504, "ts": 1716454224952317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225007442, "dur": 315, "args": { "External id": 224898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224898, "pid": 5, "tid": 7, "ts": 1716454225007442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952331, "dur": 5, "args": { "External id": 224898, "cbid": 211, "correlation": 224898 } }, { "ph": "s", "id": 224898, "pid": 76337, "tid": -914061504, "ts": 1716454224952331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225007759, "dur": 1, "args": { "External id": 224900, "device": 5, "context": 1, "stream": 7, "correlation": 224900, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 224900, "pid": 5, "tid": 7, "ts": 1716454225007759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224952341, "dur": 7, "args": { "External id": 224900, "cbid": 51, "correlation": 224900 } }, { "ph": "s", "id": 224900, "pid": 76337, "tid": -914061504, "ts": 1716454224952341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225007763, "dur": 490, "args": { "External id": 224901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224901, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224901, "pid": 5, "tid": 7, "ts": 1716454225007763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952350, "dur": 6, "args": { "External id": 224901, "cbid": 211, "correlation": 224901 } }, { "ph": "s", "id": 224901, "pid": 76337, "tid": -914061504, "ts": 1716454224952350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008254, "dur": 5, "args": { "External id": 224903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224903, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 224903, "pid": 5, "tid": 7, "ts": 1716454225008254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952359, "dur": 5, "args": { "External id": 224903, "cbid": 211, "correlation": 224903 } }, { "ph": "s", "id": 224903, "pid": 76337, "tid": -914061504, "ts": 1716454224952359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008260, "dur": 6, "args": { "External id": 224909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224909, "pid": 5, "tid": 7, "ts": 1716454225008260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952387, "dur": 9, "args": { "External id": 224909, "cbid": 211, "correlation": 224909 } }, { "ph": "s", "id": 224909, "pid": 76337, "tid": -914061504, "ts": 1716454224952387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225008268, "dur": 3, "args": { "External id": 224917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224917, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 224917, "pid": 5, "tid": 7, "ts": 1716454225008268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952430, "dur": 9, "args": { "External id": 224917, "cbid": 211, "correlation": 224917 } }, { "ph": "s", "id": 224917, "pid": 76337, "tid": -914061504, "ts": 1716454224952430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224952493, "dur": 1, "args": { "External id": 224933, "cbid": 251, "correlation": 224933 } }, { "ph": "f", "id": 224933, "pid": 76337, "tid": -914061504, "ts": 1716454224952493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224952499, "dur": 0, "args": { "External id": 224935, "cbid": 251, "correlation": 224935 } }, { "ph": "f", "id": 224935, "pid": 76337, "tid": -914061504, "ts": 1716454224952499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225008272, "dur": 13, "args": { "External id": 224936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224936, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224936, "pid": 5, "tid": 7, "ts": 1716454225008272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952501, "dur": 11, "args": { "External id": 224936, "cbid": 211, "correlation": 224936 } }, { "ph": "s", "id": 224936, "pid": 76337, "tid": -914061504, "ts": 1716454224952501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225008286, "dur": 5, "args": { "External id": 224938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224938, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 224938, "pid": 5, "tid": 7, "ts": 1716454225008286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952513, "dur": 5, "args": { "External id": 224938, "cbid": 211, "correlation": 224938 } }, { "ph": "s", "id": 224938, "pid": 76337, "tid": -914061504, "ts": 1716454224952513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008292, "dur": 6, "args": { "External id": 224948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224948, "pid": 5, "tid": 7, "ts": 1716454225008292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952571, "dur": 12, "args": { "External id": 224948, "cbid": 211, "correlation": 224948 } }, { "ph": "s", "id": 224948, "pid": 76337, "tid": -914061504, "ts": 1716454224952571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225008299, "dur": 9, "args": { "External id": 224968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224968, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 224968, "pid": 5, "tid": 7, "ts": 1716454225008299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952635, "dur": 11, "args": { "External id": 224968, "cbid": 211, "correlation": 224968 } }, { "ph": "s", "id": 224968, "pid": 76337, "tid": -914061504, "ts": 1716454224952635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225008310, "dur": 4, "args": { "External id": 224980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224980, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 224980, "pid": 5, "tid": 7, "ts": 1716454225008310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952657, "dur": 6, "args": { "External id": 224980, "cbid": 211, "correlation": 224980 } }, { "ph": "s", "id": 224980, "pid": 76337, "tid": -914061504, "ts": 1716454224952657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008315, "dur": 7, "args": { "External id": 224983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224983, "pid": 5, "tid": 7, "ts": 1716454225008315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952674, "dur": 6, "args": { "External id": 224983, "cbid": 211, "correlation": 224983 } }, { "ph": "s", "id": 224983, "pid": 76337, "tid": -914061504, "ts": 1716454224952674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225008322, "dur": 5, "args": { "External id": 224992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 224992, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 224992, "pid": 5, "tid": 7, "ts": 1716454225008322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952715, "dur": 10, "args": { "External id": 224992, "cbid": 211, "correlation": 224992 } }, { "ph": "s", "id": 224992, "pid": 76337, "tid": -914061504, "ts": 1716454224952715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224952777, "dur": 0, "args": { "External id": 225002, "cbid": 317, "correlation": 225002 } }, { "ph": "f", "id": 225002, "pid": 76337, "tid": -914061504, "ts": 1716454224952777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224952778, "dur": 0, "args": { "External id": 225003, "cbid": 203, "correlation": 225003 } }, { "ph": "f", "id": 225003, "pid": 76337, "tid": -914061504, "ts": 1716454224952778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224952778, "dur": 0, "args": { "External id": 225004, "cbid": 205, "correlation": 225004 } }, { "ph": "f", "id": 225004, "pid": 76337, "tid": -914061504, "ts": 1716454224952778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008328, "dur": 5, "args": { "External id": 225008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225008, "pid": 5, "tid": 7, "ts": 1716454225008328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952792, "dur": 13, "args": { "External id": 225008, "cbid": 211, "correlation": 225008 } }, { "ph": "s", "id": 225008, "pid": 76337, "tid": -914061504, "ts": 1716454224952792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008334, "dur": 159, "args": { "External id": 225010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225010, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225010, "pid": 5, "tid": 7, "ts": 1716454225008334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952807, "dur": 5, "args": { "External id": 225010, "cbid": 211, "correlation": 225010 } }, { "ph": "s", "id": 225010, "pid": 76337, "tid": -914061504, "ts": 1716454224952807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225008495, "dur": 1, "args": { "External id": 225012, "device": 5, "context": 1, "stream": 7, "correlation": 225012, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 225012, "pid": 5, "tid": 7, "ts": 1716454225008495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224952818, "dur": 6, "args": { "External id": 225012, "cbid": 51, "correlation": 225012 } }, { "ph": "s", "id": 225012, "pid": 76337, "tid": -914061504, "ts": 1716454224952818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225008499, "dur": 254, "args": { "External id": 225013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225013, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225013, "pid": 5, "tid": 7, "ts": 1716454225008499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952825, "dur": 6, "args": { "External id": 225013, "cbid": 211, "correlation": 225013 } }, { "ph": "s", "id": 225013, "pid": 76337, "tid": -914061504, "ts": 1716454224952825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008755, "dur": 6, "args": { "External id": 225015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225015, "pid": 5, "tid": 7, "ts": 1716454225008755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952835, "dur": 5, "args": { "External id": 225015, "cbid": 211, "correlation": 225015 } }, { "ph": "s", "id": 225015, "pid": 76337, "tid": -914061504, "ts": 1716454224952835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008762, "dur": 6, "args": { "External id": 225021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225021, "pid": 5, "tid": 7, "ts": 1716454225008762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952864, "dur": 8, "args": { "External id": 225021, "cbid": 211, "correlation": 225021 } }, { "ph": "s", "id": 225021, "pid": 76337, "tid": -914061504, "ts": 1716454224952864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224952922, "dur": 0, "args": { "External id": 225031, "cbid": 317, "correlation": 225031 } }, { "ph": "f", "id": 225031, "pid": 76337, "tid": -914061504, "ts": 1716454224952922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224952922, "dur": 0, "args": { "External id": 225032, "cbid": 203, "correlation": 225032 } }, { "ph": "f", "id": 225032, "pid": 76337, "tid": -914061504, "ts": 1716454224952922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224952923, "dur": 0, "args": { "External id": 225033, "cbid": 205, "correlation": 225033 } }, { "ph": "f", "id": 225033, "pid": 76337, "tid": -914061504, "ts": 1716454224952923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008769, "dur": 8, "args": { "External id": 225037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225037, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225037, "pid": 5, "tid": 7, "ts": 1716454225008769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952934, "dur": 11, "args": { "External id": 225037, "cbid": 211, "correlation": 225037 } }, { "ph": "s", "id": 225037, "pid": 76337, "tid": -914061504, "ts": 1716454224952934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225008778, "dur": 3, "args": { "External id": 225039, "device": 5, "context": 1, "stream": 7, "correlation": 225039, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 225039, "pid": 5, "tid": 7, "ts": 1716454225008778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224952951, "dur": 10, "args": { "External id": 225039, "cbid": 51, "correlation": 225039 } }, { "ph": "s", "id": 225039, "pid": 76337, "tid": -914061504, "ts": 1716454224952951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225008782, "dur": 94, "args": { "External id": 225040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225040, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 225040, "pid": 5, "tid": 7, "ts": 1716454225008782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952962, "dur": 6, "args": { "External id": 225040, "cbid": 211, "correlation": 225040 } }, { "ph": "s", "id": 225040, "pid": 76337, "tid": -914061504, "ts": 1716454224952962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008877, "dur": 6, "args": { "External id": 225042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225042, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225042, "pid": 5, "tid": 7, "ts": 1716454225008877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224952971, "dur": 13, "args": { "External id": 225042, "cbid": 211, "correlation": 225042 } }, { "ph": "s", "id": 225042, "pid": 76337, "tid": -914061504, "ts": 1716454224952971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008884, "dur": 6, "args": { "External id": 225048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225048, "pid": 5, "tid": 7, "ts": 1716454225008884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953007, "dur": 9, "args": { "External id": 225048, "cbid": 211, "correlation": 225048 } }, { "ph": "s", "id": 225048, "pid": 76337, "tid": -914061504, "ts": 1716454224953007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225008892, "dur": 5, "args": { "External id": 225056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225056, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225056, "pid": 5, "tid": 7, "ts": 1716454225008892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953037, "dur": 8, "args": { "External id": 225056, "cbid": 211, "correlation": 225056 } }, { "ph": "s", "id": 225056, "pid": 76337, "tid": -914061504, "ts": 1716454224953037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225008898, "dur": 4, "args": { "External id": 225064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225064, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225064, "pid": 5, "tid": 7, "ts": 1716454225008898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953066, "dur": 8, "args": { "External id": 225064, "cbid": 211, "correlation": 225064 } }, { "ph": "s", "id": 225064, "pid": 76337, "tid": -914061504, "ts": 1716454224953066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225008904, "dur": 11, "args": { "External id": 225073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225073, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225073, "pid": 5, "tid": 7, "ts": 1716454225008904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953110, "dur": 11, "args": { "External id": 225073, "cbid": 211, "correlation": 225073 } }, { "ph": "s", "id": 225073, "pid": 76337, "tid": -914061504, "ts": 1716454224953110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225008916, "dur": 12, "args": { "External id": 225093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225093, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 225093, "pid": 5, "tid": 7, "ts": 1716454225008916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953179, "dur": 11, "args": { "External id": 225093, "cbid": 211, "correlation": 225093 } }, { "ph": "s", "id": 225093, "pid": 76337, "tid": -914061504, "ts": 1716454224953179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225008929, "dur": 4, "args": { "External id": 225105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225105, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225105, "pid": 5, "tid": 7, "ts": 1716454225008929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953199, "dur": 7, "args": { "External id": 225105, "cbid": 211, "correlation": 225105 } }, { "ph": "s", "id": 225105, "pid": 76337, "tid": -914061504, "ts": 1716454224953199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225008934, "dur": 11, "args": { "External id": 225108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225108, "pid": 5, "tid": 7, "ts": 1716454225008934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953218, "dur": 6, "args": { "External id": 225108, "cbid": 211, "correlation": 225108 } }, { "ph": "s", "id": 225108, "pid": 76337, "tid": -914061504, "ts": 1716454224953218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225008946, "dur": 6, "args": { "External id": 225117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225117, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225117, "pid": 5, "tid": 7, "ts": 1716454225008946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953256, "dur": 10, "args": { "External id": 225117, "cbid": 211, "correlation": 225117 } }, { "ph": "s", "id": 225117, "pid": 76337, "tid": -914061504, "ts": 1716454224953256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224953308, "dur": 0, "args": { "External id": 225127, "cbid": 317, "correlation": 225127 } }, { "ph": "f", "id": 225127, "pid": 76337, "tid": -914061504, "ts": 1716454224953308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224953309, "dur": 0, "args": { "External id": 225128, "cbid": 203, "correlation": 225128 } }, { "ph": "f", "id": 225128, "pid": 76337, "tid": -914061504, "ts": 1716454224953309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224953310, "dur": 0, "args": { "External id": 225129, "cbid": 205, "correlation": 225129 } }, { "ph": "f", "id": 225129, "pid": 76337, "tid": -914061504, "ts": 1716454224953310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008953, "dur": 7, "args": { "External id": 225133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225133, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225133, "pid": 5, "tid": 7, "ts": 1716454225008953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953323, "dur": 11, "args": { "External id": 225133, "cbid": 211, "correlation": 225133 } }, { "ph": "s", "id": 225133, "pid": 76337, "tid": -914061504, "ts": 1716454224953323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225008961, "dur": 314, "args": { "External id": 225135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225135, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225135, "pid": 5, "tid": 7, "ts": 1716454225008961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953336, "dur": 5, "args": { "External id": 225135, "cbid": 211, "correlation": 225135 } }, { "ph": "s", "id": 225135, "pid": 76337, "tid": -914061504, "ts": 1716454224953336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225009277, "dur": 1, "args": { "External id": 225137, "device": 5, "context": 1, "stream": 7, "correlation": 225137, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 225137, "pid": 5, "tid": 7, "ts": 1716454225009277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224953346, "dur": 7, "args": { "External id": 225137, "cbid": 51, "correlation": 225137 } }, { "ph": "s", "id": 225137, "pid": 76337, "tid": -914061504, "ts": 1716454224953346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225009281, "dur": 488, "args": { "External id": 225138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225138, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225138, "pid": 5, "tid": 7, "ts": 1716454225009281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953354, "dur": 6, "args": { "External id": 225138, "cbid": 211, "correlation": 225138 } }, { "ph": "s", "id": 225138, "pid": 76337, "tid": -914061504, "ts": 1716454224953354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225009770, "dur": 5, "args": { "External id": 225140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225140, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225140, "pid": 5, "tid": 7, "ts": 1716454225009770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953365, "dur": 5, "args": { "External id": 225140, "cbid": 211, "correlation": 225140 } }, { "ph": "s", "id": 225140, "pid": 76337, "tid": -914061504, "ts": 1716454224953365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225009776, "dur": 6, "args": { "External id": 225146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225146, "pid": 5, "tid": 7, "ts": 1716454225009776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953392, "dur": 8, "args": { "External id": 225146, "cbid": 211, "correlation": 225146 } }, { "ph": "s", "id": 225146, "pid": 76337, "tid": -914061504, "ts": 1716454224953392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225009784, "dur": 3, "args": { "External id": 225154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225154, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 225154, "pid": 5, "tid": 7, "ts": 1716454225009784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953435, "dur": 9, "args": { "External id": 225154, "cbid": 211, "correlation": 225154 } }, { "ph": "s", "id": 225154, "pid": 76337, "tid": -914061504, "ts": 1716454224953435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224953497, "dur": 1, "args": { "External id": 225170, "cbid": 251, "correlation": 225170 } }, { "ph": "f", "id": 225170, "pid": 76337, "tid": -914061504, "ts": 1716454224953497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224953503, "dur": 0, "args": { "External id": 225172, "cbid": 251, "correlation": 225172 } }, { "ph": "f", "id": 225172, "pid": 76337, "tid": -914061504, "ts": 1716454224953503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225009788, "dur": 12, "args": { "External id": 225173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225173, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225173, "pid": 5, "tid": 7, "ts": 1716454225009788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953505, "dur": 11, "args": { "External id": 225173, "cbid": 211, "correlation": 225173 } }, { "ph": "s", "id": 225173, "pid": 76337, "tid": -914061504, "ts": 1716454224953505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225009802, "dur": 5, "args": { "External id": 225175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225175, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225175, "pid": 5, "tid": 7, "ts": 1716454225009802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953518, "dur": 5, "args": { "External id": 225175, "cbid": 211, "correlation": 225175 } }, { "ph": "s", "id": 225175, "pid": 76337, "tid": -914061504, "ts": 1716454224953518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225009808, "dur": 6, "args": { "External id": 225185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225185, "pid": 5, "tid": 7, "ts": 1716454225009808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953574, "dur": 12, "args": { "External id": 225185, "cbid": 211, "correlation": 225185 } }, { "ph": "s", "id": 225185, "pid": 76337, "tid": -914061504, "ts": 1716454224953574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225009815, "dur": 9, "args": { "External id": 225205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225205, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 225205, "pid": 5, "tid": 7, "ts": 1716454225009815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953639, "dur": 11, "args": { "External id": 225205, "cbid": 211, "correlation": 225205 } }, { "ph": "s", "id": 225205, "pid": 76337, "tid": -914061504, "ts": 1716454224953639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225009825, "dur": 4, "args": { "External id": 225217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225217, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 225217, "pid": 5, "tid": 7, "ts": 1716454225009825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953661, "dur": 6, "args": { "External id": 225217, "cbid": 211, "correlation": 225217 } }, { "ph": "s", "id": 225217, "pid": 76337, "tid": -914061504, "ts": 1716454224953661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225009830, "dur": 7, "args": { "External id": 225220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225220, "pid": 5, "tid": 7, "ts": 1716454225009830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953679, "dur": 6, "args": { "External id": 225220, "cbid": 211, "correlation": 225220 } }, { "ph": "s", "id": 225220, "pid": 76337, "tid": -914061504, "ts": 1716454224953679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225009838, "dur": 4, "args": { "External id": 225229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225229, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225229, "pid": 5, "tid": 7, "ts": 1716454225009838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953719, "dur": 10, "args": { "External id": 225229, "cbid": 211, "correlation": 225229 } }, { "ph": "s", "id": 225229, "pid": 76337, "tid": -914061504, "ts": 1716454224953719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224953781, "dur": 0, "args": { "External id": 225239, "cbid": 317, "correlation": 225239 } }, { "ph": "f", "id": 225239, "pid": 76337, "tid": -914061504, "ts": 1716454224953781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224953782, "dur": 0, "args": { "External id": 225240, "cbid": 203, "correlation": 225240 } }, { "ph": "f", "id": 225240, "pid": 76337, "tid": -914061504, "ts": 1716454224953782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224953782, "dur": 0, "args": { "External id": 225241, "cbid": 205, "correlation": 225241 } }, { "ph": "f", "id": 225241, "pid": 76337, "tid": -914061504, "ts": 1716454224953782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225009844, "dur": 5, "args": { "External id": 225245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225245, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225245, "pid": 5, "tid": 7, "ts": 1716454225009844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953796, "dur": 12, "args": { "External id": 225245, "cbid": 211, "correlation": 225245 } }, { "ph": "s", "id": 225245, "pid": 76337, "tid": -914061504, "ts": 1716454224953796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225009850, "dur": 159, "args": { "External id": 225247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225247, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225247, "pid": 5, "tid": 7, "ts": 1716454225009850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953811, "dur": 5, "args": { "External id": 225247, "cbid": 211, "correlation": 225247 } }, { "ph": "s", "id": 225247, "pid": 76337, "tid": -914061504, "ts": 1716454224953811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225010012, "dur": 1, "args": { "External id": 225249, "device": 5, "context": 1, "stream": 7, "correlation": 225249, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 225249, "pid": 5, "tid": 7, "ts": 1716454225010012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224953822, "dur": 6, "args": { "External id": 225249, "cbid": 51, "correlation": 225249 } }, { "ph": "s", "id": 225249, "pid": 76337, "tid": -914061504, "ts": 1716454224953822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225010016, "dur": 255, "args": { "External id": 225250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225250, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225250, "pid": 5, "tid": 7, "ts": 1716454225010016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953829, "dur": 6, "args": { "External id": 225250, "cbid": 211, "correlation": 225250 } }, { "ph": "s", "id": 225250, "pid": 76337, "tid": -914061504, "ts": 1716454224953829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225010272, "dur": 5, "args": { "External id": 225252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225252, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225252, "pid": 5, "tid": 7, "ts": 1716454225010272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953839, "dur": 5, "args": { "External id": 225252, "cbid": 211, "correlation": 225252 } }, { "ph": "s", "id": 225252, "pid": 76337, "tid": -914061504, "ts": 1716454224953839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225010278, "dur": 6, "args": { "External id": 225258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225258, "pid": 5, "tid": 7, "ts": 1716454225010278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953867, "dur": 8, "args": { "External id": 225258, "cbid": 211, "correlation": 225258 } }, { "ph": "s", "id": 225258, "pid": 76337, "tid": -914061504, "ts": 1716454224953867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224953926, "dur": 0, "args": { "External id": 225268, "cbid": 317, "correlation": 225268 } }, { "ph": "f", "id": 225268, "pid": 76337, "tid": -914061504, "ts": 1716454224953926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224953927, "dur": 0, "args": { "External id": 225269, "cbid": 203, "correlation": 225269 } }, { "ph": "f", "id": 225269, "pid": 76337, "tid": -914061504, "ts": 1716454224953927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224953928, "dur": 0, "args": { "External id": 225270, "cbid": 205, "correlation": 225270 } }, { "ph": "f", "id": 225270, "pid": 76337, "tid": -914061504, "ts": 1716454224953928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225010286, "dur": 7, "args": { "External id": 225274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225274, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225274, "pid": 5, "tid": 7, "ts": 1716454225010286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953939, "dur": 12, "args": { "External id": 225274, "cbid": 211, "correlation": 225274 } }, { "ph": "s", "id": 225274, "pid": 76337, "tid": -914061504, "ts": 1716454224953939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225010294, "dur": 3, "args": { "External id": 225276, "device": 5, "context": 1, "stream": 7, "correlation": 225276, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 225276, "pid": 5, "tid": 7, "ts": 1716454225010294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224953956, "dur": 10, "args": { "External id": 225276, "cbid": 51, "correlation": 225276 } }, { "ph": "s", "id": 225276, "pid": 76337, "tid": -914061504, "ts": 1716454224953956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225010298, "dur": 93, "args": { "External id": 225277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225277, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 225277, "pid": 5, "tid": 7, "ts": 1716454225010298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953967, "dur": 13, "args": { "External id": 225277, "cbid": 211, "correlation": 225277 } }, { "ph": "s", "id": 225277, "pid": 76337, "tid": -914061504, "ts": 1716454224953967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225010393, "dur": 6, "args": { "External id": 225279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225279, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225279, "pid": 5, "tid": 7, "ts": 1716454225010393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224953984, "dur": 6, "args": { "External id": 225279, "cbid": 211, "correlation": 225279 } }, { "ph": "s", "id": 225279, "pid": 76337, "tid": -914061504, "ts": 1716454224953984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225010400, "dur": 6, "args": { "External id": 225285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225285, "pid": 5, "tid": 7, "ts": 1716454225010400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954013, "dur": 9, "args": { "External id": 225285, "cbid": 211, "correlation": 225285 } }, { "ph": "s", "id": 225285, "pid": 76337, "tid": -914061504, "ts": 1716454224954013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225010408, "dur": 5, "args": { "External id": 225293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225293, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225293, "pid": 5, "tid": 7, "ts": 1716454225010408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954043, "dur": 8, "args": { "External id": 225293, "cbid": 211, "correlation": 225293 } }, { "ph": "s", "id": 225293, "pid": 76337, "tid": -914061504, "ts": 1716454224954043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225010413, "dur": 4, "args": { "External id": 225301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225301, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 225301, "pid": 5, "tid": 7, "ts": 1716454225010413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954072, "dur": 8, "args": { "External id": 225301, "cbid": 211, "correlation": 225301 } }, { "ph": "s", "id": 225301, "pid": 76337, "tid": -914061504, "ts": 1716454224954072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225010419, "dur": 14, "args": { "External id": 225312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225312, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225312, "pid": 5, "tid": 7, "ts": 1716454225010419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954154, "dur": 13, "args": { "External id": 225312, "cbid": 211, "correlation": 225312 } }, { "ph": "s", "id": 225312, "pid": 76337, "tid": -914061504, "ts": 1716454224954154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224954211, "dur": 0, "args": { "External id": 225322, "cbid": 317, "correlation": 225322 } }, { "ph": "f", "id": 225322, "pid": 76337, "tid": -914061504, "ts": 1716454224954211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224954212, "dur": 0, "args": { "External id": 225323, "cbid": 203, "correlation": 225323 } }, { "ph": "f", "id": 225323, "pid": 76337, "tid": -914061504, "ts": 1716454224954212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224954212, "dur": 0, "args": { "External id": 225324, "cbid": 205, "correlation": 225324 } }, { "ph": "f", "id": 225324, "pid": 76337, "tid": -914061504, "ts": 1716454224954212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225010434, "dur": 8, "args": { "External id": 225328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225328, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225328, "pid": 5, "tid": 7, "ts": 1716454225010434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954227, "dur": 11, "args": { "External id": 225328, "cbid": 211, "correlation": 225328 } }, { "ph": "s", "id": 225328, "pid": 76337, "tid": -914061504, "ts": 1716454224954227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225010444, "dur": 161, "args": { "External id": 225330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225330, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225330, "pid": 5, "tid": 7, "ts": 1716454225010444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954240, "dur": 5, "args": { "External id": 225330, "cbid": 211, "correlation": 225330 } }, { "ph": "s", "id": 225330, "pid": 76337, "tid": -914061504, "ts": 1716454224954240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225010607, "dur": 1, "args": { "External id": 225332, "device": 5, "context": 1, "stream": 7, "correlation": 225332, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 225332, "pid": 5, "tid": 7, "ts": 1716454225010607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224954250, "dur": 7, "args": { "External id": 225332, "cbid": 51, "correlation": 225332 } }, { "ph": "s", "id": 225332, "pid": 76337, "tid": -914061504, "ts": 1716454224954250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225010610, "dur": 644, "args": { "External id": 225333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225333, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225333, "pid": 5, "tid": 7, "ts": 1716454225010610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954259, "dur": 6, "args": { "External id": 225333, "cbid": 211, "correlation": 225333 } }, { "ph": "s", "id": 225333, "pid": 76337, "tid": -914061504, "ts": 1716454224954259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225011256, "dur": 12, "args": { "External id": 225335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225335, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225335, "pid": 5, "tid": 7, "ts": 1716454225011256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954268, "dur": 5, "args": { "External id": 225335, "cbid": 211, "correlation": 225335 } }, { "ph": "s", "id": 225335, "pid": 76337, "tid": -914061504, "ts": 1716454224954268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225011269, "dur": 14, "args": { "External id": 225341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225341, "pid": 5, "tid": 7, "ts": 1716454225011269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954296, "dur": 9, "args": { "External id": 225341, "cbid": 211, "correlation": 225341 } }, { "ph": "s", "id": 225341, "pid": 76337, "tid": -914061504, "ts": 1716454224954296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225011284, "dur": 29, "args": { "External id": 225350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225350, "pid": 5, "tid": 7, "ts": 1716454225011284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954386, "dur": 12, "args": { "External id": 225350, "cbid": 211, "correlation": 225350 } }, { "ph": "s", "id": 225350, "pid": 76337, "tid": -914061504, "ts": 1716454224954386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225011314, "dur": 30, "args": { "External id": 225370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225370, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 225370, "pid": 5, "tid": 7, "ts": 1716454225011314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954455, "dur": 12, "args": { "External id": 225370, "cbid": 211, "correlation": 225370 } }, { "ph": "s", "id": 225370, "pid": 76337, "tid": -914061504, "ts": 1716454224954455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225011346, "dur": 4, "args": { "External id": 225382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225382, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225382, "pid": 5, "tid": 7, "ts": 1716454225011346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954477, "dur": 6, "args": { "External id": 225382, "cbid": 211, "correlation": 225382 } }, { "ph": "s", "id": 225382, "pid": 76337, "tid": -914061504, "ts": 1716454224954477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225011351, "dur": 30, "args": { "External id": 225385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225385, "pid": 5, "tid": 7, "ts": 1716454225011351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954495, "dur": 7, "args": { "External id": 225385, "cbid": 211, "correlation": 225385 } }, { "ph": "s", "id": 225385, "pid": 76337, "tid": -914061504, "ts": 1716454224954495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225011383, "dur": 20, "args": { "External id": 225394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225394, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225394, "pid": 5, "tid": 7, "ts": 1716454225011383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954534, "dur": 9, "args": { "External id": 225394, "cbid": 211, "correlation": 225394 } }, { "ph": "s", "id": 225394, "pid": 76337, "tid": -914061504, "ts": 1716454224954534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224954586, "dur": 0, "args": { "External id": 225404, "cbid": 317, "correlation": 225404 } }, { "ph": "f", "id": 225404, "pid": 76337, "tid": -914061504, "ts": 1716454224954586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224954587, "dur": 0, "args": { "External id": 225405, "cbid": 203, "correlation": 225405 } }, { "ph": "f", "id": 225405, "pid": 76337, "tid": -914061504, "ts": 1716454224954587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224954587, "dur": 0, "args": { "External id": 225406, "cbid": 205, "correlation": 225406 } }, { "ph": "f", "id": 225406, "pid": 76337, "tid": -914061504, "ts": 1716454224954587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225011404, "dur": 22, "args": { "External id": 225410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225410, "pid": 5, "tid": 7, "ts": 1716454225011404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954602, "dur": 12, "args": { "External id": 225410, "cbid": 211, "correlation": 225410 } }, { "ph": "s", "id": 225410, "pid": 76337, "tid": -914061504, "ts": 1716454224954602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225011428, "dur": 315, "args": { "External id": 225412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225412, "pid": 5, "tid": 7, "ts": 1716454225011428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954617, "dur": 5, "args": { "External id": 225412, "cbid": 211, "correlation": 225412 } }, { "ph": "s", "id": 225412, "pid": 76337, "tid": -914061504, "ts": 1716454224954617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225011745, "dur": 1, "args": { "External id": 225414, "device": 5, "context": 1, "stream": 7, "correlation": 225414, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 225414, "pid": 5, "tid": 7, "ts": 1716454225011745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224954628, "dur": 7, "args": { "External id": 225414, "cbid": 51, "correlation": 225414 } }, { "ph": "s", "id": 225414, "pid": 76337, "tid": -914061504, "ts": 1716454224954628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225011749, "dur": 1227, "args": { "External id": 225415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225415, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225415, "pid": 5, "tid": 7, "ts": 1716454225011749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954636, "dur": 6, "args": { "External id": 225415, "cbid": 211, "correlation": 225415 } }, { "ph": "s", "id": 225415, "pid": 76337, "tid": -914061504, "ts": 1716454224954636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225012977, "dur": 12, "args": { "External id": 225417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225417, "pid": 5, "tid": 7, "ts": 1716454225012977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954646, "dur": 5, "args": { "External id": 225417, "cbid": 211, "correlation": 225417 } }, { "ph": "s", "id": 225417, "pid": 76337, "tid": -914061504, "ts": 1716454224954646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225012990, "dur": 15, "args": { "External id": 225423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225423, "pid": 5, "tid": 7, "ts": 1716454225012990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954674, "dur": 8, "args": { "External id": 225423, "cbid": 211, "correlation": 225423 } }, { "ph": "s", "id": 225423, "pid": 76337, "tid": -914061504, "ts": 1716454224954674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225013006, "dur": 3, "args": { "External id": 225431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225431, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 225431, "pid": 5, "tid": 7, "ts": 1716454225013006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954718, "dur": 9, "args": { "External id": 225431, "cbid": 211, "correlation": 225431 } }, { "ph": "s", "id": 225431, "pid": 76337, "tid": -914061504, "ts": 1716454224954718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224954781, "dur": 1, "args": { "External id": 225447, "cbid": 251, "correlation": 225447 } }, { "ph": "f", "id": 225447, "pid": 76337, "tid": -914061504, "ts": 1716454224954781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224954786, "dur": 0, "args": { "External id": 225449, "cbid": 251, "correlation": 225449 } }, { "ph": "f", "id": 225449, "pid": 76337, "tid": -914061504, "ts": 1716454224954786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225013011, "dur": 12, "args": { "External id": 225450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225450, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225450, "pid": 5, "tid": 7, "ts": 1716454225013011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954788, "dur": 11, "args": { "External id": 225450, "cbid": 211, "correlation": 225450 } }, { "ph": "s", "id": 225450, "pid": 76337, "tid": -914061504, "ts": 1716454224954788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225013024, "dur": 5, "args": { "External id": 225452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225452, "pid": 5, "tid": 7, "ts": 1716454225013024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954801, "dur": 5, "args": { "External id": 225452, "cbid": 211, "correlation": 225452 } }, { "ph": "s", "id": 225452, "pid": 76337, "tid": -914061504, "ts": 1716454224954801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225013030, "dur": 17, "args": { "External id": 225462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225462, "pid": 5, "tid": 7, "ts": 1716454225013030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954857, "dur": 13, "args": { "External id": 225462, "cbid": 211, "correlation": 225462 } }, { "ph": "s", "id": 225462, "pid": 76337, "tid": -914061504, "ts": 1716454224954857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225013049, "dur": 18, "args": { "External id": 225482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225482, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 225482, "pid": 5, "tid": 7, "ts": 1716454225013049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954924, "dur": 11, "args": { "External id": 225482, "cbid": 211, "correlation": 225482 } }, { "ph": "s", "id": 225482, "pid": 76337, "tid": -914061504, "ts": 1716454224954924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225013068, "dur": 4, "args": { "External id": 225494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225494, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 225494, "pid": 5, "tid": 7, "ts": 1716454225013068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954944, "dur": 6, "args": { "External id": 225494, "cbid": 211, "correlation": 225494 } }, { "ph": "s", "id": 225494, "pid": 76337, "tid": -914061504, "ts": 1716454224954944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225013073, "dur": 16, "args": { "External id": 225497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225497, "pid": 5, "tid": 7, "ts": 1716454225013073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224954963, "dur": 7, "args": { "External id": 225497, "cbid": 211, "correlation": 225497 } }, { "ph": "s", "id": 225497, "pid": 76337, "tid": -914061504, "ts": 1716454224954963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225013091, "dur": 11, "args": { "External id": 225506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225506, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225506, "pid": 5, "tid": 7, "ts": 1716454225013091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955012, "dur": 11, "args": { "External id": 225506, "cbid": 211, "correlation": 225506 } }, { "ph": "s", "id": 225506, "pid": 76337, "tid": -914061504, "ts": 1716454224955012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224955076, "dur": 0, "args": { "External id": 225516, "cbid": 317, "correlation": 225516 } }, { "ph": "f", "id": 225516, "pid": 76337, "tid": -914061504, "ts": 1716454224955076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224955077, "dur": 0, "args": { "External id": 225517, "cbid": 203, "correlation": 225517 } }, { "ph": "f", "id": 225517, "pid": 76337, "tid": -914061504, "ts": 1716454224955077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224955077, "dur": 0, "args": { "External id": 225518, "cbid": 205, "correlation": 225518 } }, { "ph": "f", "id": 225518, "pid": 76337, "tid": -914061504, "ts": 1716454224955077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225013103, "dur": 11, "args": { "External id": 225522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225522, "pid": 5, "tid": 7, "ts": 1716454225013103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955090, "dur": 12, "args": { "External id": 225522, "cbid": 211, "correlation": 225522 } }, { "ph": "s", "id": 225522, "pid": 76337, "tid": -914061504, "ts": 1716454224955090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225013115, "dur": 159, "args": { "External id": 225524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225524, "pid": 5, "tid": 7, "ts": 1716454225013115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955105, "dur": 6, "args": { "External id": 225524, "cbid": 211, "correlation": 225524 } }, { "ph": "s", "id": 225524, "pid": 76337, "tid": -914061504, "ts": 1716454224955105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225013277, "dur": 1, "args": { "External id": 225526, "device": 5, "context": 1, "stream": 7, "correlation": 225526, "bytes": 960, "memory bandwidth (GB/s)": 0.5172413793103449 } }, { "ph": "f", "id": 225526, "pid": 5, "tid": 7, "ts": 1716454225013277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224955117, "dur": 6, "args": { "External id": 225526, "cbid": 51, "correlation": 225526 } }, { "ph": "s", "id": 225526, "pid": 76337, "tid": -914061504, "ts": 1716454224955117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225013281, "dur": 638, "args": { "External id": 225527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225527, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225527, "pid": 5, "tid": 7, "ts": 1716454225013281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955124, "dur": 6, "args": { "External id": 225527, "cbid": 211, "correlation": 225527 } }, { "ph": "s", "id": 225527, "pid": 76337, "tid": -914061504, "ts": 1716454224955124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225013920, "dur": 12, "args": { "External id": 225529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225529, "pid": 5, "tid": 7, "ts": 1716454225013920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955134, "dur": 5, "args": { "External id": 225529, "cbid": 211, "correlation": 225529 } }, { "ph": "s", "id": 225529, "pid": 76337, "tid": -914061504, "ts": 1716454224955134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225013933, "dur": 14, "args": { "External id": 225535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225535, "pid": 5, "tid": 7, "ts": 1716454225013933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955162, "dur": 9, "args": { "External id": 225535, "cbid": 211, "correlation": 225535 } }, { "ph": "s", "id": 225535, "pid": 76337, "tid": -914061504, "ts": 1716454224955162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224955222, "dur": 0, "args": { "External id": 225545, "cbid": 317, "correlation": 225545 } }, { "ph": "f", "id": 225545, "pid": 76337, "tid": -914061504, "ts": 1716454224955222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224955222, "dur": 0, "args": { "External id": 225546, "cbid": 203, "correlation": 225546 } }, { "ph": "f", "id": 225546, "pid": 76337, "tid": -914061504, "ts": 1716454224955222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224955223, "dur": 0, "args": { "External id": 225547, "cbid": 205, "correlation": 225547 } }, { "ph": "f", "id": 225547, "pid": 76337, "tid": -914061504, "ts": 1716454224955223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225013949, "dur": 21, "args": { "External id": 225551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225551, "pid": 5, "tid": 7, "ts": 1716454225013949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955236, "dur": 11, "args": { "External id": 225551, "cbid": 211, "correlation": 225551 } }, { "ph": "s", "id": 225551, "pid": 76337, "tid": -914061504, "ts": 1716454224955236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225013971, "dur": 4, "args": { "External id": 225553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 225553, "pid": 5, "tid": 7, "ts": 1716454225013971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955252, "dur": 8, "args": { "External id": 225553, "cbid": 211, "correlation": 225553 } }, { "ph": "s", "id": 225553, "pid": 76337, "tid": -914061504, "ts": 1716454224955252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224955264, "dur": 0, "args": { "External id": 225554, "cbid": 51, "correlation": 225554 } }, { "ph": "s", "id": 225554, "pid": 76337, "tid": -914061504, "ts": 1716454224955264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225013976, "dur": 172, "args": { "External id": 225555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225555, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 225555, "pid": 5, "tid": 7, "ts": 1716454225013976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955265, "dur": 6, "args": { "External id": 225555, "cbid": 211, "correlation": 225555 } }, { "ph": "s", "id": 225555, "pid": 76337, "tid": -914061504, "ts": 1716454224955265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225014149, "dur": 15, "args": { "External id": 225560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225560, "pid": 5, "tid": 7, "ts": 1716454225014149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955290, "dur": 8, "args": { "External id": 225560, "cbid": 211, "correlation": 225560 } }, { "ph": "s", "id": 225560, "pid": 76337, "tid": -914061504, "ts": 1716454224955290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225014166, "dur": 12, "args": { "External id": 225568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225568, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225568, "pid": 5, "tid": 7, "ts": 1716454225014166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955319, "dur": 8, "args": { "External id": 225568, "cbid": 211, "correlation": 225568 } }, { "ph": "s", "id": 225568, "pid": 76337, "tid": -914061504, "ts": 1716454224955319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225014179, "dur": 10, "args": { "External id": 225576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225576, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225576, "pid": 5, "tid": 7, "ts": 1716454225014179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955347, "dur": 8, "args": { "External id": 225576, "cbid": 211, "correlation": 225576 } }, { "ph": "s", "id": 225576, "pid": 76337, "tid": -914061504, "ts": 1716454224955347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225014190, "dur": 18, "args": { "External id": 225596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225596, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 225596, "pid": 5, "tid": 7, "ts": 1716454225014190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955428, "dur": 12, "args": { "External id": 225596, "cbid": 211, "correlation": 225596 } }, { "ph": "s", "id": 225596, "pid": 76337, "tid": -914061504, "ts": 1716454224955428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225014209, "dur": 4, "args": { "External id": 225608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225608, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 225608, "pid": 5, "tid": 7, "ts": 1716454225014209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955450, "dur": 6, "args": { "External id": 225608, "cbid": 211, "correlation": 225608 } }, { "ph": "s", "id": 225608, "pid": 76337, "tid": -914061504, "ts": 1716454224955450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225014215, "dur": 16, "args": { "External id": 225611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225611, "pid": 5, "tid": 7, "ts": 1716454225014215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955469, "dur": 7, "args": { "External id": 225611, "cbid": 211, "correlation": 225611 } }, { "ph": "s", "id": 225611, "pid": 76337, "tid": -914061504, "ts": 1716454224955469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224955527, "dur": 0, "args": { "External id": 225622, "cbid": 317, "correlation": 225622 } }, { "ph": "f", "id": 225622, "pid": 76337, "tid": -914061504, "ts": 1716454224955527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224955527, "dur": 0, "args": { "External id": 225623, "cbid": 203, "correlation": 225623 } }, { "ph": "f", "id": 225623, "pid": 76337, "tid": -914061504, "ts": 1716454224955527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224955528, "dur": 0, "args": { "External id": 225624, "cbid": 205, "correlation": 225624 } }, { "ph": "f", "id": 225624, "pid": 76337, "tid": -914061504, "ts": 1716454224955528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225014233, "dur": 11, "args": { "External id": 225628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225628, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225628, "pid": 5, "tid": 7, "ts": 1716454225014233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955541, "dur": 12, "args": { "External id": 225628, "cbid": 211, "correlation": 225628 } }, { "ph": "s", "id": 225628, "pid": 76337, "tid": -914061504, "ts": 1716454224955541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225014245, "dur": 3, "args": { "External id": 225630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 225630, "pid": 5, "tid": 7, "ts": 1716454225014245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955557, "dur": 7, "args": { "External id": 225630, "cbid": 211, "correlation": 225630 } }, { "ph": "s", "id": 225630, "pid": 76337, "tid": -914061504, "ts": 1716454224955557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224955567, "dur": 0, "args": { "External id": 225631, "cbid": 51, "correlation": 225631 } }, { "ph": "s", "id": 225631, "pid": 76337, "tid": -914061504, "ts": 1716454224955567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225014250, "dur": 89, "args": { "External id": 225632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225632, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 225632, "pid": 5, "tid": 7, "ts": 1716454225014250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955567, "dur": 5, "args": { "External id": 225632, "cbid": 211, "correlation": 225632 } }, { "ph": "s", "id": 225632, "pid": 76337, "tid": -914061504, "ts": 1716454224955567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225014341, "dur": 15, "args": { "External id": 225637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225637, "pid": 5, "tid": 7, "ts": 1716454225014341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955594, "dur": 8, "args": { "External id": 225637, "cbid": 211, "correlation": 225637 } }, { "ph": "s", "id": 225637, "pid": 76337, "tid": -914061504, "ts": 1716454224955594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225014357, "dur": 82, "args": { "External id": 225646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225646, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225646, "pid": 5, "tid": 7, "ts": 1716454225014357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955676, "dur": 14, "args": { "External id": 225646, "cbid": 211, "correlation": 225646 } }, { "ph": "s", "id": 225646, "pid": 76337, "tid": -914061504, "ts": 1716454224955676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225014440, "dur": 30, "args": { "External id": 225668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225668, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225668, "pid": 5, "tid": 7, "ts": 1716454225014440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955734, "dur": 10, "args": { "External id": 225668, "cbid": 211, "correlation": 225668 } }, { "ph": "s", "id": 225668, "pid": 76337, "tid": -914061504, "ts": 1716454224955734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224955825, "dur": 2, "args": { "External id": 225679, "cbid": 251, "correlation": 225679 } }, { "ph": "f", "id": 225679, "pid": 76337, "tid": -914061504, "ts": 1716454224955825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225014472, "dur": 162, "args": { "External id": 225680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225680, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225680, "pid": 5, "tid": 7, "ts": 1716454225014472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955832, "dur": 13, "args": { "External id": 225680, "cbid": 211, "correlation": 225680 } }, { "ph": "s", "id": 225680, "pid": 76337, "tid": -914061504, "ts": 1716454224955832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224955902, "dur": 1, "args": { "External id": 225691, "cbid": 251, "correlation": 225691 } }, { "ph": "f", "id": 225691, "pid": 76337, "tid": -914061504, "ts": 1716454224955902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225014635, "dur": 156, "args": { "External id": 225692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225692, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225692, "pid": 5, "tid": 7, "ts": 1716454225014635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955906, "dur": 12, "args": { "External id": 225692, "cbid": 211, "correlation": 225692 } }, { "ph": "s", "id": 225692, "pid": 76337, "tid": -914061504, "ts": 1716454224955906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224955981, "dur": 1, "args": { "External id": 225703, "cbid": 251, "correlation": 225703 } }, { "ph": "f", "id": 225703, "pid": 76337, "tid": -914061504, "ts": 1716454224955981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225014792, "dur": 156, "args": { "External id": 225704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225704, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225704, "pid": 5, "tid": 7, "ts": 1716454225014792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224955985, "dur": 12, "args": { "External id": 225704, "cbid": 211, "correlation": 225704 } }, { "ph": "s", "id": 225704, "pid": 76337, "tid": -914061504, "ts": 1716454224955985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225014949, "dur": 330, "args": { "External id": 225729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225729, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225729, "pid": 5, "tid": 7, "ts": 1716454225014949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956073, "dur": 13, "args": { "External id": 225729, "cbid": 211, "correlation": 225729 } }, { "ph": "s", "id": 225729, "pid": 76337, "tid": -914061504, "ts": 1716454224956073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956175, "dur": 1, "args": { "External id": 225747, "cbid": 251, "correlation": 225747 } }, { "ph": "f", "id": 225747, "pid": 76337, "tid": -914061504, "ts": 1716454224956175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225015280, "dur": 143, "args": { "External id": 225749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225749, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225749, "pid": 5, "tid": 7, "ts": 1716454225015280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956181, "dur": 13, "args": { "External id": 225749, "cbid": 211, "correlation": 225749 } }, { "ph": "s", "id": 225749, "pid": 76337, "tid": -914061504, "ts": 1716454224956181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225015424, "dur": 19, "args": { "External id": 225757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225757, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225757, "pid": 5, "tid": 7, "ts": 1716454225015424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956251, "dur": 13, "args": { "External id": 225757, "cbid": 211, "correlation": 225757 } }, { "ph": "s", "id": 225757, "pid": 76337, "tid": -914061504, "ts": 1716454224956251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225015445, "dur": 28, "args": { "External id": 225765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225765, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225765, "pid": 5, "tid": 7, "ts": 1716454225015445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956290, "dur": 9, "args": { "External id": 225765, "cbid": 211, "correlation": 225765 } }, { "ph": "s", "id": 225765, "pid": 76337, "tid": -914061504, "ts": 1716454224956290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225015474, "dur": 18, "args": { "External id": 225776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225776, "pid": 5, "tid": 7, "ts": 1716454225015474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956365, "dur": 12, "args": { "External id": 225776, "cbid": 211, "correlation": 225776 } }, { "ph": "s", "id": 225776, "pid": 76337, "tid": -914061504, "ts": 1716454224956365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225015494, "dur": 16, "args": { "External id": 225798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225798, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225798, "pid": 5, "tid": 7, "ts": 1716454225015494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956395, "dur": 8, "args": { "External id": 225798, "cbid": 211, "correlation": 225798 } }, { "ph": "s", "id": 225798, "pid": 76337, "tid": -914061504, "ts": 1716454224956395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956482, "dur": 2, "args": { "External id": 225809, "cbid": 251, "correlation": 225809 } }, { "ph": "f", "id": 225809, "pid": 76337, "tid": -914061504, "ts": 1716454224956482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225015512, "dur": 87, "args": { "External id": 225810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225810, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 225810, "pid": 5, "tid": 7, "ts": 1716454225015512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956488, "dur": 13, "args": { "External id": 225810, "cbid": 211, "correlation": 225810 } }, { "ph": "s", "id": 225810, "pid": 76337, "tid": -914061504, "ts": 1716454224956488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956559, "dur": 1, "args": { "External id": 225821, "cbid": 251, "correlation": 225821 } }, { "ph": "f", "id": 225821, "pid": 76337, "tid": -914061504, "ts": 1716454224956559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956563, "dur": 0, "args": { "External id": 225822, "cbid": 251, "correlation": 225822 } }, { "ph": "f", "id": 225822, "pid": 76337, "tid": -914061504, "ts": 1716454224956563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225015601, "dur": 12, "args": { "External id": 225823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225823, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225823, "pid": 5, "tid": 7, "ts": 1716454225015601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956564, "dur": 12, "args": { "External id": 225823, "cbid": 211, "correlation": 225823 } }, { "ph": "s", "id": 225823, "pid": 76337, "tid": -914061504, "ts": 1716454224956564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225015614, "dur": 5, "args": { "External id": 225825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225825, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225825, "pid": 5, "tid": 7, "ts": 1716454225015614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956578, "dur": 6, "args": { "External id": 225825, "cbid": 211, "correlation": 225825 } }, { "ph": "s", "id": 225825, "pid": 76337, "tid": -914061504, "ts": 1716454224956578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956636, "dur": 1, "args": { "External id": 225836, "cbid": 251, "correlation": 225836 } }, { "ph": "f", "id": 225836, "pid": 76337, "tid": -914061504, "ts": 1716454224956636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956639, "dur": 0, "args": { "External id": 225837, "cbid": 251, "correlation": 225837 } }, { "ph": "f", "id": 225837, "pid": 76337, "tid": -914061504, "ts": 1716454224956639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225015621, "dur": 8, "args": { "External id": 225838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225838, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225838, "pid": 5, "tid": 7, "ts": 1716454225015621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956640, "dur": 12, "args": { "External id": 225838, "cbid": 211, "correlation": 225838 } }, { "ph": "s", "id": 225838, "pid": 76337, "tid": -914061504, "ts": 1716454224956640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225015630, "dur": 3, "args": { "External id": 225840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225840, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225840, "pid": 5, "tid": 7, "ts": 1716454225015630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956654, "dur": 6, "args": { "External id": 225840, "cbid": 211, "correlation": 225840 } }, { "ph": "s", "id": 225840, "pid": 76337, "tid": -914061504, "ts": 1716454224956654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225015635, "dur": 54, "args": { "External id": 225865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225865, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225865, "pid": 5, "tid": 7, "ts": 1716454225015635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956731, "dur": 12, "args": { "External id": 225865, "cbid": 211, "correlation": 225865 } }, { "ph": "s", "id": 225865, "pid": 76337, "tid": -914061504, "ts": 1716454224956731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224956830, "dur": 2, "args": { "External id": 225883, "cbid": 251, "correlation": 225883 } }, { "ph": "f", "id": 225883, "pid": 76337, "tid": -914061504, "ts": 1716454224956830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225015690, "dur": 90, "args": { "External id": 225885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225885, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 225885, "pid": 5, "tid": 7, "ts": 1716454225015690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956837, "dur": 14, "args": { "External id": 225885, "cbid": 211, "correlation": 225885 } }, { "ph": "s", "id": 225885, "pid": 76337, "tid": -914061504, "ts": 1716454224956837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225015782, "dur": 9, "args": { "External id": 225893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225893, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225893, "pid": 5, "tid": 7, "ts": 1716454225015782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956906, "dur": 13, "args": { "External id": 225893, "cbid": 211, "correlation": 225893 } }, { "ph": "s", "id": 225893, "pid": 76337, "tid": -914061504, "ts": 1716454224956906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225015793, "dur": 20, "args": { "External id": 225901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225901, "pid": 5, "tid": 7, "ts": 1716454225015793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224956948, "dur": 9, "args": { "External id": 225901, "cbid": 211, "correlation": 225901 } }, { "ph": "s", "id": 225901, "pid": 76337, "tid": -914061504, "ts": 1716454224956948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225015814, "dur": 17, "args": { "External id": 225923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225923, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225923, "pid": 5, "tid": 7, "ts": 1716454225015814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957007, "dur": 11, "args": { "External id": 225923, "cbid": 211, "correlation": 225923 } }, { "ph": "s", "id": 225923, "pid": 76337, "tid": -914061504, "ts": 1716454224957007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224957096, "dur": 1, "args": { "External id": 225939, "cbid": 251, "correlation": 225939 } }, { "ph": "f", "id": 225939, "pid": 76337, "tid": -914061504, "ts": 1716454224957096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224957101, "dur": 0, "args": { "External id": 225941, "cbid": 251, "correlation": 225941 } }, { "ph": "f", "id": 225941, "pid": 76337, "tid": -914061504, "ts": 1716454224957101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225015833, "dur": 493, "args": { "External id": 225942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225942, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 225942, "pid": 5, "tid": 7, "ts": 1716454225015833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957103, "dur": 13, "args": { "External id": 225942, "cbid": 211, "correlation": 225942 } }, { "ph": "s", "id": 225942, "pid": 76337, "tid": -914061504, "ts": 1716454224957103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225016327, "dur": 65, "args": { "External id": 225950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225950, "pid": 5, "tid": 7, "ts": 1716454225016327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957172, "dur": 12, "args": { "External id": 225950, "cbid": 211, "correlation": 225950 } }, { "ph": "s", "id": 225950, "pid": 76337, "tid": -914061504, "ts": 1716454224957172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225016394, "dur": 65, "args": { "External id": 225958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225958, "pid": 5, "tid": 7, "ts": 1716454225016394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957204, "dur": 9, "args": { "External id": 225958, "cbid": 211, "correlation": 225958 } }, { "ph": "s", "id": 225958, "pid": 76337, "tid": -914061504, "ts": 1716454224957204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224957286, "dur": 1, "args": { "External id": 225974, "cbid": 251, "correlation": 225974 } }, { "ph": "f", "id": 225974, "pid": 76337, "tid": -914061504, "ts": 1716454224957286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225016461, "dur": 1, "args": { "External id": 225976, "device": 5, "context": 1, "stream": 7, "correlation": 225976, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 225976, "pid": 5, "tid": 7, "ts": 1716454225016461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224957291, "dur": 10, "args": { "External id": 225976, "cbid": 51, "correlation": 225976 } }, { "ph": "s", "id": 225976, "pid": 76337, "tid": -914061504, "ts": 1716454224957291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225016465, "dur": 267, "args": { "External id": 225977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225977, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 225977, "pid": 5, "tid": 7, "ts": 1716454225016465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957303, "dur": 12, "args": { "External id": 225977, "cbid": 211, "correlation": 225977 } }, { "ph": "s", "id": 225977, "pid": 76337, "tid": -914061504, "ts": 1716454224957303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225016733, "dur": 14, "args": { "External id": 225985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225985, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225985, "pid": 5, "tid": 7, "ts": 1716454225016733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957346, "dur": 10, "args": { "External id": 225985, "cbid": 211, "correlation": 225985 } }, { "ph": "s", "id": 225985, "pid": 76337, "tid": -914061504, "ts": 1716454224957346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225016748, "dur": 37, "args": { "External id": 225996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 225996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 225996, "pid": 5, "tid": 7, "ts": 1716454225016748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957416, "dur": 12, "args": { "External id": 225996, "cbid": 211, "correlation": 225996 } }, { "ph": "s", "id": 225996, "pid": 76337, "tid": -914061504, "ts": 1716454224957416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224957480, "dur": 0, "args": { "External id": 226008, "cbid": 317, "correlation": 226008 } }, { "ph": "f", "id": 226008, "pid": 76337, "tid": -914061504, "ts": 1716454224957480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224957481, "dur": 0, "args": { "External id": 226009, "cbid": 203, "correlation": 226009 } }, { "ph": "f", "id": 226009, "pid": 76337, "tid": -914061504, "ts": 1716454224957481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224957482, "dur": 0, "args": { "External id": 226010, "cbid": 205, "correlation": 226010 } }, { "ph": "f", "id": 226010, "pid": 76337, "tid": -914061504, "ts": 1716454224957482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225016787, "dur": 14, "args": { "External id": 226014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226014, "pid": 5, "tid": 7, "ts": 1716454225016787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957497, "dur": 13, "args": { "External id": 226014, "cbid": 211, "correlation": 226014 } }, { "ph": "s", "id": 226014, "pid": 76337, "tid": -914061504, "ts": 1716454224957497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225016803, "dur": 4, "args": { "External id": 226016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226016, "pid": 5, "tid": 7, "ts": 1716454225016803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957515, "dur": 6, "args": { "External id": 226016, "cbid": 211, "correlation": 226016 } }, { "ph": "s", "id": 226016, "pid": 76337, "tid": -914061504, "ts": 1716454224957515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224957523, "dur": 0, "args": { "External id": 226017, "cbid": 51, "correlation": 226017 } }, { "ph": "s", "id": 226017, "pid": 76337, "tid": -914061504, "ts": 1716454224957523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225016808, "dur": 96, "args": { "External id": 226018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226018, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226018, "pid": 5, "tid": 7, "ts": 1716454225016808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957524, "dur": 5, "args": { "External id": 226018, "cbid": 211, "correlation": 226018 } }, { "ph": "s", "id": 226018, "pid": 76337, "tid": -914061504, "ts": 1716454224957524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225016905, "dur": 16, "args": { "External id": 226023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226023, "pid": 5, "tid": 7, "ts": 1716454225016905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957551, "dur": 10, "args": { "External id": 226023, "cbid": 211, "correlation": 226023 } }, { "ph": "s", "id": 226023, "pid": 76337, "tid": -914061504, "ts": 1716454224957551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225016922, "dur": 11, "args": { "External id": 226031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226031, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226031, "pid": 5, "tid": 7, "ts": 1716454225016922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957583, "dur": 8, "args": { "External id": 226031, "cbid": 211, "correlation": 226031 } }, { "ph": "s", "id": 226031, "pid": 76337, "tid": -914061504, "ts": 1716454224957583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225016934, "dur": 29, "args": { "External id": 226040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226040, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226040, "pid": 5, "tid": 7, "ts": 1716454225016934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957623, "dur": 10, "args": { "External id": 226040, "cbid": 211, "correlation": 226040 } }, { "ph": "s", "id": 226040, "pid": 76337, "tid": -914061504, "ts": 1716454224957623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225016965, "dur": 31, "args": { "External id": 226060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226060, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226060, "pid": 5, "tid": 7, "ts": 1716454225016965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957694, "dur": 12, "args": { "External id": 226060, "cbid": 211, "correlation": 226060 } }, { "ph": "s", "id": 226060, "pid": 76337, "tid": -914061504, "ts": 1716454224957694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225016997, "dur": 6, "args": { "External id": 226072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226072, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226072, "pid": 5, "tid": 7, "ts": 1716454225016997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957716, "dur": 6, "args": { "External id": 226072, "cbid": 211, "correlation": 226072 } }, { "ph": "s", "id": 226072, "pid": 76337, "tid": -914061504, "ts": 1716454224957716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225017004, "dur": 31, "args": { "External id": 226075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226075, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226075, "pid": 5, "tid": 7, "ts": 1716454225017004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957735, "dur": 7, "args": { "External id": 226075, "cbid": 211, "correlation": 226075 } }, { "ph": "s", "id": 226075, "pid": 76337, "tid": -914061504, "ts": 1716454224957735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225017036, "dur": 21, "args": { "External id": 226084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226084, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226084, "pid": 5, "tid": 7, "ts": 1716454225017036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957774, "dur": 10, "args": { "External id": 226084, "cbid": 211, "correlation": 226084 } }, { "ph": "s", "id": 226084, "pid": 76337, "tid": -914061504, "ts": 1716454224957774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224957826, "dur": 0, "args": { "External id": 226094, "cbid": 317, "correlation": 226094 } }, { "ph": "f", "id": 226094, "pid": 76337, "tid": -914061504, "ts": 1716454224957826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224957827, "dur": 0, "args": { "External id": 226095, "cbid": 203, "correlation": 226095 } }, { "ph": "f", "id": 226095, "pid": 76337, "tid": -914061504, "ts": 1716454224957827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224957828, "dur": 0, "args": { "External id": 226096, "cbid": 205, "correlation": 226096 } }, { "ph": "f", "id": 226096, "pid": 76337, "tid": -914061504, "ts": 1716454224957828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225017059, "dur": 21, "args": { "External id": 226100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226100, "pid": 5, "tid": 7, "ts": 1716454225017059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957841, "dur": 11, "args": { "External id": 226100, "cbid": 211, "correlation": 226100 } }, { "ph": "s", "id": 226100, "pid": 76337, "tid": -914061504, "ts": 1716454224957841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225017081, "dur": 313, "args": { "External id": 226102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226102, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226102, "pid": 5, "tid": 7, "ts": 1716454225017081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957855, "dur": 5, "args": { "External id": 226102, "cbid": 211, "correlation": 226102 } }, { "ph": "s", "id": 226102, "pid": 76337, "tid": -914061504, "ts": 1716454224957855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225017397, "dur": 1, "args": { "External id": 226104, "device": 5, "context": 1, "stream": 7, "correlation": 226104, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 226104, "pid": 5, "tid": 7, "ts": 1716454225017397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224957866, "dur": 6, "args": { "External id": 226104, "cbid": 51, "correlation": 226104 } }, { "ph": "s", "id": 226104, "pid": 76337, "tid": -914061504, "ts": 1716454224957866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225017400, "dur": 1241, "args": { "External id": 226105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226105, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226105, "pid": 5, "tid": 7, "ts": 1716454225017400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957874, "dur": 6, "args": { "External id": 226105, "cbid": 211, "correlation": 226105 } }, { "ph": "s", "id": 226105, "pid": 76337, "tid": -914061504, "ts": 1716454224957874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225018643, "dur": 14, "args": { "External id": 226107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226107, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226107, "pid": 5, "tid": 7, "ts": 1716454225018643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957884, "dur": 5, "args": { "External id": 226107, "cbid": 211, "correlation": 226107 } }, { "ph": "s", "id": 226107, "pid": 76337, "tid": -914061504, "ts": 1716454224957884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225018658, "dur": 15, "args": { "External id": 226113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226113, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226113, "pid": 5, "tid": 7, "ts": 1716454225018658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957913, "dur": 9, "args": { "External id": 226113, "cbid": 211, "correlation": 226113 } }, { "ph": "s", "id": 226113, "pid": 76337, "tid": -914061504, "ts": 1716454224957913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225018674, "dur": 3, "args": { "External id": 226121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226121, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 226121, "pid": 5, "tid": 7, "ts": 1716454225018674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224957956, "dur": 10, "args": { "External id": 226121, "cbid": 211, "correlation": 226121 } }, { "ph": "s", "id": 226121, "pid": 76337, "tid": -914061504, "ts": 1716454224957956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224958031, "dur": 1, "args": { "External id": 226137, "cbid": 251, "correlation": 226137 } }, { "ph": "f", "id": 226137, "pid": 76337, "tid": -914061504, "ts": 1716454224958031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224958036, "dur": 0, "args": { "External id": 226139, "cbid": 251, "correlation": 226139 } }, { "ph": "f", "id": 226139, "pid": 76337, "tid": -914061504, "ts": 1716454224958036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225018678, "dur": 13, "args": { "External id": 226140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226140, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226140, "pid": 5, "tid": 7, "ts": 1716454225018678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958038, "dur": 12, "args": { "External id": 226140, "cbid": 211, "correlation": 226140 } }, { "ph": "s", "id": 226140, "pid": 76337, "tid": -914061504, "ts": 1716454224958038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225018693, "dur": 5, "args": { "External id": 226142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226142, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226142, "pid": 5, "tid": 7, "ts": 1716454225018693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958052, "dur": 5, "args": { "External id": 226142, "cbid": 211, "correlation": 226142 } }, { "ph": "s", "id": 226142, "pid": 76337, "tid": -914061504, "ts": 1716454224958052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225018699, "dur": 17, "args": { "External id": 226152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226152, "pid": 5, "tid": 7, "ts": 1716454225018699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958110, "dur": 12, "args": { "External id": 226152, "cbid": 211, "correlation": 226152 } }, { "ph": "s", "id": 226152, "pid": 76337, "tid": -914061504, "ts": 1716454224958110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225018717, "dur": 17, "args": { "External id": 226172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226172, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226172, "pid": 5, "tid": 7, "ts": 1716454225018717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958176, "dur": 11, "args": { "External id": 226172, "cbid": 211, "correlation": 226172 } }, { "ph": "s", "id": 226172, "pid": 76337, "tid": -914061504, "ts": 1716454224958176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225018735, "dur": 4, "args": { "External id": 226184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226184, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 226184, "pid": 5, "tid": 7, "ts": 1716454225018735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958196, "dur": 6, "args": { "External id": 226184, "cbid": 211, "correlation": 226184 } }, { "ph": "s", "id": 226184, "pid": 76337, "tid": -914061504, "ts": 1716454224958196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225018741, "dur": 17, "args": { "External id": 226187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226187, "pid": 5, "tid": 7, "ts": 1716454225018741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958215, "dur": 7, "args": { "External id": 226187, "cbid": 211, "correlation": 226187 } }, { "ph": "s", "id": 226187, "pid": 76337, "tid": -914061504, "ts": 1716454224958215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225018759, "dur": 11, "args": { "External id": 226196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226196, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226196, "pid": 5, "tid": 7, "ts": 1716454225018759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958255, "dur": 11, "args": { "External id": 226196, "cbid": 211, "correlation": 226196 } }, { "ph": "s", "id": 226196, "pid": 76337, "tid": -914061504, "ts": 1716454224958255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224958319, "dur": 0, "args": { "External id": 226206, "cbid": 317, "correlation": 226206 } }, { "ph": "f", "id": 226206, "pid": 76337, "tid": -914061504, "ts": 1716454224958319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224958320, "dur": 0, "args": { "External id": 226207, "cbid": 203, "correlation": 226207 } }, { "ph": "f", "id": 226207, "pid": 76337, "tid": -914061504, "ts": 1716454224958320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224958321, "dur": 0, "args": { "External id": 226208, "cbid": 205, "correlation": 226208 } }, { "ph": "f", "id": 226208, "pid": 76337, "tid": -914061504, "ts": 1716454224958321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225018771, "dur": 11, "args": { "External id": 226212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226212, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226212, "pid": 5, "tid": 7, "ts": 1716454225018771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958334, "dur": 12, "args": { "External id": 226212, "cbid": 211, "correlation": 226212 } }, { "ph": "s", "id": 226212, "pid": 76337, "tid": -914061504, "ts": 1716454224958334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225018783, "dur": 160, "args": { "External id": 226214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226214, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226214, "pid": 5, "tid": 7, "ts": 1716454225018783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958349, "dur": 5, "args": { "External id": 226214, "cbid": 211, "correlation": 226214 } }, { "ph": "s", "id": 226214, "pid": 76337, "tid": -914061504, "ts": 1716454224958349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225018945, "dur": 1, "args": { "External id": 226216, "device": 5, "context": 1, "stream": 7, "correlation": 226216, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 226216, "pid": 5, "tid": 7, "ts": 1716454225018945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224958360, "dur": 7, "args": { "External id": 226216, "cbid": 51, "correlation": 226216 } }, { "ph": "s", "id": 226216, "pid": 76337, "tid": -914061504, "ts": 1716454224958360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225018949, "dur": 639, "args": { "External id": 226217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226217, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226217, "pid": 5, "tid": 7, "ts": 1716454225018949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958368, "dur": 6, "args": { "External id": 226217, "cbid": 211, "correlation": 226217 } }, { "ph": "s", "id": 226217, "pid": 76337, "tid": -914061504, "ts": 1716454224958368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225019589, "dur": 12, "args": { "External id": 226219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226219, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226219, "pid": 5, "tid": 7, "ts": 1716454225019589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958378, "dur": 5, "args": { "External id": 226219, "cbid": 211, "correlation": 226219 } }, { "ph": "s", "id": 226219, "pid": 76337, "tid": -914061504, "ts": 1716454224958378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225019603, "dur": 14, "args": { "External id": 226225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226225, "pid": 5, "tid": 7, "ts": 1716454225019603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958406, "dur": 9, "args": { "External id": 226225, "cbid": 211, "correlation": 226225 } }, { "ph": "s", "id": 226225, "pid": 76337, "tid": -914061504, "ts": 1716454224958406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224958465, "dur": 0, "args": { "External id": 226235, "cbid": 317, "correlation": 226235 } }, { "ph": "f", "id": 226235, "pid": 76337, "tid": -914061504, "ts": 1716454224958465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224958466, "dur": 0, "args": { "External id": 226236, "cbid": 203, "correlation": 226236 } }, { "ph": "f", "id": 226236, "pid": 76337, "tid": -914061504, "ts": 1716454224958466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224958467, "dur": 0, "args": { "External id": 226237, "cbid": 205, "correlation": 226237 } }, { "ph": "f", "id": 226237, "pid": 76337, "tid": -914061504, "ts": 1716454224958467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225019618, "dur": 21, "args": { "External id": 226241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226241, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226241, "pid": 5, "tid": 7, "ts": 1716454225019618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958478, "dur": 12, "args": { "External id": 226241, "cbid": 211, "correlation": 226241 } }, { "ph": "s", "id": 226241, "pid": 76337, "tid": -914061504, "ts": 1716454224958478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225019640, "dur": 4, "args": { "External id": 226243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226243, "pid": 5, "tid": 7, "ts": 1716454225019640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958494, "dur": 6, "args": { "External id": 226243, "cbid": 211, "correlation": 226243 } }, { "ph": "s", "id": 226243, "pid": 76337, "tid": -914061504, "ts": 1716454224958494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224958502, "dur": 0, "args": { "External id": 226244, "cbid": 51, "correlation": 226244 } }, { "ph": "s", "id": 226244, "pid": 76337, "tid": -914061504, "ts": 1716454224958502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225019645, "dur": 167, "args": { "External id": 226245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226245, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226245, "pid": 5, "tid": 7, "ts": 1716454225019645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958503, "dur": 6, "args": { "External id": 226245, "cbid": 211, "correlation": 226245 } }, { "ph": "s", "id": 226245, "pid": 76337, "tid": -914061504, "ts": 1716454224958503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225019814, "dur": 16, "args": { "External id": 226250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226250, "pid": 5, "tid": 7, "ts": 1716454225019814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958529, "dur": 8, "args": { "External id": 226250, "cbid": 211, "correlation": 226250 } }, { "ph": "s", "id": 226250, "pid": 76337, "tid": -914061504, "ts": 1716454224958529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225019831, "dur": 12, "args": { "External id": 226258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226258, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226258, "pid": 5, "tid": 7, "ts": 1716454225019831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958558, "dur": 8, "args": { "External id": 226258, "cbid": 211, "correlation": 226258 } }, { "ph": "s", "id": 226258, "pid": 76337, "tid": -914061504, "ts": 1716454224958558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225019845, "dur": 10, "args": { "External id": 226266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226266, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226266, "pid": 5, "tid": 7, "ts": 1716454225019845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958586, "dur": 8, "args": { "External id": 226266, "cbid": 211, "correlation": 226266 } }, { "ph": "s", "id": 226266, "pid": 76337, "tid": -914061504, "ts": 1716454224958586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225019856, "dur": 18, "args": { "External id": 226286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226286, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226286, "pid": 5, "tid": 7, "ts": 1716454225019856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958669, "dur": 12, "args": { "External id": 226286, "cbid": 211, "correlation": 226286 } }, { "ph": "s", "id": 226286, "pid": 76337, "tid": -914061504, "ts": 1716454224958669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225019876, "dur": 4, "args": { "External id": 226298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226298, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 226298, "pid": 5, "tid": 7, "ts": 1716454225019876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958691, "dur": 6, "args": { "External id": 226298, "cbid": 211, "correlation": 226298 } }, { "ph": "s", "id": 226298, "pid": 76337, "tid": -914061504, "ts": 1716454224958691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225019881, "dur": 17, "args": { "External id": 226301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226301, "pid": 5, "tid": 7, "ts": 1716454225019881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958709, "dur": 7, "args": { "External id": 226301, "cbid": 211, "correlation": 226301 } }, { "ph": "s", "id": 226301, "pid": 76337, "tid": -914061504, "ts": 1716454224958709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224958767, "dur": 0, "args": { "External id": 226312, "cbid": 317, "correlation": 226312 } }, { "ph": "f", "id": 226312, "pid": 76337, "tid": -914061504, "ts": 1716454224958767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224958768, "dur": 0, "args": { "External id": 226313, "cbid": 203, "correlation": 226313 } }, { "ph": "f", "id": 226313, "pid": 76337, "tid": -914061504, "ts": 1716454224958768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224958769, "dur": 0, "args": { "External id": 226314, "cbid": 205, "correlation": 226314 } }, { "ph": "f", "id": 226314, "pid": 76337, "tid": -914061504, "ts": 1716454224958769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225019899, "dur": 12, "args": { "External id": 226318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226318, "pid": 5, "tid": 7, "ts": 1716454225019899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958781, "dur": 12, "args": { "External id": 226318, "cbid": 211, "correlation": 226318 } }, { "ph": "s", "id": 226318, "pid": 76337, "tid": -914061504, "ts": 1716454224958781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225019912, "dur": 3, "args": { "External id": 226320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226320, "pid": 5, "tid": 7, "ts": 1716454225019912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958798, "dur": 6, "args": { "External id": 226320, "cbid": 211, "correlation": 226320 } }, { "ph": "s", "id": 226320, "pid": 76337, "tid": -914061504, "ts": 1716454224958798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224958806, "dur": 0, "args": { "External id": 226321, "cbid": 51, "correlation": 226321 } }, { "ph": "s", "id": 226321, "pid": 76337, "tid": -914061504, "ts": 1716454224958806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225019917, "dur": 89, "args": { "External id": 226322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226322, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226322, "pid": 5, "tid": 7, "ts": 1716454225019917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958807, "dur": 5, "args": { "External id": 226322, "cbid": 211, "correlation": 226322 } }, { "ph": "s", "id": 226322, "pid": 76337, "tid": -914061504, "ts": 1716454224958807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225020007, "dur": 16, "args": { "External id": 226327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226327, "pid": 5, "tid": 7, "ts": 1716454225020007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958835, "dur": 9, "args": { "External id": 226327, "cbid": 211, "correlation": 226327 } }, { "ph": "s", "id": 226327, "pid": 76337, "tid": -914061504, "ts": 1716454224958835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225020023, "dur": 82, "args": { "External id": 226336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226336, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226336, "pid": 5, "tid": 7, "ts": 1716454225020023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958915, "dur": 15, "args": { "External id": 226336, "cbid": 211, "correlation": 226336 } }, { "ph": "s", "id": 226336, "pid": 76337, "tid": -914061504, "ts": 1716454224958915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225020107, "dur": 29, "args": { "External id": 226358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226358, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226358, "pid": 5, "tid": 7, "ts": 1716454225020107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224958982, "dur": 10, "args": { "External id": 226358, "cbid": 211, "correlation": 226358 } }, { "ph": "s", "id": 226358, "pid": 76337, "tid": -914061504, "ts": 1716454224958982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959072, "dur": 2, "args": { "External id": 226369, "cbid": 251, "correlation": 226369 } }, { "ph": "f", "id": 226369, "pid": 76337, "tid": -914061504, "ts": 1716454224959072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225020138, "dur": 164, "args": { "External id": 226370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226370, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226370, "pid": 5, "tid": 7, "ts": 1716454225020138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959078, "dur": 13, "args": { "External id": 226370, "cbid": 211, "correlation": 226370 } }, { "ph": "s", "id": 226370, "pid": 76337, "tid": -914061504, "ts": 1716454224959078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959148, "dur": 1, "args": { "External id": 226381, "cbid": 251, "correlation": 226381 } }, { "ph": "f", "id": 226381, "pid": 76337, "tid": -914061504, "ts": 1716454224959148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225020303, "dur": 156, "args": { "External id": 226382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226382, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226382, "pid": 5, "tid": 7, "ts": 1716454225020303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959152, "dur": 11, "args": { "External id": 226382, "cbid": 211, "correlation": 226382 } }, { "ph": "s", "id": 226382, "pid": 76337, "tid": -914061504, "ts": 1716454224959152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959217, "dur": 1, "args": { "External id": 226393, "cbid": 251, "correlation": 226393 } }, { "ph": "f", "id": 226393, "pid": 76337, "tid": -914061504, "ts": 1716454224959217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225020460, "dur": 156, "args": { "External id": 226394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226394, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226394, "pid": 5, "tid": 7, "ts": 1716454225020460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959221, "dur": 11, "args": { "External id": 226394, "cbid": 211, "correlation": 226394 } }, { "ph": "s", "id": 226394, "pid": 76337, "tid": -914061504, "ts": 1716454224959221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225020617, "dur": 336, "args": { "External id": 226419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226419, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226419, "pid": 5, "tid": 7, "ts": 1716454225020617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959305, "dur": 13, "args": { "External id": 226419, "cbid": 211, "correlation": 226419 } }, { "ph": "s", "id": 226419, "pid": 76337, "tid": -914061504, "ts": 1716454224959305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959407, "dur": 1, "args": { "External id": 226437, "cbid": 251, "correlation": 226437 } }, { "ph": "f", "id": 226437, "pid": 76337, "tid": -914061504, "ts": 1716454224959407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225020954, "dur": 164, "args": { "External id": 226439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226439, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226439, "pid": 5, "tid": 7, "ts": 1716454225020954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959413, "dur": 14, "args": { "External id": 226439, "cbid": 211, "correlation": 226439 } }, { "ph": "s", "id": 226439, "pid": 76337, "tid": -914061504, "ts": 1716454224959413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225021119, "dur": 19, "args": { "External id": 226447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226447, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226447, "pid": 5, "tid": 7, "ts": 1716454225021119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959484, "dur": 12, "args": { "External id": 226447, "cbid": 211, "correlation": 226447 } }, { "ph": "s", "id": 226447, "pid": 76337, "tid": -914061504, "ts": 1716454224959484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225021140, "dur": 28, "args": { "External id": 226455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226455, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226455, "pid": 5, "tid": 7, "ts": 1716454225021140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959523, "dur": 9, "args": { "External id": 226455, "cbid": 211, "correlation": 226455 } }, { "ph": "s", "id": 226455, "pid": 76337, "tid": -914061504, "ts": 1716454224959523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225021169, "dur": 18, "args": { "External id": 226466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226466, "pid": 5, "tid": 7, "ts": 1716454225021169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959594, "dur": 12, "args": { "External id": 226466, "cbid": 211, "correlation": 226466 } }, { "ph": "s", "id": 226466, "pid": 76337, "tid": -914061504, "ts": 1716454224959594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225021188, "dur": 16, "args": { "External id": 226488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226488, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226488, "pid": 5, "tid": 7, "ts": 1716454225021188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959624, "dur": 8, "args": { "External id": 226488, "cbid": 211, "correlation": 226488 } }, { "ph": "s", "id": 226488, "pid": 76337, "tid": -914061504, "ts": 1716454224959624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959710, "dur": 1, "args": { "External id": 226499, "cbid": 251, "correlation": 226499 } }, { "ph": "f", "id": 226499, "pid": 76337, "tid": -914061504, "ts": 1716454224959710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225021206, "dur": 88, "args": { "External id": 226500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226500, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226500, "pid": 5, "tid": 7, "ts": 1716454225021206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959715, "dur": 13, "args": { "External id": 226500, "cbid": 211, "correlation": 226500 } }, { "ph": "s", "id": 226500, "pid": 76337, "tid": -914061504, "ts": 1716454224959715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959782, "dur": 1, "args": { "External id": 226511, "cbid": 251, "correlation": 226511 } }, { "ph": "f", "id": 226511, "pid": 76337, "tid": -914061504, "ts": 1716454224959782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959787, "dur": 0, "args": { "External id": 226512, "cbid": 251, "correlation": 226512 } }, { "ph": "f", "id": 226512, "pid": 76337, "tid": -914061504, "ts": 1716454224959787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225021295, "dur": 13, "args": { "External id": 226513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226513, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226513, "pid": 5, "tid": 7, "ts": 1716454225021295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959788, "dur": 12, "args": { "External id": 226513, "cbid": 211, "correlation": 226513 } }, { "ph": "s", "id": 226513, "pid": 76337, "tid": -914061504, "ts": 1716454224959788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225021309, "dur": 5, "args": { "External id": 226515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226515, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226515, "pid": 5, "tid": 7, "ts": 1716454225021309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959802, "dur": 6, "args": { "External id": 226515, "cbid": 211, "correlation": 226515 } }, { "ph": "s", "id": 226515, "pid": 76337, "tid": -914061504, "ts": 1716454224959802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959859, "dur": 1, "args": { "External id": 226526, "cbid": 251, "correlation": 226526 } }, { "ph": "f", "id": 226526, "pid": 76337, "tid": -914061504, "ts": 1716454224959859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224959862, "dur": 0, "args": { "External id": 226527, "cbid": 251, "correlation": 226527 } }, { "ph": "f", "id": 226527, "pid": 76337, "tid": -914061504, "ts": 1716454224959862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225021315, "dur": 8, "args": { "External id": 226528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226528, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226528, "pid": 5, "tid": 7, "ts": 1716454225021315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959864, "dur": 12, "args": { "External id": 226528, "cbid": 211, "correlation": 226528 } }, { "ph": "s", "id": 226528, "pid": 76337, "tid": -914061504, "ts": 1716454224959864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225021324, "dur": 4, "args": { "External id": 226530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226530, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226530, "pid": 5, "tid": 7, "ts": 1716454225021324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959878, "dur": 5, "args": { "External id": 226530, "cbid": 211, "correlation": 226530 } }, { "ph": "s", "id": 226530, "pid": 76337, "tid": -914061504, "ts": 1716454224959878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225021329, "dur": 54, "args": { "External id": 226555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226555, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226555, "pid": 5, "tid": 7, "ts": 1716454225021329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224959954, "dur": 12, "args": { "External id": 226555, "cbid": 211, "correlation": 226555 } }, { "ph": "s", "id": 226555, "pid": 76337, "tid": -914061504, "ts": 1716454224959954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224960062, "dur": 1, "args": { "External id": 226573, "cbid": 251, "correlation": 226573 } }, { "ph": "f", "id": 226573, "pid": 76337, "tid": -914061504, "ts": 1716454224960062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225021385, "dur": 89, "args": { "External id": 226575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226575, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226575, "pid": 5, "tid": 7, "ts": 1716454225021385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960067, "dur": 15, "args": { "External id": 226575, "cbid": 211, "correlation": 226575 } }, { "ph": "s", "id": 226575, "pid": 76337, "tid": -914061504, "ts": 1716454224960067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225021475, "dur": 10, "args": { "External id": 226583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226583, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226583, "pid": 5, "tid": 7, "ts": 1716454225021475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960138, "dur": 12, "args": { "External id": 226583, "cbid": 211, "correlation": 226583 } }, { "ph": "s", "id": 226583, "pid": 76337, "tid": -914061504, "ts": 1716454224960138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225021487, "dur": 21, "args": { "External id": 226591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226591, "pid": 5, "tid": 7, "ts": 1716454225021487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960179, "dur": 9, "args": { "External id": 226591, "cbid": 211, "correlation": 226591 } }, { "ph": "s", "id": 226591, "pid": 76337, "tid": -914061504, "ts": 1716454224960179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225021509, "dur": 18, "args": { "External id": 226613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226613, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226613, "pid": 5, "tid": 7, "ts": 1716454225021509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960230, "dur": 10, "args": { "External id": 226613, "cbid": 211, "correlation": 226613 } }, { "ph": "s", "id": 226613, "pid": 76337, "tid": -914061504, "ts": 1716454224960230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224960316, "dur": 1, "args": { "External id": 226629, "cbid": 251, "correlation": 226629 } }, { "ph": "f", "id": 226629, "pid": 76337, "tid": -914061504, "ts": 1716454224960316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224960322, "dur": 0, "args": { "External id": 226631, "cbid": 251, "correlation": 226631 } }, { "ph": "f", "id": 226631, "pid": 76337, "tid": -914061504, "ts": 1716454224960322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225021528, "dur": 493, "args": { "External id": 226632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226632, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226632, "pid": 5, "tid": 7, "ts": 1716454225021528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960324, "dur": 14, "args": { "External id": 226632, "cbid": 211, "correlation": 226632 } }, { "ph": "s", "id": 226632, "pid": 76337, "tid": -914061504, "ts": 1716454224960324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225022022, "dur": 64, "args": { "External id": 226640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226640, "pid": 5, "tid": 7, "ts": 1716454225022022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960391, "dur": 13, "args": { "External id": 226640, "cbid": 211, "correlation": 226640 } }, { "ph": "s", "id": 226640, "pid": 76337, "tid": -914061504, "ts": 1716454224960391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225022087, "dur": 65, "args": { "External id": 226648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226648, "pid": 5, "tid": 7, "ts": 1716454225022087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960422, "dur": 9, "args": { "External id": 226648, "cbid": 211, "correlation": 226648 } }, { "ph": "s", "id": 226648, "pid": 76337, "tid": -914061504, "ts": 1716454224960422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224960502, "dur": 1, "args": { "External id": 226664, "cbid": 251, "correlation": 226664 } }, { "ph": "f", "id": 226664, "pid": 76337, "tid": -914061504, "ts": 1716454224960502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225022155, "dur": 1, "args": { "External id": 226666, "device": 5, "context": 1, "stream": 7, "correlation": 226666, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 226666, "pid": 5, "tid": 7, "ts": 1716454225022155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224960507, "dur": 9, "args": { "External id": 226666, "cbid": 51, "correlation": 226666 } }, { "ph": "s", "id": 226666, "pid": 76337, "tid": -914061504, "ts": 1716454224960507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225022158, "dur": 263, "args": { "External id": 226667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226667, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226667, "pid": 5, "tid": 7, "ts": 1716454225022158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960518, "dur": 12, "args": { "External id": 226667, "cbid": 211, "correlation": 226667 } }, { "ph": "s", "id": 226667, "pid": 76337, "tid": -914061504, "ts": 1716454224960518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225022423, "dur": 13, "args": { "External id": 226675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226675, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226675, "pid": 5, "tid": 7, "ts": 1716454225022423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960561, "dur": 10, "args": { "External id": 226675, "cbid": 211, "correlation": 226675 } }, { "ph": "s", "id": 226675, "pid": 76337, "tid": -914061504, "ts": 1716454224960561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225022438, "dur": 37, "args": { "External id": 226686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226686, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226686, "pid": 5, "tid": 7, "ts": 1716454225022438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960630, "dur": 12, "args": { "External id": 226686, "cbid": 211, "correlation": 226686 } }, { "ph": "s", "id": 226686, "pid": 76337, "tid": -914061504, "ts": 1716454224960630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224960694, "dur": 0, "args": { "External id": 226698, "cbid": 317, "correlation": 226698 } }, { "ph": "f", "id": 226698, "pid": 76337, "tid": -914061504, "ts": 1716454224960694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224960694, "dur": 0, "args": { "External id": 226699, "cbid": 203, "correlation": 226699 } }, { "ph": "f", "id": 226699, "pid": 76337, "tid": -914061504, "ts": 1716454224960694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224960695, "dur": 0, "args": { "External id": 226700, "cbid": 205, "correlation": 226700 } }, { "ph": "f", "id": 226700, "pid": 76337, "tid": -914061504, "ts": 1716454224960695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225022476, "dur": 13, "args": { "External id": 226704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226704, "pid": 5, "tid": 7, "ts": 1716454225022476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960710, "dur": 12, "args": { "External id": 226704, "cbid": 211, "correlation": 226704 } }, { "ph": "s", "id": 226704, "pid": 76337, "tid": -914061504, "ts": 1716454224960710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225022490, "dur": 4, "args": { "External id": 226706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226706, "pid": 5, "tid": 7, "ts": 1716454225022490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960728, "dur": 6, "args": { "External id": 226706, "cbid": 211, "correlation": 226706 } }, { "ph": "s", "id": 226706, "pid": 76337, "tid": -914061504, "ts": 1716454224960728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224960736, "dur": 0, "args": { "External id": 226707, "cbid": 51, "correlation": 226707 } }, { "ph": "s", "id": 226707, "pid": 76337, "tid": -914061504, "ts": 1716454224960736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225022495, "dur": 96, "args": { "External id": 226708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226708, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226708, "pid": 5, "tid": 7, "ts": 1716454225022495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960737, "dur": 5, "args": { "External id": 226708, "cbid": 211, "correlation": 226708 } }, { "ph": "s", "id": 226708, "pid": 76337, "tid": -914061504, "ts": 1716454224960737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225022592, "dur": 16, "args": { "External id": 226713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226713, "pid": 5, "tid": 7, "ts": 1716454225022592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960764, "dur": 9, "args": { "External id": 226713, "cbid": 211, "correlation": 226713 } }, { "ph": "s", "id": 226713, "pid": 76337, "tid": -914061504, "ts": 1716454224960764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225022610, "dur": 11, "args": { "External id": 226721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226721, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226721, "pid": 5, "tid": 7, "ts": 1716454225022610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960796, "dur": 8, "args": { "External id": 226721, "cbid": 211, "correlation": 226721 } }, { "ph": "s", "id": 226721, "pid": 76337, "tid": -914061504, "ts": 1716454224960796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225022622, "dur": 25, "args": { "External id": 226730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226730, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226730, "pid": 5, "tid": 7, "ts": 1716454225022622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960835, "dur": 10, "args": { "External id": 226730, "cbid": 211, "correlation": 226730 } }, { "ph": "s", "id": 226730, "pid": 76337, "tid": -914061504, "ts": 1716454224960835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225022649, "dur": 24, "args": { "External id": 226750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226750, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226750, "pid": 5, "tid": 7, "ts": 1716454225022649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960905, "dur": 11, "args": { "External id": 226750, "cbid": 211, "correlation": 226750 } }, { "ph": "s", "id": 226750, "pid": 76337, "tid": -914061504, "ts": 1716454224960905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225022674, "dur": 5, "args": { "External id": 226762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226762, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226762, "pid": 5, "tid": 7, "ts": 1716454225022674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960926, "dur": 7, "args": { "External id": 226762, "cbid": 211, "correlation": 226762 } }, { "ph": "s", "id": 226762, "pid": 76337, "tid": -914061504, "ts": 1716454224960926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225022680, "dur": 24, "args": { "External id": 226765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226765, "pid": 5, "tid": 7, "ts": 1716454225022680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960945, "dur": 7, "args": { "External id": 226765, "cbid": 211, "correlation": 226765 } }, { "ph": "s", "id": 226765, "pid": 76337, "tid": -914061504, "ts": 1716454224960945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225022705, "dur": 17, "args": { "External id": 226774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226774, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226774, "pid": 5, "tid": 7, "ts": 1716454225022705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224960992, "dur": 10, "args": { "External id": 226774, "cbid": 211, "correlation": 226774 } }, { "ph": "s", "id": 226774, "pid": 76337, "tid": -914061504, "ts": 1716454224960992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224961045, "dur": 0, "args": { "External id": 226784, "cbid": 317, "correlation": 226784 } }, { "ph": "f", "id": 226784, "pid": 76337, "tid": -914061504, "ts": 1716454224961045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224961046, "dur": 0, "args": { "External id": 226785, "cbid": 203, "correlation": 226785 } }, { "ph": "f", "id": 226785, "pid": 76337, "tid": -914061504, "ts": 1716454224961046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224961047, "dur": 0, "args": { "External id": 226786, "cbid": 205, "correlation": 226786 } }, { "ph": "f", "id": 226786, "pid": 76337, "tid": -914061504, "ts": 1716454224961047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225022723, "dur": 17, "args": { "External id": 226790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226790, "pid": 5, "tid": 7, "ts": 1716454225022723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961062, "dur": 11, "args": { "External id": 226790, "cbid": 211, "correlation": 226790 } }, { "ph": "s", "id": 226790, "pid": 76337, "tid": -914061504, "ts": 1716454224961062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225022741, "dur": 237, "args": { "External id": 226792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226792, "pid": 5, "tid": 7, "ts": 1716454225022741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961076, "dur": 6, "args": { "External id": 226792, "cbid": 211, "correlation": 226792 } }, { "ph": "s", "id": 226792, "pid": 76337, "tid": -914061504, "ts": 1716454224961076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225022981, "dur": 1, "args": { "External id": 226794, "device": 5, "context": 1, "stream": 7, "correlation": 226794, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 226794, "pid": 5, "tid": 7, "ts": 1716454225022981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224961087, "dur": 8, "args": { "External id": 226794, "cbid": 51, "correlation": 226794 } }, { "ph": "s", "id": 226794, "pid": 76337, "tid": -914061504, "ts": 1716454224961087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225022985, "dur": 807, "args": { "External id": 226795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226795, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226795, "pid": 5, "tid": 7, "ts": 1716454225022985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961096, "dur": 6, "args": { "External id": 226795, "cbid": 211, "correlation": 226795 } }, { "ph": "s", "id": 226795, "pid": 76337, "tid": -914061504, "ts": 1716454224961096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225023793, "dur": 13, "args": { "External id": 226797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226797, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226797, "pid": 5, "tid": 7, "ts": 1716454225023793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961107, "dur": 5, "args": { "External id": 226797, "cbid": 211, "correlation": 226797 } }, { "ph": "s", "id": 226797, "pid": 76337, "tid": -914061504, "ts": 1716454224961107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225023807, "dur": 14, "args": { "External id": 226803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226803, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226803, "pid": 5, "tid": 7, "ts": 1716454225023807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961136, "dur": 8, "args": { "External id": 226803, "cbid": 211, "correlation": 226803 } }, { "ph": "s", "id": 226803, "pid": 76337, "tid": -914061504, "ts": 1716454224961136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225023823, "dur": 3, "args": { "External id": 226811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226811, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 226811, "pid": 5, "tid": 7, "ts": 1716454225023823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961180, "dur": 9, "args": { "External id": 226811, "cbid": 211, "correlation": 226811 } }, { "ph": "s", "id": 226811, "pid": 76337, "tid": -914061504, "ts": 1716454224961180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224961244, "dur": 1, "args": { "External id": 226827, "cbid": 251, "correlation": 226827 } }, { "ph": "f", "id": 226827, "pid": 76337, "tid": -914061504, "ts": 1716454224961244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224961249, "dur": 0, "args": { "External id": 226829, "cbid": 251, "correlation": 226829 } }, { "ph": "f", "id": 226829, "pid": 76337, "tid": -914061504, "ts": 1716454224961249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225023827, "dur": 13, "args": { "External id": 226830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226830, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226830, "pid": 5, "tid": 7, "ts": 1716454225023827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961251, "dur": 11, "args": { "External id": 226830, "cbid": 211, "correlation": 226830 } }, { "ph": "s", "id": 226830, "pid": 76337, "tid": -914061504, "ts": 1716454224961251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225023842, "dur": 5, "args": { "External id": 226832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226832, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226832, "pid": 5, "tid": 7, "ts": 1716454225023842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961265, "dur": 5, "args": { "External id": 226832, "cbid": 211, "correlation": 226832 } }, { "ph": "s", "id": 226832, "pid": 76337, "tid": -914061504, "ts": 1716454224961265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225023848, "dur": 17, "args": { "External id": 226842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226842, "pid": 5, "tid": 7, "ts": 1716454225023848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961322, "dur": 14, "args": { "External id": 226842, "cbid": 211, "correlation": 226842 } }, { "ph": "s", "id": 226842, "pid": 76337, "tid": -914061504, "ts": 1716454224961322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225023866, "dur": 17, "args": { "External id": 226862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226862, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226862, "pid": 5, "tid": 7, "ts": 1716454225023866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961390, "dur": 10, "args": { "External id": 226862, "cbid": 211, "correlation": 226862 } }, { "ph": "s", "id": 226862, "pid": 76337, "tid": -914061504, "ts": 1716454224961390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225023884, "dur": 4, "args": { "External id": 226874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226874, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 226874, "pid": 5, "tid": 7, "ts": 1716454225023884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961411, "dur": 6, "args": { "External id": 226874, "cbid": 211, "correlation": 226874 } }, { "ph": "s", "id": 226874, "pid": 76337, "tid": -914061504, "ts": 1716454224961411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225023890, "dur": 16, "args": { "External id": 226877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226877, "pid": 5, "tid": 7, "ts": 1716454225023890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961430, "dur": 7, "args": { "External id": 226877, "cbid": 211, "correlation": 226877 } }, { "ph": "s", "id": 226877, "pid": 76337, "tid": -914061504, "ts": 1716454224961430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225023907, "dur": 11, "args": { "External id": 226886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226886, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226886, "pid": 5, "tid": 7, "ts": 1716454225023907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961469, "dur": 10, "args": { "External id": 226886, "cbid": 211, "correlation": 226886 } }, { "ph": "s", "id": 226886, "pid": 76337, "tid": -914061504, "ts": 1716454224961469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224961531, "dur": 0, "args": { "External id": 226896, "cbid": 317, "correlation": 226896 } }, { "ph": "f", "id": 226896, "pid": 76337, "tid": -914061504, "ts": 1716454224961531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224961532, "dur": 0, "args": { "External id": 226897, "cbid": 203, "correlation": 226897 } }, { "ph": "f", "id": 226897, "pid": 76337, "tid": -914061504, "ts": 1716454224961532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224961533, "dur": 0, "args": { "External id": 226898, "cbid": 205, "correlation": 226898 } }, { "ph": "f", "id": 226898, "pid": 76337, "tid": -914061504, "ts": 1716454224961533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225023919, "dur": 11, "args": { "External id": 226902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226902, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226902, "pid": 5, "tid": 7, "ts": 1716454225023919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961546, "dur": 12, "args": { "External id": 226902, "cbid": 211, "correlation": 226902 } }, { "ph": "s", "id": 226902, "pid": 76337, "tid": -914061504, "ts": 1716454224961546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225023931, "dur": 160, "args": { "External id": 226904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226904, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226904, "pid": 5, "tid": 7, "ts": 1716454225023931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961561, "dur": 5, "args": { "External id": 226904, "cbid": 211, "correlation": 226904 } }, { "ph": "s", "id": 226904, "pid": 76337, "tid": -914061504, "ts": 1716454224961561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225024093, "dur": 1, "args": { "External id": 226906, "device": 5, "context": 1, "stream": 7, "correlation": 226906, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 226906, "pid": 5, "tid": 7, "ts": 1716454225024093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224961571, "dur": 7, "args": { "External id": 226906, "cbid": 51, "correlation": 226906 } }, { "ph": "s", "id": 226906, "pid": 76337, "tid": -914061504, "ts": 1716454224961571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225024097, "dur": 641, "args": { "External id": 226907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226907, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 226907, "pid": 5, "tid": 7, "ts": 1716454225024097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961580, "dur": 6, "args": { "External id": 226907, "cbid": 211, "correlation": 226907 } }, { "ph": "s", "id": 226907, "pid": 76337, "tid": -914061504, "ts": 1716454224961580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225024739, "dur": 12, "args": { "External id": 226909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226909, "pid": 5, "tid": 7, "ts": 1716454225024739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961590, "dur": 5, "args": { "External id": 226909, "cbid": 211, "correlation": 226909 } }, { "ph": "s", "id": 226909, "pid": 76337, "tid": -914061504, "ts": 1716454224961590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225024753, "dur": 15, "args": { "External id": 226915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226915, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226915, "pid": 5, "tid": 7, "ts": 1716454225024753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961618, "dur": 9, "args": { "External id": 226915, "cbid": 211, "correlation": 226915 } }, { "ph": "s", "id": 226915, "pid": 76337, "tid": -914061504, "ts": 1716454224961618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224961677, "dur": 0, "args": { "External id": 226925, "cbid": 317, "correlation": 226925 } }, { "ph": "f", "id": 226925, "pid": 76337, "tid": -914061504, "ts": 1716454224961677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224961678, "dur": 0, "args": { "External id": 226926, "cbid": 203, "correlation": 226926 } }, { "ph": "f", "id": 226926, "pid": 76337, "tid": -914061504, "ts": 1716454224961678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224961679, "dur": 0, "args": { "External id": 226927, "cbid": 205, "correlation": 226927 } }, { "ph": "f", "id": 226927, "pid": 76337, "tid": -914061504, "ts": 1716454224961679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225024769, "dur": 17, "args": { "External id": 226931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226931, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226931, "pid": 5, "tid": 7, "ts": 1716454225024769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961692, "dur": 12, "args": { "External id": 226931, "cbid": 211, "correlation": 226931 } }, { "ph": "s", "id": 226931, "pid": 76337, "tid": -914061504, "ts": 1716454224961692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225024787, "dur": 4, "args": { "External id": 226933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 226933, "pid": 5, "tid": 7, "ts": 1716454225024787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961708, "dur": 6, "args": { "External id": 226933, "cbid": 211, "correlation": 226933 } }, { "ph": "s", "id": 226933, "pid": 76337, "tid": -914061504, "ts": 1716454224961708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224961717, "dur": 0, "args": { "External id": 226934, "cbid": 51, "correlation": 226934 } }, { "ph": "s", "id": 226934, "pid": 76337, "tid": -914061504, "ts": 1716454224961717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225024792, "dur": 132, "args": { "External id": 226935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226935, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 226935, "pid": 5, "tid": 7, "ts": 1716454225024792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961718, "dur": 5, "args": { "External id": 226935, "cbid": 211, "correlation": 226935 } }, { "ph": "s", "id": 226935, "pid": 76337, "tid": -914061504, "ts": 1716454224961718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225024926, "dur": 16, "args": { "External id": 226940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226940, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226940, "pid": 5, "tid": 7, "ts": 1716454225024926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961744, "dur": 9, "args": { "External id": 226940, "cbid": 211, "correlation": 226940 } }, { "ph": "s", "id": 226940, "pid": 76337, "tid": -914061504, "ts": 1716454224961744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225024942, "dur": 12, "args": { "External id": 226948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226948, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226948, "pid": 5, "tid": 7, "ts": 1716454225024942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961773, "dur": 9, "args": { "External id": 226948, "cbid": 211, "correlation": 226948 } }, { "ph": "s", "id": 226948, "pid": 76337, "tid": -914061504, "ts": 1716454224961773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225024956, "dur": 10, "args": { "External id": 226956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226956, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226956, "pid": 5, "tid": 7, "ts": 1716454225024956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961802, "dur": 8, "args": { "External id": 226956, "cbid": 211, "correlation": 226956 } }, { "ph": "s", "id": 226956, "pid": 76337, "tid": -914061504, "ts": 1716454224961802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225024967, "dur": 18, "args": { "External id": 226976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226976, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 226976, "pid": 5, "tid": 7, "ts": 1716454225024967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961885, "dur": 12, "args": { "External id": 226976, "cbid": 211, "correlation": 226976 } }, { "ph": "s", "id": 226976, "pid": 76337, "tid": -914061504, "ts": 1716454224961885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225024986, "dur": 4, "args": { "External id": 226988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226988, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 226988, "pid": 5, "tid": 7, "ts": 1716454225024986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961907, "dur": 6, "args": { "External id": 226988, "cbid": 211, "correlation": 226988 } }, { "ph": "s", "id": 226988, "pid": 76337, "tid": -914061504, "ts": 1716454224961907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225024992, "dur": 16, "args": { "External id": 226991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 226991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 226991, "pid": 5, "tid": 7, "ts": 1716454225024992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224961925, "dur": 7, "args": { "External id": 226991, "cbid": 211, "correlation": 226991 } }, { "ph": "s", "id": 226991, "pid": 76337, "tid": -914061504, "ts": 1716454224961925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224961990, "dur": 0, "args": { "External id": 227002, "cbid": 317, "correlation": 227002 } }, { "ph": "f", "id": 227002, "pid": 76337, "tid": -914061504, "ts": 1716454224961990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224961991, "dur": 0, "args": { "External id": 227003, "cbid": 203, "correlation": 227003 } }, { "ph": "f", "id": 227003, "pid": 76337, "tid": -914061504, "ts": 1716454224961991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224961992, "dur": 0, "args": { "External id": 227004, "cbid": 205, "correlation": 227004 } }, { "ph": "f", "id": 227004, "pid": 76337, "tid": -914061504, "ts": 1716454224961992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225025009, "dur": 11, "args": { "External id": 227008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227008, "pid": 5, "tid": 7, "ts": 1716454225025009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962005, "dur": 12, "args": { "External id": 227008, "cbid": 211, "correlation": 227008 } }, { "ph": "s", "id": 227008, "pid": 76337, "tid": -914061504, "ts": 1716454224962005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225025022, "dur": 3, "args": { "External id": 227010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227010, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227010, "pid": 5, "tid": 7, "ts": 1716454225025022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962021, "dur": 6, "args": { "External id": 227010, "cbid": 211, "correlation": 227010 } }, { "ph": "s", "id": 227010, "pid": 76337, "tid": -914061504, "ts": 1716454224962021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224962030, "dur": 0, "args": { "External id": 227011, "cbid": 51, "correlation": 227011 } }, { "ph": "s", "id": 227011, "pid": 76337, "tid": -914061504, "ts": 1716454224962030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225025026, "dur": 91, "args": { "External id": 227012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227012, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 227012, "pid": 5, "tid": 7, "ts": 1716454225025026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962031, "dur": 6, "args": { "External id": 227012, "cbid": 211, "correlation": 227012 } }, { "ph": "s", "id": 227012, "pid": 76337, "tid": -914061504, "ts": 1716454224962031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225025118, "dur": 16, "args": { "External id": 227017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227017, "pid": 5, "tid": 7, "ts": 1716454225025118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962058, "dur": 8, "args": { "External id": 227017, "cbid": 211, "correlation": 227017 } }, { "ph": "s", "id": 227017, "pid": 76337, "tid": -914061504, "ts": 1716454224962058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225025135, "dur": 82, "args": { "External id": 227026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227026, "pid": 5, "tid": 7, "ts": 1716454225025135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962139, "dur": 14, "args": { "External id": 227026, "cbid": 211, "correlation": 227026 } }, { "ph": "s", "id": 227026, "pid": 76337, "tid": -914061504, "ts": 1716454224962139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225025218, "dur": 30, "args": { "External id": 227048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227048, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227048, "pid": 5, "tid": 7, "ts": 1716454225025218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962197, "dur": 10, "args": { "External id": 227048, "cbid": 211, "correlation": 227048 } }, { "ph": "s", "id": 227048, "pid": 76337, "tid": -914061504, "ts": 1716454224962197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224962286, "dur": 1, "args": { "External id": 227059, "cbid": 251, "correlation": 227059 } }, { "ph": "f", "id": 227059, "pid": 76337, "tid": -914061504, "ts": 1716454224962286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225025250, "dur": 162, "args": { "External id": 227060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227060, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227060, "pid": 5, "tid": 7, "ts": 1716454225025250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962292, "dur": 13, "args": { "External id": 227060, "cbid": 211, "correlation": 227060 } }, { "ph": "s", "id": 227060, "pid": 76337, "tid": -914061504, "ts": 1716454224962292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224962362, "dur": 1, "args": { "External id": 227071, "cbid": 251, "correlation": 227071 } }, { "ph": "f", "id": 227071, "pid": 76337, "tid": -914061504, "ts": 1716454224962362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225025413, "dur": 152, "args": { "External id": 227072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227072, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227072, "pid": 5, "tid": 7, "ts": 1716454225025413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962366, "dur": 11, "args": { "External id": 227072, "cbid": 211, "correlation": 227072 } }, { "ph": "s", "id": 227072, "pid": 76337, "tid": -914061504, "ts": 1716454224962366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224962431, "dur": 1, "args": { "External id": 227083, "cbid": 251, "correlation": 227083 } }, { "ph": "f", "id": 227083, "pid": 76337, "tid": -914061504, "ts": 1716454224962431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225025567, "dur": 156, "args": { "External id": 227084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227084, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227084, "pid": 5, "tid": 7, "ts": 1716454225025567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962436, "dur": 11, "args": { "External id": 227084, "cbid": 211, "correlation": 227084 } }, { "ph": "s", "id": 227084, "pid": 76337, "tid": -914061504, "ts": 1716454224962436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225025724, "dur": 332, "args": { "External id": 227109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227109, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227109, "pid": 5, "tid": 7, "ts": 1716454225025724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962519, "dur": 13, "args": { "External id": 227109, "cbid": 211, "correlation": 227109 } }, { "ph": "s", "id": 227109, "pid": 76337, "tid": -914061504, "ts": 1716454224962519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224962618, "dur": 1, "args": { "External id": 227127, "cbid": 251, "correlation": 227127 } }, { "ph": "f", "id": 227127, "pid": 76337, "tid": -914061504, "ts": 1716454224962618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225026057, "dur": 162, "args": { "External id": 227129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227129, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227129, "pid": 5, "tid": 7, "ts": 1716454225026057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962624, "dur": 14, "args": { "External id": 227129, "cbid": 211, "correlation": 227129 } }, { "ph": "s", "id": 227129, "pid": 76337, "tid": -914061504, "ts": 1716454224962624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225026220, "dur": 19, "args": { "External id": 227137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227137, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227137, "pid": 5, "tid": 7, "ts": 1716454225026220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962694, "dur": 12, "args": { "External id": 227137, "cbid": 211, "correlation": 227137 } }, { "ph": "s", "id": 227137, "pid": 76337, "tid": -914061504, "ts": 1716454224962694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225026241, "dur": 28, "args": { "External id": 227145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227145, "pid": 5, "tid": 7, "ts": 1716454225026241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962733, "dur": 10, "args": { "External id": 227145, "cbid": 211, "correlation": 227145 } }, { "ph": "s", "id": 227145, "pid": 76337, "tid": -914061504, "ts": 1716454224962733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225026270, "dur": 19, "args": { "External id": 227156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227156, "pid": 5, "tid": 7, "ts": 1716454225026270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962804, "dur": 13, "args": { "External id": 227156, "cbid": 211, "correlation": 227156 } }, { "ph": "s", "id": 227156, "pid": 76337, "tid": -914061504, "ts": 1716454224962804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225026290, "dur": 16, "args": { "External id": 227178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227178, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227178, "pid": 5, "tid": 7, "ts": 1716454225026290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962835, "dur": 8, "args": { "External id": 227178, "cbid": 211, "correlation": 227178 } }, { "ph": "s", "id": 227178, "pid": 76337, "tid": -914061504, "ts": 1716454224962835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224962922, "dur": 1, "args": { "External id": 227189, "cbid": 251, "correlation": 227189 } }, { "ph": "f", "id": 227189, "pid": 76337, "tid": -914061504, "ts": 1716454224962922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225026307, "dur": 87, "args": { "External id": 227190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227190, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227190, "pid": 5, "tid": 7, "ts": 1716454225026307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224962927, "dur": 14, "args": { "External id": 227190, "cbid": 211, "correlation": 227190 } }, { "ph": "s", "id": 227190, "pid": 76337, "tid": -914061504, "ts": 1716454224962927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963005, "dur": 1, "args": { "External id": 227201, "cbid": 251, "correlation": 227201 } }, { "ph": "f", "id": 227201, "pid": 76337, "tid": -914061504, "ts": 1716454224963005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963008, "dur": 0, "args": { "External id": 227202, "cbid": 251, "correlation": 227202 } }, { "ph": "f", "id": 227202, "pid": 76337, "tid": -914061504, "ts": 1716454224963008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225026395, "dur": 11, "args": { "External id": 227203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227203, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227203, "pid": 5, "tid": 7, "ts": 1716454225026395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963010, "dur": 12, "args": { "External id": 227203, "cbid": 211, "correlation": 227203 } }, { "ph": "s", "id": 227203, "pid": 76337, "tid": -914061504, "ts": 1716454224963010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225026407, "dur": 5, "args": { "External id": 227205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227205, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227205, "pid": 5, "tid": 7, "ts": 1716454225026407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963024, "dur": 6, "args": { "External id": 227205, "cbid": 211, "correlation": 227205 } }, { "ph": "s", "id": 227205, "pid": 76337, "tid": -914061504, "ts": 1716454224963024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963083, "dur": 1, "args": { "External id": 227216, "cbid": 251, "correlation": 227216 } }, { "ph": "f", "id": 227216, "pid": 76337, "tid": -914061504, "ts": 1716454224963083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963086, "dur": 0, "args": { "External id": 227217, "cbid": 251, "correlation": 227217 } }, { "ph": "f", "id": 227217, "pid": 76337, "tid": -914061504, "ts": 1716454224963086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225026414, "dur": 8, "args": { "External id": 227218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227218, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227218, "pid": 5, "tid": 7, "ts": 1716454225026414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963088, "dur": 11, "args": { "External id": 227218, "cbid": 211, "correlation": 227218 } }, { "ph": "s", "id": 227218, "pid": 76337, "tid": -914061504, "ts": 1716454224963088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225026423, "dur": 3, "args": { "External id": 227220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227220, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227220, "pid": 5, "tid": 7, "ts": 1716454225026423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963101, "dur": 6, "args": { "External id": 227220, "cbid": 211, "correlation": 227220 } }, { "ph": "s", "id": 227220, "pid": 76337, "tid": -914061504, "ts": 1716454224963101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225026428, "dur": 55, "args": { "External id": 227245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227245, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227245, "pid": 5, "tid": 7, "ts": 1716454225026428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963178, "dur": 13, "args": { "External id": 227245, "cbid": 211, "correlation": 227245 } }, { "ph": "s", "id": 227245, "pid": 76337, "tid": -914061504, "ts": 1716454224963178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963276, "dur": 1, "args": { "External id": 227263, "cbid": 251, "correlation": 227263 } }, { "ph": "f", "id": 227263, "pid": 76337, "tid": -914061504, "ts": 1716454224963276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225026484, "dur": 89, "args": { "External id": 227265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227265, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227265, "pid": 5, "tid": 7, "ts": 1716454225026484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963282, "dur": 14, "args": { "External id": 227265, "cbid": 211, "correlation": 227265 } }, { "ph": "s", "id": 227265, "pid": 76337, "tid": -914061504, "ts": 1716454224963282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225026574, "dur": 9, "args": { "External id": 227273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227273, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227273, "pid": 5, "tid": 7, "ts": 1716454225026574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963352, "dur": 11, "args": { "External id": 227273, "cbid": 211, "correlation": 227273 } }, { "ph": "s", "id": 227273, "pid": 76337, "tid": -914061504, "ts": 1716454224963352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225026585, "dur": 21, "args": { "External id": 227281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227281, "pid": 5, "tid": 7, "ts": 1716454225026585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963393, "dur": 9, "args": { "External id": 227281, "cbid": 211, "correlation": 227281 } }, { "ph": "s", "id": 227281, "pid": 76337, "tid": -914061504, "ts": 1716454224963393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225026607, "dur": 18, "args": { "External id": 227303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227303, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227303, "pid": 5, "tid": 7, "ts": 1716454225026607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963445, "dur": 10, "args": { "External id": 227303, "cbid": 211, "correlation": 227303 } }, { "ph": "s", "id": 227303, "pid": 76337, "tid": -914061504, "ts": 1716454224963445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963532, "dur": 1, "args": { "External id": 227319, "cbid": 251, "correlation": 227319 } }, { "ph": "f", "id": 227319, "pid": 76337, "tid": -914061504, "ts": 1716454224963532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963537, "dur": 0, "args": { "External id": 227321, "cbid": 251, "correlation": 227321 } }, { "ph": "f", "id": 227321, "pid": 76337, "tid": -914061504, "ts": 1716454224963537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225026626, "dur": 492, "args": { "External id": 227322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227322, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227322, "pid": 5, "tid": 7, "ts": 1716454225026626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963539, "dur": 13, "args": { "External id": 227322, "cbid": 211, "correlation": 227322 } }, { "ph": "s", "id": 227322, "pid": 76337, "tid": -914061504, "ts": 1716454224963539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225027119, "dur": 66, "args": { "External id": 227330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227330, "pid": 5, "tid": 7, "ts": 1716454225027119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963605, "dur": 12, "args": { "External id": 227330, "cbid": 211, "correlation": 227330 } }, { "ph": "s", "id": 227330, "pid": 76337, "tid": -914061504, "ts": 1716454224963605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225027187, "dur": 66, "args": { "External id": 227338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227338, "pid": 5, "tid": 7, "ts": 1716454225027187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963636, "dur": 9, "args": { "External id": 227338, "cbid": 211, "correlation": 227338 } }, { "ph": "s", "id": 227338, "pid": 76337, "tid": -914061504, "ts": 1716454224963636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224963716, "dur": 1, "args": { "External id": 227354, "cbid": 251, "correlation": 227354 } }, { "ph": "f", "id": 227354, "pid": 76337, "tid": -914061504, "ts": 1716454224963716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225027255, "dur": 1, "args": { "External id": 227356, "device": 5, "context": 1, "stream": 7, "correlation": 227356, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 227356, "pid": 5, "tid": 7, "ts": 1716454225027255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224963721, "dur": 9, "args": { "External id": 227356, "cbid": 51, "correlation": 227356 } }, { "ph": "s", "id": 227356, "pid": 76337, "tid": -914061504, "ts": 1716454224963721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225027259, "dur": 267, "args": { "External id": 227357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227357, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227357, "pid": 5, "tid": 7, "ts": 1716454225027259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963731, "dur": 12, "args": { "External id": 227357, "cbid": 211, "correlation": 227357 } }, { "ph": "s", "id": 227357, "pid": 76337, "tid": -914061504, "ts": 1716454224963731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225027527, "dur": 13, "args": { "External id": 227365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227365, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227365, "pid": 5, "tid": 7, "ts": 1716454225027527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963773, "dur": 10, "args": { "External id": 227365, "cbid": 211, "correlation": 227365 } }, { "ph": "s", "id": 227365, "pid": 76337, "tid": -914061504, "ts": 1716454224963773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225027542, "dur": 37, "args": { "External id": 227376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227376, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227376, "pid": 5, "tid": 7, "ts": 1716454225027542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963842, "dur": 12, "args": { "External id": 227376, "cbid": 211, "correlation": 227376 } }, { "ph": "s", "id": 227376, "pid": 76337, "tid": -914061504, "ts": 1716454224963842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224963905, "dur": 0, "args": { "External id": 227388, "cbid": 317, "correlation": 227388 } }, { "ph": "f", "id": 227388, "pid": 76337, "tid": -914061504, "ts": 1716454224963905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224963906, "dur": 0, "args": { "External id": 227389, "cbid": 203, "correlation": 227389 } }, { "ph": "f", "id": 227389, "pid": 76337, "tid": -914061504, "ts": 1716454224963906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224963906, "dur": 0, "args": { "External id": 227390, "cbid": 205, "correlation": 227390 } }, { "ph": "f", "id": 227390, "pid": 76337, "tid": -914061504, "ts": 1716454224963906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225027580, "dur": 12, "args": { "External id": 227394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227394, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227394, "pid": 5, "tid": 7, "ts": 1716454225027580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963921, "dur": 12, "args": { "External id": 227394, "cbid": 211, "correlation": 227394 } }, { "ph": "s", "id": 227394, "pid": 76337, "tid": -914061504, "ts": 1716454224963921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225027593, "dur": 3, "args": { "External id": 227396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227396, "pid": 5, "tid": 7, "ts": 1716454225027593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963938, "dur": 7, "args": { "External id": 227396, "cbid": 211, "correlation": 227396 } }, { "ph": "s", "id": 227396, "pid": 76337, "tid": -914061504, "ts": 1716454224963938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224963947, "dur": 0, "args": { "External id": 227397, "cbid": 51, "correlation": 227397 } }, { "ph": "s", "id": 227397, "pid": 76337, "tid": -914061504, "ts": 1716454224963947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225027598, "dur": 95, "args": { "External id": 227398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227398, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 227398, "pid": 5, "tid": 7, "ts": 1716454225027598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963948, "dur": 5, "args": { "External id": 227398, "cbid": 211, "correlation": 227398 } }, { "ph": "s", "id": 227398, "pid": 76337, "tid": -914061504, "ts": 1716454224963948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225027695, "dur": 16, "args": { "External id": 227403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227403, "pid": 5, "tid": 7, "ts": 1716454225027695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224963982, "dur": 10, "args": { "External id": 227403, "cbid": 211, "correlation": 227403 } }, { "ph": "s", "id": 227403, "pid": 76337, "tid": -914061504, "ts": 1716454224963982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225027712, "dur": 11, "args": { "External id": 227411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227411, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227411, "pid": 5, "tid": 7, "ts": 1716454225027712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964016, "dur": 8, "args": { "External id": 227411, "cbid": 211, "correlation": 227411 } }, { "ph": "s", "id": 227411, "pid": 76337, "tid": -914061504, "ts": 1716454224964016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225027725, "dur": 55, "args": { "External id": 227422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227422, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227422, "pid": 5, "tid": 7, "ts": 1716454225027725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964080, "dur": 12, "args": { "External id": 227422, "cbid": 211, "correlation": 227422 } }, { "ph": "s", "id": 227422, "pid": 76337, "tid": -914061504, "ts": 1716454224964080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224964135, "dur": 0, "args": { "External id": 227432, "cbid": 317, "correlation": 227432 } }, { "ph": "f", "id": 227432, "pid": 76337, "tid": -914061504, "ts": 1716454224964135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224964136, "dur": 0, "args": { "External id": 227433, "cbid": 203, "correlation": 227433 } }, { "ph": "f", "id": 227433, "pid": 76337, "tid": -914061504, "ts": 1716454224964136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224964137, "dur": 0, "args": { "External id": 227434, "cbid": 205, "correlation": 227434 } }, { "ph": "f", "id": 227434, "pid": 76337, "tid": -914061504, "ts": 1716454224964137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225027781, "dur": 38, "args": { "External id": 227438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227438, "pid": 5, "tid": 7, "ts": 1716454225027781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964154, "dur": 11, "args": { "External id": 227438, "cbid": 211, "correlation": 227438 } }, { "ph": "s", "id": 227438, "pid": 76337, "tid": -914061504, "ts": 1716454224964154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225027821, "dur": 160, "args": { "External id": 227440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227440, "pid": 5, "tid": 7, "ts": 1716454225027821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964167, "dur": 5, "args": { "External id": 227440, "cbid": 211, "correlation": 227440 } }, { "ph": "s", "id": 227440, "pid": 76337, "tid": -914061504, "ts": 1716454224964167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225027983, "dur": 1954, "args": { "External id": 227442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227442, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227442, "pid": 5, "tid": 7, "ts": 1716454225027983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964179, "dur": 8, "args": { "External id": 227442, "cbid": 211, "correlation": 227442 } }, { "ph": "s", "id": 227442, "pid": 76337, "tid": -914061504, "ts": 1716454224964179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225029938, "dur": 38, "args": { "External id": 227444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227444, "pid": 5, "tid": 7, "ts": 1716454225029938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964191, "dur": 6, "args": { "External id": 227444, "cbid": 211, "correlation": 227444 } }, { "ph": "s", "id": 227444, "pid": 76337, "tid": -914061504, "ts": 1716454224964191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225029978, "dur": 59, "args": { "External id": 227450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227450, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227450, "pid": 5, "tid": 7, "ts": 1716454225029978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964219, "dur": 8, "args": { "External id": 227450, "cbid": 211, "correlation": 227450 } }, { "ph": "s", "id": 227450, "pid": 76337, "tid": -914061504, "ts": 1716454224964219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225030038, "dur": 84, "args": { "External id": 227459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227459, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227459, "pid": 5, "tid": 7, "ts": 1716454225030038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964308, "dur": 13, "args": { "External id": 227459, "cbid": 211, "correlation": 227459 } }, { "ph": "s", "id": 227459, "pid": 76337, "tid": -914061504, "ts": 1716454224964308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225030124, "dur": 72, "args": { "External id": 227479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227479, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 227479, "pid": 5, "tid": 7, "ts": 1716454225030124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964377, "dur": 11, "args": { "External id": 227479, "cbid": 211, "correlation": 227479 } }, { "ph": "s", "id": 227479, "pid": 76337, "tid": -914061504, "ts": 1716454224964377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225030198, "dur": 5, "args": { "External id": 227491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227491, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 227491, "pid": 5, "tid": 7, "ts": 1716454225030198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964400, "dur": 7, "args": { "External id": 227491, "cbid": 211, "correlation": 227491 } }, { "ph": "s", "id": 227491, "pid": 76337, "tid": -914061504, "ts": 1716454224964400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225030204, "dur": 82, "args": { "External id": 227494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227494, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227494, "pid": 5, "tid": 7, "ts": 1716454225030204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964418, "dur": 6, "args": { "External id": 227494, "cbid": 211, "correlation": 227494 } }, { "ph": "s", "id": 227494, "pid": 76337, "tid": -914061504, "ts": 1716454224964418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225030287, "dur": 52, "args": { "External id": 227503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227503, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227503, "pid": 5, "tid": 7, "ts": 1716454225030287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964458, "dur": 9, "args": { "External id": 227503, "cbid": 211, "correlation": 227503 } }, { "ph": "s", "id": 227503, "pid": 76337, "tid": -914061504, "ts": 1716454224964458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224964509, "dur": 0, "args": { "External id": 227513, "cbid": 317, "correlation": 227513 } }, { "ph": "f", "id": 227513, "pid": 76337, "tid": -914061504, "ts": 1716454224964509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224964510, "dur": 0, "args": { "External id": 227514, "cbid": 203, "correlation": 227514 } }, { "ph": "f", "id": 227514, "pid": 76337, "tid": -914061504, "ts": 1716454224964510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224964511, "dur": 0, "args": { "External id": 227515, "cbid": 205, "correlation": 227515 } }, { "ph": "f", "id": 227515, "pid": 76337, "tid": -914061504, "ts": 1716454224964511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225030340, "dur": 57, "args": { "External id": 227519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227519, "pid": 5, "tid": 7, "ts": 1716454225030340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964527, "dur": 11, "args": { "External id": 227519, "cbid": 211, "correlation": 227519 } }, { "ph": "s", "id": 227519, "pid": 76337, "tid": -914061504, "ts": 1716454224964527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225030398, "dur": 120, "args": { "External id": 227521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227521, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227521, "pid": 5, "tid": 7, "ts": 1716454225030398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964541, "dur": 6, "args": { "External id": 227521, "cbid": 211, "correlation": 227521 } }, { "ph": "s", "id": 227521, "pid": 76337, "tid": -914061504, "ts": 1716454224964541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225030519, "dur": 1873, "args": { "External id": 227523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227523, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227523, "pid": 5, "tid": 7, "ts": 1716454225030519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964553, "dur": 6, "args": { "External id": 227523, "cbid": 211, "correlation": 227523 } }, { "ph": "s", "id": 227523, "pid": 76337, "tid": -914061504, "ts": 1716454224964553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225032393, "dur": 19, "args": { "External id": 227525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227525, "pid": 5, "tid": 7, "ts": 1716454225032393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964563, "dur": 5, "args": { "External id": 227525, "cbid": 211, "correlation": 227525 } }, { "ph": "s", "id": 227525, "pid": 76337, "tid": -914061504, "ts": 1716454224964563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225032413, "dur": 33, "args": { "External id": 227531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227531, "pid": 5, "tid": 7, "ts": 1716454225032413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964591, "dur": 9, "args": { "External id": 227531, "cbid": 211, "correlation": 227531 } }, { "ph": "s", "id": 227531, "pid": 76337, "tid": -914061504, "ts": 1716454224964591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225032448, "dur": 4, "args": { "External id": 227539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227539, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 227539, "pid": 5, "tid": 7, "ts": 1716454225032448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964634, "dur": 10, "args": { "External id": 227539, "cbid": 211, "correlation": 227539 } }, { "ph": "s", "id": 227539, "pid": 76337, "tid": -914061504, "ts": 1716454224964634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224964701, "dur": 1, "args": { "External id": 227555, "cbid": 251, "correlation": 227555 } }, { "ph": "f", "id": 227555, "pid": 76337, "tid": -914061504, "ts": 1716454224964701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224964706, "dur": 0, "args": { "External id": 227557, "cbid": 251, "correlation": 227557 } }, { "ph": "f", "id": 227557, "pid": 76337, "tid": -914061504, "ts": 1716454224964706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225032452, "dur": 12, "args": { "External id": 227558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227558, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 227558, "pid": 5, "tid": 7, "ts": 1716454225032452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964708, "dur": 11, "args": { "External id": 227558, "cbid": 211, "correlation": 227558 } }, { "ph": "s", "id": 227558, "pid": 76337, "tid": -914061504, "ts": 1716454224964708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225032466, "dur": 5, "args": { "External id": 227560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227560, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 227560, "pid": 5, "tid": 7, "ts": 1716454225032466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964721, "dur": 5, "args": { "External id": 227560, "cbid": 211, "correlation": 227560 } }, { "ph": "s", "id": 227560, "pid": 76337, "tid": -914061504, "ts": 1716454224964721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225032472, "dur": 29, "args": { "External id": 227570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227570, "pid": 5, "tid": 7, "ts": 1716454225032472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964780, "dur": 12, "args": { "External id": 227570, "cbid": 211, "correlation": 227570 } }, { "ph": "s", "id": 227570, "pid": 76337, "tid": -914061504, "ts": 1716454224964780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225032502, "dur": 30, "args": { "External id": 227590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227590, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 227590, "pid": 5, "tid": 7, "ts": 1716454225032502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964845, "dur": 11, "args": { "External id": 227590, "cbid": 211, "correlation": 227590 } }, { "ph": "s", "id": 227590, "pid": 76337, "tid": -914061504, "ts": 1716454224964845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225032533, "dur": 4, "args": { "External id": 227602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227602, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 227602, "pid": 5, "tid": 7, "ts": 1716454225032533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964865, "dur": 6, "args": { "External id": 227602, "cbid": 211, "correlation": 227602 } }, { "ph": "s", "id": 227602, "pid": 76337, "tid": -914061504, "ts": 1716454224964865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225032539, "dur": 29, "args": { "External id": 227605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227605, "pid": 5, "tid": 7, "ts": 1716454225032539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964883, "dur": 6, "args": { "External id": 227605, "cbid": 211, "correlation": 227605 } }, { "ph": "s", "id": 227605, "pid": 76337, "tid": -914061504, "ts": 1716454224964883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225032570, "dur": 20, "args": { "External id": 227614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227614, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227614, "pid": 5, "tid": 7, "ts": 1716454225032570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224964923, "dur": 10, "args": { "External id": 227614, "cbid": 211, "correlation": 227614 } }, { "ph": "s", "id": 227614, "pid": 76337, "tid": -914061504, "ts": 1716454224964923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224964995, "dur": 0, "args": { "External id": 227624, "cbid": 317, "correlation": 227624 } }, { "ph": "f", "id": 227624, "pid": 76337, "tid": -914061504, "ts": 1716454224964995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224964996, "dur": 0, "args": { "External id": 227625, "cbid": 203, "correlation": 227625 } }, { "ph": "f", "id": 227625, "pid": 76337, "tid": -914061504, "ts": 1716454224964996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224964996, "dur": 0, "args": { "External id": 227626, "cbid": 205, "correlation": 227626 } }, { "ph": "f", "id": 227626, "pid": 76337, "tid": -914061504, "ts": 1716454224964996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225032592, "dur": 23, "args": { "External id": 227630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227630, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227630, "pid": 5, "tid": 7, "ts": 1716454225032592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965013, "dur": 12, "args": { "External id": 227630, "cbid": 211, "correlation": 227630 } }, { "ph": "s", "id": 227630, "pid": 76337, "tid": -914061504, "ts": 1716454224965013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225032616, "dur": 43, "args": { "External id": 227632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227632, "pid": 5, "tid": 7, "ts": 1716454225032616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965027, "dur": 5, "args": { "External id": 227632, "cbid": 211, "correlation": 227632 } }, { "ph": "s", "id": 227632, "pid": 76337, "tid": -914061504, "ts": 1716454224965027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225032660, "dur": 637, "args": { "External id": 227634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227634, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227634, "pid": 5, "tid": 7, "ts": 1716454225032660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965039, "dur": 6, "args": { "External id": 227634, "cbid": 211, "correlation": 227634 } }, { "ph": "s", "id": 227634, "pid": 76337, "tid": -914061504, "ts": 1716454224965039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225033298, "dur": 22, "args": { "External id": 227636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227636, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227636, "pid": 5, "tid": 7, "ts": 1716454225033298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965049, "dur": 5, "args": { "External id": 227636, "cbid": 211, "correlation": 227636 } }, { "ph": "s", "id": 227636, "pid": 76337, "tid": -914061504, "ts": 1716454224965049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225033322, "dur": 32, "args": { "External id": 227642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227642, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227642, "pid": 5, "tid": 7, "ts": 1716454225033322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965077, "dur": 9, "args": { "External id": 227642, "cbid": 211, "correlation": 227642 } }, { "ph": "s", "id": 227642, "pid": 76337, "tid": -914061504, "ts": 1716454224965077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224965137, "dur": 0, "args": { "External id": 227652, "cbid": 317, "correlation": 227652 } }, { "ph": "f", "id": 227652, "pid": 76337, "tid": -914061504, "ts": 1716454224965137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224965137, "dur": 0, "args": { "External id": 227653, "cbid": 203, "correlation": 227653 } }, { "ph": "f", "id": 227653, "pid": 76337, "tid": -914061504, "ts": 1716454224965137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224965138, "dur": 0, "args": { "External id": 227654, "cbid": 205, "correlation": 227654 } }, { "ph": "f", "id": 227654, "pid": 76337, "tid": -914061504, "ts": 1716454224965138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225033355, "dur": 55, "args": { "External id": 227658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227658, "pid": 5, "tid": 7, "ts": 1716454225033355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965151, "dur": 12, "args": { "External id": 227658, "cbid": 211, "correlation": 227658 } }, { "ph": "s", "id": 227658, "pid": 76337, "tid": -914061504, "ts": 1716454224965151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225033412, "dur": 265, "args": { "External id": 227660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227660, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227660, "pid": 5, "tid": 7, "ts": 1716454225033412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965169, "dur": 7, "args": { "External id": 227660, "cbid": 211, "correlation": 227660 } }, { "ph": "s", "id": 227660, "pid": 76337, "tid": -914061504, "ts": 1716454224965169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225033679, "dur": 22, "args": { "External id": 227662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227662, "pid": 5, "tid": 7, "ts": 1716454225033679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965180, "dur": 5, "args": { "External id": 227662, "cbid": 211, "correlation": 227662 } }, { "ph": "s", "id": 227662, "pid": 76337, "tid": -914061504, "ts": 1716454224965180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225033702, "dur": 31, "args": { "External id": 227668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227668, "pid": 5, "tid": 7, "ts": 1716454225033702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965206, "dur": 8, "args": { "External id": 227668, "cbid": 211, "correlation": 227668 } }, { "ph": "s", "id": 227668, "pid": 76337, "tid": -914061504, "ts": 1716454224965206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225033735, "dur": 27, "args": { "External id": 227676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227676, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227676, "pid": 5, "tid": 7, "ts": 1716454225033735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965234, "dur": 8, "args": { "External id": 227676, "cbid": 211, "correlation": 227676 } }, { "ph": "s", "id": 227676, "pid": 76337, "tid": -914061504, "ts": 1716454224965234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225033763, "dur": 19, "args": { "External id": 227684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227684, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227684, "pid": 5, "tid": 7, "ts": 1716454225033763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965264, "dur": 8, "args": { "External id": 227684, "cbid": 211, "correlation": 227684 } }, { "ph": "s", "id": 227684, "pid": 76337, "tid": -914061504, "ts": 1716454224965264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225033784, "dur": 30, "args": { "External id": 227704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227704, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 227704, "pid": 5, "tid": 7, "ts": 1716454225033784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965345, "dur": 13, "args": { "External id": 227704, "cbid": 211, "correlation": 227704 } }, { "ph": "s", "id": 227704, "pid": 76337, "tid": -914061504, "ts": 1716454224965345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225033815, "dur": 4, "args": { "External id": 227716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227716, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 227716, "pid": 5, "tid": 7, "ts": 1716454225033815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965367, "dur": 6, "args": { "External id": 227716, "cbid": 211, "correlation": 227716 } }, { "ph": "s", "id": 227716, "pid": 76337, "tid": -914061504, "ts": 1716454224965367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225033820, "dur": 30, "args": { "External id": 227719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227719, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227719, "pid": 5, "tid": 7, "ts": 1716454225033820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965385, "dur": 7, "args": { "External id": 227719, "cbid": 211, "correlation": 227719 } }, { "ph": "s", "id": 227719, "pid": 76337, "tid": -914061504, "ts": 1716454224965385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224965443, "dur": 0, "args": { "External id": 227730, "cbid": 317, "correlation": 227730 } }, { "ph": "f", "id": 227730, "pid": 76337, "tid": -914061504, "ts": 1716454224965443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224965444, "dur": 0, "args": { "External id": 227731, "cbid": 203, "correlation": 227731 } }, { "ph": "f", "id": 227731, "pid": 76337, "tid": -914061504, "ts": 1716454224965444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224965444, "dur": 0, "args": { "External id": 227732, "cbid": 205, "correlation": 227732 } }, { "ph": "f", "id": 227732, "pid": 76337, "tid": -914061504, "ts": 1716454224965444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225033852, "dur": 22, "args": { "External id": 227736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227736, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227736, "pid": 5, "tid": 7, "ts": 1716454225033852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965459, "dur": 12, "args": { "External id": 227736, "cbid": 211, "correlation": 227736 } }, { "ph": "s", "id": 227736, "pid": 76337, "tid": -914061504, "ts": 1716454224965459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225033875, "dur": 103, "args": { "External id": 227738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227738, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227738, "pid": 5, "tid": 7, "ts": 1716454225033875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965477, "dur": 6, "args": { "External id": 227738, "cbid": 211, "correlation": 227738 } }, { "ph": "s", "id": 227738, "pid": 76337, "tid": -914061504, "ts": 1716454224965477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225033979, "dur": 22, "args": { "External id": 227740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227740, "pid": 5, "tid": 7, "ts": 1716454225033979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965486, "dur": 5, "args": { "External id": 227740, "cbid": 211, "correlation": 227740 } }, { "ph": "s", "id": 227740, "pid": 76337, "tid": -914061504, "ts": 1716454224965486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225034003, "dur": 32, "args": { "External id": 227746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227746, "pid": 5, "tid": 7, "ts": 1716454225034003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965515, "dur": 9, "args": { "External id": 227746, "cbid": 211, "correlation": 227746 } }, { "ph": "s", "id": 227746, "pid": 76337, "tid": -914061504, "ts": 1716454224965515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225034036, "dur": 195, "args": { "External id": 227755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227755, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227755, "pid": 5, "tid": 7, "ts": 1716454225034036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965595, "dur": 14, "args": { "External id": 227755, "cbid": 211, "correlation": 227755 } }, { "ph": "s", "id": 227755, "pid": 76337, "tid": -914061504, "ts": 1716454224965595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225034232, "dur": 63, "args": { "External id": 227777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227777, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227777, "pid": 5, "tid": 7, "ts": 1716454225034232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965653, "dur": 10, "args": { "External id": 227777, "cbid": 211, "correlation": 227777 } }, { "ph": "s", "id": 227777, "pid": 76337, "tid": -914061504, "ts": 1716454224965653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224965741, "dur": 1, "args": { "External id": 227788, "cbid": 251, "correlation": 227788 } }, { "ph": "f", "id": 227788, "pid": 76337, "tid": -914061504, "ts": 1716454224965741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225034297, "dur": 152, "args": { "External id": 227789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227789, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227789, "pid": 5, "tid": 7, "ts": 1716454225034297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965746, "dur": 15, "args": { "External id": 227789, "cbid": 211, "correlation": 227789 } }, { "ph": "s", "id": 227789, "pid": 76337, "tid": -914061504, "ts": 1716454224965746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224965817, "dur": 1, "args": { "External id": 227800, "cbid": 251, "correlation": 227800 } }, { "ph": "f", "id": 227800, "pid": 76337, "tid": -914061504, "ts": 1716454224965817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225034450, "dur": 145, "args": { "External id": 227801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227801, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227801, "pid": 5, "tid": 7, "ts": 1716454225034450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965821, "dur": 11, "args": { "External id": 227801, "cbid": 211, "correlation": 227801 } }, { "ph": "s", "id": 227801, "pid": 76337, "tid": -914061504, "ts": 1716454224965821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224965887, "dur": 1, "args": { "External id": 227812, "cbid": 251, "correlation": 227812 } }, { "ph": "f", "id": 227812, "pid": 76337, "tid": -914061504, "ts": 1716454224965887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225034597, "dur": 144, "args": { "External id": 227813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227813, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227813, "pid": 5, "tid": 7, "ts": 1716454225034597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965891, "dur": 11, "args": { "External id": 227813, "cbid": 211, "correlation": 227813 } }, { "ph": "s", "id": 227813, "pid": 76337, "tid": -914061504, "ts": 1716454224965891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225034742, "dur": 1907, "args": { "External id": 227834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227834, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 227834, "pid": 5, "tid": 7, "ts": 1716454225034742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224965981, "dur": 13, "args": { "External id": 227834, "cbid": 211, "correlation": 227834 } }, { "ph": "s", "id": 227834, "pid": 76337, "tid": -914061504, "ts": 1716454224965981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966082, "dur": 1, "args": { "External id": 227852, "cbid": 251, "correlation": 227852 } }, { "ph": "f", "id": 227852, "pid": 76337, "tid": -914061504, "ts": 1716454224966082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225036650, "dur": 148, "args": { "External id": 227854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227854, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 227854, "pid": 5, "tid": 7, "ts": 1716454225036650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966088, "dur": 13, "args": { "External id": 227854, "cbid": 211, "correlation": 227854 } }, { "ph": "s", "id": 227854, "pid": 76337, "tid": -914061504, "ts": 1716454224966088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225036799, "dur": 35, "args": { "External id": 227862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227862, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227862, "pid": 5, "tid": 7, "ts": 1716454225036799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966159, "dur": 12, "args": { "External id": 227862, "cbid": 211, "correlation": 227862 } }, { "ph": "s", "id": 227862, "pid": 76337, "tid": -914061504, "ts": 1716454224966159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225036836, "dur": 52, "args": { "External id": 227870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227870, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227870, "pid": 5, "tid": 7, "ts": 1716454225036836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966197, "dur": 9, "args": { "External id": 227870, "cbid": 211, "correlation": 227870 } }, { "ph": "s", "id": 227870, "pid": 76337, "tid": -914061504, "ts": 1716454224966197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225036889, "dur": 30, "args": { "External id": 227881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227881, "pid": 5, "tid": 7, "ts": 1716454225036889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966269, "dur": 12, "args": { "External id": 227881, "cbid": 211, "correlation": 227881 } }, { "ph": "s", "id": 227881, "pid": 76337, "tid": -914061504, "ts": 1716454224966269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225036921, "dur": 33, "args": { "External id": 227903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227903, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227903, "pid": 5, "tid": 7, "ts": 1716454225036921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966300, "dur": 8, "args": { "External id": 227903, "cbid": 211, "correlation": 227903 } }, { "ph": "s", "id": 227903, "pid": 76337, "tid": -914061504, "ts": 1716454224966300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966386, "dur": 1, "args": { "External id": 227914, "cbid": 251, "correlation": 227914 } }, { "ph": "f", "id": 227914, "pid": 76337, "tid": -914061504, "ts": 1716454224966386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225036955, "dur": 88, "args": { "External id": 227915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227915, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227915, "pid": 5, "tid": 7, "ts": 1716454225036955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966391, "dur": 13, "args": { "External id": 227915, "cbid": 211, "correlation": 227915 } }, { "ph": "s", "id": 227915, "pid": 76337, "tid": -914061504, "ts": 1716454224966391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966461, "dur": 1, "args": { "External id": 227926, "cbid": 251, "correlation": 227926 } }, { "ph": "f", "id": 227926, "pid": 76337, "tid": -914061504, "ts": 1716454224966461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966464, "dur": 0, "args": { "External id": 227927, "cbid": 251, "correlation": 227927 } }, { "ph": "f", "id": 227927, "pid": 76337, "tid": -914061504, "ts": 1716454224966464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225037045, "dur": 10, "args": { "External id": 227928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227928, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 227928, "pid": 5, "tid": 7, "ts": 1716454225037045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966466, "dur": 12, "args": { "External id": 227928, "cbid": 211, "correlation": 227928 } }, { "ph": "s", "id": 227928, "pid": 76337, "tid": -914061504, "ts": 1716454224966466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225037057, "dur": 5, "args": { "External id": 227930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227930, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 227930, "pid": 5, "tid": 7, "ts": 1716454225037057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966480, "dur": 6, "args": { "External id": 227930, "cbid": 211, "correlation": 227930 } }, { "ph": "s", "id": 227930, "pid": 76337, "tid": -914061504, "ts": 1716454224966480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966538, "dur": 1, "args": { "External id": 227941, "cbid": 251, "correlation": 227941 } }, { "ph": "f", "id": 227941, "pid": 76337, "tid": -914061504, "ts": 1716454224966538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966541, "dur": 0, "args": { "External id": 227942, "cbid": 251, "correlation": 227942 } }, { "ph": "f", "id": 227942, "pid": 76337, "tid": -914061504, "ts": 1716454224966541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225037063, "dur": 7, "args": { "External id": 227943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227943, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 227943, "pid": 5, "tid": 7, "ts": 1716454225037063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966542, "dur": 12, "args": { "External id": 227943, "cbid": 211, "correlation": 227943 } }, { "ph": "s", "id": 227943, "pid": 76337, "tid": -914061504, "ts": 1716454224966542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225037071, "dur": 3, "args": { "External id": 227945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227945, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 227945, "pid": 5, "tid": 7, "ts": 1716454225037071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966556, "dur": 5, "args": { "External id": 227945, "cbid": 211, "correlation": 227945 } }, { "ph": "s", "id": 227945, "pid": 76337, "tid": -914061504, "ts": 1716454224966556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225037076, "dur": 89, "args": { "External id": 227966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227966, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 227966, "pid": 5, "tid": 7, "ts": 1716454225037076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966629, "dur": 12, "args": { "External id": 227966, "cbid": 211, "correlation": 227966 } }, { "ph": "s", "id": 227966, "pid": 76337, "tid": -914061504, "ts": 1716454224966629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966726, "dur": 1, "args": { "External id": 227984, "cbid": 251, "correlation": 227984 } }, { "ph": "f", "id": 227984, "pid": 76337, "tid": -914061504, "ts": 1716454224966726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225037166, "dur": 95, "args": { "External id": 227986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227986, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 227986, "pid": 5, "tid": 7, "ts": 1716454225037166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966732, "dur": 13, "args": { "External id": 227986, "cbid": 211, "correlation": 227986 } }, { "ph": "s", "id": 227986, "pid": 76337, "tid": -914061504, "ts": 1716454224966732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225037262, "dur": 19, "args": { "External id": 227994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 227994, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 227994, "pid": 5, "tid": 7, "ts": 1716454225037262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966800, "dur": 13, "args": { "External id": 227994, "cbid": 211, "correlation": 227994 } }, { "ph": "s", "id": 227994, "pid": 76337, "tid": -914061504, "ts": 1716454224966800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225037283, "dur": 37, "args": { "External id": 228002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228002, "pid": 5, "tid": 7, "ts": 1716454225037283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966841, "dur": 9, "args": { "External id": 228002, "cbid": 211, "correlation": 228002 } }, { "ph": "s", "id": 228002, "pid": 76337, "tid": -914061504, "ts": 1716454224966841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225037321, "dur": 34, "args": { "External id": 228024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228024, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228024, "pid": 5, "tid": 7, "ts": 1716454225037321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966892, "dur": 10, "args": { "External id": 228024, "cbid": 211, "correlation": 228024 } }, { "ph": "s", "id": 228024, "pid": 76337, "tid": -914061504, "ts": 1716454224966892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966990, "dur": 1, "args": { "External id": 228040, "cbid": 251, "correlation": 228040 } }, { "ph": "f", "id": 228040, "pid": 76337, "tid": -914061504, "ts": 1716454224966990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224966995, "dur": 0, "args": { "External id": 228042, "cbid": 251, "correlation": 228042 } }, { "ph": "f", "id": 228042, "pid": 76337, "tid": -914061504, "ts": 1716454224966995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225037356, "dur": 529, "args": { "External id": 228043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228043, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 228043, "pid": 5, "tid": 7, "ts": 1716454225037356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224966999, "dur": 14, "args": { "External id": 228043, "cbid": 211, "correlation": 228043 } }, { "ph": "s", "id": 228043, "pid": 76337, "tid": -914061504, "ts": 1716454224966999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225037887, "dur": 124, "args": { "External id": 228051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228051, "pid": 5, "tid": 7, "ts": 1716454225037887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967068, "dur": 13, "args": { "External id": 228051, "cbid": 211, "correlation": 228051 } }, { "ph": "s", "id": 228051, "pid": 76337, "tid": -914061504, "ts": 1716454224967068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225038012, "dur": 128, "args": { "External id": 228059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228059, "pid": 5, "tid": 7, "ts": 1716454225038012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967100, "dur": 9, "args": { "External id": 228059, "cbid": 211, "correlation": 228059 } }, { "ph": "s", "id": 228059, "pid": 76337, "tid": -914061504, "ts": 1716454224967100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224967179, "dur": 1, "args": { "External id": 228075, "cbid": 251, "correlation": 228075 } }, { "ph": "f", "id": 228075, "pid": 76337, "tid": -914061504, "ts": 1716454224967179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225038141, "dur": 304, "args": { "External id": 228077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228077, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228077, "pid": 5, "tid": 7, "ts": 1716454225038141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967184, "dur": 13, "args": { "External id": 228077, "cbid": 211, "correlation": 228077 } }, { "ph": "s", "id": 228077, "pid": 76337, "tid": -914061504, "ts": 1716454224967184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225038447, "dur": 27, "args": { "External id": 228085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228085, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228085, "pid": 5, "tid": 7, "ts": 1716454225038447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967227, "dur": 10, "args": { "External id": 228085, "cbid": 211, "correlation": 228085 } }, { "ph": "s", "id": 228085, "pid": 76337, "tid": -914061504, "ts": 1716454224967227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225038475, "dur": 80, "args": { "External id": 228096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228096, "pid": 5, "tid": 7, "ts": 1716454225038475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967294, "dur": 13, "args": { "External id": 228096, "cbid": 211, "correlation": 228096 } }, { "ph": "s", "id": 228096, "pid": 76337, "tid": -914061504, "ts": 1716454224967294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224967358, "dur": 0, "args": { "External id": 228108, "cbid": 317, "correlation": 228108 } }, { "ph": "f", "id": 228108, "pid": 76337, "tid": -914061504, "ts": 1716454224967358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224967359, "dur": 0, "args": { "External id": 228109, "cbid": 203, "correlation": 228109 } }, { "ph": "f", "id": 228109, "pid": 76337, "tid": -914061504, "ts": 1716454224967359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224967360, "dur": 0, "args": { "External id": 228110, "cbid": 205, "correlation": 228110 } }, { "ph": "f", "id": 228110, "pid": 76337, "tid": -914061504, "ts": 1716454224967360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225038557, "dur": 22, "args": { "External id": 228114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228114, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228114, "pid": 5, "tid": 7, "ts": 1716454225038557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967375, "dur": 12, "args": { "External id": 228114, "cbid": 211, "correlation": 228114 } }, { "ph": "s", "id": 228114, "pid": 76337, "tid": -914061504, "ts": 1716454224967375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225038580, "dur": 117, "args": { "External id": 228116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228116, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228116, "pid": 5, "tid": 7, "ts": 1716454225038580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967394, "dur": 8, "args": { "External id": 228116, "cbid": 211, "correlation": 228116 } }, { "ph": "s", "id": 228116, "pid": 76337, "tid": -914061504, "ts": 1716454224967394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225038699, "dur": 23, "args": { "External id": 228118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228118, "pid": 5, "tid": 7, "ts": 1716454225038699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967405, "dur": 5, "args": { "External id": 228118, "cbid": 211, "correlation": 228118 } }, { "ph": "s", "id": 228118, "pid": 76337, "tid": -914061504, "ts": 1716454224967405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225038723, "dur": 32, "args": { "External id": 228124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228124, "pid": 5, "tid": 7, "ts": 1716454225038723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967433, "dur": 8, "args": { "External id": 228124, "cbid": 211, "correlation": 228124 } }, { "ph": "s", "id": 228124, "pid": 76337, "tid": -914061504, "ts": 1716454224967433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225038757, "dur": 26, "args": { "External id": 228132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228132, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228132, "pid": 5, "tid": 7, "ts": 1716454225038757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967464, "dur": 8, "args": { "External id": 228132, "cbid": 211, "correlation": 228132 } }, { "ph": "s", "id": 228132, "pid": 76337, "tid": -914061504, "ts": 1716454224967464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225038784, "dur": 52, "args": { "External id": 228141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228141, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228141, "pid": 5, "tid": 7, "ts": 1716454225038784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967504, "dur": 10, "args": { "External id": 228141, "cbid": 211, "correlation": 228141 } }, { "ph": "s", "id": 228141, "pid": 76337, "tid": -914061504, "ts": 1716454224967504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225038837, "dur": 53, "args": { "External id": 228161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228161, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 228161, "pid": 5, "tid": 7, "ts": 1716454225038837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967574, "dur": 11, "args": { "External id": 228161, "cbid": 211, "correlation": 228161 } }, { "ph": "s", "id": 228161, "pid": 76337, "tid": -914061504, "ts": 1716454224967574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225038891, "dur": 5, "args": { "External id": 228173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228173, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 228173, "pid": 5, "tid": 7, "ts": 1716454225038891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967595, "dur": 7, "args": { "External id": 228173, "cbid": 211, "correlation": 228173 } }, { "ph": "s", "id": 228173, "pid": 76337, "tid": -914061504, "ts": 1716454224967595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225038897, "dur": 55, "args": { "External id": 228176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228176, "pid": 5, "tid": 7, "ts": 1716454225038897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967614, "dur": 6, "args": { "External id": 228176, "cbid": 211, "correlation": 228176 } }, { "ph": "s", "id": 228176, "pid": 76337, "tid": -914061504, "ts": 1716454224967614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225038953, "dur": 37, "args": { "External id": 228185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228185, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228185, "pid": 5, "tid": 7, "ts": 1716454225038953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967653, "dur": 10, "args": { "External id": 228185, "cbid": 211, "correlation": 228185 } }, { "ph": "s", "id": 228185, "pid": 76337, "tid": -914061504, "ts": 1716454224967653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224967706, "dur": 0, "args": { "External id": 228195, "cbid": 317, "correlation": 228195 } }, { "ph": "f", "id": 228195, "pid": 76337, "tid": -914061504, "ts": 1716454224967706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224967707, "dur": 0, "args": { "External id": 228196, "cbid": 203, "correlation": 228196 } }, { "ph": "f", "id": 228196, "pid": 76337, "tid": -914061504, "ts": 1716454224967707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224967707, "dur": 0, "args": { "External id": 228197, "cbid": 205, "correlation": 228197 } }, { "ph": "f", "id": 228197, "pid": 76337, "tid": -914061504, "ts": 1716454224967707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225038991, "dur": 38, "args": { "External id": 228201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228201, "pid": 5, "tid": 7, "ts": 1716454225038991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967721, "dur": 11, "args": { "External id": 228201, "cbid": 211, "correlation": 228201 } }, { "ph": "s", "id": 228201, "pid": 76337, "tid": -914061504, "ts": 1716454224967721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225039031, "dur": 82, "args": { "External id": 228203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228203, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228203, "pid": 5, "tid": 7, "ts": 1716454225039031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967735, "dur": 5, "args": { "External id": 228203, "cbid": 211, "correlation": 228203 } }, { "ph": "s", "id": 228203, "pid": 76337, "tid": -914061504, "ts": 1716454224967735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225039114, "dur": 1265, "args": { "External id": 228205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228205, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228205, "pid": 5, "tid": 7, "ts": 1716454225039114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967746, "dur": 7, "args": { "External id": 228205, "cbid": 211, "correlation": 228205 } }, { "ph": "s", "id": 228205, "pid": 76337, "tid": -914061504, "ts": 1716454224967746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225040381, "dur": 22, "args": { "External id": 228207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228207, "pid": 5, "tid": 7, "ts": 1716454225040381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967757, "dur": 5, "args": { "External id": 228207, "cbid": 211, "correlation": 228207 } }, { "ph": "s", "id": 228207, "pid": 76337, "tid": -914061504, "ts": 1716454224967757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225040403, "dur": 33, "args": { "External id": 228213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228213, "pid": 5, "tid": 7, "ts": 1716454225040403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967784, "dur": 9, "args": { "External id": 228213, "cbid": 211, "correlation": 228213 } }, { "ph": "s", "id": 228213, "pid": 76337, "tid": -914061504, "ts": 1716454224967784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225040437, "dur": 3, "args": { "External id": 228221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228221, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 228221, "pid": 5, "tid": 7, "ts": 1716454225040437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967827, "dur": 9, "args": { "External id": 228221, "cbid": 211, "correlation": 228221 } }, { "ph": "s", "id": 228221, "pid": 76337, "tid": -914061504, "ts": 1716454224967827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224967892, "dur": 1, "args": { "External id": 228237, "cbid": 251, "correlation": 228237 } }, { "ph": "f", "id": 228237, "pid": 76337, "tid": -914061504, "ts": 1716454224967892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224967898, "dur": 0, "args": { "External id": 228239, "cbid": 251, "correlation": 228239 } }, { "ph": "f", "id": 228239, "pid": 76337, "tid": -914061504, "ts": 1716454224967898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225040442, "dur": 12, "args": { "External id": 228240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228240, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 228240, "pid": 5, "tid": 7, "ts": 1716454225040442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967900, "dur": 11, "args": { "External id": 228240, "cbid": 211, "correlation": 228240 } }, { "ph": "s", "id": 228240, "pid": 76337, "tid": -914061504, "ts": 1716454224967900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225040455, "dur": 5, "args": { "External id": 228242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228242, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 228242, "pid": 5, "tid": 7, "ts": 1716454225040455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967913, "dur": 5, "args": { "External id": 228242, "cbid": 211, "correlation": 228242 } }, { "ph": "s", "id": 228242, "pid": 76337, "tid": -914061504, "ts": 1716454224967913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225040462, "dur": 29, "args": { "External id": 228252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228252, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228252, "pid": 5, "tid": 7, "ts": 1716454225040462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224967971, "dur": 20, "args": { "External id": 228252, "cbid": 211, "correlation": 228252 } }, { "ph": "s", "id": 228252, "pid": 76337, "tid": -914061504, "ts": 1716454224967971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225040492, "dur": 30, "args": { "External id": 228272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228272, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 228272, "pid": 5, "tid": 7, "ts": 1716454225040492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968045, "dur": 11, "args": { "External id": 228272, "cbid": 211, "correlation": 228272 } }, { "ph": "s", "id": 228272, "pid": 76337, "tid": -914061504, "ts": 1716454224968045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225040523, "dur": 4, "args": { "External id": 228284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228284, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 228284, "pid": 5, "tid": 7, "ts": 1716454225040523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968066, "dur": 6, "args": { "External id": 228284, "cbid": 211, "correlation": 228284 } }, { "ph": "s", "id": 228284, "pid": 76337, "tid": -914061504, "ts": 1716454224968066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225040529, "dur": 30, "args": { "External id": 228287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228287, "pid": 5, "tid": 7, "ts": 1716454225040529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968084, "dur": 7, "args": { "External id": 228287, "cbid": 211, "correlation": 228287 } }, { "ph": "s", "id": 228287, "pid": 76337, "tid": -914061504, "ts": 1716454224968084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225040560, "dur": 20, "args": { "External id": 228296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228296, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228296, "pid": 5, "tid": 7, "ts": 1716454225040560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968125, "dur": 10, "args": { "External id": 228296, "cbid": 211, "correlation": 228296 } }, { "ph": "s", "id": 228296, "pid": 76337, "tid": -914061504, "ts": 1716454224968125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224968187, "dur": 0, "args": { "External id": 228306, "cbid": 317, "correlation": 228306 } }, { "ph": "f", "id": 228306, "pid": 76337, "tid": -914061504, "ts": 1716454224968187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224968188, "dur": 0, "args": { "External id": 228307, "cbid": 203, "correlation": 228307 } }, { "ph": "f", "id": 228307, "pid": 76337, "tid": -914061504, "ts": 1716454224968188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224968188, "dur": 0, "args": { "External id": 228308, "cbid": 205, "correlation": 228308 } }, { "ph": "f", "id": 228308, "pid": 76337, "tid": -914061504, "ts": 1716454224968188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225040581, "dur": 22, "args": { "External id": 228312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228312, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228312, "pid": 5, "tid": 7, "ts": 1716454225040581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968203, "dur": 12, "args": { "External id": 228312, "cbid": 211, "correlation": 228312 } }, { "ph": "s", "id": 228312, "pid": 76337, "tid": -914061504, "ts": 1716454224968203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225040605, "dur": 43, "args": { "External id": 228314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228314, "pid": 5, "tid": 7, "ts": 1716454225040605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968217, "dur": 5, "args": { "External id": 228314, "cbid": 211, "correlation": 228314 } }, { "ph": "s", "id": 228314, "pid": 76337, "tid": -914061504, "ts": 1716454224968217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225040649, "dur": 639, "args": { "External id": 228316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228316, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228316, "pid": 5, "tid": 7, "ts": 1716454225040649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968228, "dur": 6, "args": { "External id": 228316, "cbid": 211, "correlation": 228316 } }, { "ph": "s", "id": 228316, "pid": 76337, "tid": -914061504, "ts": 1716454224968228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225041289, "dur": 22, "args": { "External id": 228318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228318, "pid": 5, "tid": 7, "ts": 1716454225041289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968237, "dur": 5, "args": { "External id": 228318, "cbid": 211, "correlation": 228318 } }, { "ph": "s", "id": 228318, "pid": 76337, "tid": -914061504, "ts": 1716454224968237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225041312, "dur": 32, "args": { "External id": 228324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228324, "pid": 5, "tid": 7, "ts": 1716454225041312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968266, "dur": 9, "args": { "External id": 228324, "cbid": 211, "correlation": 228324 } }, { "ph": "s", "id": 228324, "pid": 76337, "tid": -914061504, "ts": 1716454224968266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224968325, "dur": 0, "args": { "External id": 228334, "cbid": 317, "correlation": 228334 } }, { "ph": "f", "id": 228334, "pid": 76337, "tid": -914061504, "ts": 1716454224968325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224968325, "dur": 0, "args": { "External id": 228335, "cbid": 203, "correlation": 228335 } }, { "ph": "f", "id": 228335, "pid": 76337, "tid": -914061504, "ts": 1716454224968325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224968326, "dur": 0, "args": { "External id": 228336, "cbid": 205, "correlation": 228336 } }, { "ph": "f", "id": 228336, "pid": 76337, "tid": -914061504, "ts": 1716454224968326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225041346, "dur": 38, "args": { "External id": 228340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228340, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228340, "pid": 5, "tid": 7, "ts": 1716454225041346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968338, "dur": 14, "args": { "External id": 228340, "cbid": 211, "correlation": 228340 } }, { "ph": "s", "id": 228340, "pid": 76337, "tid": -914061504, "ts": 1716454224968338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225041385, "dur": 186, "args": { "External id": 228342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228342, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228342, "pid": 5, "tid": 7, "ts": 1716454225041385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968358, "dur": 6, "args": { "External id": 228342, "cbid": 211, "correlation": 228342 } }, { "ph": "s", "id": 228342, "pid": 76337, "tid": -914061504, "ts": 1716454224968358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225041573, "dur": 22, "args": { "External id": 228344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228344, "pid": 5, "tid": 7, "ts": 1716454225041573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968368, "dur": 5, "args": { "External id": 228344, "cbid": 211, "correlation": 228344 } }, { "ph": "s", "id": 228344, "pid": 76337, "tid": -914061504, "ts": 1716454224968368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225041596, "dur": 32, "args": { "External id": 228350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228350, "pid": 5, "tid": 7, "ts": 1716454225041596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968393, "dur": 9, "args": { "External id": 228350, "cbid": 211, "correlation": 228350 } }, { "ph": "s", "id": 228350, "pid": 76337, "tid": -914061504, "ts": 1716454224968393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225041630, "dur": 27, "args": { "External id": 228358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228358, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228358, "pid": 5, "tid": 7, "ts": 1716454225041630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968423, "dur": 8, "args": { "External id": 228358, "cbid": 211, "correlation": 228358 } }, { "ph": "s", "id": 228358, "pid": 76337, "tid": -914061504, "ts": 1716454224968423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225041658, "dur": 20, "args": { "External id": 228366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228366, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228366, "pid": 5, "tid": 7, "ts": 1716454225041658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968452, "dur": 8, "args": { "External id": 228366, "cbid": 211, "correlation": 228366 } }, { "ph": "s", "id": 228366, "pid": 76337, "tid": -914061504, "ts": 1716454224968452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225041679, "dur": 31, "args": { "External id": 228386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228386, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 228386, "pid": 5, "tid": 7, "ts": 1716454225041679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968532, "dur": 12, "args": { "External id": 228386, "cbid": 211, "correlation": 228386 } }, { "ph": "s", "id": 228386, "pid": 76337, "tid": -914061504, "ts": 1716454224968532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225041711, "dur": 4, "args": { "External id": 228398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228398, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 228398, "pid": 5, "tid": 7, "ts": 1716454225041711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968555, "dur": 6, "args": { "External id": 228398, "cbid": 211, "correlation": 228398 } }, { "ph": "s", "id": 228398, "pid": 76337, "tid": -914061504, "ts": 1716454224968555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225041717, "dur": 30, "args": { "External id": 228401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228401, "pid": 5, "tid": 7, "ts": 1716454225041717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968572, "dur": 7, "args": { "External id": 228401, "cbid": 211, "correlation": 228401 } }, { "ph": "s", "id": 228401, "pid": 76337, "tid": -914061504, "ts": 1716454224968572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224968629, "dur": 0, "args": { "External id": 228412, "cbid": 317, "correlation": 228412 } }, { "ph": "f", "id": 228412, "pid": 76337, "tid": -914061504, "ts": 1716454224968629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224968630, "dur": 0, "args": { "External id": 228413, "cbid": 203, "correlation": 228413 } }, { "ph": "f", "id": 228413, "pid": 76337, "tid": -914061504, "ts": 1716454224968630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224968631, "dur": 0, "args": { "External id": 228414, "cbid": 205, "correlation": 228414 } }, { "ph": "f", "id": 228414, "pid": 76337, "tid": -914061504, "ts": 1716454224968631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225041748, "dur": 23, "args": { "External id": 228418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228418, "pid": 5, "tid": 7, "ts": 1716454225041748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968644, "dur": 12, "args": { "External id": 228418, "cbid": 211, "correlation": 228418 } }, { "ph": "s", "id": 228418, "pid": 76337, "tid": -914061504, "ts": 1716454224968644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225041772, "dur": 103, "args": { "External id": 228420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228420, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228420, "pid": 5, "tid": 7, "ts": 1716454225041772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968662, "dur": 6, "args": { "External id": 228420, "cbid": 211, "correlation": 228420 } }, { "ph": "s", "id": 228420, "pid": 76337, "tid": -914061504, "ts": 1716454224968662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225041876, "dur": 22, "args": { "External id": 228422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228422, "pid": 5, "tid": 7, "ts": 1716454225041876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968672, "dur": 5, "args": { "External id": 228422, "cbid": 211, "correlation": 228422 } }, { "ph": "s", "id": 228422, "pid": 76337, "tid": -914061504, "ts": 1716454224968672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225041900, "dur": 32, "args": { "External id": 228428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228428, "pid": 5, "tid": 7, "ts": 1716454225041900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968699, "dur": 9, "args": { "External id": 228428, "cbid": 211, "correlation": 228428 } }, { "ph": "s", "id": 228428, "pid": 76337, "tid": -914061504, "ts": 1716454224968699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225041933, "dur": 194, "args": { "External id": 228437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228437, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228437, "pid": 5, "tid": 7, "ts": 1716454225041933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968779, "dur": 14, "args": { "External id": 228437, "cbid": 211, "correlation": 228437 } }, { "ph": "s", "id": 228437, "pid": 76337, "tid": -914061504, "ts": 1716454224968779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225042128, "dur": 63, "args": { "External id": 228459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228459, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228459, "pid": 5, "tid": 7, "ts": 1716454225042128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968836, "dur": 10, "args": { "External id": 228459, "cbid": 211, "correlation": 228459 } }, { "ph": "s", "id": 228459, "pid": 76337, "tid": -914061504, "ts": 1716454224968836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224968925, "dur": 1, "args": { "External id": 228470, "cbid": 251, "correlation": 228470 } }, { "ph": "f", "id": 228470, "pid": 76337, "tid": -914061504, "ts": 1716454224968925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225042193, "dur": 151, "args": { "External id": 228471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228471, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228471, "pid": 5, "tid": 7, "ts": 1716454225042193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224968930, "dur": 13, "args": { "External id": 228471, "cbid": 211, "correlation": 228471 } }, { "ph": "s", "id": 228471, "pid": 76337, "tid": -914061504, "ts": 1716454224968930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969011, "dur": 1, "args": { "External id": 228482, "cbid": 251, "correlation": 228482 } }, { "ph": "f", "id": 228482, "pid": 76337, "tid": -914061504, "ts": 1716454224969011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225042345, "dur": 146, "args": { "External id": 228483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228483, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228483, "pid": 5, "tid": 7, "ts": 1716454225042345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969015, "dur": 12, "args": { "External id": 228483, "cbid": 211, "correlation": 228483 } }, { "ph": "s", "id": 228483, "pid": 76337, "tid": -914061504, "ts": 1716454224969015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969083, "dur": 1, "args": { "External id": 228494, "cbid": 251, "correlation": 228494 } }, { "ph": "f", "id": 228494, "pid": 76337, "tid": -914061504, "ts": 1716454224969083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225042493, "dur": 143, "args": { "External id": 228495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228495, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228495, "pid": 5, "tid": 7, "ts": 1716454225042493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969087, "dur": 11, "args": { "External id": 228495, "cbid": 211, "correlation": 228495 } }, { "ph": "s", "id": 228495, "pid": 76337, "tid": -914061504, "ts": 1716454224969087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225042637, "dur": 1908, "args": { "External id": 228516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228516, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 228516, "pid": 5, "tid": 7, "ts": 1716454225042637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969168, "dur": 13, "args": { "External id": 228516, "cbid": 211, "correlation": 228516 } }, { "ph": "s", "id": 228516, "pid": 76337, "tid": -914061504, "ts": 1716454224969168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969268, "dur": 1, "args": { "External id": 228534, "cbid": 251, "correlation": 228534 } }, { "ph": "f", "id": 228534, "pid": 76337, "tid": -914061504, "ts": 1716454224969268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225044547, "dur": 148, "args": { "External id": 228536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228536, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 228536, "pid": 5, "tid": 7, "ts": 1716454225044547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969274, "dur": 13, "args": { "External id": 228536, "cbid": 211, "correlation": 228536 } }, { "ph": "s", "id": 228536, "pid": 76337, "tid": -914061504, "ts": 1716454224969274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225044696, "dur": 35, "args": { "External id": 228544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228544, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228544, "pid": 5, "tid": 7, "ts": 1716454225044696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969345, "dur": 12, "args": { "External id": 228544, "cbid": 211, "correlation": 228544 } }, { "ph": "s", "id": 228544, "pid": 76337, "tid": -914061504, "ts": 1716454224969345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225044732, "dur": 51, "args": { "External id": 228552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228552, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228552, "pid": 5, "tid": 7, "ts": 1716454225044732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969384, "dur": 8, "args": { "External id": 228552, "cbid": 211, "correlation": 228552 } }, { "ph": "s", "id": 228552, "pid": 76337, "tid": -914061504, "ts": 1716454224969384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225044784, "dur": 29, "args": { "External id": 228563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228563, "pid": 5, "tid": 7, "ts": 1716454225044784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969455, "dur": 13, "args": { "External id": 228563, "cbid": 211, "correlation": 228563 } }, { "ph": "s", "id": 228563, "pid": 76337, "tid": -914061504, "ts": 1716454224969455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225044815, "dur": 34, "args": { "External id": 228585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228585, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228585, "pid": 5, "tid": 7, "ts": 1716454225044815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969487, "dur": 7, "args": { "External id": 228585, "cbid": 211, "correlation": 228585 } }, { "ph": "s", "id": 228585, "pid": 76337, "tid": -914061504, "ts": 1716454224969487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969572, "dur": 1, "args": { "External id": 228596, "cbid": 251, "correlation": 228596 } }, { "ph": "f", "id": 228596, "pid": 76337, "tid": -914061504, "ts": 1716454224969572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225044850, "dur": 89, "args": { "External id": 228597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228597, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228597, "pid": 5, "tid": 7, "ts": 1716454225044850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969577, "dur": 13, "args": { "External id": 228597, "cbid": 211, "correlation": 228597 } }, { "ph": "s", "id": 228597, "pid": 76337, "tid": -914061504, "ts": 1716454224969577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969648, "dur": 1, "args": { "External id": 228608, "cbid": 251, "correlation": 228608 } }, { "ph": "f", "id": 228608, "pid": 76337, "tid": -914061504, "ts": 1716454224969648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969652, "dur": 0, "args": { "External id": 228609, "cbid": 251, "correlation": 228609 } }, { "ph": "f", "id": 228609, "pid": 76337, "tid": -914061504, "ts": 1716454224969652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225044941, "dur": 11, "args": { "External id": 228610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228610, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 228610, "pid": 5, "tid": 7, "ts": 1716454225044941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969653, "dur": 12, "args": { "External id": 228610, "cbid": 211, "correlation": 228610 } }, { "ph": "s", "id": 228610, "pid": 76337, "tid": -914061504, "ts": 1716454224969653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225044953, "dur": 5, "args": { "External id": 228612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228612, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 228612, "pid": 5, "tid": 7, "ts": 1716454225044953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969667, "dur": 6, "args": { "External id": 228612, "cbid": 211, "correlation": 228612 } }, { "ph": "s", "id": 228612, "pid": 76337, "tid": -914061504, "ts": 1716454224969667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969725, "dur": 1, "args": { "External id": 228623, "cbid": 251, "correlation": 228623 } }, { "ph": "f", "id": 228623, "pid": 76337, "tid": -914061504, "ts": 1716454224969725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969728, "dur": 0, "args": { "External id": 228624, "cbid": 251, "correlation": 228624 } }, { "ph": "f", "id": 228624, "pid": 76337, "tid": -914061504, "ts": 1716454224969728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225044959, "dur": 7, "args": { "External id": 228625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228625, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 228625, "pid": 5, "tid": 7, "ts": 1716454225044959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969730, "dur": 12, "args": { "External id": 228625, "cbid": 211, "correlation": 228625 } }, { "ph": "s", "id": 228625, "pid": 76337, "tid": -914061504, "ts": 1716454224969730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225044968, "dur": 3, "args": { "External id": 228627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228627, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 228627, "pid": 5, "tid": 7, "ts": 1716454225044968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969743, "dur": 5, "args": { "External id": 228627, "cbid": 211, "correlation": 228627 } }, { "ph": "s", "id": 228627, "pid": 76337, "tid": -914061504, "ts": 1716454224969743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225044972, "dur": 90, "args": { "External id": 228648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228648, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 228648, "pid": 5, "tid": 7, "ts": 1716454225044972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969816, "dur": 12, "args": { "External id": 228648, "cbid": 211, "correlation": 228648 } }, { "ph": "s", "id": 228648, "pid": 76337, "tid": -914061504, "ts": 1716454224969816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224969914, "dur": 1, "args": { "External id": 228666, "cbid": 251, "correlation": 228666 } }, { "ph": "f", "id": 228666, "pid": 76337, "tid": -914061504, "ts": 1716454224969914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225045064, "dur": 94, "args": { "External id": 228668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228668, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228668, "pid": 5, "tid": 7, "ts": 1716454225045064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969920, "dur": 13, "args": { "External id": 228668, "cbid": 211, "correlation": 228668 } }, { "ph": "s", "id": 228668, "pid": 76337, "tid": -914061504, "ts": 1716454224969920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225045159, "dur": 19, "args": { "External id": 228676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228676, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228676, "pid": 5, "tid": 7, "ts": 1716454225045159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224969996, "dur": 13, "args": { "External id": 228676, "cbid": 211, "correlation": 228676 } }, { "ph": "s", "id": 228676, "pid": 76337, "tid": -914061504, "ts": 1716454224969996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225045180, "dur": 38, "args": { "External id": 228684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228684, "pid": 5, "tid": 7, "ts": 1716454225045180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970039, "dur": 9, "args": { "External id": 228684, "cbid": 211, "correlation": 228684 } }, { "ph": "s", "id": 228684, "pid": 76337, "tid": -914061504, "ts": 1716454224970039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225045219, "dur": 34, "args": { "External id": 228706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228706, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228706, "pid": 5, "tid": 7, "ts": 1716454225045219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970091, "dur": 10, "args": { "External id": 228706, "cbid": 211, "correlation": 228706 } }, { "ph": "s", "id": 228706, "pid": 76337, "tid": -914061504, "ts": 1716454224970091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224970182, "dur": 1, "args": { "External id": 228722, "cbid": 251, "correlation": 228722 } }, { "ph": "f", "id": 228722, "pid": 76337, "tid": -914061504, "ts": 1716454224970182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224970186, "dur": 0, "args": { "External id": 228724, "cbid": 251, "correlation": 228724 } }, { "ph": "f", "id": 228724, "pid": 76337, "tid": -914061504, "ts": 1716454224970186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225045254, "dur": 529, "args": { "External id": 228725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228725, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 228725, "pid": 5, "tid": 7, "ts": 1716454225045254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970190, "dur": 12, "args": { "External id": 228725, "cbid": 211, "correlation": 228725 } }, { "ph": "s", "id": 228725, "pid": 76337, "tid": -914061504, "ts": 1716454224970190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225045785, "dur": 124, "args": { "External id": 228733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228733, "pid": 5, "tid": 7, "ts": 1716454225045785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970254, "dur": 13, "args": { "External id": 228733, "cbid": 211, "correlation": 228733 } }, { "ph": "s", "id": 228733, "pid": 76337, "tid": -914061504, "ts": 1716454224970254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225045911, "dur": 131, "args": { "External id": 228741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228741, "pid": 5, "tid": 7, "ts": 1716454225045911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970286, "dur": 8, "args": { "External id": 228741, "cbid": 211, "correlation": 228741 } }, { "ph": "s", "id": 228741, "pid": 76337, "tid": -914061504, "ts": 1716454224970286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224970362, "dur": 1, "args": { "External id": 228757, "cbid": 251, "correlation": 228757 } }, { "ph": "f", "id": 228757, "pid": 76337, "tid": -914061504, "ts": 1716454224970362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225046044, "dur": 297, "args": { "External id": 228759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228759, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228759, "pid": 5, "tid": 7, "ts": 1716454225046044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970368, "dur": 12, "args": { "External id": 228759, "cbid": 211, "correlation": 228759 } }, { "ph": "s", "id": 228759, "pid": 76337, "tid": -914061504, "ts": 1716454224970368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225046342, "dur": 27, "args": { "External id": 228767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228767, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228767, "pid": 5, "tid": 7, "ts": 1716454225046342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970409, "dur": 10, "args": { "External id": 228767, "cbid": 211, "correlation": 228767 } }, { "ph": "s", "id": 228767, "pid": 76337, "tid": -914061504, "ts": 1716454224970409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225046370, "dur": 79, "args": { "External id": 228778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228778, "pid": 5, "tid": 7, "ts": 1716454225046370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970477, "dur": 12, "args": { "External id": 228778, "cbid": 211, "correlation": 228778 } }, { "ph": "s", "id": 228778, "pid": 76337, "tid": -914061504, "ts": 1716454224970477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224970541, "dur": 0, "args": { "External id": 228790, "cbid": 317, "correlation": 228790 } }, { "ph": "f", "id": 228790, "pid": 76337, "tid": -914061504, "ts": 1716454224970541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224970542, "dur": 0, "args": { "External id": 228791, "cbid": 203, "correlation": 228791 } }, { "ph": "f", "id": 228791, "pid": 76337, "tid": -914061504, "ts": 1716454224970542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224970543, "dur": 0, "args": { "External id": 228792, "cbid": 205, "correlation": 228792 } }, { "ph": "f", "id": 228792, "pid": 76337, "tid": -914061504, "ts": 1716454224970543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225046450, "dur": 23, "args": { "External id": 228796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228796, "pid": 5, "tid": 7, "ts": 1716454225046450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970559, "dur": 12, "args": { "External id": 228796, "cbid": 211, "correlation": 228796 } }, { "ph": "s", "id": 228796, "pid": 76337, "tid": -914061504, "ts": 1716454224970559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225046474, "dur": 119, "args": { "External id": 228798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228798, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228798, "pid": 5, "tid": 7, "ts": 1716454225046474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970578, "dur": 6, "args": { "External id": 228798, "cbid": 211, "correlation": 228798 } }, { "ph": "s", "id": 228798, "pid": 76337, "tid": -914061504, "ts": 1716454224970578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225046595, "dur": 23, "args": { "External id": 228800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228800, "pid": 5, "tid": 7, "ts": 1716454225046595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970588, "dur": 5, "args": { "External id": 228800, "cbid": 211, "correlation": 228800 } }, { "ph": "s", "id": 228800, "pid": 76337, "tid": -914061504, "ts": 1716454224970588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225046620, "dur": 33, "args": { "External id": 228806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228806, "pid": 5, "tid": 7, "ts": 1716454225046620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970616, "dur": 8, "args": { "External id": 228806, "cbid": 211, "correlation": 228806 } }, { "ph": "s", "id": 228806, "pid": 76337, "tid": -914061504, "ts": 1716454224970616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225046654, "dur": 26, "args": { "External id": 228814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228814, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228814, "pid": 5, "tid": 7, "ts": 1716454225046654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970646, "dur": 9, "args": { "External id": 228814, "cbid": 211, "correlation": 228814 } }, { "ph": "s", "id": 228814, "pid": 76337, "tid": -914061504, "ts": 1716454224970646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225046681, "dur": 47, "args": { "External id": 228823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228823, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228823, "pid": 5, "tid": 7, "ts": 1716454225046681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970684, "dur": 10, "args": { "External id": 228823, "cbid": 211, "correlation": 228823 } }, { "ph": "s", "id": 228823, "pid": 76337, "tid": -914061504, "ts": 1716454224970684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225046730, "dur": 43, "args": { "External id": 228843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228843, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 228843, "pid": 5, "tid": 7, "ts": 1716454225046730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970754, "dur": 12, "args": { "External id": 228843, "cbid": 211, "correlation": 228843 } }, { "ph": "s", "id": 228843, "pid": 76337, "tid": -914061504, "ts": 1716454224970754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225046775, "dur": 5, "args": { "External id": 228855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228855, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 228855, "pid": 5, "tid": 7, "ts": 1716454225046775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970776, "dur": 6, "args": { "External id": 228855, "cbid": 211, "correlation": 228855 } }, { "ph": "s", "id": 228855, "pid": 76337, "tid": -914061504, "ts": 1716454224970776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225046781, "dur": 44, "args": { "External id": 228858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228858, "pid": 5, "tid": 7, "ts": 1716454225046781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970794, "dur": 7, "args": { "External id": 228858, "cbid": 211, "correlation": 228858 } }, { "ph": "s", "id": 228858, "pid": 76337, "tid": -914061504, "ts": 1716454224970794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225046826, "dur": 28, "args": { "External id": 228867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228867, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228867, "pid": 5, "tid": 7, "ts": 1716454225046826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970834, "dur": 9, "args": { "External id": 228867, "cbid": 211, "correlation": 228867 } }, { "ph": "s", "id": 228867, "pid": 76337, "tid": -914061504, "ts": 1716454224970834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224970885, "dur": 0, "args": { "External id": 228877, "cbid": 317, "correlation": 228877 } }, { "ph": "f", "id": 228877, "pid": 76337, "tid": -914061504, "ts": 1716454224970885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224970886, "dur": 0, "args": { "External id": 228878, "cbid": 203, "correlation": 228878 } }, { "ph": "f", "id": 228878, "pid": 76337, "tid": -914061504, "ts": 1716454224970886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224970886, "dur": 0, "args": { "External id": 228879, "cbid": 205, "correlation": 228879 } }, { "ph": "f", "id": 228879, "pid": 76337, "tid": -914061504, "ts": 1716454224970886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225046855, "dur": 30, "args": { "External id": 228883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228883, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228883, "pid": 5, "tid": 7, "ts": 1716454225046855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970901, "dur": 12, "args": { "External id": 228883, "cbid": 211, "correlation": 228883 } }, { "ph": "s", "id": 228883, "pid": 76337, "tid": -914061504, "ts": 1716454224970901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225046886, "dur": 63, "args": { "External id": 228885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228885, "pid": 5, "tid": 7, "ts": 1716454225046886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970916, "dur": 5, "args": { "External id": 228885, "cbid": 211, "correlation": 228885 } }, { "ph": "s", "id": 228885, "pid": 76337, "tid": -914061504, "ts": 1716454224970916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225046950, "dur": 959, "args": { "External id": 228887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228887, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228887, "pid": 5, "tid": 7, "ts": 1716454225046950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970927, "dur": 6, "args": { "External id": 228887, "cbid": 211, "correlation": 228887 } }, { "ph": "s", "id": 228887, "pid": 76337, "tid": -914061504, "ts": 1716454224970927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225047911, "dur": 21, "args": { "External id": 228889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228889, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228889, "pid": 5, "tid": 7, "ts": 1716454225047911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970937, "dur": 5, "args": { "External id": 228889, "cbid": 211, "correlation": 228889 } }, { "ph": "s", "id": 228889, "pid": 76337, "tid": -914061504, "ts": 1716454224970937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225047933, "dur": 33, "args": { "External id": 228895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228895, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228895, "pid": 5, "tid": 7, "ts": 1716454225047933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224970966, "dur": 16, "args": { "External id": 228895, "cbid": 211, "correlation": 228895 } }, { "ph": "s", "id": 228895, "pid": 76337, "tid": -914061504, "ts": 1716454224970966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225047967, "dur": 3, "args": { "External id": 228903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228903, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 228903, "pid": 5, "tid": 7, "ts": 1716454225047967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971018, "dur": 10, "args": { "External id": 228903, "cbid": 211, "correlation": 228903 } }, { "ph": "s", "id": 228903, "pid": 76337, "tid": -914061504, "ts": 1716454224971018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224971084, "dur": 1, "args": { "External id": 228919, "cbid": 251, "correlation": 228919 } }, { "ph": "f", "id": 228919, "pid": 76337, "tid": -914061504, "ts": 1716454224971084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224971090, "dur": 0, "args": { "External id": 228921, "cbid": 251, "correlation": 228921 } }, { "ph": "f", "id": 228921, "pid": 76337, "tid": -914061504, "ts": 1716454224971090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225047972, "dur": 12, "args": { "External id": 228922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228922, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 228922, "pid": 5, "tid": 7, "ts": 1716454225047972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971091, "dur": 11, "args": { "External id": 228922, "cbid": 211, "correlation": 228922 } }, { "ph": "s", "id": 228922, "pid": 76337, "tid": -914061504, "ts": 1716454224971091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225047985, "dur": 5, "args": { "External id": 228924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228924, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 228924, "pid": 5, "tid": 7, "ts": 1716454225047985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971105, "dur": 6, "args": { "External id": 228924, "cbid": 211, "correlation": 228924 } }, { "ph": "s", "id": 228924, "pid": 76337, "tid": -914061504, "ts": 1716454224971105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225047991, "dur": 29, "args": { "External id": 228934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228934, "pid": 5, "tid": 7, "ts": 1716454225047991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971162, "dur": 12, "args": { "External id": 228934, "cbid": 211, "correlation": 228934 } }, { "ph": "s", "id": 228934, "pid": 76337, "tid": -914061504, "ts": 1716454224971162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225048022, "dur": 30, "args": { "External id": 228954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228954, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 228954, "pid": 5, "tid": 7, "ts": 1716454225048022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971227, "dur": 10, "args": { "External id": 228954, "cbid": 211, "correlation": 228954 } }, { "ph": "s", "id": 228954, "pid": 76337, "tid": -914061504, "ts": 1716454224971227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225048054, "dur": 4, "args": { "External id": 228966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228966, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 228966, "pid": 5, "tid": 7, "ts": 1716454225048054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971247, "dur": 6, "args": { "External id": 228966, "cbid": 211, "correlation": 228966 } }, { "ph": "s", "id": 228966, "pid": 76337, "tid": -914061504, "ts": 1716454224971247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225048059, "dur": 29, "args": { "External id": 228969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228969, "pid": 5, "tid": 7, "ts": 1716454225048059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971266, "dur": 7, "args": { "External id": 228969, "cbid": 211, "correlation": 228969 } }, { "ph": "s", "id": 228969, "pid": 76337, "tid": -914061504, "ts": 1716454224971266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225048090, "dur": 22, "args": { "External id": 228978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228978, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228978, "pid": 5, "tid": 7, "ts": 1716454225048090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971306, "dur": 11, "args": { "External id": 228978, "cbid": 211, "correlation": 228978 } }, { "ph": "s", "id": 228978, "pid": 76337, "tid": -914061504, "ts": 1716454224971306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224971369, "dur": 0, "args": { "External id": 228988, "cbid": 317, "correlation": 228988 } }, { "ph": "f", "id": 228988, "pid": 76337, "tid": -914061504, "ts": 1716454224971369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224971370, "dur": 0, "args": { "External id": 228989, "cbid": 203, "correlation": 228989 } }, { "ph": "f", "id": 228989, "pid": 76337, "tid": -914061504, "ts": 1716454224971370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224971371, "dur": 0, "args": { "External id": 228990, "cbid": 205, "correlation": 228990 } }, { "ph": "f", "id": 228990, "pid": 76337, "tid": -914061504, "ts": 1716454224971371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225048113, "dur": 22, "args": { "External id": 228994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228994, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228994, "pid": 5, "tid": 7, "ts": 1716454225048113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971384, "dur": 12, "args": { "External id": 228994, "cbid": 211, "correlation": 228994 } }, { "ph": "s", "id": 228994, "pid": 76337, "tid": -914061504, "ts": 1716454224971384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225048137, "dur": 43, "args": { "External id": 228996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228996, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 228996, "pid": 5, "tid": 7, "ts": 1716454225048137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971399, "dur": 6, "args": { "External id": 228996, "cbid": 211, "correlation": 228996 } }, { "ph": "s", "id": 228996, "pid": 76337, "tid": -914061504, "ts": 1716454224971399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225048181, "dur": 638, "args": { "External id": 228998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 228998, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 228998, "pid": 5, "tid": 7, "ts": 1716454225048181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971411, "dur": 6, "args": { "External id": 228998, "cbid": 211, "correlation": 228998 } }, { "ph": "s", "id": 228998, "pid": 76337, "tid": -914061504, "ts": 1716454224971411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225048820, "dur": 23, "args": { "External id": 229000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229000, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229000, "pid": 5, "tid": 7, "ts": 1716454225048820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971421, "dur": 5, "args": { "External id": 229000, "cbid": 211, "correlation": 229000 } }, { "ph": "s", "id": 229000, "pid": 76337, "tid": -914061504, "ts": 1716454224971421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225048844, "dur": 33, "args": { "External id": 229006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229006, "pid": 5, "tid": 7, "ts": 1716454225048844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971448, "dur": 10, "args": { "External id": 229006, "cbid": 211, "correlation": 229006 } }, { "ph": "s", "id": 229006, "pid": 76337, "tid": -914061504, "ts": 1716454224971448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224971506, "dur": 0, "args": { "External id": 229016, "cbid": 317, "correlation": 229016 } }, { "ph": "f", "id": 229016, "pid": 76337, "tid": -914061504, "ts": 1716454224971506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224971507, "dur": 0, "args": { "External id": 229017, "cbid": 203, "correlation": 229017 } }, { "ph": "f", "id": 229017, "pid": 76337, "tid": -914061504, "ts": 1716454224971507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224971508, "dur": 0, "args": { "External id": 229018, "cbid": 205, "correlation": 229018 } }, { "ph": "f", "id": 229018, "pid": 76337, "tid": -914061504, "ts": 1716454224971508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225048878, "dur": 30, "args": { "External id": 229022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229022, "pid": 5, "tid": 7, "ts": 1716454225048878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971522, "dur": 11, "args": { "External id": 229022, "cbid": 211, "correlation": 229022 } }, { "ph": "s", "id": 229022, "pid": 76337, "tid": -914061504, "ts": 1716454224971522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225048909, "dur": 149, "args": { "External id": 229024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229024, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229024, "pid": 5, "tid": 7, "ts": 1716454225048909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971540, "dur": 6, "args": { "External id": 229024, "cbid": 211, "correlation": 229024 } }, { "ph": "s", "id": 229024, "pid": 76337, "tid": -914061504, "ts": 1716454224971540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225049059, "dur": 23, "args": { "External id": 229026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229026, "pid": 5, "tid": 7, "ts": 1716454225049059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971551, "dur": 5, "args": { "External id": 229026, "cbid": 211, "correlation": 229026 } }, { "ph": "s", "id": 229026, "pid": 76337, "tid": -914061504, "ts": 1716454224971551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225049084, "dur": 32, "args": { "External id": 229032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229032, "pid": 5, "tid": 7, "ts": 1716454225049084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971577, "dur": 8, "args": { "External id": 229032, "cbid": 211, "correlation": 229032 } }, { "ph": "s", "id": 229032, "pid": 76337, "tid": -914061504, "ts": 1716454224971577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225049118, "dur": 27, "args": { "External id": 229040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229040, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229040, "pid": 5, "tid": 7, "ts": 1716454225049118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971605, "dur": 9, "args": { "External id": 229040, "cbid": 211, "correlation": 229040 } }, { "ph": "s", "id": 229040, "pid": 76337, "tid": -914061504, "ts": 1716454224971605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225049146, "dur": 20, "args": { "External id": 229048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229048, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229048, "pid": 5, "tid": 7, "ts": 1716454225049146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971634, "dur": 8, "args": { "External id": 229048, "cbid": 211, "correlation": 229048 } }, { "ph": "s", "id": 229048, "pid": 76337, "tid": -914061504, "ts": 1716454224971634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225049167, "dur": 28, "args": { "External id": 229068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229068, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 229068, "pid": 5, "tid": 7, "ts": 1716454225049167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971716, "dur": 12, "args": { "External id": 229068, "cbid": 211, "correlation": 229068 } }, { "ph": "s", "id": 229068, "pid": 76337, "tid": -914061504, "ts": 1716454224971716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225049197, "dur": 5, "args": { "External id": 229080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229080, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 229080, "pid": 5, "tid": 7, "ts": 1716454225049197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971738, "dur": 6, "args": { "External id": 229080, "cbid": 211, "correlation": 229080 } }, { "ph": "s", "id": 229080, "pid": 76337, "tid": -914061504, "ts": 1716454224971738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225049202, "dur": 29, "args": { "External id": 229083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229083, "pid": 5, "tid": 7, "ts": 1716454225049202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971755, "dur": 8, "args": { "External id": 229083, "cbid": 211, "correlation": 229083 } }, { "ph": "s", "id": 229083, "pid": 76337, "tid": -914061504, "ts": 1716454224971755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224971814, "dur": 0, "args": { "External id": 229094, "cbid": 317, "correlation": 229094 } }, { "ph": "f", "id": 229094, "pid": 76337, "tid": -914061504, "ts": 1716454224971814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224971815, "dur": 0, "args": { "External id": 229095, "cbid": 203, "correlation": 229095 } }, { "ph": "f", "id": 229095, "pid": 76337, "tid": -914061504, "ts": 1716454224971815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224971816, "dur": 0, "args": { "External id": 229096, "cbid": 205, "correlation": 229096 } }, { "ph": "f", "id": 229096, "pid": 76337, "tid": -914061504, "ts": 1716454224971816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225049233, "dur": 21, "args": { "External id": 229100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229100, "pid": 5, "tid": 7, "ts": 1716454225049233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971829, "dur": 12, "args": { "External id": 229100, "cbid": 211, "correlation": 229100 } }, { "ph": "s", "id": 229100, "pid": 76337, "tid": -914061504, "ts": 1716454224971829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225049256, "dur": 103, "args": { "External id": 229102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229102, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229102, "pid": 5, "tid": 7, "ts": 1716454225049256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971847, "dur": 6, "args": { "External id": 229102, "cbid": 211, "correlation": 229102 } }, { "ph": "s", "id": 229102, "pid": 76337, "tid": -914061504, "ts": 1716454224971847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225049360, "dur": 22, "args": { "External id": 229104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229104, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229104, "pid": 5, "tid": 7, "ts": 1716454225049360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971857, "dur": 5, "args": { "External id": 229104, "cbid": 211, "correlation": 229104 } }, { "ph": "s", "id": 229104, "pid": 76337, "tid": -914061504, "ts": 1716454224971857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225049384, "dur": 32, "args": { "External id": 229110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229110, "pid": 5, "tid": 7, "ts": 1716454225049384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971885, "dur": 9, "args": { "External id": 229110, "cbid": 211, "correlation": 229110 } }, { "ph": "s", "id": 229110, "pid": 76337, "tid": -914061504, "ts": 1716454224971885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225049417, "dur": 180, "args": { "External id": 229119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229119, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229119, "pid": 5, "tid": 7, "ts": 1716454225049417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224971967, "dur": 21, "args": { "External id": 229119, "cbid": 211, "correlation": 229119 } }, { "ph": "s", "id": 229119, "pid": 76337, "tid": -914061504, "ts": 1716454224971967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225049598, "dur": 63, "args": { "External id": 229141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229141, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229141, "pid": 5, "tid": 7, "ts": 1716454225049598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972031, "dur": 11, "args": { "External id": 229141, "cbid": 211, "correlation": 229141 } }, { "ph": "s", "id": 229141, "pid": 76337, "tid": -914061504, "ts": 1716454224972031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972120, "dur": 1, "args": { "External id": 229152, "cbid": 251, "correlation": 229152 } }, { "ph": "f", "id": 229152, "pid": 76337, "tid": -914061504, "ts": 1716454224972120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225049663, "dur": 150, "args": { "External id": 229153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229153, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229153, "pid": 5, "tid": 7, "ts": 1716454225049663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972126, "dur": 13, "args": { "External id": 229153, "cbid": 211, "correlation": 229153 } }, { "ph": "s", "id": 229153, "pid": 76337, "tid": -914061504, "ts": 1716454224972126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972196, "dur": 1, "args": { "External id": 229164, "cbid": 251, "correlation": 229164 } }, { "ph": "f", "id": 229164, "pid": 76337, "tid": -914061504, "ts": 1716454224972196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225049814, "dur": 146, "args": { "External id": 229165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229165, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229165, "pid": 5, "tid": 7, "ts": 1716454225049814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972200, "dur": 12, "args": { "External id": 229165, "cbid": 211, "correlation": 229165 } }, { "ph": "s", "id": 229165, "pid": 76337, "tid": -914061504, "ts": 1716454224972200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972266, "dur": 1, "args": { "External id": 229176, "cbid": 251, "correlation": 229176 } }, { "ph": "f", "id": 229176, "pid": 76337, "tid": -914061504, "ts": 1716454224972266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225049961, "dur": 144, "args": { "External id": 229177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229177, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229177, "pid": 5, "tid": 7, "ts": 1716454225049961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972270, "dur": 12, "args": { "External id": 229177, "cbid": 211, "correlation": 229177 } }, { "ph": "s", "id": 229177, "pid": 76337, "tid": -914061504, "ts": 1716454224972270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225050106, "dur": 1909, "args": { "External id": 229198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229198, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 229198, "pid": 5, "tid": 7, "ts": 1716454225050106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972350, "dur": 13, "args": { "External id": 229198, "cbid": 211, "correlation": 229198 } }, { "ph": "s", "id": 229198, "pid": 76337, "tid": -914061504, "ts": 1716454224972350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972450, "dur": 1, "args": { "External id": 229216, "cbid": 251, "correlation": 229216 } }, { "ph": "f", "id": 229216, "pid": 76337, "tid": -914061504, "ts": 1716454224972450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225052017, "dur": 147, "args": { "External id": 229218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229218, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 229218, "pid": 5, "tid": 7, "ts": 1716454225052017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972456, "dur": 14, "args": { "External id": 229218, "cbid": 211, "correlation": 229218 } }, { "ph": "s", "id": 229218, "pid": 76337, "tid": -914061504, "ts": 1716454224972456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225052165, "dur": 35, "args": { "External id": 229226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229226, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229226, "pid": 5, "tid": 7, "ts": 1716454225052165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972527, "dur": 12, "args": { "External id": 229226, "cbid": 211, "correlation": 229226 } }, { "ph": "s", "id": 229226, "pid": 76337, "tid": -914061504, "ts": 1716454224972527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225052201, "dur": 51, "args": { "External id": 229234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229234, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229234, "pid": 5, "tid": 7, "ts": 1716454225052201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972564, "dur": 10, "args": { "External id": 229234, "cbid": 211, "correlation": 229234 } }, { "ph": "s", "id": 229234, "pid": 76337, "tid": -914061504, "ts": 1716454224972564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225052253, "dur": 31, "args": { "External id": 229245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229245, "pid": 5, "tid": 7, "ts": 1716454225052253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972636, "dur": 13, "args": { "External id": 229245, "cbid": 211, "correlation": 229245 } }, { "ph": "s", "id": 229245, "pid": 76337, "tid": -914061504, "ts": 1716454224972636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225052285, "dur": 33, "args": { "External id": 229267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229267, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229267, "pid": 5, "tid": 7, "ts": 1716454225052285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972667, "dur": 8, "args": { "External id": 229267, "cbid": 211, "correlation": 229267 } }, { "ph": "s", "id": 229267, "pid": 76337, "tid": -914061504, "ts": 1716454224972667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972752, "dur": 1, "args": { "External id": 229278, "cbid": 251, "correlation": 229278 } }, { "ph": "f", "id": 229278, "pid": 76337, "tid": -914061504, "ts": 1716454224972752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225052320, "dur": 89, "args": { "External id": 229279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229279, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229279, "pid": 5, "tid": 7, "ts": 1716454225052320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972757, "dur": 13, "args": { "External id": 229279, "cbid": 211, "correlation": 229279 } }, { "ph": "s", "id": 229279, "pid": 76337, "tid": -914061504, "ts": 1716454224972757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972827, "dur": 1, "args": { "External id": 229290, "cbid": 251, "correlation": 229290 } }, { "ph": "f", "id": 229290, "pid": 76337, "tid": -914061504, "ts": 1716454224972827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972831, "dur": 0, "args": { "External id": 229291, "cbid": 251, "correlation": 229291 } }, { "ph": "f", "id": 229291, "pid": 76337, "tid": -914061504, "ts": 1716454224972831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225052411, "dur": 11, "args": { "External id": 229292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229292, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 229292, "pid": 5, "tid": 7, "ts": 1716454225052411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972832, "dur": 12, "args": { "External id": 229292, "cbid": 211, "correlation": 229292 } }, { "ph": "s", "id": 229292, "pid": 76337, "tid": -914061504, "ts": 1716454224972832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225052423, "dur": 5, "args": { "External id": 229294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229294, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 229294, "pid": 5, "tid": 7, "ts": 1716454225052423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972846, "dur": 6, "args": { "External id": 229294, "cbid": 211, "correlation": 229294 } }, { "ph": "s", "id": 229294, "pid": 76337, "tid": -914061504, "ts": 1716454224972846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972903, "dur": 1, "args": { "External id": 229305, "cbid": 251, "correlation": 229305 } }, { "ph": "f", "id": 229305, "pid": 76337, "tid": -914061504, "ts": 1716454224972903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224972906, "dur": 0, "args": { "External id": 229306, "cbid": 251, "correlation": 229306 } }, { "ph": "f", "id": 229306, "pid": 76337, "tid": -914061504, "ts": 1716454224972906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225052429, "dur": 7, "args": { "External id": 229307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229307, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 229307, "pid": 5, "tid": 7, "ts": 1716454225052429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972908, "dur": 11, "args": { "External id": 229307, "cbid": 211, "correlation": 229307 } }, { "ph": "s", "id": 229307, "pid": 76337, "tid": -914061504, "ts": 1716454224972908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225052438, "dur": 3, "args": { "External id": 229309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229309, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 229309, "pid": 5, "tid": 7, "ts": 1716454225052438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224972921, "dur": 7, "args": { "External id": 229309, "cbid": 211, "correlation": 229309 } }, { "ph": "s", "id": 229309, "pid": 76337, "tid": -914061504, "ts": 1716454224972921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225052442, "dur": 89, "args": { "External id": 229330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229330, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 229330, "pid": 5, "tid": 7, "ts": 1716454225052442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973005, "dur": 13, "args": { "External id": 229330, "cbid": 211, "correlation": 229330 } }, { "ph": "s", "id": 229330, "pid": 76337, "tid": -914061504, "ts": 1716454224973005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224973105, "dur": 1, "args": { "External id": 229348, "cbid": 251, "correlation": 229348 } }, { "ph": "f", "id": 229348, "pid": 76337, "tid": -914061504, "ts": 1716454224973105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225052532, "dur": 98, "args": { "External id": 229350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229350, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229350, "pid": 5, "tid": 7, "ts": 1716454225052532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973111, "dur": 13, "args": { "External id": 229350, "cbid": 211, "correlation": 229350 } }, { "ph": "s", "id": 229350, "pid": 76337, "tid": -914061504, "ts": 1716454224973111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225052632, "dur": 19, "args": { "External id": 229358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229358, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229358, "pid": 5, "tid": 7, "ts": 1716454225052632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973181, "dur": 12, "args": { "External id": 229358, "cbid": 211, "correlation": 229358 } }, { "ph": "s", "id": 229358, "pid": 76337, "tid": -914061504, "ts": 1716454224973181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225052652, "dur": 38, "args": { "External id": 229366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229366, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229366, "pid": 5, "tid": 7, "ts": 1716454225052652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973223, "dur": 10, "args": { "External id": 229366, "cbid": 211, "correlation": 229366 } }, { "ph": "s", "id": 229366, "pid": 76337, "tid": -914061504, "ts": 1716454224973223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225052691, "dur": 34, "args": { "External id": 229388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229388, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229388, "pid": 5, "tid": 7, "ts": 1716454225052691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973274, "dur": 10, "args": { "External id": 229388, "cbid": 211, "correlation": 229388 } }, { "ph": "s", "id": 229388, "pid": 76337, "tid": -914061504, "ts": 1716454224973274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224973362, "dur": 1, "args": { "External id": 229404, "cbid": 251, "correlation": 229404 } }, { "ph": "f", "id": 229404, "pid": 76337, "tid": -914061504, "ts": 1716454224973362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224973367, "dur": 0, "args": { "External id": 229406, "cbid": 251, "correlation": 229406 } }, { "ph": "f", "id": 229406, "pid": 76337, "tid": -914061504, "ts": 1716454224973367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225052726, "dur": 530, "args": { "External id": 229407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229407, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 229407, "pid": 5, "tid": 7, "ts": 1716454225052726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973370, "dur": 14, "args": { "External id": 229407, "cbid": 211, "correlation": 229407 } }, { "ph": "s", "id": 229407, "pid": 76337, "tid": -914061504, "ts": 1716454224973370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225053257, "dur": 124, "args": { "External id": 229415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229415, "pid": 5, "tid": 7, "ts": 1716454225053257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973436, "dur": 13, "args": { "External id": 229415, "cbid": 211, "correlation": 229415 } }, { "ph": "s", "id": 229415, "pid": 76337, "tid": -914061504, "ts": 1716454224973436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225053382, "dur": 128, "args": { "External id": 229423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229423, "pid": 5, "tid": 7, "ts": 1716454225053382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973467, "dur": 8, "args": { "External id": 229423, "cbid": 211, "correlation": 229423 } }, { "ph": "s", "id": 229423, "pid": 76337, "tid": -914061504, "ts": 1716454224973467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224973544, "dur": 1, "args": { "External id": 229439, "cbid": 251, "correlation": 229439 } }, { "ph": "f", "id": 229439, "pid": 76337, "tid": -914061504, "ts": 1716454224973544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225053511, "dur": 302, "args": { "External id": 229441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229441, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229441, "pid": 5, "tid": 7, "ts": 1716454225053511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973550, "dur": 12, "args": { "External id": 229441, "cbid": 211, "correlation": 229441 } }, { "ph": "s", "id": 229441, "pid": 76337, "tid": -914061504, "ts": 1716454224973550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225053814, "dur": 27, "args": { "External id": 229449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229449, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229449, "pid": 5, "tid": 7, "ts": 1716454225053814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973593, "dur": 10, "args": { "External id": 229449, "cbid": 211, "correlation": 229449 } }, { "ph": "s", "id": 229449, "pid": 76337, "tid": -914061504, "ts": 1716454224973593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225053843, "dur": 80, "args": { "External id": 229460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229460, "pid": 5, "tid": 7, "ts": 1716454225053843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973659, "dur": 12, "args": { "External id": 229460, "cbid": 211, "correlation": 229460 } }, { "ph": "s", "id": 229460, "pid": 76337, "tid": -914061504, "ts": 1716454224973659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224973722, "dur": 0, "args": { "External id": 229472, "cbid": 317, "correlation": 229472 } }, { "ph": "f", "id": 229472, "pid": 76337, "tid": -914061504, "ts": 1716454224973722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224973723, "dur": 0, "args": { "External id": 229473, "cbid": 203, "correlation": 229473 } }, { "ph": "f", "id": 229473, "pid": 76337, "tid": -914061504, "ts": 1716454224973723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224973724, "dur": 0, "args": { "External id": 229474, "cbid": 205, "correlation": 229474 } }, { "ph": "f", "id": 229474, "pid": 76337, "tid": -914061504, "ts": 1716454224973724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225053923, "dur": 21, "args": { "External id": 229478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229478, "pid": 5, "tid": 7, "ts": 1716454225053923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973741, "dur": 12, "args": { "External id": 229478, "cbid": 211, "correlation": 229478 } }, { "ph": "s", "id": 229478, "pid": 76337, "tid": -914061504, "ts": 1716454224973741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225053946, "dur": 118, "args": { "External id": 229480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229480, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229480, "pid": 5, "tid": 7, "ts": 1716454225053946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973758, "dur": 6, "args": { "External id": 229480, "cbid": 211, "correlation": 229480 } }, { "ph": "s", "id": 229480, "pid": 76337, "tid": -914061504, "ts": 1716454224973758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225054065, "dur": 23, "args": { "External id": 229482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229482, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229482, "pid": 5, "tid": 7, "ts": 1716454225054065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973768, "dur": 5, "args": { "External id": 229482, "cbid": 211, "correlation": 229482 } }, { "ph": "s", "id": 229482, "pid": 76337, "tid": -914061504, "ts": 1716454224973768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225054089, "dur": 32, "args": { "External id": 229488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229488, "pid": 5, "tid": 7, "ts": 1716454225054089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973797, "dur": 8, "args": { "External id": 229488, "cbid": 211, "correlation": 229488 } }, { "ph": "s", "id": 229488, "pid": 76337, "tid": -914061504, "ts": 1716454224973797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225054122, "dur": 27, "args": { "External id": 229496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229496, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229496, "pid": 5, "tid": 7, "ts": 1716454225054122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973828, "dur": 9, "args": { "External id": 229496, "cbid": 211, "correlation": 229496 } }, { "ph": "s", "id": 229496, "pid": 76337, "tid": -914061504, "ts": 1716454224973828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225054150, "dur": 100, "args": { "External id": 229507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229507, "pid": 5, "tid": 7, "ts": 1716454225054150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973890, "dur": 11, "args": { "External id": 229507, "cbid": 211, "correlation": 229507 } }, { "ph": "s", "id": 229507, "pid": 76337, "tid": -914061504, "ts": 1716454224973890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224973944, "dur": 0, "args": { "External id": 229517, "cbid": 317, "correlation": 229517 } }, { "ph": "f", "id": 229517, "pid": 76337, "tid": -914061504, "ts": 1716454224973944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224973945, "dur": 0, "args": { "External id": 229518, "cbid": 203, "correlation": 229518 } }, { "ph": "f", "id": 229518, "pid": 76337, "tid": -914061504, "ts": 1716454224973945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224973946, "dur": 0, "args": { "External id": 229519, "cbid": 205, "correlation": 229519 } }, { "ph": "f", "id": 229519, "pid": 76337, "tid": -914061504, "ts": 1716454224973946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225054251, "dur": 75, "args": { "External id": 229523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229523, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229523, "pid": 5, "tid": 7, "ts": 1716454225054251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973960, "dur": 11, "args": { "External id": 229523, "cbid": 211, "correlation": 229523 } }, { "ph": "s", "id": 229523, "pid": 76337, "tid": -914061504, "ts": 1716454224973960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225054327, "dur": 43, "args": { "External id": 229525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229525, "pid": 5, "tid": 7, "ts": 1716454225054327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973985, "dur": 6, "args": { "External id": 229525, "cbid": 211, "correlation": 229525 } }, { "ph": "s", "id": 229525, "pid": 76337, "tid": -914061504, "ts": 1716454224973985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225054371, "dur": 4, "args": { "External id": 229527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229527, "pid": 5, "tid": 7, "ts": 1716454225054371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224973996, "dur": 6, "args": { "External id": 229527, "cbid": 211, "correlation": 229527 } }, { "ph": "s", "id": 229527, "pid": 76337, "tid": -914061504, "ts": 1716454224973996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224974004, "dur": 0, "args": { "External id": 229528, "cbid": 51, "correlation": 229528 } }, { "ph": "s", "id": 229528, "pid": 76337, "tid": -914061504, "ts": 1716454224974004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225054376, "dur": 2221, "args": { "External id": 229529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229529, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229529, "pid": 5, "tid": 7, "ts": 1716454225054376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974005, "dur": 5, "args": { "External id": 229529, "cbid": 211, "correlation": 229529 } }, { "ph": "s", "id": 229529, "pid": 76337, "tid": -914061504, "ts": 1716454224974005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225056599, "dur": 111, "args": { "External id": 229534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229534, "pid": 5, "tid": 7, "ts": 1716454225056599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974035, "dur": 9, "args": { "External id": 229534, "cbid": 211, "correlation": 229534 } }, { "ph": "s", "id": 229534, "pid": 76337, "tid": -914061504, "ts": 1716454224974035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225056711, "dur": 165, "args": { "External id": 229543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229543, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229543, "pid": 5, "tid": 7, "ts": 1716454225056711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974127, "dur": 13, "args": { "External id": 229543, "cbid": 211, "correlation": 229543 } }, { "ph": "s", "id": 229543, "pid": 76337, "tid": -914061504, "ts": 1716454224974127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225056878, "dur": 127, "args": { "External id": 229563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229563, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 229563, "pid": 5, "tid": 7, "ts": 1716454225056878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974196, "dur": 11, "args": { "External id": 229563, "cbid": 211, "correlation": 229563 } }, { "ph": "s", "id": 229563, "pid": 76337, "tid": -914061504, "ts": 1716454224974196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225057006, "dur": 5, "args": { "External id": 229575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229575, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 229575, "pid": 5, "tid": 7, "ts": 1716454225057006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974218, "dur": 6, "args": { "External id": 229575, "cbid": 211, "correlation": 229575 } }, { "ph": "s", "id": 229575, "pid": 76337, "tid": -914061504, "ts": 1716454224974218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225057012, "dur": 160, "args": { "External id": 229578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229578, "pid": 5, "tid": 7, "ts": 1716454225057012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974237, "dur": 7, "args": { "External id": 229578, "cbid": 211, "correlation": 229578 } }, { "ph": "s", "id": 229578, "pid": 76337, "tid": -914061504, "ts": 1716454224974237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225057173, "dur": 101, "args": { "External id": 229587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229587, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229587, "pid": 5, "tid": 7, "ts": 1716454225057173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974277, "dur": 10, "args": { "External id": 229587, "cbid": 211, "correlation": 229587 } }, { "ph": "s", "id": 229587, "pid": 76337, "tid": -914061504, "ts": 1716454224974277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224974330, "dur": 0, "args": { "External id": 229597, "cbid": 317, "correlation": 229597 } }, { "ph": "f", "id": 229597, "pid": 76337, "tid": -914061504, "ts": 1716454224974330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224974331, "dur": 0, "args": { "External id": 229598, "cbid": 203, "correlation": 229598 } }, { "ph": "f", "id": 229598, "pid": 76337, "tid": -914061504, "ts": 1716454224974331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224974332, "dur": 0, "args": { "External id": 229599, "cbid": 205, "correlation": 229599 } }, { "ph": "f", "id": 229599, "pid": 76337, "tid": -914061504, "ts": 1716454224974332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225057276, "dur": 110, "args": { "External id": 229603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229603, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229603, "pid": 5, "tid": 7, "ts": 1716454225057276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974347, "dur": 11, "args": { "External id": 229603, "cbid": 211, "correlation": 229603 } }, { "ph": "s", "id": 229603, "pid": 76337, "tid": -914061504, "ts": 1716454224974347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225057387, "dur": 33, "args": { "External id": 229605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229605, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229605, "pid": 5, "tid": 7, "ts": 1716454225057387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974361, "dur": 5, "args": { "External id": 229605, "cbid": 211, "correlation": 229605 } }, { "ph": "s", "id": 229605, "pid": 76337, "tid": -914061504, "ts": 1716454224974361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225057422, "dur": 4, "args": { "External id": 229607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229607, "pid": 5, "tid": 7, "ts": 1716454225057422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974371, "dur": 7, "args": { "External id": 229607, "cbid": 211, "correlation": 229607 } }, { "ph": "s", "id": 229607, "pid": 76337, "tid": -914061504, "ts": 1716454224974371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224974381, "dur": 0, "args": { "External id": 229608, "cbid": 51, "correlation": 229608 } }, { "ph": "s", "id": 229608, "pid": 76337, "tid": -914061504, "ts": 1716454224974381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225057426, "dur": 1982, "args": { "External id": 229609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229609, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229609, "pid": 5, "tid": 7, "ts": 1716454225057426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974382, "dur": 5, "args": { "External id": 229609, "cbid": 211, "correlation": 229609 } }, { "ph": "s", "id": 229609, "pid": 76337, "tid": -914061504, "ts": 1716454224974382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225059410, "dur": 57, "args": { "External id": 229614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229614, "pid": 5, "tid": 7, "ts": 1716454225059410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974410, "dur": 8, "args": { "External id": 229614, "cbid": 211, "correlation": 229614 } }, { "ph": "s", "id": 229614, "pid": 76337, "tid": -914061504, "ts": 1716454224974410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225059469, "dur": 3, "args": { "External id": 229622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229622, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229622, "pid": 5, "tid": 7, "ts": 1716454225059469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974453, "dur": 9, "args": { "External id": 229622, "cbid": 211, "correlation": 229622 } }, { "ph": "s", "id": 229622, "pid": 76337, "tid": -914061504, "ts": 1716454224974453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224974518, "dur": 1, "args": { "External id": 229638, "cbid": 251, "correlation": 229638 } }, { "ph": "f", "id": 229638, "pid": 76337, "tid": -914061504, "ts": 1716454224974518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224974523, "dur": 0, "args": { "External id": 229640, "cbid": 251, "correlation": 229640 } }, { "ph": "f", "id": 229640, "pid": 76337, "tid": -914061504, "ts": 1716454224974523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225059473, "dur": 11, "args": { "External id": 229641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229641, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 229641, "pid": 5, "tid": 7, "ts": 1716454225059473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974526, "dur": 11, "args": { "External id": 229641, "cbid": 211, "correlation": 229641 } }, { "ph": "s", "id": 229641, "pid": 76337, "tid": -914061504, "ts": 1716454224974526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225059486, "dur": 5, "args": { "External id": 229643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229643, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 229643, "pid": 5, "tid": 7, "ts": 1716454225059486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974539, "dur": 6, "args": { "External id": 229643, "cbid": 211, "correlation": 229643 } }, { "ph": "s", "id": 229643, "pid": 76337, "tid": -914061504, "ts": 1716454224974539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225059492, "dur": 54, "args": { "External id": 229653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229653, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229653, "pid": 5, "tid": 7, "ts": 1716454225059492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974597, "dur": 12, "args": { "External id": 229653, "cbid": 211, "correlation": 229653 } }, { "ph": "s", "id": 229653, "pid": 76337, "tid": -914061504, "ts": 1716454224974597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225059547, "dur": 50, "args": { "External id": 229673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229673, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 229673, "pid": 5, "tid": 7, "ts": 1716454225059547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974663, "dur": 11, "args": { "External id": 229673, "cbid": 211, "correlation": 229673 } }, { "ph": "s", "id": 229673, "pid": 76337, "tid": -914061504, "ts": 1716454224974663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225059598, "dur": 4, "args": { "External id": 229685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229685, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229685, "pid": 5, "tid": 7, "ts": 1716454225059598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974686, "dur": 6, "args": { "External id": 229685, "cbid": 211, "correlation": 229685 } }, { "ph": "s", "id": 229685, "pid": 76337, "tid": -914061504, "ts": 1716454224974686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225059603, "dur": 55, "args": { "External id": 229688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229688, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229688, "pid": 5, "tid": 7, "ts": 1716454225059603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974704, "dur": 7, "args": { "External id": 229688, "cbid": 211, "correlation": 229688 } }, { "ph": "s", "id": 229688, "pid": 76337, "tid": -914061504, "ts": 1716454224974704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225059659, "dur": 36, "args": { "External id": 229697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229697, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229697, "pid": 5, "tid": 7, "ts": 1716454225059659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974745, "dur": 10, "args": { "External id": 229697, "cbid": 211, "correlation": 229697 } }, { "ph": "s", "id": 229697, "pid": 76337, "tid": -914061504, "ts": 1716454224974745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224974807, "dur": 0, "args": { "External id": 229707, "cbid": 317, "correlation": 229707 } }, { "ph": "f", "id": 229707, "pid": 76337, "tid": -914061504, "ts": 1716454224974807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224974808, "dur": 0, "args": { "External id": 229708, "cbid": 203, "correlation": 229708 } }, { "ph": "f", "id": 229708, "pid": 76337, "tid": -914061504, "ts": 1716454224974808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224974809, "dur": 0, "args": { "External id": 229709, "cbid": 205, "correlation": 229709 } }, { "ph": "f", "id": 229709, "pid": 76337, "tid": -914061504, "ts": 1716454224974809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225059697, "dur": 40, "args": { "External id": 229713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229713, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229713, "pid": 5, "tid": 7, "ts": 1716454225059697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974826, "dur": 13, "args": { "External id": 229713, "cbid": 211, "correlation": 229713 } }, { "ph": "s", "id": 229713, "pid": 76337, "tid": -914061504, "ts": 1716454224974826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225059738, "dur": 14, "args": { "External id": 229715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229715, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229715, "pid": 5, "tid": 7, "ts": 1716454225059738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974842, "dur": 5, "args": { "External id": 229715, "cbid": 211, "correlation": 229715 } }, { "ph": "s", "id": 229715, "pid": 76337, "tid": -914061504, "ts": 1716454224974842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225059753, "dur": 3, "args": { "External id": 229717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229717, "pid": 5, "tid": 7, "ts": 1716454225059753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974852, "dur": 6, "args": { "External id": 229717, "cbid": 211, "correlation": 229717 } }, { "ph": "s", "id": 229717, "pid": 76337, "tid": -914061504, "ts": 1716454224974852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224974860, "dur": 0, "args": { "External id": 229718, "cbid": 51, "correlation": 229718 } }, { "ph": "s", "id": 229718, "pid": 76337, "tid": -914061504, "ts": 1716454224974860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225059758, "dur": 688, "args": { "External id": 229719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229719, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229719, "pid": 5, "tid": 7, "ts": 1716454225059758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974861, "dur": 5, "args": { "External id": 229719, "cbid": 211, "correlation": 229719 } }, { "ph": "s", "id": 229719, "pid": 76337, "tid": -914061504, "ts": 1716454224974861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225060447, "dur": 58, "args": { "External id": 229724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229724, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229724, "pid": 5, "tid": 7, "ts": 1716454225060447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974890, "dur": 8, "args": { "External id": 229724, "cbid": 211, "correlation": 229724 } }, { "ph": "s", "id": 229724, "pid": 76337, "tid": -914061504, "ts": 1716454224974890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224974946, "dur": 0, "args": { "External id": 229734, "cbid": 317, "correlation": 229734 } }, { "ph": "f", "id": 229734, "pid": 76337, "tid": -914061504, "ts": 1716454224974946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224974947, "dur": 0, "args": { "External id": 229735, "cbid": 203, "correlation": 229735 } }, { "ph": "f", "id": 229735, "pid": 76337, "tid": -914061504, "ts": 1716454224974947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224974948, "dur": 0, "args": { "External id": 229736, "cbid": 205, "correlation": 229736 } }, { "ph": "f", "id": 229736, "pid": 76337, "tid": -914061504, "ts": 1716454224974948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225060506, "dur": 4, "args": { "External id": 229740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229740, "pid": 5, "tid": 7, "ts": 1716454225060506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974963, "dur": 20, "args": { "External id": 229740, "cbid": 211, "correlation": 229740 } }, { "ph": "s", "id": 229740, "pid": 76337, "tid": -914061504, "ts": 1716454224974963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224974989, "dur": 0, "args": { "External id": 229741, "cbid": 51, "correlation": 229741 } }, { "ph": "s", "id": 229741, "pid": 76337, "tid": -914061504, "ts": 1716454224974989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454225060511, "dur": 261, "args": { "External id": 229742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229742, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229742, "pid": 5, "tid": 7, "ts": 1716454225060511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224974989, "dur": 7, "args": { "External id": 229742, "cbid": 211, "correlation": 229742 } }, { "ph": "s", "id": 229742, "pid": 76337, "tid": -914061504, "ts": 1716454224974989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225060773, "dur": 58, "args": { "External id": 229747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229747, "pid": 5, "tid": 7, "ts": 1716454225060773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975018, "dur": 8, "args": { "External id": 229747, "cbid": 211, "correlation": 229747 } }, { "ph": "s", "id": 229747, "pid": 76337, "tid": -914061504, "ts": 1716454224975018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225060832, "dur": 50, "args": { "External id": 229755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229755, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229755, "pid": 5, "tid": 7, "ts": 1716454225060832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975046, "dur": 8, "args": { "External id": 229755, "cbid": 211, "correlation": 229755 } }, { "ph": "s", "id": 229755, "pid": 76337, "tid": -914061504, "ts": 1716454224975046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225060883, "dur": 35, "args": { "External id": 229763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229763, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229763, "pid": 5, "tid": 7, "ts": 1716454225060883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975075, "dur": 9, "args": { "External id": 229763, "cbid": 211, "correlation": 229763 } }, { "ph": "s", "id": 229763, "pid": 76337, "tid": -914061504, "ts": 1716454224975075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225060920, "dur": 51, "args": { "External id": 229783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229783, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 229783, "pid": 5, "tid": 7, "ts": 1716454225060920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975158, "dur": 13, "args": { "External id": 229783, "cbid": 211, "correlation": 229783 } }, { "ph": "s", "id": 229783, "pid": 76337, "tid": -914061504, "ts": 1716454224975158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225060972, "dur": 4, "args": { "External id": 229795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229795, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 229795, "pid": 5, "tid": 7, "ts": 1716454225060972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975181, "dur": 7, "args": { "External id": 229795, "cbid": 211, "correlation": 229795 } }, { "ph": "s", "id": 229795, "pid": 76337, "tid": -914061504, "ts": 1716454224975181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225060977, "dur": 54, "args": { "External id": 229798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229798, "pid": 5, "tid": 7, "ts": 1716454225060977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975199, "dur": 6, "args": { "External id": 229798, "cbid": 211, "correlation": 229798 } }, { "ph": "s", "id": 229798, "pid": 76337, "tid": -914061504, "ts": 1716454224975199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224975256, "dur": 0, "args": { "External id": 229809, "cbid": 317, "correlation": 229809 } }, { "ph": "f", "id": 229809, "pid": 76337, "tid": -914061504, "ts": 1716454224975256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224975257, "dur": 0, "args": { "External id": 229810, "cbid": 203, "correlation": 229810 } }, { "ph": "f", "id": 229810, "pid": 76337, "tid": -914061504, "ts": 1716454224975257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224975258, "dur": 0, "args": { "External id": 229811, "cbid": 205, "correlation": 229811 } }, { "ph": "f", "id": 229811, "pid": 76337, "tid": -914061504, "ts": 1716454224975258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975289, "dur": 2, "args": { "External id": 229815, "cbid": 251, "correlation": 229815 } }, { "ph": "f", "id": 229815, "pid": 76337, "tid": -914061504, "ts": 1716454224975289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975292, "dur": 1, "args": { "External id": 229816, "cbid": 251, "correlation": 229816 } }, { "ph": "f", "id": 229816, "pid": 76337, "tid": -914061504, "ts": 1716454224975292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975293, "dur": 0, "args": { "External id": 229817, "cbid": 251, "correlation": 229817 } }, { "ph": "f", "id": 229817, "pid": 76337, "tid": -914061504, "ts": 1716454224975293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975294, "dur": 1, "args": { "External id": 229818, "cbid": 251, "correlation": 229818 } }, { "ph": "f", "id": 229818, "pid": 76337, "tid": -914061504, "ts": 1716454224975294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975296, "dur": 1, "args": { "External id": 229819, "cbid": 251, "correlation": 229819 } }, { "ph": "f", "id": 229819, "pid": 76337, "tid": -914061504, "ts": 1716454224975296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975298, "dur": 1, "args": { "External id": 229820, "cbid": 251, "correlation": 229820 } }, { "ph": "f", "id": 229820, "pid": 76337, "tid": -914061504, "ts": 1716454224975298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975300, "dur": 1, "args": { "External id": 229821, "cbid": 251, "correlation": 229821 } }, { "ph": "f", "id": 229821, "pid": 76337, "tid": -914061504, "ts": 1716454224975300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975301, "dur": 1, "args": { "External id": 229822, "cbid": 251, "correlation": 229822 } }, { "ph": "f", "id": 229822, "pid": 76337, "tid": -914061504, "ts": 1716454224975301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975304, "dur": 0, "args": { "External id": 229823, "cbid": 251, "correlation": 229823 } }, { "ph": "f", "id": 229823, "pid": 76337, "tid": -914061504, "ts": 1716454224975304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225061033, "dur": 111, "args": { "External id": 229824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229824, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 229824, "pid": 5, "tid": 7, "ts": 1716454225061033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975306, "dur": 13, "args": { "External id": 229824, "cbid": 211, "correlation": 229824 } }, { "ph": "s", "id": 229824, "pid": 76337, "tid": -914061504, "ts": 1716454224975306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225061145, "dur": 58, "args": { "External id": 229830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229830, "pid": 5, "tid": 7, "ts": 1716454225061145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975343, "dur": 9, "args": { "External id": 229830, "cbid": 211, "correlation": 229830 } }, { "ph": "s", "id": 229830, "pid": 76337, "tid": -914061504, "ts": 1716454224975343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225061205, "dur": 567, "args": { "External id": 229839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229839, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229839, "pid": 5, "tid": 7, "ts": 1716454225061205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975425, "dur": 14, "args": { "External id": 229839, "cbid": 211, "correlation": 229839 } }, { "ph": "s", "id": 229839, "pid": 76337, "tid": -914061504, "ts": 1716454224975425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225061774, "dur": 177, "args": { "External id": 229861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229861, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229861, "pid": 5, "tid": 7, "ts": 1716454225061774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975484, "dur": 10, "args": { "External id": 229861, "cbid": 211, "correlation": 229861 } }, { "ph": "s", "id": 229861, "pid": 76337, "tid": -914061504, "ts": 1716454224975484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975572, "dur": 1, "args": { "External id": 229872, "cbid": 251, "correlation": 229872 } }, { "ph": "f", "id": 229872, "pid": 76337, "tid": -914061504, "ts": 1716454224975572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225061953, "dur": 195, "args": { "External id": 229873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229873, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229873, "pid": 5, "tid": 7, "ts": 1716454225061953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975577, "dur": 14, "args": { "External id": 229873, "cbid": 211, "correlation": 229873 } }, { "ph": "s", "id": 229873, "pid": 76337, "tid": -914061504, "ts": 1716454224975577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975646, "dur": 1, "args": { "External id": 229884, "cbid": 251, "correlation": 229884 } }, { "ph": "f", "id": 229884, "pid": 76337, "tid": -914061504, "ts": 1716454224975646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225062148, "dur": 184, "args": { "External id": 229885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229885, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229885, "pid": 5, "tid": 7, "ts": 1716454225062148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975650, "dur": 11, "args": { "External id": 229885, "cbid": 211, "correlation": 229885 } }, { "ph": "s", "id": 229885, "pid": 76337, "tid": -914061504, "ts": 1716454224975650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975714, "dur": 1, "args": { "External id": 229896, "cbid": 251, "correlation": 229896 } }, { "ph": "f", "id": 229896, "pid": 76337, "tid": -914061504, "ts": 1716454224975714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225062334, "dur": 185, "args": { "External id": 229897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229897, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229897, "pid": 5, "tid": 7, "ts": 1716454225062334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975718, "dur": 12, "args": { "External id": 229897, "cbid": 211, "correlation": 229897 } }, { "ph": "s", "id": 229897, "pid": 76337, "tid": -914061504, "ts": 1716454224975718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225062520, "dur": 18129, "args": { "External id": 229918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229918, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 229918, "pid": 5, "tid": 7, "ts": 1716454225062520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975801, "dur": 13, "args": { "External id": 229918, "cbid": 211, "correlation": 229918 } }, { "ph": "s", "id": 229918, "pid": 76337, "tid": -914061504, "ts": 1716454224975801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224975900, "dur": 1, "args": { "External id": 229936, "cbid": 251, "correlation": 229936 } }, { "ph": "f", "id": 229936, "pid": 76337, "tid": -914061504, "ts": 1716454224975900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225080651, "dur": 201, "args": { "External id": 229938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229938, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229938, "pid": 5, "tid": 7, "ts": 1716454225080651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975906, "dur": 13, "args": { "External id": 229938, "cbid": 211, "correlation": 229938 } }, { "ph": "s", "id": 229938, "pid": 76337, "tid": -914061504, "ts": 1716454224975906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225080853, "dur": 66, "args": { "External id": 229946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229946, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229946, "pid": 5, "tid": 7, "ts": 1716454225080853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224975987, "dur": 12, "args": { "External id": 229946, "cbid": 211, "correlation": 229946 } }, { "ph": "s", "id": 229946, "pid": 76337, "tid": -914061504, "ts": 1716454224975987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225080921, "dur": 97, "args": { "External id": 229954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229954, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229954, "pid": 5, "tid": 7, "ts": 1716454225080921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976027, "dur": 10, "args": { "External id": 229954, "cbid": 211, "correlation": 229954 } }, { "ph": "s", "id": 229954, "pid": 76337, "tid": -914061504, "ts": 1716454224976027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225081019, "dur": 54, "args": { "External id": 229965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229965, "pid": 5, "tid": 7, "ts": 1716454225081019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976100, "dur": 13, "args": { "External id": 229965, "cbid": 211, "correlation": 229965 } }, { "ph": "s", "id": 229965, "pid": 76337, "tid": -914061504, "ts": 1716454224976100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225081075, "dur": 90, "args": { "External id": 229987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229987, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 229987, "pid": 5, "tid": 7, "ts": 1716454225081075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976132, "dur": 8, "args": { "External id": 229987, "cbid": 211, "correlation": 229987 } }, { "ph": "s", "id": 229987, "pid": 76337, "tid": -914061504, "ts": 1716454224976132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976214, "dur": 1, "args": { "External id": 229998, "cbid": 251, "correlation": 229998 } }, { "ph": "f", "id": 229998, "pid": 76337, "tid": -914061504, "ts": 1716454224976214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225081166, "dur": 105, "args": { "External id": 229999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 229999, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 229999, "pid": 5, "tid": 7, "ts": 1716454225081166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976219, "dur": 13, "args": { "External id": 229999, "cbid": 211, "correlation": 229999 } }, { "ph": "s", "id": 229999, "pid": 76337, "tid": -914061504, "ts": 1716454224976219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976295, "dur": 1, "args": { "External id": 230010, "cbid": 251, "correlation": 230010 } }, { "ph": "f", "id": 230010, "pid": 76337, "tid": -914061504, "ts": 1716454224976295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976299, "dur": 0, "args": { "External id": 230011, "cbid": 251, "correlation": 230011 } }, { "ph": "f", "id": 230011, "pid": 76337, "tid": -914061504, "ts": 1716454224976299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225081273, "dur": 10, "args": { "External id": 230012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230012, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 230012, "pid": 5, "tid": 7, "ts": 1716454225081273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976301, "dur": 13, "args": { "External id": 230012, "cbid": 211, "correlation": 230012 } }, { "ph": "s", "id": 230012, "pid": 76337, "tid": -914061504, "ts": 1716454224976301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225081285, "dur": 5, "args": { "External id": 230014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230014, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230014, "pid": 5, "tid": 7, "ts": 1716454225081285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976317, "dur": 7, "args": { "External id": 230014, "cbid": 211, "correlation": 230014 } }, { "ph": "s", "id": 230014, "pid": 76337, "tid": -914061504, "ts": 1716454224976317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976381, "dur": 1, "args": { "External id": 230025, "cbid": 251, "correlation": 230025 } }, { "ph": "f", "id": 230025, "pid": 76337, "tid": -914061504, "ts": 1716454224976381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976384, "dur": 0, "args": { "External id": 230026, "cbid": 251, "correlation": 230026 } }, { "ph": "f", "id": 230026, "pid": 76337, "tid": -914061504, "ts": 1716454224976384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225081291, "dur": 6, "args": { "External id": 230027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230027, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 230027, "pid": 5, "tid": 7, "ts": 1716454225081291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976386, "dur": 12, "args": { "External id": 230027, "cbid": 211, "correlation": 230027 } }, { "ph": "s", "id": 230027, "pid": 76337, "tid": -914061504, "ts": 1716454224976386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225081298, "dur": 3, "args": { "External id": 230029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230029, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230029, "pid": 5, "tid": 7, "ts": 1716454225081298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976399, "dur": 5, "args": { "External id": 230029, "cbid": 211, "correlation": 230029 } }, { "ph": "s", "id": 230029, "pid": 76337, "tid": -914061504, "ts": 1716454224976399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225081303, "dur": 152, "args": { "External id": 230050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230050, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230050, "pid": 5, "tid": 7, "ts": 1716454225081303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976472, "dur": 13, "args": { "External id": 230050, "cbid": 211, "correlation": 230050 } }, { "ph": "s", "id": 230050, "pid": 76337, "tid": -914061504, "ts": 1716454224976472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976569, "dur": 1, "args": { "External id": 230068, "cbid": 251, "correlation": 230068 } }, { "ph": "f", "id": 230068, "pid": 76337, "tid": -914061504, "ts": 1716454224976569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225081456, "dur": 106, "args": { "External id": 230070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230070, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230070, "pid": 5, "tid": 7, "ts": 1716454225081456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976575, "dur": 14, "args": { "External id": 230070, "cbid": 211, "correlation": 230070 } }, { "ph": "s", "id": 230070, "pid": 76337, "tid": -914061504, "ts": 1716454224976575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225081563, "dur": 34, "args": { "External id": 230078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230078, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230078, "pid": 5, "tid": 7, "ts": 1716454225081563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976645, "dur": 12, "args": { "External id": 230078, "cbid": 211, "correlation": 230078 } }, { "ph": "s", "id": 230078, "pid": 76337, "tid": -914061504, "ts": 1716454224976645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225081599, "dur": 65, "args": { "External id": 230086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230086, "pid": 5, "tid": 7, "ts": 1716454225081599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976686, "dur": 9, "args": { "External id": 230086, "cbid": 211, "correlation": 230086 } }, { "ph": "s", "id": 230086, "pid": 76337, "tid": -914061504, "ts": 1716454224976686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225081665, "dur": 90, "args": { "External id": 230108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230108, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230108, "pid": 5, "tid": 7, "ts": 1716454225081665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976739, "dur": 11, "args": { "External id": 230108, "cbid": 211, "correlation": 230108 } }, { "ph": "s", "id": 230108, "pid": 76337, "tid": -914061504, "ts": 1716454224976739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224976827, "dur": 1, "args": { "External id": 230124, "cbid": 251, "correlation": 230124 } }, { "ph": "f", "id": 230124, "pid": 76337, "tid": -914061504, "ts": 1716454224976827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225081756, "dur": 564, "args": { "External id": 230126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230126, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230126, "pid": 5, "tid": 7, "ts": 1716454225081756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976833, "dur": 13, "args": { "External id": 230126, "cbid": 211, "correlation": 230126 } }, { "ph": "s", "id": 230126, "pid": 76337, "tid": -914061504, "ts": 1716454224976833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225082322, "dur": 240, "args": { "External id": 230134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230134, "pid": 5, "tid": 7, "ts": 1716454225082322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976899, "dur": 12, "args": { "External id": 230134, "cbid": 211, "correlation": 230134 } }, { "ph": "s", "id": 230134, "pid": 76337, "tid": -914061504, "ts": 1716454224976899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225082563, "dur": 252, "args": { "External id": 230142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230142, "pid": 5, "tid": 7, "ts": 1716454225082563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224976930, "dur": 9, "args": { "External id": 230142, "cbid": 211, "correlation": 230142 } }, { "ph": "s", "id": 230142, "pid": 76337, "tid": -914061504, "ts": 1716454224976930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977024, "dur": 1, "args": { "External id": 230158, "cbid": 251, "correlation": 230158 } }, { "ph": "f", "id": 230158, "pid": 76337, "tid": -914061504, "ts": 1716454224977024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977029, "dur": 0, "args": { "External id": 230160, "cbid": 251, "correlation": 230160 } }, { "ph": "f", "id": 230160, "pid": 76337, "tid": -914061504, "ts": 1716454224977029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225082816, "dur": 353, "args": { "External id": 230161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230161, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 230161, "pid": 5, "tid": 7, "ts": 1716454225082816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977032, "dur": 14, "args": { "External id": 230161, "cbid": 211, "correlation": 230161 } }, { "ph": "s", "id": 230161, "pid": 76337, "tid": -914061504, "ts": 1716454224977032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225083171, "dur": 50, "args": { "External id": 230169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230169, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230169, "pid": 5, "tid": 7, "ts": 1716454225083171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977075, "dur": 11, "args": { "External id": 230169, "cbid": 211, "correlation": 230169 } }, { "ph": "s", "id": 230169, "pid": 76337, "tid": -914061504, "ts": 1716454224977075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225083222, "dur": 155, "args": { "External id": 230180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230180, "pid": 5, "tid": 7, "ts": 1716454225083222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977143, "dur": 12, "args": { "External id": 230180, "cbid": 211, "correlation": 230180 } }, { "ph": "s", "id": 230180, "pid": 76337, "tid": -914061504, "ts": 1716454224977143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224977208, "dur": 0, "args": { "External id": 230192, "cbid": 317, "correlation": 230192 } }, { "ph": "f", "id": 230192, "pid": 76337, "tid": -914061504, "ts": 1716454224977208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224977209, "dur": 0, "args": { "External id": 230193, "cbid": 203, "correlation": 230193 } }, { "ph": "f", "id": 230193, "pid": 76337, "tid": -914061504, "ts": 1716454224977209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224977210, "dur": 0, "args": { "External id": 230194, "cbid": 205, "correlation": 230194 } }, { "ph": "f", "id": 230194, "pid": 76337, "tid": -914061504, "ts": 1716454224977210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977235, "dur": 1, "args": { "External id": 230198, "cbid": 251, "correlation": 230198 } }, { "ph": "f", "id": 230198, "pid": 76337, "tid": -914061504, "ts": 1716454224977235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977236, "dur": 0, "args": { "External id": 230199, "cbid": 251, "correlation": 230199 } }, { "ph": "f", "id": 230199, "pid": 76337, "tid": -914061504, "ts": 1716454224977236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977237, "dur": 0, "args": { "External id": 230200, "cbid": 251, "correlation": 230200 } }, { "ph": "f", "id": 230200, "pid": 76337, "tid": -914061504, "ts": 1716454224977237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977238, "dur": 0, "args": { "External id": 230201, "cbid": 251, "correlation": 230201 } }, { "ph": "f", "id": 230201, "pid": 76337, "tid": -914061504, "ts": 1716454224977238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977239, "dur": 1, "args": { "External id": 230202, "cbid": 251, "correlation": 230202 } }, { "ph": "f", "id": 230202, "pid": 76337, "tid": -914061504, "ts": 1716454224977239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977240, "dur": 0, "args": { "External id": 230203, "cbid": 251, "correlation": 230203 } }, { "ph": "f", "id": 230203, "pid": 76337, "tid": -914061504, "ts": 1716454224977240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977242, "dur": 0, "args": { "External id": 230204, "cbid": 251, "correlation": 230204 } }, { "ph": "f", "id": 230204, "pid": 76337, "tid": -914061504, "ts": 1716454224977242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977242, "dur": 0, "args": { "External id": 230205, "cbid": 251, "correlation": 230205 } }, { "ph": "f", "id": 230205, "pid": 76337, "tid": -914061504, "ts": 1716454224977242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977244, "dur": 0, "args": { "External id": 230206, "cbid": 251, "correlation": 230206 } }, { "ph": "f", "id": 230206, "pid": 76337, "tid": -914061504, "ts": 1716454224977244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225083378, "dur": 112, "args": { "External id": 230207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230207, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230207, "pid": 5, "tid": 7, "ts": 1716454225083378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977246, "dur": 12, "args": { "External id": 230207, "cbid": 211, "correlation": 230207 } }, { "ph": "s", "id": 230207, "pid": 76337, "tid": -914061504, "ts": 1716454224977246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225083491, "dur": 58, "args": { "External id": 230213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230213, "pid": 5, "tid": 7, "ts": 1716454225083491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977280, "dur": 9, "args": { "External id": 230213, "cbid": 211, "correlation": 230213 } }, { "ph": "s", "id": 230213, "pid": 76337, "tid": -914061504, "ts": 1716454224977280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225083551, "dur": 50, "args": { "External id": 230221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230221, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230221, "pid": 5, "tid": 7, "ts": 1716454225083551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977312, "dur": 8, "args": { "External id": 230221, "cbid": 211, "correlation": 230221 } }, { "ph": "s", "id": 230221, "pid": 76337, "tid": -914061504, "ts": 1716454224977312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225083603, "dur": 97, "args": { "External id": 230230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230230, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230230, "pid": 5, "tid": 7, "ts": 1716454225083603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977351, "dur": 10, "args": { "External id": 230230, "cbid": 211, "correlation": 230230 } }, { "ph": "s", "id": 230230, "pid": 76337, "tid": -914061504, "ts": 1716454224977351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225083701, "dur": 90, "args": { "External id": 230250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230250, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 230250, "pid": 5, "tid": 7, "ts": 1716454225083701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977422, "dur": 12, "args": { "External id": 230250, "cbid": 211, "correlation": 230250 } }, { "ph": "s", "id": 230250, "pid": 76337, "tid": -914061504, "ts": 1716454224977422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225083793, "dur": 4, "args": { "External id": 230262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230262, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230262, "pid": 5, "tid": 7, "ts": 1716454225083793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977444, "dur": 32, "args": { "External id": 230262, "cbid": 211, "correlation": 230262 } }, { "ph": "s", "id": 230262, "pid": 76337, "tid": -914061504, "ts": 1716454224977444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225083798, "dur": 107, "args": { "External id": 230265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230265, "pid": 5, "tid": 7, "ts": 1716454225083798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977489, "dur": 7, "args": { "External id": 230265, "cbid": 211, "correlation": 230265 } }, { "ph": "s", "id": 230265, "pid": 76337, "tid": -914061504, "ts": 1716454224977489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225083907, "dur": 69, "args": { "External id": 230274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230274, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230274, "pid": 5, "tid": 7, "ts": 1716454225083907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977529, "dur": 10, "args": { "External id": 230274, "cbid": 211, "correlation": 230274 } }, { "ph": "s", "id": 230274, "pid": 76337, "tid": -914061504, "ts": 1716454224977529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224977581, "dur": 0, "args": { "External id": 230284, "cbid": 317, "correlation": 230284 } }, { "ph": "f", "id": 230284, "pid": 76337, "tid": -914061504, "ts": 1716454224977581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224977582, "dur": 0, "args": { "External id": 230285, "cbid": 203, "correlation": 230285 } }, { "ph": "f", "id": 230285, "pid": 76337, "tid": -914061504, "ts": 1716454224977582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224977583, "dur": 0, "args": { "External id": 230286, "cbid": 205, "correlation": 230286 } }, { "ph": "f", "id": 230286, "pid": 76337, "tid": -914061504, "ts": 1716454224977583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225083978, "dur": 76, "args": { "External id": 230290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230290, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230290, "pid": 5, "tid": 7, "ts": 1716454225083978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977598, "dur": 11, "args": { "External id": 230290, "cbid": 211, "correlation": 230290 } }, { "ph": "s", "id": 230290, "pid": 76337, "tid": -914061504, "ts": 1716454224977598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225084055, "dur": 23, "args": { "External id": 230292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230292, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230292, "pid": 5, "tid": 7, "ts": 1716454225084055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977611, "dur": 5, "args": { "External id": 230292, "cbid": 211, "correlation": 230292 } }, { "ph": "s", "id": 230292, "pid": 76337, "tid": -914061504, "ts": 1716454224977611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225084080, "dur": 4, "args": { "External id": 230294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230294, "pid": 5, "tid": 7, "ts": 1716454225084080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977621, "dur": 6, "args": { "External id": 230294, "cbid": 211, "correlation": 230294 } }, { "ph": "s", "id": 230294, "pid": 76337, "tid": -914061504, "ts": 1716454224977621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224977630, "dur": 0, "args": { "External id": 230295, "cbid": 51, "correlation": 230295 } }, { "ph": "s", "id": 230295, "pid": 76337, "tid": -914061504, "ts": 1716454224977630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225084085, "dur": 1344, "args": { "External id": 230296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230296, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230296, "pid": 5, "tid": 7, "ts": 1716454225084085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977631, "dur": 5, "args": { "External id": 230296, "cbid": 211, "correlation": 230296 } }, { "ph": "s", "id": 230296, "pid": 76337, "tid": -914061504, "ts": 1716454224977631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225085430, "dur": 58, "args": { "External id": 230301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230301, "pid": 5, "tid": 7, "ts": 1716454225085430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977659, "dur": 9, "args": { "External id": 230301, "cbid": 211, "correlation": 230301 } }, { "ph": "s", "id": 230301, "pid": 76337, "tid": -914061504, "ts": 1716454224977659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225085490, "dur": 3, "args": { "External id": 230309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230309, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230309, "pid": 5, "tid": 7, "ts": 1716454225085490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977703, "dur": 9, "args": { "External id": 230309, "cbid": 211, "correlation": 230309 } }, { "ph": "s", "id": 230309, "pid": 76337, "tid": -914061504, "ts": 1716454224977703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977768, "dur": 1, "args": { "External id": 230325, "cbid": 251, "correlation": 230325 } }, { "ph": "f", "id": 230325, "pid": 76337, "tid": -914061504, "ts": 1716454224977768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224977774, "dur": 0, "args": { "External id": 230327, "cbid": 251, "correlation": 230327 } }, { "ph": "f", "id": 230327, "pid": 76337, "tid": -914061504, "ts": 1716454224977774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225085494, "dur": 11, "args": { "External id": 230328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230328, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 230328, "pid": 5, "tid": 7, "ts": 1716454225085494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977775, "dur": 12, "args": { "External id": 230328, "cbid": 211, "correlation": 230328 } }, { "ph": "s", "id": 230328, "pid": 76337, "tid": -914061504, "ts": 1716454224977775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225085507, "dur": 5, "args": { "External id": 230330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230330, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230330, "pid": 5, "tid": 7, "ts": 1716454225085507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224977790, "dur": 178, "args": { "External id": 230330, "cbid": 211, "correlation": 230330 } }, { "ph": "s", "id": 230330, "pid": 76337, "tid": -914061504, "ts": 1716454224977790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225085513, "dur": 53, "args": { "External id": 230340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230340, "pid": 5, "tid": 7, "ts": 1716454225085513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978027, "dur": 13, "args": { "External id": 230340, "cbid": 211, "correlation": 230340 } }, { "ph": "s", "id": 230340, "pid": 76337, "tid": -914061504, "ts": 1716454224978027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225085568, "dur": 51, "args": { "External id": 230360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230360, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 230360, "pid": 5, "tid": 7, "ts": 1716454225085568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978094, "dur": 11, "args": { "External id": 230360, "cbid": 211, "correlation": 230360 } }, { "ph": "s", "id": 230360, "pid": 76337, "tid": -914061504, "ts": 1716454224978094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225085620, "dur": 4, "args": { "External id": 230372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230372, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230372, "pid": 5, "tid": 7, "ts": 1716454225085620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978115, "dur": 6, "args": { "External id": 230372, "cbid": 211, "correlation": 230372 } }, { "ph": "s", "id": 230372, "pid": 76337, "tid": -914061504, "ts": 1716454224978115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225085625, "dur": 54, "args": { "External id": 230375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230375, "pid": 5, "tid": 7, "ts": 1716454225085625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978134, "dur": 7, "args": { "External id": 230375, "cbid": 211, "correlation": 230375 } }, { "ph": "s", "id": 230375, "pid": 76337, "tid": -914061504, "ts": 1716454224978134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225085681, "dur": 36, "args": { "External id": 230384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230384, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230384, "pid": 5, "tid": 7, "ts": 1716454225085681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978175, "dur": 10, "args": { "External id": 230384, "cbid": 211, "correlation": 230384 } }, { "ph": "s", "id": 230384, "pid": 76337, "tid": -914061504, "ts": 1716454224978175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224978238, "dur": 0, "args": { "External id": 230394, "cbid": 317, "correlation": 230394 } }, { "ph": "f", "id": 230394, "pid": 76337, "tid": -914061504, "ts": 1716454224978238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224978238, "dur": 0, "args": { "External id": 230395, "cbid": 203, "correlation": 230395 } }, { "ph": "f", "id": 230395, "pid": 76337, "tid": -914061504, "ts": 1716454224978238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224978239, "dur": 0, "args": { "External id": 230396, "cbid": 205, "correlation": 230396 } }, { "ph": "f", "id": 230396, "pid": 76337, "tid": -914061504, "ts": 1716454224978239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225085718, "dur": 41, "args": { "External id": 230400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230400, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230400, "pid": 5, "tid": 7, "ts": 1716454225085718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978253, "dur": 13, "args": { "External id": 230400, "cbid": 211, "correlation": 230400 } }, { "ph": "s", "id": 230400, "pid": 76337, "tid": -914061504, "ts": 1716454224978253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225085761, "dur": 14, "args": { "External id": 230402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230402, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230402, "pid": 5, "tid": 7, "ts": 1716454225085761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978268, "dur": 5, "args": { "External id": 230402, "cbid": 211, "correlation": 230402 } }, { "ph": "s", "id": 230402, "pid": 76337, "tid": -914061504, "ts": 1716454224978268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225085775, "dur": 3, "args": { "External id": 230404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230404, "pid": 5, "tid": 7, "ts": 1716454225085775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978278, "dur": 5, "args": { "External id": 230404, "cbid": 211, "correlation": 230404 } }, { "ph": "s", "id": 230404, "pid": 76337, "tid": -914061504, "ts": 1716454224978278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224978286, "dur": 0, "args": { "External id": 230405, "cbid": 51, "correlation": 230405 } }, { "ph": "s", "id": 230405, "pid": 76337, "tid": -914061504, "ts": 1716454224978286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225085780, "dur": 687, "args": { "External id": 230406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230406, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230406, "pid": 5, "tid": 7, "ts": 1716454225085780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978287, "dur": 5, "args": { "External id": 230406, "cbid": 211, "correlation": 230406 } }, { "ph": "s", "id": 230406, "pid": 76337, "tid": -914061504, "ts": 1716454224978287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225086469, "dur": 58, "args": { "External id": 230411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230411, "pid": 5, "tid": 7, "ts": 1716454225086469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978315, "dur": 8, "args": { "External id": 230411, "cbid": 211, "correlation": 230411 } }, { "ph": "s", "id": 230411, "pid": 76337, "tid": -914061504, "ts": 1716454224978315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224978372, "dur": 0, "args": { "External id": 230421, "cbid": 317, "correlation": 230421 } }, { "ph": "f", "id": 230421, "pid": 76337, "tid": -914061504, "ts": 1716454224978372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224978373, "dur": 0, "args": { "External id": 230422, "cbid": 203, "correlation": 230422 } }, { "ph": "f", "id": 230422, "pid": 76337, "tid": -914061504, "ts": 1716454224978373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224978374, "dur": 0, "args": { "External id": 230423, "cbid": 205, "correlation": 230423 } }, { "ph": "f", "id": 230423, "pid": 76337, "tid": -914061504, "ts": 1716454224978374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225086528, "dur": 74, "args": { "External id": 230427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230427, "pid": 5, "tid": 7, "ts": 1716454225086528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978387, "dur": 11, "args": { "External id": 230427, "cbid": 211, "correlation": 230427 } }, { "ph": "s", "id": 230427, "pid": 76337, "tid": -914061504, "ts": 1716454224978387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225086603, "dur": 204, "args": { "External id": 230429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230429, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230429, "pid": 5, "tid": 7, "ts": 1716454225086603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978407, "dur": 9, "args": { "External id": 230429, "cbid": 211, "correlation": 230429 } }, { "ph": "s", "id": 230429, "pid": 76337, "tid": -914061504, "ts": 1716454224978407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225086809, "dur": 39, "args": { "External id": 230431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230431, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230431, "pid": 5, "tid": 7, "ts": 1716454225086809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978420, "dur": 478, "args": { "External id": 230431, "cbid": 211, "correlation": 230431 } }, { "ph": "s", "id": 230431, "pid": 76337, "tid": -914061504, "ts": 1716454224978420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225086849, "dur": 59, "args": { "External id": 230437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230437, "pid": 5, "tid": 7, "ts": 1716454225086849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978919, "dur": 9, "args": { "External id": 230437, "cbid": 211, "correlation": 230437 } }, { "ph": "s", "id": 230437, "pid": 76337, "tid": -914061504, "ts": 1716454224978919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225086909, "dur": 49, "args": { "External id": 230445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230445, "pid": 5, "tid": 7, "ts": 1716454225086909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978949, "dur": 8, "args": { "External id": 230445, "cbid": 211, "correlation": 230445 } }, { "ph": "s", "id": 230445, "pid": 76337, "tid": -914061504, "ts": 1716454224978949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225086960, "dur": 36, "args": { "External id": 230453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230453, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230453, "pid": 5, "tid": 7, "ts": 1716454225086960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224978985, "dur": 26, "args": { "External id": 230453, "cbid": 211, "correlation": 230453 } }, { "ph": "s", "id": 230453, "pid": 76337, "tid": -914061504, "ts": 1716454224978985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225086997, "dur": 53, "args": { "External id": 230473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230473, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 230473, "pid": 5, "tid": 7, "ts": 1716454225086997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979087, "dur": 12, "args": { "External id": 230473, "cbid": 211, "correlation": 230473 } }, { "ph": "s", "id": 230473, "pid": 76337, "tid": -914061504, "ts": 1716454224979087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225087051, "dur": 5, "args": { "External id": 230485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230485, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230485, "pid": 5, "tid": 7, "ts": 1716454225087051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979109, "dur": 7, "args": { "External id": 230485, "cbid": 211, "correlation": 230485 } }, { "ph": "s", "id": 230485, "pid": 76337, "tid": -914061504, "ts": 1716454224979109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225087057, "dur": 54, "args": { "External id": 230488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230488, "pid": 5, "tid": 7, "ts": 1716454225087057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979127, "dur": 6, "args": { "External id": 230488, "cbid": 211, "correlation": 230488 } }, { "ph": "s", "id": 230488, "pid": 76337, "tid": -914061504, "ts": 1716454224979127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224979185, "dur": 0, "args": { "External id": 230499, "cbid": 317, "correlation": 230499 } }, { "ph": "f", "id": 230499, "pid": 76337, "tid": -914061504, "ts": 1716454224979185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224979186, "dur": 0, "args": { "External id": 230500, "cbid": 203, "correlation": 230500 } }, { "ph": "f", "id": 230500, "pid": 76337, "tid": -914061504, "ts": 1716454224979186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224979186, "dur": 0, "args": { "External id": 230501, "cbid": 205, "correlation": 230501 } }, { "ph": "f", "id": 230501, "pid": 76337, "tid": -914061504, "ts": 1716454224979186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979208, "dur": 1, "args": { "External id": 230505, "cbid": 251, "correlation": 230505 } }, { "ph": "f", "id": 230505, "pid": 76337, "tid": -914061504, "ts": 1716454224979208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979211, "dur": 0, "args": { "External id": 230506, "cbid": 251, "correlation": 230506 } }, { "ph": "f", "id": 230506, "pid": 76337, "tid": -914061504, "ts": 1716454224979211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979212, "dur": 0, "args": { "External id": 230507, "cbid": 251, "correlation": 230507 } }, { "ph": "f", "id": 230507, "pid": 76337, "tid": -914061504, "ts": 1716454224979212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979212, "dur": 0, "args": { "External id": 230508, "cbid": 251, "correlation": 230508 } }, { "ph": "f", "id": 230508, "pid": 76337, "tid": -914061504, "ts": 1716454224979212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979213, "dur": 0, "args": { "External id": 230509, "cbid": 251, "correlation": 230509 } }, { "ph": "f", "id": 230509, "pid": 76337, "tid": -914061504, "ts": 1716454224979213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979214, "dur": 0, "args": { "External id": 230510, "cbid": 251, "correlation": 230510 } }, { "ph": "f", "id": 230510, "pid": 76337, "tid": -914061504, "ts": 1716454224979214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979215, "dur": 0, "args": { "External id": 230511, "cbid": 251, "correlation": 230511 } }, { "ph": "f", "id": 230511, "pid": 76337, "tid": -914061504, "ts": 1716454224979215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979216, "dur": 0, "args": { "External id": 230512, "cbid": 251, "correlation": 230512 } }, { "ph": "f", "id": 230512, "pid": 76337, "tid": -914061504, "ts": 1716454224979216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979217, "dur": 0, "args": { "External id": 230513, "cbid": 251, "correlation": 230513 } }, { "ph": "f", "id": 230513, "pid": 76337, "tid": -914061504, "ts": 1716454224979217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225087112, "dur": 109, "args": { "External id": 230514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230514, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230514, "pid": 5, "tid": 7, "ts": 1716454225087112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979219, "dur": 12, "args": { "External id": 230514, "cbid": 211, "correlation": 230514 } }, { "ph": "s", "id": 230514, "pid": 76337, "tid": -914061504, "ts": 1716454224979219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225087222, "dur": 59, "args": { "External id": 230520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230520, "pid": 5, "tid": 7, "ts": 1716454225087222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979254, "dur": 9, "args": { "External id": 230520, "cbid": 211, "correlation": 230520 } }, { "ph": "s", "id": 230520, "pid": 76337, "tid": -914061504, "ts": 1716454224979254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225087283, "dur": 477, "args": { "External id": 230529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230529, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230529, "pid": 5, "tid": 7, "ts": 1716454225087283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979336, "dur": 14, "args": { "External id": 230529, "cbid": 211, "correlation": 230529 } }, { "ph": "s", "id": 230529, "pid": 76337, "tid": -914061504, "ts": 1716454224979336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225087761, "dur": 177, "args": { "External id": 230551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230551, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230551, "pid": 5, "tid": 7, "ts": 1716454225087761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979394, "dur": 10, "args": { "External id": 230551, "cbid": 211, "correlation": 230551 } }, { "ph": "s", "id": 230551, "pid": 76337, "tid": -914061504, "ts": 1716454224979394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979480, "dur": 1, "args": { "External id": 230562, "cbid": 251, "correlation": 230562 } }, { "ph": "f", "id": 230562, "pid": 76337, "tid": -914061504, "ts": 1716454224979480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225087940, "dur": 195, "args": { "External id": 230563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230563, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230563, "pid": 5, "tid": 7, "ts": 1716454225087940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979485, "dur": 13, "args": { "External id": 230563, "cbid": 211, "correlation": 230563 } }, { "ph": "s", "id": 230563, "pid": 76337, "tid": -914061504, "ts": 1716454224979485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979553, "dur": 1, "args": { "External id": 230574, "cbid": 251, "correlation": 230574 } }, { "ph": "f", "id": 230574, "pid": 76337, "tid": -914061504, "ts": 1716454224979553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225088136, "dur": 185, "args": { "External id": 230575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230575, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230575, "pid": 5, "tid": 7, "ts": 1716454225088136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979557, "dur": 12, "args": { "External id": 230575, "cbid": 211, "correlation": 230575 } }, { "ph": "s", "id": 230575, "pid": 76337, "tid": -914061504, "ts": 1716454224979557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979621, "dur": 1, "args": { "External id": 230586, "cbid": 251, "correlation": 230586 } }, { "ph": "f", "id": 230586, "pid": 76337, "tid": -914061504, "ts": 1716454224979621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225088323, "dur": 186, "args": { "External id": 230587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230587, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230587, "pid": 5, "tid": 7, "ts": 1716454225088323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979625, "dur": 11, "args": { "External id": 230587, "cbid": 211, "correlation": 230587 } }, { "ph": "s", "id": 230587, "pid": 76337, "tid": -914061504, "ts": 1716454224979625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225088510, "dur": 18190, "args": { "External id": 230608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230608, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230608, "pid": 5, "tid": 7, "ts": 1716454225088510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979704, "dur": 13, "args": { "External id": 230608, "cbid": 211, "correlation": 230608 } }, { "ph": "s", "id": 230608, "pid": 76337, "tid": -914061504, "ts": 1716454224979704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224979801, "dur": 1, "args": { "External id": 230626, "cbid": 251, "correlation": 230626 } }, { "ph": "f", "id": 230626, "pid": 76337, "tid": -914061504, "ts": 1716454224979801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225106701, "dur": 199, "args": { "External id": 230628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230628, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230628, "pid": 5, "tid": 7, "ts": 1716454225106701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979807, "dur": 14, "args": { "External id": 230628, "cbid": 211, "correlation": 230628 } }, { "ph": "s", "id": 230628, "pid": 76337, "tid": -914061504, "ts": 1716454224979807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225106901, "dur": 66, "args": { "External id": 230636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230636, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230636, "pid": 5, "tid": 7, "ts": 1716454225106901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979877, "dur": 12, "args": { "External id": 230636, "cbid": 211, "correlation": 230636 } }, { "ph": "s", "id": 230636, "pid": 76337, "tid": -914061504, "ts": 1716454224979877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225106969, "dur": 97, "args": { "External id": 230644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230644, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230644, "pid": 5, "tid": 7, "ts": 1716454225106969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979917, "dur": 9, "args": { "External id": 230644, "cbid": 211, "correlation": 230644 } }, { "ph": "s", "id": 230644, "pid": 76337, "tid": -914061504, "ts": 1716454224979917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225107067, "dur": 54, "args": { "External id": 230655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230655, "pid": 5, "tid": 7, "ts": 1716454225107067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224979996, "dur": 72, "args": { "External id": 230655, "cbid": 211, "correlation": 230655 } }, { "ph": "s", "id": 230655, "pid": 76337, "tid": -914061504, "ts": 1716454224979996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225107122, "dur": 90, "args": { "External id": 230677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230677, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230677, "pid": 5, "tid": 7, "ts": 1716454225107122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224980087, "dur": 1935, "args": { "External id": 230677, "cbid": 211, "correlation": 230677 } }, { "ph": "s", "id": 230677, "pid": 76337, "tid": -914061504, "ts": 1716454224980087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982099, "dur": 1, "args": { "External id": 230688, "cbid": 251, "correlation": 230688 } }, { "ph": "f", "id": 230688, "pid": 76337, "tid": -914061504, "ts": 1716454224982099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225107213, "dur": 101, "args": { "External id": 230689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230689, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230689, "pid": 5, "tid": 7, "ts": 1716454225107213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982104, "dur": 69, "args": { "External id": 230689, "cbid": 211, "correlation": 230689 } }, { "ph": "s", "id": 230689, "pid": 76337, "tid": -914061504, "ts": 1716454224982104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982232, "dur": 1, "args": { "External id": 230700, "cbid": 251, "correlation": 230700 } }, { "ph": "f", "id": 230700, "pid": 76337, "tid": -914061504, "ts": 1716454224982232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982236, "dur": 0, "args": { "External id": 230701, "cbid": 251, "correlation": 230701 } }, { "ph": "f", "id": 230701, "pid": 76337, "tid": -914061504, "ts": 1716454224982236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225107315, "dur": 11, "args": { "External id": 230702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230702, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 230702, "pid": 5, "tid": 7, "ts": 1716454225107315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982238, "dur": 12, "args": { "External id": 230702, "cbid": 211, "correlation": 230702 } }, { "ph": "s", "id": 230702, "pid": 76337, "tid": -914061504, "ts": 1716454224982238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225107327, "dur": 5, "args": { "External id": 230704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230704, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230704, "pid": 5, "tid": 7, "ts": 1716454225107327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982252, "dur": 6, "args": { "External id": 230704, "cbid": 211, "correlation": 230704 } }, { "ph": "s", "id": 230704, "pid": 76337, "tid": -914061504, "ts": 1716454224982252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982313, "dur": 1, "args": { "External id": 230715, "cbid": 251, "correlation": 230715 } }, { "ph": "f", "id": 230715, "pid": 76337, "tid": -914061504, "ts": 1716454224982313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982317, "dur": 0, "args": { "External id": 230716, "cbid": 251, "correlation": 230716 } }, { "ph": "f", "id": 230716, "pid": 76337, "tid": -914061504, "ts": 1716454224982317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225107334, "dur": 6, "args": { "External id": 230717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230717, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 230717, "pid": 5, "tid": 7, "ts": 1716454225107334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982319, "dur": 12, "args": { "External id": 230717, "cbid": 211, "correlation": 230717 } }, { "ph": "s", "id": 230717, "pid": 76337, "tid": -914061504, "ts": 1716454224982319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225107341, "dur": 3, "args": { "External id": 230719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230719, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230719, "pid": 5, "tid": 7, "ts": 1716454225107341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982332, "dur": 5, "args": { "External id": 230719, "cbid": 211, "correlation": 230719 } }, { "ph": "s", "id": 230719, "pid": 76337, "tid": -914061504, "ts": 1716454224982332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225107346, "dur": 154, "args": { "External id": 230740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230740, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230740, "pid": 5, "tid": 7, "ts": 1716454225107346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982405, "dur": 13, "args": { "External id": 230740, "cbid": 211, "correlation": 230740 } }, { "ph": "s", "id": 230740, "pid": 76337, "tid": -914061504, "ts": 1716454224982405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982503, "dur": 1, "args": { "External id": 230758, "cbid": 251, "correlation": 230758 } }, { "ph": "f", "id": 230758, "pid": 76337, "tid": -914061504, "ts": 1716454224982503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225107502, "dur": 105, "args": { "External id": 230760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230760, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230760, "pid": 5, "tid": 7, "ts": 1716454225107502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982509, "dur": 14, "args": { "External id": 230760, "cbid": 211, "correlation": 230760 } }, { "ph": "s", "id": 230760, "pid": 76337, "tid": -914061504, "ts": 1716454224982509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225107608, "dur": 35, "args": { "External id": 230768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230768, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230768, "pid": 5, "tid": 7, "ts": 1716454225107608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982579, "dur": 12, "args": { "External id": 230768, "cbid": 211, "correlation": 230768 } }, { "ph": "s", "id": 230768, "pid": 76337, "tid": -914061504, "ts": 1716454224982579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225107644, "dur": 66, "args": { "External id": 230776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230776, "pid": 5, "tid": 7, "ts": 1716454225107644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982620, "dur": 9, "args": { "External id": 230776, "cbid": 211, "correlation": 230776 } }, { "ph": "s", "id": 230776, "pid": 76337, "tid": -914061504, "ts": 1716454224982620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225107711, "dur": 90, "args": { "External id": 230798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230798, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230798, "pid": 5, "tid": 7, "ts": 1716454225107711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982672, "dur": 10, "args": { "External id": 230798, "cbid": 211, "correlation": 230798 } }, { "ph": "s", "id": 230798, "pid": 76337, "tid": -914061504, "ts": 1716454224982672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982757, "dur": 1, "args": { "External id": 230814, "cbid": 251, "correlation": 230814 } }, { "ph": "f", "id": 230814, "pid": 76337, "tid": -914061504, "ts": 1716454224982757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225107802, "dur": 568, "args": { "External id": 230816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230816, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230816, "pid": 5, "tid": 7, "ts": 1716454225107802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982763, "dur": 13, "args": { "External id": 230816, "cbid": 211, "correlation": 230816 } }, { "ph": "s", "id": 230816, "pid": 76337, "tid": -914061504, "ts": 1716454224982763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225108371, "dur": 238, "args": { "External id": 230824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230824, "pid": 5, "tid": 7, "ts": 1716454225108371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982829, "dur": 12, "args": { "External id": 230824, "cbid": 211, "correlation": 230824 } }, { "ph": "s", "id": 230824, "pid": 76337, "tid": -914061504, "ts": 1716454224982829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225108610, "dur": 248, "args": { "External id": 230832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230832, "pid": 5, "tid": 7, "ts": 1716454225108610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982859, "dur": 9, "args": { "External id": 230832, "cbid": 211, "correlation": 230832 } }, { "ph": "s", "id": 230832, "pid": 76337, "tid": -914061504, "ts": 1716454224982859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982940, "dur": 1, "args": { "External id": 230848, "cbid": 251, "correlation": 230848 } }, { "ph": "f", "id": 230848, "pid": 76337, "tid": -914061504, "ts": 1716454224982940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224982945, "dur": 0, "args": { "External id": 230850, "cbid": 251, "correlation": 230850 } }, { "ph": "f", "id": 230850, "pid": 76337, "tid": -914061504, "ts": 1716454224982945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225108860, "dur": 354, "args": { "External id": 230851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230851, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 230851, "pid": 5, "tid": 7, "ts": 1716454225108860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982948, "dur": 13, "args": { "External id": 230851, "cbid": 211, "correlation": 230851 } }, { "ph": "s", "id": 230851, "pid": 76337, "tid": -914061504, "ts": 1716454224982948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225109215, "dur": 50, "args": { "External id": 230859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230859, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230859, "pid": 5, "tid": 7, "ts": 1716454225109215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224982998, "dur": 11, "args": { "External id": 230859, "cbid": 211, "correlation": 230859 } }, { "ph": "s", "id": 230859, "pid": 76337, "tid": -914061504, "ts": 1716454224982998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225109267, "dur": 154, "args": { "External id": 230870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230870, "pid": 5, "tid": 7, "ts": 1716454225109267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983066, "dur": 207, "args": { "External id": 230870, "cbid": 211, "correlation": 230870 } }, { "ph": "s", "id": 230870, "pid": 76337, "tid": -914061504, "ts": 1716454224983066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224983326, "dur": 0, "args": { "External id": 230882, "cbid": 317, "correlation": 230882 } }, { "ph": "f", "id": 230882, "pid": 76337, "tid": -914061504, "ts": 1716454224983326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224983327, "dur": 0, "args": { "External id": 230883, "cbid": 203, "correlation": 230883 } }, { "ph": "f", "id": 230883, "pid": 76337, "tid": -914061504, "ts": 1716454224983327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224983328, "dur": 0, "args": { "External id": 230884, "cbid": 205, "correlation": 230884 } }, { "ph": "f", "id": 230884, "pid": 76337, "tid": -914061504, "ts": 1716454224983328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983353, "dur": 1, "args": { "External id": 230888, "cbid": 251, "correlation": 230888 } }, { "ph": "f", "id": 230888, "pid": 76337, "tid": -914061504, "ts": 1716454224983353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983355, "dur": 0, "args": { "External id": 230889, "cbid": 251, "correlation": 230889 } }, { "ph": "f", "id": 230889, "pid": 76337, "tid": -914061504, "ts": 1716454224983355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983355, "dur": 0, "args": { "External id": 230890, "cbid": 251, "correlation": 230890 } }, { "ph": "f", "id": 230890, "pid": 76337, "tid": -914061504, "ts": 1716454224983355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983356, "dur": 0, "args": { "External id": 230891, "cbid": 251, "correlation": 230891 } }, { "ph": "f", "id": 230891, "pid": 76337, "tid": -914061504, "ts": 1716454224983356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983357, "dur": 0, "args": { "External id": 230892, "cbid": 251, "correlation": 230892 } }, { "ph": "f", "id": 230892, "pid": 76337, "tid": -914061504, "ts": 1716454224983357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983358, "dur": 0, "args": { "External id": 230893, "cbid": 251, "correlation": 230893 } }, { "ph": "f", "id": 230893, "pid": 76337, "tid": -914061504, "ts": 1716454224983358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983358, "dur": 0, "args": { "External id": 230894, "cbid": 251, "correlation": 230894 } }, { "ph": "f", "id": 230894, "pid": 76337, "tid": -914061504, "ts": 1716454224983358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983359, "dur": 0, "args": { "External id": 230895, "cbid": 251, "correlation": 230895 } }, { "ph": "f", "id": 230895, "pid": 76337, "tid": -914061504, "ts": 1716454224983359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224983360, "dur": 0, "args": { "External id": 230896, "cbid": 251, "correlation": 230896 } }, { "ph": "f", "id": 230896, "pid": 76337, "tid": -914061504, "ts": 1716454224983360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225109422, "dur": 110, "args": { "External id": 230897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230897, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 230897, "pid": 5, "tid": 7, "ts": 1716454225109422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983362, "dur": 36, "args": { "External id": 230897, "cbid": 211, "correlation": 230897 } }, { "ph": "s", "id": 230897, "pid": 76337, "tid": -914061504, "ts": 1716454224983362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225109534, "dur": 59, "args": { "External id": 230903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230903, "pid": 5, "tid": 7, "ts": 1716454225109534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983420, "dur": 104, "args": { "External id": 230903, "cbid": 211, "correlation": 230903 } }, { "ph": "s", "id": 230903, "pid": 76337, "tid": -914061504, "ts": 1716454224983420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225109594, "dur": 50, "args": { "External id": 230911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230911, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230911, "pid": 5, "tid": 7, "ts": 1716454225109594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983548, "dur": 277, "args": { "External id": 230911, "cbid": 211, "correlation": 230911 } }, { "ph": "s", "id": 230911, "pid": 76337, "tid": -914061504, "ts": 1716454224983548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225109646, "dur": 95, "args": { "External id": 230920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230920, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230920, "pid": 5, "tid": 7, "ts": 1716454225109646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983857, "dur": 10, "args": { "External id": 230920, "cbid": 211, "correlation": 230920 } }, { "ph": "s", "id": 230920, "pid": 76337, "tid": -914061504, "ts": 1716454224983857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225109742, "dur": 91, "args": { "External id": 230940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230940, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 230940, "pid": 5, "tid": 7, "ts": 1716454225109742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983927, "dur": 12, "args": { "External id": 230940, "cbid": 211, "correlation": 230940 } }, { "ph": "s", "id": 230940, "pid": 76337, "tid": -914061504, "ts": 1716454224983927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225109834, "dur": 5, "args": { "External id": 230952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230952, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 230952, "pid": 5, "tid": 7, "ts": 1716454225109834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983949, "dur": 9, "args": { "External id": 230952, "cbid": 211, "correlation": 230952 } }, { "ph": "s", "id": 230952, "pid": 76337, "tid": -914061504, "ts": 1716454224983949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225109840, "dur": 109, "args": { "External id": 230955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230955, "pid": 5, "tid": 7, "ts": 1716454225109840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224983971, "dur": 107, "args": { "External id": 230955, "cbid": 211, "correlation": 230955 } }, { "ph": "s", "id": 230955, "pid": 76337, "tid": -914061504, "ts": 1716454224983971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225109950, "dur": 68, "args": { "External id": 230964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230964, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230964, "pid": 5, "tid": 7, "ts": 1716454225109950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984112, "dur": 10, "args": { "External id": 230964, "cbid": 211, "correlation": 230964 } }, { "ph": "s", "id": 230964, "pid": 76337, "tid": -914061504, "ts": 1716454224984112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224984164, "dur": 0, "args": { "External id": 230974, "cbid": 317, "correlation": 230974 } }, { "ph": "f", "id": 230974, "pid": 76337, "tid": -914061504, "ts": 1716454224984164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224984164, "dur": 0, "args": { "External id": 230975, "cbid": 203, "correlation": 230975 } }, { "ph": "f", "id": 230975, "pid": 76337, "tid": -914061504, "ts": 1716454224984164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224984165, "dur": 0, "args": { "External id": 230976, "cbid": 205, "correlation": 230976 } }, { "ph": "f", "id": 230976, "pid": 76337, "tid": -914061504, "ts": 1716454224984165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225110019, "dur": 75, "args": { "External id": 230980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230980, "pid": 5, "tid": 7, "ts": 1716454225110019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984179, "dur": 12, "args": { "External id": 230980, "cbid": 211, "correlation": 230980 } }, { "ph": "s", "id": 230980, "pid": 76337, "tid": -914061504, "ts": 1716454224984179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225110096, "dur": 24, "args": { "External id": 230982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230982, "pid": 5, "tid": 7, "ts": 1716454225110096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984193, "dur": 6, "args": { "External id": 230982, "cbid": 211, "correlation": 230982 } }, { "ph": "s", "id": 230982, "pid": 76337, "tid": -914061504, "ts": 1716454224984193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225110121, "dur": 4, "args": { "External id": 230984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230984, "pid": 5, "tid": 7, "ts": 1716454225110121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984204, "dur": 6, "args": { "External id": 230984, "cbid": 211, "correlation": 230984 } }, { "ph": "s", "id": 230984, "pid": 76337, "tid": -914061504, "ts": 1716454224984204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224984212, "dur": 0, "args": { "External id": 230985, "cbid": 51, "correlation": 230985 } }, { "ph": "s", "id": 230985, "pid": 76337, "tid": -914061504, "ts": 1716454224984212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225110126, "dur": 1346, "args": { "External id": 230986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230986, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 230986, "pid": 5, "tid": 7, "ts": 1716454225110126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984213, "dur": 5, "args": { "External id": 230986, "cbid": 211, "correlation": 230986 } }, { "ph": "s", "id": 230986, "pid": 76337, "tid": -914061504, "ts": 1716454224984213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225111473, "dur": 59, "args": { "External id": 230991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 230991, "pid": 5, "tid": 7, "ts": 1716454225111473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984240, "dur": 10, "args": { "External id": 230991, "cbid": 211, "correlation": 230991 } }, { "ph": "s", "id": 230991, "pid": 76337, "tid": -914061504, "ts": 1716454224984240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225111533, "dur": 3, "args": { "External id": 230999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 230999, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 230999, "pid": 5, "tid": 7, "ts": 1716454225111533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984284, "dur": 10, "args": { "External id": 230999, "cbid": 211, "correlation": 230999 } }, { "ph": "s", "id": 230999, "pid": 76337, "tid": -914061504, "ts": 1716454224984284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224984350, "dur": 1, "args": { "External id": 231015, "cbid": 251, "correlation": 231015 } }, { "ph": "f", "id": 231015, "pid": 76337, "tid": -914061504, "ts": 1716454224984350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224984355, "dur": 0, "args": { "External id": 231017, "cbid": 251, "correlation": 231017 } }, { "ph": "f", "id": 231017, "pid": 76337, "tid": -914061504, "ts": 1716454224984355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225111538, "dur": 11, "args": { "External id": 231018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231018, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 231018, "pid": 5, "tid": 7, "ts": 1716454225111538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984357, "dur": 11, "args": { "External id": 231018, "cbid": 211, "correlation": 231018 } }, { "ph": "s", "id": 231018, "pid": 76337, "tid": -914061504, "ts": 1716454224984357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225111550, "dur": 5, "args": { "External id": 231020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231020, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 231020, "pid": 5, "tid": 7, "ts": 1716454225111550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984370, "dur": 5, "args": { "External id": 231020, "cbid": 211, "correlation": 231020 } }, { "ph": "s", "id": 231020, "pid": 76337, "tid": -914061504, "ts": 1716454224984370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225111557, "dur": 53, "args": { "External id": 231030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231030, "pid": 5, "tid": 7, "ts": 1716454225111557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224984427, "dur": 556, "args": { "External id": 231030, "cbid": 211, "correlation": 231030 } }, { "ph": "s", "id": 231030, "pid": 76337, "tid": -914061504, "ts": 1716454224984427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225111611, "dur": 50, "args": { "External id": 231050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231050, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 231050, "pid": 5, "tid": 7, "ts": 1716454225111611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985038, "dur": 11, "args": { "External id": 231050, "cbid": 211, "correlation": 231050 } }, { "ph": "s", "id": 231050, "pid": 76337, "tid": -914061504, "ts": 1716454224985038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225111662, "dur": 4, "args": { "External id": 231062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231062, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 231062, "pid": 5, "tid": 7, "ts": 1716454225111662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985059, "dur": 7, "args": { "External id": 231062, "cbid": 211, "correlation": 231062 } }, { "ph": "s", "id": 231062, "pid": 76337, "tid": -914061504, "ts": 1716454224985059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225111667, "dur": 54, "args": { "External id": 231065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231065, "pid": 5, "tid": 7, "ts": 1716454225111667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985079, "dur": 7, "args": { "External id": 231065, "cbid": 211, "correlation": 231065 } }, { "ph": "s", "id": 231065, "pid": 76337, "tid": -914061504, "ts": 1716454224985079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225111723, "dur": 36, "args": { "External id": 231074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231074, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231074, "pid": 5, "tid": 7, "ts": 1716454225111723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985120, "dur": 10, "args": { "External id": 231074, "cbid": 211, "correlation": 231074 } }, { "ph": "s", "id": 231074, "pid": 76337, "tid": -914061504, "ts": 1716454224985120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224985182, "dur": 0, "args": { "External id": 231084, "cbid": 317, "correlation": 231084 } }, { "ph": "f", "id": 231084, "pid": 76337, "tid": -914061504, "ts": 1716454224985182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224985183, "dur": 0, "args": { "External id": 231085, "cbid": 203, "correlation": 231085 } }, { "ph": "f", "id": 231085, "pid": 76337, "tid": -914061504, "ts": 1716454224985183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224985184, "dur": 0, "args": { "External id": 231086, "cbid": 205, "correlation": 231086 } }, { "ph": "f", "id": 231086, "pid": 76337, "tid": -914061504, "ts": 1716454224985184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225111760, "dur": 41, "args": { "External id": 231090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231090, "pid": 5, "tid": 7, "ts": 1716454225111760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985198, "dur": 12, "args": { "External id": 231090, "cbid": 211, "correlation": 231090 } }, { "ph": "s", "id": 231090, "pid": 76337, "tid": -914061504, "ts": 1716454224985198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225111802, "dur": 14, "args": { "External id": 231092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231092, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231092, "pid": 5, "tid": 7, "ts": 1716454225111802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985212, "dur": 6, "args": { "External id": 231092, "cbid": 211, "correlation": 231092 } }, { "ph": "s", "id": 231092, "pid": 76337, "tid": -914061504, "ts": 1716454224985212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225111817, "dur": 3, "args": { "External id": 231094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 231094, "pid": 5, "tid": 7, "ts": 1716454225111817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985222, "dur": 5, "args": { "External id": 231094, "cbid": 211, "correlation": 231094 } }, { "ph": "s", "id": 231094, "pid": 76337, "tid": -914061504, "ts": 1716454224985222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224985230, "dur": 0, "args": { "External id": 231095, "cbid": 51, "correlation": 231095 } }, { "ph": "s", "id": 231095, "pid": 76337, "tid": -914061504, "ts": 1716454224985230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225111822, "dur": 686, "args": { "External id": 231096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231096, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231096, "pid": 5, "tid": 7, "ts": 1716454225111822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985231, "dur": 5, "args": { "External id": 231096, "cbid": 211, "correlation": 231096 } }, { "ph": "s", "id": 231096, "pid": 76337, "tid": -914061504, "ts": 1716454224985231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225112509, "dur": 58, "args": { "External id": 231101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231101, "pid": 5, "tid": 7, "ts": 1716454225112509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985258, "dur": 9, "args": { "External id": 231101, "cbid": 211, "correlation": 231101 } }, { "ph": "s", "id": 231101, "pid": 76337, "tid": -914061504, "ts": 1716454224985258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224985315, "dur": 0, "args": { "External id": 231111, "cbid": 317, "correlation": 231111 } }, { "ph": "f", "id": 231111, "pid": 76337, "tid": -914061504, "ts": 1716454224985315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224985316, "dur": 0, "args": { "External id": 231112, "cbid": 203, "correlation": 231112 } }, { "ph": "f", "id": 231112, "pid": 76337, "tid": -914061504, "ts": 1716454224985316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224985317, "dur": 0, "args": { "External id": 231113, "cbid": 205, "correlation": 231113 } }, { "ph": "f", "id": 231113, "pid": 76337, "tid": -914061504, "ts": 1716454224985317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225112569, "dur": 74, "args": { "External id": 231117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231117, "pid": 5, "tid": 7, "ts": 1716454225112569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985328, "dur": 11, "args": { "External id": 231117, "cbid": 211, "correlation": 231117 } }, { "ph": "s", "id": 231117, "pid": 76337, "tid": -914061504, "ts": 1716454224985328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225112644, "dur": 203, "args": { "External id": 231119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231119, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231119, "pid": 5, "tid": 7, "ts": 1716454225112644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985346, "dur": 7, "args": { "External id": 231119, "cbid": 211, "correlation": 231119 } }, { "ph": "s", "id": 231119, "pid": 76337, "tid": -914061504, "ts": 1716454224985346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225112849, "dur": 39, "args": { "External id": 231121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231121, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231121, "pid": 5, "tid": 7, "ts": 1716454225112849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985356, "dur": 5, "args": { "External id": 231121, "cbid": 211, "correlation": 231121 } }, { "ph": "s", "id": 231121, "pid": 76337, "tid": -914061504, "ts": 1716454224985356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225112889, "dur": 59, "args": { "External id": 231127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231127, "pid": 5, "tid": 7, "ts": 1716454225112889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985383, "dur": 500, "args": { "External id": 231127, "cbid": 211, "correlation": 231127 } }, { "ph": "s", "id": 231127, "pid": 76337, "tid": -914061504, "ts": 1716454224985383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225112949, "dur": 50, "args": { "External id": 231135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231135, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231135, "pid": 5, "tid": 7, "ts": 1716454225112949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985904, "dur": 8, "args": { "External id": 231135, "cbid": 211, "correlation": 231135 } }, { "ph": "s", "id": 231135, "pid": 76337, "tid": -914061504, "ts": 1716454224985904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225113000, "dur": 36, "args": { "External id": 231143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231143, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231143, "pid": 5, "tid": 7, "ts": 1716454225113000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224985934, "dur": 8, "args": { "External id": 231143, "cbid": 211, "correlation": 231143 } }, { "ph": "s", "id": 231143, "pid": 76337, "tid": -914061504, "ts": 1716454224985934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225113036, "dur": 52, "args": { "External id": 231163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231163, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 231163, "pid": 5, "tid": 7, "ts": 1716454225113036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986024, "dur": 14, "args": { "External id": 231163, "cbid": 211, "correlation": 231163 } }, { "ph": "s", "id": 231163, "pid": 76337, "tid": -914061504, "ts": 1716454224986024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225113090, "dur": 4, "args": { "External id": 231175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231175, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 231175, "pid": 5, "tid": 7, "ts": 1716454225113090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986047, "dur": 6, "args": { "External id": 231175, "cbid": 211, "correlation": 231175 } }, { "ph": "s", "id": 231175, "pid": 76337, "tid": -914061504, "ts": 1716454224986047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225113095, "dur": 54, "args": { "External id": 231178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231178, "pid": 5, "tid": 7, "ts": 1716454225113095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986065, "dur": 6, "args": { "External id": 231178, "cbid": 211, "correlation": 231178 } }, { "ph": "s", "id": 231178, "pid": 76337, "tid": -914061504, "ts": 1716454224986065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224986122, "dur": 0, "args": { "External id": 231189, "cbid": 317, "correlation": 231189 } }, { "ph": "f", "id": 231189, "pid": 76337, "tid": -914061504, "ts": 1716454224986122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224986122, "dur": 0, "args": { "External id": 231190, "cbid": 203, "correlation": 231190 } }, { "ph": "f", "id": 231190, "pid": 76337, "tid": -914061504, "ts": 1716454224986122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224986123, "dur": 0, "args": { "External id": 231191, "cbid": 205, "correlation": 231191 } }, { "ph": "f", "id": 231191, "pid": 76337, "tid": -914061504, "ts": 1716454224986123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986146, "dur": 1, "args": { "External id": 231195, "cbid": 251, "correlation": 231195 } }, { "ph": "f", "id": 231195, "pid": 76337, "tid": -914061504, "ts": 1716454224986146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986148, "dur": 0, "args": { "External id": 231196, "cbid": 251, "correlation": 231196 } }, { "ph": "f", "id": 231196, "pid": 76337, "tid": -914061504, "ts": 1716454224986148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986149, "dur": 1, "args": { "External id": 231197, "cbid": 251, "correlation": 231197 } }, { "ph": "f", "id": 231197, "pid": 76337, "tid": -914061504, "ts": 1716454224986149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986150, "dur": 0, "args": { "External id": 231198, "cbid": 251, "correlation": 231198 } }, { "ph": "f", "id": 231198, "pid": 76337, "tid": -914061504, "ts": 1716454224986150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986151, "dur": 0, "args": { "External id": 231199, "cbid": 251, "correlation": 231199 } }, { "ph": "f", "id": 231199, "pid": 76337, "tid": -914061504, "ts": 1716454224986151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986151, "dur": 0, "args": { "External id": 231200, "cbid": 251, "correlation": 231200 } }, { "ph": "f", "id": 231200, "pid": 76337, "tid": -914061504, "ts": 1716454224986151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986152, "dur": 0, "args": { "External id": 231201, "cbid": 251, "correlation": 231201 } }, { "ph": "f", "id": 231201, "pid": 76337, "tid": -914061504, "ts": 1716454224986152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986153, "dur": 0, "args": { "External id": 231202, "cbid": 251, "correlation": 231202 } }, { "ph": "f", "id": 231202, "pid": 76337, "tid": -914061504, "ts": 1716454224986153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986154, "dur": 0, "args": { "External id": 231203, "cbid": 251, "correlation": 231203 } }, { "ph": "f", "id": 231203, "pid": 76337, "tid": -914061504, "ts": 1716454224986154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225113151, "dur": 111, "args": { "External id": 231204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231204, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 231204, "pid": 5, "tid": 7, "ts": 1716454225113151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986156, "dur": 12, "args": { "External id": 231204, "cbid": 211, "correlation": 231204 } }, { "ph": "s", "id": 231204, "pid": 76337, "tid": -914061504, "ts": 1716454224986156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225113263, "dur": 58, "args": { "External id": 231210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231210, "pid": 5, "tid": 7, "ts": 1716454225113263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986192, "dur": 9, "args": { "External id": 231210, "cbid": 211, "correlation": 231210 } }, { "ph": "s", "id": 231210, "pid": 76337, "tid": -914061504, "ts": 1716454224986192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225113322, "dur": 459, "args": { "External id": 231219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231219, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231219, "pid": 5, "tid": 7, "ts": 1716454225113322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986274, "dur": 14, "args": { "External id": 231219, "cbid": 211, "correlation": 231219 } }, { "ph": "s", "id": 231219, "pid": 76337, "tid": -914061504, "ts": 1716454224986274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225113783, "dur": 176, "args": { "External id": 231241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231241, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231241, "pid": 5, "tid": 7, "ts": 1716454225113783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986330, "dur": 10, "args": { "External id": 231241, "cbid": 211, "correlation": 231241 } }, { "ph": "s", "id": 231241, "pid": 76337, "tid": -914061504, "ts": 1716454224986330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986415, "dur": 1, "args": { "External id": 231252, "cbid": 251, "correlation": 231252 } }, { "ph": "f", "id": 231252, "pid": 76337, "tid": -914061504, "ts": 1716454224986415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225113961, "dur": 191, "args": { "External id": 231253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231253, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231253, "pid": 5, "tid": 7, "ts": 1716454225113961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986420, "dur": 14, "args": { "External id": 231253, "cbid": 211, "correlation": 231253 } }, { "ph": "s", "id": 231253, "pid": 76337, "tid": -914061504, "ts": 1716454224986420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986490, "dur": 1, "args": { "External id": 231264, "cbid": 251, "correlation": 231264 } }, { "ph": "f", "id": 231264, "pid": 76337, "tid": -914061504, "ts": 1716454224986490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225114153, "dur": 185, "args": { "External id": 231265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231265, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231265, "pid": 5, "tid": 7, "ts": 1716454225114153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986494, "dur": 11, "args": { "External id": 231265, "cbid": 211, "correlation": 231265 } }, { "ph": "s", "id": 231265, "pid": 76337, "tid": -914061504, "ts": 1716454224986494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986556, "dur": 1, "args": { "External id": 231276, "cbid": 251, "correlation": 231276 } }, { "ph": "f", "id": 231276, "pid": 76337, "tid": -914061504, "ts": 1716454224986556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225114339, "dur": 183, "args": { "External id": 231277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231277, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231277, "pid": 5, "tid": 7, "ts": 1716454225114339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986560, "dur": 11, "args": { "External id": 231277, "cbid": 211, "correlation": 231277 } }, { "ph": "s", "id": 231277, "pid": 76337, "tid": -914061504, "ts": 1716454224986560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225114524, "dur": 18148, "args": { "External id": 231298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231298, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 231298, "pid": 5, "tid": 7, "ts": 1716454225114524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986640, "dur": 12, "args": { "External id": 231298, "cbid": 211, "correlation": 231298 } }, { "ph": "s", "id": 231298, "pid": 76337, "tid": -914061504, "ts": 1716454224986640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224986735, "dur": 1, "args": { "External id": 231316, "cbid": 251, "correlation": 231316 } }, { "ph": "f", "id": 231316, "pid": 76337, "tid": -914061504, "ts": 1716454224986735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225132673, "dur": 199, "args": { "External id": 231318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231318, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231318, "pid": 5, "tid": 7, "ts": 1716454225132673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986741, "dur": 13, "args": { "External id": 231318, "cbid": 211, "correlation": 231318 } }, { "ph": "s", "id": 231318, "pid": 76337, "tid": -914061504, "ts": 1716454224986741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225132874, "dur": 67, "args": { "External id": 231326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231326, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231326, "pid": 5, "tid": 7, "ts": 1716454225132874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986810, "dur": 12, "args": { "External id": 231326, "cbid": 211, "correlation": 231326 } }, { "ph": "s", "id": 231326, "pid": 76337, "tid": -914061504, "ts": 1716454224986810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225132942, "dur": 97, "args": { "External id": 231334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231334, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231334, "pid": 5, "tid": 7, "ts": 1716454225132942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224986849, "dur": 99, "args": { "External id": 231334, "cbid": 211, "correlation": 231334 } }, { "ph": "s", "id": 231334, "pid": 76337, "tid": -914061504, "ts": 1716454224986849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225133040, "dur": 54, "args": { "External id": 231345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231345, "pid": 5, "tid": 7, "ts": 1716454225133040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224987023, "dur": 1838, "args": { "External id": 231345, "cbid": 211, "correlation": 231345 } }, { "ph": "s", "id": 231345, "pid": 76337, "tid": -914061504, "ts": 1716454224987023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225133094, "dur": 90, "args": { "External id": 231367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231367, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231367, "pid": 5, "tid": 7, "ts": 1716454225133094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224988882, "dur": 123, "args": { "External id": 231367, "cbid": 211, "correlation": 231367 } }, { "ph": "s", "id": 231367, "pid": 76337, "tid": -914061504, "ts": 1716454224988882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989084, "dur": 1, "args": { "External id": 231378, "cbid": 251, "correlation": 231378 } }, { "ph": "f", "id": 231378, "pid": 76337, "tid": -914061504, "ts": 1716454224989084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225133185, "dur": 103, "args": { "External id": 231379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231379, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231379, "pid": 5, "tid": 7, "ts": 1716454225133185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989089, "dur": 13, "args": { "External id": 231379, "cbid": 211, "correlation": 231379 } }, { "ph": "s", "id": 231379, "pid": 76337, "tid": -914061504, "ts": 1716454224989089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989161, "dur": 1, "args": { "External id": 231390, "cbid": 251, "correlation": 231390 } }, { "ph": "f", "id": 231390, "pid": 76337, "tid": -914061504, "ts": 1716454224989161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989165, "dur": 0, "args": { "External id": 231391, "cbid": 251, "correlation": 231391 } }, { "ph": "f", "id": 231391, "pid": 76337, "tid": -914061504, "ts": 1716454224989165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225133290, "dur": 10, "args": { "External id": 231392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231392, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231392, "pid": 5, "tid": 7, "ts": 1716454225133290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989166, "dur": 12, "args": { "External id": 231392, "cbid": 211, "correlation": 231392 } }, { "ph": "s", "id": 231392, "pid": 76337, "tid": -914061504, "ts": 1716454224989166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225133301, "dur": 5, "args": { "External id": 231394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231394, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 231394, "pid": 5, "tid": 7, "ts": 1716454225133301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989181, "dur": 7, "args": { "External id": 231394, "cbid": 211, "correlation": 231394 } }, { "ph": "s", "id": 231394, "pid": 76337, "tid": -914061504, "ts": 1716454224989181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989242, "dur": 1, "args": { "External id": 231405, "cbid": 251, "correlation": 231405 } }, { "ph": "f", "id": 231405, "pid": 76337, "tid": -914061504, "ts": 1716454224989242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989245, "dur": 0, "args": { "External id": 231406, "cbid": 251, "correlation": 231406 } }, { "ph": "f", "id": 231406, "pid": 76337, "tid": -914061504, "ts": 1716454224989245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225133307, "dur": 6, "args": { "External id": 231407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231407, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231407, "pid": 5, "tid": 7, "ts": 1716454225133307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989247, "dur": 12, "args": { "External id": 231407, "cbid": 211, "correlation": 231407 } }, { "ph": "s", "id": 231407, "pid": 76337, "tid": -914061504, "ts": 1716454224989247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225133314, "dur": 3, "args": { "External id": 231409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231409, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 231409, "pid": 5, "tid": 7, "ts": 1716454225133314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989260, "dur": 6, "args": { "External id": 231409, "cbid": 211, "correlation": 231409 } }, { "ph": "s", "id": 231409, "pid": 76337, "tid": -914061504, "ts": 1716454224989260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225133319, "dur": 152, "args": { "External id": 231430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231430, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 231430, "pid": 5, "tid": 7, "ts": 1716454225133319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989334, "dur": 12, "args": { "External id": 231430, "cbid": 211, "correlation": 231430 } }, { "ph": "s", "id": 231430, "pid": 76337, "tid": -914061504, "ts": 1716454224989334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989430, "dur": 1, "args": { "External id": 231448, "cbid": 251, "correlation": 231448 } }, { "ph": "f", "id": 231448, "pid": 76337, "tid": -914061504, "ts": 1716454224989430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225133473, "dur": 106, "args": { "External id": 231450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231450, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 231450, "pid": 5, "tid": 7, "ts": 1716454225133473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989437, "dur": 13, "args": { "External id": 231450, "cbid": 211, "correlation": 231450 } }, { "ph": "s", "id": 231450, "pid": 76337, "tid": -914061504, "ts": 1716454224989437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225133580, "dur": 34, "args": { "External id": 231458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231458, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231458, "pid": 5, "tid": 7, "ts": 1716454225133580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989505, "dur": 12, "args": { "External id": 231458, "cbid": 211, "correlation": 231458 } }, { "ph": "s", "id": 231458, "pid": 76337, "tid": -914061504, "ts": 1716454224989505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225133615, "dur": 65, "args": { "External id": 231466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231466, "pid": 5, "tid": 7, "ts": 1716454225133615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989546, "dur": 9, "args": { "External id": 231466, "cbid": 211, "correlation": 231466 } }, { "ph": "s", "id": 231466, "pid": 76337, "tid": -914061504, "ts": 1716454224989546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225133682, "dur": 90, "args": { "External id": 231488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231488, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231488, "pid": 5, "tid": 7, "ts": 1716454225133682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989598, "dur": 10, "args": { "External id": 231488, "cbid": 211, "correlation": 231488 } }, { "ph": "s", "id": 231488, "pid": 76337, "tid": -914061504, "ts": 1716454224989598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989682, "dur": 1, "args": { "External id": 231504, "cbid": 251, "correlation": 231504 } }, { "ph": "f", "id": 231504, "pid": 76337, "tid": -914061504, "ts": 1716454224989682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225133773, "dur": 561, "args": { "External id": 231506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231506, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 231506, "pid": 5, "tid": 7, "ts": 1716454225133773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989689, "dur": 13, "args": { "External id": 231506, "cbid": 211, "correlation": 231506 } }, { "ph": "s", "id": 231506, "pid": 76337, "tid": -914061504, "ts": 1716454224989689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225134336, "dur": 240, "args": { "External id": 231514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231514, "pid": 5, "tid": 7, "ts": 1716454225134336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989754, "dur": 12, "args": { "External id": 231514, "cbid": 211, "correlation": 231514 } }, { "ph": "s", "id": 231514, "pid": 76337, "tid": -914061504, "ts": 1716454224989754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225134577, "dur": 252, "args": { "External id": 231522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231522, "pid": 5, "tid": 7, "ts": 1716454225134577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989785, "dur": 9, "args": { "External id": 231522, "cbid": 211, "correlation": 231522 } }, { "ph": "s", "id": 231522, "pid": 76337, "tid": -914061504, "ts": 1716454224989785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989866, "dur": 1, "args": { "External id": 231538, "cbid": 251, "correlation": 231538 } }, { "ph": "f", "id": 231538, "pid": 76337, "tid": -914061504, "ts": 1716454224989866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224989871, "dur": 0, "args": { "External id": 231540, "cbid": 251, "correlation": 231540 } }, { "ph": "f", "id": 231540, "pid": 76337, "tid": -914061504, "ts": 1716454224989871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225134830, "dur": 351, "args": { "External id": 231541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231541, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231541, "pid": 5, "tid": 7, "ts": 1716454225134830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989873, "dur": 12, "args": { "External id": 231541, "cbid": 211, "correlation": 231541 } }, { "ph": "s", "id": 231541, "pid": 76337, "tid": -914061504, "ts": 1716454224989873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225135182, "dur": 50, "args": { "External id": 231549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231549, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231549, "pid": 5, "tid": 7, "ts": 1716454225135182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224989915, "dur": 180, "args": { "External id": 231549, "cbid": 211, "correlation": 231549 } }, { "ph": "s", "id": 231549, "pid": 76337, "tid": -914061504, "ts": 1716454224989915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225135234, "dur": 154, "args": { "External id": 231560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231560, "pid": 5, "tid": 7, "ts": 1716454225135234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990153, "dur": 67, "args": { "External id": 231560, "cbid": 211, "correlation": 231560 } }, { "ph": "s", "id": 231560, "pid": 76337, "tid": -914061504, "ts": 1716454224990153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224990274, "dur": 0, "args": { "External id": 231572, "cbid": 317, "correlation": 231572 } }, { "ph": "f", "id": 231572, "pid": 76337, "tid": -914061504, "ts": 1716454224990274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224990275, "dur": 0, "args": { "External id": 231573, "cbid": 203, "correlation": 231573 } }, { "ph": "f", "id": 231573, "pid": 76337, "tid": -914061504, "ts": 1716454224990275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224990277, "dur": 0, "args": { "External id": 231574, "cbid": 205, "correlation": 231574 } }, { "ph": "f", "id": 231574, "pid": 76337, "tid": -914061504, "ts": 1716454224990277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990300, "dur": 1, "args": { "External id": 231578, "cbid": 251, "correlation": 231578 } }, { "ph": "f", "id": 231578, "pid": 76337, "tid": -914061504, "ts": 1716454224990300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990302, "dur": 0, "args": { "External id": 231579, "cbid": 251, "correlation": 231579 } }, { "ph": "f", "id": 231579, "pid": 76337, "tid": -914061504, "ts": 1716454224990302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990302, "dur": 0, "args": { "External id": 231580, "cbid": 251, "correlation": 231580 } }, { "ph": "f", "id": 231580, "pid": 76337, "tid": -914061504, "ts": 1716454224990302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990303, "dur": 0, "args": { "External id": 231581, "cbid": 251, "correlation": 231581 } }, { "ph": "f", "id": 231581, "pid": 76337, "tid": -914061504, "ts": 1716454224990303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990304, "dur": 0, "args": { "External id": 231582, "cbid": 251, "correlation": 231582 } }, { "ph": "f", "id": 231582, "pid": 76337, "tid": -914061504, "ts": 1716454224990304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990304, "dur": 0, "args": { "External id": 231583, "cbid": 251, "correlation": 231583 } }, { "ph": "f", "id": 231583, "pid": 76337, "tid": -914061504, "ts": 1716454224990304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990305, "dur": 0, "args": { "External id": 231584, "cbid": 251, "correlation": 231584 } }, { "ph": "f", "id": 231584, "pid": 76337, "tid": -914061504, "ts": 1716454224990305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990306, "dur": 0, "args": { "External id": 231585, "cbid": 251, "correlation": 231585 } }, { "ph": "f", "id": 231585, "pid": 76337, "tid": -914061504, "ts": 1716454224990306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224990307, "dur": 0, "args": { "External id": 231586, "cbid": 251, "correlation": 231586 } }, { "ph": "f", "id": 231586, "pid": 76337, "tid": -914061504, "ts": 1716454224990307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225135389, "dur": 115, "args": { "External id": 231587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231587, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 231587, "pid": 5, "tid": 7, "ts": 1716454225135389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990309, "dur": 40, "args": { "External id": 231587, "cbid": 211, "correlation": 231587 } }, { "ph": "s", "id": 231587, "pid": 76337, "tid": -914061504, "ts": 1716454224990309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225135506, "dur": 59, "args": { "External id": 231593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231593, "pid": 5, "tid": 7, "ts": 1716454225135506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990372, "dur": 281, "args": { "External id": 231593, "cbid": 211, "correlation": 231593 } }, { "ph": "s", "id": 231593, "pid": 76337, "tid": -914061504, "ts": 1716454224990372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225135566, "dur": 50, "args": { "External id": 231601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231601, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231601, "pid": 5, "tid": 7, "ts": 1716454225135566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990677, "dur": 9, "args": { "External id": 231601, "cbid": 211, "correlation": 231601 } }, { "ph": "s", "id": 231601, "pid": 76337, "tid": -914061504, "ts": 1716454224990677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225135617, "dur": 51, "args": { "External id": 231621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231621, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 231621, "pid": 5, "tid": 7, "ts": 1716454225135617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990757, "dur": 13, "args": { "External id": 231621, "cbid": 211, "correlation": 231621 } }, { "ph": "s", "id": 231621, "pid": 76337, "tid": -914061504, "ts": 1716454224990757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225135669, "dur": 5, "args": { "External id": 231633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231633, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 231633, "pid": 5, "tid": 7, "ts": 1716454225135669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990780, "dur": 6, "args": { "External id": 231633, "cbid": 211, "correlation": 231633 } }, { "ph": "s", "id": 231633, "pid": 76337, "tid": -914061504, "ts": 1716454224990780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225135675, "dur": 55, "args": { "External id": 231636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231636, "pid": 5, "tid": 7, "ts": 1716454225135675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990798, "dur": 108, "args": { "External id": 231636, "cbid": 211, "correlation": 231636 } }, { "ph": "s", "id": 231636, "pid": 76337, "tid": -914061504, "ts": 1716454224990798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225135732, "dur": 37, "args": { "External id": 231645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231645, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231645, "pid": 5, "tid": 7, "ts": 1716454225135732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224990945, "dur": 10, "args": { "External id": 231645, "cbid": 211, "correlation": 231645 } }, { "ph": "s", "id": 231645, "pid": 76337, "tid": -914061504, "ts": 1716454224990945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224991010, "dur": 0, "args": { "External id": 231655, "cbid": 317, "correlation": 231655 } }, { "ph": "f", "id": 231655, "pid": 76337, "tid": -914061504, "ts": 1716454224991010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224991011, "dur": 0, "args": { "External id": 231656, "cbid": 203, "correlation": 231656 } }, { "ph": "f", "id": 231656, "pid": 76337, "tid": -914061504, "ts": 1716454224991011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224991011, "dur": 0, "args": { "External id": 231657, "cbid": 205, "correlation": 231657 } }, { "ph": "f", "id": 231657, "pid": 76337, "tid": -914061504, "ts": 1716454224991011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225135770, "dur": 41, "args": { "External id": 231661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231661, "pid": 5, "tid": 7, "ts": 1716454225135770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991027, "dur": 13, "args": { "External id": 231661, "cbid": 211, "correlation": 231661 } }, { "ph": "s", "id": 231661, "pid": 76337, "tid": -914061504, "ts": 1716454224991027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225135813, "dur": 3, "args": { "External id": 231663, "device": 5, "context": 1, "stream": 7, "correlation": 231663, "bytes": 46080, "memory bandwidth (GB/s)": 11.80327868852459 } }, { "ph": "f", "id": 231663, "pid": 5, "tid": 7, "ts": 1716454225135813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224991044, "dur": 16, "args": { "External id": 231663, "cbid": 51, "correlation": 231663 } }, { "ph": "s", "id": 231663, "pid": 76337, "tid": -914061504, "ts": 1716454224991044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224991065, "dur": 2, "args": { "External id": 231665, "cbid": 200, "correlation": 231665 } }, { "ph": "f", "id": 231665, "pid": 76337, "tid": -914061504, "ts": 1716454224991065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224991068, "dur": 0, "args": { "External id": 231666, "cbid": 200, "correlation": 231666 } }, { "ph": "f", "id": 231666, "pid": 76337, "tid": -914061504, "ts": 1716454224991068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224991068, "dur": 0, "args": { "External id": 231667, "cbid": 200, "correlation": 231667 } }, { "ph": "f", "id": 231667, "pid": 76337, "tid": -914061504, "ts": 1716454224991068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224991069, "dur": 0, "args": { "External id": 231668, "cbid": 200, "correlation": 231668 } }, { "ph": "f", "id": 231668, "pid": 76337, "tid": -914061504, "ts": 1716454224991069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454224991070, "dur": 4, "args": { "External id": 231669, "cbid": 15, "correlation": 231669 } }, { "ph": "f", "id": 231669, "pid": 76337, "tid": -914061504, "ts": 1716454224991070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224991075, "dur": 1, "args": { "External id": 231670, "cbid": 251, "correlation": 231670 } }, { "ph": "f", "id": 231670, "pid": 76337, "tid": -914061504, "ts": 1716454224991075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454225135818, "dur": 23, "args": { "External id": 231671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231671, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231671, "pid": 5, "tid": 7, "ts": 1716454225135818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991078, "dur": 10, "args": { "External id": 231671, "cbid": 211, "correlation": 231671 } }, { "ph": "s", "id": 231671, "pid": 76337, "tid": -914061504, "ts": 1716454224991078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225135842, "dur": 4, "args": { "External id": 231673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231673, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 231673, "pid": 5, "tid": 7, "ts": 1716454225135842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991094, "dur": 6, "args": { "External id": 231673, "cbid": 211, "correlation": 231673 } }, { "ph": "s", "id": 231673, "pid": 76337, "tid": -914061504, "ts": 1716454224991094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224991103, "dur": 0, "args": { "External id": 231674, "cbid": 51, "correlation": 231674 } }, { "ph": "s", "id": 231674, "pid": 76337, "tid": -914061504, "ts": 1716454224991103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225135847, "dur": 183, "args": { "External id": 231675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231675, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231675, "pid": 5, "tid": 7, "ts": 1716454225135847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991104, "dur": 194, "args": { "External id": 231675, "cbid": 211, "correlation": 231675 } }, { "ph": "s", "id": 231675, "pid": 76337, "tid": -914061504, "ts": 1716454224991104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225136032, "dur": 6, "args": { "External id": 231676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231676, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231676, "pid": 5, "tid": 7, "ts": 1716454225136032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991301, "dur": 5, "args": { "External id": 231676, "cbid": 211, "correlation": 231676 } }, { "ph": "s", "id": 231676, "pid": 76337, "tid": -914061504, "ts": 1716454224991301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225136039, "dur": 5, "args": { "External id": 231682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 231682, "pid": 5, "tid": 7, "ts": 1716454225136039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224991331, "dur": 9, "args": { "External id": 231682, "cbid": 211, "correlation": 231682 } }, { "ph": "s", "id": 231682, "pid": 76337, "tid": -914061504, "ts": 1716454224991331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136045, "dur": 3, "args": { "External id": 231690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231690, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231690, "pid": 5, "tid": 7, "ts": 1716454225136045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993007, "dur": 14, "args": { "External id": 231690, "cbid": 211, "correlation": 231690 } }, { "ph": "s", "id": 231690, "pid": 76337, "tid": -914061504, "ts": 1716454224993007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136049, "dur": 3, "args": { "External id": 231698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231698, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231698, "pid": 5, "tid": 7, "ts": 1716454225136049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993048, "dur": 10, "args": { "External id": 231698, "cbid": 211, "correlation": 231698 } }, { "ph": "s", "id": 231698, "pid": 76337, "tid": -914061504, "ts": 1716454224993048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136054, "dur": 3, "args": { "External id": 231706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231706, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231706, "pid": 5, "tid": 7, "ts": 1716454225136054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993074, "dur": 9, "args": { "External id": 231706, "cbid": 211, "correlation": 231706 } }, { "ph": "s", "id": 231706, "pid": 76337, "tid": -914061504, "ts": 1716454224993074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136057, "dur": 3, "args": { "External id": 231715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231715, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231715, "pid": 5, "tid": 7, "ts": 1716454225136057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993245, "dur": 14, "args": { "External id": 231715, "cbid": 211, "correlation": 231715 } }, { "ph": "s", "id": 231715, "pid": 76337, "tid": -914061504, "ts": 1716454224993245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136061, "dur": 3, "args": { "External id": 231724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231724, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231724, "pid": 5, "tid": 7, "ts": 1716454225136061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993274, "dur": 7, "args": { "External id": 231724, "cbid": 211, "correlation": 231724 } }, { "ph": "s", "id": 231724, "pid": 76337, "tid": -914061504, "ts": 1716454224993274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136065, "dur": 3, "args": { "External id": 231732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231732, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231732, "pid": 5, "tid": 7, "ts": 1716454225136065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993300, "dur": 8, "args": { "External id": 231732, "cbid": 211, "correlation": 231732 } }, { "ph": "s", "id": 231732, "pid": 76337, "tid": -914061504, "ts": 1716454224993300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136070, "dur": 3, "args": { "External id": 231740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231740, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231740, "pid": 5, "tid": 7, "ts": 1716454225136070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993563, "dur": 15, "args": { "External id": 231740, "cbid": 211, "correlation": 231740 } }, { "ph": "s", "id": 231740, "pid": 76337, "tid": -914061504, "ts": 1716454224993563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136074, "dur": 3, "args": { "External id": 231748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231748, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231748, "pid": 5, "tid": 7, "ts": 1716454225136074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224993594, "dur": 8, "args": { "External id": 231748, "cbid": 211, "correlation": 231748 } }, { "ph": "s", "id": 231748, "pid": 76337, "tid": -914061504, "ts": 1716454224993594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136079, "dur": 1, "args": { "External id": 231758, "device": 5, "context": 1, "stream": 7, "correlation": 231758, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 231758, "pid": 5, "tid": 7, "ts": 1716454225136079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224993659, "dur": 34, "args": { "External id": 231758, "cbid": 41, "correlation": 231758 } }, { "ph": "s", "id": 231758, "pid": 76337, "tid": -914061504, "ts": 1716454224993659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224993695, "dur": 142404, "args": { "External id": 231759, "cbid": 131, "correlation": 231759 } }, { "ph": "f", "id": 231759, "pid": 76337, "tid": -914061504, "ts": 1716454224993695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225136377, "dur": 3, "args": { "External id": 231767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231767, "pid": 5, "tid": 7, "ts": 1716454225136377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225136336, "dur": 45, "args": { "External id": 231767, "cbid": 211, "correlation": 231767 } }, { "ph": "s", "id": 231767, "pid": 76337, "tid": -914061504, "ts": 1716454225136336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136495, "dur": 3, "args": { "External id": 231776, "device": 5, "context": 1, "stream": 7, "correlation": 231776, "bytes": 8, "memory bandwidth (GB/s)": 0.0024271844660194173 } }, { "ph": "f", "id": 231776, "pid": 5, "tid": 7, "ts": 1716454225136495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225136456, "dur": 41, "args": { "External id": 231776, "cbid": 41, "correlation": 231776 } }, { "ph": "s", "id": 231776, "pid": 76337, "tid": -914061504, "ts": 1716454225136456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225136610, "dur": 4, "args": { "External id": 231786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231786, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 231786, "pid": 5, "tid": 7, "ts": 1716454225136610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225136593, "dur": 17, "args": { "External id": 231786, "cbid": 211, "correlation": 231786 } }, { "ph": "s", "id": 231786, "pid": 76337, "tid": -914061504, "ts": 1716454225136593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136702, "dur": 1, "args": { "External id": 231796, "device": 5, "context": 1, "stream": 7, "correlation": 231796, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 231796, "pid": 5, "tid": 7, "ts": 1716454225136702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225136682, "dur": 17, "args": { "External id": 231796, "cbid": 41, "correlation": 231796 } }, { "ph": "s", "id": 231796, "pid": 76337, "tid": -914061504, "ts": 1716454225136682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225136701, "dur": 8, "args": { "External id": 231797, "cbid": 131, "correlation": 231797 } }, { "ph": "f", "id": 231797, "pid": 76337, "tid": -914061504, "ts": 1716454225136701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136782, "dur": 3, "args": { "External id": 231804, "device": 5, "context": 1, "stream": 7, "correlation": 231804, "bytes": 98304, "memory bandwidth (GB/s)": 31.03030303030303 } }, { "ph": "f", "id": 231804, "pid": 5, "tid": 7, "ts": 1716454225136782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225136759, "dur": 23, "args": { "External id": 231804, "cbid": 41, "correlation": 231804 } }, { "ph": "s", "id": 231804, "pid": 76337, "tid": -914061504, "ts": 1716454225136759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136881, "dur": 2, "args": { "External id": 231823, "device": 5, "context": 1, "stream": 7, "correlation": 231823, "bytes": 16, "memory bandwidth (GB/s)": 0.005376344086021506 } }, { "ph": "f", "id": 231823, "pid": 5, "tid": 7, "ts": 1716454225136881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225136860, "dur": 20, "args": { "External id": 231823, "cbid": 41, "correlation": 231823 } }, { "ph": "s", "id": 231823, "pid": 76337, "tid": -914061504, "ts": 1716454225136860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225136919, "dur": 3, "args": { "External id": 231829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231829, "pid": 5, "tid": 7, "ts": 1716454225136919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225136908, "dur": 11, "args": { "External id": 231829, "cbid": 211, "correlation": 231829 } }, { "ph": "s", "id": 231829, "pid": 76337, "tid": -914061504, "ts": 1716454225136908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454225136933, "dur": 6, "args": { "External id": 231831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231831, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 231831, "pid": 5, "tid": 7, "ts": 1716454225136933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225136923, "dur": 9, "args": { "External id": 231831, "cbid": 211, "correlation": 231831 } }, { "ph": "s", "id": 231831, "pid": 76337, "tid": -914061504, "ts": 1716454225136923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454225136941, "dur": 3, "args": { "External id": 231833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231833, "pid": 5, "tid": 7, "ts": 1716454225136941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225136934, "dur": 6, "args": { "External id": 231833, "cbid": 211, "correlation": 231833 } }, { "ph": "s", "id": 231833, "pid": 76337, "tid": -914061504, "ts": 1716454225136934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225136984, "dur": 2, "args": { "External id": 231841, "device": 5, "context": 1, "stream": 7, "correlation": 231841, "bytes": 8, "memory bandwidth (GB/s)": 0.0028089887640449437 } }, { "ph": "f", "id": 231841, "pid": 5, "tid": 7, "ts": 1716454225136984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225136961, "dur": 23, "args": { "External id": 231841, "cbid": 41, "correlation": 231841 } }, { "ph": "s", "id": 231841, "pid": 76337, "tid": -914061504, "ts": 1716454225136961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225137039, "dur": 3, "args": { "External id": 231855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231855, "pid": 5, "tid": 7, "ts": 1716454225137039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137027, "dur": 14, "args": { "External id": 231855, "cbid": 211, "correlation": 231855 } }, { "ph": "s", "id": 231855, "pid": 76337, "tid": -914061504, "ts": 1716454225137027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225137060, "dur": 2, "args": { "External id": 231869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231869, "pid": 5, "tid": 7, "ts": 1716454225137060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137052, "dur": 7, "args": { "External id": 231869, "cbid": 211, "correlation": 231869 } }, { "ph": "s", "id": 231869, "pid": 76337, "tid": -914061504, "ts": 1716454225137052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225137097, "dur": 6, "args": { "External id": 231876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231876, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231876, "pid": 5, "tid": 7, "ts": 1716454225137097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137087, "dur": 11, "args": { "External id": 231876, "cbid": 211, "correlation": 231876 } }, { "ph": "s", "id": 231876, "pid": 76337, "tid": -914061504, "ts": 1716454225137087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225137108, "dur": 6, "args": { "External id": 231879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231879, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231879, "pid": 5, "tid": 7, "ts": 1716454225137108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137101, "dur": 7, "args": { "External id": 231879, "cbid": 211, "correlation": 231879 } }, { "ph": "s", "id": 231879, "pid": 76337, "tid": -914061504, "ts": 1716454225137101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454225137117, "dur": 3, "args": { "External id": 231881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231881, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231881, "pid": 5, "tid": 7, "ts": 1716454225137117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137109, "dur": 7, "args": { "External id": 231881, "cbid": 211, "correlation": 231881 } }, { "ph": "s", "id": 231881, "pid": 76337, "tid": -914061504, "ts": 1716454225137109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225137139, "dur": 2, "args": { "External id": 231884, "device": 5, "context": 1, "stream": 7, "correlation": 231884, "bytes": 8, "memory bandwidth (GB/s)": 0.002840909090909091 } }, { "ph": "f", "id": 231884, "pid": 5, "tid": 7, "ts": 1716454225137139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225137126, "dur": 12, "args": { "External id": 231884, "cbid": 41, "correlation": 231884 } }, { "ph": "s", "id": 231884, "pid": 76337, "tid": -914061504, "ts": 1716454225137126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225137205, "dur": 4, "args": { "External id": 231900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231900, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 231900, "pid": 5, "tid": 7, "ts": 1716454225137205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137183, "dur": 23, "args": { "External id": 231900, "cbid": 211, "correlation": 231900 } }, { "ph": "s", "id": 231900, "pid": 76337, "tid": -914061504, "ts": 1716454225137183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225137228, "dur": 3, "args": { "External id": 231905, "device": 5, "context": 1, "stream": 7, "correlation": 231905, "bytes": 1, "memory bandwidth (GB/s)": 0.00028935185185185184 } }, { "ph": "f", "id": 231905, "pid": 5, "tid": 7, "ts": 1716454225137228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225137212, "dur": 14, "args": { "External id": 231905, "cbid": 41, "correlation": 231905 } }, { "ph": "s", "id": 231905, "pid": 76337, "tid": -914061504, "ts": 1716454225137212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225137254, "dur": 1, "args": { "External id": 231911, "device": 5, "context": 1, "stream": 7, "correlation": 231911, "bytes": 1, "memory bandwidth (GB/s)": 0.0005896226415094339 } }, { "ph": "f", "id": 231911, "pid": 5, "tid": 7, "ts": 1716454225137254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225137236, "dur": 27, "args": { "External id": 231911, "cbid": 41, "correlation": 231911 } }, { "ph": "s", "id": 231911, "pid": 76337, "tid": -914061504, "ts": 1716454225137236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225137264, "dur": 4, "args": { "External id": 231912, "cbid": 131, "correlation": 231912 } }, { "ph": "f", "id": 231912, "pid": 76337, "tid": -914061504, "ts": 1716454225137264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225137324, "dur": 3, "args": { "External id": 231920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231920, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231920, "pid": 5, "tid": 7, "ts": 1716454225137324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137310, "dur": 15, "args": { "External id": 231920, "cbid": 211, "correlation": 231920 } }, { "ph": "s", "id": 231920, "pid": 76337, "tid": -914061504, "ts": 1716454225137310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225137357, "dur": 3, "args": { "External id": 231930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231930, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231930, "pid": 5, "tid": 7, "ts": 1716454225137357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137347, "dur": 10, "args": { "External id": 231930, "cbid": 211, "correlation": 231930 } }, { "ph": "s", "id": 231930, "pid": 76337, "tid": -914061504, "ts": 1716454225137347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225137393, "dur": 3, "args": { "External id": 231939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231939, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231939, "pid": 5, "tid": 7, "ts": 1716454225137393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137377, "dur": 15, "args": { "External id": 231939, "cbid": 211, "correlation": 231939 } }, { "ph": "s", "id": 231939, "pid": 76337, "tid": -914061504, "ts": 1716454225137377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225137518, "dur": 12, "args": { "External id": 231949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231949, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231949, "pid": 5, "tid": 7, "ts": 1716454225137518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137504, "dur": 15, "args": { "External id": 231949, "cbid": 211, "correlation": 231949 } }, { "ph": "s", "id": 231949, "pid": 76337, "tid": -914061504, "ts": 1716454225137504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225137560, "dur": 3, "args": { "External id": 231957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231957, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231957, "pid": 5, "tid": 7, "ts": 1716454225137560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137550, "dur": 9, "args": { "External id": 231957, "cbid": 211, "correlation": 231957 } }, { "ph": "s", "id": 231957, "pid": 76337, "tid": -914061504, "ts": 1716454225137550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225137611, "dur": 11, "args": { "External id": 231967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231967, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231967, "pid": 5, "tid": 7, "ts": 1716454225137611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137599, "dur": 12, "args": { "External id": 231967, "cbid": 211, "correlation": 231967 } }, { "ph": "s", "id": 231967, "pid": 76337, "tid": -914061504, "ts": 1716454225137599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225137651, "dur": 10, "args": { "External id": 231975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231975, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231975, "pid": 5, "tid": 7, "ts": 1716454225137651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137638, "dur": 12, "args": { "External id": 231975, "cbid": 211, "correlation": 231975 } }, { "ph": "s", "id": 231975, "pid": 76337, "tid": -914061504, "ts": 1716454225137638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225137681, "dur": 3, "args": { "External id": 231984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231984, "pid": 5, "tid": 7, "ts": 1716454225137681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137672, "dur": 9, "args": { "External id": 231984, "cbid": 211, "correlation": 231984 } }, { "ph": "s", "id": 231984, "pid": 76337, "tid": -914061504, "ts": 1716454225137672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225137708, "dur": 5, "args": { "External id": 231993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 231993, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 231993, "pid": 5, "tid": 7, "ts": 1716454225137708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137699, "dur": 8, "args": { "External id": 231993, "cbid": 211, "correlation": 231993 } }, { "ph": "s", "id": 231993, "pid": 76337, "tid": -914061504, "ts": 1716454225137699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225137750, "dur": 8, "args": { "External id": 232003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232003, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232003, "pid": 5, "tid": 7, "ts": 1716454225137750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225137740, "dur": 11, "args": { "External id": 232003, "cbid": 211, "correlation": 232003 } }, { "ph": "s", "id": 232003, "pid": 76337, "tid": -914061504, "ts": 1716454225137740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138116, "dur": 3, "args": { "External id": 232012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232012, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232012, "pid": 5, "tid": 7, "ts": 1716454225138116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138100, "dur": 16, "args": { "External id": 232012, "cbid": 211, "correlation": 232012 } }, { "ph": "s", "id": 232012, "pid": 76337, "tid": -914061504, "ts": 1716454225138100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138150, "dur": 3, "args": { "External id": 232020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232020, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232020, "pid": 5, "tid": 7, "ts": 1716454225138150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138140, "dur": 10, "args": { "External id": 232020, "cbid": 211, "correlation": 232020 } }, { "ph": "s", "id": 232020, "pid": 76337, "tid": -914061504, "ts": 1716454225138140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225138203, "dur": 1, "args": { "External id": 232030, "device": 5, "context": 1, "stream": 7, "correlation": 232030, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 232030, "pid": 5, "tid": 7, "ts": 1716454225138203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138188, "dur": 12, "args": { "External id": 232030, "cbid": 41, "correlation": 232030 } }, { "ph": "s", "id": 232030, "pid": 76337, "tid": -914061504, "ts": 1716454225138188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225138202, "dur": 9, "args": { "External id": 232031, "cbid": 131, "correlation": 232031 } }, { "ph": "f", "id": 232031, "pid": 76337, "tid": -914061504, "ts": 1716454225138202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138295, "dur": 2, "args": { "External id": 232039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232039, "pid": 5, "tid": 7, "ts": 1716454225138295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138280, "dur": 14, "args": { "External id": 232039, "cbid": 211, "correlation": 232039 } }, { "ph": "s", "id": 232039, "pid": 76337, "tid": -914061504, "ts": 1716454225138280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225138369, "dur": 3, "args": { "External id": 232048, "device": 5, "context": 1, "stream": 7, "correlation": 232048, "bytes": 8, "memory bandwidth (GB/s)": 0.002631578947368421 } }, { "ph": "f", "id": 232048, "pid": 5, "tid": 7, "ts": 1716454225138369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138351, "dur": 18, "args": { "External id": 232048, "cbid": 41, "correlation": 232048 } }, { "ph": "s", "id": 232048, "pid": 76337, "tid": -914061504, "ts": 1716454225138351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225138443, "dur": 3, "args": { "External id": 232058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232058, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232058, "pid": 5, "tid": 7, "ts": 1716454225138443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138429, "dur": 13, "args": { "External id": 232058, "cbid": 211, "correlation": 232058 } }, { "ph": "s", "id": 232058, "pid": 76337, "tid": -914061504, "ts": 1716454225138429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225138494, "dur": 1, "args": { "External id": 232068, "device": 5, "context": 1, "stream": 7, "correlation": 232068, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 232068, "pid": 5, "tid": 7, "ts": 1716454225138494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138480, "dur": 12, "args": { "External id": 232068, "cbid": 41, "correlation": 232068 } }, { "ph": "s", "id": 232068, "pid": 76337, "tid": -914061504, "ts": 1716454225138480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225138493, "dur": 9, "args": { "External id": 232069, "cbid": 131, "correlation": 232069 } }, { "ph": "f", "id": 232069, "pid": 76337, "tid": -914061504, "ts": 1716454225138493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225138555, "dur": 3, "args": { "External id": 232076, "device": 5, "context": 1, "stream": 7, "correlation": 232076, "bytes": 98304, "memory bandwidth (GB/s)": 32 } }, { "ph": "f", "id": 232076, "pid": 5, "tid": 7, "ts": 1716454225138555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138536, "dur": 18, "args": { "External id": 232076, "cbid": 41, "correlation": 232076 } }, { "ph": "s", "id": 232076, "pid": 76337, "tid": -914061504, "ts": 1716454225138536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225138602, "dur": 1, "args": { "External id": 232087, "device": 5, "context": 1, "stream": 7, "correlation": 232087, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 232087, "pid": 5, "tid": 7, "ts": 1716454225138602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138590, "dur": 10, "args": { "External id": 232087, "cbid": 41, "correlation": 232087 } }, { "ph": "s", "id": 232087, "pid": 76337, "tid": -914061504, "ts": 1716454225138590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225138601, "dur": 8, "args": { "External id": 232088, "cbid": 131, "correlation": 232088 } }, { "ph": "f", "id": 232088, "pid": 76337, "tid": -914061504, "ts": 1716454225138601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138651, "dur": 3, "args": { "External id": 232096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232096, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232096, "pid": 5, "tid": 7, "ts": 1716454225138651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138637, "dur": 13, "args": { "External id": 232096, "cbid": 211, "correlation": 232096 } }, { "ph": "s", "id": 232096, "pid": 76337, "tid": -914061504, "ts": 1716454225138637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138681, "dur": 3, "args": { "External id": 232106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232106, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232106, "pid": 5, "tid": 7, "ts": 1716454225138681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138671, "dur": 8, "args": { "External id": 232106, "cbid": 211, "correlation": 232106 } }, { "ph": "s", "id": 232106, "pid": 76337, "tid": -914061504, "ts": 1716454225138671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138702, "dur": 3, "args": { "External id": 232115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232115, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232115, "pid": 5, "tid": 7, "ts": 1716454225138702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138694, "dur": 7, "args": { "External id": 232115, "cbid": 211, "correlation": 232115 } }, { "ph": "s", "id": 232115, "pid": 76337, "tid": -914061504, "ts": 1716454225138694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225138771, "dur": 5, "args": { "External id": 232123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232123, "pid": 5, "tid": 7, "ts": 1716454225138771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138757, "dur": 15, "args": { "External id": 232123, "cbid": 211, "correlation": 232123 } }, { "ph": "s", "id": 232123, "pid": 76337, "tid": -914061504, "ts": 1716454225138757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138811, "dur": 3, "args": { "External id": 232132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232132, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232132, "pid": 5, "tid": 7, "ts": 1716454225138811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138801, "dur": 10, "args": { "External id": 232132, "cbid": 211, "correlation": 232132 } }, { "ph": "s", "id": 232132, "pid": 76337, "tid": -914061504, "ts": 1716454225138801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138835, "dur": 3, "args": { "External id": 232141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232141, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232141, "pid": 5, "tid": 7, "ts": 1716454225138835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138826, "dur": 7, "args": { "External id": 232141, "cbid": 211, "correlation": 232141 } }, { "ph": "s", "id": 232141, "pid": 76337, "tid": -914061504, "ts": 1716454225138826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225138898, "dur": 3, "args": { "External id": 232149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232149, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232149, "pid": 5, "tid": 7, "ts": 1716454225138898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225138887, "dur": 10, "args": { "External id": 232149, "cbid": 211, "correlation": 232149 } }, { "ph": "s", "id": 232149, "pid": 76337, "tid": -914061504, "ts": 1716454225138887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225138955, "dur": 1, "args": { "External id": 232157, "device": 5, "context": 1, "stream": 7, "correlation": 232157, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 232157, "pid": 5, "tid": 7, "ts": 1716454225138955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225138940, "dur": 25, "args": { "External id": 232157, "cbid": 41, "correlation": 232157 } }, { "ph": "s", "id": 232157, "pid": 76337, "tid": -914061504, "ts": 1716454225138940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225138966, "dur": 3, "args": { "External id": 232158, "cbid": 131, "correlation": 232158 } }, { "ph": "f", "id": 232158, "pid": 76337, "tid": -914061504, "ts": 1716454225138966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225139036, "dur": 1, "args": { "External id": 232168, "device": 5, "context": 1, "stream": 7, "correlation": 232168, "bytes": 42, "memory bandwidth (GB/s)": 0.027925531914893616 } }, { "ph": "f", "id": 232168, "pid": 5, "tid": 7, "ts": 1716454225139036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225139022, "dur": 11, "args": { "External id": 232168, "cbid": 41, "correlation": 232168 } }, { "ph": "s", "id": 232168, "pid": 76337, "tid": -914061504, "ts": 1716454225139022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225139033, "dur": 9, "args": { "External id": 232169, "cbid": 131, "correlation": 232169 } }, { "ph": "f", "id": 232169, "pid": 76337, "tid": -914061504, "ts": 1716454225139033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225139093, "dur": 1, "args": { "External id": 232178, "device": 5, "context": 1, "stream": 7, "correlation": 232178, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 232178, "pid": 5, "tid": 7, "ts": 1716454225139093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225139081, "dur": 9, "args": { "External id": 232178, "cbid": 41, "correlation": 232178 } }, { "ph": "s", "id": 232178, "pid": 76337, "tid": -914061504, "ts": 1716454225139081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225139091, "dur": 8, "args": { "External id": 232179, "cbid": 131, "correlation": 232179 } }, { "ph": "f", "id": 232179, "pid": 76337, "tid": -914061504, "ts": 1716454225139091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139170, "dur": 4, "args": { "External id": 232186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232186, "pid": 5, "tid": 7, "ts": 1716454225139170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139154, "dur": 17, "args": { "External id": 232186, "cbid": 211, "correlation": 232186 } }, { "ph": "s", "id": 232186, "pid": 76337, "tid": -914061504, "ts": 1716454225139154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454225139213, "dur": 4, "args": { "External id": 232206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232206, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232206, "pid": 5, "tid": 7, "ts": 1716454225139213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139200, "dur": 15, "args": { "External id": 232206, "cbid": 211, "correlation": 232206 } }, { "ph": "s", "id": 232206, "pid": 76337, "tid": -914061504, "ts": 1716454225139200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139216, "dur": 0, "args": { "External id": 232207, "cbid": 11, "correlation": 232207 } }, { "ph": "f", "id": 232207, "pid": 76337, "tid": -914061504, "ts": 1716454225139216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139216, "dur": 0, "args": { "External id": 232208, "cbid": 11, "correlation": 232208 } }, { "ph": "f", "id": 232208, "pid": 76337, "tid": -914061504, "ts": 1716454225139216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225139229, "dur": 1, "args": { "External id": 232211, "device": 5, "context": 1, "stream": 7, "correlation": 232211, "bytes": 4, "memory bandwidth (GB/s)": 0.0024509803921568627 } }, { "ph": "f", "id": 232211, "pid": 5, "tid": 7, "ts": 1716454225139229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225139217, "dur": 21, "args": { "External id": 232211, "cbid": 41, "correlation": 232211 } }, { "ph": "s", "id": 232211, "pid": 76337, "tid": -914061504, "ts": 1716454225139217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225139239, "dur": 3, "args": { "External id": 232212, "cbid": 131, "correlation": 232212 } }, { "ph": "f", "id": 232212, "pid": 76337, "tid": -914061504, "ts": 1716454225139239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225139279, "dur": 2, "args": { "External id": 232236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232236, "pid": 5, "tid": 7, "ts": 1716454225139279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139260, "dur": 18, "args": { "External id": 232236, "cbid": 211, "correlation": 232236 } }, { "ph": "s", "id": 232236, "pid": 76337, "tid": -914061504, "ts": 1716454225139260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139279, "dur": 0, "args": { "External id": 232237, "cbid": 11, "correlation": 232237 } }, { "ph": "f", "id": 232237, "pid": 76337, "tid": -914061504, "ts": 1716454225139279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139280, "dur": 0, "args": { "External id": 232238, "cbid": 11, "correlation": 232238 } }, { "ph": "f", "id": 232238, "pid": 76337, "tid": -914061504, "ts": 1716454225139280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225139281, "dur": 1, "args": { "External id": 232240, "cbid": 200, "correlation": 232240 } }, { "ph": "f", "id": 232240, "pid": 76337, "tid": -914061504, "ts": 1716454225139281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454225139294, "dur": 4, "args": { "External id": 232242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232242, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232242, "pid": 5, "tid": 7, "ts": 1716454225139294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139285, "dur": 10, "args": { "External id": 232242, "cbid": 211, "correlation": 232242 } }, { "ph": "s", "id": 232242, "pid": 76337, "tid": -914061504, "ts": 1716454225139285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139295, "dur": 0, "args": { "External id": 232243, "cbid": 11, "correlation": 232243 } }, { "ph": "f", "id": 232243, "pid": 76337, "tid": -914061504, "ts": 1716454225139295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225139296, "dur": 0, "args": { "External id": 232244, "cbid": 11, "correlation": 232244 } }, { "ph": "f", "id": 232244, "pid": 76337, "tid": -914061504, "ts": 1716454225139296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225139334, "dur": 1, "args": { "External id": 232251, "device": 5, "context": 1, "stream": 7, "correlation": 232251, "bytes": 8, "memory bandwidth (GB/s)": 0.004901960784313725 } }, { "ph": "f", "id": 232251, "pid": 5, "tid": 7, "ts": 1716454225139334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225139322, "dur": 22, "args": { "External id": 232251, "cbid": 41, "correlation": 232251 } }, { "ph": "s", "id": 232251, "pid": 76337, "tid": -914061504, "ts": 1716454225139322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225139344, "dur": 3, "args": { "External id": 232252, "cbid": 131, "correlation": 232252 } }, { "ph": "f", "id": 232252, "pid": 76337, "tid": -914061504, "ts": 1716454225139344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225139394, "dur": 1, "args": { "External id": 232262, "device": 5, "context": 1, "stream": 7, "correlation": 232262, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 232262, "pid": 5, "tid": 7, "ts": 1716454225139394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225139382, "dur": 10, "args": { "External id": 232262, "cbid": 41, "correlation": 232262 } }, { "ph": "s", "id": 232262, "pid": 76337, "tid": -914061504, "ts": 1716454225139382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225139392, "dur": 7, "args": { "External id": 232263, "cbid": 131, "correlation": 232263 } }, { "ph": "f", "id": 232263, "pid": 76337, "tid": -914061504, "ts": 1716454225139392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139465, "dur": 5, "args": { "External id": 232270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232270, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232270, "pid": 5, "tid": 7, "ts": 1716454225139465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139449, "dur": 15, "args": { "External id": 232270, "cbid": 211, "correlation": 232270 } }, { "ph": "s", "id": 232270, "pid": 76337, "tid": -914061504, "ts": 1716454225139449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139539, "dur": 3, "args": { "External id": 232279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232279, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232279, "pid": 5, "tid": 7, "ts": 1716454225139539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139524, "dur": 14, "args": { "External id": 232279, "cbid": 211, "correlation": 232279 } }, { "ph": "s", "id": 232279, "pid": 76337, "tid": -914061504, "ts": 1716454225139524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139578, "dur": 3, "args": { "External id": 232287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232287, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232287, "pid": 5, "tid": 7, "ts": 1716454225139578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139565, "dur": 12, "args": { "External id": 232287, "cbid": 211, "correlation": 232287 } }, { "ph": "s", "id": 232287, "pid": 76337, "tid": -914061504, "ts": 1716454225139565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139610, "dur": 4, "args": { "External id": 232295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232295, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232295, "pid": 5, "tid": 7, "ts": 1716454225139610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139599, "dur": 11, "args": { "External id": 232295, "cbid": 211, "correlation": 232295 } }, { "ph": "s", "id": 232295, "pid": 76337, "tid": -914061504, "ts": 1716454225139599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139644, "dur": 4, "args": { "External id": 232303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232303, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232303, "pid": 5, "tid": 7, "ts": 1716454225139644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139631, "dur": 12, "args": { "External id": 232303, "cbid": 211, "correlation": 232303 } }, { "ph": "s", "id": 232303, "pid": 76337, "tid": -914061504, "ts": 1716454225139631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139671, "dur": 2, "args": { "External id": 232311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232311, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232311, "pid": 5, "tid": 7, "ts": 1716454225139671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139662, "dur": 8, "args": { "External id": 232311, "cbid": 211, "correlation": 232311 } }, { "ph": "s", "id": 232311, "pid": 76337, "tid": -914061504, "ts": 1716454225139662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139697, "dur": 3, "args": { "External id": 232319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232319, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232319, "pid": 5, "tid": 7, "ts": 1716454225139697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139688, "dur": 9, "args": { "External id": 232319, "cbid": 211, "correlation": 232319 } }, { "ph": "s", "id": 232319, "pid": 76337, "tid": -914061504, "ts": 1716454225139688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139718, "dur": 4, "args": { "External id": 232327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232327, "pid": 5, "tid": 7, "ts": 1716454225139718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139710, "dur": 7, "args": { "External id": 232327, "cbid": 211, "correlation": 232327 } }, { "ph": "s", "id": 232327, "pid": 76337, "tid": -914061504, "ts": 1716454225139710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139738, "dur": 5, "args": { "External id": 232335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232335, "pid": 5, "tid": 7, "ts": 1716454225139738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139730, "dur": 7, "args": { "External id": 232335, "cbid": 211, "correlation": 232335 } }, { "ph": "s", "id": 232335, "pid": 76337, "tid": -914061504, "ts": 1716454225139730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139759, "dur": 3, "args": { "External id": 232343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232343, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232343, "pid": 5, "tid": 7, "ts": 1716454225139759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139750, "dur": 7, "args": { "External id": 232343, "cbid": 211, "correlation": 232343 } }, { "ph": "s", "id": 232343, "pid": 76337, "tid": -914061504, "ts": 1716454225139750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139822, "dur": 3, "args": { "External id": 232351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232351, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232351, "pid": 5, "tid": 7, "ts": 1716454225139822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139809, "dur": 13, "args": { "External id": 232351, "cbid": 211, "correlation": 232351 } }, { "ph": "s", "id": 232351, "pid": 76337, "tid": -914061504, "ts": 1716454225139809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139849, "dur": 4, "args": { "External id": 232359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232359, "pid": 5, "tid": 7, "ts": 1716454225139849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139840, "dur": 8, "args": { "External id": 232359, "cbid": 211, "correlation": 232359 } }, { "ph": "s", "id": 232359, "pid": 76337, "tid": -914061504, "ts": 1716454225139840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225139870, "dur": 4, "args": { "External id": 232367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232367, "pid": 5, "tid": 7, "ts": 1716454225139870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139862, "dur": 7, "args": { "External id": 232367, "cbid": 211, "correlation": 232367 } }, { "ph": "s", "id": 232367, "pid": 76337, "tid": -914061504, "ts": 1716454225139862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225139890, "dur": 3, "args": { "External id": 232375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232375, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 232375, "pid": 5, "tid": 7, "ts": 1716454225139890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225139882, "dur": 6, "args": { "External id": 232375, "cbid": 211, "correlation": 232375 } }, { "ph": "s", "id": 232375, "pid": 76337, "tid": -914061504, "ts": 1716454225139882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225140327, "dur": 5, "args": { "External id": 232384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232384, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232384, "pid": 5, "tid": 7, "ts": 1716454225140327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140309, "dur": 17, "args": { "External id": 232384, "cbid": 211, "correlation": 232384 } }, { "ph": "s", "id": 232384, "pid": 76337, "tid": -914061504, "ts": 1716454225140309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225140364, "dur": 5, "args": { "External id": 232393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232393, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232393, "pid": 5, "tid": 7, "ts": 1716454225140364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140353, "dur": 10, "args": { "External id": 232393, "cbid": 211, "correlation": 232393 } }, { "ph": "s", "id": 232393, "pid": 76337, "tid": -914061504, "ts": 1716454225140353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225140494, "dur": 3, "args": { "External id": 232409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232409, "pid": 5, "tid": 7, "ts": 1716454225140494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140481, "dur": 14, "args": { "External id": 232409, "cbid": 211, "correlation": 232409 } }, { "ph": "s", "id": 232409, "pid": 76337, "tid": -914061504, "ts": 1716454225140481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140529, "dur": 3, "args": { "External id": 232417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232417, "pid": 5, "tid": 7, "ts": 1716454225140529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140520, "dur": 9, "args": { "External id": 232417, "cbid": 211, "correlation": 232417 } }, { "ph": "s", "id": 232417, "pid": 76337, "tid": -914061504, "ts": 1716454225140520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140560, "dur": 3, "args": { "External id": 232425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232425, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232425, "pid": 5, "tid": 7, "ts": 1716454225140560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140550, "dur": 9, "args": { "External id": 232425, "cbid": 211, "correlation": 232425 } }, { "ph": "s", "id": 232425, "pid": 76337, "tid": -914061504, "ts": 1716454225140550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140590, "dur": 3, "args": { "External id": 232433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232433, "pid": 5, "tid": 7, "ts": 1716454225140590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140580, "dur": 9, "args": { "External id": 232433, "cbid": 211, "correlation": 232433 } }, { "ph": "s", "id": 232433, "pid": 76337, "tid": -914061504, "ts": 1716454225140580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225140653, "dur": 4, "args": { "External id": 232445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232445, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232445, "pid": 5, "tid": 7, "ts": 1716454225140653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140637, "dur": 16, "args": { "External id": 232445, "cbid": 211, "correlation": 232445 } }, { "ph": "s", "id": 232445, "pid": 76337, "tid": -914061504, "ts": 1716454225140637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225140701, "dur": 4, "args": { "External id": 232456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232456, "pid": 5, "tid": 7, "ts": 1716454225140701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140688, "dur": 13, "args": { "External id": 232456, "cbid": 211, "correlation": 232456 } }, { "ph": "s", "id": 232456, "pid": 76337, "tid": -914061504, "ts": 1716454225140688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140733, "dur": 2, "args": { "External id": 232464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232464, "pid": 5, "tid": 7, "ts": 1716454225140733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140723, "dur": 9, "args": { "External id": 232464, "cbid": 211, "correlation": 232464 } }, { "ph": "s", "id": 232464, "pid": 76337, "tid": -914061504, "ts": 1716454225140723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140767, "dur": 5, "args": { "External id": 232472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232472, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232472, "pid": 5, "tid": 7, "ts": 1716454225140767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140757, "dur": 10, "args": { "External id": 232472, "cbid": 211, "correlation": 232472 } }, { "ph": "s", "id": 232472, "pid": 76337, "tid": -914061504, "ts": 1716454225140757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225140806, "dur": 5, "args": { "External id": 232480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232480, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232480, "pid": 5, "tid": 7, "ts": 1716454225140806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140791, "dur": 16, "args": { "External id": 232480, "cbid": 211, "correlation": 232480 } }, { "ph": "s", "id": 232480, "pid": 76337, "tid": -914061504, "ts": 1716454225140791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225140838, "dur": 4, "args": { "External id": 232489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232489, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232489, "pid": 5, "tid": 7, "ts": 1716454225140838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140828, "dur": 10, "args": { "External id": 232489, "cbid": 211, "correlation": 232489 } }, { "ph": "s", "id": 232489, "pid": 76337, "tid": -914061504, "ts": 1716454225140828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225140902, "dur": 4, "args": { "External id": 232502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232502, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232502, "pid": 5, "tid": 7, "ts": 1716454225140902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140889, "dur": 13, "args": { "External id": 232502, "cbid": 211, "correlation": 232502 } }, { "ph": "s", "id": 232502, "pid": 76337, "tid": -914061504, "ts": 1716454225140889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225140941, "dur": 5, "args": { "External id": 232512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232512, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 232512, "pid": 5, "tid": 7, "ts": 1716454225140941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225140931, "dur": 10, "args": { "External id": 232512, "cbid": 211, "correlation": 232512 } }, { "ph": "s", "id": 232512, "pid": 76337, "tid": -914061504, "ts": 1716454225140931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225141083, "dur": 5, "args": { "External id": 232529, "cbid": 251, "correlation": 232529 } }, { "ph": "f", "id": 232529, "pid": 76337, "tid": -914061504, "ts": 1716454225141083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454225141114, "dur": 11, "args": { "External id": 232531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232531, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232531, "pid": 5, "tid": 7, "ts": 1716454225141114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141097, "dur": 17, "args": { "External id": 232531, "cbid": 211, "correlation": 232531 } }, { "ph": "s", "id": 232531, "pid": 76337, "tid": -914061504, "ts": 1716454225141097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225141178, "dur": 3, "args": { "External id": 232539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232539, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232539, "pid": 5, "tid": 7, "ts": 1716454225141178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141165, "dur": 13, "args": { "External id": 232539, "cbid": 211, "correlation": 232539 } }, { "ph": "s", "id": 232539, "pid": 76337, "tid": -914061504, "ts": 1716454225141165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225141236, "dur": 2, "args": { "External id": 232555, "cbid": 251, "correlation": 232555 } }, { "ph": "f", "id": 232555, "pid": 76337, "tid": -914061504, "ts": 1716454225141236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225141242, "dur": 0, "args": { "External id": 232557, "cbid": 251, "correlation": 232557 } }, { "ph": "f", "id": 232557, "pid": 76337, "tid": -914061504, "ts": 1716454225141242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225141258, "dur": 13, "args": { "External id": 232558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232558, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232558, "pid": 5, "tid": 7, "ts": 1716454225141258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141245, "dur": 14, "args": { "External id": 232558, "cbid": 211, "correlation": 232558 } }, { "ph": "s", "id": 232558, "pid": 76337, "tid": -914061504, "ts": 1716454225141245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225141274, "dur": 5, "args": { "External id": 232560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232560, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232560, "pid": 5, "tid": 7, "ts": 1716454225141274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141263, "dur": 10, "args": { "External id": 232560, "cbid": 211, "correlation": 232560 } }, { "ph": "s", "id": 232560, "pid": 76337, "tid": -914061504, "ts": 1716454225141263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225141378, "dur": 1, "args": { "External id": 232570, "cbid": 317, "correlation": 232570 } }, { "ph": "f", "id": 232570, "pid": 76337, "tid": -914061504, "ts": 1716454225141378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225141380, "dur": 1, "args": { "External id": 232571, "cbid": 203, "correlation": 232571 } }, { "ph": "f", "id": 232571, "pid": 76337, "tid": -914061504, "ts": 1716454225141380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225141382, "dur": 1, "args": { "External id": 232572, "cbid": 205, "correlation": 232572 } }, { "ph": "f", "id": 232572, "pid": 76337, "tid": -914061504, "ts": 1716454225141382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225141440, "dur": 7, "args": { "External id": 232576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232576, "pid": 5, "tid": 7, "ts": 1716454225141440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141424, "dur": 15, "args": { "External id": 232576, "cbid": 211, "correlation": 232576 } }, { "ph": "s", "id": 232576, "pid": 76337, "tid": -914061504, "ts": 1716454225141424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225141450, "dur": 4, "args": { "External id": 232578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232578, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 232578, "pid": 5, "tid": 7, "ts": 1716454225141450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141443, "dur": 6, "args": { "External id": 232578, "cbid": 211, "correlation": 232578 } }, { "ph": "s", "id": 232578, "pid": 76337, "tid": -914061504, "ts": 1716454225141443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225141470, "dur": 3, "args": { "External id": 232580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232580, "pid": 5, "tid": 7, "ts": 1716454225141470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141461, "dur": 8, "args": { "External id": 232580, "cbid": 211, "correlation": 232580 } }, { "ph": "s", "id": 232580, "pid": 76337, "tid": -914061504, "ts": 1716454225141461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225141476, "dur": 0, "args": { "External id": 232581, "cbid": 51, "correlation": 232581 } }, { "ph": "s", "id": 232581, "pid": 76337, "tid": -914061504, "ts": 1716454225141476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225141487, "dur": 82, "args": { "External id": 232582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232582, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232582, "pid": 5, "tid": 7, "ts": 1716454225141487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141478, "dur": 7, "args": { "External id": 232582, "cbid": 211, "correlation": 232582 } }, { "ph": "s", "id": 232582, "pid": 76337, "tid": -914061504, "ts": 1716454225141478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225141570, "dur": 59, "args": { "External id": 232587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232587, "pid": 5, "tid": 7, "ts": 1716454225141570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225141516, "dur": 14, "args": { "External id": 232587, "cbid": 211, "correlation": 232587 } }, { "ph": "s", "id": 232587, "pid": 76337, "tid": -914061504, "ts": 1716454225141516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225143338, "dur": 51, "args": { "External id": 232607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232607, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 232607, "pid": 5, "tid": 7, "ts": 1716454225143338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143322, "dur": 16, "args": { "External id": 232607, "cbid": 211, "correlation": 232607 } }, { "ph": "s", "id": 232607, "pid": 76337, "tid": -914061504, "ts": 1716454225143322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225143391, "dur": 4, "args": { "External id": 232619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232619, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232619, "pid": 5, "tid": 7, "ts": 1716454225143391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143349, "dur": 8, "args": { "External id": 232619, "cbid": 211, "correlation": 232619 } }, { "ph": "s", "id": 232619, "pid": 76337, "tid": -914061504, "ts": 1716454225143349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225143396, "dur": 56, "args": { "External id": 232622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232622, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232622, "pid": 5, "tid": 7, "ts": 1716454225143396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143377, "dur": 8, "args": { "External id": 232622, "cbid": 211, "correlation": 232622 } }, { "ph": "s", "id": 232622, "pid": 76337, "tid": -914061504, "ts": 1716454225143377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225143453, "dur": 36, "args": { "External id": 232631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232631, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232631, "pid": 5, "tid": 7, "ts": 1716454225143453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143423, "dur": 10, "args": { "External id": 232631, "cbid": 211, "correlation": 232631 } }, { "ph": "s", "id": 232631, "pid": 76337, "tid": -914061504, "ts": 1716454225143423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225143479, "dur": 0, "args": { "External id": 232641, "cbid": 317, "correlation": 232641 } }, { "ph": "f", "id": 232641, "pid": 76337, "tid": -914061504, "ts": 1716454225143479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225143480, "dur": 0, "args": { "External id": 232642, "cbid": 203, "correlation": 232642 } }, { "ph": "f", "id": 232642, "pid": 76337, "tid": -914061504, "ts": 1716454225143480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225143481, "dur": 0, "args": { "External id": 232643, "cbid": 205, "correlation": 232643 } }, { "ph": "f", "id": 232643, "pid": 76337, "tid": -914061504, "ts": 1716454225143481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225143515, "dur": 40, "args": { "External id": 232647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232647, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232647, "pid": 5, "tid": 7, "ts": 1716454225143515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143502, "dur": 13, "args": { "External id": 232647, "cbid": 211, "correlation": 232647 } }, { "ph": "s", "id": 232647, "pid": 76337, "tid": -914061504, "ts": 1716454225143502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225143557, "dur": 14, "args": { "External id": 232649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232649, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232649, "pid": 5, "tid": 7, "ts": 1716454225143557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143518, "dur": 6, "args": { "External id": 232649, "cbid": 211, "correlation": 232649 } }, { "ph": "s", "id": 232649, "pid": 76337, "tid": -914061504, "ts": 1716454225143518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225143572, "dur": 3, "args": { "External id": 232651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232651, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232651, "pid": 5, "tid": 7, "ts": 1716454225143572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143529, "dur": 6, "args": { "External id": 232651, "cbid": 211, "correlation": 232651 } }, { "ph": "s", "id": 232651, "pid": 76337, "tid": -914061504, "ts": 1716454225143529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225143539, "dur": 0, "args": { "External id": 232652, "cbid": 51, "correlation": 232652 } }, { "ph": "s", "id": 232652, "pid": 76337, "tid": -914061504, "ts": 1716454225143539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225143577, "dur": 690, "args": { "External id": 232653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232653, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232653, "pid": 5, "tid": 7, "ts": 1716454225143577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143540, "dur": 6, "args": { "External id": 232653, "cbid": 211, "correlation": 232653 } }, { "ph": "s", "id": 232653, "pid": 76337, "tid": -914061504, "ts": 1716454225143540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225144268, "dur": 58, "args": { "External id": 232658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232658, "pid": 5, "tid": 7, "ts": 1716454225144268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143571, "dur": 9, "args": { "External id": 232658, "cbid": 211, "correlation": 232658 } }, { "ph": "s", "id": 232658, "pid": 76337, "tid": -914061504, "ts": 1716454225143571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225144328, "dur": 4, "args": { "External id": 232666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232666, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232666, "pid": 5, "tid": 7, "ts": 1716454225144328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143615, "dur": 9, "args": { "External id": 232666, "cbid": 211, "correlation": 232666 } }, { "ph": "s", "id": 232666, "pid": 76337, "tid": -914061504, "ts": 1716454225143615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225143681, "dur": 2, "args": { "External id": 232682, "cbid": 251, "correlation": 232682 } }, { "ph": "f", "id": 232682, "pid": 76337, "tid": -914061504, "ts": 1716454225143681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225143686, "dur": 0, "args": { "External id": 232684, "cbid": 251, "correlation": 232684 } }, { "ph": "f", "id": 232684, "pid": 76337, "tid": -914061504, "ts": 1716454225143686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225144333, "dur": 8, "args": { "External id": 232685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232685, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 232685, "pid": 5, "tid": 7, "ts": 1716454225144333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143688, "dur": 12, "args": { "External id": 232685, "cbid": 211, "correlation": 232685 } }, { "ph": "s", "id": 232685, "pid": 76337, "tid": -914061504, "ts": 1716454225143688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225144343, "dur": 4, "args": { "External id": 232687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232687, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 232687, "pid": 5, "tid": 7, "ts": 1716454225144343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143702, "dur": 7, "args": { "External id": 232687, "cbid": 211, "correlation": 232687 } }, { "ph": "s", "id": 232687, "pid": 76337, "tid": -914061504, "ts": 1716454225143702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225144348, "dur": 53, "args": { "External id": 232697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232697, "pid": 5, "tid": 7, "ts": 1716454225144348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143764, "dur": 12, "args": { "External id": 232697, "cbid": 211, "correlation": 232697 } }, { "ph": "s", "id": 232697, "pid": 76337, "tid": -914061504, "ts": 1716454225143764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225144402, "dur": 50, "args": { "External id": 232717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232717, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 232717, "pid": 5, "tid": 7, "ts": 1716454225144402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143830, "dur": 11, "args": { "External id": 232717, "cbid": 211, "correlation": 232717 } }, { "ph": "s", "id": 232717, "pid": 76337, "tid": -914061504, "ts": 1716454225143830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225144454, "dur": 4, "args": { "External id": 232729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232729, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232729, "pid": 5, "tid": 7, "ts": 1716454225144454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143851, "dur": 6, "args": { "External id": 232729, "cbid": 211, "correlation": 232729 } }, { "ph": "s", "id": 232729, "pid": 76337, "tid": -914061504, "ts": 1716454225143851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225144459, "dur": 55, "args": { "External id": 232732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232732, "pid": 5, "tid": 7, "ts": 1716454225144459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143870, "dur": 6, "args": { "External id": 232732, "cbid": 211, "correlation": 232732 } }, { "ph": "s", "id": 232732, "pid": 76337, "tid": -914061504, "ts": 1716454225143870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225144515, "dur": 36, "args": { "External id": 232741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232741, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232741, "pid": 5, "tid": 7, "ts": 1716454225144515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225143910, "dur": 10, "args": { "External id": 232741, "cbid": 211, "correlation": 232741 } }, { "ph": "s", "id": 232741, "pid": 76337, "tid": -914061504, "ts": 1716454225143910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225143988, "dur": 0, "args": { "External id": 232751, "cbid": 317, "correlation": 232751 } }, { "ph": "f", "id": 232751, "pid": 76337, "tid": -914061504, "ts": 1716454225143988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225143989, "dur": 0, "args": { "External id": 232752, "cbid": 203, "correlation": 232752 } }, { "ph": "f", "id": 232752, "pid": 76337, "tid": -914061504, "ts": 1716454225143989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225143990, "dur": 0, "args": { "External id": 232753, "cbid": 205, "correlation": 232753 } }, { "ph": "f", "id": 232753, "pid": 76337, "tid": -914061504, "ts": 1716454225143990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225144553, "dur": 39, "args": { "External id": 232757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232757, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232757, "pid": 5, "tid": 7, "ts": 1716454225144553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144004, "dur": 13, "args": { "External id": 232757, "cbid": 211, "correlation": 232757 } }, { "ph": "s", "id": 232757, "pid": 76337, "tid": -914061504, "ts": 1716454225144004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225144593, "dur": 14, "args": { "External id": 232759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232759, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232759, "pid": 5, "tid": 7, "ts": 1716454225144593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144019, "dur": 5, "args": { "External id": 232759, "cbid": 211, "correlation": 232759 } }, { "ph": "s", "id": 232759, "pid": 76337, "tid": -914061504, "ts": 1716454225144019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225144608, "dur": 3, "args": { "External id": 232761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232761, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232761, "pid": 5, "tid": 7, "ts": 1716454225144608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144029, "dur": 5, "args": { "External id": 232761, "cbid": 211, "correlation": 232761 } }, { "ph": "s", "id": 232761, "pid": 76337, "tid": -914061504, "ts": 1716454225144029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225144037, "dur": 0, "args": { "External id": 232762, "cbid": 51, "correlation": 232762 } }, { "ph": "s", "id": 232762, "pid": 76337, "tid": -914061504, "ts": 1716454225144037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225144613, "dur": 683, "args": { "External id": 232763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232763, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232763, "pid": 5, "tid": 7, "ts": 1716454225144613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144038, "dur": 5, "args": { "External id": 232763, "cbid": 211, "correlation": 232763 } }, { "ph": "s", "id": 232763, "pid": 76337, "tid": -914061504, "ts": 1716454225144038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225145298, "dur": 58, "args": { "External id": 232768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232768, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232768, "pid": 5, "tid": 7, "ts": 1716454225145298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144066, "dur": 9, "args": { "External id": 232768, "cbid": 211, "correlation": 232768 } }, { "ph": "s", "id": 232768, "pid": 76337, "tid": -914061504, "ts": 1716454225144066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225145358, "dur": 50, "args": { "External id": 232776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232776, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232776, "pid": 5, "tid": 7, "ts": 1716454225145358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144099, "dur": 9, "args": { "External id": 232776, "cbid": 211, "correlation": 232776 } }, { "ph": "s", "id": 232776, "pid": 76337, "tid": -914061504, "ts": 1716454225144099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225145409, "dur": 35, "args": { "External id": 232784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232784, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232784, "pid": 5, "tid": 7, "ts": 1716454225145409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144131, "dur": 9, "args": { "External id": 232784, "cbid": 211, "correlation": 232784 } }, { "ph": "s", "id": 232784, "pid": 76337, "tid": -914061504, "ts": 1716454225144131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225145446, "dur": 51, "args": { "External id": 232804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232804, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 232804, "pid": 5, "tid": 7, "ts": 1716454225145446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144216, "dur": 12, "args": { "External id": 232804, "cbid": 211, "correlation": 232804 } }, { "ph": "s", "id": 232804, "pid": 76337, "tid": -914061504, "ts": 1716454225144216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225145498, "dur": 4, "args": { "External id": 232816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232816, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 232816, "pid": 5, "tid": 7, "ts": 1716454225145498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144238, "dur": 6, "args": { "External id": 232816, "cbid": 211, "correlation": 232816 } }, { "ph": "s", "id": 232816, "pid": 76337, "tid": -914061504, "ts": 1716454225144238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225145503, "dur": 55, "args": { "External id": 232819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232819, "pid": 5, "tid": 7, "ts": 1716454225145503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144256, "dur": 7, "args": { "External id": 232819, "cbid": 211, "correlation": 232819 } }, { "ph": "s", "id": 232819, "pid": 76337, "tid": -914061504, "ts": 1716454225144256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225144313, "dur": 0, "args": { "External id": 232830, "cbid": 317, "correlation": 232830 } }, { "ph": "f", "id": 232830, "pid": 76337, "tid": -914061504, "ts": 1716454225144313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225144314, "dur": 0, "args": { "External id": 232831, "cbid": 203, "correlation": 232831 } }, { "ph": "f", "id": 232831, "pid": 76337, "tid": -914061504, "ts": 1716454225144314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225144315, "dur": 0, "args": { "External id": 232832, "cbid": 205, "correlation": 232832 } }, { "ph": "f", "id": 232832, "pid": 76337, "tid": -914061504, "ts": 1716454225144315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144348, "dur": 3, "args": { "External id": 232836, "cbid": 251, "correlation": 232836 } }, { "ph": "f", "id": 232836, "pid": 76337, "tid": -914061504, "ts": 1716454225144348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144352, "dur": 1, "args": { "External id": 232837, "cbid": 251, "correlation": 232837 } }, { "ph": "f", "id": 232837, "pid": 76337, "tid": -914061504, "ts": 1716454225144352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144355, "dur": 1, "args": { "External id": 232838, "cbid": 251, "correlation": 232838 } }, { "ph": "f", "id": 232838, "pid": 76337, "tid": -914061504, "ts": 1716454225144355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144357, "dur": 1, "args": { "External id": 232839, "cbid": 251, "correlation": 232839 } }, { "ph": "f", "id": 232839, "pid": 76337, "tid": -914061504, "ts": 1716454225144357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144359, "dur": 1, "args": { "External id": 232840, "cbid": 251, "correlation": 232840 } }, { "ph": "f", "id": 232840, "pid": 76337, "tid": -914061504, "ts": 1716454225144359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144361, "dur": 1, "args": { "External id": 232841, "cbid": 251, "correlation": 232841 } }, { "ph": "f", "id": 232841, "pid": 76337, "tid": -914061504, "ts": 1716454225144361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144363, "dur": 1, "args": { "External id": 232842, "cbid": 251, "correlation": 232842 } }, { "ph": "f", "id": 232842, "pid": 76337, "tid": -914061504, "ts": 1716454225144363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144365, "dur": 1, "args": { "External id": 232843, "cbid": 251, "correlation": 232843 } }, { "ph": "f", "id": 232843, "pid": 76337, "tid": -914061504, "ts": 1716454225144365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144368, "dur": 0, "args": { "External id": 232844, "cbid": 251, "correlation": 232844 } }, { "ph": "f", "id": 232844, "pid": 76337, "tid": -914061504, "ts": 1716454225144368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225145559, "dur": 113, "args": { "External id": 232845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232845, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 232845, "pid": 5, "tid": 7, "ts": 1716454225145559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144372, "dur": 14, "args": { "External id": 232845, "cbid": 211, "correlation": 232845 } }, { "ph": "s", "id": 232845, "pid": 76337, "tid": -914061504, "ts": 1716454225144372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225145674, "dur": 59, "args": { "External id": 232851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232851, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232851, "pid": 5, "tid": 7, "ts": 1716454225145674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144410, "dur": 9, "args": { "External id": 232851, "cbid": 211, "correlation": 232851 } }, { "ph": "s", "id": 232851, "pid": 76337, "tid": -914061504, "ts": 1716454225144410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225145734, "dur": 537, "args": { "External id": 232860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232860, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232860, "pid": 5, "tid": 7, "ts": 1716454225145734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144506, "dur": 16, "args": { "External id": 232860, "cbid": 211, "correlation": 232860 } }, { "ph": "s", "id": 232860, "pid": 76337, "tid": -914061504, "ts": 1716454225144506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225146272, "dur": 177, "args": { "External id": 232882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232882, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232882, "pid": 5, "tid": 7, "ts": 1716454225146272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144583, "dur": 14, "args": { "External id": 232882, "cbid": 211, "correlation": 232882 } }, { "ph": "s", "id": 232882, "pid": 76337, "tid": -914061504, "ts": 1716454225144583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144701, "dur": 2, "args": { "External id": 232893, "cbid": 251, "correlation": 232893 } }, { "ph": "f", "id": 232893, "pid": 76337, "tid": -914061504, "ts": 1716454225144701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225146450, "dur": 195, "args": { "External id": 232894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232894, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232894, "pid": 5, "tid": 7, "ts": 1716454225146450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144708, "dur": 14, "args": { "External id": 232894, "cbid": 211, "correlation": 232894 } }, { "ph": "s", "id": 232894, "pid": 76337, "tid": -914061504, "ts": 1716454225144708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144782, "dur": 1, "args": { "External id": 232905, "cbid": 251, "correlation": 232905 } }, { "ph": "f", "id": 232905, "pid": 76337, "tid": -914061504, "ts": 1716454225144782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225146646, "dur": 186, "args": { "External id": 232906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232906, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232906, "pid": 5, "tid": 7, "ts": 1716454225146646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144786, "dur": 12, "args": { "External id": 232906, "cbid": 211, "correlation": 232906 } }, { "ph": "s", "id": 232906, "pid": 76337, "tid": -914061504, "ts": 1716454225144786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225144851, "dur": 1, "args": { "External id": 232917, "cbid": 251, "correlation": 232917 } }, { "ph": "f", "id": 232917, "pid": 76337, "tid": -914061504, "ts": 1716454225144851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225146833, "dur": 186, "args": { "External id": 232918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232918, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232918, "pid": 5, "tid": 7, "ts": 1716454225146833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144855, "dur": 11, "args": { "External id": 232918, "cbid": 211, "correlation": 232918 } }, { "ph": "s", "id": 232918, "pid": 76337, "tid": -914061504, "ts": 1716454225144855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225147020, "dur": 18236, "args": { "External id": 232939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232939, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 232939, "pid": 5, "tid": 7, "ts": 1716454225147020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225144967, "dur": 26, "args": { "External id": 232939, "cbid": 211, "correlation": 232939 } }, { "ph": "s", "id": 232939, "pid": 76337, "tid": -914061504, "ts": 1716454225144967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145095, "dur": 2, "args": { "External id": 232957, "cbid": 251, "correlation": 232957 } }, { "ph": "f", "id": 232957, "pid": 76337, "tid": -914061504, "ts": 1716454225145095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225165258, "dur": 198, "args": { "External id": 232959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232959, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 232959, "pid": 5, "tid": 7, "ts": 1716454225165258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145102, "dur": 13, "args": { "External id": 232959, "cbid": 211, "correlation": 232959 } }, { "ph": "s", "id": 232959, "pid": 76337, "tid": -914061504, "ts": 1716454225145102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225165457, "dur": 66, "args": { "External id": 232967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232967, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232967, "pid": 5, "tid": 7, "ts": 1716454225165457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145173, "dur": 13, "args": { "External id": 232967, "cbid": 211, "correlation": 232967 } }, { "ph": "s", "id": 232967, "pid": 76337, "tid": -914061504, "ts": 1716454225145173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225165525, "dur": 96, "args": { "External id": 232975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232975, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232975, "pid": 5, "tid": 7, "ts": 1716454225165525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145213, "dur": 10, "args": { "External id": 232975, "cbid": 211, "correlation": 232975 } }, { "ph": "s", "id": 232975, "pid": 76337, "tid": -914061504, "ts": 1716454225145213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225165622, "dur": 54, "args": { "External id": 232986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 232986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 232986, "pid": 5, "tid": 7, "ts": 1716454225165622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145302, "dur": 14, "args": { "External id": 232986, "cbid": 211, "correlation": 232986 } }, { "ph": "s", "id": 232986, "pid": 76337, "tid": -914061504, "ts": 1716454225145302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225165677, "dur": 90, "args": { "External id": 233008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233008, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233008, "pid": 5, "tid": 7, "ts": 1716454225165677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145336, "dur": 7, "args": { "External id": 233008, "cbid": 211, "correlation": 233008 } }, { "ph": "s", "id": 233008, "pid": 76337, "tid": -914061504, "ts": 1716454225145336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145419, "dur": 1, "args": { "External id": 233019, "cbid": 251, "correlation": 233019 } }, { "ph": "f", "id": 233019, "pid": 76337, "tid": -914061504, "ts": 1716454225145419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225165768, "dur": 101, "args": { "External id": 233020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233020, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233020, "pid": 5, "tid": 7, "ts": 1716454225165768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145424, "dur": 13, "args": { "External id": 233020, "cbid": 211, "correlation": 233020 } }, { "ph": "s", "id": 233020, "pid": 76337, "tid": -914061504, "ts": 1716454225145424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145506, "dur": 2, "args": { "External id": 233031, "cbid": 251, "correlation": 233031 } }, { "ph": "f", "id": 233031, "pid": 76337, "tid": -914061504, "ts": 1716454225145506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145511, "dur": 0, "args": { "External id": 233032, "cbid": 251, "correlation": 233032 } }, { "ph": "f", "id": 233032, "pid": 76337, "tid": -914061504, "ts": 1716454225145511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225165871, "dur": 10, "args": { "External id": 233033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233033, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 233033, "pid": 5, "tid": 7, "ts": 1716454225165871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145513, "dur": 14, "args": { "External id": 233033, "cbid": 211, "correlation": 233033 } }, { "ph": "s", "id": 233033, "pid": 76337, "tid": -914061504, "ts": 1716454225145513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225165882, "dur": 5, "args": { "External id": 233035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233035, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 233035, "pid": 5, "tid": 7, "ts": 1716454225165882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145531, "dur": 7, "args": { "External id": 233035, "cbid": 211, "correlation": 233035 } }, { "ph": "s", "id": 233035, "pid": 76337, "tid": -914061504, "ts": 1716454225145531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145593, "dur": 1, "args": { "External id": 233046, "cbid": 251, "correlation": 233046 } }, { "ph": "f", "id": 233046, "pid": 76337, "tid": -914061504, "ts": 1716454225145593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145597, "dur": 0, "args": { "External id": 233047, "cbid": 251, "correlation": 233047 } }, { "ph": "f", "id": 233047, "pid": 76337, "tid": -914061504, "ts": 1716454225145597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225165888, "dur": 6, "args": { "External id": 233048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233048, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 233048, "pid": 5, "tid": 7, "ts": 1716454225165888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145598, "dur": 12, "args": { "External id": 233048, "cbid": 211, "correlation": 233048 } }, { "ph": "s", "id": 233048, "pid": 76337, "tid": -914061504, "ts": 1716454225145598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225165896, "dur": 3, "args": { "External id": 233050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233050, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 233050, "pid": 5, "tid": 7, "ts": 1716454225165896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145612, "dur": 6, "args": { "External id": 233050, "cbid": 211, "correlation": 233050 } }, { "ph": "s", "id": 233050, "pid": 76337, "tid": -914061504, "ts": 1716454225145612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225165901, "dur": 153, "args": { "External id": 233071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233071, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233071, "pid": 5, "tid": 7, "ts": 1716454225165901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145685, "dur": 12, "args": { "External id": 233071, "cbid": 211, "correlation": 233071 } }, { "ph": "s", "id": 233071, "pid": 76337, "tid": -914061504, "ts": 1716454225145685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225145781, "dur": 2, "args": { "External id": 233089, "cbid": 251, "correlation": 233089 } }, { "ph": "f", "id": 233089, "pid": 76337, "tid": -914061504, "ts": 1716454225145781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225166054, "dur": 105, "args": { "External id": 233091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233091, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233091, "pid": 5, "tid": 7, "ts": 1716454225166054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145788, "dur": 15, "args": { "External id": 233091, "cbid": 211, "correlation": 233091 } }, { "ph": "s", "id": 233091, "pid": 76337, "tid": -914061504, "ts": 1716454225145788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225166161, "dur": 35, "args": { "External id": 233099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233099, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233099, "pid": 5, "tid": 7, "ts": 1716454225166161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145860, "dur": 13, "args": { "External id": 233099, "cbid": 211, "correlation": 233099 } }, { "ph": "s", "id": 233099, "pid": 76337, "tid": -914061504, "ts": 1716454225145860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225166197, "dur": 67, "args": { "External id": 233107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233107, "pid": 5, "tid": 7, "ts": 1716454225166197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145902, "dur": 9, "args": { "External id": 233107, "cbid": 211, "correlation": 233107 } }, { "ph": "s", "id": 233107, "pid": 76337, "tid": -914061504, "ts": 1716454225145902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225166265, "dur": 90, "args": { "External id": 233129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233129, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233129, "pid": 5, "tid": 7, "ts": 1716454225166265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225145952, "dur": 10, "args": { "External id": 233129, "cbid": 211, "correlation": 233129 } }, { "ph": "s", "id": 233129, "pid": 76337, "tid": -914061504, "ts": 1716454225145952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146051, "dur": 1, "args": { "External id": 233145, "cbid": 251, "correlation": 233145 } }, { "ph": "f", "id": 233145, "pid": 76337, "tid": -914061504, "ts": 1716454225146051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225166357, "dur": 567, "args": { "External id": 233147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233147, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233147, "pid": 5, "tid": 7, "ts": 1716454225166357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146057, "dur": 14, "args": { "External id": 233147, "cbid": 211, "correlation": 233147 } }, { "ph": "s", "id": 233147, "pid": 76337, "tid": -914061504, "ts": 1716454225146057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225166925, "dur": 239, "args": { "External id": 233155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233155, "pid": 5, "tid": 7, "ts": 1716454225166925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146135, "dur": 14, "args": { "External id": 233155, "cbid": 211, "correlation": 233155 } }, { "ph": "s", "id": 233155, "pid": 76337, "tid": -914061504, "ts": 1716454225146135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225167165, "dur": 249, "args": { "External id": 233163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233163, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233163, "pid": 5, "tid": 7, "ts": 1716454225167165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146170, "dur": 9, "args": { "External id": 233163, "cbid": 211, "correlation": 233163 } }, { "ph": "s", "id": 233163, "pid": 76337, "tid": -914061504, "ts": 1716454225146170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146254, "dur": 2, "args": { "External id": 233179, "cbid": 251, "correlation": 233179 } }, { "ph": "f", "id": 233179, "pid": 76337, "tid": -914061504, "ts": 1716454225146254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146260, "dur": 0, "args": { "External id": 233181, "cbid": 251, "correlation": 233181 } }, { "ph": "f", "id": 233181, "pid": 76337, "tid": -914061504, "ts": 1716454225146260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225167416, "dur": 356, "args": { "External id": 233182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233182, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 233182, "pid": 5, "tid": 7, "ts": 1716454225167416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146265, "dur": 13, "args": { "External id": 233182, "cbid": 211, "correlation": 233182 } }, { "ph": "s", "id": 233182, "pid": 76337, "tid": -914061504, "ts": 1716454225146265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225167773, "dur": 49, "args": { "External id": 233190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233190, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233190, "pid": 5, "tid": 7, "ts": 1716454225167773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146309, "dur": 10, "args": { "External id": 233190, "cbid": 211, "correlation": 233190 } }, { "ph": "s", "id": 233190, "pid": 76337, "tid": -914061504, "ts": 1716454225146309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225167824, "dur": 153, "args": { "External id": 233201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233201, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233201, "pid": 5, "tid": 7, "ts": 1716454225167824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146379, "dur": 12, "args": { "External id": 233201, "cbid": 211, "correlation": 233201 } }, { "ph": "s", "id": 233201, "pid": 76337, "tid": -914061504, "ts": 1716454225146379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225146445, "dur": 0, "args": { "External id": 233213, "cbid": 317, "correlation": 233213 } }, { "ph": "f", "id": 233213, "pid": 76337, "tid": -914061504, "ts": 1716454225146445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225146446, "dur": 0, "args": { "External id": 233214, "cbid": 203, "correlation": 233214 } }, { "ph": "f", "id": 233214, "pid": 76337, "tid": -914061504, "ts": 1716454225146446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225146447, "dur": 0, "args": { "External id": 233215, "cbid": 205, "correlation": 233215 } }, { "ph": "f", "id": 233215, "pid": 76337, "tid": -914061504, "ts": 1716454225146447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146472, "dur": 1, "args": { "External id": 233219, "cbid": 251, "correlation": 233219 } }, { "ph": "f", "id": 233219, "pid": 76337, "tid": -914061504, "ts": 1716454225146472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146474, "dur": 0, "args": { "External id": 233220, "cbid": 251, "correlation": 233220 } }, { "ph": "f", "id": 233220, "pid": 76337, "tid": -914061504, "ts": 1716454225146474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146474, "dur": 0, "args": { "External id": 233221, "cbid": 251, "correlation": 233221 } }, { "ph": "f", "id": 233221, "pid": 76337, "tid": -914061504, "ts": 1716454225146474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146475, "dur": 0, "args": { "External id": 233222, "cbid": 251, "correlation": 233222 } }, { "ph": "f", "id": 233222, "pid": 76337, "tid": -914061504, "ts": 1716454225146475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146476, "dur": 0, "args": { "External id": 233223, "cbid": 251, "correlation": 233223 } }, { "ph": "f", "id": 233223, "pid": 76337, "tid": -914061504, "ts": 1716454225146476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146477, "dur": 0, "args": { "External id": 233224, "cbid": 251, "correlation": 233224 } }, { "ph": "f", "id": 233224, "pid": 76337, "tid": -914061504, "ts": 1716454225146477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146478, "dur": 0, "args": { "External id": 233225, "cbid": 251, "correlation": 233225 } }, { "ph": "f", "id": 233225, "pid": 76337, "tid": -914061504, "ts": 1716454225146478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146479, "dur": 0, "args": { "External id": 233226, "cbid": 251, "correlation": 233226 } }, { "ph": "f", "id": 233226, "pid": 76337, "tid": -914061504, "ts": 1716454225146479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146480, "dur": 0, "args": { "External id": 233227, "cbid": 251, "correlation": 233227 } }, { "ph": "f", "id": 233227, "pid": 76337, "tid": -914061504, "ts": 1716454225146480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225167978, "dur": 111, "args": { "External id": 233228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233228, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233228, "pid": 5, "tid": 7, "ts": 1716454225167978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146482, "dur": 12, "args": { "External id": 233228, "cbid": 211, "correlation": 233228 } }, { "ph": "s", "id": 233228, "pid": 76337, "tid": -914061504, "ts": 1716454225146482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225168091, "dur": 59, "args": { "External id": 233234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233234, "pid": 5, "tid": 7, "ts": 1716454225168091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146517, "dur": 9, "args": { "External id": 233234, "cbid": 211, "correlation": 233234 } }, { "ph": "s", "id": 233234, "pid": 76337, "tid": -914061504, "ts": 1716454225146517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225168151, "dur": 50, "args": { "External id": 233242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233242, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233242, "pid": 5, "tid": 7, "ts": 1716454225168151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146550, "dur": 9, "args": { "External id": 233242, "cbid": 211, "correlation": 233242 } }, { "ph": "s", "id": 233242, "pid": 76337, "tid": -914061504, "ts": 1716454225146550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225168203, "dur": 51, "args": { "External id": 233262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 233262, "pid": 5, "tid": 7, "ts": 1716454225168203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146623, "dur": 11, "args": { "External id": 233262, "cbid": 211, "correlation": 233262 } }, { "ph": "s", "id": 233262, "pid": 76337, "tid": -914061504, "ts": 1716454225146623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225168255, "dur": 4, "args": { "External id": 233274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233274, "pid": 5, "tid": 7, "ts": 1716454225168255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146644, "dur": 7, "args": { "External id": 233274, "cbid": 211, "correlation": 233274 } }, { "ph": "s", "id": 233274, "pid": 76337, "tid": -914061504, "ts": 1716454225146644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225168261, "dur": 56, "args": { "External id": 233277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233277, "pid": 5, "tid": 7, "ts": 1716454225168261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146663, "dur": 7, "args": { "External id": 233277, "cbid": 211, "correlation": 233277 } }, { "ph": "s", "id": 233277, "pid": 76337, "tid": -914061504, "ts": 1716454225146663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225168318, "dur": 37, "args": { "External id": 233286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233286, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233286, "pid": 5, "tid": 7, "ts": 1716454225168318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146702, "dur": 10, "args": { "External id": 233286, "cbid": 211, "correlation": 233286 } }, { "ph": "s", "id": 233286, "pid": 76337, "tid": -914061504, "ts": 1716454225146702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225146754, "dur": 0, "args": { "External id": 233296, "cbid": 317, "correlation": 233296 } }, { "ph": "f", "id": 233296, "pid": 76337, "tid": -914061504, "ts": 1716454225146754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225146755, "dur": 0, "args": { "External id": 233297, "cbid": 203, "correlation": 233297 } }, { "ph": "f", "id": 233297, "pid": 76337, "tid": -914061504, "ts": 1716454225146755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225146756, "dur": 0, "args": { "External id": 233298, "cbid": 205, "correlation": 233298 } }, { "ph": "f", "id": 233298, "pid": 76337, "tid": -914061504, "ts": 1716454225146756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225168357, "dur": 41, "args": { "External id": 233302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233302, "pid": 5, "tid": 7, "ts": 1716454225168357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146770, "dur": 11, "args": { "External id": 233302, "cbid": 211, "correlation": 233302 } }, { "ph": "s", "id": 233302, "pid": 76337, "tid": -914061504, "ts": 1716454225146770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225168399, "dur": 14, "args": { "External id": 233304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233304, "pid": 5, "tid": 7, "ts": 1716454225168399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146784, "dur": 5, "args": { "External id": 233304, "cbid": 211, "correlation": 233304 } }, { "ph": "s", "id": 233304, "pid": 76337, "tid": -914061504, "ts": 1716454225146784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225168415, "dur": 4, "args": { "External id": 233306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233306, "pid": 5, "tid": 7, "ts": 1716454225168415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146793, "dur": 7, "args": { "External id": 233306, "cbid": 211, "correlation": 233306 } }, { "ph": "s", "id": 233306, "pid": 76337, "tid": -914061504, "ts": 1716454225146793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225146803, "dur": 0, "args": { "External id": 233307, "cbid": 51, "correlation": 233307 } }, { "ph": "s", "id": 233307, "pid": 76337, "tid": -914061504, "ts": 1716454225146803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225168419, "dur": 690, "args": { "External id": 233308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233308, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233308, "pid": 5, "tid": 7, "ts": 1716454225168419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146804, "dur": 5, "args": { "External id": 233308, "cbid": 211, "correlation": 233308 } }, { "ph": "s", "id": 233308, "pid": 76337, "tid": -914061504, "ts": 1716454225146804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225169111, "dur": 59, "args": { "External id": 233313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233313, "pid": 5, "tid": 7, "ts": 1716454225169111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146831, "dur": 8, "args": { "External id": 233313, "cbid": 211, "correlation": 233313 } }, { "ph": "s", "id": 233313, "pid": 76337, "tid": -914061504, "ts": 1716454225146831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225169171, "dur": 3, "args": { "External id": 233321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233321, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233321, "pid": 5, "tid": 7, "ts": 1716454225169171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146874, "dur": 9, "args": { "External id": 233321, "cbid": 211, "correlation": 233321 } }, { "ph": "s", "id": 233321, "pid": 76337, "tid": -914061504, "ts": 1716454225146874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146939, "dur": 1, "args": { "External id": 233337, "cbid": 251, "correlation": 233337 } }, { "ph": "f", "id": 233337, "pid": 76337, "tid": -914061504, "ts": 1716454225146939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225146944, "dur": 0, "args": { "External id": 233339, "cbid": 251, "correlation": 233339 } }, { "ph": "f", "id": 233339, "pid": 76337, "tid": -914061504, "ts": 1716454225146944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225169176, "dur": 11, "args": { "External id": 233340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233340, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 233340, "pid": 5, "tid": 7, "ts": 1716454225169176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146945, "dur": 12, "args": { "External id": 233340, "cbid": 211, "correlation": 233340 } }, { "ph": "s", "id": 233340, "pid": 76337, "tid": -914061504, "ts": 1716454225146945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225169188, "dur": 5, "args": { "External id": 233342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233342, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 233342, "pid": 5, "tid": 7, "ts": 1716454225169188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225146960, "dur": 6, "args": { "External id": 233342, "cbid": 211, "correlation": 233342 } }, { "ph": "s", "id": 233342, "pid": 76337, "tid": -914061504, "ts": 1716454225146960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225169195, "dur": 52, "args": { "External id": 233352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233352, "pid": 5, "tid": 7, "ts": 1716454225169195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147024, "dur": 13, "args": { "External id": 233352, "cbid": 211, "correlation": 233352 } }, { "ph": "s", "id": 233352, "pid": 76337, "tid": -914061504, "ts": 1716454225147024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225169248, "dur": 51, "args": { "External id": 233372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233372, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 233372, "pid": 5, "tid": 7, "ts": 1716454225169248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147091, "dur": 11, "args": { "External id": 233372, "cbid": 211, "correlation": 233372 } }, { "ph": "s", "id": 233372, "pid": 76337, "tid": -914061504, "ts": 1716454225147091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225169300, "dur": 4, "args": { "External id": 233384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233384, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233384, "pid": 5, "tid": 7, "ts": 1716454225169300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147112, "dur": 6, "args": { "External id": 233384, "cbid": 211, "correlation": 233384 } }, { "ph": "s", "id": 233384, "pid": 76337, "tid": -914061504, "ts": 1716454225147112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225169305, "dur": 56, "args": { "External id": 233387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233387, "pid": 5, "tid": 7, "ts": 1716454225169305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147130, "dur": 6, "args": { "External id": 233387, "cbid": 211, "correlation": 233387 } }, { "ph": "s", "id": 233387, "pid": 76337, "tid": -914061504, "ts": 1716454225147130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225169363, "dur": 36, "args": { "External id": 233396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233396, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233396, "pid": 5, "tid": 7, "ts": 1716454225169363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147171, "dur": 9, "args": { "External id": 233396, "cbid": 211, "correlation": 233396 } }, { "ph": "s", "id": 233396, "pid": 76337, "tid": -914061504, "ts": 1716454225147171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225147232, "dur": 0, "args": { "External id": 233406, "cbid": 317, "correlation": 233406 } }, { "ph": "f", "id": 233406, "pid": 76337, "tid": -914061504, "ts": 1716454225147232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225147232, "dur": 0, "args": { "External id": 233407, "cbid": 203, "correlation": 233407 } }, { "ph": "f", "id": 233407, "pid": 76337, "tid": -914061504, "ts": 1716454225147232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225147233, "dur": 0, "args": { "External id": 233408, "cbid": 205, "correlation": 233408 } }, { "ph": "f", "id": 233408, "pid": 76337, "tid": -914061504, "ts": 1716454225147233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225169400, "dur": 40, "args": { "External id": 233412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233412, "pid": 5, "tid": 7, "ts": 1716454225169400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147248, "dur": 12, "args": { "External id": 233412, "cbid": 211, "correlation": 233412 } }, { "ph": "s", "id": 233412, "pid": 76337, "tid": -914061504, "ts": 1716454225147248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225169441, "dur": 14, "args": { "External id": 233414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233414, "pid": 5, "tid": 7, "ts": 1716454225169441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147263, "dur": 5, "args": { "External id": 233414, "cbid": 211, "correlation": 233414 } }, { "ph": "s", "id": 233414, "pid": 76337, "tid": -914061504, "ts": 1716454225147263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225169457, "dur": 3, "args": { "External id": 233416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233416, "pid": 5, "tid": 7, "ts": 1716454225169457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147272, "dur": 5, "args": { "External id": 233416, "cbid": 211, "correlation": 233416 } }, { "ph": "s", "id": 233416, "pid": 76337, "tid": -914061504, "ts": 1716454225147272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225147280, "dur": 0, "args": { "External id": 233417, "cbid": 51, "correlation": 233417 } }, { "ph": "s", "id": 233417, "pid": 76337, "tid": -914061504, "ts": 1716454225147280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225169461, "dur": 683, "args": { "External id": 233418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233418, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233418, "pid": 5, "tid": 7, "ts": 1716454225169461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147281, "dur": 5, "args": { "External id": 233418, "cbid": 211, "correlation": 233418 } }, { "ph": "s", "id": 233418, "pid": 76337, "tid": -914061504, "ts": 1716454225147281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225170145, "dur": 58, "args": { "External id": 233423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233423, "pid": 5, "tid": 7, "ts": 1716454225170145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147309, "dur": 8, "args": { "External id": 233423, "cbid": 211, "correlation": 233423 } }, { "ph": "s", "id": 233423, "pid": 76337, "tid": -914061504, "ts": 1716454225147309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225170204, "dur": 50, "args": { "External id": 233431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233431, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233431, "pid": 5, "tid": 7, "ts": 1716454225170204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147341, "dur": 9, "args": { "External id": 233431, "cbid": 211, "correlation": 233431 } }, { "ph": "s", "id": 233431, "pid": 76337, "tid": -914061504, "ts": 1716454225147341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225170256, "dur": 35, "args": { "External id": 233439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233439, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233439, "pid": 5, "tid": 7, "ts": 1716454225170256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147370, "dur": 8, "args": { "External id": 233439, "cbid": 211, "correlation": 233439 } }, { "ph": "s", "id": 233439, "pid": 76337, "tid": -914061504, "ts": 1716454225147370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225170292, "dur": 50, "args": { "External id": 233459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233459, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 233459, "pid": 5, "tid": 7, "ts": 1716454225170292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147447, "dur": 12, "args": { "External id": 233459, "cbid": 211, "correlation": 233459 } }, { "ph": "s", "id": 233459, "pid": 76337, "tid": -914061504, "ts": 1716454225147447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225170344, "dur": 4, "args": { "External id": 233471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233471, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233471, "pid": 5, "tid": 7, "ts": 1716454225170344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147469, "dur": 6, "args": { "External id": 233471, "cbid": 211, "correlation": 233471 } }, { "ph": "s", "id": 233471, "pid": 76337, "tid": -914061504, "ts": 1716454225147469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225170349, "dur": 54, "args": { "External id": 233474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233474, "pid": 5, "tid": 7, "ts": 1716454225170349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147486, "dur": 6, "args": { "External id": 233474, "cbid": 211, "correlation": 233474 } }, { "ph": "s", "id": 233474, "pid": 76337, "tid": -914061504, "ts": 1716454225147486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225147543, "dur": 0, "args": { "External id": 233485, "cbid": 317, "correlation": 233485 } }, { "ph": "f", "id": 233485, "pid": 76337, "tid": -914061504, "ts": 1716454225147543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225147544, "dur": 0, "args": { "External id": 233486, "cbid": 203, "correlation": 233486 } }, { "ph": "f", "id": 233486, "pid": 76337, "tid": -914061504, "ts": 1716454225147544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225147545, "dur": 0, "args": { "External id": 233487, "cbid": 205, "correlation": 233487 } }, { "ph": "f", "id": 233487, "pid": 76337, "tid": -914061504, "ts": 1716454225147545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147567, "dur": 1, "args": { "External id": 233491, "cbid": 251, "correlation": 233491 } }, { "ph": "f", "id": 233491, "pid": 76337, "tid": -914061504, "ts": 1716454225147567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147569, "dur": 0, "args": { "External id": 233492, "cbid": 251, "correlation": 233492 } }, { "ph": "f", "id": 233492, "pid": 76337, "tid": -914061504, "ts": 1716454225147569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147570, "dur": 0, "args": { "External id": 233493, "cbid": 251, "correlation": 233493 } }, { "ph": "f", "id": 233493, "pid": 76337, "tid": -914061504, "ts": 1716454225147570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147570, "dur": 0, "args": { "External id": 233494, "cbid": 251, "correlation": 233494 } }, { "ph": "f", "id": 233494, "pid": 76337, "tid": -914061504, "ts": 1716454225147570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147571, "dur": 0, "args": { "External id": 233495, "cbid": 251, "correlation": 233495 } }, { "ph": "f", "id": 233495, "pid": 76337, "tid": -914061504, "ts": 1716454225147571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147572, "dur": 0, "args": { "External id": 233496, "cbid": 251, "correlation": 233496 } }, { "ph": "f", "id": 233496, "pid": 76337, "tid": -914061504, "ts": 1716454225147572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147573, "dur": 0, "args": { "External id": 233497, "cbid": 251, "correlation": 233497 } }, { "ph": "f", "id": 233497, "pid": 76337, "tid": -914061504, "ts": 1716454225147573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147573, "dur": 0, "args": { "External id": 233498, "cbid": 251, "correlation": 233498 } }, { "ph": "f", "id": 233498, "pid": 76337, "tid": -914061504, "ts": 1716454225147573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147574, "dur": 0, "args": { "External id": 233499, "cbid": 251, "correlation": 233499 } }, { "ph": "f", "id": 233499, "pid": 76337, "tid": -914061504, "ts": 1716454225147574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225170404, "dur": 109, "args": { "External id": 233500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233500, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233500, "pid": 5, "tid": 7, "ts": 1716454225170404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147577, "dur": 12, "args": { "External id": 233500, "cbid": 211, "correlation": 233500 } }, { "ph": "s", "id": 233500, "pid": 76337, "tid": -914061504, "ts": 1716454225147577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225170515, "dur": 58, "args": { "External id": 233506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233506, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233506, "pid": 5, "tid": 7, "ts": 1716454225170515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147611, "dur": 8, "args": { "External id": 233506, "cbid": 211, "correlation": 233506 } }, { "ph": "s", "id": 233506, "pid": 76337, "tid": -914061504, "ts": 1716454225147611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225170574, "dur": 536, "args": { "External id": 233515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233515, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233515, "pid": 5, "tid": 7, "ts": 1716454225170574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147694, "dur": 14, "args": { "External id": 233515, "cbid": 211, "correlation": 233515 } }, { "ph": "s", "id": 233515, "pid": 76337, "tid": -914061504, "ts": 1716454225147694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225171112, "dur": 176, "args": { "External id": 233537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233537, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233537, "pid": 5, "tid": 7, "ts": 1716454225171112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147751, "dur": 10, "args": { "External id": 233537, "cbid": 211, "correlation": 233537 } }, { "ph": "s", "id": 233537, "pid": 76337, "tid": -914061504, "ts": 1716454225147751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147835, "dur": 1, "args": { "External id": 233548, "cbid": 251, "correlation": 233548 } }, { "ph": "f", "id": 233548, "pid": 76337, "tid": -914061504, "ts": 1716454225147835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225171289, "dur": 190, "args": { "External id": 233549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233549, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233549, "pid": 5, "tid": 7, "ts": 1716454225171289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147840, "dur": 14, "args": { "External id": 233549, "cbid": 211, "correlation": 233549 } }, { "ph": "s", "id": 233549, "pid": 76337, "tid": -914061504, "ts": 1716454225147840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147909, "dur": 1, "args": { "External id": 233560, "cbid": 251, "correlation": 233560 } }, { "ph": "f", "id": 233560, "pid": 76337, "tid": -914061504, "ts": 1716454225147909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225171480, "dur": 184, "args": { "External id": 233561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233561, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233561, "pid": 5, "tid": 7, "ts": 1716454225171480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147913, "dur": 12, "args": { "External id": 233561, "cbid": 211, "correlation": 233561 } }, { "ph": "s", "id": 233561, "pid": 76337, "tid": -914061504, "ts": 1716454225147913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225147983, "dur": 1, "args": { "External id": 233572, "cbid": 251, "correlation": 233572 } }, { "ph": "f", "id": 233572, "pid": 76337, "tid": -914061504, "ts": 1716454225147983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225171666, "dur": 186, "args": { "External id": 233573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233573, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233573, "pid": 5, "tid": 7, "ts": 1716454225171666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225147988, "dur": 13, "args": { "External id": 233573, "cbid": 211, "correlation": 233573 } }, { "ph": "s", "id": 233573, "pid": 76337, "tid": -914061504, "ts": 1716454225147988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225171853, "dur": 18206, "args": { "External id": 233594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233594, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233594, "pid": 5, "tid": 7, "ts": 1716454225171853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148070, "dur": 12, "args": { "External id": 233594, "cbid": 211, "correlation": 233594 } }, { "ph": "s", "id": 233594, "pid": 76337, "tid": -914061504, "ts": 1716454225148070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148167, "dur": 1, "args": { "External id": 233612, "cbid": 251, "correlation": 233612 } }, { "ph": "f", "id": 233612, "pid": 76337, "tid": -914061504, "ts": 1716454225148167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225190060, "dur": 197, "args": { "External id": 233614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233614, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233614, "pid": 5, "tid": 7, "ts": 1716454225190060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148173, "dur": 13, "args": { "External id": 233614, "cbid": 211, "correlation": 233614 } }, { "ph": "s", "id": 233614, "pid": 76337, "tid": -914061504, "ts": 1716454225148173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225190259, "dur": 66, "args": { "External id": 233622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233622, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233622, "pid": 5, "tid": 7, "ts": 1716454225190259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148242, "dur": 13, "args": { "External id": 233622, "cbid": 211, "correlation": 233622 } }, { "ph": "s", "id": 233622, "pid": 76337, "tid": -914061504, "ts": 1716454225148242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225190327, "dur": 97, "args": { "External id": 233630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233630, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233630, "pid": 5, "tid": 7, "ts": 1716454225190327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148282, "dur": 8, "args": { "External id": 233630, "cbid": 211, "correlation": 233630 } }, { "ph": "s", "id": 233630, "pid": 76337, "tid": -914061504, "ts": 1716454225148282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225190424, "dur": 53, "args": { "External id": 233641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233641, "pid": 5, "tid": 7, "ts": 1716454225190424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148353, "dur": 12, "args": { "External id": 233641, "cbid": 211, "correlation": 233641 } }, { "ph": "s", "id": 233641, "pid": 76337, "tid": -914061504, "ts": 1716454225148353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225190479, "dur": 90, "args": { "External id": 233663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233663, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233663, "pid": 5, "tid": 7, "ts": 1716454225190479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148384, "dur": 8, "args": { "External id": 233663, "cbid": 211, "correlation": 233663 } }, { "ph": "s", "id": 233663, "pid": 76337, "tid": -914061504, "ts": 1716454225148384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148466, "dur": 1, "args": { "External id": 233674, "cbid": 251, "correlation": 233674 } }, { "ph": "f", "id": 233674, "pid": 76337, "tid": -914061504, "ts": 1716454225148466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225190571, "dur": 103, "args": { "External id": 233675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233675, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233675, "pid": 5, "tid": 7, "ts": 1716454225190571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148471, "dur": 13, "args": { "External id": 233675, "cbid": 211, "correlation": 233675 } }, { "ph": "s", "id": 233675, "pid": 76337, "tid": -914061504, "ts": 1716454225148471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148543, "dur": 1, "args": { "External id": 233686, "cbid": 251, "correlation": 233686 } }, { "ph": "f", "id": 233686, "pid": 76337, "tid": -914061504, "ts": 1716454225148543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148547, "dur": 0, "args": { "External id": 233687, "cbid": 251, "correlation": 233687 } }, { "ph": "f", "id": 233687, "pid": 76337, "tid": -914061504, "ts": 1716454225148547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225190675, "dur": 10, "args": { "External id": 233688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233688, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 233688, "pid": 5, "tid": 7, "ts": 1716454225190675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148549, "dur": 12, "args": { "External id": 233688, "cbid": 211, "correlation": 233688 } }, { "ph": "s", "id": 233688, "pid": 76337, "tid": -914061504, "ts": 1716454225148549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225190686, "dur": 5, "args": { "External id": 233690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233690, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 233690, "pid": 5, "tid": 7, "ts": 1716454225190686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148563, "dur": 6, "args": { "External id": 233690, "cbid": 211, "correlation": 233690 } }, { "ph": "s", "id": 233690, "pid": 76337, "tid": -914061504, "ts": 1716454225148563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148623, "dur": 1, "args": { "External id": 233701, "cbid": 251, "correlation": 233701 } }, { "ph": "f", "id": 233701, "pid": 76337, "tid": -914061504, "ts": 1716454225148623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148626, "dur": 0, "args": { "External id": 233702, "cbid": 251, "correlation": 233702 } }, { "ph": "f", "id": 233702, "pid": 76337, "tid": -914061504, "ts": 1716454225148626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225190693, "dur": 6, "args": { "External id": 233703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233703, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 233703, "pid": 5, "tid": 7, "ts": 1716454225190693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148627, "dur": 12, "args": { "External id": 233703, "cbid": 211, "correlation": 233703 } }, { "ph": "s", "id": 233703, "pid": 76337, "tid": -914061504, "ts": 1716454225148627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225190700, "dur": 3, "args": { "External id": 233705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233705, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 233705, "pid": 5, "tid": 7, "ts": 1716454225190700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148642, "dur": 6, "args": { "External id": 233705, "cbid": 211, "correlation": 233705 } }, { "ph": "s", "id": 233705, "pid": 76337, "tid": -914061504, "ts": 1716454225148642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225190705, "dur": 153, "args": { "External id": 233726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233726, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233726, "pid": 5, "tid": 7, "ts": 1716454225190705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148715, "dur": 12, "args": { "External id": 233726, "cbid": 211, "correlation": 233726 } }, { "ph": "s", "id": 233726, "pid": 76337, "tid": -914061504, "ts": 1716454225148715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225148813, "dur": 1, "args": { "External id": 233744, "cbid": 251, "correlation": 233744 } }, { "ph": "f", "id": 233744, "pid": 76337, "tid": -914061504, "ts": 1716454225148813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225190860, "dur": 105, "args": { "External id": 233746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233746, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233746, "pid": 5, "tid": 7, "ts": 1716454225190860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148818, "dur": 13, "args": { "External id": 233746, "cbid": 211, "correlation": 233746 } }, { "ph": "s", "id": 233746, "pid": 76337, "tid": -914061504, "ts": 1716454225148818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225190965, "dur": 35, "args": { "External id": 233754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233754, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233754, "pid": 5, "tid": 7, "ts": 1716454225190965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148886, "dur": 12, "args": { "External id": 233754, "cbid": 211, "correlation": 233754 } }, { "ph": "s", "id": 233754, "pid": 76337, "tid": -914061504, "ts": 1716454225148886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225191002, "dur": 68, "args": { "External id": 233762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233762, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233762, "pid": 5, "tid": 7, "ts": 1716454225191002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148927, "dur": 9, "args": { "External id": 233762, "cbid": 211, "correlation": 233762 } }, { "ph": "s", "id": 233762, "pid": 76337, "tid": -914061504, "ts": 1716454225148927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225191071, "dur": 90, "args": { "External id": 233784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233784, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233784, "pid": 5, "tid": 7, "ts": 1716454225191071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225148985, "dur": 11, "args": { "External id": 233784, "cbid": 211, "correlation": 233784 } }, { "ph": "s", "id": 233784, "pid": 76337, "tid": -914061504, "ts": 1716454225148985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149072, "dur": 1, "args": { "External id": 233800, "cbid": 251, "correlation": 233800 } }, { "ph": "f", "id": 233800, "pid": 76337, "tid": -914061504, "ts": 1716454225149072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225191162, "dur": 562, "args": { "External id": 233802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233802, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233802, "pid": 5, "tid": 7, "ts": 1716454225191162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149078, "dur": 13, "args": { "External id": 233802, "cbid": 211, "correlation": 233802 } }, { "ph": "s", "id": 233802, "pid": 76337, "tid": -914061504, "ts": 1716454225149078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225191725, "dur": 241, "args": { "External id": 233810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233810, "pid": 5, "tid": 7, "ts": 1716454225191725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149143, "dur": 12, "args": { "External id": 233810, "cbid": 211, "correlation": 233810 } }, { "ph": "s", "id": 233810, "pid": 76337, "tid": -914061504, "ts": 1716454225149143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225191967, "dur": 252, "args": { "External id": 233818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233818, "pid": 5, "tid": 7, "ts": 1716454225191967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149173, "dur": 8, "args": { "External id": 233818, "cbid": 211, "correlation": 233818 } }, { "ph": "s", "id": 233818, "pid": 76337, "tid": -914061504, "ts": 1716454225149173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149254, "dur": 1, "args": { "External id": 233834, "cbid": 251, "correlation": 233834 } }, { "ph": "f", "id": 233834, "pid": 76337, "tid": -914061504, "ts": 1716454225149254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149259, "dur": 0, "args": { "External id": 233836, "cbid": 251, "correlation": 233836 } }, { "ph": "f", "id": 233836, "pid": 76337, "tid": -914061504, "ts": 1716454225149259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225192220, "dur": 356, "args": { "External id": 233837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233837, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 233837, "pid": 5, "tid": 7, "ts": 1716454225192220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149262, "dur": 13, "args": { "External id": 233837, "cbid": 211, "correlation": 233837 } }, { "ph": "s", "id": 233837, "pid": 76337, "tid": -914061504, "ts": 1716454225149262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225192577, "dur": 50, "args": { "External id": 233845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233845, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233845, "pid": 5, "tid": 7, "ts": 1716454225192577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149304, "dur": 9, "args": { "External id": 233845, "cbid": 211, "correlation": 233845 } }, { "ph": "s", "id": 233845, "pid": 76337, "tid": -914061504, "ts": 1716454225149304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225192628, "dur": 154, "args": { "External id": 233856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233856, "pid": 5, "tid": 7, "ts": 1716454225192628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149370, "dur": 13, "args": { "External id": 233856, "cbid": 211, "correlation": 233856 } }, { "ph": "s", "id": 233856, "pid": 76337, "tid": -914061504, "ts": 1716454225149370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225149434, "dur": 0, "args": { "External id": 233868, "cbid": 317, "correlation": 233868 } }, { "ph": "f", "id": 233868, "pid": 76337, "tid": -914061504, "ts": 1716454225149434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225149435, "dur": 0, "args": { "External id": 233869, "cbid": 203, "correlation": 233869 } }, { "ph": "f", "id": 233869, "pid": 76337, "tid": -914061504, "ts": 1716454225149435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225149436, "dur": 0, "args": { "External id": 233870, "cbid": 205, "correlation": 233870 } }, { "ph": "f", "id": 233870, "pid": 76337, "tid": -914061504, "ts": 1716454225149436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149458, "dur": 1, "args": { "External id": 233874, "cbid": 251, "correlation": 233874 } }, { "ph": "f", "id": 233874, "pid": 76337, "tid": -914061504, "ts": 1716454225149458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149460, "dur": 0, "args": { "External id": 233875, "cbid": 251, "correlation": 233875 } }, { "ph": "f", "id": 233875, "pid": 76337, "tid": -914061504, "ts": 1716454225149460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149460, "dur": 0, "args": { "External id": 233876, "cbid": 251, "correlation": 233876 } }, { "ph": "f", "id": 233876, "pid": 76337, "tid": -914061504, "ts": 1716454225149460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149461, "dur": 0, "args": { "External id": 233877, "cbid": 251, "correlation": 233877 } }, { "ph": "f", "id": 233877, "pid": 76337, "tid": -914061504, "ts": 1716454225149461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149462, "dur": 0, "args": { "External id": 233878, "cbid": 251, "correlation": 233878 } }, { "ph": "f", "id": 233878, "pid": 76337, "tid": -914061504, "ts": 1716454225149462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149463, "dur": 0, "args": { "External id": 233879, "cbid": 251, "correlation": 233879 } }, { "ph": "f", "id": 233879, "pid": 76337, "tid": -914061504, "ts": 1716454225149463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149464, "dur": 0, "args": { "External id": 233880, "cbid": 251, "correlation": 233880 } }, { "ph": "f", "id": 233880, "pid": 76337, "tid": -914061504, "ts": 1716454225149464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149464, "dur": 0, "args": { "External id": 233881, "cbid": 251, "correlation": 233881 } }, { "ph": "f", "id": 233881, "pid": 76337, "tid": -914061504, "ts": 1716454225149464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225149466, "dur": 0, "args": { "External id": 233882, "cbid": 251, "correlation": 233882 } }, { "ph": "f", "id": 233882, "pid": 76337, "tid": -914061504, "ts": 1716454225149466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225192784, "dur": 113, "args": { "External id": 233883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233883, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 233883, "pid": 5, "tid": 7, "ts": 1716454225192784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149468, "dur": 12, "args": { "External id": 233883, "cbid": 211, "correlation": 233883 } }, { "ph": "s", "id": 233883, "pid": 76337, "tid": -914061504, "ts": 1716454225149468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225192898, "dur": 59, "args": { "External id": 233889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233889, "pid": 5, "tid": 7, "ts": 1716454225192898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149503, "dur": 9, "args": { "External id": 233889, "cbid": 211, "correlation": 233889 } }, { "ph": "s", "id": 233889, "pid": 76337, "tid": -914061504, "ts": 1716454225149503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225192958, "dur": 49, "args": { "External id": 233897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233897, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233897, "pid": 5, "tid": 7, "ts": 1716454225192958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149536, "dur": 9, "args": { "External id": 233897, "cbid": 211, "correlation": 233897 } }, { "ph": "s", "id": 233897, "pid": 76337, "tid": -914061504, "ts": 1716454225149536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225149610, "dur": 0, "args": { "External id": 233907, "cbid": 317, "correlation": 233907 } }, { "ph": "f", "id": 233907, "pid": 76337, "tid": -914061504, "ts": 1716454225149610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225149611, "dur": 0, "args": { "External id": 233908, "cbid": 203, "correlation": 233908 } }, { "ph": "f", "id": 233908, "pid": 76337, "tid": -914061504, "ts": 1716454225149611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225149611, "dur": 0, "args": { "External id": 233909, "cbid": 205, "correlation": 233909 } }, { "ph": "f", "id": 233909, "pid": 76337, "tid": -914061504, "ts": 1716454225149611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193009, "dur": 41, "args": { "External id": 233913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233913, "pid": 5, "tid": 7, "ts": 1716454225193009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149629, "dur": 13, "args": { "External id": 233913, "cbid": 211, "correlation": 233913 } }, { "ph": "s", "id": 233913, "pid": 76337, "tid": -914061504, "ts": 1716454225149629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193051, "dur": 14, "args": { "External id": 233915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233915, "pid": 5, "tid": 7, "ts": 1716454225193051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149645, "dur": 5, "args": { "External id": 233915, "cbid": 211, "correlation": 233915 } }, { "ph": "s", "id": 233915, "pid": 76337, "tid": -914061504, "ts": 1716454225149645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225193067, "dur": 1, "args": { "External id": 233917, "device": 5, "context": 1, "stream": 7, "correlation": 233917, "bytes": 1536, "memory bandwidth (GB/s)": 0.9061946902654867 } }, { "ph": "f", "id": 233917, "pid": 5, "tid": 7, "ts": 1716454225193067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225149663, "dur": 20, "args": { "External id": 233917, "cbid": 51, "correlation": 233917 } }, { "ph": "s", "id": 233917, "pid": 76337, "tid": -914061504, "ts": 1716454225149663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225193071, "dur": 354, "args": { "External id": 233918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233918, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233918, "pid": 5, "tid": 7, "ts": 1716454225193071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149685, "dur": 10, "args": { "External id": 233918, "cbid": 211, "correlation": 233918 } }, { "ph": "s", "id": 233918, "pid": 76337, "tid": -914061504, "ts": 1716454225149685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193426, "dur": 13, "args": { "External id": 233920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233920, "pid": 5, "tid": 7, "ts": 1716454225193426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149703, "dur": 7, "args": { "External id": 233920, "cbid": 211, "correlation": 233920 } }, { "ph": "s", "id": 233920, "pid": 76337, "tid": -914061504, "ts": 1716454225149703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225193441, "dur": 14, "args": { "External id": 233926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233926, "pid": 5, "tid": 7, "ts": 1716454225193441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149734, "dur": 8, "args": { "External id": 233926, "cbid": 211, "correlation": 233926 } }, { "ph": "s", "id": 233926, "pid": 76337, "tid": -914061504, "ts": 1716454225149734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225193456, "dur": 18, "args": { "External id": 233946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233946, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 233946, "pid": 5, "tid": 7, "ts": 1716454225193456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149826, "dur": 12, "args": { "External id": 233946, "cbid": 211, "correlation": 233946 } }, { "ph": "s", "id": 233946, "pid": 76337, "tid": -914061504, "ts": 1716454225149826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225193476, "dur": 4, "args": { "External id": 233958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233958, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 233958, "pid": 5, "tid": 7, "ts": 1716454225193476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149847, "dur": 6, "args": { "External id": 233958, "cbid": 211, "correlation": 233958 } }, { "ph": "s", "id": 233958, "pid": 76337, "tid": -914061504, "ts": 1716454225149847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225193481, "dur": 17, "args": { "External id": 233961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233961, "pid": 5, "tid": 7, "ts": 1716454225193481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149865, "dur": 7, "args": { "External id": 233961, "cbid": 211, "correlation": 233961 } }, { "ph": "s", "id": 233961, "pid": 76337, "tid": -914061504, "ts": 1716454225149865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225193500, "dur": 11, "args": { "External id": 233970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233970, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233970, "pid": 5, "tid": 7, "ts": 1716454225193500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149904, "dur": 10, "args": { "External id": 233970, "cbid": 211, "correlation": 233970 } }, { "ph": "s", "id": 233970, "pid": 76337, "tid": -914061504, "ts": 1716454225149904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225149959, "dur": 0, "args": { "External id": 233980, "cbid": 317, "correlation": 233980 } }, { "ph": "f", "id": 233980, "pid": 76337, "tid": -914061504, "ts": 1716454225149959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225149960, "dur": 0, "args": { "External id": 233981, "cbid": 203, "correlation": 233981 } }, { "ph": "f", "id": 233981, "pid": 76337, "tid": -914061504, "ts": 1716454225149960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225149960, "dur": 0, "args": { "External id": 233982, "cbid": 205, "correlation": 233982 } }, { "ph": "f", "id": 233982, "pid": 76337, "tid": -914061504, "ts": 1716454225149960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193512, "dur": 11, "args": { "External id": 233986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233986, "pid": 5, "tid": 7, "ts": 1716454225193512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149984, "dur": 12, "args": { "External id": 233986, "cbid": 211, "correlation": 233986 } }, { "ph": "s", "id": 233986, "pid": 76337, "tid": -914061504, "ts": 1716454225149984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193525, "dur": 24, "args": { "External id": 233988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233988, "pid": 5, "tid": 7, "ts": 1716454225193525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225149998, "dur": 5, "args": { "External id": 233988, "cbid": 211, "correlation": 233988 } }, { "ph": "s", "id": 233988, "pid": 76337, "tid": -914061504, "ts": 1716454225149998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225193550, "dur": 4, "args": { "External id": 233990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 233990, "pid": 5, "tid": 7, "ts": 1716454225193550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150008, "dur": 5, "args": { "External id": 233990, "cbid": 211, "correlation": 233990 } }, { "ph": "s", "id": 233990, "pid": 76337, "tid": -914061504, "ts": 1716454225150008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225150017, "dur": 0, "args": { "External id": 233991, "cbid": 51, "correlation": 233991 } }, { "ph": "s", "id": 233991, "pid": 76337, "tid": -914061504, "ts": 1716454225150017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225193555, "dur": 353, "args": { "External id": 233992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233992, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 233992, "pid": 5, "tid": 7, "ts": 1716454225193555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150018, "dur": 8, "args": { "External id": 233992, "cbid": 211, "correlation": 233992 } }, { "ph": "s", "id": 233992, "pid": 76337, "tid": -914061504, "ts": 1716454225150018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225193909, "dur": 21, "args": { "External id": 233993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233993, "pid": 5, "tid": 7, "ts": 1716454225193909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150029, "dur": 5, "args": { "External id": 233993, "cbid": 211, "correlation": 233993 } }, { "ph": "s", "id": 233993, "pid": 76337, "tid": -914061504, "ts": 1716454225150029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225193932, "dur": 32, "args": { "External id": 233999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 233999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 233999, "pid": 5, "tid": 7, "ts": 1716454225193932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150057, "dur": 8, "args": { "External id": 233999, "cbid": 211, "correlation": 233999 } }, { "ph": "s", "id": 233999, "pid": 76337, "tid": -914061504, "ts": 1716454225150057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225193966, "dur": 4, "args": { "External id": 234007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234007, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 234007, "pid": 5, "tid": 7, "ts": 1716454225193966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150100, "dur": 9, "args": { "External id": 234007, "cbid": 211, "correlation": 234007 } }, { "ph": "s", "id": 234007, "pid": 76337, "tid": -914061504, "ts": 1716454225150100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150165, "dur": 2, "args": { "External id": 234023, "cbid": 251, "correlation": 234023 } }, { "ph": "f", "id": 234023, "pid": 76337, "tid": -914061504, "ts": 1716454225150165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150172, "dur": 0, "args": { "External id": 234025, "cbid": 251, "correlation": 234025 } }, { "ph": "f", "id": 234025, "pid": 76337, "tid": -914061504, "ts": 1716454225150172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225193971, "dur": 13, "args": { "External id": 234026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234026, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 234026, "pid": 5, "tid": 7, "ts": 1716454225193971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150174, "dur": 11, "args": { "External id": 234026, "cbid": 211, "correlation": 234026 } }, { "ph": "s", "id": 234026, "pid": 76337, "tid": -914061504, "ts": 1716454225150174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225193984, "dur": 5, "args": { "External id": 234028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234028, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 234028, "pid": 5, "tid": 7, "ts": 1716454225193984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150187, "dur": 5, "args": { "External id": 234028, "cbid": 211, "correlation": 234028 } }, { "ph": "s", "id": 234028, "pid": 76337, "tid": -914061504, "ts": 1716454225150187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225193991, "dur": 29, "args": { "External id": 234038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234038, "pid": 5, "tid": 7, "ts": 1716454225193991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150245, "dur": 12, "args": { "External id": 234038, "cbid": 211, "correlation": 234038 } }, { "ph": "s", "id": 234038, "pid": 76337, "tid": -914061504, "ts": 1716454225150245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225194021, "dur": 30, "args": { "External id": 234058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234058, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 234058, "pid": 5, "tid": 7, "ts": 1716454225194021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150311, "dur": 11, "args": { "External id": 234058, "cbid": 211, "correlation": 234058 } }, { "ph": "s", "id": 234058, "pid": 76337, "tid": -914061504, "ts": 1716454225150311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225194052, "dur": 4, "args": { "External id": 234070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234070, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 234070, "pid": 5, "tid": 7, "ts": 1716454225194052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150332, "dur": 6, "args": { "External id": 234070, "cbid": 211, "correlation": 234070 } }, { "ph": "s", "id": 234070, "pid": 76337, "tid": -914061504, "ts": 1716454225150332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225194057, "dur": 29, "args": { "External id": 234073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234073, "pid": 5, "tid": 7, "ts": 1716454225194057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150351, "dur": 6, "args": { "External id": 234073, "cbid": 211, "correlation": 234073 } }, { "ph": "s", "id": 234073, "pid": 76337, "tid": -914061504, "ts": 1716454225150351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225194087, "dur": 20, "args": { "External id": 234082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234082, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234082, "pid": 5, "tid": 7, "ts": 1716454225194087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150392, "dur": 10, "args": { "External id": 234082, "cbid": 211, "correlation": 234082 } }, { "ph": "s", "id": 234082, "pid": 76337, "tid": -914061504, "ts": 1716454225150392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225150456, "dur": 0, "args": { "External id": 234092, "cbid": 317, "correlation": 234092 } }, { "ph": "f", "id": 234092, "pid": 76337, "tid": -914061504, "ts": 1716454225150456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225150457, "dur": 0, "args": { "External id": 234093, "cbid": 203, "correlation": 234093 } }, { "ph": "f", "id": 234093, "pid": 76337, "tid": -914061504, "ts": 1716454225150457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225150457, "dur": 0, "args": { "External id": 234094, "cbid": 205, "correlation": 234094 } }, { "ph": "f", "id": 234094, "pid": 76337, "tid": -914061504, "ts": 1716454225150457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225194108, "dur": 23, "args": { "External id": 234098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234098, "pid": 5, "tid": 7, "ts": 1716454225194108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150475, "dur": 12, "args": { "External id": 234098, "cbid": 211, "correlation": 234098 } }, { "ph": "s", "id": 234098, "pid": 76337, "tid": -914061504, "ts": 1716454225150475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225194132, "dur": 43, "args": { "External id": 234100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234100, "pid": 5, "tid": 7, "ts": 1716454225194132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150490, "dur": 5, "args": { "External id": 234100, "cbid": 211, "correlation": 234100 } }, { "ph": "s", "id": 234100, "pid": 76337, "tid": -914061504, "ts": 1716454225150490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225194176, "dur": 644, "args": { "External id": 234102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234102, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234102, "pid": 5, "tid": 7, "ts": 1716454225194176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150504, "dur": 11, "args": { "External id": 234102, "cbid": 211, "correlation": 234102 } }, { "ph": "s", "id": 234102, "pid": 76337, "tid": -914061504, "ts": 1716454225150504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225194822, "dur": 20, "args": { "External id": 234104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234104, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234104, "pid": 5, "tid": 7, "ts": 1716454225194822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150519, "dur": 5, "args": { "External id": 234104, "cbid": 211, "correlation": 234104 } }, { "ph": "s", "id": 234104, "pid": 76337, "tid": -914061504, "ts": 1716454225150519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225194844, "dur": 32, "args": { "External id": 234110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234110, "pid": 5, "tid": 7, "ts": 1716454225194844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150546, "dur": 8, "args": { "External id": 234110, "cbid": 211, "correlation": 234110 } }, { "ph": "s", "id": 234110, "pid": 76337, "tid": -914061504, "ts": 1716454225150546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225150604, "dur": 0, "args": { "External id": 234120, "cbid": 317, "correlation": 234120 } }, { "ph": "f", "id": 234120, "pid": 76337, "tid": -914061504, "ts": 1716454225150604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225150605, "dur": 0, "args": { "External id": 234121, "cbid": 203, "correlation": 234121 } }, { "ph": "f", "id": 234121, "pid": 76337, "tid": -914061504, "ts": 1716454225150605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225150605, "dur": 0, "args": { "External id": 234122, "cbid": 205, "correlation": 234122 } }, { "ph": "f", "id": 234122, "pid": 76337, "tid": -914061504, "ts": 1716454225150605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150627, "dur": 1, "args": { "External id": 234126, "cbid": 251, "correlation": 234126 } }, { "ph": "f", "id": 234126, "pid": 76337, "tid": -914061504, "ts": 1716454225150627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150629, "dur": 0, "args": { "External id": 234127, "cbid": 251, "correlation": 234127 } }, { "ph": "f", "id": 234127, "pid": 76337, "tid": -914061504, "ts": 1716454225150629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150630, "dur": 0, "args": { "External id": 234128, "cbid": 251, "correlation": 234128 } }, { "ph": "f", "id": 234128, "pid": 76337, "tid": -914061504, "ts": 1716454225150630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150630, "dur": 0, "args": { "External id": 234129, "cbid": 251, "correlation": 234129 } }, { "ph": "f", "id": 234129, "pid": 76337, "tid": -914061504, "ts": 1716454225150630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150631, "dur": 0, "args": { "External id": 234130, "cbid": 251, "correlation": 234130 } }, { "ph": "f", "id": 234130, "pid": 76337, "tid": -914061504, "ts": 1716454225150631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150632, "dur": 0, "args": { "External id": 234131, "cbid": 251, "correlation": 234131 } }, { "ph": "f", "id": 234131, "pid": 76337, "tid": -914061504, "ts": 1716454225150632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150633, "dur": 0, "args": { "External id": 234132, "cbid": 251, "correlation": 234132 } }, { "ph": "f", "id": 234132, "pid": 76337, "tid": -914061504, "ts": 1716454225150633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150633, "dur": 0, "args": { "External id": 234133, "cbid": 251, "correlation": 234133 } }, { "ph": "f", "id": 234133, "pid": 76337, "tid": -914061504, "ts": 1716454225150633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225150635, "dur": 0, "args": { "External id": 234134, "cbid": 251, "correlation": 234134 } }, { "ph": "f", "id": 234134, "pid": 76337, "tid": -914061504, "ts": 1716454225150635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225194877, "dur": 50, "args": { "External id": 234135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234135, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 234135, "pid": 5, "tid": 7, "ts": 1716454225194877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150637, "dur": 12, "args": { "External id": 234135, "cbid": 211, "correlation": 234135 } }, { "ph": "s", "id": 234135, "pid": 76337, "tid": -914061504, "ts": 1716454225150637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225194928, "dur": 31, "args": { "External id": 234141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234141, "pid": 5, "tid": 7, "ts": 1716454225194928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150670, "dur": 9, "args": { "External id": 234141, "cbid": 211, "correlation": 234141 } }, { "ph": "s", "id": 234141, "pid": 76337, "tid": -914061504, "ts": 1716454225150670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225194961, "dur": 27, "args": { "External id": 234149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234149, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234149, "pid": 5, "tid": 7, "ts": 1716454225194961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150700, "dur": 8, "args": { "External id": 234149, "cbid": 211, "correlation": 234149 } }, { "ph": "s", "id": 234149, "pid": 76337, "tid": -914061504, "ts": 1716454225150700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225194989, "dur": 20, "args": { "External id": 234157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234157, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234157, "pid": 5, "tid": 7, "ts": 1716454225194989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150729, "dur": 8, "args": { "External id": 234157, "cbid": 211, "correlation": 234157 } }, { "ph": "s", "id": 234157, "pid": 76337, "tid": -914061504, "ts": 1716454225150729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225195011, "dur": 29, "args": { "External id": 234177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234177, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 234177, "pid": 5, "tid": 7, "ts": 1716454225195011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150811, "dur": 13, "args": { "External id": 234177, "cbid": 211, "correlation": 234177 } }, { "ph": "s", "id": 234177, "pid": 76337, "tid": -914061504, "ts": 1716454225150811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225195041, "dur": 4, "args": { "External id": 234189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234189, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 234189, "pid": 5, "tid": 7, "ts": 1716454225195041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150833, "dur": 6, "args": { "External id": 234189, "cbid": 211, "correlation": 234189 } }, { "ph": "s", "id": 234189, "pid": 76337, "tid": -914061504, "ts": 1716454225150833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225195046, "dur": 30, "args": { "External id": 234192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234192, "pid": 5, "tid": 7, "ts": 1716454225195046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150851, "dur": 6, "args": { "External id": 234192, "cbid": 211, "correlation": 234192 } }, { "ph": "s", "id": 234192, "pid": 76337, "tid": -914061504, "ts": 1716454225150851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225150908, "dur": 0, "args": { "External id": 234203, "cbid": 317, "correlation": 234203 } }, { "ph": "f", "id": 234203, "pid": 76337, "tid": -914061504, "ts": 1716454225150908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225150909, "dur": 0, "args": { "External id": 234204, "cbid": 203, "correlation": 234204 } }, { "ph": "f", "id": 234204, "pid": 76337, "tid": -914061504, "ts": 1716454225150909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225150910, "dur": 0, "args": { "External id": 234205, "cbid": 205, "correlation": 234205 } }, { "ph": "f", "id": 234205, "pid": 76337, "tid": -914061504, "ts": 1716454225150910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225195077, "dur": 21, "args": { "External id": 234209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234209, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234209, "pid": 5, "tid": 7, "ts": 1716454225195077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150925, "dur": 11, "args": { "External id": 234209, "cbid": 211, "correlation": 234209 } }, { "ph": "s", "id": 234209, "pid": 76337, "tid": -914061504, "ts": 1716454225150925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225195100, "dur": 118, "args": { "External id": 234211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234211, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234211, "pid": 5, "tid": 7, "ts": 1716454225195100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150944, "dur": 8, "args": { "External id": 234211, "cbid": 211, "correlation": 234211 } }, { "ph": "s", "id": 234211, "pid": 76337, "tid": -914061504, "ts": 1716454225150944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225195219, "dur": 23, "args": { "External id": 234213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234213, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234213, "pid": 5, "tid": 7, "ts": 1716454225195219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150956, "dur": 5, "args": { "External id": 234213, "cbid": 211, "correlation": 234213 } }, { "ph": "s", "id": 234213, "pid": 76337, "tid": -914061504, "ts": 1716454225150956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225195243, "dur": 32, "args": { "External id": 234219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234219, "pid": 5, "tid": 7, "ts": 1716454225195243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225150994, "dur": 9, "args": { "External id": 234219, "cbid": 211, "correlation": 234219 } }, { "ph": "s", "id": 234219, "pid": 76337, "tid": -914061504, "ts": 1716454225150994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225195277, "dur": 160, "args": { "External id": 234228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234228, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234228, "pid": 5, "tid": 7, "ts": 1716454225195277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151078, "dur": 14, "args": { "External id": 234228, "cbid": 211, "correlation": 234228 } }, { "ph": "s", "id": 234228, "pid": 76337, "tid": -914061504, "ts": 1716454225151078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225195439, "dur": 64, "args": { "External id": 234250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234250, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234250, "pid": 5, "tid": 7, "ts": 1716454225195439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151136, "dur": 11, "args": { "External id": 234250, "cbid": 211, "correlation": 234250 } }, { "ph": "s", "id": 234250, "pid": 76337, "tid": -914061504, "ts": 1716454225151136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151227, "dur": 1, "args": { "External id": 234261, "cbid": 251, "correlation": 234261 } }, { "ph": "f", "id": 234261, "pid": 76337, "tid": -914061504, "ts": 1716454225151227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225195504, "dur": 148, "args": { "External id": 234262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234262, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234262, "pid": 5, "tid": 7, "ts": 1716454225195504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151232, "dur": 13, "args": { "External id": 234262, "cbid": 211, "correlation": 234262 } }, { "ph": "s", "id": 234262, "pid": 76337, "tid": -914061504, "ts": 1716454225151232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151303, "dur": 1, "args": { "External id": 234273, "cbid": 251, "correlation": 234273 } }, { "ph": "f", "id": 234273, "pid": 76337, "tid": -914061504, "ts": 1716454225151303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225195653, "dur": 144, "args": { "External id": 234274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234274, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234274, "pid": 5, "tid": 7, "ts": 1716454225195653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151307, "dur": 12, "args": { "External id": 234274, "cbid": 211, "correlation": 234274 } }, { "ph": "s", "id": 234274, "pid": 76337, "tid": -914061504, "ts": 1716454225151307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151372, "dur": 1, "args": { "External id": 234285, "cbid": 251, "correlation": 234285 } }, { "ph": "f", "id": 234285, "pid": 76337, "tid": -914061504, "ts": 1716454225151372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225195799, "dur": 144, "args": { "External id": 234286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234286, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234286, "pid": 5, "tid": 7, "ts": 1716454225195799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151376, "dur": 11, "args": { "External id": 234286, "cbid": 211, "correlation": 234286 } }, { "ph": "s", "id": 234286, "pid": 76337, "tid": -914061504, "ts": 1716454225151376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225195944, "dur": 1903, "args": { "External id": 234307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234307, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 234307, "pid": 5, "tid": 7, "ts": 1716454225195944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151458, "dur": 20, "args": { "External id": 234307, "cbid": 211, "correlation": 234307 } }, { "ph": "s", "id": 234307, "pid": 76337, "tid": -914061504, "ts": 1716454225151458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151568, "dur": 1, "args": { "External id": 234325, "cbid": 251, "correlation": 234325 } }, { "ph": "f", "id": 234325, "pid": 76337, "tid": -914061504, "ts": 1716454225151568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225197849, "dur": 144, "args": { "External id": 234327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234327, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 234327, "pid": 5, "tid": 7, "ts": 1716454225197849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151574, "dur": 13, "args": { "External id": 234327, "cbid": 211, "correlation": 234327 } }, { "ph": "s", "id": 234327, "pid": 76337, "tid": -914061504, "ts": 1716454225151574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225197994, "dur": 36, "args": { "External id": 234335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234335, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234335, "pid": 5, "tid": 7, "ts": 1716454225197994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151645, "dur": 12, "args": { "External id": 234335, "cbid": 211, "correlation": 234335 } }, { "ph": "s", "id": 234335, "pid": 76337, "tid": -914061504, "ts": 1716454225151645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225198031, "dur": 51, "args": { "External id": 234343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234343, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234343, "pid": 5, "tid": 7, "ts": 1716454225198031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151684, "dur": 9, "args": { "External id": 234343, "cbid": 211, "correlation": 234343 } }, { "ph": "s", "id": 234343, "pid": 76337, "tid": -914061504, "ts": 1716454225151684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225198083, "dur": 30, "args": { "External id": 234354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234354, "pid": 5, "tid": 7, "ts": 1716454225198083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151757, "dur": 13, "args": { "External id": 234354, "cbid": 211, "correlation": 234354 } }, { "ph": "s", "id": 234354, "pid": 76337, "tid": -914061504, "ts": 1716454225151757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225198114, "dur": 33, "args": { "External id": 234376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234376, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234376, "pid": 5, "tid": 7, "ts": 1716454225198114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151790, "dur": 7, "args": { "External id": 234376, "cbid": 211, "correlation": 234376 } }, { "ph": "s", "id": 234376, "pid": 76337, "tid": -914061504, "ts": 1716454225151790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151876, "dur": 1, "args": { "External id": 234387, "cbid": 251, "correlation": 234387 } }, { "ph": "f", "id": 234387, "pid": 76337, "tid": -914061504, "ts": 1716454225151876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225198149, "dur": 89, "args": { "External id": 234388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234388, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234388, "pid": 5, "tid": 7, "ts": 1716454225198149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151882, "dur": 13, "args": { "External id": 234388, "cbid": 211, "correlation": 234388 } }, { "ph": "s", "id": 234388, "pid": 76337, "tid": -914061504, "ts": 1716454225151882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151952, "dur": 1, "args": { "External id": 234399, "cbid": 251, "correlation": 234399 } }, { "ph": "f", "id": 234399, "pid": 76337, "tid": -914061504, "ts": 1716454225151952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225151956, "dur": 0, "args": { "External id": 234400, "cbid": 251, "correlation": 234400 } }, { "ph": "f", "id": 234400, "pid": 76337, "tid": -914061504, "ts": 1716454225151956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225198240, "dur": 11, "args": { "External id": 234401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234401, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 234401, "pid": 5, "tid": 7, "ts": 1716454225198240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151957, "dur": 12, "args": { "External id": 234401, "cbid": 211, "correlation": 234401 } }, { "ph": "s", "id": 234401, "pid": 76337, "tid": -914061504, "ts": 1716454225151957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225198252, "dur": 5, "args": { "External id": 234403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234403, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 234403, "pid": 5, "tid": 7, "ts": 1716454225198252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225151981, "dur": 7, "args": { "External id": 234403, "cbid": 211, "correlation": 234403 } }, { "ph": "s", "id": 234403, "pid": 76337, "tid": -914061504, "ts": 1716454225151981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152042, "dur": 1, "args": { "External id": 234414, "cbid": 251, "correlation": 234414 } }, { "ph": "f", "id": 234414, "pid": 76337, "tid": -914061504, "ts": 1716454225152042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152045, "dur": 0, "args": { "External id": 234415, "cbid": 251, "correlation": 234415 } }, { "ph": "f", "id": 234415, "pid": 76337, "tid": -914061504, "ts": 1716454225152045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225198258, "dur": 7, "args": { "External id": 234416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234416, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 234416, "pid": 5, "tid": 7, "ts": 1716454225198258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152046, "dur": 12, "args": { "External id": 234416, "cbid": 211, "correlation": 234416 } }, { "ph": "s", "id": 234416, "pid": 76337, "tid": -914061504, "ts": 1716454225152046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225198266, "dur": 3, "args": { "External id": 234418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 234418, "pid": 5, "tid": 7, "ts": 1716454225198266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152060, "dur": 6, "args": { "External id": 234418, "cbid": 211, "correlation": 234418 } }, { "ph": "s", "id": 234418, "pid": 76337, "tid": -914061504, "ts": 1716454225152060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225198271, "dur": 90, "args": { "External id": 234439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234439, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 234439, "pid": 5, "tid": 7, "ts": 1716454225198271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152135, "dur": 13, "args": { "External id": 234439, "cbid": 211, "correlation": 234439 } }, { "ph": "s", "id": 234439, "pid": 76337, "tid": -914061504, "ts": 1716454225152135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152233, "dur": 1, "args": { "External id": 234457, "cbid": 251, "correlation": 234457 } }, { "ph": "f", "id": 234457, "pid": 76337, "tid": -914061504, "ts": 1716454225152233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225198362, "dur": 97, "args": { "External id": 234459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234459, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234459, "pid": 5, "tid": 7, "ts": 1716454225198362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152238, "dur": 13, "args": { "External id": 234459, "cbid": 211, "correlation": 234459 } }, { "ph": "s", "id": 234459, "pid": 76337, "tid": -914061504, "ts": 1716454225152238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225198460, "dur": 19, "args": { "External id": 234467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234467, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234467, "pid": 5, "tid": 7, "ts": 1716454225198460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152308, "dur": 12, "args": { "External id": 234467, "cbid": 211, "correlation": 234467 } }, { "ph": "s", "id": 234467, "pid": 76337, "tid": -914061504, "ts": 1716454225152308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225198480, "dur": 37, "args": { "External id": 234475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234475, "pid": 5, "tid": 7, "ts": 1716454225198480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152350, "dur": 9, "args": { "External id": 234475, "cbid": 211, "correlation": 234475 } }, { "ph": "s", "id": 234475, "pid": 76337, "tid": -914061504, "ts": 1716454225152350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225198519, "dur": 33, "args": { "External id": 234497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234497, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234497, "pid": 5, "tid": 7, "ts": 1716454225198519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152401, "dur": 10, "args": { "External id": 234497, "cbid": 211, "correlation": 234497 } }, { "ph": "s", "id": 234497, "pid": 76337, "tid": -914061504, "ts": 1716454225152401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152490, "dur": 1, "args": { "External id": 234513, "cbid": 251, "correlation": 234513 } }, { "ph": "f", "id": 234513, "pid": 76337, "tid": -914061504, "ts": 1716454225152490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152495, "dur": 0, "args": { "External id": 234515, "cbid": 251, "correlation": 234515 } }, { "ph": "f", "id": 234515, "pid": 76337, "tid": -914061504, "ts": 1716454225152495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225198554, "dur": 532, "args": { "External id": 234516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234516, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 234516, "pid": 5, "tid": 7, "ts": 1716454225198554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152498, "dur": 13, "args": { "External id": 234516, "cbid": 211, "correlation": 234516 } }, { "ph": "s", "id": 234516, "pid": 76337, "tid": -914061504, "ts": 1716454225152498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225199087, "dur": 125, "args": { "External id": 234524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234524, "pid": 5, "tid": 7, "ts": 1716454225199087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152564, "dur": 12, "args": { "External id": 234524, "cbid": 211, "correlation": 234524 } }, { "ph": "s", "id": 234524, "pid": 76337, "tid": -914061504, "ts": 1716454225152564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225199214, "dur": 128, "args": { "External id": 234532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234532, "pid": 5, "tid": 7, "ts": 1716454225199214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152595, "dur": 8, "args": { "External id": 234532, "cbid": 211, "correlation": 234532 } }, { "ph": "s", "id": 234532, "pid": 76337, "tid": -914061504, "ts": 1716454225152595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225152671, "dur": 1, "args": { "External id": 234548, "cbid": 251, "correlation": 234548 } }, { "ph": "f", "id": 234548, "pid": 76337, "tid": -914061504, "ts": 1716454225152671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225199343, "dur": 301, "args": { "External id": 234550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234550, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234550, "pid": 5, "tid": 7, "ts": 1716454225199343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152677, "dur": 13, "args": { "External id": 234550, "cbid": 211, "correlation": 234550 } }, { "ph": "s", "id": 234550, "pid": 76337, "tid": -914061504, "ts": 1716454225152677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225199646, "dur": 27, "args": { "External id": 234558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234558, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234558, "pid": 5, "tid": 7, "ts": 1716454225199646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152719, "dur": 10, "args": { "External id": 234558, "cbid": 211, "correlation": 234558 } }, { "ph": "s", "id": 234558, "pid": 76337, "tid": -914061504, "ts": 1716454225152719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225199674, "dur": 80, "args": { "External id": 234569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234569, "pid": 5, "tid": 7, "ts": 1716454225199674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152789, "dur": 12, "args": { "External id": 234569, "cbid": 211, "correlation": 234569 } }, { "ph": "s", "id": 234569, "pid": 76337, "tid": -914061504, "ts": 1716454225152789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225152853, "dur": 0, "args": { "External id": 234581, "cbid": 317, "correlation": 234581 } }, { "ph": "f", "id": 234581, "pid": 76337, "tid": -914061504, "ts": 1716454225152853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225152854, "dur": 0, "args": { "External id": 234582, "cbid": 203, "correlation": 234582 } }, { "ph": "f", "id": 234582, "pid": 76337, "tid": -914061504, "ts": 1716454225152854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225152855, "dur": 0, "args": { "External id": 234583, "cbid": 205, "correlation": 234583 } }, { "ph": "f", "id": 234583, "pid": 76337, "tid": -914061504, "ts": 1716454225152855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225199755, "dur": 22, "args": { "External id": 234587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234587, "pid": 5, "tid": 7, "ts": 1716454225199755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152871, "dur": 12, "args": { "External id": 234587, "cbid": 211, "correlation": 234587 } }, { "ph": "s", "id": 234587, "pid": 76337, "tid": -914061504, "ts": 1716454225152871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225199779, "dur": 117, "args": { "External id": 234589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234589, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234589, "pid": 5, "tid": 7, "ts": 1716454225199779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152890, "dur": 6, "args": { "External id": 234589, "cbid": 211, "correlation": 234589 } }, { "ph": "s", "id": 234589, "pid": 76337, "tid": -914061504, "ts": 1716454225152890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225199898, "dur": 23, "args": { "External id": 234591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234591, "pid": 5, "tid": 7, "ts": 1716454225199898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152901, "dur": 5, "args": { "External id": 234591, "cbid": 211, "correlation": 234591 } }, { "ph": "s", "id": 234591, "pid": 76337, "tid": -914061504, "ts": 1716454225152901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225199922, "dur": 33, "args": { "External id": 234597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234597, "pid": 5, "tid": 7, "ts": 1716454225199922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152928, "dur": 10, "args": { "External id": 234597, "cbid": 211, "correlation": 234597 } }, { "ph": "s", "id": 234597, "pid": 76337, "tid": -914061504, "ts": 1716454225152928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225199956, "dur": 26, "args": { "External id": 234605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234605, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234605, "pid": 5, "tid": 7, "ts": 1716454225199956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225152960, "dur": 8, "args": { "External id": 234605, "cbid": 211, "correlation": 234605 } }, { "ph": "s", "id": 234605, "pid": 76337, "tid": -914061504, "ts": 1716454225152960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225199984, "dur": 29, "args": { "External id": 234625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234625, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 234625, "pid": 5, "tid": 7, "ts": 1716454225199984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153045, "dur": 12, "args": { "External id": 234625, "cbid": 211, "correlation": 234625 } }, { "ph": "s", "id": 234625, "pid": 76337, "tid": -914061504, "ts": 1716454225153045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225200015, "dur": 5, "args": { "External id": 234637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234637, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 234637, "pid": 5, "tid": 7, "ts": 1716454225200015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153067, "dur": 7, "args": { "External id": 234637, "cbid": 211, "correlation": 234637 } }, { "ph": "s", "id": 234637, "pid": 76337, "tid": -914061504, "ts": 1716454225153067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225200021, "dur": 30, "args": { "External id": 234640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234640, "pid": 5, "tid": 7, "ts": 1716454225200021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153087, "dur": 6, "args": { "External id": 234640, "cbid": 211, "correlation": 234640 } }, { "ph": "s", "id": 234640, "pid": 76337, "tid": -914061504, "ts": 1716454225153087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225200053, "dur": 20, "args": { "External id": 234649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234649, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234649, "pid": 5, "tid": 7, "ts": 1716454225200053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153125, "dur": 10, "args": { "External id": 234649, "cbid": 211, "correlation": 234649 } }, { "ph": "s", "id": 234649, "pid": 76337, "tid": -914061504, "ts": 1716454225153125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225153175, "dur": 0, "args": { "External id": 234659, "cbid": 317, "correlation": 234659 } }, { "ph": "f", "id": 234659, "pid": 76337, "tid": -914061504, "ts": 1716454225153175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225153176, "dur": 0, "args": { "External id": 234660, "cbid": 203, "correlation": 234660 } }, { "ph": "f", "id": 234660, "pid": 76337, "tid": -914061504, "ts": 1716454225153176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225153177, "dur": 0, "args": { "External id": 234661, "cbid": 205, "correlation": 234661 } }, { "ph": "f", "id": 234661, "pid": 76337, "tid": -914061504, "ts": 1716454225153177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225200074, "dur": 21, "args": { "External id": 234665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234665, "pid": 5, "tid": 7, "ts": 1716454225200074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153192, "dur": 11, "args": { "External id": 234665, "cbid": 211, "correlation": 234665 } }, { "ph": "s", "id": 234665, "pid": 76337, "tid": -914061504, "ts": 1716454225153192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225200097, "dur": 43, "args": { "External id": 234667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234667, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234667, "pid": 5, "tid": 7, "ts": 1716454225200097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153206, "dur": 5, "args": { "External id": 234667, "cbid": 211, "correlation": 234667 } }, { "ph": "s", "id": 234667, "pid": 76337, "tid": -914061504, "ts": 1716454225153206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225200141, "dur": 643, "args": { "External id": 234669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234669, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234669, "pid": 5, "tid": 7, "ts": 1716454225200141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153217, "dur": 6, "args": { "External id": 234669, "cbid": 211, "correlation": 234669 } }, { "ph": "s", "id": 234669, "pid": 76337, "tid": -914061504, "ts": 1716454225153217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225200786, "dur": 22, "args": { "External id": 234671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234671, "pid": 5, "tid": 7, "ts": 1716454225200786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153227, "dur": 6, "args": { "External id": 234671, "cbid": 211, "correlation": 234671 } }, { "ph": "s", "id": 234671, "pid": 76337, "tid": -914061504, "ts": 1716454225153227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225200809, "dur": 33, "args": { "External id": 234677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234677, "pid": 5, "tid": 7, "ts": 1716454225200809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153255, "dur": 9, "args": { "External id": 234677, "cbid": 211, "correlation": 234677 } }, { "ph": "s", "id": 234677, "pid": 76337, "tid": -914061504, "ts": 1716454225153255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225200843, "dur": 3, "args": { "External id": 234685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234685, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 234685, "pid": 5, "tid": 7, "ts": 1716454225200843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153299, "dur": 9, "args": { "External id": 234685, "cbid": 211, "correlation": 234685 } }, { "ph": "s", "id": 234685, "pid": 76337, "tid": -914061504, "ts": 1716454225153299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225153363, "dur": 1, "args": { "External id": 234701, "cbid": 251, "correlation": 234701 } }, { "ph": "f", "id": 234701, "pid": 76337, "tid": -914061504, "ts": 1716454225153363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225153368, "dur": 0, "args": { "External id": 234703, "cbid": 251, "correlation": 234703 } }, { "ph": "f", "id": 234703, "pid": 76337, "tid": -914061504, "ts": 1716454225153368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225200848, "dur": 12, "args": { "External id": 234704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234704, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 234704, "pid": 5, "tid": 7, "ts": 1716454225200848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153370, "dur": 12, "args": { "External id": 234704, "cbid": 211, "correlation": 234704 } }, { "ph": "s", "id": 234704, "pid": 76337, "tid": -914061504, "ts": 1716454225153370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225200861, "dur": 5, "args": { "External id": 234706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 234706, "pid": 5, "tid": 7, "ts": 1716454225200861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153383, "dur": 6, "args": { "External id": 234706, "cbid": 211, "correlation": 234706 } }, { "ph": "s", "id": 234706, "pid": 76337, "tid": -914061504, "ts": 1716454225153383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225200867, "dur": 30, "args": { "External id": 234716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234716, "pid": 5, "tid": 7, "ts": 1716454225200867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153442, "dur": 11, "args": { "External id": 234716, "cbid": 211, "correlation": 234716 } }, { "ph": "s", "id": 234716, "pid": 76337, "tid": -914061504, "ts": 1716454225153442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225200898, "dur": 30, "args": { "External id": 234736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234736, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 234736, "pid": 5, "tid": 7, "ts": 1716454225200898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153506, "dur": 10, "args": { "External id": 234736, "cbid": 211, "correlation": 234736 } }, { "ph": "s", "id": 234736, "pid": 76337, "tid": -914061504, "ts": 1716454225153506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225200929, "dur": 4, "args": { "External id": 234748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234748, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 234748, "pid": 5, "tid": 7, "ts": 1716454225200929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153526, "dur": 6, "args": { "External id": 234748, "cbid": 211, "correlation": 234748 } }, { "ph": "s", "id": 234748, "pid": 76337, "tid": -914061504, "ts": 1716454225153526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225200934, "dur": 29, "args": { "External id": 234751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234751, "pid": 5, "tid": 7, "ts": 1716454225200934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153546, "dur": 7, "args": { "External id": 234751, "cbid": 211, "correlation": 234751 } }, { "ph": "s", "id": 234751, "pid": 76337, "tid": -914061504, "ts": 1716454225153546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225200964, "dur": 20, "args": { "External id": 234760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234760, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234760, "pid": 5, "tid": 7, "ts": 1716454225200964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153587, "dur": 10, "args": { "External id": 234760, "cbid": 211, "correlation": 234760 } }, { "ph": "s", "id": 234760, "pid": 76337, "tid": -914061504, "ts": 1716454225153587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225153648, "dur": 0, "args": { "External id": 234770, "cbid": 317, "correlation": 234770 } }, { "ph": "f", "id": 234770, "pid": 76337, "tid": -914061504, "ts": 1716454225153648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225153649, "dur": 0, "args": { "External id": 234771, "cbid": 203, "correlation": 234771 } }, { "ph": "f", "id": 234771, "pid": 76337, "tid": -914061504, "ts": 1716454225153649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225153650, "dur": 0, "args": { "External id": 234772, "cbid": 205, "correlation": 234772 } }, { "ph": "f", "id": 234772, "pid": 76337, "tid": -914061504, "ts": 1716454225153650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225200986, "dur": 23, "args": { "External id": 234776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234776, "pid": 5, "tid": 7, "ts": 1716454225200986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153663, "dur": 12, "args": { "External id": 234776, "cbid": 211, "correlation": 234776 } }, { "ph": "s", "id": 234776, "pid": 76337, "tid": -914061504, "ts": 1716454225153663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225201010, "dur": 43, "args": { "External id": 234778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234778, "pid": 5, "tid": 7, "ts": 1716454225201010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153677, "dur": 6, "args": { "External id": 234778, "cbid": 211, "correlation": 234778 } }, { "ph": "s", "id": 234778, "pid": 76337, "tid": -914061504, "ts": 1716454225153677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225201054, "dur": 636, "args": { "External id": 234780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234780, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234780, "pid": 5, "tid": 7, "ts": 1716454225201054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153689, "dur": 6, "args": { "External id": 234780, "cbid": 211, "correlation": 234780 } }, { "ph": "s", "id": 234780, "pid": 76337, "tid": -914061504, "ts": 1716454225153689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225201692, "dur": 20, "args": { "External id": 234782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234782, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234782, "pid": 5, "tid": 7, "ts": 1716454225201692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153698, "dur": 5, "args": { "External id": 234782, "cbid": 211, "correlation": 234782 } }, { "ph": "s", "id": 234782, "pid": 76337, "tid": -914061504, "ts": 1716454225153698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225201713, "dur": 32, "args": { "External id": 234788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234788, "pid": 5, "tid": 7, "ts": 1716454225201713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153725, "dur": 9, "args": { "External id": 234788, "cbid": 211, "correlation": 234788 } }, { "ph": "s", "id": 234788, "pid": 76337, "tid": -914061504, "ts": 1716454225153725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225201747, "dur": 27, "args": { "External id": 234796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234796, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234796, "pid": 5, "tid": 7, "ts": 1716454225201747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153759, "dur": 8, "args": { "External id": 234796, "cbid": 211, "correlation": 234796 } }, { "ph": "s", "id": 234796, "pid": 76337, "tid": -914061504, "ts": 1716454225153759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225201775, "dur": 20, "args": { "External id": 234804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234804, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234804, "pid": 5, "tid": 7, "ts": 1716454225201775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153789, "dur": 8, "args": { "External id": 234804, "cbid": 211, "correlation": 234804 } }, { "ph": "s", "id": 234804, "pid": 76337, "tid": -914061504, "ts": 1716454225153789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225201797, "dur": 30, "args": { "External id": 234824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234824, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 234824, "pid": 5, "tid": 7, "ts": 1716454225201797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153866, "dur": 12, "args": { "External id": 234824, "cbid": 211, "correlation": 234824 } }, { "ph": "s", "id": 234824, "pid": 76337, "tid": -914061504, "ts": 1716454225153866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225201828, "dur": 4, "args": { "External id": 234836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234836, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 234836, "pid": 5, "tid": 7, "ts": 1716454225201828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153888, "dur": 7, "args": { "External id": 234836, "cbid": 211, "correlation": 234836 } }, { "ph": "s", "id": 234836, "pid": 76337, "tid": -914061504, "ts": 1716454225153888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225201833, "dur": 30, "args": { "External id": 234839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234839, "pid": 5, "tid": 7, "ts": 1716454225201833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153907, "dur": 6, "args": { "External id": 234839, "cbid": 211, "correlation": 234839 } }, { "ph": "s", "id": 234839, "pid": 76337, "tid": -914061504, "ts": 1716454225153907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225153963, "dur": 0, "args": { "External id": 234850, "cbid": 317, "correlation": 234850 } }, { "ph": "f", "id": 234850, "pid": 76337, "tid": -914061504, "ts": 1716454225153963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225153964, "dur": 0, "args": { "External id": 234851, "cbid": 203, "correlation": 234851 } }, { "ph": "f", "id": 234851, "pid": 76337, "tid": -914061504, "ts": 1716454225153964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225153965, "dur": 0, "args": { "External id": 234852, "cbid": 205, "correlation": 234852 } }, { "ph": "f", "id": 234852, "pid": 76337, "tid": -914061504, "ts": 1716454225153965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225201864, "dur": 22, "args": { "External id": 234856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234856, "pid": 5, "tid": 7, "ts": 1716454225201864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225153986, "dur": 13, "args": { "External id": 234856, "cbid": 211, "correlation": 234856 } }, { "ph": "s", "id": 234856, "pid": 76337, "tid": -914061504, "ts": 1716454225153986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225201887, "dur": 114, "args": { "External id": 234858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234858, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234858, "pid": 5, "tid": 7, "ts": 1716454225201887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154006, "dur": 6, "args": { "External id": 234858, "cbid": 211, "correlation": 234858 } }, { "ph": "s", "id": 234858, "pid": 76337, "tid": -914061504, "ts": 1716454225154006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225202003, "dur": 21, "args": { "External id": 234860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234860, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234860, "pid": 5, "tid": 7, "ts": 1716454225202003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154015, "dur": 5, "args": { "External id": 234860, "cbid": 211, "correlation": 234860 } }, { "ph": "s", "id": 234860, "pid": 76337, "tid": -914061504, "ts": 1716454225154015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225202025, "dur": 32, "args": { "External id": 234866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234866, "pid": 5, "tid": 7, "ts": 1716454225202025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154044, "dur": 9, "args": { "External id": 234866, "cbid": 211, "correlation": 234866 } }, { "ph": "s", "id": 234866, "pid": 76337, "tid": -914061504, "ts": 1716454225154044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225202058, "dur": 174, "args": { "External id": 234875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234875, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234875, "pid": 5, "tid": 7, "ts": 1716454225202058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154126, "dur": 14, "args": { "External id": 234875, "cbid": 211, "correlation": 234875 } }, { "ph": "s", "id": 234875, "pid": 76337, "tid": -914061504, "ts": 1716454225154126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225202233, "dur": 63, "args": { "External id": 234897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234897, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234897, "pid": 5, "tid": 7, "ts": 1716454225202233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154182, "dur": 10, "args": { "External id": 234897, "cbid": 211, "correlation": 234897 } }, { "ph": "s", "id": 234897, "pid": 76337, "tid": -914061504, "ts": 1716454225154182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154270, "dur": 1, "args": { "External id": 234908, "cbid": 251, "correlation": 234908 } }, { "ph": "f", "id": 234908, "pid": 76337, "tid": -914061504, "ts": 1716454225154270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225202298, "dur": 149, "args": { "External id": 234909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234909, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234909, "pid": 5, "tid": 7, "ts": 1716454225202298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154275, "dur": 13, "args": { "External id": 234909, "cbid": 211, "correlation": 234909 } }, { "ph": "s", "id": 234909, "pid": 76337, "tid": -914061504, "ts": 1716454225154275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154346, "dur": 1, "args": { "External id": 234920, "cbid": 251, "correlation": 234920 } }, { "ph": "f", "id": 234920, "pid": 76337, "tid": -914061504, "ts": 1716454225154346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225202448, "dur": 143, "args": { "External id": 234921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234921, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234921, "pid": 5, "tid": 7, "ts": 1716454225202448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154350, "dur": 12, "args": { "External id": 234921, "cbid": 211, "correlation": 234921 } }, { "ph": "s", "id": 234921, "pid": 76337, "tid": -914061504, "ts": 1716454225154350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154415, "dur": 1, "args": { "External id": 234932, "cbid": 251, "correlation": 234932 } }, { "ph": "f", "id": 234932, "pid": 76337, "tid": -914061504, "ts": 1716454225154415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225202592, "dur": 146, "args": { "External id": 234933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234933, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 234933, "pid": 5, "tid": 7, "ts": 1716454225202592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154419, "dur": 11, "args": { "External id": 234933, "cbid": 211, "correlation": 234933 } }, { "ph": "s", "id": 234933, "pid": 76337, "tid": -914061504, "ts": 1716454225154419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225202739, "dur": 1910, "args": { "External id": 234954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234954, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 234954, "pid": 5, "tid": 7, "ts": 1716454225202739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154501, "dur": 12, "args": { "External id": 234954, "cbid": 211, "correlation": 234954 } }, { "ph": "s", "id": 234954, "pid": 76337, "tid": -914061504, "ts": 1716454225154501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154601, "dur": 1, "args": { "External id": 234972, "cbid": 251, "correlation": 234972 } }, { "ph": "f", "id": 234972, "pid": 76337, "tid": -914061504, "ts": 1716454225154601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225204650, "dur": 146, "args": { "External id": 234974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234974, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 234974, "pid": 5, "tid": 7, "ts": 1716454225204650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154607, "dur": 13, "args": { "External id": 234974, "cbid": 211, "correlation": 234974 } }, { "ph": "s", "id": 234974, "pid": 76337, "tid": -914061504, "ts": 1716454225154607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225204798, "dur": 35, "args": { "External id": 234982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234982, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234982, "pid": 5, "tid": 7, "ts": 1716454225204798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154677, "dur": 12, "args": { "External id": 234982, "cbid": 211, "correlation": 234982 } }, { "ph": "s", "id": 234982, "pid": 76337, "tid": -914061504, "ts": 1716454225154677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225204834, "dur": 50, "args": { "External id": 234990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 234990, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 234990, "pid": 5, "tid": 7, "ts": 1716454225204834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154716, "dur": 8, "args": { "External id": 234990, "cbid": 211, "correlation": 234990 } }, { "ph": "s", "id": 234990, "pid": 76337, "tid": -914061504, "ts": 1716454225154716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225204886, "dur": 30, "args": { "External id": 235001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235001, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235001, "pid": 5, "tid": 7, "ts": 1716454225204886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154787, "dur": 13, "args": { "External id": 235001, "cbid": 211, "correlation": 235001 } }, { "ph": "s", "id": 235001, "pid": 76337, "tid": -914061504, "ts": 1716454225154787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225204918, "dur": 34, "args": { "External id": 235023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235023, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235023, "pid": 5, "tid": 7, "ts": 1716454225204918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154818, "dur": 8, "args": { "External id": 235023, "cbid": 211, "correlation": 235023 } }, { "ph": "s", "id": 235023, "pid": 76337, "tid": -914061504, "ts": 1716454225154818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154903, "dur": 1, "args": { "External id": 235034, "cbid": 251, "correlation": 235034 } }, { "ph": "f", "id": 235034, "pid": 76337, "tid": -914061504, "ts": 1716454225154903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225204953, "dur": 91, "args": { "External id": 235035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235035, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235035, "pid": 5, "tid": 7, "ts": 1716454225204953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154908, "dur": 12, "args": { "External id": 235035, "cbid": 211, "correlation": 235035 } }, { "ph": "s", "id": 235035, "pid": 76337, "tid": -914061504, "ts": 1716454225154908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154984, "dur": 1, "args": { "External id": 235046, "cbid": 251, "correlation": 235046 } }, { "ph": "f", "id": 235046, "pid": 76337, "tid": -914061504, "ts": 1716454225154984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225154988, "dur": 0, "args": { "External id": 235047, "cbid": 251, "correlation": 235047 } }, { "ph": "f", "id": 235047, "pid": 76337, "tid": -914061504, "ts": 1716454225154988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225205045, "dur": 11, "args": { "External id": 235048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235048, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 235048, "pid": 5, "tid": 7, "ts": 1716454225205045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225154989, "dur": 13, "args": { "External id": 235048, "cbid": 211, "correlation": 235048 } }, { "ph": "s", "id": 235048, "pid": 76337, "tid": -914061504, "ts": 1716454225154989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225205057, "dur": 5, "args": { "External id": 235050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235050, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 235050, "pid": 5, "tid": 7, "ts": 1716454225205057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155004, "dur": 6, "args": { "External id": 235050, "cbid": 211, "correlation": 235050 } }, { "ph": "s", "id": 235050, "pid": 76337, "tid": -914061504, "ts": 1716454225155004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155062, "dur": 1, "args": { "External id": 235061, "cbid": 251, "correlation": 235061 } }, { "ph": "f", "id": 235061, "pid": 76337, "tid": -914061504, "ts": 1716454225155062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155066, "dur": 0, "args": { "External id": 235062, "cbid": 251, "correlation": 235062 } }, { "ph": "f", "id": 235062, "pid": 76337, "tid": -914061504, "ts": 1716454225155066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225205063, "dur": 7, "args": { "External id": 235063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235063, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 235063, "pid": 5, "tid": 7, "ts": 1716454225205063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155067, "dur": 11, "args": { "External id": 235063, "cbid": 211, "correlation": 235063 } }, { "ph": "s", "id": 235063, "pid": 76337, "tid": -914061504, "ts": 1716454225155067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225205071, "dur": 3, "args": { "External id": 235065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235065, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 235065, "pid": 5, "tid": 7, "ts": 1716454225205071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155080, "dur": 5, "args": { "External id": 235065, "cbid": 211, "correlation": 235065 } }, { "ph": "s", "id": 235065, "pid": 76337, "tid": -914061504, "ts": 1716454225155080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225205076, "dur": 91, "args": { "External id": 235086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235086, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 235086, "pid": 5, "tid": 7, "ts": 1716454225205076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155155, "dur": 13, "args": { "External id": 235086, "cbid": 211, "correlation": 235086 } }, { "ph": "s", "id": 235086, "pid": 76337, "tid": -914061504, "ts": 1716454225155155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155251, "dur": 1, "args": { "External id": 235104, "cbid": 251, "correlation": 235104 } }, { "ph": "f", "id": 235104, "pid": 76337, "tid": -914061504, "ts": 1716454225155251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225205168, "dur": 97, "args": { "External id": 235106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235106, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235106, "pid": 5, "tid": 7, "ts": 1716454225205168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155258, "dur": 13, "args": { "External id": 235106, "cbid": 211, "correlation": 235106 } }, { "ph": "s", "id": 235106, "pid": 76337, "tid": -914061504, "ts": 1716454225155258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225205267, "dur": 19, "args": { "External id": 235114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235114, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235114, "pid": 5, "tid": 7, "ts": 1716454225205267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155326, "dur": 12, "args": { "External id": 235114, "cbid": 211, "correlation": 235114 } }, { "ph": "s", "id": 235114, "pid": 76337, "tid": -914061504, "ts": 1716454225155326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225205287, "dur": 37, "args": { "External id": 235122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235122, "pid": 5, "tid": 7, "ts": 1716454225205287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155368, "dur": 9, "args": { "External id": 235122, "cbid": 211, "correlation": 235122 } }, { "ph": "s", "id": 235122, "pid": 76337, "tid": -914061504, "ts": 1716454225155368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225205324, "dur": 33, "args": { "External id": 235144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235144, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235144, "pid": 5, "tid": 7, "ts": 1716454225205324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155418, "dur": 10, "args": { "External id": 235144, "cbid": 211, "correlation": 235144 } }, { "ph": "s", "id": 235144, "pid": 76337, "tid": -914061504, "ts": 1716454225155418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155507, "dur": 1, "args": { "External id": 235160, "cbid": 251, "correlation": 235160 } }, { "ph": "f", "id": 235160, "pid": 76337, "tid": -914061504, "ts": 1716454225155507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155512, "dur": 0, "args": { "External id": 235162, "cbid": 251, "correlation": 235162 } }, { "ph": "f", "id": 235162, "pid": 76337, "tid": -914061504, "ts": 1716454225155512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225205359, "dur": 530, "args": { "External id": 235163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235163, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235163, "pid": 5, "tid": 7, "ts": 1716454225205359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155515, "dur": 13, "args": { "External id": 235163, "cbid": 211, "correlation": 235163 } }, { "ph": "s", "id": 235163, "pid": 76337, "tid": -914061504, "ts": 1716454225155515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225205890, "dur": 124, "args": { "External id": 235171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235171, "pid": 5, "tid": 7, "ts": 1716454225205890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155580, "dur": 12, "args": { "External id": 235171, "cbid": 211, "correlation": 235171 } }, { "ph": "s", "id": 235171, "pid": 76337, "tid": -914061504, "ts": 1716454225155580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225206015, "dur": 128, "args": { "External id": 235179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235179, "pid": 5, "tid": 7, "ts": 1716454225206015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155610, "dur": 8, "args": { "External id": 235179, "cbid": 211, "correlation": 235179 } }, { "ph": "s", "id": 235179, "pid": 76337, "tid": -914061504, "ts": 1716454225155610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225155687, "dur": 1, "args": { "External id": 235195, "cbid": 251, "correlation": 235195 } }, { "ph": "f", "id": 235195, "pid": 76337, "tid": -914061504, "ts": 1716454225155687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225206144, "dur": 301, "args": { "External id": 235197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235197, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235197, "pid": 5, "tid": 7, "ts": 1716454225206144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155693, "dur": 12, "args": { "External id": 235197, "cbid": 211, "correlation": 235197 } }, { "ph": "s", "id": 235197, "pid": 76337, "tid": -914061504, "ts": 1716454225155693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225206447, "dur": 27, "args": { "External id": 235205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235205, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235205, "pid": 5, "tid": 7, "ts": 1716454225206447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155734, "dur": 9, "args": { "External id": 235205, "cbid": 211, "correlation": 235205 } }, { "ph": "s", "id": 235205, "pid": 76337, "tid": -914061504, "ts": 1716454225155734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225206475, "dur": 79, "args": { "External id": 235216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235216, "pid": 5, "tid": 7, "ts": 1716454225206475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155801, "dur": 13, "args": { "External id": 235216, "cbid": 211, "correlation": 235216 } }, { "ph": "s", "id": 235216, "pid": 76337, "tid": -914061504, "ts": 1716454225155801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225155865, "dur": 0, "args": { "External id": 235228, "cbid": 317, "correlation": 235228 } }, { "ph": "f", "id": 235228, "pid": 76337, "tid": -914061504, "ts": 1716454225155865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225155866, "dur": 0, "args": { "External id": 235229, "cbid": 203, "correlation": 235229 } }, { "ph": "f", "id": 235229, "pid": 76337, "tid": -914061504, "ts": 1716454225155866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225155866, "dur": 0, "args": { "External id": 235230, "cbid": 205, "correlation": 235230 } }, { "ph": "f", "id": 235230, "pid": 76337, "tid": -914061504, "ts": 1716454225155866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225206555, "dur": 24, "args": { "External id": 235234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235234, "pid": 5, "tid": 7, "ts": 1716454225206555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155881, "dur": 12, "args": { "External id": 235234, "cbid": 211, "correlation": 235234 } }, { "ph": "s", "id": 235234, "pid": 76337, "tid": -914061504, "ts": 1716454225155881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225206580, "dur": 118, "args": { "External id": 235236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235236, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235236, "pid": 5, "tid": 7, "ts": 1716454225206580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155900, "dur": 6, "args": { "External id": 235236, "cbid": 211, "correlation": 235236 } }, { "ph": "s", "id": 235236, "pid": 76337, "tid": -914061504, "ts": 1716454225155900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225206700, "dur": 22, "args": { "External id": 235238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235238, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235238, "pid": 5, "tid": 7, "ts": 1716454225206700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155910, "dur": 5, "args": { "External id": 235238, "cbid": 211, "correlation": 235238 } }, { "ph": "s", "id": 235238, "pid": 76337, "tid": -914061504, "ts": 1716454225155910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225206723, "dur": 32, "args": { "External id": 235244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235244, "pid": 5, "tid": 7, "ts": 1716454225206723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155937, "dur": 8, "args": { "External id": 235244, "cbid": 211, "correlation": 235244 } }, { "ph": "s", "id": 235244, "pid": 76337, "tid": -914061504, "ts": 1716454225155937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225206756, "dur": 26, "args": { "External id": 235252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235252, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235252, "pid": 5, "tid": 7, "ts": 1716454225206756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225155969, "dur": 16, "args": { "External id": 235252, "cbid": 211, "correlation": 235252 } }, { "ph": "s", "id": 235252, "pid": 76337, "tid": -914061504, "ts": 1716454225155969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225156048, "dur": 0, "args": { "External id": 235262, "cbid": 317, "correlation": 235262 } }, { "ph": "f", "id": 235262, "pid": 76337, "tid": -914061504, "ts": 1716454225156048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225156049, "dur": 0, "args": { "External id": 235263, "cbid": 203, "correlation": 235263 } }, { "ph": "f", "id": 235263, "pid": 76337, "tid": -914061504, "ts": 1716454225156049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225156050, "dur": 0, "args": { "External id": 235264, "cbid": 205, "correlation": 235264 } }, { "ph": "f", "id": 235264, "pid": 76337, "tid": -914061504, "ts": 1716454225156050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225206784, "dur": 23, "args": { "External id": 235268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235268, "pid": 5, "tid": 7, "ts": 1716454225206784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156064, "dur": 12, "args": { "External id": 235268, "cbid": 211, "correlation": 235268 } }, { "ph": "s", "id": 235268, "pid": 76337, "tid": -914061504, "ts": 1716454225156064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225206808, "dur": 44, "args": { "External id": 235270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235270, "pid": 5, "tid": 7, "ts": 1716454225206808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156079, "dur": 5, "args": { "External id": 235270, "cbid": 211, "correlation": 235270 } }, { "ph": "s", "id": 235270, "pid": 76337, "tid": -914061504, "ts": 1716454225156079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225206853, "dur": 230, "args": { "External id": 235272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235272, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 235272, "pid": 5, "tid": 7, "ts": 1716454225206853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156091, "dur": 7, "args": { "External id": 235272, "cbid": 211, "correlation": 235272 } }, { "ph": "s", "id": 235272, "pid": 76337, "tid": -914061504, "ts": 1716454225156091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207084, "dur": 6, "args": { "External id": 235274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235274, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235274, "pid": 5, "tid": 7, "ts": 1716454225207084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156101, "dur": 5, "args": { "External id": 235274, "cbid": 211, "correlation": 235274 } }, { "ph": "s", "id": 235274, "pid": 76337, "tid": -914061504, "ts": 1716454225156101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225207092, "dur": 9, "args": { "External id": 235280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235280, "pid": 5, "tid": 7, "ts": 1716454225207092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156128, "dur": 9, "args": { "External id": 235280, "cbid": 211, "correlation": 235280 } }, { "ph": "s", "id": 235280, "pid": 76337, "tid": -914061504, "ts": 1716454225156128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225207102, "dur": 12, "args": { "External id": 235300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235300, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 235300, "pid": 5, "tid": 7, "ts": 1716454225207102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156220, "dur": 12, "args": { "External id": 235300, "cbid": 211, "correlation": 235300 } }, { "ph": "s", "id": 235300, "pid": 76337, "tid": -914061504, "ts": 1716454225156220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225207115, "dur": 4, "args": { "External id": 235312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235312, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 235312, "pid": 5, "tid": 7, "ts": 1716454225207115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156243, "dur": 6, "args": { "External id": 235312, "cbid": 211, "correlation": 235312 } }, { "ph": "s", "id": 235312, "pid": 76337, "tid": -914061504, "ts": 1716454225156243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225207121, "dur": 12, "args": { "External id": 235315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235315, "pid": 5, "tid": 7, "ts": 1716454225207121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156261, "dur": 7, "args": { "External id": 235315, "cbid": 211, "correlation": 235315 } }, { "ph": "s", "id": 235315, "pid": 76337, "tid": -914061504, "ts": 1716454225156261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225207133, "dur": 7, "args": { "External id": 235324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235324, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235324, "pid": 5, "tid": 7, "ts": 1716454225207133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156301, "dur": 11, "args": { "External id": 235324, "cbid": 211, "correlation": 235324 } }, { "ph": "s", "id": 235324, "pid": 76337, "tid": -914061504, "ts": 1716454225156301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225156354, "dur": 0, "args": { "External id": 235334, "cbid": 317, "correlation": 235334 } }, { "ph": "f", "id": 235334, "pid": 76337, "tid": -914061504, "ts": 1716454225156354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225156354, "dur": 0, "args": { "External id": 235335, "cbid": 203, "correlation": 235335 } }, { "ph": "f", "id": 235335, "pid": 76337, "tid": -914061504, "ts": 1716454225156354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225156355, "dur": 0, "args": { "External id": 235336, "cbid": 205, "correlation": 235336 } }, { "ph": "f", "id": 235336, "pid": 76337, "tid": -914061504, "ts": 1716454225156355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207141, "dur": 6, "args": { "External id": 235340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235340, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235340, "pid": 5, "tid": 7, "ts": 1716454225207141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156371, "dur": 11, "args": { "External id": 235340, "cbid": 211, "correlation": 235340 } }, { "ph": "s", "id": 235340, "pid": 76337, "tid": -914061504, "ts": 1716454225156371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207148, "dur": 82, "args": { "External id": 235342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235342, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235342, "pid": 5, "tid": 7, "ts": 1716454225207148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156385, "dur": 5, "args": { "External id": 235342, "cbid": 211, "correlation": 235342 } }, { "ph": "s", "id": 235342, "pid": 76337, "tid": -914061504, "ts": 1716454225156385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225207232, "dur": 1, "args": { "External id": 235344, "device": 5, "context": 1, "stream": 7, "correlation": 235344, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 235344, "pid": 5, "tid": 7, "ts": 1716454225207232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225156398, "dur": 9, "args": { "External id": 235344, "cbid": 51, "correlation": 235344 } }, { "ph": "s", "id": 235344, "pid": 76337, "tid": -914061504, "ts": 1716454225156398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225207236, "dur": 535, "args": { "External id": 235345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235345, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235345, "pid": 5, "tid": 7, "ts": 1716454225207236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156408, "dur": 8, "args": { "External id": 235345, "cbid": 211, "correlation": 235345 } }, { "ph": "s", "id": 235345, "pid": 76337, "tid": -914061504, "ts": 1716454225156408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207772, "dur": 12, "args": { "External id": 235347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235347, "pid": 5, "tid": 7, "ts": 1716454225207772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156420, "dur": 5, "args": { "External id": 235347, "cbid": 211, "correlation": 235347 } }, { "ph": "s", "id": 235347, "pid": 76337, "tid": -914061504, "ts": 1716454225156420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225207785, "dur": 14, "args": { "External id": 235353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235353, "pid": 5, "tid": 7, "ts": 1716454225207785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156447, "dur": 9, "args": { "External id": 235353, "cbid": 211, "correlation": 235353 } }, { "ph": "s", "id": 235353, "pid": 76337, "tid": -914061504, "ts": 1716454225156447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225207800, "dur": 4, "args": { "External id": 235361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235361, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 235361, "pid": 5, "tid": 7, "ts": 1716454225207800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156491, "dur": 9, "args": { "External id": 235361, "cbid": 211, "correlation": 235361 } }, { "ph": "s", "id": 235361, "pid": 76337, "tid": -914061504, "ts": 1716454225156491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225156558, "dur": 1, "args": { "External id": 235377, "cbid": 251, "correlation": 235377 } }, { "ph": "f", "id": 235377, "pid": 76337, "tid": -914061504, "ts": 1716454225156558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225156563, "dur": 0, "args": { "External id": 235379, "cbid": 251, "correlation": 235379 } }, { "ph": "f", "id": 235379, "pid": 76337, "tid": -914061504, "ts": 1716454225156563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225207805, "dur": 13, "args": { "External id": 235380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235380, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235380, "pid": 5, "tid": 7, "ts": 1716454225207805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156565, "dur": 11, "args": { "External id": 235380, "cbid": 211, "correlation": 235380 } }, { "ph": "s", "id": 235380, "pid": 76337, "tid": -914061504, "ts": 1716454225156565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225207819, "dur": 5, "args": { "External id": 235382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235382, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235382, "pid": 5, "tid": 7, "ts": 1716454225207819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156578, "dur": 6, "args": { "External id": 235382, "cbid": 211, "correlation": 235382 } }, { "ph": "s", "id": 235382, "pid": 76337, "tid": -914061504, "ts": 1716454225156578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225207825, "dur": 16, "args": { "External id": 235392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235392, "pid": 5, "tid": 7, "ts": 1716454225207825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156636, "dur": 12, "args": { "External id": 235392, "cbid": 211, "correlation": 235392 } }, { "ph": "s", "id": 235392, "pid": 76337, "tid": -914061504, "ts": 1716454225156636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225207843, "dur": 17, "args": { "External id": 235412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235412, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 235412, "pid": 5, "tid": 7, "ts": 1716454225207843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156701, "dur": 11, "args": { "External id": 235412, "cbid": 211, "correlation": 235412 } }, { "ph": "s", "id": 235412, "pid": 76337, "tid": -914061504, "ts": 1716454225156701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225207861, "dur": 4, "args": { "External id": 235424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235424, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 235424, "pid": 5, "tid": 7, "ts": 1716454225207861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156723, "dur": 6, "args": { "External id": 235424, "cbid": 211, "correlation": 235424 } }, { "ph": "s", "id": 235424, "pid": 76337, "tid": -914061504, "ts": 1716454225156723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225207867, "dur": 17, "args": { "External id": 235427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235427, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235427, "pid": 5, "tid": 7, "ts": 1716454225207867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156741, "dur": 6, "args": { "External id": 235427, "cbid": 211, "correlation": 235427 } }, { "ph": "s", "id": 235427, "pid": 76337, "tid": -914061504, "ts": 1716454225156741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225207885, "dur": 11, "args": { "External id": 235436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235436, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235436, "pid": 5, "tid": 7, "ts": 1716454225207885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156781, "dur": 9, "args": { "External id": 235436, "cbid": 211, "correlation": 235436 } }, { "ph": "s", "id": 235436, "pid": 76337, "tid": -914061504, "ts": 1716454225156781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225156844, "dur": 0, "args": { "External id": 235446, "cbid": 317, "correlation": 235446 } }, { "ph": "f", "id": 235446, "pid": 76337, "tid": -914061504, "ts": 1716454225156844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225156845, "dur": 0, "args": { "External id": 235447, "cbid": 203, "correlation": 235447 } }, { "ph": "f", "id": 235447, "pid": 76337, "tid": -914061504, "ts": 1716454225156845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225156846, "dur": 0, "args": { "External id": 235448, "cbid": 205, "correlation": 235448 } }, { "ph": "f", "id": 235448, "pid": 76337, "tid": -914061504, "ts": 1716454225156846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207897, "dur": 11, "args": { "External id": 235452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235452, "pid": 5, "tid": 7, "ts": 1716454225207897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156863, "dur": 12, "args": { "External id": 235452, "cbid": 211, "correlation": 235452 } }, { "ph": "s", "id": 235452, "pid": 76337, "tid": -914061504, "ts": 1716454225156863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225207909, "dur": 162, "args": { "External id": 235454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235454, "pid": 5, "tid": 7, "ts": 1716454225207909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156877, "dur": 5, "args": { "External id": 235454, "cbid": 211, "correlation": 235454 } }, { "ph": "s", "id": 235454, "pid": 76337, "tid": -914061504, "ts": 1716454225156877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225208073, "dur": 1, "args": { "External id": 235456, "device": 5, "context": 1, "stream": 7, "correlation": 235456, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 235456, "pid": 5, "tid": 7, "ts": 1716454225208073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225156889, "dur": 7, "args": { "External id": 235456, "cbid": 51, "correlation": 235456 } }, { "ph": "s", "id": 235456, "pid": 76337, "tid": -914061504, "ts": 1716454225156889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225208077, "dur": 656, "args": { "External id": 235457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235457, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235457, "pid": 5, "tid": 7, "ts": 1716454225208077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156898, "dur": 7, "args": { "External id": 235457, "cbid": 211, "correlation": 235457 } }, { "ph": "s", "id": 235457, "pid": 76337, "tid": -914061504, "ts": 1716454225156898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225208735, "dur": 12, "args": { "External id": 235459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235459, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235459, "pid": 5, "tid": 7, "ts": 1716454225208735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156909, "dur": 5, "args": { "External id": 235459, "cbid": 211, "correlation": 235459 } }, { "ph": "s", "id": 235459, "pid": 76337, "tid": -914061504, "ts": 1716454225156909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225208748, "dur": 15, "args": { "External id": 235465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235465, "pid": 5, "tid": 7, "ts": 1716454225208748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225156937, "dur": 9, "args": { "External id": 235465, "cbid": 211, "correlation": 235465 } }, { "ph": "s", "id": 235465, "pid": 76337, "tid": -914061504, "ts": 1716454225156937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225157005, "dur": 0, "args": { "External id": 235475, "cbid": 317, "correlation": 235475 } }, { "ph": "f", "id": 235475, "pid": 76337, "tid": -914061504, "ts": 1716454225157005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225157006, "dur": 0, "args": { "External id": 235476, "cbid": 203, "correlation": 235476 } }, { "ph": "f", "id": 235476, "pid": 76337, "tid": -914061504, "ts": 1716454225157006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225157006, "dur": 0, "args": { "External id": 235477, "cbid": 205, "correlation": 235477 } }, { "ph": "f", "id": 235477, "pid": 76337, "tid": -914061504, "ts": 1716454225157006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225208764, "dur": 8, "args": { "External id": 235481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235481, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235481, "pid": 5, "tid": 7, "ts": 1716454225208764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157020, "dur": 12, "args": { "External id": 235481, "cbid": 211, "correlation": 235481 } }, { "ph": "s", "id": 235481, "pid": 76337, "tid": -914061504, "ts": 1716454225157020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225208774, "dur": 3, "args": { "External id": 235483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235483, "pid": 5, "tid": 7, "ts": 1716454225208774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157037, "dur": 6, "args": { "External id": 235483, "cbid": 211, "correlation": 235483 } }, { "ph": "s", "id": 235483, "pid": 76337, "tid": -914061504, "ts": 1716454225157037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225157046, "dur": 0, "args": { "External id": 235484, "cbid": 51, "correlation": 235484 } }, { "ph": "s", "id": 235484, "pid": 76337, "tid": -914061504, "ts": 1716454225157046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225208778, "dur": 55, "args": { "External id": 235485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235485, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 235485, "pid": 5, "tid": 7, "ts": 1716454225208778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157047, "dur": 5, "args": { "External id": 235485, "cbid": 211, "correlation": 235485 } }, { "ph": "s", "id": 235485, "pid": 76337, "tid": -914061504, "ts": 1716454225157047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225208835, "dur": 14, "args": { "External id": 235490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235490, "pid": 5, "tid": 7, "ts": 1716454225208835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157074, "dur": 8, "args": { "External id": 235490, "cbid": 211, "correlation": 235490 } }, { "ph": "s", "id": 235490, "pid": 76337, "tid": -914061504, "ts": 1716454225157074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225208850, "dur": 12, "args": { "External id": 235498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235498, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235498, "pid": 5, "tid": 7, "ts": 1716454225208850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157104, "dur": 8, "args": { "External id": 235498, "cbid": 211, "correlation": 235498 } }, { "ph": "s", "id": 235498, "pid": 76337, "tid": -914061504, "ts": 1716454225157104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225208863, "dur": 11, "args": { "External id": 235506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235506, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235506, "pid": 5, "tid": 7, "ts": 1716454225208863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157133, "dur": 8, "args": { "External id": 235506, "cbid": 211, "correlation": 235506 } }, { "ph": "s", "id": 235506, "pid": 76337, "tid": -914061504, "ts": 1716454225157133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225208875, "dur": 18, "args": { "External id": 235526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235526, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 235526, "pid": 5, "tid": 7, "ts": 1716454225208875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157213, "dur": 12, "args": { "External id": 235526, "cbid": 211, "correlation": 235526 } }, { "ph": "s", "id": 235526, "pid": 76337, "tid": -914061504, "ts": 1716454225157213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225208894, "dur": 5, "args": { "External id": 235538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235538, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 235538, "pid": 5, "tid": 7, "ts": 1716454225208894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157235, "dur": 6, "args": { "External id": 235538, "cbid": 211, "correlation": 235538 } }, { "ph": "s", "id": 235538, "pid": 76337, "tid": -914061504, "ts": 1716454225157235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225208900, "dur": 17, "args": { "External id": 235541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235541, "pid": 5, "tid": 7, "ts": 1716454225208900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157253, "dur": 6, "args": { "External id": 235541, "cbid": 211, "correlation": 235541 } }, { "ph": "s", "id": 235541, "pid": 76337, "tid": -914061504, "ts": 1716454225157253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225157310, "dur": 0, "args": { "External id": 235552, "cbid": 317, "correlation": 235552 } }, { "ph": "f", "id": 235552, "pid": 76337, "tid": -914061504, "ts": 1716454225157310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225157310, "dur": 0, "args": { "External id": 235553, "cbid": 203, "correlation": 235553 } }, { "ph": "f", "id": 235553, "pid": 76337, "tid": -914061504, "ts": 1716454225157310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225157311, "dur": 0, "args": { "External id": 235554, "cbid": 205, "correlation": 235554 } }, { "ph": "f", "id": 235554, "pid": 76337, "tid": -914061504, "ts": 1716454225157311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225208919, "dur": 11, "args": { "External id": 235558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235558, "pid": 5, "tid": 7, "ts": 1716454225208919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157326, "dur": 11, "args": { "External id": 235558, "cbid": 211, "correlation": 235558 } }, { "ph": "s", "id": 235558, "pid": 76337, "tid": -914061504, "ts": 1716454225157326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225208931, "dur": 3, "args": { "External id": 235560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235560, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235560, "pid": 5, "tid": 7, "ts": 1716454225208931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157343, "dur": 6, "args": { "External id": 235560, "cbid": 211, "correlation": 235560 } }, { "ph": "s", "id": 235560, "pid": 76337, "tid": -914061504, "ts": 1716454225157343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225157352, "dur": 0, "args": { "External id": 235561, "cbid": 51, "correlation": 235561 } }, { "ph": "s", "id": 235561, "pid": 76337, "tid": -914061504, "ts": 1716454225157352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225208936, "dur": 95, "args": { "External id": 235562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235562, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 235562, "pid": 5, "tid": 7, "ts": 1716454225208936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157353, "dur": 5, "args": { "External id": 235562, "cbid": 211, "correlation": 235562 } }, { "ph": "s", "id": 235562, "pid": 76337, "tid": -914061504, "ts": 1716454225157353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225209032, "dur": 15, "args": { "External id": 235567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235567, "pid": 5, "tid": 7, "ts": 1716454225209032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157380, "dur": 8, "args": { "External id": 235567, "cbid": 211, "correlation": 235567 } }, { "ph": "s", "id": 235567, "pid": 76337, "tid": -914061504, "ts": 1716454225157380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225209049, "dur": 81, "args": { "External id": 235576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235576, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235576, "pid": 5, "tid": 7, "ts": 1716454225209049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157462, "dur": 14, "args": { "External id": 235576, "cbid": 211, "correlation": 235576 } }, { "ph": "s", "id": 235576, "pid": 76337, "tid": -914061504, "ts": 1716454225157462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225209131, "dur": 29, "args": { "External id": 235598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235598, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235598, "pid": 5, "tid": 7, "ts": 1716454225209131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157518, "dur": 10, "args": { "External id": 235598, "cbid": 211, "correlation": 235598 } }, { "ph": "s", "id": 235598, "pid": 76337, "tid": -914061504, "ts": 1716454225157518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225157608, "dur": 1, "args": { "External id": 235609, "cbid": 251, "correlation": 235609 } }, { "ph": "f", "id": 235609, "pid": 76337, "tid": -914061504, "ts": 1716454225157608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225209161, "dur": 162, "args": { "External id": 235610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235610, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235610, "pid": 5, "tid": 7, "ts": 1716454225209161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157614, "dur": 13, "args": { "External id": 235610, "cbid": 211, "correlation": 235610 } }, { "ph": "s", "id": 235610, "pid": 76337, "tid": -914061504, "ts": 1716454225157614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225157684, "dur": 1, "args": { "External id": 235621, "cbid": 251, "correlation": 235621 } }, { "ph": "f", "id": 235621, "pid": 76337, "tid": -914061504, "ts": 1716454225157684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225209324, "dur": 157, "args": { "External id": 235622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235622, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235622, "pid": 5, "tid": 7, "ts": 1716454225209324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157688, "dur": 12, "args": { "External id": 235622, "cbid": 211, "correlation": 235622 } }, { "ph": "s", "id": 235622, "pid": 76337, "tid": -914061504, "ts": 1716454225157688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225157753, "dur": 1, "args": { "External id": 235633, "cbid": 251, "correlation": 235633 } }, { "ph": "f", "id": 235633, "pid": 76337, "tid": -914061504, "ts": 1716454225157753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225209482, "dur": 156, "args": { "External id": 235634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235634, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235634, "pid": 5, "tid": 7, "ts": 1716454225209482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157757, "dur": 11, "args": { "External id": 235634, "cbid": 211, "correlation": 235634 } }, { "ph": "s", "id": 235634, "pid": 76337, "tid": -914061504, "ts": 1716454225157757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225209639, "dur": 334, "args": { "External id": 235659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235659, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235659, "pid": 5, "tid": 7, "ts": 1716454225209639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157842, "dur": 13, "args": { "External id": 235659, "cbid": 211, "correlation": 235659 } }, { "ph": "s", "id": 235659, "pid": 76337, "tid": -914061504, "ts": 1716454225157842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225157942, "dur": 1, "args": { "External id": 235677, "cbid": 251, "correlation": 235677 } }, { "ph": "f", "id": 235677, "pid": 76337, "tid": -914061504, "ts": 1716454225157942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225209975, "dur": 162, "args": { "External id": 235679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235679, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235679, "pid": 5, "tid": 7, "ts": 1716454225209975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225157948, "dur": 13, "args": { "External id": 235679, "cbid": 211, "correlation": 235679 } }, { "ph": "s", "id": 235679, "pid": 76337, "tid": -914061504, "ts": 1716454225157948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225210138, "dur": 19, "args": { "External id": 235687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235687, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235687, "pid": 5, "tid": 7, "ts": 1716454225210138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158027, "dur": 13, "args": { "External id": 235687, "cbid": 211, "correlation": 235687 } }, { "ph": "s", "id": 235687, "pid": 76337, "tid": -914061504, "ts": 1716454225158027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225210158, "dur": 27, "args": { "External id": 235695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235695, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235695, "pid": 5, "tid": 7, "ts": 1716454225210158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158068, "dur": 9, "args": { "External id": 235695, "cbid": 211, "correlation": 235695 } }, { "ph": "s", "id": 235695, "pid": 76337, "tid": -914061504, "ts": 1716454225158068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225210187, "dur": 19, "args": { "External id": 235706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235706, "pid": 5, "tid": 7, "ts": 1716454225210187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158139, "dur": 13, "args": { "External id": 235706, "cbid": 211, "correlation": 235706 } }, { "ph": "s", "id": 235706, "pid": 76337, "tid": -914061504, "ts": 1716454225158139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225210207, "dur": 15, "args": { "External id": 235728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235728, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235728, "pid": 5, "tid": 7, "ts": 1716454225210207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158169, "dur": 8, "args": { "External id": 235728, "cbid": 211, "correlation": 235728 } }, { "ph": "s", "id": 235728, "pid": 76337, "tid": -914061504, "ts": 1716454225158169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158255, "dur": 1, "args": { "External id": 235739, "cbid": 251, "correlation": 235739 } }, { "ph": "f", "id": 235739, "pid": 76337, "tid": -914061504, "ts": 1716454225158255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225210224, "dur": 87, "args": { "External id": 235740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235740, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235740, "pid": 5, "tid": 7, "ts": 1716454225210224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158261, "dur": 14, "args": { "External id": 235740, "cbid": 211, "correlation": 235740 } }, { "ph": "s", "id": 235740, "pid": 76337, "tid": -914061504, "ts": 1716454225158261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158333, "dur": 1, "args": { "External id": 235751, "cbid": 251, "correlation": 235751 } }, { "ph": "f", "id": 235751, "pid": 76337, "tid": -914061504, "ts": 1716454225158333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158337, "dur": 0, "args": { "External id": 235752, "cbid": 251, "correlation": 235752 } }, { "ph": "f", "id": 235752, "pid": 76337, "tid": -914061504, "ts": 1716454225158337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225210313, "dur": 12, "args": { "External id": 235753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235753, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235753, "pid": 5, "tid": 7, "ts": 1716454225210313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158339, "dur": 12, "args": { "External id": 235753, "cbid": 211, "correlation": 235753 } }, { "ph": "s", "id": 235753, "pid": 76337, "tid": -914061504, "ts": 1716454225158339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225210326, "dur": 5, "args": { "External id": 235755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235755, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235755, "pid": 5, "tid": 7, "ts": 1716454225210326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158353, "dur": 6, "args": { "External id": 235755, "cbid": 211, "correlation": 235755 } }, { "ph": "s", "id": 235755, "pid": 76337, "tid": -914061504, "ts": 1716454225158353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158410, "dur": 1, "args": { "External id": 235766, "cbid": 251, "correlation": 235766 } }, { "ph": "f", "id": 235766, "pid": 76337, "tid": -914061504, "ts": 1716454225158410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158413, "dur": 0, "args": { "External id": 235767, "cbid": 251, "correlation": 235767 } }, { "ph": "f", "id": 235767, "pid": 76337, "tid": -914061504, "ts": 1716454225158413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225210333, "dur": 8, "args": { "External id": 235768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235768, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235768, "pid": 5, "tid": 7, "ts": 1716454225210333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158415, "dur": 11, "args": { "External id": 235768, "cbid": 211, "correlation": 235768 } }, { "ph": "s", "id": 235768, "pid": 76337, "tid": -914061504, "ts": 1716454225158415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225210343, "dur": 3, "args": { "External id": 235770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235770, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235770, "pid": 5, "tid": 7, "ts": 1716454225210343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158428, "dur": 5, "args": { "External id": 235770, "cbid": 211, "correlation": 235770 } }, { "ph": "s", "id": 235770, "pid": 76337, "tid": -914061504, "ts": 1716454225158428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225210347, "dur": 54, "args": { "External id": 235795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235795, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235795, "pid": 5, "tid": 7, "ts": 1716454225210347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158505, "dur": 12, "args": { "External id": 235795, "cbid": 211, "correlation": 235795 } }, { "ph": "s", "id": 235795, "pid": 76337, "tid": -914061504, "ts": 1716454225158505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158604, "dur": 2, "args": { "External id": 235813, "cbid": 251, "correlation": 235813 } }, { "ph": "f", "id": 235813, "pid": 76337, "tid": -914061504, "ts": 1716454225158604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225210402, "dur": 89, "args": { "External id": 235815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235815, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235815, "pid": 5, "tid": 7, "ts": 1716454225210402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158610, "dur": 15, "args": { "External id": 235815, "cbid": 211, "correlation": 235815 } }, { "ph": "s", "id": 235815, "pid": 76337, "tid": -914061504, "ts": 1716454225158610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225210492, "dur": 9, "args": { "External id": 235823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235823, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235823, "pid": 5, "tid": 7, "ts": 1716454225210492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158681, "dur": 13, "args": { "External id": 235823, "cbid": 211, "correlation": 235823 } }, { "ph": "s", "id": 235823, "pid": 76337, "tid": -914061504, "ts": 1716454225158681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225210503, "dur": 21, "args": { "External id": 235831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235831, "pid": 5, "tid": 7, "ts": 1716454225210503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158722, "dur": 9, "args": { "External id": 235831, "cbid": 211, "correlation": 235831 } }, { "ph": "s", "id": 235831, "pid": 76337, "tid": -914061504, "ts": 1716454225158722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225210525, "dur": 17, "args": { "External id": 235853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235853, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235853, "pid": 5, "tid": 7, "ts": 1716454225210525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158775, "dur": 11, "args": { "External id": 235853, "cbid": 211, "correlation": 235853 } }, { "ph": "s", "id": 235853, "pid": 76337, "tid": -914061504, "ts": 1716454225158775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158865, "dur": 2, "args": { "External id": 235869, "cbid": 251, "correlation": 235869 } }, { "ph": "f", "id": 235869, "pid": 76337, "tid": -914061504, "ts": 1716454225158865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225158871, "dur": 0, "args": { "External id": 235871, "cbid": 251, "correlation": 235871 } }, { "ph": "f", "id": 235871, "pid": 76337, "tid": -914061504, "ts": 1716454225158871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225210544, "dur": 489, "args": { "External id": 235872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235872, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 235872, "pid": 5, "tid": 7, "ts": 1716454225210544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158873, "dur": 16, "args": { "External id": 235872, "cbid": 211, "correlation": 235872 } }, { "ph": "s", "id": 235872, "pid": 76337, "tid": -914061504, "ts": 1716454225158873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225211035, "dur": 65, "args": { "External id": 235880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235880, "pid": 5, "tid": 7, "ts": 1716454225211035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158941, "dur": 12, "args": { "External id": 235880, "cbid": 211, "correlation": 235880 } }, { "ph": "s", "id": 235880, "pid": 76337, "tid": -914061504, "ts": 1716454225158941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225211101, "dur": 68, "args": { "External id": 235888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235888, "pid": 5, "tid": 7, "ts": 1716454225211101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225158971, "dur": 18, "args": { "External id": 235888, "cbid": 211, "correlation": 235888 } }, { "ph": "s", "id": 235888, "pid": 76337, "tid": -914061504, "ts": 1716454225158971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225159061, "dur": 1, "args": { "External id": 235904, "cbid": 251, "correlation": 235904 } }, { "ph": "f", "id": 235904, "pid": 76337, "tid": -914061504, "ts": 1716454225159061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225211171, "dur": 1, "args": { "External id": 235906, "device": 5, "context": 1, "stream": 7, "correlation": 235906, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 235906, "pid": 5, "tid": 7, "ts": 1716454225211171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225159067, "dur": 12, "args": { "External id": 235906, "cbid": 51, "correlation": 235906 } }, { "ph": "s", "id": 235906, "pid": 76337, "tid": -914061504, "ts": 1716454225159067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225211175, "dur": 266, "args": { "External id": 235907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235907, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235907, "pid": 5, "tid": 7, "ts": 1716454225211175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159080, "dur": 12, "args": { "External id": 235907, "cbid": 211, "correlation": 235907 } }, { "ph": "s", "id": 235907, "pid": 76337, "tid": -914061504, "ts": 1716454225159080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225211442, "dur": 14, "args": { "External id": 235915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235915, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235915, "pid": 5, "tid": 7, "ts": 1716454225211442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159122, "dur": 11, "args": { "External id": 235915, "cbid": 211, "correlation": 235915 } }, { "ph": "s", "id": 235915, "pid": 76337, "tid": -914061504, "ts": 1716454225159122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225211457, "dur": 36, "args": { "External id": 235926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235926, "pid": 5, "tid": 7, "ts": 1716454225211457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159191, "dur": 12, "args": { "External id": 235926, "cbid": 211, "correlation": 235926 } }, { "ph": "s", "id": 235926, "pid": 76337, "tid": -914061504, "ts": 1716454225159191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225159256, "dur": 0, "args": { "External id": 235938, "cbid": 317, "correlation": 235938 } }, { "ph": "f", "id": 235938, "pid": 76337, "tid": -914061504, "ts": 1716454225159256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225159257, "dur": 0, "args": { "External id": 235939, "cbid": 203, "correlation": 235939 } }, { "ph": "f", "id": 235939, "pid": 76337, "tid": -914061504, "ts": 1716454225159257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225159257, "dur": 0, "args": { "External id": 235940, "cbid": 205, "correlation": 235940 } }, { "ph": "f", "id": 235940, "pid": 76337, "tid": -914061504, "ts": 1716454225159257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225211494, "dur": 13, "args": { "External id": 235944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235944, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235944, "pid": 5, "tid": 7, "ts": 1716454225211494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159272, "dur": 13, "args": { "External id": 235944, "cbid": 211, "correlation": 235944 } }, { "ph": "s", "id": 235944, "pid": 76337, "tid": -914061504, "ts": 1716454225159272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225211509, "dur": 4, "args": { "External id": 235946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 235946, "pid": 5, "tid": 7, "ts": 1716454225211509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159289, "dur": 6, "args": { "External id": 235946, "cbid": 211, "correlation": 235946 } }, { "ph": "s", "id": 235946, "pid": 76337, "tid": -914061504, "ts": 1716454225159289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225159298, "dur": 0, "args": { "External id": 235947, "cbid": 51, "correlation": 235947 } }, { "ph": "s", "id": 235947, "pid": 76337, "tid": -914061504, "ts": 1716454225159298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225211514, "dur": 94, "args": { "External id": 235948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235948, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 235948, "pid": 5, "tid": 7, "ts": 1716454225211514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159299, "dur": 5, "args": { "External id": 235948, "cbid": 211, "correlation": 235948 } }, { "ph": "s", "id": 235948, "pid": 76337, "tid": -914061504, "ts": 1716454225159299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225211609, "dur": 16, "args": { "External id": 235953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235953, "pid": 5, "tid": 7, "ts": 1716454225211609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159325, "dur": 10, "args": { "External id": 235953, "cbid": 211, "correlation": 235953 } }, { "ph": "s", "id": 235953, "pid": 76337, "tid": -914061504, "ts": 1716454225159325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225211626, "dur": 12, "args": { "External id": 235961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235961, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235961, "pid": 5, "tid": 7, "ts": 1716454225211626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159357, "dur": 8, "args": { "External id": 235961, "cbid": 211, "correlation": 235961 } }, { "ph": "s", "id": 235961, "pid": 76337, "tid": -914061504, "ts": 1716454225159357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225211639, "dur": 18, "args": { "External id": 235981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 235981, "pid": 5, "tid": 7, "ts": 1716454225211639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159428, "dur": 12, "args": { "External id": 235981, "cbid": 211, "correlation": 235981 } }, { "ph": "s", "id": 235981, "pid": 76337, "tid": -914061504, "ts": 1716454225159428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225211659, "dur": 5, "args": { "External id": 235993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 235993, "pid": 5, "tid": 7, "ts": 1716454225211659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159450, "dur": 6, "args": { "External id": 235993, "cbid": 211, "correlation": 235993 } }, { "ph": "s", "id": 235993, "pid": 76337, "tid": -914061504, "ts": 1716454225159450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225211665, "dur": 18, "args": { "External id": 235996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 235996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 235996, "pid": 5, "tid": 7, "ts": 1716454225211665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159468, "dur": 6, "args": { "External id": 235996, "cbid": 211, "correlation": 235996 } }, { "ph": "s", "id": 235996, "pid": 76337, "tid": -914061504, "ts": 1716454225159468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225211684, "dur": 11, "args": { "External id": 236005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236005, "pid": 5, "tid": 7, "ts": 1716454225211684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159506, "dur": 10, "args": { "External id": 236005, "cbid": 211, "correlation": 236005 } }, { "ph": "s", "id": 236005, "pid": 76337, "tid": -914061504, "ts": 1716454225159506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225159558, "dur": 0, "args": { "External id": 236015, "cbid": 317, "correlation": 236015 } }, { "ph": "f", "id": 236015, "pid": 76337, "tid": -914061504, "ts": 1716454225159558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225159559, "dur": 0, "args": { "External id": 236016, "cbid": 203, "correlation": 236016 } }, { "ph": "f", "id": 236016, "pid": 76337, "tid": -914061504, "ts": 1716454225159559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225159559, "dur": 0, "args": { "External id": 236017, "cbid": 205, "correlation": 236017 } }, { "ph": "f", "id": 236017, "pid": 76337, "tid": -914061504, "ts": 1716454225159559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225211697, "dur": 11, "args": { "External id": 236021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236021, "pid": 5, "tid": 7, "ts": 1716454225211697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159573, "dur": 12, "args": { "External id": 236021, "cbid": 211, "correlation": 236021 } }, { "ph": "s", "id": 236021, "pid": 76337, "tid": -914061504, "ts": 1716454225159573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225211709, "dur": 159, "args": { "External id": 236023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236023, "pid": 5, "tid": 7, "ts": 1716454225211709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159587, "dur": 5, "args": { "External id": 236023, "cbid": 211, "correlation": 236023 } }, { "ph": "s", "id": 236023, "pid": 76337, "tid": -914061504, "ts": 1716454225159587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225211871, "dur": 1, "args": { "External id": 236025, "device": 5, "context": 1, "stream": 7, "correlation": 236025, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 236025, "pid": 5, "tid": 7, "ts": 1716454225211871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225159598, "dur": 6, "args": { "External id": 236025, "cbid": 51, "correlation": 236025 } }, { "ph": "s", "id": 236025, "pid": 76337, "tid": -914061504, "ts": 1716454225159598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225211874, "dur": 656, "args": { "External id": 236026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236026, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236026, "pid": 5, "tid": 7, "ts": 1716454225211874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159605, "dur": 6, "args": { "External id": 236026, "cbid": 211, "correlation": 236026 } }, { "ph": "s", "id": 236026, "pid": 76337, "tid": -914061504, "ts": 1716454225159605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225212531, "dur": 13, "args": { "External id": 236028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236028, "pid": 5, "tid": 7, "ts": 1716454225212531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159616, "dur": 5, "args": { "External id": 236028, "cbid": 211, "correlation": 236028 } }, { "ph": "s", "id": 236028, "pid": 76337, "tid": -914061504, "ts": 1716454225159616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225212546, "dur": 14, "args": { "External id": 236034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236034, "pid": 5, "tid": 7, "ts": 1716454225212546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159644, "dur": 8, "args": { "External id": 236034, "cbid": 211, "correlation": 236034 } }, { "ph": "s", "id": 236034, "pid": 76337, "tid": -914061504, "ts": 1716454225159644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225212561, "dur": 3, "args": { "External id": 236042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236042, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 236042, "pid": 5, "tid": 7, "ts": 1716454225212561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159687, "dur": 9, "args": { "External id": 236042, "cbid": 211, "correlation": 236042 } }, { "ph": "s", "id": 236042, "pid": 76337, "tid": -914061504, "ts": 1716454225159687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225159751, "dur": 1, "args": { "External id": 236058, "cbid": 251, "correlation": 236058 } }, { "ph": "f", "id": 236058, "pid": 76337, "tid": -914061504, "ts": 1716454225159751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225159756, "dur": 0, "args": { "External id": 236060, "cbid": 251, "correlation": 236060 } }, { "ph": "f", "id": 236060, "pid": 76337, "tid": -914061504, "ts": 1716454225159756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225212566, "dur": 13, "args": { "External id": 236061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236061, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236061, "pid": 5, "tid": 7, "ts": 1716454225212566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159758, "dur": 11, "args": { "External id": 236061, "cbid": 211, "correlation": 236061 } }, { "ph": "s", "id": 236061, "pid": 76337, "tid": -914061504, "ts": 1716454225159758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225212580, "dur": 5, "args": { "External id": 236063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236063, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236063, "pid": 5, "tid": 7, "ts": 1716454225212580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159771, "dur": 6, "args": { "External id": 236063, "cbid": 211, "correlation": 236063 } }, { "ph": "s", "id": 236063, "pid": 76337, "tid": -914061504, "ts": 1716454225159771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225212587, "dur": 16, "args": { "External id": 236073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236073, "pid": 5, "tid": 7, "ts": 1716454225212587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159829, "dur": 13, "args": { "External id": 236073, "cbid": 211, "correlation": 236073 } }, { "ph": "s", "id": 236073, "pid": 76337, "tid": -914061504, "ts": 1716454225159829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225212604, "dur": 18, "args": { "External id": 236093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236093, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236093, "pid": 5, "tid": 7, "ts": 1716454225212604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159895, "dur": 11, "args": { "External id": 236093, "cbid": 211, "correlation": 236093 } }, { "ph": "s", "id": 236093, "pid": 76337, "tid": -914061504, "ts": 1716454225159895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225212623, "dur": 4, "args": { "External id": 236105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236105, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236105, "pid": 5, "tid": 7, "ts": 1716454225212623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159915, "dur": 6, "args": { "External id": 236105, "cbid": 211, "correlation": 236105 } }, { "ph": "s", "id": 236105, "pid": 76337, "tid": -914061504, "ts": 1716454225159915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225212629, "dur": 16, "args": { "External id": 236108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236108, "pid": 5, "tid": 7, "ts": 1716454225212629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159935, "dur": 7, "args": { "External id": 236108, "cbid": 211, "correlation": 236108 } }, { "ph": "s", "id": 236108, "pid": 76337, "tid": -914061504, "ts": 1716454225159935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225212646, "dur": 10, "args": { "External id": 236117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236117, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236117, "pid": 5, "tid": 7, "ts": 1716454225212646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225159984, "dur": 10, "args": { "External id": 236117, "cbid": 211, "correlation": 236117 } }, { "ph": "s", "id": 236117, "pid": 76337, "tid": -914061504, "ts": 1716454225159984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225160048, "dur": 0, "args": { "External id": 236127, "cbid": 317, "correlation": 236127 } }, { "ph": "f", "id": 236127, "pid": 76337, "tid": -914061504, "ts": 1716454225160048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225160048, "dur": 0, "args": { "External id": 236128, "cbid": 203, "correlation": 236128 } }, { "ph": "f", "id": 236128, "pid": 76337, "tid": -914061504, "ts": 1716454225160048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225160049, "dur": 0, "args": { "External id": 236129, "cbid": 205, "correlation": 236129 } }, { "ph": "f", "id": 236129, "pid": 76337, "tid": -914061504, "ts": 1716454225160049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225212658, "dur": 11, "args": { "External id": 236133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236133, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236133, "pid": 5, "tid": 7, "ts": 1716454225212658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160063, "dur": 12, "args": { "External id": 236133, "cbid": 211, "correlation": 236133 } }, { "ph": "s", "id": 236133, "pid": 76337, "tid": -914061504, "ts": 1716454225160063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225212670, "dur": 159, "args": { "External id": 236135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236135, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236135, "pid": 5, "tid": 7, "ts": 1716454225212670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160077, "dur": 6, "args": { "External id": 236135, "cbid": 211, "correlation": 236135 } }, { "ph": "s", "id": 236135, "pid": 76337, "tid": -914061504, "ts": 1716454225160077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225212832, "dur": 1, "args": { "External id": 236137, "device": 5, "context": 1, "stream": 7, "correlation": 236137, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 236137, "pid": 5, "tid": 7, "ts": 1716454225212832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225160089, "dur": 7, "args": { "External id": 236137, "cbid": 51, "correlation": 236137 } }, { "ph": "s", "id": 236137, "pid": 76337, "tid": -914061504, "ts": 1716454225160089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225212835, "dur": 642, "args": { "External id": 236138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236138, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236138, "pid": 5, "tid": 7, "ts": 1716454225212835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160097, "dur": 6, "args": { "External id": 236138, "cbid": 211, "correlation": 236138 } }, { "ph": "s", "id": 236138, "pid": 76337, "tid": -914061504, "ts": 1716454225160097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225213479, "dur": 13, "args": { "External id": 236140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236140, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236140, "pid": 5, "tid": 7, "ts": 1716454225213479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160107, "dur": 5, "args": { "External id": 236140, "cbid": 211, "correlation": 236140 } }, { "ph": "s", "id": 236140, "pid": 76337, "tid": -914061504, "ts": 1716454225160107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225213493, "dur": 14, "args": { "External id": 236146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236146, "pid": 5, "tid": 7, "ts": 1716454225213493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160136, "dur": 8, "args": { "External id": 236146, "cbid": 211, "correlation": 236146 } }, { "ph": "s", "id": 236146, "pid": 76337, "tid": -914061504, "ts": 1716454225160136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225213508, "dur": 11, "args": { "External id": 236154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236154, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236154, "pid": 5, "tid": 7, "ts": 1716454225213508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160168, "dur": 9, "args": { "External id": 236154, "cbid": 211, "correlation": 236154 } }, { "ph": "s", "id": 236154, "pid": 76337, "tid": -914061504, "ts": 1716454225160168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225213521, "dur": 10, "args": { "External id": 236162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236162, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236162, "pid": 5, "tid": 7, "ts": 1716454225213521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160197, "dur": 8, "args": { "External id": 236162, "cbid": 211, "correlation": 236162 } }, { "ph": "s", "id": 236162, "pid": 76337, "tid": -914061504, "ts": 1716454225160197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225213532, "dur": 18, "args": { "External id": 236182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236182, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236182, "pid": 5, "tid": 7, "ts": 1716454225213532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160275, "dur": 12, "args": { "External id": 236182, "cbid": 211, "correlation": 236182 } }, { "ph": "s", "id": 236182, "pid": 76337, "tid": -914061504, "ts": 1716454225160275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225213552, "dur": 4, "args": { "External id": 236194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236194, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236194, "pid": 5, "tid": 7, "ts": 1716454225213552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160297, "dur": 6, "args": { "External id": 236194, "cbid": 211, "correlation": 236194 } }, { "ph": "s", "id": 236194, "pid": 76337, "tid": -914061504, "ts": 1716454225160297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225213557, "dur": 16, "args": { "External id": 236197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236197, "pid": 5, "tid": 7, "ts": 1716454225213557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160315, "dur": 6, "args": { "External id": 236197, "cbid": 211, "correlation": 236197 } }, { "ph": "s", "id": 236197, "pid": 76337, "tid": -914061504, "ts": 1716454225160315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225160372, "dur": 0, "args": { "External id": 236208, "cbid": 317, "correlation": 236208 } }, { "ph": "f", "id": 236208, "pid": 76337, "tid": -914061504, "ts": 1716454225160372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225160373, "dur": 0, "args": { "External id": 236209, "cbid": 203, "correlation": 236209 } }, { "ph": "f", "id": 236209, "pid": 76337, "tid": -914061504, "ts": 1716454225160373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225160374, "dur": 0, "args": { "External id": 236210, "cbid": 205, "correlation": 236210 } }, { "ph": "f", "id": 236210, "pid": 76337, "tid": -914061504, "ts": 1716454225160374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225213574, "dur": 13, "args": { "External id": 236214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236214, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236214, "pid": 5, "tid": 7, "ts": 1716454225213574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160388, "dur": 11, "args": { "External id": 236214, "cbid": 211, "correlation": 236214 } }, { "ph": "s", "id": 236214, "pid": 76337, "tid": -914061504, "ts": 1716454225160388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225213588, "dur": 4, "args": { "External id": 236216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 236216, "pid": 5, "tid": 7, "ts": 1716454225213588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160404, "dur": 6, "args": { "External id": 236216, "cbid": 211, "correlation": 236216 } }, { "ph": "s", "id": 236216, "pid": 76337, "tid": -914061504, "ts": 1716454225160404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225160412, "dur": 0, "args": { "External id": 236217, "cbid": 51, "correlation": 236217 } }, { "ph": "s", "id": 236217, "pid": 76337, "tid": -914061504, "ts": 1716454225160412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225213593, "dur": 92, "args": { "External id": 236218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236218, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 236218, "pid": 5, "tid": 7, "ts": 1716454225213593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160413, "dur": 5, "args": { "External id": 236218, "cbid": 211, "correlation": 236218 } }, { "ph": "s", "id": 236218, "pid": 76337, "tid": -914061504, "ts": 1716454225160413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225213686, "dur": 15, "args": { "External id": 236223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236223, "pid": 5, "tid": 7, "ts": 1716454225213686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160440, "dur": 8, "args": { "External id": 236223, "cbid": 211, "correlation": 236223 } }, { "ph": "s", "id": 236223, "pid": 76337, "tid": -914061504, "ts": 1716454225160440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225213702, "dur": 81, "args": { "External id": 236232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236232, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236232, "pid": 5, "tid": 7, "ts": 1716454225213702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160521, "dur": 15, "args": { "External id": 236232, "cbid": 211, "correlation": 236232 } }, { "ph": "s", "id": 236232, "pid": 76337, "tid": -914061504, "ts": 1716454225160521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225213785, "dur": 30, "args": { "External id": 236254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236254, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236254, "pid": 5, "tid": 7, "ts": 1716454225213785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160578, "dur": 10, "args": { "External id": 236254, "cbid": 211, "correlation": 236254 } }, { "ph": "s", "id": 236254, "pid": 76337, "tid": -914061504, "ts": 1716454225160578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225160664, "dur": 1, "args": { "External id": 236265, "cbid": 251, "correlation": 236265 } }, { "ph": "f", "id": 236265, "pid": 76337, "tid": -914061504, "ts": 1716454225160664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225213816, "dur": 161, "args": { "External id": 236266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236266, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236266, "pid": 5, "tid": 7, "ts": 1716454225213816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160670, "dur": 14, "args": { "External id": 236266, "cbid": 211, "correlation": 236266 } }, { "ph": "s", "id": 236266, "pid": 76337, "tid": -914061504, "ts": 1716454225160670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225160740, "dur": 1, "args": { "External id": 236277, "cbid": 251, "correlation": 236277 } }, { "ph": "f", "id": 236277, "pid": 76337, "tid": -914061504, "ts": 1716454225160740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225213979, "dur": 154, "args": { "External id": 236278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236278, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236278, "pid": 5, "tid": 7, "ts": 1716454225213979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160744, "dur": 11, "args": { "External id": 236278, "cbid": 211, "correlation": 236278 } }, { "ph": "s", "id": 236278, "pid": 76337, "tid": -914061504, "ts": 1716454225160744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225160809, "dur": 1, "args": { "External id": 236289, "cbid": 251, "correlation": 236289 } }, { "ph": "f", "id": 236289, "pid": 76337, "tid": -914061504, "ts": 1716454225160809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225214134, "dur": 157, "args": { "External id": 236290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236290, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236290, "pid": 5, "tid": 7, "ts": 1716454225214134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160813, "dur": 11, "args": { "External id": 236290, "cbid": 211, "correlation": 236290 } }, { "ph": "s", "id": 236290, "pid": 76337, "tid": -914061504, "ts": 1716454225160813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225214292, "dur": 332, "args": { "External id": 236315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236315, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236315, "pid": 5, "tid": 7, "ts": 1716454225214292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225160897, "dur": 12, "args": { "External id": 236315, "cbid": 211, "correlation": 236315 } }, { "ph": "s", "id": 236315, "pid": 76337, "tid": -914061504, "ts": 1716454225160897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161004, "dur": 1, "args": { "External id": 236333, "cbid": 251, "correlation": 236333 } }, { "ph": "f", "id": 236333, "pid": 76337, "tid": -914061504, "ts": 1716454225161004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225214626, "dur": 162, "args": { "External id": 236335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236335, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236335, "pid": 5, "tid": 7, "ts": 1716454225214626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161010, "dur": 14, "args": { "External id": 236335, "cbid": 211, "correlation": 236335 } }, { "ph": "s", "id": 236335, "pid": 76337, "tid": -914061504, "ts": 1716454225161010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225214790, "dur": 19, "args": { "External id": 236343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236343, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236343, "pid": 5, "tid": 7, "ts": 1716454225214790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161082, "dur": 12, "args": { "External id": 236343, "cbid": 211, "correlation": 236343 } }, { "ph": "s", "id": 236343, "pid": 76337, "tid": -914061504, "ts": 1716454225161082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225214810, "dur": 28, "args": { "External id": 236351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236351, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236351, "pid": 5, "tid": 7, "ts": 1716454225214810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161121, "dur": 9, "args": { "External id": 236351, "cbid": 211, "correlation": 236351 } }, { "ph": "s", "id": 236351, "pid": 76337, "tid": -914061504, "ts": 1716454225161121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225214839, "dur": 19, "args": { "External id": 236362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236362, "pid": 5, "tid": 7, "ts": 1716454225214839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161192, "dur": 12, "args": { "External id": 236362, "cbid": 211, "correlation": 236362 } }, { "ph": "s", "id": 236362, "pid": 76337, "tid": -914061504, "ts": 1716454225161192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225214860, "dur": 16, "args": { "External id": 236384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236384, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236384, "pid": 5, "tid": 7, "ts": 1716454225214860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161223, "dur": 8, "args": { "External id": 236384, "cbid": 211, "correlation": 236384 } }, { "ph": "s", "id": 236384, "pid": 76337, "tid": -914061504, "ts": 1716454225161223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161308, "dur": 1, "args": { "External id": 236395, "cbid": 251, "correlation": 236395 } }, { "ph": "f", "id": 236395, "pid": 76337, "tid": -914061504, "ts": 1716454225161308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225214877, "dur": 87, "args": { "External id": 236396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236396, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 236396, "pid": 5, "tid": 7, "ts": 1716454225214877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161313, "dur": 13, "args": { "External id": 236396, "cbid": 211, "correlation": 236396 } }, { "ph": "s", "id": 236396, "pid": 76337, "tid": -914061504, "ts": 1716454225161313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161382, "dur": 1, "args": { "External id": 236407, "cbid": 251, "correlation": 236407 } }, { "ph": "f", "id": 236407, "pid": 76337, "tid": -914061504, "ts": 1716454225161382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161386, "dur": 0, "args": { "External id": 236408, "cbid": 251, "correlation": 236408 } }, { "ph": "f", "id": 236408, "pid": 76337, "tid": -914061504, "ts": 1716454225161386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225214965, "dur": 12, "args": { "External id": 236409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236409, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236409, "pid": 5, "tid": 7, "ts": 1716454225214965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161387, "dur": 12, "args": { "External id": 236409, "cbid": 211, "correlation": 236409 } }, { "ph": "s", "id": 236409, "pid": 76337, "tid": -914061504, "ts": 1716454225161387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225214979, "dur": 6, "args": { "External id": 236411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236411, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236411, "pid": 5, "tid": 7, "ts": 1716454225214979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161401, "dur": 6, "args": { "External id": 236411, "cbid": 211, "correlation": 236411 } }, { "ph": "s", "id": 236411, "pid": 76337, "tid": -914061504, "ts": 1716454225161401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161458, "dur": 1, "args": { "External id": 236422, "cbid": 251, "correlation": 236422 } }, { "ph": "f", "id": 236422, "pid": 76337, "tid": -914061504, "ts": 1716454225161458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161462, "dur": 0, "args": { "External id": 236423, "cbid": 251, "correlation": 236423 } }, { "ph": "f", "id": 236423, "pid": 76337, "tid": -914061504, "ts": 1716454225161462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225214986, "dur": 8, "args": { "External id": 236424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236424, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236424, "pid": 5, "tid": 7, "ts": 1716454225214986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161463, "dur": 12, "args": { "External id": 236424, "cbid": 211, "correlation": 236424 } }, { "ph": "s", "id": 236424, "pid": 76337, "tid": -914061504, "ts": 1716454225161463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225214995, "dur": 3, "args": { "External id": 236426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236426, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236426, "pid": 5, "tid": 7, "ts": 1716454225214995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161477, "dur": 5, "args": { "External id": 236426, "cbid": 211, "correlation": 236426 } }, { "ph": "s", "id": 236426, "pid": 76337, "tid": -914061504, "ts": 1716454225161477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225215000, "dur": 54, "args": { "External id": 236451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236451, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236451, "pid": 5, "tid": 7, "ts": 1716454225215000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161553, "dur": 12, "args": { "External id": 236451, "cbid": 211, "correlation": 236451 } }, { "ph": "s", "id": 236451, "pid": 76337, "tid": -914061504, "ts": 1716454225161553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161651, "dur": 1, "args": { "External id": 236469, "cbid": 251, "correlation": 236469 } }, { "ph": "f", "id": 236469, "pid": 76337, "tid": -914061504, "ts": 1716454225161651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225215055, "dur": 89, "args": { "External id": 236471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236471, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 236471, "pid": 5, "tid": 7, "ts": 1716454225215055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161657, "dur": 14, "args": { "External id": 236471, "cbid": 211, "correlation": 236471 } }, { "ph": "s", "id": 236471, "pid": 76337, "tid": -914061504, "ts": 1716454225161657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225215145, "dur": 9, "args": { "External id": 236479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236479, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236479, "pid": 5, "tid": 7, "ts": 1716454225215145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161728, "dur": 12, "args": { "External id": 236479, "cbid": 211, "correlation": 236479 } }, { "ph": "s", "id": 236479, "pid": 76337, "tid": -914061504, "ts": 1716454225161728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225215156, "dur": 21, "args": { "External id": 236487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236487, "pid": 5, "tid": 7, "ts": 1716454225215156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161768, "dur": 10, "args": { "External id": 236487, "cbid": 211, "correlation": 236487 } }, { "ph": "s", "id": 236487, "pid": 76337, "tid": -914061504, "ts": 1716454225161768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225215178, "dur": 17, "args": { "External id": 236509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236509, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236509, "pid": 5, "tid": 7, "ts": 1716454225215178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161818, "dur": 10, "args": { "External id": 236509, "cbid": 211, "correlation": 236509 } }, { "ph": "s", "id": 236509, "pid": 76337, "tid": -914061504, "ts": 1716454225161818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161905, "dur": 1, "args": { "External id": 236525, "cbid": 251, "correlation": 236525 } }, { "ph": "f", "id": 236525, "pid": 76337, "tid": -914061504, "ts": 1716454225161905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225161910, "dur": 0, "args": { "External id": 236527, "cbid": 251, "correlation": 236527 } }, { "ph": "f", "id": 236527, "pid": 76337, "tid": -914061504, "ts": 1716454225161910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225215196, "dur": 487, "args": { "External id": 236528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236528, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236528, "pid": 5, "tid": 7, "ts": 1716454225215196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161911, "dur": 14, "args": { "External id": 236528, "cbid": 211, "correlation": 236528 } }, { "ph": "s", "id": 236528, "pid": 76337, "tid": -914061504, "ts": 1716454225161911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225215685, "dur": 64, "args": { "External id": 236536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236536, "pid": 5, "tid": 7, "ts": 1716454225215685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225161985, "dur": 13, "args": { "External id": 236536, "cbid": 211, "correlation": 236536 } }, { "ph": "s", "id": 236536, "pid": 76337, "tid": -914061504, "ts": 1716454225161985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225215750, "dur": 67, "args": { "External id": 236544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236544, "pid": 5, "tid": 7, "ts": 1716454225215750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162016, "dur": 8, "args": { "External id": 236544, "cbid": 211, "correlation": 236544 } }, { "ph": "s", "id": 236544, "pid": 76337, "tid": -914061504, "ts": 1716454225162016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225162096, "dur": 1, "args": { "External id": 236560, "cbid": 251, "correlation": 236560 } }, { "ph": "f", "id": 236560, "pid": 76337, "tid": -914061504, "ts": 1716454225162096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225215820, "dur": 1, "args": { "External id": 236562, "device": 5, "context": 1, "stream": 7, "correlation": 236562, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 236562, "pid": 5, "tid": 7, "ts": 1716454225215820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225162101, "dur": 9, "args": { "External id": 236562, "cbid": 51, "correlation": 236562 } }, { "ph": "s", "id": 236562, "pid": 76337, "tid": -914061504, "ts": 1716454225162101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225215823, "dur": 268, "args": { "External id": 236563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236563, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 236563, "pid": 5, "tid": 7, "ts": 1716454225215823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162111, "dur": 11, "args": { "External id": 236563, "cbid": 211, "correlation": 236563 } }, { "ph": "s", "id": 236563, "pid": 76337, "tid": -914061504, "ts": 1716454225162111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225216093, "dur": 14, "args": { "External id": 236571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236571, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236571, "pid": 5, "tid": 7, "ts": 1716454225216093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162153, "dur": 10, "args": { "External id": 236571, "cbid": 211, "correlation": 236571 } }, { "ph": "s", "id": 236571, "pid": 76337, "tid": -914061504, "ts": 1716454225162153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225216108, "dur": 36, "args": { "External id": 236582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236582, "pid": 5, "tid": 7, "ts": 1716454225216108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162220, "dur": 13, "args": { "External id": 236582, "cbid": 211, "correlation": 236582 } }, { "ph": "s", "id": 236582, "pid": 76337, "tid": -914061504, "ts": 1716454225162220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225162286, "dur": 0, "args": { "External id": 236594, "cbid": 317, "correlation": 236594 } }, { "ph": "f", "id": 236594, "pid": 76337, "tid": -914061504, "ts": 1716454225162286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225162287, "dur": 0, "args": { "External id": 236595, "cbid": 203, "correlation": 236595 } }, { "ph": "f", "id": 236595, "pid": 76337, "tid": -914061504, "ts": 1716454225162287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225162287, "dur": 0, "args": { "External id": 236596, "cbid": 205, "correlation": 236596 } }, { "ph": "f", "id": 236596, "pid": 76337, "tid": -914061504, "ts": 1716454225162287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216146, "dur": 13, "args": { "External id": 236600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236600, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236600, "pid": 5, "tid": 7, "ts": 1716454225216146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162302, "dur": 12, "args": { "External id": 236600, "cbid": 211, "correlation": 236600 } }, { "ph": "s", "id": 236600, "pid": 76337, "tid": -914061504, "ts": 1716454225162302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225216160, "dur": 4, "args": { "External id": 236602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 236602, "pid": 5, "tid": 7, "ts": 1716454225216160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162318, "dur": 6, "args": { "External id": 236602, "cbid": 211, "correlation": 236602 } }, { "ph": "s", "id": 236602, "pid": 76337, "tid": -914061504, "ts": 1716454225162318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225162328, "dur": 0, "args": { "External id": 236603, "cbid": 51, "correlation": 236603 } }, { "ph": "s", "id": 236603, "pid": 76337, "tid": -914061504, "ts": 1716454225162328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225216165, "dur": 95, "args": { "External id": 236604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236604, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 236604, "pid": 5, "tid": 7, "ts": 1716454225216165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162328, "dur": 5, "args": { "External id": 236604, "cbid": 211, "correlation": 236604 } }, { "ph": "s", "id": 236604, "pid": 76337, "tid": -914061504, "ts": 1716454225162328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225216261, "dur": 16, "args": { "External id": 236609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236609, "pid": 5, "tid": 7, "ts": 1716454225216261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162355, "dur": 9, "args": { "External id": 236609, "cbid": 211, "correlation": 236609 } }, { "ph": "s", "id": 236609, "pid": 76337, "tid": -914061504, "ts": 1716454225162355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225216279, "dur": 12, "args": { "External id": 236617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236617, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236617, "pid": 5, "tid": 7, "ts": 1716454225216279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162387, "dur": 8, "args": { "External id": 236617, "cbid": 211, "correlation": 236617 } }, { "ph": "s", "id": 236617, "pid": 76337, "tid": -914061504, "ts": 1716454225162387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225162455, "dur": 0, "args": { "External id": 236627, "cbid": 317, "correlation": 236627 } }, { "ph": "f", "id": 236627, "pid": 76337, "tid": -914061504, "ts": 1716454225162455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225162456, "dur": 0, "args": { "External id": 236628, "cbid": 203, "correlation": 236628 } }, { "ph": "f", "id": 236628, "pid": 76337, "tid": -914061504, "ts": 1716454225162456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225162456, "dur": 0, "args": { "External id": 236629, "cbid": 205, "correlation": 236629 } }, { "ph": "f", "id": 236629, "pid": 76337, "tid": -914061504, "ts": 1716454225162456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216292, "dur": 11, "args": { "External id": 236633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236633, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236633, "pid": 5, "tid": 7, "ts": 1716454225216292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162472, "dur": 12, "args": { "External id": 236633, "cbid": 211, "correlation": 236633 } }, { "ph": "s", "id": 236633, "pid": 76337, "tid": -914061504, "ts": 1716454225162472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216305, "dur": 161, "args": { "External id": 236635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236635, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236635, "pid": 5, "tid": 7, "ts": 1716454225216305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162486, "dur": 5, "args": { "External id": 236635, "cbid": 211, "correlation": 236635 } }, { "ph": "s", "id": 236635, "pid": 76337, "tid": -914061504, "ts": 1716454225162486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225216468, "dur": 1, "args": { "External id": 236637, "device": 5, "context": 1, "stream": 7, "correlation": 236637, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 236637, "pid": 5, "tid": 7, "ts": 1716454225216468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225162497, "dur": 6, "args": { "External id": 236637, "cbid": 51, "correlation": 236637 } }, { "ph": "s", "id": 236637, "pid": 76337, "tid": -914061504, "ts": 1716454225162497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225216471, "dur": 196, "args": { "External id": 236638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236638, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 236638, "pid": 5, "tid": 7, "ts": 1716454225216471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162505, "dur": 8, "args": { "External id": 236638, "cbid": 211, "correlation": 236638 } }, { "ph": "s", "id": 236638, "pid": 76337, "tid": -914061504, "ts": 1716454225162505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216669, "dur": 6, "args": { "External id": 236640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236640, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236640, "pid": 5, "tid": 7, "ts": 1716454225216669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162517, "dur": 6, "args": { "External id": 236640, "cbid": 211, "correlation": 236640 } }, { "ph": "s", "id": 236640, "pid": 76337, "tid": -914061504, "ts": 1716454225162517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225216676, "dur": 6, "args": { "External id": 236646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236646, "pid": 5, "tid": 7, "ts": 1716454225216676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162546, "dur": 8, "args": { "External id": 236646, "cbid": 211, "correlation": 236646 } }, { "ph": "s", "id": 236646, "pid": 76337, "tid": -914061504, "ts": 1716454225162546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225216683, "dur": 10, "args": { "External id": 236666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236666, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236666, "pid": 5, "tid": 7, "ts": 1716454225216683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162638, "dur": 11, "args": { "External id": 236666, "cbid": 211, "correlation": 236666 } }, { "ph": "s", "id": 236666, "pid": 76337, "tid": -914061504, "ts": 1716454225162638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225216695, "dur": 4, "args": { "External id": 236678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236678, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236678, "pid": 5, "tid": 7, "ts": 1716454225216695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162660, "dur": 7, "args": { "External id": 236678, "cbid": 211, "correlation": 236678 } }, { "ph": "s", "id": 236678, "pid": 76337, "tid": -914061504, "ts": 1716454225162660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225216700, "dur": 8, "args": { "External id": 236681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236681, "pid": 5, "tid": 7, "ts": 1716454225216700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162680, "dur": 6, "args": { "External id": 236681, "cbid": 211, "correlation": 236681 } }, { "ph": "s", "id": 236681, "pid": 76337, "tid": -914061504, "ts": 1716454225162680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225216710, "dur": 5, "args": { "External id": 236690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236690, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236690, "pid": 5, "tid": 7, "ts": 1716454225216710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162720, "dur": 10, "args": { "External id": 236690, "cbid": 211, "correlation": 236690 } }, { "ph": "s", "id": 236690, "pid": 76337, "tid": -914061504, "ts": 1716454225162720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225162773, "dur": 0, "args": { "External id": 236700, "cbid": 317, "correlation": 236700 } }, { "ph": "f", "id": 236700, "pid": 76337, "tid": -914061504, "ts": 1716454225162773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225162774, "dur": 0, "args": { "External id": 236701, "cbid": 203, "correlation": 236701 } }, { "ph": "f", "id": 236701, "pid": 76337, "tid": -914061504, "ts": 1716454225162774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225162775, "dur": 0, "args": { "External id": 236702, "cbid": 205, "correlation": 236702 } }, { "ph": "f", "id": 236702, "pid": 76337, "tid": -914061504, "ts": 1716454225162775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216716, "dur": 5, "args": { "External id": 236706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236706, "pid": 5, "tid": 7, "ts": 1716454225216716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162790, "dur": 11, "args": { "External id": 236706, "cbid": 211, "correlation": 236706 } }, { "ph": "s", "id": 236706, "pid": 76337, "tid": -914061504, "ts": 1716454225162790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225216722, "dur": 159, "args": { "External id": 236708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236708, "pid": 5, "tid": 7, "ts": 1716454225216722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162803, "dur": 5, "args": { "External id": 236708, "cbid": 211, "correlation": 236708 } }, { "ph": "s", "id": 236708, "pid": 76337, "tid": -914061504, "ts": 1716454225162803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225216884, "dur": 1, "args": { "External id": 236710, "device": 5, "context": 1, "stream": 7, "correlation": 236710, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 236710, "pid": 5, "tid": 7, "ts": 1716454225216884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225162814, "dur": 7, "args": { "External id": 236710, "cbid": 51, "correlation": 236710 } }, { "ph": "s", "id": 236710, "pid": 76337, "tid": -914061504, "ts": 1716454225162814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225216887, "dur": 266, "args": { "External id": 236711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236711, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236711, "pid": 5, "tid": 7, "ts": 1716454225216887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162822, "dur": 6, "args": { "External id": 236711, "cbid": 211, "correlation": 236711 } }, { "ph": "s", "id": 236711, "pid": 76337, "tid": -914061504, "ts": 1716454225162822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217155, "dur": 6, "args": { "External id": 236713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236713, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236713, "pid": 5, "tid": 7, "ts": 1716454225217155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162833, "dur": 5, "args": { "External id": 236713, "cbid": 211, "correlation": 236713 } }, { "ph": "s", "id": 236713, "pid": 76337, "tid": -914061504, "ts": 1716454225162833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225217161, "dur": 6, "args": { "External id": 236719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236719, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236719, "pid": 5, "tid": 7, "ts": 1716454225217161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162861, "dur": 9, "args": { "External id": 236719, "cbid": 211, "correlation": 236719 } }, { "ph": "s", "id": 236719, "pid": 76337, "tid": -914061504, "ts": 1716454225162861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225217169, "dur": 3, "args": { "External id": 236727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236727, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 236727, "pid": 5, "tid": 7, "ts": 1716454225217169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162905, "dur": 9, "args": { "External id": 236727, "cbid": 211, "correlation": 236727 } }, { "ph": "s", "id": 236727, "pid": 76337, "tid": -914061504, "ts": 1716454225162905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225162970, "dur": 1, "args": { "External id": 236743, "cbid": 251, "correlation": 236743 } }, { "ph": "f", "id": 236743, "pid": 76337, "tid": -914061504, "ts": 1716454225162970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225162983, "dur": 0, "args": { "External id": 236745, "cbid": 251, "correlation": 236745 } }, { "ph": "f", "id": 236745, "pid": 76337, "tid": -914061504, "ts": 1716454225162983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225217173, "dur": 13, "args": { "External id": 236746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236746, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236746, "pid": 5, "tid": 7, "ts": 1716454225217173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162985, "dur": 11, "args": { "External id": 236746, "cbid": 211, "correlation": 236746 } }, { "ph": "s", "id": 236746, "pid": 76337, "tid": -914061504, "ts": 1716454225162985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225217188, "dur": 5, "args": { "External id": 236748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236748, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236748, "pid": 5, "tid": 7, "ts": 1716454225217188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225162999, "dur": 6, "args": { "External id": 236748, "cbid": 211, "correlation": 236748 } }, { "ph": "s", "id": 236748, "pid": 76337, "tid": -914061504, "ts": 1716454225162999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225217194, "dur": 6, "args": { "External id": 236758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236758, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236758, "pid": 5, "tid": 7, "ts": 1716454225217194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163057, "dur": 13, "args": { "External id": 236758, "cbid": 211, "correlation": 236758 } }, { "ph": "s", "id": 236758, "pid": 76337, "tid": -914061504, "ts": 1716454225163057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225217201, "dur": 10, "args": { "External id": 236778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236778, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236778, "pid": 5, "tid": 7, "ts": 1716454225217201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163125, "dur": 10, "args": { "External id": 236778, "cbid": 211, "correlation": 236778 } }, { "ph": "s", "id": 236778, "pid": 76337, "tid": -914061504, "ts": 1716454225163125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225217212, "dur": 4, "args": { "External id": 236790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236790, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236790, "pid": 5, "tid": 7, "ts": 1716454225217212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163145, "dur": 6, "args": { "External id": 236790, "cbid": 211, "correlation": 236790 } }, { "ph": "s", "id": 236790, "pid": 76337, "tid": -914061504, "ts": 1716454225163145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225217217, "dur": 7, "args": { "External id": 236793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236793, "pid": 5, "tid": 7, "ts": 1716454225217217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163164, "dur": 7, "args": { "External id": 236793, "cbid": 211, "correlation": 236793 } }, { "ph": "s", "id": 236793, "pid": 76337, "tid": -914061504, "ts": 1716454225163164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225217225, "dur": 4, "args": { "External id": 236802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236802, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236802, "pid": 5, "tid": 7, "ts": 1716454225217225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163203, "dur": 10, "args": { "External id": 236802, "cbid": 211, "correlation": 236802 } }, { "ph": "s", "id": 236802, "pid": 76337, "tid": -914061504, "ts": 1716454225163203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225163267, "dur": 0, "args": { "External id": 236812, "cbid": 317, "correlation": 236812 } }, { "ph": "f", "id": 236812, "pid": 76337, "tid": -914061504, "ts": 1716454225163267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225163268, "dur": 0, "args": { "External id": 236813, "cbid": 203, "correlation": 236813 } }, { "ph": "f", "id": 236813, "pid": 76337, "tid": -914061504, "ts": 1716454225163268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225163268, "dur": 0, "args": { "External id": 236814, "cbid": 205, "correlation": 236814 } }, { "ph": "f", "id": 236814, "pid": 76337, "tid": -914061504, "ts": 1716454225163268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217231, "dur": 5, "args": { "External id": 236818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236818, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236818, "pid": 5, "tid": 7, "ts": 1716454225217231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163282, "dur": 12, "args": { "External id": 236818, "cbid": 211, "correlation": 236818 } }, { "ph": "s", "id": 236818, "pid": 76337, "tid": -914061504, "ts": 1716454225163282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217237, "dur": 159, "args": { "External id": 236820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236820, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236820, "pid": 5, "tid": 7, "ts": 1716454225217237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163296, "dur": 5, "args": { "External id": 236820, "cbid": 211, "correlation": 236820 } }, { "ph": "s", "id": 236820, "pid": 76337, "tid": -914061504, "ts": 1716454225163296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225217398, "dur": 1, "args": { "External id": 236822, "device": 5, "context": 1, "stream": 7, "correlation": 236822, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 236822, "pid": 5, "tid": 7, "ts": 1716454225217398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225163307, "dur": 7, "args": { "External id": 236822, "cbid": 51, "correlation": 236822 } }, { "ph": "s", "id": 236822, "pid": 76337, "tid": -914061504, "ts": 1716454225163307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225217402, "dur": 254, "args": { "External id": 236823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236823, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236823, "pid": 5, "tid": 7, "ts": 1716454225217402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163315, "dur": 6, "args": { "External id": 236823, "cbid": 211, "correlation": 236823 } }, { "ph": "s", "id": 236823, "pid": 76337, "tid": -914061504, "ts": 1716454225163315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217658, "dur": 6, "args": { "External id": 236825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236825, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236825, "pid": 5, "tid": 7, "ts": 1716454225217658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163325, "dur": 5, "args": { "External id": 236825, "cbid": 211, "correlation": 236825 } }, { "ph": "s", "id": 236825, "pid": 76337, "tid": -914061504, "ts": 1716454225163325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225217664, "dur": 6, "args": { "External id": 236831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236831, "pid": 5, "tid": 7, "ts": 1716454225217664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163353, "dur": 9, "args": { "External id": 236831, "cbid": 211, "correlation": 236831 } }, { "ph": "s", "id": 236831, "pid": 76337, "tid": -914061504, "ts": 1716454225163353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225217672, "dur": 5, "args": { "External id": 236839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236839, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236839, "pid": 5, "tid": 7, "ts": 1716454225217672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163385, "dur": 8, "args": { "External id": 236839, "cbid": 211, "correlation": 236839 } }, { "ph": "s", "id": 236839, "pid": 76337, "tid": -914061504, "ts": 1716454225163385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225217678, "dur": 4, "args": { "External id": 236847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236847, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236847, "pid": 5, "tid": 7, "ts": 1716454225217678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163416, "dur": 8, "args": { "External id": 236847, "cbid": 211, "correlation": 236847 } }, { "ph": "s", "id": 236847, "pid": 76337, "tid": -914061504, "ts": 1716454225163416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225217684, "dur": 9, "args": { "External id": 236867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236867, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236867, "pid": 5, "tid": 7, "ts": 1716454225217684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163488, "dur": 11, "args": { "External id": 236867, "cbid": 211, "correlation": 236867 } }, { "ph": "s", "id": 236867, "pid": 76337, "tid": -914061504, "ts": 1716454225163488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225217695, "dur": 4, "args": { "External id": 236879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236879, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236879, "pid": 5, "tid": 7, "ts": 1716454225217695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163509, "dur": 7, "args": { "External id": 236879, "cbid": 211, "correlation": 236879 } }, { "ph": "s", "id": 236879, "pid": 76337, "tid": -914061504, "ts": 1716454225163509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225217700, "dur": 6, "args": { "External id": 236882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236882, "pid": 5, "tid": 7, "ts": 1716454225217700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163527, "dur": 7, "args": { "External id": 236882, "cbid": 211, "correlation": 236882 } }, { "ph": "s", "id": 236882, "pid": 76337, "tid": -914061504, "ts": 1716454225163527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225217708, "dur": 4, "args": { "External id": 236891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236891, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236891, "pid": 5, "tid": 7, "ts": 1716454225217708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163566, "dur": 9, "args": { "External id": 236891, "cbid": 211, "correlation": 236891 } }, { "ph": "s", "id": 236891, "pid": 76337, "tid": -914061504, "ts": 1716454225163566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225163616, "dur": 0, "args": { "External id": 236901, "cbid": 317, "correlation": 236901 } }, { "ph": "f", "id": 236901, "pid": 76337, "tid": -914061504, "ts": 1716454225163616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225163617, "dur": 0, "args": { "External id": 236902, "cbid": 203, "correlation": 236902 } }, { "ph": "f", "id": 236902, "pid": 76337, "tid": -914061504, "ts": 1716454225163617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225163618, "dur": 0, "args": { "External id": 236903, "cbid": 205, "correlation": 236903 } }, { "ph": "f", "id": 236903, "pid": 76337, "tid": -914061504, "ts": 1716454225163618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217713, "dur": 5, "args": { "External id": 236907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236907, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236907, "pid": 5, "tid": 7, "ts": 1716454225217713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163631, "dur": 12, "args": { "External id": 236907, "cbid": 211, "correlation": 236907 } }, { "ph": "s", "id": 236907, "pid": 76337, "tid": -914061504, "ts": 1716454225163631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225217720, "dur": 159, "args": { "External id": 236909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236909, "pid": 5, "tid": 7, "ts": 1716454225217720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163645, "dur": 5, "args": { "External id": 236909, "cbid": 211, "correlation": 236909 } }, { "ph": "s", "id": 236909, "pid": 76337, "tid": -914061504, "ts": 1716454225163645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225217881, "dur": 1, "args": { "External id": 236911, "device": 5, "context": 1, "stream": 7, "correlation": 236911, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 236911, "pid": 5, "tid": 7, "ts": 1716454225217881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225163655, "dur": 7, "args": { "External id": 236911, "cbid": 51, "correlation": 236911 } }, { "ph": "s", "id": 236911, "pid": 76337, "tid": -914061504, "ts": 1716454225163655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225217884, "dur": 254, "args": { "External id": 236912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236912, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236912, "pid": 5, "tid": 7, "ts": 1716454225217884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163663, "dur": 6, "args": { "External id": 236912, "cbid": 211, "correlation": 236912 } }, { "ph": "s", "id": 236912, "pid": 76337, "tid": -914061504, "ts": 1716454225163663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218140, "dur": 5, "args": { "External id": 236914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 236914, "pid": 5, "tid": 7, "ts": 1716454225218140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163672, "dur": 5, "args": { "External id": 236914, "cbid": 211, "correlation": 236914 } }, { "ph": "s", "id": 236914, "pid": 76337, "tid": -914061504, "ts": 1716454225163672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225218146, "dur": 6, "args": { "External id": 236920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236920, "pid": 5, "tid": 7, "ts": 1716454225218146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163700, "dur": 8, "args": { "External id": 236920, "cbid": 211, "correlation": 236920 } }, { "ph": "s", "id": 236920, "pid": 76337, "tid": -914061504, "ts": 1716454225163700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225218154, "dur": 3, "args": { "External id": 236928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236928, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 236928, "pid": 5, "tid": 7, "ts": 1716454225218154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163744, "dur": 9, "args": { "External id": 236928, "cbid": 211, "correlation": 236928 } }, { "ph": "s", "id": 236928, "pid": 76337, "tid": -914061504, "ts": 1716454225163744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225163806, "dur": 1, "args": { "External id": 236944, "cbid": 251, "correlation": 236944 } }, { "ph": "f", "id": 236944, "pid": 76337, "tid": -914061504, "ts": 1716454225163806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225163812, "dur": 0, "args": { "External id": 236946, "cbid": 251, "correlation": 236946 } }, { "ph": "f", "id": 236946, "pid": 76337, "tid": -914061504, "ts": 1716454225163812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225218158, "dur": 10, "args": { "External id": 236947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236947, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236947, "pid": 5, "tid": 7, "ts": 1716454225218158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163814, "dur": 11, "args": { "External id": 236947, "cbid": 211, "correlation": 236947 } }, { "ph": "s", "id": 236947, "pid": 76337, "tid": -914061504, "ts": 1716454225163814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225218169, "dur": 4, "args": { "External id": 236949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236949, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 236949, "pid": 5, "tid": 7, "ts": 1716454225218169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163827, "dur": 5, "args": { "External id": 236949, "cbid": 211, "correlation": 236949 } }, { "ph": "s", "id": 236949, "pid": 76337, "tid": -914061504, "ts": 1716454225163827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225218174, "dur": 6, "args": { "External id": 236959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236959, "pid": 5, "tid": 7, "ts": 1716454225218174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163882, "dur": 12, "args": { "External id": 236959, "cbid": 211, "correlation": 236959 } }, { "ph": "s", "id": 236959, "pid": 76337, "tid": -914061504, "ts": 1716454225163882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225218181, "dur": 10, "args": { "External id": 236979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236979, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 236979, "pid": 5, "tid": 7, "ts": 1716454225218181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163947, "dur": 10, "args": { "External id": 236979, "cbid": 211, "correlation": 236979 } }, { "ph": "s", "id": 236979, "pid": 76337, "tid": -914061504, "ts": 1716454225163947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225218192, "dur": 4, "args": { "External id": 236991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236991, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 236991, "pid": 5, "tid": 7, "ts": 1716454225218192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163968, "dur": 14, "args": { "External id": 236991, "cbid": 211, "correlation": 236991 } }, { "ph": "s", "id": 236991, "pid": 76337, "tid": -914061504, "ts": 1716454225163968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225218197, "dur": 7, "args": { "External id": 236994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 236994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 236994, "pid": 5, "tid": 7, "ts": 1716454225218197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225163996, "dur": 7, "args": { "External id": 236994, "cbid": 211, "correlation": 236994 } }, { "ph": "s", "id": 236994, "pid": 76337, "tid": -914061504, "ts": 1716454225163996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225218205, "dur": 4, "args": { "External id": 237003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237003, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237003, "pid": 5, "tid": 7, "ts": 1716454225218205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164038, "dur": 9, "args": { "External id": 237003, "cbid": 211, "correlation": 237003 } }, { "ph": "s", "id": 237003, "pid": 76337, "tid": -914061504, "ts": 1716454225164038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225164099, "dur": 0, "args": { "External id": 237013, "cbid": 317, "correlation": 237013 } }, { "ph": "f", "id": 237013, "pid": 76337, "tid": -914061504, "ts": 1716454225164099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225164100, "dur": 0, "args": { "External id": 237014, "cbid": 203, "correlation": 237014 } }, { "ph": "f", "id": 237014, "pid": 76337, "tid": -914061504, "ts": 1716454225164100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225164101, "dur": 0, "args": { "External id": 237015, "cbid": 205, "correlation": 237015 } }, { "ph": "f", "id": 237015, "pid": 76337, "tid": -914061504, "ts": 1716454225164101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218211, "dur": 5, "args": { "External id": 237019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237019, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237019, "pid": 5, "tid": 7, "ts": 1716454225218211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164116, "dur": 12, "args": { "External id": 237019, "cbid": 211, "correlation": 237019 } }, { "ph": "s", "id": 237019, "pid": 76337, "tid": -914061504, "ts": 1716454225164116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218217, "dur": 159, "args": { "External id": 237021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237021, "pid": 5, "tid": 7, "ts": 1716454225218217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164130, "dur": 5, "args": { "External id": 237021, "cbid": 211, "correlation": 237021 } }, { "ph": "s", "id": 237021, "pid": 76337, "tid": -914061504, "ts": 1716454225164130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225218378, "dur": 1, "args": { "External id": 237023, "device": 5, "context": 1, "stream": 7, "correlation": 237023, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 237023, "pid": 5, "tid": 7, "ts": 1716454225218378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225164140, "dur": 7, "args": { "External id": 237023, "cbid": 51, "correlation": 237023 } }, { "ph": "s", "id": 237023, "pid": 76337, "tid": -914061504, "ts": 1716454225164140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225218382, "dur": 254, "args": { "External id": 237024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237024, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237024, "pid": 5, "tid": 7, "ts": 1716454225218382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164148, "dur": 6, "args": { "External id": 237024, "cbid": 211, "correlation": 237024 } }, { "ph": "s", "id": 237024, "pid": 76337, "tid": -914061504, "ts": 1716454225164148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218637, "dur": 5, "args": { "External id": 237026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237026, "pid": 5, "tid": 7, "ts": 1716454225218637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164158, "dur": 6, "args": { "External id": 237026, "cbid": 211, "correlation": 237026 } }, { "ph": "s", "id": 237026, "pid": 76337, "tid": -914061504, "ts": 1716454225164158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225218644, "dur": 6, "args": { "External id": 237032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237032, "pid": 5, "tid": 7, "ts": 1716454225218644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164187, "dur": 8, "args": { "External id": 237032, "cbid": 211, "correlation": 237032 } }, { "ph": "s", "id": 237032, "pid": 76337, "tid": -914061504, "ts": 1716454225164187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225218651, "dur": 5, "args": { "External id": 237040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237040, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237040, "pid": 5, "tid": 7, "ts": 1716454225218651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164220, "dur": 8, "args": { "External id": 237040, "cbid": 211, "correlation": 237040 } }, { "ph": "s", "id": 237040, "pid": 76337, "tid": -914061504, "ts": 1716454225164220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225218657, "dur": 4, "args": { "External id": 237048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237048, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237048, "pid": 5, "tid": 7, "ts": 1716454225218657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164250, "dur": 8, "args": { "External id": 237048, "cbid": 211, "correlation": 237048 } }, { "ph": "s", "id": 237048, "pid": 76337, "tid": -914061504, "ts": 1716454225164250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225218663, "dur": 9, "args": { "External id": 237068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237068, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237068, "pid": 5, "tid": 7, "ts": 1716454225218663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164353, "dur": 13, "args": { "External id": 237068, "cbid": 211, "correlation": 237068 } }, { "ph": "s", "id": 237068, "pid": 76337, "tid": -914061504, "ts": 1716454225164353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225218673, "dur": 4, "args": { "External id": 237080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237080, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 237080, "pid": 5, "tid": 7, "ts": 1716454225218673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164376, "dur": 6, "args": { "External id": 237080, "cbid": 211, "correlation": 237080 } }, { "ph": "s", "id": 237080, "pid": 76337, "tid": -914061504, "ts": 1716454225164376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225218678, "dur": 6, "args": { "External id": 237083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237083, "pid": 5, "tid": 7, "ts": 1716454225218678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164394, "dur": 6, "args": { "External id": 237083, "cbid": 211, "correlation": 237083 } }, { "ph": "s", "id": 237083, "pid": 76337, "tid": -914061504, "ts": 1716454225164394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225218686, "dur": 4, "args": { "External id": 237092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237092, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237092, "pid": 5, "tid": 7, "ts": 1716454225218686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164432, "dur": 9, "args": { "External id": 237092, "cbid": 211, "correlation": 237092 } }, { "ph": "s", "id": 237092, "pid": 76337, "tid": -914061504, "ts": 1716454225164432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225164484, "dur": 0, "args": { "External id": 237102, "cbid": 317, "correlation": 237102 } }, { "ph": "f", "id": 237102, "pid": 76337, "tid": -914061504, "ts": 1716454225164484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225164484, "dur": 0, "args": { "External id": 237103, "cbid": 203, "correlation": 237103 } }, { "ph": "f", "id": 237103, "pid": 76337, "tid": -914061504, "ts": 1716454225164484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225164485, "dur": 0, "args": { "External id": 237104, "cbid": 205, "correlation": 237104 } }, { "ph": "f", "id": 237104, "pid": 76337, "tid": -914061504, "ts": 1716454225164485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218692, "dur": 5, "args": { "External id": 237108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237108, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237108, "pid": 5, "tid": 7, "ts": 1716454225218692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164498, "dur": 12, "args": { "External id": 237108, "cbid": 211, "correlation": 237108 } }, { "ph": "s", "id": 237108, "pid": 76337, "tid": -914061504, "ts": 1716454225164498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225218698, "dur": 159, "args": { "External id": 237110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237110, "pid": 5, "tid": 7, "ts": 1716454225218698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164513, "dur": 5, "args": { "External id": 237110, "cbid": 211, "correlation": 237110 } }, { "ph": "s", "id": 237110, "pid": 76337, "tid": -914061504, "ts": 1716454225164513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225218859, "dur": 1, "args": { "External id": 237112, "device": 5, "context": 1, "stream": 7, "correlation": 237112, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 237112, "pid": 5, "tid": 7, "ts": 1716454225218859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225164523, "dur": 6, "args": { "External id": 237112, "cbid": 51, "correlation": 237112 } }, { "ph": "s", "id": 237112, "pid": 76337, "tid": -914061504, "ts": 1716454225164523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225218862, "dur": 254, "args": { "External id": 237113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237113, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237113, "pid": 5, "tid": 7, "ts": 1716454225218862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164530, "dur": 6, "args": { "External id": 237113, "cbid": 211, "correlation": 237113 } }, { "ph": "s", "id": 237113, "pid": 76337, "tid": -914061504, "ts": 1716454225164530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219118, "dur": 6, "args": { "External id": 237115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237115, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237115, "pid": 5, "tid": 7, "ts": 1716454225219118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164540, "dur": 5, "args": { "External id": 237115, "cbid": 211, "correlation": 237115 } }, { "ph": "s", "id": 237115, "pid": 76337, "tid": -914061504, "ts": 1716454225164540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219125, "dur": 6, "args": { "External id": 237121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237121, "pid": 5, "tid": 7, "ts": 1716454225219125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164568, "dur": 8, "args": { "External id": 237121, "cbid": 211, "correlation": 237121 } }, { "ph": "s", "id": 237121, "pid": 76337, "tid": -914061504, "ts": 1716454225164568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225219132, "dur": 3, "args": { "External id": 237129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237129, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 237129, "pid": 5, "tid": 7, "ts": 1716454225219132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164612, "dur": 9, "args": { "External id": 237129, "cbid": 211, "correlation": 237129 } }, { "ph": "s", "id": 237129, "pid": 76337, "tid": -914061504, "ts": 1716454225164612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225164674, "dur": 1, "args": { "External id": 237145, "cbid": 251, "correlation": 237145 } }, { "ph": "f", "id": 237145, "pid": 76337, "tid": -914061504, "ts": 1716454225164674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225164679, "dur": 0, "args": { "External id": 237147, "cbid": 251, "correlation": 237147 } }, { "ph": "f", "id": 237147, "pid": 76337, "tid": -914061504, "ts": 1716454225164679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225219137, "dur": 10, "args": { "External id": 237148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237148, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237148, "pid": 5, "tid": 7, "ts": 1716454225219137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164681, "dur": 11, "args": { "External id": 237148, "cbid": 211, "correlation": 237148 } }, { "ph": "s", "id": 237148, "pid": 76337, "tid": -914061504, "ts": 1716454225164681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225219148, "dur": 3, "args": { "External id": 237150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237150, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237150, "pid": 5, "tid": 7, "ts": 1716454225219148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164694, "dur": 5, "args": { "External id": 237150, "cbid": 211, "correlation": 237150 } }, { "ph": "s", "id": 237150, "pid": 76337, "tid": -914061504, "ts": 1716454225164694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219153, "dur": 5, "args": { "External id": 237160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237160, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237160, "pid": 5, "tid": 7, "ts": 1716454225219153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164750, "dur": 13, "args": { "External id": 237160, "cbid": 211, "correlation": 237160 } }, { "ph": "s", "id": 237160, "pid": 76337, "tid": -914061504, "ts": 1716454225164750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225219160, "dur": 9, "args": { "External id": 237180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237180, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237180, "pid": 5, "tid": 7, "ts": 1716454225219160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164816, "dur": 11, "args": { "External id": 237180, "cbid": 211, "correlation": 237180 } }, { "ph": "s", "id": 237180, "pid": 76337, "tid": -914061504, "ts": 1716454225164816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225219170, "dur": 4, "args": { "External id": 237192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237192, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 237192, "pid": 5, "tid": 7, "ts": 1716454225219170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164836, "dur": 6, "args": { "External id": 237192, "cbid": 211, "correlation": 237192 } }, { "ph": "s", "id": 237192, "pid": 76337, "tid": -914061504, "ts": 1716454225164836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219175, "dur": 7, "args": { "External id": 237195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237195, "pid": 5, "tid": 7, "ts": 1716454225219175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164855, "dur": 7, "args": { "External id": 237195, "cbid": 211, "correlation": 237195 } }, { "ph": "s", "id": 237195, "pid": 76337, "tid": -914061504, "ts": 1716454225164855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225219184, "dur": 4, "args": { "External id": 237204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237204, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237204, "pid": 5, "tid": 7, "ts": 1716454225219184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164894, "dur": 9, "args": { "External id": 237204, "cbid": 211, "correlation": 237204 } }, { "ph": "s", "id": 237204, "pid": 76337, "tid": -914061504, "ts": 1716454225164894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225164957, "dur": 0, "args": { "External id": 237214, "cbid": 317, "correlation": 237214 } }, { "ph": "f", "id": 237214, "pid": 76337, "tid": -914061504, "ts": 1716454225164957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225164958, "dur": 0, "args": { "External id": 237215, "cbid": 203, "correlation": 237215 } }, { "ph": "f", "id": 237215, "pid": 76337, "tid": -914061504, "ts": 1716454225164958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225164958, "dur": 0, "args": { "External id": 237216, "cbid": 205, "correlation": 237216 } }, { "ph": "f", "id": 237216, "pid": 76337, "tid": -914061504, "ts": 1716454225164958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219189, "dur": 5, "args": { "External id": 237220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237220, "pid": 5, "tid": 7, "ts": 1716454225219189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164972, "dur": 21, "args": { "External id": 237220, "cbid": 211, "correlation": 237220 } }, { "ph": "s", "id": 237220, "pid": 76337, "tid": -914061504, "ts": 1716454225164972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219196, "dur": 159, "args": { "External id": 237222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237222, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237222, "pid": 5, "tid": 7, "ts": 1716454225219196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225164996, "dur": 6, "args": { "External id": 237222, "cbid": 211, "correlation": 237222 } }, { "ph": "s", "id": 237222, "pid": 76337, "tid": -914061504, "ts": 1716454225164996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225219357, "dur": 1, "args": { "External id": 237224, "device": 5, "context": 1, "stream": 7, "correlation": 237224, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 237224, "pid": 5, "tid": 7, "ts": 1716454225219357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225165008, "dur": 7, "args": { "External id": 237224, "cbid": 51, "correlation": 237224 } }, { "ph": "s", "id": 237224, "pid": 76337, "tid": -914061504, "ts": 1716454225165008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225219361, "dur": 254, "args": { "External id": 237225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237225, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237225, "pid": 5, "tid": 7, "ts": 1716454225219361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165016, "dur": 6, "args": { "External id": 237225, "cbid": 211, "correlation": 237225 } }, { "ph": "s", "id": 237225, "pid": 76337, "tid": -914061504, "ts": 1716454225165016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219616, "dur": 6, "args": { "External id": 237227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237227, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237227, "pid": 5, "tid": 7, "ts": 1716454225219616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165025, "dur": 5, "args": { "External id": 237227, "cbid": 211, "correlation": 237227 } }, { "ph": "s", "id": 237227, "pid": 76337, "tid": -914061504, "ts": 1716454225165025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219623, "dur": 6, "args": { "External id": 237233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237233, "pid": 5, "tid": 7, "ts": 1716454225219623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165054, "dur": 10, "args": { "External id": 237233, "cbid": 211, "correlation": 237233 } }, { "ph": "s", "id": 237233, "pid": 76337, "tid": -914061504, "ts": 1716454225165054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225219630, "dur": 5, "args": { "External id": 237241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237241, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237241, "pid": 5, "tid": 7, "ts": 1716454225219630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165087, "dur": 8, "args": { "External id": 237241, "cbid": 211, "correlation": 237241 } }, { "ph": "s", "id": 237241, "pid": 76337, "tid": -914061504, "ts": 1716454225165087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225219636, "dur": 4, "args": { "External id": 237249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237249, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237249, "pid": 5, "tid": 7, "ts": 1716454225219636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165117, "dur": 8, "args": { "External id": 237249, "cbid": 211, "correlation": 237249 } }, { "ph": "s", "id": 237249, "pid": 76337, "tid": -914061504, "ts": 1716454225165117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225219642, "dur": 9, "args": { "External id": 237269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237269, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237269, "pid": 5, "tid": 7, "ts": 1716454225219642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165258, "dur": 13, "args": { "External id": 237269, "cbid": 211, "correlation": 237269 } }, { "ph": "s", "id": 237269, "pid": 76337, "tid": -914061504, "ts": 1716454225165258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225219653, "dur": 4, "args": { "External id": 237281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237281, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 237281, "pid": 5, "tid": 7, "ts": 1716454225219653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165281, "dur": 6, "args": { "External id": 237281, "cbid": 211, "correlation": 237281 } }, { "ph": "s", "id": 237281, "pid": 76337, "tid": -914061504, "ts": 1716454225165281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219658, "dur": 7, "args": { "External id": 237284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237284, "pid": 5, "tid": 7, "ts": 1716454225219658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165299, "dur": 8, "args": { "External id": 237284, "cbid": 211, "correlation": 237284 } }, { "ph": "s", "id": 237284, "pid": 76337, "tid": -914061504, "ts": 1716454225165299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225165359, "dur": 0, "args": { "External id": 237295, "cbid": 317, "correlation": 237295 } }, { "ph": "f", "id": 237295, "pid": 76337, "tid": -914061504, "ts": 1716454225165359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225165360, "dur": 0, "args": { "External id": 237296, "cbid": 203, "correlation": 237296 } }, { "ph": "f", "id": 237296, "pid": 76337, "tid": -914061504, "ts": 1716454225165360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225165360, "dur": 0, "args": { "External id": 237297, "cbid": 205, "correlation": 237297 } }, { "ph": "f", "id": 237297, "pid": 76337, "tid": -914061504, "ts": 1716454225165360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219665, "dur": 5, "args": { "External id": 237301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237301, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237301, "pid": 5, "tid": 7, "ts": 1716454225219665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165376, "dur": 12, "args": { "External id": 237301, "cbid": 211, "correlation": 237301 } }, { "ph": "s", "id": 237301, "pid": 76337, "tid": -914061504, "ts": 1716454225165376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225219672, "dur": 36, "args": { "External id": 237303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237303, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 237303, "pid": 5, "tid": 7, "ts": 1716454225219672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165396, "dur": 9, "args": { "External id": 237303, "cbid": 211, "correlation": 237303 } }, { "ph": "s", "id": 237303, "pid": 76337, "tid": -914061504, "ts": 1716454225165396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225219709, "dur": 5, "args": { "External id": 237305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237305, "pid": 5, "tid": 7, "ts": 1716454225219709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165409, "dur": 5, "args": { "External id": 237305, "cbid": 211, "correlation": 237305 } }, { "ph": "s", "id": 237305, "pid": 76337, "tid": -914061504, "ts": 1716454225165409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225219715, "dur": 6, "args": { "External id": 237311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237311, "pid": 5, "tid": 7, "ts": 1716454225219715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165436, "dur": 8, "args": { "External id": 237311, "cbid": 211, "correlation": 237311 } }, { "ph": "s", "id": 237311, "pid": 76337, "tid": -914061504, "ts": 1716454225165436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225219723, "dur": 20, "args": { "External id": 237320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237320, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237320, "pid": 5, "tid": 7, "ts": 1716454225219723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165519, "dur": 14, "args": { "External id": 237320, "cbid": 211, "correlation": 237320 } }, { "ph": "s", "id": 237320, "pid": 76337, "tid": -914061504, "ts": 1716454225165519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225219744, "dur": 10, "args": { "External id": 237342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237342, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 237342, "pid": 5, "tid": 7, "ts": 1716454225219744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165577, "dur": 10, "args": { "External id": 237342, "cbid": 211, "correlation": 237342 } }, { "ph": "s", "id": 237342, "pid": 76337, "tid": -914061504, "ts": 1716454225165577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165666, "dur": 2, "args": { "External id": 237353, "cbid": 251, "correlation": 237353 } }, { "ph": "f", "id": 237353, "pid": 76337, "tid": -914061504, "ts": 1716454225165666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165670, "dur": 0, "args": { "External id": 237354, "cbid": 251, "correlation": 237354 } }, { "ph": "f", "id": 237354, "pid": 76337, "tid": -914061504, "ts": 1716454225165670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225219755, "dur": 53, "args": { "External id": 237355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237355, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 237355, "pid": 5, "tid": 7, "ts": 1716454225219755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165673, "dur": 13, "args": { "External id": 237355, "cbid": 211, "correlation": 237355 } }, { "ph": "s", "id": 237355, "pid": 76337, "tid": -914061504, "ts": 1716454225165673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165743, "dur": 1, "args": { "External id": 237366, "cbid": 251, "correlation": 237366 } }, { "ph": "f", "id": 237366, "pid": 76337, "tid": -914061504, "ts": 1716454225165743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165747, "dur": 0, "args": { "External id": 237367, "cbid": 251, "correlation": 237367 } }, { "ph": "f", "id": 237367, "pid": 76337, "tid": -914061504, "ts": 1716454225165747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225219810, "dur": 52, "args": { "External id": 237368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237368, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 237368, "pid": 5, "tid": 7, "ts": 1716454225219810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165749, "dur": 12, "args": { "External id": 237368, "cbid": 211, "correlation": 237368 } }, { "ph": "s", "id": 237368, "pid": 76337, "tid": -914061504, "ts": 1716454225165749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165815, "dur": 1, "args": { "External id": 237379, "cbid": 251, "correlation": 237379 } }, { "ph": "f", "id": 237379, "pid": 76337, "tid": -914061504, "ts": 1716454225165815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225165818, "dur": 0, "args": { "External id": 237380, "cbid": 251, "correlation": 237380 } }, { "ph": "f", "id": 237380, "pid": 76337, "tid": -914061504, "ts": 1716454225165818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225219863, "dur": 53, "args": { "External id": 237381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237381, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 237381, "pid": 5, "tid": 7, "ts": 1716454225219863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165820, "dur": 11, "args": { "External id": 237381, "cbid": 211, "correlation": 237381 } }, { "ph": "s", "id": 237381, "pid": 76337, "tid": -914061504, "ts": 1716454225165820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225219917, "dur": 55, "args": { "External id": 237406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237406, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237406, "pid": 5, "tid": 7, "ts": 1716454225219917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225165907, "dur": 13, "args": { "External id": 237406, "cbid": 211, "correlation": 237406 } }, { "ph": "s", "id": 237406, "pid": 76337, "tid": -914061504, "ts": 1716454225165907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166014, "dur": 1, "args": { "External id": 237424, "cbid": 251, "correlation": 237424 } }, { "ph": "f", "id": 237424, "pid": 76337, "tid": -914061504, "ts": 1716454225166014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225219973, "dur": 62, "args": { "External id": 237426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237426, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 237426, "pid": 5, "tid": 7, "ts": 1716454225219973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166020, "dur": 13, "args": { "External id": 237426, "cbid": 211, "correlation": 237426 } }, { "ph": "s", "id": 237426, "pid": 76337, "tid": -914061504, "ts": 1716454225166020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220036, "dur": 6, "args": { "External id": 237434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237434, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237434, "pid": 5, "tid": 7, "ts": 1716454225220036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166091, "dur": 13, "args": { "External id": 237434, "cbid": 211, "correlation": 237434 } }, { "ph": "s", "id": 237434, "pid": 76337, "tid": -914061504, "ts": 1716454225166091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220044, "dur": 7, "args": { "External id": 237442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237442, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237442, "pid": 5, "tid": 7, "ts": 1716454225220044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166130, "dur": 8, "args": { "External id": 237442, "cbid": 211, "correlation": 237442 } }, { "ph": "s", "id": 237442, "pid": 76337, "tid": -914061504, "ts": 1716454225166130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220052, "dur": 7, "args": { "External id": 237453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237453, "pid": 5, "tid": 7, "ts": 1716454225220052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166205, "dur": 13, "args": { "External id": 237453, "cbid": 211, "correlation": 237453 } }, { "ph": "s", "id": 237453, "pid": 76337, "tid": -914061504, "ts": 1716454225166205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225220061, "dur": 8, "args": { "External id": 237475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237475, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 237475, "pid": 5, "tid": 7, "ts": 1716454225220061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166238, "dur": 8, "args": { "External id": 237475, "cbid": 211, "correlation": 237475 } }, { "ph": "s", "id": 237475, "pid": 76337, "tid": -914061504, "ts": 1716454225166238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166324, "dur": 2, "args": { "External id": 237486, "cbid": 251, "correlation": 237486 } }, { "ph": "f", "id": 237486, "pid": 76337, "tid": -914061504, "ts": 1716454225166324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225220071, "dur": 1, "args": { "External id": 237487, "device": 5, "context": 1, "stream": 7, "correlation": 237487, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 237487, "pid": 5, "tid": 7, "ts": 1716454225220071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225166330, "dur": 10, "args": { "External id": 237487, "cbid": 51, "correlation": 237487 } }, { "ph": "s", "id": 237487, "pid": 76337, "tid": -914061504, "ts": 1716454225166330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225220075, "dur": 35, "args": { "External id": 237488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237488, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 237488, "pid": 5, "tid": 7, "ts": 1716454225220075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166342, "dur": 13, "args": { "External id": 237488, "cbid": 211, "correlation": 237488 } }, { "ph": "s", "id": 237488, "pid": 76337, "tid": -914061504, "ts": 1716454225166342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166414, "dur": 1, "args": { "External id": 237499, "cbid": 251, "correlation": 237499 } }, { "ph": "f", "id": 237499, "pid": 76337, "tid": -914061504, "ts": 1716454225166414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166418, "dur": 0, "args": { "External id": 237500, "cbid": 251, "correlation": 237500 } }, { "ph": "f", "id": 237500, "pid": 76337, "tid": -914061504, "ts": 1716454225166418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225220112, "dur": 11, "args": { "External id": 237501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237501, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237501, "pid": 5, "tid": 7, "ts": 1716454225220112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166420, "dur": 12, "args": { "External id": 237501, "cbid": 211, "correlation": 237501 } }, { "ph": "s", "id": 237501, "pid": 76337, "tid": -914061504, "ts": 1716454225166420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225220124, "dur": 6, "args": { "External id": 237503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237503, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237503, "pid": 5, "tid": 7, "ts": 1716454225220124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166433, "dur": 6, "args": { "External id": 237503, "cbid": 211, "correlation": 237503 } }, { "ph": "s", "id": 237503, "pid": 76337, "tid": -914061504, "ts": 1716454225166433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166493, "dur": 1, "args": { "External id": 237514, "cbid": 251, "correlation": 237514 } }, { "ph": "f", "id": 237514, "pid": 76337, "tid": -914061504, "ts": 1716454225166493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166496, "dur": 0, "args": { "External id": 237515, "cbid": 251, "correlation": 237515 } }, { "ph": "f", "id": 237515, "pid": 76337, "tid": -914061504, "ts": 1716454225166496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225220131, "dur": 8, "args": { "External id": 237516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237516, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237516, "pid": 5, "tid": 7, "ts": 1716454225220131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166498, "dur": 11, "args": { "External id": 237516, "cbid": 211, "correlation": 237516 } }, { "ph": "s", "id": 237516, "pid": 76337, "tid": -914061504, "ts": 1716454225166498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225220140, "dur": 4, "args": { "External id": 237518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237518, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237518, "pid": 5, "tid": 7, "ts": 1716454225220140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166511, "dur": 5, "args": { "External id": 237518, "cbid": 211, "correlation": 237518 } }, { "ph": "s", "id": 237518, "pid": 76337, "tid": -914061504, "ts": 1716454225166511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225220145, "dur": 19, "args": { "External id": 237543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237543, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 237543, "pid": 5, "tid": 7, "ts": 1716454225220145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166589, "dur": 13, "args": { "External id": 237543, "cbid": 211, "correlation": 237543 } }, { "ph": "s", "id": 237543, "pid": 76337, "tid": -914061504, "ts": 1716454225166589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166688, "dur": 2, "args": { "External id": 237561, "cbid": 251, "correlation": 237561 } }, { "ph": "f", "id": 237561, "pid": 76337, "tid": -914061504, "ts": 1716454225166688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225220167, "dur": 1, "args": { "External id": 237563, "device": 5, "context": 1, "stream": 7, "correlation": 237563, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 237563, "pid": 5, "tid": 7, "ts": 1716454225220167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225166694, "dur": 10, "args": { "External id": 237563, "cbid": 51, "correlation": 237563 } }, { "ph": "s", "id": 237563, "pid": 76337, "tid": -914061504, "ts": 1716454225166694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225220170, "dur": 35, "args": { "External id": 237564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237564, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 237564, "pid": 5, "tid": 7, "ts": 1716454225220170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166706, "dur": 12, "args": { "External id": 237564, "cbid": 211, "correlation": 237564 } }, { "ph": "s", "id": 237564, "pid": 76337, "tid": -914061504, "ts": 1716454225166706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220207, "dur": 4, "args": { "External id": 237572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237572, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237572, "pid": 5, "tid": 7, "ts": 1716454225220207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166776, "dur": 12, "args": { "External id": 237572, "cbid": 211, "correlation": 237572 } }, { "ph": "s", "id": 237572, "pid": 76337, "tid": -914061504, "ts": 1716454225166776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220212, "dur": 8, "args": { "External id": 237580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237580, "pid": 5, "tid": 7, "ts": 1716454225220212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166818, "dur": 9, "args": { "External id": 237580, "cbid": 211, "correlation": 237580 } }, { "ph": "s", "id": 237580, "pid": 76337, "tid": -914061504, "ts": 1716454225166818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225220222, "dur": 8, "args": { "External id": 237602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237602, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 237602, "pid": 5, "tid": 7, "ts": 1716454225220222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166870, "dur": 10, "args": { "External id": 237602, "cbid": 211, "correlation": 237602 } }, { "ph": "s", "id": 237602, "pid": 76337, "tid": -914061504, "ts": 1716454225166870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166961, "dur": 1, "args": { "External id": 237618, "cbid": 251, "correlation": 237618 } }, { "ph": "f", "id": 237618, "pid": 76337, "tid": -914061504, "ts": 1716454225166961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225166966, "dur": 0, "args": { "External id": 237620, "cbid": 251, "correlation": 237620 } }, { "ph": "f", "id": 237620, "pid": 76337, "tid": -914061504, "ts": 1716454225166966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225220231, "dur": 188, "args": { "External id": 237621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237621, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237621, "pid": 5, "tid": 7, "ts": 1716454225220231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225166968, "dur": 21, "args": { "External id": 237621, "cbid": 211, "correlation": 237621 } }, { "ph": "s", "id": 237621, "pid": 76337, "tid": -914061504, "ts": 1716454225166968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220420, "dur": 20, "args": { "External id": 237629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237629, "pid": 5, "tid": 7, "ts": 1716454225220420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167043, "dur": 13, "args": { "External id": 237629, "cbid": 211, "correlation": 237629 } }, { "ph": "s", "id": 237629, "pid": 76337, "tid": -914061504, "ts": 1716454225167043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220442, "dur": 22, "args": { "External id": 237637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237637, "pid": 5, "tid": 7, "ts": 1716454225220442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167074, "dur": 8, "args": { "External id": 237637, "cbid": 211, "correlation": 237637 } }, { "ph": "s", "id": 237637, "pid": 76337, "tid": -914061504, "ts": 1716454225167074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225167156, "dur": 1, "args": { "External id": 237653, "cbid": 251, "correlation": 237653 } }, { "ph": "f", "id": 237653, "pid": 76337, "tid": -914061504, "ts": 1716454225167156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225220466, "dur": 1, "args": { "External id": 237655, "device": 5, "context": 1, "stream": 7, "correlation": 237655, "bytes": 120, "memory bandwidth (GB/s)": 0.0797872340425532 } }, { "ph": "f", "id": 237655, "pid": 5, "tid": 7, "ts": 1716454225220466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225167161, "dur": 8, "args": { "External id": 237655, "cbid": 51, "correlation": 237655 } }, { "ph": "s", "id": 237655, "pid": 76337, "tid": -914061504, "ts": 1716454225167161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225220470, "dur": 109, "args": { "External id": 237656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237656, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 237656, "pid": 5, "tid": 7, "ts": 1716454225220470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167171, "dur": 11, "args": { "External id": 237656, "cbid": 211, "correlation": 237656 } }, { "ph": "s", "id": 237656, "pid": 76337, "tid": -914061504, "ts": 1716454225167171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220580, "dur": 5, "args": { "External id": 237664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237664, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237664, "pid": 5, "tid": 7, "ts": 1716454225220580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167214, "dur": 10, "args": { "External id": 237664, "cbid": 211, "correlation": 237664 } }, { "ph": "s", "id": 237664, "pid": 76337, "tid": -914061504, "ts": 1716454225167214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220586, "dur": 9, "args": { "External id": 237675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237675, "pid": 5, "tid": 7, "ts": 1716454225220586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167283, "dur": 13, "args": { "External id": 237675, "cbid": 211, "correlation": 237675 } }, { "ph": "s", "id": 237675, "pid": 76337, "tid": -914061504, "ts": 1716454225167283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225167351, "dur": 0, "args": { "External id": 237687, "cbid": 317, "correlation": 237687 } }, { "ph": "f", "id": 237687, "pid": 76337, "tid": -914061504, "ts": 1716454225167351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225167352, "dur": 0, "args": { "External id": 237688, "cbid": 203, "correlation": 237688 } }, { "ph": "f", "id": 237688, "pid": 76337, "tid": -914061504, "ts": 1716454225167352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225167353, "dur": 0, "args": { "External id": 237689, "cbid": 205, "correlation": 237689 } }, { "ph": "f", "id": 237689, "pid": 76337, "tid": -914061504, "ts": 1716454225167353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225220597, "dur": 5, "args": { "External id": 237693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237693, "pid": 5, "tid": 7, "ts": 1716454225220597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167368, "dur": 12, "args": { "External id": 237693, "cbid": 211, "correlation": 237693 } }, { "ph": "s", "id": 237693, "pid": 76337, "tid": -914061504, "ts": 1716454225167368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225220604, "dur": 36, "args": { "External id": 237695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237695, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 237695, "pid": 5, "tid": 7, "ts": 1716454225220604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167387, "dur": 8, "args": { "External id": 237695, "cbid": 211, "correlation": 237695 } }, { "ph": "s", "id": 237695, "pid": 76337, "tid": -914061504, "ts": 1716454225167387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225220641, "dur": 6, "args": { "External id": 237697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237697, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237697, "pid": 5, "tid": 7, "ts": 1716454225220641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167398, "dur": 5, "args": { "External id": 237697, "cbid": 211, "correlation": 237697 } }, { "ph": "s", "id": 237697, "pid": 76337, "tid": -914061504, "ts": 1716454225167398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220648, "dur": 7, "args": { "External id": 237703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237703, "pid": 5, "tid": 7, "ts": 1716454225220648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167424, "dur": 9, "args": { "External id": 237703, "cbid": 211, "correlation": 237703 } }, { "ph": "s", "id": 237703, "pid": 76337, "tid": -914061504, "ts": 1716454225167424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220657, "dur": 5, "args": { "External id": 237711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237711, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237711, "pid": 5, "tid": 7, "ts": 1716454225220657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167456, "dur": 8, "args": { "External id": 237711, "cbid": 211, "correlation": 237711 } }, { "ph": "s", "id": 237711, "pid": 76337, "tid": -914061504, "ts": 1716454225167456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225220663, "dur": 10, "args": { "External id": 237731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237731, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237731, "pid": 5, "tid": 7, "ts": 1716454225220663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167528, "dur": 12, "args": { "External id": 237731, "cbid": 211, "correlation": 237731 } }, { "ph": "s", "id": 237731, "pid": 76337, "tid": -914061504, "ts": 1716454225167528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225220674, "dur": 4, "args": { "External id": 237743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237743, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 237743, "pid": 5, "tid": 7, "ts": 1716454225220674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167550, "dur": 6, "args": { "External id": 237743, "cbid": 211, "correlation": 237743 } }, { "ph": "s", "id": 237743, "pid": 76337, "tid": -914061504, "ts": 1716454225167550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225220680, "dur": 8, "args": { "External id": 237746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237746, "pid": 5, "tid": 7, "ts": 1716454225220680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167568, "dur": 6, "args": { "External id": 237746, "cbid": 211, "correlation": 237746 } }, { "ph": "s", "id": 237746, "pid": 76337, "tid": -914061504, "ts": 1716454225167568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225220689, "dur": 5, "args": { "External id": 237755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237755, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237755, "pid": 5, "tid": 7, "ts": 1716454225220689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167607, "dur": 10, "args": { "External id": 237755, "cbid": 211, "correlation": 237755 } }, { "ph": "s", "id": 237755, "pid": 76337, "tid": -914061504, "ts": 1716454225167607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225167658, "dur": 0, "args": { "External id": 237765, "cbid": 317, "correlation": 237765 } }, { "ph": "f", "id": 237765, "pid": 76337, "tid": -914061504, "ts": 1716454225167658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225167659, "dur": 0, "args": { "External id": 237766, "cbid": 203, "correlation": 237766 } }, { "ph": "f", "id": 237766, "pid": 76337, "tid": -914061504, "ts": 1716454225167659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225167659, "dur": 0, "args": { "External id": 237767, "cbid": 205, "correlation": 237767 } }, { "ph": "f", "id": 237767, "pid": 76337, "tid": -914061504, "ts": 1716454225167659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225220695, "dur": 5, "args": { "External id": 237771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237771, "pid": 5, "tid": 7, "ts": 1716454225220695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167672, "dur": 11, "args": { "External id": 237771, "cbid": 211, "correlation": 237771 } }, { "ph": "s", "id": 237771, "pid": 76337, "tid": -914061504, "ts": 1716454225167672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225220702, "dur": 158, "args": { "External id": 237773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237773, "pid": 5, "tid": 7, "ts": 1716454225220702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167686, "dur": 6, "args": { "External id": 237773, "cbid": 211, "correlation": 237773 } }, { "ph": "s", "id": 237773, "pid": 76337, "tid": -914061504, "ts": 1716454225167686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225220862, "dur": 1, "args": { "External id": 237775, "device": 5, "context": 1, "stream": 7, "correlation": 237775, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 237775, "pid": 5, "tid": 7, "ts": 1716454225220862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225167697, "dur": 6, "args": { "External id": 237775, "cbid": 51, "correlation": 237775 } }, { "ph": "s", "id": 237775, "pid": 76337, "tid": -914061504, "ts": 1716454225167697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225220866, "dur": 266, "args": { "External id": 237776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237776, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237776, "pid": 5, "tid": 7, "ts": 1716454225220866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167704, "dur": 6, "args": { "External id": 237776, "cbid": 211, "correlation": 237776 } }, { "ph": "s", "id": 237776, "pid": 76337, "tid": -914061504, "ts": 1716454225167704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221133, "dur": 6, "args": { "External id": 237778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237778, "pid": 5, "tid": 7, "ts": 1716454225221133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167714, "dur": 5, "args": { "External id": 237778, "cbid": 211, "correlation": 237778 } }, { "ph": "s", "id": 237778, "pid": 76337, "tid": -914061504, "ts": 1716454225167714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225221140, "dur": 6, "args": { "External id": 237784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237784, "pid": 5, "tid": 7, "ts": 1716454225221140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167742, "dur": 9, "args": { "External id": 237784, "cbid": 211, "correlation": 237784 } }, { "ph": "s", "id": 237784, "pid": 76337, "tid": -914061504, "ts": 1716454225167742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225221147, "dur": 3, "args": { "External id": 237792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237792, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 237792, "pid": 5, "tid": 7, "ts": 1716454225221147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167785, "dur": 10, "args": { "External id": 237792, "cbid": 211, "correlation": 237792 } }, { "ph": "s", "id": 237792, "pid": 76337, "tid": -914061504, "ts": 1716454225167785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225167850, "dur": 1, "args": { "External id": 237808, "cbid": 251, "correlation": 237808 } }, { "ph": "f", "id": 237808, "pid": 76337, "tid": -914061504, "ts": 1716454225167850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225167855, "dur": 0, "args": { "External id": 237810, "cbid": 251, "correlation": 237810 } }, { "ph": "f", "id": 237810, "pid": 76337, "tid": -914061504, "ts": 1716454225167855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225221152, "dur": 13, "args": { "External id": 237811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237811, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237811, "pid": 5, "tid": 7, "ts": 1716454225221152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167857, "dur": 11, "args": { "External id": 237811, "cbid": 211, "correlation": 237811 } }, { "ph": "s", "id": 237811, "pid": 76337, "tid": -914061504, "ts": 1716454225167857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225221166, "dur": 5, "args": { "External id": 237813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237813, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237813, "pid": 5, "tid": 7, "ts": 1716454225221166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167870, "dur": 5, "args": { "External id": 237813, "cbid": 211, "correlation": 237813 } }, { "ph": "s", "id": 237813, "pid": 76337, "tid": -914061504, "ts": 1716454225167870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225221172, "dur": 5, "args": { "External id": 237823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237823, "pid": 5, "tid": 7, "ts": 1716454225221172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225167928, "dur": 13, "args": { "External id": 237823, "cbid": 211, "correlation": 237823 } }, { "ph": "s", "id": 237823, "pid": 76337, "tid": -914061504, "ts": 1716454225167928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225221179, "dur": 10, "args": { "External id": 237843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237843, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237843, "pid": 5, "tid": 7, "ts": 1716454225221179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168003, "dur": 11, "args": { "External id": 237843, "cbid": 211, "correlation": 237843 } }, { "ph": "s", "id": 237843, "pid": 76337, "tid": -914061504, "ts": 1716454225168003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225221190, "dur": 4, "args": { "External id": 237855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237855, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 237855, "pid": 5, "tid": 7, "ts": 1716454225221190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168024, "dur": 6, "args": { "External id": 237855, "cbid": 211, "correlation": 237855 } }, { "ph": "s", "id": 237855, "pid": 76337, "tid": -914061504, "ts": 1716454225168024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225221195, "dur": 7, "args": { "External id": 237858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237858, "pid": 5, "tid": 7, "ts": 1716454225221195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168043, "dur": 7, "args": { "External id": 237858, "cbid": 211, "correlation": 237858 } }, { "ph": "s", "id": 237858, "pid": 76337, "tid": -914061504, "ts": 1716454225168043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225221203, "dur": 4, "args": { "External id": 237867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237867, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237867, "pid": 5, "tid": 7, "ts": 1716454225221203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168084, "dur": 10, "args": { "External id": 237867, "cbid": 211, "correlation": 237867 } }, { "ph": "s", "id": 237867, "pid": 76337, "tid": -914061504, "ts": 1716454225168084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225168146, "dur": 0, "args": { "External id": 237877, "cbid": 317, "correlation": 237877 } }, { "ph": "f", "id": 237877, "pid": 76337, "tid": -914061504, "ts": 1716454225168146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225168146, "dur": 0, "args": { "External id": 237878, "cbid": 203, "correlation": 237878 } }, { "ph": "f", "id": 237878, "pid": 76337, "tid": -914061504, "ts": 1716454225168146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225168147, "dur": 0, "args": { "External id": 237879, "cbid": 205, "correlation": 237879 } }, { "ph": "f", "id": 237879, "pid": 76337, "tid": -914061504, "ts": 1716454225168147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221209, "dur": 5, "args": { "External id": 237883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237883, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237883, "pid": 5, "tid": 7, "ts": 1716454225221209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168161, "dur": 12, "args": { "External id": 237883, "cbid": 211, "correlation": 237883 } }, { "ph": "s", "id": 237883, "pid": 76337, "tid": -914061504, "ts": 1716454225168161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221215, "dur": 159, "args": { "External id": 237885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237885, "pid": 5, "tid": 7, "ts": 1716454225221215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168175, "dur": 5, "args": { "External id": 237885, "cbid": 211, "correlation": 237885 } }, { "ph": "s", "id": 237885, "pid": 76337, "tid": -914061504, "ts": 1716454225168175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225221377, "dur": 1, "args": { "External id": 237887, "device": 5, "context": 1, "stream": 7, "correlation": 237887, "bytes": 240, "memory bandwidth (GB/s)": 0.1561483409238777 } }, { "ph": "f", "id": 237887, "pid": 5, "tid": 7, "ts": 1716454225221377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225168186, "dur": 6, "args": { "External id": 237887, "cbid": 51, "correlation": 237887 } }, { "ph": "s", "id": 237887, "pid": 76337, "tid": -914061504, "ts": 1716454225168186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225221380, "dur": 256, "args": { "External id": 237888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237888, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237888, "pid": 5, "tid": 7, "ts": 1716454225221380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168193, "dur": 6, "args": { "External id": 237888, "cbid": 211, "correlation": 237888 } }, { "ph": "s", "id": 237888, "pid": 76337, "tid": -914061504, "ts": 1716454225168193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221637, "dur": 6, "args": { "External id": 237890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237890, "pid": 5, "tid": 7, "ts": 1716454225221637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168203, "dur": 5, "args": { "External id": 237890, "cbid": 211, "correlation": 237890 } }, { "ph": "s", "id": 237890, "pid": 76337, "tid": -914061504, "ts": 1716454225168203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225221645, "dur": 7, "args": { "External id": 237896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237896, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237896, "pid": 5, "tid": 7, "ts": 1716454225221645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168231, "dur": 9, "args": { "External id": 237896, "cbid": 211, "correlation": 237896 } }, { "ph": "s", "id": 237896, "pid": 76337, "tid": -914061504, "ts": 1716454225168231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225221653, "dur": 4, "args": { "External id": 237904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237904, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237904, "pid": 5, "tid": 7, "ts": 1716454225221653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168264, "dur": 8, "args": { "External id": 237904, "cbid": 211, "correlation": 237904 } }, { "ph": "s", "id": 237904, "pid": 76337, "tid": -914061504, "ts": 1716454225168264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225221658, "dur": 4, "args": { "External id": 237912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237912, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237912, "pid": 5, "tid": 7, "ts": 1716454225221658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168293, "dur": 8, "args": { "External id": 237912, "cbid": 211, "correlation": 237912 } }, { "ph": "s", "id": 237912, "pid": 76337, "tid": -914061504, "ts": 1716454225168293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225221664, "dur": 11, "args": { "External id": 237921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237921, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237921, "pid": 5, "tid": 7, "ts": 1716454225221664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168381, "dur": 14, "args": { "External id": 237921, "cbid": 211, "correlation": 237921 } }, { "ph": "s", "id": 237921, "pid": 76337, "tid": -914061504, "ts": 1716454225168381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225221677, "dur": 12, "args": { "External id": 237941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237941, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 237941, "pid": 5, "tid": 7, "ts": 1716454225221677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168449, "dur": 11, "args": { "External id": 237941, "cbid": 211, "correlation": 237941 } }, { "ph": "s", "id": 237941, "pid": 76337, "tid": -914061504, "ts": 1716454225168449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225221690, "dur": 4, "args": { "External id": 237953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237953, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237953, "pid": 5, "tid": 7, "ts": 1716454225221690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168469, "dur": 6, "args": { "External id": 237953, "cbid": 211, "correlation": 237953 } }, { "ph": "s", "id": 237953, "pid": 76337, "tid": -914061504, "ts": 1716454225168469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225221695, "dur": 10, "args": { "External id": 237956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237956, "pid": 5, "tid": 7, "ts": 1716454225221695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168488, "dur": 6, "args": { "External id": 237956, "cbid": 211, "correlation": 237956 } }, { "ph": "s", "id": 237956, "pid": 76337, "tid": -914061504, "ts": 1716454225168488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225221706, "dur": 6, "args": { "External id": 237965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237965, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237965, "pid": 5, "tid": 7, "ts": 1716454225221706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168525, "dur": 10, "args": { "External id": 237965, "cbid": 211, "correlation": 237965 } }, { "ph": "s", "id": 237965, "pid": 76337, "tid": -914061504, "ts": 1716454225168525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225168577, "dur": 0, "args": { "External id": 237975, "cbid": 317, "correlation": 237975 } }, { "ph": "f", "id": 237975, "pid": 76337, "tid": -914061504, "ts": 1716454225168577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225168578, "dur": 0, "args": { "External id": 237976, "cbid": 203, "correlation": 237976 } }, { "ph": "f", "id": 237976, "pid": 76337, "tid": -914061504, "ts": 1716454225168578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225168579, "dur": 0, "args": { "External id": 237977, "cbid": 205, "correlation": 237977 } }, { "ph": "f", "id": 237977, "pid": 76337, "tid": -914061504, "ts": 1716454225168579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221713, "dur": 6, "args": { "External id": 237981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237981, "pid": 5, "tid": 7, "ts": 1716454225221713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168594, "dur": 11, "args": { "External id": 237981, "cbid": 211, "correlation": 237981 } }, { "ph": "s", "id": 237981, "pid": 76337, "tid": -914061504, "ts": 1716454225168594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225221721, "dur": 314, "args": { "External id": 237983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237983, "pid": 5, "tid": 7, "ts": 1716454225221721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168608, "dur": 5, "args": { "External id": 237983, "cbid": 211, "correlation": 237983 } }, { "ph": "s", "id": 237983, "pid": 76337, "tid": -914061504, "ts": 1716454225168608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225222037, "dur": 1, "args": { "External id": 237985, "device": 5, "context": 1, "stream": 7, "correlation": 237985, "bytes": 240, "memory bandwidth (GB/s)": 0.14423076923076922 } }, { "ph": "f", "id": 237985, "pid": 5, "tid": 7, "ts": 1716454225222037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225168619, "dur": 8, "args": { "External id": 237985, "cbid": 51, "correlation": 237985 } }, { "ph": "s", "id": 237985, "pid": 76337, "tid": -914061504, "ts": 1716454225168619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225222041, "dur": 486, "args": { "External id": 237986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237986, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 237986, "pid": 5, "tid": 7, "ts": 1716454225222041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168628, "dur": 6, "args": { "External id": 237986, "cbid": 211, "correlation": 237986 } }, { "ph": "s", "id": 237986, "pid": 76337, "tid": -914061504, "ts": 1716454225168628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225222528, "dur": 5, "args": { "External id": 237988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 237988, "pid": 5, "tid": 7, "ts": 1716454225222528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168638, "dur": 5, "args": { "External id": 237988, "cbid": 211, "correlation": 237988 } }, { "ph": "s", "id": 237988, "pid": 76337, "tid": -914061504, "ts": 1716454225168638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225222535, "dur": 6, "args": { "External id": 237994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 237994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 237994, "pid": 5, "tid": 7, "ts": 1716454225222535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168666, "dur": 9, "args": { "External id": 237994, "cbid": 211, "correlation": 237994 } }, { "ph": "s", "id": 237994, "pid": 76337, "tid": -914061504, "ts": 1716454225168666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225222542, "dur": 3, "args": { "External id": 238002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238002, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 238002, "pid": 5, "tid": 7, "ts": 1716454225222542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168710, "dur": 9, "args": { "External id": 238002, "cbid": 211, "correlation": 238002 } }, { "ph": "s", "id": 238002, "pid": 76337, "tid": -914061504, "ts": 1716454225168710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225168773, "dur": 1, "args": { "External id": 238018, "cbid": 251, "correlation": 238018 } }, { "ph": "f", "id": 238018, "pid": 76337, "tid": -914061504, "ts": 1716454225168773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225168778, "dur": 0, "args": { "External id": 238020, "cbid": 251, "correlation": 238020 } }, { "ph": "f", "id": 238020, "pid": 76337, "tid": -914061504, "ts": 1716454225168778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225222547, "dur": 11, "args": { "External id": 238021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238021, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238021, "pid": 5, "tid": 7, "ts": 1716454225222547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168780, "dur": 11, "args": { "External id": 238021, "cbid": 211, "correlation": 238021 } }, { "ph": "s", "id": 238021, "pid": 76337, "tid": -914061504, "ts": 1716454225168780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225222558, "dur": 4, "args": { "External id": 238023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238023, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238023, "pid": 5, "tid": 7, "ts": 1716454225222558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168793, "dur": 6, "args": { "External id": 238023, "cbid": 211, "correlation": 238023 } }, { "ph": "s", "id": 238023, "pid": 76337, "tid": -914061504, "ts": 1716454225168793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225222564, "dur": 6, "args": { "External id": 238033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238033, "pid": 5, "tid": 7, "ts": 1716454225222564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168849, "dur": 12, "args": { "External id": 238033, "cbid": 211, "correlation": 238033 } }, { "ph": "s", "id": 238033, "pid": 76337, "tid": -914061504, "ts": 1716454225168849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225222571, "dur": 9, "args": { "External id": 238053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238053, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238053, "pid": 5, "tid": 7, "ts": 1716454225222571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168915, "dur": 10, "args": { "External id": 238053, "cbid": 211, "correlation": 238053 } }, { "ph": "s", "id": 238053, "pid": 76337, "tid": -914061504, "ts": 1716454225168915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225222582, "dur": 4, "args": { "External id": 238065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238065, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 238065, "pid": 5, "tid": 7, "ts": 1716454225222582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168936, "dur": 6, "args": { "External id": 238065, "cbid": 211, "correlation": 238065 } }, { "ph": "s", "id": 238065, "pid": 76337, "tid": -914061504, "ts": 1716454225168936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225222587, "dur": 7, "args": { "External id": 238068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238068, "pid": 5, "tid": 7, "ts": 1716454225222587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225168955, "dur": 6, "args": { "External id": 238068, "cbid": 211, "correlation": 238068 } }, { "ph": "s", "id": 238068, "pid": 76337, "tid": -914061504, "ts": 1716454225168955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225222595, "dur": 4, "args": { "External id": 238077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238077, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238077, "pid": 5, "tid": 7, "ts": 1716454225222595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169004, "dur": 10, "args": { "External id": 238077, "cbid": 211, "correlation": 238077 } }, { "ph": "s", "id": 238077, "pid": 76337, "tid": -914061504, "ts": 1716454225169004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225169067, "dur": 0, "args": { "External id": 238087, "cbid": 317, "correlation": 238087 } }, { "ph": "f", "id": 238087, "pid": 76337, "tid": -914061504, "ts": 1716454225169067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225169068, "dur": 0, "args": { "External id": 238088, "cbid": 203, "correlation": 238088 } }, { "ph": "f", "id": 238088, "pid": 76337, "tid": -914061504, "ts": 1716454225169068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225169069, "dur": 0, "args": { "External id": 238089, "cbid": 205, "correlation": 238089 } }, { "ph": "f", "id": 238089, "pid": 76337, "tid": -914061504, "ts": 1716454225169069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225222601, "dur": 5, "args": { "External id": 238093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238093, "pid": 5, "tid": 7, "ts": 1716454225222601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169083, "dur": 12, "args": { "External id": 238093, "cbid": 211, "correlation": 238093 } }, { "ph": "s", "id": 238093, "pid": 76337, "tid": -914061504, "ts": 1716454225169083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225222607, "dur": 158, "args": { "External id": 238095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238095, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238095, "pid": 5, "tid": 7, "ts": 1716454225222607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169097, "dur": 5, "args": { "External id": 238095, "cbid": 211, "correlation": 238095 } }, { "ph": "s", "id": 238095, "pid": 76337, "tid": -914061504, "ts": 1716454225169097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225222767, "dur": 1, "args": { "External id": 238097, "device": 5, "context": 1, "stream": 7, "correlation": 238097, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 238097, "pid": 5, "tid": 7, "ts": 1716454225222767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225169108, "dur": 7, "args": { "External id": 238097, "cbid": 51, "correlation": 238097 } }, { "ph": "s", "id": 238097, "pid": 76337, "tid": -914061504, "ts": 1716454225169108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225222771, "dur": 254, "args": { "External id": 238098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238098, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238098, "pid": 5, "tid": 7, "ts": 1716454225222771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169115, "dur": 6, "args": { "External id": 238098, "cbid": 211, "correlation": 238098 } }, { "ph": "s", "id": 238098, "pid": 76337, "tid": -914061504, "ts": 1716454225169115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225223026, "dur": 6, "args": { "External id": 238100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238100, "pid": 5, "tid": 7, "ts": 1716454225223026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169125, "dur": 6, "args": { "External id": 238100, "cbid": 211, "correlation": 238100 } }, { "ph": "s", "id": 238100, "pid": 76337, "tid": -914061504, "ts": 1716454225169125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225223033, "dur": 6, "args": { "External id": 238106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238106, "pid": 5, "tid": 7, "ts": 1716454225223033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169153, "dur": 8, "args": { "External id": 238106, "cbid": 211, "correlation": 238106 } }, { "ph": "s", "id": 238106, "pid": 76337, "tid": -914061504, "ts": 1716454225169153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225169212, "dur": 0, "args": { "External id": 238116, "cbid": 317, "correlation": 238116 } }, { "ph": "f", "id": 238116, "pid": 76337, "tid": -914061504, "ts": 1716454225169212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225169213, "dur": 0, "args": { "External id": 238117, "cbid": 203, "correlation": 238117 } }, { "ph": "f", "id": 238117, "pid": 76337, "tid": -914061504, "ts": 1716454225169213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225169214, "dur": 0, "args": { "External id": 238118, "cbid": 205, "correlation": 238118 } }, { "ph": "f", "id": 238118, "pid": 76337, "tid": -914061504, "ts": 1716454225169214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225223041, "dur": 7, "args": { "External id": 238122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238122, "pid": 5, "tid": 7, "ts": 1716454225223041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169227, "dur": 13, "args": { "External id": 238122, "cbid": 211, "correlation": 238122 } }, { "ph": "s", "id": 238122, "pid": 76337, "tid": -914061504, "ts": 1716454225169227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225223049, "dur": 3, "args": { "External id": 238124, "device": 5, "context": 1, "stream": 7, "correlation": 238124, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 238124, "pid": 5, "tid": 7, "ts": 1716454225223049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225169246, "dur": 14, "args": { "External id": 238124, "cbid": 51, "correlation": 238124 } }, { "ph": "s", "id": 238124, "pid": 76337, "tid": -914061504, "ts": 1716454225169246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225223053, "dur": 97, "args": { "External id": 238125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238125, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 238125, "pid": 5, "tid": 7, "ts": 1716454225223053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169261, "dur": 7, "args": { "External id": 238125, "cbid": 211, "correlation": 238125 } }, { "ph": "s", "id": 238125, "pid": 76337, "tid": -914061504, "ts": 1716454225169261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225223152, "dur": 6, "args": { "External id": 238127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238127, "pid": 5, "tid": 7, "ts": 1716454225223152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169271, "dur": 5, "args": { "External id": 238127, "cbid": 211, "correlation": 238127 } }, { "ph": "s", "id": 238127, "pid": 76337, "tid": -914061504, "ts": 1716454225169271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225223158, "dur": 6, "args": { "External id": 238133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238133, "pid": 5, "tid": 7, "ts": 1716454225223158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169298, "dur": 8, "args": { "External id": 238133, "cbid": 211, "correlation": 238133 } }, { "ph": "s", "id": 238133, "pid": 76337, "tid": -914061504, "ts": 1716454225169298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225223166, "dur": 5, "args": { "External id": 238141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238141, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238141, "pid": 5, "tid": 7, "ts": 1716454225223166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169327, "dur": 8, "args": { "External id": 238141, "cbid": 211, "correlation": 238141 } }, { "ph": "s", "id": 238141, "pid": 76337, "tid": -914061504, "ts": 1716454225169327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225223172, "dur": 4, "args": { "External id": 238149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238149, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238149, "pid": 5, "tid": 7, "ts": 1716454225223172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169356, "dur": 8, "args": { "External id": 238149, "cbid": 211, "correlation": 238149 } }, { "ph": "s", "id": 238149, "pid": 76337, "tid": -914061504, "ts": 1716454225169356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225223177, "dur": 11, "args": { "External id": 238158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238158, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238158, "pid": 5, "tid": 7, "ts": 1716454225223177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169400, "dur": 10, "args": { "External id": 238158, "cbid": 211, "correlation": 238158 } }, { "ph": "s", "id": 238158, "pid": 76337, "tid": -914061504, "ts": 1716454225169400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225223190, "dur": 12, "args": { "External id": 238178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238178, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238178, "pid": 5, "tid": 7, "ts": 1716454225223190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169469, "dur": 12, "args": { "External id": 238178, "cbid": 211, "correlation": 238178 } }, { "ph": "s", "id": 238178, "pid": 76337, "tid": -914061504, "ts": 1716454225169469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225223203, "dur": 4, "args": { "External id": 238190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238190, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238190, "pid": 5, "tid": 7, "ts": 1716454225223203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169491, "dur": 6, "args": { "External id": 238190, "cbid": 211, "correlation": 238190 } }, { "ph": "s", "id": 238190, "pid": 76337, "tid": -914061504, "ts": 1716454225169491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225223208, "dur": 11, "args": { "External id": 238193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238193, "pid": 5, "tid": 7, "ts": 1716454225223208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169508, "dur": 7, "args": { "External id": 238193, "cbid": 211, "correlation": 238193 } }, { "ph": "s", "id": 238193, "pid": 76337, "tid": -914061504, "ts": 1716454225169508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225223220, "dur": 6, "args": { "External id": 238202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238202, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238202, "pid": 5, "tid": 7, "ts": 1716454225223220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169546, "dur": 9, "args": { "External id": 238202, "cbid": 211, "correlation": 238202 } }, { "ph": "s", "id": 238202, "pid": 76337, "tid": -914061504, "ts": 1716454225169546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225169599, "dur": 0, "args": { "External id": 238212, "cbid": 317, "correlation": 238212 } }, { "ph": "f", "id": 238212, "pid": 76337, "tid": -914061504, "ts": 1716454225169599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225169599, "dur": 0, "args": { "External id": 238213, "cbid": 203, "correlation": 238213 } }, { "ph": "f", "id": 238213, "pid": 76337, "tid": -914061504, "ts": 1716454225169599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225169600, "dur": 0, "args": { "External id": 238214, "cbid": 205, "correlation": 238214 } }, { "ph": "f", "id": 238214, "pid": 76337, "tid": -914061504, "ts": 1716454225169600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225223227, "dur": 6, "args": { "External id": 238218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238218, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238218, "pid": 5, "tid": 7, "ts": 1716454225223227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169613, "dur": 11, "args": { "External id": 238218, "cbid": 211, "correlation": 238218 } }, { "ph": "s", "id": 238218, "pid": 76337, "tid": -914061504, "ts": 1716454225169613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225223235, "dur": 314, "args": { "External id": 238220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238220, "pid": 5, "tid": 7, "ts": 1716454225223235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169627, "dur": 6, "args": { "External id": 238220, "cbid": 211, "correlation": 238220 } }, { "ph": "s", "id": 238220, "pid": 76337, "tid": -914061504, "ts": 1716454225169627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225223551, "dur": 1, "args": { "External id": 238222, "device": 5, "context": 1, "stream": 7, "correlation": 238222, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 238222, "pid": 5, "tid": 7, "ts": 1716454225223551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225169639, "dur": 7, "args": { "External id": 238222, "cbid": 51, "correlation": 238222 } }, { "ph": "s", "id": 238222, "pid": 76337, "tid": -914061504, "ts": 1716454225169639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225223555, "dur": 488, "args": { "External id": 238223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238223, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238223, "pid": 5, "tid": 7, "ts": 1716454225223555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169647, "dur": 6, "args": { "External id": 238223, "cbid": 211, "correlation": 238223 } }, { "ph": "s", "id": 238223, "pid": 76337, "tid": -914061504, "ts": 1716454225169647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224045, "dur": 5, "args": { "External id": 238225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238225, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238225, "pid": 5, "tid": 7, "ts": 1716454225224045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169656, "dur": 5, "args": { "External id": 238225, "cbid": 211, "correlation": 238225 } }, { "ph": "s", "id": 238225, "pid": 76337, "tid": -914061504, "ts": 1716454225169656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224051, "dur": 6, "args": { "External id": 238231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238231, "pid": 5, "tid": 7, "ts": 1716454225224051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169686, "dur": 8, "args": { "External id": 238231, "cbid": 211, "correlation": 238231 } }, { "ph": "s", "id": 238231, "pid": 76337, "tid": -914061504, "ts": 1716454225169686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225224059, "dur": 3, "args": { "External id": 238239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238239, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 238239, "pid": 5, "tid": 7, "ts": 1716454225224059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169729, "dur": 9, "args": { "External id": 238239, "cbid": 211, "correlation": 238239 } }, { "ph": "s", "id": 238239, "pid": 76337, "tid": -914061504, "ts": 1716454225169729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225169791, "dur": 1, "args": { "External id": 238255, "cbid": 251, "correlation": 238255 } }, { "ph": "f", "id": 238255, "pid": 76337, "tid": -914061504, "ts": 1716454225169791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225169796, "dur": 0, "args": { "External id": 238257, "cbid": 251, "correlation": 238257 } }, { "ph": "f", "id": 238257, "pid": 76337, "tid": -914061504, "ts": 1716454225169796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225224063, "dur": 12, "args": { "External id": 238258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238258, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238258, "pid": 5, "tid": 7, "ts": 1716454225224063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169798, "dur": 11, "args": { "External id": 238258, "cbid": 211, "correlation": 238258 } }, { "ph": "s", "id": 238258, "pid": 76337, "tid": -914061504, "ts": 1716454225169798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225224077, "dur": 5, "args": { "External id": 238260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238260, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238260, "pid": 5, "tid": 7, "ts": 1716454225224077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169811, "dur": 5, "args": { "External id": 238260, "cbid": 211, "correlation": 238260 } }, { "ph": "s", "id": 238260, "pid": 76337, "tid": -914061504, "ts": 1716454225169811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224083, "dur": 6, "args": { "External id": 238270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238270, "pid": 5, "tid": 7, "ts": 1716454225224083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169867, "dur": 14, "args": { "External id": 238270, "cbid": 211, "correlation": 238270 } }, { "ph": "s", "id": 238270, "pid": 76337, "tid": -914061504, "ts": 1716454225169867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225224091, "dur": 9, "args": { "External id": 238290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238290, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238290, "pid": 5, "tid": 7, "ts": 1716454225224091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169935, "dur": 11, "args": { "External id": 238290, "cbid": 211, "correlation": 238290 } }, { "ph": "s", "id": 238290, "pid": 76337, "tid": -914061504, "ts": 1716454225169935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225224101, "dur": 3, "args": { "External id": 238302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238302, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 238302, "pid": 5, "tid": 7, "ts": 1716454225224101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169955, "dur": 6, "args": { "External id": 238302, "cbid": 211, "correlation": 238302 } }, { "ph": "s", "id": 238302, "pid": 76337, "tid": -914061504, "ts": 1716454225169955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224106, "dur": 6, "args": { "External id": 238305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238305, "pid": 5, "tid": 7, "ts": 1716454225224106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225169983, "dur": 7, "args": { "External id": 238305, "cbid": 211, "correlation": 238305 } }, { "ph": "s", "id": 238305, "pid": 76337, "tid": -914061504, "ts": 1716454225169983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225224113, "dur": 4, "args": { "External id": 238314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238314, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238314, "pid": 5, "tid": 7, "ts": 1716454225224113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170025, "dur": 10, "args": { "External id": 238314, "cbid": 211, "correlation": 238314 } }, { "ph": "s", "id": 238314, "pid": 76337, "tid": -914061504, "ts": 1716454225170025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225170088, "dur": 0, "args": { "External id": 238324, "cbid": 317, "correlation": 238324 } }, { "ph": "f", "id": 238324, "pid": 76337, "tid": -914061504, "ts": 1716454225170088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225170088, "dur": 0, "args": { "External id": 238325, "cbid": 203, "correlation": 238325 } }, { "ph": "f", "id": 238325, "pid": 76337, "tid": -914061504, "ts": 1716454225170088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225170089, "dur": 0, "args": { "External id": 238326, "cbid": 205, "correlation": 238326 } }, { "ph": "f", "id": 238326, "pid": 76337, "tid": -914061504, "ts": 1716454225170089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224119, "dur": 5, "args": { "External id": 238330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238330, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238330, "pid": 5, "tid": 7, "ts": 1716454225224119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170103, "dur": 12, "args": { "External id": 238330, "cbid": 211, "correlation": 238330 } }, { "ph": "s", "id": 238330, "pid": 76337, "tid": -914061504, "ts": 1716454225170103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224125, "dur": 160, "args": { "External id": 238332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238332, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238332, "pid": 5, "tid": 7, "ts": 1716454225224125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170117, "dur": 5, "args": { "External id": 238332, "cbid": 211, "correlation": 238332 } }, { "ph": "s", "id": 238332, "pid": 76337, "tid": -914061504, "ts": 1716454225170117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225224287, "dur": 1, "args": { "External id": 238334, "device": 5, "context": 1, "stream": 7, "correlation": 238334, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 238334, "pid": 5, "tid": 7, "ts": 1716454225224287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225170128, "dur": 7, "args": { "External id": 238334, "cbid": 51, "correlation": 238334 } }, { "ph": "s", "id": 238334, "pid": 76337, "tid": -914061504, "ts": 1716454225170128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225224291, "dur": 255, "args": { "External id": 238335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238335, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238335, "pid": 5, "tid": 7, "ts": 1716454225224291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170136, "dur": 6, "args": { "External id": 238335, "cbid": 211, "correlation": 238335 } }, { "ph": "s", "id": 238335, "pid": 76337, "tid": -914061504, "ts": 1716454225170136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224547, "dur": 6, "args": { "External id": 238337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238337, "pid": 5, "tid": 7, "ts": 1716454225224547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170145, "dur": 5, "args": { "External id": 238337, "cbid": 211, "correlation": 238337 } }, { "ph": "s", "id": 238337, "pid": 76337, "tid": -914061504, "ts": 1716454225170145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224554, "dur": 6, "args": { "External id": 238343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238343, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238343, "pid": 5, "tid": 7, "ts": 1716454225224554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170173, "dur": 10, "args": { "External id": 238343, "cbid": 211, "correlation": 238343 } }, { "ph": "s", "id": 238343, "pid": 76337, "tid": -914061504, "ts": 1716454225170173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225170232, "dur": 0, "args": { "External id": 238353, "cbid": 317, "correlation": 238353 } }, { "ph": "f", "id": 238353, "pid": 76337, "tid": -914061504, "ts": 1716454225170232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225170233, "dur": 0, "args": { "External id": 238354, "cbid": 203, "correlation": 238354 } }, { "ph": "f", "id": 238354, "pid": 76337, "tid": -914061504, "ts": 1716454225170233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225170233, "dur": 0, "args": { "External id": 238355, "cbid": 205, "correlation": 238355 } }, { "ph": "f", "id": 238355, "pid": 76337, "tid": -914061504, "ts": 1716454225170233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224561, "dur": 7, "args": { "External id": 238359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238359, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238359, "pid": 5, "tid": 7, "ts": 1716454225224561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170244, "dur": 12, "args": { "External id": 238359, "cbid": 211, "correlation": 238359 } }, { "ph": "s", "id": 238359, "pid": 76337, "tid": -914061504, "ts": 1716454225170244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225224570, "dur": 3, "args": { "External id": 238361, "device": 5, "context": 1, "stream": 7, "correlation": 238361, "bytes": 4800, "memory bandwidth (GB/s)": 1.4150943396226414 } }, { "ph": "f", "id": 238361, "pid": 5, "tid": 7, "ts": 1716454225224570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225170261, "dur": 9, "args": { "External id": 238361, "cbid": 51, "correlation": 238361 } }, { "ph": "s", "id": 238361, "pid": 76337, "tid": -914061504, "ts": 1716454225170261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225224574, "dur": 95, "args": { "External id": 238362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238362, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 238362, "pid": 5, "tid": 7, "ts": 1716454225224574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170272, "dur": 7, "args": { "External id": 238362, "cbid": 211, "correlation": 238362 } }, { "ph": "s", "id": 238362, "pid": 76337, "tid": -914061504, "ts": 1716454225170272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224670, "dur": 5, "args": { "External id": 238364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238364, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238364, "pid": 5, "tid": 7, "ts": 1716454225224670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170282, "dur": 5, "args": { "External id": 238364, "cbid": 211, "correlation": 238364 } }, { "ph": "s", "id": 238364, "pid": 76337, "tid": -914061504, "ts": 1716454225170282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224677, "dur": 6, "args": { "External id": 238370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238370, "pid": 5, "tid": 7, "ts": 1716454225224677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170308, "dur": 8, "args": { "External id": 238370, "cbid": 211, "correlation": 238370 } }, { "ph": "s", "id": 238370, "pid": 76337, "tid": -914061504, "ts": 1716454225170308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225224685, "dur": 5, "args": { "External id": 238378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238378, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238378, "pid": 5, "tid": 7, "ts": 1716454225224685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170338, "dur": 8, "args": { "External id": 238378, "cbid": 211, "correlation": 238378 } }, { "ph": "s", "id": 238378, "pid": 76337, "tid": -914061504, "ts": 1716454225170338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225224691, "dur": 4, "args": { "External id": 238386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238386, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238386, "pid": 5, "tid": 7, "ts": 1716454225224691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170366, "dur": 9, "args": { "External id": 238386, "cbid": 211, "correlation": 238386 } }, { "ph": "s", "id": 238386, "pid": 76337, "tid": -914061504, "ts": 1716454225170366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225224696, "dur": 11, "args": { "External id": 238395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238395, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238395, "pid": 5, "tid": 7, "ts": 1716454225224696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170409, "dur": 10, "args": { "External id": 238395, "cbid": 211, "correlation": 238395 } }, { "ph": "s", "id": 238395, "pid": 76337, "tid": -914061504, "ts": 1716454225170409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225224709, "dur": 12, "args": { "External id": 238415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238415, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238415, "pid": 5, "tid": 7, "ts": 1716454225224709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170478, "dur": 11, "args": { "External id": 238415, "cbid": 211, "correlation": 238415 } }, { "ph": "s", "id": 238415, "pid": 76337, "tid": -914061504, "ts": 1716454225170478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225224722, "dur": 4, "args": { "External id": 238427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238427, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238427, "pid": 5, "tid": 7, "ts": 1716454225224722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170499, "dur": 6, "args": { "External id": 238427, "cbid": 211, "correlation": 238427 } }, { "ph": "s", "id": 238427, "pid": 76337, "tid": -914061504, "ts": 1716454225170499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225224728, "dur": 11, "args": { "External id": 238430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238430, "pid": 5, "tid": 7, "ts": 1716454225224728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170516, "dur": 7, "args": { "External id": 238430, "cbid": 211, "correlation": 238430 } }, { "ph": "s", "id": 238430, "pid": 76337, "tid": -914061504, "ts": 1716454225170516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225224740, "dur": 6, "args": { "External id": 238439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238439, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238439, "pid": 5, "tid": 7, "ts": 1716454225224740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170555, "dur": 10, "args": { "External id": 238439, "cbid": 211, "correlation": 238439 } }, { "ph": "s", "id": 238439, "pid": 76337, "tid": -914061504, "ts": 1716454225170555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225170607, "dur": 0, "args": { "External id": 238449, "cbid": 317, "correlation": 238449 } }, { "ph": "f", "id": 238449, "pid": 76337, "tid": -914061504, "ts": 1716454225170607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225170608, "dur": 0, "args": { "External id": 238450, "cbid": 203, "correlation": 238450 } }, { "ph": "f", "id": 238450, "pid": 76337, "tid": -914061504, "ts": 1716454225170608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225170609, "dur": 0, "args": { "External id": 238451, "cbid": 205, "correlation": 238451 } }, { "ph": "f", "id": 238451, "pid": 76337, "tid": -914061504, "ts": 1716454225170609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224747, "dur": 6, "args": { "External id": 238455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238455, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238455, "pid": 5, "tid": 7, "ts": 1716454225224747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170622, "dur": 12, "args": { "External id": 238455, "cbid": 211, "correlation": 238455 } }, { "ph": "s", "id": 238455, "pid": 76337, "tid": -914061504, "ts": 1716454225170622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225224755, "dur": 314, "args": { "External id": 238457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238457, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238457, "pid": 5, "tid": 7, "ts": 1716454225224755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170637, "dur": 5, "args": { "External id": 238457, "cbid": 211, "correlation": 238457 } }, { "ph": "s", "id": 238457, "pid": 76337, "tid": -914061504, "ts": 1716454225170637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225225071, "dur": 1, "args": { "External id": 238459, "device": 5, "context": 1, "stream": 7, "correlation": 238459, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 238459, "pid": 5, "tid": 7, "ts": 1716454225225071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225170647, "dur": 6, "args": { "External id": 238459, "cbid": 51, "correlation": 238459 } }, { "ph": "s", "id": 238459, "pid": 76337, "tid": -914061504, "ts": 1716454225170647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225225075, "dur": 488, "args": { "External id": 238460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238460, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238460, "pid": 5, "tid": 7, "ts": 1716454225225075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170654, "dur": 6, "args": { "External id": 238460, "cbid": 211, "correlation": 238460 } }, { "ph": "s", "id": 238460, "pid": 76337, "tid": -914061504, "ts": 1716454225170654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225225564, "dur": 6, "args": { "External id": 238462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238462, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238462, "pid": 5, "tid": 7, "ts": 1716454225225564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170665, "dur": 5, "args": { "External id": 238462, "cbid": 211, "correlation": 238462 } }, { "ph": "s", "id": 238462, "pid": 76337, "tid": -914061504, "ts": 1716454225170665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225225571, "dur": 6, "args": { "External id": 238468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238468, "pid": 5, "tid": 7, "ts": 1716454225225571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170693, "dur": 8, "args": { "External id": 238468, "cbid": 211, "correlation": 238468 } }, { "ph": "s", "id": 238468, "pid": 76337, "tid": -914061504, "ts": 1716454225170693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225225578, "dur": 3, "args": { "External id": 238476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238476, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 238476, "pid": 5, "tid": 7, "ts": 1716454225225578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170736, "dur": 9, "args": { "External id": 238476, "cbid": 211, "correlation": 238476 } }, { "ph": "s", "id": 238476, "pid": 76337, "tid": -914061504, "ts": 1716454225170736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225170798, "dur": 1, "args": { "External id": 238492, "cbid": 251, "correlation": 238492 } }, { "ph": "f", "id": 238492, "pid": 76337, "tid": -914061504, "ts": 1716454225170798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225170803, "dur": 0, "args": { "External id": 238494, "cbid": 251, "correlation": 238494 } }, { "ph": "f", "id": 238494, "pid": 76337, "tid": -914061504, "ts": 1716454225170803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225225582, "dur": 12, "args": { "External id": 238495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238495, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238495, "pid": 5, "tid": 7, "ts": 1716454225225582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170805, "dur": 11, "args": { "External id": 238495, "cbid": 211, "correlation": 238495 } }, { "ph": "s", "id": 238495, "pid": 76337, "tid": -914061504, "ts": 1716454225170805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225225596, "dur": 5, "args": { "External id": 238497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238497, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238497, "pid": 5, "tid": 7, "ts": 1716454225225596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170817, "dur": 6, "args": { "External id": 238497, "cbid": 211, "correlation": 238497 } }, { "ph": "s", "id": 238497, "pid": 76337, "tid": -914061504, "ts": 1716454225170817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225225602, "dur": 6, "args": { "External id": 238507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238507, "pid": 5, "tid": 7, "ts": 1716454225225602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170874, "dur": 12, "args": { "External id": 238507, "cbid": 211, "correlation": 238507 } }, { "ph": "s", "id": 238507, "pid": 76337, "tid": -914061504, "ts": 1716454225170874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225225609, "dur": 9, "args": { "External id": 238527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238527, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238527, "pid": 5, "tid": 7, "ts": 1716454225225609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170939, "dur": 10, "args": { "External id": 238527, "cbid": 211, "correlation": 238527 } }, { "ph": "s", "id": 238527, "pid": 76337, "tid": -914061504, "ts": 1716454225170939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225225620, "dur": 4, "args": { "External id": 238539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238539, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 238539, "pid": 5, "tid": 7, "ts": 1716454225225620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170959, "dur": 6, "args": { "External id": 238539, "cbid": 211, "correlation": 238539 } }, { "ph": "s", "id": 238539, "pid": 76337, "tid": -914061504, "ts": 1716454225170959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225225625, "dur": 7, "args": { "External id": 238542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238542, "pid": 5, "tid": 7, "ts": 1716454225225625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225170986, "dur": 7, "args": { "External id": 238542, "cbid": 211, "correlation": 238542 } }, { "ph": "s", "id": 238542, "pid": 76337, "tid": -914061504, "ts": 1716454225170986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225225633, "dur": 4, "args": { "External id": 238551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238551, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238551, "pid": 5, "tid": 7, "ts": 1716454225225633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171027, "dur": 9, "args": { "External id": 238551, "cbid": 211, "correlation": 238551 } }, { "ph": "s", "id": 238551, "pid": 76337, "tid": -914061504, "ts": 1716454225171027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225171089, "dur": 0, "args": { "External id": 238561, "cbid": 317, "correlation": 238561 } }, { "ph": "f", "id": 238561, "pid": 76337, "tid": -914061504, "ts": 1716454225171089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225171090, "dur": 0, "args": { "External id": 238562, "cbid": 203, "correlation": 238562 } }, { "ph": "f", "id": 238562, "pid": 76337, "tid": -914061504, "ts": 1716454225171090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225171090, "dur": 0, "args": { "External id": 238563, "cbid": 205, "correlation": 238563 } }, { "ph": "f", "id": 238563, "pid": 76337, "tid": -914061504, "ts": 1716454225171090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225225639, "dur": 5, "args": { "External id": 238567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238567, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238567, "pid": 5, "tid": 7, "ts": 1716454225225639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171104, "dur": 12, "args": { "External id": 238567, "cbid": 211, "correlation": 238567 } }, { "ph": "s", "id": 238567, "pid": 76337, "tid": -914061504, "ts": 1716454225171104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225225645, "dur": 159, "args": { "External id": 238569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238569, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238569, "pid": 5, "tid": 7, "ts": 1716454225225645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171118, "dur": 5, "args": { "External id": 238569, "cbid": 211, "correlation": 238569 } }, { "ph": "s", "id": 238569, "pid": 76337, "tid": -914061504, "ts": 1716454225171118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225225806, "dur": 1, "args": { "External id": 238571, "device": 5, "context": 1, "stream": 7, "correlation": 238571, "bytes": 240, "memory bandwidth (GB/s)": 0.15296367112810708 } }, { "ph": "f", "id": 238571, "pid": 5, "tid": 7, "ts": 1716454225225806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225171129, "dur": 6, "args": { "External id": 238571, "cbid": 51, "correlation": 238571 } }, { "ph": "s", "id": 238571, "pid": 76337, "tid": -914061504, "ts": 1716454225171129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225225810, "dur": 254, "args": { "External id": 238572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238572, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238572, "pid": 5, "tid": 7, "ts": 1716454225225810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171136, "dur": 6, "args": { "External id": 238572, "cbid": 211, "correlation": 238572 } }, { "ph": "s", "id": 238572, "pid": 76337, "tid": -914061504, "ts": 1716454225171136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225226065, "dur": 5, "args": { "External id": 238574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238574, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238574, "pid": 5, "tid": 7, "ts": 1716454225226065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171146, "dur": 5, "args": { "External id": 238574, "cbid": 211, "correlation": 238574 } }, { "ph": "s", "id": 238574, "pid": 76337, "tid": -914061504, "ts": 1716454225171146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225226072, "dur": 6, "args": { "External id": 238580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238580, "pid": 5, "tid": 7, "ts": 1716454225226072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171174, "dur": 9, "args": { "External id": 238580, "cbid": 211, "correlation": 238580 } }, { "ph": "s", "id": 238580, "pid": 76337, "tid": -914061504, "ts": 1716454225171174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225171233, "dur": 0, "args": { "External id": 238590, "cbid": 317, "correlation": 238590 } }, { "ph": "f", "id": 238590, "pid": 76337, "tid": -914061504, "ts": 1716454225171233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225171234, "dur": 0, "args": { "External id": 238591, "cbid": 203, "correlation": 238591 } }, { "ph": "f", "id": 238591, "pid": 76337, "tid": -914061504, "ts": 1716454225171234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225171235, "dur": 0, "args": { "External id": 238592, "cbid": 205, "correlation": 238592 } }, { "ph": "f", "id": 238592, "pid": 76337, "tid": -914061504, "ts": 1716454225171235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225226080, "dur": 8, "args": { "External id": 238596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238596, "pid": 5, "tid": 7, "ts": 1716454225226080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171246, "dur": 11, "args": { "External id": 238596, "cbid": 211, "correlation": 238596 } }, { "ph": "s", "id": 238596, "pid": 76337, "tid": -914061504, "ts": 1716454225171246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225226088, "dur": 3, "args": { "External id": 238598, "device": 5, "context": 1, "stream": 7, "correlation": 238598, "bytes": 4800, "memory bandwidth (GB/s)": 1.5151515151515151 } }, { "ph": "f", "id": 238598, "pid": 5, "tid": 7, "ts": 1716454225226088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225171262, "dur": 9, "args": { "External id": 238598, "cbid": 51, "correlation": 238598 } }, { "ph": "s", "id": 238598, "pid": 76337, "tid": -914061504, "ts": 1716454225171262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225226092, "dur": 93, "args": { "External id": 238599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238599, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 238599, "pid": 5, "tid": 7, "ts": 1716454225226092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171272, "dur": 7, "args": { "External id": 238599, "cbid": 211, "correlation": 238599 } }, { "ph": "s", "id": 238599, "pid": 76337, "tid": -914061504, "ts": 1716454225171272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225226187, "dur": 6, "args": { "External id": 238601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238601, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238601, "pid": 5, "tid": 7, "ts": 1716454225226187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171282, "dur": 5, "args": { "External id": 238601, "cbid": 211, "correlation": 238601 } }, { "ph": "s", "id": 238601, "pid": 76337, "tid": -914061504, "ts": 1716454225171282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225226194, "dur": 6, "args": { "External id": 238607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238607, "pid": 5, "tid": 7, "ts": 1716454225226194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171308, "dur": 8, "args": { "External id": 238607, "cbid": 211, "correlation": 238607 } }, { "ph": "s", "id": 238607, "pid": 76337, "tid": -914061504, "ts": 1716454225171308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225226201, "dur": 5, "args": { "External id": 238615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238615, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238615, "pid": 5, "tid": 7, "ts": 1716454225226201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171338, "dur": 8, "args": { "External id": 238615, "cbid": 211, "correlation": 238615 } }, { "ph": "s", "id": 238615, "pid": 76337, "tid": -914061504, "ts": 1716454225171338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225226207, "dur": 4, "args": { "External id": 238623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238623, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 238623, "pid": 5, "tid": 7, "ts": 1716454225226207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171366, "dur": 8, "args": { "External id": 238623, "cbid": 211, "correlation": 238623 } }, { "ph": "s", "id": 238623, "pid": 76337, "tid": -914061504, "ts": 1716454225171366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225226213, "dur": 14, "args": { "External id": 238634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238634, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238634, "pid": 5, "tid": 7, "ts": 1716454225226213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171445, "dur": 12, "args": { "External id": 238634, "cbid": 211, "correlation": 238634 } }, { "ph": "s", "id": 238634, "pid": 76337, "tid": -914061504, "ts": 1716454225171445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225171500, "dur": 0, "args": { "External id": 238644, "cbid": 317, "correlation": 238644 } }, { "ph": "f", "id": 238644, "pid": 76337, "tid": -914061504, "ts": 1716454225171500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225171500, "dur": 0, "args": { "External id": 238645, "cbid": 203, "correlation": 238645 } }, { "ph": "f", "id": 238645, "pid": 76337, "tid": -914061504, "ts": 1716454225171500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225171501, "dur": 0, "args": { "External id": 238646, "cbid": 205, "correlation": 238646 } }, { "ph": "f", "id": 238646, "pid": 76337, "tid": -914061504, "ts": 1716454225171501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225226228, "dur": 8, "args": { "External id": 238650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238650, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238650, "pid": 5, "tid": 7, "ts": 1716454225226228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171514, "dur": 12, "args": { "External id": 238650, "cbid": 211, "correlation": 238650 } }, { "ph": "s", "id": 238650, "pid": 76337, "tid": -914061504, "ts": 1716454225171514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225226238, "dur": 160, "args": { "External id": 238652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238652, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238652, "pid": 5, "tid": 7, "ts": 1716454225226238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171529, "dur": 5, "args": { "External id": 238652, "cbid": 211, "correlation": 238652 } }, { "ph": "s", "id": 238652, "pid": 76337, "tid": -914061504, "ts": 1716454225171529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225226399, "dur": 1, "args": { "External id": 238654, "device": 5, "context": 1, "stream": 7, "correlation": 238654, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 238654, "pid": 5, "tid": 7, "ts": 1716454225226399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225171539, "dur": 7, "args": { "External id": 238654, "cbid": 51, "correlation": 238654 } }, { "ph": "s", "id": 238654, "pid": 76337, "tid": -914061504, "ts": 1716454225171539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225226403, "dur": 641, "args": { "External id": 238655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238655, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238655, "pid": 5, "tid": 7, "ts": 1716454225226403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171548, "dur": 6, "args": { "External id": 238655, "cbid": 211, "correlation": 238655 } }, { "ph": "s", "id": 238655, "pid": 76337, "tid": -914061504, "ts": 1716454225171548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225227046, "dur": 12, "args": { "External id": 238657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238657, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238657, "pid": 5, "tid": 7, "ts": 1716454225227046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171558, "dur": 5, "args": { "External id": 238657, "cbid": 211, "correlation": 238657 } }, { "ph": "s", "id": 238657, "pid": 76337, "tid": -914061504, "ts": 1716454225171558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225227059, "dur": 14, "args": { "External id": 238663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238663, "pid": 5, "tid": 7, "ts": 1716454225227059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171587, "dur": 9, "args": { "External id": 238663, "cbid": 211, "correlation": 238663 } }, { "ph": "s", "id": 238663, "pid": 76337, "tid": -914061504, "ts": 1716454225171587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225227075, "dur": 31, "args": { "External id": 238672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238672, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238672, "pid": 5, "tid": 7, "ts": 1716454225227075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171678, "dur": 12, "args": { "External id": 238672, "cbid": 211, "correlation": 238672 } }, { "ph": "s", "id": 238672, "pid": 76337, "tid": -914061504, "ts": 1716454225171678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225227107, "dur": 29, "args": { "External id": 238692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238692, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238692, "pid": 5, "tid": 7, "ts": 1716454225227107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171745, "dur": 11, "args": { "External id": 238692, "cbid": 211, "correlation": 238692 } }, { "ph": "s", "id": 238692, "pid": 76337, "tid": -914061504, "ts": 1716454225171745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225227136, "dur": 4, "args": { "External id": 238704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238704, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238704, "pid": 5, "tid": 7, "ts": 1716454225227136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171765, "dur": 6, "args": { "External id": 238704, "cbid": 211, "correlation": 238704 } }, { "ph": "s", "id": 238704, "pid": 76337, "tid": -914061504, "ts": 1716454225171765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225227142, "dur": 30, "args": { "External id": 238707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238707, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238707, "pid": 5, "tid": 7, "ts": 1716454225227142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171784, "dur": 6, "args": { "External id": 238707, "cbid": 211, "correlation": 238707 } }, { "ph": "s", "id": 238707, "pid": 76337, "tid": -914061504, "ts": 1716454225171784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225227173, "dur": 21, "args": { "External id": 238716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238716, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238716, "pid": 5, "tid": 7, "ts": 1716454225227173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171822, "dur": 10, "args": { "External id": 238716, "cbid": 211, "correlation": 238716 } }, { "ph": "s", "id": 238716, "pid": 76337, "tid": -914061504, "ts": 1716454225171822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225171874, "dur": 0, "args": { "External id": 238726, "cbid": 317, "correlation": 238726 } }, { "ph": "f", "id": 238726, "pid": 76337, "tid": -914061504, "ts": 1716454225171874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225171875, "dur": 0, "args": { "External id": 238727, "cbid": 203, "correlation": 238727 } }, { "ph": "f", "id": 238727, "pid": 76337, "tid": -914061504, "ts": 1716454225171875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225171876, "dur": 0, "args": { "External id": 238728, "cbid": 205, "correlation": 238728 } }, { "ph": "f", "id": 238728, "pid": 76337, "tid": -914061504, "ts": 1716454225171876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225227195, "dur": 22, "args": { "External id": 238732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238732, "pid": 5, "tid": 7, "ts": 1716454225227195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171891, "dur": 11, "args": { "External id": 238732, "cbid": 211, "correlation": 238732 } }, { "ph": "s", "id": 238732, "pid": 76337, "tid": -914061504, "ts": 1716454225171891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225227219, "dur": 313, "args": { "External id": 238734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238734, "pid": 5, "tid": 7, "ts": 1716454225227219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171905, "dur": 5, "args": { "External id": 238734, "cbid": 211, "correlation": 238734 } }, { "ph": "s", "id": 238734, "pid": 76337, "tid": -914061504, "ts": 1716454225171905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225227534, "dur": 1, "args": { "External id": 238736, "device": 5, "context": 1, "stream": 7, "correlation": 238736, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 238736, "pid": 5, "tid": 7, "ts": 1716454225227534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225171915, "dur": 7, "args": { "External id": 238736, "cbid": 51, "correlation": 238736 } }, { "ph": "s", "id": 238736, "pid": 76337, "tid": -914061504, "ts": 1716454225171915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225227538, "dur": 1225, "args": { "External id": 238737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238737, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238737, "pid": 5, "tid": 7, "ts": 1716454225227538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171923, "dur": 6, "args": { "External id": 238737, "cbid": 211, "correlation": 238737 } }, { "ph": "s", "id": 238737, "pid": 76337, "tid": -914061504, "ts": 1716454225171923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225228764, "dur": 12, "args": { "External id": 238739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238739, "pid": 5, "tid": 7, "ts": 1716454225228764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171933, "dur": 5, "args": { "External id": 238739, "cbid": 211, "correlation": 238739 } }, { "ph": "s", "id": 238739, "pid": 76337, "tid": -914061504, "ts": 1716454225171933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225228778, "dur": 14, "args": { "External id": 238745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238745, "pid": 5, "tid": 7, "ts": 1716454225228778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225171961, "dur": 8, "args": { "External id": 238745, "cbid": 211, "correlation": 238745 } }, { "ph": "s", "id": 238745, "pid": 76337, "tid": -914061504, "ts": 1716454225171961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225228793, "dur": 3, "args": { "External id": 238753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238753, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 238753, "pid": 5, "tid": 7, "ts": 1716454225228793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172012, "dur": 10, "args": { "External id": 238753, "cbid": 211, "correlation": 238753 } }, { "ph": "s", "id": 238753, "pid": 76337, "tid": -914061504, "ts": 1716454225172012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225172076, "dur": 1, "args": { "External id": 238769, "cbid": 251, "correlation": 238769 } }, { "ph": "f", "id": 238769, "pid": 76337, "tid": -914061504, "ts": 1716454225172076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225172082, "dur": 0, "args": { "External id": 238771, "cbid": 251, "correlation": 238771 } }, { "ph": "f", "id": 238771, "pid": 76337, "tid": -914061504, "ts": 1716454225172082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225228798, "dur": 12, "args": { "External id": 238772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238772, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238772, "pid": 5, "tid": 7, "ts": 1716454225228798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172084, "dur": 11, "args": { "External id": 238772, "cbid": 211, "correlation": 238772 } }, { "ph": "s", "id": 238772, "pid": 76337, "tid": -914061504, "ts": 1716454225172084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225228811, "dur": 5, "args": { "External id": 238774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238774, "pid": 5, "tid": 7, "ts": 1716454225228811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172097, "dur": 5, "args": { "External id": 238774, "cbid": 211, "correlation": 238774 } }, { "ph": "s", "id": 238774, "pid": 76337, "tid": -914061504, "ts": 1716454225172097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225228817, "dur": 18, "args": { "External id": 238784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238784, "pid": 5, "tid": 7, "ts": 1716454225228817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172154, "dur": 12, "args": { "External id": 238784, "cbid": 211, "correlation": 238784 } }, { "ph": "s", "id": 238784, "pid": 76337, "tid": -914061504, "ts": 1716454225172154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225228836, "dur": 17, "args": { "External id": 238804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238804, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238804, "pid": 5, "tid": 7, "ts": 1716454225228836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172219, "dur": 12, "args": { "External id": 238804, "cbid": 211, "correlation": 238804 } }, { "ph": "s", "id": 238804, "pid": 76337, "tid": -914061504, "ts": 1716454225172219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225228854, "dur": 4, "args": { "External id": 238816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238816, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 238816, "pid": 5, "tid": 7, "ts": 1716454225228854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172241, "dur": 6, "args": { "External id": 238816, "cbid": 211, "correlation": 238816 } }, { "ph": "s", "id": 238816, "pid": 76337, "tid": -914061504, "ts": 1716454225172241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225228860, "dur": 16, "args": { "External id": 238819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238819, "pid": 5, "tid": 7, "ts": 1716454225228860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172259, "dur": 6, "args": { "External id": 238819, "cbid": 211, "correlation": 238819 } }, { "ph": "s", "id": 238819, "pid": 76337, "tid": -914061504, "ts": 1716454225172259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225228877, "dur": 11, "args": { "External id": 238828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238828, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238828, "pid": 5, "tid": 7, "ts": 1716454225228877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172299, "dur": 10, "args": { "External id": 238828, "cbid": 211, "correlation": 238828 } }, { "ph": "s", "id": 238828, "pid": 76337, "tid": -914061504, "ts": 1716454225172299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225172360, "dur": 0, "args": { "External id": 238838, "cbid": 317, "correlation": 238838 } }, { "ph": "f", "id": 238838, "pid": 76337, "tid": -914061504, "ts": 1716454225172360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225172361, "dur": 0, "args": { "External id": 238839, "cbid": 203, "correlation": 238839 } }, { "ph": "f", "id": 238839, "pid": 76337, "tid": -914061504, "ts": 1716454225172361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225172362, "dur": 0, "args": { "External id": 238840, "cbid": 205, "correlation": 238840 } }, { "ph": "f", "id": 238840, "pid": 76337, "tid": -914061504, "ts": 1716454225172362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225228890, "dur": 11, "args": { "External id": 238844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238844, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238844, "pid": 5, "tid": 7, "ts": 1716454225228890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172375, "dur": 12, "args": { "External id": 238844, "cbid": 211, "correlation": 238844 } }, { "ph": "s", "id": 238844, "pid": 76337, "tid": -914061504, "ts": 1716454225172375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225228902, "dur": 159, "args": { "External id": 238846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238846, "pid": 5, "tid": 7, "ts": 1716454225228902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172390, "dur": 5, "args": { "External id": 238846, "cbid": 211, "correlation": 238846 } }, { "ph": "s", "id": 238846, "pid": 76337, "tid": -914061504, "ts": 1716454225172390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225229063, "dur": 1, "args": { "External id": 238848, "device": 5, "context": 1, "stream": 7, "correlation": 238848, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 238848, "pid": 5, "tid": 7, "ts": 1716454225229063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225172401, "dur": 6, "args": { "External id": 238848, "cbid": 51, "correlation": 238848 } }, { "ph": "s", "id": 238848, "pid": 76337, "tid": -914061504, "ts": 1716454225172401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225229067, "dur": 640, "args": { "External id": 238849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238849, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 238849, "pid": 5, "tid": 7, "ts": 1716454225229067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172408, "dur": 6, "args": { "External id": 238849, "cbid": 211, "correlation": 238849 } }, { "ph": "s", "id": 238849, "pid": 76337, "tid": -914061504, "ts": 1716454225172408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225229709, "dur": 12, "args": { "External id": 238851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238851, "pid": 5, "tid": 7, "ts": 1716454225229709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172418, "dur": 5, "args": { "External id": 238851, "cbid": 211, "correlation": 238851 } }, { "ph": "s", "id": 238851, "pid": 76337, "tid": -914061504, "ts": 1716454225172418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225229722, "dur": 14, "args": { "External id": 238857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238857, "pid": 5, "tid": 7, "ts": 1716454225229722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172446, "dur": 8, "args": { "External id": 238857, "cbid": 211, "correlation": 238857 } }, { "ph": "s", "id": 238857, "pid": 76337, "tid": -914061504, "ts": 1716454225172446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225172505, "dur": 0, "args": { "External id": 238867, "cbid": 317, "correlation": 238867 } }, { "ph": "f", "id": 238867, "pid": 76337, "tid": -914061504, "ts": 1716454225172505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225172506, "dur": 0, "args": { "External id": 238868, "cbid": 203, "correlation": 238868 } }, { "ph": "f", "id": 238868, "pid": 76337, "tid": -914061504, "ts": 1716454225172506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225172506, "dur": 0, "args": { "External id": 238869, "cbid": 205, "correlation": 238869 } }, { "ph": "f", "id": 238869, "pid": 76337, "tid": -914061504, "ts": 1716454225172506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225229737, "dur": 21, "args": { "External id": 238873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238873, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238873, "pid": 5, "tid": 7, "ts": 1716454225229737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172519, "dur": 12, "args": { "External id": 238873, "cbid": 211, "correlation": 238873 } }, { "ph": "s", "id": 238873, "pid": 76337, "tid": -914061504, "ts": 1716454225172519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225229760, "dur": 4, "args": { "External id": 238875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 238875, "pid": 5, "tid": 7, "ts": 1716454225229760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172537, "dur": 6, "args": { "External id": 238875, "cbid": 211, "correlation": 238875 } }, { "ph": "s", "id": 238875, "pid": 76337, "tid": -914061504, "ts": 1716454225172537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225172546, "dur": 0, "args": { "External id": 238876, "cbid": 51, "correlation": 238876 } }, { "ph": "s", "id": 238876, "pid": 76337, "tid": -914061504, "ts": 1716454225172546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225229765, "dur": 172, "args": { "External id": 238877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238877, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 238877, "pid": 5, "tid": 7, "ts": 1716454225229765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172547, "dur": 6, "args": { "External id": 238877, "cbid": 211, "correlation": 238877 } }, { "ph": "s", "id": 238877, "pid": 76337, "tid": -914061504, "ts": 1716454225172547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225229938, "dur": 15, "args": { "External id": 238882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238882, "pid": 5, "tid": 7, "ts": 1716454225229938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172573, "dur": 9, "args": { "External id": 238882, "cbid": 211, "correlation": 238882 } }, { "ph": "s", "id": 238882, "pid": 76337, "tid": -914061504, "ts": 1716454225172573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225229954, "dur": 12, "args": { "External id": 238890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238890, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238890, "pid": 5, "tid": 7, "ts": 1716454225229954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172602, "dur": 8, "args": { "External id": 238890, "cbid": 211, "correlation": 238890 } }, { "ph": "s", "id": 238890, "pid": 76337, "tid": -914061504, "ts": 1716454225172602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225229967, "dur": 10, "args": { "External id": 238898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238898, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238898, "pid": 5, "tid": 7, "ts": 1716454225229967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172632, "dur": 8, "args": { "External id": 238898, "cbid": 211, "correlation": 238898 } }, { "ph": "s", "id": 238898, "pid": 76337, "tid": -914061504, "ts": 1716454225172632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225229978, "dur": 19, "args": { "External id": 238918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238918, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 238918, "pid": 5, "tid": 7, "ts": 1716454225229978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172712, "dur": 12, "args": { "External id": 238918, "cbid": 211, "correlation": 238918 } }, { "ph": "s", "id": 238918, "pid": 76337, "tid": -914061504, "ts": 1716454225172712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225229998, "dur": 4, "args": { "External id": 238930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238930, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 238930, "pid": 5, "tid": 7, "ts": 1716454225229998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172735, "dur": 6, "args": { "External id": 238930, "cbid": 211, "correlation": 238930 } }, { "ph": "s", "id": 238930, "pid": 76337, "tid": -914061504, "ts": 1716454225172735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225230004, "dur": 17, "args": { "External id": 238933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238933, "pid": 5, "tid": 7, "ts": 1716454225230004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172753, "dur": 6, "args": { "External id": 238933, "cbid": 211, "correlation": 238933 } }, { "ph": "s", "id": 238933, "pid": 76337, "tid": -914061504, "ts": 1716454225172753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225172809, "dur": 0, "args": { "External id": 238944, "cbid": 317, "correlation": 238944 } }, { "ph": "f", "id": 238944, "pid": 76337, "tid": -914061504, "ts": 1716454225172809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225172810, "dur": 0, "args": { "External id": 238945, "cbid": 203, "correlation": 238945 } }, { "ph": "f", "id": 238945, "pid": 76337, "tid": -914061504, "ts": 1716454225172810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225172811, "dur": 0, "args": { "External id": 238946, "cbid": 205, "correlation": 238946 } }, { "ph": "f", "id": 238946, "pid": 76337, "tid": -914061504, "ts": 1716454225172811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225230022, "dur": 11, "args": { "External id": 238950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238950, "pid": 5, "tid": 7, "ts": 1716454225230022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172824, "dur": 12, "args": { "External id": 238950, "cbid": 211, "correlation": 238950 } }, { "ph": "s", "id": 238950, "pid": 76337, "tid": -914061504, "ts": 1716454225172824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225230034, "dur": 3, "args": { "External id": 238952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238952, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 238952, "pid": 5, "tid": 7, "ts": 1716454225230034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172841, "dur": 6, "args": { "External id": 238952, "cbid": 211, "correlation": 238952 } }, { "ph": "s", "id": 238952, "pid": 76337, "tid": -914061504, "ts": 1716454225172841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225172849, "dur": 0, "args": { "External id": 238953, "cbid": 51, "correlation": 238953 } }, { "ph": "s", "id": 238953, "pid": 76337, "tid": -914061504, "ts": 1716454225172849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225230039, "dur": 88, "args": { "External id": 238954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238954, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 238954, "pid": 5, "tid": 7, "ts": 1716454225230039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172850, "dur": 5, "args": { "External id": 238954, "cbid": 211, "correlation": 238954 } }, { "ph": "s", "id": 238954, "pid": 76337, "tid": -914061504, "ts": 1716454225172850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225230129, "dur": 15, "args": { "External id": 238959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238959, "pid": 5, "tid": 7, "ts": 1716454225230129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172876, "dur": 9, "args": { "External id": 238959, "cbid": 211, "correlation": 238959 } }, { "ph": "s", "id": 238959, "pid": 76337, "tid": -914061504, "ts": 1716454225172876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225230145, "dur": 82, "args": { "External id": 238968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238968, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238968, "pid": 5, "tid": 7, "ts": 1716454225230145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225172958, "dur": 14, "args": { "External id": 238968, "cbid": 211, "correlation": 238968 } }, { "ph": "s", "id": 238968, "pid": 76337, "tid": -914061504, "ts": 1716454225172958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225230228, "dur": 30, "args": { "External id": 238990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 238990, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 238990, "pid": 5, "tid": 7, "ts": 1716454225230228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173026, "dur": 11, "args": { "External id": 238990, "cbid": 211, "correlation": 238990 } }, { "ph": "s", "id": 238990, "pid": 76337, "tid": -914061504, "ts": 1716454225173026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173118, "dur": 1, "args": { "External id": 239001, "cbid": 251, "correlation": 239001 } }, { "ph": "f", "id": 239001, "pid": 76337, "tid": -914061504, "ts": 1716454225173118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225230260, "dur": 138, "args": { "External id": 239002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239002, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239002, "pid": 5, "tid": 7, "ts": 1716454225230260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173123, "dur": 14, "args": { "External id": 239002, "cbid": 211, "correlation": 239002 } }, { "ph": "s", "id": 239002, "pid": 76337, "tid": -914061504, "ts": 1716454225173123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173195, "dur": 1, "args": { "External id": 239013, "cbid": 251, "correlation": 239013 } }, { "ph": "f", "id": 239013, "pid": 76337, "tid": -914061504, "ts": 1716454225173195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225230399, "dur": 156, "args": { "External id": 239014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239014, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239014, "pid": 5, "tid": 7, "ts": 1716454225230399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173199, "dur": 11, "args": { "External id": 239014, "cbid": 211, "correlation": 239014 } }, { "ph": "s", "id": 239014, "pid": 76337, "tid": -914061504, "ts": 1716454225173199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173265, "dur": 1, "args": { "External id": 239025, "cbid": 251, "correlation": 239025 } }, { "ph": "f", "id": 239025, "pid": 76337, "tid": -914061504, "ts": 1716454225173265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225230557, "dur": 156, "args": { "External id": 239026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239026, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239026, "pid": 5, "tid": 7, "ts": 1716454225230557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173269, "dur": 12, "args": { "External id": 239026, "cbid": 211, "correlation": 239026 } }, { "ph": "s", "id": 239026, "pid": 76337, "tid": -914061504, "ts": 1716454225173269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225230714, "dur": 333, "args": { "External id": 239051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239051, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239051, "pid": 5, "tid": 7, "ts": 1716454225230714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173356, "dur": 13, "args": { "External id": 239051, "cbid": 211, "correlation": 239051 } }, { "ph": "s", "id": 239051, "pid": 76337, "tid": -914061504, "ts": 1716454225173356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173456, "dur": 1, "args": { "External id": 239069, "cbid": 251, "correlation": 239069 } }, { "ph": "f", "id": 239069, "pid": 76337, "tid": -914061504, "ts": 1716454225173456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225231048, "dur": 164, "args": { "External id": 239071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239071, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239071, "pid": 5, "tid": 7, "ts": 1716454225231048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173462, "dur": 13, "args": { "External id": 239071, "cbid": 211, "correlation": 239071 } }, { "ph": "s", "id": 239071, "pid": 76337, "tid": -914061504, "ts": 1716454225173462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225231214, "dur": 19, "args": { "External id": 239079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239079, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239079, "pid": 5, "tid": 7, "ts": 1716454225231214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173533, "dur": 12, "args": { "External id": 239079, "cbid": 211, "correlation": 239079 } }, { "ph": "s", "id": 239079, "pid": 76337, "tid": -914061504, "ts": 1716454225173533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225231234, "dur": 27, "args": { "External id": 239087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239087, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239087, "pid": 5, "tid": 7, "ts": 1716454225231234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173571, "dur": 9, "args": { "External id": 239087, "cbid": 211, "correlation": 239087 } }, { "ph": "s", "id": 239087, "pid": 76337, "tid": -914061504, "ts": 1716454225173571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225231263, "dur": 18, "args": { "External id": 239098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239098, "pid": 5, "tid": 7, "ts": 1716454225231263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173644, "dur": 13, "args": { "External id": 239098, "cbid": 211, "correlation": 239098 } }, { "ph": "s", "id": 239098, "pid": 76337, "tid": -914061504, "ts": 1716454225173644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225231282, "dur": 16, "args": { "External id": 239120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239120, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239120, "pid": 5, "tid": 7, "ts": 1716454225231282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173675, "dur": 8, "args": { "External id": 239120, "cbid": 211, "correlation": 239120 } }, { "ph": "s", "id": 239120, "pid": 76337, "tid": -914061504, "ts": 1716454225173675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173761, "dur": 1, "args": { "External id": 239131, "cbid": 251, "correlation": 239131 } }, { "ph": "f", "id": 239131, "pid": 76337, "tid": -914061504, "ts": 1716454225173761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225231300, "dur": 87, "args": { "External id": 239132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239132, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239132, "pid": 5, "tid": 7, "ts": 1716454225231300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173766, "dur": 14, "args": { "External id": 239132, "cbid": 211, "correlation": 239132 } }, { "ph": "s", "id": 239132, "pid": 76337, "tid": -914061504, "ts": 1716454225173766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173836, "dur": 1, "args": { "External id": 239143, "cbid": 251, "correlation": 239143 } }, { "ph": "f", "id": 239143, "pid": 76337, "tid": -914061504, "ts": 1716454225173836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173839, "dur": 0, "args": { "External id": 239144, "cbid": 251, "correlation": 239144 } }, { "ph": "f", "id": 239144, "pid": 76337, "tid": -914061504, "ts": 1716454225173839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225231388, "dur": 12, "args": { "External id": 239145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239145, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239145, "pid": 5, "tid": 7, "ts": 1716454225231388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173841, "dur": 12, "args": { "External id": 239145, "cbid": 211, "correlation": 239145 } }, { "ph": "s", "id": 239145, "pid": 76337, "tid": -914061504, "ts": 1716454225173841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225231401, "dur": 6, "args": { "External id": 239147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239147, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239147, "pid": 5, "tid": 7, "ts": 1716454225231401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173854, "dur": 6, "args": { "External id": 239147, "cbid": 211, "correlation": 239147 } }, { "ph": "s", "id": 239147, "pid": 76337, "tid": -914061504, "ts": 1716454225173854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173912, "dur": 1, "args": { "External id": 239158, "cbid": 251, "correlation": 239158 } }, { "ph": "f", "id": 239158, "pid": 76337, "tid": -914061504, "ts": 1716454225173912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225173915, "dur": 0, "args": { "External id": 239159, "cbid": 251, "correlation": 239159 } }, { "ph": "f", "id": 239159, "pid": 76337, "tid": -914061504, "ts": 1716454225173915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225231408, "dur": 8, "args": { "External id": 239160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239160, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239160, "pid": 5, "tid": 7, "ts": 1716454225231408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173916, "dur": 12, "args": { "External id": 239160, "cbid": 211, "correlation": 239160 } }, { "ph": "s", "id": 239160, "pid": 76337, "tid": -914061504, "ts": 1716454225173916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225231418, "dur": 3, "args": { "External id": 239162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239162, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239162, "pid": 5, "tid": 7, "ts": 1716454225231418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225173930, "dur": 5, "args": { "External id": 239162, "cbid": 211, "correlation": 239162 } }, { "ph": "s", "id": 239162, "pid": 76337, "tid": -914061504, "ts": 1716454225173930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225231422, "dur": 55, "args": { "External id": 239187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239187, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239187, "pid": 5, "tid": 7, "ts": 1716454225231422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174016, "dur": 14, "args": { "External id": 239187, "cbid": 211, "correlation": 239187 } }, { "ph": "s", "id": 239187, "pid": 76337, "tid": -914061504, "ts": 1716454225174016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225174116, "dur": 2, "args": { "External id": 239205, "cbid": 251, "correlation": 239205 } }, { "ph": "f", "id": 239205, "pid": 76337, "tid": -914061504, "ts": 1716454225174116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225231478, "dur": 89, "args": { "External id": 239207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239207, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239207, "pid": 5, "tid": 7, "ts": 1716454225231478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174123, "dur": 14, "args": { "External id": 239207, "cbid": 211, "correlation": 239207 } }, { "ph": "s", "id": 239207, "pid": 76337, "tid": -914061504, "ts": 1716454225174123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225231569, "dur": 9, "args": { "External id": 239215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239215, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239215, "pid": 5, "tid": 7, "ts": 1716454225231569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174191, "dur": 12, "args": { "External id": 239215, "cbid": 211, "correlation": 239215 } }, { "ph": "s", "id": 239215, "pid": 76337, "tid": -914061504, "ts": 1716454225174191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225231580, "dur": 21, "args": { "External id": 239223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239223, "pid": 5, "tid": 7, "ts": 1716454225231580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174232, "dur": 9, "args": { "External id": 239223, "cbid": 211, "correlation": 239223 } }, { "ph": "s", "id": 239223, "pid": 76337, "tid": -914061504, "ts": 1716454225174232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225231602, "dur": 17, "args": { "External id": 239245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239245, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239245, "pid": 5, "tid": 7, "ts": 1716454225231602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174284, "dur": 10, "args": { "External id": 239245, "cbid": 211, "correlation": 239245 } }, { "ph": "s", "id": 239245, "pid": 76337, "tid": -914061504, "ts": 1716454225174284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225174371, "dur": 1, "args": { "External id": 239261, "cbid": 251, "correlation": 239261 } }, { "ph": "f", "id": 239261, "pid": 76337, "tid": -914061504, "ts": 1716454225174371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225174376, "dur": 0, "args": { "External id": 239263, "cbid": 251, "correlation": 239263 } }, { "ph": "f", "id": 239263, "pid": 76337, "tid": -914061504, "ts": 1716454225174376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225231620, "dur": 491, "args": { "External id": 239264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239264, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239264, "pid": 5, "tid": 7, "ts": 1716454225231620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174378, "dur": 13, "args": { "External id": 239264, "cbid": 211, "correlation": 239264 } }, { "ph": "s", "id": 239264, "pid": 76337, "tid": -914061504, "ts": 1716454225174378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225232112, "dur": 66, "args": { "External id": 239272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239272, "pid": 5, "tid": 7, "ts": 1716454225232112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174444, "dur": 12, "args": { "External id": 239272, "cbid": 211, "correlation": 239272 } }, { "ph": "s", "id": 239272, "pid": 76337, "tid": -914061504, "ts": 1716454225174444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225232179, "dur": 66, "args": { "External id": 239280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239280, "pid": 5, "tid": 7, "ts": 1716454225232179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174477, "dur": 8, "args": { "External id": 239280, "cbid": 211, "correlation": 239280 } }, { "ph": "s", "id": 239280, "pid": 76337, "tid": -914061504, "ts": 1716454225174477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225174556, "dur": 1, "args": { "External id": 239296, "cbid": 251, "correlation": 239296 } }, { "ph": "f", "id": 239296, "pid": 76337, "tid": -914061504, "ts": 1716454225174556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225232247, "dur": 1, "args": { "External id": 239298, "device": 5, "context": 1, "stream": 7, "correlation": 239298, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 239298, "pid": 5, "tid": 7, "ts": 1716454225232247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225174561, "dur": 10, "args": { "External id": 239298, "cbid": 51, "correlation": 239298 } }, { "ph": "s", "id": 239298, "pid": 76337, "tid": -914061504, "ts": 1716454225174561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225232251, "dur": 265, "args": { "External id": 239299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239299, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239299, "pid": 5, "tid": 7, "ts": 1716454225232251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174573, "dur": 11, "args": { "External id": 239299, "cbid": 211, "correlation": 239299 } }, { "ph": "s", "id": 239299, "pid": 76337, "tid": -914061504, "ts": 1716454225174573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225232518, "dur": 14, "args": { "External id": 239307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239307, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239307, "pid": 5, "tid": 7, "ts": 1716454225232518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174614, "dur": 11, "args": { "External id": 239307, "cbid": 211, "correlation": 239307 } }, { "ph": "s", "id": 239307, "pid": 76337, "tid": -914061504, "ts": 1716454225174614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225232533, "dur": 37, "args": { "External id": 239318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239318, "pid": 5, "tid": 7, "ts": 1716454225232533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174683, "dur": 12, "args": { "External id": 239318, "cbid": 211, "correlation": 239318 } }, { "ph": "s", "id": 239318, "pid": 76337, "tid": -914061504, "ts": 1716454225174683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225174748, "dur": 0, "args": { "External id": 239330, "cbid": 317, "correlation": 239330 } }, { "ph": "f", "id": 239330, "pid": 76337, "tid": -914061504, "ts": 1716454225174748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225174749, "dur": 0, "args": { "External id": 239331, "cbid": 203, "correlation": 239331 } }, { "ph": "f", "id": 239331, "pid": 76337, "tid": -914061504, "ts": 1716454225174749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225174749, "dur": 0, "args": { "External id": 239332, "cbid": 205, "correlation": 239332 } }, { "ph": "f", "id": 239332, "pid": 76337, "tid": -914061504, "ts": 1716454225174749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225232571, "dur": 13, "args": { "External id": 239336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239336, "pid": 5, "tid": 7, "ts": 1716454225232571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174764, "dur": 13, "args": { "External id": 239336, "cbid": 211, "correlation": 239336 } }, { "ph": "s", "id": 239336, "pid": 76337, "tid": -914061504, "ts": 1716454225174764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225232585, "dur": 4, "args": { "External id": 239338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239338, "pid": 5, "tid": 7, "ts": 1716454225232585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174781, "dur": 6, "args": { "External id": 239338, "cbid": 211, "correlation": 239338 } }, { "ph": "s", "id": 239338, "pid": 76337, "tid": -914061504, "ts": 1716454225174781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225174790, "dur": 0, "args": { "External id": 239339, "cbid": 51, "correlation": 239339 } }, { "ph": "s", "id": 239339, "pid": 76337, "tid": -914061504, "ts": 1716454225174790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225232590, "dur": 94, "args": { "External id": 239340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239340, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 239340, "pid": 5, "tid": 7, "ts": 1716454225232590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174791, "dur": 5, "args": { "External id": 239340, "cbid": 211, "correlation": 239340 } }, { "ph": "s", "id": 239340, "pid": 76337, "tid": -914061504, "ts": 1716454225174791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225232686, "dur": 16, "args": { "External id": 239345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239345, "pid": 5, "tid": 7, "ts": 1716454225232686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174819, "dur": 9, "args": { "External id": 239345, "cbid": 211, "correlation": 239345 } }, { "ph": "s", "id": 239345, "pid": 76337, "tid": -914061504, "ts": 1716454225174819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225232703, "dur": 12, "args": { "External id": 239353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239353, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239353, "pid": 5, "tid": 7, "ts": 1716454225232703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174850, "dur": 8, "args": { "External id": 239353, "cbid": 211, "correlation": 239353 } }, { "ph": "s", "id": 239353, "pid": 76337, "tid": -914061504, "ts": 1716454225174850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225232716, "dur": 30, "args": { "External id": 239362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239362, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239362, "pid": 5, "tid": 7, "ts": 1716454225232716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174890, "dur": 10, "args": { "External id": 239362, "cbid": 211, "correlation": 239362 } }, { "ph": "s", "id": 239362, "pid": 76337, "tid": -914061504, "ts": 1716454225174890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225232747, "dur": 30, "args": { "External id": 239382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239382, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 239382, "pid": 5, "tid": 7, "ts": 1716454225232747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174962, "dur": 21, "args": { "External id": 239382, "cbid": 211, "correlation": 239382 } }, { "ph": "s", "id": 239382, "pid": 76337, "tid": -914061504, "ts": 1716454225174962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225232779, "dur": 5, "args": { "External id": 239394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239394, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239394, "pid": 5, "tid": 7, "ts": 1716454225232779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225174993, "dur": 7, "args": { "External id": 239394, "cbid": 211, "correlation": 239394 } }, { "ph": "s", "id": 239394, "pid": 76337, "tid": -914061504, "ts": 1716454225174993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225232786, "dur": 32, "args": { "External id": 239397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239397, "pid": 5, "tid": 7, "ts": 1716454225232786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175012, "dur": 8, "args": { "External id": 239397, "cbid": 211, "correlation": 239397 } }, { "ph": "s", "id": 239397, "pid": 76337, "tid": -914061504, "ts": 1716454225175012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225232818, "dur": 20, "args": { "External id": 239406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239406, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239406, "pid": 5, "tid": 7, "ts": 1716454225232818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175053, "dur": 11, "args": { "External id": 239406, "cbid": 211, "correlation": 239406 } }, { "ph": "s", "id": 239406, "pid": 76337, "tid": -914061504, "ts": 1716454225175053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225175106, "dur": 0, "args": { "External id": 239416, "cbid": 317, "correlation": 239416 } }, { "ph": "f", "id": 239416, "pid": 76337, "tid": -914061504, "ts": 1716454225175106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225175107, "dur": 0, "args": { "External id": 239417, "cbid": 203, "correlation": 239417 } }, { "ph": "f", "id": 239417, "pid": 76337, "tid": -914061504, "ts": 1716454225175107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225175107, "dur": 0, "args": { "External id": 239418, "cbid": 205, "correlation": 239418 } }, { "ph": "f", "id": 239418, "pid": 76337, "tid": -914061504, "ts": 1716454225175107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225232840, "dur": 21, "args": { "External id": 239422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239422, "pid": 5, "tid": 7, "ts": 1716454225232840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175122, "dur": 11, "args": { "External id": 239422, "cbid": 211, "correlation": 239422 } }, { "ph": "s", "id": 239422, "pid": 76337, "tid": -914061504, "ts": 1716454225175122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225232862, "dur": 313, "args": { "External id": 239424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239424, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239424, "pid": 5, "tid": 7, "ts": 1716454225232862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175135, "dur": 5, "args": { "External id": 239424, "cbid": 211, "correlation": 239424 } }, { "ph": "s", "id": 239424, "pid": 76337, "tid": -914061504, "ts": 1716454225175135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225233178, "dur": 1, "args": { "External id": 239426, "device": 5, "context": 1, "stream": 7, "correlation": 239426, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 239426, "pid": 5, "tid": 7, "ts": 1716454225233178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225175146, "dur": 7, "args": { "External id": 239426, "cbid": 51, "correlation": 239426 } }, { "ph": "s", "id": 239426, "pid": 76337, "tid": -914061504, "ts": 1716454225175146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225233181, "dur": 1245, "args": { "External id": 239427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239427, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239427, "pid": 5, "tid": 7, "ts": 1716454225233181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175154, "dur": 6, "args": { "External id": 239427, "cbid": 211, "correlation": 239427 } }, { "ph": "s", "id": 239427, "pid": 76337, "tid": -914061504, "ts": 1716454225175154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225234428, "dur": 14, "args": { "External id": 239429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239429, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239429, "pid": 5, "tid": 7, "ts": 1716454225234428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175165, "dur": 6, "args": { "External id": 239429, "cbid": 211, "correlation": 239429 } }, { "ph": "s", "id": 239429, "pid": 76337, "tid": -914061504, "ts": 1716454225175165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225234443, "dur": 15, "args": { "External id": 239435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239435, "pid": 5, "tid": 7, "ts": 1716454225234443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175194, "dur": 8, "args": { "External id": 239435, "cbid": 211, "correlation": 239435 } }, { "ph": "s", "id": 239435, "pid": 76337, "tid": -914061504, "ts": 1716454225175194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225234459, "dur": 3, "args": { "External id": 239443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239443, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 239443, "pid": 5, "tid": 7, "ts": 1716454225234459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175238, "dur": 9, "args": { "External id": 239443, "cbid": 211, "correlation": 239443 } }, { "ph": "s", "id": 239443, "pid": 76337, "tid": -914061504, "ts": 1716454225175238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225175302, "dur": 1, "args": { "External id": 239459, "cbid": 251, "correlation": 239459 } }, { "ph": "f", "id": 239459, "pid": 76337, "tid": -914061504, "ts": 1716454225175302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225175307, "dur": 0, "args": { "External id": 239461, "cbid": 251, "correlation": 239461 } }, { "ph": "f", "id": 239461, "pid": 76337, "tid": -914061504, "ts": 1716454225175307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225234463, "dur": 14, "args": { "External id": 239462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239462, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239462, "pid": 5, "tid": 7, "ts": 1716454225234463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175309, "dur": 12, "args": { "External id": 239462, "cbid": 211, "correlation": 239462 } }, { "ph": "s", "id": 239462, "pid": 76337, "tid": -914061504, "ts": 1716454225175309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225234478, "dur": 5, "args": { "External id": 239464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239464, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239464, "pid": 5, "tid": 7, "ts": 1716454225234478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175322, "dur": 5, "args": { "External id": 239464, "cbid": 211, "correlation": 239464 } }, { "ph": "s", "id": 239464, "pid": 76337, "tid": -914061504, "ts": 1716454225175322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225234485, "dur": 16, "args": { "External id": 239474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239474, "pid": 5, "tid": 7, "ts": 1716454225234485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175380, "dur": 12, "args": { "External id": 239474, "cbid": 211, "correlation": 239474 } }, { "ph": "s", "id": 239474, "pid": 76337, "tid": -914061504, "ts": 1716454225175380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225234502, "dur": 17, "args": { "External id": 239494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239494, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 239494, "pid": 5, "tid": 7, "ts": 1716454225234502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175446, "dur": 11, "args": { "External id": 239494, "cbid": 211, "correlation": 239494 } }, { "ph": "s", "id": 239494, "pid": 76337, "tid": -914061504, "ts": 1716454225175446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225234521, "dur": 4, "args": { "External id": 239506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239506, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 239506, "pid": 5, "tid": 7, "ts": 1716454225234521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175468, "dur": 6, "args": { "External id": 239506, "cbid": 211, "correlation": 239506 } }, { "ph": "s", "id": 239506, "pid": 76337, "tid": -914061504, "ts": 1716454225175468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225234526, "dur": 16, "args": { "External id": 239509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239509, "pid": 5, "tid": 7, "ts": 1716454225234526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175486, "dur": 6, "args": { "External id": 239509, "cbid": 211, "correlation": 239509 } }, { "ph": "s", "id": 239509, "pid": 76337, "tid": -914061504, "ts": 1716454225175486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225234543, "dur": 12, "args": { "External id": 239518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239518, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239518, "pid": 5, "tid": 7, "ts": 1716454225234543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175526, "dur": 9, "args": { "External id": 239518, "cbid": 211, "correlation": 239518 } }, { "ph": "s", "id": 239518, "pid": 76337, "tid": -914061504, "ts": 1716454225175526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225175587, "dur": 0, "args": { "External id": 239528, "cbid": 317, "correlation": 239528 } }, { "ph": "f", "id": 239528, "pid": 76337, "tid": -914061504, "ts": 1716454225175587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225175588, "dur": 0, "args": { "External id": 239529, "cbid": 203, "correlation": 239529 } }, { "ph": "f", "id": 239529, "pid": 76337, "tid": -914061504, "ts": 1716454225175588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225175589, "dur": 0, "args": { "External id": 239530, "cbid": 205, "correlation": 239530 } }, { "ph": "f", "id": 239530, "pid": 76337, "tid": -914061504, "ts": 1716454225175589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225234556, "dur": 11, "args": { "External id": 239534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239534, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239534, "pid": 5, "tid": 7, "ts": 1716454225234556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175602, "dur": 12, "args": { "External id": 239534, "cbid": 211, "correlation": 239534 } }, { "ph": "s", "id": 239534, "pid": 76337, "tid": -914061504, "ts": 1716454225175602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225234568, "dur": 159, "args": { "External id": 239536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239536, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239536, "pid": 5, "tid": 7, "ts": 1716454225234568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175617, "dur": 6, "args": { "External id": 239536, "cbid": 211, "correlation": 239536 } }, { "ph": "s", "id": 239536, "pid": 76337, "tid": -914061504, "ts": 1716454225175617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225234729, "dur": 1, "args": { "External id": 239538, "device": 5, "context": 1, "stream": 7, "correlation": 239538, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 239538, "pid": 5, "tid": 7, "ts": 1716454225234729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225175629, "dur": 6, "args": { "External id": 239538, "cbid": 51, "correlation": 239538 } }, { "ph": "s", "id": 239538, "pid": 76337, "tid": -914061504, "ts": 1716454225175629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225234733, "dur": 643, "args": { "External id": 239539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239539, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239539, "pid": 5, "tid": 7, "ts": 1716454225234733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175636, "dur": 6, "args": { "External id": 239539, "cbid": 211, "correlation": 239539 } }, { "ph": "s", "id": 239539, "pid": 76337, "tid": -914061504, "ts": 1716454225175636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225235377, "dur": 12, "args": { "External id": 239541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239541, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239541, "pid": 5, "tid": 7, "ts": 1716454225235377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175645, "dur": 5, "args": { "External id": 239541, "cbid": 211, "correlation": 239541 } }, { "ph": "s", "id": 239541, "pid": 76337, "tid": -914061504, "ts": 1716454225175645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225235391, "dur": 14, "args": { "External id": 239547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239547, "pid": 5, "tid": 7, "ts": 1716454225235391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175674, "dur": 9, "args": { "External id": 239547, "cbid": 211, "correlation": 239547 } }, { "ph": "s", "id": 239547, "pid": 76337, "tid": -914061504, "ts": 1716454225175674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225175732, "dur": 0, "args": { "External id": 239557, "cbid": 317, "correlation": 239557 } }, { "ph": "f", "id": 239557, "pid": 76337, "tid": -914061504, "ts": 1716454225175732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225175733, "dur": 0, "args": { "External id": 239558, "cbid": 203, "correlation": 239558 } }, { "ph": "f", "id": 239558, "pid": 76337, "tid": -914061504, "ts": 1716454225175733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225175734, "dur": 0, "args": { "External id": 239559, "cbid": 205, "correlation": 239559 } }, { "ph": "f", "id": 239559, "pid": 76337, "tid": -914061504, "ts": 1716454225175734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225235407, "dur": 22, "args": { "External id": 239563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239563, "pid": 5, "tid": 7, "ts": 1716454225235407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175745, "dur": 11, "args": { "External id": 239563, "cbid": 211, "correlation": 239563 } }, { "ph": "s", "id": 239563, "pid": 76337, "tid": -914061504, "ts": 1716454225175745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225235430, "dur": 4, "args": { "External id": 239565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239565, "pid": 5, "tid": 7, "ts": 1716454225235430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175760, "dur": 7, "args": { "External id": 239565, "cbid": 211, "correlation": 239565 } }, { "ph": "s", "id": 239565, "pid": 76337, "tid": -914061504, "ts": 1716454225175760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225175770, "dur": 0, "args": { "External id": 239566, "cbid": 51, "correlation": 239566 } }, { "ph": "s", "id": 239566, "pid": 76337, "tid": -914061504, "ts": 1716454225175770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225235435, "dur": 167, "args": { "External id": 239567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239567, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 239567, "pid": 5, "tid": 7, "ts": 1716454225235435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175771, "dur": 5, "args": { "External id": 239567, "cbid": 211, "correlation": 239567 } }, { "ph": "s", "id": 239567, "pid": 76337, "tid": -914061504, "ts": 1716454225175771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225235604, "dur": 15, "args": { "External id": 239572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239572, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239572, "pid": 5, "tid": 7, "ts": 1716454225235604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175796, "dur": 8, "args": { "External id": 239572, "cbid": 211, "correlation": 239572 } }, { "ph": "s", "id": 239572, "pid": 76337, "tid": -914061504, "ts": 1716454225175796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225235621, "dur": 13, "args": { "External id": 239580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239580, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239580, "pid": 5, "tid": 7, "ts": 1716454225235621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175825, "dur": 8, "args": { "External id": 239580, "cbid": 211, "correlation": 239580 } }, { "ph": "s", "id": 239580, "pid": 76337, "tid": -914061504, "ts": 1716454225175825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225235635, "dur": 10, "args": { "External id": 239588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239588, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239588, "pid": 5, "tid": 7, "ts": 1716454225235635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175853, "dur": 8, "args": { "External id": 239588, "cbid": 211, "correlation": 239588 } }, { "ph": "s", "id": 239588, "pid": 76337, "tid": -914061504, "ts": 1716454225175853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225235646, "dur": 18, "args": { "External id": 239608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239608, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 239608, "pid": 5, "tid": 7, "ts": 1716454225235646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175934, "dur": 12, "args": { "External id": 239608, "cbid": 211, "correlation": 239608 } }, { "ph": "s", "id": 239608, "pid": 76337, "tid": -914061504, "ts": 1716454225175934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225235665, "dur": 5, "args": { "External id": 239620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239620, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 239620, "pid": 5, "tid": 7, "ts": 1716454225235665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175956, "dur": 6, "args": { "External id": 239620, "cbid": 211, "correlation": 239620 } }, { "ph": "s", "id": 239620, "pid": 76337, "tid": -914061504, "ts": 1716454225175956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225235671, "dur": 18, "args": { "External id": 239623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239623, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239623, "pid": 5, "tid": 7, "ts": 1716454225235671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225175982, "dur": 7, "args": { "External id": 239623, "cbid": 211, "correlation": 239623 } }, { "ph": "s", "id": 239623, "pid": 76337, "tid": -914061504, "ts": 1716454225175982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225176042, "dur": 0, "args": { "External id": 239634, "cbid": 317, "correlation": 239634 } }, { "ph": "f", "id": 239634, "pid": 76337, "tid": -914061504, "ts": 1716454225176042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225176043, "dur": 0, "args": { "External id": 239635, "cbid": 203, "correlation": 239635 } }, { "ph": "f", "id": 239635, "pid": 76337, "tid": -914061504, "ts": 1716454225176043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225176044, "dur": 0, "args": { "External id": 239636, "cbid": 205, "correlation": 239636 } }, { "ph": "f", "id": 239636, "pid": 76337, "tid": -914061504, "ts": 1716454225176044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225235690, "dur": 12, "args": { "External id": 239640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239640, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239640, "pid": 5, "tid": 7, "ts": 1716454225235690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176056, "dur": 12, "args": { "External id": 239640, "cbid": 211, "correlation": 239640 } }, { "ph": "s", "id": 239640, "pid": 76337, "tid": -914061504, "ts": 1716454225176056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225235703, "dur": 4, "args": { "External id": 239642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239642, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239642, "pid": 5, "tid": 7, "ts": 1716454225235703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176073, "dur": 6, "args": { "External id": 239642, "cbid": 211, "correlation": 239642 } }, { "ph": "s", "id": 239642, "pid": 76337, "tid": -914061504, "ts": 1716454225176073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225176081, "dur": 0, "args": { "External id": 239643, "cbid": 51, "correlation": 239643 } }, { "ph": "s", "id": 239643, "pid": 76337, "tid": -914061504, "ts": 1716454225176081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225235708, "dur": 89, "args": { "External id": 239644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239644, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 239644, "pid": 5, "tid": 7, "ts": 1716454225235708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176082, "dur": 5, "args": { "External id": 239644, "cbid": 211, "correlation": 239644 } }, { "ph": "s", "id": 239644, "pid": 76337, "tid": -914061504, "ts": 1716454225176082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225235799, "dur": 15, "args": { "External id": 239649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239649, "pid": 5, "tid": 7, "ts": 1716454225235799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176108, "dur": 9, "args": { "External id": 239649, "cbid": 211, "correlation": 239649 } }, { "ph": "s", "id": 239649, "pid": 76337, "tid": -914061504, "ts": 1716454225176108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225235815, "dur": 83, "args": { "External id": 239658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239658, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239658, "pid": 5, "tid": 7, "ts": 1716454225235815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176190, "dur": 14, "args": { "External id": 239658, "cbid": 211, "correlation": 239658 } }, { "ph": "s", "id": 239658, "pid": 76337, "tid": -914061504, "ts": 1716454225176190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225235900, "dur": 30, "args": { "External id": 239680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239680, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239680, "pid": 5, "tid": 7, "ts": 1716454225235900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176248, "dur": 10, "args": { "External id": 239680, "cbid": 211, "correlation": 239680 } }, { "ph": "s", "id": 239680, "pid": 76337, "tid": -914061504, "ts": 1716454225176248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225176334, "dur": 1, "args": { "External id": 239691, "cbid": 251, "correlation": 239691 } }, { "ph": "f", "id": 239691, "pid": 76337, "tid": -914061504, "ts": 1716454225176334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225235930, "dur": 162, "args": { "External id": 239692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239692, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239692, "pid": 5, "tid": 7, "ts": 1716454225235930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176339, "dur": 13, "args": { "External id": 239692, "cbid": 211, "correlation": 239692 } }, { "ph": "s", "id": 239692, "pid": 76337, "tid": -914061504, "ts": 1716454225176339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225176409, "dur": 1, "args": { "External id": 239703, "cbid": 251, "correlation": 239703 } }, { "ph": "f", "id": 239703, "pid": 76337, "tid": -914061504, "ts": 1716454225176409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225236094, "dur": 154, "args": { "External id": 239704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239704, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239704, "pid": 5, "tid": 7, "ts": 1716454225236094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176413, "dur": 12, "args": { "External id": 239704, "cbid": 211, "correlation": 239704 } }, { "ph": "s", "id": 239704, "pid": 76337, "tid": -914061504, "ts": 1716454225176413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225176479, "dur": 1, "args": { "External id": 239715, "cbid": 251, "correlation": 239715 } }, { "ph": "f", "id": 239715, "pid": 76337, "tid": -914061504, "ts": 1716454225176479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225236249, "dur": 155, "args": { "External id": 239716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239716, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239716, "pid": 5, "tid": 7, "ts": 1716454225236249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176483, "dur": 11, "args": { "External id": 239716, "cbid": 211, "correlation": 239716 } }, { "ph": "s", "id": 239716, "pid": 76337, "tid": -914061504, "ts": 1716454225176483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225236406, "dur": 332, "args": { "External id": 239741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239741, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239741, "pid": 5, "tid": 7, "ts": 1716454225236406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176566, "dur": 13, "args": { "External id": 239741, "cbid": 211, "correlation": 239741 } }, { "ph": "s", "id": 239741, "pid": 76337, "tid": -914061504, "ts": 1716454225176566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225176666, "dur": 1, "args": { "External id": 239759, "cbid": 251, "correlation": 239759 } }, { "ph": "f", "id": 239759, "pid": 76337, "tid": -914061504, "ts": 1716454225176666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225236740, "dur": 163, "args": { "External id": 239761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239761, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239761, "pid": 5, "tid": 7, "ts": 1716454225236740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176672, "dur": 13, "args": { "External id": 239761, "cbid": 211, "correlation": 239761 } }, { "ph": "s", "id": 239761, "pid": 76337, "tid": -914061504, "ts": 1716454225176672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225236904, "dur": 20, "args": { "External id": 239769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239769, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239769, "pid": 5, "tid": 7, "ts": 1716454225236904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176741, "dur": 12, "args": { "External id": 239769, "cbid": 211, "correlation": 239769 } }, { "ph": "s", "id": 239769, "pid": 76337, "tid": -914061504, "ts": 1716454225176741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225236925, "dur": 27, "args": { "External id": 239777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239777, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239777, "pid": 5, "tid": 7, "ts": 1716454225236925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176780, "dur": 8, "args": { "External id": 239777, "cbid": 211, "correlation": 239777 } }, { "ph": "s", "id": 239777, "pid": 76337, "tid": -914061504, "ts": 1716454225176780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225236953, "dur": 18, "args": { "External id": 239788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239788, "pid": 5, "tid": 7, "ts": 1716454225236953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176850, "dur": 12, "args": { "External id": 239788, "cbid": 211, "correlation": 239788 } }, { "ph": "s", "id": 239788, "pid": 76337, "tid": -914061504, "ts": 1716454225176850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225236973, "dur": 16, "args": { "External id": 239810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239810, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239810, "pid": 5, "tid": 7, "ts": 1716454225236973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176882, "dur": 8, "args": { "External id": 239810, "cbid": 211, "correlation": 239810 } }, { "ph": "s", "id": 239810, "pid": 76337, "tid": -914061504, "ts": 1716454225176882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225176967, "dur": 1, "args": { "External id": 239821, "cbid": 251, "correlation": 239821 } }, { "ph": "f", "id": 239821, "pid": 76337, "tid": -914061504, "ts": 1716454225176967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225236990, "dur": 87, "args": { "External id": 239822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239822, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239822, "pid": 5, "tid": 7, "ts": 1716454225236990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225176972, "dur": 21, "args": { "External id": 239822, "cbid": 211, "correlation": 239822 } }, { "ph": "s", "id": 239822, "pid": 76337, "tid": -914061504, "ts": 1716454225176972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177050, "dur": 1, "args": { "External id": 239833, "cbid": 251, "correlation": 239833 } }, { "ph": "f", "id": 239833, "pid": 76337, "tid": -914061504, "ts": 1716454225177050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177054, "dur": 0, "args": { "External id": 239834, "cbid": 251, "correlation": 239834 } }, { "ph": "f", "id": 239834, "pid": 76337, "tid": -914061504, "ts": 1716454225177054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225237078, "dur": 12, "args": { "External id": 239835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239835, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239835, "pid": 5, "tid": 7, "ts": 1716454225237078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177056, "dur": 12, "args": { "External id": 239835, "cbid": 211, "correlation": 239835 } }, { "ph": "s", "id": 239835, "pid": 76337, "tid": -914061504, "ts": 1716454225177056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225237092, "dur": 5, "args": { "External id": 239837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239837, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239837, "pid": 5, "tid": 7, "ts": 1716454225237092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177069, "dur": 6, "args": { "External id": 239837, "cbid": 211, "correlation": 239837 } }, { "ph": "s", "id": 239837, "pid": 76337, "tid": -914061504, "ts": 1716454225177069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177128, "dur": 1, "args": { "External id": 239848, "cbid": 251, "correlation": 239848 } }, { "ph": "f", "id": 239848, "pid": 76337, "tid": -914061504, "ts": 1716454225177128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177131, "dur": 0, "args": { "External id": 239849, "cbid": 251, "correlation": 239849 } }, { "ph": "f", "id": 239849, "pid": 76337, "tid": -914061504, "ts": 1716454225177131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225237098, "dur": 9, "args": { "External id": 239850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239850, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239850, "pid": 5, "tid": 7, "ts": 1716454225237098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177132, "dur": 11, "args": { "External id": 239850, "cbid": 211, "correlation": 239850 } }, { "ph": "s", "id": 239850, "pid": 76337, "tid": -914061504, "ts": 1716454225177132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225237108, "dur": 3, "args": { "External id": 239852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239852, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239852, "pid": 5, "tid": 7, "ts": 1716454225237108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177146, "dur": 5, "args": { "External id": 239852, "cbid": 211, "correlation": 239852 } }, { "ph": "s", "id": 239852, "pid": 76337, "tid": -914061504, "ts": 1716454225177146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225237113, "dur": 54, "args": { "External id": 239877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239877, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239877, "pid": 5, "tid": 7, "ts": 1716454225237113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177223, "dur": 12, "args": { "External id": 239877, "cbid": 211, "correlation": 239877 } }, { "ph": "s", "id": 239877, "pid": 76337, "tid": -914061504, "ts": 1716454225177223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177320, "dur": 1, "args": { "External id": 239895, "cbid": 251, "correlation": 239895 } }, { "ph": "f", "id": 239895, "pid": 76337, "tid": -914061504, "ts": 1716454225177320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225237168, "dur": 89, "args": { "External id": 239897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239897, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239897, "pid": 5, "tid": 7, "ts": 1716454225237168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177327, "dur": 14, "args": { "External id": 239897, "cbid": 211, "correlation": 239897 } }, { "ph": "s", "id": 239897, "pid": 76337, "tid": -914061504, "ts": 1716454225177327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225237258, "dur": 9, "args": { "External id": 239905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239905, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239905, "pid": 5, "tid": 7, "ts": 1716454225237258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177395, "dur": 12, "args": { "External id": 239905, "cbid": 211, "correlation": 239905 } }, { "ph": "s", "id": 239905, "pid": 76337, "tid": -914061504, "ts": 1716454225177395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225237269, "dur": 19, "args": { "External id": 239913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239913, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239913, "pid": 5, "tid": 7, "ts": 1716454225237269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177436, "dur": 9, "args": { "External id": 239913, "cbid": 211, "correlation": 239913 } }, { "ph": "s", "id": 239913, "pid": 76337, "tid": -914061504, "ts": 1716454225177436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225237289, "dur": 17, "args": { "External id": 239935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239935, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239935, "pid": 5, "tid": 7, "ts": 1716454225237289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177487, "dur": 10, "args": { "External id": 239935, "cbid": 211, "correlation": 239935 } }, { "ph": "s", "id": 239935, "pid": 76337, "tid": -914061504, "ts": 1716454225177487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177574, "dur": 1, "args": { "External id": 239951, "cbid": 251, "correlation": 239951 } }, { "ph": "f", "id": 239951, "pid": 76337, "tid": -914061504, "ts": 1716454225177574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177579, "dur": 0, "args": { "External id": 239953, "cbid": 251, "correlation": 239953 } }, { "ph": "f", "id": 239953, "pid": 76337, "tid": -914061504, "ts": 1716454225177579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225237307, "dur": 493, "args": { "External id": 239954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239954, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 239954, "pid": 5, "tid": 7, "ts": 1716454225237307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177581, "dur": 12, "args": { "External id": 239954, "cbid": 211, "correlation": 239954 } }, { "ph": "s", "id": 239954, "pid": 76337, "tid": -914061504, "ts": 1716454225177581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225237802, "dur": 65, "args": { "External id": 239962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239962, "pid": 5, "tid": 7, "ts": 1716454225237802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177645, "dur": 12, "args": { "External id": 239962, "cbid": 211, "correlation": 239962 } }, { "ph": "s", "id": 239962, "pid": 76337, "tid": -914061504, "ts": 1716454225177645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225237869, "dur": 67, "args": { "External id": 239970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239970, "pid": 5, "tid": 7, "ts": 1716454225237869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177677, "dur": 8, "args": { "External id": 239970, "cbid": 211, "correlation": 239970 } }, { "ph": "s", "id": 239970, "pid": 76337, "tid": -914061504, "ts": 1716454225177677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225177756, "dur": 1, "args": { "External id": 239986, "cbid": 251, "correlation": 239986 } }, { "ph": "f", "id": 239986, "pid": 76337, "tid": -914061504, "ts": 1716454225177756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225237938, "dur": 1, "args": { "External id": 239988, "device": 5, "context": 1, "stream": 7, "correlation": 239988, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 239988, "pid": 5, "tid": 7, "ts": 1716454225237938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225177761, "dur": 10, "args": { "External id": 239988, "cbid": 51, "correlation": 239988 } }, { "ph": "s", "id": 239988, "pid": 76337, "tid": -914061504, "ts": 1716454225177761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225237942, "dur": 264, "args": { "External id": 239989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239989, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 239989, "pid": 5, "tid": 7, "ts": 1716454225237942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177773, "dur": 11, "args": { "External id": 239989, "cbid": 211, "correlation": 239989 } }, { "ph": "s", "id": 239989, "pid": 76337, "tid": -914061504, "ts": 1716454225177773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225238208, "dur": 14, "args": { "External id": 239997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 239997, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 239997, "pid": 5, "tid": 7, "ts": 1716454225238208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177814, "dur": 11, "args": { "External id": 239997, "cbid": 211, "correlation": 239997 } }, { "ph": "s", "id": 239997, "pid": 76337, "tid": -914061504, "ts": 1716454225177814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225238223, "dur": 37, "args": { "External id": 240008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240008, "pid": 5, "tid": 7, "ts": 1716454225238223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177883, "dur": 12, "args": { "External id": 240008, "cbid": 211, "correlation": 240008 } }, { "ph": "s", "id": 240008, "pid": 76337, "tid": -914061504, "ts": 1716454225177883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225177946, "dur": 0, "args": { "External id": 240020, "cbid": 317, "correlation": 240020 } }, { "ph": "f", "id": 240020, "pid": 76337, "tid": -914061504, "ts": 1716454225177946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225177947, "dur": 0, "args": { "External id": 240021, "cbid": 203, "correlation": 240021 } }, { "ph": "f", "id": 240021, "pid": 76337, "tid": -914061504, "ts": 1716454225177947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225177948, "dur": 0, "args": { "External id": 240022, "cbid": 205, "correlation": 240022 } }, { "ph": "f", "id": 240022, "pid": 76337, "tid": -914061504, "ts": 1716454225177948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225238261, "dur": 13, "args": { "External id": 240026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240026, "pid": 5, "tid": 7, "ts": 1716454225238261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177963, "dur": 20, "args": { "External id": 240026, "cbid": 211, "correlation": 240026 } }, { "ph": "s", "id": 240026, "pid": 76337, "tid": -914061504, "ts": 1716454225177963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225238276, "dur": 4, "args": { "External id": 240028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240028, "pid": 5, "tid": 7, "ts": 1716454225238276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177988, "dur": 6, "args": { "External id": 240028, "cbid": 211, "correlation": 240028 } }, { "ph": "s", "id": 240028, "pid": 76337, "tid": -914061504, "ts": 1716454225177988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225177997, "dur": 0, "args": { "External id": 240029, "cbid": 51, "correlation": 240029 } }, { "ph": "s", "id": 240029, "pid": 76337, "tid": -914061504, "ts": 1716454225177997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225238282, "dur": 96, "args": { "External id": 240030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240030, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240030, "pid": 5, "tid": 7, "ts": 1716454225238282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225177998, "dur": 5, "args": { "External id": 240030, "cbid": 211, "correlation": 240030 } }, { "ph": "s", "id": 240030, "pid": 76337, "tid": -914061504, "ts": 1716454225177998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225238379, "dur": 17, "args": { "External id": 240035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240035, "pid": 5, "tid": 7, "ts": 1716454225238379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178026, "dur": 9, "args": { "External id": 240035, "cbid": 211, "correlation": 240035 } }, { "ph": "s", "id": 240035, "pid": 76337, "tid": -914061504, "ts": 1716454225178026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225238397, "dur": 12, "args": { "External id": 240043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240043, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240043, "pid": 5, "tid": 7, "ts": 1716454225238397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178059, "dur": 8, "args": { "External id": 240043, "cbid": 211, "correlation": 240043 } }, { "ph": "s", "id": 240043, "pid": 76337, "tid": -914061504, "ts": 1716454225178059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225238411, "dur": 26, "args": { "External id": 240052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240052, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240052, "pid": 5, "tid": 7, "ts": 1716454225238411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178098, "dur": 10, "args": { "External id": 240052, "cbid": 211, "correlation": 240052 } }, { "ph": "s", "id": 240052, "pid": 76337, "tid": -914061504, "ts": 1716454225178098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225238438, "dur": 24, "args": { "External id": 240072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240072, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 240072, "pid": 5, "tid": 7, "ts": 1716454225238438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178169, "dur": 12, "args": { "External id": 240072, "cbid": 211, "correlation": 240072 } }, { "ph": "s", "id": 240072, "pid": 76337, "tid": -914061504, "ts": 1716454225178169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225238463, "dur": 5, "args": { "External id": 240084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240084, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240084, "pid": 5, "tid": 7, "ts": 1716454225238463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178192, "dur": 6, "args": { "External id": 240084, "cbid": 211, "correlation": 240084 } }, { "ph": "s", "id": 240084, "pid": 76337, "tid": -914061504, "ts": 1716454225178192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225238468, "dur": 24, "args": { "External id": 240087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240087, "pid": 5, "tid": 7, "ts": 1716454225238468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178210, "dur": 7, "args": { "External id": 240087, "cbid": 211, "correlation": 240087 } }, { "ph": "s", "id": 240087, "pid": 76337, "tid": -914061504, "ts": 1716454225178210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225238494, "dur": 17, "args": { "External id": 240096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240096, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240096, "pid": 5, "tid": 7, "ts": 1716454225238494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178248, "dur": 10, "args": { "External id": 240096, "cbid": 211, "correlation": 240096 } }, { "ph": "s", "id": 240096, "pid": 76337, "tid": -914061504, "ts": 1716454225178248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225178300, "dur": 0, "args": { "External id": 240106, "cbid": 317, "correlation": 240106 } }, { "ph": "f", "id": 240106, "pid": 76337, "tid": -914061504, "ts": 1716454225178300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225178301, "dur": 0, "args": { "External id": 240107, "cbid": 203, "correlation": 240107 } }, { "ph": "f", "id": 240107, "pid": 76337, "tid": -914061504, "ts": 1716454225178301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225178301, "dur": 0, "args": { "External id": 240108, "cbid": 205, "correlation": 240108 } }, { "ph": "f", "id": 240108, "pid": 76337, "tid": -914061504, "ts": 1716454225178301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225238512, "dur": 17, "args": { "External id": 240112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240112, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240112, "pid": 5, "tid": 7, "ts": 1716454225238512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178316, "dur": 12, "args": { "External id": 240112, "cbid": 211, "correlation": 240112 } }, { "ph": "s", "id": 240112, "pid": 76337, "tid": -914061504, "ts": 1716454225178316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225238531, "dur": 237, "args": { "External id": 240114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240114, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240114, "pid": 5, "tid": 7, "ts": 1716454225238531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178331, "dur": 5, "args": { "External id": 240114, "cbid": 211, "correlation": 240114 } }, { "ph": "s", "id": 240114, "pid": 76337, "tid": -914061504, "ts": 1716454225178331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225238770, "dur": 1, "args": { "External id": 240116, "device": 5, "context": 1, "stream": 7, "correlation": 240116, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 240116, "pid": 5, "tid": 7, "ts": 1716454225238770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225178342, "dur": 8, "args": { "External id": 240116, "cbid": 51, "correlation": 240116 } }, { "ph": "s", "id": 240116, "pid": 76337, "tid": -914061504, "ts": 1716454225178342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225238774, "dur": 811, "args": { "External id": 240117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240117, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240117, "pid": 5, "tid": 7, "ts": 1716454225238774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178351, "dur": 6, "args": { "External id": 240117, "cbid": 211, "correlation": 240117 } }, { "ph": "s", "id": 240117, "pid": 76337, "tid": -914061504, "ts": 1716454225178351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225239587, "dur": 13, "args": { "External id": 240119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240119, "pid": 5, "tid": 7, "ts": 1716454225239587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178361, "dur": 5, "args": { "External id": 240119, "cbid": 211, "correlation": 240119 } }, { "ph": "s", "id": 240119, "pid": 76337, "tid": -914061504, "ts": 1716454225178361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225239602, "dur": 14, "args": { "External id": 240125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240125, "pid": 5, "tid": 7, "ts": 1716454225239602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178390, "dur": 9, "args": { "External id": 240125, "cbid": 211, "correlation": 240125 } }, { "ph": "s", "id": 240125, "pid": 76337, "tid": -914061504, "ts": 1716454225178390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225239617, "dur": 3, "args": { "External id": 240133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240133, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 240133, "pid": 5, "tid": 7, "ts": 1716454225239617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178434, "dur": 9, "args": { "External id": 240133, "cbid": 211, "correlation": 240133 } }, { "ph": "s", "id": 240133, "pid": 76337, "tid": -914061504, "ts": 1716454225178434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225178498, "dur": 1, "args": { "External id": 240149, "cbid": 251, "correlation": 240149 } }, { "ph": "f", "id": 240149, "pid": 76337, "tid": -914061504, "ts": 1716454225178498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225178503, "dur": 0, "args": { "External id": 240151, "cbid": 251, "correlation": 240151 } }, { "ph": "f", "id": 240151, "pid": 76337, "tid": -914061504, "ts": 1716454225178503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225239622, "dur": 13, "args": { "External id": 240152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240152, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240152, "pid": 5, "tid": 7, "ts": 1716454225239622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178505, "dur": 11, "args": { "External id": 240152, "cbid": 211, "correlation": 240152 } }, { "ph": "s", "id": 240152, "pid": 76337, "tid": -914061504, "ts": 1716454225178505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225239636, "dur": 5, "args": { "External id": 240154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240154, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240154, "pid": 5, "tid": 7, "ts": 1716454225239636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178518, "dur": 6, "args": { "External id": 240154, "cbid": 211, "correlation": 240154 } }, { "ph": "s", "id": 240154, "pid": 76337, "tid": -914061504, "ts": 1716454225178518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225239642, "dur": 16, "args": { "External id": 240164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240164, "pid": 5, "tid": 7, "ts": 1716454225239642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178577, "dur": 12, "args": { "External id": 240164, "cbid": 211, "correlation": 240164 } }, { "ph": "s", "id": 240164, "pid": 76337, "tid": -914061504, "ts": 1716454225178577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225239660, "dur": 19, "args": { "External id": 240184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240184, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 240184, "pid": 5, "tid": 7, "ts": 1716454225239660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178640, "dur": 11, "args": { "External id": 240184, "cbid": 211, "correlation": 240184 } }, { "ph": "s", "id": 240184, "pid": 76337, "tid": -914061504, "ts": 1716454225178640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225239680, "dur": 4, "args": { "External id": 240196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240196, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 240196, "pid": 5, "tid": 7, "ts": 1716454225239680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178661, "dur": 6, "args": { "External id": 240196, "cbid": 211, "correlation": 240196 } }, { "ph": "s", "id": 240196, "pid": 76337, "tid": -914061504, "ts": 1716454225178661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225239685, "dur": 16, "args": { "External id": 240199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240199, "pid": 5, "tid": 7, "ts": 1716454225239685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178680, "dur": 6, "args": { "External id": 240199, "cbid": 211, "correlation": 240199 } }, { "ph": "s", "id": 240199, "pid": 76337, "tid": -914061504, "ts": 1716454225178680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225239703, "dur": 10, "args": { "External id": 240208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240208, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240208, "pid": 5, "tid": 7, "ts": 1716454225239703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178719, "dur": 11, "args": { "External id": 240208, "cbid": 211, "correlation": 240208 } }, { "ph": "s", "id": 240208, "pid": 76337, "tid": -914061504, "ts": 1716454225178719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225178781, "dur": 0, "args": { "External id": 240218, "cbid": 317, "correlation": 240218 } }, { "ph": "f", "id": 240218, "pid": 76337, "tid": -914061504, "ts": 1716454225178781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225178782, "dur": 0, "args": { "External id": 240219, "cbid": 203, "correlation": 240219 } }, { "ph": "f", "id": 240219, "pid": 76337, "tid": -914061504, "ts": 1716454225178782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225178783, "dur": 0, "args": { "External id": 240220, "cbid": 205, "correlation": 240220 } }, { "ph": "f", "id": 240220, "pid": 76337, "tid": -914061504, "ts": 1716454225178783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225239714, "dur": 11, "args": { "External id": 240224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240224, "pid": 5, "tid": 7, "ts": 1716454225239714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178796, "dur": 12, "args": { "External id": 240224, "cbid": 211, "correlation": 240224 } }, { "ph": "s", "id": 240224, "pid": 76337, "tid": -914061504, "ts": 1716454225178796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225239727, "dur": 161, "args": { "External id": 240226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240226, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240226, "pid": 5, "tid": 7, "ts": 1716454225239727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178810, "dur": 5, "args": { "External id": 240226, "cbid": 211, "correlation": 240226 } }, { "ph": "s", "id": 240226, "pid": 76337, "tid": -914061504, "ts": 1716454225178810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225239890, "dur": 1, "args": { "External id": 240228, "device": 5, "context": 1, "stream": 7, "correlation": 240228, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 240228, "pid": 5, "tid": 7, "ts": 1716454225239890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225178821, "dur": 7, "args": { "External id": 240228, "cbid": 51, "correlation": 240228 } }, { "ph": "s", "id": 240228, "pid": 76337, "tid": -914061504, "ts": 1716454225178821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225239894, "dur": 642, "args": { "External id": 240229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240229, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240229, "pid": 5, "tid": 7, "ts": 1716454225239894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178829, "dur": 6, "args": { "External id": 240229, "cbid": 211, "correlation": 240229 } }, { "ph": "s", "id": 240229, "pid": 76337, "tid": -914061504, "ts": 1716454225178829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225240537, "dur": 12, "args": { "External id": 240231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240231, "pid": 5, "tid": 7, "ts": 1716454225240537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178839, "dur": 5, "args": { "External id": 240231, "cbid": 211, "correlation": 240231 } }, { "ph": "s", "id": 240231, "pid": 76337, "tid": -914061504, "ts": 1716454225178839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225240551, "dur": 15, "args": { "External id": 240237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240237, "pid": 5, "tid": 7, "ts": 1716454225240551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178868, "dur": 9, "args": { "External id": 240237, "cbid": 211, "correlation": 240237 } }, { "ph": "s", "id": 240237, "pid": 76337, "tid": -914061504, "ts": 1716454225178868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225178926, "dur": 0, "args": { "External id": 240247, "cbid": 317, "correlation": 240247 } }, { "ph": "f", "id": 240247, "pid": 76337, "tid": -914061504, "ts": 1716454225178926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225178927, "dur": 0, "args": { "External id": 240248, "cbid": 203, "correlation": 240248 } }, { "ph": "f", "id": 240248, "pid": 76337, "tid": -914061504, "ts": 1716454225178927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225178928, "dur": 0, "args": { "External id": 240249, "cbid": 205, "correlation": 240249 } }, { "ph": "f", "id": 240249, "pid": 76337, "tid": -914061504, "ts": 1716454225178928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225240567, "dur": 17, "args": { "External id": 240253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240253, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240253, "pid": 5, "tid": 7, "ts": 1716454225240567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178941, "dur": 11, "args": { "External id": 240253, "cbid": 211, "correlation": 240253 } }, { "ph": "s", "id": 240253, "pid": 76337, "tid": -914061504, "ts": 1716454225178941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225240585, "dur": 4, "args": { "External id": 240255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240255, "pid": 5, "tid": 7, "ts": 1716454225240585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178957, "dur": 5, "args": { "External id": 240255, "cbid": 211, "correlation": 240255 } }, { "ph": "s", "id": 240255, "pid": 76337, "tid": -914061504, "ts": 1716454225178957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225178965, "dur": 0, "args": { "External id": 240256, "cbid": 51, "correlation": 240256 } }, { "ph": "s", "id": 240256, "pid": 76337, "tid": -914061504, "ts": 1716454225178965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225240590, "dur": 131, "args": { "External id": 240257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240257, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240257, "pid": 5, "tid": 7, "ts": 1716454225240590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225178966, "dur": 5, "args": { "External id": 240257, "cbid": 211, "correlation": 240257 } }, { "ph": "s", "id": 240257, "pid": 76337, "tid": -914061504, "ts": 1716454225178966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225240722, "dur": 15, "args": { "External id": 240262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240262, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240262, "pid": 5, "tid": 7, "ts": 1716454225240722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179000, "dur": 9, "args": { "External id": 240262, "cbid": 211, "correlation": 240262 } }, { "ph": "s", "id": 240262, "pid": 76337, "tid": -914061504, "ts": 1716454225179000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225240738, "dur": 12, "args": { "External id": 240270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240270, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240270, "pid": 5, "tid": 7, "ts": 1716454225240738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179031, "dur": 8, "args": { "External id": 240270, "cbid": 211, "correlation": 240270 } }, { "ph": "s", "id": 240270, "pid": 76337, "tid": -914061504, "ts": 1716454225179031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225240752, "dur": 10, "args": { "External id": 240278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240278, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240278, "pid": 5, "tid": 7, "ts": 1716454225240752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179059, "dur": 8, "args": { "External id": 240278, "cbid": 211, "correlation": 240278 } }, { "ph": "s", "id": 240278, "pid": 76337, "tid": -914061504, "ts": 1716454225179059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225240763, "dur": 18, "args": { "External id": 240298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240298, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 240298, "pid": 5, "tid": 7, "ts": 1716454225240763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179141, "dur": 12, "args": { "External id": 240298, "cbid": 211, "correlation": 240298 } }, { "ph": "s", "id": 240298, "pid": 76337, "tid": -914061504, "ts": 1716454225179141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225240782, "dur": 4, "args": { "External id": 240310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240310, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 240310, "pid": 5, "tid": 7, "ts": 1716454225240782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179162, "dur": 6, "args": { "External id": 240310, "cbid": 211, "correlation": 240310 } }, { "ph": "s", "id": 240310, "pid": 76337, "tid": -914061504, "ts": 1716454225179162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225240788, "dur": 16, "args": { "External id": 240313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240313, "pid": 5, "tid": 7, "ts": 1716454225240788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179181, "dur": 6, "args": { "External id": 240313, "cbid": 211, "correlation": 240313 } }, { "ph": "s", "id": 240313, "pid": 76337, "tid": -914061504, "ts": 1716454225179181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225179237, "dur": 0, "args": { "External id": 240324, "cbid": 317, "correlation": 240324 } }, { "ph": "f", "id": 240324, "pid": 76337, "tid": -914061504, "ts": 1716454225179237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225179238, "dur": 0, "args": { "External id": 240325, "cbid": 203, "correlation": 240325 } }, { "ph": "f", "id": 240325, "pid": 76337, "tid": -914061504, "ts": 1716454225179238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225179238, "dur": 0, "args": { "External id": 240326, "cbid": 205, "correlation": 240326 } }, { "ph": "f", "id": 240326, "pid": 76337, "tid": -914061504, "ts": 1716454225179238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225240805, "dur": 11, "args": { "External id": 240330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240330, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240330, "pid": 5, "tid": 7, "ts": 1716454225240805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179251, "dur": 11, "args": { "External id": 240330, "cbid": 211, "correlation": 240330 } }, { "ph": "s", "id": 240330, "pid": 76337, "tid": -914061504, "ts": 1716454225179251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225240818, "dur": 3, "args": { "External id": 240332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240332, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240332, "pid": 5, "tid": 7, "ts": 1716454225240818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179267, "dur": 6, "args": { "External id": 240332, "cbid": 211, "correlation": 240332 } }, { "ph": "s", "id": 240332, "pid": 76337, "tid": -914061504, "ts": 1716454225179267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225179276, "dur": 0, "args": { "External id": 240333, "cbid": 51, "correlation": 240333 } }, { "ph": "s", "id": 240333, "pid": 76337, "tid": -914061504, "ts": 1716454225179276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225240822, "dur": 89, "args": { "External id": 240334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240334, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240334, "pid": 5, "tid": 7, "ts": 1716454225240822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179277, "dur": 5, "args": { "External id": 240334, "cbid": 211, "correlation": 240334 } }, { "ph": "s", "id": 240334, "pid": 76337, "tid": -914061504, "ts": 1716454225179277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225240912, "dur": 15, "args": { "External id": 240339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240339, "pid": 5, "tid": 7, "ts": 1716454225240912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179303, "dur": 9, "args": { "External id": 240339, "cbid": 211, "correlation": 240339 } }, { "ph": "s", "id": 240339, "pid": 76337, "tid": -914061504, "ts": 1716454225179303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225240929, "dur": 81, "args": { "External id": 240348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240348, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240348, "pid": 5, "tid": 7, "ts": 1716454225240929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179384, "dur": 14, "args": { "External id": 240348, "cbid": 211, "correlation": 240348 } }, { "ph": "s", "id": 240348, "pid": 76337, "tid": -914061504, "ts": 1716454225179384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225241011, "dur": 30, "args": { "External id": 240370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240370, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240370, "pid": 5, "tid": 7, "ts": 1716454225241011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179442, "dur": 10, "args": { "External id": 240370, "cbid": 211, "correlation": 240370 } }, { "ph": "s", "id": 240370, "pid": 76337, "tid": -914061504, "ts": 1716454225179442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225179529, "dur": 1, "args": { "External id": 240381, "cbid": 251, "correlation": 240381 } }, { "ph": "f", "id": 240381, "pid": 76337, "tid": -914061504, "ts": 1716454225179529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225241042, "dur": 159, "args": { "External id": 240382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240382, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240382, "pid": 5, "tid": 7, "ts": 1716454225241042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179535, "dur": 13, "args": { "External id": 240382, "cbid": 211, "correlation": 240382 } }, { "ph": "s", "id": 240382, "pid": 76337, "tid": -914061504, "ts": 1716454225179535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225179605, "dur": 1, "args": { "External id": 240393, "cbid": 251, "correlation": 240393 } }, { "ph": "f", "id": 240393, "pid": 76337, "tid": -914061504, "ts": 1716454225179605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225241203, "dur": 157, "args": { "External id": 240394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240394, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240394, "pid": 5, "tid": 7, "ts": 1716454225241203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179609, "dur": 12, "args": { "External id": 240394, "cbid": 211, "correlation": 240394 } }, { "ph": "s", "id": 240394, "pid": 76337, "tid": -914061504, "ts": 1716454225179609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225179675, "dur": 1, "args": { "External id": 240405, "cbid": 251, "correlation": 240405 } }, { "ph": "f", "id": 240405, "pid": 76337, "tid": -914061504, "ts": 1716454225179675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225241361, "dur": 155, "args": { "External id": 240406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240406, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240406, "pid": 5, "tid": 7, "ts": 1716454225241361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179679, "dur": 11, "args": { "External id": 240406, "cbid": 211, "correlation": 240406 } }, { "ph": "s", "id": 240406, "pid": 76337, "tid": -914061504, "ts": 1716454225179679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225241518, "dur": 329, "args": { "External id": 240431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240431, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240431, "pid": 5, "tid": 7, "ts": 1716454225241518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179764, "dur": 13, "args": { "External id": 240431, "cbid": 211, "correlation": 240431 } }, { "ph": "s", "id": 240431, "pid": 76337, "tid": -914061504, "ts": 1716454225179764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225179863, "dur": 1, "args": { "External id": 240449, "cbid": 251, "correlation": 240449 } }, { "ph": "f", "id": 240449, "pid": 76337, "tid": -914061504, "ts": 1716454225179863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225241847, "dur": 163, "args": { "External id": 240451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240451, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240451, "pid": 5, "tid": 7, "ts": 1716454225241847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179870, "dur": 13, "args": { "External id": 240451, "cbid": 211, "correlation": 240451 } }, { "ph": "s", "id": 240451, "pid": 76337, "tid": -914061504, "ts": 1716454225179870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225242011, "dur": 19, "args": { "External id": 240459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240459, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240459, "pid": 5, "tid": 7, "ts": 1716454225242011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179939, "dur": 12, "args": { "External id": 240459, "cbid": 211, "correlation": 240459 } }, { "ph": "s", "id": 240459, "pid": 76337, "tid": -914061504, "ts": 1716454225179939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225242032, "dur": 27, "args": { "External id": 240467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240467, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240467, "pid": 5, "tid": 7, "ts": 1716454225242032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225179986, "dur": 9, "args": { "External id": 240467, "cbid": 211, "correlation": 240467 } }, { "ph": "s", "id": 240467, "pid": 76337, "tid": -914061504, "ts": 1716454225179986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225242061, "dur": 18, "args": { "External id": 240478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240478, "pid": 5, "tid": 7, "ts": 1716454225242061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180059, "dur": 14, "args": { "External id": 240478, "cbid": 211, "correlation": 240478 } }, { "ph": "s", "id": 240478, "pid": 76337, "tid": -914061504, "ts": 1716454225180059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225242080, "dur": 15, "args": { "External id": 240500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240500, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240500, "pid": 5, "tid": 7, "ts": 1716454225242080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180091, "dur": 7, "args": { "External id": 240500, "cbid": 211, "correlation": 240500 } }, { "ph": "s", "id": 240500, "pid": 76337, "tid": -914061504, "ts": 1716454225180091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180177, "dur": 1, "args": { "External id": 240511, "cbid": 251, "correlation": 240511 } }, { "ph": "f", "id": 240511, "pid": 76337, "tid": -914061504, "ts": 1716454225180177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225242097, "dur": 87, "args": { "External id": 240512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240512, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240512, "pid": 5, "tid": 7, "ts": 1716454225242097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180182, "dur": 13, "args": { "External id": 240512, "cbid": 211, "correlation": 240512 } }, { "ph": "s", "id": 240512, "pid": 76337, "tid": -914061504, "ts": 1716454225180182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180250, "dur": 1, "args": { "External id": 240523, "cbid": 251, "correlation": 240523 } }, { "ph": "f", "id": 240523, "pid": 76337, "tid": -914061504, "ts": 1716454225180250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180254, "dur": 0, "args": { "External id": 240524, "cbid": 251, "correlation": 240524 } }, { "ph": "f", "id": 240524, "pid": 76337, "tid": -914061504, "ts": 1716454225180254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225242185, "dur": 11, "args": { "External id": 240525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240525, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240525, "pid": 5, "tid": 7, "ts": 1716454225242185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180256, "dur": 13, "args": { "External id": 240525, "cbid": 211, "correlation": 240525 } }, { "ph": "s", "id": 240525, "pid": 76337, "tid": -914061504, "ts": 1716454225180256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225242198, "dur": 6, "args": { "External id": 240527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240527, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240527, "pid": 5, "tid": 7, "ts": 1716454225242198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180270, "dur": 6, "args": { "External id": 240527, "cbid": 211, "correlation": 240527 } }, { "ph": "s", "id": 240527, "pid": 76337, "tid": -914061504, "ts": 1716454225180270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180327, "dur": 1, "args": { "External id": 240538, "cbid": 251, "correlation": 240538 } }, { "ph": "f", "id": 240538, "pid": 76337, "tid": -914061504, "ts": 1716454225180327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180331, "dur": 0, "args": { "External id": 240539, "cbid": 251, "correlation": 240539 } }, { "ph": "f", "id": 240539, "pid": 76337, "tid": -914061504, "ts": 1716454225180331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225242205, "dur": 9, "args": { "External id": 240540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240540, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240540, "pid": 5, "tid": 7, "ts": 1716454225242205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180332, "dur": 11, "args": { "External id": 240540, "cbid": 211, "correlation": 240540 } }, { "ph": "s", "id": 240540, "pid": 76337, "tid": -914061504, "ts": 1716454225180332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225242215, "dur": 3, "args": { "External id": 240542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240542, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240542, "pid": 5, "tid": 7, "ts": 1716454225242215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180345, "dur": 5, "args": { "External id": 240542, "cbid": 211, "correlation": 240542 } }, { "ph": "s", "id": 240542, "pid": 76337, "tid": -914061504, "ts": 1716454225180345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225242220, "dur": 54, "args": { "External id": 240567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240567, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240567, "pid": 5, "tid": 7, "ts": 1716454225242220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180423, "dur": 12, "args": { "External id": 240567, "cbid": 211, "correlation": 240567 } }, { "ph": "s", "id": 240567, "pid": 76337, "tid": -914061504, "ts": 1716454225180423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180522, "dur": 1, "args": { "External id": 240585, "cbid": 251, "correlation": 240585 } }, { "ph": "f", "id": 240585, "pid": 76337, "tid": -914061504, "ts": 1716454225180522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225242275, "dur": 90, "args": { "External id": 240587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240587, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240587, "pid": 5, "tid": 7, "ts": 1716454225242275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180528, "dur": 13, "args": { "External id": 240587, "cbid": 211, "correlation": 240587 } }, { "ph": "s", "id": 240587, "pid": 76337, "tid": -914061504, "ts": 1716454225180528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225242366, "dur": 9, "args": { "External id": 240595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240595, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240595, "pid": 5, "tid": 7, "ts": 1716454225242366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180598, "dur": 13, "args": { "External id": 240595, "cbid": 211, "correlation": 240595 } }, { "ph": "s", "id": 240595, "pid": 76337, "tid": -914061504, "ts": 1716454225180598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225242376, "dur": 22, "args": { "External id": 240603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240603, "pid": 5, "tid": 7, "ts": 1716454225242376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180639, "dur": 9, "args": { "External id": 240603, "cbid": 211, "correlation": 240603 } }, { "ph": "s", "id": 240603, "pid": 76337, "tid": -914061504, "ts": 1716454225180639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225242400, "dur": 17, "args": { "External id": 240625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240625, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240625, "pid": 5, "tid": 7, "ts": 1716454225242400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180690, "dur": 10, "args": { "External id": 240625, "cbid": 211, "correlation": 240625 } }, { "ph": "s", "id": 240625, "pid": 76337, "tid": -914061504, "ts": 1716454225180690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180779, "dur": 1, "args": { "External id": 240641, "cbid": 251, "correlation": 240641 } }, { "ph": "f", "id": 240641, "pid": 76337, "tid": -914061504, "ts": 1716454225180779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180784, "dur": 0, "args": { "External id": 240643, "cbid": 251, "correlation": 240643 } }, { "ph": "f", "id": 240643, "pid": 76337, "tid": -914061504, "ts": 1716454225180784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225242418, "dur": 491, "args": { "External id": 240644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240644, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240644, "pid": 5, "tid": 7, "ts": 1716454225242418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180786, "dur": 12, "args": { "External id": 240644, "cbid": 211, "correlation": 240644 } }, { "ph": "s", "id": 240644, "pid": 76337, "tid": -914061504, "ts": 1716454225180786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225242911, "dur": 66, "args": { "External id": 240652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240652, "pid": 5, "tid": 7, "ts": 1716454225242911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180850, "dur": 13, "args": { "External id": 240652, "cbid": 211, "correlation": 240652 } }, { "ph": "s", "id": 240652, "pid": 76337, "tid": -914061504, "ts": 1716454225180850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225242978, "dur": 66, "args": { "External id": 240660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240660, "pid": 5, "tid": 7, "ts": 1716454225242978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180881, "dur": 8, "args": { "External id": 240660, "cbid": 211, "correlation": 240660 } }, { "ph": "s", "id": 240660, "pid": 76337, "tid": -914061504, "ts": 1716454225180881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225180961, "dur": 1, "args": { "External id": 240676, "cbid": 251, "correlation": 240676 } }, { "ph": "f", "id": 240676, "pid": 76337, "tid": -914061504, "ts": 1716454225180961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225243046, "dur": 1, "args": { "External id": 240678, "device": 5, "context": 1, "stream": 7, "correlation": 240678, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 240678, "pid": 5, "tid": 7, "ts": 1716454225243046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225180966, "dur": 16, "args": { "External id": 240678, "cbid": 51, "correlation": 240678 } }, { "ph": "s", "id": 240678, "pid": 76337, "tid": -914061504, "ts": 1716454225180966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225243049, "dur": 266, "args": { "External id": 240679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240679, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240679, "pid": 5, "tid": 7, "ts": 1716454225243049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225180984, "dur": 11, "args": { "External id": 240679, "cbid": 211, "correlation": 240679 } }, { "ph": "s", "id": 240679, "pid": 76337, "tid": -914061504, "ts": 1716454225180984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225243317, "dur": 15, "args": { "External id": 240687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240687, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240687, "pid": 5, "tid": 7, "ts": 1716454225243317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181027, "dur": 10, "args": { "External id": 240687, "cbid": 211, "correlation": 240687 } }, { "ph": "s", "id": 240687, "pid": 76337, "tid": -914061504, "ts": 1716454225181027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225243333, "dur": 37, "args": { "External id": 240698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240698, "pid": 5, "tid": 7, "ts": 1716454225243333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181095, "dur": 13, "args": { "External id": 240698, "cbid": 211, "correlation": 240698 } }, { "ph": "s", "id": 240698, "pid": 76337, "tid": -914061504, "ts": 1716454225181095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225181158, "dur": 0, "args": { "External id": 240710, "cbid": 317, "correlation": 240710 } }, { "ph": "f", "id": 240710, "pid": 76337, "tid": -914061504, "ts": 1716454225181158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225181159, "dur": 0, "args": { "External id": 240711, "cbid": 203, "correlation": 240711 } }, { "ph": "f", "id": 240711, "pid": 76337, "tid": -914061504, "ts": 1716454225181159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225181160, "dur": 0, "args": { "External id": 240712, "cbid": 205, "correlation": 240712 } }, { "ph": "f", "id": 240712, "pid": 76337, "tid": -914061504, "ts": 1716454225181160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225243371, "dur": 13, "args": { "External id": 240716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240716, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240716, "pid": 5, "tid": 7, "ts": 1716454225243371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181175, "dur": 12, "args": { "External id": 240716, "cbid": 211, "correlation": 240716 } }, { "ph": "s", "id": 240716, "pid": 76337, "tid": -914061504, "ts": 1716454225181175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225243385, "dur": 4, "args": { "External id": 240718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 240718, "pid": 5, "tid": 7, "ts": 1716454225243385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181191, "dur": 6, "args": { "External id": 240718, "cbid": 211, "correlation": 240718 } }, { "ph": "s", "id": 240718, "pid": 76337, "tid": -914061504, "ts": 1716454225181191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225181199, "dur": 0, "args": { "External id": 240719, "cbid": 51, "correlation": 240719 } }, { "ph": "s", "id": 240719, "pid": 76337, "tid": -914061504, "ts": 1716454225181199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225243390, "dur": 95, "args": { "External id": 240720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240720, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240720, "pid": 5, "tid": 7, "ts": 1716454225243390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181200, "dur": 5, "args": { "External id": 240720, "cbid": 211, "correlation": 240720 } }, { "ph": "s", "id": 240720, "pid": 76337, "tid": -914061504, "ts": 1716454225181200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225243486, "dur": 16, "args": { "External id": 240725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240725, "pid": 5, "tid": 7, "ts": 1716454225243486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181228, "dur": 9, "args": { "External id": 240725, "cbid": 211, "correlation": 240725 } }, { "ph": "s", "id": 240725, "pid": 76337, "tid": -914061504, "ts": 1716454225181228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225243504, "dur": 12, "args": { "External id": 240733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240733, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240733, "pid": 5, "tid": 7, "ts": 1716454225243504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181261, "dur": 8, "args": { "External id": 240733, "cbid": 211, "correlation": 240733 } }, { "ph": "s", "id": 240733, "pid": 76337, "tid": -914061504, "ts": 1716454225181261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225243517, "dur": 55, "args": { "External id": 240744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240744, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240744, "pid": 5, "tid": 7, "ts": 1716454225243517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181322, "dur": 11, "args": { "External id": 240744, "cbid": 211, "correlation": 240744 } }, { "ph": "s", "id": 240744, "pid": 76337, "tid": -914061504, "ts": 1716454225181322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225181377, "dur": 0, "args": { "External id": 240754, "cbid": 317, "correlation": 240754 } }, { "ph": "f", "id": 240754, "pid": 76337, "tid": -914061504, "ts": 1716454225181377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225181377, "dur": 0, "args": { "External id": 240755, "cbid": 203, "correlation": 240755 } }, { "ph": "f", "id": 240755, "pid": 76337, "tid": -914061504, "ts": 1716454225181377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225181378, "dur": 0, "args": { "External id": 240756, "cbid": 205, "correlation": 240756 } }, { "ph": "f", "id": 240756, "pid": 76337, "tid": -914061504, "ts": 1716454225181378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225243573, "dur": 39, "args": { "External id": 240760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240760, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240760, "pid": 5, "tid": 7, "ts": 1716454225243573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181393, "dur": 12, "args": { "External id": 240760, "cbid": 211, "correlation": 240760 } }, { "ph": "s", "id": 240760, "pid": 76337, "tid": -914061504, "ts": 1716454225181393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225243613, "dur": 160, "args": { "External id": 240762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240762, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240762, "pid": 5, "tid": 7, "ts": 1716454225243613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181408, "dur": 5, "args": { "External id": 240762, "cbid": 211, "correlation": 240762 } }, { "ph": "s", "id": 240762, "pid": 76337, "tid": -914061504, "ts": 1716454225181408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225243775, "dur": 1962, "args": { "External id": 240764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240764, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240764, "pid": 5, "tid": 7, "ts": 1716454225243775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181420, "dur": 8, "args": { "External id": 240764, "cbid": 211, "correlation": 240764 } }, { "ph": "s", "id": 240764, "pid": 76337, "tid": -914061504, "ts": 1716454225181420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225245738, "dur": 40, "args": { "External id": 240766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240766, "pid": 5, "tid": 7, "ts": 1716454225245738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181432, "dur": 5, "args": { "External id": 240766, "cbid": 211, "correlation": 240766 } }, { "ph": "s", "id": 240766, "pid": 76337, "tid": -914061504, "ts": 1716454225181432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225245780, "dur": 58, "args": { "External id": 240772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240772, "pid": 5, "tid": 7, "ts": 1716454225245780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181461, "dur": 8, "args": { "External id": 240772, "cbid": 211, "correlation": 240772 } }, { "ph": "s", "id": 240772, "pid": 76337, "tid": -914061504, "ts": 1716454225181461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225245840, "dur": 83, "args": { "External id": 240781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240781, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240781, "pid": 5, "tid": 7, "ts": 1716454225245840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181550, "dur": 14, "args": { "External id": 240781, "cbid": 211, "correlation": 240781 } }, { "ph": "s", "id": 240781, "pid": 76337, "tid": -914061504, "ts": 1716454225181550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225245924, "dur": 72, "args": { "External id": 240801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240801, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 240801, "pid": 5, "tid": 7, "ts": 1716454225245924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181621, "dur": 11, "args": { "External id": 240801, "cbid": 211, "correlation": 240801 } }, { "ph": "s", "id": 240801, "pid": 76337, "tid": -914061504, "ts": 1716454225181621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225245998, "dur": 5, "args": { "External id": 240813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240813, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 240813, "pid": 5, "tid": 7, "ts": 1716454225245998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181642, "dur": 6, "args": { "External id": 240813, "cbid": 211, "correlation": 240813 } }, { "ph": "s", "id": 240813, "pid": 76337, "tid": -914061504, "ts": 1716454225181642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225246004, "dur": 80, "args": { "External id": 240816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240816, "pid": 5, "tid": 7, "ts": 1716454225246004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181661, "dur": 7, "args": { "External id": 240816, "cbid": 211, "correlation": 240816 } }, { "ph": "s", "id": 240816, "pid": 76337, "tid": -914061504, "ts": 1716454225181661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225246086, "dur": 53, "args": { "External id": 240825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240825, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240825, "pid": 5, "tid": 7, "ts": 1716454225246086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181700, "dur": 11, "args": { "External id": 240825, "cbid": 211, "correlation": 240825 } }, { "ph": "s", "id": 240825, "pid": 76337, "tid": -914061504, "ts": 1716454225181700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225181754, "dur": 0, "args": { "External id": 240835, "cbid": 317, "correlation": 240835 } }, { "ph": "f", "id": 240835, "pid": 76337, "tid": -914061504, "ts": 1716454225181754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225181755, "dur": 0, "args": { "External id": 240836, "cbid": 203, "correlation": 240836 } }, { "ph": "f", "id": 240836, "pid": 76337, "tid": -914061504, "ts": 1716454225181755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225181756, "dur": 0, "args": { "External id": 240837, "cbid": 205, "correlation": 240837 } }, { "ph": "f", "id": 240837, "pid": 76337, "tid": -914061504, "ts": 1716454225181756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225246140, "dur": 56, "args": { "External id": 240841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240841, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240841, "pid": 5, "tid": 7, "ts": 1716454225246140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181772, "dur": 11, "args": { "External id": 240841, "cbid": 211, "correlation": 240841 } }, { "ph": "s", "id": 240841, "pid": 76337, "tid": -914061504, "ts": 1716454225181772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225246197, "dur": 120, "args": { "External id": 240843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240843, "pid": 5, "tid": 7, "ts": 1716454225246197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181786, "dur": 5, "args": { "External id": 240843, "cbid": 211, "correlation": 240843 } }, { "ph": "s", "id": 240843, "pid": 76337, "tid": -914061504, "ts": 1716454225181786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225246319, "dur": 1875, "args": { "External id": 240845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240845, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240845, "pid": 5, "tid": 7, "ts": 1716454225246319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181796, "dur": 6, "args": { "External id": 240845, "cbid": 211, "correlation": 240845 } }, { "ph": "s", "id": 240845, "pid": 76337, "tid": -914061504, "ts": 1716454225181796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225248195, "dur": 19, "args": { "External id": 240847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240847, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240847, "pid": 5, "tid": 7, "ts": 1716454225248195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181807, "dur": 5, "args": { "External id": 240847, "cbid": 211, "correlation": 240847 } }, { "ph": "s", "id": 240847, "pid": 76337, "tid": -914061504, "ts": 1716454225181807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225248216, "dur": 33, "args": { "External id": 240853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240853, "pid": 5, "tid": 7, "ts": 1716454225248216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181834, "dur": 8, "args": { "External id": 240853, "cbid": 211, "correlation": 240853 } }, { "ph": "s", "id": 240853, "pid": 76337, "tid": -914061504, "ts": 1716454225181834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225248250, "dur": 3, "args": { "External id": 240861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240861, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 240861, "pid": 5, "tid": 7, "ts": 1716454225248250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181879, "dur": 10, "args": { "External id": 240861, "cbid": 211, "correlation": 240861 } }, { "ph": "s", "id": 240861, "pid": 76337, "tid": -914061504, "ts": 1716454225181879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225181944, "dur": 1, "args": { "External id": 240877, "cbid": 251, "correlation": 240877 } }, { "ph": "f", "id": 240877, "pid": 76337, "tid": -914061504, "ts": 1716454225181944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225181950, "dur": 0, "args": { "External id": 240879, "cbid": 251, "correlation": 240879 } }, { "ph": "f", "id": 240879, "pid": 76337, "tid": -914061504, "ts": 1716454225181950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225248254, "dur": 12, "args": { "External id": 240880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240880, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 240880, "pid": 5, "tid": 7, "ts": 1716454225248254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181952, "dur": 12, "args": { "External id": 240880, "cbid": 211, "correlation": 240880 } }, { "ph": "s", "id": 240880, "pid": 76337, "tid": -914061504, "ts": 1716454225181952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225248267, "dur": 5, "args": { "External id": 240882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240882, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 240882, "pid": 5, "tid": 7, "ts": 1716454225248267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225181966, "dur": 6, "args": { "External id": 240882, "cbid": 211, "correlation": 240882 } }, { "ph": "s", "id": 240882, "pid": 76337, "tid": -914061504, "ts": 1716454225181966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225248274, "dur": 29, "args": { "External id": 240892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240892, "pid": 5, "tid": 7, "ts": 1716454225248274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182034, "dur": 13, "args": { "External id": 240892, "cbid": 211, "correlation": 240892 } }, { "ph": "s", "id": 240892, "pid": 76337, "tid": -914061504, "ts": 1716454225182034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225248304, "dur": 31, "args": { "External id": 240912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240912, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 240912, "pid": 5, "tid": 7, "ts": 1716454225248304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182100, "dur": 11, "args": { "External id": 240912, "cbid": 211, "correlation": 240912 } }, { "ph": "s", "id": 240912, "pid": 76337, "tid": -914061504, "ts": 1716454225182100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225248336, "dur": 4, "args": { "External id": 240924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240924, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 240924, "pid": 5, "tid": 7, "ts": 1716454225248336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182120, "dur": 6, "args": { "External id": 240924, "cbid": 211, "correlation": 240924 } }, { "ph": "s", "id": 240924, "pid": 76337, "tid": -914061504, "ts": 1716454225182120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225248342, "dur": 29, "args": { "External id": 240927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240927, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240927, "pid": 5, "tid": 7, "ts": 1716454225248342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182138, "dur": 6, "args": { "External id": 240927, "cbid": 211, "correlation": 240927 } }, { "ph": "s", "id": 240927, "pid": 76337, "tid": -914061504, "ts": 1716454225182138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225248372, "dur": 19, "args": { "External id": 240936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240936, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240936, "pid": 5, "tid": 7, "ts": 1716454225248372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182178, "dur": 10, "args": { "External id": 240936, "cbid": 211, "correlation": 240936 } }, { "ph": "s", "id": 240936, "pid": 76337, "tid": -914061504, "ts": 1716454225182178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225182240, "dur": 0, "args": { "External id": 240946, "cbid": 317, "correlation": 240946 } }, { "ph": "f", "id": 240946, "pid": 76337, "tid": -914061504, "ts": 1716454225182240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225182241, "dur": 0, "args": { "External id": 240947, "cbid": 203, "correlation": 240947 } }, { "ph": "f", "id": 240947, "pid": 76337, "tid": -914061504, "ts": 1716454225182241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225182242, "dur": 0, "args": { "External id": 240948, "cbid": 205, "correlation": 240948 } }, { "ph": "f", "id": 240948, "pid": 76337, "tid": -914061504, "ts": 1716454225182242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225248393, "dur": 22, "args": { "External id": 240952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240952, "pid": 5, "tid": 7, "ts": 1716454225248393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182257, "dur": 12, "args": { "External id": 240952, "cbid": 211, "correlation": 240952 } }, { "ph": "s", "id": 240952, "pid": 76337, "tid": -914061504, "ts": 1716454225182257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225248417, "dur": 43, "args": { "External id": 240954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240954, "pid": 5, "tid": 7, "ts": 1716454225248417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182271, "dur": 5, "args": { "External id": 240954, "cbid": 211, "correlation": 240954 } }, { "ph": "s", "id": 240954, "pid": 76337, "tid": -914061504, "ts": 1716454225182271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225248461, "dur": 638, "args": { "External id": 240956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240956, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240956, "pid": 5, "tid": 7, "ts": 1716454225248461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182283, "dur": 6, "args": { "External id": 240956, "cbid": 211, "correlation": 240956 } }, { "ph": "s", "id": 240956, "pid": 76337, "tid": -914061504, "ts": 1716454225182283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225249100, "dur": 20, "args": { "External id": 240958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240958, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240958, "pid": 5, "tid": 7, "ts": 1716454225249100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182292, "dur": 5, "args": { "External id": 240958, "cbid": 211, "correlation": 240958 } }, { "ph": "s", "id": 240958, "pid": 76337, "tid": -914061504, "ts": 1716454225182292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225249122, "dur": 32, "args": { "External id": 240964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240964, "pid": 5, "tid": 7, "ts": 1716454225249122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182320, "dur": 9, "args": { "External id": 240964, "cbid": 211, "correlation": 240964 } }, { "ph": "s", "id": 240964, "pid": 76337, "tid": -914061504, "ts": 1716454225182320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225182377, "dur": 0, "args": { "External id": 240974, "cbid": 317, "correlation": 240974 } }, { "ph": "f", "id": 240974, "pid": 76337, "tid": -914061504, "ts": 1716454225182377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225182378, "dur": 0, "args": { "External id": 240975, "cbid": 203, "correlation": 240975 } }, { "ph": "f", "id": 240975, "pid": 76337, "tid": -914061504, "ts": 1716454225182378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225182379, "dur": 0, "args": { "External id": 240976, "cbid": 205, "correlation": 240976 } }, { "ph": "f", "id": 240976, "pid": 76337, "tid": -914061504, "ts": 1716454225182379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225249155, "dur": 55, "args": { "External id": 240980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240980, "pid": 5, "tid": 7, "ts": 1716454225249155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182391, "dur": 12, "args": { "External id": 240980, "cbid": 211, "correlation": 240980 } }, { "ph": "s", "id": 240980, "pid": 76337, "tid": -914061504, "ts": 1716454225182391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225249211, "dur": 265, "args": { "External id": 240982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240982, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 240982, "pid": 5, "tid": 7, "ts": 1716454225249211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182410, "dur": 7, "args": { "External id": 240982, "cbid": 211, "correlation": 240982 } }, { "ph": "s", "id": 240982, "pid": 76337, "tid": -914061504, "ts": 1716454225182410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225249478, "dur": 22, "args": { "External id": 240984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240984, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240984, "pid": 5, "tid": 7, "ts": 1716454225249478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182420, "dur": 5, "args": { "External id": 240984, "cbid": 211, "correlation": 240984 } }, { "ph": "s", "id": 240984, "pid": 76337, "tid": -914061504, "ts": 1716454225182420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225249501, "dur": 32, "args": { "External id": 240990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240990, "pid": 5, "tid": 7, "ts": 1716454225249501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182447, "dur": 9, "args": { "External id": 240990, "cbid": 211, "correlation": 240990 } }, { "ph": "s", "id": 240990, "pid": 76337, "tid": -914061504, "ts": 1716454225182447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225249534, "dur": 27, "args": { "External id": 240998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 240998, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 240998, "pid": 5, "tid": 7, "ts": 1716454225249534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182475, "dur": 8, "args": { "External id": 240998, "cbid": 211, "correlation": 240998 } }, { "ph": "s", "id": 240998, "pid": 76337, "tid": -914061504, "ts": 1716454225182475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225249562, "dur": 19, "args": { "External id": 241006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241006, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241006, "pid": 5, "tid": 7, "ts": 1716454225249562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182505, "dur": 8, "args": { "External id": 241006, "cbid": 211, "correlation": 241006 } }, { "ph": "s", "id": 241006, "pid": 76337, "tid": -914061504, "ts": 1716454225182505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225249582, "dur": 29, "args": { "External id": 241026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241026, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 241026, "pid": 5, "tid": 7, "ts": 1716454225249582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182586, "dur": 13, "args": { "External id": 241026, "cbid": 211, "correlation": 241026 } }, { "ph": "s", "id": 241026, "pid": 76337, "tid": -914061504, "ts": 1716454225182586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225249613, "dur": 5, "args": { "External id": 241038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241038, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 241038, "pid": 5, "tid": 7, "ts": 1716454225249613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182609, "dur": 6, "args": { "External id": 241038, "cbid": 211, "correlation": 241038 } }, { "ph": "s", "id": 241038, "pid": 76337, "tid": -914061504, "ts": 1716454225182609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225249619, "dur": 30, "args": { "External id": 241041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241041, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241041, "pid": 5, "tid": 7, "ts": 1716454225249619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182626, "dur": 7, "args": { "External id": 241041, "cbid": 211, "correlation": 241041 } }, { "ph": "s", "id": 241041, "pid": 76337, "tid": -914061504, "ts": 1716454225182626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225182684, "dur": 0, "args": { "External id": 241052, "cbid": 317, "correlation": 241052 } }, { "ph": "f", "id": 241052, "pid": 76337, "tid": -914061504, "ts": 1716454225182684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225182685, "dur": 0, "args": { "External id": 241053, "cbid": 203, "correlation": 241053 } }, { "ph": "f", "id": 241053, "pid": 76337, "tid": -914061504, "ts": 1716454225182685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225182686, "dur": 0, "args": { "External id": 241054, "cbid": 205, "correlation": 241054 } }, { "ph": "f", "id": 241054, "pid": 76337, "tid": -914061504, "ts": 1716454225182686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225249650, "dur": 22, "args": { "External id": 241058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241058, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241058, "pid": 5, "tid": 7, "ts": 1716454225249650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182699, "dur": 12, "args": { "External id": 241058, "cbid": 211, "correlation": 241058 } }, { "ph": "s", "id": 241058, "pid": 76337, "tid": -914061504, "ts": 1716454225182699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225249674, "dur": 103, "args": { "External id": 241060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241060, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241060, "pid": 5, "tid": 7, "ts": 1716454225249674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182717, "dur": 6, "args": { "External id": 241060, "cbid": 211, "correlation": 241060 } }, { "ph": "s", "id": 241060, "pid": 76337, "tid": -914061504, "ts": 1716454225182717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225249778, "dur": 23, "args": { "External id": 241062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241062, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241062, "pid": 5, "tid": 7, "ts": 1716454225249778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182727, "dur": 5, "args": { "External id": 241062, "cbid": 211, "correlation": 241062 } }, { "ph": "s", "id": 241062, "pid": 76337, "tid": -914061504, "ts": 1716454225182727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225249803, "dur": 32, "args": { "External id": 241068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241068, "pid": 5, "tid": 7, "ts": 1716454225249803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182755, "dur": 9, "args": { "External id": 241068, "cbid": 211, "correlation": 241068 } }, { "ph": "s", "id": 241068, "pid": 76337, "tid": -914061504, "ts": 1716454225182755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225249836, "dur": 195, "args": { "External id": 241077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241077, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241077, "pid": 5, "tid": 7, "ts": 1716454225249836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182836, "dur": 15, "args": { "External id": 241077, "cbid": 211, "correlation": 241077 } }, { "ph": "s", "id": 241077, "pid": 76337, "tid": -914061504, "ts": 1716454225182836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225250033, "dur": 64, "args": { "External id": 241099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241099, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241099, "pid": 5, "tid": 7, "ts": 1716454225250033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182893, "dur": 11, "args": { "External id": 241099, "cbid": 211, "correlation": 241099 } }, { "ph": "s", "id": 241099, "pid": 76337, "tid": -914061504, "ts": 1716454225182893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225182989, "dur": 2, "args": { "External id": 241110, "cbid": 251, "correlation": 241110 } }, { "ph": "f", "id": 241110, "pid": 76337, "tid": -914061504, "ts": 1716454225182989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225250098, "dur": 152, "args": { "External id": 241111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241111, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241111, "pid": 5, "tid": 7, "ts": 1716454225250098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225182995, "dur": 13, "args": { "External id": 241111, "cbid": 211, "correlation": 241111 } }, { "ph": "s", "id": 241111, "pid": 76337, "tid": -914061504, "ts": 1716454225182995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183066, "dur": 1, "args": { "External id": 241122, "cbid": 251, "correlation": 241122 } }, { "ph": "f", "id": 241122, "pid": 76337, "tid": -914061504, "ts": 1716454225183066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225250251, "dur": 143, "args": { "External id": 241123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241123, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241123, "pid": 5, "tid": 7, "ts": 1716454225250251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183070, "dur": 12, "args": { "External id": 241123, "cbid": 211, "correlation": 241123 } }, { "ph": "s", "id": 241123, "pid": 76337, "tid": -914061504, "ts": 1716454225183070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183135, "dur": 1, "args": { "External id": 241134, "cbid": 251, "correlation": 241134 } }, { "ph": "f", "id": 241134, "pid": 76337, "tid": -914061504, "ts": 1716454225183135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225250395, "dur": 141, "args": { "External id": 241135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241135, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241135, "pid": 5, "tid": 7, "ts": 1716454225250395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183139, "dur": 11, "args": { "External id": 241135, "cbid": 211, "correlation": 241135 } }, { "ph": "s", "id": 241135, "pid": 76337, "tid": -914061504, "ts": 1716454225183139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225250538, "dur": 1910, "args": { "External id": 241156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241156, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 241156, "pid": 5, "tid": 7, "ts": 1716454225250538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183222, "dur": 13, "args": { "External id": 241156, "cbid": 211, "correlation": 241156 } }, { "ph": "s", "id": 241156, "pid": 76337, "tid": -914061504, "ts": 1716454225183222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183320, "dur": 1, "args": { "External id": 241174, "cbid": 251, "correlation": 241174 } }, { "ph": "f", "id": 241174, "pid": 76337, "tid": -914061504, "ts": 1716454225183320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225252450, "dur": 146, "args": { "External id": 241176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241176, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 241176, "pid": 5, "tid": 7, "ts": 1716454225252450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183326, "dur": 15, "args": { "External id": 241176, "cbid": 211, "correlation": 241176 } }, { "ph": "s", "id": 241176, "pid": 76337, "tid": -914061504, "ts": 1716454225183326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225252597, "dur": 36, "args": { "External id": 241184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241184, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241184, "pid": 5, "tid": 7, "ts": 1716454225252597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183397, "dur": 12, "args": { "External id": 241184, "cbid": 211, "correlation": 241184 } }, { "ph": "s", "id": 241184, "pid": 76337, "tid": -914061504, "ts": 1716454225183397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225252635, "dur": 50, "args": { "External id": 241192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241192, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241192, "pid": 5, "tid": 7, "ts": 1716454225252635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183437, "dur": 9, "args": { "External id": 241192, "cbid": 211, "correlation": 241192 } }, { "ph": "s", "id": 241192, "pid": 76337, "tid": -914061504, "ts": 1716454225183437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225252686, "dur": 30, "args": { "External id": 241203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241203, "pid": 5, "tid": 7, "ts": 1716454225252686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183506, "dur": 12, "args": { "External id": 241203, "cbid": 211, "correlation": 241203 } }, { "ph": "s", "id": 241203, "pid": 76337, "tid": -914061504, "ts": 1716454225183506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225252718, "dur": 33, "args": { "External id": 241225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241225, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241225, "pid": 5, "tid": 7, "ts": 1716454225252718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183537, "dur": 8, "args": { "External id": 241225, "cbid": 211, "correlation": 241225 } }, { "ph": "s", "id": 241225, "pid": 76337, "tid": -914061504, "ts": 1716454225183537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183621, "dur": 1, "args": { "External id": 241236, "cbid": 251, "correlation": 241236 } }, { "ph": "f", "id": 241236, "pid": 76337, "tid": -914061504, "ts": 1716454225183621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225252752, "dur": 75, "args": { "External id": 241237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241237, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241237, "pid": 5, "tid": 7, "ts": 1716454225252752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183626, "dur": 14, "args": { "External id": 241237, "cbid": 211, "correlation": 241237 } }, { "ph": "s", "id": 241237, "pid": 76337, "tid": -914061504, "ts": 1716454225183626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183695, "dur": 1, "args": { "External id": 241248, "cbid": 251, "correlation": 241248 } }, { "ph": "f", "id": 241248, "pid": 76337, "tid": -914061504, "ts": 1716454225183695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183699, "dur": 0, "args": { "External id": 241249, "cbid": 251, "correlation": 241249 } }, { "ph": "f", "id": 241249, "pid": 76337, "tid": -914061504, "ts": 1716454225183699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225252828, "dur": 11, "args": { "External id": 241250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241250, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 241250, "pid": 5, "tid": 7, "ts": 1716454225252828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183700, "dur": 12, "args": { "External id": 241250, "cbid": 211, "correlation": 241250 } }, { "ph": "s", "id": 241250, "pid": 76337, "tid": -914061504, "ts": 1716454225183700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225252841, "dur": 5, "args": { "External id": 241252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241252, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241252, "pid": 5, "tid": 7, "ts": 1716454225252841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183714, "dur": 6, "args": { "External id": 241252, "cbid": 211, "correlation": 241252 } }, { "ph": "s", "id": 241252, "pid": 76337, "tid": -914061504, "ts": 1716454225183714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183772, "dur": 1, "args": { "External id": 241263, "cbid": 251, "correlation": 241263 } }, { "ph": "f", "id": 241263, "pid": 76337, "tid": -914061504, "ts": 1716454225183772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183775, "dur": 0, "args": { "External id": 241264, "cbid": 251, "correlation": 241264 } }, { "ph": "f", "id": 241264, "pid": 76337, "tid": -914061504, "ts": 1716454225183775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225252847, "dur": 7, "args": { "External id": 241265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241265, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 241265, "pid": 5, "tid": 7, "ts": 1716454225252847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183777, "dur": 12, "args": { "External id": 241265, "cbid": 211, "correlation": 241265 } }, { "ph": "s", "id": 241265, "pid": 76337, "tid": -914061504, "ts": 1716454225183777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225252855, "dur": 3, "args": { "External id": 241267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241267, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241267, "pid": 5, "tid": 7, "ts": 1716454225252855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183790, "dur": 5, "args": { "External id": 241267, "cbid": 211, "correlation": 241267 } }, { "ph": "s", "id": 241267, "pid": 76337, "tid": -914061504, "ts": 1716454225183790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225252860, "dur": 90, "args": { "External id": 241288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241288, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 241288, "pid": 5, "tid": 7, "ts": 1716454225252860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183863, "dur": 12, "args": { "External id": 241288, "cbid": 211, "correlation": 241288 } }, { "ph": "s", "id": 241288, "pid": 76337, "tid": -914061504, "ts": 1716454225183863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225183959, "dur": 1, "args": { "External id": 241306, "cbid": 251, "correlation": 241306 } }, { "ph": "f", "id": 241306, "pid": 76337, "tid": -914061504, "ts": 1716454225183959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225252951, "dur": 99, "args": { "External id": 241308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241308, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241308, "pid": 5, "tid": 7, "ts": 1716454225252951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225183965, "dur": 22, "args": { "External id": 241308, "cbid": 211, "correlation": 241308 } }, { "ph": "s", "id": 241308, "pid": 76337, "tid": -914061504, "ts": 1716454225183965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225253052, "dur": 19, "args": { "External id": 241316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241316, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241316, "pid": 5, "tid": 7, "ts": 1716454225253052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184044, "dur": 13, "args": { "External id": 241316, "cbid": 211, "correlation": 241316 } }, { "ph": "s", "id": 241316, "pid": 76337, "tid": -914061504, "ts": 1716454225184044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225253072, "dur": 38, "args": { "External id": 241324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241324, "pid": 5, "tid": 7, "ts": 1716454225253072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184087, "dur": 9, "args": { "External id": 241324, "cbid": 211, "correlation": 241324 } }, { "ph": "s", "id": 241324, "pid": 76337, "tid": -914061504, "ts": 1716454225184087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225253111, "dur": 34, "args": { "External id": 241346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241346, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241346, "pid": 5, "tid": 7, "ts": 1716454225253111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184138, "dur": 10, "args": { "External id": 241346, "cbid": 211, "correlation": 241346 } }, { "ph": "s", "id": 241346, "pid": 76337, "tid": -914061504, "ts": 1716454225184138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225184228, "dur": 1, "args": { "External id": 241362, "cbid": 251, "correlation": 241362 } }, { "ph": "f", "id": 241362, "pid": 76337, "tid": -914061504, "ts": 1716454225184228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225184233, "dur": 0, "args": { "External id": 241364, "cbid": 251, "correlation": 241364 } }, { "ph": "f", "id": 241364, "pid": 76337, "tid": -914061504, "ts": 1716454225184233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225253146, "dur": 531, "args": { "External id": 241365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241365, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 241365, "pid": 5, "tid": 7, "ts": 1716454225253146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184238, "dur": 13, "args": { "External id": 241365, "cbid": 211, "correlation": 241365 } }, { "ph": "s", "id": 241365, "pid": 76337, "tid": -914061504, "ts": 1716454225184238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225253679, "dur": 123, "args": { "External id": 241373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241373, "pid": 5, "tid": 7, "ts": 1716454225253679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184303, "dur": 12, "args": { "External id": 241373, "cbid": 211, "correlation": 241373 } }, { "ph": "s", "id": 241373, "pid": 76337, "tid": -914061504, "ts": 1716454225184303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225253803, "dur": 129, "args": { "External id": 241381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241381, "pid": 5, "tid": 7, "ts": 1716454225253803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184334, "dur": 8, "args": { "External id": 241381, "cbid": 211, "correlation": 241381 } }, { "ph": "s", "id": 241381, "pid": 76337, "tid": -914061504, "ts": 1716454225184334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225184410, "dur": 1, "args": { "External id": 241397, "cbid": 251, "correlation": 241397 } }, { "ph": "f", "id": 241397, "pid": 76337, "tid": -914061504, "ts": 1716454225184410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225253933, "dur": 303, "args": { "External id": 241399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241399, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241399, "pid": 5, "tid": 7, "ts": 1716454225253933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184415, "dur": 13, "args": { "External id": 241399, "cbid": 211, "correlation": 241399 } }, { "ph": "s", "id": 241399, "pid": 76337, "tid": -914061504, "ts": 1716454225184415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225254238, "dur": 28, "args": { "External id": 241407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241407, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241407, "pid": 5, "tid": 7, "ts": 1716454225254238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184458, "dur": 10, "args": { "External id": 241407, "cbid": 211, "correlation": 241407 } }, { "ph": "s", "id": 241407, "pid": 76337, "tid": -914061504, "ts": 1716454225184458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225254267, "dur": 80, "args": { "External id": 241418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241418, "pid": 5, "tid": 7, "ts": 1716454225254267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184525, "dur": 13, "args": { "External id": 241418, "cbid": 211, "correlation": 241418 } }, { "ph": "s", "id": 241418, "pid": 76337, "tid": -914061504, "ts": 1716454225184525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225184589, "dur": 0, "args": { "External id": 241430, "cbid": 317, "correlation": 241430 } }, { "ph": "f", "id": 241430, "pid": 76337, "tid": -914061504, "ts": 1716454225184589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225184590, "dur": 0, "args": { "External id": 241431, "cbid": 203, "correlation": 241431 } }, { "ph": "f", "id": 241431, "pid": 76337, "tid": -914061504, "ts": 1716454225184590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225184590, "dur": 0, "args": { "External id": 241432, "cbid": 205, "correlation": 241432 } }, { "ph": "f", "id": 241432, "pid": 76337, "tid": -914061504, "ts": 1716454225184590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225254348, "dur": 23, "args": { "External id": 241436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241436, "pid": 5, "tid": 7, "ts": 1716454225254348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184605, "dur": 12, "args": { "External id": 241436, "cbid": 211, "correlation": 241436 } }, { "ph": "s", "id": 241436, "pid": 76337, "tid": -914061504, "ts": 1716454225184605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225254373, "dur": 118, "args": { "External id": 241438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241438, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241438, "pid": 5, "tid": 7, "ts": 1716454225254373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184624, "dur": 7, "args": { "External id": 241438, "cbid": 211, "correlation": 241438 } }, { "ph": "s", "id": 241438, "pid": 76337, "tid": -914061504, "ts": 1716454225184624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225254492, "dur": 23, "args": { "External id": 241440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241440, "pid": 5, "tid": 7, "ts": 1716454225254492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184635, "dur": 5, "args": { "External id": 241440, "cbid": 211, "correlation": 241440 } }, { "ph": "s", "id": 241440, "pid": 76337, "tid": -914061504, "ts": 1716454225184635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225254517, "dur": 32, "args": { "External id": 241446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241446, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241446, "pid": 5, "tid": 7, "ts": 1716454225254517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184662, "dur": 8, "args": { "External id": 241446, "cbid": 211, "correlation": 241446 } }, { "ph": "s", "id": 241446, "pid": 76337, "tid": -914061504, "ts": 1716454225184662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225254550, "dur": 27, "args": { "External id": 241454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241454, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241454, "pid": 5, "tid": 7, "ts": 1716454225254550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184694, "dur": 9, "args": { "External id": 241454, "cbid": 211, "correlation": 241454 } }, { "ph": "s", "id": 241454, "pid": 76337, "tid": -914061504, "ts": 1716454225184694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225254578, "dur": 53, "args": { "External id": 241463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241463, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241463, "pid": 5, "tid": 7, "ts": 1716454225254578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184733, "dur": 10, "args": { "External id": 241463, "cbid": 211, "correlation": 241463 } }, { "ph": "s", "id": 241463, "pid": 76337, "tid": -914061504, "ts": 1716454225184733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225254633, "dur": 52, "args": { "External id": 241483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241483, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 241483, "pid": 5, "tid": 7, "ts": 1716454225254633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184803, "dur": 11, "args": { "External id": 241483, "cbid": 211, "correlation": 241483 } }, { "ph": "s", "id": 241483, "pid": 76337, "tid": -914061504, "ts": 1716454225184803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225254686, "dur": 4, "args": { "External id": 241495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241495, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241495, "pid": 5, "tid": 7, "ts": 1716454225254686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184824, "dur": 7, "args": { "External id": 241495, "cbid": 211, "correlation": 241495 } }, { "ph": "s", "id": 241495, "pid": 76337, "tid": -914061504, "ts": 1716454225184824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225254692, "dur": 56, "args": { "External id": 241498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241498, "pid": 5, "tid": 7, "ts": 1716454225254692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184843, "dur": 7, "args": { "External id": 241498, "cbid": 211, "correlation": 241498 } }, { "ph": "s", "id": 241498, "pid": 76337, "tid": -914061504, "ts": 1716454225184843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225254749, "dur": 38, "args": { "External id": 241507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241507, "pid": 5, "tid": 7, "ts": 1716454225254749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184882, "dur": 10, "args": { "External id": 241507, "cbid": 211, "correlation": 241507 } }, { "ph": "s", "id": 241507, "pid": 76337, "tid": -914061504, "ts": 1716454225184882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225184935, "dur": 0, "args": { "External id": 241517, "cbid": 317, "correlation": 241517 } }, { "ph": "f", "id": 241517, "pid": 76337, "tid": -914061504, "ts": 1716454225184935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225184936, "dur": 0, "args": { "External id": 241518, "cbid": 203, "correlation": 241518 } }, { "ph": "f", "id": 241518, "pid": 76337, "tid": -914061504, "ts": 1716454225184936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225184937, "dur": 0, "args": { "External id": 241519, "cbid": 205, "correlation": 241519 } }, { "ph": "f", "id": 241519, "pid": 76337, "tid": -914061504, "ts": 1716454225184937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225254788, "dur": 39, "args": { "External id": 241523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241523, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241523, "pid": 5, "tid": 7, "ts": 1716454225254788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184951, "dur": 11, "args": { "External id": 241523, "cbid": 211, "correlation": 241523 } }, { "ph": "s", "id": 241523, "pid": 76337, "tid": -914061504, "ts": 1716454225184951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225254829, "dur": 81, "args": { "External id": 241525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241525, "pid": 5, "tid": 7, "ts": 1716454225254829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184964, "dur": 5, "args": { "External id": 241525, "cbid": 211, "correlation": 241525 } }, { "ph": "s", "id": 241525, "pid": 76337, "tid": -914061504, "ts": 1716454225184964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225254912, "dur": 1264, "args": { "External id": 241527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241527, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241527, "pid": 5, "tid": 7, "ts": 1716454225254912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184985, "dur": 7, "args": { "External id": 241527, "cbid": 211, "correlation": 241527 } }, { "ph": "s", "id": 241527, "pid": 76337, "tid": -914061504, "ts": 1716454225184985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225256177, "dur": 23, "args": { "External id": 241529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241529, "pid": 5, "tid": 7, "ts": 1716454225256177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225184995, "dur": 5, "args": { "External id": 241529, "cbid": 211, "correlation": 241529 } }, { "ph": "s", "id": 241529, "pid": 76337, "tid": -914061504, "ts": 1716454225184995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225256201, "dur": 33, "args": { "External id": 241535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241535, "pid": 5, "tid": 7, "ts": 1716454225256201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185023, "dur": 9, "args": { "External id": 241535, "cbid": 211, "correlation": 241535 } }, { "ph": "s", "id": 241535, "pid": 76337, "tid": -914061504, "ts": 1716454225185023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225256235, "dur": 4, "args": { "External id": 241543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241543, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 241543, "pid": 5, "tid": 7, "ts": 1716454225256235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185066, "dur": 10, "args": { "External id": 241543, "cbid": 211, "correlation": 241543 } }, { "ph": "s", "id": 241543, "pid": 76337, "tid": -914061504, "ts": 1716454225185066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225185130, "dur": 1, "args": { "External id": 241559, "cbid": 251, "correlation": 241559 } }, { "ph": "f", "id": 241559, "pid": 76337, "tid": -914061504, "ts": 1716454225185130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225185135, "dur": 0, "args": { "External id": 241561, "cbid": 251, "correlation": 241561 } }, { "ph": "f", "id": 241561, "pid": 76337, "tid": -914061504, "ts": 1716454225185135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225256240, "dur": 11, "args": { "External id": 241562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241562, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 241562, "pid": 5, "tid": 7, "ts": 1716454225256240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185137, "dur": 11, "args": { "External id": 241562, "cbid": 211, "correlation": 241562 } }, { "ph": "s", "id": 241562, "pid": 76337, "tid": -914061504, "ts": 1716454225185137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225256253, "dur": 5, "args": { "External id": 241564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241564, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241564, "pid": 5, "tid": 7, "ts": 1716454225256253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185150, "dur": 5, "args": { "External id": 241564, "cbid": 211, "correlation": 241564 } }, { "ph": "s", "id": 241564, "pid": 76337, "tid": -914061504, "ts": 1716454225185150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225256259, "dur": 28, "args": { "External id": 241574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241574, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241574, "pid": 5, "tid": 7, "ts": 1716454225256259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185207, "dur": 12, "args": { "External id": 241574, "cbid": 211, "correlation": 241574 } }, { "ph": "s", "id": 241574, "pid": 76337, "tid": -914061504, "ts": 1716454225185207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225256289, "dur": 31, "args": { "External id": 241594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241594, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 241594, "pid": 5, "tid": 7, "ts": 1716454225256289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185271, "dur": 11, "args": { "External id": 241594, "cbid": 211, "correlation": 241594 } }, { "ph": "s", "id": 241594, "pid": 76337, "tid": -914061504, "ts": 1716454225185271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225256321, "dur": 4, "args": { "External id": 241606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241606, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 241606, "pid": 5, "tid": 7, "ts": 1716454225256321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185291, "dur": 6, "args": { "External id": 241606, "cbid": 211, "correlation": 241606 } }, { "ph": "s", "id": 241606, "pid": 76337, "tid": -914061504, "ts": 1716454225185291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225256326, "dur": 31, "args": { "External id": 241609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241609, "pid": 5, "tid": 7, "ts": 1716454225256326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185310, "dur": 7, "args": { "External id": 241609, "cbid": 211, "correlation": 241609 } }, { "ph": "s", "id": 241609, "pid": 76337, "tid": -914061504, "ts": 1716454225185310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225256358, "dur": 20, "args": { "External id": 241618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241618, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241618, "pid": 5, "tid": 7, "ts": 1716454225256358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185351, "dur": 11, "args": { "External id": 241618, "cbid": 211, "correlation": 241618 } }, { "ph": "s", "id": 241618, "pid": 76337, "tid": -914061504, "ts": 1716454225185351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225185414, "dur": 0, "args": { "External id": 241628, "cbid": 317, "correlation": 241628 } }, { "ph": "f", "id": 241628, "pid": 76337, "tid": -914061504, "ts": 1716454225185414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225185415, "dur": 0, "args": { "External id": 241629, "cbid": 203, "correlation": 241629 } }, { "ph": "f", "id": 241629, "pid": 76337, "tid": -914061504, "ts": 1716454225185415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225185416, "dur": 0, "args": { "External id": 241630, "cbid": 205, "correlation": 241630 } }, { "ph": "f", "id": 241630, "pid": 76337, "tid": -914061504, "ts": 1716454225185416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225256380, "dur": 22, "args": { "External id": 241634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241634, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241634, "pid": 5, "tid": 7, "ts": 1716454225256380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185430, "dur": 12, "args": { "External id": 241634, "cbid": 211, "correlation": 241634 } }, { "ph": "s", "id": 241634, "pid": 76337, "tid": -914061504, "ts": 1716454225185430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225256403, "dur": 43, "args": { "External id": 241636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241636, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241636, "pid": 5, "tid": 7, "ts": 1716454225256403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185445, "dur": 5, "args": { "External id": 241636, "cbid": 211, "correlation": 241636 } }, { "ph": "s", "id": 241636, "pid": 76337, "tid": -914061504, "ts": 1716454225185445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225256448, "dur": 638, "args": { "External id": 241638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241638, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241638, "pid": 5, "tid": 7, "ts": 1716454225256448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185456, "dur": 6, "args": { "External id": 241638, "cbid": 211, "correlation": 241638 } }, { "ph": "s", "id": 241638, "pid": 76337, "tid": -914061504, "ts": 1716454225185456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225257088, "dur": 24, "args": { "External id": 241640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241640, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241640, "pid": 5, "tid": 7, "ts": 1716454225257088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185465, "dur": 5, "args": { "External id": 241640, "cbid": 211, "correlation": 241640 } }, { "ph": "s", "id": 241640, "pid": 76337, "tid": -914061504, "ts": 1716454225185465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225257112, "dur": 32, "args": { "External id": 241646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241646, "pid": 5, "tid": 7, "ts": 1716454225257112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185493, "dur": 8, "args": { "External id": 241646, "cbid": 211, "correlation": 241646 } }, { "ph": "s", "id": 241646, "pid": 76337, "tid": -914061504, "ts": 1716454225185493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225185551, "dur": 0, "args": { "External id": 241656, "cbid": 317, "correlation": 241656 } }, { "ph": "f", "id": 241656, "pid": 76337, "tid": -914061504, "ts": 1716454225185551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225185552, "dur": 0, "args": { "External id": 241657, "cbid": 203, "correlation": 241657 } }, { "ph": "f", "id": 241657, "pid": 76337, "tid": -914061504, "ts": 1716454225185552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225185552, "dur": 0, "args": { "External id": 241658, "cbid": 205, "correlation": 241658 } }, { "ph": "f", "id": 241658, "pid": 76337, "tid": -914061504, "ts": 1716454225185552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225257146, "dur": 38, "args": { "External id": 241662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241662, "pid": 5, "tid": 7, "ts": 1716454225257146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185564, "dur": 12, "args": { "External id": 241662, "cbid": 211, "correlation": 241662 } }, { "ph": "s", "id": 241662, "pid": 76337, "tid": -914061504, "ts": 1716454225185564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225257186, "dur": 186, "args": { "External id": 241664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241664, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241664, "pid": 5, "tid": 7, "ts": 1716454225257186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185582, "dur": 6, "args": { "External id": 241664, "cbid": 211, "correlation": 241664 } }, { "ph": "s", "id": 241664, "pid": 76337, "tid": -914061504, "ts": 1716454225185582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225257373, "dur": 21, "args": { "External id": 241666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241666, "pid": 5, "tid": 7, "ts": 1716454225257373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185591, "dur": 5, "args": { "External id": 241666, "cbid": 211, "correlation": 241666 } }, { "ph": "s", "id": 241666, "pid": 76337, "tid": -914061504, "ts": 1716454225185591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225257395, "dur": 32, "args": { "External id": 241672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241672, "pid": 5, "tid": 7, "ts": 1716454225257395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185617, "dur": 9, "args": { "External id": 241672, "cbid": 211, "correlation": 241672 } }, { "ph": "s", "id": 241672, "pid": 76337, "tid": -914061504, "ts": 1716454225185617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225257428, "dur": 27, "args": { "External id": 241680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241680, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241680, "pid": 5, "tid": 7, "ts": 1716454225257428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185646, "dur": 8, "args": { "External id": 241680, "cbid": 211, "correlation": 241680 } }, { "ph": "s", "id": 241680, "pid": 76337, "tid": -914061504, "ts": 1716454225185646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225257457, "dur": 20, "args": { "External id": 241688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241688, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241688, "pid": 5, "tid": 7, "ts": 1716454225257457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185674, "dur": 8, "args": { "External id": 241688, "cbid": 211, "correlation": 241688 } }, { "ph": "s", "id": 241688, "pid": 76337, "tid": -914061504, "ts": 1716454225185674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225257478, "dur": 29, "args": { "External id": 241708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241708, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 241708, "pid": 5, "tid": 7, "ts": 1716454225257478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185755, "dur": 12, "args": { "External id": 241708, "cbid": 211, "correlation": 241708 } }, { "ph": "s", "id": 241708, "pid": 76337, "tid": -914061504, "ts": 1716454225185755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225257509, "dur": 4, "args": { "External id": 241720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241720, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 241720, "pid": 5, "tid": 7, "ts": 1716454225257509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185777, "dur": 7, "args": { "External id": 241720, "cbid": 211, "correlation": 241720 } }, { "ph": "s", "id": 241720, "pid": 76337, "tid": -914061504, "ts": 1716454225185777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225257514, "dur": 30, "args": { "External id": 241723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241723, "pid": 5, "tid": 7, "ts": 1716454225257514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185795, "dur": 6, "args": { "External id": 241723, "cbid": 211, "correlation": 241723 } }, { "ph": "s", "id": 241723, "pid": 76337, "tid": -914061504, "ts": 1716454225185795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225185852, "dur": 0, "args": { "External id": 241734, "cbid": 317, "correlation": 241734 } }, { "ph": "f", "id": 241734, "pid": 76337, "tid": -914061504, "ts": 1716454225185852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225185853, "dur": 0, "args": { "External id": 241735, "cbid": 203, "correlation": 241735 } }, { "ph": "f", "id": 241735, "pid": 76337, "tid": -914061504, "ts": 1716454225185853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225185854, "dur": 0, "args": { "External id": 241736, "cbid": 205, "correlation": 241736 } }, { "ph": "f", "id": 241736, "pid": 76337, "tid": -914061504, "ts": 1716454225185854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225257545, "dur": 23, "args": { "External id": 241740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241740, "pid": 5, "tid": 7, "ts": 1716454225257545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185866, "dur": 12, "args": { "External id": 241740, "cbid": 211, "correlation": 241740 } }, { "ph": "s", "id": 241740, "pid": 76337, "tid": -914061504, "ts": 1716454225185866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225257569, "dur": 103, "args": { "External id": 241742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241742, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241742, "pid": 5, "tid": 7, "ts": 1716454225257569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185884, "dur": 6, "args": { "External id": 241742, "cbid": 211, "correlation": 241742 } }, { "ph": "s", "id": 241742, "pid": 76337, "tid": -914061504, "ts": 1716454225185884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225257674, "dur": 23, "args": { "External id": 241744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241744, "pid": 5, "tid": 7, "ts": 1716454225257674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185894, "dur": 5, "args": { "External id": 241744, "cbid": 211, "correlation": 241744 } }, { "ph": "s", "id": 241744, "pid": 76337, "tid": -914061504, "ts": 1716454225185894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225257698, "dur": 32, "args": { "External id": 241750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241750, "pid": 5, "tid": 7, "ts": 1716454225257698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225185920, "dur": 9, "args": { "External id": 241750, "cbid": 211, "correlation": 241750 } }, { "ph": "s", "id": 241750, "pid": 76337, "tid": -914061504, "ts": 1716454225185920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225257731, "dur": 184, "args": { "External id": 241759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241759, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241759, "pid": 5, "tid": 7, "ts": 1716454225257731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186010, "dur": 14, "args": { "External id": 241759, "cbid": 211, "correlation": 241759 } }, { "ph": "s", "id": 241759, "pid": 76337, "tid": -914061504, "ts": 1716454225186010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225257917, "dur": 63, "args": { "External id": 241781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241781, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241781, "pid": 5, "tid": 7, "ts": 1716454225257917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186067, "dur": 10, "args": { "External id": 241781, "cbid": 211, "correlation": 241781 } }, { "ph": "s", "id": 241781, "pid": 76337, "tid": -914061504, "ts": 1716454225186067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186154, "dur": 1, "args": { "External id": 241792, "cbid": 251, "correlation": 241792 } }, { "ph": "f", "id": 241792, "pid": 76337, "tid": -914061504, "ts": 1716454225186154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225257981, "dur": 153, "args": { "External id": 241793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241793, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241793, "pid": 5, "tid": 7, "ts": 1716454225257981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186159, "dur": 13, "args": { "External id": 241793, "cbid": 211, "correlation": 241793 } }, { "ph": "s", "id": 241793, "pid": 76337, "tid": -914061504, "ts": 1716454225186159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186229, "dur": 1, "args": { "External id": 241804, "cbid": 251, "correlation": 241804 } }, { "ph": "f", "id": 241804, "pid": 76337, "tid": -914061504, "ts": 1716454225186229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225258136, "dur": 145, "args": { "External id": 241805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241805, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241805, "pid": 5, "tid": 7, "ts": 1716454225258136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186233, "dur": 11, "args": { "External id": 241805, "cbid": 211, "correlation": 241805 } }, { "ph": "s", "id": 241805, "pid": 76337, "tid": -914061504, "ts": 1716454225186233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186297, "dur": 1, "args": { "External id": 241816, "cbid": 251, "correlation": 241816 } }, { "ph": "f", "id": 241816, "pid": 76337, "tid": -914061504, "ts": 1716454225186297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225258283, "dur": 143, "args": { "External id": 241817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241817, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241817, "pid": 5, "tid": 7, "ts": 1716454225258283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186301, "dur": 11, "args": { "External id": 241817, "cbid": 211, "correlation": 241817 } }, { "ph": "s", "id": 241817, "pid": 76337, "tid": -914061504, "ts": 1716454225186301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225258426, "dur": 1905, "args": { "External id": 241838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241838, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 241838, "pid": 5, "tid": 7, "ts": 1716454225258426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186383, "dur": 13, "args": { "External id": 241838, "cbid": 211, "correlation": 241838 } }, { "ph": "s", "id": 241838, "pid": 76337, "tid": -914061504, "ts": 1716454225186383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186480, "dur": 1, "args": { "External id": 241856, "cbid": 251, "correlation": 241856 } }, { "ph": "f", "id": 241856, "pid": 76337, "tid": -914061504, "ts": 1716454225186480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225260333, "dur": 146, "args": { "External id": 241858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241858, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 241858, "pid": 5, "tid": 7, "ts": 1716454225260333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186486, "dur": 13, "args": { "External id": 241858, "cbid": 211, "correlation": 241858 } }, { "ph": "s", "id": 241858, "pid": 76337, "tid": -914061504, "ts": 1716454225186486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225260481, "dur": 36, "args": { "External id": 241866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241866, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241866, "pid": 5, "tid": 7, "ts": 1716454225260481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186555, "dur": 12, "args": { "External id": 241866, "cbid": 211, "correlation": 241866 } }, { "ph": "s", "id": 241866, "pid": 76337, "tid": -914061504, "ts": 1716454225186555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225260518, "dur": 51, "args": { "External id": 241874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241874, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241874, "pid": 5, "tid": 7, "ts": 1716454225260518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186595, "dur": 8, "args": { "External id": 241874, "cbid": 211, "correlation": 241874 } }, { "ph": "s", "id": 241874, "pid": 76337, "tid": -914061504, "ts": 1716454225186595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225260570, "dur": 29, "args": { "External id": 241885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241885, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241885, "pid": 5, "tid": 7, "ts": 1716454225260570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186666, "dur": 13, "args": { "External id": 241885, "cbid": 211, "correlation": 241885 } }, { "ph": "s", "id": 241885, "pid": 76337, "tid": -914061504, "ts": 1716454225186666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225260600, "dur": 34, "args": { "External id": 241907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241907, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241907, "pid": 5, "tid": 7, "ts": 1716454225260600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186698, "dur": 7, "args": { "External id": 241907, "cbid": 211, "correlation": 241907 } }, { "ph": "s", "id": 241907, "pid": 76337, "tid": -914061504, "ts": 1716454225186698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186782, "dur": 1, "args": { "External id": 241918, "cbid": 251, "correlation": 241918 } }, { "ph": "f", "id": 241918, "pid": 76337, "tid": -914061504, "ts": 1716454225186782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225260635, "dur": 88, "args": { "External id": 241919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241919, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241919, "pid": 5, "tid": 7, "ts": 1716454225260635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186787, "dur": 13, "args": { "External id": 241919, "cbid": 211, "correlation": 241919 } }, { "ph": "s", "id": 241919, "pid": 76337, "tid": -914061504, "ts": 1716454225186787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186855, "dur": 1, "args": { "External id": 241930, "cbid": 251, "correlation": 241930 } }, { "ph": "f", "id": 241930, "pid": 76337, "tid": -914061504, "ts": 1716454225186855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186859, "dur": 0, "args": { "External id": 241931, "cbid": 251, "correlation": 241931 } }, { "ph": "f", "id": 241931, "pid": 76337, "tid": -914061504, "ts": 1716454225186859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225260725, "dur": 11, "args": { "External id": 241932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241932, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 241932, "pid": 5, "tid": 7, "ts": 1716454225260725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186861, "dur": 11, "args": { "External id": 241932, "cbid": 211, "correlation": 241932 } }, { "ph": "s", "id": 241932, "pid": 76337, "tid": -914061504, "ts": 1716454225186861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225260737, "dur": 5, "args": { "External id": 241934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241934, "pid": 5, "tid": 7, "ts": 1716454225260737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186874, "dur": 6, "args": { "External id": 241934, "cbid": 211, "correlation": 241934 } }, { "ph": "s", "id": 241934, "pid": 76337, "tid": -914061504, "ts": 1716454225186874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186932, "dur": 1, "args": { "External id": 241945, "cbid": 251, "correlation": 241945 } }, { "ph": "f", "id": 241945, "pid": 76337, "tid": -914061504, "ts": 1716454225186932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225186937, "dur": 0, "args": { "External id": 241946, "cbid": 251, "correlation": 241946 } }, { "ph": "f", "id": 241946, "pid": 76337, "tid": -914061504, "ts": 1716454225186937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225260743, "dur": 7, "args": { "External id": 241947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241947, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 241947, "pid": 5, "tid": 7, "ts": 1716454225260743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186938, "dur": 11, "args": { "External id": 241947, "cbid": 211, "correlation": 241947 } }, { "ph": "s", "id": 241947, "pid": 76337, "tid": -914061504, "ts": 1716454225186938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225260752, "dur": 4, "args": { "External id": 241949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241949, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 241949, "pid": 5, "tid": 7, "ts": 1716454225260752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225186951, "dur": 5, "args": { "External id": 241949, "cbid": 211, "correlation": 241949 } }, { "ph": "s", "id": 241949, "pid": 76337, "tid": -914061504, "ts": 1716454225186951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225260756, "dur": 89, "args": { "External id": 241970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241970, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 241970, "pid": 5, "tid": 7, "ts": 1716454225260756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187033, "dur": 13, "args": { "External id": 241970, "cbid": 211, "correlation": 241970 } }, { "ph": "s", "id": 241970, "pid": 76337, "tid": -914061504, "ts": 1716454225187033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225187129, "dur": 1, "args": { "External id": 241988, "cbid": 251, "correlation": 241988 } }, { "ph": "f", "id": 241988, "pid": 76337, "tid": -914061504, "ts": 1716454225187129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225260847, "dur": 94, "args": { "External id": 241990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241990, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 241990, "pid": 5, "tid": 7, "ts": 1716454225260847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187136, "dur": 13, "args": { "External id": 241990, "cbid": 211, "correlation": 241990 } }, { "ph": "s", "id": 241990, "pid": 76337, "tid": -914061504, "ts": 1716454225187136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225260942, "dur": 19, "args": { "External id": 241998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 241998, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 241998, "pid": 5, "tid": 7, "ts": 1716454225260942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187204, "dur": 12, "args": { "External id": 241998, "cbid": 211, "correlation": 241998 } }, { "ph": "s", "id": 241998, "pid": 76337, "tid": -914061504, "ts": 1716454225187204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225260962, "dur": 37, "args": { "External id": 242006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242006, "pid": 5, "tid": 7, "ts": 1716454225260962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187245, "dur": 9, "args": { "External id": 242006, "cbid": 211, "correlation": 242006 } }, { "ph": "s", "id": 242006, "pid": 76337, "tid": -914061504, "ts": 1716454225187245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225261001, "dur": 34, "args": { "External id": 242028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242028, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242028, "pid": 5, "tid": 7, "ts": 1716454225261001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187296, "dur": 10, "args": { "External id": 242028, "cbid": 211, "correlation": 242028 } }, { "ph": "s", "id": 242028, "pid": 76337, "tid": -914061504, "ts": 1716454225187296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225187385, "dur": 1, "args": { "External id": 242044, "cbid": 251, "correlation": 242044 } }, { "ph": "f", "id": 242044, "pid": 76337, "tid": -914061504, "ts": 1716454225187385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225187390, "dur": 0, "args": { "External id": 242046, "cbid": 251, "correlation": 242046 } }, { "ph": "f", "id": 242046, "pid": 76337, "tid": -914061504, "ts": 1716454225187390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225261036, "dur": 529, "args": { "External id": 242047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242047, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 242047, "pid": 5, "tid": 7, "ts": 1716454225261036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187394, "dur": 13, "args": { "External id": 242047, "cbid": 211, "correlation": 242047 } }, { "ph": "s", "id": 242047, "pid": 76337, "tid": -914061504, "ts": 1716454225187394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225261566, "dur": 124, "args": { "External id": 242055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242055, "pid": 5, "tid": 7, "ts": 1716454225261566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187458, "dur": 12, "args": { "External id": 242055, "cbid": 211, "correlation": 242055 } }, { "ph": "s", "id": 242055, "pid": 76337, "tid": -914061504, "ts": 1716454225187458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225261692, "dur": 129, "args": { "External id": 242063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242063, "pid": 5, "tid": 7, "ts": 1716454225261692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187489, "dur": 8, "args": { "External id": 242063, "cbid": 211, "correlation": 242063 } }, { "ph": "s", "id": 242063, "pid": 76337, "tid": -914061504, "ts": 1716454225187489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225187565, "dur": 1, "args": { "External id": 242079, "cbid": 251, "correlation": 242079 } }, { "ph": "f", "id": 242079, "pid": 76337, "tid": -914061504, "ts": 1716454225187565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225261822, "dur": 303, "args": { "External id": 242081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242081, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242081, "pid": 5, "tid": 7, "ts": 1716454225261822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187571, "dur": 13, "args": { "External id": 242081, "cbid": 211, "correlation": 242081 } }, { "ph": "s", "id": 242081, "pid": 76337, "tid": -914061504, "ts": 1716454225187571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225262126, "dur": 27, "args": { "External id": 242089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242089, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242089, "pid": 5, "tid": 7, "ts": 1716454225262126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187613, "dur": 10, "args": { "External id": 242089, "cbid": 211, "correlation": 242089 } }, { "ph": "s", "id": 242089, "pid": 76337, "tid": -914061504, "ts": 1716454225187613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225262154, "dur": 79, "args": { "External id": 242100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242100, "pid": 5, "tid": 7, "ts": 1716454225262154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187680, "dur": 13, "args": { "External id": 242100, "cbid": 211, "correlation": 242100 } }, { "ph": "s", "id": 242100, "pid": 76337, "tid": -914061504, "ts": 1716454225187680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225187744, "dur": 0, "args": { "External id": 242112, "cbid": 317, "correlation": 242112 } }, { "ph": "f", "id": 242112, "pid": 76337, "tid": -914061504, "ts": 1716454225187744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225187745, "dur": 0, "args": { "External id": 242113, "cbid": 203, "correlation": 242113 } }, { "ph": "f", "id": 242113, "pid": 76337, "tid": -914061504, "ts": 1716454225187745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225187746, "dur": 0, "args": { "External id": 242114, "cbid": 205, "correlation": 242114 } }, { "ph": "f", "id": 242114, "pid": 76337, "tid": -914061504, "ts": 1716454225187746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225262235, "dur": 24, "args": { "External id": 242118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242118, "pid": 5, "tid": 7, "ts": 1716454225262235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187760, "dur": 12, "args": { "External id": 242118, "cbid": 211, "correlation": 242118 } }, { "ph": "s", "id": 242118, "pid": 76337, "tid": -914061504, "ts": 1716454225187760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225262260, "dur": 118, "args": { "External id": 242120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242120, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242120, "pid": 5, "tid": 7, "ts": 1716454225262260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187778, "dur": 7, "args": { "External id": 242120, "cbid": 211, "correlation": 242120 } }, { "ph": "s", "id": 242120, "pid": 76337, "tid": -914061504, "ts": 1716454225187778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225262380, "dur": 23, "args": { "External id": 242122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242122, "pid": 5, "tid": 7, "ts": 1716454225262380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187789, "dur": 5, "args": { "External id": 242122, "cbid": 211, "correlation": 242122 } }, { "ph": "s", "id": 242122, "pid": 76337, "tid": -914061504, "ts": 1716454225187789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225262405, "dur": 33, "args": { "External id": 242128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242128, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242128, "pid": 5, "tid": 7, "ts": 1716454225262405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187816, "dur": 8, "args": { "External id": 242128, "cbid": 211, "correlation": 242128 } }, { "ph": "s", "id": 242128, "pid": 76337, "tid": -914061504, "ts": 1716454225187816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225262438, "dur": 27, "args": { "External id": 242136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242136, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242136, "pid": 5, "tid": 7, "ts": 1716454225262438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187848, "dur": 8, "args": { "External id": 242136, "cbid": 211, "correlation": 242136 } }, { "ph": "s", "id": 242136, "pid": 76337, "tid": -914061504, "ts": 1716454225187848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225262467, "dur": 45, "args": { "External id": 242145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242145, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242145, "pid": 5, "tid": 7, "ts": 1716454225262467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187887, "dur": 10, "args": { "External id": 242145, "cbid": 211, "correlation": 242145 } }, { "ph": "s", "id": 242145, "pid": 76337, "tid": -914061504, "ts": 1716454225187887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225262513, "dur": 42, "args": { "External id": 242165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242165, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 242165, "pid": 5, "tid": 7, "ts": 1716454225262513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187960, "dur": 11, "args": { "External id": 242165, "cbid": 211, "correlation": 242165 } }, { "ph": "s", "id": 242165, "pid": 76337, "tid": -914061504, "ts": 1716454225187960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225262557, "dur": 5, "args": { "External id": 242177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242177, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 242177, "pid": 5, "tid": 7, "ts": 1716454225262557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225187990, "dur": 7, "args": { "External id": 242177, "cbid": 211, "correlation": 242177 } }, { "ph": "s", "id": 242177, "pid": 76337, "tid": -914061504, "ts": 1716454225187990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225262563, "dur": 43, "args": { "External id": 242180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242180, "pid": 5, "tid": 7, "ts": 1716454225262563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188010, "dur": 7, "args": { "External id": 242180, "cbid": 211, "correlation": 242180 } }, { "ph": "s", "id": 242180, "pid": 76337, "tid": -914061504, "ts": 1716454225188010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225262608, "dur": 28, "args": { "External id": 242189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242189, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242189, "pid": 5, "tid": 7, "ts": 1716454225262608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188050, "dur": 10, "args": { "External id": 242189, "cbid": 211, "correlation": 242189 } }, { "ph": "s", "id": 242189, "pid": 76337, "tid": -914061504, "ts": 1716454225188050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225188101, "dur": 0, "args": { "External id": 242199, "cbid": 317, "correlation": 242199 } }, { "ph": "f", "id": 242199, "pid": 76337, "tid": -914061504, "ts": 1716454225188101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225188102, "dur": 0, "args": { "External id": 242200, "cbid": 203, "correlation": 242200 } }, { "ph": "f", "id": 242200, "pid": 76337, "tid": -914061504, "ts": 1716454225188102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225188103, "dur": 0, "args": { "External id": 242201, "cbid": 205, "correlation": 242201 } }, { "ph": "f", "id": 242201, "pid": 76337, "tid": -914061504, "ts": 1716454225188103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225262637, "dur": 31, "args": { "External id": 242205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242205, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242205, "pid": 5, "tid": 7, "ts": 1716454225262637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188118, "dur": 11, "args": { "External id": 242205, "cbid": 211, "correlation": 242205 } }, { "ph": "s", "id": 242205, "pid": 76337, "tid": -914061504, "ts": 1716454225188118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225262670, "dur": 63, "args": { "External id": 242207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242207, "pid": 5, "tid": 7, "ts": 1716454225262670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188132, "dur": 6, "args": { "External id": 242207, "cbid": 211, "correlation": 242207 } }, { "ph": "s", "id": 242207, "pid": 76337, "tid": -914061504, "ts": 1716454225188132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225262733, "dur": 957, "args": { "External id": 242209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242209, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242209, "pid": 5, "tid": 7, "ts": 1716454225262733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188144, "dur": 6, "args": { "External id": 242209, "cbid": 211, "correlation": 242209 } }, { "ph": "s", "id": 242209, "pid": 76337, "tid": -914061504, "ts": 1716454225188144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225263692, "dur": 21, "args": { "External id": 242211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242211, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242211, "pid": 5, "tid": 7, "ts": 1716454225263692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188153, "dur": 5, "args": { "External id": 242211, "cbid": 211, "correlation": 242211 } }, { "ph": "s", "id": 242211, "pid": 76337, "tid": -914061504, "ts": 1716454225188153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225263714, "dur": 32, "args": { "External id": 242217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242217, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242217, "pid": 5, "tid": 7, "ts": 1716454225263714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188181, "dur": 9, "args": { "External id": 242217, "cbid": 211, "correlation": 242217 } }, { "ph": "s", "id": 242217, "pid": 76337, "tid": -914061504, "ts": 1716454225188181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225263747, "dur": 3, "args": { "External id": 242225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242225, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 242225, "pid": 5, "tid": 7, "ts": 1716454225263747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188225, "dur": 10, "args": { "External id": 242225, "cbid": 211, "correlation": 242225 } }, { "ph": "s", "id": 242225, "pid": 76337, "tid": -914061504, "ts": 1716454225188225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225188289, "dur": 1, "args": { "External id": 242241, "cbid": 251, "correlation": 242241 } }, { "ph": "f", "id": 242241, "pid": 76337, "tid": -914061504, "ts": 1716454225188289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225188294, "dur": 0, "args": { "External id": 242243, "cbid": 251, "correlation": 242243 } }, { "ph": "f", "id": 242243, "pid": 76337, "tid": -914061504, "ts": 1716454225188294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225263752, "dur": 12, "args": { "External id": 242244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242244, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 242244, "pid": 5, "tid": 7, "ts": 1716454225263752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188296, "dur": 11, "args": { "External id": 242244, "cbid": 211, "correlation": 242244 } }, { "ph": "s", "id": 242244, "pid": 76337, "tid": -914061504, "ts": 1716454225188296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225263765, "dur": 5, "args": { "External id": 242246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242246, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 242246, "pid": 5, "tid": 7, "ts": 1716454225263765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188309, "dur": 6, "args": { "External id": 242246, "cbid": 211, "correlation": 242246 } }, { "ph": "s", "id": 242246, "pid": 76337, "tid": -914061504, "ts": 1716454225188309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225263771, "dur": 29, "args": { "External id": 242256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242256, "pid": 5, "tid": 7, "ts": 1716454225263771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188366, "dur": 12, "args": { "External id": 242256, "cbid": 211, "correlation": 242256 } }, { "ph": "s", "id": 242256, "pid": 76337, "tid": -914061504, "ts": 1716454225188366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225263802, "dur": 32, "args": { "External id": 242276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242276, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 242276, "pid": 5, "tid": 7, "ts": 1716454225263802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188430, "dur": 11, "args": { "External id": 242276, "cbid": 211, "correlation": 242276 } }, { "ph": "s", "id": 242276, "pid": 76337, "tid": -914061504, "ts": 1716454225188430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225263835, "dur": 4, "args": { "External id": 242288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242288, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 242288, "pid": 5, "tid": 7, "ts": 1716454225263835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188451, "dur": 6, "args": { "External id": 242288, "cbid": 211, "correlation": 242288 } }, { "ph": "s", "id": 242288, "pid": 76337, "tid": -914061504, "ts": 1716454225188451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225263840, "dur": 29, "args": { "External id": 242291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242291, "pid": 5, "tid": 7, "ts": 1716454225263840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188469, "dur": 7, "args": { "External id": 242291, "cbid": 211, "correlation": 242291 } }, { "ph": "s", "id": 242291, "pid": 76337, "tid": -914061504, "ts": 1716454225188469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225263870, "dur": 21, "args": { "External id": 242300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242300, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242300, "pid": 5, "tid": 7, "ts": 1716454225263870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188510, "dur": 10, "args": { "External id": 242300, "cbid": 211, "correlation": 242300 } }, { "ph": "s", "id": 242300, "pid": 76337, "tid": -914061504, "ts": 1716454225188510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225188571, "dur": 0, "args": { "External id": 242310, "cbid": 317, "correlation": 242310 } }, { "ph": "f", "id": 242310, "pid": 76337, "tid": -914061504, "ts": 1716454225188571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225188572, "dur": 0, "args": { "External id": 242311, "cbid": 203, "correlation": 242311 } }, { "ph": "f", "id": 242311, "pid": 76337, "tid": -914061504, "ts": 1716454225188572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225188573, "dur": 0, "args": { "External id": 242312, "cbid": 205, "correlation": 242312 } }, { "ph": "f", "id": 242312, "pid": 76337, "tid": -914061504, "ts": 1716454225188573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225263893, "dur": 22, "args": { "External id": 242316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242316, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242316, "pid": 5, "tid": 7, "ts": 1716454225263893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188587, "dur": 12, "args": { "External id": 242316, "cbid": 211, "correlation": 242316 } }, { "ph": "s", "id": 242316, "pid": 76337, "tid": -914061504, "ts": 1716454225188587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225263916, "dur": 44, "args": { "External id": 242318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242318, "pid": 5, "tid": 7, "ts": 1716454225263916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188601, "dur": 5, "args": { "External id": 242318, "cbid": 211, "correlation": 242318 } }, { "ph": "s", "id": 242318, "pid": 76337, "tid": -914061504, "ts": 1716454225188601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225263961, "dur": 638, "args": { "External id": 242320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242320, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242320, "pid": 5, "tid": 7, "ts": 1716454225263961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188612, "dur": 6, "args": { "External id": 242320, "cbid": 211, "correlation": 242320 } }, { "ph": "s", "id": 242320, "pid": 76337, "tid": -914061504, "ts": 1716454225188612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225264600, "dur": 21, "args": { "External id": 242322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242322, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242322, "pid": 5, "tid": 7, "ts": 1716454225264600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188622, "dur": 5, "args": { "External id": 242322, "cbid": 211, "correlation": 242322 } }, { "ph": "s", "id": 242322, "pid": 76337, "tid": -914061504, "ts": 1716454225188622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225264623, "dur": 33, "args": { "External id": 242328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242328, "pid": 5, "tid": 7, "ts": 1716454225264623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188650, "dur": 8, "args": { "External id": 242328, "cbid": 211, "correlation": 242328 } }, { "ph": "s", "id": 242328, "pid": 76337, "tid": -914061504, "ts": 1716454225188650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225188707, "dur": 0, "args": { "External id": 242338, "cbid": 317, "correlation": 242338 } }, { "ph": "f", "id": 242338, "pid": 76337, "tid": -914061504, "ts": 1716454225188707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225188708, "dur": 0, "args": { "External id": 242339, "cbid": 203, "correlation": 242339 } }, { "ph": "f", "id": 242339, "pid": 76337, "tid": -914061504, "ts": 1716454225188708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225188709, "dur": 0, "args": { "External id": 242340, "cbid": 205, "correlation": 242340 } }, { "ph": "f", "id": 242340, "pid": 76337, "tid": -914061504, "ts": 1716454225188709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225264657, "dur": 29, "args": { "External id": 242344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242344, "pid": 5, "tid": 7, "ts": 1716454225264657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188722, "dur": 12, "args": { "External id": 242344, "cbid": 211, "correlation": 242344 } }, { "ph": "s", "id": 242344, "pid": 76337, "tid": -914061504, "ts": 1716454225188722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225264688, "dur": 149, "args": { "External id": 242346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242346, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242346, "pid": 5, "tid": 7, "ts": 1716454225264688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188740, "dur": 6, "args": { "External id": 242346, "cbid": 211, "correlation": 242346 } }, { "ph": "s", "id": 242346, "pid": 76337, "tid": -914061504, "ts": 1716454225188740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225264838, "dur": 22, "args": { "External id": 242348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242348, "pid": 5, "tid": 7, "ts": 1716454225264838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188750, "dur": 5, "args": { "External id": 242348, "cbid": 211, "correlation": 242348 } }, { "ph": "s", "id": 242348, "pid": 76337, "tid": -914061504, "ts": 1716454225188750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225264862, "dur": 32, "args": { "External id": 242354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242354, "pid": 5, "tid": 7, "ts": 1716454225264862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188775, "dur": 9, "args": { "External id": 242354, "cbid": 211, "correlation": 242354 } }, { "ph": "s", "id": 242354, "pid": 76337, "tid": -914061504, "ts": 1716454225188775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225264895, "dur": 27, "args": { "External id": 242362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242362, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242362, "pid": 5, "tid": 7, "ts": 1716454225264895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188804, "dur": 7, "args": { "External id": 242362, "cbid": 211, "correlation": 242362 } }, { "ph": "s", "id": 242362, "pid": 76337, "tid": -914061504, "ts": 1716454225188804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225264924, "dur": 19, "args": { "External id": 242370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242370, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242370, "pid": 5, "tid": 7, "ts": 1716454225264924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188833, "dur": 8, "args": { "External id": 242370, "cbid": 211, "correlation": 242370 } }, { "ph": "s", "id": 242370, "pid": 76337, "tid": -914061504, "ts": 1716454225188833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225264944, "dur": 30, "args": { "External id": 242390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242390, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 242390, "pid": 5, "tid": 7, "ts": 1716454225264944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188914, "dur": 12, "args": { "External id": 242390, "cbid": 211, "correlation": 242390 } }, { "ph": "s", "id": 242390, "pid": 76337, "tid": -914061504, "ts": 1716454225188914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225264976, "dur": 4, "args": { "External id": 242402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242402, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 242402, "pid": 5, "tid": 7, "ts": 1716454225264976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188938, "dur": 6, "args": { "External id": 242402, "cbid": 211, "correlation": 242402 } }, { "ph": "s", "id": 242402, "pid": 76337, "tid": -914061504, "ts": 1716454225188938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225264981, "dur": 30, "args": { "External id": 242405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242405, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242405, "pid": 5, "tid": 7, "ts": 1716454225264981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225188955, "dur": 7, "args": { "External id": 242405, "cbid": 211, "correlation": 242405 } }, { "ph": "s", "id": 242405, "pid": 76337, "tid": -914061504, "ts": 1716454225188955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225189025, "dur": 0, "args": { "External id": 242416, "cbid": 317, "correlation": 242416 } }, { "ph": "f", "id": 242416, "pid": 76337, "tid": -914061504, "ts": 1716454225189025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225189026, "dur": 0, "args": { "External id": 242417, "cbid": 203, "correlation": 242417 } }, { "ph": "f", "id": 242417, "pid": 76337, "tid": -914061504, "ts": 1716454225189026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225189026, "dur": 0, "args": { "External id": 242418, "cbid": 205, "correlation": 242418 } }, { "ph": "f", "id": 242418, "pid": 76337, "tid": -914061504, "ts": 1716454225189026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225265013, "dur": 22, "args": { "External id": 242422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242422, "pid": 5, "tid": 7, "ts": 1716454225265013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189040, "dur": 12, "args": { "External id": 242422, "cbid": 211, "correlation": 242422 } }, { "ph": "s", "id": 242422, "pid": 76337, "tid": -914061504, "ts": 1716454225189040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225265036, "dur": 102, "args": { "External id": 242424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242424, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242424, "pid": 5, "tid": 7, "ts": 1716454225265036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189059, "dur": 6, "args": { "External id": 242424, "cbid": 211, "correlation": 242424 } }, { "ph": "s", "id": 242424, "pid": 76337, "tid": -914061504, "ts": 1716454225189059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225265140, "dur": 22, "args": { "External id": 242426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242426, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242426, "pid": 5, "tid": 7, "ts": 1716454225265140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189068, "dur": 5, "args": { "External id": 242426, "cbid": 211, "correlation": 242426 } }, { "ph": "s", "id": 242426, "pid": 76337, "tid": -914061504, "ts": 1716454225189068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225265163, "dur": 32, "args": { "External id": 242432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242432, "pid": 5, "tid": 7, "ts": 1716454225265163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189096, "dur": 8, "args": { "External id": 242432, "cbid": 211, "correlation": 242432 } }, { "ph": "s", "id": 242432, "pid": 76337, "tid": -914061504, "ts": 1716454225189096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225265197, "dur": 174, "args": { "External id": 242441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242441, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242441, "pid": 5, "tid": 7, "ts": 1716454225265197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189178, "dur": 15, "args": { "External id": 242441, "cbid": 211, "correlation": 242441 } }, { "ph": "s", "id": 242441, "pid": 76337, "tid": -914061504, "ts": 1716454225189178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225265372, "dur": 63, "args": { "External id": 242463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242463, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242463, "pid": 5, "tid": 7, "ts": 1716454225265372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189235, "dur": 10, "args": { "External id": 242463, "cbid": 211, "correlation": 242463 } }, { "ph": "s", "id": 242463, "pid": 76337, "tid": -914061504, "ts": 1716454225189235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225189320, "dur": 1, "args": { "External id": 242474, "cbid": 251, "correlation": 242474 } }, { "ph": "f", "id": 242474, "pid": 76337, "tid": -914061504, "ts": 1716454225189320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225265437, "dur": 150, "args": { "External id": 242475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242475, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242475, "pid": 5, "tid": 7, "ts": 1716454225265437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189325, "dur": 14, "args": { "External id": 242475, "cbid": 211, "correlation": 242475 } }, { "ph": "s", "id": 242475, "pid": 76337, "tid": -914061504, "ts": 1716454225189325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225189395, "dur": 1, "args": { "External id": 242486, "cbid": 251, "correlation": 242486 } }, { "ph": "f", "id": 242486, "pid": 76337, "tid": -914061504, "ts": 1716454225189395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225265588, "dur": 143, "args": { "External id": 242487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242487, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242487, "pid": 5, "tid": 7, "ts": 1716454225265588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189399, "dur": 11, "args": { "External id": 242487, "cbid": 211, "correlation": 242487 } }, { "ph": "s", "id": 242487, "pid": 76337, "tid": -914061504, "ts": 1716454225189399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225189463, "dur": 1, "args": { "External id": 242498, "cbid": 251, "correlation": 242498 } }, { "ph": "f", "id": 242498, "pid": 76337, "tid": -914061504, "ts": 1716454225189463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225265733, "dur": 144, "args": { "External id": 242499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242499, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242499, "pid": 5, "tid": 7, "ts": 1716454225265733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189467, "dur": 11, "args": { "External id": 242499, "cbid": 211, "correlation": 242499 } }, { "ph": "s", "id": 242499, "pid": 76337, "tid": -914061504, "ts": 1716454225189467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225265878, "dur": 1905, "args": { "External id": 242520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242520, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 242520, "pid": 5, "tid": 7, "ts": 1716454225265878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189546, "dur": 13, "args": { "External id": 242520, "cbid": 211, "correlation": 242520 } }, { "ph": "s", "id": 242520, "pid": 76337, "tid": -914061504, "ts": 1716454225189546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225189645, "dur": 1, "args": { "External id": 242538, "cbid": 251, "correlation": 242538 } }, { "ph": "f", "id": 242538, "pid": 76337, "tid": -914061504, "ts": 1716454225189645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225267785, "dur": 148, "args": { "External id": 242540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242540, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 242540, "pid": 5, "tid": 7, "ts": 1716454225267785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189650, "dur": 13, "args": { "External id": 242540, "cbid": 211, "correlation": 242540 } }, { "ph": "s", "id": 242540, "pid": 76337, "tid": -914061504, "ts": 1716454225189650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225267934, "dur": 35, "args": { "External id": 242548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242548, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242548, "pid": 5, "tid": 7, "ts": 1716454225267934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189719, "dur": 13, "args": { "External id": 242548, "cbid": 211, "correlation": 242548 } }, { "ph": "s", "id": 242548, "pid": 76337, "tid": -914061504, "ts": 1716454225189719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225267971, "dur": 50, "args": { "External id": 242556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242556, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242556, "pid": 5, "tid": 7, "ts": 1716454225267971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189759, "dur": 8, "args": { "External id": 242556, "cbid": 211, "correlation": 242556 } }, { "ph": "s", "id": 242556, "pid": 76337, "tid": -914061504, "ts": 1716454225189759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225268022, "dur": 31, "args": { "External id": 242567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242567, "pid": 5, "tid": 7, "ts": 1716454225268022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189830, "dur": 12, "args": { "External id": 242567, "cbid": 211, "correlation": 242567 } }, { "ph": "s", "id": 242567, "pid": 76337, "tid": -914061504, "ts": 1716454225189830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225268054, "dur": 34, "args": { "External id": 242589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242589, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242589, "pid": 5, "tid": 7, "ts": 1716454225268054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189860, "dur": 7, "args": { "External id": 242589, "cbid": 211, "correlation": 242589 } }, { "ph": "s", "id": 242589, "pid": 76337, "tid": -914061504, "ts": 1716454225189860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225189945, "dur": 1, "args": { "External id": 242600, "cbid": 251, "correlation": 242600 } }, { "ph": "f", "id": 242600, "pid": 76337, "tid": -914061504, "ts": 1716454225189945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225268089, "dur": 88, "args": { "External id": 242601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242601, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242601, "pid": 5, "tid": 7, "ts": 1716454225268089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225189950, "dur": 12, "args": { "External id": 242601, "cbid": 211, "correlation": 242601 } }, { "ph": "s", "id": 242601, "pid": 76337, "tid": -914061504, "ts": 1716454225189950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190030, "dur": 1, "args": { "External id": 242612, "cbid": 251, "correlation": 242612 } }, { "ph": "f", "id": 242612, "pid": 76337, "tid": -914061504, "ts": 1716454225190030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190033, "dur": 0, "args": { "External id": 242613, "cbid": 251, "correlation": 242613 } }, { "ph": "f", "id": 242613, "pid": 76337, "tid": -914061504, "ts": 1716454225190033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225268179, "dur": 11, "args": { "External id": 242614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242614, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 242614, "pid": 5, "tid": 7, "ts": 1716454225268179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190035, "dur": 13, "args": { "External id": 242614, "cbid": 211, "correlation": 242614 } }, { "ph": "s", "id": 242614, "pid": 76337, "tid": -914061504, "ts": 1716454225190035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225268191, "dur": 5, "args": { "External id": 242616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242616, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 242616, "pid": 5, "tid": 7, "ts": 1716454225268191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190049, "dur": 6, "args": { "External id": 242616, "cbid": 211, "correlation": 242616 } }, { "ph": "s", "id": 242616, "pid": 76337, "tid": -914061504, "ts": 1716454225190049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190106, "dur": 1, "args": { "External id": 242627, "cbid": 251, "correlation": 242627 } }, { "ph": "f", "id": 242627, "pid": 76337, "tid": -914061504, "ts": 1716454225190106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190110, "dur": 0, "args": { "External id": 242628, "cbid": 251, "correlation": 242628 } }, { "ph": "f", "id": 242628, "pid": 76337, "tid": -914061504, "ts": 1716454225190110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225268197, "dur": 7, "args": { "External id": 242629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242629, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 242629, "pid": 5, "tid": 7, "ts": 1716454225268197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190111, "dur": 11, "args": { "External id": 242629, "cbid": 211, "correlation": 242629 } }, { "ph": "s", "id": 242629, "pid": 76337, "tid": -914061504, "ts": 1716454225190111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225268206, "dur": 4, "args": { "External id": 242631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242631, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 242631, "pid": 5, "tid": 7, "ts": 1716454225268206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190124, "dur": 6, "args": { "External id": 242631, "cbid": 211, "correlation": 242631 } }, { "ph": "s", "id": 242631, "pid": 76337, "tid": -914061504, "ts": 1716454225190124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225268211, "dur": 90, "args": { "External id": 242652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242652, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 242652, "pid": 5, "tid": 7, "ts": 1716454225268211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190198, "dur": 12, "args": { "External id": 242652, "cbid": 211, "correlation": 242652 } }, { "ph": "s", "id": 242652, "pid": 76337, "tid": -914061504, "ts": 1716454225190198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190296, "dur": 1, "args": { "External id": 242670, "cbid": 251, "correlation": 242670 } }, { "ph": "f", "id": 242670, "pid": 76337, "tid": -914061504, "ts": 1716454225190296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225268302, "dur": 96, "args": { "External id": 242672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242672, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242672, "pid": 5, "tid": 7, "ts": 1716454225268302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190302, "dur": 13, "args": { "External id": 242672, "cbid": 211, "correlation": 242672 } }, { "ph": "s", "id": 242672, "pid": 76337, "tid": -914061504, "ts": 1716454225190302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225268400, "dur": 20, "args": { "External id": 242680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242680, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242680, "pid": 5, "tid": 7, "ts": 1716454225268400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190370, "dur": 13, "args": { "External id": 242680, "cbid": 211, "correlation": 242680 } }, { "ph": "s", "id": 242680, "pid": 76337, "tid": -914061504, "ts": 1716454225190370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225268421, "dur": 37, "args": { "External id": 242688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242688, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242688, "pid": 5, "tid": 7, "ts": 1716454225268421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190411, "dur": 9, "args": { "External id": 242688, "cbid": 211, "correlation": 242688 } }, { "ph": "s", "id": 242688, "pid": 76337, "tid": -914061504, "ts": 1716454225190411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225268459, "dur": 34, "args": { "External id": 242710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242710, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242710, "pid": 5, "tid": 7, "ts": 1716454225268459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190462, "dur": 11, "args": { "External id": 242710, "cbid": 211, "correlation": 242710 } }, { "ph": "s", "id": 242710, "pid": 76337, "tid": -914061504, "ts": 1716454225190462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190551, "dur": 1, "args": { "External id": 242726, "cbid": 251, "correlation": 242726 } }, { "ph": "f", "id": 242726, "pid": 76337, "tid": -914061504, "ts": 1716454225190551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190556, "dur": 0, "args": { "External id": 242728, "cbid": 251, "correlation": 242728 } }, { "ph": "f", "id": 242728, "pid": 76337, "tid": -914061504, "ts": 1716454225190556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225268494, "dur": 529, "args": { "External id": 242729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242729, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 242729, "pid": 5, "tid": 7, "ts": 1716454225268494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190559, "dur": 13, "args": { "External id": 242729, "cbid": 211, "correlation": 242729 } }, { "ph": "s", "id": 242729, "pid": 76337, "tid": -914061504, "ts": 1716454225190559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225269024, "dur": 124, "args": { "External id": 242737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242737, "pid": 5, "tid": 7, "ts": 1716454225269024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190625, "dur": 12, "args": { "External id": 242737, "cbid": 211, "correlation": 242737 } }, { "ph": "s", "id": 242737, "pid": 76337, "tid": -914061504, "ts": 1716454225190625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225269149, "dur": 128, "args": { "External id": 242745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242745, "pid": 5, "tid": 7, "ts": 1716454225269149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190654, "dur": 8, "args": { "External id": 242745, "cbid": 211, "correlation": 242745 } }, { "ph": "s", "id": 242745, "pid": 76337, "tid": -914061504, "ts": 1716454225190654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225190730, "dur": 1, "args": { "External id": 242761, "cbid": 251, "correlation": 242761 } }, { "ph": "f", "id": 242761, "pid": 76337, "tid": -914061504, "ts": 1716454225190730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225269279, "dur": 303, "args": { "External id": 242763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242763, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242763, "pid": 5, "tid": 7, "ts": 1716454225269279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190736, "dur": 12, "args": { "External id": 242763, "cbid": 211, "correlation": 242763 } }, { "ph": "s", "id": 242763, "pid": 76337, "tid": -914061504, "ts": 1716454225190736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225269584, "dur": 27, "args": { "External id": 242771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242771, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242771, "pid": 5, "tid": 7, "ts": 1716454225269584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190778, "dur": 10, "args": { "External id": 242771, "cbid": 211, "correlation": 242771 } }, { "ph": "s", "id": 242771, "pid": 76337, "tid": -914061504, "ts": 1716454225190778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225269612, "dur": 80, "args": { "External id": 242782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242782, "pid": 5, "tid": 7, "ts": 1716454225269612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190845, "dur": 12, "args": { "External id": 242782, "cbid": 211, "correlation": 242782 } }, { "ph": "s", "id": 242782, "pid": 76337, "tid": -914061504, "ts": 1716454225190845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225190908, "dur": 0, "args": { "External id": 242794, "cbid": 317, "correlation": 242794 } }, { "ph": "f", "id": 242794, "pid": 76337, "tid": -914061504, "ts": 1716454225190908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225190909, "dur": 0, "args": { "External id": 242795, "cbid": 203, "correlation": 242795 } }, { "ph": "f", "id": 242795, "pid": 76337, "tid": -914061504, "ts": 1716454225190909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225190910, "dur": 0, "args": { "External id": 242796, "cbid": 205, "correlation": 242796 } }, { "ph": "f", "id": 242796, "pid": 76337, "tid": -914061504, "ts": 1716454225190910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225269693, "dur": 23, "args": { "External id": 242800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242800, "pid": 5, "tid": 7, "ts": 1716454225269693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190925, "dur": 12, "args": { "External id": 242800, "cbid": 211, "correlation": 242800 } }, { "ph": "s", "id": 242800, "pid": 76337, "tid": -914061504, "ts": 1716454225190925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225269717, "dur": 118, "args": { "External id": 242802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242802, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242802, "pid": 5, "tid": 7, "ts": 1716454225269717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190943, "dur": 6, "args": { "External id": 242802, "cbid": 211, "correlation": 242802 } }, { "ph": "s", "id": 242802, "pid": 76337, "tid": -914061504, "ts": 1716454225190943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225269837, "dur": 23, "args": { "External id": 242804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242804, "pid": 5, "tid": 7, "ts": 1716454225269837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190953, "dur": 5, "args": { "External id": 242804, "cbid": 211, "correlation": 242804 } }, { "ph": "s", "id": 242804, "pid": 76337, "tid": -914061504, "ts": 1716454225190953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225269861, "dur": 33, "args": { "External id": 242810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242810, "pid": 5, "tid": 7, "ts": 1716454225269861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225190989, "dur": 9, "args": { "External id": 242810, "cbid": 211, "correlation": 242810 } }, { "ph": "s", "id": 242810, "pid": 76337, "tid": -914061504, "ts": 1716454225190989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225269895, "dur": 27, "args": { "External id": 242818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242818, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242818, "pid": 5, "tid": 7, "ts": 1716454225269895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191022, "dur": 8, "args": { "External id": 242818, "cbid": 211, "correlation": 242818 } }, { "ph": "s", "id": 242818, "pid": 76337, "tid": -914061504, "ts": 1716454225191022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225269923, "dur": 100, "args": { "External id": 242829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242829, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242829, "pid": 5, "tid": 7, "ts": 1716454225269923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191083, "dur": 11, "args": { "External id": 242829, "cbid": 211, "correlation": 242829 } }, { "ph": "s", "id": 242829, "pid": 76337, "tid": -914061504, "ts": 1716454225191083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225191137, "dur": 0, "args": { "External id": 242839, "cbid": 317, "correlation": 242839 } }, { "ph": "f", "id": 242839, "pid": 76337, "tid": -914061504, "ts": 1716454225191137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225191138, "dur": 0, "args": { "External id": 242840, "cbid": 203, "correlation": 242840 } }, { "ph": "f", "id": 242840, "pid": 76337, "tid": -914061504, "ts": 1716454225191138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225191138, "dur": 0, "args": { "External id": 242841, "cbid": 205, "correlation": 242841 } }, { "ph": "f", "id": 242841, "pid": 76337, "tid": -914061504, "ts": 1716454225191138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225270024, "dur": 75, "args": { "External id": 242845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242845, "pid": 5, "tid": 7, "ts": 1716454225270024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191153, "dur": 12, "args": { "External id": 242845, "cbid": 211, "correlation": 242845 } }, { "ph": "s", "id": 242845, "pid": 76337, "tid": -914061504, "ts": 1716454225191153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225270101, "dur": 43, "args": { "External id": 242847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242847, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242847, "pid": 5, "tid": 7, "ts": 1716454225270101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191167, "dur": 6, "args": { "External id": 242847, "cbid": 211, "correlation": 242847 } }, { "ph": "s", "id": 242847, "pid": 76337, "tid": -914061504, "ts": 1716454225191167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225270145, "dur": 4, "args": { "External id": 242849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 242849, "pid": 5, "tid": 7, "ts": 1716454225270145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191178, "dur": 6, "args": { "External id": 242849, "cbid": 211, "correlation": 242849 } }, { "ph": "s", "id": 242849, "pid": 76337, "tid": -914061504, "ts": 1716454225191178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225191187, "dur": 0, "args": { "External id": 242850, "cbid": 51, "correlation": 242850 } }, { "ph": "s", "id": 242850, "pid": 76337, "tid": -914061504, "ts": 1716454225191187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225270150, "dur": 2225, "args": { "External id": 242851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242851, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242851, "pid": 5, "tid": 7, "ts": 1716454225270150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191188, "dur": 5, "args": { "External id": 242851, "cbid": 211, "correlation": 242851 } }, { "ph": "s", "id": 242851, "pid": 76337, "tid": -914061504, "ts": 1716454225191188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225272376, "dur": 111, "args": { "External id": 242856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242856, "pid": 5, "tid": 7, "ts": 1716454225272376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191215, "dur": 9, "args": { "External id": 242856, "cbid": 211, "correlation": 242856 } }, { "ph": "s", "id": 242856, "pid": 76337, "tid": -914061504, "ts": 1716454225191215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225272489, "dur": 162, "args": { "External id": 242865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242865, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242865, "pid": 5, "tid": 7, "ts": 1716454225272489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191305, "dur": 14, "args": { "External id": 242865, "cbid": 211, "correlation": 242865 } }, { "ph": "s", "id": 242865, "pid": 76337, "tid": -914061504, "ts": 1716454225191305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225272652, "dur": 125, "args": { "External id": 242885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242885, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 242885, "pid": 5, "tid": 7, "ts": 1716454225272652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191375, "dur": 11, "args": { "External id": 242885, "cbid": 211, "correlation": 242885 } }, { "ph": "s", "id": 242885, "pid": 76337, "tid": -914061504, "ts": 1716454225191375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225272778, "dur": 5, "args": { "External id": 242897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242897, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 242897, "pid": 5, "tid": 7, "ts": 1716454225272778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191396, "dur": 6, "args": { "External id": 242897, "cbid": 211, "correlation": 242897 } }, { "ph": "s", "id": 242897, "pid": 76337, "tid": -914061504, "ts": 1716454225191396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225272784, "dur": 156, "args": { "External id": 242900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242900, "pid": 5, "tid": 7, "ts": 1716454225272784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191415, "dur": 8, "args": { "External id": 242900, "cbid": 211, "correlation": 242900 } }, { "ph": "s", "id": 242900, "pid": 76337, "tid": -914061504, "ts": 1716454225191415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225272942, "dur": 101, "args": { "External id": 242909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242909, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242909, "pid": 5, "tid": 7, "ts": 1716454225272942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191456, "dur": 11, "args": { "External id": 242909, "cbid": 211, "correlation": 242909 } }, { "ph": "s", "id": 242909, "pid": 76337, "tid": -914061504, "ts": 1716454225191456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225191508, "dur": 0, "args": { "External id": 242919, "cbid": 317, "correlation": 242919 } }, { "ph": "f", "id": 242919, "pid": 76337, "tid": -914061504, "ts": 1716454225191508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225191509, "dur": 0, "args": { "External id": 242920, "cbid": 203, "correlation": 242920 } }, { "ph": "f", "id": 242920, "pid": 76337, "tid": -914061504, "ts": 1716454225191509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225191510, "dur": 0, "args": { "External id": 242921, "cbid": 205, "correlation": 242921 } }, { "ph": "f", "id": 242921, "pid": 76337, "tid": -914061504, "ts": 1716454225191510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225273044, "dur": 112, "args": { "External id": 242925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242925, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242925, "pid": 5, "tid": 7, "ts": 1716454225273044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191526, "dur": 11, "args": { "External id": 242925, "cbid": 211, "correlation": 242925 } }, { "ph": "s", "id": 242925, "pid": 76337, "tid": -914061504, "ts": 1716454225191526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225273157, "dur": 33, "args": { "External id": 242927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242927, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242927, "pid": 5, "tid": 7, "ts": 1716454225273157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191540, "dur": 5, "args": { "External id": 242927, "cbid": 211, "correlation": 242927 } }, { "ph": "s", "id": 242927, "pid": 76337, "tid": -914061504, "ts": 1716454225191540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225273191, "dur": 3, "args": { "External id": 242929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 242929, "pid": 5, "tid": 7, "ts": 1716454225273191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191549, "dur": 5, "args": { "External id": 242929, "cbid": 211, "correlation": 242929 } }, { "ph": "s", "id": 242929, "pid": 76337, "tid": -914061504, "ts": 1716454225191549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225191558, "dur": 0, "args": { "External id": 242930, "cbid": 51, "correlation": 242930 } }, { "ph": "s", "id": 242930, "pid": 76337, "tid": -914061504, "ts": 1716454225191558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225273196, "dur": 1987, "args": { "External id": 242931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242931, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 242931, "pid": 5, "tid": 7, "ts": 1716454225273196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191559, "dur": 6, "args": { "External id": 242931, "cbid": 211, "correlation": 242931 } }, { "ph": "s", "id": 242931, "pid": 76337, "tid": -914061504, "ts": 1716454225191559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225275184, "dur": 58, "args": { "External id": 242936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242936, "pid": 5, "tid": 7, "ts": 1716454225275184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191588, "dur": 9, "args": { "External id": 242936, "cbid": 211, "correlation": 242936 } }, { "ph": "s", "id": 242936, "pid": 76337, "tid": -914061504, "ts": 1716454225191588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225275244, "dur": 3, "args": { "External id": 242944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242944, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 242944, "pid": 5, "tid": 7, "ts": 1716454225275244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191631, "dur": 9, "args": { "External id": 242944, "cbid": 211, "correlation": 242944 } }, { "ph": "s", "id": 242944, "pid": 76337, "tid": -914061504, "ts": 1716454225191631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225191696, "dur": 1, "args": { "External id": 242960, "cbid": 251, "correlation": 242960 } }, { "ph": "f", "id": 242960, "pid": 76337, "tid": -914061504, "ts": 1716454225191696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225191701, "dur": 0, "args": { "External id": 242962, "cbid": 251, "correlation": 242962 } }, { "ph": "f", "id": 242962, "pid": 76337, "tid": -914061504, "ts": 1716454225191701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225275248, "dur": 11, "args": { "External id": 242963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242963, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 242963, "pid": 5, "tid": 7, "ts": 1716454225275248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191703, "dur": 12, "args": { "External id": 242963, "cbid": 211, "correlation": 242963 } }, { "ph": "s", "id": 242963, "pid": 76337, "tid": -914061504, "ts": 1716454225191703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225275260, "dur": 5, "args": { "External id": 242965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242965, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 242965, "pid": 5, "tid": 7, "ts": 1716454225275260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191717, "dur": 5, "args": { "External id": 242965, "cbid": 211, "correlation": 242965 } }, { "ph": "s", "id": 242965, "pid": 76337, "tid": -914061504, "ts": 1716454225191717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225275267, "dur": 52, "args": { "External id": 242975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242975, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 242975, "pid": 5, "tid": 7, "ts": 1716454225275267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191775, "dur": 12, "args": { "External id": 242975, "cbid": 211, "correlation": 242975 } }, { "ph": "s", "id": 242975, "pid": 76337, "tid": -914061504, "ts": 1716454225191775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225275321, "dur": 51, "args": { "External id": 242995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 242995, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 242995, "pid": 5, "tid": 7, "ts": 1716454225275321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191841, "dur": 10, "args": { "External id": 242995, "cbid": 211, "correlation": 242995 } }, { "ph": "s", "id": 242995, "pid": 76337, "tid": -914061504, "ts": 1716454225191841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225275373, "dur": 4, "args": { "External id": 243007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243007, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243007, "pid": 5, "tid": 7, "ts": 1716454225275373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191861, "dur": 7, "args": { "External id": 243007, "cbid": 211, "correlation": 243007 } }, { "ph": "s", "id": 243007, "pid": 76337, "tid": -914061504, "ts": 1716454225191861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225275378, "dur": 55, "args": { "External id": 243010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243010, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243010, "pid": 5, "tid": 7, "ts": 1716454225275378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191879, "dur": 6, "args": { "External id": 243010, "cbid": 211, "correlation": 243010 } }, { "ph": "s", "id": 243010, "pid": 76337, "tid": -914061504, "ts": 1716454225191879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225275435, "dur": 37, "args": { "External id": 243019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243019, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243019, "pid": 5, "tid": 7, "ts": 1716454225275435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225191920, "dur": 10, "args": { "External id": 243019, "cbid": 211, "correlation": 243019 } }, { "ph": "s", "id": 243019, "pid": 76337, "tid": -914061504, "ts": 1716454225191920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225191991, "dur": 0, "args": { "External id": 243029, "cbid": 317, "correlation": 243029 } }, { "ph": "f", "id": 243029, "pid": 76337, "tid": -914061504, "ts": 1716454225191991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225191992, "dur": 0, "args": { "External id": 243030, "cbid": 203, "correlation": 243030 } }, { "ph": "f", "id": 243030, "pid": 76337, "tid": -914061504, "ts": 1716454225191992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225191993, "dur": 0, "args": { "External id": 243031, "cbid": 205, "correlation": 243031 } }, { "ph": "f", "id": 243031, "pid": 76337, "tid": -914061504, "ts": 1716454225191993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225275472, "dur": 40, "args": { "External id": 243035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243035, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243035, "pid": 5, "tid": 7, "ts": 1716454225275472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192008, "dur": 13, "args": { "External id": 243035, "cbid": 211, "correlation": 243035 } }, { "ph": "s", "id": 243035, "pid": 76337, "tid": -914061504, "ts": 1716454225192008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225275514, "dur": 14, "args": { "External id": 243037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243037, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243037, "pid": 5, "tid": 7, "ts": 1716454225275514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192024, "dur": 5, "args": { "External id": 243037, "cbid": 211, "correlation": 243037 } }, { "ph": "s", "id": 243037, "pid": 76337, "tid": -914061504, "ts": 1716454225192024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225275529, "dur": 3, "args": { "External id": 243039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243039, "pid": 5, "tid": 7, "ts": 1716454225275529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192034, "dur": 5, "args": { "External id": 243039, "cbid": 211, "correlation": 243039 } }, { "ph": "s", "id": 243039, "pid": 76337, "tid": -914061504, "ts": 1716454225192034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225192042, "dur": 0, "args": { "External id": 243040, "cbid": 51, "correlation": 243040 } }, { "ph": "s", "id": 243040, "pid": 76337, "tid": -914061504, "ts": 1716454225192042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225275533, "dur": 688, "args": { "External id": 243041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243041, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243041, "pid": 5, "tid": 7, "ts": 1716454225275533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192043, "dur": 5, "args": { "External id": 243041, "cbid": 211, "correlation": 243041 } }, { "ph": "s", "id": 243041, "pid": 76337, "tid": -914061504, "ts": 1716454225192043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225276222, "dur": 58, "args": { "External id": 243046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243046, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243046, "pid": 5, "tid": 7, "ts": 1716454225276222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192071, "dur": 8, "args": { "External id": 243046, "cbid": 211, "correlation": 243046 } }, { "ph": "s", "id": 243046, "pid": 76337, "tid": -914061504, "ts": 1716454225192071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225192128, "dur": 0, "args": { "External id": 243056, "cbid": 317, "correlation": 243056 } }, { "ph": "f", "id": 243056, "pid": 76337, "tid": -914061504, "ts": 1716454225192128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225192128, "dur": 0, "args": { "External id": 243057, "cbid": 203, "correlation": 243057 } }, { "ph": "f", "id": 243057, "pid": 76337, "tid": -914061504, "ts": 1716454225192128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225192129, "dur": 0, "args": { "External id": 243058, "cbid": 205, "correlation": 243058 } }, { "ph": "f", "id": 243058, "pid": 76337, "tid": -914061504, "ts": 1716454225192129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225276282, "dur": 3, "args": { "External id": 243062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243062, "pid": 5, "tid": 7, "ts": 1716454225276282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192144, "dur": 11, "args": { "External id": 243062, "cbid": 211, "correlation": 243062 } }, { "ph": "s", "id": 243062, "pid": 76337, "tid": -914061504, "ts": 1716454225192144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225192160, "dur": 0, "args": { "External id": 243063, "cbid": 51, "correlation": 243063 } }, { "ph": "s", "id": 243063, "pid": 76337, "tid": -914061504, "ts": 1716454225192160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454225276286, "dur": 261, "args": { "External id": 243064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243064, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243064, "pid": 5, "tid": 7, "ts": 1716454225276286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192161, "dur": 7, "args": { "External id": 243064, "cbid": 211, "correlation": 243064 } }, { "ph": "s", "id": 243064, "pid": 76337, "tid": -914061504, "ts": 1716454225192161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225276548, "dur": 59, "args": { "External id": 243069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243069, "pid": 5, "tid": 7, "ts": 1716454225276548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192188, "dur": 8, "args": { "External id": 243069, "cbid": 211, "correlation": 243069 } }, { "ph": "s", "id": 243069, "pid": 76337, "tid": -914061504, "ts": 1716454225192188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225276609, "dur": 51, "args": { "External id": 243077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243077, "pid": 5, "tid": 7, "ts": 1716454225276609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192217, "dur": 8, "args": { "External id": 243077, "cbid": 211, "correlation": 243077 } }, { "ph": "s", "id": 243077, "pid": 76337, "tid": -914061504, "ts": 1716454225192217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225276661, "dur": 35, "args": { "External id": 243085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243085, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243085, "pid": 5, "tid": 7, "ts": 1716454225276661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192245, "dur": 8, "args": { "External id": 243085, "cbid": 211, "correlation": 243085 } }, { "ph": "s", "id": 243085, "pid": 76337, "tid": -914061504, "ts": 1716454225192245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225276697, "dur": 52, "args": { "External id": 243105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243105, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 243105, "pid": 5, "tid": 7, "ts": 1716454225276697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192327, "dur": 12, "args": { "External id": 243105, "cbid": 211, "correlation": 243105 } }, { "ph": "s", "id": 243105, "pid": 76337, "tid": -914061504, "ts": 1716454225192327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225276751, "dur": 4, "args": { "External id": 243117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243117, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243117, "pid": 5, "tid": 7, "ts": 1716454225276751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192349, "dur": 6, "args": { "External id": 243117, "cbid": 211, "correlation": 243117 } }, { "ph": "s", "id": 243117, "pid": 76337, "tid": -914061504, "ts": 1716454225192349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225276756, "dur": 53, "args": { "External id": 243120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243120, "pid": 5, "tid": 7, "ts": 1716454225276756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192368, "dur": 7, "args": { "External id": 243120, "cbid": 211, "correlation": 243120 } }, { "ph": "s", "id": 243120, "pid": 76337, "tid": -914061504, "ts": 1716454225192368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225192425, "dur": 0, "args": { "External id": 243131, "cbid": 317, "correlation": 243131 } }, { "ph": "f", "id": 243131, "pid": 76337, "tid": -914061504, "ts": 1716454225192425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225192426, "dur": 0, "args": { "External id": 243132, "cbid": 203, "correlation": 243132 } }, { "ph": "f", "id": 243132, "pid": 76337, "tid": -914061504, "ts": 1716454225192426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225192426, "dur": 0, "args": { "External id": 243133, "cbid": 205, "correlation": 243133 } }, { "ph": "f", "id": 243133, "pid": 76337, "tid": -914061504, "ts": 1716454225192426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192455, "dur": 2, "args": { "External id": 243137, "cbid": 251, "correlation": 243137 } }, { "ph": "f", "id": 243137, "pid": 76337, "tid": -914061504, "ts": 1716454225192455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192458, "dur": 1, "args": { "External id": 243138, "cbid": 251, "correlation": 243138 } }, { "ph": "f", "id": 243138, "pid": 76337, "tid": -914061504, "ts": 1716454225192458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192460, "dur": 0, "args": { "External id": 243139, "cbid": 251, "correlation": 243139 } }, { "ph": "f", "id": 243139, "pid": 76337, "tid": -914061504, "ts": 1716454225192460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192461, "dur": 1, "args": { "External id": 243140, "cbid": 251, "correlation": 243140 } }, { "ph": "f", "id": 243140, "pid": 76337, "tid": -914061504, "ts": 1716454225192461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192463, "dur": 1, "args": { "External id": 243141, "cbid": 251, "correlation": 243141 } }, { "ph": "f", "id": 243141, "pid": 76337, "tid": -914061504, "ts": 1716454225192463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192464, "dur": 1, "args": { "External id": 243142, "cbid": 251, "correlation": 243142 } }, { "ph": "f", "id": 243142, "pid": 76337, "tid": -914061504, "ts": 1716454225192464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192466, "dur": 1, "args": { "External id": 243143, "cbid": 251, "correlation": 243143 } }, { "ph": "f", "id": 243143, "pid": 76337, "tid": -914061504, "ts": 1716454225192466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192467, "dur": 1, "args": { "External id": 243144, "cbid": 251, "correlation": 243144 } }, { "ph": "f", "id": 243144, "pid": 76337, "tid": -914061504, "ts": 1716454225192467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192470, "dur": 0, "args": { "External id": 243145, "cbid": 251, "correlation": 243145 } }, { "ph": "f", "id": 243145, "pid": 76337, "tid": -914061504, "ts": 1716454225192470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225276810, "dur": 112, "args": { "External id": 243146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243146, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243146, "pid": 5, "tid": 7, "ts": 1716454225276810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192472, "dur": 12, "args": { "External id": 243146, "cbid": 211, "correlation": 243146 } }, { "ph": "s", "id": 243146, "pid": 76337, "tid": -914061504, "ts": 1716454225192472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225276924, "dur": 59, "args": { "External id": 243152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243152, "pid": 5, "tid": 7, "ts": 1716454225276924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192507, "dur": 9, "args": { "External id": 243152, "cbid": 211, "correlation": 243152 } }, { "ph": "s", "id": 243152, "pid": 76337, "tid": -914061504, "ts": 1716454225192507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225276984, "dur": 587, "args": { "External id": 243161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243161, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243161, "pid": 5, "tid": 7, "ts": 1716454225276984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192589, "dur": 14, "args": { "External id": 243161, "cbid": 211, "correlation": 243161 } }, { "ph": "s", "id": 243161, "pid": 76337, "tid": -914061504, "ts": 1716454225192589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225277572, "dur": 176, "args": { "External id": 243183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243183, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243183, "pid": 5, "tid": 7, "ts": 1716454225277572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192645, "dur": 11, "args": { "External id": 243183, "cbid": 211, "correlation": 243183 } }, { "ph": "s", "id": 243183, "pid": 76337, "tid": -914061504, "ts": 1716454225192645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192733, "dur": 1, "args": { "External id": 243194, "cbid": 251, "correlation": 243194 } }, { "ph": "f", "id": 243194, "pid": 76337, "tid": -914061504, "ts": 1716454225192733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225277750, "dur": 192, "args": { "External id": 243195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243195, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243195, "pid": 5, "tid": 7, "ts": 1716454225277750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192738, "dur": 13, "args": { "External id": 243195, "cbid": 211, "correlation": 243195 } }, { "ph": "s", "id": 243195, "pid": 76337, "tid": -914061504, "ts": 1716454225192738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192807, "dur": 1, "args": { "External id": 243206, "cbid": 251, "correlation": 243206 } }, { "ph": "f", "id": 243206, "pid": 76337, "tid": -914061504, "ts": 1716454225192807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225277943, "dur": 186, "args": { "External id": 243207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243207, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243207, "pid": 5, "tid": 7, "ts": 1716454225277943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192811, "dur": 11, "args": { "External id": 243207, "cbid": 211, "correlation": 243207 } }, { "ph": "s", "id": 243207, "pid": 76337, "tid": -914061504, "ts": 1716454225192811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225192874, "dur": 1, "args": { "External id": 243218, "cbid": 251, "correlation": 243218 } }, { "ph": "f", "id": 243218, "pid": 76337, "tid": -914061504, "ts": 1716454225192874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225278130, "dur": 185, "args": { "External id": 243219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243219, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243219, "pid": 5, "tid": 7, "ts": 1716454225278130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192878, "dur": 12, "args": { "External id": 243219, "cbid": 211, "correlation": 243219 } }, { "ph": "s", "id": 243219, "pid": 76337, "tid": -914061504, "ts": 1716454225192878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225278316, "dur": 18119, "args": { "External id": 243240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243240, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243240, "pid": 5, "tid": 7, "ts": 1716454225278316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225192962, "dur": 23, "args": { "External id": 243240, "cbid": 211, "correlation": 243240 } }, { "ph": "s", "id": 243240, "pid": 76337, "tid": -914061504, "ts": 1716454225192962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193072, "dur": 1, "args": { "External id": 243258, "cbid": 251, "correlation": 243258 } }, { "ph": "f", "id": 243258, "pid": 76337, "tid": -914061504, "ts": 1716454225193072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225296436, "dur": 199, "args": { "External id": 243260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243260, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243260, "pid": 5, "tid": 7, "ts": 1716454225296436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193077, "dur": 13, "args": { "External id": 243260, "cbid": 211, "correlation": 243260 } }, { "ph": "s", "id": 243260, "pid": 76337, "tid": -914061504, "ts": 1716454225193077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225296636, "dur": 67, "args": { "External id": 243268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243268, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243268, "pid": 5, "tid": 7, "ts": 1716454225296636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193147, "dur": 13, "args": { "External id": 243268, "cbid": 211, "correlation": 243268 } }, { "ph": "s", "id": 243268, "pid": 76337, "tid": -914061504, "ts": 1716454225193147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225296705, "dur": 98, "args": { "External id": 243276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243276, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243276, "pid": 5, "tid": 7, "ts": 1716454225296705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193187, "dur": 8, "args": { "External id": 243276, "cbid": 211, "correlation": 243276 } }, { "ph": "s", "id": 243276, "pid": 76337, "tid": -914061504, "ts": 1716454225193187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225296804, "dur": 54, "args": { "External id": 243287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243287, "pid": 5, "tid": 7, "ts": 1716454225296804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193259, "dur": 12, "args": { "External id": 243287, "cbid": 211, "correlation": 243287 } }, { "ph": "s", "id": 243287, "pid": 76337, "tid": -914061504, "ts": 1716454225193259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225296859, "dur": 90, "args": { "External id": 243309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243309, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243309, "pid": 5, "tid": 7, "ts": 1716454225296859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193290, "dur": 8, "args": { "External id": 243309, "cbid": 211, "correlation": 243309 } }, { "ph": "s", "id": 243309, "pid": 76337, "tid": -914061504, "ts": 1716454225193290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193371, "dur": 1, "args": { "External id": 243320, "cbid": 251, "correlation": 243320 } }, { "ph": "f", "id": 243320, "pid": 76337, "tid": -914061504, "ts": 1716454225193371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225296950, "dur": 101, "args": { "External id": 243321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243321, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243321, "pid": 5, "tid": 7, "ts": 1716454225296950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193376, "dur": 13, "args": { "External id": 243321, "cbid": 211, "correlation": 243321 } }, { "ph": "s", "id": 243321, "pid": 76337, "tid": -914061504, "ts": 1716454225193376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193450, "dur": 1, "args": { "External id": 243332, "cbid": 251, "correlation": 243332 } }, { "ph": "f", "id": 243332, "pid": 76337, "tid": -914061504, "ts": 1716454225193450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193454, "dur": 0, "args": { "External id": 243333, "cbid": 251, "correlation": 243333 } }, { "ph": "f", "id": 243333, "pid": 76337, "tid": -914061504, "ts": 1716454225193454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225297053, "dur": 10, "args": { "External id": 243334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243334, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 243334, "pid": 5, "tid": 7, "ts": 1716454225297053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193456, "dur": 13, "args": { "External id": 243334, "cbid": 211, "correlation": 243334 } }, { "ph": "s", "id": 243334, "pid": 76337, "tid": -914061504, "ts": 1716454225193456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225297064, "dur": 5, "args": { "External id": 243336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243336, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 243336, "pid": 5, "tid": 7, "ts": 1716454225297064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193472, "dur": 7, "args": { "External id": 243336, "cbid": 211, "correlation": 243336 } }, { "ph": "s", "id": 243336, "pid": 76337, "tid": -914061504, "ts": 1716454225193472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193533, "dur": 1, "args": { "External id": 243347, "cbid": 251, "correlation": 243347 } }, { "ph": "f", "id": 243347, "pid": 76337, "tid": -914061504, "ts": 1716454225193533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193536, "dur": 0, "args": { "External id": 243348, "cbid": 251, "correlation": 243348 } }, { "ph": "f", "id": 243348, "pid": 76337, "tid": -914061504, "ts": 1716454225193536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225297071, "dur": 6, "args": { "External id": 243349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243349, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 243349, "pid": 5, "tid": 7, "ts": 1716454225297071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193538, "dur": 12, "args": { "External id": 243349, "cbid": 211, "correlation": 243349 } }, { "ph": "s", "id": 243349, "pid": 76337, "tid": -914061504, "ts": 1716454225193538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225297078, "dur": 3, "args": { "External id": 243351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243351, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 243351, "pid": 5, "tid": 7, "ts": 1716454225297078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193552, "dur": 5, "args": { "External id": 243351, "cbid": 211, "correlation": 243351 } }, { "ph": "s", "id": 243351, "pid": 76337, "tid": -914061504, "ts": 1716454225193552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225297083, "dur": 153, "args": { "External id": 243372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243372, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243372, "pid": 5, "tid": 7, "ts": 1716454225297083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193625, "dur": 12, "args": { "External id": 243372, "cbid": 211, "correlation": 243372 } }, { "ph": "s", "id": 243372, "pid": 76337, "tid": -914061504, "ts": 1716454225193625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193721, "dur": 1, "args": { "External id": 243390, "cbid": 251, "correlation": 243390 } }, { "ph": "f", "id": 243390, "pid": 76337, "tid": -914061504, "ts": 1716454225193721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225297237, "dur": 103, "args": { "External id": 243392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243392, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243392, "pid": 5, "tid": 7, "ts": 1716454225297237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193727, "dur": 13, "args": { "External id": 243392, "cbid": 211, "correlation": 243392 } }, { "ph": "s", "id": 243392, "pid": 76337, "tid": -914061504, "ts": 1716454225193727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225297341, "dur": 35, "args": { "External id": 243400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243400, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243400, "pid": 5, "tid": 7, "ts": 1716454225297341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193796, "dur": 13, "args": { "External id": 243400, "cbid": 211, "correlation": 243400 } }, { "ph": "s", "id": 243400, "pid": 76337, "tid": -914061504, "ts": 1716454225193796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225297377, "dur": 66, "args": { "External id": 243408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243408, "pid": 5, "tid": 7, "ts": 1716454225297377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193837, "dur": 10, "args": { "External id": 243408, "cbid": 211, "correlation": 243408 } }, { "ph": "s", "id": 243408, "pid": 76337, "tid": -914061504, "ts": 1716454225193837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225297444, "dur": 90, "args": { "External id": 243430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243430, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243430, "pid": 5, "tid": 7, "ts": 1716454225297444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193888, "dur": 11, "args": { "External id": 243430, "cbid": 211, "correlation": 243430 } }, { "ph": "s", "id": 243430, "pid": 76337, "tid": -914061504, "ts": 1716454225193888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225193982, "dur": 1, "args": { "External id": 243446, "cbid": 251, "correlation": 243446 } }, { "ph": "f", "id": 243446, "pid": 76337, "tid": -914061504, "ts": 1716454225193982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225297536, "dur": 562, "args": { "External id": 243448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243448, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243448, "pid": 5, "tid": 7, "ts": 1716454225297536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225193987, "dur": 14, "args": { "External id": 243448, "cbid": 211, "correlation": 243448 } }, { "ph": "s", "id": 243448, "pid": 76337, "tid": -914061504, "ts": 1716454225193987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225298099, "dur": 241, "args": { "External id": 243456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243456, "pid": 5, "tid": 7, "ts": 1716454225298099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194055, "dur": 13, "args": { "External id": 243456, "cbid": 211, "correlation": 243456 } }, { "ph": "s", "id": 243456, "pid": 76337, "tid": -914061504, "ts": 1716454225194055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225298341, "dur": 252, "args": { "External id": 243464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243464, "pid": 5, "tid": 7, "ts": 1716454225298341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194086, "dur": 8, "args": { "External id": 243464, "cbid": 211, "correlation": 243464 } }, { "ph": "s", "id": 243464, "pid": 76337, "tid": -914061504, "ts": 1716454225194086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194168, "dur": 1, "args": { "External id": 243480, "cbid": 251, "correlation": 243480 } }, { "ph": "f", "id": 243480, "pid": 76337, "tid": -914061504, "ts": 1716454225194168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194173, "dur": 0, "args": { "External id": 243482, "cbid": 251, "correlation": 243482 } }, { "ph": "f", "id": 243482, "pid": 76337, "tid": -914061504, "ts": 1716454225194173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225298594, "dur": 352, "args": { "External id": 243483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243483, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 243483, "pid": 5, "tid": 7, "ts": 1716454225298594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194176, "dur": 12, "args": { "External id": 243483, "cbid": 211, "correlation": 243483 } }, { "ph": "s", "id": 243483, "pid": 76337, "tid": -914061504, "ts": 1716454225194176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225298947, "dur": 50, "args": { "External id": 243491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243491, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243491, "pid": 5, "tid": 7, "ts": 1716454225298947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194219, "dur": 10, "args": { "External id": 243491, "cbid": 211, "correlation": 243491 } }, { "ph": "s", "id": 243491, "pid": 76337, "tid": -914061504, "ts": 1716454225194219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225298998, "dur": 154, "args": { "External id": 243502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243502, "pid": 5, "tid": 7, "ts": 1716454225298998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194286, "dur": 13, "args": { "External id": 243502, "cbid": 211, "correlation": 243502 } }, { "ph": "s", "id": 243502, "pid": 76337, "tid": -914061504, "ts": 1716454225194286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225194351, "dur": 0, "args": { "External id": 243514, "cbid": 317, "correlation": 243514 } }, { "ph": "f", "id": 243514, "pid": 76337, "tid": -914061504, "ts": 1716454225194351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225194352, "dur": 0, "args": { "External id": 243515, "cbid": 203, "correlation": 243515 } }, { "ph": "f", "id": 243515, "pid": 76337, "tid": -914061504, "ts": 1716454225194352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225194353, "dur": 0, "args": { "External id": 243516, "cbid": 205, "correlation": 243516 } }, { "ph": "f", "id": 243516, "pid": 76337, "tid": -914061504, "ts": 1716454225194353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194375, "dur": 1, "args": { "External id": 243520, "cbid": 251, "correlation": 243520 } }, { "ph": "f", "id": 243520, "pid": 76337, "tid": -914061504, "ts": 1716454225194375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194377, "dur": 0, "args": { "External id": 243521, "cbid": 251, "correlation": 243521 } }, { "ph": "f", "id": 243521, "pid": 76337, "tid": -914061504, "ts": 1716454225194377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194378, "dur": 0, "args": { "External id": 243522, "cbid": 251, "correlation": 243522 } }, { "ph": "f", "id": 243522, "pid": 76337, "tid": -914061504, "ts": 1716454225194378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194379, "dur": 0, "args": { "External id": 243523, "cbid": 251, "correlation": 243523 } }, { "ph": "f", "id": 243523, "pid": 76337, "tid": -914061504, "ts": 1716454225194379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194380, "dur": 0, "args": { "External id": 243524, "cbid": 251, "correlation": 243524 } }, { "ph": "f", "id": 243524, "pid": 76337, "tid": -914061504, "ts": 1716454225194380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194381, "dur": 0, "args": { "External id": 243525, "cbid": 251, "correlation": 243525 } }, { "ph": "f", "id": 243525, "pid": 76337, "tid": -914061504, "ts": 1716454225194381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194382, "dur": 0, "args": { "External id": 243526, "cbid": 251, "correlation": 243526 } }, { "ph": "f", "id": 243526, "pid": 76337, "tid": -914061504, "ts": 1716454225194382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194383, "dur": 0, "args": { "External id": 243527, "cbid": 251, "correlation": 243527 } }, { "ph": "f", "id": 243527, "pid": 76337, "tid": -914061504, "ts": 1716454225194383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194384, "dur": 0, "args": { "External id": 243528, "cbid": 251, "correlation": 243528 } }, { "ph": "f", "id": 243528, "pid": 76337, "tid": -914061504, "ts": 1716454225194384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225299154, "dur": 113, "args": { "External id": 243529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243529, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243529, "pid": 5, "tid": 7, "ts": 1716454225299154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194386, "dur": 13, "args": { "External id": 243529, "cbid": 211, "correlation": 243529 } }, { "ph": "s", "id": 243529, "pid": 76337, "tid": -914061504, "ts": 1716454225194386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225299268, "dur": 58, "args": { "External id": 243535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243535, "pid": 5, "tid": 7, "ts": 1716454225299268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194421, "dur": 9, "args": { "External id": 243535, "cbid": 211, "correlation": 243535 } }, { "ph": "s", "id": 243535, "pid": 76337, "tid": -914061504, "ts": 1716454225194421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225299328, "dur": 50, "args": { "External id": 243543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243543, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243543, "pid": 5, "tid": 7, "ts": 1716454225299328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194453, "dur": 8, "args": { "External id": 243543, "cbid": 211, "correlation": 243543 } }, { "ph": "s", "id": 243543, "pid": 76337, "tid": -914061504, "ts": 1716454225194453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225299379, "dur": 97, "args": { "External id": 243552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243552, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243552, "pid": 5, "tid": 7, "ts": 1716454225299379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194492, "dur": 10, "args": { "External id": 243552, "cbid": 211, "correlation": 243552 } }, { "ph": "s", "id": 243552, "pid": 76337, "tid": -914061504, "ts": 1716454225194492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225299477, "dur": 91, "args": { "External id": 243572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243572, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 243572, "pid": 5, "tid": 7, "ts": 1716454225299477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194563, "dur": 11, "args": { "External id": 243572, "cbid": 211, "correlation": 243572 } }, { "ph": "s", "id": 243572, "pid": 76337, "tid": -914061504, "ts": 1716454225194563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225299570, "dur": 4, "args": { "External id": 243584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243584, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 243584, "pid": 5, "tid": 7, "ts": 1716454225299570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194584, "dur": 6, "args": { "External id": 243584, "cbid": 211, "correlation": 243584 } }, { "ph": "s", "id": 243584, "pid": 76337, "tid": -914061504, "ts": 1716454225194584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225299576, "dur": 108, "args": { "External id": 243587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243587, "pid": 5, "tid": 7, "ts": 1716454225299576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194603, "dur": 7, "args": { "External id": 243587, "cbid": 211, "correlation": 243587 } }, { "ph": "s", "id": 243587, "pid": 76337, "tid": -914061504, "ts": 1716454225194603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225299685, "dur": 69, "args": { "External id": 243596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243596, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243596, "pid": 5, "tid": 7, "ts": 1716454225299685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194641, "dur": 10, "args": { "External id": 243596, "cbid": 211, "correlation": 243596 } }, { "ph": "s", "id": 243596, "pid": 76337, "tid": -914061504, "ts": 1716454225194641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225194692, "dur": 0, "args": { "External id": 243606, "cbid": 317, "correlation": 243606 } }, { "ph": "f", "id": 243606, "pid": 76337, "tid": -914061504, "ts": 1716454225194692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225194693, "dur": 0, "args": { "External id": 243607, "cbid": 203, "correlation": 243607 } }, { "ph": "f", "id": 243607, "pid": 76337, "tid": -914061504, "ts": 1716454225194693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225194693, "dur": 0, "args": { "External id": 243608, "cbid": 205, "correlation": 243608 } }, { "ph": "f", "id": 243608, "pid": 76337, "tid": -914061504, "ts": 1716454225194693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225299755, "dur": 76, "args": { "External id": 243612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243612, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243612, "pid": 5, "tid": 7, "ts": 1716454225299755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194709, "dur": 12, "args": { "External id": 243612, "cbid": 211, "correlation": 243612 } }, { "ph": "s", "id": 243612, "pid": 76337, "tid": -914061504, "ts": 1716454225194709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225299832, "dur": 23, "args": { "External id": 243614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243614, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243614, "pid": 5, "tid": 7, "ts": 1716454225299832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194723, "dur": 5, "args": { "External id": 243614, "cbid": 211, "correlation": 243614 } }, { "ph": "s", "id": 243614, "pid": 76337, "tid": -914061504, "ts": 1716454225194723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225299857, "dur": 4, "args": { "External id": 243616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243616, "pid": 5, "tid": 7, "ts": 1716454225299857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194733, "dur": 6, "args": { "External id": 243616, "cbid": 211, "correlation": 243616 } }, { "ph": "s", "id": 243616, "pid": 76337, "tid": -914061504, "ts": 1716454225194733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225194742, "dur": 0, "args": { "External id": 243617, "cbid": 51, "correlation": 243617 } }, { "ph": "s", "id": 243617, "pid": 76337, "tid": -914061504, "ts": 1716454225194742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225299862, "dur": 1349, "args": { "External id": 243618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243618, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243618, "pid": 5, "tid": 7, "ts": 1716454225299862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194742, "dur": 5, "args": { "External id": 243618, "cbid": 211, "correlation": 243618 } }, { "ph": "s", "id": 243618, "pid": 76337, "tid": -914061504, "ts": 1716454225194742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225301212, "dur": 58, "args": { "External id": 243623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243623, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243623, "pid": 5, "tid": 7, "ts": 1716454225301212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194770, "dur": 9, "args": { "External id": 243623, "cbid": 211, "correlation": 243623 } }, { "ph": "s", "id": 243623, "pid": 76337, "tid": -914061504, "ts": 1716454225194770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225301271, "dur": 4, "args": { "External id": 243631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243631, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243631, "pid": 5, "tid": 7, "ts": 1716454225301271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194814, "dur": 9, "args": { "External id": 243631, "cbid": 211, "correlation": 243631 } }, { "ph": "s", "id": 243631, "pid": 76337, "tid": -914061504, "ts": 1716454225194814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194879, "dur": 1, "args": { "External id": 243647, "cbid": 251, "correlation": 243647 } }, { "ph": "f", "id": 243647, "pid": 76337, "tid": -914061504, "ts": 1716454225194879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225194884, "dur": 0, "args": { "External id": 243649, "cbid": 251, "correlation": 243649 } }, { "ph": "f", "id": 243649, "pid": 76337, "tid": -914061504, "ts": 1716454225194884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225301276, "dur": 11, "args": { "External id": 243650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243650, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 243650, "pid": 5, "tid": 7, "ts": 1716454225301276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194886, "dur": 13, "args": { "External id": 243650, "cbid": 211, "correlation": 243650 } }, { "ph": "s", "id": 243650, "pid": 76337, "tid": -914061504, "ts": 1716454225194886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225301288, "dur": 5, "args": { "External id": 243652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243652, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 243652, "pid": 5, "tid": 7, "ts": 1716454225301288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194901, "dur": 5, "args": { "External id": 243652, "cbid": 211, "correlation": 243652 } }, { "ph": "s", "id": 243652, "pid": 76337, "tid": -914061504, "ts": 1716454225194901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225301294, "dur": 53, "args": { "External id": 243662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243662, "pid": 5, "tid": 7, "ts": 1716454225301294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225194958, "dur": 12, "args": { "External id": 243662, "cbid": 211, "correlation": 243662 } }, { "ph": "s", "id": 243662, "pid": 76337, "tid": -914061504, "ts": 1716454225194958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225301349, "dur": 51, "args": { "External id": 243682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243682, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 243682, "pid": 5, "tid": 7, "ts": 1716454225301349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195032, "dur": 12, "args": { "External id": 243682, "cbid": 211, "correlation": 243682 } }, { "ph": "s", "id": 243682, "pid": 76337, "tid": -914061504, "ts": 1716454225195032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225301401, "dur": 4, "args": { "External id": 243694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243694, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243694, "pid": 5, "tid": 7, "ts": 1716454225301401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195054, "dur": 6, "args": { "External id": 243694, "cbid": 211, "correlation": 243694 } }, { "ph": "s", "id": 243694, "pid": 76337, "tid": -914061504, "ts": 1716454225195054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225301406, "dur": 55, "args": { "External id": 243697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243697, "pid": 5, "tid": 7, "ts": 1716454225301406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195072, "dur": 6, "args": { "External id": 243697, "cbid": 211, "correlation": 243697 } }, { "ph": "s", "id": 243697, "pid": 76337, "tid": -914061504, "ts": 1716454225195072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225301463, "dur": 37, "args": { "External id": 243706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243706, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243706, "pid": 5, "tid": 7, "ts": 1716454225301463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195113, "dur": 9, "args": { "External id": 243706, "cbid": 211, "correlation": 243706 } }, { "ph": "s", "id": 243706, "pid": 76337, "tid": -914061504, "ts": 1716454225195113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225195174, "dur": 0, "args": { "External id": 243716, "cbid": 317, "correlation": 243716 } }, { "ph": "f", "id": 243716, "pid": 76337, "tid": -914061504, "ts": 1716454225195174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225195175, "dur": 0, "args": { "External id": 243717, "cbid": 203, "correlation": 243717 } }, { "ph": "f", "id": 243717, "pid": 76337, "tid": -914061504, "ts": 1716454225195175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225195176, "dur": 0, "args": { "External id": 243718, "cbid": 205, "correlation": 243718 } }, { "ph": "f", "id": 243718, "pid": 76337, "tid": -914061504, "ts": 1716454225195176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225301501, "dur": 39, "args": { "External id": 243722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243722, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243722, "pid": 5, "tid": 7, "ts": 1716454225301501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195191, "dur": 12, "args": { "External id": 243722, "cbid": 211, "correlation": 243722 } }, { "ph": "s", "id": 243722, "pid": 76337, "tid": -914061504, "ts": 1716454225195191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225301542, "dur": 14, "args": { "External id": 243724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243724, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243724, "pid": 5, "tid": 7, "ts": 1716454225301542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195205, "dur": 5, "args": { "External id": 243724, "cbid": 211, "correlation": 243724 } }, { "ph": "s", "id": 243724, "pid": 76337, "tid": -914061504, "ts": 1716454225195205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225301557, "dur": 3, "args": { "External id": 243726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243726, "pid": 5, "tid": 7, "ts": 1716454225301557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195214, "dur": 5, "args": { "External id": 243726, "cbid": 211, "correlation": 243726 } }, { "ph": "s", "id": 243726, "pid": 76337, "tid": -914061504, "ts": 1716454225195214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225195222, "dur": 0, "args": { "External id": 243727, "cbid": 51, "correlation": 243727 } }, { "ph": "s", "id": 243727, "pid": 76337, "tid": -914061504, "ts": 1716454225195222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225301561, "dur": 687, "args": { "External id": 243728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243728, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243728, "pid": 5, "tid": 7, "ts": 1716454225301561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195223, "dur": 5, "args": { "External id": 243728, "cbid": 211, "correlation": 243728 } }, { "ph": "s", "id": 243728, "pid": 76337, "tid": -914061504, "ts": 1716454225195223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225302249, "dur": 58, "args": { "External id": 243733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243733, "pid": 5, "tid": 7, "ts": 1716454225302249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195251, "dur": 8, "args": { "External id": 243733, "cbid": 211, "correlation": 243733 } }, { "ph": "s", "id": 243733, "pid": 76337, "tid": -914061504, "ts": 1716454225195251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225195308, "dur": 0, "args": { "External id": 243743, "cbid": 317, "correlation": 243743 } }, { "ph": "f", "id": 243743, "pid": 76337, "tid": -914061504, "ts": 1716454225195308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225195309, "dur": 0, "args": { "External id": 243744, "cbid": 203, "correlation": 243744 } }, { "ph": "f", "id": 243744, "pid": 76337, "tid": -914061504, "ts": 1716454225195309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225195310, "dur": 0, "args": { "External id": 243745, "cbid": 205, "correlation": 243745 } }, { "ph": "f", "id": 243745, "pid": 76337, "tid": -914061504, "ts": 1716454225195310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225302309, "dur": 75, "args": { "External id": 243749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243749, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243749, "pid": 5, "tid": 7, "ts": 1716454225302309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195322, "dur": 12, "args": { "External id": 243749, "cbid": 211, "correlation": 243749 } }, { "ph": "s", "id": 243749, "pid": 76337, "tid": -914061504, "ts": 1716454225195322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225302385, "dur": 204, "args": { "External id": 243751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243751, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243751, "pid": 5, "tid": 7, "ts": 1716454225302385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195342, "dur": 8, "args": { "External id": 243751, "cbid": 211, "correlation": 243751 } }, { "ph": "s", "id": 243751, "pid": 76337, "tid": -914061504, "ts": 1716454225195342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225302591, "dur": 37, "args": { "External id": 243753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243753, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243753, "pid": 5, "tid": 7, "ts": 1716454225302591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195354, "dur": 6, "args": { "External id": 243753, "cbid": 211, "correlation": 243753 } }, { "ph": "s", "id": 243753, "pid": 76337, "tid": -914061504, "ts": 1716454225195354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225302629, "dur": 59, "args": { "External id": 243759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243759, "pid": 5, "tid": 7, "ts": 1716454225302629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195381, "dur": 9, "args": { "External id": 243759, "cbid": 211, "correlation": 243759 } }, { "ph": "s", "id": 243759, "pid": 76337, "tid": -914061504, "ts": 1716454225195381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225302689, "dur": 50, "args": { "External id": 243767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243767, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243767, "pid": 5, "tid": 7, "ts": 1716454225302689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195410, "dur": 8, "args": { "External id": 243767, "cbid": 211, "correlation": 243767 } }, { "ph": "s", "id": 243767, "pid": 76337, "tid": -914061504, "ts": 1716454225195410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225302741, "dur": 36, "args": { "External id": 243775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243775, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243775, "pid": 5, "tid": 7, "ts": 1716454225302741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195439, "dur": 8, "args": { "External id": 243775, "cbid": 211, "correlation": 243775 } }, { "ph": "s", "id": 243775, "pid": 76337, "tid": -914061504, "ts": 1716454225195439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225302778, "dur": 50, "args": { "External id": 243795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243795, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 243795, "pid": 5, "tid": 7, "ts": 1716454225302778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195522, "dur": 13, "args": { "External id": 243795, "cbid": 211, "correlation": 243795 } }, { "ph": "s", "id": 243795, "pid": 76337, "tid": -914061504, "ts": 1716454225195522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225302830, "dur": 4, "args": { "External id": 243807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243807, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 243807, "pid": 5, "tid": 7, "ts": 1716454225302830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195545, "dur": 6, "args": { "External id": 243807, "cbid": 211, "correlation": 243807 } }, { "ph": "s", "id": 243807, "pid": 76337, "tid": -914061504, "ts": 1716454225195545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225302835, "dur": 56, "args": { "External id": 243810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243810, "pid": 5, "tid": 7, "ts": 1716454225302835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195562, "dur": 6, "args": { "External id": 243810, "cbid": 211, "correlation": 243810 } }, { "ph": "s", "id": 243810, "pid": 76337, "tid": -914061504, "ts": 1716454225195562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225195618, "dur": 0, "args": { "External id": 243821, "cbid": 317, "correlation": 243821 } }, { "ph": "f", "id": 243821, "pid": 76337, "tid": -914061504, "ts": 1716454225195618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225195619, "dur": 0, "args": { "External id": 243822, "cbid": 203, "correlation": 243822 } }, { "ph": "f", "id": 243822, "pid": 76337, "tid": -914061504, "ts": 1716454225195619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225195620, "dur": 0, "args": { "External id": 243823, "cbid": 205, "correlation": 243823 } }, { "ph": "f", "id": 243823, "pid": 76337, "tid": -914061504, "ts": 1716454225195620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195642, "dur": 1, "args": { "External id": 243827, "cbid": 251, "correlation": 243827 } }, { "ph": "f", "id": 243827, "pid": 76337, "tid": -914061504, "ts": 1716454225195642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195644, "dur": 0, "args": { "External id": 243828, "cbid": 251, "correlation": 243828 } }, { "ph": "f", "id": 243828, "pid": 76337, "tid": -914061504, "ts": 1716454225195644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195645, "dur": 0, "args": { "External id": 243829, "cbid": 251, "correlation": 243829 } }, { "ph": "f", "id": 243829, "pid": 76337, "tid": -914061504, "ts": 1716454225195645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195645, "dur": 0, "args": { "External id": 243830, "cbid": 251, "correlation": 243830 } }, { "ph": "f", "id": 243830, "pid": 76337, "tid": -914061504, "ts": 1716454225195645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195646, "dur": 0, "args": { "External id": 243831, "cbid": 251, "correlation": 243831 } }, { "ph": "f", "id": 243831, "pid": 76337, "tid": -914061504, "ts": 1716454225195646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195647, "dur": 0, "args": { "External id": 243832, "cbid": 251, "correlation": 243832 } }, { "ph": "f", "id": 243832, "pid": 76337, "tid": -914061504, "ts": 1716454225195647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195648, "dur": 0, "args": { "External id": 243833, "cbid": 251, "correlation": 243833 } }, { "ph": "f", "id": 243833, "pid": 76337, "tid": -914061504, "ts": 1716454225195648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195648, "dur": 0, "args": { "External id": 243834, "cbid": 251, "correlation": 243834 } }, { "ph": "f", "id": 243834, "pid": 76337, "tid": -914061504, "ts": 1716454225195648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195650, "dur": 0, "args": { "External id": 243835, "cbid": 251, "correlation": 243835 } }, { "ph": "f", "id": 243835, "pid": 76337, "tid": -914061504, "ts": 1716454225195650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225302892, "dur": 112, "args": { "External id": 243836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243836, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243836, "pid": 5, "tid": 7, "ts": 1716454225302892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195652, "dur": 12, "args": { "External id": 243836, "cbid": 211, "correlation": 243836 } }, { "ph": "s", "id": 243836, "pid": 76337, "tid": -914061504, "ts": 1716454225195652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225303006, "dur": 59, "args": { "External id": 243842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243842, "pid": 5, "tid": 7, "ts": 1716454225303006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195688, "dur": 8, "args": { "External id": 243842, "cbid": 211, "correlation": 243842 } }, { "ph": "s", "id": 243842, "pid": 76337, "tid": -914061504, "ts": 1716454225195688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225303066, "dur": 617, "args": { "External id": 243851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243851, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243851, "pid": 5, "tid": 7, "ts": 1716454225303066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195768, "dur": 15, "args": { "External id": 243851, "cbid": 211, "correlation": 243851 } }, { "ph": "s", "id": 243851, "pid": 76337, "tid": -914061504, "ts": 1716454225195768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225303684, "dur": 176, "args": { "External id": 243873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243873, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243873, "pid": 5, "tid": 7, "ts": 1716454225303684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195824, "dur": 11, "args": { "External id": 243873, "cbid": 211, "correlation": 243873 } }, { "ph": "s", "id": 243873, "pid": 76337, "tid": -914061504, "ts": 1716454225195824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195910, "dur": 1, "args": { "External id": 243884, "cbid": 251, "correlation": 243884 } }, { "ph": "f", "id": 243884, "pid": 76337, "tid": -914061504, "ts": 1716454225195910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225303862, "dur": 194, "args": { "External id": 243885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243885, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243885, "pid": 5, "tid": 7, "ts": 1716454225303862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195915, "dur": 13, "args": { "External id": 243885, "cbid": 211, "correlation": 243885 } }, { "ph": "s", "id": 243885, "pid": 76337, "tid": -914061504, "ts": 1716454225195915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225195992, "dur": 1, "args": { "External id": 243896, "cbid": 251, "correlation": 243896 } }, { "ph": "f", "id": 243896, "pid": 76337, "tid": -914061504, "ts": 1716454225195992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225304058, "dur": 183, "args": { "External id": 243897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243897, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243897, "pid": 5, "tid": 7, "ts": 1716454225304058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225195997, "dur": 12, "args": { "External id": 243897, "cbid": 211, "correlation": 243897 } }, { "ph": "s", "id": 243897, "pid": 76337, "tid": -914061504, "ts": 1716454225195997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225196062, "dur": 1, "args": { "External id": 243908, "cbid": 251, "correlation": 243908 } }, { "ph": "f", "id": 243908, "pid": 76337, "tid": -914061504, "ts": 1716454225196062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225304241, "dur": 185, "args": { "External id": 243909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243909, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243909, "pid": 5, "tid": 7, "ts": 1716454225304241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196066, "dur": 11, "args": { "External id": 243909, "cbid": 211, "correlation": 243909 } }, { "ph": "s", "id": 243909, "pid": 76337, "tid": -914061504, "ts": 1716454225196066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225304428, "dur": 18246, "args": { "External id": 243930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243930, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 243930, "pid": 5, "tid": 7, "ts": 1716454225304428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196147, "dur": 13, "args": { "External id": 243930, "cbid": 211, "correlation": 243930 } }, { "ph": "s", "id": 243930, "pid": 76337, "tid": -914061504, "ts": 1716454225196147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225196244, "dur": 1, "args": { "External id": 243948, "cbid": 251, "correlation": 243948 } }, { "ph": "f", "id": 243948, "pid": 76337, "tid": -914061504, "ts": 1716454225196244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225322676, "dur": 199, "args": { "External id": 243950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243950, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 243950, "pid": 5, "tid": 7, "ts": 1716454225322676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196249, "dur": 13, "args": { "External id": 243950, "cbid": 211, "correlation": 243950 } }, { "ph": "s", "id": 243950, "pid": 76337, "tid": -914061504, "ts": 1716454225196249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225322876, "dur": 66, "args": { "External id": 243958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243958, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243958, "pid": 5, "tid": 7, "ts": 1716454225322876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196319, "dur": 13, "args": { "External id": 243958, "cbid": 211, "correlation": 243958 } }, { "ph": "s", "id": 243958, "pid": 76337, "tid": -914061504, "ts": 1716454225196319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225322943, "dur": 97, "args": { "External id": 243966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243966, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243966, "pid": 5, "tid": 7, "ts": 1716454225322943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196358, "dur": 9, "args": { "External id": 243966, "cbid": 211, "correlation": 243966 } }, { "ph": "s", "id": 243966, "pid": 76337, "tid": -914061504, "ts": 1716454225196358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225323042, "dur": 54, "args": { "External id": 243977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243977, "pid": 5, "tid": 7, "ts": 1716454225323042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196429, "dur": 12, "args": { "External id": 243977, "cbid": 211, "correlation": 243977 } }, { "ph": "s", "id": 243977, "pid": 76337, "tid": -914061504, "ts": 1716454225196429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225323096, "dur": 90, "args": { "External id": 243999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 243999, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 243999, "pid": 5, "tid": 7, "ts": 1716454225323096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225196459, "dur": 1398, "args": { "External id": 243999, "cbid": 211, "correlation": 243999 } }, { "ph": "s", "id": 243999, "pid": 76337, "tid": -914061504, "ts": 1716454225196459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225197934, "dur": 1, "args": { "External id": 244010, "cbid": 251, "correlation": 244010 } }, { "ph": "f", "id": 244010, "pid": 76337, "tid": -914061504, "ts": 1716454225197934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225323187, "dur": 103, "args": { "External id": 244011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244011, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244011, "pid": 5, "tid": 7, "ts": 1716454225323187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225197939, "dur": 65, "args": { "External id": 244011, "cbid": 211, "correlation": 244011 } }, { "ph": "s", "id": 244011, "pid": 76337, "tid": -914061504, "ts": 1716454225197939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198065, "dur": 1, "args": { "External id": 244022, "cbid": 251, "correlation": 244022 } }, { "ph": "f", "id": 244022, "pid": 76337, "tid": -914061504, "ts": 1716454225198065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198069, "dur": 0, "args": { "External id": 244023, "cbid": 251, "correlation": 244023 } }, { "ph": "f", "id": 244023, "pid": 76337, "tid": -914061504, "ts": 1716454225198069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225323291, "dur": 10, "args": { "External id": 244024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244024, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 244024, "pid": 5, "tid": 7, "ts": 1716454225323291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198070, "dur": 12, "args": { "External id": 244024, "cbid": 211, "correlation": 244024 } }, { "ph": "s", "id": 244024, "pid": 76337, "tid": -914061504, "ts": 1716454225198070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225323302, "dur": 5, "args": { "External id": 244026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244026, "pid": 5, "tid": 7, "ts": 1716454225323302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198084, "dur": 6, "args": { "External id": 244026, "cbid": 211, "correlation": 244026 } }, { "ph": "s", "id": 244026, "pid": 76337, "tid": -914061504, "ts": 1716454225198084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198145, "dur": 1, "args": { "External id": 244037, "cbid": 251, "correlation": 244037 } }, { "ph": "f", "id": 244037, "pid": 76337, "tid": -914061504, "ts": 1716454225198145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198148, "dur": 0, "args": { "External id": 244038, "cbid": 251, "correlation": 244038 } }, { "ph": "f", "id": 244038, "pid": 76337, "tid": -914061504, "ts": 1716454225198148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225323309, "dur": 6, "args": { "External id": 244039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244039, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 244039, "pid": 5, "tid": 7, "ts": 1716454225323309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198149, "dur": 13, "args": { "External id": 244039, "cbid": 211, "correlation": 244039 } }, { "ph": "s", "id": 244039, "pid": 76337, "tid": -914061504, "ts": 1716454225198149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225323316, "dur": 3, "args": { "External id": 244041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244041, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244041, "pid": 5, "tid": 7, "ts": 1716454225323316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198164, "dur": 5, "args": { "External id": 244041, "cbid": 211, "correlation": 244041 } }, { "ph": "s", "id": 244041, "pid": 76337, "tid": -914061504, "ts": 1716454225198164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225323321, "dur": 154, "args": { "External id": 244062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244062, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244062, "pid": 5, "tid": 7, "ts": 1716454225323321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198238, "dur": 12, "args": { "External id": 244062, "cbid": 211, "correlation": 244062 } }, { "ph": "s", "id": 244062, "pid": 76337, "tid": -914061504, "ts": 1716454225198238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198334, "dur": 1, "args": { "External id": 244080, "cbid": 251, "correlation": 244080 } }, { "ph": "f", "id": 244080, "pid": 76337, "tid": -914061504, "ts": 1716454225198334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225323477, "dur": 103, "args": { "External id": 244082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244082, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244082, "pid": 5, "tid": 7, "ts": 1716454225323477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198340, "dur": 14, "args": { "External id": 244082, "cbid": 211, "correlation": 244082 } }, { "ph": "s", "id": 244082, "pid": 76337, "tid": -914061504, "ts": 1716454225198340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225323581, "dur": 35, "args": { "External id": 244090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244090, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244090, "pid": 5, "tid": 7, "ts": 1716454225323581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198409, "dur": 12, "args": { "External id": 244090, "cbid": 211, "correlation": 244090 } }, { "ph": "s", "id": 244090, "pid": 76337, "tid": -914061504, "ts": 1716454225198409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225323618, "dur": 67, "args": { "External id": 244098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244098, "pid": 5, "tid": 7, "ts": 1716454225323618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198450, "dur": 9, "args": { "External id": 244098, "cbid": 211, "correlation": 244098 } }, { "ph": "s", "id": 244098, "pid": 76337, "tid": -914061504, "ts": 1716454225198450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225323686, "dur": 89, "args": { "External id": 244120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244120, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244120, "pid": 5, "tid": 7, "ts": 1716454225323686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198500, "dur": 10, "args": { "External id": 244120, "cbid": 211, "correlation": 244120 } }, { "ph": "s", "id": 244120, "pid": 76337, "tid": -914061504, "ts": 1716454225198500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198585, "dur": 1, "args": { "External id": 244136, "cbid": 251, "correlation": 244136 } }, { "ph": "f", "id": 244136, "pid": 76337, "tid": -914061504, "ts": 1716454225198585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225323776, "dur": 566, "args": { "External id": 244138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244138, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244138, "pid": 5, "tid": 7, "ts": 1716454225323776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198590, "dur": 14, "args": { "External id": 244138, "cbid": 211, "correlation": 244138 } }, { "ph": "s", "id": 244138, "pid": 76337, "tid": -914061504, "ts": 1716454225198590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225324343, "dur": 238, "args": { "External id": 244146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244146, "pid": 5, "tid": 7, "ts": 1716454225324343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198657, "dur": 13, "args": { "External id": 244146, "cbid": 211, "correlation": 244146 } }, { "ph": "s", "id": 244146, "pid": 76337, "tid": -914061504, "ts": 1716454225198657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225324583, "dur": 251, "args": { "External id": 244154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244154, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244154, "pid": 5, "tid": 7, "ts": 1716454225324583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198687, "dur": 9, "args": { "External id": 244154, "cbid": 211, "correlation": 244154 } }, { "ph": "s", "id": 244154, "pid": 76337, "tid": -914061504, "ts": 1716454225198687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198768, "dur": 1, "args": { "External id": 244170, "cbid": 251, "correlation": 244170 } }, { "ph": "f", "id": 244170, "pid": 76337, "tid": -914061504, "ts": 1716454225198768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225198774, "dur": 0, "args": { "External id": 244172, "cbid": 251, "correlation": 244172 } }, { "ph": "f", "id": 244172, "pid": 76337, "tid": -914061504, "ts": 1716454225198774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225324836, "dur": 354, "args": { "External id": 244173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244173, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 244173, "pid": 5, "tid": 7, "ts": 1716454225324836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198776, "dur": 12, "args": { "External id": 244173, "cbid": 211, "correlation": 244173 } }, { "ph": "s", "id": 244173, "pid": 76337, "tid": -914061504, "ts": 1716454225198776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225325191, "dur": 51, "args": { "External id": 244181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244181, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244181, "pid": 5, "tid": 7, "ts": 1716454225325191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198818, "dur": 10, "args": { "External id": 244181, "cbid": 211, "correlation": 244181 } }, { "ph": "s", "id": 244181, "pid": 76337, "tid": -914061504, "ts": 1716454225198818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225325243, "dur": 155, "args": { "External id": 244192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244192, "pid": 5, "tid": 7, "ts": 1716454225325243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225198884, "dur": 213, "args": { "External id": 244192, "cbid": 211, "correlation": 244192 } }, { "ph": "s", "id": 244192, "pid": 76337, "tid": -914061504, "ts": 1716454225198884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225199150, "dur": 0, "args": { "External id": 244204, "cbid": 317, "correlation": 244204 } }, { "ph": "f", "id": 244204, "pid": 76337, "tid": -914061504, "ts": 1716454225199150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225199151, "dur": 0, "args": { "External id": 244205, "cbid": 203, "correlation": 244205 } }, { "ph": "f", "id": 244205, "pid": 76337, "tid": -914061504, "ts": 1716454225199151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225199152, "dur": 0, "args": { "External id": 244206, "cbid": 205, "correlation": 244206 } }, { "ph": "f", "id": 244206, "pid": 76337, "tid": -914061504, "ts": 1716454225199152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199173, "dur": 1, "args": { "External id": 244210, "cbid": 251, "correlation": 244210 } }, { "ph": "f", "id": 244210, "pid": 76337, "tid": -914061504, "ts": 1716454225199173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199176, "dur": 0, "args": { "External id": 244211, "cbid": 251, "correlation": 244211 } }, { "ph": "f", "id": 244211, "pid": 76337, "tid": -914061504, "ts": 1716454225199176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199177, "dur": 0, "args": { "External id": 244212, "cbid": 251, "correlation": 244212 } }, { "ph": "f", "id": 244212, "pid": 76337, "tid": -914061504, "ts": 1716454225199177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199178, "dur": 0, "args": { "External id": 244213, "cbid": 251, "correlation": 244213 } }, { "ph": "f", "id": 244213, "pid": 76337, "tid": -914061504, "ts": 1716454225199178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199178, "dur": 0, "args": { "External id": 244214, "cbid": 251, "correlation": 244214 } }, { "ph": "f", "id": 244214, "pid": 76337, "tid": -914061504, "ts": 1716454225199178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199179, "dur": 0, "args": { "External id": 244215, "cbid": 251, "correlation": 244215 } }, { "ph": "f", "id": 244215, "pid": 76337, "tid": -914061504, "ts": 1716454225199179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199180, "dur": 0, "args": { "External id": 244216, "cbid": 251, "correlation": 244216 } }, { "ph": "f", "id": 244216, "pid": 76337, "tid": -914061504, "ts": 1716454225199180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199181, "dur": 0, "args": { "External id": 244217, "cbid": 251, "correlation": 244217 } }, { "ph": "f", "id": 244217, "pid": 76337, "tid": -914061504, "ts": 1716454225199181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225199182, "dur": 0, "args": { "External id": 244218, "cbid": 251, "correlation": 244218 } }, { "ph": "f", "id": 244218, "pid": 76337, "tid": -914061504, "ts": 1716454225199182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225325400, "dur": 113, "args": { "External id": 244219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244219, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244219, "pid": 5, "tid": 7, "ts": 1716454225325400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199184, "dur": 39, "args": { "External id": 244219, "cbid": 211, "correlation": 244219 } }, { "ph": "s", "id": 244219, "pid": 76337, "tid": -914061504, "ts": 1716454225199184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225325514, "dur": 58, "args": { "External id": 244225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244225, "pid": 5, "tid": 7, "ts": 1716454225325514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199247, "dur": 104, "args": { "External id": 244225, "cbid": 211, "correlation": 244225 } }, { "ph": "s", "id": 244225, "pid": 76337, "tid": -914061504, "ts": 1716454225199247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225325574, "dur": 51, "args": { "External id": 244233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244233, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244233, "pid": 5, "tid": 7, "ts": 1716454225325574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199373, "dur": 280, "args": { "External id": 244233, "cbid": 211, "correlation": 244233 } }, { "ph": "s", "id": 244233, "pid": 76337, "tid": -914061504, "ts": 1716454225199373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225325626, "dur": 98, "args": { "External id": 244242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244242, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244242, "pid": 5, "tid": 7, "ts": 1716454225325626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199684, "dur": 10, "args": { "External id": 244242, "cbid": 211, "correlation": 244242 } }, { "ph": "s", "id": 244242, "pid": 76337, "tid": -914061504, "ts": 1716454225199684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225325725, "dur": 90, "args": { "External id": 244262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 244262, "pid": 5, "tid": 7, "ts": 1716454225325725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199756, "dur": 12, "args": { "External id": 244262, "cbid": 211, "correlation": 244262 } }, { "ph": "s", "id": 244262, "pid": 76337, "tid": -914061504, "ts": 1716454225199756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225325816, "dur": 5, "args": { "External id": 244274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244274, "pid": 5, "tid": 7, "ts": 1716454225325816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199777, "dur": 10, "args": { "External id": 244274, "cbid": 211, "correlation": 244274 } }, { "ph": "s", "id": 244274, "pid": 76337, "tid": -914061504, "ts": 1716454225199777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225325822, "dur": 106, "args": { "External id": 244277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244277, "pid": 5, "tid": 7, "ts": 1716454225325822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199799, "dur": 105, "args": { "External id": 244277, "cbid": 211, "correlation": 244277 } }, { "ph": "s", "id": 244277, "pid": 76337, "tid": -914061504, "ts": 1716454225199799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225325930, "dur": 69, "args": { "External id": 244286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244286, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244286, "pid": 5, "tid": 7, "ts": 1716454225325930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225199937, "dur": 11, "args": { "External id": 244286, "cbid": 211, "correlation": 244286 } }, { "ph": "s", "id": 244286, "pid": 76337, "tid": -914061504, "ts": 1716454225199937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225199998, "dur": 0, "args": { "External id": 244296, "cbid": 317, "correlation": 244296 } }, { "ph": "f", "id": 244296, "pid": 76337, "tid": -914061504, "ts": 1716454225199998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225199999, "dur": 0, "args": { "External id": 244297, "cbid": 203, "correlation": 244297 } }, { "ph": "f", "id": 244297, "pid": 76337, "tid": -914061504, "ts": 1716454225199999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225199999, "dur": 0, "args": { "External id": 244298, "cbid": 205, "correlation": 244298 } }, { "ph": "f", "id": 244298, "pid": 76337, "tid": -914061504, "ts": 1716454225199999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225326000, "dur": 75, "args": { "External id": 244302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244302, "pid": 5, "tid": 7, "ts": 1716454225326000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200013, "dur": 12, "args": { "External id": 244302, "cbid": 211, "correlation": 244302 } }, { "ph": "s", "id": 244302, "pid": 76337, "tid": -914061504, "ts": 1716454225200013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225326077, "dur": 24, "args": { "External id": 244304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244304, "pid": 5, "tid": 7, "ts": 1716454225326077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200028, "dur": 5, "args": { "External id": 244304, "cbid": 211, "correlation": 244304 } }, { "ph": "s", "id": 244304, "pid": 76337, "tid": -914061504, "ts": 1716454225200028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225326102, "dur": 3, "args": { "External id": 244306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244306, "pid": 5, "tid": 7, "ts": 1716454225326102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200038, "dur": 7, "args": { "External id": 244306, "cbid": 211, "correlation": 244306 } }, { "ph": "s", "id": 244306, "pid": 76337, "tid": -914061504, "ts": 1716454225200038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225200049, "dur": 0, "args": { "External id": 244307, "cbid": 51, "correlation": 244307 } }, { "ph": "s", "id": 244307, "pid": 76337, "tid": -914061504, "ts": 1716454225200049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225326107, "dur": 1346, "args": { "External id": 244308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244308, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244308, "pid": 5, "tid": 7, "ts": 1716454225326107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200049, "dur": 5, "args": { "External id": 244308, "cbid": 211, "correlation": 244308 } }, { "ph": "s", "id": 244308, "pid": 76337, "tid": -914061504, "ts": 1716454225200049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225327454, "dur": 58, "args": { "External id": 244313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244313, "pid": 5, "tid": 7, "ts": 1716454225327454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200076, "dur": 9, "args": { "External id": 244313, "cbid": 211, "correlation": 244313 } }, { "ph": "s", "id": 244313, "pid": 76337, "tid": -914061504, "ts": 1716454225200076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225327513, "dur": 4, "args": { "External id": 244321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244321, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244321, "pid": 5, "tid": 7, "ts": 1716454225327513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200120, "dur": 9, "args": { "External id": 244321, "cbid": 211, "correlation": 244321 } }, { "ph": "s", "id": 244321, "pid": 76337, "tid": -914061504, "ts": 1716454225200120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225200184, "dur": 1, "args": { "External id": 244337, "cbid": 251, "correlation": 244337 } }, { "ph": "f", "id": 244337, "pid": 76337, "tid": -914061504, "ts": 1716454225200184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225200190, "dur": 0, "args": { "External id": 244339, "cbid": 251, "correlation": 244339 } }, { "ph": "f", "id": 244339, "pid": 76337, "tid": -914061504, "ts": 1716454225200190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225327518, "dur": 11, "args": { "External id": 244340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244340, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 244340, "pid": 5, "tid": 7, "ts": 1716454225327518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200193, "dur": 12, "args": { "External id": 244340, "cbid": 211, "correlation": 244340 } }, { "ph": "s", "id": 244340, "pid": 76337, "tid": -914061504, "ts": 1716454225200193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225327530, "dur": 5, "args": { "External id": 244342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244342, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244342, "pid": 5, "tid": 7, "ts": 1716454225327530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200206, "dur": 5, "args": { "External id": 244342, "cbid": 211, "correlation": 244342 } }, { "ph": "s", "id": 244342, "pid": 76337, "tid": -914061504, "ts": 1716454225200206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225327536, "dur": 53, "args": { "External id": 244352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244352, "pid": 5, "tid": 7, "ts": 1716454225327536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200263, "dur": 532, "args": { "External id": 244352, "cbid": 211, "correlation": 244352 } }, { "ph": "s", "id": 244352, "pid": 76337, "tid": -914061504, "ts": 1716454225200263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225327590, "dur": 51, "args": { "External id": 244372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244372, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 244372, "pid": 5, "tid": 7, "ts": 1716454225327590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200850, "dur": 12, "args": { "External id": 244372, "cbid": 211, "correlation": 244372 } }, { "ph": "s", "id": 244372, "pid": 76337, "tid": -914061504, "ts": 1716454225200850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225327643, "dur": 4, "args": { "External id": 244384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244384, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244384, "pid": 5, "tid": 7, "ts": 1716454225327643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200871, "dur": 6, "args": { "External id": 244384, "cbid": 211, "correlation": 244384 } }, { "ph": "s", "id": 244384, "pid": 76337, "tid": -914061504, "ts": 1716454225200871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225327648, "dur": 54, "args": { "External id": 244387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244387, "pid": 5, "tid": 7, "ts": 1716454225327648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200890, "dur": 6, "args": { "External id": 244387, "cbid": 211, "correlation": 244387 } }, { "ph": "s", "id": 244387, "pid": 76337, "tid": -914061504, "ts": 1716454225200890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225327704, "dur": 37, "args": { "External id": 244396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244396, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244396, "pid": 5, "tid": 7, "ts": 1716454225327704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225200931, "dur": 10, "args": { "External id": 244396, "cbid": 211, "correlation": 244396 } }, { "ph": "s", "id": 244396, "pid": 76337, "tid": -914061504, "ts": 1716454225200931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225201002, "dur": 0, "args": { "External id": 244406, "cbid": 317, "correlation": 244406 } }, { "ph": "f", "id": 244406, "pid": 76337, "tid": -914061504, "ts": 1716454225201002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225201002, "dur": 0, "args": { "External id": 244407, "cbid": 203, "correlation": 244407 } }, { "ph": "f", "id": 244407, "pid": 76337, "tid": -914061504, "ts": 1716454225201002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225201003, "dur": 0, "args": { "External id": 244408, "cbid": 205, "correlation": 244408 } }, { "ph": "f", "id": 244408, "pid": 76337, "tid": -914061504, "ts": 1716454225201003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225327742, "dur": 40, "args": { "External id": 244412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244412, "pid": 5, "tid": 7, "ts": 1716454225327742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201018, "dur": 12, "args": { "External id": 244412, "cbid": 211, "correlation": 244412 } }, { "ph": "s", "id": 244412, "pid": 76337, "tid": -914061504, "ts": 1716454225201018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225327783, "dur": 14, "args": { "External id": 244414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244414, "pid": 5, "tid": 7, "ts": 1716454225327783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201033, "dur": 5, "args": { "External id": 244414, "cbid": 211, "correlation": 244414 } }, { "ph": "s", "id": 244414, "pid": 76337, "tid": -914061504, "ts": 1716454225201033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225327798, "dur": 3, "args": { "External id": 244416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244416, "pid": 5, "tid": 7, "ts": 1716454225327798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201042, "dur": 6, "args": { "External id": 244416, "cbid": 211, "correlation": 244416 } }, { "ph": "s", "id": 244416, "pid": 76337, "tid": -914061504, "ts": 1716454225201042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225201050, "dur": 0, "args": { "External id": 244417, "cbid": 51, "correlation": 244417 } }, { "ph": "s", "id": 244417, "pid": 76337, "tid": -914061504, "ts": 1716454225201050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225327803, "dur": 686, "args": { "External id": 244418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244418, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244418, "pid": 5, "tid": 7, "ts": 1716454225327803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201051, "dur": 5, "args": { "External id": 244418, "cbid": 211, "correlation": 244418 } }, { "ph": "s", "id": 244418, "pid": 76337, "tid": -914061504, "ts": 1716454225201051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225328490, "dur": 57, "args": { "External id": 244423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244423, "pid": 5, "tid": 7, "ts": 1716454225328490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201079, "dur": 9, "args": { "External id": 244423, "cbid": 211, "correlation": 244423 } }, { "ph": "s", "id": 244423, "pid": 76337, "tid": -914061504, "ts": 1716454225201079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225201137, "dur": 0, "args": { "External id": 244433, "cbid": 317, "correlation": 244433 } }, { "ph": "f", "id": 244433, "pid": 76337, "tid": -914061504, "ts": 1716454225201137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225201138, "dur": 0, "args": { "External id": 244434, "cbid": 203, "correlation": 244434 } }, { "ph": "f", "id": 244434, "pid": 76337, "tid": -914061504, "ts": 1716454225201138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225201138, "dur": 0, "args": { "External id": 244435, "cbid": 205, "correlation": 244435 } }, { "ph": "f", "id": 244435, "pid": 76337, "tid": -914061504, "ts": 1716454225201138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225328549, "dur": 75, "args": { "External id": 244439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244439, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244439, "pid": 5, "tid": 7, "ts": 1716454225328549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201150, "dur": 12, "args": { "External id": 244439, "cbid": 211, "correlation": 244439 } }, { "ph": "s", "id": 244439, "pid": 76337, "tid": -914061504, "ts": 1716454225201150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225328625, "dur": 206, "args": { "External id": 244441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244441, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244441, "pid": 5, "tid": 7, "ts": 1716454225328625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201169, "dur": 6, "args": { "External id": 244441, "cbid": 211, "correlation": 244441 } }, { "ph": "s", "id": 244441, "pid": 76337, "tid": -914061504, "ts": 1716454225201169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225328832, "dur": 39, "args": { "External id": 244443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244443, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244443, "pid": 5, "tid": 7, "ts": 1716454225328832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201179, "dur": 6, "args": { "External id": 244443, "cbid": 211, "correlation": 244443 } }, { "ph": "s", "id": 244443, "pid": 76337, "tid": -914061504, "ts": 1716454225201179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225328873, "dur": 58, "args": { "External id": 244449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244449, "pid": 5, "tid": 7, "ts": 1716454225328873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201205, "dur": 493, "args": { "External id": 244449, "cbid": 211, "correlation": 244449 } }, { "ph": "s", "id": 244449, "pid": 76337, "tid": -914061504, "ts": 1716454225201205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225328933, "dur": 51, "args": { "External id": 244457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244457, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244457, "pid": 5, "tid": 7, "ts": 1716454225328933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201721, "dur": 8, "args": { "External id": 244457, "cbid": 211, "correlation": 244457 } }, { "ph": "s", "id": 244457, "pid": 76337, "tid": -914061504, "ts": 1716454225201721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225328985, "dur": 35, "args": { "External id": 244465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244465, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244465, "pid": 5, "tid": 7, "ts": 1716454225328985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201750, "dur": 8, "args": { "External id": 244465, "cbid": 211, "correlation": 244465 } }, { "ph": "s", "id": 244465, "pid": 76337, "tid": -914061504, "ts": 1716454225201750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225329021, "dur": 50, "args": { "External id": 244485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244485, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 244485, "pid": 5, "tid": 7, "ts": 1716454225329021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201831, "dur": 12, "args": { "External id": 244485, "cbid": 211, "correlation": 244485 } }, { "ph": "s", "id": 244485, "pid": 76337, "tid": -914061504, "ts": 1716454225201831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225329073, "dur": 4, "args": { "External id": 244497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244497, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244497, "pid": 5, "tid": 7, "ts": 1716454225329073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201854, "dur": 6, "args": { "External id": 244497, "cbid": 211, "correlation": 244497 } }, { "ph": "s", "id": 244497, "pid": 76337, "tid": -914061504, "ts": 1716454225201854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225329078, "dur": 55, "args": { "External id": 244500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244500, "pid": 5, "tid": 7, "ts": 1716454225329078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201872, "dur": 6, "args": { "External id": 244500, "cbid": 211, "correlation": 244500 } }, { "ph": "s", "id": 244500, "pid": 76337, "tid": -914061504, "ts": 1716454225201872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225201930, "dur": 0, "args": { "External id": 244511, "cbid": 317, "correlation": 244511 } }, { "ph": "f", "id": 244511, "pid": 76337, "tid": -914061504, "ts": 1716454225201930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225201930, "dur": 0, "args": { "External id": 244512, "cbid": 203, "correlation": 244512 } }, { "ph": "f", "id": 244512, "pid": 76337, "tid": -914061504, "ts": 1716454225201930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225201931, "dur": 0, "args": { "External id": 244513, "cbid": 205, "correlation": 244513 } }, { "ph": "f", "id": 244513, "pid": 76337, "tid": -914061504, "ts": 1716454225201931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201952, "dur": 1, "args": { "External id": 244517, "cbid": 251, "correlation": 244517 } }, { "ph": "f", "id": 244517, "pid": 76337, "tid": -914061504, "ts": 1716454225201952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201954, "dur": 0, "args": { "External id": 244518, "cbid": 251, "correlation": 244518 } }, { "ph": "f", "id": 244518, "pid": 76337, "tid": -914061504, "ts": 1716454225201954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201955, "dur": 0, "args": { "External id": 244519, "cbid": 251, "correlation": 244519 } }, { "ph": "f", "id": 244519, "pid": 76337, "tid": -914061504, "ts": 1716454225201955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201956, "dur": 0, "args": { "External id": 244520, "cbid": 251, "correlation": 244520 } }, { "ph": "f", "id": 244520, "pid": 76337, "tid": -914061504, "ts": 1716454225201956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201957, "dur": 0, "args": { "External id": 244521, "cbid": 251, "correlation": 244521 } }, { "ph": "f", "id": 244521, "pid": 76337, "tid": -914061504, "ts": 1716454225201957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201957, "dur": 0, "args": { "External id": 244522, "cbid": 251, "correlation": 244522 } }, { "ph": "f", "id": 244522, "pid": 76337, "tid": -914061504, "ts": 1716454225201957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201958, "dur": 0, "args": { "External id": 244523, "cbid": 251, "correlation": 244523 } }, { "ph": "f", "id": 244523, "pid": 76337, "tid": -914061504, "ts": 1716454225201958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201959, "dur": 0, "args": { "External id": 244524, "cbid": 251, "correlation": 244524 } }, { "ph": "f", "id": 244524, "pid": 76337, "tid": -914061504, "ts": 1716454225201959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225201960, "dur": 0, "args": { "External id": 244525, "cbid": 251, "correlation": 244525 } }, { "ph": "f", "id": 244525, "pid": 76337, "tid": -914061504, "ts": 1716454225201960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225329134, "dur": 111, "args": { "External id": 244526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244526, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244526, "pid": 5, "tid": 7, "ts": 1716454225329134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225201962, "dur": 21, "args": { "External id": 244526, "cbid": 211, "correlation": 244526 } }, { "ph": "s", "id": 244526, "pid": 76337, "tid": -914061504, "ts": 1716454225201962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225329247, "dur": 59, "args": { "External id": 244532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244532, "pid": 5, "tid": 7, "ts": 1716454225329247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202007, "dur": 10, "args": { "External id": 244532, "cbid": 211, "correlation": 244532 } }, { "ph": "s", "id": 244532, "pid": 76337, "tid": -914061504, "ts": 1716454225202007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225329307, "dur": 629, "args": { "External id": 244541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244541, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244541, "pid": 5, "tid": 7, "ts": 1716454225329307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202090, "dur": 13, "args": { "External id": 244541, "cbid": 211, "correlation": 244541 } }, { "ph": "s", "id": 244541, "pid": 76337, "tid": -914061504, "ts": 1716454225202090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225329937, "dur": 176, "args": { "External id": 244563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244563, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244563, "pid": 5, "tid": 7, "ts": 1716454225329937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202148, "dur": 10, "args": { "External id": 244563, "cbid": 211, "correlation": 244563 } }, { "ph": "s", "id": 244563, "pid": 76337, "tid": -914061504, "ts": 1716454225202148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225202233, "dur": 1, "args": { "External id": 244574, "cbid": 251, "correlation": 244574 } }, { "ph": "f", "id": 244574, "pid": 76337, "tid": -914061504, "ts": 1716454225202233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225330115, "dur": 193, "args": { "External id": 244575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244575, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244575, "pid": 5, "tid": 7, "ts": 1716454225330115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202238, "dur": 13, "args": { "External id": 244575, "cbid": 211, "correlation": 244575 } }, { "ph": "s", "id": 244575, "pid": 76337, "tid": -914061504, "ts": 1716454225202238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225202306, "dur": 1, "args": { "External id": 244586, "cbid": 251, "correlation": 244586 } }, { "ph": "f", "id": 244586, "pid": 76337, "tid": -914061504, "ts": 1716454225202306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225330309, "dur": 182, "args": { "External id": 244587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244587, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244587, "pid": 5, "tid": 7, "ts": 1716454225330309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202310, "dur": 12, "args": { "External id": 244587, "cbid": 211, "correlation": 244587 } }, { "ph": "s", "id": 244587, "pid": 76337, "tid": -914061504, "ts": 1716454225202310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225202373, "dur": 1, "args": { "External id": 244598, "cbid": 251, "correlation": 244598 } }, { "ph": "f", "id": 244598, "pid": 76337, "tid": -914061504, "ts": 1716454225202373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225330493, "dur": 182, "args": { "External id": 244599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244599, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244599, "pid": 5, "tid": 7, "ts": 1716454225330493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202376, "dur": 11, "args": { "External id": 244599, "cbid": 211, "correlation": 244599 } }, { "ph": "s", "id": 244599, "pid": 76337, "tid": -914061504, "ts": 1716454225202376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225330676, "dur": 18930, "args": { "External id": 244620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244620, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244620, "pid": 5, "tid": 7, "ts": 1716454225330676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202456, "dur": 13, "args": { "External id": 244620, "cbid": 211, "correlation": 244620 } }, { "ph": "s", "id": 244620, "pid": 76337, "tid": -914061504, "ts": 1716454225202456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225202552, "dur": 1, "args": { "External id": 244638, "cbid": 251, "correlation": 244638 } }, { "ph": "f", "id": 244638, "pid": 76337, "tid": -914061504, "ts": 1716454225202552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225349607, "dur": 204, "args": { "External id": 244640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244640, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244640, "pid": 5, "tid": 7, "ts": 1716454225349607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202558, "dur": 13, "args": { "External id": 244640, "cbid": 211, "correlation": 244640 } }, { "ph": "s", "id": 244640, "pid": 76337, "tid": -914061504, "ts": 1716454225202558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225349813, "dur": 67, "args": { "External id": 244648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244648, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244648, "pid": 5, "tid": 7, "ts": 1716454225349813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202628, "dur": 12, "args": { "External id": 244648, "cbid": 211, "correlation": 244648 } }, { "ph": "s", "id": 244648, "pid": 76337, "tid": -914061504, "ts": 1716454225202628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225349881, "dur": 96, "args": { "External id": 244656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244656, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244656, "pid": 5, "tid": 7, "ts": 1716454225349881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202667, "dur": 80, "args": { "External id": 244656, "cbid": 211, "correlation": 244656 } }, { "ph": "s", "id": 244656, "pid": 76337, "tid": -914061504, "ts": 1716454225202667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225349978, "dur": 55, "args": { "External id": 244667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244667, "pid": 5, "tid": 7, "ts": 1716454225349978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225202810, "dur": 1850, "args": { "External id": 244667, "cbid": 211, "correlation": 244667 } }, { "ph": "s", "id": 244667, "pid": 76337, "tid": -914061504, "ts": 1716454225202810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225350034, "dur": 94, "args": { "External id": 244689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244689, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244689, "pid": 5, "tid": 7, "ts": 1716454225350034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225204679, "dur": 126, "args": { "External id": 244689, "cbid": 211, "correlation": 244689 } }, { "ph": "s", "id": 244689, "pid": 76337, "tid": -914061504, "ts": 1716454225204679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225204881, "dur": 1, "args": { "External id": 244700, "cbid": 251, "correlation": 244700 } }, { "ph": "f", "id": 244700, "pid": 76337, "tid": -914061504, "ts": 1716454225204881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225350129, "dur": 105, "args": { "External id": 244701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244701, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244701, "pid": 5, "tid": 7, "ts": 1716454225350129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225204886, "dur": 13, "args": { "External id": 244701, "cbid": 211, "correlation": 244701 } }, { "ph": "s", "id": 244701, "pid": 76337, "tid": -914061504, "ts": 1716454225204886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225204957, "dur": 1, "args": { "External id": 244712, "cbid": 251, "correlation": 244712 } }, { "ph": "f", "id": 244712, "pid": 76337, "tid": -914061504, "ts": 1716454225204957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225204960, "dur": 0, "args": { "External id": 244713, "cbid": 251, "correlation": 244713 } }, { "ph": "f", "id": 244713, "pid": 76337, "tid": -914061504, "ts": 1716454225204960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225350236, "dur": 10, "args": { "External id": 244714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244714, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 244714, "pid": 5, "tid": 7, "ts": 1716454225350236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225204962, "dur": 19, "args": { "External id": 244714, "cbid": 211, "correlation": 244714 } }, { "ph": "s", "id": 244714, "pid": 76337, "tid": -914061504, "ts": 1716454225204962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225350247, "dur": 5, "args": { "External id": 244716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244716, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244716, "pid": 5, "tid": 7, "ts": 1716454225350247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225204983, "dur": 6, "args": { "External id": 244716, "cbid": 211, "correlation": 244716 } }, { "ph": "s", "id": 244716, "pid": 76337, "tid": -914061504, "ts": 1716454225204983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205045, "dur": 1, "args": { "External id": 244727, "cbid": 251, "correlation": 244727 } }, { "ph": "f", "id": 244727, "pid": 76337, "tid": -914061504, "ts": 1716454225205045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205049, "dur": 0, "args": { "External id": 244728, "cbid": 251, "correlation": 244728 } }, { "ph": "f", "id": 244728, "pid": 76337, "tid": -914061504, "ts": 1716454225205049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225350254, "dur": 6, "args": { "External id": 244729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244729, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 244729, "pid": 5, "tid": 7, "ts": 1716454225350254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205051, "dur": 12, "args": { "External id": 244729, "cbid": 211, "correlation": 244729 } }, { "ph": "s", "id": 244729, "pid": 76337, "tid": -914061504, "ts": 1716454225205051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225350261, "dur": 3, "args": { "External id": 244731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244731, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 244731, "pid": 5, "tid": 7, "ts": 1716454225350261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205064, "dur": 6, "args": { "External id": 244731, "cbid": 211, "correlation": 244731 } }, { "ph": "s", "id": 244731, "pid": 76337, "tid": -914061504, "ts": 1716454225205064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225350266, "dur": 158, "args": { "External id": 244752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244752, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244752, "pid": 5, "tid": 7, "ts": 1716454225350266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205137, "dur": 13, "args": { "External id": 244752, "cbid": 211, "correlation": 244752 } }, { "ph": "s", "id": 244752, "pid": 76337, "tid": -914061504, "ts": 1716454225205137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205233, "dur": 1, "args": { "External id": 244770, "cbid": 251, "correlation": 244770 } }, { "ph": "f", "id": 244770, "pid": 76337, "tid": -914061504, "ts": 1716454225205233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225350426, "dur": 108, "args": { "External id": 244772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244772, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244772, "pid": 5, "tid": 7, "ts": 1716454225350426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205239, "dur": 14, "args": { "External id": 244772, "cbid": 211, "correlation": 244772 } }, { "ph": "s", "id": 244772, "pid": 76337, "tid": -914061504, "ts": 1716454225205239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225350535, "dur": 35, "args": { "External id": 244780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244780, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244780, "pid": 5, "tid": 7, "ts": 1716454225350535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205309, "dur": 12, "args": { "External id": 244780, "cbid": 211, "correlation": 244780 } }, { "ph": "s", "id": 244780, "pid": 76337, "tid": -914061504, "ts": 1716454225205309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225350571, "dur": 69, "args": { "External id": 244788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244788, "pid": 5, "tid": 7, "ts": 1716454225350571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205351, "dur": 9, "args": { "External id": 244788, "cbid": 211, "correlation": 244788 } }, { "ph": "s", "id": 244788, "pid": 76337, "tid": -914061504, "ts": 1716454225205351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225350641, "dur": 93, "args": { "External id": 244810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244810, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244810, "pid": 5, "tid": 7, "ts": 1716454225350641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205401, "dur": 10, "args": { "External id": 244810, "cbid": 211, "correlation": 244810 } }, { "ph": "s", "id": 244810, "pid": 76337, "tid": -914061504, "ts": 1716454225205401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205486, "dur": 1, "args": { "External id": 244826, "cbid": 251, "correlation": 244826 } }, { "ph": "f", "id": 244826, "pid": 76337, "tid": -914061504, "ts": 1716454225205486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225350736, "dur": 584, "args": { "External id": 244828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244828, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 244828, "pid": 5, "tid": 7, "ts": 1716454225350736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205492, "dur": 13, "args": { "External id": 244828, "cbid": 211, "correlation": 244828 } }, { "ph": "s", "id": 244828, "pid": 76337, "tid": -914061504, "ts": 1716454225205492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225351321, "dur": 246, "args": { "External id": 244836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244836, "pid": 5, "tid": 7, "ts": 1716454225351321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205558, "dur": 13, "args": { "External id": 244836, "cbid": 211, "correlation": 244836 } }, { "ph": "s", "id": 244836, "pid": 76337, "tid": -914061504, "ts": 1716454225205558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225351569, "dur": 252, "args": { "External id": 244844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244844, "pid": 5, "tid": 7, "ts": 1716454225351569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205588, "dur": 9, "args": { "External id": 244844, "cbid": 211, "correlation": 244844 } }, { "ph": "s", "id": 244844, "pid": 76337, "tid": -914061504, "ts": 1716454225205588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205669, "dur": 1, "args": { "External id": 244860, "cbid": 251, "correlation": 244860 } }, { "ph": "f", "id": 244860, "pid": 76337, "tid": -914061504, "ts": 1716454225205669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225205674, "dur": 0, "args": { "External id": 244862, "cbid": 251, "correlation": 244862 } }, { "ph": "f", "id": 244862, "pid": 76337, "tid": -914061504, "ts": 1716454225205674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225351823, "dur": 361, "args": { "External id": 244863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244863, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 244863, "pid": 5, "tid": 7, "ts": 1716454225351823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205677, "dur": 13, "args": { "External id": 244863, "cbid": 211, "correlation": 244863 } }, { "ph": "s", "id": 244863, "pid": 76337, "tid": -914061504, "ts": 1716454225205677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225352185, "dur": 50, "args": { "External id": 244871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244871, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244871, "pid": 5, "tid": 7, "ts": 1716454225352185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205719, "dur": 179, "args": { "External id": 244871, "cbid": 211, "correlation": 244871 } }, { "ph": "s", "id": 244871, "pid": 76337, "tid": -914061504, "ts": 1716454225205719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225352236, "dur": 161, "args": { "External id": 244882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244882, "pid": 5, "tid": 7, "ts": 1716454225352236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225205955, "dur": 70, "args": { "External id": 244882, "cbid": 211, "correlation": 244882 } }, { "ph": "s", "id": 244882, "pid": 76337, "tid": -914061504, "ts": 1716454225205955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225206079, "dur": 0, "args": { "External id": 244894, "cbid": 317, "correlation": 244894 } }, { "ph": "f", "id": 244894, "pid": 76337, "tid": -914061504, "ts": 1716454225206079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225206080, "dur": 0, "args": { "External id": 244895, "cbid": 203, "correlation": 244895 } }, { "ph": "f", "id": 244895, "pid": 76337, "tid": -914061504, "ts": 1716454225206080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225206080, "dur": 0, "args": { "External id": 244896, "cbid": 205, "correlation": 244896 } }, { "ph": "f", "id": 244896, "pid": 76337, "tid": -914061504, "ts": 1716454225206080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206103, "dur": 1, "args": { "External id": 244900, "cbid": 251, "correlation": 244900 } }, { "ph": "f", "id": 244900, "pid": 76337, "tid": -914061504, "ts": 1716454225206103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206104, "dur": 0, "args": { "External id": 244901, "cbid": 251, "correlation": 244901 } }, { "ph": "f", "id": 244901, "pid": 76337, "tid": -914061504, "ts": 1716454225206104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206105, "dur": 0, "args": { "External id": 244902, "cbid": 251, "correlation": 244902 } }, { "ph": "f", "id": 244902, "pid": 76337, "tid": -914061504, "ts": 1716454225206105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206106, "dur": 0, "args": { "External id": 244903, "cbid": 251, "correlation": 244903 } }, { "ph": "f", "id": 244903, "pid": 76337, "tid": -914061504, "ts": 1716454225206106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206107, "dur": 0, "args": { "External id": 244904, "cbid": 251, "correlation": 244904 } }, { "ph": "f", "id": 244904, "pid": 76337, "tid": -914061504, "ts": 1716454225206107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206107, "dur": 0, "args": { "External id": 244905, "cbid": 251, "correlation": 244905 } }, { "ph": "f", "id": 244905, "pid": 76337, "tid": -914061504, "ts": 1716454225206107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206108, "dur": 0, "args": { "External id": 244906, "cbid": 251, "correlation": 244906 } }, { "ph": "f", "id": 244906, "pid": 76337, "tid": -914061504, "ts": 1716454225206108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206109, "dur": 0, "args": { "External id": 244907, "cbid": 251, "correlation": 244907 } }, { "ph": "f", "id": 244907, "pid": 76337, "tid": -914061504, "ts": 1716454225206109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206110, "dur": 0, "args": { "External id": 244908, "cbid": 251, "correlation": 244908 } }, { "ph": "f", "id": 244908, "pid": 76337, "tid": -914061504, "ts": 1716454225206110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225352398, "dur": 117, "args": { "External id": 244909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244909, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 244909, "pid": 5, "tid": 7, "ts": 1716454225352398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206112, "dur": 41, "args": { "External id": 244909, "cbid": 211, "correlation": 244909 } }, { "ph": "s", "id": 244909, "pid": 76337, "tid": -914061504, "ts": 1716454225206112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225352517, "dur": 60, "args": { "External id": 244915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244915, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244915, "pid": 5, "tid": 7, "ts": 1716454225352517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206176, "dur": 279, "args": { "External id": 244915, "cbid": 211, "correlation": 244915 } }, { "ph": "s", "id": 244915, "pid": 76337, "tid": -914061504, "ts": 1716454225206176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225352578, "dur": 50, "args": { "External id": 244923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244923, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244923, "pid": 5, "tid": 7, "ts": 1716454225352578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206479, "dur": 9, "args": { "External id": 244923, "cbid": 211, "correlation": 244923 } }, { "ph": "s", "id": 244923, "pid": 76337, "tid": -914061504, "ts": 1716454225206479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225352629, "dur": 52, "args": { "External id": 244943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244943, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 244943, "pid": 5, "tid": 7, "ts": 1716454225352629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206557, "dur": 12, "args": { "External id": 244943, "cbid": 211, "correlation": 244943 } }, { "ph": "s", "id": 244943, "pid": 76337, "tid": -914061504, "ts": 1716454225206557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225352683, "dur": 5, "args": { "External id": 244955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244955, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244955, "pid": 5, "tid": 7, "ts": 1716454225352683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206579, "dur": 10, "args": { "External id": 244955, "cbid": 211, "correlation": 244955 } }, { "ph": "s", "id": 244955, "pid": 76337, "tid": -914061504, "ts": 1716454225206579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225352689, "dur": 56, "args": { "External id": 244958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244958, "pid": 5, "tid": 7, "ts": 1716454225352689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206601, "dur": 105, "args": { "External id": 244958, "cbid": 211, "correlation": 244958 } }, { "ph": "s", "id": 244958, "pid": 76337, "tid": -914061504, "ts": 1716454225206601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225352746, "dur": 38, "args": { "External id": 244967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244967, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244967, "pid": 5, "tid": 7, "ts": 1716454225352746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206745, "dur": 10, "args": { "External id": 244967, "cbid": 211, "correlation": 244967 } }, { "ph": "s", "id": 244967, "pid": 76337, "tid": -914061504, "ts": 1716454225206745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225206800, "dur": 0, "args": { "External id": 244977, "cbid": 317, "correlation": 244977 } }, { "ph": "f", "id": 244977, "pid": 76337, "tid": -914061504, "ts": 1716454225206800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225206801, "dur": 0, "args": { "External id": 244978, "cbid": 203, "correlation": 244978 } }, { "ph": "f", "id": 244978, "pid": 76337, "tid": -914061504, "ts": 1716454225206801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225206802, "dur": 0, "args": { "External id": 244979, "cbid": 205, "correlation": 244979 } }, { "ph": "f", "id": 244979, "pid": 76337, "tid": -914061504, "ts": 1716454225206802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225352785, "dur": 41, "args": { "External id": 244983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244983, "pid": 5, "tid": 7, "ts": 1716454225352785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206819, "dur": 12, "args": { "External id": 244983, "cbid": 211, "correlation": 244983 } }, { "ph": "s", "id": 244983, "pid": 76337, "tid": -914061504, "ts": 1716454225206819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225352828, "dur": 3, "args": { "External id": 244985, "device": 5, "context": 1, "stream": 7, "correlation": 244985, "bytes": 46080, "memory bandwidth (GB/s)": 12.203389830508474 } }, { "ph": "f", "id": 244985, "pid": 5, "tid": 7, "ts": 1716454225352828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225206834, "dur": 17, "args": { "External id": 244985, "cbid": 51, "correlation": 244985 } }, { "ph": "s", "id": 244985, "pid": 76337, "tid": -914061504, "ts": 1716454225206834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225206857, "dur": 1, "args": { "External id": 244987, "cbid": 200, "correlation": 244987 } }, { "ph": "f", "id": 244987, "pid": 76337, "tid": -914061504, "ts": 1716454225206857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225206859, "dur": 0, "args": { "External id": 244988, "cbid": 200, "correlation": 244988 } }, { "ph": "f", "id": 244988, "pid": 76337, "tid": -914061504, "ts": 1716454225206859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225206859, "dur": 0, "args": { "External id": 244989, "cbid": 200, "correlation": 244989 } }, { "ph": "f", "id": 244989, "pid": 76337, "tid": -914061504, "ts": 1716454225206859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225206860, "dur": 0, "args": { "External id": 244990, "cbid": 200, "correlation": 244990 } }, { "ph": "f", "id": 244990, "pid": 76337, "tid": -914061504, "ts": 1716454225206860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454225206861, "dur": 4, "args": { "External id": 244991, "cbid": 15, "correlation": 244991 } }, { "ph": "f", "id": 244991, "pid": 76337, "tid": -914061504, "ts": 1716454225206861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225206867, "dur": 1, "args": { "External id": 244992, "cbid": 251, "correlation": 244992 } }, { "ph": "f", "id": 244992, "pid": 76337, "tid": -914061504, "ts": 1716454225206867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454225352832, "dur": 25, "args": { "External id": 244993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244993, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244993, "pid": 5, "tid": 7, "ts": 1716454225352832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206870, "dur": 8, "args": { "External id": 244993, "cbid": 211, "correlation": 244993 } }, { "ph": "s", "id": 244993, "pid": 76337, "tid": -914061504, "ts": 1716454225206870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225352859, "dur": 4, "args": { "External id": 244995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244995, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 244995, "pid": 5, "tid": 7, "ts": 1716454225352859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206883, "dur": 6, "args": { "External id": 244995, "cbid": 211, "correlation": 244995 } }, { "ph": "s", "id": 244995, "pid": 76337, "tid": -914061504, "ts": 1716454225206883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225206892, "dur": 0, "args": { "External id": 244996, "cbid": 51, "correlation": 244996 } }, { "ph": "s", "id": 244996, "pid": 76337, "tid": -914061504, "ts": 1716454225206892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225352864, "dur": 193, "args": { "External id": 244997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244997, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 244997, "pid": 5, "tid": 7, "ts": 1716454225352864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225206893, "dur": 199, "args": { "External id": 244997, "cbid": 211, "correlation": 244997 } }, { "ph": "s", "id": 244997, "pid": 76337, "tid": -914061504, "ts": 1716454225206893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225353058, "dur": 6, "args": { "External id": 244998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 244998, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 244998, "pid": 5, "tid": 7, "ts": 1716454225353058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225207096, "dur": 6, "args": { "External id": 244998, "cbid": 211, "correlation": 244998 } }, { "ph": "s", "id": 244998, "pid": 76337, "tid": -914061504, "ts": 1716454225207096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225353066, "dur": 5, "args": { "External id": 245004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 245004, "pid": 5, "tid": 7, "ts": 1716454225353066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225207125, "dur": 10, "args": { "External id": 245004, "cbid": 211, "correlation": 245004 } }, { "ph": "s", "id": 245004, "pid": 76337, "tid": -914061504, "ts": 1716454225207125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353072, "dur": 3, "args": { "External id": 245012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245012, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245012, "pid": 5, "tid": 7, "ts": 1716454225353072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225208784, "dur": 16, "args": { "External id": 245012, "cbid": 211, "correlation": 245012 } }, { "ph": "s", "id": 245012, "pid": 76337, "tid": -914061504, "ts": 1716454225208784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353076, "dur": 3, "args": { "External id": 245020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245020, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245020, "pid": 5, "tid": 7, "ts": 1716454225353076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225208825, "dur": 11, "args": { "External id": 245020, "cbid": 211, "correlation": 245020 } }, { "ph": "s", "id": 245020, "pid": 76337, "tid": -914061504, "ts": 1716454225208825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353080, "dur": 3, "args": { "External id": 245028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245028, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245028, "pid": 5, "tid": 7, "ts": 1716454225353080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225208853, "dur": 8, "args": { "External id": 245028, "cbid": 211, "correlation": 245028 } }, { "ph": "s", "id": 245028, "pid": 76337, "tid": -914061504, "ts": 1716454225208853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353084, "dur": 3, "args": { "External id": 245037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245037, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245037, "pid": 5, "tid": 7, "ts": 1716454225353084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225209033, "dur": 14, "args": { "External id": 245037, "cbid": 211, "correlation": 245037 } }, { "ph": "s", "id": 245037, "pid": 76337, "tid": -914061504, "ts": 1716454225209033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353088, "dur": 3, "args": { "External id": 245046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245046, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245046, "pid": 5, "tid": 7, "ts": 1716454225353088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225209062, "dur": 7, "args": { "External id": 245046, "cbid": 211, "correlation": 245046 } }, { "ph": "s", "id": 245046, "pid": 76337, "tid": -914061504, "ts": 1716454225209062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353092, "dur": 3, "args": { "External id": 245054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245054, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245054, "pid": 5, "tid": 7, "ts": 1716454225353092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225209088, "dur": 8, "args": { "External id": 245054, "cbid": 211, "correlation": 245054 } }, { "ph": "s", "id": 245054, "pid": 76337, "tid": -914061504, "ts": 1716454225209088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353097, "dur": 3, "args": { "External id": 245062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245062, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245062, "pid": 5, "tid": 7, "ts": 1716454225353097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225209346, "dur": 15, "args": { "External id": 245062, "cbid": 211, "correlation": 245062 } }, { "ph": "s", "id": 245062, "pid": 76337, "tid": -914061504, "ts": 1716454225209346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353101, "dur": 3, "args": { "External id": 245070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245070, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245070, "pid": 5, "tid": 7, "ts": 1716454225353101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225209377, "dur": 7, "args": { "External id": 245070, "cbid": 211, "correlation": 245070 } }, { "ph": "s", "id": 245070, "pid": 76337, "tid": -914061504, "ts": 1716454225209377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353106, "dur": 1, "args": { "External id": 245080, "device": 5, "context": 1, "stream": 7, "correlation": 245080, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 245080, "pid": 5, "tid": 7, "ts": 1716454225353106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225209441, "dur": 35, "args": { "External id": 245080, "cbid": 41, "correlation": 245080 } }, { "ph": "s", "id": 245080, "pid": 76337, "tid": -914061504, "ts": 1716454225209441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225209477, "dur": 143645, "args": { "External id": 245081, "cbid": 131, "correlation": 245081 } }, { "ph": "f", "id": 245081, "pid": 76337, "tid": -914061504, "ts": 1716454225209477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225353278, "dur": 3, "args": { "External id": 245089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245089, "pid": 5, "tid": 7, "ts": 1716454225353278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353255, "dur": 25, "args": { "External id": 245089, "cbid": 211, "correlation": 245089 } }, { "ph": "s", "id": 245089, "pid": 76337, "tid": -914061504, "ts": 1716454225353255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353371, "dur": 3, "args": { "External id": 245098, "device": 5, "context": 1, "stream": 7, "correlation": 245098, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 245098, "pid": 5, "tid": 7, "ts": 1716454225353371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353342, "dur": 30, "args": { "External id": 245098, "cbid": 41, "correlation": 245098 } }, { "ph": "s", "id": 245098, "pid": 76337, "tid": -914061504, "ts": 1716454225353342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225353466, "dur": 4, "args": { "External id": 245108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245108, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245108, "pid": 5, "tid": 7, "ts": 1716454225353466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353451, "dur": 17, "args": { "External id": 245108, "cbid": 211, "correlation": 245108 } }, { "ph": "s", "id": 245108, "pid": 76337, "tid": -914061504, "ts": 1716454225353451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353538, "dur": 1, "args": { "External id": 245118, "device": 5, "context": 1, "stream": 7, "correlation": 245118, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 245118, "pid": 5, "tid": 7, "ts": 1716454225353538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353520, "dur": 16, "args": { "External id": 245118, "cbid": 41, "correlation": 245118 } }, { "ph": "s", "id": 245118, "pid": 76337, "tid": -914061504, "ts": 1716454225353520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225353537, "dur": 8, "args": { "External id": 245119, "cbid": 131, "correlation": 245119 } }, { "ph": "f", "id": 245119, "pid": 76337, "tid": -914061504, "ts": 1716454225353537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353603, "dur": 3, "args": { "External id": 245126, "device": 5, "context": 1, "stream": 7, "correlation": 245126, "bytes": 98304, "memory bandwidth (GB/s)": 30.11764705882353 } }, { "ph": "f", "id": 245126, "pid": 5, "tid": 7, "ts": 1716454225353603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353583, "dur": 19, "args": { "External id": 245126, "cbid": 41, "correlation": 245126 } }, { "ph": "s", "id": 245126, "pid": 76337, "tid": -914061504, "ts": 1716454225353583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353691, "dur": 3, "args": { "External id": 245145, "device": 5, "context": 1, "stream": 7, "correlation": 245145, "bytes": 16, "memory bandwidth (GB/s)": 0.005263157894736842 } }, { "ph": "f", "id": 245145, "pid": 5, "tid": 7, "ts": 1716454225353691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353671, "dur": 19, "args": { "External id": 245145, "cbid": 41, "correlation": 245145 } }, { "ph": "s", "id": 245145, "pid": 76337, "tid": -914061504, "ts": 1716454225353671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225353730, "dur": 3, "args": { "External id": 245151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245151, "pid": 5, "tid": 7, "ts": 1716454225353730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353719, "dur": 11, "args": { "External id": 245151, "cbid": 211, "correlation": 245151 } }, { "ph": "s", "id": 245151, "pid": 76337, "tid": -914061504, "ts": 1716454225353719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454225353745, "dur": 6, "args": { "External id": 245153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245153, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 245153, "pid": 5, "tid": 7, "ts": 1716454225353745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353734, "dur": 10, "args": { "External id": 245153, "cbid": 211, "correlation": 245153 } }, { "ph": "s", "id": 245153, "pid": 76337, "tid": -914061504, "ts": 1716454225353734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454225353753, "dur": 3, "args": { "External id": 245155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245155, "pid": 5, "tid": 7, "ts": 1716454225353753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353745, "dur": 7, "args": { "External id": 245155, "cbid": 211, "correlation": 245155 } }, { "ph": "s", "id": 245155, "pid": 76337, "tid": -914061504, "ts": 1716454225353745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353787, "dur": 2, "args": { "External id": 245163, "device": 5, "context": 1, "stream": 7, "correlation": 245163, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 245163, "pid": 5, "tid": 7, "ts": 1716454225353787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353772, "dur": 13, "args": { "External id": 245163, "cbid": 41, "correlation": 245163 } }, { "ph": "s", "id": 245163, "pid": 76337, "tid": -914061504, "ts": 1716454225353772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225353835, "dur": 3, "args": { "External id": 245177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245177, "pid": 5, "tid": 7, "ts": 1716454225353835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353823, "dur": 13, "args": { "External id": 245177, "cbid": 211, "correlation": 245177 } }, { "ph": "s", "id": 245177, "pid": 76337, "tid": -914061504, "ts": 1716454225353823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225353855, "dur": 2, "args": { "External id": 245191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245191, "pid": 5, "tid": 7, "ts": 1716454225353855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353847, "dur": 6, "args": { "External id": 245191, "cbid": 211, "correlation": 245191 } }, { "ph": "s", "id": 245191, "pid": 76337, "tid": -914061504, "ts": 1716454225353847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225353888, "dur": 6, "args": { "External id": 245198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245198, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245198, "pid": 5, "tid": 7, "ts": 1716454225353888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353877, "dur": 12, "args": { "External id": 245198, "cbid": 211, "correlation": 245198 } }, { "ph": "s", "id": 245198, "pid": 76337, "tid": -914061504, "ts": 1716454225353877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225353898, "dur": 6, "args": { "External id": 245201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245201, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245201, "pid": 5, "tid": 7, "ts": 1716454225353898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353891, "dur": 6, "args": { "External id": 245201, "cbid": 211, "correlation": 245201 } }, { "ph": "s", "id": 245201, "pid": 76337, "tid": -914061504, "ts": 1716454225353891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454225353907, "dur": 3, "args": { "External id": 245203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245203, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245203, "pid": 5, "tid": 7, "ts": 1716454225353907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353899, "dur": 6, "args": { "External id": 245203, "cbid": 211, "correlation": 245203 } }, { "ph": "s", "id": 245203, "pid": 76337, "tid": -914061504, "ts": 1716454225353899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225353927, "dur": 2, "args": { "External id": 245206, "device": 5, "context": 1, "stream": 7, "correlation": 245206, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 245206, "pid": 5, "tid": 7, "ts": 1716454225353927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353914, "dur": 11, "args": { "External id": 245206, "cbid": 41, "correlation": 245206 } }, { "ph": "s", "id": 245206, "pid": 76337, "tid": -914061504, "ts": 1716454225353914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225353989, "dur": 4, "args": { "External id": 245222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245222, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245222, "pid": 5, "tid": 7, "ts": 1716454225353989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225353966, "dur": 23, "args": { "External id": 245222, "cbid": 211, "correlation": 245222 } }, { "ph": "s", "id": 245222, "pid": 76337, "tid": -914061504, "ts": 1716454225353966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225354010, "dur": 3, "args": { "External id": 245227, "device": 5, "context": 1, "stream": 7, "correlation": 245227, "bytes": 1, "memory bandwidth (GB/s)": 0.0003094059405940594 } }, { "ph": "f", "id": 245227, "pid": 5, "tid": 7, "ts": 1716454225354010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225353995, "dur": 15, "args": { "External id": 245227, "cbid": 41, "correlation": 245227 } }, { "ph": "s", "id": 245227, "pid": 76337, "tid": -914061504, "ts": 1716454225353995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225354038, "dur": 1, "args": { "External id": 245233, "device": 5, "context": 1, "stream": 7, "correlation": 245233, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 245233, "pid": 5, "tid": 7, "ts": 1716454225354038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225354019, "dur": 27, "args": { "External id": 245233, "cbid": 41, "correlation": 245233 } }, { "ph": "s", "id": 245233, "pid": 76337, "tid": -914061504, "ts": 1716454225354019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225354047, "dur": 3, "args": { "External id": 245234, "cbid": 131, "correlation": 245234 } }, { "ph": "f", "id": 245234, "pid": 76337, "tid": -914061504, "ts": 1716454225354047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354098, "dur": 3, "args": { "External id": 245242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245242, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245242, "pid": 5, "tid": 7, "ts": 1716454225354098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354086, "dur": 12, "args": { "External id": 245242, "cbid": 211, "correlation": 245242 } }, { "ph": "s", "id": 245242, "pid": 76337, "tid": -914061504, "ts": 1716454225354086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354133, "dur": 3, "args": { "External id": 245252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245252, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245252, "pid": 5, "tid": 7, "ts": 1716454225354133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354119, "dur": 13, "args": { "External id": 245252, "cbid": 211, "correlation": 245252 } }, { "ph": "s", "id": 245252, "pid": 76337, "tid": -914061504, "ts": 1716454225354119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354157, "dur": 3, "args": { "External id": 245261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245261, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245261, "pid": 5, "tid": 7, "ts": 1716454225354157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354148, "dur": 9, "args": { "External id": 245261, "cbid": 211, "correlation": 245261 } }, { "ph": "s", "id": 245261, "pid": 76337, "tid": -914061504, "ts": 1716454225354148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225354269, "dur": 12, "args": { "External id": 245271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245271, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245271, "pid": 5, "tid": 7, "ts": 1716454225354269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354255, "dur": 15, "args": { "External id": 245271, "cbid": 211, "correlation": 245271 } }, { "ph": "s", "id": 245271, "pid": 76337, "tid": -914061504, "ts": 1716454225354255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354309, "dur": 3, "args": { "External id": 245279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245279, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245279, "pid": 5, "tid": 7, "ts": 1716454225354309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354300, "dur": 8, "args": { "External id": 245279, "cbid": 211, "correlation": 245279 } }, { "ph": "s", "id": 245279, "pid": 76337, "tid": -914061504, "ts": 1716454225354300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225354353, "dur": 12, "args": { "External id": 245289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245289, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245289, "pid": 5, "tid": 7, "ts": 1716454225354353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354342, "dur": 11, "args": { "External id": 245289, "cbid": 211, "correlation": 245289 } }, { "ph": "s", "id": 245289, "pid": 76337, "tid": -914061504, "ts": 1716454225354342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225354384, "dur": 10, "args": { "External id": 245297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245297, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245297, "pid": 5, "tid": 7, "ts": 1716454225354384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354374, "dur": 9, "args": { "External id": 245297, "cbid": 211, "correlation": 245297 } }, { "ph": "s", "id": 245297, "pid": 76337, "tid": -914061504, "ts": 1716454225354374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354411, "dur": 3, "args": { "External id": 245306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245306, "pid": 5, "tid": 7, "ts": 1716454225354411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354401, "dur": 10, "args": { "External id": 245306, "cbid": 211, "correlation": 245306 } }, { "ph": "s", "id": 245306, "pid": 76337, "tid": -914061504, "ts": 1716454225354401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225354435, "dur": 5, "args": { "External id": 245315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245315, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245315, "pid": 5, "tid": 7, "ts": 1716454225354435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354427, "dur": 8, "args": { "External id": 245315, "cbid": 211, "correlation": 245315 } }, { "ph": "s", "id": 245315, "pid": 76337, "tid": -914061504, "ts": 1716454225354427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225354475, "dur": 8, "args": { "External id": 245325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245325, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245325, "pid": 5, "tid": 7, "ts": 1716454225354475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354464, "dur": 11, "args": { "External id": 245325, "cbid": 211, "correlation": 245325 } }, { "ph": "s", "id": 245325, "pid": 76337, "tid": -914061504, "ts": 1716454225354464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354795, "dur": 3, "args": { "External id": 245334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245334, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245334, "pid": 5, "tid": 7, "ts": 1716454225354795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354779, "dur": 15, "args": { "External id": 245334, "cbid": 211, "correlation": 245334 } }, { "ph": "s", "id": 245334, "pid": 76337, "tid": -914061504, "ts": 1716454225354779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354822, "dur": 3, "args": { "External id": 245342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245342, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245342, "pid": 5, "tid": 7, "ts": 1716454225354822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354813, "dur": 8, "args": { "External id": 245342, "cbid": 211, "correlation": 245342 } }, { "ph": "s", "id": 245342, "pid": 76337, "tid": -914061504, "ts": 1716454225354813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225354875, "dur": 1, "args": { "External id": 245352, "device": 5, "context": 1, "stream": 7, "correlation": 245352, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 245352, "pid": 5, "tid": 7, "ts": 1716454225354875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225354860, "dur": 13, "args": { "External id": 245352, "cbid": 41, "correlation": 245352 } }, { "ph": "s", "id": 245352, "pid": 76337, "tid": -914061504, "ts": 1716454225354860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225354874, "dur": 8, "args": { "External id": 245353, "cbid": 131, "correlation": 245353 } }, { "ph": "f", "id": 245353, "pid": 76337, "tid": -914061504, "ts": 1716454225354874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225354967, "dur": 3, "args": { "External id": 245361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245361, "pid": 5, "tid": 7, "ts": 1716454225354967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225354951, "dur": 16, "args": { "External id": 245361, "cbid": 211, "correlation": 245361 } }, { "ph": "s", "id": 245361, "pid": 76337, "tid": -914061504, "ts": 1716454225354951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355049, "dur": 3, "args": { "External id": 245370, "device": 5, "context": 1, "stream": 7, "correlation": 245370, "bytes": 8, "memory bandwidth (GB/s)": 0.002577319587628866 } }, { "ph": "f", "id": 245370, "pid": 5, "tid": 7, "ts": 1716454225355049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355031, "dur": 18, "args": { "External id": 245370, "cbid": 41, "correlation": 245370 } }, { "ph": "s", "id": 245370, "pid": 76337, "tid": -914061504, "ts": 1716454225355031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225355122, "dur": 3, "args": { "External id": 245380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245380, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245380, "pid": 5, "tid": 7, "ts": 1716454225355122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355108, "dur": 14, "args": { "External id": 245380, "cbid": 211, "correlation": 245380 } }, { "ph": "s", "id": 245380, "pid": 76337, "tid": -914061504, "ts": 1716454225355108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355175, "dur": 1, "args": { "External id": 245390, "device": 5, "context": 1, "stream": 7, "correlation": 245390, "bytes": 8, "memory bandwidth (GB/s)": 0.005211726384364821 } }, { "ph": "f", "id": 245390, "pid": 5, "tid": 7, "ts": 1716454225355175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355161, "dur": 12, "args": { "External id": 245390, "cbid": 41, "correlation": 245390 } }, { "ph": "s", "id": 245390, "pid": 76337, "tid": -914061504, "ts": 1716454225355161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355174, "dur": 8, "args": { "External id": 245391, "cbid": 131, "correlation": 245391 } }, { "ph": "f", "id": 245391, "pid": 76337, "tid": -914061504, "ts": 1716454225355174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355247, "dur": 3, "args": { "External id": 245398, "device": 5, "context": 1, "stream": 7, "correlation": 245398, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 245398, "pid": 5, "tid": 7, "ts": 1716454225355247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355217, "dur": 29, "args": { "External id": 245398, "cbid": 41, "correlation": 245398 } }, { "ph": "s", "id": 245398, "pid": 76337, "tid": -914061504, "ts": 1716454225355217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355296, "dur": 1, "args": { "External id": 245409, "device": 5, "context": 1, "stream": 7, "correlation": 245409, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 245409, "pid": 5, "tid": 7, "ts": 1716454225355296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355284, "dur": 10, "args": { "External id": 245409, "cbid": 41, "correlation": 245409 } }, { "ph": "s", "id": 245409, "pid": 76337, "tid": -914061504, "ts": 1716454225355284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355294, "dur": 8, "args": { "External id": 245410, "cbid": 131, "correlation": 245410 } }, { "ph": "f", "id": 245410, "pid": 76337, "tid": -914061504, "ts": 1716454225355294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355345, "dur": 3, "args": { "External id": 245418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245418, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245418, "pid": 5, "tid": 7, "ts": 1716454225355345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355332, "dur": 13, "args": { "External id": 245418, "cbid": 211, "correlation": 245418 } }, { "ph": "s", "id": 245418, "pid": 76337, "tid": -914061504, "ts": 1716454225355332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355374, "dur": 3, "args": { "External id": 245428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245428, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245428, "pid": 5, "tid": 7, "ts": 1716454225355374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355365, "dur": 8, "args": { "External id": 245428, "cbid": 211, "correlation": 245428 } }, { "ph": "s", "id": 245428, "pid": 76337, "tid": -914061504, "ts": 1716454225355365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355396, "dur": 3, "args": { "External id": 245437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245437, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245437, "pid": 5, "tid": 7, "ts": 1716454225355396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355388, "dur": 7, "args": { "External id": 245437, "cbid": 211, "correlation": 245437 } }, { "ph": "s", "id": 245437, "pid": 76337, "tid": -914061504, "ts": 1716454225355388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225355465, "dur": 6, "args": { "External id": 245445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245445, "pid": 5, "tid": 7, "ts": 1716454225355465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355452, "dur": 13, "args": { "External id": 245445, "cbid": 211, "correlation": 245445 } }, { "ph": "s", "id": 245445, "pid": 76337, "tid": -914061504, "ts": 1716454225355452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355504, "dur": 3, "args": { "External id": 245454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245454, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245454, "pid": 5, "tid": 7, "ts": 1716454225355504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355493, "dur": 10, "args": { "External id": 245454, "cbid": 211, "correlation": 245454 } }, { "ph": "s", "id": 245454, "pid": 76337, "tid": -914061504, "ts": 1716454225355493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355527, "dur": 3, "args": { "External id": 245463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245463, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245463, "pid": 5, "tid": 7, "ts": 1716454225355527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355518, "dur": 7, "args": { "External id": 245463, "cbid": 211, "correlation": 245463 } }, { "ph": "s", "id": 245463, "pid": 76337, "tid": -914061504, "ts": 1716454225355518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225355588, "dur": 3, "args": { "External id": 245471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245471, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245471, "pid": 5, "tid": 7, "ts": 1716454225355588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355577, "dur": 10, "args": { "External id": 245471, "cbid": 211, "correlation": 245471 } }, { "ph": "s", "id": 245471, "pid": 76337, "tid": -914061504, "ts": 1716454225355577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225355647, "dur": 2, "args": { "External id": 245479, "device": 5, "context": 1, "stream": 7, "correlation": 245479, "bytes": 8, "memory bandwidth (GB/s)": 0.00390625 } }, { "ph": "f", "id": 245479, "pid": 5, "tid": 7, "ts": 1716454225355647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355631, "dur": 26, "args": { "External id": 245479, "cbid": 41, "correlation": 245479 } }, { "ph": "s", "id": 245479, "pid": 76337, "tid": -914061504, "ts": 1716454225355631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355658, "dur": 3, "args": { "External id": 245480, "cbid": 131, "correlation": 245480 } }, { "ph": "f", "id": 245480, "pid": 76337, "tid": -914061504, "ts": 1716454225355658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355718, "dur": 1, "args": { "External id": 245490, "device": 5, "context": 1, "stream": 7, "correlation": 245490, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 245490, "pid": 5, "tid": 7, "ts": 1716454225355718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355706, "dur": 10, "args": { "External id": 245490, "cbid": 41, "correlation": 245490 } }, { "ph": "s", "id": 245490, "pid": 76337, "tid": -914061504, "ts": 1716454225355706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355717, "dur": 8, "args": { "External id": 245491, "cbid": 131, "correlation": 245491 } }, { "ph": "f", "id": 245491, "pid": 76337, "tid": -914061504, "ts": 1716454225355717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225355773, "dur": 1, "args": { "External id": 245500, "device": 5, "context": 1, "stream": 7, "correlation": 245500, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 245500, "pid": 5, "tid": 7, "ts": 1716454225355773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355762, "dur": 8, "args": { "External id": 245500, "cbid": 41, "correlation": 245500 } }, { "ph": "s", "id": 245500, "pid": 76337, "tid": -914061504, "ts": 1716454225355762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355771, "dur": 8, "args": { "External id": 245501, "cbid": 131, "correlation": 245501 } }, { "ph": "f", "id": 245501, "pid": 76337, "tid": -914061504, "ts": 1716454225355771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225355847, "dur": 4, "args": { "External id": 245508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245508, "pid": 5, "tid": 7, "ts": 1716454225355847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355828, "dur": 19, "args": { "External id": 245508, "cbid": 211, "correlation": 245508 } }, { "ph": "s", "id": 245508, "pid": 76337, "tid": -914061504, "ts": 1716454225355828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454225355884, "dur": 4, "args": { "External id": 245528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245528, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245528, "pid": 5, "tid": 7, "ts": 1716454225355884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355872, "dur": 12, "args": { "External id": 245528, "cbid": 211, "correlation": 245528 } }, { "ph": "s", "id": 245528, "pid": 76337, "tid": -914061504, "ts": 1716454225355872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355885, "dur": 0, "args": { "External id": 245529, "cbid": 11, "correlation": 245529 } }, { "ph": "f", "id": 245529, "pid": 76337, "tid": -914061504, "ts": 1716454225355885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355886, "dur": 0, "args": { "External id": 245530, "cbid": 11, "correlation": 245530 } }, { "ph": "f", "id": 245530, "pid": 76337, "tid": -914061504, "ts": 1716454225355886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225355900, "dur": 1, "args": { "External id": 245533, "device": 5, "context": 1, "stream": 7, "correlation": 245533, "bytes": 4, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 245533, "pid": 5, "tid": 7, "ts": 1716454225355900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355887, "dur": 22, "args": { "External id": 245533, "cbid": 41, "correlation": 245533 } }, { "ph": "s", "id": 245533, "pid": 76337, "tid": -914061504, "ts": 1716454225355887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225355910, "dur": 3, "args": { "External id": 245534, "cbid": 131, "correlation": 245534 } }, { "ph": "f", "id": 245534, "pid": 76337, "tid": -914061504, "ts": 1716454225355910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225355938, "dur": 3, "args": { "External id": 245558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245558, "pid": 5, "tid": 7, "ts": 1716454225355938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355928, "dur": 10, "args": { "External id": 245558, "cbid": 211, "correlation": 245558 } }, { "ph": "s", "id": 245558, "pid": 76337, "tid": -914061504, "ts": 1716454225355928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355939, "dur": 0, "args": { "External id": 245559, "cbid": 11, "correlation": 245559 } }, { "ph": "f", "id": 245559, "pid": 76337, "tid": -914061504, "ts": 1716454225355939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355939, "dur": 0, "args": { "External id": 245560, "cbid": 11, "correlation": 245560 } }, { "ph": "f", "id": 245560, "pid": 76337, "tid": -914061504, "ts": 1716454225355939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225355941, "dur": 1, "args": { "External id": 245562, "cbid": 200, "correlation": 245562 } }, { "ph": "f", "id": 245562, "pid": 76337, "tid": -914061504, "ts": 1716454225355941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454225355952, "dur": 4, "args": { "External id": 245564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245564, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245564, "pid": 5, "tid": 7, "ts": 1716454225355952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225355944, "dur": 8, "args": { "External id": 245564, "cbid": 211, "correlation": 245564 } }, { "ph": "s", "id": 245564, "pid": 76337, "tid": -914061504, "ts": 1716454225355944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355953, "dur": 0, "args": { "External id": 245565, "cbid": 11, "correlation": 245565 } }, { "ph": "f", "id": 245565, "pid": 76337, "tid": -914061504, "ts": 1716454225355953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225355953, "dur": 0, "args": { "External id": 245566, "cbid": 11, "correlation": 245566 } }, { "ph": "f", "id": 245566, "pid": 76337, "tid": -914061504, "ts": 1716454225355953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225355999, "dur": 1, "args": { "External id": 245573, "device": 5, "context": 1, "stream": 7, "correlation": 245573, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 245573, "pid": 5, "tid": 7, "ts": 1716454225355999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225355987, "dur": 20, "args": { "External id": 245573, "cbid": 41, "correlation": 245573 } }, { "ph": "s", "id": 245573, "pid": 76337, "tid": -914061504, "ts": 1716454225355987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225356008, "dur": 3, "args": { "External id": 245574, "cbid": 131, "correlation": 245574 } }, { "ph": "f", "id": 245574, "pid": 76337, "tid": -914061504, "ts": 1716454225356008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225356059, "dur": 1, "args": { "External id": 245584, "device": 5, "context": 1, "stream": 7, "correlation": 245584, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 245584, "pid": 5, "tid": 7, "ts": 1716454225356059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225356048, "dur": 9, "args": { "External id": 245584, "cbid": 41, "correlation": 245584 } }, { "ph": "s", "id": 245584, "pid": 76337, "tid": -914061504, "ts": 1716454225356048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225356058, "dur": 8, "args": { "External id": 245585, "cbid": 131, "correlation": 245585 } }, { "ph": "f", "id": 245585, "pid": 76337, "tid": -914061504, "ts": 1716454225356058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225356129, "dur": 5, "args": { "External id": 245592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245592, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245592, "pid": 5, "tid": 7, "ts": 1716454225356129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356114, "dur": 16, "args": { "External id": 245592, "cbid": 211, "correlation": 245592 } }, { "ph": "s", "id": 245592, "pid": 76337, "tid": -914061504, "ts": 1716454225356114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356197, "dur": 3, "args": { "External id": 245601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245601, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245601, "pid": 5, "tid": 7, "ts": 1716454225356197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356185, "dur": 12, "args": { "External id": 245601, "cbid": 211, "correlation": 245601 } }, { "ph": "s", "id": 245601, "pid": 76337, "tid": -914061504, "ts": 1716454225356185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356234, "dur": 3, "args": { "External id": 245609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245609, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245609, "pid": 5, "tid": 7, "ts": 1716454225356234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356223, "dur": 10, "args": { "External id": 245609, "cbid": 211, "correlation": 245609 } }, { "ph": "s", "id": 245609, "pid": 76337, "tid": -914061504, "ts": 1716454225356223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356265, "dur": 4, "args": { "External id": 245617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245617, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245617, "pid": 5, "tid": 7, "ts": 1716454225356265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356254, "dur": 11, "args": { "External id": 245617, "cbid": 211, "correlation": 245617 } }, { "ph": "s", "id": 245617, "pid": 76337, "tid": -914061504, "ts": 1716454225356254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356295, "dur": 4, "args": { "External id": 245625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245625, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245625, "pid": 5, "tid": 7, "ts": 1716454225356295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356284, "dur": 9, "args": { "External id": 245625, "cbid": 211, "correlation": 245625 } }, { "ph": "s", "id": 245625, "pid": 76337, "tid": -914061504, "ts": 1716454225356284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356321, "dur": 3, "args": { "External id": 245633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245633, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245633, "pid": 5, "tid": 7, "ts": 1716454225356321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356312, "dur": 8, "args": { "External id": 245633, "cbid": 211, "correlation": 245633 } }, { "ph": "s", "id": 245633, "pid": 76337, "tid": -914061504, "ts": 1716454225356312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356346, "dur": 3, "args": { "External id": 245641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245641, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245641, "pid": 5, "tid": 7, "ts": 1716454225356346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356337, "dur": 8, "args": { "External id": 245641, "cbid": 211, "correlation": 245641 } }, { "ph": "s", "id": 245641, "pid": 76337, "tid": -914061504, "ts": 1716454225356337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225356368, "dur": 4, "args": { "External id": 245649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245649, "pid": 5, "tid": 7, "ts": 1716454225356368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356359, "dur": 7, "args": { "External id": 245649, "cbid": 211, "correlation": 245649 } }, { "ph": "s", "id": 245649, "pid": 76337, "tid": -914061504, "ts": 1716454225356359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225356388, "dur": 4, "args": { "External id": 245657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245657, "pid": 5, "tid": 7, "ts": 1716454225356388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356379, "dur": 7, "args": { "External id": 245657, "cbid": 211, "correlation": 245657 } }, { "ph": "s", "id": 245657, "pid": 76337, "tid": -914061504, "ts": 1716454225356379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356406, "dur": 3, "args": { "External id": 245665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245665, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245665, "pid": 5, "tid": 7, "ts": 1716454225356406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356398, "dur": 7, "args": { "External id": 245665, "cbid": 211, "correlation": 245665 } }, { "ph": "s", "id": 245665, "pid": 76337, "tid": -914061504, "ts": 1716454225356398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356465, "dur": 3, "args": { "External id": 245673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245673, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245673, "pid": 5, "tid": 7, "ts": 1716454225356465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356454, "dur": 10, "args": { "External id": 245673, "cbid": 211, "correlation": 245673 } }, { "ph": "s", "id": 245673, "pid": 76337, "tid": -914061504, "ts": 1716454225356454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225356492, "dur": 4, "args": { "External id": 245681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245681, "pid": 5, "tid": 7, "ts": 1716454225356492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356482, "dur": 8, "args": { "External id": 245681, "cbid": 211, "correlation": 245681 } }, { "ph": "s", "id": 245681, "pid": 76337, "tid": -914061504, "ts": 1716454225356482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225356515, "dur": 4, "args": { "External id": 245689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245689, "pid": 5, "tid": 7, "ts": 1716454225356515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356506, "dur": 7, "args": { "External id": 245689, "cbid": 211, "correlation": 245689 } }, { "ph": "s", "id": 245689, "pid": 76337, "tid": -914061504, "ts": 1716454225356506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225356534, "dur": 3, "args": { "External id": 245697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245697, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 245697, "pid": 5, "tid": 7, "ts": 1716454225356534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356527, "dur": 6, "args": { "External id": 245697, "cbid": 211, "correlation": 245697 } }, { "ph": "s", "id": 245697, "pid": 76337, "tid": -914061504, "ts": 1716454225356527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225356952, "dur": 6, "args": { "External id": 245706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245706, "pid": 5, "tid": 7, "ts": 1716454225356952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356936, "dur": 17, "args": { "External id": 245706, "cbid": 211, "correlation": 245706 } }, { "ph": "s", "id": 245706, "pid": 76337, "tid": -914061504, "ts": 1716454225356936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225356997, "dur": 5, "args": { "External id": 245715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245715, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245715, "pid": 5, "tid": 7, "ts": 1716454225356997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225356987, "dur": 10, "args": { "External id": 245715, "cbid": 211, "correlation": 245715 } }, { "ph": "s", "id": 245715, "pid": 76337, "tid": -914061504, "ts": 1716454225356987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225357125, "dur": 3, "args": { "External id": 245731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245731, "pid": 5, "tid": 7, "ts": 1716454225357125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357111, "dur": 14, "args": { "External id": 245731, "cbid": 211, "correlation": 245731 } }, { "ph": "s", "id": 245731, "pid": 76337, "tid": -914061504, "ts": 1716454225357111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357159, "dur": 3, "args": { "External id": 245739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245739, "pid": 5, "tid": 7, "ts": 1716454225357159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357150, "dur": 8, "args": { "External id": 245739, "cbid": 211, "correlation": 245739 } }, { "ph": "s", "id": 245739, "pid": 76337, "tid": -914061504, "ts": 1716454225357150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357190, "dur": 3, "args": { "External id": 245747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245747, "pid": 5, "tid": 7, "ts": 1716454225357190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357180, "dur": 9, "args": { "External id": 245747, "cbid": 211, "correlation": 245747 } }, { "ph": "s", "id": 245747, "pid": 76337, "tid": -914061504, "ts": 1716454225357180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357221, "dur": 4, "args": { "External id": 245755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245755, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245755, "pid": 5, "tid": 7, "ts": 1716454225357221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357212, "dur": 8, "args": { "External id": 245755, "cbid": 211, "correlation": 245755 } }, { "ph": "s", "id": 245755, "pid": 76337, "tid": -914061504, "ts": 1716454225357212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225357277, "dur": 4, "args": { "External id": 245767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245767, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245767, "pid": 5, "tid": 7, "ts": 1716454225357277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357264, "dur": 13, "args": { "External id": 245767, "cbid": 211, "correlation": 245767 } }, { "ph": "s", "id": 245767, "pid": 76337, "tid": -914061504, "ts": 1716454225357264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225357322, "dur": 4, "args": { "External id": 245778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245778, "pid": 5, "tid": 7, "ts": 1716454225357322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357311, "dur": 11, "args": { "External id": 245778, "cbid": 211, "correlation": 245778 } }, { "ph": "s", "id": 245778, "pid": 76337, "tid": -914061504, "ts": 1716454225357311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357354, "dur": 3, "args": { "External id": 245786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245786, "pid": 5, "tid": 7, "ts": 1716454225357354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357345, "dur": 9, "args": { "External id": 245786, "cbid": 211, "correlation": 245786 } }, { "ph": "s", "id": 245786, "pid": 76337, "tid": -914061504, "ts": 1716454225357345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357386, "dur": 5, "args": { "External id": 245794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245794, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245794, "pid": 5, "tid": 7, "ts": 1716454225357386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357377, "dur": 10, "args": { "External id": 245794, "cbid": 211, "correlation": 245794 } }, { "ph": "s", "id": 245794, "pid": 76337, "tid": -914061504, "ts": 1716454225357377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357417, "dur": 5, "args": { "External id": 245802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245802, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245802, "pid": 5, "tid": 7, "ts": 1716454225357417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357406, "dur": 9, "args": { "External id": 245802, "cbid": 211, "correlation": 245802 } }, { "ph": "s", "id": 245802, "pid": 76337, "tid": -914061504, "ts": 1716454225357406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225357447, "dur": 4, "args": { "External id": 245811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245811, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245811, "pid": 5, "tid": 7, "ts": 1716454225357447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357437, "dur": 10, "args": { "External id": 245811, "cbid": 211, "correlation": 245811 } }, { "ph": "s", "id": 245811, "pid": 76337, "tid": -914061504, "ts": 1716454225357437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225357508, "dur": 4, "args": { "External id": 245824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245824, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245824, "pid": 5, "tid": 7, "ts": 1716454225357508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357495, "dur": 13, "args": { "External id": 245824, "cbid": 211, "correlation": 245824 } }, { "ph": "s", "id": 245824, "pid": 76337, "tid": -914061504, "ts": 1716454225357495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225357548, "dur": 5, "args": { "External id": 245834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245834, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 245834, "pid": 5, "tid": 7, "ts": 1716454225357548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357538, "dur": 10, "args": { "External id": 245834, "cbid": 211, "correlation": 245834 } }, { "ph": "s", "id": 245834, "pid": 76337, "tid": -914061504, "ts": 1716454225357538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225357672, "dur": 4, "args": { "External id": 245851, "cbid": 251, "correlation": 245851 } }, { "ph": "f", "id": 245851, "pid": 76337, "tid": -914061504, "ts": 1716454225357672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454225357699, "dur": 11, "args": { "External id": 245853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245853, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 245853, "pid": 5, "tid": 7, "ts": 1716454225357699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357684, "dur": 16, "args": { "External id": 245853, "cbid": 211, "correlation": 245853 } }, { "ph": "s", "id": 245853, "pid": 76337, "tid": -914061504, "ts": 1716454225357684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225357759, "dur": 4, "args": { "External id": 245861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245861, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245861, "pid": 5, "tid": 7, "ts": 1716454225357759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357747, "dur": 12, "args": { "External id": 245861, "cbid": 211, "correlation": 245861 } }, { "ph": "s", "id": 245861, "pid": 76337, "tid": -914061504, "ts": 1716454225357747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225357816, "dur": 2, "args": { "External id": 245877, "cbid": 251, "correlation": 245877 } }, { "ph": "f", "id": 245877, "pid": 76337, "tid": -914061504, "ts": 1716454225357816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225357823, "dur": 0, "args": { "External id": 245879, "cbid": 251, "correlation": 245879 } }, { "ph": "f", "id": 245879, "pid": 76337, "tid": -914061504, "ts": 1716454225357823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225357839, "dur": 14, "args": { "External id": 245880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245880, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 245880, "pid": 5, "tid": 7, "ts": 1716454225357839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357826, "dur": 14, "args": { "External id": 245880, "cbid": 211, "correlation": 245880 } }, { "ph": "s", "id": 245880, "pid": 76337, "tid": -914061504, "ts": 1716454225357826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225357855, "dur": 5, "args": { "External id": 245882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245882, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 245882, "pid": 5, "tid": 7, "ts": 1716454225357855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225357843, "dur": 9, "args": { "External id": 245882, "cbid": 211, "correlation": 245882 } }, { "ph": "s", "id": 245882, "pid": 76337, "tid": -914061504, "ts": 1716454225357843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225357954, "dur": 1, "args": { "External id": 245892, "cbid": 317, "correlation": 245892 } }, { "ph": "f", "id": 245892, "pid": 76337, "tid": -914061504, "ts": 1716454225357954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225357956, "dur": 1, "args": { "External id": 245893, "cbid": 203, "correlation": 245893 } }, { "ph": "f", "id": 245893, "pid": 76337, "tid": -914061504, "ts": 1716454225357956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225357958, "dur": 1, "args": { "External id": 245894, "cbid": 205, "correlation": 245894 } }, { "ph": "f", "id": 245894, "pid": 76337, "tid": -914061504, "ts": 1716454225357958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225358020, "dur": 7, "args": { "External id": 245898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245898, "pid": 5, "tid": 7, "ts": 1716454225358020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225358005, "dur": 15, "args": { "External id": 245898, "cbid": 211, "correlation": 245898 } }, { "ph": "s", "id": 245898, "pid": 76337, "tid": -914061504, "ts": 1716454225358005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225358030, "dur": 4, "args": { "External id": 245900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245900, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 245900, "pid": 5, "tid": 7, "ts": 1716454225358030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225358023, "dur": 6, "args": { "External id": 245900, "cbid": 211, "correlation": 245900 } }, { "ph": "s", "id": 245900, "pid": 76337, "tid": -914061504, "ts": 1716454225358023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225358049, "dur": 3, "args": { "External id": 245902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245902, "pid": 5, "tid": 7, "ts": 1716454225358049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225358040, "dur": 8, "args": { "External id": 245902, "cbid": 211, "correlation": 245902 } }, { "ph": "s", "id": 245902, "pid": 76337, "tid": -914061504, "ts": 1716454225358040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225358055, "dur": 0, "args": { "External id": 245903, "cbid": 51, "correlation": 245903 } }, { "ph": "s", "id": 245903, "pid": 76337, "tid": -914061504, "ts": 1716454225358055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225358066, "dur": 87, "args": { "External id": 245904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245904, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 245904, "pid": 5, "tid": 7, "ts": 1716454225358066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225358056, "dur": 7, "args": { "External id": 245904, "cbid": 211, "correlation": 245904 } }, { "ph": "s", "id": 245904, "pid": 76337, "tid": -914061504, "ts": 1716454225358056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225358154, "dur": 60, "args": { "External id": 245909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245909, "pid": 5, "tid": 7, "ts": 1716454225358154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225358093, "dur": 11, "args": { "External id": 245909, "cbid": 211, "correlation": 245909 } }, { "ph": "s", "id": 245909, "pid": 76337, "tid": -914061504, "ts": 1716454225358093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225364511, "dur": 51, "args": { "External id": 245929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245929, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 245929, "pid": 5, "tid": 7, "ts": 1716454225364511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364474, "dur": 38, "args": { "External id": 245929, "cbid": 211, "correlation": 245929 } }, { "ph": "s", "id": 245929, "pid": 76337, "tid": -914061504, "ts": 1716454225364474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225364564, "dur": 5, "args": { "External id": 245941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245941, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245941, "pid": 5, "tid": 7, "ts": 1716454225364564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364531, "dur": 11, "args": { "External id": 245941, "cbid": 211, "correlation": 245941 } }, { "ph": "s", "id": 245941, "pid": 76337, "tid": -914061504, "ts": 1716454225364531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225364576, "dur": 57, "args": { "External id": 245944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245944, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245944, "pid": 5, "tid": 7, "ts": 1716454225364576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364567, "dur": 8, "args": { "External id": 245944, "cbid": 211, "correlation": 245944 } }, { "ph": "s", "id": 245944, "pid": 76337, "tid": -914061504, "ts": 1716454225364567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225364642, "dur": 36, "args": { "External id": 245953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245953, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245953, "pid": 5, "tid": 7, "ts": 1716454225364642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364629, "dur": 13, "args": { "External id": 245953, "cbid": 211, "correlation": 245953 } }, { "ph": "s", "id": 245953, "pid": 76337, "tid": -914061504, "ts": 1716454225364629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225364728, "dur": 1, "args": { "External id": 245963, "cbid": 317, "correlation": 245963 } }, { "ph": "f", "id": 245963, "pid": 76337, "tid": -914061504, "ts": 1716454225364728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225364730, "dur": 1, "args": { "External id": 245964, "cbid": 203, "correlation": 245964 } }, { "ph": "f", "id": 245964, "pid": 76337, "tid": -914061504, "ts": 1716454225364730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225364732, "dur": 1, "args": { "External id": 245965, "cbid": 205, "correlation": 245965 } }, { "ph": "f", "id": 245965, "pid": 76337, "tid": -914061504, "ts": 1716454225364732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225364782, "dur": 40, "args": { "External id": 245969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245969, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245969, "pid": 5, "tid": 7, "ts": 1716454225364782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364767, "dur": 14, "args": { "External id": 245969, "cbid": 211, "correlation": 245969 } }, { "ph": "s", "id": 245969, "pid": 76337, "tid": -914061504, "ts": 1716454225364767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225364823, "dur": 14, "args": { "External id": 245971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245971, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245971, "pid": 5, "tid": 7, "ts": 1716454225364823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364784, "dur": 6, "args": { "External id": 245971, "cbid": 211, "correlation": 245971 } }, { "ph": "s", "id": 245971, "pid": 76337, "tid": -914061504, "ts": 1716454225364784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225364839, "dur": 3, "args": { "External id": 245973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245973, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245973, "pid": 5, "tid": 7, "ts": 1716454225364839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364800, "dur": 9, "args": { "External id": 245973, "cbid": 211, "correlation": 245973 } }, { "ph": "s", "id": 245973, "pid": 76337, "tid": -914061504, "ts": 1716454225364800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225364817, "dur": 0, "args": { "External id": 245974, "cbid": 51, "correlation": 245974 } }, { "ph": "s", "id": 245974, "pid": 76337, "tid": -914061504, "ts": 1716454225364817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225364844, "dur": 712, "args": { "External id": 245975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245975, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 245975, "pid": 5, "tid": 7, "ts": 1716454225364844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364819, "dur": 8, "args": { "External id": 245975, "cbid": 211, "correlation": 245975 } }, { "ph": "s", "id": 245975, "pid": 76337, "tid": -914061504, "ts": 1716454225364819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225365557, "dur": 59, "args": { "External id": 245980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 245980, "pid": 5, "tid": 7, "ts": 1716454225365557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364863, "dur": 11, "args": { "External id": 245980, "cbid": 211, "correlation": 245980 } }, { "ph": "s", "id": 245980, "pid": 76337, "tid": -914061504, "ts": 1716454225364863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225365618, "dur": 3, "args": { "External id": 245988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 245988, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 245988, "pid": 5, "tid": 7, "ts": 1716454225365618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225364911, "dur": 10, "args": { "External id": 245988, "cbid": 211, "correlation": 245988 } }, { "ph": "s", "id": 245988, "pid": 76337, "tid": -914061504, "ts": 1716454225364911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365037, "dur": 4, "args": { "External id": 246004, "cbid": 251, "correlation": 246004 } }, { "ph": "f", "id": 246004, "pid": 76337, "tid": -914061504, "ts": 1716454225365037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365047, "dur": 0, "args": { "External id": 246006, "cbid": 251, "correlation": 246006 } }, { "ph": "f", "id": 246006, "pid": 76337, "tid": -914061504, "ts": 1716454225365047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225365623, "dur": 9, "args": { "External id": 246007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246007, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 246007, "pid": 5, "tid": 7, "ts": 1716454225365623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365051, "dur": 14, "args": { "External id": 246007, "cbid": 211, "correlation": 246007 } }, { "ph": "s", "id": 246007, "pid": 76337, "tid": -914061504, "ts": 1716454225365051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225365632, "dur": 4, "args": { "External id": 246009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246009, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 246009, "pid": 5, "tid": 7, "ts": 1716454225365632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365070, "dur": 9, "args": { "External id": 246009, "cbid": 211, "correlation": 246009 } }, { "ph": "s", "id": 246009, "pid": 76337, "tid": -914061504, "ts": 1716454225365070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225365638, "dur": 55, "args": { "External id": 246019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246019, "pid": 5, "tid": 7, "ts": 1716454225365638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365158, "dur": 12, "args": { "External id": 246019, "cbid": 211, "correlation": 246019 } }, { "ph": "s", "id": 246019, "pid": 76337, "tid": -914061504, "ts": 1716454225365158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225365694, "dur": 54, "args": { "External id": 246039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246039, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 246039, "pid": 5, "tid": 7, "ts": 1716454225365694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365227, "dur": 11, "args": { "External id": 246039, "cbid": 211, "correlation": 246039 } }, { "ph": "s", "id": 246039, "pid": 76337, "tid": -914061504, "ts": 1716454225365227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225365749, "dur": 4, "args": { "External id": 246051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246051, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246051, "pid": 5, "tid": 7, "ts": 1716454225365749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365250, "dur": 6, "args": { "External id": 246051, "cbid": 211, "correlation": 246051 } }, { "ph": "s", "id": 246051, "pid": 76337, "tid": -914061504, "ts": 1716454225365250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225365754, "dur": 56, "args": { "External id": 246054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246054, "pid": 5, "tid": 7, "ts": 1716454225365754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365268, "dur": 7, "args": { "External id": 246054, "cbid": 211, "correlation": 246054 } }, { "ph": "s", "id": 246054, "pid": 76337, "tid": -914061504, "ts": 1716454225365268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225365811, "dur": 36, "args": { "External id": 246063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246063, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246063, "pid": 5, "tid": 7, "ts": 1716454225365811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365308, "dur": 9, "args": { "External id": 246063, "cbid": 211, "correlation": 246063 } }, { "ph": "s", "id": 246063, "pid": 76337, "tid": -914061504, "ts": 1716454225365308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225365386, "dur": 0, "args": { "External id": 246073, "cbid": 317, "correlation": 246073 } }, { "ph": "f", "id": 246073, "pid": 76337, "tid": -914061504, "ts": 1716454225365386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225365386, "dur": 0, "args": { "External id": 246074, "cbid": 203, "correlation": 246074 } }, { "ph": "f", "id": 246074, "pid": 76337, "tid": -914061504, "ts": 1716454225365386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225365387, "dur": 0, "args": { "External id": 246075, "cbid": 205, "correlation": 246075 } }, { "ph": "f", "id": 246075, "pid": 76337, "tid": -914061504, "ts": 1716454225365387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225365849, "dur": 40, "args": { "External id": 246079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246079, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246079, "pid": 5, "tid": 7, "ts": 1716454225365849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365402, "dur": 12, "args": { "External id": 246079, "cbid": 211, "correlation": 246079 } }, { "ph": "s", "id": 246079, "pid": 76337, "tid": -914061504, "ts": 1716454225365402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225365890, "dur": 15, "args": { "External id": 246081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246081, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246081, "pid": 5, "tid": 7, "ts": 1716454225365890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365417, "dur": 5, "args": { "External id": 246081, "cbid": 211, "correlation": 246081 } }, { "ph": "s", "id": 246081, "pid": 76337, "tid": -914061504, "ts": 1716454225365417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225365906, "dur": 4, "args": { "External id": 246083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246083, "pid": 5, "tid": 7, "ts": 1716454225365906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365426, "dur": 6, "args": { "External id": 246083, "cbid": 211, "correlation": 246083 } }, { "ph": "s", "id": 246083, "pid": 76337, "tid": -914061504, "ts": 1716454225365426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225365435, "dur": 0, "args": { "External id": 246084, "cbid": 51, "correlation": 246084 } }, { "ph": "s", "id": 246084, "pid": 76337, "tid": -914061504, "ts": 1716454225365435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225365911, "dur": 707, "args": { "External id": 246085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246085, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246085, "pid": 5, "tid": 7, "ts": 1716454225365911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365436, "dur": 6, "args": { "External id": 246085, "cbid": 211, "correlation": 246085 } }, { "ph": "s", "id": 246085, "pid": 76337, "tid": -914061504, "ts": 1716454225365436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225366619, "dur": 59, "args": { "External id": 246090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246090, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246090, "pid": 5, "tid": 7, "ts": 1716454225366619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365464, "dur": 8, "args": { "External id": 246090, "cbid": 211, "correlation": 246090 } }, { "ph": "s", "id": 246090, "pid": 76337, "tid": -914061504, "ts": 1716454225365464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225366680, "dur": 50, "args": { "External id": 246098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246098, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246098, "pid": 5, "tid": 7, "ts": 1716454225366680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365497, "dur": 9, "args": { "External id": 246098, "cbid": 211, "correlation": 246098 } }, { "ph": "s", "id": 246098, "pid": 76337, "tid": -914061504, "ts": 1716454225365497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225366731, "dur": 35, "args": { "External id": 246106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246106, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246106, "pid": 5, "tid": 7, "ts": 1716454225366731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365541, "dur": 11, "args": { "External id": 246106, "cbid": 211, "correlation": 246106 } }, { "ph": "s", "id": 246106, "pid": 76337, "tid": -914061504, "ts": 1716454225365541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225366767, "dur": 52, "args": { "External id": 246126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246126, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 246126, "pid": 5, "tid": 7, "ts": 1716454225366767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365632, "dur": 13, "args": { "External id": 246126, "cbid": 211, "correlation": 246126 } }, { "ph": "s", "id": 246126, "pid": 76337, "tid": -914061504, "ts": 1716454225365632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225366821, "dur": 4, "args": { "External id": 246138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246138, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246138, "pid": 5, "tid": 7, "ts": 1716454225366821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365657, "dur": 7, "args": { "External id": 246138, "cbid": 211, "correlation": 246138 } }, { "ph": "s", "id": 246138, "pid": 76337, "tid": -914061504, "ts": 1716454225365657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225366826, "dur": 56, "args": { "External id": 246141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246141, "pid": 5, "tid": 7, "ts": 1716454225366826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365675, "dur": 7, "args": { "External id": 246141, "cbid": 211, "correlation": 246141 } }, { "ph": "s", "id": 246141, "pid": 76337, "tid": -914061504, "ts": 1716454225365675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225365734, "dur": 0, "args": { "External id": 246152, "cbid": 317, "correlation": 246152 } }, { "ph": "f", "id": 246152, "pid": 76337, "tid": -914061504, "ts": 1716454225365734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225365735, "dur": 0, "args": { "External id": 246153, "cbid": 203, "correlation": 246153 } }, { "ph": "f", "id": 246153, "pid": 76337, "tid": -914061504, "ts": 1716454225365735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225365736, "dur": 0, "args": { "External id": 246154, "cbid": 205, "correlation": 246154 } }, { "ph": "f", "id": 246154, "pid": 76337, "tid": -914061504, "ts": 1716454225365736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365771, "dur": 3, "args": { "External id": 246158, "cbid": 251, "correlation": 246158 } }, { "ph": "f", "id": 246158, "pid": 76337, "tid": -914061504, "ts": 1716454225365771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365775, "dur": 1, "args": { "External id": 246159, "cbid": 251, "correlation": 246159 } }, { "ph": "f", "id": 246159, "pid": 76337, "tid": -914061504, "ts": 1716454225365775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365778, "dur": 1, "args": { "External id": 246160, "cbid": 251, "correlation": 246160 } }, { "ph": "f", "id": 246160, "pid": 76337, "tid": -914061504, "ts": 1716454225365778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365780, "dur": 1, "args": { "External id": 246161, "cbid": 251, "correlation": 246161 } }, { "ph": "f", "id": 246161, "pid": 76337, "tid": -914061504, "ts": 1716454225365780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365782, "dur": 1, "args": { "External id": 246162, "cbid": 251, "correlation": 246162 } }, { "ph": "f", "id": 246162, "pid": 76337, "tid": -914061504, "ts": 1716454225365782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365784, "dur": 1, "args": { "External id": 246163, "cbid": 251, "correlation": 246163 } }, { "ph": "f", "id": 246163, "pid": 76337, "tid": -914061504, "ts": 1716454225365784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365786, "dur": 1, "args": { "External id": 246164, "cbid": 251, "correlation": 246164 } }, { "ph": "f", "id": 246164, "pid": 76337, "tid": -914061504, "ts": 1716454225365786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365788, "dur": 1, "args": { "External id": 246165, "cbid": 251, "correlation": 246165 } }, { "ph": "f", "id": 246165, "pid": 76337, "tid": -914061504, "ts": 1716454225365788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225365791, "dur": 0, "args": { "External id": 246166, "cbid": 251, "correlation": 246166 } }, { "ph": "f", "id": 246166, "pid": 76337, "tid": -914061504, "ts": 1716454225365791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225366883, "dur": 117, "args": { "External id": 246167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246167, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246167, "pid": 5, "tid": 7, "ts": 1716454225366883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365795, "dur": 15, "args": { "External id": 246167, "cbid": 211, "correlation": 246167 } }, { "ph": "s", "id": 246167, "pid": 76337, "tid": -914061504, "ts": 1716454225365795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225367001, "dur": 61, "args": { "External id": 246173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246173, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246173, "pid": 5, "tid": 7, "ts": 1716454225367001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365835, "dur": 9, "args": { "External id": 246173, "cbid": 211, "correlation": 246173 } }, { "ph": "s", "id": 246173, "pid": 76337, "tid": -914061504, "ts": 1716454225365835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225367064, "dur": 562, "args": { "External id": 246182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246182, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246182, "pid": 5, "tid": 7, "ts": 1716454225367064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225365951, "dur": 18, "args": { "External id": 246182, "cbid": 211, "correlation": 246182 } }, { "ph": "s", "id": 246182, "pid": 76337, "tid": -914061504, "ts": 1716454225365951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225367627, "dur": 185, "args": { "External id": 246204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246204, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246204, "pid": 5, "tid": 7, "ts": 1716454225367627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366044, "dur": 13, "args": { "External id": 246204, "cbid": 211, "correlation": 246204 } }, { "ph": "s", "id": 246204, "pid": 76337, "tid": -914061504, "ts": 1716454225366044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225366168, "dur": 2, "args": { "External id": 246215, "cbid": 251, "correlation": 246215 } }, { "ph": "f", "id": 246215, "pid": 76337, "tid": -914061504, "ts": 1716454225366168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225367813, "dur": 199, "args": { "External id": 246216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246216, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246216, "pid": 5, "tid": 7, "ts": 1716454225367813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366175, "dur": 14, "args": { "External id": 246216, "cbid": 211, "correlation": 246216 } }, { "ph": "s", "id": 246216, "pid": 76337, "tid": -914061504, "ts": 1716454225366175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225366254, "dur": 1, "args": { "External id": 246227, "cbid": 251, "correlation": 246227 } }, { "ph": "f", "id": 246227, "pid": 76337, "tid": -914061504, "ts": 1716454225366254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225368013, "dur": 190, "args": { "External id": 246228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246228, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246228, "pid": 5, "tid": 7, "ts": 1716454225368013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366258, "dur": 12, "args": { "External id": 246228, "cbid": 211, "correlation": 246228 } }, { "ph": "s", "id": 246228, "pid": 76337, "tid": -914061504, "ts": 1716454225366258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225366323, "dur": 1, "args": { "External id": 246239, "cbid": 251, "correlation": 246239 } }, { "ph": "f", "id": 246239, "pid": 76337, "tid": -914061504, "ts": 1716454225366323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225368205, "dur": 191, "args": { "External id": 246240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246240, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246240, "pid": 5, "tid": 7, "ts": 1716454225368205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366327, "dur": 11, "args": { "External id": 246240, "cbid": 211, "correlation": 246240 } }, { "ph": "s", "id": 246240, "pid": 76337, "tid": -914061504, "ts": 1716454225366327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225368397, "dur": 18981, "args": { "External id": 246261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246261, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246261, "pid": 5, "tid": 7, "ts": 1716454225368397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366441, "dur": 17, "args": { "External id": 246261, "cbid": 211, "correlation": 246261 } }, { "ph": "s", "id": 246261, "pid": 76337, "tid": -914061504, "ts": 1716454225366441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225366571, "dur": 2, "args": { "External id": 246279, "cbid": 251, "correlation": 246279 } }, { "ph": "f", "id": 246279, "pid": 76337, "tid": -914061504, "ts": 1716454225366571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225387380, "dur": 207, "args": { "External id": 246281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246281, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246281, "pid": 5, "tid": 7, "ts": 1716454225387380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366578, "dur": 14, "args": { "External id": 246281, "cbid": 211, "correlation": 246281 } }, { "ph": "s", "id": 246281, "pid": 76337, "tid": -914061504, "ts": 1716454225366578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225387588, "dur": 66, "args": { "External id": 246289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246289, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246289, "pid": 5, "tid": 7, "ts": 1716454225387588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366654, "dur": 12, "args": { "External id": 246289, "cbid": 211, "correlation": 246289 } }, { "ph": "s", "id": 246289, "pid": 76337, "tid": -914061504, "ts": 1716454225366654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225387656, "dur": 96, "args": { "External id": 246297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246297, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246297, "pid": 5, "tid": 7, "ts": 1716454225387656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366695, "dur": 9, "args": { "External id": 246297, "cbid": 211, "correlation": 246297 } }, { "ph": "s", "id": 246297, "pid": 76337, "tid": -914061504, "ts": 1716454225366695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225387753, "dur": 55, "args": { "External id": 246308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246308, "pid": 5, "tid": 7, "ts": 1716454225387753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366796, "dur": 16, "args": { "External id": 246308, "cbid": 211, "correlation": 246308 } }, { "ph": "s", "id": 246308, "pid": 76337, "tid": -914061504, "ts": 1716454225366796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225387810, "dur": 94, "args": { "External id": 246330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246330, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246330, "pid": 5, "tid": 7, "ts": 1716454225387810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366834, "dur": 10, "args": { "External id": 246330, "cbid": 211, "correlation": 246330 } }, { "ph": "s", "id": 246330, "pid": 76337, "tid": -914061504, "ts": 1716454225366834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225366921, "dur": 1, "args": { "External id": 246341, "cbid": 251, "correlation": 246341 } }, { "ph": "f", "id": 246341, "pid": 76337, "tid": -914061504, "ts": 1716454225366921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225387905, "dur": 109, "args": { "External id": 246342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246342, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246342, "pid": 5, "tid": 7, "ts": 1716454225387905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225366927, "dur": 13, "args": { "External id": 246342, "cbid": 211, "correlation": 246342 } }, { "ph": "s", "id": 246342, "pid": 76337, "tid": -914061504, "ts": 1716454225366927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367021, "dur": 1, "args": { "External id": 246353, "cbid": 251, "correlation": 246353 } }, { "ph": "f", "id": 246353, "pid": 76337, "tid": -914061504, "ts": 1716454225367021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367025, "dur": 0, "args": { "External id": 246354, "cbid": 251, "correlation": 246354 } }, { "ph": "f", "id": 246354, "pid": 76337, "tid": -914061504, "ts": 1716454225367025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225388015, "dur": 10, "args": { "External id": 246355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246355, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 246355, "pid": 5, "tid": 7, "ts": 1716454225388015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367027, "dur": 15, "args": { "External id": 246355, "cbid": 211, "correlation": 246355 } }, { "ph": "s", "id": 246355, "pid": 76337, "tid": -914061504, "ts": 1716454225367027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225388026, "dur": 5, "args": { "External id": 246357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246357, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 246357, "pid": 5, "tid": 7, "ts": 1716454225388026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367046, "dur": 8, "args": { "External id": 246357, "cbid": 211, "correlation": 246357 } }, { "ph": "s", "id": 246357, "pid": 76337, "tid": -914061504, "ts": 1716454225367046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367110, "dur": 1, "args": { "External id": 246368, "cbid": 251, "correlation": 246368 } }, { "ph": "f", "id": 246368, "pid": 76337, "tid": -914061504, "ts": 1716454225367110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367113, "dur": 0, "args": { "External id": 246369, "cbid": 251, "correlation": 246369 } }, { "ph": "f", "id": 246369, "pid": 76337, "tid": -914061504, "ts": 1716454225367113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225388033, "dur": 6, "args": { "External id": 246370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246370, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 246370, "pid": 5, "tid": 7, "ts": 1716454225388033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367115, "dur": 12, "args": { "External id": 246370, "cbid": 211, "correlation": 246370 } }, { "ph": "s", "id": 246370, "pid": 76337, "tid": -914061504, "ts": 1716454225367115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225388040, "dur": 4, "args": { "External id": 246372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246372, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 246372, "pid": 5, "tid": 7, "ts": 1716454225388040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367128, "dur": 5, "args": { "External id": 246372, "cbid": 211, "correlation": 246372 } }, { "ph": "s", "id": 246372, "pid": 76337, "tid": -914061504, "ts": 1716454225367128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225388045, "dur": 159, "args": { "External id": 246393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246393, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246393, "pid": 5, "tid": 7, "ts": 1716454225388045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367202, "dur": 12, "args": { "External id": 246393, "cbid": 211, "correlation": 246393 } }, { "ph": "s", "id": 246393, "pid": 76337, "tid": -914061504, "ts": 1716454225367202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367300, "dur": 2, "args": { "External id": 246411, "cbid": 251, "correlation": 246411 } }, { "ph": "f", "id": 246411, "pid": 76337, "tid": -914061504, "ts": 1716454225367300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225388205, "dur": 110, "args": { "External id": 246413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246413, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246413, "pid": 5, "tid": 7, "ts": 1716454225388205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367306, "dur": 14, "args": { "External id": 246413, "cbid": 211, "correlation": 246413 } }, { "ph": "s", "id": 246413, "pid": 76337, "tid": -914061504, "ts": 1716454225367306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225388317, "dur": 35, "args": { "External id": 246421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246421, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246421, "pid": 5, "tid": 7, "ts": 1716454225388317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367377, "dur": 13, "args": { "External id": 246421, "cbid": 211, "correlation": 246421 } }, { "ph": "s", "id": 246421, "pid": 76337, "tid": -914061504, "ts": 1716454225367377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225388353, "dur": 69, "args": { "External id": 246429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246429, "pid": 5, "tid": 7, "ts": 1716454225388353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367418, "dur": 9, "args": { "External id": 246429, "cbid": 211, "correlation": 246429 } }, { "ph": "s", "id": 246429, "pid": 76337, "tid": -914061504, "ts": 1716454225367418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225388424, "dur": 94, "args": { "External id": 246451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246451, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246451, "pid": 5, "tid": 7, "ts": 1716454225388424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367476, "dur": 11, "args": { "External id": 246451, "cbid": 211, "correlation": 246451 } }, { "ph": "s", "id": 246451, "pid": 76337, "tid": -914061504, "ts": 1716454225367476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367571, "dur": 1, "args": { "External id": 246467, "cbid": 251, "correlation": 246467 } }, { "ph": "f", "id": 246467, "pid": 76337, "tid": -914061504, "ts": 1716454225367571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225388519, "dur": 586, "args": { "External id": 246469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246469, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246469, "pid": 5, "tid": 7, "ts": 1716454225388519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367578, "dur": 13, "args": { "External id": 246469, "cbid": 211, "correlation": 246469 } }, { "ph": "s", "id": 246469, "pid": 76337, "tid": -914061504, "ts": 1716454225367578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225389106, "dur": 248, "args": { "External id": 246477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246477, "pid": 5, "tid": 7, "ts": 1716454225389106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367663, "dur": 14, "args": { "External id": 246477, "cbid": 211, "correlation": 246477 } }, { "ph": "s", "id": 246477, "pid": 76337, "tid": -914061504, "ts": 1716454225367663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225389355, "dur": 251, "args": { "External id": 246485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246485, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246485, "pid": 5, "tid": 7, "ts": 1716454225389355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367706, "dur": 11, "args": { "External id": 246485, "cbid": 211, "correlation": 246485 } }, { "ph": "s", "id": 246485, "pid": 76337, "tid": -914061504, "ts": 1716454225367706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367797, "dur": 2, "args": { "External id": 246501, "cbid": 251, "correlation": 246501 } }, { "ph": "f", "id": 246501, "pid": 76337, "tid": -914061504, "ts": 1716454225367797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225367802, "dur": 0, "args": { "External id": 246503, "cbid": 251, "correlation": 246503 } }, { "ph": "f", "id": 246503, "pid": 76337, "tid": -914061504, "ts": 1716454225367802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225389608, "dur": 365, "args": { "External id": 246504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246504, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 246504, "pid": 5, "tid": 7, "ts": 1716454225389608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367806, "dur": 14, "args": { "External id": 246504, "cbid": 211, "correlation": 246504 } }, { "ph": "s", "id": 246504, "pid": 76337, "tid": -914061504, "ts": 1716454225367806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225389974, "dur": 50, "args": { "External id": 246512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246512, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246512, "pid": 5, "tid": 7, "ts": 1716454225389974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367850, "dur": 10, "args": { "External id": 246512, "cbid": 211, "correlation": 246512 } }, { "ph": "s", "id": 246512, "pid": 76337, "tid": -914061504, "ts": 1716454225367850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225390025, "dur": 161, "args": { "External id": 246523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246523, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246523, "pid": 5, "tid": 7, "ts": 1716454225390025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225367923, "dur": 13, "args": { "External id": 246523, "cbid": 211, "correlation": 246523 } }, { "ph": "s", "id": 246523, "pid": 76337, "tid": -914061504, "ts": 1716454225367923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225368005, "dur": 0, "args": { "External id": 246535, "cbid": 317, "correlation": 246535 } }, { "ph": "f", "id": 246535, "pid": 76337, "tid": -914061504, "ts": 1716454225368005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225368005, "dur": 0, "args": { "External id": 246536, "cbid": 203, "correlation": 246536 } }, { "ph": "f", "id": 246536, "pid": 76337, "tid": -914061504, "ts": 1716454225368005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225368006, "dur": 0, "args": { "External id": 246537, "cbid": 205, "correlation": 246537 } }, { "ph": "f", "id": 246537, "pid": 76337, "tid": -914061504, "ts": 1716454225368006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368030, "dur": 1, "args": { "External id": 246541, "cbid": 251, "correlation": 246541 } }, { "ph": "f", "id": 246541, "pid": 76337, "tid": -914061504, "ts": 1716454225368030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368032, "dur": 0, "args": { "External id": 246542, "cbid": 251, "correlation": 246542 } }, { "ph": "f", "id": 246542, "pid": 76337, "tid": -914061504, "ts": 1716454225368032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368033, "dur": 0, "args": { "External id": 246543, "cbid": 251, "correlation": 246543 } }, { "ph": "f", "id": 246543, "pid": 76337, "tid": -914061504, "ts": 1716454225368033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368034, "dur": 0, "args": { "External id": 246544, "cbid": 251, "correlation": 246544 } }, { "ph": "f", "id": 246544, "pid": 76337, "tid": -914061504, "ts": 1716454225368034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368035, "dur": 0, "args": { "External id": 246545, "cbid": 251, "correlation": 246545 } }, { "ph": "f", "id": 246545, "pid": 76337, "tid": -914061504, "ts": 1716454225368035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368035, "dur": 0, "args": { "External id": 246546, "cbid": 251, "correlation": 246546 } }, { "ph": "f", "id": 246546, "pid": 76337, "tid": -914061504, "ts": 1716454225368035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368037, "dur": 0, "args": { "External id": 246547, "cbid": 251, "correlation": 246547 } }, { "ph": "f", "id": 246547, "pid": 76337, "tid": -914061504, "ts": 1716454225368037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368037, "dur": 0, "args": { "External id": 246548, "cbid": 251, "correlation": 246548 } }, { "ph": "f", "id": 246548, "pid": 76337, "tid": -914061504, "ts": 1716454225368037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368039, "dur": 0, "args": { "External id": 246549, "cbid": 251, "correlation": 246549 } }, { "ph": "f", "id": 246549, "pid": 76337, "tid": -914061504, "ts": 1716454225368039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225390188, "dur": 116, "args": { "External id": 246550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246550, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246550, "pid": 5, "tid": 7, "ts": 1716454225390188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368041, "dur": 13, "args": { "External id": 246550, "cbid": 211, "correlation": 246550 } }, { "ph": "s", "id": 246550, "pid": 76337, "tid": -914061504, "ts": 1716454225368041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225390305, "dur": 60, "args": { "External id": 246556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246556, "pid": 5, "tid": 7, "ts": 1716454225390305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368077, "dur": 10, "args": { "External id": 246556, "cbid": 211, "correlation": 246556 } }, { "ph": "s", "id": 246556, "pid": 76337, "tid": -914061504, "ts": 1716454225368077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225390367, "dur": 49, "args": { "External id": 246564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246564, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246564, "pid": 5, "tid": 7, "ts": 1716454225390367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368110, "dur": 9, "args": { "External id": 246564, "cbid": 211, "correlation": 246564 } }, { "ph": "s", "id": 246564, "pid": 76337, "tid": -914061504, "ts": 1716454225368110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225390417, "dur": 53, "args": { "External id": 246584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246584, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 246584, "pid": 5, "tid": 7, "ts": 1716454225390417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368193, "dur": 12, "args": { "External id": 246584, "cbid": 211, "correlation": 246584 } }, { "ph": "s", "id": 246584, "pid": 76337, "tid": -914061504, "ts": 1716454225368193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225390472, "dur": 5, "args": { "External id": 246596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246596, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246596, "pid": 5, "tid": 7, "ts": 1716454225390472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368214, "dur": 6, "args": { "External id": 246596, "cbid": 211, "correlation": 246596 } }, { "ph": "s", "id": 246596, "pid": 76337, "tid": -914061504, "ts": 1716454225368214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225390477, "dur": 57, "args": { "External id": 246599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246599, "pid": 5, "tid": 7, "ts": 1716454225390477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368233, "dur": 6, "args": { "External id": 246599, "cbid": 211, "correlation": 246599 } }, { "ph": "s", "id": 246599, "pid": 76337, "tid": -914061504, "ts": 1716454225368233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225390536, "dur": 37, "args": { "External id": 246608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246608, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246608, "pid": 5, "tid": 7, "ts": 1716454225390536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368273, "dur": 11, "args": { "External id": 246608, "cbid": 211, "correlation": 246608 } }, { "ph": "s", "id": 246608, "pid": 76337, "tid": -914061504, "ts": 1716454225368273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225368331, "dur": 0, "args": { "External id": 246618, "cbid": 317, "correlation": 246618 } }, { "ph": "f", "id": 246618, "pid": 76337, "tid": -914061504, "ts": 1716454225368331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225368332, "dur": 0, "args": { "External id": 246619, "cbid": 203, "correlation": 246619 } }, { "ph": "f", "id": 246619, "pid": 76337, "tid": -914061504, "ts": 1716454225368332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225368333, "dur": 0, "args": { "External id": 246620, "cbid": 205, "correlation": 246620 } }, { "ph": "f", "id": 246620, "pid": 76337, "tid": -914061504, "ts": 1716454225368333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225390574, "dur": 40, "args": { "External id": 246624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246624, "pid": 5, "tid": 7, "ts": 1716454225390574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368347, "dur": 11, "args": { "External id": 246624, "cbid": 211, "correlation": 246624 } }, { "ph": "s", "id": 246624, "pid": 76337, "tid": -914061504, "ts": 1716454225368347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225390616, "dur": 15, "args": { "External id": 246626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246626, "pid": 5, "tid": 7, "ts": 1716454225390616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368361, "dur": 5, "args": { "External id": 246626, "cbid": 211, "correlation": 246626 } }, { "ph": "s", "id": 246626, "pid": 76337, "tid": -914061504, "ts": 1716454225368361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225390632, "dur": 4, "args": { "External id": 246628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246628, "pid": 5, "tid": 7, "ts": 1716454225390632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368371, "dur": 6, "args": { "External id": 246628, "cbid": 211, "correlation": 246628 } }, { "ph": "s", "id": 246628, "pid": 76337, "tid": -914061504, "ts": 1716454225368371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225368380, "dur": 0, "args": { "External id": 246629, "cbid": 51, "correlation": 246629 } }, { "ph": "s", "id": 246629, "pid": 76337, "tid": -914061504, "ts": 1716454225368380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225390637, "dur": 711, "args": { "External id": 246630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246630, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246630, "pid": 5, "tid": 7, "ts": 1716454225390637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368380, "dur": 5, "args": { "External id": 246630, "cbid": 211, "correlation": 246630 } }, { "ph": "s", "id": 246630, "pid": 76337, "tid": -914061504, "ts": 1716454225368380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225391349, "dur": 60, "args": { "External id": 246635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246635, "pid": 5, "tid": 7, "ts": 1716454225391349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368408, "dur": 8, "args": { "External id": 246635, "cbid": 211, "correlation": 246635 } }, { "ph": "s", "id": 246635, "pid": 76337, "tid": -914061504, "ts": 1716454225368408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225391410, "dur": 3, "args": { "External id": 246643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246643, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246643, "pid": 5, "tid": 7, "ts": 1716454225391410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368451, "dur": 9, "args": { "External id": 246643, "cbid": 211, "correlation": 246643 } }, { "ph": "s", "id": 246643, "pid": 76337, "tid": -914061504, "ts": 1716454225368451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368522, "dur": 2, "args": { "External id": 246659, "cbid": 251, "correlation": 246659 } }, { "ph": "f", "id": 246659, "pid": 76337, "tid": -914061504, "ts": 1716454225368522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225368528, "dur": 0, "args": { "External id": 246661, "cbid": 251, "correlation": 246661 } }, { "ph": "f", "id": 246661, "pid": 76337, "tid": -914061504, "ts": 1716454225368528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225391415, "dur": 12, "args": { "External id": 246662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246662, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 246662, "pid": 5, "tid": 7, "ts": 1716454225391415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368530, "dur": 12, "args": { "External id": 246662, "cbid": 211, "correlation": 246662 } }, { "ph": "s", "id": 246662, "pid": 76337, "tid": -914061504, "ts": 1716454225368530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225391428, "dur": 5, "args": { "External id": 246664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246664, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 246664, "pid": 5, "tid": 7, "ts": 1716454225391428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368544, "dur": 5, "args": { "External id": 246664, "cbid": 211, "correlation": 246664 } }, { "ph": "s", "id": 246664, "pid": 76337, "tid": -914061504, "ts": 1716454225368544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225391435, "dur": 55, "args": { "External id": 246674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246674, "pid": 5, "tid": 7, "ts": 1716454225391435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368602, "dur": 12, "args": { "External id": 246674, "cbid": 211, "correlation": 246674 } }, { "ph": "s", "id": 246674, "pid": 76337, "tid": -914061504, "ts": 1716454225368602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225391491, "dur": 54, "args": { "External id": 246694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246694, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 246694, "pid": 5, "tid": 7, "ts": 1716454225391491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368673, "dur": 12, "args": { "External id": 246694, "cbid": 211, "correlation": 246694 } }, { "ph": "s", "id": 246694, "pid": 76337, "tid": -914061504, "ts": 1716454225368673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225391546, "dur": 4, "args": { "External id": 246706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246706, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246706, "pid": 5, "tid": 7, "ts": 1716454225391546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368694, "dur": 6, "args": { "External id": 246706, "cbid": 211, "correlation": 246706 } }, { "ph": "s", "id": 246706, "pid": 76337, "tid": -914061504, "ts": 1716454225368694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225391551, "dur": 56, "args": { "External id": 246709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246709, "pid": 5, "tid": 7, "ts": 1716454225391551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368712, "dur": 6, "args": { "External id": 246709, "cbid": 211, "correlation": 246709 } }, { "ph": "s", "id": 246709, "pid": 76337, "tid": -914061504, "ts": 1716454225368712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225391609, "dur": 37, "args": { "External id": 246718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246718, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246718, "pid": 5, "tid": 7, "ts": 1716454225391609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368752, "dur": 10, "args": { "External id": 246718, "cbid": 211, "correlation": 246718 } }, { "ph": "s", "id": 246718, "pid": 76337, "tid": -914061504, "ts": 1716454225368752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225368820, "dur": 0, "args": { "External id": 246728, "cbid": 317, "correlation": 246728 } }, { "ph": "f", "id": 246728, "pid": 76337, "tid": -914061504, "ts": 1716454225368820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225368821, "dur": 0, "args": { "External id": 246729, "cbid": 203, "correlation": 246729 } }, { "ph": "f", "id": 246729, "pid": 76337, "tid": -914061504, "ts": 1716454225368821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225368821, "dur": 0, "args": { "External id": 246730, "cbid": 205, "correlation": 246730 } }, { "ph": "f", "id": 246730, "pid": 76337, "tid": -914061504, "ts": 1716454225368821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225391647, "dur": 39, "args": { "External id": 246734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246734, "pid": 5, "tid": 7, "ts": 1716454225391647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368835, "dur": 12, "args": { "External id": 246734, "cbid": 211, "correlation": 246734 } }, { "ph": "s", "id": 246734, "pid": 76337, "tid": -914061504, "ts": 1716454225368835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225391687, "dur": 14, "args": { "External id": 246736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246736, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246736, "pid": 5, "tid": 7, "ts": 1716454225391687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368850, "dur": 5, "args": { "External id": 246736, "cbid": 211, "correlation": 246736 } }, { "ph": "s", "id": 246736, "pid": 76337, "tid": -914061504, "ts": 1716454225368850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225391703, "dur": 3, "args": { "External id": 246738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246738, "pid": 5, "tid": 7, "ts": 1716454225391703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368859, "dur": 5, "args": { "External id": 246738, "cbid": 211, "correlation": 246738 } }, { "ph": "s", "id": 246738, "pid": 76337, "tid": -914061504, "ts": 1716454225368859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225368867, "dur": 0, "args": { "External id": 246739, "cbid": 51, "correlation": 246739 } }, { "ph": "s", "id": 246739, "pid": 76337, "tid": -914061504, "ts": 1716454225368867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225391708, "dur": 707, "args": { "External id": 246740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246740, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246740, "pid": 5, "tid": 7, "ts": 1716454225391708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368868, "dur": 5, "args": { "External id": 246740, "cbid": 211, "correlation": 246740 } }, { "ph": "s", "id": 246740, "pid": 76337, "tid": -914061504, "ts": 1716454225368868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225392416, "dur": 60, "args": { "External id": 246745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246745, "pid": 5, "tid": 7, "ts": 1716454225392416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368896, "dur": 8, "args": { "External id": 246745, "cbid": 211, "correlation": 246745 } }, { "ph": "s", "id": 246745, "pid": 76337, "tid": -914061504, "ts": 1716454225368896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225392477, "dur": 50, "args": { "External id": 246753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246753, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246753, "pid": 5, "tid": 7, "ts": 1716454225392477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368928, "dur": 9, "args": { "External id": 246753, "cbid": 211, "correlation": 246753 } }, { "ph": "s", "id": 246753, "pid": 76337, "tid": -914061504, "ts": 1716454225368928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225392528, "dur": 35, "args": { "External id": 246761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246761, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246761, "pid": 5, "tid": 7, "ts": 1716454225392528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225368958, "dur": 8, "args": { "External id": 246761, "cbid": 211, "correlation": 246761 } }, { "ph": "s", "id": 246761, "pid": 76337, "tid": -914061504, "ts": 1716454225368958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225392565, "dur": 51, "args": { "External id": 246781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246781, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 246781, "pid": 5, "tid": 7, "ts": 1716454225392565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369049, "dur": 13, "args": { "External id": 246781, "cbid": 211, "correlation": 246781 } }, { "ph": "s", "id": 246781, "pid": 76337, "tid": -914061504, "ts": 1716454225369049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225392618, "dur": 4, "args": { "External id": 246793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246793, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 246793, "pid": 5, "tid": 7, "ts": 1716454225392618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369072, "dur": 6, "args": { "External id": 246793, "cbid": 211, "correlation": 246793 } }, { "ph": "s", "id": 246793, "pid": 76337, "tid": -914061504, "ts": 1716454225369072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225392623, "dur": 57, "args": { "External id": 246796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246796, "pid": 5, "tid": 7, "ts": 1716454225392623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369091, "dur": 6, "args": { "External id": 246796, "cbid": 211, "correlation": 246796 } }, { "ph": "s", "id": 246796, "pid": 76337, "tid": -914061504, "ts": 1716454225369091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225369148, "dur": 0, "args": { "External id": 246807, "cbid": 317, "correlation": 246807 } }, { "ph": "f", "id": 246807, "pid": 76337, "tid": -914061504, "ts": 1716454225369148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225369149, "dur": 0, "args": { "External id": 246808, "cbid": 203, "correlation": 246808 } }, { "ph": "f", "id": 246808, "pid": 76337, "tid": -914061504, "ts": 1716454225369149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225369149, "dur": 0, "args": { "External id": 246809, "cbid": 205, "correlation": 246809 } }, { "ph": "f", "id": 246809, "pid": 76337, "tid": -914061504, "ts": 1716454225369149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369171, "dur": 1, "args": { "External id": 246813, "cbid": 251, "correlation": 246813 } }, { "ph": "f", "id": 246813, "pid": 76337, "tid": -914061504, "ts": 1716454225369171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369173, "dur": 0, "args": { "External id": 246814, "cbid": 251, "correlation": 246814 } }, { "ph": "f", "id": 246814, "pid": 76337, "tid": -914061504, "ts": 1716454225369173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369174, "dur": 0, "args": { "External id": 246815, "cbid": 251, "correlation": 246815 } }, { "ph": "f", "id": 246815, "pid": 76337, "tid": -914061504, "ts": 1716454225369174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369174, "dur": 0, "args": { "External id": 246816, "cbid": 251, "correlation": 246816 } }, { "ph": "f", "id": 246816, "pid": 76337, "tid": -914061504, "ts": 1716454225369174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369175, "dur": 0, "args": { "External id": 246817, "cbid": 251, "correlation": 246817 } }, { "ph": "f", "id": 246817, "pid": 76337, "tid": -914061504, "ts": 1716454225369175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369176, "dur": 0, "args": { "External id": 246818, "cbid": 251, "correlation": 246818 } }, { "ph": "f", "id": 246818, "pid": 76337, "tid": -914061504, "ts": 1716454225369176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369177, "dur": 0, "args": { "External id": 246819, "cbid": 251, "correlation": 246819 } }, { "ph": "f", "id": 246819, "pid": 76337, "tid": -914061504, "ts": 1716454225369177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369177, "dur": 0, "args": { "External id": 246820, "cbid": 251, "correlation": 246820 } }, { "ph": "f", "id": 246820, "pid": 76337, "tid": -914061504, "ts": 1716454225369177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369179, "dur": 0, "args": { "External id": 246821, "cbid": 251, "correlation": 246821 } }, { "ph": "f", "id": 246821, "pid": 76337, "tid": -914061504, "ts": 1716454225369179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225392681, "dur": 113, "args": { "External id": 246822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246822, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246822, "pid": 5, "tid": 7, "ts": 1716454225392681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369181, "dur": 13, "args": { "External id": 246822, "cbid": 211, "correlation": 246822 } }, { "ph": "s", "id": 246822, "pid": 76337, "tid": -914061504, "ts": 1716454225369181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225392796, "dur": 60, "args": { "External id": 246828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246828, "pid": 5, "tid": 7, "ts": 1716454225392796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369216, "dur": 9, "args": { "External id": 246828, "cbid": 211, "correlation": 246828 } }, { "ph": "s", "id": 246828, "pid": 76337, "tid": -914061504, "ts": 1716454225369216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225392858, "dur": 569, "args": { "External id": 246837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246837, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246837, "pid": 5, "tid": 7, "ts": 1716454225392858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369300, "dur": 13, "args": { "External id": 246837, "cbid": 211, "correlation": 246837 } }, { "ph": "s", "id": 246837, "pid": 76337, "tid": -914061504, "ts": 1716454225369300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225393428, "dur": 184, "args": { "External id": 246859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246859, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246859, "pid": 5, "tid": 7, "ts": 1716454225393428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369357, "dur": 10, "args": { "External id": 246859, "cbid": 211, "correlation": 246859 } }, { "ph": "s", "id": 246859, "pid": 76337, "tid": -914061504, "ts": 1716454225369357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369443, "dur": 1, "args": { "External id": 246870, "cbid": 251, "correlation": 246870 } }, { "ph": "f", "id": 246870, "pid": 76337, "tid": -914061504, "ts": 1716454225369443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225393614, "dur": 197, "args": { "External id": 246871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246871, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246871, "pid": 5, "tid": 7, "ts": 1716454225393614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369449, "dur": 13, "args": { "External id": 246871, "cbid": 211, "correlation": 246871 } }, { "ph": "s", "id": 246871, "pid": 76337, "tid": -914061504, "ts": 1716454225369449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369517, "dur": 1, "args": { "External id": 246882, "cbid": 251, "correlation": 246882 } }, { "ph": "f", "id": 246882, "pid": 76337, "tid": -914061504, "ts": 1716454225369517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225393812, "dur": 191, "args": { "External id": 246883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246883, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246883, "pid": 5, "tid": 7, "ts": 1716454225393812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369521, "dur": 12, "args": { "External id": 246883, "cbid": 211, "correlation": 246883 } }, { "ph": "s", "id": 246883, "pid": 76337, "tid": -914061504, "ts": 1716454225369521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369585, "dur": 1, "args": { "External id": 246894, "cbid": 251, "correlation": 246894 } }, { "ph": "f", "id": 246894, "pid": 76337, "tid": -914061504, "ts": 1716454225369585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225394004, "dur": 190, "args": { "External id": 246895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246895, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246895, "pid": 5, "tid": 7, "ts": 1716454225394004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369589, "dur": 11, "args": { "External id": 246895, "cbid": 211, "correlation": 246895 } }, { "ph": "s", "id": 246895, "pid": 76337, "tid": -914061504, "ts": 1716454225369589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225394195, "dur": 19016, "args": { "External id": 246916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246916, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 246916, "pid": 5, "tid": 7, "ts": 1716454225394195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369669, "dur": 13, "args": { "External id": 246916, "cbid": 211, "correlation": 246916 } }, { "ph": "s", "id": 246916, "pid": 76337, "tid": -914061504, "ts": 1716454225369669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225369770, "dur": 1, "args": { "External id": 246934, "cbid": 251, "correlation": 246934 } }, { "ph": "f", "id": 246934, "pid": 76337, "tid": -914061504, "ts": 1716454225369770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225413213, "dur": 205, "args": { "External id": 246936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246936, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246936, "pid": 5, "tid": 7, "ts": 1716454225413213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369776, "dur": 14, "args": { "External id": 246936, "cbid": 211, "correlation": 246936 } }, { "ph": "s", "id": 246936, "pid": 76337, "tid": -914061504, "ts": 1716454225369776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225413419, "dur": 66, "args": { "External id": 246944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246944, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246944, "pid": 5, "tid": 7, "ts": 1716454225413419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369848, "dur": 12, "args": { "External id": 246944, "cbid": 211, "correlation": 246944 } }, { "ph": "s", "id": 246944, "pid": 76337, "tid": -914061504, "ts": 1716454225369848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225413487, "dur": 98, "args": { "External id": 246952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246952, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246952, "pid": 5, "tid": 7, "ts": 1716454225413487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369887, "dur": 9, "args": { "External id": 246952, "cbid": 211, "correlation": 246952 } }, { "ph": "s", "id": 246952, "pid": 76337, "tid": -914061504, "ts": 1716454225369887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225413585, "dur": 56, "args": { "External id": 246963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246963, "pid": 5, "tid": 7, "ts": 1716454225413585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225369961, "dur": 21, "args": { "External id": 246963, "cbid": 211, "correlation": 246963 } }, { "ph": "s", "id": 246963, "pid": 76337, "tid": -914061504, "ts": 1716454225369961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225413643, "dur": 94, "args": { "External id": 246985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246985, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 246985, "pid": 5, "tid": 7, "ts": 1716454225413643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370002, "dur": 8, "args": { "External id": 246985, "cbid": 211, "correlation": 246985 } }, { "ph": "s", "id": 246985, "pid": 76337, "tid": -914061504, "ts": 1716454225370002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370092, "dur": 1, "args": { "External id": 246996, "cbid": 251, "correlation": 246996 } }, { "ph": "f", "id": 246996, "pid": 76337, "tid": -914061504, "ts": 1716454225370092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225413738, "dur": 106, "args": { "External id": 246997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 246997, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 246997, "pid": 5, "tid": 7, "ts": 1716454225413738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370097, "dur": 14, "args": { "External id": 246997, "cbid": 211, "correlation": 246997 } }, { "ph": "s", "id": 246997, "pid": 76337, "tid": -914061504, "ts": 1716454225370097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370171, "dur": 1, "args": { "External id": 247008, "cbid": 251, "correlation": 247008 } }, { "ph": "f", "id": 247008, "pid": 76337, "tid": -914061504, "ts": 1716454225370171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370174, "dur": 0, "args": { "External id": 247009, "cbid": 251, "correlation": 247009 } }, { "ph": "f", "id": 247009, "pid": 76337, "tid": -914061504, "ts": 1716454225370174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225413845, "dur": 11, "args": { "External id": 247010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247010, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 247010, "pid": 5, "tid": 7, "ts": 1716454225413845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370176, "dur": 13, "args": { "External id": 247010, "cbid": 211, "correlation": 247010 } }, { "ph": "s", "id": 247010, "pid": 76337, "tid": -914061504, "ts": 1716454225370176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225413857, "dur": 5, "args": { "External id": 247012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247012, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 247012, "pid": 5, "tid": 7, "ts": 1716454225413857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370190, "dur": 6, "args": { "External id": 247012, "cbid": 211, "correlation": 247012 } }, { "ph": "s", "id": 247012, "pid": 76337, "tid": -914061504, "ts": 1716454225370190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370254, "dur": 1, "args": { "External id": 247023, "cbid": 251, "correlation": 247023 } }, { "ph": "f", "id": 247023, "pid": 76337, "tid": -914061504, "ts": 1716454225370254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370258, "dur": 0, "args": { "External id": 247024, "cbid": 251, "correlation": 247024 } }, { "ph": "f", "id": 247024, "pid": 76337, "tid": -914061504, "ts": 1716454225370258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225413864, "dur": 6, "args": { "External id": 247025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247025, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 247025, "pid": 5, "tid": 7, "ts": 1716454225413864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370259, "dur": 12, "args": { "External id": 247025, "cbid": 211, "correlation": 247025 } }, { "ph": "s", "id": 247025, "pid": 76337, "tid": -914061504, "ts": 1716454225370259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225413872, "dur": 3, "args": { "External id": 247027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247027, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 247027, "pid": 5, "tid": 7, "ts": 1716454225413872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370273, "dur": 5, "args": { "External id": 247027, "cbid": 211, "correlation": 247027 } }, { "ph": "s", "id": 247027, "pid": 76337, "tid": -914061504, "ts": 1716454225370273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225413876, "dur": 158, "args": { "External id": 247048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247048, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247048, "pid": 5, "tid": 7, "ts": 1716454225413876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370347, "dur": 12, "args": { "External id": 247048, "cbid": 211, "correlation": 247048 } }, { "ph": "s", "id": 247048, "pid": 76337, "tid": -914061504, "ts": 1716454225370347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370448, "dur": 1, "args": { "External id": 247066, "cbid": 251, "correlation": 247066 } }, { "ph": "f", "id": 247066, "pid": 76337, "tid": -914061504, "ts": 1716454225370448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225414036, "dur": 109, "args": { "External id": 247068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247068, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247068, "pid": 5, "tid": 7, "ts": 1716454225414036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370454, "dur": 13, "args": { "External id": 247068, "cbid": 211, "correlation": 247068 } }, { "ph": "s", "id": 247068, "pid": 76337, "tid": -914061504, "ts": 1716454225370454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225414146, "dur": 34, "args": { "External id": 247076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247076, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247076, "pid": 5, "tid": 7, "ts": 1716454225414146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370523, "dur": 12, "args": { "External id": 247076, "cbid": 211, "correlation": 247076 } }, { "ph": "s", "id": 247076, "pid": 76337, "tid": -914061504, "ts": 1716454225370523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225414182, "dur": 68, "args": { "External id": 247084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247084, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247084, "pid": 5, "tid": 7, "ts": 1716454225414182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370564, "dur": 9, "args": { "External id": 247084, "cbid": 211, "correlation": 247084 } }, { "ph": "s", "id": 247084, "pid": 76337, "tid": -914061504, "ts": 1716454225370564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225414251, "dur": 94, "args": { "External id": 247106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247106, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247106, "pid": 5, "tid": 7, "ts": 1716454225414251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370620, "dur": 10, "args": { "External id": 247106, "cbid": 211, "correlation": 247106 } }, { "ph": "s", "id": 247106, "pid": 76337, "tid": -914061504, "ts": 1716454225370620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370708, "dur": 1, "args": { "External id": 247122, "cbid": 251, "correlation": 247122 } }, { "ph": "f", "id": 247122, "pid": 76337, "tid": -914061504, "ts": 1716454225370708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225414346, "dur": 585, "args": { "External id": 247124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247124, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247124, "pid": 5, "tid": 7, "ts": 1716454225414346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370714, "dur": 12, "args": { "External id": 247124, "cbid": 211, "correlation": 247124 } }, { "ph": "s", "id": 247124, "pid": 76337, "tid": -914061504, "ts": 1716454225370714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225414932, "dur": 248, "args": { "External id": 247132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247132, "pid": 5, "tid": 7, "ts": 1716454225414932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370778, "dur": 13, "args": { "External id": 247132, "cbid": 211, "correlation": 247132 } }, { "ph": "s", "id": 247132, "pid": 76337, "tid": -914061504, "ts": 1716454225370778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225415181, "dur": 254, "args": { "External id": 247140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247140, "pid": 5, "tid": 7, "ts": 1716454225415181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370809, "dur": 8, "args": { "External id": 247140, "cbid": 211, "correlation": 247140 } }, { "ph": "s", "id": 247140, "pid": 76337, "tid": -914061504, "ts": 1716454225370809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370892, "dur": 1, "args": { "External id": 247156, "cbid": 251, "correlation": 247156 } }, { "ph": "f", "id": 247156, "pid": 76337, "tid": -914061504, "ts": 1716454225370892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225370897, "dur": 0, "args": { "External id": 247158, "cbid": 251, "correlation": 247158 } }, { "ph": "f", "id": 247158, "pid": 76337, "tid": -914061504, "ts": 1716454225370897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225415437, "dur": 363, "args": { "External id": 247159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247159, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 247159, "pid": 5, "tid": 7, "ts": 1716454225415437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370899, "dur": 13, "args": { "External id": 247159, "cbid": 211, "correlation": 247159 } }, { "ph": "s", "id": 247159, "pid": 76337, "tid": -914061504, "ts": 1716454225370899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225415801, "dur": 50, "args": { "External id": 247167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247167, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247167, "pid": 5, "tid": 7, "ts": 1716454225415801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225370941, "dur": 9, "args": { "External id": 247167, "cbid": 211, "correlation": 247167 } }, { "ph": "s", "id": 247167, "pid": 76337, "tid": -914061504, "ts": 1716454225370941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225415852, "dur": 160, "args": { "External id": 247178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247178, "pid": 5, "tid": 7, "ts": 1716454225415852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371016, "dur": 13, "args": { "External id": 247178, "cbid": 211, "correlation": 247178 } }, { "ph": "s", "id": 247178, "pid": 76337, "tid": -914061504, "ts": 1716454225371016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225371081, "dur": 0, "args": { "External id": 247190, "cbid": 317, "correlation": 247190 } }, { "ph": "f", "id": 247190, "pid": 76337, "tid": -914061504, "ts": 1716454225371081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225371082, "dur": 0, "args": { "External id": 247191, "cbid": 203, "correlation": 247191 } }, { "ph": "f", "id": 247191, "pid": 76337, "tid": -914061504, "ts": 1716454225371082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225371082, "dur": 0, "args": { "External id": 247192, "cbid": 205, "correlation": 247192 } }, { "ph": "f", "id": 247192, "pid": 76337, "tid": -914061504, "ts": 1716454225371082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371105, "dur": 1, "args": { "External id": 247196, "cbid": 251, "correlation": 247196 } }, { "ph": "f", "id": 247196, "pid": 76337, "tid": -914061504, "ts": 1716454225371105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371107, "dur": 0, "args": { "External id": 247197, "cbid": 251, "correlation": 247197 } }, { "ph": "f", "id": 247197, "pid": 76337, "tid": -914061504, "ts": 1716454225371107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371108, "dur": 0, "args": { "External id": 247198, "cbid": 251, "correlation": 247198 } }, { "ph": "f", "id": 247198, "pid": 76337, "tid": -914061504, "ts": 1716454225371108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371109, "dur": 0, "args": { "External id": 247199, "cbid": 251, "correlation": 247199 } }, { "ph": "f", "id": 247199, "pid": 76337, "tid": -914061504, "ts": 1716454225371109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371110, "dur": 0, "args": { "External id": 247200, "cbid": 251, "correlation": 247200 } }, { "ph": "f", "id": 247200, "pid": 76337, "tid": -914061504, "ts": 1716454225371110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371110, "dur": 0, "args": { "External id": 247201, "cbid": 251, "correlation": 247201 } }, { "ph": "f", "id": 247201, "pid": 76337, "tid": -914061504, "ts": 1716454225371110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371111, "dur": 0, "args": { "External id": 247202, "cbid": 251, "correlation": 247202 } }, { "ph": "f", "id": 247202, "pid": 76337, "tid": -914061504, "ts": 1716454225371111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371112, "dur": 0, "args": { "External id": 247203, "cbid": 251, "correlation": 247203 } }, { "ph": "f", "id": 247203, "pid": 76337, "tid": -914061504, "ts": 1716454225371112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371113, "dur": 0, "args": { "External id": 247204, "cbid": 251, "correlation": 247204 } }, { "ph": "f", "id": 247204, "pid": 76337, "tid": -914061504, "ts": 1716454225371113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225416014, "dur": 117, "args": { "External id": 247205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247205, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247205, "pid": 5, "tid": 7, "ts": 1716454225416014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371115, "dur": 12, "args": { "External id": 247205, "cbid": 211, "correlation": 247205 } }, { "ph": "s", "id": 247205, "pid": 76337, "tid": -914061504, "ts": 1716454225371115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225416132, "dur": 60, "args": { "External id": 247211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247211, "pid": 5, "tid": 7, "ts": 1716454225416132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371150, "dur": 8, "args": { "External id": 247211, "cbid": 211, "correlation": 247211 } }, { "ph": "s", "id": 247211, "pid": 76337, "tid": -914061504, "ts": 1716454225371150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225416193, "dur": 50, "args": { "External id": 247219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247219, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247219, "pid": 5, "tid": 7, "ts": 1716454225416193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371181, "dur": 9, "args": { "External id": 247219, "cbid": 211, "correlation": 247219 } }, { "ph": "s", "id": 247219, "pid": 76337, "tid": -914061504, "ts": 1716454225371181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225371256, "dur": 0, "args": { "External id": 247229, "cbid": 317, "correlation": 247229 } }, { "ph": "f", "id": 247229, "pid": 76337, "tid": -914061504, "ts": 1716454225371256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225371256, "dur": 0, "args": { "External id": 247230, "cbid": 203, "correlation": 247230 } }, { "ph": "f", "id": 247230, "pid": 76337, "tid": -914061504, "ts": 1716454225371256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225371257, "dur": 0, "args": { "External id": 247231, "cbid": 205, "correlation": 247231 } }, { "ph": "f", "id": 247231, "pid": 76337, "tid": -914061504, "ts": 1716454225371257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225416245, "dur": 40, "args": { "External id": 247235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247235, "pid": 5, "tid": 7, "ts": 1716454225416245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371274, "dur": 13, "args": { "External id": 247235, "cbid": 211, "correlation": 247235 } }, { "ph": "s", "id": 247235, "pid": 76337, "tid": -914061504, "ts": 1716454225371274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225416286, "dur": 15, "args": { "External id": 247237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247237, "pid": 5, "tid": 7, "ts": 1716454225416286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371290, "dur": 5, "args": { "External id": 247237, "cbid": 211, "correlation": 247237 } }, { "ph": "s", "id": 247237, "pid": 76337, "tid": -914061504, "ts": 1716454225371290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225416303, "dur": 1, "args": { "External id": 247239, "device": 5, "context": 1, "stream": 7, "correlation": 247239, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 247239, "pid": 5, "tid": 7, "ts": 1716454225416303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225371309, "dur": 19, "args": { "External id": 247239, "cbid": 51, "correlation": 247239 } }, { "ph": "s", "id": 247239, "pid": 76337, "tid": -914061504, "ts": 1716454225371309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225416307, "dur": 367, "args": { "External id": 247240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247240, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247240, "pid": 5, "tid": 7, "ts": 1716454225416307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371330, "dur": 10, "args": { "External id": 247240, "cbid": 211, "correlation": 247240 } }, { "ph": "s", "id": 247240, "pid": 76337, "tid": -914061504, "ts": 1716454225371330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225416676, "dur": 14, "args": { "External id": 247242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247242, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247242, "pid": 5, "tid": 7, "ts": 1716454225416676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371353, "dur": 7, "args": { "External id": 247242, "cbid": 211, "correlation": 247242 } }, { "ph": "s", "id": 247242, "pid": 76337, "tid": -914061504, "ts": 1716454225371353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225416691, "dur": 15, "args": { "External id": 247248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247248, "pid": 5, "tid": 7, "ts": 1716454225416691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371384, "dur": 9, "args": { "External id": 247248, "cbid": 211, "correlation": 247248 } }, { "ph": "s", "id": 247248, "pid": 76337, "tid": -914061504, "ts": 1716454225371384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225416707, "dur": 20, "args": { "External id": 247268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247268, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 247268, "pid": 5, "tid": 7, "ts": 1716454225416707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371480, "dur": 13, "args": { "External id": 247268, "cbid": 211, "correlation": 247268 } }, { "ph": "s", "id": 247268, "pid": 76337, "tid": -914061504, "ts": 1716454225371480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225416728, "dur": 5, "args": { "External id": 247280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247280, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 247280, "pid": 5, "tid": 7, "ts": 1716454225416728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371504, "dur": 6, "args": { "External id": 247280, "cbid": 211, "correlation": 247280 } }, { "ph": "s", "id": 247280, "pid": 76337, "tid": -914061504, "ts": 1716454225371504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225416734, "dur": 18, "args": { "External id": 247283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247283, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247283, "pid": 5, "tid": 7, "ts": 1716454225416734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371522, "dur": 7, "args": { "External id": 247283, "cbid": 211, "correlation": 247283 } }, { "ph": "s", "id": 247283, "pid": 76337, "tid": -914061504, "ts": 1716454225371522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225416753, "dur": 12, "args": { "External id": 247292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247292, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247292, "pid": 5, "tid": 7, "ts": 1716454225416753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371563, "dur": 9, "args": { "External id": 247292, "cbid": 211, "correlation": 247292 } }, { "ph": "s", "id": 247292, "pid": 76337, "tid": -914061504, "ts": 1716454225371563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225371622, "dur": 0, "args": { "External id": 247302, "cbid": 317, "correlation": 247302 } }, { "ph": "f", "id": 247302, "pid": 76337, "tid": -914061504, "ts": 1716454225371622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225371623, "dur": 0, "args": { "External id": 247303, "cbid": 203, "correlation": 247303 } }, { "ph": "f", "id": 247303, "pid": 76337, "tid": -914061504, "ts": 1716454225371623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225371624, "dur": 0, "args": { "External id": 247304, "cbid": 205, "correlation": 247304 } }, { "ph": "f", "id": 247304, "pid": 76337, "tid": -914061504, "ts": 1716454225371624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225416766, "dur": 11, "args": { "External id": 247308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247308, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247308, "pid": 5, "tid": 7, "ts": 1716454225416766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371640, "dur": 11, "args": { "External id": 247308, "cbid": 211, "correlation": 247308 } }, { "ph": "s", "id": 247308, "pid": 76337, "tid": -914061504, "ts": 1716454225371640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225416779, "dur": 25, "args": { "External id": 247310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247310, "pid": 5, "tid": 7, "ts": 1716454225416779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371653, "dur": 5, "args": { "External id": 247310, "cbid": 211, "correlation": 247310 } }, { "ph": "s", "id": 247310, "pid": 76337, "tid": -914061504, "ts": 1716454225371653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225416805, "dur": 4, "args": { "External id": 247312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 247312, "pid": 5, "tid": 7, "ts": 1716454225416805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371664, "dur": 5, "args": { "External id": 247312, "cbid": 211, "correlation": 247312 } }, { "ph": "s", "id": 247312, "pid": 76337, "tid": -914061504, "ts": 1716454225371664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225371675, "dur": 0, "args": { "External id": 247313, "cbid": 51, "correlation": 247313 } }, { "ph": "s", "id": 247313, "pid": 76337, "tid": -914061504, "ts": 1716454225371675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225416810, "dur": 363, "args": { "External id": 247314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247314, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247314, "pid": 5, "tid": 7, "ts": 1716454225416810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371677, "dur": 9, "args": { "External id": 247314, "cbid": 211, "correlation": 247314 } }, { "ph": "s", "id": 247314, "pid": 76337, "tid": -914061504, "ts": 1716454225371677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225417174, "dur": 20, "args": { "External id": 247315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247315, "pid": 5, "tid": 7, "ts": 1716454225417174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371688, "dur": 5, "args": { "External id": 247315, "cbid": 211, "correlation": 247315 } }, { "ph": "s", "id": 247315, "pid": 76337, "tid": -914061504, "ts": 1716454225371688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225417196, "dur": 33, "args": { "External id": 247321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247321, "pid": 5, "tid": 7, "ts": 1716454225417196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371716, "dur": 8, "args": { "External id": 247321, "cbid": 211, "correlation": 247321 } }, { "ph": "s", "id": 247321, "pid": 76337, "tid": -914061504, "ts": 1716454225371716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225417230, "dur": 4, "args": { "External id": 247329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247329, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 247329, "pid": 5, "tid": 7, "ts": 1716454225417230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371758, "dur": 9, "args": { "External id": 247329, "cbid": 211, "correlation": 247329 } }, { "ph": "s", "id": 247329, "pid": 76337, "tid": -914061504, "ts": 1716454225371758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371825, "dur": 2, "args": { "External id": 247345, "cbid": 251, "correlation": 247345 } }, { "ph": "f", "id": 247345, "pid": 76337, "tid": -914061504, "ts": 1716454225371825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225371832, "dur": 0, "args": { "External id": 247347, "cbid": 251, "correlation": 247347 } }, { "ph": "f", "id": 247347, "pid": 76337, "tid": -914061504, "ts": 1716454225371832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225417235, "dur": 12, "args": { "External id": 247348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247348, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 247348, "pid": 5, "tid": 7, "ts": 1716454225417235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371833, "dur": 11, "args": { "External id": 247348, "cbid": 211, "correlation": 247348 } }, { "ph": "s", "id": 247348, "pid": 76337, "tid": -914061504, "ts": 1716454225371833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225417248, "dur": 5, "args": { "External id": 247350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 247350, "pid": 5, "tid": 7, "ts": 1716454225417248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371846, "dur": 5, "args": { "External id": 247350, "cbid": 211, "correlation": 247350 } }, { "ph": "s", "id": 247350, "pid": 76337, "tid": -914061504, "ts": 1716454225371846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225417255, "dur": 30, "args": { "External id": 247360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247360, "pid": 5, "tid": 7, "ts": 1716454225417255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371904, "dur": 12, "args": { "External id": 247360, "cbid": 211, "correlation": 247360 } }, { "ph": "s", "id": 247360, "pid": 76337, "tid": -914061504, "ts": 1716454225371904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225417286, "dur": 31, "args": { "External id": 247380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247380, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 247380, "pid": 5, "tid": 7, "ts": 1716454225417286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225371971, "dur": 21, "args": { "External id": 247380, "cbid": 211, "correlation": 247380 } }, { "ph": "s", "id": 247380, "pid": 76337, "tid": -914061504, "ts": 1716454225371971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225417318, "dur": 4, "args": { "External id": 247392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247392, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 247392, "pid": 5, "tid": 7, "ts": 1716454225417318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372003, "dur": 6, "args": { "External id": 247392, "cbid": 211, "correlation": 247392 } }, { "ph": "s", "id": 247392, "pid": 76337, "tid": -914061504, "ts": 1716454225372003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225417324, "dur": 31, "args": { "External id": 247395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247395, "pid": 5, "tid": 7, "ts": 1716454225417324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372022, "dur": 7, "args": { "External id": 247395, "cbid": 211, "correlation": 247395 } }, { "ph": "s", "id": 247395, "pid": 76337, "tid": -914061504, "ts": 1716454225372022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225417356, "dur": 21, "args": { "External id": 247404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247404, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247404, "pid": 5, "tid": 7, "ts": 1716454225417356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372064, "dur": 9, "args": { "External id": 247404, "cbid": 211, "correlation": 247404 } }, { "ph": "s", "id": 247404, "pid": 76337, "tid": -914061504, "ts": 1716454225372064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225372127, "dur": 0, "args": { "External id": 247414, "cbid": 317, "correlation": 247414 } }, { "ph": "f", "id": 247414, "pid": 76337, "tid": -914061504, "ts": 1716454225372127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225372128, "dur": 0, "args": { "External id": 247415, "cbid": 203, "correlation": 247415 } }, { "ph": "f", "id": 247415, "pid": 76337, "tid": -914061504, "ts": 1716454225372128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225372129, "dur": 0, "args": { "External id": 247416, "cbid": 205, "correlation": 247416 } }, { "ph": "f", "id": 247416, "pid": 76337, "tid": -914061504, "ts": 1716454225372129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225417378, "dur": 23, "args": { "External id": 247420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247420, "pid": 5, "tid": 7, "ts": 1716454225417378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372151, "dur": 12, "args": { "External id": 247420, "cbid": 211, "correlation": 247420 } }, { "ph": "s", "id": 247420, "pid": 76337, "tid": -914061504, "ts": 1716454225372151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225417403, "dur": 44, "args": { "External id": 247422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247422, "pid": 5, "tid": 7, "ts": 1716454225417403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372165, "dur": 5, "args": { "External id": 247422, "cbid": 211, "correlation": 247422 } }, { "ph": "s", "id": 247422, "pid": 76337, "tid": -914061504, "ts": 1716454225372165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225417448, "dur": 667, "args": { "External id": 247424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247424, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247424, "pid": 5, "tid": 7, "ts": 1716454225417448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372180, "dur": 9, "args": { "External id": 247424, "cbid": 211, "correlation": 247424 } }, { "ph": "s", "id": 247424, "pid": 76337, "tid": -914061504, "ts": 1716454225372180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225418117, "dur": 22, "args": { "External id": 247426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247426, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247426, "pid": 5, "tid": 7, "ts": 1716454225418117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372192, "dur": 5, "args": { "External id": 247426, "cbid": 211, "correlation": 247426 } }, { "ph": "s", "id": 247426, "pid": 76337, "tid": -914061504, "ts": 1716454225372192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225418140, "dur": 34, "args": { "External id": 247432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247432, "pid": 5, "tid": 7, "ts": 1716454225418140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372220, "dur": 9, "args": { "External id": 247432, "cbid": 211, "correlation": 247432 } }, { "ph": "s", "id": 247432, "pid": 76337, "tid": -914061504, "ts": 1716454225372220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225372283, "dur": 0, "args": { "External id": 247442, "cbid": 317, "correlation": 247442 } }, { "ph": "f", "id": 247442, "pid": 76337, "tid": -914061504, "ts": 1716454225372283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225372284, "dur": 0, "args": { "External id": 247443, "cbid": 203, "correlation": 247443 } }, { "ph": "f", "id": 247443, "pid": 76337, "tid": -914061504, "ts": 1716454225372284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225372285, "dur": 0, "args": { "External id": 247444, "cbid": 205, "correlation": 247444 } }, { "ph": "f", "id": 247444, "pid": 76337, "tid": -914061504, "ts": 1716454225372285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372307, "dur": 1, "args": { "External id": 247448, "cbid": 251, "correlation": 247448 } }, { "ph": "f", "id": 247448, "pid": 76337, "tid": -914061504, "ts": 1716454225372307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372309, "dur": 0, "args": { "External id": 247449, "cbid": 251, "correlation": 247449 } }, { "ph": "f", "id": 247449, "pid": 76337, "tid": -914061504, "ts": 1716454225372309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372310, "dur": 0, "args": { "External id": 247450, "cbid": 251, "correlation": 247450 } }, { "ph": "f", "id": 247450, "pid": 76337, "tid": -914061504, "ts": 1716454225372310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372311, "dur": 0, "args": { "External id": 247451, "cbid": 251, "correlation": 247451 } }, { "ph": "f", "id": 247451, "pid": 76337, "tid": -914061504, "ts": 1716454225372311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372311, "dur": 0, "args": { "External id": 247452, "cbid": 251, "correlation": 247452 } }, { "ph": "f", "id": 247452, "pid": 76337, "tid": -914061504, "ts": 1716454225372311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372312, "dur": 0, "args": { "External id": 247453, "cbid": 251, "correlation": 247453 } }, { "ph": "f", "id": 247453, "pid": 76337, "tid": -914061504, "ts": 1716454225372312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372313, "dur": 0, "args": { "External id": 247454, "cbid": 251, "correlation": 247454 } }, { "ph": "f", "id": 247454, "pid": 76337, "tid": -914061504, "ts": 1716454225372313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372314, "dur": 0, "args": { "External id": 247455, "cbid": 251, "correlation": 247455 } }, { "ph": "f", "id": 247455, "pid": 76337, "tid": -914061504, "ts": 1716454225372314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372315, "dur": 0, "args": { "External id": 247456, "cbid": 251, "correlation": 247456 } }, { "ph": "f", "id": 247456, "pid": 76337, "tid": -914061504, "ts": 1716454225372315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225418175, "dur": 52, "args": { "External id": 247457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247457, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247457, "pid": 5, "tid": 7, "ts": 1716454225418175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372317, "dur": 13, "args": { "External id": 247457, "cbid": 211, "correlation": 247457 } }, { "ph": "s", "id": 247457, "pid": 76337, "tid": -914061504, "ts": 1716454225372317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225418228, "dur": 33, "args": { "External id": 247463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247463, "pid": 5, "tid": 7, "ts": 1716454225418228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372350, "dur": 8, "args": { "External id": 247463, "cbid": 211, "correlation": 247463 } }, { "ph": "s", "id": 247463, "pid": 76337, "tid": -914061504, "ts": 1716454225372350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225418262, "dur": 27, "args": { "External id": 247471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247471, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247471, "pid": 5, "tid": 7, "ts": 1716454225418262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372381, "dur": 8, "args": { "External id": 247471, "cbid": 211, "correlation": 247471 } }, { "ph": "s", "id": 247471, "pid": 76337, "tid": -914061504, "ts": 1716454225372381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225418290, "dur": 20, "args": { "External id": 247479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247479, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247479, "pid": 5, "tid": 7, "ts": 1716454225418290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372410, "dur": 8, "args": { "External id": 247479, "cbid": 211, "correlation": 247479 } }, { "ph": "s", "id": 247479, "pid": 76337, "tid": -914061504, "ts": 1716454225372410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225418311, "dur": 30, "args": { "External id": 247499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247499, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 247499, "pid": 5, "tid": 7, "ts": 1716454225418311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372497, "dur": 12, "args": { "External id": 247499, "cbid": 211, "correlation": 247499 } }, { "ph": "s", "id": 247499, "pid": 76337, "tid": -914061504, "ts": 1716454225372497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225418343, "dur": 4, "args": { "External id": 247511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247511, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 247511, "pid": 5, "tid": 7, "ts": 1716454225418343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372519, "dur": 6, "args": { "External id": 247511, "cbid": 211, "correlation": 247511 } }, { "ph": "s", "id": 247511, "pid": 76337, "tid": -914061504, "ts": 1716454225372519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225418349, "dur": 30, "args": { "External id": 247514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247514, "pid": 5, "tid": 7, "ts": 1716454225418349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372537, "dur": 6, "args": { "External id": 247514, "cbid": 211, "correlation": 247514 } }, { "ph": "s", "id": 247514, "pid": 76337, "tid": -914061504, "ts": 1716454225372537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225372595, "dur": 0, "args": { "External id": 247525, "cbid": 317, "correlation": 247525 } }, { "ph": "f", "id": 247525, "pid": 76337, "tid": -914061504, "ts": 1716454225372595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225372595, "dur": 0, "args": { "External id": 247526, "cbid": 203, "correlation": 247526 } }, { "ph": "f", "id": 247526, "pid": 76337, "tid": -914061504, "ts": 1716454225372595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225372596, "dur": 0, "args": { "External id": 247527, "cbid": 205, "correlation": 247527 } }, { "ph": "f", "id": 247527, "pid": 76337, "tid": -914061504, "ts": 1716454225372596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225418380, "dur": 22, "args": { "External id": 247531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247531, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247531, "pid": 5, "tid": 7, "ts": 1716454225418380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372611, "dur": 12, "args": { "External id": 247531, "cbid": 211, "correlation": 247531 } }, { "ph": "s", "id": 247531, "pid": 76337, "tid": -914061504, "ts": 1716454225372611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225418404, "dur": 123, "args": { "External id": 247533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247533, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247533, "pid": 5, "tid": 7, "ts": 1716454225418404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372632, "dur": 8, "args": { "External id": 247533, "cbid": 211, "correlation": 247533 } }, { "ph": "s", "id": 247533, "pid": 76337, "tid": -914061504, "ts": 1716454225372632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225418528, "dur": 23, "args": { "External id": 247535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247535, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247535, "pid": 5, "tid": 7, "ts": 1716454225418528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372644, "dur": 5, "args": { "External id": 247535, "cbid": 211, "correlation": 247535 } }, { "ph": "s", "id": 247535, "pid": 76337, "tid": -914061504, "ts": 1716454225372644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225418552, "dur": 33, "args": { "External id": 247541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247541, "pid": 5, "tid": 7, "ts": 1716454225418552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372671, "dur": 9, "args": { "External id": 247541, "cbid": 211, "correlation": 247541 } }, { "ph": "s", "id": 247541, "pid": 76337, "tid": -914061504, "ts": 1716454225372671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225418587, "dur": 202, "args": { "External id": 247550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247550, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247550, "pid": 5, "tid": 7, "ts": 1716454225418587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372755, "dur": 14, "args": { "External id": 247550, "cbid": 211, "correlation": 247550 } }, { "ph": "s", "id": 247550, "pid": 76337, "tid": -914061504, "ts": 1716454225372755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225418790, "dur": 66, "args": { "External id": 247572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247572, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247572, "pid": 5, "tid": 7, "ts": 1716454225418790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372817, "dur": 11, "args": { "External id": 247572, "cbid": 211, "correlation": 247572 } }, { "ph": "s", "id": 247572, "pid": 76337, "tid": -914061504, "ts": 1716454225372817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372911, "dur": 1, "args": { "External id": 247583, "cbid": 251, "correlation": 247583 } }, { "ph": "f", "id": 247583, "pid": 76337, "tid": -914061504, "ts": 1716454225372911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225418858, "dur": 158, "args": { "External id": 247584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247584, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247584, "pid": 5, "tid": 7, "ts": 1716454225418858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372916, "dur": 13, "args": { "External id": 247584, "cbid": 211, "correlation": 247584 } }, { "ph": "s", "id": 247584, "pid": 76337, "tid": -914061504, "ts": 1716454225372916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225372993, "dur": 1, "args": { "External id": 247595, "cbid": 251, "correlation": 247595 } }, { "ph": "f", "id": 247595, "pid": 76337, "tid": -914061504, "ts": 1716454225372993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225419017, "dur": 150, "args": { "External id": 247596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247596, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247596, "pid": 5, "tid": 7, "ts": 1716454225419017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225372997, "dur": 12, "args": { "External id": 247596, "cbid": 211, "correlation": 247596 } }, { "ph": "s", "id": 247596, "pid": 76337, "tid": -914061504, "ts": 1716454225372997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373066, "dur": 1, "args": { "External id": 247607, "cbid": 251, "correlation": 247607 } }, { "ph": "f", "id": 247607, "pid": 76337, "tid": -914061504, "ts": 1716454225373066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225419169, "dur": 147, "args": { "External id": 247608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247608, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247608, "pid": 5, "tid": 7, "ts": 1716454225419169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373070, "dur": 12, "args": { "External id": 247608, "cbid": 211, "correlation": 247608 } }, { "ph": "s", "id": 247608, "pid": 76337, "tid": -914061504, "ts": 1716454225373070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225419317, "dur": 1987, "args": { "External id": 247629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247629, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247629, "pid": 5, "tid": 7, "ts": 1716454225419317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373155, "dur": 13, "args": { "External id": 247629, "cbid": 211, "correlation": 247629 } }, { "ph": "s", "id": 247629, "pid": 76337, "tid": -914061504, "ts": 1716454225373155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373261, "dur": 1, "args": { "External id": 247647, "cbid": 251, "correlation": 247647 } }, { "ph": "f", "id": 247647, "pid": 76337, "tid": -914061504, "ts": 1716454225373261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225421305, "dur": 150, "args": { "External id": 247649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247649, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 247649, "pid": 5, "tid": 7, "ts": 1716454225421305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373267, "dur": 14, "args": { "External id": 247649, "cbid": 211, "correlation": 247649 } }, { "ph": "s", "id": 247649, "pid": 76337, "tid": -914061504, "ts": 1716454225373267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225421457, "dur": 36, "args": { "External id": 247657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247657, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247657, "pid": 5, "tid": 7, "ts": 1716454225421457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373338, "dur": 12, "args": { "External id": 247657, "cbid": 211, "correlation": 247657 } }, { "ph": "s", "id": 247657, "pid": 76337, "tid": -914061504, "ts": 1716454225373338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225421494, "dur": 50, "args": { "External id": 247665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247665, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247665, "pid": 5, "tid": 7, "ts": 1716454225421494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373376, "dur": 9, "args": { "External id": 247665, "cbid": 211, "correlation": 247665 } }, { "ph": "s", "id": 247665, "pid": 76337, "tid": -914061504, "ts": 1716454225373376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225421545, "dur": 31, "args": { "External id": 247676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247676, "pid": 5, "tid": 7, "ts": 1716454225421545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373450, "dur": 13, "args": { "External id": 247676, "cbid": 211, "correlation": 247676 } }, { "ph": "s", "id": 247676, "pid": 76337, "tid": -914061504, "ts": 1716454225373450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225421577, "dur": 35, "args": { "External id": 247698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247698, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247698, "pid": 5, "tid": 7, "ts": 1716454225421577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373481, "dur": 7, "args": { "External id": 247698, "cbid": 211, "correlation": 247698 } }, { "ph": "s", "id": 247698, "pid": 76337, "tid": -914061504, "ts": 1716454225373481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373570, "dur": 1, "args": { "External id": 247709, "cbid": 251, "correlation": 247709 } }, { "ph": "f", "id": 247709, "pid": 76337, "tid": -914061504, "ts": 1716454225373570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225421614, "dur": 91, "args": { "External id": 247710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247710, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247710, "pid": 5, "tid": 7, "ts": 1716454225421614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373575, "dur": 14, "args": { "External id": 247710, "cbid": 211, "correlation": 247710 } }, { "ph": "s", "id": 247710, "pid": 76337, "tid": -914061504, "ts": 1716454225373575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373648, "dur": 1, "args": { "External id": 247721, "cbid": 251, "correlation": 247721 } }, { "ph": "f", "id": 247721, "pid": 76337, "tid": -914061504, "ts": 1716454225373648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373651, "dur": 0, "args": { "External id": 247722, "cbid": 251, "correlation": 247722 } }, { "ph": "f", "id": 247722, "pid": 76337, "tid": -914061504, "ts": 1716454225373651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225421707, "dur": 11, "args": { "External id": 247723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247723, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 247723, "pid": 5, "tid": 7, "ts": 1716454225421707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373653, "dur": 12, "args": { "External id": 247723, "cbid": 211, "correlation": 247723 } }, { "ph": "s", "id": 247723, "pid": 76337, "tid": -914061504, "ts": 1716454225373653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225421720, "dur": 5, "args": { "External id": 247725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247725, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 247725, "pid": 5, "tid": 7, "ts": 1716454225421720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373669, "dur": 7, "args": { "External id": 247725, "cbid": 211, "correlation": 247725 } }, { "ph": "s", "id": 247725, "pid": 76337, "tid": -914061504, "ts": 1716454225373669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373728, "dur": 1, "args": { "External id": 247736, "cbid": 251, "correlation": 247736 } }, { "ph": "f", "id": 247736, "pid": 76337, "tid": -914061504, "ts": 1716454225373728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373732, "dur": 0, "args": { "External id": 247737, "cbid": 251, "correlation": 247737 } }, { "ph": "f", "id": 247737, "pid": 76337, "tid": -914061504, "ts": 1716454225373732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225421726, "dur": 7, "args": { "External id": 247738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247738, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 247738, "pid": 5, "tid": 7, "ts": 1716454225421726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373734, "dur": 12, "args": { "External id": 247738, "cbid": 211, "correlation": 247738 } }, { "ph": "s", "id": 247738, "pid": 76337, "tid": -914061504, "ts": 1716454225373734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225421734, "dur": 4, "args": { "External id": 247740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 247740, "pid": 5, "tid": 7, "ts": 1716454225421734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373747, "dur": 5, "args": { "External id": 247740, "cbid": 211, "correlation": 247740 } }, { "ph": "s", "id": 247740, "pid": 76337, "tid": -914061504, "ts": 1716454225373747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225421739, "dur": 93, "args": { "External id": 247761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247761, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 247761, "pid": 5, "tid": 7, "ts": 1716454225421739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373820, "dur": 12, "args": { "External id": 247761, "cbid": 211, "correlation": 247761 } }, { "ph": "s", "id": 247761, "pid": 76337, "tid": -914061504, "ts": 1716454225373820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225373916, "dur": 1, "args": { "External id": 247779, "cbid": 251, "correlation": 247779 } }, { "ph": "f", "id": 247779, "pid": 76337, "tid": -914061504, "ts": 1716454225373916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225421834, "dur": 100, "args": { "External id": 247781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247781, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247781, "pid": 5, "tid": 7, "ts": 1716454225421834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373921, "dur": 14, "args": { "External id": 247781, "cbid": 211, "correlation": 247781 } }, { "ph": "s", "id": 247781, "pid": 76337, "tid": -914061504, "ts": 1716454225373921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225421935, "dur": 19, "args": { "External id": 247789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247789, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247789, "pid": 5, "tid": 7, "ts": 1716454225421935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225373999, "dur": 12, "args": { "External id": 247789, "cbid": 211, "correlation": 247789 } }, { "ph": "s", "id": 247789, "pid": 76337, "tid": -914061504, "ts": 1716454225373999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225421955, "dur": 38, "args": { "External id": 247797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247797, "pid": 5, "tid": 7, "ts": 1716454225421955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374042, "dur": 9, "args": { "External id": 247797, "cbid": 211, "correlation": 247797 } }, { "ph": "s", "id": 247797, "pid": 76337, "tid": -914061504, "ts": 1716454225374042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225421994, "dur": 35, "args": { "External id": 247819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247819, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247819, "pid": 5, "tid": 7, "ts": 1716454225421994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374095, "dur": 10, "args": { "External id": 247819, "cbid": 211, "correlation": 247819 } }, { "ph": "s", "id": 247819, "pid": 76337, "tid": -914061504, "ts": 1716454225374095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225374184, "dur": 1, "args": { "External id": 247835, "cbid": 251, "correlation": 247835 } }, { "ph": "f", "id": 247835, "pid": 76337, "tid": -914061504, "ts": 1716454225374184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225374190, "dur": 0, "args": { "External id": 247837, "cbid": 251, "correlation": 247837 } }, { "ph": "f", "id": 247837, "pid": 76337, "tid": -914061504, "ts": 1716454225374190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225422031, "dur": 551, "args": { "External id": 247838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247838, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 247838, "pid": 5, "tid": 7, "ts": 1716454225422031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374193, "dur": 12, "args": { "External id": 247838, "cbid": 211, "correlation": 247838 } }, { "ph": "s", "id": 247838, "pid": 76337, "tid": -914061504, "ts": 1716454225374193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225422583, "dur": 128, "args": { "External id": 247846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247846, "pid": 5, "tid": 7, "ts": 1716454225422583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374258, "dur": 12, "args": { "External id": 247846, "cbid": 211, "correlation": 247846 } }, { "ph": "s", "id": 247846, "pid": 76337, "tid": -914061504, "ts": 1716454225374258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225422713, "dur": 129, "args": { "External id": 247854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247854, "pid": 5, "tid": 7, "ts": 1716454225422713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374290, "dur": 8, "args": { "External id": 247854, "cbid": 211, "correlation": 247854 } }, { "ph": "s", "id": 247854, "pid": 76337, "tid": -914061504, "ts": 1716454225374290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225374368, "dur": 1, "args": { "External id": 247870, "cbid": 251, "correlation": 247870 } }, { "ph": "f", "id": 247870, "pid": 76337, "tid": -914061504, "ts": 1716454225374368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225422843, "dur": 308, "args": { "External id": 247872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247872, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247872, "pid": 5, "tid": 7, "ts": 1716454225422843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374374, "dur": 13, "args": { "External id": 247872, "cbid": 211, "correlation": 247872 } }, { "ph": "s", "id": 247872, "pid": 76337, "tid": -914061504, "ts": 1716454225374374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225423152, "dur": 28, "args": { "External id": 247880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247880, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247880, "pid": 5, "tid": 7, "ts": 1716454225423152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374416, "dur": 10, "args": { "External id": 247880, "cbid": 211, "correlation": 247880 } }, { "ph": "s", "id": 247880, "pid": 76337, "tid": -914061504, "ts": 1716454225374416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225423182, "dur": 83, "args": { "External id": 247891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247891, "pid": 5, "tid": 7, "ts": 1716454225423182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374486, "dur": 12, "args": { "External id": 247891, "cbid": 211, "correlation": 247891 } }, { "ph": "s", "id": 247891, "pid": 76337, "tid": -914061504, "ts": 1716454225374486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225374549, "dur": 0, "args": { "External id": 247903, "cbid": 317, "correlation": 247903 } }, { "ph": "f", "id": 247903, "pid": 76337, "tid": -914061504, "ts": 1716454225374549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225374550, "dur": 0, "args": { "External id": 247904, "cbid": 203, "correlation": 247904 } }, { "ph": "f", "id": 247904, "pid": 76337, "tid": -914061504, "ts": 1716454225374550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225374551, "dur": 0, "args": { "External id": 247905, "cbid": 205, "correlation": 247905 } }, { "ph": "f", "id": 247905, "pid": 76337, "tid": -914061504, "ts": 1716454225374551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225423266, "dur": 23, "args": { "External id": 247909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247909, "pid": 5, "tid": 7, "ts": 1716454225423266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374566, "dur": 12, "args": { "External id": 247909, "cbid": 211, "correlation": 247909 } }, { "ph": "s", "id": 247909, "pid": 76337, "tid": -914061504, "ts": 1716454225374566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225423290, "dur": 121, "args": { "External id": 247911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247911, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247911, "pid": 5, "tid": 7, "ts": 1716454225423290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374585, "dur": 7, "args": { "External id": 247911, "cbid": 211, "correlation": 247911 } }, { "ph": "s", "id": 247911, "pid": 76337, "tid": -914061504, "ts": 1716454225374585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225423413, "dur": 23, "args": { "External id": 247913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247913, "pid": 5, "tid": 7, "ts": 1716454225423413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374596, "dur": 5, "args": { "External id": 247913, "cbid": 211, "correlation": 247913 } }, { "ph": "s", "id": 247913, "pid": 76337, "tid": -914061504, "ts": 1716454225374596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225423437, "dur": 33, "args": { "External id": 247919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247919, "pid": 5, "tid": 7, "ts": 1716454225423437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374623, "dur": 9, "args": { "External id": 247919, "cbid": 211, "correlation": 247919 } }, { "ph": "s", "id": 247919, "pid": 76337, "tid": -914061504, "ts": 1716454225374623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225423472, "dur": 27, "args": { "External id": 247927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247927, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247927, "pid": 5, "tid": 7, "ts": 1716454225423472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374655, "dur": 8, "args": { "External id": 247927, "cbid": 211, "correlation": 247927 } }, { "ph": "s", "id": 247927, "pid": 76337, "tid": -914061504, "ts": 1716454225374655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225423500, "dur": 31, "args": { "External id": 247947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247947, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 247947, "pid": 5, "tid": 7, "ts": 1716454225423500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374726, "dur": 12, "args": { "External id": 247947, "cbid": 211, "correlation": 247947 } }, { "ph": "s", "id": 247947, "pid": 76337, "tid": -914061504, "ts": 1716454225374726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225423533, "dur": 5, "args": { "External id": 247959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247959, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 247959, "pid": 5, "tid": 7, "ts": 1716454225423533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374748, "dur": 6, "args": { "External id": 247959, "cbid": 211, "correlation": 247959 } }, { "ph": "s", "id": 247959, "pid": 76337, "tid": -914061504, "ts": 1716454225374748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225423539, "dur": 31, "args": { "External id": 247962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247962, "pid": 5, "tid": 7, "ts": 1716454225423539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374766, "dur": 7, "args": { "External id": 247962, "cbid": 211, "correlation": 247962 } }, { "ph": "s", "id": 247962, "pid": 76337, "tid": -914061504, "ts": 1716454225374766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225423571, "dur": 21, "args": { "External id": 247971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247971, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247971, "pid": 5, "tid": 7, "ts": 1716454225423571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374805, "dur": 9, "args": { "External id": 247971, "cbid": 211, "correlation": 247971 } }, { "ph": "s", "id": 247971, "pid": 76337, "tid": -914061504, "ts": 1716454225374805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225374857, "dur": 0, "args": { "External id": 247981, "cbid": 317, "correlation": 247981 } }, { "ph": "f", "id": 247981, "pid": 76337, "tid": -914061504, "ts": 1716454225374857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225374857, "dur": 0, "args": { "External id": 247982, "cbid": 203, "correlation": 247982 } }, { "ph": "f", "id": 247982, "pid": 76337, "tid": -914061504, "ts": 1716454225374857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225374858, "dur": 0, "args": { "External id": 247983, "cbid": 205, "correlation": 247983 } }, { "ph": "f", "id": 247983, "pid": 76337, "tid": -914061504, "ts": 1716454225374858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225423594, "dur": 21, "args": { "External id": 247987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247987, "pid": 5, "tid": 7, "ts": 1716454225423594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374875, "dur": 12, "args": { "External id": 247987, "cbid": 211, "correlation": 247987 } }, { "ph": "s", "id": 247987, "pid": 76337, "tid": -914061504, "ts": 1716454225374875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225423617, "dur": 44, "args": { "External id": 247989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247989, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247989, "pid": 5, "tid": 7, "ts": 1716454225423617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374890, "dur": 5, "args": { "External id": 247989, "cbid": 211, "correlation": 247989 } }, { "ph": "s", "id": 247989, "pid": 76337, "tid": -914061504, "ts": 1716454225374890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225423662, "dur": 667, "args": { "External id": 247991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247991, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 247991, "pid": 5, "tid": 7, "ts": 1716454225423662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374901, "dur": 6, "args": { "External id": 247991, "cbid": 211, "correlation": 247991 } }, { "ph": "s", "id": 247991, "pid": 76337, "tid": -914061504, "ts": 1716454225374901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225424331, "dur": 23, "args": { "External id": 247993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247993, "pid": 5, "tid": 7, "ts": 1716454225424331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374911, "dur": 5, "args": { "External id": 247993, "cbid": 211, "correlation": 247993 } }, { "ph": "s", "id": 247993, "pid": 76337, "tid": -914061504, "ts": 1716454225374911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225424355, "dur": 34, "args": { "External id": 247999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 247999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 247999, "pid": 5, "tid": 7, "ts": 1716454225424355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374938, "dur": 8, "args": { "External id": 247999, "cbid": 211, "correlation": 247999 } }, { "ph": "s", "id": 247999, "pid": 76337, "tid": -914061504, "ts": 1716454225374938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225424390, "dur": 3, "args": { "External id": 248007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248007, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 248007, "pid": 5, "tid": 7, "ts": 1716454225424390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225374990, "dur": 10, "args": { "External id": 248007, "cbid": 211, "correlation": 248007 } }, { "ph": "s", "id": 248007, "pid": 76337, "tid": -914061504, "ts": 1716454225374990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225375060, "dur": 1, "args": { "External id": 248023, "cbid": 251, "correlation": 248023 } }, { "ph": "f", "id": 248023, "pid": 76337, "tid": -914061504, "ts": 1716454225375060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225375065, "dur": 0, "args": { "External id": 248025, "cbid": 251, "correlation": 248025 } }, { "ph": "f", "id": 248025, "pid": 76337, "tid": -914061504, "ts": 1716454225375065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225424395, "dur": 13, "args": { "External id": 248026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248026, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 248026, "pid": 5, "tid": 7, "ts": 1716454225424395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375067, "dur": 12, "args": { "External id": 248026, "cbid": 211, "correlation": 248026 } }, { "ph": "s", "id": 248026, "pid": 76337, "tid": -914061504, "ts": 1716454225375067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225424409, "dur": 5, "args": { "External id": 248028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248028, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 248028, "pid": 5, "tid": 7, "ts": 1716454225424409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375080, "dur": 6, "args": { "External id": 248028, "cbid": 211, "correlation": 248028 } }, { "ph": "s", "id": 248028, "pid": 76337, "tid": -914061504, "ts": 1716454225375080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225424415, "dur": 29, "args": { "External id": 248038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248038, "pid": 5, "tid": 7, "ts": 1716454225424415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375140, "dur": 12, "args": { "External id": 248038, "cbid": 211, "correlation": 248038 } }, { "ph": "s", "id": 248038, "pid": 76337, "tid": -914061504, "ts": 1716454225375140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225424446, "dur": 31, "args": { "External id": 248058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248058, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 248058, "pid": 5, "tid": 7, "ts": 1716454225424446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375205, "dur": 10, "args": { "External id": 248058, "cbid": 211, "correlation": 248058 } }, { "ph": "s", "id": 248058, "pid": 76337, "tid": -914061504, "ts": 1716454225375205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225424478, "dur": 5, "args": { "External id": 248070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248070, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 248070, "pid": 5, "tid": 7, "ts": 1716454225424478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375225, "dur": 6, "args": { "External id": 248070, "cbid": 211, "correlation": 248070 } }, { "ph": "s", "id": 248070, "pid": 76337, "tid": -914061504, "ts": 1716454225375225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225424484, "dur": 31, "args": { "External id": 248073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248073, "pid": 5, "tid": 7, "ts": 1716454225424484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375245, "dur": 6, "args": { "External id": 248073, "cbid": 211, "correlation": 248073 } }, { "ph": "s", "id": 248073, "pid": 76337, "tid": -914061504, "ts": 1716454225375245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225424516, "dur": 21, "args": { "External id": 248082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248082, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248082, "pid": 5, "tid": 7, "ts": 1716454225424516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375286, "dur": 10, "args": { "External id": 248082, "cbid": 211, "correlation": 248082 } }, { "ph": "s", "id": 248082, "pid": 76337, "tid": -914061504, "ts": 1716454225375286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225375348, "dur": 0, "args": { "External id": 248092, "cbid": 317, "correlation": 248092 } }, { "ph": "f", "id": 248092, "pid": 76337, "tid": -914061504, "ts": 1716454225375348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225375349, "dur": 0, "args": { "External id": 248093, "cbid": 203, "correlation": 248093 } }, { "ph": "f", "id": 248093, "pid": 76337, "tid": -914061504, "ts": 1716454225375349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225375350, "dur": 0, "args": { "External id": 248094, "cbid": 205, "correlation": 248094 } }, { "ph": "f", "id": 248094, "pid": 76337, "tid": -914061504, "ts": 1716454225375350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225424539, "dur": 22, "args": { "External id": 248098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248098, "pid": 5, "tid": 7, "ts": 1716454225424539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375366, "dur": 12, "args": { "External id": 248098, "cbid": 211, "correlation": 248098 } }, { "ph": "s", "id": 248098, "pid": 76337, "tid": -914061504, "ts": 1716454225375366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225424562, "dur": 44, "args": { "External id": 248100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248100, "pid": 5, "tid": 7, "ts": 1716454225424562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375380, "dur": 5, "args": { "External id": 248100, "cbid": 211, "correlation": 248100 } }, { "ph": "s", "id": 248100, "pid": 76337, "tid": -914061504, "ts": 1716454225375380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225424608, "dur": 655, "args": { "External id": 248102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248102, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248102, "pid": 5, "tid": 7, "ts": 1716454225424608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375392, "dur": 6, "args": { "External id": 248102, "cbid": 211, "correlation": 248102 } }, { "ph": "s", "id": 248102, "pid": 76337, "tid": -914061504, "ts": 1716454225375392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225425264, "dur": 21, "args": { "External id": 248104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248104, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248104, "pid": 5, "tid": 7, "ts": 1716454225425264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375401, "dur": 5, "args": { "External id": 248104, "cbid": 211, "correlation": 248104 } }, { "ph": "s", "id": 248104, "pid": 76337, "tid": -914061504, "ts": 1716454225375401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225425286, "dur": 33, "args": { "External id": 248110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248110, "pid": 5, "tid": 7, "ts": 1716454225425286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375428, "dur": 9, "args": { "External id": 248110, "cbid": 211, "correlation": 248110 } }, { "ph": "s", "id": 248110, "pid": 76337, "tid": -914061504, "ts": 1716454225375428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225425321, "dur": 27, "args": { "External id": 248118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248118, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248118, "pid": 5, "tid": 7, "ts": 1716454225425321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375461, "dur": 8, "args": { "External id": 248118, "cbid": 211, "correlation": 248118 } }, { "ph": "s", "id": 248118, "pid": 76337, "tid": -914061504, "ts": 1716454225375461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225425349, "dur": 19, "args": { "External id": 248126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248126, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248126, "pid": 5, "tid": 7, "ts": 1716454225425349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375492, "dur": 8, "args": { "External id": 248126, "cbid": 211, "correlation": 248126 } }, { "ph": "s", "id": 248126, "pid": 76337, "tid": -914061504, "ts": 1716454225375492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225425369, "dur": 31, "args": { "External id": 248146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248146, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 248146, "pid": 5, "tid": 7, "ts": 1716454225425369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375570, "dur": 12, "args": { "External id": 248146, "cbid": 211, "correlation": 248146 } }, { "ph": "s", "id": 248146, "pid": 76337, "tid": -914061504, "ts": 1716454225375570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225425401, "dur": 4, "args": { "External id": 248158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248158, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 248158, "pid": 5, "tid": 7, "ts": 1716454225425401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375593, "dur": 6, "args": { "External id": 248158, "cbid": 211, "correlation": 248158 } }, { "ph": "s", "id": 248158, "pid": 76337, "tid": -914061504, "ts": 1716454225375593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225425407, "dur": 30, "args": { "External id": 248161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248161, "pid": 5, "tid": 7, "ts": 1716454225425407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375611, "dur": 6, "args": { "External id": 248161, "cbid": 211, "correlation": 248161 } }, { "ph": "s", "id": 248161, "pid": 76337, "tid": -914061504, "ts": 1716454225375611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225375668, "dur": 0, "args": { "External id": 248172, "cbid": 317, "correlation": 248172 } }, { "ph": "f", "id": 248172, "pid": 76337, "tid": -914061504, "ts": 1716454225375668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225375669, "dur": 0, "args": { "External id": 248173, "cbid": 203, "correlation": 248173 } }, { "ph": "f", "id": 248173, "pid": 76337, "tid": -914061504, "ts": 1716454225375669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225375670, "dur": 0, "args": { "External id": 248174, "cbid": 205, "correlation": 248174 } }, { "ph": "f", "id": 248174, "pid": 76337, "tid": -914061504, "ts": 1716454225375670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225425439, "dur": 22, "args": { "External id": 248178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248178, "pid": 5, "tid": 7, "ts": 1716454225425439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375683, "dur": 12, "args": { "External id": 248178, "cbid": 211, "correlation": 248178 } }, { "ph": "s", "id": 248178, "pid": 76337, "tid": -914061504, "ts": 1716454225375683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225425462, "dur": 118, "args": { "External id": 248180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248180, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248180, "pid": 5, "tid": 7, "ts": 1716454225425462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375701, "dur": 6, "args": { "External id": 248180, "cbid": 211, "correlation": 248180 } }, { "ph": "s", "id": 248180, "pid": 76337, "tid": -914061504, "ts": 1716454225375701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225425581, "dur": 21, "args": { "External id": 248182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248182, "pid": 5, "tid": 7, "ts": 1716454225425581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375711, "dur": 5, "args": { "External id": 248182, "cbid": 211, "correlation": 248182 } }, { "ph": "s", "id": 248182, "pid": 76337, "tid": -914061504, "ts": 1716454225375711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225425604, "dur": 33, "args": { "External id": 248188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248188, "pid": 5, "tid": 7, "ts": 1716454225425604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375742, "dur": 9, "args": { "External id": 248188, "cbid": 211, "correlation": 248188 } }, { "ph": "s", "id": 248188, "pid": 76337, "tid": -914061504, "ts": 1716454225375742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225425638, "dur": 189, "args": { "External id": 248197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248197, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248197, "pid": 5, "tid": 7, "ts": 1716454225425638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375825, "dur": 14, "args": { "External id": 248197, "cbid": 211, "correlation": 248197 } }, { "ph": "s", "id": 248197, "pid": 76337, "tid": -914061504, "ts": 1716454225375825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225425828, "dur": 67, "args": { "External id": 248219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248219, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248219, "pid": 5, "tid": 7, "ts": 1716454225425828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375882, "dur": 10, "args": { "External id": 248219, "cbid": 211, "correlation": 248219 } }, { "ph": "s", "id": 248219, "pid": 76337, "tid": -914061504, "ts": 1716454225375882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225375969, "dur": 1, "args": { "External id": 248230, "cbid": 251, "correlation": 248230 } }, { "ph": "f", "id": 248230, "pid": 76337, "tid": -914061504, "ts": 1716454225375969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225425896, "dur": 157, "args": { "External id": 248231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248231, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248231, "pid": 5, "tid": 7, "ts": 1716454225425896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225375983, "dur": 14, "args": { "External id": 248231, "cbid": 211, "correlation": 248231 } }, { "ph": "s", "id": 248231, "pid": 76337, "tid": -914061504, "ts": 1716454225375983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376055, "dur": 1, "args": { "External id": 248242, "cbid": 251, "correlation": 248242 } }, { "ph": "f", "id": 248242, "pid": 76337, "tid": -914061504, "ts": 1716454225376055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225426054, "dur": 150, "args": { "External id": 248243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248243, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248243, "pid": 5, "tid": 7, "ts": 1716454225426054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376060, "dur": 11, "args": { "External id": 248243, "cbid": 211, "correlation": 248243 } }, { "ph": "s", "id": 248243, "pid": 76337, "tid": -914061504, "ts": 1716454225376060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376129, "dur": 1, "args": { "External id": 248254, "cbid": 251, "correlation": 248254 } }, { "ph": "f", "id": 248254, "pid": 76337, "tid": -914061504, "ts": 1716454225376129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225426205, "dur": 149, "args": { "External id": 248255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248255, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248255, "pid": 5, "tid": 7, "ts": 1716454225426205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376133, "dur": 12, "args": { "External id": 248255, "cbid": 211, "correlation": 248255 } }, { "ph": "s", "id": 248255, "pid": 76337, "tid": -914061504, "ts": 1716454225376133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225426356, "dur": 1989, "args": { "External id": 248276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248276, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 248276, "pid": 5, "tid": 7, "ts": 1716454225426356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376215, "dur": 13, "args": { "External id": 248276, "cbid": 211, "correlation": 248276 } }, { "ph": "s", "id": 248276, "pid": 76337, "tid": -914061504, "ts": 1716454225376215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376314, "dur": 1, "args": { "External id": 248294, "cbid": 251, "correlation": 248294 } }, { "ph": "f", "id": 248294, "pid": 76337, "tid": -914061504, "ts": 1716454225376314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225428346, "dur": 150, "args": { "External id": 248296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248296, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 248296, "pid": 5, "tid": 7, "ts": 1716454225428346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376319, "dur": 13, "args": { "External id": 248296, "cbid": 211, "correlation": 248296 } }, { "ph": "s", "id": 248296, "pid": 76337, "tid": -914061504, "ts": 1716454225376319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225428498, "dur": 35, "args": { "External id": 248304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248304, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248304, "pid": 5, "tid": 7, "ts": 1716454225428498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376389, "dur": 13, "args": { "External id": 248304, "cbid": 211, "correlation": 248304 } }, { "ph": "s", "id": 248304, "pid": 76337, "tid": -914061504, "ts": 1716454225376389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225428534, "dur": 50, "args": { "External id": 248312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248312, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248312, "pid": 5, "tid": 7, "ts": 1716454225428534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376428, "dur": 8, "args": { "External id": 248312, "cbid": 211, "correlation": 248312 } }, { "ph": "s", "id": 248312, "pid": 76337, "tid": -914061504, "ts": 1716454225376428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225428586, "dur": 31, "args": { "External id": 248323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248323, "pid": 5, "tid": 7, "ts": 1716454225428586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376505, "dur": 13, "args": { "External id": 248323, "cbid": 211, "correlation": 248323 } }, { "ph": "s", "id": 248323, "pid": 76337, "tid": -914061504, "ts": 1716454225376505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225428619, "dur": 36, "args": { "External id": 248345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248345, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248345, "pid": 5, "tid": 7, "ts": 1716454225428619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376538, "dur": 9, "args": { "External id": 248345, "cbid": 211, "correlation": 248345 } }, { "ph": "s", "id": 248345, "pid": 76337, "tid": -914061504, "ts": 1716454225376538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376626, "dur": 1, "args": { "External id": 248356, "cbid": 251, "correlation": 248356 } }, { "ph": "f", "id": 248356, "pid": 76337, "tid": -914061504, "ts": 1716454225376626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225428656, "dur": 92, "args": { "External id": 248357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248357, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248357, "pid": 5, "tid": 7, "ts": 1716454225428656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376631, "dur": 14, "args": { "External id": 248357, "cbid": 211, "correlation": 248357 } }, { "ph": "s", "id": 248357, "pid": 76337, "tid": -914061504, "ts": 1716454225376631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376704, "dur": 1, "args": { "External id": 248368, "cbid": 251, "correlation": 248368 } }, { "ph": "f", "id": 248368, "pid": 76337, "tid": -914061504, "ts": 1716454225376704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376708, "dur": 0, "args": { "External id": 248369, "cbid": 251, "correlation": 248369 } }, { "ph": "f", "id": 248369, "pid": 76337, "tid": -914061504, "ts": 1716454225376708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225428749, "dur": 11, "args": { "External id": 248370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248370, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 248370, "pid": 5, "tid": 7, "ts": 1716454225428749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376709, "dur": 12, "args": { "External id": 248370, "cbid": 211, "correlation": 248370 } }, { "ph": "s", "id": 248370, "pid": 76337, "tid": -914061504, "ts": 1716454225376709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225428762, "dur": 5, "args": { "External id": 248372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248372, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 248372, "pid": 5, "tid": 7, "ts": 1716454225428762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376723, "dur": 6, "args": { "External id": 248372, "cbid": 211, "correlation": 248372 } }, { "ph": "s", "id": 248372, "pid": 76337, "tid": -914061504, "ts": 1716454225376723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376781, "dur": 1, "args": { "External id": 248383, "cbid": 251, "correlation": 248383 } }, { "ph": "f", "id": 248383, "pid": 76337, "tid": -914061504, "ts": 1716454225376781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376785, "dur": 0, "args": { "External id": 248384, "cbid": 251, "correlation": 248384 } }, { "ph": "f", "id": 248384, "pid": 76337, "tid": -914061504, "ts": 1716454225376785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225428768, "dur": 7, "args": { "External id": 248385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248385, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 248385, "pid": 5, "tid": 7, "ts": 1716454225428768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376786, "dur": 12, "args": { "External id": 248385, "cbid": 211, "correlation": 248385 } }, { "ph": "s", "id": 248385, "pid": 76337, "tid": -914061504, "ts": 1716454225376786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225428776, "dur": 3, "args": { "External id": 248387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248387, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 248387, "pid": 5, "tid": 7, "ts": 1716454225428776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376800, "dur": 6, "args": { "External id": 248387, "cbid": 211, "correlation": 248387 } }, { "ph": "s", "id": 248387, "pid": 76337, "tid": -914061504, "ts": 1716454225376800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225428781, "dur": 93, "args": { "External id": 248408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248408, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 248408, "pid": 5, "tid": 7, "ts": 1716454225428781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376874, "dur": 12, "args": { "External id": 248408, "cbid": 211, "correlation": 248408 } }, { "ph": "s", "id": 248408, "pid": 76337, "tid": -914061504, "ts": 1716454225376874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225376970, "dur": 1, "args": { "External id": 248426, "cbid": 251, "correlation": 248426 } }, { "ph": "f", "id": 248426, "pid": 76337, "tid": -914061504, "ts": 1716454225376970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225428875, "dur": 100, "args": { "External id": 248428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248428, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248428, "pid": 5, "tid": 7, "ts": 1716454225428875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225376983, "dur": 14, "args": { "External id": 248428, "cbid": 211, "correlation": 248428 } }, { "ph": "s", "id": 248428, "pid": 76337, "tid": -914061504, "ts": 1716454225376983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225428976, "dur": 19, "args": { "External id": 248436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248436, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248436, "pid": 5, "tid": 7, "ts": 1716454225428976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377054, "dur": 12, "args": { "External id": 248436, "cbid": 211, "correlation": 248436 } }, { "ph": "s", "id": 248436, "pid": 76337, "tid": -914061504, "ts": 1716454225377054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225428997, "dur": 38, "args": { "External id": 248444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248444, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248444, "pid": 5, "tid": 7, "ts": 1716454225428997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377096, "dur": 9, "args": { "External id": 248444, "cbid": 211, "correlation": 248444 } }, { "ph": "s", "id": 248444, "pid": 76337, "tid": -914061504, "ts": 1716454225377096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225429036, "dur": 35, "args": { "External id": 248466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248466, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248466, "pid": 5, "tid": 7, "ts": 1716454225429036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377149, "dur": 11, "args": { "External id": 248466, "cbid": 211, "correlation": 248466 } }, { "ph": "s", "id": 248466, "pid": 76337, "tid": -914061504, "ts": 1716454225377149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225377240, "dur": 1, "args": { "External id": 248482, "cbid": 251, "correlation": 248482 } }, { "ph": "f", "id": 248482, "pid": 76337, "tid": -914061504, "ts": 1716454225377240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225377246, "dur": 0, "args": { "External id": 248484, "cbid": 251, "correlation": 248484 } }, { "ph": "f", "id": 248484, "pid": 76337, "tid": -914061504, "ts": 1716454225377246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225429072, "dur": 549, "args": { "External id": 248485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248485, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 248485, "pid": 5, "tid": 7, "ts": 1716454225429072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377250, "dur": 13, "args": { "External id": 248485, "cbid": 211, "correlation": 248485 } }, { "ph": "s", "id": 248485, "pid": 76337, "tid": -914061504, "ts": 1716454225377250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225429622, "dur": 128, "args": { "External id": 248493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248493, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248493, "pid": 5, "tid": 7, "ts": 1716454225429622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377315, "dur": 13, "args": { "External id": 248493, "cbid": 211, "correlation": 248493 } }, { "ph": "s", "id": 248493, "pid": 76337, "tid": -914061504, "ts": 1716454225377315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225429752, "dur": 128, "args": { "External id": 248501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248501, "pid": 5, "tid": 7, "ts": 1716454225429752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377346, "dur": 8, "args": { "External id": 248501, "cbid": 211, "correlation": 248501 } }, { "ph": "s", "id": 248501, "pid": 76337, "tid": -914061504, "ts": 1716454225377346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225377422, "dur": 1, "args": { "External id": 248517, "cbid": 251, "correlation": 248517 } }, { "ph": "f", "id": 248517, "pid": 76337, "tid": -914061504, "ts": 1716454225377422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225429881, "dur": 311, "args": { "External id": 248519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248519, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248519, "pid": 5, "tid": 7, "ts": 1716454225429881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377427, "dur": 12, "args": { "External id": 248519, "cbid": 211, "correlation": 248519 } }, { "ph": "s", "id": 248519, "pid": 76337, "tid": -914061504, "ts": 1716454225377427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225430194, "dur": 27, "args": { "External id": 248527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248527, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248527, "pid": 5, "tid": 7, "ts": 1716454225430194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377469, "dur": 9, "args": { "External id": 248527, "cbid": 211, "correlation": 248527 } }, { "ph": "s", "id": 248527, "pid": 76337, "tid": -914061504, "ts": 1716454225377469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225430222, "dur": 83, "args": { "External id": 248538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248538, "pid": 5, "tid": 7, "ts": 1716454225430222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377536, "dur": 13, "args": { "External id": 248538, "cbid": 211, "correlation": 248538 } }, { "ph": "s", "id": 248538, "pid": 76337, "tid": -914061504, "ts": 1716454225377536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225377599, "dur": 0, "args": { "External id": 248550, "cbid": 317, "correlation": 248550 } }, { "ph": "f", "id": 248550, "pid": 76337, "tid": -914061504, "ts": 1716454225377599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225377600, "dur": 0, "args": { "External id": 248551, "cbid": 203, "correlation": 248551 } }, { "ph": "f", "id": 248551, "pid": 76337, "tid": -914061504, "ts": 1716454225377600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225377601, "dur": 0, "args": { "External id": 248552, "cbid": 205, "correlation": 248552 } }, { "ph": "f", "id": 248552, "pid": 76337, "tid": -914061504, "ts": 1716454225377601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430306, "dur": 23, "args": { "External id": 248556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248556, "pid": 5, "tid": 7, "ts": 1716454225430306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377615, "dur": 12, "args": { "External id": 248556, "cbid": 211, "correlation": 248556 } }, { "ph": "s", "id": 248556, "pid": 76337, "tid": -914061504, "ts": 1716454225377615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225430330, "dur": 124, "args": { "External id": 248558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248558, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248558, "pid": 5, "tid": 7, "ts": 1716454225430330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377633, "dur": 6, "args": { "External id": 248558, "cbid": 211, "correlation": 248558 } }, { "ph": "s", "id": 248558, "pid": 76337, "tid": -914061504, "ts": 1716454225377633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430455, "dur": 22, "args": { "External id": 248560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248560, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248560, "pid": 5, "tid": 7, "ts": 1716454225430455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377643, "dur": 6, "args": { "External id": 248560, "cbid": 211, "correlation": 248560 } }, { "ph": "s", "id": 248560, "pid": 76337, "tid": -914061504, "ts": 1716454225377643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225430478, "dur": 32, "args": { "External id": 248566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248566, "pid": 5, "tid": 7, "ts": 1716454225430478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377671, "dur": 9, "args": { "External id": 248566, "cbid": 211, "correlation": 248566 } }, { "ph": "s", "id": 248566, "pid": 76337, "tid": -914061504, "ts": 1716454225377671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225430512, "dur": 27, "args": { "External id": 248574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248574, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248574, "pid": 5, "tid": 7, "ts": 1716454225430512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377704, "dur": 8, "args": { "External id": 248574, "cbid": 211, "correlation": 248574 } }, { "ph": "s", "id": 248574, "pid": 76337, "tid": -914061504, "ts": 1716454225377704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225377775, "dur": 0, "args": { "External id": 248584, "cbid": 317, "correlation": 248584 } }, { "ph": "f", "id": 248584, "pid": 76337, "tid": -914061504, "ts": 1716454225377775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225377776, "dur": 0, "args": { "External id": 248585, "cbid": 203, "correlation": 248585 } }, { "ph": "f", "id": 248585, "pid": 76337, "tid": -914061504, "ts": 1716454225377776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225377777, "dur": 0, "args": { "External id": 248586, "cbid": 205, "correlation": 248586 } }, { "ph": "f", "id": 248586, "pid": 76337, "tid": -914061504, "ts": 1716454225377777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430541, "dur": 22, "args": { "External id": 248590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248590, "pid": 5, "tid": 7, "ts": 1716454225430541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377792, "dur": 13, "args": { "External id": 248590, "cbid": 211, "correlation": 248590 } }, { "ph": "s", "id": 248590, "pid": 76337, "tid": -914061504, "ts": 1716454225377792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430564, "dur": 45, "args": { "External id": 248592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248592, "pid": 5, "tid": 7, "ts": 1716454225430564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377807, "dur": 5, "args": { "External id": 248592, "cbid": 211, "correlation": 248592 } }, { "ph": "s", "id": 248592, "pid": 76337, "tid": -914061504, "ts": 1716454225377807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225430610, "dur": 236, "args": { "External id": 248594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248594, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 248594, "pid": 5, "tid": 7, "ts": 1716454225430610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377819, "dur": 6, "args": { "External id": 248594, "cbid": 211, "correlation": 248594 } }, { "ph": "s", "id": 248594, "pid": 76337, "tid": -914061504, "ts": 1716454225377819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430848, "dur": 7, "args": { "External id": 248596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248596, "pid": 5, "tid": 7, "ts": 1716454225430848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377829, "dur": 5, "args": { "External id": 248596, "cbid": 211, "correlation": 248596 } }, { "ph": "s", "id": 248596, "pid": 76337, "tid": -914061504, "ts": 1716454225377829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225430856, "dur": 9, "args": { "External id": 248602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248602, "pid": 5, "tid": 7, "ts": 1716454225430856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377856, "dur": 8, "args": { "External id": 248602, "cbid": 211, "correlation": 248602 } }, { "ph": "s", "id": 248602, "pid": 76337, "tid": -914061504, "ts": 1716454225377856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225430867, "dur": 12, "args": { "External id": 248622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248622, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 248622, "pid": 5, "tid": 7, "ts": 1716454225430867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377948, "dur": 12, "args": { "External id": 248622, "cbid": 211, "correlation": 248622 } }, { "ph": "s", "id": 248622, "pid": 76337, "tid": -914061504, "ts": 1716454225377948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225430880, "dur": 5, "args": { "External id": 248634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248634, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 248634, "pid": 5, "tid": 7, "ts": 1716454225430880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377970, "dur": 15, "args": { "External id": 248634, "cbid": 211, "correlation": 248634 } }, { "ph": "s", "id": 248634, "pid": 76337, "tid": -914061504, "ts": 1716454225377970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225430887, "dur": 12, "args": { "External id": 248637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248637, "pid": 5, "tid": 7, "ts": 1716454225430887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225377998, "dur": 7, "args": { "External id": 248637, "cbid": 211, "correlation": 248637 } }, { "ph": "s", "id": 248637, "pid": 76337, "tid": -914061504, "ts": 1716454225377998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225430900, "dur": 7, "args": { "External id": 248646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248646, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248646, "pid": 5, "tid": 7, "ts": 1716454225430900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378038, "dur": 10, "args": { "External id": 248646, "cbid": 211, "correlation": 248646 } }, { "ph": "s", "id": 248646, "pid": 76337, "tid": -914061504, "ts": 1716454225378038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225378090, "dur": 0, "args": { "External id": 248656, "cbid": 317, "correlation": 248656 } }, { "ph": "f", "id": 248656, "pid": 76337, "tid": -914061504, "ts": 1716454225378090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225378091, "dur": 0, "args": { "External id": 248657, "cbid": 203, "correlation": 248657 } }, { "ph": "f", "id": 248657, "pid": 76337, "tid": -914061504, "ts": 1716454225378091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225378091, "dur": 0, "args": { "External id": 248658, "cbid": 205, "correlation": 248658 } }, { "ph": "f", "id": 248658, "pid": 76337, "tid": -914061504, "ts": 1716454225378091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430909, "dur": 6, "args": { "External id": 248662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248662, "pid": 5, "tid": 7, "ts": 1716454225430909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378108, "dur": 11, "args": { "External id": 248662, "cbid": 211, "correlation": 248662 } }, { "ph": "s", "id": 248662, "pid": 76337, "tid": -914061504, "ts": 1716454225378108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225430916, "dur": 85, "args": { "External id": 248664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248664, "pid": 5, "tid": 7, "ts": 1716454225430916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378122, "dur": 5, "args": { "External id": 248664, "cbid": 211, "correlation": 248664 } }, { "ph": "s", "id": 248664, "pid": 76337, "tid": -914061504, "ts": 1716454225378122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225431003, "dur": 1, "args": { "External id": 248666, "device": 5, "context": 1, "stream": 7, "correlation": 248666, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 248666, "pid": 5, "tid": 7, "ts": 1716454225431003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225378135, "dur": 9, "args": { "External id": 248666, "cbid": 51, "correlation": 248666 } }, { "ph": "s", "id": 248666, "pid": 76337, "tid": -914061504, "ts": 1716454225378135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225431007, "dur": 544, "args": { "External id": 248667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248667, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248667, "pid": 5, "tid": 7, "ts": 1716454225431007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378145, "dur": 8, "args": { "External id": 248667, "cbid": 211, "correlation": 248667 } }, { "ph": "s", "id": 248667, "pid": 76337, "tid": -914061504, "ts": 1716454225378145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225431553, "dur": 12, "args": { "External id": 248669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248669, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248669, "pid": 5, "tid": 7, "ts": 1716454225431553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378158, "dur": 5, "args": { "External id": 248669, "cbid": 211, "correlation": 248669 } }, { "ph": "s", "id": 248669, "pid": 76337, "tid": -914061504, "ts": 1716454225378158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225431566, "dur": 15, "args": { "External id": 248675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248675, "pid": 5, "tid": 7, "ts": 1716454225431566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378185, "dur": 9, "args": { "External id": 248675, "cbid": 211, "correlation": 248675 } }, { "ph": "s", "id": 248675, "pid": 76337, "tid": -914061504, "ts": 1716454225378185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225431582, "dur": 3, "args": { "External id": 248683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248683, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 248683, "pid": 5, "tid": 7, "ts": 1716454225431582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378229, "dur": 9, "args": { "External id": 248683, "cbid": 211, "correlation": 248683 } }, { "ph": "s", "id": 248683, "pid": 76337, "tid": -914061504, "ts": 1716454225378229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225378296, "dur": 1, "args": { "External id": 248699, "cbid": 251, "correlation": 248699 } }, { "ph": "f", "id": 248699, "pid": 76337, "tid": -914061504, "ts": 1716454225378296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225378302, "dur": 0, "args": { "External id": 248701, "cbid": 251, "correlation": 248701 } }, { "ph": "f", "id": 248701, "pid": 76337, "tid": -914061504, "ts": 1716454225378302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225431587, "dur": 13, "args": { "External id": 248702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248702, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248702, "pid": 5, "tid": 7, "ts": 1716454225431587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378303, "dur": 11, "args": { "External id": 248702, "cbid": 211, "correlation": 248702 } }, { "ph": "s", "id": 248702, "pid": 76337, "tid": -914061504, "ts": 1716454225378303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225431601, "dur": 5, "args": { "External id": 248704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248704, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248704, "pid": 5, "tid": 7, "ts": 1716454225431601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378316, "dur": 6, "args": { "External id": 248704, "cbid": 211, "correlation": 248704 } }, { "ph": "s", "id": 248704, "pid": 76337, "tid": -914061504, "ts": 1716454225378316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225431608, "dur": 17, "args": { "External id": 248714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248714, "pid": 5, "tid": 7, "ts": 1716454225431608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378374, "dur": 12, "args": { "External id": 248714, "cbid": 211, "correlation": 248714 } }, { "ph": "s", "id": 248714, "pid": 76337, "tid": -914061504, "ts": 1716454225378374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225431626, "dur": 18, "args": { "External id": 248734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248734, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 248734, "pid": 5, "tid": 7, "ts": 1716454225431626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378440, "dur": 11, "args": { "External id": 248734, "cbid": 211, "correlation": 248734 } }, { "ph": "s", "id": 248734, "pid": 76337, "tid": -914061504, "ts": 1716454225378440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225431646, "dur": 5, "args": { "External id": 248746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248746, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 248746, "pid": 5, "tid": 7, "ts": 1716454225431646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378464, "dur": 7, "args": { "External id": 248746, "cbid": 211, "correlation": 248746 } }, { "ph": "s", "id": 248746, "pid": 76337, "tid": -914061504, "ts": 1716454225378464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225431652, "dur": 17, "args": { "External id": 248749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248749, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248749, "pid": 5, "tid": 7, "ts": 1716454225431652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378484, "dur": 6, "args": { "External id": 248749, "cbid": 211, "correlation": 248749 } }, { "ph": "s", "id": 248749, "pid": 76337, "tid": -914061504, "ts": 1716454225378484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225431670, "dur": 11, "args": { "External id": 248758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248758, "pid": 5, "tid": 7, "ts": 1716454225431670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378524, "dur": 10, "args": { "External id": 248758, "cbid": 211, "correlation": 248758 } }, { "ph": "s", "id": 248758, "pid": 76337, "tid": -914061504, "ts": 1716454225378524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225378587, "dur": 0, "args": { "External id": 248768, "cbid": 317, "correlation": 248768 } }, { "ph": "f", "id": 248768, "pid": 76337, "tid": -914061504, "ts": 1716454225378587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225378588, "dur": 0, "args": { "External id": 248769, "cbid": 203, "correlation": 248769 } }, { "ph": "f", "id": 248769, "pid": 76337, "tid": -914061504, "ts": 1716454225378588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225378589, "dur": 0, "args": { "External id": 248770, "cbid": 205, "correlation": 248770 } }, { "ph": "f", "id": 248770, "pid": 76337, "tid": -914061504, "ts": 1716454225378589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225431683, "dur": 11, "args": { "External id": 248774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248774, "pid": 5, "tid": 7, "ts": 1716454225431683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378607, "dur": 12, "args": { "External id": 248774, "cbid": 211, "correlation": 248774 } }, { "ph": "s", "id": 248774, "pid": 76337, "tid": -914061504, "ts": 1716454225378607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225431695, "dur": 166, "args": { "External id": 248776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248776, "pid": 5, "tid": 7, "ts": 1716454225431695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378621, "dur": 5, "args": { "External id": 248776, "cbid": 211, "correlation": 248776 } }, { "ph": "s", "id": 248776, "pid": 76337, "tid": -914061504, "ts": 1716454225378621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225431864, "dur": 1, "args": { "External id": 248778, "device": 5, "context": 1, "stream": 7, "correlation": 248778, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 248778, "pid": 5, "tid": 7, "ts": 1716454225431864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225378633, "dur": 6, "args": { "External id": 248778, "cbid": 51, "correlation": 248778 } }, { "ph": "s", "id": 248778, "pid": 76337, "tid": -914061504, "ts": 1716454225378633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225431867, "dur": 675, "args": { "External id": 248779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248779, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248779, "pid": 5, "tid": 7, "ts": 1716454225431867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378641, "dur": 7, "args": { "External id": 248779, "cbid": 211, "correlation": 248779 } }, { "ph": "s", "id": 248779, "pid": 76337, "tid": -914061504, "ts": 1716454225378641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225432544, "dur": 12, "args": { "External id": 248781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248781, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248781, "pid": 5, "tid": 7, "ts": 1716454225432544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378652, "dur": 5, "args": { "External id": 248781, "cbid": 211, "correlation": 248781 } }, { "ph": "s", "id": 248781, "pid": 76337, "tid": -914061504, "ts": 1716454225378652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225432558, "dur": 15, "args": { "External id": 248787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248787, "pid": 5, "tid": 7, "ts": 1716454225432558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378681, "dur": 9, "args": { "External id": 248787, "cbid": 211, "correlation": 248787 } }, { "ph": "s", "id": 248787, "pid": 76337, "tid": -914061504, "ts": 1716454225378681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225378738, "dur": 0, "args": { "External id": 248797, "cbid": 317, "correlation": 248797 } }, { "ph": "f", "id": 248797, "pid": 76337, "tid": -914061504, "ts": 1716454225378738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225378739, "dur": 0, "args": { "External id": 248798, "cbid": 203, "correlation": 248798 } }, { "ph": "f", "id": 248798, "pid": 76337, "tid": -914061504, "ts": 1716454225378739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225378740, "dur": 0, "args": { "External id": 248799, "cbid": 205, "correlation": 248799 } }, { "ph": "f", "id": 248799, "pid": 76337, "tid": -914061504, "ts": 1716454225378740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225432574, "dur": 9, "args": { "External id": 248803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248803, "pid": 5, "tid": 7, "ts": 1716454225432574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378755, "dur": 11, "args": { "External id": 248803, "cbid": 211, "correlation": 248803 } }, { "ph": "s", "id": 248803, "pid": 76337, "tid": -914061504, "ts": 1716454225378755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225432584, "dur": 3, "args": { "External id": 248805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 248805, "pid": 5, "tid": 7, "ts": 1716454225432584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378772, "dur": 6, "args": { "External id": 248805, "cbid": 211, "correlation": 248805 } }, { "ph": "s", "id": 248805, "pid": 76337, "tid": -914061504, "ts": 1716454225378772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225378781, "dur": 0, "args": { "External id": 248806, "cbid": 51, "correlation": 248806 } }, { "ph": "s", "id": 248806, "pid": 76337, "tid": -914061504, "ts": 1716454225378781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225432589, "dur": 59, "args": { "External id": 248807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248807, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 248807, "pid": 5, "tid": 7, "ts": 1716454225432589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378782, "dur": 5, "args": { "External id": 248807, "cbid": 211, "correlation": 248807 } }, { "ph": "s", "id": 248807, "pid": 76337, "tid": -914061504, "ts": 1716454225378782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225432649, "dur": 14, "args": { "External id": 248812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248812, "pid": 5, "tid": 7, "ts": 1716454225432649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378808, "dur": 8, "args": { "External id": 248812, "cbid": 211, "correlation": 248812 } }, { "ph": "s", "id": 248812, "pid": 76337, "tid": -914061504, "ts": 1716454225378808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225432665, "dur": 13, "args": { "External id": 248820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248820, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248820, "pid": 5, "tid": 7, "ts": 1716454225432665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378837, "dur": 8, "args": { "External id": 248820, "cbid": 211, "correlation": 248820 } }, { "ph": "s", "id": 248820, "pid": 76337, "tid": -914061504, "ts": 1716454225378837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225432679, "dur": 10, "args": { "External id": 248828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248828, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248828, "pid": 5, "tid": 7, "ts": 1716454225432679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378866, "dur": 8, "args": { "External id": 248828, "cbid": 211, "correlation": 248828 } }, { "ph": "s", "id": 248828, "pid": 76337, "tid": -914061504, "ts": 1716454225378866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225432691, "dur": 18, "args": { "External id": 248848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248848, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 248848, "pid": 5, "tid": 7, "ts": 1716454225432691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378945, "dur": 13, "args": { "External id": 248848, "cbid": 211, "correlation": 248848 } }, { "ph": "s", "id": 248848, "pid": 76337, "tid": -914061504, "ts": 1716454225378945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225432710, "dur": 5, "args": { "External id": 248860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248860, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 248860, "pid": 5, "tid": 7, "ts": 1716454225432710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378968, "dur": 18, "args": { "External id": 248860, "cbid": 211, "correlation": 248860 } }, { "ph": "s", "id": 248860, "pid": 76337, "tid": -914061504, "ts": 1716454225378968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225432716, "dur": 17, "args": { "External id": 248863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248863, "pid": 5, "tid": 7, "ts": 1716454225432716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225378998, "dur": 8, "args": { "External id": 248863, "cbid": 211, "correlation": 248863 } }, { "ph": "s", "id": 248863, "pid": 76337, "tid": -914061504, "ts": 1716454225378998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225379059, "dur": 0, "args": { "External id": 248874, "cbid": 317, "correlation": 248874 } }, { "ph": "f", "id": 248874, "pid": 76337, "tid": -914061504, "ts": 1716454225379059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225379060, "dur": 0, "args": { "External id": 248875, "cbid": 203, "correlation": 248875 } }, { "ph": "f", "id": 248875, "pid": 76337, "tid": -914061504, "ts": 1716454225379060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225379060, "dur": 0, "args": { "External id": 248876, "cbid": 205, "correlation": 248876 } }, { "ph": "f", "id": 248876, "pid": 76337, "tid": -914061504, "ts": 1716454225379060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225432734, "dur": 11, "args": { "External id": 248880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248880, "pid": 5, "tid": 7, "ts": 1716454225432734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379076, "dur": 12, "args": { "External id": 248880, "cbid": 211, "correlation": 248880 } }, { "ph": "s", "id": 248880, "pid": 76337, "tid": -914061504, "ts": 1716454225379076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225432747, "dur": 3, "args": { "External id": 248882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 248882, "pid": 5, "tid": 7, "ts": 1716454225432747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379093, "dur": 6, "args": { "External id": 248882, "cbid": 211, "correlation": 248882 } }, { "ph": "s", "id": 248882, "pid": 76337, "tid": -914061504, "ts": 1716454225379093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225379103, "dur": 0, "args": { "External id": 248883, "cbid": 51, "correlation": 248883 } }, { "ph": "s", "id": 248883, "pid": 76337, "tid": -914061504, "ts": 1716454225379103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225432752, "dur": 100, "args": { "External id": 248884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248884, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 248884, "pid": 5, "tid": 7, "ts": 1716454225432752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379104, "dur": 6, "args": { "External id": 248884, "cbid": 211, "correlation": 248884 } }, { "ph": "s", "id": 248884, "pid": 76337, "tid": -914061504, "ts": 1716454225379104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225432853, "dur": 16, "args": { "External id": 248889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248889, "pid": 5, "tid": 7, "ts": 1716454225432853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379132, "dur": 9, "args": { "External id": 248889, "cbid": 211, "correlation": 248889 } }, { "ph": "s", "id": 248889, "pid": 76337, "tid": -914061504, "ts": 1716454225379132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225432871, "dur": 84, "args": { "External id": 248898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248898, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248898, "pid": 5, "tid": 7, "ts": 1716454225432871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379215, "dur": 14, "args": { "External id": 248898, "cbid": 211, "correlation": 248898 } }, { "ph": "s", "id": 248898, "pid": 76337, "tid": -914061504, "ts": 1716454225379215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225432956, "dur": 31, "args": { "External id": 248920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248920, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 248920, "pid": 5, "tid": 7, "ts": 1716454225432956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379272, "dur": 11, "args": { "External id": 248920, "cbid": 211, "correlation": 248920 } }, { "ph": "s", "id": 248920, "pid": 76337, "tid": -914061504, "ts": 1716454225379272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225379362, "dur": 1, "args": { "External id": 248931, "cbid": 251, "correlation": 248931 } }, { "ph": "f", "id": 248931, "pid": 76337, "tid": -914061504, "ts": 1716454225379362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225432989, "dur": 145, "args": { "External id": 248932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248932, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248932, "pid": 5, "tid": 7, "ts": 1716454225432989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379368, "dur": 13, "args": { "External id": 248932, "cbid": 211, "correlation": 248932 } }, { "ph": "s", "id": 248932, "pid": 76337, "tid": -914061504, "ts": 1716454225379368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225379438, "dur": 1, "args": { "External id": 248943, "cbid": 251, "correlation": 248943 } }, { "ph": "f", "id": 248943, "pid": 76337, "tid": -914061504, "ts": 1716454225379438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225433135, "dur": 159, "args": { "External id": 248944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248944, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248944, "pid": 5, "tid": 7, "ts": 1716454225433135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379442, "dur": 12, "args": { "External id": 248944, "cbid": 211, "correlation": 248944 } }, { "ph": "s", "id": 248944, "pid": 76337, "tid": -914061504, "ts": 1716454225379442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225379508, "dur": 1, "args": { "External id": 248955, "cbid": 251, "correlation": 248955 } }, { "ph": "f", "id": 248955, "pid": 76337, "tid": -914061504, "ts": 1716454225379508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225433295, "dur": 161, "args": { "External id": 248956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248956, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248956, "pid": 5, "tid": 7, "ts": 1716454225433295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379512, "dur": 11, "args": { "External id": 248956, "cbid": 211, "correlation": 248956 } }, { "ph": "s", "id": 248956, "pid": 76337, "tid": -914061504, "ts": 1716454225379512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225433458, "dur": 343, "args": { "External id": 248981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 248981, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 248981, "pid": 5, "tid": 7, "ts": 1716454225433458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379602, "dur": 13, "args": { "External id": 248981, "cbid": 211, "correlation": 248981 } }, { "ph": "s", "id": 248981, "pid": 76337, "tid": -914061504, "ts": 1716454225379602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225379701, "dur": 1, "args": { "External id": 248999, "cbid": 251, "correlation": 248999 } }, { "ph": "f", "id": 248999, "pid": 76337, "tid": -914061504, "ts": 1716454225379701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225433802, "dur": 169, "args": { "External id": 249001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249001, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249001, "pid": 5, "tid": 7, "ts": 1716454225433802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379707, "dur": 13, "args": { "External id": 249001, "cbid": 211, "correlation": 249001 } }, { "ph": "s", "id": 249001, "pid": 76337, "tid": -914061504, "ts": 1716454225379707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225433973, "dur": 20, "args": { "External id": 249009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249009, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249009, "pid": 5, "tid": 7, "ts": 1716454225433973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379776, "dur": 12, "args": { "External id": 249009, "cbid": 211, "correlation": 249009 } }, { "ph": "s", "id": 249009, "pid": 76337, "tid": -914061504, "ts": 1716454225379776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225433995, "dur": 28, "args": { "External id": 249017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249017, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249017, "pid": 5, "tid": 7, "ts": 1716454225433995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379815, "dur": 8, "args": { "External id": 249017, "cbid": 211, "correlation": 249017 } }, { "ph": "s", "id": 249017, "pid": 76337, "tid": -914061504, "ts": 1716454225379815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225434024, "dur": 18, "args": { "External id": 249028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249028, "pid": 5, "tid": 7, "ts": 1716454225434024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379892, "dur": 13, "args": { "External id": 249028, "cbid": 211, "correlation": 249028 } }, { "ph": "s", "id": 249028, "pid": 76337, "tid": -914061504, "ts": 1716454225379892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225434043, "dur": 17, "args": { "External id": 249050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249050, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249050, "pid": 5, "tid": 7, "ts": 1716454225434043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225379923, "dur": 8, "args": { "External id": 249050, "cbid": 211, "correlation": 249050 } }, { "ph": "s", "id": 249050, "pid": 76337, "tid": -914061504, "ts": 1716454225379923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380017, "dur": 2, "args": { "External id": 249061, "cbid": 251, "correlation": 249061 } }, { "ph": "f", "id": 249061, "pid": 76337, "tid": -914061504, "ts": 1716454225380017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225434061, "dur": 91, "args": { "External id": 249062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249062, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249062, "pid": 5, "tid": 7, "ts": 1716454225434061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380023, "dur": 15, "args": { "External id": 249062, "cbid": 211, "correlation": 249062 } }, { "ph": "s", "id": 249062, "pid": 76337, "tid": -914061504, "ts": 1716454225380023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380095, "dur": 1, "args": { "External id": 249073, "cbid": 251, "correlation": 249073 } }, { "ph": "f", "id": 249073, "pid": 76337, "tid": -914061504, "ts": 1716454225380095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380099, "dur": 0, "args": { "External id": 249074, "cbid": 251, "correlation": 249074 } }, { "ph": "f", "id": 249074, "pid": 76337, "tid": -914061504, "ts": 1716454225380099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225434153, "dur": 12, "args": { "External id": 249075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249075, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249075, "pid": 5, "tid": 7, "ts": 1716454225434153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380101, "dur": 12, "args": { "External id": 249075, "cbid": 211, "correlation": 249075 } }, { "ph": "s", "id": 249075, "pid": 76337, "tid": -914061504, "ts": 1716454225380101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225434166, "dur": 6, "args": { "External id": 249077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249077, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249077, "pid": 5, "tid": 7, "ts": 1716454225434166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380114, "dur": 6, "args": { "External id": 249077, "cbid": 211, "correlation": 249077 } }, { "ph": "s", "id": 249077, "pid": 76337, "tid": -914061504, "ts": 1716454225380114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380172, "dur": 1, "args": { "External id": 249088, "cbid": 251, "correlation": 249088 } }, { "ph": "f", "id": 249088, "pid": 76337, "tid": -914061504, "ts": 1716454225380172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380175, "dur": 0, "args": { "External id": 249089, "cbid": 251, "correlation": 249089 } }, { "ph": "f", "id": 249089, "pid": 76337, "tid": -914061504, "ts": 1716454225380175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225434174, "dur": 9, "args": { "External id": 249090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249090, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249090, "pid": 5, "tid": 7, "ts": 1716454225434174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380176, "dur": 12, "args": { "External id": 249090, "cbid": 211, "correlation": 249090 } }, { "ph": "s", "id": 249090, "pid": 76337, "tid": -914061504, "ts": 1716454225380176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225434184, "dur": 3, "args": { "External id": 249092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249092, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249092, "pid": 5, "tid": 7, "ts": 1716454225434184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380190, "dur": 5, "args": { "External id": 249092, "cbid": 211, "correlation": 249092 } }, { "ph": "s", "id": 249092, "pid": 76337, "tid": -914061504, "ts": 1716454225380190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225434189, "dur": 56, "args": { "External id": 249117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249117, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249117, "pid": 5, "tid": 7, "ts": 1716454225434189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380267, "dur": 13, "args": { "External id": 249117, "cbid": 211, "correlation": 249117 } }, { "ph": "s", "id": 249117, "pid": 76337, "tid": -914061504, "ts": 1716454225380267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380367, "dur": 2, "args": { "External id": 249135, "cbid": 251, "correlation": 249135 } }, { "ph": "f", "id": 249135, "pid": 76337, "tid": -914061504, "ts": 1716454225380367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225434246, "dur": 93, "args": { "External id": 249137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249137, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249137, "pid": 5, "tid": 7, "ts": 1716454225434246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380374, "dur": 15, "args": { "External id": 249137, "cbid": 211, "correlation": 249137 } }, { "ph": "s", "id": 249137, "pid": 76337, "tid": -914061504, "ts": 1716454225380374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225434340, "dur": 10, "args": { "External id": 249145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249145, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249145, "pid": 5, "tid": 7, "ts": 1716454225434340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380444, "dur": 12, "args": { "External id": 249145, "cbid": 211, "correlation": 249145 } }, { "ph": "s", "id": 249145, "pid": 76337, "tid": -914061504, "ts": 1716454225380444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225434351, "dur": 20, "args": { "External id": 249153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249153, "pid": 5, "tid": 7, "ts": 1716454225434351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380487, "dur": 9, "args": { "External id": 249153, "cbid": 211, "correlation": 249153 } }, { "ph": "s", "id": 249153, "pid": 76337, "tid": -914061504, "ts": 1716454225380487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225434373, "dur": 18, "args": { "External id": 249175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249175, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249175, "pid": 5, "tid": 7, "ts": 1716454225434373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380538, "dur": 10, "args": { "External id": 249175, "cbid": 211, "correlation": 249175 } }, { "ph": "s", "id": 249175, "pid": 76337, "tid": -914061504, "ts": 1716454225380538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380628, "dur": 2, "args": { "External id": 249191, "cbid": 251, "correlation": 249191 } }, { "ph": "f", "id": 249191, "pid": 76337, "tid": -914061504, "ts": 1716454225380628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380635, "dur": 0, "args": { "External id": 249193, "cbid": 251, "correlation": 249193 } }, { "ph": "f", "id": 249193, "pid": 76337, "tid": -914061504, "ts": 1716454225380635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225434392, "dur": 498, "args": { "External id": 249194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249194, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249194, "pid": 5, "tid": 7, "ts": 1716454225434392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380637, "dur": 15, "args": { "External id": 249194, "cbid": 211, "correlation": 249194 } }, { "ph": "s", "id": 249194, "pid": 76337, "tid": -914061504, "ts": 1716454225380637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225434892, "dur": 68, "args": { "External id": 249202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249202, "pid": 5, "tid": 7, "ts": 1716454225434892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380705, "dur": 12, "args": { "External id": 249202, "cbid": 211, "correlation": 249202 } }, { "ph": "s", "id": 249202, "pid": 76337, "tid": -914061504, "ts": 1716454225380705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225434961, "dur": 69, "args": { "External id": 249210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249210, "pid": 5, "tid": 7, "ts": 1716454225434961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380735, "dur": 8, "args": { "External id": 249210, "cbid": 211, "correlation": 249210 } }, { "ph": "s", "id": 249210, "pid": 76337, "tid": -914061504, "ts": 1716454225380735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225380815, "dur": 1, "args": { "External id": 249226, "cbid": 251, "correlation": 249226 } }, { "ph": "f", "id": 249226, "pid": 76337, "tid": -914061504, "ts": 1716454225380815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225435032, "dur": 1, "args": { "External id": 249228, "device": 5, "context": 1, "stream": 7, "correlation": 249228, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 249228, "pid": 5, "tid": 7, "ts": 1716454225435032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225380821, "dur": 13, "args": { "External id": 249228, "cbid": 51, "correlation": 249228 } }, { "ph": "s", "id": 249228, "pid": 76337, "tid": -914061504, "ts": 1716454225380821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225435036, "dur": 275, "args": { "External id": 249229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249229, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249229, "pid": 5, "tid": 7, "ts": 1716454225435036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380835, "dur": 11, "args": { "External id": 249229, "cbid": 211, "correlation": 249229 } }, { "ph": "s", "id": 249229, "pid": 76337, "tid": -914061504, "ts": 1716454225380835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225435312, "dur": 13, "args": { "External id": 249237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249237, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249237, "pid": 5, "tid": 7, "ts": 1716454225435312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380876, "dur": 11, "args": { "External id": 249237, "cbid": 211, "correlation": 249237 } }, { "ph": "s", "id": 249237, "pid": 76337, "tid": -914061504, "ts": 1716454225380876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225435327, "dur": 38, "args": { "External id": 249248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249248, "pid": 5, "tid": 7, "ts": 1716454225435327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225380945, "dur": 12, "args": { "External id": 249248, "cbid": 211, "correlation": 249248 } }, { "ph": "s", "id": 249248, "pid": 76337, "tid": -914061504, "ts": 1716454225380945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225381017, "dur": 0, "args": { "External id": 249260, "cbid": 317, "correlation": 249260 } }, { "ph": "f", "id": 249260, "pid": 76337, "tid": -914061504, "ts": 1716454225381017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225381018, "dur": 0, "args": { "External id": 249261, "cbid": 203, "correlation": 249261 } }, { "ph": "f", "id": 249261, "pid": 76337, "tid": -914061504, "ts": 1716454225381018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225381019, "dur": 0, "args": { "External id": 249262, "cbid": 205, "correlation": 249262 } }, { "ph": "f", "id": 249262, "pid": 76337, "tid": -914061504, "ts": 1716454225381019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225435366, "dur": 12, "args": { "External id": 249266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249266, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249266, "pid": 5, "tid": 7, "ts": 1716454225435366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381035, "dur": 12, "args": { "External id": 249266, "cbid": 211, "correlation": 249266 } }, { "ph": "s", "id": 249266, "pid": 76337, "tid": -914061504, "ts": 1716454225381035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225435380, "dur": 4, "args": { "External id": 249268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249268, "pid": 5, "tid": 7, "ts": 1716454225435380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381052, "dur": 6, "args": { "External id": 249268, "cbid": 211, "correlation": 249268 } }, { "ph": "s", "id": 249268, "pid": 76337, "tid": -914061504, "ts": 1716454225381052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225381060, "dur": 0, "args": { "External id": 249269, "cbid": 51, "correlation": 249269 } }, { "ph": "s", "id": 249269, "pid": 76337, "tid": -914061504, "ts": 1716454225381060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225435385, "dur": 100, "args": { "External id": 249270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249270, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 249270, "pid": 5, "tid": 7, "ts": 1716454225435385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381061, "dur": 5, "args": { "External id": 249270, "cbid": 211, "correlation": 249270 } }, { "ph": "s", "id": 249270, "pid": 76337, "tid": -914061504, "ts": 1716454225381061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225435487, "dur": 16, "args": { "External id": 249275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249275, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249275, "pid": 5, "tid": 7, "ts": 1716454225435487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381089, "dur": 9, "args": { "External id": 249275, "cbid": 211, "correlation": 249275 } }, { "ph": "s", "id": 249275, "pid": 76337, "tid": -914061504, "ts": 1716454225381089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225435504, "dur": 11, "args": { "External id": 249283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249283, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249283, "pid": 5, "tid": 7, "ts": 1716454225435504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381121, "dur": 8, "args": { "External id": 249283, "cbid": 211, "correlation": 249283 } }, { "ph": "s", "id": 249283, "pid": 76337, "tid": -914061504, "ts": 1716454225381121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225435517, "dur": 18, "args": { "External id": 249303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249303, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 249303, "pid": 5, "tid": 7, "ts": 1716454225435517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381192, "dur": 11, "args": { "External id": 249303, "cbid": 211, "correlation": 249303 } }, { "ph": "s", "id": 249303, "pid": 76337, "tid": -914061504, "ts": 1716454225381192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225435536, "dur": 5, "args": { "External id": 249315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249315, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 249315, "pid": 5, "tid": 7, "ts": 1716454225435536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381214, "dur": 6, "args": { "External id": 249315, "cbid": 211, "correlation": 249315 } }, { "ph": "s", "id": 249315, "pid": 76337, "tid": -914061504, "ts": 1716454225381214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225435543, "dur": 18, "args": { "External id": 249318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249318, "pid": 5, "tid": 7, "ts": 1716454225435543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381232, "dur": 7, "args": { "External id": 249318, "cbid": 211, "correlation": 249318 } }, { "ph": "s", "id": 249318, "pid": 76337, "tid": -914061504, "ts": 1716454225381232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225435562, "dur": 12, "args": { "External id": 249327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249327, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249327, "pid": 5, "tid": 7, "ts": 1716454225435562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381270, "dur": 10, "args": { "External id": 249327, "cbid": 211, "correlation": 249327 } }, { "ph": "s", "id": 249327, "pid": 76337, "tid": -914061504, "ts": 1716454225381270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225381321, "dur": 0, "args": { "External id": 249337, "cbid": 317, "correlation": 249337 } }, { "ph": "f", "id": 249337, "pid": 76337, "tid": -914061504, "ts": 1716454225381321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225381322, "dur": 0, "args": { "External id": 249338, "cbid": 203, "correlation": 249338 } }, { "ph": "f", "id": 249338, "pid": 76337, "tid": -914061504, "ts": 1716454225381322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225381323, "dur": 0, "args": { "External id": 249339, "cbid": 205, "correlation": 249339 } }, { "ph": "f", "id": 249339, "pid": 76337, "tid": -914061504, "ts": 1716454225381323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225435575, "dur": 12, "args": { "External id": 249343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249343, "pid": 5, "tid": 7, "ts": 1716454225435575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381340, "dur": 12, "args": { "External id": 249343, "cbid": 211, "correlation": 249343 } }, { "ph": "s", "id": 249343, "pid": 76337, "tid": -914061504, "ts": 1716454225381340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225435588, "dur": 164, "args": { "External id": 249345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249345, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249345, "pid": 5, "tid": 7, "ts": 1716454225435588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381354, "dur": 5, "args": { "External id": 249345, "cbid": 211, "correlation": 249345 } }, { "ph": "s", "id": 249345, "pid": 76337, "tid": -914061504, "ts": 1716454225381354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225435755, "dur": 1, "args": { "External id": 249347, "device": 5, "context": 1, "stream": 7, "correlation": 249347, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 249347, "pid": 5, "tid": 7, "ts": 1716454225435755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225381366, "dur": 7, "args": { "External id": 249347, "cbid": 51, "correlation": 249347 } }, { "ph": "s", "id": 249347, "pid": 76337, "tid": -914061504, "ts": 1716454225381366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225435758, "dur": 674, "args": { "External id": 249348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249348, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249348, "pid": 5, "tid": 7, "ts": 1716454225435758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381373, "dur": 7, "args": { "External id": 249348, "cbid": 211, "correlation": 249348 } }, { "ph": "s", "id": 249348, "pid": 76337, "tid": -914061504, "ts": 1716454225381373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225436434, "dur": 13, "args": { "External id": 249350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249350, "pid": 5, "tid": 7, "ts": 1716454225436434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381385, "dur": 5, "args": { "External id": 249350, "cbid": 211, "correlation": 249350 } }, { "ph": "s", "id": 249350, "pid": 76337, "tid": -914061504, "ts": 1716454225381385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225436449, "dur": 15, "args": { "External id": 249356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249356, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249356, "pid": 5, "tid": 7, "ts": 1716454225436449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381413, "dur": 8, "args": { "External id": 249356, "cbid": 211, "correlation": 249356 } }, { "ph": "s", "id": 249356, "pid": 76337, "tid": -914061504, "ts": 1716454225381413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225436465, "dur": 3, "args": { "External id": 249364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249364, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 249364, "pid": 5, "tid": 7, "ts": 1716454225436465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381456, "dur": 9, "args": { "External id": 249364, "cbid": 211, "correlation": 249364 } }, { "ph": "s", "id": 249364, "pid": 76337, "tid": -914061504, "ts": 1716454225381456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225381523, "dur": 1, "args": { "External id": 249380, "cbid": 251, "correlation": 249380 } }, { "ph": "f", "id": 249380, "pid": 76337, "tid": -914061504, "ts": 1716454225381523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225381530, "dur": 0, "args": { "External id": 249382, "cbid": 251, "correlation": 249382 } }, { "ph": "f", "id": 249382, "pid": 76337, "tid": -914061504, "ts": 1716454225381530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225436470, "dur": 14, "args": { "External id": 249383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249383, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249383, "pid": 5, "tid": 7, "ts": 1716454225436470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381531, "dur": 11, "args": { "External id": 249383, "cbid": 211, "correlation": 249383 } }, { "ph": "s", "id": 249383, "pid": 76337, "tid": -914061504, "ts": 1716454225381531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225436485, "dur": 5, "args": { "External id": 249385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249385, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249385, "pid": 5, "tid": 7, "ts": 1716454225436485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381545, "dur": 6, "args": { "External id": 249385, "cbid": 211, "correlation": 249385 } }, { "ph": "s", "id": 249385, "pid": 76337, "tid": -914061504, "ts": 1716454225381545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225436492, "dur": 17, "args": { "External id": 249395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249395, "pid": 5, "tid": 7, "ts": 1716454225436492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381603, "dur": 12, "args": { "External id": 249395, "cbid": 211, "correlation": 249395 } }, { "ph": "s", "id": 249395, "pid": 76337, "tid": -914061504, "ts": 1716454225381603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225436510, "dur": 18, "args": { "External id": 249415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249415, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 249415, "pid": 5, "tid": 7, "ts": 1716454225436510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381668, "dur": 11, "args": { "External id": 249415, "cbid": 211, "correlation": 249415 } }, { "ph": "s", "id": 249415, "pid": 76337, "tid": -914061504, "ts": 1716454225381668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225436529, "dur": 4, "args": { "External id": 249427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249427, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 249427, "pid": 5, "tid": 7, "ts": 1716454225436529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381690, "dur": 6, "args": { "External id": 249427, "cbid": 211, "correlation": 249427 } }, { "ph": "s", "id": 249427, "pid": 76337, "tid": -914061504, "ts": 1716454225381690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225436535, "dur": 17, "args": { "External id": 249430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249430, "pid": 5, "tid": 7, "ts": 1716454225436535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381708, "dur": 6, "args": { "External id": 249430, "cbid": 211, "correlation": 249430 } }, { "ph": "s", "id": 249430, "pid": 76337, "tid": -914061504, "ts": 1716454225381708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225436553, "dur": 11, "args": { "External id": 249439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249439, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249439, "pid": 5, "tid": 7, "ts": 1716454225436553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381748, "dur": 9, "args": { "External id": 249439, "cbid": 211, "correlation": 249439 } }, { "ph": "s", "id": 249439, "pid": 76337, "tid": -914061504, "ts": 1716454225381748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225381809, "dur": 0, "args": { "External id": 249449, "cbid": 317, "correlation": 249449 } }, { "ph": "f", "id": 249449, "pid": 76337, "tid": -914061504, "ts": 1716454225381809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225381811, "dur": 0, "args": { "External id": 249450, "cbid": 203, "correlation": 249450 } }, { "ph": "f", "id": 249450, "pid": 76337, "tid": -914061504, "ts": 1716454225381811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225381811, "dur": 0, "args": { "External id": 249451, "cbid": 205, "correlation": 249451 } }, { "ph": "f", "id": 249451, "pid": 76337, "tid": -914061504, "ts": 1716454225381811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225436565, "dur": 11, "args": { "External id": 249455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249455, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249455, "pid": 5, "tid": 7, "ts": 1716454225436565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381825, "dur": 13, "args": { "External id": 249455, "cbid": 211, "correlation": 249455 } }, { "ph": "s", "id": 249455, "pid": 76337, "tid": -914061504, "ts": 1716454225381825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225436578, "dur": 166, "args": { "External id": 249457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249457, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249457, "pid": 5, "tid": 7, "ts": 1716454225436578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381840, "dur": 5, "args": { "External id": 249457, "cbid": 211, "correlation": 249457 } }, { "ph": "s", "id": 249457, "pid": 76337, "tid": -914061504, "ts": 1716454225381840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225436746, "dur": 1, "args": { "External id": 249459, "device": 5, "context": 1, "stream": 7, "correlation": 249459, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 249459, "pid": 5, "tid": 7, "ts": 1716454225436746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225381851, "dur": 6, "args": { "External id": 249459, "cbid": 51, "correlation": 249459 } }, { "ph": "s", "id": 249459, "pid": 76337, "tid": -914061504, "ts": 1716454225381851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225436750, "dur": 660, "args": { "External id": 249460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249460, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249460, "pid": 5, "tid": 7, "ts": 1716454225436750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381858, "dur": 6, "args": { "External id": 249460, "cbid": 211, "correlation": 249460 } }, { "ph": "s", "id": 249460, "pid": 76337, "tid": -914061504, "ts": 1716454225381858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225437411, "dur": 13, "args": { "External id": 249462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249462, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249462, "pid": 5, "tid": 7, "ts": 1716454225437411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381869, "dur": 5, "args": { "External id": 249462, "cbid": 211, "correlation": 249462 } }, { "ph": "s", "id": 249462, "pid": 76337, "tid": -914061504, "ts": 1716454225381869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225437425, "dur": 15, "args": { "External id": 249468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249468, "pid": 5, "tid": 7, "ts": 1716454225437425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381901, "dur": 9, "args": { "External id": 249468, "cbid": 211, "correlation": 249468 } }, { "ph": "s", "id": 249468, "pid": 76337, "tid": -914061504, "ts": 1716454225381901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225437441, "dur": 12, "args": { "External id": 249476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249476, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249476, "pid": 5, "tid": 7, "ts": 1716454225437441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381935, "dur": 8, "args": { "External id": 249476, "cbid": 211, "correlation": 249476 } }, { "ph": "s", "id": 249476, "pid": 76337, "tid": -914061504, "ts": 1716454225381935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225437454, "dur": 11, "args": { "External id": 249484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249484, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249484, "pid": 5, "tid": 7, "ts": 1716454225437454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225381964, "dur": 16, "args": { "External id": 249484, "cbid": 211, "correlation": 249484 } }, { "ph": "s", "id": 249484, "pid": 76337, "tid": -914061504, "ts": 1716454225381964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225437466, "dur": 18, "args": { "External id": 249504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249504, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 249504, "pid": 5, "tid": 7, "ts": 1716454225437466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382051, "dur": 13, "args": { "External id": 249504, "cbid": 211, "correlation": 249504 } }, { "ph": "s", "id": 249504, "pid": 76337, "tid": -914061504, "ts": 1716454225382051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225437486, "dur": 4, "args": { "External id": 249516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249516, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 249516, "pid": 5, "tid": 7, "ts": 1716454225437486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382073, "dur": 7, "args": { "External id": 249516, "cbid": 211, "correlation": 249516 } }, { "ph": "s", "id": 249516, "pid": 76337, "tid": -914061504, "ts": 1716454225382073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225437491, "dur": 17, "args": { "External id": 249519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249519, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249519, "pid": 5, "tid": 7, "ts": 1716454225437491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382092, "dur": 6, "args": { "External id": 249519, "cbid": 211, "correlation": 249519 } }, { "ph": "s", "id": 249519, "pid": 76337, "tid": -914061504, "ts": 1716454225382092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225382148, "dur": 0, "args": { "External id": 249530, "cbid": 317, "correlation": 249530 } }, { "ph": "f", "id": 249530, "pid": 76337, "tid": -914061504, "ts": 1716454225382148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225382149, "dur": 0, "args": { "External id": 249531, "cbid": 203, "correlation": 249531 } }, { "ph": "f", "id": 249531, "pid": 76337, "tid": -914061504, "ts": 1716454225382149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225382150, "dur": 0, "args": { "External id": 249532, "cbid": 205, "correlation": 249532 } }, { "ph": "f", "id": 249532, "pid": 76337, "tid": -914061504, "ts": 1716454225382150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225437510, "dur": 10, "args": { "External id": 249536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249536, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249536, "pid": 5, "tid": 7, "ts": 1716454225437510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382163, "dur": 12, "args": { "External id": 249536, "cbid": 211, "correlation": 249536 } }, { "ph": "s", "id": 249536, "pid": 76337, "tid": -914061504, "ts": 1716454225382163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225437522, "dur": 4, "args": { "External id": 249538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249538, "pid": 5, "tid": 7, "ts": 1716454225437522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382179, "dur": 6, "args": { "External id": 249538, "cbid": 211, "correlation": 249538 } }, { "ph": "s", "id": 249538, "pid": 76337, "tid": -914061504, "ts": 1716454225382179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225382189, "dur": 0, "args": { "External id": 249539, "cbid": 51, "correlation": 249539 } }, { "ph": "s", "id": 249539, "pid": 76337, "tid": -914061504, "ts": 1716454225382189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225437527, "dur": 96, "args": { "External id": 249540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249540, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 249540, "pid": 5, "tid": 7, "ts": 1716454225437527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382189, "dur": 5, "args": { "External id": 249540, "cbid": 211, "correlation": 249540 } }, { "ph": "s", "id": 249540, "pid": 76337, "tid": -914061504, "ts": 1716454225382189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225437625, "dur": 15, "args": { "External id": 249545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249545, "pid": 5, "tid": 7, "ts": 1716454225437625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382220, "dur": 9, "args": { "External id": 249545, "cbid": 211, "correlation": 249545 } }, { "ph": "s", "id": 249545, "pid": 76337, "tid": -914061504, "ts": 1716454225382220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225437642, "dur": 85, "args": { "External id": 249554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249554, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249554, "pid": 5, "tid": 7, "ts": 1716454225437642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382303, "dur": 14, "args": { "External id": 249554, "cbid": 211, "correlation": 249554 } }, { "ph": "s", "id": 249554, "pid": 76337, "tid": -914061504, "ts": 1716454225382303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225437728, "dur": 30, "args": { "External id": 249576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249576, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249576, "pid": 5, "tid": 7, "ts": 1716454225437728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382360, "dur": 10, "args": { "External id": 249576, "cbid": 211, "correlation": 249576 } }, { "ph": "s", "id": 249576, "pid": 76337, "tid": -914061504, "ts": 1716454225382360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225382447, "dur": 1, "args": { "External id": 249587, "cbid": 251, "correlation": 249587 } }, { "ph": "f", "id": 249587, "pid": 76337, "tid": -914061504, "ts": 1716454225382447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225437760, "dur": 167, "args": { "External id": 249588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249588, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249588, "pid": 5, "tid": 7, "ts": 1716454225437760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382452, "dur": 13, "args": { "External id": 249588, "cbid": 211, "correlation": 249588 } }, { "ph": "s", "id": 249588, "pid": 76337, "tid": -914061504, "ts": 1716454225382452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225382521, "dur": 1, "args": { "External id": 249599, "cbid": 251, "correlation": 249599 } }, { "ph": "f", "id": 249599, "pid": 76337, "tid": -914061504, "ts": 1716454225382521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225437928, "dur": 160, "args": { "External id": 249600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249600, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249600, "pid": 5, "tid": 7, "ts": 1716454225437928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382525, "dur": 12, "args": { "External id": 249600, "cbid": 211, "correlation": 249600 } }, { "ph": "s", "id": 249600, "pid": 76337, "tid": -914061504, "ts": 1716454225382525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225382591, "dur": 1, "args": { "External id": 249611, "cbid": 251, "correlation": 249611 } }, { "ph": "f", "id": 249611, "pid": 76337, "tid": -914061504, "ts": 1716454225382591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225438090, "dur": 162, "args": { "External id": 249612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249612, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249612, "pid": 5, "tid": 7, "ts": 1716454225438090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382595, "dur": 11, "args": { "External id": 249612, "cbid": 211, "correlation": 249612 } }, { "ph": "s", "id": 249612, "pid": 76337, "tid": -914061504, "ts": 1716454225382595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225438254, "dur": 345, "args": { "External id": 249637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249637, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249637, "pid": 5, "tid": 7, "ts": 1716454225438254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382679, "dur": 14, "args": { "External id": 249637, "cbid": 211, "correlation": 249637 } }, { "ph": "s", "id": 249637, "pid": 76337, "tid": -914061504, "ts": 1716454225382679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225382778, "dur": 1, "args": { "External id": 249655, "cbid": 251, "correlation": 249655 } }, { "ph": "f", "id": 249655, "pid": 76337, "tid": -914061504, "ts": 1716454225382778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225438600, "dur": 168, "args": { "External id": 249657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249657, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249657, "pid": 5, "tid": 7, "ts": 1716454225438600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382784, "dur": 14, "args": { "External id": 249657, "cbid": 211, "correlation": 249657 } }, { "ph": "s", "id": 249657, "pid": 76337, "tid": -914061504, "ts": 1716454225382784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225438769, "dur": 19, "args": { "External id": 249665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249665, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249665, "pid": 5, "tid": 7, "ts": 1716454225438769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382854, "dur": 12, "args": { "External id": 249665, "cbid": 211, "correlation": 249665 } }, { "ph": "s", "id": 249665, "pid": 76337, "tid": -914061504, "ts": 1716454225382854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225438790, "dur": 28, "args": { "External id": 249673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249673, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249673, "pid": 5, "tid": 7, "ts": 1716454225438790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382893, "dur": 8, "args": { "External id": 249673, "cbid": 211, "correlation": 249673 } }, { "ph": "s", "id": 249673, "pid": 76337, "tid": -914061504, "ts": 1716454225382893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225438819, "dur": 18, "args": { "External id": 249684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249684, "pid": 5, "tid": 7, "ts": 1716454225438819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225382963, "dur": 20, "args": { "External id": 249684, "cbid": 211, "correlation": 249684 } }, { "ph": "s", "id": 249684, "pid": 76337, "tid": -914061504, "ts": 1716454225382963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225438839, "dur": 17, "args": { "External id": 249706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249706, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249706, "pid": 5, "tid": 7, "ts": 1716454225438839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383003, "dur": 8, "args": { "External id": 249706, "cbid": 211, "correlation": 249706 } }, { "ph": "s", "id": 249706, "pid": 76337, "tid": -914061504, "ts": 1716454225383003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383091, "dur": 1, "args": { "External id": 249717, "cbid": 251, "correlation": 249717 } }, { "ph": "f", "id": 249717, "pid": 76337, "tid": -914061504, "ts": 1716454225383091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225438857, "dur": 91, "args": { "External id": 249718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249718, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249718, "pid": 5, "tid": 7, "ts": 1716454225438857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383096, "dur": 13, "args": { "External id": 249718, "cbid": 211, "correlation": 249718 } }, { "ph": "s", "id": 249718, "pid": 76337, "tid": -914061504, "ts": 1716454225383096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383164, "dur": 1, "args": { "External id": 249729, "cbid": 251, "correlation": 249729 } }, { "ph": "f", "id": 249729, "pid": 76337, "tid": -914061504, "ts": 1716454225383164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383168, "dur": 0, "args": { "External id": 249730, "cbid": 251, "correlation": 249730 } }, { "ph": "f", "id": 249730, "pid": 76337, "tid": -914061504, "ts": 1716454225383168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225438949, "dur": 13, "args": { "External id": 249731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249731, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249731, "pid": 5, "tid": 7, "ts": 1716454225438949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383169, "dur": 12, "args": { "External id": 249731, "cbid": 211, "correlation": 249731 } }, { "ph": "s", "id": 249731, "pid": 76337, "tid": -914061504, "ts": 1716454225383169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225438963, "dur": 6, "args": { "External id": 249733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249733, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249733, "pid": 5, "tid": 7, "ts": 1716454225438963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383183, "dur": 6, "args": { "External id": 249733, "cbid": 211, "correlation": 249733 } }, { "ph": "s", "id": 249733, "pid": 76337, "tid": -914061504, "ts": 1716454225383183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383241, "dur": 1, "args": { "External id": 249744, "cbid": 251, "correlation": 249744 } }, { "ph": "f", "id": 249744, "pid": 76337, "tid": -914061504, "ts": 1716454225383241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383245, "dur": 0, "args": { "External id": 249745, "cbid": 251, "correlation": 249745 } }, { "ph": "f", "id": 249745, "pid": 76337, "tid": -914061504, "ts": 1716454225383245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225438970, "dur": 9, "args": { "External id": 249746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249746, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249746, "pid": 5, "tid": 7, "ts": 1716454225438970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383247, "dur": 12, "args": { "External id": 249746, "cbid": 211, "correlation": 249746 } }, { "ph": "s", "id": 249746, "pid": 76337, "tid": -914061504, "ts": 1716454225383247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225438980, "dur": 4, "args": { "External id": 249748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249748, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249748, "pid": 5, "tid": 7, "ts": 1716454225438980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383260, "dur": 5, "args": { "External id": 249748, "cbid": 211, "correlation": 249748 } }, { "ph": "s", "id": 249748, "pid": 76337, "tid": -914061504, "ts": 1716454225383260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225438985, "dur": 55, "args": { "External id": 249773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249773, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249773, "pid": 5, "tid": 7, "ts": 1716454225438985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383337, "dur": 13, "args": { "External id": 249773, "cbid": 211, "correlation": 249773 } }, { "ph": "s", "id": 249773, "pid": 76337, "tid": -914061504, "ts": 1716454225383337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383435, "dur": 1, "args": { "External id": 249791, "cbid": 251, "correlation": 249791 } }, { "ph": "f", "id": 249791, "pid": 76337, "tid": -914061504, "ts": 1716454225383435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225439041, "dur": 92, "args": { "External id": 249793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249793, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249793, "pid": 5, "tid": 7, "ts": 1716454225439041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383441, "dur": 14, "args": { "External id": 249793, "cbid": 211, "correlation": 249793 } }, { "ph": "s", "id": 249793, "pid": 76337, "tid": -914061504, "ts": 1716454225383441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225439135, "dur": 9, "args": { "External id": 249801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249801, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249801, "pid": 5, "tid": 7, "ts": 1716454225439135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383511, "dur": 12, "args": { "External id": 249801, "cbid": 211, "correlation": 249801 } }, { "ph": "s", "id": 249801, "pid": 76337, "tid": -914061504, "ts": 1716454225383511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225439146, "dur": 21, "args": { "External id": 249809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249809, "pid": 5, "tid": 7, "ts": 1716454225439146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383552, "dur": 9, "args": { "External id": 249809, "cbid": 211, "correlation": 249809 } }, { "ph": "s", "id": 249809, "pid": 76337, "tid": -914061504, "ts": 1716454225383552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225439168, "dur": 17, "args": { "External id": 249831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249831, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249831, "pid": 5, "tid": 7, "ts": 1716454225439168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383604, "dur": 10, "args": { "External id": 249831, "cbid": 211, "correlation": 249831 } }, { "ph": "s", "id": 249831, "pid": 76337, "tid": -914061504, "ts": 1716454225383604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383691, "dur": 1, "args": { "External id": 249847, "cbid": 251, "correlation": 249847 } }, { "ph": "f", "id": 249847, "pid": 76337, "tid": -914061504, "ts": 1716454225383691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383696, "dur": 0, "args": { "External id": 249849, "cbid": 251, "correlation": 249849 } }, { "ph": "f", "id": 249849, "pid": 76337, "tid": -914061504, "ts": 1716454225383696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225439187, "dur": 499, "args": { "External id": 249850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249850, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 249850, "pid": 5, "tid": 7, "ts": 1716454225439187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383698, "dur": 13, "args": { "External id": 249850, "cbid": 211, "correlation": 249850 } }, { "ph": "s", "id": 249850, "pid": 76337, "tid": -914061504, "ts": 1716454225383698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225439687, "dur": 66, "args": { "External id": 249858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249858, "pid": 5, "tid": 7, "ts": 1716454225439687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383763, "dur": 12, "args": { "External id": 249858, "cbid": 211, "correlation": 249858 } }, { "ph": "s", "id": 249858, "pid": 76337, "tid": -914061504, "ts": 1716454225383763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225439754, "dur": 68, "args": { "External id": 249866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249866, "pid": 5, "tid": 7, "ts": 1716454225439754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383793, "dur": 9, "args": { "External id": 249866, "cbid": 211, "correlation": 249866 } }, { "ph": "s", "id": 249866, "pid": 76337, "tid": -914061504, "ts": 1716454225383793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225383872, "dur": 1, "args": { "External id": 249882, "cbid": 251, "correlation": 249882 } }, { "ph": "f", "id": 249882, "pid": 76337, "tid": -914061504, "ts": 1716454225383872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225439824, "dur": 1, "args": { "External id": 249884, "device": 5, "context": 1, "stream": 7, "correlation": 249884, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 249884, "pid": 5, "tid": 7, "ts": 1716454225439824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225383877, "dur": 9, "args": { "External id": 249884, "cbid": 51, "correlation": 249884 } }, { "ph": "s", "id": 249884, "pid": 76337, "tid": -914061504, "ts": 1716454225383877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225439828, "dur": 275, "args": { "External id": 249885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249885, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249885, "pid": 5, "tid": 7, "ts": 1716454225439828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383888, "dur": 12, "args": { "External id": 249885, "cbid": 211, "correlation": 249885 } }, { "ph": "s", "id": 249885, "pid": 76337, "tid": -914061504, "ts": 1716454225383888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225440105, "dur": 14, "args": { "External id": 249893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249893, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249893, "pid": 5, "tid": 7, "ts": 1716454225440105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225383930, "dur": 10, "args": { "External id": 249893, "cbid": 211, "correlation": 249893 } }, { "ph": "s", "id": 249893, "pid": 76337, "tid": -914061504, "ts": 1716454225383930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225440120, "dur": 39, "args": { "External id": 249904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249904, "pid": 5, "tid": 7, "ts": 1716454225440120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384007, "dur": 13, "args": { "External id": 249904, "cbid": 211, "correlation": 249904 } }, { "ph": "s", "id": 249904, "pid": 76337, "tid": -914061504, "ts": 1716454225384007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225384072, "dur": 0, "args": { "External id": 249916, "cbid": 317, "correlation": 249916 } }, { "ph": "f", "id": 249916, "pid": 76337, "tid": -914061504, "ts": 1716454225384072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225384072, "dur": 0, "args": { "External id": 249917, "cbid": 203, "correlation": 249917 } }, { "ph": "f", "id": 249917, "pid": 76337, "tid": -914061504, "ts": 1716454225384072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225384073, "dur": 0, "args": { "External id": 249918, "cbid": 205, "correlation": 249918 } }, { "ph": "f", "id": 249918, "pid": 76337, "tid": -914061504, "ts": 1716454225384073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440160, "dur": 12, "args": { "External id": 249922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249922, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249922, "pid": 5, "tid": 7, "ts": 1716454225440160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384089, "dur": 13, "args": { "External id": 249922, "cbid": 211, "correlation": 249922 } }, { "ph": "s", "id": 249922, "pid": 76337, "tid": -914061504, "ts": 1716454225384089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225440174, "dur": 4, "args": { "External id": 249924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 249924, "pid": 5, "tid": 7, "ts": 1716454225440174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384107, "dur": 6, "args": { "External id": 249924, "cbid": 211, "correlation": 249924 } }, { "ph": "s", "id": 249924, "pid": 76337, "tid": -914061504, "ts": 1716454225384107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225384115, "dur": 0, "args": { "External id": 249925, "cbid": 51, "correlation": 249925 } }, { "ph": "s", "id": 249925, "pid": 76337, "tid": -914061504, "ts": 1716454225384115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225440179, "dur": 99, "args": { "External id": 249926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249926, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 249926, "pid": 5, "tid": 7, "ts": 1716454225440179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384116, "dur": 5, "args": { "External id": 249926, "cbid": 211, "correlation": 249926 } }, { "ph": "s", "id": 249926, "pid": 76337, "tid": -914061504, "ts": 1716454225384116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225440280, "dur": 17, "args": { "External id": 249931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249931, "pid": 5, "tid": 7, "ts": 1716454225440280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384143, "dur": 9, "args": { "External id": 249931, "cbid": 211, "correlation": 249931 } }, { "ph": "s", "id": 249931, "pid": 76337, "tid": -914061504, "ts": 1716454225384143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225440298, "dur": 11, "args": { "External id": 249939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249939, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249939, "pid": 5, "tid": 7, "ts": 1716454225440298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384175, "dur": 8, "args": { "External id": 249939, "cbid": 211, "correlation": 249939 } }, { "ph": "s", "id": 249939, "pid": 76337, "tid": -914061504, "ts": 1716454225384175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225384244, "dur": 0, "args": { "External id": 249949, "cbid": 317, "correlation": 249949 } }, { "ph": "f", "id": 249949, "pid": 76337, "tid": -914061504, "ts": 1716454225384244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225384244, "dur": 0, "args": { "External id": 249950, "cbid": 203, "correlation": 249950 } }, { "ph": "f", "id": 249950, "pid": 76337, "tid": -914061504, "ts": 1716454225384244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225384245, "dur": 0, "args": { "External id": 249951, "cbid": 205, "correlation": 249951 } }, { "ph": "f", "id": 249951, "pid": 76337, "tid": -914061504, "ts": 1716454225384245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440310, "dur": 12, "args": { "External id": 249955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249955, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249955, "pid": 5, "tid": 7, "ts": 1716454225440310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384261, "dur": 12, "args": { "External id": 249955, "cbid": 211, "correlation": 249955 } }, { "ph": "s", "id": 249955, "pid": 76337, "tid": -914061504, "ts": 1716454225384261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440324, "dur": 164, "args": { "External id": 249957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249957, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249957, "pid": 5, "tid": 7, "ts": 1716454225440324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384275, "dur": 5, "args": { "External id": 249957, "cbid": 211, "correlation": 249957 } }, { "ph": "s", "id": 249957, "pid": 76337, "tid": -914061504, "ts": 1716454225384275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225440491, "dur": 1, "args": { "External id": 249959, "device": 5, "context": 1, "stream": 7, "correlation": 249959, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 249959, "pid": 5, "tid": 7, "ts": 1716454225440491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225384286, "dur": 6, "args": { "External id": 249959, "cbid": 51, "correlation": 249959 } }, { "ph": "s", "id": 249959, "pid": 76337, "tid": -914061504, "ts": 1716454225384286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225440495, "dur": 202, "args": { "External id": 249960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249960, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 249960, "pid": 5, "tid": 7, "ts": 1716454225440495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384294, "dur": 9, "args": { "External id": 249960, "cbid": 211, "correlation": 249960 } }, { "ph": "s", "id": 249960, "pid": 76337, "tid": -914061504, "ts": 1716454225384294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440698, "dur": 6, "args": { "External id": 249962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 249962, "pid": 5, "tid": 7, "ts": 1716454225440698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384307, "dur": 5, "args": { "External id": 249962, "cbid": 211, "correlation": 249962 } }, { "ph": "s", "id": 249962, "pid": 76337, "tid": -914061504, "ts": 1716454225384307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225440705, "dur": 6, "args": { "External id": 249968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 249968, "pid": 5, "tid": 7, "ts": 1716454225440705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384338, "dur": 9, "args": { "External id": 249968, "cbid": 211, "correlation": 249968 } }, { "ph": "s", "id": 249968, "pid": 76337, "tid": -914061504, "ts": 1716454225384338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225440713, "dur": 11, "args": { "External id": 249988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 249988, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 249988, "pid": 5, "tid": 7, "ts": 1716454225440713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384433, "dur": 12, "args": { "External id": 249988, "cbid": 211, "correlation": 249988 } }, { "ph": "s", "id": 249988, "pid": 76337, "tid": -914061504, "ts": 1716454225384433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225440725, "dur": 4, "args": { "External id": 250000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250000, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250000, "pid": 5, "tid": 7, "ts": 1716454225440725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384460, "dur": 7, "args": { "External id": 250000, "cbid": 211, "correlation": 250000 } }, { "ph": "s", "id": 250000, "pid": 76337, "tid": -914061504, "ts": 1716454225384460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225440731, "dur": 8, "args": { "External id": 250003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250003, "pid": 5, "tid": 7, "ts": 1716454225440731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384480, "dur": 6, "args": { "External id": 250003, "cbid": 211, "correlation": 250003 } }, { "ph": "s", "id": 250003, "pid": 76337, "tid": -914061504, "ts": 1716454225384480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225440741, "dur": 6, "args": { "External id": 250012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250012, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250012, "pid": 5, "tid": 7, "ts": 1716454225440741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384520, "dur": 10, "args": { "External id": 250012, "cbid": 211, "correlation": 250012 } }, { "ph": "s", "id": 250012, "pid": 76337, "tid": -914061504, "ts": 1716454225384520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225384572, "dur": 0, "args": { "External id": 250022, "cbid": 317, "correlation": 250022 } }, { "ph": "f", "id": 250022, "pid": 76337, "tid": -914061504, "ts": 1716454225384572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225384572, "dur": 0, "args": { "External id": 250023, "cbid": 203, "correlation": 250023 } }, { "ph": "f", "id": 250023, "pid": 76337, "tid": -914061504, "ts": 1716454225384572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225384573, "dur": 0, "args": { "External id": 250024, "cbid": 205, "correlation": 250024 } }, { "ph": "f", "id": 250024, "pid": 76337, "tid": -914061504, "ts": 1716454225384573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440748, "dur": 5, "args": { "External id": 250028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250028, "pid": 5, "tid": 7, "ts": 1716454225440748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384588, "dur": 12, "args": { "External id": 250028, "cbid": 211, "correlation": 250028 } }, { "ph": "s", "id": 250028, "pid": 76337, "tid": -914061504, "ts": 1716454225384588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225440754, "dur": 167, "args": { "External id": 250030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250030, "pid": 5, "tid": 7, "ts": 1716454225440754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384602, "dur": 5, "args": { "External id": 250030, "cbid": 211, "correlation": 250030 } }, { "ph": "s", "id": 250030, "pid": 76337, "tid": -914061504, "ts": 1716454225384602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225440923, "dur": 1, "args": { "External id": 250032, "device": 5, "context": 1, "stream": 7, "correlation": 250032, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 250032, "pid": 5, "tid": 7, "ts": 1716454225440923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225384613, "dur": 6, "args": { "External id": 250032, "cbid": 51, "correlation": 250032 } }, { "ph": "s", "id": 250032, "pid": 76337, "tid": -914061504, "ts": 1716454225384613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225440927, "dur": 275, "args": { "External id": 250033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250033, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250033, "pid": 5, "tid": 7, "ts": 1716454225440927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384621, "dur": 6, "args": { "External id": 250033, "cbid": 211, "correlation": 250033 } }, { "ph": "s", "id": 250033, "pid": 76337, "tid": -914061504, "ts": 1716454225384621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441203, "dur": 6, "args": { "External id": 250035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250035, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250035, "pid": 5, "tid": 7, "ts": 1716454225441203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384631, "dur": 5, "args": { "External id": 250035, "cbid": 211, "correlation": 250035 } }, { "ph": "s", "id": 250035, "pid": 76337, "tid": -914061504, "ts": 1716454225384631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225441210, "dur": 6, "args": { "External id": 250041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250041, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250041, "pid": 5, "tid": 7, "ts": 1716454225441210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384659, "dur": 8, "args": { "External id": 250041, "cbid": 211, "correlation": 250041 } }, { "ph": "s", "id": 250041, "pid": 76337, "tid": -914061504, "ts": 1716454225384659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225441218, "dur": 3, "args": { "External id": 250049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250049, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 250049, "pid": 5, "tid": 7, "ts": 1716454225441218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384703, "dur": 9, "args": { "External id": 250049, "cbid": 211, "correlation": 250049 } }, { "ph": "s", "id": 250049, "pid": 76337, "tid": -914061504, "ts": 1716454225384703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225384766, "dur": 1, "args": { "External id": 250065, "cbid": 251, "correlation": 250065 } }, { "ph": "f", "id": 250065, "pid": 76337, "tid": -914061504, "ts": 1716454225384766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225384771, "dur": 0, "args": { "External id": 250067, "cbid": 251, "correlation": 250067 } }, { "ph": "f", "id": 250067, "pid": 76337, "tid": -914061504, "ts": 1716454225384771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225441222, "dur": 14, "args": { "External id": 250068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250068, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250068, "pid": 5, "tid": 7, "ts": 1716454225441222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384773, "dur": 11, "args": { "External id": 250068, "cbid": 211, "correlation": 250068 } }, { "ph": "s", "id": 250068, "pid": 76337, "tid": -914061504, "ts": 1716454225384773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225441237, "dur": 5, "args": { "External id": 250070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250070, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250070, "pid": 5, "tid": 7, "ts": 1716454225441237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384786, "dur": 5, "args": { "External id": 250070, "cbid": 211, "correlation": 250070 } }, { "ph": "s", "id": 250070, "pid": 76337, "tid": -914061504, "ts": 1716454225384786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225441243, "dur": 6, "args": { "External id": 250080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250080, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250080, "pid": 5, "tid": 7, "ts": 1716454225441243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384842, "dur": 13, "args": { "External id": 250080, "cbid": 211, "correlation": 250080 } }, { "ph": "s", "id": 250080, "pid": 76337, "tid": -914061504, "ts": 1716454225384842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225441250, "dur": 10, "args": { "External id": 250100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250100, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250100, "pid": 5, "tid": 7, "ts": 1716454225441250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384910, "dur": 11, "args": { "External id": 250100, "cbid": 211, "correlation": 250100 } }, { "ph": "s", "id": 250100, "pid": 76337, "tid": -914061504, "ts": 1716454225384910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225441262, "dur": 4, "args": { "External id": 250112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250112, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250112, "pid": 5, "tid": 7, "ts": 1716454225441262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384930, "dur": 6, "args": { "External id": 250112, "cbid": 211, "correlation": 250112 } }, { "ph": "s", "id": 250112, "pid": 76337, "tid": -914061504, "ts": 1716454225384930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225441267, "dur": 7, "args": { "External id": 250115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250115, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250115, "pid": 5, "tid": 7, "ts": 1716454225441267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384949, "dur": 7, "args": { "External id": 250115, "cbid": 211, "correlation": 250115 } }, { "ph": "s", "id": 250115, "pid": 76337, "tid": -914061504, "ts": 1716454225384949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225441275, "dur": 5, "args": { "External id": 250124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250124, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250124, "pid": 5, "tid": 7, "ts": 1716454225441275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225384998, "dur": 11, "args": { "External id": 250124, "cbid": 211, "correlation": 250124 } }, { "ph": "s", "id": 250124, "pid": 76337, "tid": -914061504, "ts": 1716454225384998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225385063, "dur": 0, "args": { "External id": 250134, "cbid": 317, "correlation": 250134 } }, { "ph": "f", "id": 250134, "pid": 76337, "tid": -914061504, "ts": 1716454225385063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225385064, "dur": 0, "args": { "External id": 250135, "cbid": 203, "correlation": 250135 } }, { "ph": "f", "id": 250135, "pid": 76337, "tid": -914061504, "ts": 1716454225385064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225385064, "dur": 0, "args": { "External id": 250136, "cbid": 205, "correlation": 250136 } }, { "ph": "f", "id": 250136, "pid": 76337, "tid": -914061504, "ts": 1716454225385064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441281, "dur": 5, "args": { "External id": 250140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250140, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250140, "pid": 5, "tid": 7, "ts": 1716454225441281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385078, "dur": 12, "args": { "External id": 250140, "cbid": 211, "correlation": 250140 } }, { "ph": "s", "id": 250140, "pid": 76337, "tid": -914061504, "ts": 1716454225385078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441287, "dur": 165, "args": { "External id": 250142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250142, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250142, "pid": 5, "tid": 7, "ts": 1716454225441287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385092, "dur": 5, "args": { "External id": 250142, "cbid": 211, "correlation": 250142 } }, { "ph": "s", "id": 250142, "pid": 76337, "tid": -914061504, "ts": 1716454225385092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225441454, "dur": 1, "args": { "External id": 250144, "device": 5, "context": 1, "stream": 7, "correlation": 250144, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 250144, "pid": 5, "tid": 7, "ts": 1716454225441454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225385103, "dur": 7, "args": { "External id": 250144, "cbid": 51, "correlation": 250144 } }, { "ph": "s", "id": 250144, "pid": 76337, "tid": -914061504, "ts": 1716454225385103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225441458, "dur": 264, "args": { "External id": 250145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250145, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250145, "pid": 5, "tid": 7, "ts": 1716454225441458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385111, "dur": 6, "args": { "External id": 250145, "cbid": 211, "correlation": 250145 } }, { "ph": "s", "id": 250145, "pid": 76337, "tid": -914061504, "ts": 1716454225385111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441724, "dur": 6, "args": { "External id": 250147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250147, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250147, "pid": 5, "tid": 7, "ts": 1716454225441724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385121, "dur": 5, "args": { "External id": 250147, "cbid": 211, "correlation": 250147 } }, { "ph": "s", "id": 250147, "pid": 76337, "tid": -914061504, "ts": 1716454225385121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225441731, "dur": 6, "args": { "External id": 250153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250153, "pid": 5, "tid": 7, "ts": 1716454225441731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385149, "dur": 10, "args": { "External id": 250153, "cbid": 211, "correlation": 250153 } }, { "ph": "s", "id": 250153, "pid": 76337, "tid": -914061504, "ts": 1716454225385149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225441739, "dur": 5, "args": { "External id": 250161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250161, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250161, "pid": 5, "tid": 7, "ts": 1716454225441739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385182, "dur": 8, "args": { "External id": 250161, "cbid": 211, "correlation": 250161 } }, { "ph": "s", "id": 250161, "pid": 76337, "tid": -914061504, "ts": 1716454225385182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225441745, "dur": 5, "args": { "External id": 250169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250169, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250169, "pid": 5, "tid": 7, "ts": 1716454225441745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385212, "dur": 8, "args": { "External id": 250169, "cbid": 211, "correlation": 250169 } }, { "ph": "s", "id": 250169, "pid": 76337, "tid": -914061504, "ts": 1716454225385212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225441751, "dur": 10, "args": { "External id": 250189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250189, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250189, "pid": 5, "tid": 7, "ts": 1716454225441751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385285, "dur": 12, "args": { "External id": 250189, "cbid": 211, "correlation": 250189 } }, { "ph": "s", "id": 250189, "pid": 76337, "tid": -914061504, "ts": 1716454225385285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225441762, "dur": 4, "args": { "External id": 250201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250201, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250201, "pid": 5, "tid": 7, "ts": 1716454225441762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385306, "dur": 7, "args": { "External id": 250201, "cbid": 211, "correlation": 250201 } }, { "ph": "s", "id": 250201, "pid": 76337, "tid": -914061504, "ts": 1716454225385306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225441767, "dur": 7, "args": { "External id": 250204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250204, "pid": 5, "tid": 7, "ts": 1716454225441767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385325, "dur": 6, "args": { "External id": 250204, "cbid": 211, "correlation": 250204 } }, { "ph": "s", "id": 250204, "pid": 76337, "tid": -914061504, "ts": 1716454225385325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225441775, "dur": 5, "args": { "External id": 250213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250213, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250213, "pid": 5, "tid": 7, "ts": 1716454225441775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385363, "dur": 9, "args": { "External id": 250213, "cbid": 211, "correlation": 250213 } }, { "ph": "s", "id": 250213, "pid": 76337, "tid": -914061504, "ts": 1716454225385363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225385413, "dur": 0, "args": { "External id": 250223, "cbid": 317, "correlation": 250223 } }, { "ph": "f", "id": 250223, "pid": 76337, "tid": -914061504, "ts": 1716454225385413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225385414, "dur": 0, "args": { "External id": 250224, "cbid": 203, "correlation": 250224 } }, { "ph": "f", "id": 250224, "pid": 76337, "tid": -914061504, "ts": 1716454225385414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225385415, "dur": 0, "args": { "External id": 250225, "cbid": 205, "correlation": 250225 } }, { "ph": "f", "id": 250225, "pid": 76337, "tid": -914061504, "ts": 1716454225385415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441781, "dur": 5, "args": { "External id": 250229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250229, "pid": 5, "tid": 7, "ts": 1716454225441781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385427, "dur": 12, "args": { "External id": 250229, "cbid": 211, "correlation": 250229 } }, { "ph": "s", "id": 250229, "pid": 76337, "tid": -914061504, "ts": 1716454225385427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225441788, "dur": 165, "args": { "External id": 250231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250231, "pid": 5, "tid": 7, "ts": 1716454225441788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385441, "dur": 5, "args": { "External id": 250231, "cbid": 211, "correlation": 250231 } }, { "ph": "s", "id": 250231, "pid": 76337, "tid": -914061504, "ts": 1716454225385441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225441955, "dur": 1, "args": { "External id": 250233, "device": 5, "context": 1, "stream": 7, "correlation": 250233, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 250233, "pid": 5, "tid": 7, "ts": 1716454225441955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225385452, "dur": 6, "args": { "External id": 250233, "cbid": 51, "correlation": 250233 } }, { "ph": "s", "id": 250233, "pid": 76337, "tid": -914061504, "ts": 1716454225385452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225441958, "dur": 263, "args": { "External id": 250234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250234, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250234, "pid": 5, "tid": 7, "ts": 1716454225441958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385459, "dur": 7, "args": { "External id": 250234, "cbid": 211, "correlation": 250234 } }, { "ph": "s", "id": 250234, "pid": 76337, "tid": -914061504, "ts": 1716454225385459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442223, "dur": 6, "args": { "External id": 250236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250236, "pid": 5, "tid": 7, "ts": 1716454225442223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385469, "dur": 5, "args": { "External id": 250236, "cbid": 211, "correlation": 250236 } }, { "ph": "s", "id": 250236, "pid": 76337, "tid": -914061504, "ts": 1716454225385469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225442230, "dur": 6, "args": { "External id": 250242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250242, "pid": 5, "tid": 7, "ts": 1716454225442230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385497, "dur": 8, "args": { "External id": 250242, "cbid": 211, "correlation": 250242 } }, { "ph": "s", "id": 250242, "pid": 76337, "tid": -914061504, "ts": 1716454225385497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225442237, "dur": 3, "args": { "External id": 250250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250250, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 250250, "pid": 5, "tid": 7, "ts": 1716454225442237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385540, "dur": 10, "args": { "External id": 250250, "cbid": 211, "correlation": 250250 } }, { "ph": "s", "id": 250250, "pid": 76337, "tid": -914061504, "ts": 1716454225385540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225385603, "dur": 1, "args": { "External id": 250266, "cbid": 251, "correlation": 250266 } }, { "ph": "f", "id": 250266, "pid": 76337, "tid": -914061504, "ts": 1716454225385603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225385608, "dur": 0, "args": { "External id": 250268, "cbid": 251, "correlation": 250268 } }, { "ph": "f", "id": 250268, "pid": 76337, "tid": -914061504, "ts": 1716454225385608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225442242, "dur": 10, "args": { "External id": 250269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250269, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250269, "pid": 5, "tid": 7, "ts": 1716454225442242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385611, "dur": 11, "args": { "External id": 250269, "cbid": 211, "correlation": 250269 } }, { "ph": "s", "id": 250269, "pid": 76337, "tid": -914061504, "ts": 1716454225385611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225442253, "dur": 4, "args": { "External id": 250271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250271, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250271, "pid": 5, "tid": 7, "ts": 1716454225442253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385624, "dur": 5, "args": { "External id": 250271, "cbid": 211, "correlation": 250271 } }, { "ph": "s", "id": 250271, "pid": 76337, "tid": -914061504, "ts": 1716454225385624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225442258, "dur": 6, "args": { "External id": 250281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250281, "pid": 5, "tid": 7, "ts": 1716454225442258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385679, "dur": 12, "args": { "External id": 250281, "cbid": 211, "correlation": 250281 } }, { "ph": "s", "id": 250281, "pid": 76337, "tid": -914061504, "ts": 1716454225385679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225442265, "dur": 10, "args": { "External id": 250301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250301, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250301, "pid": 5, "tid": 7, "ts": 1716454225442265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385744, "dur": 10, "args": { "External id": 250301, "cbid": 211, "correlation": 250301 } }, { "ph": "s", "id": 250301, "pid": 76337, "tid": -914061504, "ts": 1716454225385744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225442276, "dur": 4, "args": { "External id": 250313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250313, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250313, "pid": 5, "tid": 7, "ts": 1716454225442276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385765, "dur": 6, "args": { "External id": 250313, "cbid": 211, "correlation": 250313 } }, { "ph": "s", "id": 250313, "pid": 76337, "tid": -914061504, "ts": 1716454225385765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225442282, "dur": 7, "args": { "External id": 250316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250316, "pid": 5, "tid": 7, "ts": 1716454225442282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385783, "dur": 6, "args": { "External id": 250316, "cbid": 211, "correlation": 250316 } }, { "ph": "s", "id": 250316, "pid": 76337, "tid": -914061504, "ts": 1716454225385783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225442290, "dur": 5, "args": { "External id": 250325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250325, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250325, "pid": 5, "tid": 7, "ts": 1716454225442290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385823, "dur": 9, "args": { "External id": 250325, "cbid": 211, "correlation": 250325 } }, { "ph": "s", "id": 250325, "pid": 76337, "tid": -914061504, "ts": 1716454225385823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225385884, "dur": 0, "args": { "External id": 250335, "cbid": 317, "correlation": 250335 } }, { "ph": "f", "id": 250335, "pid": 76337, "tid": -914061504, "ts": 1716454225385884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225385884, "dur": 0, "args": { "External id": 250336, "cbid": 203, "correlation": 250336 } }, { "ph": "f", "id": 250336, "pid": 76337, "tid": -914061504, "ts": 1716454225385884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225385885, "dur": 0, "args": { "External id": 250337, "cbid": 205, "correlation": 250337 } }, { "ph": "f", "id": 250337, "pid": 76337, "tid": -914061504, "ts": 1716454225385885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442296, "dur": 5, "args": { "External id": 250341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250341, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250341, "pid": 5, "tid": 7, "ts": 1716454225442296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385899, "dur": 12, "args": { "External id": 250341, "cbid": 211, "correlation": 250341 } }, { "ph": "s", "id": 250341, "pid": 76337, "tid": -914061504, "ts": 1716454225385899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442302, "dur": 165, "args": { "External id": 250343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250343, "pid": 5, "tid": 7, "ts": 1716454225442302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385913, "dur": 6, "args": { "External id": 250343, "cbid": 211, "correlation": 250343 } }, { "ph": "s", "id": 250343, "pid": 76337, "tid": -914061504, "ts": 1716454225385913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225442469, "dur": 1, "args": { "External id": 250345, "device": 5, "context": 1, "stream": 7, "correlation": 250345, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 250345, "pid": 5, "tid": 7, "ts": 1716454225442469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225385925, "dur": 6, "args": { "External id": 250345, "cbid": 51, "correlation": 250345 } }, { "ph": "s", "id": 250345, "pid": 76337, "tid": -914061504, "ts": 1716454225385925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225442473, "dur": 263, "args": { "External id": 250346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250346, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250346, "pid": 5, "tid": 7, "ts": 1716454225442473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385932, "dur": 6, "args": { "External id": 250346, "cbid": 211, "correlation": 250346 } }, { "ph": "s", "id": 250346, "pid": 76337, "tid": -914061504, "ts": 1716454225385932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442738, "dur": 6, "args": { "External id": 250348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250348, "pid": 5, "tid": 7, "ts": 1716454225442738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385943, "dur": 5, "args": { "External id": 250348, "cbid": 211, "correlation": 250348 } }, { "ph": "s", "id": 250348, "pid": 76337, "tid": -914061504, "ts": 1716454225385943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225442745, "dur": 6, "args": { "External id": 250354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250354, "pid": 5, "tid": 7, "ts": 1716454225442745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225385972, "dur": 18, "args": { "External id": 250354, "cbid": 211, "correlation": 250354 } }, { "ph": "s", "id": 250354, "pid": 76337, "tid": -914061504, "ts": 1716454225385972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225442753, "dur": 5, "args": { "External id": 250362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250362, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250362, "pid": 5, "tid": 7, "ts": 1716454225442753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386014, "dur": 9, "args": { "External id": 250362, "cbid": 211, "correlation": 250362 } }, { "ph": "s", "id": 250362, "pid": 76337, "tid": -914061504, "ts": 1716454225386014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225442759, "dur": 5, "args": { "External id": 250370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250370, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250370, "pid": 5, "tid": 7, "ts": 1716454225442759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386044, "dur": 8, "args": { "External id": 250370, "cbid": 211, "correlation": 250370 } }, { "ph": "s", "id": 250370, "pid": 76337, "tid": -914061504, "ts": 1716454225386044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225442765, "dur": 10, "args": { "External id": 250390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250390, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250390, "pid": 5, "tid": 7, "ts": 1716454225442765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386146, "dur": 12, "args": { "External id": 250390, "cbid": 211, "correlation": 250390 } }, { "ph": "s", "id": 250390, "pid": 76337, "tid": -914061504, "ts": 1716454225386146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225442776, "dur": 4, "args": { "External id": 250402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250402, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250402, "pid": 5, "tid": 7, "ts": 1716454225442776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386168, "dur": 7, "args": { "External id": 250402, "cbid": 211, "correlation": 250402 } }, { "ph": "s", "id": 250402, "pid": 76337, "tid": -914061504, "ts": 1716454225386168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225442781, "dur": 7, "args": { "External id": 250405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250405, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250405, "pid": 5, "tid": 7, "ts": 1716454225442781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386186, "dur": 7, "args": { "External id": 250405, "cbid": 211, "correlation": 250405 } }, { "ph": "s", "id": 250405, "pid": 76337, "tid": -914061504, "ts": 1716454225386186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225442789, "dur": 5, "args": { "External id": 250414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250414, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250414, "pid": 5, "tid": 7, "ts": 1716454225442789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386225, "dur": 9, "args": { "External id": 250414, "cbid": 211, "correlation": 250414 } }, { "ph": "s", "id": 250414, "pid": 76337, "tid": -914061504, "ts": 1716454225386225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225386282, "dur": 0, "args": { "External id": 250424, "cbid": 317, "correlation": 250424 } }, { "ph": "f", "id": 250424, "pid": 76337, "tid": -914061504, "ts": 1716454225386282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225386283, "dur": 0, "args": { "External id": 250425, "cbid": 203, "correlation": 250425 } }, { "ph": "f", "id": 250425, "pid": 76337, "tid": -914061504, "ts": 1716454225386283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225386283, "dur": 0, "args": { "External id": 250426, "cbid": 205, "correlation": 250426 } }, { "ph": "f", "id": 250426, "pid": 76337, "tid": -914061504, "ts": 1716454225386283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442795, "dur": 5, "args": { "External id": 250430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250430, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250430, "pid": 5, "tid": 7, "ts": 1716454225442795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386296, "dur": 12, "args": { "External id": 250430, "cbid": 211, "correlation": 250430 } }, { "ph": "s", "id": 250430, "pid": 76337, "tid": -914061504, "ts": 1716454225386296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225442802, "dur": 165, "args": { "External id": 250432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250432, "pid": 5, "tid": 7, "ts": 1716454225442802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386310, "dur": 5, "args": { "External id": 250432, "cbid": 211, "correlation": 250432 } }, { "ph": "s", "id": 250432, "pid": 76337, "tid": -914061504, "ts": 1716454225386310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225442969, "dur": 1, "args": { "External id": 250434, "device": 5, "context": 1, "stream": 7, "correlation": 250434, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 250434, "pid": 5, "tid": 7, "ts": 1716454225442969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225386322, "dur": 6, "args": { "External id": 250434, "cbid": 51, "correlation": 250434 } }, { "ph": "s", "id": 250434, "pid": 76337, "tid": -914061504, "ts": 1716454225386322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225442972, "dur": 263, "args": { "External id": 250435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250435, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250435, "pid": 5, "tid": 7, "ts": 1716454225442972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386329, "dur": 6, "args": { "External id": 250435, "cbid": 211, "correlation": 250435 } }, { "ph": "s", "id": 250435, "pid": 76337, "tid": -914061504, "ts": 1716454225386329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443237, "dur": 6, "args": { "External id": 250437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250437, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250437, "pid": 5, "tid": 7, "ts": 1716454225443237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386339, "dur": 5, "args": { "External id": 250437, "cbid": 211, "correlation": 250437 } }, { "ph": "s", "id": 250437, "pid": 76337, "tid": -914061504, "ts": 1716454225386339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443244, "dur": 7, "args": { "External id": 250443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250443, "pid": 5, "tid": 7, "ts": 1716454225443244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386367, "dur": 9, "args": { "External id": 250443, "cbid": 211, "correlation": 250443 } }, { "ph": "s", "id": 250443, "pid": 76337, "tid": -914061504, "ts": 1716454225386367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225443252, "dur": 3, "args": { "External id": 250451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250451, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 250451, "pid": 5, "tid": 7, "ts": 1716454225443252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386411, "dur": 10, "args": { "External id": 250451, "cbid": 211, "correlation": 250451 } }, { "ph": "s", "id": 250451, "pid": 76337, "tid": -914061504, "ts": 1716454225386411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225386475, "dur": 1, "args": { "External id": 250467, "cbid": 251, "correlation": 250467 } }, { "ph": "f", "id": 250467, "pid": 76337, "tid": -914061504, "ts": 1716454225386475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225386480, "dur": 0, "args": { "External id": 250469, "cbid": 251, "correlation": 250469 } }, { "ph": "f", "id": 250469, "pid": 76337, "tid": -914061504, "ts": 1716454225386480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225443256, "dur": 10, "args": { "External id": 250470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250470, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250470, "pid": 5, "tid": 7, "ts": 1716454225443256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386482, "dur": 11, "args": { "External id": 250470, "cbid": 211, "correlation": 250470 } }, { "ph": "s", "id": 250470, "pid": 76337, "tid": -914061504, "ts": 1716454225386482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225443268, "dur": 4, "args": { "External id": 250472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250472, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250472, "pid": 5, "tid": 7, "ts": 1716454225443268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386494, "dur": 5, "args": { "External id": 250472, "cbid": 211, "correlation": 250472 } }, { "ph": "s", "id": 250472, "pid": 76337, "tid": -914061504, "ts": 1716454225386494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443273, "dur": 6, "args": { "External id": 250482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250482, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250482, "pid": 5, "tid": 7, "ts": 1716454225443273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386550, "dur": 12, "args": { "External id": 250482, "cbid": 211, "correlation": 250482 } }, { "ph": "s", "id": 250482, "pid": 76337, "tid": -914061504, "ts": 1716454225386550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225443280, "dur": 10, "args": { "External id": 250502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250502, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250502, "pid": 5, "tid": 7, "ts": 1716454225443280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386615, "dur": 11, "args": { "External id": 250502, "cbid": 211, "correlation": 250502 } }, { "ph": "s", "id": 250502, "pid": 76337, "tid": -914061504, "ts": 1716454225386615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225443291, "dur": 4, "args": { "External id": 250514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250514, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250514, "pid": 5, "tid": 7, "ts": 1716454225443291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386636, "dur": 6, "args": { "External id": 250514, "cbid": 211, "correlation": 250514 } }, { "ph": "s", "id": 250514, "pid": 76337, "tid": -914061504, "ts": 1716454225386636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443296, "dur": 7, "args": { "External id": 250517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250517, "pid": 5, "tid": 7, "ts": 1716454225443296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386655, "dur": 6, "args": { "External id": 250517, "cbid": 211, "correlation": 250517 } }, { "ph": "s", "id": 250517, "pid": 76337, "tid": -914061504, "ts": 1716454225386655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225443304, "dur": 5, "args": { "External id": 250526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250526, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250526, "pid": 5, "tid": 7, "ts": 1716454225443304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386695, "dur": 10, "args": { "External id": 250526, "cbid": 211, "correlation": 250526 } }, { "ph": "s", "id": 250526, "pid": 76337, "tid": -914061504, "ts": 1716454225386695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225386758, "dur": 0, "args": { "External id": 250536, "cbid": 317, "correlation": 250536 } }, { "ph": "f", "id": 250536, "pid": 76337, "tid": -914061504, "ts": 1716454225386758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225386759, "dur": 0, "args": { "External id": 250537, "cbid": 203, "correlation": 250537 } }, { "ph": "f", "id": 250537, "pid": 76337, "tid": -914061504, "ts": 1716454225386759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225386760, "dur": 0, "args": { "External id": 250538, "cbid": 205, "correlation": 250538 } }, { "ph": "f", "id": 250538, "pid": 76337, "tid": -914061504, "ts": 1716454225386760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443310, "dur": 5, "args": { "External id": 250542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250542, "pid": 5, "tid": 7, "ts": 1716454225443310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386774, "dur": 12, "args": { "External id": 250542, "cbid": 211, "correlation": 250542 } }, { "ph": "s", "id": 250542, "pid": 76337, "tid": -914061504, "ts": 1716454225386774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443316, "dur": 166, "args": { "External id": 250544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250544, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250544, "pid": 5, "tid": 7, "ts": 1716454225443316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386788, "dur": 5, "args": { "External id": 250544, "cbid": 211, "correlation": 250544 } }, { "ph": "s", "id": 250544, "pid": 76337, "tid": -914061504, "ts": 1716454225386788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225443485, "dur": 1, "args": { "External id": 250546, "device": 5, "context": 1, "stream": 7, "correlation": 250546, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 250546, "pid": 5, "tid": 7, "ts": 1716454225443485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225386799, "dur": 6, "args": { "External id": 250546, "cbid": 51, "correlation": 250546 } }, { "ph": "s", "id": 250546, "pid": 76337, "tid": -914061504, "ts": 1716454225386799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225443488, "dur": 263, "args": { "External id": 250547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250547, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250547, "pid": 5, "tid": 7, "ts": 1716454225443488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386806, "dur": 6, "args": { "External id": 250547, "cbid": 211, "correlation": 250547 } }, { "ph": "s", "id": 250547, "pid": 76337, "tid": -914061504, "ts": 1716454225386806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443753, "dur": 6, "args": { "External id": 250549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250549, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250549, "pid": 5, "tid": 7, "ts": 1716454225443753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386816, "dur": 5, "args": { "External id": 250549, "cbid": 211, "correlation": 250549 } }, { "ph": "s", "id": 250549, "pid": 76337, "tid": -914061504, "ts": 1716454225386816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443761, "dur": 6, "args": { "External id": 250555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250555, "pid": 5, "tid": 7, "ts": 1716454225443761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386844, "dur": 9, "args": { "External id": 250555, "cbid": 211, "correlation": 250555 } }, { "ph": "s", "id": 250555, "pid": 76337, "tid": -914061504, "ts": 1716454225386844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225443768, "dur": 5, "args": { "External id": 250563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250563, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250563, "pid": 5, "tid": 7, "ts": 1716454225443768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386877, "dur": 8, "args": { "External id": 250563, "cbid": 211, "correlation": 250563 } }, { "ph": "s", "id": 250563, "pid": 76337, "tid": -914061504, "ts": 1716454225386877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225443775, "dur": 5, "args": { "External id": 250571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250571, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250571, "pid": 5, "tid": 7, "ts": 1716454225443775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225386905, "dur": 8, "args": { "External id": 250571, "cbid": 211, "correlation": 250571 } }, { "ph": "s", "id": 250571, "pid": 76337, "tid": -914061504, "ts": 1716454225386905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225443781, "dur": 10, "args": { "External id": 250591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250591, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 250591, "pid": 5, "tid": 7, "ts": 1716454225443781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387059, "dur": 14, "args": { "External id": 250591, "cbid": 211, "correlation": 250591 } }, { "ph": "s", "id": 250591, "pid": 76337, "tid": -914061504, "ts": 1716454225387059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225443792, "dur": 4, "args": { "External id": 250603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250603, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 250603, "pid": 5, "tid": 7, "ts": 1716454225443792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387084, "dur": 6, "args": { "External id": 250603, "cbid": 211, "correlation": 250603 } }, { "ph": "s", "id": 250603, "pid": 76337, "tid": -914061504, "ts": 1716454225387084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443797, "dur": 7, "args": { "External id": 250606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250606, "pid": 5, "tid": 7, "ts": 1716454225443797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387102, "dur": 7, "args": { "External id": 250606, "cbid": 211, "correlation": 250606 } }, { "ph": "s", "id": 250606, "pid": 76337, "tid": -914061504, "ts": 1716454225387102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225387162, "dur": 0, "args": { "External id": 250617, "cbid": 317, "correlation": 250617 } }, { "ph": "f", "id": 250617, "pid": 76337, "tid": -914061504, "ts": 1716454225387162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225387163, "dur": 0, "args": { "External id": 250618, "cbid": 203, "correlation": 250618 } }, { "ph": "f", "id": 250618, "pid": 76337, "tid": -914061504, "ts": 1716454225387163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225387164, "dur": 0, "args": { "External id": 250619, "cbid": 205, "correlation": 250619 } }, { "ph": "f", "id": 250619, "pid": 76337, "tid": -914061504, "ts": 1716454225387164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443805, "dur": 5, "args": { "External id": 250623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250623, "pid": 5, "tid": 7, "ts": 1716454225443805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387186, "dur": 12, "args": { "External id": 250623, "cbid": 211, "correlation": 250623 } }, { "ph": "s", "id": 250623, "pid": 76337, "tid": -914061504, "ts": 1716454225387186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225443811, "dur": 38, "args": { "External id": 250625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250625, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 250625, "pid": 5, "tid": 7, "ts": 1716454225443811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387205, "dur": 9, "args": { "External id": 250625, "cbid": 211, "correlation": 250625 } }, { "ph": "s", "id": 250625, "pid": 76337, "tid": -914061504, "ts": 1716454225387205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225443851, "dur": 5, "args": { "External id": 250627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250627, "pid": 5, "tid": 7, "ts": 1716454225443851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387218, "dur": 5, "args": { "External id": 250627, "cbid": 211, "correlation": 250627 } }, { "ph": "s", "id": 250627, "pid": 76337, "tid": -914061504, "ts": 1716454225387218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225443858, "dur": 6, "args": { "External id": 250633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250633, "pid": 5, "tid": 7, "ts": 1716454225443858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387245, "dur": 8, "args": { "External id": 250633, "cbid": 211, "correlation": 250633 } }, { "ph": "s", "id": 250633, "pid": 76337, "tid": -914061504, "ts": 1716454225387245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225443865, "dur": 21, "args": { "External id": 250642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250642, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250642, "pid": 5, "tid": 7, "ts": 1716454225443865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387329, "dur": 14, "args": { "External id": 250642, "cbid": 211, "correlation": 250642 } }, { "ph": "s", "id": 250642, "pid": 76337, "tid": -914061504, "ts": 1716454225387329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225443887, "dur": 11, "args": { "External id": 250664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250664, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 250664, "pid": 5, "tid": 7, "ts": 1716454225443887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387385, "dur": 10, "args": { "External id": 250664, "cbid": 211, "correlation": 250664 } }, { "ph": "s", "id": 250664, "pid": 76337, "tid": -914061504, "ts": 1716454225387385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387476, "dur": 2, "args": { "External id": 250675, "cbid": 251, "correlation": 250675 } }, { "ph": "f", "id": 250675, "pid": 76337, "tid": -914061504, "ts": 1716454225387476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387481, "dur": 0, "args": { "External id": 250676, "cbid": 251, "correlation": 250676 } }, { "ph": "f", "id": 250676, "pid": 76337, "tid": -914061504, "ts": 1716454225387481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225443900, "dur": 55, "args": { "External id": 250677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250677, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 250677, "pid": 5, "tid": 7, "ts": 1716454225443900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387484, "dur": 15, "args": { "External id": 250677, "cbid": 211, "correlation": 250677 } }, { "ph": "s", "id": 250677, "pid": 76337, "tid": -914061504, "ts": 1716454225387484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387555, "dur": 1, "args": { "External id": 250688, "cbid": 251, "correlation": 250688 } }, { "ph": "f", "id": 250688, "pid": 76337, "tid": -914061504, "ts": 1716454225387555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387559, "dur": 0, "args": { "External id": 250689, "cbid": 251, "correlation": 250689 } }, { "ph": "f", "id": 250689, "pid": 76337, "tid": -914061504, "ts": 1716454225387559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225443956, "dur": 53, "args": { "External id": 250690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250690, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 250690, "pid": 5, "tid": 7, "ts": 1716454225443956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387561, "dur": 13, "args": { "External id": 250690, "cbid": 211, "correlation": 250690 } }, { "ph": "s", "id": 250690, "pid": 76337, "tid": -914061504, "ts": 1716454225387561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387632, "dur": 1, "args": { "External id": 250701, "cbid": 251, "correlation": 250701 } }, { "ph": "f", "id": 250701, "pid": 76337, "tid": -914061504, "ts": 1716454225387632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387635, "dur": 0, "args": { "External id": 250702, "cbid": 251, "correlation": 250702 } }, { "ph": "f", "id": 250702, "pid": 76337, "tid": -914061504, "ts": 1716454225387635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225444011, "dur": 54, "args": { "External id": 250703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250703, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 250703, "pid": 5, "tid": 7, "ts": 1716454225444011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387637, "dur": 11, "args": { "External id": 250703, "cbid": 211, "correlation": 250703 } }, { "ph": "s", "id": 250703, "pid": 76337, "tid": -914061504, "ts": 1716454225387637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225444066, "dur": 57, "args": { "External id": 250728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250728, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250728, "pid": 5, "tid": 7, "ts": 1716454225444066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387722, "dur": 14, "args": { "External id": 250728, "cbid": 211, "correlation": 250728 } }, { "ph": "s", "id": 250728, "pid": 76337, "tid": -914061504, "ts": 1716454225387722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225387821, "dur": 1, "args": { "External id": 250746, "cbid": 251, "correlation": 250746 } }, { "ph": "f", "id": 250746, "pid": 76337, "tid": -914061504, "ts": 1716454225387821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225444124, "dur": 66, "args": { "External id": 250748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250748, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 250748, "pid": 5, "tid": 7, "ts": 1716454225444124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387827, "dur": 13, "args": { "External id": 250748, "cbid": 211, "correlation": 250748 } }, { "ph": "s", "id": 250748, "pid": 76337, "tid": -914061504, "ts": 1716454225387827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444191, "dur": 6, "args": { "External id": 250756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250756, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250756, "pid": 5, "tid": 7, "ts": 1716454225444191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387900, "dur": 12, "args": { "External id": 250756, "cbid": 211, "correlation": 250756 } }, { "ph": "s", "id": 250756, "pid": 76337, "tid": -914061504, "ts": 1716454225387900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444199, "dur": 7, "args": { "External id": 250764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250764, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250764, "pid": 5, "tid": 7, "ts": 1716454225444199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225387941, "dur": 9, "args": { "External id": 250764, "cbid": 211, "correlation": 250764 } }, { "ph": "s", "id": 250764, "pid": 76337, "tid": -914061504, "ts": 1716454225387941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444207, "dur": 8, "args": { "External id": 250775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250775, "pid": 5, "tid": 7, "ts": 1716454225444207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388023, "dur": 14, "args": { "External id": 250775, "cbid": 211, "correlation": 250775 } }, { "ph": "s", "id": 250775, "pid": 76337, "tid": -914061504, "ts": 1716454225388023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225444217, "dur": 10, "args": { "External id": 250797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250797, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 250797, "pid": 5, "tid": 7, "ts": 1716454225444217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388057, "dur": 8, "args": { "External id": 250797, "cbid": 211, "correlation": 250797 } }, { "ph": "s", "id": 250797, "pid": 76337, "tid": -914061504, "ts": 1716454225388057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388144, "dur": 2, "args": { "External id": 250808, "cbid": 251, "correlation": 250808 } }, { "ph": "f", "id": 250808, "pid": 76337, "tid": -914061504, "ts": 1716454225388144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225444228, "dur": 1, "args": { "External id": 250809, "device": 5, "context": 1, "stream": 7, "correlation": 250809, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 250809, "pid": 5, "tid": 7, "ts": 1716454225444228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225388149, "dur": 12, "args": { "External id": 250809, "cbid": 51, "correlation": 250809 } }, { "ph": "s", "id": 250809, "pid": 76337, "tid": -914061504, "ts": 1716454225388149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225444232, "dur": 37, "args": { "External id": 250810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250810, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 250810, "pid": 5, "tid": 7, "ts": 1716454225444232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388164, "dur": 13, "args": { "External id": 250810, "cbid": 211, "correlation": 250810 } }, { "ph": "s", "id": 250810, "pid": 76337, "tid": -914061504, "ts": 1716454225388164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388237, "dur": 1, "args": { "External id": 250821, "cbid": 251, "correlation": 250821 } }, { "ph": "f", "id": 250821, "pid": 76337, "tid": -914061504, "ts": 1716454225388237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388241, "dur": 0, "args": { "External id": 250822, "cbid": 251, "correlation": 250822 } }, { "ph": "f", "id": 250822, "pid": 76337, "tid": -914061504, "ts": 1716454225388241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225444270, "dur": 12, "args": { "External id": 250823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250823, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250823, "pid": 5, "tid": 7, "ts": 1716454225444270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388242, "dur": 12, "args": { "External id": 250823, "cbid": 211, "correlation": 250823 } }, { "ph": "s", "id": 250823, "pid": 76337, "tid": -914061504, "ts": 1716454225388242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225444284, "dur": 5, "args": { "External id": 250825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250825, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250825, "pid": 5, "tid": 7, "ts": 1716454225444284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388256, "dur": 6, "args": { "External id": 250825, "cbid": 211, "correlation": 250825 } }, { "ph": "s", "id": 250825, "pid": 76337, "tid": -914061504, "ts": 1716454225388256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388314, "dur": 1, "args": { "External id": 250836, "cbid": 251, "correlation": 250836 } }, { "ph": "f", "id": 250836, "pid": 76337, "tid": -914061504, "ts": 1716454225388314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388317, "dur": 0, "args": { "External id": 250837, "cbid": 251, "correlation": 250837 } }, { "ph": "f", "id": 250837, "pid": 76337, "tid": -914061504, "ts": 1716454225388317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225444290, "dur": 8, "args": { "External id": 250838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250838, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250838, "pid": 5, "tid": 7, "ts": 1716454225444290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388319, "dur": 13, "args": { "External id": 250838, "cbid": 211, "correlation": 250838 } }, { "ph": "s", "id": 250838, "pid": 76337, "tid": -914061504, "ts": 1716454225388319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225444300, "dur": 4, "args": { "External id": 250840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250840, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250840, "pid": 5, "tid": 7, "ts": 1716454225444300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388333, "dur": 5, "args": { "External id": 250840, "cbid": 211, "correlation": 250840 } }, { "ph": "s", "id": 250840, "pid": 76337, "tid": -914061504, "ts": 1716454225388333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225444305, "dur": 20, "args": { "External id": 250865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250865, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 250865, "pid": 5, "tid": 7, "ts": 1716454225444305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388411, "dur": 12, "args": { "External id": 250865, "cbid": 211, "correlation": 250865 } }, { "ph": "s", "id": 250865, "pid": 76337, "tid": -914061504, "ts": 1716454225388411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388511, "dur": 2, "args": { "External id": 250883, "cbid": 251, "correlation": 250883 } }, { "ph": "f", "id": 250883, "pid": 76337, "tid": -914061504, "ts": 1716454225388511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225444327, "dur": 1, "args": { "External id": 250885, "device": 5, "context": 1, "stream": 7, "correlation": 250885, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 250885, "pid": 5, "tid": 7, "ts": 1716454225444327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225388517, "dur": 11, "args": { "External id": 250885, "cbid": 51, "correlation": 250885 } }, { "ph": "s", "id": 250885, "pid": 76337, "tid": -914061504, "ts": 1716454225388517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225444330, "dur": 38, "args": { "External id": 250886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250886, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 250886, "pid": 5, "tid": 7, "ts": 1716454225444330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388530, "dur": 12, "args": { "External id": 250886, "cbid": 211, "correlation": 250886 } }, { "ph": "s", "id": 250886, "pid": 76337, "tid": -914061504, "ts": 1716454225388530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444370, "dur": 4, "args": { "External id": 250894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250894, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250894, "pid": 5, "tid": 7, "ts": 1716454225444370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388600, "dur": 12, "args": { "External id": 250894, "cbid": 211, "correlation": 250894 } }, { "ph": "s", "id": 250894, "pid": 76337, "tid": -914061504, "ts": 1716454225388600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444375, "dur": 8, "args": { "External id": 250902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250902, "pid": 5, "tid": 7, "ts": 1716454225444375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388642, "dur": 9, "args": { "External id": 250902, "cbid": 211, "correlation": 250902 } }, { "ph": "s", "id": 250902, "pid": 76337, "tid": -914061504, "ts": 1716454225388642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225444385, "dur": 8, "args": { "External id": 250924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250924, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 250924, "pid": 5, "tid": 7, "ts": 1716454225444385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388693, "dur": 10, "args": { "External id": 250924, "cbid": 211, "correlation": 250924 } }, { "ph": "s", "id": 250924, "pid": 76337, "tid": -914061504, "ts": 1716454225388693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388786, "dur": 1, "args": { "External id": 250940, "cbid": 251, "correlation": 250940 } }, { "ph": "f", "id": 250940, "pid": 76337, "tid": -914061504, "ts": 1716454225388786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388791, "dur": 0, "args": { "External id": 250942, "cbid": 251, "correlation": 250942 } }, { "ph": "f", "id": 250942, "pid": 76337, "tid": -914061504, "ts": 1716454225388791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225444395, "dur": 192, "args": { "External id": 250943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250943, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 250943, "pid": 5, "tid": 7, "ts": 1716454225444395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388793, "dur": 13, "args": { "External id": 250943, "cbid": 211, "correlation": 250943 } }, { "ph": "s", "id": 250943, "pid": 76337, "tid": -914061504, "ts": 1716454225388793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444589, "dur": 21, "args": { "External id": 250951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250951, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250951, "pid": 5, "tid": 7, "ts": 1716454225444589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388860, "dur": 12, "args": { "External id": 250951, "cbid": 211, "correlation": 250951 } }, { "ph": "s", "id": 250951, "pid": 76337, "tid": -914061504, "ts": 1716454225388860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444611, "dur": 22, "args": { "External id": 250959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 250959, "pid": 5, "tid": 7, "ts": 1716454225444611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388891, "dur": 8, "args": { "External id": 250959, "cbid": 211, "correlation": 250959 } }, { "ph": "s", "id": 250959, "pid": 76337, "tid": -914061504, "ts": 1716454225388891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225388971, "dur": 8, "args": { "External id": 250975, "cbid": 251, "correlation": 250975 } }, { "ph": "f", "id": 250975, "pid": 76337, "tid": -914061504, "ts": 1716454225388971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225444635, "dur": 1, "args": { "External id": 250977, "device": 5, "context": 1, "stream": 7, "correlation": 250977, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 250977, "pid": 5, "tid": 7, "ts": 1716454225444635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225388984, "dur": 9, "args": { "External id": 250977, "cbid": 51, "correlation": 250977 } }, { "ph": "s", "id": 250977, "pid": 76337, "tid": -914061504, "ts": 1716454225388984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225444639, "dur": 111, "args": { "External id": 250978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250978, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 250978, "pid": 5, "tid": 7, "ts": 1716454225444639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225388994, "dur": 12, "args": { "External id": 250978, "cbid": 211, "correlation": 250978 } }, { "ph": "s", "id": 250978, "pid": 76337, "tid": -914061504, "ts": 1716454225388994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444752, "dur": 5, "args": { "External id": 250986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250986, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250986, "pid": 5, "tid": 7, "ts": 1716454225444752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389038, "dur": 10, "args": { "External id": 250986, "cbid": 211, "correlation": 250986 } }, { "ph": "s", "id": 250986, "pid": 76337, "tid": -914061504, "ts": 1716454225389038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444759, "dur": 10, "args": { "External id": 250997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 250997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 250997, "pid": 5, "tid": 7, "ts": 1716454225444759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389107, "dur": 12, "args": { "External id": 250997, "cbid": 211, "correlation": 250997 } }, { "ph": "s", "id": 250997, "pid": 76337, "tid": -914061504, "ts": 1716454225389107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225389170, "dur": 0, "args": { "External id": 251009, "cbid": 317, "correlation": 251009 } }, { "ph": "f", "id": 251009, "pid": 76337, "tid": -914061504, "ts": 1716454225389170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225389171, "dur": 0, "args": { "External id": 251010, "cbid": 203, "correlation": 251010 } }, { "ph": "f", "id": 251010, "pid": 76337, "tid": -914061504, "ts": 1716454225389171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225389172, "dur": 0, "args": { "External id": 251011, "cbid": 205, "correlation": 251011 } }, { "ph": "f", "id": 251011, "pid": 76337, "tid": -914061504, "ts": 1716454225389172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225444770, "dur": 6, "args": { "External id": 251015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251015, "pid": 5, "tid": 7, "ts": 1716454225444770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389188, "dur": 12, "args": { "External id": 251015, "cbid": 211, "correlation": 251015 } }, { "ph": "s", "id": 251015, "pid": 76337, "tid": -914061504, "ts": 1716454225389188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225444777, "dur": 38, "args": { "External id": 251017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251017, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 251017, "pid": 5, "tid": 7, "ts": 1716454225444777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389206, "dur": 7, "args": { "External id": 251017, "cbid": 211, "correlation": 251017 } }, { "ph": "s", "id": 251017, "pid": 76337, "tid": -914061504, "ts": 1716454225389206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225444816, "dur": 6, "args": { "External id": 251019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251019, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251019, "pid": 5, "tid": 7, "ts": 1716454225444816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389217, "dur": 5, "args": { "External id": 251019, "cbid": 211, "correlation": 251019 } }, { "ph": "s", "id": 251019, "pid": 76337, "tid": -914061504, "ts": 1716454225389217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444824, "dur": 7, "args": { "External id": 251025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251025, "pid": 5, "tid": 7, "ts": 1716454225444824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389244, "dur": 8, "args": { "External id": 251025, "cbid": 211, "correlation": 251025 } }, { "ph": "s", "id": 251025, "pid": 76337, "tid": -914061504, "ts": 1716454225389244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444832, "dur": 5, "args": { "External id": 251033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251033, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251033, "pid": 5, "tid": 7, "ts": 1716454225444832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389275, "dur": 9, "args": { "External id": 251033, "cbid": 211, "correlation": 251033 } }, { "ph": "s", "id": 251033, "pid": 76337, "tid": -914061504, "ts": 1716454225389275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225444839, "dur": 11, "args": { "External id": 251053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251053, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251053, "pid": 5, "tid": 7, "ts": 1716454225444839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389348, "dur": 11, "args": { "External id": 251053, "cbid": 211, "correlation": 251053 } }, { "ph": "s", "id": 251053, "pid": 76337, "tid": -914061504, "ts": 1716454225389348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225444850, "dur": 4, "args": { "External id": 251065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251065, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 251065, "pid": 5, "tid": 7, "ts": 1716454225444850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389369, "dur": 6, "args": { "External id": 251065, "cbid": 211, "correlation": 251065 } }, { "ph": "s", "id": 251065, "pid": 76337, "tid": -914061504, "ts": 1716454225389369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225444856, "dur": 8, "args": { "External id": 251068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251068, "pid": 5, "tid": 7, "ts": 1716454225444856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389388, "dur": 6, "args": { "External id": 251068, "cbid": 211, "correlation": 251068 } }, { "ph": "s", "id": 251068, "pid": 76337, "tid": -914061504, "ts": 1716454225389388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225444866, "dur": 5, "args": { "External id": 251077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251077, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251077, "pid": 5, "tid": 7, "ts": 1716454225444866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389425, "dur": 11, "args": { "External id": 251077, "cbid": 211, "correlation": 251077 } }, { "ph": "s", "id": 251077, "pid": 76337, "tid": -914061504, "ts": 1716454225389425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225389477, "dur": 0, "args": { "External id": 251087, "cbid": 317, "correlation": 251087 } }, { "ph": "f", "id": 251087, "pid": 76337, "tid": -914061504, "ts": 1716454225389477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225389478, "dur": 0, "args": { "External id": 251088, "cbid": 203, "correlation": 251088 } }, { "ph": "f", "id": 251088, "pid": 76337, "tid": -914061504, "ts": 1716454225389478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225389479, "dur": 0, "args": { "External id": 251089, "cbid": 205, "correlation": 251089 } }, { "ph": "f", "id": 251089, "pid": 76337, "tid": -914061504, "ts": 1716454225389479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225444872, "dur": 5, "args": { "External id": 251093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251093, "pid": 5, "tid": 7, "ts": 1716454225444872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389492, "dur": 11, "args": { "External id": 251093, "cbid": 211, "correlation": 251093 } }, { "ph": "s", "id": 251093, "pid": 76337, "tid": -914061504, "ts": 1716454225389492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225444879, "dur": 165, "args": { "External id": 251095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251095, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251095, "pid": 5, "tid": 7, "ts": 1716454225444879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389506, "dur": 5, "args": { "External id": 251095, "cbid": 211, "correlation": 251095 } }, { "ph": "s", "id": 251095, "pid": 76337, "tid": -914061504, "ts": 1716454225389506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225445046, "dur": 1, "args": { "External id": 251097, "device": 5, "context": 1, "stream": 7, "correlation": 251097, "bytes": 240, "memory bandwidth (GB/s)": 0.15315890236119975 } }, { "ph": "f", "id": 251097, "pid": 5, "tid": 7, "ts": 1716454225445046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225389516, "dur": 6, "args": { "External id": 251097, "cbid": 51, "correlation": 251097 } }, { "ph": "s", "id": 251097, "pid": 76337, "tid": -914061504, "ts": 1716454225389516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225445050, "dur": 275, "args": { "External id": 251098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251098, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251098, "pid": 5, "tid": 7, "ts": 1716454225445050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389523, "dur": 7, "args": { "External id": 251098, "cbid": 211, "correlation": 251098 } }, { "ph": "s", "id": 251098, "pid": 76337, "tid": -914061504, "ts": 1716454225389523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445326, "dur": 6, "args": { "External id": 251100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251100, "pid": 5, "tid": 7, "ts": 1716454225445326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389534, "dur": 5, "args": { "External id": 251100, "cbid": 211, "correlation": 251100 } }, { "ph": "s", "id": 251100, "pid": 76337, "tid": -914061504, "ts": 1716454225389534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225445333, "dur": 6, "args": { "External id": 251106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251106, "pid": 5, "tid": 7, "ts": 1716454225445333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389561, "dur": 8, "args": { "External id": 251106, "cbid": 211, "correlation": 251106 } }, { "ph": "s", "id": 251106, "pid": 76337, "tid": -914061504, "ts": 1716454225389561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225445341, "dur": 3, "args": { "External id": 251114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251114, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 251114, "pid": 5, "tid": 7, "ts": 1716454225445341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389605, "dur": 9, "args": { "External id": 251114, "cbid": 211, "correlation": 251114 } }, { "ph": "s", "id": 251114, "pid": 76337, "tid": -914061504, "ts": 1716454225389605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225389669, "dur": 1, "args": { "External id": 251130, "cbid": 251, "correlation": 251130 } }, { "ph": "f", "id": 251130, "pid": 76337, "tid": -914061504, "ts": 1716454225389669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225389674, "dur": 0, "args": { "External id": 251132, "cbid": 251, "correlation": 251132 } }, { "ph": "f", "id": 251132, "pid": 76337, "tid": -914061504, "ts": 1716454225389674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225445345, "dur": 13, "args": { "External id": 251133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251133, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251133, "pid": 5, "tid": 7, "ts": 1716454225445345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389676, "dur": 11, "args": { "External id": 251133, "cbid": 211, "correlation": 251133 } }, { "ph": "s", "id": 251133, "pid": 76337, "tid": -914061504, "ts": 1716454225389676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225445359, "dur": 5, "args": { "External id": 251135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251135, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251135, "pid": 5, "tid": 7, "ts": 1716454225445359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389689, "dur": 5, "args": { "External id": 251135, "cbid": 211, "correlation": 251135 } }, { "ph": "s", "id": 251135, "pid": 76337, "tid": -914061504, "ts": 1716454225389689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225445366, "dur": 6, "args": { "External id": 251145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251145, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251145, "pid": 5, "tid": 7, "ts": 1716454225445366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389746, "dur": 12, "args": { "External id": 251145, "cbid": 211, "correlation": 251145 } }, { "ph": "s", "id": 251145, "pid": 76337, "tid": -914061504, "ts": 1716454225389746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225445373, "dur": 10, "args": { "External id": 251165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251165, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251165, "pid": 5, "tid": 7, "ts": 1716454225445373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389812, "dur": 10, "args": { "External id": 251165, "cbid": 211, "correlation": 251165 } }, { "ph": "s", "id": 251165, "pid": 76337, "tid": -914061504, "ts": 1716454225389812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225445384, "dur": 4, "args": { "External id": 251177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251177, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 251177, "pid": 5, "tid": 7, "ts": 1716454225445384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389832, "dur": 7, "args": { "External id": 251177, "cbid": 211, "correlation": 251177 } }, { "ph": "s", "id": 251177, "pid": 76337, "tid": -914061504, "ts": 1716454225389832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225445389, "dur": 7, "args": { "External id": 251180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251180, "pid": 5, "tid": 7, "ts": 1716454225445389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389851, "dur": 6, "args": { "External id": 251180, "cbid": 211, "correlation": 251180 } }, { "ph": "s", "id": 251180, "pid": 76337, "tid": -914061504, "ts": 1716454225389851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225445397, "dur": 5, "args": { "External id": 251189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251189, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251189, "pid": 5, "tid": 7, "ts": 1716454225445397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389892, "dur": 9, "args": { "External id": 251189, "cbid": 211, "correlation": 251189 } }, { "ph": "s", "id": 251189, "pid": 76337, "tid": -914061504, "ts": 1716454225389892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225389953, "dur": 0, "args": { "External id": 251199, "cbid": 317, "correlation": 251199 } }, { "ph": "f", "id": 251199, "pid": 76337, "tid": -914061504, "ts": 1716454225389953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225389953, "dur": 0, "args": { "External id": 251200, "cbid": 203, "correlation": 251200 } }, { "ph": "f", "id": 251200, "pid": 76337, "tid": -914061504, "ts": 1716454225389953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225389954, "dur": 0, "args": { "External id": 251201, "cbid": 205, "correlation": 251201 } }, { "ph": "f", "id": 251201, "pid": 76337, "tid": -914061504, "ts": 1716454225389954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445403, "dur": 5, "args": { "External id": 251205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251205, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251205, "pid": 5, "tid": 7, "ts": 1716454225445403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389968, "dur": 21, "args": { "External id": 251205, "cbid": 211, "correlation": 251205 } }, { "ph": "s", "id": 251205, "pid": 76337, "tid": -914061504, "ts": 1716454225389968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445409, "dur": 166, "args": { "External id": 251207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251207, "pid": 5, "tid": 7, "ts": 1716454225445409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225389991, "dur": 5, "args": { "External id": 251207, "cbid": 211, "correlation": 251207 } }, { "ph": "s", "id": 251207, "pid": 76337, "tid": -914061504, "ts": 1716454225389991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225445578, "dur": 1, "args": { "External id": 251209, "device": 5, "context": 1, "stream": 7, "correlation": 251209, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 251209, "pid": 5, "tid": 7, "ts": 1716454225445578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225390002, "dur": 6, "args": { "External id": 251209, "cbid": 51, "correlation": 251209 } }, { "ph": "s", "id": 251209, "pid": 76337, "tid": -914061504, "ts": 1716454225390002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225445581, "dur": 264, "args": { "External id": 251210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251210, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251210, "pid": 5, "tid": 7, "ts": 1716454225445581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390009, "dur": 6, "args": { "External id": 251210, "cbid": 211, "correlation": 251210 } }, { "ph": "s", "id": 251210, "pid": 76337, "tid": -914061504, "ts": 1716454225390009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445847, "dur": 6, "args": { "External id": 251212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251212, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251212, "pid": 5, "tid": 7, "ts": 1716454225445847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390019, "dur": 5, "args": { "External id": 251212, "cbid": 211, "correlation": 251212 } }, { "ph": "s", "id": 251212, "pid": 76337, "tid": -914061504, "ts": 1716454225390019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225445854, "dur": 6, "args": { "External id": 251218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251218, "pid": 5, "tid": 7, "ts": 1716454225445854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390048, "dur": 9, "args": { "External id": 251218, "cbid": 211, "correlation": 251218 } }, { "ph": "s", "id": 251218, "pid": 76337, "tid": -914061504, "ts": 1716454225390048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225445862, "dur": 5, "args": { "External id": 251226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251226, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251226, "pid": 5, "tid": 7, "ts": 1716454225445862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390081, "dur": 9, "args": { "External id": 251226, "cbid": 211, "correlation": 251226 } }, { "ph": "s", "id": 251226, "pid": 76337, "tid": -914061504, "ts": 1716454225390081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225445867, "dur": 5, "args": { "External id": 251234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251234, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251234, "pid": 5, "tid": 7, "ts": 1716454225445867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390111, "dur": 8, "args": { "External id": 251234, "cbid": 211, "correlation": 251234 } }, { "ph": "s", "id": 251234, "pid": 76337, "tid": -914061504, "ts": 1716454225390111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225445873, "dur": 12, "args": { "External id": 251243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251243, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251243, "pid": 5, "tid": 7, "ts": 1716454225445873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390199, "dur": 13, "args": { "External id": 251243, "cbid": 211, "correlation": 251243 } }, { "ph": "s", "id": 251243, "pid": 76337, "tid": -914061504, "ts": 1716454225390199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225445886, "dur": 12, "args": { "External id": 251263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251263, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251263, "pid": 5, "tid": 7, "ts": 1716454225445886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390265, "dur": 11, "args": { "External id": 251263, "cbid": 211, "correlation": 251263 } }, { "ph": "s", "id": 251263, "pid": 76337, "tid": -914061504, "ts": 1716454225390265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225445900, "dur": 4, "args": { "External id": 251275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251275, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251275, "pid": 5, "tid": 7, "ts": 1716454225445900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390290, "dur": 7, "args": { "External id": 251275, "cbid": 211, "correlation": 251275 } }, { "ph": "s", "id": 251275, "pid": 76337, "tid": -914061504, "ts": 1716454225390290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225445906, "dur": 10, "args": { "External id": 251278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251278, "pid": 5, "tid": 7, "ts": 1716454225445906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390309, "dur": 6, "args": { "External id": 251278, "cbid": 211, "correlation": 251278 } }, { "ph": "s", "id": 251278, "pid": 76337, "tid": -914061504, "ts": 1716454225390309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225445917, "dur": 6, "args": { "External id": 251287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251287, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251287, "pid": 5, "tid": 7, "ts": 1716454225445917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390349, "dur": 10, "args": { "External id": 251287, "cbid": 211, "correlation": 251287 } }, { "ph": "s", "id": 251287, "pid": 76337, "tid": -914061504, "ts": 1716454225390349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225390405, "dur": 0, "args": { "External id": 251297, "cbid": 317, "correlation": 251297 } }, { "ph": "f", "id": 251297, "pid": 76337, "tid": -914061504, "ts": 1716454225390405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225390406, "dur": 0, "args": { "External id": 251298, "cbid": 203, "correlation": 251298 } }, { "ph": "f", "id": 251298, "pid": 76337, "tid": -914061504, "ts": 1716454225390406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225390407, "dur": 0, "args": { "External id": 251299, "cbid": 205, "correlation": 251299 } }, { "ph": "f", "id": 251299, "pid": 76337, "tid": -914061504, "ts": 1716454225390407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445925, "dur": 7, "args": { "External id": 251303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251303, "pid": 5, "tid": 7, "ts": 1716454225445925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390422, "dur": 12, "args": { "External id": 251303, "cbid": 211, "correlation": 251303 } }, { "ph": "s", "id": 251303, "pid": 76337, "tid": -914061504, "ts": 1716454225390422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225445933, "dur": 325, "args": { "External id": 251305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251305, "pid": 5, "tid": 7, "ts": 1716454225445933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390437, "dur": 6, "args": { "External id": 251305, "cbid": 211, "correlation": 251305 } }, { "ph": "s", "id": 251305, "pid": 76337, "tid": -914061504, "ts": 1716454225390437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225446260, "dur": 1, "args": { "External id": 251307, "device": 5, "context": 1, "stream": 7, "correlation": 251307, "bytes": 240, "memory bandwidth (GB/s)": 0.1561483409238777 } }, { "ph": "f", "id": 251307, "pid": 5, "tid": 7, "ts": 1716454225446260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225390449, "dur": 7, "args": { "External id": 251307, "cbid": 51, "correlation": 251307 } }, { "ph": "s", "id": 251307, "pid": 76337, "tid": -914061504, "ts": 1716454225390449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225446264, "dur": 503, "args": { "External id": 251308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251308, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251308, "pid": 5, "tid": 7, "ts": 1716454225446264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390457, "dur": 6, "args": { "External id": 251308, "cbid": 211, "correlation": 251308 } }, { "ph": "s", "id": 251308, "pid": 76337, "tid": -914061504, "ts": 1716454225390457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225446768, "dur": 5, "args": { "External id": 251310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251310, "pid": 5, "tid": 7, "ts": 1716454225446768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390468, "dur": 5, "args": { "External id": 251310, "cbid": 211, "correlation": 251310 } }, { "ph": "s", "id": 251310, "pid": 76337, "tid": -914061504, "ts": 1716454225390468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225446775, "dur": 6, "args": { "External id": 251316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251316, "pid": 5, "tid": 7, "ts": 1716454225446775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390498, "dur": 8, "args": { "External id": 251316, "cbid": 211, "correlation": 251316 } }, { "ph": "s", "id": 251316, "pid": 76337, "tid": -914061504, "ts": 1716454225390498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225446783, "dur": 3, "args": { "External id": 251324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251324, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 251324, "pid": 5, "tid": 7, "ts": 1716454225446783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390541, "dur": 9, "args": { "External id": 251324, "cbid": 211, "correlation": 251324 } }, { "ph": "s", "id": 251324, "pid": 76337, "tid": -914061504, "ts": 1716454225390541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225390604, "dur": 1, "args": { "External id": 251340, "cbid": 251, "correlation": 251340 } }, { "ph": "f", "id": 251340, "pid": 76337, "tid": -914061504, "ts": 1716454225390604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225390609, "dur": 0, "args": { "External id": 251342, "cbid": 251, "correlation": 251342 } }, { "ph": "f", "id": 251342, "pid": 76337, "tid": -914061504, "ts": 1716454225390609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225446787, "dur": 11, "args": { "External id": 251343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251343, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251343, "pid": 5, "tid": 7, "ts": 1716454225446787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390612, "dur": 11, "args": { "External id": 251343, "cbid": 211, "correlation": 251343 } }, { "ph": "s", "id": 251343, "pid": 76337, "tid": -914061504, "ts": 1716454225390612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225446800, "dur": 5, "args": { "External id": 251345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251345, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251345, "pid": 5, "tid": 7, "ts": 1716454225446800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390624, "dur": 5, "args": { "External id": 251345, "cbid": 211, "correlation": 251345 } }, { "ph": "s", "id": 251345, "pid": 76337, "tid": -914061504, "ts": 1716454225390624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225446806, "dur": 6, "args": { "External id": 251355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251355, "pid": 5, "tid": 7, "ts": 1716454225446806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390680, "dur": 13, "args": { "External id": 251355, "cbid": 211, "correlation": 251355 } }, { "ph": "s", "id": 251355, "pid": 76337, "tid": -914061504, "ts": 1716454225390680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225446813, "dur": 10, "args": { "External id": 251375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251375, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251375, "pid": 5, "tid": 7, "ts": 1716454225446813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390748, "dur": 10, "args": { "External id": 251375, "cbid": 211, "correlation": 251375 } }, { "ph": "s", "id": 251375, "pid": 76337, "tid": -914061504, "ts": 1716454225390748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225446824, "dur": 4, "args": { "External id": 251387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251387, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 251387, "pid": 5, "tid": 7, "ts": 1716454225446824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390769, "dur": 6, "args": { "External id": 251387, "cbid": 211, "correlation": 251387 } }, { "ph": "s", "id": 251387, "pid": 76337, "tid": -914061504, "ts": 1716454225390769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225446829, "dur": 7, "args": { "External id": 251390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251390, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251390, "pid": 5, "tid": 7, "ts": 1716454225446829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390787, "dur": 7, "args": { "External id": 251390, "cbid": 211, "correlation": 251390 } }, { "ph": "s", "id": 251390, "pid": 76337, "tid": -914061504, "ts": 1716454225390787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225446837, "dur": 5, "args": { "External id": 251399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251399, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251399, "pid": 5, "tid": 7, "ts": 1716454225446837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390827, "dur": 10, "args": { "External id": 251399, "cbid": 211, "correlation": 251399 } }, { "ph": "s", "id": 251399, "pid": 76337, "tid": -914061504, "ts": 1716454225390827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225390890, "dur": 0, "args": { "External id": 251409, "cbid": 317, "correlation": 251409 } }, { "ph": "f", "id": 251409, "pid": 76337, "tid": -914061504, "ts": 1716454225390890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225390891, "dur": 0, "args": { "External id": 251410, "cbid": 203, "correlation": 251410 } }, { "ph": "f", "id": 251410, "pid": 76337, "tid": -914061504, "ts": 1716454225390891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225390892, "dur": 0, "args": { "External id": 251411, "cbid": 205, "correlation": 251411 } }, { "ph": "f", "id": 251411, "pid": 76337, "tid": -914061504, "ts": 1716454225390892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225446843, "dur": 5, "args": { "External id": 251415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251415, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251415, "pid": 5, "tid": 7, "ts": 1716454225446843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390907, "dur": 12, "args": { "External id": 251415, "cbid": 211, "correlation": 251415 } }, { "ph": "s", "id": 251415, "pid": 76337, "tid": -914061504, "ts": 1716454225390907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225446850, "dur": 166, "args": { "External id": 251417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251417, "pid": 5, "tid": 7, "ts": 1716454225446850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390921, "dur": 5, "args": { "External id": 251417, "cbid": 211, "correlation": 251417 } }, { "ph": "s", "id": 251417, "pid": 76337, "tid": -914061504, "ts": 1716454225390921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225447018, "dur": 1, "args": { "External id": 251419, "device": 5, "context": 1, "stream": 7, "correlation": 251419, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 251419, "pid": 5, "tid": 7, "ts": 1716454225447018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225390932, "dur": 6, "args": { "External id": 251419, "cbid": 51, "correlation": 251419 } }, { "ph": "s", "id": 251419, "pid": 76337, "tid": -914061504, "ts": 1716454225390932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225447022, "dur": 263, "args": { "External id": 251420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251420, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251420, "pid": 5, "tid": 7, "ts": 1716454225447022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390939, "dur": 6, "args": { "External id": 251420, "cbid": 211, "correlation": 251420 } }, { "ph": "s", "id": 251420, "pid": 76337, "tid": -914061504, "ts": 1716454225390939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225447286, "dur": 6, "args": { "External id": 251422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251422, "pid": 5, "tid": 7, "ts": 1716454225447286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390949, "dur": 5, "args": { "External id": 251422, "cbid": 211, "correlation": 251422 } }, { "ph": "s", "id": 251422, "pid": 76337, "tid": -914061504, "ts": 1716454225390949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225447293, "dur": 6, "args": { "External id": 251428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251428, "pid": 5, "tid": 7, "ts": 1716454225447293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225390985, "dur": 9, "args": { "External id": 251428, "cbid": 211, "correlation": 251428 } }, { "ph": "s", "id": 251428, "pid": 76337, "tid": -914061504, "ts": 1716454225390985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225391044, "dur": 0, "args": { "External id": 251438, "cbid": 317, "correlation": 251438 } }, { "ph": "f", "id": 251438, "pid": 76337, "tid": -914061504, "ts": 1716454225391044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225391045, "dur": 0, "args": { "External id": 251439, "cbid": 203, "correlation": 251439 } }, { "ph": "f", "id": 251439, "pid": 76337, "tid": -914061504, "ts": 1716454225391045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225391046, "dur": 0, "args": { "External id": 251440, "cbid": 205, "correlation": 251440 } }, { "ph": "f", "id": 251440, "pid": 76337, "tid": -914061504, "ts": 1716454225391046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225447300, "dur": 8, "args": { "External id": 251444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251444, "pid": 5, "tid": 7, "ts": 1716454225447300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391061, "dur": 12, "args": { "External id": 251444, "cbid": 211, "correlation": 251444 } }, { "ph": "s", "id": 251444, "pid": 76337, "tid": -914061504, "ts": 1716454225391061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225447309, "dur": 3, "args": { "External id": 251446, "device": 5, "context": 1, "stream": 7, "correlation": 251446, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 251446, "pid": 5, "tid": 7, "ts": 1716454225447309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225391079, "dur": 15, "args": { "External id": 251446, "cbid": 51, "correlation": 251446 } }, { "ph": "s", "id": 251446, "pid": 76337, "tid": -914061504, "ts": 1716454225391079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225447314, "dur": 97, "args": { "External id": 251447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251447, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 251447, "pid": 5, "tid": 7, "ts": 1716454225447314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391095, "dur": 7, "args": { "External id": 251447, "cbid": 211, "correlation": 251447 } }, { "ph": "s", "id": 251447, "pid": 76337, "tid": -914061504, "ts": 1716454225391095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225447412, "dur": 6, "args": { "External id": 251449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251449, "pid": 5, "tid": 7, "ts": 1716454225447412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391106, "dur": 5, "args": { "External id": 251449, "cbid": 211, "correlation": 251449 } }, { "ph": "s", "id": 251449, "pid": 76337, "tid": -914061504, "ts": 1716454225391106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225447419, "dur": 6, "args": { "External id": 251455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251455, "pid": 5, "tid": 7, "ts": 1716454225447419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391133, "dur": 8, "args": { "External id": 251455, "cbid": 211, "correlation": 251455 } }, { "ph": "s", "id": 251455, "pid": 76337, "tid": -914061504, "ts": 1716454225391133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225447427, "dur": 5, "args": { "External id": 251463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251463, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251463, "pid": 5, "tid": 7, "ts": 1716454225447427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391162, "dur": 8, "args": { "External id": 251463, "cbid": 211, "correlation": 251463 } }, { "ph": "s", "id": 251463, "pid": 76337, "tid": -914061504, "ts": 1716454225391162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225447433, "dur": 4, "args": { "External id": 251471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251471, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251471, "pid": 5, "tid": 7, "ts": 1716454225447433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391190, "dur": 9, "args": { "External id": 251471, "cbid": 211, "correlation": 251471 } }, { "ph": "s", "id": 251471, "pid": 76337, "tid": -914061504, "ts": 1716454225391190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225447438, "dur": 11, "args": { "External id": 251480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251480, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251480, "pid": 5, "tid": 7, "ts": 1716454225447438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391235, "dur": 10, "args": { "External id": 251480, "cbid": 211, "correlation": 251480 } }, { "ph": "s", "id": 251480, "pid": 76337, "tid": -914061504, "ts": 1716454225391235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225447451, "dur": 12, "args": { "External id": 251500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251500, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251500, "pid": 5, "tid": 7, "ts": 1716454225447451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391305, "dur": 11, "args": { "External id": 251500, "cbid": 211, "correlation": 251500 } }, { "ph": "s", "id": 251500, "pid": 76337, "tid": -914061504, "ts": 1716454225391305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225447465, "dur": 4, "args": { "External id": 251512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251512, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251512, "pid": 5, "tid": 7, "ts": 1716454225447465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391329, "dur": 6, "args": { "External id": 251512, "cbid": 211, "correlation": 251512 } }, { "ph": "s", "id": 251512, "pid": 76337, "tid": -914061504, "ts": 1716454225391329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225447470, "dur": 11, "args": { "External id": 251515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251515, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251515, "pid": 5, "tid": 7, "ts": 1716454225447470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391347, "dur": 8, "args": { "External id": 251515, "cbid": 211, "correlation": 251515 } }, { "ph": "s", "id": 251515, "pid": 76337, "tid": -914061504, "ts": 1716454225391347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225447482, "dur": 7, "args": { "External id": 251524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251524, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251524, "pid": 5, "tid": 7, "ts": 1716454225447482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391387, "dur": 10, "args": { "External id": 251524, "cbid": 211, "correlation": 251524 } }, { "ph": "s", "id": 251524, "pid": 76337, "tid": -914061504, "ts": 1716454225391387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225391440, "dur": 0, "args": { "External id": 251534, "cbid": 317, "correlation": 251534 } }, { "ph": "f", "id": 251534, "pid": 76337, "tid": -914061504, "ts": 1716454225391440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225391441, "dur": 0, "args": { "External id": 251535, "cbid": 203, "correlation": 251535 } }, { "ph": "f", "id": 251535, "pid": 76337, "tid": -914061504, "ts": 1716454225391441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225391441, "dur": 0, "args": { "External id": 251536, "cbid": 205, "correlation": 251536 } }, { "ph": "f", "id": 251536, "pid": 76337, "tid": -914061504, "ts": 1716454225391441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225447491, "dur": 7, "args": { "External id": 251540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251540, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251540, "pid": 5, "tid": 7, "ts": 1716454225447491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391456, "dur": 12, "args": { "External id": 251540, "cbid": 211, "correlation": 251540 } }, { "ph": "s", "id": 251540, "pid": 76337, "tid": -914061504, "ts": 1716454225391456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225447499, "dur": 326, "args": { "External id": 251542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251542, "pid": 5, "tid": 7, "ts": 1716454225447499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391471, "dur": 5, "args": { "External id": 251542, "cbid": 211, "correlation": 251542 } }, { "ph": "s", "id": 251542, "pid": 76337, "tid": -914061504, "ts": 1716454225391471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225447826, "dur": 1, "args": { "External id": 251544, "device": 5, "context": 1, "stream": 7, "correlation": 251544, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 251544, "pid": 5, "tid": 7, "ts": 1716454225447826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225391481, "dur": 6, "args": { "External id": 251544, "cbid": 51, "correlation": 251544 } }, { "ph": "s", "id": 251544, "pid": 76337, "tid": -914061504, "ts": 1716454225391481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225447830, "dur": 505, "args": { "External id": 251545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251545, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251545, "pid": 5, "tid": 7, "ts": 1716454225447830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391488, "dur": 6, "args": { "External id": 251545, "cbid": 211, "correlation": 251545 } }, { "ph": "s", "id": 251545, "pid": 76337, "tid": -914061504, "ts": 1716454225391488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448337, "dur": 6, "args": { "External id": 251547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251547, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251547, "pid": 5, "tid": 7, "ts": 1716454225448337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391498, "dur": 6, "args": { "External id": 251547, "cbid": 211, "correlation": 251547 } }, { "ph": "s", "id": 251547, "pid": 76337, "tid": -914061504, "ts": 1716454225391498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225448344, "dur": 6, "args": { "External id": 251553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251553, "pid": 5, "tid": 7, "ts": 1716454225448344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391527, "dur": 9, "args": { "External id": 251553, "cbid": 211, "correlation": 251553 } }, { "ph": "s", "id": 251553, "pid": 76337, "tid": -914061504, "ts": 1716454225391527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225448352, "dur": 3, "args": { "External id": 251561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251561, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 251561, "pid": 5, "tid": 7, "ts": 1716454225448352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391571, "dur": 10, "args": { "External id": 251561, "cbid": 211, "correlation": 251561 } }, { "ph": "s", "id": 251561, "pid": 76337, "tid": -914061504, "ts": 1716454225391571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225391635, "dur": 1, "args": { "External id": 251577, "cbid": 251, "correlation": 251577 } }, { "ph": "f", "id": 251577, "pid": 76337, "tid": -914061504, "ts": 1716454225391635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225391641, "dur": 0, "args": { "External id": 251579, "cbid": 251, "correlation": 251579 } }, { "ph": "f", "id": 251579, "pid": 76337, "tid": -914061504, "ts": 1716454225391641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225448356, "dur": 13, "args": { "External id": 251580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251580, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251580, "pid": 5, "tid": 7, "ts": 1716454225448356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391643, "dur": 12, "args": { "External id": 251580, "cbid": 211, "correlation": 251580 } }, { "ph": "s", "id": 251580, "pid": 76337, "tid": -914061504, "ts": 1716454225391643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225448371, "dur": 5, "args": { "External id": 251582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251582, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251582, "pid": 5, "tid": 7, "ts": 1716454225448371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391657, "dur": 5, "args": { "External id": 251582, "cbid": 211, "correlation": 251582 } }, { "ph": "s", "id": 251582, "pid": 76337, "tid": -914061504, "ts": 1716454225391657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225448377, "dur": 6, "args": { "External id": 251592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251592, "pid": 5, "tid": 7, "ts": 1716454225448377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391714, "dur": 13, "args": { "External id": 251592, "cbid": 211, "correlation": 251592 } }, { "ph": "s", "id": 251592, "pid": 76337, "tid": -914061504, "ts": 1716454225391714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225448384, "dur": 10, "args": { "External id": 251612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251612, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251612, "pid": 5, "tid": 7, "ts": 1716454225448384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391780, "dur": 11, "args": { "External id": 251612, "cbid": 211, "correlation": 251612 } }, { "ph": "s", "id": 251612, "pid": 76337, "tid": -914061504, "ts": 1716454225391780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225448395, "dur": 4, "args": { "External id": 251624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251624, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 251624, "pid": 5, "tid": 7, "ts": 1716454225448395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391801, "dur": 6, "args": { "External id": 251624, "cbid": 211, "correlation": 251624 } }, { "ph": "s", "id": 251624, "pid": 76337, "tid": -914061504, "ts": 1716454225391801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225448400, "dur": 7, "args": { "External id": 251627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251627, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251627, "pid": 5, "tid": 7, "ts": 1716454225448400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391820, "dur": 6, "args": { "External id": 251627, "cbid": 211, "correlation": 251627 } }, { "ph": "s", "id": 251627, "pid": 76337, "tid": -914061504, "ts": 1716454225391820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225448408, "dur": 5, "args": { "External id": 251636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251636, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251636, "pid": 5, "tid": 7, "ts": 1716454225448408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391861, "dur": 9, "args": { "External id": 251636, "cbid": 211, "correlation": 251636 } }, { "ph": "s", "id": 251636, "pid": 76337, "tid": -914061504, "ts": 1716454225391861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225391923, "dur": 0, "args": { "External id": 251646, "cbid": 317, "correlation": 251646 } }, { "ph": "f", "id": 251646, "pid": 76337, "tid": -914061504, "ts": 1716454225391923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225391924, "dur": 0, "args": { "External id": 251647, "cbid": 203, "correlation": 251647 } }, { "ph": "f", "id": 251647, "pid": 76337, "tid": -914061504, "ts": 1716454225391924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225391925, "dur": 0, "args": { "External id": 251648, "cbid": 205, "correlation": 251648 } }, { "ph": "f", "id": 251648, "pid": 76337, "tid": -914061504, "ts": 1716454225391925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448415, "dur": 5, "args": { "External id": 251652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251652, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251652, "pid": 5, "tid": 7, "ts": 1716454225448415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391938, "dur": 12, "args": { "External id": 251652, "cbid": 211, "correlation": 251652 } }, { "ph": "s", "id": 251652, "pid": 76337, "tid": -914061504, "ts": 1716454225391938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448421, "dur": 165, "args": { "External id": 251654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251654, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251654, "pid": 5, "tid": 7, "ts": 1716454225448421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391953, "dur": 6, "args": { "External id": 251654, "cbid": 211, "correlation": 251654 } }, { "ph": "s", "id": 251654, "pid": 76337, "tid": -914061504, "ts": 1716454225391953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225448588, "dur": 1, "args": { "External id": 251656, "device": 5, "context": 1, "stream": 7, "correlation": 251656, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 251656, "pid": 5, "tid": 7, "ts": 1716454225448588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225391964, "dur": 7, "args": { "External id": 251656, "cbid": 51, "correlation": 251656 } }, { "ph": "s", "id": 251656, "pid": 76337, "tid": -914061504, "ts": 1716454225391964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225448592, "dur": 264, "args": { "External id": 251657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251657, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251657, "pid": 5, "tid": 7, "ts": 1716454225448592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391972, "dur": 14, "args": { "External id": 251657, "cbid": 211, "correlation": 251657 } }, { "ph": "s", "id": 251657, "pid": 76337, "tid": -914061504, "ts": 1716454225391972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448857, "dur": 6, "args": { "External id": 251659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251659, "pid": 5, "tid": 7, "ts": 1716454225448857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225391989, "dur": 5, "args": { "External id": 251659, "cbid": 211, "correlation": 251659 } }, { "ph": "s", "id": 251659, "pid": 76337, "tid": -914061504, "ts": 1716454225391989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225448864, "dur": 6, "args": { "External id": 251665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251665, "pid": 5, "tid": 7, "ts": 1716454225448864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392019, "dur": 8, "args": { "External id": 251665, "cbid": 211, "correlation": 251665 } }, { "ph": "s", "id": 251665, "pid": 76337, "tid": -914061504, "ts": 1716454225392019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225392079, "dur": 0, "args": { "External id": 251675, "cbid": 317, "correlation": 251675 } }, { "ph": "f", "id": 251675, "pid": 76337, "tid": -914061504, "ts": 1716454225392079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225392079, "dur": 0, "args": { "External id": 251676, "cbid": 203, "correlation": 251676 } }, { "ph": "f", "id": 251676, "pid": 76337, "tid": -914061504, "ts": 1716454225392079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225392080, "dur": 0, "args": { "External id": 251677, "cbid": 205, "correlation": 251677 } }, { "ph": "f", "id": 251677, "pid": 76337, "tid": -914061504, "ts": 1716454225392080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448871, "dur": 8, "args": { "External id": 251681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251681, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251681, "pid": 5, "tid": 7, "ts": 1716454225448871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392092, "dur": 12, "args": { "External id": 251681, "cbid": 211, "correlation": 251681 } }, { "ph": "s", "id": 251681, "pid": 76337, "tid": -914061504, "ts": 1716454225392092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225448880, "dur": 3, "args": { "External id": 251683, "device": 5, "context": 1, "stream": 7, "correlation": 251683, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 251683, "pid": 5, "tid": 7, "ts": 1716454225448880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225392109, "dur": 9, "args": { "External id": 251683, "cbid": 51, "correlation": 251683 } }, { "ph": "s", "id": 251683, "pid": 76337, "tid": -914061504, "ts": 1716454225392109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225448884, "dur": 98, "args": { "External id": 251684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251684, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 251684, "pid": 5, "tid": 7, "ts": 1716454225448884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392119, "dur": 6, "args": { "External id": 251684, "cbid": 211, "correlation": 251684 } }, { "ph": "s", "id": 251684, "pid": 76337, "tid": -914061504, "ts": 1716454225392119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225448984, "dur": 6, "args": { "External id": 251686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251686, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251686, "pid": 5, "tid": 7, "ts": 1716454225448984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392129, "dur": 5, "args": { "External id": 251686, "cbid": 211, "correlation": 251686 } }, { "ph": "s", "id": 251686, "pid": 76337, "tid": -914061504, "ts": 1716454225392129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225448991, "dur": 6, "args": { "External id": 251692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251692, "pid": 5, "tid": 7, "ts": 1716454225448991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392156, "dur": 9, "args": { "External id": 251692, "cbid": 211, "correlation": 251692 } }, { "ph": "s", "id": 251692, "pid": 76337, "tid": -914061504, "ts": 1716454225392156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225448998, "dur": 5, "args": { "External id": 251700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251700, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251700, "pid": 5, "tid": 7, "ts": 1716454225448998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392186, "dur": 8, "args": { "External id": 251700, "cbid": 211, "correlation": 251700 } }, { "ph": "s", "id": 251700, "pid": 76337, "tid": -914061504, "ts": 1716454225392186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225449004, "dur": 5, "args": { "External id": 251708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251708, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251708, "pid": 5, "tid": 7, "ts": 1716454225449004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392215, "dur": 8, "args": { "External id": 251708, "cbid": 211, "correlation": 251708 } }, { "ph": "s", "id": 251708, "pid": 76337, "tid": -914061504, "ts": 1716454225392215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225449010, "dur": 11, "args": { "External id": 251717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251717, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251717, "pid": 5, "tid": 7, "ts": 1716454225449010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392259, "dur": 10, "args": { "External id": 251717, "cbid": 211, "correlation": 251717 } }, { "ph": "s", "id": 251717, "pid": 76337, "tid": -914061504, "ts": 1716454225392259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225449023, "dur": 12, "args": { "External id": 251737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251737, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251737, "pid": 5, "tid": 7, "ts": 1716454225449023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392332, "dur": 12, "args": { "External id": 251737, "cbid": 211, "correlation": 251737 } }, { "ph": "s", "id": 251737, "pid": 76337, "tid": -914061504, "ts": 1716454225392332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225449037, "dur": 4, "args": { "External id": 251749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251749, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251749, "pid": 5, "tid": 7, "ts": 1716454225449037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392354, "dur": 7, "args": { "External id": 251749, "cbid": 211, "correlation": 251749 } }, { "ph": "s", "id": 251749, "pid": 76337, "tid": -914061504, "ts": 1716454225392354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225449042, "dur": 11, "args": { "External id": 251752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251752, "pid": 5, "tid": 7, "ts": 1716454225449042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392372, "dur": 6, "args": { "External id": 251752, "cbid": 211, "correlation": 251752 } }, { "ph": "s", "id": 251752, "pid": 76337, "tid": -914061504, "ts": 1716454225392372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225449054, "dur": 7, "args": { "External id": 251761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251761, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251761, "pid": 5, "tid": 7, "ts": 1716454225449054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392411, "dur": 10, "args": { "External id": 251761, "cbid": 211, "correlation": 251761 } }, { "ph": "s", "id": 251761, "pid": 76337, "tid": -914061504, "ts": 1716454225392411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225392463, "dur": 0, "args": { "External id": 251771, "cbid": 317, "correlation": 251771 } }, { "ph": "f", "id": 251771, "pid": 76337, "tid": -914061504, "ts": 1716454225392463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225392464, "dur": 0, "args": { "External id": 251772, "cbid": 203, "correlation": 251772 } }, { "ph": "f", "id": 251772, "pid": 76337, "tid": -914061504, "ts": 1716454225392464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225392465, "dur": 0, "args": { "External id": 251773, "cbid": 205, "correlation": 251773 } }, { "ph": "f", "id": 251773, "pid": 76337, "tid": -914061504, "ts": 1716454225392465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225449062, "dur": 7, "args": { "External id": 251777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251777, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251777, "pid": 5, "tid": 7, "ts": 1716454225449062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392478, "dur": 11, "args": { "External id": 251777, "cbid": 211, "correlation": 251777 } }, { "ph": "s", "id": 251777, "pid": 76337, "tid": -914061504, "ts": 1716454225392478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225449070, "dur": 325, "args": { "External id": 251779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251779, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251779, "pid": 5, "tid": 7, "ts": 1716454225449070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392492, "dur": 5, "args": { "External id": 251779, "cbid": 211, "correlation": 251779 } }, { "ph": "s", "id": 251779, "pid": 76337, "tid": -914061504, "ts": 1716454225392492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225449398, "dur": 1, "args": { "External id": 251781, "device": 5, "context": 1, "stream": 7, "correlation": 251781, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 251781, "pid": 5, "tid": 7, "ts": 1716454225449398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225392503, "dur": 7, "args": { "External id": 251781, "cbid": 51, "correlation": 251781 } }, { "ph": "s", "id": 251781, "pid": 76337, "tid": -914061504, "ts": 1716454225392503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225449401, "dur": 505, "args": { "External id": 251782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251782, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251782, "pid": 5, "tid": 7, "ts": 1716454225449401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392511, "dur": 6, "args": { "External id": 251782, "cbid": 211, "correlation": 251782 } }, { "ph": "s", "id": 251782, "pid": 76337, "tid": -914061504, "ts": 1716454225392511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225449908, "dur": 6, "args": { "External id": 251784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251784, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251784, "pid": 5, "tid": 7, "ts": 1716454225449908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392520, "dur": 5, "args": { "External id": 251784, "cbid": 211, "correlation": 251784 } }, { "ph": "s", "id": 251784, "pid": 76337, "tid": -914061504, "ts": 1716454225392520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225449915, "dur": 6, "args": { "External id": 251790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251790, "pid": 5, "tid": 7, "ts": 1716454225449915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392548, "dur": 10, "args": { "External id": 251790, "cbid": 211, "correlation": 251790 } }, { "ph": "s", "id": 251790, "pid": 76337, "tid": -914061504, "ts": 1716454225392548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225449923, "dur": 3, "args": { "External id": 251798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251798, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 251798, "pid": 5, "tid": 7, "ts": 1716454225449923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392592, "dur": 9, "args": { "External id": 251798, "cbid": 211, "correlation": 251798 } }, { "ph": "s", "id": 251798, "pid": 76337, "tid": -914061504, "ts": 1716454225392592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225392655, "dur": 1, "args": { "External id": 251814, "cbid": 251, "correlation": 251814 } }, { "ph": "f", "id": 251814, "pid": 76337, "tid": -914061504, "ts": 1716454225392655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225392661, "dur": 0, "args": { "External id": 251816, "cbid": 251, "correlation": 251816 } }, { "ph": "f", "id": 251816, "pid": 76337, "tid": -914061504, "ts": 1716454225392661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225449927, "dur": 13, "args": { "External id": 251817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251817, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251817, "pid": 5, "tid": 7, "ts": 1716454225449927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392663, "dur": 11, "args": { "External id": 251817, "cbid": 211, "correlation": 251817 } }, { "ph": "s", "id": 251817, "pid": 76337, "tid": -914061504, "ts": 1716454225392663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225449942, "dur": 5, "args": { "External id": 251819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251819, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251819, "pid": 5, "tid": 7, "ts": 1716454225449942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392676, "dur": 5, "args": { "External id": 251819, "cbid": 211, "correlation": 251819 } }, { "ph": "s", "id": 251819, "pid": 76337, "tid": -914061504, "ts": 1716454225392676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225449948, "dur": 6, "args": { "External id": 251829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251829, "pid": 5, "tid": 7, "ts": 1716454225449948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392732, "dur": 12, "args": { "External id": 251829, "cbid": 211, "correlation": 251829 } }, { "ph": "s", "id": 251829, "pid": 76337, "tid": -914061504, "ts": 1716454225392732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225449955, "dur": 10, "args": { "External id": 251849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251849, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 251849, "pid": 5, "tid": 7, "ts": 1716454225449955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392798, "dur": 11, "args": { "External id": 251849, "cbid": 211, "correlation": 251849 } }, { "ph": "s", "id": 251849, "pid": 76337, "tid": -914061504, "ts": 1716454225392798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225449966, "dur": 3, "args": { "External id": 251861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251861, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 251861, "pid": 5, "tid": 7, "ts": 1716454225449966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392819, "dur": 6, "args": { "External id": 251861, "cbid": 211, "correlation": 251861 } }, { "ph": "s", "id": 251861, "pid": 76337, "tid": -914061504, "ts": 1716454225392819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225449971, "dur": 7, "args": { "External id": 251864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251864, "pid": 5, "tid": 7, "ts": 1716454225449971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392838, "dur": 6, "args": { "External id": 251864, "cbid": 211, "correlation": 251864 } }, { "ph": "s", "id": 251864, "pid": 76337, "tid": -914061504, "ts": 1716454225392838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225449979, "dur": 5, "args": { "External id": 251873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251873, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251873, "pid": 5, "tid": 7, "ts": 1716454225449979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392877, "dur": 10, "args": { "External id": 251873, "cbid": 211, "correlation": 251873 } }, { "ph": "s", "id": 251873, "pid": 76337, "tid": -914061504, "ts": 1716454225392877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225392940, "dur": 0, "args": { "External id": 251883, "cbid": 317, "correlation": 251883 } }, { "ph": "f", "id": 251883, "pid": 76337, "tid": -914061504, "ts": 1716454225392940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225392941, "dur": 0, "args": { "External id": 251884, "cbid": 203, "correlation": 251884 } }, { "ph": "f", "id": 251884, "pid": 76337, "tid": -914061504, "ts": 1716454225392941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225392941, "dur": 0, "args": { "External id": 251885, "cbid": 205, "correlation": 251885 } }, { "ph": "f", "id": 251885, "pid": 76337, "tid": -914061504, "ts": 1716454225392941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225449985, "dur": 5, "args": { "External id": 251889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251889, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251889, "pid": 5, "tid": 7, "ts": 1716454225449985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392955, "dur": 13, "args": { "External id": 251889, "cbid": 211, "correlation": 251889 } }, { "ph": "s", "id": 251889, "pid": 76337, "tid": -914061504, "ts": 1716454225392955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225449991, "dur": 165, "args": { "External id": 251891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251891, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251891, "pid": 5, "tid": 7, "ts": 1716454225449991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392970, "dur": 13, "args": { "External id": 251891, "cbid": 211, "correlation": 251891 } }, { "ph": "s", "id": 251891, "pid": 76337, "tid": -914061504, "ts": 1716454225392970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225450159, "dur": 1, "args": { "External id": 251893, "device": 5, "context": 1, "stream": 7, "correlation": 251893, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 251893, "pid": 5, "tid": 7, "ts": 1716454225450159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225392988, "dur": 6, "args": { "External id": 251893, "cbid": 51, "correlation": 251893 } }, { "ph": "s", "id": 251893, "pid": 76337, "tid": -914061504, "ts": 1716454225392988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225450162, "dur": 264, "args": { "External id": 251894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251894, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251894, "pid": 5, "tid": 7, "ts": 1716454225450162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225392995, "dur": 6, "args": { "External id": 251894, "cbid": 211, "correlation": 251894 } }, { "ph": "s", "id": 251894, "pid": 76337, "tid": -914061504, "ts": 1716454225392995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225450428, "dur": 6, "args": { "External id": 251896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251896, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251896, "pid": 5, "tid": 7, "ts": 1716454225450428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393007, "dur": 5, "args": { "External id": 251896, "cbid": 211, "correlation": 251896 } }, { "ph": "s", "id": 251896, "pid": 76337, "tid": -914061504, "ts": 1716454225393007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225450435, "dur": 6, "args": { "External id": 251902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251902, "pid": 5, "tid": 7, "ts": 1716454225450435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393036, "dur": 9, "args": { "External id": 251902, "cbid": 211, "correlation": 251902 } }, { "ph": "s", "id": 251902, "pid": 76337, "tid": -914061504, "ts": 1716454225393036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225393095, "dur": 0, "args": { "External id": 251912, "cbid": 317, "correlation": 251912 } }, { "ph": "f", "id": 251912, "pid": 76337, "tid": -914061504, "ts": 1716454225393095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225393096, "dur": 0, "args": { "External id": 251913, "cbid": 203, "correlation": 251913 } }, { "ph": "f", "id": 251913, "pid": 76337, "tid": -914061504, "ts": 1716454225393096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225393097, "dur": 0, "args": { "External id": 251914, "cbid": 205, "correlation": 251914 } }, { "ph": "f", "id": 251914, "pid": 76337, "tid": -914061504, "ts": 1716454225393097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225450442, "dur": 8, "args": { "External id": 251918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251918, "pid": 5, "tid": 7, "ts": 1716454225450442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393109, "dur": 12, "args": { "External id": 251918, "cbid": 211, "correlation": 251918 } }, { "ph": "s", "id": 251918, "pid": 76337, "tid": -914061504, "ts": 1716454225393109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225450451, "dur": 3, "args": { "External id": 251920, "device": 5, "context": 1, "stream": 7, "correlation": 251920, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 251920, "pid": 5, "tid": 7, "ts": 1716454225450451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225393126, "dur": 9, "args": { "External id": 251920, "cbid": 51, "correlation": 251920 } }, { "ph": "s", "id": 251920, "pid": 76337, "tid": -914061504, "ts": 1716454225393126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225450456, "dur": 97, "args": { "External id": 251921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251921, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 251921, "pid": 5, "tid": 7, "ts": 1716454225450456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393136, "dur": 6, "args": { "External id": 251921, "cbid": 211, "correlation": 251921 } }, { "ph": "s", "id": 251921, "pid": 76337, "tid": -914061504, "ts": 1716454225393136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225450554, "dur": 6, "args": { "External id": 251923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251923, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251923, "pid": 5, "tid": 7, "ts": 1716454225450554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393145, "dur": 5, "args": { "External id": 251923, "cbid": 211, "correlation": 251923 } }, { "ph": "s", "id": 251923, "pid": 76337, "tid": -914061504, "ts": 1716454225393145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225450561, "dur": 6, "args": { "External id": 251929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251929, "pid": 5, "tid": 7, "ts": 1716454225450561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393173, "dur": 8, "args": { "External id": 251929, "cbid": 211, "correlation": 251929 } }, { "ph": "s", "id": 251929, "pid": 76337, "tid": -914061504, "ts": 1716454225393173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225450569, "dur": 5, "args": { "External id": 251937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251937, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251937, "pid": 5, "tid": 7, "ts": 1716454225450569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393201, "dur": 7, "args": { "External id": 251937, "cbid": 211, "correlation": 251937 } }, { "ph": "s", "id": 251937, "pid": 76337, "tid": -914061504, "ts": 1716454225393201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225450575, "dur": 4, "args": { "External id": 251945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251945, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 251945, "pid": 5, "tid": 7, "ts": 1716454225450575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393231, "dur": 8, "args": { "External id": 251945, "cbid": 211, "correlation": 251945 } }, { "ph": "s", "id": 251945, "pid": 76337, "tid": -914061504, "ts": 1716454225393231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225450581, "dur": 14, "args": { "External id": 251956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251956, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251956, "pid": 5, "tid": 7, "ts": 1716454225450581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393313, "dur": 14, "args": { "External id": 251956, "cbid": 211, "correlation": 251956 } }, { "ph": "s", "id": 251956, "pid": 76337, "tid": -914061504, "ts": 1716454225393313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225393370, "dur": 0, "args": { "External id": 251966, "cbid": 317, "correlation": 251966 } }, { "ph": "f", "id": 251966, "pid": 76337, "tid": -914061504, "ts": 1716454225393370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225393371, "dur": 0, "args": { "External id": 251967, "cbid": 203, "correlation": 251967 } }, { "ph": "f", "id": 251967, "pid": 76337, "tid": -914061504, "ts": 1716454225393371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225393372, "dur": 0, "args": { "External id": 251968, "cbid": 205, "correlation": 251968 } }, { "ph": "f", "id": 251968, "pid": 76337, "tid": -914061504, "ts": 1716454225393372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225450596, "dur": 9, "args": { "External id": 251972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251972, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251972, "pid": 5, "tid": 7, "ts": 1716454225450596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393385, "dur": 11, "args": { "External id": 251972, "cbid": 211, "correlation": 251972 } }, { "ph": "s", "id": 251972, "pid": 76337, "tid": -914061504, "ts": 1716454225393385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225450607, "dur": 165, "args": { "External id": 251974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251974, "pid": 5, "tid": 7, "ts": 1716454225450607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393399, "dur": 5, "args": { "External id": 251974, "cbid": 211, "correlation": 251974 } }, { "ph": "s", "id": 251974, "pid": 76337, "tid": -914061504, "ts": 1716454225393399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225450774, "dur": 1, "args": { "External id": 251976, "device": 5, "context": 1, "stream": 7, "correlation": 251976, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 251976, "pid": 5, "tid": 7, "ts": 1716454225450774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225393410, "dur": 7, "args": { "External id": 251976, "cbid": 51, "correlation": 251976 } }, { "ph": "s", "id": 251976, "pid": 76337, "tid": -914061504, "ts": 1716454225393410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225450777, "dur": 660, "args": { "External id": 251977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251977, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 251977, "pid": 5, "tid": 7, "ts": 1716454225450777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393418, "dur": 6, "args": { "External id": 251977, "cbid": 211, "correlation": 251977 } }, { "ph": "s", "id": 251977, "pid": 76337, "tid": -914061504, "ts": 1716454225393418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225451439, "dur": 12, "args": { "External id": 251979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251979, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251979, "pid": 5, "tid": 7, "ts": 1716454225451439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393427, "dur": 5, "args": { "External id": 251979, "cbid": 211, "correlation": 251979 } }, { "ph": "s", "id": 251979, "pid": 76337, "tid": -914061504, "ts": 1716454225393427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225451452, "dur": 15, "args": { "External id": 251985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251985, "pid": 5, "tid": 7, "ts": 1716454225451452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393455, "dur": 9, "args": { "External id": 251985, "cbid": 211, "correlation": 251985 } }, { "ph": "s", "id": 251985, "pid": 76337, "tid": -914061504, "ts": 1716454225393455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225451468, "dur": 31, "args": { "External id": 251994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 251994, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 251994, "pid": 5, "tid": 7, "ts": 1716454225451468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393546, "dur": 12, "args": { "External id": 251994, "cbid": 211, "correlation": 251994 } }, { "ph": "s", "id": 251994, "pid": 76337, "tid": -914061504, "ts": 1716454225393546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225451501, "dur": 31, "args": { "External id": 252014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252014, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252014, "pid": 5, "tid": 7, "ts": 1716454225451501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393612, "dur": 12, "args": { "External id": 252014, "cbid": 211, "correlation": 252014 } }, { "ph": "s", "id": 252014, "pid": 76337, "tid": -914061504, "ts": 1716454225393612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225451533, "dur": 4, "args": { "External id": 252026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252026, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252026, "pid": 5, "tid": 7, "ts": 1716454225451533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393634, "dur": 6, "args": { "External id": 252026, "cbid": 211, "correlation": 252026 } }, { "ph": "s", "id": 252026, "pid": 76337, "tid": -914061504, "ts": 1716454225393634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225451539, "dur": 29, "args": { "External id": 252029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252029, "pid": 5, "tid": 7, "ts": 1716454225451539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393652, "dur": 7, "args": { "External id": 252029, "cbid": 211, "correlation": 252029 } }, { "ph": "s", "id": 252029, "pid": 76337, "tid": -914061504, "ts": 1716454225393652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225451570, "dur": 21, "args": { "External id": 252038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252038, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252038, "pid": 5, "tid": 7, "ts": 1716454225451570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393690, "dur": 9, "args": { "External id": 252038, "cbid": 211, "correlation": 252038 } }, { "ph": "s", "id": 252038, "pid": 76337, "tid": -914061504, "ts": 1716454225393690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225393743, "dur": 0, "args": { "External id": 252048, "cbid": 317, "correlation": 252048 } }, { "ph": "f", "id": 252048, "pid": 76337, "tid": -914061504, "ts": 1716454225393743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225393744, "dur": 0, "args": { "External id": 252049, "cbid": 203, "correlation": 252049 } }, { "ph": "f", "id": 252049, "pid": 76337, "tid": -914061504, "ts": 1716454225393744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225393745, "dur": 0, "args": { "External id": 252050, "cbid": 205, "correlation": 252050 } }, { "ph": "f", "id": 252050, "pid": 76337, "tid": -914061504, "ts": 1716454225393745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225451592, "dur": 22, "args": { "External id": 252054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252054, "pid": 5, "tid": 7, "ts": 1716454225451592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393760, "dur": 12, "args": { "External id": 252054, "cbid": 211, "correlation": 252054 } }, { "ph": "s", "id": 252054, "pid": 76337, "tid": -914061504, "ts": 1716454225393760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225451615, "dur": 327, "args": { "External id": 252056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252056, "pid": 5, "tid": 7, "ts": 1716454225451615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393775, "dur": 5, "args": { "External id": 252056, "cbid": 211, "correlation": 252056 } }, { "ph": "s", "id": 252056, "pid": 76337, "tid": -914061504, "ts": 1716454225393775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225451945, "dur": 1, "args": { "External id": 252058, "device": 5, "context": 1, "stream": 7, "correlation": 252058, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 252058, "pid": 5, "tid": 7, "ts": 1716454225451945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225393786, "dur": 6, "args": { "External id": 252058, "cbid": 51, "correlation": 252058 } }, { "ph": "s", "id": 252058, "pid": 76337, "tid": -914061504, "ts": 1716454225393786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225451948, "dur": 1261, "args": { "External id": 252059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252059, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252059, "pid": 5, "tid": 7, "ts": 1716454225451948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393793, "dur": 6, "args": { "External id": 252059, "cbid": 211, "correlation": 252059 } }, { "ph": "s", "id": 252059, "pid": 76337, "tid": -914061504, "ts": 1716454225393793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225453211, "dur": 14, "args": { "External id": 252061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252061, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252061, "pid": 5, "tid": 7, "ts": 1716454225453211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393804, "dur": 5, "args": { "External id": 252061, "cbid": 211, "correlation": 252061 } }, { "ph": "s", "id": 252061, "pid": 76337, "tid": -914061504, "ts": 1716454225393804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225453226, "dur": 15, "args": { "External id": 252067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252067, "pid": 5, "tid": 7, "ts": 1716454225453226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393832, "dur": 8, "args": { "External id": 252067, "cbid": 211, "correlation": 252067 } }, { "ph": "s", "id": 252067, "pid": 76337, "tid": -914061504, "ts": 1716454225393832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225453242, "dur": 3, "args": { "External id": 252075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252075, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 252075, "pid": 5, "tid": 7, "ts": 1716454225453242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393876, "dur": 9, "args": { "External id": 252075, "cbid": 211, "correlation": 252075 } }, { "ph": "s", "id": 252075, "pid": 76337, "tid": -914061504, "ts": 1716454225393876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225393940, "dur": 1, "args": { "External id": 252091, "cbid": 251, "correlation": 252091 } }, { "ph": "f", "id": 252091, "pid": 76337, "tid": -914061504, "ts": 1716454225393940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225393945, "dur": 0, "args": { "External id": 252093, "cbid": 251, "correlation": 252093 } }, { "ph": "f", "id": 252093, "pid": 76337, "tid": -914061504, "ts": 1716454225393945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225453247, "dur": 13, "args": { "External id": 252094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252094, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252094, "pid": 5, "tid": 7, "ts": 1716454225453247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393948, "dur": 11, "args": { "External id": 252094, "cbid": 211, "correlation": 252094 } }, { "ph": "s", "id": 252094, "pid": 76337, "tid": -914061504, "ts": 1716454225393948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225453261, "dur": 5, "args": { "External id": 252096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252096, "pid": 5, "tid": 7, "ts": 1716454225453261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225393960, "dur": 5, "args": { "External id": 252096, "cbid": 211, "correlation": 252096 } }, { "ph": "s", "id": 252096, "pid": 76337, "tid": -914061504, "ts": 1716454225393960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225453267, "dur": 17, "args": { "External id": 252106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252106, "pid": 5, "tid": 7, "ts": 1716454225453267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394027, "dur": 12, "args": { "External id": 252106, "cbid": 211, "correlation": 252106 } }, { "ph": "s", "id": 252106, "pid": 76337, "tid": -914061504, "ts": 1716454225394027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225453286, "dur": 18, "args": { "External id": 252126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252126, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252126, "pid": 5, "tid": 7, "ts": 1716454225453286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394092, "dur": 11, "args": { "External id": 252126, "cbid": 211, "correlation": 252126 } }, { "ph": "s", "id": 252126, "pid": 76337, "tid": -914061504, "ts": 1716454225394092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225453305, "dur": 4, "args": { "External id": 252138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252138, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 252138, "pid": 5, "tid": 7, "ts": 1716454225453305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394113, "dur": 6, "args": { "External id": 252138, "cbid": 211, "correlation": 252138 } }, { "ph": "s", "id": 252138, "pid": 76337, "tid": -914061504, "ts": 1716454225394113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225453310, "dur": 17, "args": { "External id": 252141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252141, "pid": 5, "tid": 7, "ts": 1716454225453310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394132, "dur": 6, "args": { "External id": 252141, "cbid": 211, "correlation": 252141 } }, { "ph": "s", "id": 252141, "pid": 76337, "tid": -914061504, "ts": 1716454225394132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225453328, "dur": 11, "args": { "External id": 252150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252150, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252150, "pid": 5, "tid": 7, "ts": 1716454225453328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394173, "dur": 10, "args": { "External id": 252150, "cbid": 211, "correlation": 252150 } }, { "ph": "s", "id": 252150, "pid": 76337, "tid": -914061504, "ts": 1716454225394173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225394235, "dur": 0, "args": { "External id": 252160, "cbid": 317, "correlation": 252160 } }, { "ph": "f", "id": 252160, "pid": 76337, "tid": -914061504, "ts": 1716454225394235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225394236, "dur": 0, "args": { "External id": 252161, "cbid": 203, "correlation": 252161 } }, { "ph": "f", "id": 252161, "pid": 76337, "tid": -914061504, "ts": 1716454225394236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225394236, "dur": 0, "args": { "External id": 252162, "cbid": 205, "correlation": 252162 } }, { "ph": "f", "id": 252162, "pid": 76337, "tid": -914061504, "ts": 1716454225394236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225453340, "dur": 11, "args": { "External id": 252166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252166, "pid": 5, "tid": 7, "ts": 1716454225453340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394250, "dur": 12, "args": { "External id": 252166, "cbid": 211, "correlation": 252166 } }, { "ph": "s", "id": 252166, "pid": 76337, "tid": -914061504, "ts": 1716454225394250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225453353, "dur": 165, "args": { "External id": 252168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252168, "pid": 5, "tid": 7, "ts": 1716454225453353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394264, "dur": 6, "args": { "External id": 252168, "cbid": 211, "correlation": 252168 } }, { "ph": "s", "id": 252168, "pid": 76337, "tid": -914061504, "ts": 1716454225394264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225453521, "dur": 1, "args": { "External id": 252170, "device": 5, "context": 1, "stream": 7, "correlation": 252170, "bytes": 960, "memory bandwidth (GB/s)": 0.5357142857142857 } }, { "ph": "f", "id": 252170, "pid": 5, "tid": 7, "ts": 1716454225453521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225394276, "dur": 6, "args": { "External id": 252170, "cbid": 51, "correlation": 252170 } }, { "ph": "s", "id": 252170, "pid": 76337, "tid": -914061504, "ts": 1716454225394276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225453525, "dur": 660, "args": { "External id": 252171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252171, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252171, "pid": 5, "tid": 7, "ts": 1716454225453525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394283, "dur": 6, "args": { "External id": 252171, "cbid": 211, "correlation": 252171 } }, { "ph": "s", "id": 252171, "pid": 76337, "tid": -914061504, "ts": 1716454225394283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225454186, "dur": 12, "args": { "External id": 252173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252173, "pid": 5, "tid": 7, "ts": 1716454225454186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394293, "dur": 5, "args": { "External id": 252173, "cbid": 211, "correlation": 252173 } }, { "ph": "s", "id": 252173, "pid": 76337, "tid": -914061504, "ts": 1716454225394293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225454199, "dur": 15, "args": { "External id": 252179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252179, "pid": 5, "tid": 7, "ts": 1716454225454199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394322, "dur": 8, "args": { "External id": 252179, "cbid": 211, "correlation": 252179 } }, { "ph": "s", "id": 252179, "pid": 76337, "tid": -914061504, "ts": 1716454225394322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225394380, "dur": 0, "args": { "External id": 252189, "cbid": 317, "correlation": 252189 } }, { "ph": "f", "id": 252189, "pid": 76337, "tid": -914061504, "ts": 1716454225394380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225394381, "dur": 0, "args": { "External id": 252190, "cbid": 203, "correlation": 252190 } }, { "ph": "f", "id": 252190, "pid": 76337, "tid": -914061504, "ts": 1716454225394381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225394382, "dur": 0, "args": { "External id": 252191, "cbid": 205, "correlation": 252191 } }, { "ph": "f", "id": 252191, "pid": 76337, "tid": -914061504, "ts": 1716454225394382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225454215, "dur": 21, "args": { "External id": 252195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252195, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252195, "pid": 5, "tid": 7, "ts": 1716454225454215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394395, "dur": 11, "args": { "External id": 252195, "cbid": 211, "correlation": 252195 } }, { "ph": "s", "id": 252195, "pid": 76337, "tid": -914061504, "ts": 1716454225394395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225454237, "dur": 4, "args": { "External id": 252197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252197, "pid": 5, "tid": 7, "ts": 1716454225454237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394411, "dur": 7, "args": { "External id": 252197, "cbid": 211, "correlation": 252197 } }, { "ph": "s", "id": 252197, "pid": 76337, "tid": -914061504, "ts": 1716454225394411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225394422, "dur": 0, "args": { "External id": 252198, "cbid": 51, "correlation": 252198 } }, { "ph": "s", "id": 252198, "pid": 76337, "tid": -914061504, "ts": 1716454225394422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225454243, "dur": 180, "args": { "External id": 252199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252199, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 252199, "pid": 5, "tid": 7, "ts": 1716454225454243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394423, "dur": 5, "args": { "External id": 252199, "cbid": 211, "correlation": 252199 } }, { "ph": "s", "id": 252199, "pid": 76337, "tid": -914061504, "ts": 1716454225394423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225454425, "dur": 16, "args": { "External id": 252204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252204, "pid": 5, "tid": 7, "ts": 1716454225454425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394449, "dur": 8, "args": { "External id": 252204, "cbid": 211, "correlation": 252204 } }, { "ph": "s", "id": 252204, "pid": 76337, "tid": -914061504, "ts": 1716454225394449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225454442, "dur": 12, "args": { "External id": 252212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252212, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252212, "pid": 5, "tid": 7, "ts": 1716454225454442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394479, "dur": 8, "args": { "External id": 252212, "cbid": 211, "correlation": 252212 } }, { "ph": "s", "id": 252212, "pid": 76337, "tid": -914061504, "ts": 1716454225394479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225454455, "dur": 10, "args": { "External id": 252220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252220, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252220, "pid": 5, "tid": 7, "ts": 1716454225454455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394507, "dur": 9, "args": { "External id": 252220, "cbid": 211, "correlation": 252220 } }, { "ph": "s", "id": 252220, "pid": 76337, "tid": -914061504, "ts": 1716454225394507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225454466, "dur": 19, "args": { "External id": 252240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252240, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252240, "pid": 5, "tid": 7, "ts": 1716454225454466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394589, "dur": 13, "args": { "External id": 252240, "cbid": 211, "correlation": 252240 } }, { "ph": "s", "id": 252240, "pid": 76337, "tid": -914061504, "ts": 1716454225394589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225454487, "dur": 5, "args": { "External id": 252252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252252, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 252252, "pid": 5, "tid": 7, "ts": 1716454225454487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394612, "dur": 6, "args": { "External id": 252252, "cbid": 211, "correlation": 252252 } }, { "ph": "s", "id": 252252, "pid": 76337, "tid": -914061504, "ts": 1716454225394612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225454493, "dur": 17, "args": { "External id": 252255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252255, "pid": 5, "tid": 7, "ts": 1716454225454493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394629, "dur": 6, "args": { "External id": 252255, "cbid": 211, "correlation": 252255 } }, { "ph": "s", "id": 252255, "pid": 76337, "tid": -914061504, "ts": 1716454225394629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225394686, "dur": 0, "args": { "External id": 252266, "cbid": 317, "correlation": 252266 } }, { "ph": "f", "id": 252266, "pid": 76337, "tid": -914061504, "ts": 1716454225394686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225394687, "dur": 0, "args": { "External id": 252267, "cbid": 203, "correlation": 252267 } }, { "ph": "f", "id": 252267, "pid": 76337, "tid": -914061504, "ts": 1716454225394687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225394688, "dur": 0, "args": { "External id": 252268, "cbid": 205, "correlation": 252268 } }, { "ph": "f", "id": 252268, "pid": 76337, "tid": -914061504, "ts": 1716454225394688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225454511, "dur": 11, "args": { "External id": 252272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252272, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252272, "pid": 5, "tid": 7, "ts": 1716454225454511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394701, "dur": 11, "args": { "External id": 252272, "cbid": 211, "correlation": 252272 } }, { "ph": "s", "id": 252272, "pid": 76337, "tid": -914061504, "ts": 1716454225394701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225454524, "dur": 4, "args": { "External id": 252274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252274, "pid": 5, "tid": 7, "ts": 1716454225454524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394717, "dur": 7, "args": { "External id": 252274, "cbid": 211, "correlation": 252274 } }, { "ph": "s", "id": 252274, "pid": 76337, "tid": -914061504, "ts": 1716454225394717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225394726, "dur": 0, "args": { "External id": 252275, "cbid": 51, "correlation": 252275 } }, { "ph": "s", "id": 252275, "pid": 76337, "tid": -914061504, "ts": 1716454225394726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225454529, "dur": 93, "args": { "External id": 252276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252276, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 252276, "pid": 5, "tid": 7, "ts": 1716454225454529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394727, "dur": 5, "args": { "External id": 252276, "cbid": 211, "correlation": 252276 } }, { "ph": "s", "id": 252276, "pid": 76337, "tid": -914061504, "ts": 1716454225394727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225454623, "dur": 16, "args": { "External id": 252281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252281, "pid": 5, "tid": 7, "ts": 1716454225454623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394753, "dur": 9, "args": { "External id": 252281, "cbid": 211, "correlation": 252281 } }, { "ph": "s", "id": 252281, "pid": 76337, "tid": -914061504, "ts": 1716454225394753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225454641, "dur": 85, "args": { "External id": 252290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252290, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252290, "pid": 5, "tid": 7, "ts": 1716454225454641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394836, "dur": 14, "args": { "External id": 252290, "cbid": 211, "correlation": 252290 } }, { "ph": "s", "id": 252290, "pid": 76337, "tid": -914061504, "ts": 1716454225394836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225454727, "dur": 30, "args": { "External id": 252312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252312, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252312, "pid": 5, "tid": 7, "ts": 1716454225454727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225394895, "dur": 10, "args": { "External id": 252312, "cbid": 211, "correlation": 252312 } }, { "ph": "s", "id": 252312, "pid": 76337, "tid": -914061504, "ts": 1716454225394895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395000, "dur": 2, "args": { "External id": 252323, "cbid": 251, "correlation": 252323 } }, { "ph": "f", "id": 252323, "pid": 76337, "tid": -914061504, "ts": 1716454225395000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225454759, "dur": 169, "args": { "External id": 252324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252324, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252324, "pid": 5, "tid": 7, "ts": 1716454225454759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395006, "dur": 14, "args": { "External id": 252324, "cbid": 211, "correlation": 252324 } }, { "ph": "s", "id": 252324, "pid": 76337, "tid": -914061504, "ts": 1716454225395006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395080, "dur": 1, "args": { "External id": 252335, "cbid": 251, "correlation": 252335 } }, { "ph": "f", "id": 252335, "pid": 76337, "tid": -914061504, "ts": 1716454225395080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225454929, "dur": 160, "args": { "External id": 252336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252336, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252336, "pid": 5, "tid": 7, "ts": 1716454225454929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395085, "dur": 11, "args": { "External id": 252336, "cbid": 211, "correlation": 252336 } }, { "ph": "s", "id": 252336, "pid": 76337, "tid": -914061504, "ts": 1716454225395085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395150, "dur": 1, "args": { "External id": 252347, "cbid": 251, "correlation": 252347 } }, { "ph": "f", "id": 252347, "pid": 76337, "tid": -914061504, "ts": 1716454225395150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225455090, "dur": 158, "args": { "External id": 252348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252348, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252348, "pid": 5, "tid": 7, "ts": 1716454225455090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395154, "dur": 11, "args": { "External id": 252348, "cbid": 211, "correlation": 252348 } }, { "ph": "s", "id": 252348, "pid": 76337, "tid": -914061504, "ts": 1716454225395154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225455249, "dur": 347, "args": { "External id": 252373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252373, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252373, "pid": 5, "tid": 7, "ts": 1716454225455249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395242, "dur": 13, "args": { "External id": 252373, "cbid": 211, "correlation": 252373 } }, { "ph": "s", "id": 252373, "pid": 76337, "tid": -914061504, "ts": 1716454225395242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395342, "dur": 1, "args": { "External id": 252391, "cbid": 251, "correlation": 252391 } }, { "ph": "f", "id": 252391, "pid": 76337, "tid": -914061504, "ts": 1716454225395342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225455598, "dur": 170, "args": { "External id": 252393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252393, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252393, "pid": 5, "tid": 7, "ts": 1716454225455598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395348, "dur": 13, "args": { "External id": 252393, "cbid": 211, "correlation": 252393 } }, { "ph": "s", "id": 252393, "pid": 76337, "tid": -914061504, "ts": 1716454225395348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225455769, "dur": 20, "args": { "External id": 252401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252401, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252401, "pid": 5, "tid": 7, "ts": 1716454225455769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395418, "dur": 12, "args": { "External id": 252401, "cbid": 211, "correlation": 252401 } }, { "ph": "s", "id": 252401, "pid": 76337, "tid": -914061504, "ts": 1716454225395418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225455790, "dur": 28, "args": { "External id": 252409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252409, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252409, "pid": 5, "tid": 7, "ts": 1716454225455790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395457, "dur": 9, "args": { "External id": 252409, "cbid": 211, "correlation": 252409 } }, { "ph": "s", "id": 252409, "pid": 76337, "tid": -914061504, "ts": 1716454225395457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225455819, "dur": 19, "args": { "External id": 252420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252420, "pid": 5, "tid": 7, "ts": 1716454225455819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395530, "dur": 12, "args": { "External id": 252420, "cbid": 211, "correlation": 252420 } }, { "ph": "s", "id": 252420, "pid": 76337, "tid": -914061504, "ts": 1716454225395530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225455839, "dur": 16, "args": { "External id": 252442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252442, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252442, "pid": 5, "tid": 7, "ts": 1716454225455839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395560, "dur": 9, "args": { "External id": 252442, "cbid": 211, "correlation": 252442 } }, { "ph": "s", "id": 252442, "pid": 76337, "tid": -914061504, "ts": 1716454225395560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395646, "dur": 1, "args": { "External id": 252453, "cbid": 251, "correlation": 252453 } }, { "ph": "f", "id": 252453, "pid": 76337, "tid": -914061504, "ts": 1716454225395646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225455857, "dur": 91, "args": { "External id": 252454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252454, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252454, "pid": 5, "tid": 7, "ts": 1716454225455857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395652, "dur": 13, "args": { "External id": 252454, "cbid": 211, "correlation": 252454 } }, { "ph": "s", "id": 252454, "pid": 76337, "tid": -914061504, "ts": 1716454225395652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395722, "dur": 1, "args": { "External id": 252465, "cbid": 251, "correlation": 252465 } }, { "ph": "f", "id": 252465, "pid": 76337, "tid": -914061504, "ts": 1716454225395722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395726, "dur": 0, "args": { "External id": 252466, "cbid": 251, "correlation": 252466 } }, { "ph": "f", "id": 252466, "pid": 76337, "tid": -914061504, "ts": 1716454225395726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225455949, "dur": 12, "args": { "External id": 252467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252467, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252467, "pid": 5, "tid": 7, "ts": 1716454225455949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395727, "dur": 12, "args": { "External id": 252467, "cbid": 211, "correlation": 252467 } }, { "ph": "s", "id": 252467, "pid": 76337, "tid": -914061504, "ts": 1716454225395727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225455963, "dur": 6, "args": { "External id": 252469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252469, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252469, "pid": 5, "tid": 7, "ts": 1716454225455963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395741, "dur": 6, "args": { "External id": 252469, "cbid": 211, "correlation": 252469 } }, { "ph": "s", "id": 252469, "pid": 76337, "tid": -914061504, "ts": 1716454225395741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395799, "dur": 1, "args": { "External id": 252480, "cbid": 251, "correlation": 252480 } }, { "ph": "f", "id": 252480, "pid": 76337, "tid": -914061504, "ts": 1716454225395799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395802, "dur": 0, "args": { "External id": 252481, "cbid": 251, "correlation": 252481 } }, { "ph": "f", "id": 252481, "pid": 76337, "tid": -914061504, "ts": 1716454225395802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225455970, "dur": 9, "args": { "External id": 252482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252482, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252482, "pid": 5, "tid": 7, "ts": 1716454225455970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395803, "dur": 12, "args": { "External id": 252482, "cbid": 211, "correlation": 252482 } }, { "ph": "s", "id": 252482, "pid": 76337, "tid": -914061504, "ts": 1716454225395803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225455980, "dur": 4, "args": { "External id": 252484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252484, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252484, "pid": 5, "tid": 7, "ts": 1716454225455980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395817, "dur": 5, "args": { "External id": 252484, "cbid": 211, "correlation": 252484 } }, { "ph": "s", "id": 252484, "pid": 76337, "tid": -914061504, "ts": 1716454225395817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225455984, "dur": 56, "args": { "External id": 252509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252509, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252509, "pid": 5, "tid": 7, "ts": 1716454225455984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225395894, "dur": 12, "args": { "External id": 252509, "cbid": 211, "correlation": 252509 } }, { "ph": "s", "id": 252509, "pid": 76337, "tid": -914061504, "ts": 1716454225395894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225395999, "dur": 2, "args": { "External id": 252527, "cbid": 251, "correlation": 252527 } }, { "ph": "f", "id": 252527, "pid": 76337, "tid": -914061504, "ts": 1716454225395999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225456042, "dur": 92, "args": { "External id": 252529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252529, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252529, "pid": 5, "tid": 7, "ts": 1716454225456042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396005, "dur": 14, "args": { "External id": 252529, "cbid": 211, "correlation": 252529 } }, { "ph": "s", "id": 252529, "pid": 76337, "tid": -914061504, "ts": 1716454225396005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225456136, "dur": 10, "args": { "External id": 252537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252537, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252537, "pid": 5, "tid": 7, "ts": 1716454225456136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396076, "dur": 12, "args": { "External id": 252537, "cbid": 211, "correlation": 252537 } }, { "ph": "s", "id": 252537, "pid": 76337, "tid": -914061504, "ts": 1716454225396076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225456147, "dur": 21, "args": { "External id": 252545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252545, "pid": 5, "tid": 7, "ts": 1716454225456147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396118, "dur": 9, "args": { "External id": 252545, "cbid": 211, "correlation": 252545 } }, { "ph": "s", "id": 252545, "pid": 76337, "tid": -914061504, "ts": 1716454225396118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225456169, "dur": 17, "args": { "External id": 252567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252567, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252567, "pid": 5, "tid": 7, "ts": 1716454225456169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396172, "dur": 10, "args": { "External id": 252567, "cbid": 211, "correlation": 252567 } }, { "ph": "s", "id": 252567, "pid": 76337, "tid": -914061504, "ts": 1716454225396172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225396261, "dur": 1, "args": { "External id": 252583, "cbid": 251, "correlation": 252583 } }, { "ph": "f", "id": 252583, "pid": 76337, "tid": -914061504, "ts": 1716454225396261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225396266, "dur": 0, "args": { "External id": 252585, "cbid": 251, "correlation": 252585 } }, { "ph": "f", "id": 252585, "pid": 76337, "tid": -914061504, "ts": 1716454225396266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225456188, "dur": 504, "args": { "External id": 252586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252586, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252586, "pid": 5, "tid": 7, "ts": 1716454225456188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396268, "dur": 13, "args": { "External id": 252586, "cbid": 211, "correlation": 252586 } }, { "ph": "s", "id": 252586, "pid": 76337, "tid": -914061504, "ts": 1716454225396268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225456694, "dur": 66, "args": { "External id": 252594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252594, "pid": 5, "tid": 7, "ts": 1716454225456694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396336, "dur": 13, "args": { "External id": 252594, "cbid": 211, "correlation": 252594 } }, { "ph": "s", "id": 252594, "pid": 76337, "tid": -914061504, "ts": 1716454225396336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225456761, "dur": 67, "args": { "External id": 252602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252602, "pid": 5, "tid": 7, "ts": 1716454225456761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396369, "dur": 8, "args": { "External id": 252602, "cbid": 211, "correlation": 252602 } }, { "ph": "s", "id": 252602, "pid": 76337, "tid": -914061504, "ts": 1716454225396369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225396447, "dur": 1, "args": { "External id": 252618, "cbid": 251, "correlation": 252618 } }, { "ph": "f", "id": 252618, "pid": 76337, "tid": -914061504, "ts": 1716454225396447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225456831, "dur": 1, "args": { "External id": 252620, "device": 5, "context": 1, "stream": 7, "correlation": 252620, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 252620, "pid": 5, "tid": 7, "ts": 1716454225456831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225396452, "dur": 11, "args": { "External id": 252620, "cbid": 51, "correlation": 252620 } }, { "ph": "s", "id": 252620, "pid": 76337, "tid": -914061504, "ts": 1716454225396452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225456834, "dur": 274, "args": { "External id": 252621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252621, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252621, "pid": 5, "tid": 7, "ts": 1716454225456834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396465, "dur": 11, "args": { "External id": 252621, "cbid": 211, "correlation": 252621 } }, { "ph": "s", "id": 252621, "pid": 76337, "tid": -914061504, "ts": 1716454225396465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225457110, "dur": 14, "args": { "External id": 252629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252629, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252629, "pid": 5, "tid": 7, "ts": 1716454225457110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396507, "dur": 11, "args": { "External id": 252629, "cbid": 211, "correlation": 252629 } }, { "ph": "s", "id": 252629, "pid": 76337, "tid": -914061504, "ts": 1716454225396507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225457125, "dur": 39, "args": { "External id": 252640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252640, "pid": 5, "tid": 7, "ts": 1716454225457125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396577, "dur": 12, "args": { "External id": 252640, "cbid": 211, "correlation": 252640 } }, { "ph": "s", "id": 252640, "pid": 76337, "tid": -914061504, "ts": 1716454225396577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225396641, "dur": 0, "args": { "External id": 252652, "cbid": 317, "correlation": 252652 } }, { "ph": "f", "id": 252652, "pid": 76337, "tid": -914061504, "ts": 1716454225396641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225396642, "dur": 0, "args": { "External id": 252653, "cbid": 203, "correlation": 252653 } }, { "ph": "f", "id": 252653, "pid": 76337, "tid": -914061504, "ts": 1716454225396642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225396642, "dur": 0, "args": { "External id": 252654, "cbid": 205, "correlation": 252654 } }, { "ph": "f", "id": 252654, "pid": 76337, "tid": -914061504, "ts": 1716454225396642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225457165, "dur": 13, "args": { "External id": 252658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252658, "pid": 5, "tid": 7, "ts": 1716454225457165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396657, "dur": 13, "args": { "External id": 252658, "cbid": 211, "correlation": 252658 } }, { "ph": "s", "id": 252658, "pid": 76337, "tid": -914061504, "ts": 1716454225396657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225457179, "dur": 4, "args": { "External id": 252660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252660, "pid": 5, "tid": 7, "ts": 1716454225457179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396675, "dur": 6, "args": { "External id": 252660, "cbid": 211, "correlation": 252660 } }, { "ph": "s", "id": 252660, "pid": 76337, "tid": -914061504, "ts": 1716454225396675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225396684, "dur": 0, "args": { "External id": 252661, "cbid": 51, "correlation": 252661 } }, { "ph": "s", "id": 252661, "pid": 76337, "tid": -914061504, "ts": 1716454225396684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225457184, "dur": 98, "args": { "External id": 252662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252662, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 252662, "pid": 5, "tid": 7, "ts": 1716454225457184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396685, "dur": 5, "args": { "External id": 252662, "cbid": 211, "correlation": 252662 } }, { "ph": "s", "id": 252662, "pid": 76337, "tid": -914061504, "ts": 1716454225396685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225457284, "dur": 17, "args": { "External id": 252667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252667, "pid": 5, "tid": 7, "ts": 1716454225457284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396712, "dur": 9, "args": { "External id": 252667, "cbid": 211, "correlation": 252667 } }, { "ph": "s", "id": 252667, "pid": 76337, "tid": -914061504, "ts": 1716454225396712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225457302, "dur": 13, "args": { "External id": 252675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252675, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252675, "pid": 5, "tid": 7, "ts": 1716454225457302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396745, "dur": 8, "args": { "External id": 252675, "cbid": 211, "correlation": 252675 } }, { "ph": "s", "id": 252675, "pid": 76337, "tid": -914061504, "ts": 1716454225396745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225457316, "dur": 31, "args": { "External id": 252684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252684, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252684, "pid": 5, "tid": 7, "ts": 1716454225457316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396783, "dur": 10, "args": { "External id": 252684, "cbid": 211, "correlation": 252684 } }, { "ph": "s", "id": 252684, "pid": 76337, "tid": -914061504, "ts": 1716454225396783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225457348, "dur": 32, "args": { "External id": 252704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252704, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252704, "pid": 5, "tid": 7, "ts": 1716454225457348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396855, "dur": 13, "args": { "External id": 252704, "cbid": 211, "correlation": 252704 } }, { "ph": "s", "id": 252704, "pid": 76337, "tid": -914061504, "ts": 1716454225396855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225457381, "dur": 5, "args": { "External id": 252716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252716, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252716, "pid": 5, "tid": 7, "ts": 1716454225457381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396877, "dur": 6, "args": { "External id": 252716, "cbid": 211, "correlation": 252716 } }, { "ph": "s", "id": 252716, "pid": 76337, "tid": -914061504, "ts": 1716454225396877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225457387, "dur": 32, "args": { "External id": 252719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252719, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252719, "pid": 5, "tid": 7, "ts": 1716454225457387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396895, "dur": 6, "args": { "External id": 252719, "cbid": 211, "correlation": 252719 } }, { "ph": "s", "id": 252719, "pid": 76337, "tid": -914061504, "ts": 1716454225396895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225457420, "dur": 22, "args": { "External id": 252728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252728, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252728, "pid": 5, "tid": 7, "ts": 1716454225457420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225396935, "dur": 10, "args": { "External id": 252728, "cbid": 211, "correlation": 252728 } }, { "ph": "s", "id": 252728, "pid": 76337, "tid": -914061504, "ts": 1716454225396935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225396994, "dur": 0, "args": { "External id": 252738, "cbid": 317, "correlation": 252738 } }, { "ph": "f", "id": 252738, "pid": 76337, "tid": -914061504, "ts": 1716454225396994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225396995, "dur": 0, "args": { "External id": 252739, "cbid": 203, "correlation": 252739 } }, { "ph": "f", "id": 252739, "pid": 76337, "tid": -914061504, "ts": 1716454225396995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225396995, "dur": 0, "args": { "External id": 252740, "cbid": 205, "correlation": 252740 } }, { "ph": "f", "id": 252740, "pid": 76337, "tid": -914061504, "ts": 1716454225396995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225457443, "dur": 22, "args": { "External id": 252744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252744, "pid": 5, "tid": 7, "ts": 1716454225457443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397009, "dur": 13, "args": { "External id": 252744, "cbid": 211, "correlation": 252744 } }, { "ph": "s", "id": 252744, "pid": 76337, "tid": -914061504, "ts": 1716454225397009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225457467, "dur": 326, "args": { "External id": 252746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252746, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252746, "pid": 5, "tid": 7, "ts": 1716454225457467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397024, "dur": 5, "args": { "External id": 252746, "cbid": 211, "correlation": 252746 } }, { "ph": "s", "id": 252746, "pid": 76337, "tid": -914061504, "ts": 1716454225397024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225457794, "dur": 1, "args": { "External id": 252748, "device": 5, "context": 1, "stream": 7, "correlation": 252748, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 252748, "pid": 5, "tid": 7, "ts": 1716454225457794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225397035, "dur": 7, "args": { "External id": 252748, "cbid": 51, "correlation": 252748 } }, { "ph": "s", "id": 252748, "pid": 76337, "tid": -914061504, "ts": 1716454225397035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225457798, "dur": 1278, "args": { "External id": 252749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252749, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252749, "pid": 5, "tid": 7, "ts": 1716454225457798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397043, "dur": 6, "args": { "External id": 252749, "cbid": 211, "correlation": 252749 } }, { "ph": "s", "id": 252749, "pid": 76337, "tid": -914061504, "ts": 1716454225397043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225459078, "dur": 14, "args": { "External id": 252751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252751, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252751, "pid": 5, "tid": 7, "ts": 1716454225459078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397054, "dur": 5, "args": { "External id": 252751, "cbid": 211, "correlation": 252751 } }, { "ph": "s", "id": 252751, "pid": 76337, "tid": -914061504, "ts": 1716454225397054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225459093, "dur": 15, "args": { "External id": 252757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252757, "pid": 5, "tid": 7, "ts": 1716454225459093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397082, "dur": 8, "args": { "External id": 252757, "cbid": 211, "correlation": 252757 } }, { "ph": "s", "id": 252757, "pid": 76337, "tid": -914061504, "ts": 1716454225397082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225459109, "dur": 3, "args": { "External id": 252765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252765, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 252765, "pid": 5, "tid": 7, "ts": 1716454225459109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397127, "dur": 9, "args": { "External id": 252765, "cbid": 211, "correlation": 252765 } }, { "ph": "s", "id": 252765, "pid": 76337, "tid": -914061504, "ts": 1716454225397127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225397191, "dur": 1, "args": { "External id": 252781, "cbid": 251, "correlation": 252781 } }, { "ph": "f", "id": 252781, "pid": 76337, "tid": -914061504, "ts": 1716454225397191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225397196, "dur": 0, "args": { "External id": 252783, "cbid": 251, "correlation": 252783 } }, { "ph": "f", "id": 252783, "pid": 76337, "tid": -914061504, "ts": 1716454225397196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225459114, "dur": 13, "args": { "External id": 252784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252784, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252784, "pid": 5, "tid": 7, "ts": 1716454225459114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397198, "dur": 11, "args": { "External id": 252784, "cbid": 211, "correlation": 252784 } }, { "ph": "s", "id": 252784, "pid": 76337, "tid": -914061504, "ts": 1716454225397198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225459129, "dur": 5, "args": { "External id": 252786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252786, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252786, "pid": 5, "tid": 7, "ts": 1716454225459129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397211, "dur": 6, "args": { "External id": 252786, "cbid": 211, "correlation": 252786 } }, { "ph": "s", "id": 252786, "pid": 76337, "tid": -914061504, "ts": 1716454225397211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225459135, "dur": 16, "args": { "External id": 252796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252796, "pid": 5, "tid": 7, "ts": 1716454225459135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397269, "dur": 13, "args": { "External id": 252796, "cbid": 211, "correlation": 252796 } }, { "ph": "s", "id": 252796, "pid": 76337, "tid": -914061504, "ts": 1716454225397269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225459153, "dur": 17, "args": { "External id": 252816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252816, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252816, "pid": 5, "tid": 7, "ts": 1716454225459153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397334, "dur": 10, "args": { "External id": 252816, "cbid": 211, "correlation": 252816 } }, { "ph": "s", "id": 252816, "pid": 76337, "tid": -914061504, "ts": 1716454225397334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225459172, "dur": 4, "args": { "External id": 252828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252828, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 252828, "pid": 5, "tid": 7, "ts": 1716454225459172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397355, "dur": 6, "args": { "External id": 252828, "cbid": 211, "correlation": 252828 } }, { "ph": "s", "id": 252828, "pid": 76337, "tid": -914061504, "ts": 1716454225397355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225459178, "dur": 17, "args": { "External id": 252831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252831, "pid": 5, "tid": 7, "ts": 1716454225459178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397374, "dur": 6, "args": { "External id": 252831, "cbid": 211, "correlation": 252831 } }, { "ph": "s", "id": 252831, "pid": 76337, "tid": -914061504, "ts": 1716454225397374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225459196, "dur": 13, "args": { "External id": 252840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252840, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252840, "pid": 5, "tid": 7, "ts": 1716454225459196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397414, "dur": 11, "args": { "External id": 252840, "cbid": 211, "correlation": 252840 } }, { "ph": "s", "id": 252840, "pid": 76337, "tid": -914061504, "ts": 1716454225397414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225397477, "dur": 0, "args": { "External id": 252850, "cbid": 317, "correlation": 252850 } }, { "ph": "f", "id": 252850, "pid": 76337, "tid": -914061504, "ts": 1716454225397477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225397478, "dur": 0, "args": { "External id": 252851, "cbid": 203, "correlation": 252851 } }, { "ph": "f", "id": 252851, "pid": 76337, "tid": -914061504, "ts": 1716454225397478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225397479, "dur": 0, "args": { "External id": 252852, "cbid": 205, "correlation": 252852 } }, { "ph": "f", "id": 252852, "pid": 76337, "tid": -914061504, "ts": 1716454225397479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225459211, "dur": 12, "args": { "External id": 252856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252856, "pid": 5, "tid": 7, "ts": 1716454225459211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397493, "dur": 12, "args": { "External id": 252856, "cbid": 211, "correlation": 252856 } }, { "ph": "s", "id": 252856, "pid": 76337, "tid": -914061504, "ts": 1716454225397493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225459224, "dur": 164, "args": { "External id": 252858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252858, "pid": 5, "tid": 7, "ts": 1716454225459224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397507, "dur": 5, "args": { "External id": 252858, "cbid": 211, "correlation": 252858 } }, { "ph": "s", "id": 252858, "pid": 76337, "tid": -914061504, "ts": 1716454225397507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225459391, "dur": 1, "args": { "External id": 252860, "device": 5, "context": 1, "stream": 7, "correlation": 252860, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 252860, "pid": 5, "tid": 7, "ts": 1716454225459391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225397518, "dur": 7, "args": { "External id": 252860, "cbid": 51, "correlation": 252860 } }, { "ph": "s", "id": 252860, "pid": 76337, "tid": -914061504, "ts": 1716454225397518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225459394, "dur": 660, "args": { "External id": 252861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252861, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 252861, "pid": 5, "tid": 7, "ts": 1716454225459394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397525, "dur": 6, "args": { "External id": 252861, "cbid": 211, "correlation": 252861 } }, { "ph": "s", "id": 252861, "pid": 76337, "tid": -914061504, "ts": 1716454225397525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225460056, "dur": 12, "args": { "External id": 252863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252863, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252863, "pid": 5, "tid": 7, "ts": 1716454225460056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397536, "dur": 5, "args": { "External id": 252863, "cbid": 211, "correlation": 252863 } }, { "ph": "s", "id": 252863, "pid": 76337, "tid": -914061504, "ts": 1716454225397536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225460069, "dur": 15, "args": { "External id": 252869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252869, "pid": 5, "tid": 7, "ts": 1716454225460069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397564, "dur": 10, "args": { "External id": 252869, "cbid": 211, "correlation": 252869 } }, { "ph": "s", "id": 252869, "pid": 76337, "tid": -914061504, "ts": 1716454225397564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225397623, "dur": 0, "args": { "External id": 252879, "cbid": 317, "correlation": 252879 } }, { "ph": "f", "id": 252879, "pid": 76337, "tid": -914061504, "ts": 1716454225397623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225397624, "dur": 0, "args": { "External id": 252880, "cbid": 203, "correlation": 252880 } }, { "ph": "f", "id": 252880, "pid": 76337, "tid": -914061504, "ts": 1716454225397624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225397625, "dur": 0, "args": { "External id": 252881, "cbid": 205, "correlation": 252881 } }, { "ph": "f", "id": 252881, "pid": 76337, "tid": -914061504, "ts": 1716454225397625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225460085, "dur": 21, "args": { "External id": 252885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252885, "pid": 5, "tid": 7, "ts": 1716454225460085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397636, "dur": 11, "args": { "External id": 252885, "cbid": 211, "correlation": 252885 } }, { "ph": "s", "id": 252885, "pid": 76337, "tid": -914061504, "ts": 1716454225397636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225460108, "dur": 4, "args": { "External id": 252887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252887, "pid": 5, "tid": 7, "ts": 1716454225460108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397651, "dur": 6, "args": { "External id": 252887, "cbid": 211, "correlation": 252887 } }, { "ph": "s", "id": 252887, "pid": 76337, "tid": -914061504, "ts": 1716454225397651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225397660, "dur": 0, "args": { "External id": 252888, "cbid": 51, "correlation": 252888 } }, { "ph": "s", "id": 252888, "pid": 76337, "tid": -914061504, "ts": 1716454225397660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225460114, "dur": 176, "args": { "External id": 252889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252889, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 252889, "pid": 5, "tid": 7, "ts": 1716454225460114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397661, "dur": 5, "args": { "External id": 252889, "cbid": 211, "correlation": 252889 } }, { "ph": "s", "id": 252889, "pid": 76337, "tid": -914061504, "ts": 1716454225397661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225460291, "dur": 16, "args": { "External id": 252894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252894, "pid": 5, "tid": 7, "ts": 1716454225460291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397686, "dur": 8, "args": { "External id": 252894, "cbid": 211, "correlation": 252894 } }, { "ph": "s", "id": 252894, "pid": 76337, "tid": -914061504, "ts": 1716454225397686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225460309, "dur": 13, "args": { "External id": 252902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252902, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252902, "pid": 5, "tid": 7, "ts": 1716454225460309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397714, "dur": 9, "args": { "External id": 252902, "cbid": 211, "correlation": 252902 } }, { "ph": "s", "id": 252902, "pid": 76337, "tid": -914061504, "ts": 1716454225397714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225460323, "dur": 10, "args": { "External id": 252910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252910, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252910, "pid": 5, "tid": 7, "ts": 1716454225460323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397743, "dur": 8, "args": { "External id": 252910, "cbid": 211, "correlation": 252910 } }, { "ph": "s", "id": 252910, "pid": 76337, "tid": -914061504, "ts": 1716454225397743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225460334, "dur": 19, "args": { "External id": 252930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252930, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 252930, "pid": 5, "tid": 7, "ts": 1716454225460334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397825, "dur": 13, "args": { "External id": 252930, "cbid": 211, "correlation": 252930 } }, { "ph": "s", "id": 252930, "pid": 76337, "tid": -914061504, "ts": 1716454225397825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225460355, "dur": 4, "args": { "External id": 252942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252942, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 252942, "pid": 5, "tid": 7, "ts": 1716454225460355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397848, "dur": 6, "args": { "External id": 252942, "cbid": 211, "correlation": 252942 } }, { "ph": "s", "id": 252942, "pid": 76337, "tid": -914061504, "ts": 1716454225397848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225460361, "dur": 17, "args": { "External id": 252945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252945, "pid": 5, "tid": 7, "ts": 1716454225460361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397866, "dur": 7, "args": { "External id": 252945, "cbid": 211, "correlation": 252945 } }, { "ph": "s", "id": 252945, "pid": 76337, "tid": -914061504, "ts": 1716454225397866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225397923, "dur": 0, "args": { "External id": 252956, "cbid": 317, "correlation": 252956 } }, { "ph": "f", "id": 252956, "pid": 76337, "tid": -914061504, "ts": 1716454225397923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225397924, "dur": 0, "args": { "External id": 252957, "cbid": 203, "correlation": 252957 } }, { "ph": "f", "id": 252957, "pid": 76337, "tid": -914061504, "ts": 1716454225397924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225397925, "dur": 0, "args": { "External id": 252958, "cbid": 205, "correlation": 252958 } }, { "ph": "f", "id": 252958, "pid": 76337, "tid": -914061504, "ts": 1716454225397925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225460379, "dur": 12, "args": { "External id": 252962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252962, "pid": 5, "tid": 7, "ts": 1716454225460379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397939, "dur": 11, "args": { "External id": 252962, "cbid": 211, "correlation": 252962 } }, { "ph": "s", "id": 252962, "pid": 76337, "tid": -914061504, "ts": 1716454225397939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225460393, "dur": 4, "args": { "External id": 252964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 252964, "pid": 5, "tid": 7, "ts": 1716454225460393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397954, "dur": 6, "args": { "External id": 252964, "cbid": 211, "correlation": 252964 } }, { "ph": "s", "id": 252964, "pid": 76337, "tid": -914061504, "ts": 1716454225397954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225397963, "dur": 0, "args": { "External id": 252965, "cbid": 51, "correlation": 252965 } }, { "ph": "s", "id": 252965, "pid": 76337, "tid": -914061504, "ts": 1716454225397963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225460398, "dur": 93, "args": { "External id": 252966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252966, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 252966, "pid": 5, "tid": 7, "ts": 1716454225460398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397964, "dur": 5, "args": { "External id": 252966, "cbid": 211, "correlation": 252966 } }, { "ph": "s", "id": 252966, "pid": 76337, "tid": -914061504, "ts": 1716454225397964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225460492, "dur": 16, "args": { "External id": 252971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252971, "pid": 5, "tid": 7, "ts": 1716454225460492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225397998, "dur": 10, "args": { "External id": 252971, "cbid": 211, "correlation": 252971 } }, { "ph": "s", "id": 252971, "pid": 76337, "tid": -914061504, "ts": 1716454225397998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225460509, "dur": 85, "args": { "External id": 252980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 252980, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 252980, "pid": 5, "tid": 7, "ts": 1716454225460509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398081, "dur": 14, "args": { "External id": 252980, "cbid": 211, "correlation": 252980 } }, { "ph": "s", "id": 252980, "pid": 76337, "tid": -914061504, "ts": 1716454225398081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225460595, "dur": 31, "args": { "External id": 253002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253002, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253002, "pid": 5, "tid": 7, "ts": 1716454225460595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398137, "dur": 10, "args": { "External id": 253002, "cbid": 211, "correlation": 253002 } }, { "ph": "s", "id": 253002, "pid": 76337, "tid": -914061504, "ts": 1716454225398137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398225, "dur": 1, "args": { "External id": 253013, "cbid": 251, "correlation": 253013 } }, { "ph": "f", "id": 253013, "pid": 76337, "tid": -914061504, "ts": 1716454225398225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225460627, "dur": 168, "args": { "External id": 253014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253014, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253014, "pid": 5, "tid": 7, "ts": 1716454225460627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398231, "dur": 13, "args": { "External id": 253014, "cbid": 211, "correlation": 253014 } }, { "ph": "s", "id": 253014, "pid": 76337, "tid": -914061504, "ts": 1716454225398231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398301, "dur": 1, "args": { "External id": 253025, "cbid": 251, "correlation": 253025 } }, { "ph": "f", "id": 253025, "pid": 76337, "tid": -914061504, "ts": 1716454225398301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225460796, "dur": 160, "args": { "External id": 253026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253026, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253026, "pid": 5, "tid": 7, "ts": 1716454225460796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398305, "dur": 11, "args": { "External id": 253026, "cbid": 211, "correlation": 253026 } }, { "ph": "s", "id": 253026, "pid": 76337, "tid": -914061504, "ts": 1716454225398305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398371, "dur": 1, "args": { "External id": 253037, "cbid": 251, "correlation": 253037 } }, { "ph": "f", "id": 253037, "pid": 76337, "tid": -914061504, "ts": 1716454225398371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225460957, "dur": 161, "args": { "External id": 253038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253038, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253038, "pid": 5, "tid": 7, "ts": 1716454225460957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398375, "dur": 12, "args": { "External id": 253038, "cbid": 211, "correlation": 253038 } }, { "ph": "s", "id": 253038, "pid": 76337, "tid": -914061504, "ts": 1716454225398375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225461120, "dur": 341, "args": { "External id": 253063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253063, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253063, "pid": 5, "tid": 7, "ts": 1716454225461120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398460, "dur": 13, "args": { "External id": 253063, "cbid": 211, "correlation": 253063 } }, { "ph": "s", "id": 253063, "pid": 76337, "tid": -914061504, "ts": 1716454225398460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398559, "dur": 1, "args": { "External id": 253081, "cbid": 251, "correlation": 253081 } }, { "ph": "f", "id": 253081, "pid": 76337, "tid": -914061504, "ts": 1716454225398559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225461462, "dur": 169, "args": { "External id": 253083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253083, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253083, "pid": 5, "tid": 7, "ts": 1716454225461462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398565, "dur": 14, "args": { "External id": 253083, "cbid": 211, "correlation": 253083 } }, { "ph": "s", "id": 253083, "pid": 76337, "tid": -914061504, "ts": 1716454225398565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225461632, "dur": 19, "args": { "External id": 253091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253091, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253091, "pid": 5, "tid": 7, "ts": 1716454225461632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398635, "dur": 12, "args": { "External id": 253091, "cbid": 211, "correlation": 253091 } }, { "ph": "s", "id": 253091, "pid": 76337, "tid": -914061504, "ts": 1716454225398635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225461653, "dur": 28, "args": { "External id": 253099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253099, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253099, "pid": 5, "tid": 7, "ts": 1716454225461653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398674, "dur": 9, "args": { "External id": 253099, "cbid": 211, "correlation": 253099 } }, { "ph": "s", "id": 253099, "pid": 76337, "tid": -914061504, "ts": 1716454225398674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225461682, "dur": 19, "args": { "External id": 253110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253110, "pid": 5, "tid": 7, "ts": 1716454225461682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398746, "dur": 12, "args": { "External id": 253110, "cbid": 211, "correlation": 253110 } }, { "ph": "s", "id": 253110, "pid": 76337, "tid": -914061504, "ts": 1716454225398746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225461702, "dur": 17, "args": { "External id": 253132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253132, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253132, "pid": 5, "tid": 7, "ts": 1716454225461702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398777, "dur": 8, "args": { "External id": 253132, "cbid": 211, "correlation": 253132 } }, { "ph": "s", "id": 253132, "pid": 76337, "tid": -914061504, "ts": 1716454225398777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398862, "dur": 1, "args": { "External id": 253143, "cbid": 251, "correlation": 253143 } }, { "ph": "f", "id": 253143, "pid": 76337, "tid": -914061504, "ts": 1716454225398862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225461720, "dur": 91, "args": { "External id": 253144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253144, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253144, "pid": 5, "tid": 7, "ts": 1716454225461720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398867, "dur": 13, "args": { "External id": 253144, "cbid": 211, "correlation": 253144 } }, { "ph": "s", "id": 253144, "pid": 76337, "tid": -914061504, "ts": 1716454225398867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398938, "dur": 1, "args": { "External id": 253155, "cbid": 251, "correlation": 253155 } }, { "ph": "f", "id": 253155, "pid": 76337, "tid": -914061504, "ts": 1716454225398938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225398942, "dur": 0, "args": { "External id": 253156, "cbid": 251, "correlation": 253156 } }, { "ph": "f", "id": 253156, "pid": 76337, "tid": -914061504, "ts": 1716454225398942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225461812, "dur": 13, "args": { "External id": 253157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253157, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253157, "pid": 5, "tid": 7, "ts": 1716454225461812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398944, "dur": 12, "args": { "External id": 253157, "cbid": 211, "correlation": 253157 } }, { "ph": "s", "id": 253157, "pid": 76337, "tid": -914061504, "ts": 1716454225398944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225461826, "dur": 6, "args": { "External id": 253159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253159, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253159, "pid": 5, "tid": 7, "ts": 1716454225461826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225398958, "dur": 6, "args": { "External id": 253159, "cbid": 211, "correlation": 253159 } }, { "ph": "s", "id": 253159, "pid": 76337, "tid": -914061504, "ts": 1716454225398958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399024, "dur": 1, "args": { "External id": 253170, "cbid": 251, "correlation": 253170 } }, { "ph": "f", "id": 253170, "pid": 76337, "tid": -914061504, "ts": 1716454225399024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399028, "dur": 0, "args": { "External id": 253171, "cbid": 251, "correlation": 253171 } }, { "ph": "f", "id": 253171, "pid": 76337, "tid": -914061504, "ts": 1716454225399028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225461833, "dur": 8, "args": { "External id": 253172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253172, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253172, "pid": 5, "tid": 7, "ts": 1716454225461833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399029, "dur": 13, "args": { "External id": 253172, "cbid": 211, "correlation": 253172 } }, { "ph": "s", "id": 253172, "pid": 76337, "tid": -914061504, "ts": 1716454225399029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225461843, "dur": 4, "args": { "External id": 253174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253174, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253174, "pid": 5, "tid": 7, "ts": 1716454225461843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399044, "dur": 5, "args": { "External id": 253174, "cbid": 211, "correlation": 253174 } }, { "ph": "s", "id": 253174, "pid": 76337, "tid": -914061504, "ts": 1716454225399044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225461848, "dur": 56, "args": { "External id": 253199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253199, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253199, "pid": 5, "tid": 7, "ts": 1716454225461848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399122, "dur": 13, "args": { "External id": 253199, "cbid": 211, "correlation": 253199 } }, { "ph": "s", "id": 253199, "pid": 76337, "tid": -914061504, "ts": 1716454225399122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399221, "dur": 1, "args": { "External id": 253217, "cbid": 251, "correlation": 253217 } }, { "ph": "f", "id": 253217, "pid": 76337, "tid": -914061504, "ts": 1716454225399221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225461905, "dur": 93, "args": { "External id": 253219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253219, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253219, "pid": 5, "tid": 7, "ts": 1716454225461905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399228, "dur": 14, "args": { "External id": 253219, "cbid": 211, "correlation": 253219 } }, { "ph": "s", "id": 253219, "pid": 76337, "tid": -914061504, "ts": 1716454225399228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225462000, "dur": 9, "args": { "External id": 253227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253227, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253227, "pid": 5, "tid": 7, "ts": 1716454225462000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399297, "dur": 12, "args": { "External id": 253227, "cbid": 211, "correlation": 253227 } }, { "ph": "s", "id": 253227, "pid": 76337, "tid": -914061504, "ts": 1716454225399297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225462010, "dur": 21, "args": { "External id": 253235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253235, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253235, "pid": 5, "tid": 7, "ts": 1716454225462010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399338, "dur": 9, "args": { "External id": 253235, "cbid": 211, "correlation": 253235 } }, { "ph": "s", "id": 253235, "pid": 76337, "tid": -914061504, "ts": 1716454225399338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225462033, "dur": 18, "args": { "External id": 253257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253257, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253257, "pid": 5, "tid": 7, "ts": 1716454225462033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399390, "dur": 10, "args": { "External id": 253257, "cbid": 211, "correlation": 253257 } }, { "ph": "s", "id": 253257, "pid": 76337, "tid": -914061504, "ts": 1716454225399390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399476, "dur": 1, "args": { "External id": 253273, "cbid": 251, "correlation": 253273 } }, { "ph": "f", "id": 253273, "pid": 76337, "tid": -914061504, "ts": 1716454225399476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399481, "dur": 0, "args": { "External id": 253275, "cbid": 251, "correlation": 253275 } }, { "ph": "f", "id": 253275, "pid": 76337, "tid": -914061504, "ts": 1716454225399481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225462052, "dur": 502, "args": { "External id": 253276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253276, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253276, "pid": 5, "tid": 7, "ts": 1716454225462052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399483, "dur": 14, "args": { "External id": 253276, "cbid": 211, "correlation": 253276 } }, { "ph": "s", "id": 253276, "pid": 76337, "tid": -914061504, "ts": 1716454225399483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225462555, "dur": 66, "args": { "External id": 253284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253284, "pid": 5, "tid": 7, "ts": 1716454225462555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399549, "dur": 12, "args": { "External id": 253284, "cbid": 211, "correlation": 253284 } }, { "ph": "s", "id": 253284, "pid": 76337, "tid": -914061504, "ts": 1716454225399549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225462622, "dur": 67, "args": { "External id": 253292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253292, "pid": 5, "tid": 7, "ts": 1716454225462622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399580, "dur": 9, "args": { "External id": 253292, "cbid": 211, "correlation": 253292 } }, { "ph": "s", "id": 253292, "pid": 76337, "tid": -914061504, "ts": 1716454225399580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225399661, "dur": 1, "args": { "External id": 253308, "cbid": 251, "correlation": 253308 } }, { "ph": "f", "id": 253308, "pid": 76337, "tid": -914061504, "ts": 1716454225399661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225462691, "dur": 1, "args": { "External id": 253310, "device": 5, "context": 1, "stream": 7, "correlation": 253310, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 253310, "pid": 5, "tid": 7, "ts": 1716454225462691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225399666, "dur": 9, "args": { "External id": 253310, "cbid": 51, "correlation": 253310 } }, { "ph": "s", "id": 253310, "pid": 76337, "tid": -914061504, "ts": 1716454225399666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225462695, "dur": 273, "args": { "External id": 253311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253311, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253311, "pid": 5, "tid": 7, "ts": 1716454225462695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399677, "dur": 12, "args": { "External id": 253311, "cbid": 211, "correlation": 253311 } }, { "ph": "s", "id": 253311, "pid": 76337, "tid": -914061504, "ts": 1716454225399677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225462969, "dur": 14, "args": { "External id": 253319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253319, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253319, "pid": 5, "tid": 7, "ts": 1716454225462969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399719, "dur": 10, "args": { "External id": 253319, "cbid": 211, "correlation": 253319 } }, { "ph": "s", "id": 253319, "pid": 76337, "tid": -914061504, "ts": 1716454225399719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225462984, "dur": 39, "args": { "External id": 253330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253330, "pid": 5, "tid": 7, "ts": 1716454225462984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399787, "dur": 12, "args": { "External id": 253330, "cbid": 211, "correlation": 253330 } }, { "ph": "s", "id": 253330, "pid": 76337, "tid": -914061504, "ts": 1716454225399787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225399851, "dur": 0, "args": { "External id": 253342, "cbid": 317, "correlation": 253342 } }, { "ph": "f", "id": 253342, "pid": 76337, "tid": -914061504, "ts": 1716454225399851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225399852, "dur": 0, "args": { "External id": 253343, "cbid": 203, "correlation": 253343 } }, { "ph": "f", "id": 253343, "pid": 76337, "tid": -914061504, "ts": 1716454225399852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225399852, "dur": 0, "args": { "External id": 253344, "cbid": 205, "correlation": 253344 } }, { "ph": "f", "id": 253344, "pid": 76337, "tid": -914061504, "ts": 1716454225399852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225463025, "dur": 13, "args": { "External id": 253348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253348, "pid": 5, "tid": 7, "ts": 1716454225463025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399866, "dur": 12, "args": { "External id": 253348, "cbid": 211, "correlation": 253348 } }, { "ph": "s", "id": 253348, "pid": 76337, "tid": -914061504, "ts": 1716454225399866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225463039, "dur": 4, "args": { "External id": 253350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253350, "pid": 5, "tid": 7, "ts": 1716454225463039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399882, "dur": 6, "args": { "External id": 253350, "cbid": 211, "correlation": 253350 } }, { "ph": "s", "id": 253350, "pid": 76337, "tid": -914061504, "ts": 1716454225399882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225399892, "dur": 0, "args": { "External id": 253351, "cbid": 51, "correlation": 253351 } }, { "ph": "s", "id": 253351, "pid": 76337, "tid": -914061504, "ts": 1716454225399892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225463045, "dur": 98, "args": { "External id": 253352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253352, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 253352, "pid": 5, "tid": 7, "ts": 1716454225463045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399893, "dur": 5, "args": { "External id": 253352, "cbid": 211, "correlation": 253352 } }, { "ph": "s", "id": 253352, "pid": 76337, "tid": -914061504, "ts": 1716454225399893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225463144, "dur": 17, "args": { "External id": 253357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253357, "pid": 5, "tid": 7, "ts": 1716454225463144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399920, "dur": 9, "args": { "External id": 253357, "cbid": 211, "correlation": 253357 } }, { "ph": "s", "id": 253357, "pid": 76337, "tid": -914061504, "ts": 1716454225399920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225463163, "dur": 12, "args": { "External id": 253365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253365, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253365, "pid": 5, "tid": 7, "ts": 1716454225463163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225399953, "dur": 8, "args": { "External id": 253365, "cbid": 211, "correlation": 253365 } }, { "ph": "s", "id": 253365, "pid": 76337, "tid": -914061504, "ts": 1716454225399953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225463176, "dur": 27, "args": { "External id": 253374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253374, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253374, "pid": 5, "tid": 7, "ts": 1716454225463176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400000, "dur": 11, "args": { "External id": 253374, "cbid": 211, "correlation": 253374 } }, { "ph": "s", "id": 253374, "pid": 76337, "tid": -914061504, "ts": 1716454225400000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225463204, "dur": 25, "args": { "External id": 253394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253394, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 253394, "pid": 5, "tid": 7, "ts": 1716454225463204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400073, "dur": 11, "args": { "External id": 253394, "cbid": 211, "correlation": 253394 } }, { "ph": "s", "id": 253394, "pid": 76337, "tid": -914061504, "ts": 1716454225400073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225463231, "dur": 5, "args": { "External id": 253406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253406, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 253406, "pid": 5, "tid": 7, "ts": 1716454225463231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400095, "dur": 7, "args": { "External id": 253406, "cbid": 211, "correlation": 253406 } }, { "ph": "s", "id": 253406, "pid": 76337, "tid": -914061504, "ts": 1716454225400095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225463237, "dur": 24, "args": { "External id": 253409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253409, "pid": 5, "tid": 7, "ts": 1716454225463237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400114, "dur": 6, "args": { "External id": 253409, "cbid": 211, "correlation": 253409 } }, { "ph": "s", "id": 253409, "pid": 76337, "tid": -914061504, "ts": 1716454225400114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225463262, "dur": 17, "args": { "External id": 253418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253418, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253418, "pid": 5, "tid": 7, "ts": 1716454225463262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400153, "dur": 10, "args": { "External id": 253418, "cbid": 211, "correlation": 253418 } }, { "ph": "s", "id": 253418, "pid": 76337, "tid": -914061504, "ts": 1716454225400153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225400204, "dur": 0, "args": { "External id": 253428, "cbid": 317, "correlation": 253428 } }, { "ph": "f", "id": 253428, "pid": 76337, "tid": -914061504, "ts": 1716454225400204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225400205, "dur": 0, "args": { "External id": 253429, "cbid": 203, "correlation": 253429 } }, { "ph": "f", "id": 253429, "pid": 76337, "tid": -914061504, "ts": 1716454225400205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225400206, "dur": 0, "args": { "External id": 253430, "cbid": 205, "correlation": 253430 } }, { "ph": "f", "id": 253430, "pid": 76337, "tid": -914061504, "ts": 1716454225400206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225463281, "dur": 17, "args": { "External id": 253434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253434, "pid": 5, "tid": 7, "ts": 1716454225463281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400220, "dur": 11, "args": { "External id": 253434, "cbid": 211, "correlation": 253434 } }, { "ph": "s", "id": 253434, "pid": 76337, "tid": -914061504, "ts": 1716454225400220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225463300, "dur": 246, "args": { "External id": 253436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253436, "pid": 5, "tid": 7, "ts": 1716454225463300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400234, "dur": 5, "args": { "External id": 253436, "cbid": 211, "correlation": 253436 } }, { "ph": "s", "id": 253436, "pid": 76337, "tid": -914061504, "ts": 1716454225400234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225463548, "dur": 1, "args": { "External id": 253438, "device": 5, "context": 1, "stream": 7, "correlation": 253438, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 253438, "pid": 5, "tid": 7, "ts": 1716454225463548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225400246, "dur": 8, "args": { "External id": 253438, "cbid": 51, "correlation": 253438 } }, { "ph": "s", "id": 253438, "pid": 76337, "tid": -914061504, "ts": 1716454225400246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225463551, "dur": 823, "args": { "External id": 253439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253439, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253439, "pid": 5, "tid": 7, "ts": 1716454225463551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400255, "dur": 6, "args": { "External id": 253439, "cbid": 211, "correlation": 253439 } }, { "ph": "s", "id": 253439, "pid": 76337, "tid": -914061504, "ts": 1716454225400255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225464376, "dur": 14, "args": { "External id": 253441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253441, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253441, "pid": 5, "tid": 7, "ts": 1716454225464376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400266, "dur": 5, "args": { "External id": 253441, "cbid": 211, "correlation": 253441 } }, { "ph": "s", "id": 253441, "pid": 76337, "tid": -914061504, "ts": 1716454225400266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225464391, "dur": 15, "args": { "External id": 253447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253447, "pid": 5, "tid": 7, "ts": 1716454225464391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400294, "dur": 8, "args": { "External id": 253447, "cbid": 211, "correlation": 253447 } }, { "ph": "s", "id": 253447, "pid": 76337, "tid": -914061504, "ts": 1716454225400294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225464407, "dur": 3, "args": { "External id": 253455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253455, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 253455, "pid": 5, "tid": 7, "ts": 1716454225464407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400337, "dur": 10, "args": { "External id": 253455, "cbid": 211, "correlation": 253455 } }, { "ph": "s", "id": 253455, "pid": 76337, "tid": -914061504, "ts": 1716454225400337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225400401, "dur": 1, "args": { "External id": 253471, "cbid": 251, "correlation": 253471 } }, { "ph": "f", "id": 253471, "pid": 76337, "tid": -914061504, "ts": 1716454225400401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225400407, "dur": 0, "args": { "External id": 253473, "cbid": 251, "correlation": 253473 } }, { "ph": "f", "id": 253473, "pid": 76337, "tid": -914061504, "ts": 1716454225400407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225464412, "dur": 13, "args": { "External id": 253474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253474, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253474, "pid": 5, "tid": 7, "ts": 1716454225464412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400409, "dur": 11, "args": { "External id": 253474, "cbid": 211, "correlation": 253474 } }, { "ph": "s", "id": 253474, "pid": 76337, "tid": -914061504, "ts": 1716454225400409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225464427, "dur": 5, "args": { "External id": 253476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253476, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253476, "pid": 5, "tid": 7, "ts": 1716454225464427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400421, "dur": 5, "args": { "External id": 253476, "cbid": 211, "correlation": 253476 } }, { "ph": "s", "id": 253476, "pid": 76337, "tid": -914061504, "ts": 1716454225400421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225464433, "dur": 17, "args": { "External id": 253486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253486, "pid": 5, "tid": 7, "ts": 1716454225464433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400479, "dur": 13, "args": { "External id": 253486, "cbid": 211, "correlation": 253486 } }, { "ph": "s", "id": 253486, "pid": 76337, "tid": -914061504, "ts": 1716454225400479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225464452, "dur": 18, "args": { "External id": 253506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253506, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 253506, "pid": 5, "tid": 7, "ts": 1716454225464452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400545, "dur": 10, "args": { "External id": 253506, "cbid": 211, "correlation": 253506 } }, { "ph": "s", "id": 253506, "pid": 76337, "tid": -914061504, "ts": 1716454225400545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225464471, "dur": 4, "args": { "External id": 253518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253518, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 253518, "pid": 5, "tid": 7, "ts": 1716454225464471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400565, "dur": 6, "args": { "External id": 253518, "cbid": 211, "correlation": 253518 } }, { "ph": "s", "id": 253518, "pid": 76337, "tid": -914061504, "ts": 1716454225400565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225464476, "dur": 17, "args": { "External id": 253521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253521, "pid": 5, "tid": 7, "ts": 1716454225464476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400584, "dur": 7, "args": { "External id": 253521, "cbid": 211, "correlation": 253521 } }, { "ph": "s", "id": 253521, "pid": 76337, "tid": -914061504, "ts": 1716454225400584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225464494, "dur": 11, "args": { "External id": 253530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253530, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253530, "pid": 5, "tid": 7, "ts": 1716454225464494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400625, "dur": 10, "args": { "External id": 253530, "cbid": 211, "correlation": 253530 } }, { "ph": "s", "id": 253530, "pid": 76337, "tid": -914061504, "ts": 1716454225400625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225400687, "dur": 0, "args": { "External id": 253540, "cbid": 317, "correlation": 253540 } }, { "ph": "f", "id": 253540, "pid": 76337, "tid": -914061504, "ts": 1716454225400687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225400688, "dur": 0, "args": { "External id": 253541, "cbid": 203, "correlation": 253541 } }, { "ph": "f", "id": 253541, "pid": 76337, "tid": -914061504, "ts": 1716454225400688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225400689, "dur": 0, "args": { "External id": 253542, "cbid": 205, "correlation": 253542 } }, { "ph": "f", "id": 253542, "pid": 76337, "tid": -914061504, "ts": 1716454225400689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225464507, "dur": 11, "args": { "External id": 253546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253546, "pid": 5, "tid": 7, "ts": 1716454225464507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400704, "dur": 12, "args": { "External id": 253546, "cbid": 211, "correlation": 253546 } }, { "ph": "s", "id": 253546, "pid": 76337, "tid": -914061504, "ts": 1716454225400704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225464519, "dur": 166, "args": { "External id": 253548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253548, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253548, "pid": 5, "tid": 7, "ts": 1716454225464519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400718, "dur": 5, "args": { "External id": 253548, "cbid": 211, "correlation": 253548 } }, { "ph": "s", "id": 253548, "pid": 76337, "tid": -914061504, "ts": 1716454225400718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225464687, "dur": 1, "args": { "External id": 253550, "device": 5, "context": 1, "stream": 7, "correlation": 253550, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 253550, "pid": 5, "tid": 7, "ts": 1716454225464687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225400729, "dur": 6, "args": { "External id": 253550, "cbid": 51, "correlation": 253550 } }, { "ph": "s", "id": 253550, "pid": 76337, "tid": -914061504, "ts": 1716454225400729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225464691, "dur": 659, "args": { "External id": 253551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253551, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253551, "pid": 5, "tid": 7, "ts": 1716454225464691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400736, "dur": 6, "args": { "External id": 253551, "cbid": 211, "correlation": 253551 } }, { "ph": "s", "id": 253551, "pid": 76337, "tid": -914061504, "ts": 1716454225400736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225465351, "dur": 13, "args": { "External id": 253553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253553, "pid": 5, "tid": 7, "ts": 1716454225465351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400747, "dur": 5, "args": { "External id": 253553, "cbid": 211, "correlation": 253553 } }, { "ph": "s", "id": 253553, "pid": 76337, "tid": -914061504, "ts": 1716454225400747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225465366, "dur": 15, "args": { "External id": 253559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253559, "pid": 5, "tid": 7, "ts": 1716454225465366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400775, "dur": 9, "args": { "External id": 253559, "cbid": 211, "correlation": 253559 } }, { "ph": "s", "id": 253559, "pid": 76337, "tid": -914061504, "ts": 1716454225400775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225400833, "dur": 0, "args": { "External id": 253569, "cbid": 317, "correlation": 253569 } }, { "ph": "f", "id": 253569, "pid": 76337, "tid": -914061504, "ts": 1716454225400833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225400834, "dur": 0, "args": { "External id": 253570, "cbid": 203, "correlation": 253570 } }, { "ph": "f", "id": 253570, "pid": 76337, "tid": -914061504, "ts": 1716454225400834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225400835, "dur": 0, "args": { "External id": 253571, "cbid": 205, "correlation": 253571 } }, { "ph": "f", "id": 253571, "pid": 76337, "tid": -914061504, "ts": 1716454225400835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225465383, "dur": 18, "args": { "External id": 253575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253575, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253575, "pid": 5, "tid": 7, "ts": 1716454225465383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400849, "dur": 12, "args": { "External id": 253575, "cbid": 211, "correlation": 253575 } }, { "ph": "s", "id": 253575, "pid": 76337, "tid": -914061504, "ts": 1716454225400849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225465402, "dur": 4, "args": { "External id": 253577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253577, "pid": 5, "tid": 7, "ts": 1716454225465402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400865, "dur": 6, "args": { "External id": 253577, "cbid": 211, "correlation": 253577 } }, { "ph": "s", "id": 253577, "pid": 76337, "tid": -914061504, "ts": 1716454225400865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225400874, "dur": 0, "args": { "External id": 253578, "cbid": 51, "correlation": 253578 } }, { "ph": "s", "id": 253578, "pid": 76337, "tid": -914061504, "ts": 1716454225400874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225465407, "dur": 135, "args": { "External id": 253579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253579, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 253579, "pid": 5, "tid": 7, "ts": 1716454225465407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400875, "dur": 5, "args": { "External id": 253579, "cbid": 211, "correlation": 253579 } }, { "ph": "s", "id": 253579, "pid": 76337, "tid": -914061504, "ts": 1716454225400875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225465544, "dur": 16, "args": { "External id": 253584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253584, "pid": 5, "tid": 7, "ts": 1716454225465544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400901, "dur": 8, "args": { "External id": 253584, "cbid": 211, "correlation": 253584 } }, { "ph": "s", "id": 253584, "pid": 76337, "tid": -914061504, "ts": 1716454225400901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225465561, "dur": 12, "args": { "External id": 253592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253592, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253592, "pid": 5, "tid": 7, "ts": 1716454225465561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400929, "dur": 8, "args": { "External id": 253592, "cbid": 211, "correlation": 253592 } }, { "ph": "s", "id": 253592, "pid": 76337, "tid": -914061504, "ts": 1716454225400929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225465575, "dur": 11, "args": { "External id": 253600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253600, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253600, "pid": 5, "tid": 7, "ts": 1716454225465575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225400958, "dur": 8, "args": { "External id": 253600, "cbid": 211, "correlation": 253600 } }, { "ph": "s", "id": 253600, "pid": 76337, "tid": -914061504, "ts": 1716454225400958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225465586, "dur": 19, "args": { "External id": 253620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253620, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 253620, "pid": 5, "tid": 7, "ts": 1716454225465586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401051, "dur": 13, "args": { "External id": 253620, "cbid": 211, "correlation": 253620 } }, { "ph": "s", "id": 253620, "pid": 76337, "tid": -914061504, "ts": 1716454225401051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225465607, "dur": 4, "args": { "External id": 253632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253632, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 253632, "pid": 5, "tid": 7, "ts": 1716454225465607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401073, "dur": 6, "args": { "External id": 253632, "cbid": 211, "correlation": 253632 } }, { "ph": "s", "id": 253632, "pid": 76337, "tid": -914061504, "ts": 1716454225401073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225465612, "dur": 17, "args": { "External id": 253635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253635, "pid": 5, "tid": 7, "ts": 1716454225465612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401091, "dur": 7, "args": { "External id": 253635, "cbid": 211, "correlation": 253635 } }, { "ph": "s", "id": 253635, "pid": 76337, "tid": -914061504, "ts": 1716454225401091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225401150, "dur": 0, "args": { "External id": 253646, "cbid": 317, "correlation": 253646 } }, { "ph": "f", "id": 253646, "pid": 76337, "tid": -914061504, "ts": 1716454225401150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225401151, "dur": 0, "args": { "External id": 253647, "cbid": 203, "correlation": 253647 } }, { "ph": "f", "id": 253647, "pid": 76337, "tid": -914061504, "ts": 1716454225401151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225401152, "dur": 0, "args": { "External id": 253648, "cbid": 205, "correlation": 253648 } }, { "ph": "f", "id": 253648, "pid": 76337, "tid": -914061504, "ts": 1716454225401152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225465631, "dur": 12, "args": { "External id": 253652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253652, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253652, "pid": 5, "tid": 7, "ts": 1716454225465631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401165, "dur": 12, "args": { "External id": 253652, "cbid": 211, "correlation": 253652 } }, { "ph": "s", "id": 253652, "pid": 76337, "tid": -914061504, "ts": 1716454225401165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225465644, "dur": 3, "args": { "External id": 253654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253654, "pid": 5, "tid": 7, "ts": 1716454225465644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401181, "dur": 5, "args": { "External id": 253654, "cbid": 211, "correlation": 253654 } }, { "ph": "s", "id": 253654, "pid": 76337, "tid": -914061504, "ts": 1716454225401181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225401189, "dur": 0, "args": { "External id": 253655, "cbid": 51, "correlation": 253655 } }, { "ph": "s", "id": 253655, "pid": 76337, "tid": -914061504, "ts": 1716454225401189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225465649, "dur": 92, "args": { "External id": 253656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253656, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 253656, "pid": 5, "tid": 7, "ts": 1716454225465649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401190, "dur": 5, "args": { "External id": 253656, "cbid": 211, "correlation": 253656 } }, { "ph": "s", "id": 253656, "pid": 76337, "tid": -914061504, "ts": 1716454225401190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225465742, "dur": 16, "args": { "External id": 253661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253661, "pid": 5, "tid": 7, "ts": 1716454225465742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401217, "dur": 8, "args": { "External id": 253661, "cbid": 211, "correlation": 253661 } }, { "ph": "s", "id": 253661, "pid": 76337, "tid": -914061504, "ts": 1716454225401217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225465759, "dur": 86, "args": { "External id": 253670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253670, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253670, "pid": 5, "tid": 7, "ts": 1716454225465759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401298, "dur": 13, "args": { "External id": 253670, "cbid": 211, "correlation": 253670 } }, { "ph": "s", "id": 253670, "pid": 76337, "tid": -914061504, "ts": 1716454225401298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225465846, "dur": 31, "args": { "External id": 253692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253692, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253692, "pid": 5, "tid": 7, "ts": 1716454225465846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401356, "dur": 10, "args": { "External id": 253692, "cbid": 211, "correlation": 253692 } }, { "ph": "s", "id": 253692, "pid": 76337, "tid": -914061504, "ts": 1716454225401356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225401444, "dur": 2, "args": { "External id": 253703, "cbid": 251, "correlation": 253703 } }, { "ph": "f", "id": 253703, "pid": 76337, "tid": -914061504, "ts": 1716454225401444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225465879, "dur": 171, "args": { "External id": 253704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253704, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253704, "pid": 5, "tid": 7, "ts": 1716454225465879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401450, "dur": 13, "args": { "External id": 253704, "cbid": 211, "correlation": 253704 } }, { "ph": "s", "id": 253704, "pid": 76337, "tid": -914061504, "ts": 1716454225401450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225401519, "dur": 1, "args": { "External id": 253715, "cbid": 251, "correlation": 253715 } }, { "ph": "f", "id": 253715, "pid": 76337, "tid": -914061504, "ts": 1716454225401519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225466051, "dur": 160, "args": { "External id": 253716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253716, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253716, "pid": 5, "tid": 7, "ts": 1716454225466051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401523, "dur": 11, "args": { "External id": 253716, "cbid": 211, "correlation": 253716 } }, { "ph": "s", "id": 253716, "pid": 76337, "tid": -914061504, "ts": 1716454225401523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225401587, "dur": 1, "args": { "External id": 253727, "cbid": 251, "correlation": 253727 } }, { "ph": "f", "id": 253727, "pid": 76337, "tid": -914061504, "ts": 1716454225401587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225466213, "dur": 160, "args": { "External id": 253728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253728, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253728, "pid": 5, "tid": 7, "ts": 1716454225466213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401590, "dur": 12, "args": { "External id": 253728, "cbid": 211, "correlation": 253728 } }, { "ph": "s", "id": 253728, "pid": 76337, "tid": -914061504, "ts": 1716454225401590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225466374, "dur": 342, "args": { "External id": 253753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253753, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253753, "pid": 5, "tid": 7, "ts": 1716454225466374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401675, "dur": 12, "args": { "External id": 253753, "cbid": 211, "correlation": 253753 } }, { "ph": "s", "id": 253753, "pid": 76337, "tid": -914061504, "ts": 1716454225401675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225401774, "dur": 1, "args": { "External id": 253771, "cbid": 251, "correlation": 253771 } }, { "ph": "f", "id": 253771, "pid": 76337, "tid": -914061504, "ts": 1716454225401774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225466718, "dur": 170, "args": { "External id": 253773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253773, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253773, "pid": 5, "tid": 7, "ts": 1716454225466718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401779, "dur": 13, "args": { "External id": 253773, "cbid": 211, "correlation": 253773 } }, { "ph": "s", "id": 253773, "pid": 76337, "tid": -914061504, "ts": 1716454225401779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225466889, "dur": 19, "args": { "External id": 253781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253781, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253781, "pid": 5, "tid": 7, "ts": 1716454225466889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401849, "dur": 12, "args": { "External id": 253781, "cbid": 211, "correlation": 253781 } }, { "ph": "s", "id": 253781, "pid": 76337, "tid": -914061504, "ts": 1716454225401849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225466910, "dur": 28, "args": { "External id": 253789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253789, "pid": 5, "tid": 7, "ts": 1716454225466910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401887, "dur": 9, "args": { "External id": 253789, "cbid": 211, "correlation": 253789 } }, { "ph": "s", "id": 253789, "pid": 76337, "tid": -914061504, "ts": 1716454225401887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225466939, "dur": 18, "args": { "External id": 253800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253800, "pid": 5, "tid": 7, "ts": 1716454225466939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401959, "dur": 13, "args": { "External id": 253800, "cbid": 211, "correlation": 253800 } }, { "ph": "s", "id": 253800, "pid": 76337, "tid": -914061504, "ts": 1716454225401959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225466958, "dur": 17, "args": { "External id": 253822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253822, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253822, "pid": 5, "tid": 7, "ts": 1716454225466958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225401999, "dur": 9, "args": { "External id": 253822, "cbid": 211, "correlation": 253822 } }, { "ph": "s", "id": 253822, "pid": 76337, "tid": -914061504, "ts": 1716454225401999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402086, "dur": 1, "args": { "External id": 253833, "cbid": 251, "correlation": 253833 } }, { "ph": "f", "id": 253833, "pid": 76337, "tid": -914061504, "ts": 1716454225402086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225466976, "dur": 91, "args": { "External id": 253834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253834, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253834, "pid": 5, "tid": 7, "ts": 1716454225466976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402091, "dur": 14, "args": { "External id": 253834, "cbid": 211, "correlation": 253834 } }, { "ph": "s", "id": 253834, "pid": 76337, "tid": -914061504, "ts": 1716454225402091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402161, "dur": 1, "args": { "External id": 253845, "cbid": 251, "correlation": 253845 } }, { "ph": "f", "id": 253845, "pid": 76337, "tid": -914061504, "ts": 1716454225402161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402164, "dur": 0, "args": { "External id": 253846, "cbid": 251, "correlation": 253846 } }, { "ph": "f", "id": 253846, "pid": 76337, "tid": -914061504, "ts": 1716454225402164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225467069, "dur": 12, "args": { "External id": 253847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253847, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253847, "pid": 5, "tid": 7, "ts": 1716454225467069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402166, "dur": 12, "args": { "External id": 253847, "cbid": 211, "correlation": 253847 } }, { "ph": "s", "id": 253847, "pid": 76337, "tid": -914061504, "ts": 1716454225402166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225467082, "dur": 6, "args": { "External id": 253849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253849, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253849, "pid": 5, "tid": 7, "ts": 1716454225467082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402179, "dur": 6, "args": { "External id": 253849, "cbid": 211, "correlation": 253849 } }, { "ph": "s", "id": 253849, "pid": 76337, "tid": -914061504, "ts": 1716454225402179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402238, "dur": 1, "args": { "External id": 253860, "cbid": 251, "correlation": 253860 } }, { "ph": "f", "id": 253860, "pid": 76337, "tid": -914061504, "ts": 1716454225402238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402241, "dur": 0, "args": { "External id": 253861, "cbid": 251, "correlation": 253861 } }, { "ph": "f", "id": 253861, "pid": 76337, "tid": -914061504, "ts": 1716454225402241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225467089, "dur": 9, "args": { "External id": 253862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253862, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253862, "pid": 5, "tid": 7, "ts": 1716454225467089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402243, "dur": 12, "args": { "External id": 253862, "cbid": 211, "correlation": 253862 } }, { "ph": "s", "id": 253862, "pid": 76337, "tid": -914061504, "ts": 1716454225402243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225467099, "dur": 4, "args": { "External id": 253864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253864, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253864, "pid": 5, "tid": 7, "ts": 1716454225467099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402257, "dur": 5, "args": { "External id": 253864, "cbid": 211, "correlation": 253864 } }, { "ph": "s", "id": 253864, "pid": 76337, "tid": -914061504, "ts": 1716454225402257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225467104, "dur": 57, "args": { "External id": 253889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253889, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253889, "pid": 5, "tid": 7, "ts": 1716454225467104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402333, "dur": 12, "args": { "External id": 253889, "cbid": 211, "correlation": 253889 } }, { "ph": "s", "id": 253889, "pid": 76337, "tid": -914061504, "ts": 1716454225402333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402433, "dur": 1, "args": { "External id": 253907, "cbid": 251, "correlation": 253907 } }, { "ph": "f", "id": 253907, "pid": 76337, "tid": -914061504, "ts": 1716454225402433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225467162, "dur": 94, "args": { "External id": 253909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253909, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 253909, "pid": 5, "tid": 7, "ts": 1716454225467162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402439, "dur": 15, "args": { "External id": 253909, "cbid": 211, "correlation": 253909 } }, { "ph": "s", "id": 253909, "pid": 76337, "tid": -914061504, "ts": 1716454225402439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225467257, "dur": 10, "args": { "External id": 253917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253917, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253917, "pid": 5, "tid": 7, "ts": 1716454225467257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402509, "dur": 12, "args": { "External id": 253917, "cbid": 211, "correlation": 253917 } }, { "ph": "s", "id": 253917, "pid": 76337, "tid": -914061504, "ts": 1716454225402509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225467268, "dur": 21, "args": { "External id": 253925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253925, "pid": 5, "tid": 7, "ts": 1716454225467268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402551, "dur": 10, "args": { "External id": 253925, "cbid": 211, "correlation": 253925 } }, { "ph": "s", "id": 253925, "pid": 76337, "tid": -914061504, "ts": 1716454225402551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225467291, "dur": 18, "args": { "External id": 253947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253947, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253947, "pid": 5, "tid": 7, "ts": 1716454225467291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402602, "dur": 10, "args": { "External id": 253947, "cbid": 211, "correlation": 253947 } }, { "ph": "s", "id": 253947, "pid": 76337, "tid": -914061504, "ts": 1716454225402602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402689, "dur": 1, "args": { "External id": 253963, "cbid": 251, "correlation": 253963 } }, { "ph": "f", "id": 253963, "pid": 76337, "tid": -914061504, "ts": 1716454225402689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402694, "dur": 0, "args": { "External id": 253965, "cbid": 251, "correlation": 253965 } }, { "ph": "f", "id": 253965, "pid": 76337, "tid": -914061504, "ts": 1716454225402694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225467310, "dur": 501, "args": { "External id": 253966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253966, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 253966, "pid": 5, "tid": 7, "ts": 1716454225467310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402696, "dur": 13, "args": { "External id": 253966, "cbid": 211, "correlation": 253966 } }, { "ph": "s", "id": 253966, "pid": 76337, "tid": -914061504, "ts": 1716454225402696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225467812, "dur": 67, "args": { "External id": 253974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253974, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253974, "pid": 5, "tid": 7, "ts": 1716454225467812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402762, "dur": 13, "args": { "External id": 253974, "cbid": 211, "correlation": 253974 } }, { "ph": "s", "id": 253974, "pid": 76337, "tid": -914061504, "ts": 1716454225402762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225467880, "dur": 66, "args": { "External id": 253982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 253982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 253982, "pid": 5, "tid": 7, "ts": 1716454225467880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402793, "dur": 9, "args": { "External id": 253982, "cbid": 211, "correlation": 253982 } }, { "ph": "s", "id": 253982, "pid": 76337, "tid": -914061504, "ts": 1716454225402793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225402872, "dur": 1, "args": { "External id": 253998, "cbid": 251, "correlation": 253998 } }, { "ph": "f", "id": 253998, "pid": 76337, "tid": -914061504, "ts": 1716454225402872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225467949, "dur": 1, "args": { "External id": 254000, "device": 5, "context": 1, "stream": 7, "correlation": 254000, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 254000, "pid": 5, "tid": 7, "ts": 1716454225467949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225402876, "dur": 9, "args": { "External id": 254000, "cbid": 51, "correlation": 254000 } }, { "ph": "s", "id": 254000, "pid": 76337, "tid": -914061504, "ts": 1716454225402876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225467952, "dur": 275, "args": { "External id": 254001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254001, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 254001, "pid": 5, "tid": 7, "ts": 1716454225467952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402887, "dur": 11, "args": { "External id": 254001, "cbid": 211, "correlation": 254001 } }, { "ph": "s", "id": 254001, "pid": 76337, "tid": -914061504, "ts": 1716454225402887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225468229, "dur": 14, "args": { "External id": 254009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254009, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254009, "pid": 5, "tid": 7, "ts": 1716454225468229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225402929, "dur": 10, "args": { "External id": 254009, "cbid": 211, "correlation": 254009 } }, { "ph": "s", "id": 254009, "pid": 76337, "tid": -914061504, "ts": 1716454225402929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225468244, "dur": 38, "args": { "External id": 254020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254020, "pid": 5, "tid": 7, "ts": 1716454225468244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403005, "dur": 13, "args": { "External id": 254020, "cbid": 211, "correlation": 254020 } }, { "ph": "s", "id": 254020, "pid": 76337, "tid": -914061504, "ts": 1716454225403005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225403070, "dur": 0, "args": { "External id": 254032, "cbid": 317, "correlation": 254032 } }, { "ph": "f", "id": 254032, "pid": 76337, "tid": -914061504, "ts": 1716454225403070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225403071, "dur": 0, "args": { "External id": 254033, "cbid": 203, "correlation": 254033 } }, { "ph": "f", "id": 254033, "pid": 76337, "tid": -914061504, "ts": 1716454225403071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225403072, "dur": 0, "args": { "External id": 254034, "cbid": 205, "correlation": 254034 } }, { "ph": "f", "id": 254034, "pid": 76337, "tid": -914061504, "ts": 1716454225403072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225468283, "dur": 13, "args": { "External id": 254038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254038, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254038, "pid": 5, "tid": 7, "ts": 1716454225468283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403087, "dur": 12, "args": { "External id": 254038, "cbid": 211, "correlation": 254038 } }, { "ph": "s", "id": 254038, "pid": 76337, "tid": -914061504, "ts": 1716454225403087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225468298, "dur": 4, "args": { "External id": 254040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 254040, "pid": 5, "tid": 7, "ts": 1716454225468298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403104, "dur": 6, "args": { "External id": 254040, "cbid": 211, "correlation": 254040 } }, { "ph": "s", "id": 254040, "pid": 76337, "tid": -914061504, "ts": 1716454225403104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225403113, "dur": 0, "args": { "External id": 254041, "cbid": 51, "correlation": 254041 } }, { "ph": "s", "id": 254041, "pid": 76337, "tid": -914061504, "ts": 1716454225403113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225468304, "dur": 99, "args": { "External id": 254042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254042, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 254042, "pid": 5, "tid": 7, "ts": 1716454225468304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403114, "dur": 5, "args": { "External id": 254042, "cbid": 211, "correlation": 254042 } }, { "ph": "s", "id": 254042, "pid": 76337, "tid": -914061504, "ts": 1716454225403114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225468404, "dur": 16, "args": { "External id": 254047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254047, "pid": 5, "tid": 7, "ts": 1716454225468404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403141, "dur": 9, "args": { "External id": 254047, "cbid": 211, "correlation": 254047 } }, { "ph": "s", "id": 254047, "pid": 76337, "tid": -914061504, "ts": 1716454225403141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225468422, "dur": 12, "args": { "External id": 254055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254055, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254055, "pid": 5, "tid": 7, "ts": 1716454225468422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403173, "dur": 8, "args": { "External id": 254055, "cbid": 211, "correlation": 254055 } }, { "ph": "s", "id": 254055, "pid": 76337, "tid": -914061504, "ts": 1716454225403173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225468435, "dur": 58, "args": { "External id": 254066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254066, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254066, "pid": 5, "tid": 7, "ts": 1716454225468435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403234, "dur": 11, "args": { "External id": 254066, "cbid": 211, "correlation": 254066 } }, { "ph": "s", "id": 254066, "pid": 76337, "tid": -914061504, "ts": 1716454225403234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225403289, "dur": 0, "args": { "External id": 254076, "cbid": 317, "correlation": 254076 } }, { "ph": "f", "id": 254076, "pid": 76337, "tid": -914061504, "ts": 1716454225403289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225403290, "dur": 0, "args": { "External id": 254077, "cbid": 203, "correlation": 254077 } }, { "ph": "f", "id": 254077, "pid": 76337, "tid": -914061504, "ts": 1716454225403290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225403291, "dur": 0, "args": { "External id": 254078, "cbid": 205, "correlation": 254078 } }, { "ph": "f", "id": 254078, "pid": 76337, "tid": -914061504, "ts": 1716454225403291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225468494, "dur": 39, "args": { "External id": 254082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254082, "pid": 5, "tid": 7, "ts": 1716454225468494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403307, "dur": 11, "args": { "External id": 254082, "cbid": 211, "correlation": 254082 } }, { "ph": "s", "id": 254082, "pid": 76337, "tid": -914061504, "ts": 1716454225403307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225468535, "dur": 166, "args": { "External id": 254084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254084, "pid": 5, "tid": 7, "ts": 1716454225468535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403321, "dur": 5, "args": { "External id": 254084, "cbid": 211, "correlation": 254084 } }, { "ph": "s", "id": 254084, "pid": 76337, "tid": -914061504, "ts": 1716454225403321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225468702, "dur": 1968, "args": { "External id": 254086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254086, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254086, "pid": 5, "tid": 7, "ts": 1716454225468702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403332, "dur": 7, "args": { "External id": 254086, "cbid": 211, "correlation": 254086 } }, { "ph": "s", "id": 254086, "pid": 76337, "tid": -914061504, "ts": 1716454225403332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225470671, "dur": 38, "args": { "External id": 254088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254088, "pid": 5, "tid": 7, "ts": 1716454225470671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403344, "dur": 5, "args": { "External id": 254088, "cbid": 211, "correlation": 254088 } }, { "ph": "s", "id": 254088, "pid": 76337, "tid": -914061504, "ts": 1716454225403344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225470711, "dur": 60, "args": { "External id": 254094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254094, "pid": 5, "tid": 7, "ts": 1716454225470711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403371, "dur": 9, "args": { "External id": 254094, "cbid": 211, "correlation": 254094 } }, { "ph": "s", "id": 254094, "pid": 76337, "tid": -914061504, "ts": 1716454225403371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225470771, "dur": 85, "args": { "External id": 254103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254103, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254103, "pid": 5, "tid": 7, "ts": 1716454225470771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403461, "dur": 13, "args": { "External id": 254103, "cbid": 211, "correlation": 254103 } }, { "ph": "s", "id": 254103, "pid": 76337, "tid": -914061504, "ts": 1716454225403461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225470858, "dur": 74, "args": { "External id": 254123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254123, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 254123, "pid": 5, "tid": 7, "ts": 1716454225470858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403530, "dur": 11, "args": { "External id": 254123, "cbid": 211, "correlation": 254123 } }, { "ph": "s", "id": 254123, "pid": 76337, "tid": -914061504, "ts": 1716454225403530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225470933, "dur": 5, "args": { "External id": 254135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254135, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 254135, "pid": 5, "tid": 7, "ts": 1716454225470933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403551, "dur": 6, "args": { "External id": 254135, "cbid": 211, "correlation": 254135 } }, { "ph": "s", "id": 254135, "pid": 76337, "tid": -914061504, "ts": 1716454225403551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225470939, "dur": 82, "args": { "External id": 254138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254138, "pid": 5, "tid": 7, "ts": 1716454225470939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403570, "dur": 7, "args": { "External id": 254138, "cbid": 211, "correlation": 254138 } }, { "ph": "s", "id": 254138, "pid": 76337, "tid": -914061504, "ts": 1716454225403570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225471022, "dur": 53, "args": { "External id": 254147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254147, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254147, "pid": 5, "tid": 7, "ts": 1716454225471022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403610, "dur": 10, "args": { "External id": 254147, "cbid": 211, "correlation": 254147 } }, { "ph": "s", "id": 254147, "pid": 76337, "tid": -914061504, "ts": 1716454225403610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225403662, "dur": 0, "args": { "External id": 254157, "cbid": 317, "correlation": 254157 } }, { "ph": "f", "id": 254157, "pid": 76337, "tid": -914061504, "ts": 1716454225403662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225403663, "dur": 0, "args": { "External id": 254158, "cbid": 203, "correlation": 254158 } }, { "ph": "f", "id": 254158, "pid": 76337, "tid": -914061504, "ts": 1716454225403663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225403664, "dur": 0, "args": { "External id": 254159, "cbid": 205, "correlation": 254159 } }, { "ph": "f", "id": 254159, "pid": 76337, "tid": -914061504, "ts": 1716454225403664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225471076, "dur": 56, "args": { "External id": 254163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254163, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254163, "pid": 5, "tid": 7, "ts": 1716454225471076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403681, "dur": 11, "args": { "External id": 254163, "cbid": 211, "correlation": 254163 } }, { "ph": "s", "id": 254163, "pid": 76337, "tid": -914061504, "ts": 1716454225403681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225471134, "dur": 125, "args": { "External id": 254165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254165, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254165, "pid": 5, "tid": 7, "ts": 1716454225471134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403694, "dur": 5, "args": { "External id": 254165, "cbid": 211, "correlation": 254165 } }, { "ph": "s", "id": 254165, "pid": 76337, "tid": -914061504, "ts": 1716454225403694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225471260, "dur": 1922, "args": { "External id": 254167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254167, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254167, "pid": 5, "tid": 7, "ts": 1716454225471260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403706, "dur": 6, "args": { "External id": 254167, "cbid": 211, "correlation": 254167 } }, { "ph": "s", "id": 254167, "pid": 76337, "tid": -914061504, "ts": 1716454225403706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225473183, "dur": 19, "args": { "External id": 254169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254169, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254169, "pid": 5, "tid": 7, "ts": 1716454225473183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403716, "dur": 5, "args": { "External id": 254169, "cbid": 211, "correlation": 254169 } }, { "ph": "s", "id": 254169, "pid": 76337, "tid": -914061504, "ts": 1716454225403716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225473204, "dur": 33, "args": { "External id": 254175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254175, "pid": 5, "tid": 7, "ts": 1716454225473204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403743, "dur": 9, "args": { "External id": 254175, "cbid": 211, "correlation": 254175 } }, { "ph": "s", "id": 254175, "pid": 76337, "tid": -914061504, "ts": 1716454225403743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225473238, "dur": 3, "args": { "External id": 254183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254183, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 254183, "pid": 5, "tid": 7, "ts": 1716454225473238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403787, "dur": 9, "args": { "External id": 254183, "cbid": 211, "correlation": 254183 } }, { "ph": "s", "id": 254183, "pid": 76337, "tid": -914061504, "ts": 1716454225403787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225403853, "dur": 1, "args": { "External id": 254199, "cbid": 251, "correlation": 254199 } }, { "ph": "f", "id": 254199, "pid": 76337, "tid": -914061504, "ts": 1716454225403853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225403858, "dur": 0, "args": { "External id": 254201, "cbid": 251, "correlation": 254201 } }, { "ph": "f", "id": 254201, "pid": 76337, "tid": -914061504, "ts": 1716454225403858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225473243, "dur": 12, "args": { "External id": 254202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254202, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 254202, "pid": 5, "tid": 7, "ts": 1716454225473243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403860, "dur": 11, "args": { "External id": 254202, "cbid": 211, "correlation": 254202 } }, { "ph": "s", "id": 254202, "pid": 76337, "tid": -914061504, "ts": 1716454225403860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225473257, "dur": 5, "args": { "External id": 254204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254204, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 254204, "pid": 5, "tid": 7, "ts": 1716454225473257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403874, "dur": 5, "args": { "External id": 254204, "cbid": 211, "correlation": 254204 } }, { "ph": "s", "id": 254204, "pid": 76337, "tid": -914061504, "ts": 1716454225403874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225473263, "dur": 30, "args": { "External id": 254214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254214, "pid": 5, "tid": 7, "ts": 1716454225473263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225403932, "dur": 12, "args": { "External id": 254214, "cbid": 211, "correlation": 254214 } }, { "ph": "s", "id": 254214, "pid": 76337, "tid": -914061504, "ts": 1716454225403932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225473295, "dur": 32, "args": { "External id": 254234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254234, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 254234, "pid": 5, "tid": 7, "ts": 1716454225473295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404007, "dur": 11, "args": { "External id": 254234, "cbid": 211, "correlation": 254234 } }, { "ph": "s", "id": 254234, "pid": 76337, "tid": -914061504, "ts": 1716454225404007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225473328, "dur": 5, "args": { "External id": 254246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254246, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 254246, "pid": 5, "tid": 7, "ts": 1716454225473328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404028, "dur": 6, "args": { "External id": 254246, "cbid": 211, "correlation": 254246 } }, { "ph": "s", "id": 254246, "pid": 76337, "tid": -914061504, "ts": 1716454225404028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225473334, "dur": 30, "args": { "External id": 254249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254249, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254249, "pid": 5, "tid": 7, "ts": 1716454225473334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404046, "dur": 7, "args": { "External id": 254249, "cbid": 211, "correlation": 254249 } }, { "ph": "s", "id": 254249, "pid": 76337, "tid": -914061504, "ts": 1716454225404046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225473365, "dur": 20, "args": { "External id": 254258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254258, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254258, "pid": 5, "tid": 7, "ts": 1716454225473365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404086, "dur": 9, "args": { "External id": 254258, "cbid": 211, "correlation": 254258 } }, { "ph": "s", "id": 254258, "pid": 76337, "tid": -914061504, "ts": 1716454225404086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225404147, "dur": 0, "args": { "External id": 254268, "cbid": 317, "correlation": 254268 } }, { "ph": "f", "id": 254268, "pid": 76337, "tid": -914061504, "ts": 1716454225404147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225404148, "dur": 0, "args": { "External id": 254269, "cbid": 203, "correlation": 254269 } }, { "ph": "f", "id": 254269, "pid": 76337, "tid": -914061504, "ts": 1716454225404148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225404149, "dur": 0, "args": { "External id": 254270, "cbid": 205, "correlation": 254270 } }, { "ph": "f", "id": 254270, "pid": 76337, "tid": -914061504, "ts": 1716454225404149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225473386, "dur": 23, "args": { "External id": 254274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254274, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254274, "pid": 5, "tid": 7, "ts": 1716454225473386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404164, "dur": 12, "args": { "External id": 254274, "cbid": 211, "correlation": 254274 } }, { "ph": "s", "id": 254274, "pid": 76337, "tid": -914061504, "ts": 1716454225404164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225473411, "dur": 45, "args": { "External id": 254276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254276, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254276, "pid": 5, "tid": 7, "ts": 1716454225473411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404179, "dur": 5, "args": { "External id": 254276, "cbid": 211, "correlation": 254276 } }, { "ph": "s", "id": 254276, "pid": 76337, "tid": -914061504, "ts": 1716454225404179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225473458, "dur": 658, "args": { "External id": 254278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254278, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254278, "pid": 5, "tid": 7, "ts": 1716454225473458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404190, "dur": 6, "args": { "External id": 254278, "cbid": 211, "correlation": 254278 } }, { "ph": "s", "id": 254278, "pid": 76337, "tid": -914061504, "ts": 1716454225404190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225474117, "dur": 23, "args": { "External id": 254280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254280, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254280, "pid": 5, "tid": 7, "ts": 1716454225474117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404201, "dur": 6, "args": { "External id": 254280, "cbid": 211, "correlation": 254280 } }, { "ph": "s", "id": 254280, "pid": 76337, "tid": -914061504, "ts": 1716454225404201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225474141, "dur": 33, "args": { "External id": 254286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254286, "pid": 5, "tid": 7, "ts": 1716454225474141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404229, "dur": 9, "args": { "External id": 254286, "cbid": 211, "correlation": 254286 } }, { "ph": "s", "id": 254286, "pid": 76337, "tid": -914061504, "ts": 1716454225404229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225404288, "dur": 0, "args": { "External id": 254296, "cbid": 317, "correlation": 254296 } }, { "ph": "f", "id": 254296, "pid": 76337, "tid": -914061504, "ts": 1716454225404288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225404289, "dur": 0, "args": { "External id": 254297, "cbid": 203, "correlation": 254297 } }, { "ph": "f", "id": 254297, "pid": 76337, "tid": -914061504, "ts": 1716454225404289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225404289, "dur": 0, "args": { "External id": 254298, "cbid": 205, "correlation": 254298 } }, { "ph": "f", "id": 254298, "pid": 76337, "tid": -914061504, "ts": 1716454225404289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225474175, "dur": 56, "args": { "External id": 254302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254302, "pid": 5, "tid": 7, "ts": 1716454225474175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404302, "dur": 12, "args": { "External id": 254302, "cbid": 211, "correlation": 254302 } }, { "ph": "s", "id": 254302, "pid": 76337, "tid": -914061504, "ts": 1716454225404302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225474233, "dur": 275, "args": { "External id": 254304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254304, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254304, "pid": 5, "tid": 7, "ts": 1716454225474233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404320, "dur": 7, "args": { "External id": 254304, "cbid": 211, "correlation": 254304 } }, { "ph": "s", "id": 254304, "pid": 76337, "tid": -914061504, "ts": 1716454225404320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225474509, "dur": 22, "args": { "External id": 254306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254306, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254306, "pid": 5, "tid": 7, "ts": 1716454225474509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404331, "dur": 5, "args": { "External id": 254306, "cbid": 211, "correlation": 254306 } }, { "ph": "s", "id": 254306, "pid": 76337, "tid": -914061504, "ts": 1716454225404331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225474533, "dur": 32, "args": { "External id": 254312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254312, "pid": 5, "tid": 7, "ts": 1716454225474533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404357, "dur": 8, "args": { "External id": 254312, "cbid": 211, "correlation": 254312 } }, { "ph": "s", "id": 254312, "pid": 76337, "tid": -914061504, "ts": 1716454225404357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225474567, "dur": 27, "args": { "External id": 254320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254320, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254320, "pid": 5, "tid": 7, "ts": 1716454225474567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404386, "dur": 8, "args": { "External id": 254320, "cbid": 211, "correlation": 254320 } }, { "ph": "s", "id": 254320, "pid": 76337, "tid": -914061504, "ts": 1716454225404386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225474596, "dur": 20, "args": { "External id": 254328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254328, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254328, "pid": 5, "tid": 7, "ts": 1716454225474596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404415, "dur": 8, "args": { "External id": 254328, "cbid": 211, "correlation": 254328 } }, { "ph": "s", "id": 254328, "pid": 76337, "tid": -914061504, "ts": 1716454225404415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225474617, "dur": 30, "args": { "External id": 254348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254348, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 254348, "pid": 5, "tid": 7, "ts": 1716454225474617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404495, "dur": 13, "args": { "External id": 254348, "cbid": 211, "correlation": 254348 } }, { "ph": "s", "id": 254348, "pid": 76337, "tid": -914061504, "ts": 1716454225404495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225474649, "dur": 5, "args": { "External id": 254360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254360, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 254360, "pid": 5, "tid": 7, "ts": 1716454225474649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404518, "dur": 6, "args": { "External id": 254360, "cbid": 211, "correlation": 254360 } }, { "ph": "s", "id": 254360, "pid": 76337, "tid": -914061504, "ts": 1716454225404518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225474655, "dur": 32, "args": { "External id": 254363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254363, "pid": 5, "tid": 7, "ts": 1716454225474655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404535, "dur": 6, "args": { "External id": 254363, "cbid": 211, "correlation": 254363 } }, { "ph": "s", "id": 254363, "pid": 76337, "tid": -914061504, "ts": 1716454225404535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225404591, "dur": 0, "args": { "External id": 254374, "cbid": 317, "correlation": 254374 } }, { "ph": "f", "id": 254374, "pid": 76337, "tid": -914061504, "ts": 1716454225404591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225404592, "dur": 0, "args": { "External id": 254375, "cbid": 203, "correlation": 254375 } }, { "ph": "f", "id": 254375, "pid": 76337, "tid": -914061504, "ts": 1716454225404592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225404593, "dur": 0, "args": { "External id": 254376, "cbid": 205, "correlation": 254376 } }, { "ph": "f", "id": 254376, "pid": 76337, "tid": -914061504, "ts": 1716454225404593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225474689, "dur": 22, "args": { "External id": 254380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254380, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254380, "pid": 5, "tid": 7, "ts": 1716454225474689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404607, "dur": 11, "args": { "External id": 254380, "cbid": 211, "correlation": 254380 } }, { "ph": "s", "id": 254380, "pid": 76337, "tid": -914061504, "ts": 1716454225404607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225474712, "dur": 107, "args": { "External id": 254382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254382, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254382, "pid": 5, "tid": 7, "ts": 1716454225474712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404624, "dur": 7, "args": { "External id": 254382, "cbid": 211, "correlation": 254382 } }, { "ph": "s", "id": 254382, "pid": 76337, "tid": -914061504, "ts": 1716454225404624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225474820, "dur": 23, "args": { "External id": 254384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254384, "pid": 5, "tid": 7, "ts": 1716454225474820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404634, "dur": 5, "args": { "External id": 254384, "cbid": 211, "correlation": 254384 } }, { "ph": "s", "id": 254384, "pid": 76337, "tid": -914061504, "ts": 1716454225404634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225474844, "dur": 33, "args": { "External id": 254390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254390, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254390, "pid": 5, "tid": 7, "ts": 1716454225474844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404662, "dur": 9, "args": { "External id": 254390, "cbid": 211, "correlation": 254390 } }, { "ph": "s", "id": 254390, "pid": 76337, "tid": -914061504, "ts": 1716454225404662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225474879, "dur": 199, "args": { "External id": 254399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254399, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254399, "pid": 5, "tid": 7, "ts": 1716454225474879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404742, "dur": 14, "args": { "External id": 254399, "cbid": 211, "correlation": 254399 } }, { "ph": "s", "id": 254399, "pid": 76337, "tid": -914061504, "ts": 1716454225404742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225475079, "dur": 66, "args": { "External id": 254421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254421, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254421, "pid": 5, "tid": 7, "ts": 1716454225475079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404800, "dur": 10, "args": { "External id": 254421, "cbid": 211, "correlation": 254421 } }, { "ph": "s", "id": 254421, "pid": 76337, "tid": -914061504, "ts": 1716454225404800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225404890, "dur": 1, "args": { "External id": 254432, "cbid": 251, "correlation": 254432 } }, { "ph": "f", "id": 254432, "pid": 76337, "tid": -914061504, "ts": 1716454225404890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225475147, "dur": 157, "args": { "External id": 254433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254433, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254433, "pid": 5, "tid": 7, "ts": 1716454225475147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404895, "dur": 14, "args": { "External id": 254433, "cbid": 211, "correlation": 254433 } }, { "ph": "s", "id": 254433, "pid": 76337, "tid": -914061504, "ts": 1716454225404895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225404966, "dur": 1, "args": { "External id": 254444, "cbid": 251, "correlation": 254444 } }, { "ph": "f", "id": 254444, "pid": 76337, "tid": -914061504, "ts": 1716454225404966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225475305, "dur": 149, "args": { "External id": 254445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254445, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254445, "pid": 5, "tid": 7, "ts": 1716454225475305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225404970, "dur": 19, "args": { "External id": 254445, "cbid": 211, "correlation": 254445 } }, { "ph": "s", "id": 254445, "pid": 76337, "tid": -914061504, "ts": 1716454225404970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405045, "dur": 1, "args": { "External id": 254456, "cbid": 251, "correlation": 254456 } }, { "ph": "f", "id": 254456, "pid": 76337, "tid": -914061504, "ts": 1716454225405045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225475455, "dur": 146, "args": { "External id": 254457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254457, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254457, "pid": 5, "tid": 7, "ts": 1716454225475455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405049, "dur": 12, "args": { "External id": 254457, "cbid": 211, "correlation": 254457 } }, { "ph": "s", "id": 254457, "pid": 76337, "tid": -914061504, "ts": 1716454225405049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225475602, "dur": 1984, "args": { "External id": 254478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254478, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 254478, "pid": 5, "tid": 7, "ts": 1716454225475602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405131, "dur": 13, "args": { "External id": 254478, "cbid": 211, "correlation": 254478 } }, { "ph": "s", "id": 254478, "pid": 76337, "tid": -914061504, "ts": 1716454225405131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405229, "dur": 1, "args": { "External id": 254496, "cbid": 251, "correlation": 254496 } }, { "ph": "f", "id": 254496, "pid": 76337, "tid": -914061504, "ts": 1716454225405229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225477588, "dur": 153, "args": { "External id": 254498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254498, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 254498, "pid": 5, "tid": 7, "ts": 1716454225477588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405235, "dur": 13, "args": { "External id": 254498, "cbid": 211, "correlation": 254498 } }, { "ph": "s", "id": 254498, "pid": 76337, "tid": -914061504, "ts": 1716454225405235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225477742, "dur": 35, "args": { "External id": 254506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254506, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254506, "pid": 5, "tid": 7, "ts": 1716454225477742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405305, "dur": 12, "args": { "External id": 254506, "cbid": 211, "correlation": 254506 } }, { "ph": "s", "id": 254506, "pid": 76337, "tid": -914061504, "ts": 1716454225405305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225477779, "dur": 50, "args": { "External id": 254514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254514, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254514, "pid": 5, "tid": 7, "ts": 1716454225477779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405344, "dur": 9, "args": { "External id": 254514, "cbid": 211, "correlation": 254514 } }, { "ph": "s", "id": 254514, "pid": 76337, "tid": -914061504, "ts": 1716454225405344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225477830, "dur": 31, "args": { "External id": 254525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254525, "pid": 5, "tid": 7, "ts": 1716454225477830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405416, "dur": 12, "args": { "External id": 254525, "cbid": 211, "correlation": 254525 } }, { "ph": "s", "id": 254525, "pid": 76337, "tid": -914061504, "ts": 1716454225405416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225477862, "dur": 35, "args": { "External id": 254547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254547, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254547, "pid": 5, "tid": 7, "ts": 1716454225477862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405446, "dur": 8, "args": { "External id": 254547, "cbid": 211, "correlation": 254547 } }, { "ph": "s", "id": 254547, "pid": 76337, "tid": -914061504, "ts": 1716454225405446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405531, "dur": 1, "args": { "External id": 254558, "cbid": 251, "correlation": 254558 } }, { "ph": "f", "id": 254558, "pid": 76337, "tid": -914061504, "ts": 1716454225405531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225477899, "dur": 94, "args": { "External id": 254559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254559, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254559, "pid": 5, "tid": 7, "ts": 1716454225477899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405536, "dur": 13, "args": { "External id": 254559, "cbid": 211, "correlation": 254559 } }, { "ph": "s", "id": 254559, "pid": 76337, "tid": -914061504, "ts": 1716454225405536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405606, "dur": 1, "args": { "External id": 254570, "cbid": 251, "correlation": 254570 } }, { "ph": "f", "id": 254570, "pid": 76337, "tid": -914061504, "ts": 1716454225405606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405610, "dur": 0, "args": { "External id": 254571, "cbid": 251, "correlation": 254571 } }, { "ph": "f", "id": 254571, "pid": 76337, "tid": -914061504, "ts": 1716454225405610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225477994, "dur": 11, "args": { "External id": 254572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254572, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 254572, "pid": 5, "tid": 7, "ts": 1716454225477994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405612, "dur": 12, "args": { "External id": 254572, "cbid": 211, "correlation": 254572 } }, { "ph": "s", "id": 254572, "pid": 76337, "tid": -914061504, "ts": 1716454225405612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225478007, "dur": 5, "args": { "External id": 254574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254574, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 254574, "pid": 5, "tid": 7, "ts": 1716454225478007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405625, "dur": 6, "args": { "External id": 254574, "cbid": 211, "correlation": 254574 } }, { "ph": "s", "id": 254574, "pid": 76337, "tid": -914061504, "ts": 1716454225405625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405682, "dur": 1, "args": { "External id": 254585, "cbid": 251, "correlation": 254585 } }, { "ph": "f", "id": 254585, "pid": 76337, "tid": -914061504, "ts": 1716454225405682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405685, "dur": 0, "args": { "External id": 254586, "cbid": 251, "correlation": 254586 } }, { "ph": "f", "id": 254586, "pid": 76337, "tid": -914061504, "ts": 1716454225405685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225478014, "dur": 8, "args": { "External id": 254587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254587, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 254587, "pid": 5, "tid": 7, "ts": 1716454225478014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405686, "dur": 11, "args": { "External id": 254587, "cbid": 211, "correlation": 254587 } }, { "ph": "s", "id": 254587, "pid": 76337, "tid": -914061504, "ts": 1716454225405686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225478022, "dur": 4, "args": { "External id": 254589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254589, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 254589, "pid": 5, "tid": 7, "ts": 1716454225478022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405699, "dur": 6, "args": { "External id": 254589, "cbid": 211, "correlation": 254589 } }, { "ph": "s", "id": 254589, "pid": 76337, "tid": -914061504, "ts": 1716454225405699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225478028, "dur": 94, "args": { "External id": 254610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254610, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 254610, "pid": 5, "tid": 7, "ts": 1716454225478028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405773, "dur": 13, "args": { "External id": 254610, "cbid": 211, "correlation": 254610 } }, { "ph": "s", "id": 254610, "pid": 76337, "tid": -914061504, "ts": 1716454225405773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225405869, "dur": 1, "args": { "External id": 254628, "cbid": 251, "correlation": 254628 } }, { "ph": "f", "id": 254628, "pid": 76337, "tid": -914061504, "ts": 1716454225405869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225478123, "dur": 100, "args": { "External id": 254630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254630, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254630, "pid": 5, "tid": 7, "ts": 1716454225478123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405874, "dur": 13, "args": { "External id": 254630, "cbid": 211, "correlation": 254630 } }, { "ph": "s", "id": 254630, "pid": 76337, "tid": -914061504, "ts": 1716454225405874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225478225, "dur": 19, "args": { "External id": 254638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254638, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254638, "pid": 5, "tid": 7, "ts": 1716454225478225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405943, "dur": 12, "args": { "External id": 254638, "cbid": 211, "correlation": 254638 } }, { "ph": "s", "id": 254638, "pid": 76337, "tid": -914061504, "ts": 1716454225405943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225478245, "dur": 38, "args": { "External id": 254646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254646, "pid": 5, "tid": 7, "ts": 1716454225478245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225405993, "dur": 10, "args": { "External id": 254646, "cbid": 211, "correlation": 254646 } }, { "ph": "s", "id": 254646, "pid": 76337, "tid": -914061504, "ts": 1716454225405993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225478284, "dur": 35, "args": { "External id": 254668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254668, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254668, "pid": 5, "tid": 7, "ts": 1716454225478284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406045, "dur": 10, "args": { "External id": 254668, "cbid": 211, "correlation": 254668 } }, { "ph": "s", "id": 254668, "pid": 76337, "tid": -914061504, "ts": 1716454225406045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225406134, "dur": 1, "args": { "External id": 254684, "cbid": 251, "correlation": 254684 } }, { "ph": "f", "id": 254684, "pid": 76337, "tid": -914061504, "ts": 1716454225406134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225406139, "dur": 0, "args": { "External id": 254686, "cbid": 251, "correlation": 254686 } }, { "ph": "f", "id": 254686, "pid": 76337, "tid": -914061504, "ts": 1716454225406139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225478321, "dur": 550, "args": { "External id": 254687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254687, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 254687, "pid": 5, "tid": 7, "ts": 1716454225478321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406144, "dur": 13, "args": { "External id": 254687, "cbid": 211, "correlation": 254687 } }, { "ph": "s", "id": 254687, "pid": 76337, "tid": -914061504, "ts": 1716454225406144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225478872, "dur": 126, "args": { "External id": 254695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254695, "pid": 5, "tid": 7, "ts": 1716454225478872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406211, "dur": 12, "args": { "External id": 254695, "cbid": 211, "correlation": 254695 } }, { "ph": "s", "id": 254695, "pid": 76337, "tid": -914061504, "ts": 1716454225406211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225479000, "dur": 130, "args": { "External id": 254703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254703, "pid": 5, "tid": 7, "ts": 1716454225479000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406241, "dur": 8, "args": { "External id": 254703, "cbid": 211, "correlation": 254703 } }, { "ph": "s", "id": 254703, "pid": 76337, "tid": -914061504, "ts": 1716454225406241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225406318, "dur": 1, "args": { "External id": 254719, "cbid": 251, "correlation": 254719 } }, { "ph": "f", "id": 254719, "pid": 76337, "tid": -914061504, "ts": 1716454225406318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225479131, "dur": 310, "args": { "External id": 254721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254721, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254721, "pid": 5, "tid": 7, "ts": 1716454225479131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406324, "dur": 12, "args": { "External id": 254721, "cbid": 211, "correlation": 254721 } }, { "ph": "s", "id": 254721, "pid": 76337, "tid": -914061504, "ts": 1716454225406324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225479442, "dur": 27, "args": { "External id": 254729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254729, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254729, "pid": 5, "tid": 7, "ts": 1716454225479442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406366, "dur": 9, "args": { "External id": 254729, "cbid": 211, "correlation": 254729 } }, { "ph": "s", "id": 254729, "pid": 76337, "tid": -914061504, "ts": 1716454225406366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225479470, "dur": 83, "args": { "External id": 254740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254740, "pid": 5, "tid": 7, "ts": 1716454225479470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406433, "dur": 12, "args": { "External id": 254740, "cbid": 211, "correlation": 254740 } }, { "ph": "s", "id": 254740, "pid": 76337, "tid": -914061504, "ts": 1716454225406433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225406497, "dur": 0, "args": { "External id": 254752, "cbid": 317, "correlation": 254752 } }, { "ph": "f", "id": 254752, "pid": 76337, "tid": -914061504, "ts": 1716454225406497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225406498, "dur": 0, "args": { "External id": 254753, "cbid": 203, "correlation": 254753 } }, { "ph": "f", "id": 254753, "pid": 76337, "tid": -914061504, "ts": 1716454225406498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225406498, "dur": 0, "args": { "External id": 254754, "cbid": 205, "correlation": 254754 } }, { "ph": "f", "id": 254754, "pid": 76337, "tid": -914061504, "ts": 1716454225406498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225479554, "dur": 23, "args": { "External id": 254758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254758, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254758, "pid": 5, "tid": 7, "ts": 1716454225479554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406515, "dur": 12, "args": { "External id": 254758, "cbid": 211, "correlation": 254758 } }, { "ph": "s", "id": 254758, "pid": 76337, "tid": -914061504, "ts": 1716454225406515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225479579, "dur": 123, "args": { "External id": 254760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254760, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254760, "pid": 5, "tid": 7, "ts": 1716454225479579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406533, "dur": 6, "args": { "External id": 254760, "cbid": 211, "correlation": 254760 } }, { "ph": "s", "id": 254760, "pid": 76337, "tid": -914061504, "ts": 1716454225406533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225479704, "dur": 23, "args": { "External id": 254762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254762, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254762, "pid": 5, "tid": 7, "ts": 1716454225479704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406543, "dur": 5, "args": { "External id": 254762, "cbid": 211, "correlation": 254762 } }, { "ph": "s", "id": 254762, "pid": 76337, "tid": -914061504, "ts": 1716454225406543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225479728, "dur": 33, "args": { "External id": 254768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254768, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254768, "pid": 5, "tid": 7, "ts": 1716454225479728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406571, "dur": 8, "args": { "External id": 254768, "cbid": 211, "correlation": 254768 } }, { "ph": "s", "id": 254768, "pid": 76337, "tid": -914061504, "ts": 1716454225406571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225479763, "dur": 27, "args": { "External id": 254776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254776, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254776, "pid": 5, "tid": 7, "ts": 1716454225479763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406603, "dur": 9, "args": { "External id": 254776, "cbid": 211, "correlation": 254776 } }, { "ph": "s", "id": 254776, "pid": 76337, "tid": -914061504, "ts": 1716454225406603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225479791, "dur": 54, "args": { "External id": 254785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254785, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254785, "pid": 5, "tid": 7, "ts": 1716454225479791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406641, "dur": 10, "args": { "External id": 254785, "cbid": 211, "correlation": 254785 } }, { "ph": "s", "id": 254785, "pid": 76337, "tid": -914061504, "ts": 1716454225406641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225479847, "dur": 52, "args": { "External id": 254805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254805, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 254805, "pid": 5, "tid": 7, "ts": 1716454225479847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406713, "dur": 11, "args": { "External id": 254805, "cbid": 211, "correlation": 254805 } }, { "ph": "s", "id": 254805, "pid": 76337, "tid": -914061504, "ts": 1716454225406713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225479900, "dur": 5, "args": { "External id": 254817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254817, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 254817, "pid": 5, "tid": 7, "ts": 1716454225479900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406734, "dur": 6, "args": { "External id": 254817, "cbid": 211, "correlation": 254817 } }, { "ph": "s", "id": 254817, "pid": 76337, "tid": -914061504, "ts": 1716454225406734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225479906, "dur": 58, "args": { "External id": 254820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254820, "pid": 5, "tid": 7, "ts": 1716454225479906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406752, "dur": 7, "args": { "External id": 254820, "cbid": 211, "correlation": 254820 } }, { "ph": "s", "id": 254820, "pid": 76337, "tid": -914061504, "ts": 1716454225406752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225479966, "dur": 37, "args": { "External id": 254829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254829, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254829, "pid": 5, "tid": 7, "ts": 1716454225479966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406792, "dur": 9, "args": { "External id": 254829, "cbid": 211, "correlation": 254829 } }, { "ph": "s", "id": 254829, "pid": 76337, "tid": -914061504, "ts": 1716454225406792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225406843, "dur": 0, "args": { "External id": 254839, "cbid": 317, "correlation": 254839 } }, { "ph": "f", "id": 254839, "pid": 76337, "tid": -914061504, "ts": 1716454225406843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225406844, "dur": 0, "args": { "External id": 254840, "cbid": 203, "correlation": 254840 } }, { "ph": "f", "id": 254840, "pid": 76337, "tid": -914061504, "ts": 1716454225406844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225406845, "dur": 0, "args": { "External id": 254841, "cbid": 205, "correlation": 254841 } }, { "ph": "f", "id": 254841, "pid": 76337, "tid": -914061504, "ts": 1716454225406845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225480004, "dur": 39, "args": { "External id": 254845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254845, "pid": 5, "tid": 7, "ts": 1716454225480004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406860, "dur": 11, "args": { "External id": 254845, "cbid": 211, "correlation": 254845 } }, { "ph": "s", "id": 254845, "pid": 76337, "tid": -914061504, "ts": 1716454225406860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225480045, "dur": 84, "args": { "External id": 254847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254847, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254847, "pid": 5, "tid": 7, "ts": 1716454225480045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406873, "dur": 5, "args": { "External id": 254847, "cbid": 211, "correlation": 254847 } }, { "ph": "s", "id": 254847, "pid": 76337, "tid": -914061504, "ts": 1716454225406873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225480131, "dur": 1299, "args": { "External id": 254849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254849, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254849, "pid": 5, "tid": 7, "ts": 1716454225480131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406884, "dur": 6, "args": { "External id": 254849, "cbid": 211, "correlation": 254849 } }, { "ph": "s", "id": 254849, "pid": 76337, "tid": -914061504, "ts": 1716454225406884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225481431, "dur": 22, "args": { "External id": 254851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254851, "pid": 5, "tid": 7, "ts": 1716454225481431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406894, "dur": 5, "args": { "External id": 254851, "cbid": 211, "correlation": 254851 } }, { "ph": "s", "id": 254851, "pid": 76337, "tid": -914061504, "ts": 1716454225406894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225481454, "dur": 33, "args": { "External id": 254857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254857, "pid": 5, "tid": 7, "ts": 1716454225481454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406922, "dur": 8, "args": { "External id": 254857, "cbid": 211, "correlation": 254857 } }, { "ph": "s", "id": 254857, "pid": 76337, "tid": -914061504, "ts": 1716454225406922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225481489, "dur": 3, "args": { "External id": 254865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254865, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 254865, "pid": 5, "tid": 7, "ts": 1716454225481489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225406965, "dur": 16, "args": { "External id": 254865, "cbid": 211, "correlation": 254865 } }, { "ph": "s", "id": 254865, "pid": 76337, "tid": -914061504, "ts": 1716454225406965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225407036, "dur": 1, "args": { "External id": 254881, "cbid": 251, "correlation": 254881 } }, { "ph": "f", "id": 254881, "pid": 76337, "tid": -914061504, "ts": 1716454225407036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225407041, "dur": 0, "args": { "External id": 254883, "cbid": 251, "correlation": 254883 } }, { "ph": "f", "id": 254883, "pid": 76337, "tid": -914061504, "ts": 1716454225407041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225481493, "dur": 12, "args": { "External id": 254884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254884, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 254884, "pid": 5, "tid": 7, "ts": 1716454225481493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407043, "dur": 12, "args": { "External id": 254884, "cbid": 211, "correlation": 254884 } }, { "ph": "s", "id": 254884, "pid": 76337, "tid": -914061504, "ts": 1716454225407043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225481507, "dur": 5, "args": { "External id": 254886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254886, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 254886, "pid": 5, "tid": 7, "ts": 1716454225481507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407056, "dur": 6, "args": { "External id": 254886, "cbid": 211, "correlation": 254886 } }, { "ph": "s", "id": 254886, "pid": 76337, "tid": -914061504, "ts": 1716454225407056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225481513, "dur": 29, "args": { "External id": 254896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254896, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254896, "pid": 5, "tid": 7, "ts": 1716454225481513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407115, "dur": 12, "args": { "External id": 254896, "cbid": 211, "correlation": 254896 } }, { "ph": "s", "id": 254896, "pid": 76337, "tid": -914061504, "ts": 1716454225407115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225481543, "dur": 31, "args": { "External id": 254916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254916, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 254916, "pid": 5, "tid": 7, "ts": 1716454225481543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407179, "dur": 10, "args": { "External id": 254916, "cbid": 211, "correlation": 254916 } }, { "ph": "s", "id": 254916, "pid": 76337, "tid": -914061504, "ts": 1716454225407179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225481576, "dur": 5, "args": { "External id": 254928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254928, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 254928, "pid": 5, "tid": 7, "ts": 1716454225481576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407199, "dur": 6, "args": { "External id": 254928, "cbid": 211, "correlation": 254928 } }, { "ph": "s", "id": 254928, "pid": 76337, "tid": -914061504, "ts": 1716454225407199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225481582, "dur": 30, "args": { "External id": 254931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254931, "pid": 5, "tid": 7, "ts": 1716454225481582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407218, "dur": 9, "args": { "External id": 254931, "cbid": 211, "correlation": 254931 } }, { "ph": "s", "id": 254931, "pid": 76337, "tid": -914061504, "ts": 1716454225407218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225481614, "dur": 20, "args": { "External id": 254940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254940, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254940, "pid": 5, "tid": 7, "ts": 1716454225481614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407261, "dur": 10, "args": { "External id": 254940, "cbid": 211, "correlation": 254940 } }, { "ph": "s", "id": 254940, "pid": 76337, "tid": -914061504, "ts": 1716454225407261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225407324, "dur": 0, "args": { "External id": 254950, "cbid": 317, "correlation": 254950 } }, { "ph": "f", "id": 254950, "pid": 76337, "tid": -914061504, "ts": 1716454225407324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225407325, "dur": 0, "args": { "External id": 254951, "cbid": 203, "correlation": 254951 } }, { "ph": "f", "id": 254951, "pid": 76337, "tid": -914061504, "ts": 1716454225407325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225407326, "dur": 0, "args": { "External id": 254952, "cbid": 205, "correlation": 254952 } }, { "ph": "f", "id": 254952, "pid": 76337, "tid": -914061504, "ts": 1716454225407326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225481635, "dur": 23, "args": { "External id": 254956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254956, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254956, "pid": 5, "tid": 7, "ts": 1716454225481635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407340, "dur": 11, "args": { "External id": 254956, "cbid": 211, "correlation": 254956 } }, { "ph": "s", "id": 254956, "pid": 76337, "tid": -914061504, "ts": 1716454225407340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225481660, "dur": 45, "args": { "External id": 254958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254958, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254958, "pid": 5, "tid": 7, "ts": 1716454225481660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407353, "dur": 6, "args": { "External id": 254958, "cbid": 211, "correlation": 254958 } }, { "ph": "s", "id": 254958, "pid": 76337, "tid": -914061504, "ts": 1716454225407353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225481706, "dur": 658, "args": { "External id": 254960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254960, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254960, "pid": 5, "tid": 7, "ts": 1716454225481706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407365, "dur": 6, "args": { "External id": 254960, "cbid": 211, "correlation": 254960 } }, { "ph": "s", "id": 254960, "pid": 76337, "tid": -914061504, "ts": 1716454225407365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225482365, "dur": 22, "args": { "External id": 254962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254962, "pid": 5, "tid": 7, "ts": 1716454225482365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407375, "dur": 5, "args": { "External id": 254962, "cbid": 211, "correlation": 254962 } }, { "ph": "s", "id": 254962, "pid": 76337, "tid": -914061504, "ts": 1716454225407375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225482389, "dur": 34, "args": { "External id": 254968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254968, "pid": 5, "tid": 7, "ts": 1716454225482389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407403, "dur": 9, "args": { "External id": 254968, "cbid": 211, "correlation": 254968 } }, { "ph": "s", "id": 254968, "pid": 76337, "tid": -914061504, "ts": 1716454225407403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225407462, "dur": 0, "args": { "External id": 254978, "cbid": 317, "correlation": 254978 } }, { "ph": "f", "id": 254978, "pid": 76337, "tid": -914061504, "ts": 1716454225407462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225407462, "dur": 0, "args": { "External id": 254979, "cbid": 203, "correlation": 254979 } }, { "ph": "f", "id": 254979, "pid": 76337, "tid": -914061504, "ts": 1716454225407462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225407463, "dur": 0, "args": { "External id": 254980, "cbid": 205, "correlation": 254980 } }, { "ph": "f", "id": 254980, "pid": 76337, "tid": -914061504, "ts": 1716454225407463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225482424, "dur": 40, "args": { "External id": 254984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254984, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254984, "pid": 5, "tid": 7, "ts": 1716454225482424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407475, "dur": 11, "args": { "External id": 254984, "cbid": 211, "correlation": 254984 } }, { "ph": "s", "id": 254984, "pid": 76337, "tid": -914061504, "ts": 1716454225407475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225482465, "dur": 193, "args": { "External id": 254986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254986, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 254986, "pid": 5, "tid": 7, "ts": 1716454225482465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407492, "dur": 6, "args": { "External id": 254986, "cbid": 211, "correlation": 254986 } }, { "ph": "s", "id": 254986, "pid": 76337, "tid": -914061504, "ts": 1716454225407492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225482660, "dur": 23, "args": { "External id": 254988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254988, "pid": 5, "tid": 7, "ts": 1716454225482660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407502, "dur": 5, "args": { "External id": 254988, "cbid": 211, "correlation": 254988 } }, { "ph": "s", "id": 254988, "pid": 76337, "tid": -914061504, "ts": 1716454225407502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225482685, "dur": 33, "args": { "External id": 254994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 254994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 254994, "pid": 5, "tid": 7, "ts": 1716454225482685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407528, "dur": 8, "args": { "External id": 254994, "cbid": 211, "correlation": 254994 } }, { "ph": "s", "id": 254994, "pid": 76337, "tid": -914061504, "ts": 1716454225407528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225482719, "dur": 27, "args": { "External id": 255002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255002, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255002, "pid": 5, "tid": 7, "ts": 1716454225482719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407557, "dur": 8, "args": { "External id": 255002, "cbid": 211, "correlation": 255002 } }, { "ph": "s", "id": 255002, "pid": 76337, "tid": -914061504, "ts": 1716454225407557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225482748, "dur": 20, "args": { "External id": 255010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255010, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255010, "pid": 5, "tid": 7, "ts": 1716454225482748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407586, "dur": 8, "args": { "External id": 255010, "cbid": 211, "correlation": 255010 } }, { "ph": "s", "id": 255010, "pid": 76337, "tid": -914061504, "ts": 1716454225407586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225482769, "dur": 31, "args": { "External id": 255030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255030, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 255030, "pid": 5, "tid": 7, "ts": 1716454225482769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407669, "dur": 12, "args": { "External id": 255030, "cbid": 211, "correlation": 255030 } }, { "ph": "s", "id": 255030, "pid": 76337, "tid": -914061504, "ts": 1716454225407669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225482801, "dur": 5, "args": { "External id": 255042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255042, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 255042, "pid": 5, "tid": 7, "ts": 1716454225482801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407690, "dur": 6, "args": { "External id": 255042, "cbid": 211, "correlation": 255042 } }, { "ph": "s", "id": 255042, "pid": 76337, "tid": -914061504, "ts": 1716454225407690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225482808, "dur": 30, "args": { "External id": 255045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255045, "pid": 5, "tid": 7, "ts": 1716454225482808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407708, "dur": 7, "args": { "External id": 255045, "cbid": 211, "correlation": 255045 } }, { "ph": "s", "id": 255045, "pid": 76337, "tid": -914061504, "ts": 1716454225407708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225407766, "dur": 0, "args": { "External id": 255056, "cbid": 317, "correlation": 255056 } }, { "ph": "f", "id": 255056, "pid": 76337, "tid": -914061504, "ts": 1716454225407766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225407767, "dur": 0, "args": { "External id": 255057, "cbid": 203, "correlation": 255057 } }, { "ph": "f", "id": 255057, "pid": 76337, "tid": -914061504, "ts": 1716454225407767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225407767, "dur": 0, "args": { "External id": 255058, "cbid": 205, "correlation": 255058 } }, { "ph": "f", "id": 255058, "pid": 76337, "tid": -914061504, "ts": 1716454225407767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225482840, "dur": 21, "args": { "External id": 255062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255062, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255062, "pid": 5, "tid": 7, "ts": 1716454225482840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407780, "dur": 11, "args": { "External id": 255062, "cbid": 211, "correlation": 255062 } }, { "ph": "s", "id": 255062, "pid": 76337, "tid": -914061504, "ts": 1716454225407780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225482862, "dur": 107, "args": { "External id": 255064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255064, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255064, "pid": 5, "tid": 7, "ts": 1716454225482862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407797, "dur": 6, "args": { "External id": 255064, "cbid": 211, "correlation": 255064 } }, { "ph": "s", "id": 255064, "pid": 76337, "tid": -914061504, "ts": 1716454225407797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225482971, "dur": 22, "args": { "External id": 255066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255066, "pid": 5, "tid": 7, "ts": 1716454225482971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407807, "dur": 6, "args": { "External id": 255066, "cbid": 211, "correlation": 255066 } }, { "ph": "s", "id": 255066, "pid": 76337, "tid": -914061504, "ts": 1716454225407807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225482995, "dur": 34, "args": { "External id": 255072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255072, "pid": 5, "tid": 7, "ts": 1716454225482995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407835, "dur": 9, "args": { "External id": 255072, "cbid": 211, "correlation": 255072 } }, { "ph": "s", "id": 255072, "pid": 76337, "tid": -914061504, "ts": 1716454225407835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225483030, "dur": 200, "args": { "External id": 255081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255081, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255081, "pid": 5, "tid": 7, "ts": 1716454225483030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407918, "dur": 14, "args": { "External id": 255081, "cbid": 211, "correlation": 255081 } }, { "ph": "s", "id": 255081, "pid": 76337, "tid": -914061504, "ts": 1716454225407918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225483231, "dur": 66, "args": { "External id": 255103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255103, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255103, "pid": 5, "tid": 7, "ts": 1716454225483231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225407982, "dur": 11, "args": { "External id": 255103, "cbid": 211, "correlation": 255103 } }, { "ph": "s", "id": 255103, "pid": 76337, "tid": -914061504, "ts": 1716454225407982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408072, "dur": 1, "args": { "External id": 255114, "cbid": 251, "correlation": 255114 } }, { "ph": "f", "id": 255114, "pid": 76337, "tid": -914061504, "ts": 1716454225408072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225483299, "dur": 158, "args": { "External id": 255115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255115, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255115, "pid": 5, "tid": 7, "ts": 1716454225483299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408078, "dur": 13, "args": { "External id": 255115, "cbid": 211, "correlation": 255115 } }, { "ph": "s", "id": 255115, "pid": 76337, "tid": -914061504, "ts": 1716454225408078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408148, "dur": 1, "args": { "External id": 255126, "cbid": 251, "correlation": 255126 } }, { "ph": "f", "id": 255126, "pid": 76337, "tid": -914061504, "ts": 1716454225408148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225483459, "dur": 151, "args": { "External id": 255127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255127, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255127, "pid": 5, "tid": 7, "ts": 1716454225483459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408152, "dur": 13, "args": { "External id": 255127, "cbid": 211, "correlation": 255127 } }, { "ph": "s", "id": 255127, "pid": 76337, "tid": -914061504, "ts": 1716454225408152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408219, "dur": 1, "args": { "External id": 255138, "cbid": 251, "correlation": 255138 } }, { "ph": "f", "id": 255138, "pid": 76337, "tid": -914061504, "ts": 1716454225408219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225483611, "dur": 147, "args": { "External id": 255139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255139, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255139, "pid": 5, "tid": 7, "ts": 1716454225483611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408223, "dur": 11, "args": { "External id": 255139, "cbid": 211, "correlation": 255139 } }, { "ph": "s", "id": 255139, "pid": 76337, "tid": -914061504, "ts": 1716454225408223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225483759, "dur": 1982, "args": { "External id": 255160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255160, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 255160, "pid": 5, "tid": 7, "ts": 1716454225483759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408303, "dur": 13, "args": { "External id": 255160, "cbid": 211, "correlation": 255160 } }, { "ph": "s", "id": 255160, "pid": 76337, "tid": -914061504, "ts": 1716454225408303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408402, "dur": 1, "args": { "External id": 255178, "cbid": 251, "correlation": 255178 } }, { "ph": "f", "id": 255178, "pid": 76337, "tid": -914061504, "ts": 1716454225408402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225485742, "dur": 149, "args": { "External id": 255180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255180, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 255180, "pid": 5, "tid": 7, "ts": 1716454225485742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408408, "dur": 14, "args": { "External id": 255180, "cbid": 211, "correlation": 255180 } }, { "ph": "s", "id": 255180, "pid": 76337, "tid": -914061504, "ts": 1716454225408408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225485893, "dur": 35, "args": { "External id": 255188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255188, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255188, "pid": 5, "tid": 7, "ts": 1716454225485893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408480, "dur": 12, "args": { "External id": 255188, "cbid": 211, "correlation": 255188 } }, { "ph": "s", "id": 255188, "pid": 76337, "tid": -914061504, "ts": 1716454225408480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225485929, "dur": 51, "args": { "External id": 255196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255196, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255196, "pid": 5, "tid": 7, "ts": 1716454225485929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408519, "dur": 8, "args": { "External id": 255196, "cbid": 211, "correlation": 255196 } }, { "ph": "s", "id": 255196, "pid": 76337, "tid": -914061504, "ts": 1716454225408519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225485982, "dur": 31, "args": { "External id": 255207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255207, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255207, "pid": 5, "tid": 7, "ts": 1716454225485982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408590, "dur": 13, "args": { "External id": 255207, "cbid": 211, "correlation": 255207 } }, { "ph": "s", "id": 255207, "pid": 76337, "tid": -914061504, "ts": 1716454225408590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225486013, "dur": 35, "args": { "External id": 255229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255229, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255229, "pid": 5, "tid": 7, "ts": 1716454225486013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408622, "dur": 8, "args": { "External id": 255229, "cbid": 211, "correlation": 255229 } }, { "ph": "s", "id": 255229, "pid": 76337, "tid": -914061504, "ts": 1716454225408622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408707, "dur": 1, "args": { "External id": 255240, "cbid": 251, "correlation": 255240 } }, { "ph": "f", "id": 255240, "pid": 76337, "tid": -914061504, "ts": 1716454225408707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225486050, "dur": 92, "args": { "External id": 255241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255241, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255241, "pid": 5, "tid": 7, "ts": 1716454225486050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408712, "dur": 13, "args": { "External id": 255241, "cbid": 211, "correlation": 255241 } }, { "ph": "s", "id": 255241, "pid": 76337, "tid": -914061504, "ts": 1716454225408712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408782, "dur": 1, "args": { "External id": 255252, "cbid": 251, "correlation": 255252 } }, { "ph": "f", "id": 255252, "pid": 76337, "tid": -914061504, "ts": 1716454225408782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408786, "dur": 0, "args": { "External id": 255253, "cbid": 251, "correlation": 255253 } }, { "ph": "f", "id": 255253, "pid": 76337, "tid": -914061504, "ts": 1716454225408786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225486143, "dur": 11, "args": { "External id": 255254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255254, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 255254, "pid": 5, "tid": 7, "ts": 1716454225486143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408788, "dur": 12, "args": { "External id": 255254, "cbid": 211, "correlation": 255254 } }, { "ph": "s", "id": 255254, "pid": 76337, "tid": -914061504, "ts": 1716454225408788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225486155, "dur": 5, "args": { "External id": 255256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255256, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 255256, "pid": 5, "tid": 7, "ts": 1716454225486155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408801, "dur": 5, "args": { "External id": 255256, "cbid": 211, "correlation": 255256 } }, { "ph": "s", "id": 255256, "pid": 76337, "tid": -914061504, "ts": 1716454225408801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408858, "dur": 1, "args": { "External id": 255267, "cbid": 251, "correlation": 255267 } }, { "ph": "f", "id": 255267, "pid": 76337, "tid": -914061504, "ts": 1716454225408858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225408861, "dur": 0, "args": { "External id": 255268, "cbid": 251, "correlation": 255268 } }, { "ph": "f", "id": 255268, "pid": 76337, "tid": -914061504, "ts": 1716454225408861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225486162, "dur": 8, "args": { "External id": 255269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255269, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 255269, "pid": 5, "tid": 7, "ts": 1716454225486162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408862, "dur": 12, "args": { "External id": 255269, "cbid": 211, "correlation": 255269 } }, { "ph": "s", "id": 255269, "pid": 76337, "tid": -914061504, "ts": 1716454225408862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225486171, "dur": 4, "args": { "External id": 255271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255271, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 255271, "pid": 5, "tid": 7, "ts": 1716454225486171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408876, "dur": 6, "args": { "External id": 255271, "cbid": 211, "correlation": 255271 } }, { "ph": "s", "id": 255271, "pid": 76337, "tid": -914061504, "ts": 1716454225408876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225486176, "dur": 94, "args": { "External id": 255292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255292, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 255292, "pid": 5, "tid": 7, "ts": 1716454225486176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225408949, "dur": 12, "args": { "External id": 255292, "cbid": 211, "correlation": 255292 } }, { "ph": "s", "id": 255292, "pid": 76337, "tid": -914061504, "ts": 1716454225408949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225409054, "dur": 1, "args": { "External id": 255310, "cbid": 251, "correlation": 255310 } }, { "ph": "f", "id": 255310, "pid": 76337, "tid": -914061504, "ts": 1716454225409054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225486272, "dur": 97, "args": { "External id": 255312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255312, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255312, "pid": 5, "tid": 7, "ts": 1716454225486272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409060, "dur": 14, "args": { "External id": 255312, "cbid": 211, "correlation": 255312 } }, { "ph": "s", "id": 255312, "pid": 76337, "tid": -914061504, "ts": 1716454225409060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225486370, "dur": 19, "args": { "External id": 255320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255320, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255320, "pid": 5, "tid": 7, "ts": 1716454225486370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409129, "dur": 12, "args": { "External id": 255320, "cbid": 211, "correlation": 255320 } }, { "ph": "s", "id": 255320, "pid": 76337, "tid": -914061504, "ts": 1716454225409129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225486390, "dur": 38, "args": { "External id": 255328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255328, "pid": 5, "tid": 7, "ts": 1716454225486390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409171, "dur": 9, "args": { "External id": 255328, "cbid": 211, "correlation": 255328 } }, { "ph": "s", "id": 255328, "pid": 76337, "tid": -914061504, "ts": 1716454225409171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225486429, "dur": 36, "args": { "External id": 255350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255350, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255350, "pid": 5, "tid": 7, "ts": 1716454225486429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409221, "dur": 11, "args": { "External id": 255350, "cbid": 211, "correlation": 255350 } }, { "ph": "s", "id": 255350, "pid": 76337, "tid": -914061504, "ts": 1716454225409221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225409309, "dur": 1, "args": { "External id": 255366, "cbid": 251, "correlation": 255366 } }, { "ph": "f", "id": 255366, "pid": 76337, "tid": -914061504, "ts": 1716454225409309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225409313, "dur": 0, "args": { "External id": 255368, "cbid": 251, "correlation": 255368 } }, { "ph": "f", "id": 255368, "pid": 76337, "tid": -914061504, "ts": 1716454225409313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225486466, "dur": 549, "args": { "External id": 255369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255369, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 255369, "pid": 5, "tid": 7, "ts": 1716454225486466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409317, "dur": 13, "args": { "External id": 255369, "cbid": 211, "correlation": 255369 } }, { "ph": "s", "id": 255369, "pid": 76337, "tid": -914061504, "ts": 1716454225409317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225487016, "dur": 127, "args": { "External id": 255377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255377, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255377, "pid": 5, "tid": 7, "ts": 1716454225487016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409382, "dur": 12, "args": { "External id": 255377, "cbid": 211, "correlation": 255377 } }, { "ph": "s", "id": 255377, "pid": 76337, "tid": -914061504, "ts": 1716454225409382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225487145, "dur": 130, "args": { "External id": 255385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255385, "pid": 5, "tid": 7, "ts": 1716454225487145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409412, "dur": 9, "args": { "External id": 255385, "cbid": 211, "correlation": 255385 } }, { "ph": "s", "id": 255385, "pid": 76337, "tid": -914061504, "ts": 1716454225409412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225409490, "dur": 1, "args": { "External id": 255401, "cbid": 251, "correlation": 255401 } }, { "ph": "f", "id": 255401, "pid": 76337, "tid": -914061504, "ts": 1716454225409490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225487277, "dur": 313, "args": { "External id": 255403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255403, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255403, "pid": 5, "tid": 7, "ts": 1716454225487277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409496, "dur": 12, "args": { "External id": 255403, "cbid": 211, "correlation": 255403 } }, { "ph": "s", "id": 255403, "pid": 76337, "tid": -914061504, "ts": 1716454225409496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225487591, "dur": 28, "args": { "External id": 255411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255411, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255411, "pid": 5, "tid": 7, "ts": 1716454225487591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409538, "dur": 10, "args": { "External id": 255411, "cbid": 211, "correlation": 255411 } }, { "ph": "s", "id": 255411, "pid": 76337, "tid": -914061504, "ts": 1716454225409538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225487620, "dur": 83, "args": { "External id": 255422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255422, "pid": 5, "tid": 7, "ts": 1716454225487620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409605, "dur": 12, "args": { "External id": 255422, "cbid": 211, "correlation": 255422 } }, { "ph": "s", "id": 255422, "pid": 76337, "tid": -914061504, "ts": 1716454225409605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225409669, "dur": 0, "args": { "External id": 255434, "cbid": 317, "correlation": 255434 } }, { "ph": "f", "id": 255434, "pid": 76337, "tid": -914061504, "ts": 1716454225409669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225409670, "dur": 0, "args": { "External id": 255435, "cbid": 203, "correlation": 255435 } }, { "ph": "f", "id": 255435, "pid": 76337, "tid": -914061504, "ts": 1716454225409670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225409672, "dur": 0, "args": { "External id": 255436, "cbid": 205, "correlation": 255436 } }, { "ph": "f", "id": 255436, "pid": 76337, "tid": -914061504, "ts": 1716454225409672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225487704, "dur": 24, "args": { "External id": 255440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255440, "pid": 5, "tid": 7, "ts": 1716454225487704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409687, "dur": 12, "args": { "External id": 255440, "cbid": 211, "correlation": 255440 } }, { "ph": "s", "id": 255440, "pid": 76337, "tid": -914061504, "ts": 1716454225409687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225487730, "dur": 123, "args": { "External id": 255442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255442, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255442, "pid": 5, "tid": 7, "ts": 1716454225487730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409706, "dur": 6, "args": { "External id": 255442, "cbid": 211, "correlation": 255442 } }, { "ph": "s", "id": 255442, "pid": 76337, "tid": -914061504, "ts": 1716454225409706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225487854, "dur": 26, "args": { "External id": 255444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255444, "pid": 5, "tid": 7, "ts": 1716454225487854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409716, "dur": 5, "args": { "External id": 255444, "cbid": 211, "correlation": 255444 } }, { "ph": "s", "id": 255444, "pid": 76337, "tid": -914061504, "ts": 1716454225409716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225487880, "dur": 34, "args": { "External id": 255450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255450, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255450, "pid": 5, "tid": 7, "ts": 1716454225487880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409743, "dur": 8, "args": { "External id": 255450, "cbid": 211, "correlation": 255450 } }, { "ph": "s", "id": 255450, "pid": 76337, "tid": -914061504, "ts": 1716454225409743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225487916, "dur": 26, "args": { "External id": 255458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255458, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255458, "pid": 5, "tid": 7, "ts": 1716454225487916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409776, "dur": 8, "args": { "External id": 255458, "cbid": 211, "correlation": 255458 } }, { "ph": "s", "id": 255458, "pid": 76337, "tid": -914061504, "ts": 1716454225409776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225487943, "dur": 48, "args": { "External id": 255467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255467, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255467, "pid": 5, "tid": 7, "ts": 1716454225487943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409813, "dur": 11, "args": { "External id": 255467, "cbid": 211, "correlation": 255467 } }, { "ph": "s", "id": 255467, "pid": 76337, "tid": -914061504, "ts": 1716454225409813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225487993, "dur": 42, "args": { "External id": 255487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255487, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 255487, "pid": 5, "tid": 7, "ts": 1716454225487993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409885, "dur": 12, "args": { "External id": 255487, "cbid": 211, "correlation": 255487 } }, { "ph": "s", "id": 255487, "pid": 76337, "tid": -914061504, "ts": 1716454225409885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225488036, "dur": 4, "args": { "External id": 255499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255499, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 255499, "pid": 5, "tid": 7, "ts": 1716454225488036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409906, "dur": 6, "args": { "External id": 255499, "cbid": 211, "correlation": 255499 } }, { "ph": "s", "id": 255499, "pid": 76337, "tid": -914061504, "ts": 1716454225409906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225488042, "dur": 43, "args": { "External id": 255502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255502, "pid": 5, "tid": 7, "ts": 1716454225488042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409926, "dur": 7, "args": { "External id": 255502, "cbid": 211, "correlation": 255502 } }, { "ph": "s", "id": 255502, "pid": 76337, "tid": -914061504, "ts": 1716454225409926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225488087, "dur": 29, "args": { "External id": 255511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255511, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255511, "pid": 5, "tid": 7, "ts": 1716454225488087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225409964, "dur": 18, "args": { "External id": 255511, "cbid": 211, "correlation": 255511 } }, { "ph": "s", "id": 255511, "pid": 76337, "tid": -914061504, "ts": 1716454225409964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225410025, "dur": 0, "args": { "External id": 255521, "cbid": 317, "correlation": 255521 } }, { "ph": "f", "id": 255521, "pid": 76337, "tid": -914061504, "ts": 1716454225410025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225410026, "dur": 0, "args": { "External id": 255522, "cbid": 203, "correlation": 255522 } }, { "ph": "f", "id": 255522, "pid": 76337, "tid": -914061504, "ts": 1716454225410026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225410027, "dur": 0, "args": { "External id": 255523, "cbid": 205, "correlation": 255523 } }, { "ph": "f", "id": 255523, "pid": 76337, "tid": -914061504, "ts": 1716454225410027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225488117, "dur": 31, "args": { "External id": 255527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255527, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255527, "pid": 5, "tid": 7, "ts": 1716454225488117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410041, "dur": 12, "args": { "External id": 255527, "cbid": 211, "correlation": 255527 } }, { "ph": "s", "id": 255527, "pid": 76337, "tid": -914061504, "ts": 1716454225410041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225488149, "dur": 64, "args": { "External id": 255529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255529, "pid": 5, "tid": 7, "ts": 1716454225488149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410055, "dur": 5, "args": { "External id": 255529, "cbid": 211, "correlation": 255529 } }, { "ph": "s", "id": 255529, "pid": 76337, "tid": -914061504, "ts": 1716454225410055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225488215, "dur": 980, "args": { "External id": 255531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255531, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255531, "pid": 5, "tid": 7, "ts": 1716454225488215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410067, "dur": 6, "args": { "External id": 255531, "cbid": 211, "correlation": 255531 } }, { "ph": "s", "id": 255531, "pid": 76337, "tid": -914061504, "ts": 1716454225410067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225489196, "dur": 22, "args": { "External id": 255533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255533, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255533, "pid": 5, "tid": 7, "ts": 1716454225489196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410077, "dur": 5, "args": { "External id": 255533, "cbid": 211, "correlation": 255533 } }, { "ph": "s", "id": 255533, "pid": 76337, "tid": -914061504, "ts": 1716454225410077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225489219, "dur": 33, "args": { "External id": 255539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255539, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255539, "pid": 5, "tid": 7, "ts": 1716454225489219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410105, "dur": 9, "args": { "External id": 255539, "cbid": 211, "correlation": 255539 } }, { "ph": "s", "id": 255539, "pid": 76337, "tid": -914061504, "ts": 1716454225410105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225489254, "dur": 3, "args": { "External id": 255547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255547, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 255547, "pid": 5, "tid": 7, "ts": 1716454225489254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410149, "dur": 9, "args": { "External id": 255547, "cbid": 211, "correlation": 255547 } }, { "ph": "s", "id": 255547, "pid": 76337, "tid": -914061504, "ts": 1716454225410149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225410216, "dur": 1, "args": { "External id": 255563, "cbid": 251, "correlation": 255563 } }, { "ph": "f", "id": 255563, "pid": 76337, "tid": -914061504, "ts": 1716454225410216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225410222, "dur": 0, "args": { "External id": 255565, "cbid": 251, "correlation": 255565 } }, { "ph": "f", "id": 255565, "pid": 76337, "tid": -914061504, "ts": 1716454225410222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225489258, "dur": 12, "args": { "External id": 255566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255566, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 255566, "pid": 5, "tid": 7, "ts": 1716454225489258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410224, "dur": 11, "args": { "External id": 255566, "cbid": 211, "correlation": 255566 } }, { "ph": "s", "id": 255566, "pid": 76337, "tid": -914061504, "ts": 1716454225410224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225489272, "dur": 5, "args": { "External id": 255568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255568, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 255568, "pid": 5, "tid": 7, "ts": 1716454225489272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410237, "dur": 5, "args": { "External id": 255568, "cbid": 211, "correlation": 255568 } }, { "ph": "s", "id": 255568, "pid": 76337, "tid": -914061504, "ts": 1716454225410237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225489279, "dur": 29, "args": { "External id": 255578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255578, "pid": 5, "tid": 7, "ts": 1716454225489279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410294, "dur": 12, "args": { "External id": 255578, "cbid": 211, "correlation": 255578 } }, { "ph": "s", "id": 255578, "pid": 76337, "tid": -914061504, "ts": 1716454225410294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225489309, "dur": 32, "args": { "External id": 255598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255598, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 255598, "pid": 5, "tid": 7, "ts": 1716454225489309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410359, "dur": 11, "args": { "External id": 255598, "cbid": 211, "correlation": 255598 } }, { "ph": "s", "id": 255598, "pid": 76337, "tid": -914061504, "ts": 1716454225410359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225489343, "dur": 4, "args": { "External id": 255610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255610, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 255610, "pid": 5, "tid": 7, "ts": 1716454225489343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410380, "dur": 6, "args": { "External id": 255610, "cbid": 211, "correlation": 255610 } }, { "ph": "s", "id": 255610, "pid": 76337, "tid": -914061504, "ts": 1716454225410380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225489349, "dur": 30, "args": { "External id": 255613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255613, "pid": 5, "tid": 7, "ts": 1716454225489349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410399, "dur": 6, "args": { "External id": 255613, "cbid": 211, "correlation": 255613 } }, { "ph": "s", "id": 255613, "pid": 76337, "tid": -914061504, "ts": 1716454225410399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225489380, "dur": 20, "args": { "External id": 255622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255622, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255622, "pid": 5, "tid": 7, "ts": 1716454225489380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410439, "dur": 10, "args": { "External id": 255622, "cbid": 211, "correlation": 255622 } }, { "ph": "s", "id": 255622, "pid": 76337, "tid": -914061504, "ts": 1716454225410439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225410501, "dur": 0, "args": { "External id": 255632, "cbid": 317, "correlation": 255632 } }, { "ph": "f", "id": 255632, "pid": 76337, "tid": -914061504, "ts": 1716454225410501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225410502, "dur": 0, "args": { "External id": 255633, "cbid": 203, "correlation": 255633 } }, { "ph": "f", "id": 255633, "pid": 76337, "tid": -914061504, "ts": 1716454225410502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225410502, "dur": 0, "args": { "External id": 255634, "cbid": 205, "correlation": 255634 } }, { "ph": "f", "id": 255634, "pid": 76337, "tid": -914061504, "ts": 1716454225410502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225489402, "dur": 23, "args": { "External id": 255638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255638, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255638, "pid": 5, "tid": 7, "ts": 1716454225489402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410516, "dur": 12, "args": { "External id": 255638, "cbid": 211, "correlation": 255638 } }, { "ph": "s", "id": 255638, "pid": 76337, "tid": -914061504, "ts": 1716454225410516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225489426, "dur": 45, "args": { "External id": 255640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255640, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255640, "pid": 5, "tid": 7, "ts": 1716454225489426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410531, "dur": 5, "args": { "External id": 255640, "cbid": 211, "correlation": 255640 } }, { "ph": "s", "id": 255640, "pid": 76337, "tid": -914061504, "ts": 1716454225410531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225489473, "dur": 654, "args": { "External id": 255642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255642, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255642, "pid": 5, "tid": 7, "ts": 1716454225489473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410542, "dur": 5, "args": { "External id": 255642, "cbid": 211, "correlation": 255642 } }, { "ph": "s", "id": 255642, "pid": 76337, "tid": -914061504, "ts": 1716454225410542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225490128, "dur": 22, "args": { "External id": 255644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255644, "pid": 5, "tid": 7, "ts": 1716454225490128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410551, "dur": 5, "args": { "External id": 255644, "cbid": 211, "correlation": 255644 } }, { "ph": "s", "id": 255644, "pid": 76337, "tid": -914061504, "ts": 1716454225410551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225490152, "dur": 34, "args": { "External id": 255650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255650, "pid": 5, "tid": 7, "ts": 1716454225490152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410579, "dur": 8, "args": { "External id": 255650, "cbid": 211, "correlation": 255650 } }, { "ph": "s", "id": 255650, "pid": 76337, "tid": -914061504, "ts": 1716454225410579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225410636, "dur": 0, "args": { "External id": 255660, "cbid": 317, "correlation": 255660 } }, { "ph": "f", "id": 255660, "pid": 76337, "tid": -914061504, "ts": 1716454225410636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225410637, "dur": 0, "args": { "External id": 255661, "cbid": 203, "correlation": 255661 } }, { "ph": "f", "id": 255661, "pid": 76337, "tid": -914061504, "ts": 1716454225410637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225410638, "dur": 0, "args": { "External id": 255662, "cbid": 205, "correlation": 255662 } }, { "ph": "f", "id": 255662, "pid": 76337, "tid": -914061504, "ts": 1716454225410638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225490187, "dur": 30, "args": { "External id": 255666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255666, "pid": 5, "tid": 7, "ts": 1716454225490187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410651, "dur": 12, "args": { "External id": 255666, "cbid": 211, "correlation": 255666 } }, { "ph": "s", "id": 255666, "pid": 76337, "tid": -914061504, "ts": 1716454225410651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225490218, "dur": 155, "args": { "External id": 255668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255668, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255668, "pid": 5, "tid": 7, "ts": 1716454225490218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410669, "dur": 6, "args": { "External id": 255668, "cbid": 211, "correlation": 255668 } }, { "ph": "s", "id": 255668, "pid": 76337, "tid": -914061504, "ts": 1716454225410669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225490374, "dur": 23, "args": { "External id": 255670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255670, "pid": 5, "tid": 7, "ts": 1716454225490374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410679, "dur": 5, "args": { "External id": 255670, "cbid": 211, "correlation": 255670 } }, { "ph": "s", "id": 255670, "pid": 76337, "tid": -914061504, "ts": 1716454225410679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225490398, "dur": 33, "args": { "External id": 255676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255676, "pid": 5, "tid": 7, "ts": 1716454225490398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410705, "dur": 8, "args": { "External id": 255676, "cbid": 211, "correlation": 255676 } }, { "ph": "s", "id": 255676, "pid": 76337, "tid": -914061504, "ts": 1716454225410705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225490432, "dur": 27, "args": { "External id": 255684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255684, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255684, "pid": 5, "tid": 7, "ts": 1716454225490432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410734, "dur": 7, "args": { "External id": 255684, "cbid": 211, "correlation": 255684 } }, { "ph": "s", "id": 255684, "pid": 76337, "tid": -914061504, "ts": 1716454225410734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225490461, "dur": 20, "args": { "External id": 255692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255692, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255692, "pid": 5, "tid": 7, "ts": 1716454225490461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410762, "dur": 8, "args": { "External id": 255692, "cbid": 211, "correlation": 255692 } }, { "ph": "s", "id": 255692, "pid": 76337, "tid": -914061504, "ts": 1716454225410762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225490481, "dur": 31, "args": { "External id": 255712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255712, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 255712, "pid": 5, "tid": 7, "ts": 1716454225490481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410843, "dur": 13, "args": { "External id": 255712, "cbid": 211, "correlation": 255712 } }, { "ph": "s", "id": 255712, "pid": 76337, "tid": -914061504, "ts": 1716454225410843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225490514, "dur": 5, "args": { "External id": 255724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255724, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 255724, "pid": 5, "tid": 7, "ts": 1716454225490514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410865, "dur": 6, "args": { "External id": 255724, "cbid": 211, "correlation": 255724 } }, { "ph": "s", "id": 255724, "pid": 76337, "tid": -914061504, "ts": 1716454225410865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225490520, "dur": 31, "args": { "External id": 255727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255727, "pid": 5, "tid": 7, "ts": 1716454225490520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410883, "dur": 6, "args": { "External id": 255727, "cbid": 211, "correlation": 255727 } }, { "ph": "s", "id": 255727, "pid": 76337, "tid": -914061504, "ts": 1716454225410883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225410940, "dur": 0, "args": { "External id": 255738, "cbid": 317, "correlation": 255738 } }, { "ph": "f", "id": 255738, "pid": 76337, "tid": -914061504, "ts": 1716454225410940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225410940, "dur": 0, "args": { "External id": 255739, "cbid": 203, "correlation": 255739 } }, { "ph": "f", "id": 255739, "pid": 76337, "tid": -914061504, "ts": 1716454225410940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225410941, "dur": 0, "args": { "External id": 255740, "cbid": 205, "correlation": 255740 } }, { "ph": "f", "id": 255740, "pid": 76337, "tid": -914061504, "ts": 1716454225410941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225490553, "dur": 24, "args": { "External id": 255744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255744, "pid": 5, "tid": 7, "ts": 1716454225490553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410954, "dur": 12, "args": { "External id": 255744, "cbid": 211, "correlation": 255744 } }, { "ph": "s", "id": 255744, "pid": 76337, "tid": -914061504, "ts": 1716454225410954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225490578, "dur": 107, "args": { "External id": 255746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255746, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255746, "pid": 5, "tid": 7, "ts": 1716454225490578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410972, "dur": 15, "args": { "External id": 255746, "cbid": 211, "correlation": 255746 } }, { "ph": "s", "id": 255746, "pid": 76337, "tid": -914061504, "ts": 1716454225410972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225490686, "dur": 23, "args": { "External id": 255748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255748, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255748, "pid": 5, "tid": 7, "ts": 1716454225490686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225410990, "dur": 5, "args": { "External id": 255748, "cbid": 211, "correlation": 255748 } }, { "ph": "s", "id": 255748, "pid": 76337, "tid": -914061504, "ts": 1716454225410990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225490711, "dur": 33, "args": { "External id": 255754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255754, "pid": 5, "tid": 7, "ts": 1716454225490711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411018, "dur": 8, "args": { "External id": 255754, "cbid": 211, "correlation": 255754 } }, { "ph": "s", "id": 255754, "pid": 76337, "tid": -914061504, "ts": 1716454225411018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225490746, "dur": 204, "args": { "External id": 255763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255763, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255763, "pid": 5, "tid": 7, "ts": 1716454225490746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411099, "dur": 14, "args": { "External id": 255763, "cbid": 211, "correlation": 255763 } }, { "ph": "s", "id": 255763, "pid": 76337, "tid": -914061504, "ts": 1716454225411099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225490951, "dur": 67, "args": { "External id": 255785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255785, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255785, "pid": 5, "tid": 7, "ts": 1716454225490951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411156, "dur": 10, "args": { "External id": 255785, "cbid": 211, "correlation": 255785 } }, { "ph": "s", "id": 255785, "pid": 76337, "tid": -914061504, "ts": 1716454225411156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411244, "dur": 1, "args": { "External id": 255796, "cbid": 251, "correlation": 255796 } }, { "ph": "f", "id": 255796, "pid": 76337, "tid": -914061504, "ts": 1716454225411244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225491019, "dur": 151, "args": { "External id": 255797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255797, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255797, "pid": 5, "tid": 7, "ts": 1716454225491019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411249, "dur": 13, "args": { "External id": 255797, "cbid": 211, "correlation": 255797 } }, { "ph": "s", "id": 255797, "pid": 76337, "tid": -914061504, "ts": 1716454225411249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411318, "dur": 1, "args": { "External id": 255808, "cbid": 251, "correlation": 255808 } }, { "ph": "f", "id": 255808, "pid": 76337, "tid": -914061504, "ts": 1716454225411318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225491171, "dur": 145, "args": { "External id": 255809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255809, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255809, "pid": 5, "tid": 7, "ts": 1716454225491171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411322, "dur": 12, "args": { "External id": 255809, "cbid": 211, "correlation": 255809 } }, { "ph": "s", "id": 255809, "pid": 76337, "tid": -914061504, "ts": 1716454225411322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411389, "dur": 1, "args": { "External id": 255820, "cbid": 251, "correlation": 255820 } }, { "ph": "f", "id": 255820, "pid": 76337, "tid": -914061504, "ts": 1716454225411389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225491318, "dur": 146, "args": { "External id": 255821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255821, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255821, "pid": 5, "tid": 7, "ts": 1716454225491318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411393, "dur": 11, "args": { "External id": 255821, "cbid": 211, "correlation": 255821 } }, { "ph": "s", "id": 255821, "pid": 76337, "tid": -914061504, "ts": 1716454225411393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225491465, "dur": 1988, "args": { "External id": 255842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255842, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 255842, "pid": 5, "tid": 7, "ts": 1716454225491465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411473, "dur": 13, "args": { "External id": 255842, "cbid": 211, "correlation": 255842 } }, { "ph": "s", "id": 255842, "pid": 76337, "tid": -914061504, "ts": 1716454225411473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411571, "dur": 1, "args": { "External id": 255860, "cbid": 251, "correlation": 255860 } }, { "ph": "f", "id": 255860, "pid": 76337, "tid": -914061504, "ts": 1716454225411571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225493454, "dur": 151, "args": { "External id": 255862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255862, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 255862, "pid": 5, "tid": 7, "ts": 1716454225493454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411577, "dur": 14, "args": { "External id": 255862, "cbid": 211, "correlation": 255862 } }, { "ph": "s", "id": 255862, "pid": 76337, "tid": -914061504, "ts": 1716454225411577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225493606, "dur": 36, "args": { "External id": 255870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255870, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255870, "pid": 5, "tid": 7, "ts": 1716454225493606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411647, "dur": 12, "args": { "External id": 255870, "cbid": 211, "correlation": 255870 } }, { "ph": "s", "id": 255870, "pid": 76337, "tid": -914061504, "ts": 1716454225411647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225493644, "dur": 50, "args": { "External id": 255878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255878, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255878, "pid": 5, "tid": 7, "ts": 1716454225493644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411686, "dur": 8, "args": { "External id": 255878, "cbid": 211, "correlation": 255878 } }, { "ph": "s", "id": 255878, "pid": 76337, "tid": -914061504, "ts": 1716454225411686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225493695, "dur": 31, "args": { "External id": 255889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255889, "pid": 5, "tid": 7, "ts": 1716454225493695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411756, "dur": 13, "args": { "External id": 255889, "cbid": 211, "correlation": 255889 } }, { "ph": "s", "id": 255889, "pid": 76337, "tid": -914061504, "ts": 1716454225411756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225493728, "dur": 35, "args": { "External id": 255911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255911, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 255911, "pid": 5, "tid": 7, "ts": 1716454225493728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411788, "dur": 8, "args": { "External id": 255911, "cbid": 211, "correlation": 255911 } }, { "ph": "s", "id": 255911, "pid": 76337, "tid": -914061504, "ts": 1716454225411788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411872, "dur": 1, "args": { "External id": 255922, "cbid": 251, "correlation": 255922 } }, { "ph": "f", "id": 255922, "pid": 76337, "tid": -914061504, "ts": 1716454225411872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225493764, "dur": 77, "args": { "External id": 255923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255923, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255923, "pid": 5, "tid": 7, "ts": 1716454225493764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411877, "dur": 14, "args": { "External id": 255923, "cbid": 211, "correlation": 255923 } }, { "ph": "s", "id": 255923, "pid": 76337, "tid": -914061504, "ts": 1716454225411877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411947, "dur": 1, "args": { "External id": 255934, "cbid": 251, "correlation": 255934 } }, { "ph": "f", "id": 255934, "pid": 76337, "tid": -914061504, "ts": 1716454225411947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225411950, "dur": 0, "args": { "External id": 255935, "cbid": 251, "correlation": 255935 } }, { "ph": "f", "id": 255935, "pid": 76337, "tid": -914061504, "ts": 1716454225411950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225493843, "dur": 12, "args": { "External id": 255936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255936, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 255936, "pid": 5, "tid": 7, "ts": 1716454225493843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411952, "dur": 12, "args": { "External id": 255936, "cbid": 211, "correlation": 255936 } }, { "ph": "s", "id": 255936, "pid": 76337, "tid": -914061504, "ts": 1716454225411952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225493856, "dur": 5, "args": { "External id": 255938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255938, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 255938, "pid": 5, "tid": 7, "ts": 1716454225493856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225411965, "dur": 6, "args": { "External id": 255938, "cbid": 211, "correlation": 255938 } }, { "ph": "s", "id": 255938, "pid": 76337, "tid": -914061504, "ts": 1716454225411965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412032, "dur": 1, "args": { "External id": 255949, "cbid": 251, "correlation": 255949 } }, { "ph": "f", "id": 255949, "pid": 76337, "tid": -914061504, "ts": 1716454225412032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412036, "dur": 0, "args": { "External id": 255950, "cbid": 251, "correlation": 255950 } }, { "ph": "f", "id": 255950, "pid": 76337, "tid": -914061504, "ts": 1716454225412036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225493862, "dur": 7, "args": { "External id": 255951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255951, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 255951, "pid": 5, "tid": 7, "ts": 1716454225493862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412038, "dur": 12, "args": { "External id": 255951, "cbid": 211, "correlation": 255951 } }, { "ph": "s", "id": 255951, "pid": 76337, "tid": -914061504, "ts": 1716454225412038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225493871, "dur": 3, "args": { "External id": 255953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255953, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 255953, "pid": 5, "tid": 7, "ts": 1716454225493871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412051, "dur": 5, "args": { "External id": 255953, "cbid": 211, "correlation": 255953 } }, { "ph": "s", "id": 255953, "pid": 76337, "tid": -914061504, "ts": 1716454225412051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225493875, "dur": 93, "args": { "External id": 255974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255974, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 255974, "pid": 5, "tid": 7, "ts": 1716454225493875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412125, "dur": 14, "args": { "External id": 255974, "cbid": 211, "correlation": 255974 } }, { "ph": "s", "id": 255974, "pid": 76337, "tid": -914061504, "ts": 1716454225412125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412221, "dur": 1, "args": { "External id": 255992, "cbid": 251, "correlation": 255992 } }, { "ph": "f", "id": 255992, "pid": 76337, "tid": -914061504, "ts": 1716454225412221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225493970, "dur": 97, "args": { "External id": 255994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 255994, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 255994, "pid": 5, "tid": 7, "ts": 1716454225493970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412227, "dur": 14, "args": { "External id": 255994, "cbid": 211, "correlation": 255994 } }, { "ph": "s", "id": 255994, "pid": 76337, "tid": -914061504, "ts": 1716454225412227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225494068, "dur": 20, "args": { "External id": 256002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256002, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256002, "pid": 5, "tid": 7, "ts": 1716454225494068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412297, "dur": 12, "args": { "External id": 256002, "cbid": 211, "correlation": 256002 } }, { "ph": "s", "id": 256002, "pid": 76337, "tid": -914061504, "ts": 1716454225412297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225494089, "dur": 38, "args": { "External id": 256010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256010, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256010, "pid": 5, "tid": 7, "ts": 1716454225494089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412338, "dur": 9, "args": { "External id": 256010, "cbid": 211, "correlation": 256010 } }, { "ph": "s", "id": 256010, "pid": 76337, "tid": -914061504, "ts": 1716454225412338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225494129, "dur": 36, "args": { "External id": 256032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256032, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256032, "pid": 5, "tid": 7, "ts": 1716454225494129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412389, "dur": 10, "args": { "External id": 256032, "cbid": 211, "correlation": 256032 } }, { "ph": "s", "id": 256032, "pid": 76337, "tid": -914061504, "ts": 1716454225412389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412477, "dur": 1, "args": { "External id": 256048, "cbid": 251, "correlation": 256048 } }, { "ph": "f", "id": 256048, "pid": 76337, "tid": -914061504, "ts": 1716454225412477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412482, "dur": 0, "args": { "External id": 256050, "cbid": 251, "correlation": 256050 } }, { "ph": "f", "id": 256050, "pid": 76337, "tid": -914061504, "ts": 1716454225412482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225494166, "dur": 550, "args": { "External id": 256051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256051, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 256051, "pid": 5, "tid": 7, "ts": 1716454225494166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412486, "dur": 13, "args": { "External id": 256051, "cbid": 211, "correlation": 256051 } }, { "ph": "s", "id": 256051, "pid": 76337, "tid": -914061504, "ts": 1716454225412486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225494717, "dur": 128, "args": { "External id": 256059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256059, "pid": 5, "tid": 7, "ts": 1716454225494717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412551, "dur": 12, "args": { "External id": 256059, "cbid": 211, "correlation": 256059 } }, { "ph": "s", "id": 256059, "pid": 76337, "tid": -914061504, "ts": 1716454225412551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225494846, "dur": 128, "args": { "External id": 256067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256067, "pid": 5, "tid": 7, "ts": 1716454225494846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412580, "dur": 9, "args": { "External id": 256067, "cbid": 211, "correlation": 256067 } }, { "ph": "s", "id": 256067, "pid": 76337, "tid": -914061504, "ts": 1716454225412580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225412657, "dur": 1, "args": { "External id": 256083, "cbid": 251, "correlation": 256083 } }, { "ph": "f", "id": 256083, "pid": 76337, "tid": -914061504, "ts": 1716454225412657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225494976, "dur": 315, "args": { "External id": 256085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256085, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256085, "pid": 5, "tid": 7, "ts": 1716454225494976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412663, "dur": 12, "args": { "External id": 256085, "cbid": 211, "correlation": 256085 } }, { "ph": "s", "id": 256085, "pid": 76337, "tid": -914061504, "ts": 1716454225412663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225495292, "dur": 27, "args": { "External id": 256093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256093, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256093, "pid": 5, "tid": 7, "ts": 1716454225495292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412705, "dur": 10, "args": { "External id": 256093, "cbid": 211, "correlation": 256093 } }, { "ph": "s", "id": 256093, "pid": 76337, "tid": -914061504, "ts": 1716454225412705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225495321, "dur": 83, "args": { "External id": 256104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256104, "pid": 5, "tid": 7, "ts": 1716454225495321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412772, "dur": 12, "args": { "External id": 256104, "cbid": 211, "correlation": 256104 } }, { "ph": "s", "id": 256104, "pid": 76337, "tid": -914061504, "ts": 1716454225412772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225412835, "dur": 0, "args": { "External id": 256116, "cbid": 317, "correlation": 256116 } }, { "ph": "f", "id": 256116, "pid": 76337, "tid": -914061504, "ts": 1716454225412835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225412836, "dur": 0, "args": { "External id": 256117, "cbid": 203, "correlation": 256117 } }, { "ph": "f", "id": 256117, "pid": 76337, "tid": -914061504, "ts": 1716454225412836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225412837, "dur": 0, "args": { "External id": 256118, "cbid": 205, "correlation": 256118 } }, { "ph": "f", "id": 256118, "pid": 76337, "tid": -914061504, "ts": 1716454225412837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225495405, "dur": 23, "args": { "External id": 256122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256122, "pid": 5, "tid": 7, "ts": 1716454225495405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412851, "dur": 12, "args": { "External id": 256122, "cbid": 211, "correlation": 256122 } }, { "ph": "s", "id": 256122, "pid": 76337, "tid": -914061504, "ts": 1716454225412851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225495429, "dur": 122, "args": { "External id": 256124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256124, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256124, "pid": 5, "tid": 7, "ts": 1716454225495429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412870, "dur": 6, "args": { "External id": 256124, "cbid": 211, "correlation": 256124 } }, { "ph": "s", "id": 256124, "pid": 76337, "tid": -914061504, "ts": 1716454225412870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225495552, "dur": 24, "args": { "External id": 256126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256126, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256126, "pid": 5, "tid": 7, "ts": 1716454225495552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412880, "dur": 5, "args": { "External id": 256126, "cbid": 211, "correlation": 256126 } }, { "ph": "s", "id": 256126, "pid": 76337, "tid": -914061504, "ts": 1716454225412880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225495578, "dur": 33, "args": { "External id": 256132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256132, "pid": 5, "tid": 7, "ts": 1716454225495578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412908, "dur": 9, "args": { "External id": 256132, "cbid": 211, "correlation": 256132 } }, { "ph": "s", "id": 256132, "pid": 76337, "tid": -914061504, "ts": 1716454225412908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225495613, "dur": 27, "args": { "External id": 256140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256140, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256140, "pid": 5, "tid": 7, "ts": 1716454225495613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225412940, "dur": 8, "args": { "External id": 256140, "cbid": 211, "correlation": 256140 } }, { "ph": "s", "id": 256140, "pid": 76337, "tid": -914061504, "ts": 1716454225412940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225495641, "dur": 103, "args": { "External id": 256151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256151, "pid": 5, "tid": 7, "ts": 1716454225495641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413009, "dur": 12, "args": { "External id": 256151, "cbid": 211, "correlation": 256151 } }, { "ph": "s", "id": 256151, "pid": 76337, "tid": -914061504, "ts": 1716454225413009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225413065, "dur": 0, "args": { "External id": 256161, "cbid": 317, "correlation": 256161 } }, { "ph": "f", "id": 256161, "pid": 76337, "tid": -914061504, "ts": 1716454225413065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225413066, "dur": 0, "args": { "External id": 256162, "cbid": 203, "correlation": 256162 } }, { "ph": "f", "id": 256162, "pid": 76337, "tid": -914061504, "ts": 1716454225413066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225413066, "dur": 0, "args": { "External id": 256163, "cbid": 205, "correlation": 256163 } }, { "ph": "f", "id": 256163, "pid": 76337, "tid": -914061504, "ts": 1716454225413066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225495745, "dur": 75, "args": { "External id": 256167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256167, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256167, "pid": 5, "tid": 7, "ts": 1716454225495745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413080, "dur": 12, "args": { "External id": 256167, "cbid": 211, "correlation": 256167 } }, { "ph": "s", "id": 256167, "pid": 76337, "tid": -914061504, "ts": 1716454225413080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225495821, "dur": 45, "args": { "External id": 256169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256169, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256169, "pid": 5, "tid": 7, "ts": 1716454225495821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413095, "dur": 5, "args": { "External id": 256169, "cbid": 211, "correlation": 256169 } }, { "ph": "s", "id": 256169, "pid": 76337, "tid": -914061504, "ts": 1716454225413095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225495867, "dur": 4, "args": { "External id": 256171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256171, "pid": 5, "tid": 7, "ts": 1716454225495867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413104, "dur": 6, "args": { "External id": 256171, "cbid": 211, "correlation": 256171 } }, { "ph": "s", "id": 256171, "pid": 76337, "tid": -914061504, "ts": 1716454225413104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225413113, "dur": 0, "args": { "External id": 256172, "cbid": 51, "correlation": 256172 } }, { "ph": "s", "id": 256172, "pid": 76337, "tid": -914061504, "ts": 1716454225413113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225495872, "dur": 2239, "args": { "External id": 256173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256173, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256173, "pid": 5, "tid": 7, "ts": 1716454225495872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413114, "dur": 5, "args": { "External id": 256173, "cbid": 211, "correlation": 256173 } }, { "ph": "s", "id": 256173, "pid": 76337, "tid": -914061504, "ts": 1716454225413114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225498112, "dur": 114, "args": { "External id": 256178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256178, "pid": 5, "tid": 7, "ts": 1716454225498112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413143, "dur": 9, "args": { "External id": 256178, "cbid": 211, "correlation": 256178 } }, { "ph": "s", "id": 256178, "pid": 76337, "tid": -914061504, "ts": 1716454225413143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225498227, "dur": 167, "args": { "External id": 256187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256187, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256187, "pid": 5, "tid": 7, "ts": 1716454225498227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413233, "dur": 14, "args": { "External id": 256187, "cbid": 211, "correlation": 256187 } }, { "ph": "s", "id": 256187, "pid": 76337, "tid": -914061504, "ts": 1716454225413233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225498395, "dur": 128, "args": { "External id": 256207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256207, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 256207, "pid": 5, "tid": 7, "ts": 1716454225498395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413303, "dur": 11, "args": { "External id": 256207, "cbid": 211, "correlation": 256207 } }, { "ph": "s", "id": 256207, "pid": 76337, "tid": -914061504, "ts": 1716454225413303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225498525, "dur": 5, "args": { "External id": 256219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256219, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 256219, "pid": 5, "tid": 7, "ts": 1716454225498525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413324, "dur": 6, "args": { "External id": 256219, "cbid": 211, "correlation": 256219 } }, { "ph": "s", "id": 256219, "pid": 76337, "tid": -914061504, "ts": 1716454225413324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225498531, "dur": 161, "args": { "External id": 256222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256222, "pid": 5, "tid": 7, "ts": 1716454225498531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413344, "dur": 7, "args": { "External id": 256222, "cbid": 211, "correlation": 256222 } }, { "ph": "s", "id": 256222, "pid": 76337, "tid": -914061504, "ts": 1716454225413344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225498694, "dur": 102, "args": { "External id": 256231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256231, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256231, "pid": 5, "tid": 7, "ts": 1716454225498694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413383, "dur": 11, "args": { "External id": 256231, "cbid": 211, "correlation": 256231 } }, { "ph": "s", "id": 256231, "pid": 76337, "tid": -914061504, "ts": 1716454225413383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225413436, "dur": 0, "args": { "External id": 256241, "cbid": 317, "correlation": 256241 } }, { "ph": "f", "id": 256241, "pid": 76337, "tid": -914061504, "ts": 1716454225413436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225413437, "dur": 0, "args": { "External id": 256242, "cbid": 203, "correlation": 256242 } }, { "ph": "f", "id": 256242, "pid": 76337, "tid": -914061504, "ts": 1716454225413437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225413438, "dur": 0, "args": { "External id": 256243, "cbid": 205, "correlation": 256243 } }, { "ph": "f", "id": 256243, "pid": 76337, "tid": -914061504, "ts": 1716454225413438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225498796, "dur": 110, "args": { "External id": 256247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256247, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256247, "pid": 5, "tid": 7, "ts": 1716454225498796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413455, "dur": 11, "args": { "External id": 256247, "cbid": 211, "correlation": 256247 } }, { "ph": "s", "id": 256247, "pid": 76337, "tid": -914061504, "ts": 1716454225413455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225498908, "dur": 34, "args": { "External id": 256249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256249, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256249, "pid": 5, "tid": 7, "ts": 1716454225498908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413468, "dur": 5, "args": { "External id": 256249, "cbid": 211, "correlation": 256249 } }, { "ph": "s", "id": 256249, "pid": 76337, "tid": -914061504, "ts": 1716454225413468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225498944, "dur": 4, "args": { "External id": 256251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256251, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256251, "pid": 5, "tid": 7, "ts": 1716454225498944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413478, "dur": 6, "args": { "External id": 256251, "cbid": 211, "correlation": 256251 } }, { "ph": "s", "id": 256251, "pid": 76337, "tid": -914061504, "ts": 1716454225413478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225413488, "dur": 0, "args": { "External id": 256252, "cbid": 51, "correlation": 256252 } }, { "ph": "s", "id": 256252, "pid": 76337, "tid": -914061504, "ts": 1716454225413488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225498949, "dur": 2049, "args": { "External id": 256253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256253, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256253, "pid": 5, "tid": 7, "ts": 1716454225498949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413488, "dur": 5, "args": { "External id": 256253, "cbid": 211, "correlation": 256253 } }, { "ph": "s", "id": 256253, "pid": 76337, "tid": -914061504, "ts": 1716454225413488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225500998, "dur": 60, "args": { "External id": 256258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256258, "pid": 5, "tid": 7, "ts": 1716454225500998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413517, "dur": 9, "args": { "External id": 256258, "cbid": 211, "correlation": 256258 } }, { "ph": "s", "id": 256258, "pid": 76337, "tid": -914061504, "ts": 1716454225413517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225501059, "dur": 4, "args": { "External id": 256266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256266, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256266, "pid": 5, "tid": 7, "ts": 1716454225501059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413560, "dur": 9, "args": { "External id": 256266, "cbid": 211, "correlation": 256266 } }, { "ph": "s", "id": 256266, "pid": 76337, "tid": -914061504, "ts": 1716454225413560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225413627, "dur": 2, "args": { "External id": 256282, "cbid": 251, "correlation": 256282 } }, { "ph": "f", "id": 256282, "pid": 76337, "tid": -914061504, "ts": 1716454225413627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225413633, "dur": 0, "args": { "External id": 256284, "cbid": 251, "correlation": 256284 } }, { "ph": "f", "id": 256284, "pid": 76337, "tid": -914061504, "ts": 1716454225413633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225501064, "dur": 12, "args": { "External id": 256285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256285, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 256285, "pid": 5, "tid": 7, "ts": 1716454225501064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413635, "dur": 11, "args": { "External id": 256285, "cbid": 211, "correlation": 256285 } }, { "ph": "s", "id": 256285, "pid": 76337, "tid": -914061504, "ts": 1716454225413635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225501077, "dur": 5, "args": { "External id": 256287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256287, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 256287, "pid": 5, "tid": 7, "ts": 1716454225501077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413649, "dur": 6, "args": { "External id": 256287, "cbid": 211, "correlation": 256287 } }, { "ph": "s", "id": 256287, "pid": 76337, "tid": -914061504, "ts": 1716454225413649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225501084, "dur": 54, "args": { "External id": 256297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256297, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256297, "pid": 5, "tid": 7, "ts": 1716454225501084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413706, "dur": 12, "args": { "External id": 256297, "cbid": 211, "correlation": 256297 } }, { "ph": "s", "id": 256297, "pid": 76337, "tid": -914061504, "ts": 1716454225413706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225501139, "dur": 53, "args": { "External id": 256317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256317, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 256317, "pid": 5, "tid": 7, "ts": 1716454225501139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413772, "dur": 11, "args": { "External id": 256317, "cbid": 211, "correlation": 256317 } }, { "ph": "s", "id": 256317, "pid": 76337, "tid": -914061504, "ts": 1716454225413772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225501194, "dur": 4, "args": { "External id": 256329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256329, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256329, "pid": 5, "tid": 7, "ts": 1716454225501194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413793, "dur": 6, "args": { "External id": 256329, "cbid": 211, "correlation": 256329 } }, { "ph": "s", "id": 256329, "pid": 76337, "tid": -914061504, "ts": 1716454225413793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225501199, "dur": 56, "args": { "External id": 256332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256332, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256332, "pid": 5, "tid": 7, "ts": 1716454225501199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413811, "dur": 6, "args": { "External id": 256332, "cbid": 211, "correlation": 256332 } }, { "ph": "s", "id": 256332, "pid": 76337, "tid": -914061504, "ts": 1716454225413811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225501256, "dur": 36, "args": { "External id": 256341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256341, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256341, "pid": 5, "tid": 7, "ts": 1716454225501256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413851, "dur": 10, "args": { "External id": 256341, "cbid": 211, "correlation": 256341 } }, { "ph": "s", "id": 256341, "pid": 76337, "tid": -914061504, "ts": 1716454225413851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225413914, "dur": 0, "args": { "External id": 256351, "cbid": 317, "correlation": 256351 } }, { "ph": "f", "id": 256351, "pid": 76337, "tid": -914061504, "ts": 1716454225413914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225413915, "dur": 0, "args": { "External id": 256352, "cbid": 203, "correlation": 256352 } }, { "ph": "f", "id": 256352, "pid": 76337, "tid": -914061504, "ts": 1716454225413915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225413915, "dur": 0, "args": { "External id": 256353, "cbid": 205, "correlation": 256353 } }, { "ph": "f", "id": 256353, "pid": 76337, "tid": -914061504, "ts": 1716454225413915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225501294, "dur": 41, "args": { "External id": 256357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256357, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256357, "pid": 5, "tid": 7, "ts": 1716454225501294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413932, "dur": 13, "args": { "External id": 256357, "cbid": 211, "correlation": 256357 } }, { "ph": "s", "id": 256357, "pid": 76337, "tid": -914061504, "ts": 1716454225413932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225501336, "dur": 14, "args": { "External id": 256359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256359, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256359, "pid": 5, "tid": 7, "ts": 1716454225501336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413947, "dur": 5, "args": { "External id": 256359, "cbid": 211, "correlation": 256359 } }, { "ph": "s", "id": 256359, "pid": 76337, "tid": -914061504, "ts": 1716454225413947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225501351, "dur": 3, "args": { "External id": 256361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256361, "pid": 5, "tid": 7, "ts": 1716454225501351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413957, "dur": 5, "args": { "External id": 256361, "cbid": 211, "correlation": 256361 } }, { "ph": "s", "id": 256361, "pid": 76337, "tid": -914061504, "ts": 1716454225413957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225413965, "dur": 0, "args": { "External id": 256362, "cbid": 51, "correlation": 256362 } }, { "ph": "s", "id": 256362, "pid": 76337, "tid": -914061504, "ts": 1716454225413965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225501356, "dur": 713, "args": { "External id": 256363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256363, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256363, "pid": 5, "tid": 7, "ts": 1716454225501356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225413965, "dur": 5, "args": { "External id": 256363, "cbid": 211, "correlation": 256363 } }, { "ph": "s", "id": 256363, "pid": 76337, "tid": -914061504, "ts": 1716454225413965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225502071, "dur": 60, "args": { "External id": 256368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256368, "pid": 5, "tid": 7, "ts": 1716454225502071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414002, "dur": 9, "args": { "External id": 256368, "cbid": 211, "correlation": 256368 } }, { "ph": "s", "id": 256368, "pid": 76337, "tid": -914061504, "ts": 1716454225414002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225414061, "dur": 0, "args": { "External id": 256378, "cbid": 317, "correlation": 256378 } }, { "ph": "f", "id": 256378, "pid": 76337, "tid": -914061504, "ts": 1716454225414061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225414062, "dur": 0, "args": { "External id": 256379, "cbid": 203, "correlation": 256379 } }, { "ph": "f", "id": 256379, "pid": 76337, "tid": -914061504, "ts": 1716454225414062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225414062, "dur": 0, "args": { "External id": 256380, "cbid": 205, "correlation": 256380 } }, { "ph": "f", "id": 256380, "pid": 76337, "tid": -914061504, "ts": 1716454225414062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225502132, "dur": 4, "args": { "External id": 256384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256384, "pid": 5, "tid": 7, "ts": 1716454225502132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414078, "dur": 12, "args": { "External id": 256384, "cbid": 211, "correlation": 256384 } }, { "ph": "s", "id": 256384, "pid": 76337, "tid": -914061504, "ts": 1716454225414078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225414095, "dur": 0, "args": { "External id": 256385, "cbid": 51, "correlation": 256385 } }, { "ph": "s", "id": 256385, "pid": 76337, "tid": -914061504, "ts": 1716454225414095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454225502137, "dur": 270, "args": { "External id": 256386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256386, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256386, "pid": 5, "tid": 7, "ts": 1716454225502137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414096, "dur": 7, "args": { "External id": 256386, "cbid": 211, "correlation": 256386 } }, { "ph": "s", "id": 256386, "pid": 76337, "tid": -914061504, "ts": 1716454225414096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225502409, "dur": 61, "args": { "External id": 256391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256391, "pid": 5, "tid": 7, "ts": 1716454225502409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414123, "dur": 8, "args": { "External id": 256391, "cbid": 211, "correlation": 256391 } }, { "ph": "s", "id": 256391, "pid": 76337, "tid": -914061504, "ts": 1716454225414123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225502471, "dur": 50, "args": { "External id": 256399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256399, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256399, "pid": 5, "tid": 7, "ts": 1716454225502471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414152, "dur": 8, "args": { "External id": 256399, "cbid": 211, "correlation": 256399 } }, { "ph": "s", "id": 256399, "pid": 76337, "tid": -914061504, "ts": 1716454225414152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225502522, "dur": 36, "args": { "External id": 256407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256407, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256407, "pid": 5, "tid": 7, "ts": 1716454225502522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414181, "dur": 9, "args": { "External id": 256407, "cbid": 211, "correlation": 256407 } }, { "ph": "s", "id": 256407, "pid": 76337, "tid": -914061504, "ts": 1716454225414181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225502559, "dur": 53, "args": { "External id": 256427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256427, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 256427, "pid": 5, "tid": 7, "ts": 1716454225502559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414262, "dur": 12, "args": { "External id": 256427, "cbid": 211, "correlation": 256427 } }, { "ph": "s", "id": 256427, "pid": 76337, "tid": -914061504, "ts": 1716454225414262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225502613, "dur": 4, "args": { "External id": 256439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256439, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256439, "pid": 5, "tid": 7, "ts": 1716454225502613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414284, "dur": 7, "args": { "External id": 256439, "cbid": 211, "correlation": 256439 } }, { "ph": "s", "id": 256439, "pid": 76337, "tid": -914061504, "ts": 1716454225414284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225502618, "dur": 55, "args": { "External id": 256442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256442, "pid": 5, "tid": 7, "ts": 1716454225502618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414302, "dur": 6, "args": { "External id": 256442, "cbid": 211, "correlation": 256442 } }, { "ph": "s", "id": 256442, "pid": 76337, "tid": -914061504, "ts": 1716454225414302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225414358, "dur": 0, "args": { "External id": 256453, "cbid": 317, "correlation": 256453 } }, { "ph": "f", "id": 256453, "pid": 76337, "tid": -914061504, "ts": 1716454225414358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225414359, "dur": 0, "args": { "External id": 256454, "cbid": 203, "correlation": 256454 } }, { "ph": "f", "id": 256454, "pid": 76337, "tid": -914061504, "ts": 1716454225414359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225414360, "dur": 0, "args": { "External id": 256455, "cbid": 205, "correlation": 256455 } }, { "ph": "f", "id": 256455, "pid": 76337, "tid": -914061504, "ts": 1716454225414360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414388, "dur": 1, "args": { "External id": 256459, "cbid": 251, "correlation": 256459 } }, { "ph": "f", "id": 256459, "pid": 76337, "tid": -914061504, "ts": 1716454225414388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414391, "dur": 0, "args": { "External id": 256460, "cbid": 251, "correlation": 256460 } }, { "ph": "f", "id": 256460, "pid": 76337, "tid": -914061504, "ts": 1716454225414391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414392, "dur": 0, "args": { "External id": 256461, "cbid": 251, "correlation": 256461 } }, { "ph": "f", "id": 256461, "pid": 76337, "tid": -914061504, "ts": 1716454225414392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414393, "dur": 0, "args": { "External id": 256462, "cbid": 251, "correlation": 256462 } }, { "ph": "f", "id": 256462, "pid": 76337, "tid": -914061504, "ts": 1716454225414393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414395, "dur": 1, "args": { "External id": 256463, "cbid": 251, "correlation": 256463 } }, { "ph": "f", "id": 256463, "pid": 76337, "tid": -914061504, "ts": 1716454225414395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414397, "dur": 1, "args": { "External id": 256464, "cbid": 251, "correlation": 256464 } }, { "ph": "f", "id": 256464, "pid": 76337, "tid": -914061504, "ts": 1716454225414397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414399, "dur": 1, "args": { "External id": 256465, "cbid": 251, "correlation": 256465 } }, { "ph": "f", "id": 256465, "pid": 76337, "tid": -914061504, "ts": 1716454225414399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414400, "dur": 1, "args": { "External id": 256466, "cbid": 251, "correlation": 256466 } }, { "ph": "f", "id": 256466, "pid": 76337, "tid": -914061504, "ts": 1716454225414400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414402, "dur": 0, "args": { "External id": 256467, "cbid": 251, "correlation": 256467 } }, { "ph": "f", "id": 256467, "pid": 76337, "tid": -914061504, "ts": 1716454225414402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225502675, "dur": 114, "args": { "External id": 256468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256468, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 256468, "pid": 5, "tid": 7, "ts": 1716454225502675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414405, "dur": 12, "args": { "External id": 256468, "cbid": 211, "correlation": 256468 } }, { "ph": "s", "id": 256468, "pid": 76337, "tid": -914061504, "ts": 1716454225414405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225502790, "dur": 60, "args": { "External id": 256474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256474, "pid": 5, "tid": 7, "ts": 1716454225502790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414441, "dur": 9, "args": { "External id": 256474, "cbid": 211, "correlation": 256474 } }, { "ph": "s", "id": 256474, "pid": 76337, "tid": -914061504, "ts": 1716454225414441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225502852, "dur": 499, "args": { "External id": 256483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256483, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256483, "pid": 5, "tid": 7, "ts": 1716454225502852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414522, "dur": 14, "args": { "External id": 256483, "cbid": 211, "correlation": 256483 } }, { "ph": "s", "id": 256483, "pid": 76337, "tid": -914061504, "ts": 1716454225414522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225503352, "dur": 184, "args": { "External id": 256505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256505, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256505, "pid": 5, "tid": 7, "ts": 1716454225503352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414579, "dur": 10, "args": { "External id": 256505, "cbid": 211, "correlation": 256505 } }, { "ph": "s", "id": 256505, "pid": 76337, "tid": -914061504, "ts": 1716454225414579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414667, "dur": 1, "args": { "External id": 256516, "cbid": 251, "correlation": 256516 } }, { "ph": "f", "id": 256516, "pid": 76337, "tid": -914061504, "ts": 1716454225414667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225503537, "dur": 198, "args": { "External id": 256517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256517, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256517, "pid": 5, "tid": 7, "ts": 1716454225503537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414672, "dur": 14, "args": { "External id": 256517, "cbid": 211, "correlation": 256517 } }, { "ph": "s", "id": 256517, "pid": 76337, "tid": -914061504, "ts": 1716454225414672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414742, "dur": 1, "args": { "External id": 256528, "cbid": 251, "correlation": 256528 } }, { "ph": "f", "id": 256528, "pid": 76337, "tid": -914061504, "ts": 1716454225414742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225503737, "dur": 189, "args": { "External id": 256529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256529, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256529, "pid": 5, "tid": 7, "ts": 1716454225503737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414746, "dur": 11, "args": { "External id": 256529, "cbid": 211, "correlation": 256529 } }, { "ph": "s", "id": 256529, "pid": 76337, "tid": -914061504, "ts": 1716454225414746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225414817, "dur": 1, "args": { "External id": 256540, "cbid": 251, "correlation": 256540 } }, { "ph": "f", "id": 256540, "pid": 76337, "tid": -914061504, "ts": 1716454225414817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225503928, "dur": 189, "args": { "External id": 256541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256541, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256541, "pid": 5, "tid": 7, "ts": 1716454225503928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414821, "dur": 13, "args": { "External id": 256541, "cbid": 211, "correlation": 256541 } }, { "ph": "s", "id": 256541, "pid": 76337, "tid": -914061504, "ts": 1716454225414821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225504118, "dur": 18959, "args": { "External id": 256562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256562, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 256562, "pid": 5, "tid": 7, "ts": 1716454225504118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225414904, "dur": 14, "args": { "External id": 256562, "cbid": 211, "correlation": 256562 } }, { "ph": "s", "id": 256562, "pid": 76337, "tid": -914061504, "ts": 1716454225414904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415008, "dur": 1, "args": { "External id": 256580, "cbid": 251, "correlation": 256580 } }, { "ph": "f", "id": 256580, "pid": 76337, "tid": -914061504, "ts": 1716454225415008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225523078, "dur": 204, "args": { "External id": 256582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256582, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256582, "pid": 5, "tid": 7, "ts": 1716454225523078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415014, "dur": 14, "args": { "External id": 256582, "cbid": 211, "correlation": 256582 } }, { "ph": "s", "id": 256582, "pid": 76337, "tid": -914061504, "ts": 1716454225415014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225523284, "dur": 67, "args": { "External id": 256590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256590, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256590, "pid": 5, "tid": 7, "ts": 1716454225523284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415086, "dur": 12, "args": { "External id": 256590, "cbid": 211, "correlation": 256590 } }, { "ph": "s", "id": 256590, "pid": 76337, "tid": -914061504, "ts": 1716454225415086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225523352, "dur": 97, "args": { "External id": 256598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256598, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256598, "pid": 5, "tid": 7, "ts": 1716454225523352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415124, "dur": 9, "args": { "External id": 256598, "cbid": 211, "correlation": 256598 } }, { "ph": "s", "id": 256598, "pid": 76337, "tid": -914061504, "ts": 1716454225415124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225523450, "dur": 55, "args": { "External id": 256609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256609, "pid": 5, "tid": 7, "ts": 1716454225523450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415197, "dur": 12, "args": { "External id": 256609, "cbid": 211, "correlation": 256609 } }, { "ph": "s", "id": 256609, "pid": 76337, "tid": -914061504, "ts": 1716454225415197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225523506, "dur": 94, "args": { "External id": 256631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256631, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256631, "pid": 5, "tid": 7, "ts": 1716454225523506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415229, "dur": 8, "args": { "External id": 256631, "cbid": 211, "correlation": 256631 } }, { "ph": "s", "id": 256631, "pid": 76337, "tid": -914061504, "ts": 1716454225415229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415312, "dur": 1, "args": { "External id": 256642, "cbid": 251, "correlation": 256642 } }, { "ph": "f", "id": 256642, "pid": 76337, "tid": -914061504, "ts": 1716454225415312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225523602, "dur": 108, "args": { "External id": 256643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256643, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256643, "pid": 5, "tid": 7, "ts": 1716454225523602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415317, "dur": 13, "args": { "External id": 256643, "cbid": 211, "correlation": 256643 } }, { "ph": "s", "id": 256643, "pid": 76337, "tid": -914061504, "ts": 1716454225415317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415391, "dur": 1, "args": { "External id": 256654, "cbid": 251, "correlation": 256654 } }, { "ph": "f", "id": 256654, "pid": 76337, "tid": -914061504, "ts": 1716454225415391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415395, "dur": 0, "args": { "External id": 256655, "cbid": 251, "correlation": 256655 } }, { "ph": "f", "id": 256655, "pid": 76337, "tid": -914061504, "ts": 1716454225415395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225523711, "dur": 10, "args": { "External id": 256656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256656, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 256656, "pid": 5, "tid": 7, "ts": 1716454225523711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415397, "dur": 13, "args": { "External id": 256656, "cbid": 211, "correlation": 256656 } }, { "ph": "s", "id": 256656, "pid": 76337, "tid": -914061504, "ts": 1716454225415397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225523723, "dur": 5, "args": { "External id": 256658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256658, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 256658, "pid": 5, "tid": 7, "ts": 1716454225523723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415413, "dur": 7, "args": { "External id": 256658, "cbid": 211, "correlation": 256658 } }, { "ph": "s", "id": 256658, "pid": 76337, "tid": -914061504, "ts": 1716454225415413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415475, "dur": 1, "args": { "External id": 256669, "cbid": 251, "correlation": 256669 } }, { "ph": "f", "id": 256669, "pid": 76337, "tid": -914061504, "ts": 1716454225415475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415479, "dur": 0, "args": { "External id": 256670, "cbid": 251, "correlation": 256670 } }, { "ph": "f", "id": 256670, "pid": 76337, "tid": -914061504, "ts": 1716454225415479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225523729, "dur": 6, "args": { "External id": 256671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256671, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 256671, "pid": 5, "tid": 7, "ts": 1716454225523729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415481, "dur": 11, "args": { "External id": 256671, "cbid": 211, "correlation": 256671 } }, { "ph": "s", "id": 256671, "pid": 76337, "tid": -914061504, "ts": 1716454225415481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225523736, "dur": 4, "args": { "External id": 256673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256673, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 256673, "pid": 5, "tid": 7, "ts": 1716454225523736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415494, "dur": 5, "args": { "External id": 256673, "cbid": 211, "correlation": 256673 } }, { "ph": "s", "id": 256673, "pid": 76337, "tid": -914061504, "ts": 1716454225415494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225523741, "dur": 157, "args": { "External id": 256694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256694, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 256694, "pid": 5, "tid": 7, "ts": 1716454225523741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415567, "dur": 13, "args": { "External id": 256694, "cbid": 211, "correlation": 256694 } }, { "ph": "s", "id": 256694, "pid": 76337, "tid": -914061504, "ts": 1716454225415567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415662, "dur": 1, "args": { "External id": 256712, "cbid": 251, "correlation": 256712 } }, { "ph": "f", "id": 256712, "pid": 76337, "tid": -914061504, "ts": 1716454225415662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225523900, "dur": 108, "args": { "External id": 256714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256714, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 256714, "pid": 5, "tid": 7, "ts": 1716454225523900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415668, "dur": 14, "args": { "External id": 256714, "cbid": 211, "correlation": 256714 } }, { "ph": "s", "id": 256714, "pid": 76337, "tid": -914061504, "ts": 1716454225415668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225524009, "dur": 35, "args": { "External id": 256722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256722, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256722, "pid": 5, "tid": 7, "ts": 1716454225524009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415737, "dur": 11, "args": { "External id": 256722, "cbid": 211, "correlation": 256722 } }, { "ph": "s", "id": 256722, "pid": 76337, "tid": -914061504, "ts": 1716454225415737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225524045, "dur": 68, "args": { "External id": 256730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256730, "pid": 5, "tid": 7, "ts": 1716454225524045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415777, "dur": 9, "args": { "External id": 256730, "cbid": 211, "correlation": 256730 } }, { "ph": "s", "id": 256730, "pid": 76337, "tid": -914061504, "ts": 1716454225415777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225524115, "dur": 94, "args": { "External id": 256752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256752, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256752, "pid": 5, "tid": 7, "ts": 1716454225524115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415828, "dur": 11, "args": { "External id": 256752, "cbid": 211, "correlation": 256752 } }, { "ph": "s", "id": 256752, "pid": 76337, "tid": -914061504, "ts": 1716454225415828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225415914, "dur": 1, "args": { "External id": 256768, "cbid": 251, "correlation": 256768 } }, { "ph": "f", "id": 256768, "pid": 76337, "tid": -914061504, "ts": 1716454225415914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225524210, "dur": 583, "args": { "External id": 256770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256770, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256770, "pid": 5, "tid": 7, "ts": 1716454225524210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415920, "dur": 13, "args": { "External id": 256770, "cbid": 211, "correlation": 256770 } }, { "ph": "s", "id": 256770, "pid": 76337, "tid": -914061504, "ts": 1716454225415920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225524794, "dur": 247, "args": { "External id": 256778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256778, "pid": 5, "tid": 7, "ts": 1716454225524794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225415992, "dur": 12, "args": { "External id": 256778, "cbid": 211, "correlation": 256778 } }, { "ph": "s", "id": 256778, "pid": 76337, "tid": -914061504, "ts": 1716454225415992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225525043, "dur": 256, "args": { "External id": 256786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256786, "pid": 5, "tid": 7, "ts": 1716454225525043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416023, "dur": 9, "args": { "External id": 256786, "cbid": 211, "correlation": 256786 } }, { "ph": "s", "id": 256786, "pid": 76337, "tid": -914061504, "ts": 1716454225416023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416105, "dur": 1, "args": { "External id": 256802, "cbid": 251, "correlation": 256802 } }, { "ph": "f", "id": 256802, "pid": 76337, "tid": -914061504, "ts": 1716454225416105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416110, "dur": 0, "args": { "External id": 256804, "cbid": 251, "correlation": 256804 } }, { "ph": "f", "id": 256804, "pid": 76337, "tid": -914061504, "ts": 1716454225416110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225525300, "dur": 362, "args": { "External id": 256805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256805, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 256805, "pid": 5, "tid": 7, "ts": 1716454225525300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416113, "dur": 13, "args": { "External id": 256805, "cbid": 211, "correlation": 256805 } }, { "ph": "s", "id": 256805, "pid": 76337, "tid": -914061504, "ts": 1716454225416113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225525663, "dur": 50, "args": { "External id": 256813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256813, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256813, "pid": 5, "tid": 7, "ts": 1716454225525663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416155, "dur": 10, "args": { "External id": 256813, "cbid": 211, "correlation": 256813 } }, { "ph": "s", "id": 256813, "pid": 76337, "tid": -914061504, "ts": 1716454225416155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225525714, "dur": 161, "args": { "External id": 256824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256824, "pid": 5, "tid": 7, "ts": 1716454225525714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416222, "dur": 13, "args": { "External id": 256824, "cbid": 211, "correlation": 256824 } }, { "ph": "s", "id": 256824, "pid": 76337, "tid": -914061504, "ts": 1716454225416222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225416287, "dur": 0, "args": { "External id": 256836, "cbid": 317, "correlation": 256836 } }, { "ph": "f", "id": 256836, "pid": 76337, "tid": -914061504, "ts": 1716454225416287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225416287, "dur": 0, "args": { "External id": 256837, "cbid": 203, "correlation": 256837 } }, { "ph": "f", "id": 256837, "pid": 76337, "tid": -914061504, "ts": 1716454225416287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225416288, "dur": 0, "args": { "External id": 256838, "cbid": 205, "correlation": 256838 } }, { "ph": "f", "id": 256838, "pid": 76337, "tid": -914061504, "ts": 1716454225416288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416312, "dur": 1, "args": { "External id": 256842, "cbid": 251, "correlation": 256842 } }, { "ph": "f", "id": 256842, "pid": 76337, "tid": -914061504, "ts": 1716454225416312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416314, "dur": 0, "args": { "External id": 256843, "cbid": 251, "correlation": 256843 } }, { "ph": "f", "id": 256843, "pid": 76337, "tid": -914061504, "ts": 1716454225416314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416314, "dur": 0, "args": { "External id": 256844, "cbid": 251, "correlation": 256844 } }, { "ph": "f", "id": 256844, "pid": 76337, "tid": -914061504, "ts": 1716454225416314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416315, "dur": 0, "args": { "External id": 256845, "cbid": 251, "correlation": 256845 } }, { "ph": "f", "id": 256845, "pid": 76337, "tid": -914061504, "ts": 1716454225416315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416316, "dur": 0, "args": { "External id": 256846, "cbid": 251, "correlation": 256846 } }, { "ph": "f", "id": 256846, "pid": 76337, "tid": -914061504, "ts": 1716454225416316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416317, "dur": 0, "args": { "External id": 256847, "cbid": 251, "correlation": 256847 } }, { "ph": "f", "id": 256847, "pid": 76337, "tid": -914061504, "ts": 1716454225416317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416318, "dur": 0, "args": { "External id": 256848, "cbid": 251, "correlation": 256848 } }, { "ph": "f", "id": 256848, "pid": 76337, "tid": -914061504, "ts": 1716454225416318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416318, "dur": 0, "args": { "External id": 256849, "cbid": 251, "correlation": 256849 } }, { "ph": "f", "id": 256849, "pid": 76337, "tid": -914061504, "ts": 1716454225416318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416320, "dur": 0, "args": { "External id": 256850, "cbid": 251, "correlation": 256850 } }, { "ph": "f", "id": 256850, "pid": 76337, "tid": -914061504, "ts": 1716454225416320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225525877, "dur": 116, "args": { "External id": 256851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256851, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 256851, "pid": 5, "tid": 7, "ts": 1716454225525877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416322, "dur": 13, "args": { "External id": 256851, "cbid": 211, "correlation": 256851 } }, { "ph": "s", "id": 256851, "pid": 76337, "tid": -914061504, "ts": 1716454225416322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225525994, "dur": 61, "args": { "External id": 256857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256857, "pid": 5, "tid": 7, "ts": 1716454225525994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416357, "dur": 9, "args": { "External id": 256857, "cbid": 211, "correlation": 256857 } }, { "ph": "s", "id": 256857, "pid": 76337, "tid": -914061504, "ts": 1716454225416357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225526056, "dur": 50, "args": { "External id": 256865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256865, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256865, "pid": 5, "tid": 7, "ts": 1716454225526056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416388, "dur": 8, "args": { "External id": 256865, "cbid": 211, "correlation": 256865 } }, { "ph": "s", "id": 256865, "pid": 76337, "tid": -914061504, "ts": 1716454225416388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225526108, "dur": 97, "args": { "External id": 256874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256874, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256874, "pid": 5, "tid": 7, "ts": 1716454225526108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416426, "dur": 11, "args": { "External id": 256874, "cbid": 211, "correlation": 256874 } }, { "ph": "s", "id": 256874, "pid": 76337, "tid": -914061504, "ts": 1716454225416426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225526207, "dur": 93, "args": { "External id": 256894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256894, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 256894, "pid": 5, "tid": 7, "ts": 1716454225526207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416497, "dur": 11, "args": { "External id": 256894, "cbid": 211, "correlation": 256894 } }, { "ph": "s", "id": 256894, "pid": 76337, "tid": -914061504, "ts": 1716454225416497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225526301, "dur": 5, "args": { "External id": 256906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256906, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 256906, "pid": 5, "tid": 7, "ts": 1716454225526301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416518, "dur": 165, "args": { "External id": 256906, "cbid": 211, "correlation": 256906 } }, { "ph": "s", "id": 256906, "pid": 76337, "tid": -914061504, "ts": 1716454225416518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225526307, "dur": 110, "args": { "External id": 256909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256909, "pid": 5, "tid": 7, "ts": 1716454225526307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416696, "dur": 7, "args": { "External id": 256909, "cbid": 211, "correlation": 256909 } }, { "ph": "s", "id": 256909, "pid": 76337, "tid": -914061504, "ts": 1716454225416696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225526419, "dur": 69, "args": { "External id": 256918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256918, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256918, "pid": 5, "tid": 7, "ts": 1716454225526419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416736, "dur": 11, "args": { "External id": 256918, "cbid": 211, "correlation": 256918 } }, { "ph": "s", "id": 256918, "pid": 76337, "tid": -914061504, "ts": 1716454225416736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225416787, "dur": 0, "args": { "External id": 256928, "cbid": 317, "correlation": 256928 } }, { "ph": "f", "id": 256928, "pid": 76337, "tid": -914061504, "ts": 1716454225416787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225416788, "dur": 0, "args": { "External id": 256929, "cbid": 203, "correlation": 256929 } }, { "ph": "f", "id": 256929, "pid": 76337, "tid": -914061504, "ts": 1716454225416788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225416789, "dur": 0, "args": { "External id": 256930, "cbid": 205, "correlation": 256930 } }, { "ph": "f", "id": 256930, "pid": 76337, "tid": -914061504, "ts": 1716454225416789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225526489, "dur": 76, "args": { "External id": 256934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256934, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256934, "pid": 5, "tid": 7, "ts": 1716454225526489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416804, "dur": 11, "args": { "External id": 256934, "cbid": 211, "correlation": 256934 } }, { "ph": "s", "id": 256934, "pid": 76337, "tid": -914061504, "ts": 1716454225416804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225526567, "dur": 24, "args": { "External id": 256936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256936, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256936, "pid": 5, "tid": 7, "ts": 1716454225526567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416818, "dur": 5, "args": { "External id": 256936, "cbid": 211, "correlation": 256936 } }, { "ph": "s", "id": 256936, "pid": 76337, "tid": -914061504, "ts": 1716454225416818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225526592, "dur": 4, "args": { "External id": 256938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256938, "pid": 5, "tid": 7, "ts": 1716454225526592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416828, "dur": 6, "args": { "External id": 256938, "cbid": 211, "correlation": 256938 } }, { "ph": "s", "id": 256938, "pid": 76337, "tid": -914061504, "ts": 1716454225416828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225416838, "dur": 0, "args": { "External id": 256939, "cbid": 51, "correlation": 256939 } }, { "ph": "s", "id": 256939, "pid": 76337, "tid": -914061504, "ts": 1716454225416838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225526598, "dur": 1388, "args": { "External id": 256940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256940, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 256940, "pid": 5, "tid": 7, "ts": 1716454225526598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416839, "dur": 6, "args": { "External id": 256940, "cbid": 211, "correlation": 256940 } }, { "ph": "s", "id": 256940, "pid": 76337, "tid": -914061504, "ts": 1716454225416839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225527987, "dur": 59, "args": { "External id": 256945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256945, "pid": 5, "tid": 7, "ts": 1716454225527987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416867, "dur": 8, "args": { "External id": 256945, "cbid": 211, "correlation": 256945 } }, { "ph": "s", "id": 256945, "pid": 76337, "tid": -914061504, "ts": 1716454225416867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225528048, "dur": 3, "args": { "External id": 256953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256953, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 256953, "pid": 5, "tid": 7, "ts": 1716454225528048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416910, "dur": 9, "args": { "External id": 256953, "cbid": 211, "correlation": 256953 } }, { "ph": "s", "id": 256953, "pid": 76337, "tid": -914061504, "ts": 1716454225416910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416982, "dur": 1, "args": { "External id": 256969, "cbid": 251, "correlation": 256969 } }, { "ph": "f", "id": 256969, "pid": 76337, "tid": -914061504, "ts": 1716454225416982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225416988, "dur": 0, "args": { "External id": 256971, "cbid": 251, "correlation": 256971 } }, { "ph": "f", "id": 256971, "pid": 76337, "tid": -914061504, "ts": 1716454225416988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225528053, "dur": 11, "args": { "External id": 256972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256972, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 256972, "pid": 5, "tid": 7, "ts": 1716454225528053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225416990, "dur": 12, "args": { "External id": 256972, "cbid": 211, "correlation": 256972 } }, { "ph": "s", "id": 256972, "pid": 76337, "tid": -914061504, "ts": 1716454225416990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225528065, "dur": 5, "args": { "External id": 256974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256974, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 256974, "pid": 5, "tid": 7, "ts": 1716454225528065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417004, "dur": 176, "args": { "External id": 256974, "cbid": 211, "correlation": 256974 } }, { "ph": "s", "id": 256974, "pid": 76337, "tid": -914061504, "ts": 1716454225417004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225528071, "dur": 55, "args": { "External id": 256984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 256984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 256984, "pid": 5, "tid": 7, "ts": 1716454225528071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417234, "dur": 12, "args": { "External id": 256984, "cbid": 211, "correlation": 256984 } }, { "ph": "s", "id": 256984, "pid": 76337, "tid": -914061504, "ts": 1716454225417234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225528128, "dur": 54, "args": { "External id": 257004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257004, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 257004, "pid": 5, "tid": 7, "ts": 1716454225528128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417300, "dur": 11, "args": { "External id": 257004, "cbid": 211, "correlation": 257004 } }, { "ph": "s", "id": 257004, "pid": 76337, "tid": -914061504, "ts": 1716454225417300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225528183, "dur": 4, "args": { "External id": 257016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257016, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257016, "pid": 5, "tid": 7, "ts": 1716454225528183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417321, "dur": 6, "args": { "External id": 257016, "cbid": 211, "correlation": 257016 } }, { "ph": "s", "id": 257016, "pid": 76337, "tid": -914061504, "ts": 1716454225417321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225528189, "dur": 56, "args": { "External id": 257019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257019, "pid": 5, "tid": 7, "ts": 1716454225528189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417340, "dur": 7, "args": { "External id": 257019, "cbid": 211, "correlation": 257019 } }, { "ph": "s", "id": 257019, "pid": 76337, "tid": -914061504, "ts": 1716454225417340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225528246, "dur": 37, "args": { "External id": 257028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257028, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257028, "pid": 5, "tid": 7, "ts": 1716454225528246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417380, "dur": 9, "args": { "External id": 257028, "cbid": 211, "correlation": 257028 } }, { "ph": "s", "id": 257028, "pid": 76337, "tid": -914061504, "ts": 1716454225417380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225417441, "dur": 0, "args": { "External id": 257038, "cbid": 317, "correlation": 257038 } }, { "ph": "f", "id": 257038, "pid": 76337, "tid": -914061504, "ts": 1716454225417441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225417442, "dur": 0, "args": { "External id": 257039, "cbid": 203, "correlation": 257039 } }, { "ph": "f", "id": 257039, "pid": 76337, "tid": -914061504, "ts": 1716454225417442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225417443, "dur": 0, "args": { "External id": 257040, "cbid": 205, "correlation": 257040 } }, { "ph": "f", "id": 257040, "pid": 76337, "tid": -914061504, "ts": 1716454225417443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225528284, "dur": 41, "args": { "External id": 257044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257044, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257044, "pid": 5, "tid": 7, "ts": 1716454225528284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417457, "dur": 12, "args": { "External id": 257044, "cbid": 211, "correlation": 257044 } }, { "ph": "s", "id": 257044, "pid": 76337, "tid": -914061504, "ts": 1716454225417457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225528326, "dur": 15, "args": { "External id": 257046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257046, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257046, "pid": 5, "tid": 7, "ts": 1716454225528326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417472, "dur": 5, "args": { "External id": 257046, "cbid": 211, "correlation": 257046 } }, { "ph": "s", "id": 257046, "pid": 76337, "tid": -914061504, "ts": 1716454225417472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225528342, "dur": 3, "args": { "External id": 257048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257048, "pid": 5, "tid": 7, "ts": 1716454225528342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417481, "dur": 5, "args": { "External id": 257048, "cbid": 211, "correlation": 257048 } }, { "ph": "s", "id": 257048, "pid": 76337, "tid": -914061504, "ts": 1716454225417481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225417489, "dur": 0, "args": { "External id": 257049, "cbid": 51, "correlation": 257049 } }, { "ph": "s", "id": 257049, "pid": 76337, "tid": -914061504, "ts": 1716454225417489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225528346, "dur": 709, "args": { "External id": 257050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257050, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257050, "pid": 5, "tid": 7, "ts": 1716454225528346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417490, "dur": 5, "args": { "External id": 257050, "cbid": 211, "correlation": 257050 } }, { "ph": "s", "id": 257050, "pid": 76337, "tid": -914061504, "ts": 1716454225417490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225529057, "dur": 60, "args": { "External id": 257055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257055, "pid": 5, "tid": 7, "ts": 1716454225529057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417517, "dur": 9, "args": { "External id": 257055, "cbid": 211, "correlation": 257055 } }, { "ph": "s", "id": 257055, "pid": 76337, "tid": -914061504, "ts": 1716454225417517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225417575, "dur": 0, "args": { "External id": 257065, "cbid": 317, "correlation": 257065 } }, { "ph": "f", "id": 257065, "pid": 76337, "tid": -914061504, "ts": 1716454225417575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225417576, "dur": 0, "args": { "External id": 257066, "cbid": 203, "correlation": 257066 } }, { "ph": "f", "id": 257066, "pid": 76337, "tid": -914061504, "ts": 1716454225417576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225417577, "dur": 0, "args": { "External id": 257067, "cbid": 205, "correlation": 257067 } }, { "ph": "f", "id": 257067, "pid": 76337, "tid": -914061504, "ts": 1716454225417577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225529118, "dur": 75, "args": { "External id": 257071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257071, "pid": 5, "tid": 7, "ts": 1716454225529118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417589, "dur": 11, "args": { "External id": 257071, "cbid": 211, "correlation": 257071 } }, { "ph": "s", "id": 257071, "pid": 76337, "tid": -914061504, "ts": 1716454225417589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225529195, "dur": 211, "args": { "External id": 257073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257073, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257073, "pid": 5, "tid": 7, "ts": 1716454225529195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417608, "dur": 9, "args": { "External id": 257073, "cbid": 211, "correlation": 257073 } }, { "ph": "s", "id": 257073, "pid": 76337, "tid": -914061504, "ts": 1716454225417608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225529407, "dur": 38, "args": { "External id": 257075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257075, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257075, "pid": 5, "tid": 7, "ts": 1716454225529407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225417622, "dur": 502, "args": { "External id": 257075, "cbid": 211, "correlation": 257075 } }, { "ph": "s", "id": 257075, "pid": 76337, "tid": -914061504, "ts": 1716454225417622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225529447, "dur": 61, "args": { "External id": 257081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257081, "pid": 5, "tid": 7, "ts": 1716454225529447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418147, "dur": 9, "args": { "External id": 257081, "cbid": 211, "correlation": 257081 } }, { "ph": "s", "id": 257081, "pid": 76337, "tid": -914061504, "ts": 1716454225418147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225529509, "dur": 50, "args": { "External id": 257089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257089, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257089, "pid": 5, "tid": 7, "ts": 1716454225529509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418177, "dur": 9, "args": { "External id": 257089, "cbid": 211, "correlation": 257089 } }, { "ph": "s", "id": 257089, "pid": 76337, "tid": -914061504, "ts": 1716454225418177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225529560, "dur": 35, "args": { "External id": 257097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257097, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257097, "pid": 5, "tid": 7, "ts": 1716454225529560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418207, "dur": 29, "args": { "External id": 257097, "cbid": 211, "correlation": 257097 } }, { "ph": "s", "id": 257097, "pid": 76337, "tid": -914061504, "ts": 1716454225418207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225529597, "dur": 53, "args": { "External id": 257117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257117, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 257117, "pid": 5, "tid": 7, "ts": 1716454225529597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418310, "dur": 12, "args": { "External id": 257117, "cbid": 211, "correlation": 257117 } }, { "ph": "s", "id": 257117, "pid": 76337, "tid": -914061504, "ts": 1716454225418310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225529651, "dur": 4, "args": { "External id": 257129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257129, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257129, "pid": 5, "tid": 7, "ts": 1716454225529651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418332, "dur": 7, "args": { "External id": 257129, "cbid": 211, "correlation": 257129 } }, { "ph": "s", "id": 257129, "pid": 76337, "tid": -914061504, "ts": 1716454225418332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225529657, "dur": 57, "args": { "External id": 257132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257132, "pid": 5, "tid": 7, "ts": 1716454225529657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418350, "dur": 6, "args": { "External id": 257132, "cbid": 211, "correlation": 257132 } }, { "ph": "s", "id": 257132, "pid": 76337, "tid": -914061504, "ts": 1716454225418350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225418408, "dur": 0, "args": { "External id": 257143, "cbid": 317, "correlation": 257143 } }, { "ph": "f", "id": 257143, "pid": 76337, "tid": -914061504, "ts": 1716454225418408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225418409, "dur": 0, "args": { "External id": 257144, "cbid": 203, "correlation": 257144 } }, { "ph": "f", "id": 257144, "pid": 76337, "tid": -914061504, "ts": 1716454225418409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225418410, "dur": 0, "args": { "External id": 257145, "cbid": 205, "correlation": 257145 } }, { "ph": "f", "id": 257145, "pid": 76337, "tid": -914061504, "ts": 1716454225418410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418431, "dur": 1, "args": { "External id": 257149, "cbid": 251, "correlation": 257149 } }, { "ph": "f", "id": 257149, "pid": 76337, "tid": -914061504, "ts": 1716454225418431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418433, "dur": 0, "args": { "External id": 257150, "cbid": 251, "correlation": 257150 } }, { "ph": "f", "id": 257150, "pid": 76337, "tid": -914061504, "ts": 1716454225418433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418434, "dur": 0, "args": { "External id": 257151, "cbid": 251, "correlation": 257151 } }, { "ph": "f", "id": 257151, "pid": 76337, "tid": -914061504, "ts": 1716454225418434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418435, "dur": 0, "args": { "External id": 257152, "cbid": 251, "correlation": 257152 } }, { "ph": "f", "id": 257152, "pid": 76337, "tid": -914061504, "ts": 1716454225418435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418436, "dur": 0, "args": { "External id": 257153, "cbid": 251, "correlation": 257153 } }, { "ph": "f", "id": 257153, "pid": 76337, "tid": -914061504, "ts": 1716454225418436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418437, "dur": 0, "args": { "External id": 257154, "cbid": 251, "correlation": 257154 } }, { "ph": "f", "id": 257154, "pid": 76337, "tid": -914061504, "ts": 1716454225418437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418438, "dur": 0, "args": { "External id": 257155, "cbid": 251, "correlation": 257155 } }, { "ph": "f", "id": 257155, "pid": 76337, "tid": -914061504, "ts": 1716454225418438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418438, "dur": 0, "args": { "External id": 257156, "cbid": 251, "correlation": 257156 } }, { "ph": "f", "id": 257156, "pid": 76337, "tid": -914061504, "ts": 1716454225418438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418440, "dur": 0, "args": { "External id": 257157, "cbid": 251, "correlation": 257157 } }, { "ph": "f", "id": 257157, "pid": 76337, "tid": -914061504, "ts": 1716454225418440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225529715, "dur": 113, "args": { "External id": 257158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257158, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257158, "pid": 5, "tid": 7, "ts": 1716454225529715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418442, "dur": 12, "args": { "External id": 257158, "cbid": 211, "correlation": 257158 } }, { "ph": "s", "id": 257158, "pid": 76337, "tid": -914061504, "ts": 1716454225418442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225529830, "dur": 60, "args": { "External id": 257164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257164, "pid": 5, "tid": 7, "ts": 1716454225529830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418477, "dur": 9, "args": { "External id": 257164, "cbid": 211, "correlation": 257164 } }, { "ph": "s", "id": 257164, "pid": 76337, "tid": -914061504, "ts": 1716454225418477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225529891, "dur": 530, "args": { "External id": 257173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257173, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257173, "pid": 5, "tid": 7, "ts": 1716454225529891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418559, "dur": 13, "args": { "External id": 257173, "cbid": 211, "correlation": 257173 } }, { "ph": "s", "id": 257173, "pid": 76337, "tid": -914061504, "ts": 1716454225418559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225530422, "dur": 184, "args": { "External id": 257195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257195, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257195, "pid": 5, "tid": 7, "ts": 1716454225530422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418616, "dur": 10, "args": { "External id": 257195, "cbid": 211, "correlation": 257195 } }, { "ph": "s", "id": 257195, "pid": 76337, "tid": -914061504, "ts": 1716454225418616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418703, "dur": 1, "args": { "External id": 257206, "cbid": 251, "correlation": 257206 } }, { "ph": "f", "id": 257206, "pid": 76337, "tid": -914061504, "ts": 1716454225418703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225530608, "dur": 199, "args": { "External id": 257207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257207, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257207, "pid": 5, "tid": 7, "ts": 1716454225530608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418707, "dur": 13, "args": { "External id": 257207, "cbid": 211, "correlation": 257207 } }, { "ph": "s", "id": 257207, "pid": 76337, "tid": -914061504, "ts": 1716454225418707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418775, "dur": 1, "args": { "External id": 257218, "cbid": 251, "correlation": 257218 } }, { "ph": "f", "id": 257218, "pid": 76337, "tid": -914061504, "ts": 1716454225418775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225530808, "dur": 189, "args": { "External id": 257219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257219, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257219, "pid": 5, "tid": 7, "ts": 1716454225530808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418778, "dur": 12, "args": { "External id": 257219, "cbid": 211, "correlation": 257219 } }, { "ph": "s", "id": 257219, "pid": 76337, "tid": -914061504, "ts": 1716454225418778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225418843, "dur": 1, "args": { "External id": 257230, "cbid": 251, "correlation": 257230 } }, { "ph": "f", "id": 257230, "pid": 76337, "tid": -914061504, "ts": 1716454225418843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225530999, "dur": 191, "args": { "External id": 257231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257231, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257231, "pid": 5, "tid": 7, "ts": 1716454225530999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418848, "dur": 11, "args": { "External id": 257231, "cbid": 211, "correlation": 257231 } }, { "ph": "s", "id": 257231, "pid": 76337, "tid": -914061504, "ts": 1716454225418848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225531191, "dur": 18997, "args": { "External id": 257252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257252, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257252, "pid": 5, "tid": 7, "ts": 1716454225531191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225418928, "dur": 12, "args": { "External id": 257252, "cbid": 211, "correlation": 257252 } }, { "ph": "s", "id": 257252, "pid": 76337, "tid": -914061504, "ts": 1716454225418928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225419031, "dur": 1, "args": { "External id": 257270, "cbid": 251, "correlation": 257270 } }, { "ph": "f", "id": 257270, "pid": 76337, "tid": -914061504, "ts": 1716454225419031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225550189, "dur": 204, "args": { "External id": 257272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257272, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257272, "pid": 5, "tid": 7, "ts": 1716454225550189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225419037, "dur": 15, "args": { "External id": 257272, "cbid": 211, "correlation": 257272 } }, { "ph": "s", "id": 257272, "pid": 76337, "tid": -914061504, "ts": 1716454225419037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225550395, "dur": 66, "args": { "External id": 257280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257280, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257280, "pid": 5, "tid": 7, "ts": 1716454225550395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225419109, "dur": 12, "args": { "External id": 257280, "cbid": 211, "correlation": 257280 } }, { "ph": "s", "id": 257280, "pid": 76337, "tid": -914061504, "ts": 1716454225419109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225550462, "dur": 96, "args": { "External id": 257288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257288, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257288, "pid": 5, "tid": 7, "ts": 1716454225550462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225419148, "dur": 28, "args": { "External id": 257288, "cbid": 211, "correlation": 257288 } }, { "ph": "s", "id": 257288, "pid": 76337, "tid": -914061504, "ts": 1716454225419148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225550560, "dur": 54, "args": { "External id": 257299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257299, "pid": 5, "tid": 7, "ts": 1716454225550560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225419239, "dur": 87, "args": { "External id": 257299, "cbid": 211, "correlation": 257299 } }, { "ph": "s", "id": 257299, "pid": 76337, "tid": -914061504, "ts": 1716454225419239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225550615, "dur": 93, "args": { "External id": 257321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257321, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257321, "pid": 5, "tid": 7, "ts": 1716454225550615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225419346, "dur": 1968, "args": { "External id": 257321, "cbid": 211, "correlation": 257321 } }, { "ph": "s", "id": 257321, "pid": 76337, "tid": -914061504, "ts": 1716454225419346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421391, "dur": 1, "args": { "External id": 257332, "cbid": 251, "correlation": 257332 } }, { "ph": "f", "id": 257332, "pid": 76337, "tid": -914061504, "ts": 1716454225421391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225550710, "dur": 105, "args": { "External id": 257333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257333, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257333, "pid": 5, "tid": 7, "ts": 1716454225550710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421396, "dur": 71, "args": { "External id": 257333, "cbid": 211, "correlation": 257333 } }, { "ph": "s", "id": 257333, "pid": 76337, "tid": -914061504, "ts": 1716454225421396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421526, "dur": 1, "args": { "External id": 257344, "cbid": 251, "correlation": 257344 } }, { "ph": "f", "id": 257344, "pid": 76337, "tid": -914061504, "ts": 1716454225421526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421530, "dur": 0, "args": { "External id": 257345, "cbid": 251, "correlation": 257345 } }, { "ph": "f", "id": 257345, "pid": 76337, "tid": -914061504, "ts": 1716454225421530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225550817, "dur": 10, "args": { "External id": 257346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257346, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 257346, "pid": 5, "tid": 7, "ts": 1716454225550817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421531, "dur": 12, "args": { "External id": 257346, "cbid": 211, "correlation": 257346 } }, { "ph": "s", "id": 257346, "pid": 76337, "tid": -914061504, "ts": 1716454225421531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225550828, "dur": 5, "args": { "External id": 257348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257348, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 257348, "pid": 5, "tid": 7, "ts": 1716454225550828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421545, "dur": 6, "args": { "External id": 257348, "cbid": 211, "correlation": 257348 } }, { "ph": "s", "id": 257348, "pid": 76337, "tid": -914061504, "ts": 1716454225421545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421606, "dur": 1, "args": { "External id": 257359, "cbid": 251, "correlation": 257359 } }, { "ph": "f", "id": 257359, "pid": 76337, "tid": -914061504, "ts": 1716454225421606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421610, "dur": 0, "args": { "External id": 257360, "cbid": 251, "correlation": 257360 } }, { "ph": "f", "id": 257360, "pid": 76337, "tid": -914061504, "ts": 1716454225421610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225550835, "dur": 6, "args": { "External id": 257361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257361, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 257361, "pid": 5, "tid": 7, "ts": 1716454225550835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421611, "dur": 12, "args": { "External id": 257361, "cbid": 211, "correlation": 257361 } }, { "ph": "s", "id": 257361, "pid": 76337, "tid": -914061504, "ts": 1716454225421611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225550842, "dur": 3, "args": { "External id": 257363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257363, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 257363, "pid": 5, "tid": 7, "ts": 1716454225550842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421625, "dur": 5, "args": { "External id": 257363, "cbid": 211, "correlation": 257363 } }, { "ph": "s", "id": 257363, "pid": 76337, "tid": -914061504, "ts": 1716454225421625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225550847, "dur": 157, "args": { "External id": 257384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257384, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257384, "pid": 5, "tid": 7, "ts": 1716454225550847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421698, "dur": 18, "args": { "External id": 257384, "cbid": 211, "correlation": 257384 } }, { "ph": "s", "id": 257384, "pid": 76337, "tid": -914061504, "ts": 1716454225421698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225421800, "dur": 1, "args": { "External id": 257402, "cbid": 251, "correlation": 257402 } }, { "ph": "f", "id": 257402, "pid": 76337, "tid": -914061504, "ts": 1716454225421800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225551005, "dur": 108, "args": { "External id": 257404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257404, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257404, "pid": 5, "tid": 7, "ts": 1716454225551005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421806, "dur": 14, "args": { "External id": 257404, "cbid": 211, "correlation": 257404 } }, { "ph": "s", "id": 257404, "pid": 76337, "tid": -914061504, "ts": 1716454225421806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225551115, "dur": 35, "args": { "External id": 257412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257412, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257412, "pid": 5, "tid": 7, "ts": 1716454225551115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421876, "dur": 12, "args": { "External id": 257412, "cbid": 211, "correlation": 257412 } }, { "ph": "s", "id": 257412, "pid": 76337, "tid": -914061504, "ts": 1716454225421876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225551151, "dur": 68, "args": { "External id": 257420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257420, "pid": 5, "tid": 7, "ts": 1716454225551151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421917, "dur": 9, "args": { "External id": 257420, "cbid": 211, "correlation": 257420 } }, { "ph": "s", "id": 257420, "pid": 76337, "tid": -914061504, "ts": 1716454225421917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225551220, "dur": 93, "args": { "External id": 257442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257442, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257442, "pid": 5, "tid": 7, "ts": 1716454225551220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225421968, "dur": 17, "args": { "External id": 257442, "cbid": 211, "correlation": 257442 } }, { "ph": "s", "id": 257442, "pid": 76337, "tid": -914061504, "ts": 1716454225421968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422061, "dur": 1, "args": { "External id": 257458, "cbid": 251, "correlation": 257458 } }, { "ph": "f", "id": 257458, "pid": 76337, "tid": -914061504, "ts": 1716454225422061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225551315, "dur": 582, "args": { "External id": 257460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257460, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257460, "pid": 5, "tid": 7, "ts": 1716454225551315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422068, "dur": 14, "args": { "External id": 257460, "cbid": 211, "correlation": 257460 } }, { "ph": "s", "id": 257460, "pid": 76337, "tid": -914061504, "ts": 1716454225422068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225551898, "dur": 245, "args": { "External id": 257468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257468, "pid": 5, "tid": 7, "ts": 1716454225551898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422134, "dur": 12, "args": { "External id": 257468, "cbid": 211, "correlation": 257468 } }, { "ph": "s", "id": 257468, "pid": 76337, "tid": -914061504, "ts": 1716454225422134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225552145, "dur": 255, "args": { "External id": 257476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257476, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257476, "pid": 5, "tid": 7, "ts": 1716454225552145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422163, "dur": 9, "args": { "External id": 257476, "cbid": 211, "correlation": 257476 } }, { "ph": "s", "id": 257476, "pid": 76337, "tid": -914061504, "ts": 1716454225422163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422243, "dur": 1, "args": { "External id": 257492, "cbid": 251, "correlation": 257492 } }, { "ph": "f", "id": 257492, "pid": 76337, "tid": -914061504, "ts": 1716454225422243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422248, "dur": 0, "args": { "External id": 257494, "cbid": 251, "correlation": 257494 } }, { "ph": "f", "id": 257494, "pid": 76337, "tid": -914061504, "ts": 1716454225422248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225552402, "dur": 360, "args": { "External id": 257495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257495, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 257495, "pid": 5, "tid": 7, "ts": 1716454225552402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422251, "dur": 13, "args": { "External id": 257495, "cbid": 211, "correlation": 257495 } }, { "ph": "s", "id": 257495, "pid": 76337, "tid": -914061504, "ts": 1716454225422251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225552763, "dur": 50, "args": { "External id": 257503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257503, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257503, "pid": 5, "tid": 7, "ts": 1716454225552763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422294, "dur": 10, "args": { "External id": 257503, "cbid": 211, "correlation": 257503 } }, { "ph": "s", "id": 257503, "pid": 76337, "tid": -914061504, "ts": 1716454225422294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225552815, "dur": 161, "args": { "External id": 257514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257514, "pid": 5, "tid": 7, "ts": 1716454225552815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422361, "dur": 232, "args": { "External id": 257514, "cbid": 211, "correlation": 257514 } }, { "ph": "s", "id": 257514, "pid": 76337, "tid": -914061504, "ts": 1716454225422361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225422646, "dur": 0, "args": { "External id": 257526, "cbid": 317, "correlation": 257526 } }, { "ph": "f", "id": 257526, "pid": 76337, "tid": -914061504, "ts": 1716454225422646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225422647, "dur": 0, "args": { "External id": 257527, "cbid": 203, "correlation": 257527 } }, { "ph": "f", "id": 257527, "pid": 76337, "tid": -914061504, "ts": 1716454225422647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225422648, "dur": 0, "args": { "External id": 257528, "cbid": 205, "correlation": 257528 } }, { "ph": "f", "id": 257528, "pid": 76337, "tid": -914061504, "ts": 1716454225422648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422671, "dur": 1, "args": { "External id": 257532, "cbid": 251, "correlation": 257532 } }, { "ph": "f", "id": 257532, "pid": 76337, "tid": -914061504, "ts": 1716454225422671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422673, "dur": 0, "args": { "External id": 257533, "cbid": 251, "correlation": 257533 } }, { "ph": "f", "id": 257533, "pid": 76337, "tid": -914061504, "ts": 1716454225422673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422673, "dur": 0, "args": { "External id": 257534, "cbid": 251, "correlation": 257534 } }, { "ph": "f", "id": 257534, "pid": 76337, "tid": -914061504, "ts": 1716454225422673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422674, "dur": 0, "args": { "External id": 257535, "cbid": 251, "correlation": 257535 } }, { "ph": "f", "id": 257535, "pid": 76337, "tid": -914061504, "ts": 1716454225422674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422675, "dur": 0, "args": { "External id": 257536, "cbid": 251, "correlation": 257536 } }, { "ph": "f", "id": 257536, "pid": 76337, "tid": -914061504, "ts": 1716454225422675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422675, "dur": 0, "args": { "External id": 257537, "cbid": 251, "correlation": 257537 } }, { "ph": "f", "id": 257537, "pid": 76337, "tid": -914061504, "ts": 1716454225422675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422676, "dur": 0, "args": { "External id": 257538, "cbid": 251, "correlation": 257538 } }, { "ph": "f", "id": 257538, "pid": 76337, "tid": -914061504, "ts": 1716454225422676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422677, "dur": 0, "args": { "External id": 257539, "cbid": 251, "correlation": 257539 } }, { "ph": "f", "id": 257539, "pid": 76337, "tid": -914061504, "ts": 1716454225422677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225422678, "dur": 0, "args": { "External id": 257540, "cbid": 251, "correlation": 257540 } }, { "ph": "f", "id": 257540, "pid": 76337, "tid": -914061504, "ts": 1716454225422678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225552977, "dur": 116, "args": { "External id": 257541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257541, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257541, "pid": 5, "tid": 7, "ts": 1716454225552977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422680, "dur": 42, "args": { "External id": 257541, "cbid": 211, "correlation": 257541 } }, { "ph": "s", "id": 257541, "pid": 76337, "tid": -914061504, "ts": 1716454225422680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225553094, "dur": 60, "args": { "External id": 257547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257547, "pid": 5, "tid": 7, "ts": 1716454225553094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422745, "dur": 106, "args": { "External id": 257547, "cbid": 211, "correlation": 257547 } }, { "ph": "s", "id": 257547, "pid": 76337, "tid": -914061504, "ts": 1716454225422745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225553155, "dur": 51, "args": { "External id": 257555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257555, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257555, "pid": 5, "tid": 7, "ts": 1716454225553155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225422875, "dur": 285, "args": { "External id": 257555, "cbid": 211, "correlation": 257555 } }, { "ph": "s", "id": 257555, "pid": 76337, "tid": -914061504, "ts": 1716454225422875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225553207, "dur": 100, "args": { "External id": 257564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257564, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257564, "pid": 5, "tid": 7, "ts": 1716454225553207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423192, "dur": 11, "args": { "External id": 257564, "cbid": 211, "correlation": 257564 } }, { "ph": "s", "id": 257564, "pid": 76337, "tid": -914061504, "ts": 1716454225423192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225553309, "dur": 93, "args": { "External id": 257584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257584, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 257584, "pid": 5, "tid": 7, "ts": 1716454225553309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423263, "dur": 12, "args": { "External id": 257584, "cbid": 211, "correlation": 257584 } }, { "ph": "s", "id": 257584, "pid": 76337, "tid": -914061504, "ts": 1716454225423263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225553403, "dur": 5, "args": { "External id": 257596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257596, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 257596, "pid": 5, "tid": 7, "ts": 1716454225553403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423284, "dur": 13, "args": { "External id": 257596, "cbid": 211, "correlation": 257596 } }, { "ph": "s", "id": 257596, "pid": 76337, "tid": -914061504, "ts": 1716454225423284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225553409, "dur": 108, "args": { "External id": 257599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257599, "pid": 5, "tid": 7, "ts": 1716454225553409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423309, "dur": 110, "args": { "External id": 257599, "cbid": 211, "correlation": 257599 } }, { "ph": "s", "id": 257599, "pid": 76337, "tid": -914061504, "ts": 1716454225423309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225553519, "dur": 69, "args": { "External id": 257608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257608, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257608, "pid": 5, "tid": 7, "ts": 1716454225553519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423452, "dur": 10, "args": { "External id": 257608, "cbid": 211, "correlation": 257608 } }, { "ph": "s", "id": 257608, "pid": 76337, "tid": -914061504, "ts": 1716454225423452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225423504, "dur": 0, "args": { "External id": 257618, "cbid": 317, "correlation": 257618 } }, { "ph": "f", "id": 257618, "pid": 76337, "tid": -914061504, "ts": 1716454225423504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225423505, "dur": 0, "args": { "External id": 257619, "cbid": 203, "correlation": 257619 } }, { "ph": "f", "id": 257619, "pid": 76337, "tid": -914061504, "ts": 1716454225423505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225423505, "dur": 0, "args": { "External id": 257620, "cbid": 205, "correlation": 257620 } }, { "ph": "f", "id": 257620, "pid": 76337, "tid": -914061504, "ts": 1716454225423505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225553589, "dur": 76, "args": { "External id": 257624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257624, "pid": 5, "tid": 7, "ts": 1716454225553589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423520, "dur": 11, "args": { "External id": 257624, "cbid": 211, "correlation": 257624 } }, { "ph": "s", "id": 257624, "pid": 76337, "tid": -914061504, "ts": 1716454225423520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225553666, "dur": 24, "args": { "External id": 257626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257626, "pid": 5, "tid": 7, "ts": 1716454225553666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423534, "dur": 5, "args": { "External id": 257626, "cbid": 211, "correlation": 257626 } }, { "ph": "s", "id": 257626, "pid": 76337, "tid": -914061504, "ts": 1716454225423534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225553692, "dur": 4, "args": { "External id": 257628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257628, "pid": 5, "tid": 7, "ts": 1716454225553692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423544, "dur": 6, "args": { "External id": 257628, "cbid": 211, "correlation": 257628 } }, { "ph": "s", "id": 257628, "pid": 76337, "tid": -914061504, "ts": 1716454225423544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225423553, "dur": 0, "args": { "External id": 257629, "cbid": 51, "correlation": 257629 } }, { "ph": "s", "id": 257629, "pid": 76337, "tid": -914061504, "ts": 1716454225423553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225553697, "dur": 1383, "args": { "External id": 257630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257630, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257630, "pid": 5, "tid": 7, "ts": 1716454225553697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423554, "dur": 5, "args": { "External id": 257630, "cbid": 211, "correlation": 257630 } }, { "ph": "s", "id": 257630, "pid": 76337, "tid": -914061504, "ts": 1716454225423554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225555082, "dur": 60, "args": { "External id": 257635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257635, "pid": 5, "tid": 7, "ts": 1716454225555082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423581, "dur": 9, "args": { "External id": 257635, "cbid": 211, "correlation": 257635 } }, { "ph": "s", "id": 257635, "pid": 76337, "tid": -914061504, "ts": 1716454225423581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225555143, "dur": 4, "args": { "External id": 257643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257643, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257643, "pid": 5, "tid": 7, "ts": 1716454225555143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423624, "dur": 9, "args": { "External id": 257643, "cbid": 211, "correlation": 257643 } }, { "ph": "s", "id": 257643, "pid": 76337, "tid": -914061504, "ts": 1716454225423624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225423690, "dur": 1, "args": { "External id": 257659, "cbid": 251, "correlation": 257659 } }, { "ph": "f", "id": 257659, "pid": 76337, "tid": -914061504, "ts": 1716454225423690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225423695, "dur": 0, "args": { "External id": 257661, "cbid": 251, "correlation": 257661 } }, { "ph": "f", "id": 257661, "pid": 76337, "tid": -914061504, "ts": 1716454225423695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225555148, "dur": 12, "args": { "External id": 257662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257662, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 257662, "pid": 5, "tid": 7, "ts": 1716454225555148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423697, "dur": 11, "args": { "External id": 257662, "cbid": 211, "correlation": 257662 } }, { "ph": "s", "id": 257662, "pid": 76337, "tid": -914061504, "ts": 1716454225423697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225555161, "dur": 5, "args": { "External id": 257664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257664, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 257664, "pid": 5, "tid": 7, "ts": 1716454225555161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423711, "dur": 6, "args": { "External id": 257664, "cbid": 211, "correlation": 257664 } }, { "ph": "s", "id": 257664, "pid": 76337, "tid": -914061504, "ts": 1716454225423711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225555168, "dur": 55, "args": { "External id": 257674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257674, "pid": 5, "tid": 7, "ts": 1716454225555168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225423769, "dur": 572, "args": { "External id": 257674, "cbid": 211, "correlation": 257674 } }, { "ph": "s", "id": 257674, "pid": 76337, "tid": -914061504, "ts": 1716454225423769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225555224, "dur": 51, "args": { "External id": 257694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257694, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 257694, "pid": 5, "tid": 7, "ts": 1716454225555224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424397, "dur": 11, "args": { "External id": 257694, "cbid": 211, "correlation": 257694 } }, { "ph": "s", "id": 257694, "pid": 76337, "tid": -914061504, "ts": 1716454225424397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225555276, "dur": 4, "args": { "External id": 257706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257706, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257706, "pid": 5, "tid": 7, "ts": 1716454225555276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424417, "dur": 6, "args": { "External id": 257706, "cbid": 211, "correlation": 257706 } }, { "ph": "s", "id": 257706, "pid": 76337, "tid": -914061504, "ts": 1716454225424417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225555282, "dur": 56, "args": { "External id": 257709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257709, "pid": 5, "tid": 7, "ts": 1716454225555282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424436, "dur": 6, "args": { "External id": 257709, "cbid": 211, "correlation": 257709 } }, { "ph": "s", "id": 257709, "pid": 76337, "tid": -914061504, "ts": 1716454225424436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225555339, "dur": 37, "args": { "External id": 257718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257718, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257718, "pid": 5, "tid": 7, "ts": 1716454225555339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424475, "dur": 10, "args": { "External id": 257718, "cbid": 211, "correlation": 257718 } }, { "ph": "s", "id": 257718, "pid": 76337, "tid": -914061504, "ts": 1716454225424475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225424538, "dur": 0, "args": { "External id": 257728, "cbid": 317, "correlation": 257728 } }, { "ph": "f", "id": 257728, "pid": 76337, "tid": -914061504, "ts": 1716454225424538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225424538, "dur": 0, "args": { "External id": 257729, "cbid": 203, "correlation": 257729 } }, { "ph": "f", "id": 257729, "pid": 76337, "tid": -914061504, "ts": 1716454225424538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225424539, "dur": 0, "args": { "External id": 257730, "cbid": 205, "correlation": 257730 } }, { "ph": "f", "id": 257730, "pid": 76337, "tid": -914061504, "ts": 1716454225424539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225555377, "dur": 40, "args": { "External id": 257734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257734, "pid": 5, "tid": 7, "ts": 1716454225555377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424553, "dur": 12, "args": { "External id": 257734, "cbid": 211, "correlation": 257734 } }, { "ph": "s", "id": 257734, "pid": 76337, "tid": -914061504, "ts": 1716454225424553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225555418, "dur": 14, "args": { "External id": 257736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257736, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257736, "pid": 5, "tid": 7, "ts": 1716454225555418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424567, "dur": 5, "args": { "External id": 257736, "cbid": 211, "correlation": 257736 } }, { "ph": "s", "id": 257736, "pid": 76337, "tid": -914061504, "ts": 1716454225424567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225555434, "dur": 3, "args": { "External id": 257738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257738, "pid": 5, "tid": 7, "ts": 1716454225555434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424576, "dur": 6, "args": { "External id": 257738, "cbid": 211, "correlation": 257738 } }, { "ph": "s", "id": 257738, "pid": 76337, "tid": -914061504, "ts": 1716454225424576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225424586, "dur": 0, "args": { "External id": 257739, "cbid": 51, "correlation": 257739 } }, { "ph": "s", "id": 257739, "pid": 76337, "tid": -914061504, "ts": 1716454225424586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225555439, "dur": 708, "args": { "External id": 257740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257740, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257740, "pid": 5, "tid": 7, "ts": 1716454225555439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424587, "dur": 5, "args": { "External id": 257740, "cbid": 211, "correlation": 257740 } }, { "ph": "s", "id": 257740, "pid": 76337, "tid": -914061504, "ts": 1716454225424587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225556148, "dur": 60, "args": { "External id": 257745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257745, "pid": 5, "tid": 7, "ts": 1716454225556148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424614, "dur": 9, "args": { "External id": 257745, "cbid": 211, "correlation": 257745 } }, { "ph": "s", "id": 257745, "pid": 76337, "tid": -914061504, "ts": 1716454225424614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225424671, "dur": 0, "args": { "External id": 257755, "cbid": 317, "correlation": 257755 } }, { "ph": "f", "id": 257755, "pid": 76337, "tid": -914061504, "ts": 1716454225424671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225424672, "dur": 0, "args": { "External id": 257756, "cbid": 203, "correlation": 257756 } }, { "ph": "f", "id": 257756, "pid": 76337, "tid": -914061504, "ts": 1716454225424672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225424673, "dur": 0, "args": { "External id": 257757, "cbid": 205, "correlation": 257757 } }, { "ph": "f", "id": 257757, "pid": 76337, "tid": -914061504, "ts": 1716454225424673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225556209, "dur": 74, "args": { "External id": 257761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257761, "pid": 5, "tid": 7, "ts": 1716454225556209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424685, "dur": 12, "args": { "External id": 257761, "cbid": 211, "correlation": 257761 } }, { "ph": "s", "id": 257761, "pid": 76337, "tid": -914061504, "ts": 1716454225424685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225556285, "dur": 211, "args": { "External id": 257763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257763, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257763, "pid": 5, "tid": 7, "ts": 1716454225556285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424703, "dur": 7, "args": { "External id": 257763, "cbid": 211, "correlation": 257763 } }, { "ph": "s", "id": 257763, "pid": 76337, "tid": -914061504, "ts": 1716454225424703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225556497, "dur": 39, "args": { "External id": 257765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257765, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257765, "pid": 5, "tid": 7, "ts": 1716454225556497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424714, "dur": 5, "args": { "External id": 257765, "cbid": 211, "correlation": 257765 } }, { "ph": "s", "id": 257765, "pid": 76337, "tid": -914061504, "ts": 1716454225424714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225556537, "dur": 60, "args": { "External id": 257771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257771, "pid": 5, "tid": 7, "ts": 1716454225556537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225424740, "dur": 532, "args": { "External id": 257771, "cbid": 211, "correlation": 257771 } }, { "ph": "s", "id": 257771, "pid": 76337, "tid": -914061504, "ts": 1716454225424740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225556599, "dur": 51, "args": { "External id": 257779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257779, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257779, "pid": 5, "tid": 7, "ts": 1716454225556599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425294, "dur": 9, "args": { "External id": 257779, "cbid": 211, "correlation": 257779 } }, { "ph": "s", "id": 257779, "pid": 76337, "tid": -914061504, "ts": 1716454225425294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225556650, "dur": 35, "args": { "External id": 257787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257787, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257787, "pid": 5, "tid": 7, "ts": 1716454225556650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425324, "dur": 8, "args": { "External id": 257787, "cbid": 211, "correlation": 257787 } }, { "ph": "s", "id": 257787, "pid": 76337, "tid": -914061504, "ts": 1716454225425324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225556686, "dur": 52, "args": { "External id": 257807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257807, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 257807, "pid": 5, "tid": 7, "ts": 1716454225556686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425407, "dur": 12, "args": { "External id": 257807, "cbid": 211, "correlation": 257807 } }, { "ph": "s", "id": 257807, "pid": 76337, "tid": -914061504, "ts": 1716454225425407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225556740, "dur": 4, "args": { "External id": 257819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257819, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 257819, "pid": 5, "tid": 7, "ts": 1716454225556740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425429, "dur": 6, "args": { "External id": 257819, "cbid": 211, "correlation": 257819 } }, { "ph": "s", "id": 257819, "pid": 76337, "tid": -914061504, "ts": 1716454225425429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225556746, "dur": 56, "args": { "External id": 257822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257822, "pid": 5, "tid": 7, "ts": 1716454225556746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425447, "dur": 8, "args": { "External id": 257822, "cbid": 211, "correlation": 257822 } }, { "ph": "s", "id": 257822, "pid": 76337, "tid": -914061504, "ts": 1716454225425447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225425505, "dur": 0, "args": { "External id": 257833, "cbid": 317, "correlation": 257833 } }, { "ph": "f", "id": 257833, "pid": 76337, "tid": -914061504, "ts": 1716454225425505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225425505, "dur": 0, "args": { "External id": 257834, "cbid": 203, "correlation": 257834 } }, { "ph": "f", "id": 257834, "pid": 76337, "tid": -914061504, "ts": 1716454225425505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225425506, "dur": 0, "args": { "External id": 257835, "cbid": 205, "correlation": 257835 } }, { "ph": "f", "id": 257835, "pid": 76337, "tid": -914061504, "ts": 1716454225425506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425528, "dur": 1, "args": { "External id": 257839, "cbid": 251, "correlation": 257839 } }, { "ph": "f", "id": 257839, "pid": 76337, "tid": -914061504, "ts": 1716454225425528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425530, "dur": 0, "args": { "External id": 257840, "cbid": 251, "correlation": 257840 } }, { "ph": "f", "id": 257840, "pid": 76337, "tid": -914061504, "ts": 1716454225425530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425531, "dur": 0, "args": { "External id": 257841, "cbid": 251, "correlation": 257841 } }, { "ph": "f", "id": 257841, "pid": 76337, "tid": -914061504, "ts": 1716454225425531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425531, "dur": 0, "args": { "External id": 257842, "cbid": 251, "correlation": 257842 } }, { "ph": "f", "id": 257842, "pid": 76337, "tid": -914061504, "ts": 1716454225425531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425532, "dur": 0, "args": { "External id": 257843, "cbid": 251, "correlation": 257843 } }, { "ph": "f", "id": 257843, "pid": 76337, "tid": -914061504, "ts": 1716454225425532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425533, "dur": 0, "args": { "External id": 257844, "cbid": 251, "correlation": 257844 } }, { "ph": "f", "id": 257844, "pid": 76337, "tid": -914061504, "ts": 1716454225425533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425534, "dur": 0, "args": { "External id": 257845, "cbid": 251, "correlation": 257845 } }, { "ph": "f", "id": 257845, "pid": 76337, "tid": -914061504, "ts": 1716454225425534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425535, "dur": 0, "args": { "External id": 257846, "cbid": 251, "correlation": 257846 } }, { "ph": "f", "id": 257846, "pid": 76337, "tid": -914061504, "ts": 1716454225425535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425536, "dur": 0, "args": { "External id": 257847, "cbid": 251, "correlation": 257847 } }, { "ph": "f", "id": 257847, "pid": 76337, "tid": -914061504, "ts": 1716454225425536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225556802, "dur": 116, "args": { "External id": 257848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257848, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257848, "pid": 5, "tid": 7, "ts": 1716454225556802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425538, "dur": 13, "args": { "External id": 257848, "cbid": 211, "correlation": 257848 } }, { "ph": "s", "id": 257848, "pid": 76337, "tid": -914061504, "ts": 1716454225425538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225556920, "dur": 61, "args": { "External id": 257854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257854, "pid": 5, "tid": 7, "ts": 1716454225556920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425574, "dur": 9, "args": { "External id": 257854, "cbid": 211, "correlation": 257854 } }, { "ph": "s", "id": 257854, "pid": 76337, "tid": -914061504, "ts": 1716454225425574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225556982, "dur": 617, "args": { "External id": 257863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257863, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257863, "pid": 5, "tid": 7, "ts": 1716454225556982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425657, "dur": 13, "args": { "External id": 257863, "cbid": 211, "correlation": 257863 } }, { "ph": "s", "id": 257863, "pid": 76337, "tid": -914061504, "ts": 1716454225425657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225557601, "dur": 185, "args": { "External id": 257885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257885, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257885, "pid": 5, "tid": 7, "ts": 1716454225557601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425714, "dur": 10, "args": { "External id": 257885, "cbid": 211, "correlation": 257885 } }, { "ph": "s", "id": 257885, "pid": 76337, "tid": -914061504, "ts": 1716454225425714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425800, "dur": 1, "args": { "External id": 257896, "cbid": 251, "correlation": 257896 } }, { "ph": "f", "id": 257896, "pid": 76337, "tid": -914061504, "ts": 1716454225425800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225557787, "dur": 196, "args": { "External id": 257897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257897, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257897, "pid": 5, "tid": 7, "ts": 1716454225557787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425805, "dur": 12, "args": { "External id": 257897, "cbid": 211, "correlation": 257897 } }, { "ph": "s", "id": 257897, "pid": 76337, "tid": -914061504, "ts": 1716454225425805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425879, "dur": 1, "args": { "External id": 257908, "cbid": 251, "correlation": 257908 } }, { "ph": "f", "id": 257908, "pid": 76337, "tid": -914061504, "ts": 1716454225425879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225557984, "dur": 190, "args": { "External id": 257909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257909, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257909, "pid": 5, "tid": 7, "ts": 1716454225557984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425883, "dur": 11, "args": { "External id": 257909, "cbid": 211, "correlation": 257909 } }, { "ph": "s", "id": 257909, "pid": 76337, "tid": -914061504, "ts": 1716454225425883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225425945, "dur": 1, "args": { "External id": 257920, "cbid": 251, "correlation": 257920 } }, { "ph": "f", "id": 257920, "pid": 76337, "tid": -914061504, "ts": 1716454225425945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225558176, "dur": 189, "args": { "External id": 257921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257921, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257921, "pid": 5, "tid": 7, "ts": 1716454225558176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225425950, "dur": 12, "args": { "External id": 257921, "cbid": 211, "correlation": 257921 } }, { "ph": "s", "id": 257921, "pid": 76337, "tid": -914061504, "ts": 1716454225425950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225558366, "dur": 18987, "args": { "External id": 257942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257942, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 257942, "pid": 5, "tid": 7, "ts": 1716454225558366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225426039, "dur": 13, "args": { "External id": 257942, "cbid": 211, "correlation": 257942 } }, { "ph": "s", "id": 257942, "pid": 76337, "tid": -914061504, "ts": 1716454225426039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225426139, "dur": 1, "args": { "External id": 257960, "cbid": 251, "correlation": 257960 } }, { "ph": "f", "id": 257960, "pid": 76337, "tid": -914061504, "ts": 1716454225426139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225577354, "dur": 206, "args": { "External id": 257962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257962, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 257962, "pid": 5, "tid": 7, "ts": 1716454225577354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225426145, "dur": 14, "args": { "External id": 257962, "cbid": 211, "correlation": 257962 } }, { "ph": "s", "id": 257962, "pid": 76337, "tid": -914061504, "ts": 1716454225426145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225577561, "dur": 66, "args": { "External id": 257970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257970, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257970, "pid": 5, "tid": 7, "ts": 1716454225577561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225426216, "dur": 12, "args": { "External id": 257970, "cbid": 211, "correlation": 257970 } }, { "ph": "s", "id": 257970, "pid": 76337, "tid": -914061504, "ts": 1716454225426216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225577629, "dur": 96, "args": { "External id": 257978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257978, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257978, "pid": 5, "tid": 7, "ts": 1716454225577629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225426255, "dur": 109, "args": { "External id": 257978, "cbid": 211, "correlation": 257978 } }, { "ph": "s", "id": 257978, "pid": 76337, "tid": -914061504, "ts": 1716454225426255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225577727, "dur": 54, "args": { "External id": 257989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 257989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 257989, "pid": 5, "tid": 7, "ts": 1716454225577727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225426428, "dur": 1928, "args": { "External id": 257989, "cbid": 211, "correlation": 257989 } }, { "ph": "s", "id": 257989, "pid": 76337, "tid": -914061504, "ts": 1716454225426428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225577782, "dur": 94, "args": { "External id": 258011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258011, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258011, "pid": 5, "tid": 7, "ts": 1716454225577782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428377, "dur": 128, "args": { "External id": 258011, "cbid": 211, "correlation": 258011 } }, { "ph": "s", "id": 258011, "pid": 76337, "tid": -914061504, "ts": 1716454225428377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428582, "dur": 1, "args": { "External id": 258022, "cbid": 251, "correlation": 258022 } }, { "ph": "f", "id": 258022, "pid": 76337, "tid": -914061504, "ts": 1716454225428582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225577877, "dur": 105, "args": { "External id": 258023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258023, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 258023, "pid": 5, "tid": 7, "ts": 1716454225577877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428588, "dur": 13, "args": { "External id": 258023, "cbid": 211, "correlation": 258023 } }, { "ph": "s", "id": 258023, "pid": 76337, "tid": -914061504, "ts": 1716454225428588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428659, "dur": 1, "args": { "External id": 258034, "cbid": 251, "correlation": 258034 } }, { "ph": "f", "id": 258034, "pid": 76337, "tid": -914061504, "ts": 1716454225428659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428663, "dur": 0, "args": { "External id": 258035, "cbid": 251, "correlation": 258035 } }, { "ph": "f", "id": 258035, "pid": 76337, "tid": -914061504, "ts": 1716454225428663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225577983, "dur": 10, "args": { "External id": 258036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258036, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258036, "pid": 5, "tid": 7, "ts": 1716454225577983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428665, "dur": 13, "args": { "External id": 258036, "cbid": 211, "correlation": 258036 } }, { "ph": "s", "id": 258036, "pid": 76337, "tid": -914061504, "ts": 1716454225428665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225577995, "dur": 5, "args": { "External id": 258038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258038, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 258038, "pid": 5, "tid": 7, "ts": 1716454225577995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428680, "dur": 6, "args": { "External id": 258038, "cbid": 211, "correlation": 258038 } }, { "ph": "s", "id": 258038, "pid": 76337, "tid": -914061504, "ts": 1716454225428680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428741, "dur": 1, "args": { "External id": 258049, "cbid": 251, "correlation": 258049 } }, { "ph": "f", "id": 258049, "pid": 76337, "tid": -914061504, "ts": 1716454225428741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428745, "dur": 0, "args": { "External id": 258050, "cbid": 251, "correlation": 258050 } }, { "ph": "f", "id": 258050, "pid": 76337, "tid": -914061504, "ts": 1716454225428745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225578001, "dur": 6, "args": { "External id": 258051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258051, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258051, "pid": 5, "tid": 7, "ts": 1716454225578001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428746, "dur": 12, "args": { "External id": 258051, "cbid": 211, "correlation": 258051 } }, { "ph": "s", "id": 258051, "pid": 76337, "tid": -914061504, "ts": 1716454225428746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225578009, "dur": 4, "args": { "External id": 258053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258053, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 258053, "pid": 5, "tid": 7, "ts": 1716454225578009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428759, "dur": 6, "args": { "External id": 258053, "cbid": 211, "correlation": 258053 } }, { "ph": "s", "id": 258053, "pid": 76337, "tid": -914061504, "ts": 1716454225428759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225578014, "dur": 158, "args": { "External id": 258074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258074, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 258074, "pid": 5, "tid": 7, "ts": 1716454225578014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428834, "dur": 12, "args": { "External id": 258074, "cbid": 211, "correlation": 258074 } }, { "ph": "s", "id": 258074, "pid": 76337, "tid": -914061504, "ts": 1716454225428834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225428932, "dur": 1, "args": { "External id": 258092, "cbid": 251, "correlation": 258092 } }, { "ph": "f", "id": 258092, "pid": 76337, "tid": -914061504, "ts": 1716454225428932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225578173, "dur": 108, "args": { "External id": 258094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258094, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 258094, "pid": 5, "tid": 7, "ts": 1716454225578173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225428938, "dur": 13, "args": { "External id": 258094, "cbid": 211, "correlation": 258094 } }, { "ph": "s", "id": 258094, "pid": 76337, "tid": -914061504, "ts": 1716454225428938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225578282, "dur": 35, "args": { "External id": 258102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258102, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258102, "pid": 5, "tid": 7, "ts": 1716454225578282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429014, "dur": 13, "args": { "External id": 258102, "cbid": 211, "correlation": 258102 } }, { "ph": "s", "id": 258102, "pid": 76337, "tid": -914061504, "ts": 1716454225429014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225578319, "dur": 67, "args": { "External id": 258110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258110, "pid": 5, "tid": 7, "ts": 1716454225578319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429056, "dur": 9, "args": { "External id": 258110, "cbid": 211, "correlation": 258110 } }, { "ph": "s", "id": 258110, "pid": 76337, "tid": -914061504, "ts": 1716454225429056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225578388, "dur": 94, "args": { "External id": 258132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258132, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258132, "pid": 5, "tid": 7, "ts": 1716454225578388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429107, "dur": 10, "args": { "External id": 258132, "cbid": 211, "correlation": 258132 } }, { "ph": "s", "id": 258132, "pid": 76337, "tid": -914061504, "ts": 1716454225429107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429193, "dur": 1, "args": { "External id": 258148, "cbid": 251, "correlation": 258148 } }, { "ph": "f", "id": 258148, "pid": 76337, "tid": -914061504, "ts": 1716454225429193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225578483, "dur": 582, "args": { "External id": 258150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258150, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 258150, "pid": 5, "tid": 7, "ts": 1716454225578483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429199, "dur": 12, "args": { "External id": 258150, "cbid": 211, "correlation": 258150 } }, { "ph": "s", "id": 258150, "pid": 76337, "tid": -914061504, "ts": 1716454225429199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225579067, "dur": 247, "args": { "External id": 258158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258158, "pid": 5, "tid": 7, "ts": 1716454225579067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429263, "dur": 13, "args": { "External id": 258158, "cbid": 211, "correlation": 258158 } }, { "ph": "s", "id": 258158, "pid": 76337, "tid": -914061504, "ts": 1716454225429263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225579316, "dur": 252, "args": { "External id": 258166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258166, "pid": 5, "tid": 7, "ts": 1716454225579316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429293, "dur": 8, "args": { "External id": 258166, "cbid": 211, "correlation": 258166 } }, { "ph": "s", "id": 258166, "pid": 76337, "tid": -914061504, "ts": 1716454225429293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429373, "dur": 1, "args": { "External id": 258182, "cbid": 251, "correlation": 258182 } }, { "ph": "f", "id": 258182, "pid": 76337, "tid": -914061504, "ts": 1716454225429373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429379, "dur": 0, "args": { "External id": 258184, "cbid": 251, "correlation": 258184 } }, { "ph": "f", "id": 258184, "pid": 76337, "tid": -914061504, "ts": 1716454225429379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225579570, "dur": 361, "args": { "External id": 258185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258185, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258185, "pid": 5, "tid": 7, "ts": 1716454225579570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429382, "dur": 13, "args": { "External id": 258185, "cbid": 211, "correlation": 258185 } }, { "ph": "s", "id": 258185, "pid": 76337, "tid": -914061504, "ts": 1716454225429382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225579932, "dur": 50, "args": { "External id": 258193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258193, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258193, "pid": 5, "tid": 7, "ts": 1716454225579932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429423, "dur": 207, "args": { "External id": 258193, "cbid": 211, "correlation": 258193 } }, { "ph": "s", "id": 258193, "pid": 76337, "tid": -914061504, "ts": 1716454225429423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225579983, "dur": 159, "args": { "External id": 258204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258204, "pid": 5, "tid": 7, "ts": 1716454225579983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429688, "dur": 73, "args": { "External id": 258204, "cbid": 211, "correlation": 258204 } }, { "ph": "s", "id": 258204, "pid": 76337, "tid": -914061504, "ts": 1716454225429688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225429815, "dur": 0, "args": { "External id": 258216, "cbid": 317, "correlation": 258216 } }, { "ph": "f", "id": 258216, "pid": 76337, "tid": -914061504, "ts": 1716454225429815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225429816, "dur": 0, "args": { "External id": 258217, "cbid": 203, "correlation": 258217 } }, { "ph": "f", "id": 258217, "pid": 76337, "tid": -914061504, "ts": 1716454225429816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225429816, "dur": 0, "args": { "External id": 258218, "cbid": 205, "correlation": 258218 } }, { "ph": "f", "id": 258218, "pid": 76337, "tid": -914061504, "ts": 1716454225429816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429839, "dur": 1, "args": { "External id": 258222, "cbid": 251, "correlation": 258222 } }, { "ph": "f", "id": 258222, "pid": 76337, "tid": -914061504, "ts": 1716454225429839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429841, "dur": 0, "args": { "External id": 258223, "cbid": 251, "correlation": 258223 } }, { "ph": "f", "id": 258223, "pid": 76337, "tid": -914061504, "ts": 1716454225429841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429841, "dur": 0, "args": { "External id": 258224, "cbid": 251, "correlation": 258224 } }, { "ph": "f", "id": 258224, "pid": 76337, "tid": -914061504, "ts": 1716454225429841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429842, "dur": 0, "args": { "External id": 258225, "cbid": 251, "correlation": 258225 } }, { "ph": "f", "id": 258225, "pid": 76337, "tid": -914061504, "ts": 1716454225429842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429843, "dur": 0, "args": { "External id": 258226, "cbid": 251, "correlation": 258226 } }, { "ph": "f", "id": 258226, "pid": 76337, "tid": -914061504, "ts": 1716454225429843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429843, "dur": 0, "args": { "External id": 258227, "cbid": 251, "correlation": 258227 } }, { "ph": "f", "id": 258227, "pid": 76337, "tid": -914061504, "ts": 1716454225429843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429844, "dur": 0, "args": { "External id": 258228, "cbid": 251, "correlation": 258228 } }, { "ph": "f", "id": 258228, "pid": 76337, "tid": -914061504, "ts": 1716454225429844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429845, "dur": 0, "args": { "External id": 258229, "cbid": 251, "correlation": 258229 } }, { "ph": "f", "id": 258229, "pid": 76337, "tid": -914061504, "ts": 1716454225429845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225429846, "dur": 0, "args": { "External id": 258230, "cbid": 251, "correlation": 258230 } }, { "ph": "f", "id": 258230, "pid": 76337, "tid": -914061504, "ts": 1716454225429846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225580144, "dur": 116, "args": { "External id": 258231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258231, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 258231, "pid": 5, "tid": 7, "ts": 1716454225580144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429848, "dur": 42, "args": { "External id": 258231, "cbid": 211, "correlation": 258231 } }, { "ph": "s", "id": 258231, "pid": 76337, "tid": -914061504, "ts": 1716454225429848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225580261, "dur": 60, "args": { "External id": 258237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258237, "pid": 5, "tid": 7, "ts": 1716454225580261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225429914, "dur": 288, "args": { "External id": 258237, "cbid": 211, "correlation": 258237 } }, { "ph": "s", "id": 258237, "pid": 76337, "tid": -914061504, "ts": 1716454225429914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580322, "dur": 49, "args": { "External id": 258245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258245, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258245, "pid": 5, "tid": 7, "ts": 1716454225580322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430226, "dur": 9, "args": { "External id": 258245, "cbid": 211, "correlation": 258245 } }, { "ph": "s", "id": 258245, "pid": 76337, "tid": -914061504, "ts": 1716454225430226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225580373, "dur": 52, "args": { "External id": 258265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258265, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 258265, "pid": 5, "tid": 7, "ts": 1716454225580373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430306, "dur": 12, "args": { "External id": 258265, "cbid": 211, "correlation": 258265 } }, { "ph": "s", "id": 258265, "pid": 76337, "tid": -914061504, "ts": 1716454225430306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225580427, "dur": 5, "args": { "External id": 258277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258277, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 258277, "pid": 5, "tid": 7, "ts": 1716454225580427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430328, "dur": 10, "args": { "External id": 258277, "cbid": 211, "correlation": 258277 } }, { "ph": "s", "id": 258277, "pid": 76337, "tid": -914061504, "ts": 1716454225430328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225580433, "dur": 55, "args": { "External id": 258280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258280, "pid": 5, "tid": 7, "ts": 1716454225580433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430351, "dur": 111, "args": { "External id": 258280, "cbid": 211, "correlation": 258280 } }, { "ph": "s", "id": 258280, "pid": 76337, "tid": -914061504, "ts": 1716454225430351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580490, "dur": 37, "args": { "External id": 258289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258289, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258289, "pid": 5, "tid": 7, "ts": 1716454225580490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430499, "dur": 11, "args": { "External id": 258289, "cbid": 211, "correlation": 258289 } }, { "ph": "s", "id": 258289, "pid": 76337, "tid": -914061504, "ts": 1716454225430499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225430556, "dur": 0, "args": { "External id": 258299, "cbid": 317, "correlation": 258299 } }, { "ph": "f", "id": 258299, "pid": 76337, "tid": -914061504, "ts": 1716454225430556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225430557, "dur": 0, "args": { "External id": 258300, "cbid": 203, "correlation": 258300 } }, { "ph": "f", "id": 258300, "pid": 76337, "tid": -914061504, "ts": 1716454225430557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225430558, "dur": 0, "args": { "External id": 258301, "cbid": 205, "correlation": 258301 } }, { "ph": "f", "id": 258301, "pid": 76337, "tid": -914061504, "ts": 1716454225430558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225580529, "dur": 41, "args": { "External id": 258305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258305, "pid": 5, "tid": 7, "ts": 1716454225580529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430574, "dur": 12, "args": { "External id": 258305, "cbid": 211, "correlation": 258305 } }, { "ph": "s", "id": 258305, "pid": 76337, "tid": -914061504, "ts": 1716454225430574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225580571, "dur": 3, "args": { "External id": 258307, "device": 5, "context": 1, "stream": 7, "correlation": 258307, "bytes": 46080, "memory bandwidth (GB/s)": 12.003125814014066 } }, { "ph": "f", "id": 258307, "pid": 5, "tid": 7, "ts": 1716454225580571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225430589, "dur": 18, "args": { "External id": 258307, "cbid": 51, "correlation": 258307 } }, { "ph": "s", "id": 258307, "pid": 76337, "tid": -914061504, "ts": 1716454225430589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225430612, "dur": 1, "args": { "External id": 258309, "cbid": 200, "correlation": 258309 } }, { "ph": "f", "id": 258309, "pid": 76337, "tid": -914061504, "ts": 1716454225430612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225430614, "dur": 0, "args": { "External id": 258310, "cbid": 200, "correlation": 258310 } }, { "ph": "f", "id": 258310, "pid": 76337, "tid": -914061504, "ts": 1716454225430614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225430615, "dur": 0, "args": { "External id": 258311, "cbid": 200, "correlation": 258311 } }, { "ph": "f", "id": 258311, "pid": 76337, "tid": -914061504, "ts": 1716454225430615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225430615, "dur": 0, "args": { "External id": 258312, "cbid": 200, "correlation": 258312 } }, { "ph": "f", "id": 258312, "pid": 76337, "tid": -914061504, "ts": 1716454225430615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454225430616, "dur": 3, "args": { "External id": 258313, "cbid": 15, "correlation": 258313 } }, { "ph": "f", "id": 258313, "pid": 76337, "tid": -914061504, "ts": 1716454225430616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225430621, "dur": 1, "args": { "External id": 258314, "cbid": 251, "correlation": 258314 } }, { "ph": "f", "id": 258314, "pid": 76337, "tid": -914061504, "ts": 1716454225430621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454225580576, "dur": 24, "args": { "External id": 258315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258315, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258315, "pid": 5, "tid": 7, "ts": 1716454225580576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430624, "dur": 8, "args": { "External id": 258315, "cbid": 211, "correlation": 258315 } }, { "ph": "s", "id": 258315, "pid": 76337, "tid": -914061504, "ts": 1716454225430624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225580601, "dur": 4, "args": { "External id": 258317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258317, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 258317, "pid": 5, "tid": 7, "ts": 1716454225580601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430638, "dur": 6, "args": { "External id": 258317, "cbid": 211, "correlation": 258317 } }, { "ph": "s", "id": 258317, "pid": 76337, "tid": -914061504, "ts": 1716454225430638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225430648, "dur": 0, "args": { "External id": 258318, "cbid": 51, "correlation": 258318 } }, { "ph": "s", "id": 258318, "pid": 76337, "tid": -914061504, "ts": 1716454225430648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225580606, "dur": 192, "args": { "External id": 258319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258319, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258319, "pid": 5, "tid": 7, "ts": 1716454225580606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430649, "dur": 207, "args": { "External id": 258319, "cbid": 211, "correlation": 258319 } }, { "ph": "s", "id": 258319, "pid": 76337, "tid": -914061504, "ts": 1716454225430649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225580800, "dur": 6, "args": { "External id": 258320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258320, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258320, "pid": 5, "tid": 7, "ts": 1716454225580800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430859, "dur": 6, "args": { "External id": 258320, "cbid": 211, "correlation": 258320 } }, { "ph": "s", "id": 258320, "pid": 76337, "tid": -914061504, "ts": 1716454225430859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225580808, "dur": 5, "args": { "External id": 258326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 258326, "pid": 5, "tid": 7, "ts": 1716454225580808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225430889, "dur": 9, "args": { "External id": 258326, "cbid": 211, "correlation": 258326 } }, { "ph": "s", "id": 258326, "pid": 76337, "tid": -914061504, "ts": 1716454225430889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580814, "dur": 3, "args": { "External id": 258334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258334, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258334, "pid": 5, "tid": 7, "ts": 1716454225580814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432560, "dur": 15, "args": { "External id": 258334, "cbid": 211, "correlation": 258334 } }, { "ph": "s", "id": 258334, "pid": 76337, "tid": -914061504, "ts": 1716454225432560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580818, "dur": 3, "args": { "External id": 258342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258342, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258342, "pid": 5, "tid": 7, "ts": 1716454225580818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432601, "dur": 10, "args": { "External id": 258342, "cbid": 211, "correlation": 258342 } }, { "ph": "s", "id": 258342, "pid": 76337, "tid": -914061504, "ts": 1716454225432601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580823, "dur": 3, "args": { "External id": 258350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258350, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258350, "pid": 5, "tid": 7, "ts": 1716454225580823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432628, "dur": 9, "args": { "External id": 258350, "cbid": 211, "correlation": 258350 } }, { "ph": "s", "id": 258350, "pid": 76337, "tid": -914061504, "ts": 1716454225432628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580827, "dur": 3, "args": { "External id": 258359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258359, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258359, "pid": 5, "tid": 7, "ts": 1716454225580827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432810, "dur": 14, "args": { "External id": 258359, "cbid": 211, "correlation": 258359 } }, { "ph": "s", "id": 258359, "pid": 76337, "tid": -914061504, "ts": 1716454225432810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580831, "dur": 3, "args": { "External id": 258368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258368, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258368, "pid": 5, "tid": 7, "ts": 1716454225580831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432839, "dur": 7, "args": { "External id": 258368, "cbid": 211, "correlation": 258368 } }, { "ph": "s", "id": 258368, "pid": 76337, "tid": -914061504, "ts": 1716454225432839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580836, "dur": 3, "args": { "External id": 258376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258376, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258376, "pid": 5, "tid": 7, "ts": 1716454225580836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225432865, "dur": 8, "args": { "External id": 258376, "cbid": 211, "correlation": 258376 } }, { "ph": "s", "id": 258376, "pid": 76337, "tid": -914061504, "ts": 1716454225432865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580840, "dur": 3, "args": { "External id": 258384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258384, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258384, "pid": 5, "tid": 7, "ts": 1716454225580840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225433135, "dur": 15, "args": { "External id": 258384, "cbid": 211, "correlation": 258384 } }, { "ph": "s", "id": 258384, "pid": 76337, "tid": -914061504, "ts": 1716454225433135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225580844, "dur": 3, "args": { "External id": 258392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258392, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258392, "pid": 5, "tid": 7, "ts": 1716454225580844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225433165, "dur": 7, "args": { "External id": 258392, "cbid": 211, "correlation": 258392 } }, { "ph": "s", "id": 258392, "pid": 76337, "tid": -914061504, "ts": 1716454225433165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225580849, "dur": 1, "args": { "External id": 258402, "device": 5, "context": 1, "stream": 7, "correlation": 258402, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258402, "pid": 5, "tid": 7, "ts": 1716454225580849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225433233, "dur": 42, "args": { "External id": 258402, "cbid": 41, "correlation": 258402 } }, { "ph": "s", "id": 258402, "pid": 76337, "tid": -914061504, "ts": 1716454225433233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225433276, "dur": 147589, "args": { "External id": 258403, "cbid": 131, "correlation": 258403 } }, { "ph": "f", "id": 258403, "pid": 76337, "tid": -914061504, "ts": 1716454225433276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225581026, "dur": 3, "args": { "External id": 258411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258411, "pid": 5, "tid": 7, "ts": 1716454225581026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581001, "dur": 28, "args": { "External id": 258411, "cbid": 211, "correlation": 258411 } }, { "ph": "s", "id": 258411, "pid": 76337, "tid": -914061504, "ts": 1716454225581001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581127, "dur": 3, "args": { "External id": 258420, "device": 5, "context": 1, "stream": 7, "correlation": 258420, "bytes": 8, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 258420, "pid": 5, "tid": 7, "ts": 1716454225581127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581092, "dur": 35, "args": { "External id": 258420, "cbid": 41, "correlation": 258420 } }, { "ph": "s", "id": 258420, "pid": 76337, "tid": -914061504, "ts": 1716454225581092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225581221, "dur": 4, "args": { "External id": 258430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258430, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258430, "pid": 5, "tid": 7, "ts": 1716454225581221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581205, "dur": 18, "args": { "External id": 258430, "cbid": 211, "correlation": 258430 } }, { "ph": "s", "id": 258430, "pid": 76337, "tid": -914061504, "ts": 1716454225581205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581296, "dur": 1, "args": { "External id": 258440, "device": 5, "context": 1, "stream": 7, "correlation": 258440, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258440, "pid": 5, "tid": 7, "ts": 1716454225581296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581275, "dur": 19, "args": { "External id": 258440, "cbid": 41, "correlation": 258440 } }, { "ph": "s", "id": 258440, "pid": 76337, "tid": -914061504, "ts": 1716454225581275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225581295, "dur": 8, "args": { "External id": 258441, "cbid": 131, "correlation": 258441 } }, { "ph": "f", "id": 258441, "pid": 76337, "tid": -914061504, "ts": 1716454225581295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581376, "dur": 3, "args": { "External id": 258448, "device": 5, "context": 1, "stream": 7, "correlation": 258448, "bytes": 98304, "memory bandwidth (GB/s)": 30.11764705882353 } }, { "ph": "f", "id": 258448, "pid": 5, "tid": 7, "ts": 1716454225581376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581346, "dur": 30, "args": { "External id": 258448, "cbid": 41, "correlation": 258448 } }, { "ph": "s", "id": 258448, "pid": 76337, "tid": -914061504, "ts": 1716454225581346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581470, "dur": 3, "args": { "External id": 258467, "device": 5, "context": 1, "stream": 7, "correlation": 258467, "bytes": 16, "memory bandwidth (GB/s)": 0.0052648897663705166 } }, { "ph": "f", "id": 258467, "pid": 5, "tid": 7, "ts": 1716454225581470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581445, "dur": 24, "args": { "External id": 258467, "cbid": 41, "correlation": 258467 } }, { "ph": "s", "id": 258467, "pid": 76337, "tid": -914061504, "ts": 1716454225581445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225581508, "dur": 3, "args": { "External id": 258473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258473, "pid": 5, "tid": 7, "ts": 1716454225581508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581496, "dur": 13, "args": { "External id": 258473, "cbid": 211, "correlation": 258473 } }, { "ph": "s", "id": 258473, "pid": 76337, "tid": -914061504, "ts": 1716454225581496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454225581523, "dur": 6, "args": { "External id": 258475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258475, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 258475, "pid": 5, "tid": 7, "ts": 1716454225581523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581513, "dur": 9, "args": { "External id": 258475, "cbid": 211, "correlation": 258475 } }, { "ph": "s", "id": 258475, "pid": 76337, "tid": -914061504, "ts": 1716454225581513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454225581531, "dur": 3, "args": { "External id": 258477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258477, "pid": 5, "tid": 7, "ts": 1716454225581531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581524, "dur": 6, "args": { "External id": 258477, "cbid": 211, "correlation": 258477 } }, { "ph": "s", "id": 258477, "pid": 76337, "tid": -914061504, "ts": 1716454225581524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581570, "dur": 2, "args": { "External id": 258485, "device": 5, "context": 1, "stream": 7, "correlation": 258485, "bytes": 8, "memory bandwidth (GB/s)": 0.002717391304347826 } }, { "ph": "f", "id": 258485, "pid": 5, "tid": 7, "ts": 1716454225581570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581552, "dur": 17, "args": { "External id": 258485, "cbid": 41, "correlation": 258485 } }, { "ph": "s", "id": 258485, "pid": 76337, "tid": -914061504, "ts": 1716454225581552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225581618, "dur": 3, "args": { "External id": 258499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258499, "pid": 5, "tid": 7, "ts": 1716454225581618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581607, "dur": 12, "args": { "External id": 258499, "cbid": 211, "correlation": 258499 } }, { "ph": "s", "id": 258499, "pid": 76337, "tid": -914061504, "ts": 1716454225581607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225581638, "dur": 2, "args": { "External id": 258513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258513, "pid": 5, "tid": 7, "ts": 1716454225581638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581631, "dur": 6, "args": { "External id": 258513, "cbid": 211, "correlation": 258513 } }, { "ph": "s", "id": 258513, "pid": 76337, "tid": -914061504, "ts": 1716454225581631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225581673, "dur": 6, "args": { "External id": 258520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258520, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258520, "pid": 5, "tid": 7, "ts": 1716454225581673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581662, "dur": 11, "args": { "External id": 258520, "cbid": 211, "correlation": 258520 } }, { "ph": "s", "id": 258520, "pid": 76337, "tid": -914061504, "ts": 1716454225581662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225581684, "dur": 6, "args": { "External id": 258523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258523, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258523, "pid": 5, "tid": 7, "ts": 1716454225581684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581676, "dur": 7, "args": { "External id": 258523, "cbid": 211, "correlation": 258523 } }, { "ph": "s", "id": 258523, "pid": 76337, "tid": -914061504, "ts": 1716454225581676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454225581692, "dur": 3, "args": { "External id": 258525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258525, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258525, "pid": 5, "tid": 7, "ts": 1716454225581692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581685, "dur": 7, "args": { "External id": 258525, "cbid": 211, "correlation": 258525 } }, { "ph": "s", "id": 258525, "pid": 76337, "tid": -914061504, "ts": 1716454225581685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581714, "dur": 2, "args": { "External id": 258528, "device": 5, "context": 1, "stream": 7, "correlation": 258528, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 258528, "pid": 5, "tid": 7, "ts": 1716454225581714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581701, "dur": 12, "args": { "External id": 258528, "cbid": 41, "correlation": 258528 } }, { "ph": "s", "id": 258528, "pid": 76337, "tid": -914061504, "ts": 1716454225581701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225581767, "dur": 4, "args": { "External id": 258544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258544, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258544, "pid": 5, "tid": 7, "ts": 1716454225581767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581755, "dur": 12, "args": { "External id": 258544, "cbid": 211, "correlation": 258544 } }, { "ph": "s", "id": 258544, "pid": 76337, "tid": -914061504, "ts": 1716454225581755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225581790, "dur": 3, "args": { "External id": 258549, "device": 5, "context": 1, "stream": 7, "correlation": 258549, "bytes": 1, "memory bandwidth (GB/s)": 0.0003094059405940594 } }, { "ph": "f", "id": 258549, "pid": 5, "tid": 7, "ts": 1716454225581790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581772, "dur": 17, "args": { "External id": 258549, "cbid": 41, "correlation": 258549 } }, { "ph": "s", "id": 258549, "pid": 76337, "tid": -914061504, "ts": 1716454225581772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225581819, "dur": 1, "args": { "External id": 258555, "device": 5, "context": 1, "stream": 7, "correlation": 258555, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 258555, "pid": 5, "tid": 7, "ts": 1716454225581819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225581800, "dur": 28, "args": { "External id": 258555, "cbid": 41, "correlation": 258555 } }, { "ph": "s", "id": 258555, "pid": 76337, "tid": -914061504, "ts": 1716454225581800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225581829, "dur": 4, "args": { "External id": 258556, "cbid": 131, "correlation": 258556 } }, { "ph": "f", "id": 258556, "pid": 76337, "tid": -914061504, "ts": 1716454225581829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225581881, "dur": 3, "args": { "External id": 258564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258564, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258564, "pid": 5, "tid": 7, "ts": 1716454225581881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581868, "dur": 13, "args": { "External id": 258564, "cbid": 211, "correlation": 258564 } }, { "ph": "s", "id": 258564, "pid": 76337, "tid": -914061504, "ts": 1716454225581868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225581912, "dur": 3, "args": { "External id": 258574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258574, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258574, "pid": 5, "tid": 7, "ts": 1716454225581912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581903, "dur": 8, "args": { "External id": 258574, "cbid": 211, "correlation": 258574 } }, { "ph": "s", "id": 258574, "pid": 76337, "tid": -914061504, "ts": 1716454225581903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225581936, "dur": 3, "args": { "External id": 258583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258583, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258583, "pid": 5, "tid": 7, "ts": 1716454225581936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225581927, "dur": 8, "args": { "External id": 258583, "cbid": 211, "correlation": 258583 } }, { "ph": "s", "id": 258583, "pid": 76337, "tid": -914061504, "ts": 1716454225581927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225582059, "dur": 12, "args": { "External id": 258593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258593, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258593, "pid": 5, "tid": 7, "ts": 1716454225582059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582045, "dur": 15, "args": { "External id": 258593, "cbid": 211, "correlation": 258593 } }, { "ph": "s", "id": 258593, "pid": 76337, "tid": -914061504, "ts": 1716454225582045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225582099, "dur": 3, "args": { "External id": 258601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258601, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258601, "pid": 5, "tid": 7, "ts": 1716454225582099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582090, "dur": 8, "args": { "External id": 258601, "cbid": 211, "correlation": 258601 } }, { "ph": "s", "id": 258601, "pid": 76337, "tid": -914061504, "ts": 1716454225582090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225582144, "dur": 12, "args": { "External id": 258611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258611, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258611, "pid": 5, "tid": 7, "ts": 1716454225582144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582132, "dur": 12, "args": { "External id": 258611, "cbid": 211, "correlation": 258611 } }, { "ph": "s", "id": 258611, "pid": 76337, "tid": -914061504, "ts": 1716454225582132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225582175, "dur": 10, "args": { "External id": 258619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258619, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258619, "pid": 5, "tid": 7, "ts": 1716454225582175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582166, "dur": 9, "args": { "External id": 258619, "cbid": 211, "correlation": 258619 } }, { "ph": "s", "id": 258619, "pid": 76337, "tid": -914061504, "ts": 1716454225582166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225582206, "dur": 3, "args": { "External id": 258628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258628, "pid": 5, "tid": 7, "ts": 1716454225582206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582194, "dur": 12, "args": { "External id": 258628, "cbid": 211, "correlation": 258628 } }, { "ph": "s", "id": 258628, "pid": 76337, "tid": -914061504, "ts": 1716454225582194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225582232, "dur": 5, "args": { "External id": 258637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258637, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258637, "pid": 5, "tid": 7, "ts": 1716454225582232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582222, "dur": 8, "args": { "External id": 258637, "cbid": 211, "correlation": 258637 } }, { "ph": "s", "id": 258637, "pid": 76337, "tid": -914061504, "ts": 1716454225582222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225582271, "dur": 8, "args": { "External id": 258647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258647, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258647, "pid": 5, "tid": 7, "ts": 1716454225582271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582260, "dur": 10, "args": { "External id": 258647, "cbid": 211, "correlation": 258647 } }, { "ph": "s", "id": 258647, "pid": 76337, "tid": -914061504, "ts": 1716454225582260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225582586, "dur": 3, "args": { "External id": 258656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258656, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258656, "pid": 5, "tid": 7, "ts": 1716454225582586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582570, "dur": 16, "args": { "External id": 258656, "cbid": 211, "correlation": 258656 } }, { "ph": "s", "id": 258656, "pid": 76337, "tid": -914061504, "ts": 1716454225582570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225582613, "dur": 3, "args": { "External id": 258664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258664, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258664, "pid": 5, "tid": 7, "ts": 1716454225582613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582604, "dur": 8, "args": { "External id": 258664, "cbid": 211, "correlation": 258664 } }, { "ph": "s", "id": 258664, "pid": 76337, "tid": -914061504, "ts": 1716454225582604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225582664, "dur": 1, "args": { "External id": 258674, "device": 5, "context": 1, "stream": 7, "correlation": 258674, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258674, "pid": 5, "tid": 7, "ts": 1716454225582664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225582649, "dur": 13, "args": { "External id": 258674, "cbid": 41, "correlation": 258674 } }, { "ph": "s", "id": 258674, "pid": 76337, "tid": -914061504, "ts": 1716454225582649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225582663, "dur": 8, "args": { "External id": 258675, "cbid": 131, "correlation": 258675 } }, { "ph": "f", "id": 258675, "pid": 76337, "tid": -914061504, "ts": 1716454225582663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225582754, "dur": 2, "args": { "External id": 258683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258683, "pid": 5, "tid": 7, "ts": 1716454225582754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582740, "dur": 14, "args": { "External id": 258683, "cbid": 211, "correlation": 258683 } }, { "ph": "s", "id": 258683, "pid": 76337, "tid": -914061504, "ts": 1716454225582740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225582826, "dur": 3, "args": { "External id": 258692, "device": 5, "context": 1, "stream": 7, "correlation": 258692, "bytes": 8, "memory bandwidth (GB/s)": 0.002551020408163265 } }, { "ph": "f", "id": 258692, "pid": 5, "tid": 7, "ts": 1716454225582826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225582808, "dur": 18, "args": { "External id": 258692, "cbid": 41, "correlation": 258692 } }, { "ph": "s", "id": 258692, "pid": 76337, "tid": -914061504, "ts": 1716454225582808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225582898, "dur": 3, "args": { "External id": 258702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258702, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 258702, "pid": 5, "tid": 7, "ts": 1716454225582898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225582883, "dur": 15, "args": { "External id": 258702, "cbid": 211, "correlation": 258702 } }, { "ph": "s", "id": 258702, "pid": 76337, "tid": -914061504, "ts": 1716454225582883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225582950, "dur": 1, "args": { "External id": 258712, "device": 5, "context": 1, "stream": 7, "correlation": 258712, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258712, "pid": 5, "tid": 7, "ts": 1716454225582950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225582936, "dur": 12, "args": { "External id": 258712, "cbid": 41, "correlation": 258712 } }, { "ph": "s", "id": 258712, "pid": 76337, "tid": -914061504, "ts": 1716454225582936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225582949, "dur": 8, "args": { "External id": 258713, "cbid": 131, "correlation": 258713 } }, { "ph": "f", "id": 258713, "pid": 76337, "tid": -914061504, "ts": 1716454225582949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225583025, "dur": 3, "args": { "External id": 258720, "device": 5, "context": 1, "stream": 7, "correlation": 258720, "bytes": 98304, "memory bandwidth (GB/s)": 31.03030303030303 } }, { "ph": "f", "id": 258720, "pid": 5, "tid": 7, "ts": 1716454225583025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583001, "dur": 24, "args": { "External id": 258720, "cbid": 41, "correlation": 258720 } }, { "ph": "s", "id": 258720, "pid": 76337, "tid": -914061504, "ts": 1716454225583001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225583073, "dur": 1, "args": { "External id": 258731, "device": 5, "context": 1, "stream": 7, "correlation": 258731, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 258731, "pid": 5, "tid": 7, "ts": 1716454225583073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583062, "dur": 10, "args": { "External id": 258731, "cbid": 41, "correlation": 258731 } }, { "ph": "s", "id": 258731, "pid": 76337, "tid": -914061504, "ts": 1716454225583062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583072, "dur": 8, "args": { "External id": 258732, "cbid": 131, "correlation": 258732 } }, { "ph": "f", "id": 258732, "pid": 76337, "tid": -914061504, "ts": 1716454225583072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583123, "dur": 3, "args": { "External id": 258740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258740, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258740, "pid": 5, "tid": 7, "ts": 1716454225583123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583110, "dur": 13, "args": { "External id": 258740, "cbid": 211, "correlation": 258740 } }, { "ph": "s", "id": 258740, "pid": 76337, "tid": -914061504, "ts": 1716454225583110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583153, "dur": 3, "args": { "External id": 258750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258750, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258750, "pid": 5, "tid": 7, "ts": 1716454225583153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583144, "dur": 8, "args": { "External id": 258750, "cbid": 211, "correlation": 258750 } }, { "ph": "s", "id": 258750, "pid": 76337, "tid": -914061504, "ts": 1716454225583144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583175, "dur": 3, "args": { "External id": 258759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258759, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258759, "pid": 5, "tid": 7, "ts": 1716454225583175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583166, "dur": 7, "args": { "External id": 258759, "cbid": 211, "correlation": 258759 } }, { "ph": "s", "id": 258759, "pid": 76337, "tid": -914061504, "ts": 1716454225583166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225583243, "dur": 5, "args": { "External id": 258767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258767, "pid": 5, "tid": 7, "ts": 1716454225583243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583230, "dur": 15, "args": { "External id": 258767, "cbid": 211, "correlation": 258767 } }, { "ph": "s", "id": 258767, "pid": 76337, "tid": -914061504, "ts": 1716454225583230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583283, "dur": 3, "args": { "External id": 258776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258776, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258776, "pid": 5, "tid": 7, "ts": 1716454225583283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583274, "dur": 9, "args": { "External id": 258776, "cbid": 211, "correlation": 258776 } }, { "ph": "s", "id": 258776, "pid": 76337, "tid": -914061504, "ts": 1716454225583274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583307, "dur": 3, "args": { "External id": 258785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258785, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258785, "pid": 5, "tid": 7, "ts": 1716454225583307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583299, "dur": 7, "args": { "External id": 258785, "cbid": 211, "correlation": 258785 } }, { "ph": "s", "id": 258785, "pid": 76337, "tid": -914061504, "ts": 1716454225583299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583368, "dur": 3, "args": { "External id": 258793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258793, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258793, "pid": 5, "tid": 7, "ts": 1716454225583368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583357, "dur": 10, "args": { "External id": 258793, "cbid": 211, "correlation": 258793 } }, { "ph": "s", "id": 258793, "pid": 76337, "tid": -914061504, "ts": 1716454225583357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225583425, "dur": 1, "args": { "External id": 258801, "device": 5, "context": 1, "stream": 7, "correlation": 258801, "bytes": 8, "memory bandwidth (GB/s)": 0.0043859649122807015 } }, { "ph": "f", "id": 258801, "pid": 5, "tid": 7, "ts": 1716454225583425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583410, "dur": 25, "args": { "External id": 258801, "cbid": 41, "correlation": 258801 } }, { "ph": "s", "id": 258801, "pid": 76337, "tid": -914061504, "ts": 1716454225583410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583436, "dur": 4, "args": { "External id": 258802, "cbid": 131, "correlation": 258802 } }, { "ph": "f", "id": 258802, "pid": 76337, "tid": -914061504, "ts": 1716454225583436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225583497, "dur": 1, "args": { "External id": 258812, "device": 5, "context": 1, "stream": 7, "correlation": 258812, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 258812, "pid": 5, "tid": 7, "ts": 1716454225583497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583484, "dur": 11, "args": { "External id": 258812, "cbid": 41, "correlation": 258812 } }, { "ph": "s", "id": 258812, "pid": 76337, "tid": -914061504, "ts": 1716454225583484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583496, "dur": 8, "args": { "External id": 258813, "cbid": 131, "correlation": 258813 } }, { "ph": "f", "id": 258813, "pid": 76337, "tid": -914061504, "ts": 1716454225583496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225583552, "dur": 1, "args": { "External id": 258822, "device": 5, "context": 1, "stream": 7, "correlation": 258822, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258822, "pid": 5, "tid": 7, "ts": 1716454225583552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583542, "dur": 8, "args": { "External id": 258822, "cbid": 41, "correlation": 258822 } }, { "ph": "s", "id": 258822, "pid": 76337, "tid": -914061504, "ts": 1716454225583542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583550, "dur": 8, "args": { "External id": 258823, "cbid": 131, "correlation": 258823 } }, { "ph": "f", "id": 258823, "pid": 76337, "tid": -914061504, "ts": 1716454225583550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225583625, "dur": 4, "args": { "External id": 258830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258830, "pid": 5, "tid": 7, "ts": 1716454225583625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583609, "dur": 17, "args": { "External id": 258830, "cbid": 211, "correlation": 258830 } }, { "ph": "s", "id": 258830, "pid": 76337, "tid": -914061504, "ts": 1716454225583609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454225583662, "dur": 4, "args": { "External id": 258850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258850, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258850, "pid": 5, "tid": 7, "ts": 1716454225583662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583652, "dur": 11, "args": { "External id": 258850, "cbid": 211, "correlation": 258850 } }, { "ph": "s", "id": 258850, "pid": 76337, "tid": -914061504, "ts": 1716454225583652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583663, "dur": 0, "args": { "External id": 258851, "cbid": 11, "correlation": 258851 } }, { "ph": "f", "id": 258851, "pid": 76337, "tid": -914061504, "ts": 1716454225583663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583664, "dur": 0, "args": { "External id": 258852, "cbid": 11, "correlation": 258852 } }, { "ph": "f", "id": 258852, "pid": 76337, "tid": -914061504, "ts": 1716454225583664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225583677, "dur": 1, "args": { "External id": 258855, "device": 5, "context": 1, "stream": 7, "correlation": 258855, "bytes": 4, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 258855, "pid": 5, "tid": 7, "ts": 1716454225583677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583665, "dur": 21, "args": { "External id": 258855, "cbid": 41, "correlation": 258855 } }, { "ph": "s", "id": 258855, "pid": 76337, "tid": -914061504, "ts": 1716454225583665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583687, "dur": 3, "args": { "External id": 258856, "cbid": 131, "correlation": 258856 } }, { "ph": "f", "id": 258856, "pid": 76337, "tid": -914061504, "ts": 1716454225583687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225583714, "dur": 3, "args": { "External id": 258880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258880, "pid": 5, "tid": 7, "ts": 1716454225583714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583705, "dur": 9, "args": { "External id": 258880, "cbid": 211, "correlation": 258880 } }, { "ph": "s", "id": 258880, "pid": 76337, "tid": -914061504, "ts": 1716454225583705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583715, "dur": 0, "args": { "External id": 258881, "cbid": 11, "correlation": 258881 } }, { "ph": "f", "id": 258881, "pid": 76337, "tid": -914061504, "ts": 1716454225583715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583715, "dur": 0, "args": { "External id": 258882, "cbid": 11, "correlation": 258882 } }, { "ph": "f", "id": 258882, "pid": 76337, "tid": -914061504, "ts": 1716454225583715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225583717, "dur": 1, "args": { "External id": 258884, "cbid": 200, "correlation": 258884 } }, { "ph": "f", "id": 258884, "pid": 76337, "tid": -914061504, "ts": 1716454225583717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454225583728, "dur": 4, "args": { "External id": 258886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258886, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258886, "pid": 5, "tid": 7, "ts": 1716454225583728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583720, "dur": 9, "args": { "External id": 258886, "cbid": 211, "correlation": 258886 } }, { "ph": "s", "id": 258886, "pid": 76337, "tid": -914061504, "ts": 1716454225583720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583730, "dur": 0, "args": { "External id": 258887, "cbid": 11, "correlation": 258887 } }, { "ph": "f", "id": 258887, "pid": 76337, "tid": -914061504, "ts": 1716454225583730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225583730, "dur": 0, "args": { "External id": 258888, "cbid": 11, "correlation": 258888 } }, { "ph": "f", "id": 258888, "pid": 76337, "tid": -914061504, "ts": 1716454225583730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225583768, "dur": 1, "args": { "External id": 258895, "device": 5, "context": 1, "stream": 7, "correlation": 258895, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 258895, "pid": 5, "tid": 7, "ts": 1716454225583768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583756, "dur": 20, "args": { "External id": 258895, "cbid": 41, "correlation": 258895 } }, { "ph": "s", "id": 258895, "pid": 76337, "tid": -914061504, "ts": 1716454225583756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583777, "dur": 3, "args": { "External id": 258896, "cbid": 131, "correlation": 258896 } }, { "ph": "f", "id": 258896, "pid": 76337, "tid": -914061504, "ts": 1716454225583777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225583826, "dur": 1, "args": { "External id": 258906, "device": 5, "context": 1, "stream": 7, "correlation": 258906, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 258906, "pid": 5, "tid": 7, "ts": 1716454225583826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225583815, "dur": 9, "args": { "External id": 258906, "cbid": 41, "correlation": 258906 } }, { "ph": "s", "id": 258906, "pid": 76337, "tid": -914061504, "ts": 1716454225583815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225583825, "dur": 8, "args": { "External id": 258907, "cbid": 131, "correlation": 258907 } }, { "ph": "f", "id": 258907, "pid": 76337, "tid": -914061504, "ts": 1716454225583825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225583895, "dur": 5, "args": { "External id": 258914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258914, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258914, "pid": 5, "tid": 7, "ts": 1716454225583895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583880, "dur": 16, "args": { "External id": 258914, "cbid": 211, "correlation": 258914 } }, { "ph": "s", "id": 258914, "pid": 76337, "tid": -914061504, "ts": 1716454225583880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225583966, "dur": 3, "args": { "External id": 258923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258923, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258923, "pid": 5, "tid": 7, "ts": 1716454225583966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583953, "dur": 13, "args": { "External id": 258923, "cbid": 211, "correlation": 258923 } }, { "ph": "s", "id": 258923, "pid": 76337, "tid": -914061504, "ts": 1716454225583953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584010, "dur": 3, "args": { "External id": 258931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258931, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258931, "pid": 5, "tid": 7, "ts": 1716454225584010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225583999, "dur": 11, "args": { "External id": 258931, "cbid": 211, "correlation": 258931 } }, { "ph": "s", "id": 258931, "pid": 76337, "tid": -914061504, "ts": 1716454225583999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584045, "dur": 4, "args": { "External id": 258939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258939, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258939, "pid": 5, "tid": 7, "ts": 1716454225584045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584034, "dur": 11, "args": { "External id": 258939, "cbid": 211, "correlation": 258939 } }, { "ph": "s", "id": 258939, "pid": 76337, "tid": -914061504, "ts": 1716454225584034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584073, "dur": 4, "args": { "External id": 258947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258947, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258947, "pid": 5, "tid": 7, "ts": 1716454225584073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584063, "dur": 10, "args": { "External id": 258947, "cbid": 211, "correlation": 258947 } }, { "ph": "s", "id": 258947, "pid": 76337, "tid": -914061504, "ts": 1716454225584063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584100, "dur": 3, "args": { "External id": 258955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258955, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258955, "pid": 5, "tid": 7, "ts": 1716454225584100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584091, "dur": 8, "args": { "External id": 258955, "cbid": 211, "correlation": 258955 } }, { "ph": "s", "id": 258955, "pid": 76337, "tid": -914061504, "ts": 1716454225584091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584124, "dur": 3, "args": { "External id": 258963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258963, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 258963, "pid": 5, "tid": 7, "ts": 1716454225584124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584115, "dur": 9, "args": { "External id": 258963, "cbid": 211, "correlation": 258963 } }, { "ph": "s", "id": 258963, "pid": 76337, "tid": -914061504, "ts": 1716454225584115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225584147, "dur": 4, "args": { "External id": 258971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258971, "pid": 5, "tid": 7, "ts": 1716454225584147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584139, "dur": 7, "args": { "External id": 258971, "cbid": 211, "correlation": 258971 } }, { "ph": "s", "id": 258971, "pid": 76337, "tid": -914061504, "ts": 1716454225584139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225584165, "dur": 4, "args": { "External id": 258979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258979, "pid": 5, "tid": 7, "ts": 1716454225584165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584157, "dur": 7, "args": { "External id": 258979, "cbid": 211, "correlation": 258979 } }, { "ph": "s", "id": 258979, "pid": 76337, "tid": -914061504, "ts": 1716454225584157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584185, "dur": 3, "args": { "External id": 258987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258987, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 258987, "pid": 5, "tid": 7, "ts": 1716454225584185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584177, "dur": 7, "args": { "External id": 258987, "cbid": 211, "correlation": 258987 } }, { "ph": "s", "id": 258987, "pid": 76337, "tid": -914061504, "ts": 1716454225584177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584243, "dur": 3, "args": { "External id": 258995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 258995, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 258995, "pid": 5, "tid": 7, "ts": 1716454225584243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584233, "dur": 10, "args": { "External id": 258995, "cbid": 211, "correlation": 258995 } }, { "ph": "s", "id": 258995, "pid": 76337, "tid": -914061504, "ts": 1716454225584233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225584269, "dur": 4, "args": { "External id": 259003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 259003, "pid": 5, "tid": 7, "ts": 1716454225584269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584260, "dur": 8, "args": { "External id": 259003, "cbid": 211, "correlation": 259003 } }, { "ph": "s", "id": 259003, "pid": 76337, "tid": -914061504, "ts": 1716454225584260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225584293, "dur": 4, "args": { "External id": 259011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 259011, "pid": 5, "tid": 7, "ts": 1716454225584293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584284, "dur": 8, "args": { "External id": 259011, "cbid": 211, "correlation": 259011 } }, { "ph": "s", "id": 259011, "pid": 76337, "tid": -914061504, "ts": 1716454225584284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584312, "dur": 3, "args": { "External id": 259019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259019, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 259019, "pid": 5, "tid": 7, "ts": 1716454225584312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584304, "dur": 7, "args": { "External id": 259019, "cbid": 211, "correlation": 259019 } }, { "ph": "s", "id": 259019, "pid": 76337, "tid": -914061504, "ts": 1716454225584304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225584721, "dur": 5, "args": { "External id": 259028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259028, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259028, "pid": 5, "tid": 7, "ts": 1716454225584721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584704, "dur": 18, "args": { "External id": 259028, "cbid": 211, "correlation": 259028 } }, { "ph": "s", "id": 259028, "pid": 76337, "tid": -914061504, "ts": 1716454225584704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225584758, "dur": 5, "args": { "External id": 259037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259037, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259037, "pid": 5, "tid": 7, "ts": 1716454225584758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584748, "dur": 9, "args": { "External id": 259037, "cbid": 211, "correlation": 259037 } }, { "ph": "s", "id": 259037, "pid": 76337, "tid": -914061504, "ts": 1716454225584748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225584887, "dur": 3, "args": { "External id": 259053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259053, "pid": 5, "tid": 7, "ts": 1716454225584887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584874, "dur": 14, "args": { "External id": 259053, "cbid": 211, "correlation": 259053 } }, { "ph": "s", "id": 259053, "pid": 76337, "tid": -914061504, "ts": 1716454225584874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584922, "dur": 3, "args": { "External id": 259061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259061, "pid": 5, "tid": 7, "ts": 1716454225584922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584911, "dur": 10, "args": { "External id": 259061, "cbid": 211, "correlation": 259061 } }, { "ph": "s", "id": 259061, "pid": 76337, "tid": -914061504, "ts": 1716454225584911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584953, "dur": 3, "args": { "External id": 259069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259069, "pid": 5, "tid": 7, "ts": 1716454225584953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584943, "dur": 8, "args": { "External id": 259069, "cbid": 211, "correlation": 259069 } }, { "ph": "s", "id": 259069, "pid": 76337, "tid": -914061504, "ts": 1716454225584943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225584993, "dur": 4, "args": { "External id": 259077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259077, "pid": 5, "tid": 7, "ts": 1716454225584993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225584983, "dur": 10, "args": { "External id": 259077, "cbid": 211, "correlation": 259077 } }, { "ph": "s", "id": 259077, "pid": 76337, "tid": -914061504, "ts": 1716454225584983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225585050, "dur": 4, "args": { "External id": 259089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259089, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259089, "pid": 5, "tid": 7, "ts": 1716454225585050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585037, "dur": 13, "args": { "External id": 259089, "cbid": 211, "correlation": 259089 } }, { "ph": "s", "id": 259089, "pid": 76337, "tid": -914061504, "ts": 1716454225585037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225585096, "dur": 4, "args": { "External id": 259100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259100, "pid": 5, "tid": 7, "ts": 1716454225585096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585084, "dur": 12, "args": { "External id": 259100, "cbid": 211, "correlation": 259100 } }, { "ph": "s", "id": 259100, "pid": 76337, "tid": -914061504, "ts": 1716454225585084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225585128, "dur": 3, "args": { "External id": 259108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259108, "pid": 5, "tid": 7, "ts": 1716454225585128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585118, "dur": 8, "args": { "External id": 259108, "cbid": 211, "correlation": 259108 } }, { "ph": "s", "id": 259108, "pid": 76337, "tid": -914061504, "ts": 1716454225585118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225585161, "dur": 5, "args": { "External id": 259116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259116, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259116, "pid": 5, "tid": 7, "ts": 1716454225585161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585150, "dur": 11, "args": { "External id": 259116, "cbid": 211, "correlation": 259116 } }, { "ph": "s", "id": 259116, "pid": 76337, "tid": -914061504, "ts": 1716454225585150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225585190, "dur": 5, "args": { "External id": 259124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259124, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259124, "pid": 5, "tid": 7, "ts": 1716454225585190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585181, "dur": 9, "args": { "External id": 259124, "cbid": 211, "correlation": 259124 } }, { "ph": "s", "id": 259124, "pid": 76337, "tid": -914061504, "ts": 1716454225585181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225585221, "dur": 4, "args": { "External id": 259133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259133, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259133, "pid": 5, "tid": 7, "ts": 1716454225585221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585211, "dur": 9, "args": { "External id": 259133, "cbid": 211, "correlation": 259133 } }, { "ph": "s", "id": 259133, "pid": 76337, "tid": -914061504, "ts": 1716454225585211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225585281, "dur": 4, "args": { "External id": 259146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259146, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259146, "pid": 5, "tid": 7, "ts": 1716454225585281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585269, "dur": 13, "args": { "External id": 259146, "cbid": 211, "correlation": 259146 } }, { "ph": "s", "id": 259146, "pid": 76337, "tid": -914061504, "ts": 1716454225585269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225585322, "dur": 5, "args": { "External id": 259156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259156, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259156, "pid": 5, "tid": 7, "ts": 1716454225585322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585311, "dur": 11, "args": { "External id": 259156, "cbid": 211, "correlation": 259156 } }, { "ph": "s", "id": 259156, "pid": 76337, "tid": -914061504, "ts": 1716454225585311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225585449, "dur": 4, "args": { "External id": 259173, "cbid": 251, "correlation": 259173 } }, { "ph": "f", "id": 259173, "pid": 76337, "tid": -914061504, "ts": 1716454225585449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454225585478, "dur": 11, "args": { "External id": 259175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259175, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259175, "pid": 5, "tid": 7, "ts": 1716454225585478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585463, "dur": 16, "args": { "External id": 259175, "cbid": 211, "correlation": 259175 } }, { "ph": "s", "id": 259175, "pid": 76337, "tid": -914061504, "ts": 1716454225585463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225585535, "dur": 4, "args": { "External id": 259183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259183, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259183, "pid": 5, "tid": 7, "ts": 1716454225585535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585524, "dur": 11, "args": { "External id": 259183, "cbid": 211, "correlation": 259183 } }, { "ph": "s", "id": 259183, "pid": 76337, "tid": -914061504, "ts": 1716454225585524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225585597, "dur": 2, "args": { "External id": 259199, "cbid": 251, "correlation": 259199 } }, { "ph": "f", "id": 259199, "pid": 76337, "tid": -914061504, "ts": 1716454225585597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225585603, "dur": 0, "args": { "External id": 259201, "cbid": 251, "correlation": 259201 } }, { "ph": "f", "id": 259201, "pid": 76337, "tid": -914061504, "ts": 1716454225585603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225585619, "dur": 13, "args": { "External id": 259202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259202, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259202, "pid": 5, "tid": 7, "ts": 1716454225585619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585605, "dur": 14, "args": { "External id": 259202, "cbid": 211, "correlation": 259202 } }, { "ph": "s", "id": 259202, "pid": 76337, "tid": -914061504, "ts": 1716454225585605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225585634, "dur": 6, "args": { "External id": 259204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259204, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259204, "pid": 5, "tid": 7, "ts": 1716454225585634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585623, "dur": 9, "args": { "External id": 259204, "cbid": 211, "correlation": 259204 } }, { "ph": "s", "id": 259204, "pid": 76337, "tid": -914061504, "ts": 1716454225585623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225585733, "dur": 1, "args": { "External id": 259214, "cbid": 317, "correlation": 259214 } }, { "ph": "f", "id": 259214, "pid": 76337, "tid": -914061504, "ts": 1716454225585733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225585735, "dur": 1, "args": { "External id": 259215, "cbid": 203, "correlation": 259215 } }, { "ph": "f", "id": 259215, "pid": 76337, "tid": -914061504, "ts": 1716454225585735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225585737, "dur": 1, "args": { "External id": 259216, "cbid": 205, "correlation": 259216 } }, { "ph": "f", "id": 259216, "pid": 76337, "tid": -914061504, "ts": 1716454225585737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225585792, "dur": 7, "args": { "External id": 259220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259220, "pid": 5, "tid": 7, "ts": 1716454225585792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585777, "dur": 14, "args": { "External id": 259220, "cbid": 211, "correlation": 259220 } }, { "ph": "s", "id": 259220, "pid": 76337, "tid": -914061504, "ts": 1716454225585777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225585802, "dur": 4, "args": { "External id": 259222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259222, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 259222, "pid": 5, "tid": 7, "ts": 1716454225585802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585795, "dur": 6, "args": { "External id": 259222, "cbid": 211, "correlation": 259222 } }, { "ph": "s", "id": 259222, "pid": 76337, "tid": -914061504, "ts": 1716454225585795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225585823, "dur": 3, "args": { "External id": 259224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259224, "pid": 5, "tid": 7, "ts": 1716454225585823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585813, "dur": 8, "args": { "External id": 259224, "cbid": 211, "correlation": 259224 } }, { "ph": "s", "id": 259224, "pid": 76337, "tid": -914061504, "ts": 1716454225585813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225585828, "dur": 0, "args": { "External id": 259225, "cbid": 51, "correlation": 259225 } }, { "ph": "s", "id": 259225, "pid": 76337, "tid": -914061504, "ts": 1716454225585828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225585838, "dur": 86, "args": { "External id": 259226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259226, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259226, "pid": 5, "tid": 7, "ts": 1716454225585838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585829, "dur": 7, "args": { "External id": 259226, "cbid": 211, "correlation": 259226 } }, { "ph": "s", "id": 259226, "pid": 76337, "tid": -914061504, "ts": 1716454225585829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225585925, "dur": 61, "args": { "External id": 259231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259231, "pid": 5, "tid": 7, "ts": 1716454225585925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225585865, "dur": 11, "args": { "External id": 259231, "cbid": 211, "correlation": 259231 } }, { "ph": "s", "id": 259231, "pid": 76337, "tid": -914061504, "ts": 1716454225585865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225587673, "dur": 52, "args": { "External id": 259251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259251, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 259251, "pid": 5, "tid": 7, "ts": 1716454225587673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587657, "dur": 17, "args": { "External id": 259251, "cbid": 211, "correlation": 259251 } }, { "ph": "s", "id": 259251, "pid": 76337, "tid": -914061504, "ts": 1716454225587657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225587727, "dur": 5, "args": { "External id": 259263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259263, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259263, "pid": 5, "tid": 7, "ts": 1716454225587727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587686, "dur": 9, "args": { "External id": 259263, "cbid": 211, "correlation": 259263 } }, { "ph": "s", "id": 259263, "pid": 76337, "tid": -914061504, "ts": 1716454225587686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225587733, "dur": 58, "args": { "External id": 259266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259266, "pid": 5, "tid": 7, "ts": 1716454225587733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587710, "dur": 7, "args": { "External id": 259266, "cbid": 211, "correlation": 259266 } }, { "ph": "s", "id": 259266, "pid": 76337, "tid": -914061504, "ts": 1716454225587710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225587792, "dur": 37, "args": { "External id": 259275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259275, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259275, "pid": 5, "tid": 7, "ts": 1716454225587792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587755, "dur": 10, "args": { "External id": 259275, "cbid": 211, "correlation": 259275 } }, { "ph": "s", "id": 259275, "pid": 76337, "tid": -914061504, "ts": 1716454225587755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225587813, "dur": 0, "args": { "External id": 259285, "cbid": 317, "correlation": 259285 } }, { "ph": "f", "id": 259285, "pid": 76337, "tid": -914061504, "ts": 1716454225587813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225587814, "dur": 0, "args": { "External id": 259286, "cbid": 203, "correlation": 259286 } }, { "ph": "f", "id": 259286, "pid": 76337, "tid": -914061504, "ts": 1716454225587814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225587815, "dur": 0, "args": { "External id": 259287, "cbid": 205, "correlation": 259287 } }, { "ph": "f", "id": 259287, "pid": 76337, "tid": -914061504, "ts": 1716454225587815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225587846, "dur": 41, "args": { "External id": 259291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259291, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259291, "pid": 5, "tid": 7, "ts": 1716454225587846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587834, "dur": 12, "args": { "External id": 259291, "cbid": 211, "correlation": 259291 } }, { "ph": "s", "id": 259291, "pid": 76337, "tid": -914061504, "ts": 1716454225587834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225587888, "dur": 15, "args": { "External id": 259293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259293, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259293, "pid": 5, "tid": 7, "ts": 1716454225587888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587848, "dur": 6, "args": { "External id": 259293, "cbid": 211, "correlation": 259293 } }, { "ph": "s", "id": 259293, "pid": 76337, "tid": -914061504, "ts": 1716454225587848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225587904, "dur": 3, "args": { "External id": 259295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259295, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259295, "pid": 5, "tid": 7, "ts": 1716454225587904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587860, "dur": 6, "args": { "External id": 259295, "cbid": 211, "correlation": 259295 } }, { "ph": "s", "id": 259295, "pid": 76337, "tid": -914061504, "ts": 1716454225587860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225587871, "dur": 0, "args": { "External id": 259296, "cbid": 51, "correlation": 259296 } }, { "ph": "s", "id": 259296, "pid": 76337, "tid": -914061504, "ts": 1716454225587871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225587908, "dur": 712, "args": { "External id": 259297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259297, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259297, "pid": 5, "tid": 7, "ts": 1716454225587908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587872, "dur": 7, "args": { "External id": 259297, "cbid": 211, "correlation": 259297 } }, { "ph": "s", "id": 259297, "pid": 76337, "tid": -914061504, "ts": 1716454225587872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225588622, "dur": 60, "args": { "External id": 259302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259302, "pid": 5, "tid": 7, "ts": 1716454225588622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587902, "dur": 9, "args": { "External id": 259302, "cbid": 211, "correlation": 259302 } }, { "ph": "s", "id": 259302, "pid": 76337, "tid": -914061504, "ts": 1716454225587902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225588683, "dur": 3, "args": { "External id": 259310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259310, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259310, "pid": 5, "tid": 7, "ts": 1716454225588683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225587946, "dur": 9, "args": { "External id": 259310, "cbid": 211, "correlation": 259310 } }, { "ph": "s", "id": 259310, "pid": 76337, "tid": -914061504, "ts": 1716454225587946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588020, "dur": 2, "args": { "External id": 259326, "cbid": 251, "correlation": 259326 } }, { "ph": "f", "id": 259326, "pid": 76337, "tid": -914061504, "ts": 1716454225588020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588026, "dur": 0, "args": { "External id": 259328, "cbid": 251, "correlation": 259328 } }, { "ph": "f", "id": 259328, "pid": 76337, "tid": -914061504, "ts": 1716454225588026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225588688, "dur": 9, "args": { "External id": 259329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259329, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 259329, "pid": 5, "tid": 7, "ts": 1716454225588688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588028, "dur": 12, "args": { "External id": 259329, "cbid": 211, "correlation": 259329 } }, { "ph": "s", "id": 259329, "pid": 76337, "tid": -914061504, "ts": 1716454225588028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225588698, "dur": 4, "args": { "External id": 259331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259331, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 259331, "pid": 5, "tid": 7, "ts": 1716454225588698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588042, "dur": 6, "args": { "External id": 259331, "cbid": 211, "correlation": 259331 } }, { "ph": "s", "id": 259331, "pid": 76337, "tid": -914061504, "ts": 1716454225588042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225588704, "dur": 54, "args": { "External id": 259341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259341, "pid": 5, "tid": 7, "ts": 1716454225588704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588102, "dur": 12, "args": { "External id": 259341, "cbid": 211, "correlation": 259341 } }, { "ph": "s", "id": 259341, "pid": 76337, "tid": -914061504, "ts": 1716454225588102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225588759, "dur": 54, "args": { "External id": 259361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259361, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 259361, "pid": 5, "tid": 7, "ts": 1716454225588759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588169, "dur": 11, "args": { "External id": 259361, "cbid": 211, "correlation": 259361 } }, { "ph": "s", "id": 259361, "pid": 76337, "tid": -914061504, "ts": 1716454225588169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225588815, "dur": 4, "args": { "External id": 259373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259373, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259373, "pid": 5, "tid": 7, "ts": 1716454225588815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588190, "dur": 6, "args": { "External id": 259373, "cbid": 211, "correlation": 259373 } }, { "ph": "s", "id": 259373, "pid": 76337, "tid": -914061504, "ts": 1716454225588190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225588820, "dur": 57, "args": { "External id": 259376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259376, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259376, "pid": 5, "tid": 7, "ts": 1716454225588820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588209, "dur": 6, "args": { "External id": 259376, "cbid": 211, "correlation": 259376 } }, { "ph": "s", "id": 259376, "pid": 76337, "tid": -914061504, "ts": 1716454225588209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225588878, "dur": 36, "args": { "External id": 259385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259385, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259385, "pid": 5, "tid": 7, "ts": 1716454225588878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588250, "dur": 9, "args": { "External id": 259385, "cbid": 211, "correlation": 259385 } }, { "ph": "s", "id": 259385, "pid": 76337, "tid": -914061504, "ts": 1716454225588250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225588319, "dur": 0, "args": { "External id": 259395, "cbid": 317, "correlation": 259395 } }, { "ph": "f", "id": 259395, "pid": 76337, "tid": -914061504, "ts": 1716454225588319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225588319, "dur": 0, "args": { "External id": 259396, "cbid": 203, "correlation": 259396 } }, { "ph": "f", "id": 259396, "pid": 76337, "tid": -914061504, "ts": 1716454225588319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225588320, "dur": 0, "args": { "External id": 259397, "cbid": 205, "correlation": 259397 } }, { "ph": "f", "id": 259397, "pid": 76337, "tid": -914061504, "ts": 1716454225588320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225588916, "dur": 40, "args": { "External id": 259401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259401, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259401, "pid": 5, "tid": 7, "ts": 1716454225588916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588335, "dur": 12, "args": { "External id": 259401, "cbid": 211, "correlation": 259401 } }, { "ph": "s", "id": 259401, "pid": 76337, "tid": -914061504, "ts": 1716454225588335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225588957, "dur": 15, "args": { "External id": 259403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259403, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259403, "pid": 5, "tid": 7, "ts": 1716454225588957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588350, "dur": 5, "args": { "External id": 259403, "cbid": 211, "correlation": 259403 } }, { "ph": "s", "id": 259403, "pid": 76337, "tid": -914061504, "ts": 1716454225588350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225588973, "dur": 4, "args": { "External id": 259405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259405, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259405, "pid": 5, "tid": 7, "ts": 1716454225588973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588359, "dur": 6, "args": { "External id": 259405, "cbid": 211, "correlation": 259405 } }, { "ph": "s", "id": 259405, "pid": 76337, "tid": -914061504, "ts": 1716454225588359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225588368, "dur": 0, "args": { "External id": 259406, "cbid": 51, "correlation": 259406 } }, { "ph": "s", "id": 259406, "pid": 76337, "tid": -914061504, "ts": 1716454225588368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225588978, "dur": 709, "args": { "External id": 259407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259407, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259407, "pid": 5, "tid": 7, "ts": 1716454225588978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588369, "dur": 6, "args": { "External id": 259407, "cbid": 211, "correlation": 259407 } }, { "ph": "s", "id": 259407, "pid": 76337, "tid": -914061504, "ts": 1716454225588369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225589689, "dur": 60, "args": { "External id": 259412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259412, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259412, "pid": 5, "tid": 7, "ts": 1716454225589689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588397, "dur": 8, "args": { "External id": 259412, "cbid": 211, "correlation": 259412 } }, { "ph": "s", "id": 259412, "pid": 76337, "tid": -914061504, "ts": 1716454225588397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225589751, "dur": 50, "args": { "External id": 259420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259420, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259420, "pid": 5, "tid": 7, "ts": 1716454225589751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588430, "dur": 8, "args": { "External id": 259420, "cbid": 211, "correlation": 259420 } }, { "ph": "s", "id": 259420, "pid": 76337, "tid": -914061504, "ts": 1716454225588430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225589802, "dur": 36, "args": { "External id": 259428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259428, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259428, "pid": 5, "tid": 7, "ts": 1716454225589802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588460, "dur": 9, "args": { "External id": 259428, "cbid": 211, "correlation": 259428 } }, { "ph": "s", "id": 259428, "pid": 76337, "tid": -914061504, "ts": 1716454225588460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225589839, "dur": 51, "args": { "External id": 259448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259448, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 259448, "pid": 5, "tid": 7, "ts": 1716454225589839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588542, "dur": 12, "args": { "External id": 259448, "cbid": 211, "correlation": 259448 } }, { "ph": "s", "id": 259448, "pid": 76337, "tid": -914061504, "ts": 1716454225588542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225589891, "dur": 4, "args": { "External id": 259460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259460, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259460, "pid": 5, "tid": 7, "ts": 1716454225589891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588563, "dur": 6, "args": { "External id": 259460, "cbid": 211, "correlation": 259460 } }, { "ph": "s", "id": 259460, "pid": 76337, "tid": -914061504, "ts": 1716454225588563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225589896, "dur": 56, "args": { "External id": 259463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259463, "pid": 5, "tid": 7, "ts": 1716454225589896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588581, "dur": 6, "args": { "External id": 259463, "cbid": 211, "correlation": 259463 } }, { "ph": "s", "id": 259463, "pid": 76337, "tid": -914061504, "ts": 1716454225588581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225588639, "dur": 0, "args": { "External id": 259474, "cbid": 317, "correlation": 259474 } }, { "ph": "f", "id": 259474, "pid": 76337, "tid": -914061504, "ts": 1716454225588639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225588639, "dur": 0, "args": { "External id": 259475, "cbid": 203, "correlation": 259475 } }, { "ph": "f", "id": 259475, "pid": 76337, "tid": -914061504, "ts": 1716454225588639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225588640, "dur": 0, "args": { "External id": 259476, "cbid": 205, "correlation": 259476 } }, { "ph": "f", "id": 259476, "pid": 76337, "tid": -914061504, "ts": 1716454225588640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588676, "dur": 2, "args": { "External id": 259480, "cbid": 251, "correlation": 259480 } }, { "ph": "f", "id": 259480, "pid": 76337, "tid": -914061504, "ts": 1716454225588676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588679, "dur": 1, "args": { "External id": 259481, "cbid": 251, "correlation": 259481 } }, { "ph": "f", "id": 259481, "pid": 76337, "tid": -914061504, "ts": 1716454225588679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588681, "dur": 1, "args": { "External id": 259482, "cbid": 251, "correlation": 259482 } }, { "ph": "f", "id": 259482, "pid": 76337, "tid": -914061504, "ts": 1716454225588681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588683, "dur": 1, "args": { "External id": 259483, "cbid": 251, "correlation": 259483 } }, { "ph": "f", "id": 259483, "pid": 76337, "tid": -914061504, "ts": 1716454225588683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588685, "dur": 1, "args": { "External id": 259484, "cbid": 251, "correlation": 259484 } }, { "ph": "f", "id": 259484, "pid": 76337, "tid": -914061504, "ts": 1716454225588685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588687, "dur": 1, "args": { "External id": 259485, "cbid": 251, "correlation": 259485 } }, { "ph": "f", "id": 259485, "pid": 76337, "tid": -914061504, "ts": 1716454225588687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588689, "dur": 1, "args": { "External id": 259486, "cbid": 251, "correlation": 259486 } }, { "ph": "f", "id": 259486, "pid": 76337, "tid": -914061504, "ts": 1716454225588689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588691, "dur": 1, "args": { "External id": 259487, "cbid": 251, "correlation": 259487 } }, { "ph": "f", "id": 259487, "pid": 76337, "tid": -914061504, "ts": 1716454225588691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225588693, "dur": 0, "args": { "External id": 259488, "cbid": 251, "correlation": 259488 } }, { "ph": "f", "id": 259488, "pid": 76337, "tid": -914061504, "ts": 1716454225588693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225589954, "dur": 115, "args": { "External id": 259489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259489, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 259489, "pid": 5, "tid": 7, "ts": 1716454225589954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588698, "dur": 14, "args": { "External id": 259489, "cbid": 211, "correlation": 259489 } }, { "ph": "s", "id": 259489, "pid": 76337, "tid": -914061504, "ts": 1716454225588698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225590070, "dur": 59, "args": { "External id": 259495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259495, "pid": 5, "tid": 7, "ts": 1716454225590070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588736, "dur": 9, "args": { "External id": 259495, "cbid": 211, "correlation": 259495 } }, { "ph": "s", "id": 259495, "pid": 76337, "tid": -914061504, "ts": 1716454225588736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225590131, "dur": 578, "args": { "External id": 259504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259504, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259504, "pid": 5, "tid": 7, "ts": 1716454225590131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588829, "dur": 16, "args": { "External id": 259504, "cbid": 211, "correlation": 259504 } }, { "ph": "s", "id": 259504, "pid": 76337, "tid": -914061504, "ts": 1716454225588829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225590710, "dur": 184, "args": { "External id": 259526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259526, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259526, "pid": 5, "tid": 7, "ts": 1716454225590710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225588904, "dur": 14, "args": { "External id": 259526, "cbid": 211, "correlation": 259526 } }, { "ph": "s", "id": 259526, "pid": 76337, "tid": -914061504, "ts": 1716454225588904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589027, "dur": 2, "args": { "External id": 259537, "cbid": 251, "correlation": 259537 } }, { "ph": "f", "id": 259537, "pid": 76337, "tid": -914061504, "ts": 1716454225589027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225590896, "dur": 199, "args": { "External id": 259538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259538, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259538, "pid": 5, "tid": 7, "ts": 1716454225590896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589034, "dur": 15, "args": { "External id": 259538, "cbid": 211, "correlation": 259538 } }, { "ph": "s", "id": 259538, "pid": 76337, "tid": -914061504, "ts": 1716454225589034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589110, "dur": 1, "args": { "External id": 259549, "cbid": 251, "correlation": 259549 } }, { "ph": "f", "id": 259549, "pid": 76337, "tid": -914061504, "ts": 1716454225589110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225591096, "dur": 192, "args": { "External id": 259550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259550, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259550, "pid": 5, "tid": 7, "ts": 1716454225591096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589114, "dur": 12, "args": { "External id": 259550, "cbid": 211, "correlation": 259550 } }, { "ph": "s", "id": 259550, "pid": 76337, "tid": -914061504, "ts": 1716454225589114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589179, "dur": 1, "args": { "External id": 259561, "cbid": 251, "correlation": 259561 } }, { "ph": "f", "id": 259561, "pid": 76337, "tid": -914061504, "ts": 1716454225589179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225591290, "dur": 191, "args": { "External id": 259562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259562, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259562, "pid": 5, "tid": 7, "ts": 1716454225591290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589183, "dur": 11, "args": { "External id": 259562, "cbid": 211, "correlation": 259562 } }, { "ph": "s", "id": 259562, "pid": 76337, "tid": -914061504, "ts": 1716454225589183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225591483, "dur": 19052, "args": { "External id": 259583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259583, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 259583, "pid": 5, "tid": 7, "ts": 1716454225591483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589290, "dur": 16, "args": { "External id": 259583, "cbid": 211, "correlation": 259583 } }, { "ph": "s", "id": 259583, "pid": 76337, "tid": -914061504, "ts": 1716454225589290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589406, "dur": 2, "args": { "External id": 259601, "cbid": 251, "correlation": 259601 } }, { "ph": "f", "id": 259601, "pid": 76337, "tid": -914061504, "ts": 1716454225589406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225610536, "dur": 204, "args": { "External id": 259603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259603, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259603, "pid": 5, "tid": 7, "ts": 1716454225610536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589413, "dur": 14, "args": { "External id": 259603, "cbid": 211, "correlation": 259603 } }, { "ph": "s", "id": 259603, "pid": 76337, "tid": -914061504, "ts": 1716454225589413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225610742, "dur": 66, "args": { "External id": 259611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259611, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259611, "pid": 5, "tid": 7, "ts": 1716454225610742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589487, "dur": 13, "args": { "External id": 259611, "cbid": 211, "correlation": 259611 } }, { "ph": "s", "id": 259611, "pid": 76337, "tid": -914061504, "ts": 1716454225589487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225610809, "dur": 97, "args": { "External id": 259619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259619, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259619, "pid": 5, "tid": 7, "ts": 1716454225610809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589527, "dur": 8, "args": { "External id": 259619, "cbid": 211, "correlation": 259619 } }, { "ph": "s", "id": 259619, "pid": 76337, "tid": -914061504, "ts": 1716454225589527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225610908, "dur": 55, "args": { "External id": 259630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259630, "pid": 5, "tid": 7, "ts": 1716454225610908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589613, "dur": 13, "args": { "External id": 259630, "cbid": 211, "correlation": 259630 } }, { "ph": "s", "id": 259630, "pid": 76337, "tid": -914061504, "ts": 1716454225589613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225610964, "dur": 94, "args": { "External id": 259652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259652, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259652, "pid": 5, "tid": 7, "ts": 1716454225610964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589646, "dur": 8, "args": { "External id": 259652, "cbid": 211, "correlation": 259652 } }, { "ph": "s", "id": 259652, "pid": 76337, "tid": -914061504, "ts": 1716454225589646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589729, "dur": 1, "args": { "External id": 259663, "cbid": 251, "correlation": 259663 } }, { "ph": "f", "id": 259663, "pid": 76337, "tid": -914061504, "ts": 1716454225589729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225611059, "dur": 107, "args": { "External id": 259664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259664, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259664, "pid": 5, "tid": 7, "ts": 1716454225611059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589734, "dur": 13, "args": { "External id": 259664, "cbid": 211, "correlation": 259664 } }, { "ph": "s", "id": 259664, "pid": 76337, "tid": -914061504, "ts": 1716454225589734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589817, "dur": 2, "args": { "External id": 259675, "cbid": 251, "correlation": 259675 } }, { "ph": "f", "id": 259675, "pid": 76337, "tid": -914061504, "ts": 1716454225589817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589822, "dur": 0, "args": { "External id": 259676, "cbid": 251, "correlation": 259676 } }, { "ph": "f", "id": 259676, "pid": 76337, "tid": -914061504, "ts": 1716454225589822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225611167, "dur": 10, "args": { "External id": 259677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259677, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 259677, "pid": 5, "tid": 7, "ts": 1716454225611167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589824, "dur": 14, "args": { "External id": 259677, "cbid": 211, "correlation": 259677 } }, { "ph": "s", "id": 259677, "pid": 76337, "tid": -914061504, "ts": 1716454225589824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225611179, "dur": 5, "args": { "External id": 259679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259679, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 259679, "pid": 5, "tid": 7, "ts": 1716454225611179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589842, "dur": 8, "args": { "External id": 259679, "cbid": 211, "correlation": 259679 } }, { "ph": "s", "id": 259679, "pid": 76337, "tid": -914061504, "ts": 1716454225589842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589905, "dur": 1, "args": { "External id": 259690, "cbid": 251, "correlation": 259690 } }, { "ph": "f", "id": 259690, "pid": 76337, "tid": -914061504, "ts": 1716454225589905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225589909, "dur": 0, "args": { "External id": 259691, "cbid": 251, "correlation": 259691 } }, { "ph": "f", "id": 259691, "pid": 76337, "tid": -914061504, "ts": 1716454225589909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225611185, "dur": 6, "args": { "External id": 259692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259692, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 259692, "pid": 5, "tid": 7, "ts": 1716454225611185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589911, "dur": 12, "args": { "External id": 259692, "cbid": 211, "correlation": 259692 } }, { "ph": "s", "id": 259692, "pid": 76337, "tid": -914061504, "ts": 1716454225589911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225611192, "dur": 4, "args": { "External id": 259694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259694, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 259694, "pid": 5, "tid": 7, "ts": 1716454225611192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225589924, "dur": 5, "args": { "External id": 259694, "cbid": 211, "correlation": 259694 } }, { "ph": "s", "id": 259694, "pid": 76337, "tid": -914061504, "ts": 1716454225589924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225611197, "dur": 157, "args": { "External id": 259715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259715, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 259715, "pid": 5, "tid": 7, "ts": 1716454225611197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590007, "dur": 12, "args": { "External id": 259715, "cbid": 211, "correlation": 259715 } }, { "ph": "s", "id": 259715, "pid": 76337, "tid": -914061504, "ts": 1716454225590007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590107, "dur": 2, "args": { "External id": 259733, "cbid": 251, "correlation": 259733 } }, { "ph": "f", "id": 259733, "pid": 76337, "tid": -914061504, "ts": 1716454225590107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225611356, "dur": 109, "args": { "External id": 259735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259735, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 259735, "pid": 5, "tid": 7, "ts": 1716454225611356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590114, "dur": 14, "args": { "External id": 259735, "cbid": 211, "correlation": 259735 } }, { "ph": "s", "id": 259735, "pid": 76337, "tid": -914061504, "ts": 1716454225590114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225611466, "dur": 35, "args": { "External id": 259743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259743, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259743, "pid": 5, "tid": 7, "ts": 1716454225611466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590185, "dur": 12, "args": { "External id": 259743, "cbid": 211, "correlation": 259743 } }, { "ph": "s", "id": 259743, "pid": 76337, "tid": -914061504, "ts": 1716454225590185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225611502, "dur": 69, "args": { "External id": 259751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259751, "pid": 5, "tid": 7, "ts": 1716454225611502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590227, "dur": 9, "args": { "External id": 259751, "cbid": 211, "correlation": 259751 } }, { "ph": "s", "id": 259751, "pid": 76337, "tid": -914061504, "ts": 1716454225590227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225611572, "dur": 93, "args": { "External id": 259773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259773, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259773, "pid": 5, "tid": 7, "ts": 1716454225611572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590279, "dur": 10, "args": { "External id": 259773, "cbid": 211, "correlation": 259773 } }, { "ph": "s", "id": 259773, "pid": 76337, "tid": -914061504, "ts": 1716454225590279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590368, "dur": 1, "args": { "External id": 259789, "cbid": 251, "correlation": 259789 } }, { "ph": "f", "id": 259789, "pid": 76337, "tid": -914061504, "ts": 1716454225590368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225611667, "dur": 583, "args": { "External id": 259791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259791, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259791, "pid": 5, "tid": 7, "ts": 1716454225611667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590374, "dur": 12, "args": { "External id": 259791, "cbid": 211, "correlation": 259791 } }, { "ph": "s", "id": 259791, "pid": 76337, "tid": -914061504, "ts": 1716454225590374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225612251, "dur": 248, "args": { "External id": 259799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259799, "pid": 5, "tid": 7, "ts": 1716454225612251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590452, "dur": 15, "args": { "External id": 259799, "cbid": 211, "correlation": 259799 } }, { "ph": "s", "id": 259799, "pid": 76337, "tid": -914061504, "ts": 1716454225590452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225612500, "dur": 252, "args": { "External id": 259807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259807, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259807, "pid": 5, "tid": 7, "ts": 1716454225612500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590488, "dur": 10, "args": { "External id": 259807, "cbid": 211, "correlation": 259807 } }, { "ph": "s", "id": 259807, "pid": 76337, "tid": -914061504, "ts": 1716454225590488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590572, "dur": 2, "args": { "External id": 259823, "cbid": 251, "correlation": 259823 } }, { "ph": "f", "id": 259823, "pid": 76337, "tid": -914061504, "ts": 1716454225590572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590577, "dur": 0, "args": { "External id": 259825, "cbid": 251, "correlation": 259825 } }, { "ph": "f", "id": 259825, "pid": 76337, "tid": -914061504, "ts": 1716454225590577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225612754, "dur": 361, "args": { "External id": 259826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259826, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 259826, "pid": 5, "tid": 7, "ts": 1716454225612754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590582, "dur": 14, "args": { "External id": 259826, "cbid": 211, "correlation": 259826 } }, { "ph": "s", "id": 259826, "pid": 76337, "tid": -914061504, "ts": 1716454225590582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225613116, "dur": 50, "args": { "External id": 259834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259834, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259834, "pid": 5, "tid": 7, "ts": 1716454225613116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590624, "dur": 10, "args": { "External id": 259834, "cbid": 211, "correlation": 259834 } }, { "ph": "s", "id": 259834, "pid": 76337, "tid": -914061504, "ts": 1716454225590624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225613167, "dur": 160, "args": { "External id": 259845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259845, "pid": 5, "tid": 7, "ts": 1716454225613167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590696, "dur": 13, "args": { "External id": 259845, "cbid": 211, "correlation": 259845 } }, { "ph": "s", "id": 259845, "pid": 76337, "tid": -914061504, "ts": 1716454225590696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225590762, "dur": 0, "args": { "External id": 259857, "cbid": 317, "correlation": 259857 } }, { "ph": "f", "id": 259857, "pid": 76337, "tid": -914061504, "ts": 1716454225590762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225590763, "dur": 0, "args": { "External id": 259858, "cbid": 203, "correlation": 259858 } }, { "ph": "f", "id": 259858, "pid": 76337, "tid": -914061504, "ts": 1716454225590763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225590763, "dur": 0, "args": { "External id": 259859, "cbid": 205, "correlation": 259859 } }, { "ph": "f", "id": 259859, "pid": 76337, "tid": -914061504, "ts": 1716454225590763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590787, "dur": 1, "args": { "External id": 259863, "cbid": 251, "correlation": 259863 } }, { "ph": "f", "id": 259863, "pid": 76337, "tid": -914061504, "ts": 1716454225590787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590789, "dur": 0, "args": { "External id": 259864, "cbid": 251, "correlation": 259864 } }, { "ph": "f", "id": 259864, "pid": 76337, "tid": -914061504, "ts": 1716454225590789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590790, "dur": 0, "args": { "External id": 259865, "cbid": 251, "correlation": 259865 } }, { "ph": "f", "id": 259865, "pid": 76337, "tid": -914061504, "ts": 1716454225590790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590791, "dur": 0, "args": { "External id": 259866, "cbid": 251, "correlation": 259866 } }, { "ph": "f", "id": 259866, "pid": 76337, "tid": -914061504, "ts": 1716454225590791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590792, "dur": 0, "args": { "External id": 259867, "cbid": 251, "correlation": 259867 } }, { "ph": "f", "id": 259867, "pid": 76337, "tid": -914061504, "ts": 1716454225590792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590793, "dur": 0, "args": { "External id": 259868, "cbid": 251, "correlation": 259868 } }, { "ph": "f", "id": 259868, "pid": 76337, "tid": -914061504, "ts": 1716454225590793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590794, "dur": 0, "args": { "External id": 259869, "cbid": 251, "correlation": 259869 } }, { "ph": "f", "id": 259869, "pid": 76337, "tid": -914061504, "ts": 1716454225590794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590795, "dur": 0, "args": { "External id": 259870, "cbid": 251, "correlation": 259870 } }, { "ph": "f", "id": 259870, "pid": 76337, "tid": -914061504, "ts": 1716454225590795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225590796, "dur": 0, "args": { "External id": 259871, "cbid": 251, "correlation": 259871 } }, { "ph": "f", "id": 259871, "pid": 76337, "tid": -914061504, "ts": 1716454225590796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225613329, "dur": 117, "args": { "External id": 259872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259872, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 259872, "pid": 5, "tid": 7, "ts": 1716454225613329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590799, "dur": 12, "args": { "External id": 259872, "cbid": 211, "correlation": 259872 } }, { "ph": "s", "id": 259872, "pid": 76337, "tid": -914061504, "ts": 1716454225590799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225613448, "dur": 61, "args": { "External id": 259878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259878, "pid": 5, "tid": 7, "ts": 1716454225613448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590834, "dur": 9, "args": { "External id": 259878, "cbid": 211, "correlation": 259878 } }, { "ph": "s", "id": 259878, "pid": 76337, "tid": -914061504, "ts": 1716454225590834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225613510, "dur": 50, "args": { "External id": 259886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259886, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259886, "pid": 5, "tid": 7, "ts": 1716454225613510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590867, "dur": 8, "args": { "External id": 259886, "cbid": 211, "correlation": 259886 } }, { "ph": "s", "id": 259886, "pid": 76337, "tid": -914061504, "ts": 1716454225590867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225613561, "dur": 53, "args": { "External id": 259906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259906, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 259906, "pid": 5, "tid": 7, "ts": 1716454225613561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590939, "dur": 12, "args": { "External id": 259906, "cbid": 211, "correlation": 259906 } }, { "ph": "s", "id": 259906, "pid": 76337, "tid": -914061504, "ts": 1716454225590939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225613615, "dur": 5, "args": { "External id": 259918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259918, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259918, "pid": 5, "tid": 7, "ts": 1716454225613615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590962, "dur": 6, "args": { "External id": 259918, "cbid": 211, "correlation": 259918 } }, { "ph": "s", "id": 259918, "pid": 76337, "tid": -914061504, "ts": 1716454225590962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225613621, "dur": 57, "args": { "External id": 259921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259921, "pid": 5, "tid": 7, "ts": 1716454225613621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225590988, "dur": 7, "args": { "External id": 259921, "cbid": 211, "correlation": 259921 } }, { "ph": "s", "id": 259921, "pid": 76337, "tid": -914061504, "ts": 1716454225590988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225613679, "dur": 37, "args": { "External id": 259930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259930, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259930, "pid": 5, "tid": 7, "ts": 1716454225613679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591030, "dur": 10, "args": { "External id": 259930, "cbid": 211, "correlation": 259930 } }, { "ph": "s", "id": 259930, "pid": 76337, "tid": -914061504, "ts": 1716454225591030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225591083, "dur": 0, "args": { "External id": 259940, "cbid": 317, "correlation": 259940 } }, { "ph": "f", "id": 259940, "pid": 76337, "tid": -914061504, "ts": 1716454225591083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225591084, "dur": 0, "args": { "External id": 259941, "cbid": 203, "correlation": 259941 } }, { "ph": "f", "id": 259941, "pid": 76337, "tid": -914061504, "ts": 1716454225591084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225591085, "dur": 0, "args": { "External id": 259942, "cbid": 205, "correlation": 259942 } }, { "ph": "f", "id": 259942, "pid": 76337, "tid": -914061504, "ts": 1716454225591085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225613717, "dur": 41, "args": { "External id": 259946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259946, "pid": 5, "tid": 7, "ts": 1716454225613717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591099, "dur": 12, "args": { "External id": 259946, "cbid": 211, "correlation": 259946 } }, { "ph": "s", "id": 259946, "pid": 76337, "tid": -914061504, "ts": 1716454225591099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225613759, "dur": 15, "args": { "External id": 259948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259948, "pid": 5, "tid": 7, "ts": 1716454225613759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591114, "dur": 5, "args": { "External id": 259948, "cbid": 211, "correlation": 259948 } }, { "ph": "s", "id": 259948, "pid": 76337, "tid": -914061504, "ts": 1716454225591114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225613775, "dur": 4, "args": { "External id": 259950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259950, "pid": 5, "tid": 7, "ts": 1716454225613775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591124, "dur": 5, "args": { "External id": 259950, "cbid": 211, "correlation": 259950 } }, { "ph": "s", "id": 259950, "pid": 76337, "tid": -914061504, "ts": 1716454225591124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225591133, "dur": 0, "args": { "External id": 259951, "cbid": 51, "correlation": 259951 } }, { "ph": "s", "id": 259951, "pid": 76337, "tid": -914061504, "ts": 1716454225591133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225613780, "dur": 713, "args": { "External id": 259952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259952, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 259952, "pid": 5, "tid": 7, "ts": 1716454225613780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591134, "dur": 5, "args": { "External id": 259952, "cbid": 211, "correlation": 259952 } }, { "ph": "s", "id": 259952, "pid": 76337, "tid": -914061504, "ts": 1716454225591134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225614495, "dur": 60, "args": { "External id": 259957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259957, "pid": 5, "tid": 7, "ts": 1716454225614495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591161, "dur": 9, "args": { "External id": 259957, "cbid": 211, "correlation": 259957 } }, { "ph": "s", "id": 259957, "pid": 76337, "tid": -914061504, "ts": 1716454225591161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225614556, "dur": 3, "args": { "External id": 259965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259965, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 259965, "pid": 5, "tid": 7, "ts": 1716454225614556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591205, "dur": 9, "args": { "External id": 259965, "cbid": 211, "correlation": 259965 } }, { "ph": "s", "id": 259965, "pid": 76337, "tid": -914061504, "ts": 1716454225591205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591271, "dur": 1, "args": { "External id": 259981, "cbid": 251, "correlation": 259981 } }, { "ph": "f", "id": 259981, "pid": 76337, "tid": -914061504, "ts": 1716454225591271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591276, "dur": 0, "args": { "External id": 259983, "cbid": 251, "correlation": 259983 } }, { "ph": "f", "id": 259983, "pid": 76337, "tid": -914061504, "ts": 1716454225591276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225614560, "dur": 12, "args": { "External id": 259984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259984, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 259984, "pid": 5, "tid": 7, "ts": 1716454225614560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591278, "dur": 11, "args": { "External id": 259984, "cbid": 211, "correlation": 259984 } }, { "ph": "s", "id": 259984, "pid": 76337, "tid": -914061504, "ts": 1716454225591278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225614574, "dur": 5, "args": { "External id": 259986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259986, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 259986, "pid": 5, "tid": 7, "ts": 1716454225614574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591291, "dur": 6, "args": { "External id": 259986, "cbid": 211, "correlation": 259986 } }, { "ph": "s", "id": 259986, "pid": 76337, "tid": -914061504, "ts": 1716454225591291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225614580, "dur": 54, "args": { "External id": 259996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 259996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 259996, "pid": 5, "tid": 7, "ts": 1716454225614580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591349, "dur": 13, "args": { "External id": 259996, "cbid": 211, "correlation": 259996 } }, { "ph": "s", "id": 259996, "pid": 76337, "tid": -914061504, "ts": 1716454225591349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225614636, "dur": 51, "args": { "External id": 260016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260016, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 260016, "pid": 5, "tid": 7, "ts": 1716454225614636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591415, "dur": 11, "args": { "External id": 260016, "cbid": 211, "correlation": 260016 } }, { "ph": "s", "id": 260016, "pid": 76337, "tid": -914061504, "ts": 1716454225591415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225614688, "dur": 4, "args": { "External id": 260028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260028, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 260028, "pid": 5, "tid": 7, "ts": 1716454225614688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591435, "dur": 6, "args": { "External id": 260028, "cbid": 211, "correlation": 260028 } }, { "ph": "s", "id": 260028, "pid": 76337, "tid": -914061504, "ts": 1716454225591435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225614694, "dur": 55, "args": { "External id": 260031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260031, "pid": 5, "tid": 7, "ts": 1716454225614694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591455, "dur": 6, "args": { "External id": 260031, "cbid": 211, "correlation": 260031 } }, { "ph": "s", "id": 260031, "pid": 76337, "tid": -914061504, "ts": 1716454225591455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225614750, "dur": 36, "args": { "External id": 260040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260040, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260040, "pid": 5, "tid": 7, "ts": 1716454225614750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591495, "dur": 11, "args": { "External id": 260040, "cbid": 211, "correlation": 260040 } }, { "ph": "s", "id": 260040, "pid": 76337, "tid": -914061504, "ts": 1716454225591495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225591558, "dur": 0, "args": { "External id": 260050, "cbid": 317, "correlation": 260050 } }, { "ph": "f", "id": 260050, "pid": 76337, "tid": -914061504, "ts": 1716454225591558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225591559, "dur": 0, "args": { "External id": 260051, "cbid": 203, "correlation": 260051 } }, { "ph": "f", "id": 260051, "pid": 76337, "tid": -914061504, "ts": 1716454225591559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225591560, "dur": 0, "args": { "External id": 260052, "cbid": 205, "correlation": 260052 } }, { "ph": "f", "id": 260052, "pid": 76337, "tid": -914061504, "ts": 1716454225591560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225614788, "dur": 40, "args": { "External id": 260056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260056, "pid": 5, "tid": 7, "ts": 1716454225614788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591573, "dur": 12, "args": { "External id": 260056, "cbid": 211, "correlation": 260056 } }, { "ph": "s", "id": 260056, "pid": 76337, "tid": -914061504, "ts": 1716454225591573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225614830, "dur": 15, "args": { "External id": 260058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260058, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260058, "pid": 5, "tid": 7, "ts": 1716454225614830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591587, "dur": 5, "args": { "External id": 260058, "cbid": 211, "correlation": 260058 } }, { "ph": "s", "id": 260058, "pid": 76337, "tid": -914061504, "ts": 1716454225591587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225614845, "dur": 3, "args": { "External id": 260060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 260060, "pid": 5, "tid": 7, "ts": 1716454225614845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591597, "dur": 6, "args": { "External id": 260060, "cbid": 211, "correlation": 260060 } }, { "ph": "s", "id": 260060, "pid": 76337, "tid": -914061504, "ts": 1716454225591597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225591607, "dur": 0, "args": { "External id": 260061, "cbid": 51, "correlation": 260061 } }, { "ph": "s", "id": 260061, "pid": 76337, "tid": -914061504, "ts": 1716454225591607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225614850, "dur": 706, "args": { "External id": 260062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260062, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260062, "pid": 5, "tid": 7, "ts": 1716454225614850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591607, "dur": 5, "args": { "External id": 260062, "cbid": 211, "correlation": 260062 } }, { "ph": "s", "id": 260062, "pid": 76337, "tid": -914061504, "ts": 1716454225591607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225615557, "dur": 60, "args": { "External id": 260067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260067, "pid": 5, "tid": 7, "ts": 1716454225615557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591634, "dur": 9, "args": { "External id": 260067, "cbid": 211, "correlation": 260067 } }, { "ph": "s", "id": 260067, "pid": 76337, "tid": -914061504, "ts": 1716454225591634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225615618, "dur": 50, "args": { "External id": 260075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260075, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260075, "pid": 5, "tid": 7, "ts": 1716454225615618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591666, "dur": 8, "args": { "External id": 260075, "cbid": 211, "correlation": 260075 } }, { "ph": "s", "id": 260075, "pid": 76337, "tid": -914061504, "ts": 1716454225591666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225615670, "dur": 36, "args": { "External id": 260083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260083, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260083, "pid": 5, "tid": 7, "ts": 1716454225615670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591695, "dur": 9, "args": { "External id": 260083, "cbid": 211, "correlation": 260083 } }, { "ph": "s", "id": 260083, "pid": 76337, "tid": -914061504, "ts": 1716454225591695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225615707, "dur": 51, "args": { "External id": 260103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260103, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 260103, "pid": 5, "tid": 7, "ts": 1716454225615707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591773, "dur": 12, "args": { "External id": 260103, "cbid": 211, "correlation": 260103 } }, { "ph": "s", "id": 260103, "pid": 76337, "tid": -914061504, "ts": 1716454225591773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225615759, "dur": 4, "args": { "External id": 260115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260115, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 260115, "pid": 5, "tid": 7, "ts": 1716454225615759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591795, "dur": 6, "args": { "External id": 260115, "cbid": 211, "correlation": 260115 } }, { "ph": "s", "id": 260115, "pid": 76337, "tid": -914061504, "ts": 1716454225591795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225615764, "dur": 56, "args": { "External id": 260118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260118, "pid": 5, "tid": 7, "ts": 1716454225615764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591813, "dur": 6, "args": { "External id": 260118, "cbid": 211, "correlation": 260118 } }, { "ph": "s", "id": 260118, "pid": 76337, "tid": -914061504, "ts": 1716454225591813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225591869, "dur": 0, "args": { "External id": 260129, "cbid": 317, "correlation": 260129 } }, { "ph": "f", "id": 260129, "pid": 76337, "tid": -914061504, "ts": 1716454225591869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225591870, "dur": 0, "args": { "External id": 260130, "cbid": 203, "correlation": 260130 } }, { "ph": "f", "id": 260130, "pid": 76337, "tid": -914061504, "ts": 1716454225591870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225591871, "dur": 0, "args": { "External id": 260131, "cbid": 205, "correlation": 260131 } }, { "ph": "f", "id": 260131, "pid": 76337, "tid": -914061504, "ts": 1716454225591871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591892, "dur": 1, "args": { "External id": 260135, "cbid": 251, "correlation": 260135 } }, { "ph": "f", "id": 260135, "pid": 76337, "tid": -914061504, "ts": 1716454225591892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591894, "dur": 0, "args": { "External id": 260136, "cbid": 251, "correlation": 260136 } }, { "ph": "f", "id": 260136, "pid": 76337, "tid": -914061504, "ts": 1716454225591894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591895, "dur": 0, "args": { "External id": 260137, "cbid": 251, "correlation": 260137 } }, { "ph": "f", "id": 260137, "pid": 76337, "tid": -914061504, "ts": 1716454225591895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591895, "dur": 0, "args": { "External id": 260138, "cbid": 251, "correlation": 260138 } }, { "ph": "f", "id": 260138, "pid": 76337, "tid": -914061504, "ts": 1716454225591895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591896, "dur": 0, "args": { "External id": 260139, "cbid": 251, "correlation": 260139 } }, { "ph": "f", "id": 260139, "pid": 76337, "tid": -914061504, "ts": 1716454225591896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591897, "dur": 0, "args": { "External id": 260140, "cbid": 251, "correlation": 260140 } }, { "ph": "f", "id": 260140, "pid": 76337, "tid": -914061504, "ts": 1716454225591897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591898, "dur": 0, "args": { "External id": 260141, "cbid": 251, "correlation": 260141 } }, { "ph": "f", "id": 260141, "pid": 76337, "tid": -914061504, "ts": 1716454225591898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591898, "dur": 0, "args": { "External id": 260142, "cbid": 251, "correlation": 260142 } }, { "ph": "f", "id": 260142, "pid": 76337, "tid": -914061504, "ts": 1716454225591898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225591900, "dur": 0, "args": { "External id": 260143, "cbid": 251, "correlation": 260143 } }, { "ph": "f", "id": 260143, "pid": 76337, "tid": -914061504, "ts": 1716454225591900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225615821, "dur": 111, "args": { "External id": 260144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260144, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260144, "pid": 5, "tid": 7, "ts": 1716454225615821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591902, "dur": 13, "args": { "External id": 260144, "cbid": 211, "correlation": 260144 } }, { "ph": "s", "id": 260144, "pid": 76337, "tid": -914061504, "ts": 1716454225591902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225615934, "dur": 60, "args": { "External id": 260150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260150, "pid": 5, "tid": 7, "ts": 1716454225615934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225591937, "dur": 9, "args": { "External id": 260150, "cbid": 211, "correlation": 260150 } }, { "ph": "s", "id": 260150, "pid": 76337, "tid": -914061504, "ts": 1716454225591937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225615995, "dur": 570, "args": { "External id": 260159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260159, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260159, "pid": 5, "tid": 7, "ts": 1716454225615995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592029, "dur": 14, "args": { "External id": 260159, "cbid": 211, "correlation": 260159 } }, { "ph": "s", "id": 260159, "pid": 76337, "tid": -914061504, "ts": 1716454225592029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225616566, "dur": 184, "args": { "External id": 260181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260181, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260181, "pid": 5, "tid": 7, "ts": 1716454225616566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592087, "dur": 10, "args": { "External id": 260181, "cbid": 211, "correlation": 260181 } }, { "ph": "s", "id": 260181, "pid": 76337, "tid": -914061504, "ts": 1716454225592087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592173, "dur": 1, "args": { "External id": 260192, "cbid": 251, "correlation": 260192 } }, { "ph": "f", "id": 260192, "pid": 76337, "tid": -914061504, "ts": 1716454225592173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225616752, "dur": 196, "args": { "External id": 260193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260193, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260193, "pid": 5, "tid": 7, "ts": 1716454225616752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592178, "dur": 13, "args": { "External id": 260193, "cbid": 211, "correlation": 260193 } }, { "ph": "s", "id": 260193, "pid": 76337, "tid": -914061504, "ts": 1716454225592178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592246, "dur": 1, "args": { "External id": 260204, "cbid": 251, "correlation": 260204 } }, { "ph": "f", "id": 260204, "pid": 76337, "tid": -914061504, "ts": 1716454225592246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225616950, "dur": 189, "args": { "External id": 260205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260205, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260205, "pid": 5, "tid": 7, "ts": 1716454225616950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592250, "dur": 12, "args": { "External id": 260205, "cbid": 211, "correlation": 260205 } }, { "ph": "s", "id": 260205, "pid": 76337, "tid": -914061504, "ts": 1716454225592250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592313, "dur": 1, "args": { "External id": 260216, "cbid": 251, "correlation": 260216 } }, { "ph": "f", "id": 260216, "pid": 76337, "tid": -914061504, "ts": 1716454225592313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225617140, "dur": 190, "args": { "External id": 260217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260217, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260217, "pid": 5, "tid": 7, "ts": 1716454225617140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592317, "dur": 11, "args": { "External id": 260217, "cbid": 211, "correlation": 260217 } }, { "ph": "s", "id": 260217, "pid": 76337, "tid": -914061504, "ts": 1716454225592317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225617331, "dur": 18971, "args": { "External id": 260238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260238, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260238, "pid": 5, "tid": 7, "ts": 1716454225617331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592397, "dur": 13, "args": { "External id": 260238, "cbid": 211, "correlation": 260238 } }, { "ph": "s", "id": 260238, "pid": 76337, "tid": -914061504, "ts": 1716454225592397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592496, "dur": 1, "args": { "External id": 260256, "cbid": 251, "correlation": 260256 } }, { "ph": "f", "id": 260256, "pid": 76337, "tid": -914061504, "ts": 1716454225592496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225636303, "dur": 207, "args": { "External id": 260258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260258, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260258, "pid": 5, "tid": 7, "ts": 1716454225636303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592502, "dur": 15, "args": { "External id": 260258, "cbid": 211, "correlation": 260258 } }, { "ph": "s", "id": 260258, "pid": 76337, "tid": -914061504, "ts": 1716454225592502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225636511, "dur": 65, "args": { "External id": 260266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260266, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260266, "pid": 5, "tid": 7, "ts": 1716454225636511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592574, "dur": 13, "args": { "External id": 260266, "cbid": 211, "correlation": 260266 } }, { "ph": "s", "id": 260266, "pid": 76337, "tid": -914061504, "ts": 1716454225592574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225636578, "dur": 96, "args": { "External id": 260274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260274, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260274, "pid": 5, "tid": 7, "ts": 1716454225636578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592615, "dur": 9, "args": { "External id": 260274, "cbid": 211, "correlation": 260274 } }, { "ph": "s", "id": 260274, "pid": 76337, "tid": -914061504, "ts": 1716454225592615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225636676, "dur": 56, "args": { "External id": 260285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260285, "pid": 5, "tid": 7, "ts": 1716454225636676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592686, "dur": 12, "args": { "External id": 260285, "cbid": 211, "correlation": 260285 } }, { "ph": "s", "id": 260285, "pid": 76337, "tid": -914061504, "ts": 1716454225592686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225636732, "dur": 94, "args": { "External id": 260307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260307, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260307, "pid": 5, "tid": 7, "ts": 1716454225636732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592718, "dur": 8, "args": { "External id": 260307, "cbid": 211, "correlation": 260307 } }, { "ph": "s", "id": 260307, "pid": 76337, "tid": -914061504, "ts": 1716454225592718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592803, "dur": 1, "args": { "External id": 260318, "cbid": 251, "correlation": 260318 } }, { "ph": "f", "id": 260318, "pid": 76337, "tid": -914061504, "ts": 1716454225592803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225636828, "dur": 106, "args": { "External id": 260319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260319, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260319, "pid": 5, "tid": 7, "ts": 1716454225636828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592808, "dur": 13, "args": { "External id": 260319, "cbid": 211, "correlation": 260319 } }, { "ph": "s", "id": 260319, "pid": 76337, "tid": -914061504, "ts": 1716454225592808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592881, "dur": 1, "args": { "External id": 260330, "cbid": 251, "correlation": 260330 } }, { "ph": "f", "id": 260330, "pid": 76337, "tid": -914061504, "ts": 1716454225592881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592884, "dur": 0, "args": { "External id": 260331, "cbid": 251, "correlation": 260331 } }, { "ph": "f", "id": 260331, "pid": 76337, "tid": -914061504, "ts": 1716454225592884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225636935, "dur": 10, "args": { "External id": 260332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260332, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 260332, "pid": 5, "tid": 7, "ts": 1716454225636935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592886, "dur": 12, "args": { "External id": 260332, "cbid": 211, "correlation": 260332 } }, { "ph": "s", "id": 260332, "pid": 76337, "tid": -914061504, "ts": 1716454225592886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225636947, "dur": 5, "args": { "External id": 260334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260334, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 260334, "pid": 5, "tid": 7, "ts": 1716454225636947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592900, "dur": 7, "args": { "External id": 260334, "cbid": 211, "correlation": 260334 } }, { "ph": "s", "id": 260334, "pid": 76337, "tid": -914061504, "ts": 1716454225592900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592961, "dur": 1, "args": { "External id": 260345, "cbid": 251, "correlation": 260345 } }, { "ph": "f", "id": 260345, "pid": 76337, "tid": -914061504, "ts": 1716454225592961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225592965, "dur": 0, "args": { "External id": 260346, "cbid": 251, "correlation": 260346 } }, { "ph": "f", "id": 260346, "pid": 76337, "tid": -914061504, "ts": 1716454225592965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225636953, "dur": 6, "args": { "External id": 260347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260347, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 260347, "pid": 5, "tid": 7, "ts": 1716454225636953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592966, "dur": 20, "args": { "External id": 260347, "cbid": 211, "correlation": 260347 } }, { "ph": "s", "id": 260347, "pid": 76337, "tid": -914061504, "ts": 1716454225592966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225636961, "dur": 4, "args": { "External id": 260349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260349, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 260349, "pid": 5, "tid": 7, "ts": 1716454225636961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225592988, "dur": 6, "args": { "External id": 260349, "cbid": 211, "correlation": 260349 } }, { "ph": "s", "id": 260349, "pid": 76337, "tid": -914061504, "ts": 1716454225592988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225636966, "dur": 157, "args": { "External id": 260370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260370, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260370, "pid": 5, "tid": 7, "ts": 1716454225636966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593063, "dur": 13, "args": { "External id": 260370, "cbid": 211, "correlation": 260370 } }, { "ph": "s", "id": 260370, "pid": 76337, "tid": -914061504, "ts": 1716454225593063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593160, "dur": 1, "args": { "External id": 260388, "cbid": 251, "correlation": 260388 } }, { "ph": "f", "id": 260388, "pid": 76337, "tid": -914061504, "ts": 1716454225593160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225637125, "dur": 108, "args": { "External id": 260390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260390, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260390, "pid": 5, "tid": 7, "ts": 1716454225637125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593166, "dur": 14, "args": { "External id": 260390, "cbid": 211, "correlation": 260390 } }, { "ph": "s", "id": 260390, "pid": 76337, "tid": -914061504, "ts": 1716454225593166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225637234, "dur": 35, "args": { "External id": 260398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260398, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260398, "pid": 5, "tid": 7, "ts": 1716454225637234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593235, "dur": 12, "args": { "External id": 260398, "cbid": 211, "correlation": 260398 } }, { "ph": "s", "id": 260398, "pid": 76337, "tid": -914061504, "ts": 1716454225593235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225637271, "dur": 67, "args": { "External id": 260406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260406, "pid": 5, "tid": 7, "ts": 1716454225637271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593276, "dur": 9, "args": { "External id": 260406, "cbid": 211, "correlation": 260406 } }, { "ph": "s", "id": 260406, "pid": 76337, "tid": -914061504, "ts": 1716454225593276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225637339, "dur": 94, "args": { "External id": 260428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260428, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260428, "pid": 5, "tid": 7, "ts": 1716454225637339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593327, "dur": 10, "args": { "External id": 260428, "cbid": 211, "correlation": 260428 } }, { "ph": "s", "id": 260428, "pid": 76337, "tid": -914061504, "ts": 1716454225593327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593412, "dur": 1, "args": { "External id": 260444, "cbid": 251, "correlation": 260444 } }, { "ph": "f", "id": 260444, "pid": 76337, "tid": -914061504, "ts": 1716454225593412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225637434, "dur": 583, "args": { "External id": 260446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260446, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260446, "pid": 5, "tid": 7, "ts": 1716454225637434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593418, "dur": 13, "args": { "External id": 260446, "cbid": 211, "correlation": 260446 } }, { "ph": "s", "id": 260446, "pid": 76337, "tid": -914061504, "ts": 1716454225593418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225638019, "dur": 247, "args": { "External id": 260454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260454, "pid": 5, "tid": 7, "ts": 1716454225638019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593484, "dur": 12, "args": { "External id": 260454, "cbid": 211, "correlation": 260454 } }, { "ph": "s", "id": 260454, "pid": 76337, "tid": -914061504, "ts": 1716454225593484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225638267, "dur": 254, "args": { "External id": 260462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260462, "pid": 5, "tid": 7, "ts": 1716454225638267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593514, "dur": 9, "args": { "External id": 260462, "cbid": 211, "correlation": 260462 } }, { "ph": "s", "id": 260462, "pid": 76337, "tid": -914061504, "ts": 1716454225593514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593596, "dur": 1, "args": { "External id": 260478, "cbid": 251, "correlation": 260478 } }, { "ph": "f", "id": 260478, "pid": 76337, "tid": -914061504, "ts": 1716454225593596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593601, "dur": 0, "args": { "External id": 260480, "cbid": 251, "correlation": 260480 } }, { "ph": "f", "id": 260480, "pid": 76337, "tid": -914061504, "ts": 1716454225593601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225638523, "dur": 361, "args": { "External id": 260481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260481, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 260481, "pid": 5, "tid": 7, "ts": 1716454225638523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593604, "dur": 12, "args": { "External id": 260481, "cbid": 211, "correlation": 260481 } }, { "ph": "s", "id": 260481, "pid": 76337, "tid": -914061504, "ts": 1716454225593604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225638885, "dur": 50, "args": { "External id": 260489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260489, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260489, "pid": 5, "tid": 7, "ts": 1716454225638885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593645, "dur": 10, "args": { "External id": 260489, "cbid": 211, "correlation": 260489 } }, { "ph": "s", "id": 260489, "pid": 76337, "tid": -914061504, "ts": 1716454225593645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225638936, "dur": 160, "args": { "External id": 260500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260500, "pid": 5, "tid": 7, "ts": 1716454225638936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593711, "dur": 12, "args": { "External id": 260500, "cbid": 211, "correlation": 260500 } }, { "ph": "s", "id": 260500, "pid": 76337, "tid": -914061504, "ts": 1716454225593711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225593776, "dur": 0, "args": { "External id": 260512, "cbid": 317, "correlation": 260512 } }, { "ph": "f", "id": 260512, "pid": 76337, "tid": -914061504, "ts": 1716454225593776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225593777, "dur": 0, "args": { "External id": 260513, "cbid": 203, "correlation": 260513 } }, { "ph": "f", "id": 260513, "pid": 76337, "tid": -914061504, "ts": 1716454225593777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225593778, "dur": 0, "args": { "External id": 260514, "cbid": 205, "correlation": 260514 } }, { "ph": "f", "id": 260514, "pid": 76337, "tid": -914061504, "ts": 1716454225593778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593800, "dur": 1, "args": { "External id": 260518, "cbid": 251, "correlation": 260518 } }, { "ph": "f", "id": 260518, "pid": 76337, "tid": -914061504, "ts": 1716454225593800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593802, "dur": 0, "args": { "External id": 260519, "cbid": 251, "correlation": 260519 } }, { "ph": "f", "id": 260519, "pid": 76337, "tid": -914061504, "ts": 1716454225593802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593803, "dur": 0, "args": { "External id": 260520, "cbid": 251, "correlation": 260520 } }, { "ph": "f", "id": 260520, "pid": 76337, "tid": -914061504, "ts": 1716454225593803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593803, "dur": 0, "args": { "External id": 260521, "cbid": 251, "correlation": 260521 } }, { "ph": "f", "id": 260521, "pid": 76337, "tid": -914061504, "ts": 1716454225593803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593804, "dur": 0, "args": { "External id": 260522, "cbid": 251, "correlation": 260522 } }, { "ph": "f", "id": 260522, "pid": 76337, "tid": -914061504, "ts": 1716454225593804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593805, "dur": 0, "args": { "External id": 260523, "cbid": 251, "correlation": 260523 } }, { "ph": "f", "id": 260523, "pid": 76337, "tid": -914061504, "ts": 1716454225593805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593805, "dur": 0, "args": { "External id": 260524, "cbid": 251, "correlation": 260524 } }, { "ph": "f", "id": 260524, "pid": 76337, "tid": -914061504, "ts": 1716454225593805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593806, "dur": 0, "args": { "External id": 260525, "cbid": 251, "correlation": 260525 } }, { "ph": "f", "id": 260525, "pid": 76337, "tid": -914061504, "ts": 1716454225593806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225593807, "dur": 0, "args": { "External id": 260526, "cbid": 251, "correlation": 260526 } }, { "ph": "f", "id": 260526, "pid": 76337, "tid": -914061504, "ts": 1716454225593807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225639098, "dur": 116, "args": { "External id": 260527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260527, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260527, "pid": 5, "tid": 7, "ts": 1716454225639098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593809, "dur": 12, "args": { "External id": 260527, "cbid": 211, "correlation": 260527 } }, { "ph": "s", "id": 260527, "pid": 76337, "tid": -914061504, "ts": 1716454225593809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225639215, "dur": 60, "args": { "External id": 260533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260533, "pid": 5, "tid": 7, "ts": 1716454225639215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593844, "dur": 9, "args": { "External id": 260533, "cbid": 211, "correlation": 260533 } }, { "ph": "s", "id": 260533, "pid": 76337, "tid": -914061504, "ts": 1716454225593844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225639276, "dur": 49, "args": { "External id": 260541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260541, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260541, "pid": 5, "tid": 7, "ts": 1716454225639276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593876, "dur": 8, "args": { "External id": 260541, "cbid": 211, "correlation": 260541 } }, { "ph": "s", "id": 260541, "pid": 76337, "tid": -914061504, "ts": 1716454225593876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225593949, "dur": 0, "args": { "External id": 260551, "cbid": 317, "correlation": 260551 } }, { "ph": "f", "id": 260551, "pid": 76337, "tid": -914061504, "ts": 1716454225593949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225593950, "dur": 0, "args": { "External id": 260552, "cbid": 203, "correlation": 260552 } }, { "ph": "f", "id": 260552, "pid": 76337, "tid": -914061504, "ts": 1716454225593950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225593950, "dur": 0, "args": { "External id": 260553, "cbid": 205, "correlation": 260553 } }, { "ph": "f", "id": 260553, "pid": 76337, "tid": -914061504, "ts": 1716454225593950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225639327, "dur": 41, "args": { "External id": 260557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260557, "pid": 5, "tid": 7, "ts": 1716454225639327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593967, "dur": 20, "args": { "External id": 260557, "cbid": 211, "correlation": 260557 } }, { "ph": "s", "id": 260557, "pid": 76337, "tid": -914061504, "ts": 1716454225593967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225639369, "dur": 14, "args": { "External id": 260559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260559, "pid": 5, "tid": 7, "ts": 1716454225639369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225593990, "dur": 5, "args": { "External id": 260559, "cbid": 211, "correlation": 260559 } }, { "ph": "s", "id": 260559, "pid": 76337, "tid": -914061504, "ts": 1716454225593990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225639386, "dur": 1, "args": { "External id": 260561, "device": 5, "context": 1, "stream": 7, "correlation": 260561, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 260561, "pid": 5, "tid": 7, "ts": 1716454225639386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225594009, "dur": 17, "args": { "External id": 260561, "cbid": 51, "correlation": 260561 } }, { "ph": "s", "id": 260561, "pid": 76337, "tid": -914061504, "ts": 1716454225594009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225639390, "dur": 369, "args": { "External id": 260562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260562, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260562, "pid": 5, "tid": 7, "ts": 1716454225639390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594028, "dur": 9, "args": { "External id": 260562, "cbid": 211, "correlation": 260562 } }, { "ph": "s", "id": 260562, "pid": 76337, "tid": -914061504, "ts": 1716454225594028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225639760, "dur": 15, "args": { "External id": 260564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260564, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260564, "pid": 5, "tid": 7, "ts": 1716454225639760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594045, "dur": 7, "args": { "External id": 260564, "cbid": 211, "correlation": 260564 } }, { "ph": "s", "id": 260564, "pid": 76337, "tid": -914061504, "ts": 1716454225594045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225639777, "dur": 15, "args": { "External id": 260570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260570, "pid": 5, "tid": 7, "ts": 1716454225639777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594076, "dur": 9, "args": { "External id": 260570, "cbid": 211, "correlation": 260570 } }, { "ph": "s", "id": 260570, "pid": 76337, "tid": -914061504, "ts": 1716454225594076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225639793, "dur": 19, "args": { "External id": 260590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260590, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 260590, "pid": 5, "tid": 7, "ts": 1716454225639793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594166, "dur": 13, "args": { "External id": 260590, "cbid": 211, "correlation": 260590 } }, { "ph": "s", "id": 260590, "pid": 76337, "tid": -914061504, "ts": 1716454225594166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225639813, "dur": 4, "args": { "External id": 260602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260602, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 260602, "pid": 5, "tid": 7, "ts": 1716454225639813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594190, "dur": 6, "args": { "External id": 260602, "cbid": 211, "correlation": 260602 } }, { "ph": "s", "id": 260602, "pid": 76337, "tid": -914061504, "ts": 1716454225594190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225639819, "dur": 18, "args": { "External id": 260605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260605, "pid": 5, "tid": 7, "ts": 1716454225639819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594207, "dur": 6, "args": { "External id": 260605, "cbid": 211, "correlation": 260605 } }, { "ph": "s", "id": 260605, "pid": 76337, "tid": -914061504, "ts": 1716454225594207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225639838, "dur": 12, "args": { "External id": 260614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260614, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260614, "pid": 5, "tid": 7, "ts": 1716454225639838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594247, "dur": 9, "args": { "External id": 260614, "cbid": 211, "correlation": 260614 } }, { "ph": "s", "id": 260614, "pid": 76337, "tid": -914061504, "ts": 1716454225594247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225594301, "dur": 0, "args": { "External id": 260624, "cbid": 317, "correlation": 260624 } }, { "ph": "f", "id": 260624, "pid": 76337, "tid": -914061504, "ts": 1716454225594301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225594302, "dur": 0, "args": { "External id": 260625, "cbid": 203, "correlation": 260625 } }, { "ph": "f", "id": 260625, "pid": 76337, "tid": -914061504, "ts": 1716454225594302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225594302, "dur": 0, "args": { "External id": 260626, "cbid": 205, "correlation": 260626 } }, { "ph": "f", "id": 260626, "pid": 76337, "tid": -914061504, "ts": 1716454225594302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225639852, "dur": 11, "args": { "External id": 260630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260630, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260630, "pid": 5, "tid": 7, "ts": 1716454225639852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594317, "dur": 13, "args": { "External id": 260630, "cbid": 211, "correlation": 260630 } }, { "ph": "s", "id": 260630, "pid": 76337, "tid": -914061504, "ts": 1716454225594317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225639864, "dur": 24, "args": { "External id": 260632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260632, "pid": 5, "tid": 7, "ts": 1716454225639864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594332, "dur": 5, "args": { "External id": 260632, "cbid": 211, "correlation": 260632 } }, { "ph": "s", "id": 260632, "pid": 76337, "tid": -914061504, "ts": 1716454225594332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225639890, "dur": 4, "args": { "External id": 260634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 260634, "pid": 5, "tid": 7, "ts": 1716454225639890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594343, "dur": 6, "args": { "External id": 260634, "cbid": 211, "correlation": 260634 } }, { "ph": "s", "id": 260634, "pid": 76337, "tid": -914061504, "ts": 1716454225594343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225594352, "dur": 0, "args": { "External id": 260635, "cbid": 51, "correlation": 260635 } }, { "ph": "s", "id": 260635, "pid": 76337, "tid": -914061504, "ts": 1716454225594352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225639895, "dur": 361, "args": { "External id": 260636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260636, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260636, "pid": 5, "tid": 7, "ts": 1716454225639895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594353, "dur": 8, "args": { "External id": 260636, "cbid": 211, "correlation": 260636 } }, { "ph": "s", "id": 260636, "pid": 76337, "tid": -914061504, "ts": 1716454225594353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225640258, "dur": 20, "args": { "External id": 260637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260637, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260637, "pid": 5, "tid": 7, "ts": 1716454225640258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594364, "dur": 5, "args": { "External id": 260637, "cbid": 211, "correlation": 260637 } }, { "ph": "s", "id": 260637, "pid": 76337, "tid": -914061504, "ts": 1716454225594364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225640279, "dur": 34, "args": { "External id": 260643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260643, "pid": 5, "tid": 7, "ts": 1716454225640279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594392, "dur": 9, "args": { "External id": 260643, "cbid": 211, "correlation": 260643 } }, { "ph": "s", "id": 260643, "pid": 76337, "tid": -914061504, "ts": 1716454225594392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225640314, "dur": 3, "args": { "External id": 260651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260651, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 260651, "pid": 5, "tid": 7, "ts": 1716454225640314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594435, "dur": 9, "args": { "External id": 260651, "cbid": 211, "correlation": 260651 } }, { "ph": "s", "id": 260651, "pid": 76337, "tid": -914061504, "ts": 1716454225594435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594502, "dur": 1, "args": { "External id": 260667, "cbid": 251, "correlation": 260667 } }, { "ph": "f", "id": 260667, "pid": 76337, "tid": -914061504, "ts": 1716454225594502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594507, "dur": 0, "args": { "External id": 260669, "cbid": 251, "correlation": 260669 } }, { "ph": "f", "id": 260669, "pid": 76337, "tid": -914061504, "ts": 1716454225594507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225640319, "dur": 13, "args": { "External id": 260670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260670, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 260670, "pid": 5, "tid": 7, "ts": 1716454225640319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594509, "dur": 11, "args": { "External id": 260670, "cbid": 211, "correlation": 260670 } }, { "ph": "s", "id": 260670, "pid": 76337, "tid": -914061504, "ts": 1716454225594509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225640333, "dur": 5, "args": { "External id": 260672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260672, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 260672, "pid": 5, "tid": 7, "ts": 1716454225640333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594523, "dur": 6, "args": { "External id": 260672, "cbid": 211, "correlation": 260672 } }, { "ph": "s", "id": 260672, "pid": 76337, "tid": -914061504, "ts": 1716454225594523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225640339, "dur": 31, "args": { "External id": 260682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260682, "pid": 5, "tid": 7, "ts": 1716454225640339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594582, "dur": 12, "args": { "External id": 260682, "cbid": 211, "correlation": 260682 } }, { "ph": "s", "id": 260682, "pid": 76337, "tid": -914061504, "ts": 1716454225594582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225640371, "dur": 30, "args": { "External id": 260702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260702, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 260702, "pid": 5, "tid": 7, "ts": 1716454225640371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594647, "dur": 10, "args": { "External id": 260702, "cbid": 211, "correlation": 260702 } }, { "ph": "s", "id": 260702, "pid": 76337, "tid": -914061504, "ts": 1716454225594647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225640403, "dur": 4, "args": { "External id": 260714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260714, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 260714, "pid": 5, "tid": 7, "ts": 1716454225640403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594667, "dur": 6, "args": { "External id": 260714, "cbid": 211, "correlation": 260714 } }, { "ph": "s", "id": 260714, "pid": 76337, "tid": -914061504, "ts": 1716454225594667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225640408, "dur": 30, "args": { "External id": 260717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260717, "pid": 5, "tid": 7, "ts": 1716454225640408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594687, "dur": 6, "args": { "External id": 260717, "cbid": 211, "correlation": 260717 } }, { "ph": "s", "id": 260717, "pid": 76337, "tid": -914061504, "ts": 1716454225594687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225640439, "dur": 23, "args": { "External id": 260726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260726, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260726, "pid": 5, "tid": 7, "ts": 1716454225640439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594727, "dur": 11, "args": { "External id": 260726, "cbid": 211, "correlation": 260726 } }, { "ph": "s", "id": 260726, "pid": 76337, "tid": -914061504, "ts": 1716454225594727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225594792, "dur": 0, "args": { "External id": 260736, "cbid": 317, "correlation": 260736 } }, { "ph": "f", "id": 260736, "pid": 76337, "tid": -914061504, "ts": 1716454225594792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225594793, "dur": 0, "args": { "External id": 260737, "cbid": 203, "correlation": 260737 } }, { "ph": "f", "id": 260737, "pid": 76337, "tid": -914061504, "ts": 1716454225594793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225594793, "dur": 0, "args": { "External id": 260738, "cbid": 205, "correlation": 260738 } }, { "ph": "f", "id": 260738, "pid": 76337, "tid": -914061504, "ts": 1716454225594793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225640463, "dur": 22, "args": { "External id": 260742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260742, "pid": 5, "tid": 7, "ts": 1716454225640463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594811, "dur": 12, "args": { "External id": 260742, "cbid": 211, "correlation": 260742 } }, { "ph": "s", "id": 260742, "pid": 76337, "tid": -914061504, "ts": 1716454225594811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225640487, "dur": 44, "args": { "External id": 260744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260744, "pid": 5, "tid": 7, "ts": 1716454225640487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594825, "dur": 5, "args": { "External id": 260744, "cbid": 211, "correlation": 260744 } }, { "ph": "s", "id": 260744, "pid": 76337, "tid": -914061504, "ts": 1716454225594825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225640532, "dur": 666, "args": { "External id": 260746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260746, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260746, "pid": 5, "tid": 7, "ts": 1716454225640532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594840, "dur": 10, "args": { "External id": 260746, "cbid": 211, "correlation": 260746 } }, { "ph": "s", "id": 260746, "pid": 76337, "tid": -914061504, "ts": 1716454225594840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225641199, "dur": 21, "args": { "External id": 260748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260748, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260748, "pid": 5, "tid": 7, "ts": 1716454225641199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594854, "dur": 5, "args": { "External id": 260748, "cbid": 211, "correlation": 260748 } }, { "ph": "s", "id": 260748, "pid": 76337, "tid": -914061504, "ts": 1716454225594854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225641222, "dur": 33, "args": { "External id": 260754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260754, "pid": 5, "tid": 7, "ts": 1716454225641222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594882, "dur": 9, "args": { "External id": 260754, "cbid": 211, "correlation": 260754 } }, { "ph": "s", "id": 260754, "pid": 76337, "tid": -914061504, "ts": 1716454225594882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225594941, "dur": 0, "args": { "External id": 260764, "cbid": 317, "correlation": 260764 } }, { "ph": "f", "id": 260764, "pid": 76337, "tid": -914061504, "ts": 1716454225594941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225594942, "dur": 0, "args": { "External id": 260765, "cbid": 203, "correlation": 260765 } }, { "ph": "f", "id": 260765, "pid": 76337, "tid": -914061504, "ts": 1716454225594942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225594943, "dur": 0, "args": { "External id": 260766, "cbid": 205, "correlation": 260766 } }, { "ph": "f", "id": 260766, "pid": 76337, "tid": -914061504, "ts": 1716454225594943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594965, "dur": 1, "args": { "External id": 260770, "cbid": 251, "correlation": 260770 } }, { "ph": "f", "id": 260770, "pid": 76337, "tid": -914061504, "ts": 1716454225594965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594967, "dur": 0, "args": { "External id": 260771, "cbid": 251, "correlation": 260771 } }, { "ph": "f", "id": 260771, "pid": 76337, "tid": -914061504, "ts": 1716454225594967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594967, "dur": 0, "args": { "External id": 260772, "cbid": 251, "correlation": 260772 } }, { "ph": "f", "id": 260772, "pid": 76337, "tid": -914061504, "ts": 1716454225594967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594968, "dur": 0, "args": { "External id": 260773, "cbid": 251, "correlation": 260773 } }, { "ph": "f", "id": 260773, "pid": 76337, "tid": -914061504, "ts": 1716454225594968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594969, "dur": 0, "args": { "External id": 260774, "cbid": 251, "correlation": 260774 } }, { "ph": "f", "id": 260774, "pid": 76337, "tid": -914061504, "ts": 1716454225594969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594969, "dur": 0, "args": { "External id": 260775, "cbid": 251, "correlation": 260775 } }, { "ph": "f", "id": 260775, "pid": 76337, "tid": -914061504, "ts": 1716454225594969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594970, "dur": 0, "args": { "External id": 260776, "cbid": 251, "correlation": 260776 } }, { "ph": "f", "id": 260776, "pid": 76337, "tid": -914061504, "ts": 1716454225594970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594971, "dur": 0, "args": { "External id": 260777, "cbid": 251, "correlation": 260777 } }, { "ph": "f", "id": 260777, "pid": 76337, "tid": -914061504, "ts": 1716454225594971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225594972, "dur": 0, "args": { "External id": 260778, "cbid": 251, "correlation": 260778 } }, { "ph": "f", "id": 260778, "pid": 76337, "tid": -914061504, "ts": 1716454225594972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225641256, "dur": 52, "args": { "External id": 260779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260779, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260779, "pid": 5, "tid": 7, "ts": 1716454225641256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225594981, "dur": 14, "args": { "External id": 260779, "cbid": 211, "correlation": 260779 } }, { "ph": "s", "id": 260779, "pid": 76337, "tid": -914061504, "ts": 1716454225594981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225641310, "dur": 32, "args": { "External id": 260785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260785, "pid": 5, "tid": 7, "ts": 1716454225641310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595016, "dur": 8, "args": { "External id": 260785, "cbid": 211, "correlation": 260785 } }, { "ph": "s", "id": 260785, "pid": 76337, "tid": -914061504, "ts": 1716454225595016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225641343, "dur": 27, "args": { "External id": 260793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260793, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260793, "pid": 5, "tid": 7, "ts": 1716454225641343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595046, "dur": 7, "args": { "External id": 260793, "cbid": 211, "correlation": 260793 } }, { "ph": "s", "id": 260793, "pid": 76337, "tid": -914061504, "ts": 1716454225595046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225641372, "dur": 20, "args": { "External id": 260801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260801, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260801, "pid": 5, "tid": 7, "ts": 1716454225641372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595074, "dur": 8, "args": { "External id": 260801, "cbid": 211, "correlation": 260801 } }, { "ph": "s", "id": 260801, "pid": 76337, "tid": -914061504, "ts": 1716454225595074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225641393, "dur": 30, "args": { "External id": 260821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260821, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 260821, "pid": 5, "tid": 7, "ts": 1716454225641393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595156, "dur": 12, "args": { "External id": 260821, "cbid": 211, "correlation": 260821 } }, { "ph": "s", "id": 260821, "pid": 76337, "tid": -914061504, "ts": 1716454225595156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225641425, "dur": 4, "args": { "External id": 260833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260833, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 260833, "pid": 5, "tid": 7, "ts": 1716454225641425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595177, "dur": 6, "args": { "External id": 260833, "cbid": 211, "correlation": 260833 } }, { "ph": "s", "id": 260833, "pid": 76337, "tid": -914061504, "ts": 1716454225595177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225641430, "dur": 30, "args": { "External id": 260836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260836, "pid": 5, "tid": 7, "ts": 1716454225641430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595196, "dur": 7, "args": { "External id": 260836, "cbid": 211, "correlation": 260836 } }, { "ph": "s", "id": 260836, "pid": 76337, "tid": -914061504, "ts": 1716454225595196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225595253, "dur": 0, "args": { "External id": 260847, "cbid": 317, "correlation": 260847 } }, { "ph": "f", "id": 260847, "pid": 76337, "tid": -914061504, "ts": 1716454225595253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225595254, "dur": 0, "args": { "External id": 260848, "cbid": 203, "correlation": 260848 } }, { "ph": "f", "id": 260848, "pid": 76337, "tid": -914061504, "ts": 1716454225595254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225595255, "dur": 0, "args": { "External id": 260849, "cbid": 205, "correlation": 260849 } }, { "ph": "f", "id": 260849, "pid": 76337, "tid": -914061504, "ts": 1716454225595255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225641461, "dur": 22, "args": { "External id": 260853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260853, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260853, "pid": 5, "tid": 7, "ts": 1716454225641461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595269, "dur": 11, "args": { "External id": 260853, "cbid": 211, "correlation": 260853 } }, { "ph": "s", "id": 260853, "pid": 76337, "tid": -914061504, "ts": 1716454225595269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225641484, "dur": 123, "args": { "External id": 260855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260855, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260855, "pid": 5, "tid": 7, "ts": 1716454225641484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595289, "dur": 9, "args": { "External id": 260855, "cbid": 211, "correlation": 260855 } }, { "ph": "s", "id": 260855, "pid": 76337, "tid": -914061504, "ts": 1716454225595289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225641608, "dur": 22, "args": { "External id": 260857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260857, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260857, "pid": 5, "tid": 7, "ts": 1716454225641608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595302, "dur": 5, "args": { "External id": 260857, "cbid": 211, "correlation": 260857 } }, { "ph": "s", "id": 260857, "pid": 76337, "tid": -914061504, "ts": 1716454225595302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225641632, "dur": 33, "args": { "External id": 260863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260863, "pid": 5, "tid": 7, "ts": 1716454225641632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595329, "dur": 9, "args": { "External id": 260863, "cbid": 211, "correlation": 260863 } }, { "ph": "s", "id": 260863, "pid": 76337, "tid": -914061504, "ts": 1716454225595329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225641666, "dur": 189, "args": { "External id": 260872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260872, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260872, "pid": 5, "tid": 7, "ts": 1716454225641666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595411, "dur": 14, "args": { "External id": 260872, "cbid": 211, "correlation": 260872 } }, { "ph": "s", "id": 260872, "pid": 76337, "tid": -914061504, "ts": 1716454225595411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225641857, "dur": 66, "args": { "External id": 260894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260894, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260894, "pid": 5, "tid": 7, "ts": 1716454225641857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595469, "dur": 10, "args": { "External id": 260894, "cbid": 211, "correlation": 260894 } }, { "ph": "s", "id": 260894, "pid": 76337, "tid": -914061504, "ts": 1716454225595469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225595559, "dur": 1, "args": { "External id": 260905, "cbid": 251, "correlation": 260905 } }, { "ph": "f", "id": 260905, "pid": 76337, "tid": -914061504, "ts": 1716454225595559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225641924, "dur": 156, "args": { "External id": 260906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260906, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260906, "pid": 5, "tid": 7, "ts": 1716454225641924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595565, "dur": 13, "args": { "External id": 260906, "cbid": 211, "correlation": 260906 } }, { "ph": "s", "id": 260906, "pid": 76337, "tid": -914061504, "ts": 1716454225595565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225595635, "dur": 1, "args": { "External id": 260917, "cbid": 251, "correlation": 260917 } }, { "ph": "f", "id": 260917, "pid": 76337, "tid": -914061504, "ts": 1716454225595635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225642082, "dur": 147, "args": { "External id": 260918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260918, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260918, "pid": 5, "tid": 7, "ts": 1716454225642082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595639, "dur": 11, "args": { "External id": 260918, "cbid": 211, "correlation": 260918 } }, { "ph": "s", "id": 260918, "pid": 76337, "tid": -914061504, "ts": 1716454225595639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225595704, "dur": 1, "args": { "External id": 260929, "cbid": 251, "correlation": 260929 } }, { "ph": "f", "id": 260929, "pid": 76337, "tid": -914061504, "ts": 1716454225595704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225642230, "dur": 146, "args": { "External id": 260930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260930, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 260930, "pid": 5, "tid": 7, "ts": 1716454225642230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595708, "dur": 11, "args": { "External id": 260930, "cbid": 211, "correlation": 260930 } }, { "ph": "s", "id": 260930, "pid": 76337, "tid": -914061504, "ts": 1716454225595708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225642377, "dur": 1988, "args": { "External id": 260951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260951, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 260951, "pid": 5, "tid": 7, "ts": 1716454225642377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595793, "dur": 13, "args": { "External id": 260951, "cbid": 211, "correlation": 260951 } }, { "ph": "s", "id": 260951, "pid": 76337, "tid": -914061504, "ts": 1716454225595793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225595895, "dur": 1, "args": { "External id": 260969, "cbid": 251, "correlation": 260969 } }, { "ph": "f", "id": 260969, "pid": 76337, "tid": -914061504, "ts": 1716454225595895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225644367, "dur": 148, "args": { "External id": 260971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260971, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 260971, "pid": 5, "tid": 7, "ts": 1716454225644367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595900, "dur": 13, "args": { "External id": 260971, "cbid": 211, "correlation": 260971 } }, { "ph": "s", "id": 260971, "pid": 76337, "tid": -914061504, "ts": 1716454225595900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225644516, "dur": 36, "args": { "External id": 260979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260979, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260979, "pid": 5, "tid": 7, "ts": 1716454225644516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225595970, "dur": 20, "args": { "External id": 260979, "cbid": 211, "correlation": 260979 } }, { "ph": "s", "id": 260979, "pid": 76337, "tid": -914061504, "ts": 1716454225595970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225644553, "dur": 50, "args": { "External id": 260987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260987, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260987, "pid": 5, "tid": 7, "ts": 1716454225644553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596018, "dur": 9, "args": { "External id": 260987, "cbid": 211, "correlation": 260987 } }, { "ph": "s", "id": 260987, "pid": 76337, "tid": -914061504, "ts": 1716454225596018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225644605, "dur": 31, "args": { "External id": 260998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 260998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 260998, "pid": 5, "tid": 7, "ts": 1716454225644605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596092, "dur": 13, "args": { "External id": 260998, "cbid": 211, "correlation": 260998 } }, { "ph": "s", "id": 260998, "pid": 76337, "tid": -914061504, "ts": 1716454225596092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225644637, "dur": 35, "args": { "External id": 261020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261020, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261020, "pid": 5, "tid": 7, "ts": 1716454225644637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596123, "dur": 7, "args": { "External id": 261020, "cbid": 211, "correlation": 261020 } }, { "ph": "s", "id": 261020, "pid": 76337, "tid": -914061504, "ts": 1716454225596123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596209, "dur": 1, "args": { "External id": 261031, "cbid": 251, "correlation": 261031 } }, { "ph": "f", "id": 261031, "pid": 76337, "tid": -914061504, "ts": 1716454225596209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225644674, "dur": 91, "args": { "External id": 261032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261032, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261032, "pid": 5, "tid": 7, "ts": 1716454225644674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596214, "dur": 13, "args": { "External id": 261032, "cbid": 211, "correlation": 261032 } }, { "ph": "s", "id": 261032, "pid": 76337, "tid": -914061504, "ts": 1716454225596214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596285, "dur": 1, "args": { "External id": 261043, "cbid": 251, "correlation": 261043 } }, { "ph": "f", "id": 261043, "pid": 76337, "tid": -914061504, "ts": 1716454225596285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596289, "dur": 0, "args": { "External id": 261044, "cbid": 251, "correlation": 261044 } }, { "ph": "f", "id": 261044, "pid": 76337, "tid": -914061504, "ts": 1716454225596289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225644766, "dur": 12, "args": { "External id": 261045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261045, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 261045, "pid": 5, "tid": 7, "ts": 1716454225644766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596291, "dur": 12, "args": { "External id": 261045, "cbid": 211, "correlation": 261045 } }, { "ph": "s", "id": 261045, "pid": 76337, "tid": -914061504, "ts": 1716454225596291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225644779, "dur": 5, "args": { "External id": 261047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261047, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 261047, "pid": 5, "tid": 7, "ts": 1716454225644779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596308, "dur": 7, "args": { "External id": 261047, "cbid": 211, "correlation": 261047 } }, { "ph": "s", "id": 261047, "pid": 76337, "tid": -914061504, "ts": 1716454225596308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596366, "dur": 1, "args": { "External id": 261058, "cbid": 251, "correlation": 261058 } }, { "ph": "f", "id": 261058, "pid": 76337, "tid": -914061504, "ts": 1716454225596366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596370, "dur": 0, "args": { "External id": 261059, "cbid": 251, "correlation": 261059 } }, { "ph": "f", "id": 261059, "pid": 76337, "tid": -914061504, "ts": 1716454225596370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225644786, "dur": 7, "args": { "External id": 261060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261060, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 261060, "pid": 5, "tid": 7, "ts": 1716454225644786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596371, "dur": 11, "args": { "External id": 261060, "cbid": 211, "correlation": 261060 } }, { "ph": "s", "id": 261060, "pid": 76337, "tid": -914061504, "ts": 1716454225596371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225644794, "dur": 3, "args": { "External id": 261062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261062, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 261062, "pid": 5, "tid": 7, "ts": 1716454225644794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596384, "dur": 6, "args": { "External id": 261062, "cbid": 211, "correlation": 261062 } }, { "ph": "s", "id": 261062, "pid": 76337, "tid": -914061504, "ts": 1716454225596384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225644799, "dur": 93, "args": { "External id": 261083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261083, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 261083, "pid": 5, "tid": 7, "ts": 1716454225644799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596458, "dur": 12, "args": { "External id": 261083, "cbid": 211, "correlation": 261083 } }, { "ph": "s", "id": 261083, "pid": 76337, "tid": -914061504, "ts": 1716454225596458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596554, "dur": 1, "args": { "External id": 261101, "cbid": 251, "correlation": 261101 } }, { "ph": "f", "id": 261101, "pid": 76337, "tid": -914061504, "ts": 1716454225596554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225644893, "dur": 100, "args": { "External id": 261103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261103, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261103, "pid": 5, "tid": 7, "ts": 1716454225644893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596560, "dur": 13, "args": { "External id": 261103, "cbid": 211, "correlation": 261103 } }, { "ph": "s", "id": 261103, "pid": 76337, "tid": -914061504, "ts": 1716454225596560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225644995, "dur": 19, "args": { "External id": 261111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261111, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261111, "pid": 5, "tid": 7, "ts": 1716454225644995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596629, "dur": 12, "args": { "External id": 261111, "cbid": 211, "correlation": 261111 } }, { "ph": "s", "id": 261111, "pid": 76337, "tid": -914061504, "ts": 1716454225596629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225645015, "dur": 37, "args": { "External id": 261119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261119, "pid": 5, "tid": 7, "ts": 1716454225645015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596671, "dur": 9, "args": { "External id": 261119, "cbid": 211, "correlation": 261119 } }, { "ph": "s", "id": 261119, "pid": 76337, "tid": -914061504, "ts": 1716454225596671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225645053, "dur": 35, "args": { "External id": 261141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261141, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261141, "pid": 5, "tid": 7, "ts": 1716454225645053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596723, "dur": 10, "args": { "External id": 261141, "cbid": 211, "correlation": 261141 } }, { "ph": "s", "id": 261141, "pid": 76337, "tid": -914061504, "ts": 1716454225596723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596812, "dur": 1, "args": { "External id": 261157, "cbid": 251, "correlation": 261157 } }, { "ph": "f", "id": 261157, "pid": 76337, "tid": -914061504, "ts": 1716454225596812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225596817, "dur": 0, "args": { "External id": 261159, "cbid": 251, "correlation": 261159 } }, { "ph": "f", "id": 261159, "pid": 76337, "tid": -914061504, "ts": 1716454225596817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225645090, "dur": 549, "args": { "External id": 261160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261160, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 261160, "pid": 5, "tid": 7, "ts": 1716454225645090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596820, "dur": 12, "args": { "External id": 261160, "cbid": 211, "correlation": 261160 } }, { "ph": "s", "id": 261160, "pid": 76337, "tid": -914061504, "ts": 1716454225596820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225645641, "dur": 127, "args": { "External id": 261168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261168, "pid": 5, "tid": 7, "ts": 1716454225645641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596885, "dur": 13, "args": { "External id": 261168, "cbid": 211, "correlation": 261168 } }, { "ph": "s", "id": 261168, "pid": 76337, "tid": -914061504, "ts": 1716454225596885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225645769, "dur": 129, "args": { "External id": 261176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261176, "pid": 5, "tid": 7, "ts": 1716454225645769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225596917, "dur": 8, "args": { "External id": 261176, "cbid": 211, "correlation": 261176 } }, { "ph": "s", "id": 261176, "pid": 76337, "tid": -914061504, "ts": 1716454225596917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225597004, "dur": 1, "args": { "External id": 261192, "cbid": 251, "correlation": 261192 } }, { "ph": "f", "id": 261192, "pid": 76337, "tid": -914061504, "ts": 1716454225597004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225645899, "dur": 311, "args": { "External id": 261194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261194, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261194, "pid": 5, "tid": 7, "ts": 1716454225645899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597009, "dur": 13, "args": { "External id": 261194, "cbid": 211, "correlation": 261194 } }, { "ph": "s", "id": 261194, "pid": 76337, "tid": -914061504, "ts": 1716454225597009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225646212, "dur": 27, "args": { "External id": 261202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261202, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261202, "pid": 5, "tid": 7, "ts": 1716454225646212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597054, "dur": 9, "args": { "External id": 261202, "cbid": 211, "correlation": 261202 } }, { "ph": "s", "id": 261202, "pid": 76337, "tid": -914061504, "ts": 1716454225597054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225646240, "dur": 83, "args": { "External id": 261213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261213, "pid": 5, "tid": 7, "ts": 1716454225646240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597121, "dur": 12, "args": { "External id": 261213, "cbid": 211, "correlation": 261213 } }, { "ph": "s", "id": 261213, "pid": 76337, "tid": -914061504, "ts": 1716454225597121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225597185, "dur": 0, "args": { "External id": 261225, "cbid": 317, "correlation": 261225 } }, { "ph": "f", "id": 261225, "pid": 76337, "tid": -914061504, "ts": 1716454225597185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225597186, "dur": 0, "args": { "External id": 261226, "cbid": 203, "correlation": 261226 } }, { "ph": "f", "id": 261226, "pid": 76337, "tid": -914061504, "ts": 1716454225597186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225597187, "dur": 0, "args": { "External id": 261227, "cbid": 205, "correlation": 261227 } }, { "ph": "f", "id": 261227, "pid": 76337, "tid": -914061504, "ts": 1716454225597187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225646325, "dur": 22, "args": { "External id": 261231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261231, "pid": 5, "tid": 7, "ts": 1716454225646325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597203, "dur": 12, "args": { "External id": 261231, "cbid": 211, "correlation": 261231 } }, { "ph": "s", "id": 261231, "pid": 76337, "tid": -914061504, "ts": 1716454225597203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225646349, "dur": 123, "args": { "External id": 261233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261233, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261233, "pid": 5, "tid": 7, "ts": 1716454225646349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597221, "dur": 6, "args": { "External id": 261233, "cbid": 211, "correlation": 261233 } }, { "ph": "s", "id": 261233, "pid": 76337, "tid": -914061504, "ts": 1716454225597221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225646473, "dur": 22, "args": { "External id": 261235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261235, "pid": 5, "tid": 7, "ts": 1716454225646473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597231, "dur": 5, "args": { "External id": 261235, "cbid": 211, "correlation": 261235 } }, { "ph": "s", "id": 261235, "pid": 76337, "tid": -914061504, "ts": 1716454225597231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225646496, "dur": 33, "args": { "External id": 261241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261241, "pid": 5, "tid": 7, "ts": 1716454225646496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597260, "dur": 8, "args": { "External id": 261241, "cbid": 211, "correlation": 261241 } }, { "ph": "s", "id": 261241, "pid": 76337, "tid": -914061504, "ts": 1716454225597260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225646530, "dur": 27, "args": { "External id": 261249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261249, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261249, "pid": 5, "tid": 7, "ts": 1716454225646530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597290, "dur": 9, "args": { "External id": 261249, "cbid": 211, "correlation": 261249 } }, { "ph": "s", "id": 261249, "pid": 76337, "tid": -914061504, "ts": 1716454225597290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225646558, "dur": 31, "args": { "External id": 261269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261269, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 261269, "pid": 5, "tid": 7, "ts": 1716454225646558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597363, "dur": 11, "args": { "External id": 261269, "cbid": 211, "correlation": 261269 } }, { "ph": "s", "id": 261269, "pid": 76337, "tid": -914061504, "ts": 1716454225597363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225646591, "dur": 5, "args": { "External id": 261281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261281, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 261281, "pid": 5, "tid": 7, "ts": 1716454225646591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597384, "dur": 6, "args": { "External id": 261281, "cbid": 211, "correlation": 261281 } }, { "ph": "s", "id": 261281, "pid": 76337, "tid": -914061504, "ts": 1716454225597384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225646597, "dur": 31, "args": { "External id": 261284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261284, "pid": 5, "tid": 7, "ts": 1716454225646597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597402, "dur": 6, "args": { "External id": 261284, "cbid": 211, "correlation": 261284 } }, { "ph": "s", "id": 261284, "pid": 76337, "tid": -914061504, "ts": 1716454225597402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225646629, "dur": 21, "args": { "External id": 261293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261293, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261293, "pid": 5, "tid": 7, "ts": 1716454225646629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597440, "dur": 10, "args": { "External id": 261293, "cbid": 211, "correlation": 261293 } }, { "ph": "s", "id": 261293, "pid": 76337, "tid": -914061504, "ts": 1716454225597440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225597491, "dur": 0, "args": { "External id": 261303, "cbid": 317, "correlation": 261303 } }, { "ph": "f", "id": 261303, "pid": 76337, "tid": -914061504, "ts": 1716454225597491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225597492, "dur": 0, "args": { "External id": 261304, "cbid": 203, "correlation": 261304 } }, { "ph": "f", "id": 261304, "pid": 76337, "tid": -914061504, "ts": 1716454225597492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225597492, "dur": 0, "args": { "External id": 261305, "cbid": 205, "correlation": 261305 } }, { "ph": "f", "id": 261305, "pid": 76337, "tid": -914061504, "ts": 1716454225597492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225646652, "dur": 22, "args": { "External id": 261309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261309, "pid": 5, "tid": 7, "ts": 1716454225646652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597506, "dur": 11, "args": { "External id": 261309, "cbid": 211, "correlation": 261309 } }, { "ph": "s", "id": 261309, "pid": 76337, "tid": -914061504, "ts": 1716454225597506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225646674, "dur": 44, "args": { "External id": 261311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261311, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261311, "pid": 5, "tid": 7, "ts": 1716454225646674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597520, "dur": 5, "args": { "External id": 261311, "cbid": 211, "correlation": 261311 } }, { "ph": "s", "id": 261311, "pid": 76337, "tid": -914061504, "ts": 1716454225597520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225646720, "dur": 663, "args": { "External id": 261313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261313, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261313, "pid": 5, "tid": 7, "ts": 1716454225646720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597531, "dur": 6, "args": { "External id": 261313, "cbid": 211, "correlation": 261313 } }, { "ph": "s", "id": 261313, "pid": 76337, "tid": -914061504, "ts": 1716454225597531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225647385, "dur": 22, "args": { "External id": 261315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261315, "pid": 5, "tid": 7, "ts": 1716454225647385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597541, "dur": 5, "args": { "External id": 261315, "cbid": 211, "correlation": 261315 } }, { "ph": "s", "id": 261315, "pid": 76337, "tid": -914061504, "ts": 1716454225597541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225647408, "dur": 34, "args": { "External id": 261321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261321, "pid": 5, "tid": 7, "ts": 1716454225647408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597568, "dur": 8, "args": { "External id": 261321, "cbid": 211, "correlation": 261321 } }, { "ph": "s", "id": 261321, "pid": 76337, "tid": -914061504, "ts": 1716454225597568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225647444, "dur": 3, "args": { "External id": 261329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261329, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 261329, "pid": 5, "tid": 7, "ts": 1716454225647444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597611, "dur": 9, "args": { "External id": 261329, "cbid": 211, "correlation": 261329 } }, { "ph": "s", "id": 261329, "pid": 76337, "tid": -914061504, "ts": 1716454225597611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225597675, "dur": 1, "args": { "External id": 261345, "cbid": 251, "correlation": 261345 } }, { "ph": "f", "id": 261345, "pid": 76337, "tid": -914061504, "ts": 1716454225597675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225597680, "dur": 0, "args": { "External id": 261347, "cbid": 251, "correlation": 261347 } }, { "ph": "f", "id": 261347, "pid": 76337, "tid": -914061504, "ts": 1716454225597680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225647448, "dur": 13, "args": { "External id": 261348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261348, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 261348, "pid": 5, "tid": 7, "ts": 1716454225647448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597682, "dur": 11, "args": { "External id": 261348, "cbid": 211, "correlation": 261348 } }, { "ph": "s", "id": 261348, "pid": 76337, "tid": -914061504, "ts": 1716454225597682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225647462, "dur": 5, "args": { "External id": 261350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 261350, "pid": 5, "tid": 7, "ts": 1716454225647462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597695, "dur": 6, "args": { "External id": 261350, "cbid": 211, "correlation": 261350 } }, { "ph": "s", "id": 261350, "pid": 76337, "tid": -914061504, "ts": 1716454225597695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225647469, "dur": 30, "args": { "External id": 261360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261360, "pid": 5, "tid": 7, "ts": 1716454225647469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597753, "dur": 12, "args": { "External id": 261360, "cbid": 211, "correlation": 261360 } }, { "ph": "s", "id": 261360, "pid": 76337, "tid": -914061504, "ts": 1716454225597753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225647500, "dur": 32, "args": { "External id": 261380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261380, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 261380, "pid": 5, "tid": 7, "ts": 1716454225647500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597818, "dur": 11, "args": { "External id": 261380, "cbid": 211, "correlation": 261380 } }, { "ph": "s", "id": 261380, "pid": 76337, "tid": -914061504, "ts": 1716454225597818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225647534, "dur": 4, "args": { "External id": 261392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261392, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 261392, "pid": 5, "tid": 7, "ts": 1716454225647534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597839, "dur": 6, "args": { "External id": 261392, "cbid": 211, "correlation": 261392 } }, { "ph": "s", "id": 261392, "pid": 76337, "tid": -914061504, "ts": 1716454225597839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225647539, "dur": 30, "args": { "External id": 261395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261395, "pid": 5, "tid": 7, "ts": 1716454225647539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597858, "dur": 6, "args": { "External id": 261395, "cbid": 211, "correlation": 261395 } }, { "ph": "s", "id": 261395, "pid": 76337, "tid": -914061504, "ts": 1716454225597858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225647571, "dur": 21, "args": { "External id": 261404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261404, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261404, "pid": 5, "tid": 7, "ts": 1716454225647571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597898, "dur": 11, "args": { "External id": 261404, "cbid": 211, "correlation": 261404 } }, { "ph": "s", "id": 261404, "pid": 76337, "tid": -914061504, "ts": 1716454225597898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225597960, "dur": 0, "args": { "External id": 261414, "cbid": 317, "correlation": 261414 } }, { "ph": "f", "id": 261414, "pid": 76337, "tid": -914061504, "ts": 1716454225597960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225597961, "dur": 0, "args": { "External id": 261415, "cbid": 203, "correlation": 261415 } }, { "ph": "f", "id": 261415, "pid": 76337, "tid": -914061504, "ts": 1716454225597961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225597962, "dur": 0, "args": { "External id": 261416, "cbid": 205, "correlation": 261416 } }, { "ph": "f", "id": 261416, "pid": 76337, "tid": -914061504, "ts": 1716454225597962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225647593, "dur": 24, "args": { "External id": 261420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261420, "pid": 5, "tid": 7, "ts": 1716454225647593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597983, "dur": 12, "args": { "External id": 261420, "cbid": 211, "correlation": 261420 } }, { "ph": "s", "id": 261420, "pid": 76337, "tid": -914061504, "ts": 1716454225597983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225647619, "dur": 44, "args": { "External id": 261422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261422, "pid": 5, "tid": 7, "ts": 1716454225647619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225597998, "dur": 5, "args": { "External id": 261422, "cbid": 211, "correlation": 261422 } }, { "ph": "s", "id": 261422, "pid": 76337, "tid": -914061504, "ts": 1716454225597998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225647664, "dur": 656, "args": { "External id": 261424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261424, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261424, "pid": 5, "tid": 7, "ts": 1716454225647664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598010, "dur": 6, "args": { "External id": 261424, "cbid": 211, "correlation": 261424 } }, { "ph": "s", "id": 261424, "pid": 76337, "tid": -914061504, "ts": 1716454225598010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225648322, "dur": 22, "args": { "External id": 261426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261426, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261426, "pid": 5, "tid": 7, "ts": 1716454225648322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598019, "dur": 5, "args": { "External id": 261426, "cbid": 211, "correlation": 261426 } }, { "ph": "s", "id": 261426, "pid": 76337, "tid": -914061504, "ts": 1716454225598019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225648346, "dur": 34, "args": { "External id": 261432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261432, "pid": 5, "tid": 7, "ts": 1716454225648346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598047, "dur": 9, "args": { "External id": 261432, "cbid": 211, "correlation": 261432 } }, { "ph": "s", "id": 261432, "pid": 76337, "tid": -914061504, "ts": 1716454225598047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225648381, "dur": 26, "args": { "External id": 261440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261440, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261440, "pid": 5, "tid": 7, "ts": 1716454225648381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598080, "dur": 8, "args": { "External id": 261440, "cbid": 211, "correlation": 261440 } }, { "ph": "s", "id": 261440, "pid": 76337, "tid": -914061504, "ts": 1716454225598080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225648409, "dur": 20, "args": { "External id": 261448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261448, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261448, "pid": 5, "tid": 7, "ts": 1716454225648409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598110, "dur": 8, "args": { "External id": 261448, "cbid": 211, "correlation": 261448 } }, { "ph": "s", "id": 261448, "pid": 76337, "tid": -914061504, "ts": 1716454225598110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225648430, "dur": 29, "args": { "External id": 261468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261468, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 261468, "pid": 5, "tid": 7, "ts": 1716454225648430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598187, "dur": 12, "args": { "External id": 261468, "cbid": 211, "correlation": 261468 } }, { "ph": "s", "id": 261468, "pid": 76337, "tid": -914061504, "ts": 1716454225598187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225648461, "dur": 4, "args": { "External id": 261480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261480, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 261480, "pid": 5, "tid": 7, "ts": 1716454225648461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598209, "dur": 6, "args": { "External id": 261480, "cbid": 211, "correlation": 261480 } }, { "ph": "s", "id": 261480, "pid": 76337, "tid": -914061504, "ts": 1716454225598209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225648466, "dur": 30, "args": { "External id": 261483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261483, "pid": 5, "tid": 7, "ts": 1716454225648466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598227, "dur": 7, "args": { "External id": 261483, "cbid": 211, "correlation": 261483 } }, { "ph": "s", "id": 261483, "pid": 76337, "tid": -914061504, "ts": 1716454225598227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225598285, "dur": 0, "args": { "External id": 261494, "cbid": 317, "correlation": 261494 } }, { "ph": "f", "id": 261494, "pid": 76337, "tid": -914061504, "ts": 1716454225598285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225598286, "dur": 0, "args": { "External id": 261495, "cbid": 203, "correlation": 261495 } }, { "ph": "f", "id": 261495, "pid": 76337, "tid": -914061504, "ts": 1716454225598286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225598287, "dur": 0, "args": { "External id": 261496, "cbid": 205, "correlation": 261496 } }, { "ph": "f", "id": 261496, "pid": 76337, "tid": -914061504, "ts": 1716454225598287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225648497, "dur": 22, "args": { "External id": 261500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261500, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261500, "pid": 5, "tid": 7, "ts": 1716454225648497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598300, "dur": 12, "args": { "External id": 261500, "cbid": 211, "correlation": 261500 } }, { "ph": "s", "id": 261500, "pid": 76337, "tid": -914061504, "ts": 1716454225598300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225648521, "dur": 117, "args": { "External id": 261502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261502, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261502, "pid": 5, "tid": 7, "ts": 1716454225648521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598319, "dur": 6, "args": { "External id": 261502, "cbid": 211, "correlation": 261502 } }, { "ph": "s", "id": 261502, "pid": 76337, "tid": -914061504, "ts": 1716454225598319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225648640, "dur": 22, "args": { "External id": 261504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261504, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261504, "pid": 5, "tid": 7, "ts": 1716454225648640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598328, "dur": 5, "args": { "External id": 261504, "cbid": 211, "correlation": 261504 } }, { "ph": "s", "id": 261504, "pid": 76337, "tid": -914061504, "ts": 1716454225598328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225648664, "dur": 33, "args": { "External id": 261510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261510, "pid": 5, "tid": 7, "ts": 1716454225648664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598356, "dur": 9, "args": { "External id": 261510, "cbid": 211, "correlation": 261510 } }, { "ph": "s", "id": 261510, "pid": 76337, "tid": -914061504, "ts": 1716454225598356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225648698, "dur": 189, "args": { "External id": 261519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261519, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261519, "pid": 5, "tid": 7, "ts": 1716454225648698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598437, "dur": 14, "args": { "External id": 261519, "cbid": 211, "correlation": 261519 } }, { "ph": "s", "id": 261519, "pid": 76337, "tid": -914061504, "ts": 1716454225598437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225648888, "dur": 66, "args": { "External id": 261541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261541, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261541, "pid": 5, "tid": 7, "ts": 1716454225648888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598493, "dur": 10, "args": { "External id": 261541, "cbid": 211, "correlation": 261541 } }, { "ph": "s", "id": 261541, "pid": 76337, "tid": -914061504, "ts": 1716454225598493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225598580, "dur": 2, "args": { "External id": 261552, "cbid": 251, "correlation": 261552 } }, { "ph": "f", "id": 261552, "pid": 76337, "tid": -914061504, "ts": 1716454225598580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225648955, "dur": 155, "args": { "External id": 261553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261553, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261553, "pid": 5, "tid": 7, "ts": 1716454225648955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598585, "dur": 13, "args": { "External id": 261553, "cbid": 211, "correlation": 261553 } }, { "ph": "s", "id": 261553, "pid": 76337, "tid": -914061504, "ts": 1716454225598585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225598656, "dur": 1, "args": { "External id": 261564, "cbid": 251, "correlation": 261564 } }, { "ph": "f", "id": 261564, "pid": 76337, "tid": -914061504, "ts": 1716454225598656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225649112, "dur": 146, "args": { "External id": 261565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261565, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261565, "pid": 5, "tid": 7, "ts": 1716454225649112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598660, "dur": 12, "args": { "External id": 261565, "cbid": 211, "correlation": 261565 } }, { "ph": "s", "id": 261565, "pid": 76337, "tid": -914061504, "ts": 1716454225598660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225598724, "dur": 1, "args": { "External id": 261576, "cbid": 251, "correlation": 261576 } }, { "ph": "f", "id": 261576, "pid": 76337, "tid": -914061504, "ts": 1716454225598724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225649259, "dur": 147, "args": { "External id": 261577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261577, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261577, "pid": 5, "tid": 7, "ts": 1716454225649259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598729, "dur": 11, "args": { "External id": 261577, "cbid": 211, "correlation": 261577 } }, { "ph": "s", "id": 261577, "pid": 76337, "tid": -914061504, "ts": 1716454225598729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225649407, "dur": 1987, "args": { "External id": 261598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261598, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 261598, "pid": 5, "tid": 7, "ts": 1716454225649407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598808, "dur": 13, "args": { "External id": 261598, "cbid": 211, "correlation": 261598 } }, { "ph": "s", "id": 261598, "pid": 76337, "tid": -914061504, "ts": 1716454225598808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225598906, "dur": 1, "args": { "External id": 261616, "cbid": 251, "correlation": 261616 } }, { "ph": "f", "id": 261616, "pid": 76337, "tid": -914061504, "ts": 1716454225598906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225651396, "dur": 150, "args": { "External id": 261618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261618, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 261618, "pid": 5, "tid": 7, "ts": 1716454225651396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598912, "dur": 14, "args": { "External id": 261618, "cbid": 211, "correlation": 261618 } }, { "ph": "s", "id": 261618, "pid": 76337, "tid": -914061504, "ts": 1716454225598912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225651547, "dur": 36, "args": { "External id": 261626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261626, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261626, "pid": 5, "tid": 7, "ts": 1716454225651547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225598991, "dur": 12, "args": { "External id": 261626, "cbid": 211, "correlation": 261626 } }, { "ph": "s", "id": 261626, "pid": 76337, "tid": -914061504, "ts": 1716454225598991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225651584, "dur": 50, "args": { "External id": 261634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261634, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261634, "pid": 5, "tid": 7, "ts": 1716454225651584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599031, "dur": 8, "args": { "External id": 261634, "cbid": 211, "correlation": 261634 } }, { "ph": "s", "id": 261634, "pid": 76337, "tid": -914061504, "ts": 1716454225599031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225651635, "dur": 32, "args": { "External id": 261645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261645, "pid": 5, "tid": 7, "ts": 1716454225651635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599102, "dur": 13, "args": { "External id": 261645, "cbid": 211, "correlation": 261645 } }, { "ph": "s", "id": 261645, "pid": 76337, "tid": -914061504, "ts": 1716454225599102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225651668, "dur": 35, "args": { "External id": 261667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261667, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261667, "pid": 5, "tid": 7, "ts": 1716454225651668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599133, "dur": 7, "args": { "External id": 261667, "cbid": 211, "correlation": 261667 } }, { "ph": "s", "id": 261667, "pid": 76337, "tid": -914061504, "ts": 1716454225599133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599217, "dur": 1, "args": { "External id": 261678, "cbid": 251, "correlation": 261678 } }, { "ph": "f", "id": 261678, "pid": 76337, "tid": -914061504, "ts": 1716454225599217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225651704, "dur": 91, "args": { "External id": 261679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261679, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261679, "pid": 5, "tid": 7, "ts": 1716454225651704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599223, "dur": 13, "args": { "External id": 261679, "cbid": 211, "correlation": 261679 } }, { "ph": "s", "id": 261679, "pid": 76337, "tid": -914061504, "ts": 1716454225599223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599292, "dur": 1, "args": { "External id": 261690, "cbid": 251, "correlation": 261690 } }, { "ph": "f", "id": 261690, "pid": 76337, "tid": -914061504, "ts": 1716454225599292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599295, "dur": 0, "args": { "External id": 261691, "cbid": 251, "correlation": 261691 } }, { "ph": "f", "id": 261691, "pid": 76337, "tid": -914061504, "ts": 1716454225599295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225651797, "dur": 12, "args": { "External id": 261692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261692, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 261692, "pid": 5, "tid": 7, "ts": 1716454225651797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599297, "dur": 12, "args": { "External id": 261692, "cbid": 211, "correlation": 261692 } }, { "ph": "s", "id": 261692, "pid": 76337, "tid": -914061504, "ts": 1716454225599297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225651810, "dur": 5, "args": { "External id": 261694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261694, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 261694, "pid": 5, "tid": 7, "ts": 1716454225651810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599311, "dur": 6, "args": { "External id": 261694, "cbid": 211, "correlation": 261694 } }, { "ph": "s", "id": 261694, "pid": 76337, "tid": -914061504, "ts": 1716454225599311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599368, "dur": 1, "args": { "External id": 261705, "cbid": 251, "correlation": 261705 } }, { "ph": "f", "id": 261705, "pid": 76337, "tid": -914061504, "ts": 1716454225599368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599371, "dur": 0, "args": { "External id": 261706, "cbid": 251, "correlation": 261706 } }, { "ph": "f", "id": 261706, "pid": 76337, "tid": -914061504, "ts": 1716454225599371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225651816, "dur": 7, "args": { "External id": 261707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261707, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 261707, "pid": 5, "tid": 7, "ts": 1716454225651816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599373, "dur": 12, "args": { "External id": 261707, "cbid": 211, "correlation": 261707 } }, { "ph": "s", "id": 261707, "pid": 76337, "tid": -914061504, "ts": 1716454225599373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225651825, "dur": 3, "args": { "External id": 261709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261709, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 261709, "pid": 5, "tid": 7, "ts": 1716454225651825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599387, "dur": 5, "args": { "External id": 261709, "cbid": 211, "correlation": 261709 } }, { "ph": "s", "id": 261709, "pid": 76337, "tid": -914061504, "ts": 1716454225599387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225651829, "dur": 93, "args": { "External id": 261730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261730, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 261730, "pid": 5, "tid": 7, "ts": 1716454225651829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599459, "dur": 13, "args": { "External id": 261730, "cbid": 211, "correlation": 261730 } }, { "ph": "s", "id": 261730, "pid": 76337, "tid": -914061504, "ts": 1716454225599459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599557, "dur": 1, "args": { "External id": 261748, "cbid": 251, "correlation": 261748 } }, { "ph": "f", "id": 261748, "pid": 76337, "tid": -914061504, "ts": 1716454225599557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225651924, "dur": 99, "args": { "External id": 261750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261750, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261750, "pid": 5, "tid": 7, "ts": 1716454225651924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599562, "dur": 14, "args": { "External id": 261750, "cbid": 211, "correlation": 261750 } }, { "ph": "s", "id": 261750, "pid": 76337, "tid": -914061504, "ts": 1716454225599562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225652024, "dur": 19, "args": { "External id": 261758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261758, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261758, "pid": 5, "tid": 7, "ts": 1716454225652024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599631, "dur": 12, "args": { "External id": 261758, "cbid": 211, "correlation": 261758 } }, { "ph": "s", "id": 261758, "pid": 76337, "tid": -914061504, "ts": 1716454225599631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225652045, "dur": 36, "args": { "External id": 261766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261766, "pid": 5, "tid": 7, "ts": 1716454225652045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599672, "dur": 10, "args": { "External id": 261766, "cbid": 211, "correlation": 261766 } }, { "ph": "s", "id": 261766, "pid": 76337, "tid": -914061504, "ts": 1716454225599672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225652082, "dur": 35, "args": { "External id": 261788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261788, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261788, "pid": 5, "tid": 7, "ts": 1716454225652082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599723, "dur": 11, "args": { "External id": 261788, "cbid": 211, "correlation": 261788 } }, { "ph": "s", "id": 261788, "pid": 76337, "tid": -914061504, "ts": 1716454225599723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599811, "dur": 1, "args": { "External id": 261804, "cbid": 251, "correlation": 261804 } }, { "ph": "f", "id": 261804, "pid": 76337, "tid": -914061504, "ts": 1716454225599811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225599816, "dur": 0, "args": { "External id": 261806, "cbid": 251, "correlation": 261806 } }, { "ph": "f", "id": 261806, "pid": 76337, "tid": -914061504, "ts": 1716454225599816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225652118, "dur": 548, "args": { "External id": 261807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261807, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 261807, "pid": 5, "tid": 7, "ts": 1716454225652118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599820, "dur": 13, "args": { "External id": 261807, "cbid": 211, "correlation": 261807 } }, { "ph": "s", "id": 261807, "pid": 76337, "tid": -914061504, "ts": 1716454225599820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225652667, "dur": 127, "args": { "External id": 261815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261815, "pid": 5, "tid": 7, "ts": 1716454225652667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599885, "dur": 13, "args": { "External id": 261815, "cbid": 211, "correlation": 261815 } }, { "ph": "s", "id": 261815, "pid": 76337, "tid": -914061504, "ts": 1716454225599885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225652795, "dur": 128, "args": { "External id": 261823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261823, "pid": 5, "tid": 7, "ts": 1716454225652795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225599915, "dur": 8, "args": { "External id": 261823, "cbid": 211, "correlation": 261823 } }, { "ph": "s", "id": 261823, "pid": 76337, "tid": -914061504, "ts": 1716454225599915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225600000, "dur": 1, "args": { "External id": 261839, "cbid": 251, "correlation": 261839 } }, { "ph": "f", "id": 261839, "pid": 76337, "tid": -914061504, "ts": 1716454225600000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225652925, "dur": 309, "args": { "External id": 261841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261841, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261841, "pid": 5, "tid": 7, "ts": 1716454225652925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600006, "dur": 13, "args": { "External id": 261841, "cbid": 211, "correlation": 261841 } }, { "ph": "s", "id": 261841, "pid": 76337, "tid": -914061504, "ts": 1716454225600006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225653235, "dur": 27, "args": { "External id": 261849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261849, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261849, "pid": 5, "tid": 7, "ts": 1716454225653235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600050, "dur": 10, "args": { "External id": 261849, "cbid": 211, "correlation": 261849 } }, { "ph": "s", "id": 261849, "pid": 76337, "tid": -914061504, "ts": 1716454225600050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225653264, "dur": 84, "args": { "External id": 261860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261860, "pid": 5, "tid": 7, "ts": 1716454225653264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600118, "dur": 12, "args": { "External id": 261860, "cbid": 211, "correlation": 261860 } }, { "ph": "s", "id": 261860, "pid": 76337, "tid": -914061504, "ts": 1716454225600118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225600181, "dur": 0, "args": { "External id": 261872, "cbid": 317, "correlation": 261872 } }, { "ph": "f", "id": 261872, "pid": 76337, "tid": -914061504, "ts": 1716454225600181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225600182, "dur": 0, "args": { "External id": 261873, "cbid": 203, "correlation": 261873 } }, { "ph": "f", "id": 261873, "pid": 76337, "tid": -914061504, "ts": 1716454225600182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225600183, "dur": 0, "args": { "External id": 261874, "cbid": 205, "correlation": 261874 } }, { "ph": "f", "id": 261874, "pid": 76337, "tid": -914061504, "ts": 1716454225600183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653349, "dur": 24, "args": { "External id": 261878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261878, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261878, "pid": 5, "tid": 7, "ts": 1716454225653349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600198, "dur": 12, "args": { "External id": 261878, "cbid": 211, "correlation": 261878 } }, { "ph": "s", "id": 261878, "pid": 76337, "tid": -914061504, "ts": 1716454225600198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225653374, "dur": 123, "args": { "External id": 261880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261880, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261880, "pid": 5, "tid": 7, "ts": 1716454225653374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600216, "dur": 6, "args": { "External id": 261880, "cbid": 211, "correlation": 261880 } }, { "ph": "s", "id": 261880, "pid": 76337, "tid": -914061504, "ts": 1716454225600216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653498, "dur": 22, "args": { "External id": 261882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261882, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261882, "pid": 5, "tid": 7, "ts": 1716454225653498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600226, "dur": 5, "args": { "External id": 261882, "cbid": 211, "correlation": 261882 } }, { "ph": "s", "id": 261882, "pid": 76337, "tid": -914061504, "ts": 1716454225600226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225653521, "dur": 32, "args": { "External id": 261888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261888, "pid": 5, "tid": 7, "ts": 1716454225653521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600254, "dur": 8, "args": { "External id": 261888, "cbid": 211, "correlation": 261888 } }, { "ph": "s", "id": 261888, "pid": 76337, "tid": -914061504, "ts": 1716454225600254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225653554, "dur": 26, "args": { "External id": 261896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261896, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261896, "pid": 5, "tid": 7, "ts": 1716454225653554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600286, "dur": 9, "args": { "External id": 261896, "cbid": 211, "correlation": 261896 } }, { "ph": "s", "id": 261896, "pid": 76337, "tid": -914061504, "ts": 1716454225600286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225600357, "dur": 0, "args": { "External id": 261906, "cbid": 317, "correlation": 261906 } }, { "ph": "f", "id": 261906, "pid": 76337, "tid": -914061504, "ts": 1716454225600357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225600357, "dur": 0, "args": { "External id": 261907, "cbid": 203, "correlation": 261907 } }, { "ph": "f", "id": 261907, "pid": 76337, "tid": -914061504, "ts": 1716454225600357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225600358, "dur": 0, "args": { "External id": 261908, "cbid": 205, "correlation": 261908 } }, { "ph": "f", "id": 261908, "pid": 76337, "tid": -914061504, "ts": 1716454225600358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653582, "dur": 23, "args": { "External id": 261912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261912, "pid": 5, "tid": 7, "ts": 1716454225653582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600372, "dur": 12, "args": { "External id": 261912, "cbid": 211, "correlation": 261912 } }, { "ph": "s", "id": 261912, "pid": 76337, "tid": -914061504, "ts": 1716454225600372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653607, "dur": 45, "args": { "External id": 261914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261914, "pid": 5, "tid": 7, "ts": 1716454225653607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600387, "dur": 6, "args": { "External id": 261914, "cbid": 211, "correlation": 261914 } }, { "ph": "s", "id": 261914, "pid": 76337, "tid": -914061504, "ts": 1716454225600387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225653653, "dur": 237, "args": { "External id": 261916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261916, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 261916, "pid": 5, "tid": 7, "ts": 1716454225653653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600400, "dur": 6, "args": { "External id": 261916, "cbid": 211, "correlation": 261916 } }, { "ph": "s", "id": 261916, "pid": 76337, "tid": -914061504, "ts": 1716454225600400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653892, "dur": 7, "args": { "External id": 261918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261918, "pid": 5, "tid": 7, "ts": 1716454225653892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600409, "dur": 5, "args": { "External id": 261918, "cbid": 211, "correlation": 261918 } }, { "ph": "s", "id": 261918, "pid": 76337, "tid": -914061504, "ts": 1716454225600409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225653899, "dur": 9, "args": { "External id": 261924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261924, "pid": 5, "tid": 7, "ts": 1716454225653899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600435, "dur": 10, "args": { "External id": 261924, "cbid": 211, "correlation": 261924 } }, { "ph": "s", "id": 261924, "pid": 76337, "tid": -914061504, "ts": 1716454225600435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225653910, "dur": 12, "args": { "External id": 261944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261944, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 261944, "pid": 5, "tid": 7, "ts": 1716454225653910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600527, "dur": 13, "args": { "External id": 261944, "cbid": 211, "correlation": 261944 } }, { "ph": "s", "id": 261944, "pid": 76337, "tid": -914061504, "ts": 1716454225600527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225653923, "dur": 4, "args": { "External id": 261956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261956, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 261956, "pid": 5, "tid": 7, "ts": 1716454225653923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600550, "dur": 6, "args": { "External id": 261956, "cbid": 211, "correlation": 261956 } }, { "ph": "s", "id": 261956, "pid": 76337, "tid": -914061504, "ts": 1716454225600550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225653929, "dur": 13, "args": { "External id": 261959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261959, "pid": 5, "tid": 7, "ts": 1716454225653929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600568, "dur": 6, "args": { "External id": 261959, "cbid": 211, "correlation": 261959 } }, { "ph": "s", "id": 261959, "pid": 76337, "tid": -914061504, "ts": 1716454225600568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225653943, "dur": 7, "args": { "External id": 261968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261968, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261968, "pid": 5, "tid": 7, "ts": 1716454225653943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600607, "dur": 10, "args": { "External id": 261968, "cbid": 211, "correlation": 261968 } }, { "ph": "s", "id": 261968, "pid": 76337, "tid": -914061504, "ts": 1716454225600607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225600659, "dur": 0, "args": { "External id": 261978, "cbid": 317, "correlation": 261978 } }, { "ph": "f", "id": 261978, "pid": 76337, "tid": -914061504, "ts": 1716454225600659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225600660, "dur": 0, "args": { "External id": 261979, "cbid": 203, "correlation": 261979 } }, { "ph": "f", "id": 261979, "pid": 76337, "tid": -914061504, "ts": 1716454225600660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225600661, "dur": 0, "args": { "External id": 261980, "cbid": 205, "correlation": 261980 } }, { "ph": "f", "id": 261980, "pid": 76337, "tid": -914061504, "ts": 1716454225600661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653951, "dur": 6, "args": { "External id": 261984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261984, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261984, "pid": 5, "tid": 7, "ts": 1716454225653951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600676, "dur": 11, "args": { "External id": 261984, "cbid": 211, "correlation": 261984 } }, { "ph": "s", "id": 261984, "pid": 76337, "tid": -914061504, "ts": 1716454225600676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225653958, "dur": 85, "args": { "External id": 261986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261986, "pid": 5, "tid": 7, "ts": 1716454225653958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600691, "dur": 6, "args": { "External id": 261986, "cbid": 211, "correlation": 261986 } }, { "ph": "s", "id": 261986, "pid": 76337, "tid": -914061504, "ts": 1716454225600691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225654046, "dur": 1, "args": { "External id": 261988, "device": 5, "context": 1, "stream": 7, "correlation": 261988, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 261988, "pid": 5, "tid": 7, "ts": 1716454225654046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225600705, "dur": 9, "args": { "External id": 261988, "cbid": 51, "correlation": 261988 } }, { "ph": "s", "id": 261988, "pid": 76337, "tid": -914061504, "ts": 1716454225600705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225654049, "dur": 546, "args": { "External id": 261989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261989, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 261989, "pid": 5, "tid": 7, "ts": 1716454225654049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600715, "dur": 8, "args": { "External id": 261989, "cbid": 211, "correlation": 261989 } }, { "ph": "s", "id": 261989, "pid": 76337, "tid": -914061504, "ts": 1716454225600715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225654597, "dur": 12, "args": { "External id": 261991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261991, "pid": 5, "tid": 7, "ts": 1716454225654597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600727, "dur": 5, "args": { "External id": 261991, "cbid": 211, "correlation": 261991 } }, { "ph": "s", "id": 261991, "pid": 76337, "tid": -914061504, "ts": 1716454225600727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225654610, "dur": 15, "args": { "External id": 261997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 261997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 261997, "pid": 5, "tid": 7, "ts": 1716454225654610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600755, "dur": 9, "args": { "External id": 261997, "cbid": 211, "correlation": 261997 } }, { "ph": "s", "id": 261997, "pid": 76337, "tid": -914061504, "ts": 1716454225600755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225654626, "dur": 3, "args": { "External id": 262005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 262005, "pid": 5, "tid": 7, "ts": 1716454225654626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600799, "dur": 9, "args": { "External id": 262005, "cbid": 211, "correlation": 262005 } }, { "ph": "s", "id": 262005, "pid": 76337, "tid": -914061504, "ts": 1716454225600799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225600864, "dur": 1, "args": { "External id": 262021, "cbid": 251, "correlation": 262021 } }, { "ph": "f", "id": 262021, "pid": 76337, "tid": -914061504, "ts": 1716454225600864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225600870, "dur": 0, "args": { "External id": 262023, "cbid": 251, "correlation": 262023 } }, { "ph": "f", "id": 262023, "pid": 76337, "tid": -914061504, "ts": 1716454225600870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225654631, "dur": 13, "args": { "External id": 262024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262024, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262024, "pid": 5, "tid": 7, "ts": 1716454225654631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600871, "dur": 11, "args": { "External id": 262024, "cbid": 211, "correlation": 262024 } }, { "ph": "s", "id": 262024, "pid": 76337, "tid": -914061504, "ts": 1716454225600871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225654646, "dur": 5, "args": { "External id": 262026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262026, "pid": 5, "tid": 7, "ts": 1716454225654646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600885, "dur": 5, "args": { "External id": 262026, "cbid": 211, "correlation": 262026 } }, { "ph": "s", "id": 262026, "pid": 76337, "tid": -914061504, "ts": 1716454225600885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225654652, "dur": 17, "args": { "External id": 262036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262036, "pid": 5, "tid": 7, "ts": 1716454225654652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225600942, "dur": 13, "args": { "External id": 262036, "cbid": 211, "correlation": 262036 } }, { "ph": "s", "id": 262036, "pid": 76337, "tid": -914061504, "ts": 1716454225600942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225654671, "dur": 18, "args": { "External id": 262056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262056, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 262056, "pid": 5, "tid": 7, "ts": 1716454225654671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601015, "dur": 11, "args": { "External id": 262056, "cbid": 211, "correlation": 262056 } }, { "ph": "s", "id": 262056, "pid": 76337, "tid": -914061504, "ts": 1716454225601015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225654690, "dur": 5, "args": { "External id": 262068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262068, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 262068, "pid": 5, "tid": 7, "ts": 1716454225654690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601036, "dur": 7, "args": { "External id": 262068, "cbid": 211, "correlation": 262068 } }, { "ph": "s", "id": 262068, "pid": 76337, "tid": -914061504, "ts": 1716454225601036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225654697, "dur": 17, "args": { "External id": 262071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262071, "pid": 5, "tid": 7, "ts": 1716454225654697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601056, "dur": 7, "args": { "External id": 262071, "cbid": 211, "correlation": 262071 } }, { "ph": "s", "id": 262071, "pid": 76337, "tid": -914061504, "ts": 1716454225601056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225654714, "dur": 11, "args": { "External id": 262080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262080, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262080, "pid": 5, "tid": 7, "ts": 1716454225654714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601098, "dur": 10, "args": { "External id": 262080, "cbid": 211, "correlation": 262080 } }, { "ph": "s", "id": 262080, "pid": 76337, "tid": -914061504, "ts": 1716454225601098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225601161, "dur": 0, "args": { "External id": 262090, "cbid": 317, "correlation": 262090 } }, { "ph": "f", "id": 262090, "pid": 76337, "tid": -914061504, "ts": 1716454225601161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225601162, "dur": 0, "args": { "External id": 262091, "cbid": 203, "correlation": 262091 } }, { "ph": "f", "id": 262091, "pid": 76337, "tid": -914061504, "ts": 1716454225601162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225601162, "dur": 0, "args": { "External id": 262092, "cbid": 205, "correlation": 262092 } }, { "ph": "f", "id": 262092, "pid": 76337, "tid": -914061504, "ts": 1716454225601162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225654727, "dur": 11, "args": { "External id": 262096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262096, "pid": 5, "tid": 7, "ts": 1716454225654727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601180, "dur": 12, "args": { "External id": 262096, "cbid": 211, "correlation": 262096 } }, { "ph": "s", "id": 262096, "pid": 76337, "tid": -914061504, "ts": 1716454225601180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225654739, "dur": 165, "args": { "External id": 262098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262098, "pid": 5, "tid": 7, "ts": 1716454225654739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601194, "dur": 6, "args": { "External id": 262098, "cbid": 211, "correlation": 262098 } }, { "ph": "s", "id": 262098, "pid": 76337, "tid": -914061504, "ts": 1716454225601194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225654907, "dur": 1, "args": { "External id": 262100, "device": 5, "context": 1, "stream": 7, "correlation": 262100, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 262100, "pid": 5, "tid": 7, "ts": 1716454225654907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225601207, "dur": 7, "args": { "External id": 262100, "cbid": 51, "correlation": 262100 } }, { "ph": "s", "id": 262100, "pid": 76337, "tid": -914061504, "ts": 1716454225601207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225654911, "dur": 674, "args": { "External id": 262101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262101, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262101, "pid": 5, "tid": 7, "ts": 1716454225654911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601215, "dur": 6, "args": { "External id": 262101, "cbid": 211, "correlation": 262101 } }, { "ph": "s", "id": 262101, "pid": 76337, "tid": -914061504, "ts": 1716454225601215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225655586, "dur": 13, "args": { "External id": 262103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262103, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262103, "pid": 5, "tid": 7, "ts": 1716454225655586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601226, "dur": 5, "args": { "External id": 262103, "cbid": 211, "correlation": 262103 } }, { "ph": "s", "id": 262103, "pid": 76337, "tid": -914061504, "ts": 1716454225601226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225655600, "dur": 15, "args": { "External id": 262109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262109, "pid": 5, "tid": 7, "ts": 1716454225655600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601255, "dur": 8, "args": { "External id": 262109, "cbid": 211, "correlation": 262109 } }, { "ph": "s", "id": 262109, "pid": 76337, "tid": -914061504, "ts": 1716454225601255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225601313, "dur": 0, "args": { "External id": 262119, "cbid": 317, "correlation": 262119 } }, { "ph": "f", "id": 262119, "pid": 76337, "tid": -914061504, "ts": 1716454225601313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225601314, "dur": 0, "args": { "External id": 262120, "cbid": 203, "correlation": 262120 } }, { "ph": "f", "id": 262120, "pid": 76337, "tid": -914061504, "ts": 1716454225601314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225601314, "dur": 0, "args": { "External id": 262121, "cbid": 205, "correlation": 262121 } }, { "ph": "f", "id": 262121, "pid": 76337, "tid": -914061504, "ts": 1716454225601314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225655617, "dur": 8, "args": { "External id": 262125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262125, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262125, "pid": 5, "tid": 7, "ts": 1716454225655617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601329, "dur": 11, "args": { "External id": 262125, "cbid": 211, "correlation": 262125 } }, { "ph": "s", "id": 262125, "pid": 76337, "tid": -914061504, "ts": 1716454225601329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225655626, "dur": 3, "args": { "External id": 262127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262127, "pid": 5, "tid": 7, "ts": 1716454225655626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601347, "dur": 6, "args": { "External id": 262127, "cbid": 211, "correlation": 262127 } }, { "ph": "s", "id": 262127, "pid": 76337, "tid": -914061504, "ts": 1716454225601347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225601356, "dur": 0, "args": { "External id": 262128, "cbid": 51, "correlation": 262128 } }, { "ph": "s", "id": 262128, "pid": 76337, "tid": -914061504, "ts": 1716454225601356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225655631, "dur": 60, "args": { "External id": 262129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262129, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 262129, "pid": 5, "tid": 7, "ts": 1716454225655631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601357, "dur": 5, "args": { "External id": 262129, "cbid": 211, "correlation": 262129 } }, { "ph": "s", "id": 262129, "pid": 76337, "tid": -914061504, "ts": 1716454225601357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225655692, "dur": 14, "args": { "External id": 262134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262134, "pid": 5, "tid": 7, "ts": 1716454225655692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601383, "dur": 9, "args": { "External id": 262134, "cbid": 211, "correlation": 262134 } }, { "ph": "s", "id": 262134, "pid": 76337, "tid": -914061504, "ts": 1716454225601383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225655708, "dur": 12, "args": { "External id": 262142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262142, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262142, "pid": 5, "tid": 7, "ts": 1716454225655708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601412, "dur": 8, "args": { "External id": 262142, "cbid": 211, "correlation": 262142 } }, { "ph": "s", "id": 262142, "pid": 76337, "tid": -914061504, "ts": 1716454225601412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225655721, "dur": 11, "args": { "External id": 262150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262150, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262150, "pid": 5, "tid": 7, "ts": 1716454225655721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601440, "dur": 9, "args": { "External id": 262150, "cbid": 211, "correlation": 262150 } }, { "ph": "s", "id": 262150, "pid": 76337, "tid": -914061504, "ts": 1716454225601440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225655733, "dur": 19, "args": { "External id": 262170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262170, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 262170, "pid": 5, "tid": 7, "ts": 1716454225655733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601520, "dur": 12, "args": { "External id": 262170, "cbid": 211, "correlation": 262170 } }, { "ph": "s", "id": 262170, "pid": 76337, "tid": -914061504, "ts": 1716454225601520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225655754, "dur": 4, "args": { "External id": 262182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262182, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 262182, "pid": 5, "tid": 7, "ts": 1716454225655754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601543, "dur": 7, "args": { "External id": 262182, "cbid": 211, "correlation": 262182 } }, { "ph": "s", "id": 262182, "pid": 76337, "tid": -914061504, "ts": 1716454225601543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225655760, "dur": 17, "args": { "External id": 262185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262185, "pid": 5, "tid": 7, "ts": 1716454225655760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601561, "dur": 7, "args": { "External id": 262185, "cbid": 211, "correlation": 262185 } }, { "ph": "s", "id": 262185, "pid": 76337, "tid": -914061504, "ts": 1716454225601561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225601617, "dur": 0, "args": { "External id": 262196, "cbid": 317, "correlation": 262196 } }, { "ph": "f", "id": 262196, "pid": 76337, "tid": -914061504, "ts": 1716454225601617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225601618, "dur": 0, "args": { "External id": 262197, "cbid": 203, "correlation": 262197 } }, { "ph": "f", "id": 262197, "pid": 76337, "tid": -914061504, "ts": 1716454225601618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225601619, "dur": 0, "args": { "External id": 262198, "cbid": 205, "correlation": 262198 } }, { "ph": "f", "id": 262198, "pid": 76337, "tid": -914061504, "ts": 1716454225601619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225655778, "dur": 12, "args": { "External id": 262202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262202, "pid": 5, "tid": 7, "ts": 1716454225655778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601634, "dur": 12, "args": { "External id": 262202, "cbid": 211, "correlation": 262202 } }, { "ph": "s", "id": 262202, "pid": 76337, "tid": -914061504, "ts": 1716454225601634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225655791, "dur": 3, "args": { "External id": 262204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262204, "pid": 5, "tid": 7, "ts": 1716454225655791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601652, "dur": 5, "args": { "External id": 262204, "cbid": 211, "correlation": 262204 } }, { "ph": "s", "id": 262204, "pid": 76337, "tid": -914061504, "ts": 1716454225601652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225601661, "dur": 0, "args": { "External id": 262205, "cbid": 51, "correlation": 262205 } }, { "ph": "s", "id": 262205, "pid": 76337, "tid": -914061504, "ts": 1716454225601661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225655796, "dur": 99, "args": { "External id": 262206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262206, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 262206, "pid": 5, "tid": 7, "ts": 1716454225655796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601662, "dur": 5, "args": { "External id": 262206, "cbid": 211, "correlation": 262206 } }, { "ph": "s", "id": 262206, "pid": 76337, "tid": -914061504, "ts": 1716454225601662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225655897, "dur": 16, "args": { "External id": 262211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262211, "pid": 5, "tid": 7, "ts": 1716454225655897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601689, "dur": 9, "args": { "External id": 262211, "cbid": 211, "correlation": 262211 } }, { "ph": "s", "id": 262211, "pid": 76337, "tid": -914061504, "ts": 1716454225601689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225655914, "dur": 86, "args": { "External id": 262220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262220, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262220, "pid": 5, "tid": 7, "ts": 1716454225655914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601770, "dur": 14, "args": { "External id": 262220, "cbid": 211, "correlation": 262220 } }, { "ph": "s", "id": 262220, "pid": 76337, "tid": -914061504, "ts": 1716454225601770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225656001, "dur": 30, "args": { "External id": 262242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262242, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262242, "pid": 5, "tid": 7, "ts": 1716454225656001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601827, "dur": 10, "args": { "External id": 262242, "cbid": 211, "correlation": 262242 } }, { "ph": "s", "id": 262242, "pid": 76337, "tid": -914061504, "ts": 1716454225601827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225601915, "dur": 1, "args": { "External id": 262253, "cbid": 251, "correlation": 262253 } }, { "ph": "f", "id": 262253, "pid": 76337, "tid": -914061504, "ts": 1716454225601915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225656033, "dur": 166, "args": { "External id": 262254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262254, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262254, "pid": 5, "tid": 7, "ts": 1716454225656033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225601921, "dur": 13, "args": { "External id": 262254, "cbid": 211, "correlation": 262254 } }, { "ph": "s", "id": 262254, "pid": 76337, "tid": -914061504, "ts": 1716454225601921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225601998, "dur": 1, "args": { "External id": 262265, "cbid": 251, "correlation": 262265 } }, { "ph": "f", "id": 262265, "pid": 76337, "tid": -914061504, "ts": 1716454225601998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225656200, "dur": 161, "args": { "External id": 262266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262266, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262266, "pid": 5, "tid": 7, "ts": 1716454225656200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602003, "dur": 12, "args": { "External id": 262266, "cbid": 211, "correlation": 262266 } }, { "ph": "s", "id": 262266, "pid": 76337, "tid": -914061504, "ts": 1716454225602003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602070, "dur": 1, "args": { "External id": 262277, "cbid": 251, "correlation": 262277 } }, { "ph": "f", "id": 262277, "pid": 76337, "tid": -914061504, "ts": 1716454225602070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225656363, "dur": 161, "args": { "External id": 262278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262278, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262278, "pid": 5, "tid": 7, "ts": 1716454225656363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602074, "dur": 11, "args": { "External id": 262278, "cbid": 211, "correlation": 262278 } }, { "ph": "s", "id": 262278, "pid": 76337, "tid": -914061504, "ts": 1716454225602074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225656525, "dur": 342, "args": { "External id": 262303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262303, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262303, "pid": 5, "tid": 7, "ts": 1716454225656525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602161, "dur": 13, "args": { "External id": 262303, "cbid": 211, "correlation": 262303 } }, { "ph": "s", "id": 262303, "pid": 76337, "tid": -914061504, "ts": 1716454225602161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602262, "dur": 1, "args": { "External id": 262321, "cbid": 251, "correlation": 262321 } }, { "ph": "f", "id": 262321, "pid": 76337, "tid": -914061504, "ts": 1716454225602262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225656869, "dur": 169, "args": { "External id": 262323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262323, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262323, "pid": 5, "tid": 7, "ts": 1716454225656869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602267, "dur": 13, "args": { "External id": 262323, "cbid": 211, "correlation": 262323 } }, { "ph": "s", "id": 262323, "pid": 76337, "tid": -914061504, "ts": 1716454225602267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225657039, "dur": 20, "args": { "External id": 262331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262331, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262331, "pid": 5, "tid": 7, "ts": 1716454225657039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602335, "dur": 12, "args": { "External id": 262331, "cbid": 211, "correlation": 262331 } }, { "ph": "s", "id": 262331, "pid": 76337, "tid": -914061504, "ts": 1716454225602335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225657060, "dur": 28, "args": { "External id": 262339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262339, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262339, "pid": 5, "tid": 7, "ts": 1716454225657060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602374, "dur": 9, "args": { "External id": 262339, "cbid": 211, "correlation": 262339 } }, { "ph": "s", "id": 262339, "pid": 76337, "tid": -914061504, "ts": 1716454225602374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225657089, "dur": 18, "args": { "External id": 262350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262350, "pid": 5, "tid": 7, "ts": 1716454225657089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602446, "dur": 13, "args": { "External id": 262350, "cbid": 211, "correlation": 262350 } }, { "ph": "s", "id": 262350, "pid": 76337, "tid": -914061504, "ts": 1716454225602446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225657108, "dur": 17, "args": { "External id": 262372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262372, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262372, "pid": 5, "tid": 7, "ts": 1716454225657108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602477, "dur": 7, "args": { "External id": 262372, "cbid": 211, "correlation": 262372 } }, { "ph": "s", "id": 262372, "pid": 76337, "tid": -914061504, "ts": 1716454225602477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602563, "dur": 1, "args": { "External id": 262383, "cbid": 251, "correlation": 262383 } }, { "ph": "f", "id": 262383, "pid": 76337, "tid": -914061504, "ts": 1716454225602563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225657126, "dur": 91, "args": { "External id": 262384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262384, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262384, "pid": 5, "tid": 7, "ts": 1716454225657126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602569, "dur": 14, "args": { "External id": 262384, "cbid": 211, "correlation": 262384 } }, { "ph": "s", "id": 262384, "pid": 76337, "tid": -914061504, "ts": 1716454225602569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602639, "dur": 1, "args": { "External id": 262395, "cbid": 251, "correlation": 262395 } }, { "ph": "f", "id": 262395, "pid": 76337, "tid": -914061504, "ts": 1716454225602639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602643, "dur": 0, "args": { "External id": 262396, "cbid": 251, "correlation": 262396 } }, { "ph": "f", "id": 262396, "pid": 76337, "tid": -914061504, "ts": 1716454225602643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225657218, "dur": 13, "args": { "External id": 262397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262397, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262397, "pid": 5, "tid": 7, "ts": 1716454225657218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602644, "dur": 13, "args": { "External id": 262397, "cbid": 211, "correlation": 262397 } }, { "ph": "s", "id": 262397, "pid": 76337, "tid": -914061504, "ts": 1716454225602644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225657232, "dur": 6, "args": { "External id": 262399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262399, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262399, "pid": 5, "tid": 7, "ts": 1716454225657232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602659, "dur": 6, "args": { "External id": 262399, "cbid": 211, "correlation": 262399 } }, { "ph": "s", "id": 262399, "pid": 76337, "tid": -914061504, "ts": 1716454225602659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602717, "dur": 1, "args": { "External id": 262410, "cbid": 251, "correlation": 262410 } }, { "ph": "f", "id": 262410, "pid": 76337, "tid": -914061504, "ts": 1716454225602717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602720, "dur": 0, "args": { "External id": 262411, "cbid": 251, "correlation": 262411 } }, { "ph": "f", "id": 262411, "pid": 76337, "tid": -914061504, "ts": 1716454225602720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225657240, "dur": 8, "args": { "External id": 262412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262412, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262412, "pid": 5, "tid": 7, "ts": 1716454225657240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602722, "dur": 12, "args": { "External id": 262412, "cbid": 211, "correlation": 262412 } }, { "ph": "s", "id": 262412, "pid": 76337, "tid": -914061504, "ts": 1716454225602722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225657249, "dur": 3, "args": { "External id": 262414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262414, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262414, "pid": 5, "tid": 7, "ts": 1716454225657249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602735, "dur": 5, "args": { "External id": 262414, "cbid": 211, "correlation": 262414 } }, { "ph": "s", "id": 262414, "pid": 76337, "tid": -914061504, "ts": 1716454225602735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225657254, "dur": 56, "args": { "External id": 262439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262439, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262439, "pid": 5, "tid": 7, "ts": 1716454225657254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602812, "dur": 12, "args": { "External id": 262439, "cbid": 211, "correlation": 262439 } }, { "ph": "s", "id": 262439, "pid": 76337, "tid": -914061504, "ts": 1716454225602812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225602911, "dur": 2, "args": { "External id": 262457, "cbid": 251, "correlation": 262457 } }, { "ph": "f", "id": 262457, "pid": 76337, "tid": -914061504, "ts": 1716454225602911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225657312, "dur": 93, "args": { "External id": 262459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262459, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262459, "pid": 5, "tid": 7, "ts": 1716454225657312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602917, "dur": 14, "args": { "External id": 262459, "cbid": 211, "correlation": 262459 } }, { "ph": "s", "id": 262459, "pid": 76337, "tid": -914061504, "ts": 1716454225602917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225657407, "dur": 9, "args": { "External id": 262467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262467, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262467, "pid": 5, "tid": 7, "ts": 1716454225657407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225602996, "dur": 12, "args": { "External id": 262467, "cbid": 211, "correlation": 262467 } }, { "ph": "s", "id": 262467, "pid": 76337, "tid": -914061504, "ts": 1716454225602996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225657417, "dur": 22, "args": { "External id": 262475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262475, "pid": 5, "tid": 7, "ts": 1716454225657417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603038, "dur": 9, "args": { "External id": 262475, "cbid": 211, "correlation": 262475 } }, { "ph": "s", "id": 262475, "pid": 76337, "tid": -914061504, "ts": 1716454225603038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225657440, "dur": 17, "args": { "External id": 262497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262497, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262497, "pid": 5, "tid": 7, "ts": 1716454225657440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603090, "dur": 10, "args": { "External id": 262497, "cbid": 211, "correlation": 262497 } }, { "ph": "s", "id": 262497, "pid": 76337, "tid": -914061504, "ts": 1716454225603090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225603181, "dur": 2, "args": { "External id": 262513, "cbid": 251, "correlation": 262513 } }, { "ph": "f", "id": 262513, "pid": 76337, "tid": -914061504, "ts": 1716454225603181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225603187, "dur": 0, "args": { "External id": 262515, "cbid": 251, "correlation": 262515 } }, { "ph": "f", "id": 262515, "pid": 76337, "tid": -914061504, "ts": 1716454225603187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225657459, "dur": 498, "args": { "External id": 262516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262516, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262516, "pid": 5, "tid": 7, "ts": 1716454225657459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603189, "dur": 14, "args": { "External id": 262516, "cbid": 211, "correlation": 262516 } }, { "ph": "s", "id": 262516, "pid": 76337, "tid": -914061504, "ts": 1716454225603189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225657959, "dur": 67, "args": { "External id": 262524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262524, "pid": 5, "tid": 7, "ts": 1716454225657959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603256, "dur": 13, "args": { "External id": 262524, "cbid": 211, "correlation": 262524 } }, { "ph": "s", "id": 262524, "pid": 76337, "tid": -914061504, "ts": 1716454225603256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225658027, "dur": 69, "args": { "External id": 262532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262532, "pid": 5, "tid": 7, "ts": 1716454225658027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603287, "dur": 8, "args": { "External id": 262532, "cbid": 211, "correlation": 262532 } }, { "ph": "s", "id": 262532, "pid": 76337, "tid": -914061504, "ts": 1716454225603287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225603370, "dur": 1, "args": { "External id": 262548, "cbid": 251, "correlation": 262548 } }, { "ph": "f", "id": 262548, "pid": 76337, "tid": -914061504, "ts": 1716454225603370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225658099, "dur": 1, "args": { "External id": 262550, "device": 5, "context": 1, "stream": 7, "correlation": 262550, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 262550, "pid": 5, "tid": 7, "ts": 1716454225658099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225603375, "dur": 11, "args": { "External id": 262550, "cbid": 51, "correlation": 262550 } }, { "ph": "s", "id": 262550, "pid": 76337, "tid": -914061504, "ts": 1716454225603375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225658103, "dur": 274, "args": { "External id": 262551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262551, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262551, "pid": 5, "tid": 7, "ts": 1716454225658103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603388, "dur": 11, "args": { "External id": 262551, "cbid": 211, "correlation": 262551 } }, { "ph": "s", "id": 262551, "pid": 76337, "tid": -914061504, "ts": 1716454225603388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225658378, "dur": 14, "args": { "External id": 262559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262559, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262559, "pid": 5, "tid": 7, "ts": 1716454225658378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603431, "dur": 10, "args": { "External id": 262559, "cbid": 211, "correlation": 262559 } }, { "ph": "s", "id": 262559, "pid": 76337, "tid": -914061504, "ts": 1716454225603431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225658393, "dur": 39, "args": { "External id": 262570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262570, "pid": 5, "tid": 7, "ts": 1716454225658393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603498, "dur": 12, "args": { "External id": 262570, "cbid": 211, "correlation": 262570 } }, { "ph": "s", "id": 262570, "pid": 76337, "tid": -914061504, "ts": 1716454225603498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225603562, "dur": 0, "args": { "External id": 262582, "cbid": 317, "correlation": 262582 } }, { "ph": "f", "id": 262582, "pid": 76337, "tid": -914061504, "ts": 1716454225603562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225603563, "dur": 0, "args": { "External id": 262583, "cbid": 203, "correlation": 262583 } }, { "ph": "f", "id": 262583, "pid": 76337, "tid": -914061504, "ts": 1716454225603563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225603565, "dur": 0, "args": { "External id": 262584, "cbid": 205, "correlation": 262584 } }, { "ph": "f", "id": 262584, "pid": 76337, "tid": -914061504, "ts": 1716454225603565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225658434, "dur": 13, "args": { "External id": 262588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262588, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262588, "pid": 5, "tid": 7, "ts": 1716454225658434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603580, "dur": 12, "args": { "External id": 262588, "cbid": 211, "correlation": 262588 } }, { "ph": "s", "id": 262588, "pid": 76337, "tid": -914061504, "ts": 1716454225603580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225658448, "dur": 4, "args": { "External id": 262590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262590, "pid": 5, "tid": 7, "ts": 1716454225658448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603596, "dur": 5, "args": { "External id": 262590, "cbid": 211, "correlation": 262590 } }, { "ph": "s", "id": 262590, "pid": 76337, "tid": -914061504, "ts": 1716454225603596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225603604, "dur": 0, "args": { "External id": 262591, "cbid": 51, "correlation": 262591 } }, { "ph": "s", "id": 262591, "pid": 76337, "tid": -914061504, "ts": 1716454225603604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225658453, "dur": 99, "args": { "External id": 262592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262592, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 262592, "pid": 5, "tid": 7, "ts": 1716454225658453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603605, "dur": 5, "args": { "External id": 262592, "cbid": 211, "correlation": 262592 } }, { "ph": "s", "id": 262592, "pid": 76337, "tid": -914061504, "ts": 1716454225603605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225658553, "dur": 17, "args": { "External id": 262597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262597, "pid": 5, "tid": 7, "ts": 1716454225658553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603633, "dur": 9, "args": { "External id": 262597, "cbid": 211, "correlation": 262597 } }, { "ph": "s", "id": 262597, "pid": 76337, "tid": -914061504, "ts": 1716454225603633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225658572, "dur": 12, "args": { "External id": 262605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262605, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262605, "pid": 5, "tid": 7, "ts": 1716454225658572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603664, "dur": 9, "args": { "External id": 262605, "cbid": 211, "correlation": 262605 } }, { "ph": "s", "id": 262605, "pid": 76337, "tid": -914061504, "ts": 1716454225603664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225658585, "dur": 19, "args": { "External id": 262625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262625, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 262625, "pid": 5, "tid": 7, "ts": 1716454225658585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603737, "dur": 11, "args": { "External id": 262625, "cbid": 211, "correlation": 262625 } }, { "ph": "s", "id": 262625, "pid": 76337, "tid": -914061504, "ts": 1716454225603737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225658605, "dur": 5, "args": { "External id": 262637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262637, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 262637, "pid": 5, "tid": 7, "ts": 1716454225658605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603759, "dur": 6, "args": { "External id": 262637, "cbid": 211, "correlation": 262637 } }, { "ph": "s", "id": 262637, "pid": 76337, "tid": -914061504, "ts": 1716454225603759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225658611, "dur": 18, "args": { "External id": 262640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262640, "pid": 5, "tid": 7, "ts": 1716454225658611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603777, "dur": 7, "args": { "External id": 262640, "cbid": 211, "correlation": 262640 } }, { "ph": "s", "id": 262640, "pid": 76337, "tid": -914061504, "ts": 1716454225603777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225658630, "dur": 12, "args": { "External id": 262649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262649, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262649, "pid": 5, "tid": 7, "ts": 1716454225658630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603814, "dur": 11, "args": { "External id": 262649, "cbid": 211, "correlation": 262649 } }, { "ph": "s", "id": 262649, "pid": 76337, "tid": -914061504, "ts": 1716454225603814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225603866, "dur": 0, "args": { "External id": 262659, "cbid": 317, "correlation": 262659 } }, { "ph": "f", "id": 262659, "pid": 76337, "tid": -914061504, "ts": 1716454225603866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225603866, "dur": 0, "args": { "External id": 262660, "cbid": 203, "correlation": 262660 } }, { "ph": "f", "id": 262660, "pid": 76337, "tid": -914061504, "ts": 1716454225603866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225603867, "dur": 0, "args": { "External id": 262661, "cbid": 205, "correlation": 262661 } }, { "ph": "f", "id": 262661, "pid": 76337, "tid": -914061504, "ts": 1716454225603867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225658643, "dur": 11, "args": { "External id": 262665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262665, "pid": 5, "tid": 7, "ts": 1716454225658643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603881, "dur": 11, "args": { "External id": 262665, "cbid": 211, "correlation": 262665 } }, { "ph": "s", "id": 262665, "pid": 76337, "tid": -914061504, "ts": 1716454225603881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225658656, "dur": 165, "args": { "External id": 262667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262667, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262667, "pid": 5, "tid": 7, "ts": 1716454225658656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603895, "dur": 5, "args": { "External id": 262667, "cbid": 211, "correlation": 262667 } }, { "ph": "s", "id": 262667, "pid": 76337, "tid": -914061504, "ts": 1716454225603895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225658823, "dur": 1, "args": { "External id": 262669, "device": 5, "context": 1, "stream": 7, "correlation": 262669, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 262669, "pid": 5, "tid": 7, "ts": 1716454225658823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225603906, "dur": 7, "args": { "External id": 262669, "cbid": 51, "correlation": 262669 } }, { "ph": "s", "id": 262669, "pid": 76337, "tid": -914061504, "ts": 1716454225603906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225658827, "dur": 674, "args": { "External id": 262670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262670, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262670, "pid": 5, "tid": 7, "ts": 1716454225658827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603914, "dur": 7, "args": { "External id": 262670, "cbid": 211, "correlation": 262670 } }, { "ph": "s", "id": 262670, "pid": 76337, "tid": -914061504, "ts": 1716454225603914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225659502, "dur": 14, "args": { "External id": 262672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262672, "pid": 5, "tid": 7, "ts": 1716454225659502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603925, "dur": 5, "args": { "External id": 262672, "cbid": 211, "correlation": 262672 } }, { "ph": "s", "id": 262672, "pid": 76337, "tid": -914061504, "ts": 1716454225603925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225659518, "dur": 15, "args": { "External id": 262678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262678, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262678, "pid": 5, "tid": 7, "ts": 1716454225659518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225603953, "dur": 8, "args": { "External id": 262678, "cbid": 211, "correlation": 262678 } }, { "ph": "s", "id": 262678, "pid": 76337, "tid": -914061504, "ts": 1716454225603953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225659534, "dur": 3, "args": { "External id": 262686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262686, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 262686, "pid": 5, "tid": 7, "ts": 1716454225659534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604005, "dur": 10, "args": { "External id": 262686, "cbid": 211, "correlation": 262686 } }, { "ph": "s", "id": 262686, "pid": 76337, "tid": -914061504, "ts": 1716454225604005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225604069, "dur": 1, "args": { "External id": 262702, "cbid": 251, "correlation": 262702 } }, { "ph": "f", "id": 262702, "pid": 76337, "tid": -914061504, "ts": 1716454225604069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225604075, "dur": 0, "args": { "External id": 262704, "cbid": 251, "correlation": 262704 } }, { "ph": "f", "id": 262704, "pid": 76337, "tid": -914061504, "ts": 1716454225604075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225659538, "dur": 13, "args": { "External id": 262705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262705, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262705, "pid": 5, "tid": 7, "ts": 1716454225659538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604077, "dur": 12, "args": { "External id": 262705, "cbid": 211, "correlation": 262705 } }, { "ph": "s", "id": 262705, "pid": 76337, "tid": -914061504, "ts": 1716454225604077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225659553, "dur": 5, "args": { "External id": 262707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262707, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262707, "pid": 5, "tid": 7, "ts": 1716454225659553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604091, "dur": 5, "args": { "External id": 262707, "cbid": 211, "correlation": 262707 } }, { "ph": "s", "id": 262707, "pid": 76337, "tid": -914061504, "ts": 1716454225604091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225659560, "dur": 17, "args": { "External id": 262717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262717, "pid": 5, "tid": 7, "ts": 1716454225659560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604148, "dur": 11, "args": { "External id": 262717, "cbid": 211, "correlation": 262717 } }, { "ph": "s", "id": 262717, "pid": 76337, "tid": -914061504, "ts": 1716454225604148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225659578, "dur": 18, "args": { "External id": 262737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262737, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 262737, "pid": 5, "tid": 7, "ts": 1716454225659578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604213, "dur": 11, "args": { "External id": 262737, "cbid": 211, "correlation": 262737 } }, { "ph": "s", "id": 262737, "pid": 76337, "tid": -914061504, "ts": 1716454225604213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225659598, "dur": 4, "args": { "External id": 262749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262749, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 262749, "pid": 5, "tid": 7, "ts": 1716454225659598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604234, "dur": 6, "args": { "External id": 262749, "cbid": 211, "correlation": 262749 } }, { "ph": "s", "id": 262749, "pid": 76337, "tid": -914061504, "ts": 1716454225604234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225659603, "dur": 17, "args": { "External id": 262752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262752, "pid": 5, "tid": 7, "ts": 1716454225659603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604252, "dur": 6, "args": { "External id": 262752, "cbid": 211, "correlation": 262752 } }, { "ph": "s", "id": 262752, "pid": 76337, "tid": -914061504, "ts": 1716454225604252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225659621, "dur": 11, "args": { "External id": 262761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262761, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262761, "pid": 5, "tid": 7, "ts": 1716454225659621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604292, "dur": 10, "args": { "External id": 262761, "cbid": 211, "correlation": 262761 } }, { "ph": "s", "id": 262761, "pid": 76337, "tid": -914061504, "ts": 1716454225604292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225604353, "dur": 0, "args": { "External id": 262771, "cbid": 317, "correlation": 262771 } }, { "ph": "f", "id": 262771, "pid": 76337, "tid": -914061504, "ts": 1716454225604353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225604354, "dur": 0, "args": { "External id": 262772, "cbid": 203, "correlation": 262772 } }, { "ph": "f", "id": 262772, "pid": 76337, "tid": -914061504, "ts": 1716454225604354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225604355, "dur": 0, "args": { "External id": 262773, "cbid": 205, "correlation": 262773 } }, { "ph": "f", "id": 262773, "pid": 76337, "tid": -914061504, "ts": 1716454225604355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225659633, "dur": 11, "args": { "External id": 262777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262777, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262777, "pid": 5, "tid": 7, "ts": 1716454225659633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604368, "dur": 12, "args": { "External id": 262777, "cbid": 211, "correlation": 262777 } }, { "ph": "s", "id": 262777, "pid": 76337, "tid": -914061504, "ts": 1716454225604368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225659646, "dur": 166, "args": { "External id": 262779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262779, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262779, "pid": 5, "tid": 7, "ts": 1716454225659646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604383, "dur": 6, "args": { "External id": 262779, "cbid": 211, "correlation": 262779 } }, { "ph": "s", "id": 262779, "pid": 76337, "tid": -914061504, "ts": 1716454225604383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225659814, "dur": 1, "args": { "External id": 262781, "device": 5, "context": 1, "stream": 7, "correlation": 262781, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 262781, "pid": 5, "tid": 7, "ts": 1716454225659814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225604395, "dur": 6, "args": { "External id": 262781, "cbid": 51, "correlation": 262781 } }, { "ph": "s", "id": 262781, "pid": 76337, "tid": -914061504, "ts": 1716454225604395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225659818, "dur": 660, "args": { "External id": 262782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262782, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262782, "pid": 5, "tid": 7, "ts": 1716454225659818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604402, "dur": 6, "args": { "External id": 262782, "cbid": 211, "correlation": 262782 } }, { "ph": "s", "id": 262782, "pid": 76337, "tid": -914061504, "ts": 1716454225604402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225660479, "dur": 13, "args": { "External id": 262784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262784, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262784, "pid": 5, "tid": 7, "ts": 1716454225660479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604414, "dur": 5, "args": { "External id": 262784, "cbid": 211, "correlation": 262784 } }, { "ph": "s", "id": 262784, "pid": 76337, "tid": -914061504, "ts": 1716454225604414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225660493, "dur": 15, "args": { "External id": 262790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262790, "pid": 5, "tid": 7, "ts": 1716454225660493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604442, "dur": 8, "args": { "External id": 262790, "cbid": 211, "correlation": 262790 } }, { "ph": "s", "id": 262790, "pid": 76337, "tid": -914061504, "ts": 1716454225604442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225660510, "dur": 12, "args": { "External id": 262798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262798, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262798, "pid": 5, "tid": 7, "ts": 1716454225660510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604474, "dur": 9, "args": { "External id": 262798, "cbid": 211, "correlation": 262798 } }, { "ph": "s", "id": 262798, "pid": 76337, "tid": -914061504, "ts": 1716454225604474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225660523, "dur": 10, "args": { "External id": 262806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262806, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262806, "pid": 5, "tid": 7, "ts": 1716454225660523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604503, "dur": 8, "args": { "External id": 262806, "cbid": 211, "correlation": 262806 } }, { "ph": "s", "id": 262806, "pid": 76337, "tid": -914061504, "ts": 1716454225604503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225660534, "dur": 19, "args": { "External id": 262826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262826, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 262826, "pid": 5, "tid": 7, "ts": 1716454225660534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604581, "dur": 13, "args": { "External id": 262826, "cbid": 211, "correlation": 262826 } }, { "ph": "s", "id": 262826, "pid": 76337, "tid": -914061504, "ts": 1716454225604581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225660555, "dur": 4, "args": { "External id": 262838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262838, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 262838, "pid": 5, "tid": 7, "ts": 1716454225660555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604603, "dur": 6, "args": { "External id": 262838, "cbid": 211, "correlation": 262838 } }, { "ph": "s", "id": 262838, "pid": 76337, "tid": -914061504, "ts": 1716454225604603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225660560, "dur": 16, "args": { "External id": 262841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262841, "pid": 5, "tid": 7, "ts": 1716454225660560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604621, "dur": 8, "args": { "External id": 262841, "cbid": 211, "correlation": 262841 } }, { "ph": "s", "id": 262841, "pid": 76337, "tid": -914061504, "ts": 1716454225604621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225604678, "dur": 0, "args": { "External id": 262852, "cbid": 317, "correlation": 262852 } }, { "ph": "f", "id": 262852, "pid": 76337, "tid": -914061504, "ts": 1716454225604678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225604679, "dur": 0, "args": { "External id": 262853, "cbid": 203, "correlation": 262853 } }, { "ph": "f", "id": 262853, "pid": 76337, "tid": -914061504, "ts": 1716454225604679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225604680, "dur": 0, "args": { "External id": 262854, "cbid": 205, "correlation": 262854 } }, { "ph": "f", "id": 262854, "pid": 76337, "tid": -914061504, "ts": 1716454225604680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225660578, "dur": 11, "args": { "External id": 262858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262858, "pid": 5, "tid": 7, "ts": 1716454225660578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604694, "dur": 12, "args": { "External id": 262858, "cbid": 211, "correlation": 262858 } }, { "ph": "s", "id": 262858, "pid": 76337, "tid": -914061504, "ts": 1716454225604694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225660590, "dur": 4, "args": { "External id": 262860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 262860, "pid": 5, "tid": 7, "ts": 1716454225660590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604710, "dur": 6, "args": { "External id": 262860, "cbid": 211, "correlation": 262860 } }, { "ph": "s", "id": 262860, "pid": 76337, "tid": -914061504, "ts": 1716454225604710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225604719, "dur": 0, "args": { "External id": 262861, "cbid": 51, "correlation": 262861 } }, { "ph": "s", "id": 262861, "pid": 76337, "tid": -914061504, "ts": 1716454225604719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225660596, "dur": 96, "args": { "External id": 262862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262862, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 262862, "pid": 5, "tid": 7, "ts": 1716454225660596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604719, "dur": 5, "args": { "External id": 262862, "cbid": 211, "correlation": 262862 } }, { "ph": "s", "id": 262862, "pid": 76337, "tid": -914061504, "ts": 1716454225604719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225660693, "dur": 16, "args": { "External id": 262867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262867, "pid": 5, "tid": 7, "ts": 1716454225660693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604746, "dur": 9, "args": { "External id": 262867, "cbid": 211, "correlation": 262867 } }, { "ph": "s", "id": 262867, "pid": 76337, "tid": -914061504, "ts": 1716454225604746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225660711, "dur": 86, "args": { "External id": 262876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262876, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262876, "pid": 5, "tid": 7, "ts": 1716454225660711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604827, "dur": 15, "args": { "External id": 262876, "cbid": 211, "correlation": 262876 } }, { "ph": "s", "id": 262876, "pid": 76337, "tid": -914061504, "ts": 1716454225604827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225660798, "dur": 31, "args": { "External id": 262898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262898, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262898, "pid": 5, "tid": 7, "ts": 1716454225660798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604884, "dur": 11, "args": { "External id": 262898, "cbid": 211, "correlation": 262898 } }, { "ph": "s", "id": 262898, "pid": 76337, "tid": -914061504, "ts": 1716454225604884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225604971, "dur": 8, "args": { "External id": 262909, "cbid": 251, "correlation": 262909 } }, { "ph": "f", "id": 262909, "pid": 76337, "tid": -914061504, "ts": 1716454225604971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225660829, "dur": 168, "args": { "External id": 262910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262910, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262910, "pid": 5, "tid": 7, "ts": 1716454225660829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225604984, "dur": 15, "args": { "External id": 262910, "cbid": 211, "correlation": 262910 } }, { "ph": "s", "id": 262910, "pid": 76337, "tid": -914061504, "ts": 1716454225604984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605057, "dur": 1, "args": { "External id": 262921, "cbid": 251, "correlation": 262921 } }, { "ph": "f", "id": 262921, "pid": 76337, "tid": -914061504, "ts": 1716454225605057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225660999, "dur": 160, "args": { "External id": 262922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262922, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262922, "pid": 5, "tid": 7, "ts": 1716454225660999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605061, "dur": 11, "args": { "External id": 262922, "cbid": 211, "correlation": 262922 } }, { "ph": "s", "id": 262922, "pid": 76337, "tid": -914061504, "ts": 1716454225605061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605125, "dur": 1, "args": { "External id": 262933, "cbid": 251, "correlation": 262933 } }, { "ph": "f", "id": 262933, "pid": 76337, "tid": -914061504, "ts": 1716454225605125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225661161, "dur": 161, "args": { "External id": 262934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262934, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262934, "pid": 5, "tid": 7, "ts": 1716454225661161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605129, "dur": 11, "args": { "External id": 262934, "cbid": 211, "correlation": 262934 } }, { "ph": "s", "id": 262934, "pid": 76337, "tid": -914061504, "ts": 1716454225605129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225661323, "dur": 343, "args": { "External id": 262959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262959, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262959, "pid": 5, "tid": 7, "ts": 1716454225661323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605214, "dur": 12, "args": { "External id": 262959, "cbid": 211, "correlation": 262959 } }, { "ph": "s", "id": 262959, "pid": 76337, "tid": -914061504, "ts": 1716454225605214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605313, "dur": 1, "args": { "External id": 262977, "cbid": 251, "correlation": 262977 } }, { "ph": "f", "id": 262977, "pid": 76337, "tid": -914061504, "ts": 1716454225605313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225661667, "dur": 145, "args": { "External id": 262979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262979, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 262979, "pid": 5, "tid": 7, "ts": 1716454225661667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605319, "dur": 13, "args": { "External id": 262979, "cbid": 211, "correlation": 262979 } }, { "ph": "s", "id": 262979, "pid": 76337, "tid": -914061504, "ts": 1716454225605319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225661814, "dur": 19, "args": { "External id": 262987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262987, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262987, "pid": 5, "tid": 7, "ts": 1716454225661814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605388, "dur": 13, "args": { "External id": 262987, "cbid": 211, "correlation": 262987 } }, { "ph": "s", "id": 262987, "pid": 76337, "tid": -914061504, "ts": 1716454225605388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225661834, "dur": 27, "args": { "External id": 262995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 262995, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 262995, "pid": 5, "tid": 7, "ts": 1716454225661834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605427, "dur": 8, "args": { "External id": 262995, "cbid": 211, "correlation": 262995 } }, { "ph": "s", "id": 262995, "pid": 76337, "tid": -914061504, "ts": 1716454225605427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225661863, "dur": 19, "args": { "External id": 263006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263006, "pid": 5, "tid": 7, "ts": 1716454225661863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605498, "dur": 12, "args": { "External id": 263006, "cbid": 211, "correlation": 263006 } }, { "ph": "s", "id": 263006, "pid": 76337, "tid": -914061504, "ts": 1716454225605498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225661883, "dur": 16, "args": { "External id": 263028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263028, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263028, "pid": 5, "tid": 7, "ts": 1716454225661883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605529, "dur": 7, "args": { "External id": 263028, "cbid": 211, "correlation": 263028 } }, { "ph": "s", "id": 263028, "pid": 76337, "tid": -914061504, "ts": 1716454225605529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605613, "dur": 1, "args": { "External id": 263039, "cbid": 251, "correlation": 263039 } }, { "ph": "f", "id": 263039, "pid": 76337, "tid": -914061504, "ts": 1716454225605613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225661901, "dur": 92, "args": { "External id": 263040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263040, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 263040, "pid": 5, "tid": 7, "ts": 1716454225661901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605619, "dur": 13, "args": { "External id": 263040, "cbid": 211, "correlation": 263040 } }, { "ph": "s", "id": 263040, "pid": 76337, "tid": -914061504, "ts": 1716454225605619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605688, "dur": 1, "args": { "External id": 263051, "cbid": 251, "correlation": 263051 } }, { "ph": "f", "id": 263051, "pid": 76337, "tid": -914061504, "ts": 1716454225605688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605692, "dur": 0, "args": { "External id": 263052, "cbid": 251, "correlation": 263052 } }, { "ph": "f", "id": 263052, "pid": 76337, "tid": -914061504, "ts": 1716454225605692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225661994, "dur": 12, "args": { "External id": 263053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263053, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263053, "pid": 5, "tid": 7, "ts": 1716454225661994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605694, "dur": 13, "args": { "External id": 263053, "cbid": 211, "correlation": 263053 } }, { "ph": "s", "id": 263053, "pid": 76337, "tid": -914061504, "ts": 1716454225605694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225662007, "dur": 6, "args": { "External id": 263055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263055, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263055, "pid": 5, "tid": 7, "ts": 1716454225662007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605708, "dur": 6, "args": { "External id": 263055, "cbid": 211, "correlation": 263055 } }, { "ph": "s", "id": 263055, "pid": 76337, "tid": -914061504, "ts": 1716454225605708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605766, "dur": 1, "args": { "External id": 263066, "cbid": 251, "correlation": 263066 } }, { "ph": "f", "id": 263066, "pid": 76337, "tid": -914061504, "ts": 1716454225605766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605769, "dur": 0, "args": { "External id": 263067, "cbid": 251, "correlation": 263067 } }, { "ph": "f", "id": 263067, "pid": 76337, "tid": -914061504, "ts": 1716454225605769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225662014, "dur": 8, "args": { "External id": 263068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263068, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263068, "pid": 5, "tid": 7, "ts": 1716454225662014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605771, "dur": 12, "args": { "External id": 263068, "cbid": 211, "correlation": 263068 } }, { "ph": "s", "id": 263068, "pid": 76337, "tid": -914061504, "ts": 1716454225605771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225662024, "dur": 3, "args": { "External id": 263070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263070, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263070, "pid": 5, "tid": 7, "ts": 1716454225662024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605784, "dur": 5, "args": { "External id": 263070, "cbid": 211, "correlation": 263070 } }, { "ph": "s", "id": 263070, "pid": 76337, "tid": -914061504, "ts": 1716454225605784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225662028, "dur": 55, "args": { "External id": 263095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263095, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263095, "pid": 5, "tid": 7, "ts": 1716454225662028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605861, "dur": 12, "args": { "External id": 263095, "cbid": 211, "correlation": 263095 } }, { "ph": "s", "id": 263095, "pid": 76337, "tid": -914061504, "ts": 1716454225605861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225605959, "dur": 1, "args": { "External id": 263113, "cbid": 251, "correlation": 263113 } }, { "ph": "f", "id": 263113, "pid": 76337, "tid": -914061504, "ts": 1716454225605959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225662084, "dur": 92, "args": { "External id": 263115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263115, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 263115, "pid": 5, "tid": 7, "ts": 1716454225662084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225605965, "dur": 21, "args": { "External id": 263115, "cbid": 211, "correlation": 263115 } }, { "ph": "s", "id": 263115, "pid": 76337, "tid": -914061504, "ts": 1716454225605965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225662178, "dur": 10, "args": { "External id": 263123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263123, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263123, "pid": 5, "tid": 7, "ts": 1716454225662178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606044, "dur": 13, "args": { "External id": 263123, "cbid": 211, "correlation": 263123 } }, { "ph": "s", "id": 263123, "pid": 76337, "tid": -914061504, "ts": 1716454225606044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225662189, "dur": 21, "args": { "External id": 263131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263131, "pid": 5, "tid": 7, "ts": 1716454225662189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606086, "dur": 9, "args": { "External id": 263131, "cbid": 211, "correlation": 263131 } }, { "ph": "s", "id": 263131, "pid": 76337, "tid": -914061504, "ts": 1716454225606086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225662211, "dur": 18, "args": { "External id": 263153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263153, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263153, "pid": 5, "tid": 7, "ts": 1716454225662211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606138, "dur": 10, "args": { "External id": 263153, "cbid": 211, "correlation": 263153 } }, { "ph": "s", "id": 263153, "pid": 76337, "tid": -914061504, "ts": 1716454225606138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225606225, "dur": 1, "args": { "External id": 263169, "cbid": 251, "correlation": 263169 } }, { "ph": "f", "id": 263169, "pid": 76337, "tid": -914061504, "ts": 1716454225606225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225606230, "dur": 0, "args": { "External id": 263171, "cbid": 251, "correlation": 263171 } }, { "ph": "f", "id": 263171, "pid": 76337, "tid": -914061504, "ts": 1716454225606230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225662231, "dur": 498, "args": { "External id": 263172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263172, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263172, "pid": 5, "tid": 7, "ts": 1716454225662231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606232, "dur": 13, "args": { "External id": 263172, "cbid": 211, "correlation": 263172 } }, { "ph": "s", "id": 263172, "pid": 76337, "tid": -914061504, "ts": 1716454225606232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225662730, "dur": 66, "args": { "External id": 263180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263180, "pid": 5, "tid": 7, "ts": 1716454225662730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606297, "dur": 13, "args": { "External id": 263180, "cbid": 211, "correlation": 263180 } }, { "ph": "s", "id": 263180, "pid": 76337, "tid": -914061504, "ts": 1716454225606297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225662797, "dur": 68, "args": { "External id": 263188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263188, "pid": 5, "tid": 7, "ts": 1716454225662797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606327, "dur": 8, "args": { "External id": 263188, "cbid": 211, "correlation": 263188 } }, { "ph": "s", "id": 263188, "pid": 76337, "tid": -914061504, "ts": 1716454225606327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225606406, "dur": 1, "args": { "External id": 263204, "cbid": 251, "correlation": 263204 } }, { "ph": "f", "id": 263204, "pid": 76337, "tid": -914061504, "ts": 1716454225606406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225662868, "dur": 1, "args": { "External id": 263206, "device": 5, "context": 1, "stream": 7, "correlation": 263206, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 263206, "pid": 5, "tid": 7, "ts": 1716454225662868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225606411, "dur": 9, "args": { "External id": 263206, "cbid": 51, "correlation": 263206 } }, { "ph": "s", "id": 263206, "pid": 76337, "tid": -914061504, "ts": 1716454225606411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225662871, "dur": 275, "args": { "External id": 263207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263207, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 263207, "pid": 5, "tid": 7, "ts": 1716454225662871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606422, "dur": 11, "args": { "External id": 263207, "cbid": 211, "correlation": 263207 } }, { "ph": "s", "id": 263207, "pid": 76337, "tid": -914061504, "ts": 1716454225606422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225663148, "dur": 13, "args": { "External id": 263215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263215, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263215, "pid": 5, "tid": 7, "ts": 1716454225663148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606465, "dur": 10, "args": { "External id": 263215, "cbid": 211, "correlation": 263215 } }, { "ph": "s", "id": 263215, "pid": 76337, "tid": -914061504, "ts": 1716454225606465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225663163, "dur": 38, "args": { "External id": 263226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263226, "pid": 5, "tid": 7, "ts": 1716454225663163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606532, "dur": 12, "args": { "External id": 263226, "cbid": 211, "correlation": 263226 } }, { "ph": "s", "id": 263226, "pid": 76337, "tid": -914061504, "ts": 1716454225606532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225606596, "dur": 0, "args": { "External id": 263238, "cbid": 317, "correlation": 263238 } }, { "ph": "f", "id": 263238, "pid": 76337, "tid": -914061504, "ts": 1716454225606596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225606597, "dur": 0, "args": { "External id": 263239, "cbid": 203, "correlation": 263239 } }, { "ph": "f", "id": 263239, "pid": 76337, "tid": -914061504, "ts": 1716454225606597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225606598, "dur": 0, "args": { "External id": 263240, "cbid": 205, "correlation": 263240 } }, { "ph": "f", "id": 263240, "pid": 76337, "tid": -914061504, "ts": 1716454225606598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663202, "dur": 13, "args": { "External id": 263244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263244, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263244, "pid": 5, "tid": 7, "ts": 1716454225663202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606613, "dur": 12, "args": { "External id": 263244, "cbid": 211, "correlation": 263244 } }, { "ph": "s", "id": 263244, "pid": 76337, "tid": -914061504, "ts": 1716454225606613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225663216, "dur": 4, "args": { "External id": 263246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 263246, "pid": 5, "tid": 7, "ts": 1716454225663216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606630, "dur": 6, "args": { "External id": 263246, "cbid": 211, "correlation": 263246 } }, { "ph": "s", "id": 263246, "pid": 76337, "tid": -914061504, "ts": 1716454225606630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225606638, "dur": 0, "args": { "External id": 263247, "cbid": 51, "correlation": 263247 } }, { "ph": "s", "id": 263247, "pid": 76337, "tid": -914061504, "ts": 1716454225606638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225663222, "dur": 99, "args": { "External id": 263248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263248, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 263248, "pid": 5, "tid": 7, "ts": 1716454225663222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606639, "dur": 5, "args": { "External id": 263248, "cbid": 211, "correlation": 263248 } }, { "ph": "s", "id": 263248, "pid": 76337, "tid": -914061504, "ts": 1716454225606639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225663322, "dur": 16, "args": { "External id": 263253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263253, "pid": 5, "tid": 7, "ts": 1716454225663322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606666, "dur": 8, "args": { "External id": 263253, "cbid": 211, "correlation": 263253 } }, { "ph": "s", "id": 263253, "pid": 76337, "tid": -914061504, "ts": 1716454225606666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225663340, "dur": 11, "args": { "External id": 263261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263261, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263261, "pid": 5, "tid": 7, "ts": 1716454225663340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606697, "dur": 9, "args": { "External id": 263261, "cbid": 211, "correlation": 263261 } }, { "ph": "s", "id": 263261, "pid": 76337, "tid": -914061504, "ts": 1716454225606697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225606766, "dur": 0, "args": { "External id": 263271, "cbid": 317, "correlation": 263271 } }, { "ph": "f", "id": 263271, "pid": 76337, "tid": -914061504, "ts": 1716454225606766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225606767, "dur": 0, "args": { "External id": 263272, "cbid": 203, "correlation": 263272 } }, { "ph": "f", "id": 263272, "pid": 76337, "tid": -914061504, "ts": 1716454225606767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225606768, "dur": 0, "args": { "External id": 263273, "cbid": 205, "correlation": 263273 } }, { "ph": "f", "id": 263273, "pid": 76337, "tid": -914061504, "ts": 1716454225606768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663353, "dur": 13, "args": { "External id": 263277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263277, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263277, "pid": 5, "tid": 7, "ts": 1716454225663353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606782, "dur": 12, "args": { "External id": 263277, "cbid": 211, "correlation": 263277 } }, { "ph": "s", "id": 263277, "pid": 76337, "tid": -914061504, "ts": 1716454225606782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663366, "dur": 166, "args": { "External id": 263279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263279, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263279, "pid": 5, "tid": 7, "ts": 1716454225663366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606797, "dur": 6, "args": { "External id": 263279, "cbid": 211, "correlation": 263279 } }, { "ph": "s", "id": 263279, "pid": 76337, "tid": -914061504, "ts": 1716454225606797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225663535, "dur": 1, "args": { "External id": 263281, "device": 5, "context": 1, "stream": 7, "correlation": 263281, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 263281, "pid": 5, "tid": 7, "ts": 1716454225663535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225606809, "dur": 6, "args": { "External id": 263281, "cbid": 51, "correlation": 263281 } }, { "ph": "s", "id": 263281, "pid": 76337, "tid": -914061504, "ts": 1716454225606809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225663538, "dur": 202, "args": { "External id": 263282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263282, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 263282, "pid": 5, "tid": 7, "ts": 1716454225663538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606817, "dur": 8, "args": { "External id": 263282, "cbid": 211, "correlation": 263282 } }, { "ph": "s", "id": 263282, "pid": 76337, "tid": -914061504, "ts": 1716454225606817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663742, "dur": 6, "args": { "External id": 263284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263284, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263284, "pid": 5, "tid": 7, "ts": 1716454225663742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606829, "dur": 5, "args": { "External id": 263284, "cbid": 211, "correlation": 263284 } }, { "ph": "s", "id": 263284, "pid": 76337, "tid": -914061504, "ts": 1716454225606829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225663749, "dur": 6, "args": { "External id": 263290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263290, "pid": 5, "tid": 7, "ts": 1716454225663749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606858, "dur": 8, "args": { "External id": 263290, "cbid": 211, "correlation": 263290 } }, { "ph": "s", "id": 263290, "pid": 76337, "tid": -914061504, "ts": 1716454225606858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225663757, "dur": 11, "args": { "External id": 263310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263310, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263310, "pid": 5, "tid": 7, "ts": 1716454225663757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606948, "dur": 13, "args": { "External id": 263310, "cbid": 211, "correlation": 263310 } }, { "ph": "s", "id": 263310, "pid": 76337, "tid": -914061504, "ts": 1716454225606948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225663769, "dur": 5, "args": { "External id": 263322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263322, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263322, "pid": 5, "tid": 7, "ts": 1716454225663769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225606972, "dur": 15, "args": { "External id": 263322, "cbid": 211, "correlation": 263322 } }, { "ph": "s", "id": 263322, "pid": 76337, "tid": -914061504, "ts": 1716454225606972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225663775, "dur": 8, "args": { "External id": 263325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263325, "pid": 5, "tid": 7, "ts": 1716454225663775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607000, "dur": 8, "args": { "External id": 263325, "cbid": 211, "correlation": 263325 } }, { "ph": "s", "id": 263325, "pid": 76337, "tid": -914061504, "ts": 1716454225607000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225663785, "dur": 5, "args": { "External id": 263334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263334, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263334, "pid": 5, "tid": 7, "ts": 1716454225663785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607041, "dur": 10, "args": { "External id": 263334, "cbid": 211, "correlation": 263334 } }, { "ph": "s", "id": 263334, "pid": 76337, "tid": -914061504, "ts": 1716454225607041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225607094, "dur": 0, "args": { "External id": 263344, "cbid": 317, "correlation": 263344 } }, { "ph": "f", "id": 263344, "pid": 76337, "tid": -914061504, "ts": 1716454225607094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225607095, "dur": 0, "args": { "External id": 263345, "cbid": 203, "correlation": 263345 } }, { "ph": "f", "id": 263345, "pid": 76337, "tid": -914061504, "ts": 1716454225607095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225607096, "dur": 0, "args": { "External id": 263346, "cbid": 205, "correlation": 263346 } }, { "ph": "f", "id": 263346, "pid": 76337, "tid": -914061504, "ts": 1716454225607096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663792, "dur": 5, "args": { "External id": 263350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263350, "pid": 5, "tid": 7, "ts": 1716454225663792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607110, "dur": 12, "args": { "External id": 263350, "cbid": 211, "correlation": 263350 } }, { "ph": "s", "id": 263350, "pid": 76337, "tid": -914061504, "ts": 1716454225607110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225663798, "dur": 165, "args": { "External id": 263352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263352, "pid": 5, "tid": 7, "ts": 1716454225663798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607125, "dur": 5, "args": { "External id": 263352, "cbid": 211, "correlation": 263352 } }, { "ph": "s", "id": 263352, "pid": 76337, "tid": -914061504, "ts": 1716454225607125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225663965, "dur": 1, "args": { "External id": 263354, "device": 5, "context": 1, "stream": 7, "correlation": 263354, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 263354, "pid": 5, "tid": 7, "ts": 1716454225663965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225607135, "dur": 6, "args": { "External id": 263354, "cbid": 51, "correlation": 263354 } }, { "ph": "s", "id": 263354, "pid": 76337, "tid": -914061504, "ts": 1716454225607135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225663969, "dur": 276, "args": { "External id": 263355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263355, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263355, "pid": 5, "tid": 7, "ts": 1716454225663969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607143, "dur": 6, "args": { "External id": 263355, "cbid": 211, "correlation": 263355 } }, { "ph": "s", "id": 263355, "pid": 76337, "tid": -914061504, "ts": 1716454225607143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664246, "dur": 6, "args": { "External id": 263357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263357, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263357, "pid": 5, "tid": 7, "ts": 1716454225664246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607154, "dur": 6, "args": { "External id": 263357, "cbid": 211, "correlation": 263357 } }, { "ph": "s", "id": 263357, "pid": 76337, "tid": -914061504, "ts": 1716454225607154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225664253, "dur": 6, "args": { "External id": 263363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263363, "pid": 5, "tid": 7, "ts": 1716454225664253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607183, "dur": 8, "args": { "External id": 263363, "cbid": 211, "correlation": 263363 } }, { "ph": "s", "id": 263363, "pid": 76337, "tid": -914061504, "ts": 1716454225607183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225664261, "dur": 3, "args": { "External id": 263371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263371, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 263371, "pid": 5, "tid": 7, "ts": 1716454225664261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607227, "dur": 9, "args": { "External id": 263371, "cbid": 211, "correlation": 263371 } }, { "ph": "s", "id": 263371, "pid": 76337, "tid": -914061504, "ts": 1716454225607227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225607290, "dur": 1, "args": { "External id": 263387, "cbid": 251, "correlation": 263387 } }, { "ph": "f", "id": 263387, "pid": 76337, "tid": -914061504, "ts": 1716454225607290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225607295, "dur": 0, "args": { "External id": 263389, "cbid": 251, "correlation": 263389 } }, { "ph": "f", "id": 263389, "pid": 76337, "tid": -914061504, "ts": 1716454225607295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225664266, "dur": 13, "args": { "External id": 263390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263390, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263390, "pid": 5, "tid": 7, "ts": 1716454225664266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607297, "dur": 13, "args": { "External id": 263390, "cbid": 211, "correlation": 263390 } }, { "ph": "s", "id": 263390, "pid": 76337, "tid": -914061504, "ts": 1716454225607297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225664280, "dur": 5, "args": { "External id": 263392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263392, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263392, "pid": 5, "tid": 7, "ts": 1716454225664280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607312, "dur": 5, "args": { "External id": 263392, "cbid": 211, "correlation": 263392 } }, { "ph": "s", "id": 263392, "pid": 76337, "tid": -914061504, "ts": 1716454225607312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225664287, "dur": 6, "args": { "External id": 263402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263402, "pid": 5, "tid": 7, "ts": 1716454225664287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607370, "dur": 12, "args": { "External id": 263402, "cbid": 211, "correlation": 263402 } }, { "ph": "s", "id": 263402, "pid": 76337, "tid": -914061504, "ts": 1716454225607370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225664294, "dur": 10, "args": { "External id": 263422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263422, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263422, "pid": 5, "tid": 7, "ts": 1716454225664294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607436, "dur": 10, "args": { "External id": 263422, "cbid": 211, "correlation": 263422 } }, { "ph": "s", "id": 263422, "pid": 76337, "tid": -914061504, "ts": 1716454225607436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225664305, "dur": 4, "args": { "External id": 263434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263434, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263434, "pid": 5, "tid": 7, "ts": 1716454225664305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607456, "dur": 7, "args": { "External id": 263434, "cbid": 211, "correlation": 263434 } }, { "ph": "s", "id": 263434, "pid": 76337, "tid": -914061504, "ts": 1716454225607456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225664310, "dur": 7, "args": { "External id": 263437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263437, "pid": 5, "tid": 7, "ts": 1716454225664310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607475, "dur": 6, "args": { "External id": 263437, "cbid": 211, "correlation": 263437 } }, { "ph": "s", "id": 263437, "pid": 76337, "tid": -914061504, "ts": 1716454225607475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225664319, "dur": 5, "args": { "External id": 263446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263446, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263446, "pid": 5, "tid": 7, "ts": 1716454225664319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607515, "dur": 9, "args": { "External id": 263446, "cbid": 211, "correlation": 263446 } }, { "ph": "s", "id": 263446, "pid": 76337, "tid": -914061504, "ts": 1716454225607515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225607577, "dur": 0, "args": { "External id": 263456, "cbid": 317, "correlation": 263456 } }, { "ph": "f", "id": 263456, "pid": 76337, "tid": -914061504, "ts": 1716454225607577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225607578, "dur": 0, "args": { "External id": 263457, "cbid": 203, "correlation": 263457 } }, { "ph": "f", "id": 263457, "pid": 76337, "tid": -914061504, "ts": 1716454225607578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225607579, "dur": 0, "args": { "External id": 263458, "cbid": 205, "correlation": 263458 } }, { "ph": "f", "id": 263458, "pid": 76337, "tid": -914061504, "ts": 1716454225607579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664325, "dur": 5, "args": { "External id": 263462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263462, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263462, "pid": 5, "tid": 7, "ts": 1716454225664325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607593, "dur": 12, "args": { "External id": 263462, "cbid": 211, "correlation": 263462 } }, { "ph": "s", "id": 263462, "pid": 76337, "tid": -914061504, "ts": 1716454225607593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664331, "dur": 165, "args": { "External id": 263464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263464, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263464, "pid": 5, "tid": 7, "ts": 1716454225664331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607607, "dur": 5, "args": { "External id": 263464, "cbid": 211, "correlation": 263464 } }, { "ph": "s", "id": 263464, "pid": 76337, "tid": -914061504, "ts": 1716454225607607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225664498, "dur": 1, "args": { "External id": 263466, "device": 5, "context": 1, "stream": 7, "correlation": 263466, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 263466, "pid": 5, "tid": 7, "ts": 1716454225664498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225607619, "dur": 6, "args": { "External id": 263466, "cbid": 51, "correlation": 263466 } }, { "ph": "s", "id": 263466, "pid": 76337, "tid": -914061504, "ts": 1716454225607619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225664502, "dur": 264, "args": { "External id": 263467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263467, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263467, "pid": 5, "tid": 7, "ts": 1716454225664502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607626, "dur": 6, "args": { "External id": 263467, "cbid": 211, "correlation": 263467 } }, { "ph": "s", "id": 263467, "pid": 76337, "tid": -914061504, "ts": 1716454225607626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664767, "dur": 6, "args": { "External id": 263469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263469, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263469, "pid": 5, "tid": 7, "ts": 1716454225664767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607636, "dur": 5, "args": { "External id": 263469, "cbid": 211, "correlation": 263469 } }, { "ph": "s", "id": 263469, "pid": 76337, "tid": -914061504, "ts": 1716454225607636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225664774, "dur": 6, "args": { "External id": 263475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263475, "pid": 5, "tid": 7, "ts": 1716454225664774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607665, "dur": 9, "args": { "External id": 263475, "cbid": 211, "correlation": 263475 } }, { "ph": "s", "id": 263475, "pid": 76337, "tid": -914061504, "ts": 1716454225607665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225664782, "dur": 5, "args": { "External id": 263483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263483, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263483, "pid": 5, "tid": 7, "ts": 1716454225664782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607697, "dur": 8, "args": { "External id": 263483, "cbid": 211, "correlation": 263483 } }, { "ph": "s", "id": 263483, "pid": 76337, "tid": -914061504, "ts": 1716454225607697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225664788, "dur": 5, "args": { "External id": 263491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263491, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263491, "pid": 5, "tid": 7, "ts": 1716454225664788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607727, "dur": 8, "args": { "External id": 263491, "cbid": 211, "correlation": 263491 } }, { "ph": "s", "id": 263491, "pid": 76337, "tid": -914061504, "ts": 1716454225607727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225664794, "dur": 10, "args": { "External id": 263511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263511, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263511, "pid": 5, "tid": 7, "ts": 1716454225664794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607800, "dur": 12, "args": { "External id": 263511, "cbid": 211, "correlation": 263511 } }, { "ph": "s", "id": 263511, "pid": 76337, "tid": -914061504, "ts": 1716454225607800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225664805, "dur": 4, "args": { "External id": 263523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263523, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263523, "pid": 5, "tid": 7, "ts": 1716454225664805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607823, "dur": 6, "args": { "External id": 263523, "cbid": 211, "correlation": 263523 } }, { "ph": "s", "id": 263523, "pid": 76337, "tid": -914061504, "ts": 1716454225607823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225664810, "dur": 7, "args": { "External id": 263526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263526, "pid": 5, "tid": 7, "ts": 1716454225664810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607840, "dur": 6, "args": { "External id": 263526, "cbid": 211, "correlation": 263526 } }, { "ph": "s", "id": 263526, "pid": 76337, "tid": -914061504, "ts": 1716454225607840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225664818, "dur": 5, "args": { "External id": 263535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263535, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263535, "pid": 5, "tid": 7, "ts": 1716454225664818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607878, "dur": 9, "args": { "External id": 263535, "cbid": 211, "correlation": 263535 } }, { "ph": "s", "id": 263535, "pid": 76337, "tid": -914061504, "ts": 1716454225607878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225607929, "dur": 0, "args": { "External id": 263545, "cbid": 317, "correlation": 263545 } }, { "ph": "f", "id": 263545, "pid": 76337, "tid": -914061504, "ts": 1716454225607929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225607930, "dur": 0, "args": { "External id": 263546, "cbid": 203, "correlation": 263546 } }, { "ph": "f", "id": 263546, "pid": 76337, "tid": -914061504, "ts": 1716454225607930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225607931, "dur": 0, "args": { "External id": 263547, "cbid": 205, "correlation": 263547 } }, { "ph": "f", "id": 263547, "pid": 76337, "tid": -914061504, "ts": 1716454225607931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664824, "dur": 5, "args": { "External id": 263551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263551, "pid": 5, "tid": 7, "ts": 1716454225664824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607944, "dur": 11, "args": { "External id": 263551, "cbid": 211, "correlation": 263551 } }, { "ph": "s", "id": 263551, "pid": 76337, "tid": -914061504, "ts": 1716454225607944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225664831, "dur": 165, "args": { "External id": 263553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263553, "pid": 5, "tid": 7, "ts": 1716454225664831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607958, "dur": 5, "args": { "External id": 263553, "cbid": 211, "correlation": 263553 } }, { "ph": "s", "id": 263553, "pid": 76337, "tid": -914061504, "ts": 1716454225607958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225664997, "dur": 1, "args": { "External id": 263555, "device": 5, "context": 1, "stream": 7, "correlation": 263555, "bytes": 240, "memory bandwidth (GB/s)": 0.14150943396226415 } }, { "ph": "f", "id": 263555, "pid": 5, "tid": 7, "ts": 1716454225664997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225607969, "dur": 15, "args": { "External id": 263555, "cbid": 51, "correlation": 263555 } }, { "ph": "s", "id": 263555, "pid": 76337, "tid": -914061504, "ts": 1716454225607969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225665001, "dur": 263, "args": { "External id": 263556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263556, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263556, "pid": 5, "tid": 7, "ts": 1716454225665001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607984, "dur": 6, "args": { "External id": 263556, "cbid": 211, "correlation": 263556 } }, { "ph": "s", "id": 263556, "pid": 76337, "tid": -914061504, "ts": 1716454225607984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665266, "dur": 6, "args": { "External id": 263558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263558, "pid": 5, "tid": 7, "ts": 1716454225665266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225607994, "dur": 5, "args": { "External id": 263558, "cbid": 211, "correlation": 263558 } }, { "ph": "s", "id": 263558, "pid": 76337, "tid": -914061504, "ts": 1716454225607994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225665273, "dur": 6, "args": { "External id": 263564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263564, "pid": 5, "tid": 7, "ts": 1716454225665273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608023, "dur": 9, "args": { "External id": 263564, "cbid": 211, "correlation": 263564 } }, { "ph": "s", "id": 263564, "pid": 76337, "tid": -914061504, "ts": 1716454225608023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225665281, "dur": 3, "args": { "External id": 263572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263572, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 263572, "pid": 5, "tid": 7, "ts": 1716454225665281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608067, "dur": 10, "args": { "External id": 263572, "cbid": 211, "correlation": 263572 } }, { "ph": "s", "id": 263572, "pid": 76337, "tid": -914061504, "ts": 1716454225608067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225608130, "dur": 1, "args": { "External id": 263588, "cbid": 251, "correlation": 263588 } }, { "ph": "f", "id": 263588, "pid": 76337, "tid": -914061504, "ts": 1716454225608130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225608135, "dur": 0, "args": { "External id": 263590, "cbid": 251, "correlation": 263590 } }, { "ph": "f", "id": 263590, "pid": 76337, "tid": -914061504, "ts": 1716454225608135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225665285, "dur": 11, "args": { "External id": 263591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263591, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263591, "pid": 5, "tid": 7, "ts": 1716454225665285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608137, "dur": 11, "args": { "External id": 263591, "cbid": 211, "correlation": 263591 } }, { "ph": "s", "id": 263591, "pid": 76337, "tid": -914061504, "ts": 1716454225608137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225665297, "dur": 4, "args": { "External id": 263593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263593, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263593, "pid": 5, "tid": 7, "ts": 1716454225665297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608149, "dur": 5, "args": { "External id": 263593, "cbid": 211, "correlation": 263593 } }, { "ph": "s", "id": 263593, "pid": 76337, "tid": -914061504, "ts": 1716454225608149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225665302, "dur": 6, "args": { "External id": 263603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263603, "pid": 5, "tid": 7, "ts": 1716454225665302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608205, "dur": 12, "args": { "External id": 263603, "cbid": 211, "correlation": 263603 } }, { "ph": "s", "id": 263603, "pid": 76337, "tid": -914061504, "ts": 1716454225608205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225665309, "dur": 10, "args": { "External id": 263623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263623, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263623, "pid": 5, "tid": 7, "ts": 1716454225665309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608269, "dur": 11, "args": { "External id": 263623, "cbid": 211, "correlation": 263623 } }, { "ph": "s", "id": 263623, "pid": 76337, "tid": -914061504, "ts": 1716454225608269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225665320, "dur": 4, "args": { "External id": 263635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263635, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263635, "pid": 5, "tid": 7, "ts": 1716454225665320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608291, "dur": 6, "args": { "External id": 263635, "cbid": 211, "correlation": 263635 } }, { "ph": "s", "id": 263635, "pid": 76337, "tid": -914061504, "ts": 1716454225608291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225665325, "dur": 7, "args": { "External id": 263638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263638, "pid": 5, "tid": 7, "ts": 1716454225665325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608309, "dur": 6, "args": { "External id": 263638, "cbid": 211, "correlation": 263638 } }, { "ph": "s", "id": 263638, "pid": 76337, "tid": -914061504, "ts": 1716454225608309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225665333, "dur": 5, "args": { "External id": 263647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263647, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263647, "pid": 5, "tid": 7, "ts": 1716454225665333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608349, "dur": 9, "args": { "External id": 263647, "cbid": 211, "correlation": 263647 } }, { "ph": "s", "id": 263647, "pid": 76337, "tid": -914061504, "ts": 1716454225608349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225608409, "dur": 0, "args": { "External id": 263657, "cbid": 317, "correlation": 263657 } }, { "ph": "f", "id": 263657, "pid": 76337, "tid": -914061504, "ts": 1716454225608409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225608410, "dur": 0, "args": { "External id": 263658, "cbid": 203, "correlation": 263658 } }, { "ph": "f", "id": 263658, "pid": 76337, "tid": -914061504, "ts": 1716454225608410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225608411, "dur": 0, "args": { "External id": 263659, "cbid": 205, "correlation": 263659 } }, { "ph": "f", "id": 263659, "pid": 76337, "tid": -914061504, "ts": 1716454225608411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665339, "dur": 5, "args": { "External id": 263663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263663, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263663, "pid": 5, "tid": 7, "ts": 1716454225665339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608425, "dur": 13, "args": { "External id": 263663, "cbid": 211, "correlation": 263663 } }, { "ph": "s", "id": 263663, "pid": 76337, "tid": -914061504, "ts": 1716454225608425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665346, "dur": 165, "args": { "External id": 263665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263665, "pid": 5, "tid": 7, "ts": 1716454225665346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608440, "dur": 5, "args": { "External id": 263665, "cbid": 211, "correlation": 263665 } }, { "ph": "s", "id": 263665, "pid": 76337, "tid": -914061504, "ts": 1716454225608440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225665513, "dur": 1, "args": { "External id": 263667, "device": 5, "context": 1, "stream": 7, "correlation": 263667, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 263667, "pid": 5, "tid": 7, "ts": 1716454225665513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225608450, "dur": 6, "args": { "External id": 263667, "cbid": 51, "correlation": 263667 } }, { "ph": "s", "id": 263667, "pid": 76337, "tid": -914061504, "ts": 1716454225608450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225665517, "dur": 264, "args": { "External id": 263668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263668, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263668, "pid": 5, "tid": 7, "ts": 1716454225665517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608458, "dur": 6, "args": { "External id": 263668, "cbid": 211, "correlation": 263668 } }, { "ph": "s", "id": 263668, "pid": 76337, "tid": -914061504, "ts": 1716454225608458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665782, "dur": 6, "args": { "External id": 263670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263670, "pid": 5, "tid": 7, "ts": 1716454225665782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608467, "dur": 5, "args": { "External id": 263670, "cbid": 211, "correlation": 263670 } }, { "ph": "s", "id": 263670, "pid": 76337, "tid": -914061504, "ts": 1716454225608467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225665790, "dur": 6, "args": { "External id": 263676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263676, "pid": 5, "tid": 7, "ts": 1716454225665790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608496, "dur": 9, "args": { "External id": 263676, "cbid": 211, "correlation": 263676 } }, { "ph": "s", "id": 263676, "pid": 76337, "tid": -914061504, "ts": 1716454225608496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225665797, "dur": 5, "args": { "External id": 263684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263684, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263684, "pid": 5, "tid": 7, "ts": 1716454225665797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608530, "dur": 9, "args": { "External id": 263684, "cbid": 211, "correlation": 263684 } }, { "ph": "s", "id": 263684, "pid": 76337, "tid": -914061504, "ts": 1716454225608530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225665804, "dur": 4, "args": { "External id": 263692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263692, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263692, "pid": 5, "tid": 7, "ts": 1716454225665804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608559, "dur": 8, "args": { "External id": 263692, "cbid": 211, "correlation": 263692 } }, { "ph": "s", "id": 263692, "pid": 76337, "tid": -914061504, "ts": 1716454225608559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225665809, "dur": 10, "args": { "External id": 263712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263712, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263712, "pid": 5, "tid": 7, "ts": 1716454225665809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608660, "dur": 12, "args": { "External id": 263712, "cbid": 211, "correlation": 263712 } }, { "ph": "s", "id": 263712, "pid": 76337, "tid": -914061504, "ts": 1716454225608660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225665821, "dur": 4, "args": { "External id": 263724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263724, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263724, "pid": 5, "tid": 7, "ts": 1716454225665821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608683, "dur": 7, "args": { "External id": 263724, "cbid": 211, "correlation": 263724 } }, { "ph": "s", "id": 263724, "pid": 76337, "tid": -914061504, "ts": 1716454225608683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225665826, "dur": 7, "args": { "External id": 263727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263727, "pid": 5, "tid": 7, "ts": 1716454225665826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608701, "dur": 6, "args": { "External id": 263727, "cbid": 211, "correlation": 263727 } }, { "ph": "s", "id": 263727, "pid": 76337, "tid": -914061504, "ts": 1716454225608701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225665834, "dur": 5, "args": { "External id": 263736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263736, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263736, "pid": 5, "tid": 7, "ts": 1716454225665834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608740, "dur": 9, "args": { "External id": 263736, "cbid": 211, "correlation": 263736 } }, { "ph": "s", "id": 263736, "pid": 76337, "tid": -914061504, "ts": 1716454225608740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225608793, "dur": 0, "args": { "External id": 263746, "cbid": 317, "correlation": 263746 } }, { "ph": "f", "id": 263746, "pid": 76337, "tid": -914061504, "ts": 1716454225608793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225608793, "dur": 0, "args": { "External id": 263747, "cbid": 203, "correlation": 263747 } }, { "ph": "f", "id": 263747, "pid": 76337, "tid": -914061504, "ts": 1716454225608793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225608794, "dur": 0, "args": { "External id": 263748, "cbid": 205, "correlation": 263748 } }, { "ph": "f", "id": 263748, "pid": 76337, "tid": -914061504, "ts": 1716454225608794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665840, "dur": 5, "args": { "External id": 263752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263752, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263752, "pid": 5, "tid": 7, "ts": 1716454225665840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608807, "dur": 11, "args": { "External id": 263752, "cbid": 211, "correlation": 263752 } }, { "ph": "s", "id": 263752, "pid": 76337, "tid": -914061504, "ts": 1716454225608807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225665846, "dur": 165, "args": { "External id": 263754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263754, "pid": 5, "tid": 7, "ts": 1716454225665846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608821, "dur": 5, "args": { "External id": 263754, "cbid": 211, "correlation": 263754 } }, { "ph": "s", "id": 263754, "pid": 76337, "tid": -914061504, "ts": 1716454225608821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225666014, "dur": 1, "args": { "External id": 263756, "device": 5, "context": 1, "stream": 7, "correlation": 263756, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 263756, "pid": 5, "tid": 7, "ts": 1716454225666014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225608831, "dur": 7, "args": { "External id": 263756, "cbid": 51, "correlation": 263756 } }, { "ph": "s", "id": 263756, "pid": 76337, "tid": -914061504, "ts": 1716454225608831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225666017, "dur": 263, "args": { "External id": 263757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263757, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263757, "pid": 5, "tid": 7, "ts": 1716454225666017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608839, "dur": 6, "args": { "External id": 263757, "cbid": 211, "correlation": 263757 } }, { "ph": "s", "id": 263757, "pid": 76337, "tid": -914061504, "ts": 1716454225608839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666282, "dur": 5, "args": { "External id": 263759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263759, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263759, "pid": 5, "tid": 7, "ts": 1716454225666282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608848, "dur": 5, "args": { "External id": 263759, "cbid": 211, "correlation": 263759 } }, { "ph": "s", "id": 263759, "pid": 76337, "tid": -914061504, "ts": 1716454225608848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666289, "dur": 6, "args": { "External id": 263765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263765, "pid": 5, "tid": 7, "ts": 1716454225666289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608876, "dur": 9, "args": { "External id": 263765, "cbid": 211, "correlation": 263765 } }, { "ph": "s", "id": 263765, "pid": 76337, "tid": -914061504, "ts": 1716454225608876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225666297, "dur": 3, "args": { "External id": 263773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263773, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 263773, "pid": 5, "tid": 7, "ts": 1716454225666297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608919, "dur": 9, "args": { "External id": 263773, "cbid": 211, "correlation": 263773 } }, { "ph": "s", "id": 263773, "pid": 76337, "tid": -914061504, "ts": 1716454225608919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225608988, "dur": 1, "args": { "External id": 263789, "cbid": 251, "correlation": 263789 } }, { "ph": "f", "id": 263789, "pid": 76337, "tid": -914061504, "ts": 1716454225608988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225608994, "dur": 0, "args": { "External id": 263791, "cbid": 251, "correlation": 263791 } }, { "ph": "f", "id": 263791, "pid": 76337, "tid": -914061504, "ts": 1716454225608994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225666301, "dur": 10, "args": { "External id": 263792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263792, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263792, "pid": 5, "tid": 7, "ts": 1716454225666301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225608996, "dur": 12, "args": { "External id": 263792, "cbid": 211, "correlation": 263792 } }, { "ph": "s", "id": 263792, "pid": 76337, "tid": -914061504, "ts": 1716454225608996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225666313, "dur": 4, "args": { "External id": 263794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263794, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263794, "pid": 5, "tid": 7, "ts": 1716454225666313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609010, "dur": 6, "args": { "External id": 263794, "cbid": 211, "correlation": 263794 } }, { "ph": "s", "id": 263794, "pid": 76337, "tid": -914061504, "ts": 1716454225609010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666318, "dur": 6, "args": { "External id": 263804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263804, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263804, "pid": 5, "tid": 7, "ts": 1716454225666318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609067, "dur": 12, "args": { "External id": 263804, "cbid": 211, "correlation": 263804 } }, { "ph": "s", "id": 263804, "pid": 76337, "tid": -914061504, "ts": 1716454225609067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225666325, "dur": 10, "args": { "External id": 263824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263824, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263824, "pid": 5, "tid": 7, "ts": 1716454225666325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609134, "dur": 11, "args": { "External id": 263824, "cbid": 211, "correlation": 263824 } }, { "ph": "s", "id": 263824, "pid": 76337, "tid": -914061504, "ts": 1716454225609134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225666336, "dur": 4, "args": { "External id": 263836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263836, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263836, "pid": 5, "tid": 7, "ts": 1716454225666336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609155, "dur": 6, "args": { "External id": 263836, "cbid": 211, "correlation": 263836 } }, { "ph": "s", "id": 263836, "pid": 76337, "tid": -914061504, "ts": 1716454225609155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666341, "dur": 7, "args": { "External id": 263839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263839, "pid": 5, "tid": 7, "ts": 1716454225666341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609174, "dur": 6, "args": { "External id": 263839, "cbid": 211, "correlation": 263839 } }, { "ph": "s", "id": 263839, "pid": 76337, "tid": -914061504, "ts": 1716454225609174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225666349, "dur": 5, "args": { "External id": 263848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263848, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263848, "pid": 5, "tid": 7, "ts": 1716454225666349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609213, "dur": 9, "args": { "External id": 263848, "cbid": 211, "correlation": 263848 } }, { "ph": "s", "id": 263848, "pid": 76337, "tid": -914061504, "ts": 1716454225609213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225609275, "dur": 0, "args": { "External id": 263858, "cbid": 317, "correlation": 263858 } }, { "ph": "f", "id": 263858, "pid": 76337, "tid": -914061504, "ts": 1716454225609275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225609276, "dur": 0, "args": { "External id": 263859, "cbid": 203, "correlation": 263859 } }, { "ph": "f", "id": 263859, "pid": 76337, "tid": -914061504, "ts": 1716454225609276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225609277, "dur": 0, "args": { "External id": 263860, "cbid": 205, "correlation": 263860 } }, { "ph": "f", "id": 263860, "pid": 76337, "tid": -914061504, "ts": 1716454225609277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666355, "dur": 5, "args": { "External id": 263864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263864, "pid": 5, "tid": 7, "ts": 1716454225666355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609291, "dur": 13, "args": { "External id": 263864, "cbid": 211, "correlation": 263864 } }, { "ph": "s", "id": 263864, "pid": 76337, "tid": -914061504, "ts": 1716454225609291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666362, "dur": 165, "args": { "External id": 263866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263866, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263866, "pid": 5, "tid": 7, "ts": 1716454225666362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609306, "dur": 5, "args": { "External id": 263866, "cbid": 211, "correlation": 263866 } }, { "ph": "s", "id": 263866, "pid": 76337, "tid": -914061504, "ts": 1716454225609306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225666529, "dur": 1, "args": { "External id": 263868, "device": 5, "context": 1, "stream": 7, "correlation": 263868, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 263868, "pid": 5, "tid": 7, "ts": 1716454225666529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225609316, "dur": 7, "args": { "External id": 263868, "cbid": 51, "correlation": 263868 } }, { "ph": "s", "id": 263868, "pid": 76337, "tid": -914061504, "ts": 1716454225609316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225666533, "dur": 264, "args": { "External id": 263869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263869, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 263869, "pid": 5, "tid": 7, "ts": 1716454225666533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609324, "dur": 6, "args": { "External id": 263869, "cbid": 211, "correlation": 263869 } }, { "ph": "s", "id": 263869, "pid": 76337, "tid": -914061504, "ts": 1716454225609324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666798, "dur": 6, "args": { "External id": 263871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263871, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263871, "pid": 5, "tid": 7, "ts": 1716454225666798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609334, "dur": 5, "args": { "External id": 263871, "cbid": 211, "correlation": 263871 } }, { "ph": "s", "id": 263871, "pid": 76337, "tid": -914061504, "ts": 1716454225609334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666806, "dur": 6, "args": { "External id": 263877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263877, "pid": 5, "tid": 7, "ts": 1716454225666806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609363, "dur": 9, "args": { "External id": 263877, "cbid": 211, "correlation": 263877 } }, { "ph": "s", "id": 263877, "pid": 76337, "tid": -914061504, "ts": 1716454225609363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225666813, "dur": 5, "args": { "External id": 263885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263885, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263885, "pid": 5, "tid": 7, "ts": 1716454225666813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609396, "dur": 9, "args": { "External id": 263885, "cbid": 211, "correlation": 263885 } }, { "ph": "s", "id": 263885, "pid": 76337, "tid": -914061504, "ts": 1716454225609396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225666820, "dur": 5, "args": { "External id": 263893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263893, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263893, "pid": 5, "tid": 7, "ts": 1716454225666820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609425, "dur": 8, "args": { "External id": 263893, "cbid": 211, "correlation": 263893 } }, { "ph": "s", "id": 263893, "pid": 76337, "tid": -914061504, "ts": 1716454225609425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225666826, "dur": 10, "args": { "External id": 263913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263913, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 263913, "pid": 5, "tid": 7, "ts": 1716454225666826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609565, "dur": 13, "args": { "External id": 263913, "cbid": 211, "correlation": 263913 } }, { "ph": "s", "id": 263913, "pid": 76337, "tid": -914061504, "ts": 1716454225609565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225666837, "dur": 4, "args": { "External id": 263925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263925, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 263925, "pid": 5, "tid": 7, "ts": 1716454225666837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609588, "dur": 7, "args": { "External id": 263925, "cbid": 211, "correlation": 263925 } }, { "ph": "s", "id": 263925, "pid": 76337, "tid": -914061504, "ts": 1716454225609588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666842, "dur": 7, "args": { "External id": 263928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263928, "pid": 5, "tid": 7, "ts": 1716454225666842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609607, "dur": 8, "args": { "External id": 263928, "cbid": 211, "correlation": 263928 } }, { "ph": "s", "id": 263928, "pid": 76337, "tid": -914061504, "ts": 1716454225609607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225609667, "dur": 0, "args": { "External id": 263939, "cbid": 317, "correlation": 263939 } }, { "ph": "f", "id": 263939, "pid": 76337, "tid": -914061504, "ts": 1716454225609667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225609668, "dur": 0, "args": { "External id": 263940, "cbid": 203, "correlation": 263940 } }, { "ph": "f", "id": 263940, "pid": 76337, "tid": -914061504, "ts": 1716454225609668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225609668, "dur": 0, "args": { "External id": 263941, "cbid": 205, "correlation": 263941 } }, { "ph": "f", "id": 263941, "pid": 76337, "tid": -914061504, "ts": 1716454225609668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666850, "dur": 5, "args": { "External id": 263945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263945, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263945, "pid": 5, "tid": 7, "ts": 1716454225666850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609685, "dur": 13, "args": { "External id": 263945, "cbid": 211, "correlation": 263945 } }, { "ph": "s", "id": 263945, "pid": 76337, "tid": -914061504, "ts": 1716454225609685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225666856, "dur": 38, "args": { "External id": 263947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263947, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 263947, "pid": 5, "tid": 7, "ts": 1716454225666856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609705, "dur": 9, "args": { "External id": 263947, "cbid": 211, "correlation": 263947 } }, { "ph": "s", "id": 263947, "pid": 76337, "tid": -914061504, "ts": 1716454225609705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225666895, "dur": 5, "args": { "External id": 263949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263949, "pid": 5, "tid": 7, "ts": 1716454225666895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609717, "dur": 5, "args": { "External id": 263949, "cbid": 211, "correlation": 263949 } }, { "ph": "s", "id": 263949, "pid": 76337, "tid": -914061504, "ts": 1716454225609717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225666902, "dur": 6, "args": { "External id": 263955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 263955, "pid": 5, "tid": 7, "ts": 1716454225666902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609745, "dur": 9, "args": { "External id": 263955, "cbid": 211, "correlation": 263955 } }, { "ph": "s", "id": 263955, "pid": 76337, "tid": -914061504, "ts": 1716454225609745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225666909, "dur": 21, "args": { "External id": 263964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263964, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 263964, "pid": 5, "tid": 7, "ts": 1716454225666909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609828, "dur": 14, "args": { "External id": 263964, "cbid": 211, "correlation": 263964 } }, { "ph": "s", "id": 263964, "pid": 76337, "tid": -914061504, "ts": 1716454225609828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225666931, "dur": 11, "args": { "External id": 263986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263986, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 263986, "pid": 5, "tid": 7, "ts": 1716454225666931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609885, "dur": 10, "args": { "External id": 263986, "cbid": 211, "correlation": 263986 } }, { "ph": "s", "id": 263986, "pid": 76337, "tid": -914061504, "ts": 1716454225609885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225609984, "dur": 2, "args": { "External id": 263997, "cbid": 251, "correlation": 263997 } }, { "ph": "f", "id": 263997, "pid": 76337, "tid": -914061504, "ts": 1716454225609984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225609990, "dur": 0, "args": { "External id": 263998, "cbid": 251, "correlation": 263998 } }, { "ph": "f", "id": 263998, "pid": 76337, "tid": -914061504, "ts": 1716454225609990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225666943, "dur": 55, "args": { "External id": 263999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 263999, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 263999, "pid": 5, "tid": 7, "ts": 1716454225666943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225609993, "dur": 15, "args": { "External id": 263999, "cbid": 211, "correlation": 263999 } }, { "ph": "s", "id": 263999, "pid": 76337, "tid": -914061504, "ts": 1716454225609993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610067, "dur": 1, "args": { "External id": 264010, "cbid": 251, "correlation": 264010 } }, { "ph": "f", "id": 264010, "pid": 76337, "tid": -914061504, "ts": 1716454225610067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610070, "dur": 0, "args": { "External id": 264011, "cbid": 251, "correlation": 264011 } }, { "ph": "f", "id": 264011, "pid": 76337, "tid": -914061504, "ts": 1716454225610070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225666999, "dur": 54, "args": { "External id": 264012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264012, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 264012, "pid": 5, "tid": 7, "ts": 1716454225666999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610072, "dur": 11, "args": { "External id": 264012, "cbid": 211, "correlation": 264012 } }, { "ph": "s", "id": 264012, "pid": 76337, "tid": -914061504, "ts": 1716454225610072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610136, "dur": 1, "args": { "External id": 264023, "cbid": 251, "correlation": 264023 } }, { "ph": "f", "id": 264023, "pid": 76337, "tid": -914061504, "ts": 1716454225610136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610140, "dur": 0, "args": { "External id": 264024, "cbid": 251, "correlation": 264024 } }, { "ph": "f", "id": 264024, "pid": 76337, "tid": -914061504, "ts": 1716454225610140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225667055, "dur": 54, "args": { "External id": 264025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264025, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 264025, "pid": 5, "tid": 7, "ts": 1716454225667055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610142, "dur": 12, "args": { "External id": 264025, "cbid": 211, "correlation": 264025 } }, { "ph": "s", "id": 264025, "pid": 76337, "tid": -914061504, "ts": 1716454225610142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225667110, "dur": 58, "args": { "External id": 264050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264050, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264050, "pid": 5, "tid": 7, "ts": 1716454225667110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610227, "dur": 13, "args": { "External id": 264050, "cbid": 211, "correlation": 264050 } }, { "ph": "s", "id": 264050, "pid": 76337, "tid": -914061504, "ts": 1716454225610227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610326, "dur": 1, "args": { "External id": 264068, "cbid": 251, "correlation": 264068 } }, { "ph": "f", "id": 264068, "pid": 76337, "tid": -914061504, "ts": 1716454225610326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225667169, "dur": 64, "args": { "External id": 264070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264070, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 264070, "pid": 5, "tid": 7, "ts": 1716454225667169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610332, "dur": 13, "args": { "External id": 264070, "cbid": 211, "correlation": 264070 } }, { "ph": "s", "id": 264070, "pid": 76337, "tid": -914061504, "ts": 1716454225610332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667234, "dur": 6, "args": { "External id": 264078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264078, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264078, "pid": 5, "tid": 7, "ts": 1716454225667234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610402, "dur": 12, "args": { "External id": 264078, "cbid": 211, "correlation": 264078 } }, { "ph": "s", "id": 264078, "pid": 76337, "tid": -914061504, "ts": 1716454225610402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667242, "dur": 7, "args": { "External id": 264086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264086, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264086, "pid": 5, "tid": 7, "ts": 1716454225667242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610440, "dur": 9, "args": { "External id": 264086, "cbid": 211, "correlation": 264086 } }, { "ph": "s", "id": 264086, "pid": 76337, "tid": -914061504, "ts": 1716454225610440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667250, "dur": 8, "args": { "External id": 264097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264097, "pid": 5, "tid": 7, "ts": 1716454225667250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610513, "dur": 13, "args": { "External id": 264097, "cbid": 211, "correlation": 264097 } }, { "ph": "s", "id": 264097, "pid": 76337, "tid": -914061504, "ts": 1716454225610513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225667259, "dur": 9, "args": { "External id": 264119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264119, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 264119, "pid": 5, "tid": 7, "ts": 1716454225667259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610545, "dur": 8, "args": { "External id": 264119, "cbid": 211, "correlation": 264119 } }, { "ph": "s", "id": 264119, "pid": 76337, "tid": -914061504, "ts": 1716454225610545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610630, "dur": 2, "args": { "External id": 264130, "cbid": 251, "correlation": 264130 } }, { "ph": "f", "id": 264130, "pid": 76337, "tid": -914061504, "ts": 1716454225610630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225667270, "dur": 1, "args": { "External id": 264131, "device": 5, "context": 1, "stream": 7, "correlation": 264131, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 264131, "pid": 5, "tid": 7, "ts": 1716454225667270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225610636, "dur": 11, "args": { "External id": 264131, "cbid": 51, "correlation": 264131 } }, { "ph": "s", "id": 264131, "pid": 76337, "tid": -914061504, "ts": 1716454225610636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225667274, "dur": 37, "args": { "External id": 264132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264132, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 264132, "pid": 5, "tid": 7, "ts": 1716454225667274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610648, "dur": 12, "args": { "External id": 264132, "cbid": 211, "correlation": 264132 } }, { "ph": "s", "id": 264132, "pid": 76337, "tid": -914061504, "ts": 1716454225610648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610719, "dur": 1, "args": { "External id": 264143, "cbid": 251, "correlation": 264143 } }, { "ph": "f", "id": 264143, "pid": 76337, "tid": -914061504, "ts": 1716454225610719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610723, "dur": 0, "args": { "External id": 264144, "cbid": 251, "correlation": 264144 } }, { "ph": "f", "id": 264144, "pid": 76337, "tid": -914061504, "ts": 1716454225610723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225667312, "dur": 12, "args": { "External id": 264145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264145, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264145, "pid": 5, "tid": 7, "ts": 1716454225667312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610725, "dur": 12, "args": { "External id": 264145, "cbid": 211, "correlation": 264145 } }, { "ph": "s", "id": 264145, "pid": 76337, "tid": -914061504, "ts": 1716454225610725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225667325, "dur": 5, "args": { "External id": 264147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264147, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264147, "pid": 5, "tid": 7, "ts": 1716454225667325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610738, "dur": 5, "args": { "External id": 264147, "cbid": 211, "correlation": 264147 } }, { "ph": "s", "id": 264147, "pid": 76337, "tid": -914061504, "ts": 1716454225610738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610795, "dur": 1, "args": { "External id": 264158, "cbid": 251, "correlation": 264158 } }, { "ph": "f", "id": 264158, "pid": 76337, "tid": -914061504, "ts": 1716454225610795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610799, "dur": 0, "args": { "External id": 264159, "cbid": 251, "correlation": 264159 } }, { "ph": "f", "id": 264159, "pid": 76337, "tid": -914061504, "ts": 1716454225610799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225667332, "dur": 9, "args": { "External id": 264160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264160, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264160, "pid": 5, "tid": 7, "ts": 1716454225667332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610800, "dur": 12, "args": { "External id": 264160, "cbid": 211, "correlation": 264160 } }, { "ph": "s", "id": 264160, "pid": 76337, "tid": -914061504, "ts": 1716454225610800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225667342, "dur": 3, "args": { "External id": 264162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264162, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264162, "pid": 5, "tid": 7, "ts": 1716454225667342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610814, "dur": 5, "args": { "External id": 264162, "cbid": 211, "correlation": 264162 } }, { "ph": "s", "id": 264162, "pid": 76337, "tid": -914061504, "ts": 1716454225610814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225667346, "dur": 20, "args": { "External id": 264187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264187, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 264187, "pid": 5, "tid": 7, "ts": 1716454225667346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225610891, "dur": 13, "args": { "External id": 264187, "cbid": 211, "correlation": 264187 } }, { "ph": "s", "id": 264187, "pid": 76337, "tid": -914061504, "ts": 1716454225610891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225610999, "dur": 2, "args": { "External id": 264205, "cbid": 251, "correlation": 264205 } }, { "ph": "f", "id": 264205, "pid": 76337, "tid": -914061504, "ts": 1716454225610999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225667369, "dur": 1, "args": { "External id": 264207, "device": 5, "context": 1, "stream": 7, "correlation": 264207, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 264207, "pid": 5, "tid": 7, "ts": 1716454225667369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225611006, "dur": 11, "args": { "External id": 264207, "cbid": 51, "correlation": 264207 } }, { "ph": "s", "id": 264207, "pid": 76337, "tid": -914061504, "ts": 1716454225611006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225667372, "dur": 37, "args": { "External id": 264208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264208, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 264208, "pid": 5, "tid": 7, "ts": 1716454225667372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611018, "dur": 12, "args": { "External id": 264208, "cbid": 211, "correlation": 264208 } }, { "ph": "s", "id": 264208, "pid": 76337, "tid": -914061504, "ts": 1716454225611018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667411, "dur": 4, "args": { "External id": 264216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264216, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264216, "pid": 5, "tid": 7, "ts": 1716454225667411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611090, "dur": 12, "args": { "External id": 264216, "cbid": 211, "correlation": 264216 } }, { "ph": "s", "id": 264216, "pid": 76337, "tid": -914061504, "ts": 1716454225611090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667416, "dur": 8, "args": { "External id": 264224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264224, "pid": 5, "tid": 7, "ts": 1716454225667416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611131, "dur": 9, "args": { "External id": 264224, "cbid": 211, "correlation": 264224 } }, { "ph": "s", "id": 264224, "pid": 76337, "tid": -914061504, "ts": 1716454225611131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225667426, "dur": 8, "args": { "External id": 264246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264246, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 264246, "pid": 5, "tid": 7, "ts": 1716454225667426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611184, "dur": 10, "args": { "External id": 264246, "cbid": 211, "correlation": 264246 } }, { "ph": "s", "id": 264246, "pid": 76337, "tid": -914061504, "ts": 1716454225611184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225611275, "dur": 1, "args": { "External id": 264262, "cbid": 251, "correlation": 264262 } }, { "ph": "f", "id": 264262, "pid": 76337, "tid": -914061504, "ts": 1716454225611275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225611280, "dur": 0, "args": { "External id": 264264, "cbid": 251, "correlation": 264264 } }, { "ph": "f", "id": 264264, "pid": 76337, "tid": -914061504, "ts": 1716454225611280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225667436, "dur": 189, "args": { "External id": 264265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264265, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264265, "pid": 5, "tid": 7, "ts": 1716454225667436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611282, "dur": 13, "args": { "External id": 264265, "cbid": 211, "correlation": 264265 } }, { "ph": "s", "id": 264265, "pid": 76337, "tid": -914061504, "ts": 1716454225611282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667626, "dur": 21, "args": { "External id": 264273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264273, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264273, "pid": 5, "tid": 7, "ts": 1716454225667626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611347, "dur": 13, "args": { "External id": 264273, "cbid": 211, "correlation": 264273 } }, { "ph": "s", "id": 264273, "pid": 76337, "tid": -914061504, "ts": 1716454225611347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667649, "dur": 21, "args": { "External id": 264281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264281, "pid": 5, "tid": 7, "ts": 1716454225667649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611379, "dur": 8, "args": { "External id": 264281, "cbid": 211, "correlation": 264281 } }, { "ph": "s", "id": 264281, "pid": 76337, "tid": -914061504, "ts": 1716454225611379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225611459, "dur": 1, "args": { "External id": 264297, "cbid": 251, "correlation": 264297 } }, { "ph": "f", "id": 264297, "pid": 76337, "tid": -914061504, "ts": 1716454225611459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225667672, "dur": 1, "args": { "External id": 264299, "device": 5, "context": 1, "stream": 7, "correlation": 264299, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 264299, "pid": 5, "tid": 7, "ts": 1716454225667672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225611464, "dur": 9, "args": { "External id": 264299, "cbid": 51, "correlation": 264299 } }, { "ph": "s", "id": 264299, "pid": 76337, "tid": -914061504, "ts": 1716454225611464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225667676, "dur": 111, "args": { "External id": 264300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264300, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 264300, "pid": 5, "tid": 7, "ts": 1716454225667676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611475, "dur": 11, "args": { "External id": 264300, "cbid": 211, "correlation": 264300 } }, { "ph": "s", "id": 264300, "pid": 76337, "tid": -914061504, "ts": 1716454225611475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667789, "dur": 6, "args": { "External id": 264308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264308, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264308, "pid": 5, "tid": 7, "ts": 1716454225667789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611517, "dur": 10, "args": { "External id": 264308, "cbid": 211, "correlation": 264308 } }, { "ph": "s", "id": 264308, "pid": 76337, "tid": -914061504, "ts": 1716454225611517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667796, "dur": 10, "args": { "External id": 264319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264319, "pid": 5, "tid": 7, "ts": 1716454225667796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611585, "dur": 12, "args": { "External id": 264319, "cbid": 211, "correlation": 264319 } }, { "ph": "s", "id": 264319, "pid": 76337, "tid": -914061504, "ts": 1716454225611585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225611650, "dur": 0, "args": { "External id": 264331, "cbid": 317, "correlation": 264331 } }, { "ph": "f", "id": 264331, "pid": 76337, "tid": -914061504, "ts": 1716454225611650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225611651, "dur": 0, "args": { "External id": 264332, "cbid": 203, "correlation": 264332 } }, { "ph": "f", "id": 264332, "pid": 76337, "tid": -914061504, "ts": 1716454225611651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225611651, "dur": 0, "args": { "External id": 264333, "cbid": 205, "correlation": 264333 } }, { "ph": "f", "id": 264333, "pid": 76337, "tid": -914061504, "ts": 1716454225611651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225667807, "dur": 5, "args": { "External id": 264337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264337, "pid": 5, "tid": 7, "ts": 1716454225667807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611667, "dur": 12, "args": { "External id": 264337, "cbid": 211, "correlation": 264337 } }, { "ph": "s", "id": 264337, "pid": 76337, "tid": -914061504, "ts": 1716454225611667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225667814, "dur": 38, "args": { "External id": 264339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264339, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 264339, "pid": 5, "tid": 7, "ts": 1716454225667814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611686, "dur": 7, "args": { "External id": 264339, "cbid": 211, "correlation": 264339 } }, { "ph": "s", "id": 264339, "pid": 76337, "tid": -914061504, "ts": 1716454225611686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225667853, "dur": 6, "args": { "External id": 264341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264341, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264341, "pid": 5, "tid": 7, "ts": 1716454225667853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611697, "dur": 5, "args": { "External id": 264341, "cbid": 211, "correlation": 264341 } }, { "ph": "s", "id": 264341, "pid": 76337, "tid": -914061504, "ts": 1716454225611697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667860, "dur": 7, "args": { "External id": 264347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264347, "pid": 5, "tid": 7, "ts": 1716454225667860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611724, "dur": 8, "args": { "External id": 264347, "cbid": 211, "correlation": 264347 } }, { "ph": "s", "id": 264347, "pid": 76337, "tid": -914061504, "ts": 1716454225611724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667868, "dur": 5, "args": { "External id": 264355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264355, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264355, "pid": 5, "tid": 7, "ts": 1716454225667868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611755, "dur": 9, "args": { "External id": 264355, "cbid": 211, "correlation": 264355 } }, { "ph": "s", "id": 264355, "pid": 76337, "tid": -914061504, "ts": 1716454225611755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225667875, "dur": 11, "args": { "External id": 264375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264375, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264375, "pid": 5, "tid": 7, "ts": 1716454225667875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611827, "dur": 11, "args": { "External id": 264375, "cbid": 211, "correlation": 264375 } }, { "ph": "s", "id": 264375, "pid": 76337, "tid": -914061504, "ts": 1716454225611827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225667888, "dur": 5, "args": { "External id": 264387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264387, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 264387, "pid": 5, "tid": 7, "ts": 1716454225667888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611848, "dur": 6, "args": { "External id": 264387, "cbid": 211, "correlation": 264387 } }, { "ph": "s", "id": 264387, "pid": 76337, "tid": -914061504, "ts": 1716454225611848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225667894, "dur": 8, "args": { "External id": 264390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264390, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264390, "pid": 5, "tid": 7, "ts": 1716454225667894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611866, "dur": 7, "args": { "External id": 264390, "cbid": 211, "correlation": 264390 } }, { "ph": "s", "id": 264390, "pid": 76337, "tid": -914061504, "ts": 1716454225611866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225667903, "dur": 5, "args": { "External id": 264399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264399, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264399, "pid": 5, "tid": 7, "ts": 1716454225667903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611903, "dur": 10, "args": { "External id": 264399, "cbid": 211, "correlation": 264399 } }, { "ph": "s", "id": 264399, "pid": 76337, "tid": -914061504, "ts": 1716454225611903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225611954, "dur": 0, "args": { "External id": 264409, "cbid": 317, "correlation": 264409 } }, { "ph": "f", "id": 264409, "pid": 76337, "tid": -914061504, "ts": 1716454225611954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225611955, "dur": 0, "args": { "External id": 264410, "cbid": 203, "correlation": 264410 } }, { "ph": "f", "id": 264410, "pid": 76337, "tid": -914061504, "ts": 1716454225611955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225611956, "dur": 0, "args": { "External id": 264411, "cbid": 205, "correlation": 264411 } }, { "ph": "f", "id": 264411, "pid": 76337, "tid": -914061504, "ts": 1716454225611956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225667910, "dur": 5, "args": { "External id": 264415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264415, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264415, "pid": 5, "tid": 7, "ts": 1716454225667910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611969, "dur": 20, "args": { "External id": 264415, "cbid": 211, "correlation": 264415 } }, { "ph": "s", "id": 264415, "pid": 76337, "tid": -914061504, "ts": 1716454225611969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225667916, "dur": 165, "args": { "External id": 264417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264417, "pid": 5, "tid": 7, "ts": 1716454225667916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225611992, "dur": 5, "args": { "External id": 264417, "cbid": 211, "correlation": 264417 } }, { "ph": "s", "id": 264417, "pid": 76337, "tid": -914061504, "ts": 1716454225611992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225668083, "dur": 1, "args": { "External id": 264419, "device": 5, "context": 1, "stream": 7, "correlation": 264419, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 264419, "pid": 5, "tid": 7, "ts": 1716454225668083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225612003, "dur": 7, "args": { "External id": 264419, "cbid": 51, "correlation": 264419 } }, { "ph": "s", "id": 264419, "pid": 76337, "tid": -914061504, "ts": 1716454225612003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225668087, "dur": 275, "args": { "External id": 264420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264420, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264420, "pid": 5, "tid": 7, "ts": 1716454225668087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612012, "dur": 7, "args": { "External id": 264420, "cbid": 211, "correlation": 264420 } }, { "ph": "s", "id": 264420, "pid": 76337, "tid": -914061504, "ts": 1716454225612012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668363, "dur": 6, "args": { "External id": 264422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264422, "pid": 5, "tid": 7, "ts": 1716454225668363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612023, "dur": 5, "args": { "External id": 264422, "cbid": 211, "correlation": 264422 } }, { "ph": "s", "id": 264422, "pid": 76337, "tid": -914061504, "ts": 1716454225612023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225668370, "dur": 6, "args": { "External id": 264428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264428, "pid": 5, "tid": 7, "ts": 1716454225668370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612050, "dur": 8, "args": { "External id": 264428, "cbid": 211, "correlation": 264428 } }, { "ph": "s", "id": 264428, "pid": 76337, "tid": -914061504, "ts": 1716454225612050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225668378, "dur": 3, "args": { "External id": 264436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264436, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 264436, "pid": 5, "tid": 7, "ts": 1716454225668378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612095, "dur": 10, "args": { "External id": 264436, "cbid": 211, "correlation": 264436 } }, { "ph": "s", "id": 264436, "pid": 76337, "tid": -914061504, "ts": 1716454225612095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225612163, "dur": 1, "args": { "External id": 264452, "cbid": 251, "correlation": 264452 } }, { "ph": "f", "id": 264452, "pid": 76337, "tid": -914061504, "ts": 1716454225612163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225612169, "dur": 0, "args": { "External id": 264454, "cbid": 251, "correlation": 264454 } }, { "ph": "f", "id": 264454, "pid": 76337, "tid": -914061504, "ts": 1716454225612169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225668382, "dur": 13, "args": { "External id": 264455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264455, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264455, "pid": 5, "tid": 7, "ts": 1716454225668382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612171, "dur": 11, "args": { "External id": 264455, "cbid": 211, "correlation": 264455 } }, { "ph": "s", "id": 264455, "pid": 76337, "tid": -914061504, "ts": 1716454225612171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225668396, "dur": 5, "args": { "External id": 264457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264457, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264457, "pid": 5, "tid": 7, "ts": 1716454225668396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612183, "dur": 5, "args": { "External id": 264457, "cbid": 211, "correlation": 264457 } }, { "ph": "s", "id": 264457, "pid": 76337, "tid": -914061504, "ts": 1716454225612183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225668403, "dur": 6, "args": { "External id": 264467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264467, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264467, "pid": 5, "tid": 7, "ts": 1716454225668403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612241, "dur": 12, "args": { "External id": 264467, "cbid": 211, "correlation": 264467 } }, { "ph": "s", "id": 264467, "pid": 76337, "tid": -914061504, "ts": 1716454225612241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225668410, "dur": 10, "args": { "External id": 264487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264487, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264487, "pid": 5, "tid": 7, "ts": 1716454225668410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612306, "dur": 11, "args": { "External id": 264487, "cbid": 211, "correlation": 264487 } }, { "ph": "s", "id": 264487, "pid": 76337, "tid": -914061504, "ts": 1716454225612306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225668421, "dur": 4, "args": { "External id": 264499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264499, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 264499, "pid": 5, "tid": 7, "ts": 1716454225668421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612327, "dur": 6, "args": { "External id": 264499, "cbid": 211, "correlation": 264499 } }, { "ph": "s", "id": 264499, "pid": 76337, "tid": -914061504, "ts": 1716454225612327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225668427, "dur": 7, "args": { "External id": 264502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264502, "pid": 5, "tid": 7, "ts": 1716454225668427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612345, "dur": 6, "args": { "External id": 264502, "cbid": 211, "correlation": 264502 } }, { "ph": "s", "id": 264502, "pid": 76337, "tid": -914061504, "ts": 1716454225612345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225668435, "dur": 5, "args": { "External id": 264511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264511, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264511, "pid": 5, "tid": 7, "ts": 1716454225668435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612385, "dur": 10, "args": { "External id": 264511, "cbid": 211, "correlation": 264511 } }, { "ph": "s", "id": 264511, "pid": 76337, "tid": -914061504, "ts": 1716454225612385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225612448, "dur": 0, "args": { "External id": 264521, "cbid": 317, "correlation": 264521 } }, { "ph": "f", "id": 264521, "pid": 76337, "tid": -914061504, "ts": 1716454225612448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225612448, "dur": 0, "args": { "External id": 264522, "cbid": 203, "correlation": 264522 } }, { "ph": "f", "id": 264522, "pid": 76337, "tid": -914061504, "ts": 1716454225612448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225612449, "dur": 0, "args": { "External id": 264523, "cbid": 205, "correlation": 264523 } }, { "ph": "f", "id": 264523, "pid": 76337, "tid": -914061504, "ts": 1716454225612449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668441, "dur": 5, "args": { "External id": 264527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264527, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264527, "pid": 5, "tid": 7, "ts": 1716454225668441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612463, "dur": 12, "args": { "External id": 264527, "cbid": 211, "correlation": 264527 } }, { "ph": "s", "id": 264527, "pid": 76337, "tid": -914061504, "ts": 1716454225612463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668447, "dur": 165, "args": { "External id": 264529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264529, "pid": 5, "tid": 7, "ts": 1716454225668447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612478, "dur": 5, "args": { "External id": 264529, "cbid": 211, "correlation": 264529 } }, { "ph": "s", "id": 264529, "pid": 76337, "tid": -914061504, "ts": 1716454225612478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225668615, "dur": 1, "args": { "External id": 264531, "device": 5, "context": 1, "stream": 7, "correlation": 264531, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 264531, "pid": 5, "tid": 7, "ts": 1716454225668615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225612488, "dur": 6, "args": { "External id": 264531, "cbid": 51, "correlation": 264531 } }, { "ph": "s", "id": 264531, "pid": 76337, "tid": -914061504, "ts": 1716454225612488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225668619, "dur": 264, "args": { "External id": 264532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264532, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264532, "pid": 5, "tid": 7, "ts": 1716454225668619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612495, "dur": 6, "args": { "External id": 264532, "cbid": 211, "correlation": 264532 } }, { "ph": "s", "id": 264532, "pid": 76337, "tid": -914061504, "ts": 1716454225612495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668884, "dur": 6, "args": { "External id": 264534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264534, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264534, "pid": 5, "tid": 7, "ts": 1716454225668884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612505, "dur": 5, "args": { "External id": 264534, "cbid": 211, "correlation": 264534 } }, { "ph": "s", "id": 264534, "pid": 76337, "tid": -914061504, "ts": 1716454225612505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225668891, "dur": 6, "args": { "External id": 264540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264540, "pid": 5, "tid": 7, "ts": 1716454225668891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612534, "dur": 8, "args": { "External id": 264540, "cbid": 211, "correlation": 264540 } }, { "ph": "s", "id": 264540, "pid": 76337, "tid": -914061504, "ts": 1716454225612534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225668899, "dur": 5, "args": { "External id": 264548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264548, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264548, "pid": 5, "tid": 7, "ts": 1716454225668899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612565, "dur": 9, "args": { "External id": 264548, "cbid": 211, "correlation": 264548 } }, { "ph": "s", "id": 264548, "pid": 76337, "tid": -914061504, "ts": 1716454225612565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225668905, "dur": 5, "args": { "External id": 264556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264556, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264556, "pid": 5, "tid": 7, "ts": 1716454225668905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612595, "dur": 8, "args": { "External id": 264556, "cbid": 211, "correlation": 264556 } }, { "ph": "s", "id": 264556, "pid": 76337, "tid": -914061504, "ts": 1716454225612595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225668910, "dur": 12, "args": { "External id": 264565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264565, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264565, "pid": 5, "tid": 7, "ts": 1716454225668910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612683, "dur": 13, "args": { "External id": 264565, "cbid": 211, "correlation": 264565 } }, { "ph": "s", "id": 264565, "pid": 76337, "tid": -914061504, "ts": 1716454225612683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225668923, "dur": 12, "args": { "External id": 264585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264585, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264585, "pid": 5, "tid": 7, "ts": 1716454225668923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612750, "dur": 11, "args": { "External id": 264585, "cbid": 211, "correlation": 264585 } }, { "ph": "s", "id": 264585, "pid": 76337, "tid": -914061504, "ts": 1716454225612750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225668937, "dur": 4, "args": { "External id": 264597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264597, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264597, "pid": 5, "tid": 7, "ts": 1716454225668937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612771, "dur": 7, "args": { "External id": 264597, "cbid": 211, "correlation": 264597 } }, { "ph": "s", "id": 264597, "pid": 76337, "tid": -914061504, "ts": 1716454225612771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225668942, "dur": 10, "args": { "External id": 264600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264600, "pid": 5, "tid": 7, "ts": 1716454225668942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612790, "dur": 6, "args": { "External id": 264600, "cbid": 211, "correlation": 264600 } }, { "ph": "s", "id": 264600, "pid": 76337, "tid": -914061504, "ts": 1716454225612790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225668954, "dur": 6, "args": { "External id": 264609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264609, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264609, "pid": 5, "tid": 7, "ts": 1716454225668954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612829, "dur": 9, "args": { "External id": 264609, "cbid": 211, "correlation": 264609 } }, { "ph": "s", "id": 264609, "pid": 76337, "tid": -914061504, "ts": 1716454225612829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225612881, "dur": 0, "args": { "External id": 264619, "cbid": 317, "correlation": 264619 } }, { "ph": "f", "id": 264619, "pid": 76337, "tid": -914061504, "ts": 1716454225612881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225612882, "dur": 0, "args": { "External id": 264620, "cbid": 203, "correlation": 264620 } }, { "ph": "f", "id": 264620, "pid": 76337, "tid": -914061504, "ts": 1716454225612882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225612883, "dur": 0, "args": { "External id": 264621, "cbid": 205, "correlation": 264621 } }, { "ph": "f", "id": 264621, "pid": 76337, "tid": -914061504, "ts": 1716454225612883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668961, "dur": 7, "args": { "External id": 264625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264625, "pid": 5, "tid": 7, "ts": 1716454225668961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612899, "dur": 11, "args": { "External id": 264625, "cbid": 211, "correlation": 264625 } }, { "ph": "s", "id": 264625, "pid": 76337, "tid": -914061504, "ts": 1716454225612899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225668969, "dur": 325, "args": { "External id": 264627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264627, "pid": 5, "tid": 7, "ts": 1716454225668969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612912, "dur": 5, "args": { "External id": 264627, "cbid": 211, "correlation": 264627 } }, { "ph": "s", "id": 264627, "pid": 76337, "tid": -914061504, "ts": 1716454225612912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225669297, "dur": 1, "args": { "External id": 264629, "device": 5, "context": 1, "stream": 7, "correlation": 264629, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 264629, "pid": 5, "tid": 7, "ts": 1716454225669297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225612923, "dur": 8, "args": { "External id": 264629, "cbid": 51, "correlation": 264629 } }, { "ph": "s", "id": 264629, "pid": 76337, "tid": -914061504, "ts": 1716454225612923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225669300, "dur": 504, "args": { "External id": 264630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264630, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264630, "pid": 5, "tid": 7, "ts": 1716454225669300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612932, "dur": 6, "args": { "External id": 264630, "cbid": 211, "correlation": 264630 } }, { "ph": "s", "id": 264630, "pid": 76337, "tid": -914061504, "ts": 1716454225612932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225669806, "dur": 6, "args": { "External id": 264632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264632, "pid": 5, "tid": 7, "ts": 1716454225669806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612942, "dur": 5, "args": { "External id": 264632, "cbid": 211, "correlation": 264632 } }, { "ph": "s", "id": 264632, "pid": 76337, "tid": -914061504, "ts": 1716454225612942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225669813, "dur": 6, "args": { "External id": 264638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264638, "pid": 5, "tid": 7, "ts": 1716454225669813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225612970, "dur": 18, "args": { "External id": 264638, "cbid": 211, "correlation": 264638 } }, { "ph": "s", "id": 264638, "pid": 76337, "tid": -914061504, "ts": 1716454225612970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225669821, "dur": 3, "args": { "External id": 264646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264646, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 264646, "pid": 5, "tid": 7, "ts": 1716454225669821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613023, "dur": 10, "args": { "External id": 264646, "cbid": 211, "correlation": 264646 } }, { "ph": "s", "id": 264646, "pid": 76337, "tid": -914061504, "ts": 1716454225613023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225613086, "dur": 1, "args": { "External id": 264662, "cbid": 251, "correlation": 264662 } }, { "ph": "f", "id": 264662, "pid": 76337, "tid": -914061504, "ts": 1716454225613086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225613091, "dur": 0, "args": { "External id": 264664, "cbid": 251, "correlation": 264664 } }, { "ph": "f", "id": 264664, "pid": 76337, "tid": -914061504, "ts": 1716454225613091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225669825, "dur": 11, "args": { "External id": 264665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264665, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264665, "pid": 5, "tid": 7, "ts": 1716454225669825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613093, "dur": 11, "args": { "External id": 264665, "cbid": 211, "correlation": 264665 } }, { "ph": "s", "id": 264665, "pid": 76337, "tid": -914061504, "ts": 1716454225613093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225669838, "dur": 5, "args": { "External id": 264667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264667, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264667, "pid": 5, "tid": 7, "ts": 1716454225669838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613106, "dur": 5, "args": { "External id": 264667, "cbid": 211, "correlation": 264667 } }, { "ph": "s", "id": 264667, "pid": 76337, "tid": -914061504, "ts": 1716454225613106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225669844, "dur": 6, "args": { "External id": 264677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264677, "pid": 5, "tid": 7, "ts": 1716454225669844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613163, "dur": 13, "args": { "External id": 264677, "cbid": 211, "correlation": 264677 } }, { "ph": "s", "id": 264677, "pid": 76337, "tid": -914061504, "ts": 1716454225613163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225669851, "dur": 10, "args": { "External id": 264697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264697, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264697, "pid": 5, "tid": 7, "ts": 1716454225669851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613229, "dur": 11, "args": { "External id": 264697, "cbid": 211, "correlation": 264697 } }, { "ph": "s", "id": 264697, "pid": 76337, "tid": -914061504, "ts": 1716454225613229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225669862, "dur": 4, "args": { "External id": 264709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264709, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 264709, "pid": 5, "tid": 7, "ts": 1716454225669862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613250, "dur": 6, "args": { "External id": 264709, "cbid": 211, "correlation": 264709 } }, { "ph": "s", "id": 264709, "pid": 76337, "tid": -914061504, "ts": 1716454225613250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225669867, "dur": 7, "args": { "External id": 264712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264712, "pid": 5, "tid": 7, "ts": 1716454225669867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613268, "dur": 7, "args": { "External id": 264712, "cbid": 211, "correlation": 264712 } }, { "ph": "s", "id": 264712, "pid": 76337, "tid": -914061504, "ts": 1716454225613268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225669875, "dur": 5, "args": { "External id": 264721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264721, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264721, "pid": 5, "tid": 7, "ts": 1716454225669875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613309, "dur": 10, "args": { "External id": 264721, "cbid": 211, "correlation": 264721 } }, { "ph": "s", "id": 264721, "pid": 76337, "tid": -914061504, "ts": 1716454225613309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225613371, "dur": 0, "args": { "External id": 264731, "cbid": 317, "correlation": 264731 } }, { "ph": "f", "id": 264731, "pid": 76337, "tid": -914061504, "ts": 1716454225613371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225613372, "dur": 0, "args": { "External id": 264732, "cbid": 203, "correlation": 264732 } }, { "ph": "f", "id": 264732, "pid": 76337, "tid": -914061504, "ts": 1716454225613372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225613372, "dur": 0, "args": { "External id": 264733, "cbid": 205, "correlation": 264733 } }, { "ph": "f", "id": 264733, "pid": 76337, "tid": -914061504, "ts": 1716454225613372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225669881, "dur": 5, "args": { "External id": 264737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264737, "pid": 5, "tid": 7, "ts": 1716454225669881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613387, "dur": 12, "args": { "External id": 264737, "cbid": 211, "correlation": 264737 } }, { "ph": "s", "id": 264737, "pid": 76337, "tid": -914061504, "ts": 1716454225613387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225669887, "dur": 165, "args": { "External id": 264739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264739, "pid": 5, "tid": 7, "ts": 1716454225669887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613401, "dur": 5, "args": { "External id": 264739, "cbid": 211, "correlation": 264739 } }, { "ph": "s", "id": 264739, "pid": 76337, "tid": -914061504, "ts": 1716454225613401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225670054, "dur": 1, "args": { "External id": 264741, "device": 5, "context": 1, "stream": 7, "correlation": 264741, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 264741, "pid": 5, "tid": 7, "ts": 1716454225670054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225613411, "dur": 6, "args": { "External id": 264741, "cbid": 51, "correlation": 264741 } }, { "ph": "s", "id": 264741, "pid": 76337, "tid": -914061504, "ts": 1716454225613411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225670058, "dur": 263, "args": { "External id": 264742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264742, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264742, "pid": 5, "tid": 7, "ts": 1716454225670058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613419, "dur": 6, "args": { "External id": 264742, "cbid": 211, "correlation": 264742 } }, { "ph": "s", "id": 264742, "pid": 76337, "tid": -914061504, "ts": 1716454225613419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225670322, "dur": 6, "args": { "External id": 264744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264744, "pid": 5, "tid": 7, "ts": 1716454225670322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613429, "dur": 6, "args": { "External id": 264744, "cbid": 211, "correlation": 264744 } }, { "ph": "s", "id": 264744, "pid": 76337, "tid": -914061504, "ts": 1716454225613429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225670330, "dur": 6, "args": { "External id": 264750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264750, "pid": 5, "tid": 7, "ts": 1716454225670330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613457, "dur": 9, "args": { "External id": 264750, "cbid": 211, "correlation": 264750 } }, { "ph": "s", "id": 264750, "pid": 76337, "tid": -914061504, "ts": 1716454225613457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225613517, "dur": 0, "args": { "External id": 264760, "cbid": 317, "correlation": 264760 } }, { "ph": "f", "id": 264760, "pid": 76337, "tid": -914061504, "ts": 1716454225613517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225613517, "dur": 0, "args": { "External id": 264761, "cbid": 203, "correlation": 264761 } }, { "ph": "f", "id": 264761, "pid": 76337, "tid": -914061504, "ts": 1716454225613517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225613518, "dur": 0, "args": { "External id": 264762, "cbid": 205, "correlation": 264762 } }, { "ph": "f", "id": 264762, "pid": 76337, "tid": -914061504, "ts": 1716454225613518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225670337, "dur": 8, "args": { "External id": 264766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264766, "pid": 5, "tid": 7, "ts": 1716454225670337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613533, "dur": 12, "args": { "External id": 264766, "cbid": 211, "correlation": 264766 } }, { "ph": "s", "id": 264766, "pid": 76337, "tid": -914061504, "ts": 1716454225613533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225670347, "dur": 3, "args": { "External id": 264768, "device": 5, "context": 1, "stream": 7, "correlation": 264768, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 264768, "pid": 5, "tid": 7, "ts": 1716454225670347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225613551, "dur": 15, "args": { "External id": 264768, "cbid": 51, "correlation": 264768 } }, { "ph": "s", "id": 264768, "pid": 76337, "tid": -914061504, "ts": 1716454225613551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225670351, "dur": 97, "args": { "External id": 264769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264769, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 264769, "pid": 5, "tid": 7, "ts": 1716454225670351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613567, "dur": 7, "args": { "External id": 264769, "cbid": 211, "correlation": 264769 } }, { "ph": "s", "id": 264769, "pid": 76337, "tid": -914061504, "ts": 1716454225613567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225670449, "dur": 6, "args": { "External id": 264771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264771, "pid": 5, "tid": 7, "ts": 1716454225670449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613578, "dur": 6, "args": { "External id": 264771, "cbid": 211, "correlation": 264771 } }, { "ph": "s", "id": 264771, "pid": 76337, "tid": -914061504, "ts": 1716454225613578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225670456, "dur": 6, "args": { "External id": 264777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264777, "pid": 5, "tid": 7, "ts": 1716454225670456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613605, "dur": 8, "args": { "External id": 264777, "cbid": 211, "correlation": 264777 } }, { "ph": "s", "id": 264777, "pid": 76337, "tid": -914061504, "ts": 1716454225613605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225670464, "dur": 5, "args": { "External id": 264785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264785, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264785, "pid": 5, "tid": 7, "ts": 1716454225670464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613635, "dur": 8, "args": { "External id": 264785, "cbid": 211, "correlation": 264785 } }, { "ph": "s", "id": 264785, "pid": 76337, "tid": -914061504, "ts": 1716454225613635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225670470, "dur": 5, "args": { "External id": 264793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264793, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264793, "pid": 5, "tid": 7, "ts": 1716454225670470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613663, "dur": 8, "args": { "External id": 264793, "cbid": 211, "correlation": 264793 } }, { "ph": "s", "id": 264793, "pid": 76337, "tid": -914061504, "ts": 1716454225613663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225670476, "dur": 11, "args": { "External id": 264802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264802, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264802, "pid": 5, "tid": 7, "ts": 1716454225670476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613707, "dur": 10, "args": { "External id": 264802, "cbid": 211, "correlation": 264802 } }, { "ph": "s", "id": 264802, "pid": 76337, "tid": -914061504, "ts": 1716454225613707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225670489, "dur": 12, "args": { "External id": 264822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264822, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264822, "pid": 5, "tid": 7, "ts": 1716454225670489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613776, "dur": 12, "args": { "External id": 264822, "cbid": 211, "correlation": 264822 } }, { "ph": "s", "id": 264822, "pid": 76337, "tid": -914061504, "ts": 1716454225613776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225670503, "dur": 4, "args": { "External id": 264834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264834, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264834, "pid": 5, "tid": 7, "ts": 1716454225670503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613797, "dur": 6, "args": { "External id": 264834, "cbid": 211, "correlation": 264834 } }, { "ph": "s", "id": 264834, "pid": 76337, "tid": -914061504, "ts": 1716454225613797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225670508, "dur": 11, "args": { "External id": 264837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264837, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264837, "pid": 5, "tid": 7, "ts": 1716454225670508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613815, "dur": 6, "args": { "External id": 264837, "cbid": 211, "correlation": 264837 } }, { "ph": "s", "id": 264837, "pid": 76337, "tid": -914061504, "ts": 1716454225613815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225670521, "dur": 6, "args": { "External id": 264846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264846, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264846, "pid": 5, "tid": 7, "ts": 1716454225670521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613853, "dur": 9, "args": { "External id": 264846, "cbid": 211, "correlation": 264846 } }, { "ph": "s", "id": 264846, "pid": 76337, "tid": -914061504, "ts": 1716454225613853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225613905, "dur": 0, "args": { "External id": 264856, "cbid": 317, "correlation": 264856 } }, { "ph": "f", "id": 264856, "pid": 76337, "tid": -914061504, "ts": 1716454225613905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225613906, "dur": 0, "args": { "External id": 264857, "cbid": 203, "correlation": 264857 } }, { "ph": "f", "id": 264857, "pid": 76337, "tid": -914061504, "ts": 1716454225613906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225613906, "dur": 0, "args": { "External id": 264858, "cbid": 205, "correlation": 264858 } }, { "ph": "f", "id": 264858, "pid": 76337, "tid": -914061504, "ts": 1716454225613906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225670528, "dur": 7, "args": { "External id": 264862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264862, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264862, "pid": 5, "tid": 7, "ts": 1716454225670528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613919, "dur": 11, "args": { "External id": 264862, "cbid": 211, "correlation": 264862 } }, { "ph": "s", "id": 264862, "pid": 76337, "tid": -914061504, "ts": 1716454225613919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225670536, "dur": 325, "args": { "External id": 264864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264864, "pid": 5, "tid": 7, "ts": 1716454225670536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613933, "dur": 5, "args": { "External id": 264864, "cbid": 211, "correlation": 264864 } }, { "ph": "s", "id": 264864, "pid": 76337, "tid": -914061504, "ts": 1716454225613933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225670863, "dur": 1, "args": { "External id": 264866, "device": 5, "context": 1, "stream": 7, "correlation": 264866, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 264866, "pid": 5, "tid": 7, "ts": 1716454225670863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225613944, "dur": 6, "args": { "External id": 264866, "cbid": 51, "correlation": 264866 } }, { "ph": "s", "id": 264866, "pid": 76337, "tid": -914061504, "ts": 1716454225613944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225670867, "dur": 504, "args": { "External id": 264867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264867, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264867, "pid": 5, "tid": 7, "ts": 1716454225670867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613951, "dur": 6, "args": { "External id": 264867, "cbid": 211, "correlation": 264867 } }, { "ph": "s", "id": 264867, "pid": 76337, "tid": -914061504, "ts": 1716454225613951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225671372, "dur": 5, "args": { "External id": 264869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264869, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264869, "pid": 5, "tid": 7, "ts": 1716454225671372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613961, "dur": 5, "args": { "External id": 264869, "cbid": 211, "correlation": 264869 } }, { "ph": "s", "id": 264869, "pid": 76337, "tid": -914061504, "ts": 1716454225613961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225671379, "dur": 6, "args": { "External id": 264875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264875, "pid": 5, "tid": 7, "ts": 1716454225671379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225613997, "dur": 9, "args": { "External id": 264875, "cbid": 211, "correlation": 264875 } }, { "ph": "s", "id": 264875, "pid": 76337, "tid": -914061504, "ts": 1716454225613997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225671387, "dur": 3, "args": { "External id": 264883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264883, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 264883, "pid": 5, "tid": 7, "ts": 1716454225671387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614043, "dur": 9, "args": { "External id": 264883, "cbid": 211, "correlation": 264883 } }, { "ph": "s", "id": 264883, "pid": 76337, "tid": -914061504, "ts": 1716454225614043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225614104, "dur": 1, "args": { "External id": 264899, "cbid": 251, "correlation": 264899 } }, { "ph": "f", "id": 264899, "pid": 76337, "tid": -914061504, "ts": 1716454225614104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225614109, "dur": 0, "args": { "External id": 264901, "cbid": 251, "correlation": 264901 } }, { "ph": "f", "id": 264901, "pid": 76337, "tid": -914061504, "ts": 1716454225614109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225671391, "dur": 13, "args": { "External id": 264902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264902, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264902, "pid": 5, "tid": 7, "ts": 1716454225671391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614111, "dur": 11, "args": { "External id": 264902, "cbid": 211, "correlation": 264902 } }, { "ph": "s", "id": 264902, "pid": 76337, "tid": -914061504, "ts": 1716454225614111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225671405, "dur": 5, "args": { "External id": 264904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264904, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264904, "pid": 5, "tid": 7, "ts": 1716454225671405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614123, "dur": 5, "args": { "External id": 264904, "cbid": 211, "correlation": 264904 } }, { "ph": "s", "id": 264904, "pid": 76337, "tid": -914061504, "ts": 1716454225614123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225671412, "dur": 6, "args": { "External id": 264914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264914, "pid": 5, "tid": 7, "ts": 1716454225671412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614180, "dur": 13, "args": { "External id": 264914, "cbid": 211, "correlation": 264914 } }, { "ph": "s", "id": 264914, "pid": 76337, "tid": -914061504, "ts": 1716454225614180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225671419, "dur": 10, "args": { "External id": 264934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264934, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 264934, "pid": 5, "tid": 7, "ts": 1716454225671419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614246, "dur": 11, "args": { "External id": 264934, "cbid": 211, "correlation": 264934 } }, { "ph": "s", "id": 264934, "pid": 76337, "tid": -914061504, "ts": 1716454225614246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225671430, "dur": 4, "args": { "External id": 264946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264946, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 264946, "pid": 5, "tid": 7, "ts": 1716454225671430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614266, "dur": 6, "args": { "External id": 264946, "cbid": 211, "correlation": 264946 } }, { "ph": "s", "id": 264946, "pid": 76337, "tid": -914061504, "ts": 1716454225614266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225671435, "dur": 7, "args": { "External id": 264949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264949, "pid": 5, "tid": 7, "ts": 1716454225671435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614285, "dur": 6, "args": { "External id": 264949, "cbid": 211, "correlation": 264949 } }, { "ph": "s", "id": 264949, "pid": 76337, "tid": -914061504, "ts": 1716454225614285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225671443, "dur": 5, "args": { "External id": 264958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264958, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264958, "pid": 5, "tid": 7, "ts": 1716454225671443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614325, "dur": 10, "args": { "External id": 264958, "cbid": 211, "correlation": 264958 } }, { "ph": "s", "id": 264958, "pid": 76337, "tid": -914061504, "ts": 1716454225614325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225614388, "dur": 0, "args": { "External id": 264968, "cbid": 317, "correlation": 264968 } }, { "ph": "f", "id": 264968, "pid": 76337, "tid": -914061504, "ts": 1716454225614388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225614389, "dur": 0, "args": { "External id": 264969, "cbid": 203, "correlation": 264969 } }, { "ph": "f", "id": 264969, "pid": 76337, "tid": -914061504, "ts": 1716454225614389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225614389, "dur": 0, "args": { "External id": 264970, "cbid": 205, "correlation": 264970 } }, { "ph": "f", "id": 264970, "pid": 76337, "tid": -914061504, "ts": 1716454225614389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225671449, "dur": 5, "args": { "External id": 264974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264974, "pid": 5, "tid": 7, "ts": 1716454225671449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614404, "dur": 12, "args": { "External id": 264974, "cbid": 211, "correlation": 264974 } }, { "ph": "s", "id": 264974, "pid": 76337, "tid": -914061504, "ts": 1716454225614404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225671455, "dur": 165, "args": { "External id": 264976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264976, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264976, "pid": 5, "tid": 7, "ts": 1716454225671455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614418, "dur": 5, "args": { "External id": 264976, "cbid": 211, "correlation": 264976 } }, { "ph": "s", "id": 264976, "pid": 76337, "tid": -914061504, "ts": 1716454225614418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225671622, "dur": 1, "args": { "External id": 264978, "device": 5, "context": 1, "stream": 7, "correlation": 264978, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 264978, "pid": 5, "tid": 7, "ts": 1716454225671622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225614429, "dur": 6, "args": { "External id": 264978, "cbid": 51, "correlation": 264978 } }, { "ph": "s", "id": 264978, "pid": 76337, "tid": -914061504, "ts": 1716454225614429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225671626, "dur": 264, "args": { "External id": 264979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264979, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 264979, "pid": 5, "tid": 7, "ts": 1716454225671626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614436, "dur": 6, "args": { "External id": 264979, "cbid": 211, "correlation": 264979 } }, { "ph": "s", "id": 264979, "pid": 76337, "tid": -914061504, "ts": 1716454225614436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225671891, "dur": 6, "args": { "External id": 264981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 264981, "pid": 5, "tid": 7, "ts": 1716454225671891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614445, "dur": 6, "args": { "External id": 264981, "cbid": 211, "correlation": 264981 } }, { "ph": "s", "id": 264981, "pid": 76337, "tid": -914061504, "ts": 1716454225614445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225671898, "dur": 6, "args": { "External id": 264987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 264987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 264987, "pid": 5, "tid": 7, "ts": 1716454225671898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614474, "dur": 9, "args": { "External id": 264987, "cbid": 211, "correlation": 264987 } }, { "ph": "s", "id": 264987, "pid": 76337, "tid": -914061504, "ts": 1716454225614474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225614533, "dur": 0, "args": { "External id": 264997, "cbid": 317, "correlation": 264997 } }, { "ph": "f", "id": 264997, "pid": 76337, "tid": -914061504, "ts": 1716454225614533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225614534, "dur": 0, "args": { "External id": 264998, "cbid": 203, "correlation": 264998 } }, { "ph": "f", "id": 264998, "pid": 76337, "tid": -914061504, "ts": 1716454225614534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225614535, "dur": 0, "args": { "External id": 264999, "cbid": 205, "correlation": 264999 } }, { "ph": "f", "id": 264999, "pid": 76337, "tid": -914061504, "ts": 1716454225614535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225671906, "dur": 8, "args": { "External id": 265003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265003, "pid": 5, "tid": 7, "ts": 1716454225671906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614547, "dur": 12, "args": { "External id": 265003, "cbid": 211, "correlation": 265003 } }, { "ph": "s", "id": 265003, "pid": 76337, "tid": -914061504, "ts": 1716454225614547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225671915, "dur": 3, "args": { "External id": 265005, "device": 5, "context": 1, "stream": 7, "correlation": 265005, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 265005, "pid": 5, "tid": 7, "ts": 1716454225671915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225614564, "dur": 10, "args": { "External id": 265005, "cbid": 51, "correlation": 265005 } }, { "ph": "s", "id": 265005, "pid": 76337, "tid": -914061504, "ts": 1716454225614564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225671919, "dur": 96, "args": { "External id": 265006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265006, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 265006, "pid": 5, "tid": 7, "ts": 1716454225671919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614575, "dur": 6, "args": { "External id": 265006, "cbid": 211, "correlation": 265006 } }, { "ph": "s", "id": 265006, "pid": 76337, "tid": -914061504, "ts": 1716454225614575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225672017, "dur": 6, "args": { "External id": 265008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265008, "pid": 5, "tid": 7, "ts": 1716454225672017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614584, "dur": 5, "args": { "External id": 265008, "cbid": 211, "correlation": 265008 } }, { "ph": "s", "id": 265008, "pid": 76337, "tid": -914061504, "ts": 1716454225614584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225672024, "dur": 6, "args": { "External id": 265014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265014, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265014, "pid": 5, "tid": 7, "ts": 1716454225672024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614612, "dur": 8, "args": { "External id": 265014, "cbid": 211, "correlation": 265014 } }, { "ph": "s", "id": 265014, "pid": 76337, "tid": -914061504, "ts": 1716454225614612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225672031, "dur": 5, "args": { "External id": 265022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265022, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265022, "pid": 5, "tid": 7, "ts": 1716454225672031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614641, "dur": 9, "args": { "External id": 265022, "cbid": 211, "correlation": 265022 } }, { "ph": "s", "id": 265022, "pid": 76337, "tid": -914061504, "ts": 1716454225614641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225672038, "dur": 5, "args": { "External id": 265030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265030, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265030, "pid": 5, "tid": 7, "ts": 1716454225672038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614670, "dur": 8, "args": { "External id": 265030, "cbid": 211, "correlation": 265030 } }, { "ph": "s", "id": 265030, "pid": 76337, "tid": -914061504, "ts": 1716454225614670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225672044, "dur": 11, "args": { "External id": 265039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265039, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265039, "pid": 5, "tid": 7, "ts": 1716454225672044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614713, "dur": 10, "args": { "External id": 265039, "cbid": 211, "correlation": 265039 } }, { "ph": "s", "id": 265039, "pid": 76337, "tid": -914061504, "ts": 1716454225614713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225672056, "dur": 13, "args": { "External id": 265059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265059, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 265059, "pid": 5, "tid": 7, "ts": 1716454225672056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614783, "dur": 11, "args": { "External id": 265059, "cbid": 211, "correlation": 265059 } }, { "ph": "s", "id": 265059, "pid": 76337, "tid": -914061504, "ts": 1716454225614783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225672070, "dur": 4, "args": { "External id": 265071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265071, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265071, "pid": 5, "tid": 7, "ts": 1716454225672070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614805, "dur": 7, "args": { "External id": 265071, "cbid": 211, "correlation": 265071 } }, { "ph": "s", "id": 265071, "pid": 76337, "tid": -914061504, "ts": 1716454225614805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225672075, "dur": 11, "args": { "External id": 265074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265074, "pid": 5, "tid": 7, "ts": 1716454225672075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614823, "dur": 7, "args": { "External id": 265074, "cbid": 211, "correlation": 265074 } }, { "ph": "s", "id": 265074, "pid": 76337, "tid": -914061504, "ts": 1716454225614823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225672088, "dur": 6, "args": { "External id": 265083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265083, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265083, "pid": 5, "tid": 7, "ts": 1716454225672088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614863, "dur": 9, "args": { "External id": 265083, "cbid": 211, "correlation": 265083 } }, { "ph": "s", "id": 265083, "pid": 76337, "tid": -914061504, "ts": 1716454225614863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225614915, "dur": 0, "args": { "External id": 265093, "cbid": 317, "correlation": 265093 } }, { "ph": "f", "id": 265093, "pid": 76337, "tid": -914061504, "ts": 1716454225614915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225614916, "dur": 0, "args": { "External id": 265094, "cbid": 203, "correlation": 265094 } }, { "ph": "f", "id": 265094, "pid": 76337, "tid": -914061504, "ts": 1716454225614916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225614917, "dur": 0, "args": { "External id": 265095, "cbid": 205, "correlation": 265095 } }, { "ph": "f", "id": 265095, "pid": 76337, "tid": -914061504, "ts": 1716454225614917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225672095, "dur": 7, "args": { "External id": 265099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265099, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265099, "pid": 5, "tid": 7, "ts": 1716454225672095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614930, "dur": 11, "args": { "External id": 265099, "cbid": 211, "correlation": 265099 } }, { "ph": "s", "id": 265099, "pid": 76337, "tid": -914061504, "ts": 1716454225614930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225672103, "dur": 326, "args": { "External id": 265101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265101, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265101, "pid": 5, "tid": 7, "ts": 1716454225672103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614943, "dur": 5, "args": { "External id": 265101, "cbid": 211, "correlation": 265101 } }, { "ph": "s", "id": 265101, "pid": 76337, "tid": -914061504, "ts": 1716454225614943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225672431, "dur": 1, "args": { "External id": 265103, "device": 5, "context": 1, "stream": 7, "correlation": 265103, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 265103, "pid": 5, "tid": 7, "ts": 1716454225672431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225614954, "dur": 7, "args": { "External id": 265103, "cbid": 51, "correlation": 265103 } }, { "ph": "s", "id": 265103, "pid": 76337, "tid": -914061504, "ts": 1716454225614954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225672435, "dur": 504, "args": { "External id": 265104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265104, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265104, "pid": 5, "tid": 7, "ts": 1716454225672435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614962, "dur": 6, "args": { "External id": 265104, "cbid": 211, "correlation": 265104 } }, { "ph": "s", "id": 265104, "pid": 76337, "tid": -914061504, "ts": 1716454225614962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225672940, "dur": 6, "args": { "External id": 265106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265106, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265106, "pid": 5, "tid": 7, "ts": 1716454225672940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225614972, "dur": 14, "args": { "External id": 265106, "cbid": 211, "correlation": 265106 } }, { "ph": "s", "id": 265106, "pid": 76337, "tid": -914061504, "ts": 1716454225614972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225672948, "dur": 6, "args": { "External id": 265112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265112, "pid": 5, "tid": 7, "ts": 1716454225672948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615010, "dur": 9, "args": { "External id": 265112, "cbid": 211, "correlation": 265112 } }, { "ph": "s", "id": 265112, "pid": 76337, "tid": -914061504, "ts": 1716454225615010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225672955, "dur": 3, "args": { "External id": 265120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265120, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 265120, "pid": 5, "tid": 7, "ts": 1716454225672955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615054, "dur": 11, "args": { "External id": 265120, "cbid": 211, "correlation": 265120 } }, { "ph": "s", "id": 265120, "pid": 76337, "tid": -914061504, "ts": 1716454225615054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225615117, "dur": 1, "args": { "External id": 265136, "cbid": 251, "correlation": 265136 } }, { "ph": "f", "id": 265136, "pid": 76337, "tid": -914061504, "ts": 1716454225615117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225615122, "dur": 0, "args": { "External id": 265138, "cbid": 251, "correlation": 265138 } }, { "ph": "f", "id": 265138, "pid": 76337, "tid": -914061504, "ts": 1716454225615122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225672960, "dur": 13, "args": { "External id": 265139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265139, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265139, "pid": 5, "tid": 7, "ts": 1716454225672960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615124, "dur": 11, "args": { "External id": 265139, "cbid": 211, "correlation": 265139 } }, { "ph": "s", "id": 265139, "pid": 76337, "tid": -914061504, "ts": 1716454225615124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225672974, "dur": 5, "args": { "External id": 265141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265141, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265141, "pid": 5, "tid": 7, "ts": 1716454225672974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615137, "dur": 5, "args": { "External id": 265141, "cbid": 211, "correlation": 265141 } }, { "ph": "s", "id": 265141, "pid": 76337, "tid": -914061504, "ts": 1716454225615137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225672980, "dur": 6, "args": { "External id": 265151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265151, "pid": 5, "tid": 7, "ts": 1716454225672980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615194, "dur": 12, "args": { "External id": 265151, "cbid": 211, "correlation": 265151 } }, { "ph": "s", "id": 265151, "pid": 76337, "tid": -914061504, "ts": 1716454225615194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225672988, "dur": 10, "args": { "External id": 265171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265171, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 265171, "pid": 5, "tid": 7, "ts": 1716454225672988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615259, "dur": 11, "args": { "External id": 265171, "cbid": 211, "correlation": 265171 } }, { "ph": "s", "id": 265171, "pid": 76337, "tid": -914061504, "ts": 1716454225615259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225672999, "dur": 4, "args": { "External id": 265183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265183, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 265183, "pid": 5, "tid": 7, "ts": 1716454225672999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615279, "dur": 6, "args": { "External id": 265183, "cbid": 211, "correlation": 265183 } }, { "ph": "s", "id": 265183, "pid": 76337, "tid": -914061504, "ts": 1716454225615279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225673004, "dur": 7, "args": { "External id": 265186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265186, "pid": 5, "tid": 7, "ts": 1716454225673004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615298, "dur": 6, "args": { "External id": 265186, "cbid": 211, "correlation": 265186 } }, { "ph": "s", "id": 265186, "pid": 76337, "tid": -914061504, "ts": 1716454225615298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225673012, "dur": 5, "args": { "External id": 265195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265195, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265195, "pid": 5, "tid": 7, "ts": 1716454225673012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615337, "dur": 10, "args": { "External id": 265195, "cbid": 211, "correlation": 265195 } }, { "ph": "s", "id": 265195, "pid": 76337, "tid": -914061504, "ts": 1716454225615337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225615400, "dur": 0, "args": { "External id": 265205, "cbid": 317, "correlation": 265205 } }, { "ph": "f", "id": 265205, "pid": 76337, "tid": -914061504, "ts": 1716454225615400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225615401, "dur": 0, "args": { "External id": 265206, "cbid": 203, "correlation": 265206 } }, { "ph": "f", "id": 265206, "pid": 76337, "tid": -914061504, "ts": 1716454225615401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225615401, "dur": 0, "args": { "External id": 265207, "cbid": 205, "correlation": 265207 } }, { "ph": "f", "id": 265207, "pid": 76337, "tid": -914061504, "ts": 1716454225615401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673018, "dur": 5, "args": { "External id": 265211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265211, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265211, "pid": 5, "tid": 7, "ts": 1716454225673018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615416, "dur": 12, "args": { "External id": 265211, "cbid": 211, "correlation": 265211 } }, { "ph": "s", "id": 265211, "pid": 76337, "tid": -914061504, "ts": 1716454225615416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673025, "dur": 165, "args": { "External id": 265213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265213, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265213, "pid": 5, "tid": 7, "ts": 1716454225673025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615430, "dur": 5, "args": { "External id": 265213, "cbid": 211, "correlation": 265213 } }, { "ph": "s", "id": 265213, "pid": 76337, "tid": -914061504, "ts": 1716454225615430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225673192, "dur": 1, "args": { "External id": 265215, "device": 5, "context": 1, "stream": 7, "correlation": 265215, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 265215, "pid": 5, "tid": 7, "ts": 1716454225673192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225615441, "dur": 6, "args": { "External id": 265215, "cbid": 51, "correlation": 265215 } }, { "ph": "s", "id": 265215, "pid": 76337, "tid": -914061504, "ts": 1716454225615441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225673195, "dur": 264, "args": { "External id": 265216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265216, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265216, "pid": 5, "tid": 7, "ts": 1716454225673195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615448, "dur": 6, "args": { "External id": 265216, "cbid": 211, "correlation": 265216 } }, { "ph": "s", "id": 265216, "pid": 76337, "tid": -914061504, "ts": 1716454225615448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673460, "dur": 6, "args": { "External id": 265218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265218, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265218, "pid": 5, "tid": 7, "ts": 1716454225673460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615457, "dur": 6, "args": { "External id": 265218, "cbid": 211, "correlation": 265218 } }, { "ph": "s", "id": 265218, "pid": 76337, "tid": -914061504, "ts": 1716454225615457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225673468, "dur": 6, "args": { "External id": 265224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265224, "pid": 5, "tid": 7, "ts": 1716454225673468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615486, "dur": 8, "args": { "External id": 265224, "cbid": 211, "correlation": 265224 } }, { "ph": "s", "id": 265224, "pid": 76337, "tid": -914061504, "ts": 1716454225615486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225615544, "dur": 0, "args": { "External id": 265234, "cbid": 317, "correlation": 265234 } }, { "ph": "f", "id": 265234, "pid": 76337, "tid": -914061504, "ts": 1716454225615544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225615545, "dur": 0, "args": { "External id": 265235, "cbid": 203, "correlation": 265235 } }, { "ph": "f", "id": 265235, "pid": 76337, "tid": -914061504, "ts": 1716454225615545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225615545, "dur": 0, "args": { "External id": 265236, "cbid": 205, "correlation": 265236 } }, { "ph": "f", "id": 265236, "pid": 76337, "tid": -914061504, "ts": 1716454225615545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673475, "dur": 8, "args": { "External id": 265240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265240, "pid": 5, "tid": 7, "ts": 1716454225673475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615557, "dur": 12, "args": { "External id": 265240, "cbid": 211, "correlation": 265240 } }, { "ph": "s", "id": 265240, "pid": 76337, "tid": -914061504, "ts": 1716454225615557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225673484, "dur": 3, "args": { "External id": 265242, "device": 5, "context": 1, "stream": 7, "correlation": 265242, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 265242, "pid": 5, "tid": 7, "ts": 1716454225673484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225615574, "dur": 9, "args": { "External id": 265242, "cbid": 51, "correlation": 265242 } }, { "ph": "s", "id": 265242, "pid": 76337, "tid": -914061504, "ts": 1716454225615574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225673488, "dur": 95, "args": { "External id": 265243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265243, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 265243, "pid": 5, "tid": 7, "ts": 1716454225673488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615584, "dur": 6, "args": { "External id": 265243, "cbid": 211, "correlation": 265243 } }, { "ph": "s", "id": 265243, "pid": 76337, "tid": -914061504, "ts": 1716454225615584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673585, "dur": 5, "args": { "External id": 265245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265245, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265245, "pid": 5, "tid": 7, "ts": 1716454225673585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615594, "dur": 5, "args": { "External id": 265245, "cbid": 211, "correlation": 265245 } }, { "ph": "s", "id": 265245, "pid": 76337, "tid": -914061504, "ts": 1716454225615594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225673591, "dur": 6, "args": { "External id": 265251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265251, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265251, "pid": 5, "tid": 7, "ts": 1716454225673591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615621, "dur": 9, "args": { "External id": 265251, "cbid": 211, "correlation": 265251 } }, { "ph": "s", "id": 265251, "pid": 76337, "tid": -914061504, "ts": 1716454225615621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225673599, "dur": 5, "args": { "External id": 265259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265259, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265259, "pid": 5, "tid": 7, "ts": 1716454225673599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615650, "dur": 8, "args": { "External id": 265259, "cbid": 211, "correlation": 265259 } }, { "ph": "s", "id": 265259, "pid": 76337, "tid": -914061504, "ts": 1716454225615650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225673605, "dur": 4, "args": { "External id": 265267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265267, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 265267, "pid": 5, "tid": 7, "ts": 1716454225673605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615679, "dur": 8, "args": { "External id": 265267, "cbid": 211, "correlation": 265267 } }, { "ph": "s", "id": 265267, "pid": 76337, "tid": -914061504, "ts": 1716454225615679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225673611, "dur": 14, "args": { "External id": 265278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265278, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265278, "pid": 5, "tid": 7, "ts": 1716454225673611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615760, "dur": 13, "args": { "External id": 265278, "cbid": 211, "correlation": 265278 } }, { "ph": "s", "id": 265278, "pid": 76337, "tid": -914061504, "ts": 1716454225615760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225615817, "dur": 0, "args": { "External id": 265288, "cbid": 317, "correlation": 265288 } }, { "ph": "f", "id": 265288, "pid": 76337, "tid": -914061504, "ts": 1716454225615817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225615818, "dur": 0, "args": { "External id": 265289, "cbid": 203, "correlation": 265289 } }, { "ph": "f", "id": 265289, "pid": 76337, "tid": -914061504, "ts": 1716454225615818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225615819, "dur": 0, "args": { "External id": 265290, "cbid": 205, "correlation": 265290 } }, { "ph": "f", "id": 265290, "pid": 76337, "tid": -914061504, "ts": 1716454225615819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673626, "dur": 9, "args": { "External id": 265294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265294, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265294, "pid": 5, "tid": 7, "ts": 1716454225673626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615832, "dur": 11, "args": { "External id": 265294, "cbid": 211, "correlation": 265294 } }, { "ph": "s", "id": 265294, "pid": 76337, "tid": -914061504, "ts": 1716454225615832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225673637, "dur": 166, "args": { "External id": 265296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265296, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265296, "pid": 5, "tid": 7, "ts": 1716454225673637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615846, "dur": 5, "args": { "External id": 265296, "cbid": 211, "correlation": 265296 } }, { "ph": "s", "id": 265296, "pid": 76337, "tid": -914061504, "ts": 1716454225615846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225673805, "dur": 1, "args": { "External id": 265298, "device": 5, "context": 1, "stream": 7, "correlation": 265298, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 265298, "pid": 5, "tid": 7, "ts": 1716454225673805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225615857, "dur": 7, "args": { "External id": 265298, "cbid": 51, "correlation": 265298 } }, { "ph": "s", "id": 265298, "pid": 76337, "tid": -914061504, "ts": 1716454225615857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225673809, "dur": 659, "args": { "External id": 265299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265299, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265299, "pid": 5, "tid": 7, "ts": 1716454225673809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615865, "dur": 6, "args": { "External id": 265299, "cbid": 211, "correlation": 265299 } }, { "ph": "s", "id": 265299, "pid": 76337, "tid": -914061504, "ts": 1716454225615865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225674469, "dur": 12, "args": { "External id": 265301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265301, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265301, "pid": 5, "tid": 7, "ts": 1716454225674469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615875, "dur": 5, "args": { "External id": 265301, "cbid": 211, "correlation": 265301 } }, { "ph": "s", "id": 265301, "pid": 76337, "tid": -914061504, "ts": 1716454225615875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225674482, "dur": 15, "args": { "External id": 265307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265307, "pid": 5, "tid": 7, "ts": 1716454225674482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225615904, "dur": 10, "args": { "External id": 265307, "cbid": 211, "correlation": 265307 } }, { "ph": "s", "id": 265307, "pid": 76337, "tid": -914061504, "ts": 1716454225615904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225674498, "dur": 31, "args": { "External id": 265316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265316, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265316, "pid": 5, "tid": 7, "ts": 1716454225674498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616003, "dur": 14, "args": { "External id": 265316, "cbid": 211, "correlation": 265316 } }, { "ph": "s", "id": 265316, "pid": 76337, "tid": -914061504, "ts": 1716454225616003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225674531, "dur": 30, "args": { "External id": 265336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265336, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 265336, "pid": 5, "tid": 7, "ts": 1716454225674531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616074, "dur": 11, "args": { "External id": 265336, "cbid": 211, "correlation": 265336 } }, { "ph": "s", "id": 265336, "pid": 76337, "tid": -914061504, "ts": 1716454225616074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225674562, "dur": 4, "args": { "External id": 265348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265348, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265348, "pid": 5, "tid": 7, "ts": 1716454225674562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616094, "dur": 6, "args": { "External id": 265348, "cbid": 211, "correlation": 265348 } }, { "ph": "s", "id": 265348, "pid": 76337, "tid": -914061504, "ts": 1716454225616094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225674568, "dur": 30, "args": { "External id": 265351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265351, "pid": 5, "tid": 7, "ts": 1716454225674568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616113, "dur": 7, "args": { "External id": 265351, "cbid": 211, "correlation": 265351 } }, { "ph": "s", "id": 265351, "pid": 76337, "tid": -914061504, "ts": 1716454225616113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225674599, "dur": 21, "args": { "External id": 265360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265360, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265360, "pid": 5, "tid": 7, "ts": 1716454225674599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616152, "dur": 9, "args": { "External id": 265360, "cbid": 211, "correlation": 265360 } }, { "ph": "s", "id": 265360, "pid": 76337, "tid": -914061504, "ts": 1716454225616152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225616204, "dur": 0, "args": { "External id": 265370, "cbid": 317, "correlation": 265370 } }, { "ph": "f", "id": 265370, "pid": 76337, "tid": -914061504, "ts": 1716454225616204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225616205, "dur": 0, "args": { "External id": 265371, "cbid": 203, "correlation": 265371 } }, { "ph": "f", "id": 265371, "pid": 76337, "tid": -914061504, "ts": 1716454225616205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225616205, "dur": 0, "args": { "External id": 265372, "cbid": 205, "correlation": 265372 } }, { "ph": "f", "id": 265372, "pid": 76337, "tid": -914061504, "ts": 1716454225616205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225674621, "dur": 22, "args": { "External id": 265376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265376, "pid": 5, "tid": 7, "ts": 1716454225674621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616221, "dur": 12, "args": { "External id": 265376, "cbid": 211, "correlation": 265376 } }, { "ph": "s", "id": 265376, "pid": 76337, "tid": -914061504, "ts": 1716454225616221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225674644, "dur": 325, "args": { "External id": 265378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265378, "pid": 5, "tid": 7, "ts": 1716454225674644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616235, "dur": 5, "args": { "External id": 265378, "cbid": 211, "correlation": 265378 } }, { "ph": "s", "id": 265378, "pid": 76337, "tid": -914061504, "ts": 1716454225616235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225674972, "dur": 1, "args": { "External id": 265380, "device": 5, "context": 1, "stream": 7, "correlation": 265380, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 265380, "pid": 5, "tid": 7, "ts": 1716454225674972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225616246, "dur": 6, "args": { "External id": 265380, "cbid": 51, "correlation": 265380 } }, { "ph": "s", "id": 265380, "pid": 76337, "tid": -914061504, "ts": 1716454225616246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225674975, "dur": 1262, "args": { "External id": 265381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265381, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265381, "pid": 5, "tid": 7, "ts": 1716454225674975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616253, "dur": 6, "args": { "External id": 265381, "cbid": 211, "correlation": 265381 } }, { "ph": "s", "id": 265381, "pid": 76337, "tid": -914061504, "ts": 1716454225616253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225676239, "dur": 12, "args": { "External id": 265383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265383, "pid": 5, "tid": 7, "ts": 1716454225676239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616262, "dur": 5, "args": { "External id": 265383, "cbid": 211, "correlation": 265383 } }, { "ph": "s", "id": 265383, "pid": 76337, "tid": -914061504, "ts": 1716454225616262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225676253, "dur": 15, "args": { "External id": 265389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265389, "pid": 5, "tid": 7, "ts": 1716454225676253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616291, "dur": 9, "args": { "External id": 265389, "cbid": 211, "correlation": 265389 } }, { "ph": "s", "id": 265389, "pid": 76337, "tid": -914061504, "ts": 1716454225616291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225676269, "dur": 3, "args": { "External id": 265397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265397, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 265397, "pid": 5, "tid": 7, "ts": 1716454225676269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616335, "dur": 9, "args": { "External id": 265397, "cbid": 211, "correlation": 265397 } }, { "ph": "s", "id": 265397, "pid": 76337, "tid": -914061504, "ts": 1716454225616335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225616399, "dur": 1, "args": { "External id": 265413, "cbid": 251, "correlation": 265413 } }, { "ph": "f", "id": 265413, "pid": 76337, "tid": -914061504, "ts": 1716454225616399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225616405, "dur": 0, "args": { "External id": 265415, "cbid": 251, "correlation": 265415 } }, { "ph": "f", "id": 265415, "pid": 76337, "tid": -914061504, "ts": 1716454225616405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225676273, "dur": 12, "args": { "External id": 265416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265416, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265416, "pid": 5, "tid": 7, "ts": 1716454225676273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616406, "dur": 11, "args": { "External id": 265416, "cbid": 211, "correlation": 265416 } }, { "ph": "s", "id": 265416, "pid": 76337, "tid": -914061504, "ts": 1716454225616406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225676287, "dur": 5, "args": { "External id": 265418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265418, "pid": 5, "tid": 7, "ts": 1716454225676287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616420, "dur": 6, "args": { "External id": 265418, "cbid": 211, "correlation": 265418 } }, { "ph": "s", "id": 265418, "pid": 76337, "tid": -914061504, "ts": 1716454225616420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225676293, "dur": 18, "args": { "External id": 265428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265428, "pid": 5, "tid": 7, "ts": 1716454225676293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616478, "dur": 12, "args": { "External id": 265428, "cbid": 211, "correlation": 265428 } }, { "ph": "s", "id": 265428, "pid": 76337, "tid": -914061504, "ts": 1716454225616478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225676313, "dur": 18, "args": { "External id": 265448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265448, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 265448, "pid": 5, "tid": 7, "ts": 1716454225676313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616543, "dur": 10, "args": { "External id": 265448, "cbid": 211, "correlation": 265448 } }, { "ph": "s", "id": 265448, "pid": 76337, "tid": -914061504, "ts": 1716454225616543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225676331, "dur": 4, "args": { "External id": 265460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265460, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 265460, "pid": 5, "tid": 7, "ts": 1716454225676331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616563, "dur": 7, "args": { "External id": 265460, "cbid": 211, "correlation": 265460 } }, { "ph": "s", "id": 265460, "pid": 76337, "tid": -914061504, "ts": 1716454225616563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225676337, "dur": 16, "args": { "External id": 265463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265463, "pid": 5, "tid": 7, "ts": 1716454225676337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616582, "dur": 6, "args": { "External id": 265463, "cbid": 211, "correlation": 265463 } }, { "ph": "s", "id": 265463, "pid": 76337, "tid": -914061504, "ts": 1716454225616582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225676355, "dur": 11, "args": { "External id": 265472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265472, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265472, "pid": 5, "tid": 7, "ts": 1716454225676355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616621, "dur": 10, "args": { "External id": 265472, "cbid": 211, "correlation": 265472 } }, { "ph": "s", "id": 265472, "pid": 76337, "tid": -914061504, "ts": 1716454225616621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225616683, "dur": 0, "args": { "External id": 265482, "cbid": 317, "correlation": 265482 } }, { "ph": "f", "id": 265482, "pid": 76337, "tid": -914061504, "ts": 1716454225616683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225616684, "dur": 0, "args": { "External id": 265483, "cbid": 203, "correlation": 265483 } }, { "ph": "f", "id": 265483, "pid": 76337, "tid": -914061504, "ts": 1716454225616684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225616685, "dur": 0, "args": { "External id": 265484, "cbid": 205, "correlation": 265484 } }, { "ph": "f", "id": 265484, "pid": 76337, "tid": -914061504, "ts": 1716454225616685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225676367, "dur": 11, "args": { "External id": 265488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265488, "pid": 5, "tid": 7, "ts": 1716454225676367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616698, "dur": 12, "args": { "External id": 265488, "cbid": 211, "correlation": 265488 } }, { "ph": "s", "id": 265488, "pid": 76337, "tid": -914061504, "ts": 1716454225616698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225676379, "dur": 166, "args": { "External id": 265490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265490, "pid": 5, "tid": 7, "ts": 1716454225676379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616712, "dur": 5, "args": { "External id": 265490, "cbid": 211, "correlation": 265490 } }, { "ph": "s", "id": 265490, "pid": 76337, "tid": -914061504, "ts": 1716454225616712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225676548, "dur": 1, "args": { "External id": 265492, "device": 5, "context": 1, "stream": 7, "correlation": 265492, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 265492, "pid": 5, "tid": 7, "ts": 1716454225676548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225616724, "dur": 8, "args": { "External id": 265492, "cbid": 51, "correlation": 265492 } }, { "ph": "s", "id": 265492, "pid": 76337, "tid": -914061504, "ts": 1716454225616724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225676552, "dur": 659, "args": { "External id": 265493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265493, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265493, "pid": 5, "tid": 7, "ts": 1716454225676552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616732, "dur": 6, "args": { "External id": 265493, "cbid": 211, "correlation": 265493 } }, { "ph": "s", "id": 265493, "pid": 76337, "tid": -914061504, "ts": 1716454225616732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225677212, "dur": 14, "args": { "External id": 265495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265495, "pid": 5, "tid": 7, "ts": 1716454225677212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616742, "dur": 5, "args": { "External id": 265495, "cbid": 211, "correlation": 265495 } }, { "ph": "s", "id": 265495, "pid": 76337, "tid": -914061504, "ts": 1716454225616742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225677228, "dur": 15, "args": { "External id": 265501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265501, "pid": 5, "tid": 7, "ts": 1716454225677228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616770, "dur": 10, "args": { "External id": 265501, "cbid": 211, "correlation": 265501 } }, { "ph": "s", "id": 265501, "pid": 76337, "tid": -914061504, "ts": 1716454225616770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225616828, "dur": 0, "args": { "External id": 265511, "cbid": 317, "correlation": 265511 } }, { "ph": "f", "id": 265511, "pid": 76337, "tid": -914061504, "ts": 1716454225616828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225616829, "dur": 0, "args": { "External id": 265512, "cbid": 203, "correlation": 265512 } }, { "ph": "f", "id": 265512, "pid": 76337, "tid": -914061504, "ts": 1716454225616829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225616830, "dur": 0, "args": { "External id": 265513, "cbid": 205, "correlation": 265513 } }, { "ph": "f", "id": 265513, "pid": 76337, "tid": -914061504, "ts": 1716454225616830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225677244, "dur": 21, "args": { "External id": 265517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265517, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265517, "pid": 5, "tid": 7, "ts": 1716454225677244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616844, "dur": 11, "args": { "External id": 265517, "cbid": 211, "correlation": 265517 } }, { "ph": "s", "id": 265517, "pid": 76337, "tid": -914061504, "ts": 1716454225616844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225677267, "dur": 4, "args": { "External id": 265519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265519, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265519, "pid": 5, "tid": 7, "ts": 1716454225677267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616860, "dur": 6, "args": { "External id": 265519, "cbid": 211, "correlation": 265519 } }, { "ph": "s", "id": 265519, "pid": 76337, "tid": -914061504, "ts": 1716454225616860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225616869, "dur": 0, "args": { "External id": 265520, "cbid": 51, "correlation": 265520 } }, { "ph": "s", "id": 265520, "pid": 76337, "tid": -914061504, "ts": 1716454225616869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225677272, "dur": 177, "args": { "External id": 265521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265521, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 265521, "pid": 5, "tid": 7, "ts": 1716454225677272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616870, "dur": 5, "args": { "External id": 265521, "cbid": 211, "correlation": 265521 } }, { "ph": "s", "id": 265521, "pid": 76337, "tid": -914061504, "ts": 1716454225616870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225677450, "dur": 16, "args": { "External id": 265526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265526, "pid": 5, "tid": 7, "ts": 1716454225677450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616897, "dur": 8, "args": { "External id": 265526, "cbid": 211, "correlation": 265526 } }, { "ph": "s", "id": 265526, "pid": 76337, "tid": -914061504, "ts": 1716454225616897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225677468, "dur": 12, "args": { "External id": 265534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265534, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265534, "pid": 5, "tid": 7, "ts": 1716454225677468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616925, "dur": 8, "args": { "External id": 265534, "cbid": 211, "correlation": 265534 } }, { "ph": "s", "id": 265534, "pid": 76337, "tid": -914061504, "ts": 1716454225616925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225677481, "dur": 10, "args": { "External id": 265542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265542, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265542, "pid": 5, "tid": 7, "ts": 1716454225677481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225616954, "dur": 8, "args": { "External id": 265542, "cbid": 211, "correlation": 265542 } }, { "ph": "s", "id": 265542, "pid": 76337, "tid": -914061504, "ts": 1716454225616954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225677492, "dur": 18, "args": { "External id": 265562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265562, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 265562, "pid": 5, "tid": 7, "ts": 1716454225677492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617045, "dur": 13, "args": { "External id": 265562, "cbid": 211, "correlation": 265562 } }, { "ph": "s", "id": 265562, "pid": 76337, "tid": -914061504, "ts": 1716454225617045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225677512, "dur": 5, "args": { "External id": 265574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265574, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 265574, "pid": 5, "tid": 7, "ts": 1716454225677512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617068, "dur": 6, "args": { "External id": 265574, "cbid": 211, "correlation": 265574 } }, { "ph": "s", "id": 265574, "pid": 76337, "tid": -914061504, "ts": 1716454225617068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225677518, "dur": 17, "args": { "External id": 265577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265577, "pid": 5, "tid": 7, "ts": 1716454225677518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617086, "dur": 7, "args": { "External id": 265577, "cbid": 211, "correlation": 265577 } }, { "ph": "s", "id": 265577, "pid": 76337, "tid": -914061504, "ts": 1716454225617086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225617144, "dur": 0, "args": { "External id": 265588, "cbid": 317, "correlation": 265588 } }, { "ph": "f", "id": 265588, "pid": 76337, "tid": -914061504, "ts": 1716454225617144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225617145, "dur": 0, "args": { "External id": 265589, "cbid": 203, "correlation": 265589 } }, { "ph": "f", "id": 265589, "pid": 76337, "tid": -914061504, "ts": 1716454225617145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225617146, "dur": 0, "args": { "External id": 265590, "cbid": 205, "correlation": 265590 } }, { "ph": "f", "id": 265590, "pid": 76337, "tid": -914061504, "ts": 1716454225617146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225677536, "dur": 11, "args": { "External id": 265594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265594, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265594, "pid": 5, "tid": 7, "ts": 1716454225677536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617159, "dur": 12, "args": { "External id": 265594, "cbid": 211, "correlation": 265594 } }, { "ph": "s", "id": 265594, "pid": 76337, "tid": -914061504, "ts": 1716454225617159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225677549, "dur": 3, "args": { "External id": 265596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265596, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265596, "pid": 5, "tid": 7, "ts": 1716454225677549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617175, "dur": 7, "args": { "External id": 265596, "cbid": 211, "correlation": 265596 } }, { "ph": "s", "id": 265596, "pid": 76337, "tid": -914061504, "ts": 1716454225617175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225617184, "dur": 0, "args": { "External id": 265597, "cbid": 51, "correlation": 265597 } }, { "ph": "s", "id": 265597, "pid": 76337, "tid": -914061504, "ts": 1716454225617184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225677554, "dur": 92, "args": { "External id": 265598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265598, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 265598, "pid": 5, "tid": 7, "ts": 1716454225677554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617185, "dur": 5, "args": { "External id": 265598, "cbid": 211, "correlation": 265598 } }, { "ph": "s", "id": 265598, "pid": 76337, "tid": -914061504, "ts": 1716454225617185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225677648, "dur": 16, "args": { "External id": 265603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265603, "pid": 5, "tid": 7, "ts": 1716454225677648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617212, "dur": 8, "args": { "External id": 265603, "cbid": 211, "correlation": 265603 } }, { "ph": "s", "id": 265603, "pid": 76337, "tid": -914061504, "ts": 1716454225617212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225677665, "dur": 84, "args": { "External id": 265612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265612, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265612, "pid": 5, "tid": 7, "ts": 1716454225677665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617294, "dur": 15, "args": { "External id": 265612, "cbid": 211, "correlation": 265612 } }, { "ph": "s", "id": 265612, "pid": 76337, "tid": -914061504, "ts": 1716454225617294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225677751, "dur": 30, "args": { "External id": 265634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265634, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265634, "pid": 5, "tid": 7, "ts": 1716454225677751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617353, "dur": 10, "args": { "External id": 265634, "cbid": 211, "correlation": 265634 } }, { "ph": "s", "id": 265634, "pid": 76337, "tid": -914061504, "ts": 1716454225617353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225617445, "dur": 2, "args": { "External id": 265645, "cbid": 251, "correlation": 265645 } }, { "ph": "f", "id": 265645, "pid": 76337, "tid": -914061504, "ts": 1716454225617445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225677782, "dur": 165, "args": { "External id": 265646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265646, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265646, "pid": 5, "tid": 7, "ts": 1716454225677782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617451, "dur": 13, "args": { "External id": 265646, "cbid": 211, "correlation": 265646 } }, { "ph": "s", "id": 265646, "pid": 76337, "tid": -914061504, "ts": 1716454225617451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225617522, "dur": 1, "args": { "External id": 265657, "cbid": 251, "correlation": 265657 } }, { "ph": "f", "id": 265657, "pid": 76337, "tid": -914061504, "ts": 1716454225617522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225677948, "dur": 162, "args": { "External id": 265658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265658, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265658, "pid": 5, "tid": 7, "ts": 1716454225677948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617526, "dur": 12, "args": { "External id": 265658, "cbid": 211, "correlation": 265658 } }, { "ph": "s", "id": 265658, "pid": 76337, "tid": -914061504, "ts": 1716454225617526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225617593, "dur": 1, "args": { "External id": 265669, "cbid": 251, "correlation": 265669 } }, { "ph": "f", "id": 265669, "pid": 76337, "tid": -914061504, "ts": 1716454225617593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225678111, "dur": 160, "args": { "External id": 265670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265670, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265670, "pid": 5, "tid": 7, "ts": 1716454225678111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617597, "dur": 11, "args": { "External id": 265670, "cbid": 211, "correlation": 265670 } }, { "ph": "s", "id": 265670, "pid": 76337, "tid": -914061504, "ts": 1716454225617597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225678273, "dur": 341, "args": { "External id": 265695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265695, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265695, "pid": 5, "tid": 7, "ts": 1716454225678273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617684, "dur": 13, "args": { "External id": 265695, "cbid": 211, "correlation": 265695 } }, { "ph": "s", "id": 265695, "pid": 76337, "tid": -914061504, "ts": 1716454225617684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225617785, "dur": 1, "args": { "External id": 265713, "cbid": 251, "correlation": 265713 } }, { "ph": "f", "id": 265713, "pid": 76337, "tid": -914061504, "ts": 1716454225617785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225678615, "dur": 170, "args": { "External id": 265715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265715, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265715, "pid": 5, "tid": 7, "ts": 1716454225678615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617791, "dur": 13, "args": { "External id": 265715, "cbid": 211, "correlation": 265715 } }, { "ph": "s", "id": 265715, "pid": 76337, "tid": -914061504, "ts": 1716454225617791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225678787, "dur": 19, "args": { "External id": 265723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265723, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265723, "pid": 5, "tid": 7, "ts": 1716454225678787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617860, "dur": 12, "args": { "External id": 265723, "cbid": 211, "correlation": 265723 } }, { "ph": "s", "id": 265723, "pid": 76337, "tid": -914061504, "ts": 1716454225617860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225678807, "dur": 27, "args": { "External id": 265731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265731, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265731, "pid": 5, "tid": 7, "ts": 1716454225678807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617899, "dur": 9, "args": { "External id": 265731, "cbid": 211, "correlation": 265731 } }, { "ph": "s", "id": 265731, "pid": 76337, "tid": -914061504, "ts": 1716454225617899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225678836, "dur": 19, "args": { "External id": 265742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265742, "pid": 5, "tid": 7, "ts": 1716454225678836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225617972, "dur": 21, "args": { "External id": 265742, "cbid": 211, "correlation": 265742 } }, { "ph": "s", "id": 265742, "pid": 76337, "tid": -914061504, "ts": 1716454225617972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225678856, "dur": 16, "args": { "External id": 265764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265764, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265764, "pid": 5, "tid": 7, "ts": 1716454225678856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618011, "dur": 8, "args": { "External id": 265764, "cbid": 211, "correlation": 265764 } }, { "ph": "s", "id": 265764, "pid": 76337, "tid": -914061504, "ts": 1716454225618011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618098, "dur": 1, "args": { "External id": 265775, "cbid": 251, "correlation": 265775 } }, { "ph": "f", "id": 265775, "pid": 76337, "tid": -914061504, "ts": 1716454225618098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225678873, "dur": 90, "args": { "External id": 265776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265776, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265776, "pid": 5, "tid": 7, "ts": 1716454225678873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618103, "dur": 13, "args": { "External id": 265776, "cbid": 211, "correlation": 265776 } }, { "ph": "s", "id": 265776, "pid": 76337, "tid": -914061504, "ts": 1716454225618103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618172, "dur": 1, "args": { "External id": 265787, "cbid": 251, "correlation": 265787 } }, { "ph": "f", "id": 265787, "pid": 76337, "tid": -914061504, "ts": 1716454225618172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618175, "dur": 0, "args": { "External id": 265788, "cbid": 251, "correlation": 265788 } }, { "ph": "f", "id": 265788, "pid": 76337, "tid": -914061504, "ts": 1716454225618175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225678965, "dur": 13, "args": { "External id": 265789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265789, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265789, "pid": 5, "tid": 7, "ts": 1716454225678965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618177, "dur": 12, "args": { "External id": 265789, "cbid": 211, "correlation": 265789 } }, { "ph": "s", "id": 265789, "pid": 76337, "tid": -914061504, "ts": 1716454225618177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225678979, "dur": 6, "args": { "External id": 265791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265791, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265791, "pid": 5, "tid": 7, "ts": 1716454225678979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618190, "dur": 6, "args": { "External id": 265791, "cbid": 211, "correlation": 265791 } }, { "ph": "s", "id": 265791, "pid": 76337, "tid": -914061504, "ts": 1716454225618190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618247, "dur": 1, "args": { "External id": 265802, "cbid": 251, "correlation": 265802 } }, { "ph": "f", "id": 265802, "pid": 76337, "tid": -914061504, "ts": 1716454225618247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618250, "dur": 0, "args": { "External id": 265803, "cbid": 251, "correlation": 265803 } }, { "ph": "f", "id": 265803, "pid": 76337, "tid": -914061504, "ts": 1716454225618250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225678986, "dur": 8, "args": { "External id": 265804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265804, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265804, "pid": 5, "tid": 7, "ts": 1716454225678986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618252, "dur": 11, "args": { "External id": 265804, "cbid": 211, "correlation": 265804 } }, { "ph": "s", "id": 265804, "pid": 76337, "tid": -914061504, "ts": 1716454225618252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225678995, "dur": 3, "args": { "External id": 265806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265806, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265806, "pid": 5, "tid": 7, "ts": 1716454225678995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618265, "dur": 5, "args": { "External id": 265806, "cbid": 211, "correlation": 265806 } }, { "ph": "s", "id": 265806, "pid": 76337, "tid": -914061504, "ts": 1716454225618265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225679000, "dur": 57, "args": { "External id": 265831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265831, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265831, "pid": 5, "tid": 7, "ts": 1716454225679000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618342, "dur": 13, "args": { "External id": 265831, "cbid": 211, "correlation": 265831 } }, { "ph": "s", "id": 265831, "pid": 76337, "tid": -914061504, "ts": 1716454225618342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618441, "dur": 2, "args": { "External id": 265849, "cbid": 251, "correlation": 265849 } }, { "ph": "f", "id": 265849, "pid": 76337, "tid": -914061504, "ts": 1716454225618441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225679058, "dur": 92, "args": { "External id": 265851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265851, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265851, "pid": 5, "tid": 7, "ts": 1716454225679058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618447, "dur": 13, "args": { "External id": 265851, "cbid": 211, "correlation": 265851 } }, { "ph": "s", "id": 265851, "pid": 76337, "tid": -914061504, "ts": 1716454225618447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225679152, "dur": 10, "args": { "External id": 265859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265859, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265859, "pid": 5, "tid": 7, "ts": 1716454225679152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618516, "dur": 12, "args": { "External id": 265859, "cbid": 211, "correlation": 265859 } }, { "ph": "s", "id": 265859, "pid": 76337, "tid": -914061504, "ts": 1716454225618516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225679162, "dur": 20, "args": { "External id": 265867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265867, "pid": 5, "tid": 7, "ts": 1716454225679162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618557, "dur": 9, "args": { "External id": 265867, "cbid": 211, "correlation": 265867 } }, { "ph": "s", "id": 265867, "pid": 76337, "tid": -914061504, "ts": 1716454225618557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225679184, "dur": 17, "args": { "External id": 265889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265889, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265889, "pid": 5, "tid": 7, "ts": 1716454225679184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618608, "dur": 10, "args": { "External id": 265889, "cbid": 211, "correlation": 265889 } }, { "ph": "s", "id": 265889, "pid": 76337, "tid": -914061504, "ts": 1716454225618608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618694, "dur": 1, "args": { "External id": 265905, "cbid": 251, "correlation": 265905 } }, { "ph": "f", "id": 265905, "pid": 76337, "tid": -914061504, "ts": 1716454225618694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618700, "dur": 0, "args": { "External id": 265907, "cbid": 251, "correlation": 265907 } }, { "ph": "f", "id": 265907, "pid": 76337, "tid": -914061504, "ts": 1716454225618700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225679203, "dur": 498, "args": { "External id": 265908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265908, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 265908, "pid": 5, "tid": 7, "ts": 1716454225679203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618702, "dur": 13, "args": { "External id": 265908, "cbid": 211, "correlation": 265908 } }, { "ph": "s", "id": 265908, "pid": 76337, "tid": -914061504, "ts": 1716454225618702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225679703, "dur": 67, "args": { "External id": 265916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265916, "pid": 5, "tid": 7, "ts": 1716454225679703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618768, "dur": 12, "args": { "External id": 265916, "cbid": 211, "correlation": 265916 } }, { "ph": "s", "id": 265916, "pid": 76337, "tid": -914061504, "ts": 1716454225618768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225679770, "dur": 68, "args": { "External id": 265924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265924, "pid": 5, "tid": 7, "ts": 1716454225679770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618801, "dur": 8, "args": { "External id": 265924, "cbid": 211, "correlation": 265924 } }, { "ph": "s", "id": 265924, "pid": 76337, "tid": -914061504, "ts": 1716454225618801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225618880, "dur": 1, "args": { "External id": 265940, "cbid": 251, "correlation": 265940 } }, { "ph": "f", "id": 265940, "pid": 76337, "tid": -914061504, "ts": 1716454225618880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225679841, "dur": 1, "args": { "External id": 265942, "device": 5, "context": 1, "stream": 7, "correlation": 265942, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 265942, "pid": 5, "tid": 7, "ts": 1716454225679841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225618885, "dur": 10, "args": { "External id": 265942, "cbid": 51, "correlation": 265942 } }, { "ph": "s", "id": 265942, "pid": 76337, "tid": -914061504, "ts": 1716454225618885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225679844, "dur": 274, "args": { "External id": 265943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265943, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265943, "pid": 5, "tid": 7, "ts": 1716454225679844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618897, "dur": 11, "args": { "External id": 265943, "cbid": 211, "correlation": 265943 } }, { "ph": "s", "id": 265943, "pid": 76337, "tid": -914061504, "ts": 1716454225618897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225680120, "dur": 14, "args": { "External id": 265951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265951, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265951, "pid": 5, "tid": 7, "ts": 1716454225680120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225618939, "dur": 11, "args": { "External id": 265951, "cbid": 211, "correlation": 265951 } }, { "ph": "s", "id": 265951, "pid": 76337, "tid": -914061504, "ts": 1716454225618939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225680136, "dur": 39, "args": { "External id": 265962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265962, "pid": 5, "tid": 7, "ts": 1716454225680136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619015, "dur": 13, "args": { "External id": 265962, "cbid": 211, "correlation": 265962 } }, { "ph": "s", "id": 265962, "pid": 76337, "tid": -914061504, "ts": 1716454225619015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225619081, "dur": 0, "args": { "External id": 265974, "cbid": 317, "correlation": 265974 } }, { "ph": "f", "id": 265974, "pid": 76337, "tid": -914061504, "ts": 1716454225619081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225619082, "dur": 0, "args": { "External id": 265975, "cbid": 203, "correlation": 265975 } }, { "ph": "f", "id": 265975, "pid": 76337, "tid": -914061504, "ts": 1716454225619082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225619082, "dur": 0, "args": { "External id": 265976, "cbid": 205, "correlation": 265976 } }, { "ph": "f", "id": 265976, "pid": 76337, "tid": -914061504, "ts": 1716454225619082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225680177, "dur": 14, "args": { "External id": 265980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265980, "pid": 5, "tid": 7, "ts": 1716454225680177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619098, "dur": 12, "args": { "External id": 265980, "cbid": 211, "correlation": 265980 } }, { "ph": "s", "id": 265980, "pid": 76337, "tid": -914061504, "ts": 1716454225619098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225680192, "dur": 4, "args": { "External id": 265982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 265982, "pid": 5, "tid": 7, "ts": 1716454225680192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619115, "dur": 5, "args": { "External id": 265982, "cbid": 211, "correlation": 265982 } }, { "ph": "s", "id": 265982, "pid": 76337, "tid": -914061504, "ts": 1716454225619115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225619123, "dur": 0, "args": { "External id": 265983, "cbid": 51, "correlation": 265983 } }, { "ph": "s", "id": 265983, "pid": 76337, "tid": -914061504, "ts": 1716454225619123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225680197, "dur": 98, "args": { "External id": 265984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265984, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 265984, "pid": 5, "tid": 7, "ts": 1716454225680197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619124, "dur": 5, "args": { "External id": 265984, "cbid": 211, "correlation": 265984 } }, { "ph": "s", "id": 265984, "pid": 76337, "tid": -914061504, "ts": 1716454225619124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225680296, "dur": 17, "args": { "External id": 265989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265989, "pid": 5, "tid": 7, "ts": 1716454225680296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619151, "dur": 9, "args": { "External id": 265989, "cbid": 211, "correlation": 265989 } }, { "ph": "s", "id": 265989, "pid": 76337, "tid": -914061504, "ts": 1716454225619151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225680315, "dur": 12, "args": { "External id": 265997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 265997, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 265997, "pid": 5, "tid": 7, "ts": 1716454225680315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619183, "dur": 8, "args": { "External id": 265997, "cbid": 211, "correlation": 265997 } }, { "ph": "s", "id": 265997, "pid": 76337, "tid": -914061504, "ts": 1716454225619183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225680329, "dur": 31, "args": { "External id": 266006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266006, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266006, "pid": 5, "tid": 7, "ts": 1716454225680329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619222, "dur": 10, "args": { "External id": 266006, "cbid": 211, "correlation": 266006 } }, { "ph": "s", "id": 266006, "pid": 76337, "tid": -914061504, "ts": 1716454225619222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225680361, "dur": 31, "args": { "External id": 266026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266026, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266026, "pid": 5, "tid": 7, "ts": 1716454225680361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619292, "dur": 12, "args": { "External id": 266026, "cbid": 211, "correlation": 266026 } }, { "ph": "s", "id": 266026, "pid": 76337, "tid": -914061504, "ts": 1716454225619292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225680394, "dur": 5, "args": { "External id": 266038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266038, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266038, "pid": 5, "tid": 7, "ts": 1716454225680394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619314, "dur": 6, "args": { "External id": 266038, "cbid": 211, "correlation": 266038 } }, { "ph": "s", "id": 266038, "pid": 76337, "tid": -914061504, "ts": 1716454225619314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225680400, "dur": 32, "args": { "External id": 266041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266041, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266041, "pid": 5, "tid": 7, "ts": 1716454225680400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619331, "dur": 7, "args": { "External id": 266041, "cbid": 211, "correlation": 266041 } }, { "ph": "s", "id": 266041, "pid": 76337, "tid": -914061504, "ts": 1716454225619331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225680433, "dur": 21, "args": { "External id": 266050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266050, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266050, "pid": 5, "tid": 7, "ts": 1716454225680433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619371, "dur": 9, "args": { "External id": 266050, "cbid": 211, "correlation": 266050 } }, { "ph": "s", "id": 266050, "pid": 76337, "tid": -914061504, "ts": 1716454225619371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225619423, "dur": 0, "args": { "External id": 266060, "cbid": 317, "correlation": 266060 } }, { "ph": "f", "id": 266060, "pid": 76337, "tid": -914061504, "ts": 1716454225619423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225619423, "dur": 0, "args": { "External id": 266061, "cbid": 203, "correlation": 266061 } }, { "ph": "f", "id": 266061, "pid": 76337, "tid": -914061504, "ts": 1716454225619423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225619424, "dur": 0, "args": { "External id": 266062, "cbid": 205, "correlation": 266062 } }, { "ph": "f", "id": 266062, "pid": 76337, "tid": -914061504, "ts": 1716454225619424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225680455, "dur": 23, "args": { "External id": 266066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266066, "pid": 5, "tid": 7, "ts": 1716454225680455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619437, "dur": 12, "args": { "External id": 266066, "cbid": 211, "correlation": 266066 } }, { "ph": "s", "id": 266066, "pid": 76337, "tid": -914061504, "ts": 1716454225619437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225680479, "dur": 326, "args": { "External id": 266068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266068, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266068, "pid": 5, "tid": 7, "ts": 1716454225680479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619452, "dur": 5, "args": { "External id": 266068, "cbid": 211, "correlation": 266068 } }, { "ph": "s", "id": 266068, "pid": 76337, "tid": -914061504, "ts": 1716454225619452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225680807, "dur": 1, "args": { "External id": 266070, "device": 5, "context": 1, "stream": 7, "correlation": 266070, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 266070, "pid": 5, "tid": 7, "ts": 1716454225680807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225619463, "dur": 6, "args": { "External id": 266070, "cbid": 51, "correlation": 266070 } }, { "ph": "s", "id": 266070, "pid": 76337, "tid": -914061504, "ts": 1716454225619463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225680811, "dur": 1277, "args": { "External id": 266071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266071, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266071, "pid": 5, "tid": 7, "ts": 1716454225680811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619470, "dur": 6, "args": { "External id": 266071, "cbid": 211, "correlation": 266071 } }, { "ph": "s", "id": 266071, "pid": 76337, "tid": -914061504, "ts": 1716454225619470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225682089, "dur": 13, "args": { "External id": 266073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266073, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266073, "pid": 5, "tid": 7, "ts": 1716454225682089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619480, "dur": 5, "args": { "External id": 266073, "cbid": 211, "correlation": 266073 } }, { "ph": "s", "id": 266073, "pid": 76337, "tid": -914061504, "ts": 1716454225619480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225682104, "dur": 15, "args": { "External id": 266079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266079, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266079, "pid": 5, "tid": 7, "ts": 1716454225682104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619509, "dur": 8, "args": { "External id": 266079, "cbid": 211, "correlation": 266079 } }, { "ph": "s", "id": 266079, "pid": 76337, "tid": -914061504, "ts": 1716454225619509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225682120, "dur": 3, "args": { "External id": 266087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266087, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 266087, "pid": 5, "tid": 7, "ts": 1716454225682120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619553, "dur": 9, "args": { "External id": 266087, "cbid": 211, "correlation": 266087 } }, { "ph": "s", "id": 266087, "pid": 76337, "tid": -914061504, "ts": 1716454225619553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225619617, "dur": 1, "args": { "External id": 266103, "cbid": 251, "correlation": 266103 } }, { "ph": "f", "id": 266103, "pid": 76337, "tid": -914061504, "ts": 1716454225619617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225619622, "dur": 0, "args": { "External id": 266105, "cbid": 251, "correlation": 266105 } }, { "ph": "f", "id": 266105, "pid": 76337, "tid": -914061504, "ts": 1716454225619622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225682125, "dur": 13, "args": { "External id": 266106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266106, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266106, "pid": 5, "tid": 7, "ts": 1716454225682125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619624, "dur": 11, "args": { "External id": 266106, "cbid": 211, "correlation": 266106 } }, { "ph": "s", "id": 266106, "pid": 76337, "tid": -914061504, "ts": 1716454225619624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225682140, "dur": 6, "args": { "External id": 266108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266108, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266108, "pid": 5, "tid": 7, "ts": 1716454225682140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619637, "dur": 5, "args": { "External id": 266108, "cbid": 211, "correlation": 266108 } }, { "ph": "s", "id": 266108, "pid": 76337, "tid": -914061504, "ts": 1716454225619637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225682147, "dur": 18, "args": { "External id": 266118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266118, "pid": 5, "tid": 7, "ts": 1716454225682147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619694, "dur": 13, "args": { "External id": 266118, "cbid": 211, "correlation": 266118 } }, { "ph": "s", "id": 266118, "pid": 76337, "tid": -914061504, "ts": 1716454225619694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225682166, "dur": 19, "args": { "External id": 266138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266138, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266138, "pid": 5, "tid": 7, "ts": 1716454225682166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619761, "dur": 10, "args": { "External id": 266138, "cbid": 211, "correlation": 266138 } }, { "ph": "s", "id": 266138, "pid": 76337, "tid": -914061504, "ts": 1716454225619761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225682186, "dur": 4, "args": { "External id": 266150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266150, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 266150, "pid": 5, "tid": 7, "ts": 1716454225682186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619782, "dur": 6, "args": { "External id": 266150, "cbid": 211, "correlation": 266150 } }, { "ph": "s", "id": 266150, "pid": 76337, "tid": -914061504, "ts": 1716454225619782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225682191, "dur": 17, "args": { "External id": 266153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266153, "pid": 5, "tid": 7, "ts": 1716454225682191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619800, "dur": 7, "args": { "External id": 266153, "cbid": 211, "correlation": 266153 } }, { "ph": "s", "id": 266153, "pid": 76337, "tid": -914061504, "ts": 1716454225619800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225682209, "dur": 12, "args": { "External id": 266162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266162, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266162, "pid": 5, "tid": 7, "ts": 1716454225682209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619840, "dur": 9, "args": { "External id": 266162, "cbid": 211, "correlation": 266162 } }, { "ph": "s", "id": 266162, "pid": 76337, "tid": -914061504, "ts": 1716454225619840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225619901, "dur": 0, "args": { "External id": 266172, "cbid": 317, "correlation": 266172 } }, { "ph": "f", "id": 266172, "pid": 76337, "tid": -914061504, "ts": 1716454225619901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225619902, "dur": 0, "args": { "External id": 266173, "cbid": 203, "correlation": 266173 } }, { "ph": "f", "id": 266173, "pid": 76337, "tid": -914061504, "ts": 1716454225619902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225619903, "dur": 0, "args": { "External id": 266174, "cbid": 205, "correlation": 266174 } }, { "ph": "f", "id": 266174, "pid": 76337, "tid": -914061504, "ts": 1716454225619903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225682223, "dur": 11, "args": { "External id": 266178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266178, "pid": 5, "tid": 7, "ts": 1716454225682223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619917, "dur": 12, "args": { "External id": 266178, "cbid": 211, "correlation": 266178 } }, { "ph": "s", "id": 266178, "pid": 76337, "tid": -914061504, "ts": 1716454225619917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225682235, "dur": 166, "args": { "External id": 266180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266180, "pid": 5, "tid": 7, "ts": 1716454225682235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619931, "dur": 5, "args": { "External id": 266180, "cbid": 211, "correlation": 266180 } }, { "ph": "s", "id": 266180, "pid": 76337, "tid": -914061504, "ts": 1716454225619931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225682403, "dur": 1, "args": { "External id": 266182, "device": 5, "context": 1, "stream": 7, "correlation": 266182, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 266182, "pid": 5, "tid": 7, "ts": 1716454225682403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225619942, "dur": 6, "args": { "External id": 266182, "cbid": 51, "correlation": 266182 } }, { "ph": "s", "id": 266182, "pid": 76337, "tid": -914061504, "ts": 1716454225619942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225682407, "dur": 659, "args": { "External id": 266183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266183, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266183, "pid": 5, "tid": 7, "ts": 1716454225682407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619949, "dur": 6, "args": { "External id": 266183, "cbid": 211, "correlation": 266183 } }, { "ph": "s", "id": 266183, "pid": 76337, "tid": -914061504, "ts": 1716454225619949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225683067, "dur": 12, "args": { "External id": 266185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266185, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266185, "pid": 5, "tid": 7, "ts": 1716454225683067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619960, "dur": 5, "args": { "External id": 266185, "cbid": 211, "correlation": 266185 } }, { "ph": "s", "id": 266185, "pid": 76337, "tid": -914061504, "ts": 1716454225619960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225683080, "dur": 15, "args": { "External id": 266191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266191, "pid": 5, "tid": 7, "ts": 1716454225683080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225619996, "dur": 10, "args": { "External id": 266191, "cbid": 211, "correlation": 266191 } }, { "ph": "s", "id": 266191, "pid": 76337, "tid": -914061504, "ts": 1716454225619996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225620056, "dur": 0, "args": { "External id": 266201, "cbid": 317, "correlation": 266201 } }, { "ph": "f", "id": 266201, "pid": 76337, "tid": -914061504, "ts": 1716454225620056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225620057, "dur": 0, "args": { "External id": 266202, "cbid": 203, "correlation": 266202 } }, { "ph": "f", "id": 266202, "pid": 76337, "tid": -914061504, "ts": 1716454225620057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225620058, "dur": 0, "args": { "External id": 266203, "cbid": 205, "correlation": 266203 } }, { "ph": "f", "id": 266203, "pid": 76337, "tid": -914061504, "ts": 1716454225620058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225683096, "dur": 21, "args": { "External id": 266207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266207, "pid": 5, "tid": 7, "ts": 1716454225683096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620071, "dur": 11, "args": { "External id": 266207, "cbid": 211, "correlation": 266207 } }, { "ph": "s", "id": 266207, "pid": 76337, "tid": -914061504, "ts": 1716454225620071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225683119, "dur": 4, "args": { "External id": 266209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266209, "pid": 5, "tid": 7, "ts": 1716454225683119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620086, "dur": 6, "args": { "External id": 266209, "cbid": 211, "correlation": 266209 } }, { "ph": "s", "id": 266209, "pid": 76337, "tid": -914061504, "ts": 1716454225620086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225620094, "dur": 0, "args": { "External id": 266210, "cbid": 51, "correlation": 266210 } }, { "ph": "s", "id": 266210, "pid": 76337, "tid": -914061504, "ts": 1716454225620094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225683124, "dur": 175, "args": { "External id": 266211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266211, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266211, "pid": 5, "tid": 7, "ts": 1716454225683124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620095, "dur": 5, "args": { "External id": 266211, "cbid": 211, "correlation": 266211 } }, { "ph": "s", "id": 266211, "pid": 76337, "tid": -914061504, "ts": 1716454225620095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225683301, "dur": 16, "args": { "External id": 266216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266216, "pid": 5, "tid": 7, "ts": 1716454225683301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620121, "dur": 8, "args": { "External id": 266216, "cbid": 211, "correlation": 266216 } }, { "ph": "s", "id": 266216, "pid": 76337, "tid": -914061504, "ts": 1716454225620121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225683318, "dur": 13, "args": { "External id": 266224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266224, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266224, "pid": 5, "tid": 7, "ts": 1716454225683318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620149, "dur": 8, "args": { "External id": 266224, "cbid": 211, "correlation": 266224 } }, { "ph": "s", "id": 266224, "pid": 76337, "tid": -914061504, "ts": 1716454225620149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225683332, "dur": 11, "args": { "External id": 266232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266232, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266232, "pid": 5, "tid": 7, "ts": 1716454225683332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620179, "dur": 8, "args": { "External id": 266232, "cbid": 211, "correlation": 266232 } }, { "ph": "s", "id": 266232, "pid": 76337, "tid": -914061504, "ts": 1716454225620179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225683344, "dur": 19, "args": { "External id": 266252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266252, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266252, "pid": 5, "tid": 7, "ts": 1716454225683344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620260, "dur": 13, "args": { "External id": 266252, "cbid": 211, "correlation": 266252 } }, { "ph": "s", "id": 266252, "pid": 76337, "tid": -914061504, "ts": 1716454225620260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225683365, "dur": 4, "args": { "External id": 266264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266264, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 266264, "pid": 5, "tid": 7, "ts": 1716454225683365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620282, "dur": 6, "args": { "External id": 266264, "cbid": 211, "correlation": 266264 } }, { "ph": "s", "id": 266264, "pid": 76337, "tid": -914061504, "ts": 1716454225620282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225683371, "dur": 18, "args": { "External id": 266267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266267, "pid": 5, "tid": 7, "ts": 1716454225683371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620299, "dur": 6, "args": { "External id": 266267, "cbid": 211, "correlation": 266267 } }, { "ph": "s", "id": 266267, "pid": 76337, "tid": -914061504, "ts": 1716454225620299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225620356, "dur": 0, "args": { "External id": 266278, "cbid": 317, "correlation": 266278 } }, { "ph": "f", "id": 266278, "pid": 76337, "tid": -914061504, "ts": 1716454225620356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225620357, "dur": 0, "args": { "External id": 266279, "cbid": 203, "correlation": 266279 } }, { "ph": "f", "id": 266279, "pid": 76337, "tid": -914061504, "ts": 1716454225620357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225620358, "dur": 0, "args": { "External id": 266280, "cbid": 205, "correlation": 266280 } }, { "ph": "f", "id": 266280, "pid": 76337, "tid": -914061504, "ts": 1716454225620358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225683390, "dur": 12, "args": { "External id": 266284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266284, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266284, "pid": 5, "tid": 7, "ts": 1716454225683390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620371, "dur": 11, "args": { "External id": 266284, "cbid": 211, "correlation": 266284 } }, { "ph": "s", "id": 266284, "pid": 76337, "tid": -914061504, "ts": 1716454225620371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225683402, "dur": 3, "args": { "External id": 266286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266286, "pid": 5, "tid": 7, "ts": 1716454225683402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620387, "dur": 5, "args": { "External id": 266286, "cbid": 211, "correlation": 266286 } }, { "ph": "s", "id": 266286, "pid": 76337, "tid": -914061504, "ts": 1716454225620387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225620395, "dur": 0, "args": { "External id": 266287, "cbid": 51, "correlation": 266287 } }, { "ph": "s", "id": 266287, "pid": 76337, "tid": -914061504, "ts": 1716454225620395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225683407, "dur": 93, "args": { "External id": 266288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266288, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266288, "pid": 5, "tid": 7, "ts": 1716454225683407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620396, "dur": 5, "args": { "External id": 266288, "cbid": 211, "correlation": 266288 } }, { "ph": "s", "id": 266288, "pid": 76337, "tid": -914061504, "ts": 1716454225620396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225683502, "dur": 16, "args": { "External id": 266293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266293, "pid": 5, "tid": 7, "ts": 1716454225683502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620423, "dur": 8, "args": { "External id": 266293, "cbid": 211, "correlation": 266293 } }, { "ph": "s", "id": 266293, "pid": 76337, "tid": -914061504, "ts": 1716454225620423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225683519, "dur": 85, "args": { "External id": 266302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266302, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266302, "pid": 5, "tid": 7, "ts": 1716454225683519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620502, "dur": 15, "args": { "External id": 266302, "cbid": 211, "correlation": 266302 } }, { "ph": "s", "id": 266302, "pid": 76337, "tid": -914061504, "ts": 1716454225620502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225683605, "dur": 31, "args": { "External id": 266324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266324, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266324, "pid": 5, "tid": 7, "ts": 1716454225683605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620559, "dur": 11, "args": { "External id": 266324, "cbid": 211, "correlation": 266324 } }, { "ph": "s", "id": 266324, "pid": 76337, "tid": -914061504, "ts": 1716454225620559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225620647, "dur": 1, "args": { "External id": 266335, "cbid": 251, "correlation": 266335 } }, { "ph": "f", "id": 266335, "pid": 76337, "tid": -914061504, "ts": 1716454225620647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225683637, "dur": 165, "args": { "External id": 266336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266336, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266336, "pid": 5, "tid": 7, "ts": 1716454225683637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620652, "dur": 14, "args": { "External id": 266336, "cbid": 211, "correlation": 266336 } }, { "ph": "s", "id": 266336, "pid": 76337, "tid": -914061504, "ts": 1716454225620652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225620723, "dur": 1, "args": { "External id": 266347, "cbid": 251, "correlation": 266347 } }, { "ph": "f", "id": 266347, "pid": 76337, "tid": -914061504, "ts": 1716454225620723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225683804, "dur": 162, "args": { "External id": 266348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266348, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266348, "pid": 5, "tid": 7, "ts": 1716454225683804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620727, "dur": 11, "args": { "External id": 266348, "cbid": 211, "correlation": 266348 } }, { "ph": "s", "id": 266348, "pid": 76337, "tid": -914061504, "ts": 1716454225620727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225620791, "dur": 1, "args": { "External id": 266359, "cbid": 251, "correlation": 266359 } }, { "ph": "f", "id": 266359, "pid": 76337, "tid": -914061504, "ts": 1716454225620791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225683967, "dur": 162, "args": { "External id": 266360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266360, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266360, "pid": 5, "tid": 7, "ts": 1716454225683967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620795, "dur": 11, "args": { "External id": 266360, "cbid": 211, "correlation": 266360 } }, { "ph": "s", "id": 266360, "pid": 76337, "tid": -914061504, "ts": 1716454225620795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225684131, "dur": 339, "args": { "External id": 266385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266385, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266385, "pid": 5, "tid": 7, "ts": 1716454225684131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620879, "dur": 13, "args": { "External id": 266385, "cbid": 211, "correlation": 266385 } }, { "ph": "s", "id": 266385, "pid": 76337, "tid": -914061504, "ts": 1716454225620879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225620985, "dur": 1, "args": { "External id": 266403, "cbid": 251, "correlation": 266403 } }, { "ph": "f", "id": 266403, "pid": 76337, "tid": -914061504, "ts": 1716454225620985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225684471, "dur": 169, "args": { "External id": 266405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266405, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266405, "pid": 5, "tid": 7, "ts": 1716454225684471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225620991, "dur": 14, "args": { "External id": 266405, "cbid": 211, "correlation": 266405 } }, { "ph": "s", "id": 266405, "pid": 76337, "tid": -914061504, "ts": 1716454225620991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225684641, "dur": 20, "args": { "External id": 266413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266413, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266413, "pid": 5, "tid": 7, "ts": 1716454225684641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621061, "dur": 13, "args": { "External id": 266413, "cbid": 211, "correlation": 266413 } }, { "ph": "s", "id": 266413, "pid": 76337, "tid": -914061504, "ts": 1716454225621061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225684662, "dur": 28, "args": { "External id": 266421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266421, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266421, "pid": 5, "tid": 7, "ts": 1716454225684662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621100, "dur": 8, "args": { "External id": 266421, "cbid": 211, "correlation": 266421 } }, { "ph": "s", "id": 266421, "pid": 76337, "tid": -914061504, "ts": 1716454225621100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225684691, "dur": 18, "args": { "External id": 266432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266432, "pid": 5, "tid": 7, "ts": 1716454225684691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621171, "dur": 12, "args": { "External id": 266432, "cbid": 211, "correlation": 266432 } }, { "ph": "s", "id": 266432, "pid": 76337, "tid": -914061504, "ts": 1716454225621171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225684711, "dur": 16, "args": { "External id": 266454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266454, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266454, "pid": 5, "tid": 7, "ts": 1716454225684711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621202, "dur": 7, "args": { "External id": 266454, "cbid": 211, "correlation": 266454 } }, { "ph": "s", "id": 266454, "pid": 76337, "tid": -914061504, "ts": 1716454225621202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621286, "dur": 1, "args": { "External id": 266465, "cbid": 251, "correlation": 266465 } }, { "ph": "f", "id": 266465, "pid": 76337, "tid": -914061504, "ts": 1716454225621286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225684728, "dur": 90, "args": { "External id": 266466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266466, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266466, "pid": 5, "tid": 7, "ts": 1716454225684728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621291, "dur": 13, "args": { "External id": 266466, "cbid": 211, "correlation": 266466 } }, { "ph": "s", "id": 266466, "pid": 76337, "tid": -914061504, "ts": 1716454225621291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621360, "dur": 1, "args": { "External id": 266477, "cbid": 251, "correlation": 266477 } }, { "ph": "f", "id": 266477, "pid": 76337, "tid": -914061504, "ts": 1716454225621360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621364, "dur": 0, "args": { "External id": 266478, "cbid": 251, "correlation": 266478 } }, { "ph": "f", "id": 266478, "pid": 76337, "tid": -914061504, "ts": 1716454225621364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225684820, "dur": 12, "args": { "External id": 266479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266479, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266479, "pid": 5, "tid": 7, "ts": 1716454225684820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621366, "dur": 12, "args": { "External id": 266479, "cbid": 211, "correlation": 266479 } }, { "ph": "s", "id": 266479, "pid": 76337, "tid": -914061504, "ts": 1716454225621366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225684833, "dur": 6, "args": { "External id": 266481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266481, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266481, "pid": 5, "tid": 7, "ts": 1716454225684833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621380, "dur": 6, "args": { "External id": 266481, "cbid": 211, "correlation": 266481 } }, { "ph": "s", "id": 266481, "pid": 76337, "tid": -914061504, "ts": 1716454225621380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621437, "dur": 1, "args": { "External id": 266492, "cbid": 251, "correlation": 266492 } }, { "ph": "f", "id": 266492, "pid": 76337, "tid": -914061504, "ts": 1716454225621437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621441, "dur": 0, "args": { "External id": 266493, "cbid": 251, "correlation": 266493 } }, { "ph": "f", "id": 266493, "pid": 76337, "tid": -914061504, "ts": 1716454225621441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225684840, "dur": 8, "args": { "External id": 266494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266494, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266494, "pid": 5, "tid": 7, "ts": 1716454225684840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621442, "dur": 12, "args": { "External id": 266494, "cbid": 211, "correlation": 266494 } }, { "ph": "s", "id": 266494, "pid": 76337, "tid": -914061504, "ts": 1716454225621442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225684850, "dur": 3, "args": { "External id": 266496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266496, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266496, "pid": 5, "tid": 7, "ts": 1716454225684850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621455, "dur": 5, "args": { "External id": 266496, "cbid": 211, "correlation": 266496 } }, { "ph": "s", "id": 266496, "pid": 76337, "tid": -914061504, "ts": 1716454225621455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225684854, "dur": 56, "args": { "External id": 266521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266521, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266521, "pid": 5, "tid": 7, "ts": 1716454225684854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621531, "dur": 12, "args": { "External id": 266521, "cbid": 211, "correlation": 266521 } }, { "ph": "s", "id": 266521, "pid": 76337, "tid": -914061504, "ts": 1716454225621531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621630, "dur": 1, "args": { "External id": 266539, "cbid": 251, "correlation": 266539 } }, { "ph": "f", "id": 266539, "pid": 76337, "tid": -914061504, "ts": 1716454225621630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225684912, "dur": 92, "args": { "External id": 266541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266541, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266541, "pid": 5, "tid": 7, "ts": 1716454225684912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621636, "dur": 13, "args": { "External id": 266541, "cbid": 211, "correlation": 266541 } }, { "ph": "s", "id": 266541, "pid": 76337, "tid": -914061504, "ts": 1716454225621636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225685005, "dur": 10, "args": { "External id": 266549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266549, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266549, "pid": 5, "tid": 7, "ts": 1716454225685005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621704, "dur": 12, "args": { "External id": 266549, "cbid": 211, "correlation": 266549 } }, { "ph": "s", "id": 266549, "pid": 76337, "tid": -914061504, "ts": 1716454225621704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225685016, "dur": 20, "args": { "External id": 266557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266557, "pid": 5, "tid": 7, "ts": 1716454225685016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621746, "dur": 9, "args": { "External id": 266557, "cbid": 211, "correlation": 266557 } }, { "ph": "s", "id": 266557, "pid": 76337, "tid": -914061504, "ts": 1716454225621746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225685038, "dur": 18, "args": { "External id": 266579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266579, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266579, "pid": 5, "tid": 7, "ts": 1716454225685038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621797, "dur": 10, "args": { "External id": 266579, "cbid": 211, "correlation": 266579 } }, { "ph": "s", "id": 266579, "pid": 76337, "tid": -914061504, "ts": 1716454225621797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621883, "dur": 1, "args": { "External id": 266595, "cbid": 251, "correlation": 266595 } }, { "ph": "f", "id": 266595, "pid": 76337, "tid": -914061504, "ts": 1716454225621883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225621888, "dur": 0, "args": { "External id": 266597, "cbid": 251, "correlation": 266597 } }, { "ph": "f", "id": 266597, "pid": 76337, "tid": -914061504, "ts": 1716454225621888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225685057, "dur": 498, "args": { "External id": 266598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266598, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266598, "pid": 5, "tid": 7, "ts": 1716454225685057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621890, "dur": 12, "args": { "External id": 266598, "cbid": 211, "correlation": 266598 } }, { "ph": "s", "id": 266598, "pid": 76337, "tid": -914061504, "ts": 1716454225621890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225685557, "dur": 67, "args": { "External id": 266606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266606, "pid": 5, "tid": 7, "ts": 1716454225685557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621954, "dur": 12, "args": { "External id": 266606, "cbid": 211, "correlation": 266606 } }, { "ph": "s", "id": 266606, "pid": 76337, "tid": -914061504, "ts": 1716454225621954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225685625, "dur": 65, "args": { "External id": 266614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266614, "pid": 5, "tid": 7, "ts": 1716454225685625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225621993, "dur": 9, "args": { "External id": 266614, "cbid": 211, "correlation": 266614 } }, { "ph": "s", "id": 266614, "pid": 76337, "tid": -914061504, "ts": 1716454225621993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225622077, "dur": 1, "args": { "External id": 266630, "cbid": 251, "correlation": 266630 } }, { "ph": "f", "id": 266630, "pid": 76337, "tid": -914061504, "ts": 1716454225622077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225685693, "dur": 1, "args": { "External id": 266632, "device": 5, "context": 1, "stream": 7, "correlation": 266632, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 266632, "pid": 5, "tid": 7, "ts": 1716454225685693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225622082, "dur": 9, "args": { "External id": 266632, "cbid": 51, "correlation": 266632 } }, { "ph": "s", "id": 266632, "pid": 76337, "tid": -914061504, "ts": 1716454225622082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225685697, "dur": 275, "args": { "External id": 266633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266633, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266633, "pid": 5, "tid": 7, "ts": 1716454225685697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622092, "dur": 11, "args": { "External id": 266633, "cbid": 211, "correlation": 266633 } }, { "ph": "s", "id": 266633, "pid": 76337, "tid": -914061504, "ts": 1716454225622092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225685973, "dur": 14, "args": { "External id": 266641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266641, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266641, "pid": 5, "tid": 7, "ts": 1716454225685973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622135, "dur": 10, "args": { "External id": 266641, "cbid": 211, "correlation": 266641 } }, { "ph": "s", "id": 266641, "pid": 76337, "tid": -914061504, "ts": 1716454225622135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225685989, "dur": 39, "args": { "External id": 266652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266652, "pid": 5, "tid": 7, "ts": 1716454225685989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622203, "dur": 13, "args": { "External id": 266652, "cbid": 211, "correlation": 266652 } }, { "ph": "s", "id": 266652, "pid": 76337, "tid": -914061504, "ts": 1716454225622203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225622267, "dur": 0, "args": { "External id": 266664, "cbid": 317, "correlation": 266664 } }, { "ph": "f", "id": 266664, "pid": 76337, "tid": -914061504, "ts": 1716454225622267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225622268, "dur": 0, "args": { "External id": 266665, "cbid": 203, "correlation": 266665 } }, { "ph": "f", "id": 266665, "pid": 76337, "tid": -914061504, "ts": 1716454225622268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225622269, "dur": 0, "args": { "External id": 266666, "cbid": 205, "correlation": 266666 } }, { "ph": "f", "id": 266666, "pid": 76337, "tid": -914061504, "ts": 1716454225622269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225686029, "dur": 12, "args": { "External id": 266670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266670, "pid": 5, "tid": 7, "ts": 1716454225686029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622284, "dur": 12, "args": { "External id": 266670, "cbid": 211, "correlation": 266670 } }, { "ph": "s", "id": 266670, "pid": 76337, "tid": -914061504, "ts": 1716454225622284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225686042, "dur": 4, "args": { "External id": 266672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266672, "pid": 5, "tid": 7, "ts": 1716454225686042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622301, "dur": 6, "args": { "External id": 266672, "cbid": 211, "correlation": 266672 } }, { "ph": "s", "id": 266672, "pid": 76337, "tid": -914061504, "ts": 1716454225622301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225622309, "dur": 0, "args": { "External id": 266673, "cbid": 51, "correlation": 266673 } }, { "ph": "s", "id": 266673, "pid": 76337, "tid": -914061504, "ts": 1716454225622309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225686048, "dur": 98, "args": { "External id": 266674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266674, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266674, "pid": 5, "tid": 7, "ts": 1716454225686048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622310, "dur": 5, "args": { "External id": 266674, "cbid": 211, "correlation": 266674 } }, { "ph": "s", "id": 266674, "pid": 76337, "tid": -914061504, "ts": 1716454225622310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225686147, "dur": 17, "args": { "External id": 266679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266679, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266679, "pid": 5, "tid": 7, "ts": 1716454225686147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622338, "dur": 9, "args": { "External id": 266679, "cbid": 211, "correlation": 266679 } }, { "ph": "s", "id": 266679, "pid": 76337, "tid": -914061504, "ts": 1716454225622338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225686165, "dur": 12, "args": { "External id": 266687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266687, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266687, "pid": 5, "tid": 7, "ts": 1716454225686165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622370, "dur": 9, "args": { "External id": 266687, "cbid": 211, "correlation": 266687 } }, { "ph": "s", "id": 266687, "pid": 76337, "tid": -914061504, "ts": 1716454225622370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225686178, "dur": 25, "args": { "External id": 266696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266696, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266696, "pid": 5, "tid": 7, "ts": 1716454225686178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622409, "dur": 10, "args": { "External id": 266696, "cbid": 211, "correlation": 266696 } }, { "ph": "s", "id": 266696, "pid": 76337, "tid": -914061504, "ts": 1716454225622409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225686204, "dur": 25, "args": { "External id": 266716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266716, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266716, "pid": 5, "tid": 7, "ts": 1716454225686204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622480, "dur": 12, "args": { "External id": 266716, "cbid": 211, "correlation": 266716 } }, { "ph": "s", "id": 266716, "pid": 76337, "tid": -914061504, "ts": 1716454225622480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225686230, "dur": 5, "args": { "External id": 266728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266728, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266728, "pid": 5, "tid": 7, "ts": 1716454225686230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622501, "dur": 7, "args": { "External id": 266728, "cbid": 211, "correlation": 266728 } }, { "ph": "s", "id": 266728, "pid": 76337, "tid": -914061504, "ts": 1716454225622501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225686236, "dur": 25, "args": { "External id": 266731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266731, "pid": 5, "tid": 7, "ts": 1716454225686236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622520, "dur": 7, "args": { "External id": 266731, "cbid": 211, "correlation": 266731 } }, { "ph": "s", "id": 266731, "pid": 76337, "tid": -914061504, "ts": 1716454225622520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225686263, "dur": 17, "args": { "External id": 266740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266740, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266740, "pid": 5, "tid": 7, "ts": 1716454225686263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622558, "dur": 9, "args": { "External id": 266740, "cbid": 211, "correlation": 266740 } }, { "ph": "s", "id": 266740, "pid": 76337, "tid": -914061504, "ts": 1716454225622558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225622610, "dur": 0, "args": { "External id": 266750, "cbid": 317, "correlation": 266750 } }, { "ph": "f", "id": 266750, "pid": 76337, "tid": -914061504, "ts": 1716454225622610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225622611, "dur": 0, "args": { "External id": 266751, "cbid": 203, "correlation": 266751 } }, { "ph": "f", "id": 266751, "pid": 76337, "tid": -914061504, "ts": 1716454225622611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225622612, "dur": 0, "args": { "External id": 266752, "cbid": 205, "correlation": 266752 } }, { "ph": "f", "id": 266752, "pid": 76337, "tid": -914061504, "ts": 1716454225622612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225686281, "dur": 17, "args": { "External id": 266756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266756, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266756, "pid": 5, "tid": 7, "ts": 1716454225686281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622626, "dur": 13, "args": { "External id": 266756, "cbid": 211, "correlation": 266756 } }, { "ph": "s", "id": 266756, "pid": 76337, "tid": -914061504, "ts": 1716454225622626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225686299, "dur": 246, "args": { "External id": 266758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266758, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266758, "pid": 5, "tid": 7, "ts": 1716454225686299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622641, "dur": 5, "args": { "External id": 266758, "cbid": 211, "correlation": 266758 } }, { "ph": "s", "id": 266758, "pid": 76337, "tid": -914061504, "ts": 1716454225622641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225686548, "dur": 1, "args": { "External id": 266760, "device": 5, "context": 1, "stream": 7, "correlation": 266760, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 266760, "pid": 5, "tid": 7, "ts": 1716454225686548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225622653, "dur": 8, "args": { "External id": 266760, "cbid": 51, "correlation": 266760 } }, { "ph": "s", "id": 266760, "pid": 76337, "tid": -914061504, "ts": 1716454225622653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225686551, "dur": 822, "args": { "External id": 266761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266761, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266761, "pid": 5, "tid": 7, "ts": 1716454225686551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622661, "dur": 6, "args": { "External id": 266761, "cbid": 211, "correlation": 266761 } }, { "ph": "s", "id": 266761, "pid": 76337, "tid": -914061504, "ts": 1716454225622661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225687375, "dur": 13, "args": { "External id": 266763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266763, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266763, "pid": 5, "tid": 7, "ts": 1716454225687375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622671, "dur": 5, "args": { "External id": 266763, "cbid": 211, "correlation": 266763 } }, { "ph": "s", "id": 266763, "pid": 76337, "tid": -914061504, "ts": 1716454225622671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225687390, "dur": 15, "args": { "External id": 266769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266769, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266769, "pid": 5, "tid": 7, "ts": 1716454225687390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622700, "dur": 8, "args": { "External id": 266769, "cbid": 211, "correlation": 266769 } }, { "ph": "s", "id": 266769, "pid": 76337, "tid": -914061504, "ts": 1716454225622700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225687406, "dur": 3, "args": { "External id": 266777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266777, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 266777, "pid": 5, "tid": 7, "ts": 1716454225687406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622744, "dur": 9, "args": { "External id": 266777, "cbid": 211, "correlation": 266777 } }, { "ph": "s", "id": 266777, "pid": 76337, "tid": -914061504, "ts": 1716454225622744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225622808, "dur": 1, "args": { "External id": 266793, "cbid": 251, "correlation": 266793 } }, { "ph": "f", "id": 266793, "pid": 76337, "tid": -914061504, "ts": 1716454225622808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225622813, "dur": 0, "args": { "External id": 266795, "cbid": 251, "correlation": 266795 } }, { "ph": "f", "id": 266795, "pid": 76337, "tid": -914061504, "ts": 1716454225622813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225687410, "dur": 13, "args": { "External id": 266796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266796, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266796, "pid": 5, "tid": 7, "ts": 1716454225687410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622815, "dur": 11, "args": { "External id": 266796, "cbid": 211, "correlation": 266796 } }, { "ph": "s", "id": 266796, "pid": 76337, "tid": -914061504, "ts": 1716454225622815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225687425, "dur": 5, "args": { "External id": 266798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266798, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266798, "pid": 5, "tid": 7, "ts": 1716454225687425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622828, "dur": 6, "args": { "External id": 266798, "cbid": 211, "correlation": 266798 } }, { "ph": "s", "id": 266798, "pid": 76337, "tid": -914061504, "ts": 1716454225622828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225687431, "dur": 17, "args": { "External id": 266808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266808, "pid": 5, "tid": 7, "ts": 1716454225687431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622887, "dur": 13, "args": { "External id": 266808, "cbid": 211, "correlation": 266808 } }, { "ph": "s", "id": 266808, "pid": 76337, "tid": -914061504, "ts": 1716454225622887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225687450, "dur": 18, "args": { "External id": 266828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266828, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266828, "pid": 5, "tid": 7, "ts": 1716454225687450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622952, "dur": 10, "args": { "External id": 266828, "cbid": 211, "correlation": 266828 } }, { "ph": "s", "id": 266828, "pid": 76337, "tid": -914061504, "ts": 1716454225622952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225687469, "dur": 4, "args": { "External id": 266840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266840, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 266840, "pid": 5, "tid": 7, "ts": 1716454225687469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622972, "dur": 15, "args": { "External id": 266840, "cbid": 211, "correlation": 266840 } }, { "ph": "s", "id": 266840, "pid": 76337, "tid": -914061504, "ts": 1716454225622972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225687475, "dur": 17, "args": { "External id": 266843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266843, "pid": 5, "tid": 7, "ts": 1716454225687475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225622999, "dur": 7, "args": { "External id": 266843, "cbid": 211, "correlation": 266843 } }, { "ph": "s", "id": 266843, "pid": 76337, "tid": -914061504, "ts": 1716454225622999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225687493, "dur": 11, "args": { "External id": 266852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266852, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266852, "pid": 5, "tid": 7, "ts": 1716454225687493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623042, "dur": 10, "args": { "External id": 266852, "cbid": 211, "correlation": 266852 } }, { "ph": "s", "id": 266852, "pid": 76337, "tid": -914061504, "ts": 1716454225623042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225623104, "dur": 0, "args": { "External id": 266862, "cbid": 317, "correlation": 266862 } }, { "ph": "f", "id": 266862, "pid": 76337, "tid": -914061504, "ts": 1716454225623104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225623105, "dur": 0, "args": { "External id": 266863, "cbid": 203, "correlation": 266863 } }, { "ph": "f", "id": 266863, "pid": 76337, "tid": -914061504, "ts": 1716454225623105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225623105, "dur": 0, "args": { "External id": 266864, "cbid": 205, "correlation": 266864 } }, { "ph": "f", "id": 266864, "pid": 76337, "tid": -914061504, "ts": 1716454225623105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225687506, "dur": 12, "args": { "External id": 266868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266868, "pid": 5, "tid": 7, "ts": 1716454225687506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623118, "dur": 12, "args": { "External id": 266868, "cbid": 211, "correlation": 266868 } }, { "ph": "s", "id": 266868, "pid": 76337, "tid": -914061504, "ts": 1716454225623118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225687518, "dur": 165, "args": { "External id": 266870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266870, "pid": 5, "tid": 7, "ts": 1716454225687518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623132, "dur": 6, "args": { "External id": 266870, "cbid": 211, "correlation": 266870 } }, { "ph": "s", "id": 266870, "pid": 76337, "tid": -914061504, "ts": 1716454225623132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225687686, "dur": 1, "args": { "External id": 266872, "device": 5, "context": 1, "stream": 7, "correlation": 266872, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 266872, "pid": 5, "tid": 7, "ts": 1716454225687686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225623145, "dur": 6, "args": { "External id": 266872, "cbid": 51, "correlation": 266872 } }, { "ph": "s", "id": 266872, "pid": 76337, "tid": -914061504, "ts": 1716454225623145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225687690, "dur": 660, "args": { "External id": 266873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266873, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 266873, "pid": 5, "tid": 7, "ts": 1716454225687690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623152, "dur": 6, "args": { "External id": 266873, "cbid": 211, "correlation": 266873 } }, { "ph": "s", "id": 266873, "pid": 76337, "tid": -914061504, "ts": 1716454225623152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225688351, "dur": 13, "args": { "External id": 266875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266875, "pid": 5, "tid": 7, "ts": 1716454225688351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623162, "dur": 5, "args": { "External id": 266875, "cbid": 211, "correlation": 266875 } }, { "ph": "s", "id": 266875, "pid": 76337, "tid": -914061504, "ts": 1716454225623162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225688365, "dur": 15, "args": { "External id": 266881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266881, "pid": 5, "tid": 7, "ts": 1716454225688365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623191, "dur": 8, "args": { "External id": 266881, "cbid": 211, "correlation": 266881 } }, { "ph": "s", "id": 266881, "pid": 76337, "tid": -914061504, "ts": 1716454225623191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225623249, "dur": 0, "args": { "External id": 266891, "cbid": 317, "correlation": 266891 } }, { "ph": "f", "id": 266891, "pid": 76337, "tid": -914061504, "ts": 1716454225623249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225623250, "dur": 0, "args": { "External id": 266892, "cbid": 203, "correlation": 266892 } }, { "ph": "f", "id": 266892, "pid": 76337, "tid": -914061504, "ts": 1716454225623250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225623250, "dur": 0, "args": { "External id": 266893, "cbid": 205, "correlation": 266893 } }, { "ph": "f", "id": 266893, "pid": 76337, "tid": -914061504, "ts": 1716454225623250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225688382, "dur": 17, "args": { "External id": 266897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266897, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266897, "pid": 5, "tid": 7, "ts": 1716454225688382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623264, "dur": 11, "args": { "External id": 266897, "cbid": 211, "correlation": 266897 } }, { "ph": "s", "id": 266897, "pid": 76337, "tid": -914061504, "ts": 1716454225623264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225688400, "dur": 4, "args": { "External id": 266899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266899, "pid": 5, "tid": 7, "ts": 1716454225688400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623280, "dur": 6, "args": { "External id": 266899, "cbid": 211, "correlation": 266899 } }, { "ph": "s", "id": 266899, "pid": 76337, "tid": -914061504, "ts": 1716454225623280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225623290, "dur": 0, "args": { "External id": 266900, "cbid": 51, "correlation": 266900 } }, { "ph": "s", "id": 266900, "pid": 76337, "tid": -914061504, "ts": 1716454225623290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225688405, "dur": 135, "args": { "External id": 266901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266901, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266901, "pid": 5, "tid": 7, "ts": 1716454225688405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623291, "dur": 5, "args": { "External id": 266901, "cbid": 211, "correlation": 266901 } }, { "ph": "s", "id": 266901, "pid": 76337, "tid": -914061504, "ts": 1716454225623291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225688541, "dur": 15, "args": { "External id": 266906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266906, "pid": 5, "tid": 7, "ts": 1716454225688541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623316, "dur": 8, "args": { "External id": 266906, "cbid": 211, "correlation": 266906 } }, { "ph": "s", "id": 266906, "pid": 76337, "tid": -914061504, "ts": 1716454225623316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225688558, "dur": 13, "args": { "External id": 266914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266914, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266914, "pid": 5, "tid": 7, "ts": 1716454225688558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623345, "dur": 8, "args": { "External id": 266914, "cbid": 211, "correlation": 266914 } }, { "ph": "s", "id": 266914, "pid": 76337, "tid": -914061504, "ts": 1716454225623345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225688572, "dur": 10, "args": { "External id": 266922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266922, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266922, "pid": 5, "tid": 7, "ts": 1716454225688572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623373, "dur": 8, "args": { "External id": 266922, "cbid": 211, "correlation": 266922 } }, { "ph": "s", "id": 266922, "pid": 76337, "tid": -914061504, "ts": 1716454225623373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225688584, "dur": 19, "args": { "External id": 266942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266942, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 266942, "pid": 5, "tid": 7, "ts": 1716454225688584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623456, "dur": 12, "args": { "External id": 266942, "cbid": 211, "correlation": 266942 } }, { "ph": "s", "id": 266942, "pid": 76337, "tid": -914061504, "ts": 1716454225623456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225688604, "dur": 5, "args": { "External id": 266954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266954, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 266954, "pid": 5, "tid": 7, "ts": 1716454225688604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623478, "dur": 6, "args": { "External id": 266954, "cbid": 211, "correlation": 266954 } }, { "ph": "s", "id": 266954, "pid": 76337, "tid": -914061504, "ts": 1716454225623478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225688610, "dur": 18, "args": { "External id": 266957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266957, "pid": 5, "tid": 7, "ts": 1716454225688610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623496, "dur": 7, "args": { "External id": 266957, "cbid": 211, "correlation": 266957 } }, { "ph": "s", "id": 266957, "pid": 76337, "tid": -914061504, "ts": 1716454225623496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225623553, "dur": 0, "args": { "External id": 266968, "cbid": 317, "correlation": 266968 } }, { "ph": "f", "id": 266968, "pid": 76337, "tid": -914061504, "ts": 1716454225623553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225623554, "dur": 0, "args": { "External id": 266969, "cbid": 203, "correlation": 266969 } }, { "ph": "f", "id": 266969, "pid": 76337, "tid": -914061504, "ts": 1716454225623554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225623555, "dur": 0, "args": { "External id": 266970, "cbid": 205, "correlation": 266970 } }, { "ph": "f", "id": 266970, "pid": 76337, "tid": -914061504, "ts": 1716454225623555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225688629, "dur": 12, "args": { "External id": 266974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266974, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266974, "pid": 5, "tid": 7, "ts": 1716454225688629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623568, "dur": 11, "args": { "External id": 266974, "cbid": 211, "correlation": 266974 } }, { "ph": "s", "id": 266974, "pid": 76337, "tid": -914061504, "ts": 1716454225623568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225688642, "dur": 3, "args": { "External id": 266976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 266976, "pid": 5, "tid": 7, "ts": 1716454225688642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623583, "dur": 6, "args": { "External id": 266976, "cbid": 211, "correlation": 266976 } }, { "ph": "s", "id": 266976, "pid": 76337, "tid": -914061504, "ts": 1716454225623583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225623593, "dur": 0, "args": { "External id": 266977, "cbid": 51, "correlation": 266977 } }, { "ph": "s", "id": 266977, "pid": 76337, "tid": -914061504, "ts": 1716454225623593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225688647, "dur": 92, "args": { "External id": 266978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266978, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 266978, "pid": 5, "tid": 7, "ts": 1716454225688647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623593, "dur": 5, "args": { "External id": 266978, "cbid": 211, "correlation": 266978 } }, { "ph": "s", "id": 266978, "pid": 76337, "tid": -914061504, "ts": 1716454225623593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225688740, "dur": 16, "args": { "External id": 266983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266983, "pid": 5, "tid": 7, "ts": 1716454225688740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623620, "dur": 8, "args": { "External id": 266983, "cbid": 211, "correlation": 266983 } }, { "ph": "s", "id": 266983, "pid": 76337, "tid": -914061504, "ts": 1716454225623620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225688757, "dur": 84, "args": { "External id": 266992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 266992, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 266992, "pid": 5, "tid": 7, "ts": 1716454225688757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623700, "dur": 14, "args": { "External id": 266992, "cbid": 211, "correlation": 266992 } }, { "ph": "s", "id": 266992, "pid": 76337, "tid": -914061504, "ts": 1716454225623700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225688843, "dur": 31, "args": { "External id": 267014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267014, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267014, "pid": 5, "tid": 7, "ts": 1716454225688843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623757, "dur": 10, "args": { "External id": 267014, "cbid": 211, "correlation": 267014 } }, { "ph": "s", "id": 267014, "pid": 76337, "tid": -914061504, "ts": 1716454225623757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225623846, "dur": 1, "args": { "External id": 267025, "cbid": 251, "correlation": 267025 } }, { "ph": "f", "id": 267025, "pid": 76337, "tid": -914061504, "ts": 1716454225623846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225688875, "dur": 167, "args": { "External id": 267026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267026, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267026, "pid": 5, "tid": 7, "ts": 1716454225688875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623852, "dur": 13, "args": { "External id": 267026, "cbid": 211, "correlation": 267026 } }, { "ph": "s", "id": 267026, "pid": 76337, "tid": -914061504, "ts": 1716454225623852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225623921, "dur": 1, "args": { "External id": 267037, "cbid": 251, "correlation": 267037 } }, { "ph": "f", "id": 267037, "pid": 76337, "tid": -914061504, "ts": 1716454225623921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225689044, "dur": 161, "args": { "External id": 267038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267038, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267038, "pid": 5, "tid": 7, "ts": 1716454225689044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225623925, "dur": 11, "args": { "External id": 267038, "cbid": 211, "correlation": 267038 } }, { "ph": "s", "id": 267038, "pid": 76337, "tid": -914061504, "ts": 1716454225623925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225623999, "dur": 1, "args": { "External id": 267049, "cbid": 251, "correlation": 267049 } }, { "ph": "f", "id": 267049, "pid": 76337, "tid": -914061504, "ts": 1716454225623999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225689206, "dur": 162, "args": { "External id": 267050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267050, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267050, "pid": 5, "tid": 7, "ts": 1716454225689206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624003, "dur": 12, "args": { "External id": 267050, "cbid": 211, "correlation": 267050 } }, { "ph": "s", "id": 267050, "pid": 76337, "tid": -914061504, "ts": 1716454225624003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225689369, "dur": 342, "args": { "External id": 267075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267075, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267075, "pid": 5, "tid": 7, "ts": 1716454225689369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624090, "dur": 13, "args": { "External id": 267075, "cbid": 211, "correlation": 267075 } }, { "ph": "s", "id": 267075, "pid": 76337, "tid": -914061504, "ts": 1716454225624090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624188, "dur": 1, "args": { "External id": 267093, "cbid": 251, "correlation": 267093 } }, { "ph": "f", "id": 267093, "pid": 76337, "tid": -914061504, "ts": 1716454225624188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225689713, "dur": 166, "args": { "External id": 267095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267095, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267095, "pid": 5, "tid": 7, "ts": 1716454225689713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624194, "dur": 13, "args": { "External id": 267095, "cbid": 211, "correlation": 267095 } }, { "ph": "s", "id": 267095, "pid": 76337, "tid": -914061504, "ts": 1716454225624194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225689881, "dur": 19, "args": { "External id": 267103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267103, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267103, "pid": 5, "tid": 7, "ts": 1716454225689881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624263, "dur": 13, "args": { "External id": 267103, "cbid": 211, "correlation": 267103 } }, { "ph": "s", "id": 267103, "pid": 76337, "tid": -914061504, "ts": 1716454225624263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225689902, "dur": 27, "args": { "External id": 267111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267111, "pid": 5, "tid": 7, "ts": 1716454225689902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624303, "dur": 9, "args": { "External id": 267111, "cbid": 211, "correlation": 267111 } }, { "ph": "s", "id": 267111, "pid": 76337, "tid": -914061504, "ts": 1716454225624303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225689930, "dur": 19, "args": { "External id": 267122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267122, "pid": 5, "tid": 7, "ts": 1716454225689930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624373, "dur": 12, "args": { "External id": 267122, "cbid": 211, "correlation": 267122 } }, { "ph": "s", "id": 267122, "pid": 76337, "tid": -914061504, "ts": 1716454225624373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225689951, "dur": 16, "args": { "External id": 267144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267144, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267144, "pid": 5, "tid": 7, "ts": 1716454225689951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624405, "dur": 7, "args": { "External id": 267144, "cbid": 211, "correlation": 267144 } }, { "ph": "s", "id": 267144, "pid": 76337, "tid": -914061504, "ts": 1716454225624405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624489, "dur": 1, "args": { "External id": 267155, "cbid": 251, "correlation": 267155 } }, { "ph": "f", "id": 267155, "pid": 76337, "tid": -914061504, "ts": 1716454225624489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225689968, "dur": 90, "args": { "External id": 267156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267156, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 267156, "pid": 5, "tid": 7, "ts": 1716454225689968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624495, "dur": 13, "args": { "External id": 267156, "cbid": 211, "correlation": 267156 } }, { "ph": "s", "id": 267156, "pid": 76337, "tid": -914061504, "ts": 1716454225624495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624563, "dur": 1, "args": { "External id": 267167, "cbid": 251, "correlation": 267167 } }, { "ph": "f", "id": 267167, "pid": 76337, "tid": -914061504, "ts": 1716454225624563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624567, "dur": 0, "args": { "External id": 267168, "cbid": 251, "correlation": 267168 } }, { "ph": "f", "id": 267168, "pid": 76337, "tid": -914061504, "ts": 1716454225624567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225690060, "dur": 12, "args": { "External id": 267169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267169, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267169, "pid": 5, "tid": 7, "ts": 1716454225690060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624569, "dur": 12, "args": { "External id": 267169, "cbid": 211, "correlation": 267169 } }, { "ph": "s", "id": 267169, "pid": 76337, "tid": -914061504, "ts": 1716454225624569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225690073, "dur": 5, "args": { "External id": 267171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267171, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267171, "pid": 5, "tid": 7, "ts": 1716454225690073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624582, "dur": 6, "args": { "External id": 267171, "cbid": 211, "correlation": 267171 } }, { "ph": "s", "id": 267171, "pid": 76337, "tid": -914061504, "ts": 1716454225624582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624639, "dur": 1, "args": { "External id": 267182, "cbid": 251, "correlation": 267182 } }, { "ph": "f", "id": 267182, "pid": 76337, "tid": -914061504, "ts": 1716454225624639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624643, "dur": 0, "args": { "External id": 267183, "cbid": 251, "correlation": 267183 } }, { "ph": "f", "id": 267183, "pid": 76337, "tid": -914061504, "ts": 1716454225624643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225690079, "dur": 8, "args": { "External id": 267184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267184, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267184, "pid": 5, "tid": 7, "ts": 1716454225690079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624644, "dur": 12, "args": { "External id": 267184, "cbid": 211, "correlation": 267184 } }, { "ph": "s", "id": 267184, "pid": 76337, "tid": -914061504, "ts": 1716454225624644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225690089, "dur": 3, "args": { "External id": 267186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267186, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267186, "pid": 5, "tid": 7, "ts": 1716454225690089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624658, "dur": 6, "args": { "External id": 267186, "cbid": 211, "correlation": 267186 } }, { "ph": "s", "id": 267186, "pid": 76337, "tid": -914061504, "ts": 1716454225624658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225690093, "dur": 56, "args": { "External id": 267211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267211, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267211, "pid": 5, "tid": 7, "ts": 1716454225690093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624735, "dur": 12, "args": { "External id": 267211, "cbid": 211, "correlation": 267211 } }, { "ph": "s", "id": 267211, "pid": 76337, "tid": -914061504, "ts": 1716454225624735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225624832, "dur": 1, "args": { "External id": 267229, "cbid": 251, "correlation": 267229 } }, { "ph": "f", "id": 267229, "pid": 76337, "tid": -914061504, "ts": 1716454225624832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225690151, "dur": 92, "args": { "External id": 267231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267231, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 267231, "pid": 5, "tid": 7, "ts": 1716454225690151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624838, "dur": 14, "args": { "External id": 267231, "cbid": 211, "correlation": 267231 } }, { "ph": "s", "id": 267231, "pid": 76337, "tid": -914061504, "ts": 1716454225624838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225690244, "dur": 10, "args": { "External id": 267239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267239, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267239, "pid": 5, "tid": 7, "ts": 1716454225690244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624908, "dur": 12, "args": { "External id": 267239, "cbid": 211, "correlation": 267239 } }, { "ph": "s", "id": 267239, "pid": 76337, "tid": -914061504, "ts": 1716454225624908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225690255, "dur": 21, "args": { "External id": 267247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267247, "pid": 5, "tid": 7, "ts": 1716454225690255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225624949, "dur": 10, "args": { "External id": 267247, "cbid": 211, "correlation": 267247 } }, { "ph": "s", "id": 267247, "pid": 76337, "tid": -914061504, "ts": 1716454225624949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225690278, "dur": 18, "args": { "External id": 267269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267269, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267269, "pid": 5, "tid": 7, "ts": 1716454225690278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625008, "dur": 10, "args": { "External id": 267269, "cbid": 211, "correlation": 267269 } }, { "ph": "s", "id": 267269, "pid": 76337, "tid": -914061504, "ts": 1716454225625008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225625095, "dur": 1, "args": { "External id": 267285, "cbid": 251, "correlation": 267285 } }, { "ph": "f", "id": 267285, "pid": 76337, "tid": -914061504, "ts": 1716454225625095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225625100, "dur": 0, "args": { "External id": 267287, "cbid": 251, "correlation": 267287 } }, { "ph": "f", "id": 267287, "pid": 76337, "tid": -914061504, "ts": 1716454225625100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225690297, "dur": 498, "args": { "External id": 267288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267288, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267288, "pid": 5, "tid": 7, "ts": 1716454225690297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625102, "dur": 13, "args": { "External id": 267288, "cbid": 211, "correlation": 267288 } }, { "ph": "s", "id": 267288, "pid": 76337, "tid": -914061504, "ts": 1716454225625102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225690797, "dur": 66, "args": { "External id": 267296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267296, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267296, "pid": 5, "tid": 7, "ts": 1716454225690797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625167, "dur": 12, "args": { "External id": 267296, "cbid": 211, "correlation": 267296 } }, { "ph": "s", "id": 267296, "pid": 76337, "tid": -914061504, "ts": 1716454225625167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225690864, "dur": 66, "args": { "External id": 267304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267304, "pid": 5, "tid": 7, "ts": 1716454225690864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625197, "dur": 9, "args": { "External id": 267304, "cbid": 211, "correlation": 267304 } }, { "ph": "s", "id": 267304, "pid": 76337, "tid": -914061504, "ts": 1716454225625197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225625277, "dur": 1, "args": { "External id": 267320, "cbid": 251, "correlation": 267320 } }, { "ph": "f", "id": 267320, "pid": 76337, "tid": -914061504, "ts": 1716454225625277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225690933, "dur": 1, "args": { "External id": 267322, "device": 5, "context": 1, "stream": 7, "correlation": 267322, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 267322, "pid": 5, "tid": 7, "ts": 1716454225690933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225625282, "dur": 9, "args": { "External id": 267322, "cbid": 51, "correlation": 267322 } }, { "ph": "s", "id": 267322, "pid": 76337, "tid": -914061504, "ts": 1716454225625282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225690937, "dur": 274, "args": { "External id": 267323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267323, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 267323, "pid": 5, "tid": 7, "ts": 1716454225690937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625292, "dur": 11, "args": { "External id": 267323, "cbid": 211, "correlation": 267323 } }, { "ph": "s", "id": 267323, "pid": 76337, "tid": -914061504, "ts": 1716454225625292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225691212, "dur": 13, "args": { "External id": 267331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267331, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267331, "pid": 5, "tid": 7, "ts": 1716454225691212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625335, "dur": 10, "args": { "External id": 267331, "cbid": 211, "correlation": 267331 } }, { "ph": "s", "id": 267331, "pid": 76337, "tid": -914061504, "ts": 1716454225625335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225691227, "dur": 38, "args": { "External id": 267342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267342, "pid": 5, "tid": 7, "ts": 1716454225691227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625402, "dur": 12, "args": { "External id": 267342, "cbid": 211, "correlation": 267342 } }, { "ph": "s", "id": 267342, "pid": 76337, "tid": -914061504, "ts": 1716454225625402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225625465, "dur": 0, "args": { "External id": 267354, "cbid": 317, "correlation": 267354 } }, { "ph": "f", "id": 267354, "pid": 76337, "tid": -914061504, "ts": 1716454225625465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225625466, "dur": 0, "args": { "External id": 267355, "cbid": 203, "correlation": 267355 } }, { "ph": "f", "id": 267355, "pid": 76337, "tid": -914061504, "ts": 1716454225625466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225625466, "dur": 0, "args": { "External id": 267356, "cbid": 205, "correlation": 267356 } }, { "ph": "f", "id": 267356, "pid": 76337, "tid": -914061504, "ts": 1716454225625466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225691266, "dur": 13, "args": { "External id": 267360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267360, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267360, "pid": 5, "tid": 7, "ts": 1716454225691266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625481, "dur": 12, "args": { "External id": 267360, "cbid": 211, "correlation": 267360 } }, { "ph": "s", "id": 267360, "pid": 76337, "tid": -914061504, "ts": 1716454225625481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225691280, "dur": 4, "args": { "External id": 267362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 267362, "pid": 5, "tid": 7, "ts": 1716454225691280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625497, "dur": 5, "args": { "External id": 267362, "cbid": 211, "correlation": 267362 } }, { "ph": "s", "id": 267362, "pid": 76337, "tid": -914061504, "ts": 1716454225625497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225625506, "dur": 0, "args": { "External id": 267363, "cbid": 51, "correlation": 267363 } }, { "ph": "s", "id": 267363, "pid": 76337, "tid": -914061504, "ts": 1716454225625506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225691286, "dur": 99, "args": { "External id": 267364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267364, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 267364, "pid": 5, "tid": 7, "ts": 1716454225691286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625506, "dur": 6, "args": { "External id": 267364, "cbid": 211, "correlation": 267364 } }, { "ph": "s", "id": 267364, "pid": 76337, "tid": -914061504, "ts": 1716454225625506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225691386, "dur": 16, "args": { "External id": 267369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267369, "pid": 5, "tid": 7, "ts": 1716454225691386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625534, "dur": 8, "args": { "External id": 267369, "cbid": 211, "correlation": 267369 } }, { "ph": "s", "id": 267369, "pid": 76337, "tid": -914061504, "ts": 1716454225625534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225691404, "dur": 12, "args": { "External id": 267377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267377, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267377, "pid": 5, "tid": 7, "ts": 1716454225691404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625565, "dur": 8, "args": { "External id": 267377, "cbid": 211, "correlation": 267377 } }, { "ph": "s", "id": 267377, "pid": 76337, "tid": -914061504, "ts": 1716454225625565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225691418, "dur": 58, "args": { "External id": 267388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267388, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267388, "pid": 5, "tid": 7, "ts": 1716454225691418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625626, "dur": 11, "args": { "External id": 267388, "cbid": 211, "correlation": 267388 } }, { "ph": "s", "id": 267388, "pid": 76337, "tid": -914061504, "ts": 1716454225625626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225625681, "dur": 0, "args": { "External id": 267398, "cbid": 317, "correlation": 267398 } }, { "ph": "f", "id": 267398, "pid": 76337, "tid": -914061504, "ts": 1716454225625681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225625682, "dur": 0, "args": { "External id": 267399, "cbid": 203, "correlation": 267399 } }, { "ph": "f", "id": 267399, "pid": 76337, "tid": -914061504, "ts": 1716454225625682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225625683, "dur": 0, "args": { "External id": 267400, "cbid": 205, "correlation": 267400 } }, { "ph": "f", "id": 267400, "pid": 76337, "tid": -914061504, "ts": 1716454225625683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225691477, "dur": 39, "args": { "External id": 267404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267404, "pid": 5, "tid": 7, "ts": 1716454225691477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625699, "dur": 12, "args": { "External id": 267404, "cbid": 211, "correlation": 267404 } }, { "ph": "s", "id": 267404, "pid": 76337, "tid": -914061504, "ts": 1716454225625699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225691518, "dur": 165, "args": { "External id": 267406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267406, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267406, "pid": 5, "tid": 7, "ts": 1716454225691518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625713, "dur": 6, "args": { "External id": 267406, "cbid": 211, "correlation": 267406 } }, { "ph": "s", "id": 267406, "pid": 76337, "tid": -914061504, "ts": 1716454225625713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225691684, "dur": 1962, "args": { "External id": 267408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267408, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267408, "pid": 5, "tid": 7, "ts": 1716454225691684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625726, "dur": 8, "args": { "External id": 267408, "cbid": 211, "correlation": 267408 } }, { "ph": "s", "id": 267408, "pid": 76337, "tid": -914061504, "ts": 1716454225625726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225693647, "dur": 40, "args": { "External id": 267410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267410, "pid": 5, "tid": 7, "ts": 1716454225693647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625737, "dur": 5, "args": { "External id": 267410, "cbid": 211, "correlation": 267410 } }, { "ph": "s", "id": 267410, "pid": 76337, "tid": -914061504, "ts": 1716454225625737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225693688, "dur": 60, "args": { "External id": 267416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267416, "pid": 5, "tid": 7, "ts": 1716454225693688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625765, "dur": 8, "args": { "External id": 267416, "cbid": 211, "correlation": 267416 } }, { "ph": "s", "id": 267416, "pid": 76337, "tid": -914061504, "ts": 1716454225625765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225693749, "dur": 86, "args": { "External id": 267425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267425, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267425, "pid": 5, "tid": 7, "ts": 1716454225693749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625855, "dur": 14, "args": { "External id": 267425, "cbid": 211, "correlation": 267425 } }, { "ph": "s", "id": 267425, "pid": 76337, "tid": -914061504, "ts": 1716454225625855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225693837, "dur": 74, "args": { "External id": 267445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267445, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 267445, "pid": 5, "tid": 7, "ts": 1716454225693837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625926, "dur": 11, "args": { "External id": 267445, "cbid": 211, "correlation": 267445 } }, { "ph": "s", "id": 267445, "pid": 76337, "tid": -914061504, "ts": 1716454225625926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225693912, "dur": 5, "args": { "External id": 267457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267457, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 267457, "pid": 5, "tid": 7, "ts": 1716454225693912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625947, "dur": 6, "args": { "External id": 267457, "cbid": 211, "correlation": 267457 } }, { "ph": "s", "id": 267457, "pid": 76337, "tid": -914061504, "ts": 1716454225625947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225693918, "dur": 81, "args": { "External id": 267460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267460, "pid": 5, "tid": 7, "ts": 1716454225693918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225625965, "dur": 16, "args": { "External id": 267460, "cbid": 211, "correlation": 267460 } }, { "ph": "s", "id": 267460, "pid": 76337, "tid": -914061504, "ts": 1716454225625965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225694000, "dur": 55, "args": { "External id": 267469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267469, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267469, "pid": 5, "tid": 7, "ts": 1716454225694000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626015, "dur": 11, "args": { "External id": 267469, "cbid": 211, "correlation": 267469 } }, { "ph": "s", "id": 267469, "pid": 76337, "tid": -914061504, "ts": 1716454225626015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225626069, "dur": 0, "args": { "External id": 267479, "cbid": 317, "correlation": 267479 } }, { "ph": "f", "id": 267479, "pid": 76337, "tid": -914061504, "ts": 1716454225626069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225626070, "dur": 0, "args": { "External id": 267480, "cbid": 203, "correlation": 267480 } }, { "ph": "f", "id": 267480, "pid": 76337, "tid": -914061504, "ts": 1716454225626070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225626071, "dur": 0, "args": { "External id": 267481, "cbid": 205, "correlation": 267481 } }, { "ph": "f", "id": 267481, "pid": 76337, "tid": -914061504, "ts": 1716454225626071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225694056, "dur": 56, "args": { "External id": 267485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267485, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267485, "pid": 5, "tid": 7, "ts": 1716454225694056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626087, "dur": 11, "args": { "External id": 267485, "cbid": 211, "correlation": 267485 } }, { "ph": "s", "id": 267485, "pid": 76337, "tid": -914061504, "ts": 1716454225626087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225694114, "dur": 124, "args": { "External id": 267487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267487, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267487, "pid": 5, "tid": 7, "ts": 1716454225694114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626101, "dur": 5, "args": { "External id": 267487, "cbid": 211, "correlation": 267487 } }, { "ph": "s", "id": 267487, "pid": 76337, "tid": -914061504, "ts": 1716454225626101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225694239, "dur": 1915, "args": { "External id": 267489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267489, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267489, "pid": 5, "tid": 7, "ts": 1716454225694239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626112, "dur": 6, "args": { "External id": 267489, "cbid": 211, "correlation": 267489 } }, { "ph": "s", "id": 267489, "pid": 76337, "tid": -914061504, "ts": 1716454225626112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225696156, "dur": 20, "args": { "External id": 267491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267491, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267491, "pid": 5, "tid": 7, "ts": 1716454225696156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626122, "dur": 5, "args": { "External id": 267491, "cbid": 211, "correlation": 267491 } }, { "ph": "s", "id": 267491, "pid": 76337, "tid": -914061504, "ts": 1716454225626122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225696177, "dur": 33, "args": { "External id": 267497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267497, "pid": 5, "tid": 7, "ts": 1716454225696177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626150, "dur": 8, "args": { "External id": 267497, "cbid": 211, "correlation": 267497 } }, { "ph": "s", "id": 267497, "pid": 76337, "tid": -914061504, "ts": 1716454225626150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225696212, "dur": 3, "args": { "External id": 267505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267505, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 267505, "pid": 5, "tid": 7, "ts": 1716454225696212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626194, "dur": 9, "args": { "External id": 267505, "cbid": 211, "correlation": 267505 } }, { "ph": "s", "id": 267505, "pid": 76337, "tid": -914061504, "ts": 1716454225626194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225626258, "dur": 1, "args": { "External id": 267521, "cbid": 251, "correlation": 267521 } }, { "ph": "f", "id": 267521, "pid": 76337, "tid": -914061504, "ts": 1716454225626258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225626263, "dur": 0, "args": { "External id": 267523, "cbid": 251, "correlation": 267523 } }, { "ph": "f", "id": 267523, "pid": 76337, "tid": -914061504, "ts": 1716454225626263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225696217, "dur": 12, "args": { "External id": 267524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267524, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 267524, "pid": 5, "tid": 7, "ts": 1716454225696217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626265, "dur": 12, "args": { "External id": 267524, "cbid": 211, "correlation": 267524 } }, { "ph": "s", "id": 267524, "pid": 76337, "tid": -914061504, "ts": 1716454225626265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225696230, "dur": 5, "args": { "External id": 267526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267526, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 267526, "pid": 5, "tid": 7, "ts": 1716454225696230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626279, "dur": 5, "args": { "External id": 267526, "cbid": 211, "correlation": 267526 } }, { "ph": "s", "id": 267526, "pid": 76337, "tid": -914061504, "ts": 1716454225626279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225696236, "dur": 29, "args": { "External id": 267536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267536, "pid": 5, "tid": 7, "ts": 1716454225696236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626335, "dur": 12, "args": { "External id": 267536, "cbid": 211, "correlation": 267536 } }, { "ph": "s", "id": 267536, "pid": 76337, "tid": -914061504, "ts": 1716454225626335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225696267, "dur": 31, "args": { "External id": 267556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267556, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 267556, "pid": 5, "tid": 7, "ts": 1716454225696267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626400, "dur": 10, "args": { "External id": 267556, "cbid": 211, "correlation": 267556 } }, { "ph": "s", "id": 267556, "pid": 76337, "tid": -914061504, "ts": 1716454225626400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225696300, "dur": 4, "args": { "External id": 267568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267568, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 267568, "pid": 5, "tid": 7, "ts": 1716454225696300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626420, "dur": 7, "args": { "External id": 267568, "cbid": 211, "correlation": 267568 } }, { "ph": "s", "id": 267568, "pid": 76337, "tid": -914061504, "ts": 1716454225626420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225696306, "dur": 30, "args": { "External id": 267571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267571, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267571, "pid": 5, "tid": 7, "ts": 1716454225696306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626438, "dur": 6, "args": { "External id": 267571, "cbid": 211, "correlation": 267571 } }, { "ph": "s", "id": 267571, "pid": 76337, "tid": -914061504, "ts": 1716454225626438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225696337, "dur": 20, "args": { "External id": 267580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267580, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267580, "pid": 5, "tid": 7, "ts": 1716454225696337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626478, "dur": 9, "args": { "External id": 267580, "cbid": 211, "correlation": 267580 } }, { "ph": "s", "id": 267580, "pid": 76337, "tid": -914061504, "ts": 1716454225626478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225626541, "dur": 0, "args": { "External id": 267590, "cbid": 317, "correlation": 267590 } }, { "ph": "f", "id": 267590, "pid": 76337, "tid": -914061504, "ts": 1716454225626541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225626542, "dur": 0, "args": { "External id": 267591, "cbid": 203, "correlation": 267591 } }, { "ph": "f", "id": 267591, "pid": 76337, "tid": -914061504, "ts": 1716454225626542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225626543, "dur": 0, "args": { "External id": 267592, "cbid": 205, "correlation": 267592 } }, { "ph": "f", "id": 267592, "pid": 76337, "tid": -914061504, "ts": 1716454225626543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225696359, "dur": 23, "args": { "External id": 267596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267596, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267596, "pid": 5, "tid": 7, "ts": 1716454225696359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626558, "dur": 12, "args": { "External id": 267596, "cbid": 211, "correlation": 267596 } }, { "ph": "s", "id": 267596, "pid": 76337, "tid": -914061504, "ts": 1716454225626558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225696384, "dur": 45, "args": { "External id": 267598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267598, "pid": 5, "tid": 7, "ts": 1716454225696384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626572, "dur": 6, "args": { "External id": 267598, "cbid": 211, "correlation": 267598 } }, { "ph": "s", "id": 267598, "pid": 76337, "tid": -914061504, "ts": 1716454225626572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225696430, "dur": 654, "args": { "External id": 267600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267600, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267600, "pid": 5, "tid": 7, "ts": 1716454225696430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626585, "dur": 6, "args": { "External id": 267600, "cbid": 211, "correlation": 267600 } }, { "ph": "s", "id": 267600, "pid": 76337, "tid": -914061504, "ts": 1716454225626585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225697086, "dur": 22, "args": { "External id": 267602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267602, "pid": 5, "tid": 7, "ts": 1716454225697086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626594, "dur": 5, "args": { "External id": 267602, "cbid": 211, "correlation": 267602 } }, { "ph": "s", "id": 267602, "pid": 76337, "tid": -914061504, "ts": 1716454225626594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225697109, "dur": 33, "args": { "External id": 267608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267608, "pid": 5, "tid": 7, "ts": 1716454225697109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626622, "dur": 9, "args": { "External id": 267608, "cbid": 211, "correlation": 267608 } }, { "ph": "s", "id": 267608, "pid": 76337, "tid": -914061504, "ts": 1716454225626622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225626680, "dur": 0, "args": { "External id": 267618, "cbid": 317, "correlation": 267618 } }, { "ph": "f", "id": 267618, "pid": 76337, "tid": -914061504, "ts": 1716454225626680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225626681, "dur": 0, "args": { "External id": 267619, "cbid": 203, "correlation": 267619 } }, { "ph": "f", "id": 267619, "pid": 76337, "tid": -914061504, "ts": 1716454225626681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225626682, "dur": 0, "args": { "External id": 267620, "cbid": 205, "correlation": 267620 } }, { "ph": "f", "id": 267620, "pid": 76337, "tid": -914061504, "ts": 1716454225626682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225697143, "dur": 57, "args": { "External id": 267624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267624, "pid": 5, "tid": 7, "ts": 1716454225697143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626694, "dur": 11, "args": { "External id": 267624, "cbid": 211, "correlation": 267624 } }, { "ph": "s", "id": 267624, "pid": 76337, "tid": -914061504, "ts": 1716454225626694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225697201, "dur": 274, "args": { "External id": 267626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267626, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267626, "pid": 5, "tid": 7, "ts": 1716454225697201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626712, "dur": 7, "args": { "External id": 267626, "cbid": 211, "correlation": 267626 } }, { "ph": "s", "id": 267626, "pid": 76337, "tid": -914061504, "ts": 1716454225626712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225697477, "dur": 21, "args": { "External id": 267628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267628, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267628, "pid": 5, "tid": 7, "ts": 1716454225697477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626722, "dur": 6, "args": { "External id": 267628, "cbid": 211, "correlation": 267628 } }, { "ph": "s", "id": 267628, "pid": 76337, "tid": -914061504, "ts": 1716454225626722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225697499, "dur": 34, "args": { "External id": 267634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267634, "pid": 5, "tid": 7, "ts": 1716454225697499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626749, "dur": 8, "args": { "External id": 267634, "cbid": 211, "correlation": 267634 } }, { "ph": "s", "id": 267634, "pid": 76337, "tid": -914061504, "ts": 1716454225626749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225697534, "dur": 26, "args": { "External id": 267642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267642, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267642, "pid": 5, "tid": 7, "ts": 1716454225697534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626778, "dur": 8, "args": { "External id": 267642, "cbid": 211, "correlation": 267642 } }, { "ph": "s", "id": 267642, "pid": 76337, "tid": -914061504, "ts": 1716454225626778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225697562, "dur": 20, "args": { "External id": 267650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267650, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267650, "pid": 5, "tid": 7, "ts": 1716454225697562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626807, "dur": 8, "args": { "External id": 267650, "cbid": 211, "correlation": 267650 } }, { "ph": "s", "id": 267650, "pid": 76337, "tid": -914061504, "ts": 1716454225626807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225697584, "dur": 31, "args": { "External id": 267670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267670, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 267670, "pid": 5, "tid": 7, "ts": 1716454225697584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626890, "dur": 12, "args": { "External id": 267670, "cbid": 211, "correlation": 267670 } }, { "ph": "s", "id": 267670, "pid": 76337, "tid": -914061504, "ts": 1716454225626890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225697616, "dur": 4, "args": { "External id": 267682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267682, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 267682, "pid": 5, "tid": 7, "ts": 1716454225697616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626912, "dur": 6, "args": { "External id": 267682, "cbid": 211, "correlation": 267682 } }, { "ph": "s", "id": 267682, "pid": 76337, "tid": -914061504, "ts": 1716454225626912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225697622, "dur": 31, "args": { "External id": 267685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267685, "pid": 5, "tid": 7, "ts": 1716454225697622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225626929, "dur": 7, "args": { "External id": 267685, "cbid": 211, "correlation": 267685 } }, { "ph": "s", "id": 267685, "pid": 76337, "tid": -914061504, "ts": 1716454225626929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225626997, "dur": 0, "args": { "External id": 267696, "cbid": 317, "correlation": 267696 } }, { "ph": "f", "id": 267696, "pid": 76337, "tid": -914061504, "ts": 1716454225626997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225626998, "dur": 0, "args": { "External id": 267697, "cbid": 203, "correlation": 267697 } }, { "ph": "f", "id": 267697, "pid": 76337, "tid": -914061504, "ts": 1716454225626998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225626999, "dur": 0, "args": { "External id": 267698, "cbid": 205, "correlation": 267698 } }, { "ph": "f", "id": 267698, "pid": 76337, "tid": -914061504, "ts": 1716454225626999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225697654, "dur": 22, "args": { "External id": 267702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267702, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267702, "pid": 5, "tid": 7, "ts": 1716454225697654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627013, "dur": 12, "args": { "External id": 267702, "cbid": 211, "correlation": 267702 } }, { "ph": "s", "id": 267702, "pid": 76337, "tid": -914061504, "ts": 1716454225627013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225697677, "dur": 107, "args": { "External id": 267704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267704, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267704, "pid": 5, "tid": 7, "ts": 1716454225697677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627031, "dur": 7, "args": { "External id": 267704, "cbid": 211, "correlation": 267704 } }, { "ph": "s", "id": 267704, "pid": 76337, "tid": -914061504, "ts": 1716454225627031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225697786, "dur": 22, "args": { "External id": 267706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267706, "pid": 5, "tid": 7, "ts": 1716454225697786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627042, "dur": 5, "args": { "External id": 267706, "cbid": 211, "correlation": 267706 } }, { "ph": "s", "id": 267706, "pid": 76337, "tid": -914061504, "ts": 1716454225627042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225697809, "dur": 33, "args": { "External id": 267712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267712, "pid": 5, "tid": 7, "ts": 1716454225697809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627069, "dur": 8, "args": { "External id": 267712, "cbid": 211, "correlation": 267712 } }, { "ph": "s", "id": 267712, "pid": 76337, "tid": -914061504, "ts": 1716454225627069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225697843, "dur": 186, "args": { "External id": 267721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267721, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267721, "pid": 5, "tid": 7, "ts": 1716454225697843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627151, "dur": 14, "args": { "External id": 267721, "cbid": 211, "correlation": 267721 } }, { "ph": "s", "id": 267721, "pid": 76337, "tid": -914061504, "ts": 1716454225627151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225698030, "dur": 65, "args": { "External id": 267743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267743, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267743, "pid": 5, "tid": 7, "ts": 1716454225698030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627208, "dur": 10, "args": { "External id": 267743, "cbid": 211, "correlation": 267743 } }, { "ph": "s", "id": 267743, "pid": 76337, "tid": -914061504, "ts": 1716454225627208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225627297, "dur": 1, "args": { "External id": 267754, "cbid": 251, "correlation": 267754 } }, { "ph": "f", "id": 267754, "pid": 76337, "tid": -914061504, "ts": 1716454225627297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225698097, "dur": 155, "args": { "External id": 267755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267755, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267755, "pid": 5, "tid": 7, "ts": 1716454225698097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627302, "dur": 13, "args": { "External id": 267755, "cbid": 211, "correlation": 267755 } }, { "ph": "s", "id": 267755, "pid": 76337, "tid": -914061504, "ts": 1716454225627302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225627373, "dur": 1, "args": { "External id": 267766, "cbid": 251, "correlation": 267766 } }, { "ph": "f", "id": 267766, "pid": 76337, "tid": -914061504, "ts": 1716454225627373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225698254, "dur": 148, "args": { "External id": 267767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267767, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267767, "pid": 5, "tid": 7, "ts": 1716454225698254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627377, "dur": 12, "args": { "External id": 267767, "cbid": 211, "correlation": 267767 } }, { "ph": "s", "id": 267767, "pid": 76337, "tid": -914061504, "ts": 1716454225627377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225627440, "dur": 1, "args": { "External id": 267778, "cbid": 251, "correlation": 267778 } }, { "ph": "f", "id": 267778, "pid": 76337, "tid": -914061504, "ts": 1716454225627440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225698403, "dur": 147, "args": { "External id": 267779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267779, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267779, "pid": 5, "tid": 7, "ts": 1716454225698403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627444, "dur": 11, "args": { "External id": 267779, "cbid": 211, "correlation": 267779 } }, { "ph": "s", "id": 267779, "pid": 76337, "tid": -914061504, "ts": 1716454225627444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225698551, "dur": 1984, "args": { "External id": 267800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267800, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 267800, "pid": 5, "tid": 7, "ts": 1716454225698551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627526, "dur": 13, "args": { "External id": 267800, "cbid": 211, "correlation": 267800 } }, { "ph": "s", "id": 267800, "pid": 76337, "tid": -914061504, "ts": 1716454225627526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225627623, "dur": 1, "args": { "External id": 267818, "cbid": 251, "correlation": 267818 } }, { "ph": "f", "id": 267818, "pid": 76337, "tid": -914061504, "ts": 1716454225627623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225700536, "dur": 150, "args": { "External id": 267820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267820, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 267820, "pid": 5, "tid": 7, "ts": 1716454225700536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627630, "dur": 14, "args": { "External id": 267820, "cbid": 211, "correlation": 267820 } }, { "ph": "s", "id": 267820, "pid": 76337, "tid": -914061504, "ts": 1716454225627630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225700688, "dur": 35, "args": { "External id": 267828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267828, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267828, "pid": 5, "tid": 7, "ts": 1716454225700688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627699, "dur": 12, "args": { "External id": 267828, "cbid": 211, "correlation": 267828 } }, { "ph": "s", "id": 267828, "pid": 76337, "tid": -914061504, "ts": 1716454225627699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225700725, "dur": 50, "args": { "External id": 267836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267836, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267836, "pid": 5, "tid": 7, "ts": 1716454225700725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627738, "dur": 8, "args": { "External id": 267836, "cbid": 211, "correlation": 267836 } }, { "ph": "s", "id": 267836, "pid": 76337, "tid": -914061504, "ts": 1716454225627738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225700776, "dur": 30, "args": { "External id": 267847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267847, "pid": 5, "tid": 7, "ts": 1716454225700776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627808, "dur": 13, "args": { "External id": 267847, "cbid": 211, "correlation": 267847 } }, { "ph": "s", "id": 267847, "pid": 76337, "tid": -914061504, "ts": 1716454225627808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225700808, "dur": 34, "args": { "External id": 267869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267869, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267869, "pid": 5, "tid": 7, "ts": 1716454225700808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627839, "dur": 7, "args": { "External id": 267869, "cbid": 211, "correlation": 267869 } }, { "ph": "s", "id": 267869, "pid": 76337, "tid": -914061504, "ts": 1716454225627839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225627922, "dur": 1, "args": { "External id": 267880, "cbid": 251, "correlation": 267880 } }, { "ph": "f", "id": 267880, "pid": 76337, "tid": -914061504, "ts": 1716454225627922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225700844, "dur": 92, "args": { "External id": 267881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267881, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267881, "pid": 5, "tid": 7, "ts": 1716454225700844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225627927, "dur": 14, "args": { "External id": 267881, "cbid": 211, "correlation": 267881 } }, { "ph": "s", "id": 267881, "pid": 76337, "tid": -914061504, "ts": 1716454225627927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628006, "dur": 1, "args": { "External id": 267892, "cbid": 251, "correlation": 267892 } }, { "ph": "f", "id": 267892, "pid": 76337, "tid": -914061504, "ts": 1716454225628006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628010, "dur": 0, "args": { "External id": 267893, "cbid": 251, "correlation": 267893 } }, { "ph": "f", "id": 267893, "pid": 76337, "tid": -914061504, "ts": 1716454225628010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225700937, "dur": 11, "args": { "External id": 267894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267894, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 267894, "pid": 5, "tid": 7, "ts": 1716454225700937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628011, "dur": 13, "args": { "External id": 267894, "cbid": 211, "correlation": 267894 } }, { "ph": "s", "id": 267894, "pid": 76337, "tid": -914061504, "ts": 1716454225628011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225700949, "dur": 5, "args": { "External id": 267896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267896, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 267896, "pid": 5, "tid": 7, "ts": 1716454225700949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628026, "dur": 6, "args": { "External id": 267896, "cbid": 211, "correlation": 267896 } }, { "ph": "s", "id": 267896, "pid": 76337, "tid": -914061504, "ts": 1716454225628026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628084, "dur": 1, "args": { "External id": 267907, "cbid": 251, "correlation": 267907 } }, { "ph": "f", "id": 267907, "pid": 76337, "tid": -914061504, "ts": 1716454225628084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628088, "dur": 0, "args": { "External id": 267908, "cbid": 251, "correlation": 267908 } }, { "ph": "f", "id": 267908, "pid": 76337, "tid": -914061504, "ts": 1716454225628088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225700956, "dur": 7, "args": { "External id": 267909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267909, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 267909, "pid": 5, "tid": 7, "ts": 1716454225700956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628089, "dur": 12, "args": { "External id": 267909, "cbid": 211, "correlation": 267909 } }, { "ph": "s", "id": 267909, "pid": 76337, "tid": -914061504, "ts": 1716454225628089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225700964, "dur": 4, "args": { "External id": 267911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267911, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 267911, "pid": 5, "tid": 7, "ts": 1716454225700964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628103, "dur": 5, "args": { "External id": 267911, "cbid": 211, "correlation": 267911 } }, { "ph": "s", "id": 267911, "pid": 76337, "tid": -914061504, "ts": 1716454225628103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225700969, "dur": 93, "args": { "External id": 267932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267932, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 267932, "pid": 5, "tid": 7, "ts": 1716454225700969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628176, "dur": 13, "args": { "External id": 267932, "cbid": 211, "correlation": 267932 } }, { "ph": "s", "id": 267932, "pid": 76337, "tid": -914061504, "ts": 1716454225628176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628272, "dur": 1, "args": { "External id": 267950, "cbid": 251, "correlation": 267950 } }, { "ph": "f", "id": 267950, "pid": 76337, "tid": -914061504, "ts": 1716454225628272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225701064, "dur": 98, "args": { "External id": 267952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267952, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 267952, "pid": 5, "tid": 7, "ts": 1716454225701064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628278, "dur": 14, "args": { "External id": 267952, "cbid": 211, "correlation": 267952 } }, { "ph": "s", "id": 267952, "pid": 76337, "tid": -914061504, "ts": 1716454225628278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225701163, "dur": 19, "args": { "External id": 267960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267960, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267960, "pid": 5, "tid": 7, "ts": 1716454225701163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628347, "dur": 12, "args": { "External id": 267960, "cbid": 211, "correlation": 267960 } }, { "ph": "s", "id": 267960, "pid": 76337, "tid": -914061504, "ts": 1716454225628347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225701184, "dur": 38, "args": { "External id": 267968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267968, "pid": 5, "tid": 7, "ts": 1716454225701184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628388, "dur": 10, "args": { "External id": 267968, "cbid": 211, "correlation": 267968 } }, { "ph": "s", "id": 267968, "pid": 76337, "tid": -914061504, "ts": 1716454225628388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225701223, "dur": 35, "args": { "External id": 267990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 267990, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 267990, "pid": 5, "tid": 7, "ts": 1716454225701223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628438, "dur": 10, "args": { "External id": 267990, "cbid": 211, "correlation": 267990 } }, { "ph": "s", "id": 267990, "pid": 76337, "tid": -914061504, "ts": 1716454225628438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628526, "dur": 1, "args": { "External id": 268006, "cbid": 251, "correlation": 268006 } }, { "ph": "f", "id": 268006, "pid": 76337, "tid": -914061504, "ts": 1716454225628526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628531, "dur": 0, "args": { "External id": 268008, "cbid": 251, "correlation": 268008 } }, { "ph": "f", "id": 268008, "pid": 76337, "tid": -914061504, "ts": 1716454225628531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225701260, "dur": 549, "args": { "External id": 268009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268009, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 268009, "pid": 5, "tid": 7, "ts": 1716454225701260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628536, "dur": 13, "args": { "External id": 268009, "cbid": 211, "correlation": 268009 } }, { "ph": "s", "id": 268009, "pid": 76337, "tid": -914061504, "ts": 1716454225628536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225701811, "dur": 128, "args": { "External id": 268017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268017, "pid": 5, "tid": 7, "ts": 1716454225701811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628602, "dur": 12, "args": { "External id": 268017, "cbid": 211, "correlation": 268017 } }, { "ph": "s", "id": 268017, "pid": 76337, "tid": -914061504, "ts": 1716454225628602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225701940, "dur": 130, "args": { "External id": 268025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268025, "pid": 5, "tid": 7, "ts": 1716454225701940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628632, "dur": 9, "args": { "External id": 268025, "cbid": 211, "correlation": 268025 } }, { "ph": "s", "id": 268025, "pid": 76337, "tid": -914061504, "ts": 1716454225628632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225628709, "dur": 1, "args": { "External id": 268041, "cbid": 251, "correlation": 268041 } }, { "ph": "f", "id": 268041, "pid": 76337, "tid": -914061504, "ts": 1716454225628709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225702071, "dur": 309, "args": { "External id": 268043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268043, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268043, "pid": 5, "tid": 7, "ts": 1716454225702071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628715, "dur": 12, "args": { "External id": 268043, "cbid": 211, "correlation": 268043 } }, { "ph": "s", "id": 268043, "pid": 76337, "tid": -914061504, "ts": 1716454225628715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225702381, "dur": 27, "args": { "External id": 268051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268051, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268051, "pid": 5, "tid": 7, "ts": 1716454225702381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628757, "dur": 10, "args": { "External id": 268051, "cbid": 211, "correlation": 268051 } }, { "ph": "s", "id": 268051, "pid": 76337, "tid": -914061504, "ts": 1716454225628757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225702410, "dur": 83, "args": { "External id": 268062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268062, "pid": 5, "tid": 7, "ts": 1716454225702410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628825, "dur": 12, "args": { "External id": 268062, "cbid": 211, "correlation": 268062 } }, { "ph": "s", "id": 268062, "pid": 76337, "tid": -914061504, "ts": 1716454225628825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225628887, "dur": 0, "args": { "External id": 268074, "cbid": 317, "correlation": 268074 } }, { "ph": "f", "id": 268074, "pid": 76337, "tid": -914061504, "ts": 1716454225628887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225628888, "dur": 0, "args": { "External id": 268075, "cbid": 203, "correlation": 268075 } }, { "ph": "f", "id": 268075, "pid": 76337, "tid": -914061504, "ts": 1716454225628888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225628889, "dur": 0, "args": { "External id": 268076, "cbid": 205, "correlation": 268076 } }, { "ph": "f", "id": 268076, "pid": 76337, "tid": -914061504, "ts": 1716454225628889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225702493, "dur": 22, "args": { "External id": 268080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268080, "pid": 5, "tid": 7, "ts": 1716454225702493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628905, "dur": 12, "args": { "External id": 268080, "cbid": 211, "correlation": 268080 } }, { "ph": "s", "id": 268080, "pid": 76337, "tid": -914061504, "ts": 1716454225628905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225702517, "dur": 122, "args": { "External id": 268082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268082, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268082, "pid": 5, "tid": 7, "ts": 1716454225702517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628922, "dur": 6, "args": { "External id": 268082, "cbid": 211, "correlation": 268082 } }, { "ph": "s", "id": 268082, "pid": 76337, "tid": -914061504, "ts": 1716454225628922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225702640, "dur": 22, "args": { "External id": 268084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268084, "pid": 5, "tid": 7, "ts": 1716454225702640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628933, "dur": 5, "args": { "External id": 268084, "cbid": 211, "correlation": 268084 } }, { "ph": "s", "id": 268084, "pid": 76337, "tid": -914061504, "ts": 1716454225628933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225702664, "dur": 33, "args": { "External id": 268090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268090, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268090, "pid": 5, "tid": 7, "ts": 1716454225702664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225628961, "dur": 9, "args": { "External id": 268090, "cbid": 211, "correlation": 268090 } }, { "ph": "s", "id": 268090, "pid": 76337, "tid": -914061504, "ts": 1716454225628961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225702699, "dur": 27, "args": { "External id": 268098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268098, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268098, "pid": 5, "tid": 7, "ts": 1716454225702699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629003, "dur": 9, "args": { "External id": 268098, "cbid": 211, "correlation": 268098 } }, { "ph": "s", "id": 268098, "pid": 76337, "tid": -914061504, "ts": 1716454225629003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225702727, "dur": 54, "args": { "External id": 268107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268107, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268107, "pid": 5, "tid": 7, "ts": 1716454225702727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629042, "dur": 11, "args": { "External id": 268107, "cbid": 211, "correlation": 268107 } }, { "ph": "s", "id": 268107, "pid": 76337, "tid": -914061504, "ts": 1716454225629042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225702782, "dur": 52, "args": { "External id": 268127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268127, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 268127, "pid": 5, "tid": 7, "ts": 1716454225702782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629112, "dur": 11, "args": { "External id": 268127, "cbid": 211, "correlation": 268127 } }, { "ph": "s", "id": 268127, "pid": 76337, "tid": -914061504, "ts": 1716454225629112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225702836, "dur": 5, "args": { "External id": 268139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268139, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 268139, "pid": 5, "tid": 7, "ts": 1716454225702836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629133, "dur": 6, "args": { "External id": 268139, "cbid": 211, "correlation": 268139 } }, { "ph": "s", "id": 268139, "pid": 76337, "tid": -914061504, "ts": 1716454225629133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225702842, "dur": 58, "args": { "External id": 268142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268142, "pid": 5, "tid": 7, "ts": 1716454225702842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629152, "dur": 6, "args": { "External id": 268142, "cbid": 211, "correlation": 268142 } }, { "ph": "s", "id": 268142, "pid": 76337, "tid": -914061504, "ts": 1716454225629152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225702902, "dur": 37, "args": { "External id": 268151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268151, "pid": 5, "tid": 7, "ts": 1716454225702902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629190, "dur": 11, "args": { "External id": 268151, "cbid": 211, "correlation": 268151 } }, { "ph": "s", "id": 268151, "pid": 76337, "tid": -914061504, "ts": 1716454225629190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225629242, "dur": 0, "args": { "External id": 268161, "cbid": 317, "correlation": 268161 } }, { "ph": "f", "id": 268161, "pid": 76337, "tid": -914061504, "ts": 1716454225629242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225629243, "dur": 0, "args": { "External id": 268162, "cbid": 203, "correlation": 268162 } }, { "ph": "f", "id": 268162, "pid": 76337, "tid": -914061504, "ts": 1716454225629243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225629243, "dur": 0, "args": { "External id": 268163, "cbid": 205, "correlation": 268163 } }, { "ph": "f", "id": 268163, "pid": 76337, "tid": -914061504, "ts": 1716454225629243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225702941, "dur": 40, "args": { "External id": 268167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268167, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268167, "pid": 5, "tid": 7, "ts": 1716454225702941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629259, "dur": 11, "args": { "External id": 268167, "cbid": 211, "correlation": 268167 } }, { "ph": "s", "id": 268167, "pid": 76337, "tid": -914061504, "ts": 1716454225629259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225702982, "dur": 84, "args": { "External id": 268169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268169, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268169, "pid": 5, "tid": 7, "ts": 1716454225702982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629273, "dur": 5, "args": { "External id": 268169, "cbid": 211, "correlation": 268169 } }, { "ph": "s", "id": 268169, "pid": 76337, "tid": -914061504, "ts": 1716454225629273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225703067, "dur": 1297, "args": { "External id": 268171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268171, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268171, "pid": 5, "tid": 7, "ts": 1716454225703067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629284, "dur": 6, "args": { "External id": 268171, "cbid": 211, "correlation": 268171 } }, { "ph": "s", "id": 268171, "pid": 76337, "tid": -914061504, "ts": 1716454225629284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225704366, "dur": 22, "args": { "External id": 268173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268173, "pid": 5, "tid": 7, "ts": 1716454225704366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629294, "dur": 5, "args": { "External id": 268173, "cbid": 211, "correlation": 268173 } }, { "ph": "s", "id": 268173, "pid": 76337, "tid": -914061504, "ts": 1716454225629294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225704390, "dur": 33, "args": { "External id": 268179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268179, "pid": 5, "tid": 7, "ts": 1716454225704390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629322, "dur": 8, "args": { "External id": 268179, "cbid": 211, "correlation": 268179 } }, { "ph": "s", "id": 268179, "pid": 76337, "tid": -914061504, "ts": 1716454225629322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225704424, "dur": 3, "args": { "External id": 268187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268187, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 268187, "pid": 5, "tid": 7, "ts": 1716454225704424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629365, "dur": 9, "args": { "External id": 268187, "cbid": 211, "correlation": 268187 } }, { "ph": "s", "id": 268187, "pid": 76337, "tid": -914061504, "ts": 1716454225629365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225629429, "dur": 1, "args": { "External id": 268203, "cbid": 251, "correlation": 268203 } }, { "ph": "f", "id": 268203, "pid": 76337, "tid": -914061504, "ts": 1716454225629429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225629434, "dur": 0, "args": { "External id": 268205, "cbid": 251, "correlation": 268205 } }, { "ph": "f", "id": 268205, "pid": 76337, "tid": -914061504, "ts": 1716454225629434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225704429, "dur": 13, "args": { "External id": 268206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268206, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 268206, "pid": 5, "tid": 7, "ts": 1716454225704429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629436, "dur": 11, "args": { "External id": 268206, "cbid": 211, "correlation": 268206 } }, { "ph": "s", "id": 268206, "pid": 76337, "tid": -914061504, "ts": 1716454225629436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225704443, "dur": 5, "args": { "External id": 268208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268208, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 268208, "pid": 5, "tid": 7, "ts": 1716454225704443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629449, "dur": 6, "args": { "External id": 268208, "cbid": 211, "correlation": 268208 } }, { "ph": "s", "id": 268208, "pid": 76337, "tid": -914061504, "ts": 1716454225629449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225704449, "dur": 30, "args": { "External id": 268218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268218, "pid": 5, "tid": 7, "ts": 1716454225704449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629508, "dur": 12, "args": { "External id": 268218, "cbid": 211, "correlation": 268218 } }, { "ph": "s", "id": 268218, "pid": 76337, "tid": -914061504, "ts": 1716454225629508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225704480, "dur": 32, "args": { "External id": 268238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268238, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 268238, "pid": 5, "tid": 7, "ts": 1716454225704480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629573, "dur": 12, "args": { "External id": 268238, "cbid": 211, "correlation": 268238 } }, { "ph": "s", "id": 268238, "pid": 76337, "tid": -914061504, "ts": 1716454225629573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225704513, "dur": 4, "args": { "External id": 268250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268250, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 268250, "pid": 5, "tid": 7, "ts": 1716454225704513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629595, "dur": 6, "args": { "External id": 268250, "cbid": 211, "correlation": 268250 } }, { "ph": "s", "id": 268250, "pid": 76337, "tid": -914061504, "ts": 1716454225629595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225704519, "dur": 30, "args": { "External id": 268253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268253, "pid": 5, "tid": 7, "ts": 1716454225704519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629615, "dur": 6, "args": { "External id": 268253, "cbid": 211, "correlation": 268253 } }, { "ph": "s", "id": 268253, "pid": 76337, "tid": -914061504, "ts": 1716454225629615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225704550, "dur": 20, "args": { "External id": 268262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268262, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268262, "pid": 5, "tid": 7, "ts": 1716454225704550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629654, "dur": 10, "args": { "External id": 268262, "cbid": 211, "correlation": 268262 } }, { "ph": "s", "id": 268262, "pid": 76337, "tid": -914061504, "ts": 1716454225629654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225629716, "dur": 0, "args": { "External id": 268272, "cbid": 317, "correlation": 268272 } }, { "ph": "f", "id": 268272, "pid": 76337, "tid": -914061504, "ts": 1716454225629716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225629717, "dur": 0, "args": { "External id": 268273, "cbid": 203, "correlation": 268273 } }, { "ph": "f", "id": 268273, "pid": 76337, "tid": -914061504, "ts": 1716454225629717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225629717, "dur": 0, "args": { "External id": 268274, "cbid": 205, "correlation": 268274 } }, { "ph": "f", "id": 268274, "pid": 76337, "tid": -914061504, "ts": 1716454225629717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225704571, "dur": 22, "args": { "External id": 268278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268278, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268278, "pid": 5, "tid": 7, "ts": 1716454225704571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629731, "dur": 12, "args": { "External id": 268278, "cbid": 211, "correlation": 268278 } }, { "ph": "s", "id": 268278, "pid": 76337, "tid": -914061504, "ts": 1716454225629731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225704595, "dur": 45, "args": { "External id": 268280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268280, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268280, "pid": 5, "tid": 7, "ts": 1716454225704595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629746, "dur": 5, "args": { "External id": 268280, "cbid": 211, "correlation": 268280 } }, { "ph": "s", "id": 268280, "pid": 76337, "tid": -914061504, "ts": 1716454225629746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225704641, "dur": 652, "args": { "External id": 268282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268282, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268282, "pid": 5, "tid": 7, "ts": 1716454225704641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629757, "dur": 6, "args": { "External id": 268282, "cbid": 211, "correlation": 268282 } }, { "ph": "s", "id": 268282, "pid": 76337, "tid": -914061504, "ts": 1716454225629757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225705294, "dur": 22, "args": { "External id": 268284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268284, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268284, "pid": 5, "tid": 7, "ts": 1716454225705294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629766, "dur": 5, "args": { "External id": 268284, "cbid": 211, "correlation": 268284 } }, { "ph": "s", "id": 268284, "pid": 76337, "tid": -914061504, "ts": 1716454225629766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225705317, "dur": 33, "args": { "External id": 268290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268290, "pid": 5, "tid": 7, "ts": 1716454225705317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629794, "dur": 9, "args": { "External id": 268290, "cbid": 211, "correlation": 268290 } }, { "ph": "s", "id": 268290, "pid": 76337, "tid": -914061504, "ts": 1716454225629794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225629851, "dur": 0, "args": { "External id": 268300, "cbid": 317, "correlation": 268300 } }, { "ph": "f", "id": 268300, "pid": 76337, "tid": -914061504, "ts": 1716454225629851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225629852, "dur": 0, "args": { "External id": 268301, "cbid": 203, "correlation": 268301 } }, { "ph": "f", "id": 268301, "pid": 76337, "tid": -914061504, "ts": 1716454225629852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225629852, "dur": 0, "args": { "External id": 268302, "cbid": 205, "correlation": 268302 } }, { "ph": "f", "id": 268302, "pid": 76337, "tid": -914061504, "ts": 1716454225629852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225705352, "dur": 39, "args": { "External id": 268306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268306, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268306, "pid": 5, "tid": 7, "ts": 1716454225705352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629866, "dur": 12, "args": { "External id": 268306, "cbid": 211, "correlation": 268306 } }, { "ph": "s", "id": 268306, "pid": 76337, "tid": -914061504, "ts": 1716454225629866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225705392, "dur": 193, "args": { "External id": 268308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268308, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268308, "pid": 5, "tid": 7, "ts": 1716454225705392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629883, "dur": 6, "args": { "External id": 268308, "cbid": 211, "correlation": 268308 } }, { "ph": "s", "id": 268308, "pid": 76337, "tid": -914061504, "ts": 1716454225629883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225705586, "dur": 23, "args": { "External id": 268310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268310, "pid": 5, "tid": 7, "ts": 1716454225705586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629892, "dur": 5, "args": { "External id": 268310, "cbid": 211, "correlation": 268310 } }, { "ph": "s", "id": 268310, "pid": 76337, "tid": -914061504, "ts": 1716454225629892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225705611, "dur": 33, "args": { "External id": 268316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268316, "pid": 5, "tid": 7, "ts": 1716454225705611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629918, "dur": 8, "args": { "External id": 268316, "cbid": 211, "correlation": 268316 } }, { "ph": "s", "id": 268316, "pid": 76337, "tid": -914061504, "ts": 1716454225629918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225705645, "dur": 27, "args": { "External id": 268324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268324, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268324, "pid": 5, "tid": 7, "ts": 1716454225705645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629946, "dur": 8, "args": { "External id": 268324, "cbid": 211, "correlation": 268324 } }, { "ph": "s", "id": 268324, "pid": 76337, "tid": -914061504, "ts": 1716454225629946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225705673, "dur": 20, "args": { "External id": 268332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268332, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268332, "pid": 5, "tid": 7, "ts": 1716454225705673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225629983, "dur": 9, "args": { "External id": 268332, "cbid": 211, "correlation": 268332 } }, { "ph": "s", "id": 268332, "pid": 76337, "tid": -914061504, "ts": 1716454225629983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225705695, "dur": 31, "args": { "External id": 268352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268352, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 268352, "pid": 5, "tid": 7, "ts": 1716454225705695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630068, "dur": 12, "args": { "External id": 268352, "cbid": 211, "correlation": 268352 } }, { "ph": "s", "id": 268352, "pid": 76337, "tid": -914061504, "ts": 1716454225630068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225705727, "dur": 4, "args": { "External id": 268364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268364, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 268364, "pid": 5, "tid": 7, "ts": 1716454225705727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630090, "dur": 6, "args": { "External id": 268364, "cbid": 211, "correlation": 268364 } }, { "ph": "s", "id": 268364, "pid": 76337, "tid": -914061504, "ts": 1716454225630090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225705732, "dur": 30, "args": { "External id": 268367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268367, "pid": 5, "tid": 7, "ts": 1716454225705732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630108, "dur": 7, "args": { "External id": 268367, "cbid": 211, "correlation": 268367 } }, { "ph": "s", "id": 268367, "pid": 76337, "tid": -914061504, "ts": 1716454225630108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225630164, "dur": 0, "args": { "External id": 268378, "cbid": 317, "correlation": 268378 } }, { "ph": "f", "id": 268378, "pid": 76337, "tid": -914061504, "ts": 1716454225630164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225630165, "dur": 0, "args": { "External id": 268379, "cbid": 203, "correlation": 268379 } }, { "ph": "f", "id": 268379, "pid": 76337, "tid": -914061504, "ts": 1716454225630165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225630166, "dur": 0, "args": { "External id": 268380, "cbid": 205, "correlation": 268380 } }, { "ph": "f", "id": 268380, "pid": 76337, "tid": -914061504, "ts": 1716454225630166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225705764, "dur": 23, "args": { "External id": 268384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268384, "pid": 5, "tid": 7, "ts": 1716454225705764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630180, "dur": 12, "args": { "External id": 268384, "cbid": 211, "correlation": 268384 } }, { "ph": "s", "id": 268384, "pid": 76337, "tid": -914061504, "ts": 1716454225630180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225705788, "dur": 107, "args": { "External id": 268386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268386, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268386, "pid": 5, "tid": 7, "ts": 1716454225705788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630197, "dur": 6, "args": { "External id": 268386, "cbid": 211, "correlation": 268386 } }, { "ph": "s", "id": 268386, "pid": 76337, "tid": -914061504, "ts": 1716454225630197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225705897, "dur": 23, "args": { "External id": 268388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268388, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268388, "pid": 5, "tid": 7, "ts": 1716454225705897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630207, "dur": 5, "args": { "External id": 268388, "cbid": 211, "correlation": 268388 } }, { "ph": "s", "id": 268388, "pid": 76337, "tid": -914061504, "ts": 1716454225630207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225705921, "dur": 33, "args": { "External id": 268394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268394, "pid": 5, "tid": 7, "ts": 1716454225705921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630235, "dur": 9, "args": { "External id": 268394, "cbid": 211, "correlation": 268394 } }, { "ph": "s", "id": 268394, "pid": 76337, "tid": -914061504, "ts": 1716454225630235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225705955, "dur": 184, "args": { "External id": 268403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268403, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268403, "pid": 5, "tid": 7, "ts": 1716454225705955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630316, "dur": 14, "args": { "External id": 268403, "cbid": 211, "correlation": 268403 } }, { "ph": "s", "id": 268403, "pid": 76337, "tid": -914061504, "ts": 1716454225630316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225706141, "dur": 66, "args": { "External id": 268425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268425, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268425, "pid": 5, "tid": 7, "ts": 1716454225706141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630374, "dur": 10, "args": { "External id": 268425, "cbid": 211, "correlation": 268425 } }, { "ph": "s", "id": 268425, "pid": 76337, "tid": -914061504, "ts": 1716454225630374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225630460, "dur": 1, "args": { "External id": 268436, "cbid": 251, "correlation": 268436 } }, { "ph": "f", "id": 268436, "pid": 76337, "tid": -914061504, "ts": 1716454225630460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225706208, "dur": 154, "args": { "External id": 268437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268437, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268437, "pid": 5, "tid": 7, "ts": 1716454225706208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630466, "dur": 14, "args": { "External id": 268437, "cbid": 211, "correlation": 268437 } }, { "ph": "s", "id": 268437, "pid": 76337, "tid": -914061504, "ts": 1716454225630466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225630537, "dur": 1, "args": { "External id": 268448, "cbid": 251, "correlation": 268448 } }, { "ph": "f", "id": 268448, "pid": 76337, "tid": -914061504, "ts": 1716454225630537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225706363, "dur": 149, "args": { "External id": 268449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268449, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268449, "pid": 5, "tid": 7, "ts": 1716454225706363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630541, "dur": 11, "args": { "External id": 268449, "cbid": 211, "correlation": 268449 } }, { "ph": "s", "id": 268449, "pid": 76337, "tid": -914061504, "ts": 1716454225630541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225630606, "dur": 1, "args": { "External id": 268460, "cbid": 251, "correlation": 268460 } }, { "ph": "f", "id": 268460, "pid": 76337, "tid": -914061504, "ts": 1716454225630606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225706514, "dur": 145, "args": { "External id": 268461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268461, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268461, "pid": 5, "tid": 7, "ts": 1716454225706514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630610, "dur": 12, "args": { "External id": 268461, "cbid": 211, "correlation": 268461 } }, { "ph": "s", "id": 268461, "pid": 76337, "tid": -914061504, "ts": 1716454225630610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225706660, "dur": 1988, "args": { "External id": 268482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268482, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 268482, "pid": 5, "tid": 7, "ts": 1716454225706660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630692, "dur": 12, "args": { "External id": 268482, "cbid": 211, "correlation": 268482 } }, { "ph": "s", "id": 268482, "pid": 76337, "tid": -914061504, "ts": 1716454225630692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225630789, "dur": 1, "args": { "External id": 268500, "cbid": 251, "correlation": 268500 } }, { "ph": "f", "id": 268500, "pid": 76337, "tid": -914061504, "ts": 1716454225630789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225708650, "dur": 150, "args": { "External id": 268502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268502, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 268502, "pid": 5, "tid": 7, "ts": 1716454225708650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630795, "dur": 13, "args": { "External id": 268502, "cbid": 211, "correlation": 268502 } }, { "ph": "s", "id": 268502, "pid": 76337, "tid": -914061504, "ts": 1716454225630795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225708801, "dur": 35, "args": { "External id": 268510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268510, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268510, "pid": 5, "tid": 7, "ts": 1716454225708801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630864, "dur": 13, "args": { "External id": 268510, "cbid": 211, "correlation": 268510 } }, { "ph": "s", "id": 268510, "pid": 76337, "tid": -914061504, "ts": 1716454225630864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225708838, "dur": 50, "args": { "External id": 268518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268518, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268518, "pid": 5, "tid": 7, "ts": 1716454225708838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630903, "dur": 8, "args": { "External id": 268518, "cbid": 211, "correlation": 268518 } }, { "ph": "s", "id": 268518, "pid": 76337, "tid": -914061504, "ts": 1716454225630903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225708889, "dur": 30, "args": { "External id": 268529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268529, "pid": 5, "tid": 7, "ts": 1716454225708889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225630986, "dur": 13, "args": { "External id": 268529, "cbid": 211, "correlation": 268529 } }, { "ph": "s", "id": 268529, "pid": 76337, "tid": -914061504, "ts": 1716454225630986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225708921, "dur": 35, "args": { "External id": 268551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268551, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268551, "pid": 5, "tid": 7, "ts": 1716454225708921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631018, "dur": 8, "args": { "External id": 268551, "cbid": 211, "correlation": 268551 } }, { "ph": "s", "id": 268551, "pid": 76337, "tid": -914061504, "ts": 1716454225631018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631104, "dur": 1, "args": { "External id": 268562, "cbid": 251, "correlation": 268562 } }, { "ph": "f", "id": 268562, "pid": 76337, "tid": -914061504, "ts": 1716454225631104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225708957, "dur": 78, "args": { "External id": 268563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268563, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268563, "pid": 5, "tid": 7, "ts": 1716454225708957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631109, "dur": 13, "args": { "External id": 268563, "cbid": 211, "correlation": 268563 } }, { "ph": "s", "id": 268563, "pid": 76337, "tid": -914061504, "ts": 1716454225631109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631178, "dur": 1, "args": { "External id": 268574, "cbid": 251, "correlation": 268574 } }, { "ph": "f", "id": 268574, "pid": 76337, "tid": -914061504, "ts": 1716454225631178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631182, "dur": 0, "args": { "External id": 268575, "cbid": 251, "correlation": 268575 } }, { "ph": "f", "id": 268575, "pid": 76337, "tid": -914061504, "ts": 1716454225631182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225709036, "dur": 11, "args": { "External id": 268576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268576, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 268576, "pid": 5, "tid": 7, "ts": 1716454225709036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631183, "dur": 12, "args": { "External id": 268576, "cbid": 211, "correlation": 268576 } }, { "ph": "s", "id": 268576, "pid": 76337, "tid": -914061504, "ts": 1716454225631183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225709049, "dur": 5, "args": { "External id": 268578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268578, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 268578, "pid": 5, "tid": 7, "ts": 1716454225709049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631197, "dur": 6, "args": { "External id": 268578, "cbid": 211, "correlation": 268578 } }, { "ph": "s", "id": 268578, "pid": 76337, "tid": -914061504, "ts": 1716454225631197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631255, "dur": 1, "args": { "External id": 268589, "cbid": 251, "correlation": 268589 } }, { "ph": "f", "id": 268589, "pid": 76337, "tid": -914061504, "ts": 1716454225631255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631258, "dur": 0, "args": { "External id": 268590, "cbid": 251, "correlation": 268590 } }, { "ph": "f", "id": 268590, "pid": 76337, "tid": -914061504, "ts": 1716454225631258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225709055, "dur": 7, "args": { "External id": 268591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268591, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 268591, "pid": 5, "tid": 7, "ts": 1716454225709055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631260, "dur": 11, "args": { "External id": 268591, "cbid": 211, "correlation": 268591 } }, { "ph": "s", "id": 268591, "pid": 76337, "tid": -914061504, "ts": 1716454225631260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225709063, "dur": 4, "args": { "External id": 268593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268593, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 268593, "pid": 5, "tid": 7, "ts": 1716454225709063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631273, "dur": 6, "args": { "External id": 268593, "cbid": 211, "correlation": 268593 } }, { "ph": "s", "id": 268593, "pid": 76337, "tid": -914061504, "ts": 1716454225631273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225709068, "dur": 93, "args": { "External id": 268614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268614, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 268614, "pid": 5, "tid": 7, "ts": 1716454225709068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631347, "dur": 13, "args": { "External id": 268614, "cbid": 211, "correlation": 268614 } }, { "ph": "s", "id": 268614, "pid": 76337, "tid": -914061504, "ts": 1716454225631347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631444, "dur": 1, "args": { "External id": 268632, "cbid": 251, "correlation": 268632 } }, { "ph": "f", "id": 268632, "pid": 76337, "tid": -914061504, "ts": 1716454225631444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225709163, "dur": 97, "args": { "External id": 268634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268634, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268634, "pid": 5, "tid": 7, "ts": 1716454225709163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631450, "dur": 13, "args": { "External id": 268634, "cbid": 211, "correlation": 268634 } }, { "ph": "s", "id": 268634, "pid": 76337, "tid": -914061504, "ts": 1716454225631450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225709262, "dur": 19, "args": { "External id": 268642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268642, "pid": 5, "tid": 7, "ts": 1716454225709262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631518, "dur": 13, "args": { "External id": 268642, "cbid": 211, "correlation": 268642 } }, { "ph": "s", "id": 268642, "pid": 76337, "tid": -914061504, "ts": 1716454225631518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225709282, "dur": 38, "args": { "External id": 268650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268650, "pid": 5, "tid": 7, "ts": 1716454225709282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631561, "dur": 9, "args": { "External id": 268650, "cbid": 211, "correlation": 268650 } }, { "ph": "s", "id": 268650, "pid": 76337, "tid": -914061504, "ts": 1716454225631561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225709322, "dur": 36, "args": { "External id": 268672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268672, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268672, "pid": 5, "tid": 7, "ts": 1716454225709322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631612, "dur": 10, "args": { "External id": 268672, "cbid": 211, "correlation": 268672 } }, { "ph": "s", "id": 268672, "pid": 76337, "tid": -914061504, "ts": 1716454225631612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631701, "dur": 1, "args": { "External id": 268688, "cbid": 251, "correlation": 268688 } }, { "ph": "f", "id": 268688, "pid": 76337, "tid": -914061504, "ts": 1716454225631701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631706, "dur": 0, "args": { "External id": 268690, "cbid": 251, "correlation": 268690 } }, { "ph": "f", "id": 268690, "pid": 76337, "tid": -914061504, "ts": 1716454225631706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225709359, "dur": 548, "args": { "External id": 268691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268691, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 268691, "pid": 5, "tid": 7, "ts": 1716454225709359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631710, "dur": 12, "args": { "External id": 268691, "cbid": 211, "correlation": 268691 } }, { "ph": "s", "id": 268691, "pid": 76337, "tid": -914061504, "ts": 1716454225631710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225709909, "dur": 126, "args": { "External id": 268699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268699, "pid": 5, "tid": 7, "ts": 1716454225709909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631774, "dur": 13, "args": { "External id": 268699, "cbid": 211, "correlation": 268699 } }, { "ph": "s", "id": 268699, "pid": 76337, "tid": -914061504, "ts": 1716454225631774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225710036, "dur": 131, "args": { "External id": 268707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268707, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268707, "pid": 5, "tid": 7, "ts": 1716454225710036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631805, "dur": 8, "args": { "External id": 268707, "cbid": 211, "correlation": 268707 } }, { "ph": "s", "id": 268707, "pid": 76337, "tid": -914061504, "ts": 1716454225631805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225631880, "dur": 1, "args": { "External id": 268723, "cbid": 251, "correlation": 268723 } }, { "ph": "f", "id": 268723, "pid": 76337, "tid": -914061504, "ts": 1716454225631880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225710168, "dur": 307, "args": { "External id": 268725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268725, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268725, "pid": 5, "tid": 7, "ts": 1716454225710168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631887, "dur": 13, "args": { "External id": 268725, "cbid": 211, "correlation": 268725 } }, { "ph": "s", "id": 268725, "pid": 76337, "tid": -914061504, "ts": 1716454225631887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225710477, "dur": 27, "args": { "External id": 268733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268733, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268733, "pid": 5, "tid": 7, "ts": 1716454225710477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225631928, "dur": 11, "args": { "External id": 268733, "cbid": 211, "correlation": 268733 } }, { "ph": "s", "id": 268733, "pid": 76337, "tid": -914061504, "ts": 1716454225631928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225710505, "dur": 84, "args": { "External id": 268744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268744, "pid": 5, "tid": 7, "ts": 1716454225710505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632056, "dur": 13, "args": { "External id": 268744, "cbid": 211, "correlation": 268744 } }, { "ph": "s", "id": 268744, "pid": 76337, "tid": -914061504, "ts": 1716454225632056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225632121, "dur": 0, "args": { "External id": 268756, "cbid": 317, "correlation": 268756 } }, { "ph": "f", "id": 268756, "pid": 76337, "tid": -914061504, "ts": 1716454225632121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225632122, "dur": 0, "args": { "External id": 268757, "cbid": 203, "correlation": 268757 } }, { "ph": "f", "id": 268757, "pid": 76337, "tid": -914061504, "ts": 1716454225632122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225632123, "dur": 0, "args": { "External id": 268758, "cbid": 205, "correlation": 268758 } }, { "ph": "f", "id": 268758, "pid": 76337, "tid": -914061504, "ts": 1716454225632123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225710590, "dur": 24, "args": { "External id": 268762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268762, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268762, "pid": 5, "tid": 7, "ts": 1716454225710590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632138, "dur": 12, "args": { "External id": 268762, "cbid": 211, "correlation": 268762 } }, { "ph": "s", "id": 268762, "pid": 76337, "tid": -914061504, "ts": 1716454225632138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225710615, "dur": 123, "args": { "External id": 268764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268764, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268764, "pid": 5, "tid": 7, "ts": 1716454225710615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632156, "dur": 7, "args": { "External id": 268764, "cbid": 211, "correlation": 268764 } }, { "ph": "s", "id": 268764, "pid": 76337, "tid": -914061504, "ts": 1716454225632156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225710740, "dur": 24, "args": { "External id": 268766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268766, "pid": 5, "tid": 7, "ts": 1716454225710740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632167, "dur": 5, "args": { "External id": 268766, "cbid": 211, "correlation": 268766 } }, { "ph": "s", "id": 268766, "pid": 76337, "tid": -914061504, "ts": 1716454225632167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225710765, "dur": 33, "args": { "External id": 268772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268772, "pid": 5, "tid": 7, "ts": 1716454225710765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632195, "dur": 8, "args": { "External id": 268772, "cbid": 211, "correlation": 268772 } }, { "ph": "s", "id": 268772, "pid": 76337, "tid": -914061504, "ts": 1716454225632195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225710799, "dur": 26, "args": { "External id": 268780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268780, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268780, "pid": 5, "tid": 7, "ts": 1716454225710799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632225, "dur": 9, "args": { "External id": 268780, "cbid": 211, "correlation": 268780 } }, { "ph": "s", "id": 268780, "pid": 76337, "tid": -914061504, "ts": 1716454225632225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225710827, "dur": 44, "args": { "External id": 268789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268789, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268789, "pid": 5, "tid": 7, "ts": 1716454225710827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632263, "dur": 10, "args": { "External id": 268789, "cbid": 211, "correlation": 268789 } }, { "ph": "s", "id": 268789, "pid": 76337, "tid": -914061504, "ts": 1716454225632263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225710872, "dur": 43, "args": { "External id": 268809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268809, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 268809, "pid": 5, "tid": 7, "ts": 1716454225710872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632334, "dur": 11, "args": { "External id": 268809, "cbid": 211, "correlation": 268809 } }, { "ph": "s", "id": 268809, "pid": 76337, "tid": -914061504, "ts": 1716454225632334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225710917, "dur": 5, "args": { "External id": 268821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268821, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 268821, "pid": 5, "tid": 7, "ts": 1716454225710917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632355, "dur": 6, "args": { "External id": 268821, "cbid": 211, "correlation": 268821 } }, { "ph": "s", "id": 268821, "pid": 76337, "tid": -914061504, "ts": 1716454225632355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225710923, "dur": 44, "args": { "External id": 268824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268824, "pid": 5, "tid": 7, "ts": 1716454225710923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632375, "dur": 7, "args": { "External id": 268824, "cbid": 211, "correlation": 268824 } }, { "ph": "s", "id": 268824, "pid": 76337, "tid": -914061504, "ts": 1716454225632375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225710969, "dur": 29, "args": { "External id": 268833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268833, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268833, "pid": 5, "tid": 7, "ts": 1716454225710969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632414, "dur": 10, "args": { "External id": 268833, "cbid": 211, "correlation": 268833 } }, { "ph": "s", "id": 268833, "pid": 76337, "tid": -914061504, "ts": 1716454225632414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225632466, "dur": 0, "args": { "External id": 268843, "cbid": 317, "correlation": 268843 } }, { "ph": "f", "id": 268843, "pid": 76337, "tid": -914061504, "ts": 1716454225632466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225632467, "dur": 0, "args": { "External id": 268844, "cbid": 203, "correlation": 268844 } }, { "ph": "f", "id": 268844, "pid": 76337, "tid": -914061504, "ts": 1716454225632467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225632468, "dur": 0, "args": { "External id": 268845, "cbid": 205, "correlation": 268845 } }, { "ph": "f", "id": 268845, "pid": 76337, "tid": -914061504, "ts": 1716454225632468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225710999, "dur": 30, "args": { "External id": 268849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268849, "pid": 5, "tid": 7, "ts": 1716454225710999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632482, "dur": 12, "args": { "External id": 268849, "cbid": 211, "correlation": 268849 } }, { "ph": "s", "id": 268849, "pid": 76337, "tid": -914061504, "ts": 1716454225632482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225711030, "dur": 64, "args": { "External id": 268851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268851, "pid": 5, "tid": 7, "ts": 1716454225711030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632497, "dur": 5, "args": { "External id": 268851, "cbid": 211, "correlation": 268851 } }, { "ph": "s", "id": 268851, "pid": 76337, "tid": -914061504, "ts": 1716454225632497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225711096, "dur": 979, "args": { "External id": 268853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268853, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268853, "pid": 5, "tid": 7, "ts": 1716454225711096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632508, "dur": 6, "args": { "External id": 268853, "cbid": 211, "correlation": 268853 } }, { "ph": "s", "id": 268853, "pid": 76337, "tid": -914061504, "ts": 1716454225632508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225712077, "dur": 22, "args": { "External id": 268855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268855, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268855, "pid": 5, "tid": 7, "ts": 1716454225712077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632518, "dur": 5, "args": { "External id": 268855, "cbid": 211, "correlation": 268855 } }, { "ph": "s", "id": 268855, "pid": 76337, "tid": -914061504, "ts": 1716454225632518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225712100, "dur": 33, "args": { "External id": 268861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268861, "pid": 5, "tid": 7, "ts": 1716454225712100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632546, "dur": 8, "args": { "External id": 268861, "cbid": 211, "correlation": 268861 } }, { "ph": "s", "id": 268861, "pid": 76337, "tid": -914061504, "ts": 1716454225632546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225712134, "dur": 3, "args": { "External id": 268869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268869, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 268869, "pid": 5, "tid": 7, "ts": 1716454225712134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632590, "dur": 10, "args": { "External id": 268869, "cbid": 211, "correlation": 268869 } }, { "ph": "s", "id": 268869, "pid": 76337, "tid": -914061504, "ts": 1716454225632590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225632655, "dur": 1, "args": { "External id": 268885, "cbid": 251, "correlation": 268885 } }, { "ph": "f", "id": 268885, "pid": 76337, "tid": -914061504, "ts": 1716454225632655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225632660, "dur": 0, "args": { "External id": 268887, "cbid": 251, "correlation": 268887 } }, { "ph": "f", "id": 268887, "pid": 76337, "tid": -914061504, "ts": 1716454225632660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225712139, "dur": 12, "args": { "External id": 268888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268888, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 268888, "pid": 5, "tid": 7, "ts": 1716454225712139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632662, "dur": 11, "args": { "External id": 268888, "cbid": 211, "correlation": 268888 } }, { "ph": "s", "id": 268888, "pid": 76337, "tid": -914061504, "ts": 1716454225632662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225712153, "dur": 5, "args": { "External id": 268890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268890, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 268890, "pid": 5, "tid": 7, "ts": 1716454225712153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632675, "dur": 5, "args": { "External id": 268890, "cbid": 211, "correlation": 268890 } }, { "ph": "s", "id": 268890, "pid": 76337, "tid": -914061504, "ts": 1716454225632675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225712159, "dur": 29, "args": { "External id": 268900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268900, "pid": 5, "tid": 7, "ts": 1716454225712159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632731, "dur": 13, "args": { "External id": 268900, "cbid": 211, "correlation": 268900 } }, { "ph": "s", "id": 268900, "pid": 76337, "tid": -914061504, "ts": 1716454225632731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225712190, "dur": 31, "args": { "External id": 268920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268920, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 268920, "pid": 5, "tid": 7, "ts": 1716454225712190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632797, "dur": 10, "args": { "External id": 268920, "cbid": 211, "correlation": 268920 } }, { "ph": "s", "id": 268920, "pid": 76337, "tid": -914061504, "ts": 1716454225632797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225712223, "dur": 4, "args": { "External id": 268932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268932, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 268932, "pid": 5, "tid": 7, "ts": 1716454225712223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632817, "dur": 6, "args": { "External id": 268932, "cbid": 211, "correlation": 268932 } }, { "ph": "s", "id": 268932, "pid": 76337, "tid": -914061504, "ts": 1716454225632817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225712228, "dur": 31, "args": { "External id": 268935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268935, "pid": 5, "tid": 7, "ts": 1716454225712228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632836, "dur": 7, "args": { "External id": 268935, "cbid": 211, "correlation": 268935 } }, { "ph": "s", "id": 268935, "pid": 76337, "tid": -914061504, "ts": 1716454225632836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225712260, "dur": 21, "args": { "External id": 268944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268944, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268944, "pid": 5, "tid": 7, "ts": 1716454225712260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632876, "dur": 10, "args": { "External id": 268944, "cbid": 211, "correlation": 268944 } }, { "ph": "s", "id": 268944, "pid": 76337, "tid": -914061504, "ts": 1716454225632876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225632938, "dur": 0, "args": { "External id": 268954, "cbid": 317, "correlation": 268954 } }, { "ph": "f", "id": 268954, "pid": 76337, "tid": -914061504, "ts": 1716454225632938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225632939, "dur": 0, "args": { "External id": 268955, "cbid": 203, "correlation": 268955 } }, { "ph": "f", "id": 268955, "pid": 76337, "tid": -914061504, "ts": 1716454225632939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225632940, "dur": 0, "args": { "External id": 268956, "cbid": 205, "correlation": 268956 } }, { "ph": "f", "id": 268956, "pid": 76337, "tid": -914061504, "ts": 1716454225632940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225712283, "dur": 23, "args": { "External id": 268960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268960, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268960, "pid": 5, "tid": 7, "ts": 1716454225712283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632954, "dur": 12, "args": { "External id": 268960, "cbid": 211, "correlation": 268960 } }, { "ph": "s", "id": 268960, "pid": 76337, "tid": -914061504, "ts": 1716454225632954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225712307, "dur": 44, "args": { "External id": 268962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268962, "pid": 5, "tid": 7, "ts": 1716454225712307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632969, "dur": 13, "args": { "External id": 268962, "cbid": 211, "correlation": 268962 } }, { "ph": "s", "id": 268962, "pid": 76337, "tid": -914061504, "ts": 1716454225632969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225712352, "dur": 653, "args": { "External id": 268964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268964, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268964, "pid": 5, "tid": 7, "ts": 1716454225712352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632988, "dur": 7, "args": { "External id": 268964, "cbid": 211, "correlation": 268964 } }, { "ph": "s", "id": 268964, "pid": 76337, "tid": -914061504, "ts": 1716454225632988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225713006, "dur": 22, "args": { "External id": 268966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268966, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268966, "pid": 5, "tid": 7, "ts": 1716454225713006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225632999, "dur": 5, "args": { "External id": 268966, "cbid": 211, "correlation": 268966 } }, { "ph": "s", "id": 268966, "pid": 76337, "tid": -914061504, "ts": 1716454225632999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225713030, "dur": 33, "args": { "External id": 268972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268972, "pid": 5, "tid": 7, "ts": 1716454225713030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633027, "dur": 8, "args": { "External id": 268972, "cbid": 211, "correlation": 268972 } }, { "ph": "s", "id": 268972, "pid": 76337, "tid": -914061504, "ts": 1716454225633027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225633085, "dur": 0, "args": { "External id": 268982, "cbid": 317, "correlation": 268982 } }, { "ph": "f", "id": 268982, "pid": 76337, "tid": -914061504, "ts": 1716454225633085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225633086, "dur": 0, "args": { "External id": 268983, "cbid": 203, "correlation": 268983 } }, { "ph": "f", "id": 268983, "pid": 76337, "tid": -914061504, "ts": 1716454225633086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225633087, "dur": 0, "args": { "External id": 268984, "cbid": 205, "correlation": 268984 } }, { "ph": "f", "id": 268984, "pid": 76337, "tid": -914061504, "ts": 1716454225633087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225713064, "dur": 30, "args": { "External id": 268988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268988, "pid": 5, "tid": 7, "ts": 1716454225713064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633101, "dur": 11, "args": { "External id": 268988, "cbid": 211, "correlation": 268988 } }, { "ph": "s", "id": 268988, "pid": 76337, "tid": -914061504, "ts": 1716454225633101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225713096, "dur": 154, "args": { "External id": 268990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268990, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 268990, "pid": 5, "tid": 7, "ts": 1716454225713096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633118, "dur": 6, "args": { "External id": 268990, "cbid": 211, "correlation": 268990 } }, { "ph": "s", "id": 268990, "pid": 76337, "tid": -914061504, "ts": 1716454225633118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225713251, "dur": 23, "args": { "External id": 268992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268992, "pid": 5, "tid": 7, "ts": 1716454225713251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633128, "dur": 5, "args": { "External id": 268992, "cbid": 211, "correlation": 268992 } }, { "ph": "s", "id": 268992, "pid": 76337, "tid": -914061504, "ts": 1716454225633128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225713275, "dur": 33, "args": { "External id": 268998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 268998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 268998, "pid": 5, "tid": 7, "ts": 1716454225713275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633154, "dur": 8, "args": { "External id": 268998, "cbid": 211, "correlation": 268998 } }, { "ph": "s", "id": 268998, "pid": 76337, "tid": -914061504, "ts": 1716454225633154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225713310, "dur": 27, "args": { "External id": 269006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269006, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269006, "pid": 5, "tid": 7, "ts": 1716454225713310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633182, "dur": 8, "args": { "External id": 269006, "cbid": 211, "correlation": 269006 } }, { "ph": "s", "id": 269006, "pid": 76337, "tid": -914061504, "ts": 1716454225633182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225713338, "dur": 19, "args": { "External id": 269014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269014, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269014, "pid": 5, "tid": 7, "ts": 1716454225713338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633211, "dur": 8, "args": { "External id": 269014, "cbid": 211, "correlation": 269014 } }, { "ph": "s", "id": 269014, "pid": 76337, "tid": -914061504, "ts": 1716454225633211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225713359, "dur": 32, "args": { "External id": 269034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269034, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 269034, "pid": 5, "tid": 7, "ts": 1716454225713359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633292, "dur": 13, "args": { "External id": 269034, "cbid": 211, "correlation": 269034 } }, { "ph": "s", "id": 269034, "pid": 76337, "tid": -914061504, "ts": 1716454225633292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225713392, "dur": 4, "args": { "External id": 269046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269046, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 269046, "pid": 5, "tid": 7, "ts": 1716454225713392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633315, "dur": 6, "args": { "External id": 269046, "cbid": 211, "correlation": 269046 } }, { "ph": "s", "id": 269046, "pid": 76337, "tid": -914061504, "ts": 1716454225633315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225713398, "dur": 32, "args": { "External id": 269049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269049, "pid": 5, "tid": 7, "ts": 1716454225713398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633332, "dur": 7, "args": { "External id": 269049, "cbid": 211, "correlation": 269049 } }, { "ph": "s", "id": 269049, "pid": 76337, "tid": -914061504, "ts": 1716454225633332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225633389, "dur": 0, "args": { "External id": 269060, "cbid": 317, "correlation": 269060 } }, { "ph": "f", "id": 269060, "pid": 76337, "tid": -914061504, "ts": 1716454225633389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225633390, "dur": 0, "args": { "External id": 269061, "cbid": 203, "correlation": 269061 } }, { "ph": "f", "id": 269061, "pid": 76337, "tid": -914061504, "ts": 1716454225633390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225633391, "dur": 0, "args": { "External id": 269062, "cbid": 205, "correlation": 269062 } }, { "ph": "f", "id": 269062, "pid": 76337, "tid": -914061504, "ts": 1716454225633391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225713431, "dur": 22, "args": { "External id": 269066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269066, "pid": 5, "tid": 7, "ts": 1716454225713431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633404, "dur": 12, "args": { "External id": 269066, "cbid": 211, "correlation": 269066 } }, { "ph": "s", "id": 269066, "pid": 76337, "tid": -914061504, "ts": 1716454225633404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225713454, "dur": 107, "args": { "External id": 269068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269068, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269068, "pid": 5, "tid": 7, "ts": 1716454225713454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633422, "dur": 6, "args": { "External id": 269068, "cbid": 211, "correlation": 269068 } }, { "ph": "s", "id": 269068, "pid": 76337, "tid": -914061504, "ts": 1716454225633422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225713563, "dur": 23, "args": { "External id": 269070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269070, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269070, "pid": 5, "tid": 7, "ts": 1716454225713563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633432, "dur": 5, "args": { "External id": 269070, "cbid": 211, "correlation": 269070 } }, { "ph": "s", "id": 269070, "pid": 76337, "tid": -914061504, "ts": 1716454225633432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225713587, "dur": 33, "args": { "External id": 269076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269076, "pid": 5, "tid": 7, "ts": 1716454225713587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633459, "dur": 8, "args": { "External id": 269076, "cbid": 211, "correlation": 269076 } }, { "ph": "s", "id": 269076, "pid": 76337, "tid": -914061504, "ts": 1716454225633459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225713621, "dur": 190, "args": { "External id": 269085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269085, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269085, "pid": 5, "tid": 7, "ts": 1716454225713621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633540, "dur": 15, "args": { "External id": 269085, "cbid": 211, "correlation": 269085 } }, { "ph": "s", "id": 269085, "pid": 76337, "tid": -914061504, "ts": 1716454225633540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225713812, "dur": 65, "args": { "External id": 269107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269107, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269107, "pid": 5, "tid": 7, "ts": 1716454225713812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633602, "dur": 10, "args": { "External id": 269107, "cbid": 211, "correlation": 269107 } }, { "ph": "s", "id": 269107, "pid": 76337, "tid": -914061504, "ts": 1716454225633602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225633688, "dur": 1, "args": { "External id": 269118, "cbid": 251, "correlation": 269118 } }, { "ph": "f", "id": 269118, "pid": 76337, "tid": -914061504, "ts": 1716454225633688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225713879, "dur": 152, "args": { "External id": 269119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269119, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269119, "pid": 5, "tid": 7, "ts": 1716454225713879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633693, "dur": 14, "args": { "External id": 269119, "cbid": 211, "correlation": 269119 } }, { "ph": "s", "id": 269119, "pid": 76337, "tid": -914061504, "ts": 1716454225633693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225633763, "dur": 1, "args": { "External id": 269130, "cbid": 251, "correlation": 269130 } }, { "ph": "f", "id": 269130, "pid": 76337, "tid": -914061504, "ts": 1716454225633763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225714032, "dur": 147, "args": { "External id": 269131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269131, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269131, "pid": 5, "tid": 7, "ts": 1716454225714032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633767, "dur": 11, "args": { "External id": 269131, "cbid": 211, "correlation": 269131 } }, { "ph": "s", "id": 269131, "pid": 76337, "tid": -914061504, "ts": 1716454225633767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225633831, "dur": 1, "args": { "External id": 269142, "cbid": 251, "correlation": 269142 } }, { "ph": "f", "id": 269142, "pid": 76337, "tid": -914061504, "ts": 1716454225633831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225714180, "dur": 148, "args": { "External id": 269143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269143, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269143, "pid": 5, "tid": 7, "ts": 1716454225714180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633836, "dur": 11, "args": { "External id": 269143, "cbid": 211, "correlation": 269143 } }, { "ph": "s", "id": 269143, "pid": 76337, "tid": -914061504, "ts": 1716454225633836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225714330, "dur": 1993, "args": { "External id": 269164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269164, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 269164, "pid": 5, "tid": 7, "ts": 1716454225714330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225633917, "dur": 12, "args": { "External id": 269164, "cbid": 211, "correlation": 269164 } }, { "ph": "s", "id": 269164, "pid": 76337, "tid": -914061504, "ts": 1716454225633917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634020, "dur": 1, "args": { "External id": 269182, "cbid": 251, "correlation": 269182 } }, { "ph": "f", "id": 269182, "pid": 76337, "tid": -914061504, "ts": 1716454225634020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225716325, "dur": 152, "args": { "External id": 269184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269184, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 269184, "pid": 5, "tid": 7, "ts": 1716454225716325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634026, "dur": 13, "args": { "External id": 269184, "cbid": 211, "correlation": 269184 } }, { "ph": "s", "id": 269184, "pid": 76337, "tid": -914061504, "ts": 1716454225634026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225716478, "dur": 36, "args": { "External id": 269192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269192, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269192, "pid": 5, "tid": 7, "ts": 1716454225716478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634097, "dur": 13, "args": { "External id": 269192, "cbid": 211, "correlation": 269192 } }, { "ph": "s", "id": 269192, "pid": 76337, "tid": -914061504, "ts": 1716454225634097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225716515, "dur": 51, "args": { "External id": 269200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269200, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269200, "pid": 5, "tid": 7, "ts": 1716454225716515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634136, "dur": 8, "args": { "External id": 269200, "cbid": 211, "correlation": 269200 } }, { "ph": "s", "id": 269200, "pid": 76337, "tid": -914061504, "ts": 1716454225634136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225716567, "dur": 30, "args": { "External id": 269211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269211, "pid": 5, "tid": 7, "ts": 1716454225716567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634206, "dur": 13, "args": { "External id": 269211, "cbid": 211, "correlation": 269211 } }, { "ph": "s", "id": 269211, "pid": 76337, "tid": -914061504, "ts": 1716454225634206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225716598, "dur": 35, "args": { "External id": 269233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269233, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269233, "pid": 5, "tid": 7, "ts": 1716454225716598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634237, "dur": 7, "args": { "External id": 269233, "cbid": 211, "correlation": 269233 } }, { "ph": "s", "id": 269233, "pid": 76337, "tid": -914061504, "ts": 1716454225634237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634321, "dur": 1, "args": { "External id": 269244, "cbid": 251, "correlation": 269244 } }, { "ph": "f", "id": 269244, "pid": 76337, "tid": -914061504, "ts": 1716454225634321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225716635, "dur": 91, "args": { "External id": 269245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269245, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269245, "pid": 5, "tid": 7, "ts": 1716454225716635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634326, "dur": 13, "args": { "External id": 269245, "cbid": 211, "correlation": 269245 } }, { "ph": "s", "id": 269245, "pid": 76337, "tid": -914061504, "ts": 1716454225634326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634393, "dur": 1, "args": { "External id": 269256, "cbid": 251, "correlation": 269256 } }, { "ph": "f", "id": 269256, "pid": 76337, "tid": -914061504, "ts": 1716454225634393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634397, "dur": 0, "args": { "External id": 269257, "cbid": 251, "correlation": 269257 } }, { "ph": "f", "id": 269257, "pid": 76337, "tid": -914061504, "ts": 1716454225634397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225716727, "dur": 12, "args": { "External id": 269258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269258, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 269258, "pid": 5, "tid": 7, "ts": 1716454225716727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634398, "dur": 13, "args": { "External id": 269258, "cbid": 211, "correlation": 269258 } }, { "ph": "s", "id": 269258, "pid": 76337, "tid": -914061504, "ts": 1716454225634398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225716740, "dur": 5, "args": { "External id": 269260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269260, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 269260, "pid": 5, "tid": 7, "ts": 1716454225716740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634413, "dur": 6, "args": { "External id": 269260, "cbid": 211, "correlation": 269260 } }, { "ph": "s", "id": 269260, "pid": 76337, "tid": -914061504, "ts": 1716454225634413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634470, "dur": 1, "args": { "External id": 269271, "cbid": 251, "correlation": 269271 } }, { "ph": "f", "id": 269271, "pid": 76337, "tid": -914061504, "ts": 1716454225634470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634474, "dur": 0, "args": { "External id": 269272, "cbid": 251, "correlation": 269272 } }, { "ph": "f", "id": 269272, "pid": 76337, "tid": -914061504, "ts": 1716454225634474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225716747, "dur": 7, "args": { "External id": 269273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269273, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 269273, "pid": 5, "tid": 7, "ts": 1716454225716747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634475, "dur": 12, "args": { "External id": 269273, "cbid": 211, "correlation": 269273 } }, { "ph": "s", "id": 269273, "pid": 76337, "tid": -914061504, "ts": 1716454225634475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225716756, "dur": 4, "args": { "External id": 269275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269275, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 269275, "pid": 5, "tid": 7, "ts": 1716454225716756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634488, "dur": 5, "args": { "External id": 269275, "cbid": 211, "correlation": 269275 } }, { "ph": "s", "id": 269275, "pid": 76337, "tid": -914061504, "ts": 1716454225634488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225716760, "dur": 93, "args": { "External id": 269296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269296, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 269296, "pid": 5, "tid": 7, "ts": 1716454225716760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634563, "dur": 12, "args": { "External id": 269296, "cbid": 211, "correlation": 269296 } }, { "ph": "s", "id": 269296, "pid": 76337, "tid": -914061504, "ts": 1716454225634563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634657, "dur": 1, "args": { "External id": 269314, "cbid": 251, "correlation": 269314 } }, { "ph": "f", "id": 269314, "pid": 76337, "tid": -914061504, "ts": 1716454225634657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225716855, "dur": 98, "args": { "External id": 269316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269316, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269316, "pid": 5, "tid": 7, "ts": 1716454225716855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634663, "dur": 13, "args": { "External id": 269316, "cbid": 211, "correlation": 269316 } }, { "ph": "s", "id": 269316, "pid": 76337, "tid": -914061504, "ts": 1716454225634663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225716954, "dur": 19, "args": { "External id": 269324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269324, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269324, "pid": 5, "tid": 7, "ts": 1716454225716954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634732, "dur": 12, "args": { "External id": 269324, "cbid": 211, "correlation": 269324 } }, { "ph": "s", "id": 269324, "pid": 76337, "tid": -914061504, "ts": 1716454225634732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225716974, "dur": 37, "args": { "External id": 269332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269332, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269332, "pid": 5, "tid": 7, "ts": 1716454225716974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634773, "dur": 9, "args": { "External id": 269332, "cbid": 211, "correlation": 269332 } }, { "ph": "s", "id": 269332, "pid": 76337, "tid": -914061504, "ts": 1716454225634773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225717013, "dur": 34, "args": { "External id": 269354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269354, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269354, "pid": 5, "tid": 7, "ts": 1716454225717013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634824, "dur": 10, "args": { "External id": 269354, "cbid": 211, "correlation": 269354 } }, { "ph": "s", "id": 269354, "pid": 76337, "tid": -914061504, "ts": 1716454225634824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634912, "dur": 1, "args": { "External id": 269370, "cbid": 251, "correlation": 269370 } }, { "ph": "f", "id": 269370, "pid": 76337, "tid": -914061504, "ts": 1716454225634912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225634917, "dur": 0, "args": { "External id": 269372, "cbid": 251, "correlation": 269372 } }, { "ph": "f", "id": 269372, "pid": 76337, "tid": -914061504, "ts": 1716454225634917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225717049, "dur": 549, "args": { "External id": 269373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269373, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 269373, "pid": 5, "tid": 7, "ts": 1716454225717049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634921, "dur": 13, "args": { "External id": 269373, "cbid": 211, "correlation": 269373 } }, { "ph": "s", "id": 269373, "pid": 76337, "tid": -914061504, "ts": 1716454225634921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225717600, "dur": 126, "args": { "External id": 269381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269381, "pid": 5, "tid": 7, "ts": 1716454225717600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225634994, "dur": 13, "args": { "External id": 269381, "cbid": 211, "correlation": 269381 } }, { "ph": "s", "id": 269381, "pid": 76337, "tid": -914061504, "ts": 1716454225634994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225717727, "dur": 130, "args": { "External id": 269389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269389, "pid": 5, "tid": 7, "ts": 1716454225717727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635025, "dur": 8, "args": { "External id": 269389, "cbid": 211, "correlation": 269389 } }, { "ph": "s", "id": 269389, "pid": 76337, "tid": -914061504, "ts": 1716454225635025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225635102, "dur": 1, "args": { "External id": 269405, "cbid": 251, "correlation": 269405 } }, { "ph": "f", "id": 269405, "pid": 76337, "tid": -914061504, "ts": 1716454225635102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225717859, "dur": 310, "args": { "External id": 269407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269407, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269407, "pid": 5, "tid": 7, "ts": 1716454225717859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635107, "dur": 12, "args": { "External id": 269407, "cbid": 211, "correlation": 269407 } }, { "ph": "s", "id": 269407, "pid": 76337, "tid": -914061504, "ts": 1716454225635107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225718170, "dur": 27, "args": { "External id": 269415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269415, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269415, "pid": 5, "tid": 7, "ts": 1716454225718170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635150, "dur": 9, "args": { "External id": 269415, "cbid": 211, "correlation": 269415 } }, { "ph": "s", "id": 269415, "pid": 76337, "tid": -914061504, "ts": 1716454225635150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225718198, "dur": 82, "args": { "External id": 269426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269426, "pid": 5, "tid": 7, "ts": 1716454225718198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635217, "dur": 13, "args": { "External id": 269426, "cbid": 211, "correlation": 269426 } }, { "ph": "s", "id": 269426, "pid": 76337, "tid": -914061504, "ts": 1716454225635217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225635281, "dur": 0, "args": { "External id": 269438, "cbid": 317, "correlation": 269438 } }, { "ph": "f", "id": 269438, "pid": 76337, "tid": -914061504, "ts": 1716454225635281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225635282, "dur": 0, "args": { "External id": 269439, "cbid": 203, "correlation": 269439 } }, { "ph": "f", "id": 269439, "pid": 76337, "tid": -914061504, "ts": 1716454225635282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225635283, "dur": 0, "args": { "External id": 269440, "cbid": 205, "correlation": 269440 } }, { "ph": "f", "id": 269440, "pid": 76337, "tid": -914061504, "ts": 1716454225635283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225718282, "dur": 24, "args": { "External id": 269444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269444, "pid": 5, "tid": 7, "ts": 1716454225718282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635297, "dur": 12, "args": { "External id": 269444, "cbid": 211, "correlation": 269444 } }, { "ph": "s", "id": 269444, "pid": 76337, "tid": -914061504, "ts": 1716454225635297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225718307, "dur": 121, "args": { "External id": 269446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269446, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269446, "pid": 5, "tid": 7, "ts": 1716454225718307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635316, "dur": 7, "args": { "External id": 269446, "cbid": 211, "correlation": 269446 } }, { "ph": "s", "id": 269446, "pid": 76337, "tid": -914061504, "ts": 1716454225635316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225718430, "dur": 23, "args": { "External id": 269448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269448, "pid": 5, "tid": 7, "ts": 1716454225718430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635327, "dur": 5, "args": { "External id": 269448, "cbid": 211, "correlation": 269448 } }, { "ph": "s", "id": 269448, "pid": 76337, "tid": -914061504, "ts": 1716454225635327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225718454, "dur": 33, "args": { "External id": 269454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269454, "pid": 5, "tid": 7, "ts": 1716454225718454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635354, "dur": 8, "args": { "External id": 269454, "cbid": 211, "correlation": 269454 } }, { "ph": "s", "id": 269454, "pid": 76337, "tid": -914061504, "ts": 1716454225635354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225718489, "dur": 27, "args": { "External id": 269462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269462, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269462, "pid": 5, "tid": 7, "ts": 1716454225718489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635385, "dur": 8, "args": { "External id": 269462, "cbid": 211, "correlation": 269462 } }, { "ph": "s", "id": 269462, "pid": 76337, "tid": -914061504, "ts": 1716454225635385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225718516, "dur": 102, "args": { "External id": 269473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269473, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269473, "pid": 5, "tid": 7, "ts": 1716454225718516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635446, "dur": 11, "args": { "External id": 269473, "cbid": 211, "correlation": 269473 } }, { "ph": "s", "id": 269473, "pid": 76337, "tid": -914061504, "ts": 1716454225635446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225635500, "dur": 0, "args": { "External id": 269483, "cbid": 317, "correlation": 269483 } }, { "ph": "f", "id": 269483, "pid": 76337, "tid": -914061504, "ts": 1716454225635500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225635501, "dur": 0, "args": { "External id": 269484, "cbid": 203, "correlation": 269484 } }, { "ph": "f", "id": 269484, "pid": 76337, "tid": -914061504, "ts": 1716454225635501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225635502, "dur": 0, "args": { "External id": 269485, "cbid": 205, "correlation": 269485 } }, { "ph": "f", "id": 269485, "pid": 76337, "tid": -914061504, "ts": 1716454225635502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225718620, "dur": 75, "args": { "External id": 269489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269489, "pid": 5, "tid": 7, "ts": 1716454225718620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635516, "dur": 12, "args": { "External id": 269489, "cbid": 211, "correlation": 269489 } }, { "ph": "s", "id": 269489, "pid": 76337, "tid": -914061504, "ts": 1716454225635516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225718697, "dur": 45, "args": { "External id": 269491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269491, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269491, "pid": 5, "tid": 7, "ts": 1716454225718697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635531, "dur": 5, "args": { "External id": 269491, "cbid": 211, "correlation": 269491 } }, { "ph": "s", "id": 269491, "pid": 76337, "tid": -914061504, "ts": 1716454225635531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225718743, "dur": 4, "args": { "External id": 269493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269493, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269493, "pid": 5, "tid": 7, "ts": 1716454225718743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635542, "dur": 6, "args": { "External id": 269493, "cbid": 211, "correlation": 269493 } }, { "ph": "s", "id": 269493, "pid": 76337, "tid": -914061504, "ts": 1716454225635542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225635551, "dur": 0, "args": { "External id": 269494, "cbid": 51, "correlation": 269494 } }, { "ph": "s", "id": 269494, "pid": 76337, "tid": -914061504, "ts": 1716454225635551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225718748, "dur": 2235, "args": { "External id": 269495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269495, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269495, "pid": 5, "tid": 7, "ts": 1716454225718748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635552, "dur": 5, "args": { "External id": 269495, "cbid": 211, "correlation": 269495 } }, { "ph": "s", "id": 269495, "pid": 76337, "tid": -914061504, "ts": 1716454225635552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225720984, "dur": 113, "args": { "External id": 269500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269500, "pid": 5, "tid": 7, "ts": 1716454225720984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635581, "dur": 8, "args": { "External id": 269500, "cbid": 211, "correlation": 269500 } }, { "ph": "s", "id": 269500, "pid": 76337, "tid": -914061504, "ts": 1716454225635581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225721099, "dur": 164, "args": { "External id": 269509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269509, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269509, "pid": 5, "tid": 7, "ts": 1716454225721099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635672, "dur": 14, "args": { "External id": 269509, "cbid": 211, "correlation": 269509 } }, { "ph": "s", "id": 269509, "pid": 76337, "tid": -914061504, "ts": 1716454225635672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225721264, "dur": 129, "args": { "External id": 269529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269529, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 269529, "pid": 5, "tid": 7, "ts": 1716454225721264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635741, "dur": 11, "args": { "External id": 269529, "cbid": 211, "correlation": 269529 } }, { "ph": "s", "id": 269529, "pid": 76337, "tid": -914061504, "ts": 1716454225635741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225721394, "dur": 5, "args": { "External id": 269541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269541, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 269541, "pid": 5, "tid": 7, "ts": 1716454225721394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635763, "dur": 6, "args": { "External id": 269541, "cbid": 211, "correlation": 269541 } }, { "ph": "s", "id": 269541, "pid": 76337, "tid": -914061504, "ts": 1716454225635763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225721400, "dur": 161, "args": { "External id": 269544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269544, "pid": 5, "tid": 7, "ts": 1716454225721400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635782, "dur": 7, "args": { "External id": 269544, "cbid": 211, "correlation": 269544 } }, { "ph": "s", "id": 269544, "pid": 76337, "tid": -914061504, "ts": 1716454225635782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225721563, "dur": 102, "args": { "External id": 269553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269553, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269553, "pid": 5, "tid": 7, "ts": 1716454225721563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635822, "dur": 10, "args": { "External id": 269553, "cbid": 211, "correlation": 269553 } }, { "ph": "s", "id": 269553, "pid": 76337, "tid": -914061504, "ts": 1716454225635822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225635874, "dur": 0, "args": { "External id": 269563, "cbid": 317, "correlation": 269563 } }, { "ph": "f", "id": 269563, "pid": 76337, "tid": -914061504, "ts": 1716454225635874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225635875, "dur": 0, "args": { "External id": 269564, "cbid": 203, "correlation": 269564 } }, { "ph": "f", "id": 269564, "pid": 76337, "tid": -914061504, "ts": 1716454225635875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225635875, "dur": 0, "args": { "External id": 269565, "cbid": 205, "correlation": 269565 } }, { "ph": "f", "id": 269565, "pid": 76337, "tid": -914061504, "ts": 1716454225635875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225721666, "dur": 112, "args": { "External id": 269569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269569, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269569, "pid": 5, "tid": 7, "ts": 1716454225721666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635892, "dur": 11, "args": { "External id": 269569, "cbid": 211, "correlation": 269569 } }, { "ph": "s", "id": 269569, "pid": 76337, "tid": -914061504, "ts": 1716454225635892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225721779, "dur": 34, "args": { "External id": 269571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269571, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269571, "pid": 5, "tid": 7, "ts": 1716454225721779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635906, "dur": 5, "args": { "External id": 269571, "cbid": 211, "correlation": 269571 } }, { "ph": "s", "id": 269571, "pid": 76337, "tid": -914061504, "ts": 1716454225635906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225721814, "dur": 3, "args": { "External id": 269573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269573, "pid": 5, "tid": 7, "ts": 1716454225721814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635915, "dur": 5, "args": { "External id": 269573, "cbid": 211, "correlation": 269573 } }, { "ph": "s", "id": 269573, "pid": 76337, "tid": -914061504, "ts": 1716454225635915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225635924, "dur": 0, "args": { "External id": 269574, "cbid": 51, "correlation": 269574 } }, { "ph": "s", "id": 269574, "pid": 76337, "tid": -914061504, "ts": 1716454225635924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225721819, "dur": 2043, "args": { "External id": 269575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269575, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269575, "pid": 5, "tid": 7, "ts": 1716454225721819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635925, "dur": 7, "args": { "External id": 269575, "cbid": 211, "correlation": 269575 } }, { "ph": "s", "id": 269575, "pid": 76337, "tid": -914061504, "ts": 1716454225635925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225723864, "dur": 60, "args": { "External id": 269580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269580, "pid": 5, "tid": 7, "ts": 1716454225723864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225635954, "dur": 9, "args": { "External id": 269580, "cbid": 211, "correlation": 269580 } }, { "ph": "s", "id": 269580, "pid": 76337, "tid": -914061504, "ts": 1716454225635954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225723925, "dur": 3, "args": { "External id": 269588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269588, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269588, "pid": 5, "tid": 7, "ts": 1716454225723925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636005, "dur": 10, "args": { "External id": 269588, "cbid": 211, "correlation": 269588 } }, { "ph": "s", "id": 269588, "pid": 76337, "tid": -914061504, "ts": 1716454225636005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636070, "dur": 1, "args": { "External id": 269604, "cbid": 251, "correlation": 269604 } }, { "ph": "f", "id": 269604, "pid": 76337, "tid": -914061504, "ts": 1716454225636070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636076, "dur": 0, "args": { "External id": 269606, "cbid": 251, "correlation": 269606 } }, { "ph": "f", "id": 269606, "pid": 76337, "tid": -914061504, "ts": 1716454225636076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225723930, "dur": 12, "args": { "External id": 269607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269607, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 269607, "pid": 5, "tid": 7, "ts": 1716454225723930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636078, "dur": 12, "args": { "External id": 269607, "cbid": 211, "correlation": 269607 } }, { "ph": "s", "id": 269607, "pid": 76337, "tid": -914061504, "ts": 1716454225636078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225723943, "dur": 5, "args": { "External id": 269609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269609, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 269609, "pid": 5, "tid": 7, "ts": 1716454225723943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636092, "dur": 5, "args": { "External id": 269609, "cbid": 211, "correlation": 269609 } }, { "ph": "s", "id": 269609, "pid": 76337, "tid": -914061504, "ts": 1716454225636092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225723949, "dur": 55, "args": { "External id": 269619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269619, "pid": 5, "tid": 7, "ts": 1716454225723949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636149, "dur": 12, "args": { "External id": 269619, "cbid": 211, "correlation": 269619 } }, { "ph": "s", "id": 269619, "pid": 76337, "tid": -914061504, "ts": 1716454225636149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225724005, "dur": 53, "args": { "External id": 269639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269639, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 269639, "pid": 5, "tid": 7, "ts": 1716454225724005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636216, "dur": 11, "args": { "External id": 269639, "cbid": 211, "correlation": 269639 } }, { "ph": "s", "id": 269639, "pid": 76337, "tid": -914061504, "ts": 1716454225636216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225724059, "dur": 4, "args": { "External id": 269651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269651, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269651, "pid": 5, "tid": 7, "ts": 1716454225724059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636238, "dur": 6, "args": { "External id": 269651, "cbid": 211, "correlation": 269651 } }, { "ph": "s", "id": 269651, "pid": 76337, "tid": -914061504, "ts": 1716454225636238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225724065, "dur": 56, "args": { "External id": 269654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269654, "pid": 5, "tid": 7, "ts": 1716454225724065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636256, "dur": 6, "args": { "External id": 269654, "cbid": 211, "correlation": 269654 } }, { "ph": "s", "id": 269654, "pid": 76337, "tid": -914061504, "ts": 1716454225636256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225724122, "dur": 37, "args": { "External id": 269663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269663, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269663, "pid": 5, "tid": 7, "ts": 1716454225724122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636296, "dur": 9, "args": { "External id": 269663, "cbid": 211, "correlation": 269663 } }, { "ph": "s", "id": 269663, "pid": 76337, "tid": -914061504, "ts": 1716454225636296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225636358, "dur": 0, "args": { "External id": 269673, "cbid": 317, "correlation": 269673 } }, { "ph": "f", "id": 269673, "pid": 76337, "tid": -914061504, "ts": 1716454225636358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225636359, "dur": 0, "args": { "External id": 269674, "cbid": 203, "correlation": 269674 } }, { "ph": "f", "id": 269674, "pid": 76337, "tid": -914061504, "ts": 1716454225636359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225636360, "dur": 0, "args": { "External id": 269675, "cbid": 205, "correlation": 269675 } }, { "ph": "f", "id": 269675, "pid": 76337, "tid": -914061504, "ts": 1716454225636360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225724161, "dur": 40, "args": { "External id": 269679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269679, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269679, "pid": 5, "tid": 7, "ts": 1716454225724161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636375, "dur": 12, "args": { "External id": 269679, "cbid": 211, "correlation": 269679 } }, { "ph": "s", "id": 269679, "pid": 76337, "tid": -914061504, "ts": 1716454225636375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225724202, "dur": 14, "args": { "External id": 269681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269681, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269681, "pid": 5, "tid": 7, "ts": 1716454225724202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636390, "dur": 5, "args": { "External id": 269681, "cbid": 211, "correlation": 269681 } }, { "ph": "s", "id": 269681, "pid": 76337, "tid": -914061504, "ts": 1716454225636390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225724218, "dur": 3, "args": { "External id": 269683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269683, "pid": 5, "tid": 7, "ts": 1716454225724218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636399, "dur": 5, "args": { "External id": 269683, "cbid": 211, "correlation": 269683 } }, { "ph": "s", "id": 269683, "pid": 76337, "tid": -914061504, "ts": 1716454225636399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225636407, "dur": 0, "args": { "External id": 269684, "cbid": 51, "correlation": 269684 } }, { "ph": "s", "id": 269684, "pid": 76337, "tid": -914061504, "ts": 1716454225636407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225724223, "dur": 710, "args": { "External id": 269685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269685, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269685, "pid": 5, "tid": 7, "ts": 1716454225724223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636408, "dur": 5, "args": { "External id": 269685, "cbid": 211, "correlation": 269685 } }, { "ph": "s", "id": 269685, "pid": 76337, "tid": -914061504, "ts": 1716454225636408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225724933, "dur": 60, "args": { "External id": 269690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269690, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269690, "pid": 5, "tid": 7, "ts": 1716454225724933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636436, "dur": 8, "args": { "External id": 269690, "cbid": 211, "correlation": 269690 } }, { "ph": "s", "id": 269690, "pid": 76337, "tid": -914061504, "ts": 1716454225636436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225636494, "dur": 0, "args": { "External id": 269700, "cbid": 317, "correlation": 269700 } }, { "ph": "f", "id": 269700, "pid": 76337, "tid": -914061504, "ts": 1716454225636494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225636494, "dur": 0, "args": { "External id": 269701, "cbid": 203, "correlation": 269701 } }, { "ph": "f", "id": 269701, "pid": 76337, "tid": -914061504, "ts": 1716454225636494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225636495, "dur": 0, "args": { "External id": 269702, "cbid": 205, "correlation": 269702 } }, { "ph": "f", "id": 269702, "pid": 76337, "tid": -914061504, "ts": 1716454225636495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225724994, "dur": 4, "args": { "External id": 269706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269706, "pid": 5, "tid": 7, "ts": 1716454225724994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636511, "dur": 11, "args": { "External id": 269706, "cbid": 211, "correlation": 269706 } }, { "ph": "s", "id": 269706, "pid": 76337, "tid": -914061504, "ts": 1716454225636511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225636527, "dur": 0, "args": { "External id": 269707, "cbid": 51, "correlation": 269707 } }, { "ph": "s", "id": 269707, "pid": 76337, "tid": -914061504, "ts": 1716454225636527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454225724999, "dur": 268, "args": { "External id": 269708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269708, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269708, "pid": 5, "tid": 7, "ts": 1716454225724999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636528, "dur": 8, "args": { "External id": 269708, "cbid": 211, "correlation": 269708 } }, { "ph": "s", "id": 269708, "pid": 76337, "tid": -914061504, "ts": 1716454225636528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225725269, "dur": 59, "args": { "External id": 269713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269713, "pid": 5, "tid": 7, "ts": 1716454225725269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636556, "dur": 8, "args": { "External id": 269713, "cbid": 211, "correlation": 269713 } }, { "ph": "s", "id": 269713, "pid": 76337, "tid": -914061504, "ts": 1716454225636556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225725329, "dur": 50, "args": { "External id": 269721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269721, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269721, "pid": 5, "tid": 7, "ts": 1716454225725329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636585, "dur": 8, "args": { "External id": 269721, "cbid": 211, "correlation": 269721 } }, { "ph": "s", "id": 269721, "pid": 76337, "tid": -914061504, "ts": 1716454225636585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225725381, "dur": 36, "args": { "External id": 269729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269729, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269729, "pid": 5, "tid": 7, "ts": 1716454225725381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636613, "dur": 8, "args": { "External id": 269729, "cbid": 211, "correlation": 269729 } }, { "ph": "s", "id": 269729, "pid": 76337, "tid": -914061504, "ts": 1716454225636613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225725418, "dur": 54, "args": { "External id": 269749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269749, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 269749, "pid": 5, "tid": 7, "ts": 1716454225725418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636695, "dur": 12, "args": { "External id": 269749, "cbid": 211, "correlation": 269749 } }, { "ph": "s", "id": 269749, "pid": 76337, "tid": -914061504, "ts": 1716454225636695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225725473, "dur": 4, "args": { "External id": 269761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269761, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 269761, "pid": 5, "tid": 7, "ts": 1716454225725473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636717, "dur": 6, "args": { "External id": 269761, "cbid": 211, "correlation": 269761 } }, { "ph": "s", "id": 269761, "pid": 76337, "tid": -914061504, "ts": 1716454225636717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225725478, "dur": 55, "args": { "External id": 269764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269764, "pid": 5, "tid": 7, "ts": 1716454225725478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636734, "dur": 7, "args": { "External id": 269764, "cbid": 211, "correlation": 269764 } }, { "ph": "s", "id": 269764, "pid": 76337, "tid": -914061504, "ts": 1716454225636734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225636792, "dur": 0, "args": { "External id": 269775, "cbid": 317, "correlation": 269775 } }, { "ph": "f", "id": 269775, "pid": 76337, "tid": -914061504, "ts": 1716454225636792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225636793, "dur": 0, "args": { "External id": 269776, "cbid": 203, "correlation": 269776 } }, { "ph": "f", "id": 269776, "pid": 76337, "tid": -914061504, "ts": 1716454225636793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225636794, "dur": 0, "args": { "External id": 269777, "cbid": 205, "correlation": 269777 } }, { "ph": "f", "id": 269777, "pid": 76337, "tid": -914061504, "ts": 1716454225636794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636822, "dur": 2, "args": { "External id": 269781, "cbid": 251, "correlation": 269781 } }, { "ph": "f", "id": 269781, "pid": 76337, "tid": -914061504, "ts": 1716454225636822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636825, "dur": 1, "args": { "External id": 269782, "cbid": 251, "correlation": 269782 } }, { "ph": "f", "id": 269782, "pid": 76337, "tid": -914061504, "ts": 1716454225636825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636827, "dur": 0, "args": { "External id": 269783, "cbid": 251, "correlation": 269783 } }, { "ph": "f", "id": 269783, "pid": 76337, "tid": -914061504, "ts": 1716454225636827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636828, "dur": 1, "args": { "External id": 269784, "cbid": 251, "correlation": 269784 } }, { "ph": "f", "id": 269784, "pid": 76337, "tid": -914061504, "ts": 1716454225636828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636831, "dur": 1, "args": { "External id": 269785, "cbid": 251, "correlation": 269785 } }, { "ph": "f", "id": 269785, "pid": 76337, "tid": -914061504, "ts": 1716454225636831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636832, "dur": 1, "args": { "External id": 269786, "cbid": 251, "correlation": 269786 } }, { "ph": "f", "id": 269786, "pid": 76337, "tid": -914061504, "ts": 1716454225636832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636835, "dur": 1, "args": { "External id": 269787, "cbid": 251, "correlation": 269787 } }, { "ph": "f", "id": 269787, "pid": 76337, "tid": -914061504, "ts": 1716454225636835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636836, "dur": 1, "args": { "External id": 269788, "cbid": 251, "correlation": 269788 } }, { "ph": "f", "id": 269788, "pid": 76337, "tid": -914061504, "ts": 1716454225636836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225636839, "dur": 0, "args": { "External id": 269789, "cbid": 251, "correlation": 269789 } }, { "ph": "f", "id": 269789, "pid": 76337, "tid": -914061504, "ts": 1716454225636839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225725535, "dur": 116, "args": { "External id": 269790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269790, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 269790, "pid": 5, "tid": 7, "ts": 1716454225725535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636841, "dur": 12, "args": { "External id": 269790, "cbid": 211, "correlation": 269790 } }, { "ph": "s", "id": 269790, "pid": 76337, "tid": -914061504, "ts": 1716454225636841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225725652, "dur": 60, "args": { "External id": 269796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269796, "pid": 5, "tid": 7, "ts": 1716454225725652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636876, "dur": 9, "args": { "External id": 269796, "cbid": 211, "correlation": 269796 } }, { "ph": "s", "id": 269796, "pid": 76337, "tid": -914061504, "ts": 1716454225636876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225725713, "dur": 487, "args": { "External id": 269805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269805, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269805, "pid": 5, "tid": 7, "ts": 1716454225725713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225636958, "dur": 22, "args": { "External id": 269805, "cbid": 211, "correlation": 269805 } }, { "ph": "s", "id": 269805, "pid": 76337, "tid": -914061504, "ts": 1716454225636958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225726202, "dur": 184, "args": { "External id": 269827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269827, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269827, "pid": 5, "tid": 7, "ts": 1716454225726202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637025, "dur": 11, "args": { "External id": 269827, "cbid": 211, "correlation": 269827 } }, { "ph": "s", "id": 269827, "pid": 76337, "tid": -914061504, "ts": 1716454225637025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637114, "dur": 1, "args": { "External id": 269838, "cbid": 251, "correlation": 269838 } }, { "ph": "f", "id": 269838, "pid": 76337, "tid": -914061504, "ts": 1716454225637114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225726387, "dur": 198, "args": { "External id": 269839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269839, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269839, "pid": 5, "tid": 7, "ts": 1716454225726387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637119, "dur": 13, "args": { "External id": 269839, "cbid": 211, "correlation": 269839 } }, { "ph": "s", "id": 269839, "pid": 76337, "tid": -914061504, "ts": 1716454225637119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637187, "dur": 1, "args": { "External id": 269850, "cbid": 251, "correlation": 269850 } }, { "ph": "f", "id": 269850, "pid": 76337, "tid": -914061504, "ts": 1716454225637187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225726586, "dur": 190, "args": { "External id": 269851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269851, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269851, "pid": 5, "tid": 7, "ts": 1716454225726586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637192, "dur": 11, "args": { "External id": 269851, "cbid": 211, "correlation": 269851 } }, { "ph": "s", "id": 269851, "pid": 76337, "tid": -914061504, "ts": 1716454225637192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637254, "dur": 1, "args": { "External id": 269862, "cbid": 251, "correlation": 269862 } }, { "ph": "f", "id": 269862, "pid": 76337, "tid": -914061504, "ts": 1716454225637254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225726778, "dur": 190, "args": { "External id": 269863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269863, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269863, "pid": 5, "tid": 7, "ts": 1716454225726778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637258, "dur": 12, "args": { "External id": 269863, "cbid": 211, "correlation": 269863 } }, { "ph": "s", "id": 269863, "pid": 76337, "tid": -914061504, "ts": 1716454225637258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225726969, "dur": 18905, "args": { "External id": 269884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269884, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 269884, "pid": 5, "tid": 7, "ts": 1716454225726969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637342, "dur": 13, "args": { "External id": 269884, "cbid": 211, "correlation": 269884 } }, { "ph": "s", "id": 269884, "pid": 76337, "tid": -914061504, "ts": 1716454225637342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637440, "dur": 1, "args": { "External id": 269902, "cbid": 251, "correlation": 269902 } }, { "ph": "f", "id": 269902, "pid": 76337, "tid": -914061504, "ts": 1716454225637440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225745875, "dur": 207, "args": { "External id": 269904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269904, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269904, "pid": 5, "tid": 7, "ts": 1716454225745875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637446, "dur": 13, "args": { "External id": 269904, "cbid": 211, "correlation": 269904 } }, { "ph": "s", "id": 269904, "pid": 76337, "tid": -914061504, "ts": 1716454225637446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225746083, "dur": 67, "args": { "External id": 269912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269912, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269912, "pid": 5, "tid": 7, "ts": 1716454225746083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637514, "dur": 12, "args": { "External id": 269912, "cbid": 211, "correlation": 269912 } }, { "ph": "s", "id": 269912, "pid": 76337, "tid": -914061504, "ts": 1716454225637514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225746152, "dur": 97, "args": { "External id": 269920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269920, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269920, "pid": 5, "tid": 7, "ts": 1716454225746152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637553, "dur": 8, "args": { "External id": 269920, "cbid": 211, "correlation": 269920 } }, { "ph": "s", "id": 269920, "pid": 76337, "tid": -914061504, "ts": 1716454225637553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225746250, "dur": 53, "args": { "External id": 269931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269931, "pid": 5, "tid": 7, "ts": 1716454225746250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637622, "dur": 13, "args": { "External id": 269931, "cbid": 211, "correlation": 269931 } }, { "ph": "s", "id": 269931, "pid": 76337, "tid": -914061504, "ts": 1716454225637622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225746304, "dur": 94, "args": { "External id": 269953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269953, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 269953, "pid": 5, "tid": 7, "ts": 1716454225746304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637654, "dur": 8, "args": { "External id": 269953, "cbid": 211, "correlation": 269953 } }, { "ph": "s", "id": 269953, "pid": 76337, "tid": -914061504, "ts": 1716454225637654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637737, "dur": 1, "args": { "External id": 269964, "cbid": 251, "correlation": 269964 } }, { "ph": "f", "id": 269964, "pid": 76337, "tid": -914061504, "ts": 1716454225637737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225746399, "dur": 107, "args": { "External id": 269965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269965, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 269965, "pid": 5, "tid": 7, "ts": 1716454225746399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637742, "dur": 14, "args": { "External id": 269965, "cbid": 211, "correlation": 269965 } }, { "ph": "s", "id": 269965, "pid": 76337, "tid": -914061504, "ts": 1716454225637742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637815, "dur": 1, "args": { "External id": 269976, "cbid": 251, "correlation": 269976 } }, { "ph": "f", "id": 269976, "pid": 76337, "tid": -914061504, "ts": 1716454225637815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637819, "dur": 0, "args": { "External id": 269977, "cbid": 251, "correlation": 269977 } }, { "ph": "f", "id": 269977, "pid": 76337, "tid": -914061504, "ts": 1716454225637819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225746508, "dur": 11, "args": { "External id": 269978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269978, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 269978, "pid": 5, "tid": 7, "ts": 1716454225746508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637821, "dur": 13, "args": { "External id": 269978, "cbid": 211, "correlation": 269978 } }, { "ph": "s", "id": 269978, "pid": 76337, "tid": -914061504, "ts": 1716454225637821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225746520, "dur": 5, "args": { "External id": 269980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269980, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 269980, "pid": 5, "tid": 7, "ts": 1716454225746520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637836, "dur": 7, "args": { "External id": 269980, "cbid": 211, "correlation": 269980 } }, { "ph": "s", "id": 269980, "pid": 76337, "tid": -914061504, "ts": 1716454225637836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637898, "dur": 1, "args": { "External id": 269991, "cbid": 251, "correlation": 269991 } }, { "ph": "f", "id": 269991, "pid": 76337, "tid": -914061504, "ts": 1716454225637898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225637903, "dur": 0, "args": { "External id": 269992, "cbid": 251, "correlation": 269992 } }, { "ph": "f", "id": 269992, "pid": 76337, "tid": -914061504, "ts": 1716454225637903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225746526, "dur": 5, "args": { "External id": 269993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269993, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 269993, "pid": 5, "tid": 7, "ts": 1716454225746526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637904, "dur": 12, "args": { "External id": 269993, "cbid": 211, "correlation": 269993 } }, { "ph": "s", "id": 269993, "pid": 76337, "tid": -914061504, "ts": 1716454225637904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225746533, "dur": 3, "args": { "External id": 269995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 269995, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 269995, "pid": 5, "tid": 7, "ts": 1716454225746533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225637917, "dur": 5, "args": { "External id": 269995, "cbid": 211, "correlation": 269995 } }, { "ph": "s", "id": 269995, "pid": 76337, "tid": -914061504, "ts": 1716454225637917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225746537, "dur": 157, "args": { "External id": 270016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270016, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270016, "pid": 5, "tid": 7, "ts": 1716454225746537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638000, "dur": 13, "args": { "External id": 270016, "cbid": 211, "correlation": 270016 } }, { "ph": "s", "id": 270016, "pid": 76337, "tid": -914061504, "ts": 1716454225638000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225638098, "dur": 2, "args": { "External id": 270034, "cbid": 251, "correlation": 270034 } }, { "ph": "f", "id": 270034, "pid": 76337, "tid": -914061504, "ts": 1716454225638098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225746696, "dur": 109, "args": { "External id": 270036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270036, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270036, "pid": 5, "tid": 7, "ts": 1716454225746696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638105, "dur": 14, "args": { "External id": 270036, "cbid": 211, "correlation": 270036 } }, { "ph": "s", "id": 270036, "pid": 76337, "tid": -914061504, "ts": 1716454225638105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225746806, "dur": 35, "args": { "External id": 270044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270044, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270044, "pid": 5, "tid": 7, "ts": 1716454225746806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638174, "dur": 12, "args": { "External id": 270044, "cbid": 211, "correlation": 270044 } }, { "ph": "s", "id": 270044, "pid": 76337, "tid": -914061504, "ts": 1716454225638174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225746843, "dur": 68, "args": { "External id": 270052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270052, "pid": 5, "tid": 7, "ts": 1716454225746843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638215, "dur": 9, "args": { "External id": 270052, "cbid": 211, "correlation": 270052 } }, { "ph": "s", "id": 270052, "pid": 76337, "tid": -914061504, "ts": 1716454225638215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225746912, "dur": 93, "args": { "External id": 270074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270074, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270074, "pid": 5, "tid": 7, "ts": 1716454225746912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638267, "dur": 10, "args": { "External id": 270074, "cbid": 211, "correlation": 270074 } }, { "ph": "s", "id": 270074, "pid": 76337, "tid": -914061504, "ts": 1716454225638267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225638354, "dur": 1, "args": { "External id": 270090, "cbid": 251, "correlation": 270090 } }, { "ph": "f", "id": 270090, "pid": 76337, "tid": -914061504, "ts": 1716454225638354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225747007, "dur": 584, "args": { "External id": 270092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270092, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270092, "pid": 5, "tid": 7, "ts": 1716454225747007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638360, "dur": 13, "args": { "External id": 270092, "cbid": 211, "correlation": 270092 } }, { "ph": "s", "id": 270092, "pid": 76337, "tid": -914061504, "ts": 1716454225638360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225747592, "dur": 245, "args": { "External id": 270100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270100, "pid": 5, "tid": 7, "ts": 1716454225747592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638425, "dur": 106, "args": { "External id": 270100, "cbid": 211, "correlation": 270100 } }, { "ph": "s", "id": 270100, "pid": 76337, "tid": -914061504, "ts": 1716454225638425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225747839, "dur": 252, "args": { "External id": 270108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270108, "pid": 5, "tid": 7, "ts": 1716454225747839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638551, "dur": 342, "args": { "External id": 270108, "cbid": 211, "correlation": 270108 } }, { "ph": "s", "id": 270108, "pid": 76337, "tid": -914061504, "ts": 1716454225638551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225638968, "dur": 1, "args": { "External id": 270124, "cbid": 251, "correlation": 270124 } }, { "ph": "f", "id": 270124, "pid": 76337, "tid": -914061504, "ts": 1716454225638968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225638980, "dur": 0, "args": { "External id": 270126, "cbid": 251, "correlation": 270126 } }, { "ph": "f", "id": 270126, "pid": 76337, "tid": -914061504, "ts": 1716454225638980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225748092, "dur": 357, "args": { "External id": 270127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270127, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 270127, "pid": 5, "tid": 7, "ts": 1716454225748092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225638983, "dur": 13, "args": { "External id": 270127, "cbid": 211, "correlation": 270127 } }, { "ph": "s", "id": 270127, "pid": 76337, "tid": -914061504, "ts": 1716454225638983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225748451, "dur": 50, "args": { "External id": 270135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270135, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270135, "pid": 5, "tid": 7, "ts": 1716454225748451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639027, "dur": 79, "args": { "External id": 270135, "cbid": 211, "correlation": 270135 } }, { "ph": "s", "id": 270135, "pid": 76337, "tid": -914061504, "ts": 1716454225639027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225748502, "dur": 161, "args": { "External id": 270146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270146, "pid": 5, "tid": 7, "ts": 1716454225748502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639165, "dur": 60, "args": { "External id": 270146, "cbid": 211, "correlation": 270146 } }, { "ph": "s", "id": 270146, "pid": 76337, "tid": -914061504, "ts": 1716454225639165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225639277, "dur": 0, "args": { "External id": 270158, "cbid": 317, "correlation": 270158 } }, { "ph": "f", "id": 270158, "pid": 76337, "tid": -914061504, "ts": 1716454225639277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225639278, "dur": 0, "args": { "External id": 270159, "cbid": 203, "correlation": 270159 } }, { "ph": "f", "id": 270159, "pid": 76337, "tid": -914061504, "ts": 1716454225639278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225639278, "dur": 0, "args": { "External id": 270160, "cbid": 205, "correlation": 270160 } }, { "ph": "f", "id": 270160, "pid": 76337, "tid": -914061504, "ts": 1716454225639278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639303, "dur": 1, "args": { "External id": 270164, "cbid": 251, "correlation": 270164 } }, { "ph": "f", "id": 270164, "pid": 76337, "tid": -914061504, "ts": 1716454225639303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639305, "dur": 0, "args": { "External id": 270165, "cbid": 251, "correlation": 270165 } }, { "ph": "f", "id": 270165, "pid": 76337, "tid": -914061504, "ts": 1716454225639305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639306, "dur": 0, "args": { "External id": 270166, "cbid": 251, "correlation": 270166 } }, { "ph": "f", "id": 270166, "pid": 76337, "tid": -914061504, "ts": 1716454225639306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639306, "dur": 0, "args": { "External id": 270167, "cbid": 251, "correlation": 270167 } }, { "ph": "f", "id": 270167, "pid": 76337, "tid": -914061504, "ts": 1716454225639306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639307, "dur": 0, "args": { "External id": 270168, "cbid": 251, "correlation": 270168 } }, { "ph": "f", "id": 270168, "pid": 76337, "tid": -914061504, "ts": 1716454225639307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639308, "dur": 0, "args": { "External id": 270169, "cbid": 251, "correlation": 270169 } }, { "ph": "f", "id": 270169, "pid": 76337, "tid": -914061504, "ts": 1716454225639308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639309, "dur": 0, "args": { "External id": 270170, "cbid": 251, "correlation": 270170 } }, { "ph": "f", "id": 270170, "pid": 76337, "tid": -914061504, "ts": 1716454225639309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639310, "dur": 0, "args": { "External id": 270171, "cbid": 251, "correlation": 270171 } }, { "ph": "f", "id": 270171, "pid": 76337, "tid": -914061504, "ts": 1716454225639310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225639311, "dur": 0, "args": { "External id": 270172, "cbid": 251, "correlation": 270172 } }, { "ph": "f", "id": 270172, "pid": 76337, "tid": -914061504, "ts": 1716454225639311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225748665, "dur": 116, "args": { "External id": 270173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270173, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270173, "pid": 5, "tid": 7, "ts": 1716454225748665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639313, "dur": 12, "args": { "External id": 270173, "cbid": 211, "correlation": 270173 } }, { "ph": "s", "id": 270173, "pid": 76337, "tid": -914061504, "ts": 1716454225639313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225748782, "dur": 61, "args": { "External id": 270179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270179, "pid": 5, "tid": 7, "ts": 1716454225748782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639349, "dur": 9, "args": { "External id": 270179, "cbid": 211, "correlation": 270179 } }, { "ph": "s", "id": 270179, "pid": 76337, "tid": -914061504, "ts": 1716454225639349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225748844, "dur": 50, "args": { "External id": 270187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270187, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270187, "pid": 5, "tid": 7, "ts": 1716454225748844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639380, "dur": 8, "args": { "External id": 270187, "cbid": 211, "correlation": 270187 } }, { "ph": "s", "id": 270187, "pid": 76337, "tid": -914061504, "ts": 1716454225639380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225748896, "dur": 99, "args": { "External id": 270196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270196, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270196, "pid": 5, "tid": 7, "ts": 1716454225748896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639419, "dur": 10, "args": { "External id": 270196, "cbid": 211, "correlation": 270196 } }, { "ph": "s", "id": 270196, "pid": 76337, "tid": -914061504, "ts": 1716454225639419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225748996, "dur": 96, "args": { "External id": 270216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270216, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 270216, "pid": 5, "tid": 7, "ts": 1716454225748996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639489, "dur": 12, "args": { "External id": 270216, "cbid": 211, "correlation": 270216 } }, { "ph": "s", "id": 270216, "pid": 76337, "tid": -914061504, "ts": 1716454225639489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225749093, "dur": 5, "args": { "External id": 270228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270228, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270228, "pid": 5, "tid": 7, "ts": 1716454225749093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639512, "dur": 255, "args": { "External id": 270228, "cbid": 211, "correlation": 270228 } }, { "ph": "s", "id": 270228, "pid": 76337, "tid": -914061504, "ts": 1716454225639512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225749099, "dur": 109, "args": { "External id": 270231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270231, "pid": 5, "tid": 7, "ts": 1716454225749099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639779, "dur": 8, "args": { "External id": 270231, "cbid": 211, "correlation": 270231 } }, { "ph": "s", "id": 270231, "pid": 76337, "tid": -914061504, "ts": 1716454225639779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225749209, "dur": 69, "args": { "External id": 270240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270240, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270240, "pid": 5, "tid": 7, "ts": 1716454225749209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639819, "dur": 10, "args": { "External id": 270240, "cbid": 211, "correlation": 270240 } }, { "ph": "s", "id": 270240, "pid": 76337, "tid": -914061504, "ts": 1716454225639819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225639871, "dur": 0, "args": { "External id": 270250, "cbid": 317, "correlation": 270250 } }, { "ph": "f", "id": 270250, "pid": 76337, "tid": -914061504, "ts": 1716454225639871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225639872, "dur": 0, "args": { "External id": 270251, "cbid": 203, "correlation": 270251 } }, { "ph": "f", "id": 270251, "pid": 76337, "tid": -914061504, "ts": 1716454225639872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225639873, "dur": 0, "args": { "External id": 270252, "cbid": 205, "correlation": 270252 } }, { "ph": "f", "id": 270252, "pid": 76337, "tid": -914061504, "ts": 1716454225639873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225749280, "dur": 76, "args": { "External id": 270256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270256, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270256, "pid": 5, "tid": 7, "ts": 1716454225749280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639888, "dur": 12, "args": { "External id": 270256, "cbid": 211, "correlation": 270256 } }, { "ph": "s", "id": 270256, "pid": 76337, "tid": -914061504, "ts": 1716454225639888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225749357, "dur": 25, "args": { "External id": 270258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270258, "pid": 5, "tid": 7, "ts": 1716454225749357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639902, "dur": 5, "args": { "External id": 270258, "cbid": 211, "correlation": 270258 } }, { "ph": "s", "id": 270258, "pid": 76337, "tid": -914061504, "ts": 1716454225639902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225749383, "dur": 4, "args": { "External id": 270260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270260, "pid": 5, "tid": 7, "ts": 1716454225749383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639912, "dur": 5, "args": { "External id": 270260, "cbid": 211, "correlation": 270260 } }, { "ph": "s", "id": 270260, "pid": 76337, "tid": -914061504, "ts": 1716454225639912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225639920, "dur": 0, "args": { "External id": 270261, "cbid": 51, "correlation": 270261 } }, { "ph": "s", "id": 270261, "pid": 76337, "tid": -914061504, "ts": 1716454225639920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225749388, "dur": 1389, "args": { "External id": 270262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270262, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270262, "pid": 5, "tid": 7, "ts": 1716454225749388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639921, "dur": 5, "args": { "External id": 270262, "cbid": 211, "correlation": 270262 } }, { "ph": "s", "id": 270262, "pid": 76337, "tid": -914061504, "ts": 1716454225639921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225750778, "dur": 61, "args": { "External id": 270267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270267, "pid": 5, "tid": 7, "ts": 1716454225750778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225639949, "dur": 8, "args": { "External id": 270267, "cbid": 211, "correlation": 270267 } }, { "ph": "s", "id": 270267, "pid": 76337, "tid": -914061504, "ts": 1716454225639949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225750840, "dur": 4, "args": { "External id": 270275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270275, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270275, "pid": 5, "tid": 7, "ts": 1716454225750840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640001, "dur": 10, "args": { "External id": 270275, "cbid": 211, "correlation": 270275 } }, { "ph": "s", "id": 270275, "pid": 76337, "tid": -914061504, "ts": 1716454225640001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225640067, "dur": 1, "args": { "External id": 270291, "cbid": 251, "correlation": 270291 } }, { "ph": "f", "id": 270291, "pid": 76337, "tid": -914061504, "ts": 1716454225640067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225640072, "dur": 0, "args": { "External id": 270293, "cbid": 251, "correlation": 270293 } }, { "ph": "f", "id": 270293, "pid": 76337, "tid": -914061504, "ts": 1716454225640072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225750845, "dur": 11, "args": { "External id": 270294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270294, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 270294, "pid": 5, "tid": 7, "ts": 1716454225750845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640073, "dur": 12, "args": { "External id": 270294, "cbid": 211, "correlation": 270294 } }, { "ph": "s", "id": 270294, "pid": 76337, "tid": -914061504, "ts": 1716454225640073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225750857, "dur": 5, "args": { "External id": 270296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270296, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270296, "pid": 5, "tid": 7, "ts": 1716454225750857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640088, "dur": 176, "args": { "External id": 270296, "cbid": 211, "correlation": 270296 } }, { "ph": "s", "id": 270296, "pid": 76337, "tid": -914061504, "ts": 1716454225640088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225750864, "dur": 54, "args": { "External id": 270306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270306, "pid": 5, "tid": 7, "ts": 1716454225750864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640319, "dur": 12, "args": { "External id": 270306, "cbid": 211, "correlation": 270306 } }, { "ph": "s", "id": 270306, "pid": 76337, "tid": -914061504, "ts": 1716454225640319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225750920, "dur": 52, "args": { "External id": 270326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270326, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 270326, "pid": 5, "tid": 7, "ts": 1716454225750920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640384, "dur": 11, "args": { "External id": 270326, "cbid": 211, "correlation": 270326 } }, { "ph": "s", "id": 270326, "pid": 76337, "tid": -914061504, "ts": 1716454225640384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225750973, "dur": 4, "args": { "External id": 270338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270338, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270338, "pid": 5, "tid": 7, "ts": 1716454225750973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640405, "dur": 7, "args": { "External id": 270338, "cbid": 211, "correlation": 270338 } }, { "ph": "s", "id": 270338, "pid": 76337, "tid": -914061504, "ts": 1716454225640405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225750978, "dur": 56, "args": { "External id": 270341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270341, "pid": 5, "tid": 7, "ts": 1716454225750978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640425, "dur": 7, "args": { "External id": 270341, "cbid": 211, "correlation": 270341 } }, { "ph": "s", "id": 270341, "pid": 76337, "tid": -914061504, "ts": 1716454225640425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225751035, "dur": 36, "args": { "External id": 270350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270350, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270350, "pid": 5, "tid": 7, "ts": 1716454225751035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640466, "dur": 10, "args": { "External id": 270350, "cbid": 211, "correlation": 270350 } }, { "ph": "s", "id": 270350, "pid": 76337, "tid": -914061504, "ts": 1716454225640466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225640528, "dur": 0, "args": { "External id": 270360, "cbid": 317, "correlation": 270360 } }, { "ph": "f", "id": 270360, "pid": 76337, "tid": -914061504, "ts": 1716454225640528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225640529, "dur": 0, "args": { "External id": 270361, "cbid": 203, "correlation": 270361 } }, { "ph": "f", "id": 270361, "pid": 76337, "tid": -914061504, "ts": 1716454225640529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225640529, "dur": 0, "args": { "External id": 270362, "cbid": 205, "correlation": 270362 } }, { "ph": "f", "id": 270362, "pid": 76337, "tid": -914061504, "ts": 1716454225640529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225751073, "dur": 41, "args": { "External id": 270366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270366, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270366, "pid": 5, "tid": 7, "ts": 1716454225751073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640544, "dur": 12, "args": { "External id": 270366, "cbid": 211, "correlation": 270366 } }, { "ph": "s", "id": 270366, "pid": 76337, "tid": -914061504, "ts": 1716454225640544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225751115, "dur": 15, "args": { "External id": 270368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270368, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270368, "pid": 5, "tid": 7, "ts": 1716454225751115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640559, "dur": 6, "args": { "External id": 270368, "cbid": 211, "correlation": 270368 } }, { "ph": "s", "id": 270368, "pid": 76337, "tid": -914061504, "ts": 1716454225640559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225751131, "dur": 3, "args": { "External id": 270370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270370, "pid": 5, "tid": 7, "ts": 1716454225751131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640568, "dur": 5, "args": { "External id": 270370, "cbid": 211, "correlation": 270370 } }, { "ph": "s", "id": 270370, "pid": 76337, "tid": -914061504, "ts": 1716454225640568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225640576, "dur": 0, "args": { "External id": 270371, "cbid": 51, "correlation": 270371 } }, { "ph": "s", "id": 270371, "pid": 76337, "tid": -914061504, "ts": 1716454225640576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225751135, "dur": 707, "args": { "External id": 270372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270372, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270372, "pid": 5, "tid": 7, "ts": 1716454225751135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640577, "dur": 5, "args": { "External id": 270372, "cbid": 211, "correlation": 270372 } }, { "ph": "s", "id": 270372, "pid": 76337, "tid": -914061504, "ts": 1716454225640577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225751844, "dur": 60, "args": { "External id": 270377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270377, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270377, "pid": 5, "tid": 7, "ts": 1716454225751844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640604, "dur": 9, "args": { "External id": 270377, "cbid": 211, "correlation": 270377 } }, { "ph": "s", "id": 270377, "pid": 76337, "tid": -914061504, "ts": 1716454225640604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225640661, "dur": 0, "args": { "External id": 270387, "cbid": 317, "correlation": 270387 } }, { "ph": "f", "id": 270387, "pid": 76337, "tid": -914061504, "ts": 1716454225640661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225640662, "dur": 0, "args": { "External id": 270388, "cbid": 203, "correlation": 270388 } }, { "ph": "f", "id": 270388, "pid": 76337, "tid": -914061504, "ts": 1716454225640662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225640663, "dur": 0, "args": { "External id": 270389, "cbid": 205, "correlation": 270389 } }, { "ph": "f", "id": 270389, "pid": 76337, "tid": -914061504, "ts": 1716454225640663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225751905, "dur": 76, "args": { "External id": 270393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270393, "pid": 5, "tid": 7, "ts": 1716454225751905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640676, "dur": 11, "args": { "External id": 270393, "cbid": 211, "correlation": 270393 } }, { "ph": "s", "id": 270393, "pid": 76337, "tid": -914061504, "ts": 1716454225640676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225751983, "dur": 208, "args": { "External id": 270395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270395, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270395, "pid": 5, "tid": 7, "ts": 1716454225751983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640695, "dur": 8, "args": { "External id": 270395, "cbid": 211, "correlation": 270395 } }, { "ph": "s", "id": 270395, "pid": 76337, "tid": -914061504, "ts": 1716454225640695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225752193, "dur": 40, "args": { "External id": 270397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270397, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270397, "pid": 5, "tid": 7, "ts": 1716454225752193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225640706, "dur": 500, "args": { "External id": 270397, "cbid": 211, "correlation": 270397 } }, { "ph": "s", "id": 270397, "pid": 76337, "tid": -914061504, "ts": 1716454225640706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225752234, "dur": 59, "args": { "External id": 270403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270403, "pid": 5, "tid": 7, "ts": 1716454225752234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641228, "dur": 11, "args": { "External id": 270403, "cbid": 211, "correlation": 270403 } }, { "ph": "s", "id": 270403, "pid": 76337, "tid": -914061504, "ts": 1716454225641228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225752295, "dur": 50, "args": { "External id": 270411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270411, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270411, "pid": 5, "tid": 7, "ts": 1716454225752295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641260, "dur": 8, "args": { "External id": 270411, "cbid": 211, "correlation": 270411 } }, { "ph": "s", "id": 270411, "pid": 76337, "tid": -914061504, "ts": 1716454225641260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225752346, "dur": 36, "args": { "External id": 270419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270419, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270419, "pid": 5, "tid": 7, "ts": 1716454225752346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641290, "dur": 27, "args": { "External id": 270419, "cbid": 211, "correlation": 270419 } }, { "ph": "s", "id": 270419, "pid": 76337, "tid": -914061504, "ts": 1716454225641290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225752383, "dur": 53, "args": { "External id": 270439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270439, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 270439, "pid": 5, "tid": 7, "ts": 1716454225752383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641393, "dur": 12, "args": { "External id": 270439, "cbid": 211, "correlation": 270439 } }, { "ph": "s", "id": 270439, "pid": 76337, "tid": -914061504, "ts": 1716454225641393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225752437, "dur": 5, "args": { "External id": 270451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270451, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270451, "pid": 5, "tid": 7, "ts": 1716454225752437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641415, "dur": 6, "args": { "External id": 270451, "cbid": 211, "correlation": 270451 } }, { "ph": "s", "id": 270451, "pid": 76337, "tid": -914061504, "ts": 1716454225641415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225752443, "dur": 55, "args": { "External id": 270454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270454, "pid": 5, "tid": 7, "ts": 1716454225752443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641433, "dur": 8, "args": { "External id": 270454, "cbid": 211, "correlation": 270454 } }, { "ph": "s", "id": 270454, "pid": 76337, "tid": -914061504, "ts": 1716454225641433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225641490, "dur": 0, "args": { "External id": 270465, "cbid": 317, "correlation": 270465 } }, { "ph": "f", "id": 270465, "pid": 76337, "tid": -914061504, "ts": 1716454225641490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225641491, "dur": 0, "args": { "External id": 270466, "cbid": 203, "correlation": 270466 } }, { "ph": "f", "id": 270466, "pid": 76337, "tid": -914061504, "ts": 1716454225641491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225641492, "dur": 0, "args": { "External id": 270467, "cbid": 205, "correlation": 270467 } }, { "ph": "f", "id": 270467, "pid": 76337, "tid": -914061504, "ts": 1716454225641492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641515, "dur": 1, "args": { "External id": 270471, "cbid": 251, "correlation": 270471 } }, { "ph": "f", "id": 270471, "pid": 76337, "tid": -914061504, "ts": 1716454225641515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641517, "dur": 0, "args": { "External id": 270472, "cbid": 251, "correlation": 270472 } }, { "ph": "f", "id": 270472, "pid": 76337, "tid": -914061504, "ts": 1716454225641517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641518, "dur": 0, "args": { "External id": 270473, "cbid": 251, "correlation": 270473 } }, { "ph": "f", "id": 270473, "pid": 76337, "tid": -914061504, "ts": 1716454225641518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641519, "dur": 0, "args": { "External id": 270474, "cbid": 251, "correlation": 270474 } }, { "ph": "f", "id": 270474, "pid": 76337, "tid": -914061504, "ts": 1716454225641519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641520, "dur": 0, "args": { "External id": 270475, "cbid": 251, "correlation": 270475 } }, { "ph": "f", "id": 270475, "pid": 76337, "tid": -914061504, "ts": 1716454225641520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641520, "dur": 0, "args": { "External id": 270476, "cbid": 251, "correlation": 270476 } }, { "ph": "f", "id": 270476, "pid": 76337, "tid": -914061504, "ts": 1716454225641520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641521, "dur": 0, "args": { "External id": 270477, "cbid": 251, "correlation": 270477 } }, { "ph": "f", "id": 270477, "pid": 76337, "tid": -914061504, "ts": 1716454225641521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641522, "dur": 0, "args": { "External id": 270478, "cbid": 251, "correlation": 270478 } }, { "ph": "f", "id": 270478, "pid": 76337, "tid": -914061504, "ts": 1716454225641522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641523, "dur": 0, "args": { "External id": 270479, "cbid": 251, "correlation": 270479 } }, { "ph": "f", "id": 270479, "pid": 76337, "tid": -914061504, "ts": 1716454225641523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225752500, "dur": 115, "args": { "External id": 270480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270480, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270480, "pid": 5, "tid": 7, "ts": 1716454225752500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641525, "dur": 13, "args": { "External id": 270480, "cbid": 211, "correlation": 270480 } }, { "ph": "s", "id": 270480, "pid": 76337, "tid": -914061504, "ts": 1716454225641525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225752616, "dur": 60, "args": { "External id": 270486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270486, "pid": 5, "tid": 7, "ts": 1716454225752616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641560, "dur": 9, "args": { "External id": 270486, "cbid": 211, "correlation": 270486 } }, { "ph": "s", "id": 270486, "pid": 76337, "tid": -914061504, "ts": 1716454225641560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225752678, "dur": 586, "args": { "External id": 270495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270495, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270495, "pid": 5, "tid": 7, "ts": 1716454225752678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641643, "dur": 13, "args": { "External id": 270495, "cbid": 211, "correlation": 270495 } }, { "ph": "s", "id": 270495, "pid": 76337, "tid": -914061504, "ts": 1716454225641643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225753265, "dur": 184, "args": { "External id": 270517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270517, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270517, "pid": 5, "tid": 7, "ts": 1716454225753265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641701, "dur": 10, "args": { "External id": 270517, "cbid": 211, "correlation": 270517 } }, { "ph": "s", "id": 270517, "pid": 76337, "tid": -914061504, "ts": 1716454225641701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641787, "dur": 1, "args": { "External id": 270528, "cbid": 251, "correlation": 270528 } }, { "ph": "f", "id": 270528, "pid": 76337, "tid": -914061504, "ts": 1716454225641787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225753450, "dur": 199, "args": { "External id": 270529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270529, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270529, "pid": 5, "tid": 7, "ts": 1716454225753450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641792, "dur": 13, "args": { "External id": 270529, "cbid": 211, "correlation": 270529 } }, { "ph": "s", "id": 270529, "pid": 76337, "tid": -914061504, "ts": 1716454225641792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641860, "dur": 1, "args": { "External id": 270540, "cbid": 251, "correlation": 270540 } }, { "ph": "f", "id": 270540, "pid": 76337, "tid": -914061504, "ts": 1716454225641860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225753651, "dur": 191, "args": { "External id": 270541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270541, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270541, "pid": 5, "tid": 7, "ts": 1716454225753651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641864, "dur": 11, "args": { "External id": 270541, "cbid": 211, "correlation": 270541 } }, { "ph": "s", "id": 270541, "pid": 76337, "tid": -914061504, "ts": 1716454225641864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225641927, "dur": 1, "args": { "External id": 270552, "cbid": 251, "correlation": 270552 } }, { "ph": "f", "id": 270552, "pid": 76337, "tid": -914061504, "ts": 1716454225641927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225753843, "dur": 193, "args": { "External id": 270553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270553, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270553, "pid": 5, "tid": 7, "ts": 1716454225753843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225641931, "dur": 11, "args": { "External id": 270553, "cbid": 211, "correlation": 270553 } }, { "ph": "s", "id": 270553, "pid": 76337, "tid": -914061504, "ts": 1716454225641931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225754038, "dur": 18993, "args": { "External id": 270574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270574, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270574, "pid": 5, "tid": 7, "ts": 1716454225754038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642019, "dur": 13, "args": { "External id": 270574, "cbid": 211, "correlation": 270574 } }, { "ph": "s", "id": 270574, "pid": 76337, "tid": -914061504, "ts": 1716454225642019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225642117, "dur": 1, "args": { "External id": 270592, "cbid": 251, "correlation": 270592 } }, { "ph": "f", "id": 270592, "pid": 76337, "tid": -914061504, "ts": 1716454225642117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225773032, "dur": 206, "args": { "External id": 270594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270594, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270594, "pid": 5, "tid": 7, "ts": 1716454225773032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642123, "dur": 13, "args": { "External id": 270594, "cbid": 211, "correlation": 270594 } }, { "ph": "s", "id": 270594, "pid": 76337, "tid": -914061504, "ts": 1716454225642123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225773240, "dur": 66, "args": { "External id": 270602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270602, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270602, "pid": 5, "tid": 7, "ts": 1716454225773240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642192, "dur": 13, "args": { "External id": 270602, "cbid": 211, "correlation": 270602 } }, { "ph": "s", "id": 270602, "pid": 76337, "tid": -914061504, "ts": 1716454225642192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225773307, "dur": 97, "args": { "External id": 270610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270610, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270610, "pid": 5, "tid": 7, "ts": 1716454225773307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642232, "dur": 9, "args": { "External id": 270610, "cbid": 211, "correlation": 270610 } }, { "ph": "s", "id": 270610, "pid": 76337, "tid": -914061504, "ts": 1716454225642232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225773405, "dur": 54, "args": { "External id": 270621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270621, "pid": 5, "tid": 7, "ts": 1716454225773405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642303, "dur": 83, "args": { "External id": 270621, "cbid": 211, "correlation": 270621 } }, { "ph": "s", "id": 270621, "pid": 76337, "tid": -914061504, "ts": 1716454225642303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225773461, "dur": 95, "args": { "External id": 270643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270643, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270643, "pid": 5, "tid": 7, "ts": 1716454225773461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225642406, "dur": 1969, "args": { "External id": 270643, "cbid": 211, "correlation": 270643 } }, { "ph": "s", "id": 270643, "pid": 76337, "tid": -914061504, "ts": 1716454225642406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644454, "dur": 1, "args": { "External id": 270654, "cbid": 251, "correlation": 270654 } }, { "ph": "f", "id": 270654, "pid": 76337, "tid": -914061504, "ts": 1716454225644454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225773557, "dur": 106, "args": { "External id": 270655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270655, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270655, "pid": 5, "tid": 7, "ts": 1716454225773557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644459, "dur": 67, "args": { "External id": 270655, "cbid": 211, "correlation": 270655 } }, { "ph": "s", "id": 270655, "pid": 76337, "tid": -914061504, "ts": 1716454225644459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644586, "dur": 1, "args": { "External id": 270666, "cbid": 251, "correlation": 270666 } }, { "ph": "f", "id": 270666, "pid": 76337, "tid": -914061504, "ts": 1716454225644586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644589, "dur": 0, "args": { "External id": 270667, "cbid": 251, "correlation": 270667 } }, { "ph": "f", "id": 270667, "pid": 76337, "tid": -914061504, "ts": 1716454225644589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225773664, "dur": 10, "args": { "External id": 270668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270668, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 270668, "pid": 5, "tid": 7, "ts": 1716454225773664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644591, "dur": 12, "args": { "External id": 270668, "cbid": 211, "correlation": 270668 } }, { "ph": "s", "id": 270668, "pid": 76337, "tid": -914061504, "ts": 1716454225644591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225773675, "dur": 5, "args": { "External id": 270670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270670, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270670, "pid": 5, "tid": 7, "ts": 1716454225773675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644605, "dur": 6, "args": { "External id": 270670, "cbid": 211, "correlation": 270670 } }, { "ph": "s", "id": 270670, "pid": 76337, "tid": -914061504, "ts": 1716454225644605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644667, "dur": 1, "args": { "External id": 270681, "cbid": 251, "correlation": 270681 } }, { "ph": "f", "id": 270681, "pid": 76337, "tid": -914061504, "ts": 1716454225644667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644670, "dur": 0, "args": { "External id": 270682, "cbid": 251, "correlation": 270682 } }, { "ph": "f", "id": 270682, "pid": 76337, "tid": -914061504, "ts": 1716454225644670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225773682, "dur": 6, "args": { "External id": 270683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270683, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 270683, "pid": 5, "tid": 7, "ts": 1716454225773682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644672, "dur": 13, "args": { "External id": 270683, "cbid": 211, "correlation": 270683 } }, { "ph": "s", "id": 270683, "pid": 76337, "tid": -914061504, "ts": 1716454225644672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225773689, "dur": 4, "args": { "External id": 270685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270685, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270685, "pid": 5, "tid": 7, "ts": 1716454225773689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644686, "dur": 5, "args": { "External id": 270685, "cbid": 211, "correlation": 270685 } }, { "ph": "s", "id": 270685, "pid": 76337, "tid": -914061504, "ts": 1716454225644686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225773694, "dur": 159, "args": { "External id": 270706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270706, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270706, "pid": 5, "tid": 7, "ts": 1716454225773694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644760, "dur": 16, "args": { "External id": 270706, "cbid": 211, "correlation": 270706 } }, { "ph": "s", "id": 270706, "pid": 76337, "tid": -914061504, "ts": 1716454225644760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225644861, "dur": 1, "args": { "External id": 270724, "cbid": 251, "correlation": 270724 } }, { "ph": "f", "id": 270724, "pid": 76337, "tid": -914061504, "ts": 1716454225644861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225773854, "dur": 108, "args": { "External id": 270726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270726, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270726, "pid": 5, "tid": 7, "ts": 1716454225773854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644867, "dur": 14, "args": { "External id": 270726, "cbid": 211, "correlation": 270726 } }, { "ph": "s", "id": 270726, "pid": 76337, "tid": -914061504, "ts": 1716454225644867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225773964, "dur": 35, "args": { "External id": 270734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270734, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270734, "pid": 5, "tid": 7, "ts": 1716454225773964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644937, "dur": 12, "args": { "External id": 270734, "cbid": 211, "correlation": 270734 } }, { "ph": "s", "id": 270734, "pid": 76337, "tid": -914061504, "ts": 1716454225644937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225774000, "dur": 67, "args": { "External id": 270742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270742, "pid": 5, "tid": 7, "ts": 1716454225774000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225644985, "dur": 10, "args": { "External id": 270742, "cbid": 211, "correlation": 270742 } }, { "ph": "s", "id": 270742, "pid": 76337, "tid": -914061504, "ts": 1716454225644985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225774068, "dur": 95, "args": { "External id": 270764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270764, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270764, "pid": 5, "tid": 7, "ts": 1716454225774068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645040, "dur": 10, "args": { "External id": 270764, "cbid": 211, "correlation": 270764 } }, { "ph": "s", "id": 270764, "pid": 76337, "tid": -914061504, "ts": 1716454225645040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645126, "dur": 1, "args": { "External id": 270780, "cbid": 251, "correlation": 270780 } }, { "ph": "f", "id": 270780, "pid": 76337, "tid": -914061504, "ts": 1716454225645126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225774164, "dur": 587, "args": { "External id": 270782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270782, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270782, "pid": 5, "tid": 7, "ts": 1716454225774164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645132, "dur": 14, "args": { "External id": 270782, "cbid": 211, "correlation": 270782 } }, { "ph": "s", "id": 270782, "pid": 76337, "tid": -914061504, "ts": 1716454225645132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225774752, "dur": 248, "args": { "External id": 270790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270790, "pid": 5, "tid": 7, "ts": 1716454225774752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645198, "dur": 12, "args": { "External id": 270790, "cbid": 211, "correlation": 270790 } }, { "ph": "s", "id": 270790, "pid": 76337, "tid": -914061504, "ts": 1716454225645198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225775001, "dur": 256, "args": { "External id": 270798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270798, "pid": 5, "tid": 7, "ts": 1716454225775001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645228, "dur": 9, "args": { "External id": 270798, "cbid": 211, "correlation": 270798 } }, { "ph": "s", "id": 270798, "pid": 76337, "tid": -914061504, "ts": 1716454225645228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645309, "dur": 1, "args": { "External id": 270814, "cbid": 251, "correlation": 270814 } }, { "ph": "f", "id": 270814, "pid": 76337, "tid": -914061504, "ts": 1716454225645309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645314, "dur": 0, "args": { "External id": 270816, "cbid": 251, "correlation": 270816 } }, { "ph": "f", "id": 270816, "pid": 76337, "tid": -914061504, "ts": 1716454225645314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225775259, "dur": 362, "args": { "External id": 270817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270817, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 270817, "pid": 5, "tid": 7, "ts": 1716454225775259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645317, "dur": 13, "args": { "External id": 270817, "cbid": 211, "correlation": 270817 } }, { "ph": "s", "id": 270817, "pid": 76337, "tid": -914061504, "ts": 1716454225645317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225775623, "dur": 50, "args": { "External id": 270825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270825, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270825, "pid": 5, "tid": 7, "ts": 1716454225775623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645359, "dur": 10, "args": { "External id": 270825, "cbid": 211, "correlation": 270825 } }, { "ph": "s", "id": 270825, "pid": 76337, "tid": -914061504, "ts": 1716454225645359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225775674, "dur": 161, "args": { "External id": 270836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270836, "pid": 5, "tid": 7, "ts": 1716454225775674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645426, "dur": 224, "args": { "External id": 270836, "cbid": 211, "correlation": 270836 } }, { "ph": "s", "id": 270836, "pid": 76337, "tid": -914061504, "ts": 1716454225645426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225645702, "dur": 0, "args": { "External id": 270848, "cbid": 317, "correlation": 270848 } }, { "ph": "f", "id": 270848, "pid": 76337, "tid": -914061504, "ts": 1716454225645702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225645703, "dur": 0, "args": { "External id": 270849, "cbid": 203, "correlation": 270849 } }, { "ph": "f", "id": 270849, "pid": 76337, "tid": -914061504, "ts": 1716454225645703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225645704, "dur": 0, "args": { "External id": 270850, "cbid": 205, "correlation": 270850 } }, { "ph": "f", "id": 270850, "pid": 76337, "tid": -914061504, "ts": 1716454225645704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645728, "dur": 1, "args": { "External id": 270854, "cbid": 251, "correlation": 270854 } }, { "ph": "f", "id": 270854, "pid": 76337, "tid": -914061504, "ts": 1716454225645728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645730, "dur": 0, "args": { "External id": 270855, "cbid": 251, "correlation": 270855 } }, { "ph": "f", "id": 270855, "pid": 76337, "tid": -914061504, "ts": 1716454225645730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645730, "dur": 0, "args": { "External id": 270856, "cbid": 251, "correlation": 270856 } }, { "ph": "f", "id": 270856, "pid": 76337, "tid": -914061504, "ts": 1716454225645730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645731, "dur": 0, "args": { "External id": 270857, "cbid": 251, "correlation": 270857 } }, { "ph": "f", "id": 270857, "pid": 76337, "tid": -914061504, "ts": 1716454225645731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645732, "dur": 0, "args": { "External id": 270858, "cbid": 251, "correlation": 270858 } }, { "ph": "f", "id": 270858, "pid": 76337, "tid": -914061504, "ts": 1716454225645732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645733, "dur": 0, "args": { "External id": 270859, "cbid": 251, "correlation": 270859 } }, { "ph": "f", "id": 270859, "pid": 76337, "tid": -914061504, "ts": 1716454225645733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645734, "dur": 0, "args": { "External id": 270860, "cbid": 251, "correlation": 270860 } }, { "ph": "f", "id": 270860, "pid": 76337, "tid": -914061504, "ts": 1716454225645734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645734, "dur": 0, "args": { "External id": 270861, "cbid": 251, "correlation": 270861 } }, { "ph": "f", "id": 270861, "pid": 76337, "tid": -914061504, "ts": 1716454225645734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225645736, "dur": 0, "args": { "External id": 270862, "cbid": 251, "correlation": 270862 } }, { "ph": "f", "id": 270862, "pid": 76337, "tid": -914061504, "ts": 1716454225645736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225775836, "dur": 117, "args": { "External id": 270863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270863, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 270863, "pid": 5, "tid": 7, "ts": 1716454225775836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645738, "dur": 41, "args": { "External id": 270863, "cbid": 211, "correlation": 270863 } }, { "ph": "s", "id": 270863, "pid": 76337, "tid": -914061504, "ts": 1716454225645738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225775955, "dur": 61, "args": { "External id": 270869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270869, "pid": 5, "tid": 7, "ts": 1716454225775955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645801, "dur": 106, "args": { "External id": 270869, "cbid": 211, "correlation": 270869 } }, { "ph": "s", "id": 270869, "pid": 76337, "tid": -914061504, "ts": 1716454225645801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225776017, "dur": 50, "args": { "External id": 270877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270877, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270877, "pid": 5, "tid": 7, "ts": 1716454225776017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225645930, "dur": 291, "args": { "External id": 270877, "cbid": 211, "correlation": 270877 } }, { "ph": "s", "id": 270877, "pid": 76337, "tid": -914061504, "ts": 1716454225645930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225776068, "dur": 100, "args": { "External id": 270886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270886, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270886, "pid": 5, "tid": 7, "ts": 1716454225776068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646252, "dur": 12, "args": { "External id": 270886, "cbid": 211, "correlation": 270886 } }, { "ph": "s", "id": 270886, "pid": 76337, "tid": -914061504, "ts": 1716454225646252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225776170, "dur": 93, "args": { "External id": 270906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270906, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 270906, "pid": 5, "tid": 7, "ts": 1716454225776170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646325, "dur": 11, "args": { "External id": 270906, "cbid": 211, "correlation": 270906 } }, { "ph": "s", "id": 270906, "pid": 76337, "tid": -914061504, "ts": 1716454225646325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225776264, "dur": 4, "args": { "External id": 270918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270918, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270918, "pid": 5, "tid": 7, "ts": 1716454225776264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646347, "dur": 10, "args": { "External id": 270918, "cbid": 211, "correlation": 270918 } }, { "ph": "s", "id": 270918, "pid": 76337, "tid": -914061504, "ts": 1716454225646347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225776270, "dur": 111, "args": { "External id": 270921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270921, "pid": 5, "tid": 7, "ts": 1716454225776270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646370, "dur": 111, "args": { "External id": 270921, "cbid": 211, "correlation": 270921 } }, { "ph": "s", "id": 270921, "pid": 76337, "tid": -914061504, "ts": 1716454225646370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225776382, "dur": 69, "args": { "External id": 270930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270930, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270930, "pid": 5, "tid": 7, "ts": 1716454225776382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646513, "dur": 10, "args": { "External id": 270930, "cbid": 211, "correlation": 270930 } }, { "ph": "s", "id": 270930, "pid": 76337, "tid": -914061504, "ts": 1716454225646513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225646565, "dur": 0, "args": { "External id": 270940, "cbid": 317, "correlation": 270940 } }, { "ph": "f", "id": 270940, "pid": 76337, "tid": -914061504, "ts": 1716454225646565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225646566, "dur": 0, "args": { "External id": 270941, "cbid": 203, "correlation": 270941 } }, { "ph": "f", "id": 270941, "pid": 76337, "tid": -914061504, "ts": 1716454225646566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225646566, "dur": 0, "args": { "External id": 270942, "cbid": 205, "correlation": 270942 } }, { "ph": "f", "id": 270942, "pid": 76337, "tid": -914061504, "ts": 1716454225646566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225776451, "dur": 76, "args": { "External id": 270946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270946, "pid": 5, "tid": 7, "ts": 1716454225776451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646581, "dur": 12, "args": { "External id": 270946, "cbid": 211, "correlation": 270946 } }, { "ph": "s", "id": 270946, "pid": 76337, "tid": -914061504, "ts": 1716454225646581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225776529, "dur": 25, "args": { "External id": 270948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270948, "pid": 5, "tid": 7, "ts": 1716454225776529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646595, "dur": 5, "args": { "External id": 270948, "cbid": 211, "correlation": 270948 } }, { "ph": "s", "id": 270948, "pid": 76337, "tid": -914061504, "ts": 1716454225646595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225776555, "dur": 4, "args": { "External id": 270950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270950, "pid": 5, "tid": 7, "ts": 1716454225776555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646606, "dur": 5, "args": { "External id": 270950, "cbid": 211, "correlation": 270950 } }, { "ph": "s", "id": 270950, "pid": 76337, "tid": -914061504, "ts": 1716454225646606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225646614, "dur": 0, "args": { "External id": 270951, "cbid": 51, "correlation": 270951 } }, { "ph": "s", "id": 270951, "pid": 76337, "tid": -914061504, "ts": 1716454225646614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225776560, "dur": 1398, "args": { "External id": 270952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270952, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 270952, "pid": 5, "tid": 7, "ts": 1716454225776560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646615, "dur": 5, "args": { "External id": 270952, "cbid": 211, "correlation": 270952 } }, { "ph": "s", "id": 270952, "pid": 76337, "tid": -914061504, "ts": 1716454225646615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225777959, "dur": 60, "args": { "External id": 270957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270957, "pid": 5, "tid": 7, "ts": 1716454225777959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646643, "dur": 9, "args": { "External id": 270957, "cbid": 211, "correlation": 270957 } }, { "ph": "s", "id": 270957, "pid": 76337, "tid": -914061504, "ts": 1716454225646643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225778021, "dur": 4, "args": { "External id": 270965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270965, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 270965, "pid": 5, "tid": 7, "ts": 1716454225778021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646686, "dur": 10, "args": { "External id": 270965, "cbid": 211, "correlation": 270965 } }, { "ph": "s", "id": 270965, "pid": 76337, "tid": -914061504, "ts": 1716454225646686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225646751, "dur": 1, "args": { "External id": 270981, "cbid": 251, "correlation": 270981 } }, { "ph": "f", "id": 270981, "pid": 76337, "tid": -914061504, "ts": 1716454225646751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225646757, "dur": 0, "args": { "External id": 270983, "cbid": 251, "correlation": 270983 } }, { "ph": "f", "id": 270983, "pid": 76337, "tid": -914061504, "ts": 1716454225646757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225778026, "dur": 11, "args": { "External id": 270984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270984, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 270984, "pid": 5, "tid": 7, "ts": 1716454225778026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646758, "dur": 11, "args": { "External id": 270984, "cbid": 211, "correlation": 270984 } }, { "ph": "s", "id": 270984, "pid": 76337, "tid": -914061504, "ts": 1716454225646758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225778039, "dur": 5, "args": { "External id": 270986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270986, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 270986, "pid": 5, "tid": 7, "ts": 1716454225778039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646771, "dur": 7, "args": { "External id": 270986, "cbid": 211, "correlation": 270986 } }, { "ph": "s", "id": 270986, "pid": 76337, "tid": -914061504, "ts": 1716454225646771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225778045, "dur": 56, "args": { "External id": 270996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 270996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 270996, "pid": 5, "tid": 7, "ts": 1716454225778045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225646830, "dur": 565, "args": { "External id": 270996, "cbid": 211, "correlation": 270996 } }, { "ph": "s", "id": 270996, "pid": 76337, "tid": -914061504, "ts": 1716454225646830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225778102, "dur": 54, "args": { "External id": 271016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271016, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 271016, "pid": 5, "tid": 7, "ts": 1716454225778102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647449, "dur": 12, "args": { "External id": 271016, "cbid": 211, "correlation": 271016 } }, { "ph": "s", "id": 271016, "pid": 76337, "tid": -914061504, "ts": 1716454225647449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225778158, "dur": 4, "args": { "External id": 271028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271028, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 271028, "pid": 5, "tid": 7, "ts": 1716454225778158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647471, "dur": 6, "args": { "External id": 271028, "cbid": 211, "correlation": 271028 } }, { "ph": "s", "id": 271028, "pid": 76337, "tid": -914061504, "ts": 1716454225647471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225778163, "dur": 56, "args": { "External id": 271031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271031, "pid": 5, "tid": 7, "ts": 1716454225778163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647490, "dur": 6, "args": { "External id": 271031, "cbid": 211, "correlation": 271031 } }, { "ph": "s", "id": 271031, "pid": 76337, "tid": -914061504, "ts": 1716454225647490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225778220, "dur": 36, "args": { "External id": 271040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271040, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271040, "pid": 5, "tid": 7, "ts": 1716454225778220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647530, "dur": 10, "args": { "External id": 271040, "cbid": 211, "correlation": 271040 } }, { "ph": "s", "id": 271040, "pid": 76337, "tid": -914061504, "ts": 1716454225647530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225647593, "dur": 0, "args": { "External id": 271050, "cbid": 317, "correlation": 271050 } }, { "ph": "f", "id": 271050, "pid": 76337, "tid": -914061504, "ts": 1716454225647593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225647594, "dur": 0, "args": { "External id": 271051, "cbid": 203, "correlation": 271051 } }, { "ph": "f", "id": 271051, "pid": 76337, "tid": -914061504, "ts": 1716454225647594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225647595, "dur": 0, "args": { "External id": 271052, "cbid": 205, "correlation": 271052 } }, { "ph": "f", "id": 271052, "pid": 76337, "tid": -914061504, "ts": 1716454225647595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225778258, "dur": 40, "args": { "External id": 271056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271056, "pid": 5, "tid": 7, "ts": 1716454225778258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647610, "dur": 12, "args": { "External id": 271056, "cbid": 211, "correlation": 271056 } }, { "ph": "s", "id": 271056, "pid": 76337, "tid": -914061504, "ts": 1716454225647610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225778299, "dur": 14, "args": { "External id": 271058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271058, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271058, "pid": 5, "tid": 7, "ts": 1716454225778299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647624, "dur": 5, "args": { "External id": 271058, "cbid": 211, "correlation": 271058 } }, { "ph": "s", "id": 271058, "pid": 76337, "tid": -914061504, "ts": 1716454225647624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225778315, "dur": 3, "args": { "External id": 271060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 271060, "pid": 5, "tid": 7, "ts": 1716454225778315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647633, "dur": 6, "args": { "External id": 271060, "cbid": 211, "correlation": 271060 } }, { "ph": "s", "id": 271060, "pid": 76337, "tid": -914061504, "ts": 1716454225647633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225647641, "dur": 0, "args": { "External id": 271061, "cbid": 51, "correlation": 271061 } }, { "ph": "s", "id": 271061, "pid": 76337, "tid": -914061504, "ts": 1716454225647641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225778320, "dur": 715, "args": { "External id": 271062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271062, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271062, "pid": 5, "tid": 7, "ts": 1716454225778320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647642, "dur": 5, "args": { "External id": 271062, "cbid": 211, "correlation": 271062 } }, { "ph": "s", "id": 271062, "pid": 76337, "tid": -914061504, "ts": 1716454225647642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225779036, "dur": 60, "args": { "External id": 271067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271067, "pid": 5, "tid": 7, "ts": 1716454225779036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647670, "dur": 9, "args": { "External id": 271067, "cbid": 211, "correlation": 271067 } }, { "ph": "s", "id": 271067, "pid": 76337, "tid": -914061504, "ts": 1716454225647670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225647727, "dur": 0, "args": { "External id": 271077, "cbid": 317, "correlation": 271077 } }, { "ph": "f", "id": 271077, "pid": 76337, "tid": -914061504, "ts": 1716454225647727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225647728, "dur": 0, "args": { "External id": 271078, "cbid": 203, "correlation": 271078 } }, { "ph": "f", "id": 271078, "pid": 76337, "tid": -914061504, "ts": 1716454225647728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225647729, "dur": 0, "args": { "External id": 271079, "cbid": 205, "correlation": 271079 } }, { "ph": "f", "id": 271079, "pid": 76337, "tid": -914061504, "ts": 1716454225647729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225779098, "dur": 75, "args": { "External id": 271083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271083, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271083, "pid": 5, "tid": 7, "ts": 1716454225779098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647740, "dur": 11, "args": { "External id": 271083, "cbid": 211, "correlation": 271083 } }, { "ph": "s", "id": 271083, "pid": 76337, "tid": -914061504, "ts": 1716454225647740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225779174, "dur": 212, "args": { "External id": 271085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271085, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271085, "pid": 5, "tid": 7, "ts": 1716454225779174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647758, "dur": 6, "args": { "External id": 271085, "cbid": 211, "correlation": 271085 } }, { "ph": "s", "id": 271085, "pid": 76337, "tid": -914061504, "ts": 1716454225647758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225779387, "dur": 38, "args": { "External id": 271087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271087, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271087, "pid": 5, "tid": 7, "ts": 1716454225779387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647769, "dur": 5, "args": { "External id": 271087, "cbid": 211, "correlation": 271087 } }, { "ph": "s", "id": 271087, "pid": 76337, "tid": -914061504, "ts": 1716454225647769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225779427, "dur": 60, "args": { "External id": 271093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271093, "pid": 5, "tid": 7, "ts": 1716454225779427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225647794, "dur": 536, "args": { "External id": 271093, "cbid": 211, "correlation": 271093 } }, { "ph": "s", "id": 271093, "pid": 76337, "tid": -914061504, "ts": 1716454225647794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225779488, "dur": 50, "args": { "External id": 271101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271101, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271101, "pid": 5, "tid": 7, "ts": 1716454225779488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648352, "dur": 9, "args": { "External id": 271101, "cbid": 211, "correlation": 271101 } }, { "ph": "s", "id": 271101, "pid": 76337, "tid": -914061504, "ts": 1716454225648352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225779540, "dur": 36, "args": { "External id": 271109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271109, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271109, "pid": 5, "tid": 7, "ts": 1716454225779540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648384, "dur": 8, "args": { "External id": 271109, "cbid": 211, "correlation": 271109 } }, { "ph": "s", "id": 271109, "pid": 76337, "tid": -914061504, "ts": 1716454225648384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225779577, "dur": 53, "args": { "External id": 271129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271129, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 271129, "pid": 5, "tid": 7, "ts": 1716454225779577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648465, "dur": 13, "args": { "External id": 271129, "cbid": 211, "correlation": 271129 } }, { "ph": "s", "id": 271129, "pid": 76337, "tid": -914061504, "ts": 1716454225648465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225779632, "dur": 4, "args": { "External id": 271141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271141, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 271141, "pid": 5, "tid": 7, "ts": 1716454225779632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648488, "dur": 7, "args": { "External id": 271141, "cbid": 211, "correlation": 271141 } }, { "ph": "s", "id": 271141, "pid": 76337, "tid": -914061504, "ts": 1716454225648488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225779637, "dur": 57, "args": { "External id": 271144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271144, "pid": 5, "tid": 7, "ts": 1716454225779637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648506, "dur": 6, "args": { "External id": 271144, "cbid": 211, "correlation": 271144 } }, { "ph": "s", "id": 271144, "pid": 76337, "tid": -914061504, "ts": 1716454225648506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225648562, "dur": 0, "args": { "External id": 271155, "cbid": 317, "correlation": 271155 } }, { "ph": "f", "id": 271155, "pid": 76337, "tid": -914061504, "ts": 1716454225648562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225648563, "dur": 0, "args": { "External id": 271156, "cbid": 203, "correlation": 271156 } }, { "ph": "f", "id": 271156, "pid": 76337, "tid": -914061504, "ts": 1716454225648563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225648564, "dur": 0, "args": { "External id": 271157, "cbid": 205, "correlation": 271157 } }, { "ph": "f", "id": 271157, "pid": 76337, "tid": -914061504, "ts": 1716454225648564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648586, "dur": 1, "args": { "External id": 271161, "cbid": 251, "correlation": 271161 } }, { "ph": "f", "id": 271161, "pid": 76337, "tid": -914061504, "ts": 1716454225648586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648588, "dur": 0, "args": { "External id": 271162, "cbid": 251, "correlation": 271162 } }, { "ph": "f", "id": 271162, "pid": 76337, "tid": -914061504, "ts": 1716454225648588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648589, "dur": 0, "args": { "External id": 271163, "cbid": 251, "correlation": 271163 } }, { "ph": "f", "id": 271163, "pid": 76337, "tid": -914061504, "ts": 1716454225648589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648589, "dur": 0, "args": { "External id": 271164, "cbid": 251, "correlation": 271164 } }, { "ph": "f", "id": 271164, "pid": 76337, "tid": -914061504, "ts": 1716454225648589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648590, "dur": 0, "args": { "External id": 271165, "cbid": 251, "correlation": 271165 } }, { "ph": "f", "id": 271165, "pid": 76337, "tid": -914061504, "ts": 1716454225648590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648591, "dur": 0, "args": { "External id": 271166, "cbid": 251, "correlation": 271166 } }, { "ph": "f", "id": 271166, "pid": 76337, "tid": -914061504, "ts": 1716454225648591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648592, "dur": 0, "args": { "External id": 271167, "cbid": 251, "correlation": 271167 } }, { "ph": "f", "id": 271167, "pid": 76337, "tid": -914061504, "ts": 1716454225648592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648592, "dur": 0, "args": { "External id": 271168, "cbid": 251, "correlation": 271168 } }, { "ph": "f", "id": 271168, "pid": 76337, "tid": -914061504, "ts": 1716454225648592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648594, "dur": 0, "args": { "External id": 271169, "cbid": 251, "correlation": 271169 } }, { "ph": "f", "id": 271169, "pid": 76337, "tid": -914061504, "ts": 1716454225648594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225779695, "dur": 117, "args": { "External id": 271170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271170, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 271170, "pid": 5, "tid": 7, "ts": 1716454225779695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648596, "dur": 12, "args": { "External id": 271170, "cbid": 211, "correlation": 271170 } }, { "ph": "s", "id": 271170, "pid": 76337, "tid": -914061504, "ts": 1716454225648596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225779814, "dur": 60, "args": { "External id": 271176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271176, "pid": 5, "tid": 7, "ts": 1716454225779814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648632, "dur": 9, "args": { "External id": 271176, "cbid": 211, "correlation": 271176 } }, { "ph": "s", "id": 271176, "pid": 76337, "tid": -914061504, "ts": 1716454225648632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225779875, "dur": 584, "args": { "External id": 271185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271185, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271185, "pid": 5, "tid": 7, "ts": 1716454225779875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648713, "dur": 14, "args": { "External id": 271185, "cbid": 211, "correlation": 271185 } }, { "ph": "s", "id": 271185, "pid": 76337, "tid": -914061504, "ts": 1716454225648713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225780460, "dur": 187, "args": { "External id": 271207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271207, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271207, "pid": 5, "tid": 7, "ts": 1716454225780460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648770, "dur": 11, "args": { "External id": 271207, "cbid": 211, "correlation": 271207 } }, { "ph": "s", "id": 271207, "pid": 76337, "tid": -914061504, "ts": 1716454225648770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648857, "dur": 1, "args": { "External id": 271218, "cbid": 251, "correlation": 271218 } }, { "ph": "f", "id": 271218, "pid": 76337, "tid": -914061504, "ts": 1716454225648857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225780648, "dur": 200, "args": { "External id": 271219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271219, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271219, "pid": 5, "tid": 7, "ts": 1716454225780648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648862, "dur": 12, "args": { "External id": 271219, "cbid": 211, "correlation": 271219 } }, { "ph": "s", "id": 271219, "pid": 76337, "tid": -914061504, "ts": 1716454225648862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225648929, "dur": 1, "args": { "External id": 271230, "cbid": 251, "correlation": 271230 } }, { "ph": "f", "id": 271230, "pid": 76337, "tid": -914061504, "ts": 1716454225648929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225780849, "dur": 198, "args": { "External id": 271231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271231, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271231, "pid": 5, "tid": 7, "ts": 1716454225780849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225648933, "dur": 12, "args": { "External id": 271231, "cbid": 211, "correlation": 271231 } }, { "ph": "s", "id": 271231, "pid": 76337, "tid": -914061504, "ts": 1716454225648933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225649005, "dur": 1, "args": { "External id": 271242, "cbid": 251, "correlation": 271242 } }, { "ph": "f", "id": 271242, "pid": 76337, "tid": -914061504, "ts": 1716454225649005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225781049, "dur": 192, "args": { "External id": 271243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271243, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271243, "pid": 5, "tid": 7, "ts": 1716454225781049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649009, "dur": 12, "args": { "External id": 271243, "cbid": 211, "correlation": 271243 } }, { "ph": "s", "id": 271243, "pid": 76337, "tid": -914061504, "ts": 1716454225649009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225781243, "dur": 19185, "args": { "External id": 271264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271264, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 271264, "pid": 5, "tid": 7, "ts": 1716454225781243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649091, "dur": 12, "args": { "External id": 271264, "cbid": 211, "correlation": 271264 } }, { "ph": "s", "id": 271264, "pid": 76337, "tid": -914061504, "ts": 1716454225649091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225649187, "dur": 1, "args": { "External id": 271282, "cbid": 251, "correlation": 271282 } }, { "ph": "f", "id": 271282, "pid": 76337, "tid": -914061504, "ts": 1716454225649187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225800429, "dur": 207, "args": { "External id": 271284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271284, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271284, "pid": 5, "tid": 7, "ts": 1716454225800429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649193, "dur": 13, "args": { "External id": 271284, "cbid": 211, "correlation": 271284 } }, { "ph": "s", "id": 271284, "pid": 76337, "tid": -914061504, "ts": 1716454225649193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225800638, "dur": 67, "args": { "External id": 271292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271292, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271292, "pid": 5, "tid": 7, "ts": 1716454225800638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649261, "dur": 12, "args": { "External id": 271292, "cbid": 211, "correlation": 271292 } }, { "ph": "s", "id": 271292, "pid": 76337, "tid": -914061504, "ts": 1716454225649261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225800706, "dur": 96, "args": { "External id": 271300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271300, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271300, "pid": 5, "tid": 7, "ts": 1716454225800706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649299, "dur": 116, "args": { "External id": 271300, "cbid": 211, "correlation": 271300 } }, { "ph": "s", "id": 271300, "pid": 76337, "tid": -914061504, "ts": 1716454225649299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225800803, "dur": 55, "args": { "External id": 271311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271311, "pid": 5, "tid": 7, "ts": 1716454225800803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225649479, "dur": 1928, "args": { "External id": 271311, "cbid": 211, "correlation": 271311 } }, { "ph": "s", "id": 271311, "pid": 76337, "tid": -914061504, "ts": 1716454225649479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225800860, "dur": 95, "args": { "External id": 271333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271333, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271333, "pid": 5, "tid": 7, "ts": 1716454225800860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651431, "dur": 123, "args": { "External id": 271333, "cbid": 211, "correlation": 271333 } }, { "ph": "s", "id": 271333, "pid": 76337, "tid": -914061504, "ts": 1716454225651431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225651652, "dur": 1, "args": { "External id": 271344, "cbid": 251, "correlation": 271344 } }, { "ph": "f", "id": 271344, "pid": 76337, "tid": -914061504, "ts": 1716454225651652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225800956, "dur": 105, "args": { "External id": 271345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271345, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271345, "pid": 5, "tid": 7, "ts": 1716454225800956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651658, "dur": 13, "args": { "External id": 271345, "cbid": 211, "correlation": 271345 } }, { "ph": "s", "id": 271345, "pid": 76337, "tid": -914061504, "ts": 1716454225651658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225651737, "dur": 1, "args": { "External id": 271356, "cbid": 251, "correlation": 271356 } }, { "ph": "f", "id": 271356, "pid": 76337, "tid": -914061504, "ts": 1716454225651737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225651741, "dur": 0, "args": { "External id": 271357, "cbid": 251, "correlation": 271357 } }, { "ph": "f", "id": 271357, "pid": 76337, "tid": -914061504, "ts": 1716454225651741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225801063, "dur": 11, "args": { "External id": 271358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271358, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271358, "pid": 5, "tid": 7, "ts": 1716454225801063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651743, "dur": 13, "args": { "External id": 271358, "cbid": 211, "correlation": 271358 } }, { "ph": "s", "id": 271358, "pid": 76337, "tid": -914061504, "ts": 1716454225651743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225801075, "dur": 5, "args": { "External id": 271360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271360, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 271360, "pid": 5, "tid": 7, "ts": 1716454225801075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651759, "dur": 6, "args": { "External id": 271360, "cbid": 211, "correlation": 271360 } }, { "ph": "s", "id": 271360, "pid": 76337, "tid": -914061504, "ts": 1716454225651759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225651821, "dur": 1, "args": { "External id": 271371, "cbid": 251, "correlation": 271371 } }, { "ph": "f", "id": 271371, "pid": 76337, "tid": -914061504, "ts": 1716454225651821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225651824, "dur": 0, "args": { "External id": 271372, "cbid": 251, "correlation": 271372 } }, { "ph": "f", "id": 271372, "pid": 76337, "tid": -914061504, "ts": 1716454225651824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225801082, "dur": 6, "args": { "External id": 271373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271373, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271373, "pid": 5, "tid": 7, "ts": 1716454225801082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651826, "dur": 13, "args": { "External id": 271373, "cbid": 211, "correlation": 271373 } }, { "ph": "s", "id": 271373, "pid": 76337, "tid": -914061504, "ts": 1716454225651826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225801089, "dur": 4, "args": { "External id": 271375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271375, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 271375, "pid": 5, "tid": 7, "ts": 1716454225801089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651840, "dur": 6, "args": { "External id": 271375, "cbid": 211, "correlation": 271375 } }, { "ph": "s", "id": 271375, "pid": 76337, "tid": -914061504, "ts": 1716454225651840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225801094, "dur": 160, "args": { "External id": 271396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271396, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 271396, "pid": 5, "tid": 7, "ts": 1716454225801094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225651925, "dur": 14, "args": { "External id": 271396, "cbid": 211, "correlation": 271396 } }, { "ph": "s", "id": 271396, "pid": 76337, "tid": -914061504, "ts": 1716454225651925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652041, "dur": 1, "args": { "External id": 271414, "cbid": 251, "correlation": 271414 } }, { "ph": "f", "id": 271414, "pid": 76337, "tid": -914061504, "ts": 1716454225652041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225801256, "dur": 110, "args": { "External id": 271416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271416, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 271416, "pid": 5, "tid": 7, "ts": 1716454225801256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652047, "dur": 14, "args": { "External id": 271416, "cbid": 211, "correlation": 271416 } }, { "ph": "s", "id": 271416, "pid": 76337, "tid": -914061504, "ts": 1716454225652047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225801367, "dur": 35, "args": { "External id": 271424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271424, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271424, "pid": 5, "tid": 7, "ts": 1716454225801367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652125, "dur": 12, "args": { "External id": 271424, "cbid": 211, "correlation": 271424 } }, { "ph": "s", "id": 271424, "pid": 76337, "tid": -914061504, "ts": 1716454225652125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225801404, "dur": 67, "args": { "External id": 271432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271432, "pid": 5, "tid": 7, "ts": 1716454225801404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652169, "dur": 10, "args": { "External id": 271432, "cbid": 211, "correlation": 271432 } }, { "ph": "s", "id": 271432, "pid": 76337, "tid": -914061504, "ts": 1716454225652169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225801473, "dur": 95, "args": { "External id": 271454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271454, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271454, "pid": 5, "tid": 7, "ts": 1716454225801473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652222, "dur": 11, "args": { "External id": 271454, "cbid": 211, "correlation": 271454 } }, { "ph": "s", "id": 271454, "pid": 76337, "tid": -914061504, "ts": 1716454225652222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652312, "dur": 1, "args": { "External id": 271470, "cbid": 251, "correlation": 271470 } }, { "ph": "f", "id": 271470, "pid": 76337, "tid": -914061504, "ts": 1716454225652312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225801569, "dur": 590, "args": { "External id": 271472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271472, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 271472, "pid": 5, "tid": 7, "ts": 1716454225801569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652318, "dur": 13, "args": { "External id": 271472, "cbid": 211, "correlation": 271472 } }, { "ph": "s", "id": 271472, "pid": 76337, "tid": -914061504, "ts": 1716454225652318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225802160, "dur": 250, "args": { "External id": 271480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271480, "pid": 5, "tid": 7, "ts": 1716454225802160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652394, "dur": 13, "args": { "External id": 271480, "cbid": 211, "correlation": 271480 } }, { "ph": "s", "id": 271480, "pid": 76337, "tid": -914061504, "ts": 1716454225652394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225802412, "dur": 255, "args": { "External id": 271488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271488, "pid": 5, "tid": 7, "ts": 1716454225802412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652429, "dur": 9, "args": { "External id": 271488, "cbid": 211, "correlation": 271488 } }, { "ph": "s", "id": 271488, "pid": 76337, "tid": -914061504, "ts": 1716454225652429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652511, "dur": 2, "args": { "External id": 271504, "cbid": 251, "correlation": 271504 } }, { "ph": "f", "id": 271504, "pid": 76337, "tid": -914061504, "ts": 1716454225652511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652517, "dur": 0, "args": { "External id": 271506, "cbid": 251, "correlation": 271506 } }, { "ph": "f", "id": 271506, "pid": 76337, "tid": -914061504, "ts": 1716454225652517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225802668, "dur": 362, "args": { "External id": 271507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271507, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271507, "pid": 5, "tid": 7, "ts": 1716454225802668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652520, "dur": 14, "args": { "External id": 271507, "cbid": 211, "correlation": 271507 } }, { "ph": "s", "id": 271507, "pid": 76337, "tid": -914061504, "ts": 1716454225652520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803031, "dur": 51, "args": { "External id": 271515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271515, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271515, "pid": 5, "tid": 7, "ts": 1716454225803031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652563, "dur": 112, "args": { "External id": 271515, "cbid": 211, "correlation": 271515 } }, { "ph": "s", "id": 271515, "pid": 76337, "tid": -914061504, "ts": 1716454225652563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225803084, "dur": 162, "args": { "External id": 271526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271526, "pid": 5, "tid": 7, "ts": 1716454225803084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652741, "dur": 63, "args": { "External id": 271526, "cbid": 211, "correlation": 271526 } }, { "ph": "s", "id": 271526, "pid": 76337, "tid": -914061504, "ts": 1716454225652741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225652873, "dur": 0, "args": { "External id": 271538, "cbid": 317, "correlation": 271538 } }, { "ph": "f", "id": 271538, "pid": 76337, "tid": -914061504, "ts": 1716454225652873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225652874, "dur": 1, "args": { "External id": 271539, "cbid": 203, "correlation": 271539 } }, { "ph": "f", "id": 271539, "pid": 76337, "tid": -914061504, "ts": 1716454225652874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225652876, "dur": 0, "args": { "External id": 271540, "cbid": 205, "correlation": 271540 } }, { "ph": "f", "id": 271540, "pid": 76337, "tid": -914061504, "ts": 1716454225652876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652907, "dur": 1, "args": { "External id": 271544, "cbid": 251, "correlation": 271544 } }, { "ph": "f", "id": 271544, "pid": 76337, "tid": -914061504, "ts": 1716454225652907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652910, "dur": 0, "args": { "External id": 271545, "cbid": 251, "correlation": 271545 } }, { "ph": "f", "id": 271545, "pid": 76337, "tid": -914061504, "ts": 1716454225652910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652910, "dur": 0, "args": { "External id": 271546, "cbid": 251, "correlation": 271546 } }, { "ph": "f", "id": 271546, "pid": 76337, "tid": -914061504, "ts": 1716454225652910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652912, "dur": 0, "args": { "External id": 271547, "cbid": 251, "correlation": 271547 } }, { "ph": "f", "id": 271547, "pid": 76337, "tid": -914061504, "ts": 1716454225652912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652913, "dur": 0, "args": { "External id": 271548, "cbid": 251, "correlation": 271548 } }, { "ph": "f", "id": 271548, "pid": 76337, "tid": -914061504, "ts": 1716454225652913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652914, "dur": 0, "args": { "External id": 271549, "cbid": 251, "correlation": 271549 } }, { "ph": "f", "id": 271549, "pid": 76337, "tid": -914061504, "ts": 1716454225652914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652915, "dur": 0, "args": { "External id": 271550, "cbid": 251, "correlation": 271550 } }, { "ph": "f", "id": 271550, "pid": 76337, "tid": -914061504, "ts": 1716454225652915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652916, "dur": 0, "args": { "External id": 271551, "cbid": 251, "correlation": 271551 } }, { "ph": "f", "id": 271551, "pid": 76337, "tid": -914061504, "ts": 1716454225652916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225652918, "dur": 0, "args": { "External id": 271552, "cbid": 251, "correlation": 271552 } }, { "ph": "f", "id": 271552, "pid": 76337, "tid": -914061504, "ts": 1716454225652918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225803246, "dur": 117, "args": { "External id": 271553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271553, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 271553, "pid": 5, "tid": 7, "ts": 1716454225803246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652920, "dur": 13, "args": { "External id": 271553, "cbid": 211, "correlation": 271553 } }, { "ph": "s", "id": 271553, "pid": 76337, "tid": -914061504, "ts": 1716454225652920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225803365, "dur": 61, "args": { "External id": 271559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271559, "pid": 5, "tid": 7, "ts": 1716454225803365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225652959, "dur": 285, "args": { "External id": 271559, "cbid": 211, "correlation": 271559 } }, { "ph": "s", "id": 271559, "pid": 76337, "tid": -914061504, "ts": 1716454225652959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803427, "dur": 50, "args": { "External id": 271567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271567, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271567, "pid": 5, "tid": 7, "ts": 1716454225803427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653268, "dur": 9, "args": { "External id": 271567, "cbid": 211, "correlation": 271567 } }, { "ph": "s", "id": 271567, "pid": 76337, "tid": -914061504, "ts": 1716454225653268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225803479, "dur": 54, "args": { "External id": 271587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271587, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 271587, "pid": 5, "tid": 7, "ts": 1716454225803479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653363, "dur": 12, "args": { "External id": 271587, "cbid": 211, "correlation": 271587 } }, { "ph": "s", "id": 271587, "pid": 76337, "tid": -914061504, "ts": 1716454225653363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225803534, "dur": 4, "args": { "External id": 271599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271599, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 271599, "pid": 5, "tid": 7, "ts": 1716454225803534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653387, "dur": 8, "args": { "External id": 271599, "cbid": 211, "correlation": 271599 } }, { "ph": "s", "id": 271599, "pid": 76337, "tid": -914061504, "ts": 1716454225653387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225803539, "dur": 57, "args": { "External id": 271602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271602, "pid": 5, "tid": 7, "ts": 1716454225803539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653408, "dur": 97, "args": { "External id": 271602, "cbid": 211, "correlation": 271602 } }, { "ph": "s", "id": 271602, "pid": 76337, "tid": -914061504, "ts": 1716454225653408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803598, "dur": 38, "args": { "External id": 271611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271611, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271611, "pid": 5, "tid": 7, "ts": 1716454225803598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653553, "dur": 11, "args": { "External id": 271611, "cbid": 211, "correlation": 271611 } }, { "ph": "s", "id": 271611, "pid": 76337, "tid": -914061504, "ts": 1716454225653553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225653609, "dur": 0, "args": { "External id": 271621, "cbid": 317, "correlation": 271621 } }, { "ph": "f", "id": 271621, "pid": 76337, "tid": -914061504, "ts": 1716454225653609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225653610, "dur": 0, "args": { "External id": 271622, "cbid": 203, "correlation": 271622 } }, { "ph": "f", "id": 271622, "pid": 76337, "tid": -914061504, "ts": 1716454225653610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225653611, "dur": 0, "args": { "External id": 271623, "cbid": 205, "correlation": 271623 } }, { "ph": "f", "id": 271623, "pid": 76337, "tid": -914061504, "ts": 1716454225653611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225803637, "dur": 40, "args": { "External id": 271627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271627, "pid": 5, "tid": 7, "ts": 1716454225803637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653630, "dur": 13, "args": { "External id": 271627, "cbid": 211, "correlation": 271627 } }, { "ph": "s", "id": 271627, "pid": 76337, "tid": -914061504, "ts": 1716454225653630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225803678, "dur": 3, "args": { "External id": 271629, "device": 5, "context": 1, "stream": 7, "correlation": 271629, "bytes": 46080, "memory bandwidth (GB/s)": 12.100840336134453 } }, { "ph": "f", "id": 271629, "pid": 5, "tid": 7, "ts": 1716454225803678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225653646, "dur": 22, "args": { "External id": 271629, "cbid": 51, "correlation": 271629 } }, { "ph": "s", "id": 271629, "pid": 76337, "tid": -914061504, "ts": 1716454225653646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225653674, "dur": 2, "args": { "External id": 271631, "cbid": 200, "correlation": 271631 } }, { "ph": "f", "id": 271631, "pid": 76337, "tid": -914061504, "ts": 1716454225653674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225653677, "dur": 0, "args": { "External id": 271632, "cbid": 200, "correlation": 271632 } }, { "ph": "f", "id": 271632, "pid": 76337, "tid": -914061504, "ts": 1716454225653677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225653678, "dur": 0, "args": { "External id": 271633, "cbid": 200, "correlation": 271633 } }, { "ph": "f", "id": 271633, "pid": 76337, "tid": -914061504, "ts": 1716454225653678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225653678, "dur": 0, "args": { "External id": 271634, "cbid": 200, "correlation": 271634 } }, { "ph": "f", "id": 271634, "pid": 76337, "tid": -914061504, "ts": 1716454225653678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454225653679, "dur": 3, "args": { "External id": 271635, "cbid": 15, "correlation": 271635 } }, { "ph": "f", "id": 271635, "pid": 76337, "tid": -914061504, "ts": 1716454225653679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225653684, "dur": 1, "args": { "External id": 271636, "cbid": 251, "correlation": 271636 } }, { "ph": "f", "id": 271636, "pid": 76337, "tid": -914061504, "ts": 1716454225653684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454225803683, "dur": 23, "args": { "External id": 271637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271637, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271637, "pid": 5, "tid": 7, "ts": 1716454225803683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653687, "dur": 9, "args": { "External id": 271637, "cbid": 211, "correlation": 271637 } }, { "ph": "s", "id": 271637, "pid": 76337, "tid": -914061504, "ts": 1716454225653687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225803707, "dur": 4, "args": { "External id": 271639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 271639, "pid": 5, "tid": 7, "ts": 1716454225803707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653705, "dur": 7, "args": { "External id": 271639, "cbid": 211, "correlation": 271639 } }, { "ph": "s", "id": 271639, "pid": 76337, "tid": -914061504, "ts": 1716454225653705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225653718, "dur": 0, "args": { "External id": 271640, "cbid": 51, "correlation": 271640 } }, { "ph": "s", "id": 271640, "pid": 76337, "tid": -914061504, "ts": 1716454225653718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225803712, "dur": 193, "args": { "External id": 271641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271641, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271641, "pid": 5, "tid": 7, "ts": 1716454225803712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653719, "dur": 180, "args": { "External id": 271641, "cbid": 211, "correlation": 271641 } }, { "ph": "s", "id": 271641, "pid": 76337, "tid": -914061504, "ts": 1716454225653719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225803907, "dur": 6, "args": { "External id": 271642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271642, "pid": 5, "tid": 7, "ts": 1716454225803907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653905, "dur": 7, "args": { "External id": 271642, "cbid": 211, "correlation": 271642 } }, { "ph": "s", "id": 271642, "pid": 76337, "tid": -914061504, "ts": 1716454225653905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225803915, "dur": 5, "args": { "External id": 271648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 271648, "pid": 5, "tid": 7, "ts": 1716454225803915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225653937, "dur": 8, "args": { "External id": 271648, "cbid": 211, "correlation": 271648 } }, { "ph": "s", "id": 271648, "pid": 76337, "tid": -914061504, "ts": 1716454225653937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803921, "dur": 3, "args": { "External id": 271656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271656, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271656, "pid": 5, "tid": 7, "ts": 1716454225803921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655648, "dur": 16, "args": { "External id": 271656, "cbid": 211, "correlation": 271656 } }, { "ph": "s", "id": 271656, "pid": 76337, "tid": -914061504, "ts": 1716454225655648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803926, "dur": 3, "args": { "External id": 271664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271664, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271664, "pid": 5, "tid": 7, "ts": 1716454225803926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655692, "dur": 10, "args": { "External id": 271664, "cbid": 211, "correlation": 271664 } }, { "ph": "s", "id": 271664, "pid": 76337, "tid": -914061504, "ts": 1716454225655692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803930, "dur": 3, "args": { "External id": 271672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271672, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271672, "pid": 5, "tid": 7, "ts": 1716454225803930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655719, "dur": 8, "args": { "External id": 271672, "cbid": 211, "correlation": 271672 } }, { "ph": "s", "id": 271672, "pid": 76337, "tid": -914061504, "ts": 1716454225655719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803934, "dur": 3, "args": { "External id": 271681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271681, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271681, "pid": 5, "tid": 7, "ts": 1716454225803934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655897, "dur": 14, "args": { "External id": 271681, "cbid": 211, "correlation": 271681 } }, { "ph": "s", "id": 271681, "pid": 76337, "tid": -914061504, "ts": 1716454225655897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803939, "dur": 3, "args": { "External id": 271690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271690, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271690, "pid": 5, "tid": 7, "ts": 1716454225803939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655928, "dur": 7, "args": { "External id": 271690, "cbid": 211, "correlation": 271690 } }, { "ph": "s", "id": 271690, "pid": 76337, "tid": -914061504, "ts": 1716454225655928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803943, "dur": 3, "args": { "External id": 271698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271698, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271698, "pid": 5, "tid": 7, "ts": 1716454225803943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225655953, "dur": 8, "args": { "External id": 271698, "cbid": 211, "correlation": 271698 } }, { "ph": "s", "id": 271698, "pid": 76337, "tid": -914061504, "ts": 1716454225655953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803947, "dur": 3, "args": { "External id": 271706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271706, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271706, "pid": 5, "tid": 7, "ts": 1716454225803947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225656223, "dur": 16, "args": { "External id": 271706, "cbid": 211, "correlation": 271706 } }, { "ph": "s", "id": 271706, "pid": 76337, "tid": -914061504, "ts": 1716454225656223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225803952, "dur": 3, "args": { "External id": 271714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271714, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271714, "pid": 5, "tid": 7, "ts": 1716454225803952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225656254, "dur": 8, "args": { "External id": 271714, "cbid": 211, "correlation": 271714 } }, { "ph": "s", "id": 271714, "pid": 76337, "tid": -914061504, "ts": 1716454225656254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225803957, "dur": 1, "args": { "External id": 271724, "device": 5, "context": 1, "stream": 7, "correlation": 271724, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 271724, "pid": 5, "tid": 7, "ts": 1716454225803957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225656321, "dur": 35, "args": { "External id": 271724, "cbid": 41, "correlation": 271724 } }, { "ph": "s", "id": 271724, "pid": 76337, "tid": -914061504, "ts": 1716454225656321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225656358, "dur": 147623, "args": { "External id": 271725, "cbid": 131, "correlation": 271725 } }, { "ph": "f", "id": 271725, "pid": 76337, "tid": -914061504, "ts": 1716454225656358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225804179, "dur": 3, "args": { "External id": 271733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271733, "pid": 5, "tid": 7, "ts": 1716454225804179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804149, "dur": 32, "args": { "External id": 271733, "cbid": 211, "correlation": 271733 } }, { "ph": "s", "id": 271733, "pid": 76337, "tid": -914061504, "ts": 1716454225804149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804293, "dur": 3, "args": { "External id": 271742, "device": 5, "context": 1, "stream": 7, "correlation": 271742, "bytes": 8, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 271742, "pid": 5, "tid": 7, "ts": 1716454225804293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804250, "dur": 43, "args": { "External id": 271742, "cbid": 41, "correlation": 271742 } }, { "ph": "s", "id": 271742, "pid": 76337, "tid": -914061504, "ts": 1716454225804250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225804399, "dur": 4, "args": { "External id": 271752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271752, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 271752, "pid": 5, "tid": 7, "ts": 1716454225804399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804382, "dur": 19, "args": { "External id": 271752, "cbid": 211, "correlation": 271752 } }, { "ph": "s", "id": 271752, "pid": 76337, "tid": -914061504, "ts": 1716454225804382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804487, "dur": 1, "args": { "External id": 271762, "device": 5, "context": 1, "stream": 7, "correlation": 271762, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 271762, "pid": 5, "tid": 7, "ts": 1716454225804487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804463, "dur": 23, "args": { "External id": 271762, "cbid": 41, "correlation": 271762 } }, { "ph": "s", "id": 271762, "pid": 76337, "tid": -914061504, "ts": 1716454225804463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225804487, "dur": 8, "args": { "External id": 271763, "cbid": 131, "correlation": 271763 } }, { "ph": "f", "id": 271763, "pid": 76337, "tid": -914061504, "ts": 1716454225804487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804573, "dur": 3, "args": { "External id": 271770, "device": 5, "context": 1, "stream": 7, "correlation": 271770, "bytes": 98304, "memory bandwidth (GB/s)": 29.825242718446603 } }, { "ph": "f", "id": 271770, "pid": 5, "tid": 7, "ts": 1716454225804573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804539, "dur": 34, "args": { "External id": 271770, "cbid": 41, "correlation": 271770 } }, { "ph": "s", "id": 271770, "pid": 76337, "tid": -914061504, "ts": 1716454225804539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804673, "dur": 3, "args": { "External id": 271789, "device": 5, "context": 1, "stream": 7, "correlation": 271789, "bytes": 16, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 271789, "pid": 5, "tid": 7, "ts": 1716454225804673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804652, "dur": 20, "args": { "External id": 271789, "cbid": 41, "correlation": 271789 } }, { "ph": "s", "id": 271789, "pid": 76337, "tid": -914061504, "ts": 1716454225804652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225804713, "dur": 3, "args": { "External id": 271795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271795, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271795, "pid": 5, "tid": 7, "ts": 1716454225804713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804702, "dur": 11, "args": { "External id": 271795, "cbid": 211, "correlation": 271795 } }, { "ph": "s", "id": 271795, "pid": 76337, "tid": -914061504, "ts": 1716454225804702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454225804727, "dur": 6, "args": { "External id": 271797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271797, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 271797, "pid": 5, "tid": 7, "ts": 1716454225804727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804717, "dur": 9, "args": { "External id": 271797, "cbid": 211, "correlation": 271797 } }, { "ph": "s", "id": 271797, "pid": 76337, "tid": -914061504, "ts": 1716454225804717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454225804735, "dur": 3, "args": { "External id": 271799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271799, "pid": 5, "tid": 7, "ts": 1716454225804735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804728, "dur": 7, "args": { "External id": 271799, "cbid": 211, "correlation": 271799 } }, { "ph": "s", "id": 271799, "pid": 76337, "tid": -914061504, "ts": 1716454225804728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804773, "dur": 2, "args": { "External id": 271807, "device": 5, "context": 1, "stream": 7, "correlation": 271807, "bytes": 8, "memory bandwidth (GB/s)": 0.002688172043010753 } }, { "ph": "f", "id": 271807, "pid": 5, "tid": 7, "ts": 1716454225804773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804757, "dur": 15, "args": { "External id": 271807, "cbid": 41, "correlation": 271807 } }, { "ph": "s", "id": 271807, "pid": 76337, "tid": -914061504, "ts": 1716454225804757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225804822, "dur": 3, "args": { "External id": 271821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271821, "pid": 5, "tid": 7, "ts": 1716454225804822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804811, "dur": 12, "args": { "External id": 271821, "cbid": 211, "correlation": 271821 } }, { "ph": "s", "id": 271821, "pid": 76337, "tid": -914061504, "ts": 1716454225804811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225804843, "dur": 2, "args": { "External id": 271835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271835, "pid": 5, "tid": 7, "ts": 1716454225804843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804835, "dur": 7, "args": { "External id": 271835, "cbid": 211, "correlation": 271835 } }, { "ph": "s", "id": 271835, "pid": 76337, "tid": -914061504, "ts": 1716454225804835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225804878, "dur": 6, "args": { "External id": 271842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271842, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271842, "pid": 5, "tid": 7, "ts": 1716454225804878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804867, "dur": 11, "args": { "External id": 271842, "cbid": 211, "correlation": 271842 } }, { "ph": "s", "id": 271842, "pid": 76337, "tid": -914061504, "ts": 1716454225804867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454225804890, "dur": 6, "args": { "External id": 271845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271845, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271845, "pid": 5, "tid": 7, "ts": 1716454225804890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804881, "dur": 8, "args": { "External id": 271845, "cbid": 211, "correlation": 271845 } }, { "ph": "s", "id": 271845, "pid": 76337, "tid": -914061504, "ts": 1716454225804881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454225804899, "dur": 3, "args": { "External id": 271847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271847, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271847, "pid": 5, "tid": 7, "ts": 1716454225804899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804891, "dur": 7, "args": { "External id": 271847, "cbid": 211, "correlation": 271847 } }, { "ph": "s", "id": 271847, "pid": 76337, "tid": -914061504, "ts": 1716454225804891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225804920, "dur": 2, "args": { "External id": 271850, "device": 5, "context": 1, "stream": 7, "correlation": 271850, "bytes": 8, "memory bandwidth (GB/s)": 0.002717391304347826 } }, { "ph": "f", "id": 271850, "pid": 5, "tid": 7, "ts": 1716454225804920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804907, "dur": 12, "args": { "External id": 271850, "cbid": 41, "correlation": 271850 } }, { "ph": "s", "id": 271850, "pid": 76337, "tid": -914061504, "ts": 1716454225804907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225804972, "dur": 4, "args": { "External id": 271866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271866, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 271866, "pid": 5, "tid": 7, "ts": 1716454225804972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225804960, "dur": 12, "args": { "External id": 271866, "cbid": 211, "correlation": 271866 } }, { "ph": "s", "id": 271866, "pid": 76337, "tid": -914061504, "ts": 1716454225804960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225805004, "dur": 3, "args": { "External id": 271871, "device": 5, "context": 1, "stream": 7, "correlation": 271871, "bytes": 1, "memory bandwidth (GB/s)": 0.00030637254901960784 } }, { "ph": "f", "id": 271871, "pid": 5, "tid": 7, "ts": 1716454225805004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225804986, "dur": 17, "args": { "External id": 271871, "cbid": 41, "correlation": 271871 } }, { "ph": "s", "id": 271871, "pid": 76337, "tid": -914061504, "ts": 1716454225804986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225805033, "dur": 1, "args": { "External id": 271877, "device": 5, "context": 1, "stream": 7, "correlation": 271877, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 271877, "pid": 5, "tid": 7, "ts": 1716454225805033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225805013, "dur": 29, "args": { "External id": 271877, "cbid": 41, "correlation": 271877 } }, { "ph": "s", "id": 271877, "pid": 76337, "tid": -914061504, "ts": 1716454225805013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225805042, "dur": 3, "args": { "External id": 271878, "cbid": 131, "correlation": 271878 } }, { "ph": "f", "id": 271878, "pid": 76337, "tid": -914061504, "ts": 1716454225805042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805097, "dur": 3, "args": { "External id": 271886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271886, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271886, "pid": 5, "tid": 7, "ts": 1716454225805097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805084, "dur": 15, "args": { "External id": 271886, "cbid": 211, "correlation": 271886 } }, { "ph": "s", "id": 271886, "pid": 76337, "tid": -914061504, "ts": 1716454225805084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805133, "dur": 3, "args": { "External id": 271896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271896, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271896, "pid": 5, "tid": 7, "ts": 1716454225805133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805120, "dur": 12, "args": { "External id": 271896, "cbid": 211, "correlation": 271896 } }, { "ph": "s", "id": 271896, "pid": 76337, "tid": -914061504, "ts": 1716454225805120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805159, "dur": 3, "args": { "External id": 271905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271905, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271905, "pid": 5, "tid": 7, "ts": 1716454225805159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805149, "dur": 9, "args": { "External id": 271905, "cbid": 211, "correlation": 271905 } }, { "ph": "s", "id": 271905, "pid": 76337, "tid": -914061504, "ts": 1716454225805149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225805278, "dur": 12, "args": { "External id": 271915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271915, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271915, "pid": 5, "tid": 7, "ts": 1716454225805278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805264, "dur": 15, "args": { "External id": 271915, "cbid": 211, "correlation": 271915 } }, { "ph": "s", "id": 271915, "pid": 76337, "tid": -914061504, "ts": 1716454225805264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805318, "dur": 3, "args": { "External id": 271923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271923, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271923, "pid": 5, "tid": 7, "ts": 1716454225805318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805308, "dur": 9, "args": { "External id": 271923, "cbid": 211, "correlation": 271923 } }, { "ph": "s", "id": 271923, "pid": 76337, "tid": -914061504, "ts": 1716454225805308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225805369, "dur": 11, "args": { "External id": 271933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271933, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271933, "pid": 5, "tid": 7, "ts": 1716454225805369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805356, "dur": 14, "args": { "External id": 271933, "cbid": 211, "correlation": 271933 } }, { "ph": "s", "id": 271933, "pid": 76337, "tid": -914061504, "ts": 1716454225805356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225805406, "dur": 10, "args": { "External id": 271941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271941, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271941, "pid": 5, "tid": 7, "ts": 1716454225805406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805395, "dur": 11, "args": { "External id": 271941, "cbid": 211, "correlation": 271941 } }, { "ph": "s", "id": 271941, "pid": 76337, "tid": -914061504, "ts": 1716454225805395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805437, "dur": 3, "args": { "External id": 271950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271950, "pid": 5, "tid": 7, "ts": 1716454225805437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805427, "dur": 9, "args": { "External id": 271950, "cbid": 211, "correlation": 271950 } }, { "ph": "s", "id": 271950, "pid": 76337, "tid": -914061504, "ts": 1716454225805427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225805462, "dur": 5, "args": { "External id": 271959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271959, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271959, "pid": 5, "tid": 7, "ts": 1716454225805462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805452, "dur": 9, "args": { "External id": 271959, "cbid": 211, "correlation": 271959 } }, { "ph": "s", "id": 271959, "pid": 76337, "tid": -914061504, "ts": 1716454225805452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225805508, "dur": 8, "args": { "External id": 271969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271969, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271969, "pid": 5, "tid": 7, "ts": 1716454225805508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805495, "dur": 13, "args": { "External id": 271969, "cbid": 211, "correlation": 271969 } }, { "ph": "s", "id": 271969, "pid": 76337, "tid": -914061504, "ts": 1716454225805495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805843, "dur": 3, "args": { "External id": 271978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271978, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271978, "pid": 5, "tid": 7, "ts": 1716454225805843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805827, "dur": 16, "args": { "External id": 271978, "cbid": 211, "correlation": 271978 } }, { "ph": "s", "id": 271978, "pid": 76337, "tid": -914061504, "ts": 1716454225805827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225805873, "dur": 3, "args": { "External id": 271986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 271986, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 271986, "pid": 5, "tid": 7, "ts": 1716454225805873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225805862, "dur": 10, "args": { "External id": 271986, "cbid": 211, "correlation": 271986 } }, { "ph": "s", "id": 271986, "pid": 76337, "tid": -914061504, "ts": 1716454225805862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225805928, "dur": 1, "args": { "External id": 271996, "device": 5, "context": 1, "stream": 7, "correlation": 271996, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 271996, "pid": 5, "tid": 7, "ts": 1716454225805928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225805909, "dur": 17, "args": { "External id": 271996, "cbid": 41, "correlation": 271996 } }, { "ph": "s", "id": 271996, "pid": 76337, "tid": -914061504, "ts": 1716454225805909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225805926, "dur": 8, "args": { "External id": 271997, "cbid": 131, "correlation": 271997 } }, { "ph": "f", "id": 271997, "pid": 76337, "tid": -914061504, "ts": 1716454225805926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806027, "dur": 2, "args": { "External id": 272005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272005, "pid": 5, "tid": 7, "ts": 1716454225806027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806012, "dur": 15, "args": { "External id": 272005, "cbid": 211, "correlation": 272005 } }, { "ph": "s", "id": 272005, "pid": 76337, "tid": -914061504, "ts": 1716454225806012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806100, "dur": 3, "args": { "External id": 272014, "device": 5, "context": 1, "stream": 7, "correlation": 272014, "bytes": 8, "memory bandwidth (GB/s)": 0.002551020408163265 } }, { "ph": "f", "id": 272014, "pid": 5, "tid": 7, "ts": 1716454225806100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806081, "dur": 19, "args": { "External id": 272014, "cbid": 41, "correlation": 272014 } }, { "ph": "s", "id": 272014, "pid": 76337, "tid": -914061504, "ts": 1716454225806081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225806170, "dur": 3, "args": { "External id": 272024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272024, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272024, "pid": 5, "tid": 7, "ts": 1716454225806170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806156, "dur": 14, "args": { "External id": 272024, "cbid": 211, "correlation": 272024 } }, { "ph": "s", "id": 272024, "pid": 76337, "tid": -914061504, "ts": 1716454225806156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806221, "dur": 1, "args": { "External id": 272034, "device": 5, "context": 1, "stream": 7, "correlation": 272034, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 272034, "pid": 5, "tid": 7, "ts": 1716454225806221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806208, "dur": 11, "args": { "External id": 272034, "cbid": 41, "correlation": 272034 } }, { "ph": "s", "id": 272034, "pid": 76337, "tid": -914061504, "ts": 1716454225806208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806220, "dur": 8, "args": { "External id": 272035, "cbid": 131, "correlation": 272035 } }, { "ph": "f", "id": 272035, "pid": 76337, "tid": -914061504, "ts": 1716454225806220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806283, "dur": 3, "args": { "External id": 272042, "device": 5, "context": 1, "stream": 7, "correlation": 272042, "bytes": 98304, "memory bandwidth (GB/s)": 31.03030303030303 } }, { "ph": "f", "id": 272042, "pid": 5, "tid": 7, "ts": 1716454225806283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806261, "dur": 21, "args": { "External id": 272042, "cbid": 41, "correlation": 272042 } }, { "ph": "s", "id": 272042, "pid": 76337, "tid": -914061504, "ts": 1716454225806261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806334, "dur": 1, "args": { "External id": 272053, "device": 5, "context": 1, "stream": 7, "correlation": 272053, "bytes": 2, "memory bandwidth (GB/s)": 0.0012755102040816326 } }, { "ph": "f", "id": 272053, "pid": 5, "tid": 7, "ts": 1716454225806334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806319, "dur": 13, "args": { "External id": 272053, "cbid": 41, "correlation": 272053 } }, { "ph": "s", "id": 272053, "pid": 76337, "tid": -914061504, "ts": 1716454225806319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806333, "dur": 8, "args": { "External id": 272054, "cbid": 131, "correlation": 272054 } }, { "ph": "f", "id": 272054, "pid": 76337, "tid": -914061504, "ts": 1716454225806333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806383, "dur": 3, "args": { "External id": 272062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272062, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272062, "pid": 5, "tid": 7, "ts": 1716454225806383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806370, "dur": 14, "args": { "External id": 272062, "cbid": 211, "correlation": 272062 } }, { "ph": "s", "id": 272062, "pid": 76337, "tid": -914061504, "ts": 1716454225806370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806416, "dur": 3, "args": { "External id": 272072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272072, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272072, "pid": 5, "tid": 7, "ts": 1716454225806416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806406, "dur": 8, "args": { "External id": 272072, "cbid": 211, "correlation": 272072 } }, { "ph": "s", "id": 272072, "pid": 76337, "tid": -914061504, "ts": 1716454225806406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806439, "dur": 3, "args": { "External id": 272081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272081, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272081, "pid": 5, "tid": 7, "ts": 1716454225806439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806430, "dur": 8, "args": { "External id": 272081, "cbid": 211, "correlation": 272081 } }, { "ph": "s", "id": 272081, "pid": 76337, "tid": -914061504, "ts": 1716454225806430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225806507, "dur": 6, "args": { "External id": 272089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272089, "pid": 5, "tid": 7, "ts": 1716454225806507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806495, "dur": 13, "args": { "External id": 272089, "cbid": 211, "correlation": 272089 } }, { "ph": "s", "id": 272089, "pid": 76337, "tid": -914061504, "ts": 1716454225806495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806548, "dur": 3, "args": { "External id": 272098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272098, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272098, "pid": 5, "tid": 7, "ts": 1716454225806548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806538, "dur": 9, "args": { "External id": 272098, "cbid": 211, "correlation": 272098 } }, { "ph": "s", "id": 272098, "pid": 76337, "tid": -914061504, "ts": 1716454225806538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806570, "dur": 3, "args": { "External id": 272107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272107, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272107, "pid": 5, "tid": 7, "ts": 1716454225806570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806562, "dur": 7, "args": { "External id": 272107, "cbid": 211, "correlation": 272107 } }, { "ph": "s", "id": 272107, "pid": 76337, "tid": -914061504, "ts": 1716454225806562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225806631, "dur": 3, "args": { "External id": 272115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272115, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272115, "pid": 5, "tid": 7, "ts": 1716454225806631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806621, "dur": 10, "args": { "External id": 272115, "cbid": 211, "correlation": 272115 } }, { "ph": "s", "id": 272115, "pid": 76337, "tid": -914061504, "ts": 1716454225806621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225806695, "dur": 1, "args": { "External id": 272123, "device": 5, "context": 1, "stream": 7, "correlation": 272123, "bytes": 8, "memory bandwidth (GB/s)": 0.0043859649122807015 } }, { "ph": "f", "id": 272123, "pid": 5, "tid": 7, "ts": 1716454225806695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806674, "dur": 30, "args": { "External id": 272123, "cbid": 41, "correlation": 272123 } }, { "ph": "s", "id": 272123, "pid": 76337, "tid": -914061504, "ts": 1716454225806674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806705, "dur": 3, "args": { "External id": 272124, "cbid": 131, "correlation": 272124 } }, { "ph": "f", "id": 272124, "pid": 76337, "tid": -914061504, "ts": 1716454225806705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806770, "dur": 1, "args": { "External id": 272134, "device": 5, "context": 1, "stream": 7, "correlation": 272134, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 272134, "pid": 5, "tid": 7, "ts": 1716454225806770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806754, "dur": 14, "args": { "External id": 272134, "cbid": 41, "correlation": 272134 } }, { "ph": "s", "id": 272134, "pid": 76337, "tid": -914061504, "ts": 1716454225806754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806769, "dur": 8, "args": { "External id": 272135, "cbid": 131, "correlation": 272135 } }, { "ph": "f", "id": 272135, "pid": 76337, "tid": -914061504, "ts": 1716454225806769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225806826, "dur": 1, "args": { "External id": 272144, "device": 5, "context": 1, "stream": 7, "correlation": 272144, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 272144, "pid": 5, "tid": 7, "ts": 1716454225806826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806815, "dur": 8, "args": { "External id": 272144, "cbid": 41, "correlation": 272144 } }, { "ph": "s", "id": 272144, "pid": 76337, "tid": -914061504, "ts": 1716454225806815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806824, "dur": 8, "args": { "External id": 272145, "cbid": 131, "correlation": 272145 } }, { "ph": "f", "id": 272145, "pid": 76337, "tid": -914061504, "ts": 1716454225806824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225806897, "dur": 4, "args": { "External id": 272152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272152, "pid": 5, "tid": 7, "ts": 1716454225806897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806880, "dur": 17, "args": { "External id": 272152, "cbid": 211, "correlation": 272152 } }, { "ph": "s", "id": 272152, "pid": 76337, "tid": -914061504, "ts": 1716454225806880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454225806935, "dur": 4, "args": { "External id": 272172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272172, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272172, "pid": 5, "tid": 7, "ts": 1716454225806935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806924, "dur": 12, "args": { "External id": 272172, "cbid": 211, "correlation": 272172 } }, { "ph": "s", "id": 272172, "pid": 76337, "tid": -914061504, "ts": 1716454225806924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225806937, "dur": 0, "args": { "External id": 272173, "cbid": 11, "correlation": 272173 } }, { "ph": "f", "id": 272173, "pid": 76337, "tid": -914061504, "ts": 1716454225806937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225806937, "dur": 0, "args": { "External id": 272174, "cbid": 11, "correlation": 272174 } }, { "ph": "f", "id": 272174, "pid": 76337, "tid": -914061504, "ts": 1716454225806937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225806953, "dur": 1, "args": { "External id": 272177, "device": 5, "context": 1, "stream": 7, "correlation": 272177, "bytes": 4, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 272177, "pid": 5, "tid": 7, "ts": 1716454225806953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225806938, "dur": 24, "args": { "External id": 272177, "cbid": 41, "correlation": 272177 } }, { "ph": "s", "id": 272177, "pid": 76337, "tid": -914061504, "ts": 1716454225806938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225806963, "dur": 3, "args": { "External id": 272178, "cbid": 131, "correlation": 272178 } }, { "ph": "f", "id": 272178, "pid": 76337, "tid": -914061504, "ts": 1716454225806963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454225807001, "dur": 3, "args": { "External id": 272202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272202, "pid": 5, "tid": 7, "ts": 1716454225807001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225806991, "dur": 11, "args": { "External id": 272202, "cbid": 211, "correlation": 272202 } }, { "ph": "s", "id": 272202, "pid": 76337, "tid": -914061504, "ts": 1716454225806991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225807002, "dur": 0, "args": { "External id": 272203, "cbid": 11, "correlation": 272203 } }, { "ph": "f", "id": 272203, "pid": 76337, "tid": -914061504, "ts": 1716454225807002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225807003, "dur": 0, "args": { "External id": 272204, "cbid": 11, "correlation": 272204 } }, { "ph": "f", "id": 272204, "pid": 76337, "tid": -914061504, "ts": 1716454225807003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225807004, "dur": 1, "args": { "External id": 272206, "cbid": 200, "correlation": 272206 } }, { "ph": "f", "id": 272206, "pid": 76337, "tid": -914061504, "ts": 1716454225807004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454225807015, "dur": 4, "args": { "External id": 272208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272208, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272208, "pid": 5, "tid": 7, "ts": 1716454225807015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807007, "dur": 8, "args": { "External id": 272208, "cbid": 211, "correlation": 272208 } }, { "ph": "s", "id": 272208, "pid": 76337, "tid": -914061504, "ts": 1716454225807007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225807016, "dur": 0, "args": { "External id": 272209, "cbid": 11, "correlation": 272209 } }, { "ph": "f", "id": 272209, "pid": 76337, "tid": -914061504, "ts": 1716454225807016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454225807016, "dur": 0, "args": { "External id": 272210, "cbid": 11, "correlation": 272210 } }, { "ph": "f", "id": 272210, "pid": 76337, "tid": -914061504, "ts": 1716454225807016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454225807057, "dur": 1, "args": { "External id": 272217, "device": 5, "context": 1, "stream": 7, "correlation": 272217, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 272217, "pid": 5, "tid": 7, "ts": 1716454225807057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225807042, "dur": 23, "args": { "External id": 272217, "cbid": 41, "correlation": 272217 } }, { "ph": "s", "id": 272217, "pid": 76337, "tid": -914061504, "ts": 1716454225807042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225807066, "dur": 3, "args": { "External id": 272218, "cbid": 131, "correlation": 272218 } }, { "ph": "f", "id": 272218, "pid": 76337, "tid": -914061504, "ts": 1716454225807066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454225807117, "dur": 1, "args": { "External id": 272228, "device": 5, "context": 1, "stream": 7, "correlation": 272228, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 272228, "pid": 5, "tid": 7, "ts": 1716454225807117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225807105, "dur": 9, "args": { "External id": 272228, "cbid": 41, "correlation": 272228 } }, { "ph": "s", "id": 272228, "pid": 76337, "tid": -914061504, "ts": 1716454225807105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225807116, "dur": 7, "args": { "External id": 272229, "cbid": 131, "correlation": 272229 } }, { "ph": "f", "id": 272229, "pid": 76337, "tid": -914061504, "ts": 1716454225807116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225807186, "dur": 5, "args": { "External id": 272236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272236, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272236, "pid": 5, "tid": 7, "ts": 1716454225807186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807171, "dur": 15, "args": { "External id": 272236, "cbid": 211, "correlation": 272236 } }, { "ph": "s", "id": 272236, "pid": 76337, "tid": -914061504, "ts": 1716454225807171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807257, "dur": 3, "args": { "External id": 272245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272245, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272245, "pid": 5, "tid": 7, "ts": 1716454225807257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807244, "dur": 13, "args": { "External id": 272245, "cbid": 211, "correlation": 272245 } }, { "ph": "s", "id": 272245, "pid": 76337, "tid": -914061504, "ts": 1716454225807244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807293, "dur": 3, "args": { "External id": 272253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272253, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272253, "pid": 5, "tid": 7, "ts": 1716454225807293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807283, "dur": 9, "args": { "External id": 272253, "cbid": 211, "correlation": 272253 } }, { "ph": "s", "id": 272253, "pid": 76337, "tid": -914061504, "ts": 1716454225807283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807324, "dur": 4, "args": { "External id": 272261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272261, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272261, "pid": 5, "tid": 7, "ts": 1716454225807324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807313, "dur": 11, "args": { "External id": 272261, "cbid": 211, "correlation": 272261 } }, { "ph": "s", "id": 272261, "pid": 76337, "tid": -914061504, "ts": 1716454225807313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807354, "dur": 4, "args": { "External id": 272269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272269, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272269, "pid": 5, "tid": 7, "ts": 1716454225807354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807343, "dur": 10, "args": { "External id": 272269, "cbid": 211, "correlation": 272269 } }, { "ph": "s", "id": 272269, "pid": 76337, "tid": -914061504, "ts": 1716454225807343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807381, "dur": 3, "args": { "External id": 272277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272277, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272277, "pid": 5, "tid": 7, "ts": 1716454225807381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807372, "dur": 8, "args": { "External id": 272277, "cbid": 211, "correlation": 272277 } }, { "ph": "s", "id": 272277, "pid": 76337, "tid": -914061504, "ts": 1716454225807372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807407, "dur": 4, "args": { "External id": 272285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272285, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272285, "pid": 5, "tid": 7, "ts": 1716454225807407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807397, "dur": 10, "args": { "External id": 272285, "cbid": 211, "correlation": 272285 } }, { "ph": "s", "id": 272285, "pid": 76337, "tid": -914061504, "ts": 1716454225807397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225807429, "dur": 4, "args": { "External id": 272293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272293, "pid": 5, "tid": 7, "ts": 1716454225807429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807421, "dur": 7, "args": { "External id": 272293, "cbid": 211, "correlation": 272293 } }, { "ph": "s", "id": 272293, "pid": 76337, "tid": -914061504, "ts": 1716454225807421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225807448, "dur": 4, "args": { "External id": 272301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272301, "pid": 5, "tid": 7, "ts": 1716454225807448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807440, "dur": 7, "args": { "External id": 272301, "cbid": 211, "correlation": 272301 } }, { "ph": "s", "id": 272301, "pid": 76337, "tid": -914061504, "ts": 1716454225807440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807469, "dur": 3, "args": { "External id": 272309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272309, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272309, "pid": 5, "tid": 7, "ts": 1716454225807469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807460, "dur": 7, "args": { "External id": 272309, "cbid": 211, "correlation": 272309 } }, { "ph": "s", "id": 272309, "pid": 76337, "tid": -914061504, "ts": 1716454225807460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807528, "dur": 3, "args": { "External id": 272317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272317, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272317, "pid": 5, "tid": 7, "ts": 1716454225807528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807517, "dur": 11, "args": { "External id": 272317, "cbid": 211, "correlation": 272317 } }, { "ph": "s", "id": 272317, "pid": 76337, "tid": -914061504, "ts": 1716454225807517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225807555, "dur": 4, "args": { "External id": 272325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272325, "pid": 5, "tid": 7, "ts": 1716454225807555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807545, "dur": 9, "args": { "External id": 272325, "cbid": 211, "correlation": 272325 } }, { "ph": "s", "id": 272325, "pid": 76337, "tid": -914061504, "ts": 1716454225807545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225807578, "dur": 4, "args": { "External id": 272333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272333, "pid": 5, "tid": 7, "ts": 1716454225807578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807569, "dur": 7, "args": { "External id": 272333, "cbid": 211, "correlation": 272333 } }, { "ph": "s", "id": 272333, "pid": 76337, "tid": -914061504, "ts": 1716454225807569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225807599, "dur": 3, "args": { "External id": 272341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272341, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272341, "pid": 5, "tid": 7, "ts": 1716454225807599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225807591, "dur": 7, "args": { "External id": 272341, "cbid": 211, "correlation": 272341 } }, { "ph": "s", "id": 272341, "pid": 76337, "tid": -914061504, "ts": 1716454225807591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225808018, "dur": 5, "args": { "External id": 272350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272350, "pid": 5, "tid": 7, "ts": 1716454225808018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808001, "dur": 17, "args": { "External id": 272350, "cbid": 211, "correlation": 272350 } }, { "ph": "s", "id": 272350, "pid": 76337, "tid": -914061504, "ts": 1716454225808001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225808056, "dur": 5, "args": { "External id": 272359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272359, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272359, "pid": 5, "tid": 7, "ts": 1716454225808056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808045, "dur": 10, "args": { "External id": 272359, "cbid": 211, "correlation": 272359 } }, { "ph": "s", "id": 272359, "pid": 76337, "tid": -914061504, "ts": 1716454225808045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454225808197, "dur": 3, "args": { "External id": 272375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272375, "pid": 5, "tid": 7, "ts": 1716454225808197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808182, "dur": 16, "args": { "External id": 272375, "cbid": 211, "correlation": 272375 } }, { "ph": "s", "id": 272375, "pid": 76337, "tid": -914061504, "ts": 1716454225808182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808230, "dur": 3, "args": { "External id": 272383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272383, "pid": 5, "tid": 7, "ts": 1716454225808230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808220, "dur": 9, "args": { "External id": 272383, "cbid": 211, "correlation": 272383 } }, { "ph": "s", "id": 272383, "pid": 76337, "tid": -914061504, "ts": 1716454225808220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808263, "dur": 3, "args": { "External id": 272391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272391, "pid": 5, "tid": 7, "ts": 1716454225808263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808254, "dur": 8, "args": { "External id": 272391, "cbid": 211, "correlation": 272391 } }, { "ph": "s", "id": 272391, "pid": 76337, "tid": -914061504, "ts": 1716454225808254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808294, "dur": 4, "args": { "External id": 272399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272399, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272399, "pid": 5, "tid": 7, "ts": 1716454225808294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808285, "dur": 8, "args": { "External id": 272399, "cbid": 211, "correlation": 272399 } }, { "ph": "s", "id": 272399, "pid": 76337, "tid": -914061504, "ts": 1716454225808285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454225808354, "dur": 4, "args": { "External id": 272411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272411, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272411, "pid": 5, "tid": 7, "ts": 1716454225808354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808340, "dur": 14, "args": { "External id": 272411, "cbid": 211, "correlation": 272411 } }, { "ph": "s", "id": 272411, "pid": 76337, "tid": -914061504, "ts": 1716454225808340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225808403, "dur": 4, "args": { "External id": 272422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272422, "pid": 5, "tid": 7, "ts": 1716454225808403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808391, "dur": 12, "args": { "External id": 272422, "cbid": 211, "correlation": 272422 } }, { "ph": "s", "id": 272422, "pid": 76337, "tid": -914061504, "ts": 1716454225808391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808434, "dur": 3, "args": { "External id": 272430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272430, "pid": 5, "tid": 7, "ts": 1716454225808434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808424, "dur": 9, "args": { "External id": 272430, "cbid": 211, "correlation": 272430 } }, { "ph": "s", "id": 272430, "pid": 76337, "tid": -914061504, "ts": 1716454225808424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808468, "dur": 5, "args": { "External id": 272438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272438, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272438, "pid": 5, "tid": 7, "ts": 1716454225808468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808457, "dur": 11, "args": { "External id": 272438, "cbid": 211, "correlation": 272438 } }, { "ph": "s", "id": 272438, "pid": 76337, "tid": -914061504, "ts": 1716454225808457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808498, "dur": 5, "args": { "External id": 272446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272446, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272446, "pid": 5, "tid": 7, "ts": 1716454225808498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808488, "dur": 10, "args": { "External id": 272446, "cbid": 211, "correlation": 272446 } }, { "ph": "s", "id": 272446, "pid": 76337, "tid": -914061504, "ts": 1716454225808488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225808528, "dur": 4, "args": { "External id": 272455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272455, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272455, "pid": 5, "tid": 7, "ts": 1716454225808528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808518, "dur": 10, "args": { "External id": 272455, "cbid": 211, "correlation": 272455 } }, { "ph": "s", "id": 272455, "pid": 76337, "tid": -914061504, "ts": 1716454225808518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225808590, "dur": 4, "args": { "External id": 272468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272468, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272468, "pid": 5, "tid": 7, "ts": 1716454225808590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808577, "dur": 13, "args": { "External id": 272468, "cbid": 211, "correlation": 272468 } }, { "ph": "s", "id": 272468, "pid": 76337, "tid": -914061504, "ts": 1716454225808577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454225808631, "dur": 5, "args": { "External id": 272478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272478, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 272478, "pid": 5, "tid": 7, "ts": 1716454225808631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808619, "dur": 11, "args": { "External id": 272478, "cbid": 211, "correlation": 272478 } }, { "ph": "s", "id": 272478, "pid": 76337, "tid": -914061504, "ts": 1716454225808619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225808761, "dur": 8, "args": { "External id": 272495, "cbid": 251, "correlation": 272495 } }, { "ph": "f", "id": 272495, "pid": 76337, "tid": -914061504, "ts": 1716454225808761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454225808796, "dur": 11, "args": { "External id": 272497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272497, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272497, "pid": 5, "tid": 7, "ts": 1716454225808796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808778, "dur": 20, "args": { "External id": 272497, "cbid": 211, "correlation": 272497 } }, { "ph": "s", "id": 272497, "pid": 76337, "tid": -914061504, "ts": 1716454225808778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225808861, "dur": 4, "args": { "External id": 272505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272505, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272505, "pid": 5, "tid": 7, "ts": 1716454225808861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808847, "dur": 14, "args": { "External id": 272505, "cbid": 211, "correlation": 272505 } }, { "ph": "s", "id": 272505, "pid": 76337, "tid": -914061504, "ts": 1716454225808847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225808922, "dur": 5, "args": { "External id": 272521, "cbid": 251, "correlation": 272521 } }, { "ph": "f", "id": 272521, "pid": 76337, "tid": -914061504, "ts": 1716454225808922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225808931, "dur": 0, "args": { "External id": 272523, "cbid": 251, "correlation": 272523 } }, { "ph": "f", "id": 272523, "pid": 76337, "tid": -914061504, "ts": 1716454225808931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225808950, "dur": 14, "args": { "External id": 272524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272524, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272524, "pid": 5, "tid": 7, "ts": 1716454225808950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808935, "dur": 15, "args": { "External id": 272524, "cbid": 211, "correlation": 272524 } }, { "ph": "s", "id": 272524, "pid": 76337, "tid": -914061504, "ts": 1716454225808935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225808965, "dur": 5, "args": { "External id": 272526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272526, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272526, "pid": 5, "tid": 7, "ts": 1716454225808965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225808954, "dur": 9, "args": { "External id": 272526, "cbid": 211, "correlation": 272526 } }, { "ph": "s", "id": 272526, "pid": 76337, "tid": -914061504, "ts": 1716454225808954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225809078, "dur": 1, "args": { "External id": 272536, "cbid": 317, "correlation": 272536 } }, { "ph": "f", "id": 272536, "pid": 76337, "tid": -914061504, "ts": 1716454225809078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225809080, "dur": 1, "args": { "External id": 272537, "cbid": 203, "correlation": 272537 } }, { "ph": "f", "id": 272537, "pid": 76337, "tid": -914061504, "ts": 1716454225809080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225809082, "dur": 1, "args": { "External id": 272538, "cbid": 205, "correlation": 272538 } }, { "ph": "f", "id": 272538, "pid": 76337, "tid": -914061504, "ts": 1716454225809082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225809143, "dur": 7, "args": { "External id": 272542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272542, "pid": 5, "tid": 7, "ts": 1716454225809143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225809126, "dur": 17, "args": { "External id": 272542, "cbid": 211, "correlation": 272542 } }, { "ph": "s", "id": 272542, "pid": 76337, "tid": -914061504, "ts": 1716454225809126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225809153, "dur": 4, "args": { "External id": 272544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272544, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 272544, "pid": 5, "tid": 7, "ts": 1716454225809153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225809146, "dur": 6, "args": { "External id": 272544, "cbid": 211, "correlation": 272544 } }, { "ph": "s", "id": 272544, "pid": 76337, "tid": -914061504, "ts": 1716454225809146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225809174, "dur": 3, "args": { "External id": 272546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272546, "pid": 5, "tid": 7, "ts": 1716454225809174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225809164, "dur": 8, "args": { "External id": 272546, "cbid": 211, "correlation": 272546 } }, { "ph": "s", "id": 272546, "pid": 76337, "tid": -914061504, "ts": 1716454225809164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225809179, "dur": 0, "args": { "External id": 272547, "cbid": 51, "correlation": 272547 } }, { "ph": "s", "id": 272547, "pid": 76337, "tid": -914061504, "ts": 1716454225809179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225809192, "dur": 90, "args": { "External id": 272548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272548, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272548, "pid": 5, "tid": 7, "ts": 1716454225809192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225809182, "dur": 8, "args": { "External id": 272548, "cbid": 211, "correlation": 272548 } }, { "ph": "s", "id": 272548, "pid": 76337, "tid": -914061504, "ts": 1716454225809182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225809283, "dur": 60, "args": { "External id": 272553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272553, "pid": 5, "tid": 7, "ts": 1716454225809283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225809221, "dur": 11, "args": { "External id": 272553, "cbid": 211, "correlation": 272553 } }, { "ph": "s", "id": 272553, "pid": 76337, "tid": -914061504, "ts": 1716454225809221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225811067, "dur": 51, "args": { "External id": 272573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272573, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 272573, "pid": 5, "tid": 7, "ts": 1716454225811067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811050, "dur": 17, "args": { "External id": 272573, "cbid": 211, "correlation": 272573 } }, { "ph": "s", "id": 272573, "pid": 76337, "tid": -914061504, "ts": 1716454225811050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225811120, "dur": 5, "args": { "External id": 272585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272585, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272585, "pid": 5, "tid": 7, "ts": 1716454225811120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811082, "dur": 9, "args": { "External id": 272585, "cbid": 211, "correlation": 272585 } }, { "ph": "s", "id": 272585, "pid": 76337, "tid": -914061504, "ts": 1716454225811082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225811126, "dur": 58, "args": { "External id": 272588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272588, "pid": 5, "tid": 7, "ts": 1716454225811126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811108, "dur": 7, "args": { "External id": 272588, "cbid": 211, "correlation": 272588 } }, { "ph": "s", "id": 272588, "pid": 76337, "tid": -914061504, "ts": 1716454225811108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225811185, "dur": 37, "args": { "External id": 272597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272597, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272597, "pid": 5, "tid": 7, "ts": 1716454225811185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811155, "dur": 10, "args": { "External id": 272597, "cbid": 211, "correlation": 272597 } }, { "ph": "s", "id": 272597, "pid": 76337, "tid": -914061504, "ts": 1716454225811155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225811215, "dur": 0, "args": { "External id": 272607, "cbid": 317, "correlation": 272607 } }, { "ph": "f", "id": 272607, "pid": 76337, "tid": -914061504, "ts": 1716454225811215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225811215, "dur": 0, "args": { "External id": 272608, "cbid": 203, "correlation": 272608 } }, { "ph": "f", "id": 272608, "pid": 76337, "tid": -914061504, "ts": 1716454225811215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225811216, "dur": 0, "args": { "External id": 272609, "cbid": 205, "correlation": 272609 } }, { "ph": "f", "id": 272609, "pid": 76337, "tid": -914061504, "ts": 1716454225811216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225811246, "dur": 40, "args": { "External id": 272613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272613, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272613, "pid": 5, "tid": 7, "ts": 1716454225811246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811233, "dur": 13, "args": { "External id": 272613, "cbid": 211, "correlation": 272613 } }, { "ph": "s", "id": 272613, "pid": 76337, "tid": -914061504, "ts": 1716454225811233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225811288, "dur": 15, "args": { "External id": 272615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272615, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272615, "pid": 5, "tid": 7, "ts": 1716454225811288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811249, "dur": 6, "args": { "External id": 272615, "cbid": 211, "correlation": 272615 } }, { "ph": "s", "id": 272615, "pid": 76337, "tid": -914061504, "ts": 1716454225811249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225811305, "dur": 3, "args": { "External id": 272617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272617, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272617, "pid": 5, "tid": 7, "ts": 1716454225811305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811261, "dur": 6, "args": { "External id": 272617, "cbid": 211, "correlation": 272617 } }, { "ph": "s", "id": 272617, "pid": 76337, "tid": -914061504, "ts": 1716454225811261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225811271, "dur": 0, "args": { "External id": 272618, "cbid": 51, "correlation": 272618 } }, { "ph": "s", "id": 272618, "pid": 76337, "tid": -914061504, "ts": 1716454225811271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225811309, "dur": 720, "args": { "External id": 272619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272619, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272619, "pid": 5, "tid": 7, "ts": 1716454225811309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811274, "dur": 7, "args": { "External id": 272619, "cbid": 211, "correlation": 272619 } }, { "ph": "s", "id": 272619, "pid": 76337, "tid": -914061504, "ts": 1716454225811274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225812031, "dur": 60, "args": { "External id": 272624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272624, "pid": 5, "tid": 7, "ts": 1716454225812031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811304, "dur": 9, "args": { "External id": 272624, "cbid": 211, "correlation": 272624 } }, { "ph": "s", "id": 272624, "pid": 76337, "tid": -914061504, "ts": 1716454225811304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225812092, "dur": 4, "args": { "External id": 272632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272632, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272632, "pid": 5, "tid": 7, "ts": 1716454225812092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811349, "dur": 9, "args": { "External id": 272632, "cbid": 211, "correlation": 272632 } }, { "ph": "s", "id": 272632, "pid": 76337, "tid": -914061504, "ts": 1716454225811349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225811418, "dur": 2, "args": { "External id": 272648, "cbid": 251, "correlation": 272648 } }, { "ph": "f", "id": 272648, "pid": 76337, "tid": -914061504, "ts": 1716454225811418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225811424, "dur": 0, "args": { "External id": 272650, "cbid": 251, "correlation": 272650 } }, { "ph": "f", "id": 272650, "pid": 76337, "tid": -914061504, "ts": 1716454225811424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225812098, "dur": 9, "args": { "External id": 272651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272651, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 272651, "pid": 5, "tid": 7, "ts": 1716454225812098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811426, "dur": 12, "args": { "External id": 272651, "cbid": 211, "correlation": 272651 } }, { "ph": "s", "id": 272651, "pid": 76337, "tid": -914061504, "ts": 1716454225811426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225812108, "dur": 4, "args": { "External id": 272653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272653, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 272653, "pid": 5, "tid": 7, "ts": 1716454225812108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811440, "dur": 6, "args": { "External id": 272653, "cbid": 211, "correlation": 272653 } }, { "ph": "s", "id": 272653, "pid": 76337, "tid": -914061504, "ts": 1716454225811440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225812114, "dur": 54, "args": { "External id": 272663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272663, "pid": 5, "tid": 7, "ts": 1716454225812114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811500, "dur": 12, "args": { "External id": 272663, "cbid": 211, "correlation": 272663 } }, { "ph": "s", "id": 272663, "pid": 76337, "tid": -914061504, "ts": 1716454225811500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225812169, "dur": 53, "args": { "External id": 272683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272683, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 272683, "pid": 5, "tid": 7, "ts": 1716454225812169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811569, "dur": 11, "args": { "External id": 272683, "cbid": 211, "correlation": 272683 } }, { "ph": "s", "id": 272683, "pid": 76337, "tid": -914061504, "ts": 1716454225811569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225812224, "dur": 4, "args": { "External id": 272695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272695, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272695, "pid": 5, "tid": 7, "ts": 1716454225812224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811590, "dur": 7, "args": { "External id": 272695, "cbid": 211, "correlation": 272695 } }, { "ph": "s", "id": 272695, "pid": 76337, "tid": -914061504, "ts": 1716454225811590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225812229, "dur": 56, "args": { "External id": 272698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272698, "pid": 5, "tid": 7, "ts": 1716454225812229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811609, "dur": 7, "args": { "External id": 272698, "cbid": 211, "correlation": 272698 } }, { "ph": "s", "id": 272698, "pid": 76337, "tid": -914061504, "ts": 1716454225811609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225812287, "dur": 36, "args": { "External id": 272707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272707, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272707, "pid": 5, "tid": 7, "ts": 1716454225812287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811650, "dur": 9, "args": { "External id": 272707, "cbid": 211, "correlation": 272707 } }, { "ph": "s", "id": 272707, "pid": 76337, "tid": -914061504, "ts": 1716454225811650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225811719, "dur": 0, "args": { "External id": 272717, "cbid": 317, "correlation": 272717 } }, { "ph": "f", "id": 272717, "pid": 76337, "tid": -914061504, "ts": 1716454225811719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225811719, "dur": 0, "args": { "External id": 272718, "cbid": 203, "correlation": 272718 } }, { "ph": "f", "id": 272718, "pid": 76337, "tid": -914061504, "ts": 1716454225811719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225811720, "dur": 0, "args": { "External id": 272719, "cbid": 205, "correlation": 272719 } }, { "ph": "f", "id": 272719, "pid": 76337, "tid": -914061504, "ts": 1716454225811720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225812324, "dur": 40, "args": { "External id": 272723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272723, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272723, "pid": 5, "tid": 7, "ts": 1716454225812324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811734, "dur": 12, "args": { "External id": 272723, "cbid": 211, "correlation": 272723 } }, { "ph": "s", "id": 272723, "pid": 76337, "tid": -914061504, "ts": 1716454225811734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225812366, "dur": 15, "args": { "External id": 272725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272725, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272725, "pid": 5, "tid": 7, "ts": 1716454225812366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811749, "dur": 5, "args": { "External id": 272725, "cbid": 211, "correlation": 272725 } }, { "ph": "s", "id": 272725, "pid": 76337, "tid": -914061504, "ts": 1716454225811749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225812382, "dur": 3, "args": { "External id": 272727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272727, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272727, "pid": 5, "tid": 7, "ts": 1716454225812382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811758, "dur": 5, "args": { "External id": 272727, "cbid": 211, "correlation": 272727 } }, { "ph": "s", "id": 272727, "pid": 76337, "tid": -914061504, "ts": 1716454225811758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225811766, "dur": 0, "args": { "External id": 272728, "cbid": 51, "correlation": 272728 } }, { "ph": "s", "id": 272728, "pid": 76337, "tid": -914061504, "ts": 1716454225811766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225812386, "dur": 715, "args": { "External id": 272729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272729, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272729, "pid": 5, "tid": 7, "ts": 1716454225812386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811767, "dur": 5, "args": { "External id": 272729, "cbid": 211, "correlation": 272729 } }, { "ph": "s", "id": 272729, "pid": 76337, "tid": -914061504, "ts": 1716454225811767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225813103, "dur": 60, "args": { "External id": 272734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272734, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272734, "pid": 5, "tid": 7, "ts": 1716454225813103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811795, "dur": 8, "args": { "External id": 272734, "cbid": 211, "correlation": 272734 } }, { "ph": "s", "id": 272734, "pid": 76337, "tid": -914061504, "ts": 1716454225811795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225813164, "dur": 50, "args": { "External id": 272742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272742, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272742, "pid": 5, "tid": 7, "ts": 1716454225813164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811827, "dur": 9, "args": { "External id": 272742, "cbid": 211, "correlation": 272742 } }, { "ph": "s", "id": 272742, "pid": 76337, "tid": -914061504, "ts": 1716454225811827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225813216, "dur": 36, "args": { "External id": 272750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272750, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272750, "pid": 5, "tid": 7, "ts": 1716454225813216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811859, "dur": 9, "args": { "External id": 272750, "cbid": 211, "correlation": 272750 } }, { "ph": "s", "id": 272750, "pid": 76337, "tid": -914061504, "ts": 1716454225811859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225813253, "dur": 52, "args": { "External id": 272770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272770, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 272770, "pid": 5, "tid": 7, "ts": 1716454225813253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811938, "dur": 13, "args": { "External id": 272770, "cbid": 211, "correlation": 272770 } }, { "ph": "s", "id": 272770, "pid": 76337, "tid": -914061504, "ts": 1716454225811938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225813306, "dur": 4, "args": { "External id": 272782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272782, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 272782, "pid": 5, "tid": 7, "ts": 1716454225813306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811960, "dur": 6, "args": { "External id": 272782, "cbid": 211, "correlation": 272782 } }, { "ph": "s", "id": 272782, "pid": 76337, "tid": -914061504, "ts": 1716454225811960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225813312, "dur": 57, "args": { "External id": 272785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272785, "pid": 5, "tid": 7, "ts": 1716454225813312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225811986, "dur": 8, "args": { "External id": 272785, "cbid": 211, "correlation": 272785 } }, { "ph": "s", "id": 272785, "pid": 76337, "tid": -914061504, "ts": 1716454225811986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225812047, "dur": 0, "args": { "External id": 272796, "cbid": 317, "correlation": 272796 } }, { "ph": "f", "id": 272796, "pid": 76337, "tid": -914061504, "ts": 1716454225812047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225812048, "dur": 0, "args": { "External id": 272797, "cbid": 203, "correlation": 272797 } }, { "ph": "f", "id": 272797, "pid": 76337, "tid": -914061504, "ts": 1716454225812048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225812049, "dur": 0, "args": { "External id": 272798, "cbid": 205, "correlation": 272798 } }, { "ph": "f", "id": 272798, "pid": 76337, "tid": -914061504, "ts": 1716454225812049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812086, "dur": 4, "args": { "External id": 272802, "cbid": 251, "correlation": 272802 } }, { "ph": "f", "id": 272802, "pid": 76337, "tid": -914061504, "ts": 1716454225812086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812091, "dur": 1, "args": { "External id": 272803, "cbid": 251, "correlation": 272803 } }, { "ph": "f", "id": 272803, "pid": 76337, "tid": -914061504, "ts": 1716454225812091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812094, "dur": 2, "args": { "External id": 272804, "cbid": 251, "correlation": 272804 } }, { "ph": "f", "id": 272804, "pid": 76337, "tid": -914061504, "ts": 1716454225812094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812097, "dur": 2, "args": { "External id": 272805, "cbid": 251, "correlation": 272805 } }, { "ph": "f", "id": 272805, "pid": 76337, "tid": -914061504, "ts": 1716454225812097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812099, "dur": 1, "args": { "External id": 272806, "cbid": 251, "correlation": 272806 } }, { "ph": "f", "id": 272806, "pid": 76337, "tid": -914061504, "ts": 1716454225812099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812101, "dur": 2, "args": { "External id": 272807, "cbid": 251, "correlation": 272807 } }, { "ph": "f", "id": 272807, "pid": 76337, "tid": -914061504, "ts": 1716454225812101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812105, "dur": 1, "args": { "External id": 272808, "cbid": 251, "correlation": 272808 } }, { "ph": "f", "id": 272808, "pid": 76337, "tid": -914061504, "ts": 1716454225812105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812106, "dur": 1, "args": { "External id": 272809, "cbid": 251, "correlation": 272809 } }, { "ph": "f", "id": 272809, "pid": 76337, "tid": -914061504, "ts": 1716454225812106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812109, "dur": 0, "args": { "External id": 272810, "cbid": 251, "correlation": 272810 } }, { "ph": "f", "id": 272810, "pid": 76337, "tid": -914061504, "ts": 1716454225812109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225813370, "dur": 117, "args": { "External id": 272811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272811, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 272811, "pid": 5, "tid": 7, "ts": 1716454225813370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812113, "dur": 15, "args": { "External id": 272811, "cbid": 211, "correlation": 272811 } }, { "ph": "s", "id": 272811, "pid": 76337, "tid": -914061504, "ts": 1716454225812113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225813489, "dur": 62, "args": { "External id": 272817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272817, "pid": 5, "tid": 7, "ts": 1716454225813489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812153, "dur": 9, "args": { "External id": 272817, "cbid": 211, "correlation": 272817 } }, { "ph": "s", "id": 272817, "pid": 76337, "tid": -914061504, "ts": 1716454225812153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225813552, "dur": 612, "args": { "External id": 272826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272826, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272826, "pid": 5, "tid": 7, "ts": 1716454225813552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812248, "dur": 16, "args": { "External id": 272826, "cbid": 211, "correlation": 272826 } }, { "ph": "s", "id": 272826, "pid": 76337, "tid": -914061504, "ts": 1716454225812248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225814165, "dur": 186, "args": { "External id": 272848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272848, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272848, "pid": 5, "tid": 7, "ts": 1716454225814165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812324, "dur": 12, "args": { "External id": 272848, "cbid": 211, "correlation": 272848 } }, { "ph": "s", "id": 272848, "pid": 76337, "tid": -914061504, "ts": 1716454225812324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812439, "dur": 2, "args": { "External id": 272859, "cbid": 251, "correlation": 272859 } }, { "ph": "f", "id": 272859, "pid": 76337, "tid": -914061504, "ts": 1716454225812439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225814353, "dur": 198, "args": { "External id": 272860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272860, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272860, "pid": 5, "tid": 7, "ts": 1716454225814353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812446, "dur": 14, "args": { "External id": 272860, "cbid": 211, "correlation": 272860 } }, { "ph": "s", "id": 272860, "pid": 76337, "tid": -914061504, "ts": 1716454225812446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812518, "dur": 1, "args": { "External id": 272871, "cbid": 251, "correlation": 272871 } }, { "ph": "f", "id": 272871, "pid": 76337, "tid": -914061504, "ts": 1716454225812518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225814553, "dur": 196, "args": { "External id": 272872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272872, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272872, "pid": 5, "tid": 7, "ts": 1716454225814553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812522, "dur": 11, "args": { "External id": 272872, "cbid": 211, "correlation": 272872 } }, { "ph": "s", "id": 272872, "pid": 76337, "tid": -914061504, "ts": 1716454225812522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812591, "dur": 1, "args": { "External id": 272883, "cbid": 251, "correlation": 272883 } }, { "ph": "f", "id": 272883, "pid": 76337, "tid": -914061504, "ts": 1716454225812591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225814751, "dur": 192, "args": { "External id": 272884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272884, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272884, "pid": 5, "tid": 7, "ts": 1716454225814751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812595, "dur": 12, "args": { "External id": 272884, "cbid": 211, "correlation": 272884 } }, { "ph": "s", "id": 272884, "pid": 76337, "tid": -914061504, "ts": 1716454225812595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225814944, "dur": 19208, "args": { "External id": 272905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272905, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 272905, "pid": 5, "tid": 7, "ts": 1716454225814944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812706, "dur": 14, "args": { "External id": 272905, "cbid": 211, "correlation": 272905 } }, { "ph": "s", "id": 272905, "pid": 76337, "tid": -914061504, "ts": 1716454225812706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225812816, "dur": 2, "args": { "External id": 272923, "cbid": 251, "correlation": 272923 } }, { "ph": "f", "id": 272923, "pid": 76337, "tid": -914061504, "ts": 1716454225812816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225834153, "dur": 208, "args": { "External id": 272925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272925, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272925, "pid": 5, "tid": 7, "ts": 1716454225834153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812823, "dur": 13, "args": { "External id": 272925, "cbid": 211, "correlation": 272925 } }, { "ph": "s", "id": 272925, "pid": 76337, "tid": -914061504, "ts": 1716454225812823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225834363, "dur": 67, "args": { "External id": 272933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272933, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272933, "pid": 5, "tid": 7, "ts": 1716454225834363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812894, "dur": 13, "args": { "External id": 272933, "cbid": 211, "correlation": 272933 } }, { "ph": "s", "id": 272933, "pid": 76337, "tid": -914061504, "ts": 1716454225812894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225834431, "dur": 96, "args": { "External id": 272941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272941, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272941, "pid": 5, "tid": 7, "ts": 1716454225834431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225812934, "dur": 9, "args": { "External id": 272941, "cbid": 211, "correlation": 272941 } }, { "ph": "s", "id": 272941, "pid": 76337, "tid": -914061504, "ts": 1716454225812934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225834528, "dur": 55, "args": { "External id": 272952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272952, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272952, "pid": 5, "tid": 7, "ts": 1716454225834528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813030, "dur": 16, "args": { "External id": 272952, "cbid": 211, "correlation": 272952 } }, { "ph": "s", "id": 272952, "pid": 76337, "tid": -914061504, "ts": 1716454225813030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225834585, "dur": 95, "args": { "External id": 272974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272974, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 272974, "pid": 5, "tid": 7, "ts": 1716454225834585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813067, "dur": 8, "args": { "External id": 272974, "cbid": 211, "correlation": 272974 } }, { "ph": "s", "id": 272974, "pid": 76337, "tid": -914061504, "ts": 1716454225813067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813151, "dur": 1, "args": { "External id": 272985, "cbid": 251, "correlation": 272985 } }, { "ph": "f", "id": 272985, "pid": 76337, "tid": -914061504, "ts": 1716454225813151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225834681, "dur": 109, "args": { "External id": 272986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272986, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 272986, "pid": 5, "tid": 7, "ts": 1716454225834681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813156, "dur": 13, "args": { "External id": 272986, "cbid": 211, "correlation": 272986 } }, { "ph": "s", "id": 272986, "pid": 76337, "tid": -914061504, "ts": 1716454225813156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813239, "dur": 2, "args": { "External id": 272997, "cbid": 251, "correlation": 272997 } }, { "ph": "f", "id": 272997, "pid": 76337, "tid": -914061504, "ts": 1716454225813239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813243, "dur": 0, "args": { "External id": 272998, "cbid": 251, "correlation": 272998 } }, { "ph": "f", "id": 272998, "pid": 76337, "tid": -914061504, "ts": 1716454225813243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225834792, "dur": 10, "args": { "External id": 272999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 272999, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 272999, "pid": 5, "tid": 7, "ts": 1716454225834792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813245, "dur": 17, "args": { "External id": 272999, "cbid": 211, "correlation": 272999 } }, { "ph": "s", "id": 272999, "pid": 76337, "tid": -914061504, "ts": 1716454225813245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225834804, "dur": 5, "args": { "External id": 273001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273001, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 273001, "pid": 5, "tid": 7, "ts": 1716454225834804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813266, "dur": 8, "args": { "External id": 273001, "cbid": 211, "correlation": 273001 } }, { "ph": "s", "id": 273001, "pid": 76337, "tid": -914061504, "ts": 1716454225813266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813334, "dur": 1, "args": { "External id": 273012, "cbid": 251, "correlation": 273012 } }, { "ph": "f", "id": 273012, "pid": 76337, "tid": -914061504, "ts": 1716454225813334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813337, "dur": 0, "args": { "External id": 273013, "cbid": 251, "correlation": 273013 } }, { "ph": "f", "id": 273013, "pid": 76337, "tid": -914061504, "ts": 1716454225813337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225834810, "dur": 7, "args": { "External id": 273014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273014, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 273014, "pid": 5, "tid": 7, "ts": 1716454225834810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813339, "dur": 12, "args": { "External id": 273014, "cbid": 211, "correlation": 273014 } }, { "ph": "s", "id": 273014, "pid": 76337, "tid": -914061504, "ts": 1716454225813339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225834818, "dur": 4, "args": { "External id": 273016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273016, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 273016, "pid": 5, "tid": 7, "ts": 1716454225834818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813353, "dur": 6, "args": { "External id": 273016, "cbid": 211, "correlation": 273016 } }, { "ph": "s", "id": 273016, "pid": 76337, "tid": -914061504, "ts": 1716454225813353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225834823, "dur": 159, "args": { "External id": 273037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273037, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273037, "pid": 5, "tid": 7, "ts": 1716454225834823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813427, "dur": 13, "args": { "External id": 273037, "cbid": 211, "correlation": 273037 } }, { "ph": "s", "id": 273037, "pid": 76337, "tid": -914061504, "ts": 1716454225813427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813524, "dur": 2, "args": { "External id": 273055, "cbid": 251, "correlation": 273055 } }, { "ph": "f", "id": 273055, "pid": 76337, "tid": -914061504, "ts": 1716454225813524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225834984, "dur": 109, "args": { "External id": 273057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273057, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273057, "pid": 5, "tid": 7, "ts": 1716454225834984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813531, "dur": 15, "args": { "External id": 273057, "cbid": 211, "correlation": 273057 } }, { "ph": "s", "id": 273057, "pid": 76337, "tid": -914061504, "ts": 1716454225813531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225835094, "dur": 35, "args": { "External id": 273065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273065, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273065, "pid": 5, "tid": 7, "ts": 1716454225835094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813602, "dur": 12, "args": { "External id": 273065, "cbid": 211, "correlation": 273065 } }, { "ph": "s", "id": 273065, "pid": 76337, "tid": -914061504, "ts": 1716454225813602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225835130, "dur": 68, "args": { "External id": 273073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273073, "pid": 5, "tid": 7, "ts": 1716454225835130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813643, "dur": 9, "args": { "External id": 273073, "cbid": 211, "correlation": 273073 } }, { "ph": "s", "id": 273073, "pid": 76337, "tid": -914061504, "ts": 1716454225813643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225835200, "dur": 95, "args": { "External id": 273095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273095, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273095, "pid": 5, "tid": 7, "ts": 1716454225835200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813698, "dur": 10, "args": { "External id": 273095, "cbid": 211, "correlation": 273095 } }, { "ph": "s", "id": 273095, "pid": 76337, "tid": -914061504, "ts": 1716454225813698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225813789, "dur": 1, "args": { "External id": 273111, "cbid": 251, "correlation": 273111 } }, { "ph": "f", "id": 273111, "pid": 76337, "tid": -914061504, "ts": 1716454225813789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225835296, "dur": 590, "args": { "External id": 273113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273113, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273113, "pid": 5, "tid": 7, "ts": 1716454225835296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813795, "dur": 12, "args": { "External id": 273113, "cbid": 211, "correlation": 273113 } }, { "ph": "s", "id": 273113, "pid": 76337, "tid": -914061504, "ts": 1716454225813795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225835888, "dur": 248, "args": { "External id": 273121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273121, "pid": 5, "tid": 7, "ts": 1716454225835888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813872, "dur": 15, "args": { "External id": 273121, "cbid": 211, "correlation": 273121 } }, { "ph": "s", "id": 273121, "pid": 76337, "tid": -914061504, "ts": 1716454225813872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225836137, "dur": 254, "args": { "External id": 273129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273129, "pid": 5, "tid": 7, "ts": 1716454225836137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225813908, "dur": 9, "args": { "External id": 273129, "cbid": 211, "correlation": 273129 } }, { "ph": "s", "id": 273129, "pid": 76337, "tid": -914061504, "ts": 1716454225813908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814002, "dur": 4, "args": { "External id": 273145, "cbid": 251, "correlation": 273145 } }, { "ph": "f", "id": 273145, "pid": 76337, "tid": -914061504, "ts": 1716454225814002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814009, "dur": 0, "args": { "External id": 273147, "cbid": 251, "correlation": 273147 } }, { "ph": "f", "id": 273147, "pid": 76337, "tid": -914061504, "ts": 1716454225814009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225836392, "dur": 362, "args": { "External id": 273148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273148, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 273148, "pid": 5, "tid": 7, "ts": 1716454225836392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814015, "dur": 14, "args": { "External id": 273148, "cbid": 211, "correlation": 273148 } }, { "ph": "s", "id": 273148, "pid": 76337, "tid": -914061504, "ts": 1716454225814015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225836755, "dur": 50, "args": { "External id": 273156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273156, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273156, "pid": 5, "tid": 7, "ts": 1716454225836755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814060, "dur": 10, "args": { "External id": 273156, "cbid": 211, "correlation": 273156 } }, { "ph": "s", "id": 273156, "pid": 76337, "tid": -914061504, "ts": 1716454225814060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225836806, "dur": 162, "args": { "External id": 273167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273167, "pid": 5, "tid": 7, "ts": 1716454225836806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814132, "dur": 12, "args": { "External id": 273167, "cbid": 211, "correlation": 273167 } }, { "ph": "s", "id": 273167, "pid": 76337, "tid": -914061504, "ts": 1716454225814132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225814200, "dur": 0, "args": { "External id": 273179, "cbid": 317, "correlation": 273179 } }, { "ph": "f", "id": 273179, "pid": 76337, "tid": -914061504, "ts": 1716454225814200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225814201, "dur": 0, "args": { "External id": 273180, "cbid": 203, "correlation": 273180 } }, { "ph": "f", "id": 273180, "pid": 76337, "tid": -914061504, "ts": 1716454225814201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225814202, "dur": 0, "args": { "External id": 273181, "cbid": 205, "correlation": 273181 } }, { "ph": "f", "id": 273181, "pid": 76337, "tid": -914061504, "ts": 1716454225814202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814225, "dur": 1, "args": { "External id": 273185, "cbid": 251, "correlation": 273185 } }, { "ph": "f", "id": 273185, "pid": 76337, "tid": -914061504, "ts": 1716454225814225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814227, "dur": 0, "args": { "External id": 273186, "cbid": 251, "correlation": 273186 } }, { "ph": "f", "id": 273186, "pid": 76337, "tid": -914061504, "ts": 1716454225814227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814228, "dur": 0, "args": { "External id": 273187, "cbid": 251, "correlation": 273187 } }, { "ph": "f", "id": 273187, "pid": 76337, "tid": -914061504, "ts": 1716454225814228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814229, "dur": 0, "args": { "External id": 273188, "cbid": 251, "correlation": 273188 } }, { "ph": "f", "id": 273188, "pid": 76337, "tid": -914061504, "ts": 1716454225814229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814230, "dur": 0, "args": { "External id": 273189, "cbid": 251, "correlation": 273189 } }, { "ph": "f", "id": 273189, "pid": 76337, "tid": -914061504, "ts": 1716454225814230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814231, "dur": 0, "args": { "External id": 273190, "cbid": 251, "correlation": 273190 } }, { "ph": "f", "id": 273190, "pid": 76337, "tid": -914061504, "ts": 1716454225814231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814232, "dur": 0, "args": { "External id": 273191, "cbid": 251, "correlation": 273191 } }, { "ph": "f", "id": 273191, "pid": 76337, "tid": -914061504, "ts": 1716454225814232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814233, "dur": 0, "args": { "External id": 273192, "cbid": 251, "correlation": 273192 } }, { "ph": "f", "id": 273192, "pid": 76337, "tid": -914061504, "ts": 1716454225814233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814234, "dur": 0, "args": { "External id": 273193, "cbid": 251, "correlation": 273193 } }, { "ph": "f", "id": 273193, "pid": 76337, "tid": -914061504, "ts": 1716454225814234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225836970, "dur": 118, "args": { "External id": 273194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273194, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273194, "pid": 5, "tid": 7, "ts": 1716454225836970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814236, "dur": 13, "args": { "External id": 273194, "cbid": 211, "correlation": 273194 } }, { "ph": "s", "id": 273194, "pid": 76337, "tid": -914061504, "ts": 1716454225814236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225837089, "dur": 60, "args": { "External id": 273200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273200, "pid": 5, "tid": 7, "ts": 1716454225837089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814271, "dur": 9, "args": { "External id": 273200, "cbid": 211, "correlation": 273200 } }, { "ph": "s", "id": 273200, "pid": 76337, "tid": -914061504, "ts": 1716454225814271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225837151, "dur": 50, "args": { "External id": 273208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273208, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273208, "pid": 5, "tid": 7, "ts": 1716454225837151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814304, "dur": 8, "args": { "External id": 273208, "cbid": 211, "correlation": 273208 } }, { "ph": "s", "id": 273208, "pid": 76337, "tid": -914061504, "ts": 1716454225814304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225837202, "dur": 53, "args": { "External id": 273228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273228, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 273228, "pid": 5, "tid": 7, "ts": 1716454225837202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814380, "dur": 12, "args": { "External id": 273228, "cbid": 211, "correlation": 273228 } }, { "ph": "s", "id": 273228, "pid": 76337, "tid": -914061504, "ts": 1716454225814380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225837256, "dur": 5, "args": { "External id": 273240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273240, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273240, "pid": 5, "tid": 7, "ts": 1716454225837256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814402, "dur": 6, "args": { "External id": 273240, "cbid": 211, "correlation": 273240 } }, { "ph": "s", "id": 273240, "pid": 76337, "tid": -914061504, "ts": 1716454225814402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225837262, "dur": 58, "args": { "External id": 273243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273243, "pid": 5, "tid": 7, "ts": 1716454225837262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814420, "dur": 7, "args": { "External id": 273243, "cbid": 211, "correlation": 273243 } }, { "ph": "s", "id": 273243, "pid": 76337, "tid": -914061504, "ts": 1716454225814420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225837321, "dur": 38, "args": { "External id": 273252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273252, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273252, "pid": 5, "tid": 7, "ts": 1716454225837321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814458, "dur": 9, "args": { "External id": 273252, "cbid": 211, "correlation": 273252 } }, { "ph": "s", "id": 273252, "pid": 76337, "tid": -914061504, "ts": 1716454225814458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225814509, "dur": 0, "args": { "External id": 273262, "cbid": 317, "correlation": 273262 } }, { "ph": "f", "id": 273262, "pid": 76337, "tid": -914061504, "ts": 1716454225814509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225814510, "dur": 0, "args": { "External id": 273263, "cbid": 203, "correlation": 273263 } }, { "ph": "f", "id": 273263, "pid": 76337, "tid": -914061504, "ts": 1716454225814510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225814511, "dur": 0, "args": { "External id": 273264, "cbid": 205, "correlation": 273264 } }, { "ph": "f", "id": 273264, "pid": 76337, "tid": -914061504, "ts": 1716454225814511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225837360, "dur": 41, "args": { "External id": 273268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273268, "pid": 5, "tid": 7, "ts": 1716454225837360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814526, "dur": 12, "args": { "External id": 273268, "cbid": 211, "correlation": 273268 } }, { "ph": "s", "id": 273268, "pid": 76337, "tid": -914061504, "ts": 1716454225814526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225837402, "dur": 14, "args": { "External id": 273270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273270, "pid": 5, "tid": 7, "ts": 1716454225837402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814540, "dur": 5, "args": { "External id": 273270, "cbid": 211, "correlation": 273270 } }, { "ph": "s", "id": 273270, "pid": 76337, "tid": -914061504, "ts": 1716454225814540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225837418, "dur": 4, "args": { "External id": 273272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273272, "pid": 5, "tid": 7, "ts": 1716454225837418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814549, "dur": 6, "args": { "External id": 273272, "cbid": 211, "correlation": 273272 } }, { "ph": "s", "id": 273272, "pid": 76337, "tid": -914061504, "ts": 1716454225814549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225814558, "dur": 0, "args": { "External id": 273273, "cbid": 51, "correlation": 273273 } }, { "ph": "s", "id": 273273, "pid": 76337, "tid": -914061504, "ts": 1716454225814558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225837423, "dur": 716, "args": { "External id": 273274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273274, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273274, "pid": 5, "tid": 7, "ts": 1716454225837423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814559, "dur": 5, "args": { "External id": 273274, "cbid": 211, "correlation": 273274 } }, { "ph": "s", "id": 273274, "pid": 76337, "tid": -914061504, "ts": 1716454225814559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225838141, "dur": 60, "args": { "External id": 273279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273279, "pid": 5, "tid": 7, "ts": 1716454225838141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814587, "dur": 8, "args": { "External id": 273279, "cbid": 211, "correlation": 273279 } }, { "ph": "s", "id": 273279, "pid": 76337, "tid": -914061504, "ts": 1716454225814587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225838203, "dur": 5, "args": { "External id": 273287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273287, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273287, "pid": 5, "tid": 7, "ts": 1716454225838203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814630, "dur": 9, "args": { "External id": 273287, "cbid": 211, "correlation": 273287 } }, { "ph": "s", "id": 273287, "pid": 76337, "tid": -914061504, "ts": 1716454225814630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814696, "dur": 1, "args": { "External id": 273303, "cbid": 251, "correlation": 273303 } }, { "ph": "f", "id": 273303, "pid": 76337, "tid": -914061504, "ts": 1716454225814696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225814701, "dur": 0, "args": { "External id": 273305, "cbid": 251, "correlation": 273305 } }, { "ph": "f", "id": 273305, "pid": 76337, "tid": -914061504, "ts": 1716454225814701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225838209, "dur": 11, "args": { "External id": 273306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273306, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 273306, "pid": 5, "tid": 7, "ts": 1716454225838209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814703, "dur": 11, "args": { "External id": 273306, "cbid": 211, "correlation": 273306 } }, { "ph": "s", "id": 273306, "pid": 76337, "tid": -914061504, "ts": 1716454225814703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225838222, "dur": 5, "args": { "External id": 273308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273308, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 273308, "pid": 5, "tid": 7, "ts": 1716454225838222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814716, "dur": 6, "args": { "External id": 273308, "cbid": 211, "correlation": 273308 } }, { "ph": "s", "id": 273308, "pid": 76337, "tid": -914061504, "ts": 1716454225814716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225838228, "dur": 55, "args": { "External id": 273318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273318, "pid": 5, "tid": 7, "ts": 1716454225838228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814773, "dur": 12, "args": { "External id": 273318, "cbid": 211, "correlation": 273318 } }, { "ph": "s", "id": 273318, "pid": 76337, "tid": -914061504, "ts": 1716454225814773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225838284, "dur": 51, "args": { "External id": 273338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273338, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 273338, "pid": 5, "tid": 7, "ts": 1716454225838284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814840, "dur": 10, "args": { "External id": 273338, "cbid": 211, "correlation": 273338 } }, { "ph": "s", "id": 273338, "pid": 76337, "tid": -914061504, "ts": 1716454225814840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225838337, "dur": 4, "args": { "External id": 273350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273350, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273350, "pid": 5, "tid": 7, "ts": 1716454225838337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814863, "dur": 6, "args": { "External id": 273350, "cbid": 211, "correlation": 273350 } }, { "ph": "s", "id": 273350, "pid": 76337, "tid": -914061504, "ts": 1716454225814863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225838342, "dur": 56, "args": { "External id": 273353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273353, "pid": 5, "tid": 7, "ts": 1716454225838342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814883, "dur": 6, "args": { "External id": 273353, "cbid": 211, "correlation": 273353 } }, { "ph": "s", "id": 273353, "pid": 76337, "tid": -914061504, "ts": 1716454225814883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225838400, "dur": 37, "args": { "External id": 273362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273362, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273362, "pid": 5, "tid": 7, "ts": 1716454225838400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225814923, "dur": 10, "args": { "External id": 273362, "cbid": 211, "correlation": 273362 } }, { "ph": "s", "id": 273362, "pid": 76337, "tid": -914061504, "ts": 1716454225814923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225814995, "dur": 0, "args": { "External id": 273372, "cbid": 317, "correlation": 273372 } }, { "ph": "f", "id": 273372, "pid": 76337, "tid": -914061504, "ts": 1716454225814995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225814996, "dur": 0, "args": { "External id": 273373, "cbid": 203, "correlation": 273373 } }, { "ph": "f", "id": 273373, "pid": 76337, "tid": -914061504, "ts": 1716454225814996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225814997, "dur": 0, "args": { "External id": 273374, "cbid": 205, "correlation": 273374 } }, { "ph": "f", "id": 273374, "pid": 76337, "tid": -914061504, "ts": 1716454225814997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225838438, "dur": 40, "args": { "External id": 273378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273378, "pid": 5, "tid": 7, "ts": 1716454225838438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815010, "dur": 13, "args": { "External id": 273378, "cbid": 211, "correlation": 273378 } }, { "ph": "s", "id": 273378, "pid": 76337, "tid": -914061504, "ts": 1716454225815010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225838479, "dur": 14, "args": { "External id": 273380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273380, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273380, "pid": 5, "tid": 7, "ts": 1716454225838479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815026, "dur": 5, "args": { "External id": 273380, "cbid": 211, "correlation": 273380 } }, { "ph": "s", "id": 273380, "pid": 76337, "tid": -914061504, "ts": 1716454225815026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225838495, "dur": 4, "args": { "External id": 273382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273382, "pid": 5, "tid": 7, "ts": 1716454225838495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815035, "dur": 5, "args": { "External id": 273382, "cbid": 211, "correlation": 273382 } }, { "ph": "s", "id": 273382, "pid": 76337, "tid": -914061504, "ts": 1716454225815035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225815044, "dur": 0, "args": { "External id": 273383, "cbid": 51, "correlation": 273383 } }, { "ph": "s", "id": 273383, "pid": 76337, "tid": -914061504, "ts": 1716454225815044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225838500, "dur": 712, "args": { "External id": 273384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273384, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273384, "pid": 5, "tid": 7, "ts": 1716454225838500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815044, "dur": 5, "args": { "External id": 273384, "cbid": 211, "correlation": 273384 } }, { "ph": "s", "id": 273384, "pid": 76337, "tid": -914061504, "ts": 1716454225815044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225839213, "dur": 60, "args": { "External id": 273389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273389, "pid": 5, "tid": 7, "ts": 1716454225839213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815073, "dur": 8, "args": { "External id": 273389, "cbid": 211, "correlation": 273389 } }, { "ph": "s", "id": 273389, "pid": 76337, "tid": -914061504, "ts": 1716454225815073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225839275, "dur": 50, "args": { "External id": 273397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273397, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273397, "pid": 5, "tid": 7, "ts": 1716454225839275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815105, "dur": 8, "args": { "External id": 273397, "cbid": 211, "correlation": 273397 } }, { "ph": "s", "id": 273397, "pid": 76337, "tid": -914061504, "ts": 1716454225815105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225839326, "dur": 35, "args": { "External id": 273405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273405, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273405, "pid": 5, "tid": 7, "ts": 1716454225839326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815134, "dur": 8, "args": { "External id": 273405, "cbid": 211, "correlation": 273405 } }, { "ph": "s", "id": 273405, "pid": 76337, "tid": -914061504, "ts": 1716454225815134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225839363, "dur": 52, "args": { "External id": 273425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273425, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 273425, "pid": 5, "tid": 7, "ts": 1716454225839363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815215, "dur": 13, "args": { "External id": 273425, "cbid": 211, "correlation": 273425 } }, { "ph": "s", "id": 273425, "pid": 76337, "tid": -914061504, "ts": 1716454225815215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225839416, "dur": 4, "args": { "External id": 273437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273437, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273437, "pid": 5, "tid": 7, "ts": 1716454225839416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815237, "dur": 6, "args": { "External id": 273437, "cbid": 211, "correlation": 273437 } }, { "ph": "s", "id": 273437, "pid": 76337, "tid": -914061504, "ts": 1716454225815237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225839421, "dur": 57, "args": { "External id": 273440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273440, "pid": 5, "tid": 7, "ts": 1716454225839421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815255, "dur": 6, "args": { "External id": 273440, "cbid": 211, "correlation": 273440 } }, { "ph": "s", "id": 273440, "pid": 76337, "tid": -914061504, "ts": 1716454225815255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225815314, "dur": 0, "args": { "External id": 273451, "cbid": 317, "correlation": 273451 } }, { "ph": "f", "id": 273451, "pid": 76337, "tid": -914061504, "ts": 1716454225815314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225815315, "dur": 0, "args": { "External id": 273452, "cbid": 203, "correlation": 273452 } }, { "ph": "f", "id": 273452, "pid": 76337, "tid": -914061504, "ts": 1716454225815315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225815316, "dur": 0, "args": { "External id": 273453, "cbid": 205, "correlation": 273453 } }, { "ph": "f", "id": 273453, "pid": 76337, "tid": -914061504, "ts": 1716454225815316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815337, "dur": 1, "args": { "External id": 273457, "cbid": 251, "correlation": 273457 } }, { "ph": "f", "id": 273457, "pid": 76337, "tid": -914061504, "ts": 1716454225815337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815339, "dur": 0, "args": { "External id": 273458, "cbid": 251, "correlation": 273458 } }, { "ph": "f", "id": 273458, "pid": 76337, "tid": -914061504, "ts": 1716454225815339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815340, "dur": 0, "args": { "External id": 273459, "cbid": 251, "correlation": 273459 } }, { "ph": "f", "id": 273459, "pid": 76337, "tid": -914061504, "ts": 1716454225815340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815340, "dur": 0, "args": { "External id": 273460, "cbid": 251, "correlation": 273460 } }, { "ph": "f", "id": 273460, "pid": 76337, "tid": -914061504, "ts": 1716454225815340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815341, "dur": 0, "args": { "External id": 273461, "cbid": 251, "correlation": 273461 } }, { "ph": "f", "id": 273461, "pid": 76337, "tid": -914061504, "ts": 1716454225815341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815342, "dur": 0, "args": { "External id": 273462, "cbid": 251, "correlation": 273462 } }, { "ph": "f", "id": 273462, "pid": 76337, "tid": -914061504, "ts": 1716454225815342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815343, "dur": 0, "args": { "External id": 273463, "cbid": 251, "correlation": 273463 } }, { "ph": "f", "id": 273463, "pid": 76337, "tid": -914061504, "ts": 1716454225815343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815344, "dur": 0, "args": { "External id": 273464, "cbid": 251, "correlation": 273464 } }, { "ph": "f", "id": 273464, "pid": 76337, "tid": -914061504, "ts": 1716454225815344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815345, "dur": 0, "args": { "External id": 273465, "cbid": 251, "correlation": 273465 } }, { "ph": "f", "id": 273465, "pid": 76337, "tid": -914061504, "ts": 1716454225815345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225839480, "dur": 112, "args": { "External id": 273466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273466, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273466, "pid": 5, "tid": 7, "ts": 1716454225839480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815347, "dur": 12, "args": { "External id": 273466, "cbid": 211, "correlation": 273466 } }, { "ph": "s", "id": 273466, "pid": 76337, "tid": -914061504, "ts": 1716454225815347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225839593, "dur": 60, "args": { "External id": 273472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273472, "pid": 5, "tid": 7, "ts": 1716454225839593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815383, "dur": 8, "args": { "External id": 273472, "cbid": 211, "correlation": 273472 } }, { "ph": "s", "id": 273472, "pid": 76337, "tid": -914061504, "ts": 1716454225815383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225839654, "dur": 604, "args": { "External id": 273481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273481, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273481, "pid": 5, "tid": 7, "ts": 1716454225839654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815465, "dur": 14, "args": { "External id": 273481, "cbid": 211, "correlation": 273481 } }, { "ph": "s", "id": 273481, "pid": 76337, "tid": -914061504, "ts": 1716454225815465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225840260, "dur": 185, "args": { "External id": 273503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273503, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273503, "pid": 5, "tid": 7, "ts": 1716454225840260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815523, "dur": 10, "args": { "External id": 273503, "cbid": 211, "correlation": 273503 } }, { "ph": "s", "id": 273503, "pid": 76337, "tid": -914061504, "ts": 1716454225815523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815611, "dur": 1, "args": { "External id": 273514, "cbid": 251, "correlation": 273514 } }, { "ph": "f", "id": 273514, "pid": 76337, "tid": -914061504, "ts": 1716454225815611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225840446, "dur": 196, "args": { "External id": 273515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273515, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273515, "pid": 5, "tid": 7, "ts": 1716454225840446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815617, "dur": 13, "args": { "External id": 273515, "cbid": 211, "correlation": 273515 } }, { "ph": "s", "id": 273515, "pid": 76337, "tid": -914061504, "ts": 1716454225815617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815686, "dur": 1, "args": { "External id": 273526, "cbid": 251, "correlation": 273526 } }, { "ph": "f", "id": 273526, "pid": 76337, "tid": -914061504, "ts": 1716454225815686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225840643, "dur": 190, "args": { "External id": 273527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273527, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273527, "pid": 5, "tid": 7, "ts": 1716454225840643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815690, "dur": 11, "args": { "External id": 273527, "cbid": 211, "correlation": 273527 } }, { "ph": "s", "id": 273527, "pid": 76337, "tid": -914061504, "ts": 1716454225815690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815752, "dur": 1, "args": { "External id": 273538, "cbid": 251, "correlation": 273538 } }, { "ph": "f", "id": 273538, "pid": 76337, "tid": -914061504, "ts": 1716454225815752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225840835, "dur": 189, "args": { "External id": 273539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273539, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273539, "pid": 5, "tid": 7, "ts": 1716454225840835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815757, "dur": 12, "args": { "External id": 273539, "cbid": 211, "correlation": 273539 } }, { "ph": "s", "id": 273539, "pid": 76337, "tid": -914061504, "ts": 1716454225815757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225841026, "dur": 17715, "args": { "External id": 273560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273560, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273560, "pid": 5, "tid": 7, "ts": 1716454225841026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815837, "dur": 12, "args": { "External id": 273560, "cbid": 211, "correlation": 273560 } }, { "ph": "s", "id": 273560, "pid": 76337, "tid": -914061504, "ts": 1716454225815837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225815934, "dur": 1, "args": { "External id": 273578, "cbid": 251, "correlation": 273578 } }, { "ph": "f", "id": 273578, "pid": 76337, "tid": -914061504, "ts": 1716454225815934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225858742, "dur": 195, "args": { "External id": 273580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273580, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273580, "pid": 5, "tid": 7, "ts": 1716454225858742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225815940, "dur": 13, "args": { "External id": 273580, "cbid": 211, "correlation": 273580 } }, { "ph": "s", "id": 273580, "pid": 76337, "tid": -914061504, "ts": 1716454225815940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225858939, "dur": 66, "args": { "External id": 273588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273588, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273588, "pid": 5, "tid": 7, "ts": 1716454225858939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816019, "dur": 13, "args": { "External id": 273588, "cbid": 211, "correlation": 273588 } }, { "ph": "s", "id": 273588, "pid": 76337, "tid": -914061504, "ts": 1716454225816019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225859007, "dur": 98, "args": { "External id": 273596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273596, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273596, "pid": 5, "tid": 7, "ts": 1716454225859007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816059, "dur": 9, "args": { "External id": 273596, "cbid": 211, "correlation": 273596 } }, { "ph": "s", "id": 273596, "pid": 76337, "tid": -914061504, "ts": 1716454225816059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225859105, "dur": 53, "args": { "External id": 273607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273607, "pid": 5, "tid": 7, "ts": 1716454225859105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816130, "dur": 13, "args": { "External id": 273607, "cbid": 211, "correlation": 273607 } }, { "ph": "s", "id": 273607, "pid": 76337, "tid": -914061504, "ts": 1716454225816130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225859159, "dur": 87, "args": { "External id": 273629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273629, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273629, "pid": 5, "tid": 7, "ts": 1716454225859159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816161, "dur": 8, "args": { "External id": 273629, "cbid": 211, "correlation": 273629 } }, { "ph": "s", "id": 273629, "pid": 76337, "tid": -914061504, "ts": 1716454225816161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816244, "dur": 1, "args": { "External id": 273640, "cbid": 251, "correlation": 273640 } }, { "ph": "f", "id": 273640, "pid": 76337, "tid": -914061504, "ts": 1716454225816244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225859248, "dur": 99, "args": { "External id": 273641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273641, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273641, "pid": 5, "tid": 7, "ts": 1716454225859248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816249, "dur": 13, "args": { "External id": 273641, "cbid": 211, "correlation": 273641 } }, { "ph": "s", "id": 273641, "pid": 76337, "tid": -914061504, "ts": 1716454225816249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816321, "dur": 1, "args": { "External id": 273652, "cbid": 251, "correlation": 273652 } }, { "ph": "f", "id": 273652, "pid": 76337, "tid": -914061504, "ts": 1716454225816321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816324, "dur": 0, "args": { "External id": 273653, "cbid": 251, "correlation": 273653 } }, { "ph": "f", "id": 273653, "pid": 76337, "tid": -914061504, "ts": 1716454225816324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225859349, "dur": 9, "args": { "External id": 273654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273654, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 273654, "pid": 5, "tid": 7, "ts": 1716454225859349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816326, "dur": 12, "args": { "External id": 273654, "cbid": 211, "correlation": 273654 } }, { "ph": "s", "id": 273654, "pid": 76337, "tid": -914061504, "ts": 1716454225816326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225859360, "dur": 5, "args": { "External id": 273656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273656, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 273656, "pid": 5, "tid": 7, "ts": 1716454225859360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816340, "dur": 6, "args": { "External id": 273656, "cbid": 211, "correlation": 273656 } }, { "ph": "s", "id": 273656, "pid": 76337, "tid": -914061504, "ts": 1716454225816340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816405, "dur": 1, "args": { "External id": 273667, "cbid": 251, "correlation": 273667 } }, { "ph": "f", "id": 273667, "pid": 76337, "tid": -914061504, "ts": 1716454225816405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816409, "dur": 0, "args": { "External id": 273668, "cbid": 251, "correlation": 273668 } }, { "ph": "f", "id": 273668, "pid": 76337, "tid": -914061504, "ts": 1716454225816409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225859366, "dur": 6, "args": { "External id": 273669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273669, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 273669, "pid": 5, "tid": 7, "ts": 1716454225859366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816410, "dur": 12, "args": { "External id": 273669, "cbid": 211, "correlation": 273669 } }, { "ph": "s", "id": 273669, "pid": 76337, "tid": -914061504, "ts": 1716454225816410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225859373, "dur": 3, "args": { "External id": 273671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273671, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 273671, "pid": 5, "tid": 7, "ts": 1716454225859373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816424, "dur": 5, "args": { "External id": 273671, "cbid": 211, "correlation": 273671 } }, { "ph": "s", "id": 273671, "pid": 76337, "tid": -914061504, "ts": 1716454225816424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225859378, "dur": 148, "args": { "External id": 273692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273692, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273692, "pid": 5, "tid": 7, "ts": 1716454225859378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816497, "dur": 13, "args": { "External id": 273692, "cbid": 211, "correlation": 273692 } }, { "ph": "s", "id": 273692, "pid": 76337, "tid": -914061504, "ts": 1716454225816497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816596, "dur": 1, "args": { "External id": 273710, "cbid": 251, "correlation": 273710 } }, { "ph": "f", "id": 273710, "pid": 76337, "tid": -914061504, "ts": 1716454225816596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225859528, "dur": 103, "args": { "External id": 273712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273712, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273712, "pid": 5, "tid": 7, "ts": 1716454225859528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816602, "dur": 14, "args": { "External id": 273712, "cbid": 211, "correlation": 273712 } }, { "ph": "s", "id": 273712, "pid": 76337, "tid": -914061504, "ts": 1716454225816602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225859632, "dur": 35, "args": { "External id": 273720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273720, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273720, "pid": 5, "tid": 7, "ts": 1716454225859632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816671, "dur": 12, "args": { "External id": 273720, "cbid": 211, "correlation": 273720 } }, { "ph": "s", "id": 273720, "pid": 76337, "tid": -914061504, "ts": 1716454225816671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225859668, "dur": 65, "args": { "External id": 273728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273728, "pid": 5, "tid": 7, "ts": 1716454225859668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816712, "dur": 9, "args": { "External id": 273728, "cbid": 211, "correlation": 273728 } }, { "ph": "s", "id": 273728, "pid": 76337, "tid": -914061504, "ts": 1716454225816712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225859735, "dur": 87, "args": { "External id": 273750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273750, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273750, "pid": 5, "tid": 7, "ts": 1716454225859735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816767, "dur": 10, "args": { "External id": 273750, "cbid": 211, "correlation": 273750 } }, { "ph": "s", "id": 273750, "pid": 76337, "tid": -914061504, "ts": 1716454225816767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225816857, "dur": 1, "args": { "External id": 273766, "cbid": 251, "correlation": 273766 } }, { "ph": "f", "id": 273766, "pid": 76337, "tid": -914061504, "ts": 1716454225816857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225859823, "dur": 560, "args": { "External id": 273768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273768, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273768, "pid": 5, "tid": 7, "ts": 1716454225859823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816863, "dur": 12, "args": { "External id": 273768, "cbid": 211, "correlation": 273768 } }, { "ph": "s", "id": 273768, "pid": 76337, "tid": -914061504, "ts": 1716454225816863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225860384, "dur": 237, "args": { "External id": 273776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273776, "pid": 5, "tid": 7, "ts": 1716454225860384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816927, "dur": 12, "args": { "External id": 273776, "cbid": 211, "correlation": 273776 } }, { "ph": "s", "id": 273776, "pid": 76337, "tid": -914061504, "ts": 1716454225816927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225860623, "dur": 249, "args": { "External id": 273784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273784, "pid": 5, "tid": 7, "ts": 1716454225860623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225816960, "dur": 8, "args": { "External id": 273784, "cbid": 211, "correlation": 273784 } }, { "ph": "s", "id": 273784, "pid": 76337, "tid": -914061504, "ts": 1716454225816960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817048, "dur": 2, "args": { "External id": 273800, "cbid": 251, "correlation": 273800 } }, { "ph": "f", "id": 273800, "pid": 76337, "tid": -914061504, "ts": 1716454225817048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817054, "dur": 0, "args": { "External id": 273802, "cbid": 251, "correlation": 273802 } }, { "ph": "f", "id": 273802, "pid": 76337, "tid": -914061504, "ts": 1716454225817054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225860872, "dur": 354, "args": { "External id": 273803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273803, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 273803, "pid": 5, "tid": 7, "ts": 1716454225860872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817057, "dur": 13, "args": { "External id": 273803, "cbid": 211, "correlation": 273803 } }, { "ph": "s", "id": 273803, "pid": 76337, "tid": -914061504, "ts": 1716454225817057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225861228, "dur": 50, "args": { "External id": 273811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273811, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273811, "pid": 5, "tid": 7, "ts": 1716454225861228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817099, "dur": 10, "args": { "External id": 273811, "cbid": 211, "correlation": 273811 } }, { "ph": "s", "id": 273811, "pid": 76337, "tid": -914061504, "ts": 1716454225817099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225861280, "dur": 151, "args": { "External id": 273822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273822, "pid": 5, "tid": 7, "ts": 1716454225861280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817166, "dur": 12, "args": { "External id": 273822, "cbid": 211, "correlation": 273822 } }, { "ph": "s", "id": 273822, "pid": 76337, "tid": -914061504, "ts": 1716454225817166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225817229, "dur": 0, "args": { "External id": 273834, "cbid": 317, "correlation": 273834 } }, { "ph": "f", "id": 273834, "pid": 76337, "tid": -914061504, "ts": 1716454225817229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225817230, "dur": 0, "args": { "External id": 273835, "cbid": 203, "correlation": 273835 } }, { "ph": "f", "id": 273835, "pid": 76337, "tid": -914061504, "ts": 1716454225817230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225817231, "dur": 0, "args": { "External id": 273836, "cbid": 205, "correlation": 273836 } }, { "ph": "f", "id": 273836, "pid": 76337, "tid": -914061504, "ts": 1716454225817231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817254, "dur": 1, "args": { "External id": 273840, "cbid": 251, "correlation": 273840 } }, { "ph": "f", "id": 273840, "pid": 76337, "tid": -914061504, "ts": 1716454225817254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817256, "dur": 0, "args": { "External id": 273841, "cbid": 251, "correlation": 273841 } }, { "ph": "f", "id": 273841, "pid": 76337, "tid": -914061504, "ts": 1716454225817256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817256, "dur": 0, "args": { "External id": 273842, "cbid": 251, "correlation": 273842 } }, { "ph": "f", "id": 273842, "pid": 76337, "tid": -914061504, "ts": 1716454225817256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817257, "dur": 0, "args": { "External id": 273843, "cbid": 251, "correlation": 273843 } }, { "ph": "f", "id": 273843, "pid": 76337, "tid": -914061504, "ts": 1716454225817257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817258, "dur": 0, "args": { "External id": 273844, "cbid": 251, "correlation": 273844 } }, { "ph": "f", "id": 273844, "pid": 76337, "tid": -914061504, "ts": 1716454225817258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817258, "dur": 0, "args": { "External id": 273845, "cbid": 251, "correlation": 273845 } }, { "ph": "f", "id": 273845, "pid": 76337, "tid": -914061504, "ts": 1716454225817258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817259, "dur": 0, "args": { "External id": 273846, "cbid": 251, "correlation": 273846 } }, { "ph": "f", "id": 273846, "pid": 76337, "tid": -914061504, "ts": 1716454225817259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817260, "dur": 0, "args": { "External id": 273847, "cbid": 251, "correlation": 273847 } }, { "ph": "f", "id": 273847, "pid": 76337, "tid": -914061504, "ts": 1716454225817260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817261, "dur": 0, "args": { "External id": 273848, "cbid": 251, "correlation": 273848 } }, { "ph": "f", "id": 273848, "pid": 76337, "tid": -914061504, "ts": 1716454225817261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225861432, "dur": 112, "args": { "External id": 273849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273849, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 273849, "pid": 5, "tid": 7, "ts": 1716454225861432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817263, "dur": 12, "args": { "External id": 273849, "cbid": 211, "correlation": 273849 } }, { "ph": "s", "id": 273849, "pid": 76337, "tid": -914061504, "ts": 1716454225817263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225861546, "dur": 57, "args": { "External id": 273855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273855, "pid": 5, "tid": 7, "ts": 1716454225861546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817298, "dur": 9, "args": { "External id": 273855, "cbid": 211, "correlation": 273855 } }, { "ph": "s", "id": 273855, "pid": 76337, "tid": -914061504, "ts": 1716454225817298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225861604, "dur": 50, "args": { "External id": 273863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273863, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273863, "pid": 5, "tid": 7, "ts": 1716454225861604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817329, "dur": 8, "args": { "External id": 273863, "cbid": 211, "correlation": 273863 } }, { "ph": "s", "id": 273863, "pid": 76337, "tid": -914061504, "ts": 1716454225817329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225817403, "dur": 0, "args": { "External id": 273873, "cbid": 317, "correlation": 273873 } }, { "ph": "f", "id": 273873, "pid": 76337, "tid": -914061504, "ts": 1716454225817403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225817404, "dur": 0, "args": { "External id": 273874, "cbid": 203, "correlation": 273874 } }, { "ph": "f", "id": 273874, "pid": 76337, "tid": -914061504, "ts": 1716454225817404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225817404, "dur": 0, "args": { "External id": 273875, "cbid": 205, "correlation": 273875 } }, { "ph": "f", "id": 273875, "pid": 76337, "tid": -914061504, "ts": 1716454225817404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225861655, "dur": 41, "args": { "External id": 273879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273879, "pid": 5, "tid": 7, "ts": 1716454225861655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817421, "dur": 12, "args": { "External id": 273879, "cbid": 211, "correlation": 273879 } }, { "ph": "s", "id": 273879, "pid": 76337, "tid": -914061504, "ts": 1716454225817421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225861697, "dur": 13, "args": { "External id": 273881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273881, "pid": 5, "tid": 7, "ts": 1716454225861697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817436, "dur": 5, "args": { "External id": 273881, "cbid": 211, "correlation": 273881 } }, { "ph": "s", "id": 273881, "pid": 76337, "tid": -914061504, "ts": 1716454225817436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225861713, "dur": 1, "args": { "External id": 273883, "device": 5, "context": 1, "stream": 7, "correlation": 273883, "bytes": 1536, "memory bandwidth (GB/s)": 0.9411764705882353 } }, { "ph": "f", "id": 273883, "pid": 5, "tid": 7, "ts": 1716454225861713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225817455, "dur": 17, "args": { "External id": 273883, "cbid": 51, "correlation": 273883 } }, { "ph": "s", "id": 273883, "pid": 76337, "tid": -914061504, "ts": 1716454225817455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225861717, "dur": 348, "args": { "External id": 273884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273884, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273884, "pid": 5, "tid": 7, "ts": 1716454225861717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817477, "dur": 10, "args": { "External id": 273884, "cbid": 211, "correlation": 273884 } }, { "ph": "s", "id": 273884, "pid": 76337, "tid": -914061504, "ts": 1716454225817477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862066, "dur": 13, "args": { "External id": 273886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273886, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273886, "pid": 5, "tid": 7, "ts": 1716454225862066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817497, "dur": 7, "args": { "External id": 273886, "cbid": 211, "correlation": 273886 } }, { "ph": "s", "id": 273886, "pid": 76337, "tid": -914061504, "ts": 1716454225817497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225862080, "dur": 14, "args": { "External id": 273892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273892, "pid": 5, "tid": 7, "ts": 1716454225862080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817527, "dur": 9, "args": { "External id": 273892, "cbid": 211, "correlation": 273892 } }, { "ph": "s", "id": 273892, "pid": 76337, "tid": -914061504, "ts": 1716454225817527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225862096, "dur": 19, "args": { "External id": 273912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273912, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 273912, "pid": 5, "tid": 7, "ts": 1716454225862096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817619, "dur": 12, "args": { "External id": 273912, "cbid": 211, "correlation": 273912 } }, { "ph": "s", "id": 273912, "pid": 76337, "tid": -914061504, "ts": 1716454225817619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225862116, "dur": 5, "args": { "External id": 273924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273924, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273924, "pid": 5, "tid": 7, "ts": 1716454225862116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817642, "dur": 6, "args": { "External id": 273924, "cbid": 211, "correlation": 273924 } }, { "ph": "s", "id": 273924, "pid": 76337, "tid": -914061504, "ts": 1716454225817642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225862121, "dur": 16, "args": { "External id": 273927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273927, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273927, "pid": 5, "tid": 7, "ts": 1716454225862121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817660, "dur": 7, "args": { "External id": 273927, "cbid": 211, "correlation": 273927 } }, { "ph": "s", "id": 273927, "pid": 76337, "tid": -914061504, "ts": 1716454225817660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225862139, "dur": 12, "args": { "External id": 273936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273936, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273936, "pid": 5, "tid": 7, "ts": 1716454225862139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817700, "dur": 10, "args": { "External id": 273936, "cbid": 211, "correlation": 273936 } }, { "ph": "s", "id": 273936, "pid": 76337, "tid": -914061504, "ts": 1716454225817700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225817763, "dur": 0, "args": { "External id": 273946, "cbid": 317, "correlation": 273946 } }, { "ph": "f", "id": 273946, "pid": 76337, "tid": -914061504, "ts": 1716454225817763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225817763, "dur": 0, "args": { "External id": 273947, "cbid": 203, "correlation": 273947 } }, { "ph": "f", "id": 273947, "pid": 76337, "tid": -914061504, "ts": 1716454225817763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225817764, "dur": 0, "args": { "External id": 273948, "cbid": 205, "correlation": 273948 } }, { "ph": "f", "id": 273948, "pid": 76337, "tid": -914061504, "ts": 1716454225817764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862152, "dur": 11, "args": { "External id": 273952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273952, "pid": 5, "tid": 7, "ts": 1716454225862152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817780, "dur": 13, "args": { "External id": 273952, "cbid": 211, "correlation": 273952 } }, { "ph": "s", "id": 273952, "pid": 76337, "tid": -914061504, "ts": 1716454225817780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862164, "dur": 23, "args": { "External id": 273954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273954, "pid": 5, "tid": 7, "ts": 1716454225862164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817796, "dur": 5, "args": { "External id": 273954, "cbid": 211, "correlation": 273954 } }, { "ph": "s", "id": 273954, "pid": 76337, "tid": -914061504, "ts": 1716454225817796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225862189, "dur": 4, "args": { "External id": 273956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 273956, "pid": 5, "tid": 7, "ts": 1716454225862189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817806, "dur": 6, "args": { "External id": 273956, "cbid": 211, "correlation": 273956 } }, { "ph": "s", "id": 273956, "pid": 76337, "tid": -914061504, "ts": 1716454225817806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225817816, "dur": 0, "args": { "External id": 273957, "cbid": 51, "correlation": 273957 } }, { "ph": "s", "id": 273957, "pid": 76337, "tid": -914061504, "ts": 1716454225817816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225862194, "dur": 346, "args": { "External id": 273958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273958, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 273958, "pid": 5, "tid": 7, "ts": 1716454225862194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817819, "dur": 7, "args": { "External id": 273958, "cbid": 211, "correlation": 273958 } }, { "ph": "s", "id": 273958, "pid": 76337, "tid": -914061504, "ts": 1716454225817819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862542, "dur": 19, "args": { "External id": 273959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273959, "pid": 5, "tid": 7, "ts": 1716454225862542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817829, "dur": 5, "args": { "External id": 273959, "cbid": 211, "correlation": 273959 } }, { "ph": "s", "id": 273959, "pid": 76337, "tid": -914061504, "ts": 1716454225817829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225862562, "dur": 32, "args": { "External id": 273965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 273965, "pid": 5, "tid": 7, "ts": 1716454225862562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817859, "dur": 9, "args": { "External id": 273965, "cbid": 211, "correlation": 273965 } }, { "ph": "s", "id": 273965, "pid": 76337, "tid": -914061504, "ts": 1716454225817859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225862595, "dur": 4, "args": { "External id": 273973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273973, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 273973, "pid": 5, "tid": 7, "ts": 1716454225862595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817904, "dur": 10, "args": { "External id": 273973, "cbid": 211, "correlation": 273973 } }, { "ph": "s", "id": 273973, "pid": 76337, "tid": -914061504, "ts": 1716454225817904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817969, "dur": 1, "args": { "External id": 273989, "cbid": 251, "correlation": 273989 } }, { "ph": "f", "id": 273989, "pid": 76337, "tid": -914061504, "ts": 1716454225817969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225817982, "dur": 0, "args": { "External id": 273991, "cbid": 251, "correlation": 273991 } }, { "ph": "f", "id": 273991, "pid": 76337, "tid": -914061504, "ts": 1716454225817982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225862601, "dur": 11, "args": { "External id": 273992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273992, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 273992, "pid": 5, "tid": 7, "ts": 1716454225862601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817984, "dur": 12, "args": { "External id": 273992, "cbid": 211, "correlation": 273992 } }, { "ph": "s", "id": 273992, "pid": 76337, "tid": -914061504, "ts": 1716454225817984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225862613, "dur": 5, "args": { "External id": 273994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 273994, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 273994, "pid": 5, "tid": 7, "ts": 1716454225862613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225817998, "dur": 5, "args": { "External id": 273994, "cbid": 211, "correlation": 273994 } }, { "ph": "s", "id": 273994, "pid": 76337, "tid": -914061504, "ts": 1716454225817998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225862619, "dur": 28, "args": { "External id": 274004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274004, "pid": 5, "tid": 7, "ts": 1716454225862619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818057, "dur": 12, "args": { "External id": 274004, "cbid": 211, "correlation": 274004 } }, { "ph": "s", "id": 274004, "pid": 76337, "tid": -914061504, "ts": 1716454225818057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225862648, "dur": 29, "args": { "External id": 274024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274024, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 274024, "pid": 5, "tid": 7, "ts": 1716454225862648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818126, "dur": 12, "args": { "External id": 274024, "cbid": 211, "correlation": 274024 } }, { "ph": "s", "id": 274024, "pid": 76337, "tid": -914061504, "ts": 1716454225818126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225862679, "dur": 4, "args": { "External id": 274036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274036, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 274036, "pid": 5, "tid": 7, "ts": 1716454225862679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818149, "dur": 6, "args": { "External id": 274036, "cbid": 211, "correlation": 274036 } }, { "ph": "s", "id": 274036, "pid": 76337, "tid": -914061504, "ts": 1716454225818149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225862684, "dur": 29, "args": { "External id": 274039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274039, "pid": 5, "tid": 7, "ts": 1716454225862684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818168, "dur": 6, "args": { "External id": 274039, "cbid": 211, "correlation": 274039 } }, { "ph": "s", "id": 274039, "pid": 76337, "tid": -914061504, "ts": 1716454225818168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225862714, "dur": 21, "args": { "External id": 274048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274048, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274048, "pid": 5, "tid": 7, "ts": 1716454225862714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818209, "dur": 10, "args": { "External id": 274048, "cbid": 211, "correlation": 274048 } }, { "ph": "s", "id": 274048, "pid": 76337, "tid": -914061504, "ts": 1716454225818209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225818272, "dur": 0, "args": { "External id": 274058, "cbid": 317, "correlation": 274058 } }, { "ph": "f", "id": 274058, "pid": 76337, "tid": -914061504, "ts": 1716454225818272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225818273, "dur": 0, "args": { "External id": 274059, "cbid": 203, "correlation": 274059 } }, { "ph": "f", "id": 274059, "pid": 76337, "tid": -914061504, "ts": 1716454225818273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225818274, "dur": 0, "args": { "External id": 274060, "cbid": 205, "correlation": 274060 } }, { "ph": "f", "id": 274060, "pid": 76337, "tid": -914061504, "ts": 1716454225818274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862736, "dur": 22, "args": { "External id": 274064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274064, "pid": 5, "tid": 7, "ts": 1716454225862736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818296, "dur": 12, "args": { "External id": 274064, "cbid": 211, "correlation": 274064 } }, { "ph": "s", "id": 274064, "pid": 76337, "tid": -914061504, "ts": 1716454225818296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225862759, "dur": 41, "args": { "External id": 274066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274066, "pid": 5, "tid": 7, "ts": 1716454225862759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818311, "dur": 5, "args": { "External id": 274066, "cbid": 211, "correlation": 274066 } }, { "ph": "s", "id": 274066, "pid": 76337, "tid": -914061504, "ts": 1716454225818311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225862802, "dur": 633, "args": { "External id": 274068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274068, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274068, "pid": 5, "tid": 7, "ts": 1716454225862802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818328, "dur": 14, "args": { "External id": 274068, "cbid": 211, "correlation": 274068 } }, { "ph": "s", "id": 274068, "pid": 76337, "tid": -914061504, "ts": 1716454225818328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225863436, "dur": 20, "args": { "External id": 274070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274070, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274070, "pid": 5, "tid": 7, "ts": 1716454225863436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818346, "dur": 5, "args": { "External id": 274070, "cbid": 211, "correlation": 274070 } }, { "ph": "s", "id": 274070, "pid": 76337, "tid": -914061504, "ts": 1716454225818346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225863458, "dur": 32, "args": { "External id": 274076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274076, "pid": 5, "tid": 7, "ts": 1716454225863458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818374, "dur": 9, "args": { "External id": 274076, "cbid": 211, "correlation": 274076 } }, { "ph": "s", "id": 274076, "pid": 76337, "tid": -914061504, "ts": 1716454225818374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225818434, "dur": 0, "args": { "External id": 274086, "cbid": 317, "correlation": 274086 } }, { "ph": "f", "id": 274086, "pid": 76337, "tid": -914061504, "ts": 1716454225818434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225818434, "dur": 0, "args": { "External id": 274087, "cbid": 203, "correlation": 274087 } }, { "ph": "f", "id": 274087, "pid": 76337, "tid": -914061504, "ts": 1716454225818434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225818435, "dur": 0, "args": { "External id": 274088, "cbid": 205, "correlation": 274088 } }, { "ph": "f", "id": 274088, "pid": 76337, "tid": -914061504, "ts": 1716454225818435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818458, "dur": 1, "args": { "External id": 274092, "cbid": 251, "correlation": 274092 } }, { "ph": "f", "id": 274092, "pid": 76337, "tid": -914061504, "ts": 1716454225818458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818460, "dur": 0, "args": { "External id": 274093, "cbid": 251, "correlation": 274093 } }, { "ph": "f", "id": 274093, "pid": 76337, "tid": -914061504, "ts": 1716454225818460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818460, "dur": 0, "args": { "External id": 274094, "cbid": 251, "correlation": 274094 } }, { "ph": "f", "id": 274094, "pid": 76337, "tid": -914061504, "ts": 1716454225818460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818461, "dur": 0, "args": { "External id": 274095, "cbid": 251, "correlation": 274095 } }, { "ph": "f", "id": 274095, "pid": 76337, "tid": -914061504, "ts": 1716454225818461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818462, "dur": 0, "args": { "External id": 274096, "cbid": 251, "correlation": 274096 } }, { "ph": "f", "id": 274096, "pid": 76337, "tid": -914061504, "ts": 1716454225818462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818463, "dur": 0, "args": { "External id": 274097, "cbid": 251, "correlation": 274097 } }, { "ph": "f", "id": 274097, "pid": 76337, "tid": -914061504, "ts": 1716454225818463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818463, "dur": 0, "args": { "External id": 274098, "cbid": 251, "correlation": 274098 } }, { "ph": "f", "id": 274098, "pid": 76337, "tid": -914061504, "ts": 1716454225818463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818464, "dur": 0, "args": { "External id": 274099, "cbid": 251, "correlation": 274099 } }, { "ph": "f", "id": 274099, "pid": 76337, "tid": -914061504, "ts": 1716454225818464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225818466, "dur": 0, "args": { "External id": 274100, "cbid": 251, "correlation": 274100 } }, { "ph": "f", "id": 274100, "pid": 76337, "tid": -914061504, "ts": 1716454225818466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225863491, "dur": 49, "args": { "External id": 274101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274101, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 274101, "pid": 5, "tid": 7, "ts": 1716454225863491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818468, "dur": 13, "args": { "External id": 274101, "cbid": 211, "correlation": 274101 } }, { "ph": "s", "id": 274101, "pid": 76337, "tid": -914061504, "ts": 1716454225818468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225863541, "dur": 31, "args": { "External id": 274107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274107, "pid": 5, "tid": 7, "ts": 1716454225863541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818500, "dur": 8, "args": { "External id": 274107, "cbid": 211, "correlation": 274107 } }, { "ph": "s", "id": 274107, "pid": 76337, "tid": -914061504, "ts": 1716454225818500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225863574, "dur": 27, "args": { "External id": 274115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274115, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274115, "pid": 5, "tid": 7, "ts": 1716454225863574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818531, "dur": 8, "args": { "External id": 274115, "cbid": 211, "correlation": 274115 } }, { "ph": "s", "id": 274115, "pid": 76337, "tid": -914061504, "ts": 1716454225818531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225863602, "dur": 20, "args": { "External id": 274123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274123, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274123, "pid": 5, "tid": 7, "ts": 1716454225863602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818559, "dur": 8, "args": { "External id": 274123, "cbid": 211, "correlation": 274123 } }, { "ph": "s", "id": 274123, "pid": 76337, "tid": -914061504, "ts": 1716454225818559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225863623, "dur": 29, "args": { "External id": 274143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274143, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 274143, "pid": 5, "tid": 7, "ts": 1716454225863623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818640, "dur": 12, "args": { "External id": 274143, "cbid": 211, "correlation": 274143 } }, { "ph": "s", "id": 274143, "pid": 76337, "tid": -914061504, "ts": 1716454225818640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225863653, "dur": 4, "args": { "External id": 274155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274155, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 274155, "pid": 5, "tid": 7, "ts": 1716454225863653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818662, "dur": 6, "args": { "External id": 274155, "cbid": 211, "correlation": 274155 } }, { "ph": "s", "id": 274155, "pid": 76337, "tid": -914061504, "ts": 1716454225818662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225863658, "dur": 29, "args": { "External id": 274158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274158, "pid": 5, "tid": 7, "ts": 1716454225863658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818681, "dur": 6, "args": { "External id": 274158, "cbid": 211, "correlation": 274158 } }, { "ph": "s", "id": 274158, "pid": 76337, "tid": -914061504, "ts": 1716454225818681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225818738, "dur": 0, "args": { "External id": 274169, "cbid": 317, "correlation": 274169 } }, { "ph": "f", "id": 274169, "pid": 76337, "tid": -914061504, "ts": 1716454225818738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225818738, "dur": 0, "args": { "External id": 274170, "cbid": 203, "correlation": 274170 } }, { "ph": "f", "id": 274170, "pid": 76337, "tid": -914061504, "ts": 1716454225818738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225818739, "dur": 0, "args": { "External id": 274171, "cbid": 205, "correlation": 274171 } }, { "ph": "f", "id": 274171, "pid": 76337, "tid": -914061504, "ts": 1716454225818739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225863688, "dur": 22, "args": { "External id": 274175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274175, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274175, "pid": 5, "tid": 7, "ts": 1716454225863688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818754, "dur": 12, "args": { "External id": 274175, "cbid": 211, "correlation": 274175 } }, { "ph": "s", "id": 274175, "pid": 76337, "tid": -914061504, "ts": 1716454225818754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225863712, "dur": 114, "args": { "External id": 274177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274177, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274177, "pid": 5, "tid": 7, "ts": 1716454225863712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818778, "dur": 9, "args": { "External id": 274177, "cbid": 211, "correlation": 274177 } }, { "ph": "s", "id": 274177, "pid": 76337, "tid": -914061504, "ts": 1716454225818778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225863828, "dur": 22, "args": { "External id": 274179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274179, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274179, "pid": 5, "tid": 7, "ts": 1716454225863828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818791, "dur": 5, "args": { "External id": 274179, "cbid": 211, "correlation": 274179 } }, { "ph": "s", "id": 274179, "pid": 76337, "tid": -914061504, "ts": 1716454225818791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225863851, "dur": 31, "args": { "External id": 274185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274185, "pid": 5, "tid": 7, "ts": 1716454225863851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818819, "dur": 9, "args": { "External id": 274185, "cbid": 211, "correlation": 274185 } }, { "ph": "s", "id": 274185, "pid": 76337, "tid": -914061504, "ts": 1716454225818819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225863884, "dur": 184, "args": { "External id": 274194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274194, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274194, "pid": 5, "tid": 7, "ts": 1716454225863884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818902, "dur": 14, "args": { "External id": 274194, "cbid": 211, "correlation": 274194 } }, { "ph": "s", "id": 274194, "pid": 76337, "tid": -914061504, "ts": 1716454225818902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225864069, "dur": 63, "args": { "External id": 274216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274216, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274216, "pid": 5, "tid": 7, "ts": 1716454225864069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225818959, "dur": 10, "args": { "External id": 274216, "cbid": 211, "correlation": 274216 } }, { "ph": "s", "id": 274216, "pid": 76337, "tid": -914061504, "ts": 1716454225818959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819058, "dur": 1, "args": { "External id": 274227, "cbid": 251, "correlation": 274227 } }, { "ph": "f", "id": 274227, "pid": 76337, "tid": -914061504, "ts": 1716454225819058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225864133, "dur": 150, "args": { "External id": 274228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274228, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274228, "pid": 5, "tid": 7, "ts": 1716454225864133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819063, "dur": 14, "args": { "External id": 274228, "cbid": 211, "correlation": 274228 } }, { "ph": "s", "id": 274228, "pid": 76337, "tid": -914061504, "ts": 1716454225819063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819135, "dur": 1, "args": { "External id": 274239, "cbid": 251, "correlation": 274239 } }, { "ph": "f", "id": 274239, "pid": 76337, "tid": -914061504, "ts": 1716454225819135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225864285, "dur": 142, "args": { "External id": 274240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274240, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274240, "pid": 5, "tid": 7, "ts": 1716454225864285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819138, "dur": 11, "args": { "External id": 274240, "cbid": 211, "correlation": 274240 } }, { "ph": "s", "id": 274240, "pid": 76337, "tid": -914061504, "ts": 1716454225819138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819202, "dur": 1, "args": { "External id": 274251, "cbid": 251, "correlation": 274251 } }, { "ph": "f", "id": 274251, "pid": 76337, "tid": -914061504, "ts": 1716454225819202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225864429, "dur": 143, "args": { "External id": 274252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274252, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274252, "pid": 5, "tid": 7, "ts": 1716454225864429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819206, "dur": 11, "args": { "External id": 274252, "cbid": 211, "correlation": 274252 } }, { "ph": "s", "id": 274252, "pid": 76337, "tid": -914061504, "ts": 1716454225819206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225864573, "dur": 1848, "args": { "External id": 274273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274273, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 274273, "pid": 5, "tid": 7, "ts": 1716454225864573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819290, "dur": 13, "args": { "External id": 274273, "cbid": 211, "correlation": 274273 } }, { "ph": "s", "id": 274273, "pid": 76337, "tid": -914061504, "ts": 1716454225819290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819390, "dur": 1, "args": { "External id": 274291, "cbid": 251, "correlation": 274291 } }, { "ph": "f", "id": 274291, "pid": 76337, "tid": -914061504, "ts": 1716454225819390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225866423, "dur": 142, "args": { "External id": 274293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274293, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 274293, "pid": 5, "tid": 7, "ts": 1716454225866423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819395, "dur": 13, "args": { "External id": 274293, "cbid": 211, "correlation": 274293 } }, { "ph": "s", "id": 274293, "pid": 76337, "tid": -914061504, "ts": 1716454225819395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225866567, "dur": 36, "args": { "External id": 274301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274301, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274301, "pid": 5, "tid": 7, "ts": 1716454225866567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819466, "dur": 12, "args": { "External id": 274301, "cbid": 211, "correlation": 274301 } }, { "ph": "s", "id": 274301, "pid": 76337, "tid": -914061504, "ts": 1716454225819466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225866604, "dur": 51, "args": { "External id": 274309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274309, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274309, "pid": 5, "tid": 7, "ts": 1716454225866604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819505, "dur": 9, "args": { "External id": 274309, "cbid": 211, "correlation": 274309 } }, { "ph": "s", "id": 274309, "pid": 76337, "tid": -914061504, "ts": 1716454225819505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225866656, "dur": 30, "args": { "External id": 274320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274320, "pid": 5, "tid": 7, "ts": 1716454225866656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819578, "dur": 13, "args": { "External id": 274320, "cbid": 211, "correlation": 274320 } }, { "ph": "s", "id": 274320, "pid": 76337, "tid": -914061504, "ts": 1716454225819578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225866687, "dur": 33, "args": { "External id": 274342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274342, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274342, "pid": 5, "tid": 7, "ts": 1716454225866687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819609, "dur": 8, "args": { "External id": 274342, "cbid": 211, "correlation": 274342 } }, { "ph": "s", "id": 274342, "pid": 76337, "tid": -914061504, "ts": 1716454225819609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819699, "dur": 1, "args": { "External id": 274353, "cbid": 251, "correlation": 274353 } }, { "ph": "f", "id": 274353, "pid": 76337, "tid": -914061504, "ts": 1716454225819699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225866721, "dur": 87, "args": { "External id": 274354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274354, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274354, "pid": 5, "tid": 7, "ts": 1716454225866721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819704, "dur": 14, "args": { "External id": 274354, "cbid": 211, "correlation": 274354 } }, { "ph": "s", "id": 274354, "pid": 76337, "tid": -914061504, "ts": 1716454225819704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819774, "dur": 1, "args": { "External id": 274365, "cbid": 251, "correlation": 274365 } }, { "ph": "f", "id": 274365, "pid": 76337, "tid": -914061504, "ts": 1716454225819774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819778, "dur": 0, "args": { "External id": 274366, "cbid": 251, "correlation": 274366 } }, { "ph": "f", "id": 274366, "pid": 76337, "tid": -914061504, "ts": 1716454225819778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225866809, "dur": 11, "args": { "External id": 274367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274367, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 274367, "pid": 5, "tid": 7, "ts": 1716454225866809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819780, "dur": 12, "args": { "External id": 274367, "cbid": 211, "correlation": 274367 } }, { "ph": "s", "id": 274367, "pid": 76337, "tid": -914061504, "ts": 1716454225819780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225866822, "dur": 5, "args": { "External id": 274369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274369, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 274369, "pid": 5, "tid": 7, "ts": 1716454225866822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819795, "dur": 7, "args": { "External id": 274369, "cbid": 211, "correlation": 274369 } }, { "ph": "s", "id": 274369, "pid": 76337, "tid": -914061504, "ts": 1716454225819795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819856, "dur": 1, "args": { "External id": 274380, "cbid": 251, "correlation": 274380 } }, { "ph": "f", "id": 274380, "pid": 76337, "tid": -914061504, "ts": 1716454225819856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225819859, "dur": 0, "args": { "External id": 274381, "cbid": 251, "correlation": 274381 } }, { "ph": "f", "id": 274381, "pid": 76337, "tid": -914061504, "ts": 1716454225819859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225866828, "dur": 7, "args": { "External id": 274382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274382, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 274382, "pid": 5, "tid": 7, "ts": 1716454225866828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819861, "dur": 12, "args": { "External id": 274382, "cbid": 211, "correlation": 274382 } }, { "ph": "s", "id": 274382, "pid": 76337, "tid": -914061504, "ts": 1716454225819861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225866836, "dur": 3, "args": { "External id": 274384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274384, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 274384, "pid": 5, "tid": 7, "ts": 1716454225866836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819875, "dur": 5, "args": { "External id": 274384, "cbid": 211, "correlation": 274384 } }, { "ph": "s", "id": 274384, "pid": 76337, "tid": -914061504, "ts": 1716454225819875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225866841, "dur": 88, "args": { "External id": 274405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274405, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 274405, "pid": 5, "tid": 7, "ts": 1716454225866841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225819949, "dur": 13, "args": { "External id": 274405, "cbid": 211, "correlation": 274405 } }, { "ph": "s", "id": 274405, "pid": 76337, "tid": -914061504, "ts": 1716454225819949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225820060, "dur": 1, "args": { "External id": 274423, "cbid": 251, "correlation": 274423 } }, { "ph": "f", "id": 274423, "pid": 76337, "tid": -914061504, "ts": 1716454225820060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225866930, "dur": 95, "args": { "External id": 274425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274425, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274425, "pid": 5, "tid": 7, "ts": 1716454225866930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820067, "dur": 14, "args": { "External id": 274425, "cbid": 211, "correlation": 274425 } }, { "ph": "s", "id": 274425, "pid": 76337, "tid": -914061504, "ts": 1716454225820067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225867025, "dur": 18, "args": { "External id": 274433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274433, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274433, "pid": 5, "tid": 7, "ts": 1716454225867025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820137, "dur": 12, "args": { "External id": 274433, "cbid": 211, "correlation": 274433 } }, { "ph": "s", "id": 274433, "pid": 76337, "tid": -914061504, "ts": 1716454225820137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225867045, "dur": 36, "args": { "External id": 274441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274441, "pid": 5, "tid": 7, "ts": 1716454225867045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820178, "dur": 9, "args": { "External id": 274441, "cbid": 211, "correlation": 274441 } }, { "ph": "s", "id": 274441, "pid": 76337, "tid": -914061504, "ts": 1716454225820178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225867082, "dur": 33, "args": { "External id": 274463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274463, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274463, "pid": 5, "tid": 7, "ts": 1716454225867082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820233, "dur": 10, "args": { "External id": 274463, "cbid": 211, "correlation": 274463 } }, { "ph": "s", "id": 274463, "pid": 76337, "tid": -914061504, "ts": 1716454225820233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225820324, "dur": 1, "args": { "External id": 274479, "cbid": 251, "correlation": 274479 } }, { "ph": "f", "id": 274479, "pid": 76337, "tid": -914061504, "ts": 1716454225820324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225820329, "dur": 0, "args": { "External id": 274481, "cbid": 251, "correlation": 274481 } }, { "ph": "f", "id": 274481, "pid": 76337, "tid": -914061504, "ts": 1716454225820329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225867117, "dur": 523, "args": { "External id": 274482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274482, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 274482, "pid": 5, "tid": 7, "ts": 1716454225867117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820332, "dur": 13, "args": { "External id": 274482, "cbid": 211, "correlation": 274482 } }, { "ph": "s", "id": 274482, "pid": 76337, "tid": -914061504, "ts": 1716454225820332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225867641, "dur": 120, "args": { "External id": 274490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274490, "pid": 5, "tid": 7, "ts": 1716454225867641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820397, "dur": 13, "args": { "External id": 274490, "cbid": 211, "correlation": 274490 } }, { "ph": "s", "id": 274490, "pid": 76337, "tid": -914061504, "ts": 1716454225820397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225867763, "dur": 127, "args": { "External id": 274498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274498, "pid": 5, "tid": 7, "ts": 1716454225867763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820429, "dur": 8, "args": { "External id": 274498, "cbid": 211, "correlation": 274498 } }, { "ph": "s", "id": 274498, "pid": 76337, "tid": -914061504, "ts": 1716454225820429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225820511, "dur": 2, "args": { "External id": 274514, "cbid": 251, "correlation": 274514 } }, { "ph": "f", "id": 274514, "pid": 76337, "tid": -914061504, "ts": 1716454225820511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225867891, "dur": 292, "args": { "External id": 274516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274516, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274516, "pid": 5, "tid": 7, "ts": 1716454225867891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820517, "dur": 12, "args": { "External id": 274516, "cbid": 211, "correlation": 274516 } }, { "ph": "s", "id": 274516, "pid": 76337, "tid": -914061504, "ts": 1716454225820517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225868185, "dur": 28, "args": { "External id": 274524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274524, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274524, "pid": 5, "tid": 7, "ts": 1716454225868185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820560, "dur": 10, "args": { "External id": 274524, "cbid": 211, "correlation": 274524 } }, { "ph": "s", "id": 274524, "pid": 76337, "tid": -914061504, "ts": 1716454225820560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225868214, "dur": 78, "args": { "External id": 274535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274535, "pid": 5, "tid": 7, "ts": 1716454225868214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820628, "dur": 12, "args": { "External id": 274535, "cbid": 211, "correlation": 274535 } }, { "ph": "s", "id": 274535, "pid": 76337, "tid": -914061504, "ts": 1716454225820628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225820693, "dur": 0, "args": { "External id": 274547, "cbid": 317, "correlation": 274547 } }, { "ph": "f", "id": 274547, "pid": 76337, "tid": -914061504, "ts": 1716454225820693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225820694, "dur": 0, "args": { "External id": 274548, "cbid": 203, "correlation": 274548 } }, { "ph": "f", "id": 274548, "pid": 76337, "tid": -914061504, "ts": 1716454225820694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225820694, "dur": 0, "args": { "External id": 274549, "cbid": 205, "correlation": 274549 } }, { "ph": "f", "id": 274549, "pid": 76337, "tid": -914061504, "ts": 1716454225820694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225868292, "dur": 22, "args": { "External id": 274553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274553, "pid": 5, "tid": 7, "ts": 1716454225868292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820713, "dur": 12, "args": { "External id": 274553, "cbid": 211, "correlation": 274553 } }, { "ph": "s", "id": 274553, "pid": 76337, "tid": -914061504, "ts": 1716454225820713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225868315, "dur": 115, "args": { "External id": 274555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274555, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274555, "pid": 5, "tid": 7, "ts": 1716454225868315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820731, "dur": 7, "args": { "External id": 274555, "cbid": 211, "correlation": 274555 } }, { "ph": "s", "id": 274555, "pid": 76337, "tid": -914061504, "ts": 1716454225820731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225868431, "dur": 22, "args": { "External id": 274557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274557, "pid": 5, "tid": 7, "ts": 1716454225868431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820742, "dur": 5, "args": { "External id": 274557, "cbid": 211, "correlation": 274557 } }, { "ph": "s", "id": 274557, "pid": 76337, "tid": -914061504, "ts": 1716454225820742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225868455, "dur": 32, "args": { "External id": 274563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274563, "pid": 5, "tid": 7, "ts": 1716454225868455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820771, "dur": 8, "args": { "External id": 274563, "cbid": 211, "correlation": 274563 } }, { "ph": "s", "id": 274563, "pid": 76337, "tid": -914061504, "ts": 1716454225820771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225868488, "dur": 27, "args": { "External id": 274571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274571, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274571, "pid": 5, "tid": 7, "ts": 1716454225868488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820803, "dur": 9, "args": { "External id": 274571, "cbid": 211, "correlation": 274571 } }, { "ph": "s", "id": 274571, "pid": 76337, "tid": -914061504, "ts": 1716454225820803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225868516, "dur": 29, "args": { "External id": 274591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274591, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 274591, "pid": 5, "tid": 7, "ts": 1716454225868516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820875, "dur": 11, "args": { "External id": 274591, "cbid": 211, "correlation": 274591 } }, { "ph": "s", "id": 274591, "pid": 76337, "tid": -914061504, "ts": 1716454225820875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225868546, "dur": 4, "args": { "External id": 274603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274603, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 274603, "pid": 5, "tid": 7, "ts": 1716454225868546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820899, "dur": 7, "args": { "External id": 274603, "cbid": 211, "correlation": 274603 } }, { "ph": "s", "id": 274603, "pid": 76337, "tid": -914061504, "ts": 1716454225820899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225868552, "dur": 30, "args": { "External id": 274606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274606, "pid": 5, "tid": 7, "ts": 1716454225868552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820918, "dur": 7, "args": { "External id": 274606, "cbid": 211, "correlation": 274606 } }, { "ph": "s", "id": 274606, "pid": 76337, "tid": -914061504, "ts": 1716454225820918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225868583, "dur": 20, "args": { "External id": 274615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274615, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274615, "pid": 5, "tid": 7, "ts": 1716454225868583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225820958, "dur": 10, "args": { "External id": 274615, "cbid": 211, "correlation": 274615 } }, { "ph": "s", "id": 274615, "pid": 76337, "tid": -914061504, "ts": 1716454225820958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225821018, "dur": 0, "args": { "External id": 274625, "cbid": 317, "correlation": 274625 } }, { "ph": "f", "id": 274625, "pid": 76337, "tid": -914061504, "ts": 1716454225821018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225821019, "dur": 0, "args": { "External id": 274626, "cbid": 203, "correlation": 274626 } }, { "ph": "f", "id": 274626, "pid": 76337, "tid": -914061504, "ts": 1716454225821019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225821019, "dur": 0, "args": { "External id": 274627, "cbid": 205, "correlation": 274627 } }, { "ph": "f", "id": 274627, "pid": 76337, "tid": -914061504, "ts": 1716454225821019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225868605, "dur": 22, "args": { "External id": 274631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274631, "pid": 5, "tid": 7, "ts": 1716454225868605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821032, "dur": 12, "args": { "External id": 274631, "cbid": 211, "correlation": 274631 } }, { "ph": "s", "id": 274631, "pid": 76337, "tid": -914061504, "ts": 1716454225821032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225868628, "dur": 42, "args": { "External id": 274633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274633, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274633, "pid": 5, "tid": 7, "ts": 1716454225868628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821046, "dur": 5, "args": { "External id": 274633, "cbid": 211, "correlation": 274633 } }, { "ph": "s", "id": 274633, "pid": 76337, "tid": -914061504, "ts": 1716454225821046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225868671, "dur": 633, "args": { "External id": 274635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274635, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274635, "pid": 5, "tid": 7, "ts": 1716454225868671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821058, "dur": 6, "args": { "External id": 274635, "cbid": 211, "correlation": 274635 } }, { "ph": "s", "id": 274635, "pid": 76337, "tid": -914061504, "ts": 1716454225821058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225869306, "dur": 20, "args": { "External id": 274637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274637, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274637, "pid": 5, "tid": 7, "ts": 1716454225869306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821068, "dur": 5, "args": { "External id": 274637, "cbid": 211, "correlation": 274637 } }, { "ph": "s", "id": 274637, "pid": 76337, "tid": -914061504, "ts": 1716454225821068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225869328, "dur": 32, "args": { "External id": 274643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274643, "pid": 5, "tid": 7, "ts": 1716454225869328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821095, "dur": 9, "args": { "External id": 274643, "cbid": 211, "correlation": 274643 } }, { "ph": "s", "id": 274643, "pid": 76337, "tid": -914061504, "ts": 1716454225821095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225869361, "dur": 4, "args": { "External id": 274651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274651, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 274651, "pid": 5, "tid": 7, "ts": 1716454225869361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821138, "dur": 9, "args": { "External id": 274651, "cbid": 211, "correlation": 274651 } }, { "ph": "s", "id": 274651, "pid": 76337, "tid": -914061504, "ts": 1716454225821138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225821206, "dur": 1, "args": { "External id": 274667, "cbid": 251, "correlation": 274667 } }, { "ph": "f", "id": 274667, "pid": 76337, "tid": -914061504, "ts": 1716454225821206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225821211, "dur": 0, "args": { "External id": 274669, "cbid": 251, "correlation": 274669 } }, { "ph": "f", "id": 274669, "pid": 76337, "tid": -914061504, "ts": 1716454225821211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225869366, "dur": 12, "args": { "External id": 274670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274670, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 274670, "pid": 5, "tid": 7, "ts": 1716454225869366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821213, "dur": 11, "args": { "External id": 274670, "cbid": 211, "correlation": 274670 } }, { "ph": "s", "id": 274670, "pid": 76337, "tid": -914061504, "ts": 1716454225821213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225869380, "dur": 5, "args": { "External id": 274672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274672, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 274672, "pid": 5, "tid": 7, "ts": 1716454225869380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821225, "dur": 5, "args": { "External id": 274672, "cbid": 211, "correlation": 274672 } }, { "ph": "s", "id": 274672, "pid": 76337, "tid": -914061504, "ts": 1716454225821225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225869386, "dur": 29, "args": { "External id": 274682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274682, "pid": 5, "tid": 7, "ts": 1716454225869386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821282, "dur": 12, "args": { "External id": 274682, "cbid": 211, "correlation": 274682 } }, { "ph": "s", "id": 274682, "pid": 76337, "tid": -914061504, "ts": 1716454225821282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225869416, "dur": 30, "args": { "External id": 274702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274702, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 274702, "pid": 5, "tid": 7, "ts": 1716454225869416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821347, "dur": 11, "args": { "External id": 274702, "cbid": 211, "correlation": 274702 } }, { "ph": "s", "id": 274702, "pid": 76337, "tid": -914061504, "ts": 1716454225821347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225869447, "dur": 4, "args": { "External id": 274714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274714, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 274714, "pid": 5, "tid": 7, "ts": 1716454225869447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821370, "dur": 7, "args": { "External id": 274714, "cbid": 211, "correlation": 274714 } }, { "ph": "s", "id": 274714, "pid": 76337, "tid": -914061504, "ts": 1716454225821370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225869453, "dur": 29, "args": { "External id": 274717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274717, "pid": 5, "tid": 7, "ts": 1716454225869453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821390, "dur": 6, "args": { "External id": 274717, "cbid": 211, "correlation": 274717 } }, { "ph": "s", "id": 274717, "pid": 76337, "tid": -914061504, "ts": 1716454225821390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225869483, "dur": 21, "args": { "External id": 274726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274726, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274726, "pid": 5, "tid": 7, "ts": 1716454225869483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821431, "dur": 10, "args": { "External id": 274726, "cbid": 211, "correlation": 274726 } }, { "ph": "s", "id": 274726, "pid": 76337, "tid": -914061504, "ts": 1716454225821431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225821494, "dur": 0, "args": { "External id": 274736, "cbid": 317, "correlation": 274736 } }, { "ph": "f", "id": 274736, "pid": 76337, "tid": -914061504, "ts": 1716454225821494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225821495, "dur": 0, "args": { "External id": 274737, "cbid": 203, "correlation": 274737 } }, { "ph": "f", "id": 274737, "pid": 76337, "tid": -914061504, "ts": 1716454225821495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225821496, "dur": 0, "args": { "External id": 274738, "cbid": 205, "correlation": 274738 } }, { "ph": "f", "id": 274738, "pid": 76337, "tid": -914061504, "ts": 1716454225821496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225869505, "dur": 22, "args": { "External id": 274742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274742, "pid": 5, "tid": 7, "ts": 1716454225869505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821510, "dur": 12, "args": { "External id": 274742, "cbid": 211, "correlation": 274742 } }, { "ph": "s", "id": 274742, "pid": 76337, "tid": -914061504, "ts": 1716454225821510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225869528, "dur": 42, "args": { "External id": 274744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274744, "pid": 5, "tid": 7, "ts": 1716454225869528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821524, "dur": 5, "args": { "External id": 274744, "cbid": 211, "correlation": 274744 } }, { "ph": "s", "id": 274744, "pid": 76337, "tid": -914061504, "ts": 1716454225821524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225869572, "dur": 624, "args": { "External id": 274746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274746, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274746, "pid": 5, "tid": 7, "ts": 1716454225869572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821535, "dur": 5, "args": { "External id": 274746, "cbid": 211, "correlation": 274746 } }, { "ph": "s", "id": 274746, "pid": 76337, "tid": -914061504, "ts": 1716454225821535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225870197, "dur": 21, "args": { "External id": 274748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274748, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274748, "pid": 5, "tid": 7, "ts": 1716454225870197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821544, "dur": 5, "args": { "External id": 274748, "cbid": 211, "correlation": 274748 } }, { "ph": "s", "id": 274748, "pid": 76337, "tid": -914061504, "ts": 1716454225821544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225870220, "dur": 32, "args": { "External id": 274754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274754, "pid": 5, "tid": 7, "ts": 1716454225870220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821572, "dur": 8, "args": { "External id": 274754, "cbid": 211, "correlation": 274754 } }, { "ph": "s", "id": 274754, "pid": 76337, "tid": -914061504, "ts": 1716454225821572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225870254, "dur": 27, "args": { "External id": 274762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274762, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274762, "pid": 5, "tid": 7, "ts": 1716454225870254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821605, "dur": 8, "args": { "External id": 274762, "cbid": 211, "correlation": 274762 } }, { "ph": "s", "id": 274762, "pid": 76337, "tid": -914061504, "ts": 1716454225821605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225870282, "dur": 20, "args": { "External id": 274770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274770, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274770, "pid": 5, "tid": 7, "ts": 1716454225870282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821633, "dur": 8, "args": { "External id": 274770, "cbid": 211, "correlation": 274770 } }, { "ph": "s", "id": 274770, "pid": 76337, "tid": -914061504, "ts": 1716454225821633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225870303, "dur": 29, "args": { "External id": 274790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274790, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 274790, "pid": 5, "tid": 7, "ts": 1716454225870303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821712, "dur": 12, "args": { "External id": 274790, "cbid": 211, "correlation": 274790 } }, { "ph": "s", "id": 274790, "pid": 76337, "tid": -914061504, "ts": 1716454225821712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225870333, "dur": 4, "args": { "External id": 274802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274802, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 274802, "pid": 5, "tid": 7, "ts": 1716454225870333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821733, "dur": 6, "args": { "External id": 274802, "cbid": 211, "correlation": 274802 } }, { "ph": "s", "id": 274802, "pid": 76337, "tid": -914061504, "ts": 1716454225821733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225870338, "dur": 29, "args": { "External id": 274805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274805, "pid": 5, "tid": 7, "ts": 1716454225870338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821751, "dur": 6, "args": { "External id": 274805, "cbid": 211, "correlation": 274805 } }, { "ph": "s", "id": 274805, "pid": 76337, "tid": -914061504, "ts": 1716454225821751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225821809, "dur": 0, "args": { "External id": 274816, "cbid": 317, "correlation": 274816 } }, { "ph": "f", "id": 274816, "pid": 76337, "tid": -914061504, "ts": 1716454225821809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225821809, "dur": 0, "args": { "External id": 274817, "cbid": 203, "correlation": 274817 } }, { "ph": "f", "id": 274817, "pid": 76337, "tid": -914061504, "ts": 1716454225821809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225821810, "dur": 0, "args": { "External id": 274818, "cbid": 205, "correlation": 274818 } }, { "ph": "f", "id": 274818, "pid": 76337, "tid": -914061504, "ts": 1716454225821810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225870368, "dur": 22, "args": { "External id": 274822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274822, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274822, "pid": 5, "tid": 7, "ts": 1716454225870368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821823, "dur": 11, "args": { "External id": 274822, "cbid": 211, "correlation": 274822 } }, { "ph": "s", "id": 274822, "pid": 76337, "tid": -914061504, "ts": 1716454225821823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225870391, "dur": 110, "args": { "External id": 274824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274824, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274824, "pid": 5, "tid": 7, "ts": 1716454225870391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821840, "dur": 6, "args": { "External id": 274824, "cbid": 211, "correlation": 274824 } }, { "ph": "s", "id": 274824, "pid": 76337, "tid": -914061504, "ts": 1716454225821840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225870503, "dur": 21, "args": { "External id": 274826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274826, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274826, "pid": 5, "tid": 7, "ts": 1716454225870503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821850, "dur": 5, "args": { "External id": 274826, "cbid": 211, "correlation": 274826 } }, { "ph": "s", "id": 274826, "pid": 76337, "tid": -914061504, "ts": 1716454225821850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225870525, "dur": 31, "args": { "External id": 274832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274832, "pid": 5, "tid": 7, "ts": 1716454225870525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821877, "dur": 8, "args": { "External id": 274832, "cbid": 211, "correlation": 274832 } }, { "ph": "s", "id": 274832, "pid": 76337, "tid": -914061504, "ts": 1716454225821877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225870557, "dur": 184, "args": { "External id": 274841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274841, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274841, "pid": 5, "tid": 7, "ts": 1716454225870557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225821958, "dur": 14, "args": { "External id": 274841, "cbid": 211, "correlation": 274841 } }, { "ph": "s", "id": 274841, "pid": 76337, "tid": -914061504, "ts": 1716454225821958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225870743, "dur": 63, "args": { "External id": 274863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274863, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274863, "pid": 5, "tid": 7, "ts": 1716454225870743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822026, "dur": 11, "args": { "External id": 274863, "cbid": 211, "correlation": 274863 } }, { "ph": "s", "id": 274863, "pid": 76337, "tid": -914061504, "ts": 1716454225822026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822115, "dur": 1, "args": { "External id": 274874, "cbid": 251, "correlation": 274874 } }, { "ph": "f", "id": 274874, "pid": 76337, "tid": -914061504, "ts": 1716454225822115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225870807, "dur": 151, "args": { "External id": 274875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274875, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274875, "pid": 5, "tid": 7, "ts": 1716454225870807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822120, "dur": 12, "args": { "External id": 274875, "cbid": 211, "correlation": 274875 } }, { "ph": "s", "id": 274875, "pid": 76337, "tid": -914061504, "ts": 1716454225822120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822193, "dur": 1, "args": { "External id": 274886, "cbid": 251, "correlation": 274886 } }, { "ph": "f", "id": 274886, "pid": 76337, "tid": -914061504, "ts": 1716454225822193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225870960, "dur": 143, "args": { "External id": 274887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274887, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274887, "pid": 5, "tid": 7, "ts": 1716454225870960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822198, "dur": 11, "args": { "External id": 274887, "cbid": 211, "correlation": 274887 } }, { "ph": "s", "id": 274887, "pid": 76337, "tid": -914061504, "ts": 1716454225822198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822263, "dur": 1, "args": { "External id": 274898, "cbid": 251, "correlation": 274898 } }, { "ph": "f", "id": 274898, "pid": 76337, "tid": -914061504, "ts": 1716454225822263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225871104, "dur": 144, "args": { "External id": 274899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274899, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 274899, "pid": 5, "tid": 7, "ts": 1716454225871104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822267, "dur": 11, "args": { "External id": 274899, "cbid": 211, "correlation": 274899 } }, { "ph": "s", "id": 274899, "pid": 76337, "tid": -914061504, "ts": 1716454225822267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225871249, "dur": 1855, "args": { "External id": 274920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274920, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 274920, "pid": 5, "tid": 7, "ts": 1716454225871249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822349, "dur": 12, "args": { "External id": 274920, "cbid": 211, "correlation": 274920 } }, { "ph": "s", "id": 274920, "pid": 76337, "tid": -914061504, "ts": 1716454225822349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822446, "dur": 1, "args": { "External id": 274938, "cbid": 251, "correlation": 274938 } }, { "ph": "f", "id": 274938, "pid": 76337, "tid": -914061504, "ts": 1716454225822446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225873105, "dur": 143, "args": { "External id": 274940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274940, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 274940, "pid": 5, "tid": 7, "ts": 1716454225873105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822452, "dur": 13, "args": { "External id": 274940, "cbid": 211, "correlation": 274940 } }, { "ph": "s", "id": 274940, "pid": 76337, "tid": -914061504, "ts": 1716454225822452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225873250, "dur": 35, "args": { "External id": 274948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274948, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274948, "pid": 5, "tid": 7, "ts": 1716454225873250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822522, "dur": 12, "args": { "External id": 274948, "cbid": 211, "correlation": 274948 } }, { "ph": "s", "id": 274948, "pid": 76337, "tid": -914061504, "ts": 1716454225822522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225873286, "dur": 51, "args": { "External id": 274956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274956, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274956, "pid": 5, "tid": 7, "ts": 1716454225873286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822561, "dur": 9, "args": { "External id": 274956, "cbid": 211, "correlation": 274956 } }, { "ph": "s", "id": 274956, "pid": 76337, "tid": -914061504, "ts": 1716454225822561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225873338, "dur": 29, "args": { "External id": 274967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274967, "pid": 5, "tid": 7, "ts": 1716454225873338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822636, "dur": 14, "args": { "External id": 274967, "cbid": 211, "correlation": 274967 } }, { "ph": "s", "id": 274967, "pid": 76337, "tid": -914061504, "ts": 1716454225822636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225873368, "dur": 33, "args": { "External id": 274989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 274989, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 274989, "pid": 5, "tid": 7, "ts": 1716454225873368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822668, "dur": 8, "args": { "External id": 274989, "cbid": 211, "correlation": 274989 } }, { "ph": "s", "id": 274989, "pid": 76337, "tid": -914061504, "ts": 1716454225822668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822755, "dur": 1, "args": { "External id": 275000, "cbid": 251, "correlation": 275000 } }, { "ph": "f", "id": 275000, "pid": 76337, "tid": -914061504, "ts": 1716454225822755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225873403, "dur": 84, "args": { "External id": 275001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275001, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275001, "pid": 5, "tid": 7, "ts": 1716454225873403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822760, "dur": 12, "args": { "External id": 275001, "cbid": 211, "correlation": 275001 } }, { "ph": "s", "id": 275001, "pid": 76337, "tid": -914061504, "ts": 1716454225822760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822829, "dur": 1, "args": { "External id": 275012, "cbid": 251, "correlation": 275012 } }, { "ph": "f", "id": 275012, "pid": 76337, "tid": -914061504, "ts": 1716454225822829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822833, "dur": 0, "args": { "External id": 275013, "cbid": 251, "correlation": 275013 } }, { "ph": "f", "id": 275013, "pid": 76337, "tid": -914061504, "ts": 1716454225822833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225873488, "dur": 11, "args": { "External id": 275014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275014, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 275014, "pid": 5, "tid": 7, "ts": 1716454225873488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822834, "dur": 13, "args": { "External id": 275014, "cbid": 211, "correlation": 275014 } }, { "ph": "s", "id": 275014, "pid": 76337, "tid": -914061504, "ts": 1716454225822834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225873501, "dur": 5, "args": { "External id": 275016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275016, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 275016, "pid": 5, "tid": 7, "ts": 1716454225873501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822849, "dur": 6, "args": { "External id": 275016, "cbid": 211, "correlation": 275016 } }, { "ph": "s", "id": 275016, "pid": 76337, "tid": -914061504, "ts": 1716454225822849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822906, "dur": 1, "args": { "External id": 275027, "cbid": 251, "correlation": 275027 } }, { "ph": "f", "id": 275027, "pid": 76337, "tid": -914061504, "ts": 1716454225822906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225822910, "dur": 0, "args": { "External id": 275028, "cbid": 251, "correlation": 275028 } }, { "ph": "f", "id": 275028, "pid": 76337, "tid": -914061504, "ts": 1716454225822910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225873508, "dur": 7, "args": { "External id": 275029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275029, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 275029, "pid": 5, "tid": 7, "ts": 1716454225873508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822911, "dur": 11, "args": { "External id": 275029, "cbid": 211, "correlation": 275029 } }, { "ph": "s", "id": 275029, "pid": 76337, "tid": -914061504, "ts": 1716454225822911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225873516, "dur": 3, "args": { "External id": 275031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275031, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 275031, "pid": 5, "tid": 7, "ts": 1716454225873516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225822924, "dur": 5, "args": { "External id": 275031, "cbid": 211, "correlation": 275031 } }, { "ph": "s", "id": 275031, "pid": 76337, "tid": -914061504, "ts": 1716454225822924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225873520, "dur": 88, "args": { "External id": 275052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275052, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 275052, "pid": 5, "tid": 7, "ts": 1716454225873520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823005, "dur": 13, "args": { "External id": 275052, "cbid": 211, "correlation": 275052 } }, { "ph": "s", "id": 275052, "pid": 76337, "tid": -914061504, "ts": 1716454225823005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225823104, "dur": 1, "args": { "External id": 275070, "cbid": 251, "correlation": 275070 } }, { "ph": "f", "id": 275070, "pid": 76337, "tid": -914061504, "ts": 1716454225823104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225873609, "dur": 95, "args": { "External id": 275072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275072, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275072, "pid": 5, "tid": 7, "ts": 1716454225873609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823110, "dur": 13, "args": { "External id": 275072, "cbid": 211, "correlation": 275072 } }, { "ph": "s", "id": 275072, "pid": 76337, "tid": -914061504, "ts": 1716454225823110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225873705, "dur": 19, "args": { "External id": 275080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275080, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275080, "pid": 5, "tid": 7, "ts": 1716454225873705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823179, "dur": 12, "args": { "External id": 275080, "cbid": 211, "correlation": 275080 } }, { "ph": "s", "id": 275080, "pid": 76337, "tid": -914061504, "ts": 1716454225823179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225873725, "dur": 37, "args": { "External id": 275088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275088, "pid": 5, "tid": 7, "ts": 1716454225873725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823222, "dur": 9, "args": { "External id": 275088, "cbid": 211, "correlation": 275088 } }, { "ph": "s", "id": 275088, "pid": 76337, "tid": -914061504, "ts": 1716454225823222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225873764, "dur": 33, "args": { "External id": 275110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275110, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275110, "pid": 5, "tid": 7, "ts": 1716454225873764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823276, "dur": 10, "args": { "External id": 275110, "cbid": 211, "correlation": 275110 } }, { "ph": "s", "id": 275110, "pid": 76337, "tid": -914061504, "ts": 1716454225823276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225823369, "dur": 1, "args": { "External id": 275126, "cbid": 251, "correlation": 275126 } }, { "ph": "f", "id": 275126, "pid": 76337, "tid": -914061504, "ts": 1716454225823369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225823373, "dur": 0, "args": { "External id": 275128, "cbid": 251, "correlation": 275128 } }, { "ph": "f", "id": 275128, "pid": 76337, "tid": -914061504, "ts": 1716454225823373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225873798, "dur": 523, "args": { "External id": 275129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275129, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275129, "pid": 5, "tid": 7, "ts": 1716454225873798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823377, "dur": 13, "args": { "External id": 275129, "cbid": 211, "correlation": 275129 } }, { "ph": "s", "id": 275129, "pid": 76337, "tid": -914061504, "ts": 1716454225823377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225874322, "dur": 123, "args": { "External id": 275137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275137, "pid": 5, "tid": 7, "ts": 1716454225874322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823443, "dur": 12, "args": { "External id": 275137, "cbid": 211, "correlation": 275137 } }, { "ph": "s", "id": 275137, "pid": 76337, "tid": -914061504, "ts": 1716454225823443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225874447, "dur": 125, "args": { "External id": 275145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275145, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275145, "pid": 5, "tid": 7, "ts": 1716454225874447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823473, "dur": 8, "args": { "External id": 275145, "cbid": 211, "correlation": 275145 } }, { "ph": "s", "id": 275145, "pid": 76337, "tid": -914061504, "ts": 1716454225823473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225823553, "dur": 1, "args": { "External id": 275161, "cbid": 251, "correlation": 275161 } }, { "ph": "f", "id": 275161, "pid": 76337, "tid": -914061504, "ts": 1716454225823553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225874573, "dur": 300, "args": { "External id": 275163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275163, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275163, "pid": 5, "tid": 7, "ts": 1716454225874573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823558, "dur": 12, "args": { "External id": 275163, "cbid": 211, "correlation": 275163 } }, { "ph": "s", "id": 275163, "pid": 76337, "tid": -914061504, "ts": 1716454225823558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225874874, "dur": 27, "args": { "External id": 275171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275171, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275171, "pid": 5, "tid": 7, "ts": 1716454225874874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823602, "dur": 10, "args": { "External id": 275171, "cbid": 211, "correlation": 275171 } }, { "ph": "s", "id": 275171, "pid": 76337, "tid": -914061504, "ts": 1716454225823602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225874902, "dur": 78, "args": { "External id": 275182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275182, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275182, "pid": 5, "tid": 7, "ts": 1716454225874902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823668, "dur": 12, "args": { "External id": 275182, "cbid": 211, "correlation": 275182 } }, { "ph": "s", "id": 275182, "pid": 76337, "tid": -914061504, "ts": 1716454225823668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225823732, "dur": 0, "args": { "External id": 275194, "cbid": 317, "correlation": 275194 } }, { "ph": "f", "id": 275194, "pid": 76337, "tid": -914061504, "ts": 1716454225823732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225823733, "dur": 0, "args": { "External id": 275195, "cbid": 203, "correlation": 275195 } }, { "ph": "f", "id": 275195, "pid": 76337, "tid": -914061504, "ts": 1716454225823733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225823734, "dur": 0, "args": { "External id": 275196, "cbid": 205, "correlation": 275196 } }, { "ph": "f", "id": 275196, "pid": 76337, "tid": -914061504, "ts": 1716454225823734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225874982, "dur": 23, "args": { "External id": 275200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275200, "pid": 5, "tid": 7, "ts": 1716454225874982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823752, "dur": 12, "args": { "External id": 275200, "cbid": 211, "correlation": 275200 } }, { "ph": "s", "id": 275200, "pid": 76337, "tid": -914061504, "ts": 1716454225823752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225875007, "dur": 115, "args": { "External id": 275202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275202, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275202, "pid": 5, "tid": 7, "ts": 1716454225875007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823770, "dur": 7, "args": { "External id": 275202, "cbid": 211, "correlation": 275202 } }, { "ph": "s", "id": 275202, "pid": 76337, "tid": -914061504, "ts": 1716454225823770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875123, "dur": 22, "args": { "External id": 275204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275204, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275204, "pid": 5, "tid": 7, "ts": 1716454225875123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823781, "dur": 6, "args": { "External id": 275204, "cbid": 211, "correlation": 275204 } }, { "ph": "s", "id": 275204, "pid": 76337, "tid": -914061504, "ts": 1716454225823781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225875147, "dur": 31, "args": { "External id": 275210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275210, "pid": 5, "tid": 7, "ts": 1716454225875147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823809, "dur": 8, "args": { "External id": 275210, "cbid": 211, "correlation": 275210 } }, { "ph": "s", "id": 275210, "pid": 76337, "tid": -914061504, "ts": 1716454225823809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225875179, "dur": 27, "args": { "External id": 275218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275218, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275218, "pid": 5, "tid": 7, "ts": 1716454225875179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823842, "dur": 8, "args": { "External id": 275218, "cbid": 211, "correlation": 275218 } }, { "ph": "s", "id": 275218, "pid": 76337, "tid": -914061504, "ts": 1716454225823842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225823919, "dur": 0, "args": { "External id": 275228, "cbid": 317, "correlation": 275228 } }, { "ph": "f", "id": 275228, "pid": 76337, "tid": -914061504, "ts": 1716454225823919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225823919, "dur": 0, "args": { "External id": 275229, "cbid": 203, "correlation": 275229 } }, { "ph": "f", "id": 275229, "pid": 76337, "tid": -914061504, "ts": 1716454225823919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225823920, "dur": 0, "args": { "External id": 275230, "cbid": 205, "correlation": 275230 } }, { "ph": "f", "id": 275230, "pid": 76337, "tid": -914061504, "ts": 1716454225823920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875207, "dur": 23, "args": { "External id": 275234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275234, "pid": 5, "tid": 7, "ts": 1716454225875207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823935, "dur": 13, "args": { "External id": 275234, "cbid": 211, "correlation": 275234 } }, { "ph": "s", "id": 275234, "pid": 76337, "tid": -914061504, "ts": 1716454225823935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875232, "dur": 42, "args": { "External id": 275236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275236, "pid": 5, "tid": 7, "ts": 1716454225875232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823950, "dur": 5, "args": { "External id": 275236, "cbid": 211, "correlation": 275236 } }, { "ph": "s", "id": 275236, "pid": 76337, "tid": -914061504, "ts": 1716454225823950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225875275, "dur": 227, "args": { "External id": 275238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275238, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 275238, "pid": 5, "tid": 7, "ts": 1716454225875275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823962, "dur": 7, "args": { "External id": 275238, "cbid": 211, "correlation": 275238 } }, { "ph": "s", "id": 275238, "pid": 76337, "tid": -914061504, "ts": 1716454225823962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875504, "dur": 6, "args": { "External id": 275240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275240, "pid": 5, "tid": 7, "ts": 1716454225875504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225823973, "dur": 13, "args": { "External id": 275240, "cbid": 211, "correlation": 275240 } }, { "ph": "s", "id": 275240, "pid": 76337, "tid": -914061504, "ts": 1716454225823973, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225875511, "dur": 9, "args": { "External id": 275246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275246, "pid": 5, "tid": 7, "ts": 1716454225875511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824008, "dur": 8, "args": { "External id": 275246, "cbid": 211, "correlation": 275246 } }, { "ph": "s", "id": 275246, "pid": 76337, "tid": -914061504, "ts": 1716454225824008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225875522, "dur": 11, "args": { "External id": 275266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275266, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 275266, "pid": 5, "tid": 7, "ts": 1716454225875522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824101, "dur": 12, "args": { "External id": 275266, "cbid": 211, "correlation": 275266 } }, { "ph": "s", "id": 275266, "pid": 76337, "tid": -914061504, "ts": 1716454225824101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225875534, "dur": 4, "args": { "External id": 275278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275278, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 275278, "pid": 5, "tid": 7, "ts": 1716454225875534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824127, "dur": 7, "args": { "External id": 275278, "cbid": 211, "correlation": 275278 } }, { "ph": "s", "id": 275278, "pid": 76337, "tid": -914061504, "ts": 1716454225824127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225875540, "dur": 12, "args": { "External id": 275281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275281, "pid": 5, "tid": 7, "ts": 1716454225875540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824146, "dur": 7, "args": { "External id": 275281, "cbid": 211, "correlation": 275281 } }, { "ph": "s", "id": 275281, "pid": 76337, "tid": -914061504, "ts": 1716454225824146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225875553, "dur": 6, "args": { "External id": 275290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275290, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275290, "pid": 5, "tid": 7, "ts": 1716454225875553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824186, "dur": 9, "args": { "External id": 275290, "cbid": 211, "correlation": 275290 } }, { "ph": "s", "id": 275290, "pid": 76337, "tid": -914061504, "ts": 1716454225824186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225824239, "dur": 0, "args": { "External id": 275300, "cbid": 317, "correlation": 275300 } }, { "ph": "f", "id": 275300, "pid": 76337, "tid": -914061504, "ts": 1716454225824239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225824239, "dur": 0, "args": { "External id": 275301, "cbid": 203, "correlation": 275301 } }, { "ph": "f", "id": 275301, "pid": 76337, "tid": -914061504, "ts": 1716454225824239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225824240, "dur": 0, "args": { "External id": 275302, "cbid": 205, "correlation": 275302 } }, { "ph": "f", "id": 275302, "pid": 76337, "tid": -914061504, "ts": 1716454225824240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875560, "dur": 5, "args": { "External id": 275306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275306, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275306, "pid": 5, "tid": 7, "ts": 1716454225875560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824259, "dur": 11, "args": { "External id": 275306, "cbid": 211, "correlation": 275306 } }, { "ph": "s", "id": 275306, "pid": 76337, "tid": -914061504, "ts": 1716454225824259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225875567, "dur": 80, "args": { "External id": 275308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275308, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275308, "pid": 5, "tid": 7, "ts": 1716454225875567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824274, "dur": 5, "args": { "External id": 275308, "cbid": 211, "correlation": 275308 } }, { "ph": "s", "id": 275308, "pid": 76337, "tid": -914061504, "ts": 1716454225824274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225875649, "dur": 1, "args": { "External id": 275310, "device": 5, "context": 1, "stream": 7, "correlation": 275310, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 275310, "pid": 5, "tid": 7, "ts": 1716454225875649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225824287, "dur": 8, "args": { "External id": 275310, "cbid": 51, "correlation": 275310 } }, { "ph": "s", "id": 275310, "pid": 76337, "tid": -914061504, "ts": 1716454225824287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225875652, "dur": 525, "args": { "External id": 275311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275311, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275311, "pid": 5, "tid": 7, "ts": 1716454225875652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824299, "dur": 9, "args": { "External id": 275311, "cbid": 211, "correlation": 275311 } }, { "ph": "s", "id": 275311, "pid": 76337, "tid": -914061504, "ts": 1716454225824299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225876179, "dur": 11, "args": { "External id": 275313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275313, "pid": 5, "tid": 7, "ts": 1716454225876179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824312, "dur": 5, "args": { "External id": 275313, "cbid": 211, "correlation": 275313 } }, { "ph": "s", "id": 275313, "pid": 76337, "tid": -914061504, "ts": 1716454225824312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225876191, "dur": 14, "args": { "External id": 275319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275319, "pid": 5, "tid": 7, "ts": 1716454225876191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824340, "dur": 8, "args": { "External id": 275319, "cbid": 211, "correlation": 275319 } }, { "ph": "s", "id": 275319, "pid": 76337, "tid": -914061504, "ts": 1716454225824340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225876206, "dur": 4, "args": { "External id": 275327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275327, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 275327, "pid": 5, "tid": 7, "ts": 1716454225876206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824384, "dur": 9, "args": { "External id": 275327, "cbid": 211, "correlation": 275327 } }, { "ph": "s", "id": 275327, "pid": 76337, "tid": -914061504, "ts": 1716454225824384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225824452, "dur": 1, "args": { "External id": 275343, "cbid": 251, "correlation": 275343 } }, { "ph": "f", "id": 275343, "pid": 76337, "tid": -914061504, "ts": 1716454225824452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225824457, "dur": 0, "args": { "External id": 275345, "cbid": 251, "correlation": 275345 } }, { "ph": "f", "id": 275345, "pid": 76337, "tid": -914061504, "ts": 1716454225824457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225876211, "dur": 13, "args": { "External id": 275346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275346, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275346, "pid": 5, "tid": 7, "ts": 1716454225876211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824459, "dur": 12, "args": { "External id": 275346, "cbid": 211, "correlation": 275346 } }, { "ph": "s", "id": 275346, "pid": 76337, "tid": -914061504, "ts": 1716454225824459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225876226, "dur": 5, "args": { "External id": 275348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275348, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275348, "pid": 5, "tid": 7, "ts": 1716454225876226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824472, "dur": 6, "args": { "External id": 275348, "cbid": 211, "correlation": 275348 } }, { "ph": "s", "id": 275348, "pid": 76337, "tid": -914061504, "ts": 1716454225824472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225876232, "dur": 17, "args": { "External id": 275358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275358, "pid": 5, "tid": 7, "ts": 1716454225876232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824531, "dur": 12, "args": { "External id": 275358, "cbid": 211, "correlation": 275358 } }, { "ph": "s", "id": 275358, "pid": 76337, "tid": -914061504, "ts": 1716454225824531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225876250, "dur": 18, "args": { "External id": 275378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275378, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 275378, "pid": 5, "tid": 7, "ts": 1716454225876250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824596, "dur": 10, "args": { "External id": 275378, "cbid": 211, "correlation": 275378 } }, { "ph": "s", "id": 275378, "pid": 76337, "tid": -914061504, "ts": 1716454225824596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225876270, "dur": 4, "args": { "External id": 275390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275390, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 275390, "pid": 5, "tid": 7, "ts": 1716454225876270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824620, "dur": 7, "args": { "External id": 275390, "cbid": 211, "correlation": 275390 } }, { "ph": "s", "id": 275390, "pid": 76337, "tid": -914061504, "ts": 1716454225824620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225876276, "dur": 16, "args": { "External id": 275393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275393, "pid": 5, "tid": 7, "ts": 1716454225876276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824641, "dur": 6, "args": { "External id": 275393, "cbid": 211, "correlation": 275393 } }, { "ph": "s", "id": 275393, "pid": 76337, "tid": -914061504, "ts": 1716454225824641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225876293, "dur": 10, "args": { "External id": 275402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275402, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275402, "pid": 5, "tid": 7, "ts": 1716454225876293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824682, "dur": 9, "args": { "External id": 275402, "cbid": 211, "correlation": 275402 } }, { "ph": "s", "id": 275402, "pid": 76337, "tid": -914061504, "ts": 1716454225824682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225824745, "dur": 0, "args": { "External id": 275412, "cbid": 317, "correlation": 275412 } }, { "ph": "f", "id": 275412, "pid": 76337, "tid": -914061504, "ts": 1716454225824745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225824746, "dur": 0, "args": { "External id": 275413, "cbid": 203, "correlation": 275413 } }, { "ph": "f", "id": 275413, "pid": 76337, "tid": -914061504, "ts": 1716454225824746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225824747, "dur": 0, "args": { "External id": 275414, "cbid": 205, "correlation": 275414 } }, { "ph": "f", "id": 275414, "pid": 76337, "tid": -914061504, "ts": 1716454225824747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225876305, "dur": 11, "args": { "External id": 275418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275418, "pid": 5, "tid": 7, "ts": 1716454225876305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824764, "dur": 13, "args": { "External id": 275418, "cbid": 211, "correlation": 275418 } }, { "ph": "s", "id": 275418, "pid": 76337, "tid": -914061504, "ts": 1716454225824764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225876317, "dur": 156, "args": { "External id": 275420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275420, "pid": 5, "tid": 7, "ts": 1716454225876317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824779, "dur": 5, "args": { "External id": 275420, "cbid": 211, "correlation": 275420 } }, { "ph": "s", "id": 275420, "pid": 76337, "tid": -914061504, "ts": 1716454225824779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225876475, "dur": 1, "args": { "External id": 275422, "device": 5, "context": 1, "stream": 7, "correlation": 275422, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 275422, "pid": 5, "tid": 7, "ts": 1716454225876475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225824791, "dur": 7, "args": { "External id": 275422, "cbid": 51, "correlation": 275422 } }, { "ph": "s", "id": 275422, "pid": 76337, "tid": -914061504, "ts": 1716454225824791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225876479, "dur": 647, "args": { "External id": 275423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275423, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275423, "pid": 5, "tid": 7, "ts": 1716454225876479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824799, "dur": 6, "args": { "External id": 275423, "cbid": 211, "correlation": 275423 } }, { "ph": "s", "id": 275423, "pid": 76337, "tid": -914061504, "ts": 1716454225824799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225877127, "dur": 13, "args": { "External id": 275425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275425, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275425, "pid": 5, "tid": 7, "ts": 1716454225877127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824810, "dur": 5, "args": { "External id": 275425, "cbid": 211, "correlation": 275425 } }, { "ph": "s", "id": 275425, "pid": 76337, "tid": -914061504, "ts": 1716454225824810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225877141, "dur": 14, "args": { "External id": 275431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275431, "pid": 5, "tid": 7, "ts": 1716454225877141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824839, "dur": 8, "args": { "External id": 275431, "cbid": 211, "correlation": 275431 } }, { "ph": "s", "id": 275431, "pid": 76337, "tid": -914061504, "ts": 1716454225824839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225824896, "dur": 0, "args": { "External id": 275441, "cbid": 317, "correlation": 275441 } }, { "ph": "f", "id": 275441, "pid": 76337, "tid": -914061504, "ts": 1716454225824896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225824897, "dur": 0, "args": { "External id": 275442, "cbid": 203, "correlation": 275442 } }, { "ph": "f", "id": 275442, "pid": 76337, "tid": -914061504, "ts": 1716454225824897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225824898, "dur": 0, "args": { "External id": 275443, "cbid": 205, "correlation": 275443 } }, { "ph": "f", "id": 275443, "pid": 76337, "tid": -914061504, "ts": 1716454225824898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225877156, "dur": 8, "args": { "External id": 275447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275447, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275447, "pid": 5, "tid": 7, "ts": 1716454225877156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824913, "dur": 12, "args": { "External id": 275447, "cbid": 211, "correlation": 275447 } }, { "ph": "s", "id": 275447, "pid": 76337, "tid": -914061504, "ts": 1716454225824913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225877166, "dur": 3, "args": { "External id": 275449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275449, "pid": 5, "tid": 7, "ts": 1716454225877166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824932, "dur": 6, "args": { "External id": 275449, "cbid": 211, "correlation": 275449 } }, { "ph": "s", "id": 275449, "pid": 76337, "tid": -914061504, "ts": 1716454225824932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225824941, "dur": 0, "args": { "External id": 275450, "cbid": 51, "correlation": 275450 } }, { "ph": "s", "id": 275450, "pid": 76337, "tid": -914061504, "ts": 1716454225824941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225877170, "dur": 54, "args": { "External id": 275451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275451, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 275451, "pid": 5, "tid": 7, "ts": 1716454225877170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824942, "dur": 5, "args": { "External id": 275451, "cbid": 211, "correlation": 275451 } }, { "ph": "s", "id": 275451, "pid": 76337, "tid": -914061504, "ts": 1716454225824942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225877226, "dur": 13, "args": { "External id": 275456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275456, "pid": 5, "tid": 7, "ts": 1716454225877226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225824970, "dur": 17, "args": { "External id": 275456, "cbid": 211, "correlation": 275456 } }, { "ph": "s", "id": 275456, "pid": 76337, "tid": -914061504, "ts": 1716454225824970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225877241, "dur": 12, "args": { "External id": 275464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275464, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275464, "pid": 5, "tid": 7, "ts": 1716454225877241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825009, "dur": 8, "args": { "External id": 275464, "cbid": 211, "correlation": 275464 } }, { "ph": "s", "id": 275464, "pid": 76337, "tid": -914061504, "ts": 1716454225825009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225877254, "dur": 10, "args": { "External id": 275472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275472, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275472, "pid": 5, "tid": 7, "ts": 1716454225877254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825038, "dur": 8, "args": { "External id": 275472, "cbid": 211, "correlation": 275472 } }, { "ph": "s", "id": 275472, "pid": 76337, "tid": -914061504, "ts": 1716454225825038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225877265, "dur": 18, "args": { "External id": 275492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275492, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 275492, "pid": 5, "tid": 7, "ts": 1716454225877265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825119, "dur": 13, "args": { "External id": 275492, "cbid": 211, "correlation": 275492 } }, { "ph": "s", "id": 275492, "pid": 76337, "tid": -914061504, "ts": 1716454225825119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225877284, "dur": 4, "args": { "External id": 275504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275504, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 275504, "pid": 5, "tid": 7, "ts": 1716454225877284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825142, "dur": 6, "args": { "External id": 275504, "cbid": 211, "correlation": 275504 } }, { "ph": "s", "id": 275504, "pid": 76337, "tid": -914061504, "ts": 1716454225825142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225877289, "dur": 16, "args": { "External id": 275507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275507, "pid": 5, "tid": 7, "ts": 1716454225877289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825159, "dur": 6, "args": { "External id": 275507, "cbid": 211, "correlation": 275507 } }, { "ph": "s", "id": 275507, "pid": 76337, "tid": -914061504, "ts": 1716454225825159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225825216, "dur": 0, "args": { "External id": 275518, "cbid": 317, "correlation": 275518 } }, { "ph": "f", "id": 275518, "pid": 76337, "tid": -914061504, "ts": 1716454225825216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225825216, "dur": 0, "args": { "External id": 275519, "cbid": 203, "correlation": 275519 } }, { "ph": "f", "id": 275519, "pid": 76337, "tid": -914061504, "ts": 1716454225825216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225825217, "dur": 0, "args": { "External id": 275520, "cbid": 205, "correlation": 275520 } }, { "ph": "f", "id": 275520, "pid": 76337, "tid": -914061504, "ts": 1716454225825217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225877307, "dur": 11, "args": { "External id": 275524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275524, "pid": 5, "tid": 7, "ts": 1716454225877307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825234, "dur": 11, "args": { "External id": 275524, "cbid": 211, "correlation": 275524 } }, { "ph": "s", "id": 275524, "pid": 76337, "tid": -914061504, "ts": 1716454225825234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225877320, "dur": 3, "args": { "External id": 275526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275526, "pid": 5, "tid": 7, "ts": 1716454225877320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825251, "dur": 6, "args": { "External id": 275526, "cbid": 211, "correlation": 275526 } }, { "ph": "s", "id": 275526, "pid": 76337, "tid": -914061504, "ts": 1716454225825251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225825259, "dur": 0, "args": { "External id": 275527, "cbid": 51, "correlation": 275527 } }, { "ph": "s", "id": 275527, "pid": 76337, "tid": -914061504, "ts": 1716454225825259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225877324, "dur": 94, "args": { "External id": 275528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275528, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 275528, "pid": 5, "tid": 7, "ts": 1716454225877324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825260, "dur": 6, "args": { "External id": 275528, "cbid": 211, "correlation": 275528 } }, { "ph": "s", "id": 275528, "pid": 76337, "tid": -914061504, "ts": 1716454225825260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225877420, "dur": 15, "args": { "External id": 275533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275533, "pid": 5, "tid": 7, "ts": 1716454225877420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825289, "dur": 9, "args": { "External id": 275533, "cbid": 211, "correlation": 275533 } }, { "ph": "s", "id": 275533, "pid": 76337, "tid": -914061504, "ts": 1716454225825289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225877436, "dur": 80, "args": { "External id": 275542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275542, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275542, "pid": 5, "tid": 7, "ts": 1716454225877436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825371, "dur": 14, "args": { "External id": 275542, "cbid": 211, "correlation": 275542 } }, { "ph": "s", "id": 275542, "pid": 76337, "tid": -914061504, "ts": 1716454225825371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225877518, "dur": 30, "args": { "External id": 275564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275564, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275564, "pid": 5, "tid": 7, "ts": 1716454225877518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825428, "dur": 10, "args": { "External id": 275564, "cbid": 211, "correlation": 275564 } }, { "ph": "s", "id": 275564, "pid": 76337, "tid": -914061504, "ts": 1716454225825428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225825517, "dur": 2, "args": { "External id": 275575, "cbid": 251, "correlation": 275575 } }, { "ph": "f", "id": 275575, "pid": 76337, "tid": -914061504, "ts": 1716454225825517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225877549, "dur": 159, "args": { "External id": 275576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275576, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275576, "pid": 5, "tid": 7, "ts": 1716454225877549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825523, "dur": 14, "args": { "External id": 275576, "cbid": 211, "correlation": 275576 } }, { "ph": "s", "id": 275576, "pid": 76337, "tid": -914061504, "ts": 1716454225825523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225825594, "dur": 1, "args": { "External id": 275587, "cbid": 251, "correlation": 275587 } }, { "ph": "f", "id": 275587, "pid": 76337, "tid": -914061504, "ts": 1716454225825594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225877709, "dur": 153, "args": { "External id": 275588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275588, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275588, "pid": 5, "tid": 7, "ts": 1716454225877709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825599, "dur": 11, "args": { "External id": 275588, "cbid": 211, "correlation": 275588 } }, { "ph": "s", "id": 275588, "pid": 76337, "tid": -914061504, "ts": 1716454225825599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225825664, "dur": 1, "args": { "External id": 275599, "cbid": 251, "correlation": 275599 } }, { "ph": "f", "id": 275599, "pid": 76337, "tid": -914061504, "ts": 1716454225825664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225877864, "dur": 152, "args": { "External id": 275600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275600, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275600, "pid": 5, "tid": 7, "ts": 1716454225877864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825668, "dur": 11, "args": { "External id": 275600, "cbid": 211, "correlation": 275600 } }, { "ph": "s", "id": 275600, "pid": 76337, "tid": -914061504, "ts": 1716454225825668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225878017, "dur": 326, "args": { "External id": 275625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275625, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275625, "pid": 5, "tid": 7, "ts": 1716454225878017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825753, "dur": 14, "args": { "External id": 275625, "cbid": 211, "correlation": 275625 } }, { "ph": "s", "id": 275625, "pid": 76337, "tid": -914061504, "ts": 1716454225825753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225825857, "dur": 1, "args": { "External id": 275643, "cbid": 251, "correlation": 275643 } }, { "ph": "f", "id": 275643, "pid": 76337, "tid": -914061504, "ts": 1716454225825857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225878344, "dur": 160, "args": { "External id": 275645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275645, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275645, "pid": 5, "tid": 7, "ts": 1716454225878344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825862, "dur": 14, "args": { "External id": 275645, "cbid": 211, "correlation": 275645 } }, { "ph": "s", "id": 275645, "pid": 76337, "tid": -914061504, "ts": 1716454225825862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225878506, "dur": 19, "args": { "External id": 275653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275653, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275653, "pid": 5, "tid": 7, "ts": 1716454225878506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825932, "dur": 12, "args": { "External id": 275653, "cbid": 211, "correlation": 275653 } }, { "ph": "s", "id": 275653, "pid": 76337, "tid": -914061504, "ts": 1716454225825932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225878526, "dur": 27, "args": { "External id": 275661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275661, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275661, "pid": 5, "tid": 7, "ts": 1716454225878526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225825972, "dur": 17, "args": { "External id": 275661, "cbid": 211, "correlation": 275661 } }, { "ph": "s", "id": 275661, "pid": 76337, "tid": -914061504, "ts": 1716454225825972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225878555, "dur": 17, "args": { "External id": 275672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275672, "pid": 5, "tid": 7, "ts": 1716454225878555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826053, "dur": 13, "args": { "External id": 275672, "cbid": 211, "correlation": 275672 } }, { "ph": "s", "id": 275672, "pid": 76337, "tid": -914061504, "ts": 1716454225826053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225878573, "dur": 15, "args": { "External id": 275694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275694, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275694, "pid": 5, "tid": 7, "ts": 1716454225878573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826084, "dur": 7, "args": { "External id": 275694, "cbid": 211, "correlation": 275694 } }, { "ph": "s", "id": 275694, "pid": 76337, "tid": -914061504, "ts": 1716454225826084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826169, "dur": 2, "args": { "External id": 275705, "cbid": 251, "correlation": 275705 } }, { "ph": "f", "id": 275705, "pid": 76337, "tid": -914061504, "ts": 1716454225826169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225878590, "dur": 85, "args": { "External id": 275706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275706, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275706, "pid": 5, "tid": 7, "ts": 1716454225878590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826176, "dur": 14, "args": { "External id": 275706, "cbid": 211, "correlation": 275706 } }, { "ph": "s", "id": 275706, "pid": 76337, "tid": -914061504, "ts": 1716454225826176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826246, "dur": 1, "args": { "External id": 275717, "cbid": 251, "correlation": 275717 } }, { "ph": "f", "id": 275717, "pid": 76337, "tid": -914061504, "ts": 1716454225826246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826250, "dur": 0, "args": { "External id": 275718, "cbid": 251, "correlation": 275718 } }, { "ph": "f", "id": 275718, "pid": 76337, "tid": -914061504, "ts": 1716454225826250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225878677, "dur": 11, "args": { "External id": 275719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275719, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275719, "pid": 5, "tid": 7, "ts": 1716454225878677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826251, "dur": 12, "args": { "External id": 275719, "cbid": 211, "correlation": 275719 } }, { "ph": "s", "id": 275719, "pid": 76337, "tid": -914061504, "ts": 1716454225826251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225878689, "dur": 6, "args": { "External id": 275721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275721, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275721, "pid": 5, "tid": 7, "ts": 1716454225878689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826265, "dur": 7, "args": { "External id": 275721, "cbid": 211, "correlation": 275721 } }, { "ph": "s", "id": 275721, "pid": 76337, "tid": -914061504, "ts": 1716454225826265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826323, "dur": 1, "args": { "External id": 275732, "cbid": 251, "correlation": 275732 } }, { "ph": "f", "id": 275732, "pid": 76337, "tid": -914061504, "ts": 1716454225826323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826326, "dur": 0, "args": { "External id": 275733, "cbid": 251, "correlation": 275733 } }, { "ph": "f", "id": 275733, "pid": 76337, "tid": -914061504, "ts": 1716454225826326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225878696, "dur": 8, "args": { "External id": 275734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275734, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275734, "pid": 5, "tid": 7, "ts": 1716454225878696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826328, "dur": 11, "args": { "External id": 275734, "cbid": 211, "correlation": 275734 } }, { "ph": "s", "id": 275734, "pid": 76337, "tid": -914061504, "ts": 1716454225826328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225878706, "dur": 3, "args": { "External id": 275736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275736, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275736, "pid": 5, "tid": 7, "ts": 1716454225878706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826340, "dur": 5, "args": { "External id": 275736, "cbid": 211, "correlation": 275736 } }, { "ph": "s", "id": 275736, "pid": 76337, "tid": -914061504, "ts": 1716454225826340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225878710, "dur": 52, "args": { "External id": 275761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275761, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275761, "pid": 5, "tid": 7, "ts": 1716454225878710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826417, "dur": 12, "args": { "External id": 275761, "cbid": 211, "correlation": 275761 } }, { "ph": "s", "id": 275761, "pid": 76337, "tid": -914061504, "ts": 1716454225826417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826514, "dur": 6, "args": { "External id": 275779, "cbid": 251, "correlation": 275779 } }, { "ph": "f", "id": 275779, "pid": 76337, "tid": -914061504, "ts": 1716454225826514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225878763, "dur": 88, "args": { "External id": 275781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275781, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275781, "pid": 5, "tid": 7, "ts": 1716454225878763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826525, "dur": 15, "args": { "External id": 275781, "cbid": 211, "correlation": 275781 } }, { "ph": "s", "id": 275781, "pid": 76337, "tid": -914061504, "ts": 1716454225826525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225878853, "dur": 9, "args": { "External id": 275789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275789, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275789, "pid": 5, "tid": 7, "ts": 1716454225878853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826595, "dur": 11, "args": { "External id": 275789, "cbid": 211, "correlation": 275789 } }, { "ph": "s", "id": 275789, "pid": 76337, "tid": -914061504, "ts": 1716454225826595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225878863, "dur": 20, "args": { "External id": 275797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275797, "pid": 5, "tid": 7, "ts": 1716454225878863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826636, "dur": 9, "args": { "External id": 275797, "cbid": 211, "correlation": 275797 } }, { "ph": "s", "id": 275797, "pid": 76337, "tid": -914061504, "ts": 1716454225826636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225878885, "dur": 17, "args": { "External id": 275819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275819, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275819, "pid": 5, "tid": 7, "ts": 1716454225878885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826687, "dur": 10, "args": { "External id": 275819, "cbid": 211, "correlation": 275819 } }, { "ph": "s", "id": 275819, "pid": 76337, "tid": -914061504, "ts": 1716454225826687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826778, "dur": 2, "args": { "External id": 275835, "cbid": 251, "correlation": 275835 } }, { "ph": "f", "id": 275835, "pid": 76337, "tid": -914061504, "ts": 1716454225826778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826784, "dur": 0, "args": { "External id": 275837, "cbid": 251, "correlation": 275837 } }, { "ph": "f", "id": 275837, "pid": 76337, "tid": -914061504, "ts": 1716454225826784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225878903, "dur": 483, "args": { "External id": 275838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275838, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275838, "pid": 5, "tid": 7, "ts": 1716454225878903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826787, "dur": 15, "args": { "External id": 275838, "cbid": 211, "correlation": 275838 } }, { "ph": "s", "id": 275838, "pid": 76337, "tid": -914061504, "ts": 1716454225826787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225879387, "dur": 63, "args": { "External id": 275846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275846, "pid": 5, "tid": 7, "ts": 1716454225879387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826853, "dur": 12, "args": { "External id": 275846, "cbid": 211, "correlation": 275846 } }, { "ph": "s", "id": 275846, "pid": 76337, "tid": -914061504, "ts": 1716454225826853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225879452, "dur": 66, "args": { "External id": 275854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275854, "pid": 5, "tid": 7, "ts": 1716454225879452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826884, "dur": 8, "args": { "External id": 275854, "cbid": 211, "correlation": 275854 } }, { "ph": "s", "id": 275854, "pid": 76337, "tid": -914061504, "ts": 1716454225826884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225826967, "dur": 1, "args": { "External id": 275870, "cbid": 251, "correlation": 275870 } }, { "ph": "f", "id": 275870, "pid": 76337, "tid": -914061504, "ts": 1716454225826967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225879521, "dur": 1, "args": { "External id": 275872, "device": 5, "context": 1, "stream": 7, "correlation": 275872, "bytes": 240, "memory bandwidth (GB/s)": 0.1596806387225549 } }, { "ph": "f", "id": 275872, "pid": 5, "tid": 7, "ts": 1716454225879521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225826981, "dur": 15, "args": { "External id": 275872, "cbid": 51, "correlation": 275872 } }, { "ph": "s", "id": 275872, "pid": 76337, "tid": -914061504, "ts": 1716454225826981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225879524, "dur": 261, "args": { "External id": 275873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275873, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275873, "pid": 5, "tid": 7, "ts": 1716454225879524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225826998, "dur": 11, "args": { "External id": 275873, "cbid": 211, "correlation": 275873 } }, { "ph": "s", "id": 275873, "pid": 76337, "tid": -914061504, "ts": 1716454225826998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225879787, "dur": 14, "args": { "External id": 275881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275881, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275881, "pid": 5, "tid": 7, "ts": 1716454225879787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827041, "dur": 10, "args": { "External id": 275881, "cbid": 211, "correlation": 275881 } }, { "ph": "s", "id": 275881, "pid": 76337, "tid": -914061504, "ts": 1716454225827041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225879802, "dur": 36, "args": { "External id": 275892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275892, "pid": 5, "tid": 7, "ts": 1716454225879802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827109, "dur": 13, "args": { "External id": 275892, "cbid": 211, "correlation": 275892 } }, { "ph": "s", "id": 275892, "pid": 76337, "tid": -914061504, "ts": 1716454225827109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225827174, "dur": 0, "args": { "External id": 275904, "cbid": 317, "correlation": 275904 } }, { "ph": "f", "id": 275904, "pid": 76337, "tid": -914061504, "ts": 1716454225827174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225827175, "dur": 0, "args": { "External id": 275905, "cbid": 203, "correlation": 275905 } }, { "ph": "f", "id": 275905, "pid": 76337, "tid": -914061504, "ts": 1716454225827175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225827176, "dur": 0, "args": { "External id": 275906, "cbid": 205, "correlation": 275906 } }, { "ph": "f", "id": 275906, "pid": 76337, "tid": -914061504, "ts": 1716454225827176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225879839, "dur": 13, "args": { "External id": 275910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275910, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275910, "pid": 5, "tid": 7, "ts": 1716454225879839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827194, "dur": 12, "args": { "External id": 275910, "cbid": 211, "correlation": 275910 } }, { "ph": "s", "id": 275910, "pid": 76337, "tid": -914061504, "ts": 1716454225827194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225879853, "dur": 4, "args": { "External id": 275912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 275912, "pid": 5, "tid": 7, "ts": 1716454225879853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827211, "dur": 7, "args": { "External id": 275912, "cbid": 211, "correlation": 275912 } }, { "ph": "s", "id": 275912, "pid": 76337, "tid": -914061504, "ts": 1716454225827211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225827221, "dur": 0, "args": { "External id": 275913, "cbid": 51, "correlation": 275913 } }, { "ph": "s", "id": 275913, "pid": 76337, "tid": -914061504, "ts": 1716454225827221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225879857, "dur": 93, "args": { "External id": 275914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275914, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 275914, "pid": 5, "tid": 7, "ts": 1716454225879857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827222, "dur": 5, "args": { "External id": 275914, "cbid": 211, "correlation": 275914 } }, { "ph": "s", "id": 275914, "pid": 76337, "tid": -914061504, "ts": 1716454225827222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225879952, "dur": 15, "args": { "External id": 275919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275919, "pid": 5, "tid": 7, "ts": 1716454225879952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827252, "dur": 9, "args": { "External id": 275919, "cbid": 211, "correlation": 275919 } }, { "ph": "s", "id": 275919, "pid": 76337, "tid": -914061504, "ts": 1716454225827252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225879968, "dur": 12, "args": { "External id": 275927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275927, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275927, "pid": 5, "tid": 7, "ts": 1716454225879968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827284, "dur": 8, "args": { "External id": 275927, "cbid": 211, "correlation": 275927 } }, { "ph": "s", "id": 275927, "pid": 76337, "tid": -914061504, "ts": 1716454225827284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225879981, "dur": 17, "args": { "External id": 275947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275947, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 275947, "pid": 5, "tid": 7, "ts": 1716454225879981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827357, "dur": 13, "args": { "External id": 275947, "cbid": 211, "correlation": 275947 } }, { "ph": "s", "id": 275947, "pid": 76337, "tid": -914061504, "ts": 1716454225827357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225880000, "dur": 5, "args": { "External id": 275959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275959, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 275959, "pid": 5, "tid": 7, "ts": 1716454225880000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827380, "dur": 6, "args": { "External id": 275959, "cbid": 211, "correlation": 275959 } }, { "ph": "s", "id": 275959, "pid": 76337, "tid": -914061504, "ts": 1716454225827380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225880006, "dur": 17, "args": { "External id": 275962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275962, "pid": 5, "tid": 7, "ts": 1716454225880006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827397, "dur": 6, "args": { "External id": 275962, "cbid": 211, "correlation": 275962 } }, { "ph": "s", "id": 275962, "pid": 76337, "tid": -914061504, "ts": 1716454225827397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225880024, "dur": 13, "args": { "External id": 275971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275971, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275971, "pid": 5, "tid": 7, "ts": 1716454225880024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827436, "dur": 9, "args": { "External id": 275971, "cbid": 211, "correlation": 275971 } }, { "ph": "s", "id": 275971, "pid": 76337, "tid": -914061504, "ts": 1716454225827436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225827486, "dur": 0, "args": { "External id": 275981, "cbid": 317, "correlation": 275981 } }, { "ph": "f", "id": 275981, "pid": 76337, "tid": -914061504, "ts": 1716454225827486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225827487, "dur": 0, "args": { "External id": 275982, "cbid": 203, "correlation": 275982 } }, { "ph": "f", "id": 275982, "pid": 76337, "tid": -914061504, "ts": 1716454225827487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225827488, "dur": 0, "args": { "External id": 275983, "cbid": 205, "correlation": 275983 } }, { "ph": "f", "id": 275983, "pid": 76337, "tid": -914061504, "ts": 1716454225827488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225880038, "dur": 11, "args": { "External id": 275987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275987, "pid": 5, "tid": 7, "ts": 1716454225880038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827501, "dur": 12, "args": { "External id": 275987, "cbid": 211, "correlation": 275987 } }, { "ph": "s", "id": 275987, "pid": 76337, "tid": -914061504, "ts": 1716454225827501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225880050, "dur": 157, "args": { "External id": 275989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275989, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275989, "pid": 5, "tid": 7, "ts": 1716454225880050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827515, "dur": 6, "args": { "External id": 275989, "cbid": 211, "correlation": 275989 } }, { "ph": "s", "id": 275989, "pid": 76337, "tid": -914061504, "ts": 1716454225827515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225880210, "dur": 1, "args": { "External id": 275991, "device": 5, "context": 1, "stream": 7, "correlation": 275991, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 275991, "pid": 5, "tid": 7, "ts": 1716454225880210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225827527, "dur": 8, "args": { "External id": 275991, "cbid": 51, "correlation": 275991 } }, { "ph": "s", "id": 275991, "pid": 76337, "tid": -914061504, "ts": 1716454225827527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225880213, "dur": 648, "args": { "External id": 275992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275992, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 275992, "pid": 5, "tid": 7, "ts": 1716454225880213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827536, "dur": 6, "args": { "External id": 275992, "cbid": 211, "correlation": 275992 } }, { "ph": "s", "id": 275992, "pid": 76337, "tid": -914061504, "ts": 1716454225827536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225880862, "dur": 13, "args": { "External id": 275994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 275994, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 275994, "pid": 5, "tid": 7, "ts": 1716454225880862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827546, "dur": 5, "args": { "External id": 275994, "cbid": 211, "correlation": 275994 } }, { "ph": "s", "id": 275994, "pid": 76337, "tid": -914061504, "ts": 1716454225827546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225880876, "dur": 14, "args": { "External id": 276000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276000, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276000, "pid": 5, "tid": 7, "ts": 1716454225880876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827579, "dur": 9, "args": { "External id": 276000, "cbid": 211, "correlation": 276000 } }, { "ph": "s", "id": 276000, "pid": 76337, "tid": -914061504, "ts": 1716454225827579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225880891, "dur": 4, "args": { "External id": 276008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276008, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 276008, "pid": 5, "tid": 7, "ts": 1716454225880891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827623, "dur": 9, "args": { "External id": 276008, "cbid": 211, "correlation": 276008 } }, { "ph": "s", "id": 276008, "pid": 76337, "tid": -914061504, "ts": 1716454225827623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225827690, "dur": 1, "args": { "External id": 276024, "cbid": 251, "correlation": 276024 } }, { "ph": "f", "id": 276024, "pid": 76337, "tid": -914061504, "ts": 1716454225827690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225827695, "dur": 0, "args": { "External id": 276026, "cbid": 251, "correlation": 276026 } }, { "ph": "f", "id": 276026, "pid": 76337, "tid": -914061504, "ts": 1716454225827695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225880897, "dur": 13, "args": { "External id": 276027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276027, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276027, "pid": 5, "tid": 7, "ts": 1716454225880897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827697, "dur": 12, "args": { "External id": 276027, "cbid": 211, "correlation": 276027 } }, { "ph": "s", "id": 276027, "pid": 76337, "tid": -914061504, "ts": 1716454225827697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225880911, "dur": 5, "args": { "External id": 276029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276029, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276029, "pid": 5, "tid": 7, "ts": 1716454225880911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827711, "dur": 5, "args": { "External id": 276029, "cbid": 211, "correlation": 276029 } }, { "ph": "s", "id": 276029, "pid": 76337, "tid": -914061504, "ts": 1716454225827711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225880917, "dur": 17, "args": { "External id": 276039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276039, "pid": 5, "tid": 7, "ts": 1716454225880917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827769, "dur": 12, "args": { "External id": 276039, "cbid": 211, "correlation": 276039 } }, { "ph": "s", "id": 276039, "pid": 76337, "tid": -914061504, "ts": 1716454225827769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225880936, "dur": 17, "args": { "External id": 276059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276059, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276059, "pid": 5, "tid": 7, "ts": 1716454225880936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827835, "dur": 10, "args": { "External id": 276059, "cbid": 211, "correlation": 276059 } }, { "ph": "s", "id": 276059, "pid": 76337, "tid": -914061504, "ts": 1716454225827835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225880954, "dur": 4, "args": { "External id": 276071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276071, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276071, "pid": 5, "tid": 7, "ts": 1716454225880954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827855, "dur": 6, "args": { "External id": 276071, "cbid": 211, "correlation": 276071 } }, { "ph": "s", "id": 276071, "pid": 76337, "tid": -914061504, "ts": 1716454225827855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225880959, "dur": 16, "args": { "External id": 276074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276074, "pid": 5, "tid": 7, "ts": 1716454225880959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827875, "dur": 6, "args": { "External id": 276074, "cbid": 211, "correlation": 276074 } }, { "ph": "s", "id": 276074, "pid": 76337, "tid": -914061504, "ts": 1716454225827875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225880976, "dur": 11, "args": { "External id": 276083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276083, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276083, "pid": 5, "tid": 7, "ts": 1716454225880976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827914, "dur": 11, "args": { "External id": 276083, "cbid": 211, "correlation": 276083 } }, { "ph": "s", "id": 276083, "pid": 76337, "tid": -914061504, "ts": 1716454225827914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225827983, "dur": 0, "args": { "External id": 276093, "cbid": 317, "correlation": 276093 } }, { "ph": "f", "id": 276093, "pid": 76337, "tid": -914061504, "ts": 1716454225827983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225827984, "dur": 0, "args": { "External id": 276094, "cbid": 203, "correlation": 276094 } }, { "ph": "f", "id": 276094, "pid": 76337, "tid": -914061504, "ts": 1716454225827984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225827985, "dur": 0, "args": { "External id": 276095, "cbid": 205, "correlation": 276095 } }, { "ph": "f", "id": 276095, "pid": 76337, "tid": -914061504, "ts": 1716454225827985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225880988, "dur": 10, "args": { "External id": 276099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276099, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276099, "pid": 5, "tid": 7, "ts": 1716454225880988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225827999, "dur": 12, "args": { "External id": 276099, "cbid": 211, "correlation": 276099 } }, { "ph": "s", "id": 276099, "pid": 76337, "tid": -914061504, "ts": 1716454225827999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225881000, "dur": 156, "args": { "External id": 276101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276101, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276101, "pid": 5, "tid": 7, "ts": 1716454225881000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828013, "dur": 5, "args": { "External id": 276101, "cbid": 211, "correlation": 276101 } }, { "ph": "s", "id": 276101, "pid": 76337, "tid": -914061504, "ts": 1716454225828013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225881159, "dur": 1, "args": { "External id": 276103, "device": 5, "context": 1, "stream": 7, "correlation": 276103, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 276103, "pid": 5, "tid": 7, "ts": 1716454225881159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225828025, "dur": 6, "args": { "External id": 276103, "cbid": 51, "correlation": 276103 } }, { "ph": "s", "id": 276103, "pid": 76337, "tid": -914061504, "ts": 1716454225828025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225881162, "dur": 634, "args": { "External id": 276104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276104, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276104, "pid": 5, "tid": 7, "ts": 1716454225881162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828032, "dur": 6, "args": { "External id": 276104, "cbid": 211, "correlation": 276104 } }, { "ph": "s", "id": 276104, "pid": 76337, "tid": -914061504, "ts": 1716454225828032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225881798, "dur": 12, "args": { "External id": 276106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276106, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276106, "pid": 5, "tid": 7, "ts": 1716454225881798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828042, "dur": 5, "args": { "External id": 276106, "cbid": 211, "correlation": 276106 } }, { "ph": "s", "id": 276106, "pid": 76337, "tid": -914061504, "ts": 1716454225828042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225881811, "dur": 14, "args": { "External id": 276112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276112, "pid": 5, "tid": 7, "ts": 1716454225881811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828071, "dur": 8, "args": { "External id": 276112, "cbid": 211, "correlation": 276112 } }, { "ph": "s", "id": 276112, "pid": 76337, "tid": -914061504, "ts": 1716454225828071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225881826, "dur": 12, "args": { "External id": 276120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276120, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276120, "pid": 5, "tid": 7, "ts": 1716454225881826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828103, "dur": 8, "args": { "External id": 276120, "cbid": 211, "correlation": 276120 } }, { "ph": "s", "id": 276120, "pid": 76337, "tid": -914061504, "ts": 1716454225828103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225881840, "dur": 10, "args": { "External id": 276128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276128, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276128, "pid": 5, "tid": 7, "ts": 1716454225881840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828133, "dur": 8, "args": { "External id": 276128, "cbid": 211, "correlation": 276128 } }, { "ph": "s", "id": 276128, "pid": 76337, "tid": -914061504, "ts": 1716454225828133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225881851, "dur": 18, "args": { "External id": 276148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276148, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276148, "pid": 5, "tid": 7, "ts": 1716454225881851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828210, "dur": 13, "args": { "External id": 276148, "cbid": 211, "correlation": 276148 } }, { "ph": "s", "id": 276148, "pid": 76337, "tid": -914061504, "ts": 1716454225828210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225881870, "dur": 4, "args": { "External id": 276160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276160, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276160, "pid": 5, "tid": 7, "ts": 1716454225881870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828232, "dur": 6, "args": { "External id": 276160, "cbid": 211, "correlation": 276160 } }, { "ph": "s", "id": 276160, "pid": 76337, "tid": -914061504, "ts": 1716454225828232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225881875, "dur": 16, "args": { "External id": 276163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276163, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276163, "pid": 5, "tid": 7, "ts": 1716454225881875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828250, "dur": 8, "args": { "External id": 276163, "cbid": 211, "correlation": 276163 } }, { "ph": "s", "id": 276163, "pid": 76337, "tid": -914061504, "ts": 1716454225828250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225828308, "dur": 0, "args": { "External id": 276174, "cbid": 317, "correlation": 276174 } }, { "ph": "f", "id": 276174, "pid": 76337, "tid": -914061504, "ts": 1716454225828308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225828309, "dur": 0, "args": { "External id": 276175, "cbid": 203, "correlation": 276175 } }, { "ph": "f", "id": 276175, "pid": 76337, "tid": -914061504, "ts": 1716454225828309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225828309, "dur": 0, "args": { "External id": 276176, "cbid": 205, "correlation": 276176 } }, { "ph": "f", "id": 276176, "pid": 76337, "tid": -914061504, "ts": 1716454225828309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225881892, "dur": 11, "args": { "External id": 276180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276180, "pid": 5, "tid": 7, "ts": 1716454225881892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828323, "dur": 12, "args": { "External id": 276180, "cbid": 211, "correlation": 276180 } }, { "ph": "s", "id": 276180, "pid": 76337, "tid": -914061504, "ts": 1716454225828323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225881904, "dur": 4, "args": { "External id": 276182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276182, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 276182, "pid": 5, "tid": 7, "ts": 1716454225881904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828339, "dur": 5, "args": { "External id": 276182, "cbid": 211, "correlation": 276182 } }, { "ph": "s", "id": 276182, "pid": 76337, "tid": -914061504, "ts": 1716454225828339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225828347, "dur": 0, "args": { "External id": 276183, "cbid": 51, "correlation": 276183 } }, { "ph": "s", "id": 276183, "pid": 76337, "tid": -914061504, "ts": 1716454225828347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225881909, "dur": 91, "args": { "External id": 276184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276184, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 276184, "pid": 5, "tid": 7, "ts": 1716454225881909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828348, "dur": 5, "args": { "External id": 276184, "cbid": 211, "correlation": 276184 } }, { "ph": "s", "id": 276184, "pid": 76337, "tid": -914061504, "ts": 1716454225828348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225882001, "dur": 15, "args": { "External id": 276189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276189, "pid": 5, "tid": 7, "ts": 1716454225882001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828375, "dur": 9, "args": { "External id": 276189, "cbid": 211, "correlation": 276189 } }, { "ph": "s", "id": 276189, "pid": 76337, "tid": -914061504, "ts": 1716454225828375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225882017, "dur": 80, "args": { "External id": 276198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276198, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276198, "pid": 5, "tid": 7, "ts": 1716454225882017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828457, "dur": 15, "args": { "External id": 276198, "cbid": 211, "correlation": 276198 } }, { "ph": "s", "id": 276198, "pid": 76337, "tid": -914061504, "ts": 1716454225828457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225882099, "dur": 30, "args": { "External id": 276220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276220, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276220, "pid": 5, "tid": 7, "ts": 1716454225882099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828517, "dur": 10, "args": { "External id": 276220, "cbid": 211, "correlation": 276220 } }, { "ph": "s", "id": 276220, "pid": 76337, "tid": -914061504, "ts": 1716454225828517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225828603, "dur": 1, "args": { "External id": 276231, "cbid": 251, "correlation": 276231 } }, { "ph": "f", "id": 276231, "pid": 76337, "tid": -914061504, "ts": 1716454225828603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225882130, "dur": 140, "args": { "External id": 276232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276232, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276232, "pid": 5, "tid": 7, "ts": 1716454225882130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828609, "dur": 14, "args": { "External id": 276232, "cbid": 211, "correlation": 276232 } }, { "ph": "s", "id": 276232, "pid": 76337, "tid": -914061504, "ts": 1716454225828609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225828679, "dur": 1, "args": { "External id": 276243, "cbid": 251, "correlation": 276243 } }, { "ph": "f", "id": 276243, "pid": 76337, "tid": -914061504, "ts": 1716454225828679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225882271, "dur": 154, "args": { "External id": 276244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276244, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276244, "pid": 5, "tid": 7, "ts": 1716454225882271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828683, "dur": 11, "args": { "External id": 276244, "cbid": 211, "correlation": 276244 } }, { "ph": "s", "id": 276244, "pid": 76337, "tid": -914061504, "ts": 1716454225828683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225828746, "dur": 1, "args": { "External id": 276255, "cbid": 251, "correlation": 276255 } }, { "ph": "f", "id": 276255, "pid": 76337, "tid": -914061504, "ts": 1716454225828746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225882426, "dur": 155, "args": { "External id": 276256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276256, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276256, "pid": 5, "tid": 7, "ts": 1716454225882426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828750, "dur": 11, "args": { "External id": 276256, "cbid": 211, "correlation": 276256 } }, { "ph": "s", "id": 276256, "pid": 76337, "tid": -914061504, "ts": 1716454225828750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225882582, "dur": 326, "args": { "External id": 276281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276281, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276281, "pid": 5, "tid": 7, "ts": 1716454225882582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828834, "dur": 13, "args": { "External id": 276281, "cbid": 211, "correlation": 276281 } }, { "ph": "s", "id": 276281, "pid": 76337, "tid": -914061504, "ts": 1716454225828834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225828935, "dur": 1, "args": { "External id": 276299, "cbid": 251, "correlation": 276299 } }, { "ph": "f", "id": 276299, "pid": 76337, "tid": -914061504, "ts": 1716454225828935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225882910, "dur": 163, "args": { "External id": 276301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276301, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276301, "pid": 5, "tid": 7, "ts": 1716454225882910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225828941, "dur": 13, "args": { "External id": 276301, "cbid": 211, "correlation": 276301 } }, { "ph": "s", "id": 276301, "pid": 76337, "tid": -914061504, "ts": 1716454225828941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225883075, "dur": 19, "args": { "External id": 276309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276309, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276309, "pid": 5, "tid": 7, "ts": 1716454225883075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829019, "dur": 13, "args": { "External id": 276309, "cbid": 211, "correlation": 276309 } }, { "ph": "s", "id": 276309, "pid": 76337, "tid": -914061504, "ts": 1716454225829019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225883095, "dur": 27, "args": { "External id": 276317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276317, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276317, "pid": 5, "tid": 7, "ts": 1716454225883095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829058, "dur": 9, "args": { "External id": 276317, "cbid": 211, "correlation": 276317 } }, { "ph": "s", "id": 276317, "pid": 76337, "tid": -914061504, "ts": 1716454225829058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225883124, "dur": 18, "args": { "External id": 276328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276328, "pid": 5, "tid": 7, "ts": 1716454225883124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829129, "dur": 12, "args": { "External id": 276328, "cbid": 211, "correlation": 276328 } }, { "ph": "s", "id": 276328, "pid": 76337, "tid": -914061504, "ts": 1716454225829129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225883143, "dur": 15, "args": { "External id": 276350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276350, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276350, "pid": 5, "tid": 7, "ts": 1716454225883143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829160, "dur": 8, "args": { "External id": 276350, "cbid": 211, "correlation": 276350 } }, { "ph": "s", "id": 276350, "pid": 76337, "tid": -914061504, "ts": 1716454225829160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829245, "dur": 1, "args": { "External id": 276361, "cbid": 251, "correlation": 276361 } }, { "ph": "f", "id": 276361, "pid": 76337, "tid": -914061504, "ts": 1716454225829245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225883159, "dur": 85, "args": { "External id": 276362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276362, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 276362, "pid": 5, "tid": 7, "ts": 1716454225883159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829250, "dur": 14, "args": { "External id": 276362, "cbid": 211, "correlation": 276362 } }, { "ph": "s", "id": 276362, "pid": 76337, "tid": -914061504, "ts": 1716454225829250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829319, "dur": 1, "args": { "External id": 276373, "cbid": 251, "correlation": 276373 } }, { "ph": "f", "id": 276373, "pid": 76337, "tid": -914061504, "ts": 1716454225829319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829323, "dur": 0, "args": { "External id": 276374, "cbid": 251, "correlation": 276374 } }, { "ph": "f", "id": 276374, "pid": 76337, "tid": -914061504, "ts": 1716454225829323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225883245, "dur": 12, "args": { "External id": 276375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276375, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276375, "pid": 5, "tid": 7, "ts": 1716454225883245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829325, "dur": 12, "args": { "External id": 276375, "cbid": 211, "correlation": 276375 } }, { "ph": "s", "id": 276375, "pid": 76337, "tid": -914061504, "ts": 1716454225829325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225883258, "dur": 5, "args": { "External id": 276377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276377, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276377, "pid": 5, "tid": 7, "ts": 1716454225883258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829338, "dur": 6, "args": { "External id": 276377, "cbid": 211, "correlation": 276377 } }, { "ph": "s", "id": 276377, "pid": 76337, "tid": -914061504, "ts": 1716454225829338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829401, "dur": 1, "args": { "External id": 276388, "cbid": 251, "correlation": 276388 } }, { "ph": "f", "id": 276388, "pid": 76337, "tid": -914061504, "ts": 1716454225829401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829404, "dur": 0, "args": { "External id": 276389, "cbid": 251, "correlation": 276389 } }, { "ph": "f", "id": 276389, "pid": 76337, "tid": -914061504, "ts": 1716454225829404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225883265, "dur": 8, "args": { "External id": 276390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276390, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276390, "pid": 5, "tid": 7, "ts": 1716454225883265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829406, "dur": 13, "args": { "External id": 276390, "cbid": 211, "correlation": 276390 } }, { "ph": "s", "id": 276390, "pid": 76337, "tid": -914061504, "ts": 1716454225829406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225883274, "dur": 3, "args": { "External id": 276392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276392, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276392, "pid": 5, "tid": 7, "ts": 1716454225883274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829420, "dur": 6, "args": { "External id": 276392, "cbid": 211, "correlation": 276392 } }, { "ph": "s", "id": 276392, "pid": 76337, "tid": -914061504, "ts": 1716454225829420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225883279, "dur": 53, "args": { "External id": 276417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276417, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276417, "pid": 5, "tid": 7, "ts": 1716454225883279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829498, "dur": 12, "args": { "External id": 276417, "cbid": 211, "correlation": 276417 } }, { "ph": "s", "id": 276417, "pid": 76337, "tid": -914061504, "ts": 1716454225829498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829600, "dur": 1, "args": { "External id": 276435, "cbid": 251, "correlation": 276435 } }, { "ph": "f", "id": 276435, "pid": 76337, "tid": -914061504, "ts": 1716454225829600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225883333, "dur": 87, "args": { "External id": 276437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276437, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 276437, "pid": 5, "tid": 7, "ts": 1716454225883333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829606, "dur": 15, "args": { "External id": 276437, "cbid": 211, "correlation": 276437 } }, { "ph": "s", "id": 276437, "pid": 76337, "tid": -914061504, "ts": 1716454225829606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225883421, "dur": 9, "args": { "External id": 276445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276445, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276445, "pid": 5, "tid": 7, "ts": 1716454225883421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829676, "dur": 12, "args": { "External id": 276445, "cbid": 211, "correlation": 276445 } }, { "ph": "s", "id": 276445, "pid": 76337, "tid": -914061504, "ts": 1716454225829676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225883432, "dur": 20, "args": { "External id": 276453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276453, "pid": 5, "tid": 7, "ts": 1716454225883432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829718, "dur": 9, "args": { "External id": 276453, "cbid": 211, "correlation": 276453 } }, { "ph": "s", "id": 276453, "pid": 76337, "tid": -914061504, "ts": 1716454225829718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225883453, "dur": 17, "args": { "External id": 276475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276475, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276475, "pid": 5, "tid": 7, "ts": 1716454225883453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829768, "dur": 10, "args": { "External id": 276475, "cbid": 211, "correlation": 276475 } }, { "ph": "s", "id": 276475, "pid": 76337, "tid": -914061504, "ts": 1716454225829768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829855, "dur": 1, "args": { "External id": 276491, "cbid": 251, "correlation": 276491 } }, { "ph": "f", "id": 276491, "pid": 76337, "tid": -914061504, "ts": 1716454225829855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225829860, "dur": 0, "args": { "External id": 276493, "cbid": 251, "correlation": 276493 } }, { "ph": "f", "id": 276493, "pid": 76337, "tid": -914061504, "ts": 1716454225829860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225883471, "dur": 485, "args": { "External id": 276494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276494, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276494, "pid": 5, "tid": 7, "ts": 1716454225883471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829862, "dur": 13, "args": { "External id": 276494, "cbid": 211, "correlation": 276494 } }, { "ph": "s", "id": 276494, "pid": 76337, "tid": -914061504, "ts": 1716454225829862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225883957, "dur": 63, "args": { "External id": 276502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276502, "pid": 5, "tid": 7, "ts": 1716454225883957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829927, "dur": 12, "args": { "External id": 276502, "cbid": 211, "correlation": 276502 } }, { "ph": "s", "id": 276502, "pid": 76337, "tid": -914061504, "ts": 1716454225829927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225884022, "dur": 66, "args": { "External id": 276510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276510, "pid": 5, "tid": 7, "ts": 1716454225884022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225829956, "dur": 8, "args": { "External id": 276510, "cbid": 211, "correlation": 276510 } }, { "ph": "s", "id": 276510, "pid": 76337, "tid": -914061504, "ts": 1716454225829956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225830042, "dur": 1, "args": { "External id": 276526, "cbid": 251, "correlation": 276526 } }, { "ph": "f", "id": 276526, "pid": 76337, "tid": -914061504, "ts": 1716454225830042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225884089, "dur": 1, "args": { "External id": 276528, "device": 5, "context": 1, "stream": 7, "correlation": 276528, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 276528, "pid": 5, "tid": 7, "ts": 1716454225884089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225830047, "dur": 9, "args": { "External id": 276528, "cbid": 51, "correlation": 276528 } }, { "ph": "s", "id": 276528, "pid": 76337, "tid": -914061504, "ts": 1716454225830047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225884093, "dur": 261, "args": { "External id": 276529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276529, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 276529, "pid": 5, "tid": 7, "ts": 1716454225884093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830058, "dur": 12, "args": { "External id": 276529, "cbid": 211, "correlation": 276529 } }, { "ph": "s", "id": 276529, "pid": 76337, "tid": -914061504, "ts": 1716454225830058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225884356, "dur": 13, "args": { "External id": 276537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276537, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276537, "pid": 5, "tid": 7, "ts": 1716454225884356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830100, "dur": 10, "args": { "External id": 276537, "cbid": 211, "correlation": 276537 } }, { "ph": "s", "id": 276537, "pid": 76337, "tid": -914061504, "ts": 1716454225830100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225884370, "dur": 35, "args": { "External id": 276548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276548, "pid": 5, "tid": 7, "ts": 1716454225884370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830169, "dur": 12, "args": { "External id": 276548, "cbid": 211, "correlation": 276548 } }, { "ph": "s", "id": 276548, "pid": 76337, "tid": -914061504, "ts": 1716454225830169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225830232, "dur": 0, "args": { "External id": 276560, "cbid": 317, "correlation": 276560 } }, { "ph": "f", "id": 276560, "pid": 76337, "tid": -914061504, "ts": 1716454225830232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225830233, "dur": 0, "args": { "External id": 276561, "cbid": 203, "correlation": 276561 } }, { "ph": "f", "id": 276561, "pid": 76337, "tid": -914061504, "ts": 1716454225830233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225830234, "dur": 0, "args": { "External id": 276562, "cbid": 205, "correlation": 276562 } }, { "ph": "f", "id": 276562, "pid": 76337, "tid": -914061504, "ts": 1716454225830234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884407, "dur": 12, "args": { "External id": 276566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276566, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276566, "pid": 5, "tid": 7, "ts": 1716454225884407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830249, "dur": 12, "args": { "External id": 276566, "cbid": 211, "correlation": 276566 } }, { "ph": "s", "id": 276566, "pid": 76337, "tid": -914061504, "ts": 1716454225830249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225884420, "dur": 3, "args": { "External id": 276568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 276568, "pid": 5, "tid": 7, "ts": 1716454225884420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830266, "dur": 6, "args": { "External id": 276568, "cbid": 211, "correlation": 276568 } }, { "ph": "s", "id": 276568, "pid": 76337, "tid": -914061504, "ts": 1716454225830266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225830275, "dur": 0, "args": { "External id": 276569, "cbid": 51, "correlation": 276569 } }, { "ph": "s", "id": 276569, "pid": 76337, "tid": -914061504, "ts": 1716454225830275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225884425, "dur": 93, "args": { "External id": 276570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276570, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 276570, "pid": 5, "tid": 7, "ts": 1716454225884425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830276, "dur": 5, "args": { "External id": 276570, "cbid": 211, "correlation": 276570 } }, { "ph": "s", "id": 276570, "pid": 76337, "tid": -914061504, "ts": 1716454225830276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225884519, "dur": 15, "args": { "External id": 276575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276575, "pid": 5, "tid": 7, "ts": 1716454225884519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830303, "dur": 9, "args": { "External id": 276575, "cbid": 211, "correlation": 276575 } }, { "ph": "s", "id": 276575, "pid": 76337, "tid": -914061504, "ts": 1716454225830303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225884536, "dur": 11, "args": { "External id": 276583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276583, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276583, "pid": 5, "tid": 7, "ts": 1716454225884536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830335, "dur": 9, "args": { "External id": 276583, "cbid": 211, "correlation": 276583 } }, { "ph": "s", "id": 276583, "pid": 76337, "tid": -914061504, "ts": 1716454225830335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225830404, "dur": 0, "args": { "External id": 276593, "cbid": 317, "correlation": 276593 } }, { "ph": "f", "id": 276593, "pid": 76337, "tid": -914061504, "ts": 1716454225830404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225830405, "dur": 0, "args": { "External id": 276594, "cbid": 203, "correlation": 276594 } }, { "ph": "f", "id": 276594, "pid": 76337, "tid": -914061504, "ts": 1716454225830405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225830406, "dur": 0, "args": { "External id": 276595, "cbid": 205, "correlation": 276595 } }, { "ph": "f", "id": 276595, "pid": 76337, "tid": -914061504, "ts": 1716454225830406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884549, "dur": 11, "args": { "External id": 276599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276599, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276599, "pid": 5, "tid": 7, "ts": 1716454225884549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830421, "dur": 12, "args": { "External id": 276599, "cbid": 211, "correlation": 276599 } }, { "ph": "s", "id": 276599, "pid": 76337, "tid": -914061504, "ts": 1716454225830421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884561, "dur": 155, "args": { "External id": 276601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276601, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276601, "pid": 5, "tid": 7, "ts": 1716454225884561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830435, "dur": 5, "args": { "External id": 276601, "cbid": 211, "correlation": 276601 } }, { "ph": "s", "id": 276601, "pid": 76337, "tid": -914061504, "ts": 1716454225830435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225884718, "dur": 1, "args": { "External id": 276603, "device": 5, "context": 1, "stream": 7, "correlation": 276603, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 276603, "pid": 5, "tid": 7, "ts": 1716454225884718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225830447, "dur": 6, "args": { "External id": 276603, "cbid": 51, "correlation": 276603 } }, { "ph": "s", "id": 276603, "pid": 76337, "tid": -914061504, "ts": 1716454225830447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225884722, "dur": 196, "args": { "External id": 276604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276604, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 276604, "pid": 5, "tid": 7, "ts": 1716454225884722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830455, "dur": 9, "args": { "External id": 276604, "cbid": 211, "correlation": 276604 } }, { "ph": "s", "id": 276604, "pid": 76337, "tid": -914061504, "ts": 1716454225830455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884919, "dur": 6, "args": { "External id": 276606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276606, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276606, "pid": 5, "tid": 7, "ts": 1716454225884919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830468, "dur": 5, "args": { "External id": 276606, "cbid": 211, "correlation": 276606 } }, { "ph": "s", "id": 276606, "pid": 76337, "tid": -914061504, "ts": 1716454225830468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225884926, "dur": 6, "args": { "External id": 276612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276612, "pid": 5, "tid": 7, "ts": 1716454225884926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830495, "dur": 8, "args": { "External id": 276612, "cbid": 211, "correlation": 276612 } }, { "ph": "s", "id": 276612, "pid": 76337, "tid": -914061504, "ts": 1716454225830495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225884934, "dur": 11, "args": { "External id": 276632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276632, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276632, "pid": 5, "tid": 7, "ts": 1716454225884934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830588, "dur": 12, "args": { "External id": 276632, "cbid": 211, "correlation": 276632 } }, { "ph": "s", "id": 276632, "pid": 76337, "tid": -914061504, "ts": 1716454225830588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225884946, "dur": 4, "args": { "External id": 276644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276644, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276644, "pid": 5, "tid": 7, "ts": 1716454225884946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830611, "dur": 6, "args": { "External id": 276644, "cbid": 211, "correlation": 276644 } }, { "ph": "s", "id": 276644, "pid": 76337, "tid": -914061504, "ts": 1716454225830611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225884951, "dur": 8, "args": { "External id": 276647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276647, "pid": 5, "tid": 7, "ts": 1716454225884951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830630, "dur": 7, "args": { "External id": 276647, "cbid": 211, "correlation": 276647 } }, { "ph": "s", "id": 276647, "pid": 76337, "tid": -914061504, "ts": 1716454225830630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225884960, "dur": 5, "args": { "External id": 276656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276656, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276656, "pid": 5, "tid": 7, "ts": 1716454225884960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830670, "dur": 9, "args": { "External id": 276656, "cbid": 211, "correlation": 276656 } }, { "ph": "s", "id": 276656, "pid": 76337, "tid": -914061504, "ts": 1716454225830670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225830723, "dur": 0, "args": { "External id": 276666, "cbid": 317, "correlation": 276666 } }, { "ph": "f", "id": 276666, "pid": 76337, "tid": -914061504, "ts": 1716454225830723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225830724, "dur": 0, "args": { "External id": 276667, "cbid": 203, "correlation": 276667 } }, { "ph": "f", "id": 276667, "pid": 76337, "tid": -914061504, "ts": 1716454225830724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225830724, "dur": 0, "args": { "External id": 276668, "cbid": 205, "correlation": 276668 } }, { "ph": "f", "id": 276668, "pid": 76337, "tid": -914061504, "ts": 1716454225830724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884967, "dur": 5, "args": { "External id": 276672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276672, "pid": 5, "tid": 7, "ts": 1716454225884967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830739, "dur": 11, "args": { "External id": 276672, "cbid": 211, "correlation": 276672 } }, { "ph": "s", "id": 276672, "pid": 76337, "tid": -914061504, "ts": 1716454225830739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225884973, "dur": 155, "args": { "External id": 276674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276674, "pid": 5, "tid": 7, "ts": 1716454225884973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830753, "dur": 5, "args": { "External id": 276674, "cbid": 211, "correlation": 276674 } }, { "ph": "s", "id": 276674, "pid": 76337, "tid": -914061504, "ts": 1716454225830753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225885130, "dur": 1, "args": { "External id": 276676, "device": 5, "context": 1, "stream": 7, "correlation": 276676, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 276676, "pid": 5, "tid": 7, "ts": 1716454225885130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225830764, "dur": 6, "args": { "External id": 276676, "cbid": 51, "correlation": 276676 } }, { "ph": "s", "id": 276676, "pid": 76337, "tid": -914061504, "ts": 1716454225830764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225885134, "dur": 261, "args": { "External id": 276677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276677, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276677, "pid": 5, "tid": 7, "ts": 1716454225885134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830771, "dur": 6, "args": { "External id": 276677, "cbid": 211, "correlation": 276677 } }, { "ph": "s", "id": 276677, "pid": 76337, "tid": -914061504, "ts": 1716454225830771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885396, "dur": 5, "args": { "External id": 276679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276679, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276679, "pid": 5, "tid": 7, "ts": 1716454225885396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830783, "dur": 5, "args": { "External id": 276679, "cbid": 211, "correlation": 276679 } }, { "ph": "s", "id": 276679, "pid": 76337, "tid": -914061504, "ts": 1716454225830783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225885403, "dur": 6, "args": { "External id": 276685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276685, "pid": 5, "tid": 7, "ts": 1716454225885403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830812, "dur": 8, "args": { "External id": 276685, "cbid": 211, "correlation": 276685 } }, { "ph": "s", "id": 276685, "pid": 76337, "tid": -914061504, "ts": 1716454225830812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225885410, "dur": 4, "args": { "External id": 276693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276693, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 276693, "pid": 5, "tid": 7, "ts": 1716454225885410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830854, "dur": 10, "args": { "External id": 276693, "cbid": 211, "correlation": 276693 } }, { "ph": "s", "id": 276693, "pid": 76337, "tid": -914061504, "ts": 1716454225830854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225830922, "dur": 1, "args": { "External id": 276709, "cbid": 251, "correlation": 276709 } }, { "ph": "f", "id": 276709, "pid": 76337, "tid": -914061504, "ts": 1716454225830922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225830927, "dur": 0, "args": { "External id": 276711, "cbid": 251, "correlation": 276711 } }, { "ph": "f", "id": 276711, "pid": 76337, "tid": -914061504, "ts": 1716454225830927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225885415, "dur": 13, "args": { "External id": 276712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276712, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276712, "pid": 5, "tid": 7, "ts": 1716454225885415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830929, "dur": 11, "args": { "External id": 276712, "cbid": 211, "correlation": 276712 } }, { "ph": "s", "id": 276712, "pid": 76337, "tid": -914061504, "ts": 1716454225830929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225885429, "dur": 5, "args": { "External id": 276714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276714, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276714, "pid": 5, "tid": 7, "ts": 1716454225885429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225830942, "dur": 5, "args": { "External id": 276714, "cbid": 211, "correlation": 276714 } }, { "ph": "s", "id": 276714, "pid": 76337, "tid": -914061504, "ts": 1716454225830942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225885436, "dur": 5, "args": { "External id": 276724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276724, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276724, "pid": 5, "tid": 7, "ts": 1716454225885436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831010, "dur": 13, "args": { "External id": 276724, "cbid": 211, "correlation": 276724 } }, { "ph": "s", "id": 276724, "pid": 76337, "tid": -914061504, "ts": 1716454225831010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225885442, "dur": 9, "args": { "External id": 276744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276744, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276744, "pid": 5, "tid": 7, "ts": 1716454225885442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831076, "dur": 11, "args": { "External id": 276744, "cbid": 211, "correlation": 276744 } }, { "ph": "s", "id": 276744, "pid": 76337, "tid": -914061504, "ts": 1716454225831076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225885453, "dur": 4, "args": { "External id": 276756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276756, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276756, "pid": 5, "tid": 7, "ts": 1716454225885453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831097, "dur": 6, "args": { "External id": 276756, "cbid": 211, "correlation": 276756 } }, { "ph": "s", "id": 276756, "pid": 76337, "tid": -914061504, "ts": 1716454225831097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225885458, "dur": 7, "args": { "External id": 276759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276759, "pid": 5, "tid": 7, "ts": 1716454225885458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831117, "dur": 6, "args": { "External id": 276759, "cbid": 211, "correlation": 276759 } }, { "ph": "s", "id": 276759, "pid": 76337, "tid": -914061504, "ts": 1716454225831117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225885466, "dur": 4, "args": { "External id": 276768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276768, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276768, "pid": 5, "tid": 7, "ts": 1716454225885466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831157, "dur": 10, "args": { "External id": 276768, "cbid": 211, "correlation": 276768 } }, { "ph": "s", "id": 276768, "pid": 76337, "tid": -914061504, "ts": 1716454225831157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225831221, "dur": 0, "args": { "External id": 276778, "cbid": 317, "correlation": 276778 } }, { "ph": "f", "id": 276778, "pid": 76337, "tid": -914061504, "ts": 1716454225831221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225831221, "dur": 0, "args": { "External id": 276779, "cbid": 203, "correlation": 276779 } }, { "ph": "f", "id": 276779, "pid": 76337, "tid": -914061504, "ts": 1716454225831221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225831222, "dur": 0, "args": { "External id": 276780, "cbid": 205, "correlation": 276780 } }, { "ph": "f", "id": 276780, "pid": 76337, "tid": -914061504, "ts": 1716454225831222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885472, "dur": 5, "args": { "External id": 276784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276784, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276784, "pid": 5, "tid": 7, "ts": 1716454225885472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831236, "dur": 12, "args": { "External id": 276784, "cbid": 211, "correlation": 276784 } }, { "ph": "s", "id": 276784, "pid": 76337, "tid": -914061504, "ts": 1716454225831236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885478, "dur": 154, "args": { "External id": 276786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276786, "pid": 5, "tid": 7, "ts": 1716454225885478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831251, "dur": 6, "args": { "External id": 276786, "cbid": 211, "correlation": 276786 } }, { "ph": "s", "id": 276786, "pid": 76337, "tid": -914061504, "ts": 1716454225831251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225885634, "dur": 1, "args": { "External id": 276788, "device": 5, "context": 1, "stream": 7, "correlation": 276788, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 276788, "pid": 5, "tid": 7, "ts": 1716454225885634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225831262, "dur": 6, "args": { "External id": 276788, "cbid": 51, "correlation": 276788 } }, { "ph": "s", "id": 276788, "pid": 76337, "tid": -914061504, "ts": 1716454225831262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225885638, "dur": 250, "args": { "External id": 276789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276789, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276789, "pid": 5, "tid": 7, "ts": 1716454225885638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831269, "dur": 6, "args": { "External id": 276789, "cbid": 211, "correlation": 276789 } }, { "ph": "s", "id": 276789, "pid": 76337, "tid": -914061504, "ts": 1716454225831269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885889, "dur": 6, "args": { "External id": 276791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276791, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276791, "pid": 5, "tid": 7, "ts": 1716454225885889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831279, "dur": 5, "args": { "External id": 276791, "cbid": 211, "correlation": 276791 } }, { "ph": "s", "id": 276791, "pid": 76337, "tid": -914061504, "ts": 1716454225831279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225885896, "dur": 6, "args": { "External id": 276797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276797, "pid": 5, "tid": 7, "ts": 1716454225885896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831308, "dur": 8, "args": { "External id": 276797, "cbid": 211, "correlation": 276797 } }, { "ph": "s", "id": 276797, "pid": 76337, "tid": -914061504, "ts": 1716454225831308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225885903, "dur": 5, "args": { "External id": 276805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276805, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276805, "pid": 5, "tid": 7, "ts": 1716454225885903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831340, "dur": 8, "args": { "External id": 276805, "cbid": 211, "correlation": 276805 } }, { "ph": "s", "id": 276805, "pid": 76337, "tid": -914061504, "ts": 1716454225831340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225885910, "dur": 4, "args": { "External id": 276813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276813, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276813, "pid": 5, "tid": 7, "ts": 1716454225885910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831369, "dur": 8, "args": { "External id": 276813, "cbid": 211, "correlation": 276813 } }, { "ph": "s", "id": 276813, "pid": 76337, "tid": -914061504, "ts": 1716454225831369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225885915, "dur": 9, "args": { "External id": 276833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276833, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276833, "pid": 5, "tid": 7, "ts": 1716454225885915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831442, "dur": 13, "args": { "External id": 276833, "cbid": 211, "correlation": 276833 } }, { "ph": "s", "id": 276833, "pid": 76337, "tid": -914061504, "ts": 1716454225831442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225885925, "dur": 4, "args": { "External id": 276845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276845, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276845, "pid": 5, "tid": 7, "ts": 1716454225885925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831465, "dur": 6, "args": { "External id": 276845, "cbid": 211, "correlation": 276845 } }, { "ph": "s", "id": 276845, "pid": 76337, "tid": -914061504, "ts": 1716454225831465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225885930, "dur": 6, "args": { "External id": 276848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276848, "pid": 5, "tid": 7, "ts": 1716454225885930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831482, "dur": 6, "args": { "External id": 276848, "cbid": 211, "correlation": 276848 } }, { "ph": "s", "id": 276848, "pid": 76337, "tid": -914061504, "ts": 1716454225831482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225885938, "dur": 4, "args": { "External id": 276857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276857, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276857, "pid": 5, "tid": 7, "ts": 1716454225885938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831520, "dur": 9, "args": { "External id": 276857, "cbid": 211, "correlation": 276857 } }, { "ph": "s", "id": 276857, "pid": 76337, "tid": -914061504, "ts": 1716454225831520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225831570, "dur": 0, "args": { "External id": 276867, "cbid": 317, "correlation": 276867 } }, { "ph": "f", "id": 276867, "pid": 76337, "tid": -914061504, "ts": 1716454225831570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225831571, "dur": 0, "args": { "External id": 276868, "cbid": 203, "correlation": 276868 } }, { "ph": "f", "id": 276868, "pid": 76337, "tid": -914061504, "ts": 1716454225831571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225831571, "dur": 0, "args": { "External id": 276869, "cbid": 205, "correlation": 276869 } }, { "ph": "f", "id": 276869, "pid": 76337, "tid": -914061504, "ts": 1716454225831571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885943, "dur": 5, "args": { "External id": 276873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276873, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276873, "pid": 5, "tid": 7, "ts": 1716454225885943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831584, "dur": 11, "args": { "External id": 276873, "cbid": 211, "correlation": 276873 } }, { "ph": "s", "id": 276873, "pid": 76337, "tid": -914061504, "ts": 1716454225831584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225885949, "dur": 155, "args": { "External id": 276875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276875, "pid": 5, "tid": 7, "ts": 1716454225885949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831597, "dur": 6, "args": { "External id": 276875, "cbid": 211, "correlation": 276875 } }, { "ph": "s", "id": 276875, "pid": 76337, "tid": -914061504, "ts": 1716454225831597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225886106, "dur": 1, "args": { "External id": 276877, "device": 5, "context": 1, "stream": 7, "correlation": 276877, "bytes": 240, "memory bandwidth (GB/s)": 0.1596806387225549 } }, { "ph": "f", "id": 276877, "pid": 5, "tid": 7, "ts": 1716454225886106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225831609, "dur": 6, "args": { "External id": 276877, "cbid": 51, "correlation": 276877 } }, { "ph": "s", "id": 276877, "pid": 76337, "tid": -914061504, "ts": 1716454225831609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225886110, "dur": 250, "args": { "External id": 276878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276878, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276878, "pid": 5, "tid": 7, "ts": 1716454225886110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831616, "dur": 6, "args": { "External id": 276878, "cbid": 211, "correlation": 276878 } }, { "ph": "s", "id": 276878, "pid": 76337, "tid": -914061504, "ts": 1716454225831616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886361, "dur": 5, "args": { "External id": 276880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276880, "pid": 5, "tid": 7, "ts": 1716454225886361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831625, "dur": 5, "args": { "External id": 276880, "cbid": 211, "correlation": 276880 } }, { "ph": "s", "id": 276880, "pid": 76337, "tid": -914061504, "ts": 1716454225831625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225886368, "dur": 6, "args": { "External id": 276886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276886, "pid": 5, "tid": 7, "ts": 1716454225886368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831653, "dur": 9, "args": { "External id": 276886, "cbid": 211, "correlation": 276886 } }, { "ph": "s", "id": 276886, "pid": 76337, "tid": -914061504, "ts": 1716454225831653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225886375, "dur": 3, "args": { "External id": 276894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276894, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 276894, "pid": 5, "tid": 7, "ts": 1716454225886375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831696, "dur": 10, "args": { "External id": 276894, "cbid": 211, "correlation": 276894 } }, { "ph": "s", "id": 276894, "pid": 76337, "tid": -914061504, "ts": 1716454225831696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225831758, "dur": 1, "args": { "External id": 276910, "cbid": 251, "correlation": 276910 } }, { "ph": "f", "id": 276910, "pid": 76337, "tid": -914061504, "ts": 1716454225831758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225831763, "dur": 0, "args": { "External id": 276912, "cbid": 251, "correlation": 276912 } }, { "ph": "f", "id": 276912, "pid": 76337, "tid": -914061504, "ts": 1716454225831763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225886380, "dur": 10, "args": { "External id": 276913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276913, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276913, "pid": 5, "tid": 7, "ts": 1716454225886380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831765, "dur": 11, "args": { "External id": 276913, "cbid": 211, "correlation": 276913 } }, { "ph": "s", "id": 276913, "pid": 76337, "tid": -914061504, "ts": 1716454225831765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225886390, "dur": 3, "args": { "External id": 276915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276915, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276915, "pid": 5, "tid": 7, "ts": 1716454225886390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831778, "dur": 5, "args": { "External id": 276915, "cbid": 211, "correlation": 276915 } }, { "ph": "s", "id": 276915, "pid": 76337, "tid": -914061504, "ts": 1716454225831778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225886395, "dur": 5, "args": { "External id": 276925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276925, "pid": 5, "tid": 7, "ts": 1716454225886395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831834, "dur": 12, "args": { "External id": 276925, "cbid": 211, "correlation": 276925 } }, { "ph": "s", "id": 276925, "pid": 76337, "tid": -914061504, "ts": 1716454225831834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225886402, "dur": 9, "args": { "External id": 276945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276945, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 276945, "pid": 5, "tid": 7, "ts": 1716454225886402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831898, "dur": 11, "args": { "External id": 276945, "cbid": 211, "correlation": 276945 } }, { "ph": "s", "id": 276945, "pid": 76337, "tid": -914061504, "ts": 1716454225831898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225886412, "dur": 4, "args": { "External id": 276957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276957, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 276957, "pid": 5, "tid": 7, "ts": 1716454225886412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831919, "dur": 6, "args": { "External id": 276957, "cbid": 211, "correlation": 276957 } }, { "ph": "s", "id": 276957, "pid": 76337, "tid": -914061504, "ts": 1716454225831919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225886417, "dur": 7, "args": { "External id": 276960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276960, "pid": 5, "tid": 7, "ts": 1716454225886417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831938, "dur": 6, "args": { "External id": 276960, "cbid": 211, "correlation": 276960 } }, { "ph": "s", "id": 276960, "pid": 76337, "tid": -914061504, "ts": 1716454225831938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225886425, "dur": 4, "args": { "External id": 276969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276969, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276969, "pid": 5, "tid": 7, "ts": 1716454225886425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225831988, "dur": 11, "args": { "External id": 276969, "cbid": 211, "correlation": 276969 } }, { "ph": "s", "id": 276969, "pid": 76337, "tid": -914061504, "ts": 1716454225831988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225832053, "dur": 0, "args": { "External id": 276979, "cbid": 317, "correlation": 276979 } }, { "ph": "f", "id": 276979, "pid": 76337, "tid": -914061504, "ts": 1716454225832053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225832054, "dur": 0, "args": { "External id": 276980, "cbid": 203, "correlation": 276980 } }, { "ph": "f", "id": 276980, "pid": 76337, "tid": -914061504, "ts": 1716454225832054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225832055, "dur": 0, "args": { "External id": 276981, "cbid": 205, "correlation": 276981 } }, { "ph": "f", "id": 276981, "pid": 76337, "tid": -914061504, "ts": 1716454225832055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886431, "dur": 5, "args": { "External id": 276985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276985, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276985, "pid": 5, "tid": 7, "ts": 1716454225886431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832069, "dur": 12, "args": { "External id": 276985, "cbid": 211, "correlation": 276985 } }, { "ph": "s", "id": 276985, "pid": 76337, "tid": -914061504, "ts": 1716454225832069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886437, "dur": 155, "args": { "External id": 276987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276987, "pid": 5, "tid": 7, "ts": 1716454225886437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832083, "dur": 5, "args": { "External id": 276987, "cbid": 211, "correlation": 276987 } }, { "ph": "s", "id": 276987, "pid": 76337, "tid": -914061504, "ts": 1716454225832083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225886594, "dur": 1, "args": { "External id": 276989, "device": 5, "context": 1, "stream": 7, "correlation": 276989, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 276989, "pid": 5, "tid": 7, "ts": 1716454225886594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225832094, "dur": 6, "args": { "External id": 276989, "cbid": 51, "correlation": 276989 } }, { "ph": "s", "id": 276989, "pid": 76337, "tid": -914061504, "ts": 1716454225832094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225886597, "dur": 250, "args": { "External id": 276990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276990, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 276990, "pid": 5, "tid": 7, "ts": 1716454225886597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832101, "dur": 6, "args": { "External id": 276990, "cbid": 211, "correlation": 276990 } }, { "ph": "s", "id": 276990, "pid": 76337, "tid": -914061504, "ts": 1716454225832101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886849, "dur": 6, "args": { "External id": 276992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 276992, "pid": 5, "tid": 7, "ts": 1716454225886849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832112, "dur": 5, "args": { "External id": 276992, "cbid": 211, "correlation": 276992 } }, { "ph": "s", "id": 276992, "pid": 76337, "tid": -914061504, "ts": 1716454225832112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225886856, "dur": 6, "args": { "External id": 276998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 276998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 276998, "pid": 5, "tid": 7, "ts": 1716454225886856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832140, "dur": 8, "args": { "External id": 276998, "cbid": 211, "correlation": 276998 } }, { "ph": "s", "id": 276998, "pid": 76337, "tid": -914061504, "ts": 1716454225832140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225886863, "dur": 5, "args": { "External id": 277006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277006, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277006, "pid": 5, "tid": 7, "ts": 1716454225886863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832173, "dur": 8, "args": { "External id": 277006, "cbid": 211, "correlation": 277006 } }, { "ph": "s", "id": 277006, "pid": 76337, "tid": -914061504, "ts": 1716454225832173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225886869, "dur": 5, "args": { "External id": 277014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277014, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277014, "pid": 5, "tid": 7, "ts": 1716454225886869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832202, "dur": 9, "args": { "External id": 277014, "cbid": 211, "correlation": 277014 } }, { "ph": "s", "id": 277014, "pid": 76337, "tid": -914061504, "ts": 1716454225832202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225886875, "dur": 9, "args": { "External id": 277034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277034, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277034, "pid": 5, "tid": 7, "ts": 1716454225886875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832305, "dur": 13, "args": { "External id": 277034, "cbid": 211, "correlation": 277034 } }, { "ph": "s", "id": 277034, "pid": 76337, "tid": -914061504, "ts": 1716454225832305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225886886, "dur": 3, "args": { "External id": 277046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277046, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 277046, "pid": 5, "tid": 7, "ts": 1716454225886886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832328, "dur": 6, "args": { "External id": 277046, "cbid": 211, "correlation": 277046 } }, { "ph": "s", "id": 277046, "pid": 76337, "tid": -914061504, "ts": 1716454225832328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225886890, "dur": 6, "args": { "External id": 277049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277049, "pid": 5, "tid": 7, "ts": 1716454225886890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832346, "dur": 6, "args": { "External id": 277049, "cbid": 211, "correlation": 277049 } }, { "ph": "s", "id": 277049, "pid": 76337, "tid": -914061504, "ts": 1716454225832346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225886898, "dur": 4, "args": { "External id": 277058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277058, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277058, "pid": 5, "tid": 7, "ts": 1716454225886898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832384, "dur": 9, "args": { "External id": 277058, "cbid": 211, "correlation": 277058 } }, { "ph": "s", "id": 277058, "pid": 76337, "tid": -914061504, "ts": 1716454225832384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225832436, "dur": 0, "args": { "External id": 277068, "cbid": 317, "correlation": 277068 } }, { "ph": "f", "id": 277068, "pid": 76337, "tid": -914061504, "ts": 1716454225832436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225832437, "dur": 0, "args": { "External id": 277069, "cbid": 203, "correlation": 277069 } }, { "ph": "f", "id": 277069, "pid": 76337, "tid": -914061504, "ts": 1716454225832437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225832437, "dur": 0, "args": { "External id": 277070, "cbid": 205, "correlation": 277070 } }, { "ph": "f", "id": 277070, "pid": 76337, "tid": -914061504, "ts": 1716454225832437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886903, "dur": 5, "args": { "External id": 277074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277074, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277074, "pid": 5, "tid": 7, "ts": 1716454225886903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832450, "dur": 11, "args": { "External id": 277074, "cbid": 211, "correlation": 277074 } }, { "ph": "s", "id": 277074, "pid": 76337, "tid": -914061504, "ts": 1716454225832450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225886910, "dur": 155, "args": { "External id": 277076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277076, "pid": 5, "tid": 7, "ts": 1716454225886910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832465, "dur": 6, "args": { "External id": 277076, "cbid": 211, "correlation": 277076 } }, { "ph": "s", "id": 277076, "pid": 76337, "tid": -914061504, "ts": 1716454225832465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225887067, "dur": 1, "args": { "External id": 277078, "device": 5, "context": 1, "stream": 7, "correlation": 277078, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 277078, "pid": 5, "tid": 7, "ts": 1716454225887067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225832476, "dur": 6, "args": { "External id": 277078, "cbid": 51, "correlation": 277078 } }, { "ph": "s", "id": 277078, "pid": 76337, "tid": -914061504, "ts": 1716454225832476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225887070, "dur": 250, "args": { "External id": 277079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277079, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277079, "pid": 5, "tid": 7, "ts": 1716454225887070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832483, "dur": 6, "args": { "External id": 277079, "cbid": 211, "correlation": 277079 } }, { "ph": "s", "id": 277079, "pid": 76337, "tid": -914061504, "ts": 1716454225832483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887321, "dur": 5, "args": { "External id": 277081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277081, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277081, "pid": 5, "tid": 7, "ts": 1716454225887321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832493, "dur": 5, "args": { "External id": 277081, "cbid": 211, "correlation": 277081 } }, { "ph": "s", "id": 277081, "pid": 76337, "tid": -914061504, "ts": 1716454225832493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887328, "dur": 6, "args": { "External id": 277087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277087, "pid": 5, "tid": 7, "ts": 1716454225887328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832521, "dur": 8, "args": { "External id": 277087, "cbid": 211, "correlation": 277087 } }, { "ph": "s", "id": 277087, "pid": 76337, "tid": -914061504, "ts": 1716454225832521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225887335, "dur": 3, "args": { "External id": 277095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277095, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 277095, "pid": 5, "tid": 7, "ts": 1716454225887335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832564, "dur": 10, "args": { "External id": 277095, "cbid": 211, "correlation": 277095 } }, { "ph": "s", "id": 277095, "pid": 76337, "tid": -914061504, "ts": 1716454225832564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225832625, "dur": 1, "args": { "External id": 277111, "cbid": 251, "correlation": 277111 } }, { "ph": "f", "id": 277111, "pid": 76337, "tid": -914061504, "ts": 1716454225832625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225832631, "dur": 0, "args": { "External id": 277113, "cbid": 251, "correlation": 277113 } }, { "ph": "f", "id": 277113, "pid": 76337, "tid": -914061504, "ts": 1716454225832631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225887340, "dur": 10, "args": { "External id": 277114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277114, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277114, "pid": 5, "tid": 7, "ts": 1716454225887340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832632, "dur": 11, "args": { "External id": 277114, "cbid": 211, "correlation": 277114 } }, { "ph": "s", "id": 277114, "pid": 76337, "tid": -914061504, "ts": 1716454225832632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225887351, "dur": 3, "args": { "External id": 277116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277116, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277116, "pid": 5, "tid": 7, "ts": 1716454225887351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832645, "dur": 5, "args": { "External id": 277116, "cbid": 211, "correlation": 277116 } }, { "ph": "s", "id": 277116, "pid": 76337, "tid": -914061504, "ts": 1716454225832645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887356, "dur": 5, "args": { "External id": 277126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277126, "pid": 5, "tid": 7, "ts": 1716454225887356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832701, "dur": 12, "args": { "External id": 277126, "cbid": 211, "correlation": 277126 } }, { "ph": "s", "id": 277126, "pid": 76337, "tid": -914061504, "ts": 1716454225832701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225887362, "dur": 9, "args": { "External id": 277146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277146, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277146, "pid": 5, "tid": 7, "ts": 1716454225887362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832766, "dur": 11, "args": { "External id": 277146, "cbid": 211, "correlation": 277146 } }, { "ph": "s", "id": 277146, "pid": 76337, "tid": -914061504, "ts": 1716454225832766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225887373, "dur": 4, "args": { "External id": 277158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277158, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 277158, "pid": 5, "tid": 7, "ts": 1716454225887373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832786, "dur": 6, "args": { "External id": 277158, "cbid": 211, "correlation": 277158 } }, { "ph": "s", "id": 277158, "pid": 76337, "tid": -914061504, "ts": 1716454225832786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887378, "dur": 6, "args": { "External id": 277161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277161, "pid": 5, "tid": 7, "ts": 1716454225887378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832805, "dur": 6, "args": { "External id": 277161, "cbid": 211, "correlation": 277161 } }, { "ph": "s", "id": 277161, "pid": 76337, "tid": -914061504, "ts": 1716454225832805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225887385, "dur": 4, "args": { "External id": 277170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277170, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277170, "pid": 5, "tid": 7, "ts": 1716454225887385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832845, "dur": 10, "args": { "External id": 277170, "cbid": 211, "correlation": 277170 } }, { "ph": "s", "id": 277170, "pid": 76337, "tid": -914061504, "ts": 1716454225832845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225832908, "dur": 0, "args": { "External id": 277180, "cbid": 317, "correlation": 277180 } }, { "ph": "f", "id": 277180, "pid": 76337, "tid": -914061504, "ts": 1716454225832908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225832909, "dur": 0, "args": { "External id": 277181, "cbid": 203, "correlation": 277181 } }, { "ph": "f", "id": 277181, "pid": 76337, "tid": -914061504, "ts": 1716454225832909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225832910, "dur": 0, "args": { "External id": 277182, "cbid": 205, "correlation": 277182 } }, { "ph": "f", "id": 277182, "pid": 76337, "tid": -914061504, "ts": 1716454225832910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887391, "dur": 5, "args": { "External id": 277186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277186, "pid": 5, "tid": 7, "ts": 1716454225887391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832924, "dur": 12, "args": { "External id": 277186, "cbid": 211, "correlation": 277186 } }, { "ph": "s", "id": 277186, "pid": 76337, "tid": -914061504, "ts": 1716454225832924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887397, "dur": 155, "args": { "External id": 277188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277188, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277188, "pid": 5, "tid": 7, "ts": 1716454225887397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832938, "dur": 5, "args": { "External id": 277188, "cbid": 211, "correlation": 277188 } }, { "ph": "s", "id": 277188, "pid": 76337, "tid": -914061504, "ts": 1716454225832938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225887555, "dur": 1, "args": { "External id": 277190, "device": 5, "context": 1, "stream": 7, "correlation": 277190, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 277190, "pid": 5, "tid": 7, "ts": 1716454225887555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225832949, "dur": 6, "args": { "External id": 277190, "cbid": 51, "correlation": 277190 } }, { "ph": "s", "id": 277190, "pid": 76337, "tid": -914061504, "ts": 1716454225832949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225887558, "dur": 250, "args": { "External id": 277191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277191, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277191, "pid": 5, "tid": 7, "ts": 1716454225887558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832956, "dur": 6, "args": { "External id": 277191, "cbid": 211, "correlation": 277191 } }, { "ph": "s", "id": 277191, "pid": 76337, "tid": -914061504, "ts": 1716454225832956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887810, "dur": 6, "args": { "External id": 277193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277193, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277193, "pid": 5, "tid": 7, "ts": 1716454225887810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225832965, "dur": 5, "args": { "External id": 277193, "cbid": 211, "correlation": 277193 } }, { "ph": "s", "id": 277193, "pid": 76337, "tid": -914061504, "ts": 1716454225832965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887817, "dur": 6, "args": { "External id": 277199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277199, "pid": 5, "tid": 7, "ts": 1716454225887817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833001, "dur": 9, "args": { "External id": 277199, "cbid": 211, "correlation": 277199 } }, { "ph": "s", "id": 277199, "pid": 76337, "tid": -914061504, "ts": 1716454225833001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225887824, "dur": 5, "args": { "External id": 277207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277207, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277207, "pid": 5, "tid": 7, "ts": 1716454225887824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833035, "dur": 8, "args": { "External id": 277207, "cbid": 211, "correlation": 277207 } }, { "ph": "s", "id": 277207, "pid": 76337, "tid": -914061504, "ts": 1716454225833035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225887830, "dur": 4, "args": { "External id": 277215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277215, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277215, "pid": 5, "tid": 7, "ts": 1716454225887830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833064, "dur": 9, "args": { "External id": 277215, "cbid": 211, "correlation": 277215 } }, { "ph": "s", "id": 277215, "pid": 76337, "tid": -914061504, "ts": 1716454225833064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225887836, "dur": 9, "args": { "External id": 277235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277235, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277235, "pid": 5, "tid": 7, "ts": 1716454225887836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833207, "dur": 14, "args": { "External id": 277235, "cbid": 211, "correlation": 277235 } }, { "ph": "s", "id": 277235, "pid": 76337, "tid": -914061504, "ts": 1716454225833207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225887846, "dur": 4, "args": { "External id": 277247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277247, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 277247, "pid": 5, "tid": 7, "ts": 1716454225887846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833233, "dur": 6, "args": { "External id": 277247, "cbid": 211, "correlation": 277247 } }, { "ph": "s", "id": 277247, "pid": 76337, "tid": -914061504, "ts": 1716454225833233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887851, "dur": 6, "args": { "External id": 277250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277250, "pid": 5, "tid": 7, "ts": 1716454225887851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833252, "dur": 6, "args": { "External id": 277250, "cbid": 211, "correlation": 277250 } }, { "ph": "s", "id": 277250, "pid": 76337, "tid": -914061504, "ts": 1716454225833252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225833311, "dur": 0, "args": { "External id": 277261, "cbid": 317, "correlation": 277261 } }, { "ph": "f", "id": 277261, "pid": 76337, "tid": -914061504, "ts": 1716454225833311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225833312, "dur": 0, "args": { "External id": 277262, "cbid": 203, "correlation": 277262 } }, { "ph": "f", "id": 277262, "pid": 76337, "tid": -914061504, "ts": 1716454225833312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225833313, "dur": 0, "args": { "External id": 277263, "cbid": 205, "correlation": 277263 } }, { "ph": "f", "id": 277263, "pid": 76337, "tid": -914061504, "ts": 1716454225833313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887858, "dur": 5, "args": { "External id": 277267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277267, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277267, "pid": 5, "tid": 7, "ts": 1716454225887858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833330, "dur": 12, "args": { "External id": 277267, "cbid": 211, "correlation": 277267 } }, { "ph": "s", "id": 277267, "pid": 76337, "tid": -914061504, "ts": 1716454225833330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225887864, "dur": 35, "args": { "External id": 277269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277269, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 277269, "pid": 5, "tid": 7, "ts": 1716454225887864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833352, "dur": 10, "args": { "External id": 277269, "cbid": 211, "correlation": 277269 } }, { "ph": "s", "id": 277269, "pid": 76337, "tid": -914061504, "ts": 1716454225833352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225887901, "dur": 5, "args": { "External id": 277271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277271, "pid": 5, "tid": 7, "ts": 1716454225887901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833366, "dur": 5, "args": { "External id": 277271, "cbid": 211, "correlation": 277271 } }, { "ph": "s", "id": 277271, "pid": 76337, "tid": -914061504, "ts": 1716454225833366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225887908, "dur": 6, "args": { "External id": 277277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277277, "pid": 5, "tid": 7, "ts": 1716454225887908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833393, "dur": 8, "args": { "External id": 277277, "cbid": 211, "correlation": 277277 } }, { "ph": "s", "id": 277277, "pid": 76337, "tid": -914061504, "ts": 1716454225833393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225887915, "dur": 19, "args": { "External id": 277286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277286, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277286, "pid": 5, "tid": 7, "ts": 1716454225887915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833476, "dur": 14, "args": { "External id": 277286, "cbid": 211, "correlation": 277286 } }, { "ph": "s", "id": 277286, "pid": 76337, "tid": -914061504, "ts": 1716454225833476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225887936, "dur": 10, "args": { "External id": 277308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277308, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 277308, "pid": 5, "tid": 7, "ts": 1716454225887936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833534, "dur": 10, "args": { "External id": 277308, "cbid": 211, "correlation": 277308 } }, { "ph": "s", "id": 277308, "pid": 76337, "tid": -914061504, "ts": 1716454225833534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833625, "dur": 2, "args": { "External id": 277319, "cbid": 251, "correlation": 277319 } }, { "ph": "f", "id": 277319, "pid": 76337, "tid": -914061504, "ts": 1716454225833625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833630, "dur": 0, "args": { "External id": 277320, "cbid": 251, "correlation": 277320 } }, { "ph": "f", "id": 277320, "pid": 76337, "tid": -914061504, "ts": 1716454225833630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225887947, "dur": 51, "args": { "External id": 277321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277321, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 277321, "pid": 5, "tid": 7, "ts": 1716454225887947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833633, "dur": 14, "args": { "External id": 277321, "cbid": 211, "correlation": 277321 } }, { "ph": "s", "id": 277321, "pid": 76337, "tid": -914061504, "ts": 1716454225833633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833708, "dur": 1, "args": { "External id": 277332, "cbid": 251, "correlation": 277332 } }, { "ph": "f", "id": 277332, "pid": 76337, "tid": -914061504, "ts": 1716454225833708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833712, "dur": 0, "args": { "External id": 277333, "cbid": 251, "correlation": 277333 } }, { "ph": "f", "id": 277333, "pid": 76337, "tid": -914061504, "ts": 1716454225833712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225887999, "dur": 51, "args": { "External id": 277334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277334, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 277334, "pid": 5, "tid": 7, "ts": 1716454225887999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833714, "dur": 12, "args": { "External id": 277334, "cbid": 211, "correlation": 277334 } }, { "ph": "s", "id": 277334, "pid": 76337, "tid": -914061504, "ts": 1716454225833714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833780, "dur": 1, "args": { "External id": 277345, "cbid": 251, "correlation": 277345 } }, { "ph": "f", "id": 277345, "pid": 76337, "tid": -914061504, "ts": 1716454225833780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833783, "dur": 0, "args": { "External id": 277346, "cbid": 251, "correlation": 277346 } }, { "ph": "f", "id": 277346, "pid": 76337, "tid": -914061504, "ts": 1716454225833783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225888052, "dur": 51, "args": { "External id": 277347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277347, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 277347, "pid": 5, "tid": 7, "ts": 1716454225888052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833785, "dur": 11, "args": { "External id": 277347, "cbid": 211, "correlation": 277347 } }, { "ph": "s", "id": 277347, "pid": 76337, "tid": -914061504, "ts": 1716454225833785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225888104, "dur": 53, "args": { "External id": 277372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277372, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277372, "pid": 5, "tid": 7, "ts": 1716454225888104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833870, "dur": 13, "args": { "External id": 277372, "cbid": 211, "correlation": 277372 } }, { "ph": "s", "id": 277372, "pid": 76337, "tid": -914061504, "ts": 1716454225833870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225833972, "dur": 8, "args": { "External id": 277390, "cbid": 251, "correlation": 277390 } }, { "ph": "f", "id": 277390, "pid": 76337, "tid": -914061504, "ts": 1716454225833972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225888159, "dur": 60, "args": { "External id": 277392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277392, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 277392, "pid": 5, "tid": 7, "ts": 1716454225888159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225833985, "dur": 14, "args": { "External id": 277392, "cbid": 211, "correlation": 277392 } }, { "ph": "s", "id": 277392, "pid": 76337, "tid": -914061504, "ts": 1716454225833985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888220, "dur": 6, "args": { "External id": 277400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277400, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277400, "pid": 5, "tid": 7, "ts": 1716454225888220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834057, "dur": 12, "args": { "External id": 277400, "cbid": 211, "correlation": 277400 } }, { "ph": "s", "id": 277400, "pid": 76337, "tid": -914061504, "ts": 1716454225834057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888227, "dur": 7, "args": { "External id": 277408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277408, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277408, "pid": 5, "tid": 7, "ts": 1716454225888227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834095, "dur": 9, "args": { "External id": 277408, "cbid": 211, "correlation": 277408 } }, { "ph": "s", "id": 277408, "pid": 76337, "tid": -914061504, "ts": 1716454225834095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888236, "dur": 7, "args": { "External id": 277419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277419, "pid": 5, "tid": 7, "ts": 1716454225888236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834168, "dur": 12, "args": { "External id": 277419, "cbid": 211, "correlation": 277419 } }, { "ph": "s", "id": 277419, "pid": 76337, "tid": -914061504, "ts": 1716454225834168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225888245, "dur": 8, "args": { "External id": 277441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277441, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 277441, "pid": 5, "tid": 7, "ts": 1716454225888245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834203, "dur": 8, "args": { "External id": 277441, "cbid": 211, "correlation": 277441 } }, { "ph": "s", "id": 277441, "pid": 76337, "tid": -914061504, "ts": 1716454225834203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834290, "dur": 5, "args": { "External id": 277452, "cbid": 251, "correlation": 277452 } }, { "ph": "f", "id": 277452, "pid": 76337, "tid": -914061504, "ts": 1716454225834290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225888255, "dur": 1, "args": { "External id": 277453, "device": 5, "context": 1, "stream": 7, "correlation": 277453, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 277453, "pid": 5, "tid": 7, "ts": 1716454225888255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225834298, "dur": 11, "args": { "External id": 277453, "cbid": 51, "correlation": 277453 } }, { "ph": "s", "id": 277453, "pid": 76337, "tid": -914061504, "ts": 1716454225834298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225888258, "dur": 35, "args": { "External id": 277454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277454, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 277454, "pid": 5, "tid": 7, "ts": 1716454225888258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834311, "dur": 13, "args": { "External id": 277454, "cbid": 211, "correlation": 277454 } }, { "ph": "s", "id": 277454, "pid": 76337, "tid": -914061504, "ts": 1716454225834311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834383, "dur": 1, "args": { "External id": 277465, "cbid": 251, "correlation": 277465 } }, { "ph": "f", "id": 277465, "pid": 76337, "tid": -914061504, "ts": 1716454225834383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834387, "dur": 0, "args": { "External id": 277466, "cbid": 251, "correlation": 277466 } }, { "ph": "f", "id": 277466, "pid": 76337, "tid": -914061504, "ts": 1716454225834387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225888295, "dur": 12, "args": { "External id": 277467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277467, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277467, "pid": 5, "tid": 7, "ts": 1716454225888295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834388, "dur": 12, "args": { "External id": 277467, "cbid": 211, "correlation": 277467 } }, { "ph": "s", "id": 277467, "pid": 76337, "tid": -914061504, "ts": 1716454225834388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225888308, "dur": 5, "args": { "External id": 277469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277469, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277469, "pid": 5, "tid": 7, "ts": 1716454225888308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834403, "dur": 6, "args": { "External id": 277469, "cbid": 211, "correlation": 277469 } }, { "ph": "s", "id": 277469, "pid": 76337, "tid": -914061504, "ts": 1716454225834403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834460, "dur": 1, "args": { "External id": 277480, "cbid": 251, "correlation": 277480 } }, { "ph": "f", "id": 277480, "pid": 76337, "tid": -914061504, "ts": 1716454225834460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834464, "dur": 0, "args": { "External id": 277481, "cbid": 251, "correlation": 277481 } }, { "ph": "f", "id": 277481, "pid": 76337, "tid": -914061504, "ts": 1716454225834464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225888314, "dur": 8, "args": { "External id": 277482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277482, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277482, "pid": 5, "tid": 7, "ts": 1716454225888314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834465, "dur": 11, "args": { "External id": 277482, "cbid": 211, "correlation": 277482 } }, { "ph": "s", "id": 277482, "pid": 76337, "tid": -914061504, "ts": 1716454225834465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225888324, "dur": 3, "args": { "External id": 277484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277484, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277484, "pid": 5, "tid": 7, "ts": 1716454225888324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834478, "dur": 5, "args": { "External id": 277484, "cbid": 211, "correlation": 277484 } }, { "ph": "s", "id": 277484, "pid": 76337, "tid": -914061504, "ts": 1716454225834478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225888329, "dur": 20, "args": { "External id": 277509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277509, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 277509, "pid": 5, "tid": 7, "ts": 1716454225888329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834557, "dur": 12, "args": { "External id": 277509, "cbid": 211, "correlation": 277509 } }, { "ph": "s", "id": 277509, "pid": 76337, "tid": -914061504, "ts": 1716454225834557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834660, "dur": 3, "args": { "External id": 277527, "cbid": 251, "correlation": 277527 } }, { "ph": "f", "id": 277527, "pid": 76337, "tid": -914061504, "ts": 1716454225834660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225888351, "dur": 1, "args": { "External id": 277529, "device": 5, "context": 1, "stream": 7, "correlation": 277529, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 277529, "pid": 5, "tid": 7, "ts": 1716454225888351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225834667, "dur": 10, "args": { "External id": 277529, "cbid": 51, "correlation": 277529 } }, { "ph": "s", "id": 277529, "pid": 76337, "tid": -914061504, "ts": 1716454225834667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225888354, "dur": 34, "args": { "External id": 277530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277530, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 277530, "pid": 5, "tid": 7, "ts": 1716454225888354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834679, "dur": 12, "args": { "External id": 277530, "cbid": 211, "correlation": 277530 } }, { "ph": "s", "id": 277530, "pid": 76337, "tid": -914061504, "ts": 1716454225834679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888390, "dur": 4, "args": { "External id": 277538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277538, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277538, "pid": 5, "tid": 7, "ts": 1716454225888390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834750, "dur": 13, "args": { "External id": 277538, "cbid": 211, "correlation": 277538 } }, { "ph": "s", "id": 277538, "pid": 76337, "tid": -914061504, "ts": 1716454225834750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888395, "dur": 8, "args": { "External id": 277546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277546, "pid": 5, "tid": 7, "ts": 1716454225888395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834793, "dur": 10, "args": { "External id": 277546, "cbid": 211, "correlation": 277546 } }, { "ph": "s", "id": 277546, "pid": 76337, "tid": -914061504, "ts": 1716454225834793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225888405, "dur": 8, "args": { "External id": 277568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277568, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 277568, "pid": 5, "tid": 7, "ts": 1716454225888405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834844, "dur": 10, "args": { "External id": 277568, "cbid": 211, "correlation": 277568 } }, { "ph": "s", "id": 277568, "pid": 76337, "tid": -914061504, "ts": 1716454225834844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834936, "dur": 1, "args": { "External id": 277584, "cbid": 251, "correlation": 277584 } }, { "ph": "f", "id": 277584, "pid": 76337, "tid": -914061504, "ts": 1716454225834936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225834941, "dur": 0, "args": { "External id": 277586, "cbid": 251, "correlation": 277586 } }, { "ph": "f", "id": 277586, "pid": 76337, "tid": -914061504, "ts": 1716454225834941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225888414, "dur": 186, "args": { "External id": 277587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277587, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277587, "pid": 5, "tid": 7, "ts": 1716454225888414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225834943, "dur": 13, "args": { "External id": 277587, "cbid": 211, "correlation": 277587 } }, { "ph": "s", "id": 277587, "pid": 76337, "tid": -914061504, "ts": 1716454225834943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888601, "dur": 20, "args": { "External id": 277595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277595, "pid": 5, "tid": 7, "ts": 1716454225888601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835019, "dur": 13, "args": { "External id": 277595, "cbid": 211, "correlation": 277595 } }, { "ph": "s", "id": 277595, "pid": 76337, "tid": -914061504, "ts": 1716454225835019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888622, "dur": 21, "args": { "External id": 277603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277603, "pid": 5, "tid": 7, "ts": 1716454225888622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835051, "dur": 9, "args": { "External id": 277603, "cbid": 211, "correlation": 277603 } }, { "ph": "s", "id": 277603, "pid": 76337, "tid": -914061504, "ts": 1716454225835051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225835133, "dur": 1, "args": { "External id": 277619, "cbid": 251, "correlation": 277619 } }, { "ph": "f", "id": 277619, "pid": 76337, "tid": -914061504, "ts": 1716454225835133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225888645, "dur": 1, "args": { "External id": 277621, "device": 5, "context": 1, "stream": 7, "correlation": 277621, "bytes": 120, "memory bandwidth (GB/s)": 0.08152173913043478 } }, { "ph": "f", "id": 277621, "pid": 5, "tid": 7, "ts": 1716454225888645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225835138, "dur": 8, "args": { "External id": 277621, "cbid": 51, "correlation": 277621 } }, { "ph": "s", "id": 277621, "pid": 76337, "tid": -914061504, "ts": 1716454225835138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225888649, "dur": 107, "args": { "External id": 277622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277622, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 277622, "pid": 5, "tid": 7, "ts": 1716454225888649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835148, "dur": 12, "args": { "External id": 277622, "cbid": 211, "correlation": 277622 } }, { "ph": "s", "id": 277622, "pid": 76337, "tid": -914061504, "ts": 1716454225835148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888757, "dur": 5, "args": { "External id": 277630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277630, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277630, "pid": 5, "tid": 7, "ts": 1716454225888757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835191, "dur": 10, "args": { "External id": 277630, "cbid": 211, "correlation": 277630 } }, { "ph": "s", "id": 277630, "pid": 76337, "tid": -914061504, "ts": 1716454225835191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888764, "dur": 9, "args": { "External id": 277641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277641, "pid": 5, "tid": 7, "ts": 1716454225888764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835259, "dur": 12, "args": { "External id": 277641, "cbid": 211, "correlation": 277641 } }, { "ph": "s", "id": 277641, "pid": 76337, "tid": -914061504, "ts": 1716454225835259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225835324, "dur": 0, "args": { "External id": 277653, "cbid": 317, "correlation": 277653 } }, { "ph": "f", "id": 277653, "pid": 76337, "tid": -914061504, "ts": 1716454225835324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225835325, "dur": 0, "args": { "External id": 277654, "cbid": 203, "correlation": 277654 } }, { "ph": "f", "id": 277654, "pid": 76337, "tid": -914061504, "ts": 1716454225835325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225835325, "dur": 0, "args": { "External id": 277655, "cbid": 205, "correlation": 277655 } }, { "ph": "f", "id": 277655, "pid": 76337, "tid": -914061504, "ts": 1716454225835325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225888774, "dur": 5, "args": { "External id": 277659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277659, "pid": 5, "tid": 7, "ts": 1716454225888774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835340, "dur": 12, "args": { "External id": 277659, "cbid": 211, "correlation": 277659 } }, { "ph": "s", "id": 277659, "pid": 76337, "tid": -914061504, "ts": 1716454225835340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225888781, "dur": 36, "args": { "External id": 277661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277661, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 277661, "pid": 5, "tid": 7, "ts": 1716454225888781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835359, "dur": 7, "args": { "External id": 277661, "cbid": 211, "correlation": 277661 } }, { "ph": "s", "id": 277661, "pid": 76337, "tid": -914061504, "ts": 1716454225835359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225888818, "dur": 5, "args": { "External id": 277663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277663, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277663, "pid": 5, "tid": 7, "ts": 1716454225888818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835370, "dur": 5, "args": { "External id": 277663, "cbid": 211, "correlation": 277663 } }, { "ph": "s", "id": 277663, "pid": 76337, "tid": -914061504, "ts": 1716454225835370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888825, "dur": 7, "args": { "External id": 277669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277669, "pid": 5, "tid": 7, "ts": 1716454225888825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835397, "dur": 9, "args": { "External id": 277669, "cbid": 211, "correlation": 277669 } }, { "ph": "s", "id": 277669, "pid": 76337, "tid": -914061504, "ts": 1716454225835397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888833, "dur": 5, "args": { "External id": 277677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277677, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277677, "pid": 5, "tid": 7, "ts": 1716454225888833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835429, "dur": 8, "args": { "External id": 277677, "cbid": 211, "correlation": 277677 } }, { "ph": "s", "id": 277677, "pid": 76337, "tid": -914061504, "ts": 1716454225835429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225888840, "dur": 10, "args": { "External id": 277697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277697, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277697, "pid": 5, "tid": 7, "ts": 1716454225888840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835500, "dur": 13, "args": { "External id": 277697, "cbid": 211, "correlation": 277697 } }, { "ph": "s", "id": 277697, "pid": 76337, "tid": -914061504, "ts": 1716454225835500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225888851, "dur": 4, "args": { "External id": 277709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277709, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 277709, "pid": 5, "tid": 7, "ts": 1716454225888851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835522, "dur": 6, "args": { "External id": 277709, "cbid": 211, "correlation": 277709 } }, { "ph": "s", "id": 277709, "pid": 76337, "tid": -914061504, "ts": 1716454225835522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225888857, "dur": 8, "args": { "External id": 277712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277712, "pid": 5, "tid": 7, "ts": 1716454225888857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835540, "dur": 7, "args": { "External id": 277712, "cbid": 211, "correlation": 277712 } }, { "ph": "s", "id": 277712, "pid": 76337, "tid": -914061504, "ts": 1716454225835540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225888867, "dur": 5, "args": { "External id": 277721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277721, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277721, "pid": 5, "tid": 7, "ts": 1716454225888867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835579, "dur": 10, "args": { "External id": 277721, "cbid": 211, "correlation": 277721 } }, { "ph": "s", "id": 277721, "pid": 76337, "tid": -914061504, "ts": 1716454225835579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225835632, "dur": 0, "args": { "External id": 277731, "cbid": 317, "correlation": 277731 } }, { "ph": "f", "id": 277731, "pid": 76337, "tid": -914061504, "ts": 1716454225835632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225835633, "dur": 0, "args": { "External id": 277732, "cbid": 203, "correlation": 277732 } }, { "ph": "f", "id": 277732, "pid": 76337, "tid": -914061504, "ts": 1716454225835633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225835634, "dur": 0, "args": { "External id": 277733, "cbid": 205, "correlation": 277733 } }, { "ph": "f", "id": 277733, "pid": 76337, "tid": -914061504, "ts": 1716454225835634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225888873, "dur": 5, "args": { "External id": 277737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277737, "pid": 5, "tid": 7, "ts": 1716454225888873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835647, "dur": 12, "args": { "External id": 277737, "cbid": 211, "correlation": 277737 } }, { "ph": "s", "id": 277737, "pid": 76337, "tid": -914061504, "ts": 1716454225835647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225888879, "dur": 154, "args": { "External id": 277739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277739, "pid": 5, "tid": 7, "ts": 1716454225888879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835661, "dur": 5, "args": { "External id": 277739, "cbid": 211, "correlation": 277739 } }, { "ph": "s", "id": 277739, "pid": 76337, "tid": -914061504, "ts": 1716454225835661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225889035, "dur": 1, "args": { "External id": 277741, "device": 5, "context": 1, "stream": 7, "correlation": 277741, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 277741, "pid": 5, "tid": 7, "ts": 1716454225889035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225835672, "dur": 6, "args": { "External id": 277741, "cbid": 51, "correlation": 277741 } }, { "ph": "s", "id": 277741, "pid": 76337, "tid": -914061504, "ts": 1716454225835672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225889039, "dur": 260, "args": { "External id": 277742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277742, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277742, "pid": 5, "tid": 7, "ts": 1716454225889039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835679, "dur": 6, "args": { "External id": 277742, "cbid": 211, "correlation": 277742 } }, { "ph": "s", "id": 277742, "pid": 76337, "tid": -914061504, "ts": 1716454225835679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889300, "dur": 5, "args": { "External id": 277744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277744, "pid": 5, "tid": 7, "ts": 1716454225889300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835689, "dur": 5, "args": { "External id": 277744, "cbid": 211, "correlation": 277744 } }, { "ph": "s", "id": 277744, "pid": 76337, "tid": -914061504, "ts": 1716454225835689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225889307, "dur": 6, "args": { "External id": 277750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277750, "pid": 5, "tid": 7, "ts": 1716454225889307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835717, "dur": 8, "args": { "External id": 277750, "cbid": 211, "correlation": 277750 } }, { "ph": "s", "id": 277750, "pid": 76337, "tid": -914061504, "ts": 1716454225835717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225889314, "dur": 4, "args": { "External id": 277758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 277758, "pid": 5, "tid": 7, "ts": 1716454225889314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835762, "dur": 10, "args": { "External id": 277758, "cbid": 211, "correlation": 277758 } }, { "ph": "s", "id": 277758, "pid": 76337, "tid": -914061504, "ts": 1716454225835762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225835829, "dur": 1, "args": { "External id": 277774, "cbid": 251, "correlation": 277774 } }, { "ph": "f", "id": 277774, "pid": 76337, "tid": -914061504, "ts": 1716454225835829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225835834, "dur": 0, "args": { "External id": 277776, "cbid": 251, "correlation": 277776 } }, { "ph": "f", "id": 277776, "pid": 76337, "tid": -914061504, "ts": 1716454225835834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225889319, "dur": 13, "args": { "External id": 277777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277777, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277777, "pid": 5, "tid": 7, "ts": 1716454225889319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835835, "dur": 11, "args": { "External id": 277777, "cbid": 211, "correlation": 277777 } }, { "ph": "s", "id": 277777, "pid": 76337, "tid": -914061504, "ts": 1716454225835835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225889333, "dur": 5, "args": { "External id": 277779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277779, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277779, "pid": 5, "tid": 7, "ts": 1716454225889333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835848, "dur": 5, "args": { "External id": 277779, "cbid": 211, "correlation": 277779 } }, { "ph": "s", "id": 277779, "pid": 76337, "tid": -914061504, "ts": 1716454225835848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225889339, "dur": 5, "args": { "External id": 277789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277789, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277789, "pid": 5, "tid": 7, "ts": 1716454225889339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835905, "dur": 13, "args": { "External id": 277789, "cbid": 211, "correlation": 277789 } }, { "ph": "s", "id": 277789, "pid": 76337, "tid": -914061504, "ts": 1716454225835905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225889346, "dur": 10, "args": { "External id": 277809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277809, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277809, "pid": 5, "tid": 7, "ts": 1716454225889346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225835972, "dur": 18, "args": { "External id": 277809, "cbid": 211, "correlation": 277809 } }, { "ph": "s", "id": 277809, "pid": 76337, "tid": -914061504, "ts": 1716454225835972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225889357, "dur": 4, "args": { "External id": 277821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277821, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 277821, "pid": 5, "tid": 7, "ts": 1716454225889357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836000, "dur": 6, "args": { "External id": 277821, "cbid": 211, "correlation": 277821 } }, { "ph": "s", "id": 277821, "pid": 76337, "tid": -914061504, "ts": 1716454225836000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225889362, "dur": 6, "args": { "External id": 277824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277824, "pid": 5, "tid": 7, "ts": 1716454225889362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836020, "dur": 7, "args": { "External id": 277824, "cbid": 211, "correlation": 277824 } }, { "ph": "s", "id": 277824, "pid": 76337, "tid": -914061504, "ts": 1716454225836020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225889369, "dur": 4, "args": { "External id": 277833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277833, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277833, "pid": 5, "tid": 7, "ts": 1716454225889369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836062, "dur": 10, "args": { "External id": 277833, "cbid": 211, "correlation": 277833 } }, { "ph": "s", "id": 277833, "pid": 76337, "tid": -914061504, "ts": 1716454225836062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225836124, "dur": 0, "args": { "External id": 277843, "cbid": 317, "correlation": 277843 } }, { "ph": "f", "id": 277843, "pid": 76337, "tid": -914061504, "ts": 1716454225836124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225836124, "dur": 0, "args": { "External id": 277844, "cbid": 203, "correlation": 277844 } }, { "ph": "f", "id": 277844, "pid": 76337, "tid": -914061504, "ts": 1716454225836124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225836125, "dur": 0, "args": { "External id": 277845, "cbid": 205, "correlation": 277845 } }, { "ph": "f", "id": 277845, "pid": 76337, "tid": -914061504, "ts": 1716454225836125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889375, "dur": 5, "args": { "External id": 277849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277849, "pid": 5, "tid": 7, "ts": 1716454225889375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836138, "dur": 12, "args": { "External id": 277849, "cbid": 211, "correlation": 277849 } }, { "ph": "s", "id": 277849, "pid": 76337, "tid": -914061504, "ts": 1716454225836138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889381, "dur": 155, "args": { "External id": 277851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277851, "pid": 5, "tid": 7, "ts": 1716454225889381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836153, "dur": 5, "args": { "External id": 277851, "cbid": 211, "correlation": 277851 } }, { "ph": "s", "id": 277851, "pid": 76337, "tid": -914061504, "ts": 1716454225836153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225889538, "dur": 1, "args": { "External id": 277853, "device": 5, "context": 1, "stream": 7, "correlation": 277853, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 277853, "pid": 5, "tid": 7, "ts": 1716454225889538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225836164, "dur": 6, "args": { "External id": 277853, "cbid": 51, "correlation": 277853 } }, { "ph": "s", "id": 277853, "pid": 76337, "tid": -914061504, "ts": 1716454225836164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225889541, "dur": 250, "args": { "External id": 277854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277854, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277854, "pid": 5, "tid": 7, "ts": 1716454225889541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836171, "dur": 6, "args": { "External id": 277854, "cbid": 211, "correlation": 277854 } }, { "ph": "s", "id": 277854, "pid": 76337, "tid": -914061504, "ts": 1716454225836171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889793, "dur": 6, "args": { "External id": 277856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277856, "pid": 5, "tid": 7, "ts": 1716454225889793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836181, "dur": 5, "args": { "External id": 277856, "cbid": 211, "correlation": 277856 } }, { "ph": "s", "id": 277856, "pid": 76337, "tid": -914061504, "ts": 1716454225836181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225889800, "dur": 6, "args": { "External id": 277862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277862, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277862, "pid": 5, "tid": 7, "ts": 1716454225889800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836209, "dur": 9, "args": { "External id": 277862, "cbid": 211, "correlation": 277862 } }, { "ph": "s", "id": 277862, "pid": 76337, "tid": -914061504, "ts": 1716454225836209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225889807, "dur": 5, "args": { "External id": 277870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277870, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277870, "pid": 5, "tid": 7, "ts": 1716454225889807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836242, "dur": 8, "args": { "External id": 277870, "cbid": 211, "correlation": 277870 } }, { "ph": "s", "id": 277870, "pid": 76337, "tid": -914061504, "ts": 1716454225836242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225889813, "dur": 4, "args": { "External id": 277878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277878, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277878, "pid": 5, "tid": 7, "ts": 1716454225889813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836271, "dur": 8, "args": { "External id": 277878, "cbid": 211, "correlation": 277878 } }, { "ph": "s", "id": 277878, "pid": 76337, "tid": -914061504, "ts": 1716454225836271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225889819, "dur": 11, "args": { "External id": 277887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277887, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277887, "pid": 5, "tid": 7, "ts": 1716454225889819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836360, "dur": 13, "args": { "External id": 277887, "cbid": 211, "correlation": 277887 } }, { "ph": "s", "id": 277887, "pid": 76337, "tid": -914061504, "ts": 1716454225836360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225889831, "dur": 12, "args": { "External id": 277907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277907, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 277907, "pid": 5, "tid": 7, "ts": 1716454225889831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836425, "dur": 11, "args": { "External id": 277907, "cbid": 211, "correlation": 277907 } }, { "ph": "s", "id": 277907, "pid": 76337, "tid": -914061504, "ts": 1716454225836425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225889844, "dur": 4, "args": { "External id": 277919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277919, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277919, "pid": 5, "tid": 7, "ts": 1716454225889844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836446, "dur": 6, "args": { "External id": 277919, "cbid": 211, "correlation": 277919 } }, { "ph": "s", "id": 277919, "pid": 76337, "tid": -914061504, "ts": 1716454225836446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225889849, "dur": 9, "args": { "External id": 277922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277922, "pid": 5, "tid": 7, "ts": 1716454225889849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836465, "dur": 6, "args": { "External id": 277922, "cbid": 211, "correlation": 277922 } }, { "ph": "s", "id": 277922, "pid": 76337, "tid": -914061504, "ts": 1716454225836465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225889860, "dur": 6, "args": { "External id": 277931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277931, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277931, "pid": 5, "tid": 7, "ts": 1716454225889860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836503, "dur": 10, "args": { "External id": 277931, "cbid": 211, "correlation": 277931 } }, { "ph": "s", "id": 277931, "pid": 76337, "tid": -914061504, "ts": 1716454225836503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225836554, "dur": 0, "args": { "External id": 277941, "cbid": 317, "correlation": 277941 } }, { "ph": "f", "id": 277941, "pid": 76337, "tid": -914061504, "ts": 1716454225836554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225836555, "dur": 0, "args": { "External id": 277942, "cbid": 203, "correlation": 277942 } }, { "ph": "f", "id": 277942, "pid": 76337, "tid": -914061504, "ts": 1716454225836555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225836556, "dur": 0, "args": { "External id": 277943, "cbid": 205, "correlation": 277943 } }, { "ph": "f", "id": 277943, "pid": 76337, "tid": -914061504, "ts": 1716454225836556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889867, "dur": 6, "args": { "External id": 277947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277947, "pid": 5, "tid": 7, "ts": 1716454225889867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836571, "dur": 11, "args": { "External id": 277947, "cbid": 211, "correlation": 277947 } }, { "ph": "s", "id": 277947, "pid": 76337, "tid": -914061504, "ts": 1716454225836571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225889875, "dur": 306, "args": { "External id": 277949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277949, "pid": 5, "tid": 7, "ts": 1716454225889875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836585, "dur": 5, "args": { "External id": 277949, "cbid": 211, "correlation": 277949 } }, { "ph": "s", "id": 277949, "pid": 76337, "tid": -914061504, "ts": 1716454225836585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225890183, "dur": 1, "args": { "External id": 277951, "device": 5, "context": 1, "stream": 7, "correlation": 277951, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 277951, "pid": 5, "tid": 7, "ts": 1716454225890183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225836597, "dur": 8, "args": { "External id": 277951, "cbid": 51, "correlation": 277951 } }, { "ph": "s", "id": 277951, "pid": 76337, "tid": -914061504, "ts": 1716454225836597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225890186, "dur": 479, "args": { "External id": 277952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277952, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277952, "pid": 5, "tid": 7, "ts": 1716454225890186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836606, "dur": 7, "args": { "External id": 277952, "cbid": 211, "correlation": 277952 } }, { "ph": "s", "id": 277952, "pid": 76337, "tid": -914061504, "ts": 1716454225836606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225890667, "dur": 6, "args": { "External id": 277954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 277954, "pid": 5, "tid": 7, "ts": 1716454225890667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836617, "dur": 5, "args": { "External id": 277954, "cbid": 211, "correlation": 277954 } }, { "ph": "s", "id": 277954, "pid": 76337, "tid": -914061504, "ts": 1716454225836617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225890674, "dur": 6, "args": { "External id": 277960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277960, "pid": 5, "tid": 7, "ts": 1716454225890674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836649, "dur": 10, "args": { "External id": 277960, "cbid": 211, "correlation": 277960 } }, { "ph": "s", "id": 277960, "pid": 76337, "tid": -914061504, "ts": 1716454225836649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225890681, "dur": 3, "args": { "External id": 277968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277968, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 277968, "pid": 5, "tid": 7, "ts": 1716454225890681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836694, "dur": 10, "args": { "External id": 277968, "cbid": 211, "correlation": 277968 } }, { "ph": "s", "id": 277968, "pid": 76337, "tid": -914061504, "ts": 1716454225836694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225836762, "dur": 1, "args": { "External id": 277984, "cbid": 251, "correlation": 277984 } }, { "ph": "f", "id": 277984, "pid": 76337, "tid": -914061504, "ts": 1716454225836762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225836767, "dur": 0, "args": { "External id": 277986, "cbid": 251, "correlation": 277986 } }, { "ph": "f", "id": 277986, "pid": 76337, "tid": -914061504, "ts": 1716454225836767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225890686, "dur": 10, "args": { "External id": 277987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277987, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277987, "pid": 5, "tid": 7, "ts": 1716454225890686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836769, "dur": 11, "args": { "External id": 277987, "cbid": 211, "correlation": 277987 } }, { "ph": "s", "id": 277987, "pid": 76337, "tid": -914061504, "ts": 1716454225836769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225890697, "dur": 4, "args": { "External id": 277989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277989, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 277989, "pid": 5, "tid": 7, "ts": 1716454225890697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836782, "dur": 5, "args": { "External id": 277989, "cbid": 211, "correlation": 277989 } }, { "ph": "s", "id": 277989, "pid": 76337, "tid": -914061504, "ts": 1716454225836782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225890703, "dur": 6, "args": { "External id": 277999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 277999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 277999, "pid": 5, "tid": 7, "ts": 1716454225890703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836839, "dur": 12, "args": { "External id": 277999, "cbid": 211, "correlation": 277999 } }, { "ph": "s", "id": 277999, "pid": 76337, "tid": -914061504, "ts": 1716454225836839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225890710, "dur": 9, "args": { "External id": 278019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278019, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278019, "pid": 5, "tid": 7, "ts": 1716454225890710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836905, "dur": 12, "args": { "External id": 278019, "cbid": 211, "correlation": 278019 } }, { "ph": "s", "id": 278019, "pid": 76337, "tid": -914061504, "ts": 1716454225836905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225890720, "dur": 4, "args": { "External id": 278031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278031, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 278031, "pid": 5, "tid": 7, "ts": 1716454225890720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836926, "dur": 6, "args": { "External id": 278031, "cbid": 211, "correlation": 278031 } }, { "ph": "s", "id": 278031, "pid": 76337, "tid": -914061504, "ts": 1716454225836926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225890725, "dur": 6, "args": { "External id": 278034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278034, "pid": 5, "tid": 7, "ts": 1716454225890725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836945, "dur": 6, "args": { "External id": 278034, "cbid": 211, "correlation": 278034 } }, { "ph": "s", "id": 278034, "pid": 76337, "tid": -914061504, "ts": 1716454225836945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225890733, "dur": 4, "args": { "External id": 278043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278043, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278043, "pid": 5, "tid": 7, "ts": 1716454225890733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225836993, "dur": 10, "args": { "External id": 278043, "cbid": 211, "correlation": 278043 } }, { "ph": "s", "id": 278043, "pid": 76337, "tid": -914061504, "ts": 1716454225836993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225837057, "dur": 0, "args": { "External id": 278053, "cbid": 317, "correlation": 278053 } }, { "ph": "f", "id": 278053, "pid": 76337, "tid": -914061504, "ts": 1716454225837057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225837058, "dur": 0, "args": { "External id": 278054, "cbid": 203, "correlation": 278054 } }, { "ph": "f", "id": 278054, "pid": 76337, "tid": -914061504, "ts": 1716454225837058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225837058, "dur": 0, "args": { "External id": 278055, "cbid": 205, "correlation": 278055 } }, { "ph": "f", "id": 278055, "pid": 76337, "tid": -914061504, "ts": 1716454225837058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225890738, "dur": 5, "args": { "External id": 278059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278059, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278059, "pid": 5, "tid": 7, "ts": 1716454225890738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837073, "dur": 11, "args": { "External id": 278059, "cbid": 211, "correlation": 278059 } }, { "ph": "s", "id": 278059, "pid": 76337, "tid": -914061504, "ts": 1716454225837073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225890744, "dur": 155, "args": { "External id": 278061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278061, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278061, "pid": 5, "tid": 7, "ts": 1716454225890744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837087, "dur": 5, "args": { "External id": 278061, "cbid": 211, "correlation": 278061 } }, { "ph": "s", "id": 278061, "pid": 76337, "tid": -914061504, "ts": 1716454225837087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225890901, "dur": 1, "args": { "External id": 278063, "device": 5, "context": 1, "stream": 7, "correlation": 278063, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 278063, "pid": 5, "tid": 7, "ts": 1716454225890901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225837098, "dur": 6, "args": { "External id": 278063, "cbid": 51, "correlation": 278063 } }, { "ph": "s", "id": 278063, "pid": 76337, "tid": -914061504, "ts": 1716454225837098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225890905, "dur": 250, "args": { "External id": 278064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278064, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278064, "pid": 5, "tid": 7, "ts": 1716454225890905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837105, "dur": 6, "args": { "External id": 278064, "cbid": 211, "correlation": 278064 } }, { "ph": "s", "id": 278064, "pid": 76337, "tid": -914061504, "ts": 1716454225837105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225891156, "dur": 5, "args": { "External id": 278066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278066, "pid": 5, "tid": 7, "ts": 1716454225891156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837116, "dur": 6, "args": { "External id": 278066, "cbid": 211, "correlation": 278066 } }, { "ph": "s", "id": 278066, "pid": 76337, "tid": -914061504, "ts": 1716454225837116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225891163, "dur": 6, "args": { "External id": 278072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278072, "pid": 5, "tid": 7, "ts": 1716454225891163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837144, "dur": 8, "args": { "External id": 278072, "cbid": 211, "correlation": 278072 } }, { "ph": "s", "id": 278072, "pid": 76337, "tid": -914061504, "ts": 1716454225837144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225837204, "dur": 0, "args": { "External id": 278082, "cbid": 317, "correlation": 278082 } }, { "ph": "f", "id": 278082, "pid": 76337, "tid": -914061504, "ts": 1716454225837204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225837205, "dur": 0, "args": { "External id": 278083, "cbid": 203, "correlation": 278083 } }, { "ph": "f", "id": 278083, "pid": 76337, "tid": -914061504, "ts": 1716454225837205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225837206, "dur": 0, "args": { "External id": 278084, "cbid": 205, "correlation": 278084 } }, { "ph": "f", "id": 278084, "pid": 76337, "tid": -914061504, "ts": 1716454225837206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225891170, "dur": 7, "args": { "External id": 278088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278088, "pid": 5, "tid": 7, "ts": 1716454225891170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837219, "dur": 12, "args": { "External id": 278088, "cbid": 211, "correlation": 278088 } }, { "ph": "s", "id": 278088, "pid": 76337, "tid": -914061504, "ts": 1716454225837219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225891179, "dur": 3, "args": { "External id": 278090, "device": 5, "context": 1, "stream": 7, "correlation": 278090, "bytes": 4800, "memory bandwidth (GB/s)": 1.5625 } }, { "ph": "f", "id": 278090, "pid": 5, "tid": 7, "ts": 1716454225891179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225837237, "dur": 14, "args": { "External id": 278090, "cbid": 51, "correlation": 278090 } }, { "ph": "s", "id": 278090, "pid": 76337, "tid": -914061504, "ts": 1716454225837237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225891182, "dur": 96, "args": { "External id": 278091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278091, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 278091, "pid": 5, "tid": 7, "ts": 1716454225891182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837253, "dur": 6, "args": { "External id": 278091, "cbid": 211, "correlation": 278091 } }, { "ph": "s", "id": 278091, "pid": 76337, "tid": -914061504, "ts": 1716454225837253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225891280, "dur": 5, "args": { "External id": 278093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278093, "pid": 5, "tid": 7, "ts": 1716454225891280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837263, "dur": 6, "args": { "External id": 278093, "cbid": 211, "correlation": 278093 } }, { "ph": "s", "id": 278093, "pid": 76337, "tid": -914061504, "ts": 1716454225837263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225891286, "dur": 6, "args": { "External id": 278099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278099, "pid": 5, "tid": 7, "ts": 1716454225891286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837290, "dur": 8, "args": { "External id": 278099, "cbid": 211, "correlation": 278099 } }, { "ph": "s", "id": 278099, "pid": 76337, "tid": -914061504, "ts": 1716454225837290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225891293, "dur": 5, "args": { "External id": 278107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278107, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278107, "pid": 5, "tid": 7, "ts": 1716454225891293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837320, "dur": 8, "args": { "External id": 278107, "cbid": 211, "correlation": 278107 } }, { "ph": "s", "id": 278107, "pid": 76337, "tid": -914061504, "ts": 1716454225837320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225891300, "dur": 4, "args": { "External id": 278115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278115, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278115, "pid": 5, "tid": 7, "ts": 1716454225891300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837348, "dur": 8, "args": { "External id": 278115, "cbid": 211, "correlation": 278115 } }, { "ph": "s", "id": 278115, "pid": 76337, "tid": -914061504, "ts": 1716454225837348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225891305, "dur": 11, "args": { "External id": 278124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278124, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278124, "pid": 5, "tid": 7, "ts": 1716454225891305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837393, "dur": 10, "args": { "External id": 278124, "cbid": 211, "correlation": 278124 } }, { "ph": "s", "id": 278124, "pid": 76337, "tid": -914061504, "ts": 1716454225837393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225891317, "dur": 11, "args": { "External id": 278144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278144, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278144, "pid": 5, "tid": 7, "ts": 1716454225891317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837461, "dur": 12, "args": { "External id": 278144, "cbid": 211, "correlation": 278144 } }, { "ph": "s", "id": 278144, "pid": 76337, "tid": -914061504, "ts": 1716454225837461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225891330, "dur": 4, "args": { "External id": 278156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278156, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278156, "pid": 5, "tid": 7, "ts": 1716454225891330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837483, "dur": 6, "args": { "External id": 278156, "cbid": 211, "correlation": 278156 } }, { "ph": "s", "id": 278156, "pid": 76337, "tid": -914061504, "ts": 1716454225837483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225891335, "dur": 11, "args": { "External id": 278159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278159, "pid": 5, "tid": 7, "ts": 1716454225891335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837501, "dur": 6, "args": { "External id": 278159, "cbid": 211, "correlation": 278159 } }, { "ph": "s", "id": 278159, "pid": 76337, "tid": -914061504, "ts": 1716454225837501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225891347, "dur": 6, "args": { "External id": 278168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278168, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278168, "pid": 5, "tid": 7, "ts": 1716454225891347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837539, "dur": 9, "args": { "External id": 278168, "cbid": 211, "correlation": 278168 } }, { "ph": "s", "id": 278168, "pid": 76337, "tid": -914061504, "ts": 1716454225837539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225837590, "dur": 0, "args": { "External id": 278178, "cbid": 317, "correlation": 278178 } }, { "ph": "f", "id": 278178, "pid": 76337, "tid": -914061504, "ts": 1716454225837590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225837590, "dur": 0, "args": { "External id": 278179, "cbid": 203, "correlation": 278179 } }, { "ph": "f", "id": 278179, "pid": 76337, "tid": -914061504, "ts": 1716454225837590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225837591, "dur": 0, "args": { "External id": 278180, "cbid": 205, "correlation": 278180 } }, { "ph": "f", "id": 278180, "pid": 76337, "tid": -914061504, "ts": 1716454225837591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225891354, "dur": 6, "args": { "External id": 278184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278184, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278184, "pid": 5, "tid": 7, "ts": 1716454225891354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837605, "dur": 11, "args": { "External id": 278184, "cbid": 211, "correlation": 278184 } }, { "ph": "s", "id": 278184, "pid": 76337, "tid": -914061504, "ts": 1716454225837605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225891362, "dur": 306, "args": { "External id": 278186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278186, "pid": 5, "tid": 7, "ts": 1716454225891362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837619, "dur": 6, "args": { "External id": 278186, "cbid": 211, "correlation": 278186 } }, { "ph": "s", "id": 278186, "pid": 76337, "tid": -914061504, "ts": 1716454225837619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225891670, "dur": 1, "args": { "External id": 278188, "device": 5, "context": 1, "stream": 7, "correlation": 278188, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 278188, "pid": 5, "tid": 7, "ts": 1716454225891670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225837630, "dur": 6, "args": { "External id": 278188, "cbid": 51, "correlation": 278188 } }, { "ph": "s", "id": 278188, "pid": 76337, "tid": -914061504, "ts": 1716454225837630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225891674, "dur": 480, "args": { "External id": 278189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278189, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278189, "pid": 5, "tid": 7, "ts": 1716454225891674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837637, "dur": 6, "args": { "External id": 278189, "cbid": 211, "correlation": 278189 } }, { "ph": "s", "id": 278189, "pid": 76337, "tid": -914061504, "ts": 1716454225837637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892156, "dur": 6, "args": { "External id": 278191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278191, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278191, "pid": 5, "tid": 7, "ts": 1716454225892156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837648, "dur": 5, "args": { "External id": 278191, "cbid": 211, "correlation": 278191 } }, { "ph": "s", "id": 278191, "pid": 76337, "tid": -914061504, "ts": 1716454225837648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892163, "dur": 6, "args": { "External id": 278197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278197, "pid": 5, "tid": 7, "ts": 1716454225892163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837677, "dur": 8, "args": { "External id": 278197, "cbid": 211, "correlation": 278197 } }, { "ph": "s", "id": 278197, "pid": 76337, "tid": -914061504, "ts": 1716454225837677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225892170, "dur": 4, "args": { "External id": 278205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278205, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 278205, "pid": 5, "tid": 7, "ts": 1716454225892170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837719, "dur": 10, "args": { "External id": 278205, "cbid": 211, "correlation": 278205 } }, { "ph": "s", "id": 278205, "pid": 76337, "tid": -914061504, "ts": 1716454225837719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225837782, "dur": 1, "args": { "External id": 278221, "cbid": 251, "correlation": 278221 } }, { "ph": "f", "id": 278221, "pid": 76337, "tid": -914061504, "ts": 1716454225837782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225837787, "dur": 0, "args": { "External id": 278223, "cbid": 251, "correlation": 278223 } }, { "ph": "f", "id": 278223, "pid": 76337, "tid": -914061504, "ts": 1716454225837787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225892175, "dur": 12, "args": { "External id": 278224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278224, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278224, "pid": 5, "tid": 7, "ts": 1716454225892175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837789, "dur": 11, "args": { "External id": 278224, "cbid": 211, "correlation": 278224 } }, { "ph": "s", "id": 278224, "pid": 76337, "tid": -914061504, "ts": 1716454225837789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225892188, "dur": 5, "args": { "External id": 278226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278226, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278226, "pid": 5, "tid": 7, "ts": 1716454225892188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837802, "dur": 5, "args": { "External id": 278226, "cbid": 211, "correlation": 278226 } }, { "ph": "s", "id": 278226, "pid": 76337, "tid": -914061504, "ts": 1716454225837802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892194, "dur": 6, "args": { "External id": 278236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278236, "pid": 5, "tid": 7, "ts": 1716454225892194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837858, "dur": 12, "args": { "External id": 278236, "cbid": 211, "correlation": 278236 } }, { "ph": "s", "id": 278236, "pid": 76337, "tid": -914061504, "ts": 1716454225837858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225892202, "dur": 9, "args": { "External id": 278256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278256, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278256, "pid": 5, "tid": 7, "ts": 1716454225892202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837924, "dur": 11, "args": { "External id": 278256, "cbid": 211, "correlation": 278256 } }, { "ph": "s", "id": 278256, "pid": 76337, "tid": -914061504, "ts": 1716454225837924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225892212, "dur": 4, "args": { "External id": 278268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278268, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 278268, "pid": 5, "tid": 7, "ts": 1716454225892212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837945, "dur": 6, "args": { "External id": 278268, "cbid": 211, "correlation": 278268 } }, { "ph": "s", "id": 278268, "pid": 76337, "tid": -914061504, "ts": 1716454225837945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892217, "dur": 7, "args": { "External id": 278271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278271, "pid": 5, "tid": 7, "ts": 1716454225892217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225837963, "dur": 6, "args": { "External id": 278271, "cbid": 211, "correlation": 278271 } }, { "ph": "s", "id": 278271, "pid": 76337, "tid": -914061504, "ts": 1716454225837963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225892225, "dur": 4, "args": { "External id": 278280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278280, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278280, "pid": 5, "tid": 7, "ts": 1716454225892225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838014, "dur": 10, "args": { "External id": 278280, "cbid": 211, "correlation": 278280 } }, { "ph": "s", "id": 278280, "pid": 76337, "tid": -914061504, "ts": 1716454225838014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225838078, "dur": 0, "args": { "External id": 278290, "cbid": 317, "correlation": 278290 } }, { "ph": "f", "id": 278290, "pid": 76337, "tid": -914061504, "ts": 1716454225838078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225838079, "dur": 0, "args": { "External id": 278291, "cbid": 203, "correlation": 278291 } }, { "ph": "f", "id": 278291, "pid": 76337, "tid": -914061504, "ts": 1716454225838079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225838080, "dur": 0, "args": { "External id": 278292, "cbid": 205, "correlation": 278292 } }, { "ph": "f", "id": 278292, "pid": 76337, "tid": -914061504, "ts": 1716454225838080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892230, "dur": 5, "args": { "External id": 278296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278296, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278296, "pid": 5, "tid": 7, "ts": 1716454225892230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838093, "dur": 12, "args": { "External id": 278296, "cbid": 211, "correlation": 278296 } }, { "ph": "s", "id": 278296, "pid": 76337, "tid": -914061504, "ts": 1716454225838093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892236, "dur": 155, "args": { "External id": 278298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278298, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278298, "pid": 5, "tid": 7, "ts": 1716454225892236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838107, "dur": 5, "args": { "External id": 278298, "cbid": 211, "correlation": 278298 } }, { "ph": "s", "id": 278298, "pid": 76337, "tid": -914061504, "ts": 1716454225838107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225892394, "dur": 1, "args": { "External id": 278300, "device": 5, "context": 1, "stream": 7, "correlation": 278300, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 278300, "pid": 5, "tid": 7, "ts": 1716454225892394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225838118, "dur": 7, "args": { "External id": 278300, "cbid": 51, "correlation": 278300 } }, { "ph": "s", "id": 278300, "pid": 76337, "tid": -914061504, "ts": 1716454225838118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225892397, "dur": 250, "args": { "External id": 278301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278301, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278301, "pid": 5, "tid": 7, "ts": 1716454225892397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838126, "dur": 6, "args": { "External id": 278301, "cbid": 211, "correlation": 278301 } }, { "ph": "s", "id": 278301, "pid": 76337, "tid": -914061504, "ts": 1716454225838126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892649, "dur": 5, "args": { "External id": 278303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278303, "pid": 5, "tid": 7, "ts": 1716454225892649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838136, "dur": 5, "args": { "External id": 278303, "cbid": 211, "correlation": 278303 } }, { "ph": "s", "id": 278303, "pid": 76337, "tid": -914061504, "ts": 1716454225838136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892655, "dur": 6, "args": { "External id": 278309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278309, "pid": 5, "tid": 7, "ts": 1716454225892655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838163, "dur": 9, "args": { "External id": 278309, "cbid": 211, "correlation": 278309 } }, { "ph": "s", "id": 278309, "pid": 76337, "tid": -914061504, "ts": 1716454225838163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225838222, "dur": 0, "args": { "External id": 278319, "cbid": 317, "correlation": 278319 } }, { "ph": "f", "id": 278319, "pid": 76337, "tid": -914061504, "ts": 1716454225838222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225838223, "dur": 0, "args": { "External id": 278320, "cbid": 203, "correlation": 278320 } }, { "ph": "f", "id": 278320, "pid": 76337, "tid": -914061504, "ts": 1716454225838223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225838224, "dur": 0, "args": { "External id": 278321, "cbid": 205, "correlation": 278321 } }, { "ph": "f", "id": 278321, "pid": 76337, "tid": -914061504, "ts": 1716454225838224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892662, "dur": 8, "args": { "External id": 278325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278325, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278325, "pid": 5, "tid": 7, "ts": 1716454225892662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838236, "dur": 12, "args": { "External id": 278325, "cbid": 211, "correlation": 278325 } }, { "ph": "s", "id": 278325, "pid": 76337, "tid": -914061504, "ts": 1716454225838236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225892671, "dur": 3, "args": { "External id": 278327, "device": 5, "context": 1, "stream": 7, "correlation": 278327, "bytes": 4800, "memory bandwidth (GB/s)": 1.5463917525773196 } }, { "ph": "f", "id": 278327, "pid": 5, "tid": 7, "ts": 1716454225892671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225838253, "dur": 9, "args": { "External id": 278327, "cbid": 51, "correlation": 278327 } }, { "ph": "s", "id": 278327, "pid": 76337, "tid": -914061504, "ts": 1716454225838253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225892675, "dur": 94, "args": { "External id": 278328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278328, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 278328, "pid": 5, "tid": 7, "ts": 1716454225892675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838263, "dur": 6, "args": { "External id": 278328, "cbid": 211, "correlation": 278328 } }, { "ph": "s", "id": 278328, "pid": 76337, "tid": -914061504, "ts": 1716454225838263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892770, "dur": 5, "args": { "External id": 278330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278330, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278330, "pid": 5, "tid": 7, "ts": 1716454225892770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838272, "dur": 6, "args": { "External id": 278330, "cbid": 211, "correlation": 278330 } }, { "ph": "s", "id": 278330, "pid": 76337, "tid": -914061504, "ts": 1716454225838272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892777, "dur": 6, "args": { "External id": 278336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278336, "pid": 5, "tid": 7, "ts": 1716454225892777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838300, "dur": 8, "args": { "External id": 278336, "cbid": 211, "correlation": 278336 } }, { "ph": "s", "id": 278336, "pid": 76337, "tid": -914061504, "ts": 1716454225838300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225892784, "dur": 5, "args": { "External id": 278344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278344, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278344, "pid": 5, "tid": 7, "ts": 1716454225892784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838330, "dur": 8, "args": { "External id": 278344, "cbid": 211, "correlation": 278344 } }, { "ph": "s", "id": 278344, "pid": 76337, "tid": -914061504, "ts": 1716454225838330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225892791, "dur": 4, "args": { "External id": 278352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278352, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278352, "pid": 5, "tid": 7, "ts": 1716454225892791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838358, "dur": 8, "args": { "External id": 278352, "cbid": 211, "correlation": 278352 } }, { "ph": "s", "id": 278352, "pid": 76337, "tid": -914061504, "ts": 1716454225838358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225892796, "dur": 10, "args": { "External id": 278361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278361, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278361, "pid": 5, "tid": 7, "ts": 1716454225892796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838400, "dur": 10, "args": { "External id": 278361, "cbid": 211, "correlation": 278361 } }, { "ph": "s", "id": 278361, "pid": 76337, "tid": -914061504, "ts": 1716454225838400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225892808, "dur": 12, "args": { "External id": 278381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278381, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278381, "pid": 5, "tid": 7, "ts": 1716454225892808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838470, "dur": 12, "args": { "External id": 278381, "cbid": 211, "correlation": 278381 } }, { "ph": "s", "id": 278381, "pid": 76337, "tid": -914061504, "ts": 1716454225838470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225892821, "dur": 4, "args": { "External id": 278393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278393, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278393, "pid": 5, "tid": 7, "ts": 1716454225892821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838492, "dur": 6, "args": { "External id": 278393, "cbid": 211, "correlation": 278393 } }, { "ph": "s", "id": 278393, "pid": 76337, "tid": -914061504, "ts": 1716454225838492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225892826, "dur": 10, "args": { "External id": 278396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278396, "pid": 5, "tid": 7, "ts": 1716454225892826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838509, "dur": 7, "args": { "External id": 278396, "cbid": 211, "correlation": 278396 } }, { "ph": "s", "id": 278396, "pid": 76337, "tid": -914061504, "ts": 1716454225838509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225892838, "dur": 6, "args": { "External id": 278405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278405, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278405, "pid": 5, "tid": 7, "ts": 1716454225892838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838548, "dur": 9, "args": { "External id": 278405, "cbid": 211, "correlation": 278405 } }, { "ph": "s", "id": 278405, "pid": 76337, "tid": -914061504, "ts": 1716454225838548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225838599, "dur": 0, "args": { "External id": 278415, "cbid": 317, "correlation": 278415 } }, { "ph": "f", "id": 278415, "pid": 76337, "tid": -914061504, "ts": 1716454225838599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225838600, "dur": 0, "args": { "External id": 278416, "cbid": 203, "correlation": 278416 } }, { "ph": "f", "id": 278416, "pid": 76337, "tid": -914061504, "ts": 1716454225838600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225838601, "dur": 0, "args": { "External id": 278417, "cbid": 205, "correlation": 278417 } }, { "ph": "f", "id": 278417, "pid": 76337, "tid": -914061504, "ts": 1716454225838601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892845, "dur": 6, "args": { "External id": 278421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278421, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278421, "pid": 5, "tid": 7, "ts": 1716454225892845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838614, "dur": 11, "args": { "External id": 278421, "cbid": 211, "correlation": 278421 } }, { "ph": "s", "id": 278421, "pid": 76337, "tid": -914061504, "ts": 1716454225838614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225892852, "dur": 307, "args": { "External id": 278423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278423, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278423, "pid": 5, "tid": 7, "ts": 1716454225892852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838628, "dur": 6, "args": { "External id": 278423, "cbid": 211, "correlation": 278423 } }, { "ph": "s", "id": 278423, "pid": 76337, "tid": -914061504, "ts": 1716454225838628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225893161, "dur": 1, "args": { "External id": 278425, "device": 5, "context": 1, "stream": 7, "correlation": 278425, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 278425, "pid": 5, "tid": 7, "ts": 1716454225893161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225838639, "dur": 6, "args": { "External id": 278425, "cbid": 51, "correlation": 278425 } }, { "ph": "s", "id": 278425, "pid": 76337, "tid": -914061504, "ts": 1716454225838639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225893165, "dur": 480, "args": { "External id": 278426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278426, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278426, "pid": 5, "tid": 7, "ts": 1716454225893165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838646, "dur": 6, "args": { "External id": 278426, "cbid": 211, "correlation": 278426 } }, { "ph": "s", "id": 278426, "pid": 76337, "tid": -914061504, "ts": 1716454225838646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225893646, "dur": 5, "args": { "External id": 278428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278428, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278428, "pid": 5, "tid": 7, "ts": 1716454225893646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838655, "dur": 5, "args": { "External id": 278428, "cbid": 211, "correlation": 278428 } }, { "ph": "s", "id": 278428, "pid": 76337, "tid": -914061504, "ts": 1716454225838655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225893653, "dur": 6, "args": { "External id": 278434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278434, "pid": 5, "tid": 7, "ts": 1716454225893653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838684, "dur": 9, "args": { "External id": 278434, "cbid": 211, "correlation": 278434 } }, { "ph": "s", "id": 278434, "pid": 76337, "tid": -914061504, "ts": 1716454225838684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225893660, "dur": 4, "args": { "External id": 278442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278442, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 278442, "pid": 5, "tid": 7, "ts": 1716454225893660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838726, "dur": 10, "args": { "External id": 278442, "cbid": 211, "correlation": 278442 } }, { "ph": "s", "id": 278442, "pid": 76337, "tid": -914061504, "ts": 1716454225838726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225838788, "dur": 1, "args": { "External id": 278458, "cbid": 251, "correlation": 278458 } }, { "ph": "f", "id": 278458, "pid": 76337, "tid": -914061504, "ts": 1716454225838788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225838793, "dur": 0, "args": { "External id": 278460, "cbid": 251, "correlation": 278460 } }, { "ph": "f", "id": 278460, "pid": 76337, "tid": -914061504, "ts": 1716454225838793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225893665, "dur": 12, "args": { "External id": 278461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278461, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278461, "pid": 5, "tid": 7, "ts": 1716454225893665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838795, "dur": 11, "args": { "External id": 278461, "cbid": 211, "correlation": 278461 } }, { "ph": "s", "id": 278461, "pid": 76337, "tid": -914061504, "ts": 1716454225838795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225893679, "dur": 5, "args": { "External id": 278463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278463, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278463, "pid": 5, "tid": 7, "ts": 1716454225893679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838808, "dur": 5, "args": { "External id": 278463, "cbid": 211, "correlation": 278463 } }, { "ph": "s", "id": 278463, "pid": 76337, "tid": -914061504, "ts": 1716454225838808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225893685, "dur": 6, "args": { "External id": 278473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278473, "pid": 5, "tid": 7, "ts": 1716454225893685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838864, "dur": 11, "args": { "External id": 278473, "cbid": 211, "correlation": 278473 } }, { "ph": "s", "id": 278473, "pid": 76337, "tid": -914061504, "ts": 1716454225838864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225893692, "dur": 9, "args": { "External id": 278493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278493, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278493, "pid": 5, "tid": 7, "ts": 1716454225893692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838929, "dur": 11, "args": { "External id": 278493, "cbid": 211, "correlation": 278493 } }, { "ph": "s", "id": 278493, "pid": 76337, "tid": -914061504, "ts": 1716454225838929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225893702, "dur": 4, "args": { "External id": 278505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278505, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 278505, "pid": 5, "tid": 7, "ts": 1716454225893702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838950, "dur": 6, "args": { "External id": 278505, "cbid": 211, "correlation": 278505 } }, { "ph": "s", "id": 278505, "pid": 76337, "tid": -914061504, "ts": 1716454225838950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225893707, "dur": 6, "args": { "External id": 278508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278508, "pid": 5, "tid": 7, "ts": 1716454225893707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225838969, "dur": 14, "args": { "External id": 278508, "cbid": 211, "correlation": 278508 } }, { "ph": "s", "id": 278508, "pid": 76337, "tid": -914061504, "ts": 1716454225838969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225893715, "dur": 4, "args": { "External id": 278517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278517, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278517, "pid": 5, "tid": 7, "ts": 1716454225893715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839018, "dur": 10, "args": { "External id": 278517, "cbid": 211, "correlation": 278517 } }, { "ph": "s", "id": 278517, "pid": 76337, "tid": -914061504, "ts": 1716454225839018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225839082, "dur": 0, "args": { "External id": 278527, "cbid": 317, "correlation": 278527 } }, { "ph": "f", "id": 278527, "pid": 76337, "tid": -914061504, "ts": 1716454225839082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225839083, "dur": 0, "args": { "External id": 278528, "cbid": 203, "correlation": 278528 } }, { "ph": "f", "id": 278528, "pid": 76337, "tid": -914061504, "ts": 1716454225839083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225839084, "dur": 0, "args": { "External id": 278529, "cbid": 205, "correlation": 278529 } }, { "ph": "f", "id": 278529, "pid": 76337, "tid": -914061504, "ts": 1716454225839084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225893721, "dur": 5, "args": { "External id": 278533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278533, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278533, "pid": 5, "tid": 7, "ts": 1716454225893721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839098, "dur": 12, "args": { "External id": 278533, "cbid": 211, "correlation": 278533 } }, { "ph": "s", "id": 278533, "pid": 76337, "tid": -914061504, "ts": 1716454225839098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225893727, "dur": 155, "args": { "External id": 278535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278535, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278535, "pid": 5, "tid": 7, "ts": 1716454225893727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839112, "dur": 5, "args": { "External id": 278535, "cbid": 211, "correlation": 278535 } }, { "ph": "s", "id": 278535, "pid": 76337, "tid": -914061504, "ts": 1716454225839112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225893884, "dur": 1, "args": { "External id": 278537, "device": 5, "context": 1, "stream": 7, "correlation": 278537, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 278537, "pid": 5, "tid": 7, "ts": 1716454225893884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225839123, "dur": 7, "args": { "External id": 278537, "cbid": 51, "correlation": 278537 } }, { "ph": "s", "id": 278537, "pid": 76337, "tid": -914061504, "ts": 1716454225839123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225893888, "dur": 250, "args": { "External id": 278538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278538, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278538, "pid": 5, "tid": 7, "ts": 1716454225893888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839131, "dur": 7, "args": { "External id": 278538, "cbid": 211, "correlation": 278538 } }, { "ph": "s", "id": 278538, "pid": 76337, "tid": -914061504, "ts": 1716454225839131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225894139, "dur": 5, "args": { "External id": 278540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278540, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278540, "pid": 5, "tid": 7, "ts": 1716454225894139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839142, "dur": 5, "args": { "External id": 278540, "cbid": 211, "correlation": 278540 } }, { "ph": "s", "id": 278540, "pid": 76337, "tid": -914061504, "ts": 1716454225839142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225894145, "dur": 6, "args": { "External id": 278546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278546, "pid": 5, "tid": 7, "ts": 1716454225894145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839170, "dur": 9, "args": { "External id": 278546, "cbid": 211, "correlation": 278546 } }, { "ph": "s", "id": 278546, "pid": 76337, "tid": -914061504, "ts": 1716454225839170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225839228, "dur": 0, "args": { "External id": 278556, "cbid": 317, "correlation": 278556 } }, { "ph": "f", "id": 278556, "pid": 76337, "tid": -914061504, "ts": 1716454225839228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225839229, "dur": 0, "args": { "External id": 278557, "cbid": 203, "correlation": 278557 } }, { "ph": "f", "id": 278557, "pid": 76337, "tid": -914061504, "ts": 1716454225839229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225839230, "dur": 0, "args": { "External id": 278558, "cbid": 205, "correlation": 278558 } }, { "ph": "f", "id": 278558, "pid": 76337, "tid": -914061504, "ts": 1716454225839230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225894153, "dur": 8, "args": { "External id": 278562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278562, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278562, "pid": 5, "tid": 7, "ts": 1716454225894153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839242, "dur": 11, "args": { "External id": 278562, "cbid": 211, "correlation": 278562 } }, { "ph": "s", "id": 278562, "pid": 76337, "tid": -914061504, "ts": 1716454225839242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225894162, "dur": 3, "args": { "External id": 278564, "device": 5, "context": 1, "stream": 7, "correlation": 278564, "bytes": 4800, "memory bandwidth (GB/s)": 1.5625 } }, { "ph": "f", "id": 278564, "pid": 5, "tid": 7, "ts": 1716454225894162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225839259, "dur": 9, "args": { "External id": 278564, "cbid": 51, "correlation": 278564 } }, { "ph": "s", "id": 278564, "pid": 76337, "tid": -914061504, "ts": 1716454225839259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225894166, "dur": 93, "args": { "External id": 278565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278565, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 278565, "pid": 5, "tid": 7, "ts": 1716454225894166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839269, "dur": 6, "args": { "External id": 278565, "cbid": 211, "correlation": 278565 } }, { "ph": "s", "id": 278565, "pid": 76337, "tid": -914061504, "ts": 1716454225839269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225894260, "dur": 6, "args": { "External id": 278567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278567, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278567, "pid": 5, "tid": 7, "ts": 1716454225894260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839278, "dur": 5, "args": { "External id": 278567, "cbid": 211, "correlation": 278567 } }, { "ph": "s", "id": 278567, "pid": 76337, "tid": -914061504, "ts": 1716454225839278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225894267, "dur": 6, "args": { "External id": 278573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278573, "pid": 5, "tid": 7, "ts": 1716454225894267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839305, "dur": 8, "args": { "External id": 278573, "cbid": 211, "correlation": 278573 } }, { "ph": "s", "id": 278573, "pid": 76337, "tid": -914061504, "ts": 1716454225839305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225894274, "dur": 5, "args": { "External id": 278581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278581, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278581, "pid": 5, "tid": 7, "ts": 1716454225894274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839334, "dur": 8, "args": { "External id": 278581, "cbid": 211, "correlation": 278581 } }, { "ph": "s", "id": 278581, "pid": 76337, "tid": -914061504, "ts": 1716454225839334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225894280, "dur": 4, "args": { "External id": 278589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278589, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 278589, "pid": 5, "tid": 7, "ts": 1716454225894280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839363, "dur": 8, "args": { "External id": 278589, "cbid": 211, "correlation": 278589 } }, { "ph": "s", "id": 278589, "pid": 76337, "tid": -914061504, "ts": 1716454225839363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225894286, "dur": 14, "args": { "External id": 278600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278600, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278600, "pid": 5, "tid": 7, "ts": 1716454225894286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839442, "dur": 13, "args": { "External id": 278600, "cbid": 211, "correlation": 278600 } }, { "ph": "s", "id": 278600, "pid": 76337, "tid": -914061504, "ts": 1716454225839442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225839499, "dur": 0, "args": { "External id": 278610, "cbid": 317, "correlation": 278610 } }, { "ph": "f", "id": 278610, "pid": 76337, "tid": -914061504, "ts": 1716454225839499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225839500, "dur": 0, "args": { "External id": 278611, "cbid": 203, "correlation": 278611 } }, { "ph": "f", "id": 278611, "pid": 76337, "tid": -914061504, "ts": 1716454225839500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225839500, "dur": 0, "args": { "External id": 278612, "cbid": 205, "correlation": 278612 } }, { "ph": "f", "id": 278612, "pid": 76337, "tid": -914061504, "ts": 1716454225839500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225894301, "dur": 8, "args": { "External id": 278616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278616, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278616, "pid": 5, "tid": 7, "ts": 1716454225894301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839513, "dur": 11, "args": { "External id": 278616, "cbid": 211, "correlation": 278616 } }, { "ph": "s", "id": 278616, "pid": 76337, "tid": -914061504, "ts": 1716454225839513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225894311, "dur": 157, "args": { "External id": 278618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278618, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278618, "pid": 5, "tid": 7, "ts": 1716454225894311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839527, "dur": 5, "args": { "External id": 278618, "cbid": 211, "correlation": 278618 } }, { "ph": "s", "id": 278618, "pid": 76337, "tid": -914061504, "ts": 1716454225839527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225894470, "dur": 1, "args": { "External id": 278620, "device": 5, "context": 1, "stream": 7, "correlation": 278620, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 278620, "pid": 5, "tid": 7, "ts": 1716454225894470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225839538, "dur": 6, "args": { "External id": 278620, "cbid": 51, "correlation": 278620 } }, { "ph": "s", "id": 278620, "pid": 76337, "tid": -914061504, "ts": 1716454225839538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225894474, "dur": 634, "args": { "External id": 278621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278621, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278621, "pid": 5, "tid": 7, "ts": 1716454225894474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839545, "dur": 6, "args": { "External id": 278621, "cbid": 211, "correlation": 278621 } }, { "ph": "s", "id": 278621, "pid": 76337, "tid": -914061504, "ts": 1716454225839545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225895109, "dur": 12, "args": { "External id": 278623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278623, "pid": 5, "tid": 7, "ts": 1716454225895109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839555, "dur": 5, "args": { "External id": 278623, "cbid": 211, "correlation": 278623 } }, { "ph": "s", "id": 278623, "pid": 76337, "tid": -914061504, "ts": 1716454225839555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225895122, "dur": 14, "args": { "External id": 278629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278629, "pid": 5, "tid": 7, "ts": 1716454225895122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839583, "dur": 9, "args": { "External id": 278629, "cbid": 211, "correlation": 278629 } }, { "ph": "s", "id": 278629, "pid": 76337, "tid": -914061504, "ts": 1716454225839583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225895138, "dur": 29, "args": { "External id": 278638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278638, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278638, "pid": 5, "tid": 7, "ts": 1716454225895138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839673, "dur": 13, "args": { "External id": 278638, "cbid": 211, "correlation": 278638 } }, { "ph": "s", "id": 278638, "pid": 76337, "tid": -914061504, "ts": 1716454225839673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225895168, "dur": 30, "args": { "External id": 278658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278658, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278658, "pid": 5, "tid": 7, "ts": 1716454225895168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839742, "dur": 11, "args": { "External id": 278658, "cbid": 211, "correlation": 278658 } }, { "ph": "s", "id": 278658, "pid": 76337, "tid": -914061504, "ts": 1716454225839742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225895199, "dur": 4, "args": { "External id": 278670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278670, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278670, "pid": 5, "tid": 7, "ts": 1716454225895199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839762, "dur": 6, "args": { "External id": 278670, "cbid": 211, "correlation": 278670 } }, { "ph": "s", "id": 278670, "pid": 76337, "tid": -914061504, "ts": 1716454225839762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225895204, "dur": 30, "args": { "External id": 278673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278673, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278673, "pid": 5, "tid": 7, "ts": 1716454225895204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839780, "dur": 6, "args": { "External id": 278673, "cbid": 211, "correlation": 278673 } }, { "ph": "s", "id": 278673, "pid": 76337, "tid": -914061504, "ts": 1716454225839780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225895235, "dur": 21, "args": { "External id": 278682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278682, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278682, "pid": 5, "tid": 7, "ts": 1716454225895235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839818, "dur": 10, "args": { "External id": 278682, "cbid": 211, "correlation": 278682 } }, { "ph": "s", "id": 278682, "pid": 76337, "tid": -914061504, "ts": 1716454225839818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225839870, "dur": 0, "args": { "External id": 278692, "cbid": 317, "correlation": 278692 } }, { "ph": "f", "id": 278692, "pid": 76337, "tid": -914061504, "ts": 1716454225839870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225839871, "dur": 0, "args": { "External id": 278693, "cbid": 203, "correlation": 278693 } }, { "ph": "f", "id": 278693, "pid": 76337, "tid": -914061504, "ts": 1716454225839871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225839872, "dur": 0, "args": { "External id": 278694, "cbid": 205, "correlation": 278694 } }, { "ph": "f", "id": 278694, "pid": 76337, "tid": -914061504, "ts": 1716454225839872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225895257, "dur": 22, "args": { "External id": 278698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278698, "pid": 5, "tid": 7, "ts": 1716454225895257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839886, "dur": 12, "args": { "External id": 278698, "cbid": 211, "correlation": 278698 } }, { "ph": "s", "id": 278698, "pid": 76337, "tid": -914061504, "ts": 1716454225839886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225895280, "dur": 304, "args": { "External id": 278700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278700, "pid": 5, "tid": 7, "ts": 1716454225895280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839901, "dur": 5, "args": { "External id": 278700, "cbid": 211, "correlation": 278700 } }, { "ph": "s", "id": 278700, "pid": 76337, "tid": -914061504, "ts": 1716454225839901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225895586, "dur": 1, "args": { "External id": 278702, "device": 5, "context": 1, "stream": 7, "correlation": 278702, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 278702, "pid": 5, "tid": 7, "ts": 1716454225895586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225839912, "dur": 6, "args": { "External id": 278702, "cbid": 51, "correlation": 278702 } }, { "ph": "s", "id": 278702, "pid": 76337, "tid": -914061504, "ts": 1716454225839912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225895590, "dur": 1216, "args": { "External id": 278703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278703, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278703, "pid": 5, "tid": 7, "ts": 1716454225895590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839919, "dur": 6, "args": { "External id": 278703, "cbid": 211, "correlation": 278703 } }, { "ph": "s", "id": 278703, "pid": 76337, "tid": -914061504, "ts": 1716454225839919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225896807, "dur": 12, "args": { "External id": 278705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278705, "pid": 5, "tid": 7, "ts": 1716454225896807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839930, "dur": 5, "args": { "External id": 278705, "cbid": 211, "correlation": 278705 } }, { "ph": "s", "id": 278705, "pid": 76337, "tid": -914061504, "ts": 1716454225839930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225896820, "dur": 14, "args": { "External id": 278711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278711, "pid": 5, "tid": 7, "ts": 1716454225896820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225839959, "dur": 8, "args": { "External id": 278711, "cbid": 211, "correlation": 278711 } }, { "ph": "s", "id": 278711, "pid": 76337, "tid": -914061504, "ts": 1716454225839959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225896836, "dur": 4, "args": { "External id": 278719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278719, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 278719, "pid": 5, "tid": 7, "ts": 1716454225896836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840011, "dur": 10, "args": { "External id": 278719, "cbid": 211, "correlation": 278719 } }, { "ph": "s", "id": 278719, "pid": 76337, "tid": -914061504, "ts": 1716454225840011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225840075, "dur": 1, "args": { "External id": 278735, "cbid": 251, "correlation": 278735 } }, { "ph": "f", "id": 278735, "pid": 76337, "tid": -914061504, "ts": 1716454225840075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225840080, "dur": 0, "args": { "External id": 278737, "cbid": 251, "correlation": 278737 } }, { "ph": "f", "id": 278737, "pid": 76337, "tid": -914061504, "ts": 1716454225840080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225896841, "dur": 12, "args": { "External id": 278738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278738, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278738, "pid": 5, "tid": 7, "ts": 1716454225896841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840082, "dur": 12, "args": { "External id": 278738, "cbid": 211, "correlation": 278738 } }, { "ph": "s", "id": 278738, "pid": 76337, "tid": -914061504, "ts": 1716454225840082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225896855, "dur": 5, "args": { "External id": 278740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278740, "pid": 5, "tid": 7, "ts": 1716454225896855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840095, "dur": 6, "args": { "External id": 278740, "cbid": 211, "correlation": 278740 } }, { "ph": "s", "id": 278740, "pid": 76337, "tid": -914061504, "ts": 1716454225840095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225896861, "dur": 17, "args": { "External id": 278750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278750, "pid": 5, "tid": 7, "ts": 1716454225896861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840152, "dur": 12, "args": { "External id": 278750, "cbid": 211, "correlation": 278750 } }, { "ph": "s", "id": 278750, "pid": 76337, "tid": -914061504, "ts": 1716454225840152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225896879, "dur": 17, "args": { "External id": 278770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278770, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278770, "pid": 5, "tid": 7, "ts": 1716454225896879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840218, "dur": 11, "args": { "External id": 278770, "cbid": 211, "correlation": 278770 } }, { "ph": "s", "id": 278770, "pid": 76337, "tid": -914061504, "ts": 1716454225840218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225896897, "dur": 5, "args": { "External id": 278782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278782, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 278782, "pid": 5, "tid": 7, "ts": 1716454225896897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840238, "dur": 6, "args": { "External id": 278782, "cbid": 211, "correlation": 278782 } }, { "ph": "s", "id": 278782, "pid": 76337, "tid": -914061504, "ts": 1716454225840238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225896903, "dur": 16, "args": { "External id": 278785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278785, "pid": 5, "tid": 7, "ts": 1716454225896903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840257, "dur": 7, "args": { "External id": 278785, "cbid": 211, "correlation": 278785 } }, { "ph": "s", "id": 278785, "pid": 76337, "tid": -914061504, "ts": 1716454225840257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225896921, "dur": 12, "args": { "External id": 278794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278794, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278794, "pid": 5, "tid": 7, "ts": 1716454225896921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840297, "dur": 9, "args": { "External id": 278794, "cbid": 211, "correlation": 278794 } }, { "ph": "s", "id": 278794, "pid": 76337, "tid": -914061504, "ts": 1716454225840297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225840359, "dur": 0, "args": { "External id": 278804, "cbid": 317, "correlation": 278804 } }, { "ph": "f", "id": 278804, "pid": 76337, "tid": -914061504, "ts": 1716454225840359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225840360, "dur": 0, "args": { "External id": 278805, "cbid": 203, "correlation": 278805 } }, { "ph": "f", "id": 278805, "pid": 76337, "tid": -914061504, "ts": 1716454225840360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225840361, "dur": 0, "args": { "External id": 278806, "cbid": 205, "correlation": 278806 } }, { "ph": "f", "id": 278806, "pid": 76337, "tid": -914061504, "ts": 1716454225840361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225896934, "dur": 11, "args": { "External id": 278810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278810, "pid": 5, "tid": 7, "ts": 1716454225896934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840375, "dur": 12, "args": { "External id": 278810, "cbid": 211, "correlation": 278810 } }, { "ph": "s", "id": 278810, "pid": 76337, "tid": -914061504, "ts": 1716454225840375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225896946, "dur": 156, "args": { "External id": 278812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278812, "pid": 5, "tid": 7, "ts": 1716454225896946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840389, "dur": 5, "args": { "External id": 278812, "cbid": 211, "correlation": 278812 } }, { "ph": "s", "id": 278812, "pid": 76337, "tid": -914061504, "ts": 1716454225840389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225897104, "dur": 1, "args": { "External id": 278814, "device": 5, "context": 1, "stream": 7, "correlation": 278814, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 278814, "pid": 5, "tid": 7, "ts": 1716454225897104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225840400, "dur": 6, "args": { "External id": 278814, "cbid": 51, "correlation": 278814 } }, { "ph": "s", "id": 278814, "pid": 76337, "tid": -914061504, "ts": 1716454225840400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225897107, "dur": 635, "args": { "External id": 278815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278815, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278815, "pid": 5, "tid": 7, "ts": 1716454225897107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840407, "dur": 6, "args": { "External id": 278815, "cbid": 211, "correlation": 278815 } }, { "ph": "s", "id": 278815, "pid": 76337, "tid": -914061504, "ts": 1716454225840407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225897744, "dur": 13, "args": { "External id": 278817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278817, "pid": 5, "tid": 7, "ts": 1716454225897744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840417, "dur": 5, "args": { "External id": 278817, "cbid": 211, "correlation": 278817 } }, { "ph": "s", "id": 278817, "pid": 76337, "tid": -914061504, "ts": 1716454225840417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225897758, "dur": 14, "args": { "External id": 278823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278823, "pid": 5, "tid": 7, "ts": 1716454225897758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840445, "dur": 9, "args": { "External id": 278823, "cbid": 211, "correlation": 278823 } }, { "ph": "s", "id": 278823, "pid": 76337, "tid": -914061504, "ts": 1716454225840445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225840505, "dur": 0, "args": { "External id": 278833, "cbid": 317, "correlation": 278833 } }, { "ph": "f", "id": 278833, "pid": 76337, "tid": -914061504, "ts": 1716454225840505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225840506, "dur": 0, "args": { "External id": 278834, "cbid": 203, "correlation": 278834 } }, { "ph": "f", "id": 278834, "pid": 76337, "tid": -914061504, "ts": 1716454225840506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225840507, "dur": 0, "args": { "External id": 278835, "cbid": 205, "correlation": 278835 } }, { "ph": "f", "id": 278835, "pid": 76337, "tid": -914061504, "ts": 1716454225840507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225897773, "dur": 21, "args": { "External id": 278839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278839, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278839, "pid": 5, "tid": 7, "ts": 1716454225897773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840520, "dur": 11, "args": { "External id": 278839, "cbid": 211, "correlation": 278839 } }, { "ph": "s", "id": 278839, "pid": 76337, "tid": -914061504, "ts": 1716454225840520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225897795, "dur": 4, "args": { "External id": 278841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 278841, "pid": 5, "tid": 7, "ts": 1716454225897795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840537, "dur": 6, "args": { "External id": 278841, "cbid": 211, "correlation": 278841 } }, { "ph": "s", "id": 278841, "pid": 76337, "tid": -914061504, "ts": 1716454225840537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225840547, "dur": 0, "args": { "External id": 278842, "cbid": 51, "correlation": 278842 } }, { "ph": "s", "id": 278842, "pid": 76337, "tid": -914061504, "ts": 1716454225840547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225897800, "dur": 166, "args": { "External id": 278843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278843, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 278843, "pid": 5, "tid": 7, "ts": 1716454225897800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840548, "dur": 6, "args": { "External id": 278843, "cbid": 211, "correlation": 278843 } }, { "ph": "s", "id": 278843, "pid": 76337, "tid": -914061504, "ts": 1716454225840548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225897968, "dur": 15, "args": { "External id": 278848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278848, "pid": 5, "tid": 7, "ts": 1716454225897968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840574, "dur": 9, "args": { "External id": 278848, "cbid": 211, "correlation": 278848 } }, { "ph": "s", "id": 278848, "pid": 76337, "tid": -914061504, "ts": 1716454225840574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225897984, "dur": 12, "args": { "External id": 278856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278856, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278856, "pid": 5, "tid": 7, "ts": 1716454225897984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840603, "dur": 7, "args": { "External id": 278856, "cbid": 211, "correlation": 278856 } }, { "ph": "s", "id": 278856, "pid": 76337, "tid": -914061504, "ts": 1716454225840603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225897997, "dur": 10, "args": { "External id": 278864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278864, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278864, "pid": 5, "tid": 7, "ts": 1716454225897997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840631, "dur": 8, "args": { "External id": 278864, "cbid": 211, "correlation": 278864 } }, { "ph": "s", "id": 278864, "pid": 76337, "tid": -914061504, "ts": 1716454225840631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225898009, "dur": 18, "args": { "External id": 278884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278884, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 278884, "pid": 5, "tid": 7, "ts": 1716454225898009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840713, "dur": 12, "args": { "External id": 278884, "cbid": 211, "correlation": 278884 } }, { "ph": "s", "id": 278884, "pid": 76337, "tid": -914061504, "ts": 1716454225840713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225898028, "dur": 4, "args": { "External id": 278896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278896, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 278896, "pid": 5, "tid": 7, "ts": 1716454225898028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840735, "dur": 6, "args": { "External id": 278896, "cbid": 211, "correlation": 278896 } }, { "ph": "s", "id": 278896, "pid": 76337, "tid": -914061504, "ts": 1716454225840735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225898033, "dur": 17, "args": { "External id": 278899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278899, "pid": 5, "tid": 7, "ts": 1716454225898033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840753, "dur": 7, "args": { "External id": 278899, "cbid": 211, "correlation": 278899 } }, { "ph": "s", "id": 278899, "pid": 76337, "tid": -914061504, "ts": 1716454225840753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225840811, "dur": 0, "args": { "External id": 278910, "cbid": 317, "correlation": 278910 } }, { "ph": "f", "id": 278910, "pid": 76337, "tid": -914061504, "ts": 1716454225840811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225840811, "dur": 0, "args": { "External id": 278911, "cbid": 203, "correlation": 278911 } }, { "ph": "f", "id": 278911, "pid": 76337, "tid": -914061504, "ts": 1716454225840811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225840812, "dur": 0, "args": { "External id": 278912, "cbid": 205, "correlation": 278912 } }, { "ph": "f", "id": 278912, "pid": 76337, "tid": -914061504, "ts": 1716454225840812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225898052, "dur": 12, "args": { "External id": 278916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278916, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278916, "pid": 5, "tid": 7, "ts": 1716454225898052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840825, "dur": 11, "args": { "External id": 278916, "cbid": 211, "correlation": 278916 } }, { "ph": "s", "id": 278916, "pid": 76337, "tid": -914061504, "ts": 1716454225840825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225898065, "dur": 3, "args": { "External id": 278918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 278918, "pid": 5, "tid": 7, "ts": 1716454225898065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840841, "dur": 6, "args": { "External id": 278918, "cbid": 211, "correlation": 278918 } }, { "ph": "s", "id": 278918, "pid": 76337, "tid": -914061504, "ts": 1716454225840841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225840850, "dur": 0, "args": { "External id": 278919, "cbid": 51, "correlation": 278919 } }, { "ph": "s", "id": 278919, "pid": 76337, "tid": -914061504, "ts": 1716454225840850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225898070, "dur": 86, "args": { "External id": 278920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278920, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 278920, "pid": 5, "tid": 7, "ts": 1716454225898070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840851, "dur": 5, "args": { "External id": 278920, "cbid": 211, "correlation": 278920 } }, { "ph": "s", "id": 278920, "pid": 76337, "tid": -914061504, "ts": 1716454225840851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225898157, "dur": 15, "args": { "External id": 278925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278925, "pid": 5, "tid": 7, "ts": 1716454225898157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840878, "dur": 8, "args": { "External id": 278925, "cbid": 211, "correlation": 278925 } }, { "ph": "s", "id": 278925, "pid": 76337, "tid": -914061504, "ts": 1716454225840878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225898173, "dur": 80, "args": { "External id": 278934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278934, "pid": 5, "tid": 7, "ts": 1716454225898173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225840959, "dur": 14, "args": { "External id": 278934, "cbid": 211, "correlation": 278934 } }, { "ph": "s", "id": 278934, "pid": 76337, "tid": -914061504, "ts": 1716454225840959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225898255, "dur": 29, "args": { "External id": 278956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278956, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 278956, "pid": 5, "tid": 7, "ts": 1716454225898255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841030, "dur": 10, "args": { "External id": 278956, "cbid": 211, "correlation": 278956 } }, { "ph": "s", "id": 278956, "pid": 76337, "tid": -914061504, "ts": 1716454225841030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841126, "dur": 2, "args": { "External id": 278967, "cbid": 251, "correlation": 278967 } }, { "ph": "f", "id": 278967, "pid": 76337, "tid": -914061504, "ts": 1716454225841126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225898285, "dur": 139, "args": { "External id": 278968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278968, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278968, "pid": 5, "tid": 7, "ts": 1716454225898285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841132, "dur": 14, "args": { "External id": 278968, "cbid": 211, "correlation": 278968 } }, { "ph": "s", "id": 278968, "pid": 76337, "tid": -914061504, "ts": 1716454225841132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841203, "dur": 1, "args": { "External id": 278979, "cbid": 251, "correlation": 278979 } }, { "ph": "f", "id": 278979, "pid": 76337, "tid": -914061504, "ts": 1716454225841203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225898425, "dur": 153, "args": { "External id": 278980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278980, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278980, "pid": 5, "tid": 7, "ts": 1716454225898425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841207, "dur": 11, "args": { "External id": 278980, "cbid": 211, "correlation": 278980 } }, { "ph": "s", "id": 278980, "pid": 76337, "tid": -914061504, "ts": 1716454225841207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841272, "dur": 1, "args": { "External id": 278991, "cbid": 251, "correlation": 278991 } }, { "ph": "f", "id": 278991, "pid": 76337, "tid": -914061504, "ts": 1716454225841272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225898580, "dur": 134, "args": { "External id": 278992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 278992, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 278992, "pid": 5, "tid": 7, "ts": 1716454225898580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841276, "dur": 11, "args": { "External id": 278992, "cbid": 211, "correlation": 278992 } }, { "ph": "s", "id": 278992, "pid": 76337, "tid": -914061504, "ts": 1716454225841276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225898715, "dur": 324, "args": { "External id": 279017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279017, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279017, "pid": 5, "tid": 7, "ts": 1716454225898715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841363, "dur": 13, "args": { "External id": 279017, "cbid": 211, "correlation": 279017 } }, { "ph": "s", "id": 279017, "pid": 76337, "tid": -914061504, "ts": 1716454225841363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841464, "dur": 1, "args": { "External id": 279035, "cbid": 251, "correlation": 279035 } }, { "ph": "f", "id": 279035, "pid": 76337, "tid": -914061504, "ts": 1716454225841464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225899041, "dur": 161, "args": { "External id": 279037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279037, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279037, "pid": 5, "tid": 7, "ts": 1716454225899041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841470, "dur": 13, "args": { "External id": 279037, "cbid": 211, "correlation": 279037 } }, { "ph": "s", "id": 279037, "pid": 76337, "tid": -914061504, "ts": 1716454225841470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225899203, "dur": 19, "args": { "External id": 279045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279045, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279045, "pid": 5, "tid": 7, "ts": 1716454225899203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841539, "dur": 13, "args": { "External id": 279045, "cbid": 211, "correlation": 279045 } }, { "ph": "s", "id": 279045, "pid": 76337, "tid": -914061504, "ts": 1716454225841539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225899224, "dur": 28, "args": { "External id": 279053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279053, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279053, "pid": 5, "tid": 7, "ts": 1716454225899224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841579, "dur": 8, "args": { "External id": 279053, "cbid": 211, "correlation": 279053 } }, { "ph": "s", "id": 279053, "pid": 76337, "tid": -914061504, "ts": 1716454225841579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225899253, "dur": 19, "args": { "External id": 279064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279064, "pid": 5, "tid": 7, "ts": 1716454225899253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841653, "dur": 13, "args": { "External id": 279064, "cbid": 211, "correlation": 279064 } }, { "ph": "s", "id": 279064, "pid": 76337, "tid": -914061504, "ts": 1716454225841653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225899273, "dur": 15, "args": { "External id": 279086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279086, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279086, "pid": 5, "tid": 7, "ts": 1716454225899273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841684, "dur": 7, "args": { "External id": 279086, "cbid": 211, "correlation": 279086 } }, { "ph": "s", "id": 279086, "pid": 76337, "tid": -914061504, "ts": 1716454225841684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841771, "dur": 2, "args": { "External id": 279097, "cbid": 251, "correlation": 279097 } }, { "ph": "f", "id": 279097, "pid": 76337, "tid": -914061504, "ts": 1716454225841771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225899289, "dur": 86, "args": { "External id": 279098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279098, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279098, "pid": 5, "tid": 7, "ts": 1716454225899289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841776, "dur": 13, "args": { "External id": 279098, "cbid": 211, "correlation": 279098 } }, { "ph": "s", "id": 279098, "pid": 76337, "tid": -914061504, "ts": 1716454225841776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841848, "dur": 1, "args": { "External id": 279109, "cbid": 251, "correlation": 279109 } }, { "ph": "f", "id": 279109, "pid": 76337, "tid": -914061504, "ts": 1716454225841848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841852, "dur": 0, "args": { "External id": 279110, "cbid": 251, "correlation": 279110 } }, { "ph": "f", "id": 279110, "pid": 76337, "tid": -914061504, "ts": 1716454225841852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225899376, "dur": 12, "args": { "External id": 279111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279111, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279111, "pid": 5, "tid": 7, "ts": 1716454225899376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841853, "dur": 11, "args": { "External id": 279111, "cbid": 211, "correlation": 279111 } }, { "ph": "s", "id": 279111, "pid": 76337, "tid": -914061504, "ts": 1716454225841853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225899390, "dur": 5, "args": { "External id": 279113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279113, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279113, "pid": 5, "tid": 7, "ts": 1716454225899390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841866, "dur": 6, "args": { "External id": 279113, "cbid": 211, "correlation": 279113 } }, { "ph": "s", "id": 279113, "pid": 76337, "tid": -914061504, "ts": 1716454225841866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841922, "dur": 1, "args": { "External id": 279124, "cbid": 251, "correlation": 279124 } }, { "ph": "f", "id": 279124, "pid": 76337, "tid": -914061504, "ts": 1716454225841922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225841926, "dur": 0, "args": { "External id": 279125, "cbid": 251, "correlation": 279125 } }, { "ph": "f", "id": 279125, "pid": 76337, "tid": -914061504, "ts": 1716454225841926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225899397, "dur": 8, "args": { "External id": 279126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279126, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279126, "pid": 5, "tid": 7, "ts": 1716454225899397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841927, "dur": 11, "args": { "External id": 279126, "cbid": 211, "correlation": 279126 } }, { "ph": "s", "id": 279126, "pid": 76337, "tid": -914061504, "ts": 1716454225841927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225899406, "dur": 3, "args": { "External id": 279128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279128, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279128, "pid": 5, "tid": 7, "ts": 1716454225899406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225841940, "dur": 6, "args": { "External id": 279128, "cbid": 211, "correlation": 279128 } }, { "ph": "s", "id": 279128, "pid": 76337, "tid": -914061504, "ts": 1716454225841940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225899411, "dur": 53, "args": { "External id": 279153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279153, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279153, "pid": 5, "tid": 7, "ts": 1716454225899411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842027, "dur": 12, "args": { "External id": 279153, "cbid": 211, "correlation": 279153 } }, { "ph": "s", "id": 279153, "pid": 76337, "tid": -914061504, "ts": 1716454225842027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225842126, "dur": 2, "args": { "External id": 279171, "cbid": 251, "correlation": 279171 } }, { "ph": "f", "id": 279171, "pid": 76337, "tid": -914061504, "ts": 1716454225842126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225899465, "dur": 87, "args": { "External id": 279173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279173, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279173, "pid": 5, "tid": 7, "ts": 1716454225899465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842132, "dur": 14, "args": { "External id": 279173, "cbid": 211, "correlation": 279173 } }, { "ph": "s", "id": 279173, "pid": 76337, "tid": -914061504, "ts": 1716454225842132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225899553, "dur": 9, "args": { "External id": 279181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279181, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279181, "pid": 5, "tid": 7, "ts": 1716454225899553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842201, "dur": 13, "args": { "External id": 279181, "cbid": 211, "correlation": 279181 } }, { "ph": "s", "id": 279181, "pid": 76337, "tid": -914061504, "ts": 1716454225842201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225899563, "dur": 21, "args": { "External id": 279189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279189, "pid": 5, "tid": 7, "ts": 1716454225899563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842243, "dur": 10, "args": { "External id": 279189, "cbid": 211, "correlation": 279189 } }, { "ph": "s", "id": 279189, "pid": 76337, "tid": -914061504, "ts": 1716454225842243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225899586, "dur": 17, "args": { "External id": 279211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279211, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279211, "pid": 5, "tid": 7, "ts": 1716454225899586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842294, "dur": 10, "args": { "External id": 279211, "cbid": 211, "correlation": 279211 } }, { "ph": "s", "id": 279211, "pid": 76337, "tid": -914061504, "ts": 1716454225842294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225842382, "dur": 1, "args": { "External id": 279227, "cbid": 251, "correlation": 279227 } }, { "ph": "f", "id": 279227, "pid": 76337, "tid": -914061504, "ts": 1716454225842382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225842387, "dur": 0, "args": { "External id": 279229, "cbid": 251, "correlation": 279229 } }, { "ph": "f", "id": 279229, "pid": 76337, "tid": -914061504, "ts": 1716454225842387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225899604, "dur": 486, "args": { "External id": 279230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279230, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279230, "pid": 5, "tid": 7, "ts": 1716454225899604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842389, "dur": 12, "args": { "External id": 279230, "cbid": 211, "correlation": 279230 } }, { "ph": "s", "id": 279230, "pid": 76337, "tid": -914061504, "ts": 1716454225842389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225900092, "dur": 64, "args": { "External id": 279238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279238, "pid": 5, "tid": 7, "ts": 1716454225900092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842456, "dur": 13, "args": { "External id": 279238, "cbid": 211, "correlation": 279238 } }, { "ph": "s", "id": 279238, "pid": 76337, "tid": -914061504, "ts": 1716454225842456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225900157, "dur": 66, "args": { "External id": 279246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279246, "pid": 5, "tid": 7, "ts": 1716454225900157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842487, "dur": 8, "args": { "External id": 279246, "cbid": 211, "correlation": 279246 } }, { "ph": "s", "id": 279246, "pid": 76337, "tid": -914061504, "ts": 1716454225842487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225842567, "dur": 1, "args": { "External id": 279262, "cbid": 251, "correlation": 279262 } }, { "ph": "f", "id": 279262, "pid": 76337, "tid": -914061504, "ts": 1716454225842567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225900225, "dur": 1, "args": { "External id": 279264, "device": 5, "context": 1, "stream": 7, "correlation": 279264, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 279264, "pid": 5, "tid": 7, "ts": 1716454225900225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225842572, "dur": 9, "args": { "External id": 279264, "cbid": 51, "correlation": 279264 } }, { "ph": "s", "id": 279264, "pid": 76337, "tid": -914061504, "ts": 1716454225842572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225900229, "dur": 263, "args": { "External id": 279265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279265, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279265, "pid": 5, "tid": 7, "ts": 1716454225900229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842582, "dur": 11, "args": { "External id": 279265, "cbid": 211, "correlation": 279265 } }, { "ph": "s", "id": 279265, "pid": 76337, "tid": -914061504, "ts": 1716454225842582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225900493, "dur": 14, "args": { "External id": 279273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279273, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279273, "pid": 5, "tid": 7, "ts": 1716454225900493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842625, "dur": 10, "args": { "External id": 279273, "cbid": 211, "correlation": 279273 } }, { "ph": "s", "id": 279273, "pid": 76337, "tid": -914061504, "ts": 1716454225842625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225900509, "dur": 36, "args": { "External id": 279284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279284, "pid": 5, "tid": 7, "ts": 1716454225900509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842693, "dur": 13, "args": { "External id": 279284, "cbid": 211, "correlation": 279284 } }, { "ph": "s", "id": 279284, "pid": 76337, "tid": -914061504, "ts": 1716454225842693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225842759, "dur": 0, "args": { "External id": 279296, "cbid": 317, "correlation": 279296 } }, { "ph": "f", "id": 279296, "pid": 76337, "tid": -914061504, "ts": 1716454225842759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225842760, "dur": 0, "args": { "External id": 279297, "cbid": 203, "correlation": 279297 } }, { "ph": "f", "id": 279297, "pid": 76337, "tid": -914061504, "ts": 1716454225842760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225842761, "dur": 0, "args": { "External id": 279298, "cbid": 205, "correlation": 279298 } }, { "ph": "f", "id": 279298, "pid": 76337, "tid": -914061504, "ts": 1716454225842761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225900546, "dur": 12, "args": { "External id": 279302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279302, "pid": 5, "tid": 7, "ts": 1716454225900546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842775, "dur": 12, "args": { "External id": 279302, "cbid": 211, "correlation": 279302 } }, { "ph": "s", "id": 279302, "pid": 76337, "tid": -914061504, "ts": 1716454225842775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225900559, "dur": 3, "args": { "External id": 279304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279304, "pid": 5, "tid": 7, "ts": 1716454225900559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842791, "dur": 6, "args": { "External id": 279304, "cbid": 211, "correlation": 279304 } }, { "ph": "s", "id": 279304, "pid": 76337, "tid": -914061504, "ts": 1716454225842791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225842800, "dur": 0, "args": { "External id": 279305, "cbid": 51, "correlation": 279305 } }, { "ph": "s", "id": 279305, "pid": 76337, "tid": -914061504, "ts": 1716454225842800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225900564, "dur": 93, "args": { "External id": 279306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279306, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 279306, "pid": 5, "tid": 7, "ts": 1716454225900564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842801, "dur": 6, "args": { "External id": 279306, "cbid": 211, "correlation": 279306 } }, { "ph": "s", "id": 279306, "pid": 76337, "tid": -914061504, "ts": 1716454225842801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225900658, "dur": 15, "args": { "External id": 279311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279311, "pid": 5, "tid": 7, "ts": 1716454225900658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842829, "dur": 8, "args": { "External id": 279311, "cbid": 211, "correlation": 279311 } }, { "ph": "s", "id": 279311, "pid": 76337, "tid": -914061504, "ts": 1716454225842829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225900674, "dur": 11, "args": { "External id": 279319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279319, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279319, "pid": 5, "tid": 7, "ts": 1716454225900674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842862, "dur": 8, "args": { "External id": 279319, "cbid": 211, "correlation": 279319 } }, { "ph": "s", "id": 279319, "pid": 76337, "tid": -914061504, "ts": 1716454225842862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225900687, "dur": 28, "args": { "External id": 279328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279328, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279328, "pid": 5, "tid": 7, "ts": 1716454225900687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842900, "dur": 11, "args": { "External id": 279328, "cbid": 211, "correlation": 279328 } }, { "ph": "s", "id": 279328, "pid": 76337, "tid": -914061504, "ts": 1716454225842900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225900716, "dur": 30, "args": { "External id": 279348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279348, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 279348, "pid": 5, "tid": 7, "ts": 1716454225900716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225842973, "dur": 21, "args": { "External id": 279348, "cbid": 211, "correlation": 279348 } }, { "ph": "s", "id": 279348, "pid": 76337, "tid": -914061504, "ts": 1716454225842973, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225900747, "dur": 5, "args": { "External id": 279360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279360, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279360, "pid": 5, "tid": 7, "ts": 1716454225900747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843004, "dur": 7, "args": { "External id": 279360, "cbid": 211, "correlation": 279360 } }, { "ph": "s", "id": 279360, "pid": 76337, "tid": -914061504, "ts": 1716454225843004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225900753, "dur": 31, "args": { "External id": 279363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279363, "pid": 5, "tid": 7, "ts": 1716454225900753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843023, "dur": 7, "args": { "External id": 279363, "cbid": 211, "correlation": 279363 } }, { "ph": "s", "id": 279363, "pid": 76337, "tid": -914061504, "ts": 1716454225843023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225900786, "dur": 21, "args": { "External id": 279372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279372, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279372, "pid": 5, "tid": 7, "ts": 1716454225900786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843063, "dur": 10, "args": { "External id": 279372, "cbid": 211, "correlation": 279372 } }, { "ph": "s", "id": 279372, "pid": 76337, "tid": -914061504, "ts": 1716454225843063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225843115, "dur": 0, "args": { "External id": 279382, "cbid": 317, "correlation": 279382 } }, { "ph": "f", "id": 279382, "pid": 76337, "tid": -914061504, "ts": 1716454225843115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225843116, "dur": 0, "args": { "External id": 279383, "cbid": 203, "correlation": 279383 } }, { "ph": "f", "id": 279383, "pid": 76337, "tid": -914061504, "ts": 1716454225843116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225843117, "dur": 0, "args": { "External id": 279384, "cbid": 205, "correlation": 279384 } }, { "ph": "f", "id": 279384, "pid": 76337, "tid": -914061504, "ts": 1716454225843117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225900808, "dur": 21, "args": { "External id": 279388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279388, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279388, "pid": 5, "tid": 7, "ts": 1716454225900808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843130, "dur": 11, "args": { "External id": 279388, "cbid": 211, "correlation": 279388 } }, { "ph": "s", "id": 279388, "pid": 76337, "tid": -914061504, "ts": 1716454225843130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225900830, "dur": 304, "args": { "External id": 279390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279390, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279390, "pid": 5, "tid": 7, "ts": 1716454225900830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843144, "dur": 5, "args": { "External id": 279390, "cbid": 211, "correlation": 279390 } }, { "ph": "s", "id": 279390, "pid": 76337, "tid": -914061504, "ts": 1716454225843144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225901137, "dur": 1, "args": { "External id": 279392, "device": 5, "context": 1, "stream": 7, "correlation": 279392, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 279392, "pid": 5, "tid": 7, "ts": 1716454225901137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225843155, "dur": 7, "args": { "External id": 279392, "cbid": 51, "correlation": 279392 } }, { "ph": "s", "id": 279392, "pid": 76337, "tid": -914061504, "ts": 1716454225843155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225901140, "dur": 1228, "args": { "External id": 279393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279393, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279393, "pid": 5, "tid": 7, "ts": 1716454225901140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843163, "dur": 6, "args": { "External id": 279393, "cbid": 211, "correlation": 279393 } }, { "ph": "s", "id": 279393, "pid": 76337, "tid": -914061504, "ts": 1716454225843163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225902370, "dur": 13, "args": { "External id": 279395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279395, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279395, "pid": 5, "tid": 7, "ts": 1716454225902370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843173, "dur": 5, "args": { "External id": 279395, "cbid": 211, "correlation": 279395 } }, { "ph": "s", "id": 279395, "pid": 76337, "tid": -914061504, "ts": 1716454225843173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225902384, "dur": 14, "args": { "External id": 279401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279401, "pid": 5, "tid": 7, "ts": 1716454225902384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843201, "dur": 8, "args": { "External id": 279401, "cbid": 211, "correlation": 279401 } }, { "ph": "s", "id": 279401, "pid": 76337, "tid": -914061504, "ts": 1716454225843201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225902399, "dur": 4, "args": { "External id": 279409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279409, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 279409, "pid": 5, "tid": 7, "ts": 1716454225902399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843244, "dur": 9, "args": { "External id": 279409, "cbid": 211, "correlation": 279409 } }, { "ph": "s", "id": 279409, "pid": 76337, "tid": -914061504, "ts": 1716454225843244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225843307, "dur": 1, "args": { "External id": 279425, "cbid": 251, "correlation": 279425 } }, { "ph": "f", "id": 279425, "pid": 76337, "tid": -914061504, "ts": 1716454225843307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225843312, "dur": 0, "args": { "External id": 279427, "cbid": 251, "correlation": 279427 } }, { "ph": "f", "id": 279427, "pid": 76337, "tid": -914061504, "ts": 1716454225843312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225902404, "dur": 13, "args": { "External id": 279428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279428, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279428, "pid": 5, "tid": 7, "ts": 1716454225902404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843314, "dur": 12, "args": { "External id": 279428, "cbid": 211, "correlation": 279428 } }, { "ph": "s", "id": 279428, "pid": 76337, "tid": -914061504, "ts": 1716454225843314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225902418, "dur": 5, "args": { "External id": 279430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279430, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279430, "pid": 5, "tid": 7, "ts": 1716454225902418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843328, "dur": 5, "args": { "External id": 279430, "cbid": 211, "correlation": 279430 } }, { "ph": "s", "id": 279430, "pid": 76337, "tid": -914061504, "ts": 1716454225843328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225902424, "dur": 16, "args": { "External id": 279440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279440, "pid": 5, "tid": 7, "ts": 1716454225902424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843385, "dur": 12, "args": { "External id": 279440, "cbid": 211, "correlation": 279440 } }, { "ph": "s", "id": 279440, "pid": 76337, "tid": -914061504, "ts": 1716454225843385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225902442, "dur": 16, "args": { "External id": 279460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279460, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 279460, "pid": 5, "tid": 7, "ts": 1716454225902442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843451, "dur": 11, "args": { "External id": 279460, "cbid": 211, "correlation": 279460 } }, { "ph": "s", "id": 279460, "pid": 76337, "tid": -914061504, "ts": 1716454225843451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225902460, "dur": 4, "args": { "External id": 279472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279472, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 279472, "pid": 5, "tid": 7, "ts": 1716454225902460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843472, "dur": 6, "args": { "External id": 279472, "cbid": 211, "correlation": 279472 } }, { "ph": "s", "id": 279472, "pid": 76337, "tid": -914061504, "ts": 1716454225843472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225902465, "dur": 17, "args": { "External id": 279475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279475, "pid": 5, "tid": 7, "ts": 1716454225902465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843490, "dur": 6, "args": { "External id": 279475, "cbid": 211, "correlation": 279475 } }, { "ph": "s", "id": 279475, "pid": 76337, "tid": -914061504, "ts": 1716454225843490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225902483, "dur": 11, "args": { "External id": 279484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279484, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279484, "pid": 5, "tid": 7, "ts": 1716454225902483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843530, "dur": 10, "args": { "External id": 279484, "cbid": 211, "correlation": 279484 } }, { "ph": "s", "id": 279484, "pid": 76337, "tid": -914061504, "ts": 1716454225843530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225843592, "dur": 0, "args": { "External id": 279494, "cbid": 317, "correlation": 279494 } }, { "ph": "f", "id": 279494, "pid": 76337, "tid": -914061504, "ts": 1716454225843592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225843592, "dur": 0, "args": { "External id": 279495, "cbid": 203, "correlation": 279495 } }, { "ph": "f", "id": 279495, "pid": 76337, "tid": -914061504, "ts": 1716454225843592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225843593, "dur": 0, "args": { "External id": 279496, "cbid": 205, "correlation": 279496 } }, { "ph": "f", "id": 279496, "pid": 76337, "tid": -914061504, "ts": 1716454225843593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225902495, "dur": 11, "args": { "External id": 279500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279500, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279500, "pid": 5, "tid": 7, "ts": 1716454225902495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843607, "dur": 12, "args": { "External id": 279500, "cbid": 211, "correlation": 279500 } }, { "ph": "s", "id": 279500, "pid": 76337, "tid": -914061504, "ts": 1716454225843607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225902508, "dur": 155, "args": { "External id": 279502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279502, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279502, "pid": 5, "tid": 7, "ts": 1716454225902508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843621, "dur": 5, "args": { "External id": 279502, "cbid": 211, "correlation": 279502 } }, { "ph": "s", "id": 279502, "pid": 76337, "tid": -914061504, "ts": 1716454225843621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225902665, "dur": 1, "args": { "External id": 279504, "device": 5, "context": 1, "stream": 7, "correlation": 279504, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 279504, "pid": 5, "tid": 7, "ts": 1716454225902665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225843632, "dur": 6, "args": { "External id": 279504, "cbid": 51, "correlation": 279504 } }, { "ph": "s", "id": 279504, "pid": 76337, "tid": -914061504, "ts": 1716454225843632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225902669, "dur": 634, "args": { "External id": 279505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279505, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279505, "pid": 5, "tid": 7, "ts": 1716454225902669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843640, "dur": 6, "args": { "External id": 279505, "cbid": 211, "correlation": 279505 } }, { "ph": "s", "id": 279505, "pid": 76337, "tid": -914061504, "ts": 1716454225843640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225903304, "dur": 12, "args": { "External id": 279507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279507, "pid": 5, "tid": 7, "ts": 1716454225903304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843651, "dur": 5, "args": { "External id": 279507, "cbid": 211, "correlation": 279507 } }, { "ph": "s", "id": 279507, "pid": 76337, "tid": -914061504, "ts": 1716454225843651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225903318, "dur": 14, "args": { "External id": 279513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279513, "pid": 5, "tid": 7, "ts": 1716454225903318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843678, "dur": 9, "args": { "External id": 279513, "cbid": 211, "correlation": 279513 } }, { "ph": "s", "id": 279513, "pid": 76337, "tid": -914061504, "ts": 1716454225843678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225843737, "dur": 0, "args": { "External id": 279523, "cbid": 317, "correlation": 279523 } }, { "ph": "f", "id": 279523, "pid": 76337, "tid": -914061504, "ts": 1716454225843737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225843738, "dur": 0, "args": { "External id": 279524, "cbid": 203, "correlation": 279524 } }, { "ph": "f", "id": 279524, "pid": 76337, "tid": -914061504, "ts": 1716454225843738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225843739, "dur": 0, "args": { "External id": 279525, "cbid": 205, "correlation": 279525 } }, { "ph": "f", "id": 279525, "pid": 76337, "tid": -914061504, "ts": 1716454225843739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225903333, "dur": 21, "args": { "External id": 279529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279529, "pid": 5, "tid": 7, "ts": 1716454225903333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843751, "dur": 11, "args": { "External id": 279529, "cbid": 211, "correlation": 279529 } }, { "ph": "s", "id": 279529, "pid": 76337, "tid": -914061504, "ts": 1716454225843751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225903355, "dur": 4, "args": { "External id": 279531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279531, "pid": 5, "tid": 7, "ts": 1716454225903355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843766, "dur": 5, "args": { "External id": 279531, "cbid": 211, "correlation": 279531 } }, { "ph": "s", "id": 279531, "pid": 76337, "tid": -914061504, "ts": 1716454225843766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225843774, "dur": 0, "args": { "External id": 279532, "cbid": 51, "correlation": 279532 } }, { "ph": "s", "id": 279532, "pid": 76337, "tid": -914061504, "ts": 1716454225843774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225903361, "dur": 165, "args": { "External id": 279533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279533, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 279533, "pid": 5, "tid": 7, "ts": 1716454225903361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843775, "dur": 5, "args": { "External id": 279533, "cbid": 211, "correlation": 279533 } }, { "ph": "s", "id": 279533, "pid": 76337, "tid": -914061504, "ts": 1716454225843775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225903528, "dur": 15, "args": { "External id": 279538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279538, "pid": 5, "tid": 7, "ts": 1716454225903528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843801, "dur": 8, "args": { "External id": 279538, "cbid": 211, "correlation": 279538 } }, { "ph": "s", "id": 279538, "pid": 76337, "tid": -914061504, "ts": 1716454225843801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225903544, "dur": 12, "args": { "External id": 279546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279546, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279546, "pid": 5, "tid": 7, "ts": 1716454225903544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843829, "dur": 8, "args": { "External id": 279546, "cbid": 211, "correlation": 279546 } }, { "ph": "s", "id": 279546, "pid": 76337, "tid": -914061504, "ts": 1716454225843829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225903557, "dur": 10, "args": { "External id": 279554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279554, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279554, "pid": 5, "tid": 7, "ts": 1716454225903557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843858, "dur": 9, "args": { "External id": 279554, "cbid": 211, "correlation": 279554 } }, { "ph": "s", "id": 279554, "pid": 76337, "tid": -914061504, "ts": 1716454225843858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225903568, "dur": 18, "args": { "External id": 279574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279574, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 279574, "pid": 5, "tid": 7, "ts": 1716454225903568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843939, "dur": 12, "args": { "External id": 279574, "cbid": 211, "correlation": 279574 } }, { "ph": "s", "id": 279574, "pid": 76337, "tid": -914061504, "ts": 1716454225843939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225903587, "dur": 4, "args": { "External id": 279586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279586, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 279586, "pid": 5, "tid": 7, "ts": 1716454225903587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843962, "dur": 6, "args": { "External id": 279586, "cbid": 211, "correlation": 279586 } }, { "ph": "s", "id": 279586, "pid": 76337, "tid": -914061504, "ts": 1716454225843962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225903593, "dur": 16, "args": { "External id": 279589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279589, "pid": 5, "tid": 7, "ts": 1716454225903593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225843988, "dur": 8, "args": { "External id": 279589, "cbid": 211, "correlation": 279589 } }, { "ph": "s", "id": 279589, "pid": 76337, "tid": -914061504, "ts": 1716454225843988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225844048, "dur": 0, "args": { "External id": 279600, "cbid": 317, "correlation": 279600 } }, { "ph": "f", "id": 279600, "pid": 76337, "tid": -914061504, "ts": 1716454225844048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225844049, "dur": 0, "args": { "External id": 279601, "cbid": 203, "correlation": 279601 } }, { "ph": "f", "id": 279601, "pid": 76337, "tid": -914061504, "ts": 1716454225844049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225844049, "dur": 0, "args": { "External id": 279602, "cbid": 205, "correlation": 279602 } }, { "ph": "f", "id": 279602, "pid": 76337, "tid": -914061504, "ts": 1716454225844049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225903610, "dur": 11, "args": { "External id": 279606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279606, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279606, "pid": 5, "tid": 7, "ts": 1716454225903610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844063, "dur": 13, "args": { "External id": 279606, "cbid": 211, "correlation": 279606 } }, { "ph": "s", "id": 279606, "pid": 76337, "tid": -914061504, "ts": 1716454225844063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225903622, "dur": 3, "args": { "External id": 279608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279608, "pid": 5, "tid": 7, "ts": 1716454225903622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844080, "dur": 7, "args": { "External id": 279608, "cbid": 211, "correlation": 279608 } }, { "ph": "s", "id": 279608, "pid": 76337, "tid": -914061504, "ts": 1716454225844080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225844090, "dur": 0, "args": { "External id": 279609, "cbid": 51, "correlation": 279609 } }, { "ph": "s", "id": 279609, "pid": 76337, "tid": -914061504, "ts": 1716454225844090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225903627, "dur": 88, "args": { "External id": 279610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279610, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 279610, "pid": 5, "tid": 7, "ts": 1716454225903627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844090, "dur": 6, "args": { "External id": 279610, "cbid": 211, "correlation": 279610 } }, { "ph": "s", "id": 279610, "pid": 76337, "tid": -914061504, "ts": 1716454225844090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225903716, "dur": 15, "args": { "External id": 279615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279615, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279615, "pid": 5, "tid": 7, "ts": 1716454225903716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844119, "dur": 8, "args": { "External id": 279615, "cbid": 211, "correlation": 279615 } }, { "ph": "s", "id": 279615, "pid": 76337, "tid": -914061504, "ts": 1716454225844119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225903732, "dur": 82, "args": { "External id": 279624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279624, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279624, "pid": 5, "tid": 7, "ts": 1716454225903732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844201, "dur": 14, "args": { "External id": 279624, "cbid": 211, "correlation": 279624 } }, { "ph": "s", "id": 279624, "pid": 76337, "tid": -914061504, "ts": 1716454225844201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225903816, "dur": 29, "args": { "External id": 279646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279646, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279646, "pid": 5, "tid": 7, "ts": 1716454225903816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844260, "dur": 11, "args": { "External id": 279646, "cbid": 211, "correlation": 279646 } }, { "ph": "s", "id": 279646, "pid": 76337, "tid": -914061504, "ts": 1716454225844260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225844356, "dur": 2, "args": { "External id": 279657, "cbid": 251, "correlation": 279657 } }, { "ph": "f", "id": 279657, "pid": 76337, "tid": -914061504, "ts": 1716454225844356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225903846, "dur": 156, "args": { "External id": 279658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279658, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279658, "pid": 5, "tid": 7, "ts": 1716454225903846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844361, "dur": 14, "args": { "External id": 279658, "cbid": 211, "correlation": 279658 } }, { "ph": "s", "id": 279658, "pid": 76337, "tid": -914061504, "ts": 1716454225844361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225844432, "dur": 1, "args": { "External id": 279669, "cbid": 251, "correlation": 279669 } }, { "ph": "f", "id": 279669, "pid": 76337, "tid": -914061504, "ts": 1716454225844432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225904003, "dur": 152, "args": { "External id": 279670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279670, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279670, "pid": 5, "tid": 7, "ts": 1716454225904003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844436, "dur": 12, "args": { "External id": 279670, "cbid": 211, "correlation": 279670 } }, { "ph": "s", "id": 279670, "pid": 76337, "tid": -914061504, "ts": 1716454225844436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225844501, "dur": 1, "args": { "External id": 279681, "cbid": 251, "correlation": 279681 } }, { "ph": "f", "id": 279681, "pid": 76337, "tid": -914061504, "ts": 1716454225844501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225904156, "dur": 154, "args": { "External id": 279682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279682, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279682, "pid": 5, "tid": 7, "ts": 1716454225904156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844505, "dur": 11, "args": { "External id": 279682, "cbid": 211, "correlation": 279682 } }, { "ph": "s", "id": 279682, "pid": 76337, "tid": -914061504, "ts": 1716454225844505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225904312, "dur": 327, "args": { "External id": 279707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279707, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279707, "pid": 5, "tid": 7, "ts": 1716454225904312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844592, "dur": 16, "args": { "External id": 279707, "cbid": 211, "correlation": 279707 } }, { "ph": "s", "id": 279707, "pid": 76337, "tid": -914061504, "ts": 1716454225844592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225844697, "dur": 2, "args": { "External id": 279725, "cbid": 251, "correlation": 279725 } }, { "ph": "f", "id": 279725, "pid": 76337, "tid": -914061504, "ts": 1716454225844697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225904640, "dur": 161, "args": { "External id": 279727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279727, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279727, "pid": 5, "tid": 7, "ts": 1716454225904640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844703, "dur": 16, "args": { "External id": 279727, "cbid": 211, "correlation": 279727 } }, { "ph": "s", "id": 279727, "pid": 76337, "tid": -914061504, "ts": 1716454225844703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225904802, "dur": 19, "args": { "External id": 279735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279735, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279735, "pid": 5, "tid": 7, "ts": 1716454225904802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844778, "dur": 13, "args": { "External id": 279735, "cbid": 211, "correlation": 279735 } }, { "ph": "s", "id": 279735, "pid": 76337, "tid": -914061504, "ts": 1716454225844778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225904822, "dur": 28, "args": { "External id": 279743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279743, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279743, "pid": 5, "tid": 7, "ts": 1716454225904822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844819, "dur": 10, "args": { "External id": 279743, "cbid": 211, "correlation": 279743 } }, { "ph": "s", "id": 279743, "pid": 76337, "tid": -914061504, "ts": 1716454225844819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225904852, "dur": 17, "args": { "External id": 279754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279754, "pid": 5, "tid": 7, "ts": 1716454225904852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844893, "dur": 16, "args": { "External id": 279754, "cbid": 211, "correlation": 279754 } }, { "ph": "s", "id": 279754, "pid": 76337, "tid": -914061504, "ts": 1716454225844893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225904871, "dur": 15, "args": { "External id": 279776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279776, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279776, "pid": 5, "tid": 7, "ts": 1716454225904871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225844928, "dur": 10, "args": { "External id": 279776, "cbid": 211, "correlation": 279776 } }, { "ph": "s", "id": 279776, "pid": 76337, "tid": -914061504, "ts": 1716454225844928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845030, "dur": 2, "args": { "External id": 279787, "cbid": 251, "correlation": 279787 } }, { "ph": "f", "id": 279787, "pid": 76337, "tid": -914061504, "ts": 1716454225845030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225904887, "dur": 85, "args": { "External id": 279788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279788, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279788, "pid": 5, "tid": 7, "ts": 1716454225904887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845036, "dur": 14, "args": { "External id": 279788, "cbid": 211, "correlation": 279788 } }, { "ph": "s", "id": 279788, "pid": 76337, "tid": -914061504, "ts": 1716454225845036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845108, "dur": 1, "args": { "External id": 279799, "cbid": 251, "correlation": 279799 } }, { "ph": "f", "id": 279799, "pid": 76337, "tid": -914061504, "ts": 1716454225845108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845112, "dur": 0, "args": { "External id": 279800, "cbid": 251, "correlation": 279800 } }, { "ph": "f", "id": 279800, "pid": 76337, "tid": -914061504, "ts": 1716454225845112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225904973, "dur": 12, "args": { "External id": 279801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279801, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279801, "pid": 5, "tid": 7, "ts": 1716454225904973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845114, "dur": 14, "args": { "External id": 279801, "cbid": 211, "correlation": 279801 } }, { "ph": "s", "id": 279801, "pid": 76337, "tid": -914061504, "ts": 1716454225845114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225904987, "dur": 5, "args": { "External id": 279803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279803, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279803, "pid": 5, "tid": 7, "ts": 1716454225904987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845130, "dur": 8, "args": { "External id": 279803, "cbid": 211, "correlation": 279803 } }, { "ph": "s", "id": 279803, "pid": 76337, "tid": -914061504, "ts": 1716454225845130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845190, "dur": 1, "args": { "External id": 279814, "cbid": 251, "correlation": 279814 } }, { "ph": "f", "id": 279814, "pid": 76337, "tid": -914061504, "ts": 1716454225845190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845193, "dur": 0, "args": { "External id": 279815, "cbid": 251, "correlation": 279815 } }, { "ph": "f", "id": 279815, "pid": 76337, "tid": -914061504, "ts": 1716454225845193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225904993, "dur": 8, "args": { "External id": 279816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279816, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279816, "pid": 5, "tid": 7, "ts": 1716454225904993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845195, "dur": 12, "args": { "External id": 279816, "cbid": 211, "correlation": 279816 } }, { "ph": "s", "id": 279816, "pid": 76337, "tid": -914061504, "ts": 1716454225845195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225905003, "dur": 3, "args": { "External id": 279818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279818, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279818, "pid": 5, "tid": 7, "ts": 1716454225905003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845208, "dur": 7, "args": { "External id": 279818, "cbid": 211, "correlation": 279818 } }, { "ph": "s", "id": 279818, "pid": 76337, "tid": -914061504, "ts": 1716454225845208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225905008, "dur": 53, "args": { "External id": 279843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279843, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279843, "pid": 5, "tid": 7, "ts": 1716454225905008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845286, "dur": 13, "args": { "External id": 279843, "cbid": 211, "correlation": 279843 } }, { "ph": "s", "id": 279843, "pid": 76337, "tid": -914061504, "ts": 1716454225845286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845384, "dur": 1, "args": { "External id": 279861, "cbid": 251, "correlation": 279861 } }, { "ph": "f", "id": 279861, "pid": 76337, "tid": -914061504, "ts": 1716454225845384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225905062, "dur": 87, "args": { "External id": 279863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279863, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279863, "pid": 5, "tid": 7, "ts": 1716454225905062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845390, "dur": 15, "args": { "External id": 279863, "cbid": 211, "correlation": 279863 } }, { "ph": "s", "id": 279863, "pid": 76337, "tid": -914061504, "ts": 1716454225845390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225905151, "dur": 9, "args": { "External id": 279871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279871, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279871, "pid": 5, "tid": 7, "ts": 1716454225905151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845460, "dur": 16, "args": { "External id": 279871, "cbid": 211, "correlation": 279871 } }, { "ph": "s", "id": 279871, "pid": 76337, "tid": -914061504, "ts": 1716454225845460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225905161, "dur": 20, "args": { "External id": 279879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279879, "pid": 5, "tid": 7, "ts": 1716454225905161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845505, "dur": 10, "args": { "External id": 279879, "cbid": 211, "correlation": 279879 } }, { "ph": "s", "id": 279879, "pid": 76337, "tid": -914061504, "ts": 1716454225845505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225905183, "dur": 17, "args": { "External id": 279901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279901, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279901, "pid": 5, "tid": 7, "ts": 1716454225905183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845557, "dur": 11, "args": { "External id": 279901, "cbid": 211, "correlation": 279901 } }, { "ph": "s", "id": 279901, "pid": 76337, "tid": -914061504, "ts": 1716454225845557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845647, "dur": 1, "args": { "External id": 279917, "cbid": 251, "correlation": 279917 } }, { "ph": "f", "id": 279917, "pid": 76337, "tid": -914061504, "ts": 1716454225845647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845652, "dur": 0, "args": { "External id": 279919, "cbid": 251, "correlation": 279919 } }, { "ph": "f", "id": 279919, "pid": 76337, "tid": -914061504, "ts": 1716454225845652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225905201, "dur": 488, "args": { "External id": 279920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279920, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 279920, "pid": 5, "tid": 7, "ts": 1716454225905201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845654, "dur": 13, "args": { "External id": 279920, "cbid": 211, "correlation": 279920 } }, { "ph": "s", "id": 279920, "pid": 76337, "tid": -914061504, "ts": 1716454225845654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225905691, "dur": 64, "args": { "External id": 279928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279928, "pid": 5, "tid": 7, "ts": 1716454225905691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845725, "dur": 13, "args": { "External id": 279928, "cbid": 211, "correlation": 279928 } }, { "ph": "s", "id": 279928, "pid": 76337, "tid": -914061504, "ts": 1716454225845725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225905756, "dur": 65, "args": { "External id": 279936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279936, "pid": 5, "tid": 7, "ts": 1716454225905756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845759, "dur": 9, "args": { "External id": 279936, "cbid": 211, "correlation": 279936 } }, { "ph": "s", "id": 279936, "pid": 76337, "tid": -914061504, "ts": 1716454225845759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225845839, "dur": 1, "args": { "External id": 279952, "cbid": 251, "correlation": 279952 } }, { "ph": "f", "id": 279952, "pid": 76337, "tid": -914061504, "ts": 1716454225845839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225905823, "dur": 1, "args": { "External id": 279954, "device": 5, "context": 1, "stream": 7, "correlation": 279954, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 279954, "pid": 5, "tid": 7, "ts": 1716454225905823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225845844, "dur": 14, "args": { "External id": 279954, "cbid": 51, "correlation": 279954 } }, { "ph": "s", "id": 279954, "pid": 76337, "tid": -914061504, "ts": 1716454225845844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225905826, "dur": 261, "args": { "External id": 279955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279955, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279955, "pid": 5, "tid": 7, "ts": 1716454225905826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845860, "dur": 15, "args": { "External id": 279955, "cbid": 211, "correlation": 279955 } }, { "ph": "s", "id": 279955, "pid": 76337, "tid": -914061504, "ts": 1716454225845860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225906089, "dur": 14, "args": { "External id": 279963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279963, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279963, "pid": 5, "tid": 7, "ts": 1716454225906089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225845910, "dur": 16, "args": { "External id": 279963, "cbid": 211, "correlation": 279963 } }, { "ph": "s", "id": 279963, "pid": 76337, "tid": -914061504, "ts": 1716454225845910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225906104, "dur": 36, "args": { "External id": 279974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279974, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279974, "pid": 5, "tid": 7, "ts": 1716454225906104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846092, "dur": 53, "args": { "External id": 279974, "cbid": 211, "correlation": 279974 } }, { "ph": "s", "id": 279974, "pid": 76337, "tid": -914061504, "ts": 1716454225846092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225846242, "dur": 0, "args": { "External id": 279986, "cbid": 317, "correlation": 279986 } }, { "ph": "f", "id": 279986, "pid": 76337, "tid": -914061504, "ts": 1716454225846242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225846244, "dur": 0, "args": { "External id": 279987, "cbid": 203, "correlation": 279987 } }, { "ph": "f", "id": 279987, "pid": 76337, "tid": -914061504, "ts": 1716454225846244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225846246, "dur": 0, "args": { "External id": 279988, "cbid": 205, "correlation": 279988 } }, { "ph": "f", "id": 279988, "pid": 76337, "tid": -914061504, "ts": 1716454225846246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225906142, "dur": 12, "args": { "External id": 279992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 279992, "pid": 5, "tid": 7, "ts": 1716454225906142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846282, "dur": 22, "args": { "External id": 279992, "cbid": 211, "correlation": 279992 } }, { "ph": "s", "id": 279992, "pid": 76337, "tid": -914061504, "ts": 1716454225846282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225906155, "dur": 4, "args": { "External id": 279994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 279994, "pid": 5, "tid": 7, "ts": 1716454225906155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846311, "dur": 8, "args": { "External id": 279994, "cbid": 211, "correlation": 279994 } }, { "ph": "s", "id": 279994, "pid": 76337, "tid": -914061504, "ts": 1716454225846311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225846322, "dur": 1, "args": { "External id": 279995, "cbid": 51, "correlation": 279995 } }, { "ph": "s", "id": 279995, "pid": 76337, "tid": -914061504, "ts": 1716454225846322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225906160, "dur": 93, "args": { "External id": 279996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 279996, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 279996, "pid": 5, "tid": 7, "ts": 1716454225906160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846324, "dur": 9, "args": { "External id": 279996, "cbid": 211, "correlation": 279996 } }, { "ph": "s", "id": 279996, "pid": 76337, "tid": -914061504, "ts": 1716454225846324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225906254, "dur": 15, "args": { "External id": 280001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280001, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280001, "pid": 5, "tid": 7, "ts": 1716454225906254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846379, "dur": 18, "args": { "External id": 280001, "cbid": 211, "correlation": 280001 } }, { "ph": "s", "id": 280001, "pid": 76337, "tid": -914061504, "ts": 1716454225846379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225906271, "dur": 11, "args": { "External id": 280009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280009, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280009, "pid": 5, "tid": 7, "ts": 1716454225906271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846513, "dur": 14, "args": { "External id": 280009, "cbid": 211, "correlation": 280009 } }, { "ph": "s", "id": 280009, "pid": 76337, "tid": -914061504, "ts": 1716454225846513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225906283, "dur": 24, "args": { "External id": 280018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280018, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280018, "pid": 5, "tid": 7, "ts": 1716454225906283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846644, "dur": 33, "args": { "External id": 280018, "cbid": 211, "correlation": 280018 } }, { "ph": "s", "id": 280018, "pid": 76337, "tid": -914061504, "ts": 1716454225846644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225906309, "dur": 23, "args": { "External id": 280038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280038, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280038, "pid": 5, "tid": 7, "ts": 1716454225906309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846871, "dur": 23, "args": { "External id": 280038, "cbid": 211, "correlation": 280038 } }, { "ph": "s", "id": 280038, "pid": 76337, "tid": -914061504, "ts": 1716454225846871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225906333, "dur": 5, "args": { "External id": 280050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280050, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 280050, "pid": 5, "tid": 7, "ts": 1716454225906333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846921, "dur": 15, "args": { "External id": 280050, "cbid": 211, "correlation": 280050 } }, { "ph": "s", "id": 280050, "pid": 76337, "tid": -914061504, "ts": 1716454225846921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225906340, "dur": 24, "args": { "External id": 280053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280053, "pid": 5, "tid": 7, "ts": 1716454225906340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225846955, "dur": 96, "args": { "External id": 280053, "cbid": 211, "correlation": 280053 } }, { "ph": "s", "id": 280053, "pid": 76337, "tid": -914061504, "ts": 1716454225846955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225906365, "dur": 17, "args": { "External id": 280062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280062, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280062, "pid": 5, "tid": 7, "ts": 1716454225906365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847128, "dur": 18, "args": { "External id": 280062, "cbid": 211, "correlation": 280062 } }, { "ph": "s", "id": 280062, "pid": 76337, "tid": -914061504, "ts": 1716454225847128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225847306, "dur": 0, "args": { "External id": 280072, "cbid": 317, "correlation": 280072 } }, { "ph": "f", "id": 280072, "pid": 76337, "tid": -914061504, "ts": 1716454225847306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225847307, "dur": 0, "args": { "External id": 280073, "cbid": 203, "correlation": 280073 } }, { "ph": "f", "id": 280073, "pid": 76337, "tid": -914061504, "ts": 1716454225847307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225847308, "dur": 0, "args": { "External id": 280074, "cbid": 205, "correlation": 280074 } }, { "ph": "f", "id": 280074, "pid": 76337, "tid": -914061504, "ts": 1716454225847308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225906383, "dur": 17, "args": { "External id": 280078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280078, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280078, "pid": 5, "tid": 7, "ts": 1716454225906383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847331, "dur": 18, "args": { "External id": 280078, "cbid": 211, "correlation": 280078 } }, { "ph": "s", "id": 280078, "pid": 76337, "tid": -914061504, "ts": 1716454225847331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225906402, "dur": 230, "args": { "External id": 280080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280080, "pid": 5, "tid": 7, "ts": 1716454225906402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847355, "dur": 46, "args": { "External id": 280080, "cbid": 211, "correlation": 280080 } }, { "ph": "s", "id": 280080, "pid": 76337, "tid": -914061504, "ts": 1716454225847355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225906634, "dur": 1, "args": { "External id": 280082, "device": 5, "context": 1, "stream": 7, "correlation": 280082, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 280082, "pid": 5, "tid": 7, "ts": 1716454225906634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225847460, "dur": 32, "args": { "External id": 280082, "cbid": 51, "correlation": 280082 } }, { "ph": "s", "id": 280082, "pid": 76337, "tid": -914061504, "ts": 1716454225847460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225906638, "dur": 802, "args": { "External id": 280083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280083, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280083, "pid": 5, "tid": 7, "ts": 1716454225906638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847497, "dur": 21, "args": { "External id": 280083, "cbid": 211, "correlation": 280083 } }, { "ph": "s", "id": 280083, "pid": 76337, "tid": -914061504, "ts": 1716454225847497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225907442, "dur": 13, "args": { "External id": 280085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280085, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280085, "pid": 5, "tid": 7, "ts": 1716454225907442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847528, "dur": 24, "args": { "External id": 280085, "cbid": 211, "correlation": 280085 } }, { "ph": "s", "id": 280085, "pid": 76337, "tid": -914061504, "ts": 1716454225847528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225907455, "dur": 14, "args": { "External id": 280091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280091, "pid": 5, "tid": 7, "ts": 1716454225907455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847688, "dur": 42, "args": { "External id": 280091, "cbid": 211, "correlation": 280091 } }, { "ph": "s", "id": 280091, "pid": 76337, "tid": -914061504, "ts": 1716454225847688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225907471, "dur": 4, "args": { "External id": 280099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280099, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 280099, "pid": 5, "tid": 7, "ts": 1716454225907471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225847896, "dur": 42, "args": { "External id": 280099, "cbid": 211, "correlation": 280099 } }, { "ph": "s", "id": 280099, "pid": 76337, "tid": -914061504, "ts": 1716454225847896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225848342, "dur": 10, "args": { "External id": 280115, "cbid": 251, "correlation": 280115 } }, { "ph": "f", "id": 280115, "pid": 76337, "tid": -914061504, "ts": 1716454225848342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225848374, "dur": 6, "args": { "External id": 280117, "cbid": 251, "correlation": 280117 } }, { "ph": "f", "id": 280117, "pid": 76337, "tid": -914061504, "ts": 1716454225848374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225907476, "dur": 13, "args": { "External id": 280118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280118, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280118, "pid": 5, "tid": 7, "ts": 1716454225907476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848401, "dur": 88, "args": { "External id": 280118, "cbid": 211, "correlation": 280118 } }, { "ph": "s", "id": 280118, "pid": 76337, "tid": -914061504, "ts": 1716454225848401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225907490, "dur": 5, "args": { "External id": 280120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280120, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280120, "pid": 5, "tid": 7, "ts": 1716454225907490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848510, "dur": 31, "args": { "External id": 280120, "cbid": 211, "correlation": 280120 } }, { "ph": "s", "id": 280120, "pid": 76337, "tid": -914061504, "ts": 1716454225848510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225907496, "dur": 17, "args": { "External id": 280130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280130, "pid": 5, "tid": 7, "ts": 1716454225907496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848669, "dur": 19, "args": { "External id": 280130, "cbid": 211, "correlation": 280130 } }, { "ph": "s", "id": 280130, "pid": 76337, "tid": -914061504, "ts": 1716454225848669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225907515, "dur": 18, "args": { "External id": 280150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280150, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280150, "pid": 5, "tid": 7, "ts": 1716454225907515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848814, "dur": 33, "args": { "External id": 280150, "cbid": 211, "correlation": 280150 } }, { "ph": "s", "id": 280150, "pid": 76337, "tid": -914061504, "ts": 1716454225848814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225907533, "dur": 5, "args": { "External id": 280162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280162, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 280162, "pid": 5, "tid": 7, "ts": 1716454225907533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848876, "dur": 12, "args": { "External id": 280162, "cbid": 211, "correlation": 280162 } }, { "ph": "s", "id": 280162, "pid": 76337, "tid": -914061504, "ts": 1716454225848876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225907540, "dur": 16, "args": { "External id": 280165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280165, "pid": 5, "tid": 7, "ts": 1716454225907540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848902, "dur": 8, "args": { "External id": 280165, "cbid": 211, "correlation": 280165 } }, { "ph": "s", "id": 280165, "pid": 76337, "tid": -914061504, "ts": 1716454225848902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225907557, "dur": 11, "args": { "External id": 280174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280174, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280174, "pid": 5, "tid": 7, "ts": 1716454225907557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225848954, "dur": 15, "args": { "External id": 280174, "cbid": 211, "correlation": 280174 } }, { "ph": "s", "id": 280174, "pid": 76337, "tid": -914061504, "ts": 1716454225848954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225849059, "dur": 0, "args": { "External id": 280184, "cbid": 317, "correlation": 280184 } }, { "ph": "f", "id": 280184, "pid": 76337, "tid": -914061504, "ts": 1716454225849059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225849060, "dur": 0, "args": { "External id": 280185, "cbid": 203, "correlation": 280185 } }, { "ph": "f", "id": 280185, "pid": 76337, "tid": -914061504, "ts": 1716454225849060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225849061, "dur": 0, "args": { "External id": 280186, "cbid": 205, "correlation": 280186 } }, { "ph": "f", "id": 280186, "pid": 76337, "tid": -914061504, "ts": 1716454225849061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225907570, "dur": 11, "args": { "External id": 280190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280190, "pid": 5, "tid": 7, "ts": 1716454225907570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849080, "dur": 15, "args": { "External id": 280190, "cbid": 211, "correlation": 280190 } }, { "ph": "s", "id": 280190, "pid": 76337, "tid": -914061504, "ts": 1716454225849080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225907582, "dur": 155, "args": { "External id": 280192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280192, "pid": 5, "tid": 7, "ts": 1716454225907582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849098, "dur": 8, "args": { "External id": 280192, "cbid": 211, "correlation": 280192 } }, { "ph": "s", "id": 280192, "pid": 76337, "tid": -914061504, "ts": 1716454225849098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225907740, "dur": 1, "args": { "External id": 280194, "device": 5, "context": 1, "stream": 7, "correlation": 280194, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 280194, "pid": 5, "tid": 7, "ts": 1716454225907740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225849115, "dur": 11, "args": { "External id": 280194, "cbid": 51, "correlation": 280194 } }, { "ph": "s", "id": 280194, "pid": 76337, "tid": -914061504, "ts": 1716454225849115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225907743, "dur": 635, "args": { "External id": 280195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280195, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280195, "pid": 5, "tid": 7, "ts": 1716454225907743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849127, "dur": 10, "args": { "External id": 280195, "cbid": 211, "correlation": 280195 } }, { "ph": "s", "id": 280195, "pid": 76337, "tid": -914061504, "ts": 1716454225849127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225908379, "dur": 12, "args": { "External id": 280197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280197, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280197, "pid": 5, "tid": 7, "ts": 1716454225908379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849144, "dur": 8, "args": { "External id": 280197, "cbid": 211, "correlation": 280197 } }, { "ph": "s", "id": 280197, "pid": 76337, "tid": -914061504, "ts": 1716454225849144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225908393, "dur": 14, "args": { "External id": 280203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280203, "pid": 5, "tid": 7, "ts": 1716454225908393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849185, "dur": 16, "args": { "External id": 280203, "cbid": 211, "correlation": 280203 } }, { "ph": "s", "id": 280203, "pid": 76337, "tid": -914061504, "ts": 1716454225849185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225849270, "dur": 0, "args": { "External id": 280213, "cbid": 317, "correlation": 280213 } }, { "ph": "f", "id": 280213, "pid": 76337, "tid": -914061504, "ts": 1716454225849270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225849271, "dur": 0, "args": { "External id": 280214, "cbid": 203, "correlation": 280214 } }, { "ph": "f", "id": 280214, "pid": 76337, "tid": -914061504, "ts": 1716454225849271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225849272, "dur": 0, "args": { "External id": 280215, "cbid": 205, "correlation": 280215 } }, { "ph": "f", "id": 280215, "pid": 76337, "tid": -914061504, "ts": 1716454225849272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225908409, "dur": 17, "args": { "External id": 280219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280219, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280219, "pid": 5, "tid": 7, "ts": 1716454225908409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849288, "dur": 22, "args": { "External id": 280219, "cbid": 211, "correlation": 280219 } }, { "ph": "s", "id": 280219, "pid": 76337, "tid": -914061504, "ts": 1716454225849288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225908427, "dur": 4, "args": { "External id": 280221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280221, "pid": 5, "tid": 7, "ts": 1716454225908427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849317, "dur": 8, "args": { "External id": 280221, "cbid": 211, "correlation": 280221 } }, { "ph": "s", "id": 280221, "pid": 76337, "tid": -914061504, "ts": 1716454225849317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225849328, "dur": 0, "args": { "External id": 280222, "cbid": 51, "correlation": 280222 } }, { "ph": "s", "id": 280222, "pid": 76337, "tid": -914061504, "ts": 1716454225849328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225908432, "dur": 125, "args": { "External id": 280223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280223, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 280223, "pid": 5, "tid": 7, "ts": 1716454225908432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849329, "dur": 6, "args": { "External id": 280223, "cbid": 211, "correlation": 280223 } }, { "ph": "s", "id": 280223, "pid": 76337, "tid": -914061504, "ts": 1716454225849329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225908559, "dur": 15, "args": { "External id": 280228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280228, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280228, "pid": 5, "tid": 7, "ts": 1716454225908559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849365, "dur": 15, "args": { "External id": 280228, "cbid": 211, "correlation": 280228 } }, { "ph": "s", "id": 280228, "pid": 76337, "tid": -914061504, "ts": 1716454225849365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225908575, "dur": 13, "args": { "External id": 280236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280236, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280236, "pid": 5, "tid": 7, "ts": 1716454225908575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849411, "dur": 14, "args": { "External id": 280236, "cbid": 211, "correlation": 280236 } }, { "ph": "s", "id": 280236, "pid": 76337, "tid": -914061504, "ts": 1716454225849411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225908589, "dur": 10, "args": { "External id": 280244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280244, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280244, "pid": 5, "tid": 7, "ts": 1716454225908589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849459, "dur": 12, "args": { "External id": 280244, "cbid": 211, "correlation": 280244 } }, { "ph": "s", "id": 280244, "pid": 76337, "tid": -914061504, "ts": 1716454225849459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225908600, "dur": 17, "args": { "External id": 280264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280264, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280264, "pid": 5, "tid": 7, "ts": 1716454225908600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849561, "dur": 14, "args": { "External id": 280264, "cbid": 211, "correlation": 280264 } }, { "ph": "s", "id": 280264, "pid": 76337, "tid": -914061504, "ts": 1716454225849561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225908619, "dur": 4, "args": { "External id": 280276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280276, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 280276, "pid": 5, "tid": 7, "ts": 1716454225908619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849594, "dur": 18, "args": { "External id": 280276, "cbid": 211, "correlation": 280276 } }, { "ph": "s", "id": 280276, "pid": 76337, "tid": -914061504, "ts": 1716454225849594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225908624, "dur": 16, "args": { "External id": 280279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280279, "pid": 5, "tid": 7, "ts": 1716454225908624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849632, "dur": 16, "args": { "External id": 280279, "cbid": 211, "correlation": 280279 } }, { "ph": "s", "id": 280279, "pid": 76337, "tid": -914061504, "ts": 1716454225849632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225849730, "dur": 1, "args": { "External id": 280290, "cbid": 317, "correlation": 280290 } }, { "ph": "f", "id": 280290, "pid": 76337, "tid": -914061504, "ts": 1716454225849730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225849732, "dur": 0, "args": { "External id": 280291, "cbid": 203, "correlation": 280291 } }, { "ph": "f", "id": 280291, "pid": 76337, "tid": -914061504, "ts": 1716454225849732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225849733, "dur": 0, "args": { "External id": 280292, "cbid": 205, "correlation": 280292 } }, { "ph": "f", "id": 280292, "pid": 76337, "tid": -914061504, "ts": 1716454225849733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225908642, "dur": 11, "args": { "External id": 280296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280296, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280296, "pid": 5, "tid": 7, "ts": 1716454225908642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849750, "dur": 20, "args": { "External id": 280296, "cbid": 211, "correlation": 280296 } }, { "ph": "s", "id": 280296, "pid": 76337, "tid": -914061504, "ts": 1716454225849750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225908654, "dur": 3, "args": { "External id": 280298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280298, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280298, "pid": 5, "tid": 7, "ts": 1716454225908654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849776, "dur": 10, "args": { "External id": 280298, "cbid": 211, "correlation": 280298 } }, { "ph": "s", "id": 280298, "pid": 76337, "tid": -914061504, "ts": 1716454225849776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225849789, "dur": 0, "args": { "External id": 280299, "cbid": 51, "correlation": 280299 } }, { "ph": "s", "id": 280299, "pid": 76337, "tid": -914061504, "ts": 1716454225849789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225908658, "dur": 87, "args": { "External id": 280300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280300, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 280300, "pid": 5, "tid": 7, "ts": 1716454225908658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849790, "dur": 5, "args": { "External id": 280300, "cbid": 211, "correlation": 280300 } }, { "ph": "s", "id": 280300, "pid": 76337, "tid": -914061504, "ts": 1716454225849790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225908746, "dur": 15, "args": { "External id": 280305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280305, "pid": 5, "tid": 7, "ts": 1716454225908746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849822, "dur": 9, "args": { "External id": 280305, "cbid": 211, "correlation": 280305 } }, { "ph": "s", "id": 280305, "pid": 76337, "tid": -914061504, "ts": 1716454225849822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225908763, "dur": 79, "args": { "External id": 280314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280314, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280314, "pid": 5, "tid": 7, "ts": 1716454225908763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849920, "dur": 16, "args": { "External id": 280314, "cbid": 211, "correlation": 280314 } }, { "ph": "s", "id": 280314, "pid": 76337, "tid": -914061504, "ts": 1716454225849920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225908843, "dur": 29, "args": { "External id": 280336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280336, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280336, "pid": 5, "tid": 7, "ts": 1716454225908843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225849995, "dur": 11, "args": { "External id": 280336, "cbid": 211, "correlation": 280336 } }, { "ph": "s", "id": 280336, "pid": 76337, "tid": -914061504, "ts": 1716454225849995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225850121, "dur": 4, "args": { "External id": 280347, "cbid": 251, "correlation": 280347 } }, { "ph": "f", "id": 280347, "pid": 76337, "tid": -914061504, "ts": 1716454225850121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225908873, "dur": 155, "args": { "External id": 280348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280348, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280348, "pid": 5, "tid": 7, "ts": 1716454225908873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850134, "dur": 19, "args": { "External id": 280348, "cbid": 211, "correlation": 280348 } }, { "ph": "s", "id": 280348, "pid": 76337, "tid": -914061504, "ts": 1716454225850134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225850221, "dur": 1, "args": { "External id": 280359, "cbid": 251, "correlation": 280359 } }, { "ph": "f", "id": 280359, "pid": 76337, "tid": -914061504, "ts": 1716454225850221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225909029, "dur": 153, "args": { "External id": 280360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280360, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280360, "pid": 5, "tid": 7, "ts": 1716454225909029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850225, "dur": 13, "args": { "External id": 280360, "cbid": 211, "correlation": 280360 } }, { "ph": "s", "id": 280360, "pid": 76337, "tid": -914061504, "ts": 1716454225850225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225850292, "dur": 1, "args": { "External id": 280371, "cbid": 251, "correlation": 280371 } }, { "ph": "f", "id": 280371, "pid": 76337, "tid": -914061504, "ts": 1716454225850292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225909184, "dur": 155, "args": { "External id": 280372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280372, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280372, "pid": 5, "tid": 7, "ts": 1716454225909184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850296, "dur": 11, "args": { "External id": 280372, "cbid": 211, "correlation": 280372 } }, { "ph": "s", "id": 280372, "pid": 76337, "tid": -914061504, "ts": 1716454225850296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225909340, "dur": 326, "args": { "External id": 280397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280397, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280397, "pid": 5, "tid": 7, "ts": 1716454225909340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850406, "dur": 18, "args": { "External id": 280397, "cbid": 211, "correlation": 280397 } }, { "ph": "s", "id": 280397, "pid": 76337, "tid": -914061504, "ts": 1716454225850406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225850577, "dur": 2, "args": { "External id": 280415, "cbid": 251, "correlation": 280415 } }, { "ph": "f", "id": 280415, "pid": 76337, "tid": -914061504, "ts": 1716454225850577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225909667, "dur": 159, "args": { "External id": 280417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280417, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280417, "pid": 5, "tid": 7, "ts": 1716454225909667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850584, "dur": 16, "args": { "External id": 280417, "cbid": 211, "correlation": 280417 } }, { "ph": "s", "id": 280417, "pid": 76337, "tid": -914061504, "ts": 1716454225850584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225909828, "dur": 19, "args": { "External id": 280425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280425, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280425, "pid": 5, "tid": 7, "ts": 1716454225909828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850666, "dur": 12, "args": { "External id": 280425, "cbid": 211, "correlation": 280425 } }, { "ph": "s", "id": 280425, "pid": 76337, "tid": -914061504, "ts": 1716454225850666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225909848, "dur": 28, "args": { "External id": 280433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280433, "pid": 5, "tid": 7, "ts": 1716454225909848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850706, "dur": 8, "args": { "External id": 280433, "cbid": 211, "correlation": 280433 } }, { "ph": "s", "id": 280433, "pid": 76337, "tid": -914061504, "ts": 1716454225850706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225909877, "dur": 18, "args": { "External id": 280444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280444, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280444, "pid": 5, "tid": 7, "ts": 1716454225909877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850795, "dur": 14, "args": { "External id": 280444, "cbid": 211, "correlation": 280444 } }, { "ph": "s", "id": 280444, "pid": 76337, "tid": -914061504, "ts": 1716454225850795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225909897, "dur": 15, "args": { "External id": 280466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280466, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280466, "pid": 5, "tid": 7, "ts": 1716454225909897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850829, "dur": 8, "args": { "External id": 280466, "cbid": 211, "correlation": 280466 } }, { "ph": "s", "id": 280466, "pid": 76337, "tid": -914061504, "ts": 1716454225850829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225850917, "dur": 2, "args": { "External id": 280477, "cbid": 251, "correlation": 280477 } }, { "ph": "f", "id": 280477, "pid": 76337, "tid": -914061504, "ts": 1716454225850917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225909913, "dur": 85, "args": { "External id": 280478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280478, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280478, "pid": 5, "tid": 7, "ts": 1716454225909913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225850923, "dur": 14, "args": { "External id": 280478, "cbid": 211, "correlation": 280478 } }, { "ph": "s", "id": 280478, "pid": 76337, "tid": -914061504, "ts": 1716454225850923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851014, "dur": 1, "args": { "External id": 280489, "cbid": 251, "correlation": 280489 } }, { "ph": "f", "id": 280489, "pid": 76337, "tid": -914061504, "ts": 1716454225851014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851018, "dur": 0, "args": { "External id": 280490, "cbid": 251, "correlation": 280490 } }, { "ph": "f", "id": 280490, "pid": 76337, "tid": -914061504, "ts": 1716454225851018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225910000, "dur": 12, "args": { "External id": 280491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280491, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280491, "pid": 5, "tid": 7, "ts": 1716454225910000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851020, "dur": 14, "args": { "External id": 280491, "cbid": 211, "correlation": 280491 } }, { "ph": "s", "id": 280491, "pid": 76337, "tid": -914061504, "ts": 1716454225851020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225910013, "dur": 5, "args": { "External id": 280493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280493, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280493, "pid": 5, "tid": 7, "ts": 1716454225910013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851037, "dur": 8, "args": { "External id": 280493, "cbid": 211, "correlation": 280493 } }, { "ph": "s", "id": 280493, "pid": 76337, "tid": -914061504, "ts": 1716454225851037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851099, "dur": 1, "args": { "External id": 280504, "cbid": 251, "correlation": 280504 } }, { "ph": "f", "id": 280504, "pid": 76337, "tid": -914061504, "ts": 1716454225851099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851103, "dur": 0, "args": { "External id": 280505, "cbid": 251, "correlation": 280505 } }, { "ph": "f", "id": 280505, "pid": 76337, "tid": -914061504, "ts": 1716454225851103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225910020, "dur": 9, "args": { "External id": 280506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280506, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280506, "pid": 5, "tid": 7, "ts": 1716454225910020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851105, "dur": 12, "args": { "External id": 280506, "cbid": 211, "correlation": 280506 } }, { "ph": "s", "id": 280506, "pid": 76337, "tid": -914061504, "ts": 1716454225851105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225910030, "dur": 3, "args": { "External id": 280508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280508, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280508, "pid": 5, "tid": 7, "ts": 1716454225910030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851118, "dur": 5, "args": { "External id": 280508, "cbid": 211, "correlation": 280508 } }, { "ph": "s", "id": 280508, "pid": 76337, "tid": -914061504, "ts": 1716454225851118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225910035, "dur": 53, "args": { "External id": 280533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280533, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280533, "pid": 5, "tid": 7, "ts": 1716454225910035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851200, "dur": 13, "args": { "External id": 280533, "cbid": 211, "correlation": 280533 } }, { "ph": "s", "id": 280533, "pid": 76337, "tid": -914061504, "ts": 1716454225851200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851306, "dur": 2, "args": { "External id": 280551, "cbid": 251, "correlation": 280551 } }, { "ph": "f", "id": 280551, "pid": 76337, "tid": -914061504, "ts": 1716454225851306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225910089, "dur": 87, "args": { "External id": 280553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280553, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280553, "pid": 5, "tid": 7, "ts": 1716454225910089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851312, "dur": 15, "args": { "External id": 280553, "cbid": 211, "correlation": 280553 } }, { "ph": "s", "id": 280553, "pid": 76337, "tid": -914061504, "ts": 1716454225851312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225910178, "dur": 10, "args": { "External id": 280561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280561, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280561, "pid": 5, "tid": 7, "ts": 1716454225910178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851385, "dur": 12, "args": { "External id": 280561, "cbid": 211, "correlation": 280561 } }, { "ph": "s", "id": 280561, "pid": 76337, "tid": -914061504, "ts": 1716454225851385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225910188, "dur": 21, "args": { "External id": 280569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280569, "pid": 5, "tid": 7, "ts": 1716454225910188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851428, "dur": 10, "args": { "External id": 280569, "cbid": 211, "correlation": 280569 } }, { "ph": "s", "id": 280569, "pid": 76337, "tid": -914061504, "ts": 1716454225851428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225910211, "dur": 18, "args": { "External id": 280591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280591, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280591, "pid": 5, "tid": 7, "ts": 1716454225910211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851482, "dur": 10, "args": { "External id": 280591, "cbid": 211, "correlation": 280591 } }, { "ph": "s", "id": 280591, "pid": 76337, "tid": -914061504, "ts": 1716454225851482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851578, "dur": 1, "args": { "External id": 280607, "cbid": 251, "correlation": 280607 } }, { "ph": "f", "id": 280607, "pid": 76337, "tid": -914061504, "ts": 1716454225851578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851583, "dur": 0, "args": { "External id": 280609, "cbid": 251, "correlation": 280609 } }, { "ph": "f", "id": 280609, "pid": 76337, "tid": -914061504, "ts": 1716454225851583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225910230, "dur": 486, "args": { "External id": 280610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280610, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280610, "pid": 5, "tid": 7, "ts": 1716454225910230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851585, "dur": 14, "args": { "External id": 280610, "cbid": 211, "correlation": 280610 } }, { "ph": "s", "id": 280610, "pid": 76337, "tid": -914061504, "ts": 1716454225851585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225910717, "dur": 64, "args": { "External id": 280618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280618, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280618, "pid": 5, "tid": 7, "ts": 1716454225910717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851675, "dur": 14, "args": { "External id": 280618, "cbid": 211, "correlation": 280618 } }, { "ph": "s", "id": 280618, "pid": 76337, "tid": -914061504, "ts": 1716454225851675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225910783, "dur": 64, "args": { "External id": 280626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280626, "pid": 5, "tid": 7, "ts": 1716454225910783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851718, "dur": 11, "args": { "External id": 280626, "cbid": 211, "correlation": 280626 } }, { "ph": "s", "id": 280626, "pid": 76337, "tid": -914061504, "ts": 1716454225851718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225851801, "dur": 1, "args": { "External id": 280642, "cbid": 251, "correlation": 280642 } }, { "ph": "f", "id": 280642, "pid": 76337, "tid": -914061504, "ts": 1716454225851801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454225910850, "dur": 1, "args": { "External id": 280644, "device": 5, "context": 1, "stream": 7, "correlation": 280644, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 280644, "pid": 5, "tid": 7, "ts": 1716454225910850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225851806, "dur": 17, "args": { "External id": 280644, "cbid": 51, "correlation": 280644 } }, { "ph": "s", "id": 280644, "pid": 76337, "tid": -914061504, "ts": 1716454225851806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225910853, "dur": 263, "args": { "External id": 280645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280645, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280645, "pid": 5, "tid": 7, "ts": 1716454225910853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851825, "dur": 11, "args": { "External id": 280645, "cbid": 211, "correlation": 280645 } }, { "ph": "s", "id": 280645, "pid": 76337, "tid": -914061504, "ts": 1716454225851825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225911117, "dur": 14, "args": { "External id": 280653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280653, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280653, "pid": 5, "tid": 7, "ts": 1716454225911117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851868, "dur": 10, "args": { "External id": 280653, "cbid": 211, "correlation": 280653 } }, { "ph": "s", "id": 280653, "pid": 76337, "tid": -914061504, "ts": 1716454225851868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225911132, "dur": 36, "args": { "External id": 280664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280664, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280664, "pid": 5, "tid": 7, "ts": 1716454225911132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225851947, "dur": 13, "args": { "External id": 280664, "cbid": 211, "correlation": 280664 } }, { "ph": "s", "id": 280664, "pid": 76337, "tid": -914061504, "ts": 1716454225851947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225852038, "dur": 0, "args": { "External id": 280676, "cbid": 317, "correlation": 280676 } }, { "ph": "f", "id": 280676, "pid": 76337, "tid": -914061504, "ts": 1716454225852038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225852039, "dur": 1, "args": { "External id": 280677, "cbid": 203, "correlation": 280677 } }, { "ph": "f", "id": 280677, "pid": 76337, "tid": -914061504, "ts": 1716454225852039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225852040, "dur": 0, "args": { "External id": 280678, "cbid": 205, "correlation": 280678 } }, { "ph": "f", "id": 280678, "pid": 76337, "tid": -914061504, "ts": 1716454225852040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225911170, "dur": 12, "args": { "External id": 280682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280682, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280682, "pid": 5, "tid": 7, "ts": 1716454225911170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852066, "dur": 14, "args": { "External id": 280682, "cbid": 211, "correlation": 280682 } }, { "ph": "s", "id": 280682, "pid": 76337, "tid": -914061504, "ts": 1716454225852066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225911184, "dur": 4, "args": { "External id": 280684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 280684, "pid": 5, "tid": 7, "ts": 1716454225911184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852088, "dur": 7, "args": { "External id": 280684, "cbid": 211, "correlation": 280684 } }, { "ph": "s", "id": 280684, "pid": 76337, "tid": -914061504, "ts": 1716454225852088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225852099, "dur": 0, "args": { "External id": 280685, "cbid": 51, "correlation": 280685 } }, { "ph": "s", "id": 280685, "pid": 76337, "tid": -914061504, "ts": 1716454225852099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225911188, "dur": 93, "args": { "External id": 280686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280686, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 280686, "pid": 5, "tid": 7, "ts": 1716454225911188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852100, "dur": 7, "args": { "External id": 280686, "cbid": 211, "correlation": 280686 } }, { "ph": "s", "id": 280686, "pid": 76337, "tid": -914061504, "ts": 1716454225852100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225911283, "dur": 16, "args": { "External id": 280691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280691, "pid": 5, "tid": 7, "ts": 1716454225911283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852131, "dur": 9, "args": { "External id": 280691, "cbid": 211, "correlation": 280691 } }, { "ph": "s", "id": 280691, "pid": 76337, "tid": -914061504, "ts": 1716454225852131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225911300, "dur": 11, "args": { "External id": 280699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280699, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280699, "pid": 5, "tid": 7, "ts": 1716454225911300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852164, "dur": 8, "args": { "External id": 280699, "cbid": 211, "correlation": 280699 } }, { "ph": "s", "id": 280699, "pid": 76337, "tid": -914061504, "ts": 1716454225852164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225911312, "dur": 54, "args": { "External id": 280710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280710, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280710, "pid": 5, "tid": 7, "ts": 1716454225911312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852247, "dur": 14, "args": { "External id": 280710, "cbid": 211, "correlation": 280710 } }, { "ph": "s", "id": 280710, "pid": 76337, "tid": -914061504, "ts": 1716454225852247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225852307, "dur": 0, "args": { "External id": 280720, "cbid": 317, "correlation": 280720 } }, { "ph": "f", "id": 280720, "pid": 76337, "tid": -914061504, "ts": 1716454225852307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225852308, "dur": 0, "args": { "External id": 280721, "cbid": 203, "correlation": 280721 } }, { "ph": "f", "id": 280721, "pid": 76337, "tid": -914061504, "ts": 1716454225852308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225852309, "dur": 0, "args": { "External id": 280722, "cbid": 205, "correlation": 280722 } }, { "ph": "f", "id": 280722, "pid": 76337, "tid": -914061504, "ts": 1716454225852309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225911367, "dur": 39, "args": { "External id": 280726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280726, "pid": 5, "tid": 7, "ts": 1716454225911367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852328, "dur": 11, "args": { "External id": 280726, "cbid": 211, "correlation": 280726 } }, { "ph": "s", "id": 280726, "pid": 76337, "tid": -914061504, "ts": 1716454225852328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225911408, "dur": 155, "args": { "External id": 280728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280728, "pid": 5, "tid": 7, "ts": 1716454225911408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852342, "dur": 5, "args": { "External id": 280728, "cbid": 211, "correlation": 280728 } }, { "ph": "s", "id": 280728, "pid": 76337, "tid": -914061504, "ts": 1716454225852342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225911564, "dur": 1944, "args": { "External id": 280730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280730, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280730, "pid": 5, "tid": 7, "ts": 1716454225911564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852358, "dur": 10, "args": { "External id": 280730, "cbid": 211, "correlation": 280730 } }, { "ph": "s", "id": 280730, "pid": 76337, "tid": -914061504, "ts": 1716454225852358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225913510, "dur": 39, "args": { "External id": 280732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280732, "pid": 5, "tid": 7, "ts": 1716454225913510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852375, "dur": 6, "args": { "External id": 280732, "cbid": 211, "correlation": 280732 } }, { "ph": "s", "id": 280732, "pid": 76337, "tid": -914061504, "ts": 1716454225852375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225913550, "dur": 58, "args": { "External id": 280738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280738, "pid": 5, "tid": 7, "ts": 1716454225913550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852414, "dur": 9, "args": { "External id": 280738, "cbid": 211, "correlation": 280738 } }, { "ph": "s", "id": 280738, "pid": 76337, "tid": -914061504, "ts": 1716454225852414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225913609, "dur": 83, "args": { "External id": 280747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280747, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280747, "pid": 5, "tid": 7, "ts": 1716454225913609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852547, "dur": 20, "args": { "External id": 280747, "cbid": 211, "correlation": 280747 } }, { "ph": "s", "id": 280747, "pid": 76337, "tid": -914061504, "ts": 1716454225852547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225913694, "dur": 71, "args": { "External id": 280767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280767, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280767, "pid": 5, "tid": 7, "ts": 1716454225913694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852647, "dur": 14, "args": { "External id": 280767, "cbid": 211, "correlation": 280767 } }, { "ph": "s", "id": 280767, "pid": 76337, "tid": -914061504, "ts": 1716454225852647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225913767, "dur": 5, "args": { "External id": 280779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280779, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 280779, "pid": 5, "tid": 7, "ts": 1716454225913767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852672, "dur": 7, "args": { "External id": 280779, "cbid": 211, "correlation": 280779 } }, { "ph": "s", "id": 280779, "pid": 76337, "tid": -914061504, "ts": 1716454225852672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225913772, "dur": 80, "args": { "External id": 280782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280782, "pid": 5, "tid": 7, "ts": 1716454225913772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852695, "dur": 9, "args": { "External id": 280782, "cbid": 211, "correlation": 280782 } }, { "ph": "s", "id": 280782, "pid": 76337, "tid": -914061504, "ts": 1716454225852695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225913853, "dur": 52, "args": { "External id": 280791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280791, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280791, "pid": 5, "tid": 7, "ts": 1716454225913853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852746, "dur": 13, "args": { "External id": 280791, "cbid": 211, "correlation": 280791 } }, { "ph": "s", "id": 280791, "pid": 76337, "tid": -914061504, "ts": 1716454225852746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225852804, "dur": 0, "args": { "External id": 280801, "cbid": 317, "correlation": 280801 } }, { "ph": "f", "id": 280801, "pid": 76337, "tid": -914061504, "ts": 1716454225852804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225852804, "dur": 0, "args": { "External id": 280802, "cbid": 203, "correlation": 280802 } }, { "ph": "f", "id": 280802, "pid": 76337, "tid": -914061504, "ts": 1716454225852804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225852805, "dur": 0, "args": { "External id": 280803, "cbid": 205, "correlation": 280803 } }, { "ph": "f", "id": 280803, "pid": 76337, "tid": -914061504, "ts": 1716454225852805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225913907, "dur": 58, "args": { "External id": 280807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280807, "pid": 5, "tid": 7, "ts": 1716454225913907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852823, "dur": 12, "args": { "External id": 280807, "cbid": 211, "correlation": 280807 } }, { "ph": "s", "id": 280807, "pid": 76337, "tid": -914061504, "ts": 1716454225852823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225913967, "dur": 116, "args": { "External id": 280809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280809, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280809, "pid": 5, "tid": 7, "ts": 1716454225913967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852837, "dur": 5, "args": { "External id": 280809, "cbid": 211, "correlation": 280809 } }, { "ph": "s", "id": 280809, "pid": 76337, "tid": -914061504, "ts": 1716454225852837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225914084, "dur": 1841, "args": { "External id": 280811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280811, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280811, "pid": 5, "tid": 7, "ts": 1716454225914084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852848, "dur": 7, "args": { "External id": 280811, "cbid": 211, "correlation": 280811 } }, { "ph": "s", "id": 280811, "pid": 76337, "tid": -914061504, "ts": 1716454225852848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225915926, "dur": 20, "args": { "External id": 280813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280813, "pid": 5, "tid": 7, "ts": 1716454225915926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852859, "dur": 5, "args": { "External id": 280813, "cbid": 211, "correlation": 280813 } }, { "ph": "s", "id": 280813, "pid": 76337, "tid": -914061504, "ts": 1716454225852859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225915947, "dur": 32, "args": { "External id": 280819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280819, "pid": 5, "tid": 7, "ts": 1716454225915947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852887, "dur": 8, "args": { "External id": 280819, "cbid": 211, "correlation": 280819 } }, { "ph": "s", "id": 280819, "pid": 76337, "tid": -914061504, "ts": 1716454225852887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225915980, "dur": 4, "args": { "External id": 280827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280827, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 280827, "pid": 5, "tid": 7, "ts": 1716454225915980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225852933, "dur": 9, "args": { "External id": 280827, "cbid": 211, "correlation": 280827 } }, { "ph": "s", "id": 280827, "pid": 76337, "tid": -914061504, "ts": 1716454225852933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225853012, "dur": 1, "args": { "External id": 280843, "cbid": 251, "correlation": 280843 } }, { "ph": "f", "id": 280843, "pid": 76337, "tid": -914061504, "ts": 1716454225853012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225853018, "dur": 0, "args": { "External id": 280845, "cbid": 251, "correlation": 280845 } }, { "ph": "f", "id": 280845, "pid": 76337, "tid": -914061504, "ts": 1716454225853018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225915986, "dur": 12, "args": { "External id": 280846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280846, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 280846, "pid": 5, "tid": 7, "ts": 1716454225915986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853019, "dur": 12, "args": { "External id": 280846, "cbid": 211, "correlation": 280846 } }, { "ph": "s", "id": 280846, "pid": 76337, "tid": -914061504, "ts": 1716454225853019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225915999, "dur": 5, "args": { "External id": 280848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280848, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 280848, "pid": 5, "tid": 7, "ts": 1716454225915999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853035, "dur": 7, "args": { "External id": 280848, "cbid": 211, "correlation": 280848 } }, { "ph": "s", "id": 280848, "pid": 76337, "tid": -914061504, "ts": 1716454225853035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225916005, "dur": 29, "args": { "External id": 280858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280858, "pid": 5, "tid": 7, "ts": 1716454225916005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853102, "dur": 12, "args": { "External id": 280858, "cbid": 211, "correlation": 280858 } }, { "ph": "s", "id": 280858, "pid": 76337, "tid": -914061504, "ts": 1716454225853102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225916036, "dur": 29, "args": { "External id": 280878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280878, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280878, "pid": 5, "tid": 7, "ts": 1716454225916036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853167, "dur": 11, "args": { "External id": 280878, "cbid": 211, "correlation": 280878 } }, { "ph": "s", "id": 280878, "pid": 76337, "tid": -914061504, "ts": 1716454225853167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225916066, "dur": 4, "args": { "External id": 280890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280890, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 280890, "pid": 5, "tid": 7, "ts": 1716454225916066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853188, "dur": 6, "args": { "External id": 280890, "cbid": 211, "correlation": 280890 } }, { "ph": "s", "id": 280890, "pid": 76337, "tid": -914061504, "ts": 1716454225853188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225916072, "dur": 29, "args": { "External id": 280893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280893, "pid": 5, "tid": 7, "ts": 1716454225916072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853206, "dur": 6, "args": { "External id": 280893, "cbid": 211, "correlation": 280893 } }, { "ph": "s", "id": 280893, "pid": 76337, "tid": -914061504, "ts": 1716454225853206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225916102, "dur": 20, "args": { "External id": 280902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280902, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280902, "pid": 5, "tid": 7, "ts": 1716454225916102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853246, "dur": 11, "args": { "External id": 280902, "cbid": 211, "correlation": 280902 } }, { "ph": "s", "id": 280902, "pid": 76337, "tid": -914061504, "ts": 1716454225853246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225853311, "dur": 0, "args": { "External id": 280912, "cbid": 317, "correlation": 280912 } }, { "ph": "f", "id": 280912, "pid": 76337, "tid": -914061504, "ts": 1716454225853311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225853312, "dur": 0, "args": { "External id": 280913, "cbid": 203, "correlation": 280913 } }, { "ph": "f", "id": 280913, "pid": 76337, "tid": -914061504, "ts": 1716454225853312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225853313, "dur": 0, "args": { "External id": 280914, "cbid": 205, "correlation": 280914 } }, { "ph": "f", "id": 280914, "pid": 76337, "tid": -914061504, "ts": 1716454225853313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225916124, "dur": 22, "args": { "External id": 280918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280918, "pid": 5, "tid": 7, "ts": 1716454225916124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853329, "dur": 12, "args": { "External id": 280918, "cbid": 211, "correlation": 280918 } }, { "ph": "s", "id": 280918, "pid": 76337, "tid": -914061504, "ts": 1716454225853329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225916147, "dur": 43, "args": { "External id": 280920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280920, "pid": 5, "tid": 7, "ts": 1716454225916147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853343, "dur": 5, "args": { "External id": 280920, "cbid": 211, "correlation": 280920 } }, { "ph": "s", "id": 280920, "pid": 76337, "tid": -914061504, "ts": 1716454225853343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225916192, "dur": 625, "args": { "External id": 280922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280922, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280922, "pid": 5, "tid": 7, "ts": 1716454225916192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853356, "dur": 6, "args": { "External id": 280922, "cbid": 211, "correlation": 280922 } }, { "ph": "s", "id": 280922, "pid": 76337, "tid": -914061504, "ts": 1716454225853356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225916818, "dur": 22, "args": { "External id": 280924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280924, "pid": 5, "tid": 7, "ts": 1716454225916818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853366, "dur": 5, "args": { "External id": 280924, "cbid": 211, "correlation": 280924 } }, { "ph": "s", "id": 280924, "pid": 76337, "tid": -914061504, "ts": 1716454225853366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225916841, "dur": 32, "args": { "External id": 280930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280930, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280930, "pid": 5, "tid": 7, "ts": 1716454225916841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853394, "dur": 9, "args": { "External id": 280930, "cbid": 211, "correlation": 280930 } }, { "ph": "s", "id": 280930, "pid": 76337, "tid": -914061504, "ts": 1716454225853394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225853453, "dur": 0, "args": { "External id": 280940, "cbid": 317, "correlation": 280940 } }, { "ph": "f", "id": 280940, "pid": 76337, "tid": -914061504, "ts": 1716454225853453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225853453, "dur": 0, "args": { "External id": 280941, "cbid": 203, "correlation": 280941 } }, { "ph": "f", "id": 280941, "pid": 76337, "tid": -914061504, "ts": 1716454225853453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225853454, "dur": 0, "args": { "External id": 280942, "cbid": 205, "correlation": 280942 } }, { "ph": "f", "id": 280942, "pid": 76337, "tid": -914061504, "ts": 1716454225853454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225916874, "dur": 55, "args": { "External id": 280946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280946, "pid": 5, "tid": 7, "ts": 1716454225916874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853468, "dur": 11, "args": { "External id": 280946, "cbid": 211, "correlation": 280946 } }, { "ph": "s", "id": 280946, "pid": 76337, "tid": -914061504, "ts": 1716454225853468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225916931, "dur": 257, "args": { "External id": 280948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280948, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 280948, "pid": 5, "tid": 7, "ts": 1716454225916931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853486, "dur": 8, "args": { "External id": 280948, "cbid": 211, "correlation": 280948 } }, { "ph": "s", "id": 280948, "pid": 76337, "tid": -914061504, "ts": 1716454225853486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225917189, "dur": 20, "args": { "External id": 280950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280950, "pid": 5, "tid": 7, "ts": 1716454225917189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853498, "dur": 5, "args": { "External id": 280950, "cbid": 211, "correlation": 280950 } }, { "ph": "s", "id": 280950, "pid": 76337, "tid": -914061504, "ts": 1716454225853498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225917210, "dur": 31, "args": { "External id": 280956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280956, "pid": 5, "tid": 7, "ts": 1716454225917210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853524, "dur": 8, "args": { "External id": 280956, "cbid": 211, "correlation": 280956 } }, { "ph": "s", "id": 280956, "pid": 76337, "tid": -914061504, "ts": 1716454225853524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225917243, "dur": 27, "args": { "External id": 280964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280964, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280964, "pid": 5, "tid": 7, "ts": 1716454225917243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853553, "dur": 8, "args": { "External id": 280964, "cbid": 211, "correlation": 280964 } }, { "ph": "s", "id": 280964, "pid": 76337, "tid": -914061504, "ts": 1716454225853553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225917271, "dur": 19, "args": { "External id": 280972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280972, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 280972, "pid": 5, "tid": 7, "ts": 1716454225917271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853583, "dur": 9, "args": { "External id": 280972, "cbid": 211, "correlation": 280972 } }, { "ph": "s", "id": 280972, "pid": 76337, "tid": -914061504, "ts": 1716454225853583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225917292, "dur": 29, "args": { "External id": 280992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 280992, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 280992, "pid": 5, "tid": 7, "ts": 1716454225917292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853668, "dur": 12, "args": { "External id": 280992, "cbid": 211, "correlation": 280992 } }, { "ph": "s", "id": 280992, "pid": 76337, "tid": -914061504, "ts": 1716454225853668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225917323, "dur": 4, "args": { "External id": 281004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281004, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 281004, "pid": 5, "tid": 7, "ts": 1716454225917323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853689, "dur": 6, "args": { "External id": 281004, "cbid": 211, "correlation": 281004 } }, { "ph": "s", "id": 281004, "pid": 76337, "tid": -914061504, "ts": 1716454225853689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225917328, "dur": 31, "args": { "External id": 281007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281007, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281007, "pid": 5, "tid": 7, "ts": 1716454225917328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853708, "dur": 7, "args": { "External id": 281007, "cbid": 211, "correlation": 281007 } }, { "ph": "s", "id": 281007, "pid": 76337, "tid": -914061504, "ts": 1716454225853708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225853765, "dur": 0, "args": { "External id": 281018, "cbid": 317, "correlation": 281018 } }, { "ph": "f", "id": 281018, "pid": 76337, "tid": -914061504, "ts": 1716454225853765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225853766, "dur": 0, "args": { "External id": 281019, "cbid": 203, "correlation": 281019 } }, { "ph": "f", "id": 281019, "pid": 76337, "tid": -914061504, "ts": 1716454225853766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225853767, "dur": 0, "args": { "External id": 281020, "cbid": 205, "correlation": 281020 } }, { "ph": "f", "id": 281020, "pid": 76337, "tid": -914061504, "ts": 1716454225853767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225917360, "dur": 22, "args": { "External id": 281024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281024, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281024, "pid": 5, "tid": 7, "ts": 1716454225917360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853781, "dur": 12, "args": { "External id": 281024, "cbid": 211, "correlation": 281024 } }, { "ph": "s", "id": 281024, "pid": 76337, "tid": -914061504, "ts": 1716454225853781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225917384, "dur": 100, "args": { "External id": 281026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281026, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281026, "pid": 5, "tid": 7, "ts": 1716454225917384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853799, "dur": 6, "args": { "External id": 281026, "cbid": 211, "correlation": 281026 } }, { "ph": "s", "id": 281026, "pid": 76337, "tid": -914061504, "ts": 1716454225853799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225917485, "dur": 20, "args": { "External id": 281028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281028, "pid": 5, "tid": 7, "ts": 1716454225917485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853810, "dur": 5, "args": { "External id": 281028, "cbid": 211, "correlation": 281028 } }, { "ph": "s", "id": 281028, "pid": 76337, "tid": -914061504, "ts": 1716454225853810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225917507, "dur": 31, "args": { "External id": 281034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281034, "pid": 5, "tid": 7, "ts": 1716454225917507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853841, "dur": 9, "args": { "External id": 281034, "cbid": 211, "correlation": 281034 } }, { "ph": "s", "id": 281034, "pid": 76337, "tid": -914061504, "ts": 1716454225853841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225917539, "dur": 179, "args": { "External id": 281043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281043, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281043, "pid": 5, "tid": 7, "ts": 1716454225917539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225853928, "dur": 16, "args": { "External id": 281043, "cbid": 211, "correlation": 281043 } }, { "ph": "s", "id": 281043, "pid": 76337, "tid": -914061504, "ts": 1716454225853928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225917719, "dur": 62, "args": { "External id": 281065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281065, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281065, "pid": 5, "tid": 7, "ts": 1716454225917719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854002, "dur": 12, "args": { "External id": 281065, "cbid": 211, "correlation": 281065 } }, { "ph": "s", "id": 281065, "pid": 76337, "tid": -914061504, "ts": 1716454225854002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854100, "dur": 1, "args": { "External id": 281076, "cbid": 251, "correlation": 281076 } }, { "ph": "f", "id": 281076, "pid": 76337, "tid": -914061504, "ts": 1716454225854100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225917782, "dur": 147, "args": { "External id": 281077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281077, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281077, "pid": 5, "tid": 7, "ts": 1716454225917782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854106, "dur": 14, "args": { "External id": 281077, "cbid": 211, "correlation": 281077 } }, { "ph": "s", "id": 281077, "pid": 76337, "tid": -914061504, "ts": 1716454225854106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854177, "dur": 1, "args": { "External id": 281088, "cbid": 251, "correlation": 281088 } }, { "ph": "f", "id": 281088, "pid": 76337, "tid": -914061504, "ts": 1716454225854177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225917931, "dur": 142, "args": { "External id": 281089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281089, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281089, "pid": 5, "tid": 7, "ts": 1716454225917931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854181, "dur": 11, "args": { "External id": 281089, "cbid": 211, "correlation": 281089 } }, { "ph": "s", "id": 281089, "pid": 76337, "tid": -914061504, "ts": 1716454225854181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854247, "dur": 1, "args": { "External id": 281100, "cbid": 251, "correlation": 281100 } }, { "ph": "f", "id": 281100, "pid": 76337, "tid": -914061504, "ts": 1716454225854247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225918074, "dur": 139, "args": { "External id": 281101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281101, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281101, "pid": 5, "tid": 7, "ts": 1716454225918074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854251, "dur": 11, "args": { "External id": 281101, "cbid": 211, "correlation": 281101 } }, { "ph": "s", "id": 281101, "pid": 76337, "tid": -914061504, "ts": 1716454225854251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225918214, "dur": 1853, "args": { "External id": 281122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281122, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 281122, "pid": 5, "tid": 7, "ts": 1716454225918214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854339, "dur": 14, "args": { "External id": 281122, "cbid": 211, "correlation": 281122 } }, { "ph": "s", "id": 281122, "pid": 76337, "tid": -914061504, "ts": 1716454225854339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854447, "dur": 2, "args": { "External id": 281140, "cbid": 251, "correlation": 281140 } }, { "ph": "f", "id": 281140, "pid": 76337, "tid": -914061504, "ts": 1716454225854447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225920068, "dur": 142, "args": { "External id": 281142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281142, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 281142, "pid": 5, "tid": 7, "ts": 1716454225920068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854454, "dur": 14, "args": { "External id": 281142, "cbid": 211, "correlation": 281142 } }, { "ph": "s", "id": 281142, "pid": 76337, "tid": -914061504, "ts": 1716454225854454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225920212, "dur": 35, "args": { "External id": 281150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281150, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281150, "pid": 5, "tid": 7, "ts": 1716454225920212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854525, "dur": 13, "args": { "External id": 281150, "cbid": 211, "correlation": 281150 } }, { "ph": "s", "id": 281150, "pid": 76337, "tid": -914061504, "ts": 1716454225854525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225920249, "dur": 50, "args": { "External id": 281158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281158, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281158, "pid": 5, "tid": 7, "ts": 1716454225920249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854565, "dur": 9, "args": { "External id": 281158, "cbid": 211, "correlation": 281158 } }, { "ph": "s", "id": 281158, "pid": 76337, "tid": -914061504, "ts": 1716454225854565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225920300, "dur": 29, "args": { "External id": 281169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281169, "pid": 5, "tid": 7, "ts": 1716454225920300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854639, "dur": 13, "args": { "External id": 281169, "cbid": 211, "correlation": 281169 } }, { "ph": "s", "id": 281169, "pid": 76337, "tid": -914061504, "ts": 1716454225854639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225920331, "dur": 33, "args": { "External id": 281191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281191, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281191, "pid": 5, "tid": 7, "ts": 1716454225920331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854671, "dur": 7, "args": { "External id": 281191, "cbid": 211, "correlation": 281191 } }, { "ph": "s", "id": 281191, "pid": 76337, "tid": -914061504, "ts": 1716454225854671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854755, "dur": 1, "args": { "External id": 281202, "cbid": 251, "correlation": 281202 } }, { "ph": "f", "id": 281202, "pid": 76337, "tid": -914061504, "ts": 1716454225854755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225920365, "dur": 86, "args": { "External id": 281203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281203, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281203, "pid": 5, "tid": 7, "ts": 1716454225920365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854761, "dur": 14, "args": { "External id": 281203, "cbid": 211, "correlation": 281203 } }, { "ph": "s", "id": 281203, "pid": 76337, "tid": -914061504, "ts": 1716454225854761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854833, "dur": 1, "args": { "External id": 281214, "cbid": 251, "correlation": 281214 } }, { "ph": "f", "id": 281214, "pid": 76337, "tid": -914061504, "ts": 1716454225854833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854837, "dur": 0, "args": { "External id": 281215, "cbid": 251, "correlation": 281215 } }, { "ph": "f", "id": 281215, "pid": 76337, "tid": -914061504, "ts": 1716454225854837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225920452, "dur": 10, "args": { "External id": 281216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281216, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 281216, "pid": 5, "tid": 7, "ts": 1716454225920452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854838, "dur": 12, "args": { "External id": 281216, "cbid": 211, "correlation": 281216 } }, { "ph": "s", "id": 281216, "pid": 76337, "tid": -914061504, "ts": 1716454225854838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225920464, "dur": 5, "args": { "External id": 281218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281218, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281218, "pid": 5, "tid": 7, "ts": 1716454225920464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854852, "dur": 6, "args": { "External id": 281218, "cbid": 211, "correlation": 281218 } }, { "ph": "s", "id": 281218, "pid": 76337, "tid": -914061504, "ts": 1716454225854852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854910, "dur": 1, "args": { "External id": 281229, "cbid": 251, "correlation": 281229 } }, { "ph": "f", "id": 281229, "pid": 76337, "tid": -914061504, "ts": 1716454225854910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225854913, "dur": 0, "args": { "External id": 281230, "cbid": 251, "correlation": 281230 } }, { "ph": "f", "id": 281230, "pid": 76337, "tid": -914061504, "ts": 1716454225854913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225920470, "dur": 7, "args": { "External id": 281231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281231, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 281231, "pid": 5, "tid": 7, "ts": 1716454225920470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854914, "dur": 13, "args": { "External id": 281231, "cbid": 211, "correlation": 281231 } }, { "ph": "s", "id": 281231, "pid": 76337, "tid": -914061504, "ts": 1716454225854914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225920478, "dur": 3, "args": { "External id": 281233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281233, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281233, "pid": 5, "tid": 7, "ts": 1716454225920478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225854928, "dur": 5, "args": { "External id": 281233, "cbid": 211, "correlation": 281233 } }, { "ph": "s", "id": 281233, "pid": 76337, "tid": -914061504, "ts": 1716454225854928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225920483, "dur": 88, "args": { "External id": 281254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281254, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 281254, "pid": 5, "tid": 7, "ts": 1716454225920483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855009, "dur": 13, "args": { "External id": 281254, "cbid": 211, "correlation": 281254 } }, { "ph": "s", "id": 281254, "pid": 76337, "tid": -914061504, "ts": 1716454225855009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225855107, "dur": 2, "args": { "External id": 281272, "cbid": 251, "correlation": 281272 } }, { "ph": "f", "id": 281272, "pid": 76337, "tid": -914061504, "ts": 1716454225855107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225920572, "dur": 94, "args": { "External id": 281274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281274, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281274, "pid": 5, "tid": 7, "ts": 1716454225920572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855113, "dur": 14, "args": { "External id": 281274, "cbid": 211, "correlation": 281274 } }, { "ph": "s", "id": 281274, "pid": 76337, "tid": -914061504, "ts": 1716454225855113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225920668, "dur": 19, "args": { "External id": 281282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281282, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281282, "pid": 5, "tid": 7, "ts": 1716454225920668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855183, "dur": 12, "args": { "External id": 281282, "cbid": 211, "correlation": 281282 } }, { "ph": "s", "id": 281282, "pid": 76337, "tid": -914061504, "ts": 1716454225855183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225920688, "dur": 37, "args": { "External id": 281290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281290, "pid": 5, "tid": 7, "ts": 1716454225920688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855224, "dur": 10, "args": { "External id": 281290, "cbid": 211, "correlation": 281290 } }, { "ph": "s", "id": 281290, "pid": 76337, "tid": -914061504, "ts": 1716454225855224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225920727, "dur": 33, "args": { "External id": 281312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281312, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281312, "pid": 5, "tid": 7, "ts": 1716454225920727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855275, "dur": 11, "args": { "External id": 281312, "cbid": 211, "correlation": 281312 } }, { "ph": "s", "id": 281312, "pid": 76337, "tid": -914061504, "ts": 1716454225855275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225855369, "dur": 1, "args": { "External id": 281328, "cbid": 251, "correlation": 281328 } }, { "ph": "f", "id": 281328, "pid": 76337, "tid": -914061504, "ts": 1716454225855369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225855374, "dur": 0, "args": { "External id": 281330, "cbid": 251, "correlation": 281330 } }, { "ph": "f", "id": 281330, "pid": 76337, "tid": -914061504, "ts": 1716454225855374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225920762, "dur": 523, "args": { "External id": 281331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281331, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 281331, "pid": 5, "tid": 7, "ts": 1716454225920762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855381, "dur": 13, "args": { "External id": 281331, "cbid": 211, "correlation": 281331 } }, { "ph": "s", "id": 281331, "pid": 76337, "tid": -914061504, "ts": 1716454225855381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225921286, "dur": 122, "args": { "External id": 281339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281339, "pid": 5, "tid": 7, "ts": 1716454225921286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855452, "dur": 13, "args": { "External id": 281339, "cbid": 211, "correlation": 281339 } }, { "ph": "s", "id": 281339, "pid": 76337, "tid": -914061504, "ts": 1716454225855452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225921409, "dur": 127, "args": { "External id": 281347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281347, "pid": 5, "tid": 7, "ts": 1716454225921409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855488, "dur": 8, "args": { "External id": 281347, "cbid": 211, "correlation": 281347 } }, { "ph": "s", "id": 281347, "pid": 76337, "tid": -914061504, "ts": 1716454225855488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225855565, "dur": 1, "args": { "External id": 281363, "cbid": 251, "correlation": 281363 } }, { "ph": "f", "id": 281363, "pid": 76337, "tid": -914061504, "ts": 1716454225855565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225921537, "dur": 300, "args": { "External id": 281365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281365, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281365, "pid": 5, "tid": 7, "ts": 1716454225921537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855571, "dur": 13, "args": { "External id": 281365, "cbid": 211, "correlation": 281365 } }, { "ph": "s", "id": 281365, "pid": 76337, "tid": -914061504, "ts": 1716454225855571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225921838, "dur": 27, "args": { "External id": 281373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281373, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281373, "pid": 5, "tid": 7, "ts": 1716454225921838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855613, "dur": 10, "args": { "External id": 281373, "cbid": 211, "correlation": 281373 } }, { "ph": "s", "id": 281373, "pid": 76337, "tid": -914061504, "ts": 1716454225855613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225921867, "dur": 78, "args": { "External id": 281384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281384, "pid": 5, "tid": 7, "ts": 1716454225921867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855683, "dur": 12, "args": { "External id": 281384, "cbid": 211, "correlation": 281384 } }, { "ph": "s", "id": 281384, "pid": 76337, "tid": -914061504, "ts": 1716454225855683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225855746, "dur": 0, "args": { "External id": 281396, "cbid": 317, "correlation": 281396 } }, { "ph": "f", "id": 281396, "pid": 76337, "tid": -914061504, "ts": 1716454225855746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225855747, "dur": 0, "args": { "External id": 281397, "cbid": 203, "correlation": 281397 } }, { "ph": "f", "id": 281397, "pid": 76337, "tid": -914061504, "ts": 1716454225855747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225855748, "dur": 0, "args": { "External id": 281398, "cbid": 205, "correlation": 281398 } }, { "ph": "f", "id": 281398, "pid": 76337, "tid": -914061504, "ts": 1716454225855748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225921946, "dur": 23, "args": { "External id": 281402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281402, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281402, "pid": 5, "tid": 7, "ts": 1716454225921946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855763, "dur": 12, "args": { "External id": 281402, "cbid": 211, "correlation": 281402 } }, { "ph": "s", "id": 281402, "pid": 76337, "tid": -914061504, "ts": 1716454225855763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225921970, "dur": 115, "args": { "External id": 281404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281404, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281404, "pid": 5, "tid": 7, "ts": 1716454225921970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855781, "dur": 6, "args": { "External id": 281404, "cbid": 211, "correlation": 281404 } }, { "ph": "s", "id": 281404, "pid": 76337, "tid": -914061504, "ts": 1716454225855781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225922087, "dur": 24, "args": { "External id": 281406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281406, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281406, "pid": 5, "tid": 7, "ts": 1716454225922087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855792, "dur": 5, "args": { "External id": 281406, "cbid": 211, "correlation": 281406 } }, { "ph": "s", "id": 281406, "pid": 76337, "tid": -914061504, "ts": 1716454225855792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225922112, "dur": 31, "args": { "External id": 281412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281412, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281412, "pid": 5, "tid": 7, "ts": 1716454225922112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855819, "dur": 9, "args": { "External id": 281412, "cbid": 211, "correlation": 281412 } }, { "ph": "s", "id": 281412, "pid": 76337, "tid": -914061504, "ts": 1716454225855819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225922145, "dur": 27, "args": { "External id": 281420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281420, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281420, "pid": 5, "tid": 7, "ts": 1716454225922145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855851, "dur": 8, "args": { "External id": 281420, "cbid": 211, "correlation": 281420 } }, { "ph": "s", "id": 281420, "pid": 76337, "tid": -914061504, "ts": 1716454225855851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225922173, "dur": 52, "args": { "External id": 281429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281429, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281429, "pid": 5, "tid": 7, "ts": 1716454225922173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855890, "dur": 10, "args": { "External id": 281429, "cbid": 211, "correlation": 281429 } }, { "ph": "s", "id": 281429, "pid": 76337, "tid": -914061504, "ts": 1716454225855890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225922226, "dur": 51, "args": { "External id": 281449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281449, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 281449, "pid": 5, "tid": 7, "ts": 1716454225922226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855960, "dur": 11, "args": { "External id": 281449, "cbid": 211, "correlation": 281449 } }, { "ph": "s", "id": 281449, "pid": 76337, "tid": -914061504, "ts": 1716454225855960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225922279, "dur": 4, "args": { "External id": 281461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281461, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281461, "pid": 5, "tid": 7, "ts": 1716454225922279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225855991, "dur": 7, "args": { "External id": 281461, "cbid": 211, "correlation": 281461 } }, { "ph": "s", "id": 281461, "pid": 76337, "tid": -914061504, "ts": 1716454225855991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225922284, "dur": 55, "args": { "External id": 281464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281464, "pid": 5, "tid": 7, "ts": 1716454225922284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856010, "dur": 6, "args": { "External id": 281464, "cbid": 211, "correlation": 281464 } }, { "ph": "s", "id": 281464, "pid": 76337, "tid": -914061504, "ts": 1716454225856010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225922340, "dur": 37, "args": { "External id": 281473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281473, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281473, "pid": 5, "tid": 7, "ts": 1716454225922340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856049, "dur": 10, "args": { "External id": 281473, "cbid": 211, "correlation": 281473 } }, { "ph": "s", "id": 281473, "pid": 76337, "tid": -914061504, "ts": 1716454225856049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225856103, "dur": 0, "args": { "External id": 281483, "cbid": 317, "correlation": 281483 } }, { "ph": "f", "id": 281483, "pid": 76337, "tid": -914061504, "ts": 1716454225856103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225856104, "dur": 0, "args": { "External id": 281484, "cbid": 203, "correlation": 281484 } }, { "ph": "f", "id": 281484, "pid": 76337, "tid": -914061504, "ts": 1716454225856104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225856105, "dur": 0, "args": { "External id": 281485, "cbid": 205, "correlation": 281485 } }, { "ph": "f", "id": 281485, "pid": 76337, "tid": -914061504, "ts": 1716454225856105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225922378, "dur": 39, "args": { "External id": 281489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281489, "pid": 5, "tid": 7, "ts": 1716454225922378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856119, "dur": 11, "args": { "External id": 281489, "cbid": 211, "correlation": 281489 } }, { "ph": "s", "id": 281489, "pid": 76337, "tid": -914061504, "ts": 1716454225856119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225922419, "dur": 80, "args": { "External id": 281491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281491, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281491, "pid": 5, "tid": 7, "ts": 1716454225922419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856133, "dur": 5, "args": { "External id": 281491, "cbid": 211, "correlation": 281491 } }, { "ph": "s", "id": 281491, "pid": 76337, "tid": -914061504, "ts": 1716454225856133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225922500, "dur": 1249, "args": { "External id": 281493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281493, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281493, "pid": 5, "tid": 7, "ts": 1716454225922500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856144, "dur": 6, "args": { "External id": 281493, "cbid": 211, "correlation": 281493 } }, { "ph": "s", "id": 281493, "pid": 76337, "tid": -914061504, "ts": 1716454225856144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225923750, "dur": 20, "args": { "External id": 281495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281495, "pid": 5, "tid": 7, "ts": 1716454225923750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856154, "dur": 5, "args": { "External id": 281495, "cbid": 211, "correlation": 281495 } }, { "ph": "s", "id": 281495, "pid": 76337, "tid": -914061504, "ts": 1716454225856154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225923771, "dur": 32, "args": { "External id": 281501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281501, "pid": 5, "tid": 7, "ts": 1716454225923771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856181, "dur": 9, "args": { "External id": 281501, "cbid": 211, "correlation": 281501 } }, { "ph": "s", "id": 281501, "pid": 76337, "tid": -914061504, "ts": 1716454225856181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225923804, "dur": 4, "args": { "External id": 281509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281509, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 281509, "pid": 5, "tid": 7, "ts": 1716454225923804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856225, "dur": 9, "args": { "External id": 281509, "cbid": 211, "correlation": 281509 } }, { "ph": "s", "id": 281509, "pid": 76337, "tid": -914061504, "ts": 1716454225856225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225856291, "dur": 1, "args": { "External id": 281525, "cbid": 251, "correlation": 281525 } }, { "ph": "f", "id": 281525, "pid": 76337, "tid": -914061504, "ts": 1716454225856291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225856296, "dur": 0, "args": { "External id": 281527, "cbid": 251, "correlation": 281527 } }, { "ph": "f", "id": 281527, "pid": 76337, "tid": -914061504, "ts": 1716454225856296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225923809, "dur": 11, "args": { "External id": 281528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281528, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 281528, "pid": 5, "tid": 7, "ts": 1716454225923809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856298, "dur": 11, "args": { "External id": 281528, "cbid": 211, "correlation": 281528 } }, { "ph": "s", "id": 281528, "pid": 76337, "tid": -914061504, "ts": 1716454225856298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225923822, "dur": 5, "args": { "External id": 281530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281530, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281530, "pid": 5, "tid": 7, "ts": 1716454225923822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856311, "dur": 5, "args": { "External id": 281530, "cbid": 211, "correlation": 281530 } }, { "ph": "s", "id": 281530, "pid": 76337, "tid": -914061504, "ts": 1716454225856311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225923828, "dur": 28, "args": { "External id": 281540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281540, "pid": 5, "tid": 7, "ts": 1716454225923828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856369, "dur": 12, "args": { "External id": 281540, "cbid": 211, "correlation": 281540 } }, { "ph": "s", "id": 281540, "pid": 76337, "tid": -914061504, "ts": 1716454225856369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225923857, "dur": 29, "args": { "External id": 281560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281560, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 281560, "pid": 5, "tid": 7, "ts": 1716454225923857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856435, "dur": 11, "args": { "External id": 281560, "cbid": 211, "correlation": 281560 } }, { "ph": "s", "id": 281560, "pid": 76337, "tid": -914061504, "ts": 1716454225856435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225923888, "dur": 4, "args": { "External id": 281572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281572, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 281572, "pid": 5, "tid": 7, "ts": 1716454225923888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856456, "dur": 7, "args": { "External id": 281572, "cbid": 211, "correlation": 281572 } }, { "ph": "s", "id": 281572, "pid": 76337, "tid": -914061504, "ts": 1716454225856456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225923893, "dur": 29, "args": { "External id": 281575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281575, "pid": 5, "tid": 7, "ts": 1716454225923893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856476, "dur": 8, "args": { "External id": 281575, "cbid": 211, "correlation": 281575 } }, { "ph": "s", "id": 281575, "pid": 76337, "tid": -914061504, "ts": 1716454225856476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225923923, "dur": 21, "args": { "External id": 281584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281584, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281584, "pid": 5, "tid": 7, "ts": 1716454225923923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856519, "dur": 10, "args": { "External id": 281584, "cbid": 211, "correlation": 281584 } }, { "ph": "s", "id": 281584, "pid": 76337, "tid": -914061504, "ts": 1716454225856519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225856581, "dur": 0, "args": { "External id": 281594, "cbid": 317, "correlation": 281594 } }, { "ph": "f", "id": 281594, "pid": 76337, "tid": -914061504, "ts": 1716454225856581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225856581, "dur": 0, "args": { "External id": 281595, "cbid": 203, "correlation": 281595 } }, { "ph": "f", "id": 281595, "pid": 76337, "tid": -914061504, "ts": 1716454225856581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225856582, "dur": 0, "args": { "External id": 281596, "cbid": 205, "correlation": 281596 } }, { "ph": "f", "id": 281596, "pid": 76337, "tid": -914061504, "ts": 1716454225856582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225923945, "dur": 22, "args": { "External id": 281600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281600, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281600, "pid": 5, "tid": 7, "ts": 1716454225923945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856597, "dur": 12, "args": { "External id": 281600, "cbid": 211, "correlation": 281600 } }, { "ph": "s", "id": 281600, "pid": 76337, "tid": -914061504, "ts": 1716454225856597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225923968, "dur": 42, "args": { "External id": 281602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281602, "pid": 5, "tid": 7, "ts": 1716454225923968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856611, "dur": 5, "args": { "External id": 281602, "cbid": 211, "correlation": 281602 } }, { "ph": "s", "id": 281602, "pid": 76337, "tid": -914061504, "ts": 1716454225856611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225924012, "dur": 626, "args": { "External id": 281604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281604, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281604, "pid": 5, "tid": 7, "ts": 1716454225924012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856622, "dur": 6, "args": { "External id": 281604, "cbid": 211, "correlation": 281604 } }, { "ph": "s", "id": 281604, "pid": 76337, "tid": -914061504, "ts": 1716454225856622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225924639, "dur": 21, "args": { "External id": 281606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281606, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281606, "pid": 5, "tid": 7, "ts": 1716454225924639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856631, "dur": 5, "args": { "External id": 281606, "cbid": 211, "correlation": 281606 } }, { "ph": "s", "id": 281606, "pid": 76337, "tid": -914061504, "ts": 1716454225856631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225924661, "dur": 32, "args": { "External id": 281612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281612, "pid": 5, "tid": 7, "ts": 1716454225924661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856659, "dur": 8, "args": { "External id": 281612, "cbid": 211, "correlation": 281612 } }, { "ph": "s", "id": 281612, "pid": 76337, "tid": -914061504, "ts": 1716454225856659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225856718, "dur": 0, "args": { "External id": 281622, "cbid": 317, "correlation": 281622 } }, { "ph": "f", "id": 281622, "pid": 76337, "tid": -914061504, "ts": 1716454225856718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225856718, "dur": 0, "args": { "External id": 281623, "cbid": 203, "correlation": 281623 } }, { "ph": "f", "id": 281623, "pid": 76337, "tid": -914061504, "ts": 1716454225856718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225856719, "dur": 0, "args": { "External id": 281624, "cbid": 205, "correlation": 281624 } }, { "ph": "f", "id": 281624, "pid": 76337, "tid": -914061504, "ts": 1716454225856719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225924694, "dur": 37, "args": { "External id": 281628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281628, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281628, "pid": 5, "tid": 7, "ts": 1716454225924694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856732, "dur": 12, "args": { "External id": 281628, "cbid": 211, "correlation": 281628 } }, { "ph": "s", "id": 281628, "pid": 76337, "tid": -914061504, "ts": 1716454225856732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225924733, "dur": 183, "args": { "External id": 281630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281630, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281630, "pid": 5, "tid": 7, "ts": 1716454225924733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856750, "dur": 6, "args": { "External id": 281630, "cbid": 211, "correlation": 281630 } }, { "ph": "s", "id": 281630, "pid": 76337, "tid": -914061504, "ts": 1716454225856750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225924917, "dur": 21, "args": { "External id": 281632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281632, "pid": 5, "tid": 7, "ts": 1716454225924917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856759, "dur": 5, "args": { "External id": 281632, "cbid": 211, "correlation": 281632 } }, { "ph": "s", "id": 281632, "pid": 76337, "tid": -914061504, "ts": 1716454225856759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225924940, "dur": 32, "args": { "External id": 281638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281638, "pid": 5, "tid": 7, "ts": 1716454225924940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856785, "dur": 8, "args": { "External id": 281638, "cbid": 211, "correlation": 281638 } }, { "ph": "s", "id": 281638, "pid": 76337, "tid": -914061504, "ts": 1716454225856785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225924973, "dur": 27, "args": { "External id": 281646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281646, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281646, "pid": 5, "tid": 7, "ts": 1716454225924973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856814, "dur": 8, "args": { "External id": 281646, "cbid": 211, "correlation": 281646 } }, { "ph": "s", "id": 281646, "pid": 76337, "tid": -914061504, "ts": 1716454225856814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225925001, "dur": 20, "args": { "External id": 281654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281654, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281654, "pid": 5, "tid": 7, "ts": 1716454225925001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856842, "dur": 9, "args": { "External id": 281654, "cbid": 211, "correlation": 281654 } }, { "ph": "s", "id": 281654, "pid": 76337, "tid": -914061504, "ts": 1716454225856842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225925023, "dur": 29, "args": { "External id": 281674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281674, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 281674, "pid": 5, "tid": 7, "ts": 1716454225925023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856925, "dur": 12, "args": { "External id": 281674, "cbid": 211, "correlation": 281674 } }, { "ph": "s", "id": 281674, "pid": 76337, "tid": -914061504, "ts": 1716454225856925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225925053, "dur": 4, "args": { "External id": 281686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281686, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 281686, "pid": 5, "tid": 7, "ts": 1716454225925053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856947, "dur": 6, "args": { "External id": 281686, "cbid": 211, "correlation": 281686 } }, { "ph": "s", "id": 281686, "pid": 76337, "tid": -914061504, "ts": 1716454225856947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225925058, "dur": 29, "args": { "External id": 281689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281689, "pid": 5, "tid": 7, "ts": 1716454225925058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225856966, "dur": 7, "args": { "External id": 281689, "cbid": 211, "correlation": 281689 } }, { "ph": "s", "id": 281689, "pid": 76337, "tid": -914061504, "ts": 1716454225856966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225857031, "dur": 0, "args": { "External id": 281700, "cbid": 317, "correlation": 281700 } }, { "ph": "f", "id": 281700, "pid": 76337, "tid": -914061504, "ts": 1716454225857031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225857032, "dur": 0, "args": { "External id": 281701, "cbid": 203, "correlation": 281701 } }, { "ph": "f", "id": 281701, "pid": 76337, "tid": -914061504, "ts": 1716454225857032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225857032, "dur": 0, "args": { "External id": 281702, "cbid": 205, "correlation": 281702 } }, { "ph": "f", "id": 281702, "pid": 76337, "tid": -914061504, "ts": 1716454225857032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225925088, "dur": 21, "args": { "External id": 281706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281706, "pid": 5, "tid": 7, "ts": 1716454225925088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857045, "dur": 13, "args": { "External id": 281706, "cbid": 211, "correlation": 281706 } }, { "ph": "s", "id": 281706, "pid": 76337, "tid": -914061504, "ts": 1716454225857045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225925110, "dur": 100, "args": { "External id": 281708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281708, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281708, "pid": 5, "tid": 7, "ts": 1716454225925110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857064, "dur": 6, "args": { "External id": 281708, "cbid": 211, "correlation": 281708 } }, { "ph": "s", "id": 281708, "pid": 76337, "tid": -914061504, "ts": 1716454225857064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225925211, "dur": 22, "args": { "External id": 281710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281710, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281710, "pid": 5, "tid": 7, "ts": 1716454225925211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857074, "dur": 5, "args": { "External id": 281710, "cbid": 211, "correlation": 281710 } }, { "ph": "s", "id": 281710, "pid": 76337, "tid": -914061504, "ts": 1716454225857074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225925235, "dur": 31, "args": { "External id": 281716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281716, "pid": 5, "tid": 7, "ts": 1716454225925235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857101, "dur": 9, "args": { "External id": 281716, "cbid": 211, "correlation": 281716 } }, { "ph": "s", "id": 281716, "pid": 76337, "tid": -914061504, "ts": 1716454225857101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225925267, "dur": 166, "args": { "External id": 281725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281725, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281725, "pid": 5, "tid": 7, "ts": 1716454225925267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857183, "dur": 14, "args": { "External id": 281725, "cbid": 211, "correlation": 281725 } }, { "ph": "s", "id": 281725, "pid": 76337, "tid": -914061504, "ts": 1716454225857183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225925435, "dur": 62, "args": { "External id": 281747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281747, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281747, "pid": 5, "tid": 7, "ts": 1716454225925435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857240, "dur": 10, "args": { "External id": 281747, "cbid": 211, "correlation": 281747 } }, { "ph": "s", "id": 281747, "pid": 76337, "tid": -914061504, "ts": 1716454225857240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225857327, "dur": 1, "args": { "External id": 281758, "cbid": 251, "correlation": 281758 } }, { "ph": "f", "id": 281758, "pid": 76337, "tid": -914061504, "ts": 1716454225857327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225925499, "dur": 147, "args": { "External id": 281759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281759, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281759, "pid": 5, "tid": 7, "ts": 1716454225925499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857333, "dur": 13, "args": { "External id": 281759, "cbid": 211, "correlation": 281759 } }, { "ph": "s", "id": 281759, "pid": 76337, "tid": -914061504, "ts": 1716454225857333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225857403, "dur": 1, "args": { "External id": 281770, "cbid": 251, "correlation": 281770 } }, { "ph": "f", "id": 281770, "pid": 76337, "tid": -914061504, "ts": 1716454225857403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225925647, "dur": 143, "args": { "External id": 281771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281771, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281771, "pid": 5, "tid": 7, "ts": 1716454225925647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857406, "dur": 12, "args": { "External id": 281771, "cbid": 211, "correlation": 281771 } }, { "ph": "s", "id": 281771, "pid": 76337, "tid": -914061504, "ts": 1716454225857406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225857476, "dur": 1, "args": { "External id": 281782, "cbid": 251, "correlation": 281782 } }, { "ph": "f", "id": 281782, "pid": 76337, "tid": -914061504, "ts": 1716454225857476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225925791, "dur": 143, "args": { "External id": 281783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281783, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281783, "pid": 5, "tid": 7, "ts": 1716454225925791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857479, "dur": 12, "args": { "External id": 281783, "cbid": 211, "correlation": 281783 } }, { "ph": "s", "id": 281783, "pid": 76337, "tid": -914061504, "ts": 1716454225857479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225925935, "dur": 1849, "args": { "External id": 281804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281804, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 281804, "pid": 5, "tid": 7, "ts": 1716454225925935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857560, "dur": 13, "args": { "External id": 281804, "cbid": 211, "correlation": 281804 } }, { "ph": "s", "id": 281804, "pid": 76337, "tid": -914061504, "ts": 1716454225857560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225857658, "dur": 1, "args": { "External id": 281822, "cbid": 251, "correlation": 281822 } }, { "ph": "f", "id": 281822, "pid": 76337, "tid": -914061504, "ts": 1716454225857658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225927785, "dur": 144, "args": { "External id": 281824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281824, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 281824, "pid": 5, "tid": 7, "ts": 1716454225927785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857664, "dur": 13, "args": { "External id": 281824, "cbid": 211, "correlation": 281824 } }, { "ph": "s", "id": 281824, "pid": 76337, "tid": -914061504, "ts": 1716454225857664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225927931, "dur": 35, "args": { "External id": 281832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281832, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281832, "pid": 5, "tid": 7, "ts": 1716454225927931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857734, "dur": 12, "args": { "External id": 281832, "cbid": 211, "correlation": 281832 } }, { "ph": "s", "id": 281832, "pid": 76337, "tid": -914061504, "ts": 1716454225857734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225927967, "dur": 51, "args": { "External id": 281840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281840, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281840, "pid": 5, "tid": 7, "ts": 1716454225927967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857774, "dur": 9, "args": { "External id": 281840, "cbid": 211, "correlation": 281840 } }, { "ph": "s", "id": 281840, "pid": 76337, "tid": -914061504, "ts": 1716454225857774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225928019, "dur": 29, "args": { "External id": 281851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281851, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281851, "pid": 5, "tid": 7, "ts": 1716454225928019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857844, "dur": 13, "args": { "External id": 281851, "cbid": 211, "correlation": 281851 } }, { "ph": "s", "id": 281851, "pid": 76337, "tid": -914061504, "ts": 1716454225857844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225928049, "dur": 33, "args": { "External id": 281873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281873, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281873, "pid": 5, "tid": 7, "ts": 1716454225928049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857876, "dur": 7, "args": { "External id": 281873, "cbid": 211, "correlation": 281873 } }, { "ph": "s", "id": 281873, "pid": 76337, "tid": -914061504, "ts": 1716454225857876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225857960, "dur": 1, "args": { "External id": 281884, "cbid": 251, "correlation": 281884 } }, { "ph": "f", "id": 281884, "pid": 76337, "tid": -914061504, "ts": 1716454225857960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225928083, "dur": 86, "args": { "External id": 281885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281885, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281885, "pid": 5, "tid": 7, "ts": 1716454225928083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225857965, "dur": 21, "args": { "External id": 281885, "cbid": 211, "correlation": 281885 } }, { "ph": "s", "id": 281885, "pid": 76337, "tid": -914061504, "ts": 1716454225857965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858043, "dur": 1, "args": { "External id": 281896, "cbid": 251, "correlation": 281896 } }, { "ph": "f", "id": 281896, "pid": 76337, "tid": -914061504, "ts": 1716454225858043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858047, "dur": 0, "args": { "External id": 281897, "cbid": 251, "correlation": 281897 } }, { "ph": "f", "id": 281897, "pid": 76337, "tid": -914061504, "ts": 1716454225858047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225928171, "dur": 11, "args": { "External id": 281898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281898, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 281898, "pid": 5, "tid": 7, "ts": 1716454225928171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858048, "dur": 12, "args": { "External id": 281898, "cbid": 211, "correlation": 281898 } }, { "ph": "s", "id": 281898, "pid": 76337, "tid": -914061504, "ts": 1716454225858048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225928183, "dur": 5, "args": { "External id": 281900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281900, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281900, "pid": 5, "tid": 7, "ts": 1716454225928183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858062, "dur": 6, "args": { "External id": 281900, "cbid": 211, "correlation": 281900 } }, { "ph": "s", "id": 281900, "pid": 76337, "tid": -914061504, "ts": 1716454225858062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858120, "dur": 1, "args": { "External id": 281911, "cbid": 251, "correlation": 281911 } }, { "ph": "f", "id": 281911, "pid": 76337, "tid": -914061504, "ts": 1716454225858120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858124, "dur": 0, "args": { "External id": 281912, "cbid": 251, "correlation": 281912 } }, { "ph": "f", "id": 281912, "pid": 76337, "tid": -914061504, "ts": 1716454225858124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225928190, "dur": 7, "args": { "External id": 281913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281913, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 281913, "pid": 5, "tid": 7, "ts": 1716454225928190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858125, "dur": 12, "args": { "External id": 281913, "cbid": 211, "correlation": 281913 } }, { "ph": "s", "id": 281913, "pid": 76337, "tid": -914061504, "ts": 1716454225858125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225928198, "dur": 3, "args": { "External id": 281915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281915, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 281915, "pid": 5, "tid": 7, "ts": 1716454225928198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858139, "dur": 5, "args": { "External id": 281915, "cbid": 211, "correlation": 281915 } }, { "ph": "s", "id": 281915, "pid": 76337, "tid": -914061504, "ts": 1716454225858139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225928202, "dur": 88, "args": { "External id": 281936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281936, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 281936, "pid": 5, "tid": 7, "ts": 1716454225928202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858212, "dur": 12, "args": { "External id": 281936, "cbid": 211, "correlation": 281936 } }, { "ph": "s", "id": 281936, "pid": 76337, "tid": -914061504, "ts": 1716454225858212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858308, "dur": 1, "args": { "External id": 281954, "cbid": 251, "correlation": 281954 } }, { "ph": "f", "id": 281954, "pid": 76337, "tid": -914061504, "ts": 1716454225858308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225928292, "dur": 93, "args": { "External id": 281956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281956, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 281956, "pid": 5, "tid": 7, "ts": 1716454225928292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858314, "dur": 13, "args": { "External id": 281956, "cbid": 211, "correlation": 281956 } }, { "ph": "s", "id": 281956, "pid": 76337, "tid": -914061504, "ts": 1716454225858314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225928386, "dur": 19, "args": { "External id": 281964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281964, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281964, "pid": 5, "tid": 7, "ts": 1716454225928386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858381, "dur": 13, "args": { "External id": 281964, "cbid": 211, "correlation": 281964 } }, { "ph": "s", "id": 281964, "pid": 76337, "tid": -914061504, "ts": 1716454225858381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225928407, "dur": 37, "args": { "External id": 281972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281972, "pid": 5, "tid": 7, "ts": 1716454225928407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858422, "dur": 9, "args": { "External id": 281972, "cbid": 211, "correlation": 281972 } }, { "ph": "s", "id": 281972, "pid": 76337, "tid": -914061504, "ts": 1716454225858422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225928445, "dur": 33, "args": { "External id": 281994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 281994, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 281994, "pid": 5, "tid": 7, "ts": 1716454225928445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858473, "dur": 10, "args": { "External id": 281994, "cbid": 211, "correlation": 281994 } }, { "ph": "s", "id": 281994, "pid": 76337, "tid": -914061504, "ts": 1716454225858473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858563, "dur": 1, "args": { "External id": 282010, "cbid": 251, "correlation": 282010 } }, { "ph": "f", "id": 282010, "pid": 76337, "tid": -914061504, "ts": 1716454225858563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858568, "dur": 0, "args": { "External id": 282012, "cbid": 251, "correlation": 282012 } }, { "ph": "f", "id": 282012, "pid": 76337, "tid": -914061504, "ts": 1716454225858568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225928479, "dur": 520, "args": { "External id": 282013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282013, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 282013, "pid": 5, "tid": 7, "ts": 1716454225928479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858571, "dur": 13, "args": { "External id": 282013, "cbid": 211, "correlation": 282013 } }, { "ph": "s", "id": 282013, "pid": 76337, "tid": -914061504, "ts": 1716454225858571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225929001, "dur": 121, "args": { "External id": 282021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282021, "pid": 5, "tid": 7, "ts": 1716454225929001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858635, "dur": 13, "args": { "External id": 282021, "cbid": 211, "correlation": 282021 } }, { "ph": "s", "id": 282021, "pid": 76337, "tid": -914061504, "ts": 1716454225858635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225929123, "dur": 129, "args": { "External id": 282029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282029, "pid": 5, "tid": 7, "ts": 1716454225929123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858666, "dur": 8, "args": { "External id": 282029, "cbid": 211, "correlation": 282029 } }, { "ph": "s", "id": 282029, "pid": 76337, "tid": -914061504, "ts": 1716454225858666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225858743, "dur": 1, "args": { "External id": 282045, "cbid": 251, "correlation": 282045 } }, { "ph": "f", "id": 282045, "pid": 76337, "tid": -914061504, "ts": 1716454225858743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225929253, "dur": 299, "args": { "External id": 282047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282047, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282047, "pid": 5, "tid": 7, "ts": 1716454225929253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858749, "dur": 13, "args": { "External id": 282047, "cbid": 211, "correlation": 282047 } }, { "ph": "s", "id": 282047, "pid": 76337, "tid": -914061504, "ts": 1716454225858749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225929553, "dur": 27, "args": { "External id": 282055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282055, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282055, "pid": 5, "tid": 7, "ts": 1716454225929553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858791, "dur": 11, "args": { "External id": 282055, "cbid": 211, "correlation": 282055 } }, { "ph": "s", "id": 282055, "pid": 76337, "tid": -914061504, "ts": 1716454225858791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225929582, "dur": 77, "args": { "External id": 282066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282066, "pid": 5, "tid": 7, "ts": 1716454225929582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858860, "dur": 12, "args": { "External id": 282066, "cbid": 211, "correlation": 282066 } }, { "ph": "s", "id": 282066, "pid": 76337, "tid": -914061504, "ts": 1716454225858860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225858923, "dur": 0, "args": { "External id": 282078, "cbid": 317, "correlation": 282078 } }, { "ph": "f", "id": 282078, "pid": 76337, "tid": -914061504, "ts": 1716454225858923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225858924, "dur": 0, "args": { "External id": 282079, "cbid": 203, "correlation": 282079 } }, { "ph": "f", "id": 282079, "pid": 76337, "tid": -914061504, "ts": 1716454225858924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225858925, "dur": 0, "args": { "External id": 282080, "cbid": 205, "correlation": 282080 } }, { "ph": "f", "id": 282080, "pid": 76337, "tid": -914061504, "ts": 1716454225858925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225929660, "dur": 23, "args": { "External id": 282084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282084, "pid": 5, "tid": 7, "ts": 1716454225929660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858939, "dur": 13, "args": { "External id": 282084, "cbid": 211, "correlation": 282084 } }, { "ph": "s", "id": 282084, "pid": 76337, "tid": -914061504, "ts": 1716454225858939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225929684, "dur": 116, "args": { "External id": 282086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282086, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282086, "pid": 5, "tid": 7, "ts": 1716454225929684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858959, "dur": 6, "args": { "External id": 282086, "cbid": 211, "correlation": 282086 } }, { "ph": "s", "id": 282086, "pid": 76337, "tid": -914061504, "ts": 1716454225858959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225929802, "dur": 23, "args": { "External id": 282088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282088, "pid": 5, "tid": 7, "ts": 1716454225929802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225858969, "dur": 14, "args": { "External id": 282088, "cbid": 211, "correlation": 282088 } }, { "ph": "s", "id": 282088, "pid": 76337, "tid": -914061504, "ts": 1716454225858969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225929826, "dur": 32, "args": { "External id": 282094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282094, "pid": 5, "tid": 7, "ts": 1716454225929826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859006, "dur": 9, "args": { "External id": 282094, "cbid": 211, "correlation": 282094 } }, { "ph": "s", "id": 282094, "pid": 76337, "tid": -914061504, "ts": 1716454225859006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225929859, "dur": 26, "args": { "External id": 282102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282102, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282102, "pid": 5, "tid": 7, "ts": 1716454225929859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859038, "dur": 8, "args": { "External id": 282102, "cbid": 211, "correlation": 282102 } }, { "ph": "s", "id": 282102, "pid": 76337, "tid": -914061504, "ts": 1716454225859038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225929887, "dur": 42, "args": { "External id": 282111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282111, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282111, "pid": 5, "tid": 7, "ts": 1716454225929887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859076, "dur": 10, "args": { "External id": 282111, "cbid": 211, "correlation": 282111 } }, { "ph": "s", "id": 282111, "pid": 76337, "tid": -914061504, "ts": 1716454225859076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225929931, "dur": 42, "args": { "External id": 282131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282131, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 282131, "pid": 5, "tid": 7, "ts": 1716454225929931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859148, "dur": 12, "args": { "External id": 282131, "cbid": 211, "correlation": 282131 } }, { "ph": "s", "id": 282131, "pid": 76337, "tid": -914061504, "ts": 1716454225859148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225929974, "dur": 5, "args": { "External id": 282143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282143, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 282143, "pid": 5, "tid": 7, "ts": 1716454225929974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859169, "dur": 6, "args": { "External id": 282143, "cbid": 211, "correlation": 282143 } }, { "ph": "s", "id": 282143, "pid": 76337, "tid": -914061504, "ts": 1716454225859169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225929980, "dur": 41, "args": { "External id": 282146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282146, "pid": 5, "tid": 7, "ts": 1716454225929980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859189, "dur": 6, "args": { "External id": 282146, "cbid": 211, "correlation": 282146 } }, { "ph": "s", "id": 282146, "pid": 76337, "tid": -914061504, "ts": 1716454225859189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225930023, "dur": 30, "args": { "External id": 282155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282155, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282155, "pid": 5, "tid": 7, "ts": 1716454225930023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859228, "dur": 10, "args": { "External id": 282155, "cbid": 211, "correlation": 282155 } }, { "ph": "s", "id": 282155, "pid": 76337, "tid": -914061504, "ts": 1716454225859228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225859279, "dur": 0, "args": { "External id": 282165, "cbid": 317, "correlation": 282165 } }, { "ph": "f", "id": 282165, "pid": 76337, "tid": -914061504, "ts": 1716454225859279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225859279, "dur": 0, "args": { "External id": 282166, "cbid": 203, "correlation": 282166 } }, { "ph": "f", "id": 282166, "pid": 76337, "tid": -914061504, "ts": 1716454225859279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225859280, "dur": 0, "args": { "External id": 282167, "cbid": 205, "correlation": 282167 } }, { "ph": "f", "id": 282167, "pid": 76337, "tid": -914061504, "ts": 1716454225859280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225930054, "dur": 31, "args": { "External id": 282171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282171, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282171, "pid": 5, "tid": 7, "ts": 1716454225930054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859295, "dur": 12, "args": { "External id": 282171, "cbid": 211, "correlation": 282171 } }, { "ph": "s", "id": 282171, "pid": 76337, "tid": -914061504, "ts": 1716454225859295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225930086, "dur": 61, "args": { "External id": 282173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282173, "pid": 5, "tid": 7, "ts": 1716454225930086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859309, "dur": 6, "args": { "External id": 282173, "cbid": 211, "correlation": 282173 } }, { "ph": "s", "id": 282173, "pid": 76337, "tid": -914061504, "ts": 1716454225859309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225930149, "dur": 937, "args": { "External id": 282175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282175, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282175, "pid": 5, "tid": 7, "ts": 1716454225930149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859321, "dur": 6, "args": { "External id": 282175, "cbid": 211, "correlation": 282175 } }, { "ph": "s", "id": 282175, "pid": 76337, "tid": -914061504, "ts": 1716454225859321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225931087, "dur": 22, "args": { "External id": 282177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282177, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282177, "pid": 5, "tid": 7, "ts": 1716454225931087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859331, "dur": 5, "args": { "External id": 282177, "cbid": 211, "correlation": 282177 } }, { "ph": "s", "id": 282177, "pid": 76337, "tid": -914061504, "ts": 1716454225859331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225931110, "dur": 32, "args": { "External id": 282183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282183, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282183, "pid": 5, "tid": 7, "ts": 1716454225931110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859358, "dur": 9, "args": { "External id": 282183, "cbid": 211, "correlation": 282183 } }, { "ph": "s", "id": 282183, "pid": 76337, "tid": -914061504, "ts": 1716454225859358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225931143, "dur": 4, "args": { "External id": 282191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282191, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 282191, "pid": 5, "tid": 7, "ts": 1716454225931143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859401, "dur": 10, "args": { "External id": 282191, "cbid": 211, "correlation": 282191 } }, { "ph": "s", "id": 282191, "pid": 76337, "tid": -914061504, "ts": 1716454225859401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225859468, "dur": 1, "args": { "External id": 282207, "cbid": 251, "correlation": 282207 } }, { "ph": "f", "id": 282207, "pid": 76337, "tid": -914061504, "ts": 1716454225859468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225859473, "dur": 0, "args": { "External id": 282209, "cbid": 251, "correlation": 282209 } }, { "ph": "f", "id": 282209, "pid": 76337, "tid": -914061504, "ts": 1716454225859473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225931148, "dur": 11, "args": { "External id": 282210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282210, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 282210, "pid": 5, "tid": 7, "ts": 1716454225931148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859475, "dur": 11, "args": { "External id": 282210, "cbid": 211, "correlation": 282210 } }, { "ph": "s", "id": 282210, "pid": 76337, "tid": -914061504, "ts": 1716454225859475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225931161, "dur": 5, "args": { "External id": 282212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282212, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 282212, "pid": 5, "tid": 7, "ts": 1716454225931161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859488, "dur": 5, "args": { "External id": 282212, "cbid": 211, "correlation": 282212 } }, { "ph": "s", "id": 282212, "pid": 76337, "tid": -914061504, "ts": 1716454225859488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225931167, "dur": 28, "args": { "External id": 282222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282222, "pid": 5, "tid": 7, "ts": 1716454225931167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859545, "dur": 12, "args": { "External id": 282222, "cbid": 211, "correlation": 282222 } }, { "ph": "s", "id": 282222, "pid": 76337, "tid": -914061504, "ts": 1716454225859545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225931197, "dur": 30, "args": { "External id": 282242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282242, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 282242, "pid": 5, "tid": 7, "ts": 1716454225931197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859610, "dur": 11, "args": { "External id": 282242, "cbid": 211, "correlation": 282242 } }, { "ph": "s", "id": 282242, "pid": 76337, "tid": -914061504, "ts": 1716454225859610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225931228, "dur": 4, "args": { "External id": 282254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282254, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 282254, "pid": 5, "tid": 7, "ts": 1716454225931228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859631, "dur": 6, "args": { "External id": 282254, "cbid": 211, "correlation": 282254 } }, { "ph": "s", "id": 282254, "pid": 76337, "tid": -914061504, "ts": 1716454225859631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225931233, "dur": 29, "args": { "External id": 282257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282257, "pid": 5, "tid": 7, "ts": 1716454225931233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859650, "dur": 7, "args": { "External id": 282257, "cbid": 211, "correlation": 282257 } }, { "ph": "s", "id": 282257, "pid": 76337, "tid": -914061504, "ts": 1716454225859650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225931264, "dur": 20, "args": { "External id": 282266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282266, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282266, "pid": 5, "tid": 7, "ts": 1716454225931264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859690, "dur": 9, "args": { "External id": 282266, "cbid": 211, "correlation": 282266 } }, { "ph": "s", "id": 282266, "pid": 76337, "tid": -914061504, "ts": 1716454225859690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225859752, "dur": 0, "args": { "External id": 282276, "cbid": 317, "correlation": 282276 } }, { "ph": "f", "id": 282276, "pid": 76337, "tid": -914061504, "ts": 1716454225859752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225859753, "dur": 0, "args": { "External id": 282277, "cbid": 203, "correlation": 282277 } }, { "ph": "f", "id": 282277, "pid": 76337, "tid": -914061504, "ts": 1716454225859753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225859754, "dur": 0, "args": { "External id": 282278, "cbid": 205, "correlation": 282278 } }, { "ph": "f", "id": 282278, "pid": 76337, "tid": -914061504, "ts": 1716454225859754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225931285, "dur": 22, "args": { "External id": 282282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282282, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282282, "pid": 5, "tid": 7, "ts": 1716454225931285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859768, "dur": 12, "args": { "External id": 282282, "cbid": 211, "correlation": 282282 } }, { "ph": "s", "id": 282282, "pid": 76337, "tid": -914061504, "ts": 1716454225859768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225931309, "dur": 42, "args": { "External id": 282284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282284, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282284, "pid": 5, "tid": 7, "ts": 1716454225931309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859782, "dur": 5, "args": { "External id": 282284, "cbid": 211, "correlation": 282284 } }, { "ph": "s", "id": 282284, "pid": 76337, "tid": -914061504, "ts": 1716454225859782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225931352, "dur": 627, "args": { "External id": 282286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282286, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282286, "pid": 5, "tid": 7, "ts": 1716454225931352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859794, "dur": 6, "args": { "External id": 282286, "cbid": 211, "correlation": 282286 } }, { "ph": "s", "id": 282286, "pid": 76337, "tid": -914061504, "ts": 1716454225859794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225931981, "dur": 21, "args": { "External id": 282288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282288, "pid": 5, "tid": 7, "ts": 1716454225931981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859803, "dur": 5, "args": { "External id": 282288, "cbid": 211, "correlation": 282288 } }, { "ph": "s", "id": 282288, "pid": 76337, "tid": -914061504, "ts": 1716454225859803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225932003, "dur": 32, "args": { "External id": 282294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282294, "pid": 5, "tid": 7, "ts": 1716454225932003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859831, "dur": 8, "args": { "External id": 282294, "cbid": 211, "correlation": 282294 } }, { "ph": "s", "id": 282294, "pid": 76337, "tid": -914061504, "ts": 1716454225859831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225859890, "dur": 0, "args": { "External id": 282304, "cbid": 317, "correlation": 282304 } }, { "ph": "f", "id": 282304, "pid": 76337, "tid": -914061504, "ts": 1716454225859890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225859890, "dur": 0, "args": { "External id": 282305, "cbid": 203, "correlation": 282305 } }, { "ph": "f", "id": 282305, "pid": 76337, "tid": -914061504, "ts": 1716454225859890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225859891, "dur": 0, "args": { "External id": 282306, "cbid": 205, "correlation": 282306 } }, { "ph": "f", "id": 282306, "pid": 76337, "tid": -914061504, "ts": 1716454225859891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225932036, "dur": 29, "args": { "External id": 282310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282310, "pid": 5, "tid": 7, "ts": 1716454225932036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859905, "dur": 11, "args": { "External id": 282310, "cbid": 211, "correlation": 282310 } }, { "ph": "s", "id": 282310, "pid": 76337, "tid": -914061504, "ts": 1716454225859905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225932067, "dur": 147, "args": { "External id": 282312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282312, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282312, "pid": 5, "tid": 7, "ts": 1716454225932067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859924, "dur": 7, "args": { "External id": 282312, "cbid": 211, "correlation": 282312 } }, { "ph": "s", "id": 282312, "pid": 76337, "tid": -914061504, "ts": 1716454225859924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225932215, "dur": 21, "args": { "External id": 282314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282314, "pid": 5, "tid": 7, "ts": 1716454225932215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859934, "dur": 5, "args": { "External id": 282314, "cbid": 211, "correlation": 282314 } }, { "ph": "s", "id": 282314, "pid": 76337, "tid": -914061504, "ts": 1716454225859934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225932238, "dur": 32, "args": { "External id": 282320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282320, "pid": 5, "tid": 7, "ts": 1716454225932238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859960, "dur": 8, "args": { "External id": 282320, "cbid": 211, "correlation": 282320 } }, { "ph": "s", "id": 282320, "pid": 76337, "tid": -914061504, "ts": 1716454225859960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225932271, "dur": 28, "args": { "External id": 282328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282328, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282328, "pid": 5, "tid": 7, "ts": 1716454225932271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225859997, "dur": 8, "args": { "External id": 282328, "cbid": 211, "correlation": 282328 } }, { "ph": "s", "id": 282328, "pid": 76337, "tid": -914061504, "ts": 1716454225859997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225932300, "dur": 20, "args": { "External id": 282336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282336, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282336, "pid": 5, "tid": 7, "ts": 1716454225932300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860026, "dur": 8, "args": { "External id": 282336, "cbid": 211, "correlation": 282336 } }, { "ph": "s", "id": 282336, "pid": 76337, "tid": -914061504, "ts": 1716454225860026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225932321, "dur": 28, "args": { "External id": 282356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282356, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 282356, "pid": 5, "tid": 7, "ts": 1716454225932321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860108, "dur": 12, "args": { "External id": 282356, "cbid": 211, "correlation": 282356 } }, { "ph": "s", "id": 282356, "pid": 76337, "tid": -914061504, "ts": 1716454225860108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225932351, "dur": 5, "args": { "External id": 282368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282368, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 282368, "pid": 5, "tid": 7, "ts": 1716454225932351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860131, "dur": 6, "args": { "External id": 282368, "cbid": 211, "correlation": 282368 } }, { "ph": "s", "id": 282368, "pid": 76337, "tid": -914061504, "ts": 1716454225860131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225932357, "dur": 30, "args": { "External id": 282371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282371, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282371, "pid": 5, "tid": 7, "ts": 1716454225932357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860148, "dur": 6, "args": { "External id": 282371, "cbid": 211, "correlation": 282371 } }, { "ph": "s", "id": 282371, "pid": 76337, "tid": -914061504, "ts": 1716454225860148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225860205, "dur": 0, "args": { "External id": 282382, "cbid": 317, "correlation": 282382 } }, { "ph": "f", "id": 282382, "pid": 76337, "tid": -914061504, "ts": 1716454225860205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225860205, "dur": 0, "args": { "External id": 282383, "cbid": 203, "correlation": 282383 } }, { "ph": "f", "id": 282383, "pid": 76337, "tid": -914061504, "ts": 1716454225860205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225860206, "dur": 0, "args": { "External id": 282384, "cbid": 205, "correlation": 282384 } }, { "ph": "f", "id": 282384, "pid": 76337, "tid": -914061504, "ts": 1716454225860206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225932388, "dur": 22, "args": { "External id": 282388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282388, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282388, "pid": 5, "tid": 7, "ts": 1716454225932388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860219, "dur": 13, "args": { "External id": 282388, "cbid": 211, "correlation": 282388 } }, { "ph": "s", "id": 282388, "pid": 76337, "tid": -914061504, "ts": 1716454225860219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225932411, "dur": 100, "args": { "External id": 282390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282390, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282390, "pid": 5, "tid": 7, "ts": 1716454225932411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860238, "dur": 6, "args": { "External id": 282390, "cbid": 211, "correlation": 282390 } }, { "ph": "s", "id": 282390, "pid": 76337, "tid": -914061504, "ts": 1716454225860238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225932513, "dur": 23, "args": { "External id": 282392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282392, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282392, "pid": 5, "tid": 7, "ts": 1716454225932513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860247, "dur": 5, "args": { "External id": 282392, "cbid": 211, "correlation": 282392 } }, { "ph": "s", "id": 282392, "pid": 76337, "tid": -914061504, "ts": 1716454225860247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225932537, "dur": 32, "args": { "External id": 282398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282398, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282398, "pid": 5, "tid": 7, "ts": 1716454225932537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860274, "dur": 9, "args": { "External id": 282398, "cbid": 211, "correlation": 282398 } }, { "ph": "s", "id": 282398, "pid": 76337, "tid": -914061504, "ts": 1716454225860274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225932570, "dur": 181, "args": { "External id": 282407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282407, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282407, "pid": 5, "tid": 7, "ts": 1716454225932570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860356, "dur": 14, "args": { "External id": 282407, "cbid": 211, "correlation": 282407 } }, { "ph": "s", "id": 282407, "pid": 76337, "tid": -914061504, "ts": 1716454225860356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225932752, "dur": 62, "args": { "External id": 282429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282429, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282429, "pid": 5, "tid": 7, "ts": 1716454225932752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860413, "dur": 10, "args": { "External id": 282429, "cbid": 211, "correlation": 282429 } }, { "ph": "s", "id": 282429, "pid": 76337, "tid": -914061504, "ts": 1716454225860413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225860500, "dur": 1, "args": { "External id": 282440, "cbid": 251, "correlation": 282440 } }, { "ph": "f", "id": 282440, "pid": 76337, "tid": -914061504, "ts": 1716454225860500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225932816, "dur": 147, "args": { "External id": 282441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282441, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282441, "pid": 5, "tid": 7, "ts": 1716454225932816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860506, "dur": 12, "args": { "External id": 282441, "cbid": 211, "correlation": 282441 } }, { "ph": "s", "id": 282441, "pid": 76337, "tid": -914061504, "ts": 1716454225860506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225860575, "dur": 1, "args": { "External id": 282452, "cbid": 251, "correlation": 282452 } }, { "ph": "f", "id": 282452, "pid": 76337, "tid": -914061504, "ts": 1716454225860575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225932964, "dur": 141, "args": { "External id": 282453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282453, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282453, "pid": 5, "tid": 7, "ts": 1716454225932964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860579, "dur": 12, "args": { "External id": 282453, "cbid": 211, "correlation": 282453 } }, { "ph": "s", "id": 282453, "pid": 76337, "tid": -914061504, "ts": 1716454225860579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225860646, "dur": 1, "args": { "External id": 282464, "cbid": 251, "correlation": 282464 } }, { "ph": "f", "id": 282464, "pid": 76337, "tid": -914061504, "ts": 1716454225860646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225933106, "dur": 142, "args": { "External id": 282465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282465, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282465, "pid": 5, "tid": 7, "ts": 1716454225933106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860650, "dur": 11, "args": { "External id": 282465, "cbid": 211, "correlation": 282465 } }, { "ph": "s", "id": 282465, "pid": 76337, "tid": -914061504, "ts": 1716454225860650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225933250, "dur": 1855, "args": { "External id": 282486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282486, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 282486, "pid": 5, "tid": 7, "ts": 1716454225933250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860729, "dur": 12, "args": { "External id": 282486, "cbid": 211, "correlation": 282486 } }, { "ph": "s", "id": 282486, "pid": 76337, "tid": -914061504, "ts": 1716454225860729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225860826, "dur": 1, "args": { "External id": 282504, "cbid": 251, "correlation": 282504 } }, { "ph": "f", "id": 282504, "pid": 76337, "tid": -914061504, "ts": 1716454225860826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225935106, "dur": 143, "args": { "External id": 282506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282506, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 282506, "pid": 5, "tid": 7, "ts": 1716454225935106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860831, "dur": 13, "args": { "External id": 282506, "cbid": 211, "correlation": 282506 } }, { "ph": "s", "id": 282506, "pid": 76337, "tid": -914061504, "ts": 1716454225860831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225935250, "dur": 35, "args": { "External id": 282514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282514, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282514, "pid": 5, "tid": 7, "ts": 1716454225935250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860900, "dur": 12, "args": { "External id": 282514, "cbid": 211, "correlation": 282514 } }, { "ph": "s", "id": 282514, "pid": 76337, "tid": -914061504, "ts": 1716454225860900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225935287, "dur": 51, "args": { "External id": 282522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282522, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282522, "pid": 5, "tid": 7, "ts": 1716454225935287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225860939, "dur": 9, "args": { "External id": 282522, "cbid": 211, "correlation": 282522 } }, { "ph": "s", "id": 282522, "pid": 76337, "tid": -914061504, "ts": 1716454225860939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225935339, "dur": 29, "args": { "External id": 282533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282533, "pid": 5, "tid": 7, "ts": 1716454225935339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861019, "dur": 13, "args": { "External id": 282533, "cbid": 211, "correlation": 282533 } }, { "ph": "s", "id": 282533, "pid": 76337, "tid": -914061504, "ts": 1716454225861019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225935370, "dur": 33, "args": { "External id": 282555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282555, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282555, "pid": 5, "tid": 7, "ts": 1716454225935370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861049, "dur": 8, "args": { "External id": 282555, "cbid": 211, "correlation": 282555 } }, { "ph": "s", "id": 282555, "pid": 76337, "tid": -914061504, "ts": 1716454225861049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861134, "dur": 1, "args": { "External id": 282566, "cbid": 251, "correlation": 282566 } }, { "ph": "f", "id": 282566, "pid": 76337, "tid": -914061504, "ts": 1716454225861134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225935403, "dur": 87, "args": { "External id": 282567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282567, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282567, "pid": 5, "tid": 7, "ts": 1716454225935403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861139, "dur": 13, "args": { "External id": 282567, "cbid": 211, "correlation": 282567 } }, { "ph": "s", "id": 282567, "pid": 76337, "tid": -914061504, "ts": 1716454225861139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861207, "dur": 1, "args": { "External id": 282578, "cbid": 251, "correlation": 282578 } }, { "ph": "f", "id": 282578, "pid": 76337, "tid": -914061504, "ts": 1716454225861207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861211, "dur": 0, "args": { "External id": 282579, "cbid": 251, "correlation": 282579 } }, { "ph": "f", "id": 282579, "pid": 76337, "tid": -914061504, "ts": 1716454225861211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225935491, "dur": 10, "args": { "External id": 282580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282580, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 282580, "pid": 5, "tid": 7, "ts": 1716454225935491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861213, "dur": 12, "args": { "External id": 282580, "cbid": 211, "correlation": 282580 } }, { "ph": "s", "id": 282580, "pid": 76337, "tid": -914061504, "ts": 1716454225861213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225935503, "dur": 5, "args": { "External id": 282582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282582, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 282582, "pid": 5, "tid": 7, "ts": 1716454225935503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861226, "dur": 6, "args": { "External id": 282582, "cbid": 211, "correlation": 282582 } }, { "ph": "s", "id": 282582, "pid": 76337, "tid": -914061504, "ts": 1716454225861226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861285, "dur": 1, "args": { "External id": 282593, "cbid": 251, "correlation": 282593 } }, { "ph": "f", "id": 282593, "pid": 76337, "tid": -914061504, "ts": 1716454225861285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861288, "dur": 0, "args": { "External id": 282594, "cbid": 251, "correlation": 282594 } }, { "ph": "f", "id": 282594, "pid": 76337, "tid": -914061504, "ts": 1716454225861288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225935509, "dur": 7, "args": { "External id": 282595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282595, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 282595, "pid": 5, "tid": 7, "ts": 1716454225935509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861290, "dur": 11, "args": { "External id": 282595, "cbid": 211, "correlation": 282595 } }, { "ph": "s", "id": 282595, "pid": 76337, "tid": -914061504, "ts": 1716454225861290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225935518, "dur": 3, "args": { "External id": 282597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282597, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 282597, "pid": 5, "tid": 7, "ts": 1716454225935518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861303, "dur": 6, "args": { "External id": 282597, "cbid": 211, "correlation": 282597 } }, { "ph": "s", "id": 282597, "pid": 76337, "tid": -914061504, "ts": 1716454225861303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225935522, "dur": 90, "args": { "External id": 282618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282618, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 282618, "pid": 5, "tid": 7, "ts": 1716454225935522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861377, "dur": 12, "args": { "External id": 282618, "cbid": 211, "correlation": 282618 } }, { "ph": "s", "id": 282618, "pid": 76337, "tid": -914061504, "ts": 1716454225861377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861474, "dur": 1, "args": { "External id": 282636, "cbid": 251, "correlation": 282636 } }, { "ph": "f", "id": 282636, "pid": 76337, "tid": -914061504, "ts": 1716454225861474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225935613, "dur": 92, "args": { "External id": 282638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282638, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282638, "pid": 5, "tid": 7, "ts": 1716454225935613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861479, "dur": 13, "args": { "External id": 282638, "cbid": 211, "correlation": 282638 } }, { "ph": "s", "id": 282638, "pid": 76337, "tid": -914061504, "ts": 1716454225861479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225935707, "dur": 19, "args": { "External id": 282646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282646, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282646, "pid": 5, "tid": 7, "ts": 1716454225935707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861548, "dur": 13, "args": { "External id": 282646, "cbid": 211, "correlation": 282646 } }, { "ph": "s", "id": 282646, "pid": 76337, "tid": -914061504, "ts": 1716454225861548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225935727, "dur": 37, "args": { "External id": 282654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282654, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282654, "pid": 5, "tid": 7, "ts": 1716454225935727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861589, "dur": 9, "args": { "External id": 282654, "cbid": 211, "correlation": 282654 } }, { "ph": "s", "id": 282654, "pid": 76337, "tid": -914061504, "ts": 1716454225861589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225935765, "dur": 33, "args": { "External id": 282676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282676, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282676, "pid": 5, "tid": 7, "ts": 1716454225935765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861639, "dur": 10, "args": { "External id": 282676, "cbid": 211, "correlation": 282676 } }, { "ph": "s", "id": 282676, "pid": 76337, "tid": -914061504, "ts": 1716454225861639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861729, "dur": 1, "args": { "External id": 282692, "cbid": 251, "correlation": 282692 } }, { "ph": "f", "id": 282692, "pid": 76337, "tid": -914061504, "ts": 1716454225861729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861734, "dur": 0, "args": { "External id": 282694, "cbid": 251, "correlation": 282694 } }, { "ph": "f", "id": 282694, "pid": 76337, "tid": -914061504, "ts": 1716454225861734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225935800, "dur": 522, "args": { "External id": 282695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282695, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 282695, "pid": 5, "tid": 7, "ts": 1716454225935800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861737, "dur": 13, "args": { "External id": 282695, "cbid": 211, "correlation": 282695 } }, { "ph": "s", "id": 282695, "pid": 76337, "tid": -914061504, "ts": 1716454225861737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225936323, "dur": 122, "args": { "External id": 282703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282703, "pid": 5, "tid": 7, "ts": 1716454225936323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861802, "dur": 13, "args": { "External id": 282703, "cbid": 211, "correlation": 282703 } }, { "ph": "s", "id": 282703, "pid": 76337, "tid": -914061504, "ts": 1716454225861802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225936446, "dur": 129, "args": { "External id": 282711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282711, "pid": 5, "tid": 7, "ts": 1716454225936446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861832, "dur": 8, "args": { "External id": 282711, "cbid": 211, "correlation": 282711 } }, { "ph": "s", "id": 282711, "pid": 76337, "tid": -914061504, "ts": 1716454225861832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225861908, "dur": 1, "args": { "External id": 282727, "cbid": 251, "correlation": 282727 } }, { "ph": "f", "id": 282727, "pid": 76337, "tid": -914061504, "ts": 1716454225861908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225936576, "dur": 301, "args": { "External id": 282729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282729, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282729, "pid": 5, "tid": 7, "ts": 1716454225936576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861914, "dur": 12, "args": { "External id": 282729, "cbid": 211, "correlation": 282729 } }, { "ph": "s", "id": 282729, "pid": 76337, "tid": -914061504, "ts": 1716454225861914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225936879, "dur": 28, "args": { "External id": 282737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282737, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282737, "pid": 5, "tid": 7, "ts": 1716454225936879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225861956, "dur": 10, "args": { "External id": 282737, "cbid": 211, "correlation": 282737 } }, { "ph": "s", "id": 282737, "pid": 76337, "tid": -914061504, "ts": 1716454225861956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225936908, "dur": 78, "args": { "External id": 282748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282748, "pid": 5, "tid": 7, "ts": 1716454225936908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862031, "dur": 13, "args": { "External id": 282748, "cbid": 211, "correlation": 282748 } }, { "ph": "s", "id": 282748, "pid": 76337, "tid": -914061504, "ts": 1716454225862031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225862095, "dur": 0, "args": { "External id": 282760, "cbid": 317, "correlation": 282760 } }, { "ph": "f", "id": 282760, "pid": 76337, "tid": -914061504, "ts": 1716454225862095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225862096, "dur": 0, "args": { "External id": 282761, "cbid": 203, "correlation": 282761 } }, { "ph": "f", "id": 282761, "pid": 76337, "tid": -914061504, "ts": 1716454225862096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225862096, "dur": 0, "args": { "External id": 282762, "cbid": 205, "correlation": 282762 } }, { "ph": "f", "id": 282762, "pid": 76337, "tid": -914061504, "ts": 1716454225862096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225936987, "dur": 22, "args": { "External id": 282766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282766, "pid": 5, "tid": 7, "ts": 1716454225936987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862111, "dur": 13, "args": { "External id": 282766, "cbid": 211, "correlation": 282766 } }, { "ph": "s", "id": 282766, "pid": 76337, "tid": -914061504, "ts": 1716454225862111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225937010, "dur": 115, "args": { "External id": 282768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282768, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282768, "pid": 5, "tid": 7, "ts": 1716454225937010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862130, "dur": 6, "args": { "External id": 282768, "cbid": 211, "correlation": 282768 } }, { "ph": "s", "id": 282768, "pid": 76337, "tid": -914061504, "ts": 1716454225862130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225937126, "dur": 23, "args": { "External id": 282770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282770, "pid": 5, "tid": 7, "ts": 1716454225937126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862140, "dur": 5, "args": { "External id": 282770, "cbid": 211, "correlation": 282770 } }, { "ph": "s", "id": 282770, "pid": 76337, "tid": -914061504, "ts": 1716454225862140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225937151, "dur": 31, "args": { "External id": 282776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282776, "pid": 5, "tid": 7, "ts": 1716454225937151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862167, "dur": 9, "args": { "External id": 282776, "cbid": 211, "correlation": 282776 } }, { "ph": "s", "id": 282776, "pid": 76337, "tid": -914061504, "ts": 1716454225862167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225937183, "dur": 28, "args": { "External id": 282784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282784, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282784, "pid": 5, "tid": 7, "ts": 1716454225937183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862199, "dur": 8, "args": { "External id": 282784, "cbid": 211, "correlation": 282784 } }, { "ph": "s", "id": 282784, "pid": 76337, "tid": -914061504, "ts": 1716454225862199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454225937212, "dur": 98, "args": { "External id": 282795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282795, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282795, "pid": 5, "tid": 7, "ts": 1716454225937212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862265, "dur": 13, "args": { "External id": 282795, "cbid": 211, "correlation": 282795 } }, { "ph": "s", "id": 282795, "pid": 76337, "tid": -914061504, "ts": 1716454225862265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225862322, "dur": 0, "args": { "External id": 282805, "cbid": 317, "correlation": 282805 } }, { "ph": "f", "id": 282805, "pid": 76337, "tid": -914061504, "ts": 1716454225862322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225862323, "dur": 0, "args": { "External id": 282806, "cbid": 203, "correlation": 282806 } }, { "ph": "f", "id": 282806, "pid": 76337, "tid": -914061504, "ts": 1716454225862323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225862324, "dur": 0, "args": { "External id": 282807, "cbid": 205, "correlation": 282807 } }, { "ph": "f", "id": 282807, "pid": 76337, "tid": -914061504, "ts": 1716454225862324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225937311, "dur": 74, "args": { "External id": 282811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282811, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282811, "pid": 5, "tid": 7, "ts": 1716454225937311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862338, "dur": 11, "args": { "External id": 282811, "cbid": 211, "correlation": 282811 } }, { "ph": "s", "id": 282811, "pid": 76337, "tid": -914061504, "ts": 1716454225862338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225937387, "dur": 42, "args": { "External id": 282813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282813, "pid": 5, "tid": 7, "ts": 1716454225937387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862352, "dur": 5, "args": { "External id": 282813, "cbid": 211, "correlation": 282813 } }, { "ph": "s", "id": 282813, "pid": 76337, "tid": -914061504, "ts": 1716454225862352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225937430, "dur": 4, "args": { "External id": 282815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 282815, "pid": 5, "tid": 7, "ts": 1716454225937430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862363, "dur": 7, "args": { "External id": 282815, "cbid": 211, "correlation": 282815 } }, { "ph": "s", "id": 282815, "pid": 76337, "tid": -914061504, "ts": 1716454225862363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225862373, "dur": 0, "args": { "External id": 282816, "cbid": 51, "correlation": 282816 } }, { "ph": "s", "id": 282816, "pid": 76337, "tid": -914061504, "ts": 1716454225862373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225937435, "dur": 2195, "args": { "External id": 282817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282817, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282817, "pid": 5, "tid": 7, "ts": 1716454225937435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862374, "dur": 6, "args": { "External id": 282817, "cbid": 211, "correlation": 282817 } }, { "ph": "s", "id": 282817, "pid": 76337, "tid": -914061504, "ts": 1716454225862374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225939631, "dur": 109, "args": { "External id": 282822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282822, "pid": 5, "tid": 7, "ts": 1716454225939631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862402, "dur": 8, "args": { "External id": 282822, "cbid": 211, "correlation": 282822 } }, { "ph": "s", "id": 282822, "pid": 76337, "tid": -914061504, "ts": 1716454225862402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225939742, "dur": 162, "args": { "External id": 282831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282831, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282831, "pid": 5, "tid": 7, "ts": 1716454225939742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862497, "dur": 13, "args": { "External id": 282831, "cbid": 211, "correlation": 282831 } }, { "ph": "s", "id": 282831, "pid": 76337, "tid": -914061504, "ts": 1716454225862497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225939905, "dur": 125, "args": { "External id": 282851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282851, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 282851, "pid": 5, "tid": 7, "ts": 1716454225939905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862568, "dur": 12, "args": { "External id": 282851, "cbid": 211, "correlation": 282851 } }, { "ph": "s", "id": 282851, "pid": 76337, "tid": -914061504, "ts": 1716454225862568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225940031, "dur": 4, "args": { "External id": 282863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282863, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 282863, "pid": 5, "tid": 7, "ts": 1716454225940031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862602, "dur": 7, "args": { "External id": 282863, "cbid": 211, "correlation": 282863 } }, { "ph": "s", "id": 282863, "pid": 76337, "tid": -914061504, "ts": 1716454225862602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225940037, "dur": 157, "args": { "External id": 282866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282866, "pid": 5, "tid": 7, "ts": 1716454225940037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862621, "dur": 7, "args": { "External id": 282866, "cbid": 211, "correlation": 282866 } }, { "ph": "s", "id": 282866, "pid": 76337, "tid": -914061504, "ts": 1716454225862621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225940195, "dur": 101, "args": { "External id": 282875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282875, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282875, "pid": 5, "tid": 7, "ts": 1716454225940195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862662, "dur": 10, "args": { "External id": 282875, "cbid": 211, "correlation": 282875 } }, { "ph": "s", "id": 282875, "pid": 76337, "tid": -914061504, "ts": 1716454225862662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225862716, "dur": 0, "args": { "External id": 282885, "cbid": 317, "correlation": 282885 } }, { "ph": "f", "id": 282885, "pid": 76337, "tid": -914061504, "ts": 1716454225862716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225862716, "dur": 0, "args": { "External id": 282886, "cbid": 203, "correlation": 282886 } }, { "ph": "f", "id": 282886, "pid": 76337, "tid": -914061504, "ts": 1716454225862716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225862717, "dur": 0, "args": { "External id": 282887, "cbid": 205, "correlation": 282887 } }, { "ph": "f", "id": 282887, "pid": 76337, "tid": -914061504, "ts": 1716454225862717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225940297, "dur": 109, "args": { "External id": 282891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282891, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282891, "pid": 5, "tid": 7, "ts": 1716454225940297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862734, "dur": 12, "args": { "External id": 282891, "cbid": 211, "correlation": 282891 } }, { "ph": "s", "id": 282891, "pid": 76337, "tid": -914061504, "ts": 1716454225862734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225940407, "dur": 32, "args": { "External id": 282893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282893, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282893, "pid": 5, "tid": 7, "ts": 1716454225940407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862748, "dur": 5, "args": { "External id": 282893, "cbid": 211, "correlation": 282893 } }, { "ph": "s", "id": 282893, "pid": 76337, "tid": -914061504, "ts": 1716454225862748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225940441, "dur": 4, "args": { "External id": 282895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282895, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 282895, "pid": 5, "tid": 7, "ts": 1716454225940441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862758, "dur": 6, "args": { "External id": 282895, "cbid": 211, "correlation": 282895 } }, { "ph": "s", "id": 282895, "pid": 76337, "tid": -914061504, "ts": 1716454225862758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225862767, "dur": 0, "args": { "External id": 282896, "cbid": 51, "correlation": 282896 } }, { "ph": "s", "id": 282896, "pid": 76337, "tid": -914061504, "ts": 1716454225862767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225940445, "dur": 1958, "args": { "External id": 282897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282897, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 282897, "pid": 5, "tid": 7, "ts": 1716454225940445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862768, "dur": 8, "args": { "External id": 282897, "cbid": 211, "correlation": 282897 } }, { "ph": "s", "id": 282897, "pid": 76337, "tid": -914061504, "ts": 1716454225862768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225942404, "dur": 58, "args": { "External id": 282902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282902, "pid": 5, "tid": 7, "ts": 1716454225942404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862803, "dur": 9, "args": { "External id": 282902, "cbid": 211, "correlation": 282902 } }, { "ph": "s", "id": 282902, "pid": 76337, "tid": -914061504, "ts": 1716454225862803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225942463, "dur": 4, "args": { "External id": 282910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282910, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 282910, "pid": 5, "tid": 7, "ts": 1716454225942463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862847, "dur": 9, "args": { "External id": 282910, "cbid": 211, "correlation": 282910 } }, { "ph": "s", "id": 282910, "pid": 76337, "tid": -914061504, "ts": 1716454225862847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225862910, "dur": 1, "args": { "External id": 282926, "cbid": 251, "correlation": 282926 } }, { "ph": "f", "id": 282926, "pid": 76337, "tid": -914061504, "ts": 1716454225862910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225862915, "dur": 0, "args": { "External id": 282928, "cbid": 251, "correlation": 282928 } }, { "ph": "f", "id": 282928, "pid": 76337, "tid": -914061504, "ts": 1716454225862915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225942469, "dur": 11, "args": { "External id": 282929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282929, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 282929, "pid": 5, "tid": 7, "ts": 1716454225942469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862917, "dur": 12, "args": { "External id": 282929, "cbid": 211, "correlation": 282929 } }, { "ph": "s", "id": 282929, "pid": 76337, "tid": -914061504, "ts": 1716454225862917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225942481, "dur": 5, "args": { "External id": 282931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282931, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 282931, "pid": 5, "tid": 7, "ts": 1716454225942481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862931, "dur": 6, "args": { "External id": 282931, "cbid": 211, "correlation": 282931 } }, { "ph": "s", "id": 282931, "pid": 76337, "tid": -914061504, "ts": 1716454225862931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225942487, "dur": 51, "args": { "External id": 282941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282941, "pid": 5, "tid": 7, "ts": 1716454225942487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225862995, "dur": 13, "args": { "External id": 282941, "cbid": 211, "correlation": 282941 } }, { "ph": "s", "id": 282941, "pid": 76337, "tid": -914061504, "ts": 1716454225862995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225942540, "dur": 49, "args": { "External id": 282961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282961, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 282961, "pid": 5, "tid": 7, "ts": 1716454225942540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863064, "dur": 11, "args": { "External id": 282961, "cbid": 211, "correlation": 282961 } }, { "ph": "s", "id": 282961, "pid": 76337, "tid": -914061504, "ts": 1716454225863064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225942590, "dur": 4, "args": { "External id": 282973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282973, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 282973, "pid": 5, "tid": 7, "ts": 1716454225942590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863085, "dur": 6, "args": { "External id": 282973, "cbid": 211, "correlation": 282973 } }, { "ph": "s", "id": 282973, "pid": 76337, "tid": -914061504, "ts": 1716454225863085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225942595, "dur": 54, "args": { "External id": 282976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282976, "pid": 5, "tid": 7, "ts": 1716454225942595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863104, "dur": 6, "args": { "External id": 282976, "cbid": 211, "correlation": 282976 } }, { "ph": "s", "id": 282976, "pid": 76337, "tid": -914061504, "ts": 1716454225863104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225942650, "dur": 36, "args": { "External id": 282985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 282985, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 282985, "pid": 5, "tid": 7, "ts": 1716454225942650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863144, "dur": 9, "args": { "External id": 282985, "cbid": 211, "correlation": 282985 } }, { "ph": "s", "id": 282985, "pid": 76337, "tid": -914061504, "ts": 1716454225863144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225863206, "dur": 0, "args": { "External id": 282995, "cbid": 317, "correlation": 282995 } }, { "ph": "f", "id": 282995, "pid": 76337, "tid": -914061504, "ts": 1716454225863206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225863207, "dur": 0, "args": { "External id": 282996, "cbid": 203, "correlation": 282996 } }, { "ph": "f", "id": 282996, "pid": 76337, "tid": -914061504, "ts": 1716454225863207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225863208, "dur": 0, "args": { "External id": 282997, "cbid": 205, "correlation": 282997 } }, { "ph": "f", "id": 282997, "pid": 76337, "tid": -914061504, "ts": 1716454225863208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225942687, "dur": 39, "args": { "External id": 283001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283001, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283001, "pid": 5, "tid": 7, "ts": 1716454225942687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863226, "dur": 13, "args": { "External id": 283001, "cbid": 211, "correlation": 283001 } }, { "ph": "s", "id": 283001, "pid": 76337, "tid": -914061504, "ts": 1716454225863226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225942728, "dur": 13, "args": { "External id": 283003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283003, "pid": 5, "tid": 7, "ts": 1716454225942728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863242, "dur": 5, "args": { "External id": 283003, "cbid": 211, "correlation": 283003 } }, { "ph": "s", "id": 283003, "pid": 76337, "tid": -914061504, "ts": 1716454225863242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225942743, "dur": 3, "args": { "External id": 283005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283005, "pid": 5, "tid": 7, "ts": 1716454225942743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863252, "dur": 6, "args": { "External id": 283005, "cbid": 211, "correlation": 283005 } }, { "ph": "s", "id": 283005, "pid": 76337, "tid": -914061504, "ts": 1716454225863252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225863261, "dur": 0, "args": { "External id": 283006, "cbid": 51, "correlation": 283006 } }, { "ph": "s", "id": 283006, "pid": 76337, "tid": -914061504, "ts": 1716454225863261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225942747, "dur": 676, "args": { "External id": 283007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283007, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283007, "pid": 5, "tid": 7, "ts": 1716454225942747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863262, "dur": 5, "args": { "External id": 283007, "cbid": 211, "correlation": 283007 } }, { "ph": "s", "id": 283007, "pid": 76337, "tid": -914061504, "ts": 1716454225863262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225943424, "dur": 58, "args": { "External id": 283012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283012, "pid": 5, "tid": 7, "ts": 1716454225943424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863290, "dur": 8, "args": { "External id": 283012, "cbid": 211, "correlation": 283012 } }, { "ph": "s", "id": 283012, "pid": 76337, "tid": -914061504, "ts": 1716454225863290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225863346, "dur": 0, "args": { "External id": 283022, "cbid": 317, "correlation": 283022 } }, { "ph": "f", "id": 283022, "pid": 76337, "tid": -914061504, "ts": 1716454225863346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225863347, "dur": 0, "args": { "External id": 283023, "cbid": 203, "correlation": 283023 } }, { "ph": "f", "id": 283023, "pid": 76337, "tid": -914061504, "ts": 1716454225863347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225863348, "dur": 0, "args": { "External id": 283024, "cbid": 205, "correlation": 283024 } }, { "ph": "f", "id": 283024, "pid": 76337, "tid": -914061504, "ts": 1716454225863348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225943484, "dur": 3, "args": { "External id": 283028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283028, "pid": 5, "tid": 7, "ts": 1716454225943484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863364, "dur": 11, "args": { "External id": 283028, "cbid": 211, "correlation": 283028 } }, { "ph": "s", "id": 283028, "pid": 76337, "tid": -914061504, "ts": 1716454225863364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225863379, "dur": 0, "args": { "External id": 283029, "cbid": 51, "correlation": 283029 } }, { "ph": "s", "id": 283029, "pid": 76337, "tid": -914061504, "ts": 1716454225863379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454225943488, "dur": 257, "args": { "External id": 283030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283030, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283030, "pid": 5, "tid": 7, "ts": 1716454225943488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863381, "dur": 8, "args": { "External id": 283030, "cbid": 211, "correlation": 283030 } }, { "ph": "s", "id": 283030, "pid": 76337, "tid": -914061504, "ts": 1716454225863381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225943746, "dur": 57, "args": { "External id": 283035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283035, "pid": 5, "tid": 7, "ts": 1716454225943746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863408, "dur": 8, "args": { "External id": 283035, "cbid": 211, "correlation": 283035 } }, { "ph": "s", "id": 283035, "pid": 76337, "tid": -914061504, "ts": 1716454225863408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225943805, "dur": 50, "args": { "External id": 283043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283043, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283043, "pid": 5, "tid": 7, "ts": 1716454225943805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863437, "dur": 8, "args": { "External id": 283043, "cbid": 211, "correlation": 283043 } }, { "ph": "s", "id": 283043, "pid": 76337, "tid": -914061504, "ts": 1716454225863437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225943856, "dur": 35, "args": { "External id": 283051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283051, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283051, "pid": 5, "tid": 7, "ts": 1716454225943856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863465, "dur": 8, "args": { "External id": 283051, "cbid": 211, "correlation": 283051 } }, { "ph": "s", "id": 283051, "pid": 76337, "tid": -914061504, "ts": 1716454225863465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225943892, "dur": 50, "args": { "External id": 283071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283071, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 283071, "pid": 5, "tid": 7, "ts": 1716454225943892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863547, "dur": 12, "args": { "External id": 283071, "cbid": 211, "correlation": 283071 } }, { "ph": "s", "id": 283071, "pid": 76337, "tid": -914061504, "ts": 1716454225863547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225943944, "dur": 4, "args": { "External id": 283083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283083, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283083, "pid": 5, "tid": 7, "ts": 1716454225943944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863572, "dur": 7, "args": { "External id": 283083, "cbid": 211, "correlation": 283083 } }, { "ph": "s", "id": 283083, "pid": 76337, "tid": -914061504, "ts": 1716454225863572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225943949, "dur": 53, "args": { "External id": 283086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283086, "pid": 5, "tid": 7, "ts": 1716454225943949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863590, "dur": 8, "args": { "External id": 283086, "cbid": 211, "correlation": 283086 } }, { "ph": "s", "id": 283086, "pid": 76337, "tid": -914061504, "ts": 1716454225863590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225863649, "dur": 0, "args": { "External id": 283097, "cbid": 317, "correlation": 283097 } }, { "ph": "f", "id": 283097, "pid": 76337, "tid": -914061504, "ts": 1716454225863649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225863650, "dur": 0, "args": { "External id": 283098, "cbid": 203, "correlation": 283098 } }, { "ph": "f", "id": 283098, "pid": 76337, "tid": -914061504, "ts": 1716454225863650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225863651, "dur": 0, "args": { "External id": 283099, "cbid": 205, "correlation": 283099 } }, { "ph": "f", "id": 283099, "pid": 76337, "tid": -914061504, "ts": 1716454225863651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863684, "dur": 2, "args": { "External id": 283103, "cbid": 251, "correlation": 283103 } }, { "ph": "f", "id": 283103, "pid": 76337, "tid": -914061504, "ts": 1716454225863684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863688, "dur": 1, "args": { "External id": 283104, "cbid": 251, "correlation": 283104 } }, { "ph": "f", "id": 283104, "pid": 76337, "tid": -914061504, "ts": 1716454225863688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863690, "dur": 1, "args": { "External id": 283105, "cbid": 251, "correlation": 283105 } }, { "ph": "f", "id": 283105, "pid": 76337, "tid": -914061504, "ts": 1716454225863690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863692, "dur": 1, "args": { "External id": 283106, "cbid": 251, "correlation": 283106 } }, { "ph": "f", "id": 283106, "pid": 76337, "tid": -914061504, "ts": 1716454225863692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863694, "dur": 1, "args": { "External id": 283107, "cbid": 251, "correlation": 283107 } }, { "ph": "f", "id": 283107, "pid": 76337, "tid": -914061504, "ts": 1716454225863694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863695, "dur": 1, "args": { "External id": 283108, "cbid": 251, "correlation": 283108 } }, { "ph": "f", "id": 283108, "pid": 76337, "tid": -914061504, "ts": 1716454225863695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863697, "dur": 1, "args": { "External id": 283109, "cbid": 251, "correlation": 283109 } }, { "ph": "f", "id": 283109, "pid": 76337, "tid": -914061504, "ts": 1716454225863697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863699, "dur": 1, "args": { "External id": 283110, "cbid": 251, "correlation": 283110 } }, { "ph": "f", "id": 283110, "pid": 76337, "tid": -914061504, "ts": 1716454225863699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863702, "dur": 0, "args": { "External id": 283111, "cbid": 251, "correlation": 283111 } }, { "ph": "f", "id": 283111, "pid": 76337, "tid": -914061504, "ts": 1716454225863702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225944003, "dur": 110, "args": { "External id": 283112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283112, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283112, "pid": 5, "tid": 7, "ts": 1716454225944003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863705, "dur": 13, "args": { "External id": 283112, "cbid": 211, "correlation": 283112 } }, { "ph": "s", "id": 283112, "pid": 76337, "tid": -914061504, "ts": 1716454225863705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225944114, "dur": 58, "args": { "External id": 283118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283118, "pid": 5, "tid": 7, "ts": 1716454225944114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863744, "dur": 9, "args": { "External id": 283118, "cbid": 211, "correlation": 283118 } }, { "ph": "s", "id": 283118, "pid": 76337, "tid": -914061504, "ts": 1716454225863744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225944173, "dur": 604, "args": { "External id": 283127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283127, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283127, "pid": 5, "tid": 7, "ts": 1716454225944173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863827, "dur": 14, "args": { "External id": 283127, "cbid": 211, "correlation": 283127 } }, { "ph": "s", "id": 283127, "pid": 76337, "tid": -914061504, "ts": 1716454225863827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225944779, "dur": 172, "args": { "External id": 283149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283149, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283149, "pid": 5, "tid": 7, "ts": 1716454225944779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863887, "dur": 11, "args": { "External id": 283149, "cbid": 211, "correlation": 283149 } }, { "ph": "s", "id": 283149, "pid": 76337, "tid": -914061504, "ts": 1716454225863887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225863987, "dur": 1, "args": { "External id": 283160, "cbid": 251, "correlation": 283160 } }, { "ph": "f", "id": 283160, "pid": 76337, "tid": -914061504, "ts": 1716454225863987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225944952, "dur": 195, "args": { "External id": 283161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283161, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283161, "pid": 5, "tid": 7, "ts": 1716454225944952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225863993, "dur": 13, "args": { "External id": 283161, "cbid": 211, "correlation": 283161 } }, { "ph": "s", "id": 283161, "pid": 76337, "tid": -914061504, "ts": 1716454225863993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864067, "dur": 1, "args": { "External id": 283172, "cbid": 251, "correlation": 283172 } }, { "ph": "f", "id": 283172, "pid": 76337, "tid": -914061504, "ts": 1716454225864067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225945148, "dur": 181, "args": { "External id": 283173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283173, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283173, "pid": 5, "tid": 7, "ts": 1716454225945148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864070, "dur": 12, "args": { "External id": 283173, "cbid": 211, "correlation": 283173 } }, { "ph": "s", "id": 283173, "pid": 76337, "tid": -914061504, "ts": 1716454225864070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864134, "dur": 1, "args": { "External id": 283184, "cbid": 251, "correlation": 283184 } }, { "ph": "f", "id": 283184, "pid": 76337, "tid": -914061504, "ts": 1716454225864134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225945330, "dur": 178, "args": { "External id": 283185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283185, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283185, "pid": 5, "tid": 7, "ts": 1716454225945330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864138, "dur": 12, "args": { "External id": 283185, "cbid": 211, "correlation": 283185 } }, { "ph": "s", "id": 283185, "pid": 76337, "tid": -914061504, "ts": 1716454225864138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225945509, "dur": 17570, "args": { "External id": 283206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283206, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283206, "pid": 5, "tid": 7, "ts": 1716454225945509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864222, "dur": 15, "args": { "External id": 283206, "cbid": 211, "correlation": 283206 } }, { "ph": "s", "id": 283206, "pid": 76337, "tid": -914061504, "ts": 1716454225864222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864325, "dur": 1, "args": { "External id": 283224, "cbid": 251, "correlation": 283224 } }, { "ph": "f", "id": 283224, "pid": 76337, "tid": -914061504, "ts": 1716454225864325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225963080, "dur": 193, "args": { "External id": 283226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283226, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283226, "pid": 5, "tid": 7, "ts": 1716454225963080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864331, "dur": 14, "args": { "External id": 283226, "cbid": 211, "correlation": 283226 } }, { "ph": "s", "id": 283226, "pid": 76337, "tid": -914061504, "ts": 1716454225864331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225963274, "dur": 67, "args": { "External id": 283234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283234, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283234, "pid": 5, "tid": 7, "ts": 1716454225963274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864401, "dur": 12, "args": { "External id": 283234, "cbid": 211, "correlation": 283234 } }, { "ph": "s", "id": 283234, "pid": 76337, "tid": -914061504, "ts": 1716454225864401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225963342, "dur": 98, "args": { "External id": 283242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283242, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283242, "pid": 5, "tid": 7, "ts": 1716454225963342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864440, "dur": 9, "args": { "External id": 283242, "cbid": 211, "correlation": 283242 } }, { "ph": "s", "id": 283242, "pid": 76337, "tid": -914061504, "ts": 1716454225864440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225963441, "dur": 53, "args": { "External id": 283253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283253, "pid": 5, "tid": 7, "ts": 1716454225963441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864511, "dur": 12, "args": { "External id": 283253, "cbid": 211, "correlation": 283253 } }, { "ph": "s", "id": 283253, "pid": 76337, "tid": -914061504, "ts": 1716454225864511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225963496, "dur": 87, "args": { "External id": 283275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283275, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283275, "pid": 5, "tid": 7, "ts": 1716454225963496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864545, "dur": 8, "args": { "External id": 283275, "cbid": 211, "correlation": 283275 } }, { "ph": "s", "id": 283275, "pid": 76337, "tid": -914061504, "ts": 1716454225864545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864630, "dur": 1, "args": { "External id": 283286, "cbid": 251, "correlation": 283286 } }, { "ph": "f", "id": 283286, "pid": 76337, "tid": -914061504, "ts": 1716454225864630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225963584, "dur": 99, "args": { "External id": 283287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283287, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283287, "pid": 5, "tid": 7, "ts": 1716454225963584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864636, "dur": 14, "args": { "External id": 283287, "cbid": 211, "correlation": 283287 } }, { "ph": "s", "id": 283287, "pid": 76337, "tid": -914061504, "ts": 1716454225864636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864716, "dur": 1, "args": { "External id": 283298, "cbid": 251, "correlation": 283298 } }, { "ph": "f", "id": 283298, "pid": 76337, "tid": -914061504, "ts": 1716454225864716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864721, "dur": 0, "args": { "External id": 283299, "cbid": 251, "correlation": 283299 } }, { "ph": "f", "id": 283299, "pid": 76337, "tid": -914061504, "ts": 1716454225864721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225963684, "dur": 10, "args": { "External id": 283300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283300, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 283300, "pid": 5, "tid": 7, "ts": 1716454225963684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864723, "dur": 14, "args": { "External id": 283300, "cbid": 211, "correlation": 283300 } }, { "ph": "s", "id": 283300, "pid": 76337, "tid": -914061504, "ts": 1716454225864723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225963695, "dur": 5, "args": { "External id": 283302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283302, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 283302, "pid": 5, "tid": 7, "ts": 1716454225963695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864740, "dur": 9, "args": { "External id": 283302, "cbid": 211, "correlation": 283302 } }, { "ph": "s", "id": 283302, "pid": 76337, "tid": -914061504, "ts": 1716454225864740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864807, "dur": 1, "args": { "External id": 283313, "cbid": 251, "correlation": 283313 } }, { "ph": "f", "id": 283313, "pid": 76337, "tid": -914061504, "ts": 1716454225864807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225864811, "dur": 0, "args": { "External id": 283314, "cbid": 251, "correlation": 283314 } }, { "ph": "f", "id": 283314, "pid": 76337, "tid": -914061504, "ts": 1716454225864811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225963701, "dur": 6, "args": { "External id": 283315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283315, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 283315, "pid": 5, "tid": 7, "ts": 1716454225963701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864812, "dur": 12, "args": { "External id": 283315, "cbid": 211, "correlation": 283315 } }, { "ph": "s", "id": 283315, "pid": 76337, "tid": -914061504, "ts": 1716454225864812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225963708, "dur": 3, "args": { "External id": 283317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283317, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 283317, "pid": 5, "tid": 7, "ts": 1716454225963708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864826, "dur": 6, "args": { "External id": 283317, "cbid": 211, "correlation": 283317 } }, { "ph": "s", "id": 283317, "pid": 76337, "tid": -914061504, "ts": 1716454225864826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225963713, "dur": 151, "args": { "External id": 283338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283338, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283338, "pid": 5, "tid": 7, "ts": 1716454225963713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225864901, "dur": 12, "args": { "External id": 283338, "cbid": 211, "correlation": 283338 } }, { "ph": "s", "id": 283338, "pid": 76337, "tid": -914061504, "ts": 1716454225864901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865007, "dur": 2, "args": { "External id": 283356, "cbid": 251, "correlation": 283356 } }, { "ph": "f", "id": 283356, "pid": 76337, "tid": -914061504, "ts": 1716454225865007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225963865, "dur": 102, "args": { "External id": 283358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283358, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283358, "pid": 5, "tid": 7, "ts": 1716454225963865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865013, "dur": 14, "args": { "External id": 283358, "cbid": 211, "correlation": 283358 } }, { "ph": "s", "id": 283358, "pid": 76337, "tid": -914061504, "ts": 1716454225865013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225963969, "dur": 35, "args": { "External id": 283366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283366, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283366, "pid": 5, "tid": 7, "ts": 1716454225963969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865083, "dur": 12, "args": { "External id": 283366, "cbid": 211, "correlation": 283366 } }, { "ph": "s", "id": 283366, "pid": 76337, "tid": -914061504, "ts": 1716454225865083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225964005, "dur": 65, "args": { "External id": 283374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283374, "pid": 5, "tid": 7, "ts": 1716454225964005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865124, "dur": 9, "args": { "External id": 283374, "cbid": 211, "correlation": 283374 } }, { "ph": "s", "id": 283374, "pid": 76337, "tid": -914061504, "ts": 1716454225865124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225964071, "dur": 87, "args": { "External id": 283396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283396, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283396, "pid": 5, "tid": 7, "ts": 1716454225964071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865175, "dur": 10, "args": { "External id": 283396, "cbid": 211, "correlation": 283396 } }, { "ph": "s", "id": 283396, "pid": 76337, "tid": -914061504, "ts": 1716454225865175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865260, "dur": 1, "args": { "External id": 283412, "cbid": 251, "correlation": 283412 } }, { "ph": "f", "id": 283412, "pid": 76337, "tid": -914061504, "ts": 1716454225865260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225964159, "dur": 558, "args": { "External id": 283414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283414, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283414, "pid": 5, "tid": 7, "ts": 1716454225964159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865266, "dur": 12, "args": { "External id": 283414, "cbid": 211, "correlation": 283414 } }, { "ph": "s", "id": 283414, "pid": 76337, "tid": -914061504, "ts": 1716454225865266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225964718, "dur": 236, "args": { "External id": 283422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283422, "pid": 5, "tid": 7, "ts": 1716454225964718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865331, "dur": 12, "args": { "External id": 283422, "cbid": 211, "correlation": 283422 } }, { "ph": "s", "id": 283422, "pid": 76337, "tid": -914061504, "ts": 1716454225865331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225964956, "dur": 251, "args": { "External id": 283430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283430, "pid": 5, "tid": 7, "ts": 1716454225964956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865363, "dur": 8, "args": { "External id": 283430, "cbid": 211, "correlation": 283430 } }, { "ph": "s", "id": 283430, "pid": 76337, "tid": -914061504, "ts": 1716454225865363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865444, "dur": 1, "args": { "External id": 283446, "cbid": 251, "correlation": 283446 } }, { "ph": "f", "id": 283446, "pid": 76337, "tid": -914061504, "ts": 1716454225865444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865449, "dur": 0, "args": { "External id": 283448, "cbid": 251, "correlation": 283448 } }, { "ph": "f", "id": 283448, "pid": 76337, "tid": -914061504, "ts": 1716454225865449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225965208, "dur": 353, "args": { "External id": 283449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283449, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 283449, "pid": 5, "tid": 7, "ts": 1716454225965208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865452, "dur": 13, "args": { "External id": 283449, "cbid": 211, "correlation": 283449 } }, { "ph": "s", "id": 283449, "pid": 76337, "tid": -914061504, "ts": 1716454225865452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225965562, "dur": 50, "args": { "External id": 283457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283457, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283457, "pid": 5, "tid": 7, "ts": 1716454225965562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865493, "dur": 11, "args": { "External id": 283457, "cbid": 211, "correlation": 283457 } }, { "ph": "s", "id": 283457, "pid": 76337, "tid": -914061504, "ts": 1716454225865493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225965614, "dur": 151, "args": { "External id": 283468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283468, "pid": 5, "tid": 7, "ts": 1716454225965614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865562, "dur": 12, "args": { "External id": 283468, "cbid": 211, "correlation": 283468 } }, { "ph": "s", "id": 283468, "pid": 76337, "tid": -914061504, "ts": 1716454225865562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225865625, "dur": 0, "args": { "External id": 283480, "cbid": 317, "correlation": 283480 } }, { "ph": "f", "id": 283480, "pid": 76337, "tid": -914061504, "ts": 1716454225865625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225865626, "dur": 0, "args": { "External id": 283481, "cbid": 203, "correlation": 283481 } }, { "ph": "f", "id": 283481, "pid": 76337, "tid": -914061504, "ts": 1716454225865626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225865627, "dur": 0, "args": { "External id": 283482, "cbid": 205, "correlation": 283482 } }, { "ph": "f", "id": 283482, "pid": 76337, "tid": -914061504, "ts": 1716454225865627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865650, "dur": 1, "args": { "External id": 283486, "cbid": 251, "correlation": 283486 } }, { "ph": "f", "id": 283486, "pid": 76337, "tid": -914061504, "ts": 1716454225865650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865652, "dur": 1, "args": { "External id": 283487, "cbid": 251, "correlation": 283487 } }, { "ph": "f", "id": 283487, "pid": 76337, "tid": -914061504, "ts": 1716454225865652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865654, "dur": 0, "args": { "External id": 283488, "cbid": 251, "correlation": 283488 } }, { "ph": "f", "id": 283488, "pid": 76337, "tid": -914061504, "ts": 1716454225865654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865655, "dur": 0, "args": { "External id": 283489, "cbid": 251, "correlation": 283489 } }, { "ph": "f", "id": 283489, "pid": 76337, "tid": -914061504, "ts": 1716454225865655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865656, "dur": 0, "args": { "External id": 283490, "cbid": 251, "correlation": 283490 } }, { "ph": "f", "id": 283490, "pid": 76337, "tid": -914061504, "ts": 1716454225865656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865657, "dur": 0, "args": { "External id": 283491, "cbid": 251, "correlation": 283491 } }, { "ph": "f", "id": 283491, "pid": 76337, "tid": -914061504, "ts": 1716454225865657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865658, "dur": 0, "args": { "External id": 283492, "cbid": 251, "correlation": 283492 } }, { "ph": "f", "id": 283492, "pid": 76337, "tid": -914061504, "ts": 1716454225865658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865659, "dur": 0, "args": { "External id": 283493, "cbid": 251, "correlation": 283493 } }, { "ph": "f", "id": 283493, "pid": 76337, "tid": -914061504, "ts": 1716454225865659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225865660, "dur": 0, "args": { "External id": 283494, "cbid": 251, "correlation": 283494 } }, { "ph": "f", "id": 283494, "pid": 76337, "tid": -914061504, "ts": 1716454225865660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225965766, "dur": 111, "args": { "External id": 283495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283495, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283495, "pid": 5, "tid": 7, "ts": 1716454225965766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865662, "dur": 12, "args": { "External id": 283495, "cbid": 211, "correlation": 283495 } }, { "ph": "s", "id": 283495, "pid": 76337, "tid": -914061504, "ts": 1716454225865662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225965878, "dur": 58, "args": { "External id": 283501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283501, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283501, "pid": 5, "tid": 7, "ts": 1716454225965878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865696, "dur": 9, "args": { "External id": 283501, "cbid": 211, "correlation": 283501 } }, { "ph": "s", "id": 283501, "pid": 76337, "tid": -914061504, "ts": 1716454225865696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225965937, "dur": 50, "args": { "External id": 283509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283509, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283509, "pid": 5, "tid": 7, "ts": 1716454225965937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865728, "dur": 8, "args": { "External id": 283509, "cbid": 211, "correlation": 283509 } }, { "ph": "s", "id": 283509, "pid": 76337, "tid": -914061504, "ts": 1716454225865728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225965989, "dur": 96, "args": { "External id": 283518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283518, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283518, "pid": 5, "tid": 7, "ts": 1716454225965989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865767, "dur": 10, "args": { "External id": 283518, "cbid": 211, "correlation": 283518 } }, { "ph": "s", "id": 283518, "pid": 76337, "tid": -914061504, "ts": 1716454225865767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225966086, "dur": 88, "args": { "External id": 283538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283538, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 283538, "pid": 5, "tid": 7, "ts": 1716454225966086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865838, "dur": 11, "args": { "External id": 283538, "cbid": 211, "correlation": 283538 } }, { "ph": "s", "id": 283538, "pid": 76337, "tid": -914061504, "ts": 1716454225865838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225966175, "dur": 4, "args": { "External id": 283550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283550, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 283550, "pid": 5, "tid": 7, "ts": 1716454225966175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865859, "dur": 7, "args": { "External id": 283550, "cbid": 211, "correlation": 283550 } }, { "ph": "s", "id": 283550, "pid": 76337, "tid": -914061504, "ts": 1716454225865859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225966181, "dur": 105, "args": { "External id": 283553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283553, "pid": 5, "tid": 7, "ts": 1716454225966181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865878, "dur": 7, "args": { "External id": 283553, "cbid": 211, "correlation": 283553 } }, { "ph": "s", "id": 283553, "pid": 76337, "tid": -914061504, "ts": 1716454225865878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225966287, "dur": 68, "args": { "External id": 283562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283562, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283562, "pid": 5, "tid": 7, "ts": 1716454225966287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865917, "dur": 9, "args": { "External id": 283562, "cbid": 211, "correlation": 283562 } }, { "ph": "s", "id": 283562, "pid": 76337, "tid": -914061504, "ts": 1716454225865917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225865968, "dur": 0, "args": { "External id": 283572, "cbid": 317, "correlation": 283572 } }, { "ph": "f", "id": 283572, "pid": 76337, "tid": -914061504, "ts": 1716454225865968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225865969, "dur": 0, "args": { "External id": 283573, "cbid": 203, "correlation": 283573 } }, { "ph": "f", "id": 283573, "pid": 76337, "tid": -914061504, "ts": 1716454225865969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225865970, "dur": 0, "args": { "External id": 283574, "cbid": 205, "correlation": 283574 } }, { "ph": "f", "id": 283574, "pid": 76337, "tid": -914061504, "ts": 1716454225865970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225966356, "dur": 76, "args": { "External id": 283578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283578, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283578, "pid": 5, "tid": 7, "ts": 1716454225966356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225865992, "dur": 12, "args": { "External id": 283578, "cbid": 211, "correlation": 283578 } }, { "ph": "s", "id": 283578, "pid": 76337, "tid": -914061504, "ts": 1716454225865992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225966433, "dur": 23, "args": { "External id": 283580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283580, "pid": 5, "tid": 7, "ts": 1716454225966433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866006, "dur": 5, "args": { "External id": 283580, "cbid": 211, "correlation": 283580 } }, { "ph": "s", "id": 283580, "pid": 76337, "tid": -914061504, "ts": 1716454225866006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225966458, "dur": 4, "args": { "External id": 283582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283582, "pid": 5, "tid": 7, "ts": 1716454225966458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866016, "dur": 6, "args": { "External id": 283582, "cbid": 211, "correlation": 283582 } }, { "ph": "s", "id": 283582, "pid": 76337, "tid": -914061504, "ts": 1716454225866016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225866025, "dur": 0, "args": { "External id": 283583, "cbid": 51, "correlation": 283583 } }, { "ph": "s", "id": 283583, "pid": 76337, "tid": -914061504, "ts": 1716454225866025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225966463, "dur": 1316, "args": { "External id": 283584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283584, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283584, "pid": 5, "tid": 7, "ts": 1716454225966463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866026, "dur": 5, "args": { "External id": 283584, "cbid": 211, "correlation": 283584 } }, { "ph": "s", "id": 283584, "pid": 76337, "tid": -914061504, "ts": 1716454225866026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225967780, "dur": 57, "args": { "External id": 283589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283589, "pid": 5, "tid": 7, "ts": 1716454225967780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866053, "dur": 8, "args": { "External id": 283589, "cbid": 211, "correlation": 283589 } }, { "ph": "s", "id": 283589, "pid": 76337, "tid": -914061504, "ts": 1716454225866053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225967839, "dur": 4, "args": { "External id": 283597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283597, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283597, "pid": 5, "tid": 7, "ts": 1716454225967839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866096, "dur": 9, "args": { "External id": 283597, "cbid": 211, "correlation": 283597 } }, { "ph": "s", "id": 283597, "pid": 76337, "tid": -914061504, "ts": 1716454225866096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866161, "dur": 2, "args": { "External id": 283613, "cbid": 251, "correlation": 283613 } }, { "ph": "f", "id": 283613, "pid": 76337, "tid": -914061504, "ts": 1716454225866161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866167, "dur": 0, "args": { "External id": 283615, "cbid": 251, "correlation": 283615 } }, { "ph": "f", "id": 283615, "pid": 76337, "tid": -914061504, "ts": 1716454225866167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225967844, "dur": 11, "args": { "External id": 283616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283616, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 283616, "pid": 5, "tid": 7, "ts": 1716454225967844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866170, "dur": 11, "args": { "External id": 283616, "cbid": 211, "correlation": 283616 } }, { "ph": "s", "id": 283616, "pid": 76337, "tid": -914061504, "ts": 1716454225866170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225967857, "dur": 5, "args": { "External id": 283618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283618, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 283618, "pid": 5, "tid": 7, "ts": 1716454225967857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866183, "dur": 5, "args": { "External id": 283618, "cbid": 211, "correlation": 283618 } }, { "ph": "s", "id": 283618, "pid": 76337, "tid": -914061504, "ts": 1716454225866183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225967863, "dur": 53, "args": { "External id": 283628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283628, "pid": 5, "tid": 7, "ts": 1716454225967863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866239, "dur": 12, "args": { "External id": 283628, "cbid": 211, "correlation": 283628 } }, { "ph": "s", "id": 283628, "pid": 76337, "tid": -914061504, "ts": 1716454225866239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225967918, "dur": 49, "args": { "External id": 283648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283648, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 283648, "pid": 5, "tid": 7, "ts": 1716454225967918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866305, "dur": 10, "args": { "External id": 283648, "cbid": 211, "correlation": 283648 } }, { "ph": "s", "id": 283648, "pid": 76337, "tid": -914061504, "ts": 1716454225866305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225967968, "dur": 4, "args": { "External id": 283660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283660, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283660, "pid": 5, "tid": 7, "ts": 1716454225967968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866326, "dur": 6, "args": { "External id": 283660, "cbid": 211, "correlation": 283660 } }, { "ph": "s", "id": 283660, "pid": 76337, "tid": -914061504, "ts": 1716454225866326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225967972, "dur": 53, "args": { "External id": 283663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283663, "pid": 5, "tid": 7, "ts": 1716454225967972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866345, "dur": 6, "args": { "External id": 283663, "cbid": 211, "correlation": 283663 } }, { "ph": "s", "id": 283663, "pid": 76337, "tid": -914061504, "ts": 1716454225866345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225968027, "dur": 36, "args": { "External id": 283672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283672, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283672, "pid": 5, "tid": 7, "ts": 1716454225968027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866385, "dur": 10, "args": { "External id": 283672, "cbid": 211, "correlation": 283672 } }, { "ph": "s", "id": 283672, "pid": 76337, "tid": -914061504, "ts": 1716454225866385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225866446, "dur": 0, "args": { "External id": 283682, "cbid": 317, "correlation": 283682 } }, { "ph": "f", "id": 283682, "pid": 76337, "tid": -914061504, "ts": 1716454225866446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225866447, "dur": 0, "args": { "External id": 283683, "cbid": 203, "correlation": 283683 } }, { "ph": "f", "id": 283683, "pid": 76337, "tid": -914061504, "ts": 1716454225866447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225866448, "dur": 0, "args": { "External id": 283684, "cbid": 205, "correlation": 283684 } }, { "ph": "f", "id": 283684, "pid": 76337, "tid": -914061504, "ts": 1716454225866448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225968064, "dur": 39, "args": { "External id": 283688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283688, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283688, "pid": 5, "tid": 7, "ts": 1716454225968064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866462, "dur": 12, "args": { "External id": 283688, "cbid": 211, "correlation": 283688 } }, { "ph": "s", "id": 283688, "pid": 76337, "tid": -914061504, "ts": 1716454225866462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225968105, "dur": 14, "args": { "External id": 283690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283690, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283690, "pid": 5, "tid": 7, "ts": 1716454225968105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866477, "dur": 5, "args": { "External id": 283690, "cbid": 211, "correlation": 283690 } }, { "ph": "s", "id": 283690, "pid": 76337, "tid": -914061504, "ts": 1716454225866477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225968120, "dur": 3, "args": { "External id": 283692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283692, "pid": 5, "tid": 7, "ts": 1716454225968120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866486, "dur": 5, "args": { "External id": 283692, "cbid": 211, "correlation": 283692 } }, { "ph": "s", "id": 283692, "pid": 76337, "tid": -914061504, "ts": 1716454225866486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225866494, "dur": 0, "args": { "External id": 283693, "cbid": 51, "correlation": 283693 } }, { "ph": "s", "id": 283693, "pid": 76337, "tid": -914061504, "ts": 1716454225866494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225968124, "dur": 675, "args": { "External id": 283694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283694, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283694, "pid": 5, "tid": 7, "ts": 1716454225968124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866495, "dur": 5, "args": { "External id": 283694, "cbid": 211, "correlation": 283694 } }, { "ph": "s", "id": 283694, "pid": 76337, "tid": -914061504, "ts": 1716454225866495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225968801, "dur": 57, "args": { "External id": 283699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283699, "pid": 5, "tid": 7, "ts": 1716454225968801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866522, "dur": 9, "args": { "External id": 283699, "cbid": 211, "correlation": 283699 } }, { "ph": "s", "id": 283699, "pid": 76337, "tid": -914061504, "ts": 1716454225866522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225866579, "dur": 0, "args": { "External id": 283709, "cbid": 317, "correlation": 283709 } }, { "ph": "f", "id": 283709, "pid": 76337, "tid": -914061504, "ts": 1716454225866579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225866580, "dur": 0, "args": { "External id": 283710, "cbid": 203, "correlation": 283710 } }, { "ph": "f", "id": 283710, "pid": 76337, "tid": -914061504, "ts": 1716454225866580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225866581, "dur": 0, "args": { "External id": 283711, "cbid": 205, "correlation": 283711 } }, { "ph": "f", "id": 283711, "pid": 76337, "tid": -914061504, "ts": 1716454225866581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225968859, "dur": 75, "args": { "External id": 283715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283715, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283715, "pid": 5, "tid": 7, "ts": 1716454225968859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866593, "dur": 11, "args": { "External id": 283715, "cbid": 211, "correlation": 283715 } }, { "ph": "s", "id": 283715, "pid": 76337, "tid": -914061504, "ts": 1716454225866593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225968935, "dur": 201, "args": { "External id": 283717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283717, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283717, "pid": 5, "tid": 7, "ts": 1716454225968935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866612, "dur": 8, "args": { "External id": 283717, "cbid": 211, "correlation": 283717 } }, { "ph": "s", "id": 283717, "pid": 76337, "tid": -914061504, "ts": 1716454225866612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225969137, "dur": 39, "args": { "External id": 283719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283719, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283719, "pid": 5, "tid": 7, "ts": 1716454225969137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866625, "dur": 7, "args": { "External id": 283719, "cbid": 211, "correlation": 283719 } }, { "ph": "s", "id": 283719, "pid": 76337, "tid": -914061504, "ts": 1716454225866625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225969178, "dur": 57, "args": { "External id": 283725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283725, "pid": 5, "tid": 7, "ts": 1716454225969178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866653, "dur": 9, "args": { "External id": 283725, "cbid": 211, "correlation": 283725 } }, { "ph": "s", "id": 283725, "pid": 76337, "tid": -914061504, "ts": 1716454225866653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225969236, "dur": 51, "args": { "External id": 283733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283733, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283733, "pid": 5, "tid": 7, "ts": 1716454225969236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866683, "dur": 8, "args": { "External id": 283733, "cbid": 211, "correlation": 283733 } }, { "ph": "s", "id": 283733, "pid": 76337, "tid": -914061504, "ts": 1716454225866683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225969288, "dur": 35, "args": { "External id": 283741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283741, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283741, "pid": 5, "tid": 7, "ts": 1716454225969288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866711, "dur": 8, "args": { "External id": 283741, "cbid": 211, "correlation": 283741 } }, { "ph": "s", "id": 283741, "pid": 76337, "tid": -914061504, "ts": 1716454225866711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225969325, "dur": 49, "args": { "External id": 283761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283761, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 283761, "pid": 5, "tid": 7, "ts": 1716454225969325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866793, "dur": 12, "args": { "External id": 283761, "cbid": 211, "correlation": 283761 } }, { "ph": "s", "id": 283761, "pid": 76337, "tid": -914061504, "ts": 1716454225866793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225969375, "dur": 4, "args": { "External id": 283773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283773, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 283773, "pid": 5, "tid": 7, "ts": 1716454225969375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866818, "dur": 7, "args": { "External id": 283773, "cbid": 211, "correlation": 283773 } }, { "ph": "s", "id": 283773, "pid": 76337, "tid": -914061504, "ts": 1716454225866818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225969381, "dur": 56, "args": { "External id": 283776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283776, "pid": 5, "tid": 7, "ts": 1716454225969381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866837, "dur": 7, "args": { "External id": 283776, "cbid": 211, "correlation": 283776 } }, { "ph": "s", "id": 283776, "pid": 76337, "tid": -914061504, "ts": 1716454225866837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225866895, "dur": 0, "args": { "External id": 283787, "cbid": 317, "correlation": 283787 } }, { "ph": "f", "id": 283787, "pid": 76337, "tid": -914061504, "ts": 1716454225866895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225866896, "dur": 0, "args": { "External id": 283788, "cbid": 203, "correlation": 283788 } }, { "ph": "f", "id": 283788, "pid": 76337, "tid": -914061504, "ts": 1716454225866896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225866897, "dur": 0, "args": { "External id": 283789, "cbid": 205, "correlation": 283789 } }, { "ph": "f", "id": 283789, "pid": 76337, "tid": -914061504, "ts": 1716454225866897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866919, "dur": 1, "args": { "External id": 283793, "cbid": 251, "correlation": 283793 } }, { "ph": "f", "id": 283793, "pid": 76337, "tid": -914061504, "ts": 1716454225866919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866921, "dur": 0, "args": { "External id": 283794, "cbid": 251, "correlation": 283794 } }, { "ph": "f", "id": 283794, "pid": 76337, "tid": -914061504, "ts": 1716454225866921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866921, "dur": 0, "args": { "External id": 283795, "cbid": 251, "correlation": 283795 } }, { "ph": "f", "id": 283795, "pid": 76337, "tid": -914061504, "ts": 1716454225866921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866922, "dur": 0, "args": { "External id": 283796, "cbid": 251, "correlation": 283796 } }, { "ph": "f", "id": 283796, "pid": 76337, "tid": -914061504, "ts": 1716454225866922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866923, "dur": 0, "args": { "External id": 283797, "cbid": 251, "correlation": 283797 } }, { "ph": "f", "id": 283797, "pid": 76337, "tid": -914061504, "ts": 1716454225866923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866923, "dur": 0, "args": { "External id": 283798, "cbid": 251, "correlation": 283798 } }, { "ph": "f", "id": 283798, "pid": 76337, "tid": -914061504, "ts": 1716454225866923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866924, "dur": 0, "args": { "External id": 283799, "cbid": 251, "correlation": 283799 } }, { "ph": "f", "id": 283799, "pid": 76337, "tid": -914061504, "ts": 1716454225866924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866925, "dur": 0, "args": { "External id": 283800, "cbid": 251, "correlation": 283800 } }, { "ph": "f", "id": 283800, "pid": 76337, "tid": -914061504, "ts": 1716454225866925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225866926, "dur": 0, "args": { "External id": 283801, "cbid": 251, "correlation": 283801 } }, { "ph": "f", "id": 283801, "pid": 76337, "tid": -914061504, "ts": 1716454225866926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225969438, "dur": 108, "args": { "External id": 283802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283802, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283802, "pid": 5, "tid": 7, "ts": 1716454225969438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866928, "dur": 13, "args": { "External id": 283802, "cbid": 211, "correlation": 283802 } }, { "ph": "s", "id": 283802, "pid": 76337, "tid": -914061504, "ts": 1716454225866928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225969548, "dur": 57, "args": { "External id": 283808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283808, "pid": 5, "tid": 7, "ts": 1716454225969548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225866963, "dur": 9, "args": { "External id": 283808, "cbid": 211, "correlation": 283808 } }, { "ph": "s", "id": 283808, "pid": 76337, "tid": -914061504, "ts": 1716454225866963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225969606, "dur": 536, "args": { "External id": 283817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283817, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283817, "pid": 5, "tid": 7, "ts": 1716454225969606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867054, "dur": 14, "args": { "External id": 283817, "cbid": 211, "correlation": 283817 } }, { "ph": "s", "id": 283817, "pid": 76337, "tid": -914061504, "ts": 1716454225867054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225970144, "dur": 172, "args": { "External id": 283839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283839, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283839, "pid": 5, "tid": 7, "ts": 1716454225970144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867114, "dur": 11, "args": { "External id": 283839, "cbid": 211, "correlation": 283839 } }, { "ph": "s", "id": 283839, "pid": 76337, "tid": -914061504, "ts": 1716454225867114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867205, "dur": 1, "args": { "External id": 283850, "cbid": 251, "correlation": 283850 } }, { "ph": "f", "id": 283850, "pid": 76337, "tid": -914061504, "ts": 1716454225867205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225970317, "dur": 190, "args": { "External id": 283851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283851, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283851, "pid": 5, "tid": 7, "ts": 1716454225970317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867210, "dur": 13, "args": { "External id": 283851, "cbid": 211, "correlation": 283851 } }, { "ph": "s", "id": 283851, "pid": 76337, "tid": -914061504, "ts": 1716454225867210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867281, "dur": 1, "args": { "External id": 283862, "cbid": 251, "correlation": 283862 } }, { "ph": "f", "id": 283862, "pid": 76337, "tid": -914061504, "ts": 1716454225867281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225970509, "dur": 182, "args": { "External id": 283863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283863, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283863, "pid": 5, "tid": 7, "ts": 1716454225970509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867285, "dur": 12, "args": { "External id": 283863, "cbid": 211, "correlation": 283863 } }, { "ph": "s", "id": 283863, "pid": 76337, "tid": -914061504, "ts": 1716454225867285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867351, "dur": 1, "args": { "External id": 283874, "cbid": 251, "correlation": 283874 } }, { "ph": "f", "id": 283874, "pid": 76337, "tid": -914061504, "ts": 1716454225867351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225970692, "dur": 183, "args": { "External id": 283875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283875, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283875, "pid": 5, "tid": 7, "ts": 1716454225970692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867355, "dur": 11, "args": { "External id": 283875, "cbid": 211, "correlation": 283875 } }, { "ph": "s", "id": 283875, "pid": 76337, "tid": -914061504, "ts": 1716454225867355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225970876, "dur": 17654, "args": { "External id": 283896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283896, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 283896, "pid": 5, "tid": 7, "ts": 1716454225970876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867435, "dur": 13, "args": { "External id": 283896, "cbid": 211, "correlation": 283896 } }, { "ph": "s", "id": 283896, "pid": 76337, "tid": -914061504, "ts": 1716454225867435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867534, "dur": 1, "args": { "External id": 283914, "cbid": 251, "correlation": 283914 } }, { "ph": "f", "id": 283914, "pid": 76337, "tid": -914061504, "ts": 1716454225867534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225988531, "dur": 193, "args": { "External id": 283916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283916, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283916, "pid": 5, "tid": 7, "ts": 1716454225988531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867540, "dur": 14, "args": { "External id": 283916, "cbid": 211, "correlation": 283916 } }, { "ph": "s", "id": 283916, "pid": 76337, "tid": -914061504, "ts": 1716454225867540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225988726, "dur": 66, "args": { "External id": 283924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283924, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283924, "pid": 5, "tid": 7, "ts": 1716454225988726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867610, "dur": 12, "args": { "External id": 283924, "cbid": 211, "correlation": 283924 } }, { "ph": "s", "id": 283924, "pid": 76337, "tid": -914061504, "ts": 1716454225867610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225988793, "dur": 98, "args": { "External id": 283932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283932, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283932, "pid": 5, "tid": 7, "ts": 1716454225988793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867649, "dur": 9, "args": { "External id": 283932, "cbid": 211, "correlation": 283932 } }, { "ph": "s", "id": 283932, "pid": 76337, "tid": -914061504, "ts": 1716454225867649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225988892, "dur": 53, "args": { "External id": 283943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283943, "pid": 5, "tid": 7, "ts": 1716454225988892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867719, "dur": 12, "args": { "External id": 283943, "cbid": 211, "correlation": 283943 } }, { "ph": "s", "id": 283943, "pid": 76337, "tid": -914061504, "ts": 1716454225867719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225988946, "dur": 87, "args": { "External id": 283965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283965, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 283965, "pid": 5, "tid": 7, "ts": 1716454225988946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867750, "dur": 8, "args": { "External id": 283965, "cbid": 211, "correlation": 283965 } }, { "ph": "s", "id": 283965, "pid": 76337, "tid": -914061504, "ts": 1716454225867750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867838, "dur": 1, "args": { "External id": 283976, "cbid": 251, "correlation": 283976 } }, { "ph": "f", "id": 283976, "pid": 76337, "tid": -914061504, "ts": 1716454225867838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225989034, "dur": 97, "args": { "External id": 283977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283977, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 283977, "pid": 5, "tid": 7, "ts": 1716454225989034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867843, "dur": 14, "args": { "External id": 283977, "cbid": 211, "correlation": 283977 } }, { "ph": "s", "id": 283977, "pid": 76337, "tid": -914061504, "ts": 1716454225867843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867915, "dur": 1, "args": { "External id": 283988, "cbid": 251, "correlation": 283988 } }, { "ph": "f", "id": 283988, "pid": 76337, "tid": -914061504, "ts": 1716454225867915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225867918, "dur": 0, "args": { "External id": 283989, "cbid": 251, "correlation": 283989 } }, { "ph": "f", "id": 283989, "pid": 76337, "tid": -914061504, "ts": 1716454225867918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225989133, "dur": 10, "args": { "External id": 283990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283990, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 283990, "pid": 5, "tid": 7, "ts": 1716454225989133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867920, "dur": 12, "args": { "External id": 283990, "cbid": 211, "correlation": 283990 } }, { "ph": "s", "id": 283990, "pid": 76337, "tid": -914061504, "ts": 1716454225867920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225989144, "dur": 5, "args": { "External id": 283992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 283992, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 283992, "pid": 5, "tid": 7, "ts": 1716454225989144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225867934, "dur": 6, "args": { "External id": 283992, "cbid": 211, "correlation": 283992 } }, { "ph": "s", "id": 283992, "pid": 76337, "tid": -914061504, "ts": 1716454225867934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868008, "dur": 1, "args": { "External id": 284003, "cbid": 251, "correlation": 284003 } }, { "ph": "f", "id": 284003, "pid": 76337, "tid": -914061504, "ts": 1716454225868008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868012, "dur": 0, "args": { "External id": 284004, "cbid": 251, "correlation": 284004 } }, { "ph": "f", "id": 284004, "pid": 76337, "tid": -914061504, "ts": 1716454225868012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225989151, "dur": 6, "args": { "External id": 284005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284005, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284005, "pid": 5, "tid": 7, "ts": 1716454225989151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868014, "dur": 12, "args": { "External id": 284005, "cbid": 211, "correlation": 284005 } }, { "ph": "s", "id": 284005, "pid": 76337, "tid": -914061504, "ts": 1716454225868014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225989158, "dur": 3, "args": { "External id": 284007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284007, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 284007, "pid": 5, "tid": 7, "ts": 1716454225989158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868028, "dur": 5, "args": { "External id": 284007, "cbid": 211, "correlation": 284007 } }, { "ph": "s", "id": 284007, "pid": 76337, "tid": -914061504, "ts": 1716454225868028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225989162, "dur": 148, "args": { "External id": 284028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284028, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284028, "pid": 5, "tid": 7, "ts": 1716454225989162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868102, "dur": 13, "args": { "External id": 284028, "cbid": 211, "correlation": 284028 } }, { "ph": "s", "id": 284028, "pid": 76337, "tid": -914061504, "ts": 1716454225868102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868198, "dur": 1, "args": { "External id": 284046, "cbid": 251, "correlation": 284046 } }, { "ph": "f", "id": 284046, "pid": 76337, "tid": -914061504, "ts": 1716454225868198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225989312, "dur": 102, "args": { "External id": 284048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284048, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284048, "pid": 5, "tid": 7, "ts": 1716454225989312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868204, "dur": 14, "args": { "External id": 284048, "cbid": 211, "correlation": 284048 } }, { "ph": "s", "id": 284048, "pid": 76337, "tid": -914061504, "ts": 1716454225868204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225989415, "dur": 34, "args": { "External id": 284056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284056, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284056, "pid": 5, "tid": 7, "ts": 1716454225989415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868273, "dur": 12, "args": { "External id": 284056, "cbid": 211, "correlation": 284056 } }, { "ph": "s", "id": 284056, "pid": 76337, "tid": -914061504, "ts": 1716454225868273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225989451, "dur": 65, "args": { "External id": 284064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284064, "pid": 5, "tid": 7, "ts": 1716454225989451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868314, "dur": 9, "args": { "External id": 284064, "cbid": 211, "correlation": 284064 } }, { "ph": "s", "id": 284064, "pid": 76337, "tid": -914061504, "ts": 1716454225868314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225989517, "dur": 87, "args": { "External id": 284086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284086, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284086, "pid": 5, "tid": 7, "ts": 1716454225989517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868364, "dur": 10, "args": { "External id": 284086, "cbid": 211, "correlation": 284086 } }, { "ph": "s", "id": 284086, "pid": 76337, "tid": -914061504, "ts": 1716454225868364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868450, "dur": 1, "args": { "External id": 284102, "cbid": 251, "correlation": 284102 } }, { "ph": "f", "id": 284102, "pid": 76337, "tid": -914061504, "ts": 1716454225868450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225989606, "dur": 558, "args": { "External id": 284104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284104, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284104, "pid": 5, "tid": 7, "ts": 1716454225989606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868457, "dur": 14, "args": { "External id": 284104, "cbid": 211, "correlation": 284104 } }, { "ph": "s", "id": 284104, "pid": 76337, "tid": -914061504, "ts": 1716454225868457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225990165, "dur": 237, "args": { "External id": 284112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284112, "pid": 5, "tid": 7, "ts": 1716454225990165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868525, "dur": 12, "args": { "External id": 284112, "cbid": 211, "correlation": 284112 } }, { "ph": "s", "id": 284112, "pid": 76337, "tid": -914061504, "ts": 1716454225868525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225990403, "dur": 253, "args": { "External id": 284120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284120, "pid": 5, "tid": 7, "ts": 1716454225990403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868556, "dur": 9, "args": { "External id": 284120, "cbid": 211, "correlation": 284120 } }, { "ph": "s", "id": 284120, "pid": 76337, "tid": -914061504, "ts": 1716454225868556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868637, "dur": 1, "args": { "External id": 284136, "cbid": 251, "correlation": 284136 } }, { "ph": "f", "id": 284136, "pid": 76337, "tid": -914061504, "ts": 1716454225868637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868642, "dur": 0, "args": { "External id": 284138, "cbid": 251, "correlation": 284138 } }, { "ph": "f", "id": 284138, "pid": 76337, "tid": -914061504, "ts": 1716454225868642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454225990658, "dur": 348, "args": { "External id": 284139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284139, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 284139, "pid": 5, "tid": 7, "ts": 1716454225990658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868645, "dur": 13, "args": { "External id": 284139, "cbid": 211, "correlation": 284139 } }, { "ph": "s", "id": 284139, "pid": 76337, "tid": -914061504, "ts": 1716454225868645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225991008, "dur": 50, "args": { "External id": 284147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284147, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284147, "pid": 5, "tid": 7, "ts": 1716454225991008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868686, "dur": 10, "args": { "External id": 284147, "cbid": 211, "correlation": 284147 } }, { "ph": "s", "id": 284147, "pid": 76337, "tid": -914061504, "ts": 1716454225868686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225991059, "dur": 150, "args": { "External id": 284158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284158, "pid": 5, "tid": 7, "ts": 1716454225991059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868753, "dur": 13, "args": { "External id": 284158, "cbid": 211, "correlation": 284158 } }, { "ph": "s", "id": 284158, "pid": 76337, "tid": -914061504, "ts": 1716454225868753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225868817, "dur": 0, "args": { "External id": 284170, "cbid": 317, "correlation": 284170 } }, { "ph": "f", "id": 284170, "pid": 76337, "tid": -914061504, "ts": 1716454225868817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225868818, "dur": 0, "args": { "External id": 284171, "cbid": 203, "correlation": 284171 } }, { "ph": "f", "id": 284171, "pid": 76337, "tid": -914061504, "ts": 1716454225868818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225868818, "dur": 0, "args": { "External id": 284172, "cbid": 205, "correlation": 284172 } }, { "ph": "f", "id": 284172, "pid": 76337, "tid": -914061504, "ts": 1716454225868818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868841, "dur": 1, "args": { "External id": 284176, "cbid": 251, "correlation": 284176 } }, { "ph": "f", "id": 284176, "pid": 76337, "tid": -914061504, "ts": 1716454225868841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868843, "dur": 0, "args": { "External id": 284177, "cbid": 251, "correlation": 284177 } }, { "ph": "f", "id": 284177, "pid": 76337, "tid": -914061504, "ts": 1716454225868843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868843, "dur": 0, "args": { "External id": 284178, "cbid": 251, "correlation": 284178 } }, { "ph": "f", "id": 284178, "pid": 76337, "tid": -914061504, "ts": 1716454225868843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868844, "dur": 0, "args": { "External id": 284179, "cbid": 251, "correlation": 284179 } }, { "ph": "f", "id": 284179, "pid": 76337, "tid": -914061504, "ts": 1716454225868844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868845, "dur": 0, "args": { "External id": 284180, "cbid": 251, "correlation": 284180 } }, { "ph": "f", "id": 284180, "pid": 76337, "tid": -914061504, "ts": 1716454225868845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868845, "dur": 0, "args": { "External id": 284181, "cbid": 251, "correlation": 284181 } }, { "ph": "f", "id": 284181, "pid": 76337, "tid": -914061504, "ts": 1716454225868845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868846, "dur": 0, "args": { "External id": 284182, "cbid": 251, "correlation": 284182 } }, { "ph": "f", "id": 284182, "pid": 76337, "tid": -914061504, "ts": 1716454225868846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868847, "dur": 0, "args": { "External id": 284183, "cbid": 251, "correlation": 284183 } }, { "ph": "f", "id": 284183, "pid": 76337, "tid": -914061504, "ts": 1716454225868847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225868848, "dur": 0, "args": { "External id": 284184, "cbid": 251, "correlation": 284184 } }, { "ph": "f", "id": 284184, "pid": 76337, "tid": -914061504, "ts": 1716454225868848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225991210, "dur": 112, "args": { "External id": 284185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284185, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284185, "pid": 5, "tid": 7, "ts": 1716454225991210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868850, "dur": 12, "args": { "External id": 284185, "cbid": 211, "correlation": 284185 } }, { "ph": "s", "id": 284185, "pid": 76337, "tid": -914061504, "ts": 1716454225868850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225991323, "dur": 58, "args": { "External id": 284191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284191, "pid": 5, "tid": 7, "ts": 1716454225991323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868885, "dur": 9, "args": { "External id": 284191, "cbid": 211, "correlation": 284191 } }, { "ph": "s", "id": 284191, "pid": 76337, "tid": -914061504, "ts": 1716454225868885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225991382, "dur": 50, "args": { "External id": 284199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284199, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284199, "pid": 5, "tid": 7, "ts": 1716454225991382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868916, "dur": 9, "args": { "External id": 284199, "cbid": 211, "correlation": 284199 } }, { "ph": "s", "id": 284199, "pid": 76337, "tid": -914061504, "ts": 1716454225868916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225991433, "dur": 95, "args": { "External id": 284208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284208, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284208, "pid": 5, "tid": 7, "ts": 1716454225991433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225868955, "dur": 10, "args": { "External id": 284208, "cbid": 211, "correlation": 284208 } }, { "ph": "s", "id": 284208, "pid": 76337, "tid": -914061504, "ts": 1716454225868955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225991529, "dur": 88, "args": { "External id": 284228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284228, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 284228, "pid": 5, "tid": 7, "ts": 1716454225991529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869032, "dur": 12, "args": { "External id": 284228, "cbid": 211, "correlation": 284228 } }, { "ph": "s", "id": 284228, "pid": 76337, "tid": -914061504, "ts": 1716454225869032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225991619, "dur": 4, "args": { "External id": 284240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284240, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 284240, "pid": 5, "tid": 7, "ts": 1716454225991619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869055, "dur": 6, "args": { "External id": 284240, "cbid": 211, "correlation": 284240 } }, { "ph": "s", "id": 284240, "pid": 76337, "tid": -914061504, "ts": 1716454225869055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225991625, "dur": 105, "args": { "External id": 284243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284243, "pid": 5, "tid": 7, "ts": 1716454225991625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869073, "dur": 7, "args": { "External id": 284243, "cbid": 211, "correlation": 284243 } }, { "ph": "s", "id": 284243, "pid": 76337, "tid": -914061504, "ts": 1716454225869073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225991731, "dur": 69, "args": { "External id": 284252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284252, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284252, "pid": 5, "tid": 7, "ts": 1716454225991731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869112, "dur": 9, "args": { "External id": 284252, "cbid": 211, "correlation": 284252 } }, { "ph": "s", "id": 284252, "pid": 76337, "tid": -914061504, "ts": 1716454225869112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225869163, "dur": 0, "args": { "External id": 284262, "cbid": 317, "correlation": 284262 } }, { "ph": "f", "id": 284262, "pid": 76337, "tid": -914061504, "ts": 1716454225869163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225869164, "dur": 0, "args": { "External id": 284263, "cbid": 203, "correlation": 284263 } }, { "ph": "f", "id": 284263, "pid": 76337, "tid": -914061504, "ts": 1716454225869164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225869165, "dur": 0, "args": { "External id": 284264, "cbid": 205, "correlation": 284264 } }, { "ph": "f", "id": 284264, "pid": 76337, "tid": -914061504, "ts": 1716454225869165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225991801, "dur": 76, "args": { "External id": 284268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284268, "pid": 5, "tid": 7, "ts": 1716454225991801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869179, "dur": 12, "args": { "External id": 284268, "cbid": 211, "correlation": 284268 } }, { "ph": "s", "id": 284268, "pid": 76337, "tid": -914061504, "ts": 1716454225869179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225991878, "dur": 23, "args": { "External id": 284270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284270, "pid": 5, "tid": 7, "ts": 1716454225991878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869194, "dur": 5, "args": { "External id": 284270, "cbid": 211, "correlation": 284270 } }, { "ph": "s", "id": 284270, "pid": 76337, "tid": -914061504, "ts": 1716454225869194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225991903, "dur": 3, "args": { "External id": 284272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284272, "pid": 5, "tid": 7, "ts": 1716454225991903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869203, "dur": 5, "args": { "External id": 284272, "cbid": 211, "correlation": 284272 } }, { "ph": "s", "id": 284272, "pid": 76337, "tid": -914061504, "ts": 1716454225869203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225869211, "dur": 0, "args": { "External id": 284273, "cbid": 51, "correlation": 284273 } }, { "ph": "s", "id": 284273, "pid": 76337, "tid": -914061504, "ts": 1716454225869211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225991907, "dur": 1317, "args": { "External id": 284274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284274, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284274, "pid": 5, "tid": 7, "ts": 1716454225991907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869212, "dur": 5, "args": { "External id": 284274, "cbid": 211, "correlation": 284274 } }, { "ph": "s", "id": 284274, "pid": 76337, "tid": -914061504, "ts": 1716454225869212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225993225, "dur": 57, "args": { "External id": 284279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284279, "pid": 5, "tid": 7, "ts": 1716454225993225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869240, "dur": 8, "args": { "External id": 284279, "cbid": 211, "correlation": 284279 } }, { "ph": "s", "id": 284279, "pid": 76337, "tid": -914061504, "ts": 1716454225869240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225993284, "dur": 3, "args": { "External id": 284287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284287, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284287, "pid": 5, "tid": 7, "ts": 1716454225993284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869283, "dur": 10, "args": { "External id": 284287, "cbid": 211, "correlation": 284287 } }, { "ph": "s", "id": 284287, "pid": 76337, "tid": -914061504, "ts": 1716454225869283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225869352, "dur": 1, "args": { "External id": 284303, "cbid": 251, "correlation": 284303 } }, { "ph": "f", "id": 284303, "pid": 76337, "tid": -914061504, "ts": 1716454225869352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225869357, "dur": 0, "args": { "External id": 284305, "cbid": 251, "correlation": 284305 } }, { "ph": "f", "id": 284305, "pid": 76337, "tid": -914061504, "ts": 1716454225869357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454225993288, "dur": 11, "args": { "External id": 284306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284306, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 284306, "pid": 5, "tid": 7, "ts": 1716454225993288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869359, "dur": 11, "args": { "External id": 284306, "cbid": 211, "correlation": 284306 } }, { "ph": "s", "id": 284306, "pid": 76337, "tid": -914061504, "ts": 1716454225869359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454225993300, "dur": 5, "args": { "External id": 284308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284308, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 284308, "pid": 5, "tid": 7, "ts": 1716454225993300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869372, "dur": 5, "args": { "External id": 284308, "cbid": 211, "correlation": 284308 } }, { "ph": "s", "id": 284308, "pid": 76337, "tid": -914061504, "ts": 1716454225869372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225993307, "dur": 52, "args": { "External id": 284318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284318, "pid": 5, "tid": 7, "ts": 1716454225993307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869429, "dur": 13, "args": { "External id": 284318, "cbid": 211, "correlation": 284318 } }, { "ph": "s", "id": 284318, "pid": 76337, "tid": -914061504, "ts": 1716454225869429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225993360, "dur": 48, "args": { "External id": 284338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284338, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 284338, "pid": 5, "tid": 7, "ts": 1716454225993360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869495, "dur": 11, "args": { "External id": 284338, "cbid": 211, "correlation": 284338 } }, { "ph": "s", "id": 284338, "pid": 76337, "tid": -914061504, "ts": 1716454225869495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225993409, "dur": 4, "args": { "External id": 284350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284350, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284350, "pid": 5, "tid": 7, "ts": 1716454225993409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869516, "dur": 6, "args": { "External id": 284350, "cbid": 211, "correlation": 284350 } }, { "ph": "s", "id": 284350, "pid": 76337, "tid": -914061504, "ts": 1716454225869516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225993414, "dur": 52, "args": { "External id": 284353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284353, "pid": 5, "tid": 7, "ts": 1716454225993414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869536, "dur": 6, "args": { "External id": 284353, "cbid": 211, "correlation": 284353 } }, { "ph": "s", "id": 284353, "pid": 76337, "tid": -914061504, "ts": 1716454225869536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225993468, "dur": 36, "args": { "External id": 284362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284362, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284362, "pid": 5, "tid": 7, "ts": 1716454225993468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869576, "dur": 10, "args": { "External id": 284362, "cbid": 211, "correlation": 284362 } }, { "ph": "s", "id": 284362, "pid": 76337, "tid": -914061504, "ts": 1716454225869576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225869637, "dur": 0, "args": { "External id": 284372, "cbid": 317, "correlation": 284372 } }, { "ph": "f", "id": 284372, "pid": 76337, "tid": -914061504, "ts": 1716454225869637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225869638, "dur": 0, "args": { "External id": 284373, "cbid": 203, "correlation": 284373 } }, { "ph": "f", "id": 284373, "pid": 76337, "tid": -914061504, "ts": 1716454225869638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225869638, "dur": 0, "args": { "External id": 284374, "cbid": 205, "correlation": 284374 } }, { "ph": "f", "id": 284374, "pid": 76337, "tid": -914061504, "ts": 1716454225869638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225993505, "dur": 39, "args": { "External id": 284378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284378, "pid": 5, "tid": 7, "ts": 1716454225993505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869652, "dur": 12, "args": { "External id": 284378, "cbid": 211, "correlation": 284378 } }, { "ph": "s", "id": 284378, "pid": 76337, "tid": -914061504, "ts": 1716454225869652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225993545, "dur": 13, "args": { "External id": 284380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284380, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284380, "pid": 5, "tid": 7, "ts": 1716454225993545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869666, "dur": 5, "args": { "External id": 284380, "cbid": 211, "correlation": 284380 } }, { "ph": "s", "id": 284380, "pid": 76337, "tid": -914061504, "ts": 1716454225869666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454225993560, "dur": 3, "args": { "External id": 284382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284382, "pid": 5, "tid": 7, "ts": 1716454225993560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869675, "dur": 6, "args": { "External id": 284382, "cbid": 211, "correlation": 284382 } }, { "ph": "s", "id": 284382, "pid": 76337, "tid": -914061504, "ts": 1716454225869675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225869684, "dur": 0, "args": { "External id": 284383, "cbid": 51, "correlation": 284383 } }, { "ph": "s", "id": 284383, "pid": 76337, "tid": -914061504, "ts": 1716454225869684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454225993564, "dur": 674, "args": { "External id": 284384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284384, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284384, "pid": 5, "tid": 7, "ts": 1716454225993564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869685, "dur": 6, "args": { "External id": 284384, "cbid": 211, "correlation": 284384 } }, { "ph": "s", "id": 284384, "pid": 76337, "tid": -914061504, "ts": 1716454225869685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225994240, "dur": 58, "args": { "External id": 284389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284389, "pid": 5, "tid": 7, "ts": 1716454225994240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869713, "dur": 8, "args": { "External id": 284389, "cbid": 211, "correlation": 284389 } }, { "ph": "s", "id": 284389, "pid": 76337, "tid": -914061504, "ts": 1716454225869713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225869770, "dur": 0, "args": { "External id": 284399, "cbid": 317, "correlation": 284399 } }, { "ph": "f", "id": 284399, "pid": 76337, "tid": -914061504, "ts": 1716454225869770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225869771, "dur": 0, "args": { "External id": 284400, "cbid": 203, "correlation": 284400 } }, { "ph": "f", "id": 284400, "pid": 76337, "tid": -914061504, "ts": 1716454225869771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225869771, "dur": 0, "args": { "External id": 284401, "cbid": 205, "correlation": 284401 } }, { "ph": "f", "id": 284401, "pid": 76337, "tid": -914061504, "ts": 1716454225869771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225994299, "dur": 75, "args": { "External id": 284405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284405, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284405, "pid": 5, "tid": 7, "ts": 1716454225994299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869783, "dur": 12, "args": { "External id": 284405, "cbid": 211, "correlation": 284405 } }, { "ph": "s", "id": 284405, "pid": 76337, "tid": -914061504, "ts": 1716454225869783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454225994375, "dur": 202, "args": { "External id": 284407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284407, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284407, "pid": 5, "tid": 7, "ts": 1716454225994375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869801, "dur": 6, "args": { "External id": 284407, "cbid": 211, "correlation": 284407 } }, { "ph": "s", "id": 284407, "pid": 76337, "tid": -914061504, "ts": 1716454225869801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454225994578, "dur": 38, "args": { "External id": 284409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284409, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284409, "pid": 5, "tid": 7, "ts": 1716454225994578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869812, "dur": 5, "args": { "External id": 284409, "cbid": 211, "correlation": 284409 } }, { "ph": "s", "id": 284409, "pid": 76337, "tid": -914061504, "ts": 1716454225869812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225994617, "dur": 57, "args": { "External id": 284415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284415, "pid": 5, "tid": 7, "ts": 1716454225994617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225869837, "dur": 368, "args": { "External id": 284415, "cbid": 211, "correlation": 284415 } }, { "ph": "s", "id": 284415, "pid": 76337, "tid": -914061504, "ts": 1716454225869837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225994676, "dur": 50, "args": { "External id": 284423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284423, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284423, "pid": 5, "tid": 7, "ts": 1716454225994676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870227, "dur": 8, "args": { "External id": 284423, "cbid": 211, "correlation": 284423 } }, { "ph": "s", "id": 284423, "pid": 76337, "tid": -914061504, "ts": 1716454225870227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454225994727, "dur": 35, "args": { "External id": 284431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284431, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284431, "pid": 5, "tid": 7, "ts": 1716454225994727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870256, "dur": 8, "args": { "External id": 284431, "cbid": 211, "correlation": 284431 } }, { "ph": "s", "id": 284431, "pid": 76337, "tid": -914061504, "ts": 1716454225870256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225994763, "dur": 49, "args": { "External id": 284451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284451, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 284451, "pid": 5, "tid": 7, "ts": 1716454225994763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870337, "dur": 13, "args": { "External id": 284451, "cbid": 211, "correlation": 284451 } }, { "ph": "s", "id": 284451, "pid": 76337, "tid": -914061504, "ts": 1716454225870337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454225994813, "dur": 4, "args": { "External id": 284463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284463, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284463, "pid": 5, "tid": 7, "ts": 1716454225994813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870360, "dur": 6, "args": { "External id": 284463, "cbid": 211, "correlation": 284463 } }, { "ph": "s", "id": 284463, "pid": 76337, "tid": -914061504, "ts": 1716454225870360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225994819, "dur": 52, "args": { "External id": 284466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284466, "pid": 5, "tid": 7, "ts": 1716454225994819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870378, "dur": 6, "args": { "External id": 284466, "cbid": 211, "correlation": 284466 } }, { "ph": "s", "id": 284466, "pid": 76337, "tid": -914061504, "ts": 1716454225870378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225870434, "dur": 0, "args": { "External id": 284477, "cbid": 317, "correlation": 284477 } }, { "ph": "f", "id": 284477, "pid": 76337, "tid": -914061504, "ts": 1716454225870434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225870435, "dur": 0, "args": { "External id": 284478, "cbid": 203, "correlation": 284478 } }, { "ph": "f", "id": 284478, "pid": 76337, "tid": -914061504, "ts": 1716454225870435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225870436, "dur": 0, "args": { "External id": 284479, "cbid": 205, "correlation": 284479 } }, { "ph": "f", "id": 284479, "pid": 76337, "tid": -914061504, "ts": 1716454225870436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870458, "dur": 1, "args": { "External id": 284483, "cbid": 251, "correlation": 284483 } }, { "ph": "f", "id": 284483, "pid": 76337, "tid": -914061504, "ts": 1716454225870458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870460, "dur": 0, "args": { "External id": 284484, "cbid": 251, "correlation": 284484 } }, { "ph": "f", "id": 284484, "pid": 76337, "tid": -914061504, "ts": 1716454225870460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870461, "dur": 0, "args": { "External id": 284485, "cbid": 251, "correlation": 284485 } }, { "ph": "f", "id": 284485, "pid": 76337, "tid": -914061504, "ts": 1716454225870461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870462, "dur": 0, "args": { "External id": 284486, "cbid": 251, "correlation": 284486 } }, { "ph": "f", "id": 284486, "pid": 76337, "tid": -914061504, "ts": 1716454225870462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870462, "dur": 0, "args": { "External id": 284487, "cbid": 251, "correlation": 284487 } }, { "ph": "f", "id": 284487, "pid": 76337, "tid": -914061504, "ts": 1716454225870462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870463, "dur": 0, "args": { "External id": 284488, "cbid": 251, "correlation": 284488 } }, { "ph": "f", "id": 284488, "pid": 76337, "tid": -914061504, "ts": 1716454225870463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870464, "dur": 0, "args": { "External id": 284489, "cbid": 251, "correlation": 284489 } }, { "ph": "f", "id": 284489, "pid": 76337, "tid": -914061504, "ts": 1716454225870464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870465, "dur": 0, "args": { "External id": 284490, "cbid": 251, "correlation": 284490 } }, { "ph": "f", "id": 284490, "pid": 76337, "tid": -914061504, "ts": 1716454225870465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870466, "dur": 0, "args": { "External id": 284491, "cbid": 251, "correlation": 284491 } }, { "ph": "f", "id": 284491, "pid": 76337, "tid": -914061504, "ts": 1716454225870466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454225994872, "dur": 109, "args": { "External id": 284492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284492, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284492, "pid": 5, "tid": 7, "ts": 1716454225994872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870468, "dur": 12, "args": { "External id": 284492, "cbid": 211, "correlation": 284492 } }, { "ph": "s", "id": 284492, "pid": 76337, "tid": -914061504, "ts": 1716454225870468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454225994983, "dur": 57, "args": { "External id": 284498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284498, "pid": 5, "tid": 7, "ts": 1716454225994983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870504, "dur": 9, "args": { "External id": 284498, "cbid": 211, "correlation": 284498 } }, { "ph": "s", "id": 284498, "pid": 76337, "tid": -914061504, "ts": 1716454225870504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454225995041, "dur": 503, "args": { "External id": 284507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284507, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284507, "pid": 5, "tid": 7, "ts": 1716454225995041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870585, "dur": 14, "args": { "External id": 284507, "cbid": 211, "correlation": 284507 } }, { "ph": "s", "id": 284507, "pid": 76337, "tid": -914061504, "ts": 1716454225870585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454225995545, "dur": 171, "args": { "External id": 284529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284529, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284529, "pid": 5, "tid": 7, "ts": 1716454225995545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870642, "dur": 11, "args": { "External id": 284529, "cbid": 211, "correlation": 284529 } }, { "ph": "s", "id": 284529, "pid": 76337, "tid": -914061504, "ts": 1716454225870642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870732, "dur": 1, "args": { "External id": 284540, "cbid": 251, "correlation": 284540 } }, { "ph": "f", "id": 284540, "pid": 76337, "tid": -914061504, "ts": 1716454225870732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225995718, "dur": 191, "args": { "External id": 284541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284541, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284541, "pid": 5, "tid": 7, "ts": 1716454225995718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870737, "dur": 15, "args": { "External id": 284541, "cbid": 211, "correlation": 284541 } }, { "ph": "s", "id": 284541, "pid": 76337, "tid": -914061504, "ts": 1716454225870737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870807, "dur": 1, "args": { "External id": 284552, "cbid": 251, "correlation": 284552 } }, { "ph": "f", "id": 284552, "pid": 76337, "tid": -914061504, "ts": 1716454225870807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225995910, "dur": 182, "args": { "External id": 284553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284553, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284553, "pid": 5, "tid": 7, "ts": 1716454225995910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870811, "dur": 11, "args": { "External id": 284553, "cbid": 211, "correlation": 284553 } }, { "ph": "s", "id": 284553, "pid": 76337, "tid": -914061504, "ts": 1716454225870811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225870877, "dur": 1, "args": { "External id": 284564, "cbid": 251, "correlation": 284564 } }, { "ph": "f", "id": 284564, "pid": 76337, "tid": -914061504, "ts": 1716454225870877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454225996093, "dur": 180, "args": { "External id": 284565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284565, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284565, "pid": 5, "tid": 7, "ts": 1716454225996093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870881, "dur": 11, "args": { "External id": 284565, "cbid": 211, "correlation": 284565 } }, { "ph": "s", "id": 284565, "pid": 76337, "tid": -914061504, "ts": 1716454225870881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454225996274, "dur": 17603, "args": { "External id": 284586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284586, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284586, "pid": 5, "tid": 7, "ts": 1716454225996274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225870962, "dur": 19, "args": { "External id": 284586, "cbid": 211, "correlation": 284586 } }, { "ph": "s", "id": 284586, "pid": 76337, "tid": -914061504, "ts": 1716454225870962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225871066, "dur": 1, "args": { "External id": 284604, "cbid": 251, "correlation": 284604 } }, { "ph": "f", "id": 284604, "pid": 76337, "tid": -914061504, "ts": 1716454225871066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454226013878, "dur": 195, "args": { "External id": 284606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284606, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284606, "pid": 5, "tid": 7, "ts": 1716454226013878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225871071, "dur": 13, "args": { "External id": 284606, "cbid": 211, "correlation": 284606 } }, { "ph": "s", "id": 284606, "pid": 76337, "tid": -914061504, "ts": 1716454225871071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226014074, "dur": 66, "args": { "External id": 284614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284614, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284614, "pid": 5, "tid": 7, "ts": 1716454226014074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225871140, "dur": 12, "args": { "External id": 284614, "cbid": 211, "correlation": 284614 } }, { "ph": "s", "id": 284614, "pid": 76337, "tid": -914061504, "ts": 1716454225871140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226014142, "dur": 98, "args": { "External id": 284622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284622, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284622, "pid": 5, "tid": 7, "ts": 1716454226014142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225871179, "dur": 77, "args": { "External id": 284622, "cbid": 211, "correlation": 284622 } }, { "ph": "s", "id": 284622, "pid": 76337, "tid": -914061504, "ts": 1716454225871179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226014241, "dur": 52, "args": { "External id": 284633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284633, "pid": 5, "tid": 7, "ts": 1716454226014241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225871321, "dur": 1794, "args": { "External id": 284633, "cbid": 211, "correlation": 284633 } }, { "ph": "s", "id": 284633, "pid": 76337, "tid": -914061504, "ts": 1716454225871321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226014294, "dur": 87, "args": { "External id": 284655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284655, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284655, "pid": 5, "tid": 7, "ts": 1716454226014294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873135, "dur": 122, "args": { "External id": 284655, "cbid": 211, "correlation": 284655 } }, { "ph": "s", "id": 284655, "pid": 76337, "tid": -914061504, "ts": 1716454225873135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873336, "dur": 1, "args": { "External id": 284666, "cbid": 251, "correlation": 284666 } }, { "ph": "f", "id": 284666, "pid": 76337, "tid": -914061504, "ts": 1716454225873336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454226014383, "dur": 102, "args": { "External id": 284667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284667, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284667, "pid": 5, "tid": 7, "ts": 1716454226014383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873341, "dur": 13, "args": { "External id": 284667, "cbid": 211, "correlation": 284667 } }, { "ph": "s", "id": 284667, "pid": 76337, "tid": -914061504, "ts": 1716454225873341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873415, "dur": 1, "args": { "External id": 284678, "cbid": 251, "correlation": 284678 } }, { "ph": "f", "id": 284678, "pid": 76337, "tid": -914061504, "ts": 1716454225873415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873419, "dur": 0, "args": { "External id": 284679, "cbid": 251, "correlation": 284679 } }, { "ph": "f", "id": 284679, "pid": 76337, "tid": -914061504, "ts": 1716454225873419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454226014486, "dur": 10, "args": { "External id": 284680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284680, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284680, "pid": 5, "tid": 7, "ts": 1716454226014486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873421, "dur": 12, "args": { "External id": 284680, "cbid": 211, "correlation": 284680 } }, { "ph": "s", "id": 284680, "pid": 76337, "tid": -914061504, "ts": 1716454225873421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454226014497, "dur": 5, "args": { "External id": 284682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284682, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 284682, "pid": 5, "tid": 7, "ts": 1716454226014497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873435, "dur": 6, "args": { "External id": 284682, "cbid": 211, "correlation": 284682 } }, { "ph": "s", "id": 284682, "pid": 76337, "tid": -914061504, "ts": 1716454225873435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873499, "dur": 1, "args": { "External id": 284693, "cbid": 251, "correlation": 284693 } }, { "ph": "f", "id": 284693, "pid": 76337, "tid": -914061504, "ts": 1716454225873499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873503, "dur": 0, "args": { "External id": 284694, "cbid": 251, "correlation": 284694 } }, { "ph": "f", "id": 284694, "pid": 76337, "tid": -914061504, "ts": 1716454225873503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454226014503, "dur": 6, "args": { "External id": 284695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284695, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284695, "pid": 5, "tid": 7, "ts": 1716454226014503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873504, "dur": 12, "args": { "External id": 284695, "cbid": 211, "correlation": 284695 } }, { "ph": "s", "id": 284695, "pid": 76337, "tid": -914061504, "ts": 1716454225873504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454226014510, "dur": 3, "args": { "External id": 284697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284697, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 284697, "pid": 5, "tid": 7, "ts": 1716454226014510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873518, "dur": 6, "args": { "External id": 284697, "cbid": 211, "correlation": 284697 } }, { "ph": "s", "id": 284697, "pid": 76337, "tid": -914061504, "ts": 1716454225873518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454226014515, "dur": 149, "args": { "External id": 284718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284718, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284718, "pid": 5, "tid": 7, "ts": 1716454226014515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873592, "dur": 13, "args": { "External id": 284718, "cbid": 211, "correlation": 284718 } }, { "ph": "s", "id": 284718, "pid": 76337, "tid": -914061504, "ts": 1716454225873592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873691, "dur": 1, "args": { "External id": 284736, "cbid": 251, "correlation": 284736 } }, { "ph": "f", "id": 284736, "pid": 76337, "tid": -914061504, "ts": 1716454225873691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454226014665, "dur": 103, "args": { "External id": 284738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284738, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284738, "pid": 5, "tid": 7, "ts": 1716454226014665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873697, "dur": 14, "args": { "External id": 284738, "cbid": 211, "correlation": 284738 } }, { "ph": "s", "id": 284738, "pid": 76337, "tid": -914061504, "ts": 1716454225873697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226014770, "dur": 35, "args": { "External id": 284746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284746, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284746, "pid": 5, "tid": 7, "ts": 1716454226014770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873767, "dur": 12, "args": { "External id": 284746, "cbid": 211, "correlation": 284746 } }, { "ph": "s", "id": 284746, "pid": 76337, "tid": -914061504, "ts": 1716454225873767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226014806, "dur": 64, "args": { "External id": 284754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284754, "pid": 5, "tid": 7, "ts": 1716454226014806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873807, "dur": 10, "args": { "External id": 284754, "cbid": 211, "correlation": 284754 } }, { "ph": "s", "id": 284754, "pid": 76337, "tid": -914061504, "ts": 1716454225873807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226014872, "dur": 87, "args": { "External id": 284776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284776, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284776, "pid": 5, "tid": 7, "ts": 1716454226014872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873857, "dur": 11, "args": { "External id": 284776, "cbid": 211, "correlation": 284776 } }, { "ph": "s", "id": 284776, "pid": 76337, "tid": -914061504, "ts": 1716454225873857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225873941, "dur": 1, "args": { "External id": 284792, "cbid": 251, "correlation": 284792 } }, { "ph": "f", "id": 284792, "pid": 76337, "tid": -914061504, "ts": 1716454225873941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454226014960, "dur": 561, "args": { "External id": 284794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284794, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 284794, "pid": 5, "tid": 7, "ts": 1716454226014960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225873947, "dur": 12, "args": { "External id": 284794, "cbid": 211, "correlation": 284794 } }, { "ph": "s", "id": 284794, "pid": 76337, "tid": -914061504, "ts": 1716454225873947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226015522, "dur": 238, "args": { "External id": 284802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284802, "pid": 5, "tid": 7, "ts": 1716454226015522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874021, "dur": 13, "args": { "External id": 284802, "cbid": 211, "correlation": 284802 } }, { "ph": "s", "id": 284802, "pid": 76337, "tid": -914061504, "ts": 1716454225874021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226015762, "dur": 247, "args": { "External id": 284810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284810, "pid": 5, "tid": 7, "ts": 1716454226015762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874051, "dur": 8, "args": { "External id": 284810, "cbid": 211, "correlation": 284810 } }, { "ph": "s", "id": 284810, "pid": 76337, "tid": -914061504, "ts": 1716454225874051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874133, "dur": 1, "args": { "External id": 284826, "cbid": 251, "correlation": 284826 } }, { "ph": "f", "id": 284826, "pid": 76337, "tid": -914061504, "ts": 1716454225874133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874138, "dur": 0, "args": { "External id": 284828, "cbid": 251, "correlation": 284828 } }, { "ph": "f", "id": 284828, "pid": 76337, "tid": -914061504, "ts": 1716454225874138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454226016010, "dur": 351, "args": { "External id": 284829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284829, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 284829, "pid": 5, "tid": 7, "ts": 1716454226016010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874141, "dur": 13, "args": { "External id": 284829, "cbid": 211, "correlation": 284829 } }, { "ph": "s", "id": 284829, "pid": 76337, "tid": -914061504, "ts": 1716454225874141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226016362, "dur": 50, "args": { "External id": 284837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284837, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284837, "pid": 5, "tid": 7, "ts": 1716454226016362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874183, "dur": 148, "args": { "External id": 284837, "cbid": 211, "correlation": 284837 } }, { "ph": "s", "id": 284837, "pid": 76337, "tid": -914061504, "ts": 1716454225874183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226016414, "dur": 150, "args": { "External id": 284848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284848, "pid": 5, "tid": 7, "ts": 1716454226016414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874389, "dur": 67, "args": { "External id": 284848, "cbid": 211, "correlation": 284848 } }, { "ph": "s", "id": 284848, "pid": 76337, "tid": -914061504, "ts": 1716454225874389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225874509, "dur": 0, "args": { "External id": 284860, "cbid": 317, "correlation": 284860 } }, { "ph": "f", "id": 284860, "pid": 76337, "tid": -914061504, "ts": 1716454225874509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225874509, "dur": 0, "args": { "External id": 284861, "cbid": 203, "correlation": 284861 } }, { "ph": "f", "id": 284861, "pid": 76337, "tid": -914061504, "ts": 1716454225874509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225874510, "dur": 0, "args": { "External id": 284862, "cbid": 205, "correlation": 284862 } }, { "ph": "f", "id": 284862, "pid": 76337, "tid": -914061504, "ts": 1716454225874510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874533, "dur": 1, "args": { "External id": 284866, "cbid": 251, "correlation": 284866 } }, { "ph": "f", "id": 284866, "pid": 76337, "tid": -914061504, "ts": 1716454225874533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874534, "dur": 0, "args": { "External id": 284867, "cbid": 251, "correlation": 284867 } }, { "ph": "f", "id": 284867, "pid": 76337, "tid": -914061504, "ts": 1716454225874534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874535, "dur": 0, "args": { "External id": 284868, "cbid": 251, "correlation": 284868 } }, { "ph": "f", "id": 284868, "pid": 76337, "tid": -914061504, "ts": 1716454225874535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874536, "dur": 0, "args": { "External id": 284869, "cbid": 251, "correlation": 284869 } }, { "ph": "f", "id": 284869, "pid": 76337, "tid": -914061504, "ts": 1716454225874536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874537, "dur": 0, "args": { "External id": 284870, "cbid": 251, "correlation": 284870 } }, { "ph": "f", "id": 284870, "pid": 76337, "tid": -914061504, "ts": 1716454225874537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874538, "dur": 0, "args": { "External id": 284871, "cbid": 251, "correlation": 284871 } }, { "ph": "f", "id": 284871, "pid": 76337, "tid": -914061504, "ts": 1716454225874538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874538, "dur": 0, "args": { "External id": 284872, "cbid": 251, "correlation": 284872 } }, { "ph": "f", "id": 284872, "pid": 76337, "tid": -914061504, "ts": 1716454225874538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874539, "dur": 0, "args": { "External id": 284873, "cbid": 251, "correlation": 284873 } }, { "ph": "f", "id": 284873, "pid": 76337, "tid": -914061504, "ts": 1716454225874539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225874541, "dur": 0, "args": { "External id": 284874, "cbid": 251, "correlation": 284874 } }, { "ph": "f", "id": 284874, "pid": 76337, "tid": -914061504, "ts": 1716454225874541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454226016565, "dur": 109, "args": { "External id": 284875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284875, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 284875, "pid": 5, "tid": 7, "ts": 1716454226016565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874543, "dur": 38, "args": { "External id": 284875, "cbid": 211, "correlation": 284875 } }, { "ph": "s", "id": 284875, "pid": 76337, "tid": -914061504, "ts": 1716454225874543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226016675, "dur": 57, "args": { "External id": 284881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284881, "pid": 5, "tid": 7, "ts": 1716454226016675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874605, "dur": 277, "args": { "External id": 284881, "cbid": 211, "correlation": 284881 } }, { "ph": "s", "id": 284881, "pid": 76337, "tid": -914061504, "ts": 1716454225874605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226016733, "dur": 50, "args": { "External id": 284889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284889, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284889, "pid": 5, "tid": 7, "ts": 1716454226016733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874905, "dur": 8, "args": { "External id": 284889, "cbid": 211, "correlation": 284889 } }, { "ph": "s", "id": 284889, "pid": 76337, "tid": -914061504, "ts": 1716454225874905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226016785, "dur": 50, "args": { "External id": 284909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284909, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 284909, "pid": 5, "tid": 7, "ts": 1716454226016785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225874998, "dur": 13, "args": { "External id": 284909, "cbid": 211, "correlation": 284909 } }, { "ph": "s", "id": 284909, "pid": 76337, "tid": -914061504, "ts": 1716454225874998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226016836, "dur": 5, "args": { "External id": 284921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284921, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284921, "pid": 5, "tid": 7, "ts": 1716454226016836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875022, "dur": 7, "args": { "External id": 284921, "cbid": 211, "correlation": 284921 } }, { "ph": "s", "id": 284921, "pid": 76337, "tid": -914061504, "ts": 1716454225875022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226016842, "dur": 55, "args": { "External id": 284924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284924, "pid": 5, "tid": 7, "ts": 1716454226016842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875042, "dur": 88, "args": { "External id": 284924, "cbid": 211, "correlation": 284924 } }, { "ph": "s", "id": 284924, "pid": 76337, "tid": -914061504, "ts": 1716454225875042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226016898, "dur": 37, "args": { "External id": 284933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284933, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284933, "pid": 5, "tid": 7, "ts": 1716454226016898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875170, "dur": 10, "args": { "External id": 284933, "cbid": 211, "correlation": 284933 } }, { "ph": "s", "id": 284933, "pid": 76337, "tid": -914061504, "ts": 1716454225875170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454225875226, "dur": 0, "args": { "External id": 284943, "cbid": 317, "correlation": 284943 } }, { "ph": "f", "id": 284943, "pid": 76337, "tid": -914061504, "ts": 1716454225875226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454225875226, "dur": 0, "args": { "External id": 284944, "cbid": 203, "correlation": 284944 } }, { "ph": "f", "id": 284944, "pid": 76337, "tid": -914061504, "ts": 1716454225875226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454225875227, "dur": 0, "args": { "External id": 284945, "cbid": 205, "correlation": 284945 } }, { "ph": "f", "id": 284945, "pid": 76337, "tid": -914061504, "ts": 1716454225875227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226016936, "dur": 41, "args": { "External id": 284949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284949, "pid": 5, "tid": 7, "ts": 1716454226016936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875244, "dur": 12, "args": { "External id": 284949, "cbid": 211, "correlation": 284949 } }, { "ph": "s", "id": 284949, "pid": 76337, "tid": -914061504, "ts": 1716454225875244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454226016978, "dur": 3, "args": { "External id": 284951, "device": 5, "context": 1, "stream": 7, "correlation": 284951, "bytes": 46080, "memory bandwidth (GB/s)": 11.80327868852459 } }, { "ph": "f", "id": 284951, "pid": 5, "tid": 7, "ts": 1716454226016978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225875259, "dur": 22, "args": { "External id": 284951, "cbid": 51, "correlation": 284951 } }, { "ph": "s", "id": 284951, "pid": 76337, "tid": -914061504, "ts": 1716454225875259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225875292, "dur": 2, "args": { "External id": 284953, "cbid": 200, "correlation": 284953 } }, { "ph": "f", "id": 284953, "pid": 76337, "tid": -914061504, "ts": 1716454225875292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225875295, "dur": 0, "args": { "External id": 284954, "cbid": 200, "correlation": 284954 } }, { "ph": "f", "id": 284954, "pid": 76337, "tid": -914061504, "ts": 1716454225875295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225875295, "dur": 0, "args": { "External id": 284955, "cbid": 200, "correlation": 284955 } }, { "ph": "f", "id": 284955, "pid": 76337, "tid": -914061504, "ts": 1716454225875295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454225875296, "dur": 0, "args": { "External id": 284956, "cbid": 200, "correlation": 284956 } }, { "ph": "f", "id": 284956, "pid": 76337, "tid": -914061504, "ts": 1716454225875296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454225875297, "dur": 5, "args": { "External id": 284957, "cbid": 15, "correlation": 284957 } }, { "ph": "f", "id": 284957, "pid": 76337, "tid": -914061504, "ts": 1716454225875297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454225875302, "dur": 1, "args": { "External id": 284958, "cbid": 251, "correlation": 284958 } }, { "ph": "f", "id": 284958, "pid": 76337, "tid": -914061504, "ts": 1716454225875302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454226016983, "dur": 23, "args": { "External id": 284959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284959, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284959, "pid": 5, "tid": 7, "ts": 1716454226016983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875306, "dur": 11, "args": { "External id": 284959, "cbid": 211, "correlation": 284959 } }, { "ph": "s", "id": 284959, "pid": 76337, "tid": -914061504, "ts": 1716454225875306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226017007, "dur": 4, "args": { "External id": 284961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 284961, "pid": 5, "tid": 7, "ts": 1716454226017007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875322, "dur": 6, "args": { "External id": 284961, "cbid": 211, "correlation": 284961 } }, { "ph": "s", "id": 284961, "pid": 76337, "tid": -914061504, "ts": 1716454225875322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225875331, "dur": 0, "args": { "External id": 284962, "cbid": 51, "correlation": 284962 } }, { "ph": "s", "id": 284962, "pid": 76337, "tid": -914061504, "ts": 1716454225875331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226017013, "dur": 182, "args": { "External id": 284963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284963, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284963, "pid": 5, "tid": 7, "ts": 1716454226017013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875332, "dur": 179, "args": { "External id": 284963, "cbid": 211, "correlation": 284963 } }, { "ph": "s", "id": 284963, "pid": 76337, "tid": -914061504, "ts": 1716454225875332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226017196, "dur": 6, "args": { "External id": 284964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284964, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 284964, "pid": 5, "tid": 7, "ts": 1716454226017196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875515, "dur": 6, "args": { "External id": 284964, "cbid": 211, "correlation": 284964 } }, { "ph": "s", "id": 284964, "pid": 76337, "tid": -914061504, "ts": 1716454225875515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226017203, "dur": 4, "args": { "External id": 284970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 284970, "pid": 5, "tid": 7, "ts": 1716454226017203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225875545, "dur": 9, "args": { "External id": 284970, "cbid": 211, "correlation": 284970 } }, { "ph": "s", "id": 284970, "pid": 76337, "tid": -914061504, "ts": 1716454225875545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017209, "dur": 3, "args": { "External id": 284978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284978, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284978, "pid": 5, "tid": 7, "ts": 1716454226017209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877323, "dur": 16, "args": { "External id": 284978, "cbid": 211, "correlation": 284978 } }, { "ph": "s", "id": 284978, "pid": 76337, "tid": -914061504, "ts": 1716454225877323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017213, "dur": 3, "args": { "External id": 284986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284986, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284986, "pid": 5, "tid": 7, "ts": 1716454226017213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877364, "dur": 11, "args": { "External id": 284986, "cbid": 211, "correlation": 284986 } }, { "ph": "s", "id": 284986, "pid": 76337, "tid": -914061504, "ts": 1716454225877364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017217, "dur": 3, "args": { "External id": 284994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 284994, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 284994, "pid": 5, "tid": 7, "ts": 1716454226017217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877391, "dur": 9, "args": { "External id": 284994, "cbid": 211, "correlation": 284994 } }, { "ph": "s", "id": 284994, "pid": 76337, "tid": -914061504, "ts": 1716454225877391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017221, "dur": 3, "args": { "External id": 285003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285003, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285003, "pid": 5, "tid": 7, "ts": 1716454226017221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877567, "dur": 14, "args": { "External id": 285003, "cbid": 211, "correlation": 285003 } }, { "ph": "s", "id": 285003, "pid": 76337, "tid": -914061504, "ts": 1716454225877567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017225, "dur": 3, "args": { "External id": 285012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285012, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285012, "pid": 5, "tid": 7, "ts": 1716454226017225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877596, "dur": 7, "args": { "External id": 285012, "cbid": 211, "correlation": 285012 } }, { "ph": "s", "id": 285012, "pid": 76337, "tid": -914061504, "ts": 1716454225877596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017229, "dur": 3, "args": { "External id": 285020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285020, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285020, "pid": 5, "tid": 7, "ts": 1716454226017229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877621, "dur": 8, "args": { "External id": 285020, "cbid": 211, "correlation": 285020 } }, { "ph": "s", "id": 285020, "pid": 76337, "tid": -914061504, "ts": 1716454225877621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017234, "dur": 3, "args": { "External id": 285028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285028, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285028, "pid": 5, "tid": 7, "ts": 1716454226017234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877877, "dur": 16, "args": { "External id": 285028, "cbid": 211, "correlation": 285028 } }, { "ph": "s", "id": 285028, "pid": 76337, "tid": -914061504, "ts": 1716454225877877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017238, "dur": 3, "args": { "External id": 285036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285036, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285036, "pid": 5, "tid": 7, "ts": 1716454226017238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454225877908, "dur": 7, "args": { "External id": 285036, "cbid": 211, "correlation": 285036 } }, { "ph": "s", "id": 285036, "pid": 76337, "tid": -914061504, "ts": 1716454225877908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454226017242, "dur": 1, "args": { "External id": 285046, "device": 5, "context": 1, "stream": 7, "correlation": 285046, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 285046, "pid": 5, "tid": 7, "ts": 1716454226017242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454225877982, "dur": 38, "args": { "External id": 285046, "cbid": 41, "correlation": 285046 } }, { "ph": "s", "id": 285046, "pid": 76337, "tid": -914061504, "ts": 1716454225877982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454225878021, "dur": 139241, "args": { "External id": 285047, "cbid": 131, "correlation": 285047 } }, { "ph": "f", "id": 285047, "pid": 76337, "tid": -914061504, "ts": 1716454225878021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226017449, "dur": 2, "args": { "External id": 285055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285055, "pid": 5, "tid": 7, "ts": 1716454226017449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226017422, "dur": 29, "args": { "External id": 285055, "cbid": 211, "correlation": 285055 } }, { "ph": "s", "id": 285055, "pid": 76337, "tid": -914061504, "ts": 1716454226017422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226017555, "dur": 3, "args": { "External id": 285064, "device": 5, "context": 1, "stream": 7, "correlation": 285064, "bytes": 8, "memory bandwidth (GB/s)": 0.0025 } }, { "ph": "f", "id": 285064, "pid": 5, "tid": 7, "ts": 1716454226017555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226017513, "dur": 42, "args": { "External id": 285064, "cbid": 41, "correlation": 285064 } }, { "ph": "s", "id": 285064, "pid": 76337, "tid": -914061504, "ts": 1716454226017513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454226017654, "dur": 4, "args": { "External id": 285074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285074, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285074, "pid": 5, "tid": 7, "ts": 1716454226017654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226017638, "dur": 18, "args": { "External id": 285074, "cbid": 211, "correlation": 285074 } }, { "ph": "s", "id": 285074, "pid": 76337, "tid": -914061504, "ts": 1716454226017638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454226017740, "dur": 1, "args": { "External id": 285084, "device": 5, "context": 1, "stream": 7, "correlation": 285084, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 285084, "pid": 5, "tid": 7, "ts": 1716454226017740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226017714, "dur": 24, "args": { "External id": 285084, "cbid": 41, "correlation": 285084 } }, { "ph": "s", "id": 285084, "pid": 76337, "tid": -914061504, "ts": 1716454226017714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454226017739, "dur": 8, "args": { "External id": 285085, "cbid": 131, "correlation": 285085 } }, { "ph": "f", "id": 285085, "pid": 76337, "tid": -914061504, "ts": 1716454226017739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226017813, "dur": 3, "args": { "External id": 285092, "device": 5, "context": 1, "stream": 7, "correlation": 285092, "bytes": 98304, "memory bandwidth (GB/s)": 32 } }, { "ph": "f", "id": 285092, "pid": 5, "tid": 7, "ts": 1716454226017813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226017789, "dur": 23, "args": { "External id": 285092, "cbid": 41, "correlation": 285092 } }, { "ph": "s", "id": 285092, "pid": 76337, "tid": -914061504, "ts": 1716454226017789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226017902, "dur": 2, "args": { "External id": 285111, "device": 5, "context": 1, "stream": 7, "correlation": 285111, "bytes": 16, "memory bandwidth (GB/s)": 0.005555555555555556 } }, { "ph": "f", "id": 285111, "pid": 5, "tid": 7, "ts": 1716454226017902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226017882, "dur": 20, "args": { "External id": 285111, "cbid": 41, "correlation": 285111 } }, { "ph": "s", "id": 285111, "pid": 76337, "tid": -914061504, "ts": 1716454226017882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454226017942, "dur": 2, "args": { "External id": 285117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285117, "pid": 5, "tid": 7, "ts": 1716454226017942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226017929, "dur": 13, "args": { "External id": 285117, "cbid": 211, "correlation": 285117 } }, { "ph": "s", "id": 285117, "pid": 76337, "tid": -914061504, "ts": 1716454226017929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454226017959, "dur": 6, "args": { "External id": 285119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285119, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285119, "pid": 5, "tid": 7, "ts": 1716454226017959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226017946, "dur": 12, "args": { "External id": 285119, "cbid": 211, "correlation": 285119 } }, { "ph": "s", "id": 285119, "pid": 76337, "tid": -914061504, "ts": 1716454226017946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454226017968, "dur": 3, "args": { "External id": 285121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285121, "pid": 5, "tid": 7, "ts": 1716454226017968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226017960, "dur": 7, "args": { "External id": 285121, "cbid": 211, "correlation": 285121 } }, { "ph": "s", "id": 285121, "pid": 76337, "tid": -914061504, "ts": 1716454226017960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226018013, "dur": 2, "args": { "External id": 285129, "device": 5, "context": 1, "stream": 7, "correlation": 285129, "bytes": 8, "memory bandwidth (GB/s)": 0.0028735632183908046 } }, { "ph": "f", "id": 285129, "pid": 5, "tid": 7, "ts": 1716454226018013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226017997, "dur": 15, "args": { "External id": 285129, "cbid": 41, "correlation": 285129 } }, { "ph": "s", "id": 285129, "pid": 76337, "tid": -914061504, "ts": 1716454226017997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454226018060, "dur": 2, "args": { "External id": 285143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285143, "pid": 5, "tid": 7, "ts": 1716454226018060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018050, "dur": 12, "args": { "External id": 285143, "cbid": 211, "correlation": 285143 } }, { "ph": "s", "id": 285143, "pid": 76337, "tid": -914061504, "ts": 1716454226018050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454226018080, "dur": 2, "args": { "External id": 285157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285157, "pid": 5, "tid": 7, "ts": 1716454226018080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018073, "dur": 7, "args": { "External id": 285157, "cbid": 211, "correlation": 285157 } }, { "ph": "s", "id": 285157, "pid": 76337, "tid": -914061504, "ts": 1716454226018073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454226018117, "dur": 6, "args": { "External id": 285164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285164, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285164, "pid": 5, "tid": 7, "ts": 1716454226018117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018106, "dur": 11, "args": { "External id": 285164, "cbid": 211, "correlation": 285164 } }, { "ph": "s", "id": 285164, "pid": 76337, "tid": -914061504, "ts": 1716454226018106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454226018129, "dur": 6, "args": { "External id": 285167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285167, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285167, "pid": 5, "tid": 7, "ts": 1716454226018129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018121, "dur": 8, "args": { "External id": 285167, "cbid": 211, "correlation": 285167 } }, { "ph": "s", "id": 285167, "pid": 76337, "tid": -914061504, "ts": 1716454226018121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454226018140, "dur": 3, "args": { "External id": 285169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285169, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285169, "pid": 5, "tid": 7, "ts": 1716454226018140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018131, "dur": 6, "args": { "External id": 285169, "cbid": 211, "correlation": 285169 } }, { "ph": "s", "id": 285169, "pid": 76337, "tid": -914061504, "ts": 1716454226018131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226018159, "dur": 2, "args": { "External id": 285172, "device": 5, "context": 1, "stream": 7, "correlation": 285172, "bytes": 8, "memory bandwidth (GB/s)": 0.0029080334423845873 } }, { "ph": "f", "id": 285172, "pid": 5, "tid": 7, "ts": 1716454226018159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226018146, "dur": 12, "args": { "External id": 285172, "cbid": 41, "correlation": 285172 } }, { "ph": "s", "id": 285172, "pid": 76337, "tid": -914061504, "ts": 1716454226018146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454226018215, "dur": 4, "args": { "External id": 285188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285188, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285188, "pid": 5, "tid": 7, "ts": 1716454226018215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018199, "dur": 16, "args": { "External id": 285188, "cbid": 211, "correlation": 285188 } }, { "ph": "s", "id": 285188, "pid": 76337, "tid": -914061504, "ts": 1716454226018199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226018236, "dur": 3, "args": { "External id": 285193, "device": 5, "context": 1, "stream": 7, "correlation": 285193, "bytes": 1, "memory bandwidth (GB/s)": 0.0003255208333333333 } }, { "ph": "f", "id": 285193, "pid": 5, "tid": 7, "ts": 1716454226018236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226018221, "dur": 14, "args": { "External id": 285193, "cbid": 41, "correlation": 285193 } }, { "ph": "s", "id": 285193, "pid": 76337, "tid": -914061504, "ts": 1716454226018221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454226018261, "dur": 1, "args": { "External id": 285199, "device": 5, "context": 1, "stream": 7, "correlation": 285199, "bytes": 1, "memory bandwidth (GB/s)": 0.0006009615384615385 } }, { "ph": "f", "id": 285199, "pid": 5, "tid": 7, "ts": 1716454226018261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226018244, "dur": 26, "args": { "External id": 285199, "cbid": 41, "correlation": 285199 } }, { "ph": "s", "id": 285199, "pid": 76337, "tid": -914061504, "ts": 1716454226018244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454226018271, "dur": 4, "args": { "External id": 285200, "cbid": 131, "correlation": 285200 } }, { "ph": "f", "id": 285200, "pid": 76337, "tid": -914061504, "ts": 1716454226018271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226018325, "dur": 3, "args": { "External id": 285208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285208, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285208, "pid": 5, "tid": 7, "ts": 1716454226018325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018312, "dur": 14, "args": { "External id": 285208, "cbid": 211, "correlation": 285208 } }, { "ph": "s", "id": 285208, "pid": 76337, "tid": -914061504, "ts": 1716454226018312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226018356, "dur": 3, "args": { "External id": 285218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285218, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285218, "pid": 5, "tid": 7, "ts": 1716454226018356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018347, "dur": 8, "args": { "External id": 285218, "cbid": 211, "correlation": 285218 } }, { "ph": "s", "id": 285218, "pid": 76337, "tid": -914061504, "ts": 1716454226018347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226018383, "dur": 3, "args": { "External id": 285227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285227, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285227, "pid": 5, "tid": 7, "ts": 1716454226018383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018373, "dur": 8, "args": { "External id": 285227, "cbid": 211, "correlation": 285227 } }, { "ph": "s", "id": 285227, "pid": 76337, "tid": -914061504, "ts": 1716454226018373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454226018498, "dur": 11, "args": { "External id": 285237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285237, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285237, "pid": 5, "tid": 7, "ts": 1716454226018498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018484, "dur": 15, "args": { "External id": 285237, "cbid": 211, "correlation": 285237 } }, { "ph": "s", "id": 285237, "pid": 76337, "tid": -914061504, "ts": 1716454226018484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226018539, "dur": 3, "args": { "External id": 285245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285245, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285245, "pid": 5, "tid": 7, "ts": 1716454226018539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018530, "dur": 9, "args": { "External id": 285245, "cbid": 211, "correlation": 285245 } }, { "ph": "s", "id": 285245, "pid": 76337, "tid": -914061504, "ts": 1716454226018530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454226018585, "dur": 11, "args": { "External id": 285255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285255, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285255, "pid": 5, "tid": 7, "ts": 1716454226018585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018573, "dur": 11, "args": { "External id": 285255, "cbid": 211, "correlation": 285255 } }, { "ph": "s", "id": 285255, "pid": 76337, "tid": -914061504, "ts": 1716454226018573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454226018616, "dur": 10, "args": { "External id": 285263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285263, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285263, "pid": 5, "tid": 7, "ts": 1716454226018616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018606, "dur": 9, "args": { "External id": 285263, "cbid": 211, "correlation": 285263 } }, { "ph": "s", "id": 285263, "pid": 76337, "tid": -914061504, "ts": 1716454226018606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226018643, "dur": 3, "args": { "External id": 285272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285272, "pid": 5, "tid": 7, "ts": 1716454226018643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018633, "dur": 8, "args": { "External id": 285272, "cbid": 211, "correlation": 285272 } }, { "ph": "s", "id": 285272, "pid": 76337, "tid": -914061504, "ts": 1716454226018633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454226018672, "dur": 5, "args": { "External id": 285281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285281, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285281, "pid": 5, "tid": 7, "ts": 1716454226018672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018659, "dur": 12, "args": { "External id": 285281, "cbid": 211, "correlation": 285281 } }, { "ph": "s", "id": 285281, "pid": 76337, "tid": -914061504, "ts": 1716454226018659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454226018710, "dur": 8, "args": { "External id": 285291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285291, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285291, "pid": 5, "tid": 7, "ts": 1716454226018710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226018700, "dur": 10, "args": { "External id": 285291, "cbid": 211, "correlation": 285291 } }, { "ph": "s", "id": 285291, "pid": 76337, "tid": -914061504, "ts": 1716454226018700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454226018984, "dur": 1, "args": { "External id": 285302, "device": 5, "context": 1, "stream": 7, "correlation": 285302, "bytes": 4, "memory bandwidth (GB/s)": 0.0026595744680851063 } }, { "ph": "f", "id": 285302, "pid": 5, "tid": 7, "ts": 1716454226018984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226018961, "dur": 22, "args": { "External id": 285302, "cbid": 41, "correlation": 285302 } }, { "ph": "s", "id": 285302, "pid": 76337, "tid": -914061504, "ts": 1716454226018961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454226018983, "dur": 8, "args": { "External id": 285303, "cbid": 131, "correlation": 285303 } }, { "ph": "f", "id": 285303, "pid": 76337, "tid": -914061504, "ts": 1716454226018983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019074, "dur": 2, "args": { "External id": 285311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 285311, "pid": 5, "tid": 7, "ts": 1716454226019074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019058, "dur": 16, "args": { "External id": 285311, "cbid": 211, "correlation": 285311 } }, { "ph": "s", "id": 285311, "pid": 76337, "tid": -914061504, "ts": 1716454226019058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454226019157, "dur": 2, "args": { "External id": 285320, "device": 5, "context": 1, "stream": 7, "correlation": 285320, "bytes": 4, "memory bandwidth (GB/s)": 0.001358695652173913 } }, { "ph": "f", "id": 285320, "pid": 5, "tid": 7, "ts": 1716454226019157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226019138, "dur": 19, "args": { "External id": 285320, "cbid": 41, "correlation": 285320 } }, { "ph": "s", "id": 285320, "pid": 76337, "tid": -914061504, "ts": 1716454226019138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454226019213, "dur": 1, "args": { "External id": 285331, "device": 5, "context": 1, "stream": 7, "correlation": 285331, "bytes": 4, "memory bandwidth (GB/s)": 0.0026595744680851063 } }, { "ph": "f", "id": 285331, "pid": 5, "tid": 7, "ts": 1716454226019213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226019197, "dur": 14, "args": { "External id": 285331, "cbid": 41, "correlation": 285331 } }, { "ph": "s", "id": 285331, "pid": 76337, "tid": -914061504, "ts": 1716454226019197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454226019211, "dur": 8, "args": { "External id": 285332, "cbid": 131, "correlation": 285332 } }, { "ph": "f", "id": 285332, "pid": 76337, "tid": -914061504, "ts": 1716454226019211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019266, "dur": 3, "args": { "External id": 285340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285340, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285340, "pid": 5, "tid": 7, "ts": 1716454226019266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019252, "dur": 13, "args": { "External id": 285340, "cbid": 211, "correlation": 285340 } }, { "ph": "s", "id": 285340, "pid": 76337, "tid": -914061504, "ts": 1716454226019252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019295, "dur": 3, "args": { "External id": 285350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285350, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285350, "pid": 5, "tid": 7, "ts": 1716454226019295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019285, "dur": 8, "args": { "External id": 285350, "cbid": 211, "correlation": 285350 } }, { "ph": "s", "id": 285350, "pid": 76337, "tid": -914061504, "ts": 1716454226019285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019316, "dur": 3, "args": { "External id": 285359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285359, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285359, "pid": 5, "tid": 7, "ts": 1716454226019316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019308, "dur": 7, "args": { "External id": 285359, "cbid": 211, "correlation": 285359 } }, { "ph": "s", "id": 285359, "pid": 76337, "tid": -914061504, "ts": 1716454226019308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019367, "dur": 3, "args": { "External id": 285367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285367, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285367, "pid": 5, "tid": 7, "ts": 1716454226019367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019356, "dur": 11, "args": { "External id": 285367, "cbid": 211, "correlation": 285367 } }, { "ph": "s", "id": 285367, "pid": 76337, "tid": -914061504, "ts": 1716454226019356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019424, "dur": 3, "args": { "External id": 285375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285375, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285375, "pid": 5, "tid": 7, "ts": 1716454226019424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019414, "dur": 9, "args": { "External id": 285375, "cbid": 211, "correlation": 285375 } }, { "ph": "s", "id": 285375, "pid": 76337, "tid": -914061504, "ts": 1716454226019414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019477, "dur": 3, "args": { "External id": 285383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285383, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 285383, "pid": 5, "tid": 7, "ts": 1716454226019477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019466, "dur": 12, "args": { "External id": 285383, "cbid": 211, "correlation": 285383 } }, { "ph": "s", "id": 285383, "pid": 76337, "tid": -914061504, "ts": 1716454226019466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226019505, "dur": 5, "args": { "External id": 285391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285391, "pid": 5, "tid": 7, "ts": 1716454226019505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019496, "dur": 9, "args": { "External id": 285391, "cbid": 211, "correlation": 285391 } }, { "ph": "s", "id": 285391, "pid": 76337, "tid": -914061504, "ts": 1716454226019496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226019529, "dur": 4, "args": { "External id": 285399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285399, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285399, "pid": 5, "tid": 7, "ts": 1716454226019529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019520, "dur": 8, "args": { "External id": 285399, "cbid": 211, "correlation": 285399 } }, { "ph": "s", "id": 285399, "pid": 76337, "tid": -914061504, "ts": 1716454226019520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226019549, "dur": 3, "args": { "External id": 285407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285407, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285407, "pid": 5, "tid": 7, "ts": 1716454226019549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226019541, "dur": 7, "args": { "External id": 285407, "cbid": 211, "correlation": 285407 } }, { "ph": "s", "id": 285407, "pid": 76337, "tid": -914061504, "ts": 1716454226019541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226020300, "dur": 3, "args": { "External id": 285415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285415, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285415, "pid": 5, "tid": 7, "ts": 1716454226020300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226020281, "dur": 20, "args": { "External id": 285415, "cbid": 211, "correlation": 285415 } }, { "ph": "s", "id": 285415, "pid": 76337, "tid": -914061504, "ts": 1716454226020281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226020462, "dur": 1, "args": { "External id": 285425, "cbid": 317, "correlation": 285425 } }, { "ph": "f", "id": 285425, "pid": 76337, "tid": -914061504, "ts": 1716454226020462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226020465, "dur": 1, "args": { "External id": 285426, "cbid": 203, "correlation": 285426 } }, { "ph": "f", "id": 285426, "pid": 76337, "tid": -914061504, "ts": 1716454226020465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226020466, "dur": 1, "args": { "External id": 285427, "cbid": 205, "correlation": 285427 } }, { "ph": "f", "id": 285427, "pid": 76337, "tid": -914061504, "ts": 1716454226020466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226024001, "dur": 171, "args": { "External id": 285431, "cbid": 251, "correlation": 285431 } }, { "ph": "f", "id": 285431, "pid": 76337, "tid": -914061504, "ts": 1716454226024001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226024174, "dur": 45, "args": { "External id": 285432, "cbid": 251, "correlation": 285432 } }, { "ph": "f", "id": 285432, "pid": 76337, "tid": -914061504, "ts": 1716454226024174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226024221, "dur": 0, "args": { "External id": 285433, "cbid": 251, "correlation": 285433 } }, { "ph": "f", "id": 285433, "pid": 76337, "tid": -914061504, "ts": 1716454226024221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_NN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454226024251, "dur": 5, "args": { "External id": 285434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285434, "registers per thread": 64, "shared memory": 12288, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [48, 1, 4], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 285434, "pid": 5, "tid": 7, "ts": 1716454226024251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226024227, "dur": 99, "args": { "External id": 285434, "cbid": 211, "correlation": 285434 } }, { "ph": "s", "id": 285434, "pid": 76337, "tid": -914061504, "ts": 1716454226024227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226025037, "dur": 5, "args": { "External id": 285440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 285440, "pid": 5, "tid": 7, "ts": 1716454226025037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226025020, "dur": 16, "args": { "External id": 285440, "cbid": 211, "correlation": 285440 } }, { "ph": "s", "id": 285440, "pid": 76337, "tid": -914061504, "ts": 1716454226025020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226025211, "dur": 0, "args": { "External id": 285450, "cbid": 317, "correlation": 285450 } }, { "ph": "f", "id": 285450, "pid": 76337, "tid": -914061504, "ts": 1716454226025211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226025212, "dur": 1, "args": { "External id": 285451, "cbid": 203, "correlation": 285451 } }, { "ph": "f", "id": 285451, "pid": 76337, "tid": -914061504, "ts": 1716454226025212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226025214, "dur": 0, "args": { "External id": 285452, "cbid": 205, "correlation": 285452 } }, { "ph": "f", "id": 285452, "pid": 76337, "tid": -914061504, "ts": 1716454226025214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454226028781, "dur": 3, "args": { "External id": 285456, "device": 5, "context": 1, "stream": 7, "correlation": 285456, "bytes": 196608, "memory bandwidth (GB/s)": 60.23529411764706 } }, { "ph": "f", "id": 285456, "pid": 5, "tid": 7, "ts": 1716454226028781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226028738, "dur": 43, "args": { "External id": 285456, "cbid": 51, "correlation": 285456 } }, { "ph": "s", "id": 285456, "pid": 76337, "tid": -914061504, "ts": 1716454226028738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226028801, "dur": 5, "args": { "External id": 285457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285457, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4.8, "warps per SM": 38.4, "grid": [96, 1, 4], "block": [256, 1, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 285457, "pid": 5, "tid": 7, "ts": 1716454226028801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226028788, "dur": 13, "args": { "External id": 285457, "cbid": 211, "correlation": 285457 } }, { "ph": "s", "id": 285457, "pid": 76337, "tid": -914061504, "ts": 1716454226028788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454226028813, "dur": 2, "args": { "External id": 285459, "device": 5, "context": 1, "stream": 7, "correlation": 285459, "bytes": 73728, "memory bandwidth (GB/s)": 25.318681318681318 } }, { "ph": "f", "id": 285459, "pid": 5, "tid": 7, "ts": 1716454226028813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226028804, "dur": 7, "args": { "External id": 285459, "cbid": 51, "correlation": 285459 } }, { "ph": "s", "id": 285459, "pid": 76337, "tid": -914061504, "ts": 1716454226028804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226028820, "dur": 4, "args": { "External id": 285460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 1, 512], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 285460, "pid": 5, "tid": 7, "ts": 1716454226028820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226028813, "dur": 6, "args": { "External id": 285460, "cbid": 211, "correlation": 285460 } }, { "ph": "s", "id": 285460, "pid": 76337, "tid": -914061504, "ts": 1716454226028813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x64x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226029081, "dur": 72, "args": { "External id": 285462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285462, "registers per thread": 184, "shared memory": 12288, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [8, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285462, "pid": 5, "tid": 7, "ts": 1716454226029081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226028838, "dur": 245, "args": { "External id": 285462, "cbid": 211, "correlation": 285462 } }, { "ph": "s", "id": 285462, "pid": 76337, "tid": -914061504, "ts": 1716454226028838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226029155, "dur": 31, "args": { "External id": 285464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285464, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285464, "pid": 5, "tid": 7, "ts": 1716454226029155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226029092, "dur": 8, "args": { "External id": 285464, "cbid": 211, "correlation": 285464 } }, { "ph": "s", "id": 285464, "pid": 76337, "tid": -914061504, "ts": 1716454226029092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226029986, "dur": 48, "args": { "External id": 285470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285470, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285470, "pid": 5, "tid": 7, "ts": 1716454226029986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226029964, "dur": 21, "args": { "External id": 285470, "cbid": 211, "correlation": 285470 } }, { "ph": "s", "id": 285470, "pid": 76337, "tid": -914061504, "ts": 1716454226029964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226030343, "dur": 57, "args": { "External id": 285490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285490, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 285490, "pid": 5, "tid": 7, "ts": 1716454226030343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226030326, "dur": 16, "args": { "External id": 285490, "cbid": 211, "correlation": 285490 } }, { "ph": "s", "id": 285490, "pid": 76337, "tid": -914061504, "ts": 1716454226030326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226030401, "dur": 5, "args": { "External id": 285502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285502, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285502, "pid": 5, "tid": 7, "ts": 1716454226030401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226030355, "dur": 10, "args": { "External id": 285502, "cbid": 211, "correlation": 285502 } }, { "ph": "s", "id": 285502, "pid": 76337, "tid": -914061504, "ts": 1716454226030355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226030407, "dur": 43, "args": { "External id": 285505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285505, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285505, "pid": 5, "tid": 7, "ts": 1716454226030407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226030389, "dur": 9, "args": { "External id": 285505, "cbid": 211, "correlation": 285505 } }, { "ph": "s", "id": 285505, "pid": 76337, "tid": -914061504, "ts": 1716454226030389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226030469, "dur": 31, "args": { "External id": 285514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285514, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285514, "pid": 5, "tid": 7, "ts": 1716454226030469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226030457, "dur": 12, "args": { "External id": 285514, "cbid": 211, "correlation": 285514 } }, { "ph": "s", "id": 285514, "pid": 76337, "tid": -914061504, "ts": 1716454226030457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226030541, "dur": 0, "args": { "External id": 285524, "cbid": 317, "correlation": 285524 } }, { "ph": "f", "id": 285524, "pid": 76337, "tid": -914061504, "ts": 1716454226030541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226030543, "dur": 0, "args": { "External id": 285525, "cbid": 203, "correlation": 285525 } }, { "ph": "f", "id": 285525, "pid": 76337, "tid": -914061504, "ts": 1716454226030543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226030544, "dur": 0, "args": { "External id": 285526, "cbid": 205, "correlation": 285526 } }, { "ph": "f", "id": 285526, "pid": 76337, "tid": -914061504, "ts": 1716454226030544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226033677, "dur": 34, "args": { "External id": 285530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285530, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285530, "pid": 5, "tid": 7, "ts": 1716454226033677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226033652, "dur": 26, "args": { "External id": 285530, "cbid": 211, "correlation": 285530 } }, { "ph": "s", "id": 285530, "pid": 76337, "tid": -914061504, "ts": 1716454226033652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226033713, "dur": 28, "args": { "External id": 285532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285532, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285532, "pid": 5, "tid": 7, "ts": 1716454226033713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226033682, "dur": 6, "args": { "External id": 285532, "cbid": 211, "correlation": 285532 } }, { "ph": "s", "id": 285532, "pid": 76337, "tid": -914061504, "ts": 1716454226033682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226033742, "dur": 746, "args": { "External id": 285534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285534, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285534, "pid": 5, "tid": 7, "ts": 1716454226033742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226033706, "dur": 11, "args": { "External id": 285534, "cbid": 211, "correlation": 285534 } }, { "ph": "s", "id": 285534, "pid": 76337, "tid": -914061504, "ts": 1716454226033706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226034490, "dur": 30, "args": { "External id": 285536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285536, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285536, "pid": 5, "tid": 7, "ts": 1716454226034490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226033723, "dur": 7, "args": { "External id": 285536, "cbid": 211, "correlation": 285536 } }, { "ph": "s", "id": 285536, "pid": 76337, "tid": -914061504, "ts": 1716454226033723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226034541, "dur": 47, "args": { "External id": 285542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285542, "pid": 5, "tid": 7, "ts": 1716454226034541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034526, "dur": 14, "args": { "External id": 285542, "cbid": 211, "correlation": 285542 } }, { "ph": "s", "id": 285542, "pid": 76337, "tid": -914061504, "ts": 1716454226034526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226034684, "dur": 57, "args": { "External id": 285562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285562, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 285562, "pid": 5, "tid": 7, "ts": 1716454226034684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034671, "dur": 13, "args": { "External id": 285562, "cbid": 211, "correlation": 285562 } }, { "ph": "s", "id": 285562, "pid": 76337, "tid": -914061504, "ts": 1716454226034671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226034743, "dur": 4, "args": { "External id": 285574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285574, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285574, "pid": 5, "tid": 7, "ts": 1716454226034743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034698, "dur": 9, "args": { "External id": 285574, "cbid": 211, "correlation": 285574 } }, { "ph": "s", "id": 285574, "pid": 76337, "tid": -914061504, "ts": 1716454226034698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226034748, "dur": 43, "args": { "External id": 285577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285577, "pid": 5, "tid": 7, "ts": 1716454226034748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034735, "dur": 8, "args": { "External id": 285577, "cbid": 211, "correlation": 285577 } }, { "ph": "s", "id": 285577, "pid": 76337, "tid": -914061504, "ts": 1716454226034735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226034804, "dur": 30, "args": { "External id": 285586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285586, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285586, "pid": 5, "tid": 7, "ts": 1716454226034804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034791, "dur": 12, "args": { "External id": 285586, "cbid": 211, "correlation": 285586 } }, { "ph": "s", "id": 285586, "pid": 76337, "tid": -914061504, "ts": 1716454226034791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226034891, "dur": 0, "args": { "External id": 285596, "cbid": 317, "correlation": 285596 } }, { "ph": "f", "id": 285596, "pid": 76337, "tid": -914061504, "ts": 1716454226034891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226034892, "dur": 1, "args": { "External id": 285597, "cbid": 203, "correlation": 285597 } }, { "ph": "f", "id": 285597, "pid": 76337, "tid": -914061504, "ts": 1716454226034892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226034894, "dur": 0, "args": { "External id": 285598, "cbid": 205, "correlation": 285598 } }, { "ph": "f", "id": 285598, "pid": 76337, "tid": -914061504, "ts": 1716454226034894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226034923, "dur": 32, "args": { "External id": 285602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285602, "pid": 5, "tid": 7, "ts": 1716454226034923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034910, "dur": 12, "args": { "External id": 285602, "cbid": 211, "correlation": 285602 } }, { "ph": "s", "id": 285602, "pid": 76337, "tid": -914061504, "ts": 1716454226034910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226034956, "dur": 29, "args": { "External id": 285604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285604, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285604, "pid": 5, "tid": 7, "ts": 1716454226034956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034925, "dur": 6, "args": { "External id": 285604, "cbid": 211, "correlation": 285604 } }, { "ph": "s", "id": 285604, "pid": 76337, "tid": -914061504, "ts": 1716454226034925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226034986, "dur": 725, "args": { "External id": 285606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285606, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285606, "pid": 5, "tid": 7, "ts": 1716454226034986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034937, "dur": 6, "args": { "External id": 285606, "cbid": 211, "correlation": 285606 } }, { "ph": "s", "id": 285606, "pid": 76337, "tid": -914061504, "ts": 1716454226034937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226035712, "dur": 30, "args": { "External id": 285608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285608, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285608, "pid": 5, "tid": 7, "ts": 1716454226035712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034946, "dur": 6, "args": { "External id": 285608, "cbid": 211, "correlation": 285608 } }, { "ph": "s", "id": 285608, "pid": 76337, "tid": -914061504, "ts": 1716454226034946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226035744, "dur": 48, "args": { "External id": 285614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285614, "pid": 5, "tid": 7, "ts": 1716454226035744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226034983, "dur": 9, "args": { "External id": 285614, "cbid": 211, "correlation": 285614 } }, { "ph": "s", "id": 285614, "pid": 76337, "tid": -914061504, "ts": 1716454226034983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226035793, "dur": 41, "args": { "External id": 285622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285622, "registers per thread": 19, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285622, "pid": 5, "tid": 7, "ts": 1716454226035793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035026, "dur": 10, "args": { "External id": 285622, "cbid": 211, "correlation": 285622 } }, { "ph": "s", "id": 285622, "pid": 76337, "tid": -914061504, "ts": 1716454226035026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226035836, "dur": 29, "args": { "External id": 285630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285630, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285630, "pid": 5, "tid": 7, "ts": 1716454226035836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035077, "dur": 10, "args": { "External id": 285630, "cbid": 211, "correlation": 285630 } }, { "ph": "s", "id": 285630, "pid": 76337, "tid": -914061504, "ts": 1716454226035077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226035866, "dur": 60, "args": { "External id": 285650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285650, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 285650, "pid": 5, "tid": 7, "ts": 1716454226035866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035310, "dur": 15, "args": { "External id": 285650, "cbid": 211, "correlation": 285650 } }, { "ph": "s", "id": 285650, "pid": 76337, "tid": -914061504, "ts": 1716454226035310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226035928, "dur": 4, "args": { "External id": 285662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285662, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285662, "pid": 5, "tid": 7, "ts": 1716454226035928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035339, "dur": 7, "args": { "External id": 285662, "cbid": 211, "correlation": 285662 } }, { "ph": "s", "id": 285662, "pid": 76337, "tid": -914061504, "ts": 1716454226035339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226035933, "dur": 43, "args": { "External id": 285665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285665, "pid": 5, "tid": 7, "ts": 1716454226035933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035358, "dur": 7, "args": { "External id": 285665, "cbid": 211, "correlation": 285665 } }, { "ph": "s", "id": 285665, "pid": 76337, "tid": -914061504, "ts": 1716454226035358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035527, "dur": 104, "args": { "External id": 285677, "cbid": 251, "correlation": 285677 } }, { "ph": "f", "id": 285677, "pid": 76337, "tid": -914061504, "ts": 1716454226035527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035633, "dur": 1, "args": { "External id": 285678, "cbid": 251, "correlation": 285678 } }, { "ph": "f", "id": 285678, "pid": 76337, "tid": -914061504, "ts": 1716454226035633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035635, "dur": 45, "args": { "External id": 285679, "cbid": 251, "correlation": 285679 } }, { "ph": "f", "id": 285679, "pid": 76337, "tid": -914061504, "ts": 1716454226035635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035681, "dur": 3, "args": { "External id": 285680, "cbid": 251, "correlation": 285680 } }, { "ph": "f", "id": 285680, "pid": 76337, "tid": -914061504, "ts": 1716454226035681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035686, "dur": 51, "args": { "External id": 285681, "cbid": 251, "correlation": 285681 } }, { "ph": "f", "id": 285681, "pid": 76337, "tid": -914061504, "ts": 1716454226035686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035737, "dur": 1, "args": { "External id": 285682, "cbid": 251, "correlation": 285682 } }, { "ph": "f", "id": 285682, "pid": 76337, "tid": -914061504, "ts": 1716454226035737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035739, "dur": 1, "args": { "External id": 285683, "cbid": 251, "correlation": 285683 } }, { "ph": "f", "id": 285683, "pid": 76337, "tid": -914061504, "ts": 1716454226035739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035741, "dur": 40, "args": { "External id": 285684, "cbid": 251, "correlation": 285684 } }, { "ph": "f", "id": 285684, "pid": 76337, "tid": -914061504, "ts": 1716454226035741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035783, "dur": 0, "args": { "External id": 285685, "cbid": 251, "correlation": 285685 } }, { "ph": "f", "id": 285685, "pid": 76337, "tid": -914061504, "ts": 1716454226035783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454226036003, "dur": 104, "args": { "External id": 285686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285686, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [8, 24, 4], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 285686, "pid": 5, "tid": 7, "ts": 1716454226036003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035793, "dur": 16, "args": { "External id": 285686, "cbid": 211, "correlation": 285686 } }, { "ph": "s", "id": 285686, "pid": 76337, "tid": -914061504, "ts": 1716454226035793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226036109, "dur": 46, "args": { "External id": 285691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285691, "pid": 5, "tid": 7, "ts": 1716454226036109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035842, "dur": 10, "args": { "External id": 285691, "cbid": 211, "correlation": 285691 } }, { "ph": "s", "id": 285691, "pid": 76337, "tid": -914061504, "ts": 1716454226035842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035926, "dur": 1, "args": { "External id": 285702, "cbid": 251, "correlation": 285702 } }, { "ph": "f", "id": 285702, "pid": 76337, "tid": -914061504, "ts": 1716454226035926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035928, "dur": 0, "args": { "External id": 285703, "cbid": 251, "correlation": 285703 } }, { "ph": "f", "id": 285703, "pid": 76337, "tid": -914061504, "ts": 1716454226035928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035928, "dur": 0, "args": { "External id": 285704, "cbid": 251, "correlation": 285704 } }, { "ph": "f", "id": 285704, "pid": 76337, "tid": -914061504, "ts": 1716454226035928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035929, "dur": 0, "args": { "External id": 285705, "cbid": 251, "correlation": 285705 } }, { "ph": "f", "id": 285705, "pid": 76337, "tid": -914061504, "ts": 1716454226035929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035930, "dur": 0, "args": { "External id": 285706, "cbid": 251, "correlation": 285706 } }, { "ph": "f", "id": 285706, "pid": 76337, "tid": -914061504, "ts": 1716454226035930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035930, "dur": 0, "args": { "External id": 285707, "cbid": 251, "correlation": 285707 } }, { "ph": "f", "id": 285707, "pid": 76337, "tid": -914061504, "ts": 1716454226035930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035931, "dur": 0, "args": { "External id": 285708, "cbid": 251, "correlation": 285708 } }, { "ph": "f", "id": 285708, "pid": 76337, "tid": -914061504, "ts": 1716454226035931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035932, "dur": 0, "args": { "External id": 285709, "cbid": 251, "correlation": 285709 } }, { "ph": "f", "id": 285709, "pid": 76337, "tid": -914061504, "ts": 1716454226035932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226035933, "dur": 0, "args": { "External id": 285710, "cbid": 251, "correlation": 285710 } }, { "ph": "f", "id": 285710, "pid": 76337, "tid": -914061504, "ts": 1716454226035933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454226036157, "dur": 101, "args": { "External id": 285711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285711, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [8, 24, 4], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 285711, "pid": 5, "tid": 7, "ts": 1716454226036157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035936, "dur": 13, "args": { "External id": 285711, "cbid": 211, "correlation": 285711 } }, { "ph": "s", "id": 285711, "pid": 76337, "tid": -914061504, "ts": 1716454226035936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226036259, "dur": 46, "args": { "External id": 285716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285716, "pid": 5, "tid": 7, "ts": 1716454226036259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226035967, "dur": 18, "args": { "External id": 285716, "cbid": 211, "correlation": 285716 } }, { "ph": "s", "id": 285716, "pid": 76337, "tid": -914061504, "ts": 1716454226035967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036048, "dur": 1, "args": { "External id": 285727, "cbid": 251, "correlation": 285727 } }, { "ph": "f", "id": 285727, "pid": 76337, "tid": -914061504, "ts": 1716454226036048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036050, "dur": 0, "args": { "External id": 285728, "cbid": 251, "correlation": 285728 } }, { "ph": "f", "id": 285728, "pid": 76337, "tid": -914061504, "ts": 1716454226036050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036050, "dur": 0, "args": { "External id": 285729, "cbid": 251, "correlation": 285729 } }, { "ph": "f", "id": 285729, "pid": 76337, "tid": -914061504, "ts": 1716454226036050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036051, "dur": 0, "args": { "External id": 285730, "cbid": 251, "correlation": 285730 } }, { "ph": "f", "id": 285730, "pid": 76337, "tid": -914061504, "ts": 1716454226036051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036051, "dur": 0, "args": { "External id": 285731, "cbid": 251, "correlation": 285731 } }, { "ph": "f", "id": 285731, "pid": 76337, "tid": -914061504, "ts": 1716454226036051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036052, "dur": 0, "args": { "External id": 285732, "cbid": 251, "correlation": 285732 } }, { "ph": "f", "id": 285732, "pid": 76337, "tid": -914061504, "ts": 1716454226036052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036053, "dur": 0, "args": { "External id": 285733, "cbid": 251, "correlation": 285733 } }, { "ph": "f", "id": 285733, "pid": 76337, "tid": -914061504, "ts": 1716454226036053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036053, "dur": 0, "args": { "External id": 285734, "cbid": 251, "correlation": 285734 } }, { "ph": "f", "id": 285734, "pid": 76337, "tid": -914061504, "ts": 1716454226036053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036054, "dur": 0, "args": { "External id": 285735, "cbid": 251, "correlation": 285735 } }, { "ph": "f", "id": 285735, "pid": 76337, "tid": -914061504, "ts": 1716454226036054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_tt", "pid": 5, "tid": 7, "ts": 1716454226036306, "dur": 104, "args": { "External id": 285736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285736, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [8, 24, 4], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 285736, "pid": 5, "tid": 7, "ts": 1716454226036306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036058, "dur": 13, "args": { "External id": 285736, "cbid": 211, "correlation": 285736 } }, { "ph": "s", "id": 285736, "pid": 76337, "tid": -914061504, "ts": 1716454226036058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226036412, "dur": 46, "args": { "External id": 285741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285741, "pid": 5, "tid": 7, "ts": 1716454226036412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036090, "dur": 9, "args": { "External id": 285741, "cbid": 211, "correlation": 285741 } }, { "ph": "s", "id": 285741, "pid": 76337, "tid": -914061504, "ts": 1716454226036090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454226036459, "dur": 3561, "args": { "External id": 285766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285766, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [96, 1, 4], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285766, "pid": 5, "tid": 7, "ts": 1716454226036459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036195, "dur": 15, "args": { "External id": 285766, "cbid": 211, "correlation": 285766 } }, { "ph": "s", "id": 285766, "pid": 76337, "tid": -914061504, "ts": 1716454226036195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036349, "dur": 2, "args": { "External id": 285784, "cbid": 251, "correlation": 285784 } }, { "ph": "f", "id": 285784, "pid": 76337, "tid": -914061504, "ts": 1716454226036349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226036355, "dur": 0, "args": { "External id": 285786, "cbid": 251, "correlation": 285786 } }, { "ph": "f", "id": 285786, "pid": 76337, "tid": -914061504, "ts": 1716454226036355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454226040021, "dur": 112, "args": { "External id": 285787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285787, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [4, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285787, "pid": 5, "tid": 7, "ts": 1716454226040021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036361, "dur": 15, "args": { "External id": 285787, "cbid": 211, "correlation": 285787 } }, { "ph": "s", "id": 285787, "pid": 76337, "tid": -914061504, "ts": 1716454226036361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226040135, "dur": 122, "args": { "External id": 285795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285795, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285795, "pid": 5, "tid": 7, "ts": 1716454226040135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036461, "dur": 14, "args": { "External id": 285795, "cbid": 211, "correlation": 285795 } }, { "ph": "s", "id": 285795, "pid": 76337, "tid": -914061504, "ts": 1716454226036461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226040259, "dur": 28, "args": { "External id": 285803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285803, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285803, "pid": 5, "tid": 7, "ts": 1716454226040259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036501, "dur": 10, "args": { "External id": 285803, "cbid": 211, "correlation": 285803 } }, { "ph": "s", "id": 285803, "pid": 76337, "tid": -914061504, "ts": 1716454226036501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226040288, "dur": 117, "args": { "External id": 285813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285813, "pid": 5, "tid": 7, "ts": 1716454226040288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036600, "dur": 16, "args": { "External id": 285813, "cbid": 211, "correlation": 285813 } }, { "ph": "s", "id": 285813, "pid": 76337, "tid": -914061504, "ts": 1716454226036600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226040406, "dur": 61, "args": { "External id": 285834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285834, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 285834, "pid": 5, "tid": 7, "ts": 1716454226040406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036639, "dur": 8, "args": { "External id": 285834, "cbid": 211, "correlation": 285834 } }, { "ph": "s", "id": 285834, "pid": 76337, "tid": -914061504, "ts": 1716454226036639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226040469, "dur": 4, "args": { "External id": 285846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285846, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285846, "pid": 5, "tid": 7, "ts": 1716454226040469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036659, "dur": 6, "args": { "External id": 285846, "cbid": 211, "correlation": 285846 } }, { "ph": "s", "id": 285846, "pid": 76337, "tid": -914061504, "ts": 1716454226036659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226040474, "dur": 45, "args": { "External id": 285849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285849, "pid": 5, "tid": 7, "ts": 1716454226040474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036675, "dur": 6, "args": { "External id": 285849, "cbid": 211, "correlation": 285849 } }, { "ph": "s", "id": 285849, "pid": 76337, "tid": -914061504, "ts": 1716454226036675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226040521, "dur": 30, "args": { "External id": 285858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285858, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285858, "pid": 5, "tid": 7, "ts": 1716454226040521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036717, "dur": 9, "args": { "External id": 285858, "cbid": 211, "correlation": 285858 } }, { "ph": "s", "id": 285858, "pid": 76337, "tid": -914061504, "ts": 1716454226036717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226036780, "dur": 0, "args": { "External id": 285868, "cbid": 317, "correlation": 285868 } }, { "ph": "f", "id": 285868, "pid": 76337, "tid": -914061504, "ts": 1716454226036780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226036781, "dur": 0, "args": { "External id": 285869, "cbid": 203, "correlation": 285869 } }, { "ph": "f", "id": 285869, "pid": 76337, "tid": -914061504, "ts": 1716454226036781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226036782, "dur": 0, "args": { "External id": 285870, "cbid": 205, "correlation": 285870 } }, { "ph": "f", "id": 285870, "pid": 76337, "tid": -914061504, "ts": 1716454226036782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226040552, "dur": 34, "args": { "External id": 285874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285874, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285874, "pid": 5, "tid": 7, "ts": 1716454226040552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036796, "dur": 13, "args": { "External id": 285874, "cbid": 211, "correlation": 285874 } }, { "ph": "s", "id": 285874, "pid": 76337, "tid": -914061504, "ts": 1716454226036796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226040588, "dur": 28, "args": { "External id": 285876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285876, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285876, "pid": 5, "tid": 7, "ts": 1716454226040588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036811, "dur": 5, "args": { "External id": 285876, "cbid": 211, "correlation": 285876 } }, { "ph": "s", "id": 285876, "pid": 76337, "tid": -914061504, "ts": 1716454226036811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226040618, "dur": 743, "args": { "External id": 285878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285878, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285878, "pid": 5, "tid": 7, "ts": 1716454226040618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036823, "dur": 6, "args": { "External id": 285878, "cbid": 211, "correlation": 285878 } }, { "ph": "s", "id": 285878, "pid": 76337, "tid": -914061504, "ts": 1716454226036823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226041362, "dur": 30, "args": { "External id": 285880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285880, "pid": 5, "tid": 7, "ts": 1716454226041362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036833, "dur": 5, "args": { "External id": 285880, "cbid": 211, "correlation": 285880 } }, { "ph": "s", "id": 285880, "pid": 76337, "tid": -914061504, "ts": 1716454226036833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226041394, "dur": 47, "args": { "External id": 285886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285886, "pid": 5, "tid": 7, "ts": 1716454226041394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036862, "dur": 9, "args": { "External id": 285886, "cbid": 211, "correlation": 285886 } }, { "ph": "s", "id": 285886, "pid": 76337, "tid": -914061504, "ts": 1716454226036862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226041442, "dur": 59, "args": { "External id": 285906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285906, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 285906, "pid": 5, "tid": 7, "ts": 1716454226041442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036936, "dur": 12, "args": { "External id": 285906, "cbid": 211, "correlation": 285906 } }, { "ph": "s", "id": 285906, "pid": 76337, "tid": -914061504, "ts": 1716454226036936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226041502, "dur": 4, "args": { "External id": 285918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285918, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 285918, "pid": 5, "tid": 7, "ts": 1716454226041502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036959, "dur": 6, "args": { "External id": 285918, "cbid": 211, "correlation": 285918 } }, { "ph": "s", "id": 285918, "pid": 76337, "tid": -914061504, "ts": 1716454226036959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226041507, "dur": 44, "args": { "External id": 285921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285921, "pid": 5, "tid": 7, "ts": 1716454226041507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226036987, "dur": 7, "args": { "External id": 285921, "cbid": 211, "correlation": 285921 } }, { "ph": "s", "id": 285921, "pid": 76337, "tid": -914061504, "ts": 1716454226036987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226041552, "dur": 29, "args": { "External id": 285930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285930, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285930, "pid": 5, "tid": 7, "ts": 1716454226041552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037029, "dur": 10, "args": { "External id": 285930, "cbid": 211, "correlation": 285930 } }, { "ph": "s", "id": 285930, "pid": 76337, "tid": -914061504, "ts": 1716454226037029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226037094, "dur": 0, "args": { "External id": 285940, "cbid": 317, "correlation": 285940 } }, { "ph": "f", "id": 285940, "pid": 76337, "tid": -914061504, "ts": 1716454226037094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226037095, "dur": 0, "args": { "External id": 285941, "cbid": 203, "correlation": 285941 } }, { "ph": "f", "id": 285941, "pid": 76337, "tid": -914061504, "ts": 1716454226037095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226037096, "dur": 0, "args": { "External id": 285942, "cbid": 205, "correlation": 285942 } }, { "ph": "f", "id": 285942, "pid": 76337, "tid": -914061504, "ts": 1716454226037096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226041583, "dur": 33, "args": { "External id": 285946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285946, "pid": 5, "tid": 7, "ts": 1716454226041583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037109, "dur": 12, "args": { "External id": 285946, "cbid": 211, "correlation": 285946 } }, { "ph": "s", "id": 285946, "pid": 76337, "tid": -914061504, "ts": 1716454226037109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226041617, "dur": 28, "args": { "External id": 285948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285948, "pid": 5, "tid": 7, "ts": 1716454226041617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037124, "dur": 5, "args": { "External id": 285948, "cbid": 211, "correlation": 285948 } }, { "ph": "s", "id": 285948, "pid": 76337, "tid": -914061504, "ts": 1716454226037124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226041647, "dur": 727, "args": { "External id": 285950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285950, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 285950, "pid": 5, "tid": 7, "ts": 1716454226041647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037135, "dur": 6, "args": { "External id": 285950, "cbid": 211, "correlation": 285950 } }, { "ph": "s", "id": 285950, "pid": 76337, "tid": -914061504, "ts": 1716454226037135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226042375, "dur": 30, "args": { "External id": 285952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285952, "pid": 5, "tid": 7, "ts": 1716454226042375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037144, "dur": 5, "args": { "External id": 285952, "cbid": 211, "correlation": 285952 } }, { "ph": "s", "id": 285952, "pid": 76337, "tid": -914061504, "ts": 1716454226037144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226042406, "dur": 47, "args": { "External id": 285958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285958, "pid": 5, "tid": 7, "ts": 1716454226042406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037172, "dur": 8, "args": { "External id": 285958, "cbid": 211, "correlation": 285958 } }, { "ph": "s", "id": 285958, "pid": 76337, "tid": -914061504, "ts": 1716454226037172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226042454, "dur": 123, "args": { "External id": 285966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285966, "pid": 5, "tid": 7, "ts": 1716454226042454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037207, "dur": 9, "args": { "External id": 285966, "cbid": 211, "correlation": 285966 } }, { "ph": "s", "id": 285966, "pid": 76337, "tid": -914061504, "ts": 1716454226037207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226042578, "dur": 28, "args": { "External id": 285974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285974, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285974, "pid": 5, "tid": 7, "ts": 1716454226042578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037237, "dur": 8, "args": { "External id": 285974, "cbid": 211, "correlation": 285974 } }, { "ph": "s", "id": 285974, "pid": 76337, "tid": -914061504, "ts": 1716454226037237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226042607, "dur": 117, "args": { "External id": 285984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 285984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 285984, "pid": 5, "tid": 7, "ts": 1716454226042607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037343, "dur": 14, "args": { "External id": 285984, "cbid": 211, "correlation": 285984 } }, { "ph": "s", "id": 285984, "pid": 76337, "tid": -914061504, "ts": 1716454226037343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226042726, "dur": 61, "args": { "External id": 286005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286005, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286005, "pid": 5, "tid": 7, "ts": 1716454226042726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037381, "dur": 8, "args": { "External id": 286005, "cbid": 211, "correlation": 286005 } }, { "ph": "s", "id": 286005, "pid": 76337, "tid": -914061504, "ts": 1716454226037381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226042788, "dur": 4, "args": { "External id": 286017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286017, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286017, "pid": 5, "tid": 7, "ts": 1716454226042788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037398, "dur": 6, "args": { "External id": 286017, "cbid": 211, "correlation": 286017 } }, { "ph": "s", "id": 286017, "pid": 76337, "tid": -914061504, "ts": 1716454226037398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226042793, "dur": 43, "args": { "External id": 286020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286020, "pid": 5, "tid": 7, "ts": 1716454226042793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037416, "dur": 6, "args": { "External id": 286020, "cbid": 211, "correlation": 286020 } }, { "ph": "s", "id": 286020, "pid": 76337, "tid": -914061504, "ts": 1716454226037416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226042838, "dur": 30, "args": { "External id": 286029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286029, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286029, "pid": 5, "tid": 7, "ts": 1716454226042838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037455, "dur": 10, "args": { "External id": 286029, "cbid": 211, "correlation": 286029 } }, { "ph": "s", "id": 286029, "pid": 76337, "tid": -914061504, "ts": 1716454226037455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226037517, "dur": 0, "args": { "External id": 286039, "cbid": 317, "correlation": 286039 } }, { "ph": "f", "id": 286039, "pid": 76337, "tid": -914061504, "ts": 1716454226037517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226037518, "dur": 0, "args": { "External id": 286040, "cbid": 203, "correlation": 286040 } }, { "ph": "f", "id": 286040, "pid": 76337, "tid": -914061504, "ts": 1716454226037518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226037519, "dur": 0, "args": { "External id": 286041, "cbid": 205, "correlation": 286041 } }, { "ph": "f", "id": 286041, "pid": 76337, "tid": -914061504, "ts": 1716454226037519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226042869, "dur": 34, "args": { "External id": 286045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286045, "pid": 5, "tid": 7, "ts": 1716454226042869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037535, "dur": 12, "args": { "External id": 286045, "cbid": 211, "correlation": 286045 } }, { "ph": "s", "id": 286045, "pid": 76337, "tid": -914061504, "ts": 1716454226037535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226042904, "dur": 29, "args": { "External id": 286047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286047, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286047, "pid": 5, "tid": 7, "ts": 1716454226042904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037549, "dur": 5, "args": { "External id": 286047, "cbid": 211, "correlation": 286047 } }, { "ph": "s", "id": 286047, "pid": 76337, "tid": -914061504, "ts": 1716454226037549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226042934, "dur": 726, "args": { "External id": 286049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286049, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286049, "pid": 5, "tid": 7, "ts": 1716454226042934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037561, "dur": 6, "args": { "External id": 286049, "cbid": 211, "correlation": 286049 } }, { "ph": "s", "id": 286049, "pid": 76337, "tid": -914061504, "ts": 1716454226037561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226043661, "dur": 30, "args": { "External id": 286051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286051, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286051, "pid": 5, "tid": 7, "ts": 1716454226043661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037570, "dur": 5, "args": { "External id": 286051, "cbid": 211, "correlation": 286051 } }, { "ph": "s", "id": 286051, "pid": 76337, "tid": -914061504, "ts": 1716454226037570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226043693, "dur": 48, "args": { "External id": 286057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286057, "pid": 5, "tid": 7, "ts": 1716454226043693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037597, "dur": 9, "args": { "External id": 286057, "cbid": 211, "correlation": 286057 } }, { "ph": "s", "id": 286057, "pid": 76337, "tid": -914061504, "ts": 1716454226037597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226043742, "dur": 57, "args": { "External id": 286077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286077, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286077, "pid": 5, "tid": 7, "ts": 1716454226043742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037672, "dur": 12, "args": { "External id": 286077, "cbid": 211, "correlation": 286077 } }, { "ph": "s", "id": 286077, "pid": 76337, "tid": -914061504, "ts": 1716454226037672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226043800, "dur": 4, "args": { "External id": 286089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286089, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286089, "pid": 5, "tid": 7, "ts": 1716454226043800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037696, "dur": 6, "args": { "External id": 286089, "cbid": 211, "correlation": 286089 } }, { "ph": "s", "id": 286089, "pid": 76337, "tid": -914061504, "ts": 1716454226037696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226043805, "dur": 43, "args": { "External id": 286092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286092, "pid": 5, "tid": 7, "ts": 1716454226043805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037714, "dur": 6, "args": { "External id": 286092, "cbid": 211, "correlation": 286092 } }, { "ph": "s", "id": 286092, "pid": 76337, "tid": -914061504, "ts": 1716454226037714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226043849, "dur": 30, "args": { "External id": 286101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286101, "pid": 5, "tid": 7, "ts": 1716454226043849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037753, "dur": 11, "args": { "External id": 286101, "cbid": 211, "correlation": 286101 } }, { "ph": "s", "id": 286101, "pid": 76337, "tid": -914061504, "ts": 1716454226037753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226037821, "dur": 0, "args": { "External id": 286111, "cbid": 317, "correlation": 286111 } }, { "ph": "f", "id": 286111, "pid": 76337, "tid": -914061504, "ts": 1716454226037821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226037821, "dur": 0, "args": { "External id": 286112, "cbid": 203, "correlation": 286112 } }, { "ph": "f", "id": 286112, "pid": 76337, "tid": -914061504, "ts": 1716454226037821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226037822, "dur": 0, "args": { "External id": 286113, "cbid": 205, "correlation": 286113 } }, { "ph": "f", "id": 286113, "pid": 76337, "tid": -914061504, "ts": 1716454226037822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226043880, "dur": 32, "args": { "External id": 286117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286117, "pid": 5, "tid": 7, "ts": 1716454226043880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037837, "dur": 12, "args": { "External id": 286117, "cbid": 211, "correlation": 286117 } }, { "ph": "s", "id": 286117, "pid": 76337, "tid": -914061504, "ts": 1716454226037837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226043914, "dur": 28, "args": { "External id": 286119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286119, "pid": 5, "tid": 7, "ts": 1716454226043914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037851, "dur": 5, "args": { "External id": 286119, "cbid": 211, "correlation": 286119 } }, { "ph": "s", "id": 286119, "pid": 76337, "tid": -914061504, "ts": 1716454226037851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226043943, "dur": 726, "args": { "External id": 286121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286121, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286121, "pid": 5, "tid": 7, "ts": 1716454226043943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037863, "dur": 6, "args": { "External id": 286121, "cbid": 211, "correlation": 286121 } }, { "ph": "s", "id": 286121, "pid": 76337, "tid": -914061504, "ts": 1716454226037863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226044671, "dur": 32, "args": { "External id": 286123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286123, "pid": 5, "tid": 7, "ts": 1716454226044671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037872, "dur": 5, "args": { "External id": 286123, "cbid": 211, "correlation": 286123 } }, { "ph": "s", "id": 286123, "pid": 76337, "tid": -914061504, "ts": 1716454226037872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226044705, "dur": 48, "args": { "External id": 286129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286129, "pid": 5, "tid": 7, "ts": 1716454226044705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037899, "dur": 8, "args": { "External id": 286129, "cbid": 211, "correlation": 286129 } }, { "ph": "s", "id": 286129, "pid": 76337, "tid": -914061504, "ts": 1716454226037899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226044754, "dur": 123, "args": { "External id": 286137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286137, "pid": 5, "tid": 7, "ts": 1716454226044754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037933, "dur": 9, "args": { "External id": 286137, "cbid": 211, "correlation": 286137 } }, { "ph": "s", "id": 286137, "pid": 76337, "tid": -914061504, "ts": 1716454226037933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226044878, "dur": 28, "args": { "External id": 286145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286145, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286145, "pid": 5, "tid": 7, "ts": 1716454226044878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226037964, "dur": 8, "args": { "External id": 286145, "cbid": 211, "correlation": 286145 } }, { "ph": "s", "id": 286145, "pid": 76337, "tid": -914061504, "ts": 1716454226037964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226044907, "dur": 117, "args": { "External id": 286155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286155, "pid": 5, "tid": 7, "ts": 1716454226044907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038046, "dur": 13, "args": { "External id": 286155, "cbid": 211, "correlation": 286155 } }, { "ph": "s", "id": 286155, "pid": 76337, "tid": -914061504, "ts": 1716454226038046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226045025, "dur": 59, "args": { "External id": 286176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286176, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286176, "pid": 5, "tid": 7, "ts": 1716454226045025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038083, "dur": 8, "args": { "External id": 286176, "cbid": 211, "correlation": 286176 } }, { "ph": "s", "id": 286176, "pid": 76337, "tid": -914061504, "ts": 1716454226038083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226045085, "dur": 4, "args": { "External id": 286188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286188, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286188, "pid": 5, "tid": 7, "ts": 1716454226045085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038100, "dur": 6, "args": { "External id": 286188, "cbid": 211, "correlation": 286188 } }, { "ph": "s", "id": 286188, "pid": 76337, "tid": -914061504, "ts": 1716454226038100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226045091, "dur": 44, "args": { "External id": 286191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286191, "pid": 5, "tid": 7, "ts": 1716454226045091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038118, "dur": 6, "args": { "External id": 286191, "cbid": 211, "correlation": 286191 } }, { "ph": "s", "id": 286191, "pid": 76337, "tid": -914061504, "ts": 1716454226038118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226045136, "dur": 30, "args": { "External id": 286200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286200, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286200, "pid": 5, "tid": 7, "ts": 1716454226045136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038157, "dur": 10, "args": { "External id": 286200, "cbid": 211, "correlation": 286200 } }, { "ph": "s", "id": 286200, "pid": 76337, "tid": -914061504, "ts": 1716454226038157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226038209, "dur": 0, "args": { "External id": 286210, "cbid": 317, "correlation": 286210 } }, { "ph": "f", "id": 286210, "pid": 76337, "tid": -914061504, "ts": 1716454226038209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226038209, "dur": 0, "args": { "External id": 286211, "cbid": 203, "correlation": 286211 } }, { "ph": "f", "id": 286211, "pid": 76337, "tid": -914061504, "ts": 1716454226038209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226038210, "dur": 0, "args": { "External id": 286212, "cbid": 205, "correlation": 286212 } }, { "ph": "f", "id": 286212, "pid": 76337, "tid": -914061504, "ts": 1716454226038210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226045168, "dur": 32, "args": { "External id": 286216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286216, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286216, "pid": 5, "tid": 7, "ts": 1716454226045168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038224, "dur": 11, "args": { "External id": 286216, "cbid": 211, "correlation": 286216 } }, { "ph": "s", "id": 286216, "pid": 76337, "tid": -914061504, "ts": 1716454226038224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226045201, "dur": 28, "args": { "External id": 286218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286218, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286218, "pid": 5, "tid": 7, "ts": 1716454226045201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038238, "dur": 5, "args": { "External id": 286218, "cbid": 211, "correlation": 286218 } }, { "ph": "s", "id": 286218, "pid": 76337, "tid": -914061504, "ts": 1716454226038238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226045230, "dur": 728, "args": { "External id": 286220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286220, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286220, "pid": 5, "tid": 7, "ts": 1716454226045230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038249, "dur": 6, "args": { "External id": 286220, "cbid": 211, "correlation": 286220 } }, { "ph": "s", "id": 286220, "pid": 76337, "tid": -914061504, "ts": 1716454226038249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226045959, "dur": 30, "args": { "External id": 286222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286222, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286222, "pid": 5, "tid": 7, "ts": 1716454226045959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038258, "dur": 6, "args": { "External id": 286222, "cbid": 211, "correlation": 286222 } }, { "ph": "s", "id": 286222, "pid": 76337, "tid": -914061504, "ts": 1716454226038258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226045991, "dur": 47, "args": { "External id": 286228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286228, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286228, "pid": 5, "tid": 7, "ts": 1716454226045991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038286, "dur": 8, "args": { "External id": 286228, "cbid": 211, "correlation": 286228 } }, { "ph": "s", "id": 286228, "pid": 76337, "tid": -914061504, "ts": 1716454226038286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226046039, "dur": 57, "args": { "External id": 286248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286248, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286248, "pid": 5, "tid": 7, "ts": 1716454226046039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038360, "dur": 12, "args": { "External id": 286248, "cbid": 211, "correlation": 286248 } }, { "ph": "s", "id": 286248, "pid": 76337, "tid": -914061504, "ts": 1716454226038360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226046097, "dur": 4, "args": { "External id": 286260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286260, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286260, "pid": 5, "tid": 7, "ts": 1716454226046097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038382, "dur": 6, "args": { "External id": 286260, "cbid": 211, "correlation": 286260 } }, { "ph": "s", "id": 286260, "pid": 76337, "tid": -914061504, "ts": 1716454226038382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226046102, "dur": 44, "args": { "External id": 286263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286263, "pid": 5, "tid": 7, "ts": 1716454226046102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038399, "dur": 6, "args": { "External id": 286263, "cbid": 211, "correlation": 286263 } }, { "ph": "s", "id": 286263, "pid": 76337, "tid": -914061504, "ts": 1716454226038399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226046147, "dur": 29, "args": { "External id": 286272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286272, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286272, "pid": 5, "tid": 7, "ts": 1716454226046147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038438, "dur": 10, "args": { "External id": 286272, "cbid": 211, "correlation": 286272 } }, { "ph": "s", "id": 286272, "pid": 76337, "tid": -914061504, "ts": 1716454226038438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226038504, "dur": 0, "args": { "External id": 286282, "cbid": 317, "correlation": 286282 } }, { "ph": "f", "id": 286282, "pid": 76337, "tid": -914061504, "ts": 1716454226038504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226038505, "dur": 0, "args": { "External id": 286283, "cbid": 203, "correlation": 286283 } }, { "ph": "f", "id": 286283, "pid": 76337, "tid": -914061504, "ts": 1716454226038505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226038506, "dur": 0, "args": { "External id": 286284, "cbid": 205, "correlation": 286284 } }, { "ph": "f", "id": 286284, "pid": 76337, "tid": -914061504, "ts": 1716454226038506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226046178, "dur": 32, "args": { "External id": 286288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286288, "pid": 5, "tid": 7, "ts": 1716454226046178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038520, "dur": 12, "args": { "External id": 286288, "cbid": 211, "correlation": 286288 } }, { "ph": "s", "id": 286288, "pid": 76337, "tid": -914061504, "ts": 1716454226038520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226046211, "dur": 28, "args": { "External id": 286290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286290, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286290, "pid": 5, "tid": 7, "ts": 1716454226046211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038534, "dur": 5, "args": { "External id": 286290, "cbid": 211, "correlation": 286290 } }, { "ph": "s", "id": 286290, "pid": 76337, "tid": -914061504, "ts": 1716454226038534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226046241, "dur": 726, "args": { "External id": 286292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286292, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286292, "pid": 5, "tid": 7, "ts": 1716454226046241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038545, "dur": 6, "args": { "External id": 286292, "cbid": 211, "correlation": 286292 } }, { "ph": "s", "id": 286292, "pid": 76337, "tid": -914061504, "ts": 1716454226038545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226046969, "dur": 31, "args": { "External id": 286294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286294, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286294, "pid": 5, "tid": 7, "ts": 1716454226046969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038555, "dur": 5, "args": { "External id": 286294, "cbid": 211, "correlation": 286294 } }, { "ph": "s", "id": 286294, "pid": 76337, "tid": -914061504, "ts": 1716454226038555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226047001, "dur": 47, "args": { "External id": 286300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286300, "pid": 5, "tid": 7, "ts": 1716454226047001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038582, "dur": 8, "args": { "External id": 286300, "cbid": 211, "correlation": 286300 } }, { "ph": "s", "id": 286300, "pid": 76337, "tid": -914061504, "ts": 1716454226038582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226047050, "dur": 121, "args": { "External id": 286308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286308, "pid": 5, "tid": 7, "ts": 1716454226047050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038617, "dur": 9, "args": { "External id": 286308, "cbid": 211, "correlation": 286308 } }, { "ph": "s", "id": 286308, "pid": 76337, "tid": -914061504, "ts": 1716454226038617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226047172, "dur": 28, "args": { "External id": 286316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286316, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286316, "pid": 5, "tid": 7, "ts": 1716454226047172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038645, "dur": 8, "args": { "External id": 286316, "cbid": 211, "correlation": 286316 } }, { "ph": "s", "id": 286316, "pid": 76337, "tid": -914061504, "ts": 1716454226038645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226047201, "dur": 117, "args": { "External id": 286326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286326, "pid": 5, "tid": 7, "ts": 1716454226047201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038722, "dur": 12, "args": { "External id": 286326, "cbid": 211, "correlation": 286326 } }, { "ph": "s", "id": 286326, "pid": 76337, "tid": -914061504, "ts": 1716454226038722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226047320, "dur": 60, "args": { "External id": 286347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286347, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286347, "pid": 5, "tid": 7, "ts": 1716454226047320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038757, "dur": 8, "args": { "External id": 286347, "cbid": 211, "correlation": 286347 } }, { "ph": "s", "id": 286347, "pid": 76337, "tid": -914061504, "ts": 1716454226038757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226047381, "dur": 4, "args": { "External id": 286359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286359, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286359, "pid": 5, "tid": 7, "ts": 1716454226047381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038775, "dur": 6, "args": { "External id": 286359, "cbid": 211, "correlation": 286359 } }, { "ph": "s", "id": 286359, "pid": 76337, "tid": -914061504, "ts": 1716454226038775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226047386, "dur": 43, "args": { "External id": 286362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286362, "pid": 5, "tid": 7, "ts": 1716454226047386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038792, "dur": 6, "args": { "External id": 286362, "cbid": 211, "correlation": 286362 } }, { "ph": "s", "id": 286362, "pid": 76337, "tid": -914061504, "ts": 1716454226038792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226047431, "dur": 30, "args": { "External id": 286371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286371, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286371, "pid": 5, "tid": 7, "ts": 1716454226047431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038831, "dur": 9, "args": { "External id": 286371, "cbid": 211, "correlation": 286371 } }, { "ph": "s", "id": 286371, "pid": 76337, "tid": -914061504, "ts": 1716454226038831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226038889, "dur": 0, "args": { "External id": 286381, "cbid": 317, "correlation": 286381 } }, { "ph": "f", "id": 286381, "pid": 76337, "tid": -914061504, "ts": 1716454226038889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226038889, "dur": 0, "args": { "External id": 286382, "cbid": 203, "correlation": 286382 } }, { "ph": "f", "id": 286382, "pid": 76337, "tid": -914061504, "ts": 1716454226038889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226038890, "dur": 0, "args": { "External id": 286383, "cbid": 205, "correlation": 286383 } }, { "ph": "f", "id": 286383, "pid": 76337, "tid": -914061504, "ts": 1716454226038890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226047462, "dur": 33, "args": { "External id": 286387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286387, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286387, "pid": 5, "tid": 7, "ts": 1716454226047462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038903, "dur": 12, "args": { "External id": 286387, "cbid": 211, "correlation": 286387 } }, { "ph": "s", "id": 286387, "pid": 76337, "tid": -914061504, "ts": 1716454226038903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226047496, "dur": 29, "args": { "External id": 286389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286389, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286389, "pid": 5, "tid": 7, "ts": 1716454226047496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038919, "dur": 5, "args": { "External id": 286389, "cbid": 211, "correlation": 286389 } }, { "ph": "s", "id": 286389, "pid": 76337, "tid": -914061504, "ts": 1716454226038919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226047526, "dur": 728, "args": { "External id": 286391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286391, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286391, "pid": 5, "tid": 7, "ts": 1716454226047526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038930, "dur": 6, "args": { "External id": 286391, "cbid": 211, "correlation": 286391 } }, { "ph": "s", "id": 286391, "pid": 76337, "tid": -914061504, "ts": 1716454226038930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226048256, "dur": 31, "args": { "External id": 286393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286393, "pid": 5, "tid": 7, "ts": 1716454226048256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038939, "dur": 5, "args": { "External id": 286393, "cbid": 211, "correlation": 286393 } }, { "ph": "s", "id": 286393, "pid": 76337, "tid": -914061504, "ts": 1716454226038939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226048288, "dur": 47, "args": { "External id": 286399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286399, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286399, "pid": 5, "tid": 7, "ts": 1716454226048288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226038967, "dur": 18, "args": { "External id": 286399, "cbid": 211, "correlation": 286399 } }, { "ph": "s", "id": 286399, "pid": 76337, "tid": -914061504, "ts": 1716454226038967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226048337, "dur": 58, "args": { "External id": 286419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286419, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286419, "pid": 5, "tid": 7, "ts": 1716454226048337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039048, "dur": 11, "args": { "External id": 286419, "cbid": 211, "correlation": 286419 } }, { "ph": "s", "id": 286419, "pid": 76337, "tid": -914061504, "ts": 1716454226039048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226048396, "dur": 4, "args": { "External id": 286431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286431, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286431, "pid": 5, "tid": 7, "ts": 1716454226048396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039069, "dur": 7, "args": { "External id": 286431, "cbid": 211, "correlation": 286431 } }, { "ph": "s", "id": 286431, "pid": 76337, "tid": -914061504, "ts": 1716454226039069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226048401, "dur": 44, "args": { "External id": 286434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286434, "pid": 5, "tid": 7, "ts": 1716454226048401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039087, "dur": 6, "args": { "External id": 286434, "cbid": 211, "correlation": 286434 } }, { "ph": "s", "id": 286434, "pid": 76337, "tid": -914061504, "ts": 1716454226039087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226048446, "dur": 29, "args": { "External id": 286443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286443, "registers per thread": 24, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286443, "pid": 5, "tid": 7, "ts": 1716454226048446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039126, "dur": 10, "args": { "External id": 286443, "cbid": 211, "correlation": 286443 } }, { "ph": "s", "id": 286443, "pid": 76337, "tid": -914061504, "ts": 1716454226039126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226039191, "dur": 0, "args": { "External id": 286453, "cbid": 317, "correlation": 286453 } }, { "ph": "f", "id": 286453, "pid": 76337, "tid": -914061504, "ts": 1716454226039191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226039192, "dur": 0, "args": { "External id": 286454, "cbid": 203, "correlation": 286454 } }, { "ph": "f", "id": 286454, "pid": 76337, "tid": -914061504, "ts": 1716454226039192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226039193, "dur": 0, "args": { "External id": 286455, "cbid": 205, "correlation": 286455 } }, { "ph": "f", "id": 286455, "pid": 76337, "tid": -914061504, "ts": 1716454226039193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226048477, "dur": 33, "args": { "External id": 286459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286459, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286459, "pid": 5, "tid": 7, "ts": 1716454226048477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039207, "dur": 13, "args": { "External id": 286459, "cbid": 211, "correlation": 286459 } }, { "ph": "s", "id": 286459, "pid": 76337, "tid": -914061504, "ts": 1716454226039207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226048511, "dur": 29, "args": { "External id": 286461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286461, "pid": 5, "tid": 7, "ts": 1716454226048511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039223, "dur": 5, "args": { "External id": 286461, "cbid": 211, "correlation": 286461 } }, { "ph": "s", "id": 286461, "pid": 76337, "tid": -914061504, "ts": 1716454226039223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226048541, "dur": 727, "args": { "External id": 286463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286463, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 2.4, "warps per SM": 19.2, "grid": [2, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286463, "pid": 5, "tid": 7, "ts": 1716454226048541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039233, "dur": 6, "args": { "External id": 286463, "cbid": 211, "correlation": 286463 } }, { "ph": "s", "id": 286463, "pid": 76337, "tid": -914061504, "ts": 1716454226039233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226049270, "dur": 31, "args": { "External id": 286465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286465, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 76.8, "warps per SM": 614.4, "grid": [96, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286465, "pid": 5, "tid": 7, "ts": 1716454226049270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039243, "dur": 5, "args": { "External id": 286465, "cbid": 211, "correlation": 286465 } }, { "ph": "s", "id": 286465, "pid": 76337, "tid": -914061504, "ts": 1716454226039243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226049302, "dur": 47, "args": { "External id": 286471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286471, "pid": 5, "tid": 7, "ts": 1716454226049302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039270, "dur": 8, "args": { "External id": 286471, "cbid": 211, "correlation": 286471 } }, { "ph": "s", "id": 286471, "pid": 76337, "tid": -914061504, "ts": 1716454226039270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226049351, "dur": 123, "args": { "External id": 286479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286479, "pid": 5, "tid": 7, "ts": 1716454226049351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039304, "dur": 9, "args": { "External id": 286479, "cbid": 211, "correlation": 286479 } }, { "ph": "s", "id": 286479, "pid": 76337, "tid": -914061504, "ts": 1716454226039304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226049475, "dur": 27, "args": { "External id": 286487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286487, "registers per thread": 17, "shared memory": 0, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286487, "pid": 5, "tid": 7, "ts": 1716454226049475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039334, "dur": 8, "args": { "External id": 286487, "cbid": 211, "correlation": 286487 } }, { "ph": "s", "id": 286487, "pid": 76337, "tid": -914061504, "ts": 1716454226039334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_nhwc_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float, unsigned long)", "pid": 5, "tid": 7, "ts": 1716454226049509, "dur": 324, "args": { "External id": 286496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286496, "registers per thread": 26, "shared memory": 0, "blocks per SM": 307.2, "warps per SM": 9830.4, "grid": [24576, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286496, "pid": 5, "tid": 7, "ts": 1716454226049509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039419, "dur": 51, "args": { "External id": 286496, "cbid": 211, "correlation": 286496 } }, { "ph": "s", "id": 286496, "pid": 76337, "tid": -914061504, "ts": 1716454226039419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226049834, "dur": 24, "args": { "External id": 286510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286510, "pid": 5, "tid": 7, "ts": 1716454226049834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226039536, "dur": 12, "args": { "External id": 286510, "cbid": 211, "correlation": 286510 } }, { "ph": "s", "id": 286510, "pid": 76337, "tid": -914061504, "ts": 1716454226039536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226039556, "dur": 0, "args": { "External id": 286517, "cbid": 317, "correlation": 286517 } }, { "ph": "f", "id": 286517, "pid": 76337, "tid": -914061504, "ts": 1716454226039556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226039557, "dur": 0, "args": { "External id": 286518, "cbid": 203, "correlation": 286518 } }, { "ph": "f", "id": 286518, "pid": 76337, "tid": -914061504, "ts": 1716454226039557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226039558, "dur": 0, "args": { "External id": 286519, "cbid": 205, "correlation": 286519 } }, { "ph": "f", "id": 286519, "pid": 76337, "tid": -914061504, "ts": 1716454226039558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226049860, "dur": 2553, "args": { "External id": 286523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286523, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286523, "pid": 5, "tid": 7, "ts": 1716454226049860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226042491, "dur": 40, "args": { "External id": 286523, "cbid": 211, "correlation": 286523 } }, { "ph": "s", "id": 286523, "pid": 76337, "tid": -914061504, "ts": 1716454226042491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226052414, "dur": 166, "args": { "External id": 286529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286529, "pid": 5, "tid": 7, "ts": 1716454226052414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043311, "dur": 14, "args": { "External id": 286529, "cbid": 211, "correlation": 286529 } }, { "ph": "s", "id": 286529, "pid": 76337, "tid": -914061504, "ts": 1716454226043311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226052581, "dur": 453, "args": { "External id": 286539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286539, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286539, "pid": 5, "tid": 7, "ts": 1716454226052581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043479, "dur": 15, "args": { "External id": 286539, "cbid": 211, "correlation": 286539 } }, { "ph": "s", "id": 286539, "pid": 76337, "tid": -914061504, "ts": 1716454226043479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226053036, "dur": 203, "args": { "External id": 286560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286560, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286560, "pid": 5, "tid": 7, "ts": 1716454226053036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043523, "dur": 8, "args": { "External id": 286560, "cbid": 211, "correlation": 286560 } }, { "ph": "s", "id": 286560, "pid": 76337, "tid": -914061504, "ts": 1716454226043523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226053240, "dur": 4, "args": { "External id": 286572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286572, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286572, "pid": 5, "tid": 7, "ts": 1716454226053240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043545, "dur": 9, "args": { "External id": 286572, "cbid": 211, "correlation": 286572 } }, { "ph": "s", "id": 286572, "pid": 76337, "tid": -914061504, "ts": 1716454226043545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226053245, "dur": 164, "args": { "External id": 286575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286575, "pid": 5, "tid": 7, "ts": 1716454226053245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043577, "dur": 8, "args": { "External id": 286575, "cbid": 211, "correlation": 286575 } }, { "ph": "s", "id": 286575, "pid": 76337, "tid": -914061504, "ts": 1716454226043577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226053411, "dur": 107, "args": { "External id": 286584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286584, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286584, "pid": 5, "tid": 7, "ts": 1716454226053411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226043631, "dur": 10, "args": { "External id": 286584, "cbid": 211, "correlation": 286584 } }, { "ph": "s", "id": 286584, "pid": 76337, "tid": -914061504, "ts": 1716454226043631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226043705, "dur": 0, "args": { "External id": 286594, "cbid": 317, "correlation": 286594 } }, { "ph": "f", "id": 286594, "pid": 76337, "tid": -914061504, "ts": 1716454226043705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226043706, "dur": 1, "args": { "External id": 286595, "cbid": 203, "correlation": 286595 } }, { "ph": "f", "id": 286595, "pid": 76337, "tid": -914061504, "ts": 1716454226043706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226043708, "dur": 0, "args": { "External id": 286596, "cbid": 205, "correlation": 286596 } }, { "ph": "f", "id": 286596, "pid": 76337, "tid": -914061504, "ts": 1716454226043708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226053519, "dur": 117, "args": { "External id": 286600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286600, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286600, "pid": 5, "tid": 7, "ts": 1716454226053519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226046332, "dur": 26, "args": { "External id": 286600, "cbid": 211, "correlation": 286600 } }, { "ph": "s", "id": 286600, "pid": 76337, "tid": -914061504, "ts": 1716454226046332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226053638, "dur": 29, "args": { "External id": 286602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286602, "pid": 5, "tid": 7, "ts": 1716454226053638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226046362, "dur": 6, "args": { "External id": 286602, "cbid": 211, "correlation": 286602 } }, { "ph": "s", "id": 286602, "pid": 76337, "tid": -914061504, "ts": 1716454226046362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226053668, "dur": 2533, "args": { "External id": 286604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286604, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286604, "pid": 5, "tid": 7, "ts": 1716454226053668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226046377, "dur": 9, "args": { "External id": 286604, "cbid": 211, "correlation": 286604 } }, { "ph": "s", "id": 286604, "pid": 76337, "tid": -914061504, "ts": 1716454226046377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226056202, "dur": 135, "args": { "External id": 286606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286606, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286606, "pid": 5, "tid": 7, "ts": 1716454226056202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226046392, "dur": 8, "args": { "External id": 286606, "cbid": 211, "correlation": 286606 } }, { "ph": "s", "id": 286606, "pid": 76337, "tid": -914061504, "ts": 1716454226046392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226056338, "dur": 175, "args": { "External id": 286612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286612, "pid": 5, "tid": 7, "ts": 1716454226056338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047128, "dur": 14, "args": { "External id": 286612, "cbid": 211, "correlation": 286612 } }, { "ph": "s", "id": 286612, "pid": 76337, "tid": -914061504, "ts": 1716454226047128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226056514, "dur": 208, "args": { "External id": 286632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286632, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286632, "pid": 5, "tid": 7, "ts": 1716454226056514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047256, "dur": 13, "args": { "External id": 286632, "cbid": 211, "correlation": 286632 } }, { "ph": "s", "id": 286632, "pid": 76337, "tid": -914061504, "ts": 1716454226047256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226056723, "dur": 4, "args": { "External id": 286644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286644, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286644, "pid": 5, "tid": 7, "ts": 1716454226056723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047280, "dur": 8, "args": { "External id": 286644, "cbid": 211, "correlation": 286644 } }, { "ph": "s", "id": 286644, "pid": 76337, "tid": -914061504, "ts": 1716454226047280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226056728, "dur": 163, "args": { "External id": 286647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286647, "pid": 5, "tid": 7, "ts": 1716454226056728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047309, "dur": 7, "args": { "External id": 286647, "cbid": 211, "correlation": 286647 } }, { "ph": "s", "id": 286647, "pid": 76337, "tid": -914061504, "ts": 1716454226047309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226056892, "dur": 107, "args": { "External id": 286656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286656, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286656, "pid": 5, "tid": 7, "ts": 1716454226056892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047359, "dur": 10, "args": { "External id": 286656, "cbid": 211, "correlation": 286656 } }, { "ph": "s", "id": 286656, "pid": 76337, "tid": -914061504, "ts": 1716454226047359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226047450, "dur": 0, "args": { "External id": 286666, "cbid": 317, "correlation": 286666 } }, { "ph": "f", "id": 286666, "pid": 76337, "tid": -914061504, "ts": 1716454226047450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226047451, "dur": 0, "args": { "External id": 286667, "cbid": 203, "correlation": 286667 } }, { "ph": "f", "id": 286667, "pid": 76337, "tid": -914061504, "ts": 1716454226047451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226047452, "dur": 0, "args": { "External id": 286668, "cbid": 205, "correlation": 286668 } }, { "ph": "f", "id": 286668, "pid": 76337, "tid": -914061504, "ts": 1716454226047452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226057000, "dur": 117, "args": { "External id": 286672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286672, "pid": 5, "tid": 7, "ts": 1716454226057000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047466, "dur": 13, "args": { "External id": 286672, "cbid": 211, "correlation": 286672 } }, { "ph": "s", "id": 286672, "pid": 76337, "tid": -914061504, "ts": 1716454226047466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226057119, "dur": 29, "args": { "External id": 286674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286674, "pid": 5, "tid": 7, "ts": 1716454226057119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047482, "dur": 6, "args": { "External id": 286674, "cbid": 211, "correlation": 286674 } }, { "ph": "s", "id": 286674, "pid": 76337, "tid": -914061504, "ts": 1716454226047482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226057149, "dur": 2538, "args": { "External id": 286676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286676, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286676, "pid": 5, "tid": 7, "ts": 1716454226057149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047493, "dur": 6, "args": { "External id": 286676, "cbid": 211, "correlation": 286676 } }, { "ph": "s", "id": 286676, "pid": 76337, "tid": -914061504, "ts": 1716454226047493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226059689, "dur": 144, "args": { "External id": 286678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286678, "pid": 5, "tid": 7, "ts": 1716454226059689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047503, "dur": 5, "args": { "External id": 286678, "cbid": 211, "correlation": 286678 } }, { "ph": "s", "id": 286678, "pid": 76337, "tid": -914061504, "ts": 1716454226047503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226059834, "dur": 176, "args": { "External id": 286684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286684, "pid": 5, "tid": 7, "ts": 1716454226059834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047532, "dur": 9, "args": { "External id": 286684, "cbid": 211, "correlation": 286684 } }, { "ph": "s", "id": 286684, "pid": 76337, "tid": -914061504, "ts": 1716454226047532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226060011, "dur": 474, "args": { "External id": 286692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286692, "pid": 5, "tid": 7, "ts": 1716454226060011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047576, "dur": 9, "args": { "External id": 286692, "cbid": 211, "correlation": 286692 } }, { "ph": "s", "id": 286692, "pid": 76337, "tid": -914061504, "ts": 1716454226047576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226060486, "dur": 103, "args": { "External id": 286700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286700, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286700, "pid": 5, "tid": 7, "ts": 1716454226060486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047621, "dur": 12, "args": { "External id": 286700, "cbid": 211, "correlation": 286700 } }, { "ph": "s", "id": 286700, "pid": 76337, "tid": -914061504, "ts": 1716454226047621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226060591, "dur": 454, "args": { "External id": 286710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286710, "pid": 5, "tid": 7, "ts": 1716454226060591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047710, "dur": 14, "args": { "External id": 286710, "cbid": 211, "correlation": 286710 } }, { "ph": "s", "id": 286710, "pid": 76337, "tid": -914061504, "ts": 1716454226047710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226061046, "dur": 204, "args": { "External id": 286731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286731, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286731, "pid": 5, "tid": 7, "ts": 1716454226061046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047746, "dur": 7, "args": { "External id": 286731, "cbid": 211, "correlation": 286731 } }, { "ph": "s", "id": 286731, "pid": 76337, "tid": -914061504, "ts": 1716454226047746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226061251, "dur": 4, "args": { "External id": 286743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286743, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286743, "pid": 5, "tid": 7, "ts": 1716454226061251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047765, "dur": 6, "args": { "External id": 286743, "cbid": 211, "correlation": 286743 } }, { "ph": "s", "id": 286743, "pid": 76337, "tid": -914061504, "ts": 1716454226047765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226061256, "dur": 163, "args": { "External id": 286746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286746, "pid": 5, "tid": 7, "ts": 1716454226061256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047783, "dur": 7, "args": { "External id": 286746, "cbid": 211, "correlation": 286746 } }, { "ph": "s", "id": 286746, "pid": 76337, "tid": -914061504, "ts": 1716454226047783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226061420, "dur": 106, "args": { "External id": 286755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286755, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286755, "pid": 5, "tid": 7, "ts": 1716454226061420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047823, "dur": 10, "args": { "External id": 286755, "cbid": 211, "correlation": 286755 } }, { "ph": "s", "id": 286755, "pid": 76337, "tid": -914061504, "ts": 1716454226047823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226047879, "dur": 0, "args": { "External id": 286765, "cbid": 317, "correlation": 286765 } }, { "ph": "f", "id": 286765, "pid": 76337, "tid": -914061504, "ts": 1716454226047879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226047880, "dur": 0, "args": { "External id": 286766, "cbid": 203, "correlation": 286766 } }, { "ph": "f", "id": 286766, "pid": 76337, "tid": -914061504, "ts": 1716454226047880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226047881, "dur": 0, "args": { "External id": 286767, "cbid": 205, "correlation": 286767 } }, { "ph": "f", "id": 286767, "pid": 76337, "tid": -914061504, "ts": 1716454226047881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226061528, "dur": 116, "args": { "External id": 286771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286771, "pid": 5, "tid": 7, "ts": 1716454226061528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047900, "dur": 12, "args": { "External id": 286771, "cbid": 211, "correlation": 286771 } }, { "ph": "s", "id": 286771, "pid": 76337, "tid": -914061504, "ts": 1716454226047900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226061645, "dur": 28, "args": { "External id": 286773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286773, "pid": 5, "tid": 7, "ts": 1716454226061645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047914, "dur": 5, "args": { "External id": 286773, "cbid": 211, "correlation": 286773 } }, { "ph": "s", "id": 286773, "pid": 76337, "tid": -914061504, "ts": 1716454226047914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226061674, "dur": 2528, "args": { "External id": 286775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286775, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286775, "pid": 5, "tid": 7, "ts": 1716454226061674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047925, "dur": 6, "args": { "External id": 286775, "cbid": 211, "correlation": 286775 } }, { "ph": "s", "id": 286775, "pid": 76337, "tid": -914061504, "ts": 1716454226047925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226064203, "dur": 135, "args": { "External id": 286777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286777, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286777, "pid": 5, "tid": 7, "ts": 1716454226064203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047935, "dur": 6, "args": { "External id": 286777, "cbid": 211, "correlation": 286777 } }, { "ph": "s", "id": 286777, "pid": 76337, "tid": -914061504, "ts": 1716454226047935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226064340, "dur": 175, "args": { "External id": 286783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286783, "pid": 5, "tid": 7, "ts": 1716454226064340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226047963, "dur": 8, "args": { "External id": 286783, "cbid": 211, "correlation": 286783 } }, { "ph": "s", "id": 286783, "pid": 76337, "tid": -914061504, "ts": 1716454226047963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226064516, "dur": 209, "args": { "External id": 286803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286803, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286803, "pid": 5, "tid": 7, "ts": 1716454226064516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048051, "dur": 12, "args": { "External id": 286803, "cbid": 211, "correlation": 286803 } }, { "ph": "s", "id": 286803, "pid": 76337, "tid": -914061504, "ts": 1716454226048051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226064727, "dur": 4, "args": { "External id": 286815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286815, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286815, "pid": 5, "tid": 7, "ts": 1716454226064727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048072, "dur": 6, "args": { "External id": 286815, "cbid": 211, "correlation": 286815 } }, { "ph": "s", "id": 286815, "pid": 76337, "tid": -914061504, "ts": 1716454226048072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226064731, "dur": 163, "args": { "External id": 286818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286818, "pid": 5, "tid": 7, "ts": 1716454226064731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048091, "dur": 6, "args": { "External id": 286818, "cbid": 211, "correlation": 286818 } }, { "ph": "s", "id": 286818, "pid": 76337, "tid": -914061504, "ts": 1716454226048091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226064895, "dur": 106, "args": { "External id": 286827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286827, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286827, "pid": 5, "tid": 7, "ts": 1716454226064895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048131, "dur": 11, "args": { "External id": 286827, "cbid": 211, "correlation": 286827 } }, { "ph": "s", "id": 286827, "pid": 76337, "tid": -914061504, "ts": 1716454226048131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226048199, "dur": 0, "args": { "External id": 286837, "cbid": 317, "correlation": 286837 } }, { "ph": "f", "id": 286837, "pid": 76337, "tid": -914061504, "ts": 1716454226048199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226048200, "dur": 0, "args": { "External id": 286838, "cbid": 203, "correlation": 286838 } }, { "ph": "f", "id": 286838, "pid": 76337, "tid": -914061504, "ts": 1716454226048200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226048201, "dur": 0, "args": { "External id": 286839, "cbid": 205, "correlation": 286839 } }, { "ph": "f", "id": 286839, "pid": 76337, "tid": -914061504, "ts": 1716454226048201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226065003, "dur": 118, "args": { "External id": 286843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286843, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286843, "pid": 5, "tid": 7, "ts": 1716454226065003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048214, "dur": 12, "args": { "External id": 286843, "cbid": 211, "correlation": 286843 } }, { "ph": "s", "id": 286843, "pid": 76337, "tid": -914061504, "ts": 1716454226048214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226065122, "dur": 28, "args": { "External id": 286845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286845, "pid": 5, "tid": 7, "ts": 1716454226065122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048228, "dur": 5, "args": { "External id": 286845, "cbid": 211, "correlation": 286845 } }, { "ph": "s", "id": 286845, "pid": 76337, "tid": -914061504, "ts": 1716454226048228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226065151, "dur": 2537, "args": { "External id": 286847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286847, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286847, "pid": 5, "tid": 7, "ts": 1716454226065151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048239, "dur": 6, "args": { "External id": 286847, "cbid": 211, "correlation": 286847 } }, { "ph": "s", "id": 286847, "pid": 76337, "tid": -914061504, "ts": 1716454226048239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226067690, "dur": 132, "args": { "External id": 286849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286849, "pid": 5, "tid": 7, "ts": 1716454226067690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048248, "dur": 5, "args": { "External id": 286849, "cbid": 211, "correlation": 286849 } }, { "ph": "s", "id": 286849, "pid": 76337, "tid": -914061504, "ts": 1716454226048248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226067823, "dur": 175, "args": { "External id": 286855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286855, "pid": 5, "tid": 7, "ts": 1716454226067823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048276, "dur": 8, "args": { "External id": 286855, "cbid": 211, "correlation": 286855 } }, { "ph": "s", "id": 286855, "pid": 76337, "tid": -914061504, "ts": 1716454226048276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226067999, "dur": 473, "args": { "External id": 286863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286863, "pid": 5, "tid": 7, "ts": 1716454226067999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048311, "dur": 9, "args": { "External id": 286863, "cbid": 211, "correlation": 286863 } }, { "ph": "s", "id": 286863, "pid": 76337, "tid": -914061504, "ts": 1716454226048311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226068474, "dur": 102, "args": { "External id": 286871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286871, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286871, "pid": 5, "tid": 7, "ts": 1716454226068474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048341, "dur": 9, "args": { "External id": 286871, "cbid": 211, "correlation": 286871 } }, { "ph": "s", "id": 286871, "pid": 76337, "tid": -914061504, "ts": 1716454226048341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226068577, "dur": 455, "args": { "External id": 286881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286881, "pid": 5, "tid": 7, "ts": 1716454226068577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048419, "dur": 14, "args": { "External id": 286881, "cbid": 211, "correlation": 286881 } }, { "ph": "s", "id": 286881, "pid": 76337, "tid": -914061504, "ts": 1716454226048419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226069033, "dur": 202, "args": { "External id": 286902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286902, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286902, "pid": 5, "tid": 7, "ts": 1716454226069033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048458, "dur": 8, "args": { "External id": 286902, "cbid": 211, "correlation": 286902 } }, { "ph": "s", "id": 286902, "pid": 76337, "tid": -914061504, "ts": 1716454226048458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226069236, "dur": 4, "args": { "External id": 286914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286914, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286914, "pid": 5, "tid": 7, "ts": 1716454226069236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048475, "dur": 6, "args": { "External id": 286914, "cbid": 211, "correlation": 286914 } }, { "ph": "s", "id": 286914, "pid": 76337, "tid": -914061504, "ts": 1716454226048475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226069241, "dur": 164, "args": { "External id": 286917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286917, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286917, "pid": 5, "tid": 7, "ts": 1716454226069241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048492, "dur": 7, "args": { "External id": 286917, "cbid": 211, "correlation": 286917 } }, { "ph": "s", "id": 286917, "pid": 76337, "tid": -914061504, "ts": 1716454226048492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226069407, "dur": 107, "args": { "External id": 286926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286926, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286926, "pid": 5, "tid": 7, "ts": 1716454226069407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048532, "dur": 11, "args": { "External id": 286926, "cbid": 211, "correlation": 286926 } }, { "ph": "s", "id": 286926, "pid": 76337, "tid": -914061504, "ts": 1716454226048532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226048590, "dur": 0, "args": { "External id": 286936, "cbid": 317, "correlation": 286936 } }, { "ph": "f", "id": 286936, "pid": 76337, "tid": -914061504, "ts": 1716454226048590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226048591, "dur": 0, "args": { "External id": 286937, "cbid": 203, "correlation": 286937 } }, { "ph": "f", "id": 286937, "pid": 76337, "tid": -914061504, "ts": 1716454226048591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226048592, "dur": 0, "args": { "External id": 286938, "cbid": 205, "correlation": 286938 } }, { "ph": "f", "id": 286938, "pid": 76337, "tid": -914061504, "ts": 1716454226048592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226069515, "dur": 119, "args": { "External id": 286942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286942, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286942, "pid": 5, "tid": 7, "ts": 1716454226069515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048606, "dur": 12, "args": { "External id": 286942, "cbid": 211, "correlation": 286942 } }, { "ph": "s", "id": 286942, "pid": 76337, "tid": -914061504, "ts": 1716454226048606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226069636, "dur": 29, "args": { "External id": 286944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286944, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286944, "pid": 5, "tid": 7, "ts": 1716454226069636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048620, "dur": 5, "args": { "External id": 286944, "cbid": 211, "correlation": 286944 } }, { "ph": "s", "id": 286944, "pid": 76337, "tid": -914061504, "ts": 1716454226048620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226069666, "dur": 2532, "args": { "External id": 286946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286946, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 286946, "pid": 5, "tid": 7, "ts": 1716454226069666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048631, "dur": 6, "args": { "External id": 286946, "cbid": 211, "correlation": 286946 } }, { "ph": "s", "id": 286946, "pid": 76337, "tid": -914061504, "ts": 1716454226048631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226072200, "dur": 132, "args": { "External id": 286948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286948, "pid": 5, "tid": 7, "ts": 1716454226072200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048640, "dur": 6, "args": { "External id": 286948, "cbid": 211, "correlation": 286948 } }, { "ph": "s", "id": 286948, "pid": 76337, "tid": -914061504, "ts": 1716454226048640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226072333, "dur": 175, "args": { "External id": 286954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286954, "pid": 5, "tid": 7, "ts": 1716454226072333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048669, "dur": 8, "args": { "External id": 286954, "cbid": 211, "correlation": 286954 } }, { "ph": "s", "id": 286954, "pid": 76337, "tid": -914061504, "ts": 1716454226048669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226072510, "dur": 207, "args": { "External id": 286974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286974, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 286974, "pid": 5, "tid": 7, "ts": 1716454226072510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048740, "dur": 12, "args": { "External id": 286974, "cbid": 211, "correlation": 286974 } }, { "ph": "s", "id": 286974, "pid": 76337, "tid": -914061504, "ts": 1716454226048740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226072718, "dur": 4, "args": { "External id": 286986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286986, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 286986, "pid": 5, "tid": 7, "ts": 1716454226072718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048762, "dur": 6, "args": { "External id": 286986, "cbid": 211, "correlation": 286986 } }, { "ph": "s", "id": 286986, "pid": 76337, "tid": -914061504, "ts": 1716454226048762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226072723, "dur": 163, "args": { "External id": 286989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286989, "pid": 5, "tid": 7, "ts": 1716454226072723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048779, "dur": 6, "args": { "External id": 286989, "cbid": 211, "correlation": 286989 } }, { "ph": "s", "id": 286989, "pid": 76337, "tid": -914061504, "ts": 1716454226048779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226072888, "dur": 106, "args": { "External id": 286998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 286998, "registers per thread": 24, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 286998, "pid": 5, "tid": 7, "ts": 1716454226072888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048819, "dur": 10, "args": { "External id": 286998, "cbid": 211, "correlation": 286998 } }, { "ph": "s", "id": 286998, "pid": 76337, "tid": -914061504, "ts": 1716454226048819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226048884, "dur": 0, "args": { "External id": 287008, "cbid": 317, "correlation": 287008 } }, { "ph": "f", "id": 287008, "pid": 76337, "tid": -914061504, "ts": 1716454226048884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226048885, "dur": 0, "args": { "External id": 287009, "cbid": 203, "correlation": 287009 } }, { "ph": "f", "id": 287009, "pid": 76337, "tid": -914061504, "ts": 1716454226048885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226048885, "dur": 0, "args": { "External id": 287010, "cbid": 205, "correlation": 287010 } }, { "ph": "f", "id": 287010, "pid": 76337, "tid": -914061504, "ts": 1716454226048885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226072995, "dur": 117, "args": { "External id": 287014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287014, "pid": 5, "tid": 7, "ts": 1716454226072995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048898, "dur": 13, "args": { "External id": 287014, "cbid": 211, "correlation": 287014 } }, { "ph": "s", "id": 287014, "pid": 76337, "tid": -914061504, "ts": 1716454226048898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226073114, "dur": 28, "args": { "External id": 287016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 102.4, "warps per SM": 819.2, "grid": [1, 16, 512], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287016, "pid": 5, "tid": 7, "ts": 1716454226073114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048914, "dur": 5, "args": { "External id": 287016, "cbid": 211, "correlation": 287016 } }, { "ph": "s", "id": 287016, "pid": 76337, "tid": -914061504, "ts": 1716454226048914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226073143, "dur": 2537, "args": { "External id": 287018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287018, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [4, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287018, "pid": 5, "tid": 7, "ts": 1716454226073143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048925, "dur": 6, "args": { "External id": 287018, "cbid": 211, "correlation": 287018 } }, { "ph": "s", "id": 287018, "pid": 76337, "tid": -914061504, "ts": 1716454226048925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226075682, "dur": 131, "args": { "External id": 287020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [384, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287020, "pid": 5, "tid": 7, "ts": 1716454226075682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048934, "dur": 5, "args": { "External id": 287020, "cbid": 211, "correlation": 287020 } }, { "ph": "s", "id": 287020, "pid": 76337, "tid": -914061504, "ts": 1716454226048934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226075814, "dur": 174, "args": { "External id": 287026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287026, "pid": 5, "tid": 7, "ts": 1716454226075814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226048961, "dur": 8, "args": { "External id": 287026, "cbid": 211, "correlation": 287026 } }, { "ph": "s", "id": 287026, "pid": 76337, "tid": -914061504, "ts": 1716454226048961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226075990, "dur": 473, "args": { "External id": 287034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287034, "pid": 5, "tid": 7, "ts": 1716454226075990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226049005, "dur": 10, "args": { "External id": 287034, "cbid": 211, "correlation": 287034 } }, { "ph": "s", "id": 287034, "pid": 76337, "tid": -914061504, "ts": 1716454226049005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226076464, "dur": 103, "args": { "External id": 287042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287042, "registers per thread": 17, "shared memory": 0, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287042, "pid": 5, "tid": 7, "ts": 1716454226076464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226049036, "dur": 8, "args": { "External id": 287042, "cbid": 211, "correlation": 287042 } }, { "ph": "s", "id": 287042, "pid": 76337, "tid": -914061504, "ts": 1716454226049036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226049126, "dur": 1, "args": { "External id": 287050, "cbid": 317, "correlation": 287050 } }, { "ph": "f", "id": 287050, "pid": 76337, "tid": -914061504, "ts": 1716454226049126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226049128, "dur": 114586, "args": { "External id": 287051, "cbid": 20, "correlation": 287051 } }, { "ph": "f", "id": 287051, "pid": 76337, "tid": -914061504, "ts": 1716454226049128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_nhwc_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float, unsigned long)", "pid": 5, "tid": 7, "ts": 1716454226163767, "dur": 1285, "args": { "External id": 287055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287055, "registers per thread": 26, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 39321.6, "grid": [98304, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287055, "pid": 5, "tid": 7, "ts": 1716454226163767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226163739, "dur": 27, "args": { "External id": 287055, "cbid": 211, "correlation": 287055 } }, { "ph": "s", "id": 287055, "pid": 76337, "tid": -914061504, "ts": 1716454226163739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226163851, "dur": 1, "args": { "External id": 287063, "cbid": 317, "correlation": 287063 } }, { "ph": "f", "id": 287063, "pid": 76337, "tid": -914061504, "ts": 1716454226163851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226163853, "dur": 2693, "args": { "External id": 287064, "cbid": 20, "correlation": 287064 } }, { "ph": "f", "id": 287064, "pid": 76337, "tid": -914061504, "ts": 1716454226163853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226166614, "dur": 24, "args": { "External id": 287073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287073, "pid": 5, "tid": 7, "ts": 1716454226166614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226166596, "dur": 19, "args": { "External id": 287073, "cbid": 211, "correlation": 287073 } }, { "ph": "s", "id": 287073, "pid": 76337, "tid": -914061504, "ts": 1716454226166596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226166628, "dur": 0, "args": { "External id": 287080, "cbid": 317, "correlation": 287080 } }, { "ph": "f", "id": 287080, "pid": 76337, "tid": -914061504, "ts": 1716454226166628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226166629, "dur": 1, "args": { "External id": 287081, "cbid": 203, "correlation": 287081 } }, { "ph": "f", "id": 287081, "pid": 76337, "tid": -914061504, "ts": 1716454226166629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226166631, "dur": 0, "args": { "External id": 287082, "cbid": 205, "correlation": 287082 } }, { "ph": "f", "id": 287082, "pid": 76337, "tid": -914061504, "ts": 1716454226166631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize256x128x32_stage1_warpsize4x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226169243, "dur": 10145, "args": { "External id": 287086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287086, "registers per thread": 224, "shared memory": 34816, "blocks per SM": 38.4, "warps per SM": 307.2, "grid": [4, 768, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287086, "pid": 5, "tid": 7, "ts": 1716454226169243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226168999, "dur": 247, "args": { "External id": 287086, "cbid": 211, "correlation": 287086 } }, { "ph": "s", "id": 287086, "pid": 76337, "tid": -914061504, "ts": 1716454226168999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226179390, "dur": 642, "args": { "External id": 287092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287092, "pid": 5, "tid": 7, "ts": 1716454226179390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226169899, "dur": 15, "args": { "External id": 287092, "cbid": 211, "correlation": 287092 } }, { "ph": "s", "id": 287092, "pid": 76337, "tid": -914061504, "ts": 1716454226169899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226180033, "dur": 2840, "args": { "External id": 287102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287102, "pid": 5, "tid": 7, "ts": 1716454226180033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226170099, "dur": 16, "args": { "External id": 287102, "cbid": 211, "correlation": 287102 } }, { "ph": "s", "id": 287102, "pid": 76337, "tid": -914061504, "ts": 1716454226170099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226170136, "dur": 1, "args": { "External id": 287112, "cbid": 317, "correlation": 287112 } }, { "ph": "f", "id": 287112, "pid": 76337, "tid": -914061504, "ts": 1716454226170136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226170137, "dur": 862, "args": { "External id": 287113, "cbid": 20, "correlation": 287113 } }, { "ph": "f", "id": 287113, "pid": 76337, "tid": -914061504, "ts": 1716454226170137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226182874, "dur": 808, "args": { "External id": 287127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287127, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287127, "pid": 5, "tid": 7, "ts": 1716454226182874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226171024, "dur": 18, "args": { "External id": 287127, "cbid": 211, "correlation": 287127 } }, { "ph": "s", "id": 287127, "pid": 76337, "tid": -914061504, "ts": 1716454226171024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226183683, "dur": 5, "args": { "External id": 287139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287139, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.1, "warps per SM": 0.8, "grid": [8, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287139, "pid": 5, "tid": 7, "ts": 1716454226183683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226171055, "dur": 9, "args": { "External id": 287139, "cbid": 211, "correlation": 287139 } }, { "ph": "s", "id": 287139, "pid": 76337, "tid": -914061504, "ts": 1716454226171055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226183689, "dur": 641, "args": { "External id": 287142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287142, "pid": 5, "tid": 7, "ts": 1716454226183689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226171084, "dur": 9, "args": { "External id": 287142, "cbid": 211, "correlation": 287142 } }, { "ph": "s", "id": 287142, "pid": 76337, "tid": -914061504, "ts": 1716454226171084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226184332, "dur": 414, "args": { "External id": 287151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287151, "pid": 5, "tid": 7, "ts": 1716454226184332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226171152, "dur": 14, "args": { "External id": 287151, "cbid": 211, "correlation": 287151 } }, { "ph": "s", "id": 287151, "pid": 76337, "tid": -914061504, "ts": 1716454226171152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226171236, "dur": 0, "args": { "External id": 287161, "cbid": 317, "correlation": 287161 } }, { "ph": "f", "id": 287161, "pid": 76337, "tid": -914061504, "ts": 1716454226171236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226171237, "dur": 1, "args": { "External id": 287162, "cbid": 203, "correlation": 287162 } }, { "ph": "f", "id": 287162, "pid": 76337, "tid": -914061504, "ts": 1716454226171237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226171239, "dur": 0, "args": { "External id": 287163, "cbid": 205, "correlation": 287163 } }, { "ph": "f", "id": 287163, "pid": 76337, "tid": -914061504, "ts": 1716454226171239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226173434, "dur": 3, "args": { "External id": 287169, "cbid": 317, "correlation": 287169 } }, { "ph": "f", "id": 287169, "pid": 76337, "tid": -914061504, "ts": 1716454226173434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226173438, "dur": 8481, "args": { "External id": 287170, "cbid": 20, "correlation": 287170 } }, { "ph": "f", "id": 287170, "pid": 76337, "tid": -914061504, "ts": 1716454226173438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226184747, "dur": 1080, "args": { "External id": 287171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287171, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [1536, 16, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287171, "pid": 5, "tid": 7, "ts": 1716454226184747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226181947, "dur": 36, "args": { "External id": 287171, "cbid": 211, "correlation": 287171 } }, { "ph": "s", "id": 287171, "pid": 76337, "tid": -914061504, "ts": 1716454226181947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226185829, "dur": 17, "args": { "External id": 287173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 51.2, "warps per SM": 409.6, "grid": [1, 16, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287173, "pid": 5, "tid": 7, "ts": 1716454226185829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226181986, "dur": 6, "args": { "External id": 287173, "cbid": 211, "correlation": 287173 } }, { "ph": "s", "id": 287173, "pid": 76337, "tid": -914061504, "ts": 1716454226181986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226185847, "dur": 4885, "args": { "External id": 287175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287175, "registers per thread": 254, "shared memory": 33792, "blocks per SM": 19.2, "warps per SM": 153.6, "grid": [1, 1536, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287175, "pid": 5, "tid": 7, "ts": 1716454226185847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182006, "dur": 11, "args": { "External id": 287175, "cbid": 211, "correlation": 287175 } }, { "ph": "s", "id": 287175, "pid": 76337, "tid": -914061504, "ts": 1716454226182006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226190734, "dur": 207, "args": { "External id": 287177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287177, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287177, "pid": 5, "tid": 7, "ts": 1716454226190734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182024, "dur": 7, "args": { "External id": 287177, "cbid": 211, "correlation": 287177 } }, { "ph": "s", "id": 287177, "pid": 76337, "tid": -914061504, "ts": 1716454226182024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226190942, "dur": 340, "args": { "External id": 287183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287183, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287183, "pid": 5, "tid": 7, "ts": 1716454226190942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182653, "dur": 14, "args": { "External id": 287183, "cbid": 211, "correlation": 287183 } }, { "ph": "s", "id": 287183, "pid": 76337, "tid": -914061504, "ts": 1716454226182653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226191284, "dur": 401, "args": { "External id": 287203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287203, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287203, "pid": 5, "tid": 7, "ts": 1716454226191284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182791, "dur": 14, "args": { "External id": 287203, "cbid": 211, "correlation": 287203 } }, { "ph": "s", "id": 287203, "pid": 76337, "tid": -914061504, "ts": 1716454226182791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226191685, "dur": 4, "args": { "External id": 287215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287215, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287215, "pid": 5, "tid": 7, "ts": 1716454226191685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182820, "dur": 8, "args": { "External id": 287215, "cbid": 211, "correlation": 287215 } }, { "ph": "s", "id": 287215, "pid": 76337, "tid": -914061504, "ts": 1716454226182820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226191691, "dur": 323, "args": { "External id": 287218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287218, "pid": 5, "tid": 7, "ts": 1716454226191691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182849, "dur": 9, "args": { "External id": 287218, "cbid": 211, "correlation": 287218 } }, { "ph": "s", "id": 287218, "pid": 76337, "tid": -914061504, "ts": 1716454226182849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226192015, "dur": 210, "args": { "External id": 287227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287227, "registers per thread": 24, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287227, "pid": 5, "tid": 7, "ts": 1716454226192015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226182905, "dur": 11, "args": { "External id": 287227, "cbid": 211, "correlation": 287227 } }, { "ph": "s", "id": 287227, "pid": 76337, "tid": -914061504, "ts": 1716454226182905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226183010, "dur": 0, "args": { "External id": 287237, "cbid": 317, "correlation": 287237 } }, { "ph": "f", "id": 287237, "pid": 76337, "tid": -914061504, "ts": 1716454226183010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226183011, "dur": 1, "args": { "External id": 287238, "cbid": 203, "correlation": 287238 } }, { "ph": "f", "id": 287238, "pid": 76337, "tid": -914061504, "ts": 1716454226183011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226183012, "dur": 0, "args": { "External id": 287239, "cbid": 205, "correlation": 287239 } }, { "ph": "f", "id": 287239, "pid": 76337, "tid": -914061504, "ts": 1716454226183012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226192227, "dur": 369, "args": { "External id": 287243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287243, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287243, "pid": 5, "tid": 7, "ts": 1716454226192227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226185332, "dur": 23, "args": { "External id": 287243, "cbid": 211, "correlation": 287243 } }, { "ph": "s", "id": 287243, "pid": 76337, "tid": -914061504, "ts": 1716454226185332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226192597, "dur": 10, "args": { "External id": 287245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287245, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287245, "pid": 5, "tid": 7, "ts": 1716454226192597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226185358, "dur": 6, "args": { "External id": 287245, "cbid": 211, "correlation": 287245 } }, { "ph": "s", "id": 287245, "pid": 76337, "tid": -914061504, "ts": 1716454226185358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226192609, "dur": 4, "args": { "External id": 287247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 287247, "pid": 5, "tid": 7, "ts": 1716454226192609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226185371, "dur": 12, "args": { "External id": 287247, "cbid": 211, "correlation": 287247 } }, { "ph": "s", "id": 287247, "pid": 76337, "tid": -914061504, "ts": 1716454226185371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226185386, "dur": 0, "args": { "External id": 287248, "cbid": 51, "correlation": 287248 } }, { "ph": "s", "id": 287248, "pid": 76337, "tid": -914061504, "ts": 1716454226185386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226192614, "dur": 2669, "args": { "External id": 287249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287249, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [1536, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287249, "pid": 5, "tid": 7, "ts": 1716454226192614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226185388, "dur": 10, "args": { "External id": 287249, "cbid": 211, "correlation": 287249 } }, { "ph": "s", "id": 287249, "pid": 76337, "tid": -914061504, "ts": 1716454226185388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226195285, "dur": 338, "args": { "External id": 287254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287254, "pid": 5, "tid": 7, "ts": 1716454226195285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226186041, "dur": 13, "args": { "External id": 287254, "cbid": 211, "correlation": 287254 } }, { "ph": "s", "id": 287254, "pid": 76337, "tid": -914061504, "ts": 1716454226186041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226186139, "dur": 0, "args": { "External id": 287264, "cbid": 317, "correlation": 287264 } }, { "ph": "f", "id": 287264, "pid": 76337, "tid": -914061504, "ts": 1716454226186139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226186140, "dur": 0, "args": { "External id": 287265, "cbid": 203, "correlation": 287265 } }, { "ph": "f", "id": 287265, "pid": 76337, "tid": -914061504, "ts": 1716454226186140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226186141, "dur": 0, "args": { "External id": 287266, "cbid": 205, "correlation": 287266 } }, { "ph": "f", "id": 287266, "pid": 76337, "tid": -914061504, "ts": 1716454226186141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226195624, "dur": 776, "args": { "External id": 287270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287270, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [2, 1536, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287270, "pid": 5, "tid": 7, "ts": 1716454226195624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226187750, "dur": 22, "args": { "External id": 287270, "cbid": 211, "correlation": 287270 } }, { "ph": "s", "id": 287270, "pid": 76337, "tid": -914061504, "ts": 1716454226187750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226196401, "dur": 321, "args": { "External id": 287276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287276, "pid": 5, "tid": 7, "ts": 1716454226196401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188202, "dur": 12, "args": { "External id": 287276, "cbid": 211, "correlation": 287276 } }, { "ph": "s", "id": 287276, "pid": 76337, "tid": -914061504, "ts": 1716454226188202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226196724, "dur": 889, "args": { "External id": 287284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287284, "pid": 5, "tid": 7, "ts": 1716454226196724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188260, "dur": 9, "args": { "External id": 287284, "cbid": 211, "correlation": 287284 } }, { "ph": "s", "id": 287284, "pid": 76337, "tid": -914061504, "ts": 1716454226188260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226197614, "dur": 203, "args": { "External id": 287292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287292, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287292, "pid": 5, "tid": 7, "ts": 1716454226197614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188316, "dur": 11, "args": { "External id": 287292, "cbid": 211, "correlation": 287292 } }, { "ph": "s", "id": 287292, "pid": 76337, "tid": -914061504, "ts": 1716454226188316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226197819, "dur": 1258, "args": { "External id": 287302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287302, "pid": 5, "tid": 7, "ts": 1716454226197819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188443, "dur": 16, "args": { "External id": 287302, "cbid": 211, "correlation": 287302 } }, { "ph": "s", "id": 287302, "pid": 76337, "tid": -914061504, "ts": 1716454226188443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226199078, "dur": 419, "args": { "External id": 287323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287323, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287323, "pid": 5, "tid": 7, "ts": 1716454226199078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188490, "dur": 8, "args": { "External id": 287323, "cbid": 211, "correlation": 287323 } }, { "ph": "s", "id": 287323, "pid": 76337, "tid": -914061504, "ts": 1716454226188490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226199499, "dur": 4, "args": { "External id": 287335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287335, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287335, "pid": 5, "tid": 7, "ts": 1716454226199499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188514, "dur": 8, "args": { "External id": 287335, "cbid": 211, "correlation": 287335 } }, { "ph": "s", "id": 287335, "pid": 76337, "tid": -914061504, "ts": 1716454226188514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226199504, "dur": 322, "args": { "External id": 287338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287338, "pid": 5, "tid": 7, "ts": 1716454226199504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188538, "dur": 7, "args": { "External id": 287338, "cbid": 211, "correlation": 287338 } }, { "ph": "s", "id": 287338, "pid": 76337, "tid": -914061504, "ts": 1716454226188538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226199827, "dur": 209, "args": { "External id": 287347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287347, "registers per thread": 24, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287347, "pid": 5, "tid": 7, "ts": 1716454226199827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188592, "dur": 11, "args": { "External id": 287347, "cbid": 211, "correlation": 287347 } }, { "ph": "s", "id": 287347, "pid": 76337, "tid": -914061504, "ts": 1716454226188592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226188658, "dur": 0, "args": { "External id": 287357, "cbid": 317, "correlation": 287357 } }, { "ph": "f", "id": 287357, "pid": 76337, "tid": -914061504, "ts": 1716454226188658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226188659, "dur": 0, "args": { "External id": 287358, "cbid": 203, "correlation": 287358 } }, { "ph": "f", "id": 287358, "pid": 76337, "tid": -914061504, "ts": 1716454226188659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226188660, "dur": 0, "args": { "External id": 287359, "cbid": 205, "correlation": 287359 } }, { "ph": "f", "id": 287359, "pid": 76337, "tid": -914061504, "ts": 1716454226188660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226200038, "dur": 367, "args": { "External id": 287363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287363, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287363, "pid": 5, "tid": 7, "ts": 1716454226200038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188677, "dur": 13, "args": { "External id": 287363, "cbid": 211, "correlation": 287363 } }, { "ph": "s", "id": 287363, "pid": 76337, "tid": -914061504, "ts": 1716454226188677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226200406, "dur": 10, "args": { "External id": 287365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287365, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287365, "pid": 5, "tid": 7, "ts": 1716454226200406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188693, "dur": 5, "args": { "External id": 287365, "cbid": 211, "correlation": 287365 } }, { "ph": "s", "id": 287365, "pid": 76337, "tid": -914061504, "ts": 1716454226188693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226200418, "dur": 4, "args": { "External id": 287367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 287367, "pid": 5, "tid": 7, "ts": 1716454226200418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188703, "dur": 8, "args": { "External id": 287367, "cbid": 211, "correlation": 287367 } }, { "ph": "s", "id": 287367, "pid": 76337, "tid": -914061504, "ts": 1716454226188703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226188714, "dur": 0, "args": { "External id": 287368, "cbid": 51, "correlation": 287368 } }, { "ph": "s", "id": 287368, "pid": 76337, "tid": -914061504, "ts": 1716454226188714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226200423, "dur": 2663, "args": { "External id": 287369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287369, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [1536, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287369, "pid": 5, "tid": 7, "ts": 1716454226200423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188715, "dur": 6, "args": { "External id": 287369, "cbid": 211, "correlation": 287369 } }, { "ph": "s", "id": 287369, "pid": 76337, "tid": -914061504, "ts": 1716454226188715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226203087, "dur": 339, "args": { "External id": 287374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287374, "pid": 5, "tid": 7, "ts": 1716454226203087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188745, "dur": 9, "args": { "External id": 287374, "cbid": 211, "correlation": 287374 } }, { "ph": "s", "id": 287374, "pid": 76337, "tid": -914061504, "ts": 1716454226188745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226203427, "dur": 398, "args": { "External id": 287394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287394, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287394, "pid": 5, "tid": 7, "ts": 1716454226203427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188819, "dur": 12, "args": { "External id": 287394, "cbid": 211, "correlation": 287394 } }, { "ph": "s", "id": 287394, "pid": 76337, "tid": -914061504, "ts": 1716454226188819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226203826, "dur": 4, "args": { "External id": 287406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287406, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287406, "pid": 5, "tid": 7, "ts": 1716454226203826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188841, "dur": 6, "args": { "External id": 287406, "cbid": 211, "correlation": 287406 } }, { "ph": "s", "id": 287406, "pid": 76337, "tid": -914061504, "ts": 1716454226188841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226203831, "dur": 324, "args": { "External id": 287409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287409, "pid": 5, "tid": 7, "ts": 1716454226203831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188859, "dur": 6, "args": { "External id": 287409, "cbid": 211, "correlation": 287409 } }, { "ph": "s", "id": 287409, "pid": 76337, "tid": -914061504, "ts": 1716454226188859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226204156, "dur": 209, "args": { "External id": 287418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287418, "registers per thread": 24, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287418, "pid": 5, "tid": 7, "ts": 1716454226204156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188900, "dur": 10, "args": { "External id": 287418, "cbid": 211, "correlation": 287418 } }, { "ph": "s", "id": 287418, "pid": 76337, "tid": -914061504, "ts": 1716454226188900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226188981, "dur": 0, "args": { "External id": 287428, "cbid": 317, "correlation": 287428 } }, { "ph": "f", "id": 287428, "pid": 76337, "tid": -914061504, "ts": 1716454226188981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226188982, "dur": 0, "args": { "External id": 287429, "cbid": 203, "correlation": 287429 } }, { "ph": "f", "id": 287429, "pid": 76337, "tid": -914061504, "ts": 1716454226188982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226188983, "dur": 0, "args": { "External id": 287430, "cbid": 205, "correlation": 287430 } }, { "ph": "f", "id": 287430, "pid": 76337, "tid": -914061504, "ts": 1716454226188983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226204366, "dur": 372, "args": { "External id": 287434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287434, "pid": 5, "tid": 7, "ts": 1716454226204366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226188997, "dur": 13, "args": { "External id": 287434, "cbid": 211, "correlation": 287434 } }, { "ph": "s", "id": 287434, "pid": 76337, "tid": -914061504, "ts": 1716454226188997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226204739, "dur": 11, "args": { "External id": 287436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287436, "pid": 5, "tid": 7, "ts": 1716454226204739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189012, "dur": 5, "args": { "External id": 287436, "cbid": 211, "correlation": 287436 } }, { "ph": "s", "id": 287436, "pid": 76337, "tid": -914061504, "ts": 1716454226189012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226204751, "dur": 3, "args": { "External id": 287438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287438, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 287438, "pid": 5, "tid": 7, "ts": 1716454226204751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189021, "dur": 6, "args": { "External id": 287438, "cbid": 211, "correlation": 287438 } }, { "ph": "s", "id": 287438, "pid": 76337, "tid": -914061504, "ts": 1716454226189021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226189029, "dur": 0, "args": { "External id": 287439, "cbid": 51, "correlation": 287439 } }, { "ph": "s", "id": 287439, "pid": 76337, "tid": -914061504, "ts": 1716454226189029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226204756, "dur": 2676, "args": { "External id": 287440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287440, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [1536, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287440, "pid": 5, "tid": 7, "ts": 1716454226204756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189030, "dur": 6, "args": { "External id": 287440, "cbid": 211, "correlation": 287440 } }, { "ph": "s", "id": 287440, "pid": 76337, "tid": -914061504, "ts": 1716454226189030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226207433, "dur": 340, "args": { "External id": 287445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287445, "pid": 5, "tid": 7, "ts": 1716454226207433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189061, "dur": 9, "args": { "External id": 287445, "cbid": 211, "correlation": 287445 } }, { "ph": "s", "id": 287445, "pid": 76337, "tid": -914061504, "ts": 1716454226189061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226207774, "dur": 886, "args": { "External id": 287453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287453, "pid": 5, "tid": 7, "ts": 1716454226207774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189097, "dur": 9, "args": { "External id": 287453, "cbid": 211, "correlation": 287453 } }, { "ph": "s", "id": 287453, "pid": 76337, "tid": -914061504, "ts": 1716454226189097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226208661, "dur": 202, "args": { "External id": 287461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287461, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287461, "pid": 5, "tid": 7, "ts": 1716454226208661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189127, "dur": 10, "args": { "External id": 287461, "cbid": 211, "correlation": 287461 } }, { "ph": "s", "id": 287461, "pid": 76337, "tid": -914061504, "ts": 1716454226189127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226208865, "dur": 1269, "args": { "External id": 287471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287471, "pid": 5, "tid": 7, "ts": 1716454226208865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189204, "dur": 13, "args": { "External id": 287471, "cbid": 211, "correlation": 287471 } }, { "ph": "s", "id": 287471, "pid": 76337, "tid": -914061504, "ts": 1716454226189204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226210135, "dur": 426, "args": { "External id": 287492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287492, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287492, "pid": 5, "tid": 7, "ts": 1716454226210135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189238, "dur": 7, "args": { "External id": 287492, "cbid": 211, "correlation": 287492 } }, { "ph": "s", "id": 287492, "pid": 76337, "tid": -914061504, "ts": 1716454226189238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226210563, "dur": 4, "args": { "External id": 287504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287504, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287504, "pid": 5, "tid": 7, "ts": 1716454226210563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189255, "dur": 6, "args": { "External id": 287504, "cbid": 211, "correlation": 287504 } }, { "ph": "s", "id": 287504, "pid": 76337, "tid": -914061504, "ts": 1716454226189255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226210567, "dur": 325, "args": { "External id": 287507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287507, "pid": 5, "tid": 7, "ts": 1716454226210567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189271, "dur": 6, "args": { "External id": 287507, "cbid": 211, "correlation": 287507 } }, { "ph": "s", "id": 287507, "pid": 76337, "tid": -914061504, "ts": 1716454226189271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226210894, "dur": 210, "args": { "External id": 287516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287516, "registers per thread": 24, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287516, "pid": 5, "tid": 7, "ts": 1716454226210894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189311, "dur": 9, "args": { "External id": 287516, "cbid": 211, "correlation": 287516 } }, { "ph": "s", "id": 287516, "pid": 76337, "tid": -914061504, "ts": 1716454226189311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226189367, "dur": 0, "args": { "External id": 287526, "cbid": 317, "correlation": 287526 } }, { "ph": "f", "id": 287526, "pid": 76337, "tid": -914061504, "ts": 1716454226189367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226189367, "dur": 0, "args": { "External id": 287527, "cbid": 203, "correlation": 287527 } }, { "ph": "f", "id": 287527, "pid": 76337, "tid": -914061504, "ts": 1716454226189367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226189368, "dur": 0, "args": { "External id": 287528, "cbid": 205, "correlation": 287528 } }, { "ph": "f", "id": 287528, "pid": 76337, "tid": -914061504, "ts": 1716454226189368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226211105, "dur": 381, "args": { "External id": 287532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287532, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287532, "pid": 5, "tid": 7, "ts": 1716454226211105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189381, "dur": 13, "args": { "External id": 287532, "cbid": 211, "correlation": 287532 } }, { "ph": "s", "id": 287532, "pid": 76337, "tid": -914061504, "ts": 1716454226189381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226211487, "dur": 11, "args": { "External id": 287534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287534, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287534, "pid": 5, "tid": 7, "ts": 1716454226211487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189397, "dur": 5, "args": { "External id": 287534, "cbid": 211, "correlation": 287534 } }, { "ph": "s", "id": 287534, "pid": 76337, "tid": -914061504, "ts": 1716454226189397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226211499, "dur": 4, "args": { "External id": 287536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 287536, "pid": 5, "tid": 7, "ts": 1716454226211499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189405, "dur": 5, "args": { "External id": 287536, "cbid": 211, "correlation": 287536 } }, { "ph": "s", "id": 287536, "pid": 76337, "tid": -914061504, "ts": 1716454226189405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226189413, "dur": 0, "args": { "External id": 287537, "cbid": 51, "correlation": 287537 } }, { "ph": "s", "id": 287537, "pid": 76337, "tid": -914061504, "ts": 1716454226189413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226211504, "dur": 2699, "args": { "External id": 287538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287538, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [1536, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287538, "pid": 5, "tid": 7, "ts": 1716454226211504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189414, "dur": 5, "args": { "External id": 287538, "cbid": 211, "correlation": 287538 } }, { "ph": "s", "id": 287538, "pid": 76337, "tid": -914061504, "ts": 1716454226189414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226214205, "dur": 343, "args": { "External id": 287543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287543, "pid": 5, "tid": 7, "ts": 1716454226214205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189442, "dur": 8, "args": { "External id": 287543, "cbid": 211, "correlation": 287543 } }, { "ph": "s", "id": 287543, "pid": 76337, "tid": -914061504, "ts": 1716454226189442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226214549, "dur": 403, "args": { "External id": 287563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287563, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287563, "pid": 5, "tid": 7, "ts": 1716454226214549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189514, "dur": 12, "args": { "External id": 287563, "cbid": 211, "correlation": 287563 } }, { "ph": "s", "id": 287563, "pid": 76337, "tid": -914061504, "ts": 1716454226189514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226214953, "dur": 4, "args": { "External id": 287575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287575, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287575, "pid": 5, "tid": 7, "ts": 1716454226214953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189535, "dur": 7, "args": { "External id": 287575, "cbid": 211, "correlation": 287575 } }, { "ph": "s", "id": 287575, "pid": 76337, "tid": -914061504, "ts": 1716454226189535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226214958, "dur": 327, "args": { "External id": 287578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287578, "pid": 5, "tid": 7, "ts": 1716454226214958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189553, "dur": 6, "args": { "External id": 287578, "cbid": 211, "correlation": 287578 } }, { "ph": "s", "id": 287578, "pid": 76337, "tid": -914061504, "ts": 1716454226189553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226215286, "dur": 210, "args": { "External id": 287587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287587, "registers per thread": 24, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287587, "pid": 5, "tid": 7, "ts": 1716454226215286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189594, "dur": 10, "args": { "External id": 287587, "cbid": 211, "correlation": 287587 } }, { "ph": "s", "id": 287587, "pid": 76337, "tid": -914061504, "ts": 1716454226189594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226189661, "dur": 0, "args": { "External id": 287597, "cbid": 317, "correlation": 287597 } }, { "ph": "f", "id": 287597, "pid": 76337, "tid": -914061504, "ts": 1716454226189661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226189662, "dur": 0, "args": { "External id": 287598, "cbid": 203, "correlation": 287598 } }, { "ph": "f", "id": 287598, "pid": 76337, "tid": -914061504, "ts": 1716454226189662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226189663, "dur": 0, "args": { "External id": 287599, "cbid": 205, "correlation": 287599 } }, { "ph": "f", "id": 287599, "pid": 76337, "tid": -914061504, "ts": 1716454226189663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226215497, "dur": 380, "args": { "External id": 287603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287603, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 614.4, "warps per SM": 4915.2, "grid": [1536, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287603, "pid": 5, "tid": 7, "ts": 1716454226215497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189675, "dur": 12, "args": { "External id": 287603, "cbid": 211, "correlation": 287603 } }, { "ph": "s", "id": 287603, "pid": 76337, "tid": -914061504, "ts": 1716454226189675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226215878, "dur": 11, "args": { "External id": 287605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287605, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 25.6, "warps per SM": 204.8, "grid": [1, 8, 256], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287605, "pid": 5, "tid": 7, "ts": 1716454226215878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189690, "dur": 6, "args": { "External id": 287605, "cbid": 211, "correlation": 287605 } }, { "ph": "s", "id": 287605, "pid": 76337, "tid": -914061504, "ts": 1716454226189690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226215890, "dur": 4, "args": { "External id": 287607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4125, "warps per SM": 19.3, "grid": [193, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 287607, "pid": 5, "tid": 7, "ts": 1716454226215890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189700, "dur": 6, "args": { "External id": 287607, "cbid": 211, "correlation": 287607 } }, { "ph": "s", "id": 287607, "pid": 76337, "tid": -914061504, "ts": 1716454226189700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226189708, "dur": 0, "args": { "External id": 287608, "cbid": 51, "correlation": 287608 } }, { "ph": "s", "id": 287608, "pid": 76337, "tid": -914061504, "ts": 1716454226189708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226215895, "dur": 2759, "args": { "External id": 287609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287609, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [1536, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287609, "pid": 5, "tid": 7, "ts": 1716454226215895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189709, "dur": 5, "args": { "External id": 287609, "cbid": 211, "correlation": 287609 } }, { "ph": "s", "id": 287609, "pid": 76337, "tid": -914061504, "ts": 1716454226189709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226218656, "dur": 347, "args": { "External id": 287614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287614, "pid": 5, "tid": 7, "ts": 1716454226218656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189736, "dur": 9, "args": { "External id": 287614, "cbid": 211, "correlation": 287614 } }, { "ph": "s", "id": 287614, "pid": 76337, "tid": -914061504, "ts": 1716454226189736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226219004, "dur": 906, "args": { "External id": 287622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287622, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287622, "pid": 5, "tid": 7, "ts": 1716454226219004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189771, "dur": 9, "args": { "External id": 287622, "cbid": 211, "correlation": 287622 } }, { "ph": "s", "id": 287622, "pid": 76337, "tid": -914061504, "ts": 1716454226189771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226219911, "dur": 203, "args": { "External id": 287630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287630, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1228.8, "warps per SM": 4915.2, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287630, "pid": 5, "tid": 7, "ts": 1716454226219911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226189801, "dur": 8, "args": { "External id": 287630, "cbid": 211, "correlation": 287630 } }, { "ph": "s", "id": 287630, "pid": 76337, "tid": -914061504, "ts": 1716454226189801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226189885, "dur": 1, "args": { "External id": 287638, "cbid": 317, "correlation": 287638 } }, { "ph": "f", "id": 287638, "pid": 76337, "tid": -914061504, "ts": 1716454226189885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226189887, "dur": 1499, "args": { "External id": 287639, "cbid": 20, "correlation": 287639 } }, { "ph": "f", "id": 287639, "pid": 76337, "tid": -914061504, "ts": 1716454226189887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_nhwc_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float, unsigned long)", "pid": 5, "tid": 7, "ts": 1716454226220116, "dur": 2652, "args": { "External id": 287643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287643, "registers per thread": 26, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 78643.2, "grid": [196608, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287643, "pid": 5, "tid": 7, "ts": 1716454226220116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226191403, "dur": 19, "args": { "External id": 287643, "cbid": 211, "correlation": 287643 } }, { "ph": "s", "id": 287643, "pid": 76337, "tid": -914061504, "ts": 1716454226191403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226191471, "dur": 1, "args": { "External id": 287651, "cbid": 317, "correlation": 287651 } }, { "ph": "f", "id": 287651, "pid": 76337, "tid": -914061504, "ts": 1716454226191471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226191473, "dur": 9562, "args": { "External id": 287652, "cbid": 20, "correlation": 287652 } }, { "ph": "f", "id": 287652, "pid": 76337, "tid": -914061504, "ts": 1716454226191473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226222768, "dur": 9, "args": { "External id": 287661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 287661, "pid": 5, "tid": 7, "ts": 1716454226222768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226201107, "dur": 28, "args": { "External id": 287661, "cbid": 211, "correlation": 287661 } }, { "ph": "s", "id": 287661, "pid": 76337, "tid": -914061504, "ts": 1716454226201107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226201152, "dur": 1, "args": { "External id": 287668, "cbid": 317, "correlation": 287668 } }, { "ph": "f", "id": 287668, "pid": 76337, "tid": -914061504, "ts": 1716454226201152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226201153, "dur": 1, "args": { "External id": 287669, "cbid": 203, "correlation": 287669 } }, { "ph": "f", "id": 287669, "pid": 76337, "tid": -914061504, "ts": 1716454226201153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226201155, "dur": 1, "args": { "External id": 287670, "cbid": 205, "correlation": 287670 } }, { "ph": "f", "id": 287670, "pid": 76337, "tid": -914061504, "ts": 1716454226201155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226222779, "dur": 10232, "args": { "External id": 287674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287674, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [2, 6144, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287674, "pid": 5, "tid": 7, "ts": 1716454226222779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226203468, "dur": 22, "args": { "External id": 287674, "cbid": 211, "correlation": 287674 } }, { "ph": "s", "id": 287674, "pid": 76337, "tid": -914061504, "ts": 1716454226203468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226233012, "dur": 1292, "args": { "External id": 287680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 4915.2, "warps per SM": 19660.8, "grid": [393216, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287680, "pid": 5, "tid": 7, "ts": 1716454226233012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226204082, "dur": 14, "args": { "External id": 287680, "cbid": 211, "correlation": 287680 } }, { "ph": "s", "id": 287680, "pid": 76337, "tid": -914061504, "ts": 1716454226204082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226234305, "dur": 20760, "args": { "External id": 287690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287690, "registers per thread": 16, "shared memory": 0, "blocks per SM": 4915.2, "warps per SM": 19660.8, "grid": [393216, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287690, "pid": 5, "tid": 7, "ts": 1716454226234305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226204327, "dur": 14, "args": { "External id": 287690, "cbid": 211, "correlation": 287690 } }, { "ph": "s", "id": 287690, "pid": 76337, "tid": -914061504, "ts": 1716454226204327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226204368, "dur": 1, "args": { "External id": 287700, "cbid": 317, "correlation": 287700 } }, { "ph": "f", "id": 287700, "pid": 76337, "tid": -914061504, "ts": 1716454226204368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226204369, "dur": 12140, "args": { "External id": 287701, "cbid": 20, "correlation": 287701 } }, { "ph": "f", "id": 287701, "pid": 76337, "tid": -914061504, "ts": 1716454226204369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226255066, "dur": 1688, "args": { "External id": 287715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287715, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287715, "pid": 5, "tid": 7, "ts": 1716454226255066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226216535, "dur": 18, "args": { "External id": 287715, "cbid": 211, "correlation": 287715 } }, { "ph": "s", "id": 287715, "pid": 76337, "tid": -914061504, "ts": 1716454226216535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226256755, "dur": 4, "args": { "External id": 287727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287727, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 287727, "pid": 5, "tid": 7, "ts": 1716454226256755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226216573, "dur": 10, "args": { "External id": 287727, "cbid": 211, "correlation": 287727 } }, { "ph": "s", "id": 287727, "pid": 76337, "tid": -914061504, "ts": 1716454226216573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226256760, "dur": 1298, "args": { "External id": 287730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 4915.2, "warps per SM": 19660.8, "grid": [393216, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287730, "pid": 5, "tid": 7, "ts": 1716454226256760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226216603, "dur": 8, "args": { "External id": 287730, "cbid": 211, "correlation": 287730 } }, { "ph": "s", "id": 287730, "pid": 76337, "tid": -914061504, "ts": 1716454226216603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226258060, "dur": 827, "args": { "External id": 287739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287739, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4915.2, "warps per SM": 19660.8, "grid": [393216, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287739, "pid": 5, "tid": 7, "ts": 1716454226258060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226216672, "dur": 12, "args": { "External id": 287739, "cbid": 211, "correlation": 287739 } }, { "ph": "s", "id": 287739, "pid": 76337, "tid": -914061504, "ts": 1716454226216672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226216766, "dur": 0, "args": { "External id": 287749, "cbid": 317, "correlation": 287749 } }, { "ph": "f", "id": 287749, "pid": 76337, "tid": -914061504, "ts": 1716454226216766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226216767, "dur": 0, "args": { "External id": 287750, "cbid": 203, "correlation": 287750 } }, { "ph": "f", "id": 287750, "pid": 76337, "tid": -914061504, "ts": 1716454226216767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226216768, "dur": 0, "args": { "External id": 287751, "cbid": 205, "correlation": 287751 } }, { "ph": "f", "id": 287751, "pid": 76337, "tid": -914061504, "ts": 1716454226216768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226219182, "dur": 2, "args": { "External id": 287757, "cbid": 317, "correlation": 287757 } }, { "ph": "f", "id": 287757, "pid": 76337, "tid": -914061504, "ts": 1716454226219182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMalloc", "pid": 76337, "tid": -914061504, "ts": 1716454226219184, "dur": 669555, "args": { "External id": 287758, "cbid": 20, "correlation": 287758 } }, { "ph": "f", "id": 287758, "pid": 76337, "tid": -914061504, "ts": 1716454226219184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226888850, "dur": 1608, "args": { "External id": 287759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287759, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 2457.6, "warps per SM": 19660.8, "grid": [6144, 8, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287759, "pid": 5, "tid": 7, "ts": 1716454226888850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226888792, "dur": 61, "args": { "External id": 287759, "cbid": 211, "correlation": 287759 } }, { "ph": "s", "id": 287759, "pid": 76337, "tid": -914061504, "ts": 1716454226888792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226890460, "dur": 8, "args": { "External id": 287761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12.8, "warps per SM": 102.4, "grid": [1, 8, 128], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287761, "pid": 5, "tid": 7, "ts": 1716454226890460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226888856, "dur": 7, "args": { "External id": 287761, "cbid": 211, "correlation": 287761 } }, { "ph": "s", "id": 287761, "pid": 76337, "tid": -914061504, "ts": 1716454226888856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226890469, "dur": 5, "args": { "External id": 287763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287763, "pid": 5, "tid": 7, "ts": 1716454226890469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226888875, "dur": 13, "args": { "External id": 287763, "cbid": 211, "correlation": 287763 } }, { "ph": "s", "id": 287763, "pid": 76337, "tid": -914061504, "ts": 1716454226888875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226888895, "dur": 0, "args": { "External id": 287764, "cbid": 51, "correlation": 287764 } }, { "ph": "s", "id": 287764, "pid": 76337, "tid": -914061504, "ts": 1716454226888895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226890475, "dur": 5320, "args": { "External id": 287765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287765, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287765, "pid": 5, "tid": 7, "ts": 1716454226890475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226888896, "dur": 11, "args": { "External id": 287765, "cbid": 211, "correlation": 287765 } }, { "ph": "s", "id": 287765, "pid": 76337, "tid": -914061504, "ts": 1716454226888896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226895796, "dur": 683, "args": { "External id": 287770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287770, "pid": 5, "tid": 7, "ts": 1716454226895796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226889825, "dur": 18, "args": { "External id": 287770, "cbid": 211, "correlation": 287770 } }, { "ph": "s", "id": 287770, "pid": 76337, "tid": -914061504, "ts": 1716454226889825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226896481, "dur": 798, "args": { "External id": 287790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287790, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287790, "pid": 5, "tid": 7, "ts": 1716454226896481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226890088, "dur": 15, "args": { "External id": 287790, "cbid": 211, "correlation": 287790 } }, { "ph": "s", "id": 287790, "pid": 76337, "tid": -914061504, "ts": 1716454226890088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226897281, "dur": 4, "args": { "External id": 287802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287802, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 287802, "pid": 5, "tid": 7, "ts": 1716454226897281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226890117, "dur": 9, "args": { "External id": 287802, "cbid": 211, "correlation": 287802 } }, { "ph": "s", "id": 287802, "pid": 76337, "tid": -914061504, "ts": 1716454226890117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226897286, "dur": 655, "args": { "External id": 287805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287805, "pid": 5, "tid": 7, "ts": 1716454226897286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226890150, "dur": 8, "args": { "External id": 287805, "cbid": 211, "correlation": 287805 } }, { "ph": "s", "id": 287805, "pid": 76337, "tid": -914061504, "ts": 1716454226890150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226897942, "dur": 416, "args": { "External id": 287814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287814, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287814, "pid": 5, "tid": 7, "ts": 1716454226897942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226890217, "dur": 14, "args": { "External id": 287814, "cbid": 211, "correlation": 287814 } }, { "ph": "s", "id": 287814, "pid": 76337, "tid": -914061504, "ts": 1716454226890217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226890353, "dur": 1, "args": { "External id": 287824, "cbid": 317, "correlation": 287824 } }, { "ph": "f", "id": 287824, "pid": 76337, "tid": -914061504, "ts": 1716454226890353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226890355, "dur": 1, "args": { "External id": 287825, "cbid": 203, "correlation": 287825 } }, { "ph": "f", "id": 287825, "pid": 76337, "tid": -914061504, "ts": 1716454226890355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226890357, "dur": 1, "args": { "External id": 287826, "cbid": 205, "correlation": 287826 } }, { "ph": "f", "id": 287826, "pid": 76337, "tid": -914061504, "ts": 1716454226890357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226898359, "dur": 514, "args": { "External id": 287830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287830, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287830, "pid": 5, "tid": 7, "ts": 1716454226898359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226893084, "dur": 24, "args": { "External id": 287830, "cbid": 211, "correlation": 287830 } }, { "ph": "s", "id": 287830, "pid": 76337, "tid": -914061504, "ts": 1716454226893084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226898874, "dur": 6, "args": { "External id": 287832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287832, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 287832, "pid": 5, "tid": 7, "ts": 1716454226898874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226893111, "dur": 6, "args": { "External id": 287832, "cbid": 211, "correlation": 287832 } }, { "ph": "s", "id": 287832, "pid": 76337, "tid": -914061504, "ts": 1716454226893111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226898881, "dur": 5, "args": { "External id": 287834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287834, "pid": 5, "tid": 7, "ts": 1716454226898881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226893124, "dur": 9, "args": { "External id": 287834, "cbid": 211, "correlation": 287834 } }, { "ph": "s", "id": 287834, "pid": 76337, "tid": -914061504, "ts": 1716454226893124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226893136, "dur": 1, "args": { "External id": 287835, "cbid": 51, "correlation": 287835 } }, { "ph": "s", "id": 287835, "pid": 76337, "tid": -914061504, "ts": 1716454226893136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226898888, "dur": 2852, "args": { "External id": 287836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287836, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287836, "pid": 5, "tid": 7, "ts": 1716454226898888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226893138, "dur": 9, "args": { "External id": 287836, "cbid": 211, "correlation": 287836 } }, { "ph": "s", "id": 287836, "pid": 76337, "tid": -914061504, "ts": 1716454226893138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226901741, "dur": 687, "args": { "External id": 287841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287841, "pid": 5, "tid": 7, "ts": 1716454226901741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226893846, "dur": 13, "args": { "External id": 287841, "cbid": 211, "correlation": 287841 } }, { "ph": "s", "id": 287841, "pid": 76337, "tid": -914061504, "ts": 1716454226893846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226893944, "dur": 0, "args": { "External id": 287851, "cbid": 317, "correlation": 287851 } }, { "ph": "f", "id": 287851, "pid": 76337, "tid": -914061504, "ts": 1716454226893944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226893945, "dur": 0, "args": { "External id": 287852, "cbid": 203, "correlation": 287852 } }, { "ph": "f", "id": 287852, "pid": 76337, "tid": -914061504, "ts": 1716454226893945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226893946, "dur": 0, "args": { "External id": 287853, "cbid": 205, "correlation": 287853 } }, { "ph": "f", "id": 287853, "pid": 76337, "tid": -914061504, "ts": 1716454226893946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226902429, "dur": 900, "args": { "External id": 287857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287857, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [1, 6144, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287857, "pid": 5, "tid": 7, "ts": 1716454226902429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226895635, "dur": 70, "args": { "External id": 287857, "cbid": 211, "correlation": 287857 } }, { "ph": "s", "id": 287857, "pid": 76337, "tid": -914061504, "ts": 1716454226895635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226903331, "dur": 651, "args": { "External id": 287863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287863, "pid": 5, "tid": 7, "ts": 1716454226903331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896170, "dur": 12, "args": { "External id": 287863, "cbid": 211, "correlation": 287863 } }, { "ph": "s", "id": 287863, "pid": 76337, "tid": -914061504, "ts": 1716454226896170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226903983, "dur": 2699, "args": { "External id": 287871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287871, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287871, "pid": 5, "tid": 7, "ts": 1716454226903983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896224, "dur": 9, "args": { "External id": 287871, "cbid": 211, "correlation": 287871 } }, { "ph": "s", "id": 287871, "pid": 76337, "tid": -914061504, "ts": 1716454226896224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226906683, "dur": 404, "args": { "External id": 287879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287879, "registers per thread": 17, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287879, "pid": 5, "tid": 7, "ts": 1716454226906683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896278, "dur": 14, "args": { "External id": 287879, "cbid": 211, "correlation": 287879 } }, { "ph": "s", "id": 287879, "pid": 76337, "tid": -914061504, "ts": 1716454226896278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226907088, "dur": 7129, "args": { "External id": 287889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287889, "pid": 5, "tid": 7, "ts": 1716454226907088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896429, "dur": 16, "args": { "External id": 287889, "cbid": 211, "correlation": 287889 } }, { "ph": "s", "id": 287889, "pid": 76337, "tid": -914061504, "ts": 1716454226896429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226914219, "dur": 791, "args": { "External id": 287910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287910, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287910, "pid": 5, "tid": 7, "ts": 1716454226914219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896477, "dur": 9, "args": { "External id": 287910, "cbid": 211, "correlation": 287910 } }, { "ph": "s", "id": 287910, "pid": 76337, "tid": -914061504, "ts": 1716454226896477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226915011, "dur": 4, "args": { "External id": 287922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287922, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 287922, "pid": 5, "tid": 7, "ts": 1716454226915011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896498, "dur": 8, "args": { "External id": 287922, "cbid": 211, "correlation": 287922 } }, { "ph": "s", "id": 287922, "pid": 76337, "tid": -914061504, "ts": 1716454226896498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226915017, "dur": 650, "args": { "External id": 287925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287925, "pid": 5, "tid": 7, "ts": 1716454226915017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896525, "dur": 8, "args": { "External id": 287925, "cbid": 211, "correlation": 287925 } }, { "ph": "s", "id": 287925, "pid": 76337, "tid": -914061504, "ts": 1716454226896525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226915668, "dur": 415, "args": { "External id": 287934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287934, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287934, "pid": 5, "tid": 7, "ts": 1716454226915668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896580, "dur": 11, "args": { "External id": 287934, "cbid": 211, "correlation": 287934 } }, { "ph": "s", "id": 287934, "pid": 76337, "tid": -914061504, "ts": 1716454226896580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226896646, "dur": 0, "args": { "External id": 287944, "cbid": 317, "correlation": 287944 } }, { "ph": "f", "id": 287944, "pid": 76337, "tid": -914061504, "ts": 1716454226896646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226896647, "dur": 0, "args": { "External id": 287945, "cbid": 203, "correlation": 287945 } }, { "ph": "f", "id": 287945, "pid": 76337, "tid": -914061504, "ts": 1716454226896647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226896648, "dur": 0, "args": { "External id": 287946, "cbid": 205, "correlation": 287946 } }, { "ph": "f", "id": 287946, "pid": 76337, "tid": -914061504, "ts": 1716454226896648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226916085, "dur": 516, "args": { "External id": 287950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287950, "pid": 5, "tid": 7, "ts": 1716454226916085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896666, "dur": 13, "args": { "External id": 287950, "cbid": 211, "correlation": 287950 } }, { "ph": "s", "id": 287950, "pid": 76337, "tid": -914061504, "ts": 1716454226896666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226916602, "dur": 5, "args": { "External id": 287952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 287952, "pid": 5, "tid": 7, "ts": 1716454226916602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896682, "dur": 5, "args": { "External id": 287952, "cbid": 211, "correlation": 287952 } }, { "ph": "s", "id": 287952, "pid": 76337, "tid": -914061504, "ts": 1716454226896682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226916609, "dur": 5, "args": { "External id": 287954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287954, "pid": 5, "tid": 7, "ts": 1716454226916609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896691, "dur": 6, "args": { "External id": 287954, "cbid": 211, "correlation": 287954 } }, { "ph": "s", "id": 287954, "pid": 76337, "tid": -914061504, "ts": 1716454226896691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226896701, "dur": 0, "args": { "External id": 287955, "cbid": 51, "correlation": 287955 } }, { "ph": "s", "id": 287955, "pid": 76337, "tid": -914061504, "ts": 1716454226896701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226916615, "dur": 2835, "args": { "External id": 287956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287956, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 287956, "pid": 5, "tid": 7, "ts": 1716454226916615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896702, "dur": 7, "args": { "External id": 287956, "cbid": 211, "correlation": 287956 } }, { "ph": "s", "id": 287956, "pid": 76337, "tid": -914061504, "ts": 1716454226896702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226919452, "dur": 682, "args": { "External id": 287961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287961, "pid": 5, "tid": 7, "ts": 1716454226919452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896734, "dur": 9, "args": { "External id": 287961, "cbid": 211, "correlation": 287961 } }, { "ph": "s", "id": 287961, "pid": 76337, "tid": -914061504, "ts": 1716454226896734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226920136, "dur": 794, "args": { "External id": 287981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 287981, "pid": 5, "tid": 7, "ts": 1716454226920136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896807, "dur": 12, "args": { "External id": 287981, "cbid": 211, "correlation": 287981 } }, { "ph": "s", "id": 287981, "pid": 76337, "tid": -914061504, "ts": 1716454226896807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226920931, "dur": 4, "args": { "External id": 287993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 287993, "pid": 5, "tid": 7, "ts": 1716454226920931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896830, "dur": 6, "args": { "External id": 287993, "cbid": 211, "correlation": 287993 } }, { "ph": "s", "id": 287993, "pid": 76337, "tid": -914061504, "ts": 1716454226896830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226920936, "dur": 652, "args": { "External id": 287996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 287996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 287996, "pid": 5, "tid": 7, "ts": 1716454226920936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896847, "dur": 7, "args": { "External id": 287996, "cbid": 211, "correlation": 287996 } }, { "ph": "s", "id": 287996, "pid": 76337, "tid": -914061504, "ts": 1716454226896847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226921590, "dur": 415, "args": { "External id": 288005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288005, "pid": 5, "tid": 7, "ts": 1716454226921590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896888, "dur": 10, "args": { "External id": 288005, "cbid": 211, "correlation": 288005 } }, { "ph": "s", "id": 288005, "pid": 76337, "tid": -914061504, "ts": 1716454226896888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226896961, "dur": 0, "args": { "External id": 288015, "cbid": 317, "correlation": 288015 } }, { "ph": "f", "id": 288015, "pid": 76337, "tid": -914061504, "ts": 1716454226896961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226896962, "dur": 0, "args": { "External id": 288016, "cbid": 203, "correlation": 288016 } }, { "ph": "f", "id": 288016, "pid": 76337, "tid": -914061504, "ts": 1716454226896962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226896963, "dur": 0, "args": { "External id": 288017, "cbid": 205, "correlation": 288017 } }, { "ph": "f", "id": 288017, "pid": 76337, "tid": -914061504, "ts": 1716454226896963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226922006, "dur": 524, "args": { "External id": 288021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288021, "pid": 5, "tid": 7, "ts": 1716454226922006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896983, "dur": 13, "args": { "External id": 288021, "cbid": 211, "correlation": 288021 } }, { "ph": "s", "id": 288021, "pid": 76337, "tid": -914061504, "ts": 1716454226896983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226922531, "dur": 6, "args": { "External id": 288023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 288023, "pid": 5, "tid": 7, "ts": 1716454226922531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226896998, "dur": 5, "args": { "External id": 288023, "cbid": 211, "correlation": 288023 } }, { "ph": "s", "id": 288023, "pid": 76337, "tid": -914061504, "ts": 1716454226896998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226922538, "dur": 5, "args": { "External id": 288025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288025, "pid": 5, "tid": 7, "ts": 1716454226922538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897007, "dur": 7, "args": { "External id": 288025, "cbid": 211, "correlation": 288025 } }, { "ph": "s", "id": 288025, "pid": 76337, "tid": -914061504, "ts": 1716454226897007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226897017, "dur": 0, "args": { "External id": 288026, "cbid": 51, "correlation": 288026 } }, { "ph": "s", "id": 288026, "pid": 76337, "tid": -914061504, "ts": 1716454226897017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226922544, "dur": 2844, "args": { "External id": 288027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288027, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 288027, "pid": 5, "tid": 7, "ts": 1716454226922544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897017, "dur": 5, "args": { "External id": 288027, "cbid": 211, "correlation": 288027 } }, { "ph": "s", "id": 288027, "pid": 76337, "tid": -914061504, "ts": 1716454226897017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226925389, "dur": 685, "args": { "External id": 288032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288032, "pid": 5, "tid": 7, "ts": 1716454226925389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897045, "dur": 8, "args": { "External id": 288032, "cbid": 211, "correlation": 288032 } }, { "ph": "s", "id": 288032, "pid": 76337, "tid": -914061504, "ts": 1716454226897045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226926075, "dur": 2703, "args": { "External id": 288040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288040, "pid": 5, "tid": 7, "ts": 1716454226926075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897080, "dur": 9, "args": { "External id": 288040, "cbid": 211, "correlation": 288040 } }, { "ph": "s", "id": 288040, "pid": 76337, "tid": -914061504, "ts": 1716454226897080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226928780, "dur": 402, "args": { "External id": 288048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288048, "registers per thread": 17, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288048, "pid": 5, "tid": 7, "ts": 1716454226928780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897112, "dur": 9, "args": { "External id": 288048, "cbid": 211, "correlation": 288048 } }, { "ph": "s", "id": 288048, "pid": 76337, "tid": -914061504, "ts": 1716454226897112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226929183, "dur": 7125, "args": { "External id": 288058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288058, "pid": 5, "tid": 7, "ts": 1716454226929183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897189, "dur": 13, "args": { "External id": 288058, "cbid": 211, "correlation": 288058 } }, { "ph": "s", "id": 288058, "pid": 76337, "tid": -914061504, "ts": 1716454226897189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226936310, "dur": 793, "args": { "External id": 288079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288079, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 288079, "pid": 5, "tid": 7, "ts": 1716454226936310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897223, "dur": 8, "args": { "External id": 288079, "cbid": 211, "correlation": 288079 } }, { "ph": "s", "id": 288079, "pid": 76337, "tid": -914061504, "ts": 1716454226897223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226937103, "dur": 4, "args": { "External id": 288091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288091, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 288091, "pid": 5, "tid": 7, "ts": 1716454226937103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897240, "dur": 6, "args": { "External id": 288091, "cbid": 211, "correlation": 288091 } }, { "ph": "s", "id": 288091, "pid": 76337, "tid": -914061504, "ts": 1716454226897240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226937108, "dur": 655, "args": { "External id": 288094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288094, "pid": 5, "tid": 7, "ts": 1716454226937108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897257, "dur": 7, "args": { "External id": 288094, "cbid": 211, "correlation": 288094 } }, { "ph": "s", "id": 288094, "pid": 76337, "tid": -914061504, "ts": 1716454226897257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226937764, "dur": 416, "args": { "External id": 288103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288103, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288103, "pid": 5, "tid": 7, "ts": 1716454226937764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897297, "dur": 10, "args": { "External id": 288103, "cbid": 211, "correlation": 288103 } }, { "ph": "s", "id": 288103, "pid": 76337, "tid": -914061504, "ts": 1716454226897297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226897351, "dur": 0, "args": { "External id": 288113, "cbid": 317, "correlation": 288113 } }, { "ph": "f", "id": 288113, "pid": 76337, "tid": -914061504, "ts": 1716454226897351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226897352, "dur": 0, "args": { "External id": 288114, "cbid": 203, "correlation": 288114 } }, { "ph": "f", "id": 288114, "pid": 76337, "tid": -914061504, "ts": 1716454226897352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226897352, "dur": 0, "args": { "External id": 288115, "cbid": 205, "correlation": 288115 } }, { "ph": "f", "id": 288115, "pid": 76337, "tid": -914061504, "ts": 1716454226897352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226938181, "dur": 521, "args": { "External id": 288119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288119, "pid": 5, "tid": 7, "ts": 1716454226938181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897366, "dur": 12, "args": { "External id": 288119, "cbid": 211, "correlation": 288119 } }, { "ph": "s", "id": 288119, "pid": 76337, "tid": -914061504, "ts": 1716454226897366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226938704, "dur": 6, "args": { "External id": 288121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288121, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 288121, "pid": 5, "tid": 7, "ts": 1716454226938704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897380, "dur": 5, "args": { "External id": 288121, "cbid": 211, "correlation": 288121 } }, { "ph": "s", "id": 288121, "pid": 76337, "tid": -914061504, "ts": 1716454226897380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226938711, "dur": 5, "args": { "External id": 288123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288123, "pid": 5, "tid": 7, "ts": 1716454226938711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897389, "dur": 6, "args": { "External id": 288123, "cbid": 211, "correlation": 288123 } }, { "ph": "s", "id": 288123, "pid": 76337, "tid": -914061504, "ts": 1716454226897389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226897398, "dur": 0, "args": { "External id": 288124, "cbid": 51, "correlation": 288124 } }, { "ph": "s", "id": 288124, "pid": 76337, "tid": -914061504, "ts": 1716454226897398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226938717, "dur": 2828, "args": { "External id": 288125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288125, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 288125, "pid": 5, "tid": 7, "ts": 1716454226938717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897398, "dur": 5, "args": { "External id": 288125, "cbid": 211, "correlation": 288125 } }, { "ph": "s", "id": 288125, "pid": 76337, "tid": -914061504, "ts": 1716454226897398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226941546, "dur": 685, "args": { "External id": 288130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288130, "pid": 5, "tid": 7, "ts": 1716454226941546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897426, "dur": 9, "args": { "External id": 288130, "cbid": 211, "correlation": 288130 } }, { "ph": "s", "id": 288130, "pid": 76337, "tid": -914061504, "ts": 1716454226897426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226942232, "dur": 798, "args": { "External id": 288150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288150, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 288150, "pid": 5, "tid": 7, "ts": 1716454226942232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897498, "dur": 12, "args": { "External id": 288150, "cbid": 211, "correlation": 288150 } }, { "ph": "s", "id": 288150, "pid": 76337, "tid": -914061504, "ts": 1716454226897498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226943032, "dur": 4, "args": { "External id": 288162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288162, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 288162, "pid": 5, "tid": 7, "ts": 1716454226943032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897520, "dur": 6, "args": { "External id": 288162, "cbid": 211, "correlation": 288162 } }, { "ph": "s", "id": 288162, "pid": 76337, "tid": -914061504, "ts": 1716454226897520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226943036, "dur": 653, "args": { "External id": 288165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288165, "pid": 5, "tid": 7, "ts": 1716454226943036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897537, "dur": 6, "args": { "External id": 288165, "cbid": 211, "correlation": 288165 } }, { "ph": "s", "id": 288165, "pid": 76337, "tid": -914061504, "ts": 1716454226897537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226943691, "dur": 414, "args": { "External id": 288174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288174, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288174, "pid": 5, "tid": 7, "ts": 1716454226943691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897577, "dur": 10, "args": { "External id": 288174, "cbid": 211, "correlation": 288174 } }, { "ph": "s", "id": 288174, "pid": 76337, "tid": -914061504, "ts": 1716454226897577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226897643, "dur": 0, "args": { "External id": 288184, "cbid": 317, "correlation": 288184 } }, { "ph": "f", "id": 288184, "pid": 76337, "tid": -914061504, "ts": 1716454226897643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226897644, "dur": 0, "args": { "External id": 288185, "cbid": 203, "correlation": 288185 } }, { "ph": "f", "id": 288185, "pid": 76337, "tid": -914061504, "ts": 1716454226897644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226897645, "dur": 0, "args": { "External id": 288186, "cbid": 205, "correlation": 288186 } }, { "ph": "f", "id": 288186, "pid": 76337, "tid": -914061504, "ts": 1716454226897645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226944107, "dur": 523, "args": { "External id": 288190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288190, "pid": 5, "tid": 7, "ts": 1716454226944107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897658, "dur": 12, "args": { "External id": 288190, "cbid": 211, "correlation": 288190 } }, { "ph": "s", "id": 288190, "pid": 76337, "tid": -914061504, "ts": 1716454226897658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226944631, "dur": 5, "args": { "External id": 288192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 6.4, "warps per SM": 51.2, "grid": [1, 4, 128], "block": [256, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 288192, "pid": 5, "tid": 7, "ts": 1716454226944631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897672, "dur": 5, "args": { "External id": 288192, "cbid": 211, "correlation": 288192 } }, { "ph": "s", "id": 288192, "pid": 76337, "tid": -914061504, "ts": 1716454226897672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454226944638, "dur": 5, "args": { "External id": 288194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288194, "registers per thread": 16, "shared memory": 0, "blocks per SM": 9.6125, "warps per SM": 76.9, "grid": [769, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288194, "pid": 5, "tid": 7, "ts": 1716454226944638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897681, "dur": 5, "args": { "External id": 288194, "cbid": 211, "correlation": 288194 } }, { "ph": "s", "id": 288194, "pid": 76337, "tid": -914061504, "ts": 1716454226897681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226897690, "dur": 0, "args": { "External id": 288195, "cbid": 51, "correlation": 288195 } }, { "ph": "s", "id": 288195, "pid": 76337, "tid": -914061504, "ts": 1716454226897690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454226944644, "dur": 2833, "args": { "External id": 288196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288196, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 288196, "pid": 5, "tid": 7, "ts": 1716454226944644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897690, "dur": 5, "args": { "External id": 288196, "cbid": 211, "correlation": 288196 } }, { "ph": "s", "id": 288196, "pid": 76337, "tid": -914061504, "ts": 1716454226897690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226947478, "dur": 685, "args": { "External id": 288201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288201, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288201, "pid": 5, "tid": 7, "ts": 1716454226947478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897718, "dur": 9, "args": { "External id": 288201, "cbid": 211, "correlation": 288201 } }, { "ph": "s", "id": 288201, "pid": 76337, "tid": -914061504, "ts": 1716454226897718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226948164, "dur": 2702, "args": { "External id": 288209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288209, "pid": 5, "tid": 7, "ts": 1716454226948164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897752, "dur": 9, "args": { "External id": 288209, "cbid": 211, "correlation": 288209 } }, { "ph": "s", "id": 288209, "pid": 76337, "tid": -914061504, "ts": 1716454226897752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226950867, "dur": 401, "args": { "External id": 288217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288217, "registers per thread": 17, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288217, "pid": 5, "tid": 7, "ts": 1716454226950867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897782, "dur": 8, "args": { "External id": 288217, "cbid": 211, "correlation": 288217 } }, { "ph": "s", "id": 288217, "pid": 76337, "tid": -914061504, "ts": 1716454226897782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226951270, "dur": 7119, "args": { "External id": 288227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288227, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288227, "pid": 5, "tid": 7, "ts": 1716454226951270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897855, "dur": 12, "args": { "External id": 288227, "cbid": 211, "correlation": 288227 } }, { "ph": "s", "id": 288227, "pid": 76337, "tid": -914061504, "ts": 1716454226897855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454226958390, "dur": 784, "args": { "External id": 288248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288248, "registers per thread": 25, "shared memory": 768, "blocks per SM": 1.6, "warps per SM": 25.6, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 40 } }, { "ph": "f", "id": 288248, "pid": 5, "tid": 7, "ts": 1716454226958390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897887, "dur": 7, "args": { "External id": 288248, "cbid": 211, "correlation": 288248 } }, { "ph": "s", "id": 288248, "pid": 76337, "tid": -914061504, "ts": 1716454226897887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454226959176, "dur": 4, "args": { "External id": 288260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288260, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.025, "warps per SM": 0.2, "grid": [2, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 288260, "pid": 5, "tid": 7, "ts": 1716454226959176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897903, "dur": 7, "args": { "External id": 288260, "cbid": 211, "correlation": 288260 } }, { "ph": "s", "id": 288260, "pid": 76337, "tid": -914061504, "ts": 1716454226897903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226959181, "dur": 651, "args": { "External id": 288263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288263, "pid": 5, "tid": 7, "ts": 1716454226959181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897920, "dur": 6, "args": { "External id": 288263, "cbid": 211, "correlation": 288263 } }, { "ph": "s", "id": 288263, "pid": 76337, "tid": -914061504, "ts": 1716454226897920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226959833, "dur": 418, "args": { "External id": 288272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288272, "registers per thread": 24, "shared memory": 0, "blocks per SM": 2457.6, "warps per SM": 9830.4, "grid": [196608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288272, "pid": 5, "tid": 7, "ts": 1716454226959833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226897967, "dur": 18, "args": { "External id": 288272, "cbid": 211, "correlation": 288272 } }, { "ph": "s", "id": 288272, "pid": 76337, "tid": -914061504, "ts": 1716454226897967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454226898030, "dur": 0, "args": { "External id": 288282, "cbid": 317, "correlation": 288282 } }, { "ph": "f", "id": 288282, "pid": 76337, "tid": -914061504, "ts": 1716454226898030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454226898031, "dur": 0, "args": { "External id": 288283, "cbid": 203, "correlation": 288283 } }, { "ph": "f", "id": 288283, "pid": 76337, "tid": -914061504, "ts": 1716454226898031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454226898032, "dur": 0, "args": { "External id": 288284, "cbid": 205, "correlation": 288284 } }, { "ph": "f", "id": 288284, "pid": 76337, "tid": -914061504, "ts": 1716454226898032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226960253, "dur": 541, "args": { "External id": 288288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1228.8, "warps per SM": 9830.4, "grid": [6144, 4, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288288, "pid": 5, "tid": 7, "ts": 1716454226960253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226900409, "dur": 24, "args": { "External id": 288288, "cbid": 211, "correlation": 288288 } }, { "ph": "s", "id": 288288, "pid": 76337, "tid": -914061504, "ts": 1716454226900409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454226960795, "dur": 3, "args": { "External id": 288290, "device": 5, "context": 1, "stream": 7, "correlation": 288290, "bytes": 18432, "memory bandwidth (GB/s)": 5.052631578947368 } }, { "ph": "f", "id": 288290, "pid": 5, "tid": 7, "ts": 1716454226960795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226900437, "dur": 25, "args": { "External id": 288290, "cbid": 51, "correlation": 288290 } }, { "ph": "s", "id": 288290, "pid": 76337, "tid": -914061504, "ts": 1716454226900437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454226900468, "dur": 2, "args": { "External id": 288292, "cbid": 200, "correlation": 288292 } }, { "ph": "f", "id": 288292, "pid": 76337, "tid": -914061504, "ts": 1716454226900468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454226900470, "dur": 0, "args": { "External id": 288293, "cbid": 200, "correlation": 288293 } }, { "ph": "f", "id": 288293, "pid": 76337, "tid": -914061504, "ts": 1716454226900470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454226900471, "dur": 0, "args": { "External id": 288294, "cbid": 200, "correlation": 288294 } }, { "ph": "f", "id": 288294, "pid": 76337, "tid": -914061504, "ts": 1716454226900471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454226900471, "dur": 0, "args": { "External id": 288295, "cbid": 200, "correlation": 288295 } }, { "ph": "f", "id": 288295, "pid": 76337, "tid": -914061504, "ts": 1716454226900471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454226900472, "dur": 4, "args": { "External id": 288296, "cbid": 15, "correlation": 288296 } }, { "ph": "f", "id": 288296, "pid": 76337, "tid": -914061504, "ts": 1716454226900472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454226900477, "dur": 3, "args": { "External id": 288297, "cbid": 251, "correlation": 288297 } }, { "ph": "f", "id": 288297, "pid": 76337, "tid": -914061504, "ts": 1716454226900477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454226960800, "dur": 21, "args": { "External id": 288298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288298, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288298, "pid": 5, "tid": 7, "ts": 1716454226960800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226900482, "dur": 10, "args": { "External id": 288298, "cbid": 211, "correlation": 288298 } }, { "ph": "s", "id": 288298, "pid": 76337, "tid": -914061504, "ts": 1716454226900482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x32_stage1_warpsize4x1x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454226960833, "dur": 1072, "args": { "External id": 288300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288300, "registers per thread": 128, "shared memory": 10240, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [1, 6144, 1], "block": [128, 1, 1], "est. achieved occupancy %": 25 } }, { "ph": "f", "id": 288300, "pid": 5, "tid": 7, "ts": 1716454226960833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226900509, "dur": 190, "args": { "External id": 288300, "cbid": 211, "correlation": 288300 } }, { "ph": "s", "id": 288300, "pid": 76337, "tid": -914061504, "ts": 1716454226900509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454226961907, "dur": 74, "args": { "External id": 288302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 307.2, "warps per SM": 2457.6, "grid": [6144, 1, 4], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288302, "pid": 5, "tid": 7, "ts": 1716454226961907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226900707, "dur": 9, "args": { "External id": 288302, "cbid": 211, "correlation": 288302 } }, { "ph": "s", "id": 288302, "pid": 76337, "tid": -914061504, "ts": 1716454226900707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454226961982, "dur": 20, "args": { "External id": 288308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 57.6, "warps per SM": 230.4, "grid": [4608, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288308, "pid": 5, "tid": 7, "ts": 1716454226961982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901387, "dur": 13, "args": { "External id": 288308, "cbid": 211, "correlation": 288308 } }, { "ph": "s", "id": 288308, "pid": 76337, "tid": -914061504, "ts": 1716454226901387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962004, "dur": 5, "args": { "External id": 288318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288318, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288318, "pid": 5, "tid": 7, "ts": 1716454226962004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901689, "dur": 16, "args": { "External id": 288318, "cbid": 211, "correlation": 288318 } }, { "ph": "s", "id": 288318, "pid": 76337, "tid": -914061504, "ts": 1716454226901689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962011, "dur": 5, "args": { "External id": 288326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288326, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288326, "pid": 5, "tid": 7, "ts": 1716454226962011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901731, "dur": 9, "args": { "External id": 288326, "cbid": 211, "correlation": 288326 } }, { "ph": "s", "id": 288326, "pid": 76337, "tid": -914061504, "ts": 1716454226901731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962017, "dur": 6, "args": { "External id": 288334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288334, "registers per thread": 18, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288334, "pid": 5, "tid": 7, "ts": 1716454226962017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901825, "dur": 16, "args": { "External id": 288334, "cbid": 211, "correlation": 288334 } }, { "ph": "s", "id": 288334, "pid": 76337, "tid": -914061504, "ts": 1716454226901825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962024, "dur": 5, "args": { "External id": 288344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288344, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288344, "pid": 5, "tid": 7, "ts": 1716454226962024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901877, "dur": 10, "args": { "External id": 288344, "cbid": 211, "correlation": 288344 } }, { "ph": "s", "id": 288344, "pid": 76337, "tid": -914061504, "ts": 1716454226901877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962030, "dur": 4, "args": { "External id": 288352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288352, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288352, "pid": 5, "tid": 7, "ts": 1716454226962030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901905, "dur": 8, "args": { "External id": 288352, "cbid": 211, "correlation": 288352 } }, { "ph": "s", "id": 288352, "pid": 76337, "tid": -914061504, "ts": 1716454226901905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962036, "dur": 4, "args": { "External id": 288360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288360, "registers per thread": 18, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288360, "pid": 5, "tid": 7, "ts": 1716454226962036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901930, "dur": 7, "args": { "External id": 288360, "cbid": 211, "correlation": 288360 } }, { "ph": "s", "id": 288360, "pid": 76337, "tid": -914061504, "ts": 1716454226901930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962041, "dur": 5, "args": { "External id": 288370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288370, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288370, "pid": 5, "tid": 7, "ts": 1716454226962041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226901967, "dur": 18, "args": { "External id": 288370, "cbid": 211, "correlation": 288370 } }, { "ph": "s", "id": 288370, "pid": 76337, "tid": -914061504, "ts": 1716454226901967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962047, "dur": 4, "args": { "External id": 288378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288378, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288378, "pid": 5, "tid": 7, "ts": 1716454226962047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902006, "dur": 8, "args": { "External id": 288378, "cbid": 211, "correlation": 288378 } }, { "ph": "s", "id": 288378, "pid": 76337, "tid": -914061504, "ts": 1716454226902006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962053, "dur": 4, "args": { "External id": 288386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288386, "registers per thread": 18, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288386, "pid": 5, "tid": 7, "ts": 1716454226962053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902032, "dur": 8, "args": { "External id": 288386, "cbid": 211, "correlation": 288386 } }, { "ph": "s", "id": 288386, "pid": 76337, "tid": -914061504, "ts": 1716454226902032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962059, "dur": 5, "args": { "External id": 288396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288396, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288396, "pid": 5, "tid": 7, "ts": 1716454226962059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902073, "dur": 10, "args": { "External id": 288396, "cbid": 211, "correlation": 288396 } }, { "ph": "s", "id": 288396, "pid": 76337, "tid": -914061504, "ts": 1716454226902073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962065, "dur": 4, "args": { "External id": 288404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288404, "registers per thread": 17, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288404, "pid": 5, "tid": 7, "ts": 1716454226962065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902100, "dur": 9, "args": { "External id": 288404, "cbid": 211, "correlation": 288404 } }, { "ph": "s", "id": 288404, "pid": 76337, "tid": -914061504, "ts": 1716454226902100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454226962070, "dur": 4, "args": { "External id": 288412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288412, "registers per thread": 18, "shared memory": 0, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [1152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 90 } }, { "ph": "f", "id": 288412, "pid": 5, "tid": 7, "ts": 1716454226962070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902129, "dur": 8, "args": { "External id": 288412, "cbid": 211, "correlation": 288412 } }, { "ph": "s", "id": 288412, "pid": 76337, "tid": -914061504, "ts": 1716454226902129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454226962076, "dur": 18, "args": { "External id": 288421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 288421, "registers per thread": 26, "shared memory": 0, "blocks per SM": 8, "warps per SM": 128, "grid": [160, 4, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 288421, "pid": 5, "tid": 7, "ts": 1716454226962076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454226902190, "dur": 17, "args": { "External id": 288421, "cbid": 211, "correlation": 288421 } }, { "ph": "s", "id": 288421, "pid": 76337, "tid": -914061504, "ts": 1716454226902190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454226962096, "dur": 852, "args": { "External id": 288426, "device": 5, "context": 1, "stream": 7, "correlation": 288426, "bytes": 4718592, "memory bandwidth (GB/s)": 5.536434035762895 } }, { "ph": "f", "id": 288426, "pid": 5, "tid": 7, "ts": 1716454226962096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454226902270, "dur": 61007, "args": { "External id": 288426, "cbid": 41, "correlation": 288426 } }, { "ph": "s", "id": 288426, "pid": 76337, "tid": -914061504, "ts": 1716454226902270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454226963279, "dur": 12, "args": { "External id": 288427, "cbid": 131, "correlation": 288427 } }, { "ph": "f", "id": 288427, "pid": 76337, "tid": -914061504, "ts": 1716454226963279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454227004833, "dur": 81, "args": { "External id": 288431, "cbid": 165, "correlation": 288431 } }, { "ph": "f", "id": 288431, "pid": 76337, "tid": -914061504, "ts": 1716454227004833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222500365, "dur": 1, "args": { "External id": 72204, "device": 5, "context": 1, "stream": 7, "correlation": 72204, "bytes": 8, "memory bandwidth (GB/s)": 0.00423728813559322 } }, { "ph": "f", "id": 72204, "pid": 5, "tid": 7, "ts": 1716454222500365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222500334, "dur": 29, "args": { "External id": 72204, "cbid": 41, "correlation": 72204 } }, { "ph": "s", "id": 72204, "pid": 76337, "tid": -914061504, "ts": 1716454222500334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222500365, "dur": 11, "args": { "External id": 72205, "cbid": 131, "correlation": 72205 } }, { "ph": "f", "id": 72205, "pid": 76337, "tid": -914061504, "ts": 1716454222500365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222500459, "dur": 3, "args": { "External id": 72212, "device": 5, "context": 1, "stream": 7, "correlation": 72212, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 72212, "pid": 5, "tid": 7, "ts": 1716454222500459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222500428, "dur": 31, "args": { "External id": 72212, "cbid": 41, "correlation": 72212 } }, { "ph": "s", "id": 72212, "pid": 76337, "tid": -914061504, "ts": 1716454222500428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222500512, "dur": 1, "args": { "External id": 72223, "device": 5, "context": 1, "stream": 7, "correlation": 72223, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 72223, "pid": 5, "tid": 7, "ts": 1716454222500512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222500498, "dur": 11, "args": { "External id": 72223, "cbid": 41, "correlation": 72223 } }, { "ph": "s", "id": 72223, "pid": 76337, "tid": -914061504, "ts": 1716454222500498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222500510, "dur": 9, "args": { "External id": 72224, "cbid": 131, "correlation": 72224 } }, { "ph": "f", "id": 72224, "pid": 76337, "tid": -914061504, "ts": 1716454222500510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500575, "dur": 3, "args": { "External id": 72232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72232, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72232, "pid": 5, "tid": 7, "ts": 1716454222500575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500557, "dur": 20, "args": { "External id": 72232, "cbid": 211, "correlation": 72232 } }, { "ph": "s", "id": 72232, "pid": 76337, "tid": -914061504, "ts": 1716454222500557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500620, "dur": 3, "args": { "External id": 72242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72242, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72242, "pid": 5, "tid": 7, "ts": 1716454222500620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500607, "dur": 13, "args": { "External id": 72242, "cbid": 211, "correlation": 72242 } }, { "ph": "s", "id": 72242, "pid": 76337, "tid": -914061504, "ts": 1716454222500607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500663, "dur": 3, "args": { "External id": 72251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72251, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72251, "pid": 5, "tid": 7, "ts": 1716454222500663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500648, "dur": 14, "args": { "External id": 72251, "cbid": 211, "correlation": 72251 } }, { "ph": "s", "id": 72251, "pid": 76337, "tid": -914061504, "ts": 1716454222500648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222500783, "dur": 5, "args": { "External id": 72259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72259, "pid": 5, "tid": 7, "ts": 1716454222500783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500767, "dur": 16, "args": { "External id": 72259, "cbid": 211, "correlation": 72259 } }, { "ph": "s", "id": 72259, "pid": 76337, "tid": -914061504, "ts": 1716454222500767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500830, "dur": 3, "args": { "External id": 72268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72268, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72268, "pid": 5, "tid": 7, "ts": 1716454222500830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500819, "dur": 9, "args": { "External id": 72268, "cbid": 211, "correlation": 72268 } }, { "ph": "s", "id": 72268, "pid": 76337, "tid": -914061504, "ts": 1716454222500819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500854, "dur": 3, "args": { "External id": 72277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72277, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72277, "pid": 5, "tid": 7, "ts": 1716454222500854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500845, "dur": 8, "args": { "External id": 72277, "cbid": 211, "correlation": 72277 } }, { "ph": "s", "id": 72277, "pid": 76337, "tid": -914061504, "ts": 1716454222500845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222500925, "dur": 3, "args": { "External id": 72285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72285, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72285, "pid": 5, "tid": 7, "ts": 1716454222500925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222500914, "dur": 9, "args": { "External id": 72285, "cbid": 211, "correlation": 72285 } }, { "ph": "s", "id": 72285, "pid": 76337, "tid": -914061504, "ts": 1716454222500914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222501011, "dur": 1, "args": { "External id": 72293, "device": 5, "context": 1, "stream": 7, "correlation": 72293, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 72293, "pid": 5, "tid": 7, "ts": 1716454222501011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222500988, "dur": 32, "args": { "External id": 72293, "cbid": 41, "correlation": 72293 } }, { "ph": "s", "id": 72293, "pid": 76337, "tid": -914061504, "ts": 1716454222500988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501021, "dur": 4, "args": { "External id": 72294, "cbid": 131, "correlation": 72294 } }, { "ph": "f", "id": 72294, "pid": 76337, "tid": -914061504, "ts": 1716454222501021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222501089, "dur": 1, "args": { "External id": 72304, "device": 5, "context": 1, "stream": 7, "correlation": 72304, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 72304, "pid": 5, "tid": 7, "ts": 1716454222501089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222501075, "dur": 11, "args": { "External id": 72304, "cbid": 41, "correlation": 72304 } }, { "ph": "s", "id": 72304, "pid": 76337, "tid": -914061504, "ts": 1716454222501075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501087, "dur": 8, "args": { "External id": 72305, "cbid": 131, "correlation": 72305 } }, { "ph": "f", "id": 72305, "pid": 76337, "tid": -914061504, "ts": 1716454222501087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222501148, "dur": 1, "args": { "External id": 72314, "device": 5, "context": 1, "stream": 7, "correlation": 72314, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 72314, "pid": 5, "tid": 7, "ts": 1716454222501148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222501136, "dur": 10, "args": { "External id": 72314, "cbid": 41, "correlation": 72314 } }, { "ph": "s", "id": 72314, "pid": 76337, "tid": -914061504, "ts": 1716454222501136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501146, "dur": 8, "args": { "External id": 72315, "cbid": 131, "correlation": 72315 } }, { "ph": "f", "id": 72315, "pid": 76337, "tid": -914061504, "ts": 1716454222501146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501222, "dur": 4, "args": { "External id": 72322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72322, "pid": 5, "tid": 7, "ts": 1716454222501222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501205, "dur": 17, "args": { "External id": 72322, "cbid": 211, "correlation": 72322 } }, { "ph": "s", "id": 72322, "pid": 76337, "tid": -914061504, "ts": 1716454222501205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454222501259, "dur": 4, "args": { "External id": 72342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72342, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72342, "pid": 5, "tid": 7, "ts": 1716454222501259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501248, "dur": 12, "args": { "External id": 72342, "cbid": 211, "correlation": 72342 } }, { "ph": "s", "id": 72342, "pid": 76337, "tid": -914061504, "ts": 1716454222501248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501261, "dur": 0, "args": { "External id": 72343, "cbid": 11, "correlation": 72343 } }, { "ph": "f", "id": 72343, "pid": 76337, "tid": -914061504, "ts": 1716454222501261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501261, "dur": 0, "args": { "External id": 72344, "cbid": 11, "correlation": 72344 } }, { "ph": "f", "id": 72344, "pid": 76337, "tid": -914061504, "ts": 1716454222501261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222501277, "dur": 1, "args": { "External id": 72347, "device": 5, "context": 1, "stream": 7, "correlation": 72347, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 72347, "pid": 5, "tid": 7, "ts": 1716454222501277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222501263, "dur": 24, "args": { "External id": 72347, "cbid": 41, "correlation": 72347 } }, { "ph": "s", "id": 72347, "pid": 76337, "tid": -914061504, "ts": 1716454222501263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501287, "dur": 3, "args": { "External id": 72348, "cbid": 131, "correlation": 72348 } }, { "ph": "f", "id": 72348, "pid": 76337, "tid": -914061504, "ts": 1716454222501287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222501318, "dur": 3, "args": { "External id": 72372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72372, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72372, "pid": 5, "tid": 7, "ts": 1716454222501318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501308, "dur": 9, "args": { "External id": 72372, "cbid": 211, "correlation": 72372 } }, { "ph": "s", "id": 72372, "pid": 76337, "tid": -914061504, "ts": 1716454222501308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501318, "dur": 0, "args": { "External id": 72373, "cbid": 11, "correlation": 72373 } }, { "ph": "f", "id": 72373, "pid": 76337, "tid": -914061504, "ts": 1716454222501318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501318, "dur": 0, "args": { "External id": 72374, "cbid": 11, "correlation": 72374 } }, { "ph": "f", "id": 72374, "pid": 76337, "tid": -914061504, "ts": 1716454222501318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222501320, "dur": 1, "args": { "External id": 72376, "cbid": 200, "correlation": 72376 } }, { "ph": "f", "id": 72376, "pid": 76337, "tid": -914061504, "ts": 1716454222501320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454222501331, "dur": 4, "args": { "External id": 72378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72378, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72378, "pid": 5, "tid": 7, "ts": 1716454222501331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501323, "dur": 9, "args": { "External id": 72378, "cbid": 211, "correlation": 72378 } }, { "ph": "s", "id": 72378, "pid": 76337, "tid": -914061504, "ts": 1716454222501323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501333, "dur": 0, "args": { "External id": 72379, "cbid": 11, "correlation": 72379 } }, { "ph": "f", "id": 72379, "pid": 76337, "tid": -914061504, "ts": 1716454222501333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222501334, "dur": 0, "args": { "External id": 72380, "cbid": 11, "correlation": 72380 } }, { "ph": "f", "id": 72380, "pid": 76337, "tid": -914061504, "ts": 1716454222501334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222501374, "dur": 1, "args": { "External id": 72387, "device": 5, "context": 1, "stream": 7, "correlation": 72387, "bytes": 8, "memory bandwidth (GB/s)": 0.0047169811320754715 } }, { "ph": "f", "id": 72387, "pid": 5, "tid": 7, "ts": 1716454222501374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222501361, "dur": 21, "args": { "External id": 72387, "cbid": 41, "correlation": 72387 } }, { "ph": "s", "id": 72387, "pid": 76337, "tid": -914061504, "ts": 1716454222501361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501383, "dur": 3, "args": { "External id": 72388, "cbid": 131, "correlation": 72388 } }, { "ph": "f", "id": 72388, "pid": 76337, "tid": -914061504, "ts": 1716454222501383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222501434, "dur": 1, "args": { "External id": 72398, "device": 5, "context": 1, "stream": 7, "correlation": 72398, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 72398, "pid": 5, "tid": 7, "ts": 1716454222501434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222501421, "dur": 11, "args": { "External id": 72398, "cbid": 41, "correlation": 72398 } }, { "ph": "s", "id": 72398, "pid": 76337, "tid": -914061504, "ts": 1716454222501421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222501433, "dur": 8, "args": { "External id": 72399, "cbid": 131, "correlation": 72399 } }, { "ph": "f", "id": 72399, "pid": 76337, "tid": -914061504, "ts": 1716454222501433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501508, "dur": 5, "args": { "External id": 72406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72406, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72406, "pid": 5, "tid": 7, "ts": 1716454222501508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501492, "dur": 16, "args": { "External id": 72406, "cbid": 211, "correlation": 72406 } }, { "ph": "s", "id": 72406, "pid": 76337, "tid": -914061504, "ts": 1716454222501492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501581, "dur": 3, "args": { "External id": 72415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72415, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72415, "pid": 5, "tid": 7, "ts": 1716454222501581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501568, "dur": 13, "args": { "External id": 72415, "cbid": 211, "correlation": 72415 } }, { "ph": "s", "id": 72415, "pid": 76337, "tid": -914061504, "ts": 1716454222501568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501620, "dur": 3, "args": { "External id": 72423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72423, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72423, "pid": 5, "tid": 7, "ts": 1716454222501620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501609, "dur": 11, "args": { "External id": 72423, "cbid": 211, "correlation": 72423 } }, { "ph": "s", "id": 72423, "pid": 76337, "tid": -914061504, "ts": 1716454222501609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501655, "dur": 4, "args": { "External id": 72431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72431, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72431, "pid": 5, "tid": 7, "ts": 1716454222501655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501642, "dur": 12, "args": { "External id": 72431, "cbid": 211, "correlation": 72431 } }, { "ph": "s", "id": 72431, "pid": 76337, "tid": -914061504, "ts": 1716454222501642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501687, "dur": 4, "args": { "External id": 72439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72439, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72439, "pid": 5, "tid": 7, "ts": 1716454222501687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501676, "dur": 10, "args": { "External id": 72439, "cbid": 211, "correlation": 72439 } }, { "ph": "s", "id": 72439, "pid": 76337, "tid": -914061504, "ts": 1716454222501676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501714, "dur": 3, "args": { "External id": 72447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72447, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72447, "pid": 5, "tid": 7, "ts": 1716454222501714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501704, "dur": 8, "args": { "External id": 72447, "cbid": 211, "correlation": 72447 } }, { "ph": "s", "id": 72447, "pid": 76337, "tid": -914061504, "ts": 1716454222501704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501740, "dur": 3, "args": { "External id": 72455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72455, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72455, "pid": 5, "tid": 7, "ts": 1716454222501740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501730, "dur": 8, "args": { "External id": 72455, "cbid": 211, "correlation": 72455 } }, { "ph": "s", "id": 72455, "pid": 76337, "tid": -914061504, "ts": 1716454222501730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501762, "dur": 4, "args": { "External id": 72463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72463, "pid": 5, "tid": 7, "ts": 1716454222501762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501753, "dur": 7, "args": { "External id": 72463, "cbid": 211, "correlation": 72463 } }, { "ph": "s", "id": 72463, "pid": 76337, "tid": -914061504, "ts": 1716454222501753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501780, "dur": 4, "args": { "External id": 72471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72471, "pid": 5, "tid": 7, "ts": 1716454222501780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501771, "dur": 7, "args": { "External id": 72471, "cbid": 211, "correlation": 72471 } }, { "ph": "s", "id": 72471, "pid": 76337, "tid": -914061504, "ts": 1716454222501771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501800, "dur": 3, "args": { "External id": 72479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72479, "pid": 5, "tid": 7, "ts": 1716454222501800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501791, "dur": 7, "args": { "External id": 72479, "cbid": 211, "correlation": 72479 } }, { "ph": "s", "id": 72479, "pid": 76337, "tid": -914061504, "ts": 1716454222501791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501860, "dur": 3, "args": { "External id": 72487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72487, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72487, "pid": 5, "tid": 7, "ts": 1716454222501860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501847, "dur": 12, "args": { "External id": 72487, "cbid": 211, "correlation": 72487 } }, { "ph": "s", "id": 72487, "pid": 76337, "tid": -914061504, "ts": 1716454222501847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501885, "dur": 4, "args": { "External id": 72495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72495, "pid": 5, "tid": 7, "ts": 1716454222501885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501876, "dur": 7, "args": { "External id": 72495, "cbid": 211, "correlation": 72495 } }, { "ph": "s", "id": 72495, "pid": 76337, "tid": -914061504, "ts": 1716454222501876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222501908, "dur": 4, "args": { "External id": 72503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72503, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72503, "pid": 5, "tid": 7, "ts": 1716454222501908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501898, "dur": 8, "args": { "External id": 72503, "cbid": 211, "correlation": 72503 } }, { "ph": "s", "id": 72503, "pid": 76337, "tid": -914061504, "ts": 1716454222501898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222501926, "dur": 3, "args": { "External id": 72511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72511, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 72511, "pid": 5, "tid": 7, "ts": 1716454222501926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222501918, "dur": 7, "args": { "External id": 72511, "cbid": 211, "correlation": 72511 } }, { "ph": "s", "id": 72511, "pid": 76337, "tid": -914061504, "ts": 1716454222501918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222502324, "dur": 5, "args": { "External id": 72520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72520, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72520, "pid": 5, "tid": 7, "ts": 1716454222502324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502306, "dur": 18, "args": { "External id": 72520, "cbid": 211, "correlation": 72520 } }, { "ph": "s", "id": 72520, "pid": 76337, "tid": -914061504, "ts": 1716454222502306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222502363, "dur": 5, "args": { "External id": 72529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72529, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72529, "pid": 5, "tid": 7, "ts": 1716454222502363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502352, "dur": 9, "args": { "External id": 72529, "cbid": 211, "correlation": 72529 } }, { "ph": "s", "id": 72529, "pid": 76337, "tid": -914061504, "ts": 1716454222502352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222502501, "dur": 3, "args": { "External id": 72545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72545, "pid": 5, "tid": 7, "ts": 1716454222502501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502486, "dur": 15, "args": { "External id": 72545, "cbid": 211, "correlation": 72545 } }, { "ph": "s", "id": 72545, "pid": 76337, "tid": -914061504, "ts": 1716454222502486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502537, "dur": 3, "args": { "External id": 72553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72553, "pid": 5, "tid": 7, "ts": 1716454222502537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502528, "dur": 9, "args": { "External id": 72553, "cbid": 211, "correlation": 72553 } }, { "ph": "s", "id": 72553, "pid": 76337, "tid": -914061504, "ts": 1716454222502528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502571, "dur": 3, "args": { "External id": 72561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72561, "pid": 5, "tid": 7, "ts": 1716454222502571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502560, "dur": 10, "args": { "External id": 72561, "cbid": 211, "correlation": 72561 } }, { "ph": "s", "id": 72561, "pid": 76337, "tid": -914061504, "ts": 1716454222502560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502604, "dur": 4, "args": { "External id": 72569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72569, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72569, "pid": 5, "tid": 7, "ts": 1716454222502604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502594, "dur": 9, "args": { "External id": 72569, "cbid": 211, "correlation": 72569 } }, { "ph": "s", "id": 72569, "pid": 76337, "tid": -914061504, "ts": 1716454222502594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222502662, "dur": 4, "args": { "External id": 72581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72581, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72581, "pid": 5, "tid": 7, "ts": 1716454222502662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502649, "dur": 13, "args": { "External id": 72581, "cbid": 211, "correlation": 72581 } }, { "ph": "s", "id": 72581, "pid": 76337, "tid": -914061504, "ts": 1716454222502649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222502710, "dur": 4, "args": { "External id": 72592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72592, "pid": 5, "tid": 7, "ts": 1716454222502710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502698, "dur": 12, "args": { "External id": 72592, "cbid": 211, "correlation": 72592 } }, { "ph": "s", "id": 72592, "pid": 76337, "tid": -914061504, "ts": 1716454222502698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502741, "dur": 3, "args": { "External id": 72600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72600, "pid": 5, "tid": 7, "ts": 1716454222502741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502732, "dur": 8, "args": { "External id": 72600, "cbid": 211, "correlation": 72600 } }, { "ph": "s", "id": 72600, "pid": 76337, "tid": -914061504, "ts": 1716454222502732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502773, "dur": 5, "args": { "External id": 72608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72608, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72608, "pid": 5, "tid": 7, "ts": 1716454222502773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502763, "dur": 10, "args": { "External id": 72608, "cbid": 211, "correlation": 72608 } }, { "ph": "s", "id": 72608, "pid": 76337, "tid": -914061504, "ts": 1716454222502763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222502802, "dur": 5, "args": { "External id": 72616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72616, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72616, "pid": 5, "tid": 7, "ts": 1716454222502802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502792, "dur": 9, "args": { "External id": 72616, "cbid": 211, "correlation": 72616 } }, { "ph": "s", "id": 72616, "pid": 76337, "tid": -914061504, "ts": 1716454222502792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222502833, "dur": 4, "args": { "External id": 72625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72625, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72625, "pid": 5, "tid": 7, "ts": 1716454222502833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502822, "dur": 11, "args": { "External id": 72625, "cbid": 211, "correlation": 72625 } }, { "ph": "s", "id": 72625, "pid": 76337, "tid": -914061504, "ts": 1716454222502822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222502894, "dur": 4, "args": { "External id": 72638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72638, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72638, "pid": 5, "tid": 7, "ts": 1716454222502894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502881, "dur": 13, "args": { "External id": 72638, "cbid": 211, "correlation": 72638 } }, { "ph": "s", "id": 72638, "pid": 76337, "tid": -914061504, "ts": 1716454222502881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222502937, "dur": 5, "args": { "External id": 72648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72648, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 72648, "pid": 5, "tid": 7, "ts": 1716454222502937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222502924, "dur": 12, "args": { "External id": 72648, "cbid": 211, "correlation": 72648 } }, { "ph": "s", "id": 72648, "pid": 76337, "tid": -914061504, "ts": 1716454222502924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222503079, "dur": 5, "args": { "External id": 72665, "cbid": 251, "correlation": 72665 } }, { "ph": "f", "id": 72665, "pid": 76337, "tid": -914061504, "ts": 1716454222503079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454222503110, "dur": 12, "args": { "External id": 72667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72667, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72667, "pid": 5, "tid": 7, "ts": 1716454222503110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503094, "dur": 17, "args": { "External id": 72667, "cbid": 211, "correlation": 72667 } }, { "ph": "s", "id": 72667, "pid": 76337, "tid": -914061504, "ts": 1716454222503094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222503174, "dur": 4, "args": { "External id": 72675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72675, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72675, "pid": 5, "tid": 7, "ts": 1716454222503174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503161, "dur": 13, "args": { "External id": 72675, "cbid": 211, "correlation": 72675 } }, { "ph": "s", "id": 72675, "pid": 76337, "tid": -914061504, "ts": 1716454222503161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222503232, "dur": 2, "args": { "External id": 72691, "cbid": 251, "correlation": 72691 } }, { "ph": "f", "id": 72691, "pid": 76337, "tid": -914061504, "ts": 1716454222503232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222503238, "dur": 0, "args": { "External id": 72693, "cbid": 251, "correlation": 72693 } }, { "ph": "f", "id": 72693, "pid": 76337, "tid": -914061504, "ts": 1716454222503238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222503255, "dur": 13, "args": { "External id": 72694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72694, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72694, "pid": 5, "tid": 7, "ts": 1716454222503255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503241, "dur": 14, "args": { "External id": 72694, "cbid": 211, "correlation": 72694 } }, { "ph": "s", "id": 72694, "pid": 76337, "tid": -914061504, "ts": 1716454222503241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222503270, "dur": 5, "args": { "External id": 72696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72696, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72696, "pid": 5, "tid": 7, "ts": 1716454222503270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503259, "dur": 9, "args": { "External id": 72696, "cbid": 211, "correlation": 72696 } }, { "ph": "s", "id": 72696, "pid": 76337, "tid": -914061504, "ts": 1716454222503259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222503419, "dur": 4, "args": { "External id": 72706, "cbid": 317, "correlation": 72706 } }, { "ph": "f", "id": 72706, "pid": 76337, "tid": -914061504, "ts": 1716454222503419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222503423, "dur": 1, "args": { "External id": 72707, "cbid": 203, "correlation": 72707 } }, { "ph": "f", "id": 72707, "pid": 76337, "tid": -914061504, "ts": 1716454222503423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222503425, "dur": 1, "args": { "External id": 72708, "cbid": 205, "correlation": 72708 } }, { "ph": "f", "id": 72708, "pid": 76337, "tid": -914061504, "ts": 1716454222503425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222503494, "dur": 7, "args": { "External id": 72712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72712, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72712, "pid": 5, "tid": 7, "ts": 1716454222503494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503478, "dur": 15, "args": { "External id": 72712, "cbid": 211, "correlation": 72712 } }, { "ph": "s", "id": 72712, "pid": 76337, "tid": -914061504, "ts": 1716454222503478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222503504, "dur": 4, "args": { "External id": 72714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72714, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 72714, "pid": 5, "tid": 7, "ts": 1716454222503504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503497, "dur": 6, "args": { "External id": 72714, "cbid": 211, "correlation": 72714 } }, { "ph": "s", "id": 72714, "pid": 76337, "tid": -914061504, "ts": 1716454222503497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222503547, "dur": 4, "args": { "External id": 72716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72716, "pid": 5, "tid": 7, "ts": 1716454222503547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503536, "dur": 9, "args": { "External id": 72716, "cbid": 211, "correlation": 72716 } }, { "ph": "s", "id": 72716, "pid": 76337, "tid": -914061504, "ts": 1716454222503536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222503557, "dur": 1, "args": { "External id": 72717, "cbid": 51, "correlation": 72717 } }, { "ph": "s", "id": 72717, "pid": 76337, "tid": -914061504, "ts": 1716454222503557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222503568, "dur": 87, "args": { "External id": 72718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72718, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72718, "pid": 5, "tid": 7, "ts": 1716454222503568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503559, "dur": 8, "args": { "External id": 72718, "cbid": 211, "correlation": 72718 } }, { "ph": "s", "id": 72718, "pid": 76337, "tid": -914061504, "ts": 1716454222503559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222503656, "dur": 60, "args": { "External id": 72723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72723, "pid": 5, "tid": 7, "ts": 1716454222503656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222503598, "dur": 11, "args": { "External id": 72723, "cbid": 211, "correlation": 72723 } }, { "ph": "s", "id": 72723, "pid": 76337, "tid": -914061504, "ts": 1716454222503598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222505498, "dur": 52, "args": { "External id": 72743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72743, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 72743, "pid": 5, "tid": 7, "ts": 1716454222505498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505481, "dur": 17, "args": { "External id": 72743, "cbid": 211, "correlation": 72743 } }, { "ph": "s", "id": 72743, "pid": 76337, "tid": -914061504, "ts": 1716454222505481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222505551, "dur": 4, "args": { "External id": 72755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72755, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72755, "pid": 5, "tid": 7, "ts": 1716454222505551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505511, "dur": 8, "args": { "External id": 72755, "cbid": 211, "correlation": 72755 } }, { "ph": "s", "id": 72755, "pid": 76337, "tid": -914061504, "ts": 1716454222505511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222505557, "dur": 58, "args": { "External id": 72758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72758, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72758, "pid": 5, "tid": 7, "ts": 1716454222505557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505536, "dur": 8, "args": { "External id": 72758, "cbid": 211, "correlation": 72758 } }, { "ph": "s", "id": 72758, "pid": 76337, "tid": -914061504, "ts": 1716454222505536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222505616, "dur": 37, "args": { "External id": 72767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72767, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72767, "pid": 5, "tid": 7, "ts": 1716454222505616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505582, "dur": 10, "args": { "External id": 72767, "cbid": 211, "correlation": 72767 } }, { "ph": "s", "id": 72767, "pid": 76337, "tid": -914061504, "ts": 1716454222505582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222505642, "dur": 0, "args": { "External id": 72777, "cbid": 317, "correlation": 72777 } }, { "ph": "f", "id": 72777, "pid": 76337, "tid": -914061504, "ts": 1716454222505642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222505642, "dur": 0, "args": { "External id": 72778, "cbid": 203, "correlation": 72778 } }, { "ph": "f", "id": 72778, "pid": 76337, "tid": -914061504, "ts": 1716454222505642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222505643, "dur": 0, "args": { "External id": 72779, "cbid": 205, "correlation": 72779 } }, { "ph": "f", "id": 72779, "pid": 76337, "tid": -914061504, "ts": 1716454222505643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222505675, "dur": 40, "args": { "External id": 72783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72783, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72783, "pid": 5, "tid": 7, "ts": 1716454222505675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505662, "dur": 12, "args": { "External id": 72783, "cbid": 211, "correlation": 72783 } }, { "ph": "s", "id": 72783, "pid": 76337, "tid": -914061504, "ts": 1716454222505662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222505717, "dur": 14, "args": { "External id": 72785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72785, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72785, "pid": 5, "tid": 7, "ts": 1716454222505717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505677, "dur": 6, "args": { "External id": 72785, "cbid": 211, "correlation": 72785 } }, { "ph": "s", "id": 72785, "pid": 76337, "tid": -914061504, "ts": 1716454222505677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222505733, "dur": 3, "args": { "External id": 72787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72787, "pid": 5, "tid": 7, "ts": 1716454222505733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505701, "dur": 8, "args": { "External id": 72787, "cbid": 211, "correlation": 72787 } }, { "ph": "s", "id": 72787, "pid": 76337, "tid": -914061504, "ts": 1716454222505701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222505718, "dur": 0, "args": { "External id": 72788, "cbid": 51, "correlation": 72788 } }, { "ph": "s", "id": 72788, "pid": 76337, "tid": -914061504, "ts": 1716454222505718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222505738, "dur": 712, "args": { "External id": 72789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72789, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72789, "pid": 5, "tid": 7, "ts": 1716454222505738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505719, "dur": 8, "args": { "External id": 72789, "cbid": 211, "correlation": 72789 } }, { "ph": "s", "id": 72789, "pid": 76337, "tid": -914061504, "ts": 1716454222505719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222506451, "dur": 59, "args": { "External id": 72794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72794, "pid": 5, "tid": 7, "ts": 1716454222506451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505751, "dur": 10, "args": { "External id": 72794, "cbid": 211, "correlation": 72794 } }, { "ph": "s", "id": 72794, "pid": 76337, "tid": -914061504, "ts": 1716454222505751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222506511, "dur": 3, "args": { "External id": 72802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72802, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72802, "pid": 5, "tid": 7, "ts": 1716454222506511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505797, "dur": 9, "args": { "External id": 72802, "cbid": 211, "correlation": 72802 } }, { "ph": "s", "id": 72802, "pid": 76337, "tid": -914061504, "ts": 1716454222505797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222505867, "dur": 2, "args": { "External id": 72818, "cbid": 251, "correlation": 72818 } }, { "ph": "f", "id": 72818, "pid": 76337, "tid": -914061504, "ts": 1716454222505867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222505872, "dur": 0, "args": { "External id": 72820, "cbid": 251, "correlation": 72820 } }, { "ph": "f", "id": 72820, "pid": 76337, "tid": -914061504, "ts": 1716454222505872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222506516, "dur": 9, "args": { "External id": 72821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72821, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 72821, "pid": 5, "tid": 7, "ts": 1716454222506516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505874, "dur": 12, "args": { "External id": 72821, "cbid": 211, "correlation": 72821 } }, { "ph": "s", "id": 72821, "pid": 76337, "tid": -914061504, "ts": 1716454222505874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222506526, "dur": 4, "args": { "External id": 72823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72823, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 72823, "pid": 5, "tid": 7, "ts": 1716454222506526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505888, "dur": 6, "args": { "External id": 72823, "cbid": 211, "correlation": 72823 } }, { "ph": "s", "id": 72823, "pid": 76337, "tid": -914061504, "ts": 1716454222505888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222506531, "dur": 56, "args": { "External id": 72833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72833, "pid": 5, "tid": 7, "ts": 1716454222506531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222505948, "dur": 12, "args": { "External id": 72833, "cbid": 211, "correlation": 72833 } }, { "ph": "s", "id": 72833, "pid": 76337, "tid": -914061504, "ts": 1716454222505948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222506589, "dur": 53, "args": { "External id": 72853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72853, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 72853, "pid": 5, "tid": 7, "ts": 1716454222506589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506027, "dur": 12, "args": { "External id": 72853, "cbid": 211, "correlation": 72853 } }, { "ph": "s", "id": 72853, "pid": 76337, "tid": -914061504, "ts": 1716454222506027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222506643, "dur": 4, "args": { "External id": 72865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72865, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72865, "pid": 5, "tid": 7, "ts": 1716454222506643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506050, "dur": 6, "args": { "External id": 72865, "cbid": 211, "correlation": 72865 } }, { "ph": "s", "id": 72865, "pid": 76337, "tid": -914061504, "ts": 1716454222506050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222506648, "dur": 56, "args": { "External id": 72868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72868, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72868, "pid": 5, "tid": 7, "ts": 1716454222506648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506070, "dur": 6, "args": { "External id": 72868, "cbid": 211, "correlation": 72868 } }, { "ph": "s", "id": 72868, "pid": 76337, "tid": -914061504, "ts": 1716454222506070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222506706, "dur": 37, "args": { "External id": 72877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72877, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72877, "pid": 5, "tid": 7, "ts": 1716454222506706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506111, "dur": 10, "args": { "External id": 72877, "cbid": 211, "correlation": 72877 } }, { "ph": "s", "id": 72877, "pid": 76337, "tid": -914061504, "ts": 1716454222506111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222506182, "dur": 0, "args": { "External id": 72887, "cbid": 317, "correlation": 72887 } }, { "ph": "f", "id": 72887, "pid": 76337, "tid": -914061504, "ts": 1716454222506182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222506183, "dur": 0, "args": { "External id": 72888, "cbid": 203, "correlation": 72888 } }, { "ph": "f", "id": 72888, "pid": 76337, "tid": -914061504, "ts": 1716454222506183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222506184, "dur": 0, "args": { "External id": 72889, "cbid": 205, "correlation": 72889 } }, { "ph": "f", "id": 72889, "pid": 76337, "tid": -914061504, "ts": 1716454222506184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222506744, "dur": 39, "args": { "External id": 72893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72893, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72893, "pid": 5, "tid": 7, "ts": 1716454222506744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506198, "dur": 12, "args": { "External id": 72893, "cbid": 211, "correlation": 72893 } }, { "ph": "s", "id": 72893, "pid": 76337, "tid": -914061504, "ts": 1716454222506198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222506785, "dur": 15, "args": { "External id": 72895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72895, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72895, "pid": 5, "tid": 7, "ts": 1716454222506785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506213, "dur": 5, "args": { "External id": 72895, "cbid": 211, "correlation": 72895 } }, { "ph": "s", "id": 72895, "pid": 76337, "tid": -914061504, "ts": 1716454222506213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222506801, "dur": 4, "args": { "External id": 72897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72897, "pid": 5, "tid": 7, "ts": 1716454222506801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506222, "dur": 6, "args": { "External id": 72897, "cbid": 211, "correlation": 72897 } }, { "ph": "s", "id": 72897, "pid": 76337, "tid": -914061504, "ts": 1716454222506222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222506231, "dur": 0, "args": { "External id": 72898, "cbid": 51, "correlation": 72898 } }, { "ph": "s", "id": 72898, "pid": 76337, "tid": -914061504, "ts": 1716454222506231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222506806, "dur": 705, "args": { "External id": 72899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72899, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 72899, "pid": 5, "tid": 7, "ts": 1716454222506806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506231, "dur": 5, "args": { "External id": 72899, "cbid": 211, "correlation": 72899 } }, { "ph": "s", "id": 72899, "pid": 76337, "tid": -914061504, "ts": 1716454222506231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222507513, "dur": 60, "args": { "External id": 72904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72904, "pid": 5, "tid": 7, "ts": 1716454222507513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506259, "dur": 8, "args": { "External id": 72904, "cbid": 211, "correlation": 72904 } }, { "ph": "s", "id": 72904, "pid": 76337, "tid": -914061504, "ts": 1716454222506259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222507574, "dur": 50, "args": { "External id": 72912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72912, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72912, "pid": 5, "tid": 7, "ts": 1716454222507574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506291, "dur": 9, "args": { "External id": 72912, "cbid": 211, "correlation": 72912 } }, { "ph": "s", "id": 72912, "pid": 76337, "tid": -914061504, "ts": 1716454222506291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222507626, "dur": 35, "args": { "External id": 72920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72920, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72920, "pid": 5, "tid": 7, "ts": 1716454222507626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506324, "dur": 10, "args": { "External id": 72920, "cbid": 211, "correlation": 72920 } }, { "ph": "s", "id": 72920, "pid": 76337, "tid": -914061504, "ts": 1716454222506324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222507662, "dur": 51, "args": { "External id": 72940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72940, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 72940, "pid": 5, "tid": 7, "ts": 1716454222507662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506407, "dur": 12, "args": { "External id": 72940, "cbid": 211, "correlation": 72940 } }, { "ph": "s", "id": 72940, "pid": 76337, "tid": -914061504, "ts": 1716454222506407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222507715, "dur": 4, "args": { "External id": 72952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72952, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 72952, "pid": 5, "tid": 7, "ts": 1716454222507715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506429, "dur": 7, "args": { "External id": 72952, "cbid": 211, "correlation": 72952 } }, { "ph": "s", "id": 72952, "pid": 76337, "tid": -914061504, "ts": 1716454222506429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222507720, "dur": 56, "args": { "External id": 72955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72955, "pid": 5, "tid": 7, "ts": 1716454222507720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506447, "dur": 6, "args": { "External id": 72955, "cbid": 211, "correlation": 72955 } }, { "ph": "s", "id": 72955, "pid": 76337, "tid": -914061504, "ts": 1716454222506447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222506504, "dur": 0, "args": { "External id": 72966, "cbid": 317, "correlation": 72966 } }, { "ph": "f", "id": 72966, "pid": 76337, "tid": -914061504, "ts": 1716454222506504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222506505, "dur": 0, "args": { "External id": 72967, "cbid": 203, "correlation": 72967 } }, { "ph": "f", "id": 72967, "pid": 76337, "tid": -914061504, "ts": 1716454222506505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222506506, "dur": 0, "args": { "External id": 72968, "cbid": 205, "correlation": 72968 } }, { "ph": "f", "id": 72968, "pid": 76337, "tid": -914061504, "ts": 1716454222506506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506539, "dur": 2, "args": { "External id": 72972, "cbid": 251, "correlation": 72972 } }, { "ph": "f", "id": 72972, "pid": 76337, "tid": -914061504, "ts": 1716454222506539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506543, "dur": 1, "args": { "External id": 72973, "cbid": 251, "correlation": 72973 } }, { "ph": "f", "id": 72973, "pid": 76337, "tid": -914061504, "ts": 1716454222506543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506545, "dur": 1, "args": { "External id": 72974, "cbid": 251, "correlation": 72974 } }, { "ph": "f", "id": 72974, "pid": 76337, "tid": -914061504, "ts": 1716454222506545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506547, "dur": 1, "args": { "External id": 72975, "cbid": 251, "correlation": 72975 } }, { "ph": "f", "id": 72975, "pid": 76337, "tid": -914061504, "ts": 1716454222506547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506549, "dur": 1, "args": { "External id": 72976, "cbid": 251, "correlation": 72976 } }, { "ph": "f", "id": 72976, "pid": 76337, "tid": -914061504, "ts": 1716454222506549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506551, "dur": 1, "args": { "External id": 72977, "cbid": 251, "correlation": 72977 } }, { "ph": "f", "id": 72977, "pid": 76337, "tid": -914061504, "ts": 1716454222506551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506553, "dur": 1, "args": { "External id": 72978, "cbid": 251, "correlation": 72978 } }, { "ph": "f", "id": 72978, "pid": 76337, "tid": -914061504, "ts": 1716454222506553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506555, "dur": 1, "args": { "External id": 72979, "cbid": 251, "correlation": 72979 } }, { "ph": "f", "id": 72979, "pid": 76337, "tid": -914061504, "ts": 1716454222506555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506558, "dur": 0, "args": { "External id": 72980, "cbid": 251, "correlation": 72980 } }, { "ph": "f", "id": 72980, "pid": 76337, "tid": -914061504, "ts": 1716454222506558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222507777, "dur": 118, "args": { "External id": 72981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72981, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 72981, "pid": 5, "tid": 7, "ts": 1716454222507777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506562, "dur": 14, "args": { "External id": 72981, "cbid": 211, "correlation": 72981 } }, { "ph": "s", "id": 72981, "pid": 76337, "tid": -914061504, "ts": 1716454222506562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222507896, "dur": 60, "args": { "External id": 72987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72987, "pid": 5, "tid": 7, "ts": 1716454222507896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506600, "dur": 10, "args": { "External id": 72987, "cbid": 211, "correlation": 72987 } }, { "ph": "s", "id": 72987, "pid": 76337, "tid": -914061504, "ts": 1716454222506600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222507958, "dur": 552, "args": { "External id": 72996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 72996, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 72996, "pid": 5, "tid": 7, "ts": 1716454222507958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506698, "dur": 16, "args": { "External id": 72996, "cbid": 211, "correlation": 72996 } }, { "ph": "s", "id": 72996, "pid": 76337, "tid": -914061504, "ts": 1716454222506698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222508511, "dur": 184, "args": { "External id": 73018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73018, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73018, "pid": 5, "tid": 7, "ts": 1716454222508511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506772, "dur": 11, "args": { "External id": 73018, "cbid": 211, "correlation": 73018 } }, { "ph": "s", "id": 73018, "pid": 76337, "tid": -914061504, "ts": 1716454222506772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506889, "dur": 2, "args": { "External id": 73029, "cbid": 251, "correlation": 73029 } }, { "ph": "f", "id": 73029, "pid": 76337, "tid": -914061504, "ts": 1716454222506889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222508697, "dur": 199, "args": { "External id": 73030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73030, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73030, "pid": 5, "tid": 7, "ts": 1716454222508697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506896, "dur": 15, "args": { "External id": 73030, "cbid": 211, "correlation": 73030 } }, { "ph": "s", "id": 73030, "pid": 76337, "tid": -914061504, "ts": 1716454222506896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222506970, "dur": 1, "args": { "External id": 73041, "cbid": 251, "correlation": 73041 } }, { "ph": "f", "id": 73041, "pid": 76337, "tid": -914061504, "ts": 1716454222506970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222508897, "dur": 191, "args": { "External id": 73042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73042, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73042, "pid": 5, "tid": 7, "ts": 1716454222508897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222506981, "dur": 13, "args": { "External id": 73042, "cbid": 211, "correlation": 73042 } }, { "ph": "s", "id": 73042, "pid": 76337, "tid": -914061504, "ts": 1716454222506981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507050, "dur": 1, "args": { "External id": 73053, "cbid": 251, "correlation": 73053 } }, { "ph": "f", "id": 73053, "pid": 76337, "tid": -914061504, "ts": 1716454222507050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222509089, "dur": 196, "args": { "External id": 73054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73054, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73054, "pid": 5, "tid": 7, "ts": 1716454222509089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507054, "dur": 11, "args": { "External id": 73054, "cbid": 211, "correlation": 73054 } }, { "ph": "s", "id": 73054, "pid": 76337, "tid": -914061504, "ts": 1716454222507054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222509286, "dur": 19050, "args": { "External id": 73075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73075, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73075, "pid": 5, "tid": 7, "ts": 1716454222509286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507163, "dur": 15, "args": { "External id": 73075, "cbid": 211, "correlation": 73075 } }, { "ph": "s", "id": 73075, "pid": 76337, "tid": -914061504, "ts": 1716454222507163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507277, "dur": 2, "args": { "External id": 73093, "cbid": 251, "correlation": 73093 } }, { "ph": "f", "id": 73093, "pid": 76337, "tid": -914061504, "ts": 1716454222507277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222528338, "dur": 206, "args": { "External id": 73095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73095, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73095, "pid": 5, "tid": 7, "ts": 1716454222528338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507284, "dur": 14, "args": { "External id": 73095, "cbid": 211, "correlation": 73095 } }, { "ph": "s", "id": 73095, "pid": 76337, "tid": -914061504, "ts": 1716454222507284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222528545, "dur": 66, "args": { "External id": 73103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73103, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73103, "pid": 5, "tid": 7, "ts": 1716454222528545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507360, "dur": 13, "args": { "External id": 73103, "cbid": 211, "correlation": 73103 } }, { "ph": "s", "id": 73103, "pid": 76337, "tid": -914061504, "ts": 1716454222507360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222528613, "dur": 97, "args": { "External id": 73111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73111, "pid": 5, "tid": 7, "ts": 1716454222528613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507400, "dur": 10, "args": { "External id": 73111, "cbid": 211, "correlation": 73111 } }, { "ph": "s", "id": 73111, "pid": 76337, "tid": -914061504, "ts": 1716454222507400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222528711, "dur": 56, "args": { "External id": 73122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73122, "pid": 5, "tid": 7, "ts": 1716454222528711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507487, "dur": 14, "args": { "External id": 73122, "cbid": 211, "correlation": 73122 } }, { "ph": "s", "id": 73122, "pid": 76337, "tid": -914061504, "ts": 1716454222507487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222528768, "dur": 94, "args": { "External id": 73144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73144, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73144, "pid": 5, "tid": 7, "ts": 1716454222528768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507521, "dur": 10, "args": { "External id": 73144, "cbid": 211, "correlation": 73144 } }, { "ph": "s", "id": 73144, "pid": 76337, "tid": -914061504, "ts": 1716454222507521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507606, "dur": 1, "args": { "External id": 73155, "cbid": 251, "correlation": 73155 } }, { "ph": "f", "id": 73155, "pid": 76337, "tid": -914061504, "ts": 1716454222507606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222528864, "dur": 107, "args": { "External id": 73156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73156, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73156, "pid": 5, "tid": 7, "ts": 1716454222528864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507611, "dur": 12, "args": { "External id": 73156, "cbid": 211, "correlation": 73156 } }, { "ph": "s", "id": 73156, "pid": 76337, "tid": -914061504, "ts": 1716454222507611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507694, "dur": 1, "args": { "External id": 73167, "cbid": 251, "correlation": 73167 } }, { "ph": "f", "id": 73167, "pid": 76337, "tid": -914061504, "ts": 1716454222507694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507698, "dur": 0, "args": { "External id": 73168, "cbid": 251, "correlation": 73168 } }, { "ph": "f", "id": 73168, "pid": 76337, "tid": -914061504, "ts": 1716454222507698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222528972, "dur": 10, "args": { "External id": 73169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73169, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 73169, "pid": 5, "tid": 7, "ts": 1716454222528972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507700, "dur": 14, "args": { "External id": 73169, "cbid": 211, "correlation": 73169 } }, { "ph": "s", "id": 73169, "pid": 76337, "tid": -914061504, "ts": 1716454222507700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222528984, "dur": 5, "args": { "External id": 73171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73171, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 73171, "pid": 5, "tid": 7, "ts": 1716454222528984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507718, "dur": 7, "args": { "External id": 73171, "cbid": 211, "correlation": 73171 } }, { "ph": "s", "id": 73171, "pid": 76337, "tid": -914061504, "ts": 1716454222507718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507779, "dur": 1, "args": { "External id": 73182, "cbid": 251, "correlation": 73182 } }, { "ph": "f", "id": 73182, "pid": 76337, "tid": -914061504, "ts": 1716454222507779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507783, "dur": 0, "args": { "External id": 73183, "cbid": 251, "correlation": 73183 } }, { "ph": "f", "id": 73183, "pid": 76337, "tid": -914061504, "ts": 1716454222507783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222528990, "dur": 6, "args": { "External id": 73184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73184, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 73184, "pid": 5, "tid": 7, "ts": 1716454222528990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507784, "dur": 12, "args": { "External id": 73184, "cbid": 211, "correlation": 73184 } }, { "ph": "s", "id": 73184, "pid": 76337, "tid": -914061504, "ts": 1716454222507784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222528998, "dur": 4, "args": { "External id": 73186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73186, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 73186, "pid": 5, "tid": 7, "ts": 1716454222528998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507797, "dur": 5, "args": { "External id": 73186, "cbid": 211, "correlation": 73186 } }, { "ph": "s", "id": 73186, "pid": 76337, "tid": -914061504, "ts": 1716454222507797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222529003, "dur": 159, "args": { "External id": 73207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73207, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73207, "pid": 5, "tid": 7, "ts": 1716454222529003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507872, "dur": 12, "args": { "External id": 73207, "cbid": 211, "correlation": 73207 } }, { "ph": "s", "id": 73207, "pid": 76337, "tid": -914061504, "ts": 1716454222507872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222507971, "dur": 9, "args": { "External id": 73225, "cbid": 251, "correlation": 73225 } }, { "ph": "f", "id": 73225, "pid": 76337, "tid": -914061504, "ts": 1716454222507971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222529163, "dur": 109, "args": { "External id": 73227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73227, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73227, "pid": 5, "tid": 7, "ts": 1716454222529163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222507985, "dur": 14, "args": { "External id": 73227, "cbid": 211, "correlation": 73227 } }, { "ph": "s", "id": 73227, "pid": 76337, "tid": -914061504, "ts": 1716454222507985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222529274, "dur": 35, "args": { "External id": 73235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73235, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73235, "pid": 5, "tid": 7, "ts": 1716454222529274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508059, "dur": 12, "args": { "External id": 73235, "cbid": 211, "correlation": 73235 } }, { "ph": "s", "id": 73235, "pid": 76337, "tid": -914061504, "ts": 1716454222508059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222529310, "dur": 69, "args": { "External id": 73243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73243, "pid": 5, "tid": 7, "ts": 1716454222529310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508101, "dur": 9, "args": { "External id": 73243, "cbid": 211, "correlation": 73243 } }, { "ph": "s", "id": 73243, "pid": 76337, "tid": -914061504, "ts": 1716454222508101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222529380, "dur": 94, "args": { "External id": 73265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73265, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73265, "pid": 5, "tid": 7, "ts": 1716454222529380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508153, "dur": 10, "args": { "External id": 73265, "cbid": 211, "correlation": 73265 } }, { "ph": "s", "id": 73265, "pid": 76337, "tid": -914061504, "ts": 1716454222508153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222508245, "dur": 1, "args": { "External id": 73281, "cbid": 251, "correlation": 73281 } }, { "ph": "f", "id": 73281, "pid": 76337, "tid": -914061504, "ts": 1716454222508245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222529475, "dur": 585, "args": { "External id": 73283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73283, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73283, "pid": 5, "tid": 7, "ts": 1716454222529475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508250, "dur": 13, "args": { "External id": 73283, "cbid": 211, "correlation": 73283 } }, { "ph": "s", "id": 73283, "pid": 76337, "tid": -914061504, "ts": 1716454222508250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222530061, "dur": 244, "args": { "External id": 73291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73291, "pid": 5, "tid": 7, "ts": 1716454222530061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508329, "dur": 14, "args": { "External id": 73291, "cbid": 211, "correlation": 73291 } }, { "ph": "s", "id": 73291, "pid": 76337, "tid": -914061504, "ts": 1716454222508329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222530306, "dur": 253, "args": { "External id": 73299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73299, "pid": 5, "tid": 7, "ts": 1716454222530306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508363, "dur": 9, "args": { "External id": 73299, "cbid": 211, "correlation": 73299 } }, { "ph": "s", "id": 73299, "pid": 76337, "tid": -914061504, "ts": 1716454222508363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222508451, "dur": 2, "args": { "External id": 73315, "cbid": 251, "correlation": 73315 } }, { "ph": "f", "id": 73315, "pid": 76337, "tid": -914061504, "ts": 1716454222508451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222508457, "dur": 0, "args": { "External id": 73317, "cbid": 251, "correlation": 73317 } }, { "ph": "f", "id": 73317, "pid": 76337, "tid": -914061504, "ts": 1716454222508457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222530560, "dur": 362, "args": { "External id": 73318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73318, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 73318, "pid": 5, "tid": 7, "ts": 1716454222530560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508462, "dur": 13, "args": { "External id": 73318, "cbid": 211, "correlation": 73318 } }, { "ph": "s", "id": 73318, "pid": 76337, "tid": -914061504, "ts": 1716454222508462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222530923, "dur": 50, "args": { "External id": 73326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73326, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73326, "pid": 5, "tid": 7, "ts": 1716454222530923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508505, "dur": 10, "args": { "External id": 73326, "cbid": 211, "correlation": 73326 } }, { "ph": "s", "id": 73326, "pid": 76337, "tid": -914061504, "ts": 1716454222508505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222530974, "dur": 160, "args": { "External id": 73337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73337, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73337, "pid": 5, "tid": 7, "ts": 1716454222530974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222508579, "dur": 13, "args": { "External id": 73337, "cbid": 211, "correlation": 73337 } }, { "ph": "s", "id": 73337, "pid": 76337, "tid": -914061504, "ts": 1716454222508579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222508649, "dur": 0, "args": { "External id": 73349, "cbid": 317, "correlation": 73349 } }, { "ph": "f", "id": 73349, "pid": 76337, "tid": -914061504, "ts": 1716454222508649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222508650, "dur": 0, "args": { "External id": 73350, "cbid": 203, "correlation": 73350 } }, { "ph": "f", "id": 73350, "pid": 76337, "tid": -914061504, "ts": 1716454222508650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222508650, "dur": 0, "args": { "External id": 73351, "cbid": 205, "correlation": 73351 } }, { "ph": "f", "id": 73351, "pid": 76337, "tid": -914061504, "ts": 1716454222508650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509301, "dur": 2, "args": { "External id": 73355, "cbid": 251, "correlation": 73355 } }, { "ph": "f", "id": 73355, "pid": 76337, "tid": -914061504, "ts": 1716454222509301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509303, "dur": 0, "args": { "External id": 73356, "cbid": 251, "correlation": 73356 } }, { "ph": "f", "id": 73356, "pid": 76337, "tid": -914061504, "ts": 1716454222509303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509304, "dur": 0, "args": { "External id": 73357, "cbid": 251, "correlation": 73357 } }, { "ph": "f", "id": 73357, "pid": 76337, "tid": -914061504, "ts": 1716454222509304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509306, "dur": 0, "args": { "External id": 73358, "cbid": 251, "correlation": 73358 } }, { "ph": "f", "id": 73358, "pid": 76337, "tid": -914061504, "ts": 1716454222509306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509307, "dur": 1, "args": { "External id": 73359, "cbid": 251, "correlation": 73359 } }, { "ph": "f", "id": 73359, "pid": 76337, "tid": -914061504, "ts": 1716454222509307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509308, "dur": 0, "args": { "External id": 73360, "cbid": 251, "correlation": 73360 } }, { "ph": "f", "id": 73360, "pid": 76337, "tid": -914061504, "ts": 1716454222509308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509309, "dur": 0, "args": { "External id": 73361, "cbid": 251, "correlation": 73361 } }, { "ph": "f", "id": 73361, "pid": 76337, "tid": -914061504, "ts": 1716454222509309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509310, "dur": 0, "args": { "External id": 73362, "cbid": 251, "correlation": 73362 } }, { "ph": "f", "id": 73362, "pid": 76337, "tid": -914061504, "ts": 1716454222509310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509312, "dur": 0, "args": { "External id": 73363, "cbid": 251, "correlation": 73363 } }, { "ph": "f", "id": 73363, "pid": 76337, "tid": -914061504, "ts": 1716454222509312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222531136, "dur": 115, "args": { "External id": 73364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73364, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73364, "pid": 5, "tid": 7, "ts": 1716454222531136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509314, "dur": 14, "args": { "External id": 73364, "cbid": 211, "correlation": 73364 } }, { "ph": "s", "id": 73364, "pid": 76337, "tid": -914061504, "ts": 1716454222509314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222531252, "dur": 60, "args": { "External id": 73370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73370, "pid": 5, "tid": 7, "ts": 1716454222531252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509354, "dur": 9, "args": { "External id": 73370, "cbid": 211, "correlation": 73370 } }, { "ph": "s", "id": 73370, "pid": 76337, "tid": -914061504, "ts": 1716454222509354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222531314, "dur": 50, "args": { "External id": 73378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73378, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73378, "pid": 5, "tid": 7, "ts": 1716454222531314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509389, "dur": 9, "args": { "External id": 73378, "cbid": 211, "correlation": 73378 } }, { "ph": "s", "id": 73378, "pid": 76337, "tid": -914061504, "ts": 1716454222509389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222531365, "dur": 53, "args": { "External id": 73398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73398, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 73398, "pid": 5, "tid": 7, "ts": 1716454222531365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509467, "dur": 11, "args": { "External id": 73398, "cbid": 211, "correlation": 73398 } }, { "ph": "s", "id": 73398, "pid": 76337, "tid": -914061504, "ts": 1716454222509467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222531419, "dur": 4, "args": { "External id": 73410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73410, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73410, "pid": 5, "tid": 7, "ts": 1716454222531419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509490, "dur": 7, "args": { "External id": 73410, "cbid": 211, "correlation": 73410 } }, { "ph": "s", "id": 73410, "pid": 76337, "tid": -914061504, "ts": 1716454222509490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222531425, "dur": 58, "args": { "External id": 73413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73413, "pid": 5, "tid": 7, "ts": 1716454222531425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509509, "dur": 7, "args": { "External id": 73413, "cbid": 211, "correlation": 73413 } }, { "ph": "s", "id": 73413, "pid": 76337, "tid": -914061504, "ts": 1716454222509509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222531484, "dur": 38, "args": { "External id": 73422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73422, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73422, "pid": 5, "tid": 7, "ts": 1716454222531484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509551, "dur": 10, "args": { "External id": 73422, "cbid": 211, "correlation": 73422 } }, { "ph": "s", "id": 73422, "pid": 76337, "tid": -914061504, "ts": 1716454222509551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222509605, "dur": 0, "args": { "External id": 73432, "cbid": 317, "correlation": 73432 } }, { "ph": "f", "id": 73432, "pid": 76337, "tid": -914061504, "ts": 1716454222509605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222509606, "dur": 0, "args": { "External id": 73433, "cbid": 203, "correlation": 73433 } }, { "ph": "f", "id": 73433, "pid": 76337, "tid": -914061504, "ts": 1716454222509606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222509607, "dur": 0, "args": { "External id": 73434, "cbid": 205, "correlation": 73434 } }, { "ph": "f", "id": 73434, "pid": 76337, "tid": -914061504, "ts": 1716454222509607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222531523, "dur": 42, "args": { "External id": 73438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73438, "pid": 5, "tid": 7, "ts": 1716454222531523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509640, "dur": 12, "args": { "External id": 73438, "cbid": 211, "correlation": 73438 } }, { "ph": "s", "id": 73438, "pid": 76337, "tid": -914061504, "ts": 1716454222509640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222531567, "dur": 15, "args": { "External id": 73440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73440, "pid": 5, "tid": 7, "ts": 1716454222531567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509655, "dur": 6, "args": { "External id": 73440, "cbid": 211, "correlation": 73440 } }, { "ph": "s", "id": 73440, "pid": 76337, "tid": -914061504, "ts": 1716454222509655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222531582, "dur": 4, "args": { "External id": 73442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73442, "pid": 5, "tid": 7, "ts": 1716454222531582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509666, "dur": 6, "args": { "External id": 73442, "cbid": 211, "correlation": 73442 } }, { "ph": "s", "id": 73442, "pid": 76337, "tid": -914061504, "ts": 1716454222509666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222509675, "dur": 0, "args": { "External id": 73443, "cbid": 51, "correlation": 73443 } }, { "ph": "s", "id": 73443, "pid": 76337, "tid": -914061504, "ts": 1716454222509675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222531587, "dur": 712, "args": { "External id": 73444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73444, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73444, "pid": 5, "tid": 7, "ts": 1716454222531587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509676, "dur": 5, "args": { "External id": 73444, "cbid": 211, "correlation": 73444 } }, { "ph": "s", "id": 73444, "pid": 76337, "tid": -914061504, "ts": 1716454222509676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222532301, "dur": 60, "args": { "External id": 73449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73449, "pid": 5, "tid": 7, "ts": 1716454222532301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509703, "dur": 9, "args": { "External id": 73449, "cbid": 211, "correlation": 73449 } }, { "ph": "s", "id": 73449, "pid": 76337, "tid": -914061504, "ts": 1716454222509703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222532362, "dur": 3, "args": { "External id": 73457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73457, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73457, "pid": 5, "tid": 7, "ts": 1716454222532362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509746, "dur": 9, "args": { "External id": 73457, "cbid": 211, "correlation": 73457 } }, { "ph": "s", "id": 73457, "pid": 76337, "tid": -914061504, "ts": 1716454222509746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509813, "dur": 2, "args": { "External id": 73473, "cbid": 251, "correlation": 73473 } }, { "ph": "f", "id": 73473, "pid": 76337, "tid": -914061504, "ts": 1716454222509813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222509819, "dur": 0, "args": { "External id": 73475, "cbid": 251, "correlation": 73475 } }, { "ph": "f", "id": 73475, "pid": 76337, "tid": -914061504, "ts": 1716454222509819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222532367, "dur": 11, "args": { "External id": 73476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73476, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 73476, "pid": 5, "tid": 7, "ts": 1716454222532367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509821, "dur": 11, "args": { "External id": 73476, "cbid": 211, "correlation": 73476 } }, { "ph": "s", "id": 73476, "pid": 76337, "tid": -914061504, "ts": 1716454222509821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222532379, "dur": 5, "args": { "External id": 73478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73478, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 73478, "pid": 5, "tid": 7, "ts": 1716454222532379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509835, "dur": 5, "args": { "External id": 73478, "cbid": 211, "correlation": 73478 } }, { "ph": "s", "id": 73478, "pid": 76337, "tid": -914061504, "ts": 1716454222509835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222532386, "dur": 55, "args": { "External id": 73488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73488, "pid": 5, "tid": 7, "ts": 1716454222532386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509893, "dur": 12, "args": { "External id": 73488, "cbid": 211, "correlation": 73488 } }, { "ph": "s", "id": 73488, "pid": 76337, "tid": -914061504, "ts": 1716454222509893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222532442, "dur": 51, "args": { "External id": 73508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73508, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 73508, "pid": 5, "tid": 7, "ts": 1716454222532442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509960, "dur": 11, "args": { "External id": 73508, "cbid": 211, "correlation": 73508 } }, { "ph": "s", "id": 73508, "pid": 76337, "tid": -914061504, "ts": 1716454222509960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222532494, "dur": 4, "args": { "External id": 73520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73520, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73520, "pid": 5, "tid": 7, "ts": 1716454222532494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222509989, "dur": 7, "args": { "External id": 73520, "cbid": 211, "correlation": 73520 } }, { "ph": "s", "id": 73520, "pid": 76337, "tid": -914061504, "ts": 1716454222509989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222532500, "dur": 57, "args": { "External id": 73523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73523, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73523, "pid": 5, "tid": 7, "ts": 1716454222532500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510008, "dur": 6, "args": { "External id": 73523, "cbid": 211, "correlation": 73523 } }, { "ph": "s", "id": 73523, "pid": 76337, "tid": -914061504, "ts": 1716454222510008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222532558, "dur": 36, "args": { "External id": 73532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73532, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73532, "pid": 5, "tid": 7, "ts": 1716454222532558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510050, "dur": 9, "args": { "External id": 73532, "cbid": 211, "correlation": 73532 } }, { "ph": "s", "id": 73532, "pid": 76337, "tid": -914061504, "ts": 1716454222510050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222510113, "dur": 0, "args": { "External id": 73542, "cbid": 317, "correlation": 73542 } }, { "ph": "f", "id": 73542, "pid": 76337, "tid": -914061504, "ts": 1716454222510113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222510113, "dur": 0, "args": { "External id": 73543, "cbid": 203, "correlation": 73543 } }, { "ph": "f", "id": 73543, "pid": 76337, "tid": -914061504, "ts": 1716454222510113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222510114, "dur": 0, "args": { "External id": 73544, "cbid": 205, "correlation": 73544 } }, { "ph": "f", "id": 73544, "pid": 76337, "tid": -914061504, "ts": 1716454222510114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222532595, "dur": 40, "args": { "External id": 73548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73548, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73548, "pid": 5, "tid": 7, "ts": 1716454222532595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510129, "dur": 12, "args": { "External id": 73548, "cbid": 211, "correlation": 73548 } }, { "ph": "s", "id": 73548, "pid": 76337, "tid": -914061504, "ts": 1716454222510129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222532636, "dur": 15, "args": { "External id": 73550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73550, "pid": 5, "tid": 7, "ts": 1716454222532636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510143, "dur": 5, "args": { "External id": 73550, "cbid": 211, "correlation": 73550 } }, { "ph": "s", "id": 73550, "pid": 76337, "tid": -914061504, "ts": 1716454222510143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222532652, "dur": 3, "args": { "External id": 73552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73552, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73552, "pid": 5, "tid": 7, "ts": 1716454222532652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510152, "dur": 5, "args": { "External id": 73552, "cbid": 211, "correlation": 73552 } }, { "ph": "s", "id": 73552, "pid": 76337, "tid": -914061504, "ts": 1716454222510152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222510160, "dur": 0, "args": { "External id": 73553, "cbid": 51, "correlation": 73553 } }, { "ph": "s", "id": 73553, "pid": 76337, "tid": -914061504, "ts": 1716454222510160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222532656, "dur": 707, "args": { "External id": 73554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73554, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73554, "pid": 5, "tid": 7, "ts": 1716454222532656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510161, "dur": 5, "args": { "External id": 73554, "cbid": 211, "correlation": 73554 } }, { "ph": "s", "id": 73554, "pid": 76337, "tid": -914061504, "ts": 1716454222510161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222533365, "dur": 60, "args": { "External id": 73559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73559, "pid": 5, "tid": 7, "ts": 1716454222533365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510188, "dur": 8, "args": { "External id": 73559, "cbid": 211, "correlation": 73559 } }, { "ph": "s", "id": 73559, "pid": 76337, "tid": -914061504, "ts": 1716454222510188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222533426, "dur": 50, "args": { "External id": 73567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73567, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73567, "pid": 5, "tid": 7, "ts": 1716454222533426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510219, "dur": 8, "args": { "External id": 73567, "cbid": 211, "correlation": 73567 } }, { "ph": "s", "id": 73567, "pid": 76337, "tid": -914061504, "ts": 1716454222510219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222533478, "dur": 36, "args": { "External id": 73575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73575, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73575, "pid": 5, "tid": 7, "ts": 1716454222533478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510249, "dur": 8, "args": { "External id": 73575, "cbid": 211, "correlation": 73575 } }, { "ph": "s", "id": 73575, "pid": 76337, "tid": -914061504, "ts": 1716454222510249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222533515, "dur": 53, "args": { "External id": 73595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73595, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 73595, "pid": 5, "tid": 7, "ts": 1716454222533515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510329, "dur": 13, "args": { "External id": 73595, "cbid": 211, "correlation": 73595 } }, { "ph": "s", "id": 73595, "pid": 76337, "tid": -914061504, "ts": 1716454222510329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222533569, "dur": 4, "args": { "External id": 73607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73607, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 73607, "pid": 5, "tid": 7, "ts": 1716454222533569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510351, "dur": 6, "args": { "External id": 73607, "cbid": 211, "correlation": 73607 } }, { "ph": "s", "id": 73607, "pid": 76337, "tid": -914061504, "ts": 1716454222510351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222533574, "dur": 55, "args": { "External id": 73610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73610, "pid": 5, "tid": 7, "ts": 1716454222533574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510368, "dur": 6, "args": { "External id": 73610, "cbid": 211, "correlation": 73610 } }, { "ph": "s", "id": 73610, "pid": 76337, "tid": -914061504, "ts": 1716454222510368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222510424, "dur": 0, "args": { "External id": 73621, "cbid": 317, "correlation": 73621 } }, { "ph": "f", "id": 73621, "pid": 76337, "tid": -914061504, "ts": 1716454222510424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222510424, "dur": 0, "args": { "External id": 73622, "cbid": 203, "correlation": 73622 } }, { "ph": "f", "id": 73622, "pid": 76337, "tid": -914061504, "ts": 1716454222510424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222510425, "dur": 0, "args": { "External id": 73623, "cbid": 205, "correlation": 73623 } }, { "ph": "f", "id": 73623, "pid": 76337, "tid": -914061504, "ts": 1716454222510425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510447, "dur": 1, "args": { "External id": 73627, "cbid": 251, "correlation": 73627 } }, { "ph": "f", "id": 73627, "pid": 76337, "tid": -914061504, "ts": 1716454222510447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510449, "dur": 0, "args": { "External id": 73628, "cbid": 251, "correlation": 73628 } }, { "ph": "f", "id": 73628, "pid": 76337, "tid": -914061504, "ts": 1716454222510449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510450, "dur": 0, "args": { "External id": 73629, "cbid": 251, "correlation": 73629 } }, { "ph": "f", "id": 73629, "pid": 76337, "tid": -914061504, "ts": 1716454222510450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510451, "dur": 0, "args": { "External id": 73630, "cbid": 251, "correlation": 73630 } }, { "ph": "f", "id": 73630, "pid": 76337, "tid": -914061504, "ts": 1716454222510451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510451, "dur": 0, "args": { "External id": 73631, "cbid": 251, "correlation": 73631 } }, { "ph": "f", "id": 73631, "pid": 76337, "tid": -914061504, "ts": 1716454222510451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510452, "dur": 0, "args": { "External id": 73632, "cbid": 251, "correlation": 73632 } }, { "ph": "f", "id": 73632, "pid": 76337, "tid": -914061504, "ts": 1716454222510452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510453, "dur": 0, "args": { "External id": 73633, "cbid": 251, "correlation": 73633 } }, { "ph": "f", "id": 73633, "pid": 76337, "tid": -914061504, "ts": 1716454222510453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510454, "dur": 0, "args": { "External id": 73634, "cbid": 251, "correlation": 73634 } }, { "ph": "f", "id": 73634, "pid": 76337, "tid": -914061504, "ts": 1716454222510454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510455, "dur": 0, "args": { "External id": 73635, "cbid": 251, "correlation": 73635 } }, { "ph": "f", "id": 73635, "pid": 76337, "tid": -914061504, "ts": 1716454222510455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222533631, "dur": 114, "args": { "External id": 73636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73636, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73636, "pid": 5, "tid": 7, "ts": 1716454222533631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510457, "dur": 12, "args": { "External id": 73636, "cbid": 211, "correlation": 73636 } }, { "ph": "s", "id": 73636, "pid": 76337, "tid": -914061504, "ts": 1716454222510457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222533746, "dur": 60, "args": { "External id": 73642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73642, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73642, "pid": 5, "tid": 7, "ts": 1716454222533746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510492, "dur": 9, "args": { "External id": 73642, "cbid": 211, "correlation": 73642 } }, { "ph": "s", "id": 73642, "pid": 76337, "tid": -914061504, "ts": 1716454222510492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222533808, "dur": 663, "args": { "External id": 73651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73651, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73651, "pid": 5, "tid": 7, "ts": 1716454222533808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510577, "dur": 14, "args": { "External id": 73651, "cbid": 211, "correlation": 73651 } }, { "ph": "s", "id": 73651, "pid": 76337, "tid": -914061504, "ts": 1716454222510577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222534472, "dur": 184, "args": { "External id": 73673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73673, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73673, "pid": 5, "tid": 7, "ts": 1716454222534472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510636, "dur": 10, "args": { "External id": 73673, "cbid": 211, "correlation": 73673 } }, { "ph": "s", "id": 73673, "pid": 76337, "tid": -914061504, "ts": 1716454222510636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510726, "dur": 1, "args": { "External id": 73684, "cbid": 251, "correlation": 73684 } }, { "ph": "f", "id": 73684, "pid": 76337, "tid": -914061504, "ts": 1716454222510726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222534658, "dur": 198, "args": { "External id": 73685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73685, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73685, "pid": 5, "tid": 7, "ts": 1716454222534658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510731, "dur": 12, "args": { "External id": 73685, "cbid": 211, "correlation": 73685 } }, { "ph": "s", "id": 73685, "pid": 76337, "tid": -914061504, "ts": 1716454222510731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510799, "dur": 1, "args": { "External id": 73696, "cbid": 251, "correlation": 73696 } }, { "ph": "f", "id": 73696, "pid": 76337, "tid": -914061504, "ts": 1716454222510799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222534857, "dur": 188, "args": { "External id": 73697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73697, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73697, "pid": 5, "tid": 7, "ts": 1716454222534857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510803, "dur": 12, "args": { "External id": 73697, "cbid": 211, "correlation": 73697 } }, { "ph": "s", "id": 73697, "pid": 76337, "tid": -914061504, "ts": 1716454222510803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222510866, "dur": 1, "args": { "External id": 73708, "cbid": 251, "correlation": 73708 } }, { "ph": "f", "id": 73708, "pid": 76337, "tid": -914061504, "ts": 1716454222510866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222535046, "dur": 188, "args": { "External id": 73709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73709, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73709, "pid": 5, "tid": 7, "ts": 1716454222535046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510871, "dur": 11, "args": { "External id": 73709, "cbid": 211, "correlation": 73709 } }, { "ph": "s", "id": 73709, "pid": 76337, "tid": -914061504, "ts": 1716454222510871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222535235, "dur": 18980, "args": { "External id": 73730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73730, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73730, "pid": 5, "tid": 7, "ts": 1716454222535235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222510952, "dur": 13, "args": { "External id": 73730, "cbid": 211, "correlation": 73730 } }, { "ph": "s", "id": 73730, "pid": 76337, "tid": -914061504, "ts": 1716454222510952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511061, "dur": 1, "args": { "External id": 73748, "cbid": 251, "correlation": 73748 } }, { "ph": "f", "id": 73748, "pid": 76337, "tid": -914061504, "ts": 1716454222511061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222554216, "dur": 207, "args": { "External id": 73750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73750, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73750, "pid": 5, "tid": 7, "ts": 1716454222554216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511066, "dur": 14, "args": { "External id": 73750, "cbid": 211, "correlation": 73750 } }, { "ph": "s", "id": 73750, "pid": 76337, "tid": -914061504, "ts": 1716454222511066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222554425, "dur": 66, "args": { "External id": 73758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73758, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73758, "pid": 5, "tid": 7, "ts": 1716454222554425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511138, "dur": 12, "args": { "External id": 73758, "cbid": 211, "correlation": 73758 } }, { "ph": "s", "id": 73758, "pid": 76337, "tid": -914061504, "ts": 1716454222511138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222554492, "dur": 97, "args": { "External id": 73766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73766, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73766, "pid": 5, "tid": 7, "ts": 1716454222554492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511178, "dur": 9, "args": { "External id": 73766, "cbid": 211, "correlation": 73766 } }, { "ph": "s", "id": 73766, "pid": 76337, "tid": -914061504, "ts": 1716454222511178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222554590, "dur": 56, "args": { "External id": 73777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73777, "pid": 5, "tid": 7, "ts": 1716454222554590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511251, "dur": 12, "args": { "External id": 73777, "cbid": 211, "correlation": 73777 } }, { "ph": "s", "id": 73777, "pid": 76337, "tid": -914061504, "ts": 1716454222511251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222554648, "dur": 94, "args": { "External id": 73799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73799, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73799, "pid": 5, "tid": 7, "ts": 1716454222554648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511282, "dur": 8, "args": { "External id": 73799, "cbid": 211, "correlation": 73799 } }, { "ph": "s", "id": 73799, "pid": 76337, "tid": -914061504, "ts": 1716454222511282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511366, "dur": 1, "args": { "External id": 73810, "cbid": 251, "correlation": 73810 } }, { "ph": "f", "id": 73810, "pid": 76337, "tid": -914061504, "ts": 1716454222511366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222554743, "dur": 105, "args": { "External id": 73811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73811, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73811, "pid": 5, "tid": 7, "ts": 1716454222554743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511371, "dur": 12, "args": { "External id": 73811, "cbid": 211, "correlation": 73811 } }, { "ph": "s", "id": 73811, "pid": 76337, "tid": -914061504, "ts": 1716454222511371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511444, "dur": 1, "args": { "External id": 73822, "cbid": 251, "correlation": 73822 } }, { "ph": "f", "id": 73822, "pid": 76337, "tid": -914061504, "ts": 1716454222511444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511448, "dur": 0, "args": { "External id": 73823, "cbid": 251, "correlation": 73823 } }, { "ph": "f", "id": 73823, "pid": 76337, "tid": -914061504, "ts": 1716454222511448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222554849, "dur": 10, "args": { "External id": 73824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73824, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 73824, "pid": 5, "tid": 7, "ts": 1716454222554849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511449, "dur": 12, "args": { "External id": 73824, "cbid": 211, "correlation": 73824 } }, { "ph": "s", "id": 73824, "pid": 76337, "tid": -914061504, "ts": 1716454222511449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222554860, "dur": 5, "args": { "External id": 73826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73826, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 73826, "pid": 5, "tid": 7, "ts": 1716454222554860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511464, "dur": 6, "args": { "External id": 73826, "cbid": 211, "correlation": 73826 } }, { "ph": "s", "id": 73826, "pid": 76337, "tid": -914061504, "ts": 1716454222511464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511524, "dur": 1, "args": { "External id": 73837, "cbid": 251, "correlation": 73837 } }, { "ph": "f", "id": 73837, "pid": 76337, "tid": -914061504, "ts": 1716454222511524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511527, "dur": 0, "args": { "External id": 73838, "cbid": 251, "correlation": 73838 } }, { "ph": "f", "id": 73838, "pid": 76337, "tid": -914061504, "ts": 1716454222511527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222554867, "dur": 6, "args": { "External id": 73839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73839, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 73839, "pid": 5, "tid": 7, "ts": 1716454222554867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511529, "dur": 11, "args": { "External id": 73839, "cbid": 211, "correlation": 73839 } }, { "ph": "s", "id": 73839, "pid": 76337, "tid": -914061504, "ts": 1716454222511529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222554874, "dur": 4, "args": { "External id": 73841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73841, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 73841, "pid": 5, "tid": 7, "ts": 1716454222554874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511542, "dur": 6, "args": { "External id": 73841, "cbid": 211, "correlation": 73841 } }, { "ph": "s", "id": 73841, "pid": 76337, "tid": -914061504, "ts": 1716454222511542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222554879, "dur": 158, "args": { "External id": 73862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73862, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73862, "pid": 5, "tid": 7, "ts": 1716454222554879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511617, "dur": 12, "args": { "External id": 73862, "cbid": 211, "correlation": 73862 } }, { "ph": "s", "id": 73862, "pid": 76337, "tid": -914061504, "ts": 1716454222511617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511712, "dur": 1, "args": { "External id": 73880, "cbid": 251, "correlation": 73880 } }, { "ph": "f", "id": 73880, "pid": 76337, "tid": -914061504, "ts": 1716454222511712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222555039, "dur": 106, "args": { "External id": 73882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73882, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 73882, "pid": 5, "tid": 7, "ts": 1716454222555039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511718, "dur": 13, "args": { "External id": 73882, "cbid": 211, "correlation": 73882 } }, { "ph": "s", "id": 73882, "pid": 76337, "tid": -914061504, "ts": 1716454222511718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222555146, "dur": 34, "args": { "External id": 73890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73890, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73890, "pid": 5, "tid": 7, "ts": 1716454222555146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511787, "dur": 12, "args": { "External id": 73890, "cbid": 211, "correlation": 73890 } }, { "ph": "s", "id": 73890, "pid": 76337, "tid": -914061504, "ts": 1716454222511787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222555182, "dur": 68, "args": { "External id": 73898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73898, "pid": 5, "tid": 7, "ts": 1716454222555182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511829, "dur": 9, "args": { "External id": 73898, "cbid": 211, "correlation": 73898 } }, { "ph": "s", "id": 73898, "pid": 76337, "tid": -914061504, "ts": 1716454222511829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222555251, "dur": 94, "args": { "External id": 73920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73920, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73920, "pid": 5, "tid": 7, "ts": 1716454222555251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511881, "dur": 10, "args": { "External id": 73920, "cbid": 211, "correlation": 73920 } }, { "ph": "s", "id": 73920, "pid": 76337, "tid": -914061504, "ts": 1716454222511881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222511967, "dur": 1, "args": { "External id": 73936, "cbid": 251, "correlation": 73936 } }, { "ph": "f", "id": 73936, "pid": 76337, "tid": -914061504, "ts": 1716454222511967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222555346, "dur": 588, "args": { "External id": 73938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73938, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 73938, "pid": 5, "tid": 7, "ts": 1716454222555346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222511980, "dur": 12, "args": { "External id": 73938, "cbid": 211, "correlation": 73938 } }, { "ph": "s", "id": 73938, "pid": 76337, "tid": -914061504, "ts": 1716454222511980, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222555936, "dur": 246, "args": { "External id": 73946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73946, "pid": 5, "tid": 7, "ts": 1716454222555936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512049, "dur": 12, "args": { "External id": 73946, "cbid": 211, "correlation": 73946 } }, { "ph": "s", "id": 73946, "pid": 76337, "tid": -914061504, "ts": 1716454222512049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222556183, "dur": 253, "args": { "External id": 73954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73954, "pid": 5, "tid": 7, "ts": 1716454222556183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512081, "dur": 8, "args": { "External id": 73954, "cbid": 211, "correlation": 73954 } }, { "ph": "s", "id": 73954, "pid": 76337, "tid": -914061504, "ts": 1716454222512081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512163, "dur": 2, "args": { "External id": 73970, "cbid": 251, "correlation": 73970 } }, { "ph": "f", "id": 73970, "pid": 76337, "tid": -914061504, "ts": 1716454222512163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512168, "dur": 0, "args": { "External id": 73972, "cbid": 251, "correlation": 73972 } }, { "ph": "f", "id": 73972, "pid": 76337, "tid": -914061504, "ts": 1716454222512168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222556437, "dur": 362, "args": { "External id": 73973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73973, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 73973, "pid": 5, "tid": 7, "ts": 1716454222556437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512172, "dur": 13, "args": { "External id": 73973, "cbid": 211, "correlation": 73973 } }, { "ph": "s", "id": 73973, "pid": 76337, "tid": -914061504, "ts": 1716454222512172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222556800, "dur": 50, "args": { "External id": 73981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73981, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73981, "pid": 5, "tid": 7, "ts": 1716454222556800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512214, "dur": 10, "args": { "External id": 73981, "cbid": 211, "correlation": 73981 } }, { "ph": "s", "id": 73981, "pid": 76337, "tid": -914061504, "ts": 1716454222512214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222556851, "dur": 161, "args": { "External id": 73992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 73992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 73992, "pid": 5, "tid": 7, "ts": 1716454222556851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512284, "dur": 12, "args": { "External id": 73992, "cbid": 211, "correlation": 73992 } }, { "ph": "s", "id": 73992, "pid": 76337, "tid": -914061504, "ts": 1716454222512284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222512348, "dur": 0, "args": { "External id": 74004, "cbid": 317, "correlation": 74004 } }, { "ph": "f", "id": 74004, "pid": 76337, "tid": -914061504, "ts": 1716454222512348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222512349, "dur": 0, "args": { "External id": 74005, "cbid": 203, "correlation": 74005 } }, { "ph": "f", "id": 74005, "pid": 76337, "tid": -914061504, "ts": 1716454222512349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222512350, "dur": 0, "args": { "External id": 74006, "cbid": 205, "correlation": 74006 } }, { "ph": "f", "id": 74006, "pid": 76337, "tid": -914061504, "ts": 1716454222512350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512376, "dur": 1, "args": { "External id": 74010, "cbid": 251, "correlation": 74010 } }, { "ph": "f", "id": 74010, "pid": 76337, "tid": -914061504, "ts": 1716454222512376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512377, "dur": 0, "args": { "External id": 74011, "cbid": 251, "correlation": 74011 } }, { "ph": "f", "id": 74011, "pid": 76337, "tid": -914061504, "ts": 1716454222512377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512378, "dur": 0, "args": { "External id": 74012, "cbid": 251, "correlation": 74012 } }, { "ph": "f", "id": 74012, "pid": 76337, "tid": -914061504, "ts": 1716454222512378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512379, "dur": 0, "args": { "External id": 74013, "cbid": 251, "correlation": 74013 } }, { "ph": "f", "id": 74013, "pid": 76337, "tid": -914061504, "ts": 1716454222512379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512380, "dur": 0, "args": { "External id": 74014, "cbid": 251, "correlation": 74014 } }, { "ph": "f", "id": 74014, "pid": 76337, "tid": -914061504, "ts": 1716454222512380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512380, "dur": 0, "args": { "External id": 74015, "cbid": 251, "correlation": 74015 } }, { "ph": "f", "id": 74015, "pid": 76337, "tid": -914061504, "ts": 1716454222512380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512381, "dur": 0, "args": { "External id": 74016, "cbid": 251, "correlation": 74016 } }, { "ph": "f", "id": 74016, "pid": 76337, "tid": -914061504, "ts": 1716454222512381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512382, "dur": 0, "args": { "External id": 74017, "cbid": 251, "correlation": 74017 } }, { "ph": "f", "id": 74017, "pid": 76337, "tid": -914061504, "ts": 1716454222512382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222512383, "dur": 0, "args": { "External id": 74018, "cbid": 251, "correlation": 74018 } }, { "ph": "f", "id": 74018, "pid": 76337, "tid": -914061504, "ts": 1716454222512383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222557014, "dur": 117, "args": { "External id": 74019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74019, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 74019, "pid": 5, "tid": 7, "ts": 1716454222557014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512385, "dur": 12, "args": { "External id": 74019, "cbid": 211, "correlation": 74019 } }, { "ph": "s", "id": 74019, "pid": 76337, "tid": -914061504, "ts": 1716454222512385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222557132, "dur": 60, "args": { "External id": 74025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74025, "pid": 5, "tid": 7, "ts": 1716454222557132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512420, "dur": 9, "args": { "External id": 74025, "cbid": 211, "correlation": 74025 } }, { "ph": "s", "id": 74025, "pid": 76337, "tid": -914061504, "ts": 1716454222512420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222557194, "dur": 49, "args": { "External id": 74033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74033, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74033, "pid": 5, "tid": 7, "ts": 1716454222557194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512452, "dur": 8, "args": { "External id": 74033, "cbid": 211, "correlation": 74033 } }, { "ph": "s", "id": 74033, "pid": 76337, "tid": -914061504, "ts": 1716454222512452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222512526, "dur": 0, "args": { "External id": 74043, "cbid": 317, "correlation": 74043 } }, { "ph": "f", "id": 74043, "pid": 76337, "tid": -914061504, "ts": 1716454222512526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222512526, "dur": 0, "args": { "External id": 74044, "cbid": 203, "correlation": 74044 } }, { "ph": "f", "id": 74044, "pid": 76337, "tid": -914061504, "ts": 1716454222512526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222512527, "dur": 0, "args": { "External id": 74045, "cbid": 205, "correlation": 74045 } }, { "ph": "f", "id": 74045, "pid": 76337, "tid": -914061504, "ts": 1716454222512527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222557244, "dur": 41, "args": { "External id": 74049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74049, "pid": 5, "tid": 7, "ts": 1716454222557244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512544, "dur": 12, "args": { "External id": 74049, "cbid": 211, "correlation": 74049 } }, { "ph": "s", "id": 74049, "pid": 76337, "tid": -914061504, "ts": 1716454222512544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222557287, "dur": 14, "args": { "External id": 74051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74051, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74051, "pid": 5, "tid": 7, "ts": 1716454222557287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512559, "dur": 5, "args": { "External id": 74051, "cbid": 211, "correlation": 74051 } }, { "ph": "s", "id": 74051, "pid": 76337, "tid": -914061504, "ts": 1716454222512559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222557303, "dur": 1, "args": { "External id": 74053, "device": 5, "context": 1, "stream": 7, "correlation": 74053, "bytes": 1536, "memory bandwidth (GB/s)": 0.8894035900405327 } }, { "ph": "f", "id": 74053, "pid": 5, "tid": 7, "ts": 1716454222557303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222512601, "dur": 18, "args": { "External id": 74053, "cbid": 51, "correlation": 74053 } }, { "ph": "s", "id": 74053, "pid": 76337, "tid": -914061504, "ts": 1716454222512601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222557307, "dur": 367, "args": { "External id": 74054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74054, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74054, "pid": 5, "tid": 7, "ts": 1716454222557307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512620, "dur": 10, "args": { "External id": 74054, "cbid": 211, "correlation": 74054 } }, { "ph": "s", "id": 74054, "pid": 76337, "tid": -914061504, "ts": 1716454222512620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222557676, "dur": 14, "args": { "External id": 74056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74056, "pid": 5, "tid": 7, "ts": 1716454222557676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512638, "dur": 7, "args": { "External id": 74056, "cbid": 211, "correlation": 74056 } }, { "ph": "s", "id": 74056, "pid": 76337, "tid": -914061504, "ts": 1716454222512638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222557691, "dur": 16, "args": { "External id": 74062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74062, "pid": 5, "tid": 7, "ts": 1716454222557691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512669, "dur": 9, "args": { "External id": 74062, "cbid": 211, "correlation": 74062 } }, { "ph": "s", "id": 74062, "pid": 76337, "tid": -914061504, "ts": 1716454222512669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222557708, "dur": 19, "args": { "External id": 74082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74082, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74082, "pid": 5, "tid": 7, "ts": 1716454222557708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512765, "dur": 13, "args": { "External id": 74082, "cbid": 211, "correlation": 74082 } }, { "ph": "s", "id": 74082, "pid": 76337, "tid": -914061504, "ts": 1716454222512765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222557728, "dur": 4, "args": { "External id": 74094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74094, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 74094, "pid": 5, "tid": 7, "ts": 1716454222557728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512788, "dur": 6, "args": { "External id": 74094, "cbid": 211, "correlation": 74094 } }, { "ph": "s", "id": 74094, "pid": 76337, "tid": -914061504, "ts": 1716454222512788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222557734, "dur": 18, "args": { "External id": 74097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74097, "pid": 5, "tid": 7, "ts": 1716454222557734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512806, "dur": 6, "args": { "External id": 74097, "cbid": 211, "correlation": 74097 } }, { "ph": "s", "id": 74097, "pid": 76337, "tid": -914061504, "ts": 1716454222512806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222557753, "dur": 12, "args": { "External id": 74106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74106, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74106, "pid": 5, "tid": 7, "ts": 1716454222557753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512845, "dur": 9, "args": { "External id": 74106, "cbid": 211, "correlation": 74106 } }, { "ph": "s", "id": 74106, "pid": 76337, "tid": -914061504, "ts": 1716454222512845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222512900, "dur": 0, "args": { "External id": 74116, "cbid": 317, "correlation": 74116 } }, { "ph": "f", "id": 74116, "pid": 76337, "tid": -914061504, "ts": 1716454222512900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222512901, "dur": 0, "args": { "External id": 74117, "cbid": 203, "correlation": 74117 } }, { "ph": "f", "id": 74117, "pid": 76337, "tid": -914061504, "ts": 1716454222512901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222512902, "dur": 0, "args": { "External id": 74118, "cbid": 205, "correlation": 74118 } }, { "ph": "f", "id": 74118, "pid": 76337, "tid": -914061504, "ts": 1716454222512902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222557766, "dur": 12, "args": { "External id": 74122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74122, "pid": 5, "tid": 7, "ts": 1716454222557766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512917, "dur": 12, "args": { "External id": 74122, "cbid": 211, "correlation": 74122 } }, { "ph": "s", "id": 74122, "pid": 76337, "tid": -914061504, "ts": 1716454222512917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222557779, "dur": 25, "args": { "External id": 74124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74124, "pid": 5, "tid": 7, "ts": 1716454222557779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512932, "dur": 5, "args": { "External id": 74124, "cbid": 211, "correlation": 74124 } }, { "ph": "s", "id": 74124, "pid": 76337, "tid": -914061504, "ts": 1716454222512932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222557805, "dur": 4, "args": { "External id": 74126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 74126, "pid": 5, "tid": 7, "ts": 1716454222557805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512943, "dur": 6, "args": { "External id": 74126, "cbid": 211, "correlation": 74126 } }, { "ph": "s", "id": 74126, "pid": 76337, "tid": -914061504, "ts": 1716454222512943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222512965, "dur": 0, "args": { "External id": 74127, "cbid": 51, "correlation": 74127 } }, { "ph": "s", "id": 74127, "pid": 76337, "tid": -914061504, "ts": 1716454222512965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222557810, "dur": 362, "args": { "External id": 74128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74128, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74128, "pid": 5, "tid": 7, "ts": 1716454222557810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512966, "dur": 22, "args": { "External id": 74128, "cbid": 211, "correlation": 74128 } }, { "ph": "s", "id": 74128, "pid": 76337, "tid": -914061504, "ts": 1716454222512966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222558174, "dur": 21, "args": { "External id": 74129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74129, "pid": 5, "tid": 7, "ts": 1716454222558174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222512992, "dur": 6, "args": { "External id": 74129, "cbid": 211, "correlation": 74129 } }, { "ph": "s", "id": 74129, "pid": 76337, "tid": -914061504, "ts": 1716454222512992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222558197, "dur": 33, "args": { "External id": 74135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74135, "pid": 5, "tid": 7, "ts": 1716454222558197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513022, "dur": 8, "args": { "External id": 74135, "cbid": 211, "correlation": 74135 } }, { "ph": "s", "id": 74135, "pid": 76337, "tid": -914061504, "ts": 1716454222513022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222558231, "dur": 4, "args": { "External id": 74143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74143, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 74143, "pid": 5, "tid": 7, "ts": 1716454222558231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513067, "dur": 9, "args": { "External id": 74143, "cbid": 211, "correlation": 74143 } }, { "ph": "s", "id": 74143, "pid": 76337, "tid": -914061504, "ts": 1716454222513067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513134, "dur": 1, "args": { "External id": 74159, "cbid": 251, "correlation": 74159 } }, { "ph": "f", "id": 74159, "pid": 76337, "tid": -914061504, "ts": 1716454222513134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513139, "dur": 0, "args": { "External id": 74161, "cbid": 251, "correlation": 74161 } }, { "ph": "f", "id": 74161, "pid": 76337, "tid": -914061504, "ts": 1716454222513139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222558236, "dur": 12, "args": { "External id": 74162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74162, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 74162, "pid": 5, "tid": 7, "ts": 1716454222558236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513141, "dur": 12, "args": { "External id": 74162, "cbid": 211, "correlation": 74162 } }, { "ph": "s", "id": 74162, "pid": 76337, "tid": -914061504, "ts": 1716454222513141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222558249, "dur": 5, "args": { "External id": 74164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74164, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 74164, "pid": 5, "tid": 7, "ts": 1716454222558249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513155, "dur": 5, "args": { "External id": 74164, "cbid": 211, "correlation": 74164 } }, { "ph": "s", "id": 74164, "pid": 76337, "tid": -914061504, "ts": 1716454222513155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222558255, "dur": 29, "args": { "External id": 74174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74174, "pid": 5, "tid": 7, "ts": 1716454222558255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513214, "dur": 12, "args": { "External id": 74174, "cbid": 211, "correlation": 74174 } }, { "ph": "s", "id": 74174, "pid": 76337, "tid": -914061504, "ts": 1716454222513214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222558286, "dur": 30, "args": { "External id": 74194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74194, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74194, "pid": 5, "tid": 7, "ts": 1716454222558286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513281, "dur": 10, "args": { "External id": 74194, "cbid": 211, "correlation": 74194 } }, { "ph": "s", "id": 74194, "pid": 76337, "tid": -914061504, "ts": 1716454222513281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222558317, "dur": 4, "args": { "External id": 74206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74206, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 74206, "pid": 5, "tid": 7, "ts": 1716454222558317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513301, "dur": 6, "args": { "External id": 74206, "cbid": 211, "correlation": 74206 } }, { "ph": "s", "id": 74206, "pid": 76337, "tid": -914061504, "ts": 1716454222513301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222558322, "dur": 30, "args": { "External id": 74209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74209, "pid": 5, "tid": 7, "ts": 1716454222558322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513319, "dur": 7, "args": { "External id": 74209, "cbid": 211, "correlation": 74209 } }, { "ph": "s", "id": 74209, "pid": 76337, "tid": -914061504, "ts": 1716454222513319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222558354, "dur": 20, "args": { "External id": 74218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74218, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74218, "pid": 5, "tid": 7, "ts": 1716454222558354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513359, "dur": 10, "args": { "External id": 74218, "cbid": 211, "correlation": 74218 } }, { "ph": "s", "id": 74218, "pid": 76337, "tid": -914061504, "ts": 1716454222513359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222513423, "dur": 0, "args": { "External id": 74228, "cbid": 317, "correlation": 74228 } }, { "ph": "f", "id": 74228, "pid": 76337, "tid": -914061504, "ts": 1716454222513423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222513424, "dur": 0, "args": { "External id": 74229, "cbid": 203, "correlation": 74229 } }, { "ph": "f", "id": 74229, "pid": 76337, "tid": -914061504, "ts": 1716454222513424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222513425, "dur": 0, "args": { "External id": 74230, "cbid": 205, "correlation": 74230 } }, { "ph": "f", "id": 74230, "pid": 76337, "tid": -914061504, "ts": 1716454222513425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222558376, "dur": 23, "args": { "External id": 74234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74234, "pid": 5, "tid": 7, "ts": 1716454222558376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513442, "dur": 12, "args": { "External id": 74234, "cbid": 211, "correlation": 74234 } }, { "ph": "s", "id": 74234, "pid": 76337, "tid": -914061504, "ts": 1716454222513442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222558400, "dur": 45, "args": { "External id": 74236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74236, "pid": 5, "tid": 7, "ts": 1716454222558400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513457, "dur": 5, "args": { "External id": 74236, "cbid": 211, "correlation": 74236 } }, { "ph": "s", "id": 74236, "pid": 76337, "tid": -914061504, "ts": 1716454222513457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222558447, "dur": 664, "args": { "External id": 74238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74238, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74238, "pid": 5, "tid": 7, "ts": 1716454222558447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513471, "dur": 10, "args": { "External id": 74238, "cbid": 211, "correlation": 74238 } }, { "ph": "s", "id": 74238, "pid": 76337, "tid": -914061504, "ts": 1716454222513471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222559112, "dur": 22, "args": { "External id": 74240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74240, "pid": 5, "tid": 7, "ts": 1716454222559112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513485, "dur": 5, "args": { "External id": 74240, "cbid": 211, "correlation": 74240 } }, { "ph": "s", "id": 74240, "pid": 76337, "tid": -914061504, "ts": 1716454222513485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222559136, "dur": 33, "args": { "External id": 74246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74246, "pid": 5, "tid": 7, "ts": 1716454222559136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513512, "dur": 8, "args": { "External id": 74246, "cbid": 211, "correlation": 74246 } }, { "ph": "s", "id": 74246, "pid": 76337, "tid": -914061504, "ts": 1716454222513512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222513570, "dur": 0, "args": { "External id": 74256, "cbid": 317, "correlation": 74256 } }, { "ph": "f", "id": 74256, "pid": 76337, "tid": -914061504, "ts": 1716454222513570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222513571, "dur": 0, "args": { "External id": 74257, "cbid": 203, "correlation": 74257 } }, { "ph": "f", "id": 74257, "pid": 76337, "tid": -914061504, "ts": 1716454222513571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222513571, "dur": 0, "args": { "External id": 74258, "cbid": 205, "correlation": 74258 } }, { "ph": "f", "id": 74258, "pid": 76337, "tid": -914061504, "ts": 1716454222513571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513594, "dur": 1, "args": { "External id": 74262, "cbid": 251, "correlation": 74262 } }, { "ph": "f", "id": 74262, "pid": 76337, "tid": -914061504, "ts": 1716454222513594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513596, "dur": 0, "args": { "External id": 74263, "cbid": 251, "correlation": 74263 } }, { "ph": "f", "id": 74263, "pid": 76337, "tid": -914061504, "ts": 1716454222513596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513596, "dur": 0, "args": { "External id": 74264, "cbid": 251, "correlation": 74264 } }, { "ph": "f", "id": 74264, "pid": 76337, "tid": -914061504, "ts": 1716454222513596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513597, "dur": 0, "args": { "External id": 74265, "cbid": 251, "correlation": 74265 } }, { "ph": "f", "id": 74265, "pid": 76337, "tid": -914061504, "ts": 1716454222513597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513598, "dur": 0, "args": { "External id": 74266, "cbid": 251, "correlation": 74266 } }, { "ph": "f", "id": 74266, "pid": 76337, "tid": -914061504, "ts": 1716454222513598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513599, "dur": 0, "args": { "External id": 74267, "cbid": 251, "correlation": 74267 } }, { "ph": "f", "id": 74267, "pid": 76337, "tid": -914061504, "ts": 1716454222513599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513600, "dur": 0, "args": { "External id": 74268, "cbid": 251, "correlation": 74268 } }, { "ph": "f", "id": 74268, "pid": 76337, "tid": -914061504, "ts": 1716454222513600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513600, "dur": 0, "args": { "External id": 74269, "cbid": 251, "correlation": 74269 } }, { "ph": "f", "id": 74269, "pid": 76337, "tid": -914061504, "ts": 1716454222513600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222513602, "dur": 0, "args": { "External id": 74270, "cbid": 251, "correlation": 74270 } }, { "ph": "f", "id": 74270, "pid": 76337, "tid": -914061504, "ts": 1716454222513602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222559170, "dur": 51, "args": { "External id": 74271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74271, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 74271, "pid": 5, "tid": 7, "ts": 1716454222559170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513604, "dur": 12, "args": { "External id": 74271, "cbid": 211, "correlation": 74271 } }, { "ph": "s", "id": 74271, "pid": 76337, "tid": -914061504, "ts": 1716454222513604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222559223, "dur": 33, "args": { "External id": 74277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74277, "pid": 5, "tid": 7, "ts": 1716454222559223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513635, "dur": 9, "args": { "External id": 74277, "cbid": 211, "correlation": 74277 } }, { "ph": "s", "id": 74277, "pid": 76337, "tid": -914061504, "ts": 1716454222513635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222559257, "dur": 27, "args": { "External id": 74285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74285, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74285, "pid": 5, "tid": 7, "ts": 1716454222559257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513665, "dur": 8, "args": { "External id": 74285, "cbid": 211, "correlation": 74285 } }, { "ph": "s", "id": 74285, "pid": 76337, "tid": -914061504, "ts": 1716454222513665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222559285, "dur": 19, "args": { "External id": 74293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74293, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74293, "pid": 5, "tid": 7, "ts": 1716454222559285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513694, "dur": 9, "args": { "External id": 74293, "cbid": 211, "correlation": 74293 } }, { "ph": "s", "id": 74293, "pid": 76337, "tid": -914061504, "ts": 1716454222513694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222559306, "dur": 31, "args": { "External id": 74313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74313, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74313, "pid": 5, "tid": 7, "ts": 1716454222559306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513778, "dur": 12, "args": { "External id": 74313, "cbid": 211, "correlation": 74313 } }, { "ph": "s", "id": 74313, "pid": 76337, "tid": -914061504, "ts": 1716454222513778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222559338, "dur": 4, "args": { "External id": 74325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74325, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 74325, "pid": 5, "tid": 7, "ts": 1716454222559338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513800, "dur": 6, "args": { "External id": 74325, "cbid": 211, "correlation": 74325 } }, { "ph": "s", "id": 74325, "pid": 76337, "tid": -914061504, "ts": 1716454222513800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222559343, "dur": 30, "args": { "External id": 74328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74328, "pid": 5, "tid": 7, "ts": 1716454222559343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513817, "dur": 7, "args": { "External id": 74328, "cbid": 211, "correlation": 74328 } }, { "ph": "s", "id": 74328, "pid": 76337, "tid": -914061504, "ts": 1716454222513817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222513875, "dur": 0, "args": { "External id": 74339, "cbid": 317, "correlation": 74339 } }, { "ph": "f", "id": 74339, "pid": 76337, "tid": -914061504, "ts": 1716454222513875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222513876, "dur": 0, "args": { "External id": 74340, "cbid": 203, "correlation": 74340 } }, { "ph": "f", "id": 74340, "pid": 76337, "tid": -914061504, "ts": 1716454222513876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222513877, "dur": 0, "args": { "External id": 74341, "cbid": 205, "correlation": 74341 } }, { "ph": "f", "id": 74341, "pid": 76337, "tid": -914061504, "ts": 1716454222513877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222559375, "dur": 23, "args": { "External id": 74345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74345, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74345, "pid": 5, "tid": 7, "ts": 1716454222559375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513892, "dur": 12, "args": { "External id": 74345, "cbid": 211, "correlation": 74345 } }, { "ph": "s", "id": 74345, "pid": 76337, "tid": -914061504, "ts": 1716454222513892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222559400, "dur": 123, "args": { "External id": 74347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74347, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74347, "pid": 5, "tid": 7, "ts": 1716454222559400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513912, "dur": 8, "args": { "External id": 74347, "cbid": 211, "correlation": 74347 } }, { "ph": "s", "id": 74347, "pid": 76337, "tid": -914061504, "ts": 1716454222513912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222559525, "dur": 22, "args": { "External id": 74349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74349, "pid": 5, "tid": 7, "ts": 1716454222559525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513924, "dur": 5, "args": { "External id": 74349, "cbid": 211, "correlation": 74349 } }, { "ph": "s", "id": 74349, "pid": 76337, "tid": -914061504, "ts": 1716454222513924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222559548, "dur": 33, "args": { "External id": 74355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74355, "pid": 5, "tid": 7, "ts": 1716454222559548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222513952, "dur": 8, "args": { "External id": 74355, "cbid": 211, "correlation": 74355 } }, { "ph": "s", "id": 74355, "pid": 76337, "tid": -914061504, "ts": 1716454222513952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222559581, "dur": 174, "args": { "External id": 74364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74364, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74364, "pid": 5, "tid": 7, "ts": 1716454222559581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514046, "dur": 15, "args": { "External id": 74364, "cbid": 211, "correlation": 74364 } }, { "ph": "s", "id": 74364, "pid": 76337, "tid": -914061504, "ts": 1716454222514046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222559757, "dur": 66, "args": { "External id": 74386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74386, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74386, "pid": 5, "tid": 7, "ts": 1716454222559757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514104, "dur": 10, "args": { "External id": 74386, "cbid": 211, "correlation": 74386 } }, { "ph": "s", "id": 74386, "pid": 76337, "tid": -914061504, "ts": 1716454222514104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514194, "dur": 1, "args": { "External id": 74397, "cbid": 251, "correlation": 74397 } }, { "ph": "f", "id": 74397, "pid": 76337, "tid": -914061504, "ts": 1716454222514194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222559824, "dur": 157, "args": { "External id": 74398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74398, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74398, "pid": 5, "tid": 7, "ts": 1716454222559824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514199, "dur": 12, "args": { "External id": 74398, "cbid": 211, "correlation": 74398 } }, { "ph": "s", "id": 74398, "pid": 76337, "tid": -914061504, "ts": 1716454222514199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514268, "dur": 1, "args": { "External id": 74409, "cbid": 251, "correlation": 74409 } }, { "ph": "f", "id": 74409, "pid": 76337, "tid": -914061504, "ts": 1716454222514268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222559982, "dur": 151, "args": { "External id": 74410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74410, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74410, "pid": 5, "tid": 7, "ts": 1716454222559982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514272, "dur": 12, "args": { "External id": 74410, "cbid": 211, "correlation": 74410 } }, { "ph": "s", "id": 74410, "pid": 76337, "tid": -914061504, "ts": 1716454222514272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514339, "dur": 1, "args": { "External id": 74421, "cbid": 251, "correlation": 74421 } }, { "ph": "f", "id": 74421, "pid": 76337, "tid": -914061504, "ts": 1716454222514339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222560134, "dur": 147, "args": { "External id": 74422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74422, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74422, "pid": 5, "tid": 7, "ts": 1716454222560134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514343, "dur": 11, "args": { "External id": 74422, "cbid": 211, "correlation": 74422 } }, { "ph": "s", "id": 74422, "pid": 76337, "tid": -914061504, "ts": 1716454222514343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222560283, "dur": 1990, "args": { "External id": 74443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74443, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 74443, "pid": 5, "tid": 7, "ts": 1716454222560283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514425, "dur": 13, "args": { "External id": 74443, "cbid": 211, "correlation": 74443 } }, { "ph": "s", "id": 74443, "pid": 76337, "tid": -914061504, "ts": 1716454222514425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514526, "dur": 1, "args": { "External id": 74461, "cbid": 251, "correlation": 74461 } }, { "ph": "f", "id": 74461, "pid": 76337, "tid": -914061504, "ts": 1716454222514526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222562275, "dur": 149, "args": { "External id": 74463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74463, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 74463, "pid": 5, "tid": 7, "ts": 1716454222562275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514532, "dur": 13, "args": { "External id": 74463, "cbid": 211, "correlation": 74463 } }, { "ph": "s", "id": 74463, "pid": 76337, "tid": -914061504, "ts": 1716454222514532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222562425, "dur": 35, "args": { "External id": 74471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74471, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74471, "pid": 5, "tid": 7, "ts": 1716454222562425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514603, "dur": 12, "args": { "External id": 74471, "cbid": 211, "correlation": 74471 } }, { "ph": "s", "id": 74471, "pid": 76337, "tid": -914061504, "ts": 1716454222514603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222562462, "dur": 51, "args": { "External id": 74479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74479, "pid": 5, "tid": 7, "ts": 1716454222562462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514642, "dur": 9, "args": { "External id": 74479, "cbid": 211, "correlation": 74479 } }, { "ph": "s", "id": 74479, "pid": 76337, "tid": -914061504, "ts": 1716454222514642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222562514, "dur": 30, "args": { "External id": 74490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74490, "pid": 5, "tid": 7, "ts": 1716454222562514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514715, "dur": 13, "args": { "External id": 74490, "cbid": 211, "correlation": 74490 } }, { "ph": "s", "id": 74490, "pid": 76337, "tid": -914061504, "ts": 1716454222514715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222562545, "dur": 35, "args": { "External id": 74512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74512, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74512, "pid": 5, "tid": 7, "ts": 1716454222562545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514746, "dur": 8, "args": { "External id": 74512, "cbid": 211, "correlation": 74512 } }, { "ph": "s", "id": 74512, "pid": 76337, "tid": -914061504, "ts": 1716454222514746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514830, "dur": 1, "args": { "External id": 74523, "cbid": 251, "correlation": 74523 } }, { "ph": "f", "id": 74523, "pid": 76337, "tid": -914061504, "ts": 1716454222514830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222562582, "dur": 91, "args": { "External id": 74524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74524, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74524, "pid": 5, "tid": 7, "ts": 1716454222562582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514836, "dur": 12, "args": { "External id": 74524, "cbid": 211, "correlation": 74524 } }, { "ph": "s", "id": 74524, "pid": 76337, "tid": -914061504, "ts": 1716454222514836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514905, "dur": 1, "args": { "External id": 74535, "cbid": 251, "correlation": 74535 } }, { "ph": "f", "id": 74535, "pid": 76337, "tid": -914061504, "ts": 1716454222514905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514908, "dur": 0, "args": { "External id": 74536, "cbid": 251, "correlation": 74536 } }, { "ph": "f", "id": 74536, "pid": 76337, "tid": -914061504, "ts": 1716454222514908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222562674, "dur": 11, "args": { "External id": 74537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74537, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 74537, "pid": 5, "tid": 7, "ts": 1716454222562674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514910, "dur": 12, "args": { "External id": 74537, "cbid": 211, "correlation": 74537 } }, { "ph": "s", "id": 74537, "pid": 76337, "tid": -914061504, "ts": 1716454222514910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222562686, "dur": 5, "args": { "External id": 74539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74539, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 74539, "pid": 5, "tid": 7, "ts": 1716454222562686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514925, "dur": 6, "args": { "External id": 74539, "cbid": 211, "correlation": 74539 } }, { "ph": "s", "id": 74539, "pid": 76337, "tid": -914061504, "ts": 1716454222514925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514991, "dur": 1, "args": { "External id": 74550, "cbid": 251, "correlation": 74550 } }, { "ph": "f", "id": 74550, "pid": 76337, "tid": -914061504, "ts": 1716454222514991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222514995, "dur": 0, "args": { "External id": 74551, "cbid": 251, "correlation": 74551 } }, { "ph": "f", "id": 74551, "pid": 76337, "tid": -914061504, "ts": 1716454222514995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222562692, "dur": 7, "args": { "External id": 74552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74552, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 74552, "pid": 5, "tid": 7, "ts": 1716454222562692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222514996, "dur": 12, "args": { "External id": 74552, "cbid": 211, "correlation": 74552 } }, { "ph": "s", "id": 74552, "pid": 76337, "tid": -914061504, "ts": 1716454222514996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222562700, "dur": 3, "args": { "External id": 74554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74554, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 74554, "pid": 5, "tid": 7, "ts": 1716454222562700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515010, "dur": 5, "args": { "External id": 74554, "cbid": 211, "correlation": 74554 } }, { "ph": "s", "id": 74554, "pid": 76337, "tid": -914061504, "ts": 1716454222515010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222562705, "dur": 94, "args": { "External id": 74575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74575, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 74575, "pid": 5, "tid": 7, "ts": 1716454222562705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515085, "dur": 14, "args": { "External id": 74575, "cbid": 211, "correlation": 74575 } }, { "ph": "s", "id": 74575, "pid": 76337, "tid": -914061504, "ts": 1716454222515085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222515181, "dur": 1, "args": { "External id": 74593, "cbid": 251, "correlation": 74593 } }, { "ph": "f", "id": 74593, "pid": 76337, "tid": -914061504, "ts": 1716454222515181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222562800, "dur": 99, "args": { "External id": 74595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74595, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74595, "pid": 5, "tid": 7, "ts": 1716454222562800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515187, "dur": 13, "args": { "External id": 74595, "cbid": 211, "correlation": 74595 } }, { "ph": "s", "id": 74595, "pid": 76337, "tid": -914061504, "ts": 1716454222515187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222562901, "dur": 19, "args": { "External id": 74603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74603, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74603, "pid": 5, "tid": 7, "ts": 1716454222562901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515256, "dur": 13, "args": { "External id": 74603, "cbid": 211, "correlation": 74603 } }, { "ph": "s", "id": 74603, "pid": 76337, "tid": -914061504, "ts": 1716454222515256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222562921, "dur": 37, "args": { "External id": 74611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74611, "pid": 5, "tid": 7, "ts": 1716454222562921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515298, "dur": 9, "args": { "External id": 74611, "cbid": 211, "correlation": 74611 } }, { "ph": "s", "id": 74611, "pid": 76337, "tid": -914061504, "ts": 1716454222515298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222562960, "dur": 35, "args": { "External id": 74633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74633, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74633, "pid": 5, "tid": 7, "ts": 1716454222562960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515351, "dur": 10, "args": { "External id": 74633, "cbid": 211, "correlation": 74633 } }, { "ph": "s", "id": 74633, "pid": 76337, "tid": -914061504, "ts": 1716454222515351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222515440, "dur": 1, "args": { "External id": 74649, "cbid": 251, "correlation": 74649 } }, { "ph": "f", "id": 74649, "pid": 76337, "tid": -914061504, "ts": 1716454222515440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222515446, "dur": 0, "args": { "External id": 74651, "cbid": 251, "correlation": 74651 } }, { "ph": "f", "id": 74651, "pid": 76337, "tid": -914061504, "ts": 1716454222515446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222562997, "dur": 551, "args": { "External id": 74652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74652, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 74652, "pid": 5, "tid": 7, "ts": 1716454222562997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515449, "dur": 12, "args": { "External id": 74652, "cbid": 211, "correlation": 74652 } }, { "ph": "s", "id": 74652, "pid": 76337, "tid": -914061504, "ts": 1716454222515449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222563549, "dur": 127, "args": { "External id": 74660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74660, "pid": 5, "tid": 7, "ts": 1716454222563549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515516, "dur": 13, "args": { "External id": 74660, "cbid": 211, "correlation": 74660 } }, { "ph": "s", "id": 74660, "pid": 76337, "tid": -914061504, "ts": 1716454222515516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222563677, "dur": 129, "args": { "External id": 74668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74668, "pid": 5, "tid": 7, "ts": 1716454222563677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515548, "dur": 8, "args": { "External id": 74668, "cbid": 211, "correlation": 74668 } }, { "ph": "s", "id": 74668, "pid": 76337, "tid": -914061504, "ts": 1716454222515548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222515626, "dur": 1, "args": { "External id": 74684, "cbid": 251, "correlation": 74684 } }, { "ph": "f", "id": 74684, "pid": 76337, "tid": -914061504, "ts": 1716454222515626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222563807, "dur": 308, "args": { "External id": 74686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74686, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74686, "pid": 5, "tid": 7, "ts": 1716454222563807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515631, "dur": 13, "args": { "External id": 74686, "cbid": 211, "correlation": 74686 } }, { "ph": "s", "id": 74686, "pid": 76337, "tid": -914061504, "ts": 1716454222515631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222564117, "dur": 27, "args": { "External id": 74694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74694, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74694, "pid": 5, "tid": 7, "ts": 1716454222564117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515674, "dur": 10, "args": { "External id": 74694, "cbid": 211, "correlation": 74694 } }, { "ph": "s", "id": 74694, "pid": 76337, "tid": -914061504, "ts": 1716454222515674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222564145, "dur": 83, "args": { "External id": 74705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74705, "pid": 5, "tid": 7, "ts": 1716454222564145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515743, "dur": 12, "args": { "External id": 74705, "cbid": 211, "correlation": 74705 } }, { "ph": "s", "id": 74705, "pid": 76337, "tid": -914061504, "ts": 1716454222515743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222515807, "dur": 0, "args": { "External id": 74717, "cbid": 317, "correlation": 74717 } }, { "ph": "f", "id": 74717, "pid": 76337, "tid": -914061504, "ts": 1716454222515807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222515808, "dur": 0, "args": { "External id": 74718, "cbid": 203, "correlation": 74718 } }, { "ph": "f", "id": 74718, "pid": 76337, "tid": -914061504, "ts": 1716454222515808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222515809, "dur": 0, "args": { "External id": 74719, "cbid": 205, "correlation": 74719 } }, { "ph": "f", "id": 74719, "pid": 76337, "tid": -914061504, "ts": 1716454222515809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222564230, "dur": 23, "args": { "External id": 74723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74723, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74723, "pid": 5, "tid": 7, "ts": 1716454222564230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515826, "dur": 12, "args": { "External id": 74723, "cbid": 211, "correlation": 74723 } }, { "ph": "s", "id": 74723, "pid": 76337, "tid": -914061504, "ts": 1716454222515826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222564254, "dur": 122, "args": { "External id": 74725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74725, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74725, "pid": 5, "tid": 7, "ts": 1716454222564254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515845, "dur": 7, "args": { "External id": 74725, "cbid": 211, "correlation": 74725 } }, { "ph": "s", "id": 74725, "pid": 76337, "tid": -914061504, "ts": 1716454222515845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222564377, "dur": 23, "args": { "External id": 74727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74727, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74727, "pid": 5, "tid": 7, "ts": 1716454222564377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515856, "dur": 5, "args": { "External id": 74727, "cbid": 211, "correlation": 74727 } }, { "ph": "s", "id": 74727, "pid": 76337, "tid": -914061504, "ts": 1716454222515856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222564401, "dur": 33, "args": { "External id": 74733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74733, "pid": 5, "tid": 7, "ts": 1716454222564401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515883, "dur": 8, "args": { "External id": 74733, "cbid": 211, "correlation": 74733 } }, { "ph": "s", "id": 74733, "pid": 76337, "tid": -914061504, "ts": 1716454222515883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222564435, "dur": 27, "args": { "External id": 74741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74741, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74741, "pid": 5, "tid": 7, "ts": 1716454222564435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515915, "dur": 8, "args": { "External id": 74741, "cbid": 211, "correlation": 74741 } }, { "ph": "s", "id": 74741, "pid": 76337, "tid": -914061504, "ts": 1716454222515915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222564463, "dur": 31, "args": { "External id": 74761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74761, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74761, "pid": 5, "tid": 7, "ts": 1716454222564463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222515995, "dur": 13, "args": { "External id": 74761, "cbid": 211, "correlation": 74761 } }, { "ph": "s", "id": 74761, "pid": 76337, "tid": -914061504, "ts": 1716454222515995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222564495, "dur": 4, "args": { "External id": 74773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74773, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 74773, "pid": 5, "tid": 7, "ts": 1716454222564495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516018, "dur": 6, "args": { "External id": 74773, "cbid": 211, "correlation": 74773 } }, { "ph": "s", "id": 74773, "pid": 76337, "tid": -914061504, "ts": 1716454222516018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222564501, "dur": 31, "args": { "External id": 74776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74776, "pid": 5, "tid": 7, "ts": 1716454222564501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516037, "dur": 6, "args": { "External id": 74776, "cbid": 211, "correlation": 74776 } }, { "ph": "s", "id": 74776, "pid": 76337, "tid": -914061504, "ts": 1716454222516037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222564533, "dur": 21, "args": { "External id": 74785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74785, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74785, "pid": 5, "tid": 7, "ts": 1716454222564533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516077, "dur": 9, "args": { "External id": 74785, "cbid": 211, "correlation": 74785 } }, { "ph": "s", "id": 74785, "pid": 76337, "tid": -914061504, "ts": 1716454222516077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222516127, "dur": 0, "args": { "External id": 74795, "cbid": 317, "correlation": 74795 } }, { "ph": "f", "id": 74795, "pid": 76337, "tid": -914061504, "ts": 1716454222516127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222516128, "dur": 0, "args": { "External id": 74796, "cbid": 203, "correlation": 74796 } }, { "ph": "f", "id": 74796, "pid": 76337, "tid": -914061504, "ts": 1716454222516128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222516129, "dur": 0, "args": { "External id": 74797, "cbid": 205, "correlation": 74797 } }, { "ph": "f", "id": 74797, "pid": 76337, "tid": -914061504, "ts": 1716454222516129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222564555, "dur": 23, "args": { "External id": 74801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74801, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74801, "pid": 5, "tid": 7, "ts": 1716454222564555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516142, "dur": 11, "args": { "External id": 74801, "cbid": 211, "correlation": 74801 } }, { "ph": "s", "id": 74801, "pid": 76337, "tid": -914061504, "ts": 1716454222516142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222564579, "dur": 45, "args": { "External id": 74803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74803, "pid": 5, "tid": 7, "ts": 1716454222564579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516156, "dur": 6, "args": { "External id": 74803, "cbid": 211, "correlation": 74803 } }, { "ph": "s", "id": 74803, "pid": 76337, "tid": -914061504, "ts": 1716454222516156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222564626, "dur": 665, "args": { "External id": 74805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74805, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74805, "pid": 5, "tid": 7, "ts": 1716454222564626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516168, "dur": 6, "args": { "External id": 74805, "cbid": 211, "correlation": 74805 } }, { "ph": "s", "id": 74805, "pid": 76337, "tid": -914061504, "ts": 1716454222516168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222565292, "dur": 22, "args": { "External id": 74807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74807, "pid": 5, "tid": 7, "ts": 1716454222565292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516178, "dur": 5, "args": { "External id": 74807, "cbid": 211, "correlation": 74807 } }, { "ph": "s", "id": 74807, "pid": 76337, "tid": -914061504, "ts": 1716454222516178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222565315, "dur": 33, "args": { "External id": 74813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74813, "pid": 5, "tid": 7, "ts": 1716454222565315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516205, "dur": 9, "args": { "External id": 74813, "cbid": 211, "correlation": 74813 } }, { "ph": "s", "id": 74813, "pid": 76337, "tid": -914061504, "ts": 1716454222516205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222565349, "dur": 3, "args": { "External id": 74821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74821, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 74821, "pid": 5, "tid": 7, "ts": 1716454222565349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516247, "dur": 9, "args": { "External id": 74821, "cbid": 211, "correlation": 74821 } }, { "ph": "s", "id": 74821, "pid": 76337, "tid": -914061504, "ts": 1716454222516247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222516313, "dur": 1, "args": { "External id": 74837, "cbid": 251, "correlation": 74837 } }, { "ph": "f", "id": 74837, "pid": 76337, "tid": -914061504, "ts": 1716454222516313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222516318, "dur": 0, "args": { "External id": 74839, "cbid": 251, "correlation": 74839 } }, { "ph": "f", "id": 74839, "pid": 76337, "tid": -914061504, "ts": 1716454222516318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222565354, "dur": 12, "args": { "External id": 74840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74840, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 74840, "pid": 5, "tid": 7, "ts": 1716454222565354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516320, "dur": 11, "args": { "External id": 74840, "cbid": 211, "correlation": 74840 } }, { "ph": "s", "id": 74840, "pid": 76337, "tid": -914061504, "ts": 1716454222516320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222565368, "dur": 5, "args": { "External id": 74842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74842, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 74842, "pid": 5, "tid": 7, "ts": 1716454222565368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516333, "dur": 5, "args": { "External id": 74842, "cbid": 211, "correlation": 74842 } }, { "ph": "s", "id": 74842, "pid": 76337, "tid": -914061504, "ts": 1716454222516333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222565375, "dur": 29, "args": { "External id": 74852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74852, "pid": 5, "tid": 7, "ts": 1716454222565375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516391, "dur": 12, "args": { "External id": 74852, "cbid": 211, "correlation": 74852 } }, { "ph": "s", "id": 74852, "pid": 76337, "tid": -914061504, "ts": 1716454222516391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222565405, "dur": 30, "args": { "External id": 74872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74872, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74872, "pid": 5, "tid": 7, "ts": 1716454222565405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516456, "dur": 11, "args": { "External id": 74872, "cbid": 211, "correlation": 74872 } }, { "ph": "s", "id": 74872, "pid": 76337, "tid": -914061504, "ts": 1716454222516456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222565437, "dur": 4, "args": { "External id": 74884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74884, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 74884, "pid": 5, "tid": 7, "ts": 1716454222565437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516477, "dur": 6, "args": { "External id": 74884, "cbid": 211, "correlation": 74884 } }, { "ph": "s", "id": 74884, "pid": 76337, "tid": -914061504, "ts": 1716454222516477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222565442, "dur": 30, "args": { "External id": 74887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74887, "pid": 5, "tid": 7, "ts": 1716454222565442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516495, "dur": 6, "args": { "External id": 74887, "cbid": 211, "correlation": 74887 } }, { "ph": "s", "id": 74887, "pid": 76337, "tid": -914061504, "ts": 1716454222516495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222565474, "dur": 21, "args": { "External id": 74896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74896, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74896, "pid": 5, "tid": 7, "ts": 1716454222565474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516536, "dur": 9, "args": { "External id": 74896, "cbid": 211, "correlation": 74896 } }, { "ph": "s", "id": 74896, "pid": 76337, "tid": -914061504, "ts": 1716454222516536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222516598, "dur": 0, "args": { "External id": 74906, "cbid": 317, "correlation": 74906 } }, { "ph": "f", "id": 74906, "pid": 76337, "tid": -914061504, "ts": 1716454222516598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222516599, "dur": 0, "args": { "External id": 74907, "cbid": 203, "correlation": 74907 } }, { "ph": "f", "id": 74907, "pid": 76337, "tid": -914061504, "ts": 1716454222516599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222516600, "dur": 0, "args": { "External id": 74908, "cbid": 205, "correlation": 74908 } }, { "ph": "f", "id": 74908, "pid": 76337, "tid": -914061504, "ts": 1716454222516600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222565496, "dur": 24, "args": { "External id": 74912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74912, "pid": 5, "tid": 7, "ts": 1716454222565496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516614, "dur": 12, "args": { "External id": 74912, "cbid": 211, "correlation": 74912 } }, { "ph": "s", "id": 74912, "pid": 76337, "tid": -914061504, "ts": 1716454222516614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222565521, "dur": 44, "args": { "External id": 74914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74914, "pid": 5, "tid": 7, "ts": 1716454222565521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516628, "dur": 5, "args": { "External id": 74914, "cbid": 211, "correlation": 74914 } }, { "ph": "s", "id": 74914, "pid": 76337, "tid": -914061504, "ts": 1716454222516628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222565567, "dur": 655, "args": { "External id": 74916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74916, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74916, "pid": 5, "tid": 7, "ts": 1716454222565567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516640, "dur": 6, "args": { "External id": 74916, "cbid": 211, "correlation": 74916 } }, { "ph": "s", "id": 74916, "pid": 76337, "tid": -914061504, "ts": 1716454222516640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222566223, "dur": 22, "args": { "External id": 74918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74918, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74918, "pid": 5, "tid": 7, "ts": 1716454222566223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516649, "dur": 5, "args": { "External id": 74918, "cbid": 211, "correlation": 74918 } }, { "ph": "s", "id": 74918, "pid": 76337, "tid": -914061504, "ts": 1716454222516649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222566246, "dur": 34, "args": { "External id": 74924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74924, "pid": 5, "tid": 7, "ts": 1716454222566246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516677, "dur": 8, "args": { "External id": 74924, "cbid": 211, "correlation": 74924 } }, { "ph": "s", "id": 74924, "pid": 76337, "tid": -914061504, "ts": 1716454222516677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222566282, "dur": 27, "args": { "External id": 74932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74932, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74932, "pid": 5, "tid": 7, "ts": 1716454222566282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516708, "dur": 8, "args": { "External id": 74932, "cbid": 211, "correlation": 74932 } }, { "ph": "s", "id": 74932, "pid": 76337, "tid": -914061504, "ts": 1716454222516708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222566310, "dur": 20, "args": { "External id": 74940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74940, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74940, "pid": 5, "tid": 7, "ts": 1716454222566310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516738, "dur": 8, "args": { "External id": 74940, "cbid": 211, "correlation": 74940 } }, { "ph": "s", "id": 74940, "pid": 76337, "tid": -914061504, "ts": 1716454222516738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222566331, "dur": 31, "args": { "External id": 74960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74960, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 74960, "pid": 5, "tid": 7, "ts": 1716454222566331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516816, "dur": 12, "args": { "External id": 74960, "cbid": 211, "correlation": 74960 } }, { "ph": "s", "id": 74960, "pid": 76337, "tid": -914061504, "ts": 1716454222516816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222566364, "dur": 4, "args": { "External id": 74972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74972, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 74972, "pid": 5, "tid": 7, "ts": 1716454222566364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516837, "dur": 6, "args": { "External id": 74972, "cbid": 211, "correlation": 74972 } }, { "ph": "s", "id": 74972, "pid": 76337, "tid": -914061504, "ts": 1716454222516837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222566369, "dur": 31, "args": { "External id": 74975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74975, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74975, "pid": 5, "tid": 7, "ts": 1716454222566369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516855, "dur": 6, "args": { "External id": 74975, "cbid": 211, "correlation": 74975 } }, { "ph": "s", "id": 74975, "pid": 76337, "tid": -914061504, "ts": 1716454222516855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222516912, "dur": 0, "args": { "External id": 74986, "cbid": 317, "correlation": 74986 } }, { "ph": "f", "id": 74986, "pid": 76337, "tid": -914061504, "ts": 1716454222516912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222516912, "dur": 0, "args": { "External id": 74987, "cbid": 203, "correlation": 74987 } }, { "ph": "f", "id": 74987, "pid": 76337, "tid": -914061504, "ts": 1716454222516912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222516913, "dur": 0, "args": { "External id": 74988, "cbid": 205, "correlation": 74988 } }, { "ph": "f", "id": 74988, "pid": 76337, "tid": -914061504, "ts": 1716454222516913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222566401, "dur": 22, "args": { "External id": 74992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74992, "pid": 5, "tid": 7, "ts": 1716454222566401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516926, "dur": 11, "args": { "External id": 74992, "cbid": 211, "correlation": 74992 } }, { "ph": "s", "id": 74992, "pid": 76337, "tid": -914061504, "ts": 1716454222516926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222566424, "dur": 119, "args": { "External id": 74994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74994, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 74994, "pid": 5, "tid": 7, "ts": 1716454222566424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516944, "dur": 7, "args": { "External id": 74994, "cbid": 211, "correlation": 74994 } }, { "ph": "s", "id": 74994, "pid": 76337, "tid": -914061504, "ts": 1716454222516944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222566544, "dur": 22, "args": { "External id": 74996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 74996, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 74996, "pid": 5, "tid": 7, "ts": 1716454222566544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516954, "dur": 5, "args": { "External id": 74996, "cbid": 211, "correlation": 74996 } }, { "ph": "s", "id": 74996, "pid": 76337, "tid": -914061504, "ts": 1716454222516954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222566567, "dur": 33, "args": { "External id": 75002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75002, "pid": 5, "tid": 7, "ts": 1716454222566567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222516989, "dur": 10, "args": { "External id": 75002, "cbid": 211, "correlation": 75002 } }, { "ph": "s", "id": 75002, "pid": 76337, "tid": -914061504, "ts": 1716454222516989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222566602, "dur": 202, "args": { "External id": 75011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75011, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75011, "pid": 5, "tid": 7, "ts": 1716454222566602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517072, "dur": 13, "args": { "External id": 75011, "cbid": 211, "correlation": 75011 } }, { "ph": "s", "id": 75011, "pid": 76337, "tid": -914061504, "ts": 1716454222517072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222566805, "dur": 67, "args": { "External id": 75033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75033, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75033, "pid": 5, "tid": 7, "ts": 1716454222566805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517129, "dur": 10, "args": { "External id": 75033, "cbid": 211, "correlation": 75033 } }, { "ph": "s", "id": 75033, "pid": 76337, "tid": -914061504, "ts": 1716454222517129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517216, "dur": 1, "args": { "External id": 75044, "cbid": 251, "correlation": 75044 } }, { "ph": "f", "id": 75044, "pid": 76337, "tid": -914061504, "ts": 1716454222517216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222566873, "dur": 155, "args": { "External id": 75045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75045, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75045, "pid": 5, "tid": 7, "ts": 1716454222566873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517222, "dur": 12, "args": { "External id": 75045, "cbid": 211, "correlation": 75045 } }, { "ph": "s", "id": 75045, "pid": 76337, "tid": -914061504, "ts": 1716454222517222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517291, "dur": 1, "args": { "External id": 75056, "cbid": 251, "correlation": 75056 } }, { "ph": "f", "id": 75056, "pid": 76337, "tid": -914061504, "ts": 1716454222517291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222567030, "dur": 148, "args": { "External id": 75057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75057, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75057, "pid": 5, "tid": 7, "ts": 1716454222567030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517295, "dur": 12, "args": { "External id": 75057, "cbid": 211, "correlation": 75057 } }, { "ph": "s", "id": 75057, "pid": 76337, "tid": -914061504, "ts": 1716454222517295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517360, "dur": 1, "args": { "External id": 75068, "cbid": 251, "correlation": 75068 } }, { "ph": "f", "id": 75068, "pid": 76337, "tid": -914061504, "ts": 1716454222517360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222567180, "dur": 148, "args": { "External id": 75069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75069, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75069, "pid": 5, "tid": 7, "ts": 1716454222567180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517364, "dur": 11, "args": { "External id": 75069, "cbid": 211, "correlation": 75069 } }, { "ph": "s", "id": 75069, "pid": 76337, "tid": -914061504, "ts": 1716454222517364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222567329, "dur": 1988, "args": { "External id": 75090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75090, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 75090, "pid": 5, "tid": 7, "ts": 1716454222567329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517442, "dur": 14, "args": { "External id": 75090, "cbid": 211, "correlation": 75090 } }, { "ph": "s", "id": 75090, "pid": 76337, "tid": -914061504, "ts": 1716454222517442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517541, "dur": 1, "args": { "External id": 75108, "cbid": 251, "correlation": 75108 } }, { "ph": "f", "id": 75108, "pid": 76337, "tid": -914061504, "ts": 1716454222517541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222569319, "dur": 151, "args": { "External id": 75110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75110, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75110, "pid": 5, "tid": 7, "ts": 1716454222569319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517547, "dur": 13, "args": { "External id": 75110, "cbid": 211, "correlation": 75110 } }, { "ph": "s", "id": 75110, "pid": 76337, "tid": -914061504, "ts": 1716454222517547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222569471, "dur": 35, "args": { "External id": 75118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75118, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75118, "pid": 5, "tid": 7, "ts": 1716454222569471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517619, "dur": 12, "args": { "External id": 75118, "cbid": 211, "correlation": 75118 } }, { "ph": "s", "id": 75118, "pid": 76337, "tid": -914061504, "ts": 1716454222517619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222569508, "dur": 50, "args": { "External id": 75126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75126, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75126, "pid": 5, "tid": 7, "ts": 1716454222569508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517657, "dur": 8, "args": { "External id": 75126, "cbid": 211, "correlation": 75126 } }, { "ph": "s", "id": 75126, "pid": 76337, "tid": -914061504, "ts": 1716454222517657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222569559, "dur": 31, "args": { "External id": 75137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75137, "pid": 5, "tid": 7, "ts": 1716454222569559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517729, "dur": 13, "args": { "External id": 75137, "cbid": 211, "correlation": 75137 } }, { "ph": "s", "id": 75137, "pid": 76337, "tid": -914061504, "ts": 1716454222517729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222569591, "dur": 35, "args": { "External id": 75159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75159, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75159, "pid": 5, "tid": 7, "ts": 1716454222569591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517760, "dur": 7, "args": { "External id": 75159, "cbid": 211, "correlation": 75159 } }, { "ph": "s", "id": 75159, "pid": 76337, "tid": -914061504, "ts": 1716454222517760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517845, "dur": 1, "args": { "External id": 75170, "cbid": 251, "correlation": 75170 } }, { "ph": "f", "id": 75170, "pid": 76337, "tid": -914061504, "ts": 1716454222517845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222569628, "dur": 90, "args": { "External id": 75171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75171, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75171, "pid": 5, "tid": 7, "ts": 1716454222569628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517850, "dur": 12, "args": { "External id": 75171, "cbid": 211, "correlation": 75171 } }, { "ph": "s", "id": 75171, "pid": 76337, "tid": -914061504, "ts": 1716454222517850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517918, "dur": 1, "args": { "External id": 75182, "cbid": 251, "correlation": 75182 } }, { "ph": "f", "id": 75182, "pid": 76337, "tid": -914061504, "ts": 1716454222517918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222517921, "dur": 0, "args": { "External id": 75183, "cbid": 251, "correlation": 75183 } }, { "ph": "f", "id": 75183, "pid": 76337, "tid": -914061504, "ts": 1716454222517921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222569720, "dur": 11, "args": { "External id": 75184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75184, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 75184, "pid": 5, "tid": 7, "ts": 1716454222569720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517923, "dur": 12, "args": { "External id": 75184, "cbid": 211, "correlation": 75184 } }, { "ph": "s", "id": 75184, "pid": 76337, "tid": -914061504, "ts": 1716454222517923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222569732, "dur": 5, "args": { "External id": 75186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75186, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 75186, "pid": 5, "tid": 7, "ts": 1716454222569732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222517936, "dur": 6, "args": { "External id": 75186, "cbid": 211, "correlation": 75186 } }, { "ph": "s", "id": 75186, "pid": 76337, "tid": -914061504, "ts": 1716454222517936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518003, "dur": 1, "args": { "External id": 75197, "cbid": 251, "correlation": 75197 } }, { "ph": "f", "id": 75197, "pid": 76337, "tid": -914061504, "ts": 1716454222518003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518007, "dur": 0, "args": { "External id": 75198, "cbid": 251, "correlation": 75198 } }, { "ph": "f", "id": 75198, "pid": 76337, "tid": -914061504, "ts": 1716454222518007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222569738, "dur": 7, "args": { "External id": 75199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75199, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 75199, "pid": 5, "tid": 7, "ts": 1716454222569738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518008, "dur": 12, "args": { "External id": 75199, "cbid": 211, "correlation": 75199 } }, { "ph": "s", "id": 75199, "pid": 76337, "tid": -914061504, "ts": 1716454222518008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222569747, "dur": 3, "args": { "External id": 75201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75201, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 75201, "pid": 5, "tid": 7, "ts": 1716454222569747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518022, "dur": 5, "args": { "External id": 75201, "cbid": 211, "correlation": 75201 } }, { "ph": "s", "id": 75201, "pid": 76337, "tid": -914061504, "ts": 1716454222518022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222569751, "dur": 94, "args": { "External id": 75222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75222, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 75222, "pid": 5, "tid": 7, "ts": 1716454222569751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518096, "dur": 12, "args": { "External id": 75222, "cbid": 211, "correlation": 75222 } }, { "ph": "s", "id": 75222, "pid": 76337, "tid": -914061504, "ts": 1716454222518096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518194, "dur": 1, "args": { "External id": 75240, "cbid": 251, "correlation": 75240 } }, { "ph": "f", "id": 75240, "pid": 76337, "tid": -914061504, "ts": 1716454222518194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222569846, "dur": 100, "args": { "External id": 75242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75242, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75242, "pid": 5, "tid": 7, "ts": 1716454222569846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518200, "dur": 13, "args": { "External id": 75242, "cbid": 211, "correlation": 75242 } }, { "ph": "s", "id": 75242, "pid": 76337, "tid": -914061504, "ts": 1716454222518200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222569948, "dur": 19, "args": { "External id": 75250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75250, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75250, "pid": 5, "tid": 7, "ts": 1716454222569948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518268, "dur": 12, "args": { "External id": 75250, "cbid": 211, "correlation": 75250 } }, { "ph": "s", "id": 75250, "pid": 76337, "tid": -914061504, "ts": 1716454222518268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222569969, "dur": 37, "args": { "External id": 75258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75258, "pid": 5, "tid": 7, "ts": 1716454222569969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518310, "dur": 9, "args": { "External id": 75258, "cbid": 211, "correlation": 75258 } }, { "ph": "s", "id": 75258, "pid": 76337, "tid": -914061504, "ts": 1716454222518310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222570007, "dur": 36, "args": { "External id": 75280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75280, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75280, "pid": 5, "tid": 7, "ts": 1716454222570007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518361, "dur": 10, "args": { "External id": 75280, "cbid": 211, "correlation": 75280 } }, { "ph": "s", "id": 75280, "pid": 76337, "tid": -914061504, "ts": 1716454222518361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518450, "dur": 1, "args": { "External id": 75296, "cbid": 251, "correlation": 75296 } }, { "ph": "f", "id": 75296, "pid": 76337, "tid": -914061504, "ts": 1716454222518450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518455, "dur": 0, "args": { "External id": 75298, "cbid": 251, "correlation": 75298 } }, { "ph": "f", "id": 75298, "pid": 76337, "tid": -914061504, "ts": 1716454222518455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222570044, "dur": 549, "args": { "External id": 75299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75299, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75299, "pid": 5, "tid": 7, "ts": 1716454222570044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518458, "dur": 13, "args": { "External id": 75299, "cbid": 211, "correlation": 75299 } }, { "ph": "s", "id": 75299, "pid": 76337, "tid": -914061504, "ts": 1716454222518458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222570594, "dur": 127, "args": { "External id": 75307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75307, "pid": 5, "tid": 7, "ts": 1716454222570594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518524, "dur": 13, "args": { "External id": 75307, "cbid": 211, "correlation": 75307 } }, { "ph": "s", "id": 75307, "pid": 76337, "tid": -914061504, "ts": 1716454222518524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222570723, "dur": 130, "args": { "External id": 75315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75315, "pid": 5, "tid": 7, "ts": 1716454222570723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518555, "dur": 8, "args": { "External id": 75315, "cbid": 211, "correlation": 75315 } }, { "ph": "s", "id": 75315, "pid": 76337, "tid": -914061504, "ts": 1716454222518555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222518631, "dur": 1, "args": { "External id": 75331, "cbid": 251, "correlation": 75331 } }, { "ph": "f", "id": 75331, "pid": 76337, "tid": -914061504, "ts": 1716454222518631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222570854, "dur": 308, "args": { "External id": 75333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75333, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75333, "pid": 5, "tid": 7, "ts": 1716454222570854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518636, "dur": 12, "args": { "External id": 75333, "cbid": 211, "correlation": 75333 } }, { "ph": "s", "id": 75333, "pid": 76337, "tid": -914061504, "ts": 1716454222518636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222571163, "dur": 27, "args": { "External id": 75341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75341, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75341, "pid": 5, "tid": 7, "ts": 1716454222571163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518678, "dur": 10, "args": { "External id": 75341, "cbid": 211, "correlation": 75341 } }, { "ph": "s", "id": 75341, "pid": 76337, "tid": -914061504, "ts": 1716454222518678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222571191, "dur": 83, "args": { "External id": 75352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75352, "pid": 5, "tid": 7, "ts": 1716454222571191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518745, "dur": 12, "args": { "External id": 75352, "cbid": 211, "correlation": 75352 } }, { "ph": "s", "id": 75352, "pid": 76337, "tid": -914061504, "ts": 1716454222518745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222518809, "dur": 0, "args": { "External id": 75364, "cbid": 317, "correlation": 75364 } }, { "ph": "f", "id": 75364, "pid": 76337, "tid": -914061504, "ts": 1716454222518809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222518810, "dur": 0, "args": { "External id": 75365, "cbid": 203, "correlation": 75365 } }, { "ph": "f", "id": 75365, "pid": 76337, "tid": -914061504, "ts": 1716454222518810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222518811, "dur": 0, "args": { "External id": 75366, "cbid": 205, "correlation": 75366 } }, { "ph": "f", "id": 75366, "pid": 76337, "tid": -914061504, "ts": 1716454222518811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571276, "dur": 23, "args": { "External id": 75370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75370, "pid": 5, "tid": 7, "ts": 1716454222571276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518826, "dur": 12, "args": { "External id": 75370, "cbid": 211, "correlation": 75370 } }, { "ph": "s", "id": 75370, "pid": 76337, "tid": -914061504, "ts": 1716454222518826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222571300, "dur": 123, "args": { "External id": 75372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75372, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75372, "pid": 5, "tid": 7, "ts": 1716454222571300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518846, "dur": 6, "args": { "External id": 75372, "cbid": 211, "correlation": 75372 } }, { "ph": "s", "id": 75372, "pid": 76337, "tid": -914061504, "ts": 1716454222518846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571425, "dur": 21, "args": { "External id": 75374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75374, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75374, "pid": 5, "tid": 7, "ts": 1716454222571425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518856, "dur": 5, "args": { "External id": 75374, "cbid": 211, "correlation": 75374 } }, { "ph": "s", "id": 75374, "pid": 76337, "tid": -914061504, "ts": 1716454222518856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222571447, "dur": 33, "args": { "External id": 75380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75380, "pid": 5, "tid": 7, "ts": 1716454222571447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518884, "dur": 8, "args": { "External id": 75380, "cbid": 211, "correlation": 75380 } }, { "ph": "s", "id": 75380, "pid": 76337, "tid": -914061504, "ts": 1716454222518884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222571482, "dur": 27, "args": { "External id": 75388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75388, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75388, "pid": 5, "tid": 7, "ts": 1716454222571482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222518916, "dur": 8, "args": { "External id": 75388, "cbid": 211, "correlation": 75388 } }, { "ph": "s", "id": 75388, "pid": 76337, "tid": -914061504, "ts": 1716454222518916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222518996, "dur": 0, "args": { "External id": 75398, "cbid": 317, "correlation": 75398 } }, { "ph": "f", "id": 75398, "pid": 76337, "tid": -914061504, "ts": 1716454222518996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222518997, "dur": 0, "args": { "External id": 75399, "cbid": 203, "correlation": 75399 } }, { "ph": "f", "id": 75399, "pid": 76337, "tid": -914061504, "ts": 1716454222518997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222518997, "dur": 0, "args": { "External id": 75400, "cbid": 205, "correlation": 75400 } }, { "ph": "f", "id": 75400, "pid": 76337, "tid": -914061504, "ts": 1716454222518997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571510, "dur": 22, "args": { "External id": 75404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75404, "pid": 5, "tid": 7, "ts": 1716454222571510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519011, "dur": 12, "args": { "External id": 75404, "cbid": 211, "correlation": 75404 } }, { "ph": "s", "id": 75404, "pid": 76337, "tid": -914061504, "ts": 1716454222519011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571533, "dur": 46, "args": { "External id": 75406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75406, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75406, "pid": 5, "tid": 7, "ts": 1716454222571533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519026, "dur": 5, "args": { "External id": 75406, "cbid": 211, "correlation": 75406 } }, { "ph": "s", "id": 75406, "pid": 76337, "tid": -914061504, "ts": 1716454222519026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222571580, "dur": 239, "args": { "External id": 75408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75408, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 75408, "pid": 5, "tid": 7, "ts": 1716454222571580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519038, "dur": 7, "args": { "External id": 75408, "cbid": 211, "correlation": 75408 } }, { "ph": "s", "id": 75408, "pid": 76337, "tid": -914061504, "ts": 1716454222519038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571820, "dur": 6, "args": { "External id": 75410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75410, "pid": 5, "tid": 7, "ts": 1716454222571820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519048, "dur": 5, "args": { "External id": 75410, "cbid": 211, "correlation": 75410 } }, { "ph": "s", "id": 75410, "pid": 76337, "tid": -914061504, "ts": 1716454222519048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222571828, "dur": 9, "args": { "External id": 75416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75416, "pid": 5, "tid": 7, "ts": 1716454222571828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519075, "dur": 8, "args": { "External id": 75416, "cbid": 211, "correlation": 75416 } }, { "ph": "s", "id": 75416, "pid": 76337, "tid": -914061504, "ts": 1716454222519075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222571838, "dur": 12, "args": { "External id": 75436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75436, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 75436, "pid": 5, "tid": 7, "ts": 1716454222571838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519169, "dur": 13, "args": { "External id": 75436, "cbid": 211, "correlation": 75436 } }, { "ph": "s", "id": 75436, "pid": 76337, "tid": -914061504, "ts": 1716454222519169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222571852, "dur": 4, "args": { "External id": 75448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75448, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 75448, "pid": 5, "tid": 7, "ts": 1716454222571852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519192, "dur": 6, "args": { "External id": 75448, "cbid": 211, "correlation": 75448 } }, { "ph": "s", "id": 75448, "pid": 76337, "tid": -914061504, "ts": 1716454222519192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222571858, "dur": 12, "args": { "External id": 75451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75451, "pid": 5, "tid": 7, "ts": 1716454222571858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519210, "dur": 7, "args": { "External id": 75451, "cbid": 211, "correlation": 75451 } }, { "ph": "s", "id": 75451, "pid": 76337, "tid": -914061504, "ts": 1716454222519210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222571872, "dur": 7, "args": { "External id": 75460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75460, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75460, "pid": 5, "tid": 7, "ts": 1716454222571872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519250, "dur": 9, "args": { "External id": 75460, "cbid": 211, "correlation": 75460 } }, { "ph": "s", "id": 75460, "pid": 76337, "tid": -914061504, "ts": 1716454222519250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222519302, "dur": 0, "args": { "External id": 75470, "cbid": 317, "correlation": 75470 } }, { "ph": "f", "id": 75470, "pid": 76337, "tid": -914061504, "ts": 1716454222519302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222519303, "dur": 0, "args": { "External id": 75471, "cbid": 203, "correlation": 75471 } }, { "ph": "f", "id": 75471, "pid": 76337, "tid": -914061504, "ts": 1716454222519303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222519304, "dur": 0, "args": { "External id": 75472, "cbid": 205, "correlation": 75472 } }, { "ph": "f", "id": 75472, "pid": 76337, "tid": -914061504, "ts": 1716454222519304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571880, "dur": 6, "args": { "External id": 75476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75476, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75476, "pid": 5, "tid": 7, "ts": 1716454222571880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519320, "dur": 11, "args": { "External id": 75476, "cbid": 211, "correlation": 75476 } }, { "ph": "s", "id": 75476, "pid": 76337, "tid": -914061504, "ts": 1716454222519320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222571887, "dur": 85, "args": { "External id": 75478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75478, "pid": 5, "tid": 7, "ts": 1716454222571887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519334, "dur": 5, "args": { "External id": 75478, "cbid": 211, "correlation": 75478 } }, { "ph": "s", "id": 75478, "pid": 76337, "tid": -914061504, "ts": 1716454222519334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222571974, "dur": 1, "args": { "External id": 75480, "device": 5, "context": 1, "stream": 7, "correlation": 75480, "bytes": 960, "memory bandwidth (GB/s)": 0.5454545454545454 } }, { "ph": "f", "id": 75480, "pid": 5, "tid": 7, "ts": 1716454222571974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222519347, "dur": 9, "args": { "External id": 75480, "cbid": 51, "correlation": 75480 } }, { "ph": "s", "id": 75480, "pid": 76337, "tid": -914061504, "ts": 1716454222519347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222571978, "dur": 544, "args": { "External id": 75481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75481, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75481, "pid": 5, "tid": 7, "ts": 1716454222571978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519357, "dur": 8, "args": { "External id": 75481, "cbid": 211, "correlation": 75481 } }, { "ph": "s", "id": 75481, "pid": 76337, "tid": -914061504, "ts": 1716454222519357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222572523, "dur": 12, "args": { "External id": 75483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75483, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75483, "pid": 5, "tid": 7, "ts": 1716454222572523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519368, "dur": 5, "args": { "External id": 75483, "cbid": 211, "correlation": 75483 } }, { "ph": "s", "id": 75483, "pid": 76337, "tid": -914061504, "ts": 1716454222519368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222572536, "dur": 15, "args": { "External id": 75489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75489, "pid": 5, "tid": 7, "ts": 1716454222572536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519396, "dur": 9, "args": { "External id": 75489, "cbid": 211, "correlation": 75489 } }, { "ph": "s", "id": 75489, "pid": 76337, "tid": -914061504, "ts": 1716454222519396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222572552, "dur": 3, "args": { "External id": 75497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75497, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 75497, "pid": 5, "tid": 7, "ts": 1716454222572552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519440, "dur": 10, "args": { "External id": 75497, "cbid": 211, "correlation": 75497 } }, { "ph": "s", "id": 75497, "pid": 76337, "tid": -914061504, "ts": 1716454222519440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222519504, "dur": 1, "args": { "External id": 75513, "cbid": 251, "correlation": 75513 } }, { "ph": "f", "id": 75513, "pid": 76337, "tid": -914061504, "ts": 1716454222519504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222519510, "dur": 0, "args": { "External id": 75515, "cbid": 251, "correlation": 75515 } }, { "ph": "f", "id": 75515, "pid": 76337, "tid": -914061504, "ts": 1716454222519510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222572557, "dur": 13, "args": { "External id": 75516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75516, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75516, "pid": 5, "tid": 7, "ts": 1716454222572557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519512, "dur": 11, "args": { "External id": 75516, "cbid": 211, "correlation": 75516 } }, { "ph": "s", "id": 75516, "pid": 76337, "tid": -914061504, "ts": 1716454222519512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222572572, "dur": 5, "args": { "External id": 75518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75518, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75518, "pid": 5, "tid": 7, "ts": 1716454222572572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519524, "dur": 5, "args": { "External id": 75518, "cbid": 211, "correlation": 75518 } }, { "ph": "s", "id": 75518, "pid": 76337, "tid": -914061504, "ts": 1716454222519524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222572578, "dur": 17, "args": { "External id": 75528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75528, "pid": 5, "tid": 7, "ts": 1716454222572578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519581, "dur": 12, "args": { "External id": 75528, "cbid": 211, "correlation": 75528 } }, { "ph": "s", "id": 75528, "pid": 76337, "tid": -914061504, "ts": 1716454222519581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222572596, "dur": 19, "args": { "External id": 75548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75548, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 75548, "pid": 5, "tid": 7, "ts": 1716454222572596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519647, "dur": 11, "args": { "External id": 75548, "cbid": 211, "correlation": 75548 } }, { "ph": "s", "id": 75548, "pid": 76337, "tid": -914061504, "ts": 1716454222519647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222572617, "dur": 5, "args": { "External id": 75560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75560, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 75560, "pid": 5, "tid": 7, "ts": 1716454222572617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519669, "dur": 6, "args": { "External id": 75560, "cbid": 211, "correlation": 75560 } }, { "ph": "s", "id": 75560, "pid": 76337, "tid": -914061504, "ts": 1716454222519669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222572623, "dur": 17, "args": { "External id": 75563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75563, "pid": 5, "tid": 7, "ts": 1716454222572623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519687, "dur": 6, "args": { "External id": 75563, "cbid": 211, "correlation": 75563 } }, { "ph": "s", "id": 75563, "pid": 76337, "tid": -914061504, "ts": 1716454222519687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222572641, "dur": 11, "args": { "External id": 75572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75572, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75572, "pid": 5, "tid": 7, "ts": 1716454222572641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519727, "dur": 10, "args": { "External id": 75572, "cbid": 211, "correlation": 75572 } }, { "ph": "s", "id": 75572, "pid": 76337, "tid": -914061504, "ts": 1716454222519727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222519790, "dur": 0, "args": { "External id": 75582, "cbid": 317, "correlation": 75582 } }, { "ph": "f", "id": 75582, "pid": 76337, "tid": -914061504, "ts": 1716454222519790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222519791, "dur": 0, "args": { "External id": 75583, "cbid": 203, "correlation": 75583 } }, { "ph": "f", "id": 75583, "pid": 76337, "tid": -914061504, "ts": 1716454222519791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222519792, "dur": 0, "args": { "External id": 75584, "cbid": 205, "correlation": 75584 } }, { "ph": "f", "id": 75584, "pid": 76337, "tid": -914061504, "ts": 1716454222519792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222572654, "dur": 11, "args": { "External id": 75588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75588, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75588, "pid": 5, "tid": 7, "ts": 1716454222572654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519810, "dur": 12, "args": { "External id": 75588, "cbid": 211, "correlation": 75588 } }, { "ph": "s", "id": 75588, "pid": 76337, "tid": -914061504, "ts": 1716454222519810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222572666, "dur": 165, "args": { "External id": 75590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75590, "pid": 5, "tid": 7, "ts": 1716454222572666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519824, "dur": 5, "args": { "External id": 75590, "cbid": 211, "correlation": 75590 } }, { "ph": "s", "id": 75590, "pid": 76337, "tid": -914061504, "ts": 1716454222519824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222572833, "dur": 1, "args": { "External id": 75592, "device": 5, "context": 1, "stream": 7, "correlation": 75592, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 75592, "pid": 5, "tid": 7, "ts": 1716454222572833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222519836, "dur": 7, "args": { "External id": 75592, "cbid": 51, "correlation": 75592 } }, { "ph": "s", "id": 75592, "pid": 76337, "tid": -914061504, "ts": 1716454222519836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222572837, "dur": 672, "args": { "External id": 75593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75593, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75593, "pid": 5, "tid": 7, "ts": 1716454222572837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519844, "dur": 7, "args": { "External id": 75593, "cbid": 211, "correlation": 75593 } }, { "ph": "s", "id": 75593, "pid": 76337, "tid": -914061504, "ts": 1716454222519844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222573511, "dur": 13, "args": { "External id": 75595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75595, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75595, "pid": 5, "tid": 7, "ts": 1716454222573511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519855, "dur": 6, "args": { "External id": 75595, "cbid": 211, "correlation": 75595 } }, { "ph": "s", "id": 75595, "pid": 76337, "tid": -914061504, "ts": 1716454222519855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222573526, "dur": 15, "args": { "External id": 75601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75601, "pid": 5, "tid": 7, "ts": 1716454222573526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519883, "dur": 8, "args": { "External id": 75601, "cbid": 211, "correlation": 75601 } }, { "ph": "s", "id": 75601, "pid": 76337, "tid": -914061504, "ts": 1716454222519883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222519941, "dur": 0, "args": { "External id": 75611, "cbid": 317, "correlation": 75611 } }, { "ph": "f", "id": 75611, "pid": 76337, "tid": -914061504, "ts": 1716454222519941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222519942, "dur": 0, "args": { "External id": 75612, "cbid": 203, "correlation": 75612 } }, { "ph": "f", "id": 75612, "pid": 76337, "tid": -914061504, "ts": 1716454222519942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222519942, "dur": 0, "args": { "External id": 75613, "cbid": 205, "correlation": 75613 } }, { "ph": "f", "id": 75613, "pid": 76337, "tid": -914061504, "ts": 1716454222519942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222573542, "dur": 8, "args": { "External id": 75617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75617, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75617, "pid": 5, "tid": 7, "ts": 1716454222573542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519957, "dur": 12, "args": { "External id": 75617, "cbid": 211, "correlation": 75617 } }, { "ph": "s", "id": 75617, "pid": 76337, "tid": -914061504, "ts": 1716454222519957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222573551, "dur": 3, "args": { "External id": 75619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75619, "pid": 5, "tid": 7, "ts": 1716454222573551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519983, "dur": 6, "args": { "External id": 75619, "cbid": 211, "correlation": 75619 } }, { "ph": "s", "id": 75619, "pid": 76337, "tid": -914061504, "ts": 1716454222519983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222519993, "dur": 0, "args": { "External id": 75620, "cbid": 51, "correlation": 75620 } }, { "ph": "s", "id": 75620, "pid": 76337, "tid": -914061504, "ts": 1716454222519993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222573556, "dur": 58, "args": { "External id": 75621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75621, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 75621, "pid": 5, "tid": 7, "ts": 1716454222573556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222519994, "dur": 6, "args": { "External id": 75621, "cbid": 211, "correlation": 75621 } }, { "ph": "s", "id": 75621, "pid": 76337, "tid": -914061504, "ts": 1716454222519994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222573616, "dur": 15, "args": { "External id": 75626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75626, "pid": 5, "tid": 7, "ts": 1716454222573616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520021, "dur": 8, "args": { "External id": 75626, "cbid": 211, "correlation": 75626 } }, { "ph": "s", "id": 75626, "pid": 76337, "tid": -914061504, "ts": 1716454222520021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222573632, "dur": 11, "args": { "External id": 75634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75634, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75634, "pid": 5, "tid": 7, "ts": 1716454222573632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520049, "dur": 8, "args": { "External id": 75634, "cbid": 211, "correlation": 75634 } }, { "ph": "s", "id": 75634, "pid": 76337, "tid": -914061504, "ts": 1716454222520049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222573644, "dur": 11, "args": { "External id": 75642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75642, "pid": 5, "tid": 7, "ts": 1716454222573644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520078, "dur": 8, "args": { "External id": 75642, "cbid": 211, "correlation": 75642 } }, { "ph": "s", "id": 75642, "pid": 76337, "tid": -914061504, "ts": 1716454222520078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222573657, "dur": 19, "args": { "External id": 75662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75662, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 75662, "pid": 5, "tid": 7, "ts": 1716454222573657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520158, "dur": 12, "args": { "External id": 75662, "cbid": 211, "correlation": 75662 } }, { "ph": "s", "id": 75662, "pid": 76337, "tid": -914061504, "ts": 1716454222520158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222573677, "dur": 4, "args": { "External id": 75674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75674, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 75674, "pid": 5, "tid": 7, "ts": 1716454222573677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520180, "dur": 6, "args": { "External id": 75674, "cbid": 211, "correlation": 75674 } }, { "ph": "s", "id": 75674, "pid": 76337, "tid": -914061504, "ts": 1716454222520180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222573683, "dur": 17, "args": { "External id": 75677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75677, "pid": 5, "tid": 7, "ts": 1716454222573683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520197, "dur": 6, "args": { "External id": 75677, "cbid": 211, "correlation": 75677 } }, { "ph": "s", "id": 75677, "pid": 76337, "tid": -914061504, "ts": 1716454222520197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222520254, "dur": 0, "args": { "External id": 75688, "cbid": 317, "correlation": 75688 } }, { "ph": "f", "id": 75688, "pid": 76337, "tid": -914061504, "ts": 1716454222520254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222520255, "dur": 0, "args": { "External id": 75689, "cbid": 203, "correlation": 75689 } }, { "ph": "f", "id": 75689, "pid": 76337, "tid": -914061504, "ts": 1716454222520255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222520256, "dur": 0, "args": { "External id": 75690, "cbid": 205, "correlation": 75690 } }, { "ph": "f", "id": 75690, "pid": 76337, "tid": -914061504, "ts": 1716454222520256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222573701, "dur": 12, "args": { "External id": 75694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75694, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75694, "pid": 5, "tid": 7, "ts": 1716454222573701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520273, "dur": 11, "args": { "External id": 75694, "cbid": 211, "correlation": 75694 } }, { "ph": "s", "id": 75694, "pid": 76337, "tid": -914061504, "ts": 1716454222520273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222573714, "dur": 3, "args": { "External id": 75696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75696, "pid": 5, "tid": 7, "ts": 1716454222573714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520289, "dur": 5, "args": { "External id": 75696, "cbid": 211, "correlation": 75696 } }, { "ph": "s", "id": 75696, "pid": 76337, "tid": -914061504, "ts": 1716454222520289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222520297, "dur": 0, "args": { "External id": 75697, "cbid": 51, "correlation": 75697 } }, { "ph": "s", "id": 75697, "pid": 76337, "tid": -914061504, "ts": 1716454222520297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222573719, "dur": 99, "args": { "External id": 75698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75698, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 75698, "pid": 5, "tid": 7, "ts": 1716454222573719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520298, "dur": 7, "args": { "External id": 75698, "cbid": 211, "correlation": 75698 } }, { "ph": "s", "id": 75698, "pid": 76337, "tid": -914061504, "ts": 1716454222520298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222573819, "dur": 16, "args": { "External id": 75703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75703, "pid": 5, "tid": 7, "ts": 1716454222573819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520328, "dur": 8, "args": { "External id": 75703, "cbid": 211, "correlation": 75703 } }, { "ph": "s", "id": 75703, "pid": 76337, "tid": -914061504, "ts": 1716454222520328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222573836, "dur": 86, "args": { "External id": 75712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75712, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75712, "pid": 5, "tid": 7, "ts": 1716454222573836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520410, "dur": 14, "args": { "External id": 75712, "cbid": 211, "correlation": 75712 } }, { "ph": "s", "id": 75712, "pid": 76337, "tid": -914061504, "ts": 1716454222520410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222573923, "dur": 31, "args": { "External id": 75734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75734, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75734, "pid": 5, "tid": 7, "ts": 1716454222573923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520466, "dur": 10, "args": { "External id": 75734, "cbid": 211, "correlation": 75734 } }, { "ph": "s", "id": 75734, "pid": 76337, "tid": -914061504, "ts": 1716454222520466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222520555, "dur": 1, "args": { "External id": 75745, "cbid": 251, "correlation": 75745 } }, { "ph": "f", "id": 75745, "pid": 76337, "tid": -914061504, "ts": 1716454222520555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222573956, "dur": 166, "args": { "External id": 75746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75746, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75746, "pid": 5, "tid": 7, "ts": 1716454222573956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520560, "dur": 13, "args": { "External id": 75746, "cbid": 211, "correlation": 75746 } }, { "ph": "s", "id": 75746, "pid": 76337, "tid": -914061504, "ts": 1716454222520560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222520630, "dur": 1, "args": { "External id": 75757, "cbid": 251, "correlation": 75757 } }, { "ph": "f", "id": 75757, "pid": 76337, "tid": -914061504, "ts": 1716454222520630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222574123, "dur": 161, "args": { "External id": 75758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75758, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75758, "pid": 5, "tid": 7, "ts": 1716454222574123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520634, "dur": 11, "args": { "External id": 75758, "cbid": 211, "correlation": 75758 } }, { "ph": "s", "id": 75758, "pid": 76337, "tid": -914061504, "ts": 1716454222520634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222520699, "dur": 1, "args": { "External id": 75769, "cbid": 251, "correlation": 75769 } }, { "ph": "f", "id": 75769, "pid": 76337, "tid": -914061504, "ts": 1716454222520699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222574286, "dur": 160, "args": { "External id": 75770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75770, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75770, "pid": 5, "tid": 7, "ts": 1716454222574286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520703, "dur": 11, "args": { "External id": 75770, "cbid": 211, "correlation": 75770 } }, { "ph": "s", "id": 75770, "pid": 76337, "tid": -914061504, "ts": 1716454222520703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222574448, "dur": 342, "args": { "External id": 75795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75795, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75795, "pid": 5, "tid": 7, "ts": 1716454222574448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520790, "dur": 12, "args": { "External id": 75795, "cbid": 211, "correlation": 75795 } }, { "ph": "s", "id": 75795, "pid": 76337, "tid": -914061504, "ts": 1716454222520790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222520891, "dur": 1, "args": { "External id": 75813, "cbid": 251, "correlation": 75813 } }, { "ph": "f", "id": 75813, "pid": 76337, "tid": -914061504, "ts": 1716454222520891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222574792, "dur": 168, "args": { "External id": 75815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75815, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75815, "pid": 5, "tid": 7, "ts": 1716454222574792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520897, "dur": 14, "args": { "External id": 75815, "cbid": 211, "correlation": 75815 } }, { "ph": "s", "id": 75815, "pid": 76337, "tid": -914061504, "ts": 1716454222520897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222574961, "dur": 19, "args": { "External id": 75823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75823, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75823, "pid": 5, "tid": 7, "ts": 1716454222574961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222520967, "dur": 19, "args": { "External id": 75823, "cbid": 211, "correlation": 75823 } }, { "ph": "s", "id": 75823, "pid": 76337, "tid": -914061504, "ts": 1716454222520967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222574981, "dur": 28, "args": { "External id": 75831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75831, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75831, "pid": 5, "tid": 7, "ts": 1716454222574981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521016, "dur": 9, "args": { "External id": 75831, "cbid": 211, "correlation": 75831 } }, { "ph": "s", "id": 75831, "pid": 76337, "tid": -914061504, "ts": 1716454222521016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222575010, "dur": 18, "args": { "External id": 75842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75842, "pid": 5, "tid": 7, "ts": 1716454222575010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521088, "dur": 12, "args": { "External id": 75842, "cbid": 211, "correlation": 75842 } }, { "ph": "s", "id": 75842, "pid": 76337, "tid": -914061504, "ts": 1716454222521088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222575029, "dur": 17, "args": { "External id": 75864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75864, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75864, "pid": 5, "tid": 7, "ts": 1716454222575029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521120, "dur": 7, "args": { "External id": 75864, "cbid": 211, "correlation": 75864 } }, { "ph": "s", "id": 75864, "pid": 76337, "tid": -914061504, "ts": 1716454222521120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521205, "dur": 2, "args": { "External id": 75875, "cbid": 251, "correlation": 75875 } }, { "ph": "f", "id": 75875, "pid": 76337, "tid": -914061504, "ts": 1716454222521205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222575047, "dur": 91, "args": { "External id": 75876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75876, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75876, "pid": 5, "tid": 7, "ts": 1716454222575047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521212, "dur": 14, "args": { "External id": 75876, "cbid": 211, "correlation": 75876 } }, { "ph": "s", "id": 75876, "pid": 76337, "tid": -914061504, "ts": 1716454222521212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521282, "dur": 1, "args": { "External id": 75887, "cbid": 251, "correlation": 75887 } }, { "ph": "f", "id": 75887, "pid": 76337, "tid": -914061504, "ts": 1716454222521282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521286, "dur": 0, "args": { "External id": 75888, "cbid": 251, "correlation": 75888 } }, { "ph": "f", "id": 75888, "pid": 76337, "tid": -914061504, "ts": 1716454222521286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222575140, "dur": 13, "args": { "External id": 75889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75889, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75889, "pid": 5, "tid": 7, "ts": 1716454222575140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521288, "dur": 12, "args": { "External id": 75889, "cbid": 211, "correlation": 75889 } }, { "ph": "s", "id": 75889, "pid": 76337, "tid": -914061504, "ts": 1716454222521288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222575154, "dur": 6, "args": { "External id": 75891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75891, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75891, "pid": 5, "tid": 7, "ts": 1716454222575154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521301, "dur": 5, "args": { "External id": 75891, "cbid": 211, "correlation": 75891 } }, { "ph": "s", "id": 75891, "pid": 76337, "tid": -914061504, "ts": 1716454222521301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521358, "dur": 1, "args": { "External id": 75902, "cbid": 251, "correlation": 75902 } }, { "ph": "f", "id": 75902, "pid": 76337, "tid": -914061504, "ts": 1716454222521358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521361, "dur": 0, "args": { "External id": 75903, "cbid": 251, "correlation": 75903 } }, { "ph": "f", "id": 75903, "pid": 76337, "tid": -914061504, "ts": 1716454222521361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222575161, "dur": 8, "args": { "External id": 75904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75904, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75904, "pid": 5, "tid": 7, "ts": 1716454222575161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521363, "dur": 12, "args": { "External id": 75904, "cbid": 211, "correlation": 75904 } }, { "ph": "s", "id": 75904, "pid": 76337, "tid": -914061504, "ts": 1716454222521363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222575171, "dur": 3, "args": { "External id": 75906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75906, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75906, "pid": 5, "tid": 7, "ts": 1716454222575171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521376, "dur": 5, "args": { "External id": 75906, "cbid": 211, "correlation": 75906 } }, { "ph": "s", "id": 75906, "pid": 76337, "tid": -914061504, "ts": 1716454222521376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222575175, "dur": 56, "args": { "External id": 75931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75931, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 75931, "pid": 5, "tid": 7, "ts": 1716454222575175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521452, "dur": 12, "args": { "External id": 75931, "cbid": 211, "correlation": 75931 } }, { "ph": "s", "id": 75931, "pid": 76337, "tid": -914061504, "ts": 1716454222521452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521551, "dur": 2, "args": { "External id": 75949, "cbid": 251, "correlation": 75949 } }, { "ph": "f", "id": 75949, "pid": 76337, "tid": -914061504, "ts": 1716454222521551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222575233, "dur": 93, "args": { "External id": 75951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75951, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 75951, "pid": 5, "tid": 7, "ts": 1716454222575233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521558, "dur": 14, "args": { "External id": 75951, "cbid": 211, "correlation": 75951 } }, { "ph": "s", "id": 75951, "pid": 76337, "tid": -914061504, "ts": 1716454222521558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222575327, "dur": 10, "args": { "External id": 75959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75959, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75959, "pid": 5, "tid": 7, "ts": 1716454222575327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521627, "dur": 12, "args": { "External id": 75959, "cbid": 211, "correlation": 75959 } }, { "ph": "s", "id": 75959, "pid": 76337, "tid": -914061504, "ts": 1716454222521627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222575338, "dur": 21, "args": { "External id": 75967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75967, "pid": 5, "tid": 7, "ts": 1716454222575338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521669, "dur": 10, "args": { "External id": 75967, "cbid": 211, "correlation": 75967 } }, { "ph": "s", "id": 75967, "pid": 76337, "tid": -914061504, "ts": 1716454222521669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222575360, "dur": 18, "args": { "External id": 75989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 75989, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 75989, "pid": 5, "tid": 7, "ts": 1716454222575360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521720, "dur": 10, "args": { "External id": 75989, "cbid": 211, "correlation": 75989 } }, { "ph": "s", "id": 75989, "pid": 76337, "tid": -914061504, "ts": 1716454222521720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521809, "dur": 2, "args": { "External id": 76005, "cbid": 251, "correlation": 76005 } }, { "ph": "f", "id": 76005, "pid": 76337, "tid": -914061504, "ts": 1716454222521809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222521815, "dur": 0, "args": { "External id": 76007, "cbid": 251, "correlation": 76007 } }, { "ph": "f", "id": 76007, "pid": 76337, "tid": -914061504, "ts": 1716454222521815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222575380, "dur": 496, "args": { "External id": 76008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76008, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76008, "pid": 5, "tid": 7, "ts": 1716454222575380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521818, "dur": 14, "args": { "External id": 76008, "cbid": 211, "correlation": 76008 } }, { "ph": "s", "id": 76008, "pid": 76337, "tid": -914061504, "ts": 1716454222521818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222575877, "dur": 68, "args": { "External id": 76016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76016, "pid": 5, "tid": 7, "ts": 1716454222575877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521884, "dur": 13, "args": { "External id": 76016, "cbid": 211, "correlation": 76016 } }, { "ph": "s", "id": 76016, "pid": 76337, "tid": -914061504, "ts": 1716454222521884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222575947, "dur": 69, "args": { "External id": 76024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76024, "pid": 5, "tid": 7, "ts": 1716454222575947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222521916, "dur": 8, "args": { "External id": 76024, "cbid": 211, "correlation": 76024 } }, { "ph": "s", "id": 76024, "pid": 76337, "tid": -914061504, "ts": 1716454222521916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222522004, "dur": 1, "args": { "External id": 76040, "cbid": 251, "correlation": 76040 } }, { "ph": "f", "id": 76040, "pid": 76337, "tid": -914061504, "ts": 1716454222522004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222576018, "dur": 1, "args": { "External id": 76042, "device": 5, "context": 1, "stream": 7, "correlation": 76042, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 76042, "pid": 5, "tid": 7, "ts": 1716454222576018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222522010, "dur": 12, "args": { "External id": 76042, "cbid": 51, "correlation": 76042 } }, { "ph": "s", "id": 76042, "pid": 76337, "tid": -914061504, "ts": 1716454222522010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222576021, "dur": 272, "args": { "External id": 76043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76043, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76043, "pid": 5, "tid": 7, "ts": 1716454222576021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522024, "dur": 11, "args": { "External id": 76043, "cbid": 211, "correlation": 76043 } }, { "ph": "s", "id": 76043, "pid": 76337, "tid": -914061504, "ts": 1716454222522024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222576295, "dur": 14, "args": { "External id": 76051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76051, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76051, "pid": 5, "tid": 7, "ts": 1716454222576295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522066, "dur": 11, "args": { "External id": 76051, "cbid": 211, "correlation": 76051 } }, { "ph": "s", "id": 76051, "pid": 76337, "tid": -914061504, "ts": 1716454222522066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222576311, "dur": 39, "args": { "External id": 76062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76062, "pid": 5, "tid": 7, "ts": 1716454222576311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522134, "dur": 12, "args": { "External id": 76062, "cbid": 211, "correlation": 76062 } }, { "ph": "s", "id": 76062, "pid": 76337, "tid": -914061504, "ts": 1716454222522134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222522199, "dur": 0, "args": { "External id": 76074, "cbid": 317, "correlation": 76074 } }, { "ph": "f", "id": 76074, "pid": 76337, "tid": -914061504, "ts": 1716454222522199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222522199, "dur": 0, "args": { "External id": 76075, "cbid": 203, "correlation": 76075 } }, { "ph": "f", "id": 76075, "pid": 76337, "tid": -914061504, "ts": 1716454222522199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222522200, "dur": 0, "args": { "External id": 76076, "cbid": 205, "correlation": 76076 } }, { "ph": "f", "id": 76076, "pid": 76337, "tid": -914061504, "ts": 1716454222522200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222576351, "dur": 13, "args": { "External id": 76080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76080, "pid": 5, "tid": 7, "ts": 1716454222576351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522217, "dur": 12, "args": { "External id": 76080, "cbid": 211, "correlation": 76080 } }, { "ph": "s", "id": 76080, "pid": 76337, "tid": -914061504, "ts": 1716454222522217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222576365, "dur": 4, "args": { "External id": 76082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76082, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76082, "pid": 5, "tid": 7, "ts": 1716454222576365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522234, "dur": 6, "args": { "External id": 76082, "cbid": 211, "correlation": 76082 } }, { "ph": "s", "id": 76082, "pid": 76337, "tid": -914061504, "ts": 1716454222522234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222522242, "dur": 0, "args": { "External id": 76083, "cbid": 51, "correlation": 76083 } }, { "ph": "s", "id": 76083, "pid": 76337, "tid": -914061504, "ts": 1716454222522242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222576371, "dur": 99, "args": { "External id": 76084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76084, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 76084, "pid": 5, "tid": 7, "ts": 1716454222576371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522243, "dur": 5, "args": { "External id": 76084, "cbid": 211, "correlation": 76084 } }, { "ph": "s", "id": 76084, "pid": 76337, "tid": -914061504, "ts": 1716454222522243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222576471, "dur": 17, "args": { "External id": 76089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76089, "pid": 5, "tid": 7, "ts": 1716454222576471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522271, "dur": 8, "args": { "External id": 76089, "cbid": 211, "correlation": 76089 } }, { "ph": "s", "id": 76089, "pid": 76337, "tid": -914061504, "ts": 1716454222522271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222576490, "dur": 13, "args": { "External id": 76097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76097, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76097, "pid": 5, "tid": 7, "ts": 1716454222576490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522301, "dur": 8, "args": { "External id": 76097, "cbid": 211, "correlation": 76097 } }, { "ph": "s", "id": 76097, "pid": 76337, "tid": -914061504, "ts": 1716454222522301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222576504, "dur": 18, "args": { "External id": 76117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76117, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 76117, "pid": 5, "tid": 7, "ts": 1716454222576504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522374, "dur": 12, "args": { "External id": 76117, "cbid": 211, "correlation": 76117 } }, { "ph": "s", "id": 76117, "pid": 76337, "tid": -914061504, "ts": 1716454222522374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222576524, "dur": 5, "args": { "External id": 76129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76129, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 76129, "pid": 5, "tid": 7, "ts": 1716454222576524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522397, "dur": 6, "args": { "External id": 76129, "cbid": 211, "correlation": 76129 } }, { "ph": "s", "id": 76129, "pid": 76337, "tid": -914061504, "ts": 1716454222522397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222576530, "dur": 18, "args": { "External id": 76132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76132, "pid": 5, "tid": 7, "ts": 1716454222576530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522414, "dur": 7, "args": { "External id": 76132, "cbid": 211, "correlation": 76132 } }, { "ph": "s", "id": 76132, "pid": 76337, "tid": -914061504, "ts": 1716454222522414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222576549, "dur": 11, "args": { "External id": 76141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76141, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76141, "pid": 5, "tid": 7, "ts": 1716454222576549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522452, "dur": 10, "args": { "External id": 76141, "cbid": 211, "correlation": 76141 } }, { "ph": "s", "id": 76141, "pid": 76337, "tid": -914061504, "ts": 1716454222522452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222522504, "dur": 0, "args": { "External id": 76151, "cbid": 317, "correlation": 76151 } }, { "ph": "f", "id": 76151, "pid": 76337, "tid": -914061504, "ts": 1716454222522504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222522505, "dur": 0, "args": { "External id": 76152, "cbid": 203, "correlation": 76152 } }, { "ph": "f", "id": 76152, "pid": 76337, "tid": -914061504, "ts": 1716454222522505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222522506, "dur": 0, "args": { "External id": 76153, "cbid": 205, "correlation": 76153 } }, { "ph": "f", "id": 76153, "pid": 76337, "tid": -914061504, "ts": 1716454222522506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222576562, "dur": 11, "args": { "External id": 76157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76157, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76157, "pid": 5, "tid": 7, "ts": 1716454222576562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522520, "dur": 11, "args": { "External id": 76157, "cbid": 211, "correlation": 76157 } }, { "ph": "s", "id": 76157, "pid": 76337, "tid": -914061504, "ts": 1716454222522520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222576575, "dur": 165, "args": { "External id": 76159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76159, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76159, "pid": 5, "tid": 7, "ts": 1716454222576575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522533, "dur": 6, "args": { "External id": 76159, "cbid": 211, "correlation": 76159 } }, { "ph": "s", "id": 76159, "pid": 76337, "tid": -914061504, "ts": 1716454222522533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222576742, "dur": 1, "args": { "External id": 76161, "device": 5, "context": 1, "stream": 7, "correlation": 76161, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 76161, "pid": 5, "tid": 7, "ts": 1716454222576742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222522545, "dur": 6, "args": { "External id": 76161, "cbid": 51, "correlation": 76161 } }, { "ph": "s", "id": 76161, "pid": 76337, "tid": -914061504, "ts": 1716454222522545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222576746, "dur": 674, "args": { "External id": 76162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76162, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76162, "pid": 5, "tid": 7, "ts": 1716454222576746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522553, "dur": 6, "args": { "External id": 76162, "cbid": 211, "correlation": 76162 } }, { "ph": "s", "id": 76162, "pid": 76337, "tid": -914061504, "ts": 1716454222522553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222577421, "dur": 13, "args": { "External id": 76164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76164, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76164, "pid": 5, "tid": 7, "ts": 1716454222577421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522563, "dur": 5, "args": { "External id": 76164, "cbid": 211, "correlation": 76164 } }, { "ph": "s", "id": 76164, "pid": 76337, "tid": -914061504, "ts": 1716454222522563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222577435, "dur": 15, "args": { "External id": 76170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76170, "pid": 5, "tid": 7, "ts": 1716454222577435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522591, "dur": 9, "args": { "External id": 76170, "cbid": 211, "correlation": 76170 } }, { "ph": "s", "id": 76170, "pid": 76337, "tid": -914061504, "ts": 1716454222522591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222577451, "dur": 3, "args": { "External id": 76178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76178, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 76178, "pid": 5, "tid": 7, "ts": 1716454222577451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522635, "dur": 9, "args": { "External id": 76178, "cbid": 211, "correlation": 76178 } }, { "ph": "s", "id": 76178, "pid": 76337, "tid": -914061504, "ts": 1716454222522635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222522699, "dur": 1, "args": { "External id": 76194, "cbid": 251, "correlation": 76194 } }, { "ph": "f", "id": 76194, "pid": 76337, "tid": -914061504, "ts": 1716454222522699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222522704, "dur": 0, "args": { "External id": 76196, "cbid": 251, "correlation": 76196 } }, { "ph": "f", "id": 76196, "pid": 76337, "tid": -914061504, "ts": 1716454222522704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222577456, "dur": 13, "args": { "External id": 76197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76197, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76197, "pid": 5, "tid": 7, "ts": 1716454222577456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522706, "dur": 11, "args": { "External id": 76197, "cbid": 211, "correlation": 76197 } }, { "ph": "s", "id": 76197, "pid": 76337, "tid": -914061504, "ts": 1716454222522706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222577471, "dur": 5, "args": { "External id": 76199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76199, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76199, "pid": 5, "tid": 7, "ts": 1716454222577471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522719, "dur": 5, "args": { "External id": 76199, "cbid": 211, "correlation": 76199 } }, { "ph": "s", "id": 76199, "pid": 76337, "tid": -914061504, "ts": 1716454222522719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222577477, "dur": 17, "args": { "External id": 76209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76209, "pid": 5, "tid": 7, "ts": 1716454222577477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522777, "dur": 12, "args": { "External id": 76209, "cbid": 211, "correlation": 76209 } }, { "ph": "s", "id": 76209, "pid": 76337, "tid": -914061504, "ts": 1716454222522777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222577496, "dur": 18, "args": { "External id": 76229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76229, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 76229, "pid": 5, "tid": 7, "ts": 1716454222577496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522842, "dur": 11, "args": { "External id": 76229, "cbid": 211, "correlation": 76229 } }, { "ph": "s", "id": 76229, "pid": 76337, "tid": -914061504, "ts": 1716454222522842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222577515, "dur": 4, "args": { "External id": 76241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76241, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 76241, "pid": 5, "tid": 7, "ts": 1716454222577515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522862, "dur": 6, "args": { "External id": 76241, "cbid": 211, "correlation": 76241 } }, { "ph": "s", "id": 76241, "pid": 76337, "tid": -914061504, "ts": 1716454222522862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222577520, "dur": 17, "args": { "External id": 76244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76244, "pid": 5, "tid": 7, "ts": 1716454222577520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522881, "dur": 6, "args": { "External id": 76244, "cbid": 211, "correlation": 76244 } }, { "ph": "s", "id": 76244, "pid": 76337, "tid": -914061504, "ts": 1716454222522881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222577538, "dur": 11, "args": { "External id": 76253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76253, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76253, "pid": 5, "tid": 7, "ts": 1716454222577538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222522920, "dur": 9, "args": { "External id": 76253, "cbid": 211, "correlation": 76253 } }, { "ph": "s", "id": 76253, "pid": 76337, "tid": -914061504, "ts": 1716454222522920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222522988, "dur": 0, "args": { "External id": 76263, "cbid": 317, "correlation": 76263 } }, { "ph": "f", "id": 76263, "pid": 76337, "tid": -914061504, "ts": 1716454222522988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222522988, "dur": 0, "args": { "External id": 76264, "cbid": 203, "correlation": 76264 } }, { "ph": "f", "id": 76264, "pid": 76337, "tid": -914061504, "ts": 1716454222522988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222522989, "dur": 0, "args": { "External id": 76265, "cbid": 205, "correlation": 76265 } }, { "ph": "f", "id": 76265, "pid": 76337, "tid": -914061504, "ts": 1716454222522989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222577550, "dur": 11, "args": { "External id": 76269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76269, "pid": 5, "tid": 7, "ts": 1716454222577550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523004, "dur": 12, "args": { "External id": 76269, "cbid": 211, "correlation": 76269 } }, { "ph": "s", "id": 76269, "pid": 76337, "tid": -914061504, "ts": 1716454222523004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222577563, "dur": 166, "args": { "External id": 76271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76271, "pid": 5, "tid": 7, "ts": 1716454222577563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523019, "dur": 5, "args": { "External id": 76271, "cbid": 211, "correlation": 76271 } }, { "ph": "s", "id": 76271, "pid": 76337, "tid": -914061504, "ts": 1716454222523019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222577731, "dur": 1, "args": { "External id": 76273, "device": 5, "context": 1, "stream": 7, "correlation": 76273, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 76273, "pid": 5, "tid": 7, "ts": 1716454222577731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222523030, "dur": 6, "args": { "External id": 76273, "cbid": 51, "correlation": 76273 } }, { "ph": "s", "id": 76273, "pid": 76337, "tid": -914061504, "ts": 1716454222523030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222577735, "dur": 659, "args": { "External id": 76274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76274, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76274, "pid": 5, "tid": 7, "ts": 1716454222577735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523037, "dur": 6, "args": { "External id": 76274, "cbid": 211, "correlation": 76274 } }, { "ph": "s", "id": 76274, "pid": 76337, "tid": -914061504, "ts": 1716454222523037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222578396, "dur": 12, "args": { "External id": 76276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76276, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76276, "pid": 5, "tid": 7, "ts": 1716454222578396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523047, "dur": 5, "args": { "External id": 76276, "cbid": 211, "correlation": 76276 } }, { "ph": "s", "id": 76276, "pid": 76337, "tid": -914061504, "ts": 1716454222523047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222578410, "dur": 15, "args": { "External id": 76282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76282, "pid": 5, "tid": 7, "ts": 1716454222578410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523075, "dur": 8, "args": { "External id": 76282, "cbid": 211, "correlation": 76282 } }, { "ph": "s", "id": 76282, "pid": 76337, "tid": -914061504, "ts": 1716454222523075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222578426, "dur": 12, "args": { "External id": 76290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76290, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76290, "pid": 5, "tid": 7, "ts": 1716454222578426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523107, "dur": 9, "args": { "External id": 76290, "cbid": 211, "correlation": 76290 } }, { "ph": "s", "id": 76290, "pid": 76337, "tid": -914061504, "ts": 1716454222523107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222578439, "dur": 10, "args": { "External id": 76298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76298, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76298, "pid": 5, "tid": 7, "ts": 1716454222578439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523136, "dur": 8, "args": { "External id": 76298, "cbid": 211, "correlation": 76298 } }, { "ph": "s", "id": 76298, "pid": 76337, "tid": -914061504, "ts": 1716454222523136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222578451, "dur": 19, "args": { "External id": 76318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76318, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 76318, "pid": 5, "tid": 7, "ts": 1716454222578451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523213, "dur": 12, "args": { "External id": 76318, "cbid": 211, "correlation": 76318 } }, { "ph": "s", "id": 76318, "pid": 76337, "tid": -914061504, "ts": 1716454222523213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222578471, "dur": 4, "args": { "External id": 76330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76330, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 76330, "pid": 5, "tid": 7, "ts": 1716454222578471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523235, "dur": 7, "args": { "External id": 76330, "cbid": 211, "correlation": 76330 } }, { "ph": "s", "id": 76330, "pid": 76337, "tid": -914061504, "ts": 1716454222523235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222578476, "dur": 17, "args": { "External id": 76333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76333, "pid": 5, "tid": 7, "ts": 1716454222578476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523254, "dur": 6, "args": { "External id": 76333, "cbid": 211, "correlation": 76333 } }, { "ph": "s", "id": 76333, "pid": 76337, "tid": -914061504, "ts": 1716454222523254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222523310, "dur": 0, "args": { "External id": 76344, "cbid": 317, "correlation": 76344 } }, { "ph": "f", "id": 76344, "pid": 76337, "tid": -914061504, "ts": 1716454222523310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222523311, "dur": 0, "args": { "External id": 76345, "cbid": 203, "correlation": 76345 } }, { "ph": "f", "id": 76345, "pid": 76337, "tid": -914061504, "ts": 1716454222523311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222523311, "dur": 0, "args": { "External id": 76346, "cbid": 205, "correlation": 76346 } }, { "ph": "f", "id": 76346, "pid": 76337, "tid": -914061504, "ts": 1716454222523311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222578494, "dur": 11, "args": { "External id": 76350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76350, "pid": 5, "tid": 7, "ts": 1716454222578494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523324, "dur": 12, "args": { "External id": 76350, "cbid": 211, "correlation": 76350 } }, { "ph": "s", "id": 76350, "pid": 76337, "tid": -914061504, "ts": 1716454222523324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222578506, "dur": 4, "args": { "External id": 76352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76352, "pid": 5, "tid": 7, "ts": 1716454222578506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523340, "dur": 6, "args": { "External id": 76352, "cbid": 211, "correlation": 76352 } }, { "ph": "s", "id": 76352, "pid": 76337, "tid": -914061504, "ts": 1716454222523340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222523349, "dur": 0, "args": { "External id": 76353, "cbid": 51, "correlation": 76353 } }, { "ph": "s", "id": 76353, "pid": 76337, "tid": -914061504, "ts": 1716454222523349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222578512, "dur": 96, "args": { "External id": 76354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76354, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 76354, "pid": 5, "tid": 7, "ts": 1716454222578512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523350, "dur": 5, "args": { "External id": 76354, "cbid": 211, "correlation": 76354 } }, { "ph": "s", "id": 76354, "pid": 76337, "tid": -914061504, "ts": 1716454222523350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222578609, "dur": 16, "args": { "External id": 76359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76359, "pid": 5, "tid": 7, "ts": 1716454222578609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523376, "dur": 8, "args": { "External id": 76359, "cbid": 211, "correlation": 76359 } }, { "ph": "s", "id": 76359, "pid": 76337, "tid": -914061504, "ts": 1716454222523376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222578627, "dur": 84, "args": { "External id": 76368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76368, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76368, "pid": 5, "tid": 7, "ts": 1716454222578627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523458, "dur": 14, "args": { "External id": 76368, "cbid": 211, "correlation": 76368 } }, { "ph": "s", "id": 76368, "pid": 76337, "tid": -914061504, "ts": 1716454222523458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222578713, "dur": 31, "args": { "External id": 76390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76390, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76390, "pid": 5, "tid": 7, "ts": 1716454222578713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523514, "dur": 10, "args": { "External id": 76390, "cbid": 211, "correlation": 76390 } }, { "ph": "s", "id": 76390, "pid": 76337, "tid": -914061504, "ts": 1716454222523514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222523602, "dur": 1, "args": { "External id": 76401, "cbid": 251, "correlation": 76401 } }, { "ph": "f", "id": 76401, "pid": 76337, "tid": -914061504, "ts": 1716454222523602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222578746, "dur": 166, "args": { "External id": 76402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76402, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76402, "pid": 5, "tid": 7, "ts": 1716454222578746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523607, "dur": 13, "args": { "External id": 76402, "cbid": 211, "correlation": 76402 } }, { "ph": "s", "id": 76402, "pid": 76337, "tid": -914061504, "ts": 1716454222523607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222523677, "dur": 1, "args": { "External id": 76413, "cbid": 251, "correlation": 76413 } }, { "ph": "f", "id": 76413, "pid": 76337, "tid": -914061504, "ts": 1716454222523677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222578914, "dur": 160, "args": { "External id": 76414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76414, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76414, "pid": 5, "tid": 7, "ts": 1716454222578914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523681, "dur": 12, "args": { "External id": 76414, "cbid": 211, "correlation": 76414 } }, { "ph": "s", "id": 76414, "pid": 76337, "tid": -914061504, "ts": 1716454222523681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222523745, "dur": 1, "args": { "External id": 76425, "cbid": 251, "correlation": 76425 } }, { "ph": "f", "id": 76425, "pid": 76337, "tid": -914061504, "ts": 1716454222523745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222579075, "dur": 163, "args": { "External id": 76426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76426, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76426, "pid": 5, "tid": 7, "ts": 1716454222579075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523749, "dur": 11, "args": { "External id": 76426, "cbid": 211, "correlation": 76426 } }, { "ph": "s", "id": 76426, "pid": 76337, "tid": -914061504, "ts": 1716454222523749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222579240, "dur": 343, "args": { "External id": 76451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76451, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76451, "pid": 5, "tid": 7, "ts": 1716454222579240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523831, "dur": 13, "args": { "External id": 76451, "cbid": 211, "correlation": 76451 } }, { "ph": "s", "id": 76451, "pid": 76337, "tid": -914061504, "ts": 1716454222523831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222523931, "dur": 1, "args": { "External id": 76469, "cbid": 251, "correlation": 76469 } }, { "ph": "f", "id": 76469, "pid": 76337, "tid": -914061504, "ts": 1716454222523931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222579584, "dur": 170, "args": { "External id": 76471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76471, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76471, "pid": 5, "tid": 7, "ts": 1716454222579584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222523936, "dur": 13, "args": { "External id": 76471, "cbid": 211, "correlation": 76471 } }, { "ph": "s", "id": 76471, "pid": 76337, "tid": -914061504, "ts": 1716454222523936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222579756, "dur": 19, "args": { "External id": 76479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76479, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76479, "pid": 5, "tid": 7, "ts": 1716454222579756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524015, "dur": 13, "args": { "External id": 76479, "cbid": 211, "correlation": 76479 } }, { "ph": "s", "id": 76479, "pid": 76337, "tid": -914061504, "ts": 1716454222524015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222579776, "dur": 28, "args": { "External id": 76487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76487, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76487, "pid": 5, "tid": 7, "ts": 1716454222579776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524055, "dur": 9, "args": { "External id": 76487, "cbid": 211, "correlation": 76487 } }, { "ph": "s", "id": 76487, "pid": 76337, "tid": -914061504, "ts": 1716454222524055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222579805, "dur": 19, "args": { "External id": 76498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76498, "pid": 5, "tid": 7, "ts": 1716454222579805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524127, "dur": 12, "args": { "External id": 76498, "cbid": 211, "correlation": 76498 } }, { "ph": "s", "id": 76498, "pid": 76337, "tid": -914061504, "ts": 1716454222524127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222579825, "dur": 16, "args": { "External id": 76520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76520, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76520, "pid": 5, "tid": 7, "ts": 1716454222579825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524158, "dur": 8, "args": { "External id": 76520, "cbid": 211, "correlation": 76520 } }, { "ph": "s", "id": 76520, "pid": 76337, "tid": -914061504, "ts": 1716454222524158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524244, "dur": 1, "args": { "External id": 76531, "cbid": 251, "correlation": 76531 } }, { "ph": "f", "id": 76531, "pid": 76337, "tid": -914061504, "ts": 1716454222524244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222579843, "dur": 90, "args": { "External id": 76532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76532, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76532, "pid": 5, "tid": 7, "ts": 1716454222579843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524249, "dur": 13, "args": { "External id": 76532, "cbid": 211, "correlation": 76532 } }, { "ph": "s", "id": 76532, "pid": 76337, "tid": -914061504, "ts": 1716454222524249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524318, "dur": 1, "args": { "External id": 76543, "cbid": 251, "correlation": 76543 } }, { "ph": "f", "id": 76543, "pid": 76337, "tid": -914061504, "ts": 1716454222524318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524321, "dur": 0, "args": { "External id": 76544, "cbid": 251, "correlation": 76544 } }, { "ph": "f", "id": 76544, "pid": 76337, "tid": -914061504, "ts": 1716454222524321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222579934, "dur": 12, "args": { "External id": 76545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76545, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76545, "pid": 5, "tid": 7, "ts": 1716454222579934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524323, "dur": 12, "args": { "External id": 76545, "cbid": 211, "correlation": 76545 } }, { "ph": "s", "id": 76545, "pid": 76337, "tid": -914061504, "ts": 1716454222524323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222579948, "dur": 5, "args": { "External id": 76547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76547, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76547, "pid": 5, "tid": 7, "ts": 1716454222579948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524337, "dur": 5, "args": { "External id": 76547, "cbid": 211, "correlation": 76547 } }, { "ph": "s", "id": 76547, "pid": 76337, "tid": -914061504, "ts": 1716454222524337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524394, "dur": 1, "args": { "External id": 76558, "cbid": 251, "correlation": 76558 } }, { "ph": "f", "id": 76558, "pid": 76337, "tid": -914061504, "ts": 1716454222524394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524397, "dur": 0, "args": { "External id": 76559, "cbid": 251, "correlation": 76559 } }, { "ph": "f", "id": 76559, "pid": 76337, "tid": -914061504, "ts": 1716454222524397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222579955, "dur": 8, "args": { "External id": 76560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76560, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76560, "pid": 5, "tid": 7, "ts": 1716454222579955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524399, "dur": 11, "args": { "External id": 76560, "cbid": 211, "correlation": 76560 } }, { "ph": "s", "id": 76560, "pid": 76337, "tid": -914061504, "ts": 1716454222524399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222579965, "dur": 3, "args": { "External id": 76562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76562, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76562, "pid": 5, "tid": 7, "ts": 1716454222579965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524411, "dur": 5, "args": { "External id": 76562, "cbid": 211, "correlation": 76562 } }, { "ph": "s", "id": 76562, "pid": 76337, "tid": -914061504, "ts": 1716454222524411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222579969, "dur": 57, "args": { "External id": 76587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76587, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76587, "pid": 5, "tid": 7, "ts": 1716454222579969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524487, "dur": 13, "args": { "External id": 76587, "cbid": 211, "correlation": 76587 } }, { "ph": "s", "id": 76587, "pid": 76337, "tid": -914061504, "ts": 1716454222524487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524586, "dur": 1, "args": { "External id": 76605, "cbid": 251, "correlation": 76605 } }, { "ph": "f", "id": 76605, "pid": 76337, "tid": -914061504, "ts": 1716454222524586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222580027, "dur": 93, "args": { "External id": 76607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76607, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76607, "pid": 5, "tid": 7, "ts": 1716454222580027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524592, "dur": 14, "args": { "External id": 76607, "cbid": 211, "correlation": 76607 } }, { "ph": "s", "id": 76607, "pid": 76337, "tid": -914061504, "ts": 1716454222524592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222580122, "dur": 10, "args": { "External id": 76615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76615, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76615, "pid": 5, "tid": 7, "ts": 1716454222580122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524662, "dur": 12, "args": { "External id": 76615, "cbid": 211, "correlation": 76615 } }, { "ph": "s", "id": 76615, "pid": 76337, "tid": -914061504, "ts": 1716454222524662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222580133, "dur": 22, "args": { "External id": 76623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76623, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76623, "pid": 5, "tid": 7, "ts": 1716454222580133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524703, "dur": 9, "args": { "External id": 76623, "cbid": 211, "correlation": 76623 } }, { "ph": "s", "id": 76623, "pid": 76337, "tid": -914061504, "ts": 1716454222524703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222580156, "dur": 18, "args": { "External id": 76645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76645, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76645, "pid": 5, "tid": 7, "ts": 1716454222580156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524754, "dur": 10, "args": { "External id": 76645, "cbid": 211, "correlation": 76645 } }, { "ph": "s", "id": 76645, "pid": 76337, "tid": -914061504, "ts": 1716454222524754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524840, "dur": 1, "args": { "External id": 76661, "cbid": 251, "correlation": 76661 } }, { "ph": "f", "id": 76661, "pid": 76337, "tid": -914061504, "ts": 1716454222524840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222524845, "dur": 0, "args": { "External id": 76663, "cbid": 251, "correlation": 76663 } }, { "ph": "f", "id": 76663, "pid": 76337, "tid": -914061504, "ts": 1716454222524845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222580176, "dur": 496, "args": { "External id": 76664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76664, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76664, "pid": 5, "tid": 7, "ts": 1716454222580176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524847, "dur": 14, "args": { "External id": 76664, "cbid": 211, "correlation": 76664 } }, { "ph": "s", "id": 76664, "pid": 76337, "tid": -914061504, "ts": 1716454222524847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222580673, "dur": 67, "args": { "External id": 76672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76672, "pid": 5, "tid": 7, "ts": 1716454222580673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524913, "dur": 12, "args": { "External id": 76672, "cbid": 211, "correlation": 76672 } }, { "ph": "s", "id": 76672, "pid": 76337, "tid": -914061504, "ts": 1716454222524913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222580741, "dur": 67, "args": { "External id": 76680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76680, "pid": 5, "tid": 7, "ts": 1716454222580741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222524942, "dur": 8, "args": { "External id": 76680, "cbid": 211, "correlation": 76680 } }, { "ph": "s", "id": 76680, "pid": 76337, "tid": -914061504, "ts": 1716454222524942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222525030, "dur": 1, "args": { "External id": 76696, "cbid": 251, "correlation": 76696 } }, { "ph": "f", "id": 76696, "pid": 76337, "tid": -914061504, "ts": 1716454222525030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222580811, "dur": 1, "args": { "External id": 76698, "device": 5, "context": 1, "stream": 7, "correlation": 76698, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 76698, "pid": 5, "tid": 7, "ts": 1716454222580811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222525035, "dur": 9, "args": { "External id": 76698, "cbid": 51, "correlation": 76698 } }, { "ph": "s", "id": 76698, "pid": 76337, "tid": -914061504, "ts": 1716454222525035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222580814, "dur": 278, "args": { "External id": 76699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76699, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76699, "pid": 5, "tid": 7, "ts": 1716454222580814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525046, "dur": 11, "args": { "External id": 76699, "cbid": 211, "correlation": 76699 } }, { "ph": "s", "id": 76699, "pid": 76337, "tid": -914061504, "ts": 1716454222525046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222581093, "dur": 14, "args": { "External id": 76707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76707, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76707, "pid": 5, "tid": 7, "ts": 1716454222581093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525088, "dur": 10, "args": { "External id": 76707, "cbid": 211, "correlation": 76707 } }, { "ph": "s", "id": 76707, "pid": 76337, "tid": -914061504, "ts": 1716454222525088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222581109, "dur": 38, "args": { "External id": 76718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76718, "pid": 5, "tid": 7, "ts": 1716454222581109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525155, "dur": 12, "args": { "External id": 76718, "cbid": 211, "correlation": 76718 } }, { "ph": "s", "id": 76718, "pid": 76337, "tid": -914061504, "ts": 1716454222525155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222525220, "dur": 0, "args": { "External id": 76730, "cbid": 317, "correlation": 76730 } }, { "ph": "f", "id": 76730, "pid": 76337, "tid": -914061504, "ts": 1716454222525220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222525221, "dur": 0, "args": { "External id": 76731, "cbid": 203, "correlation": 76731 } }, { "ph": "f", "id": 76731, "pid": 76337, "tid": -914061504, "ts": 1716454222525221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222525221, "dur": 0, "args": { "External id": 76732, "cbid": 205, "correlation": 76732 } }, { "ph": "f", "id": 76732, "pid": 76337, "tid": -914061504, "ts": 1716454222525221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581149, "dur": 14, "args": { "External id": 76736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76736, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76736, "pid": 5, "tid": 7, "ts": 1716454222581149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525238, "dur": 12, "args": { "External id": 76736, "cbid": 211, "correlation": 76736 } }, { "ph": "s", "id": 76736, "pid": 76337, "tid": -914061504, "ts": 1716454222525238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222581163, "dur": 4, "args": { "External id": 76738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 76738, "pid": 5, "tid": 7, "ts": 1716454222581163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525254, "dur": 6, "args": { "External id": 76738, "cbid": 211, "correlation": 76738 } }, { "ph": "s", "id": 76738, "pid": 76337, "tid": -914061504, "ts": 1716454222525254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222525263, "dur": 0, "args": { "External id": 76739, "cbid": 51, "correlation": 76739 } }, { "ph": "s", "id": 76739, "pid": 76337, "tid": -914061504, "ts": 1716454222525263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222581169, "dur": 100, "args": { "External id": 76740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76740, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 76740, "pid": 5, "tid": 7, "ts": 1716454222581169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525264, "dur": 5, "args": { "External id": 76740, "cbid": 211, "correlation": 76740 } }, { "ph": "s", "id": 76740, "pid": 76337, "tid": -914061504, "ts": 1716454222525264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222581270, "dur": 17, "args": { "External id": 76745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76745, "pid": 5, "tid": 7, "ts": 1716454222581270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525291, "dur": 8, "args": { "External id": 76745, "cbid": 211, "correlation": 76745 } }, { "ph": "s", "id": 76745, "pid": 76337, "tid": -914061504, "ts": 1716454222525291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222581288, "dur": 11, "args": { "External id": 76753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76753, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76753, "pid": 5, "tid": 7, "ts": 1716454222581288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525322, "dur": 9, "args": { "External id": 76753, "cbid": 211, "correlation": 76753 } }, { "ph": "s", "id": 76753, "pid": 76337, "tid": -914061504, "ts": 1716454222525322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222525391, "dur": 0, "args": { "External id": 76763, "cbid": 317, "correlation": 76763 } }, { "ph": "f", "id": 76763, "pid": 76337, "tid": -914061504, "ts": 1716454222525391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222525392, "dur": 0, "args": { "External id": 76764, "cbid": 203, "correlation": 76764 } }, { "ph": "f", "id": 76764, "pid": 76337, "tid": -914061504, "ts": 1716454222525392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222525393, "dur": 0, "args": { "External id": 76765, "cbid": 205, "correlation": 76765 } }, { "ph": "f", "id": 76765, "pid": 76337, "tid": -914061504, "ts": 1716454222525393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581301, "dur": 12, "args": { "External id": 76769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76769, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76769, "pid": 5, "tid": 7, "ts": 1716454222581301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525407, "dur": 12, "args": { "External id": 76769, "cbid": 211, "correlation": 76769 } }, { "ph": "s", "id": 76769, "pid": 76337, "tid": -914061504, "ts": 1716454222525407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581315, "dur": 165, "args": { "External id": 76771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76771, "pid": 5, "tid": 7, "ts": 1716454222581315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525421, "dur": 5, "args": { "External id": 76771, "cbid": 211, "correlation": 76771 } }, { "ph": "s", "id": 76771, "pid": 76337, "tid": -914061504, "ts": 1716454222525421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222581482, "dur": 1, "args": { "External id": 76773, "device": 5, "context": 1, "stream": 7, "correlation": 76773, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 76773, "pid": 5, "tid": 7, "ts": 1716454222581482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222525432, "dur": 6, "args": { "External id": 76773, "cbid": 51, "correlation": 76773 } }, { "ph": "s", "id": 76773, "pid": 76337, "tid": -914061504, "ts": 1716454222525432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222581486, "dur": 201, "args": { "External id": 76774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76774, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 76774, "pid": 5, "tid": 7, "ts": 1716454222581486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525440, "dur": 8, "args": { "External id": 76774, "cbid": 211, "correlation": 76774 } }, { "ph": "s", "id": 76774, "pid": 76337, "tid": -914061504, "ts": 1716454222525440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581688, "dur": 6, "args": { "External id": 76776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76776, "pid": 5, "tid": 7, "ts": 1716454222581688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525452, "dur": 5, "args": { "External id": 76776, "cbid": 211, "correlation": 76776 } }, { "ph": "s", "id": 76776, "pid": 76337, "tid": -914061504, "ts": 1716454222525452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222581696, "dur": 7, "args": { "External id": 76782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76782, "pid": 5, "tid": 7, "ts": 1716454222581696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525481, "dur": 8, "args": { "External id": 76782, "cbid": 211, "correlation": 76782 } }, { "ph": "s", "id": 76782, "pid": 76337, "tid": -914061504, "ts": 1716454222525481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222581704, "dur": 11, "args": { "External id": 76802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76802, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 76802, "pid": 5, "tid": 7, "ts": 1716454222581704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525574, "dur": 12, "args": { "External id": 76802, "cbid": 211, "correlation": 76802 } }, { "ph": "s", "id": 76802, "pid": 76337, "tid": -914061504, "ts": 1716454222525574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222581716, "dur": 4, "args": { "External id": 76814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76814, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 76814, "pid": 5, "tid": 7, "ts": 1716454222581716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525596, "dur": 6, "args": { "External id": 76814, "cbid": 211, "correlation": 76814 } }, { "ph": "s", "id": 76814, "pid": 76337, "tid": -914061504, "ts": 1716454222525596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222581722, "dur": 9, "args": { "External id": 76817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76817, "pid": 5, "tid": 7, "ts": 1716454222581722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525614, "dur": 6, "args": { "External id": 76817, "cbid": 211, "correlation": 76817 } }, { "ph": "s", "id": 76817, "pid": 76337, "tid": -914061504, "ts": 1716454222525614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222581731, "dur": 5, "args": { "External id": 76826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76826, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76826, "pid": 5, "tid": 7, "ts": 1716454222581731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525654, "dur": 9, "args": { "External id": 76826, "cbid": 211, "correlation": 76826 } }, { "ph": "s", "id": 76826, "pid": 76337, "tid": -914061504, "ts": 1716454222525654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222525705, "dur": 0, "args": { "External id": 76836, "cbid": 317, "correlation": 76836 } }, { "ph": "f", "id": 76836, "pid": 76337, "tid": -914061504, "ts": 1716454222525705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222525706, "dur": 0, "args": { "External id": 76837, "cbid": 203, "correlation": 76837 } }, { "ph": "f", "id": 76837, "pid": 76337, "tid": -914061504, "ts": 1716454222525706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222525707, "dur": 0, "args": { "External id": 76838, "cbid": 205, "correlation": 76838 } }, { "ph": "f", "id": 76838, "pid": 76337, "tid": -914061504, "ts": 1716454222525707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581738, "dur": 5, "args": { "External id": 76842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76842, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76842, "pid": 5, "tid": 7, "ts": 1716454222581738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525722, "dur": 11, "args": { "External id": 76842, "cbid": 211, "correlation": 76842 } }, { "ph": "s", "id": 76842, "pid": 76337, "tid": -914061504, "ts": 1716454222525722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222581744, "dur": 165, "args": { "External id": 76844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76844, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76844, "pid": 5, "tid": 7, "ts": 1716454222581744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525736, "dur": 6, "args": { "External id": 76844, "cbid": 211, "correlation": 76844 } }, { "ph": "s", "id": 76844, "pid": 76337, "tid": -914061504, "ts": 1716454222525736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222581912, "dur": 1, "args": { "External id": 76846, "device": 5, "context": 1, "stream": 7, "correlation": 76846, "bytes": 240, "memory bandwidth (GB/s)": 0.15315890236119975 } }, { "ph": "f", "id": 76846, "pid": 5, "tid": 7, "ts": 1716454222581912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222525747, "dur": 7, "args": { "External id": 76846, "cbid": 51, "correlation": 76846 } }, { "ph": "s", "id": 76846, "pid": 76337, "tid": -914061504, "ts": 1716454222525747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222581915, "dur": 275, "args": { "External id": 76847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76847, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76847, "pid": 5, "tid": 7, "ts": 1716454222581915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525755, "dur": 6, "args": { "External id": 76847, "cbid": 211, "correlation": 76847 } }, { "ph": "s", "id": 76847, "pid": 76337, "tid": -914061504, "ts": 1716454222525755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582192, "dur": 6, "args": { "External id": 76849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76849, "pid": 5, "tid": 7, "ts": 1716454222582192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525765, "dur": 6, "args": { "External id": 76849, "cbid": 211, "correlation": 76849 } }, { "ph": "s", "id": 76849, "pid": 76337, "tid": -914061504, "ts": 1716454222525765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222582199, "dur": 6, "args": { "External id": 76855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76855, "pid": 5, "tid": 7, "ts": 1716454222582199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525794, "dur": 8, "args": { "External id": 76855, "cbid": 211, "correlation": 76855 } }, { "ph": "s", "id": 76855, "pid": 76337, "tid": -914061504, "ts": 1716454222525794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222582207, "dur": 3, "args": { "External id": 76863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76863, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 76863, "pid": 5, "tid": 7, "ts": 1716454222582207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525836, "dur": 10, "args": { "External id": 76863, "cbid": 211, "correlation": 76863 } }, { "ph": "s", "id": 76863, "pid": 76337, "tid": -914061504, "ts": 1716454222525836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222525900, "dur": 1, "args": { "External id": 76879, "cbid": 251, "correlation": 76879 } }, { "ph": "f", "id": 76879, "pid": 76337, "tid": -914061504, "ts": 1716454222525900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222525905, "dur": 0, "args": { "External id": 76881, "cbid": 251, "correlation": 76881 } }, { "ph": "f", "id": 76881, "pid": 76337, "tid": -914061504, "ts": 1716454222525905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222582212, "dur": 13, "args": { "External id": 76882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76882, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76882, "pid": 5, "tid": 7, "ts": 1716454222582212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525907, "dur": 11, "args": { "External id": 76882, "cbid": 211, "correlation": 76882 } }, { "ph": "s", "id": 76882, "pid": 76337, "tid": -914061504, "ts": 1716454222525907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222582226, "dur": 5, "args": { "External id": 76884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76884, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76884, "pid": 5, "tid": 7, "ts": 1716454222582226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525920, "dur": 5, "args": { "External id": 76884, "cbid": 211, "correlation": 76884 } }, { "ph": "s", "id": 76884, "pid": 76337, "tid": -914061504, "ts": 1716454222525920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222582233, "dur": 6, "args": { "External id": 76894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76894, "pid": 5, "tid": 7, "ts": 1716454222582233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222525987, "dur": 13, "args": { "External id": 76894, "cbid": 211, "correlation": 76894 } }, { "ph": "s", "id": 76894, "pid": 76337, "tid": -914061504, "ts": 1716454222525987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222582240, "dur": 10, "args": { "External id": 76914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76914, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 76914, "pid": 5, "tid": 7, "ts": 1716454222582240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526055, "dur": 11, "args": { "External id": 76914, "cbid": 211, "correlation": 76914 } }, { "ph": "s", "id": 76914, "pid": 76337, "tid": -914061504, "ts": 1716454222526055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222582251, "dur": 4, "args": { "External id": 76926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76926, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 76926, "pid": 5, "tid": 7, "ts": 1716454222582251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526076, "dur": 6, "args": { "External id": 76926, "cbid": 211, "correlation": 76926 } }, { "ph": "s", "id": 76926, "pid": 76337, "tid": -914061504, "ts": 1716454222526076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222582257, "dur": 7, "args": { "External id": 76929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76929, "pid": 5, "tid": 7, "ts": 1716454222582257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526094, "dur": 7, "args": { "External id": 76929, "cbid": 211, "correlation": 76929 } }, { "ph": "s", "id": 76929, "pid": 76337, "tid": -914061504, "ts": 1716454222526094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222582265, "dur": 5, "args": { "External id": 76938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76938, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76938, "pid": 5, "tid": 7, "ts": 1716454222582265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526134, "dur": 10, "args": { "External id": 76938, "cbid": 211, "correlation": 76938 } }, { "ph": "s", "id": 76938, "pid": 76337, "tid": -914061504, "ts": 1716454222526134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222526197, "dur": 0, "args": { "External id": 76948, "cbid": 317, "correlation": 76948 } }, { "ph": "f", "id": 76948, "pid": 76337, "tid": -914061504, "ts": 1716454222526197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222526198, "dur": 0, "args": { "External id": 76949, "cbid": 203, "correlation": 76949 } }, { "ph": "f", "id": 76949, "pid": 76337, "tid": -914061504, "ts": 1716454222526198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222526199, "dur": 0, "args": { "External id": 76950, "cbid": 205, "correlation": 76950 } }, { "ph": "f", "id": 76950, "pid": 76337, "tid": -914061504, "ts": 1716454222526199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582271, "dur": 5, "args": { "External id": 76954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76954, "pid": 5, "tid": 7, "ts": 1716454222582271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526213, "dur": 11, "args": { "External id": 76954, "cbid": 211, "correlation": 76954 } }, { "ph": "s", "id": 76954, "pid": 76337, "tid": -914061504, "ts": 1716454222526213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582278, "dur": 164, "args": { "External id": 76956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76956, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76956, "pid": 5, "tid": 7, "ts": 1716454222582278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526227, "dur": 5, "args": { "External id": 76956, "cbid": 211, "correlation": 76956 } }, { "ph": "s", "id": 76956, "pid": 76337, "tid": -914061504, "ts": 1716454222526227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222582444, "dur": 1, "args": { "External id": 76958, "device": 5, "context": 1, "stream": 7, "correlation": 76958, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 76958, "pid": 5, "tid": 7, "ts": 1716454222582444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222526237, "dur": 7, "args": { "External id": 76958, "cbid": 51, "correlation": 76958 } }, { "ph": "s", "id": 76958, "pid": 76337, "tid": -914061504, "ts": 1716454222526237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222582448, "dur": 263, "args": { "External id": 76959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76959, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 76959, "pid": 5, "tid": 7, "ts": 1716454222582448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526246, "dur": 6, "args": { "External id": 76959, "cbid": 211, "correlation": 76959 } }, { "ph": "s", "id": 76959, "pid": 76337, "tid": -914061504, "ts": 1716454222526246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582712, "dur": 6, "args": { "External id": 76961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76961, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 76961, "pid": 5, "tid": 7, "ts": 1716454222582712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526255, "dur": 5, "args": { "External id": 76961, "cbid": 211, "correlation": 76961 } }, { "ph": "s", "id": 76961, "pid": 76337, "tid": -914061504, "ts": 1716454222526255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222582720, "dur": 6, "args": { "External id": 76967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76967, "pid": 5, "tid": 7, "ts": 1716454222582720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526282, "dur": 8, "args": { "External id": 76967, "cbid": 211, "correlation": 76967 } }, { "ph": "s", "id": 76967, "pid": 76337, "tid": -914061504, "ts": 1716454222526282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222582727, "dur": 5, "args": { "External id": 76975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76975, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76975, "pid": 5, "tid": 7, "ts": 1716454222582727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526315, "dur": 8, "args": { "External id": 76975, "cbid": 211, "correlation": 76975 } }, { "ph": "s", "id": 76975, "pid": 76337, "tid": -914061504, "ts": 1716454222526315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222582734, "dur": 5, "args": { "External id": 76983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 76983, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 76983, "pid": 5, "tid": 7, "ts": 1716454222582734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526344, "dur": 9, "args": { "External id": 76983, "cbid": 211, "correlation": 76983 } }, { "ph": "s", "id": 76983, "pid": 76337, "tid": -914061504, "ts": 1716454222526344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222582740, "dur": 10, "args": { "External id": 77003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77003, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77003, "pid": 5, "tid": 7, "ts": 1716454222582740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526419, "dur": 13, "args": { "External id": 77003, "cbid": 211, "correlation": 77003 } }, { "ph": "s", "id": 77003, "pid": 76337, "tid": -914061504, "ts": 1716454222526419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222582751, "dur": 4, "args": { "External id": 77015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77015, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77015, "pid": 5, "tid": 7, "ts": 1716454222582751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526441, "dur": 6, "args": { "External id": 77015, "cbid": 211, "correlation": 77015 } }, { "ph": "s", "id": 77015, "pid": 76337, "tid": -914061504, "ts": 1716454222526441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222582756, "dur": 6, "args": { "External id": 77018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77018, "pid": 5, "tid": 7, "ts": 1716454222582756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526459, "dur": 6, "args": { "External id": 77018, "cbid": 211, "correlation": 77018 } }, { "ph": "s", "id": 77018, "pid": 76337, "tid": -914061504, "ts": 1716454222526459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222582764, "dur": 5, "args": { "External id": 77027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77027, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77027, "pid": 5, "tid": 7, "ts": 1716454222582764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526496, "dur": 10, "args": { "External id": 77027, "cbid": 211, "correlation": 77027 } }, { "ph": "s", "id": 77027, "pid": 76337, "tid": -914061504, "ts": 1716454222526496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222526547, "dur": 0, "args": { "External id": 77037, "cbid": 317, "correlation": 77037 } }, { "ph": "f", "id": 77037, "pid": 76337, "tid": -914061504, "ts": 1716454222526547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222526548, "dur": 0, "args": { "External id": 77038, "cbid": 203, "correlation": 77038 } }, { "ph": "f", "id": 77038, "pid": 76337, "tid": -914061504, "ts": 1716454222526548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222526548, "dur": 0, "args": { "External id": 77039, "cbid": 205, "correlation": 77039 } }, { "ph": "f", "id": 77039, "pid": 76337, "tid": -914061504, "ts": 1716454222526548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582770, "dur": 5, "args": { "External id": 77043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77043, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77043, "pid": 5, "tid": 7, "ts": 1716454222582770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526563, "dur": 11, "args": { "External id": 77043, "cbid": 211, "correlation": 77043 } }, { "ph": "s", "id": 77043, "pid": 76337, "tid": -914061504, "ts": 1716454222526563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222582776, "dur": 166, "args": { "External id": 77045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77045, "pid": 5, "tid": 7, "ts": 1716454222582776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526577, "dur": 5, "args": { "External id": 77045, "cbid": 211, "correlation": 77045 } }, { "ph": "s", "id": 77045, "pid": 76337, "tid": -914061504, "ts": 1716454222526577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222582944, "dur": 1, "args": { "External id": 77047, "device": 5, "context": 1, "stream": 7, "correlation": 77047, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 77047, "pid": 5, "tid": 7, "ts": 1716454222582944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222526587, "dur": 6, "args": { "External id": 77047, "cbid": 51, "correlation": 77047 } }, { "ph": "s", "id": 77047, "pid": 76337, "tid": -914061504, "ts": 1716454222526587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222582948, "dur": 263, "args": { "External id": 77048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77048, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77048, "pid": 5, "tid": 7, "ts": 1716454222582948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526594, "dur": 6, "args": { "External id": 77048, "cbid": 211, "correlation": 77048 } }, { "ph": "s", "id": 77048, "pid": 76337, "tid": -914061504, "ts": 1716454222526594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583212, "dur": 6, "args": { "External id": 77050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77050, "pid": 5, "tid": 7, "ts": 1716454222583212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526604, "dur": 5, "args": { "External id": 77050, "cbid": 211, "correlation": 77050 } }, { "ph": "s", "id": 77050, "pid": 76337, "tid": -914061504, "ts": 1716454222526604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222583219, "dur": 6, "args": { "External id": 77056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77056, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77056, "pid": 5, "tid": 7, "ts": 1716454222583219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526632, "dur": 8, "args": { "External id": 77056, "cbid": 211, "correlation": 77056 } }, { "ph": "s", "id": 77056, "pid": 76337, "tid": -914061504, "ts": 1716454222526632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222583227, "dur": 3, "args": { "External id": 77064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77064, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 77064, "pid": 5, "tid": 7, "ts": 1716454222583227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526676, "dur": 9, "args": { "External id": 77064, "cbid": 211, "correlation": 77064 } }, { "ph": "s", "id": 77064, "pid": 76337, "tid": -914061504, "ts": 1716454222526676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222526738, "dur": 1, "args": { "External id": 77080, "cbid": 251, "correlation": 77080 } }, { "ph": "f", "id": 77080, "pid": 76337, "tid": -914061504, "ts": 1716454222526738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222526743, "dur": 0, "args": { "External id": 77082, "cbid": 251, "correlation": 77082 } }, { "ph": "f", "id": 77082, "pid": 76337, "tid": -914061504, "ts": 1716454222526743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222583231, "dur": 10, "args": { "External id": 77083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77083, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77083, "pid": 5, "tid": 7, "ts": 1716454222583231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526745, "dur": 11, "args": { "External id": 77083, "cbid": 211, "correlation": 77083 } }, { "ph": "s", "id": 77083, "pid": 76337, "tid": -914061504, "ts": 1716454222526745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222583242, "dur": 4, "args": { "External id": 77085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77085, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77085, "pid": 5, "tid": 7, "ts": 1716454222583242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526758, "dur": 6, "args": { "External id": 77085, "cbid": 211, "correlation": 77085 } }, { "ph": "s", "id": 77085, "pid": 76337, "tid": -914061504, "ts": 1716454222526758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222583247, "dur": 6, "args": { "External id": 77095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77095, "pid": 5, "tid": 7, "ts": 1716454222583247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526815, "dur": 12, "args": { "External id": 77095, "cbid": 211, "correlation": 77095 } }, { "ph": "s", "id": 77095, "pid": 76337, "tid": -914061504, "ts": 1716454222526815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222583254, "dur": 10, "args": { "External id": 77115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77115, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77115, "pid": 5, "tid": 7, "ts": 1716454222583254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526880, "dur": 10, "args": { "External id": 77115, "cbid": 211, "correlation": 77115 } }, { "ph": "s", "id": 77115, "pid": 76337, "tid": -914061504, "ts": 1716454222526880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222583265, "dur": 4, "args": { "External id": 77127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77127, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77127, "pid": 5, "tid": 7, "ts": 1716454222583265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526901, "dur": 6, "args": { "External id": 77127, "cbid": 211, "correlation": 77127 } }, { "ph": "s", "id": 77127, "pid": 76337, "tid": -914061504, "ts": 1716454222526901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222583271, "dur": 7, "args": { "External id": 77130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77130, "pid": 5, "tid": 7, "ts": 1716454222583271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526918, "dur": 7, "args": { "External id": 77130, "cbid": 211, "correlation": 77130 } }, { "ph": "s", "id": 77130, "pid": 76337, "tid": -914061504, "ts": 1716454222526918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222583279, "dur": 5, "args": { "External id": 77139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77139, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77139, "pid": 5, "tid": 7, "ts": 1716454222583279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222526958, "dur": 10, "args": { "External id": 77139, "cbid": 211, "correlation": 77139 } }, { "ph": "s", "id": 77139, "pid": 76337, "tid": -914061504, "ts": 1716454222526958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222527030, "dur": 0, "args": { "External id": 77149, "cbid": 317, "correlation": 77149 } }, { "ph": "f", "id": 77149, "pid": 76337, "tid": -914061504, "ts": 1716454222527030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222527031, "dur": 0, "args": { "External id": 77150, "cbid": 203, "correlation": 77150 } }, { "ph": "f", "id": 77150, "pid": 76337, "tid": -914061504, "ts": 1716454222527031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222527032, "dur": 0, "args": { "External id": 77151, "cbid": 205, "correlation": 77151 } }, { "ph": "f", "id": 77151, "pid": 76337, "tid": -914061504, "ts": 1716454222527032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583285, "dur": 5, "args": { "External id": 77155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77155, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77155, "pid": 5, "tid": 7, "ts": 1716454222583285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527046, "dur": 12, "args": { "External id": 77155, "cbid": 211, "correlation": 77155 } }, { "ph": "s", "id": 77155, "pid": 76337, "tid": -914061504, "ts": 1716454222527046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583292, "dur": 166, "args": { "External id": 77157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77157, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77157, "pid": 5, "tid": 7, "ts": 1716454222583292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527061, "dur": 5, "args": { "External id": 77157, "cbid": 211, "correlation": 77157 } }, { "ph": "s", "id": 77157, "pid": 76337, "tid": -914061504, "ts": 1716454222527061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222583460, "dur": 1, "args": { "External id": 77159, "device": 5, "context": 1, "stream": 7, "correlation": 77159, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 77159, "pid": 5, "tid": 7, "ts": 1716454222583460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222527071, "dur": 6, "args": { "External id": 77159, "cbid": 51, "correlation": 77159 } }, { "ph": "s", "id": 77159, "pid": 76337, "tid": -914061504, "ts": 1716454222527071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222583463, "dur": 263, "args": { "External id": 77160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77160, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77160, "pid": 5, "tid": 7, "ts": 1716454222583463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527079, "dur": 7, "args": { "External id": 77160, "cbid": 211, "correlation": 77160 } }, { "ph": "s", "id": 77160, "pid": 76337, "tid": -914061504, "ts": 1716454222527079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583728, "dur": 6, "args": { "External id": 77162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77162, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77162, "pid": 5, "tid": 7, "ts": 1716454222583728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527090, "dur": 5, "args": { "External id": 77162, "cbid": 211, "correlation": 77162 } }, { "ph": "s", "id": 77162, "pid": 76337, "tid": -914061504, "ts": 1716454222527090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222583735, "dur": 6, "args": { "External id": 77168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77168, "pid": 5, "tid": 7, "ts": 1716454222583735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527118, "dur": 8, "args": { "External id": 77168, "cbid": 211, "correlation": 77168 } }, { "ph": "s", "id": 77168, "pid": 76337, "tid": -914061504, "ts": 1716454222527118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222583743, "dur": 5, "args": { "External id": 77176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77176, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77176, "pid": 5, "tid": 7, "ts": 1716454222583743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527151, "dur": 8, "args": { "External id": 77176, "cbid": 211, "correlation": 77176 } }, { "ph": "s", "id": 77176, "pid": 76337, "tid": -914061504, "ts": 1716454222527151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222583749, "dur": 5, "args": { "External id": 77184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77184, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77184, "pid": 5, "tid": 7, "ts": 1716454222583749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527179, "dur": 8, "args": { "External id": 77184, "cbid": 211, "correlation": 77184 } }, { "ph": "s", "id": 77184, "pid": 76337, "tid": -914061504, "ts": 1716454222527179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222583755, "dur": 10, "args": { "External id": 77204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77204, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77204, "pid": 5, "tid": 7, "ts": 1716454222583755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527284, "dur": 14, "args": { "External id": 77204, "cbid": 211, "correlation": 77204 } }, { "ph": "s", "id": 77204, "pid": 76337, "tid": -914061504, "ts": 1716454222527284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222583766, "dur": 4, "args": { "External id": 77216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77216, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77216, "pid": 5, "tid": 7, "ts": 1716454222583766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527308, "dur": 6, "args": { "External id": 77216, "cbid": 211, "correlation": 77216 } }, { "ph": "s", "id": 77216, "pid": 76337, "tid": -914061504, "ts": 1716454222527308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222583771, "dur": 7, "args": { "External id": 77219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77219, "pid": 5, "tid": 7, "ts": 1716454222583771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527326, "dur": 6, "args": { "External id": 77219, "cbid": 211, "correlation": 77219 } }, { "ph": "s", "id": 77219, "pid": 76337, "tid": -914061504, "ts": 1716454222527326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222583779, "dur": 5, "args": { "External id": 77228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77228, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77228, "pid": 5, "tid": 7, "ts": 1716454222583779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527365, "dur": 9, "args": { "External id": 77228, "cbid": 211, "correlation": 77228 } }, { "ph": "s", "id": 77228, "pid": 76337, "tid": -914061504, "ts": 1716454222527365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222527416, "dur": 0, "args": { "External id": 77238, "cbid": 317, "correlation": 77238 } }, { "ph": "f", "id": 77238, "pid": 76337, "tid": -914061504, "ts": 1716454222527416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222527417, "dur": 0, "args": { "External id": 77239, "cbid": 203, "correlation": 77239 } }, { "ph": "f", "id": 77239, "pid": 76337, "tid": -914061504, "ts": 1716454222527417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222527418, "dur": 0, "args": { "External id": 77240, "cbid": 205, "correlation": 77240 } }, { "ph": "f", "id": 77240, "pid": 76337, "tid": -914061504, "ts": 1716454222527418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583785, "dur": 5, "args": { "External id": 77244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77244, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77244, "pid": 5, "tid": 7, "ts": 1716454222583785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527431, "dur": 11, "args": { "External id": 77244, "cbid": 211, "correlation": 77244 } }, { "ph": "s", "id": 77244, "pid": 76337, "tid": -914061504, "ts": 1716454222527431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222583791, "dur": 165, "args": { "External id": 77246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77246, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77246, "pid": 5, "tid": 7, "ts": 1716454222583791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527445, "dur": 5, "args": { "External id": 77246, "cbid": 211, "correlation": 77246 } }, { "ph": "s", "id": 77246, "pid": 76337, "tid": -914061504, "ts": 1716454222527445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222583958, "dur": 1, "args": { "External id": 77248, "device": 5, "context": 1, "stream": 7, "correlation": 77248, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 77248, "pid": 5, "tid": 7, "ts": 1716454222583958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222527455, "dur": 6, "args": { "External id": 77248, "cbid": 51, "correlation": 77248 } }, { "ph": "s", "id": 77248, "pid": 76337, "tid": -914061504, "ts": 1716454222527455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222583962, "dur": 263, "args": { "External id": 77249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77249, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77249, "pid": 5, "tid": 7, "ts": 1716454222583962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527462, "dur": 6, "args": { "External id": 77249, "cbid": 211, "correlation": 77249 } }, { "ph": "s", "id": 77249, "pid": 76337, "tid": -914061504, "ts": 1716454222527462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584227, "dur": 6, "args": { "External id": 77251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77251, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77251, "pid": 5, "tid": 7, "ts": 1716454222584227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527472, "dur": 5, "args": { "External id": 77251, "cbid": 211, "correlation": 77251 } }, { "ph": "s", "id": 77251, "pid": 76337, "tid": -914061504, "ts": 1716454222527472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584233, "dur": 6, "args": { "External id": 77257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77257, "pid": 5, "tid": 7, "ts": 1716454222584233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527500, "dur": 8, "args": { "External id": 77257, "cbid": 211, "correlation": 77257 } }, { "ph": "s", "id": 77257, "pid": 76337, "tid": -914061504, "ts": 1716454222527500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222584241, "dur": 3, "args": { "External id": 77265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77265, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 77265, "pid": 5, "tid": 7, "ts": 1716454222584241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527543, "dur": 10, "args": { "External id": 77265, "cbid": 211, "correlation": 77265 } }, { "ph": "s", "id": 77265, "pid": 76337, "tid": -914061504, "ts": 1716454222527543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222527605, "dur": 1, "args": { "External id": 77281, "cbid": 251, "correlation": 77281 } }, { "ph": "f", "id": 77281, "pid": 76337, "tid": -914061504, "ts": 1716454222527605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222527611, "dur": 0, "args": { "External id": 77283, "cbid": 251, "correlation": 77283 } }, { "ph": "f", "id": 77283, "pid": 76337, "tid": -914061504, "ts": 1716454222527611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222584245, "dur": 11, "args": { "External id": 77284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77284, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77284, "pid": 5, "tid": 7, "ts": 1716454222584245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527613, "dur": 11, "args": { "External id": 77284, "cbid": 211, "correlation": 77284 } }, { "ph": "s", "id": 77284, "pid": 76337, "tid": -914061504, "ts": 1716454222527613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222584258, "dur": 4, "args": { "External id": 77286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77286, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77286, "pid": 5, "tid": 7, "ts": 1716454222584258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527625, "dur": 5, "args": { "External id": 77286, "cbid": 211, "correlation": 77286 } }, { "ph": "s", "id": 77286, "pid": 76337, "tid": -914061504, "ts": 1716454222527625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584263, "dur": 6, "args": { "External id": 77296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77296, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77296, "pid": 5, "tid": 7, "ts": 1716454222584263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527682, "dur": 12, "args": { "External id": 77296, "cbid": 211, "correlation": 77296 } }, { "ph": "s", "id": 77296, "pid": 76337, "tid": -914061504, "ts": 1716454222527682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222584270, "dur": 10, "args": { "External id": 77316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77316, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77316, "pid": 5, "tid": 7, "ts": 1716454222584270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527748, "dur": 10, "args": { "External id": 77316, "cbid": 211, "correlation": 77316 } }, { "ph": "s", "id": 77316, "pid": 76337, "tid": -914061504, "ts": 1716454222527748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222584281, "dur": 4, "args": { "External id": 77328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77328, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77328, "pid": 5, "tid": 7, "ts": 1716454222584281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527769, "dur": 6, "args": { "External id": 77328, "cbid": 211, "correlation": 77328 } }, { "ph": "s", "id": 77328, "pid": 76337, "tid": -914061504, "ts": 1716454222527769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584286, "dur": 7, "args": { "External id": 77331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77331, "pid": 5, "tid": 7, "ts": 1716454222584286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527787, "dur": 6, "args": { "External id": 77331, "cbid": 211, "correlation": 77331 } }, { "ph": "s", "id": 77331, "pid": 76337, "tid": -914061504, "ts": 1716454222527787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222584295, "dur": 5, "args": { "External id": 77340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77340, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77340, "pid": 5, "tid": 7, "ts": 1716454222584295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527826, "dur": 10, "args": { "External id": 77340, "cbid": 211, "correlation": 77340 } }, { "ph": "s", "id": 77340, "pid": 76337, "tid": -914061504, "ts": 1716454222527826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222527889, "dur": 0, "args": { "External id": 77350, "cbid": 317, "correlation": 77350 } }, { "ph": "f", "id": 77350, "pid": 76337, "tid": -914061504, "ts": 1716454222527889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222527890, "dur": 0, "args": { "External id": 77351, "cbid": 203, "correlation": 77351 } }, { "ph": "f", "id": 77351, "pid": 76337, "tid": -914061504, "ts": 1716454222527890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222527891, "dur": 0, "args": { "External id": 77352, "cbid": 205, "correlation": 77352 } }, { "ph": "f", "id": 77352, "pid": 76337, "tid": -914061504, "ts": 1716454222527891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584301, "dur": 5, "args": { "External id": 77356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77356, "pid": 5, "tid": 7, "ts": 1716454222584301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527905, "dur": 12, "args": { "External id": 77356, "cbid": 211, "correlation": 77356 } }, { "ph": "s", "id": 77356, "pid": 76337, "tid": -914061504, "ts": 1716454222527905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584307, "dur": 165, "args": { "External id": 77358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77358, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77358, "pid": 5, "tid": 7, "ts": 1716454222584307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527919, "dur": 5, "args": { "External id": 77358, "cbid": 211, "correlation": 77358 } }, { "ph": "s", "id": 77358, "pid": 76337, "tid": -914061504, "ts": 1716454222527919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222584474, "dur": 1, "args": { "External id": 77360, "device": 5, "context": 1, "stream": 7, "correlation": 77360, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 77360, "pid": 5, "tid": 7, "ts": 1716454222584474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222527930, "dur": 6, "args": { "External id": 77360, "cbid": 51, "correlation": 77360 } }, { "ph": "s", "id": 77360, "pid": 76337, "tid": -914061504, "ts": 1716454222527930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222584478, "dur": 264, "args": { "External id": 77361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77361, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77361, "pid": 5, "tid": 7, "ts": 1716454222584478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527937, "dur": 6, "args": { "External id": 77361, "cbid": 211, "correlation": 77361 } }, { "ph": "s", "id": 77361, "pid": 76337, "tid": -914061504, "ts": 1716454222527937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584744, "dur": 6, "args": { "External id": 77363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77363, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77363, "pid": 5, "tid": 7, "ts": 1716454222584744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527946, "dur": 5, "args": { "External id": 77363, "cbid": 211, "correlation": 77363 } }, { "ph": "s", "id": 77363, "pid": 76337, "tid": -914061504, "ts": 1716454222527946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584751, "dur": 6, "args": { "External id": 77369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77369, "pid": 5, "tid": 7, "ts": 1716454222584751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222527983, "dur": 9, "args": { "External id": 77369, "cbid": 211, "correlation": 77369 } }, { "ph": "s", "id": 77369, "pid": 76337, "tid": -914061504, "ts": 1716454222527983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222584758, "dur": 5, "args": { "External id": 77377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77377, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77377, "pid": 5, "tid": 7, "ts": 1716454222584758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528016, "dur": 8, "args": { "External id": 77377, "cbid": 211, "correlation": 77377 } }, { "ph": "s", "id": 77377, "pid": 76337, "tid": -914061504, "ts": 1716454222528016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222584765, "dur": 5, "args": { "External id": 77385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77385, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77385, "pid": 5, "tid": 7, "ts": 1716454222584765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528046, "dur": 8, "args": { "External id": 77385, "cbid": 211, "correlation": 77385 } }, { "ph": "s", "id": 77385, "pid": 76337, "tid": -914061504, "ts": 1716454222528046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222584771, "dur": 10, "args": { "External id": 77405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77405, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77405, "pid": 5, "tid": 7, "ts": 1716454222584771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528187, "dur": 13, "args": { "External id": 77405, "cbid": 211, "correlation": 77405 } }, { "ph": "s", "id": 77405, "pid": 76337, "tid": -914061504, "ts": 1716454222528187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222584782, "dur": 4, "args": { "External id": 77417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77417, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77417, "pid": 5, "tid": 7, "ts": 1716454222584782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528210, "dur": 6, "args": { "External id": 77417, "cbid": 211, "correlation": 77417 } }, { "ph": "s", "id": 77417, "pid": 76337, "tid": -914061504, "ts": 1716454222528210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584787, "dur": 7, "args": { "External id": 77420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77420, "pid": 5, "tid": 7, "ts": 1716454222584787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528228, "dur": 7, "args": { "External id": 77420, "cbid": 211, "correlation": 77420 } }, { "ph": "s", "id": 77420, "pid": 76337, "tid": -914061504, "ts": 1716454222528228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222528287, "dur": 0, "args": { "External id": 77431, "cbid": 317, "correlation": 77431 } }, { "ph": "f", "id": 77431, "pid": 76337, "tid": -914061504, "ts": 1716454222528287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222528288, "dur": 0, "args": { "External id": 77432, "cbid": 203, "correlation": 77432 } }, { "ph": "f", "id": 77432, "pid": 76337, "tid": -914061504, "ts": 1716454222528288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222528289, "dur": 0, "args": { "External id": 77433, "cbid": 205, "correlation": 77433 } }, { "ph": "f", "id": 77433, "pid": 76337, "tid": -914061504, "ts": 1716454222528289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584795, "dur": 5, "args": { "External id": 77437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77437, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77437, "pid": 5, "tid": 7, "ts": 1716454222584795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528306, "dur": 11, "args": { "External id": 77437, "cbid": 211, "correlation": 77437 } }, { "ph": "s", "id": 77437, "pid": 76337, "tid": -914061504, "ts": 1716454222528306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222584801, "dur": 38, "args": { "External id": 77439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77439, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 77439, "pid": 5, "tid": 7, "ts": 1716454222584801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528325, "dur": 9, "args": { "External id": 77439, "cbid": 211, "correlation": 77439 } }, { "ph": "s", "id": 77439, "pid": 76337, "tid": -914061504, "ts": 1716454222528325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222584840, "dur": 5, "args": { "External id": 77441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77441, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77441, "pid": 5, "tid": 7, "ts": 1716454222584840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528338, "dur": 5, "args": { "External id": 77441, "cbid": 211, "correlation": 77441 } }, { "ph": "s", "id": 77441, "pid": 76337, "tid": -914061504, "ts": 1716454222528338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222584846, "dur": 6, "args": { "External id": 77447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77447, "pid": 5, "tid": 7, "ts": 1716454222584846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528365, "dur": 8, "args": { "External id": 77447, "cbid": 211, "correlation": 77447 } }, { "ph": "s", "id": 77447, "pid": 76337, "tid": -914061504, "ts": 1716454222528365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222584854, "dur": 21, "args": { "External id": 77456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77456, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77456, "pid": 5, "tid": 7, "ts": 1716454222584854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528448, "dur": 14, "args": { "External id": 77456, "cbid": 211, "correlation": 77456 } }, { "ph": "s", "id": 77456, "pid": 76337, "tid": -914061504, "ts": 1716454222528448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222584876, "dur": 11, "args": { "External id": 77478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77478, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 77478, "pid": 5, "tid": 7, "ts": 1716454222584876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528506, "dur": 10, "args": { "External id": 77478, "cbid": 211, "correlation": 77478 } }, { "ph": "s", "id": 77478, "pid": 76337, "tid": -914061504, "ts": 1716454222528506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528598, "dur": 2, "args": { "External id": 77489, "cbid": 251, "correlation": 77489 } }, { "ph": "f", "id": 77489, "pid": 76337, "tid": -914061504, "ts": 1716454222528598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528602, "dur": 0, "args": { "External id": 77490, "cbid": 251, "correlation": 77490 } }, { "ph": "f", "id": 77490, "pid": 76337, "tid": -914061504, "ts": 1716454222528602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222584888, "dur": 55, "args": { "External id": 77491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77491, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 77491, "pid": 5, "tid": 7, "ts": 1716454222584888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528606, "dur": 13, "args": { "External id": 77491, "cbid": 211, "correlation": 77491 } }, { "ph": "s", "id": 77491, "pid": 76337, "tid": -914061504, "ts": 1716454222528606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528675, "dur": 1, "args": { "External id": 77502, "cbid": 251, "correlation": 77502 } }, { "ph": "f", "id": 77502, "pid": 76337, "tid": -914061504, "ts": 1716454222528675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528678, "dur": 0, "args": { "External id": 77503, "cbid": 251, "correlation": 77503 } }, { "ph": "f", "id": 77503, "pid": 76337, "tid": -914061504, "ts": 1716454222528678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222584944, "dur": 53, "args": { "External id": 77504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77504, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 77504, "pid": 5, "tid": 7, "ts": 1716454222584944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528680, "dur": 11, "args": { "External id": 77504, "cbid": 211, "correlation": 77504 } }, { "ph": "s", "id": 77504, "pid": 76337, "tid": -914061504, "ts": 1716454222528680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528745, "dur": 1, "args": { "External id": 77515, "cbid": 251, "correlation": 77515 } }, { "ph": "f", "id": 77515, "pid": 76337, "tid": -914061504, "ts": 1716454222528745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528749, "dur": 0, "args": { "External id": 77516, "cbid": 251, "correlation": 77516 } }, { "ph": "f", "id": 77516, "pid": 76337, "tid": -914061504, "ts": 1716454222528749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222584999, "dur": 54, "args": { "External id": 77517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77517, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 77517, "pid": 5, "tid": 7, "ts": 1716454222584999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528751, "dur": 11, "args": { "External id": 77517, "cbid": 211, "correlation": 77517 } }, { "ph": "s", "id": 77517, "pid": 76337, "tid": -914061504, "ts": 1716454222528751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222585054, "dur": 58, "args": { "External id": 77542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77542, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77542, "pid": 5, "tid": 7, "ts": 1716454222585054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528834, "dur": 13, "args": { "External id": 77542, "cbid": 211, "correlation": 77542 } }, { "ph": "s", "id": 77542, "pid": 76337, "tid": -914061504, "ts": 1716454222528834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222528933, "dur": 1, "args": { "External id": 77560, "cbid": 251, "correlation": 77560 } }, { "ph": "f", "id": 77560, "pid": 76337, "tid": -914061504, "ts": 1716454222528933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222585113, "dur": 64, "args": { "External id": 77562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77562, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 77562, "pid": 5, "tid": 7, "ts": 1716454222585113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222528939, "dur": 13, "args": { "External id": 77562, "cbid": 211, "correlation": 77562 } }, { "ph": "s", "id": 77562, "pid": 76337, "tid": -914061504, "ts": 1716454222528939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585178, "dur": 6, "args": { "External id": 77570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77570, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77570, "pid": 5, "tid": 7, "ts": 1716454222585178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529019, "dur": 12, "args": { "External id": 77570, "cbid": 211, "correlation": 77570 } }, { "ph": "s", "id": 77570, "pid": 76337, "tid": -914061504, "ts": 1716454222529019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585186, "dur": 7, "args": { "External id": 77578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77578, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77578, "pid": 5, "tid": 7, "ts": 1716454222585186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529057, "dur": 9, "args": { "External id": 77578, "cbid": 211, "correlation": 77578 } }, { "ph": "s", "id": 77578, "pid": 76337, "tid": -914061504, "ts": 1716454222529057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585194, "dur": 8, "args": { "External id": 77589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77589, "pid": 5, "tid": 7, "ts": 1716454222585194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529131, "dur": 13, "args": { "External id": 77589, "cbid": 211, "correlation": 77589 } }, { "ph": "s", "id": 77589, "pid": 76337, "tid": -914061504, "ts": 1716454222529131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222585203, "dur": 9, "args": { "External id": 77611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77611, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 77611, "pid": 5, "tid": 7, "ts": 1716454222585203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529163, "dur": 8, "args": { "External id": 77611, "cbid": 211, "correlation": 77611 } }, { "ph": "s", "id": 77611, "pid": 76337, "tid": -914061504, "ts": 1716454222529163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529248, "dur": 2, "args": { "External id": 77622, "cbid": 251, "correlation": 77622 } }, { "ph": "f", "id": 77622, "pid": 76337, "tid": -914061504, "ts": 1716454222529248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222585214, "dur": 1, "args": { "External id": 77623, "device": 5, "context": 1, "stream": 7, "correlation": 77623, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 77623, "pid": 5, "tid": 7, "ts": 1716454222585214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222529253, "dur": 11, "args": { "External id": 77623, "cbid": 51, "correlation": 77623 } }, { "ph": "s", "id": 77623, "pid": 76337, "tid": -914061504, "ts": 1716454222529253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222585218, "dur": 37, "args": { "External id": 77624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77624, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 77624, "pid": 5, "tid": 7, "ts": 1716454222585218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529266, "dur": 13, "args": { "External id": 77624, "cbid": 211, "correlation": 77624 } }, { "ph": "s", "id": 77624, "pid": 76337, "tid": -914061504, "ts": 1716454222529266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529337, "dur": 1, "args": { "External id": 77635, "cbid": 251, "correlation": 77635 } }, { "ph": "f", "id": 77635, "pid": 76337, "tid": -914061504, "ts": 1716454222529337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529341, "dur": 0, "args": { "External id": 77636, "cbid": 251, "correlation": 77636 } }, { "ph": "f", "id": 77636, "pid": 76337, "tid": -914061504, "ts": 1716454222529341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222585256, "dur": 12, "args": { "External id": 77637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77637, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77637, "pid": 5, "tid": 7, "ts": 1716454222585256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529343, "dur": 12, "args": { "External id": 77637, "cbid": 211, "correlation": 77637 } }, { "ph": "s", "id": 77637, "pid": 76337, "tid": -914061504, "ts": 1716454222529343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222585269, "dur": 5, "args": { "External id": 77639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77639, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77639, "pid": 5, "tid": 7, "ts": 1716454222585269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529356, "dur": 5, "args": { "External id": 77639, "cbid": 211, "correlation": 77639 } }, { "ph": "s", "id": 77639, "pid": 76337, "tid": -914061504, "ts": 1716454222529356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529413, "dur": 1, "args": { "External id": 77650, "cbid": 251, "correlation": 77650 } }, { "ph": "f", "id": 77650, "pid": 76337, "tid": -914061504, "ts": 1716454222529413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529416, "dur": 0, "args": { "External id": 77651, "cbid": 251, "correlation": 77651 } }, { "ph": "f", "id": 77651, "pid": 76337, "tid": -914061504, "ts": 1716454222529416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222585276, "dur": 8, "args": { "External id": 77652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77652, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77652, "pid": 5, "tid": 7, "ts": 1716454222585276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529417, "dur": 12, "args": { "External id": 77652, "cbid": 211, "correlation": 77652 } }, { "ph": "s", "id": 77652, "pid": 76337, "tid": -914061504, "ts": 1716454222529417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222585285, "dur": 3, "args": { "External id": 77654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77654, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77654, "pid": 5, "tid": 7, "ts": 1716454222585285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529431, "dur": 5, "args": { "External id": 77654, "cbid": 211, "correlation": 77654 } }, { "ph": "s", "id": 77654, "pid": 76337, "tid": -914061504, "ts": 1716454222529431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222585290, "dur": 20, "args": { "External id": 77679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77679, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 77679, "pid": 5, "tid": 7, "ts": 1716454222585290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529508, "dur": 13, "args": { "External id": 77679, "cbid": 211, "correlation": 77679 } }, { "ph": "s", "id": 77679, "pid": 76337, "tid": -914061504, "ts": 1716454222529508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529609, "dur": 2, "args": { "External id": 77697, "cbid": 251, "correlation": 77697 } }, { "ph": "f", "id": 77697, "pid": 76337, "tid": -914061504, "ts": 1716454222529609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222585312, "dur": 1, "args": { "External id": 77699, "device": 5, "context": 1, "stream": 7, "correlation": 77699, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 77699, "pid": 5, "tid": 7, "ts": 1716454222585312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222529615, "dur": 10, "args": { "External id": 77699, "cbid": 51, "correlation": 77699 } }, { "ph": "s", "id": 77699, "pid": 76337, "tid": -914061504, "ts": 1716454222529615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222585315, "dur": 38, "args": { "External id": 77700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77700, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 77700, "pid": 5, "tid": 7, "ts": 1716454222585315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529626, "dur": 13, "args": { "External id": 77700, "cbid": 211, "correlation": 77700 } }, { "ph": "s", "id": 77700, "pid": 76337, "tid": -914061504, "ts": 1716454222529626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585354, "dur": 4, "args": { "External id": 77708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77708, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77708, "pid": 5, "tid": 7, "ts": 1716454222585354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529698, "dur": 12, "args": { "External id": 77708, "cbid": 211, "correlation": 77708 } }, { "ph": "s", "id": 77708, "pid": 76337, "tid": -914061504, "ts": 1716454222529698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585360, "dur": 8, "args": { "External id": 77716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77716, "pid": 5, "tid": 7, "ts": 1716454222585360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529739, "dur": 9, "args": { "External id": 77716, "cbid": 211, "correlation": 77716 } }, { "ph": "s", "id": 77716, "pid": 76337, "tid": -914061504, "ts": 1716454222529739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222585369, "dur": 8, "args": { "External id": 77738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77738, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 77738, "pid": 5, "tid": 7, "ts": 1716454222585369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529790, "dur": 10, "args": { "External id": 77738, "cbid": 211, "correlation": 77738 } }, { "ph": "s", "id": 77738, "pid": 76337, "tid": -914061504, "ts": 1716454222529790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529880, "dur": 1, "args": { "External id": 77754, "cbid": 251, "correlation": 77754 } }, { "ph": "f", "id": 77754, "pid": 76337, "tid": -914061504, "ts": 1716454222529880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222529885, "dur": 0, "args": { "External id": 77756, "cbid": 251, "correlation": 77756 } }, { "ph": "f", "id": 77756, "pid": 76337, "tid": -914061504, "ts": 1716454222529885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222585379, "dur": 193, "args": { "External id": 77757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77757, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77757, "pid": 5, "tid": 7, "ts": 1716454222585379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529887, "dur": 13, "args": { "External id": 77757, "cbid": 211, "correlation": 77757 } }, { "ph": "s", "id": 77757, "pid": 76337, "tid": -914061504, "ts": 1716454222529887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585574, "dur": 21, "args": { "External id": 77765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77765, "pid": 5, "tid": 7, "ts": 1716454222585574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529952, "dur": 12, "args": { "External id": 77765, "cbid": 211, "correlation": 77765 } }, { "ph": "s", "id": 77765, "pid": 76337, "tid": -914061504, "ts": 1716454222529952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585596, "dur": 20, "args": { "External id": 77773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77773, "pid": 5, "tid": 7, "ts": 1716454222585596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222529991, "dur": 10, "args": { "External id": 77773, "cbid": 211, "correlation": 77773 } }, { "ph": "s", "id": 77773, "pid": 76337, "tid": -914061504, "ts": 1716454222529991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222530075, "dur": 1, "args": { "External id": 77789, "cbid": 251, "correlation": 77789 } }, { "ph": "f", "id": 77789, "pid": 76337, "tid": -914061504, "ts": 1716454222530075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222585618, "dur": 1, "args": { "External id": 77791, "device": 5, "context": 1, "stream": 7, "correlation": 77791, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 77791, "pid": 5, "tid": 7, "ts": 1716454222585618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222530080, "dur": 8, "args": { "External id": 77791, "cbid": 51, "correlation": 77791 } }, { "ph": "s", "id": 77791, "pid": 76337, "tid": -914061504, "ts": 1716454222530080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222585622, "dur": 110, "args": { "External id": 77792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77792, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 77792, "pid": 5, "tid": 7, "ts": 1716454222585622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530090, "dur": 12, "args": { "External id": 77792, "cbid": 211, "correlation": 77792 } }, { "ph": "s", "id": 77792, "pid": 76337, "tid": -914061504, "ts": 1716454222530090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585733, "dur": 5, "args": { "External id": 77800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77800, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77800, "pid": 5, "tid": 7, "ts": 1716454222585733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530133, "dur": 10, "args": { "External id": 77800, "cbid": 211, "correlation": 77800 } }, { "ph": "s", "id": 77800, "pid": 76337, "tid": -914061504, "ts": 1716454222530133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585740, "dur": 10, "args": { "External id": 77811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77811, "pid": 5, "tid": 7, "ts": 1716454222585740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530202, "dur": 13, "args": { "External id": 77811, "cbid": 211, "correlation": 77811 } }, { "ph": "s", "id": 77811, "pid": 76337, "tid": -914061504, "ts": 1716454222530202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222530268, "dur": 0, "args": { "External id": 77823, "cbid": 317, "correlation": 77823 } }, { "ph": "f", "id": 77823, "pid": 76337, "tid": -914061504, "ts": 1716454222530268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222530269, "dur": 0, "args": { "External id": 77824, "cbid": 203, "correlation": 77824 } }, { "ph": "f", "id": 77824, "pid": 76337, "tid": -914061504, "ts": 1716454222530269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222530270, "dur": 0, "args": { "External id": 77825, "cbid": 205, "correlation": 77825 } }, { "ph": "f", "id": 77825, "pid": 76337, "tid": -914061504, "ts": 1716454222530270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222585751, "dur": 6, "args": { "External id": 77829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77829, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77829, "pid": 5, "tid": 7, "ts": 1716454222585751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530286, "dur": 12, "args": { "External id": 77829, "cbid": 211, "correlation": 77829 } }, { "ph": "s", "id": 77829, "pid": 76337, "tid": -914061504, "ts": 1716454222530286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222585758, "dur": 38, "args": { "External id": 77831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77831, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 77831, "pid": 5, "tid": 7, "ts": 1716454222585758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530305, "dur": 7, "args": { "External id": 77831, "cbid": 211, "correlation": 77831 } }, { "ph": "s", "id": 77831, "pid": 76337, "tid": -914061504, "ts": 1716454222530305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222585798, "dur": 6, "args": { "External id": 77833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77833, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77833, "pid": 5, "tid": 7, "ts": 1716454222585798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530316, "dur": 5, "args": { "External id": 77833, "cbid": 211, "correlation": 77833 } }, { "ph": "s", "id": 77833, "pid": 76337, "tid": -914061504, "ts": 1716454222530316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585805, "dur": 7, "args": { "External id": 77839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77839, "pid": 5, "tid": 7, "ts": 1716454222585805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530343, "dur": 8, "args": { "External id": 77839, "cbid": 211, "correlation": 77839 } }, { "ph": "s", "id": 77839, "pid": 76337, "tid": -914061504, "ts": 1716454222530343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585813, "dur": 5, "args": { "External id": 77847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77847, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77847, "pid": 5, "tid": 7, "ts": 1716454222585813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530374, "dur": 8, "args": { "External id": 77847, "cbid": 211, "correlation": 77847 } }, { "ph": "s", "id": 77847, "pid": 76337, "tid": -914061504, "ts": 1716454222530374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222585820, "dur": 11, "args": { "External id": 77867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77867, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77867, "pid": 5, "tid": 7, "ts": 1716454222585820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530445, "dur": 12, "args": { "External id": 77867, "cbid": 211, "correlation": 77867 } }, { "ph": "s", "id": 77867, "pid": 76337, "tid": -914061504, "ts": 1716454222530445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222585832, "dur": 4, "args": { "External id": 77879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77879, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77879, "pid": 5, "tid": 7, "ts": 1716454222585832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530467, "dur": 7, "args": { "External id": 77879, "cbid": 211, "correlation": 77879 } }, { "ph": "s", "id": 77879, "pid": 76337, "tid": -914061504, "ts": 1716454222530467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222585838, "dur": 8, "args": { "External id": 77882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77882, "pid": 5, "tid": 7, "ts": 1716454222585838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530485, "dur": 6, "args": { "External id": 77882, "cbid": 211, "correlation": 77882 } }, { "ph": "s", "id": 77882, "pid": 76337, "tid": -914061504, "ts": 1716454222530485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222585847, "dur": 5, "args": { "External id": 77891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77891, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77891, "pid": 5, "tid": 7, "ts": 1716454222585847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530524, "dur": 10, "args": { "External id": 77891, "cbid": 211, "correlation": 77891 } }, { "ph": "s", "id": 77891, "pid": 76337, "tid": -914061504, "ts": 1716454222530524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222530576, "dur": 0, "args": { "External id": 77901, "cbid": 317, "correlation": 77901 } }, { "ph": "f", "id": 77901, "pid": 76337, "tid": -914061504, "ts": 1716454222530576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222530576, "dur": 0, "args": { "External id": 77902, "cbid": 203, "correlation": 77902 } }, { "ph": "f", "id": 77902, "pid": 76337, "tid": -914061504, "ts": 1716454222530576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222530577, "dur": 0, "args": { "External id": 77903, "cbid": 205, "correlation": 77903 } }, { "ph": "f", "id": 77903, "pid": 76337, "tid": -914061504, "ts": 1716454222530577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222585854, "dur": 5, "args": { "External id": 77907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77907, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77907, "pid": 5, "tid": 7, "ts": 1716454222585854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530591, "dur": 11, "args": { "External id": 77907, "cbid": 211, "correlation": 77907 } }, { "ph": "s", "id": 77907, "pid": 76337, "tid": -914061504, "ts": 1716454222530591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222585860, "dur": 165, "args": { "External id": 77909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77909, "pid": 5, "tid": 7, "ts": 1716454222585860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530605, "dur": 5, "args": { "External id": 77909, "cbid": 211, "correlation": 77909 } }, { "ph": "s", "id": 77909, "pid": 76337, "tid": -914061504, "ts": 1716454222530605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222586028, "dur": 1, "args": { "External id": 77911, "device": 5, "context": 1, "stream": 7, "correlation": 77911, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 77911, "pid": 5, "tid": 7, "ts": 1716454222586028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222530615, "dur": 7, "args": { "External id": 77911, "cbid": 51, "correlation": 77911 } }, { "ph": "s", "id": 77911, "pid": 76337, "tid": -914061504, "ts": 1716454222530615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222586031, "dur": 275, "args": { "External id": 77912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77912, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77912, "pid": 5, "tid": 7, "ts": 1716454222586031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530624, "dur": 6, "args": { "External id": 77912, "cbid": 211, "correlation": 77912 } }, { "ph": "s", "id": 77912, "pid": 76337, "tid": -914061504, "ts": 1716454222530624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586308, "dur": 6, "args": { "External id": 77914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 77914, "pid": 5, "tid": 7, "ts": 1716454222586308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530633, "dur": 5, "args": { "External id": 77914, "cbid": 211, "correlation": 77914 } }, { "ph": "s", "id": 77914, "pid": 76337, "tid": -914061504, "ts": 1716454222530633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222586315, "dur": 6, "args": { "External id": 77920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77920, "pid": 5, "tid": 7, "ts": 1716454222586315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530661, "dur": 9, "args": { "External id": 77920, "cbid": 211, "correlation": 77920 } }, { "ph": "s", "id": 77920, "pid": 76337, "tid": -914061504, "ts": 1716454222530661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222586322, "dur": 3, "args": { "External id": 77928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77928, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 77928, "pid": 5, "tid": 7, "ts": 1716454222586322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530706, "dur": 9, "args": { "External id": 77928, "cbid": 211, "correlation": 77928 } }, { "ph": "s", "id": 77928, "pid": 76337, "tid": -914061504, "ts": 1716454222530706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222530769, "dur": 1, "args": { "External id": 77944, "cbid": 251, "correlation": 77944 } }, { "ph": "f", "id": 77944, "pid": 76337, "tid": -914061504, "ts": 1716454222530769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222530774, "dur": 0, "args": { "External id": 77946, "cbid": 251, "correlation": 77946 } }, { "ph": "f", "id": 77946, "pid": 76337, "tid": -914061504, "ts": 1716454222530774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222586327, "dur": 12, "args": { "External id": 77947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77947, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77947, "pid": 5, "tid": 7, "ts": 1716454222586327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530776, "dur": 12, "args": { "External id": 77947, "cbid": 211, "correlation": 77947 } }, { "ph": "s", "id": 77947, "pid": 76337, "tid": -914061504, "ts": 1716454222530776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222586341, "dur": 5, "args": { "External id": 77949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77949, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 77949, "pid": 5, "tid": 7, "ts": 1716454222586341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530790, "dur": 5, "args": { "External id": 77949, "cbid": 211, "correlation": 77949 } }, { "ph": "s", "id": 77949, "pid": 76337, "tid": -914061504, "ts": 1716454222530790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222586347, "dur": 6, "args": { "External id": 77959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77959, "pid": 5, "tid": 7, "ts": 1716454222586347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530847, "dur": 13, "args": { "External id": 77959, "cbid": 211, "correlation": 77959 } }, { "ph": "s", "id": 77959, "pid": 76337, "tid": -914061504, "ts": 1716454222530847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222586355, "dur": 10, "args": { "External id": 77979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77979, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 77979, "pid": 5, "tid": 7, "ts": 1716454222586355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530914, "dur": 11, "args": { "External id": 77979, "cbid": 211, "correlation": 77979 } }, { "ph": "s", "id": 77979, "pid": 76337, "tid": -914061504, "ts": 1716454222530914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222586366, "dur": 4, "args": { "External id": 77991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77991, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 77991, "pid": 5, "tid": 7, "ts": 1716454222586366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530934, "dur": 6, "args": { "External id": 77991, "cbid": 211, "correlation": 77991 } }, { "ph": "s", "id": 77991, "pid": 76337, "tid": -914061504, "ts": 1716454222530934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222586371, "dur": 7, "args": { "External id": 77994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 77994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 77994, "pid": 5, "tid": 7, "ts": 1716454222586371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222530953, "dur": 6, "args": { "External id": 77994, "cbid": 211, "correlation": 77994 } }, { "ph": "s", "id": 77994, "pid": 76337, "tid": -914061504, "ts": 1716454222530953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222586379, "dur": 5, "args": { "External id": 78003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78003, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78003, "pid": 5, "tid": 7, "ts": 1716454222586379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531001, "dur": 11, "args": { "External id": 78003, "cbid": 211, "correlation": 78003 } }, { "ph": "s", "id": 78003, "pid": 76337, "tid": -914061504, "ts": 1716454222531001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222531065, "dur": 0, "args": { "External id": 78013, "cbid": 317, "correlation": 78013 } }, { "ph": "f", "id": 78013, "pid": 76337, "tid": -914061504, "ts": 1716454222531065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222531066, "dur": 0, "args": { "External id": 78014, "cbid": 203, "correlation": 78014 } }, { "ph": "f", "id": 78014, "pid": 76337, "tid": -914061504, "ts": 1716454222531066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222531067, "dur": 0, "args": { "External id": 78015, "cbid": 205, "correlation": 78015 } }, { "ph": "f", "id": 78015, "pid": 76337, "tid": -914061504, "ts": 1716454222531067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586385, "dur": 5, "args": { "External id": 78019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78019, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78019, "pid": 5, "tid": 7, "ts": 1716454222586385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531080, "dur": 12, "args": { "External id": 78019, "cbid": 211, "correlation": 78019 } }, { "ph": "s", "id": 78019, "pid": 76337, "tid": -914061504, "ts": 1716454222531080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586392, "dur": 164, "args": { "External id": 78021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78021, "pid": 5, "tid": 7, "ts": 1716454222586392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531094, "dur": 6, "args": { "External id": 78021, "cbid": 211, "correlation": 78021 } }, { "ph": "s", "id": 78021, "pid": 76337, "tid": -914061504, "ts": 1716454222531094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222586559, "dur": 1, "args": { "External id": 78023, "device": 5, "context": 1, "stream": 7, "correlation": 78023, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 78023, "pid": 5, "tid": 7, "ts": 1716454222586559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222531106, "dur": 6, "args": { "External id": 78023, "cbid": 51, "correlation": 78023 } }, { "ph": "s", "id": 78023, "pid": 76337, "tid": -914061504, "ts": 1716454222531106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222586562, "dur": 265, "args": { "External id": 78024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78024, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78024, "pid": 5, "tid": 7, "ts": 1716454222586562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531113, "dur": 6, "args": { "External id": 78024, "cbid": 211, "correlation": 78024 } }, { "ph": "s", "id": 78024, "pid": 76337, "tid": -914061504, "ts": 1716454222531113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586828, "dur": 6, "args": { "External id": 78026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78026, "pid": 5, "tid": 7, "ts": 1716454222586828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531123, "dur": 5, "args": { "External id": 78026, "cbid": 211, "correlation": 78026 } }, { "ph": "s", "id": 78026, "pid": 76337, "tid": -914061504, "ts": 1716454222531123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222586836, "dur": 6, "args": { "External id": 78032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78032, "pid": 5, "tid": 7, "ts": 1716454222586836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531151, "dur": 9, "args": { "External id": 78032, "cbid": 211, "correlation": 78032 } }, { "ph": "s", "id": 78032, "pid": 76337, "tid": -914061504, "ts": 1716454222531151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222586844, "dur": 5, "args": { "External id": 78040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78040, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78040, "pid": 5, "tid": 7, "ts": 1716454222586844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531184, "dur": 8, "args": { "External id": 78040, "cbid": 211, "correlation": 78040 } }, { "ph": "s", "id": 78040, "pid": 76337, "tid": -914061504, "ts": 1716454222531184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222586849, "dur": 5, "args": { "External id": 78048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78048, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78048, "pid": 5, "tid": 7, "ts": 1716454222586849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531213, "dur": 8, "args": { "External id": 78048, "cbid": 211, "correlation": 78048 } }, { "ph": "s", "id": 78048, "pid": 76337, "tid": -914061504, "ts": 1716454222531213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222586855, "dur": 11, "args": { "External id": 78057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78057, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78057, "pid": 5, "tid": 7, "ts": 1716454222586855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531301, "dur": 15, "args": { "External id": 78057, "cbid": 211, "correlation": 78057 } }, { "ph": "s", "id": 78057, "pid": 76337, "tid": -914061504, "ts": 1716454222531301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222586868, "dur": 12, "args": { "External id": 78077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78077, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78077, "pid": 5, "tid": 7, "ts": 1716454222586868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531371, "dur": 11, "args": { "External id": 78077, "cbid": 211, "correlation": 78077 } }, { "ph": "s", "id": 78077, "pid": 76337, "tid": -914061504, "ts": 1716454222531371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222586881, "dur": 4, "args": { "External id": 78089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78089, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78089, "pid": 5, "tid": 7, "ts": 1716454222586881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531392, "dur": 6, "args": { "External id": 78089, "cbid": 211, "correlation": 78089 } }, { "ph": "s", "id": 78089, "pid": 76337, "tid": -914061504, "ts": 1716454222531392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222586887, "dur": 10, "args": { "External id": 78092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78092, "pid": 5, "tid": 7, "ts": 1716454222586887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531410, "dur": 8, "args": { "External id": 78092, "cbid": 211, "correlation": 78092 } }, { "ph": "s", "id": 78092, "pid": 76337, "tid": -914061504, "ts": 1716454222531410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222586899, "dur": 6, "args": { "External id": 78101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78101, "pid": 5, "tid": 7, "ts": 1716454222586899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531449, "dur": 9, "args": { "External id": 78101, "cbid": 211, "correlation": 78101 } }, { "ph": "s", "id": 78101, "pid": 76337, "tid": -914061504, "ts": 1716454222531449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222531502, "dur": 0, "args": { "External id": 78111, "cbid": 317, "correlation": 78111 } }, { "ph": "f", "id": 78111, "pid": 76337, "tid": -914061504, "ts": 1716454222531502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222531503, "dur": 0, "args": { "External id": 78112, "cbid": 203, "correlation": 78112 } }, { "ph": "f", "id": 78112, "pid": 76337, "tid": -914061504, "ts": 1716454222531503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222531503, "dur": 0, "args": { "External id": 78113, "cbid": 205, "correlation": 78113 } }, { "ph": "f", "id": 78113, "pid": 76337, "tid": -914061504, "ts": 1716454222531503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586906, "dur": 7, "args": { "External id": 78117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78117, "pid": 5, "tid": 7, "ts": 1716454222586906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531520, "dur": 12, "args": { "External id": 78117, "cbid": 211, "correlation": 78117 } }, { "ph": "s", "id": 78117, "pid": 76337, "tid": -914061504, "ts": 1716454222531520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222586914, "dur": 324, "args": { "External id": 78119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78119, "pid": 5, "tid": 7, "ts": 1716454222586914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531534, "dur": 5, "args": { "External id": 78119, "cbid": 211, "correlation": 78119 } }, { "ph": "s", "id": 78119, "pid": 76337, "tid": -914061504, "ts": 1716454222531534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222587241, "dur": 1, "args": { "External id": 78121, "device": 5, "context": 1, "stream": 7, "correlation": 78121, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 78121, "pid": 5, "tid": 7, "ts": 1716454222587241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222531545, "dur": 6, "args": { "External id": 78121, "cbid": 51, "correlation": 78121 } }, { "ph": "s", "id": 78121, "pid": 76337, "tid": -914061504, "ts": 1716454222531545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222587245, "dur": 504, "args": { "External id": 78122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78122, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78122, "pid": 5, "tid": 7, "ts": 1716454222587245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531552, "dur": 6, "args": { "External id": 78122, "cbid": 211, "correlation": 78122 } }, { "ph": "s", "id": 78122, "pid": 76337, "tid": -914061504, "ts": 1716454222531552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222587750, "dur": 6, "args": { "External id": 78124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78124, "pid": 5, "tid": 7, "ts": 1716454222587750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531563, "dur": 5, "args": { "External id": 78124, "cbid": 211, "correlation": 78124 } }, { "ph": "s", "id": 78124, "pid": 76337, "tid": -914061504, "ts": 1716454222531563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222587757, "dur": 6, "args": { "External id": 78130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78130, "pid": 5, "tid": 7, "ts": 1716454222587757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531591, "dur": 8, "args": { "External id": 78130, "cbid": 211, "correlation": 78130 } }, { "ph": "s", "id": 78130, "pid": 76337, "tid": -914061504, "ts": 1716454222531591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222587765, "dur": 3, "args": { "External id": 78138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78138, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 78138, "pid": 5, "tid": 7, "ts": 1716454222587765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531635, "dur": 9, "args": { "External id": 78138, "cbid": 211, "correlation": 78138 } }, { "ph": "s", "id": 78138, "pid": 76337, "tid": -914061504, "ts": 1716454222531635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222531697, "dur": 1, "args": { "External id": 78154, "cbid": 251, "correlation": 78154 } }, { "ph": "f", "id": 78154, "pid": 76337, "tid": -914061504, "ts": 1716454222531697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222531702, "dur": 0, "args": { "External id": 78156, "cbid": 251, "correlation": 78156 } }, { "ph": "f", "id": 78156, "pid": 76337, "tid": -914061504, "ts": 1716454222531702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222587769, "dur": 11, "args": { "External id": 78157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78157, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78157, "pid": 5, "tid": 7, "ts": 1716454222587769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531704, "dur": 11, "args": { "External id": 78157, "cbid": 211, "correlation": 78157 } }, { "ph": "s", "id": 78157, "pid": 76337, "tid": -914061504, "ts": 1716454222531704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222587782, "dur": 5, "args": { "External id": 78159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78159, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78159, "pid": 5, "tid": 7, "ts": 1716454222587782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531717, "dur": 5, "args": { "External id": 78159, "cbid": 211, "correlation": 78159 } }, { "ph": "s", "id": 78159, "pid": 76337, "tid": -914061504, "ts": 1716454222531717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222587788, "dur": 6, "args": { "External id": 78169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78169, "pid": 5, "tid": 7, "ts": 1716454222587788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531774, "dur": 12, "args": { "External id": 78169, "cbid": 211, "correlation": 78169 } }, { "ph": "s", "id": 78169, "pid": 76337, "tid": -914061504, "ts": 1716454222531774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222587795, "dur": 10, "args": { "External id": 78189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78189, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78189, "pid": 5, "tid": 7, "ts": 1716454222587795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531840, "dur": 11, "args": { "External id": 78189, "cbid": 211, "correlation": 78189 } }, { "ph": "s", "id": 78189, "pid": 76337, "tid": -914061504, "ts": 1716454222531840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222587806, "dur": 4, "args": { "External id": 78201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78201, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 78201, "pid": 5, "tid": 7, "ts": 1716454222587806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531861, "dur": 6, "args": { "External id": 78201, "cbid": 211, "correlation": 78201 } }, { "ph": "s", "id": 78201, "pid": 76337, "tid": -914061504, "ts": 1716454222531861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222587811, "dur": 7, "args": { "External id": 78204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78204, "pid": 5, "tid": 7, "ts": 1716454222587811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531879, "dur": 7, "args": { "External id": 78204, "cbid": 211, "correlation": 78204 } }, { "ph": "s", "id": 78204, "pid": 76337, "tid": -914061504, "ts": 1716454222531879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222587819, "dur": 5, "args": { "External id": 78213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78213, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78213, "pid": 5, "tid": 7, "ts": 1716454222587819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222531919, "dur": 10, "args": { "External id": 78213, "cbid": 211, "correlation": 78213 } }, { "ph": "s", "id": 78213, "pid": 76337, "tid": -914061504, "ts": 1716454222531919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222531988, "dur": 0, "args": { "External id": 78223, "cbid": 317, "correlation": 78223 } }, { "ph": "f", "id": 78223, "pid": 76337, "tid": -914061504, "ts": 1716454222531988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222531989, "dur": 0, "args": { "External id": 78224, "cbid": 203, "correlation": 78224 } }, { "ph": "f", "id": 78224, "pid": 76337, "tid": -914061504, "ts": 1716454222531989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222531990, "dur": 0, "args": { "External id": 78225, "cbid": 205, "correlation": 78225 } }, { "ph": "f", "id": 78225, "pid": 76337, "tid": -914061504, "ts": 1716454222531990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222587825, "dur": 5, "args": { "External id": 78229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78229, "pid": 5, "tid": 7, "ts": 1716454222587825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532004, "dur": 12, "args": { "External id": 78229, "cbid": 211, "correlation": 78229 } }, { "ph": "s", "id": 78229, "pid": 76337, "tid": -914061504, "ts": 1716454222532004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222587832, "dur": 166, "args": { "External id": 78231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78231, "pid": 5, "tid": 7, "ts": 1716454222587832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532018, "dur": 5, "args": { "External id": 78231, "cbid": 211, "correlation": 78231 } }, { "ph": "s", "id": 78231, "pid": 76337, "tid": -914061504, "ts": 1716454222532018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222588000, "dur": 1, "args": { "External id": 78233, "device": 5, "context": 1, "stream": 7, "correlation": 78233, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 78233, "pid": 5, "tid": 7, "ts": 1716454222588000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222532029, "dur": 7, "args": { "External id": 78233, "cbid": 51, "correlation": 78233 } }, { "ph": "s", "id": 78233, "pid": 76337, "tid": -914061504, "ts": 1716454222532029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222588003, "dur": 264, "args": { "External id": 78234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78234, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78234, "pid": 5, "tid": 7, "ts": 1716454222588003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532036, "dur": 7, "args": { "External id": 78234, "cbid": 211, "correlation": 78234 } }, { "ph": "s", "id": 78234, "pid": 76337, "tid": -914061504, "ts": 1716454222532036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222588268, "dur": 6, "args": { "External id": 78236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78236, "pid": 5, "tid": 7, "ts": 1716454222588268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532046, "dur": 5, "args": { "External id": 78236, "cbid": 211, "correlation": 78236 } }, { "ph": "s", "id": 78236, "pid": 76337, "tid": -914061504, "ts": 1716454222532046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222588276, "dur": 6, "args": { "External id": 78242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78242, "pid": 5, "tid": 7, "ts": 1716454222588276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532074, "dur": 8, "args": { "External id": 78242, "cbid": 211, "correlation": 78242 } }, { "ph": "s", "id": 78242, "pid": 76337, "tid": -914061504, "ts": 1716454222532074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222532133, "dur": 0, "args": { "External id": 78252, "cbid": 317, "correlation": 78252 } }, { "ph": "f", "id": 78252, "pid": 76337, "tid": -914061504, "ts": 1716454222532133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222532133, "dur": 0, "args": { "External id": 78253, "cbid": 203, "correlation": 78253 } }, { "ph": "f", "id": 78253, "pid": 76337, "tid": -914061504, "ts": 1716454222532133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222532134, "dur": 0, "args": { "External id": 78254, "cbid": 205, "correlation": 78254 } }, { "ph": "f", "id": 78254, "pid": 76337, "tid": -914061504, "ts": 1716454222532134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222588283, "dur": 8, "args": { "External id": 78258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78258, "pid": 5, "tid": 7, "ts": 1716454222588283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532149, "dur": 12, "args": { "External id": 78258, "cbid": 211, "correlation": 78258 } }, { "ph": "s", "id": 78258, "pid": 76337, "tid": -914061504, "ts": 1716454222532149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222588292, "dur": 3, "args": { "External id": 78260, "device": 5, "context": 1, "stream": 7, "correlation": 78260, "bytes": 4800, "memory bandwidth (GB/s)": 1.4846891432106404 } }, { "ph": "f", "id": 78260, "pid": 5, "tid": 7, "ts": 1716454222588292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222532167, "dur": 14, "args": { "External id": 78260, "cbid": 51, "correlation": 78260 } }, { "ph": "s", "id": 78260, "pid": 76337, "tid": -914061504, "ts": 1716454222532167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222588296, "dur": 98, "args": { "External id": 78261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78261, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 78261, "pid": 5, "tid": 7, "ts": 1716454222588296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532181, "dur": 7, "args": { "External id": 78261, "cbid": 211, "correlation": 78261 } }, { "ph": "s", "id": 78261, "pid": 76337, "tid": -914061504, "ts": 1716454222532181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222588396, "dur": 5, "args": { "External id": 78263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78263, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78263, "pid": 5, "tid": 7, "ts": 1716454222588396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532193, "dur": 5, "args": { "External id": 78263, "cbid": 211, "correlation": 78263 } }, { "ph": "s", "id": 78263, "pid": 76337, "tid": -914061504, "ts": 1716454222532193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222588403, "dur": 6, "args": { "External id": 78269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78269, "pid": 5, "tid": 7, "ts": 1716454222588403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532219, "dur": 8, "args": { "External id": 78269, "cbid": 211, "correlation": 78269 } }, { "ph": "s", "id": 78269, "pid": 76337, "tid": -914061504, "ts": 1716454222532219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222588411, "dur": 5, "args": { "External id": 78277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78277, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78277, "pid": 5, "tid": 7, "ts": 1716454222588411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532249, "dur": 7, "args": { "External id": 78277, "cbid": 211, "correlation": 78277 } }, { "ph": "s", "id": 78277, "pid": 76337, "tid": -914061504, "ts": 1716454222532249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222588417, "dur": 4, "args": { "External id": 78285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78285, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78285, "pid": 5, "tid": 7, "ts": 1716454222588417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532277, "dur": 8, "args": { "External id": 78285, "cbid": 211, "correlation": 78285 } }, { "ph": "s", "id": 78285, "pid": 76337, "tid": -914061504, "ts": 1716454222532277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222588423, "dur": 11, "args": { "External id": 78294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78294, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78294, "pid": 5, "tid": 7, "ts": 1716454222588423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532320, "dur": 10, "args": { "External id": 78294, "cbid": 211, "correlation": 78294 } }, { "ph": "s", "id": 78294, "pid": 76337, "tid": -914061504, "ts": 1716454222532320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222588435, "dur": 12, "args": { "External id": 78314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78314, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78314, "pid": 5, "tid": 7, "ts": 1716454222588435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532389, "dur": 11, "args": { "External id": 78314, "cbid": 211, "correlation": 78314 } }, { "ph": "s", "id": 78314, "pid": 76337, "tid": -914061504, "ts": 1716454222532389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222588449, "dur": 4, "args": { "External id": 78326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78326, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78326, "pid": 5, "tid": 7, "ts": 1716454222588449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532410, "dur": 6, "args": { "External id": 78326, "cbid": 211, "correlation": 78326 } }, { "ph": "s", "id": 78326, "pid": 76337, "tid": -914061504, "ts": 1716454222532410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222588455, "dur": 11, "args": { "External id": 78329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78329, "pid": 5, "tid": 7, "ts": 1716454222588455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532428, "dur": 7, "args": { "External id": 78329, "cbid": 211, "correlation": 78329 } }, { "ph": "s", "id": 78329, "pid": 76337, "tid": -914061504, "ts": 1716454222532428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222588467, "dur": 6, "args": { "External id": 78338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78338, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78338, "pid": 5, "tid": 7, "ts": 1716454222588467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532467, "dur": 9, "args": { "External id": 78338, "cbid": 211, "correlation": 78338 } }, { "ph": "s", "id": 78338, "pid": 76337, "tid": -914061504, "ts": 1716454222532467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222532518, "dur": 0, "args": { "External id": 78348, "cbid": 317, "correlation": 78348 } }, { "ph": "f", "id": 78348, "pid": 76337, "tid": -914061504, "ts": 1716454222532518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222532518, "dur": 0, "args": { "External id": 78349, "cbid": 203, "correlation": 78349 } }, { "ph": "f", "id": 78349, "pid": 76337, "tid": -914061504, "ts": 1716454222532518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222532519, "dur": 0, "args": { "External id": 78350, "cbid": 205, "correlation": 78350 } }, { "ph": "f", "id": 78350, "pid": 76337, "tid": -914061504, "ts": 1716454222532519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222588475, "dur": 7, "args": { "External id": 78354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78354, "pid": 5, "tid": 7, "ts": 1716454222588475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532533, "dur": 11, "args": { "External id": 78354, "cbid": 211, "correlation": 78354 } }, { "ph": "s", "id": 78354, "pid": 76337, "tid": -914061504, "ts": 1716454222532533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222588483, "dur": 326, "args": { "External id": 78356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78356, "pid": 5, "tid": 7, "ts": 1716454222588483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532547, "dur": 5, "args": { "External id": 78356, "cbid": 211, "correlation": 78356 } }, { "ph": "s", "id": 78356, "pid": 76337, "tid": -914061504, "ts": 1716454222532547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222588811, "dur": 1, "args": { "External id": 78358, "device": 5, "context": 1, "stream": 7, "correlation": 78358, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 78358, "pid": 5, "tid": 7, "ts": 1716454222588811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222532558, "dur": 6, "args": { "External id": 78358, "cbid": 51, "correlation": 78358 } }, { "ph": "s", "id": 78358, "pid": 76337, "tid": -914061504, "ts": 1716454222532558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222588814, "dur": 505, "args": { "External id": 78359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78359, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78359, "pid": 5, "tid": 7, "ts": 1716454222588814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532565, "dur": 6, "args": { "External id": 78359, "cbid": 211, "correlation": 78359 } }, { "ph": "s", "id": 78359, "pid": 76337, "tid": -914061504, "ts": 1716454222532565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589321, "dur": 6, "args": { "External id": 78361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78361, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78361, "pid": 5, "tid": 7, "ts": 1716454222589321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532576, "dur": 5, "args": { "External id": 78361, "cbid": 211, "correlation": 78361 } }, { "ph": "s", "id": 78361, "pid": 76337, "tid": -914061504, "ts": 1716454222532576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222589328, "dur": 6, "args": { "External id": 78367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78367, "pid": 5, "tid": 7, "ts": 1716454222589328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532604, "dur": 8, "args": { "External id": 78367, "cbid": 211, "correlation": 78367 } }, { "ph": "s", "id": 78367, "pid": 76337, "tid": -914061504, "ts": 1716454222532604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222589336, "dur": 3, "args": { "External id": 78375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78375, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 78375, "pid": 5, "tid": 7, "ts": 1716454222589336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532646, "dur": 9, "args": { "External id": 78375, "cbid": 211, "correlation": 78375 } }, { "ph": "s", "id": 78375, "pid": 76337, "tid": -914061504, "ts": 1716454222532646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222532709, "dur": 1, "args": { "External id": 78391, "cbid": 251, "correlation": 78391 } }, { "ph": "f", "id": 78391, "pid": 76337, "tid": -914061504, "ts": 1716454222532709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222532714, "dur": 0, "args": { "External id": 78393, "cbid": 251, "correlation": 78393 } }, { "ph": "f", "id": 78393, "pid": 76337, "tid": -914061504, "ts": 1716454222532714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222589340, "dur": 13, "args": { "External id": 78394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78394, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78394, "pid": 5, "tid": 7, "ts": 1716454222589340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532716, "dur": 11, "args": { "External id": 78394, "cbid": 211, "correlation": 78394 } }, { "ph": "s", "id": 78394, "pid": 76337, "tid": -914061504, "ts": 1716454222532716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222589354, "dur": 5, "args": { "External id": 78396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78396, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78396, "pid": 5, "tid": 7, "ts": 1716454222589354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532729, "dur": 5, "args": { "External id": 78396, "cbid": 211, "correlation": 78396 } }, { "ph": "s", "id": 78396, "pid": 76337, "tid": -914061504, "ts": 1716454222532729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222589361, "dur": 6, "args": { "External id": 78406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78406, "pid": 5, "tid": 7, "ts": 1716454222589361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532785, "dur": 11, "args": { "External id": 78406, "cbid": 211, "correlation": 78406 } }, { "ph": "s", "id": 78406, "pid": 76337, "tid": -914061504, "ts": 1716454222532785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222589368, "dur": 10, "args": { "External id": 78426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78426, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78426, "pid": 5, "tid": 7, "ts": 1716454222589368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532849, "dur": 10, "args": { "External id": 78426, "cbid": 211, "correlation": 78426 } }, { "ph": "s", "id": 78426, "pid": 76337, "tid": -914061504, "ts": 1716454222532849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222589379, "dur": 4, "args": { "External id": 78438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78438, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 78438, "pid": 5, "tid": 7, "ts": 1716454222589379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532869, "dur": 6, "args": { "External id": 78438, "cbid": 211, "correlation": 78438 } }, { "ph": "s", "id": 78438, "pid": 76337, "tid": -914061504, "ts": 1716454222532869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222589384, "dur": 7, "args": { "External id": 78441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78441, "pid": 5, "tid": 7, "ts": 1716454222589384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532888, "dur": 6, "args": { "External id": 78441, "cbid": 211, "correlation": 78441 } }, { "ph": "s", "id": 78441, "pid": 76337, "tid": -914061504, "ts": 1716454222532888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222589392, "dur": 5, "args": { "External id": 78450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78450, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78450, "pid": 5, "tid": 7, "ts": 1716454222589392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222532927, "dur": 10, "args": { "External id": 78450, "cbid": 211, "correlation": 78450 } }, { "ph": "s", "id": 78450, "pid": 76337, "tid": -914061504, "ts": 1716454222532927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222532999, "dur": 0, "args": { "External id": 78460, "cbid": 317, "correlation": 78460 } }, { "ph": "f", "id": 78460, "pid": 76337, "tid": -914061504, "ts": 1716454222532999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222533000, "dur": 0, "args": { "External id": 78461, "cbid": 203, "correlation": 78461 } }, { "ph": "f", "id": 78461, "pid": 76337, "tid": -914061504, "ts": 1716454222533000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222533001, "dur": 0, "args": { "External id": 78462, "cbid": 205, "correlation": 78462 } }, { "ph": "f", "id": 78462, "pid": 76337, "tid": -914061504, "ts": 1716454222533001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589398, "dur": 5, "args": { "External id": 78466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78466, "pid": 5, "tid": 7, "ts": 1716454222589398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533014, "dur": 12, "args": { "External id": 78466, "cbid": 211, "correlation": 78466 } }, { "ph": "s", "id": 78466, "pid": 76337, "tid": -914061504, "ts": 1716454222533014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589405, "dur": 165, "args": { "External id": 78468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78468, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78468, "pid": 5, "tid": 7, "ts": 1716454222589405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533029, "dur": 5, "args": { "External id": 78468, "cbid": 211, "correlation": 78468 } }, { "ph": "s", "id": 78468, "pid": 76337, "tid": -914061504, "ts": 1716454222533029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222589572, "dur": 1, "args": { "External id": 78470, "device": 5, "context": 1, "stream": 7, "correlation": 78470, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 78470, "pid": 5, "tid": 7, "ts": 1716454222589572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222533040, "dur": 6, "args": { "External id": 78470, "cbid": 51, "correlation": 78470 } }, { "ph": "s", "id": 78470, "pid": 76337, "tid": -914061504, "ts": 1716454222533040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222589576, "dur": 264, "args": { "External id": 78471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78471, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78471, "pid": 5, "tid": 7, "ts": 1716454222589576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533047, "dur": 6, "args": { "External id": 78471, "cbid": 211, "correlation": 78471 } }, { "ph": "s", "id": 78471, "pid": 76337, "tid": -914061504, "ts": 1716454222533047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589841, "dur": 6, "args": { "External id": 78473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78473, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78473, "pid": 5, "tid": 7, "ts": 1716454222589841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533057, "dur": 5, "args": { "External id": 78473, "cbid": 211, "correlation": 78473 } }, { "ph": "s", "id": 78473, "pid": 76337, "tid": -914061504, "ts": 1716454222533057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222589848, "dur": 6, "args": { "External id": 78479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78479, "pid": 5, "tid": 7, "ts": 1716454222589848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533085, "dur": 8, "args": { "External id": 78479, "cbid": 211, "correlation": 78479 } }, { "ph": "s", "id": 78479, "pid": 76337, "tid": -914061504, "ts": 1716454222533085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222533145, "dur": 0, "args": { "External id": 78489, "cbid": 317, "correlation": 78489 } }, { "ph": "f", "id": 78489, "pid": 76337, "tid": -914061504, "ts": 1716454222533145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222533146, "dur": 0, "args": { "External id": 78490, "cbid": 203, "correlation": 78490 } }, { "ph": "f", "id": 78490, "pid": 76337, "tid": -914061504, "ts": 1716454222533146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222533147, "dur": 0, "args": { "External id": 78491, "cbid": 205, "correlation": 78491 } }, { "ph": "f", "id": 78491, "pid": 76337, "tid": -914061504, "ts": 1716454222533147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589856, "dur": 8, "args": { "External id": 78495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78495, "pid": 5, "tid": 7, "ts": 1716454222589856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533158, "dur": 11, "args": { "External id": 78495, "cbid": 211, "correlation": 78495 } }, { "ph": "s", "id": 78495, "pid": 76337, "tid": -914061504, "ts": 1716454222533158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222589866, "dur": 3, "args": { "External id": 78497, "device": 5, "context": 1, "stream": 7, "correlation": 78497, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 78497, "pid": 5, "tid": 7, "ts": 1716454222589866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222533174, "dur": 9, "args": { "External id": 78497, "cbid": 51, "correlation": 78497 } }, { "ph": "s", "id": 78497, "pid": 76337, "tid": -914061504, "ts": 1716454222533174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222589870, "dur": 96, "args": { "External id": 78498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78498, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 78498, "pid": 5, "tid": 7, "ts": 1716454222589870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533184, "dur": 7, "args": { "External id": 78498, "cbid": 211, "correlation": 78498 } }, { "ph": "s", "id": 78498, "pid": 76337, "tid": -914061504, "ts": 1716454222533184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222589968, "dur": 6, "args": { "External id": 78500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78500, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78500, "pid": 5, "tid": 7, "ts": 1716454222589968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533196, "dur": 5, "args": { "External id": 78500, "cbid": 211, "correlation": 78500 } }, { "ph": "s", "id": 78500, "pid": 76337, "tid": -914061504, "ts": 1716454222533196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222589975, "dur": 7, "args": { "External id": 78506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78506, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78506, "pid": 5, "tid": 7, "ts": 1716454222589975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533221, "dur": 8, "args": { "External id": 78506, "cbid": 211, "correlation": 78506 } }, { "ph": "s", "id": 78506, "pid": 76337, "tid": -914061504, "ts": 1716454222533221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222589983, "dur": 5, "args": { "External id": 78514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78514, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78514, "pid": 5, "tid": 7, "ts": 1716454222589983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533251, "dur": 8, "args": { "External id": 78514, "cbid": 211, "correlation": 78514 } }, { "ph": "s", "id": 78514, "pid": 76337, "tid": -914061504, "ts": 1716454222533251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222589990, "dur": 5, "args": { "External id": 78522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78522, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78522, "pid": 5, "tid": 7, "ts": 1716454222589990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533279, "dur": 8, "args": { "External id": 78522, "cbid": 211, "correlation": 78522 } }, { "ph": "s", "id": 78522, "pid": 76337, "tid": -914061504, "ts": 1716454222533279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222589995, "dur": 11, "args": { "External id": 78531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78531, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78531, "pid": 5, "tid": 7, "ts": 1716454222589995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533322, "dur": 10, "args": { "External id": 78531, "cbid": 211, "correlation": 78531 } }, { "ph": "s", "id": 78531, "pid": 76337, "tid": -914061504, "ts": 1716454222533322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222590008, "dur": 12, "args": { "External id": 78551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78551, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78551, "pid": 5, "tid": 7, "ts": 1716454222590008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533390, "dur": 12, "args": { "External id": 78551, "cbid": 211, "correlation": 78551 } }, { "ph": "s", "id": 78551, "pid": 76337, "tid": -914061504, "ts": 1716454222533390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222590022, "dur": 4, "args": { "External id": 78563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78563, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78563, "pid": 5, "tid": 7, "ts": 1716454222590022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533411, "dur": 6, "args": { "External id": 78563, "cbid": 211, "correlation": 78563 } }, { "ph": "s", "id": 78563, "pid": 76337, "tid": -914061504, "ts": 1716454222533411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222590027, "dur": 11, "args": { "External id": 78566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78566, "pid": 5, "tid": 7, "ts": 1716454222590027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533429, "dur": 8, "args": { "External id": 78566, "cbid": 211, "correlation": 78566 } }, { "ph": "s", "id": 78566, "pid": 76337, "tid": -914061504, "ts": 1716454222533429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222590039, "dur": 6, "args": { "External id": 78575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78575, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78575, "pid": 5, "tid": 7, "ts": 1716454222590039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533469, "dur": 9, "args": { "External id": 78575, "cbid": 211, "correlation": 78575 } }, { "ph": "s", "id": 78575, "pid": 76337, "tid": -914061504, "ts": 1716454222533469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222533519, "dur": 0, "args": { "External id": 78585, "cbid": 317, "correlation": 78585 } }, { "ph": "f", "id": 78585, "pid": 76337, "tid": -914061504, "ts": 1716454222533519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222533520, "dur": 0, "args": { "External id": 78586, "cbid": 203, "correlation": 78586 } }, { "ph": "f", "id": 78586, "pid": 76337, "tid": -914061504, "ts": 1716454222533520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222533521, "dur": 0, "args": { "External id": 78587, "cbid": 205, "correlation": 78587 } }, { "ph": "f", "id": 78587, "pid": 76337, "tid": -914061504, "ts": 1716454222533521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222590047, "dur": 7, "args": { "External id": 78591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78591, "pid": 5, "tid": 7, "ts": 1716454222590047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533534, "dur": 12, "args": { "External id": 78591, "cbid": 211, "correlation": 78591 } }, { "ph": "s", "id": 78591, "pid": 76337, "tid": -914061504, "ts": 1716454222533534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222590055, "dur": 325, "args": { "External id": 78593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78593, "pid": 5, "tid": 7, "ts": 1716454222590055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533548, "dur": 5, "args": { "External id": 78593, "cbid": 211, "correlation": 78593 } }, { "ph": "s", "id": 78593, "pid": 76337, "tid": -914061504, "ts": 1716454222533548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222590383, "dur": 1, "args": { "External id": 78595, "device": 5, "context": 1, "stream": 7, "correlation": 78595, "bytes": 240, "memory bandwidth (GB/s)": 0.15315890236119975 } }, { "ph": "f", "id": 78595, "pid": 5, "tid": 7, "ts": 1716454222590383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222533559, "dur": 6, "args": { "External id": 78595, "cbid": 51, "correlation": 78595 } }, { "ph": "s", "id": 78595, "pid": 76337, "tid": -914061504, "ts": 1716454222533559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222590386, "dur": 505, "args": { "External id": 78596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78596, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78596, "pid": 5, "tid": 7, "ts": 1716454222590386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533566, "dur": 6, "args": { "External id": 78596, "cbid": 211, "correlation": 78596 } }, { "ph": "s", "id": 78596, "pid": 76337, "tid": -914061504, "ts": 1716454222533566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222590893, "dur": 6, "args": { "External id": 78598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78598, "pid": 5, "tid": 7, "ts": 1716454222590893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533575, "dur": 5, "args": { "External id": 78598, "cbid": 211, "correlation": 78598 } }, { "ph": "s", "id": 78598, "pid": 76337, "tid": -914061504, "ts": 1716454222533575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222590900, "dur": 6, "args": { "External id": 78604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78604, "pid": 5, "tid": 7, "ts": 1716454222590900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533604, "dur": 8, "args": { "External id": 78604, "cbid": 211, "correlation": 78604 } }, { "ph": "s", "id": 78604, "pid": 76337, "tid": -914061504, "ts": 1716454222533604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222590908, "dur": 3, "args": { "External id": 78612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78612, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 78612, "pid": 5, "tid": 7, "ts": 1716454222590908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533646, "dur": 10, "args": { "External id": 78612, "cbid": 211, "correlation": 78612 } }, { "ph": "s", "id": 78612, "pid": 76337, "tid": -914061504, "ts": 1716454222533646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222533709, "dur": 1, "args": { "External id": 78628, "cbid": 251, "correlation": 78628 } }, { "ph": "f", "id": 78628, "pid": 76337, "tid": -914061504, "ts": 1716454222533709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222533714, "dur": 0, "args": { "External id": 78630, "cbid": 251, "correlation": 78630 } }, { "ph": "f", "id": 78630, "pid": 76337, "tid": -914061504, "ts": 1716454222533714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222590912, "dur": 13, "args": { "External id": 78631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78631, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78631, "pid": 5, "tid": 7, "ts": 1716454222590912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533716, "dur": 11, "args": { "External id": 78631, "cbid": 211, "correlation": 78631 } }, { "ph": "s", "id": 78631, "pid": 76337, "tid": -914061504, "ts": 1716454222533716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222590926, "dur": 5, "args": { "External id": 78633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78633, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78633, "pid": 5, "tid": 7, "ts": 1716454222590926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533728, "dur": 5, "args": { "External id": 78633, "cbid": 211, "correlation": 78633 } }, { "ph": "s", "id": 78633, "pid": 76337, "tid": -914061504, "ts": 1716454222533728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222590933, "dur": 6, "args": { "External id": 78643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78643, "pid": 5, "tid": 7, "ts": 1716454222590933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533785, "dur": 11, "args": { "External id": 78643, "cbid": 211, "correlation": 78643 } }, { "ph": "s", "id": 78643, "pid": 76337, "tid": -914061504, "ts": 1716454222533785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222590940, "dur": 10, "args": { "External id": 78663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78663, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78663, "pid": 5, "tid": 7, "ts": 1716454222590940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533849, "dur": 11, "args": { "External id": 78663, "cbid": 211, "correlation": 78663 } }, { "ph": "s", "id": 78663, "pid": 76337, "tid": -914061504, "ts": 1716454222533849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222590951, "dur": 4, "args": { "External id": 78675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78675, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 78675, "pid": 5, "tid": 7, "ts": 1716454222590951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533871, "dur": 6, "args": { "External id": 78675, "cbid": 211, "correlation": 78675 } }, { "ph": "s", "id": 78675, "pid": 76337, "tid": -914061504, "ts": 1716454222533871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222590956, "dur": 7, "args": { "External id": 78678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78678, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78678, "pid": 5, "tid": 7, "ts": 1716454222590956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533889, "dur": 6, "args": { "External id": 78678, "cbid": 211, "correlation": 78678 } }, { "ph": "s", "id": 78678, "pid": 76337, "tid": -914061504, "ts": 1716454222533889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222590964, "dur": 5, "args": { "External id": 78687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78687, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78687, "pid": 5, "tid": 7, "ts": 1716454222590964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222533928, "dur": 10, "args": { "External id": 78687, "cbid": 211, "correlation": 78687 } }, { "ph": "s", "id": 78687, "pid": 76337, "tid": -914061504, "ts": 1716454222533928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222533999, "dur": 0, "args": { "External id": 78697, "cbid": 317, "correlation": 78697 } }, { "ph": "f", "id": 78697, "pid": 76337, "tid": -914061504, "ts": 1716454222533999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222533999, "dur": 0, "args": { "External id": 78698, "cbid": 203, "correlation": 78698 } }, { "ph": "f", "id": 78698, "pid": 76337, "tid": -914061504, "ts": 1716454222533999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222534000, "dur": 0, "args": { "External id": 78699, "cbid": 205, "correlation": 78699 } }, { "ph": "f", "id": 78699, "pid": 76337, "tid": -914061504, "ts": 1716454222534000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222590970, "dur": 5, "args": { "External id": 78703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78703, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78703, "pid": 5, "tid": 7, "ts": 1716454222590970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534013, "dur": 13, "args": { "External id": 78703, "cbid": 211, "correlation": 78703 } }, { "ph": "s", "id": 78703, "pid": 76337, "tid": -914061504, "ts": 1716454222534013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222590977, "dur": 167, "args": { "External id": 78705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78705, "pid": 5, "tid": 7, "ts": 1716454222590977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534029, "dur": 5, "args": { "External id": 78705, "cbid": 211, "correlation": 78705 } }, { "ph": "s", "id": 78705, "pid": 76337, "tid": -914061504, "ts": 1716454222534029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222591146, "dur": 1, "args": { "External id": 78707, "device": 5, "context": 1, "stream": 7, "correlation": 78707, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 78707, "pid": 5, "tid": 7, "ts": 1716454222591146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222534040, "dur": 7, "args": { "External id": 78707, "cbid": 51, "correlation": 78707 } }, { "ph": "s", "id": 78707, "pid": 76337, "tid": -914061504, "ts": 1716454222534040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222591149, "dur": 264, "args": { "External id": 78708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78708, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78708, "pid": 5, "tid": 7, "ts": 1716454222591149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534047, "dur": 6, "args": { "External id": 78708, "cbid": 211, "correlation": 78708 } }, { "ph": "s", "id": 78708, "pid": 76337, "tid": -914061504, "ts": 1716454222534047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222591415, "dur": 6, "args": { "External id": 78710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78710, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78710, "pid": 5, "tid": 7, "ts": 1716454222591415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534057, "dur": 5, "args": { "External id": 78710, "cbid": 211, "correlation": 78710 } }, { "ph": "s", "id": 78710, "pid": 76337, "tid": -914061504, "ts": 1716454222534057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222591422, "dur": 6, "args": { "External id": 78716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78716, "pid": 5, "tid": 7, "ts": 1716454222591422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534086, "dur": 8, "args": { "External id": 78716, "cbid": 211, "correlation": 78716 } }, { "ph": "s", "id": 78716, "pid": 76337, "tid": -914061504, "ts": 1716454222534086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222534145, "dur": 0, "args": { "External id": 78726, "cbid": 317, "correlation": 78726 } }, { "ph": "f", "id": 78726, "pid": 76337, "tid": -914061504, "ts": 1716454222534145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222534145, "dur": 0, "args": { "External id": 78727, "cbid": 203, "correlation": 78727 } }, { "ph": "f", "id": 78727, "pid": 76337, "tid": -914061504, "ts": 1716454222534145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222534146, "dur": 0, "args": { "External id": 78728, "cbid": 205, "correlation": 78728 } }, { "ph": "f", "id": 78728, "pid": 76337, "tid": -914061504, "ts": 1716454222534146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222591429, "dur": 8, "args": { "External id": 78732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78732, "pid": 5, "tid": 7, "ts": 1716454222591429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534157, "dur": 11, "args": { "External id": 78732, "cbid": 211, "correlation": 78732 } }, { "ph": "s", "id": 78732, "pid": 76337, "tid": -914061504, "ts": 1716454222534157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222591439, "dur": 3, "args": { "External id": 78734, "device": 5, "context": 1, "stream": 7, "correlation": 78734, "bytes": 4800, "memory bandwidth (GB/s)": 1.4285714285714286 } }, { "ph": "f", "id": 78734, "pid": 5, "tid": 7, "ts": 1716454222591439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222534174, "dur": 10, "args": { "External id": 78734, "cbid": 51, "correlation": 78734 } }, { "ph": "s", "id": 78734, "pid": 76337, "tid": -914061504, "ts": 1716454222534174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222591443, "dur": 96, "args": { "External id": 78735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78735, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 78735, "pid": 5, "tid": 7, "ts": 1716454222591443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534184, "dur": 6, "args": { "External id": 78735, "cbid": 211, "correlation": 78735 } }, { "ph": "s", "id": 78735, "pid": 76337, "tid": -914061504, "ts": 1716454222534184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222591541, "dur": 6, "args": { "External id": 78737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78737, "pid": 5, "tid": 7, "ts": 1716454222591541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534194, "dur": 5, "args": { "External id": 78737, "cbid": 211, "correlation": 78737 } }, { "ph": "s", "id": 78737, "pid": 76337, "tid": -914061504, "ts": 1716454222534194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222591548, "dur": 6, "args": { "External id": 78743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78743, "pid": 5, "tid": 7, "ts": 1716454222591548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534220, "dur": 9, "args": { "External id": 78743, "cbid": 211, "correlation": 78743 } }, { "ph": "s", "id": 78743, "pid": 76337, "tid": -914061504, "ts": 1716454222534220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222591555, "dur": 5, "args": { "External id": 78751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78751, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78751, "pid": 5, "tid": 7, "ts": 1716454222591555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534250, "dur": 8, "args": { "External id": 78751, "cbid": 211, "correlation": 78751 } }, { "ph": "s", "id": 78751, "pid": 76337, "tid": -914061504, "ts": 1716454222534250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222591562, "dur": 4, "args": { "External id": 78759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78759, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 78759, "pid": 5, "tid": 7, "ts": 1716454222591562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534278, "dur": 8, "args": { "External id": 78759, "cbid": 211, "correlation": 78759 } }, { "ph": "s", "id": 78759, "pid": 76337, "tid": -914061504, "ts": 1716454222534278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222591567, "dur": 15, "args": { "External id": 78770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78770, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78770, "pid": 5, "tid": 7, "ts": 1716454222591567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534358, "dur": 12, "args": { "External id": 78770, "cbid": 211, "correlation": 78770 } }, { "ph": "s", "id": 78770, "pid": 76337, "tid": -914061504, "ts": 1716454222534358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222534415, "dur": 0, "args": { "External id": 78780, "cbid": 317, "correlation": 78780 } }, { "ph": "f", "id": 78780, "pid": 76337, "tid": -914061504, "ts": 1716454222534415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222534416, "dur": 0, "args": { "External id": 78781, "cbid": 203, "correlation": 78781 } }, { "ph": "f", "id": 78781, "pid": 76337, "tid": -914061504, "ts": 1716454222534416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222534416, "dur": 0, "args": { "External id": 78782, "cbid": 205, "correlation": 78782 } }, { "ph": "f", "id": 78782, "pid": 76337, "tid": -914061504, "ts": 1716454222534416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222591583, "dur": 9, "args": { "External id": 78786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78786, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78786, "pid": 5, "tid": 7, "ts": 1716454222591583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534430, "dur": 11, "args": { "External id": 78786, "cbid": 211, "correlation": 78786 } }, { "ph": "s", "id": 78786, "pid": 76337, "tid": -914061504, "ts": 1716454222534430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222591593, "dur": 165, "args": { "External id": 78788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78788, "pid": 5, "tid": 7, "ts": 1716454222591593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534444, "dur": 5, "args": { "External id": 78788, "cbid": 211, "correlation": 78788 } }, { "ph": "s", "id": 78788, "pid": 76337, "tid": -914061504, "ts": 1716454222534444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222591761, "dur": 1, "args": { "External id": 78790, "device": 5, "context": 1, "stream": 7, "correlation": 78790, "bytes": 960, "memory bandwidth (GB/s)": 0.5772699939867709 } }, { "ph": "f", "id": 78790, "pid": 5, "tid": 7, "ts": 1716454222591761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222534454, "dur": 6, "args": { "External id": 78790, "cbid": 51, "correlation": 78790 } }, { "ph": "s", "id": 78790, "pid": 76337, "tid": -914061504, "ts": 1716454222534454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222591764, "dur": 659, "args": { "External id": 78791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78791, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78791, "pid": 5, "tid": 7, "ts": 1716454222591764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534461, "dur": 6, "args": { "External id": 78791, "cbid": 211, "correlation": 78791 } }, { "ph": "s", "id": 78791, "pid": 76337, "tid": -914061504, "ts": 1716454222534461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222592425, "dur": 12, "args": { "External id": 78793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78793, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78793, "pid": 5, "tid": 7, "ts": 1716454222592425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534471, "dur": 7, "args": { "External id": 78793, "cbid": 211, "correlation": 78793 } }, { "ph": "s", "id": 78793, "pid": 76337, "tid": -914061504, "ts": 1716454222534471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222592438, "dur": 15, "args": { "External id": 78799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78799, "pid": 5, "tid": 7, "ts": 1716454222592438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534501, "dur": 8, "args": { "External id": 78799, "cbid": 211, "correlation": 78799 } }, { "ph": "s", "id": 78799, "pid": 76337, "tid": -914061504, "ts": 1716454222534501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222592454, "dur": 32, "args": { "External id": 78808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78808, "pid": 5, "tid": 7, "ts": 1716454222592454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534592, "dur": 12, "args": { "External id": 78808, "cbid": 211, "correlation": 78808 } }, { "ph": "s", "id": 78808, "pid": 76337, "tid": -914061504, "ts": 1716454222534592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222592487, "dur": 31, "args": { "External id": 78828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78828, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78828, "pid": 5, "tid": 7, "ts": 1716454222592487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534660, "dur": 10, "args": { "External id": 78828, "cbid": 211, "correlation": 78828 } }, { "ph": "s", "id": 78828, "pid": 76337, "tid": -914061504, "ts": 1716454222534660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222592519, "dur": 5, "args": { "External id": 78840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78840, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78840, "pid": 5, "tid": 7, "ts": 1716454222592519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534679, "dur": 6, "args": { "External id": 78840, "cbid": 211, "correlation": 78840 } }, { "ph": "s", "id": 78840, "pid": 76337, "tid": -914061504, "ts": 1716454222534679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222592525, "dur": 30, "args": { "External id": 78843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78843, "pid": 5, "tid": 7, "ts": 1716454222592525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534698, "dur": 7, "args": { "External id": 78843, "cbid": 211, "correlation": 78843 } }, { "ph": "s", "id": 78843, "pid": 76337, "tid": -914061504, "ts": 1716454222534698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222592556, "dur": 20, "args": { "External id": 78852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78852, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78852, "pid": 5, "tid": 7, "ts": 1716454222592556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534736, "dur": 9, "args": { "External id": 78852, "cbid": 211, "correlation": 78852 } }, { "ph": "s", "id": 78852, "pid": 76337, "tid": -914061504, "ts": 1716454222534736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222534788, "dur": 0, "args": { "External id": 78862, "cbid": 317, "correlation": 78862 } }, { "ph": "f", "id": 78862, "pid": 76337, "tid": -914061504, "ts": 1716454222534788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222534789, "dur": 0, "args": { "External id": 78863, "cbid": 203, "correlation": 78863 } }, { "ph": "f", "id": 78863, "pid": 76337, "tid": -914061504, "ts": 1716454222534789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222534790, "dur": 0, "args": { "External id": 78864, "cbid": 205, "correlation": 78864 } }, { "ph": "f", "id": 78864, "pid": 76337, "tid": -914061504, "ts": 1716454222534790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222592577, "dur": 21, "args": { "External id": 78868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78868, "pid": 5, "tid": 7, "ts": 1716454222592577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534807, "dur": 11, "args": { "External id": 78868, "cbid": 211, "correlation": 78868 } }, { "ph": "s", "id": 78868, "pid": 76337, "tid": -914061504, "ts": 1716454222534807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222592600, "dur": 325, "args": { "External id": 78870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78870, "pid": 5, "tid": 7, "ts": 1716454222592600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534820, "dur": 6, "args": { "External id": 78870, "cbid": 211, "correlation": 78870 } }, { "ph": "s", "id": 78870, "pid": 76337, "tid": -914061504, "ts": 1716454222534820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222592927, "dur": 1, "args": { "External id": 78872, "device": 5, "context": 1, "stream": 7, "correlation": 78872, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 78872, "pid": 5, "tid": 7, "ts": 1716454222592927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222534832, "dur": 6, "args": { "External id": 78872, "cbid": 51, "correlation": 78872 } }, { "ph": "s", "id": 78872, "pid": 76337, "tid": -914061504, "ts": 1716454222534832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222592931, "dur": 1261, "args": { "External id": 78873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78873, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78873, "pid": 5, "tid": 7, "ts": 1716454222592931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534839, "dur": 7, "args": { "External id": 78873, "cbid": 211, "correlation": 78873 } }, { "ph": "s", "id": 78873, "pid": 76337, "tid": -914061504, "ts": 1716454222534839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222594193, "dur": 13, "args": { "External id": 78875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78875, "pid": 5, "tid": 7, "ts": 1716454222594193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534849, "dur": 5, "args": { "External id": 78875, "cbid": 211, "correlation": 78875 } }, { "ph": "s", "id": 78875, "pid": 76337, "tid": -914061504, "ts": 1716454222534849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222594207, "dur": 16, "args": { "External id": 78881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78881, "pid": 5, "tid": 7, "ts": 1716454222594207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534877, "dur": 8, "args": { "External id": 78881, "cbid": 211, "correlation": 78881 } }, { "ph": "s", "id": 78881, "pid": 76337, "tid": -914061504, "ts": 1716454222534877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222594224, "dur": 3, "args": { "External id": 78889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78889, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 78889, "pid": 5, "tid": 7, "ts": 1716454222594224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222534922, "dur": 9, "args": { "External id": 78889, "cbid": 211, "correlation": 78889 } }, { "ph": "s", "id": 78889, "pid": 76337, "tid": -914061504, "ts": 1716454222534922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222534992, "dur": 2, "args": { "External id": 78905, "cbid": 251, "correlation": 78905 } }, { "ph": "f", "id": 78905, "pid": 76337, "tid": -914061504, "ts": 1716454222534992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222534998, "dur": 0, "args": { "External id": 78907, "cbid": 251, "correlation": 78907 } }, { "ph": "f", "id": 78907, "pid": 76337, "tid": -914061504, "ts": 1716454222534998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222594229, "dur": 12, "args": { "External id": 78908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78908, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78908, "pid": 5, "tid": 7, "ts": 1716454222594229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535000, "dur": 12, "args": { "External id": 78908, "cbid": 211, "correlation": 78908 } }, { "ph": "s", "id": 78908, "pid": 76337, "tid": -914061504, "ts": 1716454222535000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222594243, "dur": 5, "args": { "External id": 78910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78910, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78910, "pid": 5, "tid": 7, "ts": 1716454222594243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535014, "dur": 5, "args": { "External id": 78910, "cbid": 211, "correlation": 78910 } }, { "ph": "s", "id": 78910, "pid": 76337, "tid": -914061504, "ts": 1716454222535014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222594249, "dur": 17, "args": { "External id": 78920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78920, "pid": 5, "tid": 7, "ts": 1716454222594249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535072, "dur": 12, "args": { "External id": 78920, "cbid": 211, "correlation": 78920 } }, { "ph": "s", "id": 78920, "pid": 76337, "tid": -914061504, "ts": 1716454222535072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222594267, "dur": 18, "args": { "External id": 78940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78940, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 78940, "pid": 5, "tid": 7, "ts": 1716454222594267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535138, "dur": 11, "args": { "External id": 78940, "cbid": 211, "correlation": 78940 } }, { "ph": "s", "id": 78940, "pid": 76337, "tid": -914061504, "ts": 1716454222535138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222594286, "dur": 4, "args": { "External id": 78952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78952, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 78952, "pid": 5, "tid": 7, "ts": 1716454222594286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535160, "dur": 6, "args": { "External id": 78952, "cbid": 211, "correlation": 78952 } }, { "ph": "s", "id": 78952, "pid": 76337, "tid": -914061504, "ts": 1716454222535160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222594292, "dur": 17, "args": { "External id": 78955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78955, "pid": 5, "tid": 7, "ts": 1716454222594292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535178, "dur": 6, "args": { "External id": 78955, "cbid": 211, "correlation": 78955 } }, { "ph": "s", "id": 78955, "pid": 76337, "tid": -914061504, "ts": 1716454222535178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222594310, "dur": 11, "args": { "External id": 78964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78964, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78964, "pid": 5, "tid": 7, "ts": 1716454222594310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535218, "dur": 9, "args": { "External id": 78964, "cbid": 211, "correlation": 78964 } }, { "ph": "s", "id": 78964, "pid": 76337, "tid": -914061504, "ts": 1716454222535218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222535281, "dur": 0, "args": { "External id": 78974, "cbid": 317, "correlation": 78974 } }, { "ph": "f", "id": 78974, "pid": 76337, "tid": -914061504, "ts": 1716454222535281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222535281, "dur": 0, "args": { "External id": 78975, "cbid": 203, "correlation": 78975 } }, { "ph": "f", "id": 78975, "pid": 76337, "tid": -914061504, "ts": 1716454222535281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222535282, "dur": 0, "args": { "External id": 78976, "cbid": 205, "correlation": 78976 } }, { "ph": "f", "id": 78976, "pid": 76337, "tid": -914061504, "ts": 1716454222535282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222594322, "dur": 12, "args": { "External id": 78980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78980, "pid": 5, "tid": 7, "ts": 1716454222594322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535296, "dur": 12, "args": { "External id": 78980, "cbid": 211, "correlation": 78980 } }, { "ph": "s", "id": 78980, "pid": 76337, "tid": -914061504, "ts": 1716454222535296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222594335, "dur": 167, "args": { "External id": 78982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78982, "pid": 5, "tid": 7, "ts": 1716454222594335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535310, "dur": 6, "args": { "External id": 78982, "cbid": 211, "correlation": 78982 } }, { "ph": "s", "id": 78982, "pid": 76337, "tid": -914061504, "ts": 1716454222535310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222594504, "dur": 1, "args": { "External id": 78984, "device": 5, "context": 1, "stream": 7, "correlation": 78984, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 78984, "pid": 5, "tid": 7, "ts": 1716454222594504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222535322, "dur": 7, "args": { "External id": 78984, "cbid": 51, "correlation": 78984 } }, { "ph": "s", "id": 78984, "pid": 76337, "tid": -914061504, "ts": 1716454222535322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222594508, "dur": 659, "args": { "External id": 78985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78985, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 78985, "pid": 5, "tid": 7, "ts": 1716454222594508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535330, "dur": 6, "args": { "External id": 78985, "cbid": 211, "correlation": 78985 } }, { "ph": "s", "id": 78985, "pid": 76337, "tid": -914061504, "ts": 1716454222535330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222595169, "dur": 13, "args": { "External id": 78987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78987, "pid": 5, "tid": 7, "ts": 1716454222595169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535340, "dur": 5, "args": { "External id": 78987, "cbid": 211, "correlation": 78987 } }, { "ph": "s", "id": 78987, "pid": 76337, "tid": -914061504, "ts": 1716454222535340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222595183, "dur": 15, "args": { "External id": 78993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 78993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 78993, "pid": 5, "tid": 7, "ts": 1716454222595183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535368, "dur": 9, "args": { "External id": 78993, "cbid": 211, "correlation": 78993 } }, { "ph": "s", "id": 78993, "pid": 76337, "tid": -914061504, "ts": 1716454222535368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222535428, "dur": 0, "args": { "External id": 79003, "cbid": 317, "correlation": 79003 } }, { "ph": "f", "id": 79003, "pid": 76337, "tid": -914061504, "ts": 1716454222535428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222535428, "dur": 0, "args": { "External id": 79004, "cbid": 203, "correlation": 79004 } }, { "ph": "f", "id": 79004, "pid": 76337, "tid": -914061504, "ts": 1716454222535428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222535429, "dur": 0, "args": { "External id": 79005, "cbid": 205, "correlation": 79005 } }, { "ph": "f", "id": 79005, "pid": 76337, "tid": -914061504, "ts": 1716454222535429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222595199, "dur": 21, "args": { "External id": 79009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79009, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79009, "pid": 5, "tid": 7, "ts": 1716454222595199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535442, "dur": 11, "args": { "External id": 79009, "cbid": 211, "correlation": 79009 } }, { "ph": "s", "id": 79009, "pid": 76337, "tid": -914061504, "ts": 1716454222535442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222595222, "dur": 4, "args": { "External id": 79011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79011, "pid": 5, "tid": 7, "ts": 1716454222595222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535459, "dur": 6, "args": { "External id": 79011, "cbid": 211, "correlation": 79011 } }, { "ph": "s", "id": 79011, "pid": 76337, "tid": -914061504, "ts": 1716454222535459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222535468, "dur": 0, "args": { "External id": 79012, "cbid": 51, "correlation": 79012 } }, { "ph": "s", "id": 79012, "pid": 76337, "tid": -914061504, "ts": 1716454222535468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222595227, "dur": 179, "args": { "External id": 79013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79013, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 79013, "pid": 5, "tid": 7, "ts": 1716454222595227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535469, "dur": 6, "args": { "External id": 79013, "cbid": 211, "correlation": 79013 } }, { "ph": "s", "id": 79013, "pid": 76337, "tid": -914061504, "ts": 1716454222535469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222595407, "dur": 16, "args": { "External id": 79018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79018, "pid": 5, "tid": 7, "ts": 1716454222595407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535496, "dur": 8, "args": { "External id": 79018, "cbid": 211, "correlation": 79018 } }, { "ph": "s", "id": 79018, "pid": 76337, "tid": -914061504, "ts": 1716454222535496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222595425, "dur": 12, "args": { "External id": 79026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79026, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79026, "pid": 5, "tid": 7, "ts": 1716454222595425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535524, "dur": 8, "args": { "External id": 79026, "cbid": 211, "correlation": 79026 } }, { "ph": "s", "id": 79026, "pid": 76337, "tid": -914061504, "ts": 1716454222535524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222595438, "dur": 10, "args": { "External id": 79034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79034, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79034, "pid": 5, "tid": 7, "ts": 1716454222595438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535554, "dur": 9, "args": { "External id": 79034, "cbid": 211, "correlation": 79034 } }, { "ph": "s", "id": 79034, "pid": 76337, "tid": -914061504, "ts": 1716454222535554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222595450, "dur": 19, "args": { "External id": 79054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79054, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 79054, "pid": 5, "tid": 7, "ts": 1716454222595450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535638, "dur": 12, "args": { "External id": 79054, "cbid": 211, "correlation": 79054 } }, { "ph": "s", "id": 79054, "pid": 76337, "tid": -914061504, "ts": 1716454222535638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222595470, "dur": 5, "args": { "External id": 79066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79066, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 79066, "pid": 5, "tid": 7, "ts": 1716454222595470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535660, "dur": 6, "args": { "External id": 79066, "cbid": 211, "correlation": 79066 } }, { "ph": "s", "id": 79066, "pid": 76337, "tid": -914061504, "ts": 1716454222535660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222595476, "dur": 17, "args": { "External id": 79069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79069, "pid": 5, "tid": 7, "ts": 1716454222595476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535677, "dur": 6, "args": { "External id": 79069, "cbid": 211, "correlation": 79069 } }, { "ph": "s", "id": 79069, "pid": 76337, "tid": -914061504, "ts": 1716454222535677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222535734, "dur": 0, "args": { "External id": 79080, "cbid": 317, "correlation": 79080 } }, { "ph": "f", "id": 79080, "pid": 76337, "tid": -914061504, "ts": 1716454222535734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222535735, "dur": 0, "args": { "External id": 79081, "cbid": 203, "correlation": 79081 } }, { "ph": "f", "id": 79081, "pid": 76337, "tid": -914061504, "ts": 1716454222535735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222535735, "dur": 0, "args": { "External id": 79082, "cbid": 205, "correlation": 79082 } }, { "ph": "f", "id": 79082, "pid": 76337, "tid": -914061504, "ts": 1716454222535735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222595494, "dur": 12, "args": { "External id": 79086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79086, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79086, "pid": 5, "tid": 7, "ts": 1716454222595494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535749, "dur": 12, "args": { "External id": 79086, "cbid": 211, "correlation": 79086 } }, { "ph": "s", "id": 79086, "pid": 76337, "tid": -914061504, "ts": 1716454222535749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222595507, "dur": 3, "args": { "External id": 79088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79088, "pid": 5, "tid": 7, "ts": 1716454222595507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535765, "dur": 6, "args": { "External id": 79088, "cbid": 211, "correlation": 79088 } }, { "ph": "s", "id": 79088, "pid": 76337, "tid": -914061504, "ts": 1716454222535765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222535774, "dur": 0, "args": { "External id": 79089, "cbid": 51, "correlation": 79089 } }, { "ph": "s", "id": 79089, "pid": 76337, "tid": -914061504, "ts": 1716454222535774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222595512, "dur": 93, "args": { "External id": 79090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79090, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 79090, "pid": 5, "tid": 7, "ts": 1716454222595512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535775, "dur": 5, "args": { "External id": 79090, "cbid": 211, "correlation": 79090 } }, { "ph": "s", "id": 79090, "pid": 76337, "tid": -914061504, "ts": 1716454222535775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222595606, "dur": 16, "args": { "External id": 79095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79095, "pid": 5, "tid": 7, "ts": 1716454222595606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535802, "dur": 9, "args": { "External id": 79095, "cbid": 211, "correlation": 79095 } }, { "ph": "s", "id": 79095, "pid": 76337, "tid": -914061504, "ts": 1716454222535802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222595623, "dur": 85, "args": { "External id": 79104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79104, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79104, "pid": 5, "tid": 7, "ts": 1716454222595623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535884, "dur": 16, "args": { "External id": 79104, "cbid": 211, "correlation": 79104 } }, { "ph": "s", "id": 79104, "pid": 76337, "tid": -914061504, "ts": 1716454222535884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222595710, "dur": 29, "args": { "External id": 79126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79126, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79126, "pid": 5, "tid": 7, "ts": 1716454222595710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222535945, "dur": 10, "args": { "External id": 79126, "cbid": 211, "correlation": 79126 } }, { "ph": "s", "id": 79126, "pid": 76337, "tid": -914061504, "ts": 1716454222535945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536048, "dur": 2, "args": { "External id": 79137, "cbid": 251, "correlation": 79137 } }, { "ph": "f", "id": 79137, "pid": 76337, "tid": -914061504, "ts": 1716454222536048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222595740, "dur": 167, "args": { "External id": 79138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79138, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79138, "pid": 5, "tid": 7, "ts": 1716454222595740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536054, "dur": 13, "args": { "External id": 79138, "cbid": 211, "correlation": 79138 } }, { "ph": "s", "id": 79138, "pid": 76337, "tid": -914061504, "ts": 1716454222536054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536124, "dur": 1, "args": { "External id": 79149, "cbid": 251, "correlation": 79149 } }, { "ph": "f", "id": 79149, "pid": 76337, "tid": -914061504, "ts": 1716454222536124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222595909, "dur": 162, "args": { "External id": 79150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79150, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79150, "pid": 5, "tid": 7, "ts": 1716454222595909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536128, "dur": 11, "args": { "External id": 79150, "cbid": 211, "correlation": 79150 } }, { "ph": "s", "id": 79150, "pid": 76337, "tid": -914061504, "ts": 1716454222536128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536192, "dur": 1, "args": { "External id": 79161, "cbid": 251, "correlation": 79161 } }, { "ph": "f", "id": 79161, "pid": 76337, "tid": -914061504, "ts": 1716454222536192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222596072, "dur": 161, "args": { "External id": 79162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79162, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79162, "pid": 5, "tid": 7, "ts": 1716454222596072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536196, "dur": 11, "args": { "External id": 79162, "cbid": 211, "correlation": 79162 } }, { "ph": "s", "id": 79162, "pid": 76337, "tid": -914061504, "ts": 1716454222536196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222596234, "dur": 343, "args": { "External id": 79187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79187, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79187, "pid": 5, "tid": 7, "ts": 1716454222596234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536283, "dur": 12, "args": { "External id": 79187, "cbid": 211, "correlation": 79187 } }, { "ph": "s", "id": 79187, "pid": 76337, "tid": -914061504, "ts": 1716454222536283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536385, "dur": 1, "args": { "External id": 79205, "cbid": 251, "correlation": 79205 } }, { "ph": "f", "id": 79205, "pid": 76337, "tid": -914061504, "ts": 1716454222536385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222596578, "dur": 170, "args": { "External id": 79207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79207, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79207, "pid": 5, "tid": 7, "ts": 1716454222596578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536391, "dur": 14, "args": { "External id": 79207, "cbid": 211, "correlation": 79207 } }, { "ph": "s", "id": 79207, "pid": 76337, "tid": -914061504, "ts": 1716454222536391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222596749, "dur": 19, "args": { "External id": 79215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79215, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79215, "pid": 5, "tid": 7, "ts": 1716454222596749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536461, "dur": 12, "args": { "External id": 79215, "cbid": 211, "correlation": 79215 } }, { "ph": "s", "id": 79215, "pid": 76337, "tid": -914061504, "ts": 1716454222536461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222596770, "dur": 28, "args": { "External id": 79223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79223, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79223, "pid": 5, "tid": 7, "ts": 1716454222596770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536502, "dur": 9, "args": { "External id": 79223, "cbid": 211, "correlation": 79223 } }, { "ph": "s", "id": 79223, "pid": 76337, "tid": -914061504, "ts": 1716454222536502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222596799, "dur": 19, "args": { "External id": 79234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79234, "pid": 5, "tid": 7, "ts": 1716454222596799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536576, "dur": 12, "args": { "External id": 79234, "cbid": 211, "correlation": 79234 } }, { "ph": "s", "id": 79234, "pid": 76337, "tid": -914061504, "ts": 1716454222536576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222596819, "dur": 17, "args": { "External id": 79256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79256, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79256, "pid": 5, "tid": 7, "ts": 1716454222596819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536607, "dur": 7, "args": { "External id": 79256, "cbid": 211, "correlation": 79256 } }, { "ph": "s", "id": 79256, "pid": 76337, "tid": -914061504, "ts": 1716454222536607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536692, "dur": 2, "args": { "External id": 79267, "cbid": 251, "correlation": 79267 } }, { "ph": "f", "id": 79267, "pid": 76337, "tid": -914061504, "ts": 1716454222536692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222596837, "dur": 91, "args": { "External id": 79268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79268, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79268, "pid": 5, "tid": 7, "ts": 1716454222596837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536698, "dur": 14, "args": { "External id": 79268, "cbid": 211, "correlation": 79268 } }, { "ph": "s", "id": 79268, "pid": 76337, "tid": -914061504, "ts": 1716454222536698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536768, "dur": 1, "args": { "External id": 79279, "cbid": 251, "correlation": 79279 } }, { "ph": "f", "id": 79279, "pid": 76337, "tid": -914061504, "ts": 1716454222536768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536772, "dur": 0, "args": { "External id": 79280, "cbid": 251, "correlation": 79280 } }, { "ph": "f", "id": 79280, "pid": 76337, "tid": -914061504, "ts": 1716454222536772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222596929, "dur": 13, "args": { "External id": 79281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79281, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79281, "pid": 5, "tid": 7, "ts": 1716454222596929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536774, "dur": 11, "args": { "External id": 79281, "cbid": 211, "correlation": 79281 } }, { "ph": "s", "id": 79281, "pid": 76337, "tid": -914061504, "ts": 1716454222536774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222596943, "dur": 6, "args": { "External id": 79283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79283, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79283, "pid": 5, "tid": 7, "ts": 1716454222596943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536787, "dur": 5, "args": { "External id": 79283, "cbid": 211, "correlation": 79283 } }, { "ph": "s", "id": 79283, "pid": 76337, "tid": -914061504, "ts": 1716454222536787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536844, "dur": 1, "args": { "External id": 79294, "cbid": 251, "correlation": 79294 } }, { "ph": "f", "id": 79294, "pid": 76337, "tid": -914061504, "ts": 1716454222536844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222536847, "dur": 0, "args": { "External id": 79295, "cbid": 251, "correlation": 79295 } }, { "ph": "f", "id": 79295, "pid": 76337, "tid": -914061504, "ts": 1716454222536847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222596950, "dur": 8, "args": { "External id": 79296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79296, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79296, "pid": 5, "tid": 7, "ts": 1716454222596950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536848, "dur": 12, "args": { "External id": 79296, "cbid": 211, "correlation": 79296 } }, { "ph": "s", "id": 79296, "pid": 76337, "tid": -914061504, "ts": 1716454222536848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222596960, "dur": 3, "args": { "External id": 79298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79298, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79298, "pid": 5, "tid": 7, "ts": 1716454222596960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536861, "dur": 5, "args": { "External id": 79298, "cbid": 211, "correlation": 79298 } }, { "ph": "s", "id": 79298, "pid": 76337, "tid": -914061504, "ts": 1716454222536861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222596965, "dur": 56, "args": { "External id": 79323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79323, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79323, "pid": 5, "tid": 7, "ts": 1716454222596965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222536939, "dur": 12, "args": { "External id": 79323, "cbid": 211, "correlation": 79323 } }, { "ph": "s", "id": 79323, "pid": 76337, "tid": -914061504, "ts": 1716454222536939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222537046, "dur": 2, "args": { "External id": 79341, "cbid": 251, "correlation": 79341 } }, { "ph": "f", "id": 79341, "pid": 76337, "tid": -914061504, "ts": 1716454222537046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222597022, "dur": 93, "args": { "External id": 79343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79343, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79343, "pid": 5, "tid": 7, "ts": 1716454222597022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537052, "dur": 14, "args": { "External id": 79343, "cbid": 211, "correlation": 79343 } }, { "ph": "s", "id": 79343, "pid": 76337, "tid": -914061504, "ts": 1716454222537052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222597116, "dur": 9, "args": { "External id": 79351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79351, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79351, "pid": 5, "tid": 7, "ts": 1716454222597116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537122, "dur": 13, "args": { "External id": 79351, "cbid": 211, "correlation": 79351 } }, { "ph": "s", "id": 79351, "pid": 76337, "tid": -914061504, "ts": 1716454222537122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222597127, "dur": 22, "args": { "External id": 79359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79359, "pid": 5, "tid": 7, "ts": 1716454222597127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537164, "dur": 10, "args": { "External id": 79359, "cbid": 211, "correlation": 79359 } }, { "ph": "s", "id": 79359, "pid": 76337, "tid": -914061504, "ts": 1716454222537164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222597150, "dur": 17, "args": { "External id": 79381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79381, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79381, "pid": 5, "tid": 7, "ts": 1716454222597150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537217, "dur": 10, "args": { "External id": 79381, "cbid": 211, "correlation": 79381 } }, { "ph": "s", "id": 79381, "pid": 76337, "tid": -914061504, "ts": 1716454222537217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222537306, "dur": 1, "args": { "External id": 79397, "cbid": 251, "correlation": 79397 } }, { "ph": "f", "id": 79397, "pid": 76337, "tid": -914061504, "ts": 1716454222537306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222537311, "dur": 0, "args": { "External id": 79399, "cbid": 251, "correlation": 79399 } }, { "ph": "f", "id": 79399, "pid": 76337, "tid": -914061504, "ts": 1716454222537311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222597169, "dur": 498, "args": { "External id": 79400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79400, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79400, "pid": 5, "tid": 7, "ts": 1716454222597169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537313, "dur": 13, "args": { "External id": 79400, "cbid": 211, "correlation": 79400 } }, { "ph": "s", "id": 79400, "pid": 76337, "tid": -914061504, "ts": 1716454222537313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222597668, "dur": 66, "args": { "External id": 79408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79408, "pid": 5, "tid": 7, "ts": 1716454222597668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537379, "dur": 13, "args": { "External id": 79408, "cbid": 211, "correlation": 79408 } }, { "ph": "s", "id": 79408, "pid": 76337, "tid": -914061504, "ts": 1716454222537379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222597736, "dur": 66, "args": { "External id": 79416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79416, "pid": 5, "tid": 7, "ts": 1716454222597736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537411, "dur": 8, "args": { "External id": 79416, "cbid": 211, "correlation": 79416 } }, { "ph": "s", "id": 79416, "pid": 76337, "tid": -914061504, "ts": 1716454222537411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222537493, "dur": 1, "args": { "External id": 79432, "cbid": 251, "correlation": 79432 } }, { "ph": "f", "id": 79432, "pid": 76337, "tid": -914061504, "ts": 1716454222537493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222597805, "dur": 1, "args": { "External id": 79434, "device": 5, "context": 1, "stream": 7, "correlation": 79434, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 79434, "pid": 5, "tid": 7, "ts": 1716454222597805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222537498, "dur": 10, "args": { "External id": 79434, "cbid": 51, "correlation": 79434 } }, { "ph": "s", "id": 79434, "pid": 76337, "tid": -914061504, "ts": 1716454222537498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222597808, "dur": 276, "args": { "External id": 79435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79435, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79435, "pid": 5, "tid": 7, "ts": 1716454222597808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537510, "dur": 11, "args": { "External id": 79435, "cbid": 211, "correlation": 79435 } }, { "ph": "s", "id": 79435, "pid": 76337, "tid": -914061504, "ts": 1716454222537510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222598086, "dur": 14, "args": { "External id": 79443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79443, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79443, "pid": 5, "tid": 7, "ts": 1716454222598086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537553, "dur": 10, "args": { "External id": 79443, "cbid": 211, "correlation": 79443 } }, { "ph": "s", "id": 79443, "pid": 76337, "tid": -914061504, "ts": 1716454222537553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222598101, "dur": 38, "args": { "External id": 79454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79454, "pid": 5, "tid": 7, "ts": 1716454222598101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537622, "dur": 13, "args": { "External id": 79454, "cbid": 211, "correlation": 79454 } }, { "ph": "s", "id": 79454, "pid": 76337, "tid": -914061504, "ts": 1716454222537622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222537688, "dur": 0, "args": { "External id": 79466, "cbid": 317, "correlation": 79466 } }, { "ph": "f", "id": 79466, "pid": 76337, "tid": -914061504, "ts": 1716454222537688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222537689, "dur": 0, "args": { "External id": 79467, "cbid": 203, "correlation": 79467 } }, { "ph": "f", "id": 79467, "pid": 76337, "tid": -914061504, "ts": 1716454222537689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222537690, "dur": 0, "args": { "External id": 79468, "cbid": 205, "correlation": 79468 } }, { "ph": "f", "id": 79468, "pid": 76337, "tid": -914061504, "ts": 1716454222537690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222598140, "dur": 13, "args": { "External id": 79472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79472, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79472, "pid": 5, "tid": 7, "ts": 1716454222598140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537707, "dur": 12, "args": { "External id": 79472, "cbid": 211, "correlation": 79472 } }, { "ph": "s", "id": 79472, "pid": 76337, "tid": -914061504, "ts": 1716454222537707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222598154, "dur": 4, "args": { "External id": 79474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79474, "pid": 5, "tid": 7, "ts": 1716454222598154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537723, "dur": 6, "args": { "External id": 79474, "cbid": 211, "correlation": 79474 } }, { "ph": "s", "id": 79474, "pid": 76337, "tid": -914061504, "ts": 1716454222537723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222537732, "dur": 0, "args": { "External id": 79475, "cbid": 51, "correlation": 79475 } }, { "ph": "s", "id": 79475, "pid": 76337, "tid": -914061504, "ts": 1716454222537732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222598159, "dur": 99, "args": { "External id": 79476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79476, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 79476, "pid": 5, "tid": 7, "ts": 1716454222598159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537733, "dur": 5, "args": { "External id": 79476, "cbid": 211, "correlation": 79476 } }, { "ph": "s", "id": 79476, "pid": 76337, "tid": -914061504, "ts": 1716454222537733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222598260, "dur": 17, "args": { "External id": 79481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79481, "pid": 5, "tid": 7, "ts": 1716454222598260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537761, "dur": 9, "args": { "External id": 79481, "cbid": 211, "correlation": 79481 } }, { "ph": "s", "id": 79481, "pid": 76337, "tid": -914061504, "ts": 1716454222537761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222598278, "dur": 12, "args": { "External id": 79489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79489, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79489, "pid": 5, "tid": 7, "ts": 1716454222598278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537793, "dur": 9, "args": { "External id": 79489, "cbid": 211, "correlation": 79489 } }, { "ph": "s", "id": 79489, "pid": 76337, "tid": -914061504, "ts": 1716454222537793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222598291, "dur": 32, "args": { "External id": 79498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79498, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79498, "pid": 5, "tid": 7, "ts": 1716454222598291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537833, "dur": 10, "args": { "External id": 79498, "cbid": 211, "correlation": 79498 } }, { "ph": "s", "id": 79498, "pid": 76337, "tid": -914061504, "ts": 1716454222537833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222598324, "dur": 31, "args": { "External id": 79518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79518, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 79518, "pid": 5, "tid": 7, "ts": 1716454222598324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537903, "dur": 12, "args": { "External id": 79518, "cbid": 211, "correlation": 79518 } }, { "ph": "s", "id": 79518, "pid": 76337, "tid": -914061504, "ts": 1716454222537903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222598356, "dur": 5, "args": { "External id": 79530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79530, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79530, "pid": 5, "tid": 7, "ts": 1716454222598356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537925, "dur": 6, "args": { "External id": 79530, "cbid": 211, "correlation": 79530 } }, { "ph": "s", "id": 79530, "pid": 76337, "tid": -914061504, "ts": 1716454222537925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222598363, "dur": 32, "args": { "External id": 79533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79533, "pid": 5, "tid": 7, "ts": 1716454222598363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537943, "dur": 6, "args": { "External id": 79533, "cbid": 211, "correlation": 79533 } }, { "ph": "s", "id": 79533, "pid": 76337, "tid": -914061504, "ts": 1716454222537943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222598396, "dur": 21, "args": { "External id": 79542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79542, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79542, "pid": 5, "tid": 7, "ts": 1716454222598396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222537989, "dur": 11, "args": { "External id": 79542, "cbid": 211, "correlation": 79542 } }, { "ph": "s", "id": 79542, "pid": 76337, "tid": -914061504, "ts": 1716454222537989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222538042, "dur": 0, "args": { "External id": 79552, "cbid": 317, "correlation": 79552 } }, { "ph": "f", "id": 79552, "pid": 76337, "tid": -914061504, "ts": 1716454222538042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222538043, "dur": 0, "args": { "External id": 79553, "cbid": 203, "correlation": 79553 } }, { "ph": "f", "id": 79553, "pid": 76337, "tid": -914061504, "ts": 1716454222538043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222538044, "dur": 0, "args": { "External id": 79554, "cbid": 205, "correlation": 79554 } }, { "ph": "f", "id": 79554, "pid": 76337, "tid": -914061504, "ts": 1716454222538044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222598418, "dur": 22, "args": { "External id": 79558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79558, "pid": 5, "tid": 7, "ts": 1716454222598418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538058, "dur": 12, "args": { "External id": 79558, "cbid": 211, "correlation": 79558 } }, { "ph": "s", "id": 79558, "pid": 76337, "tid": -914061504, "ts": 1716454222538058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222598441, "dur": 325, "args": { "External id": 79560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79560, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79560, "pid": 5, "tid": 7, "ts": 1716454222598441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538072, "dur": 5, "args": { "External id": 79560, "cbid": 211, "correlation": 79560 } }, { "ph": "s", "id": 79560, "pid": 76337, "tid": -914061504, "ts": 1716454222538072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222598768, "dur": 1, "args": { "External id": 79562, "device": 5, "context": 1, "stream": 7, "correlation": 79562, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 79562, "pid": 5, "tid": 7, "ts": 1716454222598768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222538084, "dur": 6, "args": { "External id": 79562, "cbid": 51, "correlation": 79562 } }, { "ph": "s", "id": 79562, "pid": 76337, "tid": -914061504, "ts": 1716454222538084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222598772, "dur": 1279, "args": { "External id": 79563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79563, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79563, "pid": 5, "tid": 7, "ts": 1716454222598772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538091, "dur": 6, "args": { "External id": 79563, "cbid": 211, "correlation": 79563 } }, { "ph": "s", "id": 79563, "pid": 76337, "tid": -914061504, "ts": 1716454222538091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222600052, "dur": 13, "args": { "External id": 79565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79565, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79565, "pid": 5, "tid": 7, "ts": 1716454222600052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538101, "dur": 5, "args": { "External id": 79565, "cbid": 211, "correlation": 79565 } }, { "ph": "s", "id": 79565, "pid": 76337, "tid": -914061504, "ts": 1716454222538101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222600066, "dur": 15, "args": { "External id": 79571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79571, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79571, "pid": 5, "tid": 7, "ts": 1716454222600066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538130, "dur": 8, "args": { "External id": 79571, "cbid": 211, "correlation": 79571 } }, { "ph": "s", "id": 79571, "pid": 76337, "tid": -914061504, "ts": 1716454222538130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222600083, "dur": 4, "args": { "External id": 79579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79579, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 79579, "pid": 5, "tid": 7, "ts": 1716454222600083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538174, "dur": 10, "args": { "External id": 79579, "cbid": 211, "correlation": 79579 } }, { "ph": "s", "id": 79579, "pid": 76337, "tid": -914061504, "ts": 1716454222538174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222538240, "dur": 1, "args": { "External id": 79595, "cbid": 251, "correlation": 79595 } }, { "ph": "f", "id": 79595, "pid": 76337, "tid": -914061504, "ts": 1716454222538240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222538245, "dur": 0, "args": { "External id": 79597, "cbid": 251, "correlation": 79597 } }, { "ph": "f", "id": 79597, "pid": 76337, "tid": -914061504, "ts": 1716454222538245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222600088, "dur": 14, "args": { "External id": 79598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79598, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79598, "pid": 5, "tid": 7, "ts": 1716454222600088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538247, "dur": 11, "args": { "External id": 79598, "cbid": 211, "correlation": 79598 } }, { "ph": "s", "id": 79598, "pid": 76337, "tid": -914061504, "ts": 1716454222538247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222600103, "dur": 5, "args": { "External id": 79600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79600, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79600, "pid": 5, "tid": 7, "ts": 1716454222600103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538260, "dur": 5, "args": { "External id": 79600, "cbid": 211, "correlation": 79600 } }, { "ph": "s", "id": 79600, "pid": 76337, "tid": -914061504, "ts": 1716454222538260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222600109, "dur": 17, "args": { "External id": 79610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79610, "pid": 5, "tid": 7, "ts": 1716454222600109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538318, "dur": 13, "args": { "External id": 79610, "cbid": 211, "correlation": 79610 } }, { "ph": "s", "id": 79610, "pid": 76337, "tid": -914061504, "ts": 1716454222538318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222600127, "dur": 18, "args": { "External id": 79630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79630, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 79630, "pid": 5, "tid": 7, "ts": 1716454222600127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538385, "dur": 11, "args": { "External id": 79630, "cbid": 211, "correlation": 79630 } }, { "ph": "s", "id": 79630, "pid": 76337, "tid": -914061504, "ts": 1716454222538385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222600147, "dur": 4, "args": { "External id": 79642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79642, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 79642, "pid": 5, "tid": 7, "ts": 1716454222600147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538406, "dur": 6, "args": { "External id": 79642, "cbid": 211, "correlation": 79642 } }, { "ph": "s", "id": 79642, "pid": 76337, "tid": -914061504, "ts": 1716454222538406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222600152, "dur": 17, "args": { "External id": 79645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79645, "pid": 5, "tid": 7, "ts": 1716454222600152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538424, "dur": 7, "args": { "External id": 79645, "cbid": 211, "correlation": 79645 } }, { "ph": "s", "id": 79645, "pid": 76337, "tid": -914061504, "ts": 1716454222538424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222600170, "dur": 11, "args": { "External id": 79654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79654, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79654, "pid": 5, "tid": 7, "ts": 1716454222600170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538464, "dur": 10, "args": { "External id": 79654, "cbid": 211, "correlation": 79654 } }, { "ph": "s", "id": 79654, "pid": 76337, "tid": -914061504, "ts": 1716454222538464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222538526, "dur": 0, "args": { "External id": 79664, "cbid": 317, "correlation": 79664 } }, { "ph": "f", "id": 79664, "pid": 76337, "tid": -914061504, "ts": 1716454222538526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222538527, "dur": 0, "args": { "External id": 79665, "cbid": 203, "correlation": 79665 } }, { "ph": "f", "id": 79665, "pid": 76337, "tid": -914061504, "ts": 1716454222538527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222538528, "dur": 0, "args": { "External id": 79666, "cbid": 205, "correlation": 79666 } }, { "ph": "f", "id": 79666, "pid": 76337, "tid": -914061504, "ts": 1716454222538528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222600183, "dur": 11, "args": { "External id": 79670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79670, "pid": 5, "tid": 7, "ts": 1716454222600183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538542, "dur": 12, "args": { "External id": 79670, "cbid": 211, "correlation": 79670 } }, { "ph": "s", "id": 79670, "pid": 76337, "tid": -914061504, "ts": 1716454222538542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222600195, "dur": 166, "args": { "External id": 79672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79672, "pid": 5, "tid": 7, "ts": 1716454222600195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538556, "dur": 5, "args": { "External id": 79672, "cbid": 211, "correlation": 79672 } }, { "ph": "s", "id": 79672, "pid": 76337, "tid": -914061504, "ts": 1716454222538556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222600363, "dur": 1, "args": { "External id": 79674, "device": 5, "context": 1, "stream": 7, "correlation": 79674, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 79674, "pid": 5, "tid": 7, "ts": 1716454222600363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222538567, "dur": 6, "args": { "External id": 79674, "cbid": 51, "correlation": 79674 } }, { "ph": "s", "id": 79674, "pid": 76337, "tid": -914061504, "ts": 1716454222538567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222600367, "dur": 661, "args": { "External id": 79675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79675, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79675, "pid": 5, "tid": 7, "ts": 1716454222600367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538575, "dur": 7, "args": { "External id": 79675, "cbid": 211, "correlation": 79675 } }, { "ph": "s", "id": 79675, "pid": 76337, "tid": -914061504, "ts": 1716454222538575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222601029, "dur": 13, "args": { "External id": 79677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79677, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79677, "pid": 5, "tid": 7, "ts": 1716454222601029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538586, "dur": 5, "args": { "External id": 79677, "cbid": 211, "correlation": 79677 } }, { "ph": "s", "id": 79677, "pid": 76337, "tid": -914061504, "ts": 1716454222538586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222601043, "dur": 15, "args": { "External id": 79683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79683, "pid": 5, "tid": 7, "ts": 1716454222601043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538614, "dur": 8, "args": { "External id": 79683, "cbid": 211, "correlation": 79683 } }, { "ph": "s", "id": 79683, "pid": 76337, "tid": -914061504, "ts": 1716454222538614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222538672, "dur": 0, "args": { "External id": 79693, "cbid": 317, "correlation": 79693 } }, { "ph": "f", "id": 79693, "pid": 76337, "tid": -914061504, "ts": 1716454222538672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222538673, "dur": 0, "args": { "External id": 79694, "cbid": 203, "correlation": 79694 } }, { "ph": "f", "id": 79694, "pid": 76337, "tid": -914061504, "ts": 1716454222538673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222538673, "dur": 0, "args": { "External id": 79695, "cbid": 205, "correlation": 79695 } }, { "ph": "f", "id": 79695, "pid": 76337, "tid": -914061504, "ts": 1716454222538673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222601060, "dur": 21, "args": { "External id": 79699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79699, "pid": 5, "tid": 7, "ts": 1716454222601060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538686, "dur": 12, "args": { "External id": 79699, "cbid": 211, "correlation": 79699 } }, { "ph": "s", "id": 79699, "pid": 76337, "tid": -914061504, "ts": 1716454222538686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222601082, "dur": 4, "args": { "External id": 79701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79701, "pid": 5, "tid": 7, "ts": 1716454222601082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538702, "dur": 6, "args": { "External id": 79701, "cbid": 211, "correlation": 79701 } }, { "ph": "s", "id": 79701, "pid": 76337, "tid": -914061504, "ts": 1716454222538702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222538711, "dur": 0, "args": { "External id": 79702, "cbid": 51, "correlation": 79702 } }, { "ph": "s", "id": 79702, "pid": 76337, "tid": -914061504, "ts": 1716454222538711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222601087, "dur": 174, "args": { "External id": 79703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79703, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 79703, "pid": 5, "tid": 7, "ts": 1716454222601087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538711, "dur": 5, "args": { "External id": 79703, "cbid": 211, "correlation": 79703 } }, { "ph": "s", "id": 79703, "pid": 76337, "tid": -914061504, "ts": 1716454222538711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222601263, "dur": 17, "args": { "External id": 79708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79708, "pid": 5, "tid": 7, "ts": 1716454222601263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538736, "dur": 9, "args": { "External id": 79708, "cbid": 211, "correlation": 79708 } }, { "ph": "s", "id": 79708, "pid": 76337, "tid": -914061504, "ts": 1716454222538736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222601281, "dur": 12, "args": { "External id": 79716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79716, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79716, "pid": 5, "tid": 7, "ts": 1716454222601281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538766, "dur": 8, "args": { "External id": 79716, "cbid": 211, "correlation": 79716 } }, { "ph": "s", "id": 79716, "pid": 76337, "tid": -914061504, "ts": 1716454222538766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222601294, "dur": 10, "args": { "External id": 79724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79724, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79724, "pid": 5, "tid": 7, "ts": 1716454222601294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538794, "dur": 9, "args": { "External id": 79724, "cbid": 211, "correlation": 79724 } }, { "ph": "s", "id": 79724, "pid": 76337, "tid": -914061504, "ts": 1716454222538794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222601305, "dur": 19, "args": { "External id": 79744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79744, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 79744, "pid": 5, "tid": 7, "ts": 1716454222601305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538876, "dur": 13, "args": { "External id": 79744, "cbid": 211, "correlation": 79744 } }, { "ph": "s", "id": 79744, "pid": 76337, "tid": -914061504, "ts": 1716454222538876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222601325, "dur": 4, "args": { "External id": 79756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79756, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 79756, "pid": 5, "tid": 7, "ts": 1716454222601325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538899, "dur": 6, "args": { "External id": 79756, "cbid": 211, "correlation": 79756 } }, { "ph": "s", "id": 79756, "pid": 76337, "tid": -914061504, "ts": 1716454222538899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222601331, "dur": 18, "args": { "External id": 79759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79759, "pid": 5, "tid": 7, "ts": 1716454222601331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538916, "dur": 7, "args": { "External id": 79759, "cbid": 211, "correlation": 79759 } }, { "ph": "s", "id": 79759, "pid": 76337, "tid": -914061504, "ts": 1716454222538916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222538982, "dur": 0, "args": { "External id": 79770, "cbid": 317, "correlation": 79770 } }, { "ph": "f", "id": 79770, "pid": 76337, "tid": -914061504, "ts": 1716454222538982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222538983, "dur": 0, "args": { "External id": 79771, "cbid": 203, "correlation": 79771 } }, { "ph": "f", "id": 79771, "pid": 76337, "tid": -914061504, "ts": 1716454222538983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222538984, "dur": 0, "args": { "External id": 79772, "cbid": 205, "correlation": 79772 } }, { "ph": "f", "id": 79772, "pid": 76337, "tid": -914061504, "ts": 1716454222538984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222601350, "dur": 13, "args": { "External id": 79776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79776, "pid": 5, "tid": 7, "ts": 1716454222601350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222538997, "dur": 12, "args": { "External id": 79776, "cbid": 211, "correlation": 79776 } }, { "ph": "s", "id": 79776, "pid": 76337, "tid": -914061504, "ts": 1716454222538997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222601364, "dur": 3, "args": { "External id": 79778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79778, "pid": 5, "tid": 7, "ts": 1716454222601364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539013, "dur": 6, "args": { "External id": 79778, "cbid": 211, "correlation": 79778 } }, { "ph": "s", "id": 79778, "pid": 76337, "tid": -914061504, "ts": 1716454222539013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222539022, "dur": 0, "args": { "External id": 79779, "cbid": 51, "correlation": 79779 } }, { "ph": "s", "id": 79779, "pid": 76337, "tid": -914061504, "ts": 1716454222539022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222601369, "dur": 93, "args": { "External id": 79780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79780, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 79780, "pid": 5, "tid": 7, "ts": 1716454222601369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539023, "dur": 5, "args": { "External id": 79780, "cbid": 211, "correlation": 79780 } }, { "ph": "s", "id": 79780, "pid": 76337, "tid": -914061504, "ts": 1716454222539023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222601463, "dur": 16, "args": { "External id": 79785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79785, "pid": 5, "tid": 7, "ts": 1716454222601463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539050, "dur": 8, "args": { "External id": 79785, "cbid": 211, "correlation": 79785 } }, { "ph": "s", "id": 79785, "pid": 76337, "tid": -914061504, "ts": 1716454222539050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222601480, "dur": 86, "args": { "External id": 79794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79794, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79794, "pid": 5, "tid": 7, "ts": 1716454222601480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539132, "dur": 14, "args": { "External id": 79794, "cbid": 211, "correlation": 79794 } }, { "ph": "s", "id": 79794, "pid": 76337, "tid": -914061504, "ts": 1716454222539132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222601567, "dur": 30, "args": { "External id": 79816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79816, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79816, "pid": 5, "tid": 7, "ts": 1716454222601567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539189, "dur": 10, "args": { "External id": 79816, "cbid": 211, "correlation": 79816 } }, { "ph": "s", "id": 79816, "pid": 76337, "tid": -914061504, "ts": 1716454222539189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222539277, "dur": 1, "args": { "External id": 79827, "cbid": 251, "correlation": 79827 } }, { "ph": "f", "id": 79827, "pid": 76337, "tid": -914061504, "ts": 1716454222539277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222601598, "dur": 168, "args": { "External id": 79828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79828, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79828, "pid": 5, "tid": 7, "ts": 1716454222601598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539283, "dur": 13, "args": { "External id": 79828, "cbid": 211, "correlation": 79828 } }, { "ph": "s", "id": 79828, "pid": 76337, "tid": -914061504, "ts": 1716454222539283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222539353, "dur": 1, "args": { "External id": 79839, "cbid": 251, "correlation": 79839 } }, { "ph": "f", "id": 79839, "pid": 76337, "tid": -914061504, "ts": 1716454222539353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222601767, "dur": 161, "args": { "External id": 79840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79840, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79840, "pid": 5, "tid": 7, "ts": 1716454222601767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539357, "dur": 12, "args": { "External id": 79840, "cbid": 211, "correlation": 79840 } }, { "ph": "s", "id": 79840, "pid": 76337, "tid": -914061504, "ts": 1716454222539357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222539424, "dur": 1, "args": { "External id": 79851, "cbid": 251, "correlation": 79851 } }, { "ph": "f", "id": 79851, "pid": 76337, "tid": -914061504, "ts": 1716454222539424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222601930, "dur": 161, "args": { "External id": 79852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79852, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79852, "pid": 5, "tid": 7, "ts": 1716454222601930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539428, "dur": 12, "args": { "External id": 79852, "cbid": 211, "correlation": 79852 } }, { "ph": "s", "id": 79852, "pid": 76337, "tid": -914061504, "ts": 1716454222539428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222602093, "dur": 342, "args": { "External id": 79877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79877, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79877, "pid": 5, "tid": 7, "ts": 1716454222602093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539512, "dur": 12, "args": { "External id": 79877, "cbid": 211, "correlation": 79877 } }, { "ph": "s", "id": 79877, "pid": 76337, "tid": -914061504, "ts": 1716454222539512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222539612, "dur": 1, "args": { "External id": 79895, "cbid": 251, "correlation": 79895 } }, { "ph": "f", "id": 79895, "pid": 76337, "tid": -914061504, "ts": 1716454222539612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222602436, "dur": 169, "args": { "External id": 79897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79897, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79897, "pid": 5, "tid": 7, "ts": 1716454222602436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539617, "dur": 13, "args": { "External id": 79897, "cbid": 211, "correlation": 79897 } }, { "ph": "s", "id": 79897, "pid": 76337, "tid": -914061504, "ts": 1716454222539617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222602607, "dur": 19, "args": { "External id": 79905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79905, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79905, "pid": 5, "tid": 7, "ts": 1716454222602607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539688, "dur": 12, "args": { "External id": 79905, "cbid": 211, "correlation": 79905 } }, { "ph": "s", "id": 79905, "pid": 76337, "tid": -914061504, "ts": 1716454222539688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222602627, "dur": 27, "args": { "External id": 79913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79913, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79913, "pid": 5, "tid": 7, "ts": 1716454222602627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539728, "dur": 8, "args": { "External id": 79913, "cbid": 211, "correlation": 79913 } }, { "ph": "s", "id": 79913, "pid": 76337, "tid": -914061504, "ts": 1716454222539728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222602656, "dur": 19, "args": { "External id": 79924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79924, "pid": 5, "tid": 7, "ts": 1716454222602656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539801, "dur": 13, "args": { "External id": 79924, "cbid": 211, "correlation": 79924 } }, { "ph": "s", "id": 79924, "pid": 76337, "tid": -914061504, "ts": 1716454222539801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222602676, "dur": 16, "args": { "External id": 79946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79946, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 79946, "pid": 5, "tid": 7, "ts": 1716454222602676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539832, "dur": 8, "args": { "External id": 79946, "cbid": 211, "correlation": 79946 } }, { "ph": "s", "id": 79946, "pid": 76337, "tid": -914061504, "ts": 1716454222539832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222539919, "dur": 1, "args": { "External id": 79957, "cbid": 251, "correlation": 79957 } }, { "ph": "f", "id": 79957, "pid": 76337, "tid": -914061504, "ts": 1716454222539919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222602694, "dur": 92, "args": { "External id": 79958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79958, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 79958, "pid": 5, "tid": 7, "ts": 1716454222602694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222539925, "dur": 13, "args": { "External id": 79958, "cbid": 211, "correlation": 79958 } }, { "ph": "s", "id": 79958, "pid": 76337, "tid": -914061504, "ts": 1716454222539925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540001, "dur": 1, "args": { "External id": 79969, "cbid": 251, "correlation": 79969 } }, { "ph": "f", "id": 79969, "pid": 76337, "tid": -914061504, "ts": 1716454222540001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540005, "dur": 0, "args": { "External id": 79970, "cbid": 251, "correlation": 79970 } }, { "ph": "f", "id": 79970, "pid": 76337, "tid": -914061504, "ts": 1716454222540005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222602787, "dur": 13, "args": { "External id": 79971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79971, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79971, "pid": 5, "tid": 7, "ts": 1716454222602787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540007, "dur": 12, "args": { "External id": 79971, "cbid": 211, "correlation": 79971 } }, { "ph": "s", "id": 79971, "pid": 76337, "tid": -914061504, "ts": 1716454222540007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222602802, "dur": 6, "args": { "External id": 79973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79973, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79973, "pid": 5, "tid": 7, "ts": 1716454222602802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540021, "dur": 6, "args": { "External id": 79973, "cbid": 211, "correlation": 79973 } }, { "ph": "s", "id": 79973, "pid": 76337, "tid": -914061504, "ts": 1716454222540021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540080, "dur": 1, "args": { "External id": 79984, "cbid": 251, "correlation": 79984 } }, { "ph": "f", "id": 79984, "pid": 76337, "tid": -914061504, "ts": 1716454222540080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540083, "dur": 0, "args": { "External id": 79985, "cbid": 251, "correlation": 79985 } }, { "ph": "f", "id": 79985, "pid": 76337, "tid": -914061504, "ts": 1716454222540083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222602809, "dur": 9, "args": { "External id": 79986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79986, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79986, "pid": 5, "tid": 7, "ts": 1716454222602809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540085, "dur": 12, "args": { "External id": 79986, "cbid": 211, "correlation": 79986 } }, { "ph": "s", "id": 79986, "pid": 76337, "tid": -914061504, "ts": 1716454222540085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222602819, "dur": 3, "args": { "External id": 79988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 79988, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 79988, "pid": 5, "tid": 7, "ts": 1716454222602819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540098, "dur": 5, "args": { "External id": 79988, "cbid": 211, "correlation": 79988 } }, { "ph": "s", "id": 79988, "pid": 76337, "tid": -914061504, "ts": 1716454222540098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222602823, "dur": 55, "args": { "External id": 80013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80013, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80013, "pid": 5, "tid": 7, "ts": 1716454222602823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540175, "dur": 12, "args": { "External id": 80013, "cbid": 211, "correlation": 80013 } }, { "ph": "s", "id": 80013, "pid": 76337, "tid": -914061504, "ts": 1716454222540175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540273, "dur": 1, "args": { "External id": 80031, "cbid": 251, "correlation": 80031 } }, { "ph": "f", "id": 80031, "pid": 76337, "tid": -914061504, "ts": 1716454222540273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222602880, "dur": 92, "args": { "External id": 80033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80033, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80033, "pid": 5, "tid": 7, "ts": 1716454222602880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540279, "dur": 13, "args": { "External id": 80033, "cbid": 211, "correlation": 80033 } }, { "ph": "s", "id": 80033, "pid": 76337, "tid": -914061504, "ts": 1716454222540279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222602973, "dur": 11, "args": { "External id": 80041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80041, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80041, "pid": 5, "tid": 7, "ts": 1716454222602973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540348, "dur": 13, "args": { "External id": 80041, "cbid": 211, "correlation": 80041 } }, { "ph": "s", "id": 80041, "pid": 76337, "tid": -914061504, "ts": 1716454222540348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222602986, "dur": 21, "args": { "External id": 80049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80049, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80049, "pid": 5, "tid": 7, "ts": 1716454222602986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540391, "dur": 10, "args": { "External id": 80049, "cbid": 211, "correlation": 80049 } }, { "ph": "s", "id": 80049, "pid": 76337, "tid": -914061504, "ts": 1716454222540391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222603008, "dur": 18, "args": { "External id": 80071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80071, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80071, "pid": 5, "tid": 7, "ts": 1716454222603008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540442, "dur": 10, "args": { "External id": 80071, "cbid": 211, "correlation": 80071 } }, { "ph": "s", "id": 80071, "pid": 76337, "tid": -914061504, "ts": 1716454222540442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540530, "dur": 1, "args": { "External id": 80087, "cbid": 251, "correlation": 80087 } }, { "ph": "f", "id": 80087, "pid": 76337, "tid": -914061504, "ts": 1716454222540530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540535, "dur": 0, "args": { "External id": 80089, "cbid": 251, "correlation": 80089 } }, { "ph": "f", "id": 80089, "pid": 76337, "tid": -914061504, "ts": 1716454222540535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222603027, "dur": 496, "args": { "External id": 80090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80090, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80090, "pid": 5, "tid": 7, "ts": 1716454222603027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540537, "dur": 12, "args": { "External id": 80090, "cbid": 211, "correlation": 80090 } }, { "ph": "s", "id": 80090, "pid": 76337, "tid": -914061504, "ts": 1716454222540537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222603525, "dur": 66, "args": { "External id": 80098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80098, "pid": 5, "tid": 7, "ts": 1716454222603525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540602, "dur": 12, "args": { "External id": 80098, "cbid": 211, "correlation": 80098 } }, { "ph": "s", "id": 80098, "pid": 76337, "tid": -914061504, "ts": 1716454222540602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222603592, "dur": 66, "args": { "External id": 80106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80106, "pid": 5, "tid": 7, "ts": 1716454222603592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540634, "dur": 8, "args": { "External id": 80106, "cbid": 211, "correlation": 80106 } }, { "ph": "s", "id": 80106, "pid": 76337, "tid": -914061504, "ts": 1716454222540634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222540713, "dur": 1, "args": { "External id": 80122, "cbid": 251, "correlation": 80122 } }, { "ph": "f", "id": 80122, "pid": 76337, "tid": -914061504, "ts": 1716454222540713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222603661, "dur": 1, "args": { "External id": 80124, "device": 5, "context": 1, "stream": 7, "correlation": 80124, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 80124, "pid": 5, "tid": 7, "ts": 1716454222603661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222540718, "dur": 9, "args": { "External id": 80124, "cbid": 51, "correlation": 80124 } }, { "ph": "s", "id": 80124, "pid": 76337, "tid": -914061504, "ts": 1716454222540718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222603665, "dur": 273, "args": { "External id": 80125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80125, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80125, "pid": 5, "tid": 7, "ts": 1716454222603665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540728, "dur": 11, "args": { "External id": 80125, "cbid": 211, "correlation": 80125 } }, { "ph": "s", "id": 80125, "pid": 76337, "tid": -914061504, "ts": 1716454222540728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222603940, "dur": 14, "args": { "External id": 80133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80133, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80133, "pid": 5, "tid": 7, "ts": 1716454222603940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540770, "dur": 10, "args": { "External id": 80133, "cbid": 211, "correlation": 80133 } }, { "ph": "s", "id": 80133, "pid": 76337, "tid": -914061504, "ts": 1716454222540770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222603955, "dur": 38, "args": { "External id": 80144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80144, "pid": 5, "tid": 7, "ts": 1716454222603955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540839, "dur": 12, "args": { "External id": 80144, "cbid": 211, "correlation": 80144 } }, { "ph": "s", "id": 80144, "pid": 76337, "tid": -914061504, "ts": 1716454222540839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222540903, "dur": 0, "args": { "External id": 80156, "cbid": 317, "correlation": 80156 } }, { "ph": "f", "id": 80156, "pid": 76337, "tid": -914061504, "ts": 1716454222540903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222540904, "dur": 0, "args": { "External id": 80157, "cbid": 203, "correlation": 80157 } }, { "ph": "f", "id": 80157, "pid": 76337, "tid": -914061504, "ts": 1716454222540904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222540904, "dur": 0, "args": { "External id": 80158, "cbid": 205, "correlation": 80158 } }, { "ph": "f", "id": 80158, "pid": 76337, "tid": -914061504, "ts": 1716454222540904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222603995, "dur": 13, "args": { "External id": 80162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80162, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80162, "pid": 5, "tid": 7, "ts": 1716454222603995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540920, "dur": 12, "args": { "External id": 80162, "cbid": 211, "correlation": 80162 } }, { "ph": "s", "id": 80162, "pid": 76337, "tid": -914061504, "ts": 1716454222540920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222604009, "dur": 4, "args": { "External id": 80164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80164, "pid": 5, "tid": 7, "ts": 1716454222604009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540936, "dur": 6, "args": { "External id": 80164, "cbid": 211, "correlation": 80164 } }, { "ph": "s", "id": 80164, "pid": 76337, "tid": -914061504, "ts": 1716454222540936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222540946, "dur": 0, "args": { "External id": 80165, "cbid": 51, "correlation": 80165 } }, { "ph": "s", "id": 80165, "pid": 76337, "tid": -914061504, "ts": 1716454222540946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222604014, "dur": 99, "args": { "External id": 80166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80166, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80166, "pid": 5, "tid": 7, "ts": 1716454222604014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540947, "dur": 5, "args": { "External id": 80166, "cbid": 211, "correlation": 80166 } }, { "ph": "s", "id": 80166, "pid": 76337, "tid": -914061504, "ts": 1716454222540947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222604115, "dur": 16, "args": { "External id": 80171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80171, "pid": 5, "tid": 7, "ts": 1716454222604115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222540981, "dur": 10, "args": { "External id": 80171, "cbid": 211, "correlation": 80171 } }, { "ph": "s", "id": 80171, "pid": 76337, "tid": -914061504, "ts": 1716454222540981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222604133, "dur": 12, "args": { "External id": 80179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80179, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80179, "pid": 5, "tid": 7, "ts": 1716454222604133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541014, "dur": 8, "args": { "External id": 80179, "cbid": 211, "correlation": 80179 } }, { "ph": "s", "id": 80179, "pid": 76337, "tid": -914061504, "ts": 1716454222541014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222604146, "dur": 25, "args": { "External id": 80188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80188, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80188, "pid": 5, "tid": 7, "ts": 1716454222604146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541052, "dur": 10, "args": { "External id": 80188, "cbid": 211, "correlation": 80188 } }, { "ph": "s", "id": 80188, "pid": 76337, "tid": -914061504, "ts": 1716454222541052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222604172, "dur": 25, "args": { "External id": 80208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80208, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 80208, "pid": 5, "tid": 7, "ts": 1716454222604172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541123, "dur": 11, "args": { "External id": 80208, "cbid": 211, "correlation": 80208 } }, { "ph": "s", "id": 80208, "pid": 76337, "tid": -914061504, "ts": 1716454222541123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222604199, "dur": 5, "args": { "External id": 80220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80220, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80220, "pid": 5, "tid": 7, "ts": 1716454222604199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541144, "dur": 7, "args": { "External id": 80220, "cbid": 211, "correlation": 80220 } }, { "ph": "s", "id": 80220, "pid": 76337, "tid": -914061504, "ts": 1716454222541144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222604205, "dur": 25, "args": { "External id": 80223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80223, "pid": 5, "tid": 7, "ts": 1716454222604205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541163, "dur": 7, "args": { "External id": 80223, "cbid": 211, "correlation": 80223 } }, { "ph": "s", "id": 80223, "pid": 76337, "tid": -914061504, "ts": 1716454222541163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222604231, "dur": 18, "args": { "External id": 80232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80232, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80232, "pid": 5, "tid": 7, "ts": 1716454222604231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541202, "dur": 9, "args": { "External id": 80232, "cbid": 211, "correlation": 80232 } }, { "ph": "s", "id": 80232, "pid": 76337, "tid": -914061504, "ts": 1716454222541202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222541254, "dur": 0, "args": { "External id": 80242, "cbid": 317, "correlation": 80242 } }, { "ph": "f", "id": 80242, "pid": 76337, "tid": -914061504, "ts": 1716454222541254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222541254, "dur": 0, "args": { "External id": 80243, "cbid": 203, "correlation": 80243 } }, { "ph": "f", "id": 80243, "pid": 76337, "tid": -914061504, "ts": 1716454222541254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222541255, "dur": 0, "args": { "External id": 80244, "cbid": 205, "correlation": 80244 } }, { "ph": "f", "id": 80244, "pid": 76337, "tid": -914061504, "ts": 1716454222541255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222604250, "dur": 18, "args": { "External id": 80248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80248, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80248, "pid": 5, "tid": 7, "ts": 1716454222604250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541270, "dur": 11, "args": { "External id": 80248, "cbid": 211, "correlation": 80248 } }, { "ph": "s", "id": 80248, "pid": 76337, "tid": -914061504, "ts": 1716454222541270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222604269, "dur": 246, "args": { "External id": 80250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80250, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80250, "pid": 5, "tid": 7, "ts": 1716454222604269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541284, "dur": 5, "args": { "External id": 80250, "cbid": 211, "correlation": 80250 } }, { "ph": "s", "id": 80250, "pid": 76337, "tid": -914061504, "ts": 1716454222541284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222604518, "dur": 1, "args": { "External id": 80252, "device": 5, "context": 1, "stream": 7, "correlation": 80252, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 80252, "pid": 5, "tid": 7, "ts": 1716454222604518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222541295, "dur": 8, "args": { "External id": 80252, "cbid": 51, "correlation": 80252 } }, { "ph": "s", "id": 80252, "pid": 76337, "tid": -914061504, "ts": 1716454222541295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222604522, "dur": 824, "args": { "External id": 80253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80253, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80253, "pid": 5, "tid": 7, "ts": 1716454222604522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541304, "dur": 6, "args": { "External id": 80253, "cbid": 211, "correlation": 80253 } }, { "ph": "s", "id": 80253, "pid": 76337, "tid": -914061504, "ts": 1716454222541304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222605346, "dur": 13, "args": { "External id": 80255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80255, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80255, "pid": 5, "tid": 7, "ts": 1716454222605346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541315, "dur": 5, "args": { "External id": 80255, "cbid": 211, "correlation": 80255 } }, { "ph": "s", "id": 80255, "pid": 76337, "tid": -914061504, "ts": 1716454222541315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222605361, "dur": 15, "args": { "External id": 80261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80261, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80261, "pid": 5, "tid": 7, "ts": 1716454222605361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541343, "dur": 9, "args": { "External id": 80261, "cbid": 211, "correlation": 80261 } }, { "ph": "s", "id": 80261, "pid": 76337, "tid": -914061504, "ts": 1716454222541343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222605377, "dur": 3, "args": { "External id": 80269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80269, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 80269, "pid": 5, "tid": 7, "ts": 1716454222605377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541387, "dur": 9, "args": { "External id": 80269, "cbid": 211, "correlation": 80269 } }, { "ph": "s", "id": 80269, "pid": 76337, "tid": -914061504, "ts": 1716454222541387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222541453, "dur": 2, "args": { "External id": 80285, "cbid": 251, "correlation": 80285 } }, { "ph": "f", "id": 80285, "pid": 76337, "tid": -914061504, "ts": 1716454222541453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222541459, "dur": 0, "args": { "External id": 80287, "cbid": 251, "correlation": 80287 } }, { "ph": "f", "id": 80287, "pid": 76337, "tid": -914061504, "ts": 1716454222541459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222605382, "dur": 13, "args": { "External id": 80288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80288, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80288, "pid": 5, "tid": 7, "ts": 1716454222605382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541461, "dur": 11, "args": { "External id": 80288, "cbid": 211, "correlation": 80288 } }, { "ph": "s", "id": 80288, "pid": 76337, "tid": -914061504, "ts": 1716454222541461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222605397, "dur": 5, "args": { "External id": 80290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80290, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80290, "pid": 5, "tid": 7, "ts": 1716454222605397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541473, "dur": 6, "args": { "External id": 80290, "cbid": 211, "correlation": 80290 } }, { "ph": "s", "id": 80290, "pid": 76337, "tid": -914061504, "ts": 1716454222541473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222605403, "dur": 17, "args": { "External id": 80300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80300, "pid": 5, "tid": 7, "ts": 1716454222605403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541532, "dur": 12, "args": { "External id": 80300, "cbid": 211, "correlation": 80300 } }, { "ph": "s", "id": 80300, "pid": 76337, "tid": -914061504, "ts": 1716454222541532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222605421, "dur": 18, "args": { "External id": 80320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80320, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 80320, "pid": 5, "tid": 7, "ts": 1716454222605421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541597, "dur": 11, "args": { "External id": 80320, "cbid": 211, "correlation": 80320 } }, { "ph": "s", "id": 80320, "pid": 76337, "tid": -914061504, "ts": 1716454222541597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222605440, "dur": 4, "args": { "External id": 80332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80332, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 80332, "pid": 5, "tid": 7, "ts": 1716454222605440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541618, "dur": 6, "args": { "External id": 80332, "cbid": 211, "correlation": 80332 } }, { "ph": "s", "id": 80332, "pid": 76337, "tid": -914061504, "ts": 1716454222541618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222605446, "dur": 17, "args": { "External id": 80335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80335, "pid": 5, "tid": 7, "ts": 1716454222605446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541636, "dur": 6, "args": { "External id": 80335, "cbid": 211, "correlation": 80335 } }, { "ph": "s", "id": 80335, "pid": 76337, "tid": -914061504, "ts": 1716454222541636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222605464, "dur": 11, "args": { "External id": 80344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80344, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80344, "pid": 5, "tid": 7, "ts": 1716454222605464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541677, "dur": 11, "args": { "External id": 80344, "cbid": 211, "correlation": 80344 } }, { "ph": "s", "id": 80344, "pid": 76337, "tid": -914061504, "ts": 1716454222541677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222541740, "dur": 0, "args": { "External id": 80354, "cbid": 317, "correlation": 80354 } }, { "ph": "f", "id": 80354, "pid": 76337, "tid": -914061504, "ts": 1716454222541740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222541740, "dur": 0, "args": { "External id": 80355, "cbid": 203, "correlation": 80355 } }, { "ph": "f", "id": 80355, "pid": 76337, "tid": -914061504, "ts": 1716454222541740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222541741, "dur": 0, "args": { "External id": 80356, "cbid": 205, "correlation": 80356 } }, { "ph": "f", "id": 80356, "pid": 76337, "tid": -914061504, "ts": 1716454222541741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222605476, "dur": 11, "args": { "External id": 80360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80360, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80360, "pid": 5, "tid": 7, "ts": 1716454222605476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541755, "dur": 12, "args": { "External id": 80360, "cbid": 211, "correlation": 80360 } }, { "ph": "s", "id": 80360, "pid": 76337, "tid": -914061504, "ts": 1716454222541755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222605489, "dur": 165, "args": { "External id": 80362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80362, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80362, "pid": 5, "tid": 7, "ts": 1716454222605489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541769, "dur": 6, "args": { "External id": 80362, "cbid": 211, "correlation": 80362 } }, { "ph": "s", "id": 80362, "pid": 76337, "tid": -914061504, "ts": 1716454222541769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222605657, "dur": 1, "args": { "External id": 80364, "device": 5, "context": 1, "stream": 7, "correlation": 80364, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 80364, "pid": 5, "tid": 7, "ts": 1716454222605657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222541780, "dur": 6, "args": { "External id": 80364, "cbid": 51, "correlation": 80364 } }, { "ph": "s", "id": 80364, "pid": 76337, "tid": -914061504, "ts": 1716454222541780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222605660, "dur": 660, "args": { "External id": 80365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80365, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80365, "pid": 5, "tid": 7, "ts": 1716454222605660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541788, "dur": 6, "args": { "External id": 80365, "cbid": 211, "correlation": 80365 } }, { "ph": "s", "id": 80365, "pid": 76337, "tid": -914061504, "ts": 1716454222541788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222606322, "dur": 13, "args": { "External id": 80367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80367, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80367, "pid": 5, "tid": 7, "ts": 1716454222606322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541798, "dur": 5, "args": { "External id": 80367, "cbid": 211, "correlation": 80367 } }, { "ph": "s", "id": 80367, "pid": 76337, "tid": -914061504, "ts": 1716454222541798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222606335, "dur": 15, "args": { "External id": 80373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80373, "pid": 5, "tid": 7, "ts": 1716454222606335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541826, "dur": 9, "args": { "External id": 80373, "cbid": 211, "correlation": 80373 } }, { "ph": "s", "id": 80373, "pid": 76337, "tid": -914061504, "ts": 1716454222541826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222541884, "dur": 0, "args": { "External id": 80383, "cbid": 317, "correlation": 80383 } }, { "ph": "f", "id": 80383, "pid": 76337, "tid": -914061504, "ts": 1716454222541884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222541885, "dur": 0, "args": { "External id": 80384, "cbid": 203, "correlation": 80384 } }, { "ph": "f", "id": 80384, "pid": 76337, "tid": -914061504, "ts": 1716454222541885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222541886, "dur": 0, "args": { "External id": 80385, "cbid": 205, "correlation": 80385 } }, { "ph": "f", "id": 80385, "pid": 76337, "tid": -914061504, "ts": 1716454222541886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222606352, "dur": 18, "args": { "External id": 80389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80389, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80389, "pid": 5, "tid": 7, "ts": 1716454222606352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541900, "dur": 11, "args": { "External id": 80389, "cbid": 211, "correlation": 80389 } }, { "ph": "s", "id": 80389, "pid": 76337, "tid": -914061504, "ts": 1716454222541900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222606371, "dur": 4, "args": { "External id": 80391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80391, "pid": 5, "tid": 7, "ts": 1716454222606371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541916, "dur": 6, "args": { "External id": 80391, "cbid": 211, "correlation": 80391 } }, { "ph": "s", "id": 80391, "pid": 76337, "tid": -914061504, "ts": 1716454222541916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222541924, "dur": 0, "args": { "External id": 80392, "cbid": 51, "correlation": 80392 } }, { "ph": "s", "id": 80392, "pid": 76337, "tid": -914061504, "ts": 1716454222541924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222606376, "dur": 135, "args": { "External id": 80393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80393, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80393, "pid": 5, "tid": 7, "ts": 1716454222606376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541925, "dur": 5, "args": { "External id": 80393, "cbid": 211, "correlation": 80393 } }, { "ph": "s", "id": 80393, "pid": 76337, "tid": -914061504, "ts": 1716454222541925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222606513, "dur": 16, "args": { "External id": 80398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80398, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80398, "pid": 5, "tid": 7, "ts": 1716454222606513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541951, "dur": 8, "args": { "External id": 80398, "cbid": 211, "correlation": 80398 } }, { "ph": "s", "id": 80398, "pid": 76337, "tid": -914061504, "ts": 1716454222541951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222606530, "dur": 12, "args": { "External id": 80406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80406, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80406, "pid": 5, "tid": 7, "ts": 1716454222606530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222541990, "dur": 9, "args": { "External id": 80406, "cbid": 211, "correlation": 80406 } }, { "ph": "s", "id": 80406, "pid": 76337, "tid": -914061504, "ts": 1716454222541990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222606544, "dur": 10, "args": { "External id": 80414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80414, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80414, "pid": 5, "tid": 7, "ts": 1716454222606544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542021, "dur": 8, "args": { "External id": 80414, "cbid": 211, "correlation": 80414 } }, { "ph": "s", "id": 80414, "pid": 76337, "tid": -914061504, "ts": 1716454222542021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222606555, "dur": 19, "args": { "External id": 80434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80434, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 80434, "pid": 5, "tid": 7, "ts": 1716454222606555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542105, "dur": 12, "args": { "External id": 80434, "cbid": 211, "correlation": 80434 } }, { "ph": "s", "id": 80434, "pid": 76337, "tid": -914061504, "ts": 1716454222542105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222606575, "dur": 4, "args": { "External id": 80446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80446, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 80446, "pid": 5, "tid": 7, "ts": 1716454222606575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542127, "dur": 6, "args": { "External id": 80446, "cbid": 211, "correlation": 80446 } }, { "ph": "s", "id": 80446, "pid": 76337, "tid": -914061504, "ts": 1716454222542127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222606581, "dur": 18, "args": { "External id": 80449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80449, "pid": 5, "tid": 7, "ts": 1716454222606581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542145, "dur": 7, "args": { "External id": 80449, "cbid": 211, "correlation": 80449 } }, { "ph": "s", "id": 80449, "pid": 76337, "tid": -914061504, "ts": 1716454222542145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222542202, "dur": 0, "args": { "External id": 80460, "cbid": 317, "correlation": 80460 } }, { "ph": "f", "id": 80460, "pid": 76337, "tid": -914061504, "ts": 1716454222542202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222542203, "dur": 0, "args": { "External id": 80461, "cbid": 203, "correlation": 80461 } }, { "ph": "f", "id": 80461, "pid": 76337, "tid": -914061504, "ts": 1716454222542203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222542204, "dur": 0, "args": { "External id": 80462, "cbid": 205, "correlation": 80462 } }, { "ph": "f", "id": 80462, "pid": 76337, "tid": -914061504, "ts": 1716454222542204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222606600, "dur": 11, "args": { "External id": 80466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80466, "pid": 5, "tid": 7, "ts": 1716454222606600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542216, "dur": 11, "args": { "External id": 80466, "cbid": 211, "correlation": 80466 } }, { "ph": "s", "id": 80466, "pid": 76337, "tid": -914061504, "ts": 1716454222542216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222606613, "dur": 3, "args": { "External id": 80468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80468, "pid": 5, "tid": 7, "ts": 1716454222606613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542232, "dur": 6, "args": { "External id": 80468, "cbid": 211, "correlation": 80468 } }, { "ph": "s", "id": 80468, "pid": 76337, "tid": -914061504, "ts": 1716454222542232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222542240, "dur": 0, "args": { "External id": 80469, "cbid": 51, "correlation": 80469 } }, { "ph": "s", "id": 80469, "pid": 76337, "tid": -914061504, "ts": 1716454222542240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222606618, "dur": 94, "args": { "External id": 80470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80470, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80470, "pid": 5, "tid": 7, "ts": 1716454222606618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542241, "dur": 5, "args": { "External id": 80470, "cbid": 211, "correlation": 80470 } }, { "ph": "s", "id": 80470, "pid": 76337, "tid": -914061504, "ts": 1716454222542241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222606713, "dur": 16, "args": { "External id": 80475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80475, "pid": 5, "tid": 7, "ts": 1716454222606713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542268, "dur": 8, "args": { "External id": 80475, "cbid": 211, "correlation": 80475 } }, { "ph": "s", "id": 80475, "pid": 76337, "tid": -914061504, "ts": 1716454222542268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222606731, "dur": 85, "args": { "External id": 80484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80484, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80484, "pid": 5, "tid": 7, "ts": 1716454222606731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542348, "dur": 15, "args": { "External id": 80484, "cbid": 211, "correlation": 80484 } }, { "ph": "s", "id": 80484, "pid": 76337, "tid": -914061504, "ts": 1716454222542348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222606818, "dur": 30, "args": { "External id": 80506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80506, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80506, "pid": 5, "tid": 7, "ts": 1716454222606818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542404, "dur": 11, "args": { "External id": 80506, "cbid": 211, "correlation": 80506 } }, { "ph": "s", "id": 80506, "pid": 76337, "tid": -914061504, "ts": 1716454222542404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222542493, "dur": 1, "args": { "External id": 80517, "cbid": 251, "correlation": 80517 } }, { "ph": "f", "id": 80517, "pid": 76337, "tid": -914061504, "ts": 1716454222542493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222606849, "dur": 167, "args": { "External id": 80518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80518, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80518, "pid": 5, "tid": 7, "ts": 1716454222606849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542499, "dur": 14, "args": { "External id": 80518, "cbid": 211, "correlation": 80518 } }, { "ph": "s", "id": 80518, "pid": 76337, "tid": -914061504, "ts": 1716454222542499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222542568, "dur": 1, "args": { "External id": 80529, "cbid": 251, "correlation": 80529 } }, { "ph": "f", "id": 80529, "pid": 76337, "tid": -914061504, "ts": 1716454222542568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222607017, "dur": 162, "args": { "External id": 80530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80530, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80530, "pid": 5, "tid": 7, "ts": 1716454222607017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542572, "dur": 11, "args": { "External id": 80530, "cbid": 211, "correlation": 80530 } }, { "ph": "s", "id": 80530, "pid": 76337, "tid": -914061504, "ts": 1716454222542572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222542637, "dur": 1, "args": { "External id": 80541, "cbid": 251, "correlation": 80541 } }, { "ph": "f", "id": 80541, "pid": 76337, "tid": -914061504, "ts": 1716454222542637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222607181, "dur": 160, "args": { "External id": 80542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80542, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80542, "pid": 5, "tid": 7, "ts": 1716454222607181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542641, "dur": 11, "args": { "External id": 80542, "cbid": 211, "correlation": 80542 } }, { "ph": "s", "id": 80542, "pid": 76337, "tid": -914061504, "ts": 1716454222542641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222607342, "dur": 340, "args": { "External id": 80567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80567, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80567, "pid": 5, "tid": 7, "ts": 1716454222607342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542725, "dur": 13, "args": { "External id": 80567, "cbid": 211, "correlation": 80567 } }, { "ph": "s", "id": 80567, "pid": 76337, "tid": -914061504, "ts": 1716454222542725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222542824, "dur": 1, "args": { "External id": 80585, "cbid": 251, "correlation": 80585 } }, { "ph": "f", "id": 80585, "pid": 76337, "tid": -914061504, "ts": 1716454222542824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222607683, "dur": 170, "args": { "External id": 80587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80587, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80587, "pid": 5, "tid": 7, "ts": 1716454222607683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542830, "dur": 13, "args": { "External id": 80587, "cbid": 211, "correlation": 80587 } }, { "ph": "s", "id": 80587, "pid": 76337, "tid": -914061504, "ts": 1716454222542830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222607855, "dur": 20, "args": { "External id": 80595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80595, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80595, "pid": 5, "tid": 7, "ts": 1716454222607855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542901, "dur": 13, "args": { "External id": 80595, "cbid": 211, "correlation": 80595 } }, { "ph": "s", "id": 80595, "pid": 76337, "tid": -914061504, "ts": 1716454222542901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222607876, "dur": 28, "args": { "External id": 80603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80603, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80603, "pid": 5, "tid": 7, "ts": 1716454222607876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222542942, "dur": 9, "args": { "External id": 80603, "cbid": 211, "correlation": 80603 } }, { "ph": "s", "id": 80603, "pid": 76337, "tid": -914061504, "ts": 1716454222542942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222607905, "dur": 18, "args": { "External id": 80614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80614, "pid": 5, "tid": 7, "ts": 1716454222607905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543023, "dur": 14, "args": { "External id": 80614, "cbid": 211, "correlation": 80614 } }, { "ph": "s", "id": 80614, "pid": 76337, "tid": -914061504, "ts": 1716454222543023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222607925, "dur": 16, "args": { "External id": 80636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80636, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80636, "pid": 5, "tid": 7, "ts": 1716454222607925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543056, "dur": 8, "args": { "External id": 80636, "cbid": 211, "correlation": 80636 } }, { "ph": "s", "id": 80636, "pid": 76337, "tid": -914061504, "ts": 1716454222543056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543142, "dur": 1, "args": { "External id": 80647, "cbid": 251, "correlation": 80647 } }, { "ph": "f", "id": 80647, "pid": 76337, "tid": -914061504, "ts": 1716454222543142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222607942, "dur": 91, "args": { "External id": 80648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80648, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80648, "pid": 5, "tid": 7, "ts": 1716454222607942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543147, "dur": 13, "args": { "External id": 80648, "cbid": 211, "correlation": 80648 } }, { "ph": "s", "id": 80648, "pid": 76337, "tid": -914061504, "ts": 1716454222543147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543216, "dur": 1, "args": { "External id": 80659, "cbid": 251, "correlation": 80659 } }, { "ph": "f", "id": 80659, "pid": 76337, "tid": -914061504, "ts": 1716454222543216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543220, "dur": 0, "args": { "External id": 80660, "cbid": 251, "correlation": 80660 } }, { "ph": "f", "id": 80660, "pid": 76337, "tid": -914061504, "ts": 1716454222543220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222608034, "dur": 12, "args": { "External id": 80661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80661, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80661, "pid": 5, "tid": 7, "ts": 1716454222608034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543222, "dur": 12, "args": { "External id": 80661, "cbid": 211, "correlation": 80661 } }, { "ph": "s", "id": 80661, "pid": 76337, "tid": -914061504, "ts": 1716454222543222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222608047, "dur": 5, "args": { "External id": 80663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80663, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80663, "pid": 5, "tid": 7, "ts": 1716454222608047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543235, "dur": 6, "args": { "External id": 80663, "cbid": 211, "correlation": 80663 } }, { "ph": "s", "id": 80663, "pid": 76337, "tid": -914061504, "ts": 1716454222543235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543293, "dur": 1, "args": { "External id": 80674, "cbid": 251, "correlation": 80674 } }, { "ph": "f", "id": 80674, "pid": 76337, "tid": -914061504, "ts": 1716454222543293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543296, "dur": 0, "args": { "External id": 80675, "cbid": 251, "correlation": 80675 } }, { "ph": "f", "id": 80675, "pid": 76337, "tid": -914061504, "ts": 1716454222543296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222608054, "dur": 8, "args": { "External id": 80676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80676, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80676, "pid": 5, "tid": 7, "ts": 1716454222608054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543298, "dur": 12, "args": { "External id": 80676, "cbid": 211, "correlation": 80676 } }, { "ph": "s", "id": 80676, "pid": 76337, "tid": -914061504, "ts": 1716454222543298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222608064, "dur": 3, "args": { "External id": 80678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80678, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80678, "pid": 5, "tid": 7, "ts": 1716454222608064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543311, "dur": 6, "args": { "External id": 80678, "cbid": 211, "correlation": 80678 } }, { "ph": "s", "id": 80678, "pid": 76337, "tid": -914061504, "ts": 1716454222543311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222608068, "dur": 57, "args": { "External id": 80703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80703, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80703, "pid": 5, "tid": 7, "ts": 1716454222608068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543388, "dur": 13, "args": { "External id": 80703, "cbid": 211, "correlation": 80703 } }, { "ph": "s", "id": 80703, "pid": 76337, "tid": -914061504, "ts": 1716454222543388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543486, "dur": 1, "args": { "External id": 80721, "cbid": 251, "correlation": 80721 } }, { "ph": "f", "id": 80721, "pid": 76337, "tid": -914061504, "ts": 1716454222543486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222608127, "dur": 93, "args": { "External id": 80723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80723, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80723, "pid": 5, "tid": 7, "ts": 1716454222608127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543493, "dur": 14, "args": { "External id": 80723, "cbid": 211, "correlation": 80723 } }, { "ph": "s", "id": 80723, "pid": 76337, "tid": -914061504, "ts": 1716454222543493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222608221, "dur": 9, "args": { "External id": 80731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80731, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80731, "pid": 5, "tid": 7, "ts": 1716454222608221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543563, "dur": 12, "args": { "External id": 80731, "cbid": 211, "correlation": 80731 } }, { "ph": "s", "id": 80731, "pid": 76337, "tid": -914061504, "ts": 1716454222543563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222608232, "dur": 21, "args": { "External id": 80739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80739, "pid": 5, "tid": 7, "ts": 1716454222608232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543605, "dur": 9, "args": { "External id": 80739, "cbid": 211, "correlation": 80739 } }, { "ph": "s", "id": 80739, "pid": 76337, "tid": -914061504, "ts": 1716454222543605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222608254, "dur": 18, "args": { "External id": 80761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80761, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80761, "pid": 5, "tid": 7, "ts": 1716454222608254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543655, "dur": 10, "args": { "External id": 80761, "cbid": 211, "correlation": 80761 } }, { "ph": "s", "id": 80761, "pid": 76337, "tid": -914061504, "ts": 1716454222543655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543742, "dur": 1, "args": { "External id": 80777, "cbid": 251, "correlation": 80777 } }, { "ph": "f", "id": 80777, "pid": 76337, "tid": -914061504, "ts": 1716454222543742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543747, "dur": 0, "args": { "External id": 80779, "cbid": 251, "correlation": 80779 } }, { "ph": "f", "id": 80779, "pid": 76337, "tid": -914061504, "ts": 1716454222543747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222608274, "dur": 500, "args": { "External id": 80780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80780, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80780, "pid": 5, "tid": 7, "ts": 1716454222608274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543749, "dur": 13, "args": { "External id": 80780, "cbid": 211, "correlation": 80780 } }, { "ph": "s", "id": 80780, "pid": 76337, "tid": -914061504, "ts": 1716454222543749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222608775, "dur": 67, "args": { "External id": 80788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80788, "pid": 5, "tid": 7, "ts": 1716454222608775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543815, "dur": 12, "args": { "External id": 80788, "cbid": 211, "correlation": 80788 } }, { "ph": "s", "id": 80788, "pid": 76337, "tid": -914061504, "ts": 1716454222543815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222608843, "dur": 67, "args": { "External id": 80796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80796, "pid": 5, "tid": 7, "ts": 1716454222608843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543845, "dur": 8, "args": { "External id": 80796, "cbid": 211, "correlation": 80796 } }, { "ph": "s", "id": 80796, "pid": 76337, "tid": -914061504, "ts": 1716454222543845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222543924, "dur": 1, "args": { "External id": 80812, "cbid": 251, "correlation": 80812 } }, { "ph": "f", "id": 80812, "pid": 76337, "tid": -914061504, "ts": 1716454222543924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222608912, "dur": 1, "args": { "External id": 80814, "device": 5, "context": 1, "stream": 7, "correlation": 80814, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 80814, "pid": 5, "tid": 7, "ts": 1716454222608912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222543929, "dur": 9, "args": { "External id": 80814, "cbid": 51, "correlation": 80814 } }, { "ph": "s", "id": 80814, "pid": 76337, "tid": -914061504, "ts": 1716454222543929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222608916, "dur": 276, "args": { "External id": 80815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80815, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80815, "pid": 5, "tid": 7, "ts": 1716454222608916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543940, "dur": 11, "args": { "External id": 80815, "cbid": 211, "correlation": 80815 } }, { "ph": "s", "id": 80815, "pid": 76337, "tid": -914061504, "ts": 1716454222543940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222609193, "dur": 14, "args": { "External id": 80823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80823, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80823, "pid": 5, "tid": 7, "ts": 1716454222609193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222543989, "dur": 11, "args": { "External id": 80823, "cbid": 211, "correlation": 80823 } }, { "ph": "s", "id": 80823, "pid": 76337, "tid": -914061504, "ts": 1716454222543989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222609208, "dur": 38, "args": { "External id": 80834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80834, "pid": 5, "tid": 7, "ts": 1716454222609208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544059, "dur": 12, "args": { "External id": 80834, "cbid": 211, "correlation": 80834 } }, { "ph": "s", "id": 80834, "pid": 76337, "tid": -914061504, "ts": 1716454222544059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222544122, "dur": 0, "args": { "External id": 80846, "cbid": 317, "correlation": 80846 } }, { "ph": "f", "id": 80846, "pid": 76337, "tid": -914061504, "ts": 1716454222544122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222544123, "dur": 0, "args": { "External id": 80847, "cbid": 203, "correlation": 80847 } }, { "ph": "f", "id": 80847, "pid": 76337, "tid": -914061504, "ts": 1716454222544123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222544124, "dur": 0, "args": { "External id": 80848, "cbid": 205, "correlation": 80848 } }, { "ph": "f", "id": 80848, "pid": 76337, "tid": -914061504, "ts": 1716454222544124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222609248, "dur": 13, "args": { "External id": 80852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80852, "pid": 5, "tid": 7, "ts": 1716454222609248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544139, "dur": 12, "args": { "External id": 80852, "cbid": 211, "correlation": 80852 } }, { "ph": "s", "id": 80852, "pid": 76337, "tid": -914061504, "ts": 1716454222544139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222609262, "dur": 4, "args": { "External id": 80854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 80854, "pid": 5, "tid": 7, "ts": 1716454222609262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544156, "dur": 5, "args": { "External id": 80854, "cbid": 211, "correlation": 80854 } }, { "ph": "s", "id": 80854, "pid": 76337, "tid": -914061504, "ts": 1716454222544156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222544164, "dur": 0, "args": { "External id": 80855, "cbid": 51, "correlation": 80855 } }, { "ph": "s", "id": 80855, "pid": 76337, "tid": -914061504, "ts": 1716454222544164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222609267, "dur": 98, "args": { "External id": 80856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80856, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80856, "pid": 5, "tid": 7, "ts": 1716454222609267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544165, "dur": 5, "args": { "External id": 80856, "cbid": 211, "correlation": 80856 } }, { "ph": "s", "id": 80856, "pid": 76337, "tid": -914061504, "ts": 1716454222544165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222609367, "dur": 16, "args": { "External id": 80861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80861, "pid": 5, "tid": 7, "ts": 1716454222609367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544192, "dur": 9, "args": { "External id": 80861, "cbid": 211, "correlation": 80861 } }, { "ph": "s", "id": 80861, "pid": 76337, "tid": -914061504, "ts": 1716454222544192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222609385, "dur": 11, "args": { "External id": 80869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80869, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80869, "pid": 5, "tid": 7, "ts": 1716454222609385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544224, "dur": 8, "args": { "External id": 80869, "cbid": 211, "correlation": 80869 } }, { "ph": "s", "id": 80869, "pid": 76337, "tid": -914061504, "ts": 1716454222544224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222609397, "dur": 57, "args": { "External id": 80880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80880, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80880, "pid": 5, "tid": 7, "ts": 1716454222609397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544289, "dur": 11, "args": { "External id": 80880, "cbid": 211, "correlation": 80880 } }, { "ph": "s", "id": 80880, "pid": 76337, "tid": -914061504, "ts": 1716454222544289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222544344, "dur": 0, "args": { "External id": 80890, "cbid": 317, "correlation": 80890 } }, { "ph": "f", "id": 80890, "pid": 76337, "tid": -914061504, "ts": 1716454222544344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222544345, "dur": 0, "args": { "External id": 80891, "cbid": 203, "correlation": 80891 } }, { "ph": "f", "id": 80891, "pid": 76337, "tid": -914061504, "ts": 1716454222544345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222544346, "dur": 0, "args": { "External id": 80892, "cbid": 205, "correlation": 80892 } }, { "ph": "f", "id": 80892, "pid": 76337, "tid": -914061504, "ts": 1716454222544346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222609456, "dur": 39, "args": { "External id": 80896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80896, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80896, "pid": 5, "tid": 7, "ts": 1716454222609456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544362, "dur": 12, "args": { "External id": 80896, "cbid": 211, "correlation": 80896 } }, { "ph": "s", "id": 80896, "pid": 76337, "tid": -914061504, "ts": 1716454222544362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222609497, "dur": 165, "args": { "External id": 80898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80898, "pid": 5, "tid": 7, "ts": 1716454222609497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544376, "dur": 5, "args": { "External id": 80898, "cbid": 211, "correlation": 80898 } }, { "ph": "s", "id": 80898, "pid": 76337, "tid": -914061504, "ts": 1716454222544376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222609663, "dur": 1960, "args": { "External id": 80900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80900, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80900, "pid": 5, "tid": 7, "ts": 1716454222609663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544388, "dur": 8, "args": { "External id": 80900, "cbid": 211, "correlation": 80900 } }, { "ph": "s", "id": 80900, "pid": 76337, "tid": -914061504, "ts": 1716454222544388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222611624, "dur": 40, "args": { "External id": 80902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80902, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80902, "pid": 5, "tid": 7, "ts": 1716454222611624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544400, "dur": 5, "args": { "External id": 80902, "cbid": 211, "correlation": 80902 } }, { "ph": "s", "id": 80902, "pid": 76337, "tid": -914061504, "ts": 1716454222544400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222611665, "dur": 60, "args": { "External id": 80908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80908, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80908, "pid": 5, "tid": 7, "ts": 1716454222611665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544428, "dur": 9, "args": { "External id": 80908, "cbid": 211, "correlation": 80908 } }, { "ph": "s", "id": 80908, "pid": 76337, "tid": -914061504, "ts": 1716454222544428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222611726, "dur": 85, "args": { "External id": 80917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80917, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80917, "pid": 5, "tid": 7, "ts": 1716454222611726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544520, "dur": 13, "args": { "External id": 80917, "cbid": 211, "correlation": 80917 } }, { "ph": "s", "id": 80917, "pid": 76337, "tid": -914061504, "ts": 1716454222544520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222611813, "dur": 74, "args": { "External id": 80937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80937, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 80937, "pid": 5, "tid": 7, "ts": 1716454222611813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544591, "dur": 12, "args": { "External id": 80937, "cbid": 211, "correlation": 80937 } }, { "ph": "s", "id": 80937, "pid": 76337, "tid": -914061504, "ts": 1716454222544591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222611888, "dur": 5, "args": { "External id": 80949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80949, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 80949, "pid": 5, "tid": 7, "ts": 1716454222611888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544613, "dur": 6, "args": { "External id": 80949, "cbid": 211, "correlation": 80949 } }, { "ph": "s", "id": 80949, "pid": 76337, "tid": -914061504, "ts": 1716454222544613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222611894, "dur": 82, "args": { "External id": 80952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80952, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80952, "pid": 5, "tid": 7, "ts": 1716454222611894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544631, "dur": 7, "args": { "External id": 80952, "cbid": 211, "correlation": 80952 } }, { "ph": "s", "id": 80952, "pid": 76337, "tid": -914061504, "ts": 1716454222544631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222611978, "dur": 54, "args": { "External id": 80961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80961, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80961, "pid": 5, "tid": 7, "ts": 1716454222611978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544670, "dur": 10, "args": { "External id": 80961, "cbid": 211, "correlation": 80961 } }, { "ph": "s", "id": 80961, "pid": 76337, "tid": -914061504, "ts": 1716454222544670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222544722, "dur": 0, "args": { "External id": 80971, "cbid": 317, "correlation": 80971 } }, { "ph": "f", "id": 80971, "pid": 76337, "tid": -914061504, "ts": 1716454222544722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222544723, "dur": 0, "args": { "External id": 80972, "cbid": 203, "correlation": 80972 } }, { "ph": "f", "id": 80972, "pid": 76337, "tid": -914061504, "ts": 1716454222544723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222544723, "dur": 0, "args": { "External id": 80973, "cbid": 205, "correlation": 80973 } }, { "ph": "f", "id": 80973, "pid": 76337, "tid": -914061504, "ts": 1716454222544723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222612033, "dur": 56, "args": { "External id": 80977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80977, "pid": 5, "tid": 7, "ts": 1716454222612033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544738, "dur": 11, "args": { "External id": 80977, "cbid": 211, "correlation": 80977 } }, { "ph": "s", "id": 80977, "pid": 76337, "tid": -914061504, "ts": 1716454222544738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222612091, "dur": 125, "args": { "External id": 80979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80979, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80979, "pid": 5, "tid": 7, "ts": 1716454222612091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544752, "dur": 5, "args": { "External id": 80979, "cbid": 211, "correlation": 80979 } }, { "ph": "s", "id": 80979, "pid": 76337, "tid": -914061504, "ts": 1716454222544752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222612216, "dur": 1920, "args": { "External id": 80981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80981, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 80981, "pid": 5, "tid": 7, "ts": 1716454222612216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544764, "dur": 6, "args": { "External id": 80981, "cbid": 211, "correlation": 80981 } }, { "ph": "s", "id": 80981, "pid": 76337, "tid": -914061504, "ts": 1716454222544764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222614138, "dur": 20, "args": { "External id": 80983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80983, "pid": 5, "tid": 7, "ts": 1716454222614138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544774, "dur": 5, "args": { "External id": 80983, "cbid": 211, "correlation": 80983 } }, { "ph": "s", "id": 80983, "pid": 76337, "tid": -914061504, "ts": 1716454222544774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222614159, "dur": 33, "args": { "External id": 80989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 80989, "pid": 5, "tid": 7, "ts": 1716454222614159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544801, "dur": 8, "args": { "External id": 80989, "cbid": 211, "correlation": 80989 } }, { "ph": "s", "id": 80989, "pid": 76337, "tid": -914061504, "ts": 1716454222544801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222614194, "dur": 3, "args": { "External id": 80997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 80997, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 80997, "pid": 5, "tid": 7, "ts": 1716454222614194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544845, "dur": 10, "args": { "External id": 80997, "cbid": 211, "correlation": 80997 } }, { "ph": "s", "id": 80997, "pid": 76337, "tid": -914061504, "ts": 1716454222544845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222544912, "dur": 1, "args": { "External id": 81013, "cbid": 251, "correlation": 81013 } }, { "ph": "f", "id": 81013, "pid": 76337, "tid": -914061504, "ts": 1716454222544912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222544917, "dur": 0, "args": { "External id": 81015, "cbid": 251, "correlation": 81015 } }, { "ph": "f", "id": 81015, "pid": 76337, "tid": -914061504, "ts": 1716454222544917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222614198, "dur": 12, "args": { "External id": 81016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81016, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 81016, "pid": 5, "tid": 7, "ts": 1716454222614198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544919, "dur": 12, "args": { "External id": 81016, "cbid": 211, "correlation": 81016 } }, { "ph": "s", "id": 81016, "pid": 76337, "tid": -914061504, "ts": 1716454222544919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222614212, "dur": 5, "args": { "External id": 81018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81018, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 81018, "pid": 5, "tid": 7, "ts": 1716454222614212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544933, "dur": 5, "args": { "External id": 81018, "cbid": 211, "correlation": 81018 } }, { "ph": "s", "id": 81018, "pid": 76337, "tid": -914061504, "ts": 1716454222544933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222614219, "dur": 29, "args": { "External id": 81028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81028, "pid": 5, "tid": 7, "ts": 1716454222614219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222544999, "dur": 12, "args": { "External id": 81028, "cbid": 211, "correlation": 81028 } }, { "ph": "s", "id": 81028, "pid": 76337, "tid": -914061504, "ts": 1716454222544999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222614249, "dur": 30, "args": { "External id": 81048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81048, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 81048, "pid": 5, "tid": 7, "ts": 1716454222614249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545065, "dur": 11, "args": { "External id": 81048, "cbid": 211, "correlation": 81048 } }, { "ph": "s", "id": 81048, "pid": 76337, "tid": -914061504, "ts": 1716454222545065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222614281, "dur": 4, "args": { "External id": 81060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81060, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 81060, "pid": 5, "tid": 7, "ts": 1716454222614281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545087, "dur": 6, "args": { "External id": 81060, "cbid": 211, "correlation": 81060 } }, { "ph": "s", "id": 81060, "pid": 76337, "tid": -914061504, "ts": 1716454222545087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222614287, "dur": 31, "args": { "External id": 81063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81063, "pid": 5, "tid": 7, "ts": 1716454222614287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545104, "dur": 7, "args": { "External id": 81063, "cbid": 211, "correlation": 81063 } }, { "ph": "s", "id": 81063, "pid": 76337, "tid": -914061504, "ts": 1716454222545104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222614319, "dur": 21, "args": { "External id": 81072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81072, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81072, "pid": 5, "tid": 7, "ts": 1716454222614319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545146, "dur": 10, "args": { "External id": 81072, "cbid": 211, "correlation": 81072 } }, { "ph": "s", "id": 81072, "pid": 76337, "tid": -914061504, "ts": 1716454222545146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222545208, "dur": 0, "args": { "External id": 81082, "cbid": 317, "correlation": 81082 } }, { "ph": "f", "id": 81082, "pid": 76337, "tid": -914061504, "ts": 1716454222545208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222545208, "dur": 0, "args": { "External id": 81083, "cbid": 203, "correlation": 81083 } }, { "ph": "f", "id": 81083, "pid": 76337, "tid": -914061504, "ts": 1716454222545208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222545209, "dur": 0, "args": { "External id": 81084, "cbid": 205, "correlation": 81084 } }, { "ph": "f", "id": 81084, "pid": 76337, "tid": -914061504, "ts": 1716454222545209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222614341, "dur": 22, "args": { "External id": 81088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81088, "pid": 5, "tid": 7, "ts": 1716454222614341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545224, "dur": 12, "args": { "External id": 81088, "cbid": 211, "correlation": 81088 } }, { "ph": "s", "id": 81088, "pid": 76337, "tid": -914061504, "ts": 1716454222545224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222614364, "dur": 45, "args": { "External id": 81090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81090, "pid": 5, "tid": 7, "ts": 1716454222614364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545239, "dur": 6, "args": { "External id": 81090, "cbid": 211, "correlation": 81090 } }, { "ph": "s", "id": 81090, "pid": 76337, "tid": -914061504, "ts": 1716454222545239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222614410, "dur": 653, "args": { "External id": 81092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81092, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81092, "pid": 5, "tid": 7, "ts": 1716454222614410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545251, "dur": 6, "args": { "External id": 81092, "cbid": 211, "correlation": 81092 } }, { "ph": "s", "id": 81092, "pid": 76337, "tid": -914061504, "ts": 1716454222545251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222615065, "dur": 22, "args": { "External id": 81094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81094, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81094, "pid": 5, "tid": 7, "ts": 1716454222615065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545261, "dur": 5, "args": { "External id": 81094, "cbid": 211, "correlation": 81094 } }, { "ph": "s", "id": 81094, "pid": 76337, "tid": -914061504, "ts": 1716454222545261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222615088, "dur": 33, "args": { "External id": 81100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81100, "pid": 5, "tid": 7, "ts": 1716454222615088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545289, "dur": 9, "args": { "External id": 81100, "cbid": 211, "correlation": 81100 } }, { "ph": "s", "id": 81100, "pid": 76337, "tid": -914061504, "ts": 1716454222545289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222545347, "dur": 0, "args": { "External id": 81110, "cbid": 317, "correlation": 81110 } }, { "ph": "f", "id": 81110, "pid": 76337, "tid": -914061504, "ts": 1716454222545347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222545348, "dur": 0, "args": { "External id": 81111, "cbid": 203, "correlation": 81111 } }, { "ph": "f", "id": 81111, "pid": 76337, "tid": -914061504, "ts": 1716454222545348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222545348, "dur": 0, "args": { "External id": 81112, "cbid": 205, "correlation": 81112 } }, { "ph": "f", "id": 81112, "pid": 76337, "tid": -914061504, "ts": 1716454222545348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222615123, "dur": 55, "args": { "External id": 81116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81116, "pid": 5, "tid": 7, "ts": 1716454222615123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545361, "dur": 12, "args": { "External id": 81116, "cbid": 211, "correlation": 81116 } }, { "ph": "s", "id": 81116, "pid": 76337, "tid": -914061504, "ts": 1716454222545361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222615179, "dur": 274, "args": { "External id": 81118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81118, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81118, "pid": 5, "tid": 7, "ts": 1716454222615179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545378, "dur": 7, "args": { "External id": 81118, "cbid": 211, "correlation": 81118 } }, { "ph": "s", "id": 81118, "pid": 76337, "tid": -914061504, "ts": 1716454222545378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222615455, "dur": 21, "args": { "External id": 81120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81120, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81120, "pid": 5, "tid": 7, "ts": 1716454222615455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545390, "dur": 6, "args": { "External id": 81120, "cbid": 211, "correlation": 81120 } }, { "ph": "s", "id": 81120, "pid": 76337, "tid": -914061504, "ts": 1716454222545390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222615478, "dur": 33, "args": { "External id": 81126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81126, "pid": 5, "tid": 7, "ts": 1716454222615478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545416, "dur": 9, "args": { "External id": 81126, "cbid": 211, "correlation": 81126 } }, { "ph": "s", "id": 81126, "pid": 76337, "tid": -914061504, "ts": 1716454222545416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222615512, "dur": 27, "args": { "External id": 81134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81134, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81134, "pid": 5, "tid": 7, "ts": 1716454222615512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545445, "dur": 8, "args": { "External id": 81134, "cbid": 211, "correlation": 81134 } }, { "ph": "s", "id": 81134, "pid": 76337, "tid": -914061504, "ts": 1716454222545445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222615540, "dur": 20, "args": { "External id": 81142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81142, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81142, "pid": 5, "tid": 7, "ts": 1716454222615540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545473, "dur": 8, "args": { "External id": 81142, "cbid": 211, "correlation": 81142 } }, { "ph": "s", "id": 81142, "pid": 76337, "tid": -914061504, "ts": 1716454222545473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222615562, "dur": 31, "args": { "External id": 81162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81162, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 81162, "pid": 5, "tid": 7, "ts": 1716454222615562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545555, "dur": 13, "args": { "External id": 81162, "cbid": 211, "correlation": 81162 } }, { "ph": "s", "id": 81162, "pid": 76337, "tid": -914061504, "ts": 1716454222545555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222615594, "dur": 4, "args": { "External id": 81174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81174, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 81174, "pid": 5, "tid": 7, "ts": 1716454222615594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545578, "dur": 6, "args": { "External id": 81174, "cbid": 211, "correlation": 81174 } }, { "ph": "s", "id": 81174, "pid": 76337, "tid": -914061504, "ts": 1716454222545578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222615599, "dur": 31, "args": { "External id": 81177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81177, "pid": 5, "tid": 7, "ts": 1716454222615599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545595, "dur": 6, "args": { "External id": 81177, "cbid": 211, "correlation": 81177 } }, { "ph": "s", "id": 81177, "pid": 76337, "tid": -914061504, "ts": 1716454222545595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222545652, "dur": 0, "args": { "External id": 81188, "cbid": 317, "correlation": 81188 } }, { "ph": "f", "id": 81188, "pid": 76337, "tid": -914061504, "ts": 1716454222545652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222545653, "dur": 0, "args": { "External id": 81189, "cbid": 203, "correlation": 81189 } }, { "ph": "f", "id": 81189, "pid": 76337, "tid": -914061504, "ts": 1716454222545653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222545653, "dur": 0, "args": { "External id": 81190, "cbid": 205, "correlation": 81190 } }, { "ph": "f", "id": 81190, "pid": 76337, "tid": -914061504, "ts": 1716454222545653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222615632, "dur": 22, "args": { "External id": 81194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81194, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81194, "pid": 5, "tid": 7, "ts": 1716454222615632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545668, "dur": 11, "args": { "External id": 81194, "cbid": 211, "correlation": 81194 } }, { "ph": "s", "id": 81194, "pid": 76337, "tid": -914061504, "ts": 1716454222545668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222615655, "dur": 107, "args": { "External id": 81196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81196, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81196, "pid": 5, "tid": 7, "ts": 1716454222615655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545685, "dur": 6, "args": { "External id": 81196, "cbid": 211, "correlation": 81196 } }, { "ph": "s", "id": 81196, "pid": 76337, "tid": -914061504, "ts": 1716454222545685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222615763, "dur": 23, "args": { "External id": 81198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81198, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81198, "pid": 5, "tid": 7, "ts": 1716454222615763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545695, "dur": 5, "args": { "External id": 81198, "cbid": 211, "correlation": 81198 } }, { "ph": "s", "id": 81198, "pid": 76337, "tid": -914061504, "ts": 1716454222545695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222615787, "dur": 33, "args": { "External id": 81204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81204, "pid": 5, "tid": 7, "ts": 1716454222615787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545723, "dur": 8, "args": { "External id": 81204, "cbid": 211, "correlation": 81204 } }, { "ph": "s", "id": 81204, "pid": 76337, "tid": -914061504, "ts": 1716454222545723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222615822, "dur": 188, "args": { "External id": 81213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81213, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81213, "pid": 5, "tid": 7, "ts": 1716454222615822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545805, "dur": 14, "args": { "External id": 81213, "cbid": 211, "correlation": 81213 } }, { "ph": "s", "id": 81213, "pid": 76337, "tid": -914061504, "ts": 1716454222545805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222616011, "dur": 66, "args": { "External id": 81235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81235, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81235, "pid": 5, "tid": 7, "ts": 1716454222616011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545862, "dur": 10, "args": { "External id": 81235, "cbid": 211, "correlation": 81235 } }, { "ph": "s", "id": 81235, "pid": 76337, "tid": -914061504, "ts": 1716454222545862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222545952, "dur": 1, "args": { "External id": 81246, "cbid": 251, "correlation": 81246 } }, { "ph": "f", "id": 81246, "pid": 76337, "tid": -914061504, "ts": 1716454222545952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222616078, "dur": 157, "args": { "External id": 81247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81247, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81247, "pid": 5, "tid": 7, "ts": 1716454222616078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222545957, "dur": 13, "args": { "External id": 81247, "cbid": 211, "correlation": 81247 } }, { "ph": "s", "id": 81247, "pid": 76337, "tid": -914061504, "ts": 1716454222545957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546034, "dur": 1, "args": { "External id": 81258, "cbid": 251, "correlation": 81258 } }, { "ph": "f", "id": 81258, "pid": 76337, "tid": -914061504, "ts": 1716454222546034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222616236, "dur": 148, "args": { "External id": 81259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81259, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81259, "pid": 5, "tid": 7, "ts": 1716454222616236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546039, "dur": 12, "args": { "External id": 81259, "cbid": 211, "correlation": 81259 } }, { "ph": "s", "id": 81259, "pid": 76337, "tid": -914061504, "ts": 1716454222546039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546105, "dur": 1, "args": { "External id": 81270, "cbid": 251, "correlation": 81270 } }, { "ph": "f", "id": 81270, "pid": 76337, "tid": -914061504, "ts": 1716454222546105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222616386, "dur": 146, "args": { "External id": 81271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81271, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81271, "pid": 5, "tid": 7, "ts": 1716454222616386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546110, "dur": 11, "args": { "External id": 81271, "cbid": 211, "correlation": 81271 } }, { "ph": "s", "id": 81271, "pid": 76337, "tid": -914061504, "ts": 1716454222546110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222616533, "dur": 1991, "args": { "External id": 81292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81292, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 81292, "pid": 5, "tid": 7, "ts": 1716454222616533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546191, "dur": 13, "args": { "External id": 81292, "cbid": 211, "correlation": 81292 } }, { "ph": "s", "id": 81292, "pid": 76337, "tid": -914061504, "ts": 1716454222546191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546291, "dur": 1, "args": { "External id": 81310, "cbid": 251, "correlation": 81310 } }, { "ph": "f", "id": 81310, "pid": 76337, "tid": -914061504, "ts": 1716454222546291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222618526, "dur": 153, "args": { "External id": 81312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81312, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 81312, "pid": 5, "tid": 7, "ts": 1716454222618526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546297, "dur": 14, "args": { "External id": 81312, "cbid": 211, "correlation": 81312 } }, { "ph": "s", "id": 81312, "pid": 76337, "tid": -914061504, "ts": 1716454222546297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222618680, "dur": 35, "args": { "External id": 81320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81320, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81320, "pid": 5, "tid": 7, "ts": 1716454222618680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546369, "dur": 12, "args": { "External id": 81320, "cbid": 211, "correlation": 81320 } }, { "ph": "s", "id": 81320, "pid": 76337, "tid": -914061504, "ts": 1716454222546369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222618717, "dur": 51, "args": { "External id": 81328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81328, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81328, "pid": 5, "tid": 7, "ts": 1716454222618717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546408, "dur": 9, "args": { "External id": 81328, "cbid": 211, "correlation": 81328 } }, { "ph": "s", "id": 81328, "pid": 76337, "tid": -914061504, "ts": 1716454222546408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222618769, "dur": 30, "args": { "External id": 81339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81339, "pid": 5, "tid": 7, "ts": 1716454222618769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546480, "dur": 13, "args": { "External id": 81339, "cbid": 211, "correlation": 81339 } }, { "ph": "s", "id": 81339, "pid": 76337, "tid": -914061504, "ts": 1716454222546480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222618801, "dur": 35, "args": { "External id": 81361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81361, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81361, "pid": 5, "tid": 7, "ts": 1716454222618801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546511, "dur": 8, "args": { "External id": 81361, "cbid": 211, "correlation": 81361 } }, { "ph": "s", "id": 81361, "pid": 76337, "tid": -914061504, "ts": 1716454222546511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546596, "dur": 1, "args": { "External id": 81372, "cbid": 251, "correlation": 81372 } }, { "ph": "f", "id": 81372, "pid": 76337, "tid": -914061504, "ts": 1716454222546596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222618837, "dur": 92, "args": { "External id": 81373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81373, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81373, "pid": 5, "tid": 7, "ts": 1716454222618837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546601, "dur": 13, "args": { "External id": 81373, "cbid": 211, "correlation": 81373 } }, { "ph": "s", "id": 81373, "pid": 76337, "tid": -914061504, "ts": 1716454222546601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546671, "dur": 1, "args": { "External id": 81384, "cbid": 251, "correlation": 81384 } }, { "ph": "f", "id": 81384, "pid": 76337, "tid": -914061504, "ts": 1716454222546671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546675, "dur": 0, "args": { "External id": 81385, "cbid": 251, "correlation": 81385 } }, { "ph": "f", "id": 81385, "pid": 76337, "tid": -914061504, "ts": 1716454222546675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222618930, "dur": 12, "args": { "External id": 81386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81386, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 81386, "pid": 5, "tid": 7, "ts": 1716454222618930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546677, "dur": 12, "args": { "External id": 81386, "cbid": 211, "correlation": 81386 } }, { "ph": "s", "id": 81386, "pid": 76337, "tid": -914061504, "ts": 1716454222546677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222618943, "dur": 5, "args": { "External id": 81388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81388, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 81388, "pid": 5, "tid": 7, "ts": 1716454222618943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546691, "dur": 6, "args": { "External id": 81388, "cbid": 211, "correlation": 81388 } }, { "ph": "s", "id": 81388, "pid": 76337, "tid": -914061504, "ts": 1716454222546691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546748, "dur": 1, "args": { "External id": 81399, "cbid": 251, "correlation": 81399 } }, { "ph": "f", "id": 81399, "pid": 76337, "tid": -914061504, "ts": 1716454222546748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546751, "dur": 0, "args": { "External id": 81400, "cbid": 251, "correlation": 81400 } }, { "ph": "f", "id": 81400, "pid": 76337, "tid": -914061504, "ts": 1716454222546751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222618949, "dur": 7, "args": { "External id": 81401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81401, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 81401, "pid": 5, "tid": 7, "ts": 1716454222618949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546753, "dur": 11, "args": { "External id": 81401, "cbid": 211, "correlation": 81401 } }, { "ph": "s", "id": 81401, "pid": 76337, "tid": -914061504, "ts": 1716454222546753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222618958, "dur": 4, "args": { "External id": 81403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81403, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 81403, "pid": 5, "tid": 7, "ts": 1716454222618958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546765, "dur": 5, "args": { "External id": 81403, "cbid": 211, "correlation": 81403 } }, { "ph": "s", "id": 81403, "pid": 76337, "tid": -914061504, "ts": 1716454222546765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222618963, "dur": 95, "args": { "External id": 81424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81424, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 81424, "pid": 5, "tid": 7, "ts": 1716454222618963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546839, "dur": 13, "args": { "External id": 81424, "cbid": 211, "correlation": 81424 } }, { "ph": "s", "id": 81424, "pid": 76337, "tid": -914061504, "ts": 1716454222546839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222546936, "dur": 1, "args": { "External id": 81442, "cbid": 251, "correlation": 81442 } }, { "ph": "f", "id": 81442, "pid": 76337, "tid": -914061504, "ts": 1716454222546936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222619058, "dur": 101, "args": { "External id": 81444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81444, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81444, "pid": 5, "tid": 7, "ts": 1716454222619058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222546942, "dur": 13, "args": { "External id": 81444, "cbid": 211, "correlation": 81444 } }, { "ph": "s", "id": 81444, "pid": 76337, "tid": -914061504, "ts": 1716454222546942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222619160, "dur": 20, "args": { "External id": 81452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81452, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81452, "pid": 5, "tid": 7, "ts": 1716454222619160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547020, "dur": 12, "args": { "External id": 81452, "cbid": 211, "correlation": 81452 } }, { "ph": "s", "id": 81452, "pid": 76337, "tid": -914061504, "ts": 1716454222547020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222619181, "dur": 37, "args": { "External id": 81460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81460, "pid": 5, "tid": 7, "ts": 1716454222619181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547062, "dur": 9, "args": { "External id": 81460, "cbid": 211, "correlation": 81460 } }, { "ph": "s", "id": 81460, "pid": 76337, "tid": -914061504, "ts": 1716454222547062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222619220, "dur": 35, "args": { "External id": 81482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81482, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81482, "pid": 5, "tid": 7, "ts": 1716454222619220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547113, "dur": 10, "args": { "External id": 81482, "cbid": 211, "correlation": 81482 } }, { "ph": "s", "id": 81482, "pid": 76337, "tid": -914061504, "ts": 1716454222547113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222547204, "dur": 1, "args": { "External id": 81498, "cbid": 251, "correlation": 81498 } }, { "ph": "f", "id": 81498, "pid": 76337, "tid": -914061504, "ts": 1716454222547204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222547209, "dur": 0, "args": { "External id": 81500, "cbid": 251, "correlation": 81500 } }, { "ph": "f", "id": 81500, "pid": 76337, "tid": -914061504, "ts": 1716454222547209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222619257, "dur": 551, "args": { "External id": 81501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81501, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 81501, "pid": 5, "tid": 7, "ts": 1716454222619257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547213, "dur": 13, "args": { "External id": 81501, "cbid": 211, "correlation": 81501 } }, { "ph": "s", "id": 81501, "pid": 76337, "tid": -914061504, "ts": 1716454222547213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222619809, "dur": 128, "args": { "External id": 81509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81509, "pid": 5, "tid": 7, "ts": 1716454222619809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547278, "dur": 13, "args": { "External id": 81509, "cbid": 211, "correlation": 81509 } }, { "ph": "s", "id": 81509, "pid": 76337, "tid": -914061504, "ts": 1716454222547278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222619938, "dur": 130, "args": { "External id": 81517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81517, "pid": 5, "tid": 7, "ts": 1716454222619938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547310, "dur": 8, "args": { "External id": 81517, "cbid": 211, "correlation": 81517 } }, { "ph": "s", "id": 81517, "pid": 76337, "tid": -914061504, "ts": 1716454222547310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222547386, "dur": 1, "args": { "External id": 81533, "cbid": 251, "correlation": 81533 } }, { "ph": "f", "id": 81533, "pid": 76337, "tid": -914061504, "ts": 1716454222547386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222620070, "dur": 306, "args": { "External id": 81535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81535, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81535, "pid": 5, "tid": 7, "ts": 1716454222620070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547392, "dur": 12, "args": { "External id": 81535, "cbid": 211, "correlation": 81535 } }, { "ph": "s", "id": 81535, "pid": 76337, "tid": -914061504, "ts": 1716454222547392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222620377, "dur": 27, "args": { "External id": 81543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81543, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81543, "pid": 5, "tid": 7, "ts": 1716454222620377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547434, "dur": 10, "args": { "External id": 81543, "cbid": 211, "correlation": 81543 } }, { "ph": "s", "id": 81543, "pid": 76337, "tid": -914061504, "ts": 1716454222547434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222620405, "dur": 83, "args": { "External id": 81554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81554, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81554, "pid": 5, "tid": 7, "ts": 1716454222620405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547502, "dur": 13, "args": { "External id": 81554, "cbid": 211, "correlation": 81554 } }, { "ph": "s", "id": 81554, "pid": 76337, "tid": -914061504, "ts": 1716454222547502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222547566, "dur": 0, "args": { "External id": 81566, "cbid": 317, "correlation": 81566 } }, { "ph": "f", "id": 81566, "pid": 76337, "tid": -914061504, "ts": 1716454222547566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222547567, "dur": 0, "args": { "External id": 81567, "cbid": 203, "correlation": 81567 } }, { "ph": "f", "id": 81567, "pid": 76337, "tid": -914061504, "ts": 1716454222547567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222547568, "dur": 0, "args": { "External id": 81568, "cbid": 205, "correlation": 81568 } }, { "ph": "f", "id": 81568, "pid": 76337, "tid": -914061504, "ts": 1716454222547568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222620490, "dur": 23, "args": { "External id": 81572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81572, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81572, "pid": 5, "tid": 7, "ts": 1716454222620490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547583, "dur": 12, "args": { "External id": 81572, "cbid": 211, "correlation": 81572 } }, { "ph": "s", "id": 81572, "pid": 76337, "tid": -914061504, "ts": 1716454222547583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222620514, "dur": 122, "args": { "External id": 81574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81574, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81574, "pid": 5, "tid": 7, "ts": 1716454222620514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547601, "dur": 6, "args": { "External id": 81574, "cbid": 211, "correlation": 81574 } }, { "ph": "s", "id": 81574, "pid": 76337, "tid": -914061504, "ts": 1716454222547601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222620638, "dur": 23, "args": { "External id": 81576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81576, "pid": 5, "tid": 7, "ts": 1716454222620638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547611, "dur": 6, "args": { "External id": 81576, "cbid": 211, "correlation": 81576 } }, { "ph": "s", "id": 81576, "pid": 76337, "tid": -914061504, "ts": 1716454222547611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222620663, "dur": 33, "args": { "External id": 81582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81582, "pid": 5, "tid": 7, "ts": 1716454222620663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547639, "dur": 8, "args": { "External id": 81582, "cbid": 211, "correlation": 81582 } }, { "ph": "s", "id": 81582, "pid": 76337, "tid": -914061504, "ts": 1716454222547639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222620697, "dur": 27, "args": { "External id": 81590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81590, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81590, "pid": 5, "tid": 7, "ts": 1716454222620697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547671, "dur": 8, "args": { "External id": 81590, "cbid": 211, "correlation": 81590 } }, { "ph": "s", "id": 81590, "pid": 76337, "tid": -914061504, "ts": 1716454222547671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222620725, "dur": 55, "args": { "External id": 81599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81599, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81599, "pid": 5, "tid": 7, "ts": 1716454222620725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547708, "dur": 11, "args": { "External id": 81599, "cbid": 211, "correlation": 81599 } }, { "ph": "s", "id": 81599, "pid": 76337, "tid": -914061504, "ts": 1716454222547708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222620781, "dur": 52, "args": { "External id": 81619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81619, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 81619, "pid": 5, "tid": 7, "ts": 1716454222620781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547780, "dur": 12, "args": { "External id": 81619, "cbid": 211, "correlation": 81619 } }, { "ph": "s", "id": 81619, "pid": 76337, "tid": -914061504, "ts": 1716454222547780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222620835, "dur": 5, "args": { "External id": 81631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81631, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 81631, "pid": 5, "tid": 7, "ts": 1716454222620835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547802, "dur": 6, "args": { "External id": 81631, "cbid": 211, "correlation": 81631 } }, { "ph": "s", "id": 81631, "pid": 76337, "tid": -914061504, "ts": 1716454222547802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222620841, "dur": 57, "args": { "External id": 81634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81634, "pid": 5, "tid": 7, "ts": 1716454222620841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547821, "dur": 7, "args": { "External id": 81634, "cbid": 211, "correlation": 81634 } }, { "ph": "s", "id": 81634, "pid": 76337, "tid": -914061504, "ts": 1716454222547821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222620899, "dur": 37, "args": { "External id": 81643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81643, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81643, "pid": 5, "tid": 7, "ts": 1716454222620899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547859, "dur": 10, "args": { "External id": 81643, "cbid": 211, "correlation": 81643 } }, { "ph": "s", "id": 81643, "pid": 76337, "tid": -914061504, "ts": 1716454222547859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222547910, "dur": 0, "args": { "External id": 81653, "cbid": 317, "correlation": 81653 } }, { "ph": "f", "id": 81653, "pid": 76337, "tid": -914061504, "ts": 1716454222547910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222547911, "dur": 0, "args": { "External id": 81654, "cbid": 203, "correlation": 81654 } }, { "ph": "f", "id": 81654, "pid": 76337, "tid": -914061504, "ts": 1716454222547911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222547912, "dur": 0, "args": { "External id": 81655, "cbid": 205, "correlation": 81655 } }, { "ph": "f", "id": 81655, "pid": 76337, "tid": -914061504, "ts": 1716454222547912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222620938, "dur": 39, "args": { "External id": 81659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81659, "pid": 5, "tid": 7, "ts": 1716454222620938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547927, "dur": 11, "args": { "External id": 81659, "cbid": 211, "correlation": 81659 } }, { "ph": "s", "id": 81659, "pid": 76337, "tid": -914061504, "ts": 1716454222547927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222620978, "dur": 85, "args": { "External id": 81661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81661, "pid": 5, "tid": 7, "ts": 1716454222620978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547941, "dur": 5, "args": { "External id": 81661, "cbid": 211, "correlation": 81661 } }, { "ph": "s", "id": 81661, "pid": 76337, "tid": -914061504, "ts": 1716454222547941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222621065, "dur": 1295, "args": { "External id": 81663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81663, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81663, "pid": 5, "tid": 7, "ts": 1716454222621065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547952, "dur": 6, "args": { "External id": 81663, "cbid": 211, "correlation": 81663 } }, { "ph": "s", "id": 81663, "pid": 76337, "tid": -914061504, "ts": 1716454222547952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222622361, "dur": 22, "args": { "External id": 81665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81665, "pid": 5, "tid": 7, "ts": 1716454222622361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547962, "dur": 5, "args": { "External id": 81665, "cbid": 211, "correlation": 81665 } }, { "ph": "s", "id": 81665, "pid": 76337, "tid": -914061504, "ts": 1716454222547962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222622384, "dur": 33, "args": { "External id": 81671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81671, "pid": 5, "tid": 7, "ts": 1716454222622384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222547997, "dur": 9, "args": { "External id": 81671, "cbid": 211, "correlation": 81671 } }, { "ph": "s", "id": 81671, "pid": 76337, "tid": -914061504, "ts": 1716454222547997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222622419, "dur": 3, "args": { "External id": 81679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81679, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 81679, "pid": 5, "tid": 7, "ts": 1716454222622419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548042, "dur": 10, "args": { "External id": 81679, "cbid": 211, "correlation": 81679 } }, { "ph": "s", "id": 81679, "pid": 76337, "tid": -914061504, "ts": 1716454222548042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222548106, "dur": 1, "args": { "External id": 81695, "cbid": 251, "correlation": 81695 } }, { "ph": "f", "id": 81695, "pid": 76337, "tid": -914061504, "ts": 1716454222548106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222548111, "dur": 0, "args": { "External id": 81697, "cbid": 251, "correlation": 81697 } }, { "ph": "f", "id": 81697, "pid": 76337, "tid": -914061504, "ts": 1716454222548111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222622423, "dur": 12, "args": { "External id": 81698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81698, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 81698, "pid": 5, "tid": 7, "ts": 1716454222622423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548113, "dur": 11, "args": { "External id": 81698, "cbid": 211, "correlation": 81698 } }, { "ph": "s", "id": 81698, "pid": 76337, "tid": -914061504, "ts": 1716454222548113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222622437, "dur": 5, "args": { "External id": 81700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81700, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 81700, "pid": 5, "tid": 7, "ts": 1716454222622437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548126, "dur": 6, "args": { "External id": 81700, "cbid": 211, "correlation": 81700 } }, { "ph": "s", "id": 81700, "pid": 76337, "tid": -914061504, "ts": 1716454222548126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222622443, "dur": 30, "args": { "External id": 81710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81710, "pid": 5, "tid": 7, "ts": 1716454222622443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548185, "dur": 12, "args": { "External id": 81710, "cbid": 211, "correlation": 81710 } }, { "ph": "s", "id": 81710, "pid": 76337, "tid": -914061504, "ts": 1716454222548185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222622474, "dur": 31, "args": { "External id": 81730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81730, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 81730, "pid": 5, "tid": 7, "ts": 1716454222622474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548251, "dur": 12, "args": { "External id": 81730, "cbid": 211, "correlation": 81730 } }, { "ph": "s", "id": 81730, "pid": 76337, "tid": -914061504, "ts": 1716454222548251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222622507, "dur": 5, "args": { "External id": 81742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81742, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 81742, "pid": 5, "tid": 7, "ts": 1716454222622507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548273, "dur": 6, "args": { "External id": 81742, "cbid": 211, "correlation": 81742 } }, { "ph": "s", "id": 81742, "pid": 76337, "tid": -914061504, "ts": 1716454222548273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222622512, "dur": 31, "args": { "External id": 81745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81745, "pid": 5, "tid": 7, "ts": 1716454222622512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548291, "dur": 7, "args": { "External id": 81745, "cbid": 211, "correlation": 81745 } }, { "ph": "s", "id": 81745, "pid": 76337, "tid": -914061504, "ts": 1716454222548291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222622544, "dur": 20, "args": { "External id": 81754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81754, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81754, "pid": 5, "tid": 7, "ts": 1716454222622544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548331, "dur": 10, "args": { "External id": 81754, "cbid": 211, "correlation": 81754 } }, { "ph": "s", "id": 81754, "pid": 76337, "tid": -914061504, "ts": 1716454222548331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222548393, "dur": 0, "args": { "External id": 81764, "cbid": 317, "correlation": 81764 } }, { "ph": "f", "id": 81764, "pid": 76337, "tid": -914061504, "ts": 1716454222548393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222548393, "dur": 0, "args": { "External id": 81765, "cbid": 203, "correlation": 81765 } }, { "ph": "f", "id": 81765, "pid": 76337, "tid": -914061504, "ts": 1716454222548393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222548394, "dur": 0, "args": { "External id": 81766, "cbid": 205, "correlation": 81766 } }, { "ph": "f", "id": 81766, "pid": 76337, "tid": -914061504, "ts": 1716454222548394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222622566, "dur": 22, "args": { "External id": 81770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81770, "pid": 5, "tid": 7, "ts": 1716454222622566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548408, "dur": 11, "args": { "External id": 81770, "cbid": 211, "correlation": 81770 } }, { "ph": "s", "id": 81770, "pid": 76337, "tid": -914061504, "ts": 1716454222548408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222622589, "dur": 44, "args": { "External id": 81772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81772, "pid": 5, "tid": 7, "ts": 1716454222622589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548422, "dur": 5, "args": { "External id": 81772, "cbid": 211, "correlation": 81772 } }, { "ph": "s", "id": 81772, "pid": 76337, "tid": -914061504, "ts": 1716454222548422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222622635, "dur": 654, "args": { "External id": 81774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81774, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81774, "pid": 5, "tid": 7, "ts": 1716454222622635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548433, "dur": 6, "args": { "External id": 81774, "cbid": 211, "correlation": 81774 } }, { "ph": "s", "id": 81774, "pid": 76337, "tid": -914061504, "ts": 1716454222548433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222623290, "dur": 21, "args": { "External id": 81776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81776, "pid": 5, "tid": 7, "ts": 1716454222623290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548443, "dur": 6, "args": { "External id": 81776, "cbid": 211, "correlation": 81776 } }, { "ph": "s", "id": 81776, "pid": 76337, "tid": -914061504, "ts": 1716454222548443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222623312, "dur": 33, "args": { "External id": 81782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81782, "pid": 5, "tid": 7, "ts": 1716454222623312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548471, "dur": 9, "args": { "External id": 81782, "cbid": 211, "correlation": 81782 } }, { "ph": "s", "id": 81782, "pid": 76337, "tid": -914061504, "ts": 1716454222548471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222548529, "dur": 0, "args": { "External id": 81792, "cbid": 317, "correlation": 81792 } }, { "ph": "f", "id": 81792, "pid": 76337, "tid": -914061504, "ts": 1716454222548529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222548530, "dur": 0, "args": { "External id": 81793, "cbid": 203, "correlation": 81793 } }, { "ph": "f", "id": 81793, "pid": 76337, "tid": -914061504, "ts": 1716454222548530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222548531, "dur": 0, "args": { "External id": 81794, "cbid": 205, "correlation": 81794 } }, { "ph": "f", "id": 81794, "pid": 76337, "tid": -914061504, "ts": 1716454222548531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222623347, "dur": 38, "args": { "External id": 81798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81798, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81798, "pid": 5, "tid": 7, "ts": 1716454222623347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548543, "dur": 12, "args": { "External id": 81798, "cbid": 211, "correlation": 81798 } }, { "ph": "s", "id": 81798, "pid": 76337, "tid": -914061504, "ts": 1716454222548543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222623386, "dur": 193, "args": { "External id": 81800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81800, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81800, "pid": 5, "tid": 7, "ts": 1716454222623386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548561, "dur": 6, "args": { "External id": 81800, "cbid": 211, "correlation": 81800 } }, { "ph": "s", "id": 81800, "pid": 76337, "tid": -914061504, "ts": 1716454222548561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222623581, "dur": 22, "args": { "External id": 81802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81802, "pid": 5, "tid": 7, "ts": 1716454222623581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548570, "dur": 5, "args": { "External id": 81802, "cbid": 211, "correlation": 81802 } }, { "ph": "s", "id": 81802, "pid": 76337, "tid": -914061504, "ts": 1716454222548570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222623605, "dur": 33, "args": { "External id": 81808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81808, "pid": 5, "tid": 7, "ts": 1716454222623605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548595, "dur": 9, "args": { "External id": 81808, "cbid": 211, "correlation": 81808 } }, { "ph": "s", "id": 81808, "pid": 76337, "tid": -914061504, "ts": 1716454222548595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222623640, "dur": 27, "args": { "External id": 81816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81816, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81816, "pid": 5, "tid": 7, "ts": 1716454222623640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548625, "dur": 8, "args": { "External id": 81816, "cbid": 211, "correlation": 81816 } }, { "ph": "s", "id": 81816, "pid": 76337, "tid": -914061504, "ts": 1716454222548625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222623668, "dur": 19, "args": { "External id": 81824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81824, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81824, "pid": 5, "tid": 7, "ts": 1716454222623668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548653, "dur": 9, "args": { "External id": 81824, "cbid": 211, "correlation": 81824 } }, { "ph": "s", "id": 81824, "pid": 76337, "tid": -914061504, "ts": 1716454222548653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222623689, "dur": 30, "args": { "External id": 81844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81844, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 81844, "pid": 5, "tid": 7, "ts": 1716454222623689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548736, "dur": 12, "args": { "External id": 81844, "cbid": 211, "correlation": 81844 } }, { "ph": "s", "id": 81844, "pid": 76337, "tid": -914061504, "ts": 1716454222548736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222623720, "dur": 4, "args": { "External id": 81856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81856, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 81856, "pid": 5, "tid": 7, "ts": 1716454222623720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548758, "dur": 7, "args": { "External id": 81856, "cbid": 211, "correlation": 81856 } }, { "ph": "s", "id": 81856, "pid": 76337, "tid": -914061504, "ts": 1716454222548758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222623726, "dur": 31, "args": { "External id": 81859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81859, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81859, "pid": 5, "tid": 7, "ts": 1716454222623726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548776, "dur": 6, "args": { "External id": 81859, "cbid": 211, "correlation": 81859 } }, { "ph": "s", "id": 81859, "pid": 76337, "tid": -914061504, "ts": 1716454222548776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222548833, "dur": 0, "args": { "External id": 81870, "cbid": 317, "correlation": 81870 } }, { "ph": "f", "id": 81870, "pid": 76337, "tid": -914061504, "ts": 1716454222548833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222548833, "dur": 0, "args": { "External id": 81871, "cbid": 203, "correlation": 81871 } }, { "ph": "f", "id": 81871, "pid": 76337, "tid": -914061504, "ts": 1716454222548833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222548834, "dur": 0, "args": { "External id": 81872, "cbid": 205, "correlation": 81872 } }, { "ph": "f", "id": 81872, "pid": 76337, "tid": -914061504, "ts": 1716454222548834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222623758, "dur": 23, "args": { "External id": 81876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81876, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81876, "pid": 5, "tid": 7, "ts": 1716454222623758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548847, "dur": 12, "args": { "External id": 81876, "cbid": 211, "correlation": 81876 } }, { "ph": "s", "id": 81876, "pid": 76337, "tid": -914061504, "ts": 1716454222548847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222623782, "dur": 108, "args": { "External id": 81878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81878, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81878, "pid": 5, "tid": 7, "ts": 1716454222623782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548865, "dur": 7, "args": { "External id": 81878, "cbid": 211, "correlation": 81878 } }, { "ph": "s", "id": 81878, "pid": 76337, "tid": -914061504, "ts": 1716454222548865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222623891, "dur": 21, "args": { "External id": 81880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81880, "pid": 5, "tid": 7, "ts": 1716454222623891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548875, "dur": 5, "args": { "External id": 81880, "cbid": 211, "correlation": 81880 } }, { "ph": "s", "id": 81880, "pid": 76337, "tid": -914061504, "ts": 1716454222548875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222623913, "dur": 33, "args": { "External id": 81886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81886, "pid": 5, "tid": 7, "ts": 1716454222623913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548903, "dur": 8, "args": { "External id": 81886, "cbid": 211, "correlation": 81886 } }, { "ph": "s", "id": 81886, "pid": 76337, "tid": -914061504, "ts": 1716454222548903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222623947, "dur": 185, "args": { "External id": 81895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81895, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81895, "pid": 5, "tid": 7, "ts": 1716454222623947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222548992, "dur": 14, "args": { "External id": 81895, "cbid": 211, "correlation": 81895 } }, { "ph": "s", "id": 81895, "pid": 76337, "tid": -914061504, "ts": 1716454222548992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222624134, "dur": 65, "args": { "External id": 81917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81917, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 81917, "pid": 5, "tid": 7, "ts": 1716454222624134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549050, "dur": 10, "args": { "External id": 81917, "cbid": 211, "correlation": 81917 } }, { "ph": "s", "id": 81917, "pid": 76337, "tid": -914061504, "ts": 1716454222549050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549139, "dur": 1, "args": { "External id": 81928, "cbid": 251, "correlation": 81928 } }, { "ph": "f", "id": 81928, "pid": 76337, "tid": -914061504, "ts": 1716454222549139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222624201, "dur": 158, "args": { "External id": 81929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81929, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81929, "pid": 5, "tid": 7, "ts": 1716454222624201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549144, "dur": 13, "args": { "External id": 81929, "cbid": 211, "correlation": 81929 } }, { "ph": "s", "id": 81929, "pid": 76337, "tid": -914061504, "ts": 1716454222549144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549214, "dur": 1, "args": { "External id": 81940, "cbid": 251, "correlation": 81940 } }, { "ph": "f", "id": 81940, "pid": 76337, "tid": -914061504, "ts": 1716454222549214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222624360, "dur": 148, "args": { "External id": 81941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81941, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81941, "pid": 5, "tid": 7, "ts": 1716454222624360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549218, "dur": 12, "args": { "External id": 81941, "cbid": 211, "correlation": 81941 } }, { "ph": "s", "id": 81941, "pid": 76337, "tid": -914061504, "ts": 1716454222549218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549284, "dur": 1, "args": { "External id": 81952, "cbid": 251, "correlation": 81952 } }, { "ph": "f", "id": 81952, "pid": 76337, "tid": -914061504, "ts": 1716454222549284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222624509, "dur": 145, "args": { "External id": 81953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81953, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 81953, "pid": 5, "tid": 7, "ts": 1716454222624509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549288, "dur": 11, "args": { "External id": 81953, "cbid": 211, "correlation": 81953 } }, { "ph": "s", "id": 81953, "pid": 76337, "tid": -914061504, "ts": 1716454222549288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222624656, "dur": 1988, "args": { "External id": 81974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81974, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 81974, "pid": 5, "tid": 7, "ts": 1716454222624656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549366, "dur": 12, "args": { "External id": 81974, "cbid": 211, "correlation": 81974 } }, { "ph": "s", "id": 81974, "pid": 76337, "tid": -914061504, "ts": 1716454222549366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549463, "dur": 1, "args": { "External id": 81992, "cbid": 251, "correlation": 81992 } }, { "ph": "f", "id": 81992, "pid": 76337, "tid": -914061504, "ts": 1716454222549463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222626646, "dur": 152, "args": { "External id": 81994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 81994, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 81994, "pid": 5, "tid": 7, "ts": 1716454222626646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549469, "dur": 13, "args": { "External id": 81994, "cbid": 211, "correlation": 81994 } }, { "ph": "s", "id": 81994, "pid": 76337, "tid": -914061504, "ts": 1716454222549469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222626799, "dur": 36, "args": { "External id": 82002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82002, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82002, "pid": 5, "tid": 7, "ts": 1716454222626799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549539, "dur": 12, "args": { "External id": 82002, "cbid": 211, "correlation": 82002 } }, { "ph": "s", "id": 82002, "pid": 76337, "tid": -914061504, "ts": 1716454222549539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222626835, "dur": 50, "args": { "External id": 82010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82010, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82010, "pid": 5, "tid": 7, "ts": 1716454222626835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549577, "dur": 8, "args": { "External id": 82010, "cbid": 211, "correlation": 82010 } }, { "ph": "s", "id": 82010, "pid": 76337, "tid": -914061504, "ts": 1716454222549577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222626887, "dur": 30, "args": { "External id": 82021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82021, "pid": 5, "tid": 7, "ts": 1716454222626887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549648, "dur": 12, "args": { "External id": 82021, "cbid": 211, "correlation": 82021 } }, { "ph": "s", "id": 82021, "pid": 76337, "tid": -914061504, "ts": 1716454222549648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222626919, "dur": 35, "args": { "External id": 82043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82043, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82043, "pid": 5, "tid": 7, "ts": 1716454222626919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549679, "dur": 7, "args": { "External id": 82043, "cbid": 211, "correlation": 82043 } }, { "ph": "s", "id": 82043, "pid": 76337, "tid": -914061504, "ts": 1716454222549679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549763, "dur": 1, "args": { "External id": 82054, "cbid": 251, "correlation": 82054 } }, { "ph": "f", "id": 82054, "pid": 76337, "tid": -914061504, "ts": 1716454222549763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222626956, "dur": 92, "args": { "External id": 82055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82055, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82055, "pid": 5, "tid": 7, "ts": 1716454222626956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549768, "dur": 13, "args": { "External id": 82055, "cbid": 211, "correlation": 82055 } }, { "ph": "s", "id": 82055, "pid": 76337, "tid": -914061504, "ts": 1716454222549768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549837, "dur": 1, "args": { "External id": 82066, "cbid": 251, "correlation": 82066 } }, { "ph": "f", "id": 82066, "pid": 76337, "tid": -914061504, "ts": 1716454222549837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549840, "dur": 0, "args": { "External id": 82067, "cbid": 251, "correlation": 82067 } }, { "ph": "f", "id": 82067, "pid": 76337, "tid": -914061504, "ts": 1716454222549840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222627049, "dur": 12, "args": { "External id": 82068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82068, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 82068, "pid": 5, "tid": 7, "ts": 1716454222627049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549842, "dur": 13, "args": { "External id": 82068, "cbid": 211, "correlation": 82068 } }, { "ph": "s", "id": 82068, "pid": 76337, "tid": -914061504, "ts": 1716454222549842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222627062, "dur": 5, "args": { "External id": 82070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82070, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 82070, "pid": 5, "tid": 7, "ts": 1716454222627062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549857, "dur": 6, "args": { "External id": 82070, "cbid": 211, "correlation": 82070 } }, { "ph": "s", "id": 82070, "pid": 76337, "tid": -914061504, "ts": 1716454222549857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549913, "dur": 1, "args": { "External id": 82081, "cbid": 251, "correlation": 82081 } }, { "ph": "f", "id": 82081, "pid": 76337, "tid": -914061504, "ts": 1716454222549913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222549916, "dur": 0, "args": { "External id": 82082, "cbid": 251, "correlation": 82082 } }, { "ph": "f", "id": 82082, "pid": 76337, "tid": -914061504, "ts": 1716454222549916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222627068, "dur": 7, "args": { "External id": 82083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82083, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 82083, "pid": 5, "tid": 7, "ts": 1716454222627068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549918, "dur": 11, "args": { "External id": 82083, "cbid": 211, "correlation": 82083 } }, { "ph": "s", "id": 82083, "pid": 76337, "tid": -914061504, "ts": 1716454222549918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222627077, "dur": 4, "args": { "External id": 82085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82085, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 82085, "pid": 5, "tid": 7, "ts": 1716454222627077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222549931, "dur": 5, "args": { "External id": 82085, "cbid": 211, "correlation": 82085 } }, { "ph": "s", "id": 82085, "pid": 76337, "tid": -914061504, "ts": 1716454222549931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222627081, "dur": 94, "args": { "External id": 82106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82106, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 82106, "pid": 5, "tid": 7, "ts": 1716454222627081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550016, "dur": 13, "args": { "External id": 82106, "cbid": 211, "correlation": 82106 } }, { "ph": "s", "id": 82106, "pid": 76337, "tid": -914061504, "ts": 1716454222550016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222550114, "dur": 1, "args": { "External id": 82124, "cbid": 251, "correlation": 82124 } }, { "ph": "f", "id": 82124, "pid": 76337, "tid": -914061504, "ts": 1716454222550114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222627176, "dur": 85, "args": { "External id": 82126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82126, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82126, "pid": 5, "tid": 7, "ts": 1716454222627176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550120, "dur": 13, "args": { "External id": 82126, "cbid": 211, "correlation": 82126 } }, { "ph": "s", "id": 82126, "pid": 76337, "tid": -914061504, "ts": 1716454222550120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222627263, "dur": 19, "args": { "External id": 82134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82134, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82134, "pid": 5, "tid": 7, "ts": 1716454222627263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550190, "dur": 12, "args": { "External id": 82134, "cbid": 211, "correlation": 82134 } }, { "ph": "s", "id": 82134, "pid": 76337, "tid": -914061504, "ts": 1716454222550190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222627283, "dur": 40, "args": { "External id": 82142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82142, "pid": 5, "tid": 7, "ts": 1716454222627283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550232, "dur": 9, "args": { "External id": 82142, "cbid": 211, "correlation": 82142 } }, { "ph": "s", "id": 82142, "pid": 76337, "tid": -914061504, "ts": 1716454222550232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222627324, "dur": 35, "args": { "External id": 82164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82164, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82164, "pid": 5, "tid": 7, "ts": 1716454222627324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550283, "dur": 10, "args": { "External id": 82164, "cbid": 211, "correlation": 82164 } }, { "ph": "s", "id": 82164, "pid": 76337, "tid": -914061504, "ts": 1716454222550283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222550372, "dur": 1, "args": { "External id": 82180, "cbid": 251, "correlation": 82180 } }, { "ph": "f", "id": 82180, "pid": 76337, "tid": -914061504, "ts": 1716454222550372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222550377, "dur": 0, "args": { "External id": 82182, "cbid": 251, "correlation": 82182 } }, { "ph": "f", "id": 82182, "pid": 76337, "tid": -914061504, "ts": 1716454222550377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222627361, "dur": 550, "args": { "External id": 82183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82183, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 82183, "pid": 5, "tid": 7, "ts": 1716454222627361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550381, "dur": 13, "args": { "External id": 82183, "cbid": 211, "correlation": 82183 } }, { "ph": "s", "id": 82183, "pid": 76337, "tid": -914061504, "ts": 1716454222550381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222627912, "dur": 128, "args": { "External id": 82191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82191, "pid": 5, "tid": 7, "ts": 1716454222627912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550445, "dur": 13, "args": { "External id": 82191, "cbid": 211, "correlation": 82191 } }, { "ph": "s", "id": 82191, "pid": 76337, "tid": -914061504, "ts": 1716454222550445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222628041, "dur": 131, "args": { "External id": 82199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82199, "pid": 5, "tid": 7, "ts": 1716454222628041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550475, "dur": 9, "args": { "External id": 82199, "cbid": 211, "correlation": 82199 } }, { "ph": "s", "id": 82199, "pid": 76337, "tid": -914061504, "ts": 1716454222550475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222550553, "dur": 1, "args": { "External id": 82215, "cbid": 251, "correlation": 82215 } }, { "ph": "f", "id": 82215, "pid": 76337, "tid": -914061504, "ts": 1716454222550553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222628173, "dur": 317, "args": { "External id": 82217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82217, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82217, "pid": 5, "tid": 7, "ts": 1716454222628173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550558, "dur": 12, "args": { "External id": 82217, "cbid": 211, "correlation": 82217 } }, { "ph": "s", "id": 82217, "pid": 76337, "tid": -914061504, "ts": 1716454222550558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222628492, "dur": 27, "args": { "External id": 82225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82225, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82225, "pid": 5, "tid": 7, "ts": 1716454222628492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550600, "dur": 10, "args": { "External id": 82225, "cbid": 211, "correlation": 82225 } }, { "ph": "s", "id": 82225, "pid": 76337, "tid": -914061504, "ts": 1716454222550600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222628520, "dur": 83, "args": { "External id": 82236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82236, "pid": 5, "tid": 7, "ts": 1716454222628520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550668, "dur": 12, "args": { "External id": 82236, "cbid": 211, "correlation": 82236 } }, { "ph": "s", "id": 82236, "pid": 76337, "tid": -914061504, "ts": 1716454222550668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222550730, "dur": 0, "args": { "External id": 82248, "cbid": 317, "correlation": 82248 } }, { "ph": "f", "id": 82248, "pid": 76337, "tid": -914061504, "ts": 1716454222550730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222550731, "dur": 0, "args": { "External id": 82249, "cbid": 203, "correlation": 82249 } }, { "ph": "f", "id": 82249, "pid": 76337, "tid": -914061504, "ts": 1716454222550731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222550732, "dur": 0, "args": { "External id": 82250, "cbid": 205, "correlation": 82250 } }, { "ph": "f", "id": 82250, "pid": 76337, "tid": -914061504, "ts": 1716454222550732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222628604, "dur": 24, "args": { "External id": 82254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82254, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82254, "pid": 5, "tid": 7, "ts": 1716454222628604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550748, "dur": 12, "args": { "External id": 82254, "cbid": 211, "correlation": 82254 } }, { "ph": "s", "id": 82254, "pid": 76337, "tid": -914061504, "ts": 1716454222550748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222628629, "dur": 124, "args": { "External id": 82256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82256, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82256, "pid": 5, "tid": 7, "ts": 1716454222628629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550765, "dur": 6, "args": { "External id": 82256, "cbid": 211, "correlation": 82256 } }, { "ph": "s", "id": 82256, "pid": 76337, "tid": -914061504, "ts": 1716454222550765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222628754, "dur": 24, "args": { "External id": 82258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82258, "pid": 5, "tid": 7, "ts": 1716454222628754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550776, "dur": 5, "args": { "External id": 82258, "cbid": 211, "correlation": 82258 } }, { "ph": "s", "id": 82258, "pid": 76337, "tid": -914061504, "ts": 1716454222550776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222628780, "dur": 33, "args": { "External id": 82264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82264, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82264, "pid": 5, "tid": 7, "ts": 1716454222628780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550803, "dur": 8, "args": { "External id": 82264, "cbid": 211, "correlation": 82264 } }, { "ph": "s", "id": 82264, "pid": 76337, "tid": -914061504, "ts": 1716454222550803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222628815, "dur": 26, "args": { "External id": 82272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82272, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82272, "pid": 5, "tid": 7, "ts": 1716454222628815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550834, "dur": 8, "args": { "External id": 82272, "cbid": 211, "correlation": 82272 } }, { "ph": "s", "id": 82272, "pid": 76337, "tid": -914061504, "ts": 1716454222550834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222628842, "dur": 44, "args": { "External id": 82281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82281, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82281, "pid": 5, "tid": 7, "ts": 1716454222628842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550872, "dur": 10, "args": { "External id": 82281, "cbid": 211, "correlation": 82281 } }, { "ph": "s", "id": 82281, "pid": 76337, "tid": -914061504, "ts": 1716454222550872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222628887, "dur": 43, "args": { "External id": 82301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82301, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 82301, "pid": 5, "tid": 7, "ts": 1716454222628887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550942, "dur": 12, "args": { "External id": 82301, "cbid": 211, "correlation": 82301 } }, { "ph": "s", "id": 82301, "pid": 76337, "tid": -914061504, "ts": 1716454222550942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222628932, "dur": 5, "args": { "External id": 82313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82313, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 82313, "pid": 5, "tid": 7, "ts": 1716454222628932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550964, "dur": 6, "args": { "External id": 82313, "cbid": 211, "correlation": 82313 } }, { "ph": "s", "id": 82313, "pid": 76337, "tid": -914061504, "ts": 1716454222550964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222628938, "dur": 45, "args": { "External id": 82316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82316, "pid": 5, "tid": 7, "ts": 1716454222628938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222550991, "dur": 8, "args": { "External id": 82316, "cbid": 211, "correlation": 82316 } }, { "ph": "s", "id": 82316, "pid": 76337, "tid": -914061504, "ts": 1716454222550991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222628984, "dur": 31, "args": { "External id": 82325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82325, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82325, "pid": 5, "tid": 7, "ts": 1716454222628984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551032, "dur": 10, "args": { "External id": 82325, "cbid": 211, "correlation": 82325 } }, { "ph": "s", "id": 82325, "pid": 76337, "tid": -914061504, "ts": 1716454222551032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222551085, "dur": 0, "args": { "External id": 82335, "cbid": 317, "correlation": 82335 } }, { "ph": "f", "id": 82335, "pid": 76337, "tid": -914061504, "ts": 1716454222551085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222551086, "dur": 0, "args": { "External id": 82336, "cbid": 203, "correlation": 82336 } }, { "ph": "f", "id": 82336, "pid": 76337, "tid": -914061504, "ts": 1716454222551086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222551086, "dur": 0, "args": { "External id": 82337, "cbid": 205, "correlation": 82337 } }, { "ph": "f", "id": 82337, "pid": 76337, "tid": -914061504, "ts": 1716454222551086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222629016, "dur": 30, "args": { "External id": 82341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82341, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82341, "pid": 5, "tid": 7, "ts": 1716454222629016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551101, "dur": 12, "args": { "External id": 82341, "cbid": 211, "correlation": 82341 } }, { "ph": "s", "id": 82341, "pid": 76337, "tid": -914061504, "ts": 1716454222551101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222629048, "dur": 64, "args": { "External id": 82343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82343, "pid": 5, "tid": 7, "ts": 1716454222629048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551116, "dur": 5, "args": { "External id": 82343, "cbid": 211, "correlation": 82343 } }, { "ph": "s", "id": 82343, "pid": 76337, "tid": -914061504, "ts": 1716454222551116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222629113, "dur": 979, "args": { "External id": 82345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82345, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82345, "pid": 5, "tid": 7, "ts": 1716454222629113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551127, "dur": 6, "args": { "External id": 82345, "cbid": 211, "correlation": 82345 } }, { "ph": "s", "id": 82345, "pid": 76337, "tid": -914061504, "ts": 1716454222551127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222630094, "dur": 22, "args": { "External id": 82347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82347, "pid": 5, "tid": 7, "ts": 1716454222630094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551137, "dur": 5, "args": { "External id": 82347, "cbid": 211, "correlation": 82347 } }, { "ph": "s", "id": 82347, "pid": 76337, "tid": -914061504, "ts": 1716454222551137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222630117, "dur": 34, "args": { "External id": 82353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82353, "pid": 5, "tid": 7, "ts": 1716454222630117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551164, "dur": 9, "args": { "External id": 82353, "cbid": 211, "correlation": 82353 } }, { "ph": "s", "id": 82353, "pid": 76337, "tid": -914061504, "ts": 1716454222551164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222630152, "dur": 3, "args": { "External id": 82361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82361, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 82361, "pid": 5, "tid": 7, "ts": 1716454222630152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551209, "dur": 9, "args": { "External id": 82361, "cbid": 211, "correlation": 82361 } }, { "ph": "s", "id": 82361, "pid": 76337, "tid": -914061504, "ts": 1716454222551209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222551272, "dur": 1, "args": { "External id": 82377, "cbid": 251, "correlation": 82377 } }, { "ph": "f", "id": 82377, "pid": 76337, "tid": -914061504, "ts": 1716454222551272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222551277, "dur": 0, "args": { "External id": 82379, "cbid": 251, "correlation": 82379 } }, { "ph": "f", "id": 82379, "pid": 76337, "tid": -914061504, "ts": 1716454222551277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222630157, "dur": 12, "args": { "External id": 82380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82380, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 82380, "pid": 5, "tid": 7, "ts": 1716454222630157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551279, "dur": 11, "args": { "External id": 82380, "cbid": 211, "correlation": 82380 } }, { "ph": "s", "id": 82380, "pid": 76337, "tid": -914061504, "ts": 1716454222551279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222630170, "dur": 5, "args": { "External id": 82382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82382, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 82382, "pid": 5, "tid": 7, "ts": 1716454222630170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551291, "dur": 5, "args": { "External id": 82382, "cbid": 211, "correlation": 82382 } }, { "ph": "s", "id": 82382, "pid": 76337, "tid": -914061504, "ts": 1716454222551291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222630177, "dur": 29, "args": { "External id": 82392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82392, "pid": 5, "tid": 7, "ts": 1716454222630177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551349, "dur": 13, "args": { "External id": 82392, "cbid": 211, "correlation": 82392 } }, { "ph": "s", "id": 82392, "pid": 76337, "tid": -914061504, "ts": 1716454222551349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222630208, "dur": 32, "args": { "External id": 82412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82412, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 82412, "pid": 5, "tid": 7, "ts": 1716454222630208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551417, "dur": 11, "args": { "External id": 82412, "cbid": 211, "correlation": 82412 } }, { "ph": "s", "id": 82412, "pid": 76337, "tid": -914061504, "ts": 1716454222551417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222630241, "dur": 4, "args": { "External id": 82424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82424, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 82424, "pid": 5, "tid": 7, "ts": 1716454222630241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551437, "dur": 6, "args": { "External id": 82424, "cbid": 211, "correlation": 82424 } }, { "ph": "s", "id": 82424, "pid": 76337, "tid": -914061504, "ts": 1716454222551437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222630246, "dur": 30, "args": { "External id": 82427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82427, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82427, "pid": 5, "tid": 7, "ts": 1716454222630246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551455, "dur": 7, "args": { "External id": 82427, "cbid": 211, "correlation": 82427 } }, { "ph": "s", "id": 82427, "pid": 76337, "tid": -914061504, "ts": 1716454222551455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222630278, "dur": 21, "args": { "External id": 82436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82436, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82436, "pid": 5, "tid": 7, "ts": 1716454222630278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551495, "dur": 10, "args": { "External id": 82436, "cbid": 211, "correlation": 82436 } }, { "ph": "s", "id": 82436, "pid": 76337, "tid": -914061504, "ts": 1716454222551495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222551558, "dur": 0, "args": { "External id": 82446, "cbid": 317, "correlation": 82446 } }, { "ph": "f", "id": 82446, "pid": 76337, "tid": -914061504, "ts": 1716454222551558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222551559, "dur": 0, "args": { "External id": 82447, "cbid": 203, "correlation": 82447 } }, { "ph": "f", "id": 82447, "pid": 76337, "tid": -914061504, "ts": 1716454222551559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222551559, "dur": 0, "args": { "External id": 82448, "cbid": 205, "correlation": 82448 } }, { "ph": "f", "id": 82448, "pid": 76337, "tid": -914061504, "ts": 1716454222551559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222630300, "dur": 22, "args": { "External id": 82452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82452, "pid": 5, "tid": 7, "ts": 1716454222630300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551574, "dur": 12, "args": { "External id": 82452, "cbid": 211, "correlation": 82452 } }, { "ph": "s", "id": 82452, "pid": 76337, "tid": -914061504, "ts": 1716454222551574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222630323, "dur": 45, "args": { "External id": 82454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82454, "pid": 5, "tid": 7, "ts": 1716454222630323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551588, "dur": 5, "args": { "External id": 82454, "cbid": 211, "correlation": 82454 } }, { "ph": "s", "id": 82454, "pid": 76337, "tid": -914061504, "ts": 1716454222551588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222630369, "dur": 654, "args": { "External id": 82456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82456, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82456, "pid": 5, "tid": 7, "ts": 1716454222630369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551599, "dur": 6, "args": { "External id": 82456, "cbid": 211, "correlation": 82456 } }, { "ph": "s", "id": 82456, "pid": 76337, "tid": -914061504, "ts": 1716454222551599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222631025, "dur": 23, "args": { "External id": 82458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82458, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82458, "pid": 5, "tid": 7, "ts": 1716454222631025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551609, "dur": 5, "args": { "External id": 82458, "cbid": 211, "correlation": 82458 } }, { "ph": "s", "id": 82458, "pid": 76337, "tid": -914061504, "ts": 1716454222551609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222631049, "dur": 34, "args": { "External id": 82464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82464, "pid": 5, "tid": 7, "ts": 1716454222631049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551636, "dur": 8, "args": { "External id": 82464, "cbid": 211, "correlation": 82464 } }, { "ph": "s", "id": 82464, "pid": 76337, "tid": -914061504, "ts": 1716454222551636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222551694, "dur": 0, "args": { "External id": 82474, "cbid": 317, "correlation": 82474 } }, { "ph": "f", "id": 82474, "pid": 76337, "tid": -914061504, "ts": 1716454222551694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222551694, "dur": 0, "args": { "External id": 82475, "cbid": 203, "correlation": 82475 } }, { "ph": "f", "id": 82475, "pid": 76337, "tid": -914061504, "ts": 1716454222551694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222551695, "dur": 0, "args": { "External id": 82476, "cbid": 205, "correlation": 82476 } }, { "ph": "f", "id": 82476, "pid": 76337, "tid": -914061504, "ts": 1716454222551695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222631085, "dur": 30, "args": { "External id": 82480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82480, "pid": 5, "tid": 7, "ts": 1716454222631085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551708, "dur": 11, "args": { "External id": 82480, "cbid": 211, "correlation": 82480 } }, { "ph": "s", "id": 82480, "pid": 76337, "tid": -914061504, "ts": 1716454222551708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222631116, "dur": 154, "args": { "External id": 82482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82482, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82482, "pid": 5, "tid": 7, "ts": 1716454222631116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551726, "dur": 6, "args": { "External id": 82482, "cbid": 211, "correlation": 82482 } }, { "ph": "s", "id": 82482, "pid": 76337, "tid": -914061504, "ts": 1716454222551726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222631272, "dur": 22, "args": { "External id": 82484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82484, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82484, "pid": 5, "tid": 7, "ts": 1716454222631272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551735, "dur": 5, "args": { "External id": 82484, "cbid": 211, "correlation": 82484 } }, { "ph": "s", "id": 82484, "pid": 76337, "tid": -914061504, "ts": 1716454222551735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222631296, "dur": 33, "args": { "External id": 82490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82490, "pid": 5, "tid": 7, "ts": 1716454222631296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551761, "dur": 8, "args": { "External id": 82490, "cbid": 211, "correlation": 82490 } }, { "ph": "s", "id": 82490, "pid": 76337, "tid": -914061504, "ts": 1716454222551761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222631330, "dur": 28, "args": { "External id": 82498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82498, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82498, "pid": 5, "tid": 7, "ts": 1716454222631330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551790, "dur": 7, "args": { "External id": 82498, "cbid": 211, "correlation": 82498 } }, { "ph": "s", "id": 82498, "pid": 76337, "tid": -914061504, "ts": 1716454222551790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222631358, "dur": 19, "args": { "External id": 82506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82506, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82506, "pid": 5, "tid": 7, "ts": 1716454222631358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551817, "dur": 8, "args": { "External id": 82506, "cbid": 211, "correlation": 82506 } }, { "ph": "s", "id": 82506, "pid": 76337, "tid": -914061504, "ts": 1716454222551817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222631379, "dur": 31, "args": { "External id": 82526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82526, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 82526, "pid": 5, "tid": 7, "ts": 1716454222631379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551900, "dur": 12, "args": { "External id": 82526, "cbid": 211, "correlation": 82526 } }, { "ph": "s", "id": 82526, "pid": 76337, "tid": -914061504, "ts": 1716454222551900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222631411, "dur": 4, "args": { "External id": 82538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82538, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 82538, "pid": 5, "tid": 7, "ts": 1716454222631411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551922, "dur": 6, "args": { "External id": 82538, "cbid": 211, "correlation": 82538 } }, { "ph": "s", "id": 82538, "pid": 76337, "tid": -914061504, "ts": 1716454222551922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222631416, "dur": 31, "args": { "External id": 82541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82541, "pid": 5, "tid": 7, "ts": 1716454222631416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222551940, "dur": 6, "args": { "External id": 82541, "cbid": 211, "correlation": 82541 } }, { "ph": "s", "id": 82541, "pid": 76337, "tid": -914061504, "ts": 1716454222551940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222552006, "dur": 0, "args": { "External id": 82552, "cbid": 317, "correlation": 82552 } }, { "ph": "f", "id": 82552, "pid": 76337, "tid": -914061504, "ts": 1716454222552006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222552007, "dur": 0, "args": { "External id": 82553, "cbid": 203, "correlation": 82553 } }, { "ph": "f", "id": 82553, "pid": 76337, "tid": -914061504, "ts": 1716454222552007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222552008, "dur": 0, "args": { "External id": 82554, "cbid": 205, "correlation": 82554 } }, { "ph": "f", "id": 82554, "pid": 76337, "tid": -914061504, "ts": 1716454222552008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222631449, "dur": 23, "args": { "External id": 82558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82558, "pid": 5, "tid": 7, "ts": 1716454222631449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552021, "dur": 12, "args": { "External id": 82558, "cbid": 211, "correlation": 82558 } }, { "ph": "s", "id": 82558, "pid": 76337, "tid": -914061504, "ts": 1716454222552021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222631473, "dur": 107, "args": { "External id": 82560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82560, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82560, "pid": 5, "tid": 7, "ts": 1716454222631473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552039, "dur": 7, "args": { "External id": 82560, "cbid": 211, "correlation": 82560 } }, { "ph": "s", "id": 82560, "pid": 76337, "tid": -914061504, "ts": 1716454222552039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222631581, "dur": 23, "args": { "External id": 82562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82562, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82562, "pid": 5, "tid": 7, "ts": 1716454222631581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552049, "dur": 5, "args": { "External id": 82562, "cbid": 211, "correlation": 82562 } }, { "ph": "s", "id": 82562, "pid": 76337, "tid": -914061504, "ts": 1716454222552049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222631605, "dur": 33, "args": { "External id": 82568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82568, "pid": 5, "tid": 7, "ts": 1716454222631605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552077, "dur": 8, "args": { "External id": 82568, "cbid": 211, "correlation": 82568 } }, { "ph": "s", "id": 82568, "pid": 76337, "tid": -914061504, "ts": 1716454222552077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222631639, "dur": 205, "args": { "External id": 82577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82577, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82577, "pid": 5, "tid": 7, "ts": 1716454222631639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552158, "dur": 14, "args": { "External id": 82577, "cbid": 211, "correlation": 82577 } }, { "ph": "s", "id": 82577, "pid": 76337, "tid": -914061504, "ts": 1716454222552158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222631845, "dur": 66, "args": { "External id": 82599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82599, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82599, "pid": 5, "tid": 7, "ts": 1716454222631845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552215, "dur": 10, "args": { "External id": 82599, "cbid": 211, "correlation": 82599 } }, { "ph": "s", "id": 82599, "pid": 76337, "tid": -914061504, "ts": 1716454222552215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222552305, "dur": 2, "args": { "External id": 82610, "cbid": 251, "correlation": 82610 } }, { "ph": "f", "id": 82610, "pid": 76337, "tid": -914061504, "ts": 1716454222552305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222631912, "dur": 156, "args": { "External id": 82611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82611, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82611, "pid": 5, "tid": 7, "ts": 1716454222631912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552310, "dur": 13, "args": { "External id": 82611, "cbid": 211, "correlation": 82611 } }, { "ph": "s", "id": 82611, "pid": 76337, "tid": -914061504, "ts": 1716454222552310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222552381, "dur": 1, "args": { "External id": 82622, "cbid": 251, "correlation": 82622 } }, { "ph": "f", "id": 82622, "pid": 76337, "tid": -914061504, "ts": 1716454222552381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222632069, "dur": 147, "args": { "External id": 82623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82623, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82623, "pid": 5, "tid": 7, "ts": 1716454222632069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552385, "dur": 12, "args": { "External id": 82623, "cbid": 211, "correlation": 82623 } }, { "ph": "s", "id": 82623, "pid": 76337, "tid": -914061504, "ts": 1716454222552385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222552451, "dur": 1, "args": { "External id": 82634, "cbid": 251, "correlation": 82634 } }, { "ph": "f", "id": 82634, "pid": 76337, "tid": -914061504, "ts": 1716454222552451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222632218, "dur": 147, "args": { "External id": 82635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82635, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82635, "pid": 5, "tid": 7, "ts": 1716454222632218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552455, "dur": 12, "args": { "External id": 82635, "cbid": 211, "correlation": 82635 } }, { "ph": "s", "id": 82635, "pid": 76337, "tid": -914061504, "ts": 1716454222552455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222632367, "dur": 1989, "args": { "External id": 82656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82656, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 82656, "pid": 5, "tid": 7, "ts": 1716454222632367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552536, "dur": 12, "args": { "External id": 82656, "cbid": 211, "correlation": 82656 } }, { "ph": "s", "id": 82656, "pid": 76337, "tid": -914061504, "ts": 1716454222552536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222552634, "dur": 1, "args": { "External id": 82674, "cbid": 251, "correlation": 82674 } }, { "ph": "f", "id": 82674, "pid": 76337, "tid": -914061504, "ts": 1716454222552634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222634357, "dur": 151, "args": { "External id": 82676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82676, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 82676, "pid": 5, "tid": 7, "ts": 1716454222634357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552640, "dur": 13, "args": { "External id": 82676, "cbid": 211, "correlation": 82676 } }, { "ph": "s", "id": 82676, "pid": 76337, "tid": -914061504, "ts": 1716454222552640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222634510, "dur": 35, "args": { "External id": 82684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82684, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82684, "pid": 5, "tid": 7, "ts": 1716454222634510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552709, "dur": 12, "args": { "External id": 82684, "cbid": 211, "correlation": 82684 } }, { "ph": "s", "id": 82684, "pid": 76337, "tid": -914061504, "ts": 1716454222552709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222634547, "dur": 51, "args": { "External id": 82692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82692, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82692, "pid": 5, "tid": 7, "ts": 1716454222634547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552749, "dur": 9, "args": { "External id": 82692, "cbid": 211, "correlation": 82692 } }, { "ph": "s", "id": 82692, "pid": 76337, "tid": -914061504, "ts": 1716454222552749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222634599, "dur": 31, "args": { "External id": 82703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82703, "pid": 5, "tid": 7, "ts": 1716454222634599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552820, "dur": 13, "args": { "External id": 82703, "cbid": 211, "correlation": 82703 } }, { "ph": "s", "id": 82703, "pid": 76337, "tid": -914061504, "ts": 1716454222552820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222634632, "dur": 35, "args": { "External id": 82725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82725, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82725, "pid": 5, "tid": 7, "ts": 1716454222634632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552852, "dur": 7, "args": { "External id": 82725, "cbid": 211, "correlation": 82725 } }, { "ph": "s", "id": 82725, "pid": 76337, "tid": -914061504, "ts": 1716454222552852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222552936, "dur": 1, "args": { "External id": 82736, "cbid": 251, "correlation": 82736 } }, { "ph": "f", "id": 82736, "pid": 76337, "tid": -914061504, "ts": 1716454222552936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222634668, "dur": 91, "args": { "External id": 82737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82737, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82737, "pid": 5, "tid": 7, "ts": 1716454222634668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222552942, "dur": 13, "args": { "External id": 82737, "cbid": 211, "correlation": 82737 } }, { "ph": "s", "id": 82737, "pid": 76337, "tid": -914061504, "ts": 1716454222552942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553019, "dur": 1, "args": { "External id": 82748, "cbid": 251, "correlation": 82748 } }, { "ph": "f", "id": 82748, "pid": 76337, "tid": -914061504, "ts": 1716454222553019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553022, "dur": 0, "args": { "External id": 82749, "cbid": 251, "correlation": 82749 } }, { "ph": "f", "id": 82749, "pid": 76337, "tid": -914061504, "ts": 1716454222553022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222634760, "dur": 11, "args": { "External id": 82750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82750, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 82750, "pid": 5, "tid": 7, "ts": 1716454222634760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553024, "dur": 12, "args": { "External id": 82750, "cbid": 211, "correlation": 82750 } }, { "ph": "s", "id": 82750, "pid": 76337, "tid": -914061504, "ts": 1716454222553024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222634773, "dur": 5, "args": { "External id": 82752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82752, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 82752, "pid": 5, "tid": 7, "ts": 1716454222634773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553038, "dur": 6, "args": { "External id": 82752, "cbid": 211, "correlation": 82752 } }, { "ph": "s", "id": 82752, "pid": 76337, "tid": -914061504, "ts": 1716454222553038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553096, "dur": 1, "args": { "External id": 82763, "cbid": 251, "correlation": 82763 } }, { "ph": "f", "id": 82763, "pid": 76337, "tid": -914061504, "ts": 1716454222553096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553100, "dur": 0, "args": { "External id": 82764, "cbid": 251, "correlation": 82764 } }, { "ph": "f", "id": 82764, "pid": 76337, "tid": -914061504, "ts": 1716454222553100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222634779, "dur": 7, "args": { "External id": 82765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82765, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 82765, "pid": 5, "tid": 7, "ts": 1716454222634779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553102, "dur": 12, "args": { "External id": 82765, "cbid": 211, "correlation": 82765 } }, { "ph": "s", "id": 82765, "pid": 76337, "tid": -914061504, "ts": 1716454222553102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222634788, "dur": 4, "args": { "External id": 82767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82767, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 82767, "pid": 5, "tid": 7, "ts": 1716454222634788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553115, "dur": 5, "args": { "External id": 82767, "cbid": 211, "correlation": 82767 } }, { "ph": "s", "id": 82767, "pid": 76337, "tid": -914061504, "ts": 1716454222553115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222634792, "dur": 92, "args": { "External id": 82788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82788, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 82788, "pid": 5, "tid": 7, "ts": 1716454222634792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553189, "dur": 13, "args": { "External id": 82788, "cbid": 211, "correlation": 82788 } }, { "ph": "s", "id": 82788, "pid": 76337, "tid": -914061504, "ts": 1716454222553189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553285, "dur": 1, "args": { "External id": 82806, "cbid": 251, "correlation": 82806 } }, { "ph": "f", "id": 82806, "pid": 76337, "tid": -914061504, "ts": 1716454222553285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222634886, "dur": 99, "args": { "External id": 82808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82808, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82808, "pid": 5, "tid": 7, "ts": 1716454222634886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553291, "dur": 13, "args": { "External id": 82808, "cbid": 211, "correlation": 82808 } }, { "ph": "s", "id": 82808, "pid": 76337, "tid": -914061504, "ts": 1716454222553291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222634987, "dur": 19, "args": { "External id": 82816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82816, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82816, "pid": 5, "tid": 7, "ts": 1716454222634987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553360, "dur": 12, "args": { "External id": 82816, "cbid": 211, "correlation": 82816 } }, { "ph": "s", "id": 82816, "pid": 76337, "tid": -914061504, "ts": 1716454222553360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222635007, "dur": 38, "args": { "External id": 82824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82824, "pid": 5, "tid": 7, "ts": 1716454222635007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553402, "dur": 10, "args": { "External id": 82824, "cbid": 211, "correlation": 82824 } }, { "ph": "s", "id": 82824, "pid": 76337, "tid": -914061504, "ts": 1716454222553402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222635046, "dur": 35, "args": { "External id": 82846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82846, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82846, "pid": 5, "tid": 7, "ts": 1716454222635046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553454, "dur": 10, "args": { "External id": 82846, "cbid": 211, "correlation": 82846 } }, { "ph": "s", "id": 82846, "pid": 76337, "tid": -914061504, "ts": 1716454222553454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553542, "dur": 1, "args": { "External id": 82862, "cbid": 251, "correlation": 82862 } }, { "ph": "f", "id": 82862, "pid": 76337, "tid": -914061504, "ts": 1716454222553542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553547, "dur": 0, "args": { "External id": 82864, "cbid": 251, "correlation": 82864 } }, { "ph": "f", "id": 82864, "pid": 76337, "tid": -914061504, "ts": 1716454222553547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222635083, "dur": 549, "args": { "External id": 82865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82865, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 82865, "pid": 5, "tid": 7, "ts": 1716454222635083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553550, "dur": 12, "args": { "External id": 82865, "cbid": 211, "correlation": 82865 } }, { "ph": "s", "id": 82865, "pid": 76337, "tid": -914061504, "ts": 1716454222553550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222635634, "dur": 128, "args": { "External id": 82873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82873, "pid": 5, "tid": 7, "ts": 1716454222635634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553615, "dur": 14, "args": { "External id": 82873, "cbid": 211, "correlation": 82873 } }, { "ph": "s", "id": 82873, "pid": 76337, "tid": -914061504, "ts": 1716454222553615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222635763, "dur": 131, "args": { "External id": 82881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82881, "pid": 5, "tid": 7, "ts": 1716454222635763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553646, "dur": 8, "args": { "External id": 82881, "cbid": 211, "correlation": 82881 } }, { "ph": "s", "id": 82881, "pid": 76337, "tid": -914061504, "ts": 1716454222553646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222553722, "dur": 1, "args": { "External id": 82897, "cbid": 251, "correlation": 82897 } }, { "ph": "f", "id": 82897, "pid": 76337, "tid": -914061504, "ts": 1716454222553722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222635895, "dur": 309, "args": { "External id": 82899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82899, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82899, "pid": 5, "tid": 7, "ts": 1716454222635895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553728, "dur": 13, "args": { "External id": 82899, "cbid": 211, "correlation": 82899 } }, { "ph": "s", "id": 82899, "pid": 76337, "tid": -914061504, "ts": 1716454222553728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222636205, "dur": 27, "args": { "External id": 82907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82907, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82907, "pid": 5, "tid": 7, "ts": 1716454222636205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553770, "dur": 10, "args": { "External id": 82907, "cbid": 211, "correlation": 82907 } }, { "ph": "s", "id": 82907, "pid": 76337, "tid": -914061504, "ts": 1716454222553770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222636234, "dur": 83, "args": { "External id": 82918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82918, "pid": 5, "tid": 7, "ts": 1716454222636234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553837, "dur": 13, "args": { "External id": 82918, "cbid": 211, "correlation": 82918 } }, { "ph": "s", "id": 82918, "pid": 76337, "tid": -914061504, "ts": 1716454222553837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222553901, "dur": 0, "args": { "External id": 82930, "cbid": 317, "correlation": 82930 } }, { "ph": "f", "id": 82930, "pid": 76337, "tid": -914061504, "ts": 1716454222553901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222553902, "dur": 0, "args": { "External id": 82931, "cbid": 203, "correlation": 82931 } }, { "ph": "f", "id": 82931, "pid": 76337, "tid": -914061504, "ts": 1716454222553902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222553903, "dur": 0, "args": { "External id": 82932, "cbid": 205, "correlation": 82932 } }, { "ph": "f", "id": 82932, "pid": 76337, "tid": -914061504, "ts": 1716454222553903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222636318, "dur": 23, "args": { "External id": 82936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82936, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82936, "pid": 5, "tid": 7, "ts": 1716454222636318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553918, "dur": 11, "args": { "External id": 82936, "cbid": 211, "correlation": 82936 } }, { "ph": "s", "id": 82936, "pid": 76337, "tid": -914061504, "ts": 1716454222553918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222636343, "dur": 123, "args": { "External id": 82938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82938, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82938, "pid": 5, "tid": 7, "ts": 1716454222636343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553936, "dur": 7, "args": { "External id": 82938, "cbid": 211, "correlation": 82938 } }, { "ph": "s", "id": 82938, "pid": 76337, "tid": -914061504, "ts": 1716454222553936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222636467, "dur": 23, "args": { "External id": 82940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82940, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82940, "pid": 5, "tid": 7, "ts": 1716454222636467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553947, "dur": 6, "args": { "External id": 82940, "cbid": 211, "correlation": 82940 } }, { "ph": "s", "id": 82940, "pid": 76337, "tid": -914061504, "ts": 1716454222553947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222636492, "dur": 33, "args": { "External id": 82946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82946, "pid": 5, "tid": 7, "ts": 1716454222636492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222553982, "dur": 9, "args": { "External id": 82946, "cbid": 211, "correlation": 82946 } }, { "ph": "s", "id": 82946, "pid": 76337, "tid": -914061504, "ts": 1716454222553982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222636526, "dur": 27, "args": { "External id": 82954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82954, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82954, "pid": 5, "tid": 7, "ts": 1716454222636526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554015, "dur": 8, "args": { "External id": 82954, "cbid": 211, "correlation": 82954 } }, { "ph": "s", "id": 82954, "pid": 76337, "tid": -914061504, "ts": 1716454222554015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222636554, "dur": 102, "args": { "External id": 82965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82965, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82965, "pid": 5, "tid": 7, "ts": 1716454222636554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554078, "dur": 11, "args": { "External id": 82965, "cbid": 211, "correlation": 82965 } }, { "ph": "s", "id": 82965, "pid": 76337, "tid": -914061504, "ts": 1716454222554078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222554132, "dur": 0, "args": { "External id": 82975, "cbid": 317, "correlation": 82975 } }, { "ph": "f", "id": 82975, "pid": 76337, "tid": -914061504, "ts": 1716454222554132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222554133, "dur": 0, "args": { "External id": 82976, "cbid": 203, "correlation": 82976 } }, { "ph": "f", "id": 82976, "pid": 76337, "tid": -914061504, "ts": 1716454222554133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222554133, "dur": 0, "args": { "External id": 82977, "cbid": 205, "correlation": 82977 } }, { "ph": "f", "id": 82977, "pid": 76337, "tid": -914061504, "ts": 1716454222554133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222636658, "dur": 75, "args": { "External id": 82981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82981, "pid": 5, "tid": 7, "ts": 1716454222636658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554147, "dur": 12, "args": { "External id": 82981, "cbid": 211, "correlation": 82981 } }, { "ph": "s", "id": 82981, "pid": 76337, "tid": -914061504, "ts": 1716454222554147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222636735, "dur": 44, "args": { "External id": 82983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82983, "pid": 5, "tid": 7, "ts": 1716454222636735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554162, "dur": 5, "args": { "External id": 82983, "cbid": 211, "correlation": 82983 } }, { "ph": "s", "id": 82983, "pid": 76337, "tid": -914061504, "ts": 1716454222554162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222636781, "dur": 4, "args": { "External id": 82985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 82985, "pid": 5, "tid": 7, "ts": 1716454222636781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554172, "dur": 6, "args": { "External id": 82985, "cbid": 211, "correlation": 82985 } }, { "ph": "s", "id": 82985, "pid": 76337, "tid": -914061504, "ts": 1716454222554172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222554181, "dur": 0, "args": { "External id": 82986, "cbid": 51, "correlation": 82986 } }, { "ph": "s", "id": 82986, "pid": 76337, "tid": -914061504, "ts": 1716454222554181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222636786, "dur": 2226, "args": { "External id": 82987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82987, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 82987, "pid": 5, "tid": 7, "ts": 1716454222636786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554182, "dur": 5, "args": { "External id": 82987, "cbid": 211, "correlation": 82987 } }, { "ph": "s", "id": 82987, "pid": 76337, "tid": -914061504, "ts": 1716454222554182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222639013, "dur": 113, "args": { "External id": 82992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 82992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 82992, "pid": 5, "tid": 7, "ts": 1716454222639013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554210, "dur": 9, "args": { "External id": 82992, "cbid": 211, "correlation": 82992 } }, { "ph": "s", "id": 82992, "pid": 76337, "tid": -914061504, "ts": 1716454222554210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222639128, "dur": 168, "args": { "External id": 83001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83001, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83001, "pid": 5, "tid": 7, "ts": 1716454222639128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554300, "dur": 13, "args": { "External id": 83001, "cbid": 211, "correlation": 83001 } }, { "ph": "s", "id": 83001, "pid": 76337, "tid": -914061504, "ts": 1716454222554300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222639297, "dur": 132, "args": { "External id": 83021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83021, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83021, "pid": 5, "tid": 7, "ts": 1716454222639297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554373, "dur": 11, "args": { "External id": 83021, "cbid": 211, "correlation": 83021 } }, { "ph": "s", "id": 83021, "pid": 76337, "tid": -914061504, "ts": 1716454222554373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222639431, "dur": 4, "args": { "External id": 83033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83033, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 83033, "pid": 5, "tid": 7, "ts": 1716454222639431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554395, "dur": 7, "args": { "External id": 83033, "cbid": 211, "correlation": 83033 } }, { "ph": "s", "id": 83033, "pid": 76337, "tid": -914061504, "ts": 1716454222554395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222639437, "dur": 162, "args": { "External id": 83036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83036, "pid": 5, "tid": 7, "ts": 1716454222639437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554413, "dur": 7, "args": { "External id": 83036, "cbid": 211, "correlation": 83036 } }, { "ph": "s", "id": 83036, "pid": 76337, "tid": -914061504, "ts": 1716454222554413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222639600, "dur": 101, "args": { "External id": 83045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83045, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83045, "pid": 5, "tid": 7, "ts": 1716454222639600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554454, "dur": 10, "args": { "External id": 83045, "cbid": 211, "correlation": 83045 } }, { "ph": "s", "id": 83045, "pid": 76337, "tid": -914061504, "ts": 1716454222554454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222554508, "dur": 0, "args": { "External id": 83055, "cbid": 317, "correlation": 83055 } }, { "ph": "f", "id": 83055, "pid": 76337, "tid": -914061504, "ts": 1716454222554508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222554508, "dur": 0, "args": { "External id": 83056, "cbid": 203, "correlation": 83056 } }, { "ph": "f", "id": 83056, "pid": 76337, "tid": -914061504, "ts": 1716454222554508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222554509, "dur": 0, "args": { "External id": 83057, "cbid": 205, "correlation": 83057 } }, { "ph": "f", "id": 83057, "pid": 76337, "tid": -914061504, "ts": 1716454222554509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222639702, "dur": 112, "args": { "External id": 83061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83061, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83061, "pid": 5, "tid": 7, "ts": 1716454222639702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554525, "dur": 11, "args": { "External id": 83061, "cbid": 211, "correlation": 83061 } }, { "ph": "s", "id": 83061, "pid": 76337, "tid": -914061504, "ts": 1716454222554525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222639815, "dur": 34, "args": { "External id": 83063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83063, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83063, "pid": 5, "tid": 7, "ts": 1716454222639815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554539, "dur": 5, "args": { "External id": 83063, "cbid": 211, "correlation": 83063 } }, { "ph": "s", "id": 83063, "pid": 76337, "tid": -914061504, "ts": 1716454222554539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222639851, "dur": 3, "args": { "External id": 83065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83065, "pid": 5, "tid": 7, "ts": 1716454222639851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554549, "dur": 5, "args": { "External id": 83065, "cbid": 211, "correlation": 83065 } }, { "ph": "s", "id": 83065, "pid": 76337, "tid": -914061504, "ts": 1716454222554549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222554558, "dur": 0, "args": { "External id": 83066, "cbid": 51, "correlation": 83066 } }, { "ph": "s", "id": 83066, "pid": 76337, "tid": -914061504, "ts": 1716454222554558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222639856, "dur": 2045, "args": { "External id": 83067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83067, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83067, "pid": 5, "tid": 7, "ts": 1716454222639856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554558, "dur": 6, "args": { "External id": 83067, "cbid": 211, "correlation": 83067 } }, { "ph": "s", "id": 83067, "pid": 76337, "tid": -914061504, "ts": 1716454222554558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222641902, "dur": 59, "args": { "External id": 83072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83072, "pid": 5, "tid": 7, "ts": 1716454222641902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554587, "dur": 8, "args": { "External id": 83072, "cbid": 211, "correlation": 83072 } }, { "ph": "s", "id": 83072, "pid": 76337, "tid": -914061504, "ts": 1716454222554587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222641963, "dur": 3, "args": { "External id": 83080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83080, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83080, "pid": 5, "tid": 7, "ts": 1716454222641963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554631, "dur": 9, "args": { "External id": 83080, "cbid": 211, "correlation": 83080 } }, { "ph": "s", "id": 83080, "pid": 76337, "tid": -914061504, "ts": 1716454222554631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222554696, "dur": 1, "args": { "External id": 83096, "cbid": 251, "correlation": 83096 } }, { "ph": "f", "id": 83096, "pid": 76337, "tid": -914061504, "ts": 1716454222554696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222554701, "dur": 0, "args": { "External id": 83098, "cbid": 251, "correlation": 83098 } }, { "ph": "f", "id": 83098, "pid": 76337, "tid": -914061504, "ts": 1716454222554701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222641968, "dur": 11, "args": { "External id": 83099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83099, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 83099, "pid": 5, "tid": 7, "ts": 1716454222641968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554703, "dur": 11, "args": { "External id": 83099, "cbid": 211, "correlation": 83099 } }, { "ph": "s", "id": 83099, "pid": 76337, "tid": -914061504, "ts": 1716454222554703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222641980, "dur": 5, "args": { "External id": 83101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83101, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 83101, "pid": 5, "tid": 7, "ts": 1716454222641980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554716, "dur": 5, "args": { "External id": 83101, "cbid": 211, "correlation": 83101 } }, { "ph": "s", "id": 83101, "pid": 76337, "tid": -914061504, "ts": 1716454222554716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222641987, "dur": 54, "args": { "External id": 83111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83111, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83111, "pid": 5, "tid": 7, "ts": 1716454222641987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554773, "dur": 13, "args": { "External id": 83111, "cbid": 211, "correlation": 83111 } }, { "ph": "s", "id": 83111, "pid": 76337, "tid": -914061504, "ts": 1716454222554773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222642042, "dur": 52, "args": { "External id": 83131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83131, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83131, "pid": 5, "tid": 7, "ts": 1716454222642042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554840, "dur": 12, "args": { "External id": 83131, "cbid": 211, "correlation": 83131 } }, { "ph": "s", "id": 83131, "pid": 76337, "tid": -914061504, "ts": 1716454222554840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222642096, "dur": 4, "args": { "External id": 83143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83143, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83143, "pid": 5, "tid": 7, "ts": 1716454222642096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554861, "dur": 6, "args": { "External id": 83143, "cbid": 211, "correlation": 83143 } }, { "ph": "s", "id": 83143, "pid": 76337, "tid": -914061504, "ts": 1716454222554861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222642101, "dur": 57, "args": { "External id": 83146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83146, "pid": 5, "tid": 7, "ts": 1716454222642101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554879, "dur": 6, "args": { "External id": 83146, "cbid": 211, "correlation": 83146 } }, { "ph": "s", "id": 83146, "pid": 76337, "tid": -914061504, "ts": 1716454222554879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222642159, "dur": 37, "args": { "External id": 83155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83155, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83155, "pid": 5, "tid": 7, "ts": 1716454222642159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222554920, "dur": 10, "args": { "External id": 83155, "cbid": 211, "correlation": 83155 } }, { "ph": "s", "id": 83155, "pid": 76337, "tid": -914061504, "ts": 1716454222554920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222554991, "dur": 0, "args": { "External id": 83165, "cbid": 317, "correlation": 83165 } }, { "ph": "f", "id": 83165, "pid": 76337, "tid": -914061504, "ts": 1716454222554991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222554992, "dur": 0, "args": { "External id": 83166, "cbid": 203, "correlation": 83166 } }, { "ph": "f", "id": 83166, "pid": 76337, "tid": -914061504, "ts": 1716454222554992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222554993, "dur": 0, "args": { "External id": 83167, "cbid": 205, "correlation": 83167 } }, { "ph": "f", "id": 83167, "pid": 76337, "tid": -914061504, "ts": 1716454222554993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222642198, "dur": 40, "args": { "External id": 83171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83171, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83171, "pid": 5, "tid": 7, "ts": 1716454222642198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555010, "dur": 13, "args": { "External id": 83171, "cbid": 211, "correlation": 83171 } }, { "ph": "s", "id": 83171, "pid": 76337, "tid": -914061504, "ts": 1716454222555010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222642239, "dur": 14, "args": { "External id": 83173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83173, "pid": 5, "tid": 7, "ts": 1716454222642239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555025, "dur": 5, "args": { "External id": 83173, "cbid": 211, "correlation": 83173 } }, { "ph": "s", "id": 83173, "pid": 76337, "tid": -914061504, "ts": 1716454222555025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222642255, "dur": 4, "args": { "External id": 83175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83175, "pid": 5, "tid": 7, "ts": 1716454222642255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555035, "dur": 5, "args": { "External id": 83175, "cbid": 211, "correlation": 83175 } }, { "ph": "s", "id": 83175, "pid": 76337, "tid": -914061504, "ts": 1716454222555035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222555043, "dur": 0, "args": { "External id": 83176, "cbid": 51, "correlation": 83176 } }, { "ph": "s", "id": 83176, "pid": 76337, "tid": -914061504, "ts": 1716454222555043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222642260, "dur": 711, "args": { "External id": 83177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83177, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83177, "pid": 5, "tid": 7, "ts": 1716454222642260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555044, "dur": 5, "args": { "External id": 83177, "cbid": 211, "correlation": 83177 } }, { "ph": "s", "id": 83177, "pid": 76337, "tid": -914061504, "ts": 1716454222555044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222642972, "dur": 59, "args": { "External id": 83182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83182, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83182, "pid": 5, "tid": 7, "ts": 1716454222642972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555073, "dur": 9, "args": { "External id": 83182, "cbid": 211, "correlation": 83182 } }, { "ph": "s", "id": 83182, "pid": 76337, "tid": -914061504, "ts": 1716454222555073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222555130, "dur": 0, "args": { "External id": 83192, "cbid": 317, "correlation": 83192 } }, { "ph": "f", "id": 83192, "pid": 76337, "tid": -914061504, "ts": 1716454222555130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222555131, "dur": 0, "args": { "External id": 83193, "cbid": 203, "correlation": 83193 } }, { "ph": "f", "id": 83193, "pid": 76337, "tid": -914061504, "ts": 1716454222555131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222555131, "dur": 0, "args": { "External id": 83194, "cbid": 205, "correlation": 83194 } }, { "ph": "f", "id": 83194, "pid": 76337, "tid": -914061504, "ts": 1716454222555131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222643032, "dur": 3, "args": { "External id": 83198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83198, "pid": 5, "tid": 7, "ts": 1716454222643032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555186, "dur": 12, "args": { "External id": 83198, "cbid": 211, "correlation": 83198 } }, { "ph": "s", "id": 83198, "pid": 76337, "tid": -914061504, "ts": 1716454222555186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222555209, "dur": 0, "args": { "External id": 83199, "cbid": 51, "correlation": 83199 } }, { "ph": "s", "id": 83199, "pid": 76337, "tid": -914061504, "ts": 1716454222555209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454222643037, "dur": 269, "args": { "External id": 83200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83200, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83200, "pid": 5, "tid": 7, "ts": 1716454222643037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555210, "dur": 7, "args": { "External id": 83200, "cbid": 211, "correlation": 83200 } }, { "ph": "s", "id": 83200, "pid": 76337, "tid": -914061504, "ts": 1716454222555210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222643307, "dur": 59, "args": { "External id": 83205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83205, "pid": 5, "tid": 7, "ts": 1716454222643307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555240, "dur": 9, "args": { "External id": 83205, "cbid": 211, "correlation": 83205 } }, { "ph": "s", "id": 83205, "pid": 76337, "tid": -914061504, "ts": 1716454222555240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222643368, "dur": 50, "args": { "External id": 83213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83213, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83213, "pid": 5, "tid": 7, "ts": 1716454222643368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555270, "dur": 8, "args": { "External id": 83213, "cbid": 211, "correlation": 83213 } }, { "ph": "s", "id": 83213, "pid": 76337, "tid": -914061504, "ts": 1716454222555270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222643419, "dur": 35, "args": { "External id": 83221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83221, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83221, "pid": 5, "tid": 7, "ts": 1716454222643419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555300, "dur": 8, "args": { "External id": 83221, "cbid": 211, "correlation": 83221 } }, { "ph": "s", "id": 83221, "pid": 76337, "tid": -914061504, "ts": 1716454222555300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222643456, "dur": 52, "args": { "External id": 83241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83241, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83241, "pid": 5, "tid": 7, "ts": 1716454222643456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555382, "dur": 12, "args": { "External id": 83241, "cbid": 211, "correlation": 83241 } }, { "ph": "s", "id": 83241, "pid": 76337, "tid": -914061504, "ts": 1716454222555382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222643509, "dur": 4, "args": { "External id": 83253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83253, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83253, "pid": 5, "tid": 7, "ts": 1716454222643509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555404, "dur": 6, "args": { "External id": 83253, "cbid": 211, "correlation": 83253 } }, { "ph": "s", "id": 83253, "pid": 76337, "tid": -914061504, "ts": 1716454222555404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222643514, "dur": 57, "args": { "External id": 83256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83256, "pid": 5, "tid": 7, "ts": 1716454222643514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555423, "dur": 7, "args": { "External id": 83256, "cbid": 211, "correlation": 83256 } }, { "ph": "s", "id": 83256, "pid": 76337, "tid": -914061504, "ts": 1716454222555423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222555479, "dur": 0, "args": { "External id": 83267, "cbid": 317, "correlation": 83267 } }, { "ph": "f", "id": 83267, "pid": 76337, "tid": -914061504, "ts": 1716454222555479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222555480, "dur": 0, "args": { "External id": 83268, "cbid": 203, "correlation": 83268 } }, { "ph": "f", "id": 83268, "pid": 76337, "tid": -914061504, "ts": 1716454222555480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222555481, "dur": 0, "args": { "External id": 83269, "cbid": 205, "correlation": 83269 } }, { "ph": "f", "id": 83269, "pid": 76337, "tid": -914061504, "ts": 1716454222555481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555511, "dur": 2, "args": { "External id": 83273, "cbid": 251, "correlation": 83273 } }, { "ph": "f", "id": 83273, "pid": 76337, "tid": -914061504, "ts": 1716454222555511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555514, "dur": 0, "args": { "External id": 83274, "cbid": 251, "correlation": 83274 } }, { "ph": "f", "id": 83274, "pid": 76337, "tid": -914061504, "ts": 1716454222555514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555515, "dur": 1, "args": { "External id": 83275, "cbid": 251, "correlation": 83275 } }, { "ph": "f", "id": 83275, "pid": 76337, "tid": -914061504, "ts": 1716454222555515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555517, "dur": 0, "args": { "External id": 83276, "cbid": 251, "correlation": 83276 } }, { "ph": "f", "id": 83276, "pid": 76337, "tid": -914061504, "ts": 1716454222555517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555519, "dur": 1, "args": { "External id": 83277, "cbid": 251, "correlation": 83277 } }, { "ph": "f", "id": 83277, "pid": 76337, "tid": -914061504, "ts": 1716454222555519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555520, "dur": 1, "args": { "External id": 83278, "cbid": 251, "correlation": 83278 } }, { "ph": "f", "id": 83278, "pid": 76337, "tid": -914061504, "ts": 1716454222555520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555522, "dur": 1, "args": { "External id": 83279, "cbid": 251, "correlation": 83279 } }, { "ph": "f", "id": 83279, "pid": 76337, "tid": -914061504, "ts": 1716454222555522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555524, "dur": 1, "args": { "External id": 83280, "cbid": 251, "correlation": 83280 } }, { "ph": "f", "id": 83280, "pid": 76337, "tid": -914061504, "ts": 1716454222555524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555526, "dur": 0, "args": { "External id": 83281, "cbid": 251, "correlation": 83281 } }, { "ph": "f", "id": 83281, "pid": 76337, "tid": -914061504, "ts": 1716454222555526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222643573, "dur": 116, "args": { "External id": 83282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83282, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83282, "pid": 5, "tid": 7, "ts": 1716454222643573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555528, "dur": 13, "args": { "External id": 83282, "cbid": 211, "correlation": 83282 } }, { "ph": "s", "id": 83282, "pid": 76337, "tid": -914061504, "ts": 1716454222555528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222643690, "dur": 60, "args": { "External id": 83288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83288, "pid": 5, "tid": 7, "ts": 1716454222643690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555565, "dur": 9, "args": { "External id": 83288, "cbid": 211, "correlation": 83288 } }, { "ph": "s", "id": 83288, "pid": 76337, "tid": -914061504, "ts": 1716454222555565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222643752, "dur": 552, "args": { "External id": 83297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83297, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83297, "pid": 5, "tid": 7, "ts": 1716454222643752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555648, "dur": 14, "args": { "External id": 83297, "cbid": 211, "correlation": 83297 } }, { "ph": "s", "id": 83297, "pid": 76337, "tid": -914061504, "ts": 1716454222555648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222644305, "dur": 184, "args": { "External id": 83319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83319, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83319, "pid": 5, "tid": 7, "ts": 1716454222644305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555706, "dur": 10, "args": { "External id": 83319, "cbid": 211, "correlation": 83319 } }, { "ph": "s", "id": 83319, "pid": 76337, "tid": -914061504, "ts": 1716454222555706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555796, "dur": 1, "args": { "External id": 83330, "cbid": 251, "correlation": 83330 } }, { "ph": "f", "id": 83330, "pid": 76337, "tid": -914061504, "ts": 1716454222555796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222644491, "dur": 198, "args": { "External id": 83331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83331, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83331, "pid": 5, "tid": 7, "ts": 1716454222644491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555801, "dur": 13, "args": { "External id": 83331, "cbid": 211, "correlation": 83331 } }, { "ph": "s", "id": 83331, "pid": 76337, "tid": -914061504, "ts": 1716454222555801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555870, "dur": 1, "args": { "External id": 83342, "cbid": 251, "correlation": 83342 } }, { "ph": "f", "id": 83342, "pid": 76337, "tid": -914061504, "ts": 1716454222555870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222644690, "dur": 191, "args": { "External id": 83343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83343, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83343, "pid": 5, "tid": 7, "ts": 1716454222644690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555875, "dur": 12, "args": { "External id": 83343, "cbid": 211, "correlation": 83343 } }, { "ph": "s", "id": 83343, "pid": 76337, "tid": -914061504, "ts": 1716454222555875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222555939, "dur": 1, "args": { "External id": 83354, "cbid": 251, "correlation": 83354 } }, { "ph": "f", "id": 83354, "pid": 76337, "tid": -914061504, "ts": 1716454222555939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222644882, "dur": 190, "args": { "External id": 83355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83355, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83355, "pid": 5, "tid": 7, "ts": 1716454222644882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222555943, "dur": 11, "args": { "External id": 83355, "cbid": 211, "correlation": 83355 } }, { "ph": "s", "id": 83355, "pid": 76337, "tid": -914061504, "ts": 1716454222555943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222645073, "dur": 18937, "args": { "External id": 83376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83376, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83376, "pid": 5, "tid": 7, "ts": 1716454222645073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556033, "dur": 15, "args": { "External id": 83376, "cbid": 211, "correlation": 83376 } }, { "ph": "s", "id": 83376, "pid": 76337, "tid": -914061504, "ts": 1716454222556033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556133, "dur": 1, "args": { "External id": 83394, "cbid": 251, "correlation": 83394 } }, { "ph": "f", "id": 83394, "pid": 76337, "tid": -914061504, "ts": 1716454222556133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222664011, "dur": 206, "args": { "External id": 83396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83396, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83396, "pid": 5, "tid": 7, "ts": 1716454222664011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556139, "dur": 14, "args": { "External id": 83396, "cbid": 211, "correlation": 83396 } }, { "ph": "s", "id": 83396, "pid": 76337, "tid": -914061504, "ts": 1716454222556139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222664218, "dur": 66, "args": { "External id": 83404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83404, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83404, "pid": 5, "tid": 7, "ts": 1716454222664218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556211, "dur": 12, "args": { "External id": 83404, "cbid": 211, "correlation": 83404 } }, { "ph": "s", "id": 83404, "pid": 76337, "tid": -914061504, "ts": 1716454222556211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222664286, "dur": 97, "args": { "External id": 83412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83412, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83412, "pid": 5, "tid": 7, "ts": 1716454222664286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556250, "dur": 9, "args": { "External id": 83412, "cbid": 211, "correlation": 83412 } }, { "ph": "s", "id": 83412, "pid": 76337, "tid": -914061504, "ts": 1716454222556250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222664384, "dur": 55, "args": { "External id": 83423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83423, "pid": 5, "tid": 7, "ts": 1716454222664384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556323, "dur": 13, "args": { "External id": 83423, "cbid": 211, "correlation": 83423 } }, { "ph": "s", "id": 83423, "pid": 76337, "tid": -914061504, "ts": 1716454222556323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222664440, "dur": 94, "args": { "External id": 83445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83445, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83445, "pid": 5, "tid": 7, "ts": 1716454222664440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556356, "dur": 7, "args": { "External id": 83445, "cbid": 211, "correlation": 83445 } }, { "ph": "s", "id": 83445, "pid": 76337, "tid": -914061504, "ts": 1716454222556356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556439, "dur": 1, "args": { "External id": 83456, "cbid": 251, "correlation": 83456 } }, { "ph": "f", "id": 83456, "pid": 76337, "tid": -914061504, "ts": 1716454222556439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222664535, "dur": 104, "args": { "External id": 83457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83457, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83457, "pid": 5, "tid": 7, "ts": 1716454222664535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556444, "dur": 14, "args": { "External id": 83457, "cbid": 211, "correlation": 83457 } }, { "ph": "s", "id": 83457, "pid": 76337, "tid": -914061504, "ts": 1716454222556444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556518, "dur": 1, "args": { "External id": 83468, "cbid": 251, "correlation": 83468 } }, { "ph": "f", "id": 83468, "pid": 76337, "tid": -914061504, "ts": 1716454222556518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556522, "dur": 0, "args": { "External id": 83469, "cbid": 251, "correlation": 83469 } }, { "ph": "f", "id": 83469, "pid": 76337, "tid": -914061504, "ts": 1716454222556522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222664641, "dur": 10, "args": { "External id": 83470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83470, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 83470, "pid": 5, "tid": 7, "ts": 1716454222664641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556524, "dur": 13, "args": { "External id": 83470, "cbid": 211, "correlation": 83470 } }, { "ph": "s", "id": 83470, "pid": 76337, "tid": -914061504, "ts": 1716454222556524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222664652, "dur": 5, "args": { "External id": 83472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83472, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 83472, "pid": 5, "tid": 7, "ts": 1716454222664652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556540, "dur": 7, "args": { "External id": 83472, "cbid": 211, "correlation": 83472 } }, { "ph": "s", "id": 83472, "pid": 76337, "tid": -914061504, "ts": 1716454222556540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556602, "dur": 1, "args": { "External id": 83483, "cbid": 251, "correlation": 83483 } }, { "ph": "f", "id": 83483, "pid": 76337, "tid": -914061504, "ts": 1716454222556602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556605, "dur": 0, "args": { "External id": 83484, "cbid": 251, "correlation": 83484 } }, { "ph": "f", "id": 83484, "pid": 76337, "tid": -914061504, "ts": 1716454222556605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222664659, "dur": 6, "args": { "External id": 83485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83485, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 83485, "pid": 5, "tid": 7, "ts": 1716454222664659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556607, "dur": 12, "args": { "External id": 83485, "cbid": 211, "correlation": 83485 } }, { "ph": "s", "id": 83485, "pid": 76337, "tid": -914061504, "ts": 1716454222556607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222664666, "dur": 4, "args": { "External id": 83487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83487, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 83487, "pid": 5, "tid": 7, "ts": 1716454222664666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556620, "dur": 5, "args": { "External id": 83487, "cbid": 211, "correlation": 83487 } }, { "ph": "s", "id": 83487, "pid": 76337, "tid": -914061504, "ts": 1716454222556620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222664671, "dur": 157, "args": { "External id": 83508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83508, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83508, "pid": 5, "tid": 7, "ts": 1716454222664671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556693, "dur": 12, "args": { "External id": 83508, "cbid": 211, "correlation": 83508 } }, { "ph": "s", "id": 83508, "pid": 76337, "tid": -914061504, "ts": 1716454222556693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222556790, "dur": 1, "args": { "External id": 83526, "cbid": 251, "correlation": 83526 } }, { "ph": "f", "id": 83526, "pid": 76337, "tid": -914061504, "ts": 1716454222556790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222664830, "dur": 109, "args": { "External id": 83528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83528, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83528, "pid": 5, "tid": 7, "ts": 1716454222664830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556796, "dur": 13, "args": { "External id": 83528, "cbid": 211, "correlation": 83528 } }, { "ph": "s", "id": 83528, "pid": 76337, "tid": -914061504, "ts": 1716454222556796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222664941, "dur": 35, "args": { "External id": 83536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83536, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83536, "pid": 5, "tid": 7, "ts": 1716454222664941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556865, "dur": 12, "args": { "External id": 83536, "cbid": 211, "correlation": 83536 } }, { "ph": "s", "id": 83536, "pid": 76337, "tid": -914061504, "ts": 1716454222556865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222664977, "dur": 67, "args": { "External id": 83544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83544, "pid": 5, "tid": 7, "ts": 1716454222664977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556906, "dur": 9, "args": { "External id": 83544, "cbid": 211, "correlation": 83544 } }, { "ph": "s", "id": 83544, "pid": 76337, "tid": -914061504, "ts": 1716454222556906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222665045, "dur": 94, "args": { "External id": 83566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83566, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83566, "pid": 5, "tid": 7, "ts": 1716454222665045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222556958, "dur": 10, "args": { "External id": 83566, "cbid": 211, "correlation": 83566 } }, { "ph": "s", "id": 83566, "pid": 76337, "tid": -914061504, "ts": 1716454222556958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557055, "dur": 1, "args": { "External id": 83582, "cbid": 251, "correlation": 83582 } }, { "ph": "f", "id": 83582, "pid": 76337, "tid": -914061504, "ts": 1716454222557055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222665141, "dur": 583, "args": { "External id": 83584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83584, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83584, "pid": 5, "tid": 7, "ts": 1716454222665141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557061, "dur": 13, "args": { "External id": 83584, "cbid": 211, "correlation": 83584 } }, { "ph": "s", "id": 83584, "pid": 76337, "tid": -914061504, "ts": 1716454222557061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222665725, "dur": 245, "args": { "External id": 83592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83592, "pid": 5, "tid": 7, "ts": 1716454222665725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557127, "dur": 14, "args": { "External id": 83592, "cbid": 211, "correlation": 83592 } }, { "ph": "s", "id": 83592, "pid": 76337, "tid": -914061504, "ts": 1716454222557127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222665972, "dur": 255, "args": { "External id": 83600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83600, "pid": 5, "tid": 7, "ts": 1716454222665972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557159, "dur": 8, "args": { "External id": 83600, "cbid": 211, "correlation": 83600 } }, { "ph": "s", "id": 83600, "pid": 76337, "tid": -914061504, "ts": 1716454222557159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557242, "dur": 2, "args": { "External id": 83616, "cbid": 251, "correlation": 83616 } }, { "ph": "f", "id": 83616, "pid": 76337, "tid": -914061504, "ts": 1716454222557242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557248, "dur": 0, "args": { "External id": 83618, "cbid": 251, "correlation": 83618 } }, { "ph": "f", "id": 83618, "pid": 76337, "tid": -914061504, "ts": 1716454222557248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222666229, "dur": 359, "args": { "External id": 83619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83619, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 83619, "pid": 5, "tid": 7, "ts": 1716454222666229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557250, "dur": 13, "args": { "External id": 83619, "cbid": 211, "correlation": 83619 } }, { "ph": "s", "id": 83619, "pid": 76337, "tid": -914061504, "ts": 1716454222557250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222666589, "dur": 50, "args": { "External id": 83627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83627, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83627, "pid": 5, "tid": 7, "ts": 1716454222666589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557292, "dur": 10, "args": { "External id": 83627, "cbid": 211, "correlation": 83627 } }, { "ph": "s", "id": 83627, "pid": 76337, "tid": -914061504, "ts": 1716454222557292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222666641, "dur": 161, "args": { "External id": 83638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83638, "pid": 5, "tid": 7, "ts": 1716454222666641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557360, "dur": 12, "args": { "External id": 83638, "cbid": 211, "correlation": 83638 } }, { "ph": "s", "id": 83638, "pid": 76337, "tid": -914061504, "ts": 1716454222557360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222557424, "dur": 0, "args": { "External id": 83650, "cbid": 317, "correlation": 83650 } }, { "ph": "f", "id": 83650, "pid": 76337, "tid": -914061504, "ts": 1716454222557424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222557424, "dur": 0, "args": { "External id": 83651, "cbid": 203, "correlation": 83651 } }, { "ph": "f", "id": 83651, "pid": 76337, "tid": -914061504, "ts": 1716454222557424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222557425, "dur": 0, "args": { "External id": 83652, "cbid": 205, "correlation": 83652 } }, { "ph": "f", "id": 83652, "pid": 76337, "tid": -914061504, "ts": 1716454222557425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557449, "dur": 1, "args": { "External id": 83656, "cbid": 251, "correlation": 83656 } }, { "ph": "f", "id": 83656, "pid": 76337, "tid": -914061504, "ts": 1716454222557449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557451, "dur": 0, "args": { "External id": 83657, "cbid": 251, "correlation": 83657 } }, { "ph": "f", "id": 83657, "pid": 76337, "tid": -914061504, "ts": 1716454222557451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557452, "dur": 0, "args": { "External id": 83658, "cbid": 251, "correlation": 83658 } }, { "ph": "f", "id": 83658, "pid": 76337, "tid": -914061504, "ts": 1716454222557452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557453, "dur": 0, "args": { "External id": 83659, "cbid": 251, "correlation": 83659 } }, { "ph": "f", "id": 83659, "pid": 76337, "tid": -914061504, "ts": 1716454222557453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557454, "dur": 1, "args": { "External id": 83660, "cbid": 251, "correlation": 83660 } }, { "ph": "f", "id": 83660, "pid": 76337, "tid": -914061504, "ts": 1716454222557454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557455, "dur": 0, "args": { "External id": 83661, "cbid": 251, "correlation": 83661 } }, { "ph": "f", "id": 83661, "pid": 76337, "tid": -914061504, "ts": 1716454222557455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557456, "dur": 0, "args": { "External id": 83662, "cbid": 251, "correlation": 83662 } }, { "ph": "f", "id": 83662, "pid": 76337, "tid": -914061504, "ts": 1716454222557456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557457, "dur": 0, "args": { "External id": 83663, "cbid": 251, "correlation": 83663 } }, { "ph": "f", "id": 83663, "pid": 76337, "tid": -914061504, "ts": 1716454222557457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557458, "dur": 0, "args": { "External id": 83664, "cbid": 251, "correlation": 83664 } }, { "ph": "f", "id": 83664, "pid": 76337, "tid": -914061504, "ts": 1716454222557458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222666804, "dur": 115, "args": { "External id": 83665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83665, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83665, "pid": 5, "tid": 7, "ts": 1716454222666804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557460, "dur": 12, "args": { "External id": 83665, "cbid": 211, "correlation": 83665 } }, { "ph": "s", "id": 83665, "pid": 76337, "tid": -914061504, "ts": 1716454222557460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222666920, "dur": 60, "args": { "External id": 83671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83671, "pid": 5, "tid": 7, "ts": 1716454222666920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557495, "dur": 9, "args": { "External id": 83671, "cbid": 211, "correlation": 83671 } }, { "ph": "s", "id": 83671, "pid": 76337, "tid": -914061504, "ts": 1716454222557495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222666981, "dur": 50, "args": { "External id": 83679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83679, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83679, "pid": 5, "tid": 7, "ts": 1716454222666981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557526, "dur": 8, "args": { "External id": 83679, "cbid": 211, "correlation": 83679 } }, { "ph": "s", "id": 83679, "pid": 76337, "tid": -914061504, "ts": 1716454222557526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222667032, "dur": 99, "args": { "External id": 83688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83688, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83688, "pid": 5, "tid": 7, "ts": 1716454222667032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557565, "dur": 10, "args": { "External id": 83688, "cbid": 211, "correlation": 83688 } }, { "ph": "s", "id": 83688, "pid": 76337, "tid": -914061504, "ts": 1716454222557565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222667132, "dur": 92, "args": { "External id": 83708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83708, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83708, "pid": 5, "tid": 7, "ts": 1716454222667132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557637, "dur": 11, "args": { "External id": 83708, "cbid": 211, "correlation": 83708 } }, { "ph": "s", "id": 83708, "pid": 76337, "tid": -914061504, "ts": 1716454222557637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222667226, "dur": 5, "args": { "External id": 83720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83720, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 83720, "pid": 5, "tid": 7, "ts": 1716454222667226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557659, "dur": 23, "args": { "External id": 83720, "cbid": 211, "correlation": 83720 } }, { "ph": "s", "id": 83720, "pid": 76337, "tid": -914061504, "ts": 1716454222557659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222667232, "dur": 109, "args": { "External id": 83723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83723, "pid": 5, "tid": 7, "ts": 1716454222667232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557695, "dur": 7, "args": { "External id": 83723, "cbid": 211, "correlation": 83723 } }, { "ph": "s", "id": 83723, "pid": 76337, "tid": -914061504, "ts": 1716454222557695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222667343, "dur": 69, "args": { "External id": 83732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83732, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83732, "pid": 5, "tid": 7, "ts": 1716454222667343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557735, "dur": 10, "args": { "External id": 83732, "cbid": 211, "correlation": 83732 } }, { "ph": "s", "id": 83732, "pid": 76337, "tid": -914061504, "ts": 1716454222557735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222557787, "dur": 0, "args": { "External id": 83742, "cbid": 317, "correlation": 83742 } }, { "ph": "f", "id": 83742, "pid": 76337, "tid": -914061504, "ts": 1716454222557787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222557788, "dur": 0, "args": { "External id": 83743, "cbid": 203, "correlation": 83743 } }, { "ph": "f", "id": 83743, "pid": 76337, "tid": -914061504, "ts": 1716454222557788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222557788, "dur": 0, "args": { "External id": 83744, "cbid": 205, "correlation": 83744 } }, { "ph": "f", "id": 83744, "pid": 76337, "tid": -914061504, "ts": 1716454222557788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222667414, "dur": 77, "args": { "External id": 83748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83748, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83748, "pid": 5, "tid": 7, "ts": 1716454222667414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557803, "dur": 12, "args": { "External id": 83748, "cbid": 211, "correlation": 83748 } }, { "ph": "s", "id": 83748, "pid": 76337, "tid": -914061504, "ts": 1716454222557803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222667492, "dur": 24, "args": { "External id": 83750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83750, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83750, "pid": 5, "tid": 7, "ts": 1716454222667492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557817, "dur": 5, "args": { "External id": 83750, "cbid": 211, "correlation": 83750 } }, { "ph": "s", "id": 83750, "pid": 76337, "tid": -914061504, "ts": 1716454222557817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222667518, "dur": 4, "args": { "External id": 83752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83752, "pid": 5, "tid": 7, "ts": 1716454222667518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557829, "dur": 6, "args": { "External id": 83752, "cbid": 211, "correlation": 83752 } }, { "ph": "s", "id": 83752, "pid": 76337, "tid": -914061504, "ts": 1716454222557829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222557837, "dur": 0, "args": { "External id": 83753, "cbid": 51, "correlation": 83753 } }, { "ph": "s", "id": 83753, "pid": 76337, "tid": -914061504, "ts": 1716454222557837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222667523, "dur": 1385, "args": { "External id": 83754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83754, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83754, "pid": 5, "tid": 7, "ts": 1716454222667523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557838, "dur": 5, "args": { "External id": 83754, "cbid": 211, "correlation": 83754 } }, { "ph": "s", "id": 83754, "pid": 76337, "tid": -914061504, "ts": 1716454222557838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222668910, "dur": 60, "args": { "External id": 83759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83759, "pid": 5, "tid": 7, "ts": 1716454222668910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557865, "dur": 9, "args": { "External id": 83759, "cbid": 211, "correlation": 83759 } }, { "ph": "s", "id": 83759, "pid": 76337, "tid": -914061504, "ts": 1716454222557865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222668971, "dur": 3, "args": { "External id": 83767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83767, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83767, "pid": 5, "tid": 7, "ts": 1716454222668971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557909, "dur": 9, "args": { "External id": 83767, "cbid": 211, "correlation": 83767 } }, { "ph": "s", "id": 83767, "pid": 76337, "tid": -914061504, "ts": 1716454222557909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557982, "dur": 2, "args": { "External id": 83783, "cbid": 251, "correlation": 83783 } }, { "ph": "f", "id": 83783, "pid": 76337, "tid": -914061504, "ts": 1716454222557982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222557988, "dur": 0, "args": { "External id": 83785, "cbid": 251, "correlation": 83785 } }, { "ph": "f", "id": 83785, "pid": 76337, "tid": -914061504, "ts": 1716454222557988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222668976, "dur": 12, "args": { "External id": 83786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83786, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 83786, "pid": 5, "tid": 7, "ts": 1716454222668976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222557990, "dur": 12, "args": { "External id": 83786, "cbid": 211, "correlation": 83786 } }, { "ph": "s", "id": 83786, "pid": 76337, "tid": -914061504, "ts": 1716454222557990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222668989, "dur": 5, "args": { "External id": 83788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83788, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 83788, "pid": 5, "tid": 7, "ts": 1716454222668989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558004, "dur": 176, "args": { "External id": 83788, "cbid": 211, "correlation": 83788 } }, { "ph": "s", "id": 83788, "pid": 76337, "tid": -914061504, "ts": 1716454222558004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222668996, "dur": 56, "args": { "External id": 83798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83798, "pid": 5, "tid": 7, "ts": 1716454222668996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558234, "dur": 12, "args": { "External id": 83798, "cbid": 211, "correlation": 83798 } }, { "ph": "s", "id": 83798, "pid": 76337, "tid": -914061504, "ts": 1716454222558234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222669053, "dur": 53, "args": { "External id": 83818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83818, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83818, "pid": 5, "tid": 7, "ts": 1716454222669053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558301, "dur": 11, "args": { "External id": 83818, "cbid": 211, "correlation": 83818 } }, { "ph": "s", "id": 83818, "pid": 76337, "tid": -914061504, "ts": 1716454222558301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222669107, "dur": 4, "args": { "External id": 83830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83830, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83830, "pid": 5, "tid": 7, "ts": 1716454222669107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558321, "dur": 7, "args": { "External id": 83830, "cbid": 211, "correlation": 83830 } }, { "ph": "s", "id": 83830, "pid": 76337, "tid": -914061504, "ts": 1716454222558321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222669112, "dur": 55, "args": { "External id": 83833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83833, "pid": 5, "tid": 7, "ts": 1716454222669112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558341, "dur": 7, "args": { "External id": 83833, "cbid": 211, "correlation": 83833 } }, { "ph": "s", "id": 83833, "pid": 76337, "tid": -914061504, "ts": 1716454222558341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222669169, "dur": 36, "args": { "External id": 83842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83842, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83842, "pid": 5, "tid": 7, "ts": 1716454222669169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558382, "dur": 10, "args": { "External id": 83842, "cbid": 211, "correlation": 83842 } }, { "ph": "s", "id": 83842, "pid": 76337, "tid": -914061504, "ts": 1716454222558382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222558445, "dur": 0, "args": { "External id": 83852, "cbid": 317, "correlation": 83852 } }, { "ph": "f", "id": 83852, "pid": 76337, "tid": -914061504, "ts": 1716454222558445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222558446, "dur": 0, "args": { "External id": 83853, "cbid": 203, "correlation": 83853 } }, { "ph": "f", "id": 83853, "pid": 76337, "tid": -914061504, "ts": 1716454222558446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222558447, "dur": 0, "args": { "External id": 83854, "cbid": 205, "correlation": 83854 } }, { "ph": "f", "id": 83854, "pid": 76337, "tid": -914061504, "ts": 1716454222558447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222669206, "dur": 40, "args": { "External id": 83858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83858, "pid": 5, "tid": 7, "ts": 1716454222669206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558460, "dur": 12, "args": { "External id": 83858, "cbid": 211, "correlation": 83858 } }, { "ph": "s", "id": 83858, "pid": 76337, "tid": -914061504, "ts": 1716454222558460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222669248, "dur": 14, "args": { "External id": 83860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83860, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83860, "pid": 5, "tid": 7, "ts": 1716454222669248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558475, "dur": 5, "args": { "External id": 83860, "cbid": 211, "correlation": 83860 } }, { "ph": "s", "id": 83860, "pid": 76337, "tid": -914061504, "ts": 1716454222558475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222669264, "dur": 3, "args": { "External id": 83862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83862, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83862, "pid": 5, "tid": 7, "ts": 1716454222669264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558485, "dur": 6, "args": { "External id": 83862, "cbid": 211, "correlation": 83862 } }, { "ph": "s", "id": 83862, "pid": 76337, "tid": -914061504, "ts": 1716454222558485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222558493, "dur": 0, "args": { "External id": 83863, "cbid": 51, "correlation": 83863 } }, { "ph": "s", "id": 83863, "pid": 76337, "tid": -914061504, "ts": 1716454222558493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222669269, "dur": 709, "args": { "External id": 83864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83864, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83864, "pid": 5, "tid": 7, "ts": 1716454222669269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558494, "dur": 5, "args": { "External id": 83864, "cbid": 211, "correlation": 83864 } }, { "ph": "s", "id": 83864, "pid": 76337, "tid": -914061504, "ts": 1716454222558494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222669979, "dur": 60, "args": { "External id": 83869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83869, "pid": 5, "tid": 7, "ts": 1716454222669979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558521, "dur": 9, "args": { "External id": 83869, "cbid": 211, "correlation": 83869 } }, { "ph": "s", "id": 83869, "pid": 76337, "tid": -914061504, "ts": 1716454222558521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222558579, "dur": 0, "args": { "External id": 83879, "cbid": 317, "correlation": 83879 } }, { "ph": "f", "id": 83879, "pid": 76337, "tid": -914061504, "ts": 1716454222558579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222558579, "dur": 0, "args": { "External id": 83880, "cbid": 203, "correlation": 83880 } }, { "ph": "f", "id": 83880, "pid": 76337, "tid": -914061504, "ts": 1716454222558579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222558580, "dur": 0, "args": { "External id": 83881, "cbid": 205, "correlation": 83881 } }, { "ph": "f", "id": 83881, "pid": 76337, "tid": -914061504, "ts": 1716454222558580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222670040, "dur": 74, "args": { "External id": 83885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83885, "pid": 5, "tid": 7, "ts": 1716454222670040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558593, "dur": 12, "args": { "External id": 83885, "cbid": 211, "correlation": 83885 } }, { "ph": "s", "id": 83885, "pid": 76337, "tid": -914061504, "ts": 1716454222558593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222670115, "dur": 209, "args": { "External id": 83887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83887, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 83887, "pid": 5, "tid": 7, "ts": 1716454222670115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558613, "dur": 8, "args": { "External id": 83887, "cbid": 211, "correlation": 83887 } }, { "ph": "s", "id": 83887, "pid": 76337, "tid": -914061504, "ts": 1716454222558613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222670325, "dur": 40, "args": { "External id": 83889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83889, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83889, "pid": 5, "tid": 7, "ts": 1716454222670325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222558625, "dur": 495, "args": { "External id": 83889, "cbid": 211, "correlation": 83889 } }, { "ph": "s", "id": 83889, "pid": 76337, "tid": -914061504, "ts": 1716454222558625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222670366, "dur": 61, "args": { "External id": 83895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83895, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83895, "pid": 5, "tid": 7, "ts": 1716454222670366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559142, "dur": 10, "args": { "External id": 83895, "cbid": 211, "correlation": 83895 } }, { "ph": "s", "id": 83895, "pid": 76337, "tid": -914061504, "ts": 1716454222559142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222670429, "dur": 50, "args": { "External id": 83903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83903, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83903, "pid": 5, "tid": 7, "ts": 1716454222670429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559174, "dur": 8, "args": { "External id": 83903, "cbid": 211, "correlation": 83903 } }, { "ph": "s", "id": 83903, "pid": 76337, "tid": -914061504, "ts": 1716454222559174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222670481, "dur": 35, "args": { "External id": 83911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83911, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83911, "pid": 5, "tid": 7, "ts": 1716454222670481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559202, "dur": 28, "args": { "External id": 83911, "cbid": 211, "correlation": 83911 } }, { "ph": "s", "id": 83911, "pid": 76337, "tid": -914061504, "ts": 1716454222559202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222670517, "dur": 53, "args": { "External id": 83931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83931, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 83931, "pid": 5, "tid": 7, "ts": 1716454222670517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559306, "dur": 13, "args": { "External id": 83931, "cbid": 211, "correlation": 83931 } }, { "ph": "s", "id": 83931, "pid": 76337, "tid": -914061504, "ts": 1716454222559306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222670571, "dur": 4, "args": { "External id": 83943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83943, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 83943, "pid": 5, "tid": 7, "ts": 1716454222670571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559328, "dur": 6, "args": { "External id": 83943, "cbid": 211, "correlation": 83943 } }, { "ph": "s", "id": 83943, "pid": 76337, "tid": -914061504, "ts": 1716454222559328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222670577, "dur": 55, "args": { "External id": 83946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83946, "pid": 5, "tid": 7, "ts": 1716454222670577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559346, "dur": 6, "args": { "External id": 83946, "cbid": 211, "correlation": 83946 } }, { "ph": "s", "id": 83946, "pid": 76337, "tid": -914061504, "ts": 1716454222559346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222559404, "dur": 0, "args": { "External id": 83957, "cbid": 317, "correlation": 83957 } }, { "ph": "f", "id": 83957, "pid": 76337, "tid": -914061504, "ts": 1716454222559404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222559405, "dur": 0, "args": { "External id": 83958, "cbid": 203, "correlation": 83958 } }, { "ph": "f", "id": 83958, "pid": 76337, "tid": -914061504, "ts": 1716454222559405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222559405, "dur": 0, "args": { "External id": 83959, "cbid": 205, "correlation": 83959 } }, { "ph": "f", "id": 83959, "pid": 76337, "tid": -914061504, "ts": 1716454222559405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559429, "dur": 1, "args": { "External id": 83963, "cbid": 251, "correlation": 83963 } }, { "ph": "f", "id": 83963, "pid": 76337, "tid": -914061504, "ts": 1716454222559429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559431, "dur": 0, "args": { "External id": 83964, "cbid": 251, "correlation": 83964 } }, { "ph": "f", "id": 83964, "pid": 76337, "tid": -914061504, "ts": 1716454222559431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559432, "dur": 0, "args": { "External id": 83965, "cbid": 251, "correlation": 83965 } }, { "ph": "f", "id": 83965, "pid": 76337, "tid": -914061504, "ts": 1716454222559432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559433, "dur": 0, "args": { "External id": 83966, "cbid": 251, "correlation": 83966 } }, { "ph": "f", "id": 83966, "pid": 76337, "tid": -914061504, "ts": 1716454222559433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559433, "dur": 0, "args": { "External id": 83967, "cbid": 251, "correlation": 83967 } }, { "ph": "f", "id": 83967, "pid": 76337, "tid": -914061504, "ts": 1716454222559433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559434, "dur": 0, "args": { "External id": 83968, "cbid": 251, "correlation": 83968 } }, { "ph": "f", "id": 83968, "pid": 76337, "tid": -914061504, "ts": 1716454222559434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559435, "dur": 0, "args": { "External id": 83969, "cbid": 251, "correlation": 83969 } }, { "ph": "f", "id": 83969, "pid": 76337, "tid": -914061504, "ts": 1716454222559435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559436, "dur": 0, "args": { "External id": 83970, "cbid": 251, "correlation": 83970 } }, { "ph": "f", "id": 83970, "pid": 76337, "tid": -914061504, "ts": 1716454222559436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559437, "dur": 0, "args": { "External id": 83971, "cbid": 251, "correlation": 83971 } }, { "ph": "f", "id": 83971, "pid": 76337, "tid": -914061504, "ts": 1716454222559437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222670633, "dur": 114, "args": { "External id": 83972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83972, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 83972, "pid": 5, "tid": 7, "ts": 1716454222670633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559439, "dur": 13, "args": { "External id": 83972, "cbid": 211, "correlation": 83972 } }, { "ph": "s", "id": 83972, "pid": 76337, "tid": -914061504, "ts": 1716454222559439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222670749, "dur": 60, "args": { "External id": 83978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83978, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83978, "pid": 5, "tid": 7, "ts": 1716454222670749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559475, "dur": 8, "args": { "External id": 83978, "cbid": 211, "correlation": 83978 } }, { "ph": "s", "id": 83978, "pid": 76337, "tid": -914061504, "ts": 1716454222559475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222670811, "dur": 456, "args": { "External id": 83987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 83987, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 83987, "pid": 5, "tid": 7, "ts": 1716454222670811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559556, "dur": 15, "args": { "External id": 83987, "cbid": 211, "correlation": 83987 } }, { "ph": "s", "id": 83987, "pid": 76337, "tid": -914061504, "ts": 1716454222559556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222671269, "dur": 185, "args": { "External id": 84009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84009, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84009, "pid": 5, "tid": 7, "ts": 1716454222671269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559614, "dur": 10, "args": { "External id": 84009, "cbid": 211, "correlation": 84009 } }, { "ph": "s", "id": 84009, "pid": 76337, "tid": -914061504, "ts": 1716454222559614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559701, "dur": 1, "args": { "External id": 84020, "cbid": 251, "correlation": 84020 } }, { "ph": "f", "id": 84020, "pid": 76337, "tid": -914061504, "ts": 1716454222559701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222671455, "dur": 200, "args": { "External id": 84021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84021, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84021, "pid": 5, "tid": 7, "ts": 1716454222671455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559706, "dur": 13, "args": { "External id": 84021, "cbid": 211, "correlation": 84021 } }, { "ph": "s", "id": 84021, "pid": 76337, "tid": -914061504, "ts": 1716454222559706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559774, "dur": 1, "args": { "External id": 84032, "cbid": 251, "correlation": 84032 } }, { "ph": "f", "id": 84032, "pid": 76337, "tid": -914061504, "ts": 1716454222559774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222671656, "dur": 195, "args": { "External id": 84033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84033, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84033, "pid": 5, "tid": 7, "ts": 1716454222671656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559779, "dur": 11, "args": { "External id": 84033, "cbid": 211, "correlation": 84033 } }, { "ph": "s", "id": 84033, "pid": 76337, "tid": -914061504, "ts": 1716454222559779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222559842, "dur": 1, "args": { "External id": 84044, "cbid": 251, "correlation": 84044 } }, { "ph": "f", "id": 84044, "pid": 76337, "tid": -914061504, "ts": 1716454222559842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222671853, "dur": 194, "args": { "External id": 84045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84045, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84045, "pid": 5, "tid": 7, "ts": 1716454222671853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559846, "dur": 11, "args": { "External id": 84045, "cbid": 211, "correlation": 84045 } }, { "ph": "s", "id": 84045, "pid": 76337, "tid": -914061504, "ts": 1716454222559846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222672048, "dur": 18957, "args": { "External id": 84066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84066, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84066, "pid": 5, "tid": 7, "ts": 1716454222672048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222559926, "dur": 13, "args": { "External id": 84066, "cbid": 211, "correlation": 84066 } }, { "ph": "s", "id": 84066, "pid": 76337, "tid": -914061504, "ts": 1716454222559926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222560033, "dur": 1, "args": { "External id": 84084, "cbid": 251, "correlation": 84084 } }, { "ph": "f", "id": 84084, "pid": 76337, "tid": -914061504, "ts": 1716454222560033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222691007, "dur": 204, "args": { "External id": 84086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84086, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84086, "pid": 5, "tid": 7, "ts": 1716454222691007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222560039, "dur": 14, "args": { "External id": 84086, "cbid": 211, "correlation": 84086 } }, { "ph": "s", "id": 84086, "pid": 76337, "tid": -914061504, "ts": 1716454222560039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222691212, "dur": 66, "args": { "External id": 84094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84094, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84094, "pid": 5, "tid": 7, "ts": 1716454222691212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222560111, "dur": 12, "args": { "External id": 84094, "cbid": 211, "correlation": 84094 } }, { "ph": "s", "id": 84094, "pid": 76337, "tid": -914061504, "ts": 1716454222560111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222691280, "dur": 97, "args": { "External id": 84102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84102, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84102, "pid": 5, "tid": 7, "ts": 1716454222691280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222560151, "dur": 9, "args": { "External id": 84102, "cbid": 211, "correlation": 84102 } }, { "ph": "s", "id": 84102, "pid": 76337, "tid": -914061504, "ts": 1716454222560151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222691378, "dur": 56, "args": { "External id": 84113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84113, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84113, "pid": 5, "tid": 7, "ts": 1716454222691378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222560222, "dur": 70, "args": { "External id": 84113, "cbid": 211, "correlation": 84113 } }, { "ph": "s", "id": 84113, "pid": 76337, "tid": -914061504, "ts": 1716454222560222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222691435, "dur": 94, "args": { "External id": 84135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84135, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84135, "pid": 5, "tid": 7, "ts": 1716454222691435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222560312, "dur": 1972, "args": { "External id": 84135, "cbid": 211, "correlation": 84135 } }, { "ph": "s", "id": 84135, "pid": 76337, "tid": -914061504, "ts": 1716454222560312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562360, "dur": 1, "args": { "External id": 84146, "cbid": 251, "correlation": 84146 } }, { "ph": "f", "id": 84146, "pid": 76337, "tid": -914061504, "ts": 1716454222562360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222691530, "dur": 105, "args": { "External id": 84147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84147, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84147, "pid": 5, "tid": 7, "ts": 1716454222691530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562366, "dur": 69, "args": { "External id": 84147, "cbid": 211, "correlation": 84147 } }, { "ph": "s", "id": 84147, "pid": 76337, "tid": -914061504, "ts": 1716454222562366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562495, "dur": 1, "args": { "External id": 84158, "cbid": 251, "correlation": 84158 } }, { "ph": "f", "id": 84158, "pid": 76337, "tid": -914061504, "ts": 1716454222562495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562500, "dur": 0, "args": { "External id": 84159, "cbid": 251, "correlation": 84159 } }, { "ph": "f", "id": 84159, "pid": 76337, "tid": -914061504, "ts": 1716454222562500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222691637, "dur": 10, "args": { "External id": 84160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84160, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 84160, "pid": 5, "tid": 7, "ts": 1716454222691637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562501, "dur": 13, "args": { "External id": 84160, "cbid": 211, "correlation": 84160 } }, { "ph": "s", "id": 84160, "pid": 76337, "tid": -914061504, "ts": 1716454222562501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222691648, "dur": 5, "args": { "External id": 84162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84162, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84162, "pid": 5, "tid": 7, "ts": 1716454222691648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562516, "dur": 6, "args": { "External id": 84162, "cbid": 211, "correlation": 84162 } }, { "ph": "s", "id": 84162, "pid": 76337, "tid": -914061504, "ts": 1716454222562516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562577, "dur": 1, "args": { "External id": 84173, "cbid": 251, "correlation": 84173 } }, { "ph": "f", "id": 84173, "pid": 76337, "tid": -914061504, "ts": 1716454222562577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562580, "dur": 0, "args": { "External id": 84174, "cbid": 251, "correlation": 84174 } }, { "ph": "f", "id": 84174, "pid": 76337, "tid": -914061504, "ts": 1716454222562580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222691655, "dur": 6, "args": { "External id": 84175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84175, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 84175, "pid": 5, "tid": 7, "ts": 1716454222691655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562582, "dur": 12, "args": { "External id": 84175, "cbid": 211, "correlation": 84175 } }, { "ph": "s", "id": 84175, "pid": 76337, "tid": -914061504, "ts": 1716454222562582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222691662, "dur": 4, "args": { "External id": 84177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84177, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84177, "pid": 5, "tid": 7, "ts": 1716454222691662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562596, "dur": 6, "args": { "External id": 84177, "cbid": 211, "correlation": 84177 } }, { "ph": "s", "id": 84177, "pid": 76337, "tid": -914061504, "ts": 1716454222562596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222691667, "dur": 159, "args": { "External id": 84198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84198, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84198, "pid": 5, "tid": 7, "ts": 1716454222691667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562671, "dur": 13, "args": { "External id": 84198, "cbid": 211, "correlation": 84198 } }, { "ph": "s", "id": 84198, "pid": 76337, "tid": -914061504, "ts": 1716454222562671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222562771, "dur": 1, "args": { "External id": 84216, "cbid": 251, "correlation": 84216 } }, { "ph": "f", "id": 84216, "pid": 76337, "tid": -914061504, "ts": 1716454222562771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222691828, "dur": 107, "args": { "External id": 84218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84218, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84218, "pid": 5, "tid": 7, "ts": 1716454222691828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562777, "dur": 13, "args": { "External id": 84218, "cbid": 211, "correlation": 84218 } }, { "ph": "s", "id": 84218, "pid": 76337, "tid": -914061504, "ts": 1716454222562777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222691937, "dur": 34, "args": { "External id": 84226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84226, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84226, "pid": 5, "tid": 7, "ts": 1716454222691937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562847, "dur": 12, "args": { "External id": 84226, "cbid": 211, "correlation": 84226 } }, { "ph": "s", "id": 84226, "pid": 76337, "tid": -914061504, "ts": 1716454222562847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222691973, "dur": 67, "args": { "External id": 84234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84234, "pid": 5, "tid": 7, "ts": 1716454222691973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562889, "dur": 9, "args": { "External id": 84234, "cbid": 211, "correlation": 84234 } }, { "ph": "s", "id": 84234, "pid": 76337, "tid": -914061504, "ts": 1716454222562889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222692041, "dur": 94, "args": { "External id": 84256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84256, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84256, "pid": 5, "tid": 7, "ts": 1716454222692041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222562940, "dur": 10, "args": { "External id": 84256, "cbid": 211, "correlation": 84256 } }, { "ph": "s", "id": 84256, "pid": 76337, "tid": -914061504, "ts": 1716454222562940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563036, "dur": 1, "args": { "External id": 84272, "cbid": 251, "correlation": 84272 } }, { "ph": "f", "id": 84272, "pid": 76337, "tid": -914061504, "ts": 1716454222563036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222692136, "dur": 586, "args": { "External id": 84274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84274, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84274, "pid": 5, "tid": 7, "ts": 1716454222692136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563042, "dur": 14, "args": { "External id": 84274, "cbid": 211, "correlation": 84274 } }, { "ph": "s", "id": 84274, "pid": 76337, "tid": -914061504, "ts": 1716454222563042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222692723, "dur": 249, "args": { "External id": 84282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84282, "pid": 5, "tid": 7, "ts": 1716454222692723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563109, "dur": 12, "args": { "External id": 84282, "cbid": 211, "correlation": 84282 } }, { "ph": "s", "id": 84282, "pid": 76337, "tid": -914061504, "ts": 1716454222563109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222692973, "dur": 251, "args": { "External id": 84290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84290, "pid": 5, "tid": 7, "ts": 1716454222692973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563140, "dur": 8, "args": { "External id": 84290, "cbid": 211, "correlation": 84290 } }, { "ph": "s", "id": 84290, "pid": 76337, "tid": -914061504, "ts": 1716454222563140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563221, "dur": 1, "args": { "External id": 84306, "cbid": 251, "correlation": 84306 } }, { "ph": "f", "id": 84306, "pid": 76337, "tid": -914061504, "ts": 1716454222563221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563226, "dur": 0, "args": { "External id": 84308, "cbid": 251, "correlation": 84308 } }, { "ph": "f", "id": 84308, "pid": 76337, "tid": -914061504, "ts": 1716454222563226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222693226, "dur": 362, "args": { "External id": 84309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84309, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 84309, "pid": 5, "tid": 7, "ts": 1716454222693226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563229, "dur": 13, "args": { "External id": 84309, "cbid": 211, "correlation": 84309 } }, { "ph": "s", "id": 84309, "pid": 76337, "tid": -914061504, "ts": 1716454222563229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222693590, "dur": 50, "args": { "External id": 84317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84317, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84317, "pid": 5, "tid": 7, "ts": 1716454222693590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563272, "dur": 10, "args": { "External id": 84317, "cbid": 211, "correlation": 84317 } }, { "ph": "s", "id": 84317, "pid": 76337, "tid": -914061504, "ts": 1716454222563272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222693641, "dur": 162, "args": { "External id": 84328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84328, "pid": 5, "tid": 7, "ts": 1716454222693641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563339, "dur": 219, "args": { "External id": 84328, "cbid": 211, "correlation": 84328 } }, { "ph": "s", "id": 84328, "pid": 76337, "tid": -914061504, "ts": 1716454222563339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222563611, "dur": 0, "args": { "External id": 84340, "cbid": 317, "correlation": 84340 } }, { "ph": "f", "id": 84340, "pid": 76337, "tid": -914061504, "ts": 1716454222563611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222563611, "dur": 0, "args": { "External id": 84341, "cbid": 203, "correlation": 84341 } }, { "ph": "f", "id": 84341, "pid": 76337, "tid": -914061504, "ts": 1716454222563611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222563612, "dur": 0, "args": { "External id": 84342, "cbid": 205, "correlation": 84342 } }, { "ph": "f", "id": 84342, "pid": 76337, "tid": -914061504, "ts": 1716454222563612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563635, "dur": 1, "args": { "External id": 84346, "cbid": 251, "correlation": 84346 } }, { "ph": "f", "id": 84346, "pid": 76337, "tid": -914061504, "ts": 1716454222563635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563637, "dur": 0, "args": { "External id": 84347, "cbid": 251, "correlation": 84347 } }, { "ph": "f", "id": 84347, "pid": 76337, "tid": -914061504, "ts": 1716454222563637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563637, "dur": 0, "args": { "External id": 84348, "cbid": 251, "correlation": 84348 } }, { "ph": "f", "id": 84348, "pid": 76337, "tid": -914061504, "ts": 1716454222563637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563638, "dur": 0, "args": { "External id": 84349, "cbid": 251, "correlation": 84349 } }, { "ph": "f", "id": 84349, "pid": 76337, "tid": -914061504, "ts": 1716454222563638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563639, "dur": 0, "args": { "External id": 84350, "cbid": 251, "correlation": 84350 } }, { "ph": "f", "id": 84350, "pid": 76337, "tid": -914061504, "ts": 1716454222563639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563640, "dur": 0, "args": { "External id": 84351, "cbid": 251, "correlation": 84351 } }, { "ph": "f", "id": 84351, "pid": 76337, "tid": -914061504, "ts": 1716454222563640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563640, "dur": 0, "args": { "External id": 84352, "cbid": 251, "correlation": 84352 } }, { "ph": "f", "id": 84352, "pid": 76337, "tid": -914061504, "ts": 1716454222563640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563641, "dur": 0, "args": { "External id": 84353, "cbid": 251, "correlation": 84353 } }, { "ph": "f", "id": 84353, "pid": 76337, "tid": -914061504, "ts": 1716454222563641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222563642, "dur": 0, "args": { "External id": 84354, "cbid": 251, "correlation": 84354 } }, { "ph": "f", "id": 84354, "pid": 76337, "tid": -914061504, "ts": 1716454222563642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222693804, "dur": 117, "args": { "External id": 84355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84355, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84355, "pid": 5, "tid": 7, "ts": 1716454222693804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563644, "dur": 42, "args": { "External id": 84355, "cbid": 211, "correlation": 84355 } }, { "ph": "s", "id": 84355, "pid": 76337, "tid": -914061504, "ts": 1716454222563644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222693923, "dur": 60, "args": { "External id": 84361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84361, "pid": 5, "tid": 7, "ts": 1716454222693923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563709, "dur": 107, "args": { "External id": 84361, "cbid": 211, "correlation": 84361 } }, { "ph": "s", "id": 84361, "pid": 76337, "tid": -914061504, "ts": 1716454222563709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222693984, "dur": 50, "args": { "External id": 84369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84369, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84369, "pid": 5, "tid": 7, "ts": 1716454222693984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222563838, "dur": 286, "args": { "External id": 84369, "cbid": 211, "correlation": 84369 } }, { "ph": "s", "id": 84369, "pid": 76337, "tid": -914061504, "ts": 1716454222563838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222694036, "dur": 98, "args": { "External id": 84378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84378, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84378, "pid": 5, "tid": 7, "ts": 1716454222694036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564156, "dur": 11, "args": { "External id": 84378, "cbid": 211, "correlation": 84378 } }, { "ph": "s", "id": 84378, "pid": 76337, "tid": -914061504, "ts": 1716454222564156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222694135, "dur": 94, "args": { "External id": 84398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84398, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 84398, "pid": 5, "tid": 7, "ts": 1716454222694135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564230, "dur": 12, "args": { "External id": 84398, "cbid": 211, "correlation": 84398 } }, { "ph": "s", "id": 84398, "pid": 76337, "tid": -914061504, "ts": 1716454222564230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222694230, "dur": 5, "args": { "External id": 84410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84410, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84410, "pid": 5, "tid": 7, "ts": 1716454222694230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564252, "dur": 10, "args": { "External id": 84410, "cbid": 211, "correlation": 84410 } }, { "ph": "s", "id": 84410, "pid": 76337, "tid": -914061504, "ts": 1716454222564252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222694236, "dur": 109, "args": { "External id": 84413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84413, "pid": 5, "tid": 7, "ts": 1716454222694236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564275, "dur": 109, "args": { "External id": 84413, "cbid": 211, "correlation": 84413 } }, { "ph": "s", "id": 84413, "pid": 76337, "tid": -914061504, "ts": 1716454222564275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222694347, "dur": 70, "args": { "External id": 84422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84422, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84422, "pid": 5, "tid": 7, "ts": 1716454222694347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564417, "dur": 10, "args": { "External id": 84422, "cbid": 211, "correlation": 84422 } }, { "ph": "s", "id": 84422, "pid": 76337, "tid": -914061504, "ts": 1716454222564417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222564470, "dur": 0, "args": { "External id": 84432, "cbid": 317, "correlation": 84432 } }, { "ph": "f", "id": 84432, "pid": 76337, "tid": -914061504, "ts": 1716454222564470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222564471, "dur": 0, "args": { "External id": 84433, "cbid": 203, "correlation": 84433 } }, { "ph": "f", "id": 84433, "pid": 76337, "tid": -914061504, "ts": 1716454222564471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222564471, "dur": 0, "args": { "External id": 84434, "cbid": 205, "correlation": 84434 } }, { "ph": "f", "id": 84434, "pid": 76337, "tid": -914061504, "ts": 1716454222564471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222694418, "dur": 76, "args": { "External id": 84438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84438, "pid": 5, "tid": 7, "ts": 1716454222694418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564486, "dur": 12, "args": { "External id": 84438, "cbid": 211, "correlation": 84438 } }, { "ph": "s", "id": 84438, "pid": 76337, "tid": -914061504, "ts": 1716454222564486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222694496, "dur": 24, "args": { "External id": 84440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84440, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84440, "pid": 5, "tid": 7, "ts": 1716454222694496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564500, "dur": 5, "args": { "External id": 84440, "cbid": 211, "correlation": 84440 } }, { "ph": "s", "id": 84440, "pid": 76337, "tid": -914061504, "ts": 1716454222564500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222694521, "dur": 4, "args": { "External id": 84442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 84442, "pid": 5, "tid": 7, "ts": 1716454222694521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564510, "dur": 6, "args": { "External id": 84442, "cbid": 211, "correlation": 84442 } }, { "ph": "s", "id": 84442, "pid": 76337, "tid": -914061504, "ts": 1716454222564510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222564519, "dur": 0, "args": { "External id": 84443, "cbid": 51, "correlation": 84443 } }, { "ph": "s", "id": 84443, "pid": 76337, "tid": -914061504, "ts": 1716454222564519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222694526, "dur": 1387, "args": { "External id": 84444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84444, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84444, "pid": 5, "tid": 7, "ts": 1716454222694526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564520, "dur": 5, "args": { "External id": 84444, "cbid": 211, "correlation": 84444 } }, { "ph": "s", "id": 84444, "pid": 76337, "tid": -914061504, "ts": 1716454222564520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222695915, "dur": 60, "args": { "External id": 84449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84449, "pid": 5, "tid": 7, "ts": 1716454222695915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564549, "dur": 8, "args": { "External id": 84449, "cbid": 211, "correlation": 84449 } }, { "ph": "s", "id": 84449, "pid": 76337, "tid": -914061504, "ts": 1716454222564549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222695976, "dur": 4, "args": { "External id": 84457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84457, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 84457, "pid": 5, "tid": 7, "ts": 1716454222695976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564592, "dur": 10, "args": { "External id": 84457, "cbid": 211, "correlation": 84457 } }, { "ph": "s", "id": 84457, "pid": 76337, "tid": -914061504, "ts": 1716454222564592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222564659, "dur": 1, "args": { "External id": 84473, "cbid": 251, "correlation": 84473 } }, { "ph": "f", "id": 84473, "pid": 76337, "tid": -914061504, "ts": 1716454222564659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222564664, "dur": 0, "args": { "External id": 84475, "cbid": 251, "correlation": 84475 } }, { "ph": "f", "id": 84475, "pid": 76337, "tid": -914061504, "ts": 1716454222564664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222695981, "dur": 12, "args": { "External id": 84476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84476, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 84476, "pid": 5, "tid": 7, "ts": 1716454222695981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564666, "dur": 12, "args": { "External id": 84476, "cbid": 211, "correlation": 84476 } }, { "ph": "s", "id": 84476, "pid": 76337, "tid": -914061504, "ts": 1716454222564666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222695994, "dur": 5, "args": { "External id": 84478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84478, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84478, "pid": 5, "tid": 7, "ts": 1716454222695994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564680, "dur": 6, "args": { "External id": 84478, "cbid": 211, "correlation": 84478 } }, { "ph": "s", "id": 84478, "pid": 76337, "tid": -914061504, "ts": 1716454222564680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222696000, "dur": 55, "args": { "External id": 84488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84488, "pid": 5, "tid": 7, "ts": 1716454222696000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222564738, "dur": 563, "args": { "External id": 84488, "cbid": 211, "correlation": 84488 } }, { "ph": "s", "id": 84488, "pid": 76337, "tid": -914061504, "ts": 1716454222564738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222696057, "dur": 53, "args": { "External id": 84508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84508, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 84508, "pid": 5, "tid": 7, "ts": 1716454222696057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565357, "dur": 11, "args": { "External id": 84508, "cbid": 211, "correlation": 84508 } }, { "ph": "s", "id": 84508, "pid": 76337, "tid": -914061504, "ts": 1716454222565357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222696111, "dur": 4, "args": { "External id": 84520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84520, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 84520, "pid": 5, "tid": 7, "ts": 1716454222696111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565378, "dur": 6, "args": { "External id": 84520, "cbid": 211, "correlation": 84520 } }, { "ph": "s", "id": 84520, "pid": 76337, "tid": -914061504, "ts": 1716454222565378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222696116, "dur": 56, "args": { "External id": 84523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84523, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84523, "pid": 5, "tid": 7, "ts": 1716454222696116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565396, "dur": 7, "args": { "External id": 84523, "cbid": 211, "correlation": 84523 } }, { "ph": "s", "id": 84523, "pid": 76337, "tid": -914061504, "ts": 1716454222565396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222696173, "dur": 36, "args": { "External id": 84532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84532, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84532, "pid": 5, "tid": 7, "ts": 1716454222696173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565438, "dur": 10, "args": { "External id": 84532, "cbid": 211, "correlation": 84532 } }, { "ph": "s", "id": 84532, "pid": 76337, "tid": -914061504, "ts": 1716454222565438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222565501, "dur": 0, "args": { "External id": 84542, "cbid": 317, "correlation": 84542 } }, { "ph": "f", "id": 84542, "pid": 76337, "tid": -914061504, "ts": 1716454222565501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222565501, "dur": 0, "args": { "External id": 84543, "cbid": 203, "correlation": 84543 } }, { "ph": "f", "id": 84543, "pid": 76337, "tid": -914061504, "ts": 1716454222565501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222565502, "dur": 0, "args": { "External id": 84544, "cbid": 205, "correlation": 84544 } }, { "ph": "f", "id": 84544, "pid": 76337, "tid": -914061504, "ts": 1716454222565502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222696211, "dur": 40, "args": { "External id": 84548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84548, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84548, "pid": 5, "tid": 7, "ts": 1716454222696211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565517, "dur": 12, "args": { "External id": 84548, "cbid": 211, "correlation": 84548 } }, { "ph": "s", "id": 84548, "pid": 76337, "tid": -914061504, "ts": 1716454222565517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222696252, "dur": 14, "args": { "External id": 84550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84550, "pid": 5, "tid": 7, "ts": 1716454222696252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565531, "dur": 6, "args": { "External id": 84550, "cbid": 211, "correlation": 84550 } }, { "ph": "s", "id": 84550, "pid": 76337, "tid": -914061504, "ts": 1716454222565531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222696268, "dur": 3, "args": { "External id": 84552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84552, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 84552, "pid": 5, "tid": 7, "ts": 1716454222696268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565541, "dur": 5, "args": { "External id": 84552, "cbid": 211, "correlation": 84552 } }, { "ph": "s", "id": 84552, "pid": 76337, "tid": -914061504, "ts": 1716454222565541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222565549, "dur": 0, "args": { "External id": 84553, "cbid": 51, "correlation": 84553 } }, { "ph": "s", "id": 84553, "pid": 76337, "tid": -914061504, "ts": 1716454222565549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222696272, "dur": 708, "args": { "External id": 84554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84554, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84554, "pid": 5, "tid": 7, "ts": 1716454222696272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565550, "dur": 5, "args": { "External id": 84554, "cbid": 211, "correlation": 84554 } }, { "ph": "s", "id": 84554, "pid": 76337, "tid": -914061504, "ts": 1716454222565550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222696982, "dur": 60, "args": { "External id": 84559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84559, "pid": 5, "tid": 7, "ts": 1716454222696982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565578, "dur": 9, "args": { "External id": 84559, "cbid": 211, "correlation": 84559 } }, { "ph": "s", "id": 84559, "pid": 76337, "tid": -914061504, "ts": 1716454222565578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222565636, "dur": 0, "args": { "External id": 84569, "cbid": 317, "correlation": 84569 } }, { "ph": "f", "id": 84569, "pid": 76337, "tid": -914061504, "ts": 1716454222565636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222565637, "dur": 0, "args": { "External id": 84570, "cbid": 203, "correlation": 84570 } }, { "ph": "f", "id": 84570, "pid": 76337, "tid": -914061504, "ts": 1716454222565637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222565637, "dur": 0, "args": { "External id": 84571, "cbid": 205, "correlation": 84571 } }, { "ph": "f", "id": 84571, "pid": 76337, "tid": -914061504, "ts": 1716454222565637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222697043, "dur": 76, "args": { "External id": 84575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84575, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84575, "pid": 5, "tid": 7, "ts": 1716454222697043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565649, "dur": 12, "args": { "External id": 84575, "cbid": 211, "correlation": 84575 } }, { "ph": "s", "id": 84575, "pid": 76337, "tid": -914061504, "ts": 1716454222565649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222697120, "dur": 211, "args": { "External id": 84577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84577, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84577, "pid": 5, "tid": 7, "ts": 1716454222697120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565667, "dur": 7, "args": { "External id": 84577, "cbid": 211, "correlation": 84577 } }, { "ph": "s", "id": 84577, "pid": 76337, "tid": -914061504, "ts": 1716454222565667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222697332, "dur": 39, "args": { "External id": 84579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84579, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84579, "pid": 5, "tid": 7, "ts": 1716454222697332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565679, "dur": 6, "args": { "External id": 84579, "cbid": 211, "correlation": 84579 } }, { "ph": "s", "id": 84579, "pid": 76337, "tid": -914061504, "ts": 1716454222565679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222697373, "dur": 59, "args": { "External id": 84585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84585, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84585, "pid": 5, "tid": 7, "ts": 1716454222697373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222565705, "dur": 526, "args": { "External id": 84585, "cbid": 211, "correlation": 84585 } }, { "ph": "s", "id": 84585, "pid": 76337, "tid": -914061504, "ts": 1716454222565705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222697433, "dur": 50, "args": { "External id": 84593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84593, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84593, "pid": 5, "tid": 7, "ts": 1716454222697433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566253, "dur": 9, "args": { "External id": 84593, "cbid": 211, "correlation": 84593 } }, { "ph": "s", "id": 84593, "pid": 76337, "tid": -914061504, "ts": 1716454222566253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222697484, "dur": 35, "args": { "External id": 84601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84601, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84601, "pid": 5, "tid": 7, "ts": 1716454222697484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566283, "dur": 9, "args": { "External id": 84601, "cbid": 211, "correlation": 84601 } }, { "ph": "s", "id": 84601, "pid": 76337, "tid": -914061504, "ts": 1716454222566283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222697520, "dur": 52, "args": { "External id": 84621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84621, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 84621, "pid": 5, "tid": 7, "ts": 1716454222697520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566365, "dur": 13, "args": { "External id": 84621, "cbid": 211, "correlation": 84621 } }, { "ph": "s", "id": 84621, "pid": 76337, "tid": -914061504, "ts": 1716454222566365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222697574, "dur": 5, "args": { "External id": 84633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84633, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 84633, "pid": 5, "tid": 7, "ts": 1716454222697574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566389, "dur": 6, "args": { "External id": 84633, "cbid": 211, "correlation": 84633 } }, { "ph": "s", "id": 84633, "pid": 76337, "tid": -914061504, "ts": 1716454222566389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222697580, "dur": 56, "args": { "External id": 84636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84636, "pid": 5, "tid": 7, "ts": 1716454222697580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566407, "dur": 7, "args": { "External id": 84636, "cbid": 211, "correlation": 84636 } }, { "ph": "s", "id": 84636, "pid": 76337, "tid": -914061504, "ts": 1716454222566407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222566465, "dur": 0, "args": { "External id": 84647, "cbid": 317, "correlation": 84647 } }, { "ph": "f", "id": 84647, "pid": 76337, "tid": -914061504, "ts": 1716454222566465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222566465, "dur": 0, "args": { "External id": 84648, "cbid": 203, "correlation": 84648 } }, { "ph": "f", "id": 84648, "pid": 76337, "tid": -914061504, "ts": 1716454222566465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222566466, "dur": 0, "args": { "External id": 84649, "cbid": 205, "correlation": 84649 } }, { "ph": "f", "id": 84649, "pid": 76337, "tid": -914061504, "ts": 1716454222566466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566490, "dur": 1, "args": { "External id": 84653, "cbid": 251, "correlation": 84653 } }, { "ph": "f", "id": 84653, "pid": 76337, "tid": -914061504, "ts": 1716454222566490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566492, "dur": 0, "args": { "External id": 84654, "cbid": 251, "correlation": 84654 } }, { "ph": "f", "id": 84654, "pid": 76337, "tid": -914061504, "ts": 1716454222566492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566493, "dur": 0, "args": { "External id": 84655, "cbid": 251, "correlation": 84655 } }, { "ph": "f", "id": 84655, "pid": 76337, "tid": -914061504, "ts": 1716454222566493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566493, "dur": 0, "args": { "External id": 84656, "cbid": 251, "correlation": 84656 } }, { "ph": "f", "id": 84656, "pid": 76337, "tid": -914061504, "ts": 1716454222566493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566494, "dur": 0, "args": { "External id": 84657, "cbid": 251, "correlation": 84657 } }, { "ph": "f", "id": 84657, "pid": 76337, "tid": -914061504, "ts": 1716454222566494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566495, "dur": 0, "args": { "External id": 84658, "cbid": 251, "correlation": 84658 } }, { "ph": "f", "id": 84658, "pid": 76337, "tid": -914061504, "ts": 1716454222566495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566496, "dur": 0, "args": { "External id": 84659, "cbid": 251, "correlation": 84659 } }, { "ph": "f", "id": 84659, "pid": 76337, "tid": -914061504, "ts": 1716454222566496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566496, "dur": 0, "args": { "External id": 84660, "cbid": 251, "correlation": 84660 } }, { "ph": "f", "id": 84660, "pid": 76337, "tid": -914061504, "ts": 1716454222566496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566498, "dur": 0, "args": { "External id": 84661, "cbid": 251, "correlation": 84661 } }, { "ph": "f", "id": 84661, "pid": 76337, "tid": -914061504, "ts": 1716454222566498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222697637, "dur": 114, "args": { "External id": 84662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84662, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84662, "pid": 5, "tid": 7, "ts": 1716454222697637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566500, "dur": 13, "args": { "External id": 84662, "cbid": 211, "correlation": 84662 } }, { "ph": "s", "id": 84662, "pid": 76337, "tid": -914061504, "ts": 1716454222566500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222697753, "dur": 60, "args": { "External id": 84668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84668, "pid": 5, "tid": 7, "ts": 1716454222697753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566535, "dur": 9, "args": { "External id": 84668, "cbid": 211, "correlation": 84668 } }, { "ph": "s", "id": 84668, "pid": 76337, "tid": -914061504, "ts": 1716454222566535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222697815, "dur": 599, "args": { "External id": 84677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84677, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84677, "pid": 5, "tid": 7, "ts": 1716454222697815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566617, "dur": 13, "args": { "External id": 84677, "cbid": 211, "correlation": 84677 } }, { "ph": "s", "id": 84677, "pid": 76337, "tid": -914061504, "ts": 1716454222566617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222698416, "dur": 184, "args": { "External id": 84699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84699, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84699, "pid": 5, "tid": 7, "ts": 1716454222698416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566675, "dur": 10, "args": { "External id": 84699, "cbid": 211, "correlation": 84699 } }, { "ph": "s", "id": 84699, "pid": 76337, "tid": -914061504, "ts": 1716454222566675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566761, "dur": 1, "args": { "External id": 84710, "cbid": 251, "correlation": 84710 } }, { "ph": "f", "id": 84710, "pid": 76337, "tid": -914061504, "ts": 1716454222566761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222698602, "dur": 196, "args": { "External id": 84711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84711, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84711, "pid": 5, "tid": 7, "ts": 1716454222698602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566766, "dur": 13, "args": { "External id": 84711, "cbid": 211, "correlation": 84711 } }, { "ph": "s", "id": 84711, "pid": 76337, "tid": -914061504, "ts": 1716454222566766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566834, "dur": 1, "args": { "External id": 84722, "cbid": 251, "correlation": 84722 } }, { "ph": "f", "id": 84722, "pid": 76337, "tid": -914061504, "ts": 1716454222566834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222698799, "dur": 191, "args": { "External id": 84723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84723, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84723, "pid": 5, "tid": 7, "ts": 1716454222698799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566838, "dur": 11, "args": { "External id": 84723, "cbid": 211, "correlation": 84723 } }, { "ph": "s", "id": 84723, "pid": 76337, "tid": -914061504, "ts": 1716454222566838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222566902, "dur": 1, "args": { "External id": 84734, "cbid": 251, "correlation": 84734 } }, { "ph": "f", "id": 84734, "pid": 76337, "tid": -914061504, "ts": 1716454222566902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222698992, "dur": 188, "args": { "External id": 84735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84735, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84735, "pid": 5, "tid": 7, "ts": 1716454222698992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566906, "dur": 11, "args": { "External id": 84735, "cbid": 211, "correlation": 84735 } }, { "ph": "s", "id": 84735, "pid": 76337, "tid": -914061504, "ts": 1716454222566906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222699181, "dur": 18956, "args": { "External id": 84756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84756, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84756, "pid": 5, "tid": 7, "ts": 1716454222699181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222566992, "dur": 13, "args": { "External id": 84756, "cbid": 211, "correlation": 84756 } }, { "ph": "s", "id": 84756, "pid": 76337, "tid": -914061504, "ts": 1716454222566992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222567090, "dur": 1, "args": { "External id": 84774, "cbid": 251, "correlation": 84774 } }, { "ph": "f", "id": 84774, "pid": 76337, "tid": -914061504, "ts": 1716454222567090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222718139, "dur": 202, "args": { "External id": 84776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84776, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84776, "pid": 5, "tid": 7, "ts": 1716454222718139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222567096, "dur": 13, "args": { "External id": 84776, "cbid": 211, "correlation": 84776 } }, { "ph": "s", "id": 84776, "pid": 76337, "tid": -914061504, "ts": 1716454222567096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222718342, "dur": 66, "args": { "External id": 84784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84784, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84784, "pid": 5, "tid": 7, "ts": 1716454222718342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222567167, "dur": 21, "args": { "External id": 84784, "cbid": 211, "correlation": 84784 } }, { "ph": "s", "id": 84784, "pid": 76337, "tid": -914061504, "ts": 1716454222567167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222718410, "dur": 96, "args": { "External id": 84792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84792, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84792, "pid": 5, "tid": 7, "ts": 1716454222718410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222567217, "dur": 119, "args": { "External id": 84792, "cbid": 211, "correlation": 84792 } }, { "ph": "s", "id": 84792, "pid": 76337, "tid": -914061504, "ts": 1716454222567217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222718508, "dur": 54, "args": { "External id": 84803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84803, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84803, "pid": 5, "tid": 7, "ts": 1716454222718508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222567400, "dur": 1929, "args": { "External id": 84803, "cbid": 211, "correlation": 84803 } }, { "ph": "s", "id": 84803, "pid": 76337, "tid": -914061504, "ts": 1716454222567400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222718563, "dur": 94, "args": { "External id": 84825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84825, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84825, "pid": 5, "tid": 7, "ts": 1716454222718563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569349, "dur": 130, "args": { "External id": 84825, "cbid": 211, "correlation": 84825 } }, { "ph": "s", "id": 84825, "pid": 76337, "tid": -914061504, "ts": 1716454222569349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569556, "dur": 1, "args": { "External id": 84836, "cbid": 251, "correlation": 84836 } }, { "ph": "f", "id": 84836, "pid": 76337, "tid": -914061504, "ts": 1716454222569556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222718658, "dur": 105, "args": { "External id": 84837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84837, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84837, "pid": 5, "tid": 7, "ts": 1716454222718658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569561, "dur": 14, "args": { "External id": 84837, "cbid": 211, "correlation": 84837 } }, { "ph": "s", "id": 84837, "pid": 76337, "tid": -914061504, "ts": 1716454222569561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569635, "dur": 1, "args": { "External id": 84848, "cbid": 251, "correlation": 84848 } }, { "ph": "f", "id": 84848, "pid": 76337, "tid": -914061504, "ts": 1716454222569635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569639, "dur": 0, "args": { "External id": 84849, "cbid": 251, "correlation": 84849 } }, { "ph": "f", "id": 84849, "pid": 76337, "tid": -914061504, "ts": 1716454222569639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222718765, "dur": 10, "args": { "External id": 84850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84850, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 84850, "pid": 5, "tid": 7, "ts": 1716454222718765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569641, "dur": 13, "args": { "External id": 84850, "cbid": 211, "correlation": 84850 } }, { "ph": "s", "id": 84850, "pid": 76337, "tid": -914061504, "ts": 1716454222569641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222718776, "dur": 5, "args": { "External id": 84852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84852, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84852, "pid": 5, "tid": 7, "ts": 1716454222718776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569655, "dur": 6, "args": { "External id": 84852, "cbid": 211, "correlation": 84852 } }, { "ph": "s", "id": 84852, "pid": 76337, "tid": -914061504, "ts": 1716454222569655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569716, "dur": 1, "args": { "External id": 84863, "cbid": 251, "correlation": 84863 } }, { "ph": "f", "id": 84863, "pid": 76337, "tid": -914061504, "ts": 1716454222569716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569720, "dur": 0, "args": { "External id": 84864, "cbid": 251, "correlation": 84864 } }, { "ph": "f", "id": 84864, "pid": 76337, "tid": -914061504, "ts": 1716454222569720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222718782, "dur": 6, "args": { "External id": 84865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84865, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 84865, "pid": 5, "tid": 7, "ts": 1716454222718782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569721, "dur": 12, "args": { "External id": 84865, "cbid": 211, "correlation": 84865 } }, { "ph": "s", "id": 84865, "pid": 76337, "tid": -914061504, "ts": 1716454222569721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222718790, "dur": 3, "args": { "External id": 84867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84867, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 84867, "pid": 5, "tid": 7, "ts": 1716454222718790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569734, "dur": 7, "args": { "External id": 84867, "cbid": 211, "correlation": 84867 } }, { "ph": "s", "id": 84867, "pid": 76337, "tid": -914061504, "ts": 1716454222569734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222718795, "dur": 159, "args": { "External id": 84888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84888, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84888, "pid": 5, "tid": 7, "ts": 1716454222718795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569810, "dur": 12, "args": { "External id": 84888, "cbid": 211, "correlation": 84888 } }, { "ph": "s", "id": 84888, "pid": 76337, "tid": -914061504, "ts": 1716454222569810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222569908, "dur": 1, "args": { "External id": 84906, "cbid": 251, "correlation": 84906 } }, { "ph": "f", "id": 84906, "pid": 76337, "tid": -914061504, "ts": 1716454222569908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222718955, "dur": 109, "args": { "External id": 84908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84908, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 84908, "pid": 5, "tid": 7, "ts": 1716454222718955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569914, "dur": 13, "args": { "External id": 84908, "cbid": 211, "correlation": 84908 } }, { "ph": "s", "id": 84908, "pid": 76337, "tid": -914061504, "ts": 1716454222569914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222719066, "dur": 35, "args": { "External id": 84916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84916, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84916, "pid": 5, "tid": 7, "ts": 1716454222719066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222569992, "dur": 13, "args": { "External id": 84916, "cbid": 211, "correlation": 84916 } }, { "ph": "s", "id": 84916, "pid": 76337, "tid": -914061504, "ts": 1716454222569992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222719102, "dur": 67, "args": { "External id": 84924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84924, "pid": 5, "tid": 7, "ts": 1716454222719102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570034, "dur": 9, "args": { "External id": 84924, "cbid": 211, "correlation": 84924 } }, { "ph": "s", "id": 84924, "pid": 76337, "tid": -914061504, "ts": 1716454222570034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222719170, "dur": 94, "args": { "External id": 84946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84946, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84946, "pid": 5, "tid": 7, "ts": 1716454222719170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570086, "dur": 10, "args": { "External id": 84946, "cbid": 211, "correlation": 84946 } }, { "ph": "s", "id": 84946, "pid": 76337, "tid": -914061504, "ts": 1716454222570086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570174, "dur": 1, "args": { "External id": 84962, "cbid": 251, "correlation": 84962 } }, { "ph": "f", "id": 84962, "pid": 76337, "tid": -914061504, "ts": 1716454222570174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222719265, "dur": 584, "args": { "External id": 84964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84964, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 84964, "pid": 5, "tid": 7, "ts": 1716454222719265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570180, "dur": 13, "args": { "External id": 84964, "cbid": 211, "correlation": 84964 } }, { "ph": "s", "id": 84964, "pid": 76337, "tid": -914061504, "ts": 1716454222570180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222719851, "dur": 247, "args": { "External id": 84972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84972, "pid": 5, "tid": 7, "ts": 1716454222719851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570246, "dur": 13, "args": { "External id": 84972, "cbid": 211, "correlation": 84972 } }, { "ph": "s", "id": 84972, "pid": 76337, "tid": -914061504, "ts": 1716454222570246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222720099, "dur": 256, "args": { "External id": 84980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 84980, "pid": 5, "tid": 7, "ts": 1716454222720099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570276, "dur": 8, "args": { "External id": 84980, "cbid": 211, "correlation": 84980 } }, { "ph": "s", "id": 84980, "pid": 76337, "tid": -914061504, "ts": 1716454222570276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570357, "dur": 1, "args": { "External id": 84996, "cbid": 251, "correlation": 84996 } }, { "ph": "f", "id": 84996, "pid": 76337, "tid": -914061504, "ts": 1716454222570357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570362, "dur": 0, "args": { "External id": 84998, "cbid": 251, "correlation": 84998 } }, { "ph": "f", "id": 84998, "pid": 76337, "tid": -914061504, "ts": 1716454222570362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222720356, "dur": 360, "args": { "External id": 84999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 84999, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 84999, "pid": 5, "tid": 7, "ts": 1716454222720356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570365, "dur": 13, "args": { "External id": 84999, "cbid": 211, "correlation": 84999 } }, { "ph": "s", "id": 84999, "pid": 76337, "tid": -914061504, "ts": 1716454222570365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222720718, "dur": 50, "args": { "External id": 85007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85007, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85007, "pid": 5, "tid": 7, "ts": 1716454222720718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570407, "dur": 196, "args": { "External id": 85007, "cbid": 211, "correlation": 85007 } }, { "ph": "s", "id": 85007, "pid": 76337, "tid": -914061504, "ts": 1716454222570407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222720769, "dur": 161, "args": { "External id": 85018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85018, "pid": 5, "tid": 7, "ts": 1716454222720769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570660, "dur": 72, "args": { "External id": 85018, "cbid": 211, "correlation": 85018 } }, { "ph": "s", "id": 85018, "pid": 76337, "tid": -914061504, "ts": 1716454222570660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222570786, "dur": 0, "args": { "External id": 85030, "cbid": 317, "correlation": 85030 } }, { "ph": "f", "id": 85030, "pid": 76337, "tid": -914061504, "ts": 1716454222570786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222570787, "dur": 0, "args": { "External id": 85031, "cbid": 203, "correlation": 85031 } }, { "ph": "f", "id": 85031, "pid": 76337, "tid": -914061504, "ts": 1716454222570787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222570788, "dur": 0, "args": { "External id": 85032, "cbid": 205, "correlation": 85032 } }, { "ph": "f", "id": 85032, "pid": 76337, "tid": -914061504, "ts": 1716454222570788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570810, "dur": 1, "args": { "External id": 85036, "cbid": 251, "correlation": 85036 } }, { "ph": "f", "id": 85036, "pid": 76337, "tid": -914061504, "ts": 1716454222570810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570812, "dur": 0, "args": { "External id": 85037, "cbid": 251, "correlation": 85037 } }, { "ph": "f", "id": 85037, "pid": 76337, "tid": -914061504, "ts": 1716454222570812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570813, "dur": 0, "args": { "External id": 85038, "cbid": 251, "correlation": 85038 } }, { "ph": "f", "id": 85038, "pid": 76337, "tid": -914061504, "ts": 1716454222570813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570814, "dur": 0, "args": { "External id": 85039, "cbid": 251, "correlation": 85039 } }, { "ph": "f", "id": 85039, "pid": 76337, "tid": -914061504, "ts": 1716454222570814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570815, "dur": 0, "args": { "External id": 85040, "cbid": 251, "correlation": 85040 } }, { "ph": "f", "id": 85040, "pid": 76337, "tid": -914061504, "ts": 1716454222570815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570815, "dur": 0, "args": { "External id": 85041, "cbid": 251, "correlation": 85041 } }, { "ph": "f", "id": 85041, "pid": 76337, "tid": -914061504, "ts": 1716454222570815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570816, "dur": 0, "args": { "External id": 85042, "cbid": 251, "correlation": 85042 } }, { "ph": "f", "id": 85042, "pid": 76337, "tid": -914061504, "ts": 1716454222570816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570817, "dur": 0, "args": { "External id": 85043, "cbid": 251, "correlation": 85043 } }, { "ph": "f", "id": 85043, "pid": 76337, "tid": -914061504, "ts": 1716454222570817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222570818, "dur": 0, "args": { "External id": 85044, "cbid": 251, "correlation": 85044 } }, { "ph": "f", "id": 85044, "pid": 76337, "tid": -914061504, "ts": 1716454222570818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222720931, "dur": 117, "args": { "External id": 85045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85045, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 85045, "pid": 5, "tid": 7, "ts": 1716454222720931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570820, "dur": 43, "args": { "External id": 85045, "cbid": 211, "correlation": 85045 } }, { "ph": "s", "id": 85045, "pid": 76337, "tid": -914061504, "ts": 1716454222570820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222721050, "dur": 60, "args": { "External id": 85051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85051, "pid": 5, "tid": 7, "ts": 1716454222721050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222570885, "dur": 286, "args": { "External id": 85051, "cbid": 211, "correlation": 85051 } }, { "ph": "s", "id": 85051, "pid": 76337, "tid": -914061504, "ts": 1716454222570885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721111, "dur": 50, "args": { "External id": 85059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85059, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85059, "pid": 5, "tid": 7, "ts": 1716454222721111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571195, "dur": 9, "args": { "External id": 85059, "cbid": 211, "correlation": 85059 } }, { "ph": "s", "id": 85059, "pid": 76337, "tid": -914061504, "ts": 1716454222571195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222721162, "dur": 53, "args": { "External id": 85079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85079, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 85079, "pid": 5, "tid": 7, "ts": 1716454222721162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571276, "dur": 12, "args": { "External id": 85079, "cbid": 211, "correlation": 85079 } }, { "ph": "s", "id": 85079, "pid": 76337, "tid": -914061504, "ts": 1716454222571276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222721217, "dur": 4, "args": { "External id": 85091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85091, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 85091, "pid": 5, "tid": 7, "ts": 1716454222721217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571298, "dur": 10, "args": { "External id": 85091, "cbid": 211, "correlation": 85091 } }, { "ph": "s", "id": 85091, "pid": 76337, "tid": -914061504, "ts": 1716454222571298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222721223, "dur": 56, "args": { "External id": 85094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85094, "pid": 5, "tid": 7, "ts": 1716454222721223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571320, "dur": 111, "args": { "External id": 85094, "cbid": 211, "correlation": 85094 } }, { "ph": "s", "id": 85094, "pid": 76337, "tid": -914061504, "ts": 1716454222571320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721280, "dur": 37, "args": { "External id": 85103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85103, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85103, "pid": 5, "tid": 7, "ts": 1716454222721280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571471, "dur": 10, "args": { "External id": 85103, "cbid": 211, "correlation": 85103 } }, { "ph": "s", "id": 85103, "pid": 76337, "tid": -914061504, "ts": 1716454222571471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222571526, "dur": 0, "args": { "External id": 85113, "cbid": 317, "correlation": 85113 } }, { "ph": "f", "id": 85113, "pid": 76337, "tid": -914061504, "ts": 1716454222571526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222571527, "dur": 0, "args": { "External id": 85114, "cbid": 203, "correlation": 85114 } }, { "ph": "f", "id": 85114, "pid": 76337, "tid": -914061504, "ts": 1716454222571527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222571528, "dur": 0, "args": { "External id": 85115, "cbid": 205, "correlation": 85115 } }, { "ph": "f", "id": 85115, "pid": 76337, "tid": -914061504, "ts": 1716454222571528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222721318, "dur": 43, "args": { "External id": 85119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85119, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85119, "pid": 5, "tid": 7, "ts": 1716454222721318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571544, "dur": 12, "args": { "External id": 85119, "cbid": 211, "correlation": 85119 } }, { "ph": "s", "id": 85119, "pid": 76337, "tid": -914061504, "ts": 1716454222571544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222721362, "dur": 4, "args": { "External id": 85121, "device": 5, "context": 1, "stream": 7, "correlation": 85121, "bytes": 46080, "memory bandwidth (GB/s)": 11.52 } }, { "ph": "f", "id": 85121, "pid": 5, "tid": 7, "ts": 1716454222721362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222571559, "dur": 17, "args": { "External id": 85121, "cbid": 51, "correlation": 85121 } }, { "ph": "s", "id": 85121, "pid": 76337, "tid": -914061504, "ts": 1716454222571559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222571581, "dur": 1, "args": { "External id": 85123, "cbid": 200, "correlation": 85123 } }, { "ph": "f", "id": 85123, "pid": 76337, "tid": -914061504, "ts": 1716454222571581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222571583, "dur": 0, "args": { "External id": 85124, "cbid": 200, "correlation": 85124 } }, { "ph": "f", "id": 85124, "pid": 76337, "tid": -914061504, "ts": 1716454222571583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222571583, "dur": 0, "args": { "External id": 85125, "cbid": 200, "correlation": 85125 } }, { "ph": "f", "id": 85125, "pid": 76337, "tid": -914061504, "ts": 1716454222571583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222571584, "dur": 0, "args": { "External id": 85126, "cbid": 200, "correlation": 85126 } }, { "ph": "f", "id": 85126, "pid": 76337, "tid": -914061504, "ts": 1716454222571584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454222571585, "dur": 4, "args": { "External id": 85127, "cbid": 15, "correlation": 85127 } }, { "ph": "f", "id": 85127, "pid": 76337, "tid": -914061504, "ts": 1716454222571585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222571590, "dur": 1, "args": { "External id": 85128, "cbid": 251, "correlation": 85128 } }, { "ph": "f", "id": 85128, "pid": 76337, "tid": -914061504, "ts": 1716454222571590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454222721367, "dur": 24, "args": { "External id": 85129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85129, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85129, "pid": 5, "tid": 7, "ts": 1716454222721367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571593, "dur": 8, "args": { "External id": 85129, "cbid": 211, "correlation": 85129 } }, { "ph": "s", "id": 85129, "pid": 76337, "tid": -914061504, "ts": 1716454222571593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222721392, "dur": 4, "args": { "External id": 85131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 85131, "pid": 5, "tid": 7, "ts": 1716454222721392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571606, "dur": 7, "args": { "External id": 85131, "cbid": 211, "correlation": 85131 } }, { "ph": "s", "id": 85131, "pid": 76337, "tid": -914061504, "ts": 1716454222571606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222571617, "dur": 0, "args": { "External id": 85132, "cbid": 51, "correlation": 85132 } }, { "ph": "s", "id": 85132, "pid": 76337, "tid": -914061504, "ts": 1716454222571617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222721397, "dur": 191, "args": { "External id": 85133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85133, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85133, "pid": 5, "tid": 7, "ts": 1716454222721397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571618, "dur": 211, "args": { "External id": 85133, "cbid": 211, "correlation": 85133 } }, { "ph": "s", "id": 85133, "pid": 76337, "tid": -914061504, "ts": 1716454222571618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222721589, "dur": 6, "args": { "External id": 85134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85134, "pid": 5, "tid": 7, "ts": 1716454222721589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571832, "dur": 6, "args": { "External id": 85134, "cbid": 211, "correlation": 85134 } }, { "ph": "s", "id": 85134, "pid": 76337, "tid": -914061504, "ts": 1716454222571832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222721597, "dur": 5, "args": { "External id": 85140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 85140, "pid": 5, "tid": 7, "ts": 1716454222721597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222571862, "dur": 9, "args": { "External id": 85140, "cbid": 211, "correlation": 85140 } }, { "ph": "s", "id": 85140, "pid": 76337, "tid": -914061504, "ts": 1716454222571862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721603, "dur": 3, "args": { "External id": 85148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85148, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85148, "pid": 5, "tid": 7, "ts": 1716454222721603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573552, "dur": 14, "args": { "External id": 85148, "cbid": 211, "correlation": 85148 } }, { "ph": "s", "id": 85148, "pid": 76337, "tid": -914061504, "ts": 1716454222573552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721607, "dur": 3, "args": { "External id": 85156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85156, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85156, "pid": 5, "tid": 7, "ts": 1716454222721607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573593, "dur": 11, "args": { "External id": 85156, "cbid": 211, "correlation": 85156 } }, { "ph": "s", "id": 85156, "pid": 76337, "tid": -914061504, "ts": 1716454222573593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721611, "dur": 3, "args": { "External id": 85164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85164, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85164, "pid": 5, "tid": 7, "ts": 1716454222721611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573621, "dur": 8, "args": { "External id": 85164, "cbid": 211, "correlation": 85164 } }, { "ph": "s", "id": 85164, "pid": 76337, "tid": -914061504, "ts": 1716454222573621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721615, "dur": 3, "args": { "External id": 85173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85173, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85173, "pid": 5, "tid": 7, "ts": 1716454222721615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573798, "dur": 14, "args": { "External id": 85173, "cbid": 211, "correlation": 85173 } }, { "ph": "s", "id": 85173, "pid": 76337, "tid": -914061504, "ts": 1716454222573798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721619, "dur": 3, "args": { "External id": 85182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85182, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85182, "pid": 5, "tid": 7, "ts": 1716454222721619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573827, "dur": 7, "args": { "External id": 85182, "cbid": 211, "correlation": 85182 } }, { "ph": "s", "id": 85182, "pid": 76337, "tid": -914061504, "ts": 1716454222573827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721623, "dur": 3, "args": { "External id": 85190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85190, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85190, "pid": 5, "tid": 7, "ts": 1716454222721623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222573852, "dur": 8, "args": { "External id": 85190, "cbid": 211, "correlation": 85190 } }, { "ph": "s", "id": 85190, "pid": 76337, "tid": -914061504, "ts": 1716454222573852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721628, "dur": 3, "args": { "External id": 85198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85198, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85198, "pid": 5, "tid": 7, "ts": 1716454222721628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222574120, "dur": 16, "args": { "External id": 85198, "cbid": 211, "correlation": 85198 } }, { "ph": "s", "id": 85198, "pid": 76337, "tid": -914061504, "ts": 1716454222574120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222721632, "dur": 3, "args": { "External id": 85206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85206, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85206, "pid": 5, "tid": 7, "ts": 1716454222721632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222574152, "dur": 7, "args": { "External id": 85206, "cbid": 211, "correlation": 85206 } }, { "ph": "s", "id": 85206, "pid": 76337, "tid": -914061504, "ts": 1716454222574152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222721637, "dur": 1, "args": { "External id": 85216, "device": 5, "context": 1, "stream": 7, "correlation": 85216, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85216, "pid": 5, "tid": 7, "ts": 1716454222721637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222574215, "dur": 36, "args": { "External id": 85216, "cbid": 41, "correlation": 85216 } }, { "ph": "s", "id": 85216, "pid": 76337, "tid": -914061504, "ts": 1716454222574215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222574252, "dur": 147413, "args": { "External id": 85217, "cbid": 131, "correlation": 85217 } }, { "ph": "f", "id": 85217, "pid": 76337, "tid": -914061504, "ts": 1716454222574252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222722022, "dur": 3, "args": { "External id": 85225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85225, "pid": 5, "tid": 7, "ts": 1716454222722022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222721958, "dur": 67, "args": { "External id": 85225, "cbid": 211, "correlation": 85225 } }, { "ph": "s", "id": 85225, "pid": 76337, "tid": -914061504, "ts": 1716454222721958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722148, "dur": 3, "args": { "External id": 85234, "device": 5, "context": 1, "stream": 7, "correlation": 85234, "bytes": 8, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 85234, "pid": 5, "tid": 7, "ts": 1716454222722148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722103, "dur": 45, "args": { "External id": 85234, "cbid": 41, "correlation": 85234 } }, { "ph": "s", "id": 85234, "pid": 76337, "tid": -914061504, "ts": 1716454222722103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222722260, "dur": 4, "args": { "External id": 85244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85244, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85244, "pid": 5, "tid": 7, "ts": 1716454222722260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722242, "dur": 19, "args": { "External id": 85244, "cbid": 211, "correlation": 85244 } }, { "ph": "s", "id": 85244, "pid": 76337, "tid": -914061504, "ts": 1716454222722242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722362, "dur": 1, "args": { "External id": 85254, "device": 5, "context": 1, "stream": 7, "correlation": 85254, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85254, "pid": 5, "tid": 7, "ts": 1716454222722362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722339, "dur": 21, "args": { "External id": 85254, "cbid": 41, "correlation": 85254 } }, { "ph": "s", "id": 85254, "pid": 76337, "tid": -914061504, "ts": 1716454222722339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222722362, "dur": 9, "args": { "External id": 85255, "cbid": 131, "correlation": 85255 } }, { "ph": "f", "id": 85255, "pid": 76337, "tid": -914061504, "ts": 1716454222722362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722444, "dur": 3, "args": { "External id": 85262, "device": 5, "context": 1, "stream": 7, "correlation": 85262, "bytes": 98304, "memory bandwidth (GB/s)": 30.415841584158414 } }, { "ph": "f", "id": 85262, "pid": 5, "tid": 7, "ts": 1716454222722444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722420, "dur": 23, "args": { "External id": 85262, "cbid": 41, "correlation": 85262 } }, { "ph": "s", "id": 85262, "pid": 76337, "tid": -914061504, "ts": 1716454222722420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722541, "dur": 3, "args": { "External id": 85281, "device": 5, "context": 1, "stream": 7, "correlation": 85281, "bytes": 16, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85281, "pid": 5, "tid": 7, "ts": 1716454222722541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722521, "dur": 19, "args": { "External id": 85281, "cbid": 41, "correlation": 85281 } }, { "ph": "s", "id": 85281, "pid": 76337, "tid": -914061504, "ts": 1716454222722521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222722580, "dur": 3, "args": { "External id": 85287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85287, "pid": 5, "tid": 7, "ts": 1716454222722580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722569, "dur": 11, "args": { "External id": 85287, "cbid": 211, "correlation": 85287 } }, { "ph": "s", "id": 85287, "pid": 76337, "tid": -914061504, "ts": 1716454222722569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454222722595, "dur": 6, "args": { "External id": 85289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85289, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 85289, "pid": 5, "tid": 7, "ts": 1716454222722595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722584, "dur": 9, "args": { "External id": 85289, "cbid": 211, "correlation": 85289 } }, { "ph": "s", "id": 85289, "pid": 76337, "tid": -914061504, "ts": 1716454222722584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454222722604, "dur": 3, "args": { "External id": 85291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85291, "pid": 5, "tid": 7, "ts": 1716454222722604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722595, "dur": 7, "args": { "External id": 85291, "cbid": 211, "correlation": 85291 } }, { "ph": "s", "id": 85291, "pid": 76337, "tid": -914061504, "ts": 1716454222722595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722638, "dur": 3, "args": { "External id": 85299, "device": 5, "context": 1, "stream": 7, "correlation": 85299, "bytes": 8, "memory bandwidth (GB/s)": 0.002631578947368421 } }, { "ph": "f", "id": 85299, "pid": 5, "tid": 7, "ts": 1716454222722638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722623, "dur": 14, "args": { "External id": 85299, "cbid": 41, "correlation": 85299 } }, { "ph": "s", "id": 85299, "pid": 76337, "tid": -914061504, "ts": 1716454222722623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222722688, "dur": 3, "args": { "External id": 85313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85313, "pid": 5, "tid": 7, "ts": 1716454222722688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722676, "dur": 13, "args": { "External id": 85313, "cbid": 211, "correlation": 85313 } }, { "ph": "s", "id": 85313, "pid": 76337, "tid": -914061504, "ts": 1716454222722676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222722708, "dur": 2, "args": { "External id": 85327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85327, "pid": 5, "tid": 7, "ts": 1716454222722708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722700, "dur": 6, "args": { "External id": 85327, "cbid": 211, "correlation": 85327 } }, { "ph": "s", "id": 85327, "pid": 76337, "tid": -914061504, "ts": 1716454222722700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222722744, "dur": 6, "args": { "External id": 85334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85334, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85334, "pid": 5, "tid": 7, "ts": 1716454222722744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722734, "dur": 11, "args": { "External id": 85334, "cbid": 211, "correlation": 85334 } }, { "ph": "s", "id": 85334, "pid": 76337, "tid": -914061504, "ts": 1716454222722734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222722755, "dur": 6, "args": { "External id": 85337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85337, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85337, "pid": 5, "tid": 7, "ts": 1716454222722755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722747, "dur": 7, "args": { "External id": 85337, "cbid": 211, "correlation": 85337 } }, { "ph": "s", "id": 85337, "pid": 76337, "tid": -914061504, "ts": 1716454222722747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454222722763, "dur": 3, "args": { "External id": 85339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85339, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85339, "pid": 5, "tid": 7, "ts": 1716454222722763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722756, "dur": 6, "args": { "External id": 85339, "cbid": 211, "correlation": 85339 } }, { "ph": "s", "id": 85339, "pid": 76337, "tid": -914061504, "ts": 1716454222722756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722784, "dur": 2, "args": { "External id": 85342, "device": 5, "context": 1, "stream": 7, "correlation": 85342, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 85342, "pid": 5, "tid": 7, "ts": 1716454222722784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722771, "dur": 12, "args": { "External id": 85342, "cbid": 41, "correlation": 85342 } }, { "ph": "s", "id": 85342, "pid": 76337, "tid": -914061504, "ts": 1716454222722771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222722838, "dur": 4, "args": { "External id": 85358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85358, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85358, "pid": 5, "tid": 7, "ts": 1716454222722838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722824, "dur": 14, "args": { "External id": 85358, "cbid": 211, "correlation": 85358 } }, { "ph": "s", "id": 85358, "pid": 76337, "tid": -914061504, "ts": 1716454222722824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222722858, "dur": 3, "args": { "External id": 85363, "device": 5, "context": 1, "stream": 7, "correlation": 85363, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 85363, "pid": 5, "tid": 7, "ts": 1716454222722858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722844, "dur": 13, "args": { "External id": 85363, "cbid": 41, "correlation": 85363 } }, { "ph": "s", "id": 85363, "pid": 76337, "tid": -914061504, "ts": 1716454222722844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222722886, "dur": 1, "args": { "External id": 85369, "device": 5, "context": 1, "stream": 7, "correlation": 85369, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 85369, "pid": 5, "tid": 7, "ts": 1716454222722886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222722867, "dur": 28, "args": { "External id": 85369, "cbid": 41, "correlation": 85369 } }, { "ph": "s", "id": 85369, "pid": 76337, "tid": -914061504, "ts": 1716454222722867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222722896, "dur": 4, "args": { "External id": 85370, "cbid": 131, "correlation": 85370 } }, { "ph": "f", "id": 85370, "pid": 76337, "tid": -914061504, "ts": 1716454222722896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222722956, "dur": 3, "args": { "External id": 85378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85378, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85378, "pid": 5, "tid": 7, "ts": 1716454222722956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722939, "dur": 17, "args": { "External id": 85378, "cbid": 211, "correlation": 85378 } }, { "ph": "s", "id": 85378, "pid": 76337, "tid": -914061504, "ts": 1716454222722939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222722997, "dur": 3, "args": { "External id": 85388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85388, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85388, "pid": 5, "tid": 7, "ts": 1716454222722997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222722985, "dur": 11, "args": { "External id": 85388, "cbid": 211, "correlation": 85388 } }, { "ph": "s", "id": 85388, "pid": 76337, "tid": -914061504, "ts": 1716454222722985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723027, "dur": 3, "args": { "External id": 85397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85397, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85397, "pid": 5, "tid": 7, "ts": 1716454222723027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723017, "dur": 9, "args": { "External id": 85397, "cbid": 211, "correlation": 85397 } }, { "ph": "s", "id": 85397, "pid": 76337, "tid": -914061504, "ts": 1716454222723017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222723158, "dur": 12, "args": { "External id": 85407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85407, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85407, "pid": 5, "tid": 7, "ts": 1716454222723158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723143, "dur": 16, "args": { "External id": 85407, "cbid": 211, "correlation": 85407 } }, { "ph": "s", "id": 85407, "pid": 76337, "tid": -914061504, "ts": 1716454222723143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723199, "dur": 3, "args": { "External id": 85415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85415, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85415, "pid": 5, "tid": 7, "ts": 1716454222723199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723189, "dur": 8, "args": { "External id": 85415, "cbid": 211, "correlation": 85415 } }, { "ph": "s", "id": 85415, "pid": 76337, "tid": -914061504, "ts": 1716454222723189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222723247, "dur": 12, "args": { "External id": 85425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85425, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85425, "pid": 5, "tid": 7, "ts": 1716454222723247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723236, "dur": 11, "args": { "External id": 85425, "cbid": 211, "correlation": 85425 } }, { "ph": "s", "id": 85425, "pid": 76337, "tid": -914061504, "ts": 1716454222723236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222723284, "dur": 10, "args": { "External id": 85433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85433, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85433, "pid": 5, "tid": 7, "ts": 1716454222723284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723274, "dur": 9, "args": { "External id": 85433, "cbid": 211, "correlation": 85433 } }, { "ph": "s", "id": 85433, "pid": 76337, "tid": -914061504, "ts": 1716454222723274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723313, "dur": 3, "args": { "External id": 85442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85442, "pid": 5, "tid": 7, "ts": 1716454222723313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723303, "dur": 9, "args": { "External id": 85442, "cbid": 211, "correlation": 85442 } }, { "ph": "s", "id": 85442, "pid": 76337, "tid": -914061504, "ts": 1716454222723303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222723337, "dur": 5, "args": { "External id": 85451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85451, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85451, "pid": 5, "tid": 7, "ts": 1716454222723337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723329, "dur": 8, "args": { "External id": 85451, "cbid": 211, "correlation": 85451 } }, { "ph": "s", "id": 85451, "pid": 76337, "tid": -914061504, "ts": 1716454222723329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222723380, "dur": 8, "args": { "External id": 85461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85461, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85461, "pid": 5, "tid": 7, "ts": 1716454222723380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723368, "dur": 12, "args": { "External id": 85461, "cbid": 211, "correlation": 85461 } }, { "ph": "s", "id": 85461, "pid": 76337, "tid": -914061504, "ts": 1716454222723368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723729, "dur": 3, "args": { "External id": 85470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85470, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85470, "pid": 5, "tid": 7, "ts": 1716454222723729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723714, "dur": 16, "args": { "External id": 85470, "cbid": 211, "correlation": 85470 } }, { "ph": "s", "id": 85470, "pid": 76337, "tid": -914061504, "ts": 1716454222723714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723761, "dur": 3, "args": { "External id": 85478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85478, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85478, "pid": 5, "tid": 7, "ts": 1716454222723761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723750, "dur": 11, "args": { "External id": 85478, "cbid": 211, "correlation": 85478 } }, { "ph": "s", "id": 85478, "pid": 76337, "tid": -914061504, "ts": 1716454222723750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222723814, "dur": 1, "args": { "External id": 85488, "device": 5, "context": 1, "stream": 7, "correlation": 85488, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85488, "pid": 5, "tid": 7, "ts": 1716454222723814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222723799, "dur": 13, "args": { "External id": 85488, "cbid": 41, "correlation": 85488 } }, { "ph": "s", "id": 85488, "pid": 76337, "tid": -914061504, "ts": 1716454222723799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222723813, "dur": 9, "args": { "External id": 85489, "cbid": 131, "correlation": 85489 } }, { "ph": "f", "id": 85489, "pid": 76337, "tid": -914061504, "ts": 1716454222723813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222723907, "dur": 2, "args": { "External id": 85497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85497, "pid": 5, "tid": 7, "ts": 1716454222723907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222723893, "dur": 15, "args": { "External id": 85497, "cbid": 211, "correlation": 85497 } }, { "ph": "s", "id": 85497, "pid": 76337, "tid": -914061504, "ts": 1716454222723893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222723990, "dur": 3, "args": { "External id": 85506, "device": 5, "context": 1, "stream": 7, "correlation": 85506, "bytes": 8, "memory bandwidth (GB/s)": 0.002577319587628866 } }, { "ph": "f", "id": 85506, "pid": 5, "tid": 7, "ts": 1716454222723990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222723964, "dur": 26, "args": { "External id": 85506, "cbid": 41, "correlation": 85506 } }, { "ph": "s", "id": 85506, "pid": 76337, "tid": -914061504, "ts": 1716454222723964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222724067, "dur": 3, "args": { "External id": 85516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85516, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85516, "pid": 5, "tid": 7, "ts": 1716454222724067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724052, "dur": 15, "args": { "External id": 85516, "cbid": 211, "correlation": 85516 } }, { "ph": "s", "id": 85516, "pid": 76337, "tid": -914061504, "ts": 1716454222724052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222724120, "dur": 1, "args": { "External id": 85526, "device": 5, "context": 1, "stream": 7, "correlation": 85526, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85526, "pid": 5, "tid": 7, "ts": 1716454222724120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724104, "dur": 14, "args": { "External id": 85526, "cbid": 41, "correlation": 85526 } }, { "ph": "s", "id": 85526, "pid": 76337, "tid": -914061504, "ts": 1716454222724104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724119, "dur": 8, "args": { "External id": 85527, "cbid": 131, "correlation": 85527 } }, { "ph": "f", "id": 85527, "pid": 76337, "tid": -914061504, "ts": 1716454222724119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222724181, "dur": 3, "args": { "External id": 85534, "device": 5, "context": 1, "stream": 7, "correlation": 85534, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 85534, "pid": 5, "tid": 7, "ts": 1716454222724181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724160, "dur": 20, "args": { "External id": 85534, "cbid": 41, "correlation": 85534 } }, { "ph": "s", "id": 85534, "pid": 76337, "tid": -914061504, "ts": 1716454222724160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222724228, "dur": 1, "args": { "External id": 85545, "device": 5, "context": 1, "stream": 7, "correlation": 85545, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 85545, "pid": 5, "tid": 7, "ts": 1716454222724228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724216, "dur": 10, "args": { "External id": 85545, "cbid": 41, "correlation": 85545 } }, { "ph": "s", "id": 85545, "pid": 76337, "tid": -914061504, "ts": 1716454222724216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724227, "dur": 8, "args": { "External id": 85546, "cbid": 131, "correlation": 85546 } }, { "ph": "f", "id": 85546, "pid": 76337, "tid": -914061504, "ts": 1716454222724227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724276, "dur": 3, "args": { "External id": 85554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85554, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85554, "pid": 5, "tid": 7, "ts": 1716454222724276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724263, "dur": 13, "args": { "External id": 85554, "cbid": 211, "correlation": 85554 } }, { "ph": "s", "id": 85554, "pid": 76337, "tid": -914061504, "ts": 1716454222724263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724306, "dur": 3, "args": { "External id": 85564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85564, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85564, "pid": 5, "tid": 7, "ts": 1716454222724306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724297, "dur": 8, "args": { "External id": 85564, "cbid": 211, "correlation": 85564 } }, { "ph": "s", "id": 85564, "pid": 76337, "tid": -914061504, "ts": 1716454222724297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724328, "dur": 3, "args": { "External id": 85573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85573, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85573, "pid": 5, "tid": 7, "ts": 1716454222724328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724320, "dur": 7, "args": { "External id": 85573, "cbid": 211, "correlation": 85573 } }, { "ph": "s", "id": 85573, "pid": 76337, "tid": -914061504, "ts": 1716454222724320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222724397, "dur": 6, "args": { "External id": 85581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85581, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85581, "pid": 5, "tid": 7, "ts": 1716454222724397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724384, "dur": 14, "args": { "External id": 85581, "cbid": 211, "correlation": 85581 } }, { "ph": "s", "id": 85581, "pid": 76337, "tid": -914061504, "ts": 1716454222724384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724437, "dur": 3, "args": { "External id": 85590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85590, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85590, "pid": 5, "tid": 7, "ts": 1716454222724437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724427, "dur": 9, "args": { "External id": 85590, "cbid": 211, "correlation": 85590 } }, { "ph": "s", "id": 85590, "pid": 76337, "tid": -914061504, "ts": 1716454222724427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724460, "dur": 3, "args": { "External id": 85599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85599, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85599, "pid": 5, "tid": 7, "ts": 1716454222724460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724451, "dur": 8, "args": { "External id": 85599, "cbid": 211, "correlation": 85599 } }, { "ph": "s", "id": 85599, "pid": 76337, "tid": -914061504, "ts": 1716454222724451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222724526, "dur": 3, "args": { "External id": 85607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85607, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85607, "pid": 5, "tid": 7, "ts": 1716454222724526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724515, "dur": 10, "args": { "External id": 85607, "cbid": 211, "correlation": 85607 } }, { "ph": "s", "id": 85607, "pid": 76337, "tid": -914061504, "ts": 1716454222724515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222724584, "dur": 1, "args": { "External id": 85615, "device": 5, "context": 1, "stream": 7, "correlation": 85615, "bytes": 8, "memory bandwidth (GB/s)": 0.0043859649122807015 } }, { "ph": "f", "id": 85615, "pid": 5, "tid": 7, "ts": 1716454222724584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724568, "dur": 26, "args": { "External id": 85615, "cbid": 41, "correlation": 85615 } }, { "ph": "s", "id": 85615, "pid": 76337, "tid": -914061504, "ts": 1716454222724568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724594, "dur": 3, "args": { "External id": 85616, "cbid": 131, "correlation": 85616 } }, { "ph": "f", "id": 85616, "pid": 76337, "tid": -914061504, "ts": 1716454222724594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222724656, "dur": 1, "args": { "External id": 85626, "device": 5, "context": 1, "stream": 7, "correlation": 85626, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 85626, "pid": 5, "tid": 7, "ts": 1716454222724656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724643, "dur": 11, "args": { "External id": 85626, "cbid": 41, "correlation": 85626 } }, { "ph": "s", "id": 85626, "pid": 76337, "tid": -914061504, "ts": 1716454222724643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724655, "dur": 8, "args": { "External id": 85627, "cbid": 131, "correlation": 85627 } }, { "ph": "f", "id": 85627, "pid": 76337, "tid": -914061504, "ts": 1716454222724655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222724759, "dur": 1, "args": { "External id": 85636, "device": 5, "context": 1, "stream": 7, "correlation": 85636, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85636, "pid": 5, "tid": 7, "ts": 1716454222724759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724704, "dur": 53, "args": { "External id": 85636, "cbid": 41, "correlation": 85636 } }, { "ph": "s", "id": 85636, "pid": 76337, "tid": -914061504, "ts": 1716454222724704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724758, "dur": 8, "args": { "External id": 85637, "cbid": 131, "correlation": 85637 } }, { "ph": "f", "id": 85637, "pid": 76337, "tid": -914061504, "ts": 1716454222724758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222724834, "dur": 4, "args": { "External id": 85644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85644, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85644, "pid": 5, "tid": 7, "ts": 1716454222724834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724817, "dur": 17, "args": { "External id": 85644, "cbid": 211, "correlation": 85644 } }, { "ph": "s", "id": 85644, "pid": 76337, "tid": -914061504, "ts": 1716454222724817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454222724871, "dur": 4, "args": { "External id": 85664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85664, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85664, "pid": 5, "tid": 7, "ts": 1716454222724871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724859, "dur": 12, "args": { "External id": 85664, "cbid": 211, "correlation": 85664 } }, { "ph": "s", "id": 85664, "pid": 76337, "tid": -914061504, "ts": 1716454222724859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724872, "dur": 0, "args": { "External id": 85665, "cbid": 11, "correlation": 85665 } }, { "ph": "f", "id": 85665, "pid": 76337, "tid": -914061504, "ts": 1716454222724872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724872, "dur": 0, "args": { "External id": 85666, "cbid": 11, "correlation": 85666 } }, { "ph": "f", "id": 85666, "pid": 76337, "tid": -914061504, "ts": 1716454222724872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222724886, "dur": 1, "args": { "External id": 85669, "device": 5, "context": 1, "stream": 7, "correlation": 85669, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 85669, "pid": 5, "tid": 7, "ts": 1716454222724886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724873, "dur": 21, "args": { "External id": 85669, "cbid": 41, "correlation": 85669 } }, { "ph": "s", "id": 85669, "pid": 76337, "tid": -914061504, "ts": 1716454222724873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724896, "dur": 3, "args": { "External id": 85670, "cbid": 131, "correlation": 85670 } }, { "ph": "f", "id": 85670, "pid": 76337, "tid": -914061504, "ts": 1716454222724896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222724924, "dur": 3, "args": { "External id": 85694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85694, "pid": 5, "tid": 7, "ts": 1716454222724924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724914, "dur": 9, "args": { "External id": 85694, "cbid": 211, "correlation": 85694 } }, { "ph": "s", "id": 85694, "pid": 76337, "tid": -914061504, "ts": 1716454222724914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724924, "dur": 0, "args": { "External id": 85695, "cbid": 11, "correlation": 85695 } }, { "ph": "f", "id": 85695, "pid": 76337, "tid": -914061504, "ts": 1716454222724924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724924, "dur": 0, "args": { "External id": 85696, "cbid": 11, "correlation": 85696 } }, { "ph": "f", "id": 85696, "pid": 76337, "tid": -914061504, "ts": 1716454222724924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222724926, "dur": 1, "args": { "External id": 85698, "cbid": 200, "correlation": 85698 } }, { "ph": "f", "id": 85698, "pid": 76337, "tid": -914061504, "ts": 1716454222724926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454222724937, "dur": 4, "args": { "External id": 85700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85700, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85700, "pid": 5, "tid": 7, "ts": 1716454222724937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222724929, "dur": 8, "args": { "External id": 85700, "cbid": 211, "correlation": 85700 } }, { "ph": "s", "id": 85700, "pid": 76337, "tid": -914061504, "ts": 1716454222724929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724938, "dur": 0, "args": { "External id": 85701, "cbid": 11, "correlation": 85701 } }, { "ph": "f", "id": 85701, "pid": 76337, "tid": -914061504, "ts": 1716454222724938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222724938, "dur": 0, "args": { "External id": 85702, "cbid": 11, "correlation": 85702 } }, { "ph": "f", "id": 85702, "pid": 76337, "tid": -914061504, "ts": 1716454222724938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222724984, "dur": 1, "args": { "External id": 85709, "device": 5, "context": 1, "stream": 7, "correlation": 85709, "bytes": 8, "memory bandwidth (GB/s)": 0.0047169811320754715 } }, { "ph": "f", "id": 85709, "pid": 5, "tid": 7, "ts": 1716454222724984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222724965, "dur": 28, "args": { "External id": 85709, "cbid": 41, "correlation": 85709 } }, { "ph": "s", "id": 85709, "pid": 76337, "tid": -914061504, "ts": 1716454222724965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222724993, "dur": 3, "args": { "External id": 85710, "cbid": 131, "correlation": 85710 } }, { "ph": "f", "id": 85710, "pid": 76337, "tid": -914061504, "ts": 1716454222724993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222725046, "dur": 1, "args": { "External id": 85720, "device": 5, "context": 1, "stream": 7, "correlation": 85720, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 85720, "pid": 5, "tid": 7, "ts": 1716454222725046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222725034, "dur": 10, "args": { "External id": 85720, "cbid": 41, "correlation": 85720 } }, { "ph": "s", "id": 85720, "pid": 76337, "tid": -914061504, "ts": 1716454222725034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222725045, "dur": 7, "args": { "External id": 85721, "cbid": 131, "correlation": 85721 } }, { "ph": "f", "id": 85721, "pid": 76337, "tid": -914061504, "ts": 1716454222725045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222725115, "dur": 5, "args": { "External id": 85728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85728, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85728, "pid": 5, "tid": 7, "ts": 1716454222725115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725100, "dur": 16, "args": { "External id": 85728, "cbid": 211, "correlation": 85728 } }, { "ph": "s", "id": 85728, "pid": 76337, "tid": -914061504, "ts": 1716454222725100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725185, "dur": 3, "args": { "External id": 85737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85737, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85737, "pid": 5, "tid": 7, "ts": 1716454222725185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725171, "dur": 13, "args": { "External id": 85737, "cbid": 211, "correlation": 85737 } }, { "ph": "s", "id": 85737, "pid": 76337, "tid": -914061504, "ts": 1716454222725171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725220, "dur": 3, "args": { "External id": 85745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85745, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85745, "pid": 5, "tid": 7, "ts": 1716454222725220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725210, "dur": 9, "args": { "External id": 85745, "cbid": 211, "correlation": 85745 } }, { "ph": "s", "id": 85745, "pid": 76337, "tid": -914061504, "ts": 1716454222725210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725252, "dur": 4, "args": { "External id": 85753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85753, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85753, "pid": 5, "tid": 7, "ts": 1716454222725252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725241, "dur": 11, "args": { "External id": 85753, "cbid": 211, "correlation": 85753 } }, { "ph": "s", "id": 85753, "pid": 76337, "tid": -914061504, "ts": 1716454222725241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725283, "dur": 4, "args": { "External id": 85761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85761, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85761, "pid": 5, "tid": 7, "ts": 1716454222725283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725271, "dur": 11, "args": { "External id": 85761, "cbid": 211, "correlation": 85761 } }, { "ph": "s", "id": 85761, "pid": 76337, "tid": -914061504, "ts": 1716454222725271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725309, "dur": 3, "args": { "External id": 85769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85769, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85769, "pid": 5, "tid": 7, "ts": 1716454222725309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725300, "dur": 8, "args": { "External id": 85769, "cbid": 211, "correlation": 85769 } }, { "ph": "s", "id": 85769, "pid": 76337, "tid": -914061504, "ts": 1716454222725300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725334, "dur": 3, "args": { "External id": 85777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85777, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85777, "pid": 5, "tid": 7, "ts": 1716454222725334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725325, "dur": 8, "args": { "External id": 85777, "cbid": 211, "correlation": 85777 } }, { "ph": "s", "id": 85777, "pid": 76337, "tid": -914061504, "ts": 1716454222725325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222725356, "dur": 4, "args": { "External id": 85785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85785, "pid": 5, "tid": 7, "ts": 1716454222725356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725348, "dur": 7, "args": { "External id": 85785, "cbid": 211, "correlation": 85785 } }, { "ph": "s", "id": 85785, "pid": 76337, "tid": -914061504, "ts": 1716454222725348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222725375, "dur": 5, "args": { "External id": 85793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85793, "pid": 5, "tid": 7, "ts": 1716454222725375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725367, "dur": 7, "args": { "External id": 85793, "cbid": 211, "correlation": 85793 } }, { "ph": "s", "id": 85793, "pid": 76337, "tid": -914061504, "ts": 1716454222725367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725394, "dur": 3, "args": { "External id": 85801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85801, "pid": 5, "tid": 7, "ts": 1716454222725394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725386, "dur": 7, "args": { "External id": 85801, "cbid": 211, "correlation": 85801 } }, { "ph": "s", "id": 85801, "pid": 76337, "tid": -914061504, "ts": 1716454222725386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725453, "dur": 4, "args": { "External id": 85809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85809, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 85809, "pid": 5, "tid": 7, "ts": 1716454222725453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725442, "dur": 11, "args": { "External id": 85809, "cbid": 211, "correlation": 85809 } }, { "ph": "s", "id": 85809, "pid": 76337, "tid": -914061504, "ts": 1716454222725442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222725479, "dur": 4, "args": { "External id": 85817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85817, "pid": 5, "tid": 7, "ts": 1716454222725479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725470, "dur": 8, "args": { "External id": 85817, "cbid": 211, "correlation": 85817 } }, { "ph": "s", "id": 85817, "pid": 76337, "tid": -914061504, "ts": 1716454222725470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222725501, "dur": 4, "args": { "External id": 85825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85825, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85825, "pid": 5, "tid": 7, "ts": 1716454222725501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725492, "dur": 7, "args": { "External id": 85825, "cbid": 211, "correlation": 85825 } }, { "ph": "s", "id": 85825, "pid": 76337, "tid": -914061504, "ts": 1716454222725492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222725521, "dur": 3, "args": { "External id": 85833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85833, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 85833, "pid": 5, "tid": 7, "ts": 1716454222725521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725512, "dur": 7, "args": { "External id": 85833, "cbid": 211, "correlation": 85833 } }, { "ph": "s", "id": 85833, "pid": 76337, "tid": -914061504, "ts": 1716454222725512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222725933, "dur": 5, "args": { "External id": 85842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85842, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85842, "pid": 5, "tid": 7, "ts": 1716454222725933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725916, "dur": 17, "args": { "External id": 85842, "cbid": 211, "correlation": 85842 } }, { "ph": "s", "id": 85842, "pid": 76337, "tid": -914061504, "ts": 1716454222725916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222725970, "dur": 5, "args": { "External id": 85851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85851, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85851, "pid": 5, "tid": 7, "ts": 1716454222725970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222725960, "dur": 9, "args": { "External id": 85851, "cbid": 211, "correlation": 85851 } }, { "ph": "s", "id": 85851, "pid": 76337, "tid": -914061504, "ts": 1716454222725960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222726113, "dur": 3, "args": { "External id": 85867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85867, "pid": 5, "tid": 7, "ts": 1716454222726113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726097, "dur": 16, "args": { "External id": 85867, "cbid": 211, "correlation": 85867 } }, { "ph": "s", "id": 85867, "pid": 76337, "tid": -914061504, "ts": 1716454222726097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726147, "dur": 3, "args": { "External id": 85875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85875, "pid": 5, "tid": 7, "ts": 1716454222726147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726137, "dur": 9, "args": { "External id": 85875, "cbid": 211, "correlation": 85875 } }, { "ph": "s", "id": 85875, "pid": 76337, "tid": -914061504, "ts": 1716454222726137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726179, "dur": 3, "args": { "External id": 85883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85883, "pid": 5, "tid": 7, "ts": 1716454222726179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726170, "dur": 8, "args": { "External id": 85883, "cbid": 211, "correlation": 85883 } }, { "ph": "s", "id": 85883, "pid": 76337, "tid": -914061504, "ts": 1716454222726170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726209, "dur": 4, "args": { "External id": 85891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85891, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85891, "pid": 5, "tid": 7, "ts": 1716454222726209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726200, "dur": 8, "args": { "External id": 85891, "cbid": 211, "correlation": 85891 } }, { "ph": "s", "id": 85891, "pid": 76337, "tid": -914061504, "ts": 1716454222726200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222726266, "dur": 4, "args": { "External id": 85903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85903, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85903, "pid": 5, "tid": 7, "ts": 1716454222726266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726253, "dur": 13, "args": { "External id": 85903, "cbid": 211, "correlation": 85903 } }, { "ph": "s", "id": 85903, "pid": 76337, "tid": -914061504, "ts": 1716454222726253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222726312, "dur": 4, "args": { "External id": 85914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85914, "pid": 5, "tid": 7, "ts": 1716454222726312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726300, "dur": 12, "args": { "External id": 85914, "cbid": 211, "correlation": 85914 } }, { "ph": "s", "id": 85914, "pid": 76337, "tid": -914061504, "ts": 1716454222726300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726344, "dur": 3, "args": { "External id": 85922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85922, "pid": 5, "tid": 7, "ts": 1716454222726344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726334, "dur": 8, "args": { "External id": 85922, "cbid": 211, "correlation": 85922 } }, { "ph": "s", "id": 85922, "pid": 76337, "tid": -914061504, "ts": 1716454222726334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726376, "dur": 5, "args": { "External id": 85930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85930, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85930, "pid": 5, "tid": 7, "ts": 1716454222726376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726366, "dur": 10, "args": { "External id": 85930, "cbid": 211, "correlation": 85930 } }, { "ph": "s", "id": 85930, "pid": 76337, "tid": -914061504, "ts": 1716454222726366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726404, "dur": 5, "args": { "External id": 85938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85938, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85938, "pid": 5, "tid": 7, "ts": 1716454222726404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726394, "dur": 9, "args": { "External id": 85938, "cbid": 211, "correlation": 85938 } }, { "ph": "s", "id": 85938, "pid": 76337, "tid": -914061504, "ts": 1716454222726394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222726435, "dur": 4, "args": { "External id": 85947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85947, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85947, "pid": 5, "tid": 7, "ts": 1716454222726435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726424, "dur": 10, "args": { "External id": 85947, "cbid": 211, "correlation": 85947 } }, { "ph": "s", "id": 85947, "pid": 76337, "tid": -914061504, "ts": 1716454222726424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222726494, "dur": 5, "args": { "External id": 85960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85960, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 85960, "pid": 5, "tid": 7, "ts": 1716454222726494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726482, "dur": 13, "args": { "External id": 85960, "cbid": 211, "correlation": 85960 } }, { "ph": "s", "id": 85960, "pid": 76337, "tid": -914061504, "ts": 1716454222726482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222726534, "dur": 5, "args": { "External id": 85970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85970, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 85970, "pid": 5, "tid": 7, "ts": 1716454222726534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726523, "dur": 10, "args": { "External id": 85970, "cbid": 211, "correlation": 85970 } }, { "ph": "s", "id": 85970, "pid": 76337, "tid": -914061504, "ts": 1716454222726523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222726665, "dur": 5, "args": { "External id": 85987, "cbid": 251, "correlation": 85987 } }, { "ph": "f", "id": 85987, "pid": 76337, "tid": -914061504, "ts": 1716454222726665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454222726694, "dur": 11, "args": { "External id": 85989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85989, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 85989, "pid": 5, "tid": 7, "ts": 1716454222726694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726679, "dur": 16, "args": { "External id": 85989, "cbid": 211, "correlation": 85989 } }, { "ph": "s", "id": 85989, "pid": 76337, "tid": -914061504, "ts": 1716454222726679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222726755, "dur": 3, "args": { "External id": 85997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 85997, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 85997, "pid": 5, "tid": 7, "ts": 1716454222726755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726743, "dur": 11, "args": { "External id": 85997, "cbid": 211, "correlation": 85997 } }, { "ph": "s", "id": 85997, "pid": 76337, "tid": -914061504, "ts": 1716454222726743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222726813, "dur": 2, "args": { "External id": 86013, "cbid": 251, "correlation": 86013 } }, { "ph": "f", "id": 86013, "pid": 76337, "tid": -914061504, "ts": 1716454222726813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222726819, "dur": 0, "args": { "External id": 86015, "cbid": 251, "correlation": 86015 } }, { "ph": "f", "id": 86015, "pid": 76337, "tid": -914061504, "ts": 1716454222726819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222726836, "dur": 14, "args": { "External id": 86016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86016, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86016, "pid": 5, "tid": 7, "ts": 1716454222726836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726822, "dur": 14, "args": { "External id": 86016, "cbid": 211, "correlation": 86016 } }, { "ph": "s", "id": 86016, "pid": 76337, "tid": -914061504, "ts": 1716454222726822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222726852, "dur": 5, "args": { "External id": 86018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86018, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86018, "pid": 5, "tid": 7, "ts": 1716454222726852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222726841, "dur": 9, "args": { "External id": 86018, "cbid": 211, "correlation": 86018 } }, { "ph": "s", "id": 86018, "pid": 76337, "tid": -914061504, "ts": 1716454222726841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222726955, "dur": 1, "args": { "External id": 86028, "cbid": 317, "correlation": 86028 } }, { "ph": "f", "id": 86028, "pid": 76337, "tid": -914061504, "ts": 1716454222726955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222726957, "dur": 1, "args": { "External id": 86029, "cbid": 203, "correlation": 86029 } }, { "ph": "f", "id": 86029, "pid": 76337, "tid": -914061504, "ts": 1716454222726957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222726959, "dur": 1, "args": { "External id": 86030, "cbid": 205, "correlation": 86030 } }, { "ph": "f", "id": 86030, "pid": 76337, "tid": -914061504, "ts": 1716454222726959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222727026, "dur": 6, "args": { "External id": 86034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86034, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86034, "pid": 5, "tid": 7, "ts": 1716454222727026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222727010, "dur": 15, "args": { "External id": 86034, "cbid": 211, "correlation": 86034 } }, { "ph": "s", "id": 86034, "pid": 76337, "tid": -914061504, "ts": 1716454222727010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222727036, "dur": 4, "args": { "External id": 86036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 86036, "pid": 5, "tid": 7, "ts": 1716454222727036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222727029, "dur": 6, "args": { "External id": 86036, "cbid": 211, "correlation": 86036 } }, { "ph": "s", "id": 86036, "pid": 76337, "tid": -914061504, "ts": 1716454222727029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222727057, "dur": 3, "args": { "External id": 86038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86038, "pid": 5, "tid": 7, "ts": 1716454222727057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222727047, "dur": 8, "args": { "External id": 86038, "cbid": 211, "correlation": 86038 } }, { "ph": "s", "id": 86038, "pid": 76337, "tid": -914061504, "ts": 1716454222727047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222727062, "dur": 0, "args": { "External id": 86039, "cbid": 51, "correlation": 86039 } }, { "ph": "s", "id": 86039, "pid": 76337, "tid": -914061504, "ts": 1716454222727062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222727072, "dur": 86, "args": { "External id": 86040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86040, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86040, "pid": 5, "tid": 7, "ts": 1716454222727072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222727063, "dur": 7, "args": { "External id": 86040, "cbid": 211, "correlation": 86040 } }, { "ph": "s", "id": 86040, "pid": 76337, "tid": -914061504, "ts": 1716454222727063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222727159, "dur": 59, "args": { "External id": 86045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86045, "pid": 5, "tid": 7, "ts": 1716454222727159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222727103, "dur": 10, "args": { "External id": 86045, "cbid": 211, "correlation": 86045 } }, { "ph": "s", "id": 86045, "pid": 76337, "tid": -914061504, "ts": 1716454222727103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222728927, "dur": 53, "args": { "External id": 86065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86065, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86065, "pid": 5, "tid": 7, "ts": 1716454222728927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222728910, "dur": 17, "args": { "External id": 86065, "cbid": 211, "correlation": 86065 } }, { "ph": "s", "id": 86065, "pid": 76337, "tid": -914061504, "ts": 1716454222728910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222728981, "dur": 4, "args": { "External id": 86077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86077, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86077, "pid": 5, "tid": 7, "ts": 1716454222728981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222728939, "dur": 8, "args": { "External id": 86077, "cbid": 211, "correlation": 86077 } }, { "ph": "s", "id": 86077, "pid": 76337, "tid": -914061504, "ts": 1716454222728939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222728987, "dur": 56, "args": { "External id": 86080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86080, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86080, "pid": 5, "tid": 7, "ts": 1716454222728987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222728963, "dur": 7, "args": { "External id": 86080, "cbid": 211, "correlation": 86080 } }, { "ph": "s", "id": 86080, "pid": 76337, "tid": -914061504, "ts": 1716454222728963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222729044, "dur": 37, "args": { "External id": 86089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86089, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86089, "pid": 5, "tid": 7, "ts": 1716454222729044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729016, "dur": 11, "args": { "External id": 86089, "cbid": 211, "correlation": 86089 } }, { "ph": "s", "id": 86089, "pid": 76337, "tid": -914061504, "ts": 1716454222729016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222729076, "dur": 0, "args": { "External id": 86099, "cbid": 317, "correlation": 86099 } }, { "ph": "f", "id": 86099, "pid": 76337, "tid": -914061504, "ts": 1716454222729076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222729077, "dur": 0, "args": { "External id": 86100, "cbid": 203, "correlation": 86100 } }, { "ph": "f", "id": 86100, "pid": 76337, "tid": -914061504, "ts": 1716454222729077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222729078, "dur": 0, "args": { "External id": 86101, "cbid": 205, "correlation": 86101 } }, { "ph": "f", "id": 86101, "pid": 76337, "tid": -914061504, "ts": 1716454222729078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222729109, "dur": 41, "args": { "External id": 86105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86105, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86105, "pid": 5, "tid": 7, "ts": 1716454222729109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729097, "dur": 12, "args": { "External id": 86105, "cbid": 211, "correlation": 86105 } }, { "ph": "s", "id": 86105, "pid": 76337, "tid": -914061504, "ts": 1716454222729097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222729151, "dur": 15, "args": { "External id": 86107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86107, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86107, "pid": 5, "tid": 7, "ts": 1716454222729151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729111, "dur": 6, "args": { "External id": 86107, "cbid": 211, "correlation": 86107 } }, { "ph": "s", "id": 86107, "pid": 76337, "tid": -914061504, "ts": 1716454222729111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222729167, "dur": 3, "args": { "External id": 86109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86109, "pid": 5, "tid": 7, "ts": 1716454222729167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729123, "dur": 6, "args": { "External id": 86109, "cbid": 211, "correlation": 86109 } }, { "ph": "s", "id": 86109, "pid": 76337, "tid": -914061504, "ts": 1716454222729123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222729133, "dur": 0, "args": { "External id": 86110, "cbid": 51, "correlation": 86110 } }, { "ph": "s", "id": 86110, "pid": 76337, "tid": -914061504, "ts": 1716454222729133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222729172, "dur": 713, "args": { "External id": 86111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86111, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86111, "pid": 5, "tid": 7, "ts": 1716454222729172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729134, "dur": 6, "args": { "External id": 86111, "cbid": 211, "correlation": 86111 } }, { "ph": "s", "id": 86111, "pid": 76337, "tid": -914061504, "ts": 1716454222729134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222729886, "dur": 60, "args": { "External id": 86116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86116, "pid": 5, "tid": 7, "ts": 1716454222729886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729164, "dur": 9, "args": { "External id": 86116, "cbid": 211, "correlation": 86116 } }, { "ph": "s", "id": 86116, "pid": 76337, "tid": -914061504, "ts": 1716454222729164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222729947, "dur": 4, "args": { "External id": 86124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86124, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86124, "pid": 5, "tid": 7, "ts": 1716454222729947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729210, "dur": 9, "args": { "External id": 86124, "cbid": 211, "correlation": 86124 } }, { "ph": "s", "id": 86124, "pid": 76337, "tid": -914061504, "ts": 1716454222729210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729278, "dur": 2, "args": { "External id": 86140, "cbid": 251, "correlation": 86140 } }, { "ph": "f", "id": 86140, "pid": 76337, "tid": -914061504, "ts": 1716454222729278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729284, "dur": 0, "args": { "External id": 86142, "cbid": 251, "correlation": 86142 } }, { "ph": "f", "id": 86142, "pid": 76337, "tid": -914061504, "ts": 1716454222729284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222729952, "dur": 9, "args": { "External id": 86143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86143, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 86143, "pid": 5, "tid": 7, "ts": 1716454222729952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729286, "dur": 11, "args": { "External id": 86143, "cbid": 211, "correlation": 86143 } }, { "ph": "s", "id": 86143, "pid": 76337, "tid": -914061504, "ts": 1716454222729286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222729962, "dur": 4, "args": { "External id": 86145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86145, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 86145, "pid": 5, "tid": 7, "ts": 1716454222729962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729299, "dur": 7, "args": { "External id": 86145, "cbid": 211, "correlation": 86145 } }, { "ph": "s", "id": 86145, "pid": 76337, "tid": -914061504, "ts": 1716454222729299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222729967, "dur": 55, "args": { "External id": 86155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86155, "pid": 5, "tid": 7, "ts": 1716454222729967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729363, "dur": 12, "args": { "External id": 86155, "cbid": 211, "correlation": 86155 } }, { "ph": "s", "id": 86155, "pid": 76337, "tid": -914061504, "ts": 1716454222729363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222730023, "dur": 53, "args": { "External id": 86175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86175, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86175, "pid": 5, "tid": 7, "ts": 1716454222730023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729430, "dur": 12, "args": { "External id": 86175, "cbid": 211, "correlation": 86175 } }, { "ph": "s", "id": 86175, "pid": 76337, "tid": -914061504, "ts": 1716454222729430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222730078, "dur": 4, "args": { "External id": 86187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86187, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86187, "pid": 5, "tid": 7, "ts": 1716454222730078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729451, "dur": 6, "args": { "External id": 86187, "cbid": 211, "correlation": 86187 } }, { "ph": "s", "id": 86187, "pid": 76337, "tid": -914061504, "ts": 1716454222729451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222730083, "dur": 56, "args": { "External id": 86190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86190, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86190, "pid": 5, "tid": 7, "ts": 1716454222730083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729470, "dur": 7, "args": { "External id": 86190, "cbid": 211, "correlation": 86190 } }, { "ph": "s", "id": 86190, "pid": 76337, "tid": -914061504, "ts": 1716454222729470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222730140, "dur": 37, "args": { "External id": 86199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86199, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86199, "pid": 5, "tid": 7, "ts": 1716454222730140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729511, "dur": 9, "args": { "External id": 86199, "cbid": 211, "correlation": 86199 } }, { "ph": "s", "id": 86199, "pid": 76337, "tid": -914061504, "ts": 1716454222729511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222729583, "dur": 0, "args": { "External id": 86209, "cbid": 317, "correlation": 86209 } }, { "ph": "f", "id": 86209, "pid": 76337, "tid": -914061504, "ts": 1716454222729583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222729584, "dur": 0, "args": { "External id": 86210, "cbid": 203, "correlation": 86210 } }, { "ph": "f", "id": 86210, "pid": 76337, "tid": -914061504, "ts": 1716454222729584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222729585, "dur": 0, "args": { "External id": 86211, "cbid": 205, "correlation": 86211 } }, { "ph": "f", "id": 86211, "pid": 76337, "tid": -914061504, "ts": 1716454222729585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222730179, "dur": 39, "args": { "External id": 86215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86215, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86215, "pid": 5, "tid": 7, "ts": 1716454222730179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729599, "dur": 13, "args": { "External id": 86215, "cbid": 211, "correlation": 86215 } }, { "ph": "s", "id": 86215, "pid": 76337, "tid": -914061504, "ts": 1716454222729599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222730219, "dur": 15, "args": { "External id": 86217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86217, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86217, "pid": 5, "tid": 7, "ts": 1716454222730219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729614, "dur": 5, "args": { "External id": 86217, "cbid": 211, "correlation": 86217 } }, { "ph": "s", "id": 86217, "pid": 76337, "tid": -914061504, "ts": 1716454222729614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222730235, "dur": 4, "args": { "External id": 86219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86219, "pid": 5, "tid": 7, "ts": 1716454222730235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729623, "dur": 5, "args": { "External id": 86219, "cbid": 211, "correlation": 86219 } }, { "ph": "s", "id": 86219, "pid": 76337, "tid": -914061504, "ts": 1716454222729623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222729632, "dur": 0, "args": { "External id": 86220, "cbid": 51, "correlation": 86220 } }, { "ph": "s", "id": 86220, "pid": 76337, "tid": -914061504, "ts": 1716454222729632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222730240, "dur": 704, "args": { "External id": 86221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86221, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86221, "pid": 5, "tid": 7, "ts": 1716454222730240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729632, "dur": 5, "args": { "External id": 86221, "cbid": 211, "correlation": 86221 } }, { "ph": "s", "id": 86221, "pid": 76337, "tid": -914061504, "ts": 1716454222729632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222730945, "dur": 59, "args": { "External id": 86226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86226, "pid": 5, "tid": 7, "ts": 1716454222730945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729660, "dur": 9, "args": { "External id": 86226, "cbid": 211, "correlation": 86226 } }, { "ph": "s", "id": 86226, "pid": 76337, "tid": -914061504, "ts": 1716454222729660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222731006, "dur": 50, "args": { "External id": 86234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86234, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86234, "pid": 5, "tid": 7, "ts": 1716454222731006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729693, "dur": 8, "args": { "External id": 86234, "cbid": 211, "correlation": 86234 } }, { "ph": "s", "id": 86234, "pid": 76337, "tid": -914061504, "ts": 1716454222729693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222731058, "dur": 36, "args": { "External id": 86242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86242, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86242, "pid": 5, "tid": 7, "ts": 1716454222731058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729726, "dur": 10, "args": { "External id": 86242, "cbid": 211, "correlation": 86242 } }, { "ph": "s", "id": 86242, "pid": 76337, "tid": -914061504, "ts": 1716454222729726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222731095, "dur": 53, "args": { "External id": 86262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86262, "pid": 5, "tid": 7, "ts": 1716454222731095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729809, "dur": 12, "args": { "External id": 86262, "cbid": 211, "correlation": 86262 } }, { "ph": "s", "id": 86262, "pid": 76337, "tid": -914061504, "ts": 1716454222729809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222731149, "dur": 4, "args": { "External id": 86274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86274, "pid": 5, "tid": 7, "ts": 1716454222731149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729831, "dur": 6, "args": { "External id": 86274, "cbid": 211, "correlation": 86274 } }, { "ph": "s", "id": 86274, "pid": 76337, "tid": -914061504, "ts": 1716454222729831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222731154, "dur": 55, "args": { "External id": 86277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86277, "pid": 5, "tid": 7, "ts": 1716454222731154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729850, "dur": 6, "args": { "External id": 86277, "cbid": 211, "correlation": 86277 } }, { "ph": "s", "id": 86277, "pid": 76337, "tid": -914061504, "ts": 1716454222729850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222729907, "dur": 0, "args": { "External id": 86288, "cbid": 317, "correlation": 86288 } }, { "ph": "f", "id": 86288, "pid": 76337, "tid": -914061504, "ts": 1716454222729907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222729908, "dur": 0, "args": { "External id": 86289, "cbid": 203, "correlation": 86289 } }, { "ph": "f", "id": 86289, "pid": 76337, "tid": -914061504, "ts": 1716454222729908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222729908, "dur": 0, "args": { "External id": 86290, "cbid": 205, "correlation": 86290 } }, { "ph": "f", "id": 86290, "pid": 76337, "tid": -914061504, "ts": 1716454222729908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729943, "dur": 2, "args": { "External id": 86294, "cbid": 251, "correlation": 86294 } }, { "ph": "f", "id": 86294, "pid": 76337, "tid": -914061504, "ts": 1716454222729943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729947, "dur": 1, "args": { "External id": 86295, "cbid": 251, "correlation": 86295 } }, { "ph": "f", "id": 86295, "pid": 76337, "tid": -914061504, "ts": 1716454222729947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729949, "dur": 1, "args": { "External id": 86296, "cbid": 251, "correlation": 86296 } }, { "ph": "f", "id": 86296, "pid": 76337, "tid": -914061504, "ts": 1716454222729949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729952, "dur": 1, "args": { "External id": 86297, "cbid": 251, "correlation": 86297 } }, { "ph": "f", "id": 86297, "pid": 76337, "tid": -914061504, "ts": 1716454222729952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729954, "dur": 1, "args": { "External id": 86298, "cbid": 251, "correlation": 86298 } }, { "ph": "f", "id": 86298, "pid": 76337, "tid": -914061504, "ts": 1716454222729954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729956, "dur": 1, "args": { "External id": 86299, "cbid": 251, "correlation": 86299 } }, { "ph": "f", "id": 86299, "pid": 76337, "tid": -914061504, "ts": 1716454222729956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729958, "dur": 1, "args": { "External id": 86300, "cbid": 251, "correlation": 86300 } }, { "ph": "f", "id": 86300, "pid": 76337, "tid": -914061504, "ts": 1716454222729958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729959, "dur": 1, "args": { "External id": 86301, "cbid": 251, "correlation": 86301 } }, { "ph": "f", "id": 86301, "pid": 76337, "tid": -914061504, "ts": 1716454222729959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222729962, "dur": 0, "args": { "External id": 86302, "cbid": 251, "correlation": 86302 } }, { "ph": "f", "id": 86302, "pid": 76337, "tid": -914061504, "ts": 1716454222729962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222731211, "dur": 116, "args": { "External id": 86303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86303, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86303, "pid": 5, "tid": 7, "ts": 1716454222731211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222729966, "dur": 22, "args": { "External id": 86303, "cbid": 211, "correlation": 86303 } }, { "ph": "s", "id": 86303, "pid": 76337, "tid": -914061504, "ts": 1716454222729966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222731328, "dur": 60, "args": { "External id": 86309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86309, "pid": 5, "tid": 7, "ts": 1716454222731328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730012, "dur": 9, "args": { "External id": 86309, "cbid": 211, "correlation": 86309 } }, { "ph": "s", "id": 86309, "pid": 76337, "tid": -914061504, "ts": 1716454222730012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222731390, "dur": 586, "args": { "External id": 86318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86318, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86318, "pid": 5, "tid": 7, "ts": 1716454222731390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730109, "dur": 16, "args": { "External id": 86318, "cbid": 211, "correlation": 86318 } }, { "ph": "s", "id": 86318, "pid": 76337, "tid": -914061504, "ts": 1716454222730109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222731978, "dur": 185, "args": { "External id": 86340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86340, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86340, "pid": 5, "tid": 7, "ts": 1716454222731978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730184, "dur": 12, "args": { "External id": 86340, "cbid": 211, "correlation": 86340 } }, { "ph": "s", "id": 86340, "pid": 76337, "tid": -914061504, "ts": 1716454222730184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222730300, "dur": 2, "args": { "External id": 86351, "cbid": 251, "correlation": 86351 } }, { "ph": "f", "id": 86351, "pid": 76337, "tid": -914061504, "ts": 1716454222730300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222732164, "dur": 199, "args": { "External id": 86352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86352, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86352, "pid": 5, "tid": 7, "ts": 1716454222732164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730308, "dur": 14, "args": { "External id": 86352, "cbid": 211, "correlation": 86352 } }, { "ph": "s", "id": 86352, "pid": 76337, "tid": -914061504, "ts": 1716454222730308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222730380, "dur": 1, "args": { "External id": 86363, "cbid": 251, "correlation": 86363 } }, { "ph": "f", "id": 86363, "pid": 76337, "tid": -914061504, "ts": 1716454222730380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222732364, "dur": 193, "args": { "External id": 86364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86364, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86364, "pid": 5, "tid": 7, "ts": 1716454222732364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730384, "dur": 11, "args": { "External id": 86364, "cbid": 211, "correlation": 86364 } }, { "ph": "s", "id": 86364, "pid": 76337, "tid": -914061504, "ts": 1716454222730384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222730448, "dur": 1, "args": { "External id": 86375, "cbid": 251, "correlation": 86375 } }, { "ph": "f", "id": 86375, "pid": 76337, "tid": -914061504, "ts": 1716454222730448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222732558, "dur": 189, "args": { "External id": 86376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86376, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86376, "pid": 5, "tid": 7, "ts": 1716454222732558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730453, "dur": 12, "args": { "External id": 86376, "cbid": 211, "correlation": 86376 } }, { "ph": "s", "id": 86376, "pid": 76337, "tid": -914061504, "ts": 1716454222730453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222732749, "dur": 18986, "args": { "External id": 86397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86397, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86397, "pid": 5, "tid": 7, "ts": 1716454222732749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730563, "dur": 15, "args": { "External id": 86397, "cbid": 211, "correlation": 86397 } }, { "ph": "s", "id": 86397, "pid": 76337, "tid": -914061504, "ts": 1716454222730563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222730680, "dur": 2, "args": { "External id": 86415, "cbid": 251, "correlation": 86415 } }, { "ph": "f", "id": 86415, "pid": 76337, "tid": -914061504, "ts": 1716454222730680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222751736, "dur": 203, "args": { "External id": 86417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86417, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86417, "pid": 5, "tid": 7, "ts": 1716454222751736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730686, "dur": 13, "args": { "External id": 86417, "cbid": 211, "correlation": 86417 } }, { "ph": "s", "id": 86417, "pid": 76337, "tid": -914061504, "ts": 1716454222730686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222751940, "dur": 66, "args": { "External id": 86425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86425, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86425, "pid": 5, "tid": 7, "ts": 1716454222751940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730760, "dur": 13, "args": { "External id": 86425, "cbid": 211, "correlation": 86425 } }, { "ph": "s", "id": 86425, "pid": 76337, "tid": -914061504, "ts": 1716454222730760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222752008, "dur": 97, "args": { "External id": 86433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86433, "pid": 5, "tid": 7, "ts": 1716454222752008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730802, "dur": 9, "args": { "External id": 86433, "cbid": 211, "correlation": 86433 } }, { "ph": "s", "id": 86433, "pid": 76337, "tid": -914061504, "ts": 1716454222730802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222752106, "dur": 55, "args": { "External id": 86444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86444, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86444, "pid": 5, "tid": 7, "ts": 1716454222752106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730889, "dur": 15, "args": { "External id": 86444, "cbid": 211, "correlation": 86444 } }, { "ph": "s", "id": 86444, "pid": 76337, "tid": -914061504, "ts": 1716454222730889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222752163, "dur": 94, "args": { "External id": 86466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86466, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86466, "pid": 5, "tid": 7, "ts": 1716454222752163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222730925, "dur": 7, "args": { "External id": 86466, "cbid": 211, "correlation": 86466 } }, { "ph": "s", "id": 86466, "pid": 76337, "tid": -914061504, "ts": 1716454222730925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731016, "dur": 1, "args": { "External id": 86477, "cbid": 251, "correlation": 86477 } }, { "ph": "f", "id": 86477, "pid": 76337, "tid": -914061504, "ts": 1716454222731016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222752258, "dur": 104, "args": { "External id": 86478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86478, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86478, "pid": 5, "tid": 7, "ts": 1716454222752258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731020, "dur": 14, "args": { "External id": 86478, "cbid": 211, "correlation": 86478 } }, { "ph": "s", "id": 86478, "pid": 76337, "tid": -914061504, "ts": 1716454222731020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731102, "dur": 1, "args": { "External id": 86489, "cbid": 251, "correlation": 86489 } }, { "ph": "f", "id": 86489, "pid": 76337, "tid": -914061504, "ts": 1716454222731102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731106, "dur": 0, "args": { "External id": 86490, "cbid": 251, "correlation": 86490 } }, { "ph": "f", "id": 86490, "pid": 76337, "tid": -914061504, "ts": 1716454222731106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222752364, "dur": 10, "args": { "External id": 86491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86491, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 86491, "pid": 5, "tid": 7, "ts": 1716454222752364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731108, "dur": 14, "args": { "External id": 86491, "cbid": 211, "correlation": 86491 } }, { "ph": "s", "id": 86491, "pid": 76337, "tid": -914061504, "ts": 1716454222731108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222752375, "dur": 5, "args": { "External id": 86493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86493, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 86493, "pid": 5, "tid": 7, "ts": 1716454222752375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731125, "dur": 8, "args": { "External id": 86493, "cbid": 211, "correlation": 86493 } }, { "ph": "s", "id": 86493, "pid": 76337, "tid": -914061504, "ts": 1716454222731125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731189, "dur": 1, "args": { "External id": 86504, "cbid": 251, "correlation": 86504 } }, { "ph": "f", "id": 86504, "pid": 76337, "tid": -914061504, "ts": 1716454222731189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731193, "dur": 0, "args": { "External id": 86505, "cbid": 251, "correlation": 86505 } }, { "ph": "f", "id": 86505, "pid": 76337, "tid": -914061504, "ts": 1716454222731193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222752382, "dur": 6, "args": { "External id": 86506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86506, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 86506, "pid": 5, "tid": 7, "ts": 1716454222752382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731194, "dur": 13, "args": { "External id": 86506, "cbid": 211, "correlation": 86506 } }, { "ph": "s", "id": 86506, "pid": 76337, "tid": -914061504, "ts": 1716454222731194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222752389, "dur": 4, "args": { "External id": 86508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86508, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 86508, "pid": 5, "tid": 7, "ts": 1716454222752389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731208, "dur": 5, "args": { "External id": 86508, "cbid": 211, "correlation": 86508 } }, { "ph": "s", "id": 86508, "pid": 76337, "tid": -914061504, "ts": 1716454222731208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222752394, "dur": 159, "args": { "External id": 86529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86529, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86529, "pid": 5, "tid": 7, "ts": 1716454222752394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731282, "dur": 13, "args": { "External id": 86529, "cbid": 211, "correlation": 86529 } }, { "ph": "s", "id": 86529, "pid": 76337, "tid": -914061504, "ts": 1716454222731282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731383, "dur": 2, "args": { "External id": 86547, "cbid": 251, "correlation": 86547 } }, { "ph": "f", "id": 86547, "pid": 76337, "tid": -914061504, "ts": 1716454222731383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222752554, "dur": 109, "args": { "External id": 86549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86549, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86549, "pid": 5, "tid": 7, "ts": 1716454222752554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731389, "dur": 13, "args": { "External id": 86549, "cbid": 211, "correlation": 86549 } }, { "ph": "s", "id": 86549, "pid": 76337, "tid": -914061504, "ts": 1716454222731389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222752665, "dur": 35, "args": { "External id": 86557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86557, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86557, "pid": 5, "tid": 7, "ts": 1716454222752665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731461, "dur": 12, "args": { "External id": 86557, "cbid": 211, "correlation": 86557 } }, { "ph": "s", "id": 86557, "pid": 76337, "tid": -914061504, "ts": 1716454222731461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222752702, "dur": 68, "args": { "External id": 86565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86565, "pid": 5, "tid": 7, "ts": 1716454222752702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731504, "dur": 10, "args": { "External id": 86565, "cbid": 211, "correlation": 86565 } }, { "ph": "s", "id": 86565, "pid": 76337, "tid": -914061504, "ts": 1716454222731504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222752771, "dur": 94, "args": { "External id": 86587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86587, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86587, "pid": 5, "tid": 7, "ts": 1716454222752771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731558, "dur": 10, "args": { "External id": 86587, "cbid": 211, "correlation": 86587 } }, { "ph": "s", "id": 86587, "pid": 76337, "tid": -914061504, "ts": 1716454222731558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731649, "dur": 1, "args": { "External id": 86603, "cbid": 251, "correlation": 86603 } }, { "ph": "f", "id": 86603, "pid": 76337, "tid": -914061504, "ts": 1716454222731649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222752866, "dur": 587, "args": { "External id": 86605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86605, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86605, "pid": 5, "tid": 7, "ts": 1716454222752866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731655, "dur": 13, "args": { "External id": 86605, "cbid": 211, "correlation": 86605 } }, { "ph": "s", "id": 86605, "pid": 76337, "tid": -914061504, "ts": 1716454222731655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222753454, "dur": 245, "args": { "External id": 86613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86613, "pid": 5, "tid": 7, "ts": 1716454222753454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731741, "dur": 15, "args": { "External id": 86613, "cbid": 211, "correlation": 86613 } }, { "ph": "s", "id": 86613, "pid": 76337, "tid": -914061504, "ts": 1716454222731741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222753701, "dur": 254, "args": { "External id": 86621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86621, "pid": 5, "tid": 7, "ts": 1716454222753701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731780, "dur": 10, "args": { "External id": 86621, "cbid": 211, "correlation": 86621 } }, { "ph": "s", "id": 86621, "pid": 76337, "tid": -914061504, "ts": 1716454222731780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731865, "dur": 2, "args": { "External id": 86637, "cbid": 251, "correlation": 86637 } }, { "ph": "f", "id": 86637, "pid": 76337, "tid": -914061504, "ts": 1716454222731865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222731870, "dur": 0, "args": { "External id": 86639, "cbid": 251, "correlation": 86639 } }, { "ph": "f", "id": 86639, "pid": 76337, "tid": -914061504, "ts": 1716454222731870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222753956, "dur": 361, "args": { "External id": 86640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86640, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 86640, "pid": 5, "tid": 7, "ts": 1716454222753956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731875, "dur": 14, "args": { "External id": 86640, "cbid": 211, "correlation": 86640 } }, { "ph": "s", "id": 86640, "pid": 76337, "tid": -914061504, "ts": 1716454222731875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222754319, "dur": 51, "args": { "External id": 86648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86648, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86648, "pid": 5, "tid": 7, "ts": 1716454222754319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222731920, "dur": 11, "args": { "External id": 86648, "cbid": 211, "correlation": 86648 } }, { "ph": "s", "id": 86648, "pid": 76337, "tid": -914061504, "ts": 1716454222731920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222754372, "dur": 160, "args": { "External id": 86659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86659, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86659, "pid": 5, "tid": 7, "ts": 1716454222754372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732002, "dur": 15, "args": { "External id": 86659, "cbid": 211, "correlation": 86659 } }, { "ph": "s", "id": 86659, "pid": 76337, "tid": -914061504, "ts": 1716454222732002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222732075, "dur": 0, "args": { "External id": 86671, "cbid": 317, "correlation": 86671 } }, { "ph": "f", "id": 86671, "pid": 76337, "tid": -914061504, "ts": 1716454222732075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222732077, "dur": 0, "args": { "External id": 86672, "cbid": 203, "correlation": 86672 } }, { "ph": "f", "id": 86672, "pid": 76337, "tid": -914061504, "ts": 1716454222732077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222732077, "dur": 0, "args": { "External id": 86673, "cbid": 205, "correlation": 86673 } }, { "ph": "f", "id": 86673, "pid": 76337, "tid": -914061504, "ts": 1716454222732077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732105, "dur": 1, "args": { "External id": 86677, "cbid": 251, "correlation": 86677 } }, { "ph": "f", "id": 86677, "pid": 76337, "tid": -914061504, "ts": 1716454222732105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732108, "dur": 0, "args": { "External id": 86678, "cbid": 251, "correlation": 86678 } }, { "ph": "f", "id": 86678, "pid": 76337, "tid": -914061504, "ts": 1716454222732108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732109, "dur": 0, "args": { "External id": 86679, "cbid": 251, "correlation": 86679 } }, { "ph": "f", "id": 86679, "pid": 76337, "tid": -914061504, "ts": 1716454222732109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732110, "dur": 0, "args": { "External id": 86680, "cbid": 251, "correlation": 86680 } }, { "ph": "f", "id": 86680, "pid": 76337, "tid": -914061504, "ts": 1716454222732110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732111, "dur": 1, "args": { "External id": 86681, "cbid": 251, "correlation": 86681 } }, { "ph": "f", "id": 86681, "pid": 76337, "tid": -914061504, "ts": 1716454222732111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732112, "dur": 0, "args": { "External id": 86682, "cbid": 251, "correlation": 86682 } }, { "ph": "f", "id": 86682, "pid": 76337, "tid": -914061504, "ts": 1716454222732112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732114, "dur": 0, "args": { "External id": 86683, "cbid": 251, "correlation": 86683 } }, { "ph": "f", "id": 86683, "pid": 76337, "tid": -914061504, "ts": 1716454222732114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732114, "dur": 0, "args": { "External id": 86684, "cbid": 251, "correlation": 86684 } }, { "ph": "f", "id": 86684, "pid": 76337, "tid": -914061504, "ts": 1716454222732114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732116, "dur": 0, "args": { "External id": 86685, "cbid": 251, "correlation": 86685 } }, { "ph": "f", "id": 86685, "pid": 76337, "tid": -914061504, "ts": 1716454222732116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222754533, "dur": 117, "args": { "External id": 86686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86686, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86686, "pid": 5, "tid": 7, "ts": 1716454222754533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732118, "dur": 13, "args": { "External id": 86686, "cbid": 211, "correlation": 86686 } }, { "ph": "s", "id": 86686, "pid": 76337, "tid": -914061504, "ts": 1716454222732118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222754651, "dur": 60, "args": { "External id": 86692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86692, "pid": 5, "tid": 7, "ts": 1716454222754651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732154, "dur": 9, "args": { "External id": 86692, "cbid": 211, "correlation": 86692 } }, { "ph": "s", "id": 86692, "pid": 76337, "tid": -914061504, "ts": 1716454222732154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222754713, "dur": 50, "args": { "External id": 86700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86700, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86700, "pid": 5, "tid": 7, "ts": 1716454222754713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732189, "dur": 8, "args": { "External id": 86700, "cbid": 211, "correlation": 86700 } }, { "ph": "s", "id": 86700, "pid": 76337, "tid": -914061504, "ts": 1716454222732189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222754764, "dur": 54, "args": { "External id": 86720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86720, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86720, "pid": 5, "tid": 7, "ts": 1716454222754764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732267, "dur": 12, "args": { "External id": 86720, "cbid": 211, "correlation": 86720 } }, { "ph": "s", "id": 86720, "pid": 76337, "tid": -914061504, "ts": 1716454222732267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222754819, "dur": 4, "args": { "External id": 86732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86732, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86732, "pid": 5, "tid": 7, "ts": 1716454222754819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732289, "dur": 7, "args": { "External id": 86732, "cbid": 211, "correlation": 86732 } }, { "ph": "s", "id": 86732, "pid": 76337, "tid": -914061504, "ts": 1716454222732289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222754825, "dur": 57, "args": { "External id": 86735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86735, "pid": 5, "tid": 7, "ts": 1716454222754825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732309, "dur": 7, "args": { "External id": 86735, "cbid": 211, "correlation": 86735 } }, { "ph": "s", "id": 86735, "pid": 76337, "tid": -914061504, "ts": 1716454222732309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222754883, "dur": 36, "args": { "External id": 86744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86744, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86744, "pid": 5, "tid": 7, "ts": 1716454222754883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732349, "dur": 11, "args": { "External id": 86744, "cbid": 211, "correlation": 86744 } }, { "ph": "s", "id": 86744, "pid": 76337, "tid": -914061504, "ts": 1716454222732349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222732402, "dur": 0, "args": { "External id": 86754, "cbid": 317, "correlation": 86754 } }, { "ph": "f", "id": 86754, "pid": 76337, "tid": -914061504, "ts": 1716454222732402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222732403, "dur": 0, "args": { "External id": 86755, "cbid": 203, "correlation": 86755 } }, { "ph": "f", "id": 86755, "pid": 76337, "tid": -914061504, "ts": 1716454222732403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222732404, "dur": 0, "args": { "External id": 86756, "cbid": 205, "correlation": 86756 } }, { "ph": "f", "id": 86756, "pid": 76337, "tid": -914061504, "ts": 1716454222732404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222754921, "dur": 42, "args": { "External id": 86760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86760, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86760, "pid": 5, "tid": 7, "ts": 1716454222754921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732420, "dur": 12, "args": { "External id": 86760, "cbid": 211, "correlation": 86760 } }, { "ph": "s", "id": 86760, "pid": 76337, "tid": -914061504, "ts": 1716454222732420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222754964, "dur": 14, "args": { "External id": 86762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86762, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86762, "pid": 5, "tid": 7, "ts": 1716454222754964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732434, "dur": 5, "args": { "External id": 86762, "cbid": 211, "correlation": 86762 } }, { "ph": "s", "id": 86762, "pid": 76337, "tid": -914061504, "ts": 1716454222732434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222754980, "dur": 4, "args": { "External id": 86764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86764, "pid": 5, "tid": 7, "ts": 1716454222754980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732445, "dur": 6, "args": { "External id": 86764, "cbid": 211, "correlation": 86764 } }, { "ph": "s", "id": 86764, "pid": 76337, "tid": -914061504, "ts": 1716454222732445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222732454, "dur": 0, "args": { "External id": 86765, "cbid": 51, "correlation": 86765 } }, { "ph": "s", "id": 86765, "pid": 76337, "tid": -914061504, "ts": 1716454222732454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222754985, "dur": 715, "args": { "External id": 86766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86766, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86766, "pid": 5, "tid": 7, "ts": 1716454222754985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732455, "dur": 5, "args": { "External id": 86766, "cbid": 211, "correlation": 86766 } }, { "ph": "s", "id": 86766, "pid": 76337, "tid": -914061504, "ts": 1716454222732455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222755702, "dur": 60, "args": { "External id": 86771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86771, "pid": 5, "tid": 7, "ts": 1716454222755702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732484, "dur": 8, "args": { "External id": 86771, "cbid": 211, "correlation": 86771 } }, { "ph": "s", "id": 86771, "pid": 76337, "tid": -914061504, "ts": 1716454222732484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222755763, "dur": 3, "args": { "External id": 86779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86779, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86779, "pid": 5, "tid": 7, "ts": 1716454222755763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732527, "dur": 10, "args": { "External id": 86779, "cbid": 211, "correlation": 86779 } }, { "ph": "s", "id": 86779, "pid": 76337, "tid": -914061504, "ts": 1716454222732527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732594, "dur": 2, "args": { "External id": 86795, "cbid": 251, "correlation": 86795 } }, { "ph": "f", "id": 86795, "pid": 76337, "tid": -914061504, "ts": 1716454222732594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222732600, "dur": 0, "args": { "External id": 86797, "cbid": 251, "correlation": 86797 } }, { "ph": "f", "id": 86797, "pid": 76337, "tid": -914061504, "ts": 1716454222732600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222755768, "dur": 11, "args": { "External id": 86798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86798, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 86798, "pid": 5, "tid": 7, "ts": 1716454222755768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732602, "dur": 12, "args": { "External id": 86798, "cbid": 211, "correlation": 86798 } }, { "ph": "s", "id": 86798, "pid": 76337, "tid": -914061504, "ts": 1716454222732602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222755781, "dur": 5, "args": { "External id": 86800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86800, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 86800, "pid": 5, "tid": 7, "ts": 1716454222755781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732616, "dur": 6, "args": { "External id": 86800, "cbid": 211, "correlation": 86800 } }, { "ph": "s", "id": 86800, "pid": 76337, "tid": -914061504, "ts": 1716454222732616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222755787, "dur": 53, "args": { "External id": 86810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86810, "pid": 5, "tid": 7, "ts": 1716454222755787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732677, "dur": 12, "args": { "External id": 86810, "cbid": 211, "correlation": 86810 } }, { "ph": "s", "id": 86810, "pid": 76337, "tid": -914061504, "ts": 1716454222732677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222755841, "dur": 51, "args": { "External id": 86830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86830, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86830, "pid": 5, "tid": 7, "ts": 1716454222755841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732744, "dur": 11, "args": { "External id": 86830, "cbid": 211, "correlation": 86830 } }, { "ph": "s", "id": 86830, "pid": 76337, "tid": -914061504, "ts": 1716454222732744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222755894, "dur": 4, "args": { "External id": 86842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86842, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86842, "pid": 5, "tid": 7, "ts": 1716454222755894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732765, "dur": 6, "args": { "External id": 86842, "cbid": 211, "correlation": 86842 } }, { "ph": "s", "id": 86842, "pid": 76337, "tid": -914061504, "ts": 1716454222732765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222755899, "dur": 57, "args": { "External id": 86845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86845, "pid": 5, "tid": 7, "ts": 1716454222755899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732784, "dur": 6, "args": { "External id": 86845, "cbid": 211, "correlation": 86845 } }, { "ph": "s", "id": 86845, "pid": 76337, "tid": -914061504, "ts": 1716454222732784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222755957, "dur": 38, "args": { "External id": 86854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86854, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86854, "pid": 5, "tid": 7, "ts": 1716454222755957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732824, "dur": 10, "args": { "External id": 86854, "cbid": 211, "correlation": 86854 } }, { "ph": "s", "id": 86854, "pid": 76337, "tid": -914061504, "ts": 1716454222732824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222732889, "dur": 0, "args": { "External id": 86864, "cbid": 317, "correlation": 86864 } }, { "ph": "f", "id": 86864, "pid": 76337, "tid": -914061504, "ts": 1716454222732889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222732889, "dur": 0, "args": { "External id": 86865, "cbid": 203, "correlation": 86865 } }, { "ph": "f", "id": 86865, "pid": 76337, "tid": -914061504, "ts": 1716454222732889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222732890, "dur": 0, "args": { "External id": 86866, "cbid": 205, "correlation": 86866 } }, { "ph": "f", "id": 86866, "pid": 76337, "tid": -914061504, "ts": 1716454222732890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222755996, "dur": 40, "args": { "External id": 86870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86870, "pid": 5, "tid": 7, "ts": 1716454222755996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732904, "dur": 12, "args": { "External id": 86870, "cbid": 211, "correlation": 86870 } }, { "ph": "s", "id": 86870, "pid": 76337, "tid": -914061504, "ts": 1716454222732904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222756038, "dur": 15, "args": { "External id": 86872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86872, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86872, "pid": 5, "tid": 7, "ts": 1716454222756038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732918, "dur": 5, "args": { "External id": 86872, "cbid": 211, "correlation": 86872 } }, { "ph": "s", "id": 86872, "pid": 76337, "tid": -914061504, "ts": 1716454222732918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222756054, "dur": 3, "args": { "External id": 86874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86874, "pid": 5, "tid": 7, "ts": 1716454222756054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732927, "dur": 6, "args": { "External id": 86874, "cbid": 211, "correlation": 86874 } }, { "ph": "s", "id": 86874, "pid": 76337, "tid": -914061504, "ts": 1716454222732927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222732936, "dur": 0, "args": { "External id": 86875, "cbid": 51, "correlation": 86875 } }, { "ph": "s", "id": 86875, "pid": 76337, "tid": -914061504, "ts": 1716454222732936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222756058, "dur": 705, "args": { "External id": 86876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86876, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 86876, "pid": 5, "tid": 7, "ts": 1716454222756058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732937, "dur": 5, "args": { "External id": 86876, "cbid": 211, "correlation": 86876 } }, { "ph": "s", "id": 86876, "pid": 76337, "tid": -914061504, "ts": 1716454222732937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222756765, "dur": 60, "args": { "External id": 86881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86881, "pid": 5, "tid": 7, "ts": 1716454222756765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222732964, "dur": 19, "args": { "External id": 86881, "cbid": 211, "correlation": 86881 } }, { "ph": "s", "id": 86881, "pid": 76337, "tid": -914061504, "ts": 1716454222732964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222756827, "dur": 50, "args": { "External id": 86889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86889, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86889, "pid": 5, "tid": 7, "ts": 1716454222756827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733008, "dur": 9, "args": { "External id": 86889, "cbid": 211, "correlation": 86889 } }, { "ph": "s", "id": 86889, "pid": 76337, "tid": -914061504, "ts": 1716454222733008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222756878, "dur": 35, "args": { "External id": 86897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86897, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86897, "pid": 5, "tid": 7, "ts": 1716454222756878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733038, "dur": 9, "args": { "External id": 86897, "cbid": 211, "correlation": 86897 } }, { "ph": "s", "id": 86897, "pid": 76337, "tid": -914061504, "ts": 1716454222733038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222756915, "dur": 52, "args": { "External id": 86917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86917, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 86917, "pid": 5, "tid": 7, "ts": 1716454222756915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733119, "dur": 12, "args": { "External id": 86917, "cbid": 211, "correlation": 86917 } }, { "ph": "s", "id": 86917, "pid": 76337, "tid": -914061504, "ts": 1716454222733119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222756968, "dur": 4, "args": { "External id": 86929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86929, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 86929, "pid": 5, "tid": 7, "ts": 1716454222756968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733141, "dur": 6, "args": { "External id": 86929, "cbid": 211, "correlation": 86929 } }, { "ph": "s", "id": 86929, "pid": 76337, "tid": -914061504, "ts": 1716454222733141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222756973, "dur": 55, "args": { "External id": 86932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86932, "pid": 5, "tid": 7, "ts": 1716454222756973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733159, "dur": 8, "args": { "External id": 86932, "cbid": 211, "correlation": 86932 } }, { "ph": "s", "id": 86932, "pid": 76337, "tid": -914061504, "ts": 1716454222733159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222733217, "dur": 0, "args": { "External id": 86943, "cbid": 317, "correlation": 86943 } }, { "ph": "f", "id": 86943, "pid": 76337, "tid": -914061504, "ts": 1716454222733217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222733218, "dur": 0, "args": { "External id": 86944, "cbid": 203, "correlation": 86944 } }, { "ph": "f", "id": 86944, "pid": 76337, "tid": -914061504, "ts": 1716454222733218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222733218, "dur": 0, "args": { "External id": 86945, "cbid": 205, "correlation": 86945 } }, { "ph": "f", "id": 86945, "pid": 76337, "tid": -914061504, "ts": 1716454222733218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733243, "dur": 1, "args": { "External id": 86949, "cbid": 251, "correlation": 86949 } }, { "ph": "f", "id": 86949, "pid": 76337, "tid": -914061504, "ts": 1716454222733243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733245, "dur": 0, "args": { "External id": 86950, "cbid": 251, "correlation": 86950 } }, { "ph": "f", "id": 86950, "pid": 76337, "tid": -914061504, "ts": 1716454222733245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733246, "dur": 0, "args": { "External id": 86951, "cbid": 251, "correlation": 86951 } }, { "ph": "f", "id": 86951, "pid": 76337, "tid": -914061504, "ts": 1716454222733246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733247, "dur": 0, "args": { "External id": 86952, "cbid": 251, "correlation": 86952 } }, { "ph": "f", "id": 86952, "pid": 76337, "tid": -914061504, "ts": 1716454222733247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733248, "dur": 0, "args": { "External id": 86953, "cbid": 251, "correlation": 86953 } }, { "ph": "f", "id": 86953, "pid": 76337, "tid": -914061504, "ts": 1716454222733248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733249, "dur": 0, "args": { "External id": 86954, "cbid": 251, "correlation": 86954 } }, { "ph": "f", "id": 86954, "pid": 76337, "tid": -914061504, "ts": 1716454222733249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733249, "dur": 0, "args": { "External id": 86955, "cbid": 251, "correlation": 86955 } }, { "ph": "f", "id": 86955, "pid": 76337, "tid": -914061504, "ts": 1716454222733249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733250, "dur": 0, "args": { "External id": 86956, "cbid": 251, "correlation": 86956 } }, { "ph": "f", "id": 86956, "pid": 76337, "tid": -914061504, "ts": 1716454222733250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733251, "dur": 0, "args": { "External id": 86957, "cbid": 251, "correlation": 86957 } }, { "ph": "f", "id": 86957, "pid": 76337, "tid": -914061504, "ts": 1716454222733251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222757030, "dur": 114, "args": { "External id": 86958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86958, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 86958, "pid": 5, "tid": 7, "ts": 1716454222757030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733253, "dur": 12, "args": { "External id": 86958, "cbid": 211, "correlation": 86958 } }, { "ph": "s", "id": 86958, "pid": 76337, "tid": -914061504, "ts": 1716454222733253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222757145, "dur": 60, "args": { "External id": 86964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86964, "pid": 5, "tid": 7, "ts": 1716454222757145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733289, "dur": 9, "args": { "External id": 86964, "cbid": 211, "correlation": 86964 } }, { "ph": "s", "id": 86964, "pid": 76337, "tid": -914061504, "ts": 1716454222733289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222757206, "dur": 449, "args": { "External id": 86973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86973, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86973, "pid": 5, "tid": 7, "ts": 1716454222757206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733375, "dur": 15, "args": { "External id": 86973, "cbid": 211, "correlation": 86973 } }, { "ph": "s", "id": 86973, "pid": 76337, "tid": -914061504, "ts": 1716454222733375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222757657, "dur": 184, "args": { "External id": 86995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 86995, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 86995, "pid": 5, "tid": 7, "ts": 1716454222757657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733434, "dur": 10, "args": { "External id": 86995, "cbid": 211, "correlation": 86995 } }, { "ph": "s", "id": 86995, "pid": 76337, "tid": -914061504, "ts": 1716454222733434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733527, "dur": 1, "args": { "External id": 87006, "cbid": 251, "correlation": 87006 } }, { "ph": "f", "id": 87006, "pid": 76337, "tid": -914061504, "ts": 1716454222733527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222757842, "dur": 197, "args": { "External id": 87007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87007, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87007, "pid": 5, "tid": 7, "ts": 1716454222757842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733532, "dur": 13, "args": { "External id": 87007, "cbid": 211, "correlation": 87007 } }, { "ph": "s", "id": 87007, "pid": 76337, "tid": -914061504, "ts": 1716454222733532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733603, "dur": 1, "args": { "External id": 87018, "cbid": 251, "correlation": 87018 } }, { "ph": "f", "id": 87018, "pid": 76337, "tid": -914061504, "ts": 1716454222733603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222758040, "dur": 188, "args": { "External id": 87019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87019, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87019, "pid": 5, "tid": 7, "ts": 1716454222758040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733607, "dur": 12, "args": { "External id": 87019, "cbid": 211, "correlation": 87019 } }, { "ph": "s", "id": 87019, "pid": 76337, "tid": -914061504, "ts": 1716454222733607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733669, "dur": 1, "args": { "External id": 87030, "cbid": 251, "correlation": 87030 } }, { "ph": "f", "id": 87030, "pid": 76337, "tid": -914061504, "ts": 1716454222733669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222758229, "dur": 188, "args": { "External id": 87031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87031, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87031, "pid": 5, "tid": 7, "ts": 1716454222758229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733673, "dur": 12, "args": { "External id": 87031, "cbid": 211, "correlation": 87031 } }, { "ph": "s", "id": 87031, "pid": 76337, "tid": -914061504, "ts": 1716454222733673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222758419, "dur": 19004, "args": { "External id": 87052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87052, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87052, "pid": 5, "tid": 7, "ts": 1716454222758419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733759, "dur": 14, "args": { "External id": 87052, "cbid": 211, "correlation": 87052 } }, { "ph": "s", "id": 87052, "pid": 76337, "tid": -914061504, "ts": 1716454222733759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222733862, "dur": 1, "args": { "External id": 87070, "cbid": 251, "correlation": 87070 } }, { "ph": "f", "id": 87070, "pid": 76337, "tid": -914061504, "ts": 1716454222733862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222777424, "dur": 204, "args": { "External id": 87072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87072, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87072, "pid": 5, "tid": 7, "ts": 1716454222777424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733868, "dur": 13, "args": { "External id": 87072, "cbid": 211, "correlation": 87072 } }, { "ph": "s", "id": 87072, "pid": 76337, "tid": -914061504, "ts": 1716454222733868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222777630, "dur": 66, "args": { "External id": 87080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87080, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87080, "pid": 5, "tid": 7, "ts": 1716454222777630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733939, "dur": 13, "args": { "External id": 87080, "cbid": 211, "correlation": 87080 } }, { "ph": "s", "id": 87080, "pid": 76337, "tid": -914061504, "ts": 1716454222733939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222777697, "dur": 98, "args": { "External id": 87088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87088, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87088, "pid": 5, "tid": 7, "ts": 1716454222777697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222733988, "dur": 9, "args": { "External id": 87088, "cbid": 211, "correlation": 87088 } }, { "ph": "s", "id": 87088, "pid": 76337, "tid": -914061504, "ts": 1716454222733988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222777796, "dur": 55, "args": { "External id": 87099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87099, "pid": 5, "tid": 7, "ts": 1716454222777796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734063, "dur": 13, "args": { "External id": 87099, "cbid": 211, "correlation": 87099 } }, { "ph": "s", "id": 87099, "pid": 76337, "tid": -914061504, "ts": 1716454222734063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222777852, "dur": 94, "args": { "External id": 87121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87121, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87121, "pid": 5, "tid": 7, "ts": 1716454222777852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734094, "dur": 8, "args": { "External id": 87121, "cbid": 211, "correlation": 87121 } }, { "ph": "s", "id": 87121, "pid": 76337, "tid": -914061504, "ts": 1716454222734094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734179, "dur": 1, "args": { "External id": 87132, "cbid": 251, "correlation": 87132 } }, { "ph": "f", "id": 87132, "pid": 76337, "tid": -914061504, "ts": 1716454222734179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222777948, "dur": 107, "args": { "External id": 87133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87133, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87133, "pid": 5, "tid": 7, "ts": 1716454222777948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734184, "dur": 14, "args": { "External id": 87133, "cbid": 211, "correlation": 87133 } }, { "ph": "s", "id": 87133, "pid": 76337, "tid": -914061504, "ts": 1716454222734184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734258, "dur": 1, "args": { "External id": 87144, "cbid": 251, "correlation": 87144 } }, { "ph": "f", "id": 87144, "pid": 76337, "tid": -914061504, "ts": 1716454222734258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734261, "dur": 0, "args": { "External id": 87145, "cbid": 251, "correlation": 87145 } }, { "ph": "f", "id": 87145, "pid": 76337, "tid": -914061504, "ts": 1716454222734261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222778055, "dur": 10, "args": { "External id": 87146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87146, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 87146, "pid": 5, "tid": 7, "ts": 1716454222778055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734264, "dur": 12, "args": { "External id": 87146, "cbid": 211, "correlation": 87146 } }, { "ph": "s", "id": 87146, "pid": 76337, "tid": -914061504, "ts": 1716454222734264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222778067, "dur": 5, "args": { "External id": 87148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87148, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 87148, "pid": 5, "tid": 7, "ts": 1716454222778067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734278, "dur": 6, "args": { "External id": 87148, "cbid": 211, "correlation": 87148 } }, { "ph": "s", "id": 87148, "pid": 76337, "tid": -914061504, "ts": 1716454222734278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734339, "dur": 1, "args": { "External id": 87159, "cbid": 251, "correlation": 87159 } }, { "ph": "f", "id": 87159, "pid": 76337, "tid": -914061504, "ts": 1716454222734339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734342, "dur": 0, "args": { "External id": 87160, "cbid": 251, "correlation": 87160 } }, { "ph": "f", "id": 87160, "pid": 76337, "tid": -914061504, "ts": 1716454222734342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222778073, "dur": 6, "args": { "External id": 87161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87161, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 87161, "pid": 5, "tid": 7, "ts": 1716454222778073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734344, "dur": 12, "args": { "External id": 87161, "cbid": 211, "correlation": 87161 } }, { "ph": "s", "id": 87161, "pid": 76337, "tid": -914061504, "ts": 1716454222734344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222778081, "dur": 4, "args": { "External id": 87163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87163, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 87163, "pid": 5, "tid": 7, "ts": 1716454222778081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734358, "dur": 6, "args": { "External id": 87163, "cbid": 211, "correlation": 87163 } }, { "ph": "s", "id": 87163, "pid": 76337, "tid": -914061504, "ts": 1716454222734358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222778086, "dur": 158, "args": { "External id": 87184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87184, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87184, "pid": 5, "tid": 7, "ts": 1716454222778086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734432, "dur": 12, "args": { "External id": 87184, "cbid": 211, "correlation": 87184 } }, { "ph": "s", "id": 87184, "pid": 76337, "tid": -914061504, "ts": 1716454222734432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734530, "dur": 1, "args": { "External id": 87202, "cbid": 251, "correlation": 87202 } }, { "ph": "f", "id": 87202, "pid": 76337, "tid": -914061504, "ts": 1716454222734530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222778245, "dur": 108, "args": { "External id": 87204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87204, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87204, "pid": 5, "tid": 7, "ts": 1716454222778245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734536, "dur": 13, "args": { "External id": 87204, "cbid": 211, "correlation": 87204 } }, { "ph": "s", "id": 87204, "pid": 76337, "tid": -914061504, "ts": 1716454222734536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222778355, "dur": 35, "args": { "External id": 87212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87212, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87212, "pid": 5, "tid": 7, "ts": 1716454222778355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734606, "dur": 12, "args": { "External id": 87212, "cbid": 211, "correlation": 87212 } }, { "ph": "s", "id": 87212, "pid": 76337, "tid": -914061504, "ts": 1716454222734606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222778391, "dur": 68, "args": { "External id": 87220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87220, "pid": 5, "tid": 7, "ts": 1716454222778391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734647, "dur": 9, "args": { "External id": 87220, "cbid": 211, "correlation": 87220 } }, { "ph": "s", "id": 87220, "pid": 76337, "tid": -914061504, "ts": 1716454222734647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222778460, "dur": 93, "args": { "External id": 87242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87242, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87242, "pid": 5, "tid": 7, "ts": 1716454222778460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734699, "dur": 10, "args": { "External id": 87242, "cbid": 211, "correlation": 87242 } }, { "ph": "s", "id": 87242, "pid": 76337, "tid": -914061504, "ts": 1716454222734699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734786, "dur": 1, "args": { "External id": 87258, "cbid": 251, "correlation": 87258 } }, { "ph": "f", "id": 87258, "pid": 76337, "tid": -914061504, "ts": 1716454222734786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222778555, "dur": 586, "args": { "External id": 87260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87260, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87260, "pid": 5, "tid": 7, "ts": 1716454222778555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734792, "dur": 13, "args": { "External id": 87260, "cbid": 211, "correlation": 87260 } }, { "ph": "s", "id": 87260, "pid": 76337, "tid": -914061504, "ts": 1716454222734792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222779142, "dur": 245, "args": { "External id": 87268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87268, "pid": 5, "tid": 7, "ts": 1716454222779142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734860, "dur": 14, "args": { "External id": 87268, "cbid": 211, "correlation": 87268 } }, { "ph": "s", "id": 87268, "pid": 76337, "tid": -914061504, "ts": 1716454222734860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222779389, "dur": 252, "args": { "External id": 87276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87276, "pid": 5, "tid": 7, "ts": 1716454222779389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734893, "dur": 8, "args": { "External id": 87276, "cbid": 211, "correlation": 87276 } }, { "ph": "s", "id": 87276, "pid": 76337, "tid": -914061504, "ts": 1716454222734893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734985, "dur": 1, "args": { "External id": 87292, "cbid": 251, "correlation": 87292 } }, { "ph": "f", "id": 87292, "pid": 76337, "tid": -914061504, "ts": 1716454222734985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222734990, "dur": 0, "args": { "External id": 87294, "cbid": 251, "correlation": 87294 } }, { "ph": "f", "id": 87294, "pid": 76337, "tid": -914061504, "ts": 1716454222734990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222779642, "dur": 364, "args": { "External id": 87295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87295, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 87295, "pid": 5, "tid": 7, "ts": 1716454222779642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222734993, "dur": 13, "args": { "External id": 87295, "cbid": 211, "correlation": 87295 } }, { "ph": "s", "id": 87295, "pid": 76337, "tid": -914061504, "ts": 1716454222734993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222780007, "dur": 50, "args": { "External id": 87303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87303, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87303, "pid": 5, "tid": 7, "ts": 1716454222780007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735037, "dur": 10, "args": { "External id": 87303, "cbid": 211, "correlation": 87303 } }, { "ph": "s", "id": 87303, "pid": 76337, "tid": -914061504, "ts": 1716454222735037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222780058, "dur": 159, "args": { "External id": 87314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87314, "pid": 5, "tid": 7, "ts": 1716454222780058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735105, "dur": 12, "args": { "External id": 87314, "cbid": 211, "correlation": 87314 } }, { "ph": "s", "id": 87314, "pid": 76337, "tid": -914061504, "ts": 1716454222735105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222735172, "dur": 0, "args": { "External id": 87326, "cbid": 317, "correlation": 87326 } }, { "ph": "f", "id": 87326, "pid": 76337, "tid": -914061504, "ts": 1716454222735172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222735173, "dur": 0, "args": { "External id": 87327, "cbid": 203, "correlation": 87327 } }, { "ph": "f", "id": 87327, "pid": 76337, "tid": -914061504, "ts": 1716454222735173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222735174, "dur": 0, "args": { "External id": 87328, "cbid": 205, "correlation": 87328 } }, { "ph": "f", "id": 87328, "pid": 76337, "tid": -914061504, "ts": 1716454222735174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735199, "dur": 1, "args": { "External id": 87332, "cbid": 251, "correlation": 87332 } }, { "ph": "f", "id": 87332, "pid": 76337, "tid": -914061504, "ts": 1716454222735199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735201, "dur": 0, "args": { "External id": 87333, "cbid": 251, "correlation": 87333 } }, { "ph": "f", "id": 87333, "pid": 76337, "tid": -914061504, "ts": 1716454222735201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735202, "dur": 0, "args": { "External id": 87334, "cbid": 251, "correlation": 87334 } }, { "ph": "f", "id": 87334, "pid": 76337, "tid": -914061504, "ts": 1716454222735202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735203, "dur": 0, "args": { "External id": 87335, "cbid": 251, "correlation": 87335 } }, { "ph": "f", "id": 87335, "pid": 76337, "tid": -914061504, "ts": 1716454222735203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735204, "dur": 0, "args": { "External id": 87336, "cbid": 251, "correlation": 87336 } }, { "ph": "f", "id": 87336, "pid": 76337, "tid": -914061504, "ts": 1716454222735204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735204, "dur": 0, "args": { "External id": 87337, "cbid": 251, "correlation": 87337 } }, { "ph": "f", "id": 87337, "pid": 76337, "tid": -914061504, "ts": 1716454222735204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735205, "dur": 0, "args": { "External id": 87338, "cbid": 251, "correlation": 87338 } }, { "ph": "f", "id": 87338, "pid": 76337, "tid": -914061504, "ts": 1716454222735205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735206, "dur": 0, "args": { "External id": 87339, "cbid": 251, "correlation": 87339 } }, { "ph": "f", "id": 87339, "pid": 76337, "tid": -914061504, "ts": 1716454222735206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735207, "dur": 0, "args": { "External id": 87340, "cbid": 251, "correlation": 87340 } }, { "ph": "f", "id": 87340, "pid": 76337, "tid": -914061504, "ts": 1716454222735207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222780218, "dur": 114, "args": { "External id": 87341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87341, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87341, "pid": 5, "tid": 7, "ts": 1716454222780218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735209, "dur": 12, "args": { "External id": 87341, "cbid": 211, "correlation": 87341 } }, { "ph": "s", "id": 87341, "pid": 76337, "tid": -914061504, "ts": 1716454222735209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222780334, "dur": 59, "args": { "External id": 87347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87347, "pid": 5, "tid": 7, "ts": 1716454222780334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735244, "dur": 10, "args": { "External id": 87347, "cbid": 211, "correlation": 87347 } }, { "ph": "s", "id": 87347, "pid": 76337, "tid": -914061504, "ts": 1716454222735244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222780394, "dur": 50, "args": { "External id": 87355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87355, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87355, "pid": 5, "tid": 7, "ts": 1716454222780394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735276, "dur": 8, "args": { "External id": 87355, "cbid": 211, "correlation": 87355 } }, { "ph": "s", "id": 87355, "pid": 76337, "tid": -914061504, "ts": 1716454222735276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222735351, "dur": 0, "args": { "External id": 87365, "cbid": 317, "correlation": 87365 } }, { "ph": "f", "id": 87365, "pid": 76337, "tid": -914061504, "ts": 1716454222735351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222735352, "dur": 0, "args": { "External id": 87366, "cbid": 203, "correlation": 87366 } }, { "ph": "f", "id": 87366, "pid": 76337, "tid": -914061504, "ts": 1716454222735352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222735352, "dur": 0, "args": { "External id": 87367, "cbid": 205, "correlation": 87367 } }, { "ph": "f", "id": 87367, "pid": 76337, "tid": -914061504, "ts": 1716454222735352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222780445, "dur": 41, "args": { "External id": 87371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87371, "pid": 5, "tid": 7, "ts": 1716454222780445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735371, "dur": 12, "args": { "External id": 87371, "cbid": 211, "correlation": 87371 } }, { "ph": "s", "id": 87371, "pid": 76337, "tid": -914061504, "ts": 1716454222735371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222780487, "dur": 14, "args": { "External id": 87373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87373, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87373, "pid": 5, "tid": 7, "ts": 1716454222780487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735386, "dur": 5, "args": { "External id": 87373, "cbid": 211, "correlation": 87373 } }, { "ph": "s", "id": 87373, "pid": 76337, "tid": -914061504, "ts": 1716454222735386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222780504, "dur": 1, "args": { "External id": 87375, "device": 5, "context": 1, "stream": 7, "correlation": 87375, "bytes": 1536, "memory bandwidth (GB/s)": 0.9230769230769231 } }, { "ph": "f", "id": 87375, "pid": 5, "tid": 7, "ts": 1716454222780504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222735406, "dur": 18, "args": { "External id": 87375, "cbid": 51, "correlation": 87375 } }, { "ph": "s", "id": 87375, "pid": 76337, "tid": -914061504, "ts": 1716454222735406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222780507, "dur": 357, "args": { "External id": 87376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87376, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87376, "pid": 5, "tid": 7, "ts": 1716454222780507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735426, "dur": 10, "args": { "External id": 87376, "cbid": 211, "correlation": 87376 } }, { "ph": "s", "id": 87376, "pid": 76337, "tid": -914061504, "ts": 1716454222735426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222780865, "dur": 13, "args": { "External id": 87378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87378, "pid": 5, "tid": 7, "ts": 1716454222780865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735444, "dur": 7, "args": { "External id": 87378, "cbid": 211, "correlation": 87378 } }, { "ph": "s", "id": 87378, "pid": 76337, "tid": -914061504, "ts": 1716454222735444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222780880, "dur": 15, "args": { "External id": 87384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87384, "pid": 5, "tid": 7, "ts": 1716454222780880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735474, "dur": 8, "args": { "External id": 87384, "cbid": 211, "correlation": 87384 } }, { "ph": "s", "id": 87384, "pid": 76337, "tid": -914061504, "ts": 1716454222735474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222780896, "dur": 18, "args": { "External id": 87404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87404, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 87404, "pid": 5, "tid": 7, "ts": 1716454222780896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735572, "dur": 13, "args": { "External id": 87404, "cbid": 211, "correlation": 87404 } }, { "ph": "s", "id": 87404, "pid": 76337, "tid": -914061504, "ts": 1716454222735572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222780915, "dur": 4, "args": { "External id": 87416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87416, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 87416, "pid": 5, "tid": 7, "ts": 1716454222780915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735595, "dur": 6, "args": { "External id": 87416, "cbid": 211, "correlation": 87416 } }, { "ph": "s", "id": 87416, "pid": 76337, "tid": -914061504, "ts": 1716454222735595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222780921, "dur": 18, "args": { "External id": 87419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87419, "pid": 5, "tid": 7, "ts": 1716454222780921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735613, "dur": 7, "args": { "External id": 87419, "cbid": 211, "correlation": 87419 } }, { "ph": "s", "id": 87419, "pid": 76337, "tid": -914061504, "ts": 1716454222735613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222780940, "dur": 11, "args": { "External id": 87428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87428, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87428, "pid": 5, "tid": 7, "ts": 1716454222780940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735654, "dur": 10, "args": { "External id": 87428, "cbid": 211, "correlation": 87428 } }, { "ph": "s", "id": 87428, "pid": 76337, "tid": -914061504, "ts": 1716454222735654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222735710, "dur": 0, "args": { "External id": 87438, "cbid": 317, "correlation": 87438 } }, { "ph": "f", "id": 87438, "pid": 76337, "tid": -914061504, "ts": 1716454222735710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222735711, "dur": 0, "args": { "External id": 87439, "cbid": 203, "correlation": 87439 } }, { "ph": "f", "id": 87439, "pid": 76337, "tid": -914061504, "ts": 1716454222735711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222735712, "dur": 0, "args": { "External id": 87440, "cbid": 205, "correlation": 87440 } }, { "ph": "f", "id": 87440, "pid": 76337, "tid": -914061504, "ts": 1716454222735712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222780953, "dur": 11, "args": { "External id": 87444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87444, "pid": 5, "tid": 7, "ts": 1716454222780953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735727, "dur": 11, "args": { "External id": 87444, "cbid": 211, "correlation": 87444 } }, { "ph": "s", "id": 87444, "pid": 76337, "tid": -914061504, "ts": 1716454222735727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222780965, "dur": 24, "args": { "External id": 87446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87446, "pid": 5, "tid": 7, "ts": 1716454222780965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735741, "dur": 5, "args": { "External id": 87446, "cbid": 211, "correlation": 87446 } }, { "ph": "s", "id": 87446, "pid": 76337, "tid": -914061504, "ts": 1716454222735741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222780991, "dur": 3, "args": { "External id": 87448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87448, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 87448, "pid": 5, "tid": 7, "ts": 1716454222780991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735753, "dur": 6, "args": { "External id": 87448, "cbid": 211, "correlation": 87448 } }, { "ph": "s", "id": 87448, "pid": 76337, "tid": -914061504, "ts": 1716454222735753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222735763, "dur": 0, "args": { "External id": 87449, "cbid": 51, "correlation": 87449 } }, { "ph": "s", "id": 87449, "pid": 76337, "tid": -914061504, "ts": 1716454222735763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222780995, "dur": 356, "args": { "External id": 87450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87450, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87450, "pid": 5, "tid": 7, "ts": 1716454222780995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735764, "dur": 7, "args": { "External id": 87450, "cbid": 211, "correlation": 87450 } }, { "ph": "s", "id": 87450, "pid": 76337, "tid": -914061504, "ts": 1716454222735764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222781353, "dur": 21, "args": { "External id": 87451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87451, "pid": 5, "tid": 7, "ts": 1716454222781353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735773, "dur": 5, "args": { "External id": 87451, "cbid": 211, "correlation": 87451 } }, { "ph": "s", "id": 87451, "pid": 76337, "tid": -914061504, "ts": 1716454222735773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222781375, "dur": 32, "args": { "External id": 87457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87457, "pid": 5, "tid": 7, "ts": 1716454222781375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735801, "dur": 8, "args": { "External id": 87457, "cbid": 211, "correlation": 87457 } }, { "ph": "s", "id": 87457, "pid": 76337, "tid": -914061504, "ts": 1716454222735801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222781409, "dur": 3, "args": { "External id": 87465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87465, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 87465, "pid": 5, "tid": 7, "ts": 1716454222781409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735845, "dur": 9, "args": { "External id": 87465, "cbid": 211, "correlation": 87465 } }, { "ph": "s", "id": 87465, "pid": 76337, "tid": -914061504, "ts": 1716454222735845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735914, "dur": 2, "args": { "External id": 87481, "cbid": 251, "correlation": 87481 } }, { "ph": "f", "id": 87481, "pid": 76337, "tid": -914061504, "ts": 1716454222735914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222735919, "dur": 0, "args": { "External id": 87483, "cbid": 251, "correlation": 87483 } }, { "ph": "f", "id": 87483, "pid": 76337, "tid": -914061504, "ts": 1716454222735919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222781413, "dur": 12, "args": { "External id": 87484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87484, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 87484, "pid": 5, "tid": 7, "ts": 1716454222781413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735921, "dur": 12, "args": { "External id": 87484, "cbid": 211, "correlation": 87484 } }, { "ph": "s", "id": 87484, "pid": 76337, "tid": -914061504, "ts": 1716454222735921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222781426, "dur": 5, "args": { "External id": 87486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 87486, "pid": 5, "tid": 7, "ts": 1716454222781426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222735935, "dur": 6, "args": { "External id": 87486, "cbid": 211, "correlation": 87486 } }, { "ph": "s", "id": 87486, "pid": 76337, "tid": -914061504, "ts": 1716454222735935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222781433, "dur": 28, "args": { "External id": 87496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87496, "pid": 5, "tid": 7, "ts": 1716454222781433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736004, "dur": 12, "args": { "External id": 87496, "cbid": 211, "correlation": 87496 } }, { "ph": "s", "id": 87496, "pid": 76337, "tid": -914061504, "ts": 1716454222736004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222781462, "dur": 30, "args": { "External id": 87516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87516, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 87516, "pid": 5, "tid": 7, "ts": 1716454222781462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736072, "dur": 11, "args": { "External id": 87516, "cbid": 211, "correlation": 87516 } }, { "ph": "s", "id": 87516, "pid": 76337, "tid": -914061504, "ts": 1716454222736072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222781493, "dur": 4, "args": { "External id": 87528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87528, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 87528, "pid": 5, "tid": 7, "ts": 1716454222781493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736093, "dur": 6, "args": { "External id": 87528, "cbid": 211, "correlation": 87528 } }, { "ph": "s", "id": 87528, "pid": 76337, "tid": -914061504, "ts": 1716454222736093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222781498, "dur": 29, "args": { "External id": 87531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87531, "pid": 5, "tid": 7, "ts": 1716454222781498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736112, "dur": 6, "args": { "External id": 87531, "cbid": 211, "correlation": 87531 } }, { "ph": "s", "id": 87531, "pid": 76337, "tid": -914061504, "ts": 1716454222736112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222781528, "dur": 20, "args": { "External id": 87540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87540, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87540, "pid": 5, "tid": 7, "ts": 1716454222781528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736154, "dur": 10, "args": { "External id": 87540, "cbid": 211, "correlation": 87540 } }, { "ph": "s", "id": 87540, "pid": 76337, "tid": -914061504, "ts": 1716454222736154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222736219, "dur": 0, "args": { "External id": 87550, "cbid": 317, "correlation": 87550 } }, { "ph": "f", "id": 87550, "pid": 76337, "tid": -914061504, "ts": 1716454222736219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222736220, "dur": 0, "args": { "External id": 87551, "cbid": 203, "correlation": 87551 } }, { "ph": "f", "id": 87551, "pid": 76337, "tid": -914061504, "ts": 1716454222736220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222736220, "dur": 0, "args": { "External id": 87552, "cbid": 205, "correlation": 87552 } }, { "ph": "f", "id": 87552, "pid": 76337, "tid": -914061504, "ts": 1716454222736220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222781550, "dur": 22, "args": { "External id": 87556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87556, "pid": 5, "tid": 7, "ts": 1716454222781550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736237, "dur": 12, "args": { "External id": 87556, "cbid": 211, "correlation": 87556 } }, { "ph": "s", "id": 87556, "pid": 76337, "tid": -914061504, "ts": 1716454222736237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222781573, "dur": 43, "args": { "External id": 87558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87558, "pid": 5, "tid": 7, "ts": 1716454222781573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736252, "dur": 5, "args": { "External id": 87558, "cbid": 211, "correlation": 87558 } }, { "ph": "s", "id": 87558, "pid": 76337, "tid": -914061504, "ts": 1716454222736252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222781618, "dur": 647, "args": { "External id": 87560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87560, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87560, "pid": 5, "tid": 7, "ts": 1716454222781618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736267, "dur": 10, "args": { "External id": 87560, "cbid": 211, "correlation": 87560 } }, { "ph": "s", "id": 87560, "pid": 76337, "tid": -914061504, "ts": 1716454222736267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222782266, "dur": 20, "args": { "External id": 87562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87562, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87562, "pid": 5, "tid": 7, "ts": 1716454222782266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736281, "dur": 5, "args": { "External id": 87562, "cbid": 211, "correlation": 87562 } }, { "ph": "s", "id": 87562, "pid": 76337, "tid": -914061504, "ts": 1716454222736281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222782287, "dur": 33, "args": { "External id": 87568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87568, "pid": 5, "tid": 7, "ts": 1716454222782287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736309, "dur": 8, "args": { "External id": 87568, "cbid": 211, "correlation": 87568 } }, { "ph": "s", "id": 87568, "pid": 76337, "tid": -914061504, "ts": 1716454222736309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222736368, "dur": 0, "args": { "External id": 87578, "cbid": 317, "correlation": 87578 } }, { "ph": "f", "id": 87578, "pid": 76337, "tid": -914061504, "ts": 1716454222736368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222736368, "dur": 0, "args": { "External id": 87579, "cbid": 203, "correlation": 87579 } }, { "ph": "f", "id": 87579, "pid": 76337, "tid": -914061504, "ts": 1716454222736368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222736369, "dur": 0, "args": { "External id": 87580, "cbid": 205, "correlation": 87580 } }, { "ph": "f", "id": 87580, "pid": 76337, "tid": -914061504, "ts": 1716454222736369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736390, "dur": 1, "args": { "External id": 87584, "cbid": 251, "correlation": 87584 } }, { "ph": "f", "id": 87584, "pid": 76337, "tid": -914061504, "ts": 1716454222736390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736392, "dur": 0, "args": { "External id": 87585, "cbid": 251, "correlation": 87585 } }, { "ph": "f", "id": 87585, "pid": 76337, "tid": -914061504, "ts": 1716454222736392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736393, "dur": 0, "args": { "External id": 87586, "cbid": 251, "correlation": 87586 } }, { "ph": "f", "id": 87586, "pid": 76337, "tid": -914061504, "ts": 1716454222736393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736394, "dur": 0, "args": { "External id": 87587, "cbid": 251, "correlation": 87587 } }, { "ph": "f", "id": 87587, "pid": 76337, "tid": -914061504, "ts": 1716454222736394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736395, "dur": 0, "args": { "External id": 87588, "cbid": 251, "correlation": 87588 } }, { "ph": "f", "id": 87588, "pid": 76337, "tid": -914061504, "ts": 1716454222736395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736395, "dur": 0, "args": { "External id": 87589, "cbid": 251, "correlation": 87589 } }, { "ph": "f", "id": 87589, "pid": 76337, "tid": -914061504, "ts": 1716454222736395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736396, "dur": 0, "args": { "External id": 87590, "cbid": 251, "correlation": 87590 } }, { "ph": "f", "id": 87590, "pid": 76337, "tid": -914061504, "ts": 1716454222736396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736397, "dur": 0, "args": { "External id": 87591, "cbid": 251, "correlation": 87591 } }, { "ph": "f", "id": 87591, "pid": 76337, "tid": -914061504, "ts": 1716454222736397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222736398, "dur": 0, "args": { "External id": 87592, "cbid": 251, "correlation": 87592 } }, { "ph": "f", "id": 87592, "pid": 76337, "tid": -914061504, "ts": 1716454222736398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222782321, "dur": 51, "args": { "External id": 87593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87593, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87593, "pid": 5, "tid": 7, "ts": 1716454222782321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736400, "dur": 13, "args": { "External id": 87593, "cbid": 211, "correlation": 87593 } }, { "ph": "s", "id": 87593, "pid": 76337, "tid": -914061504, "ts": 1716454222736400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222782373, "dur": 32, "args": { "External id": 87599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87599, "pid": 5, "tid": 7, "ts": 1716454222782373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736433, "dur": 9, "args": { "External id": 87599, "cbid": 211, "correlation": 87599 } }, { "ph": "s", "id": 87599, "pid": 76337, "tid": -914061504, "ts": 1716454222736433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222782406, "dur": 27, "args": { "External id": 87607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87607, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87607, "pid": 5, "tid": 7, "ts": 1716454222782406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736462, "dur": 8, "args": { "External id": 87607, "cbid": 211, "correlation": 87607 } }, { "ph": "s", "id": 87607, "pid": 76337, "tid": -914061504, "ts": 1716454222736462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222782434, "dur": 20, "args": { "External id": 87615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87615, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87615, "pid": 5, "tid": 7, "ts": 1716454222782434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736493, "dur": 9, "args": { "External id": 87615, "cbid": 211, "correlation": 87615 } }, { "ph": "s", "id": 87615, "pid": 76337, "tid": -914061504, "ts": 1716454222736493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222782456, "dur": 30, "args": { "External id": 87635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87635, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 87635, "pid": 5, "tid": 7, "ts": 1716454222782456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736578, "dur": 12, "args": { "External id": 87635, "cbid": 211, "correlation": 87635 } }, { "ph": "s", "id": 87635, "pid": 76337, "tid": -914061504, "ts": 1716454222736578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222782487, "dur": 4, "args": { "External id": 87647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87647, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 87647, "pid": 5, "tid": 7, "ts": 1716454222782487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736600, "dur": 6, "args": { "External id": 87647, "cbid": 211, "correlation": 87647 } }, { "ph": "s", "id": 87647, "pid": 76337, "tid": -914061504, "ts": 1716454222736600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222782492, "dur": 30, "args": { "External id": 87650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87650, "pid": 5, "tid": 7, "ts": 1716454222782492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736618, "dur": 6, "args": { "External id": 87650, "cbid": 211, "correlation": 87650 } }, { "ph": "s", "id": 87650, "pid": 76337, "tid": -914061504, "ts": 1716454222736618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222736675, "dur": 0, "args": { "External id": 87661, "cbid": 317, "correlation": 87661 } }, { "ph": "f", "id": 87661, "pid": 76337, "tid": -914061504, "ts": 1716454222736675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222736676, "dur": 0, "args": { "External id": 87662, "cbid": 203, "correlation": 87662 } }, { "ph": "f", "id": 87662, "pid": 76337, "tid": -914061504, "ts": 1716454222736676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222736677, "dur": 0, "args": { "External id": 87663, "cbid": 205, "correlation": 87663 } }, { "ph": "f", "id": 87663, "pid": 76337, "tid": -914061504, "ts": 1716454222736677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222782523, "dur": 24, "args": { "External id": 87667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87667, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87667, "pid": 5, "tid": 7, "ts": 1716454222782523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736692, "dur": 12, "args": { "External id": 87667, "cbid": 211, "correlation": 87667 } }, { "ph": "s", "id": 87667, "pid": 76337, "tid": -914061504, "ts": 1716454222736692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222782549, "dur": 118, "args": { "External id": 87669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87669, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87669, "pid": 5, "tid": 7, "ts": 1716454222782549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736712, "dur": 8, "args": { "External id": 87669, "cbid": 211, "correlation": 87669 } }, { "ph": "s", "id": 87669, "pid": 76337, "tid": -914061504, "ts": 1716454222736712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222782668, "dur": 22, "args": { "External id": 87671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87671, "pid": 5, "tid": 7, "ts": 1716454222782668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736724, "dur": 5, "args": { "External id": 87671, "cbid": 211, "correlation": 87671 } }, { "ph": "s", "id": 87671, "pid": 76337, "tid": -914061504, "ts": 1716454222736724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222782691, "dur": 32, "args": { "External id": 87677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87677, "pid": 5, "tid": 7, "ts": 1716454222782691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736752, "dur": 8, "args": { "External id": 87677, "cbid": 211, "correlation": 87677 } }, { "ph": "s", "id": 87677, "pid": 76337, "tid": -914061504, "ts": 1716454222736752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222782724, "dur": 160, "args": { "External id": 87686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87686, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87686, "pid": 5, "tid": 7, "ts": 1716454222782724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736837, "dur": 16, "args": { "External id": 87686, "cbid": 211, "correlation": 87686 } }, { "ph": "s", "id": 87686, "pid": 76337, "tid": -914061504, "ts": 1716454222736837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222782886, "dur": 64, "args": { "External id": 87708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87708, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87708, "pid": 5, "tid": 7, "ts": 1716454222782886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222736897, "dur": 10, "args": { "External id": 87708, "cbid": 211, "correlation": 87708 } }, { "ph": "s", "id": 87708, "pid": 76337, "tid": -914061504, "ts": 1716454222736897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737002, "dur": 1, "args": { "External id": 87719, "cbid": 251, "correlation": 87719 } }, { "ph": "f", "id": 87719, "pid": 76337, "tid": -914061504, "ts": 1716454222737002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222782952, "dur": 154, "args": { "External id": 87720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87720, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87720, "pid": 5, "tid": 7, "ts": 1716454222782952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737007, "dur": 14, "args": { "External id": 87720, "cbid": 211, "correlation": 87720 } }, { "ph": "s", "id": 87720, "pid": 76337, "tid": -914061504, "ts": 1716454222737007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737079, "dur": 1, "args": { "External id": 87731, "cbid": 251, "correlation": 87731 } }, { "ph": "f", "id": 87731, "pid": 76337, "tid": -914061504, "ts": 1716454222737079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222783107, "dur": 143, "args": { "External id": 87732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87732, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87732, "pid": 5, "tid": 7, "ts": 1716454222783107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737083, "dur": 12, "args": { "External id": 87732, "cbid": 211, "correlation": 87732 } }, { "ph": "s", "id": 87732, "pid": 76337, "tid": -914061504, "ts": 1716454222737083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737148, "dur": 1, "args": { "External id": 87743, "cbid": 251, "correlation": 87743 } }, { "ph": "f", "id": 87743, "pid": 76337, "tid": -914061504, "ts": 1716454222737148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222783252, "dur": 142, "args": { "External id": 87744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87744, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87744, "pid": 5, "tid": 7, "ts": 1716454222783252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737153, "dur": 11, "args": { "External id": 87744, "cbid": 211, "correlation": 87744 } }, { "ph": "s", "id": 87744, "pid": 76337, "tid": -914061504, "ts": 1716454222737153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222783395, "dur": 1907, "args": { "External id": 87765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87765, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87765, "pid": 5, "tid": 7, "ts": 1716454222783395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737240, "dur": 14, "args": { "External id": 87765, "cbid": 211, "correlation": 87765 } }, { "ph": "s", "id": 87765, "pid": 76337, "tid": -914061504, "ts": 1716454222737240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737343, "dur": 1, "args": { "External id": 87783, "cbid": 251, "correlation": 87783 } }, { "ph": "f", "id": 87783, "pid": 76337, "tid": -914061504, "ts": 1716454222737343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222785304, "dur": 143, "args": { "External id": 87785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87785, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 87785, "pid": 5, "tid": 7, "ts": 1716454222785304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737348, "dur": 14, "args": { "External id": 87785, "cbid": 211, "correlation": 87785 } }, { "ph": "s", "id": 87785, "pid": 76337, "tid": -914061504, "ts": 1716454222737348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222785448, "dur": 35, "args": { "External id": 87793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87793, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87793, "pid": 5, "tid": 7, "ts": 1716454222785448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737421, "dur": 12, "args": { "External id": 87793, "cbid": 211, "correlation": 87793 } }, { "ph": "s", "id": 87793, "pid": 76337, "tid": -914061504, "ts": 1716454222737421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222785485, "dur": 50, "args": { "External id": 87801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87801, "pid": 5, "tid": 7, "ts": 1716454222785485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737460, "dur": 8, "args": { "External id": 87801, "cbid": 211, "correlation": 87801 } }, { "ph": "s", "id": 87801, "pid": 76337, "tid": -914061504, "ts": 1716454222737460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222785536, "dur": 29, "args": { "External id": 87812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87812, "pid": 5, "tid": 7, "ts": 1716454222785536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737537, "dur": 13, "args": { "External id": 87812, "cbid": 211, "correlation": 87812 } }, { "ph": "s", "id": 87812, "pid": 76337, "tid": -914061504, "ts": 1716454222737537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222785567, "dur": 33, "args": { "External id": 87834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87834, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87834, "pid": 5, "tid": 7, "ts": 1716454222785567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737569, "dur": 8, "args": { "External id": 87834, "cbid": 211, "correlation": 87834 } }, { "ph": "s", "id": 87834, "pid": 76337, "tid": -914061504, "ts": 1716454222737569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737655, "dur": 1, "args": { "External id": 87845, "cbid": 251, "correlation": 87845 } }, { "ph": "f", "id": 87845, "pid": 76337, "tid": -914061504, "ts": 1716454222737655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222785601, "dur": 75, "args": { "External id": 87846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87846, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87846, "pid": 5, "tid": 7, "ts": 1716454222785601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737660, "dur": 13, "args": { "External id": 87846, "cbid": 211, "correlation": 87846 } }, { "ph": "s", "id": 87846, "pid": 76337, "tid": -914061504, "ts": 1716454222737660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737732, "dur": 1, "args": { "External id": 87857, "cbid": 251, "correlation": 87857 } }, { "ph": "f", "id": 87857, "pid": 76337, "tid": -914061504, "ts": 1716454222737732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737736, "dur": 0, "args": { "External id": 87858, "cbid": 251, "correlation": 87858 } }, { "ph": "f", "id": 87858, "pid": 76337, "tid": -914061504, "ts": 1716454222737736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222785678, "dur": 11, "args": { "External id": 87859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87859, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 87859, "pid": 5, "tid": 7, "ts": 1716454222785678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737737, "dur": 13, "args": { "External id": 87859, "cbid": 211, "correlation": 87859 } }, { "ph": "s", "id": 87859, "pid": 76337, "tid": -914061504, "ts": 1716454222737737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222785691, "dur": 5, "args": { "External id": 87861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87861, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 87861, "pid": 5, "tid": 7, "ts": 1716454222785691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737753, "dur": 7, "args": { "External id": 87861, "cbid": 211, "correlation": 87861 } }, { "ph": "s", "id": 87861, "pid": 76337, "tid": -914061504, "ts": 1716454222737753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737813, "dur": 1, "args": { "External id": 87872, "cbid": 251, "correlation": 87872 } }, { "ph": "f", "id": 87872, "pid": 76337, "tid": -914061504, "ts": 1716454222737813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222737816, "dur": 0, "args": { "External id": 87873, "cbid": 251, "correlation": 87873 } }, { "ph": "f", "id": 87873, "pid": 76337, "tid": -914061504, "ts": 1716454222737816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222785697, "dur": 7, "args": { "External id": 87874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87874, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 87874, "pid": 5, "tid": 7, "ts": 1716454222785697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737818, "dur": 11, "args": { "External id": 87874, "cbid": 211, "correlation": 87874 } }, { "ph": "s", "id": 87874, "pid": 76337, "tid": -914061504, "ts": 1716454222737818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222785705, "dur": 3, "args": { "External id": 87876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87876, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 87876, "pid": 5, "tid": 7, "ts": 1716454222785705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737831, "dur": 5, "args": { "External id": 87876, "cbid": 211, "correlation": 87876 } }, { "ph": "s", "id": 87876, "pid": 76337, "tid": -914061504, "ts": 1716454222737831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222785710, "dur": 90, "args": { "External id": 87897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87897, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 87897, "pid": 5, "tid": 7, "ts": 1716454222785710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222737905, "dur": 13, "args": { "External id": 87897, "cbid": 211, "correlation": 87897 } }, { "ph": "s", "id": 87897, "pid": 76337, "tid": -914061504, "ts": 1716454222737905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222738012, "dur": 1, "args": { "External id": 87915, "cbid": 251, "correlation": 87915 } }, { "ph": "f", "id": 87915, "pid": 76337, "tid": -914061504, "ts": 1716454222738012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222785801, "dur": 96, "args": { "External id": 87917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87917, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 87917, "pid": 5, "tid": 7, "ts": 1716454222785801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738018, "dur": 14, "args": { "External id": 87917, "cbid": 211, "correlation": 87917 } }, { "ph": "s", "id": 87917, "pid": 76337, "tid": -914061504, "ts": 1716454222738018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222785899, "dur": 19, "args": { "External id": 87925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87925, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87925, "pid": 5, "tid": 7, "ts": 1716454222785899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738089, "dur": 13, "args": { "External id": 87925, "cbid": 211, "correlation": 87925 } }, { "ph": "s", "id": 87925, "pid": 76337, "tid": -914061504, "ts": 1716454222738089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222785919, "dur": 37, "args": { "External id": 87933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87933, "pid": 5, "tid": 7, "ts": 1716454222785919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738132, "dur": 9, "args": { "External id": 87933, "cbid": 211, "correlation": 87933 } }, { "ph": "s", "id": 87933, "pid": 76337, "tid": -914061504, "ts": 1716454222738132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222785958, "dur": 35, "args": { "External id": 87955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87955, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87955, "pid": 5, "tid": 7, "ts": 1716454222785958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738185, "dur": 11, "args": { "External id": 87955, "cbid": 211, "correlation": 87955 } }, { "ph": "s", "id": 87955, "pid": 76337, "tid": -914061504, "ts": 1716454222738185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222738277, "dur": 1, "args": { "External id": 87971, "cbid": 251, "correlation": 87971 } }, { "ph": "f", "id": 87971, "pid": 76337, "tid": -914061504, "ts": 1716454222738277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222738282, "dur": 0, "args": { "External id": 87973, "cbid": 251, "correlation": 87973 } }, { "ph": "f", "id": 87973, "pid": 76337, "tid": -914061504, "ts": 1716454222738282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222785994, "dur": 532, "args": { "External id": 87974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87974, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 87974, "pid": 5, "tid": 7, "ts": 1716454222785994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738287, "dur": 13, "args": { "External id": 87974, "cbid": 211, "correlation": 87974 } }, { "ph": "s", "id": 87974, "pid": 76337, "tid": -914061504, "ts": 1716454222738287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222786528, "dur": 122, "args": { "External id": 87982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87982, "pid": 5, "tid": 7, "ts": 1716454222786528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738360, "dur": 15, "args": { "External id": 87982, "cbid": 211, "correlation": 87982 } }, { "ph": "s", "id": 87982, "pid": 76337, "tid": -914061504, "ts": 1716454222738360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222786651, "dur": 128, "args": { "External id": 87990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 87990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 87990, "pid": 5, "tid": 7, "ts": 1716454222786651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738395, "dur": 8, "args": { "External id": 87990, "cbid": 211, "correlation": 87990 } }, { "ph": "s", "id": 87990, "pid": 76337, "tid": -914061504, "ts": 1716454222738395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222738474, "dur": 1, "args": { "External id": 88006, "cbid": 251, "correlation": 88006 } }, { "ph": "f", "id": 88006, "pid": 76337, "tid": -914061504, "ts": 1716454222738474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222786780, "dur": 311, "args": { "External id": 88008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88008, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88008, "pid": 5, "tid": 7, "ts": 1716454222786780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738480, "dur": 13, "args": { "External id": 88008, "cbid": 211, "correlation": 88008 } }, { "ph": "s", "id": 88008, "pid": 76337, "tid": -914061504, "ts": 1716454222738480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222787092, "dur": 27, "args": { "External id": 88016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88016, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88016, "pid": 5, "tid": 7, "ts": 1716454222787092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738523, "dur": 9, "args": { "External id": 88016, "cbid": 211, "correlation": 88016 } }, { "ph": "s", "id": 88016, "pid": 76337, "tid": -914061504, "ts": 1716454222738523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222787120, "dur": 80, "args": { "External id": 88027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88027, "pid": 5, "tid": 7, "ts": 1716454222787120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738595, "dur": 12, "args": { "External id": 88027, "cbid": 211, "correlation": 88027 } }, { "ph": "s", "id": 88027, "pid": 76337, "tid": -914061504, "ts": 1716454222738595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222738662, "dur": 0, "args": { "External id": 88039, "cbid": 317, "correlation": 88039 } }, { "ph": "f", "id": 88039, "pid": 76337, "tid": -914061504, "ts": 1716454222738662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222738663, "dur": 0, "args": { "External id": 88040, "cbid": 203, "correlation": 88040 } }, { "ph": "f", "id": 88040, "pid": 76337, "tid": -914061504, "ts": 1716454222738663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222738664, "dur": 0, "args": { "External id": 88041, "cbid": 205, "correlation": 88041 } }, { "ph": "f", "id": 88041, "pid": 76337, "tid": -914061504, "ts": 1716454222738664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222787202, "dur": 22, "args": { "External id": 88045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88045, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88045, "pid": 5, "tid": 7, "ts": 1716454222787202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738683, "dur": 12, "args": { "External id": 88045, "cbid": 211, "correlation": 88045 } }, { "ph": "s", "id": 88045, "pid": 76337, "tid": -914061504, "ts": 1716454222738683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222787225, "dur": 118, "args": { "External id": 88047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88047, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88047, "pid": 5, "tid": 7, "ts": 1716454222787225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738703, "dur": 7, "args": { "External id": 88047, "cbid": 211, "correlation": 88047 } }, { "ph": "s", "id": 88047, "pid": 76337, "tid": -914061504, "ts": 1716454222738703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222787345, "dur": 23, "args": { "External id": 88049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88049, "pid": 5, "tid": 7, "ts": 1716454222787345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738714, "dur": 5, "args": { "External id": 88049, "cbid": 211, "correlation": 88049 } }, { "ph": "s", "id": 88049, "pid": 76337, "tid": -914061504, "ts": 1716454222738714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222787369, "dur": 32, "args": { "External id": 88055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88055, "pid": 5, "tid": 7, "ts": 1716454222787369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738742, "dur": 8, "args": { "External id": 88055, "cbid": 211, "correlation": 88055 } }, { "ph": "s", "id": 88055, "pid": 76337, "tid": -914061504, "ts": 1716454222738742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222787402, "dur": 27, "args": { "External id": 88063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88063, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88063, "pid": 5, "tid": 7, "ts": 1716454222787402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738774, "dur": 8, "args": { "External id": 88063, "cbid": 211, "correlation": 88063 } }, { "ph": "s", "id": 88063, "pid": 76337, "tid": -914061504, "ts": 1716454222738774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222787430, "dur": 29, "args": { "External id": 88083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88083, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88083, "pid": 5, "tid": 7, "ts": 1716454222787430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738851, "dur": 12, "args": { "External id": 88083, "cbid": 211, "correlation": 88083 } }, { "ph": "s", "id": 88083, "pid": 76337, "tid": -914061504, "ts": 1716454222738851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222787461, "dur": 5, "args": { "External id": 88095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88095, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 88095, "pid": 5, "tid": 7, "ts": 1716454222787461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738874, "dur": 6, "args": { "External id": 88095, "cbid": 211, "correlation": 88095 } }, { "ph": "s", "id": 88095, "pid": 76337, "tid": -914061504, "ts": 1716454222738874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222787467, "dur": 30, "args": { "External id": 88098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88098, "pid": 5, "tid": 7, "ts": 1716454222787467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738892, "dur": 7, "args": { "External id": 88098, "cbid": 211, "correlation": 88098 } }, { "ph": "s", "id": 88098, "pid": 76337, "tid": -914061504, "ts": 1716454222738892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222787498, "dur": 22, "args": { "External id": 88107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88107, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88107, "pid": 5, "tid": 7, "ts": 1716454222787498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222738933, "dur": 10, "args": { "External id": 88107, "cbid": 211, "correlation": 88107 } }, { "ph": "s", "id": 88107, "pid": 76337, "tid": -914061504, "ts": 1716454222738933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222738994, "dur": 0, "args": { "External id": 88117, "cbid": 317, "correlation": 88117 } }, { "ph": "f", "id": 88117, "pid": 76337, "tid": -914061504, "ts": 1716454222738994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222738994, "dur": 0, "args": { "External id": 88118, "cbid": 203, "correlation": 88118 } }, { "ph": "f", "id": 88118, "pid": 76337, "tid": -914061504, "ts": 1716454222738994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222738995, "dur": 0, "args": { "External id": 88119, "cbid": 205, "correlation": 88119 } }, { "ph": "f", "id": 88119, "pid": 76337, "tid": -914061504, "ts": 1716454222738995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222787521, "dur": 22, "args": { "External id": 88123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88123, "pid": 5, "tid": 7, "ts": 1716454222787521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739009, "dur": 12, "args": { "External id": 88123, "cbid": 211, "correlation": 88123 } }, { "ph": "s", "id": 88123, "pid": 76337, "tid": -914061504, "ts": 1716454222739009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222787544, "dur": 44, "args": { "External id": 88125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88125, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88125, "pid": 5, "tid": 7, "ts": 1716454222787544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739024, "dur": 5, "args": { "External id": 88125, "cbid": 211, "correlation": 88125 } }, { "ph": "s", "id": 88125, "pid": 76337, "tid": -914061504, "ts": 1716454222739024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222787590, "dur": 646, "args": { "External id": 88127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88127, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88127, "pid": 5, "tid": 7, "ts": 1716454222787590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739036, "dur": 7, "args": { "External id": 88127, "cbid": 211, "correlation": 88127 } }, { "ph": "s", "id": 88127, "pid": 76337, "tid": -914061504, "ts": 1716454222739036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222788237, "dur": 23, "args": { "External id": 88129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88129, "pid": 5, "tid": 7, "ts": 1716454222788237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739046, "dur": 5, "args": { "External id": 88129, "cbid": 211, "correlation": 88129 } }, { "ph": "s", "id": 88129, "pid": 76337, "tid": -914061504, "ts": 1716454222739046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222788261, "dur": 32, "args": { "External id": 88135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88135, "pid": 5, "tid": 7, "ts": 1716454222788261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739073, "dur": 8, "args": { "External id": 88135, "cbid": 211, "correlation": 88135 } }, { "ph": "s", "id": 88135, "pid": 76337, "tid": -914061504, "ts": 1716454222739073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222788294, "dur": 3, "args": { "External id": 88143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88143, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 88143, "pid": 5, "tid": 7, "ts": 1716454222788294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739117, "dur": 9, "args": { "External id": 88143, "cbid": 211, "correlation": 88143 } }, { "ph": "s", "id": 88143, "pid": 76337, "tid": -914061504, "ts": 1716454222739117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222739185, "dur": 1, "args": { "External id": 88159, "cbid": 251, "correlation": 88159 } }, { "ph": "f", "id": 88159, "pid": 76337, "tid": -914061504, "ts": 1716454222739185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222739190, "dur": 0, "args": { "External id": 88161, "cbid": 251, "correlation": 88161 } }, { "ph": "f", "id": 88161, "pid": 76337, "tid": -914061504, "ts": 1716454222739190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222788299, "dur": 12, "args": { "External id": 88162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88162, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 88162, "pid": 5, "tid": 7, "ts": 1716454222788299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739192, "dur": 13, "args": { "External id": 88162, "cbid": 211, "correlation": 88162 } }, { "ph": "s", "id": 88162, "pid": 76337, "tid": -914061504, "ts": 1716454222739192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222788312, "dur": 5, "args": { "External id": 88164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88164, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 88164, "pid": 5, "tid": 7, "ts": 1716454222788312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739207, "dur": 5, "args": { "External id": 88164, "cbid": 211, "correlation": 88164 } }, { "ph": "s", "id": 88164, "pid": 76337, "tid": -914061504, "ts": 1716454222739207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222788319, "dur": 29, "args": { "External id": 88174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88174, "pid": 5, "tid": 7, "ts": 1716454222788319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739266, "dur": 12, "args": { "External id": 88174, "cbid": 211, "correlation": 88174 } }, { "ph": "s", "id": 88174, "pid": 76337, "tid": -914061504, "ts": 1716454222739266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222788349, "dur": 29, "args": { "External id": 88194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88194, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88194, "pid": 5, "tid": 7, "ts": 1716454222788349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739333, "dur": 10, "args": { "External id": 88194, "cbid": 211, "correlation": 88194 } }, { "ph": "s", "id": 88194, "pid": 76337, "tid": -914061504, "ts": 1716454222739333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222788380, "dur": 4, "args": { "External id": 88206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88206, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 88206, "pid": 5, "tid": 7, "ts": 1716454222788380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739353, "dur": 6, "args": { "External id": 88206, "cbid": 211, "correlation": 88206 } }, { "ph": "s", "id": 88206, "pid": 76337, "tid": -914061504, "ts": 1716454222739353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222788385, "dur": 30, "args": { "External id": 88209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88209, "pid": 5, "tid": 7, "ts": 1716454222788385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739372, "dur": 7, "args": { "External id": 88209, "cbid": 211, "correlation": 88209 } }, { "ph": "s", "id": 88209, "pid": 76337, "tid": -914061504, "ts": 1716454222739372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222788416, "dur": 20, "args": { "External id": 88218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88218, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88218, "pid": 5, "tid": 7, "ts": 1716454222788416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739412, "dur": 10, "args": { "External id": 88218, "cbid": 211, "correlation": 88218 } }, { "ph": "s", "id": 88218, "pid": 76337, "tid": -914061504, "ts": 1716454222739412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222739476, "dur": 0, "args": { "External id": 88228, "cbid": 317, "correlation": 88228 } }, { "ph": "f", "id": 88228, "pid": 76337, "tid": -914061504, "ts": 1716454222739476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222739477, "dur": 0, "args": { "External id": 88229, "cbid": 203, "correlation": 88229 } }, { "ph": "f", "id": 88229, "pid": 76337, "tid": -914061504, "ts": 1716454222739477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222739478, "dur": 0, "args": { "External id": 88230, "cbid": 205, "correlation": 88230 } }, { "ph": "f", "id": 88230, "pid": 76337, "tid": -914061504, "ts": 1716454222739478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222788437, "dur": 22, "args": { "External id": 88234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88234, "pid": 5, "tid": 7, "ts": 1716454222788437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739492, "dur": 12, "args": { "External id": 88234, "cbid": 211, "correlation": 88234 } }, { "ph": "s", "id": 88234, "pid": 76337, "tid": -914061504, "ts": 1716454222739492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222788460, "dur": 44, "args": { "External id": 88236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88236, "pid": 5, "tid": 7, "ts": 1716454222788460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739506, "dur": 5, "args": { "External id": 88236, "cbid": 211, "correlation": 88236 } }, { "ph": "s", "id": 88236, "pid": 76337, "tid": -914061504, "ts": 1716454222739506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222788505, "dur": 638, "args": { "External id": 88238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88238, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88238, "pid": 5, "tid": 7, "ts": 1716454222788505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739518, "dur": 6, "args": { "External id": 88238, "cbid": 211, "correlation": 88238 } }, { "ph": "s", "id": 88238, "pid": 76337, "tid": -914061504, "ts": 1716454222739518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222789144, "dur": 24, "args": { "External id": 88240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88240, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88240, "pid": 5, "tid": 7, "ts": 1716454222789144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739527, "dur": 6, "args": { "External id": 88240, "cbid": 211, "correlation": 88240 } }, { "ph": "s", "id": 88240, "pid": 76337, "tid": -914061504, "ts": 1716454222739527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222789169, "dur": 33, "args": { "External id": 88246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88246, "pid": 5, "tid": 7, "ts": 1716454222789169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739555, "dur": 8, "args": { "External id": 88246, "cbid": 211, "correlation": 88246 } }, { "ph": "s", "id": 88246, "pid": 76337, "tid": -914061504, "ts": 1716454222739555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222789203, "dur": 27, "args": { "External id": 88254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88254, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88254, "pid": 5, "tid": 7, "ts": 1716454222789203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739587, "dur": 9, "args": { "External id": 88254, "cbid": 211, "correlation": 88254 } }, { "ph": "s", "id": 88254, "pid": 76337, "tid": -914061504, "ts": 1716454222739587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222789231, "dur": 20, "args": { "External id": 88262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88262, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88262, "pid": 5, "tid": 7, "ts": 1716454222789231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739618, "dur": 9, "args": { "External id": 88262, "cbid": 211, "correlation": 88262 } }, { "ph": "s", "id": 88262, "pid": 76337, "tid": -914061504, "ts": 1716454222739618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222789252, "dur": 30, "args": { "External id": 88282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88282, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88282, "pid": 5, "tid": 7, "ts": 1716454222789252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739699, "dur": 13, "args": { "External id": 88282, "cbid": 211, "correlation": 88282 } }, { "ph": "s", "id": 88282, "pid": 76337, "tid": -914061504, "ts": 1716454222739699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222789284, "dur": 5, "args": { "External id": 88294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88294, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 88294, "pid": 5, "tid": 7, "ts": 1716454222789284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739722, "dur": 6, "args": { "External id": 88294, "cbid": 211, "correlation": 88294 } }, { "ph": "s", "id": 88294, "pid": 76337, "tid": -914061504, "ts": 1716454222739722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222789290, "dur": 30, "args": { "External id": 88297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88297, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88297, "pid": 5, "tid": 7, "ts": 1716454222789290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739739, "dur": 6, "args": { "External id": 88297, "cbid": 211, "correlation": 88297 } }, { "ph": "s", "id": 88297, "pid": 76337, "tid": -914061504, "ts": 1716454222739739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222739796, "dur": 0, "args": { "External id": 88308, "cbid": 317, "correlation": 88308 } }, { "ph": "f", "id": 88308, "pid": 76337, "tid": -914061504, "ts": 1716454222739796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222739797, "dur": 0, "args": { "External id": 88309, "cbid": 203, "correlation": 88309 } }, { "ph": "f", "id": 88309, "pid": 76337, "tid": -914061504, "ts": 1716454222739797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222739797, "dur": 0, "args": { "External id": 88310, "cbid": 205, "correlation": 88310 } }, { "ph": "f", "id": 88310, "pid": 76337, "tid": -914061504, "ts": 1716454222739797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222789321, "dur": 21, "args": { "External id": 88314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88314, "pid": 5, "tid": 7, "ts": 1716454222789321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739811, "dur": 12, "args": { "External id": 88314, "cbid": 211, "correlation": 88314 } }, { "ph": "s", "id": 88314, "pid": 76337, "tid": -914061504, "ts": 1716454222739811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222789343, "dur": 115, "args": { "External id": 88316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88316, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88316, "pid": 5, "tid": 7, "ts": 1716454222789343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739829, "dur": 7, "args": { "External id": 88316, "cbid": 211, "correlation": 88316 } }, { "ph": "s", "id": 88316, "pid": 76337, "tid": -914061504, "ts": 1716454222739829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222789459, "dur": 21, "args": { "External id": 88318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88318, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88318, "pid": 5, "tid": 7, "ts": 1716454222789459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739839, "dur": 5, "args": { "External id": 88318, "cbid": 211, "correlation": 88318 } }, { "ph": "s", "id": 88318, "pid": 76337, "tid": -914061504, "ts": 1716454222739839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222789482, "dur": 32, "args": { "External id": 88324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88324, "pid": 5, "tid": 7, "ts": 1716454222789482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739867, "dur": 8, "args": { "External id": 88324, "cbid": 211, "correlation": 88324 } }, { "ph": "s", "id": 88324, "pid": 76337, "tid": -914061504, "ts": 1716454222739867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222789515, "dur": 188, "args": { "External id": 88333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88333, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88333, "pid": 5, "tid": 7, "ts": 1716454222789515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222739952, "dur": 15, "args": { "External id": 88333, "cbid": 211, "correlation": 88333 } }, { "ph": "s", "id": 88333, "pid": 76337, "tid": -914061504, "ts": 1716454222739952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222789705, "dur": 64, "args": { "External id": 88355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88355, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88355, "pid": 5, "tid": 7, "ts": 1716454222789705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740021, "dur": 11, "args": { "External id": 88355, "cbid": 211, "correlation": 88355 } }, { "ph": "s", "id": 88355, "pid": 76337, "tid": -914061504, "ts": 1716454222740021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740115, "dur": 1, "args": { "External id": 88366, "cbid": 251, "correlation": 88366 } }, { "ph": "f", "id": 88366, "pid": 76337, "tid": -914061504, "ts": 1716454222740115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222789770, "dur": 154, "args": { "External id": 88367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88367, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88367, "pid": 5, "tid": 7, "ts": 1716454222789770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740121, "dur": 13, "args": { "External id": 88367, "cbid": 211, "correlation": 88367 } }, { "ph": "s", "id": 88367, "pid": 76337, "tid": -914061504, "ts": 1716454222740121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740191, "dur": 1, "args": { "External id": 88378, "cbid": 251, "correlation": 88378 } }, { "ph": "f", "id": 88378, "pid": 76337, "tid": -914061504, "ts": 1716454222740191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222789925, "dur": 145, "args": { "External id": 88379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88379, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88379, "pid": 5, "tid": 7, "ts": 1716454222789925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740195, "dur": 11, "args": { "External id": 88379, "cbid": 211, "correlation": 88379 } }, { "ph": "s", "id": 88379, "pid": 76337, "tid": -914061504, "ts": 1716454222740195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740261, "dur": 1, "args": { "External id": 88390, "cbid": 251, "correlation": 88390 } }, { "ph": "f", "id": 88390, "pid": 76337, "tid": -914061504, "ts": 1716454222740261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222790071, "dur": 144, "args": { "External id": 88391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88391, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88391, "pid": 5, "tid": 7, "ts": 1716454222790071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740265, "dur": 11, "args": { "External id": 88391, "cbid": 211, "correlation": 88391 } }, { "ph": "s", "id": 88391, "pid": 76337, "tid": -914061504, "ts": 1716454222740265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222790217, "dur": 1909, "args": { "External id": 88412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88412, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 88412, "pid": 5, "tid": 7, "ts": 1716454222790217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740351, "dur": 14, "args": { "External id": 88412, "cbid": 211, "correlation": 88412 } }, { "ph": "s", "id": 88412, "pid": 76337, "tid": -914061504, "ts": 1716454222740351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740457, "dur": 1, "args": { "External id": 88430, "cbid": 251, "correlation": 88430 } }, { "ph": "f", "id": 88430, "pid": 76337, "tid": -914061504, "ts": 1716454222740457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222792127, "dur": 145, "args": { "External id": 88432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88432, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 88432, "pid": 5, "tid": 7, "ts": 1716454222792127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740463, "dur": 14, "args": { "External id": 88432, "cbid": 211, "correlation": 88432 } }, { "ph": "s", "id": 88432, "pid": 76337, "tid": -914061504, "ts": 1716454222740463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222792274, "dur": 35, "args": { "External id": 88440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88440, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88440, "pid": 5, "tid": 7, "ts": 1716454222792274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740536, "dur": 12, "args": { "External id": 88440, "cbid": 211, "correlation": 88440 } }, { "ph": "s", "id": 88440, "pid": 76337, "tid": -914061504, "ts": 1716454222740536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222792310, "dur": 51, "args": { "External id": 88448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88448, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88448, "pid": 5, "tid": 7, "ts": 1716454222792310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740575, "dur": 9, "args": { "External id": 88448, "cbid": 211, "correlation": 88448 } }, { "ph": "s", "id": 88448, "pid": 76337, "tid": -914061504, "ts": 1716454222740575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222792362, "dur": 30, "args": { "External id": 88459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88459, "pid": 5, "tid": 7, "ts": 1716454222792362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740651, "dur": 13, "args": { "External id": 88459, "cbid": 211, "correlation": 88459 } }, { "ph": "s", "id": 88459, "pid": 76337, "tid": -914061504, "ts": 1716454222740651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222792393, "dur": 33, "args": { "External id": 88481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88481, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88481, "pid": 5, "tid": 7, "ts": 1716454222792393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740682, "dur": 8, "args": { "External id": 88481, "cbid": 211, "correlation": 88481 } }, { "ph": "s", "id": 88481, "pid": 76337, "tid": -914061504, "ts": 1716454222740682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740768, "dur": 1, "args": { "External id": 88492, "cbid": 251, "correlation": 88492 } }, { "ph": "f", "id": 88492, "pid": 76337, "tid": -914061504, "ts": 1716454222740768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222792428, "dur": 74, "args": { "External id": 88493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88493, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88493, "pid": 5, "tid": 7, "ts": 1716454222792428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740773, "dur": 13, "args": { "External id": 88493, "cbid": 211, "correlation": 88493 } }, { "ph": "s", "id": 88493, "pid": 76337, "tid": -914061504, "ts": 1716454222740773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740843, "dur": 1, "args": { "External id": 88504, "cbid": 251, "correlation": 88504 } }, { "ph": "f", "id": 88504, "pid": 76337, "tid": -914061504, "ts": 1716454222740843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740847, "dur": 0, "args": { "External id": 88505, "cbid": 251, "correlation": 88505 } }, { "ph": "f", "id": 88505, "pid": 76337, "tid": -914061504, "ts": 1716454222740847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222792503, "dur": 11, "args": { "External id": 88506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88506, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 88506, "pid": 5, "tid": 7, "ts": 1716454222792503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740849, "dur": 13, "args": { "External id": 88506, "cbid": 211, "correlation": 88506 } }, { "ph": "s", "id": 88506, "pid": 76337, "tid": -914061504, "ts": 1716454222740849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222792516, "dur": 5, "args": { "External id": 88508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88508, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 88508, "pid": 5, "tid": 7, "ts": 1716454222792516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740865, "dur": 6, "args": { "External id": 88508, "cbid": 211, "correlation": 88508 } }, { "ph": "s", "id": 88508, "pid": 76337, "tid": -914061504, "ts": 1716454222740865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740922, "dur": 1, "args": { "External id": 88519, "cbid": 251, "correlation": 88519 } }, { "ph": "f", "id": 88519, "pid": 76337, "tid": -914061504, "ts": 1716454222740922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222740926, "dur": 0, "args": { "External id": 88520, "cbid": 251, "correlation": 88520 } }, { "ph": "f", "id": 88520, "pid": 76337, "tid": -914061504, "ts": 1716454222740926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222792522, "dur": 7, "args": { "External id": 88521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88521, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 88521, "pid": 5, "tid": 7, "ts": 1716454222792522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740927, "dur": 11, "args": { "External id": 88521, "cbid": 211, "correlation": 88521 } }, { "ph": "s", "id": 88521, "pid": 76337, "tid": -914061504, "ts": 1716454222740927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222792531, "dur": 3, "args": { "External id": 88523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88523, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 88523, "pid": 5, "tid": 7, "ts": 1716454222792531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222740940, "dur": 6, "args": { "External id": 88523, "cbid": 211, "correlation": 88523 } }, { "ph": "s", "id": 88523, "pid": 76337, "tid": -914061504, "ts": 1716454222740940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222792535, "dur": 90, "args": { "External id": 88544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88544, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 88544, "pid": 5, "tid": 7, "ts": 1716454222792535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741025, "dur": 13, "args": { "External id": 88544, "cbid": 211, "correlation": 88544 } }, { "ph": "s", "id": 88544, "pid": 76337, "tid": -914061504, "ts": 1716454222741025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222741124, "dur": 1, "args": { "External id": 88562, "cbid": 251, "correlation": 88562 } }, { "ph": "f", "id": 88562, "pid": 76337, "tid": -914061504, "ts": 1716454222741124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222792626, "dur": 97, "args": { "External id": 88564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88564, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88564, "pid": 5, "tid": 7, "ts": 1716454222792626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741130, "dur": 14, "args": { "External id": 88564, "cbid": 211, "correlation": 88564 } }, { "ph": "s", "id": 88564, "pid": 76337, "tid": -914061504, "ts": 1716454222741130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222792724, "dur": 19, "args": { "External id": 88572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88572, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88572, "pid": 5, "tid": 7, "ts": 1716454222792724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741202, "dur": 12, "args": { "External id": 88572, "cbid": 211, "correlation": 88572 } }, { "ph": "s", "id": 88572, "pid": 76337, "tid": -914061504, "ts": 1716454222741202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222792745, "dur": 38, "args": { "External id": 88580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88580, "pid": 5, "tid": 7, "ts": 1716454222792745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741243, "dur": 10, "args": { "External id": 88580, "cbid": 211, "correlation": 88580 } }, { "ph": "s", "id": 88580, "pid": 76337, "tid": -914061504, "ts": 1716454222741243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222792784, "dur": 34, "args": { "External id": 88602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88602, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88602, "pid": 5, "tid": 7, "ts": 1716454222792784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741295, "dur": 10, "args": { "External id": 88602, "cbid": 211, "correlation": 88602 } }, { "ph": "s", "id": 88602, "pid": 76337, "tid": -914061504, "ts": 1716454222741295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222741386, "dur": 1, "args": { "External id": 88618, "cbid": 251, "correlation": 88618 } }, { "ph": "f", "id": 88618, "pid": 76337, "tid": -914061504, "ts": 1716454222741386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222741391, "dur": 0, "args": { "External id": 88620, "cbid": 251, "correlation": 88620 } }, { "ph": "f", "id": 88620, "pid": 76337, "tid": -914061504, "ts": 1716454222741391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222792819, "dur": 529, "args": { "External id": 88621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88621, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 88621, "pid": 5, "tid": 7, "ts": 1716454222792819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741395, "dur": 13, "args": { "External id": 88621, "cbid": 211, "correlation": 88621 } }, { "ph": "s", "id": 88621, "pid": 76337, "tid": -914061504, "ts": 1716454222741395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222793349, "dur": 125, "args": { "External id": 88629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88629, "pid": 5, "tid": 7, "ts": 1716454222793349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741463, "dur": 12, "args": { "External id": 88629, "cbid": 211, "correlation": 88629 } }, { "ph": "s", "id": 88629, "pid": 76337, "tid": -914061504, "ts": 1716454222741463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222793475, "dur": 127, "args": { "External id": 88637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88637, "pid": 5, "tid": 7, "ts": 1716454222793475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741495, "dur": 9, "args": { "External id": 88637, "cbid": 211, "correlation": 88637 } }, { "ph": "s", "id": 88637, "pid": 76337, "tid": -914061504, "ts": 1716454222741495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222741571, "dur": 1, "args": { "External id": 88653, "cbid": 251, "correlation": 88653 } }, { "ph": "f", "id": 88653, "pid": 76337, "tid": -914061504, "ts": 1716454222741571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222793603, "dur": 300, "args": { "External id": 88655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88655, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88655, "pid": 5, "tid": 7, "ts": 1716454222793603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741577, "dur": 12, "args": { "External id": 88655, "cbid": 211, "correlation": 88655 } }, { "ph": "s", "id": 88655, "pid": 76337, "tid": -914061504, "ts": 1716454222741577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222793905, "dur": 28, "args": { "External id": 88663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88663, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88663, "pid": 5, "tid": 7, "ts": 1716454222793905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741619, "dur": 10, "args": { "External id": 88663, "cbid": 211, "correlation": 88663 } }, { "ph": "s", "id": 88663, "pid": 76337, "tid": -914061504, "ts": 1716454222741619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222793933, "dur": 80, "args": { "External id": 88674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88674, "pid": 5, "tid": 7, "ts": 1716454222793933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741689, "dur": 12, "args": { "External id": 88674, "cbid": 211, "correlation": 88674 } }, { "ph": "s", "id": 88674, "pid": 76337, "tid": -914061504, "ts": 1716454222741689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222741755, "dur": 0, "args": { "External id": 88686, "cbid": 317, "correlation": 88686 } }, { "ph": "f", "id": 88686, "pid": 76337, "tid": -914061504, "ts": 1716454222741755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222741756, "dur": 0, "args": { "External id": 88687, "cbid": 203, "correlation": 88687 } }, { "ph": "f", "id": 88687, "pid": 76337, "tid": -914061504, "ts": 1716454222741756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222741757, "dur": 0, "args": { "External id": 88688, "cbid": 205, "correlation": 88688 } }, { "ph": "f", "id": 88688, "pid": 76337, "tid": -914061504, "ts": 1716454222741757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794014, "dur": 23, "args": { "External id": 88692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88692, "pid": 5, "tid": 7, "ts": 1716454222794014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741774, "dur": 12, "args": { "External id": 88692, "cbid": 211, "correlation": 88692 } }, { "ph": "s", "id": 88692, "pid": 76337, "tid": -914061504, "ts": 1716454222741774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222794038, "dur": 118, "args": { "External id": 88694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88694, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88694, "pid": 5, "tid": 7, "ts": 1716454222794038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741793, "dur": 7, "args": { "External id": 88694, "cbid": 211, "correlation": 88694 } }, { "ph": "s", "id": 88694, "pid": 76337, "tid": -914061504, "ts": 1716454222741793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794158, "dur": 22, "args": { "External id": 88696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88696, "pid": 5, "tid": 7, "ts": 1716454222794158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741804, "dur": 5, "args": { "External id": 88696, "cbid": 211, "correlation": 88696 } }, { "ph": "s", "id": 88696, "pid": 76337, "tid": -914061504, "ts": 1716454222741804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222794182, "dur": 32, "args": { "External id": 88702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88702, "pid": 5, "tid": 7, "ts": 1716454222794182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741831, "dur": 8, "args": { "External id": 88702, "cbid": 211, "correlation": 88702 } }, { "ph": "s", "id": 88702, "pid": 76337, "tid": -914061504, "ts": 1716454222741831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222794215, "dur": 27, "args": { "External id": 88710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88710, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88710, "pid": 5, "tid": 7, "ts": 1716454222794215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741864, "dur": 9, "args": { "External id": 88710, "cbid": 211, "correlation": 88710 } }, { "ph": "s", "id": 88710, "pid": 76337, "tid": -914061504, "ts": 1716454222741864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222741938, "dur": 0, "args": { "External id": 88720, "cbid": 317, "correlation": 88720 } }, { "ph": "f", "id": 88720, "pid": 76337, "tid": -914061504, "ts": 1716454222741938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222741939, "dur": 0, "args": { "External id": 88721, "cbid": 203, "correlation": 88721 } }, { "ph": "f", "id": 88721, "pid": 76337, "tid": -914061504, "ts": 1716454222741939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222741939, "dur": 0, "args": { "External id": 88722, "cbid": 205, "correlation": 88722 } }, { "ph": "f", "id": 88722, "pid": 76337, "tid": -914061504, "ts": 1716454222741939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794243, "dur": 23, "args": { "External id": 88726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88726, "pid": 5, "tid": 7, "ts": 1716454222794243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741954, "dur": 12, "args": { "External id": 88726, "cbid": 211, "correlation": 88726 } }, { "ph": "s", "id": 88726, "pid": 76337, "tid": -914061504, "ts": 1716454222741954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794267, "dur": 43, "args": { "External id": 88728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88728, "pid": 5, "tid": 7, "ts": 1716454222794267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741968, "dur": 14, "args": { "External id": 88728, "cbid": 211, "correlation": 88728 } }, { "ph": "s", "id": 88728, "pid": 76337, "tid": -914061504, "ts": 1716454222741968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222794312, "dur": 232, "args": { "External id": 88730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88730, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 88730, "pid": 5, "tid": 7, "ts": 1716454222794312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222741989, "dur": 7, "args": { "External id": 88730, "cbid": 211, "correlation": 88730 } }, { "ph": "s", "id": 88730, "pid": 76337, "tid": -914061504, "ts": 1716454222741989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794545, "dur": 6, "args": { "External id": 88732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88732, "pid": 5, "tid": 7, "ts": 1716454222794545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742000, "dur": 5, "args": { "External id": 88732, "cbid": 211, "correlation": 88732 } }, { "ph": "s", "id": 88732, "pid": 76337, "tid": -914061504, "ts": 1716454222742000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222794552, "dur": 9, "args": { "External id": 88738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88738, "pid": 5, "tid": 7, "ts": 1716454222794552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742027, "dur": 9, "args": { "External id": 88738, "cbid": 211, "correlation": 88738 } }, { "ph": "s", "id": 88738, "pid": 76337, "tid": -914061504, "ts": 1716454222742027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222794563, "dur": 12, "args": { "External id": 88758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88758, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88758, "pid": 5, "tid": 7, "ts": 1716454222794563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742125, "dur": 12, "args": { "External id": 88758, "cbid": 211, "correlation": 88758 } }, { "ph": "s", "id": 88758, "pid": 76337, "tid": -914061504, "ts": 1716454222742125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222794576, "dur": 4, "args": { "External id": 88770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88770, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 88770, "pid": 5, "tid": 7, "ts": 1716454222794576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742148, "dur": 7, "args": { "External id": 88770, "cbid": 211, "correlation": 88770 } }, { "ph": "s", "id": 88770, "pid": 76337, "tid": -914061504, "ts": 1716454222742148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222794581, "dur": 11, "args": { "External id": 88773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88773, "pid": 5, "tid": 7, "ts": 1716454222794581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742168, "dur": 7, "args": { "External id": 88773, "cbid": 211, "correlation": 88773 } }, { "ph": "s", "id": 88773, "pid": 76337, "tid": -914061504, "ts": 1716454222742168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222794594, "dur": 7, "args": { "External id": 88782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88782, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88782, "pid": 5, "tid": 7, "ts": 1716454222794594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742210, "dur": 10, "args": { "External id": 88782, "cbid": 211, "correlation": 88782 } }, { "ph": "s", "id": 88782, "pid": 76337, "tid": -914061504, "ts": 1716454222742210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222742261, "dur": 0, "args": { "External id": 88792, "cbid": 317, "correlation": 88792 } }, { "ph": "f", "id": 88792, "pid": 76337, "tid": -914061504, "ts": 1716454222742261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222742263, "dur": 0, "args": { "External id": 88793, "cbid": 203, "correlation": 88793 } }, { "ph": "f", "id": 88793, "pid": 76337, "tid": -914061504, "ts": 1716454222742263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222742263, "dur": 0, "args": { "External id": 88794, "cbid": 205, "correlation": 88794 } }, { "ph": "f", "id": 88794, "pid": 76337, "tid": -914061504, "ts": 1716454222742263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794602, "dur": 5, "args": { "External id": 88798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88798, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88798, "pid": 5, "tid": 7, "ts": 1716454222794602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742279, "dur": 11, "args": { "External id": 88798, "cbid": 211, "correlation": 88798 } }, { "ph": "s", "id": 88798, "pid": 76337, "tid": -914061504, "ts": 1716454222742279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222794608, "dur": 83, "args": { "External id": 88800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88800, "pid": 5, "tid": 7, "ts": 1716454222794608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742293, "dur": 5, "args": { "External id": 88800, "cbid": 211, "correlation": 88800 } }, { "ph": "s", "id": 88800, "pid": 76337, "tid": -914061504, "ts": 1716454222742293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222794694, "dur": 1, "args": { "External id": 88802, "device": 5, "context": 1, "stream": 7, "correlation": 88802, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 88802, "pid": 5, "tid": 7, "ts": 1716454222794694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222742306, "dur": 13, "args": { "External id": 88802, "cbid": 51, "correlation": 88802 } }, { "ph": "s", "id": 88802, "pid": 76337, "tid": -914061504, "ts": 1716454222742306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222794698, "dur": 535, "args": { "External id": 88803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88803, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88803, "pid": 5, "tid": 7, "ts": 1716454222794698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742320, "dur": 9, "args": { "External id": 88803, "cbid": 211, "correlation": 88803 } }, { "ph": "s", "id": 88803, "pid": 76337, "tid": -914061504, "ts": 1716454222742320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222795234, "dur": 11, "args": { "External id": 88805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88805, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88805, "pid": 5, "tid": 7, "ts": 1716454222795234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742333, "dur": 5, "args": { "External id": 88805, "cbid": 211, "correlation": 88805 } }, { "ph": "s", "id": 88805, "pid": 76337, "tid": -914061504, "ts": 1716454222742333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222795247, "dur": 14, "args": { "External id": 88811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88811, "pid": 5, "tid": 7, "ts": 1716454222795247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742361, "dur": 9, "args": { "External id": 88811, "cbid": 211, "correlation": 88811 } }, { "ph": "s", "id": 88811, "pid": 76337, "tid": -914061504, "ts": 1716454222742361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222795262, "dur": 3, "args": { "External id": 88819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88819, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 88819, "pid": 5, "tid": 7, "ts": 1716454222795262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742405, "dur": 9, "args": { "External id": 88819, "cbid": 211, "correlation": 88819 } }, { "ph": "s", "id": 88819, "pid": 76337, "tid": -914061504, "ts": 1716454222742405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222742473, "dur": 1, "args": { "External id": 88835, "cbid": 251, "correlation": 88835 } }, { "ph": "f", "id": 88835, "pid": 76337, "tid": -914061504, "ts": 1716454222742473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222742479, "dur": 0, "args": { "External id": 88837, "cbid": 251, "correlation": 88837 } }, { "ph": "f", "id": 88837, "pid": 76337, "tid": -914061504, "ts": 1716454222742479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222795267, "dur": 13, "args": { "External id": 88838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88838, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88838, "pid": 5, "tid": 7, "ts": 1716454222795267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742481, "dur": 12, "args": { "External id": 88838, "cbid": 211, "correlation": 88838 } }, { "ph": "s", "id": 88838, "pid": 76337, "tid": -914061504, "ts": 1716454222742481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222795281, "dur": 5, "args": { "External id": 88840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88840, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88840, "pid": 5, "tid": 7, "ts": 1716454222795281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742495, "dur": 5, "args": { "External id": 88840, "cbid": 211, "correlation": 88840 } }, { "ph": "s", "id": 88840, "pid": 76337, "tid": -914061504, "ts": 1716454222742495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222795287, "dur": 16, "args": { "External id": 88850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88850, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88850, "pid": 5, "tid": 7, "ts": 1716454222795287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742555, "dur": 12, "args": { "External id": 88850, "cbid": 211, "correlation": 88850 } }, { "ph": "s", "id": 88850, "pid": 76337, "tid": -914061504, "ts": 1716454222742555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222795305, "dur": 17, "args": { "External id": 88870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88870, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88870, "pid": 5, "tid": 7, "ts": 1716454222795305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742622, "dur": 11, "args": { "External id": 88870, "cbid": 211, "correlation": 88870 } }, { "ph": "s", "id": 88870, "pid": 76337, "tid": -914061504, "ts": 1716454222742622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222795323, "dur": 5, "args": { "External id": 88882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88882, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 88882, "pid": 5, "tid": 7, "ts": 1716454222795323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742644, "dur": 6, "args": { "External id": 88882, "cbid": 211, "correlation": 88882 } }, { "ph": "s", "id": 88882, "pid": 76337, "tid": -914061504, "ts": 1716454222742644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222795329, "dur": 17, "args": { "External id": 88885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88885, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88885, "pid": 5, "tid": 7, "ts": 1716454222795329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742663, "dur": 7, "args": { "External id": 88885, "cbid": 211, "correlation": 88885 } }, { "ph": "s", "id": 88885, "pid": 76337, "tid": -914061504, "ts": 1716454222742663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222795347, "dur": 11, "args": { "External id": 88894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88894, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88894, "pid": 5, "tid": 7, "ts": 1716454222795347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742704, "dur": 10, "args": { "External id": 88894, "cbid": 211, "correlation": 88894 } }, { "ph": "s", "id": 88894, "pid": 76337, "tid": -914061504, "ts": 1716454222742704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222742767, "dur": 0, "args": { "External id": 88904, "cbid": 317, "correlation": 88904 } }, { "ph": "f", "id": 88904, "pid": 76337, "tid": -914061504, "ts": 1716454222742767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222742768, "dur": 0, "args": { "External id": 88905, "cbid": 203, "correlation": 88905 } }, { "ph": "f", "id": 88905, "pid": 76337, "tid": -914061504, "ts": 1716454222742768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222742768, "dur": 0, "args": { "External id": 88906, "cbid": 205, "correlation": 88906 } }, { "ph": "f", "id": 88906, "pid": 76337, "tid": -914061504, "ts": 1716454222742768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222795359, "dur": 10, "args": { "External id": 88910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88910, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88910, "pid": 5, "tid": 7, "ts": 1716454222795359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742785, "dur": 12, "args": { "External id": 88910, "cbid": 211, "correlation": 88910 } }, { "ph": "s", "id": 88910, "pid": 76337, "tid": -914061504, "ts": 1716454222742785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222795371, "dur": 160, "args": { "External id": 88912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88912, "pid": 5, "tid": 7, "ts": 1716454222795371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742800, "dur": 5, "args": { "External id": 88912, "cbid": 211, "correlation": 88912 } }, { "ph": "s", "id": 88912, "pid": 76337, "tid": -914061504, "ts": 1716454222742800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222795533, "dur": 1, "args": { "External id": 88914, "device": 5, "context": 1, "stream": 7, "correlation": 88914, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 88914, "pid": 5, "tid": 7, "ts": 1716454222795533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222742812, "dur": 8, "args": { "External id": 88914, "cbid": 51, "correlation": 88914 } }, { "ph": "s", "id": 88914, "pid": 76337, "tid": -914061504, "ts": 1716454222742812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222795537, "dur": 652, "args": { "External id": 88915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88915, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 88915, "pid": 5, "tid": 7, "ts": 1716454222795537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742822, "dur": 8, "args": { "External id": 88915, "cbid": 211, "correlation": 88915 } }, { "ph": "s", "id": 88915, "pid": 76337, "tid": -914061504, "ts": 1716454222742822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222796191, "dur": 13, "args": { "External id": 88917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88917, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88917, "pid": 5, "tid": 7, "ts": 1716454222796191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742834, "dur": 5, "args": { "External id": 88917, "cbid": 211, "correlation": 88917 } }, { "ph": "s", "id": 88917, "pid": 76337, "tid": -914061504, "ts": 1716454222742834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222796205, "dur": 14, "args": { "External id": 88923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88923, "pid": 5, "tid": 7, "ts": 1716454222796205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742862, "dur": 8, "args": { "External id": 88923, "cbid": 211, "correlation": 88923 } }, { "ph": "s", "id": 88923, "pid": 76337, "tid": -914061504, "ts": 1716454222742862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222742920, "dur": 0, "args": { "External id": 88933, "cbid": 317, "correlation": 88933 } }, { "ph": "f", "id": 88933, "pid": 76337, "tid": -914061504, "ts": 1716454222742920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222742921, "dur": 0, "args": { "External id": 88934, "cbid": 203, "correlation": 88934 } }, { "ph": "f", "id": 88934, "pid": 76337, "tid": -914061504, "ts": 1716454222742921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222742922, "dur": 0, "args": { "External id": 88935, "cbid": 205, "correlation": 88935 } }, { "ph": "f", "id": 88935, "pid": 76337, "tid": -914061504, "ts": 1716454222742922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222796220, "dur": 8, "args": { "External id": 88939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88939, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88939, "pid": 5, "tid": 7, "ts": 1716454222796220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742936, "dur": 13, "args": { "External id": 88939, "cbid": 211, "correlation": 88939 } }, { "ph": "s", "id": 88939, "pid": 76337, "tid": -914061504, "ts": 1716454222742936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222796230, "dur": 4, "args": { "External id": 88941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 88941, "pid": 5, "tid": 7, "ts": 1716454222796230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742955, "dur": 6, "args": { "External id": 88941, "cbid": 211, "correlation": 88941 } }, { "ph": "s", "id": 88941, "pid": 76337, "tid": -914061504, "ts": 1716454222742955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222742966, "dur": 0, "args": { "External id": 88942, "cbid": 51, "correlation": 88942 } }, { "ph": "s", "id": 88942, "pid": 76337, "tid": -914061504, "ts": 1716454222742966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222796235, "dur": 56, "args": { "External id": 88943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88943, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 88943, "pid": 5, "tid": 7, "ts": 1716454222796235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222742966, "dur": 6, "args": { "External id": 88943, "cbid": 211, "correlation": 88943 } }, { "ph": "s", "id": 88943, "pid": 76337, "tid": -914061504, "ts": 1716454222742966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222796292, "dur": 14, "args": { "External id": 88948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88948, "pid": 5, "tid": 7, "ts": 1716454222796292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743000, "dur": 10, "args": { "External id": 88948, "cbid": 211, "correlation": 88948 } }, { "ph": "s", "id": 88948, "pid": 76337, "tid": -914061504, "ts": 1716454222743000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222796307, "dur": 12, "args": { "External id": 88956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88956, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88956, "pid": 5, "tid": 7, "ts": 1716454222796307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743030, "dur": 8, "args": { "External id": 88956, "cbid": 211, "correlation": 88956 } }, { "ph": "s", "id": 88956, "pid": 76337, "tid": -914061504, "ts": 1716454222743030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222796321, "dur": 10, "args": { "External id": 88964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88964, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88964, "pid": 5, "tid": 7, "ts": 1716454222796321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743061, "dur": 9, "args": { "External id": 88964, "cbid": 211, "correlation": 88964 } }, { "ph": "s", "id": 88964, "pid": 76337, "tid": -914061504, "ts": 1716454222743061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222796332, "dur": 18, "args": { "External id": 88984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88984, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 88984, "pid": 5, "tid": 7, "ts": 1716454222796332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743143, "dur": 12, "args": { "External id": 88984, "cbid": 211, "correlation": 88984 } }, { "ph": "s", "id": 88984, "pid": 76337, "tid": -914061504, "ts": 1716454222743143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222796351, "dur": 4, "args": { "External id": 88996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88996, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 88996, "pid": 5, "tid": 7, "ts": 1716454222796351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743165, "dur": 7, "args": { "External id": 88996, "cbid": 211, "correlation": 88996 } }, { "ph": "s", "id": 88996, "pid": 76337, "tid": -914061504, "ts": 1716454222743165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222796357, "dur": 17, "args": { "External id": 88999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 88999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 88999, "pid": 5, "tid": 7, "ts": 1716454222796357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743184, "dur": 6, "args": { "External id": 88999, "cbid": 211, "correlation": 88999 } }, { "ph": "s", "id": 88999, "pid": 76337, "tid": -914061504, "ts": 1716454222743184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222743241, "dur": 0, "args": { "External id": 89010, "cbid": 317, "correlation": 89010 } }, { "ph": "f", "id": 89010, "pid": 76337, "tid": -914061504, "ts": 1716454222743241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222743242, "dur": 0, "args": { "External id": 89011, "cbid": 203, "correlation": 89011 } }, { "ph": "f", "id": 89011, "pid": 76337, "tid": -914061504, "ts": 1716454222743242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222743243, "dur": 0, "args": { "External id": 89012, "cbid": 205, "correlation": 89012 } }, { "ph": "f", "id": 89012, "pid": 76337, "tid": -914061504, "ts": 1716454222743243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222796375, "dur": 12, "args": { "External id": 89016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89016, "pid": 5, "tid": 7, "ts": 1716454222796375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743258, "dur": 11, "args": { "External id": 89016, "cbid": 211, "correlation": 89016 } }, { "ph": "s", "id": 89016, "pid": 76337, "tid": -914061504, "ts": 1716454222743258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222796389, "dur": 3, "args": { "External id": 89018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89018, "pid": 5, "tid": 7, "ts": 1716454222796389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743275, "dur": 6, "args": { "External id": 89018, "cbid": 211, "correlation": 89018 } }, { "ph": "s", "id": 89018, "pid": 76337, "tid": -914061504, "ts": 1716454222743275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222743285, "dur": 0, "args": { "External id": 89019, "cbid": 51, "correlation": 89019 } }, { "ph": "s", "id": 89019, "pid": 76337, "tid": -914061504, "ts": 1716454222743285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222796393, "dur": 96, "args": { "External id": 89020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89020, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 89020, "pid": 5, "tid": 7, "ts": 1716454222796393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743286, "dur": 6, "args": { "External id": 89020, "cbid": 211, "correlation": 89020 } }, { "ph": "s", "id": 89020, "pid": 76337, "tid": -914061504, "ts": 1716454222743286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222796490, "dur": 15, "args": { "External id": 89025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89025, "pid": 5, "tid": 7, "ts": 1716454222796490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743314, "dur": 8, "args": { "External id": 89025, "cbid": 211, "correlation": 89025 } }, { "ph": "s", "id": 89025, "pid": 76337, "tid": -914061504, "ts": 1716454222743314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222796506, "dur": 81, "args": { "External id": 89034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89034, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89034, "pid": 5, "tid": 7, "ts": 1716454222796506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743400, "dur": 16, "args": { "External id": 89034, "cbid": 211, "correlation": 89034 } }, { "ph": "s", "id": 89034, "pid": 76337, "tid": -914061504, "ts": 1716454222743400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222796589, "dur": 30, "args": { "External id": 89056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89056, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89056, "pid": 5, "tid": 7, "ts": 1716454222796589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743460, "dur": 10, "args": { "External id": 89056, "cbid": 211, "correlation": 89056 } }, { "ph": "s", "id": 89056, "pid": 76337, "tid": -914061504, "ts": 1716454222743460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222743554, "dur": 1, "args": { "External id": 89067, "cbid": 251, "correlation": 89067 } }, { "ph": "f", "id": 89067, "pid": 76337, "tid": -914061504, "ts": 1716454222743554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222796621, "dur": 164, "args": { "External id": 89068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89068, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89068, "pid": 5, "tid": 7, "ts": 1716454222796621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743560, "dur": 14, "args": { "External id": 89068, "cbid": 211, "correlation": 89068 } }, { "ph": "s", "id": 89068, "pid": 76337, "tid": -914061504, "ts": 1716454222743560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222743632, "dur": 1, "args": { "External id": 89079, "cbid": 251, "correlation": 89079 } }, { "ph": "f", "id": 89079, "pid": 76337, "tid": -914061504, "ts": 1716454222743632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222796785, "dur": 157, "args": { "External id": 89080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89080, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89080, "pid": 5, "tid": 7, "ts": 1716454222796785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743636, "dur": 11, "args": { "External id": 89080, "cbid": 211, "correlation": 89080 } }, { "ph": "s", "id": 89080, "pid": 76337, "tid": -914061504, "ts": 1716454222743636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222743701, "dur": 1, "args": { "External id": 89091, "cbid": 251, "correlation": 89091 } }, { "ph": "f", "id": 89091, "pid": 76337, "tid": -914061504, "ts": 1716454222743701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222796944, "dur": 158, "args": { "External id": 89092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89092, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89092, "pid": 5, "tid": 7, "ts": 1716454222796944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743705, "dur": 11, "args": { "External id": 89092, "cbid": 211, "correlation": 89092 } }, { "ph": "s", "id": 89092, "pid": 76337, "tid": -914061504, "ts": 1716454222743705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222797103, "dur": 334, "args": { "External id": 89117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89117, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89117, "pid": 5, "tid": 7, "ts": 1716454222797103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743797, "dur": 14, "args": { "External id": 89117, "cbid": 211, "correlation": 89117 } }, { "ph": "s", "id": 89117, "pid": 76337, "tid": -914061504, "ts": 1716454222743797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222743904, "dur": 1, "args": { "External id": 89135, "cbid": 251, "correlation": 89135 } }, { "ph": "f", "id": 89135, "pid": 76337, "tid": -914061504, "ts": 1716454222743904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222797439, "dur": 165, "args": { "External id": 89137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89137, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89137, "pid": 5, "tid": 7, "ts": 1716454222797439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743910, "dur": 13, "args": { "External id": 89137, "cbid": 211, "correlation": 89137 } }, { "ph": "s", "id": 89137, "pid": 76337, "tid": -914061504, "ts": 1716454222743910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222797605, "dur": 19, "args": { "External id": 89145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89145, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89145, "pid": 5, "tid": 7, "ts": 1716454222797605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222743991, "dur": 13, "args": { "External id": 89145, "cbid": 211, "correlation": 89145 } }, { "ph": "s", "id": 89145, "pid": 76337, "tid": -914061504, "ts": 1716454222743991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222797625, "dur": 28, "args": { "External id": 89153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89153, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89153, "pid": 5, "tid": 7, "ts": 1716454222797625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744033, "dur": 9, "args": { "External id": 89153, "cbid": 211, "correlation": 89153 } }, { "ph": "s", "id": 89153, "pid": 76337, "tid": -914061504, "ts": 1716454222744033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222797654, "dur": 18, "args": { "External id": 89164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89164, "pid": 5, "tid": 7, "ts": 1716454222797654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744109, "dur": 13, "args": { "External id": 89164, "cbid": 211, "correlation": 89164 } }, { "ph": "s", "id": 89164, "pid": 76337, "tid": -914061504, "ts": 1716454222744109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222797673, "dur": 16, "args": { "External id": 89186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89186, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89186, "pid": 5, "tid": 7, "ts": 1716454222797673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744142, "dur": 7, "args": { "External id": 89186, "cbid": 211, "correlation": 89186 } }, { "ph": "s", "id": 89186, "pid": 76337, "tid": -914061504, "ts": 1716454222744142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744228, "dur": 2, "args": { "External id": 89197, "cbid": 251, "correlation": 89197 } }, { "ph": "f", "id": 89197, "pid": 76337, "tid": -914061504, "ts": 1716454222744228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222797690, "dur": 88, "args": { "External id": 89198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89198, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89198, "pid": 5, "tid": 7, "ts": 1716454222797690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744234, "dur": 15, "args": { "External id": 89198, "cbid": 211, "correlation": 89198 } }, { "ph": "s", "id": 89198, "pid": 76337, "tid": -914061504, "ts": 1716454222744234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744309, "dur": 1, "args": { "External id": 89209, "cbid": 251, "correlation": 89209 } }, { "ph": "f", "id": 89209, "pid": 76337, "tid": -914061504, "ts": 1716454222744309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744313, "dur": 0, "args": { "External id": 89210, "cbid": 251, "correlation": 89210 } }, { "ph": "f", "id": 89210, "pid": 76337, "tid": -914061504, "ts": 1716454222744313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222797780, "dur": 11, "args": { "External id": 89211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89211, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89211, "pid": 5, "tid": 7, "ts": 1716454222797780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744314, "dur": 12, "args": { "External id": 89211, "cbid": 211, "correlation": 89211 } }, { "ph": "s", "id": 89211, "pid": 76337, "tid": -914061504, "ts": 1716454222744314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222797792, "dur": 5, "args": { "External id": 89213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89213, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89213, "pid": 5, "tid": 7, "ts": 1716454222797792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744329, "dur": 6, "args": { "External id": 89213, "cbid": 211, "correlation": 89213 } }, { "ph": "s", "id": 89213, "pid": 76337, "tid": -914061504, "ts": 1716454222744329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744387, "dur": 1, "args": { "External id": 89224, "cbid": 251, "correlation": 89224 } }, { "ph": "f", "id": 89224, "pid": 76337, "tid": -914061504, "ts": 1716454222744387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744390, "dur": 0, "args": { "External id": 89225, "cbid": 251, "correlation": 89225 } }, { "ph": "f", "id": 89225, "pid": 76337, "tid": -914061504, "ts": 1716454222744390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222797799, "dur": 8, "args": { "External id": 89226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89226, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89226, "pid": 5, "tid": 7, "ts": 1716454222797799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744391, "dur": 11, "args": { "External id": 89226, "cbid": 211, "correlation": 89226 } }, { "ph": "s", "id": 89226, "pid": 76337, "tid": -914061504, "ts": 1716454222744391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222797809, "dur": 3, "args": { "External id": 89228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89228, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89228, "pid": 5, "tid": 7, "ts": 1716454222797809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744404, "dur": 6, "args": { "External id": 89228, "cbid": 211, "correlation": 89228 } }, { "ph": "s", "id": 89228, "pid": 76337, "tid": -914061504, "ts": 1716454222744404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222797813, "dur": 54, "args": { "External id": 89253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89253, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89253, "pid": 5, "tid": 7, "ts": 1716454222797813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744483, "dur": 13, "args": { "External id": 89253, "cbid": 211, "correlation": 89253 } }, { "ph": "s", "id": 89253, "pid": 76337, "tid": -914061504, "ts": 1716454222744483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744583, "dur": 2, "args": { "External id": 89271, "cbid": 251, "correlation": 89271 } }, { "ph": "f", "id": 89271, "pid": 76337, "tid": -914061504, "ts": 1716454222744583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222797869, "dur": 90, "args": { "External id": 89273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89273, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89273, "pid": 5, "tid": 7, "ts": 1716454222797869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744590, "dur": 14, "args": { "External id": 89273, "cbid": 211, "correlation": 89273 } }, { "ph": "s", "id": 89273, "pid": 76337, "tid": -914061504, "ts": 1716454222744590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222797960, "dur": 10, "args": { "External id": 89281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89281, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89281, "pid": 5, "tid": 7, "ts": 1716454222797960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744661, "dur": 12, "args": { "External id": 89281, "cbid": 211, "correlation": 89281 } }, { "ph": "s", "id": 89281, "pid": 76337, "tid": -914061504, "ts": 1716454222744661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222797971, "dur": 22, "args": { "External id": 89289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89289, "pid": 5, "tid": 7, "ts": 1716454222797971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744704, "dur": 9, "args": { "External id": 89289, "cbid": 211, "correlation": 89289 } }, { "ph": "s", "id": 89289, "pid": 76337, "tid": -914061504, "ts": 1716454222744704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222797994, "dur": 18, "args": { "External id": 89311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89311, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89311, "pid": 5, "tid": 7, "ts": 1716454222797994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744755, "dur": 11, "args": { "External id": 89311, "cbid": 211, "correlation": 89311 } }, { "ph": "s", "id": 89311, "pid": 76337, "tid": -914061504, "ts": 1716454222744755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744849, "dur": 2, "args": { "External id": 89327, "cbid": 251, "correlation": 89327 } }, { "ph": "f", "id": 89327, "pid": 76337, "tid": -914061504, "ts": 1716454222744849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222744854, "dur": 0, "args": { "External id": 89329, "cbid": 251, "correlation": 89329 } }, { "ph": "f", "id": 89329, "pid": 76337, "tid": -914061504, "ts": 1716454222744854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222798013, "dur": 493, "args": { "External id": 89330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89330, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89330, "pid": 5, "tid": 7, "ts": 1716454222798013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744857, "dur": 16, "args": { "External id": 89330, "cbid": 211, "correlation": 89330 } }, { "ph": "s", "id": 89330, "pid": 76337, "tid": -914061504, "ts": 1716454222744857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222798508, "dur": 66, "args": { "External id": 89338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89338, "pid": 5, "tid": 7, "ts": 1716454222798508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744932, "dur": 13, "args": { "External id": 89338, "cbid": 211, "correlation": 89338 } }, { "ph": "s", "id": 89338, "pid": 76337, "tid": -914061504, "ts": 1716454222744932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222798575, "dur": 69, "args": { "External id": 89346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89346, "pid": 5, "tid": 7, "ts": 1716454222798575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222744964, "dur": 8, "args": { "External id": 89346, "cbid": 211, "correlation": 89346 } }, { "ph": "s", "id": 89346, "pid": 76337, "tid": -914061504, "ts": 1716454222744964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222745054, "dur": 1, "args": { "External id": 89362, "cbid": 251, "correlation": 89362 } }, { "ph": "f", "id": 89362, "pid": 76337, "tid": -914061504, "ts": 1716454222745054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222798647, "dur": 1, "args": { "External id": 89364, "device": 5, "context": 1, "stream": 7, "correlation": 89364, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 89364, "pid": 5, "tid": 7, "ts": 1716454222798647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222745059, "dur": 12, "args": { "External id": 89364, "cbid": 51, "correlation": 89364 } }, { "ph": "s", "id": 89364, "pid": 76337, "tid": -914061504, "ts": 1716454222745059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222798650, "dur": 266, "args": { "External id": 89365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89365, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89365, "pid": 5, "tid": 7, "ts": 1716454222798650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745073, "dur": 11, "args": { "External id": 89365, "cbid": 211, "correlation": 89365 } }, { "ph": "s", "id": 89365, "pid": 76337, "tid": -914061504, "ts": 1716454222745073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222798918, "dur": 13, "args": { "External id": 89373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89373, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89373, "pid": 5, "tid": 7, "ts": 1716454222798918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745117, "dur": 10, "args": { "External id": 89373, "cbid": 211, "correlation": 89373 } }, { "ph": "s", "id": 89373, "pid": 76337, "tid": -914061504, "ts": 1716454222745117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222798933, "dur": 38, "args": { "External id": 89384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89384, "pid": 5, "tid": 7, "ts": 1716454222798933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745187, "dur": 13, "args": { "External id": 89384, "cbid": 211, "correlation": 89384 } }, { "ph": "s", "id": 89384, "pid": 76337, "tid": -914061504, "ts": 1716454222745187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222745254, "dur": 0, "args": { "External id": 89396, "cbid": 317, "correlation": 89396 } }, { "ph": "f", "id": 89396, "pid": 76337, "tid": -914061504, "ts": 1716454222745254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222745255, "dur": 0, "args": { "External id": 89397, "cbid": 203, "correlation": 89397 } }, { "ph": "f", "id": 89397, "pid": 76337, "tid": -914061504, "ts": 1716454222745255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222745256, "dur": 0, "args": { "External id": 89398, "cbid": 205, "correlation": 89398 } }, { "ph": "f", "id": 89398, "pid": 76337, "tid": -914061504, "ts": 1716454222745256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222798972, "dur": 12, "args": { "External id": 89402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89402, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89402, "pid": 5, "tid": 7, "ts": 1716454222798972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745275, "dur": 12, "args": { "External id": 89402, "cbid": 211, "correlation": 89402 } }, { "ph": "s", "id": 89402, "pid": 76337, "tid": -914061504, "ts": 1716454222745275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222798986, "dur": 4, "args": { "External id": 89404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89404, "pid": 5, "tid": 7, "ts": 1716454222798986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745292, "dur": 6, "args": { "External id": 89404, "cbid": 211, "correlation": 89404 } }, { "ph": "s", "id": 89404, "pid": 76337, "tid": -914061504, "ts": 1716454222745292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222745301, "dur": 0, "args": { "External id": 89405, "cbid": 51, "correlation": 89405 } }, { "ph": "s", "id": 89405, "pid": 76337, "tid": -914061504, "ts": 1716454222745301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222798991, "dur": 96, "args": { "External id": 89406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89406, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 89406, "pid": 5, "tid": 7, "ts": 1716454222798991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745302, "dur": 6, "args": { "External id": 89406, "cbid": 211, "correlation": 89406 } }, { "ph": "s", "id": 89406, "pid": 76337, "tid": -914061504, "ts": 1716454222745302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222799089, "dur": 16, "args": { "External id": 89411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89411, "pid": 5, "tid": 7, "ts": 1716454222799089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745331, "dur": 9, "args": { "External id": 89411, "cbid": 211, "correlation": 89411 } }, { "ph": "s", "id": 89411, "pid": 76337, "tid": -914061504, "ts": 1716454222745331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222799106, "dur": 11, "args": { "External id": 89419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89419, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89419, "pid": 5, "tid": 7, "ts": 1716454222799106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745363, "dur": 8, "args": { "External id": 89419, "cbid": 211, "correlation": 89419 } }, { "ph": "s", "id": 89419, "pid": 76337, "tid": -914061504, "ts": 1716454222745363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222799118, "dur": 18, "args": { "External id": 89439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89439, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 89439, "pid": 5, "tid": 7, "ts": 1716454222799118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745440, "dur": 12, "args": { "External id": 89439, "cbid": 211, "correlation": 89439 } }, { "ph": "s", "id": 89439, "pid": 76337, "tid": -914061504, "ts": 1716454222745440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222799138, "dur": 5, "args": { "External id": 89451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89451, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 89451, "pid": 5, "tid": 7, "ts": 1716454222799138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745462, "dur": 6, "args": { "External id": 89451, "cbid": 211, "correlation": 89451 } }, { "ph": "s", "id": 89451, "pid": 76337, "tid": -914061504, "ts": 1716454222745462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222799144, "dur": 18, "args": { "External id": 89454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89454, "pid": 5, "tid": 7, "ts": 1716454222799144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745480, "dur": 6, "args": { "External id": 89454, "cbid": 211, "correlation": 89454 } }, { "ph": "s", "id": 89454, "pid": 76337, "tid": -914061504, "ts": 1716454222745480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222799163, "dur": 12, "args": { "External id": 89463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89463, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89463, "pid": 5, "tid": 7, "ts": 1716454222799163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745522, "dur": 10, "args": { "External id": 89463, "cbid": 211, "correlation": 89463 } }, { "ph": "s", "id": 89463, "pid": 76337, "tid": -914061504, "ts": 1716454222745522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222745573, "dur": 0, "args": { "External id": 89473, "cbid": 317, "correlation": 89473 } }, { "ph": "f", "id": 89473, "pid": 76337, "tid": -914061504, "ts": 1716454222745573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222745574, "dur": 0, "args": { "External id": 89474, "cbid": 203, "correlation": 89474 } }, { "ph": "f", "id": 89474, "pid": 76337, "tid": -914061504, "ts": 1716454222745574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222745575, "dur": 0, "args": { "External id": 89475, "cbid": 205, "correlation": 89475 } }, { "ph": "f", "id": 89475, "pid": 76337, "tid": -914061504, "ts": 1716454222745575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222799176, "dur": 11, "args": { "External id": 89479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89479, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89479, "pid": 5, "tid": 7, "ts": 1716454222799176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745588, "dur": 11, "args": { "External id": 89479, "cbid": 211, "correlation": 89479 } }, { "ph": "s", "id": 89479, "pid": 76337, "tid": -914061504, "ts": 1716454222745588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222799188, "dur": 160, "args": { "External id": 89481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89481, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89481, "pid": 5, "tid": 7, "ts": 1716454222799188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745602, "dur": 5, "args": { "External id": 89481, "cbid": 211, "correlation": 89481 } }, { "ph": "s", "id": 89481, "pid": 76337, "tid": -914061504, "ts": 1716454222745602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222799351, "dur": 1, "args": { "External id": 89483, "device": 5, "context": 1, "stream": 7, "correlation": 89483, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 89483, "pid": 5, "tid": 7, "ts": 1716454222799351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222745614, "dur": 7, "args": { "External id": 89483, "cbid": 51, "correlation": 89483 } }, { "ph": "s", "id": 89483, "pid": 76337, "tid": -914061504, "ts": 1716454222745614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222799354, "dur": 656, "args": { "External id": 89484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89484, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89484, "pid": 5, "tid": 7, "ts": 1716454222799354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745623, "dur": 7, "args": { "External id": 89484, "cbid": 211, "correlation": 89484 } }, { "ph": "s", "id": 89484, "pid": 76337, "tid": -914061504, "ts": 1716454222745623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222800011, "dur": 13, "args": { "External id": 89486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89486, "pid": 5, "tid": 7, "ts": 1716454222800011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745634, "dur": 5, "args": { "External id": 89486, "cbid": 211, "correlation": 89486 } }, { "ph": "s", "id": 89486, "pid": 76337, "tid": -914061504, "ts": 1716454222745634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222800026, "dur": 14, "args": { "External id": 89492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89492, "pid": 5, "tid": 7, "ts": 1716454222800026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745663, "dur": 10, "args": { "External id": 89492, "cbid": 211, "correlation": 89492 } }, { "ph": "s", "id": 89492, "pid": 76337, "tid": -914061504, "ts": 1716454222745663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222800041, "dur": 3, "args": { "External id": 89500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89500, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 89500, "pid": 5, "tid": 7, "ts": 1716454222800041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745707, "dur": 9, "args": { "External id": 89500, "cbid": 211, "correlation": 89500 } }, { "ph": "s", "id": 89500, "pid": 76337, "tid": -914061504, "ts": 1716454222745707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222745776, "dur": 1, "args": { "External id": 89516, "cbid": 251, "correlation": 89516 } }, { "ph": "f", "id": 89516, "pid": 76337, "tid": -914061504, "ts": 1716454222745776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222745781, "dur": 0, "args": { "External id": 89518, "cbid": 251, "correlation": 89518 } }, { "ph": "f", "id": 89518, "pid": 76337, "tid": -914061504, "ts": 1716454222745781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222800046, "dur": 13, "args": { "External id": 89519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89519, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89519, "pid": 5, "tid": 7, "ts": 1716454222800046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745783, "dur": 12, "args": { "External id": 89519, "cbid": 211, "correlation": 89519 } }, { "ph": "s", "id": 89519, "pid": 76337, "tid": -914061504, "ts": 1716454222745783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222800060, "dur": 5, "args": { "External id": 89521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89521, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89521, "pid": 5, "tid": 7, "ts": 1716454222800060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745797, "dur": 6, "args": { "External id": 89521, "cbid": 211, "correlation": 89521 } }, { "ph": "s", "id": 89521, "pid": 76337, "tid": -914061504, "ts": 1716454222745797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222800067, "dur": 17, "args": { "External id": 89531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89531, "pid": 5, "tid": 7, "ts": 1716454222800067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745857, "dur": 12, "args": { "External id": 89531, "cbid": 211, "correlation": 89531 } }, { "ph": "s", "id": 89531, "pid": 76337, "tid": -914061504, "ts": 1716454222745857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222800085, "dur": 17, "args": { "External id": 89551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89551, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 89551, "pid": 5, "tid": 7, "ts": 1716454222800085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745923, "dur": 11, "args": { "External id": 89551, "cbid": 211, "correlation": 89551 } }, { "ph": "s", "id": 89551, "pid": 76337, "tid": -914061504, "ts": 1716454222745923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222800104, "dur": 4, "args": { "External id": 89563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89563, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 89563, "pid": 5, "tid": 7, "ts": 1716454222800104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745944, "dur": 6, "args": { "External id": 89563, "cbid": 211, "correlation": 89563 } }, { "ph": "s", "id": 89563, "pid": 76337, "tid": -914061504, "ts": 1716454222745944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222800109, "dur": 16, "args": { "External id": 89566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89566, "pid": 5, "tid": 7, "ts": 1716454222800109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222745964, "dur": 6, "args": { "External id": 89566, "cbid": 211, "correlation": 89566 } }, { "ph": "s", "id": 89566, "pid": 76337, "tid": -914061504, "ts": 1716454222745964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222800127, "dur": 11, "args": { "External id": 89575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89575, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89575, "pid": 5, "tid": 7, "ts": 1716454222800127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746013, "dur": 11, "args": { "External id": 89575, "cbid": 211, "correlation": 89575 } }, { "ph": "s", "id": 89575, "pid": 76337, "tid": -914061504, "ts": 1716454222746013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222746078, "dur": 0, "args": { "External id": 89585, "cbid": 317, "correlation": 89585 } }, { "ph": "f", "id": 89585, "pid": 76337, "tid": -914061504, "ts": 1716454222746078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222746079, "dur": 0, "args": { "External id": 89586, "cbid": 203, "correlation": 89586 } }, { "ph": "f", "id": 89586, "pid": 76337, "tid": -914061504, "ts": 1716454222746079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222746080, "dur": 0, "args": { "External id": 89587, "cbid": 205, "correlation": 89587 } }, { "ph": "f", "id": 89587, "pid": 76337, "tid": -914061504, "ts": 1716454222746080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222800139, "dur": 10, "args": { "External id": 89591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89591, "pid": 5, "tid": 7, "ts": 1716454222800139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746094, "dur": 12, "args": { "External id": 89591, "cbid": 211, "correlation": 89591 } }, { "ph": "s", "id": 89591, "pid": 76337, "tid": -914061504, "ts": 1716454222746094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222800150, "dur": 159, "args": { "External id": 89593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89593, "pid": 5, "tid": 7, "ts": 1716454222800150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746108, "dur": 5, "args": { "External id": 89593, "cbid": 211, "correlation": 89593 } }, { "ph": "s", "id": 89593, "pid": 76337, "tid": -914061504, "ts": 1716454222746108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222800311, "dur": 1, "args": { "External id": 89595, "device": 5, "context": 1, "stream": 7, "correlation": 89595, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 89595, "pid": 5, "tid": 7, "ts": 1716454222800311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222746119, "dur": 6, "args": { "External id": 89595, "cbid": 51, "correlation": 89595 } }, { "ph": "s", "id": 89595, "pid": 76337, "tid": -914061504, "ts": 1716454222746119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222800315, "dur": 637, "args": { "External id": 89596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89596, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89596, "pid": 5, "tid": 7, "ts": 1716454222800315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746127, "dur": 7, "args": { "External id": 89596, "cbid": 211, "correlation": 89596 } }, { "ph": "s", "id": 89596, "pid": 76337, "tid": -914061504, "ts": 1716454222746127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222800954, "dur": 12, "args": { "External id": 89598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89598, "pid": 5, "tid": 7, "ts": 1716454222800954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746139, "dur": 5, "args": { "External id": 89598, "cbid": 211, "correlation": 89598 } }, { "ph": "s", "id": 89598, "pid": 76337, "tid": -914061504, "ts": 1716454222746139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222800967, "dur": 14, "args": { "External id": 89604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89604, "pid": 5, "tid": 7, "ts": 1716454222800967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746168, "dur": 8, "args": { "External id": 89604, "cbid": 211, "correlation": 89604 } }, { "ph": "s", "id": 89604, "pid": 76337, "tid": -914061504, "ts": 1716454222746168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222800983, "dur": 12, "args": { "External id": 89612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89612, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89612, "pid": 5, "tid": 7, "ts": 1716454222800983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746201, "dur": 8, "args": { "External id": 89612, "cbid": 211, "correlation": 89612 } }, { "ph": "s", "id": 89612, "pid": 76337, "tid": -914061504, "ts": 1716454222746201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222800996, "dur": 10, "args": { "External id": 89620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89620, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89620, "pid": 5, "tid": 7, "ts": 1716454222800996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746230, "dur": 9, "args": { "External id": 89620, "cbid": 211, "correlation": 89620 } }, { "ph": "s", "id": 89620, "pid": 76337, "tid": -914061504, "ts": 1716454222746230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222801008, "dur": 18, "args": { "External id": 89640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89640, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 89640, "pid": 5, "tid": 7, "ts": 1716454222801008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746312, "dur": 12, "args": { "External id": 89640, "cbid": 211, "correlation": 89640 } }, { "ph": "s", "id": 89640, "pid": 76337, "tid": -914061504, "ts": 1716454222746312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222801027, "dur": 4, "args": { "External id": 89652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89652, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 89652, "pid": 5, "tid": 7, "ts": 1716454222801027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746334, "dur": 6, "args": { "External id": 89652, "cbid": 211, "correlation": 89652 } }, { "ph": "s", "id": 89652, "pid": 76337, "tid": -914061504, "ts": 1716454222746334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222801032, "dur": 16, "args": { "External id": 89655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89655, "pid": 5, "tid": 7, "ts": 1716454222801032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746352, "dur": 7, "args": { "External id": 89655, "cbid": 211, "correlation": 89655 } }, { "ph": "s", "id": 89655, "pid": 76337, "tid": -914061504, "ts": 1716454222746352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222746409, "dur": 0, "args": { "External id": 89666, "cbid": 317, "correlation": 89666 } }, { "ph": "f", "id": 89666, "pid": 76337, "tid": -914061504, "ts": 1716454222746409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222746410, "dur": 0, "args": { "External id": 89667, "cbid": 203, "correlation": 89667 } }, { "ph": "f", "id": 89667, "pid": 76337, "tid": -914061504, "ts": 1716454222746410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222746411, "dur": 0, "args": { "External id": 89668, "cbid": 205, "correlation": 89668 } }, { "ph": "f", "id": 89668, "pid": 76337, "tid": -914061504, "ts": 1716454222746411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222801049, "dur": 11, "args": { "External id": 89672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89672, "pid": 5, "tid": 7, "ts": 1716454222801049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746425, "dur": 11, "args": { "External id": 89672, "cbid": 211, "correlation": 89672 } }, { "ph": "s", "id": 89672, "pid": 76337, "tid": -914061504, "ts": 1716454222746425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222801062, "dur": 4, "args": { "External id": 89674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89674, "pid": 5, "tid": 7, "ts": 1716454222801062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746441, "dur": 7, "args": { "External id": 89674, "cbid": 211, "correlation": 89674 } }, { "ph": "s", "id": 89674, "pid": 76337, "tid": -914061504, "ts": 1716454222746441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222746450, "dur": 0, "args": { "External id": 89675, "cbid": 51, "correlation": 89675 } }, { "ph": "s", "id": 89675, "pid": 76337, "tid": -914061504, "ts": 1716454222746450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222801067, "dur": 93, "args": { "External id": 89676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89676, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 89676, "pid": 5, "tid": 7, "ts": 1716454222801067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746451, "dur": 5, "args": { "External id": 89676, "cbid": 211, "correlation": 89676 } }, { "ph": "s", "id": 89676, "pid": 76337, "tid": -914061504, "ts": 1716454222746451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222801161, "dur": 16, "args": { "External id": 89681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89681, "pid": 5, "tid": 7, "ts": 1716454222801161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746479, "dur": 8, "args": { "External id": 89681, "cbid": 211, "correlation": 89681 } }, { "ph": "s", "id": 89681, "pid": 76337, "tid": -914061504, "ts": 1716454222746479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222801178, "dur": 83, "args": { "External id": 89690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89690, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89690, "pid": 5, "tid": 7, "ts": 1716454222801178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746565, "dur": 16, "args": { "External id": 89690, "cbid": 211, "correlation": 89690 } }, { "ph": "s", "id": 89690, "pid": 76337, "tid": -914061504, "ts": 1716454222746565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222801262, "dur": 29, "args": { "External id": 89712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89712, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89712, "pid": 5, "tid": 7, "ts": 1716454222801262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746624, "dur": 10, "args": { "External id": 89712, "cbid": 211, "correlation": 89712 } }, { "ph": "s", "id": 89712, "pid": 76337, "tid": -914061504, "ts": 1716454222746624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222746717, "dur": 2, "args": { "External id": 89723, "cbid": 251, "correlation": 89723 } }, { "ph": "f", "id": 89723, "pid": 76337, "tid": -914061504, "ts": 1716454222746717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222801293, "dur": 160, "args": { "External id": 89724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89724, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89724, "pid": 5, "tid": 7, "ts": 1716454222801293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746722, "dur": 13, "args": { "External id": 89724, "cbid": 211, "correlation": 89724 } }, { "ph": "s", "id": 89724, "pid": 76337, "tid": -914061504, "ts": 1716454222746722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222746793, "dur": 1, "args": { "External id": 89735, "cbid": 251, "correlation": 89735 } }, { "ph": "f", "id": 89735, "pid": 76337, "tid": -914061504, "ts": 1716454222746793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222801454, "dur": 156, "args": { "External id": 89736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89736, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89736, "pid": 5, "tid": 7, "ts": 1716454222801454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746797, "dur": 11, "args": { "External id": 89736, "cbid": 211, "correlation": 89736 } }, { "ph": "s", "id": 89736, "pid": 76337, "tid": -914061504, "ts": 1716454222746797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222746862, "dur": 1, "args": { "External id": 89747, "cbid": 251, "correlation": 89747 } }, { "ph": "f", "id": 89747, "pid": 76337, "tid": -914061504, "ts": 1716454222746862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222801612, "dur": 156, "args": { "External id": 89748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89748, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89748, "pid": 5, "tid": 7, "ts": 1716454222801612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746866, "dur": 12, "args": { "External id": 89748, "cbid": 211, "correlation": 89748 } }, { "ph": "s", "id": 89748, "pid": 76337, "tid": -914061504, "ts": 1716454222746866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222801769, "dur": 332, "args": { "External id": 89773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89773, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89773, "pid": 5, "tid": 7, "ts": 1716454222801769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222746956, "dur": 14, "args": { "External id": 89773, "cbid": 211, "correlation": 89773 } }, { "ph": "s", "id": 89773, "pid": 76337, "tid": -914061504, "ts": 1716454222746956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747073, "dur": 1, "args": { "External id": 89791, "cbid": 251, "correlation": 89791 } }, { "ph": "f", "id": 89791, "pid": 76337, "tid": -914061504, "ts": 1716454222747073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222802103, "dur": 164, "args": { "External id": 89793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89793, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89793, "pid": 5, "tid": 7, "ts": 1716454222802103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747079, "dur": 14, "args": { "External id": 89793, "cbid": 211, "correlation": 89793 } }, { "ph": "s", "id": 89793, "pid": 76337, "tid": -914061504, "ts": 1716454222747079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222802268, "dur": 20, "args": { "External id": 89801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89801, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89801, "pid": 5, "tid": 7, "ts": 1716454222802268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747151, "dur": 13, "args": { "External id": 89801, "cbid": 211, "correlation": 89801 } }, { "ph": "s", "id": 89801, "pid": 76337, "tid": -914061504, "ts": 1716454222747151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222802289, "dur": 27, "args": { "External id": 89809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89809, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89809, "pid": 5, "tid": 7, "ts": 1716454222802289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747192, "dur": 8, "args": { "External id": 89809, "cbid": 211, "correlation": 89809 } }, { "ph": "s", "id": 89809, "pid": 76337, "tid": -914061504, "ts": 1716454222747192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222802318, "dur": 18, "args": { "External id": 89820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89820, "pid": 5, "tid": 7, "ts": 1716454222802318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747265, "dur": 14, "args": { "External id": 89820, "cbid": 211, "correlation": 89820 } }, { "ph": "s", "id": 89820, "pid": 76337, "tid": -914061504, "ts": 1716454222747265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222802337, "dur": 16, "args": { "External id": 89842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89842, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89842, "pid": 5, "tid": 7, "ts": 1716454222802337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747298, "dur": 7, "args": { "External id": 89842, "cbid": 211, "correlation": 89842 } }, { "ph": "s", "id": 89842, "pid": 76337, "tid": -914061504, "ts": 1716454222747298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747384, "dur": 2, "args": { "External id": 89853, "cbid": 251, "correlation": 89853 } }, { "ph": "f", "id": 89853, "pid": 76337, "tid": -914061504, "ts": 1716454222747384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222802355, "dur": 87, "args": { "External id": 89854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89854, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89854, "pid": 5, "tid": 7, "ts": 1716454222802355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747390, "dur": 14, "args": { "External id": 89854, "cbid": 211, "correlation": 89854 } }, { "ph": "s", "id": 89854, "pid": 76337, "tid": -914061504, "ts": 1716454222747390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747461, "dur": 1, "args": { "External id": 89865, "cbid": 251, "correlation": 89865 } }, { "ph": "f", "id": 89865, "pid": 76337, "tid": -914061504, "ts": 1716454222747461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747465, "dur": 0, "args": { "External id": 89866, "cbid": 251, "correlation": 89866 } }, { "ph": "f", "id": 89866, "pid": 76337, "tid": -914061504, "ts": 1716454222747465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222802443, "dur": 11, "args": { "External id": 89867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89867, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89867, "pid": 5, "tid": 7, "ts": 1716454222802443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747466, "dur": 12, "args": { "External id": 89867, "cbid": 211, "correlation": 89867 } }, { "ph": "s", "id": 89867, "pid": 76337, "tid": -914061504, "ts": 1716454222747466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222802456, "dur": 5, "args": { "External id": 89869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89869, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89869, "pid": 5, "tid": 7, "ts": 1716454222802456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747480, "dur": 7, "args": { "External id": 89869, "cbid": 211, "correlation": 89869 } }, { "ph": "s", "id": 89869, "pid": 76337, "tid": -914061504, "ts": 1716454222747480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747539, "dur": 1, "args": { "External id": 89880, "cbid": 251, "correlation": 89880 } }, { "ph": "f", "id": 89880, "pid": 76337, "tid": -914061504, "ts": 1716454222747539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747542, "dur": 0, "args": { "External id": 89881, "cbid": 251, "correlation": 89881 } }, { "ph": "f", "id": 89881, "pid": 76337, "tid": -914061504, "ts": 1716454222747542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222802463, "dur": 8, "args": { "External id": 89882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89882, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89882, "pid": 5, "tid": 7, "ts": 1716454222802463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747544, "dur": 12, "args": { "External id": 89882, "cbid": 211, "correlation": 89882 } }, { "ph": "s", "id": 89882, "pid": 76337, "tid": -914061504, "ts": 1716454222747544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222802472, "dur": 4, "args": { "External id": 89884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89884, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89884, "pid": 5, "tid": 7, "ts": 1716454222802472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747557, "dur": 6, "args": { "External id": 89884, "cbid": 211, "correlation": 89884 } }, { "ph": "s", "id": 89884, "pid": 76337, "tid": -914061504, "ts": 1716454222747557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222802477, "dur": 54, "args": { "External id": 89909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89909, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89909, "pid": 5, "tid": 7, "ts": 1716454222802477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747636, "dur": 13, "args": { "External id": 89909, "cbid": 211, "correlation": 89909 } }, { "ph": "s", "id": 89909, "pid": 76337, "tid": -914061504, "ts": 1716454222747636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222747739, "dur": 2, "args": { "External id": 89927, "cbid": 251, "correlation": 89927 } }, { "ph": "f", "id": 89927, "pid": 76337, "tid": -914061504, "ts": 1716454222747739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222802532, "dur": 90, "args": { "External id": 89929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89929, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 89929, "pid": 5, "tid": 7, "ts": 1716454222802532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747746, "dur": 14, "args": { "External id": 89929, "cbid": 211, "correlation": 89929 } }, { "ph": "s", "id": 89929, "pid": 76337, "tid": -914061504, "ts": 1716454222747746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222802623, "dur": 10, "args": { "External id": 89937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89937, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89937, "pid": 5, "tid": 7, "ts": 1716454222802623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747816, "dur": 13, "args": { "External id": 89937, "cbid": 211, "correlation": 89937 } }, { "ph": "s", "id": 89937, "pid": 76337, "tid": -914061504, "ts": 1716454222747816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222802634, "dur": 22, "args": { "External id": 89945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89945, "pid": 5, "tid": 7, "ts": 1716454222802634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747859, "dur": 10, "args": { "External id": 89945, "cbid": 211, "correlation": 89945 } }, { "ph": "s", "id": 89945, "pid": 76337, "tid": -914061504, "ts": 1716454222747859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222802657, "dur": 17, "args": { "External id": 89967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89967, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89967, "pid": 5, "tid": 7, "ts": 1716454222802657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222747912, "dur": 10, "args": { "External id": 89967, "cbid": 211, "correlation": 89967 } }, { "ph": "s", "id": 89967, "pid": 76337, "tid": -914061504, "ts": 1716454222747912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222748007, "dur": 2, "args": { "External id": 89983, "cbid": 251, "correlation": 89983 } }, { "ph": "f", "id": 89983, "pid": 76337, "tid": -914061504, "ts": 1716454222748007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222748013, "dur": 0, "args": { "External id": 89985, "cbid": 251, "correlation": 89985 } }, { "ph": "f", "id": 89985, "pid": 76337, "tid": -914061504, "ts": 1716454222748013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222802675, "dur": 490, "args": { "External id": 89986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89986, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 89986, "pid": 5, "tid": 7, "ts": 1716454222802675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748015, "dur": 14, "args": { "External id": 89986, "cbid": 211, "correlation": 89986 } }, { "ph": "s", "id": 89986, "pid": 76337, "tid": -914061504, "ts": 1716454222748015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222803167, "dur": 65, "args": { "External id": 89994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 89994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 89994, "pid": 5, "tid": 7, "ts": 1716454222803167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748087, "dur": 13, "args": { "External id": 89994, "cbid": 211, "correlation": 89994 } }, { "ph": "s", "id": 89994, "pid": 76337, "tid": -914061504, "ts": 1716454222748087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222803233, "dur": 67, "args": { "External id": 90002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90002, "pid": 5, "tid": 7, "ts": 1716454222803233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748120, "dur": 9, "args": { "External id": 90002, "cbid": 211, "correlation": 90002 } }, { "ph": "s", "id": 90002, "pid": 76337, "tid": -914061504, "ts": 1716454222748120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222748203, "dur": 1, "args": { "External id": 90018, "cbid": 251, "correlation": 90018 } }, { "ph": "f", "id": 90018, "pid": 76337, "tid": -914061504, "ts": 1716454222748203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222803302, "dur": 1, "args": { "External id": 90020, "device": 5, "context": 1, "stream": 7, "correlation": 90020, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90020, "pid": 5, "tid": 7, "ts": 1716454222803302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222748208, "dur": 11, "args": { "External id": 90020, "cbid": 51, "correlation": 90020 } }, { "ph": "s", "id": 90020, "pid": 76337, "tid": -914061504, "ts": 1716454222748208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222803305, "dur": 271, "args": { "External id": 90021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90021, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 90021, "pid": 5, "tid": 7, "ts": 1716454222803305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748220, "dur": 11, "args": { "External id": 90021, "cbid": 211, "correlation": 90021 } }, { "ph": "s", "id": 90021, "pid": 76337, "tid": -914061504, "ts": 1716454222748220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222803578, "dur": 13, "args": { "External id": 90029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90029, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90029, "pid": 5, "tid": 7, "ts": 1716454222803578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748264, "dur": 10, "args": { "External id": 90029, "cbid": 211, "correlation": 90029 } }, { "ph": "s", "id": 90029, "pid": 76337, "tid": -914061504, "ts": 1716454222748264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222803593, "dur": 37, "args": { "External id": 90040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90040, "pid": 5, "tid": 7, "ts": 1716454222803593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748334, "dur": 13, "args": { "External id": 90040, "cbid": 211, "correlation": 90040 } }, { "ph": "s", "id": 90040, "pid": 76337, "tid": -914061504, "ts": 1716454222748334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222748401, "dur": 0, "args": { "External id": 90052, "cbid": 317, "correlation": 90052 } }, { "ph": "f", "id": 90052, "pid": 76337, "tid": -914061504, "ts": 1716454222748401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222748402, "dur": 0, "args": { "External id": 90053, "cbid": 203, "correlation": 90053 } }, { "ph": "f", "id": 90053, "pid": 76337, "tid": -914061504, "ts": 1716454222748402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222748403, "dur": 0, "args": { "External id": 90054, "cbid": 205, "correlation": 90054 } }, { "ph": "f", "id": 90054, "pid": 76337, "tid": -914061504, "ts": 1716454222748403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222803631, "dur": 12, "args": { "External id": 90058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90058, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90058, "pid": 5, "tid": 7, "ts": 1716454222803631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748420, "dur": 12, "args": { "External id": 90058, "cbid": 211, "correlation": 90058 } }, { "ph": "s", "id": 90058, "pid": 76337, "tid": -914061504, "ts": 1716454222748420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222803645, "dur": 4, "args": { "External id": 90060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 90060, "pid": 5, "tid": 7, "ts": 1716454222803645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748437, "dur": 6, "args": { "External id": 90060, "cbid": 211, "correlation": 90060 } }, { "ph": "s", "id": 90060, "pid": 76337, "tid": -914061504, "ts": 1716454222748437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222748446, "dur": 0, "args": { "External id": 90061, "cbid": 51, "correlation": 90061 } }, { "ph": "s", "id": 90061, "pid": 76337, "tid": -914061504, "ts": 1716454222748446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222803650, "dur": 96, "args": { "External id": 90062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90062, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 90062, "pid": 5, "tid": 7, "ts": 1716454222803650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748447, "dur": 5, "args": { "External id": 90062, "cbid": 211, "correlation": 90062 } }, { "ph": "s", "id": 90062, "pid": 76337, "tid": -914061504, "ts": 1716454222748447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222803747, "dur": 16, "args": { "External id": 90067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90067, "pid": 5, "tid": 7, "ts": 1716454222803747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748474, "dur": 8, "args": { "External id": 90067, "cbid": 211, "correlation": 90067 } }, { "ph": "s", "id": 90067, "pid": 76337, "tid": -914061504, "ts": 1716454222748474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222803765, "dur": 11, "args": { "External id": 90075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90075, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90075, "pid": 5, "tid": 7, "ts": 1716454222803765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748506, "dur": 8, "args": { "External id": 90075, "cbid": 211, "correlation": 90075 } }, { "ph": "s", "id": 90075, "pid": 76337, "tid": -914061504, "ts": 1716454222748506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222748579, "dur": 0, "args": { "External id": 90085, "cbid": 317, "correlation": 90085 } }, { "ph": "f", "id": 90085, "pid": 76337, "tid": -914061504, "ts": 1716454222748579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222748580, "dur": 0, "args": { "External id": 90086, "cbid": 203, "correlation": 90086 } }, { "ph": "f", "id": 90086, "pid": 76337, "tid": -914061504, "ts": 1716454222748580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222748581, "dur": 0, "args": { "External id": 90087, "cbid": 205, "correlation": 90087 } }, { "ph": "f", "id": 90087, "pid": 76337, "tid": -914061504, "ts": 1716454222748581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222803777, "dur": 12, "args": { "External id": 90091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90091, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90091, "pid": 5, "tid": 7, "ts": 1716454222803777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748596, "dur": 12, "args": { "External id": 90091, "cbid": 211, "correlation": 90091 } }, { "ph": "s", "id": 90091, "pid": 76337, "tid": -914061504, "ts": 1716454222748596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222803790, "dur": 159, "args": { "External id": 90093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90093, "pid": 5, "tid": 7, "ts": 1716454222803790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748610, "dur": 5, "args": { "External id": 90093, "cbid": 211, "correlation": 90093 } }, { "ph": "s", "id": 90093, "pid": 76337, "tid": -914061504, "ts": 1716454222748610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222803951, "dur": 1, "args": { "External id": 90095, "device": 5, "context": 1, "stream": 7, "correlation": 90095, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 90095, "pid": 5, "tid": 7, "ts": 1716454222803951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222748623, "dur": 7, "args": { "External id": 90095, "cbid": 51, "correlation": 90095 } }, { "ph": "s", "id": 90095, "pid": 76337, "tid": -914061504, "ts": 1716454222748623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222803955, "dur": 196, "args": { "External id": 90096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90096, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 90096, "pid": 5, "tid": 7, "ts": 1716454222803955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748632, "dur": 7, "args": { "External id": 90096, "cbid": 211, "correlation": 90096 } }, { "ph": "s", "id": 90096, "pid": 76337, "tid": -914061504, "ts": 1716454222748632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804153, "dur": 6, "args": { "External id": 90098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90098, "pid": 5, "tid": 7, "ts": 1716454222804153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748644, "dur": 5, "args": { "External id": 90098, "cbid": 211, "correlation": 90098 } }, { "ph": "s", "id": 90098, "pid": 76337, "tid": -914061504, "ts": 1716454222748644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222804160, "dur": 6, "args": { "External id": 90104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90104, "pid": 5, "tid": 7, "ts": 1716454222804160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748671, "dur": 8, "args": { "External id": 90104, "cbid": 211, "correlation": 90104 } }, { "ph": "s", "id": 90104, "pid": 76337, "tid": -914061504, "ts": 1716454222748671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222804168, "dur": 11, "args": { "External id": 90124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90124, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90124, "pid": 5, "tid": 7, "ts": 1716454222804168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748770, "dur": 12, "args": { "External id": 90124, "cbid": 211, "correlation": 90124 } }, { "ph": "s", "id": 90124, "pid": 76337, "tid": -914061504, "ts": 1716454222748770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222804180, "dur": 4, "args": { "External id": 90136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90136, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90136, "pid": 5, "tid": 7, "ts": 1716454222804180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748793, "dur": 7, "args": { "External id": 90136, "cbid": 211, "correlation": 90136 } }, { "ph": "s", "id": 90136, "pid": 76337, "tid": -914061504, "ts": 1716454222748793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222804185, "dur": 8, "args": { "External id": 90139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90139, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90139, "pid": 5, "tid": 7, "ts": 1716454222804185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748813, "dur": 6, "args": { "External id": 90139, "cbid": 211, "correlation": 90139 } }, { "ph": "s", "id": 90139, "pid": 76337, "tid": -914061504, "ts": 1716454222748813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222804195, "dur": 5, "args": { "External id": 90148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90148, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90148, "pid": 5, "tid": 7, "ts": 1716454222804195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748855, "dur": 11, "args": { "External id": 90148, "cbid": 211, "correlation": 90148 } }, { "ph": "s", "id": 90148, "pid": 76337, "tid": -914061504, "ts": 1716454222748855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222748908, "dur": 0, "args": { "External id": 90158, "cbid": 317, "correlation": 90158 } }, { "ph": "f", "id": 90158, "pid": 76337, "tid": -914061504, "ts": 1716454222748908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222748909, "dur": 0, "args": { "External id": 90159, "cbid": 203, "correlation": 90159 } }, { "ph": "f", "id": 90159, "pid": 76337, "tid": -914061504, "ts": 1716454222748909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222748909, "dur": 0, "args": { "External id": 90160, "cbid": 205, "correlation": 90160 } }, { "ph": "f", "id": 90160, "pid": 76337, "tid": -914061504, "ts": 1716454222748909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804201, "dur": 5, "args": { "External id": 90164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90164, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90164, "pid": 5, "tid": 7, "ts": 1716454222804201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748925, "dur": 12, "args": { "External id": 90164, "cbid": 211, "correlation": 90164 } }, { "ph": "s", "id": 90164, "pid": 76337, "tid": -914061504, "ts": 1716454222748925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804208, "dur": 159, "args": { "External id": 90166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90166, "pid": 5, "tid": 7, "ts": 1716454222804208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748940, "dur": 5, "args": { "External id": 90166, "cbid": 211, "correlation": 90166 } }, { "ph": "s", "id": 90166, "pid": 76337, "tid": -914061504, "ts": 1716454222748940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222804369, "dur": 1, "args": { "External id": 90168, "device": 5, "context": 1, "stream": 7, "correlation": 90168, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90168, "pid": 5, "tid": 7, "ts": 1716454222804369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222748951, "dur": 6, "args": { "External id": 90168, "cbid": 51, "correlation": 90168 } }, { "ph": "s", "id": 90168, "pid": 76337, "tid": -914061504, "ts": 1716454222748951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222804373, "dur": 266, "args": { "External id": 90169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90169, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90169, "pid": 5, "tid": 7, "ts": 1716454222804373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748959, "dur": 6, "args": { "External id": 90169, "cbid": 211, "correlation": 90169 } }, { "ph": "s", "id": 90169, "pid": 76337, "tid": -914061504, "ts": 1716454222748959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804640, "dur": 5, "args": { "External id": 90171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90171, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90171, "pid": 5, "tid": 7, "ts": 1716454222804640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222748970, "dur": 13, "args": { "External id": 90171, "cbid": 211, "correlation": 90171 } }, { "ph": "s", "id": 90171, "pid": 76337, "tid": -914061504, "ts": 1716454222748970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222804646, "dur": 6, "args": { "External id": 90177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90177, "pid": 5, "tid": 7, "ts": 1716454222804646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749007, "dur": 9, "args": { "External id": 90177, "cbid": 211, "correlation": 90177 } }, { "ph": "s", "id": 90177, "pid": 76337, "tid": -914061504, "ts": 1716454222749007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222804654, "dur": 3, "args": { "External id": 90185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90185, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 90185, "pid": 5, "tid": 7, "ts": 1716454222804654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749052, "dur": 9, "args": { "External id": 90185, "cbid": 211, "correlation": 90185 } }, { "ph": "s", "id": 90185, "pid": 76337, "tid": -914061504, "ts": 1716454222749052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222749119, "dur": 1, "args": { "External id": 90201, "cbid": 251, "correlation": 90201 } }, { "ph": "f", "id": 90201, "pid": 76337, "tid": -914061504, "ts": 1716454222749119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222749124, "dur": 0, "args": { "External id": 90203, "cbid": 251, "correlation": 90203 } }, { "ph": "f", "id": 90203, "pid": 76337, "tid": -914061504, "ts": 1716454222749124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222804658, "dur": 13, "args": { "External id": 90204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90204, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90204, "pid": 5, "tid": 7, "ts": 1716454222804658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749126, "dur": 12, "args": { "External id": 90204, "cbid": 211, "correlation": 90204 } }, { "ph": "s", "id": 90204, "pid": 76337, "tid": -914061504, "ts": 1716454222749126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222804672, "dur": 5, "args": { "External id": 90206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90206, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90206, "pid": 5, "tid": 7, "ts": 1716454222804672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749140, "dur": 6, "args": { "External id": 90206, "cbid": 211, "correlation": 90206 } }, { "ph": "s", "id": 90206, "pid": 76337, "tid": -914061504, "ts": 1716454222749140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222804679, "dur": 6, "args": { "External id": 90216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90216, "pid": 5, "tid": 7, "ts": 1716454222804679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749200, "dur": 12, "args": { "External id": 90216, "cbid": 211, "correlation": 90216 } }, { "ph": "s", "id": 90216, "pid": 76337, "tid": -914061504, "ts": 1716454222749200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222804686, "dur": 10, "args": { "External id": 90236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90236, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90236, "pid": 5, "tid": 7, "ts": 1716454222804686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749267, "dur": 11, "args": { "External id": 90236, "cbid": 211, "correlation": 90236 } }, { "ph": "s", "id": 90236, "pid": 76337, "tid": -914061504, "ts": 1716454222749267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222804697, "dur": 4, "args": { "External id": 90248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90248, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90248, "pid": 5, "tid": 7, "ts": 1716454222804697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749288, "dur": 6, "args": { "External id": 90248, "cbid": 211, "correlation": 90248 } }, { "ph": "s", "id": 90248, "pid": 76337, "tid": -914061504, "ts": 1716454222749288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222804702, "dur": 7, "args": { "External id": 90251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90251, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90251, "pid": 5, "tid": 7, "ts": 1716454222804702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749307, "dur": 7, "args": { "External id": 90251, "cbid": 211, "correlation": 90251 } }, { "ph": "s", "id": 90251, "pid": 76337, "tid": -914061504, "ts": 1716454222749307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222804709, "dur": 4, "args": { "External id": 90260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90260, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90260, "pid": 5, "tid": 7, "ts": 1716454222804709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749348, "dur": 10, "args": { "External id": 90260, "cbid": 211, "correlation": 90260 } }, { "ph": "s", "id": 90260, "pid": 76337, "tid": -914061504, "ts": 1716454222749348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222749412, "dur": 0, "args": { "External id": 90270, "cbid": 317, "correlation": 90270 } }, { "ph": "f", "id": 90270, "pid": 76337, "tid": -914061504, "ts": 1716454222749412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222749413, "dur": 0, "args": { "External id": 90271, "cbid": 203, "correlation": 90271 } }, { "ph": "f", "id": 90271, "pid": 76337, "tid": -914061504, "ts": 1716454222749413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222749414, "dur": 0, "args": { "External id": 90272, "cbid": 205, "correlation": 90272 } }, { "ph": "f", "id": 90272, "pid": 76337, "tid": -914061504, "ts": 1716454222749414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804715, "dur": 5, "args": { "External id": 90276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90276, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90276, "pid": 5, "tid": 7, "ts": 1716454222804715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749429, "dur": 12, "args": { "External id": 90276, "cbid": 211, "correlation": 90276 } }, { "ph": "s", "id": 90276, "pid": 76337, "tid": -914061504, "ts": 1716454222749429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222804721, "dur": 159, "args": { "External id": 90278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90278, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90278, "pid": 5, "tid": 7, "ts": 1716454222804721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749443, "dur": 5, "args": { "External id": 90278, "cbid": 211, "correlation": 90278 } }, { "ph": "s", "id": 90278, "pid": 76337, "tid": -914061504, "ts": 1716454222749443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222804882, "dur": 1, "args": { "External id": 90280, "device": 5, "context": 1, "stream": 7, "correlation": 90280, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90280, "pid": 5, "tid": 7, "ts": 1716454222804882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222749454, "dur": 6, "args": { "External id": 90280, "cbid": 51, "correlation": 90280 } }, { "ph": "s", "id": 90280, "pid": 76337, "tid": -914061504, "ts": 1716454222749454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222804886, "dur": 255, "args": { "External id": 90281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90281, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90281, "pid": 5, "tid": 7, "ts": 1716454222804886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749461, "dur": 6, "args": { "External id": 90281, "cbid": 211, "correlation": 90281 } }, { "ph": "s", "id": 90281, "pid": 76337, "tid": -914061504, "ts": 1716454222749461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805142, "dur": 6, "args": { "External id": 90283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90283, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90283, "pid": 5, "tid": 7, "ts": 1716454222805142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749472, "dur": 6, "args": { "External id": 90283, "cbid": 211, "correlation": 90283 } }, { "ph": "s", "id": 90283, "pid": 76337, "tid": -914061504, "ts": 1716454222749472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222805149, "dur": 6, "args": { "External id": 90289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90289, "pid": 5, "tid": 7, "ts": 1716454222805149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749500, "dur": 8, "args": { "External id": 90289, "cbid": 211, "correlation": 90289 } }, { "ph": "s", "id": 90289, "pid": 76337, "tid": -914061504, "ts": 1716454222749500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222805157, "dur": 5, "args": { "External id": 90297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90297, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90297, "pid": 5, "tid": 7, "ts": 1716454222805157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749533, "dur": 9, "args": { "External id": 90297, "cbid": 211, "correlation": 90297 } }, { "ph": "s", "id": 90297, "pid": 76337, "tid": -914061504, "ts": 1716454222749533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222805163, "dur": 4, "args": { "External id": 90305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90305, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90305, "pid": 5, "tid": 7, "ts": 1716454222805163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749562, "dur": 9, "args": { "External id": 90305, "cbid": 211, "correlation": 90305 } }, { "ph": "s", "id": 90305, "pid": 76337, "tid": -914061504, "ts": 1716454222749562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222805169, "dur": 9, "args": { "External id": 90325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90325, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90325, "pid": 5, "tid": 7, "ts": 1716454222805169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749637, "dur": 13, "args": { "External id": 90325, "cbid": 211, "correlation": 90325 } }, { "ph": "s", "id": 90325, "pid": 76337, "tid": -914061504, "ts": 1716454222749637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222805179, "dur": 4, "args": { "External id": 90337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90337, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90337, "pid": 5, "tid": 7, "ts": 1716454222805179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749660, "dur": 6, "args": { "External id": 90337, "cbid": 211, "correlation": 90337 } }, { "ph": "s", "id": 90337, "pid": 76337, "tid": -914061504, "ts": 1716454222749660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222805184, "dur": 7, "args": { "External id": 90340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90340, "pid": 5, "tid": 7, "ts": 1716454222805184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749678, "dur": 7, "args": { "External id": 90340, "cbid": 211, "correlation": 90340 } }, { "ph": "s", "id": 90340, "pid": 76337, "tid": -914061504, "ts": 1716454222749678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222805192, "dur": 4, "args": { "External id": 90349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90349, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90349, "pid": 5, "tid": 7, "ts": 1716454222805192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749717, "dur": 9, "args": { "External id": 90349, "cbid": 211, "correlation": 90349 } }, { "ph": "s", "id": 90349, "pid": 76337, "tid": -914061504, "ts": 1716454222749717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222749768, "dur": 0, "args": { "External id": 90359, "cbid": 317, "correlation": 90359 } }, { "ph": "f", "id": 90359, "pid": 76337, "tid": -914061504, "ts": 1716454222749768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222749769, "dur": 0, "args": { "External id": 90360, "cbid": 203, "correlation": 90360 } }, { "ph": "f", "id": 90360, "pid": 76337, "tid": -914061504, "ts": 1716454222749769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222749770, "dur": 0, "args": { "External id": 90361, "cbid": 205, "correlation": 90361 } }, { "ph": "f", "id": 90361, "pid": 76337, "tid": -914061504, "ts": 1716454222749770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805198, "dur": 5, "args": { "External id": 90365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90365, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90365, "pid": 5, "tid": 7, "ts": 1716454222805198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749783, "dur": 11, "args": { "External id": 90365, "cbid": 211, "correlation": 90365 } }, { "ph": "s", "id": 90365, "pid": 76337, "tid": -914061504, "ts": 1716454222749783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805204, "dur": 160, "args": { "External id": 90367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90367, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90367, "pid": 5, "tid": 7, "ts": 1716454222805204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749797, "dur": 5, "args": { "External id": 90367, "cbid": 211, "correlation": 90367 } }, { "ph": "s", "id": 90367, "pid": 76337, "tid": -914061504, "ts": 1716454222749797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222805367, "dur": 1, "args": { "External id": 90369, "device": 5, "context": 1, "stream": 7, "correlation": 90369, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 90369, "pid": 5, "tid": 7, "ts": 1716454222805367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222749808, "dur": 7, "args": { "External id": 90369, "cbid": 51, "correlation": 90369 } }, { "ph": "s", "id": 90369, "pid": 76337, "tid": -914061504, "ts": 1716454222749808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222805370, "dur": 255, "args": { "External id": 90370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90370, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90370, "pid": 5, "tid": 7, "ts": 1716454222805370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749815, "dur": 6, "args": { "External id": 90370, "cbid": 211, "correlation": 90370 } }, { "ph": "s", "id": 90370, "pid": 76337, "tid": -914061504, "ts": 1716454222749815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805626, "dur": 6, "args": { "External id": 90372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90372, "pid": 5, "tid": 7, "ts": 1716454222805626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749826, "dur": 5, "args": { "External id": 90372, "cbid": 211, "correlation": 90372 } }, { "ph": "s", "id": 90372, "pid": 76337, "tid": -914061504, "ts": 1716454222749826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222805633, "dur": 6, "args": { "External id": 90378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90378, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90378, "pid": 5, "tid": 7, "ts": 1716454222805633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749853, "dur": 9, "args": { "External id": 90378, "cbid": 211, "correlation": 90378 } }, { "ph": "s", "id": 90378, "pid": 76337, "tid": -914061504, "ts": 1716454222749853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222805640, "dur": 3, "args": { "External id": 90386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90386, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 90386, "pid": 5, "tid": 7, "ts": 1716454222805640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749897, "dur": 9, "args": { "External id": 90386, "cbid": 211, "correlation": 90386 } }, { "ph": "s", "id": 90386, "pid": 76337, "tid": -914061504, "ts": 1716454222749897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222749961, "dur": 1, "args": { "External id": 90402, "cbid": 251, "correlation": 90402 } }, { "ph": "f", "id": 90402, "pid": 76337, "tid": -914061504, "ts": 1716454222749961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222749966, "dur": 0, "args": { "External id": 90404, "cbid": 251, "correlation": 90404 } }, { "ph": "f", "id": 90404, "pid": 76337, "tid": -914061504, "ts": 1716454222749966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222805645, "dur": 10, "args": { "External id": 90405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90405, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90405, "pid": 5, "tid": 7, "ts": 1716454222805645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749968, "dur": 21, "args": { "External id": 90405, "cbid": 211, "correlation": 90405 } }, { "ph": "s", "id": 90405, "pid": 76337, "tid": -914061504, "ts": 1716454222749968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222805656, "dur": 4, "args": { "External id": 90407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90407, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90407, "pid": 5, "tid": 7, "ts": 1716454222805656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222749991, "dur": 6, "args": { "External id": 90407, "cbid": 211, "correlation": 90407 } }, { "ph": "s", "id": 90407, "pid": 76337, "tid": -914061504, "ts": 1716454222749991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222805661, "dur": 6, "args": { "External id": 90417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90417, "pid": 5, "tid": 7, "ts": 1716454222805661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750049, "dur": 12, "args": { "External id": 90417, "cbid": 211, "correlation": 90417 } }, { "ph": "s", "id": 90417, "pid": 76337, "tid": -914061504, "ts": 1716454222750049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222805667, "dur": 9, "args": { "External id": 90437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90437, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90437, "pid": 5, "tid": 7, "ts": 1716454222805667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750115, "dur": 11, "args": { "External id": 90437, "cbid": 211, "correlation": 90437 } }, { "ph": "s", "id": 90437, "pid": 76337, "tid": -914061504, "ts": 1716454222750115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222805678, "dur": 4, "args": { "External id": 90449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90449, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90449, "pid": 5, "tid": 7, "ts": 1716454222805678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750136, "dur": 6, "args": { "External id": 90449, "cbid": 211, "correlation": 90449 } }, { "ph": "s", "id": 90449, "pid": 76337, "tid": -914061504, "ts": 1716454222750136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222805683, "dur": 6, "args": { "External id": 90452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90452, "pid": 5, "tid": 7, "ts": 1716454222805683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750157, "dur": 6, "args": { "External id": 90452, "cbid": 211, "correlation": 90452 } }, { "ph": "s", "id": 90452, "pid": 76337, "tid": -914061504, "ts": 1716454222750157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222805691, "dur": 4, "args": { "External id": 90461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90461, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90461, "pid": 5, "tid": 7, "ts": 1716454222805691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750197, "dur": 10, "args": { "External id": 90461, "cbid": 211, "correlation": 90461 } }, { "ph": "s", "id": 90461, "pid": 76337, "tid": -914061504, "ts": 1716454222750197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222750260, "dur": 0, "args": { "External id": 90471, "cbid": 317, "correlation": 90471 } }, { "ph": "f", "id": 90471, "pid": 76337, "tid": -914061504, "ts": 1716454222750260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222750261, "dur": 0, "args": { "External id": 90472, "cbid": 203, "correlation": 90472 } }, { "ph": "f", "id": 90472, "pid": 76337, "tid": -914061504, "ts": 1716454222750261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222750262, "dur": 0, "args": { "External id": 90473, "cbid": 205, "correlation": 90473 } }, { "ph": "f", "id": 90473, "pid": 76337, "tid": -914061504, "ts": 1716454222750262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805696, "dur": 5, "args": { "External id": 90477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90477, "pid": 5, "tid": 7, "ts": 1716454222805696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750275, "dur": 12, "args": { "External id": 90477, "cbid": 211, "correlation": 90477 } }, { "ph": "s", "id": 90477, "pid": 76337, "tid": -914061504, "ts": 1716454222750275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222805703, "dur": 159, "args": { "External id": 90479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90479, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90479, "pid": 5, "tid": 7, "ts": 1716454222805703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750290, "dur": 5, "args": { "External id": 90479, "cbid": 211, "correlation": 90479 } }, { "ph": "s", "id": 90479, "pid": 76337, "tid": -914061504, "ts": 1716454222750290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222805864, "dur": 1, "args": { "External id": 90481, "device": 5, "context": 1, "stream": 7, "correlation": 90481, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90481, "pid": 5, "tid": 7, "ts": 1716454222805864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222750301, "dur": 6, "args": { "External id": 90481, "cbid": 51, "correlation": 90481 } }, { "ph": "s", "id": 90481, "pid": 76337, "tid": -914061504, "ts": 1716454222750301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222805868, "dur": 253, "args": { "External id": 90482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90482, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90482, "pid": 5, "tid": 7, "ts": 1716454222805868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750308, "dur": 6, "args": { "External id": 90482, "cbid": 211, "correlation": 90482 } }, { "ph": "s", "id": 90482, "pid": 76337, "tid": -914061504, "ts": 1716454222750308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806122, "dur": 6, "args": { "External id": 90484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90484, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90484, "pid": 5, "tid": 7, "ts": 1716454222806122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750320, "dur": 5, "args": { "External id": 90484, "cbid": 211, "correlation": 90484 } }, { "ph": "s", "id": 90484, "pid": 76337, "tid": -914061504, "ts": 1716454222750320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222806129, "dur": 6, "args": { "External id": 90490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90490, "pid": 5, "tid": 7, "ts": 1716454222806129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750348, "dur": 9, "args": { "External id": 90490, "cbid": 211, "correlation": 90490 } }, { "ph": "s", "id": 90490, "pid": 76337, "tid": -914061504, "ts": 1716454222750348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222806136, "dur": 5, "args": { "External id": 90498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90498, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90498, "pid": 5, "tid": 7, "ts": 1716454222806136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750382, "dur": 9, "args": { "External id": 90498, "cbid": 211, "correlation": 90498 } }, { "ph": "s", "id": 90498, "pid": 76337, "tid": -914061504, "ts": 1716454222750382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222806142, "dur": 4, "args": { "External id": 90506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90506, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90506, "pid": 5, "tid": 7, "ts": 1716454222806142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750411, "dur": 10, "args": { "External id": 90506, "cbid": 211, "correlation": 90506 } }, { "ph": "s", "id": 90506, "pid": 76337, "tid": -914061504, "ts": 1716454222750411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222806148, "dur": 10, "args": { "External id": 90526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90526, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90526, "pid": 5, "tid": 7, "ts": 1716454222806148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750519, "dur": 13, "args": { "External id": 90526, "cbid": 211, "correlation": 90526 } }, { "ph": "s", "id": 90526, "pid": 76337, "tid": -914061504, "ts": 1716454222750519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222806159, "dur": 3, "args": { "External id": 90538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90538, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90538, "pid": 5, "tid": 7, "ts": 1716454222806159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750542, "dur": 6, "args": { "External id": 90538, "cbid": 211, "correlation": 90538 } }, { "ph": "s", "id": 90538, "pid": 76337, "tid": -914061504, "ts": 1716454222750542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222806164, "dur": 6, "args": { "External id": 90541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90541, "pid": 5, "tid": 7, "ts": 1716454222806164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750559, "dur": 7, "args": { "External id": 90541, "cbid": 211, "correlation": 90541 } }, { "ph": "s", "id": 90541, "pid": 76337, "tid": -914061504, "ts": 1716454222750559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222806171, "dur": 4, "args": { "External id": 90550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90550, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90550, "pid": 5, "tid": 7, "ts": 1716454222806171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750597, "dur": 9, "args": { "External id": 90550, "cbid": 211, "correlation": 90550 } }, { "ph": "s", "id": 90550, "pid": 76337, "tid": -914061504, "ts": 1716454222750597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222750650, "dur": 0, "args": { "External id": 90560, "cbid": 317, "correlation": 90560 } }, { "ph": "f", "id": 90560, "pid": 76337, "tid": -914061504, "ts": 1716454222750650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222750651, "dur": 0, "args": { "External id": 90561, "cbid": 203, "correlation": 90561 } }, { "ph": "f", "id": 90561, "pid": 76337, "tid": -914061504, "ts": 1716454222750651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222750652, "dur": 0, "args": { "External id": 90562, "cbid": 205, "correlation": 90562 } }, { "ph": "f", "id": 90562, "pid": 76337, "tid": -914061504, "ts": 1716454222750652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806177, "dur": 5, "args": { "External id": 90566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90566, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90566, "pid": 5, "tid": 7, "ts": 1716454222806177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750665, "dur": 12, "args": { "External id": 90566, "cbid": 211, "correlation": 90566 } }, { "ph": "s", "id": 90566, "pid": 76337, "tid": -914061504, "ts": 1716454222750665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806183, "dur": 159, "args": { "External id": 90568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90568, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90568, "pid": 5, "tid": 7, "ts": 1716454222806183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750679, "dur": 5, "args": { "External id": 90568, "cbid": 211, "correlation": 90568 } }, { "ph": "s", "id": 90568, "pid": 76337, "tid": -914061504, "ts": 1716454222750679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222806345, "dur": 1, "args": { "External id": 90570, "device": 5, "context": 1, "stream": 7, "correlation": 90570, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90570, "pid": 5, "tid": 7, "ts": 1716454222806345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222750690, "dur": 6, "args": { "External id": 90570, "cbid": 51, "correlation": 90570 } }, { "ph": "s", "id": 90570, "pid": 76337, "tid": -914061504, "ts": 1716454222750690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222806348, "dur": 253, "args": { "External id": 90571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90571, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90571, "pid": 5, "tid": 7, "ts": 1716454222806348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750697, "dur": 7, "args": { "External id": 90571, "cbid": 211, "correlation": 90571 } }, { "ph": "s", "id": 90571, "pid": 76337, "tid": -914061504, "ts": 1716454222750697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806603, "dur": 6, "args": { "External id": 90573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90573, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90573, "pid": 5, "tid": 7, "ts": 1716454222806603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750708, "dur": 5, "args": { "External id": 90573, "cbid": 211, "correlation": 90573 } }, { "ph": "s", "id": 90573, "pid": 76337, "tid": -914061504, "ts": 1716454222750708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222806609, "dur": 6, "args": { "External id": 90579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90579, "pid": 5, "tid": 7, "ts": 1716454222806609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750736, "dur": 8, "args": { "External id": 90579, "cbid": 211, "correlation": 90579 } }, { "ph": "s", "id": 90579, "pid": 76337, "tid": -914061504, "ts": 1716454222750736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222806617, "dur": 3, "args": { "External id": 90587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90587, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 90587, "pid": 5, "tid": 7, "ts": 1716454222806617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750779, "dur": 9, "args": { "External id": 90587, "cbid": 211, "correlation": 90587 } }, { "ph": "s", "id": 90587, "pid": 76337, "tid": -914061504, "ts": 1716454222750779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222750843, "dur": 1, "args": { "External id": 90603, "cbid": 251, "correlation": 90603 } }, { "ph": "f", "id": 90603, "pid": 76337, "tid": -914061504, "ts": 1716454222750843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222750848, "dur": 0, "args": { "External id": 90605, "cbid": 251, "correlation": 90605 } }, { "ph": "f", "id": 90605, "pid": 76337, "tid": -914061504, "ts": 1716454222750848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222806621, "dur": 10, "args": { "External id": 90606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90606, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90606, "pid": 5, "tid": 7, "ts": 1716454222806621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750850, "dur": 11, "args": { "External id": 90606, "cbid": 211, "correlation": 90606 } }, { "ph": "s", "id": 90606, "pid": 76337, "tid": -914061504, "ts": 1716454222750850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222806632, "dur": 4, "args": { "External id": 90608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90608, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90608, "pid": 5, "tid": 7, "ts": 1716454222806632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750863, "dur": 6, "args": { "External id": 90608, "cbid": 211, "correlation": 90608 } }, { "ph": "s", "id": 90608, "pid": 76337, "tid": -914061504, "ts": 1716454222750863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222806638, "dur": 5, "args": { "External id": 90618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90618, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90618, "pid": 5, "tid": 7, "ts": 1716454222806638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750919, "dur": 13, "args": { "External id": 90618, "cbid": 211, "correlation": 90618 } }, { "ph": "s", "id": 90618, "pid": 76337, "tid": -914061504, "ts": 1716454222750919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222806644, "dur": 9, "args": { "External id": 90638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90638, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90638, "pid": 5, "tid": 7, "ts": 1716454222806644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222750995, "dur": 11, "args": { "External id": 90638, "cbid": 211, "correlation": 90638 } }, { "ph": "s", "id": 90638, "pid": 76337, "tid": -914061504, "ts": 1716454222750995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222806655, "dur": 3, "args": { "External id": 90650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90650, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90650, "pid": 5, "tid": 7, "ts": 1716454222806655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751016, "dur": 6, "args": { "External id": 90650, "cbid": 211, "correlation": 90650 } }, { "ph": "s", "id": 90650, "pid": 76337, "tid": -914061504, "ts": 1716454222751016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222806660, "dur": 6, "args": { "External id": 90653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90653, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90653, "pid": 5, "tid": 7, "ts": 1716454222806660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751036, "dur": 7, "args": { "External id": 90653, "cbid": 211, "correlation": 90653 } }, { "ph": "s", "id": 90653, "pid": 76337, "tid": -914061504, "ts": 1716454222751036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222806667, "dur": 4, "args": { "External id": 90662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90662, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90662, "pid": 5, "tid": 7, "ts": 1716454222806667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751077, "dur": 10, "args": { "External id": 90662, "cbid": 211, "correlation": 90662 } }, { "ph": "s", "id": 90662, "pid": 76337, "tid": -914061504, "ts": 1716454222751077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222751141, "dur": 0, "args": { "External id": 90672, "cbid": 317, "correlation": 90672 } }, { "ph": "f", "id": 90672, "pid": 76337, "tid": -914061504, "ts": 1716454222751141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222751142, "dur": 0, "args": { "External id": 90673, "cbid": 203, "correlation": 90673 } }, { "ph": "f", "id": 90673, "pid": 76337, "tid": -914061504, "ts": 1716454222751142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222751142, "dur": 0, "args": { "External id": 90674, "cbid": 205, "correlation": 90674 } }, { "ph": "f", "id": 90674, "pid": 76337, "tid": -914061504, "ts": 1716454222751142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806673, "dur": 5, "args": { "External id": 90678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90678, "pid": 5, "tid": 7, "ts": 1716454222806673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751157, "dur": 12, "args": { "External id": 90678, "cbid": 211, "correlation": 90678 } }, { "ph": "s", "id": 90678, "pid": 76337, "tid": -914061504, "ts": 1716454222751157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222806679, "dur": 159, "args": { "External id": 90680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90680, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90680, "pid": 5, "tid": 7, "ts": 1716454222806679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751172, "dur": 5, "args": { "External id": 90680, "cbid": 211, "correlation": 90680 } }, { "ph": "s", "id": 90680, "pid": 76337, "tid": -914061504, "ts": 1716454222751172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222806841, "dur": 1, "args": { "External id": 90682, "device": 5, "context": 1, "stream": 7, "correlation": 90682, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 90682, "pid": 5, "tid": 7, "ts": 1716454222806841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222751182, "dur": 6, "args": { "External id": 90682, "cbid": 51, "correlation": 90682 } }, { "ph": "s", "id": 90682, "pid": 76337, "tid": -914061504, "ts": 1716454222751182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222806845, "dur": 254, "args": { "External id": 90683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90683, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90683, "pid": 5, "tid": 7, "ts": 1716454222806845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751190, "dur": 6, "args": { "External id": 90683, "cbid": 211, "correlation": 90683 } }, { "ph": "s", "id": 90683, "pid": 76337, "tid": -914061504, "ts": 1716454222751190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222807100, "dur": 6, "args": { "External id": 90685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90685, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90685, "pid": 5, "tid": 7, "ts": 1716454222807100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751199, "dur": 5, "args": { "External id": 90685, "cbid": 211, "correlation": 90685 } }, { "ph": "s", "id": 90685, "pid": 76337, "tid": -914061504, "ts": 1716454222751199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807107, "dur": 6, "args": { "External id": 90691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90691, "pid": 5, "tid": 7, "ts": 1716454222807107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751228, "dur": 9, "args": { "External id": 90691, "cbid": 211, "correlation": 90691 } }, { "ph": "s", "id": 90691, "pid": 76337, "tid": -914061504, "ts": 1716454222751228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222807115, "dur": 5, "args": { "External id": 90699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90699, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90699, "pid": 5, "tid": 7, "ts": 1716454222807115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751260, "dur": 9, "args": { "External id": 90699, "cbid": 211, "correlation": 90699 } }, { "ph": "s", "id": 90699, "pid": 76337, "tid": -914061504, "ts": 1716454222751260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222807121, "dur": 4, "args": { "External id": 90707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90707, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90707, "pid": 5, "tid": 7, "ts": 1716454222807121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751291, "dur": 9, "args": { "External id": 90707, "cbid": 211, "correlation": 90707 } }, { "ph": "s", "id": 90707, "pid": 76337, "tid": -914061504, "ts": 1716454222751291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222807126, "dur": 9, "args": { "External id": 90727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90727, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 90727, "pid": 5, "tid": 7, "ts": 1716454222807126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751439, "dur": 14, "args": { "External id": 90727, "cbid": 211, "correlation": 90727 } }, { "ph": "s", "id": 90727, "pid": 76337, "tid": -914061504, "ts": 1716454222751439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222807137, "dur": 4, "args": { "External id": 90739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90739, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 90739, "pid": 5, "tid": 7, "ts": 1716454222807137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751462, "dur": 7, "args": { "External id": 90739, "cbid": 211, "correlation": 90739 } }, { "ph": "s", "id": 90739, "pid": 76337, "tid": -914061504, "ts": 1716454222751462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807142, "dur": 6, "args": { "External id": 90742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90742, "pid": 5, "tid": 7, "ts": 1716454222807142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751480, "dur": 7, "args": { "External id": 90742, "cbid": 211, "correlation": 90742 } }, { "ph": "s", "id": 90742, "pid": 76337, "tid": -914061504, "ts": 1716454222751480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222751539, "dur": 0, "args": { "External id": 90753, "cbid": 317, "correlation": 90753 } }, { "ph": "f", "id": 90753, "pid": 76337, "tid": -914061504, "ts": 1716454222751539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222751540, "dur": 0, "args": { "External id": 90754, "cbid": 203, "correlation": 90754 } }, { "ph": "f", "id": 90754, "pid": 76337, "tid": -914061504, "ts": 1716454222751540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222751541, "dur": 0, "args": { "External id": 90755, "cbid": 205, "correlation": 90755 } }, { "ph": "f", "id": 90755, "pid": 76337, "tid": -914061504, "ts": 1716454222751541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222807150, "dur": 5, "args": { "External id": 90759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90759, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90759, "pid": 5, "tid": 7, "ts": 1716454222807150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751559, "dur": 12, "args": { "External id": 90759, "cbid": 211, "correlation": 90759 } }, { "ph": "s", "id": 90759, "pid": 76337, "tid": -914061504, "ts": 1716454222751559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222807156, "dur": 36, "args": { "External id": 90761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90761, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 90761, "pid": 5, "tid": 7, "ts": 1716454222807156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751579, "dur": 9, "args": { "External id": 90761, "cbid": 211, "correlation": 90761 } }, { "ph": "s", "id": 90761, "pid": 76337, "tid": -914061504, "ts": 1716454222751579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222807194, "dur": 5, "args": { "External id": 90763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90763, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90763, "pid": 5, "tid": 7, "ts": 1716454222807194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751591, "dur": 5, "args": { "External id": 90763, "cbid": 211, "correlation": 90763 } }, { "ph": "s", "id": 90763, "pid": 76337, "tid": -914061504, "ts": 1716454222751591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807200, "dur": 6, "args": { "External id": 90769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90769, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90769, "pid": 5, "tid": 7, "ts": 1716454222807200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751619, "dur": 8, "args": { "External id": 90769, "cbid": 211, "correlation": 90769 } }, { "ph": "s", "id": 90769, "pid": 76337, "tid": -914061504, "ts": 1716454222751619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222807207, "dur": 20, "args": { "External id": 90778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90778, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90778, "pid": 5, "tid": 7, "ts": 1716454222807207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751705, "dur": 16, "args": { "External id": 90778, "cbid": 211, "correlation": 90778 } }, { "ph": "s", "id": 90778, "pid": 76337, "tid": -914061504, "ts": 1716454222751705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222807229, "dur": 11, "args": { "External id": 90800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90800, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 90800, "pid": 5, "tid": 7, "ts": 1716454222807229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751764, "dur": 12, "args": { "External id": 90800, "cbid": 211, "correlation": 90800 } }, { "ph": "s", "id": 90800, "pid": 76337, "tid": -914061504, "ts": 1716454222751764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222751861, "dur": 2, "args": { "External id": 90811, "cbid": 251, "correlation": 90811 } }, { "ph": "f", "id": 90811, "pid": 76337, "tid": -914061504, "ts": 1716454222751861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222751867, "dur": 0, "args": { "External id": 90812, "cbid": 251, "correlation": 90812 } }, { "ph": "f", "id": 90812, "pid": 76337, "tid": -914061504, "ts": 1716454222751867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807241, "dur": 53, "args": { "External id": 90813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90813, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 90813, "pid": 5, "tid": 7, "ts": 1716454222807241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751871, "dur": 15, "args": { "External id": 90813, "cbid": 211, "correlation": 90813 } }, { "ph": "s", "id": 90813, "pid": 76337, "tid": -914061504, "ts": 1716454222751871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222751944, "dur": 1, "args": { "External id": 90824, "cbid": 251, "correlation": 90824 } }, { "ph": "f", "id": 90824, "pid": 76337, "tid": -914061504, "ts": 1716454222751944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222751948, "dur": 0, "args": { "External id": 90825, "cbid": 251, "correlation": 90825 } }, { "ph": "f", "id": 90825, "pid": 76337, "tid": -914061504, "ts": 1716454222751948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807295, "dur": 52, "args": { "External id": 90826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90826, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 90826, "pid": 5, "tid": 7, "ts": 1716454222807295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222751950, "dur": 12, "args": { "External id": 90826, "cbid": 211, "correlation": 90826 } }, { "ph": "s", "id": 90826, "pid": 76337, "tid": -914061504, "ts": 1716454222751950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752026, "dur": 1, "args": { "External id": 90837, "cbid": 251, "correlation": 90837 } }, { "ph": "f", "id": 90837, "pid": 76337, "tid": -914061504, "ts": 1716454222752026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752030, "dur": 0, "args": { "External id": 90838, "cbid": 251, "correlation": 90838 } }, { "ph": "f", "id": 90838, "pid": 76337, "tid": -914061504, "ts": 1716454222752030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807348, "dur": 53, "args": { "External id": 90839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90839, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 90839, "pid": 5, "tid": 7, "ts": 1716454222807348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752032, "dur": 12, "args": { "External id": 90839, "cbid": 211, "correlation": 90839 } }, { "ph": "s", "id": 90839, "pid": 76337, "tid": -914061504, "ts": 1716454222752032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222807402, "dur": 56, "args": { "External id": 90864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90864, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90864, "pid": 5, "tid": 7, "ts": 1716454222807402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752124, "dur": 13, "args": { "External id": 90864, "cbid": 211, "correlation": 90864 } }, { "ph": "s", "id": 90864, "pid": 76337, "tid": -914061504, "ts": 1716454222752124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752229, "dur": 1, "args": { "External id": 90882, "cbid": 251, "correlation": 90882 } }, { "ph": "f", "id": 90882, "pid": 76337, "tid": -914061504, "ts": 1716454222752229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222807460, "dur": 62, "args": { "External id": 90884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90884, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 90884, "pid": 5, "tid": 7, "ts": 1716454222807460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752235, "dur": 14, "args": { "External id": 90884, "cbid": 211, "correlation": 90884 } }, { "ph": "s", "id": 90884, "pid": 76337, "tid": -914061504, "ts": 1716454222752235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222807523, "dur": 6, "args": { "External id": 90892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90892, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90892, "pid": 5, "tid": 7, "ts": 1716454222807523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752308, "dur": 14, "args": { "External id": 90892, "cbid": 211, "correlation": 90892 } }, { "ph": "s", "id": 90892, "pid": 76337, "tid": -914061504, "ts": 1716454222752308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222807531, "dur": 7, "args": { "External id": 90900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90900, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 90900, "pid": 5, "tid": 7, "ts": 1716454222807531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752348, "dur": 9, "args": { "External id": 90900, "cbid": 211, "correlation": 90900 } }, { "ph": "s", "id": 90900, "pid": 76337, "tid": -914061504, "ts": 1716454222752348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807539, "dur": 7, "args": { "External id": 90911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 90911, "pid": 5, "tid": 7, "ts": 1716454222807539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752427, "dur": 13, "args": { "External id": 90911, "cbid": 211, "correlation": 90911 } }, { "ph": "s", "id": 90911, "pid": 76337, "tid": -914061504, "ts": 1716454222752427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222807548, "dur": 8, "args": { "External id": 90933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90933, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 90933, "pid": 5, "tid": 7, "ts": 1716454222807548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752460, "dur": 8, "args": { "External id": 90933, "cbid": 211, "correlation": 90933 } }, { "ph": "s", "id": 90933, "pid": 76337, "tid": -914061504, "ts": 1716454222752460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752547, "dur": 2, "args": { "External id": 90944, "cbid": 251, "correlation": 90944 } }, { "ph": "f", "id": 90944, "pid": 76337, "tid": -914061504, "ts": 1716454222752547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222807558, "dur": 1, "args": { "External id": 90945, "device": 5, "context": 1, "stream": 7, "correlation": 90945, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 90945, "pid": 5, "tid": 7, "ts": 1716454222807558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222752553, "dur": 11, "args": { "External id": 90945, "cbid": 51, "correlation": 90945 } }, { "ph": "s", "id": 90945, "pid": 76337, "tid": -914061504, "ts": 1716454222752553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807562, "dur": 36, "args": { "External id": 90946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90946, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 90946, "pid": 5, "tid": 7, "ts": 1716454222807562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752565, "dur": 12, "args": { "External id": 90946, "cbid": 211, "correlation": 90946 } }, { "ph": "s", "id": 90946, "pid": 76337, "tid": -914061504, "ts": 1716454222752565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752638, "dur": 1, "args": { "External id": 90957, "cbid": 251, "correlation": 90957 } }, { "ph": "f", "id": 90957, "pid": 76337, "tid": -914061504, "ts": 1716454222752638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752642, "dur": 0, "args": { "External id": 90958, "cbid": 251, "correlation": 90958 } }, { "ph": "f", "id": 90958, "pid": 76337, "tid": -914061504, "ts": 1716454222752642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222807599, "dur": 12, "args": { "External id": 90959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90959, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90959, "pid": 5, "tid": 7, "ts": 1716454222807599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752644, "dur": 13, "args": { "External id": 90959, "cbid": 211, "correlation": 90959 } }, { "ph": "s", "id": 90959, "pid": 76337, "tid": -914061504, "ts": 1716454222752644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222807612, "dur": 5, "args": { "External id": 90961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90961, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90961, "pid": 5, "tid": 7, "ts": 1716454222807612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752660, "dur": 6, "args": { "External id": 90961, "cbid": 211, "correlation": 90961 } }, { "ph": "s", "id": 90961, "pid": 76337, "tid": -914061504, "ts": 1716454222752660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752717, "dur": 1, "args": { "External id": 90972, "cbid": 251, "correlation": 90972 } }, { "ph": "f", "id": 90972, "pid": 76337, "tid": -914061504, "ts": 1716454222752717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752721, "dur": 0, "args": { "External id": 90973, "cbid": 251, "correlation": 90973 } }, { "ph": "f", "id": 90973, "pid": 76337, "tid": -914061504, "ts": 1716454222752721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222807619, "dur": 8, "args": { "External id": 90974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90974, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90974, "pid": 5, "tid": 7, "ts": 1716454222807619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752722, "dur": 12, "args": { "External id": 90974, "cbid": 211, "correlation": 90974 } }, { "ph": "s", "id": 90974, "pid": 76337, "tid": -914061504, "ts": 1716454222752722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222807628, "dur": 4, "args": { "External id": 90976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 90976, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 90976, "pid": 5, "tid": 7, "ts": 1716454222807628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752735, "dur": 6, "args": { "External id": 90976, "cbid": 211, "correlation": 90976 } }, { "ph": "s", "id": 90976, "pid": 76337, "tid": -914061504, "ts": 1716454222752735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222807633, "dur": 19, "args": { "External id": 91001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91001, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 91001, "pid": 5, "tid": 7, "ts": 1716454222807633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752815, "dur": 14, "args": { "External id": 91001, "cbid": 211, "correlation": 91001 } }, { "ph": "s", "id": 91001, "pid": 76337, "tid": -914061504, "ts": 1716454222752815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222752917, "dur": 2, "args": { "External id": 91019, "cbid": 251, "correlation": 91019 } }, { "ph": "f", "id": 91019, "pid": 76337, "tid": -914061504, "ts": 1716454222752917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222807655, "dur": 1, "args": { "External id": 91021, "device": 5, "context": 1, "stream": 7, "correlation": 91021, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 91021, "pid": 5, "tid": 7, "ts": 1716454222807655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222752922, "dur": 11, "args": { "External id": 91021, "cbid": 51, "correlation": 91021 } }, { "ph": "s", "id": 91021, "pid": 76337, "tid": -914061504, "ts": 1716454222752922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807658, "dur": 36, "args": { "External id": 91022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91022, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 91022, "pid": 5, "tid": 7, "ts": 1716454222807658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222752935, "dur": 13, "args": { "External id": 91022, "cbid": 211, "correlation": 91022 } }, { "ph": "s", "id": 91022, "pid": 76337, "tid": -914061504, "ts": 1716454222752935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222807695, "dur": 4, "args": { "External id": 91030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91030, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91030, "pid": 5, "tid": 7, "ts": 1716454222807695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753017, "dur": 13, "args": { "External id": 91030, "cbid": 211, "correlation": 91030 } }, { "ph": "s", "id": 91030, "pid": 76337, "tid": -914061504, "ts": 1716454222753017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807700, "dur": 8, "args": { "External id": 91038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91038, "pid": 5, "tid": 7, "ts": 1716454222807700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753060, "dur": 9, "args": { "External id": 91038, "cbid": 211, "correlation": 91038 } }, { "ph": "s", "id": 91038, "pid": 76337, "tid": -914061504, "ts": 1716454222753060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222807710, "dur": 8, "args": { "External id": 91060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91060, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 91060, "pid": 5, "tid": 7, "ts": 1716454222807710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753113, "dur": 10, "args": { "External id": 91060, "cbid": 211, "correlation": 91060 } }, { "ph": "s", "id": 91060, "pid": 76337, "tid": -914061504, "ts": 1716454222753113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222753206, "dur": 1, "args": { "External id": 91076, "cbid": 251, "correlation": 91076 } }, { "ph": "f", "id": 91076, "pid": 76337, "tid": -914061504, "ts": 1716454222753206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222753212, "dur": 0, "args": { "External id": 91078, "cbid": 251, "correlation": 91078 } }, { "ph": "f", "id": 91078, "pid": 76337, "tid": -914061504, "ts": 1716454222753212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222807719, "dur": 188, "args": { "External id": 91079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91079, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91079, "pid": 5, "tid": 7, "ts": 1716454222807719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753215, "dur": 13, "args": { "External id": 91079, "cbid": 211, "correlation": 91079 } }, { "ph": "s", "id": 91079, "pid": 76337, "tid": -914061504, "ts": 1716454222753215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807908, "dur": 21, "args": { "External id": 91087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91087, "pid": 5, "tid": 7, "ts": 1716454222807908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753287, "dur": 13, "args": { "External id": 91087, "cbid": 211, "correlation": 91087 } }, { "ph": "s", "id": 91087, "pid": 76337, "tid": -914061504, "ts": 1716454222753287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222807931, "dur": 21, "args": { "External id": 91095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91095, "pid": 5, "tid": 7, "ts": 1716454222807931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753321, "dur": 8, "args": { "External id": 91095, "cbid": 211, "correlation": 91095 } }, { "ph": "s", "id": 91095, "pid": 76337, "tid": -914061504, "ts": 1716454222753321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222753402, "dur": 1, "args": { "External id": 91111, "cbid": 251, "correlation": 91111 } }, { "ph": "f", "id": 91111, "pid": 76337, "tid": -914061504, "ts": 1716454222753402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222807954, "dur": 1, "args": { "External id": 91113, "device": 5, "context": 1, "stream": 7, "correlation": 91113, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 91113, "pid": 5, "tid": 7, "ts": 1716454222807954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222753407, "dur": 9, "args": { "External id": 91113, "cbid": 51, "correlation": 91113 } }, { "ph": "s", "id": 91113, "pid": 76337, "tid": -914061504, "ts": 1716454222753407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222807958, "dur": 110, "args": { "External id": 91114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91114, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 91114, "pid": 5, "tid": 7, "ts": 1716454222807958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753417, "dur": 12, "args": { "External id": 91114, "cbid": 211, "correlation": 91114 } }, { "ph": "s", "id": 91114, "pid": 76337, "tid": -914061504, "ts": 1716454222753417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222808068, "dur": 5, "args": { "External id": 91122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91122, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91122, "pid": 5, "tid": 7, "ts": 1716454222808068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753459, "dur": 10, "args": { "External id": 91122, "cbid": 211, "correlation": 91122 } }, { "ph": "s", "id": 91122, "pid": 76337, "tid": -914061504, "ts": 1716454222753459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808075, "dur": 9, "args": { "External id": 91133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91133, "pid": 5, "tid": 7, "ts": 1716454222808075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753533, "dur": 12, "args": { "External id": 91133, "cbid": 211, "correlation": 91133 } }, { "ph": "s", "id": 91133, "pid": 76337, "tid": -914061504, "ts": 1716454222753533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222753602, "dur": 0, "args": { "External id": 91145, "cbid": 317, "correlation": 91145 } }, { "ph": "f", "id": 91145, "pid": 76337, "tid": -914061504, "ts": 1716454222753602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222753603, "dur": 0, "args": { "External id": 91146, "cbid": 203, "correlation": 91146 } }, { "ph": "f", "id": 91146, "pid": 76337, "tid": -914061504, "ts": 1716454222753603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222753603, "dur": 0, "args": { "External id": 91147, "cbid": 205, "correlation": 91147 } }, { "ph": "f", "id": 91147, "pid": 76337, "tid": -914061504, "ts": 1716454222753603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808085, "dur": 5, "args": { "External id": 91151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91151, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91151, "pid": 5, "tid": 7, "ts": 1716454222808085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753621, "dur": 12, "args": { "External id": 91151, "cbid": 211, "correlation": 91151 } }, { "ph": "s", "id": 91151, "pid": 76337, "tid": -914061504, "ts": 1716454222753621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222808092, "dur": 37, "args": { "External id": 91153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91153, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 91153, "pid": 5, "tid": 7, "ts": 1716454222808092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753642, "dur": 8, "args": { "External id": 91153, "cbid": 211, "correlation": 91153 } }, { "ph": "s", "id": 91153, "pid": 76337, "tid": -914061504, "ts": 1716454222753642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808130, "dur": 6, "args": { "External id": 91155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91155, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91155, "pid": 5, "tid": 7, "ts": 1716454222808130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753654, "dur": 5, "args": { "External id": 91155, "cbid": 211, "correlation": 91155 } }, { "ph": "s", "id": 91155, "pid": 76337, "tid": -914061504, "ts": 1716454222753654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808137, "dur": 7, "args": { "External id": 91161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91161, "pid": 5, "tid": 7, "ts": 1716454222808137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753681, "dur": 9, "args": { "External id": 91161, "cbid": 211, "correlation": 91161 } }, { "ph": "s", "id": 91161, "pid": 76337, "tid": -914061504, "ts": 1716454222753681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222808145, "dur": 5, "args": { "External id": 91169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91169, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91169, "pid": 5, "tid": 7, "ts": 1716454222808145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753714, "dur": 8, "args": { "External id": 91169, "cbid": 211, "correlation": 91169 } }, { "ph": "s", "id": 91169, "pid": 76337, "tid": -914061504, "ts": 1716454222753714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222808151, "dur": 11, "args": { "External id": 91189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91189, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91189, "pid": 5, "tid": 7, "ts": 1716454222808151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753792, "dur": 12, "args": { "External id": 91189, "cbid": 211, "correlation": 91189 } }, { "ph": "s", "id": 91189, "pid": 76337, "tid": -914061504, "ts": 1716454222753792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222808163, "dur": 4, "args": { "External id": 91201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91201, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 91201, "pid": 5, "tid": 7, "ts": 1716454222808163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753815, "dur": 7, "args": { "External id": 91201, "cbid": 211, "correlation": 91201 } }, { "ph": "s", "id": 91201, "pid": 76337, "tid": -914061504, "ts": 1716454222753815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808169, "dur": 8, "args": { "External id": 91204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91204, "pid": 5, "tid": 7, "ts": 1716454222808169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753834, "dur": 6, "args": { "External id": 91204, "cbid": 211, "correlation": 91204 } }, { "ph": "s", "id": 91204, "pid": 76337, "tid": -914061504, "ts": 1716454222753834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222808178, "dur": 5, "args": { "External id": 91213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91213, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91213, "pid": 5, "tid": 7, "ts": 1716454222808178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753875, "dur": 10, "args": { "External id": 91213, "cbid": 211, "correlation": 91213 } }, { "ph": "s", "id": 91213, "pid": 76337, "tid": -914061504, "ts": 1716454222753875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222753928, "dur": 0, "args": { "External id": 91223, "cbid": 317, "correlation": 91223 } }, { "ph": "f", "id": 91223, "pid": 76337, "tid": -914061504, "ts": 1716454222753928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222753929, "dur": 0, "args": { "External id": 91224, "cbid": 203, "correlation": 91224 } }, { "ph": "f", "id": 91224, "pid": 76337, "tid": -914061504, "ts": 1716454222753929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222753930, "dur": 0, "args": { "External id": 91225, "cbid": 205, "correlation": 91225 } }, { "ph": "f", "id": 91225, "pid": 76337, "tid": -914061504, "ts": 1716454222753930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808185, "dur": 5, "args": { "External id": 91229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91229, "pid": 5, "tid": 7, "ts": 1716454222808185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753944, "dur": 11, "args": { "External id": 91229, "cbid": 211, "correlation": 91229 } }, { "ph": "s", "id": 91229, "pid": 76337, "tid": -914061504, "ts": 1716454222753944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808191, "dur": 160, "args": { "External id": 91231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91231, "pid": 5, "tid": 7, "ts": 1716454222808191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753958, "dur": 5, "args": { "External id": 91231, "cbid": 211, "correlation": 91231 } }, { "ph": "s", "id": 91231, "pid": 76337, "tid": -914061504, "ts": 1716454222753958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222808353, "dur": 1, "args": { "External id": 91233, "device": 5, "context": 1, "stream": 7, "correlation": 91233, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 91233, "pid": 5, "tid": 7, "ts": 1716454222808353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222753969, "dur": 16, "args": { "External id": 91233, "cbid": 51, "correlation": 91233 } }, { "ph": "s", "id": 91233, "pid": 76337, "tid": -914061504, "ts": 1716454222753969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222808357, "dur": 265, "args": { "External id": 91234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91234, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91234, "pid": 5, "tid": 7, "ts": 1716454222808357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753986, "dur": 7, "args": { "External id": 91234, "cbid": 211, "correlation": 91234 } }, { "ph": "s", "id": 91234, "pid": 76337, "tid": -914061504, "ts": 1716454222753986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808623, "dur": 6, "args": { "External id": 91236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91236, "pid": 5, "tid": 7, "ts": 1716454222808623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222753997, "dur": 5, "args": { "External id": 91236, "cbid": 211, "correlation": 91236 } }, { "ph": "s", "id": 91236, "pid": 76337, "tid": -914061504, "ts": 1716454222753997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808630, "dur": 6, "args": { "External id": 91242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91242, "pid": 5, "tid": 7, "ts": 1716454222808630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754025, "dur": 10, "args": { "External id": 91242, "cbid": 211, "correlation": 91242 } }, { "ph": "s", "id": 91242, "pid": 76337, "tid": -914061504, "ts": 1716454222754025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222808638, "dur": 3, "args": { "External id": 91250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91250, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 91250, "pid": 5, "tid": 7, "ts": 1716454222808638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754071, "dur": 9, "args": { "External id": 91250, "cbid": 211, "correlation": 91250 } }, { "ph": "s", "id": 91250, "pid": 76337, "tid": -914061504, "ts": 1716454222754071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222754138, "dur": 1, "args": { "External id": 91266, "cbid": 251, "correlation": 91266 } }, { "ph": "f", "id": 91266, "pid": 76337, "tid": -914061504, "ts": 1716454222754138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222754144, "dur": 0, "args": { "External id": 91268, "cbid": 251, "correlation": 91268 } }, { "ph": "f", "id": 91268, "pid": 76337, "tid": -914061504, "ts": 1716454222754144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222808642, "dur": 12, "args": { "External id": 91269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91269, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91269, "pid": 5, "tid": 7, "ts": 1716454222808642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754146, "dur": 12, "args": { "External id": 91269, "cbid": 211, "correlation": 91269 } }, { "ph": "s", "id": 91269, "pid": 76337, "tid": -914061504, "ts": 1716454222754146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222808656, "dur": 5, "args": { "External id": 91271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91271, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91271, "pid": 5, "tid": 7, "ts": 1716454222808656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754160, "dur": 5, "args": { "External id": 91271, "cbid": 211, "correlation": 91271 } }, { "ph": "s", "id": 91271, "pid": 76337, "tid": -914061504, "ts": 1716454222754160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808662, "dur": 6, "args": { "External id": 91281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91281, "pid": 5, "tid": 7, "ts": 1716454222808662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754218, "dur": 12, "args": { "External id": 91281, "cbid": 211, "correlation": 91281 } }, { "ph": "s", "id": 91281, "pid": 76337, "tid": -914061504, "ts": 1716454222754218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222808669, "dur": 10, "args": { "External id": 91301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91301, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91301, "pid": 5, "tid": 7, "ts": 1716454222808669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754285, "dur": 11, "args": { "External id": 91301, "cbid": 211, "correlation": 91301 } }, { "ph": "s", "id": 91301, "pid": 76337, "tid": -914061504, "ts": 1716454222754285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222808680, "dur": 4, "args": { "External id": 91313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91313, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 91313, "pid": 5, "tid": 7, "ts": 1716454222808680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754305, "dur": 6, "args": { "External id": 91313, "cbid": 211, "correlation": 91313 } }, { "ph": "s", "id": 91313, "pid": 76337, "tid": -914061504, "ts": 1716454222754305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222808685, "dur": 7, "args": { "External id": 91316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91316, "pid": 5, "tid": 7, "ts": 1716454222808685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754324, "dur": 7, "args": { "External id": 91316, "cbid": 211, "correlation": 91316 } }, { "ph": "s", "id": 91316, "pid": 76337, "tid": -914061504, "ts": 1716454222754324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222808693, "dur": 4, "args": { "External id": 91325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91325, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91325, "pid": 5, "tid": 7, "ts": 1716454222808693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754365, "dur": 10, "args": { "External id": 91325, "cbid": 211, "correlation": 91325 } }, { "ph": "s", "id": 91325, "pid": 76337, "tid": -914061504, "ts": 1716454222754365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222754429, "dur": 0, "args": { "External id": 91335, "cbid": 317, "correlation": 91335 } }, { "ph": "f", "id": 91335, "pid": 76337, "tid": -914061504, "ts": 1716454222754429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222754430, "dur": 0, "args": { "External id": 91336, "cbid": 203, "correlation": 91336 } }, { "ph": "f", "id": 91336, "pid": 76337, "tid": -914061504, "ts": 1716454222754430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222754431, "dur": 0, "args": { "External id": 91337, "cbid": 205, "correlation": 91337 } }, { "ph": "f", "id": 91337, "pid": 76337, "tid": -914061504, "ts": 1716454222754431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808699, "dur": 5, "args": { "External id": 91341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91341, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91341, "pid": 5, "tid": 7, "ts": 1716454222808699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754445, "dur": 12, "args": { "External id": 91341, "cbid": 211, "correlation": 91341 } }, { "ph": "s", "id": 91341, "pid": 76337, "tid": -914061504, "ts": 1716454222754445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222808705, "dur": 159, "args": { "External id": 91343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91343, "pid": 5, "tid": 7, "ts": 1716454222808705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754459, "dur": 5, "args": { "External id": 91343, "cbid": 211, "correlation": 91343 } }, { "ph": "s", "id": 91343, "pid": 76337, "tid": -914061504, "ts": 1716454222754459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222808866, "dur": 1, "args": { "External id": 91345, "device": 5, "context": 1, "stream": 7, "correlation": 91345, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 91345, "pid": 5, "tid": 7, "ts": 1716454222808866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222754470, "dur": 6, "args": { "External id": 91345, "cbid": 51, "correlation": 91345 } }, { "ph": "s", "id": 91345, "pid": 76337, "tid": -914061504, "ts": 1716454222754470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222808870, "dur": 255, "args": { "External id": 91346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91346, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91346, "pid": 5, "tid": 7, "ts": 1716454222808870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754477, "dur": 7, "args": { "External id": 91346, "cbid": 211, "correlation": 91346 } }, { "ph": "s", "id": 91346, "pid": 76337, "tid": -914061504, "ts": 1716454222754477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222809126, "dur": 6, "args": { "External id": 91348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91348, "pid": 5, "tid": 7, "ts": 1716454222809126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754489, "dur": 5, "args": { "External id": 91348, "cbid": 211, "correlation": 91348 } }, { "ph": "s", "id": 91348, "pid": 76337, "tid": -914061504, "ts": 1716454222754489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222809133, "dur": 6, "args": { "External id": 91354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91354, "pid": 5, "tid": 7, "ts": 1716454222809133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754517, "dur": 9, "args": { "External id": 91354, "cbid": 211, "correlation": 91354 } }, { "ph": "s", "id": 91354, "pid": 76337, "tid": -914061504, "ts": 1716454222754517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222809141, "dur": 5, "args": { "External id": 91362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91362, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91362, "pid": 5, "tid": 7, "ts": 1716454222809141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754550, "dur": 8, "args": { "External id": 91362, "cbid": 211, "correlation": 91362 } }, { "ph": "s", "id": 91362, "pid": 76337, "tid": -914061504, "ts": 1716454222754550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222809146, "dur": 4, "args": { "External id": 91370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91370, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91370, "pid": 5, "tid": 7, "ts": 1716454222809146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754580, "dur": 8, "args": { "External id": 91370, "cbid": 211, "correlation": 91370 } }, { "ph": "s", "id": 91370, "pid": 76337, "tid": -914061504, "ts": 1716454222754580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222809152, "dur": 11, "args": { "External id": 91379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91379, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91379, "pid": 5, "tid": 7, "ts": 1716454222809152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754676, "dur": 14, "args": { "External id": 91379, "cbid": 211, "correlation": 91379 } }, { "ph": "s", "id": 91379, "pid": 76337, "tid": -914061504, "ts": 1716454222754676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222809164, "dur": 12, "args": { "External id": 91399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91399, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91399, "pid": 5, "tid": 7, "ts": 1716454222809164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754745, "dur": 11, "args": { "External id": 91399, "cbid": 211, "correlation": 91399 } }, { "ph": "s", "id": 91399, "pid": 76337, "tid": -914061504, "ts": 1716454222754745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222809178, "dur": 4, "args": { "External id": 91411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91411, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91411, "pid": 5, "tid": 7, "ts": 1716454222809178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754768, "dur": 6, "args": { "External id": 91411, "cbid": 211, "correlation": 91411 } }, { "ph": "s", "id": 91411, "pid": 76337, "tid": -914061504, "ts": 1716454222754768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222809183, "dur": 10, "args": { "External id": 91414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91414, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91414, "pid": 5, "tid": 7, "ts": 1716454222809183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754786, "dur": 6, "args": { "External id": 91414, "cbid": 211, "correlation": 91414 } }, { "ph": "s", "id": 91414, "pid": 76337, "tid": -914061504, "ts": 1716454222754786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222809194, "dur": 6, "args": { "External id": 91423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91423, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91423, "pid": 5, "tid": 7, "ts": 1716454222809194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754825, "dur": 10, "args": { "External id": 91423, "cbid": 211, "correlation": 91423 } }, { "ph": "s", "id": 91423, "pid": 76337, "tid": -914061504, "ts": 1716454222754825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222754878, "dur": 0, "args": { "External id": 91433, "cbid": 317, "correlation": 91433 } }, { "ph": "f", "id": 91433, "pid": 76337, "tid": -914061504, "ts": 1716454222754878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222754879, "dur": 0, "args": { "External id": 91434, "cbid": 203, "correlation": 91434 } }, { "ph": "f", "id": 91434, "pid": 76337, "tid": -914061504, "ts": 1716454222754879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222754880, "dur": 0, "args": { "External id": 91435, "cbid": 205, "correlation": 91435 } }, { "ph": "f", "id": 91435, "pid": 76337, "tid": -914061504, "ts": 1716454222754880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222809201, "dur": 6, "args": { "External id": 91439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91439, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91439, "pid": 5, "tid": 7, "ts": 1716454222809201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754896, "dur": 11, "args": { "External id": 91439, "cbid": 211, "correlation": 91439 } }, { "ph": "s", "id": 91439, "pid": 76337, "tid": -914061504, "ts": 1716454222754896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222809209, "dur": 314, "args": { "External id": 91441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91441, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91441, "pid": 5, "tid": 7, "ts": 1716454222809209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754910, "dur": 5, "args": { "External id": 91441, "cbid": 211, "correlation": 91441 } }, { "ph": "s", "id": 91441, "pid": 76337, "tid": -914061504, "ts": 1716454222754910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222809525, "dur": 1, "args": { "External id": 91443, "device": 5, "context": 1, "stream": 7, "correlation": 91443, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 91443, "pid": 5, "tid": 7, "ts": 1716454222809525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222754921, "dur": 7, "args": { "External id": 91443, "cbid": 51, "correlation": 91443 } }, { "ph": "s", "id": 91443, "pid": 76337, "tid": -914061504, "ts": 1716454222754921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222809529, "dur": 488, "args": { "External id": 91444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91444, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91444, "pid": 5, "tid": 7, "ts": 1716454222809529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754930, "dur": 7, "args": { "External id": 91444, "cbid": 211, "correlation": 91444 } }, { "ph": "s", "id": 91444, "pid": 76337, "tid": -914061504, "ts": 1716454222754930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810018, "dur": 6, "args": { "External id": 91446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91446, "pid": 5, "tid": 7, "ts": 1716454222810018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754940, "dur": 5, "args": { "External id": 91446, "cbid": 211, "correlation": 91446 } }, { "ph": "s", "id": 91446, "pid": 76337, "tid": -914061504, "ts": 1716454222754940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810025, "dur": 6, "args": { "External id": 91452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91452, "pid": 5, "tid": 7, "ts": 1716454222810025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222754969, "dur": 16, "args": { "External id": 91452, "cbid": 211, "correlation": 91452 } }, { "ph": "s", "id": 91452, "pid": 76337, "tid": -914061504, "ts": 1716454222754969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222810032, "dur": 3, "args": { "External id": 91460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91460, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 91460, "pid": 5, "tid": 7, "ts": 1716454222810032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755022, "dur": 10, "args": { "External id": 91460, "cbid": 211, "correlation": 91460 } }, { "ph": "s", "id": 91460, "pid": 76337, "tid": -914061504, "ts": 1716454222755022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222755086, "dur": 1, "args": { "External id": 91476, "cbid": 251, "correlation": 91476 } }, { "ph": "f", "id": 91476, "pid": 76337, "tid": -914061504, "ts": 1716454222755086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222755092, "dur": 0, "args": { "External id": 91478, "cbid": 251, "correlation": 91478 } }, { "ph": "f", "id": 91478, "pid": 76337, "tid": -914061504, "ts": 1716454222755092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222810036, "dur": 11, "args": { "External id": 91479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91479, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91479, "pid": 5, "tid": 7, "ts": 1716454222810036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755094, "dur": 12, "args": { "External id": 91479, "cbid": 211, "correlation": 91479 } }, { "ph": "s", "id": 91479, "pid": 76337, "tid": -914061504, "ts": 1716454222755094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222810048, "dur": 4, "args": { "External id": 91481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91481, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91481, "pid": 5, "tid": 7, "ts": 1716454222810048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755107, "dur": 5, "args": { "External id": 91481, "cbid": 211, "correlation": 91481 } }, { "ph": "s", "id": 91481, "pid": 76337, "tid": -914061504, "ts": 1716454222755107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810054, "dur": 6, "args": { "External id": 91491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91491, "pid": 5, "tid": 7, "ts": 1716454222810054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755165, "dur": 12, "args": { "External id": 91491, "cbid": 211, "correlation": 91491 } }, { "ph": "s", "id": 91491, "pid": 76337, "tid": -914061504, "ts": 1716454222755165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222810061, "dur": 9, "args": { "External id": 91511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91511, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91511, "pid": 5, "tid": 7, "ts": 1716454222810061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755231, "dur": 10, "args": { "External id": 91511, "cbid": 211, "correlation": 91511 } }, { "ph": "s", "id": 91511, "pid": 76337, "tid": -914061504, "ts": 1716454222755231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222810072, "dur": 4, "args": { "External id": 91523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91523, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 91523, "pid": 5, "tid": 7, "ts": 1716454222810072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755252, "dur": 6, "args": { "External id": 91523, "cbid": 211, "correlation": 91523 } }, { "ph": "s", "id": 91523, "pid": 76337, "tid": -914061504, "ts": 1716454222755252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810077, "dur": 6, "args": { "External id": 91526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91526, "pid": 5, "tid": 7, "ts": 1716454222810077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755272, "dur": 7, "args": { "External id": 91526, "cbid": 211, "correlation": 91526 } }, { "ph": "s", "id": 91526, "pid": 76337, "tid": -914061504, "ts": 1716454222755272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222810085, "dur": 4, "args": { "External id": 91535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91535, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91535, "pid": 5, "tid": 7, "ts": 1716454222810085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755312, "dur": 11, "args": { "External id": 91535, "cbid": 211, "correlation": 91535 } }, { "ph": "s", "id": 91535, "pid": 76337, "tid": -914061504, "ts": 1716454222755312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222755376, "dur": 0, "args": { "External id": 91545, "cbid": 317, "correlation": 91545 } }, { "ph": "f", "id": 91545, "pid": 76337, "tid": -914061504, "ts": 1716454222755376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222755377, "dur": 0, "args": { "External id": 91546, "cbid": 203, "correlation": 91546 } }, { "ph": "f", "id": 91546, "pid": 76337, "tid": -914061504, "ts": 1716454222755377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222755378, "dur": 0, "args": { "External id": 91547, "cbid": 205, "correlation": 91547 } }, { "ph": "f", "id": 91547, "pid": 76337, "tid": -914061504, "ts": 1716454222755378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810090, "dur": 5, "args": { "External id": 91551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91551, "pid": 5, "tid": 7, "ts": 1716454222810090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755392, "dur": 12, "args": { "External id": 91551, "cbid": 211, "correlation": 91551 } }, { "ph": "s", "id": 91551, "pid": 76337, "tid": -914061504, "ts": 1716454222755392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810097, "dur": 159, "args": { "External id": 91553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91553, "pid": 5, "tid": 7, "ts": 1716454222810097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755407, "dur": 5, "args": { "External id": 91553, "cbid": 211, "correlation": 91553 } }, { "ph": "s", "id": 91553, "pid": 76337, "tid": -914061504, "ts": 1716454222755407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222810258, "dur": 1, "args": { "External id": 91555, "device": 5, "context": 1, "stream": 7, "correlation": 91555, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 91555, "pid": 5, "tid": 7, "ts": 1716454222810258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222755418, "dur": 6, "args": { "External id": 91555, "cbid": 51, "correlation": 91555 } }, { "ph": "s", "id": 91555, "pid": 76337, "tid": -914061504, "ts": 1716454222755418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222810261, "dur": 254, "args": { "External id": 91556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91556, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91556, "pid": 5, "tid": 7, "ts": 1716454222810261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755425, "dur": 6, "args": { "External id": 91556, "cbid": 211, "correlation": 91556 } }, { "ph": "s", "id": 91556, "pid": 76337, "tid": -914061504, "ts": 1716454222755425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810517, "dur": 6, "args": { "External id": 91558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91558, "pid": 5, "tid": 7, "ts": 1716454222810517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755435, "dur": 6, "args": { "External id": 91558, "cbid": 211, "correlation": 91558 } }, { "ph": "s", "id": 91558, "pid": 76337, "tid": -914061504, "ts": 1716454222755435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810524, "dur": 6, "args": { "External id": 91564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91564, "pid": 5, "tid": 7, "ts": 1716454222810524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755464, "dur": 8, "args": { "External id": 91564, "cbid": 211, "correlation": 91564 } }, { "ph": "s", "id": 91564, "pid": 76337, "tid": -914061504, "ts": 1716454222755464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222755523, "dur": 0, "args": { "External id": 91574, "cbid": 317, "correlation": 91574 } }, { "ph": "f", "id": 91574, "pid": 76337, "tid": -914061504, "ts": 1716454222755523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222755523, "dur": 0, "args": { "External id": 91575, "cbid": 203, "correlation": 91575 } }, { "ph": "f", "id": 91575, "pid": 76337, "tid": -914061504, "ts": 1716454222755523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222755524, "dur": 0, "args": { "External id": 91576, "cbid": 205, "correlation": 91576 } }, { "ph": "f", "id": 91576, "pid": 76337, "tid": -914061504, "ts": 1716454222755524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810531, "dur": 8, "args": { "External id": 91580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91580, "pid": 5, "tid": 7, "ts": 1716454222810531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755538, "dur": 12, "args": { "External id": 91580, "cbid": 211, "correlation": 91580 } }, { "ph": "s", "id": 91580, "pid": 76337, "tid": -914061504, "ts": 1716454222755538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222810540, "dur": 3, "args": { "External id": 91582, "device": 5, "context": 1, "stream": 7, "correlation": 91582, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 91582, "pid": 5, "tid": 7, "ts": 1716454222810540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222755557, "dur": 16, "args": { "External id": 91582, "cbid": 51, "correlation": 91582 } }, { "ph": "s", "id": 91582, "pid": 76337, "tid": -914061504, "ts": 1716454222755557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222810544, "dur": 95, "args": { "External id": 91583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91583, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 91583, "pid": 5, "tid": 7, "ts": 1716454222810544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755574, "dur": 7, "args": { "External id": 91583, "cbid": 211, "correlation": 91583 } }, { "ph": "s", "id": 91583, "pid": 76337, "tid": -914061504, "ts": 1716454222755574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810641, "dur": 6, "args": { "External id": 91585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91585, "pid": 5, "tid": 7, "ts": 1716454222810641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755585, "dur": 5, "args": { "External id": 91585, "cbid": 211, "correlation": 91585 } }, { "ph": "s", "id": 91585, "pid": 76337, "tid": -914061504, "ts": 1716454222755585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810648, "dur": 6, "args": { "External id": 91591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91591, "pid": 5, "tid": 7, "ts": 1716454222810648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755613, "dur": 8, "args": { "External id": 91591, "cbid": 211, "correlation": 91591 } }, { "ph": "s", "id": 91591, "pid": 76337, "tid": -914061504, "ts": 1716454222755613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222810655, "dur": 5, "args": { "External id": 91599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91599, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91599, "pid": 5, "tid": 7, "ts": 1716454222810655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755641, "dur": 8, "args": { "External id": 91599, "cbid": 211, "correlation": 91599 } }, { "ph": "s", "id": 91599, "pid": 76337, "tid": -914061504, "ts": 1716454222755641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222810661, "dur": 4, "args": { "External id": 91607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91607, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91607, "pid": 5, "tid": 7, "ts": 1716454222810661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755670, "dur": 8, "args": { "External id": 91607, "cbid": 211, "correlation": 91607 } }, { "ph": "s", "id": 91607, "pid": 76337, "tid": -914061504, "ts": 1716454222755670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222810667, "dur": 11, "args": { "External id": 91616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91616, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91616, "pid": 5, "tid": 7, "ts": 1716454222810667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755715, "dur": 11, "args": { "External id": 91616, "cbid": 211, "correlation": 91616 } }, { "ph": "s", "id": 91616, "pid": 76337, "tid": -914061504, "ts": 1716454222755715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222810679, "dur": 12, "args": { "External id": 91636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91636, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91636, "pid": 5, "tid": 7, "ts": 1716454222810679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755786, "dur": 11, "args": { "External id": 91636, "cbid": 211, "correlation": 91636 } }, { "ph": "s", "id": 91636, "pid": 76337, "tid": -914061504, "ts": 1716454222755786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222810693, "dur": 4, "args": { "External id": 91648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91648, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91648, "pid": 5, "tid": 7, "ts": 1716454222810693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755807, "dur": 6, "args": { "External id": 91648, "cbid": 211, "correlation": 91648 } }, { "ph": "s", "id": 91648, "pid": 76337, "tid": -914061504, "ts": 1716454222755807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222810698, "dur": 11, "args": { "External id": 91651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91651, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91651, "pid": 5, "tid": 7, "ts": 1716454222810698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755825, "dur": 7, "args": { "External id": 91651, "cbid": 211, "correlation": 91651 } }, { "ph": "s", "id": 91651, "pid": 76337, "tid": -914061504, "ts": 1716454222755825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222810710, "dur": 6, "args": { "External id": 91660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91660, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91660, "pid": 5, "tid": 7, "ts": 1716454222810710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755863, "dur": 9, "args": { "External id": 91660, "cbid": 211, "correlation": 91660 } }, { "ph": "s", "id": 91660, "pid": 76337, "tid": -914061504, "ts": 1716454222755863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222755914, "dur": 0, "args": { "External id": 91670, "cbid": 317, "correlation": 91670 } }, { "ph": "f", "id": 91670, "pid": 76337, "tid": -914061504, "ts": 1716454222755914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222755915, "dur": 0, "args": { "External id": 91671, "cbid": 203, "correlation": 91671 } }, { "ph": "f", "id": 91671, "pid": 76337, "tid": -914061504, "ts": 1716454222755915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222755916, "dur": 0, "args": { "External id": 91672, "cbid": 205, "correlation": 91672 } }, { "ph": "f", "id": 91672, "pid": 76337, "tid": -914061504, "ts": 1716454222755916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810718, "dur": 6, "args": { "External id": 91676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91676, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91676, "pid": 5, "tid": 7, "ts": 1716454222810718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755930, "dur": 11, "args": { "External id": 91676, "cbid": 211, "correlation": 91676 } }, { "ph": "s", "id": 91676, "pid": 76337, "tid": -914061504, "ts": 1716454222755930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222810725, "dur": 315, "args": { "External id": 91678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91678, "pid": 5, "tid": 7, "ts": 1716454222810725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755944, "dur": 6, "args": { "External id": 91678, "cbid": 211, "correlation": 91678 } }, { "ph": "s", "id": 91678, "pid": 76337, "tid": -914061504, "ts": 1716454222755944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222811043, "dur": 1, "args": { "External id": 91680, "device": 5, "context": 1, "stream": 7, "correlation": 91680, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 91680, "pid": 5, "tid": 7, "ts": 1716454222811043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222755955, "dur": 6, "args": { "External id": 91680, "cbid": 51, "correlation": 91680 } }, { "ph": "s", "id": 91680, "pid": 76337, "tid": -914061504, "ts": 1716454222755955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222811046, "dur": 487, "args": { "External id": 91681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91681, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91681, "pid": 5, "tid": 7, "ts": 1716454222811046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755963, "dur": 6, "args": { "External id": 91681, "cbid": 211, "correlation": 91681 } }, { "ph": "s", "id": 91681, "pid": 76337, "tid": -914061504, "ts": 1716454222755963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222811535, "dur": 6, "args": { "External id": 91683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91683, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91683, "pid": 5, "tid": 7, "ts": 1716454222811535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222755972, "dur": 14, "args": { "External id": 91683, "cbid": 211, "correlation": 91683 } }, { "ph": "s", "id": 91683, "pid": 76337, "tid": -914061504, "ts": 1716454222755972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222811542, "dur": 6, "args": { "External id": 91689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91689, "pid": 5, "tid": 7, "ts": 1716454222811542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756011, "dur": 9, "args": { "External id": 91689, "cbid": 211, "correlation": 91689 } }, { "ph": "s", "id": 91689, "pid": 76337, "tid": -914061504, "ts": 1716454222756011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222811549, "dur": 3, "args": { "External id": 91697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91697, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 91697, "pid": 5, "tid": 7, "ts": 1716454222811549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756055, "dur": 10, "args": { "External id": 91697, "cbid": 211, "correlation": 91697 } }, { "ph": "s", "id": 91697, "pid": 76337, "tid": -914061504, "ts": 1716454222756055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222756119, "dur": 1, "args": { "External id": 91713, "cbid": 251, "correlation": 91713 } }, { "ph": "f", "id": 91713, "pid": 76337, "tid": -914061504, "ts": 1716454222756119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222756125, "dur": 0, "args": { "External id": 91715, "cbid": 251, "correlation": 91715 } }, { "ph": "f", "id": 91715, "pid": 76337, "tid": -914061504, "ts": 1716454222756125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222811554, "dur": 12, "args": { "External id": 91716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91716, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91716, "pid": 5, "tid": 7, "ts": 1716454222811554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756127, "dur": 11, "args": { "External id": 91716, "cbid": 211, "correlation": 91716 } }, { "ph": "s", "id": 91716, "pid": 76337, "tid": -914061504, "ts": 1716454222756127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222811567, "dur": 5, "args": { "External id": 91718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91718, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91718, "pid": 5, "tid": 7, "ts": 1716454222811567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756140, "dur": 6, "args": { "External id": 91718, "cbid": 211, "correlation": 91718 } }, { "ph": "s", "id": 91718, "pid": 76337, "tid": -914061504, "ts": 1716454222756140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222811574, "dur": 6, "args": { "External id": 91728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91728, "pid": 5, "tid": 7, "ts": 1716454222811574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756197, "dur": 12, "args": { "External id": 91728, "cbid": 211, "correlation": 91728 } }, { "ph": "s", "id": 91728, "pid": 76337, "tid": -914061504, "ts": 1716454222756197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222811581, "dur": 9, "args": { "External id": 91748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91748, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91748, "pid": 5, "tid": 7, "ts": 1716454222811581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756262, "dur": 11, "args": { "External id": 91748, "cbid": 211, "correlation": 91748 } }, { "ph": "s", "id": 91748, "pid": 76337, "tid": -914061504, "ts": 1716454222756262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222811592, "dur": 4, "args": { "External id": 91760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91760, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 91760, "pid": 5, "tid": 7, "ts": 1716454222811592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756283, "dur": 6, "args": { "External id": 91760, "cbid": 211, "correlation": 91760 } }, { "ph": "s", "id": 91760, "pid": 76337, "tid": -914061504, "ts": 1716454222756283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222811597, "dur": 6, "args": { "External id": 91763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91763, "pid": 5, "tid": 7, "ts": 1716454222811597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756303, "dur": 7, "args": { "External id": 91763, "cbid": 211, "correlation": 91763 } }, { "ph": "s", "id": 91763, "pid": 76337, "tid": -914061504, "ts": 1716454222756303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222811605, "dur": 4, "args": { "External id": 91772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91772, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91772, "pid": 5, "tid": 7, "ts": 1716454222811605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756344, "dur": 10, "args": { "External id": 91772, "cbid": 211, "correlation": 91772 } }, { "ph": "s", "id": 91772, "pid": 76337, "tid": -914061504, "ts": 1716454222756344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222756407, "dur": 0, "args": { "External id": 91782, "cbid": 317, "correlation": 91782 } }, { "ph": "f", "id": 91782, "pid": 76337, "tid": -914061504, "ts": 1716454222756407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222756408, "dur": 0, "args": { "External id": 91783, "cbid": 203, "correlation": 91783 } }, { "ph": "f", "id": 91783, "pid": 76337, "tid": -914061504, "ts": 1716454222756408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222756408, "dur": 0, "args": { "External id": 91784, "cbid": 205, "correlation": 91784 } }, { "ph": "f", "id": 91784, "pid": 76337, "tid": -914061504, "ts": 1716454222756408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222811611, "dur": 5, "args": { "External id": 91788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91788, "pid": 5, "tid": 7, "ts": 1716454222811611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756423, "dur": 12, "args": { "External id": 91788, "cbid": 211, "correlation": 91788 } }, { "ph": "s", "id": 91788, "pid": 76337, "tid": -914061504, "ts": 1716454222756423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222811617, "dur": 159, "args": { "External id": 91790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91790, "pid": 5, "tid": 7, "ts": 1716454222811617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756437, "dur": 6, "args": { "External id": 91790, "cbid": 211, "correlation": 91790 } }, { "ph": "s", "id": 91790, "pid": 76337, "tid": -914061504, "ts": 1716454222756437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222811778, "dur": 1, "args": { "External id": 91792, "device": 5, "context": 1, "stream": 7, "correlation": 91792, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 91792, "pid": 5, "tid": 7, "ts": 1716454222811778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222756449, "dur": 6, "args": { "External id": 91792, "cbid": 51, "correlation": 91792 } }, { "ph": "s", "id": 91792, "pid": 76337, "tid": -914061504, "ts": 1716454222756449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222811781, "dur": 254, "args": { "External id": 91793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91793, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91793, "pid": 5, "tid": 7, "ts": 1716454222811781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756456, "dur": 6, "args": { "External id": 91793, "cbid": 211, "correlation": 91793 } }, { "ph": "s", "id": 91793, "pid": 76337, "tid": -914061504, "ts": 1716454222756456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222812037, "dur": 6, "args": { "External id": 91795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91795, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91795, "pid": 5, "tid": 7, "ts": 1716454222812037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756466, "dur": 5, "args": { "External id": 91795, "cbid": 211, "correlation": 91795 } }, { "ph": "s", "id": 91795, "pid": 76337, "tid": -914061504, "ts": 1716454222756466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222812044, "dur": 6, "args": { "External id": 91801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91801, "pid": 5, "tid": 7, "ts": 1716454222812044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756495, "dur": 8, "args": { "External id": 91801, "cbid": 211, "correlation": 91801 } }, { "ph": "s", "id": 91801, "pid": 76337, "tid": -914061504, "ts": 1716454222756495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222756554, "dur": 0, "args": { "External id": 91811, "cbid": 317, "correlation": 91811 } }, { "ph": "f", "id": 91811, "pid": 76337, "tid": -914061504, "ts": 1716454222756554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222756555, "dur": 0, "args": { "External id": 91812, "cbid": 203, "correlation": 91812 } }, { "ph": "f", "id": 91812, "pid": 76337, "tid": -914061504, "ts": 1716454222756555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222756556, "dur": 0, "args": { "External id": 91813, "cbid": 205, "correlation": 91813 } }, { "ph": "f", "id": 91813, "pid": 76337, "tid": -914061504, "ts": 1716454222756556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222812051, "dur": 7, "args": { "External id": 91817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91817, "pid": 5, "tid": 7, "ts": 1716454222812051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756567, "dur": 11, "args": { "External id": 91817, "cbid": 211, "correlation": 91817 } }, { "ph": "s", "id": 91817, "pid": 76337, "tid": -914061504, "ts": 1716454222756567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222812060, "dur": 3, "args": { "External id": 91819, "device": 5, "context": 1, "stream": 7, "correlation": 91819, "bytes": 4800, "memory bandwidth (GB/s)": 1.5151515151515151 } }, { "ph": "f", "id": 91819, "pid": 5, "tid": 7, "ts": 1716454222812060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222756584, "dur": 11, "args": { "External id": 91819, "cbid": 51, "correlation": 91819 } }, { "ph": "s", "id": 91819, "pid": 76337, "tid": -914061504, "ts": 1716454222756584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222812064, "dur": 95, "args": { "External id": 91820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91820, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 91820, "pid": 5, "tid": 7, "ts": 1716454222812064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756597, "dur": 7, "args": { "External id": 91820, "cbid": 211, "correlation": 91820 } }, { "ph": "s", "id": 91820, "pid": 76337, "tid": -914061504, "ts": 1716454222756597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222812160, "dur": 6, "args": { "External id": 91822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91822, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91822, "pid": 5, "tid": 7, "ts": 1716454222812160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756607, "dur": 5, "args": { "External id": 91822, "cbid": 211, "correlation": 91822 } }, { "ph": "s", "id": 91822, "pid": 76337, "tid": -914061504, "ts": 1716454222756607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222812167, "dur": 6, "args": { "External id": 91828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91828, "pid": 5, "tid": 7, "ts": 1716454222812167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756633, "dur": 10, "args": { "External id": 91828, "cbid": 211, "correlation": 91828 } }, { "ph": "s", "id": 91828, "pid": 76337, "tid": -914061504, "ts": 1716454222756633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222812175, "dur": 5, "args": { "External id": 91836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91836, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91836, "pid": 5, "tid": 7, "ts": 1716454222812175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756665, "dur": 9, "args": { "External id": 91836, "cbid": 211, "correlation": 91836 } }, { "ph": "s", "id": 91836, "pid": 76337, "tid": -914061504, "ts": 1716454222756665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222812181, "dur": 4, "args": { "External id": 91844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91844, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91844, "pid": 5, "tid": 7, "ts": 1716454222812181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756695, "dur": 8, "args": { "External id": 91844, "cbid": 211, "correlation": 91844 } }, { "ph": "s", "id": 91844, "pid": 76337, "tid": -914061504, "ts": 1716454222756695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222812186, "dur": 11, "args": { "External id": 91853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91853, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91853, "pid": 5, "tid": 7, "ts": 1716454222812186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756740, "dur": 11, "args": { "External id": 91853, "cbid": 211, "correlation": 91853 } }, { "ph": "s", "id": 91853, "pid": 76337, "tid": -914061504, "ts": 1716454222756740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222812198, "dur": 12, "args": { "External id": 91873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91873, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91873, "pid": 5, "tid": 7, "ts": 1716454222812198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756810, "dur": 12, "args": { "External id": 91873, "cbid": 211, "correlation": 91873 } }, { "ph": "s", "id": 91873, "pid": 76337, "tid": -914061504, "ts": 1716454222756810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222812212, "dur": 4, "args": { "External id": 91885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91885, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91885, "pid": 5, "tid": 7, "ts": 1716454222812212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756832, "dur": 6, "args": { "External id": 91885, "cbid": 211, "correlation": 91885 } }, { "ph": "s", "id": 91885, "pid": 76337, "tid": -914061504, "ts": 1716454222756832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222812217, "dur": 11, "args": { "External id": 91888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91888, "pid": 5, "tid": 7, "ts": 1716454222812217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756850, "dur": 7, "args": { "External id": 91888, "cbid": 211, "correlation": 91888 } }, { "ph": "s", "id": 91888, "pid": 76337, "tid": -914061504, "ts": 1716454222756850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222812230, "dur": 6, "args": { "External id": 91897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91897, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91897, "pid": 5, "tid": 7, "ts": 1716454222812230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756888, "dur": 10, "args": { "External id": 91897, "cbid": 211, "correlation": 91897 } }, { "ph": "s", "id": 91897, "pid": 76337, "tid": -914061504, "ts": 1716454222756888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222756940, "dur": 0, "args": { "External id": 91907, "cbid": 317, "correlation": 91907 } }, { "ph": "f", "id": 91907, "pid": 76337, "tid": -914061504, "ts": 1716454222756940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222756941, "dur": 0, "args": { "External id": 91908, "cbid": 203, "correlation": 91908 } }, { "ph": "f", "id": 91908, "pid": 76337, "tid": -914061504, "ts": 1716454222756941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222756942, "dur": 0, "args": { "External id": 91909, "cbid": 205, "correlation": 91909 } }, { "ph": "f", "id": 91909, "pid": 76337, "tid": -914061504, "ts": 1716454222756942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222812237, "dur": 6, "args": { "External id": 91913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91913, "pid": 5, "tid": 7, "ts": 1716454222812237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756955, "dur": 11, "args": { "External id": 91913, "cbid": 211, "correlation": 91913 } }, { "ph": "s", "id": 91913, "pid": 76337, "tid": -914061504, "ts": 1716454222756955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222812245, "dur": 314, "args": { "External id": 91915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91915, "pid": 5, "tid": 7, "ts": 1716454222812245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756969, "dur": 13, "args": { "External id": 91915, "cbid": 211, "correlation": 91915 } }, { "ph": "s", "id": 91915, "pid": 76337, "tid": -914061504, "ts": 1716454222756969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222812561, "dur": 1, "args": { "External id": 91917, "device": 5, "context": 1, "stream": 7, "correlation": 91917, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 91917, "pid": 5, "tid": 7, "ts": 1716454222812561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222756987, "dur": 7, "args": { "External id": 91917, "cbid": 51, "correlation": 91917 } }, { "ph": "s", "id": 91917, "pid": 76337, "tid": -914061504, "ts": 1716454222756987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222812564, "dur": 489, "args": { "External id": 91918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91918, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91918, "pid": 5, "tid": 7, "ts": 1716454222812564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222756995, "dur": 7, "args": { "External id": 91918, "cbid": 211, "correlation": 91918 } }, { "ph": "s", "id": 91918, "pid": 76337, "tid": -914061504, "ts": 1716454222756995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813055, "dur": 6, "args": { "External id": 91920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 91920, "pid": 5, "tid": 7, "ts": 1716454222813055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757006, "dur": 5, "args": { "External id": 91920, "cbid": 211, "correlation": 91920 } }, { "ph": "s", "id": 91920, "pid": 76337, "tid": -914061504, "ts": 1716454222757006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222813062, "dur": 6, "args": { "External id": 91926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91926, "pid": 5, "tid": 7, "ts": 1716454222813062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757035, "dur": 10, "args": { "External id": 91926, "cbid": 211, "correlation": 91926 } }, { "ph": "s", "id": 91926, "pid": 76337, "tid": -914061504, "ts": 1716454222757035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222813069, "dur": 3, "args": { "External id": 91934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91934, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 91934, "pid": 5, "tid": 7, "ts": 1716454222813069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757079, "dur": 9, "args": { "External id": 91934, "cbid": 211, "correlation": 91934 } }, { "ph": "s", "id": 91934, "pid": 76337, "tid": -914061504, "ts": 1716454222757079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222757142, "dur": 1, "args": { "External id": 91950, "cbid": 251, "correlation": 91950 } }, { "ph": "f", "id": 91950, "pid": 76337, "tid": -914061504, "ts": 1716454222757142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222757148, "dur": 0, "args": { "External id": 91952, "cbid": 251, "correlation": 91952 } }, { "ph": "f", "id": 91952, "pid": 76337, "tid": -914061504, "ts": 1716454222757148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222813073, "dur": 13, "args": { "External id": 91953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91953, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91953, "pid": 5, "tid": 7, "ts": 1716454222813073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757150, "dur": 11, "args": { "External id": 91953, "cbid": 211, "correlation": 91953 } }, { "ph": "s", "id": 91953, "pid": 76337, "tid": -914061504, "ts": 1716454222757150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222813088, "dur": 5, "args": { "External id": 91955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91955, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 91955, "pid": 5, "tid": 7, "ts": 1716454222813088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757163, "dur": 5, "args": { "External id": 91955, "cbid": 211, "correlation": 91955 } }, { "ph": "s", "id": 91955, "pid": 76337, "tid": -914061504, "ts": 1716454222757163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222813094, "dur": 6, "args": { "External id": 91965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 91965, "pid": 5, "tid": 7, "ts": 1716454222813094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757220, "dur": 12, "args": { "External id": 91965, "cbid": 211, "correlation": 91965 } }, { "ph": "s", "id": 91965, "pid": 76337, "tid": -914061504, "ts": 1716454222757220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222813101, "dur": 9, "args": { "External id": 91985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91985, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 91985, "pid": 5, "tid": 7, "ts": 1716454222813101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757286, "dur": 11, "args": { "External id": 91985, "cbid": 211, "correlation": 91985 } }, { "ph": "s", "id": 91985, "pid": 76337, "tid": -914061504, "ts": 1716454222757286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222813111, "dur": 3, "args": { "External id": 91997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 91997, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 91997, "pid": 5, "tid": 7, "ts": 1716454222813111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757307, "dur": 7, "args": { "External id": 91997, "cbid": 211, "correlation": 91997 } }, { "ph": "s", "id": 91997, "pid": 76337, "tid": -914061504, "ts": 1716454222757307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222813116, "dur": 6, "args": { "External id": 92000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92000, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92000, "pid": 5, "tid": 7, "ts": 1716454222813116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757327, "dur": 7, "args": { "External id": 92000, "cbid": 211, "correlation": 92000 } }, { "ph": "s", "id": 92000, "pid": 76337, "tid": -914061504, "ts": 1716454222757327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222813124, "dur": 4, "args": { "External id": 92009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92009, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92009, "pid": 5, "tid": 7, "ts": 1716454222813124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757369, "dur": 10, "args": { "External id": 92009, "cbid": 211, "correlation": 92009 } }, { "ph": "s", "id": 92009, "pid": 76337, "tid": -914061504, "ts": 1716454222757369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222757433, "dur": 0, "args": { "External id": 92019, "cbid": 317, "correlation": 92019 } }, { "ph": "f", "id": 92019, "pid": 76337, "tid": -914061504, "ts": 1716454222757433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222757434, "dur": 0, "args": { "External id": 92020, "cbid": 203, "correlation": 92020 } }, { "ph": "f", "id": 92020, "pid": 76337, "tid": -914061504, "ts": 1716454222757434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222757435, "dur": 0, "args": { "External id": 92021, "cbid": 205, "correlation": 92021 } }, { "ph": "f", "id": 92021, "pid": 76337, "tid": -914061504, "ts": 1716454222757435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813130, "dur": 5, "args": { "External id": 92025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92025, "pid": 5, "tid": 7, "ts": 1716454222813130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757449, "dur": 12, "args": { "External id": 92025, "cbid": 211, "correlation": 92025 } }, { "ph": "s", "id": 92025, "pid": 76337, "tid": -914061504, "ts": 1716454222757449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813136, "dur": 160, "args": { "External id": 92027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92027, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92027, "pid": 5, "tid": 7, "ts": 1716454222813136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757463, "dur": 5, "args": { "External id": 92027, "cbid": 211, "correlation": 92027 } }, { "ph": "s", "id": 92027, "pid": 76337, "tid": -914061504, "ts": 1716454222757463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222813298, "dur": 1, "args": { "External id": 92029, "device": 5, "context": 1, "stream": 7, "correlation": 92029, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 92029, "pid": 5, "tid": 7, "ts": 1716454222813298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222757474, "dur": 7, "args": { "External id": 92029, "cbid": 51, "correlation": 92029 } }, { "ph": "s", "id": 92029, "pid": 76337, "tid": -914061504, "ts": 1716454222757474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222813301, "dur": 254, "args": { "External id": 92030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92030, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92030, "pid": 5, "tid": 7, "ts": 1716454222813301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757482, "dur": 6, "args": { "External id": 92030, "cbid": 211, "correlation": 92030 } }, { "ph": "s", "id": 92030, "pid": 76337, "tid": -914061504, "ts": 1716454222757482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813557, "dur": 6, "args": { "External id": 92032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92032, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92032, "pid": 5, "tid": 7, "ts": 1716454222813557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757492, "dur": 5, "args": { "External id": 92032, "cbid": 211, "correlation": 92032 } }, { "ph": "s", "id": 92032, "pid": 76337, "tid": -914061504, "ts": 1716454222757492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222813564, "dur": 6, "args": { "External id": 92038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92038, "pid": 5, "tid": 7, "ts": 1716454222813564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757520, "dur": 9, "args": { "External id": 92038, "cbid": 211, "correlation": 92038 } }, { "ph": "s", "id": 92038, "pid": 76337, "tid": -914061504, "ts": 1716454222757520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222757578, "dur": 0, "args": { "External id": 92048, "cbid": 317, "correlation": 92048 } }, { "ph": "f", "id": 92048, "pid": 76337, "tid": -914061504, "ts": 1716454222757578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222757579, "dur": 0, "args": { "External id": 92049, "cbid": 203, "correlation": 92049 } }, { "ph": "f", "id": 92049, "pid": 76337, "tid": -914061504, "ts": 1716454222757579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222757580, "dur": 0, "args": { "External id": 92050, "cbid": 205, "correlation": 92050 } }, { "ph": "f", "id": 92050, "pid": 76337, "tid": -914061504, "ts": 1716454222757580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813571, "dur": 8, "args": { "External id": 92054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92054, "pid": 5, "tid": 7, "ts": 1716454222813571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757592, "dur": 11, "args": { "External id": 92054, "cbid": 211, "correlation": 92054 } }, { "ph": "s", "id": 92054, "pid": 76337, "tid": -914061504, "ts": 1716454222757592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222813580, "dur": 3, "args": { "External id": 92056, "device": 5, "context": 1, "stream": 7, "correlation": 92056, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 92056, "pid": 5, "tid": 7, "ts": 1716454222813580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222757608, "dur": 10, "args": { "External id": 92056, "cbid": 51, "correlation": 92056 } }, { "ph": "s", "id": 92056, "pid": 76337, "tid": -914061504, "ts": 1716454222757608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222813584, "dur": 92, "args": { "External id": 92057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92057, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 92057, "pid": 5, "tid": 7, "ts": 1716454222813584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757619, "dur": 6, "args": { "External id": 92057, "cbid": 211, "correlation": 92057 } }, { "ph": "s", "id": 92057, "pid": 76337, "tid": -914061504, "ts": 1716454222757619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813678, "dur": 6, "args": { "External id": 92059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92059, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92059, "pid": 5, "tid": 7, "ts": 1716454222813678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757629, "dur": 5, "args": { "External id": 92059, "cbid": 211, "correlation": 92059 } }, { "ph": "s", "id": 92059, "pid": 76337, "tid": -914061504, "ts": 1716454222757629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222813685, "dur": 6, "args": { "External id": 92065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92065, "pid": 5, "tid": 7, "ts": 1716454222813685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757656, "dur": 8, "args": { "External id": 92065, "cbid": 211, "correlation": 92065 } }, { "ph": "s", "id": 92065, "pid": 76337, "tid": -914061504, "ts": 1716454222757656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222813692, "dur": 5, "args": { "External id": 92073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92073, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92073, "pid": 5, "tid": 7, "ts": 1716454222813692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757685, "dur": 8, "args": { "External id": 92073, "cbid": 211, "correlation": 92073 } }, { "ph": "s", "id": 92073, "pid": 76337, "tid": -914061504, "ts": 1716454222757685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222813699, "dur": 4, "args": { "External id": 92081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92081, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 92081, "pid": 5, "tid": 7, "ts": 1716454222813699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757715, "dur": 8, "args": { "External id": 92081, "cbid": 211, "correlation": 92081 } }, { "ph": "s", "id": 92081, "pid": 76337, "tid": -914061504, "ts": 1716454222757715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222813704, "dur": 14, "args": { "External id": 92092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92092, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92092, "pid": 5, "tid": 7, "ts": 1716454222813704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757798, "dur": 14, "args": { "External id": 92092, "cbid": 211, "correlation": 92092 } }, { "ph": "s", "id": 92092, "pid": 76337, "tid": -914061504, "ts": 1716454222757798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222757855, "dur": 0, "args": { "External id": 92102, "cbid": 317, "correlation": 92102 } }, { "ph": "f", "id": 92102, "pid": 76337, "tid": -914061504, "ts": 1716454222757855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222757856, "dur": 0, "args": { "External id": 92103, "cbid": 203, "correlation": 92103 } }, { "ph": "f", "id": 92103, "pid": 76337, "tid": -914061504, "ts": 1716454222757856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222757857, "dur": 0, "args": { "External id": 92104, "cbid": 205, "correlation": 92104 } }, { "ph": "f", "id": 92104, "pid": 76337, "tid": -914061504, "ts": 1716454222757857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813719, "dur": 8, "args": { "External id": 92108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92108, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92108, "pid": 5, "tid": 7, "ts": 1716454222813719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757872, "dur": 12, "args": { "External id": 92108, "cbid": 211, "correlation": 92108 } }, { "ph": "s", "id": 92108, "pid": 76337, "tid": -914061504, "ts": 1716454222757872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222813729, "dur": 160, "args": { "External id": 92110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92110, "pid": 5, "tid": 7, "ts": 1716454222813729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757887, "dur": 5, "args": { "External id": 92110, "cbid": 211, "correlation": 92110 } }, { "ph": "s", "id": 92110, "pid": 76337, "tid": -914061504, "ts": 1716454222757887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222813891, "dur": 1, "args": { "External id": 92112, "device": 5, "context": 1, "stream": 7, "correlation": 92112, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 92112, "pid": 5, "tid": 7, "ts": 1716454222813891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222757898, "dur": 6, "args": { "External id": 92112, "cbid": 51, "correlation": 92112 } }, { "ph": "s", "id": 92112, "pid": 76337, "tid": -914061504, "ts": 1716454222757898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222813895, "dur": 642, "args": { "External id": 92113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92113, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92113, "pid": 5, "tid": 7, "ts": 1716454222813895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757905, "dur": 6, "args": { "External id": 92113, "cbid": 211, "correlation": 92113 } }, { "ph": "s", "id": 92113, "pid": 76337, "tid": -914061504, "ts": 1716454222757905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222814538, "dur": 12, "args": { "External id": 92115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92115, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92115, "pid": 5, "tid": 7, "ts": 1716454222814538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757915, "dur": 6, "args": { "External id": 92115, "cbid": 211, "correlation": 92115 } }, { "ph": "s", "id": 92115, "pid": 76337, "tid": -914061504, "ts": 1716454222757915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222814552, "dur": 14, "args": { "External id": 92121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92121, "pid": 5, "tid": 7, "ts": 1716454222814552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222757944, "dur": 9, "args": { "External id": 92121, "cbid": 211, "correlation": 92121 } }, { "ph": "s", "id": 92121, "pid": 76337, "tid": -914061504, "ts": 1716454222757944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222814567, "dur": 29, "args": { "External id": 92130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92130, "pid": 5, "tid": 7, "ts": 1716454222814567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758046, "dur": 13, "args": { "External id": 92130, "cbid": 211, "correlation": 92130 } }, { "ph": "s", "id": 92130, "pid": 76337, "tid": -914061504, "ts": 1716454222758046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222814598, "dur": 29, "args": { "External id": 92150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92150, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 92150, "pid": 5, "tid": 7, "ts": 1716454222814598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758115, "dur": 11, "args": { "External id": 92150, "cbid": 211, "correlation": 92150 } }, { "ph": "s", "id": 92150, "pid": 76337, "tid": -914061504, "ts": 1716454222758115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222814629, "dur": 4, "args": { "External id": 92162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92162, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92162, "pid": 5, "tid": 7, "ts": 1716454222814629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758136, "dur": 7, "args": { "External id": 92162, "cbid": 211, "correlation": 92162 } }, { "ph": "s", "id": 92162, "pid": 76337, "tid": -914061504, "ts": 1716454222758136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222814634, "dur": 30, "args": { "External id": 92165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92165, "pid": 5, "tid": 7, "ts": 1716454222814634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758155, "dur": 6, "args": { "External id": 92165, "cbid": 211, "correlation": 92165 } }, { "ph": "s", "id": 92165, "pid": 76337, "tid": -914061504, "ts": 1716454222758155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222814665, "dur": 21, "args": { "External id": 92174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92174, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92174, "pid": 5, "tid": 7, "ts": 1716454222814665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758194, "dur": 10, "args": { "External id": 92174, "cbid": 211, "correlation": 92174 } }, { "ph": "s", "id": 92174, "pid": 76337, "tid": -914061504, "ts": 1716454222758194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222758247, "dur": 0, "args": { "External id": 92184, "cbid": 317, "correlation": 92184 } }, { "ph": "f", "id": 92184, "pid": 76337, "tid": -914061504, "ts": 1716454222758247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222758248, "dur": 0, "args": { "External id": 92185, "cbid": 203, "correlation": 92185 } }, { "ph": "f", "id": 92185, "pid": 76337, "tid": -914061504, "ts": 1716454222758248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222758249, "dur": 0, "args": { "External id": 92186, "cbid": 205, "correlation": 92186 } }, { "ph": "f", "id": 92186, "pid": 76337, "tid": -914061504, "ts": 1716454222758249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222814687, "dur": 22, "args": { "External id": 92190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92190, "pid": 5, "tid": 7, "ts": 1716454222814687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758264, "dur": 11, "args": { "External id": 92190, "cbid": 211, "correlation": 92190 } }, { "ph": "s", "id": 92190, "pid": 76337, "tid": -914061504, "ts": 1716454222758264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222814711, "dur": 315, "args": { "External id": 92192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92192, "pid": 5, "tid": 7, "ts": 1716454222814711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758278, "dur": 5, "args": { "External id": 92192, "cbid": 211, "correlation": 92192 } }, { "ph": "s", "id": 92192, "pid": 76337, "tid": -914061504, "ts": 1716454222758278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222815027, "dur": 1, "args": { "External id": 92194, "device": 5, "context": 1, "stream": 7, "correlation": 92194, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 92194, "pid": 5, "tid": 7, "ts": 1716454222815027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222758289, "dur": 6, "args": { "External id": 92194, "cbid": 51, "correlation": 92194 } }, { "ph": "s", "id": 92194, "pid": 76337, "tid": -914061504, "ts": 1716454222758289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222815031, "dur": 1227, "args": { "External id": 92195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92195, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92195, "pid": 5, "tid": 7, "ts": 1716454222815031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758296, "dur": 6, "args": { "External id": 92195, "cbid": 211, "correlation": 92195 } }, { "ph": "s", "id": 92195, "pid": 76337, "tid": -914061504, "ts": 1716454222758296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222816259, "dur": 12, "args": { "External id": 92197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92197, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92197, "pid": 5, "tid": 7, "ts": 1716454222816259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758306, "dur": 5, "args": { "External id": 92197, "cbid": 211, "correlation": 92197 } }, { "ph": "s", "id": 92197, "pid": 76337, "tid": -914061504, "ts": 1716454222758306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222816272, "dur": 15, "args": { "External id": 92203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92203, "pid": 5, "tid": 7, "ts": 1716454222816272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758335, "dur": 8, "args": { "External id": 92203, "cbid": 211, "correlation": 92203 } }, { "ph": "s", "id": 92203, "pid": 76337, "tid": -914061504, "ts": 1716454222758335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222816288, "dur": 3, "args": { "External id": 92211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92211, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 92211, "pid": 5, "tid": 7, "ts": 1716454222816288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758379, "dur": 10, "args": { "External id": 92211, "cbid": 211, "correlation": 92211 } }, { "ph": "s", "id": 92211, "pid": 76337, "tid": -914061504, "ts": 1716454222758379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222758446, "dur": 1, "args": { "External id": 92227, "cbid": 251, "correlation": 92227 } }, { "ph": "f", "id": 92227, "pid": 76337, "tid": -914061504, "ts": 1716454222758446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222758451, "dur": 0, "args": { "External id": 92229, "cbid": 251, "correlation": 92229 } }, { "ph": "f", "id": 92229, "pid": 76337, "tid": -914061504, "ts": 1716454222758451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222816293, "dur": 12, "args": { "External id": 92230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92230, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92230, "pid": 5, "tid": 7, "ts": 1716454222816293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758453, "dur": 12, "args": { "External id": 92230, "cbid": 211, "correlation": 92230 } }, { "ph": "s", "id": 92230, "pid": 76337, "tid": -914061504, "ts": 1716454222758453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222816306, "dur": 5, "args": { "External id": 92232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92232, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92232, "pid": 5, "tid": 7, "ts": 1716454222816306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758467, "dur": 6, "args": { "External id": 92232, "cbid": 211, "correlation": 92232 } }, { "ph": "s", "id": 92232, "pid": 76337, "tid": -914061504, "ts": 1716454222758467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222816312, "dur": 17, "args": { "External id": 92242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92242, "pid": 5, "tid": 7, "ts": 1716454222816312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758525, "dur": 13, "args": { "External id": 92242, "cbid": 211, "correlation": 92242 } }, { "ph": "s", "id": 92242, "pid": 76337, "tid": -914061504, "ts": 1716454222758525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222816330, "dur": 17, "args": { "External id": 92262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92262, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 92262, "pid": 5, "tid": 7, "ts": 1716454222816330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758592, "dur": 11, "args": { "External id": 92262, "cbid": 211, "correlation": 92262 } }, { "ph": "s", "id": 92262, "pid": 76337, "tid": -914061504, "ts": 1716454222758592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222816349, "dur": 4, "args": { "External id": 92274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92274, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 92274, "pid": 5, "tid": 7, "ts": 1716454222816349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758613, "dur": 6, "args": { "External id": 92274, "cbid": 211, "correlation": 92274 } }, { "ph": "s", "id": 92274, "pid": 76337, "tid": -914061504, "ts": 1716454222758613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222816354, "dur": 16, "args": { "External id": 92277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92277, "pid": 5, "tid": 7, "ts": 1716454222816354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758632, "dur": 7, "args": { "External id": 92277, "cbid": 211, "correlation": 92277 } }, { "ph": "s", "id": 92277, "pid": 76337, "tid": -914061504, "ts": 1716454222758632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222816372, "dur": 11, "args": { "External id": 92286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92286, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92286, "pid": 5, "tid": 7, "ts": 1716454222816372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758672, "dur": 10, "args": { "External id": 92286, "cbid": 211, "correlation": 92286 } }, { "ph": "s", "id": 92286, "pid": 76337, "tid": -914061504, "ts": 1716454222758672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222758734, "dur": 0, "args": { "External id": 92296, "cbid": 317, "correlation": 92296 } }, { "ph": "f", "id": 92296, "pid": 76337, "tid": -914061504, "ts": 1716454222758734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222758735, "dur": 0, "args": { "External id": 92297, "cbid": 203, "correlation": 92297 } }, { "ph": "f", "id": 92297, "pid": 76337, "tid": -914061504, "ts": 1716454222758735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222758736, "dur": 0, "args": { "External id": 92298, "cbid": 205, "correlation": 92298 } }, { "ph": "f", "id": 92298, "pid": 76337, "tid": -914061504, "ts": 1716454222758736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222816384, "dur": 11, "args": { "External id": 92302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92302, "pid": 5, "tid": 7, "ts": 1716454222816384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758750, "dur": 12, "args": { "External id": 92302, "cbid": 211, "correlation": 92302 } }, { "ph": "s", "id": 92302, "pid": 76337, "tid": -914061504, "ts": 1716454222758750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222816396, "dur": 160, "args": { "External id": 92304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92304, "pid": 5, "tid": 7, "ts": 1716454222816396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758764, "dur": 5, "args": { "External id": 92304, "cbid": 211, "correlation": 92304 } }, { "ph": "s", "id": 92304, "pid": 76337, "tid": -914061504, "ts": 1716454222758764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222816557, "dur": 1, "args": { "External id": 92306, "device": 5, "context": 1, "stream": 7, "correlation": 92306, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 92306, "pid": 5, "tid": 7, "ts": 1716454222816557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222758775, "dur": 7, "args": { "External id": 92306, "cbid": 51, "correlation": 92306 } }, { "ph": "s", "id": 92306, "pid": 76337, "tid": -914061504, "ts": 1716454222758775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222816561, "dur": 639, "args": { "External id": 92307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92307, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92307, "pid": 5, "tid": 7, "ts": 1716454222816561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758783, "dur": 6, "args": { "External id": 92307, "cbid": 211, "correlation": 92307 } }, { "ph": "s", "id": 92307, "pid": 76337, "tid": -914061504, "ts": 1716454222758783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222817201, "dur": 12, "args": { "External id": 92309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92309, "pid": 5, "tid": 7, "ts": 1716454222817201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758793, "dur": 5, "args": { "External id": 92309, "cbid": 211, "correlation": 92309 } }, { "ph": "s", "id": 92309, "pid": 76337, "tid": -914061504, "ts": 1716454222758793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222817214, "dur": 15, "args": { "External id": 92315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92315, "pid": 5, "tid": 7, "ts": 1716454222817214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758822, "dur": 9, "args": { "External id": 92315, "cbid": 211, "correlation": 92315 } }, { "ph": "s", "id": 92315, "pid": 76337, "tid": -914061504, "ts": 1716454222758822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222758880, "dur": 0, "args": { "External id": 92325, "cbid": 317, "correlation": 92325 } }, { "ph": "f", "id": 92325, "pid": 76337, "tid": -914061504, "ts": 1716454222758880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222758881, "dur": 0, "args": { "External id": 92326, "cbid": 203, "correlation": 92326 } }, { "ph": "f", "id": 92326, "pid": 76337, "tid": -914061504, "ts": 1716454222758881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222758882, "dur": 0, "args": { "External id": 92327, "cbid": 205, "correlation": 92327 } }, { "ph": "f", "id": 92327, "pid": 76337, "tid": -914061504, "ts": 1716454222758882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222817230, "dur": 21, "args": { "External id": 92331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92331, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92331, "pid": 5, "tid": 7, "ts": 1716454222817230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758895, "dur": 11, "args": { "External id": 92331, "cbid": 211, "correlation": 92331 } }, { "ph": "s", "id": 92331, "pid": 76337, "tid": -914061504, "ts": 1716454222758895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222817252, "dur": 4, "args": { "External id": 92333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92333, "pid": 5, "tid": 7, "ts": 1716454222817252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758913, "dur": 7, "args": { "External id": 92333, "cbid": 211, "correlation": 92333 } }, { "ph": "s", "id": 92333, "pid": 76337, "tid": -914061504, "ts": 1716454222758913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222758925, "dur": 0, "args": { "External id": 92334, "cbid": 51, "correlation": 92334 } }, { "ph": "s", "id": 92334, "pid": 76337, "tid": -914061504, "ts": 1716454222758925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222817257, "dur": 172, "args": { "External id": 92335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92335, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 92335, "pid": 5, "tid": 7, "ts": 1716454222817257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758926, "dur": 7, "args": { "External id": 92335, "cbid": 211, "correlation": 92335 } }, { "ph": "s", "id": 92335, "pid": 76337, "tid": -914061504, "ts": 1716454222758926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222817430, "dur": 16, "args": { "External id": 92340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92340, "pid": 5, "tid": 7, "ts": 1716454222817430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758953, "dur": 8, "args": { "External id": 92340, "cbid": 211, "correlation": 92340 } }, { "ph": "s", "id": 92340, "pid": 76337, "tid": -914061504, "ts": 1716454222758953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222817447, "dur": 11, "args": { "External id": 92348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92348, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92348, "pid": 5, "tid": 7, "ts": 1716454222817447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222758991, "dur": 9, "args": { "External id": 92348, "cbid": 211, "correlation": 92348 } }, { "ph": "s", "id": 92348, "pid": 76337, "tid": -914061504, "ts": 1716454222758991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222817460, "dur": 10, "args": { "External id": 92356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92356, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92356, "pid": 5, "tid": 7, "ts": 1716454222817460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759021, "dur": 9, "args": { "External id": 92356, "cbid": 211, "correlation": 92356 } }, { "ph": "s", "id": 92356, "pid": 76337, "tid": -914061504, "ts": 1716454222759021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222817472, "dur": 18, "args": { "External id": 92376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92376, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 92376, "pid": 5, "tid": 7, "ts": 1716454222817472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759106, "dur": 12, "args": { "External id": 92376, "cbid": 211, "correlation": 92376 } }, { "ph": "s", "id": 92376, "pid": 76337, "tid": -914061504, "ts": 1716454222759106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222817491, "dur": 5, "args": { "External id": 92388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92388, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 92388, "pid": 5, "tid": 7, "ts": 1716454222817491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759129, "dur": 7, "args": { "External id": 92388, "cbid": 211, "correlation": 92388 } }, { "ph": "s", "id": 92388, "pid": 76337, "tid": -914061504, "ts": 1716454222759129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222817497, "dur": 16, "args": { "External id": 92391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92391, "pid": 5, "tid": 7, "ts": 1716454222817497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759148, "dur": 7, "args": { "External id": 92391, "cbid": 211, "correlation": 92391 } }, { "ph": "s", "id": 92391, "pid": 76337, "tid": -914061504, "ts": 1716454222759148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222759205, "dur": 0, "args": { "External id": 92402, "cbid": 317, "correlation": 92402 } }, { "ph": "f", "id": 92402, "pid": 76337, "tid": -914061504, "ts": 1716454222759205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222759205, "dur": 0, "args": { "External id": 92403, "cbid": 203, "correlation": 92403 } }, { "ph": "f", "id": 92403, "pid": 76337, "tid": -914061504, "ts": 1716454222759205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222759206, "dur": 0, "args": { "External id": 92404, "cbid": 205, "correlation": 92404 } }, { "ph": "f", "id": 92404, "pid": 76337, "tid": -914061504, "ts": 1716454222759206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222817514, "dur": 11, "args": { "External id": 92408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92408, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92408, "pid": 5, "tid": 7, "ts": 1716454222817514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759220, "dur": 12, "args": { "External id": 92408, "cbid": 211, "correlation": 92408 } }, { "ph": "s", "id": 92408, "pid": 76337, "tid": -914061504, "ts": 1716454222759220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222817526, "dur": 3, "args": { "External id": 92410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92410, "pid": 5, "tid": 7, "ts": 1716454222817526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759237, "dur": 6, "args": { "External id": 92410, "cbid": 211, "correlation": 92410 } }, { "ph": "s", "id": 92410, "pid": 76337, "tid": -914061504, "ts": 1716454222759237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222759245, "dur": 0, "args": { "External id": 92411, "cbid": 51, "correlation": 92411 } }, { "ph": "s", "id": 92411, "pid": 76337, "tid": -914061504, "ts": 1716454222759245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222817531, "dur": 88, "args": { "External id": 92412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92412, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 92412, "pid": 5, "tid": 7, "ts": 1716454222817531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759246, "dur": 5, "args": { "External id": 92412, "cbid": 211, "correlation": 92412 } }, { "ph": "s", "id": 92412, "pid": 76337, "tid": -914061504, "ts": 1716454222759246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222817621, "dur": 15, "args": { "External id": 92417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92417, "pid": 5, "tid": 7, "ts": 1716454222817621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759273, "dur": 8, "args": { "External id": 92417, "cbid": 211, "correlation": 92417 } }, { "ph": "s", "id": 92417, "pid": 76337, "tid": -914061504, "ts": 1716454222759273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222817637, "dur": 82, "args": { "External id": 92426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92426, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92426, "pid": 5, "tid": 7, "ts": 1716454222817637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759360, "dur": 15, "args": { "External id": 92426, "cbid": 211, "correlation": 92426 } }, { "ph": "s", "id": 92426, "pid": 76337, "tid": -914061504, "ts": 1716454222759360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222817720, "dur": 30, "args": { "External id": 92448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92448, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92448, "pid": 5, "tid": 7, "ts": 1716454222817720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759422, "dur": 10, "args": { "External id": 92448, "cbid": 211, "correlation": 92448 } }, { "ph": "s", "id": 92448, "pid": 76337, "tid": -914061504, "ts": 1716454222759422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222759525, "dur": 2, "args": { "External id": 92459, "cbid": 251, "correlation": 92459 } }, { "ph": "f", "id": 92459, "pid": 76337, "tid": -914061504, "ts": 1716454222759525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222817751, "dur": 161, "args": { "External id": 92460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92460, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92460, "pid": 5, "tid": 7, "ts": 1716454222817751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759531, "dur": 13, "args": { "External id": 92460, "cbid": 211, "correlation": 92460 } }, { "ph": "s", "id": 92460, "pid": 76337, "tid": -914061504, "ts": 1716454222759531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222759603, "dur": 1, "args": { "External id": 92471, "cbid": 251, "correlation": 92471 } }, { "ph": "f", "id": 92471, "pid": 76337, "tid": -914061504, "ts": 1716454222759603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222817913, "dur": 157, "args": { "External id": 92472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92472, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92472, "pid": 5, "tid": 7, "ts": 1716454222817913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759607, "dur": 12, "args": { "External id": 92472, "cbid": 211, "correlation": 92472 } }, { "ph": "s", "id": 92472, "pid": 76337, "tid": -914061504, "ts": 1716454222759607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222759674, "dur": 1, "args": { "External id": 92483, "cbid": 251, "correlation": 92483 } }, { "ph": "f", "id": 92483, "pid": 76337, "tid": -914061504, "ts": 1716454222759674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222818072, "dur": 133, "args": { "External id": 92484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92484, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92484, "pid": 5, "tid": 7, "ts": 1716454222818072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759679, "dur": 11, "args": { "External id": 92484, "cbid": 211, "correlation": 92484 } }, { "ph": "s", "id": 92484, "pid": 76337, "tid": -914061504, "ts": 1716454222759679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222818207, "dur": 330, "args": { "External id": 92509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92509, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92509, "pid": 5, "tid": 7, "ts": 1716454222818207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759771, "dur": 14, "args": { "External id": 92509, "cbid": 211, "correlation": 92509 } }, { "ph": "s", "id": 92509, "pid": 76337, "tid": -914061504, "ts": 1716454222759771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222759879, "dur": 2, "args": { "External id": 92527, "cbid": 251, "correlation": 92527 } }, { "ph": "f", "id": 92527, "pid": 76337, "tid": -914061504, "ts": 1716454222759879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222818539, "dur": 165, "args": { "External id": 92529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92529, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92529, "pid": 5, "tid": 7, "ts": 1716454222818539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759885, "dur": 14, "args": { "External id": 92529, "cbid": 211, "correlation": 92529 } }, { "ph": "s", "id": 92529, "pid": 76337, "tid": -914061504, "ts": 1716454222759885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222818705, "dur": 19, "args": { "External id": 92537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92537, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92537, "pid": 5, "tid": 7, "ts": 1716454222818705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222759959, "dur": 12, "args": { "External id": 92537, "cbid": 211, "correlation": 92537 } }, { "ph": "s", "id": 92537, "pid": 76337, "tid": -914061504, "ts": 1716454222759959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222818726, "dur": 28, "args": { "External id": 92545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92545, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92545, "pid": 5, "tid": 7, "ts": 1716454222818726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760008, "dur": 10, "args": { "External id": 92545, "cbid": 211, "correlation": 92545 } }, { "ph": "s", "id": 92545, "pid": 76337, "tid": -914061504, "ts": 1716454222760008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222818755, "dur": 19, "args": { "External id": 92556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92556, "pid": 5, "tid": 7, "ts": 1716454222818755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760088, "dur": 13, "args": { "External id": 92556, "cbid": 211, "correlation": 92556 } }, { "ph": "s", "id": 92556, "pid": 76337, "tid": -914061504, "ts": 1716454222760088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222818775, "dur": 16, "args": { "External id": 92578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92578, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92578, "pid": 5, "tid": 7, "ts": 1716454222818775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760122, "dur": 8, "args": { "External id": 92578, "cbid": 211, "correlation": 92578 } }, { "ph": "s", "id": 92578, "pid": 76337, "tid": -914061504, "ts": 1716454222760122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760209, "dur": 2, "args": { "External id": 92589, "cbid": 251, "correlation": 92589 } }, { "ph": "f", "id": 92589, "pid": 76337, "tid": -914061504, "ts": 1716454222760209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222818792, "dur": 87, "args": { "External id": 92590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92590, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92590, "pid": 5, "tid": 7, "ts": 1716454222818792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760215, "dur": 14, "args": { "External id": 92590, "cbid": 211, "correlation": 92590 } }, { "ph": "s", "id": 92590, "pid": 76337, "tid": -914061504, "ts": 1716454222760215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760290, "dur": 1, "args": { "External id": 92601, "cbid": 251, "correlation": 92601 } }, { "ph": "f", "id": 92601, "pid": 76337, "tid": -914061504, "ts": 1716454222760290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760294, "dur": 0, "args": { "External id": 92602, "cbid": 251, "correlation": 92602 } }, { "ph": "f", "id": 92602, "pid": 76337, "tid": -914061504, "ts": 1716454222760294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222818881, "dur": 12, "args": { "External id": 92603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92603, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92603, "pid": 5, "tid": 7, "ts": 1716454222818881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760296, "dur": 12, "args": { "External id": 92603, "cbid": 211, "correlation": 92603 } }, { "ph": "s", "id": 92603, "pid": 76337, "tid": -914061504, "ts": 1716454222760296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222818894, "dur": 6, "args": { "External id": 92605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92605, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92605, "pid": 5, "tid": 7, "ts": 1716454222818894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760310, "dur": 6, "args": { "External id": 92605, "cbid": 211, "correlation": 92605 } }, { "ph": "s", "id": 92605, "pid": 76337, "tid": -914061504, "ts": 1716454222760310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760369, "dur": 1, "args": { "External id": 92616, "cbid": 251, "correlation": 92616 } }, { "ph": "f", "id": 92616, "pid": 76337, "tid": -914061504, "ts": 1716454222760369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760372, "dur": 0, "args": { "External id": 92617, "cbid": 251, "correlation": 92617 } }, { "ph": "f", "id": 92617, "pid": 76337, "tid": -914061504, "ts": 1716454222760372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222818901, "dur": 8, "args": { "External id": 92618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92618, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92618, "pid": 5, "tid": 7, "ts": 1716454222818901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760373, "dur": 11, "args": { "External id": 92618, "cbid": 211, "correlation": 92618 } }, { "ph": "s", "id": 92618, "pid": 76337, "tid": -914061504, "ts": 1716454222760373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222818911, "dur": 4, "args": { "External id": 92620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92620, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92620, "pid": 5, "tid": 7, "ts": 1716454222818911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760386, "dur": 6, "args": { "External id": 92620, "cbid": 211, "correlation": 92620 } }, { "ph": "s", "id": 92620, "pid": 76337, "tid": -914061504, "ts": 1716454222760386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222818916, "dur": 54, "args": { "External id": 92645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92645, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92645, "pid": 5, "tid": 7, "ts": 1716454222818916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760465, "dur": 12, "args": { "External id": 92645, "cbid": 211, "correlation": 92645 } }, { "ph": "s", "id": 92645, "pid": 76337, "tid": -914061504, "ts": 1716454222760465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760567, "dur": 2, "args": { "External id": 92663, "cbid": 251, "correlation": 92663 } }, { "ph": "f", "id": 92663, "pid": 76337, "tid": -914061504, "ts": 1716454222760567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222818971, "dur": 89, "args": { "External id": 92665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92665, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92665, "pid": 5, "tid": 7, "ts": 1716454222818971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760574, "dur": 14, "args": { "External id": 92665, "cbid": 211, "correlation": 92665 } }, { "ph": "s", "id": 92665, "pid": 76337, "tid": -914061504, "ts": 1716454222760574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222819062, "dur": 10, "args": { "External id": 92673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92673, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92673, "pid": 5, "tid": 7, "ts": 1716454222819062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760645, "dur": 13, "args": { "External id": 92673, "cbid": 211, "correlation": 92673 } }, { "ph": "s", "id": 92673, "pid": 76337, "tid": -914061504, "ts": 1716454222760645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222819073, "dur": 20, "args": { "External id": 92681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92681, "pid": 5, "tid": 7, "ts": 1716454222819073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760687, "dur": 10, "args": { "External id": 92681, "cbid": 211, "correlation": 92681 } }, { "ph": "s", "id": 92681, "pid": 76337, "tid": -914061504, "ts": 1716454222760687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222819095, "dur": 17, "args": { "External id": 92703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92703, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92703, "pid": 5, "tid": 7, "ts": 1716454222819095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760739, "dur": 11, "args": { "External id": 92703, "cbid": 211, "correlation": 92703 } }, { "ph": "s", "id": 92703, "pid": 76337, "tid": -914061504, "ts": 1716454222760739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760829, "dur": 1, "args": { "External id": 92719, "cbid": 251, "correlation": 92719 } }, { "ph": "f", "id": 92719, "pid": 76337, "tid": -914061504, "ts": 1716454222760829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222760834, "dur": 0, "args": { "External id": 92721, "cbid": 251, "correlation": 92721 } }, { "ph": "f", "id": 92721, "pid": 76337, "tid": -914061504, "ts": 1716454222760834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222819113, "dur": 492, "args": { "External id": 92722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92722, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92722, "pid": 5, "tid": 7, "ts": 1716454222819113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760836, "dur": 13, "args": { "External id": 92722, "cbid": 211, "correlation": 92722 } }, { "ph": "s", "id": 92722, "pid": 76337, "tid": -914061504, "ts": 1716454222760836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222819606, "dur": 65, "args": { "External id": 92730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92730, "pid": 5, "tid": 7, "ts": 1716454222819606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760910, "dur": 14, "args": { "External id": 92730, "cbid": 211, "correlation": 92730 } }, { "ph": "s", "id": 92730, "pid": 76337, "tid": -914061504, "ts": 1716454222760910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222819672, "dur": 65, "args": { "External id": 92738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92738, "pid": 5, "tid": 7, "ts": 1716454222819672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222760945, "dur": 8, "args": { "External id": 92738, "cbid": 211, "correlation": 92738 } }, { "ph": "s", "id": 92738, "pid": 76337, "tid": -914061504, "ts": 1716454222760945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222761037, "dur": 1, "args": { "External id": 92754, "cbid": 251, "correlation": 92754 } }, { "ph": "f", "id": 92754, "pid": 76337, "tid": -914061504, "ts": 1716454222761037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222819740, "dur": 1, "args": { "External id": 92756, "device": 5, "context": 1, "stream": 7, "correlation": 92756, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 92756, "pid": 5, "tid": 7, "ts": 1716454222819740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222761042, "dur": 12, "args": { "External id": 92756, "cbid": 51, "correlation": 92756 } }, { "ph": "s", "id": 92756, "pid": 76337, "tid": -914061504, "ts": 1716454222761042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222819743, "dur": 270, "args": { "External id": 92757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92757, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92757, "pid": 5, "tid": 7, "ts": 1716454222819743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761055, "dur": 12, "args": { "External id": 92757, "cbid": 211, "correlation": 92757 } }, { "ph": "s", "id": 92757, "pid": 76337, "tid": -914061504, "ts": 1716454222761055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222820014, "dur": 14, "args": { "External id": 92765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92765, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92765, "pid": 5, "tid": 7, "ts": 1716454222820014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761099, "dur": 11, "args": { "External id": 92765, "cbid": 211, "correlation": 92765 } }, { "ph": "s", "id": 92765, "pid": 76337, "tid": -914061504, "ts": 1716454222761099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222820030, "dur": 37, "args": { "External id": 92776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92776, "pid": 5, "tid": 7, "ts": 1716454222820030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761172, "dur": 13, "args": { "External id": 92776, "cbid": 211, "correlation": 92776 } }, { "ph": "s", "id": 92776, "pid": 76337, "tid": -914061504, "ts": 1716454222761172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222761241, "dur": 0, "args": { "External id": 92788, "cbid": 317, "correlation": 92788 } }, { "ph": "f", "id": 92788, "pid": 76337, "tid": -914061504, "ts": 1716454222761241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222761242, "dur": 0, "args": { "External id": 92789, "cbid": 203, "correlation": 92789 } }, { "ph": "f", "id": 92789, "pid": 76337, "tid": -914061504, "ts": 1716454222761242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222761243, "dur": 0, "args": { "External id": 92790, "cbid": 205, "correlation": 92790 } }, { "ph": "f", "id": 92790, "pid": 76337, "tid": -914061504, "ts": 1716454222761243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222820068, "dur": 13, "args": { "External id": 92794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92794, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92794, "pid": 5, "tid": 7, "ts": 1716454222820068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761261, "dur": 12, "args": { "External id": 92794, "cbid": 211, "correlation": 92794 } }, { "ph": "s", "id": 92794, "pid": 76337, "tid": -914061504, "ts": 1716454222761261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222820082, "dur": 4, "args": { "External id": 92796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 92796, "pid": 5, "tid": 7, "ts": 1716454222820082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761279, "dur": 6, "args": { "External id": 92796, "cbid": 211, "correlation": 92796 } }, { "ph": "s", "id": 92796, "pid": 76337, "tid": -914061504, "ts": 1716454222761279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222761288, "dur": 0, "args": { "External id": 92797, "cbid": 51, "correlation": 92797 } }, { "ph": "s", "id": 92797, "pid": 76337, "tid": -914061504, "ts": 1716454222761288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222820088, "dur": 96, "args": { "External id": 92798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92798, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 92798, "pid": 5, "tid": 7, "ts": 1716454222820088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761289, "dur": 5, "args": { "External id": 92798, "cbid": 211, "correlation": 92798 } }, { "ph": "s", "id": 92798, "pid": 76337, "tid": -914061504, "ts": 1716454222761289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222820186, "dur": 16, "args": { "External id": 92803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92803, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92803, "pid": 5, "tid": 7, "ts": 1716454222820186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761317, "dur": 9, "args": { "External id": 92803, "cbid": 211, "correlation": 92803 } }, { "ph": "s", "id": 92803, "pid": 76337, "tid": -914061504, "ts": 1716454222761317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222820203, "dur": 11, "args": { "External id": 92811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92811, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92811, "pid": 5, "tid": 7, "ts": 1716454222820203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761349, "dur": 8, "args": { "External id": 92811, "cbid": 211, "correlation": 92811 } }, { "ph": "s", "id": 92811, "pid": 76337, "tid": -914061504, "ts": 1716454222761349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222820216, "dur": 30, "args": { "External id": 92820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92820, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92820, "pid": 5, "tid": 7, "ts": 1716454222820216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761392, "dur": 11, "args": { "External id": 92820, "cbid": 211, "correlation": 92820 } }, { "ph": "s", "id": 92820, "pid": 76337, "tid": -914061504, "ts": 1716454222761392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222820247, "dur": 30, "args": { "External id": 92840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92840, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 92840, "pid": 5, "tid": 7, "ts": 1716454222820247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761469, "dur": 12, "args": { "External id": 92840, "cbid": 211, "correlation": 92840 } }, { "ph": "s", "id": 92840, "pid": 76337, "tid": -914061504, "ts": 1716454222761469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222820279, "dur": 5, "args": { "External id": 92852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92852, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92852, "pid": 5, "tid": 7, "ts": 1716454222820279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761491, "dur": 7, "args": { "External id": 92852, "cbid": 211, "correlation": 92852 } }, { "ph": "s", "id": 92852, "pid": 76337, "tid": -914061504, "ts": 1716454222761491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222820285, "dur": 30, "args": { "External id": 92855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92855, "pid": 5, "tid": 7, "ts": 1716454222820285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761510, "dur": 6, "args": { "External id": 92855, "cbid": 211, "correlation": 92855 } }, { "ph": "s", "id": 92855, "pid": 76337, "tid": -914061504, "ts": 1716454222761510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222820316, "dur": 21, "args": { "External id": 92864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92864, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92864, "pid": 5, "tid": 7, "ts": 1716454222820316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761551, "dur": 10, "args": { "External id": 92864, "cbid": 211, "correlation": 92864 } }, { "ph": "s", "id": 92864, "pid": 76337, "tid": -914061504, "ts": 1716454222761551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222761605, "dur": 0, "args": { "External id": 92874, "cbid": 317, "correlation": 92874 } }, { "ph": "f", "id": 92874, "pid": 76337, "tid": -914061504, "ts": 1716454222761605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222761606, "dur": 0, "args": { "External id": 92875, "cbid": 203, "correlation": 92875 } }, { "ph": "f", "id": 92875, "pid": 76337, "tid": -914061504, "ts": 1716454222761606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222761607, "dur": 0, "args": { "External id": 92876, "cbid": 205, "correlation": 92876 } }, { "ph": "f", "id": 92876, "pid": 76337, "tid": -914061504, "ts": 1716454222761607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222820339, "dur": 22, "args": { "External id": 92880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92880, "pid": 5, "tid": 7, "ts": 1716454222820339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761620, "dur": 11, "args": { "External id": 92880, "cbid": 211, "correlation": 92880 } }, { "ph": "s", "id": 92880, "pid": 76337, "tid": -914061504, "ts": 1716454222761620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222820362, "dur": 314, "args": { "External id": 92882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92882, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92882, "pid": 5, "tid": 7, "ts": 1716454222820362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761634, "dur": 5, "args": { "External id": 92882, "cbid": 211, "correlation": 92882 } }, { "ph": "s", "id": 92882, "pid": 76337, "tid": -914061504, "ts": 1716454222761634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222820678, "dur": 1, "args": { "External id": 92884, "device": 5, "context": 1, "stream": 7, "correlation": 92884, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 92884, "pid": 5, "tid": 7, "ts": 1716454222820678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222761646, "dur": 8, "args": { "External id": 92884, "cbid": 51, "correlation": 92884 } }, { "ph": "s", "id": 92884, "pid": 76337, "tid": -914061504, "ts": 1716454222761646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222820682, "dur": 1238, "args": { "External id": 92885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92885, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92885, "pid": 5, "tid": 7, "ts": 1716454222820682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761655, "dur": 7, "args": { "External id": 92885, "cbid": 211, "correlation": 92885 } }, { "ph": "s", "id": 92885, "pid": 76337, "tid": -914061504, "ts": 1716454222761655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222821921, "dur": 13, "args": { "External id": 92887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92887, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92887, "pid": 5, "tid": 7, "ts": 1716454222821921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761666, "dur": 5, "args": { "External id": 92887, "cbid": 211, "correlation": 92887 } }, { "ph": "s", "id": 92887, "pid": 76337, "tid": -914061504, "ts": 1716454222761666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222821935, "dur": 14, "args": { "External id": 92893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92893, "pid": 5, "tid": 7, "ts": 1716454222821935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761694, "dur": 9, "args": { "External id": 92893, "cbid": 211, "correlation": 92893 } }, { "ph": "s", "id": 92893, "pid": 76337, "tid": -914061504, "ts": 1716454222761694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222821951, "dur": 3, "args": { "External id": 92901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92901, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 92901, "pid": 5, "tid": 7, "ts": 1716454222821951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761740, "dur": 9, "args": { "External id": 92901, "cbid": 211, "correlation": 92901 } }, { "ph": "s", "id": 92901, "pid": 76337, "tid": -914061504, "ts": 1716454222761740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222761808, "dur": 1, "args": { "External id": 92917, "cbid": 251, "correlation": 92917 } }, { "ph": "f", "id": 92917, "pid": 76337, "tid": -914061504, "ts": 1716454222761808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222761813, "dur": 0, "args": { "External id": 92919, "cbid": 251, "correlation": 92919 } }, { "ph": "f", "id": 92919, "pid": 76337, "tid": -914061504, "ts": 1716454222761813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222821955, "dur": 13, "args": { "External id": 92920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92920, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92920, "pid": 5, "tid": 7, "ts": 1716454222821955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761815, "dur": 12, "args": { "External id": 92920, "cbid": 211, "correlation": 92920 } }, { "ph": "s", "id": 92920, "pid": 76337, "tid": -914061504, "ts": 1716454222761815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222821969, "dur": 5, "args": { "External id": 92922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92922, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92922, "pid": 5, "tid": 7, "ts": 1716454222821969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761830, "dur": 6, "args": { "External id": 92922, "cbid": 211, "correlation": 92922 } }, { "ph": "s", "id": 92922, "pid": 76337, "tid": -914061504, "ts": 1716454222761830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222821976, "dur": 16, "args": { "External id": 92932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92932, "pid": 5, "tid": 7, "ts": 1716454222821976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761890, "dur": 12, "args": { "External id": 92932, "cbid": 211, "correlation": 92932 } }, { "ph": "s", "id": 92932, "pid": 76337, "tid": -914061504, "ts": 1716454222761890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222821993, "dur": 17, "args": { "External id": 92952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92952, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 92952, "pid": 5, "tid": 7, "ts": 1716454222821993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761957, "dur": 11, "args": { "External id": 92952, "cbid": 211, "correlation": 92952 } }, { "ph": "s", "id": 92952, "pid": 76337, "tid": -914061504, "ts": 1716454222761957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222822011, "dur": 4, "args": { "External id": 92964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92964, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 92964, "pid": 5, "tid": 7, "ts": 1716454222822011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222761986, "dur": 7, "args": { "External id": 92964, "cbid": 211, "correlation": 92964 } }, { "ph": "s", "id": 92964, "pid": 76337, "tid": -914061504, "ts": 1716454222761986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222822016, "dur": 16, "args": { "External id": 92967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92967, "pid": 5, "tid": 7, "ts": 1716454222822016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762005, "dur": 7, "args": { "External id": 92967, "cbid": 211, "correlation": 92967 } }, { "ph": "s", "id": 92967, "pid": 76337, "tid": -914061504, "ts": 1716454222762005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222822034, "dur": 11, "args": { "External id": 92976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92976, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92976, "pid": 5, "tid": 7, "ts": 1716454222822034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762047, "dur": 11, "args": { "External id": 92976, "cbid": 211, "correlation": 92976 } }, { "ph": "s", "id": 92976, "pid": 76337, "tid": -914061504, "ts": 1716454222762047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222762112, "dur": 0, "args": { "External id": 92986, "cbid": 317, "correlation": 92986 } }, { "ph": "f", "id": 92986, "pid": 76337, "tid": -914061504, "ts": 1716454222762112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222762113, "dur": 0, "args": { "External id": 92987, "cbid": 203, "correlation": 92987 } }, { "ph": "f", "id": 92987, "pid": 76337, "tid": -914061504, "ts": 1716454222762113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222762114, "dur": 0, "args": { "External id": 92988, "cbid": 205, "correlation": 92988 } }, { "ph": "f", "id": 92988, "pid": 76337, "tid": -914061504, "ts": 1716454222762114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222822046, "dur": 11, "args": { "External id": 92992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92992, "pid": 5, "tid": 7, "ts": 1716454222822046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762128, "dur": 12, "args": { "External id": 92992, "cbid": 211, "correlation": 92992 } }, { "ph": "s", "id": 92992, "pid": 76337, "tid": -914061504, "ts": 1716454222762128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222822058, "dur": 160, "args": { "External id": 92994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92994, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92994, "pid": 5, "tid": 7, "ts": 1716454222822058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762142, "dur": 5, "args": { "External id": 92994, "cbid": 211, "correlation": 92994 } }, { "ph": "s", "id": 92994, "pid": 76337, "tid": -914061504, "ts": 1716454222762142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222822221, "dur": 1, "args": { "External id": 92996, "device": 5, "context": 1, "stream": 7, "correlation": 92996, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 92996, "pid": 5, "tid": 7, "ts": 1716454222822221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222762153, "dur": 6, "args": { "External id": 92996, "cbid": 51, "correlation": 92996 } }, { "ph": "s", "id": 92996, "pid": 76337, "tid": -914061504, "ts": 1716454222762153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222822224, "dur": 639, "args": { "External id": 92997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92997, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 92997, "pid": 5, "tid": 7, "ts": 1716454222822224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762161, "dur": 7, "args": { "External id": 92997, "cbid": 211, "correlation": 92997 } }, { "ph": "s", "id": 92997, "pid": 76337, "tid": -914061504, "ts": 1716454222762161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222822865, "dur": 12, "args": { "External id": 92999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 92999, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 92999, "pid": 5, "tid": 7, "ts": 1716454222822865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762172, "dur": 5, "args": { "External id": 92999, "cbid": 211, "correlation": 92999 } }, { "ph": "s", "id": 92999, "pid": 76337, "tid": -914061504, "ts": 1716454222762172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222822878, "dur": 14, "args": { "External id": 93005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93005, "pid": 5, "tid": 7, "ts": 1716454222822878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762200, "dur": 8, "args": { "External id": 93005, "cbid": 211, "correlation": 93005 } }, { "ph": "s", "id": 93005, "pid": 76337, "tid": -914061504, "ts": 1716454222762200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222762259, "dur": 0, "args": { "External id": 93015, "cbid": 317, "correlation": 93015 } }, { "ph": "f", "id": 93015, "pid": 76337, "tid": -914061504, "ts": 1716454222762259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222762260, "dur": 0, "args": { "External id": 93016, "cbid": 203, "correlation": 93016 } }, { "ph": "f", "id": 93016, "pid": 76337, "tid": -914061504, "ts": 1716454222762260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222762261, "dur": 0, "args": { "External id": 93017, "cbid": 205, "correlation": 93017 } }, { "ph": "f", "id": 93017, "pid": 76337, "tid": -914061504, "ts": 1716454222762261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222822894, "dur": 21, "args": { "External id": 93021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93021, "pid": 5, "tid": 7, "ts": 1716454222822894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762274, "dur": 12, "args": { "External id": 93021, "cbid": 211, "correlation": 93021 } }, { "ph": "s", "id": 93021, "pid": 76337, "tid": -914061504, "ts": 1716454222762274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222822916, "dur": 4, "args": { "External id": 93023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93023, "pid": 5, "tid": 7, "ts": 1716454222822916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762290, "dur": 6, "args": { "External id": 93023, "cbid": 211, "correlation": 93023 } }, { "ph": "s", "id": 93023, "pid": 76337, "tid": -914061504, "ts": 1716454222762290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222762299, "dur": 0, "args": { "External id": 93024, "cbid": 51, "correlation": 93024 } }, { "ph": "s", "id": 93024, "pid": 76337, "tid": -914061504, "ts": 1716454222762299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222822921, "dur": 167, "args": { "External id": 93025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93025, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93025, "pid": 5, "tid": 7, "ts": 1716454222822921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762299, "dur": 5, "args": { "External id": 93025, "cbid": 211, "correlation": 93025 } }, { "ph": "s", "id": 93025, "pid": 76337, "tid": -914061504, "ts": 1716454222762299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222823090, "dur": 16, "args": { "External id": 93030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93030, "pid": 5, "tid": 7, "ts": 1716454222823090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762324, "dur": 8, "args": { "External id": 93030, "cbid": 211, "correlation": 93030 } }, { "ph": "s", "id": 93030, "pid": 76337, "tid": -914061504, "ts": 1716454222762324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222823106, "dur": 13, "args": { "External id": 93038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93038, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93038, "pid": 5, "tid": 7, "ts": 1716454222823106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762353, "dur": 8, "args": { "External id": 93038, "cbid": 211, "correlation": 93038 } }, { "ph": "s", "id": 93038, "pid": 76337, "tid": -914061504, "ts": 1716454222762353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222823120, "dur": 10, "args": { "External id": 93046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93046, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93046, "pid": 5, "tid": 7, "ts": 1716454222823120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762382, "dur": 9, "args": { "External id": 93046, "cbid": 211, "correlation": 93046 } }, { "ph": "s", "id": 93046, "pid": 76337, "tid": -914061504, "ts": 1716454222762382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222823132, "dur": 18, "args": { "External id": 93066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93066, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 93066, "pid": 5, "tid": 7, "ts": 1716454222823132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762467, "dur": 13, "args": { "External id": 93066, "cbid": 211, "correlation": 93066 } }, { "ph": "s", "id": 93066, "pid": 76337, "tid": -914061504, "ts": 1716454222762467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222823151, "dur": 4, "args": { "External id": 93078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93078, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 93078, "pid": 5, "tid": 7, "ts": 1716454222823151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762490, "dur": 6, "args": { "External id": 93078, "cbid": 211, "correlation": 93078 } }, { "ph": "s", "id": 93078, "pid": 76337, "tid": -914061504, "ts": 1716454222762490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222823157, "dur": 16, "args": { "External id": 93081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93081, "pid": 5, "tid": 7, "ts": 1716454222823157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762508, "dur": 6, "args": { "External id": 93081, "cbid": 211, "correlation": 93081 } }, { "ph": "s", "id": 93081, "pid": 76337, "tid": -914061504, "ts": 1716454222762508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222762566, "dur": 0, "args": { "External id": 93092, "cbid": 317, "correlation": 93092 } }, { "ph": "f", "id": 93092, "pid": 76337, "tid": -914061504, "ts": 1716454222762566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222762567, "dur": 0, "args": { "External id": 93093, "cbid": 203, "correlation": 93093 } }, { "ph": "f", "id": 93093, "pid": 76337, "tid": -914061504, "ts": 1716454222762567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222762567, "dur": 0, "args": { "External id": 93094, "cbid": 205, "correlation": 93094 } }, { "ph": "f", "id": 93094, "pid": 76337, "tid": -914061504, "ts": 1716454222762567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222823175, "dur": 13, "args": { "External id": 93098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93098, "pid": 5, "tid": 7, "ts": 1716454222823175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762581, "dur": 12, "args": { "External id": 93098, "cbid": 211, "correlation": 93098 } }, { "ph": "s", "id": 93098, "pid": 76337, "tid": -914061504, "ts": 1716454222762581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222823189, "dur": 3, "args": { "External id": 93100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93100, "pid": 5, "tid": 7, "ts": 1716454222823189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762596, "dur": 6, "args": { "External id": 93100, "cbid": 211, "correlation": 93100 } }, { "ph": "s", "id": 93100, "pid": 76337, "tid": -914061504, "ts": 1716454222762596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222762604, "dur": 0, "args": { "External id": 93101, "cbid": 51, "correlation": 93101 } }, { "ph": "s", "id": 93101, "pid": 76337, "tid": -914061504, "ts": 1716454222762604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222823194, "dur": 91, "args": { "External id": 93102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93102, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93102, "pid": 5, "tid": 7, "ts": 1716454222823194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762605, "dur": 5, "args": { "External id": 93102, "cbid": 211, "correlation": 93102 } }, { "ph": "s", "id": 93102, "pid": 76337, "tid": -914061504, "ts": 1716454222762605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222823286, "dur": 16, "args": { "External id": 93107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93107, "pid": 5, "tid": 7, "ts": 1716454222823286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762632, "dur": 8, "args": { "External id": 93107, "cbid": 211, "correlation": 93107 } }, { "ph": "s", "id": 93107, "pid": 76337, "tid": -914061504, "ts": 1716454222762632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222823302, "dur": 81, "args": { "External id": 93116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93116, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93116, "pid": 5, "tid": 7, "ts": 1716454222823302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762715, "dur": 15, "args": { "External id": 93116, "cbid": 211, "correlation": 93116 } }, { "ph": "s", "id": 93116, "pid": 76337, "tid": -914061504, "ts": 1716454222762715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222823385, "dur": 30, "args": { "External id": 93138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93138, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93138, "pid": 5, "tid": 7, "ts": 1716454222823385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762773, "dur": 10, "args": { "External id": 93138, "cbid": 211, "correlation": 93138 } }, { "ph": "s", "id": 93138, "pid": 76337, "tid": -914061504, "ts": 1716454222762773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222762868, "dur": 1, "args": { "External id": 93149, "cbid": 251, "correlation": 93149 } }, { "ph": "f", "id": 93149, "pid": 76337, "tid": -914061504, "ts": 1716454222762868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222823416, "dur": 140, "args": { "External id": 93150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93150, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93150, "pid": 5, "tid": 7, "ts": 1716454222823416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762873, "dur": 13, "args": { "External id": 93150, "cbid": 211, "correlation": 93150 } }, { "ph": "s", "id": 93150, "pid": 76337, "tid": -914061504, "ts": 1716454222762873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222762944, "dur": 1, "args": { "External id": 93161, "cbid": 251, "correlation": 93161 } }, { "ph": "f", "id": 93161, "pid": 76337, "tid": -914061504, "ts": 1716454222762944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222823558, "dur": 153, "args": { "External id": 93162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93162, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93162, "pid": 5, "tid": 7, "ts": 1716454222823558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222762948, "dur": 11, "args": { "External id": 93162, "cbid": 211, "correlation": 93162 } }, { "ph": "s", "id": 93162, "pid": 76337, "tid": -914061504, "ts": 1716454222762948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763024, "dur": 1, "args": { "External id": 93173, "cbid": 251, "correlation": 93173 } }, { "ph": "f", "id": 93173, "pid": 76337, "tid": -914061504, "ts": 1716454222763024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222823712, "dur": 157, "args": { "External id": 93174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93174, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93174, "pid": 5, "tid": 7, "ts": 1716454222823712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763028, "dur": 12, "args": { "External id": 93174, "cbid": 211, "correlation": 93174 } }, { "ph": "s", "id": 93174, "pid": 76337, "tid": -914061504, "ts": 1716454222763028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222823871, "dur": 331, "args": { "External id": 93199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93199, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93199, "pid": 5, "tid": 7, "ts": 1716454222823871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763117, "dur": 14, "args": { "External id": 93199, "cbid": 211, "correlation": 93199 } }, { "ph": "s", "id": 93199, "pid": 76337, "tid": -914061504, "ts": 1716454222763117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763220, "dur": 1, "args": { "External id": 93217, "cbid": 251, "correlation": 93217 } }, { "ph": "f", "id": 93217, "pid": 76337, "tid": -914061504, "ts": 1716454222763220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222824203, "dur": 162, "args": { "External id": 93219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93219, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93219, "pid": 5, "tid": 7, "ts": 1716454222824203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763226, "dur": 13, "args": { "External id": 93219, "cbid": 211, "correlation": 93219 } }, { "ph": "s", "id": 93219, "pid": 76337, "tid": -914061504, "ts": 1716454222763226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222824366, "dur": 19, "args": { "External id": 93227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93227, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93227, "pid": 5, "tid": 7, "ts": 1716454222824366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763297, "dur": 13, "args": { "External id": 93227, "cbid": 211, "correlation": 93227 } }, { "ph": "s", "id": 93227, "pid": 76337, "tid": -914061504, "ts": 1716454222763297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222824386, "dur": 27, "args": { "External id": 93235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93235, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93235, "pid": 5, "tid": 7, "ts": 1716454222824386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763337, "dur": 9, "args": { "External id": 93235, "cbid": 211, "correlation": 93235 } }, { "ph": "s", "id": 93235, "pid": 76337, "tid": -914061504, "ts": 1716454222763337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222824415, "dur": 18, "args": { "External id": 93246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93246, "pid": 5, "tid": 7, "ts": 1716454222824415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763411, "dur": 14, "args": { "External id": 93246, "cbid": 211, "correlation": 93246 } }, { "ph": "s", "id": 93246, "pid": 76337, "tid": -914061504, "ts": 1716454222763411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222824435, "dur": 16, "args": { "External id": 93268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93268, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93268, "pid": 5, "tid": 7, "ts": 1716454222824435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763443, "dur": 7, "args": { "External id": 93268, "cbid": 211, "correlation": 93268 } }, { "ph": "s", "id": 93268, "pid": 76337, "tid": -914061504, "ts": 1716454222763443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763529, "dur": 1, "args": { "External id": 93279, "cbid": 251, "correlation": 93279 } }, { "ph": "f", "id": 93279, "pid": 76337, "tid": -914061504, "ts": 1716454222763529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222824452, "dur": 88, "args": { "External id": 93280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93280, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93280, "pid": 5, "tid": 7, "ts": 1716454222824452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763535, "dur": 13, "args": { "External id": 93280, "cbid": 211, "correlation": 93280 } }, { "ph": "s", "id": 93280, "pid": 76337, "tid": -914061504, "ts": 1716454222763535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763607, "dur": 1, "args": { "External id": 93291, "cbid": 251, "correlation": 93291 } }, { "ph": "f", "id": 93291, "pid": 76337, "tid": -914061504, "ts": 1716454222763607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763610, "dur": 0, "args": { "External id": 93292, "cbid": 251, "correlation": 93292 } }, { "ph": "f", "id": 93292, "pid": 76337, "tid": -914061504, "ts": 1716454222763610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222824542, "dur": 13, "args": { "External id": 93293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93293, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93293, "pid": 5, "tid": 7, "ts": 1716454222824542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763612, "dur": 12, "args": { "External id": 93293, "cbid": 211, "correlation": 93293 } }, { "ph": "s", "id": 93293, "pid": 76337, "tid": -914061504, "ts": 1716454222763612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222824556, "dur": 6, "args": { "External id": 93295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93295, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93295, "pid": 5, "tid": 7, "ts": 1716454222824556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763626, "dur": 6, "args": { "External id": 93295, "cbid": 211, "correlation": 93295 } }, { "ph": "s", "id": 93295, "pid": 76337, "tid": -914061504, "ts": 1716454222763626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763684, "dur": 1, "args": { "External id": 93306, "cbid": 251, "correlation": 93306 } }, { "ph": "f", "id": 93306, "pid": 76337, "tid": -914061504, "ts": 1716454222763684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763688, "dur": 0, "args": { "External id": 93307, "cbid": 251, "correlation": 93307 } }, { "ph": "f", "id": 93307, "pid": 76337, "tid": -914061504, "ts": 1716454222763688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222824563, "dur": 9, "args": { "External id": 93308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93308, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93308, "pid": 5, "tid": 7, "ts": 1716454222824563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763689, "dur": 11, "args": { "External id": 93308, "cbid": 211, "correlation": 93308 } }, { "ph": "s", "id": 93308, "pid": 76337, "tid": -914061504, "ts": 1716454222763689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222824573, "dur": 3, "args": { "External id": 93310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93310, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93310, "pid": 5, "tid": 7, "ts": 1716454222824573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763702, "dur": 5, "args": { "External id": 93310, "cbid": 211, "correlation": 93310 } }, { "ph": "s", "id": 93310, "pid": 76337, "tid": -914061504, "ts": 1716454222763702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222824578, "dur": 55, "args": { "External id": 93335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93335, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93335, "pid": 5, "tid": 7, "ts": 1716454222824578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763781, "dur": 12, "args": { "External id": 93335, "cbid": 211, "correlation": 93335 } }, { "ph": "s", "id": 93335, "pid": 76337, "tid": -914061504, "ts": 1716454222763781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222763881, "dur": 1, "args": { "External id": 93353, "cbid": 251, "correlation": 93353 } }, { "ph": "f", "id": 93353, "pid": 76337, "tid": -914061504, "ts": 1716454222763881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222824634, "dur": 89, "args": { "External id": 93355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93355, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93355, "pid": 5, "tid": 7, "ts": 1716454222824634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763887, "dur": 14, "args": { "External id": 93355, "cbid": 211, "correlation": 93355 } }, { "ph": "s", "id": 93355, "pid": 76337, "tid": -914061504, "ts": 1716454222763887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222824725, "dur": 9, "args": { "External id": 93363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93363, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93363, "pid": 5, "tid": 7, "ts": 1716454222824725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222763957, "dur": 12, "args": { "External id": 93363, "cbid": 211, "correlation": 93363 } }, { "ph": "s", "id": 93363, "pid": 76337, "tid": -914061504, "ts": 1716454222763957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222824735, "dur": 20, "args": { "External id": 93371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93371, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93371, "pid": 5, "tid": 7, "ts": 1716454222824735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764009, "dur": 10, "args": { "External id": 93371, "cbid": 211, "correlation": 93371 } }, { "ph": "s", "id": 93371, "pid": 76337, "tid": -914061504, "ts": 1716454222764009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222824756, "dur": 17, "args": { "External id": 93393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93393, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93393, "pid": 5, "tid": 7, "ts": 1716454222824756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764061, "dur": 11, "args": { "External id": 93393, "cbid": 211, "correlation": 93393 } }, { "ph": "s", "id": 93393, "pid": 76337, "tid": -914061504, "ts": 1716454222764061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222764149, "dur": 1, "args": { "External id": 93409, "cbid": 251, "correlation": 93409 } }, { "ph": "f", "id": 93409, "pid": 76337, "tid": -914061504, "ts": 1716454222764149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222764154, "dur": 0, "args": { "External id": 93411, "cbid": 251, "correlation": 93411 } }, { "ph": "f", "id": 93411, "pid": 76337, "tid": -914061504, "ts": 1716454222764154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222824775, "dur": 493, "args": { "External id": 93412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93412, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93412, "pid": 5, "tid": 7, "ts": 1716454222824775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764156, "dur": 13, "args": { "External id": 93412, "cbid": 211, "correlation": 93412 } }, { "ph": "s", "id": 93412, "pid": 76337, "tid": -914061504, "ts": 1716454222764156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222825269, "dur": 66, "args": { "External id": 93420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93420, "pid": 5, "tid": 7, "ts": 1716454222825269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764226, "dur": 14, "args": { "External id": 93420, "cbid": 211, "correlation": 93420 } }, { "ph": "s", "id": 93420, "pid": 76337, "tid": -914061504, "ts": 1716454222764226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222825336, "dur": 65, "args": { "External id": 93428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93428, "pid": 5, "tid": 7, "ts": 1716454222825336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764260, "dur": 8, "args": { "External id": 93428, "cbid": 211, "correlation": 93428 } }, { "ph": "s", "id": 93428, "pid": 76337, "tid": -914061504, "ts": 1716454222764260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222764339, "dur": 1, "args": { "External id": 93444, "cbid": 251, "correlation": 93444 } }, { "ph": "f", "id": 93444, "pid": 76337, "tid": -914061504, "ts": 1716454222764339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222825403, "dur": 1, "args": { "External id": 93446, "device": 5, "context": 1, "stream": 7, "correlation": 93446, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 93446, "pid": 5, "tid": 7, "ts": 1716454222825403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222764344, "dur": 11, "args": { "External id": 93446, "cbid": 51, "correlation": 93446 } }, { "ph": "s", "id": 93446, "pid": 76337, "tid": -914061504, "ts": 1716454222764344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222825407, "dur": 265, "args": { "External id": 93447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93447, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93447, "pid": 5, "tid": 7, "ts": 1716454222825407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764356, "dur": 11, "args": { "External id": 93447, "cbid": 211, "correlation": 93447 } }, { "ph": "s", "id": 93447, "pid": 76337, "tid": -914061504, "ts": 1716454222764356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222825673, "dur": 14, "args": { "External id": 93455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93455, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93455, "pid": 5, "tid": 7, "ts": 1716454222825673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764398, "dur": 11, "args": { "External id": 93455, "cbid": 211, "correlation": 93455 } }, { "ph": "s", "id": 93455, "pid": 76337, "tid": -914061504, "ts": 1716454222764398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222825688, "dur": 37, "args": { "External id": 93466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93466, "pid": 5, "tid": 7, "ts": 1716454222825688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764470, "dur": 12, "args": { "External id": 93466, "cbid": 211, "correlation": 93466 } }, { "ph": "s", "id": 93466, "pid": 76337, "tid": -914061504, "ts": 1716454222764470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222764537, "dur": 0, "args": { "External id": 93478, "cbid": 317, "correlation": 93478 } }, { "ph": "f", "id": 93478, "pid": 76337, "tid": -914061504, "ts": 1716454222764537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222764538, "dur": 0, "args": { "External id": 93479, "cbid": 203, "correlation": 93479 } }, { "ph": "f", "id": 93479, "pid": 76337, "tid": -914061504, "ts": 1716454222764538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222764539, "dur": 0, "args": { "External id": 93480, "cbid": 205, "correlation": 93480 } }, { "ph": "f", "id": 93480, "pid": 76337, "tid": -914061504, "ts": 1716454222764539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222825726, "dur": 14, "args": { "External id": 93484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93484, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93484, "pid": 5, "tid": 7, "ts": 1716454222825726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764556, "dur": 12, "args": { "External id": 93484, "cbid": 211, "correlation": 93484 } }, { "ph": "s", "id": 93484, "pid": 76337, "tid": -914061504, "ts": 1716454222764556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222825742, "dur": 4, "args": { "External id": 93486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93486, "pid": 5, "tid": 7, "ts": 1716454222825742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764574, "dur": 6, "args": { "External id": 93486, "cbid": 211, "correlation": 93486 } }, { "ph": "s", "id": 93486, "pid": 76337, "tid": -914061504, "ts": 1716454222764574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222764583, "dur": 0, "args": { "External id": 93487, "cbid": 51, "correlation": 93487 } }, { "ph": "s", "id": 93487, "pid": 76337, "tid": -914061504, "ts": 1716454222764583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222825747, "dur": 95, "args": { "External id": 93488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93488, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93488, "pid": 5, "tid": 7, "ts": 1716454222825747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764584, "dur": 5, "args": { "External id": 93488, "cbid": 211, "correlation": 93488 } }, { "ph": "s", "id": 93488, "pid": 76337, "tid": -914061504, "ts": 1716454222764584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222825843, "dur": 16, "args": { "External id": 93493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93493, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93493, "pid": 5, "tid": 7, "ts": 1716454222825843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764611, "dur": 9, "args": { "External id": 93493, "cbid": 211, "correlation": 93493 } }, { "ph": "s", "id": 93493, "pid": 76337, "tid": -914061504, "ts": 1716454222764611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222825860, "dur": 12, "args": { "External id": 93501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93501, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93501, "pid": 5, "tid": 7, "ts": 1716454222825860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764643, "dur": 8, "args": { "External id": 93501, "cbid": 211, "correlation": 93501 } }, { "ph": "s", "id": 93501, "pid": 76337, "tid": -914061504, "ts": 1716454222764643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222825873, "dur": 24, "args": { "External id": 93510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93510, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93510, "pid": 5, "tid": 7, "ts": 1716454222825873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764684, "dur": 10, "args": { "External id": 93510, "cbid": 211, "correlation": 93510 } }, { "ph": "s", "id": 93510, "pid": 76337, "tid": -914061504, "ts": 1716454222764684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222825898, "dur": 23, "args": { "External id": 93530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93530, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 93530, "pid": 5, "tid": 7, "ts": 1716454222825898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764759, "dur": 12, "args": { "External id": 93530, "cbid": 211, "correlation": 93530 } }, { "ph": "s", "id": 93530, "pid": 76337, "tid": -914061504, "ts": 1716454222764759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222825923, "dur": 5, "args": { "External id": 93542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93542, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93542, "pid": 5, "tid": 7, "ts": 1716454222825923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764780, "dur": 7, "args": { "External id": 93542, "cbid": 211, "correlation": 93542 } }, { "ph": "s", "id": 93542, "pid": 76337, "tid": -914061504, "ts": 1716454222764780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222825929, "dur": 24, "args": { "External id": 93545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93545, "pid": 5, "tid": 7, "ts": 1716454222825929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764799, "dur": 7, "args": { "External id": 93545, "cbid": 211, "correlation": 93545 } }, { "ph": "s", "id": 93545, "pid": 76337, "tid": -914061504, "ts": 1716454222764799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222825954, "dur": 17, "args": { "External id": 93554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93554, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93554, "pid": 5, "tid": 7, "ts": 1716454222825954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764839, "dur": 10, "args": { "External id": 93554, "cbid": 211, "correlation": 93554 } }, { "ph": "s", "id": 93554, "pid": 76337, "tid": -914061504, "ts": 1716454222764839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222764892, "dur": 0, "args": { "External id": 93564, "cbid": 317, "correlation": 93564 } }, { "ph": "f", "id": 93564, "pid": 76337, "tid": -914061504, "ts": 1716454222764892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222764893, "dur": 0, "args": { "External id": 93565, "cbid": 203, "correlation": 93565 } }, { "ph": "f", "id": 93565, "pid": 76337, "tid": -914061504, "ts": 1716454222764893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222764893, "dur": 0, "args": { "External id": 93566, "cbid": 205, "correlation": 93566 } }, { "ph": "f", "id": 93566, "pid": 76337, "tid": -914061504, "ts": 1716454222764893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222825972, "dur": 17, "args": { "External id": 93570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93570, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93570, "pid": 5, "tid": 7, "ts": 1716454222825972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764908, "dur": 12, "args": { "External id": 93570, "cbid": 211, "correlation": 93570 } }, { "ph": "s", "id": 93570, "pid": 76337, "tid": -914061504, "ts": 1716454222764908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222825991, "dur": 236, "args": { "External id": 93572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93572, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93572, "pid": 5, "tid": 7, "ts": 1716454222825991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764923, "dur": 5, "args": { "External id": 93572, "cbid": 211, "correlation": 93572 } }, { "ph": "s", "id": 93572, "pid": 76337, "tid": -914061504, "ts": 1716454222764923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222826230, "dur": 1, "args": { "External id": 93574, "device": 5, "context": 1, "stream": 7, "correlation": 93574, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 93574, "pid": 5, "tid": 7, "ts": 1716454222826230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222764935, "dur": 8, "args": { "External id": 93574, "cbid": 51, "correlation": 93574 } }, { "ph": "s", "id": 93574, "pid": 76337, "tid": -914061504, "ts": 1716454222764935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222826233, "dur": 804, "args": { "External id": 93575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93575, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93575, "pid": 5, "tid": 7, "ts": 1716454222826233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764944, "dur": 6, "args": { "External id": 93575, "cbid": 211, "correlation": 93575 } }, { "ph": "s", "id": 93575, "pid": 76337, "tid": -914061504, "ts": 1716454222764944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222827039, "dur": 13, "args": { "External id": 93577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93577, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93577, "pid": 5, "tid": 7, "ts": 1716454222827039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764955, "dur": 6, "args": { "External id": 93577, "cbid": 211, "correlation": 93577 } }, { "ph": "s", "id": 93577, "pid": 76337, "tid": -914061504, "ts": 1716454222764955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222827053, "dur": 14, "args": { "External id": 93583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93583, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93583, "pid": 5, "tid": 7, "ts": 1716454222827053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222764991, "dur": 9, "args": { "External id": 93583, "cbid": 211, "correlation": 93583 } }, { "ph": "s", "id": 93583, "pid": 76337, "tid": -914061504, "ts": 1716454222764991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222827069, "dur": 3, "args": { "External id": 93591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93591, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 93591, "pid": 5, "tid": 7, "ts": 1716454222827069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765038, "dur": 10, "args": { "External id": 93591, "cbid": 211, "correlation": 93591 } }, { "ph": "s", "id": 93591, "pid": 76337, "tid": -914061504, "ts": 1716454222765038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222765105, "dur": 1, "args": { "External id": 93607, "cbid": 251, "correlation": 93607 } }, { "ph": "f", "id": 93607, "pid": 76337, "tid": -914061504, "ts": 1716454222765105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222765110, "dur": 0, "args": { "External id": 93609, "cbid": 251, "correlation": 93609 } }, { "ph": "f", "id": 93609, "pid": 76337, "tid": -914061504, "ts": 1716454222765110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222827073, "dur": 13, "args": { "External id": 93610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93610, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93610, "pid": 5, "tid": 7, "ts": 1716454222827073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765112, "dur": 11, "args": { "External id": 93610, "cbid": 211, "correlation": 93610 } }, { "ph": "s", "id": 93610, "pid": 76337, "tid": -914061504, "ts": 1716454222765112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222827088, "dur": 5, "args": { "External id": 93612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93612, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93612, "pid": 5, "tid": 7, "ts": 1716454222827088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765125, "dur": 6, "args": { "External id": 93612, "cbid": 211, "correlation": 93612 } }, { "ph": "s", "id": 93612, "pid": 76337, "tid": -914061504, "ts": 1716454222765125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222827094, "dur": 17, "args": { "External id": 93622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93622, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93622, "pid": 5, "tid": 7, "ts": 1716454222827094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765185, "dur": 12, "args": { "External id": 93622, "cbid": 211, "correlation": 93622 } }, { "ph": "s", "id": 93622, "pid": 76337, "tid": -914061504, "ts": 1716454222765185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222827113, "dur": 19, "args": { "External id": 93642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93642, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 93642, "pid": 5, "tid": 7, "ts": 1716454222827113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765252, "dur": 11, "args": { "External id": 93642, "cbid": 211, "correlation": 93642 } }, { "ph": "s", "id": 93642, "pid": 76337, "tid": -914061504, "ts": 1716454222765252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222827132, "dur": 4, "args": { "External id": 93654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93654, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 93654, "pid": 5, "tid": 7, "ts": 1716454222827132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765273, "dur": 6, "args": { "External id": 93654, "cbid": 211, "correlation": 93654 } }, { "ph": "s", "id": 93654, "pid": 76337, "tid": -914061504, "ts": 1716454222765273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222827137, "dur": 16, "args": { "External id": 93657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93657, "pid": 5, "tid": 7, "ts": 1716454222827137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765290, "dur": 6, "args": { "External id": 93657, "cbid": 211, "correlation": 93657 } }, { "ph": "s", "id": 93657, "pid": 76337, "tid": -914061504, "ts": 1716454222765290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222827155, "dur": 11, "args": { "External id": 93666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93666, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93666, "pid": 5, "tid": 7, "ts": 1716454222827155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765331, "dur": 10, "args": { "External id": 93666, "cbid": 211, "correlation": 93666 } }, { "ph": "s", "id": 93666, "pid": 76337, "tid": -914061504, "ts": 1716454222765331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222765394, "dur": 0, "args": { "External id": 93676, "cbid": 317, "correlation": 93676 } }, { "ph": "f", "id": 93676, "pid": 76337, "tid": -914061504, "ts": 1716454222765394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222765395, "dur": 0, "args": { "External id": 93677, "cbid": 203, "correlation": 93677 } }, { "ph": "f", "id": 93677, "pid": 76337, "tid": -914061504, "ts": 1716454222765395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222765395, "dur": 0, "args": { "External id": 93678, "cbid": 205, "correlation": 93678 } }, { "ph": "f", "id": 93678, "pid": 76337, "tid": -914061504, "ts": 1716454222765395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222827167, "dur": 11, "args": { "External id": 93682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93682, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93682, "pid": 5, "tid": 7, "ts": 1716454222827167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765409, "dur": 12, "args": { "External id": 93682, "cbid": 211, "correlation": 93682 } }, { "ph": "s", "id": 93682, "pid": 76337, "tid": -914061504, "ts": 1716454222765409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222827180, "dur": 161, "args": { "External id": 93684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93684, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93684, "pid": 5, "tid": 7, "ts": 1716454222827180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765423, "dur": 5, "args": { "External id": 93684, "cbid": 211, "correlation": 93684 } }, { "ph": "s", "id": 93684, "pid": 76337, "tid": -914061504, "ts": 1716454222765423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222827342, "dur": 1, "args": { "External id": 93686, "device": 5, "context": 1, "stream": 7, "correlation": 93686, "bytes": 960, "memory bandwidth (GB/s)": 0.5996252342286071 } }, { "ph": "f", "id": 93686, "pid": 5, "tid": 7, "ts": 1716454222827342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222765434, "dur": 6, "args": { "External id": 93686, "cbid": 51, "correlation": 93686 } }, { "ph": "s", "id": 93686, "pid": 76337, "tid": -914061504, "ts": 1716454222765434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222827346, "dur": 639, "args": { "External id": 93687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93687, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93687, "pid": 5, "tid": 7, "ts": 1716454222827346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765442, "dur": 6, "args": { "External id": 93687, "cbid": 211, "correlation": 93687 } }, { "ph": "s", "id": 93687, "pid": 76337, "tid": -914061504, "ts": 1716454222765442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222827987, "dur": 12, "args": { "External id": 93689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93689, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93689, "pid": 5, "tid": 7, "ts": 1716454222827987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765452, "dur": 5, "args": { "External id": 93689, "cbid": 211, "correlation": 93689 } }, { "ph": "s", "id": 93689, "pid": 76337, "tid": -914061504, "ts": 1716454222765452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222828000, "dur": 14, "args": { "External id": 93695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93695, "pid": 5, "tid": 7, "ts": 1716454222828000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765480, "dur": 9, "args": { "External id": 93695, "cbid": 211, "correlation": 93695 } }, { "ph": "s", "id": 93695, "pid": 76337, "tid": -914061504, "ts": 1716454222765480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222765539, "dur": 0, "args": { "External id": 93705, "cbid": 317, "correlation": 93705 } }, { "ph": "f", "id": 93705, "pid": 76337, "tid": -914061504, "ts": 1716454222765539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222765539, "dur": 0, "args": { "External id": 93706, "cbid": 203, "correlation": 93706 } }, { "ph": "f", "id": 93706, "pid": 76337, "tid": -914061504, "ts": 1716454222765539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222765540, "dur": 0, "args": { "External id": 93707, "cbid": 205, "correlation": 93707 } }, { "ph": "f", "id": 93707, "pid": 76337, "tid": -914061504, "ts": 1716454222765540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222828015, "dur": 18, "args": { "External id": 93711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93711, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93711, "pid": 5, "tid": 7, "ts": 1716454222828015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765554, "dur": 12, "args": { "External id": 93711, "cbid": 211, "correlation": 93711 } }, { "ph": "s", "id": 93711, "pid": 76337, "tid": -914061504, "ts": 1716454222765554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222828035, "dur": 4, "args": { "External id": 93713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93713, "pid": 5, "tid": 7, "ts": 1716454222828035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765570, "dur": 6, "args": { "External id": 93713, "cbid": 211, "correlation": 93713 } }, { "ph": "s", "id": 93713, "pid": 76337, "tid": -914061504, "ts": 1716454222765570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222765579, "dur": 0, "args": { "External id": 93714, "cbid": 51, "correlation": 93714 } }, { "ph": "s", "id": 93714, "pid": 76337, "tid": -914061504, "ts": 1716454222765579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222828040, "dur": 132, "args": { "External id": 93715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93715, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93715, "pid": 5, "tid": 7, "ts": 1716454222828040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765580, "dur": 5, "args": { "External id": 93715, "cbid": 211, "correlation": 93715 } }, { "ph": "s", "id": 93715, "pid": 76337, "tid": -914061504, "ts": 1716454222765580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222828174, "dur": 15, "args": { "External id": 93720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93720, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93720, "pid": 5, "tid": 7, "ts": 1716454222828174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765606, "dur": 8, "args": { "External id": 93720, "cbid": 211, "correlation": 93720 } }, { "ph": "s", "id": 93720, "pid": 76337, "tid": -914061504, "ts": 1716454222765606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222828189, "dur": 13, "args": { "External id": 93728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93728, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93728, "pid": 5, "tid": 7, "ts": 1716454222828189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765635, "dur": 8, "args": { "External id": 93728, "cbid": 211, "correlation": 93728 } }, { "ph": "s", "id": 93728, "pid": 76337, "tid": -914061504, "ts": 1716454222765635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222828204, "dur": 10, "args": { "External id": 93736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93736, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93736, "pid": 5, "tid": 7, "ts": 1716454222828204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765664, "dur": 9, "args": { "External id": 93736, "cbid": 211, "correlation": 93736 } }, { "ph": "s", "id": 93736, "pid": 76337, "tid": -914061504, "ts": 1716454222765664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222828215, "dur": 18, "args": { "External id": 93756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93756, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 93756, "pid": 5, "tid": 7, "ts": 1716454222828215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765748, "dur": 12, "args": { "External id": 93756, "cbid": 211, "correlation": 93756 } }, { "ph": "s", "id": 93756, "pid": 76337, "tid": -914061504, "ts": 1716454222765748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222828235, "dur": 4, "args": { "External id": 93768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93768, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 93768, "pid": 5, "tid": 7, "ts": 1716454222828235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765770, "dur": 7, "args": { "External id": 93768, "cbid": 211, "correlation": 93768 } }, { "ph": "s", "id": 93768, "pid": 76337, "tid": -914061504, "ts": 1716454222765770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222828240, "dur": 16, "args": { "External id": 93771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93771, "pid": 5, "tid": 7, "ts": 1716454222828240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765788, "dur": 6, "args": { "External id": 93771, "cbid": 211, "correlation": 93771 } }, { "ph": "s", "id": 93771, "pid": 76337, "tid": -914061504, "ts": 1716454222765788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222765845, "dur": 0, "args": { "External id": 93782, "cbid": 317, "correlation": 93782 } }, { "ph": "f", "id": 93782, "pid": 76337, "tid": -914061504, "ts": 1716454222765845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222765845, "dur": 0, "args": { "External id": 93783, "cbid": 203, "correlation": 93783 } }, { "ph": "f", "id": 93783, "pid": 76337, "tid": -914061504, "ts": 1716454222765845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222765846, "dur": 0, "args": { "External id": 93784, "cbid": 205, "correlation": 93784 } }, { "ph": "f", "id": 93784, "pid": 76337, "tid": -914061504, "ts": 1716454222765846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222828258, "dur": 12, "args": { "External id": 93788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93788, "pid": 5, "tid": 7, "ts": 1716454222828258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765859, "dur": 12, "args": { "External id": 93788, "cbid": 211, "correlation": 93788 } }, { "ph": "s", "id": 93788, "pid": 76337, "tid": -914061504, "ts": 1716454222765859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222828271, "dur": 3, "args": { "External id": 93790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93790, "pid": 5, "tid": 7, "ts": 1716454222828271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765876, "dur": 6, "args": { "External id": 93790, "cbid": 211, "correlation": 93790 } }, { "ph": "s", "id": 93790, "pid": 76337, "tid": -914061504, "ts": 1716454222765876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222765884, "dur": 0, "args": { "External id": 93791, "cbid": 51, "correlation": 93791 } }, { "ph": "s", "id": 93791, "pid": 76337, "tid": -914061504, "ts": 1716454222765884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222828276, "dur": 89, "args": { "External id": 93792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93792, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 93792, "pid": 5, "tid": 7, "ts": 1716454222828276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765884, "dur": 5, "args": { "External id": 93792, "cbid": 211, "correlation": 93792 } }, { "ph": "s", "id": 93792, "pid": 76337, "tid": -914061504, "ts": 1716454222765884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222828366, "dur": 16, "args": { "External id": 93797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93797, "pid": 5, "tid": 7, "ts": 1716454222828366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222765912, "dur": 9, "args": { "External id": 93797, "cbid": 211, "correlation": 93797 } }, { "ph": "s", "id": 93797, "pid": 76337, "tid": -914061504, "ts": 1716454222765912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222828383, "dur": 83, "args": { "External id": 93806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93806, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93806, "pid": 5, "tid": 7, "ts": 1716454222828383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766009, "dur": 15, "args": { "External id": 93806, "cbid": 211, "correlation": 93806 } }, { "ph": "s", "id": 93806, "pid": 76337, "tid": -914061504, "ts": 1716454222766009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222828467, "dur": 30, "args": { "External id": 93828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93828, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93828, "pid": 5, "tid": 7, "ts": 1716454222828467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766070, "dur": 11, "args": { "External id": 93828, "cbid": 211, "correlation": 93828 } }, { "ph": "s", "id": 93828, "pid": 76337, "tid": -914061504, "ts": 1716454222766070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766165, "dur": 2, "args": { "External id": 93839, "cbid": 251, "correlation": 93839 } }, { "ph": "f", "id": 93839, "pid": 76337, "tid": -914061504, "ts": 1716454222766165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222828498, "dur": 162, "args": { "External id": 93840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93840, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93840, "pid": 5, "tid": 7, "ts": 1716454222828498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766170, "dur": 14, "args": { "External id": 93840, "cbid": 211, "correlation": 93840 } }, { "ph": "s", "id": 93840, "pid": 76337, "tid": -914061504, "ts": 1716454222766170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766241, "dur": 1, "args": { "External id": 93851, "cbid": 251, "correlation": 93851 } }, { "ph": "f", "id": 93851, "pid": 76337, "tid": -914061504, "ts": 1716454222766241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222828661, "dur": 155, "args": { "External id": 93852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93852, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93852, "pid": 5, "tid": 7, "ts": 1716454222828661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766245, "dur": 12, "args": { "External id": 93852, "cbid": 211, "correlation": 93852 } }, { "ph": "s", "id": 93852, "pid": 76337, "tid": -914061504, "ts": 1716454222766245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766310, "dur": 1, "args": { "External id": 93863, "cbid": 251, "correlation": 93863 } }, { "ph": "f", "id": 93863, "pid": 76337, "tid": -914061504, "ts": 1716454222766310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222828817, "dur": 157, "args": { "External id": 93864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93864, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93864, "pid": 5, "tid": 7, "ts": 1716454222828817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766314, "dur": 11, "args": { "External id": 93864, "cbid": 211, "correlation": 93864 } }, { "ph": "s", "id": 93864, "pid": 76337, "tid": -914061504, "ts": 1716454222766314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222828975, "dur": 329, "args": { "External id": 93889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93889, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93889, "pid": 5, "tid": 7, "ts": 1716454222828975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766403, "dur": 14, "args": { "External id": 93889, "cbid": 211, "correlation": 93889 } }, { "ph": "s", "id": 93889, "pid": 76337, "tid": -914061504, "ts": 1716454222766403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766507, "dur": 1, "args": { "External id": 93907, "cbid": 251, "correlation": 93907 } }, { "ph": "f", "id": 93907, "pid": 76337, "tid": -914061504, "ts": 1716454222766507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222829306, "dur": 164, "args": { "External id": 93909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93909, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93909, "pid": 5, "tid": 7, "ts": 1716454222829306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766513, "dur": 14, "args": { "External id": 93909, "cbid": 211, "correlation": 93909 } }, { "ph": "s", "id": 93909, "pid": 76337, "tid": -914061504, "ts": 1716454222766513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222829471, "dur": 19, "args": { "External id": 93917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93917, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93917, "pid": 5, "tid": 7, "ts": 1716454222829471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766583, "dur": 12, "args": { "External id": 93917, "cbid": 211, "correlation": 93917 } }, { "ph": "s", "id": 93917, "pid": 76337, "tid": -914061504, "ts": 1716454222766583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222829492, "dur": 28, "args": { "External id": 93925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93925, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93925, "pid": 5, "tid": 7, "ts": 1716454222829492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766622, "dur": 9, "args": { "External id": 93925, "cbid": 211, "correlation": 93925 } }, { "ph": "s", "id": 93925, "pid": 76337, "tid": -914061504, "ts": 1716454222766622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222829521, "dur": 18, "args": { "External id": 93936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93936, "pid": 5, "tid": 7, "ts": 1716454222829521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766697, "dur": 13, "args": { "External id": 93936, "cbid": 211, "correlation": 93936 } }, { "ph": "s", "id": 93936, "pid": 76337, "tid": -914061504, "ts": 1716454222766697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222829540, "dur": 16, "args": { "External id": 93958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93958, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 93958, "pid": 5, "tid": 7, "ts": 1716454222829540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766729, "dur": 10, "args": { "External id": 93958, "cbid": 211, "correlation": 93958 } }, { "ph": "s", "id": 93958, "pid": 76337, "tid": -914061504, "ts": 1716454222766729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766817, "dur": 1, "args": { "External id": 93969, "cbid": 251, "correlation": 93969 } }, { "ph": "f", "id": 93969, "pid": 76337, "tid": -914061504, "ts": 1716454222766817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222829558, "dur": 88, "args": { "External id": 93970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93970, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 93970, "pid": 5, "tid": 7, "ts": 1716454222829558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766822, "dur": 13, "args": { "External id": 93970, "cbid": 211, "correlation": 93970 } }, { "ph": "s", "id": 93970, "pid": 76337, "tid": -914061504, "ts": 1716454222766822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766895, "dur": 1, "args": { "External id": 93981, "cbid": 251, "correlation": 93981 } }, { "ph": "f", "id": 93981, "pid": 76337, "tid": -914061504, "ts": 1716454222766895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766899, "dur": 0, "args": { "External id": 93982, "cbid": 251, "correlation": 93982 } }, { "ph": "f", "id": 93982, "pid": 76337, "tid": -914061504, "ts": 1716454222766899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222829647, "dur": 12, "args": { "External id": 93983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93983, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93983, "pid": 5, "tid": 7, "ts": 1716454222829647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766901, "dur": 12, "args": { "External id": 93983, "cbid": 211, "correlation": 93983 } }, { "ph": "s", "id": 93983, "pid": 76337, "tid": -914061504, "ts": 1716454222766901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222829660, "dur": 6, "args": { "External id": 93985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93985, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93985, "pid": 5, "tid": 7, "ts": 1716454222829660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766915, "dur": 7, "args": { "External id": 93985, "cbid": 211, "correlation": 93985 } }, { "ph": "s", "id": 93985, "pid": 76337, "tid": -914061504, "ts": 1716454222766915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766981, "dur": 1, "args": { "External id": 93996, "cbid": 251, "correlation": 93996 } }, { "ph": "f", "id": 93996, "pid": 76337, "tid": -914061504, "ts": 1716454222766981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222766985, "dur": 0, "args": { "External id": 93997, "cbid": 251, "correlation": 93997 } }, { "ph": "f", "id": 93997, "pid": 76337, "tid": -914061504, "ts": 1716454222766985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222829667, "dur": 8, "args": { "External id": 93998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 93998, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 93998, "pid": 5, "tid": 7, "ts": 1716454222829667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222766987, "dur": 12, "args": { "External id": 93998, "cbid": 211, "correlation": 93998 } }, { "ph": "s", "id": 93998, "pid": 76337, "tid": -914061504, "ts": 1716454222766987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222829677, "dur": 4, "args": { "External id": 94000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94000, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94000, "pid": 5, "tid": 7, "ts": 1716454222829677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767001, "dur": 6, "args": { "External id": 94000, "cbid": 211, "correlation": 94000 } }, { "ph": "s", "id": 94000, "pid": 76337, "tid": -914061504, "ts": 1716454222767001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222829681, "dur": 55, "args": { "External id": 94025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94025, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94025, "pid": 5, "tid": 7, "ts": 1716454222829681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767079, "dur": 13, "args": { "External id": 94025, "cbid": 211, "correlation": 94025 } }, { "ph": "s", "id": 94025, "pid": 76337, "tid": -914061504, "ts": 1716454222767079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222767180, "dur": 2, "args": { "External id": 94043, "cbid": 251, "correlation": 94043 } }, { "ph": "f", "id": 94043, "pid": 76337, "tid": -914061504, "ts": 1716454222767180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222829738, "dur": 90, "args": { "External id": 94045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94045, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 94045, "pid": 5, "tid": 7, "ts": 1716454222829738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767186, "dur": 15, "args": { "External id": 94045, "cbid": 211, "correlation": 94045 } }, { "ph": "s", "id": 94045, "pid": 76337, "tid": -914061504, "ts": 1716454222767186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222829829, "dur": 9, "args": { "External id": 94053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94053, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94053, "pid": 5, "tid": 7, "ts": 1716454222829829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767259, "dur": 12, "args": { "External id": 94053, "cbid": 211, "correlation": 94053 } }, { "ph": "s", "id": 94053, "pid": 76337, "tid": -914061504, "ts": 1716454222767259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222829840, "dur": 21, "args": { "External id": 94061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94061, "pid": 5, "tid": 7, "ts": 1716454222829840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767300, "dur": 10, "args": { "External id": 94061, "cbid": 211, "correlation": 94061 } }, { "ph": "s", "id": 94061, "pid": 76337, "tid": -914061504, "ts": 1716454222767300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222829863, "dur": 19, "args": { "External id": 94083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94083, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94083, "pid": 5, "tid": 7, "ts": 1716454222829863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767351, "dur": 10, "args": { "External id": 94083, "cbid": 211, "correlation": 94083 } }, { "ph": "s", "id": 94083, "pid": 76337, "tid": -914061504, "ts": 1716454222767351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222767440, "dur": 1, "args": { "External id": 94099, "cbid": 251, "correlation": 94099 } }, { "ph": "f", "id": 94099, "pid": 76337, "tid": -914061504, "ts": 1716454222767440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222767444, "dur": 0, "args": { "External id": 94101, "cbid": 251, "correlation": 94101 } }, { "ph": "f", "id": 94101, "pid": 76337, "tid": -914061504, "ts": 1716454222767444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222829883, "dur": 494, "args": { "External id": 94102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94102, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94102, "pid": 5, "tid": 7, "ts": 1716454222829883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767446, "dur": 13, "args": { "External id": 94102, "cbid": 211, "correlation": 94102 } }, { "ph": "s", "id": 94102, "pid": 76337, "tid": -914061504, "ts": 1716454222767446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222830378, "dur": 64, "args": { "External id": 94110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94110, "pid": 5, "tid": 7, "ts": 1716454222830378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767516, "dur": 13, "args": { "External id": 94110, "cbid": 211, "correlation": 94110 } }, { "ph": "s", "id": 94110, "pid": 76337, "tid": -914061504, "ts": 1716454222767516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222830443, "dur": 66, "args": { "External id": 94118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94118, "pid": 5, "tid": 7, "ts": 1716454222830443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767550, "dur": 9, "args": { "External id": 94118, "cbid": 211, "correlation": 94118 } }, { "ph": "s", "id": 94118, "pid": 76337, "tid": -914061504, "ts": 1716454222767550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222767629, "dur": 1, "args": { "External id": 94134, "cbid": 251, "correlation": 94134 } }, { "ph": "f", "id": 94134, "pid": 76337, "tid": -914061504, "ts": 1716454222767629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222830512, "dur": 1, "args": { "External id": 94136, "device": 5, "context": 1, "stream": 7, "correlation": 94136, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 94136, "pid": 5, "tid": 7, "ts": 1716454222830512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222767634, "dur": 11, "args": { "External id": 94136, "cbid": 51, "correlation": 94136 } }, { "ph": "s", "id": 94136, "pid": 76337, "tid": -914061504, "ts": 1716454222767634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222830516, "dur": 267, "args": { "External id": 94137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94137, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 94137, "pid": 5, "tid": 7, "ts": 1716454222830516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767647, "dur": 11, "args": { "External id": 94137, "cbid": 211, "correlation": 94137 } }, { "ph": "s", "id": 94137, "pid": 76337, "tid": -914061504, "ts": 1716454222767647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222830784, "dur": 15, "args": { "External id": 94145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94145, "pid": 5, "tid": 7, "ts": 1716454222830784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767689, "dur": 11, "args": { "External id": 94145, "cbid": 211, "correlation": 94145 } }, { "ph": "s", "id": 94145, "pid": 76337, "tid": -914061504, "ts": 1716454222767689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222830800, "dur": 37, "args": { "External id": 94156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94156, "pid": 5, "tid": 7, "ts": 1716454222830800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767760, "dur": 12, "args": { "External id": 94156, "cbid": 211, "correlation": 94156 } }, { "ph": "s", "id": 94156, "pid": 76337, "tid": -914061504, "ts": 1716454222767760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222767828, "dur": 0, "args": { "External id": 94168, "cbid": 317, "correlation": 94168 } }, { "ph": "f", "id": 94168, "pid": 76337, "tid": -914061504, "ts": 1716454222767828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222767829, "dur": 0, "args": { "External id": 94169, "cbid": 203, "correlation": 94169 } }, { "ph": "f", "id": 94169, "pid": 76337, "tid": -914061504, "ts": 1716454222767829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222767830, "dur": 0, "args": { "External id": 94170, "cbid": 205, "correlation": 94170 } }, { "ph": "f", "id": 94170, "pid": 76337, "tid": -914061504, "ts": 1716454222767830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222830838, "dur": 13, "args": { "External id": 94174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94174, "pid": 5, "tid": 7, "ts": 1716454222830838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767846, "dur": 12, "args": { "External id": 94174, "cbid": 211, "correlation": 94174 } }, { "ph": "s", "id": 94174, "pid": 76337, "tid": -914061504, "ts": 1716454222767846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222830852, "dur": 4, "args": { "External id": 94176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 94176, "pid": 5, "tid": 7, "ts": 1716454222830852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767864, "dur": 7, "args": { "External id": 94176, "cbid": 211, "correlation": 94176 } }, { "ph": "s", "id": 94176, "pid": 76337, "tid": -914061504, "ts": 1716454222767864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222767873, "dur": 0, "args": { "External id": 94177, "cbid": 51, "correlation": 94177 } }, { "ph": "s", "id": 94177, "pid": 76337, "tid": -914061504, "ts": 1716454222767873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222830857, "dur": 95, "args": { "External id": 94178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94178, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 94178, "pid": 5, "tid": 7, "ts": 1716454222830857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767874, "dur": 5, "args": { "External id": 94178, "cbid": 211, "correlation": 94178 } }, { "ph": "s", "id": 94178, "pid": 76337, "tid": -914061504, "ts": 1716454222767874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222830953, "dur": 16, "args": { "External id": 94183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94183, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94183, "pid": 5, "tid": 7, "ts": 1716454222830953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767901, "dur": 8, "args": { "External id": 94183, "cbid": 211, "correlation": 94183 } }, { "ph": "s", "id": 94183, "pid": 76337, "tid": -914061504, "ts": 1716454222767901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222830971, "dur": 12, "args": { "External id": 94191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94191, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94191, "pid": 5, "tid": 7, "ts": 1716454222830971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222767933, "dur": 8, "args": { "External id": 94191, "cbid": 211, "correlation": 94191 } }, { "ph": "s", "id": 94191, "pid": 76337, "tid": -914061504, "ts": 1716454222767933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222830984, "dur": 55, "args": { "External id": 94202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94202, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94202, "pid": 5, "tid": 7, "ts": 1716454222830984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768010, "dur": 13, "args": { "External id": 94202, "cbid": 211, "correlation": 94202 } }, { "ph": "s", "id": 94202, "pid": 76337, "tid": -914061504, "ts": 1716454222768010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222768069, "dur": 0, "args": { "External id": 94212, "cbid": 317, "correlation": 94212 } }, { "ph": "f", "id": 94212, "pid": 76337, "tid": -914061504, "ts": 1716454222768069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222768070, "dur": 0, "args": { "External id": 94213, "cbid": 203, "correlation": 94213 } }, { "ph": "f", "id": 94213, "pid": 76337, "tid": -914061504, "ts": 1716454222768070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222768070, "dur": 0, "args": { "External id": 94214, "cbid": 205, "correlation": 94214 } }, { "ph": "f", "id": 94214, "pid": 76337, "tid": -914061504, "ts": 1716454222768070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222831041, "dur": 39, "args": { "External id": 94218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94218, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94218, "pid": 5, "tid": 7, "ts": 1716454222831041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768088, "dur": 12, "args": { "External id": 94218, "cbid": 211, "correlation": 94218 } }, { "ph": "s", "id": 94218, "pid": 76337, "tid": -914061504, "ts": 1716454222768088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222831081, "dur": 160, "args": { "External id": 94220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94220, "pid": 5, "tid": 7, "ts": 1716454222831081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768103, "dur": 5, "args": { "External id": 94220, "cbid": 211, "correlation": 94220 } }, { "ph": "s", "id": 94220, "pid": 76337, "tid": -914061504, "ts": 1716454222768103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222831242, "dur": 1965, "args": { "External id": 94222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94222, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94222, "pid": 5, "tid": 7, "ts": 1716454222831242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768116, "dur": 8, "args": { "External id": 94222, "cbid": 211, "correlation": 94222 } }, { "ph": "s", "id": 94222, "pid": 76337, "tid": -914061504, "ts": 1716454222768116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222833209, "dur": 39, "args": { "External id": 94224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94224, "pid": 5, "tid": 7, "ts": 1716454222833209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768129, "dur": 5, "args": { "External id": 94224, "cbid": 211, "correlation": 94224 } }, { "ph": "s", "id": 94224, "pid": 76337, "tid": -914061504, "ts": 1716454222768129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222833249, "dur": 58, "args": { "External id": 94230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94230, "pid": 5, "tid": 7, "ts": 1716454222833249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768158, "dur": 8, "args": { "External id": 94230, "cbid": 211, "correlation": 94230 } }, { "ph": "s", "id": 94230, "pid": 76337, "tid": -914061504, "ts": 1716454222768158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222833308, "dur": 84, "args": { "External id": 94239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94239, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94239, "pid": 5, "tid": 7, "ts": 1716454222833308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768256, "dur": 14, "args": { "External id": 94239, "cbid": 211, "correlation": 94239 } }, { "ph": "s", "id": 94239, "pid": 76337, "tid": -914061504, "ts": 1716454222768256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222833394, "dur": 73, "args": { "External id": 94259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94259, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 94259, "pid": 5, "tid": 7, "ts": 1716454222833394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768331, "dur": 11, "args": { "External id": 94259, "cbid": 211, "correlation": 94259 } }, { "ph": "s", "id": 94259, "pid": 76337, "tid": -914061504, "ts": 1716454222768331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222833468, "dur": 5, "args": { "External id": 94271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94271, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 94271, "pid": 5, "tid": 7, "ts": 1716454222833468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768352, "dur": 7, "args": { "External id": 94271, "cbid": 211, "correlation": 94271 } }, { "ph": "s", "id": 94271, "pid": 76337, "tid": -914061504, "ts": 1716454222768352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222833474, "dur": 81, "args": { "External id": 94274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94274, "pid": 5, "tid": 7, "ts": 1716454222833474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768372, "dur": 7, "args": { "External id": 94274, "cbid": 211, "correlation": 94274 } }, { "ph": "s", "id": 94274, "pid": 76337, "tid": -914061504, "ts": 1716454222768372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222833556, "dur": 53, "args": { "External id": 94283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94283, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94283, "pid": 5, "tid": 7, "ts": 1716454222833556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768413, "dur": 10, "args": { "External id": 94283, "cbid": 211, "correlation": 94283 } }, { "ph": "s", "id": 94283, "pid": 76337, "tid": -914061504, "ts": 1716454222768413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222768466, "dur": 0, "args": { "External id": 94293, "cbid": 317, "correlation": 94293 } }, { "ph": "f", "id": 94293, "pid": 76337, "tid": -914061504, "ts": 1716454222768466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222768466, "dur": 0, "args": { "External id": 94294, "cbid": 203, "correlation": 94294 } }, { "ph": "f", "id": 94294, "pid": 76337, "tid": -914061504, "ts": 1716454222768466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222768467, "dur": 0, "args": { "External id": 94295, "cbid": 205, "correlation": 94295 } }, { "ph": "f", "id": 94295, "pid": 76337, "tid": -914061504, "ts": 1716454222768467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222833611, "dur": 57, "args": { "External id": 94299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94299, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94299, "pid": 5, "tid": 7, "ts": 1716454222833611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768484, "dur": 11, "args": { "External id": 94299, "cbid": 211, "correlation": 94299 } }, { "ph": "s", "id": 94299, "pid": 76337, "tid": -914061504, "ts": 1716454222768484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222833669, "dur": 121, "args": { "External id": 94301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94301, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94301, "pid": 5, "tid": 7, "ts": 1716454222833669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768498, "dur": 5, "args": { "External id": 94301, "cbid": 211, "correlation": 94301 } }, { "ph": "s", "id": 94301, "pid": 76337, "tid": -914061504, "ts": 1716454222768498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222833791, "dur": 1873, "args": { "External id": 94303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94303, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94303, "pid": 5, "tid": 7, "ts": 1716454222833791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768509, "dur": 6, "args": { "External id": 94303, "cbid": 211, "correlation": 94303 } }, { "ph": "s", "id": 94303, "pid": 76337, "tid": -914061504, "ts": 1716454222768509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222835665, "dur": 20, "args": { "External id": 94305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94305, "pid": 5, "tid": 7, "ts": 1716454222835665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768519, "dur": 5, "args": { "External id": 94305, "cbid": 211, "correlation": 94305 } }, { "ph": "s", "id": 94305, "pid": 76337, "tid": -914061504, "ts": 1716454222768519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222835686, "dur": 32, "args": { "External id": 94311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94311, "pid": 5, "tid": 7, "ts": 1716454222835686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768548, "dur": 9, "args": { "External id": 94311, "cbid": 211, "correlation": 94311 } }, { "ph": "s", "id": 94311, "pid": 76337, "tid": -914061504, "ts": 1716454222768548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222835720, "dur": 3, "args": { "External id": 94319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94319, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 94319, "pid": 5, "tid": 7, "ts": 1716454222835720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768592, "dur": 10, "args": { "External id": 94319, "cbid": 211, "correlation": 94319 } }, { "ph": "s", "id": 94319, "pid": 76337, "tid": -914061504, "ts": 1716454222768592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222768662, "dur": 1, "args": { "External id": 94335, "cbid": 251, "correlation": 94335 } }, { "ph": "f", "id": 94335, "pid": 76337, "tid": -914061504, "ts": 1716454222768662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222768668, "dur": 0, "args": { "External id": 94337, "cbid": 251, "correlation": 94337 } }, { "ph": "f", "id": 94337, "pid": 76337, "tid": -914061504, "ts": 1716454222768668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222835724, "dur": 12, "args": { "External id": 94338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94338, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 94338, "pid": 5, "tid": 7, "ts": 1716454222835724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768670, "dur": 12, "args": { "External id": 94338, "cbid": 211, "correlation": 94338 } }, { "ph": "s", "id": 94338, "pid": 76337, "tid": -914061504, "ts": 1716454222768670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222835738, "dur": 5, "args": { "External id": 94340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94340, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 94340, "pid": 5, "tid": 7, "ts": 1716454222835738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768684, "dur": 6, "args": { "External id": 94340, "cbid": 211, "correlation": 94340 } }, { "ph": "s", "id": 94340, "pid": 76337, "tid": -914061504, "ts": 1716454222768684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222835744, "dur": 29, "args": { "External id": 94350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94350, "pid": 5, "tid": 7, "ts": 1716454222835744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768745, "dur": 13, "args": { "External id": 94350, "cbid": 211, "correlation": 94350 } }, { "ph": "s", "id": 94350, "pid": 76337, "tid": -914061504, "ts": 1716454222768745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222835774, "dur": 31, "args": { "External id": 94370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94370, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 94370, "pid": 5, "tid": 7, "ts": 1716454222835774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768812, "dur": 11, "args": { "External id": 94370, "cbid": 211, "correlation": 94370 } }, { "ph": "s", "id": 94370, "pid": 76337, "tid": -914061504, "ts": 1716454222768812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222835807, "dur": 4, "args": { "External id": 94382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94382, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 94382, "pid": 5, "tid": 7, "ts": 1716454222835807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768833, "dur": 6, "args": { "External id": 94382, "cbid": 211, "correlation": 94382 } }, { "ph": "s", "id": 94382, "pid": 76337, "tid": -914061504, "ts": 1716454222768833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222835812, "dur": 30, "args": { "External id": 94385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94385, "pid": 5, "tid": 7, "ts": 1716454222835812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768851, "dur": 6, "args": { "External id": 94385, "cbid": 211, "correlation": 94385 } }, { "ph": "s", "id": 94385, "pid": 76337, "tid": -914061504, "ts": 1716454222768851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222835843, "dur": 21, "args": { "External id": 94394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94394, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94394, "pid": 5, "tid": 7, "ts": 1716454222835843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768892, "dur": 10, "args": { "External id": 94394, "cbid": 211, "correlation": 94394 } }, { "ph": "s", "id": 94394, "pid": 76337, "tid": -914061504, "ts": 1716454222768892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222768956, "dur": 0, "args": { "External id": 94404, "cbid": 317, "correlation": 94404 } }, { "ph": "f", "id": 94404, "pid": 76337, "tid": -914061504, "ts": 1716454222768956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222768957, "dur": 0, "args": { "External id": 94405, "cbid": 203, "correlation": 94405 } }, { "ph": "f", "id": 94405, "pid": 76337, "tid": -914061504, "ts": 1716454222768957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222768958, "dur": 0, "args": { "External id": 94406, "cbid": 205, "correlation": 94406 } }, { "ph": "f", "id": 94406, "pid": 76337, "tid": -914061504, "ts": 1716454222768958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222835865, "dur": 23, "args": { "External id": 94410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94410, "pid": 5, "tid": 7, "ts": 1716454222835865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768981, "dur": 13, "args": { "External id": 94410, "cbid": 211, "correlation": 94410 } }, { "ph": "s", "id": 94410, "pid": 76337, "tid": -914061504, "ts": 1716454222768981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222835889, "dur": 43, "args": { "External id": 94412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94412, "pid": 5, "tid": 7, "ts": 1716454222835889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222768998, "dur": 5, "args": { "External id": 94412, "cbid": 211, "correlation": 94412 } }, { "ph": "s", "id": 94412, "pid": 76337, "tid": -914061504, "ts": 1716454222768998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222835933, "dur": 640, "args": { "External id": 94414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94414, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94414, "pid": 5, "tid": 7, "ts": 1716454222835933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769009, "dur": 6, "args": { "External id": 94414, "cbid": 211, "correlation": 94414 } }, { "ph": "s", "id": 94414, "pid": 76337, "tid": -914061504, "ts": 1716454222769009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222836575, "dur": 21, "args": { "External id": 94416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94416, "pid": 5, "tid": 7, "ts": 1716454222836575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769019, "dur": 5, "args": { "External id": 94416, "cbid": 211, "correlation": 94416 } }, { "ph": "s", "id": 94416, "pid": 76337, "tid": -914061504, "ts": 1716454222769019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222836597, "dur": 32, "args": { "External id": 94422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94422, "pid": 5, "tid": 7, "ts": 1716454222836597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769048, "dur": 9, "args": { "External id": 94422, "cbid": 211, "correlation": 94422 } }, { "ph": "s", "id": 94422, "pid": 76337, "tid": -914061504, "ts": 1716454222769048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222769106, "dur": 0, "args": { "External id": 94432, "cbid": 317, "correlation": 94432 } }, { "ph": "f", "id": 94432, "pid": 76337, "tid": -914061504, "ts": 1716454222769106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222769107, "dur": 0, "args": { "External id": 94433, "cbid": 203, "correlation": 94433 } }, { "ph": "f", "id": 94433, "pid": 76337, "tid": -914061504, "ts": 1716454222769107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222769108, "dur": 0, "args": { "External id": 94434, "cbid": 205, "correlation": 94434 } }, { "ph": "f", "id": 94434, "pid": 76337, "tid": -914061504, "ts": 1716454222769108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222836631, "dur": 56, "args": { "External id": 94438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94438, "pid": 5, "tid": 7, "ts": 1716454222836631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769120, "dur": 11, "args": { "External id": 94438, "cbid": 211, "correlation": 94438 } }, { "ph": "s", "id": 94438, "pid": 76337, "tid": -914061504, "ts": 1716454222769120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222836688, "dur": 262, "args": { "External id": 94440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94440, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94440, "pid": 5, "tid": 7, "ts": 1716454222836688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769138, "dur": 8, "args": { "External id": 94440, "cbid": 211, "correlation": 94440 } }, { "ph": "s", "id": 94440, "pid": 76337, "tid": -914061504, "ts": 1716454222769138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222836951, "dur": 21, "args": { "External id": 94442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94442, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94442, "pid": 5, "tid": 7, "ts": 1716454222836951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769150, "dur": 5, "args": { "External id": 94442, "cbid": 211, "correlation": 94442 } }, { "ph": "s", "id": 94442, "pid": 76337, "tid": -914061504, "ts": 1716454222769150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222836973, "dur": 32, "args": { "External id": 94448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94448, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94448, "pid": 5, "tid": 7, "ts": 1716454222836973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769176, "dur": 8, "args": { "External id": 94448, "cbid": 211, "correlation": 94448 } }, { "ph": "s", "id": 94448, "pid": 76337, "tid": -914061504, "ts": 1716454222769176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222837007, "dur": 27, "args": { "External id": 94456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94456, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94456, "pid": 5, "tid": 7, "ts": 1716454222837007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769204, "dur": 8, "args": { "External id": 94456, "cbid": 211, "correlation": 94456 } }, { "ph": "s", "id": 94456, "pid": 76337, "tid": -914061504, "ts": 1716454222769204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222837035, "dur": 19, "args": { "External id": 94464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94464, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94464, "pid": 5, "tid": 7, "ts": 1716454222837035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769234, "dur": 9, "args": { "External id": 94464, "cbid": 211, "correlation": 94464 } }, { "ph": "s", "id": 94464, "pid": 76337, "tid": -914061504, "ts": 1716454222769234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222837055, "dur": 29, "args": { "External id": 94484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94484, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 94484, "pid": 5, "tid": 7, "ts": 1716454222837055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769317, "dur": 12, "args": { "External id": 94484, "cbid": 211, "correlation": 94484 } }, { "ph": "s", "id": 94484, "pid": 76337, "tid": -914061504, "ts": 1716454222769317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222837086, "dur": 4, "args": { "External id": 94496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94496, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 94496, "pid": 5, "tid": 7, "ts": 1716454222837086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769340, "dur": 7, "args": { "External id": 94496, "cbid": 211, "correlation": 94496 } }, { "ph": "s", "id": 94496, "pid": 76337, "tid": -914061504, "ts": 1716454222769340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222837092, "dur": 31, "args": { "External id": 94499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94499, "pid": 5, "tid": 7, "ts": 1716454222837092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769358, "dur": 6, "args": { "External id": 94499, "cbid": 211, "correlation": 94499 } }, { "ph": "s", "id": 94499, "pid": 76337, "tid": -914061504, "ts": 1716454222769358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222769415, "dur": 0, "args": { "External id": 94510, "cbid": 317, "correlation": 94510 } }, { "ph": "f", "id": 94510, "pid": 76337, "tid": -914061504, "ts": 1716454222769415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222769416, "dur": 0, "args": { "External id": 94511, "cbid": 203, "correlation": 94511 } }, { "ph": "f", "id": 94511, "pid": 76337, "tid": -914061504, "ts": 1716454222769416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222769416, "dur": 0, "args": { "External id": 94512, "cbid": 205, "correlation": 94512 } }, { "ph": "f", "id": 94512, "pid": 76337, "tid": -914061504, "ts": 1716454222769416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222837124, "dur": 21, "args": { "External id": 94516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94516, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94516, "pid": 5, "tid": 7, "ts": 1716454222837124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769430, "dur": 11, "args": { "External id": 94516, "cbid": 211, "correlation": 94516 } }, { "ph": "s", "id": 94516, "pid": 76337, "tid": -914061504, "ts": 1716454222769430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222837147, "dur": 103, "args": { "External id": 94518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94518, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94518, "pid": 5, "tid": 7, "ts": 1716454222837147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769448, "dur": 7, "args": { "External id": 94518, "cbid": 211, "correlation": 94518 } }, { "ph": "s", "id": 94518, "pid": 76337, "tid": -914061504, "ts": 1716454222769448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222837251, "dur": 22, "args": { "External id": 94520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94520, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94520, "pid": 5, "tid": 7, "ts": 1716454222837251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769458, "dur": 5, "args": { "External id": 94520, "cbid": 211, "correlation": 94520 } }, { "ph": "s", "id": 94520, "pid": 76337, "tid": -914061504, "ts": 1716454222769458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222837274, "dur": 32, "args": { "External id": 94526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94526, "pid": 5, "tid": 7, "ts": 1716454222837274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769486, "dur": 8, "args": { "External id": 94526, "cbid": 211, "correlation": 94526 } }, { "ph": "s", "id": 94526, "pid": 76337, "tid": -914061504, "ts": 1716454222769486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222837307, "dur": 194, "args": { "External id": 94535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94535, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94535, "pid": 5, "tid": 7, "ts": 1716454222837307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769570, "dur": 14, "args": { "External id": 94535, "cbid": 211, "correlation": 94535 } }, { "ph": "s", "id": 94535, "pid": 76337, "tid": -914061504, "ts": 1716454222769570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222837503, "dur": 63, "args": { "External id": 94557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94557, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94557, "pid": 5, "tid": 7, "ts": 1716454222837503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769631, "dur": 10, "args": { "External id": 94557, "cbid": 211, "correlation": 94557 } }, { "ph": "s", "id": 94557, "pid": 76337, "tid": -914061504, "ts": 1716454222769631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222769726, "dur": 2, "args": { "External id": 94568, "cbid": 251, "correlation": 94568 } }, { "ph": "f", "id": 94568, "pid": 76337, "tid": -914061504, "ts": 1716454222769726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222837567, "dur": 152, "args": { "External id": 94569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94569, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94569, "pid": 5, "tid": 7, "ts": 1716454222837567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769732, "dur": 14, "args": { "External id": 94569, "cbid": 211, "correlation": 94569 } }, { "ph": "s", "id": 94569, "pid": 76337, "tid": -914061504, "ts": 1716454222769732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222769802, "dur": 1, "args": { "External id": 94580, "cbid": 251, "correlation": 94580 } }, { "ph": "f", "id": 94580, "pid": 76337, "tid": -914061504, "ts": 1716454222769802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222837721, "dur": 143, "args": { "External id": 94581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94581, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94581, "pid": 5, "tid": 7, "ts": 1716454222837721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769806, "dur": 11, "args": { "External id": 94581, "cbid": 211, "correlation": 94581 } }, { "ph": "s", "id": 94581, "pid": 76337, "tid": -914061504, "ts": 1716454222769806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222769871, "dur": 1, "args": { "External id": 94592, "cbid": 251, "correlation": 94592 } }, { "ph": "f", "id": 94592, "pid": 76337, "tid": -914061504, "ts": 1716454222769871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222837866, "dur": 144, "args": { "External id": 94593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94593, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94593, "pid": 5, "tid": 7, "ts": 1716454222837866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769875, "dur": 11, "args": { "External id": 94593, "cbid": 211, "correlation": 94593 } }, { "ph": "s", "id": 94593, "pid": 76337, "tid": -914061504, "ts": 1716454222769875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222838011, "dur": 1909, "args": { "External id": 94614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94614, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 94614, "pid": 5, "tid": 7, "ts": 1716454222838011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222769962, "dur": 23, "args": { "External id": 94614, "cbid": 211, "correlation": 94614 } }, { "ph": "s", "id": 94614, "pid": 76337, "tid": -914061504, "ts": 1716454222769962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770079, "dur": 2, "args": { "External id": 94632, "cbid": 251, "correlation": 94632 } }, { "ph": "f", "id": 94632, "pid": 76337, "tid": -914061504, "ts": 1716454222770079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222839922, "dur": 147, "args": { "External id": 94634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94634, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 94634, "pid": 5, "tid": 7, "ts": 1716454222839922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770085, "dur": 14, "args": { "External id": 94634, "cbid": 211, "correlation": 94634 } }, { "ph": "s", "id": 94634, "pid": 76337, "tid": -914061504, "ts": 1716454222770085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222840070, "dur": 36, "args": { "External id": 94642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94642, "pid": 5, "tid": 7, "ts": 1716454222840070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770157, "dur": 12, "args": { "External id": 94642, "cbid": 211, "correlation": 94642 } }, { "ph": "s", "id": 94642, "pid": 76337, "tid": -914061504, "ts": 1716454222770157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222840108, "dur": 51, "args": { "External id": 94650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94650, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94650, "pid": 5, "tid": 7, "ts": 1716454222840108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770197, "dur": 8, "args": { "External id": 94650, "cbid": 211, "correlation": 94650 } }, { "ph": "s", "id": 94650, "pid": 76337, "tid": -914061504, "ts": 1716454222770197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222840160, "dur": 30, "args": { "External id": 94661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94661, "pid": 5, "tid": 7, "ts": 1716454222840160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770272, "dur": 13, "args": { "External id": 94661, "cbid": 211, "correlation": 94661 } }, { "ph": "s", "id": 94661, "pid": 76337, "tid": -914061504, "ts": 1716454222770272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222840191, "dur": 34, "args": { "External id": 94683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94683, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94683, "pid": 5, "tid": 7, "ts": 1716454222840191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770304, "dur": 8, "args": { "External id": 94683, "cbid": 211, "correlation": 94683 } }, { "ph": "s", "id": 94683, "pid": 76337, "tid": -914061504, "ts": 1716454222770304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770389, "dur": 1, "args": { "External id": 94694, "cbid": 251, "correlation": 94694 } }, { "ph": "f", "id": 94694, "pid": 76337, "tid": -914061504, "ts": 1716454222770389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222840226, "dur": 88, "args": { "External id": 94695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94695, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94695, "pid": 5, "tid": 7, "ts": 1716454222840226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770395, "dur": 13, "args": { "External id": 94695, "cbid": 211, "correlation": 94695 } }, { "ph": "s", "id": 94695, "pid": 76337, "tid": -914061504, "ts": 1716454222770395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770467, "dur": 1, "args": { "External id": 94706, "cbid": 251, "correlation": 94706 } }, { "ph": "f", "id": 94706, "pid": 76337, "tid": -914061504, "ts": 1716454222770467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770471, "dur": 0, "args": { "External id": 94707, "cbid": 251, "correlation": 94707 } }, { "ph": "f", "id": 94707, "pid": 76337, "tid": -914061504, "ts": 1716454222770471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222840315, "dur": 11, "args": { "External id": 94708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94708, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 94708, "pid": 5, "tid": 7, "ts": 1716454222840315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770473, "dur": 12, "args": { "External id": 94708, "cbid": 211, "correlation": 94708 } }, { "ph": "s", "id": 94708, "pid": 76337, "tid": -914061504, "ts": 1716454222770473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222840327, "dur": 5, "args": { "External id": 94710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94710, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 94710, "pid": 5, "tid": 7, "ts": 1716454222840327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770487, "dur": 6, "args": { "External id": 94710, "cbid": 211, "correlation": 94710 } }, { "ph": "s", "id": 94710, "pid": 76337, "tid": -914061504, "ts": 1716454222770487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770545, "dur": 1, "args": { "External id": 94721, "cbid": 251, "correlation": 94721 } }, { "ph": "f", "id": 94721, "pid": 76337, "tid": -914061504, "ts": 1716454222770545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770548, "dur": 0, "args": { "External id": 94722, "cbid": 251, "correlation": 94722 } }, { "ph": "f", "id": 94722, "pid": 76337, "tid": -914061504, "ts": 1716454222770548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222840333, "dur": 7, "args": { "External id": 94723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94723, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 94723, "pid": 5, "tid": 7, "ts": 1716454222840333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770550, "dur": 11, "args": { "External id": 94723, "cbid": 211, "correlation": 94723 } }, { "ph": "s", "id": 94723, "pid": 76337, "tid": -914061504, "ts": 1716454222770550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222840341, "dur": 3, "args": { "External id": 94725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94725, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 94725, "pid": 5, "tid": 7, "ts": 1716454222840341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770563, "dur": 5, "args": { "External id": 94725, "cbid": 211, "correlation": 94725 } }, { "ph": "s", "id": 94725, "pid": 76337, "tid": -914061504, "ts": 1716454222770563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222840346, "dur": 90, "args": { "External id": 94746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94746, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 94746, "pid": 5, "tid": 7, "ts": 1716454222840346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770637, "dur": 13, "args": { "External id": 94746, "cbid": 211, "correlation": 94746 } }, { "ph": "s", "id": 94746, "pid": 76337, "tid": -914061504, "ts": 1716454222770637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222770735, "dur": 1, "args": { "External id": 94764, "cbid": 251, "correlation": 94764 } }, { "ph": "f", "id": 94764, "pid": 76337, "tid": -914061504, "ts": 1716454222770735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222840437, "dur": 96, "args": { "External id": 94766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94766, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94766, "pid": 5, "tid": 7, "ts": 1716454222840437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770741, "dur": 14, "args": { "External id": 94766, "cbid": 211, "correlation": 94766 } }, { "ph": "s", "id": 94766, "pid": 76337, "tid": -914061504, "ts": 1716454222770741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222840534, "dur": 19, "args": { "External id": 94774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94774, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94774, "pid": 5, "tid": 7, "ts": 1716454222840534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770812, "dur": 12, "args": { "External id": 94774, "cbid": 211, "correlation": 94774 } }, { "ph": "s", "id": 94774, "pid": 76337, "tid": -914061504, "ts": 1716454222770812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222840555, "dur": 38, "args": { "External id": 94782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94782, "pid": 5, "tid": 7, "ts": 1716454222840555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770853, "dur": 10, "args": { "External id": 94782, "cbid": 211, "correlation": 94782 } }, { "ph": "s", "id": 94782, "pid": 76337, "tid": -914061504, "ts": 1716454222770853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222840594, "dur": 34, "args": { "External id": 94804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94804, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94804, "pid": 5, "tid": 7, "ts": 1716454222840594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222770905, "dur": 10, "args": { "External id": 94804, "cbid": 211, "correlation": 94804 } }, { "ph": "s", "id": 94804, "pid": 76337, "tid": -914061504, "ts": 1716454222770905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222771004, "dur": 1, "args": { "External id": 94820, "cbid": 251, "correlation": 94820 } }, { "ph": "f", "id": 94820, "pid": 76337, "tid": -914061504, "ts": 1716454222771004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222771009, "dur": 0, "args": { "External id": 94822, "cbid": 251, "correlation": 94822 } }, { "ph": "f", "id": 94822, "pid": 76337, "tid": -914061504, "ts": 1716454222771009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222840630, "dur": 531, "args": { "External id": 94823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94823, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 94823, "pid": 5, "tid": 7, "ts": 1716454222840630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771014, "dur": 14, "args": { "External id": 94823, "cbid": 211, "correlation": 94823 } }, { "ph": "s", "id": 94823, "pid": 76337, "tid": -914061504, "ts": 1716454222771014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222841162, "dur": 123, "args": { "External id": 94831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94831, "pid": 5, "tid": 7, "ts": 1716454222841162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771085, "dur": 13, "args": { "External id": 94831, "cbid": 211, "correlation": 94831 } }, { "ph": "s", "id": 94831, "pid": 76337, "tid": -914061504, "ts": 1716454222771085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222841286, "dur": 129, "args": { "External id": 94839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94839, "pid": 5, "tid": 7, "ts": 1716454222841286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771119, "dur": 9, "args": { "External id": 94839, "cbid": 211, "correlation": 94839 } }, { "ph": "s", "id": 94839, "pid": 76337, "tid": -914061504, "ts": 1716454222771119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222771199, "dur": 1, "args": { "External id": 94855, "cbid": 251, "correlation": 94855 } }, { "ph": "f", "id": 94855, "pid": 76337, "tid": -914061504, "ts": 1716454222771199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222841416, "dur": 301, "args": { "External id": 94857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94857, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94857, "pid": 5, "tid": 7, "ts": 1716454222841416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771205, "dur": 12, "args": { "External id": 94857, "cbid": 211, "correlation": 94857 } }, { "ph": "s", "id": 94857, "pid": 76337, "tid": -914061504, "ts": 1716454222771205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222841719, "dur": 27, "args": { "External id": 94865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94865, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94865, "pid": 5, "tid": 7, "ts": 1716454222841719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771247, "dur": 11, "args": { "External id": 94865, "cbid": 211, "correlation": 94865 } }, { "ph": "s", "id": 94865, "pid": 76337, "tid": -914061504, "ts": 1716454222771247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222841748, "dur": 80, "args": { "External id": 94876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94876, "pid": 5, "tid": 7, "ts": 1716454222841748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771318, "dur": 12, "args": { "External id": 94876, "cbid": 211, "correlation": 94876 } }, { "ph": "s", "id": 94876, "pid": 76337, "tid": -914061504, "ts": 1716454222771318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222771385, "dur": 0, "args": { "External id": 94888, "cbid": 317, "correlation": 94888 } }, { "ph": "f", "id": 94888, "pid": 76337, "tid": -914061504, "ts": 1716454222771385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222771386, "dur": 0, "args": { "External id": 94889, "cbid": 203, "correlation": 94889 } }, { "ph": "f", "id": 94889, "pid": 76337, "tid": -914061504, "ts": 1716454222771386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222771387, "dur": 0, "args": { "External id": 94890, "cbid": 205, "correlation": 94890 } }, { "ph": "f", "id": 94890, "pid": 76337, "tid": -914061504, "ts": 1716454222771387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222841828, "dur": 22, "args": { "External id": 94894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94894, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94894, "pid": 5, "tid": 7, "ts": 1716454222841828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771403, "dur": 13, "args": { "External id": 94894, "cbid": 211, "correlation": 94894 } }, { "ph": "s", "id": 94894, "pid": 76337, "tid": -914061504, "ts": 1716454222771403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222841852, "dur": 118, "args": { "External id": 94896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94896, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94896, "pid": 5, "tid": 7, "ts": 1716454222841852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771424, "dur": 7, "args": { "External id": 94896, "cbid": 211, "correlation": 94896 } }, { "ph": "s", "id": 94896, "pid": 76337, "tid": -914061504, "ts": 1716454222771424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222841971, "dur": 24, "args": { "External id": 94898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94898, "pid": 5, "tid": 7, "ts": 1716454222841971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771435, "dur": 5, "args": { "External id": 94898, "cbid": 211, "correlation": 94898 } }, { "ph": "s", "id": 94898, "pid": 76337, "tid": -914061504, "ts": 1716454222771435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222841996, "dur": 33, "args": { "External id": 94904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94904, "pid": 5, "tid": 7, "ts": 1716454222841996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771462, "dur": 10, "args": { "External id": 94904, "cbid": 211, "correlation": 94904 } }, { "ph": "s", "id": 94904, "pid": 76337, "tid": -914061504, "ts": 1716454222771462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222842030, "dur": 27, "args": { "External id": 94912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94912, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94912, "pid": 5, "tid": 7, "ts": 1716454222842030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771494, "dur": 8, "args": { "External id": 94912, "cbid": 211, "correlation": 94912 } }, { "ph": "s", "id": 94912, "pid": 76337, "tid": -914061504, "ts": 1716454222771494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222842058, "dur": 53, "args": { "External id": 94921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94921, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94921, "pid": 5, "tid": 7, "ts": 1716454222842058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771535, "dur": 10, "args": { "External id": 94921, "cbid": 211, "correlation": 94921 } }, { "ph": "s", "id": 94921, "pid": 76337, "tid": -914061504, "ts": 1716454222771535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222842113, "dur": 51, "args": { "External id": 94941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94941, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 94941, "pid": 5, "tid": 7, "ts": 1716454222842113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771611, "dur": 11, "args": { "External id": 94941, "cbid": 211, "correlation": 94941 } }, { "ph": "s", "id": 94941, "pid": 76337, "tid": -914061504, "ts": 1716454222771611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222842165, "dur": 5, "args": { "External id": 94953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94953, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 94953, "pid": 5, "tid": 7, "ts": 1716454222842165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771632, "dur": 7, "args": { "External id": 94953, "cbid": 211, "correlation": 94953 } }, { "ph": "s", "id": 94953, "pid": 76337, "tid": -914061504, "ts": 1716454222771632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222842171, "dur": 55, "args": { "External id": 94956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94956, "pid": 5, "tid": 7, "ts": 1716454222842171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771652, "dur": 6, "args": { "External id": 94956, "cbid": 211, "correlation": 94956 } }, { "ph": "s", "id": 94956, "pid": 76337, "tid": -914061504, "ts": 1716454222771652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222842227, "dur": 37, "args": { "External id": 94965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94965, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94965, "pid": 5, "tid": 7, "ts": 1716454222842227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771691, "dur": 10, "args": { "External id": 94965, "cbid": 211, "correlation": 94965 } }, { "ph": "s", "id": 94965, "pid": 76337, "tid": -914061504, "ts": 1716454222771691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222771743, "dur": 0, "args": { "External id": 94975, "cbid": 317, "correlation": 94975 } }, { "ph": "f", "id": 94975, "pid": 76337, "tid": -914061504, "ts": 1716454222771743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222771744, "dur": 0, "args": { "External id": 94976, "cbid": 203, "correlation": 94976 } }, { "ph": "f", "id": 94976, "pid": 76337, "tid": -914061504, "ts": 1716454222771744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222771745, "dur": 0, "args": { "External id": 94977, "cbid": 205, "correlation": 94977 } }, { "ph": "f", "id": 94977, "pid": 76337, "tid": -914061504, "ts": 1716454222771745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222842265, "dur": 39, "args": { "External id": 94981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94981, "pid": 5, "tid": 7, "ts": 1716454222842265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771760, "dur": 11, "args": { "External id": 94981, "cbid": 211, "correlation": 94981 } }, { "ph": "s", "id": 94981, "pid": 76337, "tid": -914061504, "ts": 1716454222771760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222842305, "dur": 81, "args": { "External id": 94983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94983, "pid": 5, "tid": 7, "ts": 1716454222842305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771774, "dur": 6, "args": { "External id": 94983, "cbid": 211, "correlation": 94983 } }, { "ph": "s", "id": 94983, "pid": 76337, "tid": -914061504, "ts": 1716454222771774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222842388, "dur": 1263, "args": { "External id": 94985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94985, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 94985, "pid": 5, "tid": 7, "ts": 1716454222842388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771786, "dur": 7, "args": { "External id": 94985, "cbid": 211, "correlation": 94985 } }, { "ph": "s", "id": 94985, "pid": 76337, "tid": -914061504, "ts": 1716454222771786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222843652, "dur": 20, "args": { "External id": 94987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94987, "pid": 5, "tid": 7, "ts": 1716454222843652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771796, "dur": 5, "args": { "External id": 94987, "cbid": 211, "correlation": 94987 } }, { "ph": "s", "id": 94987, "pid": 76337, "tid": -914061504, "ts": 1716454222771796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222843674, "dur": 33, "args": { "External id": 94993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 94993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 94993, "pid": 5, "tid": 7, "ts": 1716454222843674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771824, "dur": 8, "args": { "External id": 94993, "cbid": 211, "correlation": 94993 } }, { "ph": "s", "id": 94993, "pid": 76337, "tid": -914061504, "ts": 1716454222771824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222843708, "dur": 3, "args": { "External id": 95001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95001, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 95001, "pid": 5, "tid": 7, "ts": 1716454222843708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771868, "dur": 10, "args": { "External id": 95001, "cbid": 211, "correlation": 95001 } }, { "ph": "s", "id": 95001, "pid": 76337, "tid": -914061504, "ts": 1716454222771868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222771934, "dur": 1, "args": { "External id": 95017, "cbid": 251, "correlation": 95017 } }, { "ph": "f", "id": 95017, "pid": 76337, "tid": -914061504, "ts": 1716454222771934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222771940, "dur": 0, "args": { "External id": 95019, "cbid": 251, "correlation": 95019 } }, { "ph": "f", "id": 95019, "pid": 76337, "tid": -914061504, "ts": 1716454222771940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222843713, "dur": 12, "args": { "External id": 95020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95020, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 95020, "pid": 5, "tid": 7, "ts": 1716454222843713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771941, "dur": 12, "args": { "External id": 95020, "cbid": 211, "correlation": 95020 } }, { "ph": "s", "id": 95020, "pid": 76337, "tid": -914061504, "ts": 1716454222771941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222843726, "dur": 5, "args": { "External id": 95022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95022, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 95022, "pid": 5, "tid": 7, "ts": 1716454222843726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222771956, "dur": 6, "args": { "External id": 95022, "cbid": 211, "correlation": 95022 } }, { "ph": "s", "id": 95022, "pid": 76337, "tid": -914061504, "ts": 1716454222771956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222843732, "dur": 29, "args": { "External id": 95032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95032, "pid": 5, "tid": 7, "ts": 1716454222843732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772024, "dur": 13, "args": { "External id": 95032, "cbid": 211, "correlation": 95032 } }, { "ph": "s", "id": 95032, "pid": 76337, "tid": -914061504, "ts": 1716454222772024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222843762, "dur": 32, "args": { "External id": 95052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95052, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 95052, "pid": 5, "tid": 7, "ts": 1716454222843762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772091, "dur": 11, "args": { "External id": 95052, "cbid": 211, "correlation": 95052 } }, { "ph": "s", "id": 95052, "pid": 76337, "tid": -914061504, "ts": 1716454222772091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222843795, "dur": 4, "args": { "External id": 95064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95064, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 95064, "pid": 5, "tid": 7, "ts": 1716454222843795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772112, "dur": 8, "args": { "External id": 95064, "cbid": 211, "correlation": 95064 } }, { "ph": "s", "id": 95064, "pid": 76337, "tid": -914061504, "ts": 1716454222772112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222843800, "dur": 29, "args": { "External id": 95067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95067, "pid": 5, "tid": 7, "ts": 1716454222843800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772131, "dur": 6, "args": { "External id": 95067, "cbid": 211, "correlation": 95067 } }, { "ph": "s", "id": 95067, "pid": 76337, "tid": -914061504, "ts": 1716454222772131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222843831, "dur": 21, "args": { "External id": 95076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95076, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95076, "pid": 5, "tid": 7, "ts": 1716454222843831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772172, "dur": 10, "args": { "External id": 95076, "cbid": 211, "correlation": 95076 } }, { "ph": "s", "id": 95076, "pid": 76337, "tid": -914061504, "ts": 1716454222772172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222772236, "dur": 0, "args": { "External id": 95086, "cbid": 317, "correlation": 95086 } }, { "ph": "f", "id": 95086, "pid": 76337, "tid": -914061504, "ts": 1716454222772236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222772237, "dur": 0, "args": { "External id": 95087, "cbid": 203, "correlation": 95087 } }, { "ph": "f", "id": 95087, "pid": 76337, "tid": -914061504, "ts": 1716454222772237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222772238, "dur": 0, "args": { "External id": 95088, "cbid": 205, "correlation": 95088 } }, { "ph": "f", "id": 95088, "pid": 76337, "tid": -914061504, "ts": 1716454222772238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222843853, "dur": 22, "args": { "External id": 95092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95092, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95092, "pid": 5, "tid": 7, "ts": 1716454222843853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772252, "dur": 12, "args": { "External id": 95092, "cbid": 211, "correlation": 95092 } }, { "ph": "s", "id": 95092, "pid": 76337, "tid": -914061504, "ts": 1716454222772252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222843876, "dur": 43, "args": { "External id": 95094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95094, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95094, "pid": 5, "tid": 7, "ts": 1716454222843876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772267, "dur": 5, "args": { "External id": 95094, "cbid": 211, "correlation": 95094 } }, { "ph": "s", "id": 95094, "pid": 76337, "tid": -914061504, "ts": 1716454222772267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222843920, "dur": 637, "args": { "External id": 95096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95096, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95096, "pid": 5, "tid": 7, "ts": 1716454222843920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772279, "dur": 6, "args": { "External id": 95096, "cbid": 211, "correlation": 95096 } }, { "ph": "s", "id": 95096, "pid": 76337, "tid": -914061504, "ts": 1716454222772279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222844558, "dur": 21, "args": { "External id": 95098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95098, "pid": 5, "tid": 7, "ts": 1716454222844558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772289, "dur": 5, "args": { "External id": 95098, "cbid": 211, "correlation": 95098 } }, { "ph": "s", "id": 95098, "pid": 76337, "tid": -914061504, "ts": 1716454222772289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222844581, "dur": 32, "args": { "External id": 95104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95104, "pid": 5, "tid": 7, "ts": 1716454222844581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772316, "dur": 10, "args": { "External id": 95104, "cbid": 211, "correlation": 95104 } }, { "ph": "s", "id": 95104, "pid": 76337, "tid": -914061504, "ts": 1716454222772316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222772375, "dur": 0, "args": { "External id": 95114, "cbid": 317, "correlation": 95114 } }, { "ph": "f", "id": 95114, "pid": 76337, "tid": -914061504, "ts": 1716454222772375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222772376, "dur": 0, "args": { "External id": 95115, "cbid": 203, "correlation": 95115 } }, { "ph": "f", "id": 95115, "pid": 76337, "tid": -914061504, "ts": 1716454222772376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222772377, "dur": 0, "args": { "External id": 95116, "cbid": 205, "correlation": 95116 } }, { "ph": "f", "id": 95116, "pid": 76337, "tid": -914061504, "ts": 1716454222772377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222844614, "dur": 38, "args": { "External id": 95120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95120, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95120, "pid": 5, "tid": 7, "ts": 1716454222844614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772390, "dur": 12, "args": { "External id": 95120, "cbid": 211, "correlation": 95120 } }, { "ph": "s", "id": 95120, "pid": 76337, "tid": -914061504, "ts": 1716454222772390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222844654, "dur": 188, "args": { "External id": 95122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95122, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95122, "pid": 5, "tid": 7, "ts": 1716454222844654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772407, "dur": 6, "args": { "External id": 95122, "cbid": 211, "correlation": 95122 } }, { "ph": "s", "id": 95122, "pid": 76337, "tid": -914061504, "ts": 1716454222772407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222844843, "dur": 22, "args": { "External id": 95124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95124, "pid": 5, "tid": 7, "ts": 1716454222844843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772417, "dur": 5, "args": { "External id": 95124, "cbid": 211, "correlation": 95124 } }, { "ph": "s", "id": 95124, "pid": 76337, "tid": -914061504, "ts": 1716454222772417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222844866, "dur": 32, "args": { "External id": 95130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95130, "pid": 5, "tid": 7, "ts": 1716454222844866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772443, "dur": 8, "args": { "External id": 95130, "cbid": 211, "correlation": 95130 } }, { "ph": "s", "id": 95130, "pid": 76337, "tid": -914061504, "ts": 1716454222772443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222844900, "dur": 28, "args": { "External id": 95138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95138, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95138, "pid": 5, "tid": 7, "ts": 1716454222844900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772472, "dur": 8, "args": { "External id": 95138, "cbid": 211, "correlation": 95138 } }, { "ph": "s", "id": 95138, "pid": 76337, "tid": -914061504, "ts": 1716454222772472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222844929, "dur": 20, "args": { "External id": 95146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95146, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95146, "pid": 5, "tid": 7, "ts": 1716454222844929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772501, "dur": 8, "args": { "External id": 95146, "cbid": 211, "correlation": 95146 } }, { "ph": "s", "id": 95146, "pid": 76337, "tid": -914061504, "ts": 1716454222772501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222844950, "dur": 30, "args": { "External id": 95166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95166, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 95166, "pid": 5, "tid": 7, "ts": 1716454222844950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772584, "dur": 12, "args": { "External id": 95166, "cbid": 211, "correlation": 95166 } }, { "ph": "s", "id": 95166, "pid": 76337, "tid": -914061504, "ts": 1716454222772584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222844981, "dur": 4, "args": { "External id": 95178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95178, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 95178, "pid": 5, "tid": 7, "ts": 1716454222844981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772605, "dur": 7, "args": { "External id": 95178, "cbid": 211, "correlation": 95178 } }, { "ph": "s", "id": 95178, "pid": 76337, "tid": -914061504, "ts": 1716454222772605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222844987, "dur": 29, "args": { "External id": 95181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95181, "pid": 5, "tid": 7, "ts": 1716454222844987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772624, "dur": 7, "args": { "External id": 95181, "cbid": 211, "correlation": 95181 } }, { "ph": "s", "id": 95181, "pid": 76337, "tid": -914061504, "ts": 1716454222772624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222772681, "dur": 0, "args": { "External id": 95192, "cbid": 317, "correlation": 95192 } }, { "ph": "f", "id": 95192, "pid": 76337, "tid": -914061504, "ts": 1716454222772681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222772682, "dur": 0, "args": { "External id": 95193, "cbid": 203, "correlation": 95193 } }, { "ph": "f", "id": 95193, "pid": 76337, "tid": -914061504, "ts": 1716454222772682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222772682, "dur": 0, "args": { "External id": 95194, "cbid": 205, "correlation": 95194 } }, { "ph": "f", "id": 95194, "pid": 76337, "tid": -914061504, "ts": 1716454222772682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222845018, "dur": 23, "args": { "External id": 95198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95198, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95198, "pid": 5, "tid": 7, "ts": 1716454222845018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772696, "dur": 11, "args": { "External id": 95198, "cbid": 211, "correlation": 95198 } }, { "ph": "s", "id": 95198, "pid": 76337, "tid": -914061504, "ts": 1716454222772696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222845042, "dur": 103, "args": { "External id": 95200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95200, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95200, "pid": 5, "tid": 7, "ts": 1716454222845042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772714, "dur": 6, "args": { "External id": 95200, "cbid": 211, "correlation": 95200 } }, { "ph": "s", "id": 95200, "pid": 76337, "tid": -914061504, "ts": 1716454222772714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222845146, "dur": 21, "args": { "External id": 95202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95202, "pid": 5, "tid": 7, "ts": 1716454222845146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772724, "dur": 6, "args": { "External id": 95202, "cbid": 211, "correlation": 95202 } }, { "ph": "s", "id": 95202, "pid": 76337, "tid": -914061504, "ts": 1716454222772724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222845169, "dur": 32, "args": { "External id": 95208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95208, "pid": 5, "tid": 7, "ts": 1716454222845169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772751, "dur": 9, "args": { "External id": 95208, "cbid": 211, "correlation": 95208 } }, { "ph": "s", "id": 95208, "pid": 76337, "tid": -914061504, "ts": 1716454222772751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222845202, "dur": 198, "args": { "External id": 95217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95217, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95217, "pid": 5, "tid": 7, "ts": 1716454222845202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772834, "dur": 15, "args": { "External id": 95217, "cbid": 211, "correlation": 95217 } }, { "ph": "s", "id": 95217, "pid": 76337, "tid": -914061504, "ts": 1716454222772834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222845401, "dur": 64, "args": { "External id": 95239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95239, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95239, "pid": 5, "tid": 7, "ts": 1716454222845401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772892, "dur": 11, "args": { "External id": 95239, "cbid": 211, "correlation": 95239 } }, { "ph": "s", "id": 95239, "pid": 76337, "tid": -914061504, "ts": 1716454222772892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222772993, "dur": 1, "args": { "External id": 95250, "cbid": 251, "correlation": 95250 } }, { "ph": "f", "id": 95250, "pid": 76337, "tid": -914061504, "ts": 1716454222772993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222845466, "dur": 150, "args": { "External id": 95251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95251, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95251, "pid": 5, "tid": 7, "ts": 1716454222845466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222772999, "dur": 13, "args": { "External id": 95251, "cbid": 211, "correlation": 95251 } }, { "ph": "s", "id": 95251, "pid": 76337, "tid": -914061504, "ts": 1716454222772999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773071, "dur": 1, "args": { "External id": 95262, "cbid": 251, "correlation": 95262 } }, { "ph": "f", "id": 95262, "pid": 76337, "tid": -914061504, "ts": 1716454222773071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222845618, "dur": 143, "args": { "External id": 95263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95263, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95263, "pid": 5, "tid": 7, "ts": 1716454222845618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773075, "dur": 12, "args": { "External id": 95263, "cbid": 211, "correlation": 95263 } }, { "ph": "s", "id": 95263, "pid": 76337, "tid": -914061504, "ts": 1716454222773075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773140, "dur": 1, "args": { "External id": 95274, "cbid": 251, "correlation": 95274 } }, { "ph": "f", "id": 95274, "pid": 76337, "tid": -914061504, "ts": 1716454222773140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222845763, "dur": 144, "args": { "External id": 95275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95275, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95275, "pid": 5, "tid": 7, "ts": 1716454222845763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773144, "dur": 11, "args": { "External id": 95275, "cbid": 211, "correlation": 95275 } }, { "ph": "s", "id": 95275, "pid": 76337, "tid": -914061504, "ts": 1716454222773144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222845908, "dur": 1903, "args": { "External id": 95296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95296, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 95296, "pid": 5, "tid": 7, "ts": 1716454222845908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773226, "dur": 13, "args": { "External id": 95296, "cbid": 211, "correlation": 95296 } }, { "ph": "s", "id": 95296, "pid": 76337, "tid": -914061504, "ts": 1716454222773226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773329, "dur": 1, "args": { "External id": 95314, "cbid": 251, "correlation": 95314 } }, { "ph": "f", "id": 95314, "pid": 76337, "tid": -914061504, "ts": 1716454222773329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222847813, "dur": 147, "args": { "External id": 95316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95316, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 95316, "pid": 5, "tid": 7, "ts": 1716454222847813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773335, "dur": 13, "args": { "External id": 95316, "cbid": 211, "correlation": 95316 } }, { "ph": "s", "id": 95316, "pid": 76337, "tid": -914061504, "ts": 1716454222773335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222847961, "dur": 35, "args": { "External id": 95324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95324, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95324, "pid": 5, "tid": 7, "ts": 1716454222847961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773406, "dur": 14, "args": { "External id": 95324, "cbid": 211, "correlation": 95324 } }, { "ph": "s", "id": 95324, "pid": 76337, "tid": -914061504, "ts": 1716454222773406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222847997, "dur": 51, "args": { "External id": 95332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95332, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95332, "pid": 5, "tid": 7, "ts": 1716454222847997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773447, "dur": 8, "args": { "External id": 95332, "cbid": 211, "correlation": 95332 } }, { "ph": "s", "id": 95332, "pid": 76337, "tid": -914061504, "ts": 1716454222773447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222848049, "dur": 29, "args": { "External id": 95343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95343, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95343, "pid": 5, "tid": 7, "ts": 1716454222848049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773521, "dur": 13, "args": { "External id": 95343, "cbid": 211, "correlation": 95343 } }, { "ph": "s", "id": 95343, "pid": 76337, "tid": -914061504, "ts": 1716454222773521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222848079, "dur": 34, "args": { "External id": 95365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95365, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95365, "pid": 5, "tid": 7, "ts": 1716454222848079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773553, "dur": 7, "args": { "External id": 95365, "cbid": 211, "correlation": 95365 } }, { "ph": "s", "id": 95365, "pid": 76337, "tid": -914061504, "ts": 1716454222773553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773640, "dur": 1, "args": { "External id": 95376, "cbid": 251, "correlation": 95376 } }, { "ph": "f", "id": 95376, "pid": 76337, "tid": -914061504, "ts": 1716454222773640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222848115, "dur": 87, "args": { "External id": 95377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95377, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95377, "pid": 5, "tid": 7, "ts": 1716454222848115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773645, "dur": 13, "args": { "External id": 95377, "cbid": 211, "correlation": 95377 } }, { "ph": "s", "id": 95377, "pid": 76337, "tid": -914061504, "ts": 1716454222773645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773717, "dur": 1, "args": { "External id": 95388, "cbid": 251, "correlation": 95388 } }, { "ph": "f", "id": 95388, "pid": 76337, "tid": -914061504, "ts": 1716454222773717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773720, "dur": 0, "args": { "External id": 95389, "cbid": 251, "correlation": 95389 } }, { "ph": "f", "id": 95389, "pid": 76337, "tid": -914061504, "ts": 1716454222773720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222848204, "dur": 11, "args": { "External id": 95390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95390, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 95390, "pid": 5, "tid": 7, "ts": 1716454222848204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773722, "dur": 12, "args": { "External id": 95390, "cbid": 211, "correlation": 95390 } }, { "ph": "s", "id": 95390, "pid": 76337, "tid": -914061504, "ts": 1716454222773722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222848216, "dur": 5, "args": { "External id": 95392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95392, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 95392, "pid": 5, "tid": 7, "ts": 1716454222848216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773737, "dur": 6, "args": { "External id": 95392, "cbid": 211, "correlation": 95392 } }, { "ph": "s", "id": 95392, "pid": 76337, "tid": -914061504, "ts": 1716454222773737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773794, "dur": 1, "args": { "External id": 95403, "cbid": 251, "correlation": 95403 } }, { "ph": "f", "id": 95403, "pid": 76337, "tid": -914061504, "ts": 1716454222773794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773797, "dur": 0, "args": { "External id": 95404, "cbid": 251, "correlation": 95404 } }, { "ph": "f", "id": 95404, "pid": 76337, "tid": -914061504, "ts": 1716454222773797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222848222, "dur": 7, "args": { "External id": 95405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95405, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 95405, "pid": 5, "tid": 7, "ts": 1716454222848222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773799, "dur": 12, "args": { "External id": 95405, "cbid": 211, "correlation": 95405 } }, { "ph": "s", "id": 95405, "pid": 76337, "tid": -914061504, "ts": 1716454222773799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222848231, "dur": 3, "args": { "External id": 95407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95407, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 95407, "pid": 5, "tid": 7, "ts": 1716454222848231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773813, "dur": 6, "args": { "External id": 95407, "cbid": 211, "correlation": 95407 } }, { "ph": "s", "id": 95407, "pid": 76337, "tid": -914061504, "ts": 1716454222773813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222848235, "dur": 89, "args": { "External id": 95428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95428, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 95428, "pid": 5, "tid": 7, "ts": 1716454222848235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222773887, "dur": 12, "args": { "External id": 95428, "cbid": 211, "correlation": 95428 } }, { "ph": "s", "id": 95428, "pid": 76337, "tid": -914061504, "ts": 1716454222773887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222773994, "dur": 1, "args": { "External id": 95446, "cbid": 251, "correlation": 95446 } }, { "ph": "f", "id": 95446, "pid": 76337, "tid": -914061504, "ts": 1716454222773994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222848326, "dur": 98, "args": { "External id": 95448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95448, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95448, "pid": 5, "tid": 7, "ts": 1716454222848326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774000, "dur": 14, "args": { "External id": 95448, "cbid": 211, "correlation": 95448 } }, { "ph": "s", "id": 95448, "pid": 76337, "tid": -914061504, "ts": 1716454222774000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222848425, "dur": 19, "args": { "External id": 95456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95456, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95456, "pid": 5, "tid": 7, "ts": 1716454222848425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774070, "dur": 12, "args": { "External id": 95456, "cbid": 211, "correlation": 95456 } }, { "ph": "s", "id": 95456, "pid": 76337, "tid": -914061504, "ts": 1716454222774070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222848446, "dur": 38, "args": { "External id": 95464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95464, "pid": 5, "tid": 7, "ts": 1716454222848446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774112, "dur": 9, "args": { "External id": 95464, "cbid": 211, "correlation": 95464 } }, { "ph": "s", "id": 95464, "pid": 76337, "tid": -914061504, "ts": 1716454222774112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222848484, "dur": 34, "args": { "External id": 95486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95486, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95486, "pid": 5, "tid": 7, "ts": 1716454222848484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774163, "dur": 10, "args": { "External id": 95486, "cbid": 211, "correlation": 95486 } }, { "ph": "s", "id": 95486, "pid": 76337, "tid": -914061504, "ts": 1716454222774163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222774253, "dur": 1, "args": { "External id": 95502, "cbid": 251, "correlation": 95502 } }, { "ph": "f", "id": 95502, "pid": 76337, "tid": -914061504, "ts": 1716454222774253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222774258, "dur": 0, "args": { "External id": 95504, "cbid": 251, "correlation": 95504 } }, { "ph": "f", "id": 95504, "pid": 76337, "tid": -914061504, "ts": 1716454222774258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222848520, "dur": 531, "args": { "External id": 95505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95505, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 95505, "pid": 5, "tid": 7, "ts": 1716454222848520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774262, "dur": 13, "args": { "External id": 95505, "cbid": 211, "correlation": 95505 } }, { "ph": "s", "id": 95505, "pid": 76337, "tid": -914061504, "ts": 1716454222774262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222849053, "dur": 123, "args": { "External id": 95513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95513, "pid": 5, "tid": 7, "ts": 1716454222849053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774331, "dur": 13, "args": { "External id": 95513, "cbid": 211, "correlation": 95513 } }, { "ph": "s", "id": 95513, "pid": 76337, "tid": -914061504, "ts": 1716454222774331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222849177, "dur": 130, "args": { "External id": 95521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95521, "pid": 5, "tid": 7, "ts": 1716454222849177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774363, "dur": 8, "args": { "External id": 95521, "cbid": 211, "correlation": 95521 } }, { "ph": "s", "id": 95521, "pid": 76337, "tid": -914061504, "ts": 1716454222774363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222774440, "dur": 2, "args": { "External id": 95537, "cbid": 251, "correlation": 95537 } }, { "ph": "f", "id": 95537, "pid": 76337, "tid": -914061504, "ts": 1716454222774440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222849309, "dur": 305, "args": { "External id": 95539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95539, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95539, "pid": 5, "tid": 7, "ts": 1716454222849309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774446, "dur": 12, "args": { "External id": 95539, "cbid": 211, "correlation": 95539 } }, { "ph": "s", "id": 95539, "pid": 76337, "tid": -914061504, "ts": 1716454222774446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222849615, "dur": 27, "args": { "External id": 95547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95547, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95547, "pid": 5, "tid": 7, "ts": 1716454222849615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774487, "dur": 10, "args": { "External id": 95547, "cbid": 211, "correlation": 95547 } }, { "ph": "s", "id": 95547, "pid": 76337, "tid": -914061504, "ts": 1716454222774487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222849643, "dur": 80, "args": { "External id": 95558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95558, "pid": 5, "tid": 7, "ts": 1716454222849643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774557, "dur": 12, "args": { "External id": 95558, "cbid": 211, "correlation": 95558 } }, { "ph": "s", "id": 95558, "pid": 76337, "tid": -914061504, "ts": 1716454222774557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222774625, "dur": 0, "args": { "External id": 95570, "cbid": 317, "correlation": 95570 } }, { "ph": "f", "id": 95570, "pid": 76337, "tid": -914061504, "ts": 1716454222774625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222774626, "dur": 0, "args": { "External id": 95571, "cbid": 203, "correlation": 95571 } }, { "ph": "f", "id": 95571, "pid": 76337, "tid": -914061504, "ts": 1716454222774626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222774627, "dur": 0, "args": { "External id": 95572, "cbid": 205, "correlation": 95572 } }, { "ph": "f", "id": 95572, "pid": 76337, "tid": -914061504, "ts": 1716454222774627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222849725, "dur": 22, "args": { "External id": 95576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95576, "pid": 5, "tid": 7, "ts": 1716454222849725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774644, "dur": 12, "args": { "External id": 95576, "cbid": 211, "correlation": 95576 } }, { "ph": "s", "id": 95576, "pid": 76337, "tid": -914061504, "ts": 1716454222774644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222849748, "dur": 117, "args": { "External id": 95578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95578, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95578, "pid": 5, "tid": 7, "ts": 1716454222849748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774663, "dur": 8, "args": { "External id": 95578, "cbid": 211, "correlation": 95578 } }, { "ph": "s", "id": 95578, "pid": 76337, "tid": -914061504, "ts": 1716454222774663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222849867, "dur": 23, "args": { "External id": 95580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95580, "pid": 5, "tid": 7, "ts": 1716454222849867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774675, "dur": 5, "args": { "External id": 95580, "cbid": 211, "correlation": 95580 } }, { "ph": "s", "id": 95580, "pid": 76337, "tid": -914061504, "ts": 1716454222774675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222849891, "dur": 33, "args": { "External id": 95586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95586, "pid": 5, "tid": 7, "ts": 1716454222849891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774702, "dur": 8, "args": { "External id": 95586, "cbid": 211, "correlation": 95586 } }, { "ph": "s", "id": 95586, "pid": 76337, "tid": -914061504, "ts": 1716454222774702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222849925, "dur": 26, "args": { "External id": 95594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95594, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95594, "pid": 5, "tid": 7, "ts": 1716454222849925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774734, "dur": 8, "args": { "External id": 95594, "cbid": 211, "correlation": 95594 } }, { "ph": "s", "id": 95594, "pid": 76337, "tid": -914061504, "ts": 1716454222774734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222849953, "dur": 45, "args": { "External id": 95603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95603, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95603, "pid": 5, "tid": 7, "ts": 1716454222849953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774772, "dur": 12, "args": { "External id": 95603, "cbid": 211, "correlation": 95603 } }, { "ph": "s", "id": 95603, "pid": 76337, "tid": -914061504, "ts": 1716454222774772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222849999, "dur": 42, "args": { "External id": 95623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95623, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 95623, "pid": 5, "tid": 7, "ts": 1716454222849999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774849, "dur": 11, "args": { "External id": 95623, "cbid": 211, "correlation": 95623 } }, { "ph": "s", "id": 95623, "pid": 76337, "tid": -914061504, "ts": 1716454222774849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222850043, "dur": 5, "args": { "External id": 95635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95635, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 95635, "pid": 5, "tid": 7, "ts": 1716454222850043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774870, "dur": 6, "args": { "External id": 95635, "cbid": 211, "correlation": 95635 } }, { "ph": "s", "id": 95635, "pid": 76337, "tid": -914061504, "ts": 1716454222774870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222850049, "dur": 43, "args": { "External id": 95638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95638, "pid": 5, "tid": 7, "ts": 1716454222850049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774889, "dur": 8, "args": { "External id": 95638, "cbid": 211, "correlation": 95638 } }, { "ph": "s", "id": 95638, "pid": 76337, "tid": -914061504, "ts": 1716454222774889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222850094, "dur": 29, "args": { "External id": 95647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95647, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95647, "pid": 5, "tid": 7, "ts": 1716454222850094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222774930, "dur": 10, "args": { "External id": 95647, "cbid": 211, "correlation": 95647 } }, { "ph": "s", "id": 95647, "pid": 76337, "tid": -914061504, "ts": 1716454222774930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222774990, "dur": 0, "args": { "External id": 95657, "cbid": 317, "correlation": 95657 } }, { "ph": "f", "id": 95657, "pid": 76337, "tid": -914061504, "ts": 1716454222774990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222774991, "dur": 0, "args": { "External id": 95658, "cbid": 203, "correlation": 95658 } }, { "ph": "f", "id": 95658, "pid": 76337, "tid": -914061504, "ts": 1716454222774991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222774992, "dur": 0, "args": { "External id": 95659, "cbid": 205, "correlation": 95659 } }, { "ph": "f", "id": 95659, "pid": 76337, "tid": -914061504, "ts": 1716454222774992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222850124, "dur": 31, "args": { "External id": 95663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95663, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95663, "pid": 5, "tid": 7, "ts": 1716454222850124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775008, "dur": 12, "args": { "External id": 95663, "cbid": 211, "correlation": 95663 } }, { "ph": "s", "id": 95663, "pid": 76337, "tid": -914061504, "ts": 1716454222775008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222850157, "dur": 63, "args": { "External id": 95665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95665, "pid": 5, "tid": 7, "ts": 1716454222850157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775022, "dur": 5, "args": { "External id": 95665, "cbid": 211, "correlation": 95665 } }, { "ph": "s", "id": 95665, "pid": 76337, "tid": -914061504, "ts": 1716454222775022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222850221, "dur": 955, "args": { "External id": 95667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95667, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95667, "pid": 5, "tid": 7, "ts": 1716454222850221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775033, "dur": 6, "args": { "External id": 95667, "cbid": 211, "correlation": 95667 } }, { "ph": "s", "id": 95667, "pid": 76337, "tid": -914061504, "ts": 1716454222775033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222851177, "dur": 21, "args": { "External id": 95669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95669, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95669, "pid": 5, "tid": 7, "ts": 1716454222851177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775043, "dur": 5, "args": { "External id": 95669, "cbid": 211, "correlation": 95669 } }, { "ph": "s", "id": 95669, "pid": 76337, "tid": -914061504, "ts": 1716454222775043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222851200, "dur": 32, "args": { "External id": 95675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95675, "pid": 5, "tid": 7, "ts": 1716454222851200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775072, "dur": 8, "args": { "External id": 95675, "cbid": 211, "correlation": 95675 } }, { "ph": "s", "id": 95675, "pid": 76337, "tid": -914061504, "ts": 1716454222775072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222851234, "dur": 3, "args": { "External id": 95683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95683, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 95683, "pid": 5, "tid": 7, "ts": 1716454222851234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775116, "dur": 9, "args": { "External id": 95683, "cbid": 211, "correlation": 95683 } }, { "ph": "s", "id": 95683, "pid": 76337, "tid": -914061504, "ts": 1716454222775116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222775184, "dur": 1, "args": { "External id": 95699, "cbid": 251, "correlation": 95699 } }, { "ph": "f", "id": 95699, "pid": 76337, "tid": -914061504, "ts": 1716454222775184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222775190, "dur": 0, "args": { "External id": 95701, "cbid": 251, "correlation": 95701 } }, { "ph": "f", "id": 95701, "pid": 76337, "tid": -914061504, "ts": 1716454222775190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222851238, "dur": 12, "args": { "External id": 95702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95702, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 95702, "pid": 5, "tid": 7, "ts": 1716454222851238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775192, "dur": 12, "args": { "External id": 95702, "cbid": 211, "correlation": 95702 } }, { "ph": "s", "id": 95702, "pid": 76337, "tid": -914061504, "ts": 1716454222775192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222851252, "dur": 5, "args": { "External id": 95704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95704, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 95704, "pid": 5, "tid": 7, "ts": 1716454222851252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775205, "dur": 6, "args": { "External id": 95704, "cbid": 211, "correlation": 95704 } }, { "ph": "s", "id": 95704, "pid": 76337, "tid": -914061504, "ts": 1716454222775205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222851258, "dur": 30, "args": { "External id": 95714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95714, "pid": 5, "tid": 7, "ts": 1716454222851258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775264, "dur": 13, "args": { "External id": 95714, "cbid": 211, "correlation": 95714 } }, { "ph": "s", "id": 95714, "pid": 76337, "tid": -914061504, "ts": 1716454222775264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222851289, "dur": 31, "args": { "External id": 95734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95734, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 95734, "pid": 5, "tid": 7, "ts": 1716454222851289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775331, "dur": 11, "args": { "External id": 95734, "cbid": 211, "correlation": 95734 } }, { "ph": "s", "id": 95734, "pid": 76337, "tid": -914061504, "ts": 1716454222775331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222851321, "dur": 4, "args": { "External id": 95746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95746, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 95746, "pid": 5, "tid": 7, "ts": 1716454222851321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775351, "dur": 6, "args": { "External id": 95746, "cbid": 211, "correlation": 95746 } }, { "ph": "s", "id": 95746, "pid": 76337, "tid": -914061504, "ts": 1716454222775351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222851326, "dur": 29, "args": { "External id": 95749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95749, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95749, "pid": 5, "tid": 7, "ts": 1716454222851326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775370, "dur": 6, "args": { "External id": 95749, "cbid": 211, "correlation": 95749 } }, { "ph": "s", "id": 95749, "pid": 76337, "tid": -914061504, "ts": 1716454222775370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222851357, "dur": 20, "args": { "External id": 95758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95758, "pid": 5, "tid": 7, "ts": 1716454222851357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775411, "dur": 10, "args": { "External id": 95758, "cbid": 211, "correlation": 95758 } }, { "ph": "s", "id": 95758, "pid": 76337, "tid": -914061504, "ts": 1716454222775411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222775475, "dur": 0, "args": { "External id": 95768, "cbid": 317, "correlation": 95768 } }, { "ph": "f", "id": 95768, "pid": 76337, "tid": -914061504, "ts": 1716454222775475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222775476, "dur": 0, "args": { "External id": 95769, "cbid": 203, "correlation": 95769 } }, { "ph": "f", "id": 95769, "pid": 76337, "tid": -914061504, "ts": 1716454222775476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222775476, "dur": 0, "args": { "External id": 95770, "cbid": 205, "correlation": 95770 } }, { "ph": "f", "id": 95770, "pid": 76337, "tid": -914061504, "ts": 1716454222775476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222851378, "dur": 24, "args": { "External id": 95774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95774, "pid": 5, "tid": 7, "ts": 1716454222851378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775491, "dur": 13, "args": { "External id": 95774, "cbid": 211, "correlation": 95774 } }, { "ph": "s", "id": 95774, "pid": 76337, "tid": -914061504, "ts": 1716454222775491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222851403, "dur": 43, "args": { "External id": 95776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95776, "pid": 5, "tid": 7, "ts": 1716454222851403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775506, "dur": 5, "args": { "External id": 95776, "cbid": 211, "correlation": 95776 } }, { "ph": "s", "id": 95776, "pid": 76337, "tid": -914061504, "ts": 1716454222775506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222851447, "dur": 637, "args": { "External id": 95778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95778, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95778, "pid": 5, "tid": 7, "ts": 1716454222851447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775518, "dur": 6, "args": { "External id": 95778, "cbid": 211, "correlation": 95778 } }, { "ph": "s", "id": 95778, "pid": 76337, "tid": -914061504, "ts": 1716454222775518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222852085, "dur": 21, "args": { "External id": 95780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95780, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95780, "pid": 5, "tid": 7, "ts": 1716454222852085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775527, "dur": 5, "args": { "External id": 95780, "cbid": 211, "correlation": 95780 } }, { "ph": "s", "id": 95780, "pid": 76337, "tid": -914061504, "ts": 1716454222775527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222852108, "dur": 33, "args": { "External id": 95786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95786, "pid": 5, "tid": 7, "ts": 1716454222852108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775555, "dur": 9, "args": { "External id": 95786, "cbid": 211, "correlation": 95786 } }, { "ph": "s", "id": 95786, "pid": 76337, "tid": -914061504, "ts": 1716454222775555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222775612, "dur": 0, "args": { "External id": 95796, "cbid": 317, "correlation": 95796 } }, { "ph": "f", "id": 95796, "pid": 76337, "tid": -914061504, "ts": 1716454222775612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222775612, "dur": 0, "args": { "External id": 95797, "cbid": 203, "correlation": 95797 } }, { "ph": "f", "id": 95797, "pid": 76337, "tid": -914061504, "ts": 1716454222775612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222775613, "dur": 0, "args": { "External id": 95798, "cbid": 205, "correlation": 95798 } }, { "ph": "f", "id": 95798, "pid": 76337, "tid": -914061504, "ts": 1716454222775613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222852142, "dur": 29, "args": { "External id": 95802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95802, "pid": 5, "tid": 7, "ts": 1716454222852142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775628, "dur": 12, "args": { "External id": 95802, "cbid": 211, "correlation": 95802 } }, { "ph": "s", "id": 95802, "pid": 76337, "tid": -914061504, "ts": 1716454222775628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222852173, "dur": 150, "args": { "External id": 95804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95804, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95804, "pid": 5, "tid": 7, "ts": 1716454222852173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775646, "dur": 6, "args": { "External id": 95804, "cbid": 211, "correlation": 95804 } }, { "ph": "s", "id": 95804, "pid": 76337, "tid": -914061504, "ts": 1716454222775646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222852324, "dur": 22, "args": { "External id": 95806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95806, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95806, "pid": 5, "tid": 7, "ts": 1716454222852324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775655, "dur": 5, "args": { "External id": 95806, "cbid": 211, "correlation": 95806 } }, { "ph": "s", "id": 95806, "pid": 76337, "tid": -914061504, "ts": 1716454222775655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222852348, "dur": 32, "args": { "External id": 95812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95812, "pid": 5, "tid": 7, "ts": 1716454222852348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775681, "dur": 8, "args": { "External id": 95812, "cbid": 211, "correlation": 95812 } }, { "ph": "s", "id": 95812, "pid": 76337, "tid": -914061504, "ts": 1716454222775681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222852382, "dur": 27, "args": { "External id": 95820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95820, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95820, "pid": 5, "tid": 7, "ts": 1716454222852382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775709, "dur": 7, "args": { "External id": 95820, "cbid": 211, "correlation": 95820 } }, { "ph": "s", "id": 95820, "pid": 76337, "tid": -914061504, "ts": 1716454222775709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222852410, "dur": 19, "args": { "External id": 95828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95828, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95828, "pid": 5, "tid": 7, "ts": 1716454222852410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775738, "dur": 8, "args": { "External id": 95828, "cbid": 211, "correlation": 95828 } }, { "ph": "s", "id": 95828, "pid": 76337, "tid": -914061504, "ts": 1716454222775738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222852430, "dur": 30, "args": { "External id": 95848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95848, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 95848, "pid": 5, "tid": 7, "ts": 1716454222852430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775820, "dur": 13, "args": { "External id": 95848, "cbid": 211, "correlation": 95848 } }, { "ph": "s", "id": 95848, "pid": 76337, "tid": -914061504, "ts": 1716454222775820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222852462, "dur": 4, "args": { "External id": 95860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95860, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 95860, "pid": 5, "tid": 7, "ts": 1716454222852462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775843, "dur": 6, "args": { "External id": 95860, "cbid": 211, "correlation": 95860 } }, { "ph": "s", "id": 95860, "pid": 76337, "tid": -914061504, "ts": 1716454222775843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222852467, "dur": 30, "args": { "External id": 95863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95863, "pid": 5, "tid": 7, "ts": 1716454222852467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775861, "dur": 7, "args": { "External id": 95863, "cbid": 211, "correlation": 95863 } }, { "ph": "s", "id": 95863, "pid": 76337, "tid": -914061504, "ts": 1716454222775861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222775917, "dur": 0, "args": { "External id": 95874, "cbid": 317, "correlation": 95874 } }, { "ph": "f", "id": 95874, "pid": 76337, "tid": -914061504, "ts": 1716454222775917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222775918, "dur": 0, "args": { "External id": 95875, "cbid": 203, "correlation": 95875 } }, { "ph": "f", "id": 95875, "pid": 76337, "tid": -914061504, "ts": 1716454222775918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222775919, "dur": 0, "args": { "External id": 95876, "cbid": 205, "correlation": 95876 } }, { "ph": "f", "id": 95876, "pid": 76337, "tid": -914061504, "ts": 1716454222775919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222852499, "dur": 22, "args": { "External id": 95880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95880, "pid": 5, "tid": 7, "ts": 1716454222852499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775933, "dur": 12, "args": { "External id": 95880, "cbid": 211, "correlation": 95880 } }, { "ph": "s", "id": 95880, "pid": 76337, "tid": -914061504, "ts": 1716454222775933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222852521, "dur": 102, "args": { "External id": 95882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95882, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95882, "pid": 5, "tid": 7, "ts": 1716454222852521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775950, "dur": 6, "args": { "External id": 95882, "cbid": 211, "correlation": 95882 } }, { "ph": "s", "id": 95882, "pid": 76337, "tid": -914061504, "ts": 1716454222775950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222852625, "dur": 22, "args": { "External id": 95884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95884, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95884, "pid": 5, "tid": 7, "ts": 1716454222852625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775960, "dur": 5, "args": { "External id": 95884, "cbid": 211, "correlation": 95884 } }, { "ph": "s", "id": 95884, "pid": 76337, "tid": -914061504, "ts": 1716454222775960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222852649, "dur": 32, "args": { "External id": 95890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95890, "pid": 5, "tid": 7, "ts": 1716454222852649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222775998, "dur": 10, "args": { "External id": 95890, "cbid": 211, "correlation": 95890 } }, { "ph": "s", "id": 95890, "pid": 76337, "tid": -914061504, "ts": 1716454222775998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222852682, "dur": 191, "args": { "External id": 95899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95899, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95899, "pid": 5, "tid": 7, "ts": 1716454222852682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776084, "dur": 15, "args": { "External id": 95899, "cbid": 211, "correlation": 95899 } }, { "ph": "s", "id": 95899, "pid": 76337, "tid": -914061504, "ts": 1716454222776084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222852874, "dur": 63, "args": { "External id": 95921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95921, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 95921, "pid": 5, "tid": 7, "ts": 1716454222852874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776143, "dur": 10, "args": { "External id": 95921, "cbid": 211, "correlation": 95921 } }, { "ph": "s", "id": 95921, "pid": 76337, "tid": -914061504, "ts": 1716454222776143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776237, "dur": 2, "args": { "External id": 95932, "cbid": 251, "correlation": 95932 } }, { "ph": "f", "id": 95932, "pid": 76337, "tid": -914061504, "ts": 1716454222776237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222852938, "dur": 149, "args": { "External id": 95933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95933, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95933, "pid": 5, "tid": 7, "ts": 1716454222852938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776243, "dur": 13, "args": { "External id": 95933, "cbid": 211, "correlation": 95933 } }, { "ph": "s", "id": 95933, "pid": 76337, "tid": -914061504, "ts": 1716454222776243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776313, "dur": 1, "args": { "External id": 95944, "cbid": 251, "correlation": 95944 } }, { "ph": "f", "id": 95944, "pid": 76337, "tid": -914061504, "ts": 1716454222776313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222853089, "dur": 144, "args": { "External id": 95945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95945, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95945, "pid": 5, "tid": 7, "ts": 1716454222853089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776317, "dur": 12, "args": { "External id": 95945, "cbid": 211, "correlation": 95945 } }, { "ph": "s", "id": 95945, "pid": 76337, "tid": -914061504, "ts": 1716454222776317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776382, "dur": 1, "args": { "External id": 95956, "cbid": 251, "correlation": 95956 } }, { "ph": "f", "id": 95956, "pid": 76337, "tid": -914061504, "ts": 1716454222776382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222853234, "dur": 143, "args": { "External id": 95957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95957, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 95957, "pid": 5, "tid": 7, "ts": 1716454222853234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776387, "dur": 12, "args": { "External id": 95957, "cbid": 211, "correlation": 95957 } }, { "ph": "s", "id": 95957, "pid": 76337, "tid": -914061504, "ts": 1716454222776387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222853378, "dur": 1911, "args": { "External id": 95978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95978, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 95978, "pid": 5, "tid": 7, "ts": 1716454222853378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776471, "dur": 13, "args": { "External id": 95978, "cbid": 211, "correlation": 95978 } }, { "ph": "s", "id": 95978, "pid": 76337, "tid": -914061504, "ts": 1716454222776471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776574, "dur": 1, "args": { "External id": 95996, "cbid": 251, "correlation": 95996 } }, { "ph": "f", "id": 95996, "pid": 76337, "tid": -914061504, "ts": 1716454222776574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222855290, "dur": 147, "args": { "External id": 95998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 95998, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 95998, "pid": 5, "tid": 7, "ts": 1716454222855290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776580, "dur": 13, "args": { "External id": 95998, "cbid": 211, "correlation": 95998 } }, { "ph": "s", "id": 95998, "pid": 76337, "tid": -914061504, "ts": 1716454222776580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222855439, "dur": 35, "args": { "External id": 96006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96006, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96006, "pid": 5, "tid": 7, "ts": 1716454222855439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776651, "dur": 13, "args": { "External id": 96006, "cbid": 211, "correlation": 96006 } }, { "ph": "s", "id": 96006, "pid": 76337, "tid": -914061504, "ts": 1716454222776651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222855475, "dur": 51, "args": { "External id": 96014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96014, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96014, "pid": 5, "tid": 7, "ts": 1716454222855475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776691, "dur": 9, "args": { "External id": 96014, "cbid": 211, "correlation": 96014 } }, { "ph": "s", "id": 96014, "pid": 76337, "tid": -914061504, "ts": 1716454222776691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222855528, "dur": 30, "args": { "External id": 96025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96025, "pid": 5, "tid": 7, "ts": 1716454222855528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776767, "dur": 13, "args": { "External id": 96025, "cbid": 211, "correlation": 96025 } }, { "ph": "s", "id": 96025, "pid": 76337, "tid": -914061504, "ts": 1716454222776767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222855559, "dur": 34, "args": { "External id": 96047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96047, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96047, "pid": 5, "tid": 7, "ts": 1716454222855559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776800, "dur": 8, "args": { "External id": 96047, "cbid": 211, "correlation": 96047 } }, { "ph": "s", "id": 96047, "pid": 76337, "tid": -914061504, "ts": 1716454222776800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776885, "dur": 1, "args": { "External id": 96058, "cbid": 251, "correlation": 96058 } }, { "ph": "f", "id": 96058, "pid": 76337, "tid": -914061504, "ts": 1716454222776885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222855595, "dur": 88, "args": { "External id": 96059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96059, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96059, "pid": 5, "tid": 7, "ts": 1716454222855595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776890, "dur": 13, "args": { "External id": 96059, "cbid": 211, "correlation": 96059 } }, { "ph": "s", "id": 96059, "pid": 76337, "tid": -914061504, "ts": 1716454222776890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776962, "dur": 1, "args": { "External id": 96070, "cbid": 251, "correlation": 96070 } }, { "ph": "f", "id": 96070, "pid": 76337, "tid": -914061504, "ts": 1716454222776962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222776966, "dur": 0, "args": { "External id": 96071, "cbid": 251, "correlation": 96071 } }, { "ph": "f", "id": 96071, "pid": 76337, "tid": -914061504, "ts": 1716454222776966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222855684, "dur": 11, "args": { "External id": 96072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96072, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 96072, "pid": 5, "tid": 7, "ts": 1716454222855684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776967, "dur": 22, "args": { "External id": 96072, "cbid": 211, "correlation": 96072 } }, { "ph": "s", "id": 96072, "pid": 76337, "tid": -914061504, "ts": 1716454222776967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222855696, "dur": 5, "args": { "External id": 96074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96074, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 96074, "pid": 5, "tid": 7, "ts": 1716454222855696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222776991, "dur": 7, "args": { "External id": 96074, "cbid": 211, "correlation": 96074 } }, { "ph": "s", "id": 96074, "pid": 76337, "tid": -914061504, "ts": 1716454222776991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777052, "dur": 1, "args": { "External id": 96085, "cbid": 251, "correlation": 96085 } }, { "ph": "f", "id": 96085, "pid": 76337, "tid": -914061504, "ts": 1716454222777052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777056, "dur": 0, "args": { "External id": 96086, "cbid": 251, "correlation": 96086 } }, { "ph": "f", "id": 96086, "pid": 76337, "tid": -914061504, "ts": 1716454222777056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222855702, "dur": 7, "args": { "External id": 96087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96087, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 96087, "pid": 5, "tid": 7, "ts": 1716454222855702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777057, "dur": 11, "args": { "External id": 96087, "cbid": 211, "correlation": 96087 } }, { "ph": "s", "id": 96087, "pid": 76337, "tid": -914061504, "ts": 1716454222777057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222855710, "dur": 3, "args": { "External id": 96089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96089, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 96089, "pid": 5, "tid": 7, "ts": 1716454222855710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777070, "dur": 6, "args": { "External id": 96089, "cbid": 211, "correlation": 96089 } }, { "ph": "s", "id": 96089, "pid": 76337, "tid": -914061504, "ts": 1716454222777070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222855715, "dur": 90, "args": { "External id": 96110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96110, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96110, "pid": 5, "tid": 7, "ts": 1716454222855715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777144, "dur": 12, "args": { "External id": 96110, "cbid": 211, "correlation": 96110 } }, { "ph": "s", "id": 96110, "pid": 76337, "tid": -914061504, "ts": 1716454222777144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777242, "dur": 1, "args": { "External id": 96128, "cbid": 251, "correlation": 96128 } }, { "ph": "f", "id": 96128, "pid": 76337, "tid": -914061504, "ts": 1716454222777242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222855806, "dur": 96, "args": { "External id": 96130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96130, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96130, "pid": 5, "tid": 7, "ts": 1716454222855806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777247, "dur": 13, "args": { "External id": 96130, "cbid": 211, "correlation": 96130 } }, { "ph": "s", "id": 96130, "pid": 76337, "tid": -914061504, "ts": 1716454222777247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222855903, "dur": 20, "args": { "External id": 96138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96138, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96138, "pid": 5, "tid": 7, "ts": 1716454222855903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777317, "dur": 13, "args": { "External id": 96138, "cbid": 211, "correlation": 96138 } }, { "ph": "s", "id": 96138, "pid": 76337, "tid": -914061504, "ts": 1716454222777317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222855924, "dur": 38, "args": { "External id": 96146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96146, "pid": 5, "tid": 7, "ts": 1716454222855924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777358, "dur": 9, "args": { "External id": 96146, "cbid": 211, "correlation": 96146 } }, { "ph": "s", "id": 96146, "pid": 76337, "tid": -914061504, "ts": 1716454222777358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222855964, "dur": 34, "args": { "External id": 96168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96168, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96168, "pid": 5, "tid": 7, "ts": 1716454222855964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777409, "dur": 10, "args": { "External id": 96168, "cbid": 211, "correlation": 96168 } }, { "ph": "s", "id": 96168, "pid": 76337, "tid": -914061504, "ts": 1716454222777409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777500, "dur": 1, "args": { "External id": 96184, "cbid": 251, "correlation": 96184 } }, { "ph": "f", "id": 96184, "pid": 76337, "tid": -914061504, "ts": 1716454222777500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777505, "dur": 0, "args": { "External id": 96186, "cbid": 251, "correlation": 96186 } }, { "ph": "f", "id": 96186, "pid": 76337, "tid": -914061504, "ts": 1716454222777505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222855999, "dur": 532, "args": { "External id": 96187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96187, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 96187, "pid": 5, "tid": 7, "ts": 1716454222855999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777509, "dur": 13, "args": { "External id": 96187, "cbid": 211, "correlation": 96187 } }, { "ph": "s", "id": 96187, "pid": 76337, "tid": -914061504, "ts": 1716454222777509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222856532, "dur": 124, "args": { "External id": 96195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96195, "pid": 5, "tid": 7, "ts": 1716454222856532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777578, "dur": 13, "args": { "External id": 96195, "cbid": 211, "correlation": 96195 } }, { "ph": "s", "id": 96195, "pid": 76337, "tid": -914061504, "ts": 1716454222777578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222856658, "dur": 126, "args": { "External id": 96203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96203, "pid": 5, "tid": 7, "ts": 1716454222856658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777611, "dur": 8, "args": { "External id": 96203, "cbid": 211, "correlation": 96203 } }, { "ph": "s", "id": 96203, "pid": 76337, "tid": -914061504, "ts": 1716454222777611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222777689, "dur": 1, "args": { "External id": 96219, "cbid": 251, "correlation": 96219 } }, { "ph": "f", "id": 96219, "pid": 76337, "tid": -914061504, "ts": 1716454222777689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222856785, "dur": 306, "args": { "External id": 96221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96221, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96221, "pid": 5, "tid": 7, "ts": 1716454222856785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777694, "dur": 13, "args": { "External id": 96221, "cbid": 211, "correlation": 96221 } }, { "ph": "s", "id": 96221, "pid": 76337, "tid": -914061504, "ts": 1716454222777694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222857092, "dur": 27, "args": { "External id": 96229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96229, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96229, "pid": 5, "tid": 7, "ts": 1716454222857092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777737, "dur": 10, "args": { "External id": 96229, "cbid": 211, "correlation": 96229 } }, { "ph": "s", "id": 96229, "pid": 76337, "tid": -914061504, "ts": 1716454222777737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222857121, "dur": 80, "args": { "External id": 96240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96240, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96240, "pid": 5, "tid": 7, "ts": 1716454222857121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777806, "dur": 12, "args": { "External id": 96240, "cbid": 211, "correlation": 96240 } }, { "ph": "s", "id": 96240, "pid": 76337, "tid": -914061504, "ts": 1716454222777806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222777873, "dur": 0, "args": { "External id": 96252, "cbid": 317, "correlation": 96252 } }, { "ph": "f", "id": 96252, "pid": 76337, "tid": -914061504, "ts": 1716454222777873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222777874, "dur": 0, "args": { "External id": 96253, "cbid": 203, "correlation": 96253 } }, { "ph": "f", "id": 96253, "pid": 76337, "tid": -914061504, "ts": 1716454222777874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222777875, "dur": 0, "args": { "External id": 96254, "cbid": 205, "correlation": 96254 } }, { "ph": "f", "id": 96254, "pid": 76337, "tid": -914061504, "ts": 1716454222777875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222857202, "dur": 22, "args": { "External id": 96258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96258, "pid": 5, "tid": 7, "ts": 1716454222857202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777892, "dur": 12, "args": { "External id": 96258, "cbid": 211, "correlation": 96258 } }, { "ph": "s", "id": 96258, "pid": 76337, "tid": -914061504, "ts": 1716454222777892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222857226, "dur": 119, "args": { "External id": 96260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96260, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96260, "pid": 5, "tid": 7, "ts": 1716454222857226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777911, "dur": 7, "args": { "External id": 96260, "cbid": 211, "correlation": 96260 } }, { "ph": "s", "id": 96260, "pid": 76337, "tid": -914061504, "ts": 1716454222777911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222857346, "dur": 24, "args": { "External id": 96262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96262, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96262, "pid": 5, "tid": 7, "ts": 1716454222857346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777922, "dur": 5, "args": { "External id": 96262, "cbid": 211, "correlation": 96262 } }, { "ph": "s", "id": 96262, "pid": 76337, "tid": -914061504, "ts": 1716454222777922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222857371, "dur": 33, "args": { "External id": 96268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96268, "pid": 5, "tid": 7, "ts": 1716454222857371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777950, "dur": 9, "args": { "External id": 96268, "cbid": 211, "correlation": 96268 } }, { "ph": "s", "id": 96268, "pid": 76337, "tid": -914061504, "ts": 1716454222777950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222857405, "dur": 26, "args": { "External id": 96276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96276, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96276, "pid": 5, "tid": 7, "ts": 1716454222857405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222777991, "dur": 9, "args": { "External id": 96276, "cbid": 211, "correlation": 96276 } }, { "ph": "s", "id": 96276, "pid": 76337, "tid": -914061504, "ts": 1716454222777991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454222857433, "dur": 99, "args": { "External id": 96287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96287, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96287, "pid": 5, "tid": 7, "ts": 1716454222857433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778059, "dur": 12, "args": { "External id": 96287, "cbid": 211, "correlation": 96287 } }, { "ph": "s", "id": 96287, "pid": 76337, "tid": -914061504, "ts": 1716454222778059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222778117, "dur": 0, "args": { "External id": 96297, "cbid": 317, "correlation": 96297 } }, { "ph": "f", "id": 96297, "pid": 76337, "tid": -914061504, "ts": 1716454222778117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222778118, "dur": 0, "args": { "External id": 96298, "cbid": 203, "correlation": 96298 } }, { "ph": "f", "id": 96298, "pid": 76337, "tid": -914061504, "ts": 1716454222778118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222778118, "dur": 0, "args": { "External id": 96299, "cbid": 205, "correlation": 96299 } }, { "ph": "f", "id": 96299, "pid": 76337, "tid": -914061504, "ts": 1716454222778118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222857534, "dur": 75, "args": { "External id": 96303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96303, "pid": 5, "tid": 7, "ts": 1716454222857534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778133, "dur": 11, "args": { "External id": 96303, "cbid": 211, "correlation": 96303 } }, { "ph": "s", "id": 96303, "pid": 76337, "tid": -914061504, "ts": 1716454222778133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222857610, "dur": 44, "args": { "External id": 96305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96305, "pid": 5, "tid": 7, "ts": 1716454222857610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778147, "dur": 6, "args": { "External id": 96305, "cbid": 211, "correlation": 96305 } }, { "ph": "s", "id": 96305, "pid": 76337, "tid": -914061504, "ts": 1716454222778147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222857655, "dur": 4, "args": { "External id": 96307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96307, "pid": 5, "tid": 7, "ts": 1716454222857655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778159, "dur": 6, "args": { "External id": 96307, "cbid": 211, "correlation": 96307 } }, { "ph": "s", "id": 96307, "pid": 76337, "tid": -914061504, "ts": 1716454222778159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222778170, "dur": 0, "args": { "External id": 96308, "cbid": 51, "correlation": 96308 } }, { "ph": "s", "id": 96308, "pid": 76337, "tid": -914061504, "ts": 1716454222778170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222857660, "dur": 2220, "args": { "External id": 96309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96309, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96309, "pid": 5, "tid": 7, "ts": 1716454222857660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778171, "dur": 6, "args": { "External id": 96309, "cbid": 211, "correlation": 96309 } }, { "ph": "s", "id": 96309, "pid": 76337, "tid": -914061504, "ts": 1716454222778171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222859881, "dur": 111, "args": { "External id": 96314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96314, "pid": 5, "tid": 7, "ts": 1716454222859881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778200, "dur": 9, "args": { "External id": 96314, "cbid": 211, "correlation": 96314 } }, { "ph": "s", "id": 96314, "pid": 76337, "tid": -914061504, "ts": 1716454222778200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222859994, "dur": 166, "args": { "External id": 96323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96323, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96323, "pid": 5, "tid": 7, "ts": 1716454222859994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778297, "dur": 14, "args": { "External id": 96323, "cbid": 211, "correlation": 96323 } }, { "ph": "s", "id": 96323, "pid": 76337, "tid": -914061504, "ts": 1716454222778297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222860161, "dur": 129, "args": { "External id": 96343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96343, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 96343, "pid": 5, "tid": 7, "ts": 1716454222860161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778372, "dur": 11, "args": { "External id": 96343, "cbid": 211, "correlation": 96343 } }, { "ph": "s", "id": 96343, "pid": 76337, "tid": -914061504, "ts": 1716454222778372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222860292, "dur": 5, "args": { "External id": 96355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96355, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 96355, "pid": 5, "tid": 7, "ts": 1716454222860292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778393, "dur": 7, "args": { "External id": 96355, "cbid": 211, "correlation": 96355 } }, { "ph": "s", "id": 96355, "pid": 76337, "tid": -914061504, "ts": 1716454222778393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222860297, "dur": 158, "args": { "External id": 96358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96358, "pid": 5, "tid": 7, "ts": 1716454222860297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778412, "dur": 7, "args": { "External id": 96358, "cbid": 211, "correlation": 96358 } }, { "ph": "s", "id": 96358, "pid": 76337, "tid": -914061504, "ts": 1716454222778412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222860457, "dur": 101, "args": { "External id": 96367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96367, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96367, "pid": 5, "tid": 7, "ts": 1716454222860457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778454, "dur": 10, "args": { "External id": 96367, "cbid": 211, "correlation": 96367 } }, { "ph": "s", "id": 96367, "pid": 76337, "tid": -914061504, "ts": 1716454222778454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222778508, "dur": 0, "args": { "External id": 96377, "cbid": 317, "correlation": 96377 } }, { "ph": "f", "id": 96377, "pid": 76337, "tid": -914061504, "ts": 1716454222778508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222778509, "dur": 0, "args": { "External id": 96378, "cbid": 203, "correlation": 96378 } }, { "ph": "f", "id": 96378, "pid": 76337, "tid": -914061504, "ts": 1716454222778509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222778510, "dur": 0, "args": { "External id": 96379, "cbid": 205, "correlation": 96379 } }, { "ph": "f", "id": 96379, "pid": 76337, "tid": -914061504, "ts": 1716454222778510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222860559, "dur": 110, "args": { "External id": 96383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96383, "pid": 5, "tid": 7, "ts": 1716454222860559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778526, "dur": 12, "args": { "External id": 96383, "cbid": 211, "correlation": 96383 } }, { "ph": "s", "id": 96383, "pid": 76337, "tid": -914061504, "ts": 1716454222778526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222860671, "dur": 34, "args": { "External id": 96385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96385, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96385, "pid": 5, "tid": 7, "ts": 1716454222860671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778540, "dur": 5, "args": { "External id": 96385, "cbid": 211, "correlation": 96385 } }, { "ph": "s", "id": 96385, "pid": 76337, "tid": -914061504, "ts": 1716454222778540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222860706, "dur": 4, "args": { "External id": 96387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96387, "pid": 5, "tid": 7, "ts": 1716454222860706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778551, "dur": 6, "args": { "External id": 96387, "cbid": 211, "correlation": 96387 } }, { "ph": "s", "id": 96387, "pid": 76337, "tid": -914061504, "ts": 1716454222778551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222778561, "dur": 0, "args": { "External id": 96388, "cbid": 51, "correlation": 96388 } }, { "ph": "s", "id": 96388, "pid": 76337, "tid": -914061504, "ts": 1716454222778561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222860711, "dur": 1981, "args": { "External id": 96389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96389, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96389, "pid": 5, "tid": 7, "ts": 1716454222860711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778562, "dur": 6, "args": { "External id": 96389, "cbid": 211, "correlation": 96389 } }, { "ph": "s", "id": 96389, "pid": 76337, "tid": -914061504, "ts": 1716454222778562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222862693, "dur": 58, "args": { "External id": 96394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96394, "pid": 5, "tid": 7, "ts": 1716454222862693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778590, "dur": 10, "args": { "External id": 96394, "cbid": 211, "correlation": 96394 } }, { "ph": "s", "id": 96394, "pid": 76337, "tid": -914061504, "ts": 1716454222778590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222862752, "dur": 4, "args": { "External id": 96402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96402, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96402, "pid": 5, "tid": 7, "ts": 1716454222862752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778634, "dur": 10, "args": { "External id": 96402, "cbid": 211, "correlation": 96402 } }, { "ph": "s", "id": 96402, "pid": 76337, "tid": -914061504, "ts": 1716454222778634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222778703, "dur": 1, "args": { "External id": 96418, "cbid": 251, "correlation": 96418 } }, { "ph": "f", "id": 96418, "pid": 76337, "tid": -914061504, "ts": 1716454222778703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222778708, "dur": 0, "args": { "External id": 96420, "cbid": 251, "correlation": 96420 } }, { "ph": "f", "id": 96420, "pid": 76337, "tid": -914061504, "ts": 1716454222778708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222862757, "dur": 11, "args": { "External id": 96421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96421, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 96421, "pid": 5, "tid": 7, "ts": 1716454222862757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778711, "dur": 12, "args": { "External id": 96421, "cbid": 211, "correlation": 96421 } }, { "ph": "s", "id": 96421, "pid": 76337, "tid": -914061504, "ts": 1716454222778711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222862770, "dur": 5, "args": { "External id": 96423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96423, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 96423, "pid": 5, "tid": 7, "ts": 1716454222862770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778725, "dur": 6, "args": { "External id": 96423, "cbid": 211, "correlation": 96423 } }, { "ph": "s", "id": 96423, "pid": 76337, "tid": -914061504, "ts": 1716454222778725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222862776, "dur": 53, "args": { "External id": 96433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96433, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96433, "pid": 5, "tid": 7, "ts": 1716454222862776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778784, "dur": 12, "args": { "External id": 96433, "cbid": 211, "correlation": 96433 } }, { "ph": "s", "id": 96433, "pid": 76337, "tid": -914061504, "ts": 1716454222778784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222862830, "dur": 50, "args": { "External id": 96453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96453, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 96453, "pid": 5, "tid": 7, "ts": 1716454222862830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778851, "dur": 11, "args": { "External id": 96453, "cbid": 211, "correlation": 96453 } }, { "ph": "s", "id": 96453, "pid": 76337, "tid": -914061504, "ts": 1716454222778851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222862881, "dur": 4, "args": { "External id": 96465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96465, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96465, "pid": 5, "tid": 7, "ts": 1716454222862881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778872, "dur": 6, "args": { "External id": 96465, "cbid": 211, "correlation": 96465 } }, { "ph": "s", "id": 96465, "pid": 76337, "tid": -914061504, "ts": 1716454222778872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222862886, "dur": 55, "args": { "External id": 96468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96468, "pid": 5, "tid": 7, "ts": 1716454222862886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778890, "dur": 7, "args": { "External id": 96468, "cbid": 211, "correlation": 96468 } }, { "ph": "s", "id": 96468, "pid": 76337, "tid": -914061504, "ts": 1716454222778890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222862943, "dur": 36, "args": { "External id": 96477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96477, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96477, "pid": 5, "tid": 7, "ts": 1716454222862943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222778930, "dur": 11, "args": { "External id": 96477, "cbid": 211, "correlation": 96477 } }, { "ph": "s", "id": 96477, "pid": 76337, "tid": -914061504, "ts": 1716454222778930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222779003, "dur": 0, "args": { "External id": 96487, "cbid": 317, "correlation": 96487 } }, { "ph": "f", "id": 96487, "pid": 76337, "tid": -914061504, "ts": 1716454222779003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222779004, "dur": 0, "args": { "External id": 96488, "cbid": 203, "correlation": 96488 } }, { "ph": "f", "id": 96488, "pid": 76337, "tid": -914061504, "ts": 1716454222779004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222779005, "dur": 0, "args": { "External id": 96489, "cbid": 205, "correlation": 96489 } }, { "ph": "f", "id": 96489, "pid": 76337, "tid": -914061504, "ts": 1716454222779005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222862980, "dur": 39, "args": { "External id": 96493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96493, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96493, "pid": 5, "tid": 7, "ts": 1716454222862980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779021, "dur": 12, "args": { "External id": 96493, "cbid": 211, "correlation": 96493 } }, { "ph": "s", "id": 96493, "pid": 76337, "tid": -914061504, "ts": 1716454222779021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222863021, "dur": 14, "args": { "External id": 96495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96495, "pid": 5, "tid": 7, "ts": 1716454222863021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779036, "dur": 5, "args": { "External id": 96495, "cbid": 211, "correlation": 96495 } }, { "ph": "s", "id": 96495, "pid": 76337, "tid": -914061504, "ts": 1716454222779036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222863036, "dur": 3, "args": { "External id": 96497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96497, "pid": 5, "tid": 7, "ts": 1716454222863036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779046, "dur": 6, "args": { "External id": 96497, "cbid": 211, "correlation": 96497 } }, { "ph": "s", "id": 96497, "pid": 76337, "tid": -914061504, "ts": 1716454222779046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222779055, "dur": 0, "args": { "External id": 96498, "cbid": 51, "correlation": 96498 } }, { "ph": "s", "id": 96498, "pid": 76337, "tid": -914061504, "ts": 1716454222779055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222863040, "dur": 689, "args": { "External id": 96499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96499, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96499, "pid": 5, "tid": 7, "ts": 1716454222863040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779056, "dur": 5, "args": { "External id": 96499, "cbid": 211, "correlation": 96499 } }, { "ph": "s", "id": 96499, "pid": 76337, "tid": -914061504, "ts": 1716454222779056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222863731, "dur": 58, "args": { "External id": 96504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96504, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96504, "pid": 5, "tid": 7, "ts": 1716454222863731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779084, "dur": 8, "args": { "External id": 96504, "cbid": 211, "correlation": 96504 } }, { "ph": "s", "id": 96504, "pid": 76337, "tid": -914061504, "ts": 1716454222779084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222779141, "dur": 0, "args": { "External id": 96514, "cbid": 317, "correlation": 96514 } }, { "ph": "f", "id": 96514, "pid": 76337, "tid": -914061504, "ts": 1716454222779141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222779141, "dur": 0, "args": { "External id": 96515, "cbid": 203, "correlation": 96515 } }, { "ph": "f", "id": 96515, "pid": 76337, "tid": -914061504, "ts": 1716454222779141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222779142, "dur": 0, "args": { "External id": 96516, "cbid": 205, "correlation": 96516 } }, { "ph": "f", "id": 96516, "pid": 76337, "tid": -914061504, "ts": 1716454222779142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222863790, "dur": 3, "args": { "External id": 96520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96520, "pid": 5, "tid": 7, "ts": 1716454222863790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779158, "dur": 11, "args": { "External id": 96520, "cbid": 211, "correlation": 96520 } }, { "ph": "s", "id": 96520, "pid": 76337, "tid": -914061504, "ts": 1716454222779158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222779174, "dur": 0, "args": { "External id": 96521, "cbid": 51, "correlation": 96521 } }, { "ph": "s", "id": 96521, "pid": 76337, "tid": -914061504, "ts": 1716454222779174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454222863795, "dur": 264, "args": { "External id": 96522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96522, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96522, "pid": 5, "tid": 7, "ts": 1716454222863795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779175, "dur": 7, "args": { "External id": 96522, "cbid": 211, "correlation": 96522 } }, { "ph": "s", "id": 96522, "pid": 76337, "tid": -914061504, "ts": 1716454222779175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222864060, "dur": 58, "args": { "External id": 96527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96527, "pid": 5, "tid": 7, "ts": 1716454222864060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779201, "dur": 9, "args": { "External id": 96527, "cbid": 211, "correlation": 96527 } }, { "ph": "s", "id": 96527, "pid": 76337, "tid": -914061504, "ts": 1716454222779201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222864119, "dur": 50, "args": { "External id": 96535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96535, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96535, "pid": 5, "tid": 7, "ts": 1716454222864119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779230, "dur": 8, "args": { "External id": 96535, "cbid": 211, "correlation": 96535 } }, { "ph": "s", "id": 96535, "pid": 76337, "tid": -914061504, "ts": 1716454222779230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222864170, "dur": 35, "args": { "External id": 96543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96543, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96543, "pid": 5, "tid": 7, "ts": 1716454222864170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779260, "dur": 10, "args": { "External id": 96543, "cbid": 211, "correlation": 96543 } }, { "ph": "s", "id": 96543, "pid": 76337, "tid": -914061504, "ts": 1716454222779260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222864207, "dur": 51, "args": { "External id": 96563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96563, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 96563, "pid": 5, "tid": 7, "ts": 1716454222864207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779343, "dur": 12, "args": { "External id": 96563, "cbid": 211, "correlation": 96563 } }, { "ph": "s", "id": 96563, "pid": 76337, "tid": -914061504, "ts": 1716454222779343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222864259, "dur": 4, "args": { "External id": 96575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96575, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 96575, "pid": 5, "tid": 7, "ts": 1716454222864259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779365, "dur": 6, "args": { "External id": 96575, "cbid": 211, "correlation": 96575 } }, { "ph": "s", "id": 96575, "pid": 76337, "tid": -914061504, "ts": 1716454222779365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222864264, "dur": 54, "args": { "External id": 96578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96578, "pid": 5, "tid": 7, "ts": 1716454222864264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779383, "dur": 7, "args": { "External id": 96578, "cbid": 211, "correlation": 96578 } }, { "ph": "s", "id": 96578, "pid": 76337, "tid": -914061504, "ts": 1716454222779383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222779441, "dur": 0, "args": { "External id": 96589, "cbid": 317, "correlation": 96589 } }, { "ph": "f", "id": 96589, "pid": 76337, "tid": -914061504, "ts": 1716454222779441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222779442, "dur": 0, "args": { "External id": 96590, "cbid": 203, "correlation": 96590 } }, { "ph": "f", "id": 96590, "pid": 76337, "tid": -914061504, "ts": 1716454222779442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222779442, "dur": 0, "args": { "External id": 96591, "cbid": 205, "correlation": 96591 } }, { "ph": "f", "id": 96591, "pid": 76337, "tid": -914061504, "ts": 1716454222779442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779475, "dur": 2, "args": { "External id": 96595, "cbid": 251, "correlation": 96595 } }, { "ph": "f", "id": 96595, "pid": 76337, "tid": -914061504, "ts": 1716454222779475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779479, "dur": 1, "args": { "External id": 96596, "cbid": 251, "correlation": 96596 } }, { "ph": "f", "id": 96596, "pid": 76337, "tid": -914061504, "ts": 1716454222779479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779481, "dur": 1, "args": { "External id": 96597, "cbid": 251, "correlation": 96597 } }, { "ph": "f", "id": 96597, "pid": 76337, "tid": -914061504, "ts": 1716454222779481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779483, "dur": 1, "args": { "External id": 96598, "cbid": 251, "correlation": 96598 } }, { "ph": "f", "id": 96598, "pid": 76337, "tid": -914061504, "ts": 1716454222779483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779485, "dur": 1, "args": { "External id": 96599, "cbid": 251, "correlation": 96599 } }, { "ph": "f", "id": 96599, "pid": 76337, "tid": -914061504, "ts": 1716454222779485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779487, "dur": 1, "args": { "External id": 96600, "cbid": 251, "correlation": 96600 } }, { "ph": "f", "id": 96600, "pid": 76337, "tid": -914061504, "ts": 1716454222779487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779489, "dur": 1, "args": { "External id": 96601, "cbid": 251, "correlation": 96601 } }, { "ph": "f", "id": 96601, "pid": 76337, "tid": -914061504, "ts": 1716454222779489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779491, "dur": 1, "args": { "External id": 96602, "cbid": 251, "correlation": 96602 } }, { "ph": "f", "id": 96602, "pid": 76337, "tid": -914061504, "ts": 1716454222779491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779493, "dur": 0, "args": { "External id": 96603, "cbid": 251, "correlation": 96603 } }, { "ph": "f", "id": 96603, "pid": 76337, "tid": -914061504, "ts": 1716454222779493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222864319, "dur": 113, "args": { "External id": 96604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96604, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96604, "pid": 5, "tid": 7, "ts": 1716454222864319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779496, "dur": 13, "args": { "External id": 96604, "cbid": 211, "correlation": 96604 } }, { "ph": "s", "id": 96604, "pid": 76337, "tid": -914061504, "ts": 1716454222779496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222864433, "dur": 60, "args": { "External id": 96610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96610, "pid": 5, "tid": 7, "ts": 1716454222864433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779532, "dur": 10, "args": { "External id": 96610, "cbid": 211, "correlation": 96610 } }, { "ph": "s", "id": 96610, "pid": 76337, "tid": -914061504, "ts": 1716454222779532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222864494, "dur": 489, "args": { "External id": 96619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96619, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96619, "pid": 5, "tid": 7, "ts": 1716454222864494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779617, "dur": 15, "args": { "External id": 96619, "cbid": 211, "correlation": 96619 } }, { "ph": "s", "id": 96619, "pid": 76337, "tid": -914061504, "ts": 1716454222779617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222864984, "dur": 177, "args": { "External id": 96641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96641, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96641, "pid": 5, "tid": 7, "ts": 1716454222864984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779678, "dur": 11, "args": { "External id": 96641, "cbid": 211, "correlation": 96641 } }, { "ph": "s", "id": 96641, "pid": 76337, "tid": -914061504, "ts": 1716454222779678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779773, "dur": 2, "args": { "External id": 96652, "cbid": 251, "correlation": 96652 } }, { "ph": "f", "id": 96652, "pid": 76337, "tid": -914061504, "ts": 1716454222779773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222865162, "dur": 196, "args": { "External id": 96653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96653, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96653, "pid": 5, "tid": 7, "ts": 1716454222865162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779779, "dur": 12, "args": { "External id": 96653, "cbid": 211, "correlation": 96653 } }, { "ph": "s", "id": 96653, "pid": 76337, "tid": -914061504, "ts": 1716454222779779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779848, "dur": 1, "args": { "External id": 96664, "cbid": 251, "correlation": 96664 } }, { "ph": "f", "id": 96664, "pid": 76337, "tid": -914061504, "ts": 1716454222779848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222865360, "dur": 187, "args": { "External id": 96665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96665, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96665, "pid": 5, "tid": 7, "ts": 1716454222865360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779852, "dur": 11, "args": { "External id": 96665, "cbid": 211, "correlation": 96665 } }, { "ph": "s", "id": 96665, "pid": 76337, "tid": -914061504, "ts": 1716454222779852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222779915, "dur": 1, "args": { "External id": 96676, "cbid": 251, "correlation": 96676 } }, { "ph": "f", "id": 96676, "pid": 76337, "tid": -914061504, "ts": 1716454222779915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222865548, "dur": 183, "args": { "External id": 96677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96677, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96677, "pid": 5, "tid": 7, "ts": 1716454222865548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222779919, "dur": 12, "args": { "External id": 96677, "cbid": 211, "correlation": 96677 } }, { "ph": "s", "id": 96677, "pid": 76337, "tid": -914061504, "ts": 1716454222779919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222865733, "dur": 18125, "args": { "External id": 96698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96698, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96698, "pid": 5, "tid": 7, "ts": 1716454222865733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780012, "dur": 15, "args": { "External id": 96698, "cbid": 211, "correlation": 96698 } }, { "ph": "s", "id": 96698, "pid": 76337, "tid": -914061504, "ts": 1716454222780012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780116, "dur": 1, "args": { "External id": 96716, "cbid": 251, "correlation": 96716 } }, { "ph": "f", "id": 96716, "pid": 76337, "tid": -914061504, "ts": 1716454222780116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222883859, "dur": 198, "args": { "External id": 96718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96718, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96718, "pid": 5, "tid": 7, "ts": 1716454222883859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780122, "dur": 13, "args": { "External id": 96718, "cbid": 211, "correlation": 96718 } }, { "ph": "s", "id": 96718, "pid": 76337, "tid": -914061504, "ts": 1716454222780122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222884059, "dur": 67, "args": { "External id": 96726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96726, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96726, "pid": 5, "tid": 7, "ts": 1716454222884059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780192, "dur": 13, "args": { "External id": 96726, "cbid": 211, "correlation": 96726 } }, { "ph": "s", "id": 96726, "pid": 76337, "tid": -914061504, "ts": 1716454222780192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222884127, "dur": 97, "args": { "External id": 96734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96734, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96734, "pid": 5, "tid": 7, "ts": 1716454222884127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780232, "dur": 9, "args": { "External id": 96734, "cbid": 211, "correlation": 96734 } }, { "ph": "s", "id": 96734, "pid": 76337, "tid": -914061504, "ts": 1716454222780232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222884225, "dur": 54, "args": { "External id": 96745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96745, "pid": 5, "tid": 7, "ts": 1716454222884225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780308, "dur": 14, "args": { "External id": 96745, "cbid": 211, "correlation": 96745 } }, { "ph": "s", "id": 96745, "pid": 76337, "tid": -914061504, "ts": 1716454222780308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222884280, "dur": 90, "args": { "External id": 96767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96767, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96767, "pid": 5, "tid": 7, "ts": 1716454222884280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780340, "dur": 8, "args": { "External id": 96767, "cbid": 211, "correlation": 96767 } }, { "ph": "s", "id": 96767, "pid": 76337, "tid": -914061504, "ts": 1716454222780340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780424, "dur": 1, "args": { "External id": 96778, "cbid": 251, "correlation": 96778 } }, { "ph": "f", "id": 96778, "pid": 76337, "tid": -914061504, "ts": 1716454222780424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222884371, "dur": 103, "args": { "External id": 96779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96779, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96779, "pid": 5, "tid": 7, "ts": 1716454222884371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780429, "dur": 12, "args": { "External id": 96779, "cbid": 211, "correlation": 96779 } }, { "ph": "s", "id": 96779, "pid": 76337, "tid": -914061504, "ts": 1716454222780429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780506, "dur": 1, "args": { "External id": 96790, "cbid": 251, "correlation": 96790 } }, { "ph": "f", "id": 96790, "pid": 76337, "tid": -914061504, "ts": 1716454222780506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780510, "dur": 0, "args": { "External id": 96791, "cbid": 251, "correlation": 96791 } }, { "ph": "f", "id": 96791, "pid": 76337, "tid": -914061504, "ts": 1716454222780510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222884476, "dur": 10, "args": { "External id": 96792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96792, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 96792, "pid": 5, "tid": 7, "ts": 1716454222884476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780512, "dur": 15, "args": { "External id": 96792, "cbid": 211, "correlation": 96792 } }, { "ph": "s", "id": 96792, "pid": 76337, "tid": -914061504, "ts": 1716454222780512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222884487, "dur": 5, "args": { "External id": 96794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96794, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 96794, "pid": 5, "tid": 7, "ts": 1716454222884487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780530, "dur": 8, "args": { "External id": 96794, "cbid": 211, "correlation": 96794 } }, { "ph": "s", "id": 96794, "pid": 76337, "tid": -914061504, "ts": 1716454222780530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780593, "dur": 1, "args": { "External id": 96805, "cbid": 251, "correlation": 96805 } }, { "ph": "f", "id": 96805, "pid": 76337, "tid": -914061504, "ts": 1716454222780593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780597, "dur": 0, "args": { "External id": 96806, "cbid": 251, "correlation": 96806 } }, { "ph": "f", "id": 96806, "pid": 76337, "tid": -914061504, "ts": 1716454222780597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222884493, "dur": 6, "args": { "External id": 96807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96807, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 96807, "pid": 5, "tid": 7, "ts": 1716454222884493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780598, "dur": 12, "args": { "External id": 96807, "cbid": 211, "correlation": 96807 } }, { "ph": "s", "id": 96807, "pid": 76337, "tid": -914061504, "ts": 1716454222780598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222884501, "dur": 3, "args": { "External id": 96809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96809, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 96809, "pid": 5, "tid": 7, "ts": 1716454222884501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780611, "dur": 5, "args": { "External id": 96809, "cbid": 211, "correlation": 96809 } }, { "ph": "s", "id": 96809, "pid": 76337, "tid": -914061504, "ts": 1716454222780611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222884505, "dur": 152, "args": { "External id": 96830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96830, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96830, "pid": 5, "tid": 7, "ts": 1716454222884505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780685, "dur": 13, "args": { "External id": 96830, "cbid": 211, "correlation": 96830 } }, { "ph": "s", "id": 96830, "pid": 76337, "tid": -914061504, "ts": 1716454222780685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222780782, "dur": 2, "args": { "External id": 96848, "cbid": 251, "correlation": 96848 } }, { "ph": "f", "id": 96848, "pid": 76337, "tid": -914061504, "ts": 1716454222780782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222884659, "dur": 106, "args": { "External id": 96850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96850, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96850, "pid": 5, "tid": 7, "ts": 1716454222884659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780788, "dur": 14, "args": { "External id": 96850, "cbid": 211, "correlation": 96850 } }, { "ph": "s", "id": 96850, "pid": 76337, "tid": -914061504, "ts": 1716454222780788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222884767, "dur": 35, "args": { "External id": 96858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96858, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96858, "pid": 5, "tid": 7, "ts": 1716454222884767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780858, "dur": 12, "args": { "External id": 96858, "cbid": 211, "correlation": 96858 } }, { "ph": "s", "id": 96858, "pid": 76337, "tid": -914061504, "ts": 1716454222780858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222884803, "dur": 67, "args": { "External id": 96866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96866, "pid": 5, "tid": 7, "ts": 1716454222884803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780899, "dur": 9, "args": { "External id": 96866, "cbid": 211, "correlation": 96866 } }, { "ph": "s", "id": 96866, "pid": 76337, "tid": -914061504, "ts": 1716454222780899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222884871, "dur": 90, "args": { "External id": 96888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96888, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96888, "pid": 5, "tid": 7, "ts": 1716454222884871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222780950, "dur": 10, "args": { "External id": 96888, "cbid": 211, "correlation": 96888 } }, { "ph": "s", "id": 96888, "pid": 76337, "tid": -914061504, "ts": 1716454222780950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781048, "dur": 1, "args": { "External id": 96904, "cbid": 251, "correlation": 96904 } }, { "ph": "f", "id": 96904, "pid": 76337, "tid": -914061504, "ts": 1716454222781048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222884962, "dur": 564, "args": { "External id": 96906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96906, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 96906, "pid": 5, "tid": 7, "ts": 1716454222884962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781054, "dur": 13, "args": { "External id": 96906, "cbid": 211, "correlation": 96906 } }, { "ph": "s", "id": 96906, "pid": 76337, "tid": -914061504, "ts": 1716454222781054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222885527, "dur": 239, "args": { "External id": 96914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96914, "pid": 5, "tid": 7, "ts": 1716454222885527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781125, "dur": 13, "args": { "External id": 96914, "cbid": 211, "correlation": 96914 } }, { "ph": "s", "id": 96914, "pid": 76337, "tid": -914061504, "ts": 1716454222781125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222885767, "dur": 249, "args": { "External id": 96922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96922, "pid": 5, "tid": 7, "ts": 1716454222885767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781158, "dur": 9, "args": { "External id": 96922, "cbid": 211, "correlation": 96922 } }, { "ph": "s", "id": 96922, "pid": 76337, "tid": -914061504, "ts": 1716454222781158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781242, "dur": 1, "args": { "External id": 96938, "cbid": 251, "correlation": 96938 } }, { "ph": "f", "id": 96938, "pid": 76337, "tid": -914061504, "ts": 1716454222781242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781247, "dur": 0, "args": { "External id": 96940, "cbid": 251, "correlation": 96940 } }, { "ph": "f", "id": 96940, "pid": 76337, "tid": -914061504, "ts": 1716454222781247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222886017, "dur": 351, "args": { "External id": 96941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96941, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 96941, "pid": 5, "tid": 7, "ts": 1716454222886017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781252, "dur": 13, "args": { "External id": 96941, "cbid": 211, "correlation": 96941 } }, { "ph": "s", "id": 96941, "pid": 76337, "tid": -914061504, "ts": 1716454222781252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222886369, "dur": 50, "args": { "External id": 96949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96949, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96949, "pid": 5, "tid": 7, "ts": 1716454222886369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781294, "dur": 11, "args": { "External id": 96949, "cbid": 211, "correlation": 96949 } }, { "ph": "s", "id": 96949, "pid": 76337, "tid": -914061504, "ts": 1716454222781294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222886421, "dur": 155, "args": { "External id": 96960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96960, "pid": 5, "tid": 7, "ts": 1716454222886421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781363, "dur": 12, "args": { "External id": 96960, "cbid": 211, "correlation": 96960 } }, { "ph": "s", "id": 96960, "pid": 76337, "tid": -914061504, "ts": 1716454222781363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222781429, "dur": 0, "args": { "External id": 96972, "cbid": 317, "correlation": 96972 } }, { "ph": "f", "id": 96972, "pid": 76337, "tid": -914061504, "ts": 1716454222781429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222781430, "dur": 0, "args": { "External id": 96973, "cbid": 203, "correlation": 96973 } }, { "ph": "f", "id": 96973, "pid": 76337, "tid": -914061504, "ts": 1716454222781430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222781431, "dur": 0, "args": { "External id": 96974, "cbid": 205, "correlation": 96974 } }, { "ph": "f", "id": 96974, "pid": 76337, "tid": -914061504, "ts": 1716454222781431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781456, "dur": 1, "args": { "External id": 96978, "cbid": 251, "correlation": 96978 } }, { "ph": "f", "id": 96978, "pid": 76337, "tid": -914061504, "ts": 1716454222781456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781458, "dur": 1, "args": { "External id": 96979, "cbid": 251, "correlation": 96979 } }, { "ph": "f", "id": 96979, "pid": 76337, "tid": -914061504, "ts": 1716454222781458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781460, "dur": 1, "args": { "External id": 96980, "cbid": 251, "correlation": 96980 } }, { "ph": "f", "id": 96980, "pid": 76337, "tid": -914061504, "ts": 1716454222781460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781461, "dur": 0, "args": { "External id": 96981, "cbid": 251, "correlation": 96981 } }, { "ph": "f", "id": 96981, "pid": 76337, "tid": -914061504, "ts": 1716454222781461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781463, "dur": 1, "args": { "External id": 96982, "cbid": 251, "correlation": 96982 } }, { "ph": "f", "id": 96982, "pid": 76337, "tid": -914061504, "ts": 1716454222781463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781464, "dur": 0, "args": { "External id": 96983, "cbid": 251, "correlation": 96983 } }, { "ph": "f", "id": 96983, "pid": 76337, "tid": -914061504, "ts": 1716454222781464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781465, "dur": 0, "args": { "External id": 96984, "cbid": 251, "correlation": 96984 } }, { "ph": "f", "id": 96984, "pid": 76337, "tid": -914061504, "ts": 1716454222781465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781466, "dur": 0, "args": { "External id": 96985, "cbid": 251, "correlation": 96985 } }, { "ph": "f", "id": 96985, "pid": 76337, "tid": -914061504, "ts": 1716454222781466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781468, "dur": 0, "args": { "External id": 96986, "cbid": 251, "correlation": 96986 } }, { "ph": "f", "id": 96986, "pid": 76337, "tid": -914061504, "ts": 1716454222781468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222886578, "dur": 113, "args": { "External id": 96987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96987, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 96987, "pid": 5, "tid": 7, "ts": 1716454222886578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781470, "dur": 12, "args": { "External id": 96987, "cbid": 211, "correlation": 96987 } }, { "ph": "s", "id": 96987, "pid": 76337, "tid": -914061504, "ts": 1716454222781470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222886692, "dur": 60, "args": { "External id": 96993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 96993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 96993, "pid": 5, "tid": 7, "ts": 1716454222886692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781504, "dur": 10, "args": { "External id": 96993, "cbid": 211, "correlation": 96993 } }, { "ph": "s", "id": 96993, "pid": 76337, "tid": -914061504, "ts": 1716454222781504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222886753, "dur": 50, "args": { "External id": 97001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97001, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97001, "pid": 5, "tid": 7, "ts": 1716454222886753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781536, "dur": 8, "args": { "External id": 97001, "cbid": 211, "correlation": 97001 } }, { "ph": "s", "id": 97001, "pid": 76337, "tid": -914061504, "ts": 1716454222781536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222886804, "dur": 98, "args": { "External id": 97010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97010, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97010, "pid": 5, "tid": 7, "ts": 1716454222886804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781578, "dur": 10, "args": { "External id": 97010, "cbid": 211, "correlation": 97010 } }, { "ph": "s", "id": 97010, "pid": 76337, "tid": -914061504, "ts": 1716454222781578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222886903, "dur": 90, "args": { "External id": 97030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97030, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97030, "pid": 5, "tid": 7, "ts": 1716454222886903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781654, "dur": 11, "args": { "External id": 97030, "cbid": 211, "correlation": 97030 } }, { "ph": "s", "id": 97030, "pid": 76337, "tid": -914061504, "ts": 1716454222781654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222886995, "dur": 4, "args": { "External id": 97042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97042, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97042, "pid": 5, "tid": 7, "ts": 1716454222886995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781675, "dur": 7, "args": { "External id": 97042, "cbid": 211, "correlation": 97042 } }, { "ph": "s", "id": 97042, "pid": 76337, "tid": -914061504, "ts": 1716454222781675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222887001, "dur": 107, "args": { "External id": 97045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97045, "pid": 5, "tid": 7, "ts": 1716454222887001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781695, "dur": 7, "args": { "External id": 97045, "cbid": 211, "correlation": 97045 } }, { "ph": "s", "id": 97045, "pid": 76337, "tid": -914061504, "ts": 1716454222781695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222887109, "dur": 69, "args": { "External id": 97054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97054, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97054, "pid": 5, "tid": 7, "ts": 1716454222887109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781735, "dur": 10, "args": { "External id": 97054, "cbid": 211, "correlation": 97054 } }, { "ph": "s", "id": 97054, "pid": 76337, "tid": -914061504, "ts": 1716454222781735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222781787, "dur": 0, "args": { "External id": 97064, "cbid": 317, "correlation": 97064 } }, { "ph": "f", "id": 97064, "pid": 76337, "tid": -914061504, "ts": 1716454222781787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222781788, "dur": 0, "args": { "External id": 97065, "cbid": 203, "correlation": 97065 } }, { "ph": "f", "id": 97065, "pid": 76337, "tid": -914061504, "ts": 1716454222781788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222781788, "dur": 0, "args": { "External id": 97066, "cbid": 205, "correlation": 97066 } }, { "ph": "f", "id": 97066, "pid": 76337, "tid": -914061504, "ts": 1716454222781788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222887180, "dur": 74, "args": { "External id": 97070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97070, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97070, "pid": 5, "tid": 7, "ts": 1716454222887180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781805, "dur": 11, "args": { "External id": 97070, "cbid": 211, "correlation": 97070 } }, { "ph": "s", "id": 97070, "pid": 76337, "tid": -914061504, "ts": 1716454222781805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222887255, "dur": 23, "args": { "External id": 97072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97072, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97072, "pid": 5, "tid": 7, "ts": 1716454222887255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781819, "dur": 5, "args": { "External id": 97072, "cbid": 211, "correlation": 97072 } }, { "ph": "s", "id": 97072, "pid": 76337, "tid": -914061504, "ts": 1716454222781819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222887280, "dur": 4, "args": { "External id": 97074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97074, "pid": 5, "tid": 7, "ts": 1716454222887280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781830, "dur": 6, "args": { "External id": 97074, "cbid": 211, "correlation": 97074 } }, { "ph": "s", "id": 97074, "pid": 76337, "tid": -914061504, "ts": 1716454222781830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222781839, "dur": 0, "args": { "External id": 97075, "cbid": 51, "correlation": 97075 } }, { "ph": "s", "id": 97075, "pid": 76337, "tid": -914061504, "ts": 1716454222781839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222887285, "dur": 1340, "args": { "External id": 97076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97076, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97076, "pid": 5, "tid": 7, "ts": 1716454222887285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781840, "dur": 5, "args": { "External id": 97076, "cbid": 211, "correlation": 97076 } }, { "ph": "s", "id": 97076, "pid": 76337, "tid": -914061504, "ts": 1716454222781840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222888626, "dur": 58, "args": { "External id": 97081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97081, "pid": 5, "tid": 7, "ts": 1716454222888626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781868, "dur": 8, "args": { "External id": 97081, "cbid": 211, "correlation": 97081 } }, { "ph": "s", "id": 97081, "pid": 76337, "tid": -914061504, "ts": 1716454222781868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222888685, "dur": 3, "args": { "External id": 97089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97089, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97089, "pid": 5, "tid": 7, "ts": 1716454222888685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781910, "dur": 10, "args": { "External id": 97089, "cbid": 211, "correlation": 97089 } }, { "ph": "s", "id": 97089, "pid": 76337, "tid": -914061504, "ts": 1716454222781910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781984, "dur": 1, "args": { "External id": 97105, "cbid": 251, "correlation": 97105 } }, { "ph": "f", "id": 97105, "pid": 76337, "tid": -914061504, "ts": 1716454222781984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222781989, "dur": 0, "args": { "External id": 97107, "cbid": 251, "correlation": 97107 } }, { "ph": "f", "id": 97107, "pid": 76337, "tid": -914061504, "ts": 1716454222781989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222888690, "dur": 11, "args": { "External id": 97108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97108, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 97108, "pid": 5, "tid": 7, "ts": 1716454222888690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222781991, "dur": 12, "args": { "External id": 97108, "cbid": 211, "correlation": 97108 } }, { "ph": "s", "id": 97108, "pid": 76337, "tid": -914061504, "ts": 1716454222781991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222888702, "dur": 5, "args": { "External id": 97110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97110, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97110, "pid": 5, "tid": 7, "ts": 1716454222888702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782006, "dur": 6, "args": { "External id": 97110, "cbid": 211, "correlation": 97110 } }, { "ph": "s", "id": 97110, "pid": 76337, "tid": -914061504, "ts": 1716454222782006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222888708, "dur": 54, "args": { "External id": 97120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97120, "pid": 5, "tid": 7, "ts": 1716454222888708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782065, "dur": 13, "args": { "External id": 97120, "cbid": 211, "correlation": 97120 } }, { "ph": "s", "id": 97120, "pid": 76337, "tid": -914061504, "ts": 1716454222782065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222888764, "dur": 52, "args": { "External id": 97140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97140, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97140, "pid": 5, "tid": 7, "ts": 1716454222888764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782132, "dur": 12, "args": { "External id": 97140, "cbid": 211, "correlation": 97140 } }, { "ph": "s", "id": 97140, "pid": 76337, "tid": -914061504, "ts": 1716454222782132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222888817, "dur": 4, "args": { "External id": 97152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97152, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97152, "pid": 5, "tid": 7, "ts": 1716454222888817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782153, "dur": 6, "args": { "External id": 97152, "cbid": 211, "correlation": 97152 } }, { "ph": "s", "id": 97152, "pid": 76337, "tid": -914061504, "ts": 1716454222782153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222888822, "dur": 53, "args": { "External id": 97155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97155, "pid": 5, "tid": 7, "ts": 1716454222888822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782172, "dur": 6, "args": { "External id": 97155, "cbid": 211, "correlation": 97155 } }, { "ph": "s", "id": 97155, "pid": 76337, "tid": -914061504, "ts": 1716454222782172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222888877, "dur": 36, "args": { "External id": 97164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97164, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97164, "pid": 5, "tid": 7, "ts": 1716454222888877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782214, "dur": 10, "args": { "External id": 97164, "cbid": 211, "correlation": 97164 } }, { "ph": "s", "id": 97164, "pid": 76337, "tid": -914061504, "ts": 1716454222782214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222782278, "dur": 0, "args": { "External id": 97174, "cbid": 317, "correlation": 97174 } }, { "ph": "f", "id": 97174, "pid": 76337, "tid": -914061504, "ts": 1716454222782278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222782279, "dur": 0, "args": { "External id": 97175, "cbid": 203, "correlation": 97175 } }, { "ph": "f", "id": 97175, "pid": 76337, "tid": -914061504, "ts": 1716454222782279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222782280, "dur": 0, "args": { "External id": 97176, "cbid": 205, "correlation": 97176 } }, { "ph": "f", "id": 97176, "pid": 76337, "tid": -914061504, "ts": 1716454222782280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222888915, "dur": 40, "args": { "External id": 97180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97180, "pid": 5, "tid": 7, "ts": 1716454222888915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782294, "dur": 12, "args": { "External id": 97180, "cbid": 211, "correlation": 97180 } }, { "ph": "s", "id": 97180, "pid": 76337, "tid": -914061504, "ts": 1716454222782294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222888956, "dur": 14, "args": { "External id": 97182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97182, "pid": 5, "tid": 7, "ts": 1716454222888956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782308, "dur": 6, "args": { "External id": 97182, "cbid": 211, "correlation": 97182 } }, { "ph": "s", "id": 97182, "pid": 76337, "tid": -914061504, "ts": 1716454222782308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222888971, "dur": 3, "args": { "External id": 97184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97184, "pid": 5, "tid": 7, "ts": 1716454222888971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782318, "dur": 5, "args": { "External id": 97184, "cbid": 211, "correlation": 97184 } }, { "ph": "s", "id": 97184, "pid": 76337, "tid": -914061504, "ts": 1716454222782318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222782327, "dur": 0, "args": { "External id": 97185, "cbid": 51, "correlation": 97185 } }, { "ph": "s", "id": 97185, "pid": 76337, "tid": -914061504, "ts": 1716454222782327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222888976, "dur": 684, "args": { "External id": 97186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97186, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97186, "pid": 5, "tid": 7, "ts": 1716454222888976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782328, "dur": 5, "args": { "External id": 97186, "cbid": 211, "correlation": 97186 } }, { "ph": "s", "id": 97186, "pid": 76337, "tid": -914061504, "ts": 1716454222782328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222889662, "dur": 58, "args": { "External id": 97191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97191, "pid": 5, "tid": 7, "ts": 1716454222889662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782354, "dur": 8, "args": { "External id": 97191, "cbid": 211, "correlation": 97191 } }, { "ph": "s", "id": 97191, "pid": 76337, "tid": -914061504, "ts": 1716454222782354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222782412, "dur": 0, "args": { "External id": 97201, "cbid": 317, "correlation": 97201 } }, { "ph": "f", "id": 97201, "pid": 76337, "tid": -914061504, "ts": 1716454222782412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222782413, "dur": 0, "args": { "External id": 97202, "cbid": 203, "correlation": 97202 } }, { "ph": "f", "id": 97202, "pid": 76337, "tid": -914061504, "ts": 1716454222782413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222782413, "dur": 0, "args": { "External id": 97203, "cbid": 205, "correlation": 97203 } }, { "ph": "f", "id": 97203, "pid": 76337, "tid": -914061504, "ts": 1716454222782413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222889721, "dur": 75, "args": { "External id": 97207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97207, "pid": 5, "tid": 7, "ts": 1716454222889721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782426, "dur": 11, "args": { "External id": 97207, "cbid": 211, "correlation": 97207 } }, { "ph": "s", "id": 97207, "pid": 76337, "tid": -914061504, "ts": 1716454222782426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222889798, "dur": 205, "args": { "External id": 97209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97209, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97209, "pid": 5, "tid": 7, "ts": 1716454222889798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782447, "dur": 8, "args": { "External id": 97209, "cbid": 211, "correlation": 97209 } }, { "ph": "s", "id": 97209, "pid": 76337, "tid": -914061504, "ts": 1716454222782447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222890004, "dur": 38, "args": { "External id": 97211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97211, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97211, "pid": 5, "tid": 7, "ts": 1716454222890004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782460, "dur": 6, "args": { "External id": 97211, "cbid": 211, "correlation": 97211 } }, { "ph": "s", "id": 97211, "pid": 76337, "tid": -914061504, "ts": 1716454222782460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222890043, "dur": 58, "args": { "External id": 97217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97217, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97217, "pid": 5, "tid": 7, "ts": 1716454222890043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782487, "dur": 8, "args": { "External id": 97217, "cbid": 211, "correlation": 97217 } }, { "ph": "s", "id": 97217, "pid": 76337, "tid": -914061504, "ts": 1716454222782487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222890102, "dur": 50, "args": { "External id": 97225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97225, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97225, "pid": 5, "tid": 7, "ts": 1716454222890102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782516, "dur": 8, "args": { "External id": 97225, "cbid": 211, "correlation": 97225 } }, { "ph": "s", "id": 97225, "pid": 76337, "tid": -914061504, "ts": 1716454222782516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222890154, "dur": 35, "args": { "External id": 97233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97233, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97233, "pid": 5, "tid": 7, "ts": 1716454222890154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782545, "dur": 9, "args": { "External id": 97233, "cbid": 211, "correlation": 97233 } }, { "ph": "s", "id": 97233, "pid": 76337, "tid": -914061504, "ts": 1716454222782545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222890190, "dur": 51, "args": { "External id": 97253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97253, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97253, "pid": 5, "tid": 7, "ts": 1716454222890190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782627, "dur": 12, "args": { "External id": 97253, "cbid": 211, "correlation": 97253 } }, { "ph": "s", "id": 97253, "pid": 76337, "tid": -914061504, "ts": 1716454222782627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222890243, "dur": 4, "args": { "External id": 97265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97265, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97265, "pid": 5, "tid": 7, "ts": 1716454222890243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782649, "dur": 7, "args": { "External id": 97265, "cbid": 211, "correlation": 97265 } }, { "ph": "s", "id": 97265, "pid": 76337, "tid": -914061504, "ts": 1716454222782649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222890248, "dur": 54, "args": { "External id": 97268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97268, "pid": 5, "tid": 7, "ts": 1716454222890248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782667, "dur": 6, "args": { "External id": 97268, "cbid": 211, "correlation": 97268 } }, { "ph": "s", "id": 97268, "pid": 76337, "tid": -914061504, "ts": 1716454222782667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222782724, "dur": 0, "args": { "External id": 97279, "cbid": 317, "correlation": 97279 } }, { "ph": "f", "id": 97279, "pid": 76337, "tid": -914061504, "ts": 1716454222782724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222782725, "dur": 0, "args": { "External id": 97280, "cbid": 203, "correlation": 97280 } }, { "ph": "f", "id": 97280, "pid": 76337, "tid": -914061504, "ts": 1716454222782725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222782725, "dur": 0, "args": { "External id": 97281, "cbid": 205, "correlation": 97281 } }, { "ph": "f", "id": 97281, "pid": 76337, "tid": -914061504, "ts": 1716454222782725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782749, "dur": 1, "args": { "External id": 97285, "cbid": 251, "correlation": 97285 } }, { "ph": "f", "id": 97285, "pid": 76337, "tid": -914061504, "ts": 1716454222782749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782751, "dur": 0, "args": { "External id": 97286, "cbid": 251, "correlation": 97286 } }, { "ph": "f", "id": 97286, "pid": 76337, "tid": -914061504, "ts": 1716454222782751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782752, "dur": 0, "args": { "External id": 97287, "cbid": 251, "correlation": 97287 } }, { "ph": "f", "id": 97287, "pid": 76337, "tid": -914061504, "ts": 1716454222782752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782753, "dur": 0, "args": { "External id": 97288, "cbid": 251, "correlation": 97288 } }, { "ph": "f", "id": 97288, "pid": 76337, "tid": -914061504, "ts": 1716454222782753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782753, "dur": 0, "args": { "External id": 97289, "cbid": 251, "correlation": 97289 } }, { "ph": "f", "id": 97289, "pid": 76337, "tid": -914061504, "ts": 1716454222782753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782754, "dur": 0, "args": { "External id": 97290, "cbid": 251, "correlation": 97290 } }, { "ph": "f", "id": 97290, "pid": 76337, "tid": -914061504, "ts": 1716454222782754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782755, "dur": 0, "args": { "External id": 97291, "cbid": 251, "correlation": 97291 } }, { "ph": "f", "id": 97291, "pid": 76337, "tid": -914061504, "ts": 1716454222782755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782756, "dur": 0, "args": { "External id": 97292, "cbid": 251, "correlation": 97292 } }, { "ph": "f", "id": 97292, "pid": 76337, "tid": -914061504, "ts": 1716454222782756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222782758, "dur": 0, "args": { "External id": 97293, "cbid": 251, "correlation": 97293 } }, { "ph": "f", "id": 97293, "pid": 76337, "tid": -914061504, "ts": 1716454222782758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222890303, "dur": 111, "args": { "External id": 97294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97294, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97294, "pid": 5, "tid": 7, "ts": 1716454222890303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782760, "dur": 13, "args": { "External id": 97294, "cbid": 211, "correlation": 97294 } }, { "ph": "s", "id": 97294, "pid": 76337, "tid": -914061504, "ts": 1716454222782760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222890416, "dur": 59, "args": { "External id": 97300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97300, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97300, "pid": 5, "tid": 7, "ts": 1716454222890416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782795, "dur": 9, "args": { "External id": 97300, "cbid": 211, "correlation": 97300 } }, { "ph": "s", "id": 97300, "pid": 76337, "tid": -914061504, "ts": 1716454222782795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222890476, "dur": 566, "args": { "External id": 97309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97309, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97309, "pid": 5, "tid": 7, "ts": 1716454222890476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782879, "dur": 15, "args": { "External id": 97309, "cbid": 211, "correlation": 97309 } }, { "ph": "s", "id": 97309, "pid": 76337, "tid": -914061504, "ts": 1716454222782879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222891044, "dur": 177, "args": { "External id": 97331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97331, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97331, "pid": 5, "tid": 7, "ts": 1716454222891044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222782940, "dur": 10, "args": { "External id": 97331, "cbid": 211, "correlation": 97331 } }, { "ph": "s", "id": 97331, "pid": 76337, "tid": -914061504, "ts": 1716454222782940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222783038, "dur": 1, "args": { "External id": 97342, "cbid": 251, "correlation": 97342 } }, { "ph": "f", "id": 97342, "pid": 76337, "tid": -914061504, "ts": 1716454222783038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222891222, "dur": 193, "args": { "External id": 97343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97343, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97343, "pid": 5, "tid": 7, "ts": 1716454222891222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783043, "dur": 14, "args": { "External id": 97343, "cbid": 211, "correlation": 97343 } }, { "ph": "s", "id": 97343, "pid": 76337, "tid": -914061504, "ts": 1716454222783043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222783112, "dur": 1, "args": { "External id": 97354, "cbid": 251, "correlation": 97354 } }, { "ph": "f", "id": 97354, "pid": 76337, "tid": -914061504, "ts": 1716454222783112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222891417, "dur": 187, "args": { "External id": 97355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97355, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97355, "pid": 5, "tid": 7, "ts": 1716454222891417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783116, "dur": 11, "args": { "External id": 97355, "cbid": 211, "correlation": 97355 } }, { "ph": "s", "id": 97355, "pid": 76337, "tid": -914061504, "ts": 1716454222783116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222783178, "dur": 1, "args": { "External id": 97366, "cbid": 251, "correlation": 97366 } }, { "ph": "f", "id": 97366, "pid": 76337, "tid": -914061504, "ts": 1716454222783178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222891605, "dur": 187, "args": { "External id": 97367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97367, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97367, "pid": 5, "tid": 7, "ts": 1716454222891605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783182, "dur": 11, "args": { "External id": 97367, "cbid": 211, "correlation": 97367 } }, { "ph": "s", "id": 97367, "pid": 76337, "tid": -914061504, "ts": 1716454222783182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222891794, "dur": 18153, "args": { "External id": 97388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97388, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97388, "pid": 5, "tid": 7, "ts": 1716454222891794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783269, "dur": 13, "args": { "External id": 97388, "cbid": 211, "correlation": 97388 } }, { "ph": "s", "id": 97388, "pid": 76337, "tid": -914061504, "ts": 1716454222783269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222783369, "dur": 1, "args": { "External id": 97406, "cbid": 251, "correlation": 97406 } }, { "ph": "f", "id": 97406, "pid": 76337, "tid": -914061504, "ts": 1716454222783369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222909948, "dur": 202, "args": { "External id": 97408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97408, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97408, "pid": 5, "tid": 7, "ts": 1716454222909948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783375, "dur": 14, "args": { "External id": 97408, "cbid": 211, "correlation": 97408 } }, { "ph": "s", "id": 97408, "pid": 76337, "tid": -914061504, "ts": 1716454222783375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222910152, "dur": 66, "args": { "External id": 97416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97416, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97416, "pid": 5, "tid": 7, "ts": 1716454222910152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783446, "dur": 12, "args": { "External id": 97416, "cbid": 211, "correlation": 97416 } }, { "ph": "s", "id": 97416, "pid": 76337, "tid": -914061504, "ts": 1716454222783446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222910219, "dur": 97, "args": { "External id": 97424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97424, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97424, "pid": 5, "tid": 7, "ts": 1716454222910219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783484, "dur": 9, "args": { "External id": 97424, "cbid": 211, "correlation": 97424 } }, { "ph": "s", "id": 97424, "pid": 76337, "tid": -914061504, "ts": 1716454222783484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222910317, "dur": 55, "args": { "External id": 97435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97435, "pid": 5, "tid": 7, "ts": 1716454222910317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783560, "dur": 13, "args": { "External id": 97435, "cbid": 211, "correlation": 97435 } }, { "ph": "s", "id": 97435, "pid": 76337, "tid": -914061504, "ts": 1716454222783560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222910374, "dur": 90, "args": { "External id": 97457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97457, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97457, "pid": 5, "tid": 7, "ts": 1716454222910374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222783592, "dur": 1720, "args": { "External id": 97457, "cbid": 211, "correlation": 97457 } }, { "ph": "s", "id": 97457, "pid": 76337, "tid": -914061504, "ts": 1716454222783592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785392, "dur": 1, "args": { "External id": 97468, "cbid": 251, "correlation": 97468 } }, { "ph": "f", "id": 97468, "pid": 76337, "tid": -914061504, "ts": 1716454222785392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222910465, "dur": 101, "args": { "External id": 97469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97469, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97469, "pid": 5, "tid": 7, "ts": 1716454222910465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785397, "dur": 61, "args": { "External id": 97469, "cbid": 211, "correlation": 97469 } }, { "ph": "s", "id": 97469, "pid": 76337, "tid": -914061504, "ts": 1716454222785397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785518, "dur": 1, "args": { "External id": 97480, "cbid": 251, "correlation": 97480 } }, { "ph": "f", "id": 97480, "pid": 76337, "tid": -914061504, "ts": 1716454222785518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785523, "dur": 0, "args": { "External id": 97481, "cbid": 251, "correlation": 97481 } }, { "ph": "f", "id": 97481, "pid": 76337, "tid": -914061504, "ts": 1716454222785523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222910567, "dur": 10, "args": { "External id": 97482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97482, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 97482, "pid": 5, "tid": 7, "ts": 1716454222910567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785524, "dur": 12, "args": { "External id": 97482, "cbid": 211, "correlation": 97482 } }, { "ph": "s", "id": 97482, "pid": 76337, "tid": -914061504, "ts": 1716454222785524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222910578, "dur": 5, "args": { "External id": 97484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97484, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97484, "pid": 5, "tid": 7, "ts": 1716454222910578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785538, "dur": 6, "args": { "External id": 97484, "cbid": 211, "correlation": 97484 } }, { "ph": "s", "id": 97484, "pid": 76337, "tid": -914061504, "ts": 1716454222785538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785599, "dur": 1, "args": { "External id": 97495, "cbid": 251, "correlation": 97495 } }, { "ph": "f", "id": 97495, "pid": 76337, "tid": -914061504, "ts": 1716454222785599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785602, "dur": 0, "args": { "External id": 97496, "cbid": 251, "correlation": 97496 } }, { "ph": "f", "id": 97496, "pid": 76337, "tid": -914061504, "ts": 1716454222785602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222910584, "dur": 6, "args": { "External id": 97497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97497, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 97497, "pid": 5, "tid": 7, "ts": 1716454222910584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785604, "dur": 11, "args": { "External id": 97497, "cbid": 211, "correlation": 97497 } }, { "ph": "s", "id": 97497, "pid": 76337, "tid": -914061504, "ts": 1716454222785604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222910591, "dur": 3, "args": { "External id": 97499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97499, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97499, "pid": 5, "tid": 7, "ts": 1716454222910591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785617, "dur": 5, "args": { "External id": 97499, "cbid": 211, "correlation": 97499 } }, { "ph": "s", "id": 97499, "pid": 76337, "tid": -914061504, "ts": 1716454222785617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222910596, "dur": 154, "args": { "External id": 97520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97520, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97520, "pid": 5, "tid": 7, "ts": 1716454222910596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785692, "dur": 12, "args": { "External id": 97520, "cbid": 211, "correlation": 97520 } }, { "ph": "s", "id": 97520, "pid": 76337, "tid": -914061504, "ts": 1716454222785692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222785790, "dur": 1, "args": { "External id": 97538, "cbid": 251, "correlation": 97538 } }, { "ph": "f", "id": 97538, "pid": 76337, "tid": -914061504, "ts": 1716454222785790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222910751, "dur": 105, "args": { "External id": 97540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97540, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97540, "pid": 5, "tid": 7, "ts": 1716454222910751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785797, "dur": 14, "args": { "External id": 97540, "cbid": 211, "correlation": 97540 } }, { "ph": "s", "id": 97540, "pid": 76337, "tid": -914061504, "ts": 1716454222785797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222910858, "dur": 34, "args": { "External id": 97548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97548, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97548, "pid": 5, "tid": 7, "ts": 1716454222910858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785868, "dur": 12, "args": { "External id": 97548, "cbid": 211, "correlation": 97548 } }, { "ph": "s", "id": 97548, "pid": 76337, "tid": -914061504, "ts": 1716454222785868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222910893, "dur": 67, "args": { "External id": 97556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97556, "pid": 5, "tid": 7, "ts": 1716454222910893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785907, "dur": 10, "args": { "External id": 97556, "cbid": 211, "correlation": 97556 } }, { "ph": "s", "id": 97556, "pid": 76337, "tid": -914061504, "ts": 1716454222785907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222910961, "dur": 90, "args": { "External id": 97578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97578, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97578, "pid": 5, "tid": 7, "ts": 1716454222910961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222785958, "dur": 10, "args": { "External id": 97578, "cbid": 211, "correlation": 97578 } }, { "ph": "s", "id": 97578, "pid": 76337, "tid": -914061504, "ts": 1716454222785958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786057, "dur": 1, "args": { "External id": 97594, "cbid": 251, "correlation": 97594 } }, { "ph": "f", "id": 97594, "pid": 76337, "tid": -914061504, "ts": 1716454222786057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222911052, "dur": 564, "args": { "External id": 97596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97596, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97596, "pid": 5, "tid": 7, "ts": 1716454222911052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786063, "dur": 13, "args": { "External id": 97596, "cbid": 211, "correlation": 97596 } }, { "ph": "s", "id": 97596, "pid": 76337, "tid": -914061504, "ts": 1716454222786063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222911618, "dur": 242, "args": { "External id": 97604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97604, "pid": 5, "tid": 7, "ts": 1716454222911618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786133, "dur": 13, "args": { "External id": 97604, "cbid": 211, "correlation": 97604 } }, { "ph": "s", "id": 97604, "pid": 76337, "tid": -914061504, "ts": 1716454222786133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222911861, "dur": 250, "args": { "External id": 97612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97612, "pid": 5, "tid": 7, "ts": 1716454222911861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786166, "dur": 8, "args": { "External id": 97612, "cbid": 211, "correlation": 97612 } }, { "ph": "s", "id": 97612, "pid": 76337, "tid": -914061504, "ts": 1716454222786166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786249, "dur": 2, "args": { "External id": 97628, "cbid": 251, "correlation": 97628 } }, { "ph": "f", "id": 97628, "pid": 76337, "tid": -914061504, "ts": 1716454222786249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786254, "dur": 0, "args": { "External id": 97630, "cbid": 251, "correlation": 97630 } }, { "ph": "f", "id": 97630, "pid": 76337, "tid": -914061504, "ts": 1716454222786254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222912112, "dur": 355, "args": { "External id": 97631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97631, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 97631, "pid": 5, "tid": 7, "ts": 1716454222912112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786259, "dur": 14, "args": { "External id": 97631, "cbid": 211, "correlation": 97631 } }, { "ph": "s", "id": 97631, "pid": 76337, "tid": -914061504, "ts": 1716454222786259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222912469, "dur": 50, "args": { "External id": 97639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97639, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97639, "pid": 5, "tid": 7, "ts": 1716454222912469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786302, "dur": 10, "args": { "External id": 97639, "cbid": 211, "correlation": 97639 } }, { "ph": "s", "id": 97639, "pid": 76337, "tid": -914061504, "ts": 1716454222786302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222912520, "dur": 155, "args": { "External id": 97650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97650, "pid": 5, "tid": 7, "ts": 1716454222912520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786371, "dur": 165, "args": { "External id": 97650, "cbid": 211, "correlation": 97650 } }, { "ph": "s", "id": 97650, "pid": 76337, "tid": -914061504, "ts": 1716454222786371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222786593, "dur": 0, "args": { "External id": 97662, "cbid": 317, "correlation": 97662 } }, { "ph": "f", "id": 97662, "pid": 76337, "tid": -914061504, "ts": 1716454222786593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222786593, "dur": 0, "args": { "External id": 97663, "cbid": 203, "correlation": 97663 } }, { "ph": "f", "id": 97663, "pid": 76337, "tid": -914061504, "ts": 1716454222786593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222786594, "dur": 0, "args": { "External id": 97664, "cbid": 205, "correlation": 97664 } }, { "ph": "f", "id": 97664, "pid": 76337, "tid": -914061504, "ts": 1716454222786594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786620, "dur": 1, "args": { "External id": 97668, "cbid": 251, "correlation": 97668 } }, { "ph": "f", "id": 97668, "pid": 76337, "tid": -914061504, "ts": 1716454222786620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786622, "dur": 0, "args": { "External id": 97669, "cbid": 251, "correlation": 97669 } }, { "ph": "f", "id": 97669, "pid": 76337, "tid": -914061504, "ts": 1716454222786622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786623, "dur": 0, "args": { "External id": 97670, "cbid": 251, "correlation": 97670 } }, { "ph": "f", "id": 97670, "pid": 76337, "tid": -914061504, "ts": 1716454222786623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786624, "dur": 0, "args": { "External id": 97671, "cbid": 251, "correlation": 97671 } }, { "ph": "f", "id": 97671, "pid": 76337, "tid": -914061504, "ts": 1716454222786624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786625, "dur": 1, "args": { "External id": 97672, "cbid": 251, "correlation": 97672 } }, { "ph": "f", "id": 97672, "pid": 76337, "tid": -914061504, "ts": 1716454222786625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786626, "dur": 0, "args": { "External id": 97673, "cbid": 251, "correlation": 97673 } }, { "ph": "f", "id": 97673, "pid": 76337, "tid": -914061504, "ts": 1716454222786626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786627, "dur": 0, "args": { "External id": 97674, "cbid": 251, "correlation": 97674 } }, { "ph": "f", "id": 97674, "pid": 76337, "tid": -914061504, "ts": 1716454222786627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786628, "dur": 0, "args": { "External id": 97675, "cbid": 251, "correlation": 97675 } }, { "ph": "f", "id": 97675, "pid": 76337, "tid": -914061504, "ts": 1716454222786628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222786629, "dur": 0, "args": { "External id": 97676, "cbid": 251, "correlation": 97676 } }, { "ph": "f", "id": 97676, "pid": 76337, "tid": -914061504, "ts": 1716454222786629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222912676, "dur": 113, "args": { "External id": 97677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97677, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97677, "pid": 5, "tid": 7, "ts": 1716454222912676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786631, "dur": 29, "args": { "External id": 97677, "cbid": 211, "correlation": 97677 } }, { "ph": "s", "id": 97677, "pid": 76337, "tid": -914061504, "ts": 1716454222786631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222912790, "dur": 59, "args": { "External id": 97683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97683, "pid": 5, "tid": 7, "ts": 1716454222912790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786683, "dur": 105, "args": { "External id": 97683, "cbid": 211, "correlation": 97683 } }, { "ph": "s", "id": 97683, "pid": 76337, "tid": -914061504, "ts": 1716454222786683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222912850, "dur": 50, "args": { "External id": 97691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97691, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97691, "pid": 5, "tid": 7, "ts": 1716454222912850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222786811, "dur": 289, "args": { "External id": 97691, "cbid": 211, "correlation": 97691 } }, { "ph": "s", "id": 97691, "pid": 76337, "tid": -914061504, "ts": 1716454222786811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222912902, "dur": 97, "args": { "External id": 97700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97700, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97700, "pid": 5, "tid": 7, "ts": 1716454222912902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787135, "dur": 11, "args": { "External id": 97700, "cbid": 211, "correlation": 97700 } }, { "ph": "s", "id": 97700, "pid": 76337, "tid": -914061504, "ts": 1716454222787135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222913000, "dur": 90, "args": { "External id": 97720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97720, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97720, "pid": 5, "tid": 7, "ts": 1716454222913000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787214, "dur": 11, "args": { "External id": 97720, "cbid": 211, "correlation": 97720 } }, { "ph": "s", "id": 97720, "pid": 76337, "tid": -914061504, "ts": 1716454222787214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222913092, "dur": 4, "args": { "External id": 97732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97732, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97732, "pid": 5, "tid": 7, "ts": 1716454222913092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787235, "dur": 7, "args": { "External id": 97732, "cbid": 211, "correlation": 97732 } }, { "ph": "s", "id": 97732, "pid": 76337, "tid": -914061504, "ts": 1716454222787235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222913097, "dur": 108, "args": { "External id": 97735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97735, "pid": 5, "tid": 7, "ts": 1716454222913097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787254, "dur": 97, "args": { "External id": 97735, "cbid": 211, "correlation": 97735 } }, { "ph": "s", "id": 97735, "pid": 76337, "tid": -914061504, "ts": 1716454222787254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222913206, "dur": 68, "args": { "External id": 97744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97744, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97744, "pid": 5, "tid": 7, "ts": 1716454222913206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787386, "dur": 10, "args": { "External id": 97744, "cbid": 211, "correlation": 97744 } }, { "ph": "s", "id": 97744, "pid": 76337, "tid": -914061504, "ts": 1716454222787386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222787440, "dur": 0, "args": { "External id": 97754, "cbid": 317, "correlation": 97754 } }, { "ph": "f", "id": 97754, "pid": 76337, "tid": -914061504, "ts": 1716454222787440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222787440, "dur": 0, "args": { "External id": 97755, "cbid": 203, "correlation": 97755 } }, { "ph": "f", "id": 97755, "pid": 76337, "tid": -914061504, "ts": 1716454222787440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222787441, "dur": 0, "args": { "External id": 97756, "cbid": 205, "correlation": 97756 } }, { "ph": "f", "id": 97756, "pid": 76337, "tid": -914061504, "ts": 1716454222787441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222913276, "dur": 76, "args": { "External id": 97760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97760, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97760, "pid": 5, "tid": 7, "ts": 1716454222913276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787458, "dur": 13, "args": { "External id": 97760, "cbid": 211, "correlation": 97760 } }, { "ph": "s", "id": 97760, "pid": 76337, "tid": -914061504, "ts": 1716454222787458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222913353, "dur": 23, "args": { "External id": 97762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97762, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97762, "pid": 5, "tid": 7, "ts": 1716454222913353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787473, "dur": 5, "args": { "External id": 97762, "cbid": 211, "correlation": 97762 } }, { "ph": "s", "id": 97762, "pid": 76337, "tid": -914061504, "ts": 1716454222787473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222913378, "dur": 4, "args": { "External id": 97764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97764, "pid": 5, "tid": 7, "ts": 1716454222913378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787485, "dur": 6, "args": { "External id": 97764, "cbid": 211, "correlation": 97764 } }, { "ph": "s", "id": 97764, "pid": 76337, "tid": -914061504, "ts": 1716454222787485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222787494, "dur": 0, "args": { "External id": 97765, "cbid": 51, "correlation": 97765 } }, { "ph": "s", "id": 97765, "pid": 76337, "tid": -914061504, "ts": 1716454222787494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222913383, "dur": 1339, "args": { "External id": 97766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97766, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97766, "pid": 5, "tid": 7, "ts": 1716454222913383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787495, "dur": 6, "args": { "External id": 97766, "cbid": 211, "correlation": 97766 } }, { "ph": "s", "id": 97766, "pid": 76337, "tid": -914061504, "ts": 1716454222787495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222914723, "dur": 58, "args": { "External id": 97771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97771, "pid": 5, "tid": 7, "ts": 1716454222914723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787524, "dur": 9, "args": { "External id": 97771, "cbid": 211, "correlation": 97771 } }, { "ph": "s", "id": 97771, "pid": 76337, "tid": -914061504, "ts": 1716454222787524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222914782, "dur": 3, "args": { "External id": 97779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97779, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97779, "pid": 5, "tid": 7, "ts": 1716454222914782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787567, "dur": 10, "args": { "External id": 97779, "cbid": 211, "correlation": 97779 } }, { "ph": "s", "id": 97779, "pid": 76337, "tid": -914061504, "ts": 1716454222787567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222787637, "dur": 2, "args": { "External id": 97795, "cbid": 251, "correlation": 97795 } }, { "ph": "f", "id": 97795, "pid": 76337, "tid": -914061504, "ts": 1716454222787637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222787643, "dur": 0, "args": { "External id": 97797, "cbid": 251, "correlation": 97797 } }, { "ph": "f", "id": 97797, "pid": 76337, "tid": -914061504, "ts": 1716454222787643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222914787, "dur": 11, "args": { "External id": 97798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97798, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 97798, "pid": 5, "tid": 7, "ts": 1716454222914787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787645, "dur": 12, "args": { "External id": 97798, "cbid": 211, "correlation": 97798 } }, { "ph": "s", "id": 97798, "pid": 76337, "tid": -914061504, "ts": 1716454222787645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222914800, "dur": 5, "args": { "External id": 97800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97800, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 97800, "pid": 5, "tid": 7, "ts": 1716454222914800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787659, "dur": 5, "args": { "External id": 97800, "cbid": 211, "correlation": 97800 } }, { "ph": "s", "id": 97800, "pid": 76337, "tid": -914061504, "ts": 1716454222787659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222914806, "dur": 54, "args": { "External id": 97810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97810, "pid": 5, "tid": 7, "ts": 1716454222914806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222787720, "dur": 526, "args": { "External id": 97810, "cbid": 211, "correlation": 97810 } }, { "ph": "s", "id": 97810, "pid": 76337, "tid": -914061504, "ts": 1716454222787720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222914861, "dur": 51, "args": { "External id": 97830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97830, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97830, "pid": 5, "tid": 7, "ts": 1716454222914861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788303, "dur": 12, "args": { "External id": 97830, "cbid": 211, "correlation": 97830 } }, { "ph": "s", "id": 97830, "pid": 76337, "tid": -914061504, "ts": 1716454222788303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222914914, "dur": 4, "args": { "External id": 97842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97842, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97842, "pid": 5, "tid": 7, "ts": 1716454222914914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788325, "dur": 6, "args": { "External id": 97842, "cbid": 211, "correlation": 97842 } }, { "ph": "s", "id": 97842, "pid": 76337, "tid": -914061504, "ts": 1716454222788325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222914919, "dur": 54, "args": { "External id": 97845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97845, "pid": 5, "tid": 7, "ts": 1716454222914919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788345, "dur": 7, "args": { "External id": 97845, "cbid": 211, "correlation": 97845 } }, { "ph": "s", "id": 97845, "pid": 76337, "tid": -914061504, "ts": 1716454222788345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222914974, "dur": 37, "args": { "External id": 97854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97854, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97854, "pid": 5, "tid": 7, "ts": 1716454222914974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788386, "dur": 10, "args": { "External id": 97854, "cbid": 211, "correlation": 97854 } }, { "ph": "s", "id": 97854, "pid": 76337, "tid": -914061504, "ts": 1716454222788386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222788452, "dur": 0, "args": { "External id": 97864, "cbid": 317, "correlation": 97864 } }, { "ph": "f", "id": 97864, "pid": 76337, "tid": -914061504, "ts": 1716454222788452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222788453, "dur": 0, "args": { "External id": 97865, "cbid": 203, "correlation": 97865 } }, { "ph": "f", "id": 97865, "pid": 76337, "tid": -914061504, "ts": 1716454222788453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222788453, "dur": 0, "args": { "External id": 97866, "cbid": 205, "correlation": 97866 } }, { "ph": "f", "id": 97866, "pid": 76337, "tid": -914061504, "ts": 1716454222788453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222915012, "dur": 40, "args": { "External id": 97870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97870, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97870, "pid": 5, "tid": 7, "ts": 1716454222915012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788469, "dur": 12, "args": { "External id": 97870, "cbid": 211, "correlation": 97870 } }, { "ph": "s", "id": 97870, "pid": 76337, "tid": -914061504, "ts": 1716454222788469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222915053, "dur": 14, "args": { "External id": 97872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97872, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97872, "pid": 5, "tid": 7, "ts": 1716454222915053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788483, "dur": 5, "args": { "External id": 97872, "cbid": 211, "correlation": 97872 } }, { "ph": "s", "id": 97872, "pid": 76337, "tid": -914061504, "ts": 1716454222788483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222915068, "dur": 3, "args": { "External id": 97874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97874, "pid": 5, "tid": 7, "ts": 1716454222915068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788493, "dur": 6, "args": { "External id": 97874, "cbid": 211, "correlation": 97874 } }, { "ph": "s", "id": 97874, "pid": 76337, "tid": -914061504, "ts": 1716454222788493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222788501, "dur": 0, "args": { "External id": 97875, "cbid": 51, "correlation": 97875 } }, { "ph": "s", "id": 97875, "pid": 76337, "tid": -914061504, "ts": 1716454222788501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222915073, "dur": 683, "args": { "External id": 97876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97876, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97876, "pid": 5, "tid": 7, "ts": 1716454222915073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788502, "dur": 5, "args": { "External id": 97876, "cbid": 211, "correlation": 97876 } }, { "ph": "s", "id": 97876, "pid": 76337, "tid": -914061504, "ts": 1716454222788502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222915758, "dur": 59, "args": { "External id": 97881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97881, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97881, "pid": 5, "tid": 7, "ts": 1716454222915758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788530, "dur": 8, "args": { "External id": 97881, "cbid": 211, "correlation": 97881 } }, { "ph": "s", "id": 97881, "pid": 76337, "tid": -914061504, "ts": 1716454222788530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222788587, "dur": 0, "args": { "External id": 97891, "cbid": 317, "correlation": 97891 } }, { "ph": "f", "id": 97891, "pid": 76337, "tid": -914061504, "ts": 1716454222788587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222788588, "dur": 0, "args": { "External id": 97892, "cbid": 203, "correlation": 97892 } }, { "ph": "f", "id": 97892, "pid": 76337, "tid": -914061504, "ts": 1716454222788588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222788588, "dur": 0, "args": { "External id": 97893, "cbid": 205, "correlation": 97893 } }, { "ph": "f", "id": 97893, "pid": 76337, "tid": -914061504, "ts": 1716454222788588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222915817, "dur": 75, "args": { "External id": 97897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97897, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97897, "pid": 5, "tid": 7, "ts": 1716454222915817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788601, "dur": 11, "args": { "External id": 97897, "cbid": 211, "correlation": 97897 } }, { "ph": "s", "id": 97897, "pid": 76337, "tid": -914061504, "ts": 1716454222788601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222915893, "dur": 204, "args": { "External id": 97899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97899, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 97899, "pid": 5, "tid": 7, "ts": 1716454222915893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788621, "dur": 7, "args": { "External id": 97899, "cbid": 211, "correlation": 97899 } }, { "ph": "s", "id": 97899, "pid": 76337, "tid": -914061504, "ts": 1716454222788621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222916099, "dur": 40, "args": { "External id": 97901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97901, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97901, "pid": 5, "tid": 7, "ts": 1716454222916099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788633, "dur": 6, "args": { "External id": 97901, "cbid": 211, "correlation": 97901 } }, { "ph": "s", "id": 97901, "pid": 76337, "tid": -914061504, "ts": 1716454222788633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222916140, "dur": 58, "args": { "External id": 97907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97907, "pid": 5, "tid": 7, "ts": 1716454222916140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222788660, "dur": 491, "args": { "External id": 97907, "cbid": 211, "correlation": 97907 } }, { "ph": "s", "id": 97907, "pid": 76337, "tid": -914061504, "ts": 1716454222788660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222916199, "dur": 50, "args": { "External id": 97915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97915, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97915, "pid": 5, "tid": 7, "ts": 1716454222916199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789174, "dur": 9, "args": { "External id": 97915, "cbid": 211, "correlation": 97915 } }, { "ph": "s", "id": 97915, "pid": 76337, "tid": -914061504, "ts": 1716454222789174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222916250, "dur": 35, "args": { "External id": 97923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97923, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97923, "pid": 5, "tid": 7, "ts": 1716454222916250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789207, "dur": 10, "args": { "External id": 97923, "cbid": 211, "correlation": 97923 } }, { "ph": "s", "id": 97923, "pid": 76337, "tid": -914061504, "ts": 1716454222789207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222916287, "dur": 52, "args": { "External id": 97943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97943, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 97943, "pid": 5, "tid": 7, "ts": 1716454222916287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789293, "dur": 12, "args": { "External id": 97943, "cbid": 211, "correlation": 97943 } }, { "ph": "s", "id": 97943, "pid": 76337, "tid": -914061504, "ts": 1716454222789293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222916340, "dur": 4, "args": { "External id": 97955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97955, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 97955, "pid": 5, "tid": 7, "ts": 1716454222916340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789315, "dur": 7, "args": { "External id": 97955, "cbid": 211, "correlation": 97955 } }, { "ph": "s", "id": 97955, "pid": 76337, "tid": -914061504, "ts": 1716454222789315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222916346, "dur": 54, "args": { "External id": 97958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97958, "pid": 5, "tid": 7, "ts": 1716454222916346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789334, "dur": 7, "args": { "External id": 97958, "cbid": 211, "correlation": 97958 } }, { "ph": "s", "id": 97958, "pid": 76337, "tid": -914061504, "ts": 1716454222789334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222789391, "dur": 0, "args": { "External id": 97969, "cbid": 317, "correlation": 97969 } }, { "ph": "f", "id": 97969, "pid": 76337, "tid": -914061504, "ts": 1716454222789391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222789392, "dur": 0, "args": { "External id": 97970, "cbid": 203, "correlation": 97970 } }, { "ph": "f", "id": 97970, "pid": 76337, "tid": -914061504, "ts": 1716454222789392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222789393, "dur": 0, "args": { "External id": 97971, "cbid": 205, "correlation": 97971 } }, { "ph": "f", "id": 97971, "pid": 76337, "tid": -914061504, "ts": 1716454222789393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789418, "dur": 1, "args": { "External id": 97975, "cbid": 251, "correlation": 97975 } }, { "ph": "f", "id": 97975, "pid": 76337, "tid": -914061504, "ts": 1716454222789418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789420, "dur": 0, "args": { "External id": 97976, "cbid": 251, "correlation": 97976 } }, { "ph": "f", "id": 97976, "pid": 76337, "tid": -914061504, "ts": 1716454222789420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789422, "dur": 0, "args": { "External id": 97977, "cbid": 251, "correlation": 97977 } }, { "ph": "f", "id": 97977, "pid": 76337, "tid": -914061504, "ts": 1716454222789422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789423, "dur": 0, "args": { "External id": 97978, "cbid": 251, "correlation": 97978 } }, { "ph": "f", "id": 97978, "pid": 76337, "tid": -914061504, "ts": 1716454222789423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789424, "dur": 0, "args": { "External id": 97979, "cbid": 251, "correlation": 97979 } }, { "ph": "f", "id": 97979, "pid": 76337, "tid": -914061504, "ts": 1716454222789424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789424, "dur": 0, "args": { "External id": 97980, "cbid": 251, "correlation": 97980 } }, { "ph": "f", "id": 97980, "pid": 76337, "tid": -914061504, "ts": 1716454222789424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789426, "dur": 0, "args": { "External id": 97981, "cbid": 251, "correlation": 97981 } }, { "ph": "f", "id": 97981, "pid": 76337, "tid": -914061504, "ts": 1716454222789426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789427, "dur": 0, "args": { "External id": 97982, "cbid": 251, "correlation": 97982 } }, { "ph": "f", "id": 97982, "pid": 76337, "tid": -914061504, "ts": 1716454222789427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789428, "dur": 0, "args": { "External id": 97983, "cbid": 251, "correlation": 97983 } }, { "ph": "f", "id": 97983, "pid": 76337, "tid": -914061504, "ts": 1716454222789428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222916401, "dur": 112, "args": { "External id": 97984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97984, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 97984, "pid": 5, "tid": 7, "ts": 1716454222916401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789431, "dur": 14, "args": { "External id": 97984, "cbid": 211, "correlation": 97984 } }, { "ph": "s", "id": 97984, "pid": 76337, "tid": -914061504, "ts": 1716454222789431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222916515, "dur": 59, "args": { "External id": 97990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97990, "pid": 5, "tid": 7, "ts": 1716454222916515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789467, "dur": 9, "args": { "External id": 97990, "cbid": 211, "correlation": 97990 } }, { "ph": "s", "id": 97990, "pid": 76337, "tid": -914061504, "ts": 1716454222789467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222916575, "dur": 607, "args": { "External id": 97999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 97999, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 97999, "pid": 5, "tid": 7, "ts": 1716454222916575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789557, "dur": 15, "args": { "External id": 97999, "cbid": 211, "correlation": 97999 } }, { "ph": "s", "id": 97999, "pid": 76337, "tid": -914061504, "ts": 1716454222789557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222917183, "dur": 177, "args": { "External id": 98021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98021, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98021, "pid": 5, "tid": 7, "ts": 1716454222917183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789622, "dur": 11, "args": { "External id": 98021, "cbid": 211, "correlation": 98021 } }, { "ph": "s", "id": 98021, "pid": 76337, "tid": -914061504, "ts": 1716454222789622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789722, "dur": 1, "args": { "External id": 98032, "cbid": 251, "correlation": 98032 } }, { "ph": "f", "id": 98032, "pid": 76337, "tid": -914061504, "ts": 1716454222789722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222917361, "dur": 194, "args": { "External id": 98033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98033, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98033, "pid": 5, "tid": 7, "ts": 1716454222917361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789727, "dur": 13, "args": { "External id": 98033, "cbid": 211, "correlation": 98033 } }, { "ph": "s", "id": 98033, "pid": 76337, "tid": -914061504, "ts": 1716454222789727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789797, "dur": 1, "args": { "External id": 98044, "cbid": 251, "correlation": 98044 } }, { "ph": "f", "id": 98044, "pid": 76337, "tid": -914061504, "ts": 1716454222789797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222917556, "dur": 184, "args": { "External id": 98045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98045, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98045, "pid": 5, "tid": 7, "ts": 1716454222917556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789801, "dur": 11, "args": { "External id": 98045, "cbid": 211, "correlation": 98045 } }, { "ph": "s", "id": 98045, "pid": 76337, "tid": -914061504, "ts": 1716454222789801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222789864, "dur": 1, "args": { "External id": 98056, "cbid": 251, "correlation": 98056 } }, { "ph": "f", "id": 98056, "pid": 76337, "tid": -914061504, "ts": 1716454222789864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222917742, "dur": 185, "args": { "External id": 98057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98057, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98057, "pid": 5, "tid": 7, "ts": 1716454222917742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789868, "dur": 11, "args": { "External id": 98057, "cbid": 211, "correlation": 98057 } }, { "ph": "s", "id": 98057, "pid": 76337, "tid": -914061504, "ts": 1716454222789868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222917928, "dur": 18122, "args": { "External id": 98078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98078, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 98078, "pid": 5, "tid": 7, "ts": 1716454222917928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222789961, "dur": 20, "args": { "External id": 98078, "cbid": 211, "correlation": 98078 } }, { "ph": "s", "id": 98078, "pid": 76337, "tid": -914061504, "ts": 1716454222789961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222790079, "dur": 2, "args": { "External id": 98096, "cbid": 251, "correlation": 98096 } }, { "ph": "f", "id": 98096, "pid": 76337, "tid": -914061504, "ts": 1716454222790079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222936051, "dur": 200, "args": { "External id": 98098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98098, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98098, "pid": 5, "tid": 7, "ts": 1716454222936051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222790085, "dur": 13, "args": { "External id": 98098, "cbid": 211, "correlation": 98098 } }, { "ph": "s", "id": 98098, "pid": 76337, "tid": -914061504, "ts": 1716454222790085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222936252, "dur": 66, "args": { "External id": 98106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98106, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98106, "pid": 5, "tid": 7, "ts": 1716454222936252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222790159, "dur": 13, "args": { "External id": 98106, "cbid": 211, "correlation": 98106 } }, { "ph": "s", "id": 98106, "pid": 76337, "tid": -914061504, "ts": 1716454222790159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222936319, "dur": 97, "args": { "External id": 98114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98114, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98114, "pid": 5, "tid": 7, "ts": 1716454222936319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222790200, "dur": 26, "args": { "External id": 98114, "cbid": 211, "correlation": 98114 } }, { "ph": "s", "id": 98114, "pid": 76337, "tid": -914061504, "ts": 1716454222790200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222936418, "dur": 54, "args": { "External id": 98125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98125, "pid": 5, "tid": 7, "ts": 1716454222936418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222790298, "dur": 1838, "args": { "External id": 98125, "cbid": 211, "correlation": 98125 } }, { "ph": "s", "id": 98125, "pid": 76337, "tid": -914061504, "ts": 1716454222790298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222936472, "dur": 90, "args": { "External id": 98147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98147, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98147, "pid": 5, "tid": 7, "ts": 1716454222936472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792157, "dur": 124, "args": { "External id": 98147, "cbid": 211, "correlation": 98147 } }, { "ph": "s", "id": 98147, "pid": 76337, "tid": -914061504, "ts": 1716454222792157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792359, "dur": 1, "args": { "External id": 98158, "cbid": 251, "correlation": 98158 } }, { "ph": "f", "id": 98158, "pid": 76337, "tid": -914061504, "ts": 1716454222792359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222936564, "dur": 101, "args": { "External id": 98159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98159, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98159, "pid": 5, "tid": 7, "ts": 1716454222936564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792364, "dur": 13, "args": { "External id": 98159, "cbid": 211, "correlation": 98159 } }, { "ph": "s", "id": 98159, "pid": 76337, "tid": -914061504, "ts": 1716454222792364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792445, "dur": 1, "args": { "External id": 98170, "cbid": 251, "correlation": 98170 } }, { "ph": "f", "id": 98170, "pid": 76337, "tid": -914061504, "ts": 1716454222792445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792449, "dur": 0, "args": { "External id": 98171, "cbid": 251, "correlation": 98171 } }, { "ph": "f", "id": 98171, "pid": 76337, "tid": -914061504, "ts": 1716454222792449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222936666, "dur": 10, "args": { "External id": 98172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98172, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98172, "pid": 5, "tid": 7, "ts": 1716454222936666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792451, "dur": 13, "args": { "External id": 98172, "cbid": 211, "correlation": 98172 } }, { "ph": "s", "id": 98172, "pid": 76337, "tid": -914061504, "ts": 1716454222792451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222936677, "dur": 5, "args": { "External id": 98174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98174, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 98174, "pid": 5, "tid": 7, "ts": 1716454222936677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792466, "dur": 7, "args": { "External id": 98174, "cbid": 211, "correlation": 98174 } }, { "ph": "s", "id": 98174, "pid": 76337, "tid": -914061504, "ts": 1716454222792466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792529, "dur": 1, "args": { "External id": 98185, "cbid": 251, "correlation": 98185 } }, { "ph": "f", "id": 98185, "pid": 76337, "tid": -914061504, "ts": 1716454222792529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792532, "dur": 0, "args": { "External id": 98186, "cbid": 251, "correlation": 98186 } }, { "ph": "f", "id": 98186, "pid": 76337, "tid": -914061504, "ts": 1716454222792532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222936684, "dur": 6, "args": { "External id": 98187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98187, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98187, "pid": 5, "tid": 7, "ts": 1716454222936684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792534, "dur": 12, "args": { "External id": 98187, "cbid": 211, "correlation": 98187 } }, { "ph": "s", "id": 98187, "pid": 76337, "tid": -914061504, "ts": 1716454222792534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222936691, "dur": 3, "args": { "External id": 98189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98189, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 98189, "pid": 5, "tid": 7, "ts": 1716454222936691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792547, "dur": 6, "args": { "External id": 98189, "cbid": 211, "correlation": 98189 } }, { "ph": "s", "id": 98189, "pid": 76337, "tid": -914061504, "ts": 1716454222792547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222936696, "dur": 152, "args": { "External id": 98210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98210, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 98210, "pid": 5, "tid": 7, "ts": 1716454222936696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792623, "dur": 12, "args": { "External id": 98210, "cbid": 211, "correlation": 98210 } }, { "ph": "s", "id": 98210, "pid": 76337, "tid": -914061504, "ts": 1716454222792623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792721, "dur": 1, "args": { "External id": 98228, "cbid": 251, "correlation": 98228 } }, { "ph": "f", "id": 98228, "pid": 76337, "tid": -914061504, "ts": 1716454222792721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222936850, "dur": 104, "args": { "External id": 98230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98230, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 98230, "pid": 5, "tid": 7, "ts": 1716454222936850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792728, "dur": 14, "args": { "External id": 98230, "cbid": 211, "correlation": 98230 } }, { "ph": "s", "id": 98230, "pid": 76337, "tid": -914061504, "ts": 1716454222792728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222936955, "dur": 35, "args": { "External id": 98238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98238, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98238, "pid": 5, "tid": 7, "ts": 1716454222936955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792798, "dur": 12, "args": { "External id": 98238, "cbid": 211, "correlation": 98238 } }, { "ph": "s", "id": 98238, "pid": 76337, "tid": -914061504, "ts": 1716454222792798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222936991, "dur": 67, "args": { "External id": 98246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98246, "pid": 5, "tid": 7, "ts": 1716454222936991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792839, "dur": 10, "args": { "External id": 98246, "cbid": 211, "correlation": 98246 } }, { "ph": "s", "id": 98246, "pid": 76337, "tid": -914061504, "ts": 1716454222792839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222937059, "dur": 90, "args": { "External id": 98268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98268, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98268, "pid": 5, "tid": 7, "ts": 1716454222937059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792891, "dur": 10, "args": { "External id": 98268, "cbid": 211, "correlation": 98268 } }, { "ph": "s", "id": 98268, "pid": 76337, "tid": -914061504, "ts": 1716454222792891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222792987, "dur": 1, "args": { "External id": 98284, "cbid": 251, "correlation": 98284 } }, { "ph": "f", "id": 98284, "pid": 76337, "tid": -914061504, "ts": 1716454222792987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222937151, "dur": 562, "args": { "External id": 98286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98286, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 98286, "pid": 5, "tid": 7, "ts": 1716454222937151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222792993, "dur": 13, "args": { "External id": 98286, "cbid": 211, "correlation": 98286 } }, { "ph": "s", "id": 98286, "pid": 76337, "tid": -914061504, "ts": 1716454222792993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222937714, "dur": 240, "args": { "External id": 98294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98294, "pid": 5, "tid": 7, "ts": 1716454222937714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793069, "dur": 15, "args": { "External id": 98294, "cbid": 211, "correlation": 98294 } }, { "ph": "s", "id": 98294, "pid": 76337, "tid": -914061504, "ts": 1716454222793069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222937955, "dur": 251, "args": { "External id": 98302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98302, "pid": 5, "tid": 7, "ts": 1716454222937955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793106, "dur": 9, "args": { "External id": 98302, "cbid": 211, "correlation": 98302 } }, { "ph": "s", "id": 98302, "pid": 76337, "tid": -914061504, "ts": 1716454222793106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793189, "dur": 1, "args": { "External id": 98318, "cbid": 251, "correlation": 98318 } }, { "ph": "f", "id": 98318, "pid": 76337, "tid": -914061504, "ts": 1716454222793189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793195, "dur": 0, "args": { "External id": 98320, "cbid": 251, "correlation": 98320 } }, { "ph": "f", "id": 98320, "pid": 76337, "tid": -914061504, "ts": 1716454222793195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222938208, "dur": 359, "args": { "External id": 98321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98321, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98321, "pid": 5, "tid": 7, "ts": 1716454222938208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793199, "dur": 13, "args": { "External id": 98321, "cbid": 211, "correlation": 98321 } }, { "ph": "s", "id": 98321, "pid": 76337, "tid": -914061504, "ts": 1716454222793199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222938568, "dur": 50, "args": { "External id": 98329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98329, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98329, "pid": 5, "tid": 7, "ts": 1716454222938568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793242, "dur": 115, "args": { "External id": 98329, "cbid": 211, "correlation": 98329 } }, { "ph": "s", "id": 98329, "pid": 76337, "tid": -914061504, "ts": 1716454222793242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222938619, "dur": 155, "args": { "External id": 98340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98340, "pid": 5, "tid": 7, "ts": 1716454222938619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793419, "dur": 66, "args": { "External id": 98340, "cbid": 211, "correlation": 98340 } }, { "ph": "s", "id": 98340, "pid": 76337, "tid": -914061504, "ts": 1716454222793419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222793541, "dur": 0, "args": { "External id": 98352, "cbid": 317, "correlation": 98352 } }, { "ph": "f", "id": 98352, "pid": 76337, "tid": -914061504, "ts": 1716454222793541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222793542, "dur": 0, "args": { "External id": 98353, "cbid": 203, "correlation": 98353 } }, { "ph": "f", "id": 98353, "pid": 76337, "tid": -914061504, "ts": 1716454222793542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222793543, "dur": 0, "args": { "External id": 98354, "cbid": 205, "correlation": 98354 } }, { "ph": "f", "id": 98354, "pid": 76337, "tid": -914061504, "ts": 1716454222793543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793570, "dur": 1, "args": { "External id": 98358, "cbid": 251, "correlation": 98358 } }, { "ph": "f", "id": 98358, "pid": 76337, "tid": -914061504, "ts": 1716454222793570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793572, "dur": 0, "args": { "External id": 98359, "cbid": 251, "correlation": 98359 } }, { "ph": "f", "id": 98359, "pid": 76337, "tid": -914061504, "ts": 1716454222793572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793573, "dur": 0, "args": { "External id": 98360, "cbid": 251, "correlation": 98360 } }, { "ph": "f", "id": 98360, "pid": 76337, "tid": -914061504, "ts": 1716454222793573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793574, "dur": 0, "args": { "External id": 98361, "cbid": 251, "correlation": 98361 } }, { "ph": "f", "id": 98361, "pid": 76337, "tid": -914061504, "ts": 1716454222793574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793575, "dur": 1, "args": { "External id": 98362, "cbid": 251, "correlation": 98362 } }, { "ph": "f", "id": 98362, "pid": 76337, "tid": -914061504, "ts": 1716454222793575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793576, "dur": 0, "args": { "External id": 98363, "cbid": 251, "correlation": 98363 } }, { "ph": "f", "id": 98363, "pid": 76337, "tid": -914061504, "ts": 1716454222793576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793577, "dur": 0, "args": { "External id": 98364, "cbid": 251, "correlation": 98364 } }, { "ph": "f", "id": 98364, "pid": 76337, "tid": -914061504, "ts": 1716454222793577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793578, "dur": 0, "args": { "External id": 98365, "cbid": 251, "correlation": 98365 } }, { "ph": "f", "id": 98365, "pid": 76337, "tid": -914061504, "ts": 1716454222793578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222793579, "dur": 0, "args": { "External id": 98366, "cbid": 251, "correlation": 98366 } }, { "ph": "f", "id": 98366, "pid": 76337, "tid": -914061504, "ts": 1716454222793579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222938776, "dur": 114, "args": { "External id": 98367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98367, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 98367, "pid": 5, "tid": 7, "ts": 1716454222938776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793582, "dur": 31, "args": { "External id": 98367, "cbid": 211, "correlation": 98367 } }, { "ph": "s", "id": 98367, "pid": 76337, "tid": -914061504, "ts": 1716454222793582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222938891, "dur": 59, "args": { "External id": 98373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98373, "pid": 5, "tid": 7, "ts": 1716454222938891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793635, "dur": 276, "args": { "External id": 98373, "cbid": 211, "correlation": 98373 } }, { "ph": "s", "id": 98373, "pid": 76337, "tid": -914061504, "ts": 1716454222793635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222938952, "dur": 50, "args": { "External id": 98381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98381, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98381, "pid": 5, "tid": 7, "ts": 1716454222938952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222793935, "dur": 9, "args": { "External id": 98381, "cbid": 211, "correlation": 98381 } }, { "ph": "s", "id": 98381, "pid": 76337, "tid": -914061504, "ts": 1716454222793935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222939003, "dur": 51, "args": { "External id": 98401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98401, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 98401, "pid": 5, "tid": 7, "ts": 1716454222939003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794030, "dur": 13, "args": { "External id": 98401, "cbid": 211, "correlation": 98401 } }, { "ph": "s", "id": 98401, "pid": 76337, "tid": -914061504, "ts": 1716454222794030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222939056, "dur": 4, "args": { "External id": 98413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98413, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 98413, "pid": 5, "tid": 7, "ts": 1716454222939056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794053, "dur": 6, "args": { "External id": 98413, "cbid": 211, "correlation": 98413 } }, { "ph": "s", "id": 98413, "pid": 76337, "tid": -914061504, "ts": 1716454222794053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222939061, "dur": 56, "args": { "External id": 98416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98416, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98416, "pid": 5, "tid": 7, "ts": 1716454222939061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794072, "dur": 93, "args": { "External id": 98416, "cbid": 211, "correlation": 98416 } }, { "ph": "s", "id": 98416, "pid": 76337, "tid": -914061504, "ts": 1716454222794072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939118, "dur": 37, "args": { "External id": 98425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98425, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98425, "pid": 5, "tid": 7, "ts": 1716454222939118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794207, "dur": 11, "args": { "External id": 98425, "cbid": 211, "correlation": 98425 } }, { "ph": "s", "id": 98425, "pid": 76337, "tid": -914061504, "ts": 1716454222794207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222794264, "dur": 0, "args": { "External id": 98435, "cbid": 317, "correlation": 98435 } }, { "ph": "f", "id": 98435, "pid": 76337, "tid": -914061504, "ts": 1716454222794264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222794264, "dur": 0, "args": { "External id": 98436, "cbid": 203, "correlation": 98436 } }, { "ph": "f", "id": 98436, "pid": 76337, "tid": -914061504, "ts": 1716454222794264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222794265, "dur": 0, "args": { "External id": 98437, "cbid": 205, "correlation": 98437 } }, { "ph": "f", "id": 98437, "pid": 76337, "tid": -914061504, "ts": 1716454222794265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222939156, "dur": 41, "args": { "External id": 98441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98441, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98441, "pid": 5, "tid": 7, "ts": 1716454222939156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794284, "dur": 12, "args": { "External id": 98441, "cbid": 211, "correlation": 98441 } }, { "ph": "s", "id": 98441, "pid": 76337, "tid": -914061504, "ts": 1716454222794284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222939199, "dur": 3, "args": { "External id": 98443, "device": 5, "context": 1, "stream": 7, "correlation": 98443, "bytes": 46080, "memory bandwidth (GB/s)": 12.413793103448276 } }, { "ph": "f", "id": 98443, "pid": 5, "tid": 7, "ts": 1716454222939199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222794300, "dur": 23, "args": { "External id": 98443, "cbid": 51, "correlation": 98443 } }, { "ph": "s", "id": 98443, "pid": 76337, "tid": -914061504, "ts": 1716454222794300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222794328, "dur": 2, "args": { "External id": 98445, "cbid": 200, "correlation": 98445 } }, { "ph": "f", "id": 98445, "pid": 76337, "tid": -914061504, "ts": 1716454222794328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "dur": 0, "args": { "External id": 98446, "cbid": 200, "correlation": 98446 } }, { "ph": "f", "id": 98446, "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "dur": 0, "args": { "External id": 98447, "cbid": 200, "correlation": 98447 } }, { "ph": "f", "id": 98447, "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "dur": 0, "args": { "External id": 98448, "cbid": 200, "correlation": 98448 } }, { "ph": "f", "id": 98448, "pid": 76337, "tid": -914061504, "ts": 1716454222794331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454222794333, "dur": 4, "args": { "External id": 98449, "cbid": 15, "correlation": 98449 } }, { "ph": "f", "id": 98449, "pid": 76337, "tid": -914061504, "ts": 1716454222794333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222794337, "dur": 1, "args": { "External id": 98450, "cbid": 251, "correlation": 98450 } }, { "ph": "f", "id": 98450, "pid": 76337, "tid": -914061504, "ts": 1716454222794337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454222939203, "dur": 25, "args": { "External id": 98451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98451, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98451, "pid": 5, "tid": 7, "ts": 1716454222939203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794341, "dur": 8, "args": { "External id": 98451, "cbid": 211, "correlation": 98451 } }, { "ph": "s", "id": 98451, "pid": 76337, "tid": -914061504, "ts": 1716454222794341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222939230, "dur": 4, "args": { "External id": 98453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 98453, "pid": 5, "tid": 7, "ts": 1716454222939230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794356, "dur": 6, "args": { "External id": 98453, "cbid": 211, "correlation": 98453 } }, { "ph": "s", "id": 98453, "pid": 76337, "tid": -914061504, "ts": 1716454222794356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222794367, "dur": 0, "args": { "External id": 98454, "cbid": 51, "correlation": 98454 } }, { "ph": "s", "id": 98454, "pid": 76337, "tid": -914061504, "ts": 1716454222794367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222939235, "dur": 185, "args": { "External id": 98455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98455, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98455, "pid": 5, "tid": 7, "ts": 1716454222939235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794368, "dur": 184, "args": { "External id": 98455, "cbid": 211, "correlation": 98455 } }, { "ph": "s", "id": 98455, "pid": 76337, "tid": -914061504, "ts": 1716454222794368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222939421, "dur": 6, "args": { "External id": 98456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98456, "pid": 5, "tid": 7, "ts": 1716454222939421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794556, "dur": 6, "args": { "External id": 98456, "cbid": 211, "correlation": 98456 } }, { "ph": "s", "id": 98456, "pid": 76337, "tid": -914061504, "ts": 1716454222794556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222939428, "dur": 5, "args": { "External id": 98462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 98462, "pid": 5, "tid": 7, "ts": 1716454222939428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222794586, "dur": 9, "args": { "External id": 98462, "cbid": 211, "correlation": 98462 } }, { "ph": "s", "id": 98462, "pid": 76337, "tid": -914061504, "ts": 1716454222794586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939434, "dur": 3, "args": { "External id": 98470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98470, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98470, "pid": 5, "tid": 7, "ts": 1716454222939434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796327, "dur": 16, "args": { "External id": 98470, "cbid": 211, "correlation": 98470 } }, { "ph": "s", "id": 98470, "pid": 76337, "tid": -914061504, "ts": 1716454222796327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939438, "dur": 3, "args": { "External id": 98478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98478, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98478, "pid": 5, "tid": 7, "ts": 1716454222939438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796373, "dur": 10, "args": { "External id": 98478, "cbid": 211, "correlation": 98478 } }, { "ph": "s", "id": 98478, "pid": 76337, "tid": -914061504, "ts": 1716454222796373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939443, "dur": 3, "args": { "External id": 98486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98486, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98486, "pid": 5, "tid": 7, "ts": 1716454222939443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796400, "dur": 9, "args": { "External id": 98486, "cbid": 211, "correlation": 98486 } }, { "ph": "s", "id": 98486, "pid": 76337, "tid": -914061504, "ts": 1716454222796400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939447, "dur": 3, "args": { "External id": 98495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98495, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98495, "pid": 5, "tid": 7, "ts": 1716454222939447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796583, "dur": 14, "args": { "External id": 98495, "cbid": 211, "correlation": 98495 } }, { "ph": "s", "id": 98495, "pid": 76337, "tid": -914061504, "ts": 1716454222796583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939450, "dur": 3, "args": { "External id": 98504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98504, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98504, "pid": 5, "tid": 7, "ts": 1716454222939450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796613, "dur": 7, "args": { "External id": 98504, "cbid": 211, "correlation": 98504 } }, { "ph": "s", "id": 98504, "pid": 76337, "tid": -914061504, "ts": 1716454222796613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939455, "dur": 3, "args": { "External id": 98512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98512, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98512, "pid": 5, "tid": 7, "ts": 1716454222939455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796641, "dur": 9, "args": { "External id": 98512, "cbid": 211, "correlation": 98512 } }, { "ph": "s", "id": 98512, "pid": 76337, "tid": -914061504, "ts": 1716454222796641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939459, "dur": 3, "args": { "External id": 98520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98520, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98520, "pid": 5, "tid": 7, "ts": 1716454222939459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796903, "dur": 15, "args": { "External id": 98520, "cbid": 211, "correlation": 98520 } }, { "ph": "s", "id": 98520, "pid": 76337, "tid": -914061504, "ts": 1716454222796903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939463, "dur": 3, "args": { "External id": 98528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98528, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98528, "pid": 5, "tid": 7, "ts": 1716454222939463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222796935, "dur": 8, "args": { "External id": 98528, "cbid": 211, "correlation": 98528 } }, { "ph": "s", "id": 98528, "pid": 76337, "tid": -914061504, "ts": 1716454222796935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222939468, "dur": 1, "args": { "External id": 98538, "device": 5, "context": 1, "stream": 7, "correlation": 98538, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 98538, "pid": 5, "tid": 7, "ts": 1716454222939468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222797010, "dur": 36, "args": { "External id": 98538, "cbid": 41, "correlation": 98538 } }, { "ph": "s", "id": 98538, "pid": 76337, "tid": -914061504, "ts": 1716454222797010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222797047, "dur": 142451, "args": { "External id": 98539, "cbid": 131, "correlation": 98539 } }, { "ph": "f", "id": 98539, "pid": 76337, "tid": -914061504, "ts": 1716454222797047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222939931, "dur": 3, "args": { "External id": 98547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98547, "pid": 5, "tid": 7, "ts": 1716454222939931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222939862, "dur": 75, "args": { "External id": 98547, "cbid": 211, "correlation": 98547 } }, { "ph": "s", "id": 98547, "pid": 76337, "tid": -914061504, "ts": 1716454222939862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940075, "dur": 3, "args": { "External id": 98556, "device": 5, "context": 1, "stream": 7, "correlation": 98556, "bytes": 8, "memory bandwidth (GB/s)": 0.0024279210925644916 } }, { "ph": "f", "id": 98556, "pid": 5, "tid": 7, "ts": 1716454222940075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940026, "dur": 50, "args": { "External id": 98556, "cbid": 41, "correlation": 98556 } }, { "ph": "s", "id": 98556, "pid": 76337, "tid": -914061504, "ts": 1716454222940026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222940193, "dur": 4, "args": { "External id": 98566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98566, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98566, "pid": 5, "tid": 7, "ts": 1716454222940193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940175, "dur": 19, "args": { "External id": 98566, "cbid": 211, "correlation": 98566 } }, { "ph": "s", "id": 98566, "pid": 76337, "tid": -914061504, "ts": 1716454222940175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940301, "dur": 1, "args": { "External id": 98576, "device": 5, "context": 1, "stream": 7, "correlation": 98576, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 98576, "pid": 5, "tid": 7, "ts": 1716454222940301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940278, "dur": 21, "args": { "External id": 98576, "cbid": 41, "correlation": 98576 } }, { "ph": "s", "id": 98576, "pid": 76337, "tid": -914061504, "ts": 1716454222940278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222940300, "dur": 7, "args": { "External id": 98577, "cbid": 131, "correlation": 98577 } }, { "ph": "f", "id": 98577, "pid": 76337, "tid": -914061504, "ts": 1716454222940300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940384, "dur": 3, "args": { "External id": 98584, "device": 5, "context": 1, "stream": 7, "correlation": 98584, "bytes": 98304, "memory bandwidth (GB/s)": 31.03030303030303 } }, { "ph": "f", "id": 98584, "pid": 5, "tid": 7, "ts": 1716454222940384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940361, "dur": 23, "args": { "External id": 98584, "cbid": 41, "correlation": 98584 } }, { "ph": "s", "id": 98584, "pid": 76337, "tid": -914061504, "ts": 1716454222940361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940484, "dur": 2, "args": { "External id": 98603, "device": 5, "context": 1, "stream": 7, "correlation": 98603, "bytes": 16, "memory bandwidth (GB/s)": 0.005376344086021506 } }, { "ph": "f", "id": 98603, "pid": 5, "tid": 7, "ts": 1716454222940484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940464, "dur": 19, "args": { "External id": 98603, "cbid": 41, "correlation": 98603 } }, { "ph": "s", "id": 98603, "pid": 76337, "tid": -914061504, "ts": 1716454222940464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222940526, "dur": 3, "args": { "External id": 98609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98609, "pid": 5, "tid": 7, "ts": 1716454222940526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940513, "dur": 14, "args": { "External id": 98609, "cbid": 211, "correlation": 98609 } }, { "ph": "s", "id": 98609, "pid": 76337, "tid": -914061504, "ts": 1716454222940513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454222940543, "dur": 6, "args": { "External id": 98611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98611, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 98611, "pid": 5, "tid": 7, "ts": 1716454222940543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940532, "dur": 10, "args": { "External id": 98611, "cbid": 211, "correlation": 98611 } }, { "ph": "s", "id": 98611, "pid": 76337, "tid": -914061504, "ts": 1716454222940532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454222940552, "dur": 3, "args": { "External id": 98613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98613, "pid": 5, "tid": 7, "ts": 1716454222940552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940544, "dur": 7, "args": { "External id": 98613, "cbid": 211, "correlation": 98613 } }, { "ph": "s", "id": 98613, "pid": 76337, "tid": -914061504, "ts": 1716454222940544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940588, "dur": 2, "args": { "External id": 98621, "device": 5, "context": 1, "stream": 7, "correlation": 98621, "bytes": 8, "memory bandwidth (GB/s)": 0.002808002808002808 } }, { "ph": "f", "id": 98621, "pid": 5, "tid": 7, "ts": 1716454222940588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940572, "dur": 15, "args": { "External id": 98621, "cbid": 41, "correlation": 98621 } }, { "ph": "s", "id": 98621, "pid": 76337, "tid": -914061504, "ts": 1716454222940572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222940642, "dur": 3, "args": { "External id": 98635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98635, "pid": 5, "tid": 7, "ts": 1716454222940642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940630, "dur": 13, "args": { "External id": 98635, "cbid": 211, "correlation": 98635 } }, { "ph": "s", "id": 98635, "pid": 76337, "tid": -914061504, "ts": 1716454222940630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222940664, "dur": 2, "args": { "External id": 98649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98649, "pid": 5, "tid": 7, "ts": 1716454222940664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940655, "dur": 8, "args": { "External id": 98649, "cbid": 211, "correlation": 98649 } }, { "ph": "s", "id": 98649, "pid": 76337, "tid": -914061504, "ts": 1716454222940655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222940703, "dur": 6, "args": { "External id": 98656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98656, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98656, "pid": 5, "tid": 7, "ts": 1716454222940703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940692, "dur": 11, "args": { "External id": 98656, "cbid": 211, "correlation": 98656 } }, { "ph": "s", "id": 98656, "pid": 76337, "tid": -914061504, "ts": 1716454222940692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454222940714, "dur": 6, "args": { "External id": 98659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98659, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98659, "pid": 5, "tid": 7, "ts": 1716454222940714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940706, "dur": 7, "args": { "External id": 98659, "cbid": 211, "correlation": 98659 } }, { "ph": "s", "id": 98659, "pid": 76337, "tid": -914061504, "ts": 1716454222940706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454222940723, "dur": 3, "args": { "External id": 98661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98661, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98661, "pid": 5, "tid": 7, "ts": 1716454222940723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940715, "dur": 8, "args": { "External id": 98661, "cbid": 211, "correlation": 98661 } }, { "ph": "s", "id": 98661, "pid": 76337, "tid": -914061504, "ts": 1716454222940715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940745, "dur": 2, "args": { "External id": 98664, "device": 5, "context": 1, "stream": 7, "correlation": 98664, "bytes": 8, "memory bandwidth (GB/s)": 0.0028089887640449437 } }, { "ph": "f", "id": 98664, "pid": 5, "tid": 7, "ts": 1716454222940745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940732, "dur": 12, "args": { "External id": 98664, "cbid": 41, "correlation": 98664 } }, { "ph": "s", "id": 98664, "pid": 76337, "tid": -914061504, "ts": 1716454222940732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222940800, "dur": 4, "args": { "External id": 98680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98680, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98680, "pid": 5, "tid": 7, "ts": 1716454222940800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940786, "dur": 15, "args": { "External id": 98680, "cbid": 211, "correlation": 98680 } }, { "ph": "s", "id": 98680, "pid": 76337, "tid": -914061504, "ts": 1716454222940786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222940822, "dur": 3, "args": { "External id": 98685, "device": 5, "context": 1, "stream": 7, "correlation": 98685, "bytes": 1, "memory bandwidth (GB/s)": 0.00030637254901960784 } }, { "ph": "f", "id": 98685, "pid": 5, "tid": 7, "ts": 1716454222940822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940806, "dur": 15, "args": { "External id": 98685, "cbid": 41, "correlation": 98685 } }, { "ph": "s", "id": 98685, "pid": 76337, "tid": -914061504, "ts": 1716454222940806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222940849, "dur": 1, "args": { "External id": 98691, "device": 5, "context": 1, "stream": 7, "correlation": 98691, "bytes": 1, "memory bandwidth (GB/s)": 0.0005896226415094339 } }, { "ph": "f", "id": 98691, "pid": 5, "tid": 7, "ts": 1716454222940849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222940831, "dur": 27, "args": { "External id": 98691, "cbid": 41, "correlation": 98691 } }, { "ph": "s", "id": 98691, "pid": 76337, "tid": -914061504, "ts": 1716454222940831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222940859, "dur": 4, "args": { "External id": 98692, "cbid": 131, "correlation": 98692 } }, { "ph": "f", "id": 98692, "pid": 76337, "tid": -914061504, "ts": 1716454222940859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222940918, "dur": 3, "args": { "External id": 98700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98700, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98700, "pid": 5, "tid": 7, "ts": 1716454222940918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940902, "dur": 16, "args": { "External id": 98700, "cbid": 211, "correlation": 98700 } }, { "ph": "s", "id": 98700, "pid": 76337, "tid": -914061504, "ts": 1716454222940902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222940951, "dur": 3, "args": { "External id": 98710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98710, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98710, "pid": 5, "tid": 7, "ts": 1716454222940951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940941, "dur": 9, "args": { "External id": 98710, "cbid": 211, "correlation": 98710 } }, { "ph": "s", "id": 98710, "pid": 76337, "tid": -914061504, "ts": 1716454222940941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222940992, "dur": 3, "args": { "External id": 98719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98719, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98719, "pid": 5, "tid": 7, "ts": 1716454222940992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222940980, "dur": 11, "args": { "External id": 98719, "cbid": 211, "correlation": 98719 } }, { "ph": "s", "id": 98719, "pid": 76337, "tid": -914061504, "ts": 1716454222940980, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222941126, "dur": 12, "args": { "External id": 98729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98729, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98729, "pid": 5, "tid": 7, "ts": 1716454222941126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941112, "dur": 15, "args": { "External id": 98729, "cbid": 211, "correlation": 98729 } }, { "ph": "s", "id": 98729, "pid": 76337, "tid": -914061504, "ts": 1716454222941112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222941168, "dur": 3, "args": { "External id": 98737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98737, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98737, "pid": 5, "tid": 7, "ts": 1716454222941168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941158, "dur": 9, "args": { "External id": 98737, "cbid": 211, "correlation": 98737 } }, { "ph": "s", "id": 98737, "pid": 76337, "tid": -914061504, "ts": 1716454222941158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222941219, "dur": 11, "args": { "External id": 98747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98747, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98747, "pid": 5, "tid": 7, "ts": 1716454222941219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941208, "dur": 12, "args": { "External id": 98747, "cbid": 211, "correlation": 98747 } }, { "ph": "s", "id": 98747, "pid": 76337, "tid": -914061504, "ts": 1716454222941208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222941257, "dur": 10, "args": { "External id": 98755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98755, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98755, "pid": 5, "tid": 7, "ts": 1716454222941257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941248, "dur": 9, "args": { "External id": 98755, "cbid": 211, "correlation": 98755 } }, { "ph": "s", "id": 98755, "pid": 76337, "tid": -914061504, "ts": 1716454222941248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222941284, "dur": 3, "args": { "External id": 98764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98764, "pid": 5, "tid": 7, "ts": 1716454222941284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941275, "dur": 9, "args": { "External id": 98764, "cbid": 211, "correlation": 98764 } }, { "ph": "s", "id": 98764, "pid": 76337, "tid": -914061504, "ts": 1716454222941275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222941313, "dur": 5, "args": { "External id": 98773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98773, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98773, "pid": 5, "tid": 7, "ts": 1716454222941313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941300, "dur": 12, "args": { "External id": 98773, "cbid": 211, "correlation": 98773 } }, { "ph": "s", "id": 98773, "pid": 76337, "tid": -914061504, "ts": 1716454222941300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222941354, "dur": 8, "args": { "External id": 98783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98783, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98783, "pid": 5, "tid": 7, "ts": 1716454222941354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941343, "dur": 12, "args": { "External id": 98783, "cbid": 211, "correlation": 98783 } }, { "ph": "s", "id": 98783, "pid": 76337, "tid": -914061504, "ts": 1716454222941343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222941723, "dur": 3, "args": { "External id": 98792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98792, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98792, "pid": 5, "tid": 7, "ts": 1716454222941723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941708, "dur": 15, "args": { "External id": 98792, "cbid": 211, "correlation": 98792 } }, { "ph": "s", "id": 98792, "pid": 76337, "tid": -914061504, "ts": 1716454222941708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222941756, "dur": 3, "args": { "External id": 98800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98800, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98800, "pid": 5, "tid": 7, "ts": 1716454222941756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941746, "dur": 11, "args": { "External id": 98800, "cbid": 211, "correlation": 98800 } }, { "ph": "s", "id": 98800, "pid": 76337, "tid": -914061504, "ts": 1716454222941746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222941849, "dur": 1, "args": { "External id": 98810, "device": 5, "context": 1, "stream": 7, "correlation": 98810, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 98810, "pid": 5, "tid": 7, "ts": 1716454222941849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222941794, "dur": 53, "args": { "External id": 98810, "cbid": 41, "correlation": 98810 } }, { "ph": "s", "id": 98810, "pid": 76337, "tid": -914061504, "ts": 1716454222941794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222941849, "dur": 8, "args": { "External id": 98811, "cbid": 131, "correlation": 98811 } }, { "ph": "f", "id": 98811, "pid": 76337, "tid": -914061504, "ts": 1716454222941849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222941945, "dur": 2, "args": { "External id": 98819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98819, "pid": 5, "tid": 7, "ts": 1716454222941945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222941930, "dur": 15, "args": { "External id": 98819, "cbid": 211, "correlation": 98819 } }, { "ph": "s", "id": 98819, "pid": 76337, "tid": -914061504, "ts": 1716454222941930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942031, "dur": 3, "args": { "External id": 98828, "device": 5, "context": 1, "stream": 7, "correlation": 98828, "bytes": 8, "memory bandwidth (GB/s)": 0.0026595744680851063 } }, { "ph": "f", "id": 98828, "pid": 5, "tid": 7, "ts": 1716454222942031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942012, "dur": 18, "args": { "External id": 98828, "cbid": 41, "correlation": 98828 } }, { "ph": "s", "id": 98828, "pid": 76337, "tid": -914061504, "ts": 1716454222942012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222942104, "dur": 3, "args": { "External id": 98838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98838, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 98838, "pid": 5, "tid": 7, "ts": 1716454222942104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942089, "dur": 14, "args": { "External id": 98838, "cbid": 211, "correlation": 98838 } }, { "ph": "s", "id": 98838, "pid": 76337, "tid": -914061504, "ts": 1716454222942089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942157, "dur": 1, "args": { "External id": 98848, "device": 5, "context": 1, "stream": 7, "correlation": 98848, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 98848, "pid": 5, "tid": 7, "ts": 1716454222942157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942142, "dur": 13, "args": { "External id": 98848, "cbid": 41, "correlation": 98848 } }, { "ph": "s", "id": 98848, "pid": 76337, "tid": -914061504, "ts": 1716454222942142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942156, "dur": 8, "args": { "External id": 98849, "cbid": 131, "correlation": 98849 } }, { "ph": "f", "id": 98849, "pid": 76337, "tid": -914061504, "ts": 1716454222942156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942217, "dur": 3, "args": { "External id": 98856, "device": 5, "context": 1, "stream": 7, "correlation": 98856, "bytes": 98304, "memory bandwidth (GB/s)": 32.33684210526316 } }, { "ph": "f", "id": 98856, "pid": 5, "tid": 7, "ts": 1716454222942217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942198, "dur": 19, "args": { "External id": 98856, "cbid": 41, "correlation": 98856 } }, { "ph": "s", "id": 98856, "pid": 76337, "tid": -914061504, "ts": 1716454222942198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942264, "dur": 1, "args": { "External id": 98867, "device": 5, "context": 1, "stream": 7, "correlation": 98867, "bytes": 2, "memory bandwidth (GB/s)": 0.0013297872340425532 } }, { "ph": "f", "id": 98867, "pid": 5, "tid": 7, "ts": 1716454222942264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942252, "dur": 10, "args": { "External id": 98867, "cbid": 41, "correlation": 98867 } }, { "ph": "s", "id": 98867, "pid": 76337, "tid": -914061504, "ts": 1716454222942252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942263, "dur": 7, "args": { "External id": 98868, "cbid": 131, "correlation": 98868 } }, { "ph": "f", "id": 98868, "pid": 76337, "tid": -914061504, "ts": 1716454222942263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942312, "dur": 3, "args": { "External id": 98876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98876, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98876, "pid": 5, "tid": 7, "ts": 1716454222942312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942299, "dur": 13, "args": { "External id": 98876, "cbid": 211, "correlation": 98876 } }, { "ph": "s", "id": 98876, "pid": 76337, "tid": -914061504, "ts": 1716454222942299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942342, "dur": 3, "args": { "External id": 98886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98886, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98886, "pid": 5, "tid": 7, "ts": 1716454222942342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942332, "dur": 8, "args": { "External id": 98886, "cbid": 211, "correlation": 98886 } }, { "ph": "s", "id": 98886, "pid": 76337, "tid": -914061504, "ts": 1716454222942332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942363, "dur": 3, "args": { "External id": 98895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98895, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98895, "pid": 5, "tid": 7, "ts": 1716454222942363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942355, "dur": 7, "args": { "External id": 98895, "cbid": 211, "correlation": 98895 } }, { "ph": "s", "id": 98895, "pid": 76337, "tid": -914061504, "ts": 1716454222942355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222942434, "dur": 5, "args": { "External id": 98903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98903, "pid": 5, "tid": 7, "ts": 1716454222942434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942421, "dur": 13, "args": { "External id": 98903, "cbid": 211, "correlation": 98903 } }, { "ph": "s", "id": 98903, "pid": 76337, "tid": -914061504, "ts": 1716454222942421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942474, "dur": 3, "args": { "External id": 98912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98912, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98912, "pid": 5, "tid": 7, "ts": 1716454222942474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942464, "dur": 9, "args": { "External id": 98912, "cbid": 211, "correlation": 98912 } }, { "ph": "s", "id": 98912, "pid": 76337, "tid": -914061504, "ts": 1716454222942464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942497, "dur": 3, "args": { "External id": 98921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98921, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98921, "pid": 5, "tid": 7, "ts": 1716454222942497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942488, "dur": 7, "args": { "External id": 98921, "cbid": 211, "correlation": 98921 } }, { "ph": "s", "id": 98921, "pid": 76337, "tid": -914061504, "ts": 1716454222942488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222942562, "dur": 3, "args": { "External id": 98929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98929, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 98929, "pid": 5, "tid": 7, "ts": 1716454222942562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942551, "dur": 10, "args": { "External id": 98929, "cbid": 211, "correlation": 98929 } }, { "ph": "s", "id": 98929, "pid": 76337, "tid": -914061504, "ts": 1716454222942551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222942622, "dur": 1, "args": { "External id": 98937, "device": 5, "context": 1, "stream": 7, "correlation": 98937, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 98937, "pid": 5, "tid": 7, "ts": 1716454222942622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942605, "dur": 26, "args": { "External id": 98937, "cbid": 41, "correlation": 98937 } }, { "ph": "s", "id": 98937, "pid": 76337, "tid": -914061504, "ts": 1716454222942605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942633, "dur": 3, "args": { "External id": 98938, "cbid": 131, "correlation": 98938 } }, { "ph": "f", "id": 98938, "pid": 76337, "tid": -914061504, "ts": 1716454222942633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942693, "dur": 1, "args": { "External id": 98948, "device": 5, "context": 1, "stream": 7, "correlation": 98948, "bytes": 42, "memory bandwidth (GB/s)": 0.0234375 } }, { "ph": "f", "id": 98948, "pid": 5, "tid": 7, "ts": 1716454222942693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942681, "dur": 10, "args": { "External id": 98948, "cbid": 41, "correlation": 98948 } }, { "ph": "s", "id": 98948, "pid": 76337, "tid": -914061504, "ts": 1716454222942681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942692, "dur": 8, "args": { "External id": 98949, "cbid": 131, "correlation": 98949 } }, { "ph": "f", "id": 98949, "pid": 76337, "tid": -914061504, "ts": 1716454222942692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222942752, "dur": 1, "args": { "External id": 98958, "device": 5, "context": 1, "stream": 7, "correlation": 98958, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 98958, "pid": 5, "tid": 7, "ts": 1716454222942752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942741, "dur": 8, "args": { "External id": 98958, "cbid": 41, "correlation": 98958 } }, { "ph": "s", "id": 98958, "pid": 76337, "tid": -914061504, "ts": 1716454222942741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942750, "dur": 8, "args": { "External id": 98959, "cbid": 131, "correlation": 98959 } }, { "ph": "f", "id": 98959, "pid": 76337, "tid": -914061504, "ts": 1716454222942750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222942827, "dur": 4, "args": { "External id": 98966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98966, "pid": 5, "tid": 7, "ts": 1716454222942827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942810, "dur": 17, "args": { "External id": 98966, "cbid": 211, "correlation": 98966 } }, { "ph": "s", "id": 98966, "pid": 76337, "tid": -914061504, "ts": 1716454222942810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454222942864, "dur": 4, "args": { "External id": 98986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 98986, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 98986, "pid": 5, "tid": 7, "ts": 1716454222942864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942853, "dur": 12, "args": { "External id": 98986, "cbid": 211, "correlation": 98986 } }, { "ph": "s", "id": 98986, "pid": 76337, "tid": -914061504, "ts": 1716454222942853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942866, "dur": 0, "args": { "External id": 98987, "cbid": 11, "correlation": 98987 } }, { "ph": "f", "id": 98987, "pid": 76337, "tid": -914061504, "ts": 1716454222942866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942866, "dur": 0, "args": { "External id": 98988, "cbid": 11, "correlation": 98988 } }, { "ph": "f", "id": 98988, "pid": 76337, "tid": -914061504, "ts": 1716454222942866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222942880, "dur": 1, "args": { "External id": 98991, "device": 5, "context": 1, "stream": 7, "correlation": 98991, "bytes": 4, "memory bandwidth (GB/s)": 0.0024509803921568627 } }, { "ph": "f", "id": 98991, "pid": 5, "tid": 7, "ts": 1716454222942880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942867, "dur": 21, "args": { "External id": 98991, "cbid": 41, "correlation": 98991 } }, { "ph": "s", "id": 98991, "pid": 76337, "tid": -914061504, "ts": 1716454222942867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942889, "dur": 3, "args": { "External id": 98992, "cbid": 131, "correlation": 98992 } }, { "ph": "f", "id": 98992, "pid": 76337, "tid": -914061504, "ts": 1716454222942889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454222942917, "dur": 3, "args": { "External id": 99016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99016, "pid": 5, "tid": 7, "ts": 1716454222942917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942908, "dur": 9, "args": { "External id": 99016, "cbid": 211, "correlation": 99016 } }, { "ph": "s", "id": 99016, "pid": 76337, "tid": -914061504, "ts": 1716454222942908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942918, "dur": 0, "args": { "External id": 99017, "cbid": 11, "correlation": 99017 } }, { "ph": "f", "id": 99017, "pid": 76337, "tid": -914061504, "ts": 1716454222942918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942918, "dur": 0, "args": { "External id": 99018, "cbid": 11, "correlation": 99018 } }, { "ph": "f", "id": 99018, "pid": 76337, "tid": -914061504, "ts": 1716454222942918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454222942920, "dur": 1, "args": { "External id": 99020, "cbid": 200, "correlation": 99020 } }, { "ph": "f", "id": 99020, "pid": 76337, "tid": -914061504, "ts": 1716454222942920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454222942930, "dur": 4, "args": { "External id": 99022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99022, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99022, "pid": 5, "tid": 7, "ts": 1716454222942930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222942923, "dur": 8, "args": { "External id": 99022, "cbid": 211, "correlation": 99022 } }, { "ph": "s", "id": 99022, "pid": 76337, "tid": -914061504, "ts": 1716454222942923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942931, "dur": 0, "args": { "External id": 99023, "cbid": 11, "correlation": 99023 } }, { "ph": "f", "id": 99023, "pid": 76337, "tid": -914061504, "ts": 1716454222942931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454222942932, "dur": 0, "args": { "External id": 99024, "cbid": 11, "correlation": 99024 } }, { "ph": "f", "id": 99024, "pid": 76337, "tid": -914061504, "ts": 1716454222942932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454222942970, "dur": 1, "args": { "External id": 99031, "device": 5, "context": 1, "stream": 7, "correlation": 99031, "bytes": 8, "memory bandwidth (GB/s)": 0.004901960784313725 } }, { "ph": "f", "id": 99031, "pid": 5, "tid": 7, "ts": 1716454222942970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222942959, "dur": 25, "args": { "External id": 99031, "cbid": 41, "correlation": 99031 } }, { "ph": "s", "id": 99031, "pid": 76337, "tid": -914061504, "ts": 1716454222942959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222942985, "dur": 3, "args": { "External id": 99032, "cbid": 131, "correlation": 99032 } }, { "ph": "f", "id": 99032, "pid": 76337, "tid": -914061504, "ts": 1716454222942985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454222943036, "dur": 1, "args": { "External id": 99042, "device": 5, "context": 1, "stream": 7, "correlation": 99042, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 99042, "pid": 5, "tid": 7, "ts": 1716454222943036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222943024, "dur": 10, "args": { "External id": 99042, "cbid": 41, "correlation": 99042 } }, { "ph": "s", "id": 99042, "pid": 76337, "tid": -914061504, "ts": 1716454222943024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454222943035, "dur": 8, "args": { "External id": 99043, "cbid": 131, "correlation": 99043 } }, { "ph": "f", "id": 99043, "pid": 76337, "tid": -914061504, "ts": 1716454222943035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222943106, "dur": 5, "args": { "External id": 99050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99050, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99050, "pid": 5, "tid": 7, "ts": 1716454222943106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943091, "dur": 16, "args": { "External id": 99050, "cbid": 211, "correlation": 99050 } }, { "ph": "s", "id": 99050, "pid": 76337, "tid": -914061504, "ts": 1716454222943091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943176, "dur": 3, "args": { "External id": 99059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99059, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99059, "pid": 5, "tid": 7, "ts": 1716454222943176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943163, "dur": 12, "args": { "External id": 99059, "cbid": 211, "correlation": 99059 } }, { "ph": "s", "id": 99059, "pid": 76337, "tid": -914061504, "ts": 1716454222943163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943213, "dur": 3, "args": { "External id": 99067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99067, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99067, "pid": 5, "tid": 7, "ts": 1716454222943213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943202, "dur": 10, "args": { "External id": 99067, "cbid": 211, "correlation": 99067 } }, { "ph": "s", "id": 99067, "pid": 76337, "tid": -914061504, "ts": 1716454222943202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943247, "dur": 4, "args": { "External id": 99075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99075, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99075, "pid": 5, "tid": 7, "ts": 1716454222943247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943235, "dur": 12, "args": { "External id": 99075, "cbid": 211, "correlation": 99075 } }, { "ph": "s", "id": 99075, "pid": 76337, "tid": -914061504, "ts": 1716454222943235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943276, "dur": 4, "args": { "External id": 99083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99083, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99083, "pid": 5, "tid": 7, "ts": 1716454222943276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943266, "dur": 10, "args": { "External id": 99083, "cbid": 211, "correlation": 99083 } }, { "ph": "s", "id": 99083, "pid": 76337, "tid": -914061504, "ts": 1716454222943266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943303, "dur": 3, "args": { "External id": 99091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99091, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99091, "pid": 5, "tid": 7, "ts": 1716454222943303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943294, "dur": 8, "args": { "External id": 99091, "cbid": 211, "correlation": 99091 } }, { "ph": "s", "id": 99091, "pid": 76337, "tid": -914061504, "ts": 1716454222943294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943328, "dur": 3, "args": { "External id": 99099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99099, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99099, "pid": 5, "tid": 7, "ts": 1716454222943328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943319, "dur": 9, "args": { "External id": 99099, "cbid": 211, "correlation": 99099 } }, { "ph": "s", "id": 99099, "pid": 76337, "tid": -914061504, "ts": 1716454222943319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222943351, "dur": 4, "args": { "External id": 99107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99107, "pid": 5, "tid": 7, "ts": 1716454222943351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943343, "dur": 7, "args": { "External id": 99107, "cbid": 211, "correlation": 99107 } }, { "ph": "s", "id": 99107, "pid": 76337, "tid": -914061504, "ts": 1716454222943343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222943369, "dur": 4, "args": { "External id": 99115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99115, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99115, "pid": 5, "tid": 7, "ts": 1716454222943369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943361, "dur": 7, "args": { "External id": 99115, "cbid": 211, "correlation": 99115 } }, { "ph": "s", "id": 99115, "pid": 76337, "tid": -914061504, "ts": 1716454222943361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943388, "dur": 3, "args": { "External id": 99123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99123, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99123, "pid": 5, "tid": 7, "ts": 1716454222943388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943380, "dur": 7, "args": { "External id": 99123, "cbid": 211, "correlation": 99123 } }, { "ph": "s", "id": 99123, "pid": 76337, "tid": -914061504, "ts": 1716454222943380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943446, "dur": 3, "args": { "External id": 99131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99131, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99131, "pid": 5, "tid": 7, "ts": 1716454222943446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943435, "dur": 11, "args": { "External id": 99131, "cbid": 211, "correlation": 99131 } }, { "ph": "s", "id": 99131, "pid": 76337, "tid": -914061504, "ts": 1716454222943435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222943470, "dur": 4, "args": { "External id": 99139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99139, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99139, "pid": 5, "tid": 7, "ts": 1716454222943470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943462, "dur": 8, "args": { "External id": 99139, "cbid": 211, "correlation": 99139 } }, { "ph": "s", "id": 99139, "pid": 76337, "tid": -914061504, "ts": 1716454222943462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222943494, "dur": 4, "args": { "External id": 99147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99147, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99147, "pid": 5, "tid": 7, "ts": 1716454222943494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943484, "dur": 8, "args": { "External id": 99147, "cbid": 211, "correlation": 99147 } }, { "ph": "s", "id": 99147, "pid": 76337, "tid": -914061504, "ts": 1716454222943484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222943512, "dur": 3, "args": { "External id": 99155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99155, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99155, "pid": 5, "tid": 7, "ts": 1716454222943512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943505, "dur": 6, "args": { "External id": 99155, "cbid": 211, "correlation": 99155 } }, { "ph": "s", "id": 99155, "pid": 76337, "tid": -914061504, "ts": 1716454222943505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222943920, "dur": 5, "args": { "External id": 99164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99164, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99164, "pid": 5, "tid": 7, "ts": 1716454222943920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943904, "dur": 17, "args": { "External id": 99164, "cbid": 211, "correlation": 99164 } }, { "ph": "s", "id": 99164, "pid": 76337, "tid": -914061504, "ts": 1716454222943904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222943957, "dur": 5, "args": { "External id": 99173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99173, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99173, "pid": 5, "tid": 7, "ts": 1716454222943957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222943947, "dur": 9, "args": { "External id": 99173, "cbid": 211, "correlation": 99173 } }, { "ph": "s", "id": 99173, "pid": 76337, "tid": -914061504, "ts": 1716454222943947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454222944100, "dur": 2, "args": { "External id": 99189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99189, "pid": 5, "tid": 7, "ts": 1716454222944100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944085, "dur": 15, "args": { "External id": 99189, "cbid": 211, "correlation": 99189 } }, { "ph": "s", "id": 99189, "pid": 76337, "tid": -914061504, "ts": 1716454222944085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944133, "dur": 3, "args": { "External id": 99197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99197, "pid": 5, "tid": 7, "ts": 1716454222944133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944124, "dur": 8, "args": { "External id": 99197, "cbid": 211, "correlation": 99197 } }, { "ph": "s", "id": 99197, "pid": 76337, "tid": -914061504, "ts": 1716454222944124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944165, "dur": 3, "args": { "External id": 99205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99205, "pid": 5, "tid": 7, "ts": 1716454222944165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944155, "dur": 10, "args": { "External id": 99205, "cbid": 211, "correlation": 99205 } }, { "ph": "s", "id": 99205, "pid": 76337, "tid": -914061504, "ts": 1716454222944155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944195, "dur": 4, "args": { "External id": 99213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99213, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99213, "pid": 5, "tid": 7, "ts": 1716454222944195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944186, "dur": 8, "args": { "External id": 99213, "cbid": 211, "correlation": 99213 } }, { "ph": "s", "id": 99213, "pid": 76337, "tid": -914061504, "ts": 1716454222944186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454222944251, "dur": 4, "args": { "External id": 99225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99225, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99225, "pid": 5, "tid": 7, "ts": 1716454222944251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944238, "dur": 13, "args": { "External id": 99225, "cbid": 211, "correlation": 99225 } }, { "ph": "s", "id": 99225, "pid": 76337, "tid": -914061504, "ts": 1716454222944238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222944297, "dur": 4, "args": { "External id": 99236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99236, "pid": 5, "tid": 7, "ts": 1716454222944297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944285, "dur": 11, "args": { "External id": 99236, "cbid": 211, "correlation": 99236 } }, { "ph": "s", "id": 99236, "pid": 76337, "tid": -914061504, "ts": 1716454222944285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944329, "dur": 2, "args": { "External id": 99244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99244, "pid": 5, "tid": 7, "ts": 1716454222944329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944319, "dur": 8, "args": { "External id": 99244, "cbid": 211, "correlation": 99244 } }, { "ph": "s", "id": 99244, "pid": 76337, "tid": -914061504, "ts": 1716454222944319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944362, "dur": 5, "args": { "External id": 99252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99252, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99252, "pid": 5, "tid": 7, "ts": 1716454222944362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944351, "dur": 10, "args": { "External id": 99252, "cbid": 211, "correlation": 99252 } }, { "ph": "s", "id": 99252, "pid": 76337, "tid": -914061504, "ts": 1716454222944351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944391, "dur": 5, "args": { "External id": 99260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99260, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99260, "pid": 5, "tid": 7, "ts": 1716454222944391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944380, "dur": 10, "args": { "External id": 99260, "cbid": 211, "correlation": 99260 } }, { "ph": "s", "id": 99260, "pid": 76337, "tid": -914061504, "ts": 1716454222944380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222944421, "dur": 4, "args": { "External id": 99269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99269, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99269, "pid": 5, "tid": 7, "ts": 1716454222944421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944411, "dur": 9, "args": { "External id": 99269, "cbid": 211, "correlation": 99269 } }, { "ph": "s", "id": 99269, "pid": 76337, "tid": -914061504, "ts": 1716454222944411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222944481, "dur": 4, "args": { "External id": 99282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99282, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99282, "pid": 5, "tid": 7, "ts": 1716454222944481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944469, "dur": 13, "args": { "External id": 99282, "cbid": 211, "correlation": 99282 } }, { "ph": "s", "id": 99282, "pid": 76337, "tid": -914061504, "ts": 1716454222944469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454222944523, "dur": 5, "args": { "External id": 99292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99292, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99292, "pid": 5, "tid": 7, "ts": 1716454222944523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944511, "dur": 11, "args": { "External id": 99292, "cbid": 211, "correlation": 99292 } }, { "ph": "s", "id": 99292, "pid": 76337, "tid": -914061504, "ts": 1716454222944511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222944657, "dur": 5, "args": { "External id": 99309, "cbid": 251, "correlation": 99309 } }, { "ph": "f", "id": 99309, "pid": 76337, "tid": -914061504, "ts": 1716454222944657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454222944688, "dur": 12, "args": { "External id": 99311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99311, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99311, "pid": 5, "tid": 7, "ts": 1716454222944688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944672, "dur": 17, "args": { "External id": 99311, "cbid": 211, "correlation": 99311 } }, { "ph": "s", "id": 99311, "pid": 76337, "tid": -914061504, "ts": 1716454222944672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222944754, "dur": 4, "args": { "External id": 99319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99319, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99319, "pid": 5, "tid": 7, "ts": 1716454222944754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944742, "dur": 12, "args": { "External id": 99319, "cbid": 211, "correlation": 99319 } }, { "ph": "s", "id": 99319, "pid": 76337, "tid": -914061504, "ts": 1716454222944742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222944814, "dur": 2, "args": { "External id": 99335, "cbid": 251, "correlation": 99335 } }, { "ph": "f", "id": 99335, "pid": 76337, "tid": -914061504, "ts": 1716454222944814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222944820, "dur": 0, "args": { "External id": 99337, "cbid": 251, "correlation": 99337 } }, { "ph": "f", "id": 99337, "pid": 76337, "tid": -914061504, "ts": 1716454222944820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222944838, "dur": 13, "args": { "External id": 99338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99338, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99338, "pid": 5, "tid": 7, "ts": 1716454222944838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944823, "dur": 15, "args": { "External id": 99338, "cbid": 211, "correlation": 99338 } }, { "ph": "s", "id": 99338, "pid": 76337, "tid": -914061504, "ts": 1716454222944823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222944853, "dur": 5, "args": { "External id": 99340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99340, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99340, "pid": 5, "tid": 7, "ts": 1716454222944853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222944843, "dur": 9, "args": { "External id": 99340, "cbid": 211, "correlation": 99340 } }, { "ph": "s", "id": 99340, "pid": 76337, "tid": -914061504, "ts": 1716454222944843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222944957, "dur": 1, "args": { "External id": 99350, "cbid": 317, "correlation": 99350 } }, { "ph": "f", "id": 99350, "pid": 76337, "tid": -914061504, "ts": 1716454222944957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222944959, "dur": 1, "args": { "External id": 99351, "cbid": 203, "correlation": 99351 } }, { "ph": "f", "id": 99351, "pid": 76337, "tid": -914061504, "ts": 1716454222944959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222944961, "dur": 1, "args": { "External id": 99352, "cbid": 205, "correlation": 99352 } }, { "ph": "f", "id": 99352, "pid": 76337, "tid": -914061504, "ts": 1716454222944961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222945030, "dur": 6, "args": { "External id": 99356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99356, "pid": 5, "tid": 7, "ts": 1716454222945030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222945014, "dur": 16, "args": { "External id": 99356, "cbid": 211, "correlation": 99356 } }, { "ph": "s", "id": 99356, "pid": 76337, "tid": -914061504, "ts": 1716454222945014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222945041, "dur": 4, "args": { "External id": 99358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99358, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 99358, "pid": 5, "tid": 7, "ts": 1716454222945041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222945033, "dur": 6, "args": { "External id": 99358, "cbid": 211, "correlation": 99358 } }, { "ph": "s", "id": 99358, "pid": 76337, "tid": -914061504, "ts": 1716454222945033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222945062, "dur": 3, "args": { "External id": 99360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99360, "pid": 5, "tid": 7, "ts": 1716454222945062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222945052, "dur": 8, "args": { "External id": 99360, "cbid": 211, "correlation": 99360 } }, { "ph": "s", "id": 99360, "pid": 76337, "tid": -914061504, "ts": 1716454222945052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222945067, "dur": 0, "args": { "External id": 99361, "cbid": 51, "correlation": 99361 } }, { "ph": "s", "id": 99361, "pid": 76337, "tid": -914061504, "ts": 1716454222945067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222945078, "dur": 82, "args": { "External id": 99362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99362, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99362, "pid": 5, "tid": 7, "ts": 1716454222945078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222945069, "dur": 7, "args": { "External id": 99362, "cbid": 211, "correlation": 99362 } }, { "ph": "s", "id": 99362, "pid": 76337, "tid": -914061504, "ts": 1716454222945069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222945161, "dur": 59, "args": { "External id": 99367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99367, "pid": 5, "tid": 7, "ts": 1716454222945161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222945106, "dur": 11, "args": { "External id": 99367, "cbid": 211, "correlation": 99367 } }, { "ph": "s", "id": 99367, "pid": 76337, "tid": -914061504, "ts": 1716454222945106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222946949, "dur": 51, "args": { "External id": 99387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99387, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 99387, "pid": 5, "tid": 7, "ts": 1716454222946949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222946932, "dur": 16, "args": { "External id": 99387, "cbid": 211, "correlation": 99387 } }, { "ph": "s", "id": 99387, "pid": 76337, "tid": -914061504, "ts": 1716454222946932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222947001, "dur": 4, "args": { "External id": 99399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99399, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99399, "pid": 5, "tid": 7, "ts": 1716454222947001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222946961, "dur": 8, "args": { "External id": 99399, "cbid": 211, "correlation": 99399 } }, { "ph": "s", "id": 99399, "pid": 76337, "tid": -914061504, "ts": 1716454222946961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222947006, "dur": 55, "args": { "External id": 99402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99402, "pid": 5, "tid": 7, "ts": 1716454222947006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222946992, "dur": 10, "args": { "External id": 99402, "cbid": 211, "correlation": 99402 } }, { "ph": "s", "id": 99402, "pid": 76337, "tid": -914061504, "ts": 1716454222946992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222947063, "dur": 36, "args": { "External id": 99411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99411, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99411, "pid": 5, "tid": 7, "ts": 1716454222947063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947039, "dur": 11, "args": { "External id": 99411, "cbid": 211, "correlation": 99411 } }, { "ph": "s", "id": 99411, "pid": 76337, "tid": -914061504, "ts": 1716454222947039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222947100, "dur": 0, "args": { "External id": 99421, "cbid": 317, "correlation": 99421 } }, { "ph": "f", "id": 99421, "pid": 76337, "tid": -914061504, "ts": 1716454222947100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222947100, "dur": 0, "args": { "External id": 99422, "cbid": 203, "correlation": 99422 } }, { "ph": "f", "id": 99422, "pid": 76337, "tid": -914061504, "ts": 1716454222947100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222947101, "dur": 0, "args": { "External id": 99423, "cbid": 205, "correlation": 99423 } }, { "ph": "f", "id": 99423, "pid": 76337, "tid": -914061504, "ts": 1716454222947101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222947133, "dur": 39, "args": { "External id": 99427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99427, "pid": 5, "tid": 7, "ts": 1716454222947133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947120, "dur": 12, "args": { "External id": 99427, "cbid": 211, "correlation": 99427 } }, { "ph": "s", "id": 99427, "pid": 76337, "tid": -914061504, "ts": 1716454222947120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222947173, "dur": 14, "args": { "External id": 99429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99429, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99429, "pid": 5, "tid": 7, "ts": 1716454222947173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947135, "dur": 6, "args": { "External id": 99429, "cbid": 211, "correlation": 99429 } }, { "ph": "s", "id": 99429, "pid": 76337, "tid": -914061504, "ts": 1716454222947135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222947188, "dur": 3, "args": { "External id": 99431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99431, "pid": 5, "tid": 7, "ts": 1716454222947188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947148, "dur": 6, "args": { "External id": 99431, "cbid": 211, "correlation": 99431 } }, { "ph": "s", "id": 99431, "pid": 76337, "tid": -914061504, "ts": 1716454222947148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222947158, "dur": 0, "args": { "External id": 99432, "cbid": 51, "correlation": 99432 } }, { "ph": "s", "id": 99432, "pid": 76337, "tid": -914061504, "ts": 1716454222947158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222947193, "dur": 688, "args": { "External id": 99433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99433, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99433, "pid": 5, "tid": 7, "ts": 1716454222947193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947159, "dur": 6, "args": { "External id": 99433, "cbid": 211, "correlation": 99433 } }, { "ph": "s", "id": 99433, "pid": 76337, "tid": -914061504, "ts": 1716454222947159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222947882, "dur": 58, "args": { "External id": 99438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99438, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99438, "pid": 5, "tid": 7, "ts": 1716454222947882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947189, "dur": 10, "args": { "External id": 99438, "cbid": 211, "correlation": 99438 } }, { "ph": "s", "id": 99438, "pid": 76337, "tid": -914061504, "ts": 1716454222947189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222947941, "dur": 3, "args": { "External id": 99446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99446, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99446, "pid": 5, "tid": 7, "ts": 1716454222947941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947233, "dur": 9, "args": { "External id": 99446, "cbid": 211, "correlation": 99446 } }, { "ph": "s", "id": 99446, "pid": 76337, "tid": -914061504, "ts": 1716454222947233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947301, "dur": 1, "args": { "External id": 99462, "cbid": 251, "correlation": 99462 } }, { "ph": "f", "id": 99462, "pid": 76337, "tid": -914061504, "ts": 1716454222947301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947306, "dur": 0, "args": { "External id": 99464, "cbid": 251, "correlation": 99464 } }, { "ph": "f", "id": 99464, "pid": 76337, "tid": -914061504, "ts": 1716454222947306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222947946, "dur": 8, "args": { "External id": 99465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99465, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 99465, "pid": 5, "tid": 7, "ts": 1716454222947946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947308, "dur": 12, "args": { "External id": 99465, "cbid": 211, "correlation": 99465 } }, { "ph": "s", "id": 99465, "pid": 76337, "tid": -914061504, "ts": 1716454222947308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222947955, "dur": 4, "args": { "External id": 99467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99467, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 99467, "pid": 5, "tid": 7, "ts": 1716454222947955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947322, "dur": 7, "args": { "External id": 99467, "cbid": 211, "correlation": 99467 } }, { "ph": "s", "id": 99467, "pid": 76337, "tid": -914061504, "ts": 1716454222947322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222947961, "dur": 54, "args": { "External id": 99477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99477, "pid": 5, "tid": 7, "ts": 1716454222947961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947383, "dur": 13, "args": { "External id": 99477, "cbid": 211, "correlation": 99477 } }, { "ph": "s", "id": 99477, "pid": 76337, "tid": -914061504, "ts": 1716454222947383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222948016, "dur": 52, "args": { "External id": 99497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99497, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 99497, "pid": 5, "tid": 7, "ts": 1716454222948016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947451, "dur": 11, "args": { "External id": 99497, "cbid": 211, "correlation": 99497 } }, { "ph": "s", "id": 99497, "pid": 76337, "tid": -914061504, "ts": 1716454222947451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222948069, "dur": 4, "args": { "External id": 99509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99509, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99509, "pid": 5, "tid": 7, "ts": 1716454222948069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947472, "dur": 6, "args": { "External id": 99509, "cbid": 211, "correlation": 99509 } }, { "ph": "s", "id": 99509, "pid": 76337, "tid": -914061504, "ts": 1716454222947472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222948074, "dur": 54, "args": { "External id": 99512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99512, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99512, "pid": 5, "tid": 7, "ts": 1716454222948074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947490, "dur": 7, "args": { "External id": 99512, "cbid": 211, "correlation": 99512 } }, { "ph": "s", "id": 99512, "pid": 76337, "tid": -914061504, "ts": 1716454222947490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222948130, "dur": 37, "args": { "External id": 99521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99521, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99521, "pid": 5, "tid": 7, "ts": 1716454222948130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947530, "dur": 10, "args": { "External id": 99521, "cbid": 211, "correlation": 99521 } }, { "ph": "s", "id": 99521, "pid": 76337, "tid": -914061504, "ts": 1716454222947530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222947602, "dur": 0, "args": { "External id": 99531, "cbid": 317, "correlation": 99531 } }, { "ph": "f", "id": 99531, "pid": 76337, "tid": -914061504, "ts": 1716454222947602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222947603, "dur": 0, "args": { "External id": 99532, "cbid": 203, "correlation": 99532 } }, { "ph": "f", "id": 99532, "pid": 76337, "tid": -914061504, "ts": 1716454222947603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222947604, "dur": 0, "args": { "External id": 99533, "cbid": 205, "correlation": 99533 } }, { "ph": "f", "id": 99533, "pid": 76337, "tid": -914061504, "ts": 1716454222947604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222948168, "dur": 42, "args": { "External id": 99537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99537, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99537, "pid": 5, "tid": 7, "ts": 1716454222948168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947619, "dur": 12, "args": { "External id": 99537, "cbid": 211, "correlation": 99537 } }, { "ph": "s", "id": 99537, "pid": 76337, "tid": -914061504, "ts": 1716454222947619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222948211, "dur": 14, "args": { "External id": 99539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99539, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99539, "pid": 5, "tid": 7, "ts": 1716454222948211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947633, "dur": 5, "args": { "External id": 99539, "cbid": 211, "correlation": 99539 } }, { "ph": "s", "id": 99539, "pid": 76337, "tid": -914061504, "ts": 1716454222947633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222948227, "dur": 3, "args": { "External id": 99541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99541, "pid": 5, "tid": 7, "ts": 1716454222948227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947643, "dur": 6, "args": { "External id": 99541, "cbid": 211, "correlation": 99541 } }, { "ph": "s", "id": 99541, "pid": 76337, "tid": -914061504, "ts": 1716454222947643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222947652, "dur": 0, "args": { "External id": 99542, "cbid": 51, "correlation": 99542 } }, { "ph": "s", "id": 99542, "pid": 76337, "tid": -914061504, "ts": 1716454222947652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222948231, "dur": 684, "args": { "External id": 99543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99543, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99543, "pid": 5, "tid": 7, "ts": 1716454222948231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947653, "dur": 5, "args": { "External id": 99543, "cbid": 211, "correlation": 99543 } }, { "ph": "s", "id": 99543, "pid": 76337, "tid": -914061504, "ts": 1716454222947653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222948916, "dur": 58, "args": { "External id": 99548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99548, "pid": 5, "tid": 7, "ts": 1716454222948916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947681, "dur": 9, "args": { "External id": 99548, "cbid": 211, "correlation": 99548 } }, { "ph": "s", "id": 99548, "pid": 76337, "tid": -914061504, "ts": 1716454222947681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222948976, "dur": 50, "args": { "External id": 99556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99556, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99556, "pid": 5, "tid": 7, "ts": 1716454222948976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947714, "dur": 8, "args": { "External id": 99556, "cbid": 211, "correlation": 99556 } }, { "ph": "s", "id": 99556, "pid": 76337, "tid": -914061504, "ts": 1716454222947714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222949028, "dur": 35, "args": { "External id": 99564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99564, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99564, "pid": 5, "tid": 7, "ts": 1716454222949028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947746, "dur": 10, "args": { "External id": 99564, "cbid": 211, "correlation": 99564 } }, { "ph": "s", "id": 99564, "pid": 76337, "tid": -914061504, "ts": 1716454222947746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222949064, "dur": 51, "args": { "External id": 99584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99584, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 99584, "pid": 5, "tid": 7, "ts": 1716454222949064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947831, "dur": 13, "args": { "External id": 99584, "cbid": 211, "correlation": 99584 } }, { "ph": "s", "id": 99584, "pid": 76337, "tid": -914061504, "ts": 1716454222947831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222949116, "dur": 4, "args": { "External id": 99596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99596, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 99596, "pid": 5, "tid": 7, "ts": 1716454222949116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947854, "dur": 6, "args": { "External id": 99596, "cbid": 211, "correlation": 99596 } }, { "ph": "s", "id": 99596, "pid": 76337, "tid": -914061504, "ts": 1716454222947854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222949121, "dur": 54, "args": { "External id": 99599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99599, "pid": 5, "tid": 7, "ts": 1716454222949121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947872, "dur": 7, "args": { "External id": 99599, "cbid": 211, "correlation": 99599 } }, { "ph": "s", "id": 99599, "pid": 76337, "tid": -914061504, "ts": 1716454222947872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222947931, "dur": 0, "args": { "External id": 99610, "cbid": 317, "correlation": 99610 } }, { "ph": "f", "id": 99610, "pid": 76337, "tid": -914061504, "ts": 1716454222947931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222947932, "dur": 0, "args": { "External id": 99611, "cbid": 203, "correlation": 99611 } }, { "ph": "f", "id": 99611, "pid": 76337, "tid": -914061504, "ts": 1716454222947932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222947933, "dur": 0, "args": { "External id": 99612, "cbid": 205, "correlation": 99612 } }, { "ph": "f", "id": 99612, "pid": 76337, "tid": -914061504, "ts": 1716454222947933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947968, "dur": 2, "args": { "External id": 99616, "cbid": 251, "correlation": 99616 } }, { "ph": "f", "id": 99616, "pid": 76337, "tid": -914061504, "ts": 1716454222947968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947972, "dur": 9, "args": { "External id": 99617, "cbid": 251, "correlation": 99617 } }, { "ph": "f", "id": 99617, "pid": 76337, "tid": -914061504, "ts": 1716454222947972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947982, "dur": 1, "args": { "External id": 99618, "cbid": 251, "correlation": 99618 } }, { "ph": "f", "id": 99618, "pid": 76337, "tid": -914061504, "ts": 1716454222947982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947984, "dur": 1, "args": { "External id": 99619, "cbid": 251, "correlation": 99619 } }, { "ph": "f", "id": 99619, "pid": 76337, "tid": -914061504, "ts": 1716454222947984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947986, "dur": 1, "args": { "External id": 99620, "cbid": 251, "correlation": 99620 } }, { "ph": "f", "id": 99620, "pid": 76337, "tid": -914061504, "ts": 1716454222947986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947988, "dur": 1, "args": { "External id": 99621, "cbid": 251, "correlation": 99621 } }, { "ph": "f", "id": 99621, "pid": 76337, "tid": -914061504, "ts": 1716454222947988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947990, "dur": 1, "args": { "External id": 99622, "cbid": 251, "correlation": 99622 } }, { "ph": "f", "id": 99622, "pid": 76337, "tid": -914061504, "ts": 1716454222947990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947991, "dur": 1, "args": { "External id": 99623, "cbid": 251, "correlation": 99623 } }, { "ph": "f", "id": 99623, "pid": 76337, "tid": -914061504, "ts": 1716454222947991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222947994, "dur": 0, "args": { "External id": 99624, "cbid": 251, "correlation": 99624 } }, { "ph": "f", "id": 99624, "pid": 76337, "tid": -914061504, "ts": 1716454222947994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222949177, "dur": 115, "args": { "External id": 99625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99625, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 99625, "pid": 5, "tid": 7, "ts": 1716454222949177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222947998, "dur": 14, "args": { "External id": 99625, "cbid": 211, "correlation": 99625 } }, { "ph": "s", "id": 99625, "pid": 76337, "tid": -914061504, "ts": 1716454222947998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222949293, "dur": 59, "args": { "External id": 99631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99631, "pid": 5, "tid": 7, "ts": 1716454222949293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948036, "dur": 10, "args": { "External id": 99631, "cbid": 211, "correlation": 99631 } }, { "ph": "s", "id": 99631, "pid": 76337, "tid": -914061504, "ts": 1716454222948036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222949353, "dur": 526, "args": { "External id": 99640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99640, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99640, "pid": 5, "tid": 7, "ts": 1716454222949353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948133, "dur": 16, "args": { "External id": 99640, "cbid": 211, "correlation": 99640 } }, { "ph": "s", "id": 99640, "pid": 76337, "tid": -914061504, "ts": 1716454222948133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222949880, "dur": 177, "args": { "External id": 99662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99662, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99662, "pid": 5, "tid": 7, "ts": 1716454222949880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948207, "dur": 12, "args": { "External id": 99662, "cbid": 211, "correlation": 99662 } }, { "ph": "s", "id": 99662, "pid": 76337, "tid": -914061504, "ts": 1716454222948207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222948325, "dur": 2, "args": { "External id": 99673, "cbid": 251, "correlation": 99673 } }, { "ph": "f", "id": 99673, "pid": 76337, "tid": -914061504, "ts": 1716454222948325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222950058, "dur": 195, "args": { "External id": 99674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99674, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99674, "pid": 5, "tid": 7, "ts": 1716454222950058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948332, "dur": 14, "args": { "External id": 99674, "cbid": 211, "correlation": 99674 } }, { "ph": "s", "id": 99674, "pid": 76337, "tid": -914061504, "ts": 1716454222948332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222948406, "dur": 1, "args": { "External id": 99685, "cbid": 251, "correlation": 99685 } }, { "ph": "f", "id": 99685, "pid": 76337, "tid": -914061504, "ts": 1716454222948406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222950254, "dur": 186, "args": { "External id": 99686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99686, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99686, "pid": 5, "tid": 7, "ts": 1716454222950254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948410, "dur": 12, "args": { "External id": 99686, "cbid": 211, "correlation": 99686 } }, { "ph": "s", "id": 99686, "pid": 76337, "tid": -914061504, "ts": 1716454222948410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222948476, "dur": 1, "args": { "External id": 99697, "cbid": 251, "correlation": 99697 } }, { "ph": "f", "id": 99697, "pid": 76337, "tid": -914061504, "ts": 1716454222948476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222950441, "dur": 185, "args": { "External id": 99698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99698, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99698, "pid": 5, "tid": 7, "ts": 1716454222950441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948480, "dur": 11, "args": { "External id": 99698, "cbid": 211, "correlation": 99698 } }, { "ph": "s", "id": 99698, "pid": 76337, "tid": -914061504, "ts": 1716454222948480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222950628, "dur": 18157, "args": { "External id": 99719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99719, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 99719, "pid": 5, "tid": 7, "ts": 1716454222950628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948591, "dur": 15, "args": { "External id": 99719, "cbid": 211, "correlation": 99719 } }, { "ph": "s", "id": 99719, "pid": 76337, "tid": -914061504, "ts": 1716454222948591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222948706, "dur": 2, "args": { "External id": 99737, "cbid": 251, "correlation": 99737 } }, { "ph": "f", "id": 99737, "pid": 76337, "tid": -914061504, "ts": 1716454222948706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222968786, "dur": 198, "args": { "External id": 99739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99739, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99739, "pid": 5, "tid": 7, "ts": 1716454222968786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948714, "dur": 14, "args": { "External id": 99739, "cbid": 211, "correlation": 99739 } }, { "ph": "s", "id": 99739, "pid": 76337, "tid": -914061504, "ts": 1716454222948714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222968985, "dur": 66, "args": { "External id": 99747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99747, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99747, "pid": 5, "tid": 7, "ts": 1716454222968985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948788, "dur": 12, "args": { "External id": 99747, "cbid": 211, "correlation": 99747 } }, { "ph": "s", "id": 99747, "pid": 76337, "tid": -914061504, "ts": 1716454222948788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222969053, "dur": 97, "args": { "External id": 99755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99755, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99755, "pid": 5, "tid": 7, "ts": 1716454222969053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948866, "dur": 10, "args": { "External id": 99755, "cbid": 211, "correlation": 99755 } }, { "ph": "s", "id": 99755, "pid": 76337, "tid": -914061504, "ts": 1716454222948866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222969151, "dur": 53, "args": { "External id": 99766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99766, "pid": 5, "tid": 7, "ts": 1716454222969151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948955, "dur": 15, "args": { "External id": 99766, "cbid": 211, "correlation": 99766 } }, { "ph": "s", "id": 99766, "pid": 76337, "tid": -914061504, "ts": 1716454222948955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222969206, "dur": 90, "args": { "External id": 99788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99788, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99788, "pid": 5, "tid": 7, "ts": 1716454222969206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222948996, "dur": 9, "args": { "External id": 99788, "cbid": 211, "correlation": 99788 } }, { "ph": "s", "id": 99788, "pid": 76337, "tid": -914061504, "ts": 1716454222948996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949083, "dur": 1, "args": { "External id": 99799, "cbid": 251, "correlation": 99799 } }, { "ph": "f", "id": 99799, "pid": 76337, "tid": -914061504, "ts": 1716454222949083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222969297, "dur": 103, "args": { "External id": 99800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99800, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99800, "pid": 5, "tid": 7, "ts": 1716454222969297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949089, "dur": 13, "args": { "External id": 99800, "cbid": 211, "correlation": 99800 } }, { "ph": "s", "id": 99800, "pid": 76337, "tid": -914061504, "ts": 1716454222949089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949170, "dur": 1, "args": { "External id": 99811, "cbid": 251, "correlation": 99811 } }, { "ph": "f", "id": 99811, "pid": 76337, "tid": -914061504, "ts": 1716454222949170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949175, "dur": 0, "args": { "External id": 99812, "cbid": 251, "correlation": 99812 } }, { "ph": "f", "id": 99812, "pid": 76337, "tid": -914061504, "ts": 1716454222949175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222969401, "dur": 10, "args": { "External id": 99813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99813, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99813, "pid": 5, "tid": 7, "ts": 1716454222969401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949177, "dur": 14, "args": { "External id": 99813, "cbid": 211, "correlation": 99813 } }, { "ph": "s", "id": 99813, "pid": 76337, "tid": -914061504, "ts": 1716454222949177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222969412, "dur": 5, "args": { "External id": 99815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99815, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 99815, "pid": 5, "tid": 7, "ts": 1716454222969412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949194, "dur": 9, "args": { "External id": 99815, "cbid": 211, "correlation": 99815 } }, { "ph": "s", "id": 99815, "pid": 76337, "tid": -914061504, "ts": 1716454222949194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949258, "dur": 1, "args": { "External id": 99826, "cbid": 251, "correlation": 99826 } }, { "ph": "f", "id": 99826, "pid": 76337, "tid": -914061504, "ts": 1716454222949258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949262, "dur": 0, "args": { "External id": 99827, "cbid": 251, "correlation": 99827 } }, { "ph": "f", "id": 99827, "pid": 76337, "tid": -914061504, "ts": 1716454222949262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222969418, "dur": 6, "args": { "External id": 99828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99828, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 99828, "pid": 5, "tid": 7, "ts": 1716454222969418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949263, "dur": 12, "args": { "External id": 99828, "cbid": 211, "correlation": 99828 } }, { "ph": "s", "id": 99828, "pid": 76337, "tid": -914061504, "ts": 1716454222949263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222969425, "dur": 3, "args": { "External id": 99830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99830, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 99830, "pid": 5, "tid": 7, "ts": 1716454222969425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949277, "dur": 6, "args": { "External id": 99830, "cbid": 211, "correlation": 99830 } }, { "ph": "s", "id": 99830, "pid": 76337, "tid": -914061504, "ts": 1716454222949277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222969430, "dur": 152, "args": { "External id": 99851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99851, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 99851, "pid": 5, "tid": 7, "ts": 1716454222969430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949352, "dur": 12, "args": { "External id": 99851, "cbid": 211, "correlation": 99851 } }, { "ph": "s", "id": 99851, "pid": 76337, "tid": -914061504, "ts": 1716454222949352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949452, "dur": 2, "args": { "External id": 99869, "cbid": 251, "correlation": 99869 } }, { "ph": "f", "id": 99869, "pid": 76337, "tid": -914061504, "ts": 1716454222949452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222969583, "dur": 106, "args": { "External id": 99871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99871, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 99871, "pid": 5, "tid": 7, "ts": 1716454222969583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949459, "dur": 14, "args": { "External id": 99871, "cbid": 211, "correlation": 99871 } }, { "ph": "s", "id": 99871, "pid": 76337, "tid": -914061504, "ts": 1716454222949459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222969690, "dur": 35, "args": { "External id": 99879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99879, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99879, "pid": 5, "tid": 7, "ts": 1716454222969690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949530, "dur": 12, "args": { "External id": 99879, "cbid": 211, "correlation": 99879 } }, { "ph": "s", "id": 99879, "pid": 76337, "tid": -914061504, "ts": 1716454222949530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222969726, "dur": 68, "args": { "External id": 99887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99887, "pid": 5, "tid": 7, "ts": 1716454222969726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949572, "dur": 9, "args": { "External id": 99887, "cbid": 211, "correlation": 99887 } }, { "ph": "s", "id": 99887, "pid": 76337, "tid": -914061504, "ts": 1716454222949572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222969796, "dur": 90, "args": { "External id": 99909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99909, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99909, "pid": 5, "tid": 7, "ts": 1716454222969796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949624, "dur": 10, "args": { "External id": 99909, "cbid": 211, "correlation": 99909 } }, { "ph": "s", "id": 99909, "pid": 76337, "tid": -914061504, "ts": 1716454222949624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949715, "dur": 1, "args": { "External id": 99925, "cbid": 251, "correlation": 99925 } }, { "ph": "f", "id": 99925, "pid": 76337, "tid": -914061504, "ts": 1716454222949715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222969887, "dur": 567, "args": { "External id": 99927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99927, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 99927, "pid": 5, "tid": 7, "ts": 1716454222969887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949720, "dur": 12, "args": { "External id": 99927, "cbid": 211, "correlation": 99927 } }, { "ph": "s", "id": 99927, "pid": 76337, "tid": -914061504, "ts": 1716454222949720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222970456, "dur": 242, "args": { "External id": 99935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99935, "pid": 5, "tid": 7, "ts": 1716454222970456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949801, "dur": 14, "args": { "External id": 99935, "cbid": 211, "correlation": 99935 } }, { "ph": "s", "id": 99935, "pid": 76337, "tid": -914061504, "ts": 1716454222949801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222970699, "dur": 250, "args": { "External id": 99943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99943, "pid": 5, "tid": 7, "ts": 1716454222970699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949838, "dur": 9, "args": { "External id": 99943, "cbid": 211, "correlation": 99943 } }, { "ph": "s", "id": 99943, "pid": 76337, "tid": -914061504, "ts": 1716454222949838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949922, "dur": 2, "args": { "External id": 99959, "cbid": 251, "correlation": 99959 } }, { "ph": "f", "id": 99959, "pid": 76337, "tid": -914061504, "ts": 1716454222949922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222949927, "dur": 0, "args": { "External id": 99961, "cbid": 251, "correlation": 99961 } }, { "ph": "f", "id": 99961, "pid": 76337, "tid": -914061504, "ts": 1716454222949927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222970950, "dur": 354, "args": { "External id": 99962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99962, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 99962, "pid": 5, "tid": 7, "ts": 1716454222970950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949932, "dur": 14, "args": { "External id": 99962, "cbid": 211, "correlation": 99962 } }, { "ph": "s", "id": 99962, "pid": 76337, "tid": -914061504, "ts": 1716454222949932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222971305, "dur": 51, "args": { "External id": 99970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99970, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99970, "pid": 5, "tid": 7, "ts": 1716454222971305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222949984, "dur": 11, "args": { "External id": 99970, "cbid": 211, "correlation": 99970 } }, { "ph": "s", "id": 99970, "pid": 76337, "tid": -914061504, "ts": 1716454222949984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222971357, "dur": 154, "args": { "External id": 99981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 99981, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 99981, "pid": 5, "tid": 7, "ts": 1716454222971357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950059, "dur": 12, "args": { "External id": 99981, "cbid": 211, "correlation": 99981 } }, { "ph": "s", "id": 99981, "pid": 76337, "tid": -914061504, "ts": 1716454222950059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222950124, "dur": 0, "args": { "External id": 99993, "cbid": 317, "correlation": 99993 } }, { "ph": "f", "id": 99993, "pid": 76337, "tid": -914061504, "ts": 1716454222950124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222950125, "dur": 0, "args": { "External id": 99994, "cbid": 203, "correlation": 99994 } }, { "ph": "f", "id": 99994, "pid": 76337, "tid": -914061504, "ts": 1716454222950125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222950125, "dur": 0, "args": { "External id": 99995, "cbid": 205, "correlation": 99995 } }, { "ph": "f", "id": 99995, "pid": 76337, "tid": -914061504, "ts": 1716454222950125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950150, "dur": 1, "args": { "External id": 99999, "cbid": 251, "correlation": 99999 } }, { "ph": "f", "id": 99999, "pid": 76337, "tid": -914061504, "ts": 1716454222950150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950152, "dur": 0, "args": { "External id": 100000, "cbid": 251, "correlation": 100000 } }, { "ph": "f", "id": 100000, "pid": 76337, "tid": -914061504, "ts": 1716454222950152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950152, "dur": 0, "args": { "External id": 100001, "cbid": 251, "correlation": 100001 } }, { "ph": "f", "id": 100001, "pid": 76337, "tid": -914061504, "ts": 1716454222950152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950153, "dur": 0, "args": { "External id": 100002, "cbid": 251, "correlation": 100002 } }, { "ph": "f", "id": 100002, "pid": 76337, "tid": -914061504, "ts": 1716454222950153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950154, "dur": 0, "args": { "External id": 100003, "cbid": 251, "correlation": 100003 } }, { "ph": "f", "id": 100003, "pid": 76337, "tid": -914061504, "ts": 1716454222950154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950155, "dur": 0, "args": { "External id": 100004, "cbid": 251, "correlation": 100004 } }, { "ph": "f", "id": 100004, "pid": 76337, "tid": -914061504, "ts": 1716454222950155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950156, "dur": 0, "args": { "External id": 100005, "cbid": 251, "correlation": 100005 } }, { "ph": "f", "id": 100005, "pid": 76337, "tid": -914061504, "ts": 1716454222950156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950157, "dur": 0, "args": { "External id": 100006, "cbid": 251, "correlation": 100006 } }, { "ph": "f", "id": 100006, "pid": 76337, "tid": -914061504, "ts": 1716454222950157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950158, "dur": 0, "args": { "External id": 100007, "cbid": 251, "correlation": 100007 } }, { "ph": "f", "id": 100007, "pid": 76337, "tid": -914061504, "ts": 1716454222950158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222971513, "dur": 113, "args": { "External id": 100008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100008, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100008, "pid": 5, "tid": 7, "ts": 1716454222971513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950160, "dur": 12, "args": { "External id": 100008, "cbid": 211, "correlation": 100008 } }, { "ph": "s", "id": 100008, "pid": 76337, "tid": -914061504, "ts": 1716454222950160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222971628, "dur": 59, "args": { "External id": 100014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100014, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100014, "pid": 5, "tid": 7, "ts": 1716454222971628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950195, "dur": 9, "args": { "External id": 100014, "cbid": 211, "correlation": 100014 } }, { "ph": "s", "id": 100014, "pid": 76337, "tid": -914061504, "ts": 1716454222950195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222971688, "dur": 50, "args": { "External id": 100022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100022, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100022, "pid": 5, "tid": 7, "ts": 1716454222971688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950227, "dur": 8, "args": { "External id": 100022, "cbid": 211, "correlation": 100022 } }, { "ph": "s", "id": 100022, "pid": 76337, "tid": -914061504, "ts": 1716454222950227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222971739, "dur": 51, "args": { "External id": 100042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100042, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100042, "pid": 5, "tid": 7, "ts": 1716454222971739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950302, "dur": 12, "args": { "External id": 100042, "cbid": 211, "correlation": 100042 } }, { "ph": "s", "id": 100042, "pid": 76337, "tid": -914061504, "ts": 1716454222950302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222971792, "dur": 5, "args": { "External id": 100054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100054, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100054, "pid": 5, "tid": 7, "ts": 1716454222971792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950324, "dur": 6, "args": { "External id": 100054, "cbid": 211, "correlation": 100054 } }, { "ph": "s", "id": 100054, "pid": 76337, "tid": -914061504, "ts": 1716454222950324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222971798, "dur": 55, "args": { "External id": 100057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100057, "pid": 5, "tid": 7, "ts": 1716454222971798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950342, "dur": 7, "args": { "External id": 100057, "cbid": 211, "correlation": 100057 } }, { "ph": "s", "id": 100057, "pid": 76337, "tid": -914061504, "ts": 1716454222950342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222971855, "dur": 37, "args": { "External id": 100066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100066, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100066, "pid": 5, "tid": 7, "ts": 1716454222971855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950382, "dur": 10, "args": { "External id": 100066, "cbid": 211, "correlation": 100066 } }, { "ph": "s", "id": 100066, "pid": 76337, "tid": -914061504, "ts": 1716454222950382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222950434, "dur": 0, "args": { "External id": 100076, "cbid": 317, "correlation": 100076 } }, { "ph": "f", "id": 100076, "pid": 76337, "tid": -914061504, "ts": 1716454222950434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222950435, "dur": 0, "args": { "External id": 100077, "cbid": 203, "correlation": 100077 } }, { "ph": "f", "id": 100077, "pid": 76337, "tid": -914061504, "ts": 1716454222950435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222950435, "dur": 0, "args": { "External id": 100078, "cbid": 205, "correlation": 100078 } }, { "ph": "f", "id": 100078, "pid": 76337, "tid": -914061504, "ts": 1716454222950435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222971893, "dur": 41, "args": { "External id": 100082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100082, "pid": 5, "tid": 7, "ts": 1716454222971893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950451, "dur": 11, "args": { "External id": 100082, "cbid": 211, "correlation": 100082 } }, { "ph": "s", "id": 100082, "pid": 76337, "tid": -914061504, "ts": 1716454222950451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222971935, "dur": 14, "args": { "External id": 100084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100084, "pid": 5, "tid": 7, "ts": 1716454222971935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950464, "dur": 5, "args": { "External id": 100084, "cbid": 211, "correlation": 100084 } }, { "ph": "s", "id": 100084, "pid": 76337, "tid": -914061504, "ts": 1716454222950464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222971950, "dur": 4, "args": { "External id": 100086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100086, "pid": 5, "tid": 7, "ts": 1716454222971950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950474, "dur": 6, "args": { "External id": 100086, "cbid": 211, "correlation": 100086 } }, { "ph": "s", "id": 100086, "pid": 76337, "tid": -914061504, "ts": 1716454222950474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222950482, "dur": 0, "args": { "External id": 100087, "cbid": 51, "correlation": 100087 } }, { "ph": "s", "id": 100087, "pid": 76337, "tid": -914061504, "ts": 1716454222950482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222971955, "dur": 689, "args": { "External id": 100088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100088, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100088, "pid": 5, "tid": 7, "ts": 1716454222971955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950483, "dur": 5, "args": { "External id": 100088, "cbid": 211, "correlation": 100088 } }, { "ph": "s", "id": 100088, "pid": 76337, "tid": -914061504, "ts": 1716454222950483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222972645, "dur": 59, "args": { "External id": 100093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100093, "pid": 5, "tid": 7, "ts": 1716454222972645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950511, "dur": 9, "args": { "External id": 100093, "cbid": 211, "correlation": 100093 } }, { "ph": "s", "id": 100093, "pid": 76337, "tid": -914061504, "ts": 1716454222950511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222972706, "dur": 4, "args": { "External id": 100101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100101, "pid": 5, "tid": 7, "ts": 1716454222972706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950554, "dur": 9, "args": { "External id": 100101, "cbid": 211, "correlation": 100101 } }, { "ph": "s", "id": 100101, "pid": 76337, "tid": -914061504, "ts": 1716454222950554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950619, "dur": 1, "args": { "External id": 100117, "cbid": 251, "correlation": 100117 } }, { "ph": "f", "id": 100117, "pid": 76337, "tid": -914061504, "ts": 1716454222950619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222950625, "dur": 0, "args": { "External id": 100119, "cbid": 251, "correlation": 100119 } }, { "ph": "f", "id": 100119, "pid": 76337, "tid": -914061504, "ts": 1716454222950625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222972710, "dur": 11, "args": { "External id": 100120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100120, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 100120, "pid": 5, "tid": 7, "ts": 1716454222972710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950626, "dur": 11, "args": { "External id": 100120, "cbid": 211, "correlation": 100120 } }, { "ph": "s", "id": 100120, "pid": 76337, "tid": -914061504, "ts": 1716454222950626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222972723, "dur": 5, "args": { "External id": 100122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100122, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 100122, "pid": 5, "tid": 7, "ts": 1716454222972723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950640, "dur": 6, "args": { "External id": 100122, "cbid": 211, "correlation": 100122 } }, { "ph": "s", "id": 100122, "pid": 76337, "tid": -914061504, "ts": 1716454222950640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222972729, "dur": 52, "args": { "External id": 100132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100132, "pid": 5, "tid": 7, "ts": 1716454222972729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950698, "dur": 12, "args": { "External id": 100132, "cbid": 211, "correlation": 100132 } }, { "ph": "s", "id": 100132, "pid": 76337, "tid": -914061504, "ts": 1716454222950698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222972782, "dur": 50, "args": { "External id": 100152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100152, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100152, "pid": 5, "tid": 7, "ts": 1716454222972782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950764, "dur": 11, "args": { "External id": 100152, "cbid": 211, "correlation": 100152 } }, { "ph": "s", "id": 100152, "pid": 76337, "tid": -914061504, "ts": 1716454222950764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222972834, "dur": 4, "args": { "External id": 100164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100164, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100164, "pid": 5, "tid": 7, "ts": 1716454222972834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950785, "dur": 6, "args": { "External id": 100164, "cbid": 211, "correlation": 100164 } }, { "ph": "s", "id": 100164, "pid": 76337, "tid": -914061504, "ts": 1716454222950785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222972839, "dur": 55, "args": { "External id": 100167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100167, "pid": 5, "tid": 7, "ts": 1716454222972839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950803, "dur": 7, "args": { "External id": 100167, "cbid": 211, "correlation": 100167 } }, { "ph": "s", "id": 100167, "pid": 76337, "tid": -914061504, "ts": 1716454222950803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222972895, "dur": 37, "args": { "External id": 100176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100176, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100176, "pid": 5, "tid": 7, "ts": 1716454222972895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950843, "dur": 10, "args": { "External id": 100176, "cbid": 211, "correlation": 100176 } }, { "ph": "s", "id": 100176, "pid": 76337, "tid": -914061504, "ts": 1716454222950843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222950908, "dur": 0, "args": { "External id": 100186, "cbid": 317, "correlation": 100186 } }, { "ph": "f", "id": 100186, "pid": 76337, "tid": -914061504, "ts": 1716454222950908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222950908, "dur": 0, "args": { "External id": 100187, "cbid": 203, "correlation": 100187 } }, { "ph": "f", "id": 100187, "pid": 76337, "tid": -914061504, "ts": 1716454222950908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222950909, "dur": 0, "args": { "External id": 100188, "cbid": 205, "correlation": 100188 } }, { "ph": "f", "id": 100188, "pid": 76337, "tid": -914061504, "ts": 1716454222950909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222972933, "dur": 41, "args": { "External id": 100192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100192, "pid": 5, "tid": 7, "ts": 1716454222972933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950923, "dur": 12, "args": { "External id": 100192, "cbid": 211, "correlation": 100192 } }, { "ph": "s", "id": 100192, "pid": 76337, "tid": -914061504, "ts": 1716454222950923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222972975, "dur": 14, "args": { "External id": 100194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100194, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100194, "pid": 5, "tid": 7, "ts": 1716454222972975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950937, "dur": 5, "args": { "External id": 100194, "cbid": 211, "correlation": 100194 } }, { "ph": "s", "id": 100194, "pid": 76337, "tid": -914061504, "ts": 1716454222950937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222972990, "dur": 3, "args": { "External id": 100196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100196, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100196, "pid": 5, "tid": 7, "ts": 1716454222972990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950946, "dur": 6, "args": { "External id": 100196, "cbid": 211, "correlation": 100196 } }, { "ph": "s", "id": 100196, "pid": 76337, "tid": -914061504, "ts": 1716454222950946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222950955, "dur": 0, "args": { "External id": 100197, "cbid": 51, "correlation": 100197 } }, { "ph": "s", "id": 100197, "pid": 76337, "tid": -914061504, "ts": 1716454222950955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222972994, "dur": 685, "args": { "External id": 100198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100198, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100198, "pid": 5, "tid": 7, "ts": 1716454222972994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950956, "dur": 5, "args": { "External id": 100198, "cbid": 211, "correlation": 100198 } }, { "ph": "s", "id": 100198, "pid": 76337, "tid": -914061504, "ts": 1716454222950956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222973680, "dur": 57, "args": { "External id": 100203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100203, "pid": 5, "tid": 7, "ts": 1716454222973680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222950992, "dur": 11, "args": { "External id": 100203, "cbid": 211, "correlation": 100203 } }, { "ph": "s", "id": 100203, "pid": 76337, "tid": -914061504, "ts": 1716454222950992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222973739, "dur": 50, "args": { "External id": 100211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100211, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100211, "pid": 5, "tid": 7, "ts": 1716454222973739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951026, "dur": 9, "args": { "External id": 100211, "cbid": 211, "correlation": 100211 } }, { "ph": "s", "id": 100211, "pid": 76337, "tid": -914061504, "ts": 1716454222951026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222973790, "dur": 36, "args": { "External id": 100219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100219, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100219, "pid": 5, "tid": 7, "ts": 1716454222973790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951056, "dur": 8, "args": { "External id": 100219, "cbid": 211, "correlation": 100219 } }, { "ph": "s", "id": 100219, "pid": 76337, "tid": -914061504, "ts": 1716454222951056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222973827, "dur": 50, "args": { "External id": 100239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100239, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100239, "pid": 5, "tid": 7, "ts": 1716454222973827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951135, "dur": 13, "args": { "External id": 100239, "cbid": 211, "correlation": 100239 } }, { "ph": "s", "id": 100239, "pid": 76337, "tid": -914061504, "ts": 1716454222951135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222973879, "dur": 4, "args": { "External id": 100251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100251, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100251, "pid": 5, "tid": 7, "ts": 1716454222973879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951158, "dur": 6, "args": { "External id": 100251, "cbid": 211, "correlation": 100251 } }, { "ph": "s", "id": 100251, "pid": 76337, "tid": -914061504, "ts": 1716454222951158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222973884, "dur": 54, "args": { "External id": 100254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100254, "pid": 5, "tid": 7, "ts": 1716454222973884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951175, "dur": 6, "args": { "External id": 100254, "cbid": 211, "correlation": 100254 } }, { "ph": "s", "id": 100254, "pid": 76337, "tid": -914061504, "ts": 1716454222951175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222951233, "dur": 0, "args": { "External id": 100265, "cbid": 317, "correlation": 100265 } }, { "ph": "f", "id": 100265, "pid": 76337, "tid": -914061504, "ts": 1716454222951233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222951234, "dur": 0, "args": { "External id": 100266, "cbid": 203, "correlation": 100266 } }, { "ph": "f", "id": 100266, "pid": 76337, "tid": -914061504, "ts": 1716454222951234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222951234, "dur": 0, "args": { "External id": 100267, "cbid": 205, "correlation": 100267 } }, { "ph": "f", "id": 100267, "pid": 76337, "tid": -914061504, "ts": 1716454222951234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951257, "dur": 1, "args": { "External id": 100271, "cbid": 251, "correlation": 100271 } }, { "ph": "f", "id": 100271, "pid": 76337, "tid": -914061504, "ts": 1716454222951257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951259, "dur": 0, "args": { "External id": 100272, "cbid": 251, "correlation": 100272 } }, { "ph": "f", "id": 100272, "pid": 76337, "tid": -914061504, "ts": 1716454222951259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951260, "dur": 0, "args": { "External id": 100273, "cbid": 251, "correlation": 100273 } }, { "ph": "f", "id": 100273, "pid": 76337, "tid": -914061504, "ts": 1716454222951260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951260, "dur": 0, "args": { "External id": 100274, "cbid": 251, "correlation": 100274 } }, { "ph": "f", "id": 100274, "pid": 76337, "tid": -914061504, "ts": 1716454222951260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951261, "dur": 0, "args": { "External id": 100275, "cbid": 251, "correlation": 100275 } }, { "ph": "f", "id": 100275, "pid": 76337, "tid": -914061504, "ts": 1716454222951261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951261, "dur": 0, "args": { "External id": 100276, "cbid": 251, "correlation": 100276 } }, { "ph": "f", "id": 100276, "pid": 76337, "tid": -914061504, "ts": 1716454222951261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951262, "dur": 0, "args": { "External id": 100277, "cbid": 251, "correlation": 100277 } }, { "ph": "f", "id": 100277, "pid": 76337, "tid": -914061504, "ts": 1716454222951262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951263, "dur": 0, "args": { "External id": 100278, "cbid": 251, "correlation": 100278 } }, { "ph": "f", "id": 100278, "pid": 76337, "tid": -914061504, "ts": 1716454222951263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951264, "dur": 0, "args": { "External id": 100279, "cbid": 251, "correlation": 100279 } }, { "ph": "f", "id": 100279, "pid": 76337, "tid": -914061504, "ts": 1716454222951264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222973939, "dur": 111, "args": { "External id": 100280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100280, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100280, "pid": 5, "tid": 7, "ts": 1716454222973939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951266, "dur": 12, "args": { "External id": 100280, "cbid": 211, "correlation": 100280 } }, { "ph": "s", "id": 100280, "pid": 76337, "tid": -914061504, "ts": 1716454222951266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222974051, "dur": 59, "args": { "External id": 100286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100286, "pid": 5, "tid": 7, "ts": 1716454222974051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951302, "dur": 9, "args": { "External id": 100286, "cbid": 211, "correlation": 100286 } }, { "ph": "s", "id": 100286, "pid": 76337, "tid": -914061504, "ts": 1716454222951302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222974111, "dur": 693, "args": { "External id": 100295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100295, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100295, "pid": 5, "tid": 7, "ts": 1716454222974111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951384, "dur": 15, "args": { "External id": 100295, "cbid": 211, "correlation": 100295 } }, { "ph": "s", "id": 100295, "pid": 76337, "tid": -914061504, "ts": 1716454222951384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222974805, "dur": 176, "args": { "External id": 100317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100317, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100317, "pid": 5, "tid": 7, "ts": 1716454222974805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951440, "dur": 11, "args": { "External id": 100317, "cbid": 211, "correlation": 100317 } }, { "ph": "s", "id": 100317, "pid": 76337, "tid": -914061504, "ts": 1716454222951440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951527, "dur": 1, "args": { "External id": 100328, "cbid": 251, "correlation": 100328 } }, { "ph": "f", "id": 100328, "pid": 76337, "tid": -914061504, "ts": 1716454222951527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222974983, "dur": 193, "args": { "External id": 100329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100329, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100329, "pid": 5, "tid": 7, "ts": 1716454222974983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951532, "dur": 14, "args": { "External id": 100329, "cbid": 211, "correlation": 100329 } }, { "ph": "s", "id": 100329, "pid": 76337, "tid": -914061504, "ts": 1716454222951532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951601, "dur": 1, "args": { "External id": 100340, "cbid": 251, "correlation": 100340 } }, { "ph": "f", "id": 100340, "pid": 76337, "tid": -914061504, "ts": 1716454222951601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222975177, "dur": 186, "args": { "External id": 100341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100341, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100341, "pid": 5, "tid": 7, "ts": 1716454222975177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951605, "dur": 11, "args": { "External id": 100341, "cbid": 211, "correlation": 100341 } }, { "ph": "s", "id": 100341, "pid": 76337, "tid": -914061504, "ts": 1716454222951605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951668, "dur": 1, "args": { "External id": 100352, "cbid": 251, "correlation": 100352 } }, { "ph": "f", "id": 100352, "pid": 76337, "tid": -914061504, "ts": 1716454222951668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222975364, "dur": 187, "args": { "External id": 100353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100353, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100353, "pid": 5, "tid": 7, "ts": 1716454222975364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951672, "dur": 11, "args": { "External id": 100353, "cbid": 211, "correlation": 100353 } }, { "ph": "s", "id": 100353, "pid": 76337, "tid": -914061504, "ts": 1716454222951672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222975552, "dur": 18220, "args": { "External id": 100374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100374, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100374, "pid": 5, "tid": 7, "ts": 1716454222975552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951752, "dur": 12, "args": { "External id": 100374, "cbid": 211, "correlation": 100374 } }, { "ph": "s", "id": 100374, "pid": 76337, "tid": -914061504, "ts": 1716454222951752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222951851, "dur": 1, "args": { "External id": 100392, "cbid": 251, "correlation": 100392 } }, { "ph": "f", "id": 100392, "pid": 76337, "tid": -914061504, "ts": 1716454222951851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222993773, "dur": 197, "args": { "External id": 100394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100394, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100394, "pid": 5, "tid": 7, "ts": 1716454222993773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951856, "dur": 13, "args": { "External id": 100394, "cbid": 211, "correlation": 100394 } }, { "ph": "s", "id": 100394, "pid": 76337, "tid": -914061504, "ts": 1716454222951856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222993972, "dur": 66, "args": { "External id": 100402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100402, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100402, "pid": 5, "tid": 7, "ts": 1716454222993972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951927, "dur": 12, "args": { "External id": 100402, "cbid": 211, "correlation": 100402 } }, { "ph": "s", "id": 100402, "pid": 76337, "tid": -914061504, "ts": 1716454222951927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222994040, "dur": 97, "args": { "External id": 100410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100410, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100410, "pid": 5, "tid": 7, "ts": 1716454222994040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222951966, "dur": 16, "args": { "External id": 100410, "cbid": 211, "correlation": 100410 } }, { "ph": "s", "id": 100410, "pid": 76337, "tid": -914061504, "ts": 1716454222951966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222994137, "dur": 53, "args": { "External id": 100421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100421, "pid": 5, "tid": 7, "ts": 1716454222994137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952047, "dur": 13, "args": { "External id": 100421, "cbid": 211, "correlation": 100421 } }, { "ph": "s", "id": 100421, "pid": 76337, "tid": -914061504, "ts": 1716454222952047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222994192, "dur": 90, "args": { "External id": 100443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100443, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100443, "pid": 5, "tid": 7, "ts": 1716454222994192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952078, "dur": 8, "args": { "External id": 100443, "cbid": 211, "correlation": 100443 } }, { "ph": "s", "id": 100443, "pid": 76337, "tid": -914061504, "ts": 1716454222952078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952163, "dur": 1, "args": { "External id": 100454, "cbid": 251, "correlation": 100454 } }, { "ph": "f", "id": 100454, "pid": 76337, "tid": -914061504, "ts": 1716454222952163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222994283, "dur": 103, "args": { "External id": 100455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100455, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100455, "pid": 5, "tid": 7, "ts": 1716454222994283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952168, "dur": 12, "args": { "External id": 100455, "cbid": 211, "correlation": 100455 } }, { "ph": "s", "id": 100455, "pid": 76337, "tid": -914061504, "ts": 1716454222952168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952239, "dur": 1, "args": { "External id": 100466, "cbid": 251, "correlation": 100466 } }, { "ph": "f", "id": 100466, "pid": 76337, "tid": -914061504, "ts": 1716454222952239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952242, "dur": 0, "args": { "External id": 100467, "cbid": 251, "correlation": 100467 } }, { "ph": "f", "id": 100467, "pid": 76337, "tid": -914061504, "ts": 1716454222952242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222994388, "dur": 10, "args": { "External id": 100468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100468, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 100468, "pid": 5, "tid": 7, "ts": 1716454222994388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952244, "dur": 12, "args": { "External id": 100468, "cbid": 211, "correlation": 100468 } }, { "ph": "s", "id": 100468, "pid": 76337, "tid": -914061504, "ts": 1716454222952244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222994400, "dur": 5, "args": { "External id": 100470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100470, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 100470, "pid": 5, "tid": 7, "ts": 1716454222994400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952259, "dur": 6, "args": { "External id": 100470, "cbid": 211, "correlation": 100470 } }, { "ph": "s", "id": 100470, "pid": 76337, "tid": -914061504, "ts": 1716454222952259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952320, "dur": 1, "args": { "External id": 100481, "cbid": 251, "correlation": 100481 } }, { "ph": "f", "id": 100481, "pid": 76337, "tid": -914061504, "ts": 1716454222952320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952323, "dur": 0, "args": { "External id": 100482, "cbid": 251, "correlation": 100482 } }, { "ph": "f", "id": 100482, "pid": 76337, "tid": -914061504, "ts": 1716454222952323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222994406, "dur": 6, "args": { "External id": 100483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100483, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 100483, "pid": 5, "tid": 7, "ts": 1716454222994406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952325, "dur": 11, "args": { "External id": 100483, "cbid": 211, "correlation": 100483 } }, { "ph": "s", "id": 100483, "pid": 76337, "tid": -914061504, "ts": 1716454222952325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222994413, "dur": 3, "args": { "External id": 100485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100485, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 100485, "pid": 5, "tid": 7, "ts": 1716454222994413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952338, "dur": 6, "args": { "External id": 100485, "cbid": 211, "correlation": 100485 } }, { "ph": "s", "id": 100485, "pid": 76337, "tid": -914061504, "ts": 1716454222952338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222994418, "dur": 152, "args": { "External id": 100506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100506, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100506, "pid": 5, "tid": 7, "ts": 1716454222994418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952412, "dur": 12, "args": { "External id": 100506, "cbid": 211, "correlation": 100506 } }, { "ph": "s", "id": 100506, "pid": 76337, "tid": -914061504, "ts": 1716454222952412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952508, "dur": 1, "args": { "External id": 100524, "cbid": 251, "correlation": 100524 } }, { "ph": "f", "id": 100524, "pid": 76337, "tid": -914061504, "ts": 1716454222952508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222994571, "dur": 107, "args": { "External id": 100526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100526, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100526, "pid": 5, "tid": 7, "ts": 1716454222994571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952514, "dur": 13, "args": { "External id": 100526, "cbid": 211, "correlation": 100526 } }, { "ph": "s", "id": 100526, "pid": 76337, "tid": -914061504, "ts": 1716454222952514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222994679, "dur": 34, "args": { "External id": 100534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100534, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100534, "pid": 5, "tid": 7, "ts": 1716454222994679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952584, "dur": 13, "args": { "External id": 100534, "cbid": 211, "correlation": 100534 } }, { "ph": "s", "id": 100534, "pid": 76337, "tid": -914061504, "ts": 1716454222952584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222994715, "dur": 66, "args": { "External id": 100542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100542, "pid": 5, "tid": 7, "ts": 1716454222994715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952625, "dur": 9, "args": { "External id": 100542, "cbid": 211, "correlation": 100542 } }, { "ph": "s", "id": 100542, "pid": 76337, "tid": -914061504, "ts": 1716454222952625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222994782, "dur": 90, "args": { "External id": 100564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100564, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100564, "pid": 5, "tid": 7, "ts": 1716454222994782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952675, "dur": 10, "args": { "External id": 100564, "cbid": 211, "correlation": 100564 } }, { "ph": "s", "id": 100564, "pid": 76337, "tid": -914061504, "ts": 1716454222952675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952762, "dur": 1, "args": { "External id": 100580, "cbid": 251, "correlation": 100580 } }, { "ph": "f", "id": 100580, "pid": 76337, "tid": -914061504, "ts": 1716454222952762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222994874, "dur": 564, "args": { "External id": 100582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100582, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100582, "pid": 5, "tid": 7, "ts": 1716454222994874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952768, "dur": 13, "args": { "External id": 100582, "cbid": 211, "correlation": 100582 } }, { "ph": "s", "id": 100582, "pid": 76337, "tid": -914061504, "ts": 1716454222952768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222995439, "dur": 240, "args": { "External id": 100590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100590, "pid": 5, "tid": 7, "ts": 1716454222995439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952833, "dur": 13, "args": { "External id": 100590, "cbid": 211, "correlation": 100590 } }, { "ph": "s", "id": 100590, "pid": 76337, "tid": -914061504, "ts": 1716454222952833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222995681, "dur": 251, "args": { "External id": 100598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100598, "pid": 5, "tid": 7, "ts": 1716454222995681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952864, "dur": 8, "args": { "External id": 100598, "cbid": 211, "correlation": 100598 } }, { "ph": "s", "id": 100598, "pid": 76337, "tid": -914061504, "ts": 1716454222952864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952947, "dur": 2, "args": { "External id": 100614, "cbid": 251, "correlation": 100614 } }, { "ph": "f", "id": 100614, "pid": 76337, "tid": -914061504, "ts": 1716454222952947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222952953, "dur": 0, "args": { "External id": 100616, "cbid": 251, "correlation": 100616 } }, { "ph": "f", "id": 100616, "pid": 76337, "tid": -914061504, "ts": 1716454222952953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454222995933, "dur": 356, "args": { "External id": 100617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100617, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 100617, "pid": 5, "tid": 7, "ts": 1716454222995933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222952955, "dur": 13, "args": { "External id": 100617, "cbid": 211, "correlation": 100617 } }, { "ph": "s", "id": 100617, "pid": 76337, "tid": -914061504, "ts": 1716454222952955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222996291, "dur": 50, "args": { "External id": 100625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100625, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100625, "pid": 5, "tid": 7, "ts": 1716454222996291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953007, "dur": 10, "args": { "External id": 100625, "cbid": 211, "correlation": 100625 } }, { "ph": "s", "id": 100625, "pid": 76337, "tid": -914061504, "ts": 1716454222953007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222996342, "dur": 154, "args": { "External id": 100636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100636, "pid": 5, "tid": 7, "ts": 1716454222996342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953075, "dur": 12, "args": { "External id": 100636, "cbid": 211, "correlation": 100636 } }, { "ph": "s", "id": 100636, "pid": 76337, "tid": -914061504, "ts": 1716454222953075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222953139, "dur": 0, "args": { "External id": 100648, "cbid": 317, "correlation": 100648 } }, { "ph": "f", "id": 100648, "pid": 76337, "tid": -914061504, "ts": 1716454222953139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222953140, "dur": 0, "args": { "External id": 100649, "cbid": 203, "correlation": 100649 } }, { "ph": "f", "id": 100649, "pid": 76337, "tid": -914061504, "ts": 1716454222953140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222953141, "dur": 0, "args": { "External id": 100650, "cbid": 205, "correlation": 100650 } }, { "ph": "f", "id": 100650, "pid": 76337, "tid": -914061504, "ts": 1716454222953141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953164, "dur": 1, "args": { "External id": 100654, "cbid": 251, "correlation": 100654 } }, { "ph": "f", "id": 100654, "pid": 76337, "tid": -914061504, "ts": 1716454222953164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953166, "dur": 0, "args": { "External id": 100655, "cbid": 251, "correlation": 100655 } }, { "ph": "f", "id": 100655, "pid": 76337, "tid": -914061504, "ts": 1716454222953166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953166, "dur": 0, "args": { "External id": 100656, "cbid": 251, "correlation": 100656 } }, { "ph": "f", "id": 100656, "pid": 76337, "tid": -914061504, "ts": 1716454222953166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953167, "dur": 0, "args": { "External id": 100657, "cbid": 251, "correlation": 100657 } }, { "ph": "f", "id": 100657, "pid": 76337, "tid": -914061504, "ts": 1716454222953167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953167, "dur": 0, "args": { "External id": 100658, "cbid": 251, "correlation": 100658 } }, { "ph": "f", "id": 100658, "pid": 76337, "tid": -914061504, "ts": 1716454222953167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953168, "dur": 0, "args": { "External id": 100659, "cbid": 251, "correlation": 100659 } }, { "ph": "f", "id": 100659, "pid": 76337, "tid": -914061504, "ts": 1716454222953168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953169, "dur": 0, "args": { "External id": 100660, "cbid": 251, "correlation": 100660 } }, { "ph": "f", "id": 100660, "pid": 76337, "tid": -914061504, "ts": 1716454222953169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953170, "dur": 0, "args": { "External id": 100661, "cbid": 251, "correlation": 100661 } }, { "ph": "f", "id": 100661, "pid": 76337, "tid": -914061504, "ts": 1716454222953170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953171, "dur": 0, "args": { "External id": 100662, "cbid": 251, "correlation": 100662 } }, { "ph": "f", "id": 100662, "pid": 76337, "tid": -914061504, "ts": 1716454222953171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222996498, "dur": 112, "args": { "External id": 100663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100663, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100663, "pid": 5, "tid": 7, "ts": 1716454222996498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953173, "dur": 12, "args": { "External id": 100663, "cbid": 211, "correlation": 100663 } }, { "ph": "s", "id": 100663, "pid": 76337, "tid": -914061504, "ts": 1716454222953173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222996612, "dur": 59, "args": { "External id": 100669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100669, "pid": 5, "tid": 7, "ts": 1716454222996612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953208, "dur": 9, "args": { "External id": 100669, "cbid": 211, "correlation": 100669 } }, { "ph": "s", "id": 100669, "pid": 76337, "tid": -914061504, "ts": 1716454222953208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222996672, "dur": 50, "args": { "External id": 100677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100677, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100677, "pid": 5, "tid": 7, "ts": 1716454222996672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953240, "dur": 9, "args": { "External id": 100677, "cbid": 211, "correlation": 100677 } }, { "ph": "s", "id": 100677, "pid": 76337, "tid": -914061504, "ts": 1716454222953240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222953313, "dur": 0, "args": { "External id": 100687, "cbid": 317, "correlation": 100687 } }, { "ph": "f", "id": 100687, "pid": 76337, "tid": -914061504, "ts": 1716454222953313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222953314, "dur": 0, "args": { "External id": 100688, "cbid": 203, "correlation": 100688 } }, { "ph": "f", "id": 100688, "pid": 76337, "tid": -914061504, "ts": 1716454222953314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222953314, "dur": 0, "args": { "External id": 100689, "cbid": 205, "correlation": 100689 } }, { "ph": "f", "id": 100689, "pid": 76337, "tid": -914061504, "ts": 1716454222953314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222996723, "dur": 41, "args": { "External id": 100693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100693, "pid": 5, "tid": 7, "ts": 1716454222996723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953330, "dur": 12, "args": { "External id": 100693, "cbid": 211, "correlation": 100693 } }, { "ph": "s", "id": 100693, "pid": 76337, "tid": -914061504, "ts": 1716454222953330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222996765, "dur": 14, "args": { "External id": 100695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100695, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100695, "pid": 5, "tid": 7, "ts": 1716454222996765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953346, "dur": 6, "args": { "External id": 100695, "cbid": 211, "correlation": 100695 } }, { "ph": "s", "id": 100695, "pid": 76337, "tid": -914061504, "ts": 1716454222953346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454222996782, "dur": 1, "args": { "External id": 100697, "device": 5, "context": 1, "stream": 7, "correlation": 100697, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 100697, "pid": 5, "tid": 7, "ts": 1716454222996782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222953366, "dur": 18, "args": { "External id": 100697, "cbid": 51, "correlation": 100697 } }, { "ph": "s", "id": 100697, "pid": 76337, "tid": -914061504, "ts": 1716454222953366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222996786, "dur": 352, "args": { "External id": 100698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100698, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100698, "pid": 5, "tid": 7, "ts": 1716454222996786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953386, "dur": 11, "args": { "External id": 100698, "cbid": 211, "correlation": 100698 } }, { "ph": "s", "id": 100698, "pid": 76337, "tid": -914061504, "ts": 1716454222953386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997139, "dur": 12, "args": { "External id": 100700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100700, "pid": 5, "tid": 7, "ts": 1716454222997139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953405, "dur": 7, "args": { "External id": 100700, "cbid": 211, "correlation": 100700 } }, { "ph": "s", "id": 100700, "pid": 76337, "tid": -914061504, "ts": 1716454222953405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222997153, "dur": 15, "args": { "External id": 100706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100706, "pid": 5, "tid": 7, "ts": 1716454222997153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953435, "dur": 10, "args": { "External id": 100706, "cbid": 211, "correlation": 100706 } }, { "ph": "s", "id": 100706, "pid": 76337, "tid": -914061504, "ts": 1716454222953435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222997169, "dur": 18, "args": { "External id": 100726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100726, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100726, "pid": 5, "tid": 7, "ts": 1716454222997169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953528, "dur": 13, "args": { "External id": 100726, "cbid": 211, "correlation": 100726 } }, { "ph": "s", "id": 100726, "pid": 76337, "tid": -914061504, "ts": 1716454222953528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222997188, "dur": 4, "args": { "External id": 100738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100738, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100738, "pid": 5, "tid": 7, "ts": 1716454222997188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953551, "dur": 6, "args": { "External id": 100738, "cbid": 211, "correlation": 100738 } }, { "ph": "s", "id": 100738, "pid": 76337, "tid": -914061504, "ts": 1716454222953551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222997194, "dur": 18, "args": { "External id": 100741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100741, "pid": 5, "tid": 7, "ts": 1716454222997194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953569, "dur": 6, "args": { "External id": 100741, "cbid": 211, "correlation": 100741 } }, { "ph": "s", "id": 100741, "pid": 76337, "tid": -914061504, "ts": 1716454222953569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222997213, "dur": 11, "args": { "External id": 100750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100750, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100750, "pid": 5, "tid": 7, "ts": 1716454222997213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953610, "dur": 9, "args": { "External id": 100750, "cbid": 211, "correlation": 100750 } }, { "ph": "s", "id": 100750, "pid": 76337, "tid": -914061504, "ts": 1716454222953610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222953664, "dur": 0, "args": { "External id": 100760, "cbid": 317, "correlation": 100760 } }, { "ph": "f", "id": 100760, "pid": 76337, "tid": -914061504, "ts": 1716454222953664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222953665, "dur": 0, "args": { "External id": 100761, "cbid": 203, "correlation": 100761 } }, { "ph": "f", "id": 100761, "pid": 76337, "tid": -914061504, "ts": 1716454222953665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222953666, "dur": 0, "args": { "External id": 100762, "cbid": 205, "correlation": 100762 } }, { "ph": "f", "id": 100762, "pid": 76337, "tid": -914061504, "ts": 1716454222953666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997225, "dur": 11, "args": { "External id": 100766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100766, "pid": 5, "tid": 7, "ts": 1716454222997225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953681, "dur": 12, "args": { "External id": 100766, "cbid": 211, "correlation": 100766 } }, { "ph": "s", "id": 100766, "pid": 76337, "tid": -914061504, "ts": 1716454222953681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997237, "dur": 24, "args": { "External id": 100768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100768, "pid": 5, "tid": 7, "ts": 1716454222997237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953696, "dur": 5, "args": { "External id": 100768, "cbid": 211, "correlation": 100768 } }, { "ph": "s", "id": 100768, "pid": 76337, "tid": -914061504, "ts": 1716454222953696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454222997262, "dur": 4, "args": { "External id": 100770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 100770, "pid": 5, "tid": 7, "ts": 1716454222997262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953707, "dur": 6, "args": { "External id": 100770, "cbid": 211, "correlation": 100770 } }, { "ph": "s", "id": 100770, "pid": 76337, "tid": -914061504, "ts": 1716454222953707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222953716, "dur": 0, "args": { "External id": 100771, "cbid": 51, "correlation": 100771 } }, { "ph": "s", "id": 100771, "pid": 76337, "tid": -914061504, "ts": 1716454222953716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454222997267, "dur": 354, "args": { "External id": 100772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100772, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100772, "pid": 5, "tid": 7, "ts": 1716454222997267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953717, "dur": 7, "args": { "External id": 100772, "cbid": 211, "correlation": 100772 } }, { "ph": "s", "id": 100772, "pid": 76337, "tid": -914061504, "ts": 1716454222953717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997623, "dur": 20, "args": { "External id": 100773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100773, "pid": 5, "tid": 7, "ts": 1716454222997623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953727, "dur": 5, "args": { "External id": 100773, "cbid": 211, "correlation": 100773 } }, { "ph": "s", "id": 100773, "pid": 76337, "tid": -914061504, "ts": 1716454222953727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222997644, "dur": 32, "args": { "External id": 100779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100779, "pid": 5, "tid": 7, "ts": 1716454222997644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953755, "dur": 8, "args": { "External id": 100779, "cbid": 211, "correlation": 100779 } }, { "ph": "s", "id": 100779, "pid": 76337, "tid": -914061504, "ts": 1716454222953755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222997677, "dur": 3, "args": { "External id": 100787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100787, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 100787, "pid": 5, "tid": 7, "ts": 1716454222997677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953798, "dur": 9, "args": { "External id": 100787, "cbid": 211, "correlation": 100787 } }, { "ph": "s", "id": 100787, "pid": 76337, "tid": -914061504, "ts": 1716454222953798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953863, "dur": 1, "args": { "External id": 100803, "cbid": 251, "correlation": 100803 } }, { "ph": "f", "id": 100803, "pid": 76337, "tid": -914061504, "ts": 1716454222953863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222953868, "dur": 0, "args": { "External id": 100805, "cbid": 251, "correlation": 100805 } }, { "ph": "f", "id": 100805, "pid": 76337, "tid": -914061504, "ts": 1716454222953868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454222997682, "dur": 12, "args": { "External id": 100806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100806, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 100806, "pid": 5, "tid": 7, "ts": 1716454222997682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953870, "dur": 11, "args": { "External id": 100806, "cbid": 211, "correlation": 100806 } }, { "ph": "s", "id": 100806, "pid": 76337, "tid": -914061504, "ts": 1716454222953870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454222997696, "dur": 5, "args": { "External id": 100808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 100808, "pid": 5, "tid": 7, "ts": 1716454222997696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953883, "dur": 6, "args": { "External id": 100808, "cbid": 211, "correlation": 100808 } }, { "ph": "s", "id": 100808, "pid": 76337, "tid": -914061504, "ts": 1716454222953883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222997702, "dur": 30, "args": { "External id": 100818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100818, "pid": 5, "tid": 7, "ts": 1716454222997702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222953941, "dur": 12, "args": { "External id": 100818, "cbid": 211, "correlation": 100818 } }, { "ph": "s", "id": 100818, "pid": 76337, "tid": -914061504, "ts": 1716454222953941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222997733, "dur": 31, "args": { "External id": 100838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100838, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100838, "pid": 5, "tid": 7, "ts": 1716454222997733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954016, "dur": 11, "args": { "External id": 100838, "cbid": 211, "correlation": 100838 } }, { "ph": "s", "id": 100838, "pid": 76337, "tid": -914061504, "ts": 1716454222954016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222997765, "dur": 5, "args": { "External id": 100850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100850, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 100850, "pid": 5, "tid": 7, "ts": 1716454222997765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954037, "dur": 7, "args": { "External id": 100850, "cbid": 211, "correlation": 100850 } }, { "ph": "s", "id": 100850, "pid": 76337, "tid": -914061504, "ts": 1716454222954037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222997772, "dur": 30, "args": { "External id": 100853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100853, "pid": 5, "tid": 7, "ts": 1716454222997772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954058, "dur": 6, "args": { "External id": 100853, "cbid": 211, "correlation": 100853 } }, { "ph": "s", "id": 100853, "pid": 76337, "tid": -914061504, "ts": 1716454222954058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222997803, "dur": 20, "args": { "External id": 100862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100862, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100862, "pid": 5, "tid": 7, "ts": 1716454222997803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954099, "dur": 10, "args": { "External id": 100862, "cbid": 211, "correlation": 100862 } }, { "ph": "s", "id": 100862, "pid": 76337, "tid": -914061504, "ts": 1716454222954099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222954162, "dur": 0, "args": { "External id": 100872, "cbid": 317, "correlation": 100872 } }, { "ph": "f", "id": 100872, "pid": 76337, "tid": -914061504, "ts": 1716454222954162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222954163, "dur": 0, "args": { "External id": 100873, "cbid": 203, "correlation": 100873 } }, { "ph": "f", "id": 100873, "pid": 76337, "tid": -914061504, "ts": 1716454222954163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222954163, "dur": 0, "args": { "External id": 100874, "cbid": 205, "correlation": 100874 } }, { "ph": "f", "id": 100874, "pid": 76337, "tid": -914061504, "ts": 1716454222954163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997824, "dur": 23, "args": { "External id": 100878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100878, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100878, "pid": 5, "tid": 7, "ts": 1716454222997824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954182, "dur": 13, "args": { "External id": 100878, "cbid": 211, "correlation": 100878 } }, { "ph": "s", "id": 100878, "pid": 76337, "tid": -914061504, "ts": 1716454222954182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222997848, "dur": 43, "args": { "External id": 100880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100880, "pid": 5, "tid": 7, "ts": 1716454222997848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954197, "dur": 5, "args": { "External id": 100880, "cbid": 211, "correlation": 100880 } }, { "ph": "s", "id": 100880, "pid": 76337, "tid": -914061504, "ts": 1716454222954197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222997893, "dur": 647, "args": { "External id": 100882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100882, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100882, "pid": 5, "tid": 7, "ts": 1716454222997893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954211, "dur": 9, "args": { "External id": 100882, "cbid": 211, "correlation": 100882 } }, { "ph": "s", "id": 100882, "pid": 76337, "tid": -914061504, "ts": 1716454222954211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222998541, "dur": 20, "args": { "External id": 100884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100884, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100884, "pid": 5, "tid": 7, "ts": 1716454222998541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954224, "dur": 5, "args": { "External id": 100884, "cbid": 211, "correlation": 100884 } }, { "ph": "s", "id": 100884, "pid": 76337, "tid": -914061504, "ts": 1716454222954224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222998562, "dur": 33, "args": { "External id": 100890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100890, "pid": 5, "tid": 7, "ts": 1716454222998562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954253, "dur": 8, "args": { "External id": 100890, "cbid": 211, "correlation": 100890 } }, { "ph": "s", "id": 100890, "pid": 76337, "tid": -914061504, "ts": 1716454222954253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222954311, "dur": 0, "args": { "External id": 100900, "cbid": 317, "correlation": 100900 } }, { "ph": "f", "id": 100900, "pid": 76337, "tid": -914061504, "ts": 1716454222954311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222954312, "dur": 0, "args": { "External id": 100901, "cbid": 203, "correlation": 100901 } }, { "ph": "f", "id": 100901, "pid": 76337, "tid": -914061504, "ts": 1716454222954312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222954313, "dur": 0, "args": { "External id": 100902, "cbid": 205, "correlation": 100902 } }, { "ph": "f", "id": 100902, "pid": 76337, "tid": -914061504, "ts": 1716454222954313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954335, "dur": 1, "args": { "External id": 100906, "cbid": 251, "correlation": 100906 } }, { "ph": "f", "id": 100906, "pid": 76337, "tid": -914061504, "ts": 1716454222954335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954337, "dur": 0, "args": { "External id": 100907, "cbid": 251, "correlation": 100907 } }, { "ph": "f", "id": 100907, "pid": 76337, "tid": -914061504, "ts": 1716454222954337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954337, "dur": 0, "args": { "External id": 100908, "cbid": 251, "correlation": 100908 } }, { "ph": "f", "id": 100908, "pid": 76337, "tid": -914061504, "ts": 1716454222954337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954338, "dur": 0, "args": { "External id": 100909, "cbid": 251, "correlation": 100909 } }, { "ph": "f", "id": 100909, "pid": 76337, "tid": -914061504, "ts": 1716454222954338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954339, "dur": 0, "args": { "External id": 100910, "cbid": 251, "correlation": 100910 } }, { "ph": "f", "id": 100910, "pid": 76337, "tid": -914061504, "ts": 1716454222954339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954340, "dur": 0, "args": { "External id": 100911, "cbid": 251, "correlation": 100911 } }, { "ph": "f", "id": 100911, "pid": 76337, "tid": -914061504, "ts": 1716454222954340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954341, "dur": 0, "args": { "External id": 100912, "cbid": 251, "correlation": 100912 } }, { "ph": "f", "id": 100912, "pid": 76337, "tid": -914061504, "ts": 1716454222954341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954342, "dur": 0, "args": { "External id": 100913, "cbid": 251, "correlation": 100913 } }, { "ph": "f", "id": 100913, "pid": 76337, "tid": -914061504, "ts": 1716454222954342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954343, "dur": 0, "args": { "External id": 100914, "cbid": 251, "correlation": 100914 } }, { "ph": "f", "id": 100914, "pid": 76337, "tid": -914061504, "ts": 1716454222954343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454222998596, "dur": 51, "args": { "External id": 100915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100915, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 100915, "pid": 5, "tid": 7, "ts": 1716454222998596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954345, "dur": 12, "args": { "External id": 100915, "cbid": 211, "correlation": 100915 } }, { "ph": "s", "id": 100915, "pid": 76337, "tid": -914061504, "ts": 1716454222954345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222998648, "dur": 31, "args": { "External id": 100921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100921, "pid": 5, "tid": 7, "ts": 1716454222998648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954377, "dur": 8, "args": { "External id": 100921, "cbid": 211, "correlation": 100921 } }, { "ph": "s", "id": 100921, "pid": 76337, "tid": -914061504, "ts": 1716454222954377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222998681, "dur": 27, "args": { "External id": 100929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100929, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100929, "pid": 5, "tid": 7, "ts": 1716454222998681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954407, "dur": 8, "args": { "External id": 100929, "cbid": 211, "correlation": 100929 } }, { "ph": "s", "id": 100929, "pid": 76337, "tid": -914061504, "ts": 1716454222954407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454222998709, "dur": 20, "args": { "External id": 100937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100937, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100937, "pid": 5, "tid": 7, "ts": 1716454222998709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954436, "dur": 9, "args": { "External id": 100937, "cbid": 211, "correlation": 100937 } }, { "ph": "s", "id": 100937, "pid": 76337, "tid": -914061504, "ts": 1716454222954436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222998730, "dur": 29, "args": { "External id": 100957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100957, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 100957, "pid": 5, "tid": 7, "ts": 1716454222998730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954518, "dur": 12, "args": { "External id": 100957, "cbid": 211, "correlation": 100957 } }, { "ph": "s", "id": 100957, "pid": 76337, "tid": -914061504, "ts": 1716454222954518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454222998760, "dur": 5, "args": { "External id": 100969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100969, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 100969, "pid": 5, "tid": 7, "ts": 1716454222998760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954539, "dur": 7, "args": { "External id": 100969, "cbid": 211, "correlation": 100969 } }, { "ph": "s", "id": 100969, "pid": 76337, "tid": -914061504, "ts": 1716454222954539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222998767, "dur": 29, "args": { "External id": 100972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100972, "pid": 5, "tid": 7, "ts": 1716454222998767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954557, "dur": 6, "args": { "External id": 100972, "cbid": 211, "correlation": 100972 } }, { "ph": "s", "id": 100972, "pid": 76337, "tid": -914061504, "ts": 1716454222954557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222954615, "dur": 0, "args": { "External id": 100983, "cbid": 317, "correlation": 100983 } }, { "ph": "f", "id": 100983, "pid": 76337, "tid": -914061504, "ts": 1716454222954615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222954616, "dur": 0, "args": { "External id": 100984, "cbid": 203, "correlation": 100984 } }, { "ph": "f", "id": 100984, "pid": 76337, "tid": -914061504, "ts": 1716454222954616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222954617, "dur": 0, "args": { "External id": 100985, "cbid": 205, "correlation": 100985 } }, { "ph": "f", "id": 100985, "pid": 76337, "tid": -914061504, "ts": 1716454222954617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222998797, "dur": 22, "args": { "External id": 100989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100989, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100989, "pid": 5, "tid": 7, "ts": 1716454222998797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954632, "dur": 12, "args": { "External id": 100989, "cbid": 211, "correlation": 100989 } }, { "ph": "s", "id": 100989, "pid": 76337, "tid": -914061504, "ts": 1716454222954632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454222998820, "dur": 117, "args": { "External id": 100991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100991, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 100991, "pid": 5, "tid": 7, "ts": 1716454222998820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954653, "dur": 9, "args": { "External id": 100991, "cbid": 211, "correlation": 100991 } }, { "ph": "s", "id": 100991, "pid": 76337, "tid": -914061504, "ts": 1716454222954653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454222998939, "dur": 22, "args": { "External id": 100993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100993, "pid": 5, "tid": 7, "ts": 1716454222998939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954665, "dur": 5, "args": { "External id": 100993, "cbid": 211, "correlation": 100993 } }, { "ph": "s", "id": 100993, "pid": 76337, "tid": -914061504, "ts": 1716454222954665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454222998962, "dur": 32, "args": { "External id": 100999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 100999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 100999, "pid": 5, "tid": 7, "ts": 1716454222998962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954693, "dur": 9, "args": { "External id": 100999, "cbid": 211, "correlation": 100999 } }, { "ph": "s", "id": 100999, "pid": 76337, "tid": -914061504, "ts": 1716454222954693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454222998995, "dur": 197, "args": { "External id": 101008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101008, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101008, "pid": 5, "tid": 7, "ts": 1716454222998995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954774, "dur": 14, "args": { "External id": 101008, "cbid": 211, "correlation": 101008 } }, { "ph": "s", "id": 101008, "pid": 76337, "tid": -914061504, "ts": 1716454222954774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454222999193, "dur": 64, "args": { "External id": 101030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101030, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101030, "pid": 5, "tid": 7, "ts": 1716454222999193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954831, "dur": 10, "args": { "External id": 101030, "cbid": 211, "correlation": 101030 } }, { "ph": "s", "id": 101030, "pid": 76337, "tid": -914061504, "ts": 1716454222954831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222954922, "dur": 1, "args": { "External id": 101041, "cbid": 251, "correlation": 101041 } }, { "ph": "f", "id": 101041, "pid": 76337, "tid": -914061504, "ts": 1716454222954922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222999258, "dur": 152, "args": { "External id": 101042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101042, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101042, "pid": 5, "tid": 7, "ts": 1716454222999258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222954927, "dur": 13, "args": { "External id": 101042, "cbid": 211, "correlation": 101042 } }, { "ph": "s", "id": 101042, "pid": 76337, "tid": -914061504, "ts": 1716454222954927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955007, "dur": 1, "args": { "External id": 101053, "cbid": 251, "correlation": 101053 } }, { "ph": "f", "id": 101053, "pid": 76337, "tid": -914061504, "ts": 1716454222955007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222999411, "dur": 144, "args": { "External id": 101054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101054, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101054, "pid": 5, "tid": 7, "ts": 1716454222999411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955012, "dur": 12, "args": { "External id": 101054, "cbid": 211, "correlation": 101054 } }, { "ph": "s", "id": 101054, "pid": 76337, "tid": -914061504, "ts": 1716454222955012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955079, "dur": 1, "args": { "External id": 101065, "cbid": 251, "correlation": 101065 } }, { "ph": "f", "id": 101065, "pid": 76337, "tid": -914061504, "ts": 1716454222955079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454222999557, "dur": 143, "args": { "External id": 101066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101066, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101066, "pid": 5, "tid": 7, "ts": 1716454222999557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955084, "dur": 12, "args": { "External id": 101066, "cbid": 211, "correlation": 101066 } }, { "ph": "s", "id": 101066, "pid": 76337, "tid": -914061504, "ts": 1716454222955084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454222999701, "dur": 1909, "args": { "External id": 101087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101087, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 101087, "pid": 5, "tid": 7, "ts": 1716454222999701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955170, "dur": 14, "args": { "External id": 101087, "cbid": 211, "correlation": 101087 } }, { "ph": "s", "id": 101087, "pid": 76337, "tid": -914061504, "ts": 1716454222955170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955273, "dur": 1, "args": { "External id": 101105, "cbid": 251, "correlation": 101105 } }, { "ph": "f", "id": 101105, "pid": 76337, "tid": -914061504, "ts": 1716454222955273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223001612, "dur": 145, "args": { "External id": 101107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101107, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 101107, "pid": 5, "tid": 7, "ts": 1716454223001612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955279, "dur": 14, "args": { "External id": 101107, "cbid": 211, "correlation": 101107 } }, { "ph": "s", "id": 101107, "pid": 76337, "tid": -914061504, "ts": 1716454222955279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223001758, "dur": 36, "args": { "External id": 101115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101115, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101115, "pid": 5, "tid": 7, "ts": 1716454223001758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955352, "dur": 12, "args": { "External id": 101115, "cbid": 211, "correlation": 101115 } }, { "ph": "s", "id": 101115, "pid": 76337, "tid": -914061504, "ts": 1716454222955352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223001796, "dur": 51, "args": { "External id": 101123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101123, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101123, "pid": 5, "tid": 7, "ts": 1716454223001796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955391, "dur": 9, "args": { "External id": 101123, "cbid": 211, "correlation": 101123 } }, { "ph": "s", "id": 101123, "pid": 76337, "tid": -914061504, "ts": 1716454222955391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223001848, "dur": 30, "args": { "External id": 101134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101134, "pid": 5, "tid": 7, "ts": 1716454223001848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955464, "dur": 13, "args": { "External id": 101134, "cbid": 211, "correlation": 101134 } }, { "ph": "s", "id": 101134, "pid": 76337, "tid": -914061504, "ts": 1716454222955464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223001879, "dur": 34, "args": { "External id": 101156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101156, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101156, "pid": 5, "tid": 7, "ts": 1716454223001879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955496, "dur": 8, "args": { "External id": 101156, "cbid": 211, "correlation": 101156 } }, { "ph": "s", "id": 101156, "pid": 76337, "tid": -914061504, "ts": 1716454222955496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955580, "dur": 1, "args": { "External id": 101167, "cbid": 251, "correlation": 101167 } }, { "ph": "f", "id": 101167, "pid": 76337, "tid": -914061504, "ts": 1716454222955580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223001914, "dur": 86, "args": { "External id": 101168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101168, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101168, "pid": 5, "tid": 7, "ts": 1716454223001914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955585, "dur": 14, "args": { "External id": 101168, "cbid": 211, "correlation": 101168 } }, { "ph": "s", "id": 101168, "pid": 76337, "tid": -914061504, "ts": 1716454222955585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955656, "dur": 1, "args": { "External id": 101179, "cbid": 251, "correlation": 101179 } }, { "ph": "f", "id": 101179, "pid": 76337, "tid": -914061504, "ts": 1716454222955656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955659, "dur": 0, "args": { "External id": 101180, "cbid": 251, "correlation": 101180 } }, { "ph": "f", "id": 101180, "pid": 76337, "tid": -914061504, "ts": 1716454222955659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223002002, "dur": 11, "args": { "External id": 101181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101181, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 101181, "pid": 5, "tid": 7, "ts": 1716454223002002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955661, "dur": 11, "args": { "External id": 101181, "cbid": 211, "correlation": 101181 } }, { "ph": "s", "id": 101181, "pid": 76337, "tid": -914061504, "ts": 1716454222955661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223002014, "dur": 5, "args": { "External id": 101183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101183, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 101183, "pid": 5, "tid": 7, "ts": 1716454223002014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955675, "dur": 7, "args": { "External id": 101183, "cbid": 211, "correlation": 101183 } }, { "ph": "s", "id": 101183, "pid": 76337, "tid": -914061504, "ts": 1716454222955675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955735, "dur": 1, "args": { "External id": 101194, "cbid": 251, "correlation": 101194 } }, { "ph": "f", "id": 101194, "pid": 76337, "tid": -914061504, "ts": 1716454222955735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955738, "dur": 0, "args": { "External id": 101195, "cbid": 251, "correlation": 101195 } }, { "ph": "f", "id": 101195, "pid": 76337, "tid": -914061504, "ts": 1716454222955738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223002020, "dur": 7, "args": { "External id": 101196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101196, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 101196, "pid": 5, "tid": 7, "ts": 1716454223002020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955740, "dur": 12, "args": { "External id": 101196, "cbid": 211, "correlation": 101196 } }, { "ph": "s", "id": 101196, "pid": 76337, "tid": -914061504, "ts": 1716454222955740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223002028, "dur": 3, "args": { "External id": 101198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101198, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 101198, "pid": 5, "tid": 7, "ts": 1716454223002028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955754, "dur": 5, "args": { "External id": 101198, "cbid": 211, "correlation": 101198 } }, { "ph": "s", "id": 101198, "pid": 76337, "tid": -914061504, "ts": 1716454222955754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223002033, "dur": 90, "args": { "External id": 101219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101219, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 101219, "pid": 5, "tid": 7, "ts": 1716454223002033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955827, "dur": 13, "args": { "External id": 101219, "cbid": 211, "correlation": 101219 } }, { "ph": "s", "id": 101219, "pid": 76337, "tid": -914061504, "ts": 1716454222955827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222955925, "dur": 1, "args": { "External id": 101237, "cbid": 251, "correlation": 101237 } }, { "ph": "f", "id": 101237, "pid": 76337, "tid": -914061504, "ts": 1716454222955925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223002124, "dur": 97, "args": { "External id": 101239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101239, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101239, "pid": 5, "tid": 7, "ts": 1716454223002124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222955930, "dur": 13, "args": { "External id": 101239, "cbid": 211, "correlation": 101239 } }, { "ph": "s", "id": 101239, "pid": 76337, "tid": -914061504, "ts": 1716454222955930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223002223, "dur": 19, "args": { "External id": 101247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101247, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101247, "pid": 5, "tid": 7, "ts": 1716454223002223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956009, "dur": 12, "args": { "External id": 101247, "cbid": 211, "correlation": 101247 } }, { "ph": "s", "id": 101247, "pid": 76337, "tid": -914061504, "ts": 1716454222956009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223002243, "dur": 37, "args": { "External id": 101255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101255, "pid": 5, "tid": 7, "ts": 1716454223002243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956053, "dur": 9, "args": { "External id": 101255, "cbid": 211, "correlation": 101255 } }, { "ph": "s", "id": 101255, "pid": 76337, "tid": -914061504, "ts": 1716454222956053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223002281, "dur": 34, "args": { "External id": 101277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101277, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101277, "pid": 5, "tid": 7, "ts": 1716454223002281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956105, "dur": 10, "args": { "External id": 101277, "cbid": 211, "correlation": 101277 } }, { "ph": "s", "id": 101277, "pid": 76337, "tid": -914061504, "ts": 1716454222956105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222956194, "dur": 1, "args": { "External id": 101293, "cbid": 251, "correlation": 101293 } }, { "ph": "f", "id": 101293, "pid": 76337, "tid": -914061504, "ts": 1716454222956194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222956199, "dur": 0, "args": { "External id": 101295, "cbid": 251, "correlation": 101295 } }, { "ph": "f", "id": 101295, "pid": 76337, "tid": -914061504, "ts": 1716454222956199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223002316, "dur": 535, "args": { "External id": 101296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101296, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 101296, "pid": 5, "tid": 7, "ts": 1716454223002316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956203, "dur": 13, "args": { "External id": 101296, "cbid": 211, "correlation": 101296 } }, { "ph": "s", "id": 101296, "pid": 76337, "tid": -914061504, "ts": 1716454222956203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223002852, "dur": 122, "args": { "External id": 101304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101304, "pid": 5, "tid": 7, "ts": 1716454223002852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956267, "dur": 13, "args": { "External id": 101304, "cbid": 211, "correlation": 101304 } }, { "ph": "s", "id": 101304, "pid": 76337, "tid": -914061504, "ts": 1716454222956267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223002976, "dur": 128, "args": { "External id": 101312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101312, "pid": 5, "tid": 7, "ts": 1716454223002976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956299, "dur": 8, "args": { "External id": 101312, "cbid": 211, "correlation": 101312 } }, { "ph": "s", "id": 101312, "pid": 76337, "tid": -914061504, "ts": 1716454222956299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222956376, "dur": 1, "args": { "External id": 101328, "cbid": 251, "correlation": 101328 } }, { "ph": "f", "id": 101328, "pid": 76337, "tid": -914061504, "ts": 1716454222956376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223003105, "dur": 300, "args": { "External id": 101330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101330, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101330, "pid": 5, "tid": 7, "ts": 1716454223003105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956382, "dur": 13, "args": { "External id": 101330, "cbid": 211, "correlation": 101330 } }, { "ph": "s", "id": 101330, "pid": 76337, "tid": -914061504, "ts": 1716454222956382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223003406, "dur": 28, "args": { "External id": 101338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101338, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101338, "pid": 5, "tid": 7, "ts": 1716454223003406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956424, "dur": 10, "args": { "External id": 101338, "cbid": 211, "correlation": 101338 } }, { "ph": "s", "id": 101338, "pid": 76337, "tid": -914061504, "ts": 1716454222956424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223003435, "dur": 81, "args": { "External id": 101349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101349, "pid": 5, "tid": 7, "ts": 1716454223003435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956493, "dur": 13, "args": { "External id": 101349, "cbid": 211, "correlation": 101349 } }, { "ph": "s", "id": 101349, "pid": 76337, "tid": -914061504, "ts": 1716454222956493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222956558, "dur": 0, "args": { "External id": 101361, "cbid": 317, "correlation": 101361 } }, { "ph": "f", "id": 101361, "pid": 76337, "tid": -914061504, "ts": 1716454222956558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222956559, "dur": 0, "args": { "External id": 101362, "cbid": 203, "correlation": 101362 } }, { "ph": "f", "id": 101362, "pid": 76337, "tid": -914061504, "ts": 1716454222956559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222956559, "dur": 0, "args": { "External id": 101363, "cbid": 205, "correlation": 101363 } }, { "ph": "f", "id": 101363, "pid": 76337, "tid": -914061504, "ts": 1716454222956559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223003517, "dur": 22, "args": { "External id": 101367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101367, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101367, "pid": 5, "tid": 7, "ts": 1716454223003517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956574, "dur": 12, "args": { "External id": 101367, "cbid": 211, "correlation": 101367 } }, { "ph": "s", "id": 101367, "pid": 76337, "tid": -914061504, "ts": 1716454222956574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223003541, "dur": 118, "args": { "External id": 101369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101369, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101369, "pid": 5, "tid": 7, "ts": 1716454223003541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956593, "dur": 6, "args": { "External id": 101369, "cbid": 211, "correlation": 101369 } }, { "ph": "s", "id": 101369, "pid": 76337, "tid": -914061504, "ts": 1716454222956593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223003660, "dur": 22, "args": { "External id": 101371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101371, "pid": 5, "tid": 7, "ts": 1716454223003660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956603, "dur": 5, "args": { "External id": 101371, "cbid": 211, "correlation": 101371 } }, { "ph": "s", "id": 101371, "pid": 76337, "tid": -914061504, "ts": 1716454222956603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223003684, "dur": 33, "args": { "External id": 101377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101377, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101377, "pid": 5, "tid": 7, "ts": 1716454223003684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956630, "dur": 9, "args": { "External id": 101377, "cbid": 211, "correlation": 101377 } }, { "ph": "s", "id": 101377, "pid": 76337, "tid": -914061504, "ts": 1716454222956630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223003718, "dur": 26, "args": { "External id": 101385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101385, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101385, "pid": 5, "tid": 7, "ts": 1716454223003718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956662, "dur": 8, "args": { "External id": 101385, "cbid": 211, "correlation": 101385 } }, { "ph": "s", "id": 101385, "pid": 76337, "tid": -914061504, "ts": 1716454222956662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223003745, "dur": 30, "args": { "External id": 101405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101405, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 101405, "pid": 5, "tid": 7, "ts": 1716454223003745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956734, "dur": 12, "args": { "External id": 101405, "cbid": 211, "correlation": 101405 } }, { "ph": "s", "id": 101405, "pid": 76337, "tid": -914061504, "ts": 1716454222956734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223003777, "dur": 5, "args": { "External id": 101417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101417, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 101417, "pid": 5, "tid": 7, "ts": 1716454223003777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956756, "dur": 6, "args": { "External id": 101417, "cbid": 211, "correlation": 101417 } }, { "ph": "s", "id": 101417, "pid": 76337, "tid": -914061504, "ts": 1716454222956756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223003783, "dur": 31, "args": { "External id": 101420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101420, "pid": 5, "tid": 7, "ts": 1716454223003783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956774, "dur": 7, "args": { "External id": 101420, "cbid": 211, "correlation": 101420 } }, { "ph": "s", "id": 101420, "pid": 76337, "tid": -914061504, "ts": 1716454222956774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223003815, "dur": 21, "args": { "External id": 101429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101429, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101429, "pid": 5, "tid": 7, "ts": 1716454223003815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956813, "dur": 10, "args": { "External id": 101429, "cbid": 211, "correlation": 101429 } }, { "ph": "s", "id": 101429, "pid": 76337, "tid": -914061504, "ts": 1716454222956813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222956864, "dur": 0, "args": { "External id": 101439, "cbid": 317, "correlation": 101439 } }, { "ph": "f", "id": 101439, "pid": 76337, "tid": -914061504, "ts": 1716454222956864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222956865, "dur": 0, "args": { "External id": 101440, "cbid": 203, "correlation": 101440 } }, { "ph": "f", "id": 101440, "pid": 76337, "tid": -914061504, "ts": 1716454222956865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222956866, "dur": 0, "args": { "External id": 101441, "cbid": 205, "correlation": 101441 } }, { "ph": "f", "id": 101441, "pid": 76337, "tid": -914061504, "ts": 1716454222956866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223003838, "dur": 21, "args": { "External id": 101445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101445, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101445, "pid": 5, "tid": 7, "ts": 1716454223003838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956879, "dur": 11, "args": { "External id": 101445, "cbid": 211, "correlation": 101445 } }, { "ph": "s", "id": 101445, "pid": 76337, "tid": -914061504, "ts": 1716454222956879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223003860, "dur": 43, "args": { "External id": 101447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101447, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101447, "pid": 5, "tid": 7, "ts": 1716454223003860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956893, "dur": 6, "args": { "External id": 101447, "cbid": 211, "correlation": 101447 } }, { "ph": "s", "id": 101447, "pid": 76337, "tid": -914061504, "ts": 1716454222956893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223003904, "dur": 647, "args": { "External id": 101449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101449, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101449, "pid": 5, "tid": 7, "ts": 1716454223003904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956904, "dur": 6, "args": { "External id": 101449, "cbid": 211, "correlation": 101449 } }, { "ph": "s", "id": 101449, "pid": 76337, "tid": -914061504, "ts": 1716454222956904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223004553, "dur": 22, "args": { "External id": 101451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101451, "pid": 5, "tid": 7, "ts": 1716454223004553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956914, "dur": 5, "args": { "External id": 101451, "cbid": 211, "correlation": 101451 } }, { "ph": "s", "id": 101451, "pid": 76337, "tid": -914061504, "ts": 1716454222956914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223004577, "dur": 33, "args": { "External id": 101457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101457, "pid": 5, "tid": 7, "ts": 1716454223004577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956941, "dur": 9, "args": { "External id": 101457, "cbid": 211, "correlation": 101457 } }, { "ph": "s", "id": 101457, "pid": 76337, "tid": -914061504, "ts": 1716454222956941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223004611, "dur": 3, "args": { "External id": 101465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101465, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 101465, "pid": 5, "tid": 7, "ts": 1716454223004611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222956993, "dur": 11, "args": { "External id": 101465, "cbid": 211, "correlation": 101465 } }, { "ph": "s", "id": 101465, "pid": 76337, "tid": -914061504, "ts": 1716454222956993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222957059, "dur": 1, "args": { "External id": 101481, "cbid": 251, "correlation": 101481 } }, { "ph": "f", "id": 101481, "pid": 76337, "tid": -914061504, "ts": 1716454222957059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222957064, "dur": 0, "args": { "External id": 101483, "cbid": 251, "correlation": 101483 } }, { "ph": "f", "id": 101483, "pid": 76337, "tid": -914061504, "ts": 1716454222957064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223004615, "dur": 12, "args": { "External id": 101484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101484, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 101484, "pid": 5, "tid": 7, "ts": 1716454223004615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957066, "dur": 11, "args": { "External id": 101484, "cbid": 211, "correlation": 101484 } }, { "ph": "s", "id": 101484, "pid": 76337, "tid": -914061504, "ts": 1716454222957066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223004629, "dur": 5, "args": { "External id": 101486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 101486, "pid": 5, "tid": 7, "ts": 1716454223004629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957079, "dur": 5, "args": { "External id": 101486, "cbid": 211, "correlation": 101486 } }, { "ph": "s", "id": 101486, "pid": 76337, "tid": -914061504, "ts": 1716454222957079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223004635, "dur": 29, "args": { "External id": 101496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101496, "pid": 5, "tid": 7, "ts": 1716454223004635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957137, "dur": 13, "args": { "External id": 101496, "cbid": 211, "correlation": 101496 } }, { "ph": "s", "id": 101496, "pid": 76337, "tid": -914061504, "ts": 1716454222957137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223004665, "dur": 30, "args": { "External id": 101516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101516, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 101516, "pid": 5, "tid": 7, "ts": 1716454223004665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957204, "dur": 11, "args": { "External id": 101516, "cbid": 211, "correlation": 101516 } }, { "ph": "s", "id": 101516, "pid": 76337, "tid": -914061504, "ts": 1716454222957204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223004696, "dur": 5, "args": { "External id": 101528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101528, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 101528, "pid": 5, "tid": 7, "ts": 1716454223004696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957225, "dur": 6, "args": { "External id": 101528, "cbid": 211, "correlation": 101528 } }, { "ph": "s", "id": 101528, "pid": 76337, "tid": -914061504, "ts": 1716454222957225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223004702, "dur": 29, "args": { "External id": 101531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101531, "pid": 5, "tid": 7, "ts": 1716454223004702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957245, "dur": 6, "args": { "External id": 101531, "cbid": 211, "correlation": 101531 } }, { "ph": "s", "id": 101531, "pid": 76337, "tid": -914061504, "ts": 1716454222957245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223004733, "dur": 20, "args": { "External id": 101540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101540, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101540, "pid": 5, "tid": 7, "ts": 1716454223004733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957286, "dur": 10, "args": { "External id": 101540, "cbid": 211, "correlation": 101540 } }, { "ph": "s", "id": 101540, "pid": 76337, "tid": -914061504, "ts": 1716454222957286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222957349, "dur": 0, "args": { "External id": 101550, "cbid": 317, "correlation": 101550 } }, { "ph": "f", "id": 101550, "pid": 76337, "tid": -914061504, "ts": 1716454222957349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222957350, "dur": 0, "args": { "External id": 101551, "cbid": 203, "correlation": 101551 } }, { "ph": "f", "id": 101551, "pid": 76337, "tid": -914061504, "ts": 1716454222957350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222957351, "dur": 0, "args": { "External id": 101552, "cbid": 205, "correlation": 101552 } }, { "ph": "f", "id": 101552, "pid": 76337, "tid": -914061504, "ts": 1716454222957351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223004754, "dur": 23, "args": { "External id": 101556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101556, "pid": 5, "tid": 7, "ts": 1716454223004754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957364, "dur": 12, "args": { "External id": 101556, "cbid": 211, "correlation": 101556 } }, { "ph": "s", "id": 101556, "pid": 76337, "tid": -914061504, "ts": 1716454222957364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223004779, "dur": 43, "args": { "External id": 101558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101558, "pid": 5, "tid": 7, "ts": 1716454223004779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957378, "dur": 5, "args": { "External id": 101558, "cbid": 211, "correlation": 101558 } }, { "ph": "s", "id": 101558, "pid": 76337, "tid": -914061504, "ts": 1716454222957378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223004823, "dur": 640, "args": { "External id": 101560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101560, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101560, "pid": 5, "tid": 7, "ts": 1716454223004823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957390, "dur": 7, "args": { "External id": 101560, "cbid": 211, "correlation": 101560 } }, { "ph": "s", "id": 101560, "pid": 76337, "tid": -914061504, "ts": 1716454222957390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223005464, "dur": 22, "args": { "External id": 101562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101562, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101562, "pid": 5, "tid": 7, "ts": 1716454223005464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957400, "dur": 5, "args": { "External id": 101562, "cbid": 211, "correlation": 101562 } }, { "ph": "s", "id": 101562, "pid": 76337, "tid": -914061504, "ts": 1716454222957400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223005487, "dur": 32, "args": { "External id": 101568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101568, "pid": 5, "tid": 7, "ts": 1716454223005487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957427, "dur": 8, "args": { "External id": 101568, "cbid": 211, "correlation": 101568 } }, { "ph": "s", "id": 101568, "pid": 76337, "tid": -914061504, "ts": 1716454222957427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223005521, "dur": 26, "args": { "External id": 101576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101576, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101576, "pid": 5, "tid": 7, "ts": 1716454223005521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957459, "dur": 8, "args": { "External id": 101576, "cbid": 211, "correlation": 101576 } }, { "ph": "s", "id": 101576, "pid": 76337, "tid": -914061504, "ts": 1716454222957459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223005548, "dur": 20, "args": { "External id": 101584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101584, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101584, "pid": 5, "tid": 7, "ts": 1716454223005548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957488, "dur": 9, "args": { "External id": 101584, "cbid": 211, "correlation": 101584 } }, { "ph": "s", "id": 101584, "pid": 76337, "tid": -914061504, "ts": 1716454222957488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223005569, "dur": 30, "args": { "External id": 101604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101604, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 101604, "pid": 5, "tid": 7, "ts": 1716454223005569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957567, "dur": 12, "args": { "External id": 101604, "cbid": 211, "correlation": 101604 } }, { "ph": "s", "id": 101604, "pid": 76337, "tid": -914061504, "ts": 1716454222957567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223005600, "dur": 5, "args": { "External id": 101616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101616, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 101616, "pid": 5, "tid": 7, "ts": 1716454223005600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957590, "dur": 7, "args": { "External id": 101616, "cbid": 211, "correlation": 101616 } }, { "ph": "s", "id": 101616, "pid": 76337, "tid": -914061504, "ts": 1716454222957590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223005606, "dur": 29, "args": { "External id": 101619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101619, "pid": 5, "tid": 7, "ts": 1716454223005606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957607, "dur": 6, "args": { "External id": 101619, "cbid": 211, "correlation": 101619 } }, { "ph": "s", "id": 101619, "pid": 76337, "tid": -914061504, "ts": 1716454222957607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222957664, "dur": 0, "args": { "External id": 101630, "cbid": 317, "correlation": 101630 } }, { "ph": "f", "id": 101630, "pid": 76337, "tid": -914061504, "ts": 1716454222957664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222957665, "dur": 0, "args": { "External id": 101631, "cbid": 203, "correlation": 101631 } }, { "ph": "f", "id": 101631, "pid": 76337, "tid": -914061504, "ts": 1716454222957665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222957666, "dur": 0, "args": { "External id": 101632, "cbid": 205, "correlation": 101632 } }, { "ph": "f", "id": 101632, "pid": 76337, "tid": -914061504, "ts": 1716454222957666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223005637, "dur": 22, "args": { "External id": 101636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101636, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101636, "pid": 5, "tid": 7, "ts": 1716454223005637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957679, "dur": 12, "args": { "External id": 101636, "cbid": 211, "correlation": 101636 } }, { "ph": "s", "id": 101636, "pid": 76337, "tid": -914061504, "ts": 1716454222957679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223005660, "dur": 115, "args": { "External id": 101638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101638, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101638, "pid": 5, "tid": 7, "ts": 1716454223005660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957697, "dur": 6, "args": { "External id": 101638, "cbid": 211, "correlation": 101638 } }, { "ph": "s", "id": 101638, "pid": 76337, "tid": -914061504, "ts": 1716454222957697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223005776, "dur": 22, "args": { "External id": 101640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101640, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101640, "pid": 5, "tid": 7, "ts": 1716454223005776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957707, "dur": 5, "args": { "External id": 101640, "cbid": 211, "correlation": 101640 } }, { "ph": "s", "id": 101640, "pid": 76337, "tid": -914061504, "ts": 1716454222957707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223005799, "dur": 32, "args": { "External id": 101646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101646, "pid": 5, "tid": 7, "ts": 1716454223005799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957733, "dur": 9, "args": { "External id": 101646, "cbid": 211, "correlation": 101646 } }, { "ph": "s", "id": 101646, "pid": 76337, "tid": -914061504, "ts": 1716454222957733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223005833, "dur": 178, "args": { "External id": 101655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101655, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101655, "pid": 5, "tid": 7, "ts": 1716454223005833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957816, "dur": 14, "args": { "External id": 101655, "cbid": 211, "correlation": 101655 } }, { "ph": "s", "id": 101655, "pid": 76337, "tid": -914061504, "ts": 1716454222957816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223006012, "dur": 64, "args": { "External id": 101677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101677, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101677, "pid": 5, "tid": 7, "ts": 1716454223006012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957873, "dur": 10, "args": { "External id": 101677, "cbid": 211, "correlation": 101677 } }, { "ph": "s", "id": 101677, "pid": 76337, "tid": -914061504, "ts": 1716454222957873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222957961, "dur": 1, "args": { "External id": 101688, "cbid": 251, "correlation": 101688 } }, { "ph": "f", "id": 101688, "pid": 76337, "tid": -914061504, "ts": 1716454222957961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223006077, "dur": 149, "args": { "External id": 101689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101689, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101689, "pid": 5, "tid": 7, "ts": 1716454223006077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222957966, "dur": 21, "args": { "External id": 101689, "cbid": 211, "correlation": 101689 } }, { "ph": "s", "id": 101689, "pid": 76337, "tid": -914061504, "ts": 1716454222957966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958049, "dur": 1, "args": { "External id": 101700, "cbid": 251, "correlation": 101700 } }, { "ph": "f", "id": 101700, "pid": 76337, "tid": -914061504, "ts": 1716454222958049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223006227, "dur": 146, "args": { "External id": 101701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101701, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101701, "pid": 5, "tid": 7, "ts": 1716454223006227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958054, "dur": 12, "args": { "External id": 101701, "cbid": 211, "correlation": 101701 } }, { "ph": "s", "id": 101701, "pid": 76337, "tid": -914061504, "ts": 1716454222958054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958120, "dur": 1, "args": { "External id": 101712, "cbid": 251, "correlation": 101712 } }, { "ph": "f", "id": 101712, "pid": 76337, "tid": -914061504, "ts": 1716454222958120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223006374, "dur": 144, "args": { "External id": 101713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101713, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101713, "pid": 5, "tid": 7, "ts": 1716454223006374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958124, "dur": 11, "args": { "External id": 101713, "cbid": 211, "correlation": 101713 } }, { "ph": "s", "id": 101713, "pid": 76337, "tid": -914061504, "ts": 1716454222958124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223006518, "dur": 1908, "args": { "External id": 101734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101734, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 101734, "pid": 5, "tid": 7, "ts": 1716454223006518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958205, "dur": 13, "args": { "External id": 101734, "cbid": 211, "correlation": 101734 } }, { "ph": "s", "id": 101734, "pid": 76337, "tid": -914061504, "ts": 1716454222958205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958305, "dur": 1, "args": { "External id": 101752, "cbid": 251, "correlation": 101752 } }, { "ph": "f", "id": 101752, "pid": 76337, "tid": -914061504, "ts": 1716454222958305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223008428, "dur": 144, "args": { "External id": 101754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101754, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 101754, "pid": 5, "tid": 7, "ts": 1716454223008428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958311, "dur": 13, "args": { "External id": 101754, "cbid": 211, "correlation": 101754 } }, { "ph": "s", "id": 101754, "pid": 76337, "tid": -914061504, "ts": 1716454222958311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223008574, "dur": 35, "args": { "External id": 101762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101762, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101762, "pid": 5, "tid": 7, "ts": 1716454223008574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958382, "dur": 13, "args": { "External id": 101762, "cbid": 211, "correlation": 101762 } }, { "ph": "s", "id": 101762, "pid": 76337, "tid": -914061504, "ts": 1716454222958382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223008610, "dur": 50, "args": { "External id": 101770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101770, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101770, "pid": 5, "tid": 7, "ts": 1716454223008610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958422, "dur": 8, "args": { "External id": 101770, "cbid": 211, "correlation": 101770 } }, { "ph": "s", "id": 101770, "pid": 76337, "tid": -914061504, "ts": 1716454222958422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223008662, "dur": 30, "args": { "External id": 101781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101781, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101781, "pid": 5, "tid": 7, "ts": 1716454223008662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958495, "dur": 12, "args": { "External id": 101781, "cbid": 211, "correlation": 101781 } }, { "ph": "s", "id": 101781, "pid": 76337, "tid": -914061504, "ts": 1716454222958495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223008693, "dur": 34, "args": { "External id": 101803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101803, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101803, "pid": 5, "tid": 7, "ts": 1716454223008693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958526, "dur": 8, "args": { "External id": 101803, "cbid": 211, "correlation": 101803 } }, { "ph": "s", "id": 101803, "pid": 76337, "tid": -914061504, "ts": 1716454222958526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958610, "dur": 1, "args": { "External id": 101814, "cbid": 251, "correlation": 101814 } }, { "ph": "f", "id": 101814, "pid": 76337, "tid": -914061504, "ts": 1716454222958610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223008729, "dur": 89, "args": { "External id": 101815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101815, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101815, "pid": 5, "tid": 7, "ts": 1716454223008729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958616, "dur": 13, "args": { "External id": 101815, "cbid": 211, "correlation": 101815 } }, { "ph": "s", "id": 101815, "pid": 76337, "tid": -914061504, "ts": 1716454222958616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958684, "dur": 1, "args": { "External id": 101826, "cbid": 251, "correlation": 101826 } }, { "ph": "f", "id": 101826, "pid": 76337, "tid": -914061504, "ts": 1716454222958684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958688, "dur": 0, "args": { "External id": 101827, "cbid": 251, "correlation": 101827 } }, { "ph": "f", "id": 101827, "pid": 76337, "tid": -914061504, "ts": 1716454222958688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223008819, "dur": 11, "args": { "External id": 101828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101828, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 101828, "pid": 5, "tid": 7, "ts": 1716454223008819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958690, "dur": 12, "args": { "External id": 101828, "cbid": 211, "correlation": 101828 } }, { "ph": "s", "id": 101828, "pid": 76337, "tid": -914061504, "ts": 1716454222958690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223008832, "dur": 5, "args": { "External id": 101830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101830, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 101830, "pid": 5, "tid": 7, "ts": 1716454223008832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958704, "dur": 6, "args": { "External id": 101830, "cbid": 211, "correlation": 101830 } }, { "ph": "s", "id": 101830, "pid": 76337, "tid": -914061504, "ts": 1716454222958704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958762, "dur": 1, "args": { "External id": 101841, "cbid": 251, "correlation": 101841 } }, { "ph": "f", "id": 101841, "pid": 76337, "tid": -914061504, "ts": 1716454222958762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958765, "dur": 0, "args": { "External id": 101842, "cbid": 251, "correlation": 101842 } }, { "ph": "f", "id": 101842, "pid": 76337, "tid": -914061504, "ts": 1716454222958765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223008839, "dur": 7, "args": { "External id": 101843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101843, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 101843, "pid": 5, "tid": 7, "ts": 1716454223008839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958767, "dur": 11, "args": { "External id": 101843, "cbid": 211, "correlation": 101843 } }, { "ph": "s", "id": 101843, "pid": 76337, "tid": -914061504, "ts": 1716454222958767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223008847, "dur": 4, "args": { "External id": 101845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101845, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 101845, "pid": 5, "tid": 7, "ts": 1716454223008847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958780, "dur": 6, "args": { "External id": 101845, "cbid": 211, "correlation": 101845 } }, { "ph": "s", "id": 101845, "pid": 76337, "tid": -914061504, "ts": 1716454222958780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223008852, "dur": 90, "args": { "External id": 101866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101866, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 101866, "pid": 5, "tid": 7, "ts": 1716454223008852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958854, "dur": 12, "args": { "External id": 101866, "cbid": 211, "correlation": 101866 } }, { "ph": "s", "id": 101866, "pid": 76337, "tid": -914061504, "ts": 1716454222958854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222958953, "dur": 2, "args": { "External id": 101884, "cbid": 251, "correlation": 101884 } }, { "ph": "f", "id": 101884, "pid": 76337, "tid": -914061504, "ts": 1716454222958953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223008944, "dur": 96, "args": { "External id": 101886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101886, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101886, "pid": 5, "tid": 7, "ts": 1716454223008944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222958960, "dur": 21, "args": { "External id": 101886, "cbid": 211, "correlation": 101886 } }, { "ph": "s", "id": 101886, "pid": 76337, "tid": -914061504, "ts": 1716454222958960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223009041, "dur": 19, "args": { "External id": 101894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101894, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101894, "pid": 5, "tid": 7, "ts": 1716454223009041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959038, "dur": 13, "args": { "External id": 101894, "cbid": 211, "correlation": 101894 } }, { "ph": "s", "id": 101894, "pid": 76337, "tid": -914061504, "ts": 1716454222959038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223009061, "dur": 36, "args": { "External id": 101902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101902, "pid": 5, "tid": 7, "ts": 1716454223009061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959081, "dur": 10, "args": { "External id": 101902, "cbid": 211, "correlation": 101902 } }, { "ph": "s", "id": 101902, "pid": 76337, "tid": -914061504, "ts": 1716454222959081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223009099, "dur": 33, "args": { "External id": 101924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101924, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101924, "pid": 5, "tid": 7, "ts": 1716454223009099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959132, "dur": 11, "args": { "External id": 101924, "cbid": 211, "correlation": 101924 } }, { "ph": "s", "id": 101924, "pid": 76337, "tid": -914061504, "ts": 1716454222959132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222959222, "dur": 1, "args": { "External id": 101940, "cbid": 251, "correlation": 101940 } }, { "ph": "f", "id": 101940, "pid": 76337, "tid": -914061504, "ts": 1716454222959222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222959227, "dur": 0, "args": { "External id": 101942, "cbid": 251, "correlation": 101942 } }, { "ph": "f", "id": 101942, "pid": 76337, "tid": -914061504, "ts": 1716454222959227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223009133, "dur": 531, "args": { "External id": 101943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101943, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 101943, "pid": 5, "tid": 7, "ts": 1716454223009133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959230, "dur": 13, "args": { "External id": 101943, "cbid": 211, "correlation": 101943 } }, { "ph": "s", "id": 101943, "pid": 76337, "tid": -914061504, "ts": 1716454222959230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223009665, "dur": 123, "args": { "External id": 101951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101951, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101951, "pid": 5, "tid": 7, "ts": 1716454223009665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959296, "dur": 12, "args": { "External id": 101951, "cbid": 211, "correlation": 101951 } }, { "ph": "s", "id": 101951, "pid": 76337, "tid": -914061504, "ts": 1716454222959296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223009790, "dur": 129, "args": { "External id": 101959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101959, "pid": 5, "tid": 7, "ts": 1716454223009790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959326, "dur": 8, "args": { "External id": 101959, "cbid": 211, "correlation": 101959 } }, { "ph": "s", "id": 101959, "pid": 76337, "tid": -914061504, "ts": 1716454222959326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222959403, "dur": 1, "args": { "External id": 101975, "cbid": 251, "correlation": 101975 } }, { "ph": "f", "id": 101975, "pid": 76337, "tid": -914061504, "ts": 1716454222959403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223009919, "dur": 301, "args": { "External id": 101977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101977, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 101977, "pid": 5, "tid": 7, "ts": 1716454223009919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959408, "dur": 12, "args": { "External id": 101977, "cbid": 211, "correlation": 101977 } }, { "ph": "s", "id": 101977, "pid": 76337, "tid": -914061504, "ts": 1716454222959408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223010222, "dur": 27, "args": { "External id": 101985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101985, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101985, "pid": 5, "tid": 7, "ts": 1716454223010222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959451, "dur": 10, "args": { "External id": 101985, "cbid": 211, "correlation": 101985 } }, { "ph": "s", "id": 101985, "pid": 76337, "tid": -914061504, "ts": 1716454222959451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223010250, "dur": 80, "args": { "External id": 101996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 101996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 101996, "pid": 5, "tid": 7, "ts": 1716454223010250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959518, "dur": 12, "args": { "External id": 101996, "cbid": 211, "correlation": 101996 } }, { "ph": "s", "id": 101996, "pid": 76337, "tid": -914061504, "ts": 1716454222959518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222959581, "dur": 0, "args": { "External id": 102008, "cbid": 317, "correlation": 102008 } }, { "ph": "f", "id": 102008, "pid": 76337, "tid": -914061504, "ts": 1716454222959581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222959582, "dur": 0, "args": { "External id": 102009, "cbid": 203, "correlation": 102009 } }, { "ph": "f", "id": 102009, "pid": 76337, "tid": -914061504, "ts": 1716454222959582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222959583, "dur": 0, "args": { "External id": 102010, "cbid": 205, "correlation": 102010 } }, { "ph": "f", "id": 102010, "pid": 76337, "tid": -914061504, "ts": 1716454222959583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010331, "dur": 23, "args": { "External id": 102014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102014, "pid": 5, "tid": 7, "ts": 1716454223010331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959599, "dur": 12, "args": { "External id": 102014, "cbid": 211, "correlation": 102014 } }, { "ph": "s", "id": 102014, "pid": 76337, "tid": -914061504, "ts": 1716454222959599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223010355, "dur": 119, "args": { "External id": 102016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102016, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102016, "pid": 5, "tid": 7, "ts": 1716454223010355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959617, "dur": 6, "args": { "External id": 102016, "cbid": 211, "correlation": 102016 } }, { "ph": "s", "id": 102016, "pid": 76337, "tid": -914061504, "ts": 1716454222959617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010475, "dur": 23, "args": { "External id": 102018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102018, "pid": 5, "tid": 7, "ts": 1716454223010475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959627, "dur": 5, "args": { "External id": 102018, "cbid": 211, "correlation": 102018 } }, { "ph": "s", "id": 102018, "pid": 76337, "tid": -914061504, "ts": 1716454222959627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223010500, "dur": 32, "args": { "External id": 102024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102024, "pid": 5, "tid": 7, "ts": 1716454223010500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959655, "dur": 8, "args": { "External id": 102024, "cbid": 211, "correlation": 102024 } }, { "ph": "s", "id": 102024, "pid": 76337, "tid": -914061504, "ts": 1716454222959655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223010533, "dur": 27, "args": { "External id": 102032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102032, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102032, "pid": 5, "tid": 7, "ts": 1716454223010533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959687, "dur": 9, "args": { "External id": 102032, "cbid": 211, "correlation": 102032 } }, { "ph": "s", "id": 102032, "pid": 76337, "tid": -914061504, "ts": 1716454222959687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222959758, "dur": 0, "args": { "External id": 102042, "cbid": 317, "correlation": 102042 } }, { "ph": "f", "id": 102042, "pid": 76337, "tid": -914061504, "ts": 1716454222959758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222959759, "dur": 0, "args": { "External id": 102043, "cbid": 203, "correlation": 102043 } }, { "ph": "f", "id": 102043, "pid": 76337, "tid": -914061504, "ts": 1716454222959759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222959760, "dur": 0, "args": { "External id": 102044, "cbid": 205, "correlation": 102044 } }, { "ph": "f", "id": 102044, "pid": 76337, "tid": -914061504, "ts": 1716454222959760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010561, "dur": 23, "args": { "External id": 102048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102048, "pid": 5, "tid": 7, "ts": 1716454223010561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959774, "dur": 12, "args": { "External id": 102048, "cbid": 211, "correlation": 102048 } }, { "ph": "s", "id": 102048, "pid": 76337, "tid": -914061504, "ts": 1716454222959774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010586, "dur": 44, "args": { "External id": 102050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102050, "pid": 5, "tid": 7, "ts": 1716454223010586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959789, "dur": 6, "args": { "External id": 102050, "cbid": 211, "correlation": 102050 } }, { "ph": "s", "id": 102050, "pid": 76337, "tid": -914061504, "ts": 1716454222959789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223010631, "dur": 232, "args": { "External id": 102052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102052, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 102052, "pid": 5, "tid": 7, "ts": 1716454223010631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959801, "dur": 7, "args": { "External id": 102052, "cbid": 211, "correlation": 102052 } }, { "ph": "s", "id": 102052, "pid": 76337, "tid": -914061504, "ts": 1716454222959801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010864, "dur": 6, "args": { "External id": 102054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102054, "pid": 5, "tid": 7, "ts": 1716454223010864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959812, "dur": 5, "args": { "External id": 102054, "cbid": 211, "correlation": 102054 } }, { "ph": "s", "id": 102054, "pid": 76337, "tid": -914061504, "ts": 1716454222959812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223010872, "dur": 9, "args": { "External id": 102060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102060, "pid": 5, "tid": 7, "ts": 1716454223010872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959839, "dur": 9, "args": { "External id": 102060, "cbid": 211, "correlation": 102060 } }, { "ph": "s", "id": 102060, "pid": 76337, "tid": -914061504, "ts": 1716454222959839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223010882, "dur": 12, "args": { "External id": 102080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102080, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102080, "pid": 5, "tid": 7, "ts": 1716454223010882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959931, "dur": 13, "args": { "External id": 102080, "cbid": 211, "correlation": 102080 } }, { "ph": "s", "id": 102080, "pid": 76337, "tid": -914061504, "ts": 1716454222959931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223010895, "dur": 5, "args": { "External id": 102092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102092, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 102092, "pid": 5, "tid": 7, "ts": 1716454223010895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959954, "dur": 6, "args": { "External id": 102092, "cbid": 211, "correlation": 102092 } }, { "ph": "s", "id": 102092, "pid": 76337, "tid": -914061504, "ts": 1716454222959954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223010901, "dur": 12, "args": { "External id": 102095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102095, "pid": 5, "tid": 7, "ts": 1716454223010901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222959972, "dur": 16, "args": { "External id": 102095, "cbid": 211, "correlation": 102095 } }, { "ph": "s", "id": 102095, "pid": 76337, "tid": -914061504, "ts": 1716454222959972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223010914, "dur": 7, "args": { "External id": 102104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102104, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102104, "pid": 5, "tid": 7, "ts": 1716454223010914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960022, "dur": 10, "args": { "External id": 102104, "cbid": 211, "correlation": 102104 } }, { "ph": "s", "id": 102104, "pid": 76337, "tid": -914061504, "ts": 1716454222960022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222960075, "dur": 0, "args": { "External id": 102114, "cbid": 317, "correlation": 102114 } }, { "ph": "f", "id": 102114, "pid": 76337, "tid": -914061504, "ts": 1716454222960075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222960076, "dur": 0, "args": { "External id": 102115, "cbid": 203, "correlation": 102115 } }, { "ph": "f", "id": 102115, "pid": 76337, "tid": -914061504, "ts": 1716454222960076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222960076, "dur": 0, "args": { "External id": 102116, "cbid": 205, "correlation": 102116 } }, { "ph": "f", "id": 102116, "pid": 76337, "tid": -914061504, "ts": 1716454222960076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010922, "dur": 5, "args": { "External id": 102120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102120, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102120, "pid": 5, "tid": 7, "ts": 1716454223010922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960093, "dur": 11, "args": { "External id": 102120, "cbid": 211, "correlation": 102120 } }, { "ph": "s", "id": 102120, "pid": 76337, "tid": -914061504, "ts": 1716454222960093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223010929, "dur": 82, "args": { "External id": 102122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102122, "pid": 5, "tid": 7, "ts": 1716454223010929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960108, "dur": 5, "args": { "External id": 102122, "cbid": 211, "correlation": 102122 } }, { "ph": "s", "id": 102122, "pid": 76337, "tid": -914061504, "ts": 1716454222960108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223011014, "dur": 1, "args": { "External id": 102124, "device": 5, "context": 1, "stream": 7, "correlation": 102124, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 102124, "pid": 5, "tid": 7, "ts": 1716454223011014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222960120, "dur": 8, "args": { "External id": 102124, "cbid": 51, "correlation": 102124 } }, { "ph": "s", "id": 102124, "pid": 76337, "tid": -914061504, "ts": 1716454222960120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223011017, "dur": 535, "args": { "External id": 102125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102125, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102125, "pid": 5, "tid": 7, "ts": 1716454223011017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960130, "dur": 9, "args": { "External id": 102125, "cbid": 211, "correlation": 102125 } }, { "ph": "s", "id": 102125, "pid": 76337, "tid": -914061504, "ts": 1716454222960130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223011554, "dur": 12, "args": { "External id": 102127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102127, "pid": 5, "tid": 7, "ts": 1716454223011554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960143, "dur": 5, "args": { "External id": 102127, "cbid": 211, "correlation": 102127 } }, { "ph": "s", "id": 102127, "pid": 76337, "tid": -914061504, "ts": 1716454222960143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223011567, "dur": 14, "args": { "External id": 102133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102133, "pid": 5, "tid": 7, "ts": 1716454223011567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960171, "dur": 8, "args": { "External id": 102133, "cbid": 211, "correlation": 102133 } }, { "ph": "s", "id": 102133, "pid": 76337, "tid": -914061504, "ts": 1716454222960171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223011583, "dur": 3, "args": { "External id": 102141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102141, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 102141, "pid": 5, "tid": 7, "ts": 1716454223011583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960214, "dur": 9, "args": { "External id": 102141, "cbid": 211, "correlation": 102141 } }, { "ph": "s", "id": 102141, "pid": 76337, "tid": -914061504, "ts": 1716454222960214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222960281, "dur": 1, "args": { "External id": 102157, "cbid": 251, "correlation": 102157 } }, { "ph": "f", "id": 102157, "pid": 76337, "tid": -914061504, "ts": 1716454222960281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222960287, "dur": 0, "args": { "External id": 102159, "cbid": 251, "correlation": 102159 } }, { "ph": "f", "id": 102159, "pid": 76337, "tid": -914061504, "ts": 1716454222960287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223011587, "dur": 14, "args": { "External id": 102160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102160, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102160, "pid": 5, "tid": 7, "ts": 1716454223011587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960289, "dur": 11, "args": { "External id": 102160, "cbid": 211, "correlation": 102160 } }, { "ph": "s", "id": 102160, "pid": 76337, "tid": -914061504, "ts": 1716454222960289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223011602, "dur": 5, "args": { "External id": 102162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102162, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102162, "pid": 5, "tid": 7, "ts": 1716454223011602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960302, "dur": 5, "args": { "External id": 102162, "cbid": 211, "correlation": 102162 } }, { "ph": "s", "id": 102162, "pid": 76337, "tid": -914061504, "ts": 1716454222960302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223011609, "dur": 18, "args": { "External id": 102172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102172, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102172, "pid": 5, "tid": 7, "ts": 1716454223011609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960360, "dur": 12, "args": { "External id": 102172, "cbid": 211, "correlation": 102172 } }, { "ph": "s", "id": 102172, "pid": 76337, "tid": -914061504, "ts": 1716454222960360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223011628, "dur": 17, "args": { "External id": 102192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102192, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102192, "pid": 5, "tid": 7, "ts": 1716454223011628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960426, "dur": 12, "args": { "External id": 102192, "cbid": 211, "correlation": 102192 } }, { "ph": "s", "id": 102192, "pid": 76337, "tid": -914061504, "ts": 1716454222960426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223011646, "dur": 4, "args": { "External id": 102204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102204, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 102204, "pid": 5, "tid": 7, "ts": 1716454223011646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960447, "dur": 6, "args": { "External id": 102204, "cbid": 211, "correlation": 102204 } }, { "ph": "s", "id": 102204, "pid": 76337, "tid": -914061504, "ts": 1716454222960447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223011652, "dur": 18, "args": { "External id": 102207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102207, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102207, "pid": 5, "tid": 7, "ts": 1716454223011652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960466, "dur": 6, "args": { "External id": 102207, "cbid": 211, "correlation": 102207 } }, { "ph": "s", "id": 102207, "pid": 76337, "tid": -914061504, "ts": 1716454222960466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223011671, "dur": 10, "args": { "External id": 102216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102216, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102216, "pid": 5, "tid": 7, "ts": 1716454223011671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960506, "dur": 9, "args": { "External id": 102216, "cbid": 211, "correlation": 102216 } }, { "ph": "s", "id": 102216, "pid": 76337, "tid": -914061504, "ts": 1716454222960506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222960568, "dur": 0, "args": { "External id": 102226, "cbid": 317, "correlation": 102226 } }, { "ph": "f", "id": 102226, "pid": 76337, "tid": -914061504, "ts": 1716454222960568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222960569, "dur": 0, "args": { "External id": 102227, "cbid": 203, "correlation": 102227 } }, { "ph": "f", "id": 102227, "pid": 76337, "tid": -914061504, "ts": 1716454222960569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222960570, "dur": 0, "args": { "External id": 102228, "cbid": 205, "correlation": 102228 } }, { "ph": "f", "id": 102228, "pid": 76337, "tid": -914061504, "ts": 1716454222960570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223011682, "dur": 11, "args": { "External id": 102232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102232, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102232, "pid": 5, "tid": 7, "ts": 1716454223011682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960588, "dur": 12, "args": { "External id": 102232, "cbid": 211, "correlation": 102232 } }, { "ph": "s", "id": 102232, "pid": 76337, "tid": -914061504, "ts": 1716454222960588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223011694, "dur": 161, "args": { "External id": 102234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102234, "pid": 5, "tid": 7, "ts": 1716454223011694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960602, "dur": 5, "args": { "External id": 102234, "cbid": 211, "correlation": 102234 } }, { "ph": "s", "id": 102234, "pid": 76337, "tid": -914061504, "ts": 1716454222960602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223011858, "dur": 1, "args": { "External id": 102236, "device": 5, "context": 1, "stream": 7, "correlation": 102236, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 102236, "pid": 5, "tid": 7, "ts": 1716454223011858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222960614, "dur": 7, "args": { "External id": 102236, "cbid": 51, "correlation": 102236 } }, { "ph": "s", "id": 102236, "pid": 76337, "tid": -914061504, "ts": 1716454222960614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223011861, "dur": 655, "args": { "External id": 102237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102237, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102237, "pid": 5, "tid": 7, "ts": 1716454223011861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960622, "dur": 7, "args": { "External id": 102237, "cbid": 211, "correlation": 102237 } }, { "ph": "s", "id": 102237, "pid": 76337, "tid": -914061504, "ts": 1716454222960622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223012518, "dur": 11, "args": { "External id": 102239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102239, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102239, "pid": 5, "tid": 7, "ts": 1716454223012518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960634, "dur": 6, "args": { "External id": 102239, "cbid": 211, "correlation": 102239 } }, { "ph": "s", "id": 102239, "pid": 76337, "tid": -914061504, "ts": 1716454222960634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223012530, "dur": 14, "args": { "External id": 102245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102245, "pid": 5, "tid": 7, "ts": 1716454223012530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960662, "dur": 8, "args": { "External id": 102245, "cbid": 211, "correlation": 102245 } }, { "ph": "s", "id": 102245, "pid": 76337, "tid": -914061504, "ts": 1716454222960662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222960720, "dur": 0, "args": { "External id": 102255, "cbid": 317, "correlation": 102255 } }, { "ph": "f", "id": 102255, "pid": 76337, "tid": -914061504, "ts": 1716454222960720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222960720, "dur": 0, "args": { "External id": 102256, "cbid": 203, "correlation": 102256 } }, { "ph": "f", "id": 102256, "pid": 76337, "tid": -914061504, "ts": 1716454222960720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222960721, "dur": 0, "args": { "External id": 102257, "cbid": 205, "correlation": 102257 } }, { "ph": "f", "id": 102257, "pid": 76337, "tid": -914061504, "ts": 1716454222960721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223012546, "dur": 8, "args": { "External id": 102261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102261, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102261, "pid": 5, "tid": 7, "ts": 1716454223012546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960736, "dur": 11, "args": { "External id": 102261, "cbid": 211, "correlation": 102261 } }, { "ph": "s", "id": 102261, "pid": 76337, "tid": -914061504, "ts": 1716454222960736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223012556, "dur": 3, "args": { "External id": 102263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102263, "pid": 5, "tid": 7, "ts": 1716454223012556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960753, "dur": 6, "args": { "External id": 102263, "cbid": 211, "correlation": 102263 } }, { "ph": "s", "id": 102263, "pid": 76337, "tid": -914061504, "ts": 1716454222960753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222960762, "dur": 0, "args": { "External id": 102264, "cbid": 51, "correlation": 102264 } }, { "ph": "s", "id": 102264, "pid": 76337, "tid": -914061504, "ts": 1716454222960762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223012560, "dur": 56, "args": { "External id": 102265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102265, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 102265, "pid": 5, "tid": 7, "ts": 1716454223012560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960763, "dur": 5, "args": { "External id": 102265, "cbid": 211, "correlation": 102265 } }, { "ph": "s", "id": 102265, "pid": 76337, "tid": -914061504, "ts": 1716454222960763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223012618, "dur": 14, "args": { "External id": 102270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102270, "pid": 5, "tid": 7, "ts": 1716454223012618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960789, "dur": 8, "args": { "External id": 102270, "cbid": 211, "correlation": 102270 } }, { "ph": "s", "id": 102270, "pid": 76337, "tid": -914061504, "ts": 1716454222960789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223012633, "dur": 11, "args": { "External id": 102278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102278, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102278, "pid": 5, "tid": 7, "ts": 1716454223012633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960816, "dur": 7, "args": { "External id": 102278, "cbid": 211, "correlation": 102278 } }, { "ph": "s", "id": 102278, "pid": 76337, "tid": -914061504, "ts": 1716454222960816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223012645, "dur": 10, "args": { "External id": 102286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102286, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102286, "pid": 5, "tid": 7, "ts": 1716454223012645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960846, "dur": 8, "args": { "External id": 102286, "cbid": 211, "correlation": 102286 } }, { "ph": "s", "id": 102286, "pid": 76337, "tid": -914061504, "ts": 1716454222960846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223012656, "dur": 18, "args": { "External id": 102306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102306, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102306, "pid": 5, "tid": 7, "ts": 1716454223012656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960925, "dur": 13, "args": { "External id": 102306, "cbid": 211, "correlation": 102306 } }, { "ph": "s", "id": 102306, "pid": 76337, "tid": -914061504, "ts": 1716454222960925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223012675, "dur": 5, "args": { "External id": 102318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102318, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 102318, "pid": 5, "tid": 7, "ts": 1716454223012675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960948, "dur": 6, "args": { "External id": 102318, "cbid": 211, "correlation": 102318 } }, { "ph": "s", "id": 102318, "pid": 76337, "tid": -914061504, "ts": 1716454222960948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223012681, "dur": 17, "args": { "External id": 102321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102321, "pid": 5, "tid": 7, "ts": 1716454223012681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222960966, "dur": 6, "args": { "External id": 102321, "cbid": 211, "correlation": 102321 } }, { "ph": "s", "id": 102321, "pid": 76337, "tid": -914061504, "ts": 1716454222960966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222961032, "dur": 0, "args": { "External id": 102332, "cbid": 317, "correlation": 102332 } }, { "ph": "f", "id": 102332, "pid": 76337, "tid": -914061504, "ts": 1716454222961032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222961033, "dur": 0, "args": { "External id": 102333, "cbid": 203, "correlation": 102333 } }, { "ph": "f", "id": 102333, "pid": 76337, "tid": -914061504, "ts": 1716454222961033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222961034, "dur": 0, "args": { "External id": 102334, "cbid": 205, "correlation": 102334 } }, { "ph": "f", "id": 102334, "pid": 76337, "tid": -914061504, "ts": 1716454222961034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223012700, "dur": 12, "args": { "External id": 102338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102338, "pid": 5, "tid": 7, "ts": 1716454223012700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961050, "dur": 12, "args": { "External id": 102338, "cbid": 211, "correlation": 102338 } }, { "ph": "s", "id": 102338, "pid": 76337, "tid": -914061504, "ts": 1716454222961050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223012713, "dur": 3, "args": { "External id": 102340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102340, "pid": 5, "tid": 7, "ts": 1716454223012713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961066, "dur": 6, "args": { "External id": 102340, "cbid": 211, "correlation": 102340 } }, { "ph": "s", "id": 102340, "pid": 76337, "tid": -914061504, "ts": 1716454222961066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222961076, "dur": 0, "args": { "External id": 102341, "cbid": 51, "correlation": 102341 } }, { "ph": "s", "id": 102341, "pid": 76337, "tid": -914061504, "ts": 1716454222961076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223012718, "dur": 95, "args": { "External id": 102342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102342, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 102342, "pid": 5, "tid": 7, "ts": 1716454223012718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961077, "dur": 7, "args": { "External id": 102342, "cbid": 211, "correlation": 102342 } }, { "ph": "s", "id": 102342, "pid": 76337, "tid": -914061504, "ts": 1716454222961077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223012814, "dur": 15, "args": { "External id": 102347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102347, "pid": 5, "tid": 7, "ts": 1716454223012814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961105, "dur": 8, "args": { "External id": 102347, "cbid": 211, "correlation": 102347 } }, { "ph": "s", "id": 102347, "pid": 76337, "tid": -914061504, "ts": 1716454222961105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223012831, "dur": 83, "args": { "External id": 102356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102356, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102356, "pid": 5, "tid": 7, "ts": 1716454223012831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961188, "dur": 14, "args": { "External id": 102356, "cbid": 211, "correlation": 102356 } }, { "ph": "s", "id": 102356, "pid": 76337, "tid": -914061504, "ts": 1716454222961188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223012915, "dur": 31, "args": { "External id": 102378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102378, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102378, "pid": 5, "tid": 7, "ts": 1716454223012915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961244, "dur": 10, "args": { "External id": 102378, "cbid": 211, "correlation": 102378 } }, { "ph": "s", "id": 102378, "pid": 76337, "tid": -914061504, "ts": 1716454222961244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222961338, "dur": 1, "args": { "External id": 102389, "cbid": 251, "correlation": 102389 } }, { "ph": "f", "id": 102389, "pid": 76337, "tid": -914061504, "ts": 1716454222961338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223012947, "dur": 163, "args": { "External id": 102390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102390, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102390, "pid": 5, "tid": 7, "ts": 1716454223012947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961343, "dur": 13, "args": { "External id": 102390, "cbid": 211, "correlation": 102390 } }, { "ph": "s", "id": 102390, "pid": 76337, "tid": -914061504, "ts": 1716454222961343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222961413, "dur": 1, "args": { "External id": 102401, "cbid": 251, "correlation": 102401 } }, { "ph": "f", "id": 102401, "pid": 76337, "tid": -914061504, "ts": 1716454222961413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223013110, "dur": 158, "args": { "External id": 102402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102402, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102402, "pid": 5, "tid": 7, "ts": 1716454223013110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961417, "dur": 11, "args": { "External id": 102402, "cbid": 211, "correlation": 102402 } }, { "ph": "s", "id": 102402, "pid": 76337, "tid": -914061504, "ts": 1716454222961417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222961484, "dur": 1, "args": { "External id": 102413, "cbid": 251, "correlation": 102413 } }, { "ph": "f", "id": 102413, "pid": 76337, "tid": -914061504, "ts": 1716454222961484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223013270, "dur": 156, "args": { "External id": 102414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102414, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102414, "pid": 5, "tid": 7, "ts": 1716454223013270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961488, "dur": 11, "args": { "External id": 102414, "cbid": 211, "correlation": 102414 } }, { "ph": "s", "id": 102414, "pid": 76337, "tid": -914061504, "ts": 1716454222961488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223013427, "dur": 332, "args": { "External id": 102439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102439, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102439, "pid": 5, "tid": 7, "ts": 1716454223013427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961576, "dur": 14, "args": { "External id": 102439, "cbid": 211, "correlation": 102439 } }, { "ph": "s", "id": 102439, "pid": 76337, "tid": -914061504, "ts": 1716454222961576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222961677, "dur": 1, "args": { "External id": 102457, "cbid": 251, "correlation": 102457 } }, { "ph": "f", "id": 102457, "pid": 76337, "tid": -914061504, "ts": 1716454222961677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223013760, "dur": 164, "args": { "External id": 102459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102459, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102459, "pid": 5, "tid": 7, "ts": 1716454223013760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961683, "dur": 13, "args": { "External id": 102459, "cbid": 211, "correlation": 102459 } }, { "ph": "s", "id": 102459, "pid": 76337, "tid": -914061504, "ts": 1716454222961683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223013926, "dur": 19, "args": { "External id": 102467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102467, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102467, "pid": 5, "tid": 7, "ts": 1716454223013926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961754, "dur": 12, "args": { "External id": 102467, "cbid": 211, "correlation": 102467 } }, { "ph": "s", "id": 102467, "pid": 76337, "tid": -914061504, "ts": 1716454222961754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223013946, "dur": 29, "args": { "External id": 102475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102475, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102475, "pid": 5, "tid": 7, "ts": 1716454223013946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961792, "dur": 8, "args": { "External id": 102475, "cbid": 211, "correlation": 102475 } }, { "ph": "s", "id": 102475, "pid": 76337, "tid": -914061504, "ts": 1716454222961792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223013976, "dur": 18, "args": { "External id": 102486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102486, "pid": 5, "tid": 7, "ts": 1716454223013976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961864, "dur": 12, "args": { "External id": 102486, "cbid": 211, "correlation": 102486 } }, { "ph": "s", "id": 102486, "pid": 76337, "tid": -914061504, "ts": 1716454222961864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223013995, "dur": 16, "args": { "External id": 102508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102508, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102508, "pid": 5, "tid": 7, "ts": 1716454223013995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961896, "dur": 8, "args": { "External id": 102508, "cbid": 211, "correlation": 102508 } }, { "ph": "s", "id": 102508, "pid": 76337, "tid": -914061504, "ts": 1716454222961896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222961991, "dur": 2, "args": { "External id": 102519, "cbid": 251, "correlation": 102519 } }, { "ph": "f", "id": 102519, "pid": 76337, "tid": -914061504, "ts": 1716454222961991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223014012, "dur": 88, "args": { "External id": 102520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102520, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102520, "pid": 5, "tid": 7, "ts": 1716454223014012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222961998, "dur": 15, "args": { "External id": 102520, "cbid": 211, "correlation": 102520 } }, { "ph": "s", "id": 102520, "pid": 76337, "tid": -914061504, "ts": 1716454222961998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962071, "dur": 1, "args": { "External id": 102531, "cbid": 251, "correlation": 102531 } }, { "ph": "f", "id": 102531, "pid": 76337, "tid": -914061504, "ts": 1716454222962071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962075, "dur": 0, "args": { "External id": 102532, "cbid": 251, "correlation": 102532 } }, { "ph": "f", "id": 102532, "pid": 76337, "tid": -914061504, "ts": 1716454222962075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223014102, "dur": 12, "args": { "External id": 102533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102533, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102533, "pid": 5, "tid": 7, "ts": 1716454223014102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962076, "dur": 12, "args": { "External id": 102533, "cbid": 211, "correlation": 102533 } }, { "ph": "s", "id": 102533, "pid": 76337, "tid": -914061504, "ts": 1716454222962076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223014115, "dur": 6, "args": { "External id": 102535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102535, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102535, "pid": 5, "tid": 7, "ts": 1716454223014115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962091, "dur": 6, "args": { "External id": 102535, "cbid": 211, "correlation": 102535 } }, { "ph": "s", "id": 102535, "pid": 76337, "tid": -914061504, "ts": 1716454222962091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962149, "dur": 1, "args": { "External id": 102546, "cbid": 251, "correlation": 102546 } }, { "ph": "f", "id": 102546, "pid": 76337, "tid": -914061504, "ts": 1716454222962149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962152, "dur": 0, "args": { "External id": 102547, "cbid": 251, "correlation": 102547 } }, { "ph": "f", "id": 102547, "pid": 76337, "tid": -914061504, "ts": 1716454222962152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223014122, "dur": 8, "args": { "External id": 102548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102548, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102548, "pid": 5, "tid": 7, "ts": 1716454223014122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962153, "dur": 11, "args": { "External id": 102548, "cbid": 211, "correlation": 102548 } }, { "ph": "s", "id": 102548, "pid": 76337, "tid": -914061504, "ts": 1716454222962153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223014131, "dur": 3, "args": { "External id": 102550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102550, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102550, "pid": 5, "tid": 7, "ts": 1716454223014131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962166, "dur": 6, "args": { "External id": 102550, "cbid": 211, "correlation": 102550 } }, { "ph": "s", "id": 102550, "pid": 76337, "tid": -914061504, "ts": 1716454222962166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223014136, "dur": 54, "args": { "External id": 102575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102575, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102575, "pid": 5, "tid": 7, "ts": 1716454223014136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962244, "dur": 12, "args": { "External id": 102575, "cbid": 211, "correlation": 102575 } }, { "ph": "s", "id": 102575, "pid": 76337, "tid": -914061504, "ts": 1716454222962244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962344, "dur": 2, "args": { "External id": 102593, "cbid": 251, "correlation": 102593 } }, { "ph": "f", "id": 102593, "pid": 76337, "tid": -914061504, "ts": 1716454222962344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223014191, "dur": 90, "args": { "External id": 102595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102595, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102595, "pid": 5, "tid": 7, "ts": 1716454223014191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962351, "dur": 14, "args": { "External id": 102595, "cbid": 211, "correlation": 102595 } }, { "ph": "s", "id": 102595, "pid": 76337, "tid": -914061504, "ts": 1716454222962351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223014282, "dur": 9, "args": { "External id": 102603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102603, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102603, "pid": 5, "tid": 7, "ts": 1716454223014282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962421, "dur": 12, "args": { "External id": 102603, "cbid": 211, "correlation": 102603 } }, { "ph": "s", "id": 102603, "pid": 76337, "tid": -914061504, "ts": 1716454222962421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223014293, "dur": 22, "args": { "External id": 102611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102611, "pid": 5, "tid": 7, "ts": 1716454223014293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962463, "dur": 9, "args": { "External id": 102611, "cbid": 211, "correlation": 102611 } }, { "ph": "s", "id": 102611, "pid": 76337, "tid": -914061504, "ts": 1716454222962463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223014316, "dur": 18, "args": { "External id": 102633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102633, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102633, "pid": 5, "tid": 7, "ts": 1716454223014316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962515, "dur": 10, "args": { "External id": 102633, "cbid": 211, "correlation": 102633 } }, { "ph": "s", "id": 102633, "pid": 76337, "tid": -914061504, "ts": 1716454222962515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962605, "dur": 2, "args": { "External id": 102649, "cbid": 251, "correlation": 102649 } }, { "ph": "f", "id": 102649, "pid": 76337, "tid": -914061504, "ts": 1716454222962605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962610, "dur": 0, "args": { "External id": 102651, "cbid": 251, "correlation": 102651 } }, { "ph": "f", "id": 102651, "pid": 76337, "tid": -914061504, "ts": 1716454222962610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223014336, "dur": 490, "args": { "External id": 102652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102652, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102652, "pid": 5, "tid": 7, "ts": 1716454223014336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962613, "dur": 14, "args": { "External id": 102652, "cbid": 211, "correlation": 102652 } }, { "ph": "s", "id": 102652, "pid": 76337, "tid": -914061504, "ts": 1716454222962613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223014827, "dur": 65, "args": { "External id": 102660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102660, "pid": 5, "tid": 7, "ts": 1716454223014827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962678, "dur": 12, "args": { "External id": 102660, "cbid": 211, "correlation": 102660 } }, { "ph": "s", "id": 102660, "pid": 76337, "tid": -914061504, "ts": 1716454222962678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223014893, "dur": 69, "args": { "External id": 102668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102668, "pid": 5, "tid": 7, "ts": 1716454223014893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962710, "dur": 8, "args": { "External id": 102668, "cbid": 211, "correlation": 102668 } }, { "ph": "s", "id": 102668, "pid": 76337, "tid": -914061504, "ts": 1716454222962710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222962791, "dur": 1, "args": { "External id": 102684, "cbid": 251, "correlation": 102684 } }, { "ph": "f", "id": 102684, "pid": 76337, "tid": -914061504, "ts": 1716454222962791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223014964, "dur": 1, "args": { "External id": 102686, "device": 5, "context": 1, "stream": 7, "correlation": 102686, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 102686, "pid": 5, "tid": 7, "ts": 1716454223014964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222962797, "dur": 11, "args": { "External id": 102686, "cbid": 51, "correlation": 102686 } }, { "ph": "s", "id": 102686, "pid": 76337, "tid": -914061504, "ts": 1716454222962797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223014968, "dur": 266, "args": { "External id": 102687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102687, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102687, "pid": 5, "tid": 7, "ts": 1716454223014968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962809, "dur": 11, "args": { "External id": 102687, "cbid": 211, "correlation": 102687 } }, { "ph": "s", "id": 102687, "pid": 76337, "tid": -914061504, "ts": 1716454222962809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223015236, "dur": 14, "args": { "External id": 102695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102695, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102695, "pid": 5, "tid": 7, "ts": 1716454223015236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962851, "dur": 10, "args": { "External id": 102695, "cbid": 211, "correlation": 102695 } }, { "ph": "s", "id": 102695, "pid": 76337, "tid": -914061504, "ts": 1716454222962851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223015251, "dur": 37, "args": { "External id": 102706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102706, "pid": 5, "tid": 7, "ts": 1716454223015251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222962918, "dur": 12, "args": { "External id": 102706, "cbid": 211, "correlation": 102706 } }, { "ph": "s", "id": 102706, "pid": 76337, "tid": -914061504, "ts": 1716454222962918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222962990, "dur": 0, "args": { "External id": 102718, "cbid": 317, "correlation": 102718 } }, { "ph": "f", "id": 102718, "pid": 76337, "tid": -914061504, "ts": 1716454222962990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222962991, "dur": 0, "args": { "External id": 102719, "cbid": 203, "correlation": 102719 } }, { "ph": "f", "id": 102719, "pid": 76337, "tid": -914061504, "ts": 1716454222962991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222962992, "dur": 0, "args": { "External id": 102720, "cbid": 205, "correlation": 102720 } }, { "ph": "f", "id": 102720, "pid": 76337, "tid": -914061504, "ts": 1716454222962992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223015289, "dur": 14, "args": { "External id": 102724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102724, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102724, "pid": 5, "tid": 7, "ts": 1716454223015289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963008, "dur": 12, "args": { "External id": 102724, "cbid": 211, "correlation": 102724 } }, { "ph": "s", "id": 102724, "pid": 76337, "tid": -914061504, "ts": 1716454222963008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223015304, "dur": 4, "args": { "External id": 102726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102726, "pid": 5, "tid": 7, "ts": 1716454223015304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963025, "dur": 6, "args": { "External id": 102726, "cbid": 211, "correlation": 102726 } }, { "ph": "s", "id": 102726, "pid": 76337, "tid": -914061504, "ts": 1716454222963025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222963033, "dur": 0, "args": { "External id": 102727, "cbid": 51, "correlation": 102727 } }, { "ph": "s", "id": 102727, "pid": 76337, "tid": -914061504, "ts": 1716454222963033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223015309, "dur": 96, "args": { "External id": 102728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102728, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 102728, "pid": 5, "tid": 7, "ts": 1716454223015309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963034, "dur": 5, "args": { "External id": 102728, "cbid": 211, "correlation": 102728 } }, { "ph": "s", "id": 102728, "pid": 76337, "tid": -914061504, "ts": 1716454222963034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223015406, "dur": 16, "args": { "External id": 102733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102733, "pid": 5, "tid": 7, "ts": 1716454223015406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963062, "dur": 8, "args": { "External id": 102733, "cbid": 211, "correlation": 102733 } }, { "ph": "s", "id": 102733, "pid": 76337, "tid": -914061504, "ts": 1716454222963062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223015423, "dur": 13, "args": { "External id": 102741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102741, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102741, "pid": 5, "tid": 7, "ts": 1716454223015423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963095, "dur": 8, "args": { "External id": 102741, "cbid": 211, "correlation": 102741 } }, { "ph": "s", "id": 102741, "pid": 76337, "tid": -914061504, "ts": 1716454222963095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223015437, "dur": 17, "args": { "External id": 102761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102761, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102761, "pid": 5, "tid": 7, "ts": 1716454223015437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963167, "dur": 12, "args": { "External id": 102761, "cbid": 211, "correlation": 102761 } }, { "ph": "s", "id": 102761, "pid": 76337, "tid": -914061504, "ts": 1716454222963167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223015456, "dur": 4, "args": { "External id": 102773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102773, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 102773, "pid": 5, "tid": 7, "ts": 1716454223015456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963189, "dur": 7, "args": { "External id": 102773, "cbid": 211, "correlation": 102773 } }, { "ph": "s", "id": 102773, "pid": 76337, "tid": -914061504, "ts": 1716454222963189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223015461, "dur": 18, "args": { "External id": 102776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102776, "pid": 5, "tid": 7, "ts": 1716454223015461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963207, "dur": 6, "args": { "External id": 102776, "cbid": 211, "correlation": 102776 } }, { "ph": "s", "id": 102776, "pid": 76337, "tid": -914061504, "ts": 1716454222963207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223015480, "dur": 12, "args": { "External id": 102785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102785, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102785, "pid": 5, "tid": 7, "ts": 1716454223015480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963246, "dur": 10, "args": { "External id": 102785, "cbid": 211, "correlation": 102785 } }, { "ph": "s", "id": 102785, "pid": 76337, "tid": -914061504, "ts": 1716454222963246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222963298, "dur": 0, "args": { "External id": 102795, "cbid": 317, "correlation": 102795 } }, { "ph": "f", "id": 102795, "pid": 76337, "tid": -914061504, "ts": 1716454222963298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222963299, "dur": 0, "args": { "External id": 102796, "cbid": 203, "correlation": 102796 } }, { "ph": "f", "id": 102796, "pid": 76337, "tid": -914061504, "ts": 1716454222963299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222963300, "dur": 0, "args": { "External id": 102797, "cbid": 205, "correlation": 102797 } }, { "ph": "f", "id": 102797, "pid": 76337, "tid": -914061504, "ts": 1716454222963300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223015494, "dur": 11, "args": { "External id": 102801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102801, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102801, "pid": 5, "tid": 7, "ts": 1716454223015494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963313, "dur": 11, "args": { "External id": 102801, "cbid": 211, "correlation": 102801 } }, { "ph": "s", "id": 102801, "pid": 76337, "tid": -914061504, "ts": 1716454222963313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223015506, "dur": 160, "args": { "External id": 102803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102803, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102803, "pid": 5, "tid": 7, "ts": 1716454223015506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963327, "dur": 5, "args": { "External id": 102803, "cbid": 211, "correlation": 102803 } }, { "ph": "s", "id": 102803, "pid": 76337, "tid": -914061504, "ts": 1716454222963327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223015668, "dur": 1, "args": { "External id": 102805, "device": 5, "context": 1, "stream": 7, "correlation": 102805, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 102805, "pid": 5, "tid": 7, "ts": 1716454223015668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222963338, "dur": 7, "args": { "External id": 102805, "cbid": 51, "correlation": 102805 } }, { "ph": "s", "id": 102805, "pid": 76337, "tid": -914061504, "ts": 1716454222963338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223015672, "dur": 658, "args": { "External id": 102806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102806, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102806, "pid": 5, "tid": 7, "ts": 1716454223015672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963346, "dur": 6, "args": { "External id": 102806, "cbid": 211, "correlation": 102806 } }, { "ph": "s", "id": 102806, "pid": 76337, "tid": -914061504, "ts": 1716454222963346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223016331, "dur": 13, "args": { "External id": 102808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102808, "pid": 5, "tid": 7, "ts": 1716454223016331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963356, "dur": 5, "args": { "External id": 102808, "cbid": 211, "correlation": 102808 } }, { "ph": "s", "id": 102808, "pid": 76337, "tid": -914061504, "ts": 1716454222963356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223016346, "dur": 15, "args": { "External id": 102814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102814, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102814, "pid": 5, "tid": 7, "ts": 1716454223016346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963384, "dur": 9, "args": { "External id": 102814, "cbid": 211, "correlation": 102814 } }, { "ph": "s", "id": 102814, "pid": 76337, "tid": -914061504, "ts": 1716454222963384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223016362, "dur": 3, "args": { "External id": 102822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102822, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 102822, "pid": 5, "tid": 7, "ts": 1716454223016362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963428, "dur": 9, "args": { "External id": 102822, "cbid": 211, "correlation": 102822 } }, { "ph": "s", "id": 102822, "pid": 76337, "tid": -914061504, "ts": 1716454222963428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222963495, "dur": 1, "args": { "External id": 102838, "cbid": 251, "correlation": 102838 } }, { "ph": "f", "id": 102838, "pid": 76337, "tid": -914061504, "ts": 1716454222963495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222963500, "dur": 0, "args": { "External id": 102840, "cbid": 251, "correlation": 102840 } }, { "ph": "f", "id": 102840, "pid": 76337, "tid": -914061504, "ts": 1716454222963500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223016367, "dur": 14, "args": { "External id": 102841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102841, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102841, "pid": 5, "tid": 7, "ts": 1716454223016367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963502, "dur": 11, "args": { "External id": 102841, "cbid": 211, "correlation": 102841 } }, { "ph": "s", "id": 102841, "pid": 76337, "tid": -914061504, "ts": 1716454222963502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223016381, "dur": 5, "args": { "External id": 102843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102843, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102843, "pid": 5, "tid": 7, "ts": 1716454223016381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963515, "dur": 6, "args": { "External id": 102843, "cbid": 211, "correlation": 102843 } }, { "ph": "s", "id": 102843, "pid": 76337, "tid": -914061504, "ts": 1716454222963515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223016388, "dur": 17, "args": { "External id": 102853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102853, "pid": 5, "tid": 7, "ts": 1716454223016388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963573, "dur": 12, "args": { "External id": 102853, "cbid": 211, "correlation": 102853 } }, { "ph": "s", "id": 102853, "pid": 76337, "tid": -914061504, "ts": 1716454222963573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223016406, "dur": 17, "args": { "External id": 102873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102873, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102873, "pid": 5, "tid": 7, "ts": 1716454223016406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963639, "dur": 11, "args": { "External id": 102873, "cbid": 211, "correlation": 102873 } }, { "ph": "s", "id": 102873, "pid": 76337, "tid": -914061504, "ts": 1716454222963639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223016425, "dur": 4, "args": { "External id": 102885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102885, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 102885, "pid": 5, "tid": 7, "ts": 1716454223016425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963660, "dur": 6, "args": { "External id": 102885, "cbid": 211, "correlation": 102885 } }, { "ph": "s", "id": 102885, "pid": 76337, "tid": -914061504, "ts": 1716454222963660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223016430, "dur": 17, "args": { "External id": 102888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102888, "pid": 5, "tid": 7, "ts": 1716454223016430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963679, "dur": 6, "args": { "External id": 102888, "cbid": 211, "correlation": 102888 } }, { "ph": "s", "id": 102888, "pid": 76337, "tid": -914061504, "ts": 1716454222963679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223016448, "dur": 10, "args": { "External id": 102897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102897, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102897, "pid": 5, "tid": 7, "ts": 1716454223016448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963719, "dur": 9, "args": { "External id": 102897, "cbid": 211, "correlation": 102897 } }, { "ph": "s", "id": 102897, "pid": 76337, "tid": -914061504, "ts": 1716454222963719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222963780, "dur": 0, "args": { "External id": 102907, "cbid": 317, "correlation": 102907 } }, { "ph": "f", "id": 102907, "pid": 76337, "tid": -914061504, "ts": 1716454222963780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222963781, "dur": 0, "args": { "External id": 102908, "cbid": 203, "correlation": 102908 } }, { "ph": "f", "id": 102908, "pid": 76337, "tid": -914061504, "ts": 1716454222963781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222963782, "dur": 0, "args": { "External id": 102909, "cbid": 205, "correlation": 102909 } }, { "ph": "f", "id": 102909, "pid": 76337, "tid": -914061504, "ts": 1716454222963782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223016460, "dur": 10, "args": { "External id": 102913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102913, "pid": 5, "tid": 7, "ts": 1716454223016460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963796, "dur": 12, "args": { "External id": 102913, "cbid": 211, "correlation": 102913 } }, { "ph": "s", "id": 102913, "pid": 76337, "tid": -914061504, "ts": 1716454222963796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223016471, "dur": 159, "args": { "External id": 102915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102915, "pid": 5, "tid": 7, "ts": 1716454223016471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963811, "dur": 5, "args": { "External id": 102915, "cbid": 211, "correlation": 102915 } }, { "ph": "s", "id": 102915, "pid": 76337, "tid": -914061504, "ts": 1716454222963811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223016633, "dur": 1, "args": { "External id": 102917, "device": 5, "context": 1, "stream": 7, "correlation": 102917, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 102917, "pid": 5, "tid": 7, "ts": 1716454223016633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222963822, "dur": 8, "args": { "External id": 102917, "cbid": 51, "correlation": 102917 } }, { "ph": "s", "id": 102917, "pid": 76337, "tid": -914061504, "ts": 1716454222963822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223016637, "dur": 640, "args": { "External id": 102918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102918, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 102918, "pid": 5, "tid": 7, "ts": 1716454223016637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963831, "dur": 6, "args": { "External id": 102918, "cbid": 211, "correlation": 102918 } }, { "ph": "s", "id": 102918, "pid": 76337, "tid": -914061504, "ts": 1716454222963831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223017279, "dur": 13, "args": { "External id": 102920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102920, "pid": 5, "tid": 7, "ts": 1716454223017279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963841, "dur": 6, "args": { "External id": 102920, "cbid": 211, "correlation": 102920 } }, { "ph": "s", "id": 102920, "pid": 76337, "tid": -914061504, "ts": 1716454222963841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223017293, "dur": 15, "args": { "External id": 102926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102926, "pid": 5, "tid": 7, "ts": 1716454223017293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963870, "dur": 9, "args": { "External id": 102926, "cbid": 211, "correlation": 102926 } }, { "ph": "s", "id": 102926, "pid": 76337, "tid": -914061504, "ts": 1716454222963870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223017309, "dur": 12, "args": { "External id": 102934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102934, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102934, "pid": 5, "tid": 7, "ts": 1716454223017309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963902, "dur": 8, "args": { "External id": 102934, "cbid": 211, "correlation": 102934 } }, { "ph": "s", "id": 102934, "pid": 76337, "tid": -914061504, "ts": 1716454222963902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223017323, "dur": 10, "args": { "External id": 102942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102942, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102942, "pid": 5, "tid": 7, "ts": 1716454223017323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222963930, "dur": 8, "args": { "External id": 102942, "cbid": 211, "correlation": 102942 } }, { "ph": "s", "id": 102942, "pid": 76337, "tid": -914061504, "ts": 1716454222963930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223017334, "dur": 18, "args": { "External id": 102962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102962, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 102962, "pid": 5, "tid": 7, "ts": 1716454223017334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964020, "dur": 12, "args": { "External id": 102962, "cbid": 211, "correlation": 102962 } }, { "ph": "s", "id": 102962, "pid": 76337, "tid": -914061504, "ts": 1716454222964020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223017353, "dur": 4, "args": { "External id": 102974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102974, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 102974, "pid": 5, "tid": 7, "ts": 1716454223017353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964042, "dur": 7, "args": { "External id": 102974, "cbid": 211, "correlation": 102974 } }, { "ph": "s", "id": 102974, "pid": 76337, "tid": -914061504, "ts": 1716454222964042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223017358, "dur": 16, "args": { "External id": 102977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102977, "pid": 5, "tid": 7, "ts": 1716454223017358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964060, "dur": 6, "args": { "External id": 102977, "cbid": 211, "correlation": 102977 } }, { "ph": "s", "id": 102977, "pid": 76337, "tid": -914061504, "ts": 1716454222964060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222964117, "dur": 0, "args": { "External id": 102988, "cbid": 317, "correlation": 102988 } }, { "ph": "f", "id": 102988, "pid": 76337, "tid": -914061504, "ts": 1716454222964117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222964118, "dur": 0, "args": { "External id": 102989, "cbid": 203, "correlation": 102989 } }, { "ph": "f", "id": 102989, "pid": 76337, "tid": -914061504, "ts": 1716454222964118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222964119, "dur": 0, "args": { "External id": 102990, "cbid": 205, "correlation": 102990 } }, { "ph": "f", "id": 102990, "pid": 76337, "tid": -914061504, "ts": 1716454222964119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223017376, "dur": 11, "args": { "External id": 102994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102994, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 102994, "pid": 5, "tid": 7, "ts": 1716454223017376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964132, "dur": 12, "args": { "External id": 102994, "cbid": 211, "correlation": 102994 } }, { "ph": "s", "id": 102994, "pid": 76337, "tid": -914061504, "ts": 1716454222964132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223017388, "dur": 4, "args": { "External id": 102996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 102996, "pid": 5, "tid": 7, "ts": 1716454223017388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964150, "dur": 6, "args": { "External id": 102996, "cbid": 211, "correlation": 102996 } }, { "ph": "s", "id": 102996, "pid": 76337, "tid": -914061504, "ts": 1716454222964150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222964158, "dur": 0, "args": { "External id": 102997, "cbid": 51, "correlation": 102997 } }, { "ph": "s", "id": 102997, "pid": 76337, "tid": -914061504, "ts": 1716454222964158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223017393, "dur": 93, "args": { "External id": 102998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 102998, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 102998, "pid": 5, "tid": 7, "ts": 1716454223017393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964159, "dur": 5, "args": { "External id": 102998, "cbid": 211, "correlation": 102998 } }, { "ph": "s", "id": 102998, "pid": 76337, "tid": -914061504, "ts": 1716454222964159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223017488, "dur": 15, "args": { "External id": 103003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103003, "pid": 5, "tid": 7, "ts": 1716454223017488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964185, "dur": 9, "args": { "External id": 103003, "cbid": 211, "correlation": 103003 } }, { "ph": "s", "id": 103003, "pid": 76337, "tid": -914061504, "ts": 1716454222964185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223017504, "dur": 81, "args": { "External id": 103012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103012, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103012, "pid": 5, "tid": 7, "ts": 1716454223017504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964268, "dur": 14, "args": { "External id": 103012, "cbid": 211, "correlation": 103012 } }, { "ph": "s", "id": 103012, "pid": 76337, "tid": -914061504, "ts": 1716454222964268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223017586, "dur": 30, "args": { "External id": 103034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103034, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103034, "pid": 5, "tid": 7, "ts": 1716454223017586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964324, "dur": 10, "args": { "External id": 103034, "cbid": 211, "correlation": 103034 } }, { "ph": "s", "id": 103034, "pid": 76337, "tid": -914061504, "ts": 1716454222964324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222964412, "dur": 1, "args": { "External id": 103045, "cbid": 251, "correlation": 103045 } }, { "ph": "f", "id": 103045, "pid": 76337, "tid": -914061504, "ts": 1716454222964412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223017618, "dur": 141, "args": { "External id": 103046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103046, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103046, "pid": 5, "tid": 7, "ts": 1716454223017618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964417, "dur": 12, "args": { "External id": 103046, "cbid": 211, "correlation": 103046 } }, { "ph": "s", "id": 103046, "pid": 76337, "tid": -914061504, "ts": 1716454222964417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222964486, "dur": 1, "args": { "External id": 103057, "cbid": 251, "correlation": 103057 } }, { "ph": "f", "id": 103057, "pid": 76337, "tid": -914061504, "ts": 1716454222964486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223017760, "dur": 155, "args": { "External id": 103058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103058, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103058, "pid": 5, "tid": 7, "ts": 1716454223017760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964490, "dur": 12, "args": { "External id": 103058, "cbid": 211, "correlation": 103058 } }, { "ph": "s", "id": 103058, "pid": 76337, "tid": -914061504, "ts": 1716454222964490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222964556, "dur": 1, "args": { "External id": 103069, "cbid": 251, "correlation": 103069 } }, { "ph": "f", "id": 103069, "pid": 76337, "tid": -914061504, "ts": 1716454222964556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223017916, "dur": 156, "args": { "External id": 103070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103070, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103070, "pid": 5, "tid": 7, "ts": 1716454223017916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964560, "dur": 11, "args": { "External id": 103070, "cbid": 211, "correlation": 103070 } }, { "ph": "s", "id": 103070, "pid": 76337, "tid": -914061504, "ts": 1716454222964560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223018074, "dur": 330, "args": { "External id": 103095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103095, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103095, "pid": 5, "tid": 7, "ts": 1716454223018074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964644, "dur": 13, "args": { "External id": 103095, "cbid": 211, "correlation": 103095 } }, { "ph": "s", "id": 103095, "pid": 76337, "tid": -914061504, "ts": 1716454222964644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222964742, "dur": 1, "args": { "External id": 103113, "cbid": 251, "correlation": 103113 } }, { "ph": "f", "id": 103113, "pid": 76337, "tid": -914061504, "ts": 1716454222964742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223018405, "dur": 163, "args": { "External id": 103115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103115, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103115, "pid": 5, "tid": 7, "ts": 1716454223018405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964748, "dur": 14, "args": { "External id": 103115, "cbid": 211, "correlation": 103115 } }, { "ph": "s", "id": 103115, "pid": 76337, "tid": -914061504, "ts": 1716454222964748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223018570, "dur": 19, "args": { "External id": 103123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103123, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103123, "pid": 5, "tid": 7, "ts": 1716454223018570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964819, "dur": 12, "args": { "External id": 103123, "cbid": 211, "correlation": 103123 } }, { "ph": "s", "id": 103123, "pid": 76337, "tid": -914061504, "ts": 1716454222964819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223018591, "dur": 27, "args": { "External id": 103131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103131, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103131, "pid": 5, "tid": 7, "ts": 1716454223018591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964858, "dur": 9, "args": { "External id": 103131, "cbid": 211, "correlation": 103131 } }, { "ph": "s", "id": 103131, "pid": 76337, "tid": -914061504, "ts": 1716454222964858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223018619, "dur": 18, "args": { "External id": 103142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103142, "pid": 5, "tid": 7, "ts": 1716454223018619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964929, "dur": 12, "args": { "External id": 103142, "cbid": 211, "correlation": 103142 } }, { "ph": "s", "id": 103142, "pid": 76337, "tid": -914061504, "ts": 1716454222964929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223018638, "dur": 16, "args": { "External id": 103164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103164, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103164, "pid": 5, "tid": 7, "ts": 1716454223018638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222964962, "dur": 7, "args": { "External id": 103164, "cbid": 211, "correlation": 103164 } }, { "ph": "s", "id": 103164, "pid": 76337, "tid": -914061504, "ts": 1716454222964962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965055, "dur": 1, "args": { "External id": 103175, "cbid": 251, "correlation": 103175 } }, { "ph": "f", "id": 103175, "pid": 76337, "tid": -914061504, "ts": 1716454222965055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223018655, "dur": 87, "args": { "External id": 103176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103176, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 103176, "pid": 5, "tid": 7, "ts": 1716454223018655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965061, "dur": 14, "args": { "External id": 103176, "cbid": 211, "correlation": 103176 } }, { "ph": "s", "id": 103176, "pid": 76337, "tid": -914061504, "ts": 1716454222965061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965130, "dur": 1, "args": { "External id": 103187, "cbid": 251, "correlation": 103187 } }, { "ph": "f", "id": 103187, "pid": 76337, "tid": -914061504, "ts": 1716454222965130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965133, "dur": 0, "args": { "External id": 103188, "cbid": 251, "correlation": 103188 } }, { "ph": "f", "id": 103188, "pid": 76337, "tid": -914061504, "ts": 1716454222965133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223018744, "dur": 12, "args": { "External id": 103189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103189, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103189, "pid": 5, "tid": 7, "ts": 1716454223018744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965135, "dur": 12, "args": { "External id": 103189, "cbid": 211, "correlation": 103189 } }, { "ph": "s", "id": 103189, "pid": 76337, "tid": -914061504, "ts": 1716454222965135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223018757, "dur": 6, "args": { "External id": 103191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103191, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103191, "pid": 5, "tid": 7, "ts": 1716454223018757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965148, "dur": 6, "args": { "External id": 103191, "cbid": 211, "correlation": 103191 } }, { "ph": "s", "id": 103191, "pid": 76337, "tid": -914061504, "ts": 1716454222965148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965205, "dur": 1, "args": { "External id": 103202, "cbid": 251, "correlation": 103202 } }, { "ph": "f", "id": 103202, "pid": 76337, "tid": -914061504, "ts": 1716454222965205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965209, "dur": 0, "args": { "External id": 103203, "cbid": 251, "correlation": 103203 } }, { "ph": "f", "id": 103203, "pid": 76337, "tid": -914061504, "ts": 1716454222965209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223018764, "dur": 8, "args": { "External id": 103204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103204, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103204, "pid": 5, "tid": 7, "ts": 1716454223018764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965211, "dur": 11, "args": { "External id": 103204, "cbid": 211, "correlation": 103204 } }, { "ph": "s", "id": 103204, "pid": 76337, "tid": -914061504, "ts": 1716454222965211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223018774, "dur": 3, "args": { "External id": 103206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103206, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103206, "pid": 5, "tid": 7, "ts": 1716454223018774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965224, "dur": 5, "args": { "External id": 103206, "cbid": 211, "correlation": 103206 } }, { "ph": "s", "id": 103206, "pid": 76337, "tid": -914061504, "ts": 1716454222965224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223018778, "dur": 54, "args": { "External id": 103231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103231, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103231, "pid": 5, "tid": 7, "ts": 1716454223018778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965300, "dur": 13, "args": { "External id": 103231, "cbid": 211, "correlation": 103231 } }, { "ph": "s", "id": 103231, "pid": 76337, "tid": -914061504, "ts": 1716454222965300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965398, "dur": 1, "args": { "External id": 103249, "cbid": 251, "correlation": 103249 } }, { "ph": "f", "id": 103249, "pid": 76337, "tid": -914061504, "ts": 1716454222965398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223018833, "dur": 88, "args": { "External id": 103251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103251, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 103251, "pid": 5, "tid": 7, "ts": 1716454223018833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965404, "dur": 14, "args": { "External id": 103251, "cbid": 211, "correlation": 103251 } }, { "ph": "s", "id": 103251, "pid": 76337, "tid": -914061504, "ts": 1716454222965404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223018923, "dur": 9, "args": { "External id": 103259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103259, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103259, "pid": 5, "tid": 7, "ts": 1716454223018923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965475, "dur": 12, "args": { "External id": 103259, "cbid": 211, "correlation": 103259 } }, { "ph": "s", "id": 103259, "pid": 76337, "tid": -914061504, "ts": 1716454222965475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223018934, "dur": 20, "args": { "External id": 103267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103267, "pid": 5, "tid": 7, "ts": 1716454223018934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965516, "dur": 9, "args": { "External id": 103267, "cbid": 211, "correlation": 103267 } }, { "ph": "s", "id": 103267, "pid": 76337, "tid": -914061504, "ts": 1716454222965516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223018955, "dur": 17, "args": { "External id": 103289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103289, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103289, "pid": 5, "tid": 7, "ts": 1716454223018955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965567, "dur": 10, "args": { "External id": 103289, "cbid": 211, "correlation": 103289 } }, { "ph": "s", "id": 103289, "pid": 76337, "tid": -914061504, "ts": 1716454222965567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965655, "dur": 1, "args": { "External id": 103305, "cbid": 251, "correlation": 103305 } }, { "ph": "f", "id": 103305, "pid": 76337, "tid": -914061504, "ts": 1716454222965655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965660, "dur": 0, "args": { "External id": 103307, "cbid": 251, "correlation": 103307 } }, { "ph": "f", "id": 103307, "pid": 76337, "tid": -914061504, "ts": 1716454222965660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223018974, "dur": 490, "args": { "External id": 103308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103308, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103308, "pid": 5, "tid": 7, "ts": 1716454223018974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965662, "dur": 12, "args": { "External id": 103308, "cbid": 211, "correlation": 103308 } }, { "ph": "s", "id": 103308, "pid": 76337, "tid": -914061504, "ts": 1716454222965662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223019465, "dur": 65, "args": { "External id": 103316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103316, "pid": 5, "tid": 7, "ts": 1716454223019465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965726, "dur": 12, "args": { "External id": 103316, "cbid": 211, "correlation": 103316 } }, { "ph": "s", "id": 103316, "pid": 76337, "tid": -914061504, "ts": 1716454222965726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223019531, "dur": 67, "args": { "External id": 103324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103324, "pid": 5, "tid": 7, "ts": 1716454223019531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965755, "dur": 9, "args": { "External id": 103324, "cbid": 211, "correlation": 103324 } }, { "ph": "s", "id": 103324, "pid": 76337, "tid": -914061504, "ts": 1716454222965755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222965835, "dur": 1, "args": { "External id": 103340, "cbid": 251, "correlation": 103340 } }, { "ph": "f", "id": 103340, "pid": 76337, "tid": -914061504, "ts": 1716454222965835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223019601, "dur": 1, "args": { "External id": 103342, "device": 5, "context": 1, "stream": 7, "correlation": 103342, "bytes": 240, "memory bandwidth (GB/s)": 0.1561483409238777 } }, { "ph": "f", "id": 103342, "pid": 5, "tid": 7, "ts": 1716454223019601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222965840, "dur": 10, "args": { "External id": 103342, "cbid": 51, "correlation": 103342 } }, { "ph": "s", "id": 103342, "pid": 76337, "tid": -914061504, "ts": 1716454222965840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223019605, "dur": 268, "args": { "External id": 103343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103343, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 103343, "pid": 5, "tid": 7, "ts": 1716454223019605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965851, "dur": 12, "args": { "External id": 103343, "cbid": 211, "correlation": 103343 } }, { "ph": "s", "id": 103343, "pid": 76337, "tid": -914061504, "ts": 1716454222965851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223019874, "dur": 14, "args": { "External id": 103351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103351, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103351, "pid": 5, "tid": 7, "ts": 1716454223019874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965893, "dur": 10, "args": { "External id": 103351, "cbid": 211, "correlation": 103351 } }, { "ph": "s", "id": 103351, "pid": 76337, "tid": -914061504, "ts": 1716454222965893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223019889, "dur": 37, "args": { "External id": 103362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103362, "pid": 5, "tid": 7, "ts": 1716454223019889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222965961, "dur": 20, "args": { "External id": 103362, "cbid": 211, "correlation": 103362 } }, { "ph": "s", "id": 103362, "pid": 76337, "tid": -914061504, "ts": 1716454222965961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222966035, "dur": 0, "args": { "External id": 103374, "cbid": 317, "correlation": 103374 } }, { "ph": "f", "id": 103374, "pid": 76337, "tid": -914061504, "ts": 1716454222966035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222966036, "dur": 0, "args": { "External id": 103375, "cbid": 203, "correlation": 103375 } }, { "ph": "f", "id": 103375, "pid": 76337, "tid": -914061504, "ts": 1716454222966036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222966036, "dur": 0, "args": { "External id": 103376, "cbid": 205, "correlation": 103376 } }, { "ph": "f", "id": 103376, "pid": 76337, "tid": -914061504, "ts": 1716454222966036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223019928, "dur": 13, "args": { "External id": 103380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103380, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103380, "pid": 5, "tid": 7, "ts": 1716454223019928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966051, "dur": 13, "args": { "External id": 103380, "cbid": 211, "correlation": 103380 } }, { "ph": "s", "id": 103380, "pid": 76337, "tid": -914061504, "ts": 1716454222966051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223019942, "dur": 4, "args": { "External id": 103382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 103382, "pid": 5, "tid": 7, "ts": 1716454223019942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966069, "dur": 5, "args": { "External id": 103382, "cbid": 211, "correlation": 103382 } }, { "ph": "s", "id": 103382, "pid": 76337, "tid": -914061504, "ts": 1716454222966069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222966078, "dur": 0, "args": { "External id": 103383, "cbid": 51, "correlation": 103383 } }, { "ph": "s", "id": 103383, "pid": 76337, "tid": -914061504, "ts": 1716454222966078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223019947, "dur": 96, "args": { "External id": 103384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103384, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 103384, "pid": 5, "tid": 7, "ts": 1716454223019947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966078, "dur": 5, "args": { "External id": 103384, "cbid": 211, "correlation": 103384 } }, { "ph": "s", "id": 103384, "pid": 76337, "tid": -914061504, "ts": 1716454222966078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020045, "dur": 16, "args": { "External id": 103389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103389, "pid": 5, "tid": 7, "ts": 1716454223020045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966106, "dur": 9, "args": { "External id": 103389, "cbid": 211, "correlation": 103389 } }, { "ph": "s", "id": 103389, "pid": 76337, "tid": -914061504, "ts": 1716454222966106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223020062, "dur": 11, "args": { "External id": 103397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103397, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103397, "pid": 5, "tid": 7, "ts": 1716454223020062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966138, "dur": 8, "args": { "External id": 103397, "cbid": 211, "correlation": 103397 } }, { "ph": "s", "id": 103397, "pid": 76337, "tid": -914061504, "ts": 1716454222966138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222966208, "dur": 0, "args": { "External id": 103407, "cbid": 317, "correlation": 103407 } }, { "ph": "f", "id": 103407, "pid": 76337, "tid": -914061504, "ts": 1716454222966208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222966209, "dur": 0, "args": { "External id": 103408, "cbid": 203, "correlation": 103408 } }, { "ph": "f", "id": 103408, "pid": 76337, "tid": -914061504, "ts": 1716454222966209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222966210, "dur": 0, "args": { "External id": 103409, "cbid": 205, "correlation": 103409 } }, { "ph": "f", "id": 103409, "pid": 76337, "tid": -914061504, "ts": 1716454222966210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020075, "dur": 11, "args": { "External id": 103413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103413, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103413, "pid": 5, "tid": 7, "ts": 1716454223020075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966224, "dur": 12, "args": { "External id": 103413, "cbid": 211, "correlation": 103413 } }, { "ph": "s", "id": 103413, "pid": 76337, "tid": -914061504, "ts": 1716454222966224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020086, "dur": 159, "args": { "External id": 103415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103415, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103415, "pid": 5, "tid": 7, "ts": 1716454223020086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966239, "dur": 5, "args": { "External id": 103415, "cbid": 211, "correlation": 103415 } }, { "ph": "s", "id": 103415, "pid": 76337, "tid": -914061504, "ts": 1716454222966239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223020248, "dur": 1, "args": { "External id": 103417, "device": 5, "context": 1, "stream": 7, "correlation": 103417, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 103417, "pid": 5, "tid": 7, "ts": 1716454223020248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222966250, "dur": 7, "args": { "External id": 103417, "cbid": 51, "correlation": 103417 } }, { "ph": "s", "id": 103417, "pid": 76337, "tid": -914061504, "ts": 1716454222966250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223020252, "dur": 196, "args": { "External id": 103418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103418, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 103418, "pid": 5, "tid": 7, "ts": 1716454223020252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966259, "dur": 8, "args": { "External id": 103418, "cbid": 211, "correlation": 103418 } }, { "ph": "s", "id": 103418, "pid": 76337, "tid": -914061504, "ts": 1716454222966259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020449, "dur": 6, "args": { "External id": 103420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103420, "pid": 5, "tid": 7, "ts": 1716454223020449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966270, "dur": 5, "args": { "External id": 103420, "cbid": 211, "correlation": 103420 } }, { "ph": "s", "id": 103420, "pid": 76337, "tid": -914061504, "ts": 1716454222966270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020456, "dur": 6, "args": { "External id": 103426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103426, "pid": 5, "tid": 7, "ts": 1716454223020456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966298, "dur": 9, "args": { "External id": 103426, "cbid": 211, "correlation": 103426 } }, { "ph": "s", "id": 103426, "pid": 76337, "tid": -914061504, "ts": 1716454222966298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223020464, "dur": 10, "args": { "External id": 103446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103446, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103446, "pid": 5, "tid": 7, "ts": 1716454223020464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966391, "dur": 12, "args": { "External id": 103446, "cbid": 211, "correlation": 103446 } }, { "ph": "s", "id": 103446, "pid": 76337, "tid": -914061504, "ts": 1716454222966391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223020476, "dur": 4, "args": { "External id": 103458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103458, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103458, "pid": 5, "tid": 7, "ts": 1716454223020476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966415, "dur": 6, "args": { "External id": 103458, "cbid": 211, "correlation": 103458 } }, { "ph": "s", "id": 103458, "pid": 76337, "tid": -914061504, "ts": 1716454222966415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020481, "dur": 8, "args": { "External id": 103461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103461, "pid": 5, "tid": 7, "ts": 1716454223020481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966434, "dur": 6, "args": { "External id": 103461, "cbid": 211, "correlation": 103461 } }, { "ph": "s", "id": 103461, "pid": 76337, "tid": -914061504, "ts": 1716454222966434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223020491, "dur": 5, "args": { "External id": 103470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103470, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103470, "pid": 5, "tid": 7, "ts": 1716454223020491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966475, "dur": 9, "args": { "External id": 103470, "cbid": 211, "correlation": 103470 } }, { "ph": "s", "id": 103470, "pid": 76337, "tid": -914061504, "ts": 1716454222966475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222966526, "dur": 0, "args": { "External id": 103480, "cbid": 317, "correlation": 103480 } }, { "ph": "f", "id": 103480, "pid": 76337, "tid": -914061504, "ts": 1716454222966526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222966527, "dur": 0, "args": { "External id": 103481, "cbid": 203, "correlation": 103481 } }, { "ph": "f", "id": 103481, "pid": 76337, "tid": -914061504, "ts": 1716454222966527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222966528, "dur": 0, "args": { "External id": 103482, "cbid": 205, "correlation": 103482 } }, { "ph": "f", "id": 103482, "pid": 76337, "tid": -914061504, "ts": 1716454222966528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020497, "dur": 5, "args": { "External id": 103486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103486, "pid": 5, "tid": 7, "ts": 1716454223020497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966543, "dur": 12, "args": { "External id": 103486, "cbid": 211, "correlation": 103486 } }, { "ph": "s", "id": 103486, "pid": 76337, "tid": -914061504, "ts": 1716454222966543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020503, "dur": 159, "args": { "External id": 103488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103488, "pid": 5, "tid": 7, "ts": 1716454223020503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966557, "dur": 6, "args": { "External id": 103488, "cbid": 211, "correlation": 103488 } }, { "ph": "s", "id": 103488, "pid": 76337, "tid": -914061504, "ts": 1716454222966557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223020665, "dur": 1, "args": { "External id": 103490, "device": 5, "context": 1, "stream": 7, "correlation": 103490, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 103490, "pid": 5, "tid": 7, "ts": 1716454223020665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222966568, "dur": 6, "args": { "External id": 103490, "cbid": 51, "correlation": 103490 } }, { "ph": "s", "id": 103490, "pid": 76337, "tid": -914061504, "ts": 1716454222966568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223020668, "dur": 264, "args": { "External id": 103491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103491, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103491, "pid": 5, "tid": 7, "ts": 1716454223020668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966576, "dur": 6, "args": { "External id": 103491, "cbid": 211, "correlation": 103491 } }, { "ph": "s", "id": 103491, "pid": 76337, "tid": -914061504, "ts": 1716454222966576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223020934, "dur": 6, "args": { "External id": 103493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103493, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103493, "pid": 5, "tid": 7, "ts": 1716454223020934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966587, "dur": 5, "args": { "External id": 103493, "cbid": 211, "correlation": 103493 } }, { "ph": "s", "id": 103493, "pid": 76337, "tid": -914061504, "ts": 1716454222966587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020941, "dur": 6, "args": { "External id": 103499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103499, "pid": 5, "tid": 7, "ts": 1716454223020941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966615, "dur": 9, "args": { "External id": 103499, "cbid": 211, "correlation": 103499 } }, { "ph": "s", "id": 103499, "pid": 76337, "tid": -914061504, "ts": 1716454222966615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223020948, "dur": 3, "args": { "External id": 103507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 103507, "pid": 5, "tid": 7, "ts": 1716454223020948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966660, "dur": 9, "args": { "External id": 103507, "cbid": 211, "correlation": 103507 } }, { "ph": "s", "id": 103507, "pid": 76337, "tid": -914061504, "ts": 1716454222966660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222966727, "dur": 1, "args": { "External id": 103523, "cbid": 251, "correlation": 103523 } }, { "ph": "f", "id": 103523, "pid": 76337, "tid": -914061504, "ts": 1716454222966727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222966732, "dur": 0, "args": { "External id": 103525, "cbid": 251, "correlation": 103525 } }, { "ph": "f", "id": 103525, "pid": 76337, "tid": -914061504, "ts": 1716454222966732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223020953, "dur": 13, "args": { "External id": 103526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103526, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103526, "pid": 5, "tid": 7, "ts": 1716454223020953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966734, "dur": 11, "args": { "External id": 103526, "cbid": 211, "correlation": 103526 } }, { "ph": "s", "id": 103526, "pid": 76337, "tid": -914061504, "ts": 1716454222966734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223020967, "dur": 5, "args": { "External id": 103528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103528, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103528, "pid": 5, "tid": 7, "ts": 1716454223020967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966747, "dur": 6, "args": { "External id": 103528, "cbid": 211, "correlation": 103528 } }, { "ph": "s", "id": 103528, "pid": 76337, "tid": -914061504, "ts": 1716454222966747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020973, "dur": 5, "args": { "External id": 103538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103538, "pid": 5, "tid": 7, "ts": 1716454223020973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966804, "dur": 13, "args": { "External id": 103538, "cbid": 211, "correlation": 103538 } }, { "ph": "s", "id": 103538, "pid": 76337, "tid": -914061504, "ts": 1716454222966804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223020980, "dur": 10, "args": { "External id": 103558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103558, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103558, "pid": 5, "tid": 7, "ts": 1716454223020980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966873, "dur": 11, "args": { "External id": 103558, "cbid": 211, "correlation": 103558 } }, { "ph": "s", "id": 103558, "pid": 76337, "tid": -914061504, "ts": 1716454222966873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223020991, "dur": 4, "args": { "External id": 103570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103570, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103570, "pid": 5, "tid": 7, "ts": 1716454223020991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966893, "dur": 6, "args": { "External id": 103570, "cbid": 211, "correlation": 103570 } }, { "ph": "s", "id": 103570, "pid": 76337, "tid": -914061504, "ts": 1716454222966893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223020996, "dur": 7, "args": { "External id": 103573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103573, "pid": 5, "tid": 7, "ts": 1716454223020996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966913, "dur": 6, "args": { "External id": 103573, "cbid": 211, "correlation": 103573 } }, { "ph": "s", "id": 103573, "pid": 76337, "tid": -914061504, "ts": 1716454222966913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021004, "dur": 4, "args": { "External id": 103582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103582, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103582, "pid": 5, "tid": 7, "ts": 1716454223021004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222966953, "dur": 10, "args": { "External id": 103582, "cbid": 211, "correlation": 103582 } }, { "ph": "s", "id": 103582, "pid": 76337, "tid": -914061504, "ts": 1716454222966953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222967028, "dur": 0, "args": { "External id": 103592, "cbid": 317, "correlation": 103592 } }, { "ph": "f", "id": 103592, "pid": 76337, "tid": -914061504, "ts": 1716454222967028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222967029, "dur": 0, "args": { "External id": 103593, "cbid": 203, "correlation": 103593 } }, { "ph": "f", "id": 103593, "pid": 76337, "tid": -914061504, "ts": 1716454222967029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222967029, "dur": 0, "args": { "External id": 103594, "cbid": 205, "correlation": 103594 } }, { "ph": "f", "id": 103594, "pid": 76337, "tid": -914061504, "ts": 1716454222967029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021010, "dur": 5, "args": { "External id": 103598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103598, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103598, "pid": 5, "tid": 7, "ts": 1716454223021010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967043, "dur": 14, "args": { "External id": 103598, "cbid": 211, "correlation": 103598 } }, { "ph": "s", "id": 103598, "pid": 76337, "tid": -914061504, "ts": 1716454222967043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021016, "dur": 160, "args": { "External id": 103600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103600, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103600, "pid": 5, "tid": 7, "ts": 1716454223021016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967060, "dur": 5, "args": { "External id": 103600, "cbid": 211, "correlation": 103600 } }, { "ph": "s", "id": 103600, "pid": 76337, "tid": -914061504, "ts": 1716454222967060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223021178, "dur": 1, "args": { "External id": 103602, "device": 5, "context": 1, "stream": 7, "correlation": 103602, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 103602, "pid": 5, "tid": 7, "ts": 1716454223021178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222967071, "dur": 6, "args": { "External id": 103602, "cbid": 51, "correlation": 103602 } }, { "ph": "s", "id": 103602, "pid": 76337, "tid": -914061504, "ts": 1716454222967071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223021182, "dur": 255, "args": { "External id": 103603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103603, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103603, "pid": 5, "tid": 7, "ts": 1716454223021182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967078, "dur": 6, "args": { "External id": 103603, "cbid": 211, "correlation": 103603 } }, { "ph": "s", "id": 103603, "pid": 76337, "tid": -914061504, "ts": 1716454222967078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021438, "dur": 6, "args": { "External id": 103605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103605, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103605, "pid": 5, "tid": 7, "ts": 1716454223021438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967087, "dur": 5, "args": { "External id": 103605, "cbid": 211, "correlation": 103605 } }, { "ph": "s", "id": 103605, "pid": 76337, "tid": -914061504, "ts": 1716454222967087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223021445, "dur": 6, "args": { "External id": 103611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103611, "pid": 5, "tid": 7, "ts": 1716454223021445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967116, "dur": 8, "args": { "External id": 103611, "cbid": 211, "correlation": 103611 } }, { "ph": "s", "id": 103611, "pid": 76337, "tid": -914061504, "ts": 1716454222967116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021452, "dur": 5, "args": { "External id": 103619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103619, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103619, "pid": 5, "tid": 7, "ts": 1716454223021452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967148, "dur": 9, "args": { "External id": 103619, "cbid": 211, "correlation": 103619 } }, { "ph": "s", "id": 103619, "pid": 76337, "tid": -914061504, "ts": 1716454222967148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021458, "dur": 4, "args": { "External id": 103627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103627, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103627, "pid": 5, "tid": 7, "ts": 1716454223021458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967178, "dur": 8, "args": { "External id": 103627, "cbid": 211, "correlation": 103627 } }, { "ph": "s", "id": 103627, "pid": 76337, "tid": -914061504, "ts": 1716454222967178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223021464, "dur": 9, "args": { "External id": 103647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103647, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103647, "pid": 5, "tid": 7, "ts": 1716454223021464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967253, "dur": 13, "args": { "External id": 103647, "cbid": 211, "correlation": 103647 } }, { "ph": "s", "id": 103647, "pid": 76337, "tid": -914061504, "ts": 1716454222967253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223021474, "dur": 4, "args": { "External id": 103659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103659, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103659, "pid": 5, "tid": 7, "ts": 1716454223021474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967275, "dur": 6, "args": { "External id": 103659, "cbid": 211, "correlation": 103659 } }, { "ph": "s", "id": 103659, "pid": 76337, "tid": -914061504, "ts": 1716454222967275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223021479, "dur": 6, "args": { "External id": 103662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103662, "pid": 5, "tid": 7, "ts": 1716454223021479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967293, "dur": 6, "args": { "External id": 103662, "cbid": 211, "correlation": 103662 } }, { "ph": "s", "id": 103662, "pid": 76337, "tid": -914061504, "ts": 1716454222967293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021487, "dur": 4, "args": { "External id": 103671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103671, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103671, "pid": 5, "tid": 7, "ts": 1716454223021487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967331, "dur": 10, "args": { "External id": 103671, "cbid": 211, "correlation": 103671 } }, { "ph": "s", "id": 103671, "pid": 76337, "tid": -914061504, "ts": 1716454222967331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222967383, "dur": 0, "args": { "External id": 103681, "cbid": 317, "correlation": 103681 } }, { "ph": "f", "id": 103681, "pid": 76337, "tid": -914061504, "ts": 1716454222967383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222967384, "dur": 0, "args": { "External id": 103682, "cbid": 203, "correlation": 103682 } }, { "ph": "f", "id": 103682, "pid": 76337, "tid": -914061504, "ts": 1716454222967384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222967384, "dur": 0, "args": { "External id": 103683, "cbid": 205, "correlation": 103683 } }, { "ph": "f", "id": 103683, "pid": 76337, "tid": -914061504, "ts": 1716454222967384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021493, "dur": 5, "args": { "External id": 103687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103687, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103687, "pid": 5, "tid": 7, "ts": 1716454223021493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967397, "dur": 13, "args": { "External id": 103687, "cbid": 211, "correlation": 103687 } }, { "ph": "s", "id": 103687, "pid": 76337, "tid": -914061504, "ts": 1716454222967397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021499, "dur": 160, "args": { "External id": 103689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103689, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103689, "pid": 5, "tid": 7, "ts": 1716454223021499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967413, "dur": 5, "args": { "External id": 103689, "cbid": 211, "correlation": 103689 } }, { "ph": "s", "id": 103689, "pid": 76337, "tid": -914061504, "ts": 1716454222967413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223021661, "dur": 1, "args": { "External id": 103691, "device": 5, "context": 1, "stream": 7, "correlation": 103691, "bytes": 240, "memory bandwidth (GB/s)": 0.13392857142857142 } }, { "ph": "f", "id": 103691, "pid": 5, "tid": 7, "ts": 1716454223021661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222967423, "dur": 6, "args": { "External id": 103691, "cbid": 51, "correlation": 103691 } }, { "ph": "s", "id": 103691, "pid": 76337, "tid": -914061504, "ts": 1716454222967423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223021665, "dur": 254, "args": { "External id": 103692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103692, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103692, "pid": 5, "tid": 7, "ts": 1716454223021665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967430, "dur": 6, "args": { "External id": 103692, "cbid": 211, "correlation": 103692 } }, { "ph": "s", "id": 103692, "pid": 76337, "tid": -914061504, "ts": 1716454222967430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021920, "dur": 6, "args": { "External id": 103694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103694, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103694, "pid": 5, "tid": 7, "ts": 1716454223021920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967439, "dur": 5, "args": { "External id": 103694, "cbid": 211, "correlation": 103694 } }, { "ph": "s", "id": 103694, "pid": 76337, "tid": -914061504, "ts": 1716454222967439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223021927, "dur": 6, "args": { "External id": 103700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103700, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103700, "pid": 5, "tid": 7, "ts": 1716454223021927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967468, "dur": 8, "args": { "External id": 103700, "cbid": 211, "correlation": 103700 } }, { "ph": "s", "id": 103700, "pid": 76337, "tid": -914061504, "ts": 1716454222967468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021934, "dur": 3, "args": { "External id": 103708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103708, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 103708, "pid": 5, "tid": 7, "ts": 1716454223021934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967512, "dur": 9, "args": { "External id": 103708, "cbid": 211, "correlation": 103708 } }, { "ph": "s", "id": 103708, "pid": 76337, "tid": -914061504, "ts": 1716454222967512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222967575, "dur": 1, "args": { "External id": 103724, "cbid": 251, "correlation": 103724 } }, { "ph": "f", "id": 103724, "pid": 76337, "tid": -914061504, "ts": 1716454222967575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222967580, "dur": 0, "args": { "External id": 103726, "cbid": 251, "correlation": 103726 } }, { "ph": "f", "id": 103726, "pid": 76337, "tid": -914061504, "ts": 1716454222967580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223021939, "dur": 10, "args": { "External id": 103727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103727, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103727, "pid": 5, "tid": 7, "ts": 1716454223021939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967581, "dur": 11, "args": { "External id": 103727, "cbid": 211, "correlation": 103727 } }, { "ph": "s", "id": 103727, "pid": 76337, "tid": -914061504, "ts": 1716454222967581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223021950, "dur": 4, "args": { "External id": 103729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103729, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103729, "pid": 5, "tid": 7, "ts": 1716454223021950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967594, "dur": 6, "args": { "External id": 103729, "cbid": 211, "correlation": 103729 } }, { "ph": "s", "id": 103729, "pid": 76337, "tid": -914061504, "ts": 1716454222967594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223021955, "dur": 6, "args": { "External id": 103739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103739, "pid": 5, "tid": 7, "ts": 1716454223021955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967650, "dur": 13, "args": { "External id": 103739, "cbid": 211, "correlation": 103739 } }, { "ph": "s", "id": 103739, "pid": 76337, "tid": -914061504, "ts": 1716454222967650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223021962, "dur": 9, "args": { "External id": 103759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103759, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103759, "pid": 5, "tid": 7, "ts": 1716454223021962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967718, "dur": 10, "args": { "External id": 103759, "cbid": 211, "correlation": 103759 } }, { "ph": "s", "id": 103759, "pid": 76337, "tid": -914061504, "ts": 1716454222967718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223021973, "dur": 3, "args": { "External id": 103771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103771, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103771, "pid": 5, "tid": 7, "ts": 1716454223021973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967738, "dur": 6, "args": { "External id": 103771, "cbid": 211, "correlation": 103771 } }, { "ph": "s", "id": 103771, "pid": 76337, "tid": -914061504, "ts": 1716454222967738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223021977, "dur": 7, "args": { "External id": 103774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103774, "pid": 5, "tid": 7, "ts": 1716454223021977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967757, "dur": 7, "args": { "External id": 103774, "cbid": 211, "correlation": 103774 } }, { "ph": "s", "id": 103774, "pid": 76337, "tid": -914061504, "ts": 1716454222967757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223021985, "dur": 4, "args": { "External id": 103783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103783, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103783, "pid": 5, "tid": 7, "ts": 1716454223021985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967798, "dur": 10, "args": { "External id": 103783, "cbid": 211, "correlation": 103783 } }, { "ph": "s", "id": 103783, "pid": 76337, "tid": -914061504, "ts": 1716454222967798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222967861, "dur": 0, "args": { "External id": 103793, "cbid": 317, "correlation": 103793 } }, { "ph": "f", "id": 103793, "pid": 76337, "tid": -914061504, "ts": 1716454222967861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222967862, "dur": 0, "args": { "External id": 103794, "cbid": 203, "correlation": 103794 } }, { "ph": "f", "id": 103794, "pid": 76337, "tid": -914061504, "ts": 1716454222967862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222967862, "dur": 0, "args": { "External id": 103795, "cbid": 205, "correlation": 103795 } }, { "ph": "f", "id": 103795, "pid": 76337, "tid": -914061504, "ts": 1716454222967862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021991, "dur": 5, "args": { "External id": 103799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103799, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103799, "pid": 5, "tid": 7, "ts": 1716454223021991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967876, "dur": 12, "args": { "External id": 103799, "cbid": 211, "correlation": 103799 } }, { "ph": "s", "id": 103799, "pid": 76337, "tid": -914061504, "ts": 1716454222967876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223021998, "dur": 158, "args": { "External id": 103801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103801, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103801, "pid": 5, "tid": 7, "ts": 1716454223021998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967890, "dur": 5, "args": { "External id": 103801, "cbid": 211, "correlation": 103801 } }, { "ph": "s", "id": 103801, "pid": 76337, "tid": -914061504, "ts": 1716454222967890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223022158, "dur": 1, "args": { "External id": 103803, "device": 5, "context": 1, "stream": 7, "correlation": 103803, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 103803, "pid": 5, "tid": 7, "ts": 1716454223022158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222967901, "dur": 6, "args": { "External id": 103803, "cbid": 51, "correlation": 103803 } }, { "ph": "s", "id": 103803, "pid": 76337, "tid": -914061504, "ts": 1716454222967901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223022162, "dur": 254, "args": { "External id": 103804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103804, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103804, "pid": 5, "tid": 7, "ts": 1716454223022162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967909, "dur": 7, "args": { "External id": 103804, "cbid": 211, "correlation": 103804 } }, { "ph": "s", "id": 103804, "pid": 76337, "tid": -914061504, "ts": 1716454222967909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022417, "dur": 6, "args": { "External id": 103806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103806, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103806, "pid": 5, "tid": 7, "ts": 1716454223022417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967919, "dur": 5, "args": { "External id": 103806, "cbid": 211, "correlation": 103806 } }, { "ph": "s", "id": 103806, "pid": 76337, "tid": -914061504, "ts": 1716454222967919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223022424, "dur": 6, "args": { "External id": 103812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103812, "pid": 5, "tid": 7, "ts": 1716454223022424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967947, "dur": 8, "args": { "External id": 103812, "cbid": 211, "correlation": 103812 } }, { "ph": "s", "id": 103812, "pid": 76337, "tid": -914061504, "ts": 1716454222967947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223022432, "dur": 5, "args": { "External id": 103820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103820, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103820, "pid": 5, "tid": 7, "ts": 1716454223022432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222967988, "dur": 9, "args": { "External id": 103820, "cbid": 211, "correlation": 103820 } }, { "ph": "s", "id": 103820, "pid": 76337, "tid": -914061504, "ts": 1716454222967988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223022438, "dur": 4, "args": { "External id": 103828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103828, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103828, "pid": 5, "tid": 7, "ts": 1716454223022438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968019, "dur": 8, "args": { "External id": 103828, "cbid": 211, "correlation": 103828 } }, { "ph": "s", "id": 103828, "pid": 76337, "tid": -914061504, "ts": 1716454222968019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223022444, "dur": 9, "args": { "External id": 103848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103848, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103848, "pid": 5, "tid": 7, "ts": 1716454223022444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968124, "dur": 12, "args": { "External id": 103848, "cbid": 211, "correlation": 103848 } }, { "ph": "s", "id": 103848, "pid": 76337, "tid": -914061504, "ts": 1716454222968124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223022454, "dur": 4, "args": { "External id": 103860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103860, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103860, "pid": 5, "tid": 7, "ts": 1716454223022454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968146, "dur": 6, "args": { "External id": 103860, "cbid": 211, "correlation": 103860 } }, { "ph": "s", "id": 103860, "pid": 76337, "tid": -914061504, "ts": 1716454222968146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223022459, "dur": 6, "args": { "External id": 103863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103863, "pid": 5, "tid": 7, "ts": 1716454223022459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968164, "dur": 7, "args": { "External id": 103863, "cbid": 211, "correlation": 103863 } }, { "ph": "s", "id": 103863, "pid": 76337, "tid": -914061504, "ts": 1716454222968164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223022467, "dur": 4, "args": { "External id": 103872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103872, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103872, "pid": 5, "tid": 7, "ts": 1716454223022467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968203, "dur": 10, "args": { "External id": 103872, "cbid": 211, "correlation": 103872 } }, { "ph": "s", "id": 103872, "pid": 76337, "tid": -914061504, "ts": 1716454222968203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222968255, "dur": 0, "args": { "External id": 103882, "cbid": 317, "correlation": 103882 } }, { "ph": "f", "id": 103882, "pid": 76337, "tid": -914061504, "ts": 1716454222968255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222968256, "dur": 0, "args": { "External id": 103883, "cbid": 203, "correlation": 103883 } }, { "ph": "f", "id": 103883, "pid": 76337, "tid": -914061504, "ts": 1716454222968256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222968257, "dur": 0, "args": { "External id": 103884, "cbid": 205, "correlation": 103884 } }, { "ph": "f", "id": 103884, "pid": 76337, "tid": -914061504, "ts": 1716454222968257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022473, "dur": 5, "args": { "External id": 103888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103888, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103888, "pid": 5, "tid": 7, "ts": 1716454223022473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968271, "dur": 12, "args": { "External id": 103888, "cbid": 211, "correlation": 103888 } }, { "ph": "s", "id": 103888, "pid": 76337, "tid": -914061504, "ts": 1716454222968271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022479, "dur": 160, "args": { "External id": 103890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103890, "pid": 5, "tid": 7, "ts": 1716454223022479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968285, "dur": 5, "args": { "External id": 103890, "cbid": 211, "correlation": 103890 } }, { "ph": "s", "id": 103890, "pid": 76337, "tid": -914061504, "ts": 1716454222968285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223022641, "dur": 1, "args": { "External id": 103892, "device": 5, "context": 1, "stream": 7, "correlation": 103892, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 103892, "pid": 5, "tid": 7, "ts": 1716454223022641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222968295, "dur": 7, "args": { "External id": 103892, "cbid": 51, "correlation": 103892 } }, { "ph": "s", "id": 103892, "pid": 76337, "tid": -914061504, "ts": 1716454222968295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223022645, "dur": 253, "args": { "External id": 103893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103893, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103893, "pid": 5, "tid": 7, "ts": 1716454223022645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968303, "dur": 7, "args": { "External id": 103893, "cbid": 211, "correlation": 103893 } }, { "ph": "s", "id": 103893, "pid": 76337, "tid": -914061504, "ts": 1716454222968303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022900, "dur": 6, "args": { "External id": 103895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103895, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 103895, "pid": 5, "tid": 7, "ts": 1716454223022900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968313, "dur": 5, "args": { "External id": 103895, "cbid": 211, "correlation": 103895 } }, { "ph": "s", "id": 103895, "pid": 76337, "tid": -914061504, "ts": 1716454222968313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223022907, "dur": 6, "args": { "External id": 103901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103901, "pid": 5, "tid": 7, "ts": 1716454223022907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968341, "dur": 8, "args": { "External id": 103901, "cbid": 211, "correlation": 103901 } }, { "ph": "s", "id": 103901, "pid": 76337, "tid": -914061504, "ts": 1716454222968341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223022914, "dur": 3, "args": { "External id": 103909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103909, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 103909, "pid": 5, "tid": 7, "ts": 1716454223022914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968385, "dur": 9, "args": { "External id": 103909, "cbid": 211, "correlation": 103909 } }, { "ph": "s", "id": 103909, "pid": 76337, "tid": -914061504, "ts": 1716454222968385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222968446, "dur": 1, "args": { "External id": 103925, "cbid": 251, "correlation": 103925 } }, { "ph": "f", "id": 103925, "pid": 76337, "tid": -914061504, "ts": 1716454222968446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222968452, "dur": 0, "args": { "External id": 103927, "cbid": 251, "correlation": 103927 } }, { "ph": "f", "id": 103927, "pid": 76337, "tid": -914061504, "ts": 1716454222968452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223022918, "dur": 11, "args": { "External id": 103928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103928, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103928, "pid": 5, "tid": 7, "ts": 1716454223022918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968454, "dur": 11, "args": { "External id": 103928, "cbid": 211, "correlation": 103928 } }, { "ph": "s", "id": 103928, "pid": 76337, "tid": -914061504, "ts": 1716454222968454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223022931, "dur": 4, "args": { "External id": 103930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103930, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 103930, "pid": 5, "tid": 7, "ts": 1716454223022931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968467, "dur": 6, "args": { "External id": 103930, "cbid": 211, "correlation": 103930 } }, { "ph": "s", "id": 103930, "pid": 76337, "tid": -914061504, "ts": 1716454222968467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223022936, "dur": 5, "args": { "External id": 103940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103940, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103940, "pid": 5, "tid": 7, "ts": 1716454223022936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968524, "dur": 12, "args": { "External id": 103940, "cbid": 211, "correlation": 103940 } }, { "ph": "s", "id": 103940, "pid": 76337, "tid": -914061504, "ts": 1716454222968524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223022943, "dur": 10, "args": { "External id": 103960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103960, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 103960, "pid": 5, "tid": 7, "ts": 1716454223022943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968590, "dur": 11, "args": { "External id": 103960, "cbid": 211, "correlation": 103960 } }, { "ph": "s", "id": 103960, "pid": 76337, "tid": -914061504, "ts": 1716454222968590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223022954, "dur": 4, "args": { "External id": 103972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103972, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 103972, "pid": 5, "tid": 7, "ts": 1716454223022954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968612, "dur": 6, "args": { "External id": 103972, "cbid": 211, "correlation": 103972 } }, { "ph": "s", "id": 103972, "pid": 76337, "tid": -914061504, "ts": 1716454222968612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223022959, "dur": 7, "args": { "External id": 103975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103975, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103975, "pid": 5, "tid": 7, "ts": 1716454223022959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968631, "dur": 6, "args": { "External id": 103975, "cbid": 211, "correlation": 103975 } }, { "ph": "s", "id": 103975, "pid": 76337, "tid": -914061504, "ts": 1716454222968631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223022967, "dur": 4, "args": { "External id": 103984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 103984, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 103984, "pid": 5, "tid": 7, "ts": 1716454223022967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968671, "dur": 10, "args": { "External id": 103984, "cbid": 211, "correlation": 103984 } }, { "ph": "s", "id": 103984, "pid": 76337, "tid": -914061504, "ts": 1716454222968671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222968734, "dur": 0, "args": { "External id": 103994, "cbid": 317, "correlation": 103994 } }, { "ph": "f", "id": 103994, "pid": 76337, "tid": -914061504, "ts": 1716454222968734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222968735, "dur": 0, "args": { "External id": 103995, "cbid": 203, "correlation": 103995 } }, { "ph": "f", "id": 103995, "pid": 76337, "tid": -914061504, "ts": 1716454222968735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222968735, "dur": 0, "args": { "External id": 103996, "cbid": 205, "correlation": 103996 } }, { "ph": "f", "id": 103996, "pid": 76337, "tid": -914061504, "ts": 1716454222968735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022972, "dur": 5, "args": { "External id": 104000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104000, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104000, "pid": 5, "tid": 7, "ts": 1716454223022972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968749, "dur": 13, "args": { "External id": 104000, "cbid": 211, "correlation": 104000 } }, { "ph": "s", "id": 104000, "pid": 76337, "tid": -914061504, "ts": 1716454222968749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223022979, "dur": 159, "args": { "External id": 104002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104002, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104002, "pid": 5, "tid": 7, "ts": 1716454223022979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968764, "dur": 5, "args": { "External id": 104002, "cbid": 211, "correlation": 104002 } }, { "ph": "s", "id": 104002, "pid": 76337, "tid": -914061504, "ts": 1716454222968764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223023140, "dur": 1, "args": { "External id": 104004, "device": 5, "context": 1, "stream": 7, "correlation": 104004, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 104004, "pid": 5, "tid": 7, "ts": 1716454223023140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222968775, "dur": 6, "args": { "External id": 104004, "cbid": 51, "correlation": 104004 } }, { "ph": "s", "id": 104004, "pid": 76337, "tid": -914061504, "ts": 1716454222968775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223023143, "dur": 254, "args": { "External id": 104005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104005, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104005, "pid": 5, "tid": 7, "ts": 1716454223023143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968782, "dur": 6, "args": { "External id": 104005, "cbid": 211, "correlation": 104005 } }, { "ph": "s", "id": 104005, "pid": 76337, "tid": -914061504, "ts": 1716454222968782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223023398, "dur": 5, "args": { "External id": 104007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104007, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104007, "pid": 5, "tid": 7, "ts": 1716454223023398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968792, "dur": 5, "args": { "External id": 104007, "cbid": 211, "correlation": 104007 } }, { "ph": "s", "id": 104007, "pid": 76337, "tid": -914061504, "ts": 1716454222968792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223023405, "dur": 6, "args": { "External id": 104013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104013, "pid": 5, "tid": 7, "ts": 1716454223023405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968820, "dur": 8, "args": { "External id": 104013, "cbid": 211, "correlation": 104013 } }, { "ph": "s", "id": 104013, "pid": 76337, "tid": -914061504, "ts": 1716454222968820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223023413, "dur": 5, "args": { "External id": 104021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104021, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104021, "pid": 5, "tid": 7, "ts": 1716454223023413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968852, "dur": 9, "args": { "External id": 104021, "cbid": 211, "correlation": 104021 } }, { "ph": "s", "id": 104021, "pid": 76337, "tid": -914061504, "ts": 1716454222968852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223023419, "dur": 4, "args": { "External id": 104029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104029, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104029, "pid": 5, "tid": 7, "ts": 1716454223023419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222968882, "dur": 8, "args": { "External id": 104029, "cbid": 211, "correlation": 104029 } }, { "ph": "s", "id": 104029, "pid": 76337, "tid": -914061504, "ts": 1716454222968882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223023424, "dur": 9, "args": { "External id": 104049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104049, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104049, "pid": 5, "tid": 7, "ts": 1716454223023424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969033, "dur": 14, "args": { "External id": 104049, "cbid": 211, "correlation": 104049 } }, { "ph": "s", "id": 104049, "pid": 76337, "tid": -914061504, "ts": 1716454222969033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223023435, "dur": 4, "args": { "External id": 104061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104061, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 104061, "pid": 5, "tid": 7, "ts": 1716454223023435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969057, "dur": 7, "args": { "External id": 104061, "cbid": 211, "correlation": 104061 } }, { "ph": "s", "id": 104061, "pid": 76337, "tid": -914061504, "ts": 1716454222969057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223023440, "dur": 6, "args": { "External id": 104064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104064, "pid": 5, "tid": 7, "ts": 1716454223023440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969076, "dur": 7, "args": { "External id": 104064, "cbid": 211, "correlation": 104064 } }, { "ph": "s", "id": 104064, "pid": 76337, "tid": -914061504, "ts": 1716454222969076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222969135, "dur": 0, "args": { "External id": 104075, "cbid": 317, "correlation": 104075 } }, { "ph": "f", "id": 104075, "pid": 76337, "tid": -914061504, "ts": 1716454222969135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222969135, "dur": 0, "args": { "External id": 104076, "cbid": 203, "correlation": 104076 } }, { "ph": "f", "id": 104076, "pid": 76337, "tid": -914061504, "ts": 1716454222969135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222969136, "dur": 0, "args": { "External id": 104077, "cbid": 205, "correlation": 104077 } }, { "ph": "f", "id": 104077, "pid": 76337, "tid": -914061504, "ts": 1716454222969136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223023447, "dur": 5, "args": { "External id": 104081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104081, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104081, "pid": 5, "tid": 7, "ts": 1716454223023447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969153, "dur": 12, "args": { "External id": 104081, "cbid": 211, "correlation": 104081 } }, { "ph": "s", "id": 104081, "pid": 76337, "tid": -914061504, "ts": 1716454222969153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223023453, "dur": 36, "args": { "External id": 104083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104083, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 104083, "pid": 5, "tid": 7, "ts": 1716454223023453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969172, "dur": 9, "args": { "External id": 104083, "cbid": 211, "correlation": 104083 } }, { "ph": "s", "id": 104083, "pid": 76337, "tid": -914061504, "ts": 1716454222969172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223023491, "dur": 5, "args": { "External id": 104085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104085, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104085, "pid": 5, "tid": 7, "ts": 1716454223023491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969185, "dur": 5, "args": { "External id": 104085, "cbid": 211, "correlation": 104085 } }, { "ph": "s", "id": 104085, "pid": 76337, "tid": -914061504, "ts": 1716454222969185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223023497, "dur": 6, "args": { "External id": 104091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104091, "pid": 5, "tid": 7, "ts": 1716454223023497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969213, "dur": 8, "args": { "External id": 104091, "cbid": 211, "correlation": 104091 } }, { "ph": "s", "id": 104091, "pid": 76337, "tid": -914061504, "ts": 1716454222969213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223023504, "dur": 20, "args": { "External id": 104100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104100, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104100, "pid": 5, "tid": 7, "ts": 1716454223023504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969294, "dur": 14, "args": { "External id": 104100, "cbid": 211, "correlation": 104100 } }, { "ph": "s", "id": 104100, "pid": 76337, "tid": -914061504, "ts": 1716454222969294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223023525, "dur": 10, "args": { "External id": 104122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104122, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 104122, "pid": 5, "tid": 7, "ts": 1716454223023525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969351, "dur": 11, "args": { "External id": 104122, "cbid": 211, "correlation": 104122 } }, { "ph": "s", "id": 104122, "pid": 76337, "tid": -914061504, "ts": 1716454222969351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969441, "dur": 2, "args": { "External id": 104133, "cbid": 251, "correlation": 104133 } }, { "ph": "f", "id": 104133, "pid": 76337, "tid": -914061504, "ts": 1716454222969441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969445, "dur": 0, "args": { "External id": 104134, "cbid": 251, "correlation": 104134 } }, { "ph": "f", "id": 104134, "pid": 76337, "tid": -914061504, "ts": 1716454222969445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223023537, "dur": 53, "args": { "External id": 104135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104135, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 104135, "pid": 5, "tid": 7, "ts": 1716454223023537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969448, "dur": 14, "args": { "External id": 104135, "cbid": 211, "correlation": 104135 } }, { "ph": "s", "id": 104135, "pid": 76337, "tid": -914061504, "ts": 1716454222969448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969518, "dur": 1, "args": { "External id": 104146, "cbid": 251, "correlation": 104146 } }, { "ph": "f", "id": 104146, "pid": 76337, "tid": -914061504, "ts": 1716454222969518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969522, "dur": 0, "args": { "External id": 104147, "cbid": 251, "correlation": 104147 } }, { "ph": "f", "id": 104147, "pid": 76337, "tid": -914061504, "ts": 1716454222969522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223023591, "dur": 53, "args": { "External id": 104148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104148, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 104148, "pid": 5, "tid": 7, "ts": 1716454223023591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969524, "dur": 11, "args": { "External id": 104148, "cbid": 211, "correlation": 104148 } }, { "ph": "s", "id": 104148, "pid": 76337, "tid": -914061504, "ts": 1716454222969524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969589, "dur": 1, "args": { "External id": 104159, "cbid": 251, "correlation": 104159 } }, { "ph": "f", "id": 104159, "pid": 76337, "tid": -914061504, "ts": 1716454222969589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969592, "dur": 0, "args": { "External id": 104160, "cbid": 251, "correlation": 104160 } }, { "ph": "f", "id": 104160, "pid": 76337, "tid": -914061504, "ts": 1716454222969592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223023645, "dur": 53, "args": { "External id": 104161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104161, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 104161, "pid": 5, "tid": 7, "ts": 1716454223023645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969594, "dur": 11, "args": { "External id": 104161, "cbid": 211, "correlation": 104161 } }, { "ph": "s", "id": 104161, "pid": 76337, "tid": -914061504, "ts": 1716454222969594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223023700, "dur": 55, "args": { "External id": 104186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104186, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104186, "pid": 5, "tid": 7, "ts": 1716454223023700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969678, "dur": 13, "args": { "External id": 104186, "cbid": 211, "correlation": 104186 } }, { "ph": "s", "id": 104186, "pid": 76337, "tid": -914061504, "ts": 1716454222969678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222969779, "dur": 1, "args": { "External id": 104204, "cbid": 251, "correlation": 104204 } }, { "ph": "f", "id": 104204, "pid": 76337, "tid": -914061504, "ts": 1716454222969779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223023757, "dur": 61, "args": { "External id": 104206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104206, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 104206, "pid": 5, "tid": 7, "ts": 1716454223023757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969784, "dur": 13, "args": { "External id": 104206, "cbid": 211, "correlation": 104206 } }, { "ph": "s", "id": 104206, "pid": 76337, "tid": -914061504, "ts": 1716454222969784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223023819, "dur": 6, "args": { "External id": 104214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104214, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104214, "pid": 5, "tid": 7, "ts": 1716454223023819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969855, "dur": 13, "args": { "External id": 104214, "cbid": 211, "correlation": 104214 } }, { "ph": "s", "id": 104214, "pid": 76337, "tid": -914061504, "ts": 1716454222969855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223023826, "dur": 7, "args": { "External id": 104222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104222, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104222, "pid": 5, "tid": 7, "ts": 1716454223023826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969893, "dur": 8, "args": { "External id": 104222, "cbid": 211, "correlation": 104222 } }, { "ph": "s", "id": 104222, "pid": 76337, "tid": -914061504, "ts": 1716454222969893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223023835, "dur": 7, "args": { "External id": 104233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104233, "pid": 5, "tid": 7, "ts": 1716454223023835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222969966, "dur": 21, "args": { "External id": 104233, "cbid": 211, "correlation": 104233 } }, { "ph": "s", "id": 104233, "pid": 76337, "tid": -914061504, "ts": 1716454222969966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223023843, "dur": 8, "args": { "External id": 104255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104255, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 104255, "pid": 5, "tid": 7, "ts": 1716454223023843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970007, "dur": 9, "args": { "External id": 104255, "cbid": 211, "correlation": 104255 } }, { "ph": "s", "id": 104255, "pid": 76337, "tid": -914061504, "ts": 1716454222970007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970095, "dur": 2, "args": { "External id": 104266, "cbid": 251, "correlation": 104266 } }, { "ph": "f", "id": 104266, "pid": 76337, "tid": -914061504, "ts": 1716454222970095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223023854, "dur": 1, "args": { "External id": 104267, "device": 5, "context": 1, "stream": 7, "correlation": 104267, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 104267, "pid": 5, "tid": 7, "ts": 1716454223023854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222970100, "dur": 11, "args": { "External id": 104267, "cbid": 51, "correlation": 104267 } }, { "ph": "s", "id": 104267, "pid": 76337, "tid": -914061504, "ts": 1716454222970100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223023857, "dur": 36, "args": { "External id": 104268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104268, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 104268, "pid": 5, "tid": 7, "ts": 1716454223023857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970113, "dur": 13, "args": { "External id": 104268, "cbid": 211, "correlation": 104268 } }, { "ph": "s", "id": 104268, "pid": 76337, "tid": -914061504, "ts": 1716454222970113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970185, "dur": 1, "args": { "External id": 104279, "cbid": 251, "correlation": 104279 } }, { "ph": "f", "id": 104279, "pid": 76337, "tid": -914061504, "ts": 1716454222970185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970189, "dur": 0, "args": { "External id": 104280, "cbid": 251, "correlation": 104280 } }, { "ph": "f", "id": 104280, "pid": 76337, "tid": -914061504, "ts": 1716454222970189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223023895, "dur": 12, "args": { "External id": 104281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104281, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104281, "pid": 5, "tid": 7, "ts": 1716454223023895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970190, "dur": 12, "args": { "External id": 104281, "cbid": 211, "correlation": 104281 } }, { "ph": "s", "id": 104281, "pid": 76337, "tid": -914061504, "ts": 1716454222970190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223023908, "dur": 5, "args": { "External id": 104283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104283, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104283, "pid": 5, "tid": 7, "ts": 1716454223023908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970204, "dur": 7, "args": { "External id": 104283, "cbid": 211, "correlation": 104283 } }, { "ph": "s", "id": 104283, "pid": 76337, "tid": -914061504, "ts": 1716454222970204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970263, "dur": 1, "args": { "External id": 104294, "cbid": 251, "correlation": 104294 } }, { "ph": "f", "id": 104294, "pid": 76337, "tid": -914061504, "ts": 1716454222970263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970266, "dur": 0, "args": { "External id": 104295, "cbid": 251, "correlation": 104295 } }, { "ph": "f", "id": 104295, "pid": 76337, "tid": -914061504, "ts": 1716454222970266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223023915, "dur": 8, "args": { "External id": 104296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104296, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104296, "pid": 5, "tid": 7, "ts": 1716454223023915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970268, "dur": 11, "args": { "External id": 104296, "cbid": 211, "correlation": 104296 } }, { "ph": "s", "id": 104296, "pid": 76337, "tid": -914061504, "ts": 1716454222970268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223023924, "dur": 3, "args": { "External id": 104298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104298, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104298, "pid": 5, "tid": 7, "ts": 1716454223023924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970280, "dur": 5, "args": { "External id": 104298, "cbid": 211, "correlation": 104298 } }, { "ph": "s", "id": 104298, "pid": 76337, "tid": -914061504, "ts": 1716454222970280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223023929, "dur": 19, "args": { "External id": 104323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104323, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 104323, "pid": 5, "tid": 7, "ts": 1716454223023929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970361, "dur": 12, "args": { "External id": 104323, "cbid": 211, "correlation": 104323 } }, { "ph": "s", "id": 104323, "pid": 76337, "tid": -914061504, "ts": 1716454222970361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970462, "dur": 2, "args": { "External id": 104341, "cbid": 251, "correlation": 104341 } }, { "ph": "f", "id": 104341, "pid": 76337, "tid": -914061504, "ts": 1716454222970462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223023950, "dur": 1, "args": { "External id": 104343, "device": 5, "context": 1, "stream": 7, "correlation": 104343, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 104343, "pid": 5, "tid": 7, "ts": 1716454223023950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222970467, "dur": 10, "args": { "External id": 104343, "cbid": 51, "correlation": 104343 } }, { "ph": "s", "id": 104343, "pid": 76337, "tid": -914061504, "ts": 1716454222970467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223023954, "dur": 36, "args": { "External id": 104344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104344, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 104344, "pid": 5, "tid": 7, "ts": 1716454223023954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970479, "dur": 12, "args": { "External id": 104344, "cbid": 211, "correlation": 104344 } }, { "ph": "s", "id": 104344, "pid": 76337, "tid": -914061504, "ts": 1716454222970479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223023991, "dur": 4, "args": { "External id": 104352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104352, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104352, "pid": 5, "tid": 7, "ts": 1716454223023991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970550, "dur": 13, "args": { "External id": 104352, "cbid": 211, "correlation": 104352 } }, { "ph": "s", "id": 104352, "pid": 76337, "tid": -914061504, "ts": 1716454222970550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223023996, "dur": 8, "args": { "External id": 104360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104360, "pid": 5, "tid": 7, "ts": 1716454223023996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970592, "dur": 9, "args": { "External id": 104360, "cbid": 211, "correlation": 104360 } }, { "ph": "s", "id": 104360, "pid": 76337, "tid": -914061504, "ts": 1716454222970592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223024006, "dur": 8, "args": { "External id": 104382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104382, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 104382, "pid": 5, "tid": 7, "ts": 1716454223024006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970643, "dur": 10, "args": { "External id": 104382, "cbid": 211, "correlation": 104382 } }, { "ph": "s", "id": 104382, "pid": 76337, "tid": -914061504, "ts": 1716454222970643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970733, "dur": 1, "args": { "External id": 104398, "cbid": 251, "correlation": 104398 } }, { "ph": "f", "id": 104398, "pid": 76337, "tid": -914061504, "ts": 1716454222970733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970738, "dur": 0, "args": { "External id": 104400, "cbid": 251, "correlation": 104400 } }, { "ph": "f", "id": 104400, "pid": 76337, "tid": -914061504, "ts": 1716454222970738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223024015, "dur": 186, "args": { "External id": 104401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104401, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104401, "pid": 5, "tid": 7, "ts": 1716454223024015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970740, "dur": 13, "args": { "External id": 104401, "cbid": 211, "correlation": 104401 } }, { "ph": "s", "id": 104401, "pid": 76337, "tid": -914061504, "ts": 1716454222970740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024202, "dur": 21, "args": { "External id": 104409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104409, "pid": 5, "tid": 7, "ts": 1716454223024202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970807, "dur": 12, "args": { "External id": 104409, "cbid": 211, "correlation": 104409 } }, { "ph": "s", "id": 104409, "pid": 76337, "tid": -914061504, "ts": 1716454222970807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024225, "dur": 22, "args": { "External id": 104417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104417, "pid": 5, "tid": 7, "ts": 1716454223024225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970837, "dur": 8, "args": { "External id": 104417, "cbid": 211, "correlation": 104417 } }, { "ph": "s", "id": 104417, "pid": 76337, "tid": -914061504, "ts": 1716454222970837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222970919, "dur": 1, "args": { "External id": 104433, "cbid": 251, "correlation": 104433 } }, { "ph": "f", "id": 104433, "pid": 76337, "tid": -914061504, "ts": 1716454222970919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223024249, "dur": 1, "args": { "External id": 104435, "device": 5, "context": 1, "stream": 7, "correlation": 104435, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 104435, "pid": 5, "tid": 7, "ts": 1716454223024249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222970924, "dur": 8, "args": { "External id": 104435, "cbid": 51, "correlation": 104435 } }, { "ph": "s", "id": 104435, "pid": 76337, "tid": -914061504, "ts": 1716454222970924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223024252, "dur": 109, "args": { "External id": 104436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104436, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 104436, "pid": 5, "tid": 7, "ts": 1716454223024252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970934, "dur": 11, "args": { "External id": 104436, "cbid": 211, "correlation": 104436 } }, { "ph": "s", "id": 104436, "pid": 76337, "tid": -914061504, "ts": 1716454222970934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223024362, "dur": 5, "args": { "External id": 104444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104444, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104444, "pid": 5, "tid": 7, "ts": 1716454223024362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222970985, "dur": 10, "args": { "External id": 104444, "cbid": 211, "correlation": 104444 } }, { "ph": "s", "id": 104444, "pid": 76337, "tid": -914061504, "ts": 1716454222970985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024369, "dur": 10, "args": { "External id": 104455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104455, "pid": 5, "tid": 7, "ts": 1716454223024369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971056, "dur": 12, "args": { "External id": 104455, "cbid": 211, "correlation": 104455 } }, { "ph": "s", "id": 104455, "pid": 76337, "tid": -914061504, "ts": 1716454222971056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222971121, "dur": 0, "args": { "External id": 104467, "cbid": 317, "correlation": 104467 } }, { "ph": "f", "id": 104467, "pid": 76337, "tid": -914061504, "ts": 1716454222971121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222971122, "dur": 0, "args": { "External id": 104468, "cbid": 203, "correlation": 104468 } }, { "ph": "f", "id": 104468, "pid": 76337, "tid": -914061504, "ts": 1716454222971122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222971123, "dur": 0, "args": { "External id": 104469, "cbid": 205, "correlation": 104469 } }, { "ph": "f", "id": 104469, "pid": 76337, "tid": -914061504, "ts": 1716454222971123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024380, "dur": 5, "args": { "External id": 104473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104473, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104473, "pid": 5, "tid": 7, "ts": 1716454223024380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971139, "dur": 13, "args": { "External id": 104473, "cbid": 211, "correlation": 104473 } }, { "ph": "s", "id": 104473, "pid": 76337, "tid": -914061504, "ts": 1716454222971139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223024386, "dur": 37, "args": { "External id": 104475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104475, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 104475, "pid": 5, "tid": 7, "ts": 1716454223024386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971158, "dur": 7, "args": { "External id": 104475, "cbid": 211, "correlation": 104475 } }, { "ph": "s", "id": 104475, "pid": 76337, "tid": -914061504, "ts": 1716454222971158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024424, "dur": 6, "args": { "External id": 104477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104477, "pid": 5, "tid": 7, "ts": 1716454223024424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971168, "dur": 5, "args": { "External id": 104477, "cbid": 211, "correlation": 104477 } }, { "ph": "s", "id": 104477, "pid": 76337, "tid": -914061504, "ts": 1716454222971168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024431, "dur": 7, "args": { "External id": 104483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104483, "pid": 5, "tid": 7, "ts": 1716454223024431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971195, "dur": 9, "args": { "External id": 104483, "cbid": 211, "correlation": 104483 } }, { "ph": "s", "id": 104483, "pid": 76337, "tid": -914061504, "ts": 1716454222971195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223024440, "dur": 5, "args": { "External id": 104491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104491, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104491, "pid": 5, "tid": 7, "ts": 1716454223024440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971227, "dur": 8, "args": { "External id": 104491, "cbid": 211, "correlation": 104491 } }, { "ph": "s", "id": 104491, "pid": 76337, "tid": -914061504, "ts": 1716454222971227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223024446, "dur": 11, "args": { "External id": 104511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104511, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104511, "pid": 5, "tid": 7, "ts": 1716454223024446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971301, "dur": 12, "args": { "External id": 104511, "cbid": 211, "correlation": 104511 } }, { "ph": "s", "id": 104511, "pid": 76337, "tid": -914061504, "ts": 1716454222971301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223024458, "dur": 4, "args": { "External id": 104523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104523, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 104523, "pid": 5, "tid": 7, "ts": 1716454223024458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971323, "dur": 6, "args": { "External id": 104523, "cbid": 211, "correlation": 104523 } }, { "ph": "s", "id": 104523, "pid": 76337, "tid": -914061504, "ts": 1716454222971323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024464, "dur": 8, "args": { "External id": 104526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104526, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104526, "pid": 5, "tid": 7, "ts": 1716454223024464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971341, "dur": 7, "args": { "External id": 104526, "cbid": 211, "correlation": 104526 } }, { "ph": "s", "id": 104526, "pid": 76337, "tid": -914061504, "ts": 1716454222971341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223024473, "dur": 5, "args": { "External id": 104535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104535, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104535, "pid": 5, "tid": 7, "ts": 1716454223024473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971380, "dur": 10, "args": { "External id": 104535, "cbid": 211, "correlation": 104535 } }, { "ph": "s", "id": 104535, "pid": 76337, "tid": -914061504, "ts": 1716454222971380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222971432, "dur": 0, "args": { "External id": 104545, "cbid": 317, "correlation": 104545 } }, { "ph": "f", "id": 104545, "pid": 76337, "tid": -914061504, "ts": 1716454222971432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222971432, "dur": 0, "args": { "External id": 104546, "cbid": 203, "correlation": 104546 } }, { "ph": "f", "id": 104546, "pid": 76337, "tid": -914061504, "ts": 1716454222971432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222971433, "dur": 0, "args": { "External id": 104547, "cbid": 205, "correlation": 104547 } }, { "ph": "f", "id": 104547, "pid": 76337, "tid": -914061504, "ts": 1716454222971433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024479, "dur": 5, "args": { "External id": 104551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104551, "pid": 5, "tid": 7, "ts": 1716454223024479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971446, "dur": 12, "args": { "External id": 104551, "cbid": 211, "correlation": 104551 } }, { "ph": "s", "id": 104551, "pid": 76337, "tid": -914061504, "ts": 1716454222971446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024486, "dur": 158, "args": { "External id": 104553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104553, "pid": 5, "tid": 7, "ts": 1716454223024486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971461, "dur": 5, "args": { "External id": 104553, "cbid": 211, "correlation": 104553 } }, { "ph": "s", "id": 104553, "pid": 76337, "tid": -914061504, "ts": 1716454222971461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223024646, "dur": 1, "args": { "External id": 104555, "device": 5, "context": 1, "stream": 7, "correlation": 104555, "bytes": 240, "memory bandwidth (GB/s)": 0.14423076923076922 } }, { "ph": "f", "id": 104555, "pid": 5, "tid": 7, "ts": 1716454223024646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222971471, "dur": 6, "args": { "External id": 104555, "cbid": 51, "correlation": 104555 } }, { "ph": "s", "id": 104555, "pid": 76337, "tid": -914061504, "ts": 1716454222971471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223024649, "dur": 266, "args": { "External id": 104556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104556, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104556, "pid": 5, "tid": 7, "ts": 1716454223024649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971479, "dur": 6, "args": { "External id": 104556, "cbid": 211, "correlation": 104556 } }, { "ph": "s", "id": 104556, "pid": 76337, "tid": -914061504, "ts": 1716454222971479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024917, "dur": 6, "args": { "External id": 104558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104558, "pid": 5, "tid": 7, "ts": 1716454223024917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971488, "dur": 5, "args": { "External id": 104558, "cbid": 211, "correlation": 104558 } }, { "ph": "s", "id": 104558, "pid": 76337, "tid": -914061504, "ts": 1716454222971488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024924, "dur": 6, "args": { "External id": 104564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104564, "pid": 5, "tid": 7, "ts": 1716454223024924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971516, "dur": 8, "args": { "External id": 104564, "cbid": 211, "correlation": 104564 } }, { "ph": "s", "id": 104564, "pid": 76337, "tid": -914061504, "ts": 1716454222971516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223024931, "dur": 3, "args": { "External id": 104572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104572, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 104572, "pid": 5, "tid": 7, "ts": 1716454223024931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971560, "dur": 9, "args": { "External id": 104572, "cbid": 211, "correlation": 104572 } }, { "ph": "s", "id": 104572, "pid": 76337, "tid": -914061504, "ts": 1716454222971560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222971624, "dur": 1, "args": { "External id": 104588, "cbid": 251, "correlation": 104588 } }, { "ph": "f", "id": 104588, "pid": 76337, "tid": -914061504, "ts": 1716454222971624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222971629, "dur": 0, "args": { "External id": 104590, "cbid": 251, "correlation": 104590 } }, { "ph": "f", "id": 104590, "pid": 76337, "tid": -914061504, "ts": 1716454222971629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223024936, "dur": 12, "args": { "External id": 104591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104591, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104591, "pid": 5, "tid": 7, "ts": 1716454223024936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971631, "dur": 11, "args": { "External id": 104591, "cbid": 211, "correlation": 104591 } }, { "ph": "s", "id": 104591, "pid": 76337, "tid": -914061504, "ts": 1716454222971631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223024949, "dur": 5, "args": { "External id": 104593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104593, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104593, "pid": 5, "tid": 7, "ts": 1716454223024949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971644, "dur": 6, "args": { "External id": 104593, "cbid": 211, "correlation": 104593 } }, { "ph": "s", "id": 104593, "pid": 76337, "tid": -914061504, "ts": 1716454222971644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024956, "dur": 6, "args": { "External id": 104603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104603, "pid": 5, "tid": 7, "ts": 1716454223024956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971701, "dur": 12, "args": { "External id": 104603, "cbid": 211, "correlation": 104603 } }, { "ph": "s", "id": 104603, "pid": 76337, "tid": -914061504, "ts": 1716454222971701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223024963, "dur": 10, "args": { "External id": 104623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104623, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104623, "pid": 5, "tid": 7, "ts": 1716454223024963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971767, "dur": 10, "args": { "External id": 104623, "cbid": 211, "correlation": 104623 } }, { "ph": "s", "id": 104623, "pid": 76337, "tid": -914061504, "ts": 1716454222971767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223024974, "dur": 4, "args": { "External id": 104635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104635, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 104635, "pid": 5, "tid": 7, "ts": 1716454223024974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971787, "dur": 6, "args": { "External id": 104635, "cbid": 211, "correlation": 104635 } }, { "ph": "s", "id": 104635, "pid": 76337, "tid": -914061504, "ts": 1716454222971787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223024979, "dur": 7, "args": { "External id": 104638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104638, "pid": 5, "tid": 7, "ts": 1716454223024979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971806, "dur": 7, "args": { "External id": 104638, "cbid": 211, "correlation": 104638 } }, { "ph": "s", "id": 104638, "pid": 76337, "tid": -914061504, "ts": 1716454222971806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223024987, "dur": 4, "args": { "External id": 104647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104647, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104647, "pid": 5, "tid": 7, "ts": 1716454223024987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971846, "dur": 10, "args": { "External id": 104647, "cbid": 211, "correlation": 104647 } }, { "ph": "s", "id": 104647, "pid": 76337, "tid": -914061504, "ts": 1716454222971846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222971909, "dur": 0, "args": { "External id": 104657, "cbid": 317, "correlation": 104657 } }, { "ph": "f", "id": 104657, "pid": 76337, "tid": -914061504, "ts": 1716454222971909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222971910, "dur": 0, "args": { "External id": 104658, "cbid": 203, "correlation": 104658 } }, { "ph": "f", "id": 104658, "pid": 76337, "tid": -914061504, "ts": 1716454222971910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222971910, "dur": 0, "args": { "External id": 104659, "cbid": 205, "correlation": 104659 } }, { "ph": "f", "id": 104659, "pid": 76337, "tid": -914061504, "ts": 1716454222971910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024993, "dur": 5, "args": { "External id": 104663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104663, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104663, "pid": 5, "tid": 7, "ts": 1716454223024993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971924, "dur": 12, "args": { "External id": 104663, "cbid": 211, "correlation": 104663 } }, { "ph": "s", "id": 104663, "pid": 76337, "tid": -914061504, "ts": 1716454222971924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223024999, "dur": 160, "args": { "External id": 104665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104665, "pid": 5, "tid": 7, "ts": 1716454223024999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971938, "dur": 5, "args": { "External id": 104665, "cbid": 211, "correlation": 104665 } }, { "ph": "s", "id": 104665, "pid": 76337, "tid": -914061504, "ts": 1716454222971938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223025161, "dur": 1, "args": { "External id": 104667, "device": 5, "context": 1, "stream": 7, "correlation": 104667, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 104667, "pid": 5, "tid": 7, "ts": 1716454223025161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222971949, "dur": 7, "args": { "External id": 104667, "cbid": 51, "correlation": 104667 } }, { "ph": "s", "id": 104667, "pid": 76337, "tid": -914061504, "ts": 1716454222971949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223025164, "dur": 254, "args": { "External id": 104668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104668, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104668, "pid": 5, "tid": 7, "ts": 1716454223025164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971957, "dur": 6, "args": { "External id": 104668, "cbid": 211, "correlation": 104668 } }, { "ph": "s", "id": 104668, "pid": 76337, "tid": -914061504, "ts": 1716454222971957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223025420, "dur": 6, "args": { "External id": 104670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104670, "pid": 5, "tid": 7, "ts": 1716454223025420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222971966, "dur": 5, "args": { "External id": 104670, "cbid": 211, "correlation": 104670 } }, { "ph": "s", "id": 104670, "pid": 76337, "tid": -914061504, "ts": 1716454222971966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223025427, "dur": 6, "args": { "External id": 104676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104676, "pid": 5, "tid": 7, "ts": 1716454223025427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972004, "dur": 9, "args": { "External id": 104676, "cbid": 211, "correlation": 104676 } }, { "ph": "s", "id": 104676, "pid": 76337, "tid": -914061504, "ts": 1716454222972004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223025434, "dur": 5, "args": { "External id": 104684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104684, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104684, "pid": 5, "tid": 7, "ts": 1716454223025434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972037, "dur": 8, "args": { "External id": 104684, "cbid": 211, "correlation": 104684 } }, { "ph": "s", "id": 104684, "pid": 76337, "tid": -914061504, "ts": 1716454222972037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223025440, "dur": 4, "args": { "External id": 104692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104692, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104692, "pid": 5, "tid": 7, "ts": 1716454223025440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972067, "dur": 8, "args": { "External id": 104692, "cbid": 211, "correlation": 104692 } }, { "ph": "s", "id": 104692, "pid": 76337, "tid": -914061504, "ts": 1716454222972067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223025446, "dur": 11, "args": { "External id": 104701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104701, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104701, "pid": 5, "tid": 7, "ts": 1716454223025446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972156, "dur": 14, "args": { "External id": 104701, "cbid": 211, "correlation": 104701 } }, { "ph": "s", "id": 104701, "pid": 76337, "tid": -914061504, "ts": 1716454222972156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223025458, "dur": 12, "args": { "External id": 104721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104721, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104721, "pid": 5, "tid": 7, "ts": 1716454223025458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972224, "dur": 11, "args": { "External id": 104721, "cbid": 211, "correlation": 104721 } }, { "ph": "s", "id": 104721, "pid": 76337, "tid": -914061504, "ts": 1716454222972224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223025472, "dur": 4, "args": { "External id": 104733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104733, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104733, "pid": 5, "tid": 7, "ts": 1716454223025472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972245, "dur": 7, "args": { "External id": 104733, "cbid": 211, "correlation": 104733 } }, { "ph": "s", "id": 104733, "pid": 76337, "tid": -914061504, "ts": 1716454222972245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223025477, "dur": 10, "args": { "External id": 104736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104736, "pid": 5, "tid": 7, "ts": 1716454223025477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972264, "dur": 6, "args": { "External id": 104736, "cbid": 211, "correlation": 104736 } }, { "ph": "s", "id": 104736, "pid": 76337, "tid": -914061504, "ts": 1716454222972264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223025488, "dur": 6, "args": { "External id": 104745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104745, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104745, "pid": 5, "tid": 7, "ts": 1716454223025488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972304, "dur": 9, "args": { "External id": 104745, "cbid": 211, "correlation": 104745 } }, { "ph": "s", "id": 104745, "pid": 76337, "tid": -914061504, "ts": 1716454222972304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222972357, "dur": 0, "args": { "External id": 104755, "cbid": 317, "correlation": 104755 } }, { "ph": "f", "id": 104755, "pid": 76337, "tid": -914061504, "ts": 1716454222972357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222972358, "dur": 0, "args": { "External id": 104756, "cbid": 203, "correlation": 104756 } }, { "ph": "f", "id": 104756, "pid": 76337, "tid": -914061504, "ts": 1716454222972358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222972359, "dur": 0, "args": { "External id": 104757, "cbid": 205, "correlation": 104757 } }, { "ph": "f", "id": 104757, "pid": 76337, "tid": -914061504, "ts": 1716454222972359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223025495, "dur": 6, "args": { "External id": 104761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104761, "pid": 5, "tid": 7, "ts": 1716454223025495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972375, "dur": 11, "args": { "External id": 104761, "cbid": 211, "correlation": 104761 } }, { "ph": "s", "id": 104761, "pid": 76337, "tid": -914061504, "ts": 1716454222972375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223025503, "dur": 313, "args": { "External id": 104763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104763, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104763, "pid": 5, "tid": 7, "ts": 1716454223025503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972388, "dur": 5, "args": { "External id": 104763, "cbid": 211, "correlation": 104763 } }, { "ph": "s", "id": 104763, "pid": 76337, "tid": -914061504, "ts": 1716454222972388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223025819, "dur": 1, "args": { "External id": 104765, "device": 5, "context": 1, "stream": 7, "correlation": 104765, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 104765, "pid": 5, "tid": 7, "ts": 1716454223025819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222972399, "dur": 8, "args": { "External id": 104765, "cbid": 51, "correlation": 104765 } }, { "ph": "s", "id": 104765, "pid": 76337, "tid": -914061504, "ts": 1716454222972399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223025822, "dur": 488, "args": { "External id": 104766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104766, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104766, "pid": 5, "tid": 7, "ts": 1716454223025822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972408, "dur": 6, "args": { "External id": 104766, "cbid": 211, "correlation": 104766 } }, { "ph": "s", "id": 104766, "pid": 76337, "tid": -914061504, "ts": 1716454222972408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026312, "dur": 5, "args": { "External id": 104768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104768, "pid": 5, "tid": 7, "ts": 1716454223026312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972420, "dur": 5, "args": { "External id": 104768, "cbid": 211, "correlation": 104768 } }, { "ph": "s", "id": 104768, "pid": 76337, "tid": -914061504, "ts": 1716454222972420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026318, "dur": 6, "args": { "External id": 104774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104774, "pid": 5, "tid": 7, "ts": 1716454223026318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972450, "dur": 9, "args": { "External id": 104774, "cbid": 211, "correlation": 104774 } }, { "ph": "s", "id": 104774, "pid": 76337, "tid": -914061504, "ts": 1716454222972450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223026325, "dur": 3, "args": { "External id": 104782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104782, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 104782, "pid": 5, "tid": 7, "ts": 1716454223026325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972492, "dur": 10, "args": { "External id": 104782, "cbid": 211, "correlation": 104782 } }, { "ph": "s", "id": 104782, "pid": 76337, "tid": -914061504, "ts": 1716454222972492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222972554, "dur": 1, "args": { "External id": 104798, "cbid": 251, "correlation": 104798 } }, { "ph": "f", "id": 104798, "pid": 76337, "tid": -914061504, "ts": 1716454222972554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222972559, "dur": 0, "args": { "External id": 104800, "cbid": 251, "correlation": 104800 } }, { "ph": "f", "id": 104800, "pid": 76337, "tid": -914061504, "ts": 1716454222972559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223026330, "dur": 12, "args": { "External id": 104801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104801, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104801, "pid": 5, "tid": 7, "ts": 1716454223026330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972561, "dur": 11, "args": { "External id": 104801, "cbid": 211, "correlation": 104801 } }, { "ph": "s", "id": 104801, "pid": 76337, "tid": -914061504, "ts": 1716454222972561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223026343, "dur": 5, "args": { "External id": 104803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104803, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104803, "pid": 5, "tid": 7, "ts": 1716454223026343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972574, "dur": 5, "args": { "External id": 104803, "cbid": 211, "correlation": 104803 } }, { "ph": "s", "id": 104803, "pid": 76337, "tid": -914061504, "ts": 1716454222972574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026349, "dur": 6, "args": { "External id": 104813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104813, "pid": 5, "tid": 7, "ts": 1716454223026349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972630, "dur": 12, "args": { "External id": 104813, "cbid": 211, "correlation": 104813 } }, { "ph": "s", "id": 104813, "pid": 76337, "tid": -914061504, "ts": 1716454222972630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223026356, "dur": 9, "args": { "External id": 104833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104833, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104833, "pid": 5, "tid": 7, "ts": 1716454223026356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972696, "dur": 11, "args": { "External id": 104833, "cbid": 211, "correlation": 104833 } }, { "ph": "s", "id": 104833, "pid": 76337, "tid": -914061504, "ts": 1716454222972696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223026366, "dur": 4, "args": { "External id": 104845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104845, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 104845, "pid": 5, "tid": 7, "ts": 1716454223026366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972716, "dur": 6, "args": { "External id": 104845, "cbid": 211, "correlation": 104845 } }, { "ph": "s", "id": 104845, "pid": 76337, "tid": -914061504, "ts": 1716454222972716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026371, "dur": 6, "args": { "External id": 104848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104848, "pid": 5, "tid": 7, "ts": 1716454223026371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972736, "dur": 7, "args": { "External id": 104848, "cbid": 211, "correlation": 104848 } }, { "ph": "s", "id": 104848, "pid": 76337, "tid": -914061504, "ts": 1716454222972736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223026379, "dur": 4, "args": { "External id": 104857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104857, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104857, "pid": 5, "tid": 7, "ts": 1716454223026379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972776, "dur": 10, "args": { "External id": 104857, "cbid": 211, "correlation": 104857 } }, { "ph": "s", "id": 104857, "pid": 76337, "tid": -914061504, "ts": 1716454222972776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222972839, "dur": 0, "args": { "External id": 104867, "cbid": 317, "correlation": 104867 } }, { "ph": "f", "id": 104867, "pid": 76337, "tid": -914061504, "ts": 1716454222972839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222972840, "dur": 0, "args": { "External id": 104868, "cbid": 203, "correlation": 104868 } }, { "ph": "f", "id": 104868, "pid": 76337, "tid": -914061504, "ts": 1716454222972840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222972841, "dur": 0, "args": { "External id": 104869, "cbid": 205, "correlation": 104869 } }, { "ph": "f", "id": 104869, "pid": 76337, "tid": -914061504, "ts": 1716454222972841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026385, "dur": 5, "args": { "External id": 104873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104873, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104873, "pid": 5, "tid": 7, "ts": 1716454223026385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972855, "dur": 12, "args": { "External id": 104873, "cbid": 211, "correlation": 104873 } }, { "ph": "s", "id": 104873, "pid": 76337, "tid": -914061504, "ts": 1716454222972855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026391, "dur": 159, "args": { "External id": 104875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104875, "pid": 5, "tid": 7, "ts": 1716454223026391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972870, "dur": 5, "args": { "External id": 104875, "cbid": 211, "correlation": 104875 } }, { "ph": "s", "id": 104875, "pid": 76337, "tid": -914061504, "ts": 1716454222972870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223026553, "dur": 1, "args": { "External id": 104877, "device": 5, "context": 1, "stream": 7, "correlation": 104877, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 104877, "pid": 5, "tid": 7, "ts": 1716454223026553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222972880, "dur": 6, "args": { "External id": 104877, "cbid": 51, "correlation": 104877 } }, { "ph": "s", "id": 104877, "pid": 76337, "tid": -914061504, "ts": 1716454222972880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223026556, "dur": 254, "args": { "External id": 104878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104878, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104878, "pid": 5, "tid": 7, "ts": 1716454223026556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972887, "dur": 6, "args": { "External id": 104878, "cbid": 211, "correlation": 104878 } }, { "ph": "s", "id": 104878, "pid": 76337, "tid": -914061504, "ts": 1716454222972887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026811, "dur": 6, "args": { "External id": 104880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104880, "pid": 5, "tid": 7, "ts": 1716454223026811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972897, "dur": 6, "args": { "External id": 104880, "cbid": 211, "correlation": 104880 } }, { "ph": "s", "id": 104880, "pid": 76337, "tid": -914061504, "ts": 1716454222972897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026818, "dur": 6, "args": { "External id": 104886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104886, "pid": 5, "tid": 7, "ts": 1716454223026818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222972926, "dur": 8, "args": { "External id": 104886, "cbid": 211, "correlation": 104886 } }, { "ph": "s", "id": 104886, "pid": 76337, "tid": -914061504, "ts": 1716454222972926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222972994, "dur": 0, "args": { "External id": 104896, "cbid": 317, "correlation": 104896 } }, { "ph": "f", "id": 104896, "pid": 76337, "tid": -914061504, "ts": 1716454222972994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222972994, "dur": 0, "args": { "External id": 104897, "cbid": 203, "correlation": 104897 } }, { "ph": "f", "id": 104897, "pid": 76337, "tid": -914061504, "ts": 1716454222972994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222972995, "dur": 0, "args": { "External id": 104898, "cbid": 205, "correlation": 104898 } }, { "ph": "f", "id": 104898, "pid": 76337, "tid": -914061504, "ts": 1716454222972995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026826, "dur": 8, "args": { "External id": 104902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104902, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104902, "pid": 5, "tid": 7, "ts": 1716454223026826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973010, "dur": 12, "args": { "External id": 104902, "cbid": 211, "correlation": 104902 } }, { "ph": "s", "id": 104902, "pid": 76337, "tid": -914061504, "ts": 1716454222973010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223026835, "dur": 3, "args": { "External id": 104904, "device": 5, "context": 1, "stream": 7, "correlation": 104904, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 104904, "pid": 5, "tid": 7, "ts": 1716454223026835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222973028, "dur": 15, "args": { "External id": 104904, "cbid": 51, "correlation": 104904 } }, { "ph": "s", "id": 104904, "pid": 76337, "tid": -914061504, "ts": 1716454222973028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223026839, "dur": 94, "args": { "External id": 104905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104905, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 104905, "pid": 5, "tid": 7, "ts": 1716454223026839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973045, "dur": 8, "args": { "External id": 104905, "cbid": 211, "correlation": 104905 } }, { "ph": "s", "id": 104905, "pid": 76337, "tid": -914061504, "ts": 1716454222973045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223026935, "dur": 6, "args": { "External id": 104907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104907, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104907, "pid": 5, "tid": 7, "ts": 1716454223026935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973056, "dur": 6, "args": { "External id": 104907, "cbid": 211, "correlation": 104907 } }, { "ph": "s", "id": 104907, "pid": 76337, "tid": -914061504, "ts": 1716454222973056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026942, "dur": 6, "args": { "External id": 104913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104913, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104913, "pid": 5, "tid": 7, "ts": 1716454223026942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973084, "dur": 9, "args": { "External id": 104913, "cbid": 211, "correlation": 104913 } }, { "ph": "s", "id": 104913, "pid": 76337, "tid": -914061504, "ts": 1716454222973084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223026949, "dur": 5, "args": { "External id": 104921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104921, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104921, "pid": 5, "tid": 7, "ts": 1716454223026949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973113, "dur": 8, "args": { "External id": 104921, "cbid": 211, "correlation": 104921 } }, { "ph": "s", "id": 104921, "pid": 76337, "tid": -914061504, "ts": 1716454222973113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223026955, "dur": 4, "args": { "External id": 104929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104929, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 104929, "pid": 5, "tid": 7, "ts": 1716454223026955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973142, "dur": 9, "args": { "External id": 104929, "cbid": 211, "correlation": 104929 } }, { "ph": "s", "id": 104929, "pid": 76337, "tid": -914061504, "ts": 1716454222973142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223026961, "dur": 11, "args": { "External id": 104938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104938, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104938, "pid": 5, "tid": 7, "ts": 1716454223026961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973185, "dur": 11, "args": { "External id": 104938, "cbid": 211, "correlation": 104938 } }, { "ph": "s", "id": 104938, "pid": 76337, "tid": -914061504, "ts": 1716454222973185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223026973, "dur": 12, "args": { "External id": 104958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104958, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 104958, "pid": 5, "tid": 7, "ts": 1716454223026973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973256, "dur": 11, "args": { "External id": 104958, "cbid": 211, "correlation": 104958 } }, { "ph": "s", "id": 104958, "pid": 76337, "tid": -914061504, "ts": 1716454222973256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223026986, "dur": 4, "args": { "External id": 104970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104970, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 104970, "pid": 5, "tid": 7, "ts": 1716454223026986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973277, "dur": 6, "args": { "External id": 104970, "cbid": 211, "correlation": 104970 } }, { "ph": "s", "id": 104970, "pid": 76337, "tid": -914061504, "ts": 1716454222973277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223026992, "dur": 10, "args": { "External id": 104973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104973, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104973, "pid": 5, "tid": 7, "ts": 1716454223026992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973296, "dur": 6, "args": { "External id": 104973, "cbid": 211, "correlation": 104973 } }, { "ph": "s", "id": 104973, "pid": 76337, "tid": -914061504, "ts": 1716454222973296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223027003, "dur": 6, "args": { "External id": 104982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104982, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104982, "pid": 5, "tid": 7, "ts": 1716454223027003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973333, "dur": 10, "args": { "External id": 104982, "cbid": 211, "correlation": 104982 } }, { "ph": "s", "id": 104982, "pid": 76337, "tid": -914061504, "ts": 1716454222973333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222973385, "dur": 0, "args": { "External id": 104992, "cbid": 317, "correlation": 104992 } }, { "ph": "f", "id": 104992, "pid": 76337, "tid": -914061504, "ts": 1716454222973385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222973386, "dur": 0, "args": { "External id": 104993, "cbid": 203, "correlation": 104993 } }, { "ph": "f", "id": 104993, "pid": 76337, "tid": -914061504, "ts": 1716454222973386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222973387, "dur": 0, "args": { "External id": 104994, "cbid": 205, "correlation": 104994 } }, { "ph": "f", "id": 104994, "pid": 76337, "tid": -914061504, "ts": 1716454222973387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223027011, "dur": 6, "args": { "External id": 104998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 104998, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 104998, "pid": 5, "tid": 7, "ts": 1716454223027011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973401, "dur": 11, "args": { "External id": 104998, "cbid": 211, "correlation": 104998 } }, { "ph": "s", "id": 104998, "pid": 76337, "tid": -914061504, "ts": 1716454222973401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223027018, "dur": 312, "args": { "External id": 105000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105000, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105000, "pid": 5, "tid": 7, "ts": 1716454223027018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973414, "dur": 5, "args": { "External id": 105000, "cbid": 211, "correlation": 105000 } }, { "ph": "s", "id": 105000, "pid": 76337, "tid": -914061504, "ts": 1716454222973414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223027333, "dur": 1, "args": { "External id": 105002, "device": 5, "context": 1, "stream": 7, "correlation": 105002, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 105002, "pid": 5, "tid": 7, "ts": 1716454223027333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222973425, "dur": 6, "args": { "External id": 105002, "cbid": 51, "correlation": 105002 } }, { "ph": "s", "id": 105002, "pid": 76337, "tid": -914061504, "ts": 1716454222973425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223027337, "dur": 489, "args": { "External id": 105003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105003, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105003, "pid": 5, "tid": 7, "ts": 1716454223027337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973432, "dur": 6, "args": { "External id": 105003, "cbid": 211, "correlation": 105003 } }, { "ph": "s", "id": 105003, "pid": 76337, "tid": -914061504, "ts": 1716454222973432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223027827, "dur": 5, "args": { "External id": 105005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105005, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105005, "pid": 5, "tid": 7, "ts": 1716454223027827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973443, "dur": 6, "args": { "External id": 105005, "cbid": 211, "correlation": 105005 } }, { "ph": "s", "id": 105005, "pid": 76337, "tid": -914061504, "ts": 1716454222973443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223027834, "dur": 6, "args": { "External id": 105011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105011, "pid": 5, "tid": 7, "ts": 1716454223027834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973471, "dur": 8, "args": { "External id": 105011, "cbid": 211, "correlation": 105011 } }, { "ph": "s", "id": 105011, "pid": 76337, "tid": -914061504, "ts": 1716454222973471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223027841, "dur": 3, "args": { "External id": 105019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105019, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 105019, "pid": 5, "tid": 7, "ts": 1716454223027841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973515, "dur": 9, "args": { "External id": 105019, "cbid": 211, "correlation": 105019 } }, { "ph": "s", "id": 105019, "pid": 76337, "tid": -914061504, "ts": 1716454222973515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222973577, "dur": 1, "args": { "External id": 105035, "cbid": 251, "correlation": 105035 } }, { "ph": "f", "id": 105035, "pid": 76337, "tid": -914061504, "ts": 1716454222973577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222973582, "dur": 0, "args": { "External id": 105037, "cbid": 251, "correlation": 105037 } }, { "ph": "f", "id": 105037, "pid": 76337, "tid": -914061504, "ts": 1716454222973582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223027846, "dur": 12, "args": { "External id": 105038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105038, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105038, "pid": 5, "tid": 7, "ts": 1716454223027846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973584, "dur": 12, "args": { "External id": 105038, "cbid": 211, "correlation": 105038 } }, { "ph": "s", "id": 105038, "pid": 76337, "tid": -914061504, "ts": 1716454222973584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223027859, "dur": 5, "args": { "External id": 105040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105040, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105040, "pid": 5, "tid": 7, "ts": 1716454223027859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973598, "dur": 5, "args": { "External id": 105040, "cbid": 211, "correlation": 105040 } }, { "ph": "s", "id": 105040, "pid": 76337, "tid": -914061504, "ts": 1716454222973598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223027866, "dur": 6, "args": { "External id": 105050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105050, "pid": 5, "tid": 7, "ts": 1716454223027866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973654, "dur": 11, "args": { "External id": 105050, "cbid": 211, "correlation": 105050 } }, { "ph": "s", "id": 105050, "pid": 76337, "tid": -914061504, "ts": 1716454222973654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223027873, "dur": 9, "args": { "External id": 105070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105070, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105070, "pid": 5, "tid": 7, "ts": 1716454223027873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973718, "dur": 10, "args": { "External id": 105070, "cbid": 211, "correlation": 105070 } }, { "ph": "s", "id": 105070, "pid": 76337, "tid": -914061504, "ts": 1716454222973718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223027883, "dur": 4, "args": { "External id": 105082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105082, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 105082, "pid": 5, "tid": 7, "ts": 1716454223027883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973738, "dur": 6, "args": { "External id": 105082, "cbid": 211, "correlation": 105082 } }, { "ph": "s", "id": 105082, "pid": 76337, "tid": -914061504, "ts": 1716454222973738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223027888, "dur": 7, "args": { "External id": 105085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105085, "pid": 5, "tid": 7, "ts": 1716454223027888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973758, "dur": 6, "args": { "External id": 105085, "cbid": 211, "correlation": 105085 } }, { "ph": "s", "id": 105085, "pid": 76337, "tid": -914061504, "ts": 1716454222973758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223027896, "dur": 4, "args": { "External id": 105094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105094, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105094, "pid": 5, "tid": 7, "ts": 1716454223027896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973799, "dur": 10, "args": { "External id": 105094, "cbid": 211, "correlation": 105094 } }, { "ph": "s", "id": 105094, "pid": 76337, "tid": -914061504, "ts": 1716454222973799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222973862, "dur": 0, "args": { "External id": 105104, "cbid": 317, "correlation": 105104 } }, { "ph": "f", "id": 105104, "pid": 76337, "tid": -914061504, "ts": 1716454222973862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222973863, "dur": 0, "args": { "External id": 105105, "cbid": 203, "correlation": 105105 } }, { "ph": "f", "id": 105105, "pid": 76337, "tid": -914061504, "ts": 1716454222973863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222973864, "dur": 0, "args": { "External id": 105106, "cbid": 205, "correlation": 105106 } }, { "ph": "f", "id": 105106, "pid": 76337, "tid": -914061504, "ts": 1716454222973864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223027902, "dur": 5, "args": { "External id": 105110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105110, "pid": 5, "tid": 7, "ts": 1716454223027902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973877, "dur": 12, "args": { "External id": 105110, "cbid": 211, "correlation": 105110 } }, { "ph": "s", "id": 105110, "pid": 76337, "tid": -914061504, "ts": 1716454222973877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223027908, "dur": 159, "args": { "External id": 105112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105112, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105112, "pid": 5, "tid": 7, "ts": 1716454223027908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973892, "dur": 6, "args": { "External id": 105112, "cbid": 211, "correlation": 105112 } }, { "ph": "s", "id": 105112, "pid": 76337, "tid": -914061504, "ts": 1716454222973892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223028070, "dur": 1, "args": { "External id": 105114, "device": 5, "context": 1, "stream": 7, "correlation": 105114, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 105114, "pid": 5, "tid": 7, "ts": 1716454223028070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222973903, "dur": 6, "args": { "External id": 105114, "cbid": 51, "correlation": 105114 } }, { "ph": "s", "id": 105114, "pid": 76337, "tid": -914061504, "ts": 1716454222973903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223028073, "dur": 254, "args": { "External id": 105115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105115, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105115, "pid": 5, "tid": 7, "ts": 1716454223028073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973910, "dur": 6, "args": { "External id": 105115, "cbid": 211, "correlation": 105115 } }, { "ph": "s", "id": 105115, "pid": 76337, "tid": -914061504, "ts": 1716454222973910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223028329, "dur": 6, "args": { "External id": 105117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105117, "pid": 5, "tid": 7, "ts": 1716454223028329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973920, "dur": 5, "args": { "External id": 105117, "cbid": 211, "correlation": 105117 } }, { "ph": "s", "id": 105117, "pid": 76337, "tid": -914061504, "ts": 1716454222973920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223028336, "dur": 6, "args": { "External id": 105123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105123, "pid": 5, "tid": 7, "ts": 1716454223028336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222973949, "dur": 9, "args": { "External id": 105123, "cbid": 211, "correlation": 105123 } }, { "ph": "s", "id": 105123, "pid": 76337, "tid": -914061504, "ts": 1716454222973949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222974018, "dur": 0, "args": { "External id": 105133, "cbid": 317, "correlation": 105133 } }, { "ph": "f", "id": 105133, "pid": 76337, "tid": -914061504, "ts": 1716454222974018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222974019, "dur": 0, "args": { "External id": 105134, "cbid": 203, "correlation": 105134 } }, { "ph": "f", "id": 105134, "pid": 76337, "tid": -914061504, "ts": 1716454222974019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222974020, "dur": 0, "args": { "External id": 105135, "cbid": 205, "correlation": 105135 } }, { "ph": "f", "id": 105135, "pid": 76337, "tid": -914061504, "ts": 1716454222974020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223028343, "dur": 7, "args": { "External id": 105139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105139, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105139, "pid": 5, "tid": 7, "ts": 1716454223028343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974031, "dur": 12, "args": { "External id": 105139, "cbid": 211, "correlation": 105139 } }, { "ph": "s", "id": 105139, "pid": 76337, "tid": -914061504, "ts": 1716454222974031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223028352, "dur": 3, "args": { "External id": 105141, "device": 5, "context": 1, "stream": 7, "correlation": 105141, "bytes": 4800, "memory bandwidth (GB/s)": 1.3888888888888888 } }, { "ph": "f", "id": 105141, "pid": 5, "tid": 7, "ts": 1716454223028352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222974049, "dur": 10, "args": { "External id": 105141, "cbid": 51, "correlation": 105141 } }, { "ph": "s", "id": 105141, "pid": 76337, "tid": -914061504, "ts": 1716454222974049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223028356, "dur": 94, "args": { "External id": 105142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105142, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 105142, "pid": 5, "tid": 7, "ts": 1716454223028356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974060, "dur": 6, "args": { "External id": 105142, "cbid": 211, "correlation": 105142 } }, { "ph": "s", "id": 105142, "pid": 76337, "tid": -914061504, "ts": 1716454222974060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223028452, "dur": 6, "args": { "External id": 105144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105144, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105144, "pid": 5, "tid": 7, "ts": 1716454223028452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974070, "dur": 5, "args": { "External id": 105144, "cbid": 211, "correlation": 105144 } }, { "ph": "s", "id": 105144, "pid": 76337, "tid": -914061504, "ts": 1716454222974070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223028459, "dur": 6, "args": { "External id": 105150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105150, "pid": 5, "tid": 7, "ts": 1716454223028459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974098, "dur": 9, "args": { "External id": 105150, "cbid": 211, "correlation": 105150 } }, { "ph": "s", "id": 105150, "pid": 76337, "tid": -914061504, "ts": 1716454222974098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223028466, "dur": 5, "args": { "External id": 105158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105158, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105158, "pid": 5, "tid": 7, "ts": 1716454223028466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974127, "dur": 8, "args": { "External id": 105158, "cbid": 211, "correlation": 105158 } }, { "ph": "s", "id": 105158, "pid": 76337, "tid": -914061504, "ts": 1716454222974127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223028472, "dur": 4, "args": { "External id": 105166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105166, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105166, "pid": 5, "tid": 7, "ts": 1716454223028472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974156, "dur": 9, "args": { "External id": 105166, "cbid": 211, "correlation": 105166 } }, { "ph": "s", "id": 105166, "pid": 76337, "tid": -914061504, "ts": 1716454222974156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223028478, "dur": 11, "args": { "External id": 105175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105175, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105175, "pid": 5, "tid": 7, "ts": 1716454223028478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974201, "dur": 10, "args": { "External id": 105175, "cbid": 211, "correlation": 105175 } }, { "ph": "s", "id": 105175, "pid": 76337, "tid": -914061504, "ts": 1716454222974201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223028490, "dur": 12, "args": { "External id": 105195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105195, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105195, "pid": 5, "tid": 7, "ts": 1716454223028490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974270, "dur": 11, "args": { "External id": 105195, "cbid": 211, "correlation": 105195 } }, { "ph": "s", "id": 105195, "pid": 76337, "tid": -914061504, "ts": 1716454222974270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223028504, "dur": 5, "args": { "External id": 105207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105207, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105207, "pid": 5, "tid": 7, "ts": 1716454223028504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974291, "dur": 7, "args": { "External id": 105207, "cbid": 211, "correlation": 105207 } }, { "ph": "s", "id": 105207, "pid": 76337, "tid": -914061504, "ts": 1716454222974291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223028510, "dur": 11, "args": { "External id": 105210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105210, "pid": 5, "tid": 7, "ts": 1716454223028510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974309, "dur": 7, "args": { "External id": 105210, "cbid": 211, "correlation": 105210 } }, { "ph": "s", "id": 105210, "pid": 76337, "tid": -914061504, "ts": 1716454222974309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223028522, "dur": 6, "args": { "External id": 105219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105219, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105219, "pid": 5, "tid": 7, "ts": 1716454223028522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974348, "dur": 9, "args": { "External id": 105219, "cbid": 211, "correlation": 105219 } }, { "ph": "s", "id": 105219, "pid": 76337, "tid": -914061504, "ts": 1716454222974348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222974399, "dur": 0, "args": { "External id": 105229, "cbid": 317, "correlation": 105229 } }, { "ph": "f", "id": 105229, "pid": 76337, "tid": -914061504, "ts": 1716454222974399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222974399, "dur": 0, "args": { "External id": 105230, "cbid": 203, "correlation": 105230 } }, { "ph": "f", "id": 105230, "pid": 76337, "tid": -914061504, "ts": 1716454222974399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222974400, "dur": 0, "args": { "External id": 105231, "cbid": 205, "correlation": 105231 } }, { "ph": "f", "id": 105231, "pid": 76337, "tid": -914061504, "ts": 1716454222974400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223028529, "dur": 7, "args": { "External id": 105235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105235, "pid": 5, "tid": 7, "ts": 1716454223028529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974414, "dur": 11, "args": { "External id": 105235, "cbid": 211, "correlation": 105235 } }, { "ph": "s", "id": 105235, "pid": 76337, "tid": -914061504, "ts": 1716454222974414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223028537, "dur": 312, "args": { "External id": 105237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105237, "pid": 5, "tid": 7, "ts": 1716454223028537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974427, "dur": 5, "args": { "External id": 105237, "cbid": 211, "correlation": 105237 } }, { "ph": "s", "id": 105237, "pid": 76337, "tid": -914061504, "ts": 1716454222974427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223028851, "dur": 1, "args": { "External id": 105239, "device": 5, "context": 1, "stream": 7, "correlation": 105239, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 105239, "pid": 5, "tid": 7, "ts": 1716454223028851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222974438, "dur": 6, "args": { "External id": 105239, "cbid": 51, "correlation": 105239 } }, { "ph": "s", "id": 105239, "pid": 76337, "tid": -914061504, "ts": 1716454222974438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223028855, "dur": 488, "args": { "External id": 105240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105240, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105240, "pid": 5, "tid": 7, "ts": 1716454223028855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974445, "dur": 7, "args": { "External id": 105240, "cbid": 211, "correlation": 105240 } }, { "ph": "s", "id": 105240, "pid": 76337, "tid": -914061504, "ts": 1716454222974445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029345, "dur": 5, "args": { "External id": 105242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105242, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105242, "pid": 5, "tid": 7, "ts": 1716454223029345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974455, "dur": 5, "args": { "External id": 105242, "cbid": 211, "correlation": 105242 } }, { "ph": "s", "id": 105242, "pid": 76337, "tid": -914061504, "ts": 1716454222974455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223029351, "dur": 6, "args": { "External id": 105248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105248, "pid": 5, "tid": 7, "ts": 1716454223029351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974483, "dur": 8, "args": { "External id": 105248, "cbid": 211, "correlation": 105248 } }, { "ph": "s", "id": 105248, "pid": 76337, "tid": -914061504, "ts": 1716454222974483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223029359, "dur": 3, "args": { "External id": 105256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105256, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 105256, "pid": 5, "tid": 7, "ts": 1716454223029359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974525, "dur": 9, "args": { "External id": 105256, "cbid": 211, "correlation": 105256 } }, { "ph": "s", "id": 105256, "pid": 76337, "tid": -914061504, "ts": 1716454222974525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222974586, "dur": 1, "args": { "External id": 105272, "cbid": 251, "correlation": 105272 } }, { "ph": "f", "id": 105272, "pid": 76337, "tid": -914061504, "ts": 1716454222974586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222974591, "dur": 0, "args": { "External id": 105274, "cbid": 251, "correlation": 105274 } }, { "ph": "f", "id": 105274, "pid": 76337, "tid": -914061504, "ts": 1716454222974591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223029363, "dur": 13, "args": { "External id": 105275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105275, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105275, "pid": 5, "tid": 7, "ts": 1716454223029363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974593, "dur": 12, "args": { "External id": 105275, "cbid": 211, "correlation": 105275 } }, { "ph": "s", "id": 105275, "pid": 76337, "tid": -914061504, "ts": 1716454222974593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223029378, "dur": 5, "args": { "External id": 105277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105277, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105277, "pid": 5, "tid": 7, "ts": 1716454223029378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974606, "dur": 5, "args": { "External id": 105277, "cbid": 211, "correlation": 105277 } }, { "ph": "s", "id": 105277, "pid": 76337, "tid": -914061504, "ts": 1716454222974606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223029384, "dur": 6, "args": { "External id": 105287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105287, "pid": 5, "tid": 7, "ts": 1716454223029384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974662, "dur": 11, "args": { "External id": 105287, "cbid": 211, "correlation": 105287 } }, { "ph": "s", "id": 105287, "pid": 76337, "tid": -914061504, "ts": 1716454222974662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223029391, "dur": 9, "args": { "External id": 105307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105307, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105307, "pid": 5, "tid": 7, "ts": 1716454223029391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974727, "dur": 11, "args": { "External id": 105307, "cbid": 211, "correlation": 105307 } }, { "ph": "s", "id": 105307, "pid": 76337, "tid": -914061504, "ts": 1716454222974727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223029402, "dur": 4, "args": { "External id": 105319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105319, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 105319, "pid": 5, "tid": 7, "ts": 1716454223029402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974748, "dur": 6, "args": { "External id": 105319, "cbid": 211, "correlation": 105319 } }, { "ph": "s", "id": 105319, "pid": 76337, "tid": -914061504, "ts": 1716454222974748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223029406, "dur": 6, "args": { "External id": 105322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105322, "pid": 5, "tid": 7, "ts": 1716454223029406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974767, "dur": 6, "args": { "External id": 105322, "cbid": 211, "correlation": 105322 } }, { "ph": "s", "id": 105322, "pid": 76337, "tid": -914061504, "ts": 1716454222974767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223029414, "dur": 4, "args": { "External id": 105331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105331, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105331, "pid": 5, "tid": 7, "ts": 1716454223029414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974808, "dur": 10, "args": { "External id": 105331, "cbid": 211, "correlation": 105331 } }, { "ph": "s", "id": 105331, "pid": 76337, "tid": -914061504, "ts": 1716454222974808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222974870, "dur": 0, "args": { "External id": 105341, "cbid": 317, "correlation": 105341 } }, { "ph": "f", "id": 105341, "pid": 76337, "tid": -914061504, "ts": 1716454222974870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222974871, "dur": 0, "args": { "External id": 105342, "cbid": 203, "correlation": 105342 } }, { "ph": "f", "id": 105342, "pid": 76337, "tid": -914061504, "ts": 1716454222974871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222974872, "dur": 0, "args": { "External id": 105343, "cbid": 205, "correlation": 105343 } }, { "ph": "f", "id": 105343, "pid": 76337, "tid": -914061504, "ts": 1716454222974872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029420, "dur": 5, "args": { "External id": 105347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105347, "pid": 5, "tid": 7, "ts": 1716454223029420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974886, "dur": 12, "args": { "External id": 105347, "cbid": 211, "correlation": 105347 } }, { "ph": "s", "id": 105347, "pid": 76337, "tid": -914061504, "ts": 1716454222974886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029426, "dur": 160, "args": { "External id": 105349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105349, "pid": 5, "tid": 7, "ts": 1716454223029426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974900, "dur": 6, "args": { "External id": 105349, "cbid": 211, "correlation": 105349 } }, { "ph": "s", "id": 105349, "pid": 76337, "tid": -914061504, "ts": 1716454222974900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223029588, "dur": 1, "args": { "External id": 105351, "device": 5, "context": 1, "stream": 7, "correlation": 105351, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 105351, "pid": 5, "tid": 7, "ts": 1716454223029588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222974911, "dur": 7, "args": { "External id": 105351, "cbid": 51, "correlation": 105351 } }, { "ph": "s", "id": 105351, "pid": 76337, "tid": -914061504, "ts": 1716454222974911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223029592, "dur": 254, "args": { "External id": 105352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105352, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105352, "pid": 5, "tid": 7, "ts": 1716454223029592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974919, "dur": 6, "args": { "External id": 105352, "cbid": 211, "correlation": 105352 } }, { "ph": "s", "id": 105352, "pid": 76337, "tid": -914061504, "ts": 1716454222974919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029847, "dur": 6, "args": { "External id": 105354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105354, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105354, "pid": 5, "tid": 7, "ts": 1716454223029847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974929, "dur": 5, "args": { "External id": 105354, "cbid": 211, "correlation": 105354 } }, { "ph": "s", "id": 105354, "pid": 76337, "tid": -914061504, "ts": 1716454222974929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223029854, "dur": 6, "args": { "External id": 105360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105360, "pid": 5, "tid": 7, "ts": 1716454223029854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222974958, "dur": 8, "args": { "External id": 105360, "cbid": 211, "correlation": 105360 } }, { "ph": "s", "id": 105360, "pid": 76337, "tid": -914061504, "ts": 1716454222974958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222975027, "dur": 0, "args": { "External id": 105370, "cbid": 317, "correlation": 105370 } }, { "ph": "f", "id": 105370, "pid": 76337, "tid": -914061504, "ts": 1716454222975027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222975028, "dur": 0, "args": { "External id": 105371, "cbid": 203, "correlation": 105371 } }, { "ph": "f", "id": 105371, "pid": 76337, "tid": -914061504, "ts": 1716454222975028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222975028, "dur": 0, "args": { "External id": 105372, "cbid": 205, "correlation": 105372 } }, { "ph": "f", "id": 105372, "pid": 76337, "tid": -914061504, "ts": 1716454222975028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029861, "dur": 8, "args": { "External id": 105376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105376, "pid": 5, "tid": 7, "ts": 1716454223029861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975040, "dur": 13, "args": { "External id": 105376, "cbid": 211, "correlation": 105376 } }, { "ph": "s", "id": 105376, "pid": 76337, "tid": -914061504, "ts": 1716454222975040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223029871, "dur": 3, "args": { "External id": 105378, "device": 5, "context": 1, "stream": 7, "correlation": 105378, "bytes": 4800, "memory bandwidth (GB/s)": 1.514673398548438 } }, { "ph": "f", "id": 105378, "pid": 5, "tid": 7, "ts": 1716454223029871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222975058, "dur": 9, "args": { "External id": 105378, "cbid": 51, "correlation": 105378 } }, { "ph": "s", "id": 105378, "pid": 76337, "tid": -914061504, "ts": 1716454222975058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223029875, "dur": 93, "args": { "External id": 105379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105379, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 105379, "pid": 5, "tid": 7, "ts": 1716454223029875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975068, "dur": 6, "args": { "External id": 105379, "cbid": 211, "correlation": 105379 } }, { "ph": "s", "id": 105379, "pid": 76337, "tid": -914061504, "ts": 1716454222975068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223029969, "dur": 5, "args": { "External id": 105381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105381, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105381, "pid": 5, "tid": 7, "ts": 1716454223029969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975078, "dur": 5, "args": { "External id": 105381, "cbid": 211, "correlation": 105381 } }, { "ph": "s", "id": 105381, "pid": 76337, "tid": -914061504, "ts": 1716454222975078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223029976, "dur": 6, "args": { "External id": 105387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105387, "pid": 5, "tid": 7, "ts": 1716454223029976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975105, "dur": 8, "args": { "External id": 105387, "cbid": 211, "correlation": 105387 } }, { "ph": "s", "id": 105387, "pid": 76337, "tid": -914061504, "ts": 1716454222975105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223029983, "dur": 5, "args": { "External id": 105395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105395, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105395, "pid": 5, "tid": 7, "ts": 1716454223029983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975134, "dur": 8, "args": { "External id": 105395, "cbid": 211, "correlation": 105395 } }, { "ph": "s", "id": 105395, "pid": 76337, "tid": -914061504, "ts": 1716454222975134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223029990, "dur": 4, "args": { "External id": 105403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105403, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 105403, "pid": 5, "tid": 7, "ts": 1716454223029990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975163, "dur": 8, "args": { "External id": 105403, "cbid": 211, "correlation": 105403 } }, { "ph": "s", "id": 105403, "pid": 76337, "tid": -914061504, "ts": 1716454222975163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223029995, "dur": 14, "args": { "External id": 105414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105414, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105414, "pid": 5, "tid": 7, "ts": 1716454223029995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975245, "dur": 13, "args": { "External id": 105414, "cbid": 211, "correlation": 105414 } }, { "ph": "s", "id": 105414, "pid": 76337, "tid": -914061504, "ts": 1716454222975245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222975302, "dur": 0, "args": { "External id": 105424, "cbid": 317, "correlation": 105424 } }, { "ph": "f", "id": 105424, "pid": 76337, "tid": -914061504, "ts": 1716454222975302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222975303, "dur": 0, "args": { "External id": 105425, "cbid": 203, "correlation": 105425 } }, { "ph": "f", "id": 105425, "pid": 76337, "tid": -914061504, "ts": 1716454222975303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222975304, "dur": 0, "args": { "External id": 105426, "cbid": 205, "correlation": 105426 } }, { "ph": "f", "id": 105426, "pid": 76337, "tid": -914061504, "ts": 1716454222975304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223030011, "dur": 8, "args": { "External id": 105430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105430, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105430, "pid": 5, "tid": 7, "ts": 1716454223030011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975317, "dur": 11, "args": { "External id": 105430, "cbid": 211, "correlation": 105430 } }, { "ph": "s", "id": 105430, "pid": 76337, "tid": -914061504, "ts": 1716454222975317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223030020, "dur": 161, "args": { "External id": 105432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105432, "pid": 5, "tid": 7, "ts": 1716454223030020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975330, "dur": 5, "args": { "External id": 105432, "cbid": 211, "correlation": 105432 } }, { "ph": "s", "id": 105432, "pid": 76337, "tid": -914061504, "ts": 1716454222975330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223030183, "dur": 1, "args": { "External id": 105434, "device": 5, "context": 1, "stream": 7, "correlation": 105434, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 105434, "pid": 5, "tid": 7, "ts": 1716454223030183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222975341, "dur": 7, "args": { "External id": 105434, "cbid": 51, "correlation": 105434 } }, { "ph": "s", "id": 105434, "pid": 76337, "tid": -914061504, "ts": 1716454222975341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223030187, "dur": 643, "args": { "External id": 105435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105435, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105435, "pid": 5, "tid": 7, "ts": 1716454223030187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975349, "dur": 6, "args": { "External id": 105435, "cbid": 211, "correlation": 105435 } }, { "ph": "s", "id": 105435, "pid": 76337, "tid": -914061504, "ts": 1716454222975349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223030831, "dur": 11, "args": { "External id": 105437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105437, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105437, "pid": 5, "tid": 7, "ts": 1716454223030831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975359, "dur": 5, "args": { "External id": 105437, "cbid": 211, "correlation": 105437 } }, { "ph": "s", "id": 105437, "pid": 76337, "tid": -914061504, "ts": 1716454222975359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223030843, "dur": 14, "args": { "External id": 105443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105443, "pid": 5, "tid": 7, "ts": 1716454223030843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975387, "dur": 9, "args": { "External id": 105443, "cbid": 211, "correlation": 105443 } }, { "ph": "s", "id": 105443, "pid": 76337, "tid": -914061504, "ts": 1716454222975387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223030859, "dur": 30, "args": { "External id": 105452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105452, "pid": 5, "tid": 7, "ts": 1716454223030859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975476, "dur": 12, "args": { "External id": 105452, "cbid": 211, "correlation": 105452 } }, { "ph": "s", "id": 105452, "pid": 76337, "tid": -914061504, "ts": 1716454222975476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223030890, "dur": 30, "args": { "External id": 105472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105472, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105472, "pid": 5, "tid": 7, "ts": 1716454223030890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975544, "dur": 12, "args": { "External id": 105472, "cbid": 211, "correlation": 105472 } }, { "ph": "s", "id": 105472, "pid": 76337, "tid": -914061504, "ts": 1716454222975544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223030921, "dur": 4, "args": { "External id": 105484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105484, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105484, "pid": 5, "tid": 7, "ts": 1716454223030921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975566, "dur": 6, "args": { "External id": 105484, "cbid": 211, "correlation": 105484 } }, { "ph": "s", "id": 105484, "pid": 76337, "tid": -914061504, "ts": 1716454222975566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223030927, "dur": 31, "args": { "External id": 105487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105487, "pid": 5, "tid": 7, "ts": 1716454223030927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975584, "dur": 8, "args": { "External id": 105487, "cbid": 211, "correlation": 105487 } }, { "ph": "s", "id": 105487, "pid": 76337, "tid": -914061504, "ts": 1716454222975584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223030960, "dur": 21, "args": { "External id": 105496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105496, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105496, "pid": 5, "tid": 7, "ts": 1716454223030960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975624, "dur": 10, "args": { "External id": 105496, "cbid": 211, "correlation": 105496 } }, { "ph": "s", "id": 105496, "pid": 76337, "tid": -914061504, "ts": 1716454222975624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222975676, "dur": 0, "args": { "External id": 105506, "cbid": 317, "correlation": 105506 } }, { "ph": "f", "id": 105506, "pid": 76337, "tid": -914061504, "ts": 1716454222975676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222975677, "dur": 0, "args": { "External id": 105507, "cbid": 203, "correlation": 105507 } }, { "ph": "f", "id": 105507, "pid": 76337, "tid": -914061504, "ts": 1716454222975677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222975678, "dur": 0, "args": { "External id": 105508, "cbid": 205, "correlation": 105508 } }, { "ph": "f", "id": 105508, "pid": 76337, "tid": -914061504, "ts": 1716454222975678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223030981, "dur": 23, "args": { "External id": 105512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105512, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105512, "pid": 5, "tid": 7, "ts": 1716454223030981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975692, "dur": 13, "args": { "External id": 105512, "cbid": 211, "correlation": 105512 } }, { "ph": "s", "id": 105512, "pid": 76337, "tid": -914061504, "ts": 1716454222975692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223031006, "dur": 314, "args": { "External id": 105514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105514, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105514, "pid": 5, "tid": 7, "ts": 1716454223031006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975708, "dur": 5, "args": { "External id": 105514, "cbid": 211, "correlation": 105514 } }, { "ph": "s", "id": 105514, "pid": 76337, "tid": -914061504, "ts": 1716454222975708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223031322, "dur": 1, "args": { "External id": 105516, "device": 5, "context": 1, "stream": 7, "correlation": 105516, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 105516, "pid": 5, "tid": 7, "ts": 1716454223031322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222975719, "dur": 7, "args": { "External id": 105516, "cbid": 51, "correlation": 105516 } }, { "ph": "s", "id": 105516, "pid": 76337, "tid": -914061504, "ts": 1716454222975719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223031325, "dur": 1227, "args": { "External id": 105517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105517, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105517, "pid": 5, "tid": 7, "ts": 1716454223031325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975727, "dur": 6, "args": { "External id": 105517, "cbid": 211, "correlation": 105517 } }, { "ph": "s", "id": 105517, "pid": 76337, "tid": -914061504, "ts": 1716454222975727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223032553, "dur": 12, "args": { "External id": 105519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105519, "pid": 5, "tid": 7, "ts": 1716454223032553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975737, "dur": 5, "args": { "External id": 105519, "cbid": 211, "correlation": 105519 } }, { "ph": "s", "id": 105519, "pid": 76337, "tid": -914061504, "ts": 1716454222975737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223032567, "dur": 14, "args": { "External id": 105525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105525, "pid": 5, "tid": 7, "ts": 1716454223032567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975765, "dur": 9, "args": { "External id": 105525, "cbid": 211, "correlation": 105525 } }, { "ph": "s", "id": 105525, "pid": 76337, "tid": -914061504, "ts": 1716454222975765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223032583, "dur": 3, "args": { "External id": 105533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105533, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 105533, "pid": 5, "tid": 7, "ts": 1716454223032583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975810, "dur": 9, "args": { "External id": 105533, "cbid": 211, "correlation": 105533 } }, { "ph": "s", "id": 105533, "pid": 76337, "tid": -914061504, "ts": 1716454222975810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222975872, "dur": 1, "args": { "External id": 105549, "cbid": 251, "correlation": 105549 } }, { "ph": "f", "id": 105549, "pid": 76337, "tid": -914061504, "ts": 1716454222975872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222975878, "dur": 0, "args": { "External id": 105551, "cbid": 251, "correlation": 105551 } }, { "ph": "f", "id": 105551, "pid": 76337, "tid": -914061504, "ts": 1716454222975878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223032587, "dur": 12, "args": { "External id": 105552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105552, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105552, "pid": 5, "tid": 7, "ts": 1716454223032587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975880, "dur": 11, "args": { "External id": 105552, "cbid": 211, "correlation": 105552 } }, { "ph": "s", "id": 105552, "pid": 76337, "tid": -914061504, "ts": 1716454222975880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223032601, "dur": 5, "args": { "External id": 105554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105554, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105554, "pid": 5, "tid": 7, "ts": 1716454223032601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975893, "dur": 6, "args": { "External id": 105554, "cbid": 211, "correlation": 105554 } }, { "ph": "s", "id": 105554, "pid": 76337, "tid": -914061504, "ts": 1716454222975893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223032607, "dur": 17, "args": { "External id": 105564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105564, "pid": 5, "tid": 7, "ts": 1716454223032607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222975950, "dur": 12, "args": { "External id": 105564, "cbid": 211, "correlation": 105564 } }, { "ph": "s", "id": 105564, "pid": 76337, "tid": -914061504, "ts": 1716454222975950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223032625, "dur": 17, "args": { "External id": 105584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105584, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105584, "pid": 5, "tid": 7, "ts": 1716454223032625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976026, "dur": 11, "args": { "External id": 105584, "cbid": 211, "correlation": 105584 } }, { "ph": "s", "id": 105584, "pid": 76337, "tid": -914061504, "ts": 1716454222976026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223032644, "dur": 4, "args": { "External id": 105596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105596, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 105596, "pid": 5, "tid": 7, "ts": 1716454223032644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976047, "dur": 7, "args": { "External id": 105596, "cbid": 211, "correlation": 105596 } }, { "ph": "s", "id": 105596, "pid": 76337, "tid": -914061504, "ts": 1716454222976047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223032649, "dur": 16, "args": { "External id": 105599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105599, "pid": 5, "tid": 7, "ts": 1716454223032649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976067, "dur": 6, "args": { "External id": 105599, "cbid": 211, "correlation": 105599 } }, { "ph": "s", "id": 105599, "pid": 76337, "tid": -914061504, "ts": 1716454222976067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223032666, "dur": 11, "args": { "External id": 105608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105608, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105608, "pid": 5, "tid": 7, "ts": 1716454223032666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976108, "dur": 10, "args": { "External id": 105608, "cbid": 211, "correlation": 105608 } }, { "ph": "s", "id": 105608, "pid": 76337, "tid": -914061504, "ts": 1716454222976108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222976172, "dur": 0, "args": { "External id": 105618, "cbid": 317, "correlation": 105618 } }, { "ph": "f", "id": 105618, "pid": 76337, "tid": -914061504, "ts": 1716454222976172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222976173, "dur": 0, "args": { "External id": 105619, "cbid": 203, "correlation": 105619 } }, { "ph": "f", "id": 105619, "pid": 76337, "tid": -914061504, "ts": 1716454222976173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222976174, "dur": 0, "args": { "External id": 105620, "cbid": 205, "correlation": 105620 } }, { "ph": "f", "id": 105620, "pid": 76337, "tid": -914061504, "ts": 1716454222976174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223032678, "dur": 12, "args": { "External id": 105624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105624, "pid": 5, "tid": 7, "ts": 1716454223032678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976187, "dur": 12, "args": { "External id": 105624, "cbid": 211, "correlation": 105624 } }, { "ph": "s", "id": 105624, "pid": 76337, "tid": -914061504, "ts": 1716454222976187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223032692, "dur": 159, "args": { "External id": 105626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105626, "pid": 5, "tid": 7, "ts": 1716454223032692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976201, "dur": 6, "args": { "External id": 105626, "cbid": 211, "correlation": 105626 } }, { "ph": "s", "id": 105626, "pid": 76337, "tid": -914061504, "ts": 1716454222976201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223032853, "dur": 1, "args": { "External id": 105628, "device": 5, "context": 1, "stream": 7, "correlation": 105628, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 105628, "pid": 5, "tid": 7, "ts": 1716454223032853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222976213, "dur": 6, "args": { "External id": 105628, "cbid": 51, "correlation": 105628 } }, { "ph": "s", "id": 105628, "pid": 76337, "tid": -914061504, "ts": 1716454222976213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223032857, "dur": 638, "args": { "External id": 105629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105629, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105629, "pid": 5, "tid": 7, "ts": 1716454223032857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976220, "dur": 6, "args": { "External id": 105629, "cbid": 211, "correlation": 105629 } }, { "ph": "s", "id": 105629, "pid": 76337, "tid": -914061504, "ts": 1716454222976220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223033497, "dur": 12, "args": { "External id": 105631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105631, "pid": 5, "tid": 7, "ts": 1716454223033497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976230, "dur": 5, "args": { "External id": 105631, "cbid": 211, "correlation": 105631 } }, { "ph": "s", "id": 105631, "pid": 76337, "tid": -914061504, "ts": 1716454222976230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223033511, "dur": 14, "args": { "External id": 105637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105637, "pid": 5, "tid": 7, "ts": 1716454223033511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976259, "dur": 8, "args": { "External id": 105637, "cbid": 211, "correlation": 105637 } }, { "ph": "s", "id": 105637, "pid": 76337, "tid": -914061504, "ts": 1716454222976259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222976318, "dur": 0, "args": { "External id": 105647, "cbid": 317, "correlation": 105647 } }, { "ph": "f", "id": 105647, "pid": 76337, "tid": -914061504, "ts": 1716454222976318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222976319, "dur": 0, "args": { "External id": 105648, "cbid": 203, "correlation": 105648 } }, { "ph": "f", "id": 105648, "pid": 76337, "tid": -914061504, "ts": 1716454222976319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222976319, "dur": 0, "args": { "External id": 105649, "cbid": 205, "correlation": 105649 } }, { "ph": "f", "id": 105649, "pid": 76337, "tid": -914061504, "ts": 1716454222976319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223033526, "dur": 21, "args": { "External id": 105653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105653, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105653, "pid": 5, "tid": 7, "ts": 1716454223033526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976333, "dur": 12, "args": { "External id": 105653, "cbid": 211, "correlation": 105653 } }, { "ph": "s", "id": 105653, "pid": 76337, "tid": -914061504, "ts": 1716454222976333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223033549, "dur": 4, "args": { "External id": 105655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 105655, "pid": 5, "tid": 7, "ts": 1716454223033549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976350, "dur": 7, "args": { "External id": 105655, "cbid": 211, "correlation": 105655 } }, { "ph": "s", "id": 105655, "pid": 76337, "tid": -914061504, "ts": 1716454222976350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222976361, "dur": 0, "args": { "External id": 105656, "cbid": 51, "correlation": 105656 } }, { "ph": "s", "id": 105656, "pid": 76337, "tid": -914061504, "ts": 1716454222976361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223033554, "dur": 172, "args": { "External id": 105657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105657, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 105657, "pid": 5, "tid": 7, "ts": 1716454223033554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976362, "dur": 5, "args": { "External id": 105657, "cbid": 211, "correlation": 105657 } }, { "ph": "s", "id": 105657, "pid": 76337, "tid": -914061504, "ts": 1716454222976362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223033728, "dur": 15, "args": { "External id": 105662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105662, "pid": 5, "tid": 7, "ts": 1716454223033728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976388, "dur": 8, "args": { "External id": 105662, "cbid": 211, "correlation": 105662 } }, { "ph": "s", "id": 105662, "pid": 76337, "tid": -914061504, "ts": 1716454222976388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223033745, "dur": 12, "args": { "External id": 105670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105670, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105670, "pid": 5, "tid": 7, "ts": 1716454223033745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976417, "dur": 8, "args": { "External id": 105670, "cbid": 211, "correlation": 105670 } }, { "ph": "s", "id": 105670, "pid": 76337, "tid": -914061504, "ts": 1716454222976417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223033758, "dur": 10, "args": { "External id": 105678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105678, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105678, "pid": 5, "tid": 7, "ts": 1716454223033758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976445, "dur": 8, "args": { "External id": 105678, "cbid": 211, "correlation": 105678 } }, { "ph": "s", "id": 105678, "pid": 76337, "tid": -914061504, "ts": 1716454222976445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223033769, "dur": 18, "args": { "External id": 105698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105698, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 105698, "pid": 5, "tid": 7, "ts": 1716454223033769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976527, "dur": 12, "args": { "External id": 105698, "cbid": 211, "correlation": 105698 } }, { "ph": "s", "id": 105698, "pid": 76337, "tid": -914061504, "ts": 1716454222976527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223033787, "dur": 4, "args": { "External id": 105710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105710, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 105710, "pid": 5, "tid": 7, "ts": 1716454223033787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976549, "dur": 7, "args": { "External id": 105710, "cbid": 211, "correlation": 105710 } }, { "ph": "s", "id": 105710, "pid": 76337, "tid": -914061504, "ts": 1716454222976549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223033793, "dur": 16, "args": { "External id": 105713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105713, "pid": 5, "tid": 7, "ts": 1716454223033793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976568, "dur": 7, "args": { "External id": 105713, "cbid": 211, "correlation": 105713 } }, { "ph": "s", "id": 105713, "pid": 76337, "tid": -914061504, "ts": 1716454222976568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222976624, "dur": 0, "args": { "External id": 105724, "cbid": 317, "correlation": 105724 } }, { "ph": "f", "id": 105724, "pid": 76337, "tid": -914061504, "ts": 1716454222976624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222976625, "dur": 0, "args": { "External id": 105725, "cbid": 203, "correlation": 105725 } }, { "ph": "f", "id": 105725, "pid": 76337, "tid": -914061504, "ts": 1716454222976625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222976626, "dur": 0, "args": { "External id": 105726, "cbid": 205, "correlation": 105726 } }, { "ph": "f", "id": 105726, "pid": 76337, "tid": -914061504, "ts": 1716454222976626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223033811, "dur": 11, "args": { "External id": 105730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105730, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105730, "pid": 5, "tid": 7, "ts": 1716454223033811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976639, "dur": 12, "args": { "External id": 105730, "cbid": 211, "correlation": 105730 } }, { "ph": "s", "id": 105730, "pid": 76337, "tid": -914061504, "ts": 1716454222976639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223033823, "dur": 3, "args": { "External id": 105732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 105732, "pid": 5, "tid": 7, "ts": 1716454223033823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976655, "dur": 6, "args": { "External id": 105732, "cbid": 211, "correlation": 105732 } }, { "ph": "s", "id": 105732, "pid": 76337, "tid": -914061504, "ts": 1716454222976655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222976664, "dur": 0, "args": { "External id": 105733, "cbid": 51, "correlation": 105733 } }, { "ph": "s", "id": 105733, "pid": 76337, "tid": -914061504, "ts": 1716454222976664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223033828, "dur": 88, "args": { "External id": 105734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105734, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 105734, "pid": 5, "tid": 7, "ts": 1716454223033828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976665, "dur": 5, "args": { "External id": 105734, "cbid": 211, "correlation": 105734 } }, { "ph": "s", "id": 105734, "pid": 76337, "tid": -914061504, "ts": 1716454222976665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223033917, "dur": 15, "args": { "External id": 105739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105739, "pid": 5, "tid": 7, "ts": 1716454223033917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976691, "dur": 8, "args": { "External id": 105739, "cbid": 211, "correlation": 105739 } }, { "ph": "s", "id": 105739, "pid": 76337, "tid": -914061504, "ts": 1716454222976691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223033934, "dur": 82, "args": { "External id": 105748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105748, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105748, "pid": 5, "tid": 7, "ts": 1716454223033934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976773, "dur": 15, "args": { "External id": 105748, "cbid": 211, "correlation": 105748 } }, { "ph": "s", "id": 105748, "pid": 76337, "tid": -914061504, "ts": 1716454222976773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223034017, "dur": 30, "args": { "External id": 105770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105770, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105770, "pid": 5, "tid": 7, "ts": 1716454223034017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976831, "dur": 10, "args": { "External id": 105770, "cbid": 211, "correlation": 105770 } }, { "ph": "s", "id": 105770, "pid": 76337, "tid": -914061504, "ts": 1716454222976831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222976924, "dur": 1, "args": { "External id": 105781, "cbid": 251, "correlation": 105781 } }, { "ph": "f", "id": 105781, "pid": 76337, "tid": -914061504, "ts": 1716454222976924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223034048, "dur": 162, "args": { "External id": 105782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105782, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105782, "pid": 5, "tid": 7, "ts": 1716454223034048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222976929, "dur": 13, "args": { "External id": 105782, "cbid": 211, "correlation": 105782 } }, { "ph": "s", "id": 105782, "pid": 76337, "tid": -914061504, "ts": 1716454222976929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977011, "dur": 1, "args": { "External id": 105793, "cbid": 251, "correlation": 105793 } }, { "ph": "f", "id": 105793, "pid": 76337, "tid": -914061504, "ts": 1716454222977011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223034212, "dur": 156, "args": { "External id": 105794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105794, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105794, "pid": 5, "tid": 7, "ts": 1716454223034212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977015, "dur": 12, "args": { "External id": 105794, "cbid": 211, "correlation": 105794 } }, { "ph": "s", "id": 105794, "pid": 76337, "tid": -914061504, "ts": 1716454222977015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977082, "dur": 1, "args": { "External id": 105805, "cbid": 251, "correlation": 105805 } }, { "ph": "f", "id": 105805, "pid": 76337, "tid": -914061504, "ts": 1716454222977082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223034369, "dur": 157, "args": { "External id": 105806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105806, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105806, "pid": 5, "tid": 7, "ts": 1716454223034369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977086, "dur": 11, "args": { "External id": 105806, "cbid": 211, "correlation": 105806 } }, { "ph": "s", "id": 105806, "pid": 76337, "tid": -914061504, "ts": 1716454222977086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223034527, "dur": 330, "args": { "External id": 105831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105831, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105831, "pid": 5, "tid": 7, "ts": 1716454223034527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977174, "dur": 13, "args": { "External id": 105831, "cbid": 211, "correlation": 105831 } }, { "ph": "s", "id": 105831, "pid": 76337, "tid": -914061504, "ts": 1716454222977174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977276, "dur": 1, "args": { "External id": 105849, "cbid": 251, "correlation": 105849 } }, { "ph": "f", "id": 105849, "pid": 76337, "tid": -914061504, "ts": 1716454222977276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223034859, "dur": 165, "args": { "External id": 105851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105851, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105851, "pid": 5, "tid": 7, "ts": 1716454223034859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977282, "dur": 13, "args": { "External id": 105851, "cbid": 211, "correlation": 105851 } }, { "ph": "s", "id": 105851, "pid": 76337, "tid": -914061504, "ts": 1716454222977282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223035025, "dur": 19, "args": { "External id": 105859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105859, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105859, "pid": 5, "tid": 7, "ts": 1716454223035025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977353, "dur": 13, "args": { "External id": 105859, "cbid": 211, "correlation": 105859 } }, { "ph": "s", "id": 105859, "pid": 76337, "tid": -914061504, "ts": 1716454222977353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223035046, "dur": 27, "args": { "External id": 105867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105867, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105867, "pid": 5, "tid": 7, "ts": 1716454223035046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977392, "dur": 8, "args": { "External id": 105867, "cbid": 211, "correlation": 105867 } }, { "ph": "s", "id": 105867, "pid": 76337, "tid": -914061504, "ts": 1716454222977392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223035074, "dur": 19, "args": { "External id": 105878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105878, "pid": 5, "tid": 7, "ts": 1716454223035074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977467, "dur": 12, "args": { "External id": 105878, "cbid": 211, "correlation": 105878 } }, { "ph": "s", "id": 105878, "pid": 76337, "tid": -914061504, "ts": 1716454222977467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223035094, "dur": 16, "args": { "External id": 105900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105900, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105900, "pid": 5, "tid": 7, "ts": 1716454223035094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977497, "dur": 8, "args": { "External id": 105900, "cbid": 211, "correlation": 105900 } }, { "ph": "s", "id": 105900, "pid": 76337, "tid": -914061504, "ts": 1716454222977497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977584, "dur": 1, "args": { "External id": 105911, "cbid": 251, "correlation": 105911 } }, { "ph": "f", "id": 105911, "pid": 76337, "tid": -914061504, "ts": 1716454222977584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223035111, "dur": 87, "args": { "External id": 105912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105912, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 105912, "pid": 5, "tid": 7, "ts": 1716454223035111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977589, "dur": 13, "args": { "External id": 105912, "cbid": 211, "correlation": 105912 } }, { "ph": "s", "id": 105912, "pid": 76337, "tid": -914061504, "ts": 1716454222977589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977659, "dur": 1, "args": { "External id": 105923, "cbid": 251, "correlation": 105923 } }, { "ph": "f", "id": 105923, "pid": 76337, "tid": -914061504, "ts": 1716454222977659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977663, "dur": 0, "args": { "External id": 105924, "cbid": 251, "correlation": 105924 } }, { "ph": "f", "id": 105924, "pid": 76337, "tid": -914061504, "ts": 1716454222977663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223035200, "dur": 12, "args": { "External id": 105925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105925, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105925, "pid": 5, "tid": 7, "ts": 1716454223035200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977664, "dur": 12, "args": { "External id": 105925, "cbid": 211, "correlation": 105925 } }, { "ph": "s", "id": 105925, "pid": 76337, "tid": -914061504, "ts": 1716454222977664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223035213, "dur": 6, "args": { "External id": 105927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105927, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105927, "pid": 5, "tid": 7, "ts": 1716454223035213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977678, "dur": 6, "args": { "External id": 105927, "cbid": 211, "correlation": 105927 } }, { "ph": "s", "id": 105927, "pid": 76337, "tid": -914061504, "ts": 1716454222977678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977735, "dur": 1, "args": { "External id": 105938, "cbid": 251, "correlation": 105938 } }, { "ph": "f", "id": 105938, "pid": 76337, "tid": -914061504, "ts": 1716454222977735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977738, "dur": 0, "args": { "External id": 105939, "cbid": 251, "correlation": 105939 } }, { "ph": "f", "id": 105939, "pid": 76337, "tid": -914061504, "ts": 1716454222977738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223035220, "dur": 8, "args": { "External id": 105940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105940, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105940, "pid": 5, "tid": 7, "ts": 1716454223035220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977740, "dur": 12, "args": { "External id": 105940, "cbid": 211, "correlation": 105940 } }, { "ph": "s", "id": 105940, "pid": 76337, "tid": -914061504, "ts": 1716454222977740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223035230, "dur": 3, "args": { "External id": 105942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105942, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105942, "pid": 5, "tid": 7, "ts": 1716454223035230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977753, "dur": 6, "args": { "External id": 105942, "cbid": 211, "correlation": 105942 } }, { "ph": "s", "id": 105942, "pid": 76337, "tid": -914061504, "ts": 1716454222977753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223035234, "dur": 54, "args": { "External id": 105967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105967, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 105967, "pid": 5, "tid": 7, "ts": 1716454223035234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977831, "dur": 12, "args": { "External id": 105967, "cbid": 211, "correlation": 105967 } }, { "ph": "s", "id": 105967, "pid": 76337, "tid": -914061504, "ts": 1716454222977831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222977931, "dur": 1, "args": { "External id": 105985, "cbid": 251, "correlation": 105985 } }, { "ph": "f", "id": 105985, "pid": 76337, "tid": -914061504, "ts": 1716454222977931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223035289, "dur": 89, "args": { "External id": 105987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105987, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 105987, "pid": 5, "tid": 7, "ts": 1716454223035289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222977937, "dur": 14, "args": { "External id": 105987, "cbid": 211, "correlation": 105987 } }, { "ph": "s", "id": 105987, "pid": 76337, "tid": -914061504, "ts": 1716454222977937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223035379, "dur": 9, "args": { "External id": 105995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 105995, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 105995, "pid": 5, "tid": 7, "ts": 1716454223035379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978016, "dur": 13, "args": { "External id": 105995, "cbid": 211, "correlation": 105995 } }, { "ph": "s", "id": 105995, "pid": 76337, "tid": -914061504, "ts": 1716454222978016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223035390, "dur": 21, "args": { "External id": 106003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106003, "pid": 5, "tid": 7, "ts": 1716454223035390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978060, "dur": 9, "args": { "External id": 106003, "cbid": 211, "correlation": 106003 } }, { "ph": "s", "id": 106003, "pid": 76337, "tid": -914061504, "ts": 1716454222978060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223035412, "dur": 17, "args": { "External id": 106025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106025, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106025, "pid": 5, "tid": 7, "ts": 1716454223035412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978112, "dur": 10, "args": { "External id": 106025, "cbid": 211, "correlation": 106025 } }, { "ph": "s", "id": 106025, "pid": 76337, "tid": -914061504, "ts": 1716454222978112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222978201, "dur": 1, "args": { "External id": 106041, "cbid": 251, "correlation": 106041 } }, { "ph": "f", "id": 106041, "pid": 76337, "tid": -914061504, "ts": 1716454222978201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222978206, "dur": 0, "args": { "External id": 106043, "cbid": 251, "correlation": 106043 } }, { "ph": "f", "id": 106043, "pid": 76337, "tid": -914061504, "ts": 1716454222978206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223035430, "dur": 491, "args": { "External id": 106044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106044, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106044, "pid": 5, "tid": 7, "ts": 1716454223035430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978209, "dur": 12, "args": { "External id": 106044, "cbid": 211, "correlation": 106044 } }, { "ph": "s", "id": 106044, "pid": 76337, "tid": -914061504, "ts": 1716454222978209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223035923, "dur": 64, "args": { "External id": 106052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106052, "pid": 5, "tid": 7, "ts": 1716454223035923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978277, "dur": 13, "args": { "External id": 106052, "cbid": 211, "correlation": 106052 } }, { "ph": "s", "id": 106052, "pid": 76337, "tid": -914061504, "ts": 1716454222978277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223035989, "dur": 67, "args": { "External id": 106060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106060, "pid": 5, "tid": 7, "ts": 1716454223035989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978310, "dur": 8, "args": { "External id": 106060, "cbid": 211, "correlation": 106060 } }, { "ph": "s", "id": 106060, "pid": 76337, "tid": -914061504, "ts": 1716454222978310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222978390, "dur": 1, "args": { "External id": 106076, "cbid": 251, "correlation": 106076 } }, { "ph": "f", "id": 106076, "pid": 76337, "tid": -914061504, "ts": 1716454222978390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223036058, "dur": 1, "args": { "External id": 106078, "device": 5, "context": 1, "stream": 7, "correlation": 106078, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 106078, "pid": 5, "tid": 7, "ts": 1716454223036058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222978395, "dur": 10, "args": { "External id": 106078, "cbid": 51, "correlation": 106078 } }, { "ph": "s", "id": 106078, "pid": 76337, "tid": -914061504, "ts": 1716454222978395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223036062, "dur": 270, "args": { "External id": 106079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106079, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106079, "pid": 5, "tid": 7, "ts": 1716454223036062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978407, "dur": 11, "args": { "External id": 106079, "cbid": 211, "correlation": 106079 } }, { "ph": "s", "id": 106079, "pid": 76337, "tid": -914061504, "ts": 1716454222978407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223036333, "dur": 13, "args": { "External id": 106087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106087, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106087, "pid": 5, "tid": 7, "ts": 1716454223036333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978448, "dur": 11, "args": { "External id": 106087, "cbid": 211, "correlation": 106087 } }, { "ph": "s", "id": 106087, "pid": 76337, "tid": -914061504, "ts": 1716454222978448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223036348, "dur": 37, "args": { "External id": 106098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106098, "pid": 5, "tid": 7, "ts": 1716454223036348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978517, "dur": 12, "args": { "External id": 106098, "cbid": 211, "correlation": 106098 } }, { "ph": "s", "id": 106098, "pid": 76337, "tid": -914061504, "ts": 1716454222978517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222978583, "dur": 0, "args": { "External id": 106110, "cbid": 317, "correlation": 106110 } }, { "ph": "f", "id": 106110, "pid": 76337, "tid": -914061504, "ts": 1716454222978583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222978584, "dur": 0, "args": { "External id": 106111, "cbid": 203, "correlation": 106111 } }, { "ph": "f", "id": 106111, "pid": 76337, "tid": -914061504, "ts": 1716454222978584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222978585, "dur": 0, "args": { "External id": 106112, "cbid": 205, "correlation": 106112 } }, { "ph": "f", "id": 106112, "pid": 76337, "tid": -914061504, "ts": 1716454222978585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223036386, "dur": 12, "args": { "External id": 106116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106116, "pid": 5, "tid": 7, "ts": 1716454223036386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978601, "dur": 12, "args": { "External id": 106116, "cbid": 211, "correlation": 106116 } }, { "ph": "s", "id": 106116, "pid": 76337, "tid": -914061504, "ts": 1716454222978601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223036400, "dur": 4, "args": { "External id": 106118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106118, "pid": 5, "tid": 7, "ts": 1716454223036400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978618, "dur": 6, "args": { "External id": 106118, "cbid": 211, "correlation": 106118 } }, { "ph": "s", "id": 106118, "pid": 76337, "tid": -914061504, "ts": 1716454222978618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222978627, "dur": 0, "args": { "External id": 106119, "cbid": 51, "correlation": 106119 } }, { "ph": "s", "id": 106119, "pid": 76337, "tid": -914061504, "ts": 1716454222978627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223036406, "dur": 95, "args": { "External id": 106120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106120, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 106120, "pid": 5, "tid": 7, "ts": 1716454223036406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978628, "dur": 5, "args": { "External id": 106120, "cbid": 211, "correlation": 106120 } }, { "ph": "s", "id": 106120, "pid": 76337, "tid": -914061504, "ts": 1716454222978628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223036502, "dur": 16, "args": { "External id": 106125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106125, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106125, "pid": 5, "tid": 7, "ts": 1716454223036502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978656, "dur": 9, "args": { "External id": 106125, "cbid": 211, "correlation": 106125 } }, { "ph": "s", "id": 106125, "pid": 76337, "tid": -914061504, "ts": 1716454222978656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223036520, "dur": 12, "args": { "External id": 106133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106133, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106133, "pid": 5, "tid": 7, "ts": 1716454223036520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978687, "dur": 8, "args": { "External id": 106133, "cbid": 211, "correlation": 106133 } }, { "ph": "s", "id": 106133, "pid": 76337, "tid": -914061504, "ts": 1716454222978687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223036533, "dur": 30, "args": { "External id": 106142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106142, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106142, "pid": 5, "tid": 7, "ts": 1716454223036533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978727, "dur": 10, "args": { "External id": 106142, "cbid": 211, "correlation": 106142 } }, { "ph": "s", "id": 106142, "pid": 76337, "tid": -914061504, "ts": 1716454222978727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223036565, "dur": 30, "args": { "External id": 106162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106162, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 106162, "pid": 5, "tid": 7, "ts": 1716454223036565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978799, "dur": 12, "args": { "External id": 106162, "cbid": 211, "correlation": 106162 } }, { "ph": "s", "id": 106162, "pid": 76337, "tid": -914061504, "ts": 1716454222978799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223036596, "dur": 5, "args": { "External id": 106174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106174, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106174, "pid": 5, "tid": 7, "ts": 1716454223036596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978822, "dur": 6, "args": { "External id": 106174, "cbid": 211, "correlation": 106174 } }, { "ph": "s", "id": 106174, "pid": 76337, "tid": -914061504, "ts": 1716454222978822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223036602, "dur": 31, "args": { "External id": 106177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106177, "pid": 5, "tid": 7, "ts": 1716454223036602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978840, "dur": 6, "args": { "External id": 106177, "cbid": 211, "correlation": 106177 } }, { "ph": "s", "id": 106177, "pid": 76337, "tid": -914061504, "ts": 1716454222978840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223036635, "dur": 20, "args": { "External id": 106186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106186, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106186, "pid": 5, "tid": 7, "ts": 1716454223036635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978880, "dur": 10, "args": { "External id": 106186, "cbid": 211, "correlation": 106186 } }, { "ph": "s", "id": 106186, "pid": 76337, "tid": -914061504, "ts": 1716454222978880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222978932, "dur": 0, "args": { "External id": 106196, "cbid": 317, "correlation": 106196 } }, { "ph": "f", "id": 106196, "pid": 76337, "tid": -914061504, "ts": 1716454222978932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222978933, "dur": 0, "args": { "External id": 106197, "cbid": 203, "correlation": 106197 } }, { "ph": "f", "id": 106197, "pid": 76337, "tid": -914061504, "ts": 1716454222978933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222978933, "dur": 0, "args": { "External id": 106198, "cbid": 205, "correlation": 106198 } }, { "ph": "f", "id": 106198, "pid": 76337, "tid": -914061504, "ts": 1716454222978933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223036656, "dur": 23, "args": { "External id": 106202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106202, "pid": 5, "tid": 7, "ts": 1716454223036656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978947, "dur": 12, "args": { "External id": 106202, "cbid": 211, "correlation": 106202 } }, { "ph": "s", "id": 106202, "pid": 76337, "tid": -914061504, "ts": 1716454222978947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223036680, "dur": 314, "args": { "External id": 106204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106204, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106204, "pid": 5, "tid": 7, "ts": 1716454223036680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978961, "dur": 5, "args": { "External id": 106204, "cbid": 211, "correlation": 106204 } }, { "ph": "s", "id": 106204, "pid": 76337, "tid": -914061504, "ts": 1716454222978961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223036995, "dur": 1, "args": { "External id": 106206, "device": 5, "context": 1, "stream": 7, "correlation": 106206, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 106206, "pid": 5, "tid": 7, "ts": 1716454223036995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222978972, "dur": 15, "args": { "External id": 106206, "cbid": 51, "correlation": 106206 } }, { "ph": "s", "id": 106206, "pid": 76337, "tid": -914061504, "ts": 1716454222978972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223036999, "dur": 1241, "args": { "External id": 106207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106207, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106207, "pid": 5, "tid": 7, "ts": 1716454223036999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978988, "dur": 7, "args": { "External id": 106207, "cbid": 211, "correlation": 106207 } }, { "ph": "s", "id": 106207, "pid": 76337, "tid": -914061504, "ts": 1716454222978988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223038241, "dur": 13, "args": { "External id": 106209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106209, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106209, "pid": 5, "tid": 7, "ts": 1716454223038241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222978999, "dur": 6, "args": { "External id": 106209, "cbid": 211, "correlation": 106209 } }, { "ph": "s", "id": 106209, "pid": 76337, "tid": -914061504, "ts": 1716454222978999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223038256, "dur": 15, "args": { "External id": 106215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106215, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106215, "pid": 5, "tid": 7, "ts": 1716454223038256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979028, "dur": 9, "args": { "External id": 106215, "cbid": 211, "correlation": 106215 } }, { "ph": "s", "id": 106215, "pid": 76337, "tid": -914061504, "ts": 1716454222979028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223038272, "dur": 3, "args": { "External id": 106223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106223, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 106223, "pid": 5, "tid": 7, "ts": 1716454223038272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979074, "dur": 9, "args": { "External id": 106223, "cbid": 211, "correlation": 106223 } }, { "ph": "s", "id": 106223, "pid": 76337, "tid": -914061504, "ts": 1716454222979074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222979139, "dur": 1, "args": { "External id": 106239, "cbid": 251, "correlation": 106239 } }, { "ph": "f", "id": 106239, "pid": 76337, "tid": -914061504, "ts": 1716454222979139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222979145, "dur": 0, "args": { "External id": 106241, "cbid": 251, "correlation": 106241 } }, { "ph": "f", "id": 106241, "pid": 76337, "tid": -914061504, "ts": 1716454222979145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223038276, "dur": 13, "args": { "External id": 106242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106242, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106242, "pid": 5, "tid": 7, "ts": 1716454223038276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979146, "dur": 12, "args": { "External id": 106242, "cbid": 211, "correlation": 106242 } }, { "ph": "s", "id": 106242, "pid": 76337, "tid": -914061504, "ts": 1716454222979146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223038290, "dur": 5, "args": { "External id": 106244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106244, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106244, "pid": 5, "tid": 7, "ts": 1716454223038290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979160, "dur": 5, "args": { "External id": 106244, "cbid": 211, "correlation": 106244 } }, { "ph": "s", "id": 106244, "pid": 76337, "tid": -914061504, "ts": 1716454222979160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223038297, "dur": 17, "args": { "External id": 106254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106254, "pid": 5, "tid": 7, "ts": 1716454223038297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979217, "dur": 12, "args": { "External id": 106254, "cbid": 211, "correlation": 106254 } }, { "ph": "s", "id": 106254, "pid": 76337, "tid": -914061504, "ts": 1716454222979217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223038315, "dur": 17, "args": { "External id": 106274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106274, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 106274, "pid": 5, "tid": 7, "ts": 1716454223038315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979283, "dur": 11, "args": { "External id": 106274, "cbid": 211, "correlation": 106274 } }, { "ph": "s", "id": 106274, "pid": 76337, "tid": -914061504, "ts": 1716454222979283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223038333, "dur": 4, "args": { "External id": 106286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106286, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 106286, "pid": 5, "tid": 7, "ts": 1716454223038333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979304, "dur": 7, "args": { "External id": 106286, "cbid": 211, "correlation": 106286 } }, { "ph": "s", "id": 106286, "pid": 76337, "tid": -914061504, "ts": 1716454222979304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223038338, "dur": 17, "args": { "External id": 106289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106289, "pid": 5, "tid": 7, "ts": 1716454223038338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979323, "dur": 7, "args": { "External id": 106289, "cbid": 211, "correlation": 106289 } }, { "ph": "s", "id": 106289, "pid": 76337, "tid": -914061504, "ts": 1716454222979323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223038356, "dur": 11, "args": { "External id": 106298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106298, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106298, "pid": 5, "tid": 7, "ts": 1716454223038356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979364, "dur": 9, "args": { "External id": 106298, "cbid": 211, "correlation": 106298 } }, { "ph": "s", "id": 106298, "pid": 76337, "tid": -914061504, "ts": 1716454222979364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222979426, "dur": 0, "args": { "External id": 106308, "cbid": 317, "correlation": 106308 } }, { "ph": "f", "id": 106308, "pid": 76337, "tid": -914061504, "ts": 1716454222979426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222979427, "dur": 0, "args": { "External id": 106309, "cbid": 203, "correlation": 106309 } }, { "ph": "f", "id": 106309, "pid": 76337, "tid": -914061504, "ts": 1716454222979427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222979428, "dur": 0, "args": { "External id": 106310, "cbid": 205, "correlation": 106310 } }, { "ph": "f", "id": 106310, "pid": 76337, "tid": -914061504, "ts": 1716454222979428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223038369, "dur": 11, "args": { "External id": 106314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106314, "pid": 5, "tid": 7, "ts": 1716454223038369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979441, "dur": 12, "args": { "External id": 106314, "cbid": 211, "correlation": 106314 } }, { "ph": "s", "id": 106314, "pid": 76337, "tid": -914061504, "ts": 1716454222979441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223038381, "dur": 160, "args": { "External id": 106316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106316, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106316, "pid": 5, "tid": 7, "ts": 1716454223038381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979456, "dur": 6, "args": { "External id": 106316, "cbid": 211, "correlation": 106316 } }, { "ph": "s", "id": 106316, "pid": 76337, "tid": -914061504, "ts": 1716454222979456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223038544, "dur": 1, "args": { "External id": 106318, "device": 5, "context": 1, "stream": 7, "correlation": 106318, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 106318, "pid": 5, "tid": 7, "ts": 1716454223038544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222979467, "dur": 6, "args": { "External id": 106318, "cbid": 51, "correlation": 106318 } }, { "ph": "s", "id": 106318, "pid": 76337, "tid": -914061504, "ts": 1716454222979467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223038547, "dur": 639, "args": { "External id": 106319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106319, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106319, "pid": 5, "tid": 7, "ts": 1716454223038547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979475, "dur": 6, "args": { "External id": 106319, "cbid": 211, "correlation": 106319 } }, { "ph": "s", "id": 106319, "pid": 76337, "tid": -914061504, "ts": 1716454222979475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223039187, "dur": 12, "args": { "External id": 106321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106321, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106321, "pid": 5, "tid": 7, "ts": 1716454223039187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979485, "dur": 5, "args": { "External id": 106321, "cbid": 211, "correlation": 106321 } }, { "ph": "s", "id": 106321, "pid": 76337, "tid": -914061504, "ts": 1716454222979485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223039200, "dur": 15, "args": { "External id": 106327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106327, "pid": 5, "tid": 7, "ts": 1716454223039200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979514, "dur": 8, "args": { "External id": 106327, "cbid": 211, "correlation": 106327 } }, { "ph": "s", "id": 106327, "pid": 76337, "tid": -914061504, "ts": 1716454222979514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222979572, "dur": 0, "args": { "External id": 106337, "cbid": 317, "correlation": 106337 } }, { "ph": "f", "id": 106337, "pid": 76337, "tid": -914061504, "ts": 1716454222979572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222979572, "dur": 0, "args": { "External id": 106338, "cbid": 203, "correlation": 106338 } }, { "ph": "f", "id": 106338, "pid": 76337, "tid": -914061504, "ts": 1716454222979572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222979573, "dur": 0, "args": { "External id": 106339, "cbid": 205, "correlation": 106339 } }, { "ph": "f", "id": 106339, "pid": 76337, "tid": -914061504, "ts": 1716454222979573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223039216, "dur": 21, "args": { "External id": 106343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106343, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106343, "pid": 5, "tid": 7, "ts": 1716454223039216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979585, "dur": 11, "args": { "External id": 106343, "cbid": 211, "correlation": 106343 } }, { "ph": "s", "id": 106343, "pid": 76337, "tid": -914061504, "ts": 1716454222979585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223039238, "dur": 4, "args": { "External id": 106345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106345, "pid": 5, "tid": 7, "ts": 1716454223039238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979600, "dur": 6, "args": { "External id": 106345, "cbid": 211, "correlation": 106345 } }, { "ph": "s", "id": 106345, "pid": 76337, "tid": -914061504, "ts": 1716454222979600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222979610, "dur": 0, "args": { "External id": 106346, "cbid": 51, "correlation": 106346 } }, { "ph": "s", "id": 106346, "pid": 76337, "tid": -914061504, "ts": 1716454222979610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223039244, "dur": 168, "args": { "External id": 106347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106347, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 106347, "pid": 5, "tid": 7, "ts": 1716454223039244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979610, "dur": 5, "args": { "External id": 106347, "cbid": 211, "correlation": 106347 } }, { "ph": "s", "id": 106347, "pid": 76337, "tid": -914061504, "ts": 1716454222979610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223039413, "dur": 16, "args": { "External id": 106352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106352, "pid": 5, "tid": 7, "ts": 1716454223039413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979636, "dur": 8, "args": { "External id": 106352, "cbid": 211, "correlation": 106352 } }, { "ph": "s", "id": 106352, "pid": 76337, "tid": -914061504, "ts": 1716454222979636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223039430, "dur": 13, "args": { "External id": 106360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106360, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106360, "pid": 5, "tid": 7, "ts": 1716454223039430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979665, "dur": 8, "args": { "External id": 106360, "cbid": 211, "correlation": 106360 } }, { "ph": "s", "id": 106360, "pid": 76337, "tid": -914061504, "ts": 1716454222979665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223039445, "dur": 10, "args": { "External id": 106368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106368, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106368, "pid": 5, "tid": 7, "ts": 1716454223039445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979693, "dur": 8, "args": { "External id": 106368, "cbid": 211, "correlation": 106368 } }, { "ph": "s", "id": 106368, "pid": 76337, "tid": -914061504, "ts": 1716454222979693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223039456, "dur": 18, "args": { "External id": 106388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106388, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 106388, "pid": 5, "tid": 7, "ts": 1716454223039456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979775, "dur": 12, "args": { "External id": 106388, "cbid": 211, "correlation": 106388 } }, { "ph": "s", "id": 106388, "pid": 76337, "tid": -914061504, "ts": 1716454222979775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223039475, "dur": 5, "args": { "External id": 106400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106400, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 106400, "pid": 5, "tid": 7, "ts": 1716454223039475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979796, "dur": 6, "args": { "External id": 106400, "cbid": 211, "correlation": 106400 } }, { "ph": "s", "id": 106400, "pid": 76337, "tid": -914061504, "ts": 1716454222979796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223039481, "dur": 17, "args": { "External id": 106403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106403, "pid": 5, "tid": 7, "ts": 1716454223039481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979815, "dur": 6, "args": { "External id": 106403, "cbid": 211, "correlation": 106403 } }, { "ph": "s", "id": 106403, "pid": 76337, "tid": -914061504, "ts": 1716454222979815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222979873, "dur": 0, "args": { "External id": 106414, "cbid": 317, "correlation": 106414 } }, { "ph": "f", "id": 106414, "pid": 76337, "tid": -914061504, "ts": 1716454222979873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222979874, "dur": 0, "args": { "External id": 106415, "cbid": 203, "correlation": 106415 } }, { "ph": "f", "id": 106415, "pid": 76337, "tid": -914061504, "ts": 1716454222979874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222979875, "dur": 0, "args": { "External id": 106416, "cbid": 205, "correlation": 106416 } }, { "ph": "f", "id": 106416, "pid": 76337, "tid": -914061504, "ts": 1716454222979875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223039499, "dur": 11, "args": { "External id": 106420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106420, "pid": 5, "tid": 7, "ts": 1716454223039499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979887, "dur": 12, "args": { "External id": 106420, "cbid": 211, "correlation": 106420 } }, { "ph": "s", "id": 106420, "pid": 76337, "tid": -914061504, "ts": 1716454222979887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223039511, "dur": 4, "args": { "External id": 106422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106422, "pid": 5, "tid": 7, "ts": 1716454223039511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979903, "dur": 7, "args": { "External id": 106422, "cbid": 211, "correlation": 106422 } }, { "ph": "s", "id": 106422, "pid": 76337, "tid": -914061504, "ts": 1716454222979903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222979913, "dur": 0, "args": { "External id": 106423, "cbid": 51, "correlation": 106423 } }, { "ph": "s", "id": 106423, "pid": 76337, "tid": -914061504, "ts": 1716454222979913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223039516, "dur": 89, "args": { "External id": 106424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106424, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 106424, "pid": 5, "tid": 7, "ts": 1716454223039516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979914, "dur": 5, "args": { "External id": 106424, "cbid": 211, "correlation": 106424 } }, { "ph": "s", "id": 106424, "pid": 76337, "tid": -914061504, "ts": 1716454222979914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223039606, "dur": 16, "args": { "External id": 106429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106429, "pid": 5, "tid": 7, "ts": 1716454223039606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222979940, "dur": 8, "args": { "External id": 106429, "cbid": 211, "correlation": 106429 } }, { "ph": "s", "id": 106429, "pid": 76337, "tid": -914061504, "ts": 1716454222979940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223039623, "dur": 83, "args": { "External id": 106438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106438, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106438, "pid": 5, "tid": 7, "ts": 1716454223039623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980030, "dur": 14, "args": { "External id": 106438, "cbid": 211, "correlation": 106438 } }, { "ph": "s", "id": 106438, "pid": 76337, "tid": -914061504, "ts": 1716454222980030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223039708, "dur": 29, "args": { "External id": 106460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106460, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106460, "pid": 5, "tid": 7, "ts": 1716454223039708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980087, "dur": 10, "args": { "External id": 106460, "cbid": 211, "correlation": 106460 } }, { "ph": "s", "id": 106460, "pid": 76337, "tid": -914061504, "ts": 1716454222980087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980175, "dur": 1, "args": { "External id": 106471, "cbid": 251, "correlation": 106471 } }, { "ph": "f", "id": 106471, "pid": 76337, "tid": -914061504, "ts": 1716454222980175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223039738, "dur": 163, "args": { "External id": 106472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106472, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106472, "pid": 5, "tid": 7, "ts": 1716454223039738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980180, "dur": 13, "args": { "External id": 106472, "cbid": 211, "correlation": 106472 } }, { "ph": "s", "id": 106472, "pid": 76337, "tid": -914061504, "ts": 1716454222980180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980252, "dur": 1, "args": { "External id": 106483, "cbid": 251, "correlation": 106483 } }, { "ph": "f", "id": 106483, "pid": 76337, "tid": -914061504, "ts": 1716454222980252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223039901, "dur": 156, "args": { "External id": 106484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106484, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106484, "pid": 5, "tid": 7, "ts": 1716454223039901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980256, "dur": 12, "args": { "External id": 106484, "cbid": 211, "correlation": 106484 } }, { "ph": "s", "id": 106484, "pid": 76337, "tid": -914061504, "ts": 1716454222980256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980322, "dur": 1, "args": { "External id": 106495, "cbid": 251, "correlation": 106495 } }, { "ph": "f", "id": 106495, "pid": 76337, "tid": -914061504, "ts": 1716454222980322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223040059, "dur": 154, "args": { "External id": 106496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106496, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106496, "pid": 5, "tid": 7, "ts": 1716454223040059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980326, "dur": 11, "args": { "External id": 106496, "cbid": 211, "correlation": 106496 } }, { "ph": "s", "id": 106496, "pid": 76337, "tid": -914061504, "ts": 1716454222980326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223040215, "dur": 328, "args": { "External id": 106521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106521, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106521, "pid": 5, "tid": 7, "ts": 1716454223040215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980410, "dur": 13, "args": { "External id": 106521, "cbid": 211, "correlation": 106521 } }, { "ph": "s", "id": 106521, "pid": 76337, "tid": -914061504, "ts": 1716454222980410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980509, "dur": 1, "args": { "External id": 106539, "cbid": 251, "correlation": 106539 } }, { "ph": "f", "id": 106539, "pid": 76337, "tid": -914061504, "ts": 1716454222980509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223040544, "dur": 141, "args": { "External id": 106541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106541, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106541, "pid": 5, "tid": 7, "ts": 1716454223040544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980515, "dur": 13, "args": { "External id": 106541, "cbid": 211, "correlation": 106541 } }, { "ph": "s", "id": 106541, "pid": 76337, "tid": -914061504, "ts": 1716454222980515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223040687, "dur": 19, "args": { "External id": 106549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106549, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106549, "pid": 5, "tid": 7, "ts": 1716454223040687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980585, "dur": 12, "args": { "External id": 106549, "cbid": 211, "correlation": 106549 } }, { "ph": "s", "id": 106549, "pid": 76337, "tid": -914061504, "ts": 1716454222980585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223040707, "dur": 28, "args": { "External id": 106557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106557, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106557, "pid": 5, "tid": 7, "ts": 1716454223040707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980624, "dur": 8, "args": { "External id": 106557, "cbid": 211, "correlation": 106557 } }, { "ph": "s", "id": 106557, "pid": 76337, "tid": -914061504, "ts": 1716454222980624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223040736, "dur": 18, "args": { "External id": 106568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106568, "pid": 5, "tid": 7, "ts": 1716454223040736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980696, "dur": 13, "args": { "External id": 106568, "cbid": 211, "correlation": 106568 } }, { "ph": "s", "id": 106568, "pid": 76337, "tid": -914061504, "ts": 1716454222980696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223040756, "dur": 16, "args": { "External id": 106590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106590, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106590, "pid": 5, "tid": 7, "ts": 1716454223040756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980727, "dur": 8, "args": { "External id": 106590, "cbid": 211, "correlation": 106590 } }, { "ph": "s", "id": 106590, "pid": 76337, "tid": -914061504, "ts": 1716454222980727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980812, "dur": 1, "args": { "External id": 106601, "cbid": 251, "correlation": 106601 } }, { "ph": "f", "id": 106601, "pid": 76337, "tid": -914061504, "ts": 1716454222980812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223040773, "dur": 87, "args": { "External id": 106602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106602, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106602, "pid": 5, "tid": 7, "ts": 1716454223040773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980818, "dur": 13, "args": { "External id": 106602, "cbid": 211, "correlation": 106602 } }, { "ph": "s", "id": 106602, "pid": 76337, "tid": -914061504, "ts": 1716454222980818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980888, "dur": 1, "args": { "External id": 106613, "cbid": 251, "correlation": 106613 } }, { "ph": "f", "id": 106613, "pid": 76337, "tid": -914061504, "ts": 1716454222980888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980892, "dur": 0, "args": { "External id": 106614, "cbid": 251, "correlation": 106614 } }, { "ph": "f", "id": 106614, "pid": 76337, "tid": -914061504, "ts": 1716454222980892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223040862, "dur": 12, "args": { "External id": 106615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106615, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106615, "pid": 5, "tid": 7, "ts": 1716454223040862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980893, "dur": 11, "args": { "External id": 106615, "cbid": 211, "correlation": 106615 } }, { "ph": "s", "id": 106615, "pid": 76337, "tid": -914061504, "ts": 1716454222980893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223040876, "dur": 5, "args": { "External id": 106617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106617, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106617, "pid": 5, "tid": 7, "ts": 1716454223040876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980906, "dur": 5, "args": { "External id": 106617, "cbid": 211, "correlation": 106617 } }, { "ph": "s", "id": 106617, "pid": 76337, "tid": -914061504, "ts": 1716454222980906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980964, "dur": 1, "args": { "External id": 106628, "cbid": 251, "correlation": 106628 } }, { "ph": "f", "id": 106628, "pid": 76337, "tid": -914061504, "ts": 1716454222980964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222980967, "dur": 0, "args": { "External id": 106629, "cbid": 251, "correlation": 106629 } }, { "ph": "f", "id": 106629, "pid": 76337, "tid": -914061504, "ts": 1716454222980967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223040882, "dur": 8, "args": { "External id": 106630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106630, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106630, "pid": 5, "tid": 7, "ts": 1716454223040882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980969, "dur": 20, "args": { "External id": 106630, "cbid": 211, "correlation": 106630 } }, { "ph": "s", "id": 106630, "pid": 76337, "tid": -914061504, "ts": 1716454222980969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223040892, "dur": 3, "args": { "External id": 106632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106632, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106632, "pid": 5, "tid": 7, "ts": 1716454223040892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222980990, "dur": 6, "args": { "External id": 106632, "cbid": 211, "correlation": 106632 } }, { "ph": "s", "id": 106632, "pid": 76337, "tid": -914061504, "ts": 1716454222980990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223040896, "dur": 53, "args": { "External id": 106657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106657, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106657, "pid": 5, "tid": 7, "ts": 1716454223040896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981069, "dur": 12, "args": { "External id": 106657, "cbid": 211, "correlation": 106657 } }, { "ph": "s", "id": 106657, "pid": 76337, "tid": -914061504, "ts": 1716454222981069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222981168, "dur": 1, "args": { "External id": 106675, "cbid": 251, "correlation": 106675 } }, { "ph": "f", "id": 106675, "pid": 76337, "tid": -914061504, "ts": 1716454222981168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223040951, "dur": 88, "args": { "External id": 106677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106677, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106677, "pid": 5, "tid": 7, "ts": 1716454223040951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981174, "dur": 13, "args": { "External id": 106677, "cbid": 211, "correlation": 106677 } }, { "ph": "s", "id": 106677, "pid": 76337, "tid": -914061504, "ts": 1716454222981174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223041041, "dur": 9, "args": { "External id": 106685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106685, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106685, "pid": 5, "tid": 7, "ts": 1716454223041041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981243, "dur": 12, "args": { "External id": 106685, "cbid": 211, "correlation": 106685 } }, { "ph": "s", "id": 106685, "pid": 76337, "tid": -914061504, "ts": 1716454222981243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223041052, "dur": 20, "args": { "External id": 106693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106693, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106693, "pid": 5, "tid": 7, "ts": 1716454223041052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981284, "dur": 9, "args": { "External id": 106693, "cbid": 211, "correlation": 106693 } }, { "ph": "s", "id": 106693, "pid": 76337, "tid": -914061504, "ts": 1716454222981284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223041073, "dur": 18, "args": { "External id": 106715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106715, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106715, "pid": 5, "tid": 7, "ts": 1716454223041073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981336, "dur": 10, "args": { "External id": 106715, "cbid": 211, "correlation": 106715 } }, { "ph": "s", "id": 106715, "pid": 76337, "tid": -914061504, "ts": 1716454222981336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222981423, "dur": 1, "args": { "External id": 106731, "cbid": 251, "correlation": 106731 } }, { "ph": "f", "id": 106731, "pid": 76337, "tid": -914061504, "ts": 1716454222981423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222981428, "dur": 0, "args": { "External id": 106733, "cbid": 251, "correlation": 106733 } }, { "ph": "f", "id": 106733, "pid": 76337, "tid": -914061504, "ts": 1716454222981428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223041092, "dur": 493, "args": { "External id": 106734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106734, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106734, "pid": 5, "tid": 7, "ts": 1716454223041092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981429, "dur": 13, "args": { "External id": 106734, "cbid": 211, "correlation": 106734 } }, { "ph": "s", "id": 106734, "pid": 76337, "tid": -914061504, "ts": 1716454222981429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223041587, "dur": 66, "args": { "External id": 106742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106742, "pid": 5, "tid": 7, "ts": 1716454223041587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981495, "dur": 12, "args": { "External id": 106742, "cbid": 211, "correlation": 106742 } }, { "ph": "s", "id": 106742, "pid": 76337, "tid": -914061504, "ts": 1716454222981495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223041654, "dur": 66, "args": { "External id": 106750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106750, "pid": 5, "tid": 7, "ts": 1716454223041654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981526, "dur": 8, "args": { "External id": 106750, "cbid": 211, "correlation": 106750 } }, { "ph": "s", "id": 106750, "pid": 76337, "tid": -914061504, "ts": 1716454222981526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222981605, "dur": 1, "args": { "External id": 106766, "cbid": 251, "correlation": 106766 } }, { "ph": "f", "id": 106766, "pid": 76337, "tid": -914061504, "ts": 1716454222981605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223041722, "dur": 1, "args": { "External id": 106768, "device": 5, "context": 1, "stream": 7, "correlation": 106768, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 106768, "pid": 5, "tid": 7, "ts": 1716454223041722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222981610, "dur": 9, "args": { "External id": 106768, "cbid": 51, "correlation": 106768 } }, { "ph": "s", "id": 106768, "pid": 76337, "tid": -914061504, "ts": 1716454222981610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223041726, "dur": 264, "args": { "External id": 106769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106769, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106769, "pid": 5, "tid": 7, "ts": 1716454223041726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981622, "dur": 11, "args": { "External id": 106769, "cbid": 211, "correlation": 106769 } }, { "ph": "s", "id": 106769, "pid": 76337, "tid": -914061504, "ts": 1716454222981622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223041991, "dur": 14, "args": { "External id": 106777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106777, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106777, "pid": 5, "tid": 7, "ts": 1716454223041991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981663, "dur": 11, "args": { "External id": 106777, "cbid": 211, "correlation": 106777 } }, { "ph": "s", "id": 106777, "pid": 76337, "tid": -914061504, "ts": 1716454222981663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223042006, "dur": 37, "args": { "External id": 106788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106788, "pid": 5, "tid": 7, "ts": 1716454223042006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981732, "dur": 12, "args": { "External id": 106788, "cbid": 211, "correlation": 106788 } }, { "ph": "s", "id": 106788, "pid": 76337, "tid": -914061504, "ts": 1716454222981732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222981796, "dur": 0, "args": { "External id": 106800, "cbid": 317, "correlation": 106800 } }, { "ph": "f", "id": 106800, "pid": 76337, "tid": -914061504, "ts": 1716454222981796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222981797, "dur": 0, "args": { "External id": 106801, "cbid": 203, "correlation": 106801 } }, { "ph": "f", "id": 106801, "pid": 76337, "tid": -914061504, "ts": 1716454222981797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222981798, "dur": 0, "args": { "External id": 106802, "cbid": 205, "correlation": 106802 } }, { "ph": "f", "id": 106802, "pid": 76337, "tid": -914061504, "ts": 1716454222981798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223042045, "dur": 13, "args": { "External id": 106806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106806, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106806, "pid": 5, "tid": 7, "ts": 1716454223042045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981812, "dur": 13, "args": { "External id": 106806, "cbid": 211, "correlation": 106806 } }, { "ph": "s", "id": 106806, "pid": 76337, "tid": -914061504, "ts": 1716454222981812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223042060, "dur": 4, "args": { "External id": 106808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 106808, "pid": 5, "tid": 7, "ts": 1716454223042060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981829, "dur": 6, "args": { "External id": 106808, "cbid": 211, "correlation": 106808 } }, { "ph": "s", "id": 106808, "pid": 76337, "tid": -914061504, "ts": 1716454222981829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222981838, "dur": 0, "args": { "External id": 106809, "cbid": 51, "correlation": 106809 } }, { "ph": "s", "id": 106809, "pid": 76337, "tid": -914061504, "ts": 1716454222981838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223042065, "dur": 95, "args": { "External id": 106810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106810, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 106810, "pid": 5, "tid": 7, "ts": 1716454223042065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981838, "dur": 5, "args": { "External id": 106810, "cbid": 211, "correlation": 106810 } }, { "ph": "s", "id": 106810, "pid": 76337, "tid": -914061504, "ts": 1716454222981838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223042161, "dur": 16, "args": { "External id": 106815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106815, "pid": 5, "tid": 7, "ts": 1716454223042161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981865, "dur": 9, "args": { "External id": 106815, "cbid": 211, "correlation": 106815 } }, { "ph": "s", "id": 106815, "pid": 76337, "tid": -914061504, "ts": 1716454222981865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223042179, "dur": 12, "args": { "External id": 106823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106823, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106823, "pid": 5, "tid": 7, "ts": 1716454223042179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981897, "dur": 8, "args": { "External id": 106823, "cbid": 211, "correlation": 106823 } }, { "ph": "s", "id": 106823, "pid": 76337, "tid": -914061504, "ts": 1716454222981897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223042192, "dur": 26, "args": { "External id": 106832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106832, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106832, "pid": 5, "tid": 7, "ts": 1716454223042192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222981936, "dur": 10, "args": { "External id": 106832, "cbid": 211, "correlation": 106832 } }, { "ph": "s", "id": 106832, "pid": 76337, "tid": -914061504, "ts": 1716454222981936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223042219, "dur": 24, "args": { "External id": 106852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106852, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 106852, "pid": 5, "tid": 7, "ts": 1716454223042219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982023, "dur": 13, "args": { "External id": 106852, "cbid": 211, "correlation": 106852 } }, { "ph": "s", "id": 106852, "pid": 76337, "tid": -914061504, "ts": 1716454222982023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223042244, "dur": 5, "args": { "External id": 106864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106864, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 106864, "pid": 5, "tid": 7, "ts": 1716454223042244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982047, "dur": 6, "args": { "External id": 106864, "cbid": 211, "correlation": 106864 } }, { "ph": "s", "id": 106864, "pid": 76337, "tid": -914061504, "ts": 1716454222982047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223042250, "dur": 24, "args": { "External id": 106867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106867, "pid": 5, "tid": 7, "ts": 1716454223042250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982065, "dur": 7, "args": { "External id": 106867, "cbid": 211, "correlation": 106867 } }, { "ph": "s", "id": 106867, "pid": 76337, "tid": -914061504, "ts": 1716454222982065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223042276, "dur": 17, "args": { "External id": 106876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106876, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106876, "pid": 5, "tid": 7, "ts": 1716454223042276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982106, "dur": 10, "args": { "External id": 106876, "cbid": 211, "correlation": 106876 } }, { "ph": "s", "id": 106876, "pid": 76337, "tid": -914061504, "ts": 1716454222982106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222982158, "dur": 0, "args": { "External id": 106886, "cbid": 317, "correlation": 106886 } }, { "ph": "f", "id": 106886, "pid": 76337, "tid": -914061504, "ts": 1716454222982158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222982159, "dur": 0, "args": { "External id": 106887, "cbid": 203, "correlation": 106887 } }, { "ph": "f", "id": 106887, "pid": 76337, "tid": -914061504, "ts": 1716454222982159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222982160, "dur": 0, "args": { "External id": 106888, "cbid": 205, "correlation": 106888 } }, { "ph": "f", "id": 106888, "pid": 76337, "tid": -914061504, "ts": 1716454222982160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223042294, "dur": 17, "args": { "External id": 106892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106892, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106892, "pid": 5, "tid": 7, "ts": 1716454223042294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982175, "dur": 12, "args": { "External id": 106892, "cbid": 211, "correlation": 106892 } }, { "ph": "s", "id": 106892, "pid": 76337, "tid": -914061504, "ts": 1716454222982175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223042312, "dur": 236, "args": { "External id": 106894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106894, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106894, "pid": 5, "tid": 7, "ts": 1716454223042312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982189, "dur": 5, "args": { "External id": 106894, "cbid": 211, "correlation": 106894 } }, { "ph": "s", "id": 106894, "pid": 76337, "tid": -914061504, "ts": 1716454222982189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223042551, "dur": 1, "args": { "External id": 106896, "device": 5, "context": 1, "stream": 7, "correlation": 106896, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 106896, "pid": 5, "tid": 7, "ts": 1716454223042551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222982200, "dur": 8, "args": { "External id": 106896, "cbid": 51, "correlation": 106896 } }, { "ph": "s", "id": 106896, "pid": 76337, "tid": -914061504, "ts": 1716454222982200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223042554, "dur": 807, "args": { "External id": 106897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106897, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106897, "pid": 5, "tid": 7, "ts": 1716454223042554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982209, "dur": 6, "args": { "External id": 106897, "cbid": 211, "correlation": 106897 } }, { "ph": "s", "id": 106897, "pid": 76337, "tid": -914061504, "ts": 1716454222982209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223043362, "dur": 13, "args": { "External id": 106899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106899, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106899, "pid": 5, "tid": 7, "ts": 1716454223043362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982220, "dur": 6, "args": { "External id": 106899, "cbid": 211, "correlation": 106899 } }, { "ph": "s", "id": 106899, "pid": 76337, "tid": -914061504, "ts": 1716454222982220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223043377, "dur": 15, "args": { "External id": 106905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106905, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106905, "pid": 5, "tid": 7, "ts": 1716454223043377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982248, "dur": 8, "args": { "External id": 106905, "cbid": 211, "correlation": 106905 } }, { "ph": "s", "id": 106905, "pid": 76337, "tid": -914061504, "ts": 1716454222982248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223043393, "dur": 3, "args": { "External id": 106913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106913, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 106913, "pid": 5, "tid": 7, "ts": 1716454223043393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982292, "dur": 9, "args": { "External id": 106913, "cbid": 211, "correlation": 106913 } }, { "ph": "s", "id": 106913, "pid": 76337, "tid": -914061504, "ts": 1716454222982292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222982357, "dur": 1, "args": { "External id": 106929, "cbid": 251, "correlation": 106929 } }, { "ph": "f", "id": 106929, "pid": 76337, "tid": -914061504, "ts": 1716454222982357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222982362, "dur": 0, "args": { "External id": 106931, "cbid": 251, "correlation": 106931 } }, { "ph": "f", "id": 106931, "pid": 76337, "tid": -914061504, "ts": 1716454222982362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223043398, "dur": 14, "args": { "External id": 106932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106932, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106932, "pid": 5, "tid": 7, "ts": 1716454223043398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982364, "dur": 12, "args": { "External id": 106932, "cbid": 211, "correlation": 106932 } }, { "ph": "s", "id": 106932, "pid": 76337, "tid": -914061504, "ts": 1716454222982364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223043412, "dur": 5, "args": { "External id": 106934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 106934, "pid": 5, "tid": 7, "ts": 1716454223043412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982378, "dur": 6, "args": { "External id": 106934, "cbid": 211, "correlation": 106934 } }, { "ph": "s", "id": 106934, "pid": 76337, "tid": -914061504, "ts": 1716454222982378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223043419, "dur": 16, "args": { "External id": 106944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106944, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106944, "pid": 5, "tid": 7, "ts": 1716454223043419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982436, "dur": 12, "args": { "External id": 106944, "cbid": 211, "correlation": 106944 } }, { "ph": "s", "id": 106944, "pid": 76337, "tid": -914061504, "ts": 1716454222982436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223043436, "dur": 17, "args": { "External id": 106964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106964, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 106964, "pid": 5, "tid": 7, "ts": 1716454223043436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982502, "dur": 11, "args": { "External id": 106964, "cbid": 211, "correlation": 106964 } }, { "ph": "s", "id": 106964, "pid": 76337, "tid": -914061504, "ts": 1716454222982502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223043455, "dur": 4, "args": { "External id": 106976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106976, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 106976, "pid": 5, "tid": 7, "ts": 1716454223043455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982523, "dur": 7, "args": { "External id": 106976, "cbid": 211, "correlation": 106976 } }, { "ph": "s", "id": 106976, "pid": 76337, "tid": -914061504, "ts": 1716454222982523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223043460, "dur": 16, "args": { "External id": 106979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106979, "pid": 5, "tid": 7, "ts": 1716454223043460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982543, "dur": 6, "args": { "External id": 106979, "cbid": 211, "correlation": 106979 } }, { "ph": "s", "id": 106979, "pid": 76337, "tid": -914061504, "ts": 1716454222982543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223043478, "dur": 11, "args": { "External id": 106988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 106988, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 106988, "pid": 5, "tid": 7, "ts": 1716454223043478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982584, "dur": 10, "args": { "External id": 106988, "cbid": 211, "correlation": 106988 } }, { "ph": "s", "id": 106988, "pid": 76337, "tid": -914061504, "ts": 1716454222982584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222982646, "dur": 0, "args": { "External id": 106998, "cbid": 317, "correlation": 106998 } }, { "ph": "f", "id": 106998, "pid": 76337, "tid": -914061504, "ts": 1716454222982646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222982647, "dur": 0, "args": { "External id": 106999, "cbid": 203, "correlation": 106999 } }, { "ph": "f", "id": 106999, "pid": 76337, "tid": -914061504, "ts": 1716454222982647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222982648, "dur": 0, "args": { "External id": 107000, "cbid": 205, "correlation": 107000 } }, { "ph": "f", "id": 107000, "pid": 76337, "tid": -914061504, "ts": 1716454222982648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223043490, "dur": 11, "args": { "External id": 107004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107004, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107004, "pid": 5, "tid": 7, "ts": 1716454223043490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982661, "dur": 12, "args": { "External id": 107004, "cbid": 211, "correlation": 107004 } }, { "ph": "s", "id": 107004, "pid": 76337, "tid": -914061504, "ts": 1716454222982661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223043502, "dur": 160, "args": { "External id": 107006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107006, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107006, "pid": 5, "tid": 7, "ts": 1716454223043502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982676, "dur": 6, "args": { "External id": 107006, "cbid": 211, "correlation": 107006 } }, { "ph": "s", "id": 107006, "pid": 76337, "tid": -914061504, "ts": 1716454222982676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223043664, "dur": 1, "args": { "External id": 107008, "device": 5, "context": 1, "stream": 7, "correlation": 107008, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 107008, "pid": 5, "tid": 7, "ts": 1716454223043664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222982687, "dur": 6, "args": { "External id": 107008, "cbid": 51, "correlation": 107008 } }, { "ph": "s", "id": 107008, "pid": 76337, "tid": -914061504, "ts": 1716454222982687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223043668, "dur": 639, "args": { "External id": 107009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107009, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107009, "pid": 5, "tid": 7, "ts": 1716454223043668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982695, "dur": 6, "args": { "External id": 107009, "cbid": 211, "correlation": 107009 } }, { "ph": "s", "id": 107009, "pid": 76337, "tid": -914061504, "ts": 1716454222982695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223044308, "dur": 12, "args": { "External id": 107011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107011, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107011, "pid": 5, "tid": 7, "ts": 1716454223044308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982705, "dur": 5, "args": { "External id": 107011, "cbid": 211, "correlation": 107011 } }, { "ph": "s", "id": 107011, "pid": 76337, "tid": -914061504, "ts": 1716454222982705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223044322, "dur": 14, "args": { "External id": 107017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107017, "pid": 5, "tid": 7, "ts": 1716454223044322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982734, "dur": 9, "args": { "External id": 107017, "cbid": 211, "correlation": 107017 } }, { "ph": "s", "id": 107017, "pid": 76337, "tid": -914061504, "ts": 1716454222982734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222982793, "dur": 0, "args": { "External id": 107027, "cbid": 317, "correlation": 107027 } }, { "ph": "f", "id": 107027, "pid": 76337, "tid": -914061504, "ts": 1716454222982793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222982793, "dur": 0, "args": { "External id": 107028, "cbid": 203, "correlation": 107028 } }, { "ph": "f", "id": 107028, "pid": 76337, "tid": -914061504, "ts": 1716454222982793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222982794, "dur": 0, "args": { "External id": 107029, "cbid": 205, "correlation": 107029 } }, { "ph": "f", "id": 107029, "pid": 76337, "tid": -914061504, "ts": 1716454222982794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223044337, "dur": 17, "args": { "External id": 107033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107033, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107033, "pid": 5, "tid": 7, "ts": 1716454223044337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982807, "dur": 12, "args": { "External id": 107033, "cbid": 211, "correlation": 107033 } }, { "ph": "s", "id": 107033, "pid": 76337, "tid": -914061504, "ts": 1716454222982807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223044355, "dur": 4, "args": { "External id": 107035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107035, "pid": 5, "tid": 7, "ts": 1716454223044355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982824, "dur": 7, "args": { "External id": 107035, "cbid": 211, "correlation": 107035 } }, { "ph": "s", "id": 107035, "pid": 76337, "tid": -914061504, "ts": 1716454222982824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222982834, "dur": 0, "args": { "External id": 107036, "cbid": 51, "correlation": 107036 } }, { "ph": "s", "id": 107036, "pid": 76337, "tid": -914061504, "ts": 1716454222982834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223044361, "dur": 130, "args": { "External id": 107037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107037, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 107037, "pid": 5, "tid": 7, "ts": 1716454223044361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982834, "dur": 5, "args": { "External id": 107037, "cbid": 211, "correlation": 107037 } }, { "ph": "s", "id": 107037, "pid": 76337, "tid": -914061504, "ts": 1716454222982834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223044492, "dur": 15, "args": { "External id": 107042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107042, "pid": 5, "tid": 7, "ts": 1716454223044492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982860, "dur": 8, "args": { "External id": 107042, "cbid": 211, "correlation": 107042 } }, { "ph": "s", "id": 107042, "pid": 76337, "tid": -914061504, "ts": 1716454222982860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223044509, "dur": 12, "args": { "External id": 107050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107050, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107050, "pid": 5, "tid": 7, "ts": 1716454223044509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982889, "dur": 8, "args": { "External id": 107050, "cbid": 211, "correlation": 107050 } }, { "ph": "s", "id": 107050, "pid": 76337, "tid": -914061504, "ts": 1716454222982889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223044521, "dur": 10, "args": { "External id": 107058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107058, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107058, "pid": 5, "tid": 7, "ts": 1716454223044521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222982917, "dur": 9, "args": { "External id": 107058, "cbid": 211, "correlation": 107058 } }, { "ph": "s", "id": 107058, "pid": 76337, "tid": -914061504, "ts": 1716454222982917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223044533, "dur": 19, "args": { "External id": 107078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107078, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 107078, "pid": 5, "tid": 7, "ts": 1716454223044533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983008, "dur": 13, "args": { "External id": 107078, "cbid": 211, "correlation": 107078 } }, { "ph": "s", "id": 107078, "pid": 76337, "tid": -914061504, "ts": 1716454222983008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223044553, "dur": 4, "args": { "External id": 107090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107090, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 107090, "pid": 5, "tid": 7, "ts": 1716454223044553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983032, "dur": 6, "args": { "External id": 107090, "cbid": 211, "correlation": 107090 } }, { "ph": "s", "id": 107090, "pid": 76337, "tid": -914061504, "ts": 1716454222983032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223044558, "dur": 17, "args": { "External id": 107093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107093, "pid": 5, "tid": 7, "ts": 1716454223044558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983049, "dur": 7, "args": { "External id": 107093, "cbid": 211, "correlation": 107093 } }, { "ph": "s", "id": 107093, "pid": 76337, "tid": -914061504, "ts": 1716454222983049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222983107, "dur": 0, "args": { "External id": 107104, "cbid": 317, "correlation": 107104 } }, { "ph": "f", "id": 107104, "pid": 76337, "tid": -914061504, "ts": 1716454222983107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222983108, "dur": 0, "args": { "External id": 107105, "cbid": 203, "correlation": 107105 } }, { "ph": "f", "id": 107105, "pid": 76337, "tid": -914061504, "ts": 1716454222983108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222983108, "dur": 0, "args": { "External id": 107106, "cbid": 205, "correlation": 107106 } }, { "ph": "f", "id": 107106, "pid": 76337, "tid": -914061504, "ts": 1716454222983108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223044576, "dur": 11, "args": { "External id": 107110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107110, "pid": 5, "tid": 7, "ts": 1716454223044576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983121, "dur": 13, "args": { "External id": 107110, "cbid": 211, "correlation": 107110 } }, { "ph": "s", "id": 107110, "pid": 76337, "tid": -914061504, "ts": 1716454222983121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223044588, "dur": 3, "args": { "External id": 107112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107112, "pid": 5, "tid": 7, "ts": 1716454223044588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983138, "dur": 5, "args": { "External id": 107112, "cbid": 211, "correlation": 107112 } }, { "ph": "s", "id": 107112, "pid": 76337, "tid": -914061504, "ts": 1716454222983138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222983146, "dur": 0, "args": { "External id": 107113, "cbid": 51, "correlation": 107113 } }, { "ph": "s", "id": 107113, "pid": 76337, "tid": -914061504, "ts": 1716454222983146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223044592, "dur": 88, "args": { "External id": 107114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107114, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 107114, "pid": 5, "tid": 7, "ts": 1716454223044592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983147, "dur": 5, "args": { "External id": 107114, "cbid": 211, "correlation": 107114 } }, { "ph": "s", "id": 107114, "pid": 76337, "tid": -914061504, "ts": 1716454222983147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223044682, "dur": 15, "args": { "External id": 107119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107119, "pid": 5, "tid": 7, "ts": 1716454223044682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983173, "dur": 10, "args": { "External id": 107119, "cbid": 211, "correlation": 107119 } }, { "ph": "s", "id": 107119, "pid": 76337, "tid": -914061504, "ts": 1716454222983173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223044698, "dur": 82, "args": { "External id": 107128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107128, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107128, "pid": 5, "tid": 7, "ts": 1716454223044698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983254, "dur": 13, "args": { "External id": 107128, "cbid": 211, "correlation": 107128 } }, { "ph": "s", "id": 107128, "pid": 76337, "tid": -914061504, "ts": 1716454222983254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223044782, "dur": 29, "args": { "External id": 107150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107150, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107150, "pid": 5, "tid": 7, "ts": 1716454223044782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983311, "dur": 10, "args": { "External id": 107150, "cbid": 211, "correlation": 107150 } }, { "ph": "s", "id": 107150, "pid": 76337, "tid": -914061504, "ts": 1716454222983311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222983399, "dur": 1, "args": { "External id": 107161, "cbid": 251, "correlation": 107161 } }, { "ph": "f", "id": 107161, "pid": 76337, "tid": -914061504, "ts": 1716454222983399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223044813, "dur": 161, "args": { "External id": 107162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107162, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107162, "pid": 5, "tid": 7, "ts": 1716454223044813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983405, "dur": 13, "args": { "External id": 107162, "cbid": 211, "correlation": 107162 } }, { "ph": "s", "id": 107162, "pid": 76337, "tid": -914061504, "ts": 1716454222983405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222983475, "dur": 1, "args": { "External id": 107173, "cbid": 251, "correlation": 107173 } }, { "ph": "f", "id": 107173, "pid": 76337, "tid": -914061504, "ts": 1716454222983475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223044975, "dur": 155, "args": { "External id": 107174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107174, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107174, "pid": 5, "tid": 7, "ts": 1716454223044975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983480, "dur": 11, "args": { "External id": 107174, "cbid": 211, "correlation": 107174 } }, { "ph": "s", "id": 107174, "pid": 76337, "tid": -914061504, "ts": 1716454222983480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222983544, "dur": 1, "args": { "External id": 107185, "cbid": 251, "correlation": 107185 } }, { "ph": "f", "id": 107185, "pid": 76337, "tid": -914061504, "ts": 1716454222983544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223045131, "dur": 156, "args": { "External id": 107186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107186, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107186, "pid": 5, "tid": 7, "ts": 1716454223045131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983548, "dur": 11, "args": { "External id": 107186, "cbid": 211, "correlation": 107186 } }, { "ph": "s", "id": 107186, "pid": 76337, "tid": -914061504, "ts": 1716454222983548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223045289, "dur": 329, "args": { "External id": 107211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107211, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107211, "pid": 5, "tid": 7, "ts": 1716454223045289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983631, "dur": 12, "args": { "External id": 107211, "cbid": 211, "correlation": 107211 } }, { "ph": "s", "id": 107211, "pid": 76337, "tid": -914061504, "ts": 1716454222983631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222983730, "dur": 1, "args": { "External id": 107229, "cbid": 251, "correlation": 107229 } }, { "ph": "f", "id": 107229, "pid": 76337, "tid": -914061504, "ts": 1716454222983730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223045620, "dur": 161, "args": { "External id": 107231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107231, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107231, "pid": 5, "tid": 7, "ts": 1716454223045620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983737, "dur": 13, "args": { "External id": 107231, "cbid": 211, "correlation": 107231 } }, { "ph": "s", "id": 107231, "pid": 76337, "tid": -914061504, "ts": 1716454222983737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223045782, "dur": 19, "args": { "External id": 107239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107239, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107239, "pid": 5, "tid": 7, "ts": 1716454223045782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983805, "dur": 12, "args": { "External id": 107239, "cbid": 211, "correlation": 107239 } }, { "ph": "s", "id": 107239, "pid": 76337, "tid": -914061504, "ts": 1716454222983805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223045802, "dur": 27, "args": { "External id": 107247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107247, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107247, "pid": 5, "tid": 7, "ts": 1716454223045802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983844, "dur": 8, "args": { "External id": 107247, "cbid": 211, "correlation": 107247 } }, { "ph": "s", "id": 107247, "pid": 76337, "tid": -914061504, "ts": 1716454222983844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223045831, "dur": 19, "args": { "External id": 107258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107258, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107258, "pid": 5, "tid": 7, "ts": 1716454223045831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983915, "dur": 12, "args": { "External id": 107258, "cbid": 211, "correlation": 107258 } }, { "ph": "s", "id": 107258, "pid": 76337, "tid": -914061504, "ts": 1716454222983915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223045851, "dur": 16, "args": { "External id": 107280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107280, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107280, "pid": 5, "tid": 7, "ts": 1716454223045851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222983947, "dur": 8, "args": { "External id": 107280, "cbid": 211, "correlation": 107280 } }, { "ph": "s", "id": 107280, "pid": 76337, "tid": -914061504, "ts": 1716454222983947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984043, "dur": 1, "args": { "External id": 107291, "cbid": 251, "correlation": 107291 } }, { "ph": "f", "id": 107291, "pid": 76337, "tid": -914061504, "ts": 1716454222984043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223045868, "dur": 87, "args": { "External id": 107292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107292, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107292, "pid": 5, "tid": 7, "ts": 1716454223045868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984048, "dur": 13, "args": { "External id": 107292, "cbid": 211, "correlation": 107292 } }, { "ph": "s", "id": 107292, "pid": 76337, "tid": -914061504, "ts": 1716454222984048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984116, "dur": 1, "args": { "External id": 107303, "cbid": 251, "correlation": 107303 } }, { "ph": "f", "id": 107303, "pid": 76337, "tid": -914061504, "ts": 1716454222984116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984120, "dur": 0, "args": { "External id": 107304, "cbid": 251, "correlation": 107304 } }, { "ph": "f", "id": 107304, "pid": 76337, "tid": -914061504, "ts": 1716454222984120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223045957, "dur": 12, "args": { "External id": 107305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107305, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107305, "pid": 5, "tid": 7, "ts": 1716454223045957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984122, "dur": 12, "args": { "External id": 107305, "cbid": 211, "correlation": 107305 } }, { "ph": "s", "id": 107305, "pid": 76337, "tid": -914061504, "ts": 1716454222984122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223045970, "dur": 5, "args": { "External id": 107307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107307, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107307, "pid": 5, "tid": 7, "ts": 1716454223045970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984136, "dur": 6, "args": { "External id": 107307, "cbid": 211, "correlation": 107307 } }, { "ph": "s", "id": 107307, "pid": 76337, "tid": -914061504, "ts": 1716454222984136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984193, "dur": 1, "args": { "External id": 107318, "cbid": 251, "correlation": 107318 } }, { "ph": "f", "id": 107318, "pid": 76337, "tid": -914061504, "ts": 1716454222984193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984197, "dur": 0, "args": { "External id": 107319, "cbid": 251, "correlation": 107319 } }, { "ph": "f", "id": 107319, "pid": 76337, "tid": -914061504, "ts": 1716454222984197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223045977, "dur": 8, "args": { "External id": 107320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107320, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107320, "pid": 5, "tid": 7, "ts": 1716454223045977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984198, "dur": 12, "args": { "External id": 107320, "cbid": 211, "correlation": 107320 } }, { "ph": "s", "id": 107320, "pid": 76337, "tid": -914061504, "ts": 1716454222984198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223045986, "dur": 3, "args": { "External id": 107322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107322, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107322, "pid": 5, "tid": 7, "ts": 1716454223045986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984211, "dur": 5, "args": { "External id": 107322, "cbid": 211, "correlation": 107322 } }, { "ph": "s", "id": 107322, "pid": 76337, "tid": -914061504, "ts": 1716454222984211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223045991, "dur": 53, "args": { "External id": 107347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107347, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107347, "pid": 5, "tid": 7, "ts": 1716454223045991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984290, "dur": 13, "args": { "External id": 107347, "cbid": 211, "correlation": 107347 } }, { "ph": "s", "id": 107347, "pid": 76337, "tid": -914061504, "ts": 1716454222984290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984392, "dur": 1, "args": { "External id": 107365, "cbid": 251, "correlation": 107365 } }, { "ph": "f", "id": 107365, "pid": 76337, "tid": -914061504, "ts": 1716454222984392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223046045, "dur": 90, "args": { "External id": 107367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107367, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107367, "pid": 5, "tid": 7, "ts": 1716454223046045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984398, "dur": 13, "args": { "External id": 107367, "cbid": 211, "correlation": 107367 } }, { "ph": "s", "id": 107367, "pid": 76337, "tid": -914061504, "ts": 1716454222984398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223046136, "dur": 9, "args": { "External id": 107375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107375, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107375, "pid": 5, "tid": 7, "ts": 1716454223046136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984468, "dur": 12, "args": { "External id": 107375, "cbid": 211, "correlation": 107375 } }, { "ph": "s", "id": 107375, "pid": 76337, "tid": -914061504, "ts": 1716454222984468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223046147, "dur": 21, "args": { "External id": 107383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107383, "pid": 5, "tid": 7, "ts": 1716454223046147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984509, "dur": 9, "args": { "External id": 107383, "cbid": 211, "correlation": 107383 } }, { "ph": "s", "id": 107383, "pid": 76337, "tid": -914061504, "ts": 1716454222984509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223046170, "dur": 17, "args": { "External id": 107405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107405, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107405, "pid": 5, "tid": 7, "ts": 1716454223046170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984561, "dur": 10, "args": { "External id": 107405, "cbid": 211, "correlation": 107405 } }, { "ph": "s", "id": 107405, "pid": 76337, "tid": -914061504, "ts": 1716454222984561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984649, "dur": 1, "args": { "External id": 107421, "cbid": 251, "correlation": 107421 } }, { "ph": "f", "id": 107421, "pid": 76337, "tid": -914061504, "ts": 1716454222984649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984654, "dur": 0, "args": { "External id": 107423, "cbid": 251, "correlation": 107423 } }, { "ph": "f", "id": 107423, "pid": 76337, "tid": -914061504, "ts": 1716454222984654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223046188, "dur": 493, "args": { "External id": 107424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107424, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107424, "pid": 5, "tid": 7, "ts": 1716454223046188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984656, "dur": 12, "args": { "External id": 107424, "cbid": 211, "correlation": 107424 } }, { "ph": "s", "id": 107424, "pid": 76337, "tid": -914061504, "ts": 1716454222984656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223046683, "dur": 66, "args": { "External id": 107432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107432, "pid": 5, "tid": 7, "ts": 1716454223046683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984720, "dur": 13, "args": { "External id": 107432, "cbid": 211, "correlation": 107432 } }, { "ph": "s", "id": 107432, "pid": 76337, "tid": -914061504, "ts": 1716454222984720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223046750, "dur": 64, "args": { "External id": 107440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107440, "pid": 5, "tid": 7, "ts": 1716454223046750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984752, "dur": 8, "args": { "External id": 107440, "cbid": 211, "correlation": 107440 } }, { "ph": "s", "id": 107440, "pid": 76337, "tid": -914061504, "ts": 1716454222984752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222984831, "dur": 1, "args": { "External id": 107456, "cbid": 251, "correlation": 107456 } }, { "ph": "f", "id": 107456, "pid": 76337, "tid": -914061504, "ts": 1716454222984831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223046816, "dur": 1, "args": { "External id": 107458, "device": 5, "context": 1, "stream": 7, "correlation": 107458, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 107458, "pid": 5, "tid": 7, "ts": 1716454223046816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222984836, "dur": 9, "args": { "External id": 107458, "cbid": 51, "correlation": 107458 } }, { "ph": "s", "id": 107458, "pid": 76337, "tid": -914061504, "ts": 1716454222984836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223046820, "dur": 269, "args": { "External id": 107459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107459, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107459, "pid": 5, "tid": 7, "ts": 1716454223046820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984847, "dur": 11, "args": { "External id": 107459, "cbid": 211, "correlation": 107459 } }, { "ph": "s", "id": 107459, "pid": 76337, "tid": -914061504, "ts": 1716454222984847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223047090, "dur": 13, "args": { "External id": 107467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107467, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107467, "pid": 5, "tid": 7, "ts": 1716454223047090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984889, "dur": 10, "args": { "External id": 107467, "cbid": 211, "correlation": 107467 } }, { "ph": "s", "id": 107467, "pid": 76337, "tid": -914061504, "ts": 1716454222984889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223047105, "dur": 37, "args": { "External id": 107478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107478, "pid": 5, "tid": 7, "ts": 1716454223047105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222984957, "dur": 12, "args": { "External id": 107478, "cbid": 211, "correlation": 107478 } }, { "ph": "s", "id": 107478, "pid": 76337, "tid": -914061504, "ts": 1716454222984957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222985029, "dur": 0, "args": { "External id": 107490, "cbid": 317, "correlation": 107490 } }, { "ph": "f", "id": 107490, "pid": 76337, "tid": -914061504, "ts": 1716454222985029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222985030, "dur": 0, "args": { "External id": 107491, "cbid": 203, "correlation": 107491 } }, { "ph": "f", "id": 107491, "pid": 76337, "tid": -914061504, "ts": 1716454222985030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222985031, "dur": 0, "args": { "External id": 107492, "cbid": 205, "correlation": 107492 } }, { "ph": "f", "id": 107492, "pid": 76337, "tid": -914061504, "ts": 1716454222985031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223047143, "dur": 12, "args": { "External id": 107496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107496, "pid": 5, "tid": 7, "ts": 1716454223047143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985046, "dur": 12, "args": { "External id": 107496, "cbid": 211, "correlation": 107496 } }, { "ph": "s", "id": 107496, "pid": 76337, "tid": -914061504, "ts": 1716454222985046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223047156, "dur": 4, "args": { "External id": 107498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107498, "pid": 5, "tid": 7, "ts": 1716454223047156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985062, "dur": 6, "args": { "External id": 107498, "cbid": 211, "correlation": 107498 } }, { "ph": "s", "id": 107498, "pid": 76337, "tid": -914061504, "ts": 1716454222985062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222985071, "dur": 0, "args": { "External id": 107499, "cbid": 51, "correlation": 107499 } }, { "ph": "s", "id": 107499, "pid": 76337, "tid": -914061504, "ts": 1716454222985071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223047161, "dur": 96, "args": { "External id": 107500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107500, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 107500, "pid": 5, "tid": 7, "ts": 1716454223047161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985072, "dur": 5, "args": { "External id": 107500, "cbid": 211, "correlation": 107500 } }, { "ph": "s", "id": 107500, "pid": 76337, "tid": -914061504, "ts": 1716454222985072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223047259, "dur": 16, "args": { "External id": 107505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107505, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107505, "pid": 5, "tid": 7, "ts": 1716454223047259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985100, "dur": 9, "args": { "External id": 107505, "cbid": 211, "correlation": 107505 } }, { "ph": "s", "id": 107505, "pid": 76337, "tid": -914061504, "ts": 1716454222985100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223047276, "dur": 13, "args": { "External id": 107513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107513, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107513, "pid": 5, "tid": 7, "ts": 1716454223047276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985131, "dur": 9, "args": { "External id": 107513, "cbid": 211, "correlation": 107513 } }, { "ph": "s", "id": 107513, "pid": 76337, "tid": -914061504, "ts": 1716454222985131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223047290, "dur": 56, "args": { "External id": 107524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107524, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107524, "pid": 5, "tid": 7, "ts": 1716454223047290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985195, "dur": 11, "args": { "External id": 107524, "cbid": 211, "correlation": 107524 } }, { "ph": "s", "id": 107524, "pid": 76337, "tid": -914061504, "ts": 1716454222985195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222985249, "dur": 0, "args": { "External id": 107534, "cbid": 317, "correlation": 107534 } }, { "ph": "f", "id": 107534, "pid": 76337, "tid": -914061504, "ts": 1716454222985249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222985250, "dur": 0, "args": { "External id": 107535, "cbid": 203, "correlation": 107535 } }, { "ph": "f", "id": 107535, "pid": 76337, "tid": -914061504, "ts": 1716454222985250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222985251, "dur": 0, "args": { "External id": 107536, "cbid": 205, "correlation": 107536 } }, { "ph": "f", "id": 107536, "pid": 76337, "tid": -914061504, "ts": 1716454222985251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223047348, "dur": 39, "args": { "External id": 107540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107540, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107540, "pid": 5, "tid": 7, "ts": 1716454223047348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985267, "dur": 12, "args": { "External id": 107540, "cbid": 211, "correlation": 107540 } }, { "ph": "s", "id": 107540, "pid": 76337, "tid": -914061504, "ts": 1716454222985267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223047388, "dur": 160, "args": { "External id": 107542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107542, "pid": 5, "tid": 7, "ts": 1716454223047388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985281, "dur": 6, "args": { "External id": 107542, "cbid": 211, "correlation": 107542 } }, { "ph": "s", "id": 107542, "pid": 76337, "tid": -914061504, "ts": 1716454222985281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223047549, "dur": 1962, "args": { "External id": 107544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107544, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107544, "pid": 5, "tid": 7, "ts": 1716454223047549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985294, "dur": 8, "args": { "External id": 107544, "cbid": 211, "correlation": 107544 } }, { "ph": "s", "id": 107544, "pid": 76337, "tid": -914061504, "ts": 1716454222985294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223049513, "dur": 40, "args": { "External id": 107546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107546, "pid": 5, "tid": 7, "ts": 1716454223049513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985305, "dur": 5, "args": { "External id": 107546, "cbid": 211, "correlation": 107546 } }, { "ph": "s", "id": 107546, "pid": 76337, "tid": -914061504, "ts": 1716454222985305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223049553, "dur": 58, "args": { "External id": 107552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107552, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107552, "pid": 5, "tid": 7, "ts": 1716454223049553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985334, "dur": 8, "args": { "External id": 107552, "cbid": 211, "correlation": 107552 } }, { "ph": "s", "id": 107552, "pid": 76337, "tid": -914061504, "ts": 1716454222985334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223049613, "dur": 86, "args": { "External id": 107561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107561, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107561, "pid": 5, "tid": 7, "ts": 1716454223049613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985423, "dur": 13, "args": { "External id": 107561, "cbid": 211, "correlation": 107561 } }, { "ph": "s", "id": 107561, "pid": 76337, "tid": -914061504, "ts": 1716454222985423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223049700, "dur": 72, "args": { "External id": 107581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107581, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 107581, "pid": 5, "tid": 7, "ts": 1716454223049700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985494, "dur": 11, "args": { "External id": 107581, "cbid": 211, "correlation": 107581 } }, { "ph": "s", "id": 107581, "pid": 76337, "tid": -914061504, "ts": 1716454222985494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223049774, "dur": 5, "args": { "External id": 107593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107593, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 107593, "pid": 5, "tid": 7, "ts": 1716454223049774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985515, "dur": 6, "args": { "External id": 107593, "cbid": 211, "correlation": 107593 } }, { "ph": "s", "id": 107593, "pid": 76337, "tid": -914061504, "ts": 1716454222985515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223049780, "dur": 80, "args": { "External id": 107596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107596, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107596, "pid": 5, "tid": 7, "ts": 1716454223049780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985534, "dur": 7, "args": { "External id": 107596, "cbid": 211, "correlation": 107596 } }, { "ph": "s", "id": 107596, "pid": 76337, "tid": -914061504, "ts": 1716454222985534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223049862, "dur": 53, "args": { "External id": 107605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107605, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107605, "pid": 5, "tid": 7, "ts": 1716454223049862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985575, "dur": 11, "args": { "External id": 107605, "cbid": 211, "correlation": 107605 } }, { "ph": "s", "id": 107605, "pid": 76337, "tid": -914061504, "ts": 1716454222985575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222985627, "dur": 0, "args": { "External id": 107615, "cbid": 317, "correlation": 107615 } }, { "ph": "f", "id": 107615, "pid": 76337, "tid": -914061504, "ts": 1716454222985627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222985628, "dur": 0, "args": { "External id": 107616, "cbid": 203, "correlation": 107616 } }, { "ph": "f", "id": 107616, "pid": 76337, "tid": -914061504, "ts": 1716454222985628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222985629, "dur": 0, "args": { "External id": 107617, "cbid": 205, "correlation": 107617 } }, { "ph": "f", "id": 107617, "pid": 76337, "tid": -914061504, "ts": 1716454222985629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223049916, "dur": 57, "args": { "External id": 107621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107621, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107621, "pid": 5, "tid": 7, "ts": 1716454223049916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985646, "dur": 11, "args": { "External id": 107621, "cbid": 211, "correlation": 107621 } }, { "ph": "s", "id": 107621, "pid": 76337, "tid": -914061504, "ts": 1716454222985646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223049974, "dur": 119, "args": { "External id": 107623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107623, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107623, "pid": 5, "tid": 7, "ts": 1716454223049974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985660, "dur": 5, "args": { "External id": 107623, "cbid": 211, "correlation": 107623 } }, { "ph": "s", "id": 107623, "pid": 76337, "tid": -914061504, "ts": 1716454222985660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223050094, "dur": 1877, "args": { "External id": 107625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107625, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107625, "pid": 5, "tid": 7, "ts": 1716454223050094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985671, "dur": 6, "args": { "External id": 107625, "cbid": 211, "correlation": 107625 } }, { "ph": "s", "id": 107625, "pid": 76337, "tid": -914061504, "ts": 1716454222985671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223051973, "dur": 19, "args": { "External id": 107627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107627, "pid": 5, "tid": 7, "ts": 1716454223051973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985680, "dur": 6, "args": { "External id": 107627, "cbid": 211, "correlation": 107627 } }, { "ph": "s", "id": 107627, "pid": 76337, "tid": -914061504, "ts": 1716454222985680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223051993, "dur": 32, "args": { "External id": 107633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107633, "pid": 5, "tid": 7, "ts": 1716454223051993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985709, "dur": 8, "args": { "External id": 107633, "cbid": 211, "correlation": 107633 } }, { "ph": "s", "id": 107633, "pid": 76337, "tid": -914061504, "ts": 1716454222985709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223052027, "dur": 3, "args": { "External id": 107641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107641, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 107641, "pid": 5, "tid": 7, "ts": 1716454223052027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985753, "dur": 9, "args": { "External id": 107641, "cbid": 211, "correlation": 107641 } }, { "ph": "s", "id": 107641, "pid": 76337, "tid": -914061504, "ts": 1716454222985753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222985819, "dur": 1, "args": { "External id": 107657, "cbid": 251, "correlation": 107657 } }, { "ph": "f", "id": 107657, "pid": 76337, "tid": -914061504, "ts": 1716454222985819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222985824, "dur": 0, "args": { "External id": 107659, "cbid": 251, "correlation": 107659 } }, { "ph": "f", "id": 107659, "pid": 76337, "tid": -914061504, "ts": 1716454222985824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223052032, "dur": 12, "args": { "External id": 107660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107660, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 107660, "pid": 5, "tid": 7, "ts": 1716454223052032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985826, "dur": 12, "args": { "External id": 107660, "cbid": 211, "correlation": 107660 } }, { "ph": "s", "id": 107660, "pid": 76337, "tid": -914061504, "ts": 1716454222985826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223052045, "dur": 5, "args": { "External id": 107662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107662, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 107662, "pid": 5, "tid": 7, "ts": 1716454223052045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985840, "dur": 5, "args": { "External id": 107662, "cbid": 211, "correlation": 107662 } }, { "ph": "s", "id": 107662, "pid": 76337, "tid": -914061504, "ts": 1716454222985840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223052051, "dur": 29, "args": { "External id": 107672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107672, "pid": 5, "tid": 7, "ts": 1716454223052051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985897, "dur": 12, "args": { "External id": 107672, "cbid": 211, "correlation": 107672 } }, { "ph": "s", "id": 107672, "pid": 76337, "tid": -914061504, "ts": 1716454222985897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223052081, "dur": 30, "args": { "External id": 107692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107692, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 107692, "pid": 5, "tid": 7, "ts": 1716454223052081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985963, "dur": 19, "args": { "External id": 107692, "cbid": 211, "correlation": 107692 } }, { "ph": "s", "id": 107692, "pid": 76337, "tid": -914061504, "ts": 1716454222985963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223052113, "dur": 5, "args": { "External id": 107704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107704, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 107704, "pid": 5, "tid": 7, "ts": 1716454223052113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222985993, "dur": 7, "args": { "External id": 107704, "cbid": 211, "correlation": 107704 } }, { "ph": "s", "id": 107704, "pid": 76337, "tid": -914061504, "ts": 1716454222985993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223052119, "dur": 30, "args": { "External id": 107707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107707, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107707, "pid": 5, "tid": 7, "ts": 1716454223052119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986011, "dur": 6, "args": { "External id": 107707, "cbid": 211, "correlation": 107707 } }, { "ph": "s", "id": 107707, "pid": 76337, "tid": -914061504, "ts": 1716454222986011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223052150, "dur": 20, "args": { "External id": 107716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107716, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107716, "pid": 5, "tid": 7, "ts": 1716454223052150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986053, "dur": 10, "args": { "External id": 107716, "cbid": 211, "correlation": 107716 } }, { "ph": "s", "id": 107716, "pid": 76337, "tid": -914061504, "ts": 1716454222986053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222986116, "dur": 0, "args": { "External id": 107726, "cbid": 317, "correlation": 107726 } }, { "ph": "f", "id": 107726, "pid": 76337, "tid": -914061504, "ts": 1716454222986116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222986117, "dur": 0, "args": { "External id": 107727, "cbid": 203, "correlation": 107727 } }, { "ph": "f", "id": 107727, "pid": 76337, "tid": -914061504, "ts": 1716454222986117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222986117, "dur": 0, "args": { "External id": 107728, "cbid": 205, "correlation": 107728 } }, { "ph": "f", "id": 107728, "pid": 76337, "tid": -914061504, "ts": 1716454222986117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223052171, "dur": 22, "args": { "External id": 107732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107732, "pid": 5, "tid": 7, "ts": 1716454223052171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986133, "dur": 13, "args": { "External id": 107732, "cbid": 211, "correlation": 107732 } }, { "ph": "s", "id": 107732, "pid": 76337, "tid": -914061504, "ts": 1716454222986133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223052195, "dur": 43, "args": { "External id": 107734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107734, "pid": 5, "tid": 7, "ts": 1716454223052195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986148, "dur": 5, "args": { "External id": 107734, "cbid": 211, "correlation": 107734 } }, { "ph": "s", "id": 107734, "pid": 76337, "tid": -914061504, "ts": 1716454222986148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223052239, "dur": 641, "args": { "External id": 107736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107736, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107736, "pid": 5, "tid": 7, "ts": 1716454223052239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986159, "dur": 6, "args": { "External id": 107736, "cbid": 211, "correlation": 107736 } }, { "ph": "s", "id": 107736, "pid": 76337, "tid": -914061504, "ts": 1716454222986159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223052882, "dur": 22, "args": { "External id": 107738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107738, "pid": 5, "tid": 7, "ts": 1716454223052882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986169, "dur": 5, "args": { "External id": 107738, "cbid": 211, "correlation": 107738 } }, { "ph": "s", "id": 107738, "pid": 76337, "tid": -914061504, "ts": 1716454222986169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223052905, "dur": 32, "args": { "External id": 107744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107744, "pid": 5, "tid": 7, "ts": 1716454223052905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986197, "dur": 8, "args": { "External id": 107744, "cbid": 211, "correlation": 107744 } }, { "ph": "s", "id": 107744, "pid": 76337, "tid": -914061504, "ts": 1716454222986197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222986255, "dur": 0, "args": { "External id": 107754, "cbid": 317, "correlation": 107754 } }, { "ph": "f", "id": 107754, "pid": 76337, "tid": -914061504, "ts": 1716454222986255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222986255, "dur": 0, "args": { "External id": 107755, "cbid": 203, "correlation": 107755 } }, { "ph": "f", "id": 107755, "pid": 76337, "tid": -914061504, "ts": 1716454222986255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222986256, "dur": 0, "args": { "External id": 107756, "cbid": 205, "correlation": 107756 } }, { "ph": "f", "id": 107756, "pid": 76337, "tid": -914061504, "ts": 1716454222986256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223052938, "dur": 56, "args": { "External id": 107760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107760, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107760, "pid": 5, "tid": 7, "ts": 1716454223052938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986268, "dur": 11, "args": { "External id": 107760, "cbid": 211, "correlation": 107760 } }, { "ph": "s", "id": 107760, "pid": 76337, "tid": -914061504, "ts": 1716454222986268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223052996, "dur": 264, "args": { "External id": 107762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107762, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107762, "pid": 5, "tid": 7, "ts": 1716454223052996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986287, "dur": 8, "args": { "External id": 107762, "cbid": 211, "correlation": 107762 } }, { "ph": "s", "id": 107762, "pid": 76337, "tid": -914061504, "ts": 1716454222986287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223053261, "dur": 21, "args": { "External id": 107764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107764, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107764, "pid": 5, "tid": 7, "ts": 1716454223053261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986298, "dur": 5, "args": { "External id": 107764, "cbid": 211, "correlation": 107764 } }, { "ph": "s", "id": 107764, "pid": 76337, "tid": -914061504, "ts": 1716454222986298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223053283, "dur": 31, "args": { "External id": 107770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107770, "pid": 5, "tid": 7, "ts": 1716454223053283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986324, "dur": 9, "args": { "External id": 107770, "cbid": 211, "correlation": 107770 } }, { "ph": "s", "id": 107770, "pid": 76337, "tid": -914061504, "ts": 1716454222986324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223053316, "dur": 27, "args": { "External id": 107778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107778, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107778, "pid": 5, "tid": 7, "ts": 1716454223053316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986352, "dur": 8, "args": { "External id": 107778, "cbid": 211, "correlation": 107778 } }, { "ph": "s", "id": 107778, "pid": 76337, "tid": -914061504, "ts": 1716454222986352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223053344, "dur": 20, "args": { "External id": 107786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107786, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107786, "pid": 5, "tid": 7, "ts": 1716454223053344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986381, "dur": 8, "args": { "External id": 107786, "cbid": 211, "correlation": 107786 } }, { "ph": "s", "id": 107786, "pid": 76337, "tid": -914061504, "ts": 1716454222986381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223053365, "dur": 29, "args": { "External id": 107806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107806, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 107806, "pid": 5, "tid": 7, "ts": 1716454223053365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986462, "dur": 12, "args": { "External id": 107806, "cbid": 211, "correlation": 107806 } }, { "ph": "s", "id": 107806, "pid": 76337, "tid": -914061504, "ts": 1716454222986462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223053396, "dur": 5, "args": { "External id": 107818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107818, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 107818, "pid": 5, "tid": 7, "ts": 1716454223053396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986484, "dur": 6, "args": { "External id": 107818, "cbid": 211, "correlation": 107818 } }, { "ph": "s", "id": 107818, "pid": 76337, "tid": -914061504, "ts": 1716454222986484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223053402, "dur": 31, "args": { "External id": 107821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107821, "pid": 5, "tid": 7, "ts": 1716454223053402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986502, "dur": 7, "args": { "External id": 107821, "cbid": 211, "correlation": 107821 } }, { "ph": "s", "id": 107821, "pid": 76337, "tid": -914061504, "ts": 1716454222986502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222986560, "dur": 0, "args": { "External id": 107832, "cbid": 317, "correlation": 107832 } }, { "ph": "f", "id": 107832, "pid": 76337, "tid": -914061504, "ts": 1716454222986560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222986560, "dur": 0, "args": { "External id": 107833, "cbid": 203, "correlation": 107833 } }, { "ph": "f", "id": 107833, "pid": 76337, "tid": -914061504, "ts": 1716454222986560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222986561, "dur": 0, "args": { "External id": 107834, "cbid": 205, "correlation": 107834 } }, { "ph": "f", "id": 107834, "pid": 76337, "tid": -914061504, "ts": 1716454222986561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223053434, "dur": 22, "args": { "External id": 107838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107838, "pid": 5, "tid": 7, "ts": 1716454223053434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986574, "dur": 13, "args": { "External id": 107838, "cbid": 211, "correlation": 107838 } }, { "ph": "s", "id": 107838, "pid": 76337, "tid": -914061504, "ts": 1716454222986574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223053457, "dur": 104, "args": { "External id": 107840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107840, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107840, "pid": 5, "tid": 7, "ts": 1716454223053457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986594, "dur": 6, "args": { "External id": 107840, "cbid": 211, "correlation": 107840 } }, { "ph": "s", "id": 107840, "pid": 76337, "tid": -914061504, "ts": 1716454222986594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223053563, "dur": 20, "args": { "External id": 107842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107842, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107842, "pid": 5, "tid": 7, "ts": 1716454223053563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986603, "dur": 5, "args": { "External id": 107842, "cbid": 211, "correlation": 107842 } }, { "ph": "s", "id": 107842, "pid": 76337, "tid": -914061504, "ts": 1716454222986603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223053584, "dur": 32, "args": { "External id": 107848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107848, "pid": 5, "tid": 7, "ts": 1716454223053584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986631, "dur": 10, "args": { "External id": 107848, "cbid": 211, "correlation": 107848 } }, { "ph": "s", "id": 107848, "pid": 76337, "tid": -914061504, "ts": 1716454222986631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223053618, "dur": 198, "args": { "External id": 107857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107857, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107857, "pid": 5, "tid": 7, "ts": 1716454223053618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986714, "dur": 14, "args": { "External id": 107857, "cbid": 211, "correlation": 107857 } }, { "ph": "s", "id": 107857, "pid": 76337, "tid": -914061504, "ts": 1716454222986714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223053817, "dur": 63, "args": { "External id": 107879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107879, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107879, "pid": 5, "tid": 7, "ts": 1716454223053817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986770, "dur": 10, "args": { "External id": 107879, "cbid": 211, "correlation": 107879 } }, { "ph": "s", "id": 107879, "pid": 76337, "tid": -914061504, "ts": 1716454222986770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222986861, "dur": 2, "args": { "External id": 107890, "cbid": 251, "correlation": 107890 } }, { "ph": "f", "id": 107890, "pid": 76337, "tid": -914061504, "ts": 1716454222986861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223053881, "dur": 156, "args": { "External id": 107891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107891, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107891, "pid": 5, "tid": 7, "ts": 1716454223053881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986867, "dur": 13, "args": { "External id": 107891, "cbid": 211, "correlation": 107891 } }, { "ph": "s", "id": 107891, "pid": 76337, "tid": -914061504, "ts": 1716454222986867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222986938, "dur": 1, "args": { "External id": 107902, "cbid": 251, "correlation": 107902 } }, { "ph": "f", "id": 107902, "pid": 76337, "tid": -914061504, "ts": 1716454222986938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223054039, "dur": 143, "args": { "External id": 107903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107903, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107903, "pid": 5, "tid": 7, "ts": 1716454223054039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222986942, "dur": 11, "args": { "External id": 107903, "cbid": 211, "correlation": 107903 } }, { "ph": "s", "id": 107903, "pid": 76337, "tid": -914061504, "ts": 1716454222986942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987016, "dur": 1, "args": { "External id": 107914, "cbid": 251, "correlation": 107914 } }, { "ph": "f", "id": 107914, "pid": 76337, "tid": -914061504, "ts": 1716454222987016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223054184, "dur": 145, "args": { "External id": 107915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107915, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 107915, "pid": 5, "tid": 7, "ts": 1716454223054184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987020, "dur": 11, "args": { "External id": 107915, "cbid": 211, "correlation": 107915 } }, { "ph": "s", "id": 107915, "pid": 76337, "tid": -914061504, "ts": 1716454222987020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223054330, "dur": 1907, "args": { "External id": 107936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107936, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 107936, "pid": 5, "tid": 7, "ts": 1716454223054330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987106, "dur": 13, "args": { "External id": 107936, "cbid": 211, "correlation": 107936 } }, { "ph": "s", "id": 107936, "pid": 76337, "tid": -914061504, "ts": 1716454222987106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987206, "dur": 2, "args": { "External id": 107954, "cbid": 251, "correlation": 107954 } }, { "ph": "f", "id": 107954, "pid": 76337, "tid": -914061504, "ts": 1716454222987206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223056238, "dur": 148, "args": { "External id": 107956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107956, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 107956, "pid": 5, "tid": 7, "ts": 1716454223056238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987213, "dur": 13, "args": { "External id": 107956, "cbid": 211, "correlation": 107956 } }, { "ph": "s", "id": 107956, "pid": 76337, "tid": -914061504, "ts": 1716454222987213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223056388, "dur": 35, "args": { "External id": 107964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107964, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107964, "pid": 5, "tid": 7, "ts": 1716454223056388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987283, "dur": 13, "args": { "External id": 107964, "cbid": 211, "correlation": 107964 } }, { "ph": "s", "id": 107964, "pid": 76337, "tid": -914061504, "ts": 1716454222987283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223056424, "dur": 51, "args": { "External id": 107972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107972, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107972, "pid": 5, "tid": 7, "ts": 1716454223056424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987322, "dur": 8, "args": { "External id": 107972, "cbid": 211, "correlation": 107972 } }, { "ph": "s", "id": 107972, "pid": 76337, "tid": -914061504, "ts": 1716454222987322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223056477, "dur": 30, "args": { "External id": 107983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 107983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 107983, "pid": 5, "tid": 7, "ts": 1716454223056477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987392, "dur": 12, "args": { "External id": 107983, "cbid": 211, "correlation": 107983 } }, { "ph": "s", "id": 107983, "pid": 76337, "tid": -914061504, "ts": 1716454222987392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223056508, "dur": 33, "args": { "External id": 108005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108005, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108005, "pid": 5, "tid": 7, "ts": 1716454223056508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987423, "dur": 7, "args": { "External id": 108005, "cbid": 211, "correlation": 108005 } }, { "ph": "s", "id": 108005, "pid": 76337, "tid": -914061504, "ts": 1716454222987423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987508, "dur": 1, "args": { "External id": 108016, "cbid": 251, "correlation": 108016 } }, { "ph": "f", "id": 108016, "pid": 76337, "tid": -914061504, "ts": 1716454222987508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223056543, "dur": 75, "args": { "External id": 108017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108017, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108017, "pid": 5, "tid": 7, "ts": 1716454223056543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987513, "dur": 13, "args": { "External id": 108017, "cbid": 211, "correlation": 108017 } }, { "ph": "s", "id": 108017, "pid": 76337, "tid": -914061504, "ts": 1716454222987513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987583, "dur": 1, "args": { "External id": 108028, "cbid": 251, "correlation": 108028 } }, { "ph": "f", "id": 108028, "pid": 76337, "tid": -914061504, "ts": 1716454222987583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987587, "dur": 0, "args": { "External id": 108029, "cbid": 251, "correlation": 108029 } }, { "ph": "f", "id": 108029, "pid": 76337, "tid": -914061504, "ts": 1716454222987587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223056619, "dur": 11, "args": { "External id": 108030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108030, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 108030, "pid": 5, "tid": 7, "ts": 1716454223056619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987589, "dur": 12, "args": { "External id": 108030, "cbid": 211, "correlation": 108030 } }, { "ph": "s", "id": 108030, "pid": 76337, "tid": -914061504, "ts": 1716454222987589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223056631, "dur": 5, "args": { "External id": 108032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108032, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108032, "pid": 5, "tid": 7, "ts": 1716454223056631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987602, "dur": 6, "args": { "External id": 108032, "cbid": 211, "correlation": 108032 } }, { "ph": "s", "id": 108032, "pid": 76337, "tid": -914061504, "ts": 1716454222987602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987660, "dur": 1, "args": { "External id": 108043, "cbid": 251, "correlation": 108043 } }, { "ph": "f", "id": 108043, "pid": 76337, "tid": -914061504, "ts": 1716454222987660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987664, "dur": 0, "args": { "External id": 108044, "cbid": 251, "correlation": 108044 } }, { "ph": "f", "id": 108044, "pid": 76337, "tid": -914061504, "ts": 1716454222987664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223056638, "dur": 7, "args": { "External id": 108045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108045, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 108045, "pid": 5, "tid": 7, "ts": 1716454223056638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987665, "dur": 11, "args": { "External id": 108045, "cbid": 211, "correlation": 108045 } }, { "ph": "s", "id": 108045, "pid": 76337, "tid": -914061504, "ts": 1716454222987665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223056646, "dur": 3, "args": { "External id": 108047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108047, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108047, "pid": 5, "tid": 7, "ts": 1716454223056646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987678, "dur": 6, "args": { "External id": 108047, "cbid": 211, "correlation": 108047 } }, { "ph": "s", "id": 108047, "pid": 76337, "tid": -914061504, "ts": 1716454222987678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223056651, "dur": 89, "args": { "External id": 108068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108068, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 108068, "pid": 5, "tid": 7, "ts": 1716454223056651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987752, "dur": 12, "args": { "External id": 108068, "cbid": 211, "correlation": 108068 } }, { "ph": "s", "id": 108068, "pid": 76337, "tid": -914061504, "ts": 1716454222987752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222987849, "dur": 1, "args": { "External id": 108086, "cbid": 251, "correlation": 108086 } }, { "ph": "f", "id": 108086, "pid": 76337, "tid": -914061504, "ts": 1716454222987849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223056741, "dur": 96, "args": { "External id": 108088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108088, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108088, "pid": 5, "tid": 7, "ts": 1716454223056741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987855, "dur": 13, "args": { "External id": 108088, "cbid": 211, "correlation": 108088 } }, { "ph": "s", "id": 108088, "pid": 76337, "tid": -914061504, "ts": 1716454222987855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223056838, "dur": 19, "args": { "External id": 108096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108096, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108096, "pid": 5, "tid": 7, "ts": 1716454223056838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987924, "dur": 12, "args": { "External id": 108096, "cbid": 211, "correlation": 108096 } }, { "ph": "s", "id": 108096, "pid": 76337, "tid": -914061504, "ts": 1716454222987924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223056859, "dur": 37, "args": { "External id": 108104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108104, "pid": 5, "tid": 7, "ts": 1716454223056859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222987966, "dur": 17, "args": { "External id": 108104, "cbid": 211, "correlation": 108104 } }, { "ph": "s", "id": 108104, "pid": 76337, "tid": -914061504, "ts": 1716454222987966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223056897, "dur": 34, "args": { "External id": 108126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108126, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108126, "pid": 5, "tid": 7, "ts": 1716454223056897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988025, "dur": 11, "args": { "External id": 108126, "cbid": 211, "correlation": 108126 } }, { "ph": "s", "id": 108126, "pid": 76337, "tid": -914061504, "ts": 1716454222988025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222988116, "dur": 1, "args": { "External id": 108142, "cbid": 251, "correlation": 108142 } }, { "ph": "f", "id": 108142, "pid": 76337, "tid": -914061504, "ts": 1716454222988116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222988121, "dur": 0, "args": { "External id": 108144, "cbid": 251, "correlation": 108144 } }, { "ph": "f", "id": 108144, "pid": 76337, "tid": -914061504, "ts": 1716454222988121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223056932, "dur": 531, "args": { "External id": 108145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108145, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 108145, "pid": 5, "tid": 7, "ts": 1716454223056932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988126, "dur": 13, "args": { "External id": 108145, "cbid": 211, "correlation": 108145 } }, { "ph": "s", "id": 108145, "pid": 76337, "tid": -914061504, "ts": 1716454222988126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223057465, "dur": 123, "args": { "External id": 108153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108153, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108153, "pid": 5, "tid": 7, "ts": 1716454223057465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988192, "dur": 12, "args": { "External id": 108153, "cbid": 211, "correlation": 108153 } }, { "ph": "s", "id": 108153, "pid": 76337, "tid": -914061504, "ts": 1716454222988192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223057589, "dur": 128, "args": { "External id": 108161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108161, "pid": 5, "tid": 7, "ts": 1716454223057589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988222, "dur": 9, "args": { "External id": 108161, "cbid": 211, "correlation": 108161 } }, { "ph": "s", "id": 108161, "pid": 76337, "tid": -914061504, "ts": 1716454222988222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222988300, "dur": 1, "args": { "External id": 108177, "cbid": 251, "correlation": 108177 } }, { "ph": "f", "id": 108177, "pid": 76337, "tid": -914061504, "ts": 1716454222988300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223057718, "dur": 301, "args": { "External id": 108179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108179, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108179, "pid": 5, "tid": 7, "ts": 1716454223057718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988305, "dur": 12, "args": { "External id": 108179, "cbid": 211, "correlation": 108179 } }, { "ph": "s", "id": 108179, "pid": 76337, "tid": -914061504, "ts": 1716454222988305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223058021, "dur": 27, "args": { "External id": 108187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108187, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108187, "pid": 5, "tid": 7, "ts": 1716454223058021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988348, "dur": 10, "args": { "External id": 108187, "cbid": 211, "correlation": 108187 } }, { "ph": "s", "id": 108187, "pid": 76337, "tid": -914061504, "ts": 1716454222988348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223058049, "dur": 79, "args": { "External id": 108198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108198, "pid": 5, "tid": 7, "ts": 1716454223058049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988415, "dur": 12, "args": { "External id": 108198, "cbid": 211, "correlation": 108198 } }, { "ph": "s", "id": 108198, "pid": 76337, "tid": -914061504, "ts": 1716454222988415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222988479, "dur": 0, "args": { "External id": 108210, "cbid": 317, "correlation": 108210 } }, { "ph": "f", "id": 108210, "pid": 76337, "tid": -914061504, "ts": 1716454222988479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222988480, "dur": 0, "args": { "External id": 108211, "cbid": 203, "correlation": 108211 } }, { "ph": "f", "id": 108211, "pid": 76337, "tid": -914061504, "ts": 1716454222988480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222988481, "dur": 0, "args": { "External id": 108212, "cbid": 205, "correlation": 108212 } }, { "ph": "f", "id": 108212, "pid": 76337, "tid": -914061504, "ts": 1716454222988481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223058130, "dur": 22, "args": { "External id": 108216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108216, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108216, "pid": 5, "tid": 7, "ts": 1716454223058130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988496, "dur": 12, "args": { "External id": 108216, "cbid": 211, "correlation": 108216 } }, { "ph": "s", "id": 108216, "pid": 76337, "tid": -914061504, "ts": 1716454222988496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223058153, "dur": 117, "args": { "External id": 108218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108218, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108218, "pid": 5, "tid": 7, "ts": 1716454223058153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988514, "dur": 6, "args": { "External id": 108218, "cbid": 211, "correlation": 108218 } }, { "ph": "s", "id": 108218, "pid": 76337, "tid": -914061504, "ts": 1716454222988514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223058271, "dur": 23, "args": { "External id": 108220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108220, "pid": 5, "tid": 7, "ts": 1716454223058271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988525, "dur": 5, "args": { "External id": 108220, "cbid": 211, "correlation": 108220 } }, { "ph": "s", "id": 108220, "pid": 76337, "tid": -914061504, "ts": 1716454222988525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223058296, "dur": 33, "args": { "External id": 108226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108226, "pid": 5, "tid": 7, "ts": 1716454223058296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988553, "dur": 9, "args": { "External id": 108226, "cbid": 211, "correlation": 108226 } }, { "ph": "s", "id": 108226, "pid": 76337, "tid": -914061504, "ts": 1716454222988553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223058330, "dur": 27, "args": { "External id": 108234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108234, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108234, "pid": 5, "tid": 7, "ts": 1716454223058330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988586, "dur": 8, "args": { "External id": 108234, "cbid": 211, "correlation": 108234 } }, { "ph": "s", "id": 108234, "pid": 76337, "tid": -914061504, "ts": 1716454222988586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223058358, "dur": 53, "args": { "External id": 108243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108243, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108243, "pid": 5, "tid": 7, "ts": 1716454223058358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988624, "dur": 11, "args": { "External id": 108243, "cbid": 211, "correlation": 108243 } }, { "ph": "s", "id": 108243, "pid": 76337, "tid": -914061504, "ts": 1716454222988624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223058412, "dur": 52, "args": { "External id": 108263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108263, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 108263, "pid": 5, "tid": 7, "ts": 1716454223058412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988696, "dur": 11, "args": { "External id": 108263, "cbid": 211, "correlation": 108263 } }, { "ph": "s", "id": 108263, "pid": 76337, "tid": -914061504, "ts": 1716454222988696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223058466, "dur": 5, "args": { "External id": 108275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108275, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108275, "pid": 5, "tid": 7, "ts": 1716454223058466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988717, "dur": 6, "args": { "External id": 108275, "cbid": 211, "correlation": 108275 } }, { "ph": "s", "id": 108275, "pid": 76337, "tid": -914061504, "ts": 1716454222988717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223058472, "dur": 56, "args": { "External id": 108278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108278, "pid": 5, "tid": 7, "ts": 1716454223058472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988735, "dur": 7, "args": { "External id": 108278, "cbid": 211, "correlation": 108278 } }, { "ph": "s", "id": 108278, "pid": 76337, "tid": -914061504, "ts": 1716454222988735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223058530, "dur": 36, "args": { "External id": 108287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108287, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108287, "pid": 5, "tid": 7, "ts": 1716454223058530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988773, "dur": 10, "args": { "External id": 108287, "cbid": 211, "correlation": 108287 } }, { "ph": "s", "id": 108287, "pid": 76337, "tid": -914061504, "ts": 1716454222988773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222988825, "dur": 0, "args": { "External id": 108297, "cbid": 317, "correlation": 108297 } }, { "ph": "f", "id": 108297, "pid": 76337, "tid": -914061504, "ts": 1716454222988825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222988826, "dur": 0, "args": { "External id": 108298, "cbid": 203, "correlation": 108298 } }, { "ph": "f", "id": 108298, "pid": 76337, "tid": -914061504, "ts": 1716454222988826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222988827, "dur": 0, "args": { "External id": 108299, "cbid": 205, "correlation": 108299 } }, { "ph": "f", "id": 108299, "pid": 76337, "tid": -914061504, "ts": 1716454222988827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223058567, "dur": 39, "args": { "External id": 108303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108303, "pid": 5, "tid": 7, "ts": 1716454223058567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988841, "dur": 11, "args": { "External id": 108303, "cbid": 211, "correlation": 108303 } }, { "ph": "s", "id": 108303, "pid": 76337, "tid": -914061504, "ts": 1716454222988841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223058608, "dur": 81, "args": { "External id": 108305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108305, "pid": 5, "tid": 7, "ts": 1716454223058608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988855, "dur": 5, "args": { "External id": 108305, "cbid": 211, "correlation": 108305 } }, { "ph": "s", "id": 108305, "pid": 76337, "tid": -914061504, "ts": 1716454222988855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223058691, "dur": 1267, "args": { "External id": 108307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108307, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108307, "pid": 5, "tid": 7, "ts": 1716454223058691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988866, "dur": 6, "args": { "External id": 108307, "cbid": 211, "correlation": 108307 } }, { "ph": "s", "id": 108307, "pid": 76337, "tid": -914061504, "ts": 1716454222988866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223059958, "dur": 22, "args": { "External id": 108309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108309, "pid": 5, "tid": 7, "ts": 1716454223059958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988876, "dur": 5, "args": { "External id": 108309, "cbid": 211, "correlation": 108309 } }, { "ph": "s", "id": 108309, "pid": 76337, "tid": -914061504, "ts": 1716454222988876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223059982, "dur": 32, "args": { "External id": 108315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108315, "pid": 5, "tid": 7, "ts": 1716454223059982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988904, "dur": 8, "args": { "External id": 108315, "cbid": 211, "correlation": 108315 } }, { "ph": "s", "id": 108315, "pid": 76337, "tid": -914061504, "ts": 1716454222988904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223060015, "dur": 3, "args": { "External id": 108323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108323, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 108323, "pid": 5, "tid": 7, "ts": 1716454223060015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222988946, "dur": 9, "args": { "External id": 108323, "cbid": 211, "correlation": 108323 } }, { "ph": "s", "id": 108323, "pid": 76337, "tid": -914061504, "ts": 1716454222988946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222989017, "dur": 1, "args": { "External id": 108339, "cbid": 251, "correlation": 108339 } }, { "ph": "f", "id": 108339, "pid": 76337, "tid": -914061504, "ts": 1716454222989017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222989022, "dur": 0, "args": { "External id": 108341, "cbid": 251, "correlation": 108341 } }, { "ph": "f", "id": 108341, "pid": 76337, "tid": -914061504, "ts": 1716454222989022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223060020, "dur": 12, "args": { "External id": 108342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108342, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 108342, "pid": 5, "tid": 7, "ts": 1716454223060020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989024, "dur": 12, "args": { "External id": 108342, "cbid": 211, "correlation": 108342 } }, { "ph": "s", "id": 108342, "pid": 76337, "tid": -914061504, "ts": 1716454222989024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223060033, "dur": 5, "args": { "External id": 108344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108344, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108344, "pid": 5, "tid": 7, "ts": 1716454223060033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989038, "dur": 5, "args": { "External id": 108344, "cbid": 211, "correlation": 108344 } }, { "ph": "s", "id": 108344, "pid": 76337, "tid": -914061504, "ts": 1716454222989038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223060040, "dur": 30, "args": { "External id": 108354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108354, "pid": 5, "tid": 7, "ts": 1716454223060040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989095, "dur": 12, "args": { "External id": 108354, "cbid": 211, "correlation": 108354 } }, { "ph": "s", "id": 108354, "pid": 76337, "tid": -914061504, "ts": 1716454222989095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223060071, "dur": 30, "args": { "External id": 108374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108374, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 108374, "pid": 5, "tid": 7, "ts": 1716454223060071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989160, "dur": 11, "args": { "External id": 108374, "cbid": 211, "correlation": 108374 } }, { "ph": "s", "id": 108374, "pid": 76337, "tid": -914061504, "ts": 1716454222989160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223060102, "dur": 4, "args": { "External id": 108386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108386, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 108386, "pid": 5, "tid": 7, "ts": 1716454223060102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989180, "dur": 6, "args": { "External id": 108386, "cbid": 211, "correlation": 108386 } }, { "ph": "s", "id": 108386, "pid": 76337, "tid": -914061504, "ts": 1716454222989180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223060108, "dur": 30, "args": { "External id": 108389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108389, "pid": 5, "tid": 7, "ts": 1716454223060108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989199, "dur": 6, "args": { "External id": 108389, "cbid": 211, "correlation": 108389 } }, { "ph": "s", "id": 108389, "pid": 76337, "tid": -914061504, "ts": 1716454222989199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223060139, "dur": 21, "args": { "External id": 108398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108398, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108398, "pid": 5, "tid": 7, "ts": 1716454223060139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989240, "dur": 9, "args": { "External id": 108398, "cbid": 211, "correlation": 108398 } }, { "ph": "s", "id": 108398, "pid": 76337, "tid": -914061504, "ts": 1716454222989240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222989302, "dur": 0, "args": { "External id": 108408, "cbid": 317, "correlation": 108408 } }, { "ph": "f", "id": 108408, "pid": 76337, "tid": -914061504, "ts": 1716454222989302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222989303, "dur": 0, "args": { "External id": 108409, "cbid": 203, "correlation": 108409 } }, { "ph": "f", "id": 108409, "pid": 76337, "tid": -914061504, "ts": 1716454222989303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222989303, "dur": 0, "args": { "External id": 108410, "cbid": 205, "correlation": 108410 } }, { "ph": "f", "id": 108410, "pid": 76337, "tid": -914061504, "ts": 1716454222989303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223060161, "dur": 22, "args": { "External id": 108414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108414, "pid": 5, "tid": 7, "ts": 1716454223060161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989317, "dur": 11, "args": { "External id": 108414, "cbid": 211, "correlation": 108414 } }, { "ph": "s", "id": 108414, "pid": 76337, "tid": -914061504, "ts": 1716454222989317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223060184, "dur": 43, "args": { "External id": 108416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108416, "pid": 5, "tid": 7, "ts": 1716454223060184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989331, "dur": 6, "args": { "External id": 108416, "cbid": 211, "correlation": 108416 } }, { "ph": "s", "id": 108416, "pid": 76337, "tid": -914061504, "ts": 1716454222989331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223060229, "dur": 638, "args": { "External id": 108418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108418, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108418, "pid": 5, "tid": 7, "ts": 1716454223060229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989344, "dur": 6, "args": { "External id": 108418, "cbid": 211, "correlation": 108418 } }, { "ph": "s", "id": 108418, "pid": 76337, "tid": -914061504, "ts": 1716454222989344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223060868, "dur": 21, "args": { "External id": 108420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108420, "pid": 5, "tid": 7, "ts": 1716454223060868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989353, "dur": 5, "args": { "External id": 108420, "cbid": 211, "correlation": 108420 } }, { "ph": "s", "id": 108420, "pid": 76337, "tid": -914061504, "ts": 1716454222989353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223060891, "dur": 32, "args": { "External id": 108426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108426, "pid": 5, "tid": 7, "ts": 1716454223060891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989380, "dur": 10, "args": { "External id": 108426, "cbid": 211, "correlation": 108426 } }, { "ph": "s", "id": 108426, "pid": 76337, "tid": -914061504, "ts": 1716454222989380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222989440, "dur": 0, "args": { "External id": 108436, "cbid": 317, "correlation": 108436 } }, { "ph": "f", "id": 108436, "pid": 76337, "tid": -914061504, "ts": 1716454222989440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222989441, "dur": 0, "args": { "External id": 108437, "cbid": 203, "correlation": 108437 } }, { "ph": "f", "id": 108437, "pid": 76337, "tid": -914061504, "ts": 1716454222989441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222989442, "dur": 0, "args": { "External id": 108438, "cbid": 205, "correlation": 108438 } }, { "ph": "f", "id": 108438, "pid": 76337, "tid": -914061504, "ts": 1716454222989442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223060925, "dur": 38, "args": { "External id": 108442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108442, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108442, "pid": 5, "tid": 7, "ts": 1716454223060925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989453, "dur": 13, "args": { "External id": 108442, "cbid": 211, "correlation": 108442 } }, { "ph": "s", "id": 108442, "pid": 76337, "tid": -914061504, "ts": 1716454222989453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223060964, "dur": 186, "args": { "External id": 108444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108444, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108444, "pid": 5, "tid": 7, "ts": 1716454223060964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989472, "dur": 6, "args": { "External id": 108444, "cbid": 211, "correlation": 108444 } }, { "ph": "s", "id": 108444, "pid": 76337, "tid": -914061504, "ts": 1716454222989472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223061152, "dur": 21, "args": { "External id": 108446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108446, "pid": 5, "tid": 7, "ts": 1716454223061152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989481, "dur": 6, "args": { "External id": 108446, "cbid": 211, "correlation": 108446 } }, { "ph": "s", "id": 108446, "pid": 76337, "tid": -914061504, "ts": 1716454222989481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223061174, "dur": 32, "args": { "External id": 108452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108452, "pid": 5, "tid": 7, "ts": 1716454223061174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989508, "dur": 8, "args": { "External id": 108452, "cbid": 211, "correlation": 108452 } }, { "ph": "s", "id": 108452, "pid": 76337, "tid": -914061504, "ts": 1716454222989508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223061208, "dur": 27, "args": { "External id": 108460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108460, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108460, "pid": 5, "tid": 7, "ts": 1716454223061208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989537, "dur": 8, "args": { "External id": 108460, "cbid": 211, "correlation": 108460 } }, { "ph": "s", "id": 108460, "pid": 76337, "tid": -914061504, "ts": 1716454222989537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223061236, "dur": 20, "args": { "External id": 108468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108468, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108468, "pid": 5, "tid": 7, "ts": 1716454223061236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989565, "dur": 8, "args": { "External id": 108468, "cbid": 211, "correlation": 108468 } }, { "ph": "s", "id": 108468, "pid": 76337, "tid": -914061504, "ts": 1716454222989565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223061257, "dur": 30, "args": { "External id": 108488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108488, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 108488, "pid": 5, "tid": 7, "ts": 1716454223061257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989648, "dur": 12, "args": { "External id": 108488, "cbid": 211, "correlation": 108488 } }, { "ph": "s", "id": 108488, "pid": 76337, "tid": -914061504, "ts": 1716454222989648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223061288, "dur": 5, "args": { "External id": 108500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108500, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 108500, "pid": 5, "tid": 7, "ts": 1716454223061288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989670, "dur": 6, "args": { "External id": 108500, "cbid": 211, "correlation": 108500 } }, { "ph": "s", "id": 108500, "pid": 76337, "tid": -914061504, "ts": 1716454222989670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223061294, "dur": 29, "args": { "External id": 108503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108503, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108503, "pid": 5, "tid": 7, "ts": 1716454223061294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989688, "dur": 7, "args": { "External id": 108503, "cbid": 211, "correlation": 108503 } }, { "ph": "s", "id": 108503, "pid": 76337, "tid": -914061504, "ts": 1716454222989688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222989744, "dur": 0, "args": { "External id": 108514, "cbid": 317, "correlation": 108514 } }, { "ph": "f", "id": 108514, "pid": 76337, "tid": -914061504, "ts": 1716454222989744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222989745, "dur": 0, "args": { "External id": 108515, "cbid": 203, "correlation": 108515 } }, { "ph": "f", "id": 108515, "pid": 76337, "tid": -914061504, "ts": 1716454222989745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222989745, "dur": 0, "args": { "External id": 108516, "cbid": 205, "correlation": 108516 } }, { "ph": "f", "id": 108516, "pid": 76337, "tid": -914061504, "ts": 1716454222989745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223061325, "dur": 23, "args": { "External id": 108520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108520, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108520, "pid": 5, "tid": 7, "ts": 1716454223061325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989758, "dur": 12, "args": { "External id": 108520, "cbid": 211, "correlation": 108520 } }, { "ph": "s", "id": 108520, "pid": 76337, "tid": -914061504, "ts": 1716454222989758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223061349, "dur": 103, "args": { "External id": 108522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108522, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108522, "pid": 5, "tid": 7, "ts": 1716454223061349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989775, "dur": 6, "args": { "External id": 108522, "cbid": 211, "correlation": 108522 } }, { "ph": "s", "id": 108522, "pid": 76337, "tid": -914061504, "ts": 1716454222989775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223061454, "dur": 23, "args": { "External id": 108524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108524, "pid": 5, "tid": 7, "ts": 1716454223061454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989785, "dur": 6, "args": { "External id": 108524, "cbid": 211, "correlation": 108524 } }, { "ph": "s", "id": 108524, "pid": 76337, "tid": -914061504, "ts": 1716454222989785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223061478, "dur": 33, "args": { "External id": 108530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108530, "pid": 5, "tid": 7, "ts": 1716454223061478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989813, "dur": 9, "args": { "External id": 108530, "cbid": 211, "correlation": 108530 } }, { "ph": "s", "id": 108530, "pid": 76337, "tid": -914061504, "ts": 1716454222989813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223061512, "dur": 176, "args": { "External id": 108539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108539, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108539, "pid": 5, "tid": 7, "ts": 1716454223061512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989897, "dur": 14, "args": { "External id": 108539, "cbid": 211, "correlation": 108539 } }, { "ph": "s", "id": 108539, "pid": 76337, "tid": -914061504, "ts": 1716454222989897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223061688, "dur": 64, "args": { "External id": 108561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108561, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108561, "pid": 5, "tid": 7, "ts": 1716454223061688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222989955, "dur": 10, "args": { "External id": 108561, "cbid": 211, "correlation": 108561 } }, { "ph": "s", "id": 108561, "pid": 76337, "tid": -914061504, "ts": 1716454222989955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990054, "dur": 1, "args": { "External id": 108572, "cbid": 251, "correlation": 108572 } }, { "ph": "f", "id": 108572, "pid": 76337, "tid": -914061504, "ts": 1716454222990054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223061754, "dur": 151, "args": { "External id": 108573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108573, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108573, "pid": 5, "tid": 7, "ts": 1716454223061754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990059, "dur": 13, "args": { "External id": 108573, "cbid": 211, "correlation": 108573 } }, { "ph": "s", "id": 108573, "pid": 76337, "tid": -914061504, "ts": 1716454222990059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990129, "dur": 1, "args": { "External id": 108584, "cbid": 251, "correlation": 108584 } }, { "ph": "f", "id": 108584, "pid": 76337, "tid": -914061504, "ts": 1716454222990129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223061906, "dur": 145, "args": { "External id": 108585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108585, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108585, "pid": 5, "tid": 7, "ts": 1716454223061906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990133, "dur": 12, "args": { "External id": 108585, "cbid": 211, "correlation": 108585 } }, { "ph": "s", "id": 108585, "pid": 76337, "tid": -914061504, "ts": 1716454222990133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990198, "dur": 1, "args": { "External id": 108596, "cbid": 251, "correlation": 108596 } }, { "ph": "f", "id": 108596, "pid": 76337, "tid": -914061504, "ts": 1716454222990198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223062053, "dur": 141, "args": { "External id": 108597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108597, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108597, "pid": 5, "tid": 7, "ts": 1716454223062053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990202, "dur": 11, "args": { "External id": 108597, "cbid": 211, "correlation": 108597 } }, { "ph": "s", "id": 108597, "pid": 76337, "tid": -914061504, "ts": 1716454222990202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223062196, "dur": 1906, "args": { "External id": 108618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108618, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 108618, "pid": 5, "tid": 7, "ts": 1716454223062196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990281, "dur": 14, "args": { "External id": 108618, "cbid": 211, "correlation": 108618 } }, { "ph": "s", "id": 108618, "pid": 76337, "tid": -914061504, "ts": 1716454222990281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990379, "dur": 1, "args": { "External id": 108636, "cbid": 251, "correlation": 108636 } }, { "ph": "f", "id": 108636, "pid": 76337, "tid": -914061504, "ts": 1716454222990379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223064103, "dur": 146, "args": { "External id": 108638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108638, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 108638, "pid": 5, "tid": 7, "ts": 1716454223064103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990385, "dur": 14, "args": { "External id": 108638, "cbid": 211, "correlation": 108638 } }, { "ph": "s", "id": 108638, "pid": 76337, "tid": -914061504, "ts": 1716454222990385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223064250, "dur": 36, "args": { "External id": 108646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108646, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108646, "pid": 5, "tid": 7, "ts": 1716454223064250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990456, "dur": 12, "args": { "External id": 108646, "cbid": 211, "correlation": 108646 } }, { "ph": "s", "id": 108646, "pid": 76337, "tid": -914061504, "ts": 1716454222990456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223064287, "dur": 50, "args": { "External id": 108654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108654, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108654, "pid": 5, "tid": 7, "ts": 1716454223064287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990495, "dur": 8, "args": { "External id": 108654, "cbid": 211, "correlation": 108654 } }, { "ph": "s", "id": 108654, "pid": 76337, "tid": -914061504, "ts": 1716454222990495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223064339, "dur": 29, "args": { "External id": 108665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108665, "pid": 5, "tid": 7, "ts": 1716454223064339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990564, "dur": 13, "args": { "External id": 108665, "cbid": 211, "correlation": 108665 } }, { "ph": "s", "id": 108665, "pid": 76337, "tid": -914061504, "ts": 1716454222990564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223064369, "dur": 34, "args": { "External id": 108687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108687, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108687, "pid": 5, "tid": 7, "ts": 1716454223064369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990597, "dur": 7, "args": { "External id": 108687, "cbid": 211, "correlation": 108687 } }, { "ph": "s", "id": 108687, "pid": 76337, "tid": -914061504, "ts": 1716454222990597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990679, "dur": 1, "args": { "External id": 108698, "cbid": 251, "correlation": 108698 } }, { "ph": "f", "id": 108698, "pid": 76337, "tid": -914061504, "ts": 1716454222990679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223064404, "dur": 88, "args": { "External id": 108699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108699, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108699, "pid": 5, "tid": 7, "ts": 1716454223064404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990684, "dur": 14, "args": { "External id": 108699, "cbid": 211, "correlation": 108699 } }, { "ph": "s", "id": 108699, "pid": 76337, "tid": -914061504, "ts": 1716454222990684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990753, "dur": 1, "args": { "External id": 108710, "cbid": 251, "correlation": 108710 } }, { "ph": "f", "id": 108710, "pid": 76337, "tid": -914061504, "ts": 1716454222990753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990757, "dur": 0, "args": { "External id": 108711, "cbid": 251, "correlation": 108711 } }, { "ph": "f", "id": 108711, "pid": 76337, "tid": -914061504, "ts": 1716454222990757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223064493, "dur": 11, "args": { "External id": 108712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108712, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 108712, "pid": 5, "tid": 7, "ts": 1716454223064493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990759, "dur": 12, "args": { "External id": 108712, "cbid": 211, "correlation": 108712 } }, { "ph": "s", "id": 108712, "pid": 76337, "tid": -914061504, "ts": 1716454222990759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223064506, "dur": 5, "args": { "External id": 108714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108714, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108714, "pid": 5, "tid": 7, "ts": 1716454223064506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990772, "dur": 6, "args": { "External id": 108714, "cbid": 211, "correlation": 108714 } }, { "ph": "s", "id": 108714, "pid": 76337, "tid": -914061504, "ts": 1716454222990772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990830, "dur": 1, "args": { "External id": 108725, "cbid": 251, "correlation": 108725 } }, { "ph": "f", "id": 108725, "pid": 76337, "tid": -914061504, "ts": 1716454222990830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222990834, "dur": 0, "args": { "External id": 108726, "cbid": 251, "correlation": 108726 } }, { "ph": "f", "id": 108726, "pid": 76337, "tid": -914061504, "ts": 1716454222990834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223064512, "dur": 7, "args": { "External id": 108727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108727, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 108727, "pid": 5, "tid": 7, "ts": 1716454223064512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990835, "dur": 12, "args": { "External id": 108727, "cbid": 211, "correlation": 108727 } }, { "ph": "s", "id": 108727, "pid": 76337, "tid": -914061504, "ts": 1716454222990835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223064521, "dur": 3, "args": { "External id": 108729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108729, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 108729, "pid": 5, "tid": 7, "ts": 1716454223064521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990849, "dur": 5, "args": { "External id": 108729, "cbid": 211, "correlation": 108729 } }, { "ph": "s", "id": 108729, "pid": 76337, "tid": -914061504, "ts": 1716454222990849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223064526, "dur": 89, "args": { "External id": 108750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108750, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 108750, "pid": 5, "tid": 7, "ts": 1716454223064526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222990921, "dur": 12, "args": { "External id": 108750, "cbid": 211, "correlation": 108750 } }, { "ph": "s", "id": 108750, "pid": 76337, "tid": -914061504, "ts": 1716454222990921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222991026, "dur": 1, "args": { "External id": 108768, "cbid": 251, "correlation": 108768 } }, { "ph": "f", "id": 108768, "pid": 76337, "tid": -914061504, "ts": 1716454222991026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223064616, "dur": 82, "args": { "External id": 108770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108770, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108770, "pid": 5, "tid": 7, "ts": 1716454223064616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991032, "dur": 14, "args": { "External id": 108770, "cbid": 211, "correlation": 108770 } }, { "ph": "s", "id": 108770, "pid": 76337, "tid": -914061504, "ts": 1716454222991032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223064700, "dur": 20, "args": { "External id": 108778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108778, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108778, "pid": 5, "tid": 7, "ts": 1716454223064700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991103, "dur": 12, "args": { "External id": 108778, "cbid": 211, "correlation": 108778 } }, { "ph": "s", "id": 108778, "pid": 76337, "tid": -914061504, "ts": 1716454222991103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223064721, "dur": 37, "args": { "External id": 108786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108786, "pid": 5, "tid": 7, "ts": 1716454223064721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991146, "dur": 9, "args": { "External id": 108786, "cbid": 211, "correlation": 108786 } }, { "ph": "s", "id": 108786, "pid": 76337, "tid": -914061504, "ts": 1716454222991146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223064759, "dur": 34, "args": { "External id": 108808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108808, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108808, "pid": 5, "tid": 7, "ts": 1716454223064759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991198, "dur": 10, "args": { "External id": 108808, "cbid": 211, "correlation": 108808 } }, { "ph": "s", "id": 108808, "pid": 76337, "tid": -914061504, "ts": 1716454222991198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222991288, "dur": 1, "args": { "External id": 108824, "cbid": 251, "correlation": 108824 } }, { "ph": "f", "id": 108824, "pid": 76337, "tid": -914061504, "ts": 1716454222991288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222991293, "dur": 0, "args": { "External id": 108826, "cbid": 251, "correlation": 108826 } }, { "ph": "f", "id": 108826, "pid": 76337, "tid": -914061504, "ts": 1716454222991293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223064794, "dur": 528, "args": { "External id": 108827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108827, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 108827, "pid": 5, "tid": 7, "ts": 1716454223064794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991297, "dur": 13, "args": { "External id": 108827, "cbid": 211, "correlation": 108827 } }, { "ph": "s", "id": 108827, "pid": 76337, "tid": -914061504, "ts": 1716454222991297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223065324, "dur": 122, "args": { "External id": 108835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108835, "pid": 5, "tid": 7, "ts": 1716454223065324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991362, "dur": 12, "args": { "External id": 108835, "cbid": 211, "correlation": 108835 } }, { "ph": "s", "id": 108835, "pid": 76337, "tid": -914061504, "ts": 1716454222991362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223065447, "dur": 128, "args": { "External id": 108843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108843, "pid": 5, "tid": 7, "ts": 1716454223065447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991393, "dur": 8, "args": { "External id": 108843, "cbid": 211, "correlation": 108843 } }, { "ph": "s", "id": 108843, "pid": 76337, "tid": -914061504, "ts": 1716454222991393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222991469, "dur": 1, "args": { "External id": 108859, "cbid": 251, "correlation": 108859 } }, { "ph": "f", "id": 108859, "pid": 76337, "tid": -914061504, "ts": 1716454222991469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223065576, "dur": 305, "args": { "External id": 108861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108861, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108861, "pid": 5, "tid": 7, "ts": 1716454223065576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991475, "dur": 13, "args": { "External id": 108861, "cbid": 211, "correlation": 108861 } }, { "ph": "s", "id": 108861, "pid": 76337, "tid": -914061504, "ts": 1716454222991475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223065882, "dur": 27, "args": { "External id": 108869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108869, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108869, "pid": 5, "tid": 7, "ts": 1716454223065882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991517, "dur": 10, "args": { "External id": 108869, "cbid": 211, "correlation": 108869 } }, { "ph": "s", "id": 108869, "pid": 76337, "tid": -914061504, "ts": 1716454222991517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223065910, "dur": 80, "args": { "External id": 108880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108880, "pid": 5, "tid": 7, "ts": 1716454223065910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991586, "dur": 13, "args": { "External id": 108880, "cbid": 211, "correlation": 108880 } }, { "ph": "s", "id": 108880, "pid": 76337, "tid": -914061504, "ts": 1716454222991586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222991650, "dur": 0, "args": { "External id": 108892, "cbid": 317, "correlation": 108892 } }, { "ph": "f", "id": 108892, "pid": 76337, "tid": -914061504, "ts": 1716454222991650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222991651, "dur": 0, "args": { "External id": 108893, "cbid": 203, "correlation": 108893 } }, { "ph": "f", "id": 108893, "pid": 76337, "tid": -914061504, "ts": 1716454222991651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222991652, "dur": 0, "args": { "External id": 108894, "cbid": 205, "correlation": 108894 } }, { "ph": "f", "id": 108894, "pid": 76337, "tid": -914061504, "ts": 1716454222991652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223065991, "dur": 24, "args": { "External id": 108898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108898, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108898, "pid": 5, "tid": 7, "ts": 1716454223065991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991666, "dur": 12, "args": { "External id": 108898, "cbid": 211, "correlation": 108898 } }, { "ph": "s", "id": 108898, "pid": 76337, "tid": -914061504, "ts": 1716454222991666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223066016, "dur": 117, "args": { "External id": 108900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108900, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108900, "pid": 5, "tid": 7, "ts": 1716454223066016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991685, "dur": 7, "args": { "External id": 108900, "cbid": 211, "correlation": 108900 } }, { "ph": "s", "id": 108900, "pid": 76337, "tid": -914061504, "ts": 1716454222991685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223066135, "dur": 23, "args": { "External id": 108902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108902, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108902, "pid": 5, "tid": 7, "ts": 1716454223066135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991696, "dur": 5, "args": { "External id": 108902, "cbid": 211, "correlation": 108902 } }, { "ph": "s", "id": 108902, "pid": 76337, "tid": -914061504, "ts": 1716454222991696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223066160, "dur": 33, "args": { "External id": 108908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108908, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108908, "pid": 5, "tid": 7, "ts": 1716454223066160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991723, "dur": 8, "args": { "External id": 108908, "cbid": 211, "correlation": 108908 } }, { "ph": "s", "id": 108908, "pid": 76337, "tid": -914061504, "ts": 1716454222991723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223066194, "dur": 26, "args": { "External id": 108916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108916, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108916, "pid": 5, "tid": 7, "ts": 1716454223066194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991755, "dur": 8, "args": { "External id": 108916, "cbid": 211, "correlation": 108916 } }, { "ph": "s", "id": 108916, "pid": 76337, "tid": -914061504, "ts": 1716454222991755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223066222, "dur": 43, "args": { "External id": 108925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108925, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108925, "pid": 5, "tid": 7, "ts": 1716454223066222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991795, "dur": 10, "args": { "External id": 108925, "cbid": 211, "correlation": 108925 } }, { "ph": "s", "id": 108925, "pid": 76337, "tid": -914061504, "ts": 1716454222991795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223066266, "dur": 44, "args": { "External id": 108945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108945, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 108945, "pid": 5, "tid": 7, "ts": 1716454223066266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991865, "dur": 11, "args": { "External id": 108945, "cbid": 211, "correlation": 108945 } }, { "ph": "s", "id": 108945, "pid": 76337, "tid": -914061504, "ts": 1716454222991865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223066311, "dur": 5, "args": { "External id": 108957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108957, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 108957, "pid": 5, "tid": 7, "ts": 1716454223066311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991886, "dur": 7, "args": { "External id": 108957, "cbid": 211, "correlation": 108957 } }, { "ph": "s", "id": 108957, "pid": 76337, "tid": -914061504, "ts": 1716454222991886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223066317, "dur": 42, "args": { "External id": 108960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108960, "pid": 5, "tid": 7, "ts": 1716454223066317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991905, "dur": 7, "args": { "External id": 108960, "cbid": 211, "correlation": 108960 } }, { "ph": "s", "id": 108960, "pid": 76337, "tid": -914061504, "ts": 1716454222991905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223066361, "dur": 30, "args": { "External id": 108969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108969, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108969, "pid": 5, "tid": 7, "ts": 1716454223066361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222991945, "dur": 10, "args": { "External id": 108969, "cbid": 211, "correlation": 108969 } }, { "ph": "s", "id": 108969, "pid": 76337, "tid": -914061504, "ts": 1716454222991945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222992005, "dur": 0, "args": { "External id": 108979, "cbid": 317, "correlation": 108979 } }, { "ph": "f", "id": 108979, "pid": 76337, "tid": -914061504, "ts": 1716454222992005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222992006, "dur": 0, "args": { "External id": 108980, "cbid": 203, "correlation": 108980 } }, { "ph": "f", "id": 108980, "pid": 76337, "tid": -914061504, "ts": 1716454222992006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222992007, "dur": 0, "args": { "External id": 108981, "cbid": 205, "correlation": 108981 } }, { "ph": "f", "id": 108981, "pid": 76337, "tid": -914061504, "ts": 1716454222992007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223066392, "dur": 30, "args": { "External id": 108985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108985, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108985, "pid": 5, "tid": 7, "ts": 1716454223066392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992022, "dur": 12, "args": { "External id": 108985, "cbid": 211, "correlation": 108985 } }, { "ph": "s", "id": 108985, "pid": 76337, "tid": -914061504, "ts": 1716454222992022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223066423, "dur": 62, "args": { "External id": 108987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108987, "pid": 5, "tid": 7, "ts": 1716454223066423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992036, "dur": 6, "args": { "External id": 108987, "cbid": 211, "correlation": 108987 } }, { "ph": "s", "id": 108987, "pid": 76337, "tid": -914061504, "ts": 1716454222992036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223066487, "dur": 958, "args": { "External id": 108989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108989, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 108989, "pid": 5, "tid": 7, "ts": 1716454223066487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992048, "dur": 6, "args": { "External id": 108989, "cbid": 211, "correlation": 108989 } }, { "ph": "s", "id": 108989, "pid": 76337, "tid": -914061504, "ts": 1716454222992048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223067446, "dur": 20, "args": { "External id": 108991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108991, "pid": 5, "tid": 7, "ts": 1716454223067446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992058, "dur": 5, "args": { "External id": 108991, "cbid": 211, "correlation": 108991 } }, { "ph": "s", "id": 108991, "pid": 76337, "tid": -914061504, "ts": 1716454222992058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223067468, "dur": 33, "args": { "External id": 108997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 108997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 108997, "pid": 5, "tid": 7, "ts": 1716454223067468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992085, "dur": 9, "args": { "External id": 108997, "cbid": 211, "correlation": 108997 } }, { "ph": "s", "id": 108997, "pid": 76337, "tid": -914061504, "ts": 1716454222992085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223067502, "dur": 3, "args": { "External id": 109005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109005, "pid": 5, "tid": 7, "ts": 1716454223067502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992129, "dur": 10, "args": { "External id": 109005, "cbid": 211, "correlation": 109005 } }, { "ph": "s", "id": 109005, "pid": 76337, "tid": -914061504, "ts": 1716454222992129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222992194, "dur": 1, "args": { "External id": 109021, "cbid": 251, "correlation": 109021 } }, { "ph": "f", "id": 109021, "pid": 76337, "tid": -914061504, "ts": 1716454222992194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222992200, "dur": 0, "args": { "External id": 109023, "cbid": 251, "correlation": 109023 } }, { "ph": "f", "id": 109023, "pid": 76337, "tid": -914061504, "ts": 1716454222992200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223067507, "dur": 12, "args": { "External id": 109024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109024, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 109024, "pid": 5, "tid": 7, "ts": 1716454223067507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992201, "dur": 11, "args": { "External id": 109024, "cbid": 211, "correlation": 109024 } }, { "ph": "s", "id": 109024, "pid": 76337, "tid": -914061504, "ts": 1716454222992201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223067521, "dur": 5, "args": { "External id": 109026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 109026, "pid": 5, "tid": 7, "ts": 1716454223067521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992214, "dur": 5, "args": { "External id": 109026, "cbid": 211, "correlation": 109026 } }, { "ph": "s", "id": 109026, "pid": 76337, "tid": -914061504, "ts": 1716454222992214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223067527, "dur": 28, "args": { "External id": 109036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109036, "pid": 5, "tid": 7, "ts": 1716454223067527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992272, "dur": 12, "args": { "External id": 109036, "cbid": 211, "correlation": 109036 } }, { "ph": "s", "id": 109036, "pid": 76337, "tid": -914061504, "ts": 1716454222992272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223067556, "dur": 31, "args": { "External id": 109056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109056, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 109056, "pid": 5, "tid": 7, "ts": 1716454223067556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992337, "dur": 11, "args": { "External id": 109056, "cbid": 211, "correlation": 109056 } }, { "ph": "s", "id": 109056, "pid": 76337, "tid": -914061504, "ts": 1716454222992337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223067589, "dur": 5, "args": { "External id": 109068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109068, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 109068, "pid": 5, "tid": 7, "ts": 1716454223067589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992357, "dur": 6, "args": { "External id": 109068, "cbid": 211, "correlation": 109068 } }, { "ph": "s", "id": 109068, "pid": 76337, "tid": -914061504, "ts": 1716454222992357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223067595, "dur": 30, "args": { "External id": 109071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109071, "pid": 5, "tid": 7, "ts": 1716454223067595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992375, "dur": 7, "args": { "External id": 109071, "cbid": 211, "correlation": 109071 } }, { "ph": "s", "id": 109071, "pid": 76337, "tid": -914061504, "ts": 1716454222992375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223067626, "dur": 21, "args": { "External id": 109080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109080, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109080, "pid": 5, "tid": 7, "ts": 1716454223067626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992416, "dur": 10, "args": { "External id": 109080, "cbid": 211, "correlation": 109080 } }, { "ph": "s", "id": 109080, "pid": 76337, "tid": -914061504, "ts": 1716454222992416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222992479, "dur": 0, "args": { "External id": 109090, "cbid": 317, "correlation": 109090 } }, { "ph": "f", "id": 109090, "pid": 76337, "tid": -914061504, "ts": 1716454222992479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222992480, "dur": 0, "args": { "External id": 109091, "cbid": 203, "correlation": 109091 } }, { "ph": "f", "id": 109091, "pid": 76337, "tid": -914061504, "ts": 1716454222992480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222992480, "dur": 0, "args": { "External id": 109092, "cbid": 205, "correlation": 109092 } }, { "ph": "f", "id": 109092, "pid": 76337, "tid": -914061504, "ts": 1716454222992480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223067648, "dur": 23, "args": { "External id": 109096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109096, "pid": 5, "tid": 7, "ts": 1716454223067648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992495, "dur": 12, "args": { "External id": 109096, "cbid": 211, "correlation": 109096 } }, { "ph": "s", "id": 109096, "pid": 76337, "tid": -914061504, "ts": 1716454222992495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223067672, "dur": 43, "args": { "External id": 109098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109098, "pid": 5, "tid": 7, "ts": 1716454223067672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992509, "dur": 5, "args": { "External id": 109098, "cbid": 211, "correlation": 109098 } }, { "ph": "s", "id": 109098, "pid": 76337, "tid": -914061504, "ts": 1716454222992509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223067717, "dur": 638, "args": { "External id": 109100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109100, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109100, "pid": 5, "tid": 7, "ts": 1716454223067717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992520, "dur": 5, "args": { "External id": 109100, "cbid": 211, "correlation": 109100 } }, { "ph": "s", "id": 109100, "pid": 76337, "tid": -914061504, "ts": 1716454222992520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223068356, "dur": 23, "args": { "External id": 109102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109102, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109102, "pid": 5, "tid": 7, "ts": 1716454223068356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992530, "dur": 5, "args": { "External id": 109102, "cbid": 211, "correlation": 109102 } }, { "ph": "s", "id": 109102, "pid": 76337, "tid": -914061504, "ts": 1716454222992530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223068380, "dur": 32, "args": { "External id": 109108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109108, "pid": 5, "tid": 7, "ts": 1716454223068380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992557, "dur": 8, "args": { "External id": 109108, "cbid": 211, "correlation": 109108 } }, { "ph": "s", "id": 109108, "pid": 76337, "tid": -914061504, "ts": 1716454222992557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222992614, "dur": 0, "args": { "External id": 109118, "cbid": 317, "correlation": 109118 } }, { "ph": "f", "id": 109118, "pid": 76337, "tid": -914061504, "ts": 1716454222992614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222992615, "dur": 0, "args": { "External id": 109119, "cbid": 203, "correlation": 109119 } }, { "ph": "f", "id": 109119, "pid": 76337, "tid": -914061504, "ts": 1716454222992615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222992616, "dur": 0, "args": { "External id": 109120, "cbid": 205, "correlation": 109120 } }, { "ph": "f", "id": 109120, "pid": 76337, "tid": -914061504, "ts": 1716454222992616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223068414, "dur": 30, "args": { "External id": 109124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109124, "pid": 5, "tid": 7, "ts": 1716454223068414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992630, "dur": 12, "args": { "External id": 109124, "cbid": 211, "correlation": 109124 } }, { "ph": "s", "id": 109124, "pid": 76337, "tid": -914061504, "ts": 1716454222992630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223068445, "dur": 150, "args": { "External id": 109126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109126, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109126, "pid": 5, "tid": 7, "ts": 1716454223068445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992648, "dur": 6, "args": { "External id": 109126, "cbid": 211, "correlation": 109126 } }, { "ph": "s", "id": 109126, "pid": 76337, "tid": -914061504, "ts": 1716454222992648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223068597, "dur": 24, "args": { "External id": 109128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109128, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109128, "pid": 5, "tid": 7, "ts": 1716454223068597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992657, "dur": 5, "args": { "External id": 109128, "cbid": 211, "correlation": 109128 } }, { "ph": "s", "id": 109128, "pid": 76337, "tid": -914061504, "ts": 1716454222992657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223068622, "dur": 32, "args": { "External id": 109134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109134, "pid": 5, "tid": 7, "ts": 1716454223068622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992683, "dur": 9, "args": { "External id": 109134, "cbid": 211, "correlation": 109134 } }, { "ph": "s", "id": 109134, "pid": 76337, "tid": -914061504, "ts": 1716454222992683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223068655, "dur": 27, "args": { "External id": 109142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109142, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109142, "pid": 5, "tid": 7, "ts": 1716454223068655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992712, "dur": 8, "args": { "External id": 109142, "cbid": 211, "correlation": 109142 } }, { "ph": "s", "id": 109142, "pid": 76337, "tid": -914061504, "ts": 1716454222992712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223068683, "dur": 19, "args": { "External id": 109150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109150, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109150, "pid": 5, "tid": 7, "ts": 1716454223068683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992741, "dur": 8, "args": { "External id": 109150, "cbid": 211, "correlation": 109150 } }, { "ph": "s", "id": 109150, "pid": 76337, "tid": -914061504, "ts": 1716454222992741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223068704, "dur": 30, "args": { "External id": 109170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109170, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 109170, "pid": 5, "tid": 7, "ts": 1716454223068704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992823, "dur": 13, "args": { "External id": 109170, "cbid": 211, "correlation": 109170 } }, { "ph": "s", "id": 109170, "pid": 76337, "tid": -914061504, "ts": 1716454222992823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223068735, "dur": 5, "args": { "External id": 109182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109182, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 109182, "pid": 5, "tid": 7, "ts": 1716454223068735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992847, "dur": 6, "args": { "External id": 109182, "cbid": 211, "correlation": 109182 } }, { "ph": "s", "id": 109182, "pid": 76337, "tid": -914061504, "ts": 1716454222992847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223068741, "dur": 30, "args": { "External id": 109185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109185, "pid": 5, "tid": 7, "ts": 1716454223068741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992864, "dur": 6, "args": { "External id": 109185, "cbid": 211, "correlation": 109185 } }, { "ph": "s", "id": 109185, "pid": 76337, "tid": -914061504, "ts": 1716454222992864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222992921, "dur": 0, "args": { "External id": 109196, "cbid": 317, "correlation": 109196 } }, { "ph": "f", "id": 109196, "pid": 76337, "tid": -914061504, "ts": 1716454222992921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222992922, "dur": 0, "args": { "External id": 109197, "cbid": 203, "correlation": 109197 } }, { "ph": "f", "id": 109197, "pid": 76337, "tid": -914061504, "ts": 1716454222992922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222992922, "dur": 0, "args": { "External id": 109198, "cbid": 205, "correlation": 109198 } }, { "ph": "f", "id": 109198, "pid": 76337, "tid": -914061504, "ts": 1716454222992922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223068773, "dur": 21, "args": { "External id": 109202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109202, "pid": 5, "tid": 7, "ts": 1716454223068773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992935, "dur": 12, "args": { "External id": 109202, "cbid": 211, "correlation": 109202 } }, { "ph": "s", "id": 109202, "pid": 76337, "tid": -914061504, "ts": 1716454222992935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223068795, "dur": 102, "args": { "External id": 109204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109204, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109204, "pid": 5, "tid": 7, "ts": 1716454223068795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992953, "dur": 6, "args": { "External id": 109204, "cbid": 211, "correlation": 109204 } }, { "ph": "s", "id": 109204, "pid": 76337, "tid": -914061504, "ts": 1716454222992953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223068899, "dur": 22, "args": { "External id": 109206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109206, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109206, "pid": 5, "tid": 7, "ts": 1716454223068899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992963, "dur": 5, "args": { "External id": 109206, "cbid": 211, "correlation": 109206 } }, { "ph": "s", "id": 109206, "pid": 76337, "tid": -914061504, "ts": 1716454222992963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223068922, "dur": 32, "args": { "External id": 109212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109212, "pid": 5, "tid": 7, "ts": 1716454223068922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222992998, "dur": 9, "args": { "External id": 109212, "cbid": 211, "correlation": 109212 } }, { "ph": "s", "id": 109212, "pid": 76337, "tid": -914061504, "ts": 1716454222992998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223068955, "dur": 170, "args": { "External id": 109221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109221, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109221, "pid": 5, "tid": 7, "ts": 1716454223068955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993080, "dur": 14, "args": { "External id": 109221, "cbid": 211, "correlation": 109221 } }, { "ph": "s", "id": 109221, "pid": 76337, "tid": -914061504, "ts": 1716454222993080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223069127, "dur": 64, "args": { "External id": 109243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109243, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109243, "pid": 5, "tid": 7, "ts": 1716454223069127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993137, "dur": 11, "args": { "External id": 109243, "cbid": 211, "correlation": 109243 } }, { "ph": "s", "id": 109243, "pid": 76337, "tid": -914061504, "ts": 1716454222993137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993224, "dur": 1, "args": { "External id": 109254, "cbid": 251, "correlation": 109254 } }, { "ph": "f", "id": 109254, "pid": 76337, "tid": -914061504, "ts": 1716454222993224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223069193, "dur": 147, "args": { "External id": 109255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109255, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109255, "pid": 5, "tid": 7, "ts": 1716454223069193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993230, "dur": 13, "args": { "External id": 109255, "cbid": 211, "correlation": 109255 } }, { "ph": "s", "id": 109255, "pid": 76337, "tid": -914061504, "ts": 1716454222993230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993300, "dur": 1, "args": { "External id": 109266, "cbid": 251, "correlation": 109266 } }, { "ph": "f", "id": 109266, "pid": 76337, "tid": -914061504, "ts": 1716454222993300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223069341, "dur": 141, "args": { "External id": 109267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109267, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109267, "pid": 5, "tid": 7, "ts": 1716454223069341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993303, "dur": 11, "args": { "External id": 109267, "cbid": 211, "correlation": 109267 } }, { "ph": "s", "id": 109267, "pid": 76337, "tid": -914061504, "ts": 1716454222993303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993369, "dur": 1, "args": { "External id": 109278, "cbid": 251, "correlation": 109278 } }, { "ph": "f", "id": 109278, "pid": 76337, "tid": -914061504, "ts": 1716454222993369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223069483, "dur": 144, "args": { "External id": 109279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109279, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109279, "pid": 5, "tid": 7, "ts": 1716454223069483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993373, "dur": 11, "args": { "External id": 109279, "cbid": 211, "correlation": 109279 } }, { "ph": "s", "id": 109279, "pid": 76337, "tid": -914061504, "ts": 1716454222993373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223069628, "dur": 1910, "args": { "External id": 109300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109300, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 109300, "pid": 5, "tid": 7, "ts": 1716454223069628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993455, "dur": 12, "args": { "External id": 109300, "cbid": 211, "correlation": 109300 } }, { "ph": "s", "id": 109300, "pid": 76337, "tid": -914061504, "ts": 1716454222993455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993553, "dur": 1, "args": { "External id": 109318, "cbid": 251, "correlation": 109318 } }, { "ph": "f", "id": 109318, "pid": 76337, "tid": -914061504, "ts": 1716454222993553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223071540, "dur": 145, "args": { "External id": 109320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109320, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 109320, "pid": 5, "tid": 7, "ts": 1716454223071540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993559, "dur": 13, "args": { "External id": 109320, "cbid": 211, "correlation": 109320 } }, { "ph": "s", "id": 109320, "pid": 76337, "tid": -914061504, "ts": 1716454222993559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223071686, "dur": 35, "args": { "External id": 109328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109328, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109328, "pid": 5, "tid": 7, "ts": 1716454223071686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993629, "dur": 13, "args": { "External id": 109328, "cbid": 211, "correlation": 109328 } }, { "ph": "s", "id": 109328, "pid": 76337, "tid": -914061504, "ts": 1716454222993629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223071723, "dur": 50, "args": { "External id": 109336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109336, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109336, "pid": 5, "tid": 7, "ts": 1716454223071723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993668, "dur": 8, "args": { "External id": 109336, "cbid": 211, "correlation": 109336 } }, { "ph": "s", "id": 109336, "pid": 76337, "tid": -914061504, "ts": 1716454222993668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223071774, "dur": 30, "args": { "External id": 109347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109347, "pid": 5, "tid": 7, "ts": 1716454223071774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993739, "dur": 12, "args": { "External id": 109347, "cbid": 211, "correlation": 109347 } }, { "ph": "s", "id": 109347, "pid": 76337, "tid": -914061504, "ts": 1716454222993739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223071806, "dur": 34, "args": { "External id": 109369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109369, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109369, "pid": 5, "tid": 7, "ts": 1716454223071806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993769, "dur": 8, "args": { "External id": 109369, "cbid": 211, "correlation": 109369 } }, { "ph": "s", "id": 109369, "pid": 76337, "tid": -914061504, "ts": 1716454222993769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993855, "dur": 1, "args": { "External id": 109380, "cbid": 251, "correlation": 109380 } }, { "ph": "f", "id": 109380, "pid": 76337, "tid": -914061504, "ts": 1716454222993855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223071841, "dur": 89, "args": { "External id": 109381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109381, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109381, "pid": 5, "tid": 7, "ts": 1716454223071841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993860, "dur": 13, "args": { "External id": 109381, "cbid": 211, "correlation": 109381 } }, { "ph": "s", "id": 109381, "pid": 76337, "tid": -914061504, "ts": 1716454222993860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993929, "dur": 1, "args": { "External id": 109392, "cbid": 251, "correlation": 109392 } }, { "ph": "f", "id": 109392, "pid": 76337, "tid": -914061504, "ts": 1716454222993929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222993933, "dur": 0, "args": { "External id": 109393, "cbid": 251, "correlation": 109393 } }, { "ph": "f", "id": 109393, "pid": 76337, "tid": -914061504, "ts": 1716454222993933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223071931, "dur": 11, "args": { "External id": 109394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109394, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 109394, "pid": 5, "tid": 7, "ts": 1716454223071931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993935, "dur": 12, "args": { "External id": 109394, "cbid": 211, "correlation": 109394 } }, { "ph": "s", "id": 109394, "pid": 76337, "tid": -914061504, "ts": 1716454222993935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223071944, "dur": 5, "args": { "External id": 109396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109396, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 109396, "pid": 5, "tid": 7, "ts": 1716454223071944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222993949, "dur": 6, "args": { "External id": 109396, "cbid": 211, "correlation": 109396 } }, { "ph": "s", "id": 109396, "pid": 76337, "tid": -914061504, "ts": 1716454222993949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994016, "dur": 1, "args": { "External id": 109407, "cbid": 251, "correlation": 109407 } }, { "ph": "f", "id": 109407, "pid": 76337, "tid": -914061504, "ts": 1716454222994016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994020, "dur": 0, "args": { "External id": 109408, "cbid": 251, "correlation": 109408 } }, { "ph": "f", "id": 109408, "pid": 76337, "tid": -914061504, "ts": 1716454222994020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223071950, "dur": 7, "args": { "External id": 109409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109409, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 109409, "pid": 5, "tid": 7, "ts": 1716454223071950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994022, "dur": 12, "args": { "External id": 109409, "cbid": 211, "correlation": 109409 } }, { "ph": "s", "id": 109409, "pid": 76337, "tid": -914061504, "ts": 1716454222994022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223071958, "dur": 4, "args": { "External id": 109411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109411, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 109411, "pid": 5, "tid": 7, "ts": 1716454223071958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994036, "dur": 6, "args": { "External id": 109411, "cbid": 211, "correlation": 109411 } }, { "ph": "s", "id": 109411, "pid": 76337, "tid": -914061504, "ts": 1716454222994036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223071963, "dur": 91, "args": { "External id": 109432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109432, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 109432, "pid": 5, "tid": 7, "ts": 1716454223071963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994109, "dur": 12, "args": { "External id": 109432, "cbid": 211, "correlation": 109432 } }, { "ph": "s", "id": 109432, "pid": 76337, "tid": -914061504, "ts": 1716454222994109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994206, "dur": 1, "args": { "External id": 109450, "cbid": 251, "correlation": 109450 } }, { "ph": "f", "id": 109450, "pid": 76337, "tid": -914061504, "ts": 1716454222994206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223072055, "dur": 97, "args": { "External id": 109452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109452, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109452, "pid": 5, "tid": 7, "ts": 1716454223072055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994211, "dur": 13, "args": { "External id": 109452, "cbid": 211, "correlation": 109452 } }, { "ph": "s", "id": 109452, "pid": 76337, "tid": -914061504, "ts": 1716454222994211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223072154, "dur": 19, "args": { "External id": 109460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109460, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109460, "pid": 5, "tid": 7, "ts": 1716454223072154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994280, "dur": 13, "args": { "External id": 109460, "cbid": 211, "correlation": 109460 } }, { "ph": "s", "id": 109460, "pid": 76337, "tid": -914061504, "ts": 1716454222994280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223072175, "dur": 37, "args": { "External id": 109468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109468, "pid": 5, "tid": 7, "ts": 1716454223072175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994321, "dur": 9, "args": { "External id": 109468, "cbid": 211, "correlation": 109468 } }, { "ph": "s", "id": 109468, "pid": 76337, "tid": -914061504, "ts": 1716454222994321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223072213, "dur": 33, "args": { "External id": 109490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109490, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109490, "pid": 5, "tid": 7, "ts": 1716454223072213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994372, "dur": 10, "args": { "External id": 109490, "cbid": 211, "correlation": 109490 } }, { "ph": "s", "id": 109490, "pid": 76337, "tid": -914061504, "ts": 1716454222994372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994460, "dur": 1, "args": { "External id": 109506, "cbid": 251, "correlation": 109506 } }, { "ph": "f", "id": 109506, "pid": 76337, "tid": -914061504, "ts": 1716454222994460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994465, "dur": 0, "args": { "External id": 109508, "cbid": 251, "correlation": 109508 } }, { "ph": "f", "id": 109508, "pid": 76337, "tid": -914061504, "ts": 1716454222994465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223072248, "dur": 532, "args": { "External id": 109509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109509, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 109509, "pid": 5, "tid": 7, "ts": 1716454223072248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994468, "dur": 13, "args": { "External id": 109509, "cbid": 211, "correlation": 109509 } }, { "ph": "s", "id": 109509, "pid": 76337, "tid": -914061504, "ts": 1716454222994468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223072781, "dur": 123, "args": { "External id": 109517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109517, "pid": 5, "tid": 7, "ts": 1716454223072781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994532, "dur": 13, "args": { "External id": 109517, "cbid": 211, "correlation": 109517 } }, { "ph": "s", "id": 109517, "pid": 76337, "tid": -914061504, "ts": 1716454222994532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223072906, "dur": 129, "args": { "External id": 109525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109525, "pid": 5, "tid": 7, "ts": 1716454223072906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994563, "dur": 8, "args": { "External id": 109525, "cbid": 211, "correlation": 109525 } }, { "ph": "s", "id": 109525, "pid": 76337, "tid": -914061504, "ts": 1716454222994563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222994640, "dur": 1, "args": { "External id": 109541, "cbid": 251, "correlation": 109541 } }, { "ph": "f", "id": 109541, "pid": 76337, "tid": -914061504, "ts": 1716454222994640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223073036, "dur": 301, "args": { "External id": 109543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109543, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109543, "pid": 5, "tid": 7, "ts": 1716454223073036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994645, "dur": 12, "args": { "External id": 109543, "cbid": 211, "correlation": 109543 } }, { "ph": "s", "id": 109543, "pid": 76337, "tid": -914061504, "ts": 1716454222994645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223073339, "dur": 27, "args": { "External id": 109551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109551, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109551, "pid": 5, "tid": 7, "ts": 1716454223073339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994688, "dur": 10, "args": { "External id": 109551, "cbid": 211, "correlation": 109551 } }, { "ph": "s", "id": 109551, "pid": 76337, "tid": -914061504, "ts": 1716454222994688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223073367, "dur": 80, "args": { "External id": 109562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109562, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109562, "pid": 5, "tid": 7, "ts": 1716454223073367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994754, "dur": 12, "args": { "External id": 109562, "cbid": 211, "correlation": 109562 } }, { "ph": "s", "id": 109562, "pid": 76337, "tid": -914061504, "ts": 1716454222994754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222994816, "dur": 0, "args": { "External id": 109574, "cbid": 317, "correlation": 109574 } }, { "ph": "f", "id": 109574, "pid": 76337, "tid": -914061504, "ts": 1716454222994816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222994817, "dur": 0, "args": { "External id": 109575, "cbid": 203, "correlation": 109575 } }, { "ph": "f", "id": 109575, "pid": 76337, "tid": -914061504, "ts": 1716454222994817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222994818, "dur": 0, "args": { "External id": 109576, "cbid": 205, "correlation": 109576 } }, { "ph": "f", "id": 109576, "pid": 76337, "tid": -914061504, "ts": 1716454222994818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223073449, "dur": 23, "args": { "External id": 109580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109580, "pid": 5, "tid": 7, "ts": 1716454223073449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994832, "dur": 13, "args": { "External id": 109580, "cbid": 211, "correlation": 109580 } }, { "ph": "s", "id": 109580, "pid": 76337, "tid": -914061504, "ts": 1716454222994832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223073473, "dur": 119, "args": { "External id": 109582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109582, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109582, "pid": 5, "tid": 7, "ts": 1716454223073473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994852, "dur": 6, "args": { "External id": 109582, "cbid": 211, "correlation": 109582 } }, { "ph": "s", "id": 109582, "pid": 76337, "tid": -914061504, "ts": 1716454222994852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223073593, "dur": 22, "args": { "External id": 109584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109584, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109584, "pid": 5, "tid": 7, "ts": 1716454223073593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994861, "dur": 5, "args": { "External id": 109584, "cbid": 211, "correlation": 109584 } }, { "ph": "s", "id": 109584, "pid": 76337, "tid": -914061504, "ts": 1716454222994861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223073616, "dur": 32, "args": { "External id": 109590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109590, "pid": 5, "tid": 7, "ts": 1716454223073616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994889, "dur": 9, "args": { "External id": 109590, "cbid": 211, "correlation": 109590 } }, { "ph": "s", "id": 109590, "pid": 76337, "tid": -914061504, "ts": 1716454222994889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223073650, "dur": 28, "args": { "External id": 109598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109598, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109598, "pid": 5, "tid": 7, "ts": 1716454223073650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994921, "dur": 8, "args": { "External id": 109598, "cbid": 211, "correlation": 109598 } }, { "ph": "s", "id": 109598, "pid": 76337, "tid": -914061504, "ts": 1716454222994921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223073678, "dur": 100, "args": { "External id": 109609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109609, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109609, "pid": 5, "tid": 7, "ts": 1716454223073678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222994991, "dur": 12, "args": { "External id": 109609, "cbid": 211, "correlation": 109609 } }, { "ph": "s", "id": 109609, "pid": 76337, "tid": -914061504, "ts": 1716454222994991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222995047, "dur": 0, "args": { "External id": 109619, "cbid": 317, "correlation": 109619 } }, { "ph": "f", "id": 109619, "pid": 76337, "tid": -914061504, "ts": 1716454222995047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222995048, "dur": 0, "args": { "External id": 109620, "cbid": 203, "correlation": 109620 } }, { "ph": "f", "id": 109620, "pid": 76337, "tid": -914061504, "ts": 1716454222995048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222995049, "dur": 0, "args": { "External id": 109621, "cbid": 205, "correlation": 109621 } }, { "ph": "f", "id": 109621, "pid": 76337, "tid": -914061504, "ts": 1716454222995049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223073780, "dur": 75, "args": { "External id": 109625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109625, "pid": 5, "tid": 7, "ts": 1716454223073780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995062, "dur": 11, "args": { "External id": 109625, "cbid": 211, "correlation": 109625 } }, { "ph": "s", "id": 109625, "pid": 76337, "tid": -914061504, "ts": 1716454222995062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223073855, "dur": 43, "args": { "External id": 109627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109627, "pid": 5, "tid": 7, "ts": 1716454223073855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995076, "dur": 6, "args": { "External id": 109627, "cbid": 211, "correlation": 109627 } }, { "ph": "s", "id": 109627, "pid": 76337, "tid": -914061504, "ts": 1716454222995076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223073900, "dur": 4, "args": { "External id": 109629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109629, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109629, "pid": 5, "tid": 7, "ts": 1716454223073900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995087, "dur": 6, "args": { "External id": 109629, "cbid": 211, "correlation": 109629 } }, { "ph": "s", "id": 109629, "pid": 76337, "tid": -914061504, "ts": 1716454222995087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222995095, "dur": 0, "args": { "External id": 109630, "cbid": 51, "correlation": 109630 } }, { "ph": "s", "id": 109630, "pid": 76337, "tid": -914061504, "ts": 1716454222995095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223073905, "dur": 2228, "args": { "External id": 109631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109631, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109631, "pid": 5, "tid": 7, "ts": 1716454223073905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995096, "dur": 5, "args": { "External id": 109631, "cbid": 211, "correlation": 109631 } }, { "ph": "s", "id": 109631, "pid": 76337, "tid": -914061504, "ts": 1716454222995096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223076134, "dur": 112, "args": { "External id": 109636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109636, "pid": 5, "tid": 7, "ts": 1716454223076134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995125, "dur": 9, "args": { "External id": 109636, "cbid": 211, "correlation": 109636 } }, { "ph": "s", "id": 109636, "pid": 76337, "tid": -914061504, "ts": 1716454222995125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223076246, "dur": 162, "args": { "External id": 109645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109645, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109645, "pid": 5, "tid": 7, "ts": 1716454223076246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995217, "dur": 13, "args": { "External id": 109645, "cbid": 211, "correlation": 109645 } }, { "ph": "s", "id": 109645, "pid": 76337, "tid": -914061504, "ts": 1716454222995217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223076409, "dur": 127, "args": { "External id": 109665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109665, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 109665, "pid": 5, "tid": 7, "ts": 1716454223076409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995288, "dur": 11, "args": { "External id": 109665, "cbid": 211, "correlation": 109665 } }, { "ph": "s", "id": 109665, "pid": 76337, "tid": -914061504, "ts": 1716454222995288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223076537, "dur": 5, "args": { "External id": 109677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109677, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 109677, "pid": 5, "tid": 7, "ts": 1716454223076537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995309, "dur": 6, "args": { "External id": 109677, "cbid": 211, "correlation": 109677 } }, { "ph": "s", "id": 109677, "pid": 76337, "tid": -914061504, "ts": 1716454222995309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223076543, "dur": 159, "args": { "External id": 109680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109680, "pid": 5, "tid": 7, "ts": 1716454223076543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995327, "dur": 7, "args": { "External id": 109680, "cbid": 211, "correlation": 109680 } }, { "ph": "s", "id": 109680, "pid": 76337, "tid": -914061504, "ts": 1716454222995327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223076703, "dur": 101, "args": { "External id": 109689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109689, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109689, "pid": 5, "tid": 7, "ts": 1716454223076703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995367, "dur": 9, "args": { "External id": 109689, "cbid": 211, "correlation": 109689 } }, { "ph": "s", "id": 109689, "pid": 76337, "tid": -914061504, "ts": 1716454222995367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222995420, "dur": 0, "args": { "External id": 109699, "cbid": 317, "correlation": 109699 } }, { "ph": "f", "id": 109699, "pid": 76337, "tid": -914061504, "ts": 1716454222995420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222995421, "dur": 0, "args": { "External id": 109700, "cbid": 203, "correlation": 109700 } }, { "ph": "f", "id": 109700, "pid": 76337, "tid": -914061504, "ts": 1716454222995421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222995422, "dur": 0, "args": { "External id": 109701, "cbid": 205, "correlation": 109701 } }, { "ph": "f", "id": 109701, "pid": 76337, "tid": -914061504, "ts": 1716454222995422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223076806, "dur": 111, "args": { "External id": 109705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109705, "pid": 5, "tid": 7, "ts": 1716454223076806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995439, "dur": 11, "args": { "External id": 109705, "cbid": 211, "correlation": 109705 } }, { "ph": "s", "id": 109705, "pid": 76337, "tid": -914061504, "ts": 1716454222995439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223076918, "dur": 34, "args": { "External id": 109707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109707, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109707, "pid": 5, "tid": 7, "ts": 1716454223076918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995452, "dur": 5, "args": { "External id": 109707, "cbid": 211, "correlation": 109707 } }, { "ph": "s", "id": 109707, "pid": 76337, "tid": -914061504, "ts": 1716454222995452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223076953, "dur": 4, "args": { "External id": 109709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109709, "pid": 5, "tid": 7, "ts": 1716454223076953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995462, "dur": 5, "args": { "External id": 109709, "cbid": 211, "correlation": 109709 } }, { "ph": "s", "id": 109709, "pid": 76337, "tid": -914061504, "ts": 1716454222995462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222995471, "dur": 0, "args": { "External id": 109710, "cbid": 51, "correlation": 109710 } }, { "ph": "s", "id": 109710, "pid": 76337, "tid": -914061504, "ts": 1716454222995471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223076958, "dur": 1985, "args": { "External id": 109711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109711, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109711, "pid": 5, "tid": 7, "ts": 1716454223076958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995472, "dur": 5, "args": { "External id": 109711, "cbid": 211, "correlation": 109711 } }, { "ph": "s", "id": 109711, "pid": 76337, "tid": -914061504, "ts": 1716454222995472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223078944, "dur": 59, "args": { "External id": 109716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109716, "pid": 5, "tid": 7, "ts": 1716454223078944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995501, "dur": 8, "args": { "External id": 109716, "cbid": 211, "correlation": 109716 } }, { "ph": "s", "id": 109716, "pid": 76337, "tid": -914061504, "ts": 1716454222995501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223079004, "dur": 3, "args": { "External id": 109724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109724, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109724, "pid": 5, "tid": 7, "ts": 1716454223079004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995544, "dur": 9, "args": { "External id": 109724, "cbid": 211, "correlation": 109724 } }, { "ph": "s", "id": 109724, "pid": 76337, "tid": -914061504, "ts": 1716454222995544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222995609, "dur": 1, "args": { "External id": 109740, "cbid": 251, "correlation": 109740 } }, { "ph": "f", "id": 109740, "pid": 76337, "tid": -914061504, "ts": 1716454222995609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222995614, "dur": 0, "args": { "External id": 109742, "cbid": 251, "correlation": 109742 } }, { "ph": "f", "id": 109742, "pid": 76337, "tid": -914061504, "ts": 1716454222995614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223079009, "dur": 11, "args": { "External id": 109743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109743, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 109743, "pid": 5, "tid": 7, "ts": 1716454223079009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995616, "dur": 11, "args": { "External id": 109743, "cbid": 211, "correlation": 109743 } }, { "ph": "s", "id": 109743, "pid": 76337, "tid": -914061504, "ts": 1716454222995616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223079021, "dur": 5, "args": { "External id": 109745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109745, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 109745, "pid": 5, "tid": 7, "ts": 1716454223079021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995629, "dur": 6, "args": { "External id": 109745, "cbid": 211, "correlation": 109745 } }, { "ph": "s", "id": 109745, "pid": 76337, "tid": -914061504, "ts": 1716454222995629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223079027, "dur": 54, "args": { "External id": 109755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109755, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109755, "pid": 5, "tid": 7, "ts": 1716454223079027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995688, "dur": 11, "args": { "External id": 109755, "cbid": 211, "correlation": 109755 } }, { "ph": "s", "id": 109755, "pid": 76337, "tid": -914061504, "ts": 1716454222995688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223079083, "dur": 50, "args": { "External id": 109775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109775, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 109775, "pid": 5, "tid": 7, "ts": 1716454223079083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995753, "dur": 11, "args": { "External id": 109775, "cbid": 211, "correlation": 109775 } }, { "ph": "s", "id": 109775, "pid": 76337, "tid": -914061504, "ts": 1716454222995753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223079135, "dur": 4, "args": { "External id": 109787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109787, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109787, "pid": 5, "tid": 7, "ts": 1716454223079135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995774, "dur": 6, "args": { "External id": 109787, "cbid": 211, "correlation": 109787 } }, { "ph": "s", "id": 109787, "pid": 76337, "tid": -914061504, "ts": 1716454222995774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223079140, "dur": 54, "args": { "External id": 109790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109790, "pid": 5, "tid": 7, "ts": 1716454223079140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995792, "dur": 6, "args": { "External id": 109790, "cbid": 211, "correlation": 109790 } }, { "ph": "s", "id": 109790, "pid": 76337, "tid": -914061504, "ts": 1716454222995792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223079195, "dur": 37, "args": { "External id": 109799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109799, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109799, "pid": 5, "tid": 7, "ts": 1716454223079195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995832, "dur": 11, "args": { "External id": 109799, "cbid": 211, "correlation": 109799 } }, { "ph": "s", "id": 109799, "pid": 76337, "tid": -914061504, "ts": 1716454222995832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222995896, "dur": 0, "args": { "External id": 109809, "cbid": 317, "correlation": 109809 } }, { "ph": "f", "id": 109809, "pid": 76337, "tid": -914061504, "ts": 1716454222995896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222995897, "dur": 0, "args": { "External id": 109810, "cbid": 203, "correlation": 109810 } }, { "ph": "f", "id": 109810, "pid": 76337, "tid": -914061504, "ts": 1716454222995897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222995898, "dur": 0, "args": { "External id": 109811, "cbid": 205, "correlation": 109811 } }, { "ph": "f", "id": 109811, "pid": 76337, "tid": -914061504, "ts": 1716454222995898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223079233, "dur": 40, "args": { "External id": 109815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109815, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109815, "pid": 5, "tid": 7, "ts": 1716454223079233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995915, "dur": 12, "args": { "External id": 109815, "cbid": 211, "correlation": 109815 } }, { "ph": "s", "id": 109815, "pid": 76337, "tid": -914061504, "ts": 1716454222995915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223079274, "dur": 14, "args": { "External id": 109817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109817, "pid": 5, "tid": 7, "ts": 1716454223079274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995930, "dur": 5, "args": { "External id": 109817, "cbid": 211, "correlation": 109817 } }, { "ph": "s", "id": 109817, "pid": 76337, "tid": -914061504, "ts": 1716454222995930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223079290, "dur": 3, "args": { "External id": 109819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109819, "pid": 5, "tid": 7, "ts": 1716454223079290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995940, "dur": 6, "args": { "External id": 109819, "cbid": 211, "correlation": 109819 } }, { "ph": "s", "id": 109819, "pid": 76337, "tid": -914061504, "ts": 1716454222995940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222995948, "dur": 0, "args": { "External id": 109820, "cbid": 51, "correlation": 109820 } }, { "ph": "s", "id": 109820, "pid": 76337, "tid": -914061504, "ts": 1716454222995948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223079294, "dur": 688, "args": { "External id": 109821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109821, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109821, "pid": 5, "tid": 7, "ts": 1716454223079294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995949, "dur": 5, "args": { "External id": 109821, "cbid": 211, "correlation": 109821 } }, { "ph": "s", "id": 109821, "pid": 76337, "tid": -914061504, "ts": 1716454222995949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223079984, "dur": 59, "args": { "External id": 109826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109826, "pid": 5, "tid": 7, "ts": 1716454223079984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222995985, "dur": 10, "args": { "External id": 109826, "cbid": 211, "correlation": 109826 } }, { "ph": "s", "id": 109826, "pid": 76337, "tid": -914061504, "ts": 1716454222995985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222996044, "dur": 0, "args": { "External id": 109836, "cbid": 317, "correlation": 109836 } }, { "ph": "f", "id": 109836, "pid": 76337, "tid": -914061504, "ts": 1716454222996044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222996045, "dur": 0, "args": { "External id": 109837, "cbid": 203, "correlation": 109837 } }, { "ph": "f", "id": 109837, "pid": 76337, "tid": -914061504, "ts": 1716454222996045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222996046, "dur": 0, "args": { "External id": 109838, "cbid": 205, "correlation": 109838 } }, { "ph": "f", "id": 109838, "pid": 76337, "tid": -914061504, "ts": 1716454222996046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223080044, "dur": 3, "args": { "External id": 109842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109842, "pid": 5, "tid": 7, "ts": 1716454223080044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996062, "dur": 11, "args": { "External id": 109842, "cbid": 211, "correlation": 109842 } }, { "ph": "s", "id": 109842, "pid": 76337, "tid": -914061504, "ts": 1716454222996062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222996077, "dur": 0, "args": { "External id": 109843, "cbid": 51, "correlation": 109843 } }, { "ph": "s", "id": 109843, "pid": 76337, "tid": -914061504, "ts": 1716454222996077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454223080049, "dur": 262, "args": { "External id": 109844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109844, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109844, "pid": 5, "tid": 7, "ts": 1716454223080049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996079, "dur": 8, "args": { "External id": 109844, "cbid": 211, "correlation": 109844 } }, { "ph": "s", "id": 109844, "pid": 76337, "tid": -914061504, "ts": 1716454222996079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223080312, "dur": 59, "args": { "External id": 109849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109849, "pid": 5, "tid": 7, "ts": 1716454223080312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996108, "dur": 8, "args": { "External id": 109849, "cbid": 211, "correlation": 109849 } }, { "ph": "s", "id": 109849, "pid": 76337, "tid": -914061504, "ts": 1716454222996108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223080372, "dur": 50, "args": { "External id": 109857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109857, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109857, "pid": 5, "tid": 7, "ts": 1716454223080372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996137, "dur": 8, "args": { "External id": 109857, "cbid": 211, "correlation": 109857 } }, { "ph": "s", "id": 109857, "pid": 76337, "tid": -914061504, "ts": 1716454222996137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223080423, "dur": 35, "args": { "External id": 109865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109865, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109865, "pid": 5, "tid": 7, "ts": 1716454223080423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996165, "dur": 8, "args": { "External id": 109865, "cbid": 211, "correlation": 109865 } }, { "ph": "s", "id": 109865, "pid": 76337, "tid": -914061504, "ts": 1716454222996165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223080460, "dur": 50, "args": { "External id": 109885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109885, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 109885, "pid": 5, "tid": 7, "ts": 1716454223080460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996248, "dur": 12, "args": { "External id": 109885, "cbid": 211, "correlation": 109885 } }, { "ph": "s", "id": 109885, "pid": 76337, "tid": -914061504, "ts": 1716454222996248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223080511, "dur": 4, "args": { "External id": 109897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109897, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 109897, "pid": 5, "tid": 7, "ts": 1716454223080511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996270, "dur": 6, "args": { "External id": 109897, "cbid": 211, "correlation": 109897 } }, { "ph": "s", "id": 109897, "pid": 76337, "tid": -914061504, "ts": 1716454222996270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223080516, "dur": 55, "args": { "External id": 109900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109900, "pid": 5, "tid": 7, "ts": 1716454223080516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996288, "dur": 6, "args": { "External id": 109900, "cbid": 211, "correlation": 109900 } }, { "ph": "s", "id": 109900, "pid": 76337, "tid": -914061504, "ts": 1716454222996288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222996345, "dur": 0, "args": { "External id": 109911, "cbid": 317, "correlation": 109911 } }, { "ph": "f", "id": 109911, "pid": 76337, "tid": -914061504, "ts": 1716454222996345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222996345, "dur": 0, "args": { "External id": 109912, "cbid": 203, "correlation": 109912 } }, { "ph": "f", "id": 109912, "pid": 76337, "tid": -914061504, "ts": 1716454222996345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222996346, "dur": 0, "args": { "External id": 109913, "cbid": 205, "correlation": 109913 } }, { "ph": "f", "id": 109913, "pid": 76337, "tid": -914061504, "ts": 1716454222996346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996375, "dur": 2, "args": { "External id": 109917, "cbid": 251, "correlation": 109917 } }, { "ph": "f", "id": 109917, "pid": 76337, "tid": -914061504, "ts": 1716454222996375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996378, "dur": 0, "args": { "External id": 109918, "cbid": 251, "correlation": 109918 } }, { "ph": "f", "id": 109918, "pid": 76337, "tid": -914061504, "ts": 1716454222996378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996380, "dur": 0, "args": { "External id": 109919, "cbid": 251, "correlation": 109919 } }, { "ph": "f", "id": 109919, "pid": 76337, "tid": -914061504, "ts": 1716454222996380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996381, "dur": 1, "args": { "External id": 109920, "cbid": 251, "correlation": 109920 } }, { "ph": "f", "id": 109920, "pid": 76337, "tid": -914061504, "ts": 1716454222996381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996383, "dur": 1, "args": { "External id": 109921, "cbid": 251, "correlation": 109921 } }, { "ph": "f", "id": 109921, "pid": 76337, "tid": -914061504, "ts": 1716454222996383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996385, "dur": 1, "args": { "External id": 109922, "cbid": 251, "correlation": 109922 } }, { "ph": "f", "id": 109922, "pid": 76337, "tid": -914061504, "ts": 1716454222996385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996386, "dur": 1, "args": { "External id": 109923, "cbid": 251, "correlation": 109923 } }, { "ph": "f", "id": 109923, "pid": 76337, "tid": -914061504, "ts": 1716454222996386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996389, "dur": 1, "args": { "External id": 109924, "cbid": 251, "correlation": 109924 } }, { "ph": "f", "id": 109924, "pid": 76337, "tid": -914061504, "ts": 1716454222996389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996391, "dur": 0, "args": { "External id": 109925, "cbid": 251, "correlation": 109925 } }, { "ph": "f", "id": 109925, "pid": 76337, "tid": -914061504, "ts": 1716454222996391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223080572, "dur": 112, "args": { "External id": 109926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109926, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 109926, "pid": 5, "tid": 7, "ts": 1716454223080572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996394, "dur": 13, "args": { "External id": 109926, "cbid": 211, "correlation": 109926 } }, { "ph": "s", "id": 109926, "pid": 76337, "tid": -914061504, "ts": 1716454222996394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223080685, "dur": 59, "args": { "External id": 109932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109932, "pid": 5, "tid": 7, "ts": 1716454223080685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996429, "dur": 9, "args": { "External id": 109932, "cbid": 211, "correlation": 109932 } }, { "ph": "s", "id": 109932, "pid": 76337, "tid": -914061504, "ts": 1716454222996429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223080745, "dur": 494, "args": { "External id": 109941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109941, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109941, "pid": 5, "tid": 7, "ts": 1716454223080745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996512, "dur": 14, "args": { "External id": 109941, "cbid": 211, "correlation": 109941 } }, { "ph": "s", "id": 109941, "pid": 76337, "tid": -914061504, "ts": 1716454222996512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223081240, "dur": 178, "args": { "External id": 109963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109963, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 109963, "pid": 5, "tid": 7, "ts": 1716454223081240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996570, "dur": 11, "args": { "External id": 109963, "cbid": 211, "correlation": 109963 } }, { "ph": "s", "id": 109963, "pid": 76337, "tid": -914061504, "ts": 1716454222996570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996657, "dur": 1, "args": { "External id": 109974, "cbid": 251, "correlation": 109974 } }, { "ph": "f", "id": 109974, "pid": 76337, "tid": -914061504, "ts": 1716454222996657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223081419, "dur": 192, "args": { "External id": 109975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109975, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109975, "pid": 5, "tid": 7, "ts": 1716454223081419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996662, "dur": 13, "args": { "External id": 109975, "cbid": 211, "correlation": 109975 } }, { "ph": "s", "id": 109975, "pid": 76337, "tid": -914061504, "ts": 1716454222996662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996732, "dur": 1, "args": { "External id": 109986, "cbid": 251, "correlation": 109986 } }, { "ph": "f", "id": 109986, "pid": 76337, "tid": -914061504, "ts": 1716454222996732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223081612, "dur": 183, "args": { "External id": 109987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109987, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109987, "pid": 5, "tid": 7, "ts": 1716454223081612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996736, "dur": 11, "args": { "External id": 109987, "cbid": 211, "correlation": 109987 } }, { "ph": "s", "id": 109987, "pid": 76337, "tid": -914061504, "ts": 1716454222996736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996800, "dur": 1, "args": { "External id": 109998, "cbid": 251, "correlation": 109998 } }, { "ph": "f", "id": 109998, "pid": 76337, "tid": -914061504, "ts": 1716454222996800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223081797, "dur": 181, "args": { "External id": 109999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 109999, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 109999, "pid": 5, "tid": 7, "ts": 1716454223081797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996804, "dur": 11, "args": { "External id": 109999, "cbid": 211, "correlation": 109999 } }, { "ph": "s", "id": 109999, "pid": 76337, "tid": -914061504, "ts": 1716454222996804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223081979, "dur": 18291, "args": { "External id": 110020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110020, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110020, "pid": 5, "tid": 7, "ts": 1716454223081979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996888, "dur": 13, "args": { "External id": 110020, "cbid": 211, "correlation": 110020 } }, { "ph": "s", "id": 110020, "pid": 76337, "tid": -914061504, "ts": 1716454222996888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222996993, "dur": 1, "args": { "External id": 110038, "cbid": 251, "correlation": 110038 } }, { "ph": "f", "id": 110038, "pid": 76337, "tid": -914061504, "ts": 1716454222996993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223100271, "dur": 203, "args": { "External id": 110040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110040, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110040, "pid": 5, "tid": 7, "ts": 1716454223100271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222996999, "dur": 14, "args": { "External id": 110040, "cbid": 211, "correlation": 110040 } }, { "ph": "s", "id": 110040, "pid": 76337, "tid": -914061504, "ts": 1716454222996999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223100476, "dur": 66, "args": { "External id": 110048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110048, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110048, "pid": 5, "tid": 7, "ts": 1716454223100476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997070, "dur": 12, "args": { "External id": 110048, "cbid": 211, "correlation": 110048 } }, { "ph": "s", "id": 110048, "pid": 76337, "tid": -914061504, "ts": 1716454222997070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223100544, "dur": 96, "args": { "External id": 110056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110056, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110056, "pid": 5, "tid": 7, "ts": 1716454223100544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997109, "dur": 9, "args": { "External id": 110056, "cbid": 211, "correlation": 110056 } }, { "ph": "s", "id": 110056, "pid": 76337, "tid": -914061504, "ts": 1716454222997109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223100641, "dur": 55, "args": { "External id": 110067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110067, "pid": 5, "tid": 7, "ts": 1716454223100641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997181, "dur": 12, "args": { "External id": 110067, "cbid": 211, "correlation": 110067 } }, { "ph": "s", "id": 110067, "pid": 76337, "tid": -914061504, "ts": 1716454222997181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223100697, "dur": 92, "args": { "External id": 110089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110089, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110089, "pid": 5, "tid": 7, "ts": 1716454223100697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997212, "dur": 7, "args": { "External id": 110089, "cbid": 211, "correlation": 110089 } }, { "ph": "s", "id": 110089, "pid": 76337, "tid": -914061504, "ts": 1716454222997212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997296, "dur": 1, "args": { "External id": 110100, "cbid": 251, "correlation": 110100 } }, { "ph": "f", "id": 110100, "pid": 76337, "tid": -914061504, "ts": 1716454222997296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223100790, "dur": 103, "args": { "External id": 110101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110101, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110101, "pid": 5, "tid": 7, "ts": 1716454223100790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997301, "dur": 12, "args": { "External id": 110101, "cbid": 211, "correlation": 110101 } }, { "ph": "s", "id": 110101, "pid": 76337, "tid": -914061504, "ts": 1716454222997301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997376, "dur": 1, "args": { "External id": 110112, "cbid": 251, "correlation": 110112 } }, { "ph": "f", "id": 110112, "pid": 76337, "tid": -914061504, "ts": 1716454222997376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997380, "dur": 0, "args": { "External id": 110113, "cbid": 251, "correlation": 110113 } }, { "ph": "f", "id": 110113, "pid": 76337, "tid": -914061504, "ts": 1716454222997380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223100895, "dur": 10, "args": { "External id": 110114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110114, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 110114, "pid": 5, "tid": 7, "ts": 1716454223100895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997382, "dur": 13, "args": { "External id": 110114, "cbid": 211, "correlation": 110114 } }, { "ph": "s", "id": 110114, "pid": 76337, "tid": -914061504, "ts": 1716454222997382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223100906, "dur": 5, "args": { "External id": 110116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110116, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110116, "pid": 5, "tid": 7, "ts": 1716454223100906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997399, "dur": 6, "args": { "External id": 110116, "cbid": 211, "correlation": 110116 } }, { "ph": "s", "id": 110116, "pid": 76337, "tid": -914061504, "ts": 1716454222997399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997459, "dur": 1, "args": { "External id": 110127, "cbid": 251, "correlation": 110127 } }, { "ph": "f", "id": 110127, "pid": 76337, "tid": -914061504, "ts": 1716454222997459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997463, "dur": 0, "args": { "External id": 110128, "cbid": 251, "correlation": 110128 } }, { "ph": "f", "id": 110128, "pid": 76337, "tid": -914061504, "ts": 1716454222997463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223100912, "dur": 6, "args": { "External id": 110129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110129, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 110129, "pid": 5, "tid": 7, "ts": 1716454223100912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997464, "dur": 12, "args": { "External id": 110129, "cbid": 211, "correlation": 110129 } }, { "ph": "s", "id": 110129, "pid": 76337, "tid": -914061504, "ts": 1716454222997464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223100919, "dur": 3, "args": { "External id": 110131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110131, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110131, "pid": 5, "tid": 7, "ts": 1716454223100919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997478, "dur": 5, "args": { "External id": 110131, "cbid": 211, "correlation": 110131 } }, { "ph": "s", "id": 110131, "pid": 76337, "tid": -914061504, "ts": 1716454222997478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223100924, "dur": 156, "args": { "External id": 110152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110152, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110152, "pid": 5, "tid": 7, "ts": 1716454223100924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997551, "dur": 12, "args": { "External id": 110152, "cbid": 211, "correlation": 110152 } }, { "ph": "s", "id": 110152, "pid": 76337, "tid": -914061504, "ts": 1716454222997551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997648, "dur": 1, "args": { "External id": 110170, "cbid": 251, "correlation": 110170 } }, { "ph": "f", "id": 110170, "pid": 76337, "tid": -914061504, "ts": 1716454222997648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223101082, "dur": 106, "args": { "External id": 110172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110172, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110172, "pid": 5, "tid": 7, "ts": 1716454223101082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997654, "dur": 14, "args": { "External id": 110172, "cbid": 211, "correlation": 110172 } }, { "ph": "s", "id": 110172, "pid": 76337, "tid": -914061504, "ts": 1716454222997654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223101189, "dur": 35, "args": { "External id": 110180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110180, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110180, "pid": 5, "tid": 7, "ts": 1716454223101189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997726, "dur": 12, "args": { "External id": 110180, "cbid": 211, "correlation": 110180 } }, { "ph": "s", "id": 110180, "pid": 76337, "tid": -914061504, "ts": 1716454222997726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223101225, "dur": 67, "args": { "External id": 110188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110188, "pid": 5, "tid": 7, "ts": 1716454223101225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997766, "dur": 10, "args": { "External id": 110188, "cbid": 211, "correlation": 110188 } }, { "ph": "s", "id": 110188, "pid": 76337, "tid": -914061504, "ts": 1716454222997766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223101293, "dur": 92, "args": { "External id": 110210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110210, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110210, "pid": 5, "tid": 7, "ts": 1716454223101293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997818, "dur": 11, "args": { "External id": 110210, "cbid": 211, "correlation": 110210 } }, { "ph": "s", "id": 110210, "pid": 76337, "tid": -914061504, "ts": 1716454222997818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222997905, "dur": 1, "args": { "External id": 110226, "cbid": 251, "correlation": 110226 } }, { "ph": "f", "id": 110226, "pid": 76337, "tid": -914061504, "ts": 1716454222997905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223101386, "dur": 572, "args": { "External id": 110228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110228, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110228, "pid": 5, "tid": 7, "ts": 1716454223101386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997911, "dur": 12, "args": { "External id": 110228, "cbid": 211, "correlation": 110228 } }, { "ph": "s", "id": 110228, "pid": 76337, "tid": -914061504, "ts": 1716454222997911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223101960, "dur": 244, "args": { "External id": 110236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110236, "pid": 5, "tid": 7, "ts": 1716454223101960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222997984, "dur": 13, "args": { "External id": 110236, "cbid": 211, "correlation": 110236 } }, { "ph": "s", "id": 110236, "pid": 76337, "tid": -914061504, "ts": 1716454222997984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223102205, "dur": 251, "args": { "External id": 110244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110244, "pid": 5, "tid": 7, "ts": 1716454223102205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998017, "dur": 9, "args": { "External id": 110244, "cbid": 211, "correlation": 110244 } }, { "ph": "s", "id": 110244, "pid": 76337, "tid": -914061504, "ts": 1716454222998017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998101, "dur": 1, "args": { "External id": 110260, "cbid": 251, "correlation": 110260 } }, { "ph": "f", "id": 110260, "pid": 76337, "tid": -914061504, "ts": 1716454222998101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998106, "dur": 0, "args": { "External id": 110262, "cbid": 251, "correlation": 110262 } }, { "ph": "f", "id": 110262, "pid": 76337, "tid": -914061504, "ts": 1716454222998106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223102457, "dur": 355, "args": { "External id": 110263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110263, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 110263, "pid": 5, "tid": 7, "ts": 1716454223102457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998108, "dur": 12, "args": { "External id": 110263, "cbid": 211, "correlation": 110263 } }, { "ph": "s", "id": 110263, "pid": 76337, "tid": -914061504, "ts": 1716454222998108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223102813, "dur": 50, "args": { "External id": 110271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110271, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110271, "pid": 5, "tid": 7, "ts": 1716454223102813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998151, "dur": 9, "args": { "External id": 110271, "cbid": 211, "correlation": 110271 } }, { "ph": "s", "id": 110271, "pid": 76337, "tid": -914061504, "ts": 1716454222998151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223102865, "dur": 158, "args": { "External id": 110282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110282, "pid": 5, "tid": 7, "ts": 1716454223102865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998218, "dur": 13, "args": { "External id": 110282, "cbid": 211, "correlation": 110282 } }, { "ph": "s", "id": 110282, "pid": 76337, "tid": -914061504, "ts": 1716454222998218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222998283, "dur": 0, "args": { "External id": 110294, "cbid": 317, "correlation": 110294 } }, { "ph": "f", "id": 110294, "pid": 76337, "tid": -914061504, "ts": 1716454222998283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222998284, "dur": 0, "args": { "External id": 110295, "cbid": 203, "correlation": 110295 } }, { "ph": "f", "id": 110295, "pid": 76337, "tid": -914061504, "ts": 1716454222998284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222998285, "dur": 0, "args": { "External id": 110296, "cbid": 205, "correlation": 110296 } }, { "ph": "f", "id": 110296, "pid": 76337, "tid": -914061504, "ts": 1716454222998285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998308, "dur": 1, "args": { "External id": 110300, "cbid": 251, "correlation": 110300 } }, { "ph": "f", "id": 110300, "pid": 76337, "tid": -914061504, "ts": 1716454222998308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998310, "dur": 0, "args": { "External id": 110301, "cbid": 251, "correlation": 110301 } }, { "ph": "f", "id": 110301, "pid": 76337, "tid": -914061504, "ts": 1716454222998310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998311, "dur": 0, "args": { "External id": 110302, "cbid": 251, "correlation": 110302 } }, { "ph": "f", "id": 110302, "pid": 76337, "tid": -914061504, "ts": 1716454222998311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998312, "dur": 0, "args": { "External id": 110303, "cbid": 251, "correlation": 110303 } }, { "ph": "f", "id": 110303, "pid": 76337, "tid": -914061504, "ts": 1716454222998312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998312, "dur": 0, "args": { "External id": 110304, "cbid": 251, "correlation": 110304 } }, { "ph": "f", "id": 110304, "pid": 76337, "tid": -914061504, "ts": 1716454222998312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998313, "dur": 0, "args": { "External id": 110305, "cbid": 251, "correlation": 110305 } }, { "ph": "f", "id": 110305, "pid": 76337, "tid": -914061504, "ts": 1716454222998313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998314, "dur": 0, "args": { "External id": 110306, "cbid": 251, "correlation": 110306 } }, { "ph": "f", "id": 110306, "pid": 76337, "tid": -914061504, "ts": 1716454222998314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998315, "dur": 0, "args": { "External id": 110307, "cbid": 251, "correlation": 110307 } }, { "ph": "f", "id": 110307, "pid": 76337, "tid": -914061504, "ts": 1716454222998315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998316, "dur": 0, "args": { "External id": 110308, "cbid": 251, "correlation": 110308 } }, { "ph": "f", "id": 110308, "pid": 76337, "tid": -914061504, "ts": 1716454222998316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223103024, "dur": 115, "args": { "External id": 110309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110309, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110309, "pid": 5, "tid": 7, "ts": 1716454223103024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998318, "dur": 13, "args": { "External id": 110309, "cbid": 211, "correlation": 110309 } }, { "ph": "s", "id": 110309, "pid": 76337, "tid": -914061504, "ts": 1716454222998318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223103140, "dur": 59, "args": { "External id": 110315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110315, "pid": 5, "tid": 7, "ts": 1716454223103140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998353, "dur": 9, "args": { "External id": 110315, "cbid": 211, "correlation": 110315 } }, { "ph": "s", "id": 110315, "pid": 76337, "tid": -914061504, "ts": 1716454222998353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223103201, "dur": 49, "args": { "External id": 110323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110323, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110323, "pid": 5, "tid": 7, "ts": 1716454223103201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998386, "dur": 8, "args": { "External id": 110323, "cbid": 211, "correlation": 110323 } }, { "ph": "s", "id": 110323, "pid": 76337, "tid": -914061504, "ts": 1716454222998386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223103252, "dur": 99, "args": { "External id": 110332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110332, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110332, "pid": 5, "tid": 7, "ts": 1716454223103252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998425, "dur": 10, "args": { "External id": 110332, "cbid": 211, "correlation": 110332 } }, { "ph": "s", "id": 110332, "pid": 76337, "tid": -914061504, "ts": 1716454222998425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223103352, "dur": 91, "args": { "External id": 110352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110352, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 110352, "pid": 5, "tid": 7, "ts": 1716454223103352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998496, "dur": 11, "args": { "External id": 110352, "cbid": 211, "correlation": 110352 } }, { "ph": "s", "id": 110352, "pid": 76337, "tid": -914061504, "ts": 1716454222998496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223103444, "dur": 5, "args": { "External id": 110364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110364, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110364, "pid": 5, "tid": 7, "ts": 1716454223103444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998517, "dur": 7, "args": { "External id": 110364, "cbid": 211, "correlation": 110364 } }, { "ph": "s", "id": 110364, "pid": 76337, "tid": -914061504, "ts": 1716454222998517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223103451, "dur": 109, "args": { "External id": 110367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110367, "pid": 5, "tid": 7, "ts": 1716454223103451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998536, "dur": 7, "args": { "External id": 110367, "cbid": 211, "correlation": 110367 } }, { "ph": "s", "id": 110367, "pid": 76337, "tid": -914061504, "ts": 1716454222998536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223103561, "dur": 70, "args": { "External id": 110376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110376, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110376, "pid": 5, "tid": 7, "ts": 1716454223103561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998575, "dur": 10, "args": { "External id": 110376, "cbid": 211, "correlation": 110376 } }, { "ph": "s", "id": 110376, "pid": 76337, "tid": -914061504, "ts": 1716454222998575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222998627, "dur": 0, "args": { "External id": 110386, "cbid": 317, "correlation": 110386 } }, { "ph": "f", "id": 110386, "pid": 76337, "tid": -914061504, "ts": 1716454222998627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222998628, "dur": 0, "args": { "External id": 110387, "cbid": 203, "correlation": 110387 } }, { "ph": "f", "id": 110387, "pid": 76337, "tid": -914061504, "ts": 1716454222998628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222998628, "dur": 0, "args": { "External id": 110388, "cbid": 205, "correlation": 110388 } }, { "ph": "f", "id": 110388, "pid": 76337, "tid": -914061504, "ts": 1716454222998628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223103633, "dur": 75, "args": { "External id": 110392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110392, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110392, "pid": 5, "tid": 7, "ts": 1716454223103633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998644, "dur": 12, "args": { "External id": 110392, "cbid": 211, "correlation": 110392 } }, { "ph": "s", "id": 110392, "pid": 76337, "tid": -914061504, "ts": 1716454222998644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223103709, "dur": 24, "args": { "External id": 110394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110394, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110394, "pid": 5, "tid": 7, "ts": 1716454223103709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998658, "dur": 5, "args": { "External id": 110394, "cbid": 211, "correlation": 110394 } }, { "ph": "s", "id": 110394, "pid": 76337, "tid": -914061504, "ts": 1716454222998658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223103735, "dur": 4, "args": { "External id": 110396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 110396, "pid": 5, "tid": 7, "ts": 1716454223103735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998668, "dur": 7, "args": { "External id": 110396, "cbid": 211, "correlation": 110396 } }, { "ph": "s", "id": 110396, "pid": 76337, "tid": -914061504, "ts": 1716454222998668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222998678, "dur": 0, "args": { "External id": 110397, "cbid": 51, "correlation": 110397 } }, { "ph": "s", "id": 110397, "pid": 76337, "tid": -914061504, "ts": 1716454222998678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223103740, "dur": 1360, "args": { "External id": 110398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110398, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110398, "pid": 5, "tid": 7, "ts": 1716454223103740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998679, "dur": 5, "args": { "External id": 110398, "cbid": 211, "correlation": 110398 } }, { "ph": "s", "id": 110398, "pid": 76337, "tid": -914061504, "ts": 1716454222998679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223105102, "dur": 59, "args": { "External id": 110403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110403, "pid": 5, "tid": 7, "ts": 1716454223105102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998706, "dur": 9, "args": { "External id": 110403, "cbid": 211, "correlation": 110403 } }, { "ph": "s", "id": 110403, "pid": 76337, "tid": -914061504, "ts": 1716454222998706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223105162, "dur": 3, "args": { "External id": 110411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110411, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 110411, "pid": 5, "tid": 7, "ts": 1716454223105162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998750, "dur": 9, "args": { "External id": 110411, "cbid": 211, "correlation": 110411 } }, { "ph": "s", "id": 110411, "pid": 76337, "tid": -914061504, "ts": 1716454222998750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998814, "dur": 1, "args": { "External id": 110427, "cbid": 251, "correlation": 110427 } }, { "ph": "f", "id": 110427, "pid": 76337, "tid": -914061504, "ts": 1716454222998814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222998820, "dur": 0, "args": { "External id": 110429, "cbid": 251, "correlation": 110429 } }, { "ph": "f", "id": 110429, "pid": 76337, "tid": -914061504, "ts": 1716454222998820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223105167, "dur": 11, "args": { "External id": 110430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110430, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 110430, "pid": 5, "tid": 7, "ts": 1716454223105167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998822, "dur": 12, "args": { "External id": 110430, "cbid": 211, "correlation": 110430 } }, { "ph": "s", "id": 110430, "pid": 76337, "tid": -914061504, "ts": 1716454222998822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223105180, "dur": 5, "args": { "External id": 110432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110432, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110432, "pid": 5, "tid": 7, "ts": 1716454223105180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998836, "dur": 5, "args": { "External id": 110432, "cbid": 211, "correlation": 110432 } }, { "ph": "s", "id": 110432, "pid": 76337, "tid": -914061504, "ts": 1716454222998836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223105186, "dur": 54, "args": { "External id": 110442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110442, "pid": 5, "tid": 7, "ts": 1716454223105186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998893, "dur": 12, "args": { "External id": 110442, "cbid": 211, "correlation": 110442 } }, { "ph": "s", "id": 110442, "pid": 76337, "tid": -914061504, "ts": 1716454222998893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223105242, "dur": 51, "args": { "External id": 110462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110462, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 110462, "pid": 5, "tid": 7, "ts": 1716454223105242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998959, "dur": 10, "args": { "External id": 110462, "cbid": 211, "correlation": 110462 } }, { "ph": "s", "id": 110462, "pid": 76337, "tid": -914061504, "ts": 1716454222998959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223105294, "dur": 4, "args": { "External id": 110474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110474, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 110474, "pid": 5, "tid": 7, "ts": 1716454223105294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222998988, "dur": 7, "args": { "External id": 110474, "cbid": 211, "correlation": 110474 } }, { "ph": "s", "id": 110474, "pid": 76337, "tid": -914061504, "ts": 1716454222998988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223105299, "dur": 55, "args": { "External id": 110477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110477, "pid": 5, "tid": 7, "ts": 1716454223105299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999006, "dur": 7, "args": { "External id": 110477, "cbid": 211, "correlation": 110477 } }, { "ph": "s", "id": 110477, "pid": 76337, "tid": -914061504, "ts": 1716454222999006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223105356, "dur": 37, "args": { "External id": 110486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110486, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110486, "pid": 5, "tid": 7, "ts": 1716454223105356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999048, "dur": 10, "args": { "External id": 110486, "cbid": 211, "correlation": 110486 } }, { "ph": "s", "id": 110486, "pid": 76337, "tid": -914061504, "ts": 1716454222999048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222999111, "dur": 0, "args": { "External id": 110496, "cbid": 317, "correlation": 110496 } }, { "ph": "f", "id": 110496, "pid": 76337, "tid": -914061504, "ts": 1716454222999111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222999112, "dur": 0, "args": { "External id": 110497, "cbid": 203, "correlation": 110497 } }, { "ph": "f", "id": 110497, "pid": 76337, "tid": -914061504, "ts": 1716454222999112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222999112, "dur": 0, "args": { "External id": 110498, "cbid": 205, "correlation": 110498 } }, { "ph": "f", "id": 110498, "pid": 76337, "tid": -914061504, "ts": 1716454222999112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223105394, "dur": 39, "args": { "External id": 110502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110502, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110502, "pid": 5, "tid": 7, "ts": 1716454223105394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999127, "dur": 12, "args": { "External id": 110502, "cbid": 211, "correlation": 110502 } }, { "ph": "s", "id": 110502, "pid": 76337, "tid": -914061504, "ts": 1716454222999127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223105435, "dur": 15, "args": { "External id": 110504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110504, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110504, "pid": 5, "tid": 7, "ts": 1716454223105435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999141, "dur": 5, "args": { "External id": 110504, "cbid": 211, "correlation": 110504 } }, { "ph": "s", "id": 110504, "pid": 76337, "tid": -914061504, "ts": 1716454222999141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223105451, "dur": 3, "args": { "External id": 110506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110506, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 110506, "pid": 5, "tid": 7, "ts": 1716454223105451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999151, "dur": 5, "args": { "External id": 110506, "cbid": 211, "correlation": 110506 } }, { "ph": "s", "id": 110506, "pid": 76337, "tid": -914061504, "ts": 1716454222999151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454222999159, "dur": 0, "args": { "External id": 110507, "cbid": 51, "correlation": 110507 } }, { "ph": "s", "id": 110507, "pid": 76337, "tid": -914061504, "ts": 1716454222999159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223105455, "dur": 694, "args": { "External id": 110508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110508, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110508, "pid": 5, "tid": 7, "ts": 1716454223105455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999160, "dur": 5, "args": { "External id": 110508, "cbid": 211, "correlation": 110508 } }, { "ph": "s", "id": 110508, "pid": 76337, "tid": -914061504, "ts": 1716454222999160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223106150, "dur": 59, "args": { "External id": 110513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110513, "pid": 5, "tid": 7, "ts": 1716454223106150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999187, "dur": 8, "args": { "External id": 110513, "cbid": 211, "correlation": 110513 } }, { "ph": "s", "id": 110513, "pid": 76337, "tid": -914061504, "ts": 1716454222999187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222999244, "dur": 0, "args": { "External id": 110523, "cbid": 317, "correlation": 110523 } }, { "ph": "f", "id": 110523, "pid": 76337, "tid": -914061504, "ts": 1716454222999244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222999245, "dur": 0, "args": { "External id": 110524, "cbid": 203, "correlation": 110524 } }, { "ph": "f", "id": 110524, "pid": 76337, "tid": -914061504, "ts": 1716454222999245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222999246, "dur": 0, "args": { "External id": 110525, "cbid": 205, "correlation": 110525 } }, { "ph": "f", "id": 110525, "pid": 76337, "tid": -914061504, "ts": 1716454222999246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223106210, "dur": 74, "args": { "External id": 110529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110529, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110529, "pid": 5, "tid": 7, "ts": 1716454223106210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999258, "dur": 11, "args": { "External id": 110529, "cbid": 211, "correlation": 110529 } }, { "ph": "s", "id": 110529, "pid": 76337, "tid": -914061504, "ts": 1716454222999258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223106286, "dur": 207, "args": { "External id": 110531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110531, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110531, "pid": 5, "tid": 7, "ts": 1716454223106286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999278, "dur": 8, "args": { "External id": 110531, "cbid": 211, "correlation": 110531 } }, { "ph": "s", "id": 110531, "pid": 76337, "tid": -914061504, "ts": 1716454222999278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223106494, "dur": 38, "args": { "External id": 110533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110533, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110533, "pid": 5, "tid": 7, "ts": 1716454223106494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999290, "dur": 6, "args": { "External id": 110533, "cbid": 211, "correlation": 110533 } }, { "ph": "s", "id": 110533, "pid": 76337, "tid": -914061504, "ts": 1716454222999290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223106533, "dur": 59, "args": { "External id": 110539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110539, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110539, "pid": 5, "tid": 7, "ts": 1716454223106533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999316, "dur": 10, "args": { "External id": 110539, "cbid": 211, "correlation": 110539 } }, { "ph": "s", "id": 110539, "pid": 76337, "tid": -914061504, "ts": 1716454222999316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223106594, "dur": 50, "args": { "External id": 110547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110547, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110547, "pid": 5, "tid": 7, "ts": 1716454223106594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999347, "dur": 8, "args": { "External id": 110547, "cbid": 211, "correlation": 110547 } }, { "ph": "s", "id": 110547, "pid": 76337, "tid": -914061504, "ts": 1716454222999347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223106645, "dur": 36, "args": { "External id": 110555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110555, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110555, "pid": 5, "tid": 7, "ts": 1716454223106645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999377, "dur": 8, "args": { "External id": 110555, "cbid": 211, "correlation": 110555 } }, { "ph": "s", "id": 110555, "pid": 76337, "tid": -914061504, "ts": 1716454222999377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223106682, "dur": 53, "args": { "External id": 110575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110575, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 110575, "pid": 5, "tid": 7, "ts": 1716454223106682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999459, "dur": 13, "args": { "External id": 110575, "cbid": 211, "correlation": 110575 } }, { "ph": "s", "id": 110575, "pid": 76337, "tid": -914061504, "ts": 1716454222999459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223106737, "dur": 5, "args": { "External id": 110587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110587, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 110587, "pid": 5, "tid": 7, "ts": 1716454223106737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999483, "dur": 6, "args": { "External id": 110587, "cbid": 211, "correlation": 110587 } }, { "ph": "s", "id": 110587, "pid": 76337, "tid": -914061504, "ts": 1716454222999483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223106743, "dur": 55, "args": { "External id": 110590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110590, "pid": 5, "tid": 7, "ts": 1716454223106743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999500, "dur": 6, "args": { "External id": 110590, "cbid": 211, "correlation": 110590 } }, { "ph": "s", "id": 110590, "pid": 76337, "tid": -914061504, "ts": 1716454222999500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454222999557, "dur": 0, "args": { "External id": 110601, "cbid": 317, "correlation": 110601 } }, { "ph": "f", "id": 110601, "pid": 76337, "tid": -914061504, "ts": 1716454222999557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454222999557, "dur": 0, "args": { "External id": 110602, "cbid": 203, "correlation": 110602 } }, { "ph": "f", "id": 110602, "pid": 76337, "tid": -914061504, "ts": 1716454222999557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454222999558, "dur": 0, "args": { "External id": 110603, "cbid": 205, "correlation": 110603 } }, { "ph": "f", "id": 110603, "pid": 76337, "tid": -914061504, "ts": 1716454222999558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999581, "dur": 1, "args": { "External id": 110607, "cbid": 251, "correlation": 110607 } }, { "ph": "f", "id": 110607, "pid": 76337, "tid": -914061504, "ts": 1716454222999581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999583, "dur": 0, "args": { "External id": 110608, "cbid": 251, "correlation": 110608 } }, { "ph": "f", "id": 110608, "pid": 76337, "tid": -914061504, "ts": 1716454222999583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999583, "dur": 0, "args": { "External id": 110609, "cbid": 251, "correlation": 110609 } }, { "ph": "f", "id": 110609, "pid": 76337, "tid": -914061504, "ts": 1716454222999583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999584, "dur": 0, "args": { "External id": 110610, "cbid": 251, "correlation": 110610 } }, { "ph": "f", "id": 110610, "pid": 76337, "tid": -914061504, "ts": 1716454222999584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999585, "dur": 0, "args": { "External id": 110611, "cbid": 251, "correlation": 110611 } }, { "ph": "f", "id": 110611, "pid": 76337, "tid": -914061504, "ts": 1716454222999585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999585, "dur": 0, "args": { "External id": 110612, "cbid": 251, "correlation": 110612 } }, { "ph": "f", "id": 110612, "pid": 76337, "tid": -914061504, "ts": 1716454222999585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999586, "dur": 0, "args": { "External id": 110613, "cbid": 251, "correlation": 110613 } }, { "ph": "f", "id": 110613, "pid": 76337, "tid": -914061504, "ts": 1716454222999586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999587, "dur": 0, "args": { "External id": 110614, "cbid": 251, "correlation": 110614 } }, { "ph": "f", "id": 110614, "pid": 76337, "tid": -914061504, "ts": 1716454222999587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999588, "dur": 0, "args": { "External id": 110615, "cbid": 251, "correlation": 110615 } }, { "ph": "f", "id": 110615, "pid": 76337, "tid": -914061504, "ts": 1716454222999588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223106799, "dur": 111, "args": { "External id": 110616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110616, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110616, "pid": 5, "tid": 7, "ts": 1716454223106799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999590, "dur": 12, "args": { "External id": 110616, "cbid": 211, "correlation": 110616 } }, { "ph": "s", "id": 110616, "pid": 76337, "tid": -914061504, "ts": 1716454222999590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223106912, "dur": 59, "args": { "External id": 110622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110622, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110622, "pid": 5, "tid": 7, "ts": 1716454223106912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999626, "dur": 9, "args": { "External id": 110622, "cbid": 211, "correlation": 110622 } }, { "ph": "s", "id": 110622, "pid": 76337, "tid": -914061504, "ts": 1716454222999626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223106972, "dur": 556, "args": { "External id": 110631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110631, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110631, "pid": 5, "tid": 7, "ts": 1716454223106972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999707, "dur": 14, "args": { "External id": 110631, "cbid": 211, "correlation": 110631 } }, { "ph": "s", "id": 110631, "pid": 76337, "tid": -914061504, "ts": 1716454222999707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223107529, "dur": 181, "args": { "External id": 110653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110653, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110653, "pid": 5, "tid": 7, "ts": 1716454223107529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999764, "dur": 11, "args": { "External id": 110653, "cbid": 211, "correlation": 110653 } }, { "ph": "s", "id": 110653, "pid": 76337, "tid": -914061504, "ts": 1716454222999764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999851, "dur": 1, "args": { "External id": 110664, "cbid": 251, "correlation": 110664 } }, { "ph": "f", "id": 110664, "pid": 76337, "tid": -914061504, "ts": 1716454222999851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223107712, "dur": 196, "args": { "External id": 110665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110665, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110665, "pid": 5, "tid": 7, "ts": 1716454223107712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999856, "dur": 13, "args": { "External id": 110665, "cbid": 211, "correlation": 110665 } }, { "ph": "s", "id": 110665, "pid": 76337, "tid": -914061504, "ts": 1716454222999856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454222999924, "dur": 1, "args": { "External id": 110676, "cbid": 251, "correlation": 110676 } }, { "ph": "f", "id": 110676, "pid": 76337, "tid": -914061504, "ts": 1716454222999924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223107909, "dur": 191, "args": { "External id": 110677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110677, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110677, "pid": 5, "tid": 7, "ts": 1716454223107909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454222999928, "dur": 11, "args": { "External id": 110677, "cbid": 211, "correlation": 110677 } }, { "ph": "s", "id": 110677, "pid": 76337, "tid": -914061504, "ts": 1716454222999928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223000000, "dur": 1, "args": { "External id": 110688, "cbid": 251, "correlation": 110688 } }, { "ph": "f", "id": 110688, "pid": 76337, "tid": -914061504, "ts": 1716454223000000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223108101, "dur": 187, "args": { "External id": 110689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110689, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110689, "pid": 5, "tid": 7, "ts": 1716454223108101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000004, "dur": 11, "args": { "External id": 110689, "cbid": 211, "correlation": 110689 } }, { "ph": "s", "id": 110689, "pid": 76337, "tid": -914061504, "ts": 1716454223000004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223108289, "dur": 18578, "args": { "External id": 110710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110710, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110710, "pid": 5, "tid": 7, "ts": 1716454223108289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000087, "dur": 12, "args": { "External id": 110710, "cbid": 211, "correlation": 110710 } }, { "ph": "s", "id": 110710, "pid": 76337, "tid": -914061504, "ts": 1716454223000087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223000184, "dur": 1, "args": { "External id": 110728, "cbid": 251, "correlation": 110728 } }, { "ph": "f", "id": 110728, "pid": 76337, "tid": -914061504, "ts": 1716454223000184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223126868, "dur": 204, "args": { "External id": 110730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110730, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110730, "pid": 5, "tid": 7, "ts": 1716454223126868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000190, "dur": 13, "args": { "External id": 110730, "cbid": 211, "correlation": 110730 } }, { "ph": "s", "id": 110730, "pid": 76337, "tid": -914061504, "ts": 1716454223000190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223127073, "dur": 67, "args": { "External id": 110738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110738, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110738, "pid": 5, "tid": 7, "ts": 1716454223127073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000259, "dur": 12, "args": { "External id": 110738, "cbid": 211, "correlation": 110738 } }, { "ph": "s", "id": 110738, "pid": 76337, "tid": -914061504, "ts": 1716454223000259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223127141, "dur": 97, "args": { "External id": 110746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110746, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110746, "pid": 5, "tid": 7, "ts": 1716454223127141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000299, "dur": 8, "args": { "External id": 110746, "cbid": 211, "correlation": 110746 } }, { "ph": "s", "id": 110746, "pid": 76337, "tid": -914061504, "ts": 1716454223000299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223127240, "dur": 55, "args": { "External id": 110757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110757, "pid": 5, "tid": 7, "ts": 1716454223127240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000369, "dur": 13, "args": { "External id": 110757, "cbid": 211, "correlation": 110757 } }, { "ph": "s", "id": 110757, "pid": 76337, "tid": -914061504, "ts": 1716454223000369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223127297, "dur": 92, "args": { "External id": 110779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110779, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110779, "pid": 5, "tid": 7, "ts": 1716454223127297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223000401, "dur": 1219, "args": { "External id": 110779, "cbid": 211, "correlation": 110779 } }, { "ph": "s", "id": 110779, "pid": 76337, "tid": -914061504, "ts": 1716454223000401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223001698, "dur": 1, "args": { "External id": 110790, "cbid": 251, "correlation": 110790 } }, { "ph": "f", "id": 110790, "pid": 76337, "tid": -914061504, "ts": 1716454223001698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223127390, "dur": 106, "args": { "External id": 110791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110791, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110791, "pid": 5, "tid": 7, "ts": 1716454223127390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223001704, "dur": 64, "args": { "External id": 110791, "cbid": 211, "correlation": 110791 } }, { "ph": "s", "id": 110791, "pid": 76337, "tid": -914061504, "ts": 1716454223001704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223001828, "dur": 1, "args": { "External id": 110802, "cbid": 251, "correlation": 110802 } }, { "ph": "f", "id": 110802, "pid": 76337, "tid": -914061504, "ts": 1716454223001828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223001832, "dur": 0, "args": { "External id": 110803, "cbid": 251, "correlation": 110803 } }, { "ph": "f", "id": 110803, "pid": 76337, "tid": -914061504, "ts": 1716454223001832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223127498, "dur": 10, "args": { "External id": 110804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110804, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 110804, "pid": 5, "tid": 7, "ts": 1716454223127498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223001834, "dur": 12, "args": { "External id": 110804, "cbid": 211, "correlation": 110804 } }, { "ph": "s", "id": 110804, "pid": 76337, "tid": -914061504, "ts": 1716454223001834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223127509, "dur": 5, "args": { "External id": 110806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110806, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110806, "pid": 5, "tid": 7, "ts": 1716454223127509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223001847, "dur": 6, "args": { "External id": 110806, "cbid": 211, "correlation": 110806 } }, { "ph": "s", "id": 110806, "pid": 76337, "tid": -914061504, "ts": 1716454223001847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223001908, "dur": 1, "args": { "External id": 110817, "cbid": 251, "correlation": 110817 } }, { "ph": "f", "id": 110817, "pid": 76337, "tid": -914061504, "ts": 1716454223001908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223001911, "dur": 0, "args": { "External id": 110818, "cbid": 251, "correlation": 110818 } }, { "ph": "f", "id": 110818, "pid": 76337, "tid": -914061504, "ts": 1716454223001911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223127516, "dur": 6, "args": { "External id": 110819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110819, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 110819, "pid": 5, "tid": 7, "ts": 1716454223127516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223001913, "dur": 12, "args": { "External id": 110819, "cbid": 211, "correlation": 110819 } }, { "ph": "s", "id": 110819, "pid": 76337, "tid": -914061504, "ts": 1716454223001913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223127523, "dur": 3, "args": { "External id": 110821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110821, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 110821, "pid": 5, "tid": 7, "ts": 1716454223127523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223001926, "dur": 6, "args": { "External id": 110821, "cbid": 211, "correlation": 110821 } }, { "ph": "s", "id": 110821, "pid": 76337, "tid": -914061504, "ts": 1716454223001926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223127528, "dur": 157, "args": { "External id": 110842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110842, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110842, "pid": 5, "tid": 7, "ts": 1716454223127528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002009, "dur": 13, "args": { "External id": 110842, "cbid": 211, "correlation": 110842 } }, { "ph": "s", "id": 110842, "pid": 76337, "tid": -914061504, "ts": 1716454223002009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002108, "dur": 1, "args": { "External id": 110860, "cbid": 251, "correlation": 110860 } }, { "ph": "f", "id": 110860, "pid": 76337, "tid": -914061504, "ts": 1716454223002108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223127686, "dur": 106, "args": { "External id": 110862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110862, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110862, "pid": 5, "tid": 7, "ts": 1716454223127686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002114, "dur": 14, "args": { "External id": 110862, "cbid": 211, "correlation": 110862 } }, { "ph": "s", "id": 110862, "pid": 76337, "tid": -914061504, "ts": 1716454223002114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223127793, "dur": 34, "args": { "External id": 110870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110870, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110870, "pid": 5, "tid": 7, "ts": 1716454223127793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002185, "dur": 12, "args": { "External id": 110870, "cbid": 211, "correlation": 110870 } }, { "ph": "s", "id": 110870, "pid": 76337, "tid": -914061504, "ts": 1716454223002185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223127829, "dur": 67, "args": { "External id": 110878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110878, "pid": 5, "tid": 7, "ts": 1716454223127829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002225, "dur": 9, "args": { "External id": 110878, "cbid": 211, "correlation": 110878 } }, { "ph": "s", "id": 110878, "pid": 76337, "tid": -914061504, "ts": 1716454223002225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223127897, "dur": 92, "args": { "External id": 110900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110900, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110900, "pid": 5, "tid": 7, "ts": 1716454223127897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002275, "dur": 11, "args": { "External id": 110900, "cbid": 211, "correlation": 110900 } }, { "ph": "s", "id": 110900, "pid": 76337, "tid": -914061504, "ts": 1716454223002275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002361, "dur": 1, "args": { "External id": 110916, "cbid": 251, "correlation": 110916 } }, { "ph": "f", "id": 110916, "pid": 76337, "tid": -914061504, "ts": 1716454223002361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223127990, "dur": 573, "args": { "External id": 110918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110918, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 110918, "pid": 5, "tid": 7, "ts": 1716454223127990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002366, "dur": 13, "args": { "External id": 110918, "cbid": 211, "correlation": 110918 } }, { "ph": "s", "id": 110918, "pid": 76337, "tid": -914061504, "ts": 1716454223002366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223128565, "dur": 244, "args": { "External id": 110926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110926, "pid": 5, "tid": 7, "ts": 1716454223128565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002432, "dur": 12, "args": { "External id": 110926, "cbid": 211, "correlation": 110926 } }, { "ph": "s", "id": 110926, "pid": 76337, "tid": -914061504, "ts": 1716454223002432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223128810, "dur": 252, "args": { "External id": 110934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110934, "pid": 5, "tid": 7, "ts": 1716454223128810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002462, "dur": 8, "args": { "External id": 110934, "cbid": 211, "correlation": 110934 } }, { "ph": "s", "id": 110934, "pid": 76337, "tid": -914061504, "ts": 1716454223002462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002545, "dur": 1, "args": { "External id": 110950, "cbid": 251, "correlation": 110950 } }, { "ph": "f", "id": 110950, "pid": 76337, "tid": -914061504, "ts": 1716454223002545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002550, "dur": 0, "args": { "External id": 110952, "cbid": 251, "correlation": 110952 } }, { "ph": "f", "id": 110952, "pid": 76337, "tid": -914061504, "ts": 1716454223002550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223129063, "dur": 359, "args": { "External id": 110953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110953, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 110953, "pid": 5, "tid": 7, "ts": 1716454223129063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002553, "dur": 13, "args": { "External id": 110953, "cbid": 211, "correlation": 110953 } }, { "ph": "s", "id": 110953, "pid": 76337, "tid": -914061504, "ts": 1716454223002553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223129423, "dur": 50, "args": { "External id": 110961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110961, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110961, "pid": 5, "tid": 7, "ts": 1716454223129423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002595, "dur": 10, "args": { "External id": 110961, "cbid": 211, "correlation": 110961 } }, { "ph": "s", "id": 110961, "pid": 76337, "tid": -914061504, "ts": 1716454223002595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223129475, "dur": 156, "args": { "External id": 110972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 110972, "pid": 5, "tid": 7, "ts": 1716454223129475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002661, "dur": 200, "args": { "External id": 110972, "cbid": 211, "correlation": 110972 } }, { "ph": "s", "id": 110972, "pid": 76337, "tid": -914061504, "ts": 1716454223002661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223002915, "dur": 0, "args": { "External id": 110984, "cbid": 317, "correlation": 110984 } }, { "ph": "f", "id": 110984, "pid": 76337, "tid": -914061504, "ts": 1716454223002915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223002916, "dur": 0, "args": { "External id": 110985, "cbid": 203, "correlation": 110985 } }, { "ph": "f", "id": 110985, "pid": 76337, "tid": -914061504, "ts": 1716454223002916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223002917, "dur": 0, "args": { "External id": 110986, "cbid": 205, "correlation": 110986 } }, { "ph": "f", "id": 110986, "pid": 76337, "tid": -914061504, "ts": 1716454223002917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002939, "dur": 1, "args": { "External id": 110990, "cbid": 251, "correlation": 110990 } }, { "ph": "f", "id": 110990, "pid": 76337, "tid": -914061504, "ts": 1716454223002939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002941, "dur": 0, "args": { "External id": 110991, "cbid": 251, "correlation": 110991 } }, { "ph": "f", "id": 110991, "pid": 76337, "tid": -914061504, "ts": 1716454223002941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002942, "dur": 0, "args": { "External id": 110992, "cbid": 251, "correlation": 110992 } }, { "ph": "f", "id": 110992, "pid": 76337, "tid": -914061504, "ts": 1716454223002942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002943, "dur": 0, "args": { "External id": 110993, "cbid": 251, "correlation": 110993 } }, { "ph": "f", "id": 110993, "pid": 76337, "tid": -914061504, "ts": 1716454223002943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002943, "dur": 0, "args": { "External id": 110994, "cbid": 251, "correlation": 110994 } }, { "ph": "f", "id": 110994, "pid": 76337, "tid": -914061504, "ts": 1716454223002943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002944, "dur": 0, "args": { "External id": 110995, "cbid": 251, "correlation": 110995 } }, { "ph": "f", "id": 110995, "pid": 76337, "tid": -914061504, "ts": 1716454223002944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002945, "dur": 0, "args": { "External id": 110996, "cbid": 251, "correlation": 110996 } }, { "ph": "f", "id": 110996, "pid": 76337, "tid": -914061504, "ts": 1716454223002945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002946, "dur": 0, "args": { "External id": 110997, "cbid": 251, "correlation": 110997 } }, { "ph": "f", "id": 110997, "pid": 76337, "tid": -914061504, "ts": 1716454223002946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223002947, "dur": 0, "args": { "External id": 110998, "cbid": 251, "correlation": 110998 } }, { "ph": "f", "id": 110998, "pid": 76337, "tid": -914061504, "ts": 1716454223002947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223129632, "dur": 115, "args": { "External id": 110999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 110999, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 110999, "pid": 5, "tid": 7, "ts": 1716454223129632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223002949, "dur": 38, "args": { "External id": 110999, "cbid": 211, "correlation": 110999 } }, { "ph": "s", "id": 110999, "pid": 76337, "tid": -914061504, "ts": 1716454223002949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223129748, "dur": 59, "args": { "External id": 111005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111005, "pid": 5, "tid": 7, "ts": 1716454223129748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003011, "dur": 102, "args": { "External id": 111005, "cbid": 211, "correlation": 111005 } }, { "ph": "s", "id": 111005, "pid": 76337, "tid": -914061504, "ts": 1716454223003011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223129809, "dur": 51, "args": { "External id": 111013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111013, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111013, "pid": 5, "tid": 7, "ts": 1716454223129809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003137, "dur": 277, "args": { "External id": 111013, "cbid": 211, "correlation": 111013 } }, { "ph": "s", "id": 111013, "pid": 76337, "tid": -914061504, "ts": 1716454223003137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223129861, "dur": 99, "args": { "External id": 111022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111022, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111022, "pid": 5, "tid": 7, "ts": 1716454223129861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003444, "dur": 11, "args": { "External id": 111022, "cbid": 211, "correlation": 111022 } }, { "ph": "s", "id": 111022, "pid": 76337, "tid": -914061504, "ts": 1716454223003444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223129961, "dur": 92, "args": { "External id": 111042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111042, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 111042, "pid": 5, "tid": 7, "ts": 1716454223129961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003516, "dur": 12, "args": { "External id": 111042, "cbid": 211, "correlation": 111042 } }, { "ph": "s", "id": 111042, "pid": 76337, "tid": -914061504, "ts": 1716454223003516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223130054, "dur": 5, "args": { "External id": 111054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111054, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 111054, "pid": 5, "tid": 7, "ts": 1716454223130054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003537, "dur": 10, "args": { "External id": 111054, "cbid": 211, "correlation": 111054 } }, { "ph": "s", "id": 111054, "pid": 76337, "tid": -914061504, "ts": 1716454223003537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223130060, "dur": 108, "args": { "External id": 111057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111057, "pid": 5, "tid": 7, "ts": 1716454223130060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003560, "dur": 107, "args": { "External id": 111057, "cbid": 211, "correlation": 111057 } }, { "ph": "s", "id": 111057, "pid": 76337, "tid": -914061504, "ts": 1716454223003560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223130169, "dur": 69, "args": { "External id": 111066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111066, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111066, "pid": 5, "tid": 7, "ts": 1716454223130169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003700, "dur": 10, "args": { "External id": 111066, "cbid": 211, "correlation": 111066 } }, { "ph": "s", "id": 111066, "pid": 76337, "tid": -914061504, "ts": 1716454223003700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223003752, "dur": 0, "args": { "External id": 111076, "cbid": 317, "correlation": 111076 } }, { "ph": "f", "id": 111076, "pid": 76337, "tid": -914061504, "ts": 1716454223003752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223003753, "dur": 0, "args": { "External id": 111077, "cbid": 203, "correlation": 111077 } }, { "ph": "f", "id": 111077, "pid": 76337, "tid": -914061504, "ts": 1716454223003753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223003753, "dur": 0, "args": { "External id": 111078, "cbid": 205, "correlation": 111078 } }, { "ph": "f", "id": 111078, "pid": 76337, "tid": -914061504, "ts": 1716454223003753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223130240, "dur": 75, "args": { "External id": 111082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111082, "pid": 5, "tid": 7, "ts": 1716454223130240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003768, "dur": 12, "args": { "External id": 111082, "cbid": 211, "correlation": 111082 } }, { "ph": "s", "id": 111082, "pid": 76337, "tid": -914061504, "ts": 1716454223003768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223130316, "dur": 24, "args": { "External id": 111084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111084, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111084, "pid": 5, "tid": 7, "ts": 1716454223130316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003782, "dur": 5, "args": { "External id": 111084, "cbid": 211, "correlation": 111084 } }, { "ph": "s", "id": 111084, "pid": 76337, "tid": -914061504, "ts": 1716454223003782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223130341, "dur": 3, "args": { "External id": 111086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111086, "pid": 5, "tid": 7, "ts": 1716454223130341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003791, "dur": 5, "args": { "External id": 111086, "cbid": 211, "correlation": 111086 } }, { "ph": "s", "id": 111086, "pid": 76337, "tid": -914061504, "ts": 1716454223003791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223003800, "dur": 0, "args": { "External id": 111087, "cbid": 51, "correlation": 111087 } }, { "ph": "s", "id": 111087, "pid": 76337, "tid": -914061504, "ts": 1716454223003800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223130346, "dur": 1359, "args": { "External id": 111088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111088, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111088, "pid": 5, "tid": 7, "ts": 1716454223130346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003801, "dur": 5, "args": { "External id": 111088, "cbid": 211, "correlation": 111088 } }, { "ph": "s", "id": 111088, "pid": 76337, "tid": -914061504, "ts": 1716454223003801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223131706, "dur": 59, "args": { "External id": 111093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111093, "pid": 5, "tid": 7, "ts": 1716454223131706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003828, "dur": 9, "args": { "External id": 111093, "cbid": 211, "correlation": 111093 } }, { "ph": "s", "id": 111093, "pid": 76337, "tid": -914061504, "ts": 1716454223003828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223131766, "dur": 3, "args": { "External id": 111101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111101, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111101, "pid": 5, "tid": 7, "ts": 1716454223131766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003872, "dur": 10, "args": { "External id": 111101, "cbid": 211, "correlation": 111101 } }, { "ph": "s", "id": 111101, "pid": 76337, "tid": -914061504, "ts": 1716454223003872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223003937, "dur": 1, "args": { "External id": 111117, "cbid": 251, "correlation": 111117 } }, { "ph": "f", "id": 111117, "pid": 76337, "tid": -914061504, "ts": 1716454223003937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223003942, "dur": 0, "args": { "External id": 111119, "cbid": 251, "correlation": 111119 } }, { "ph": "f", "id": 111119, "pid": 76337, "tid": -914061504, "ts": 1716454223003942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223131771, "dur": 11, "args": { "External id": 111120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111120, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 111120, "pid": 5, "tid": 7, "ts": 1716454223131771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003944, "dur": 11, "args": { "External id": 111120, "cbid": 211, "correlation": 111120 } }, { "ph": "s", "id": 111120, "pid": 76337, "tid": -914061504, "ts": 1716454223003944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223131783, "dur": 5, "args": { "External id": 111122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111122, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 111122, "pid": 5, "tid": 7, "ts": 1716454223131783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223003957, "dur": 7, "args": { "External id": 111122, "cbid": 211, "correlation": 111122 } }, { "ph": "s", "id": 111122, "pid": 76337, "tid": -914061504, "ts": 1716454223003957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223131790, "dur": 54, "args": { "External id": 111132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111132, "pid": 5, "tid": 7, "ts": 1716454223131790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004024, "dur": 538, "args": { "External id": 111132, "cbid": 211, "correlation": 111132 } }, { "ph": "s", "id": 111132, "pid": 76337, "tid": -914061504, "ts": 1716454223004024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223131845, "dur": 52, "args": { "External id": 111152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111152, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 111152, "pid": 5, "tid": 7, "ts": 1716454223131845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004616, "dur": 12, "args": { "External id": 111152, "cbid": 211, "correlation": 111152 } }, { "ph": "s", "id": 111152, "pid": 76337, "tid": -914061504, "ts": 1716454223004616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223131898, "dur": 4, "args": { "External id": 111164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111164, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111164, "pid": 5, "tid": 7, "ts": 1716454223131898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004638, "dur": 6, "args": { "External id": 111164, "cbid": 211, "correlation": 111164 } }, { "ph": "s", "id": 111164, "pid": 76337, "tid": -914061504, "ts": 1716454223004638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223131903, "dur": 55, "args": { "External id": 111167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111167, "pid": 5, "tid": 7, "ts": 1716454223131903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004656, "dur": 7, "args": { "External id": 111167, "cbid": 211, "correlation": 111167 } }, { "ph": "s", "id": 111167, "pid": 76337, "tid": -914061504, "ts": 1716454223004656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223131960, "dur": 36, "args": { "External id": 111176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111176, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111176, "pid": 5, "tid": 7, "ts": 1716454223131960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004697, "dur": 10, "args": { "External id": 111176, "cbid": 211, "correlation": 111176 } }, { "ph": "s", "id": 111176, "pid": 76337, "tid": -914061504, "ts": 1716454223004697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223004761, "dur": 0, "args": { "External id": 111186, "cbid": 317, "correlation": 111186 } }, { "ph": "f", "id": 111186, "pid": 76337, "tid": -914061504, "ts": 1716454223004761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223004762, "dur": 0, "args": { "External id": 111187, "cbid": 203, "correlation": 111187 } }, { "ph": "f", "id": 111187, "pid": 76337, "tid": -914061504, "ts": 1716454223004762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223004763, "dur": 0, "args": { "External id": 111188, "cbid": 205, "correlation": 111188 } }, { "ph": "f", "id": 111188, "pid": 76337, "tid": -914061504, "ts": 1716454223004763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223131998, "dur": 40, "args": { "External id": 111192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111192, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111192, "pid": 5, "tid": 7, "ts": 1716454223131998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004777, "dur": 12, "args": { "External id": 111192, "cbid": 211, "correlation": 111192 } }, { "ph": "s", "id": 111192, "pid": 76337, "tid": -914061504, "ts": 1716454223004777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223132039, "dur": 14, "args": { "External id": 111194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111194, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111194, "pid": 5, "tid": 7, "ts": 1716454223132039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004792, "dur": 5, "args": { "External id": 111194, "cbid": 211, "correlation": 111194 } }, { "ph": "s", "id": 111194, "pid": 76337, "tid": -914061504, "ts": 1716454223004792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223132054, "dur": 3, "args": { "External id": 111196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111196, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111196, "pid": 5, "tid": 7, "ts": 1716454223132054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004801, "dur": 5, "args": { "External id": 111196, "cbid": 211, "correlation": 111196 } }, { "ph": "s", "id": 111196, "pid": 76337, "tid": -914061504, "ts": 1716454223004801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223004810, "dur": 0, "args": { "External id": 111197, "cbid": 51, "correlation": 111197 } }, { "ph": "s", "id": 111197, "pid": 76337, "tid": -914061504, "ts": 1716454223004810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223132059, "dur": 694, "args": { "External id": 111198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111198, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111198, "pid": 5, "tid": 7, "ts": 1716454223132059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004810, "dur": 5, "args": { "External id": 111198, "cbid": 211, "correlation": 111198 } }, { "ph": "s", "id": 111198, "pid": 76337, "tid": -914061504, "ts": 1716454223004810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223132755, "dur": 59, "args": { "External id": 111203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111203, "pid": 5, "tid": 7, "ts": 1716454223132755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004838, "dur": 8, "args": { "External id": 111203, "cbid": 211, "correlation": 111203 } }, { "ph": "s", "id": 111203, "pid": 76337, "tid": -914061504, "ts": 1716454223004838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223004895, "dur": 0, "args": { "External id": 111213, "cbid": 317, "correlation": 111213 } }, { "ph": "f", "id": 111213, "pid": 76337, "tid": -914061504, "ts": 1716454223004895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223004896, "dur": 0, "args": { "External id": 111214, "cbid": 203, "correlation": 111214 } }, { "ph": "f", "id": 111214, "pid": 76337, "tid": -914061504, "ts": 1716454223004896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223004897, "dur": 0, "args": { "External id": 111215, "cbid": 205, "correlation": 111215 } }, { "ph": "f", "id": 111215, "pid": 76337, "tid": -914061504, "ts": 1716454223004897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223132815, "dur": 75, "args": { "External id": 111219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111219, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111219, "pid": 5, "tid": 7, "ts": 1716454223132815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004908, "dur": 12, "args": { "External id": 111219, "cbid": 211, "correlation": 111219 } }, { "ph": "s", "id": 111219, "pid": 76337, "tid": -914061504, "ts": 1716454223004908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223132892, "dur": 211, "args": { "External id": 111221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111221, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111221, "pid": 5, "tid": 7, "ts": 1716454223132892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004926, "dur": 7, "args": { "External id": 111221, "cbid": 211, "correlation": 111221 } }, { "ph": "s", "id": 111221, "pid": 76337, "tid": -914061504, "ts": 1716454223004926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223133104, "dur": 39, "args": { "External id": 111223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111223, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111223, "pid": 5, "tid": 7, "ts": 1716454223133104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004938, "dur": 5, "args": { "External id": 111223, "cbid": 211, "correlation": 111223 } }, { "ph": "s", "id": 111223, "pid": 76337, "tid": -914061504, "ts": 1716454223004938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223133144, "dur": 60, "args": { "External id": 111229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111229, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111229, "pid": 5, "tid": 7, "ts": 1716454223133144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223004964, "dur": 508, "args": { "External id": 111229, "cbid": 211, "correlation": 111229 } }, { "ph": "s", "id": 111229, "pid": 76337, "tid": -914061504, "ts": 1716454223004964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223133205, "dur": 51, "args": { "External id": 111237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111237, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111237, "pid": 5, "tid": 7, "ts": 1716454223133205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005495, "dur": 9, "args": { "External id": 111237, "cbid": 211, "correlation": 111237 } }, { "ph": "s", "id": 111237, "pid": 76337, "tid": -914061504, "ts": 1716454223005495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223133257, "dur": 35, "args": { "External id": 111245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111245, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111245, "pid": 5, "tid": 7, "ts": 1716454223133257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005524, "dur": 8, "args": { "External id": 111245, "cbid": 211, "correlation": 111245 } }, { "ph": "s", "id": 111245, "pid": 76337, "tid": -914061504, "ts": 1716454223005524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223133294, "dur": 51, "args": { "External id": 111265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111265, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 111265, "pid": 5, "tid": 7, "ts": 1716454223133294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005607, "dur": 12, "args": { "External id": 111265, "cbid": 211, "correlation": 111265 } }, { "ph": "s", "id": 111265, "pid": 76337, "tid": -914061504, "ts": 1716454223005607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223133346, "dur": 5, "args": { "External id": 111277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111277, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111277, "pid": 5, "tid": 7, "ts": 1716454223133346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005629, "dur": 6, "args": { "External id": 111277, "cbid": 211, "correlation": 111277 } }, { "ph": "s", "id": 111277, "pid": 76337, "tid": -914061504, "ts": 1716454223005629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223133352, "dur": 55, "args": { "External id": 111280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111280, "pid": 5, "tid": 7, "ts": 1716454223133352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005647, "dur": 6, "args": { "External id": 111280, "cbid": 211, "correlation": 111280 } }, { "ph": "s", "id": 111280, "pid": 76337, "tid": -914061504, "ts": 1716454223005647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223005704, "dur": 0, "args": { "External id": 111291, "cbid": 317, "correlation": 111291 } }, { "ph": "f", "id": 111291, "pid": 76337, "tid": -914061504, "ts": 1716454223005704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223005705, "dur": 0, "args": { "External id": 111292, "cbid": 203, "correlation": 111292 } }, { "ph": "f", "id": 111292, "pid": 76337, "tid": -914061504, "ts": 1716454223005705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223005706, "dur": 0, "args": { "External id": 111293, "cbid": 205, "correlation": 111293 } }, { "ph": "f", "id": 111293, "pid": 76337, "tid": -914061504, "ts": 1716454223005706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005727, "dur": 1, "args": { "External id": 111297, "cbid": 251, "correlation": 111297 } }, { "ph": "f", "id": 111297, "pid": 76337, "tid": -914061504, "ts": 1716454223005727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005729, "dur": 0, "args": { "External id": 111298, "cbid": 251, "correlation": 111298 } }, { "ph": "f", "id": 111298, "pid": 76337, "tid": -914061504, "ts": 1716454223005729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005730, "dur": 0, "args": { "External id": 111299, "cbid": 251, "correlation": 111299 } }, { "ph": "f", "id": 111299, "pid": 76337, "tid": -914061504, "ts": 1716454223005730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005730, "dur": 0, "args": { "External id": 111300, "cbid": 251, "correlation": 111300 } }, { "ph": "f", "id": 111300, "pid": 76337, "tid": -914061504, "ts": 1716454223005730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005731, "dur": 0, "args": { "External id": 111301, "cbid": 251, "correlation": 111301 } }, { "ph": "f", "id": 111301, "pid": 76337, "tid": -914061504, "ts": 1716454223005731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005732, "dur": 0, "args": { "External id": 111302, "cbid": 251, "correlation": 111302 } }, { "ph": "f", "id": 111302, "pid": 76337, "tid": -914061504, "ts": 1716454223005732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005733, "dur": 0, "args": { "External id": 111303, "cbid": 251, "correlation": 111303 } }, { "ph": "f", "id": 111303, "pid": 76337, "tid": -914061504, "ts": 1716454223005733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005734, "dur": 0, "args": { "External id": 111304, "cbid": 251, "correlation": 111304 } }, { "ph": "f", "id": 111304, "pid": 76337, "tid": -914061504, "ts": 1716454223005734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223005735, "dur": 0, "args": { "External id": 111305, "cbid": 251, "correlation": 111305 } }, { "ph": "f", "id": 111305, "pid": 76337, "tid": -914061504, "ts": 1716454223005735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223133408, "dur": 115, "args": { "External id": 111306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111306, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 111306, "pid": 5, "tid": 7, "ts": 1716454223133408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005737, "dur": 14, "args": { "External id": 111306, "cbid": 211, "correlation": 111306 } }, { "ph": "s", "id": 111306, "pid": 76337, "tid": -914061504, "ts": 1716454223005737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223133525, "dur": 59, "args": { "External id": 111312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111312, "pid": 5, "tid": 7, "ts": 1716454223133525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005773, "dur": 8, "args": { "External id": 111312, "cbid": 211, "correlation": 111312 } }, { "ph": "s", "id": 111312, "pid": 76337, "tid": -914061504, "ts": 1716454223005773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223133585, "dur": 583, "args": { "External id": 111321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111321, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111321, "pid": 5, "tid": 7, "ts": 1716454223133585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005857, "dur": 14, "args": { "External id": 111321, "cbid": 211, "correlation": 111321 } }, { "ph": "s", "id": 111321, "pid": 76337, "tid": -914061504, "ts": 1716454223005857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223134169, "dur": 180, "args": { "External id": 111343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111343, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111343, "pid": 5, "tid": 7, "ts": 1716454223134169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223005914, "dur": 10, "args": { "External id": 111343, "cbid": 211, "correlation": 111343 } }, { "ph": "s", "id": 111343, "pid": 76337, "tid": -914061504, "ts": 1716454223005914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223006009, "dur": 1, "args": { "External id": 111354, "cbid": 251, "correlation": 111354 } }, { "ph": "f", "id": 111354, "pid": 76337, "tid": -914061504, "ts": 1716454223006009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223134351, "dur": 197, "args": { "External id": 111355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111355, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111355, "pid": 5, "tid": 7, "ts": 1716454223134351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006014, "dur": 13, "args": { "External id": 111355, "cbid": 211, "correlation": 111355 } }, { "ph": "s", "id": 111355, "pid": 76337, "tid": -914061504, "ts": 1716454223006014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223006084, "dur": 1, "args": { "External id": 111366, "cbid": 251, "correlation": 111366 } }, { "ph": "f", "id": 111366, "pid": 76337, "tid": -914061504, "ts": 1716454223006084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223134549, "dur": 188, "args": { "External id": 111367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111367, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111367, "pid": 5, "tid": 7, "ts": 1716454223134549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006087, "dur": 12, "args": { "External id": 111367, "cbid": 211, "correlation": 111367 } }, { "ph": "s", "id": 111367, "pid": 76337, "tid": -914061504, "ts": 1716454223006087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223006151, "dur": 1, "args": { "External id": 111378, "cbid": 251, "correlation": 111378 } }, { "ph": "f", "id": 111378, "pid": 76337, "tid": -914061504, "ts": 1716454223006151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223134739, "dur": 189, "args": { "External id": 111379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111379, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111379, "pid": 5, "tid": 7, "ts": 1716454223134739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006155, "dur": 11, "args": { "External id": 111379, "cbid": 211, "correlation": 111379 } }, { "ph": "s", "id": 111379, "pid": 76337, "tid": -914061504, "ts": 1716454223006155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223134929, "dur": 18543, "args": { "External id": 111400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111400, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 111400, "pid": 5, "tid": 7, "ts": 1716454223134929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006235, "dur": 14, "args": { "External id": 111400, "cbid": 211, "correlation": 111400 } }, { "ph": "s", "id": 111400, "pid": 76337, "tid": -914061504, "ts": 1716454223006235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223006332, "dur": 1, "args": { "External id": 111418, "cbid": 251, "correlation": 111418 } }, { "ph": "f", "id": 111418, "pid": 76337, "tid": -914061504, "ts": 1716454223006332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223153474, "dur": 204, "args": { "External id": 111420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111420, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111420, "pid": 5, "tid": 7, "ts": 1716454223153474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006338, "dur": 14, "args": { "External id": 111420, "cbid": 211, "correlation": 111420 } }, { "ph": "s", "id": 111420, "pid": 76337, "tid": -914061504, "ts": 1716454223006338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223153679, "dur": 66, "args": { "External id": 111428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111428, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111428, "pid": 5, "tid": 7, "ts": 1716454223153679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006409, "dur": 12, "args": { "External id": 111428, "cbid": 211, "correlation": 111428 } }, { "ph": "s", "id": 111428, "pid": 76337, "tid": -914061504, "ts": 1716454223006409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223153747, "dur": 97, "args": { "External id": 111436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111436, "pid": 5, "tid": 7, "ts": 1716454223153747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006450, "dur": 77, "args": { "External id": 111436, "cbid": 211, "correlation": 111436 } }, { "ph": "s", "id": 111436, "pid": 76337, "tid": -914061504, "ts": 1716454223006450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223153845, "dur": 54, "args": { "External id": 111447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111447, "pid": 5, "tid": 7, "ts": 1716454223153845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223006590, "dur": 1848, "args": { "External id": 111447, "cbid": 211, "correlation": 111447 } }, { "ph": "s", "id": 111447, "pid": 76337, "tid": -914061504, "ts": 1716454223006590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223153901, "dur": 92, "args": { "External id": 111469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111469, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111469, "pid": 5, "tid": 7, "ts": 1716454223153901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008458, "dur": 124, "args": { "External id": 111469, "cbid": 211, "correlation": 111469 } }, { "ph": "s", "id": 111469, "pid": 76337, "tid": -914061504, "ts": 1716454223008458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223008660, "dur": 1, "args": { "External id": 111480, "cbid": 251, "correlation": 111480 } }, { "ph": "f", "id": 111480, "pid": 76337, "tid": -914061504, "ts": 1716454223008660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223153994, "dur": 104, "args": { "External id": 111481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111481, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111481, "pid": 5, "tid": 7, "ts": 1716454223153994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008666, "dur": 13, "args": { "External id": 111481, "cbid": 211, "correlation": 111481 } }, { "ph": "s", "id": 111481, "pid": 76337, "tid": -914061504, "ts": 1716454223008666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223008739, "dur": 1, "args": { "External id": 111492, "cbid": 251, "correlation": 111492 } }, { "ph": "f", "id": 111492, "pid": 76337, "tid": -914061504, "ts": 1716454223008739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223008743, "dur": 0, "args": { "External id": 111493, "cbid": 251, "correlation": 111493 } }, { "ph": "f", "id": 111493, "pid": 76337, "tid": -914061504, "ts": 1716454223008743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223154099, "dur": 10, "args": { "External id": 111494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111494, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111494, "pid": 5, "tid": 7, "ts": 1716454223154099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008745, "dur": 12, "args": { "External id": 111494, "cbid": 211, "correlation": 111494 } }, { "ph": "s", "id": 111494, "pid": 76337, "tid": -914061504, "ts": 1716454223008745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223154111, "dur": 5, "args": { "External id": 111496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111496, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 111496, "pid": 5, "tid": 7, "ts": 1716454223154111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008759, "dur": 6, "args": { "External id": 111496, "cbid": 211, "correlation": 111496 } }, { "ph": "s", "id": 111496, "pid": 76337, "tid": -914061504, "ts": 1716454223008759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223008819, "dur": 1, "args": { "External id": 111507, "cbid": 251, "correlation": 111507 } }, { "ph": "f", "id": 111507, "pid": 76337, "tid": -914061504, "ts": 1716454223008819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223008823, "dur": 0, "args": { "External id": 111508, "cbid": 251, "correlation": 111508 } }, { "ph": "f", "id": 111508, "pid": 76337, "tid": -914061504, "ts": 1716454223008823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223154117, "dur": 6, "args": { "External id": 111509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111509, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111509, "pid": 5, "tid": 7, "ts": 1716454223154117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008825, "dur": 12, "args": { "External id": 111509, "cbid": 211, "correlation": 111509 } }, { "ph": "s", "id": 111509, "pid": 76337, "tid": -914061504, "ts": 1716454223008825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223154125, "dur": 3, "args": { "External id": 111511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111511, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 111511, "pid": 5, "tid": 7, "ts": 1716454223154125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008838, "dur": 7, "args": { "External id": 111511, "cbid": 211, "correlation": 111511 } }, { "ph": "s", "id": 111511, "pid": 76337, "tid": -914061504, "ts": 1716454223008838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223154130, "dur": 157, "args": { "External id": 111532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111532, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 111532, "pid": 5, "tid": 7, "ts": 1716454223154130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223008914, "dur": 12, "args": { "External id": 111532, "cbid": 211, "correlation": 111532 } }, { "ph": "s", "id": 111532, "pid": 76337, "tid": -914061504, "ts": 1716454223008914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009018, "dur": 1, "args": { "External id": 111550, "cbid": 251, "correlation": 111550 } }, { "ph": "f", "id": 111550, "pid": 76337, "tid": -914061504, "ts": 1716454223009018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223154288, "dur": 106, "args": { "External id": 111552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111552, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 111552, "pid": 5, "tid": 7, "ts": 1716454223154288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009024, "dur": 14, "args": { "External id": 111552, "cbid": 211, "correlation": 111552 } }, { "ph": "s", "id": 111552, "pid": 76337, "tid": -914061504, "ts": 1716454223009024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223154395, "dur": 35, "args": { "External id": 111560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111560, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111560, "pid": 5, "tid": 7, "ts": 1716454223154395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009096, "dur": 12, "args": { "External id": 111560, "cbid": 211, "correlation": 111560 } }, { "ph": "s", "id": 111560, "pid": 76337, "tid": -914061504, "ts": 1716454223009096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223154431, "dur": 67, "args": { "External id": 111568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111568, "pid": 5, "tid": 7, "ts": 1716454223154431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009136, "dur": 10, "args": { "External id": 111568, "cbid": 211, "correlation": 111568 } }, { "ph": "s", "id": 111568, "pid": 76337, "tid": -914061504, "ts": 1716454223009136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223154500, "dur": 92, "args": { "External id": 111590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111590, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111590, "pid": 5, "tid": 7, "ts": 1716454223154500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009187, "dur": 11, "args": { "External id": 111590, "cbid": 211, "correlation": 111590 } }, { "ph": "s", "id": 111590, "pid": 76337, "tid": -914061504, "ts": 1716454223009187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009273, "dur": 1, "args": { "External id": 111606, "cbid": 251, "correlation": 111606 } }, { "ph": "f", "id": 111606, "pid": 76337, "tid": -914061504, "ts": 1716454223009273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223154593, "dur": 569, "args": { "External id": 111608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111608, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 111608, "pid": 5, "tid": 7, "ts": 1716454223154593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009279, "dur": 13, "args": { "External id": 111608, "cbid": 211, "correlation": 111608 } }, { "ph": "s", "id": 111608, "pid": 76337, "tid": -914061504, "ts": 1716454223009279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223155163, "dur": 244, "args": { "External id": 111616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111616, "pid": 5, "tid": 7, "ts": 1716454223155163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009344, "dur": 12, "args": { "External id": 111616, "cbid": 211, "correlation": 111616 } }, { "ph": "s", "id": 111616, "pid": 76337, "tid": -914061504, "ts": 1716454223009344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223155409, "dur": 253, "args": { "External id": 111624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111624, "pid": 5, "tid": 7, "ts": 1716454223155409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009374, "dur": 8, "args": { "External id": 111624, "cbid": 211, "correlation": 111624 } }, { "ph": "s", "id": 111624, "pid": 76337, "tid": -914061504, "ts": 1716454223009374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009455, "dur": 1, "args": { "External id": 111640, "cbid": 251, "correlation": 111640 } }, { "ph": "f", "id": 111640, "pid": 76337, "tid": -914061504, "ts": 1716454223009455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009460, "dur": 0, "args": { "External id": 111642, "cbid": 251, "correlation": 111642 } }, { "ph": "f", "id": 111642, "pid": 76337, "tid": -914061504, "ts": 1716454223009460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223155663, "dur": 359, "args": { "External id": 111643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111643, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111643, "pid": 5, "tid": 7, "ts": 1716454223155663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009463, "dur": 12, "args": { "External id": 111643, "cbid": 211, "correlation": 111643 } }, { "ph": "s", "id": 111643, "pid": 76337, "tid": -914061504, "ts": 1716454223009463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156024, "dur": 50, "args": { "External id": 111651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111651, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111651, "pid": 5, "tid": 7, "ts": 1716454223156024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009506, "dur": 167, "args": { "External id": 111651, "cbid": 211, "correlation": 111651 } }, { "ph": "s", "id": 111651, "pid": 76337, "tid": -914061504, "ts": 1716454223009506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223156075, "dur": 157, "args": { "External id": 111662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111662, "pid": 5, "tid": 7, "ts": 1716454223156075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009731, "dur": 68, "args": { "External id": 111662, "cbid": 211, "correlation": 111662 } }, { "ph": "s", "id": 111662, "pid": 76337, "tid": -914061504, "ts": 1716454223009731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223009852, "dur": 0, "args": { "External id": 111674, "cbid": 317, "correlation": 111674 } }, { "ph": "f", "id": 111674, "pid": 76337, "tid": -914061504, "ts": 1716454223009852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223009853, "dur": 0, "args": { "External id": 111675, "cbid": 203, "correlation": 111675 } }, { "ph": "f", "id": 111675, "pid": 76337, "tid": -914061504, "ts": 1716454223009853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223009854, "dur": 0, "args": { "External id": 111676, "cbid": 205, "correlation": 111676 } }, { "ph": "f", "id": 111676, "pid": 76337, "tid": -914061504, "ts": 1716454223009854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009877, "dur": 1, "args": { "External id": 111680, "cbid": 251, "correlation": 111680 } }, { "ph": "f", "id": 111680, "pid": 76337, "tid": -914061504, "ts": 1716454223009877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009879, "dur": 0, "args": { "External id": 111681, "cbid": 251, "correlation": 111681 } }, { "ph": "f", "id": 111681, "pid": 76337, "tid": -914061504, "ts": 1716454223009879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009880, "dur": 0, "args": { "External id": 111682, "cbid": 251, "correlation": 111682 } }, { "ph": "f", "id": 111682, "pid": 76337, "tid": -914061504, "ts": 1716454223009880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009881, "dur": 0, "args": { "External id": 111683, "cbid": 251, "correlation": 111683 } }, { "ph": "f", "id": 111683, "pid": 76337, "tid": -914061504, "ts": 1716454223009881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009881, "dur": 0, "args": { "External id": 111684, "cbid": 251, "correlation": 111684 } }, { "ph": "f", "id": 111684, "pid": 76337, "tid": -914061504, "ts": 1716454223009881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009882, "dur": 0, "args": { "External id": 111685, "cbid": 251, "correlation": 111685 } }, { "ph": "f", "id": 111685, "pid": 76337, "tid": -914061504, "ts": 1716454223009882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009883, "dur": 0, "args": { "External id": 111686, "cbid": 251, "correlation": 111686 } }, { "ph": "f", "id": 111686, "pid": 76337, "tid": -914061504, "ts": 1716454223009883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009884, "dur": 0, "args": { "External id": 111687, "cbid": 251, "correlation": 111687 } }, { "ph": "f", "id": 111687, "pid": 76337, "tid": -914061504, "ts": 1716454223009884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223009885, "dur": 0, "args": { "External id": 111688, "cbid": 251, "correlation": 111688 } }, { "ph": "f", "id": 111688, "pid": 76337, "tid": -914061504, "ts": 1716454223009885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223156234, "dur": 114, "args": { "External id": 111689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111689, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 111689, "pid": 5, "tid": 7, "ts": 1716454223156234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009887, "dur": 42, "args": { "External id": 111689, "cbid": 211, "correlation": 111689 } }, { "ph": "s", "id": 111689, "pid": 76337, "tid": -914061504, "ts": 1716454223009887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223156349, "dur": 60, "args": { "External id": 111695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111695, "pid": 5, "tid": 7, "ts": 1716454223156349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223009952, "dur": 279, "args": { "External id": 111695, "cbid": 211, "correlation": 111695 } }, { "ph": "s", "id": 111695, "pid": 76337, "tid": -914061504, "ts": 1716454223009952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156410, "dur": 50, "args": { "External id": 111703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111703, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111703, "pid": 5, "tid": 7, "ts": 1716454223156410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010258, "dur": 10, "args": { "External id": 111703, "cbid": 211, "correlation": 111703 } }, { "ph": "s", "id": 111703, "pid": 76337, "tid": -914061504, "ts": 1716454223010258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223156462, "dur": 53, "args": { "External id": 111723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111723, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 111723, "pid": 5, "tid": 7, "ts": 1716454223156462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010340, "dur": 13, "args": { "External id": 111723, "cbid": 211, "correlation": 111723 } }, { "ph": "s", "id": 111723, "pid": 76337, "tid": -914061504, "ts": 1716454223010340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223156516, "dur": 5, "args": { "External id": 111735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111735, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111735, "pid": 5, "tid": 7, "ts": 1716454223156516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010363, "dur": 6, "args": { "External id": 111735, "cbid": 211, "correlation": 111735 } }, { "ph": "s", "id": 111735, "pid": 76337, "tid": -914061504, "ts": 1716454223010363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223156522, "dur": 57, "args": { "External id": 111738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111738, "pid": 5, "tid": 7, "ts": 1716454223156522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010381, "dur": 101, "args": { "External id": 111738, "cbid": 211, "correlation": 111738 } }, { "ph": "s", "id": 111738, "pid": 76337, "tid": -914061504, "ts": 1716454223010381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156580, "dur": 37, "args": { "External id": 111747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111747, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111747, "pid": 5, "tid": 7, "ts": 1716454223156580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010522, "dur": 10, "args": { "External id": 111747, "cbid": 211, "correlation": 111747 } }, { "ph": "s", "id": 111747, "pid": 76337, "tid": -914061504, "ts": 1716454223010522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223010578, "dur": 0, "args": { "External id": 111757, "cbid": 317, "correlation": 111757 } }, { "ph": "f", "id": 111757, "pid": 76337, "tid": -914061504, "ts": 1716454223010578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223010579, "dur": 0, "args": { "External id": 111758, "cbid": 203, "correlation": 111758 } }, { "ph": "f", "id": 111758, "pid": 76337, "tid": -914061504, "ts": 1716454223010579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223010580, "dur": 0, "args": { "External id": 111759, "cbid": 205, "correlation": 111759 } }, { "ph": "f", "id": 111759, "pid": 76337, "tid": -914061504, "ts": 1716454223010580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223156619, "dur": 40, "args": { "External id": 111763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111763, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111763, "pid": 5, "tid": 7, "ts": 1716454223156619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010595, "dur": 12, "args": { "External id": 111763, "cbid": 211, "correlation": 111763 } }, { "ph": "s", "id": 111763, "pid": 76337, "tid": -914061504, "ts": 1716454223010595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223156660, "dur": 3, "args": { "External id": 111765, "device": 5, "context": 1, "stream": 7, "correlation": 111765, "bytes": 46080, "memory bandwidth (GB/s)": 12.413793103448276 } }, { "ph": "f", "id": 111765, "pid": 5, "tid": 7, "ts": 1716454223156660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223010610, "dur": 17, "args": { "External id": 111765, "cbid": 51, "correlation": 111765 } }, { "ph": "s", "id": 111765, "pid": 76337, "tid": -914061504, "ts": 1716454223010610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223010632, "dur": 2, "args": { "External id": 111767, "cbid": 200, "correlation": 111767 } }, { "ph": "f", "id": 111767, "pid": 76337, "tid": -914061504, "ts": 1716454223010632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223010635, "dur": 0, "args": { "External id": 111768, "cbid": 200, "correlation": 111768 } }, { "ph": "f", "id": 111768, "pid": 76337, "tid": -914061504, "ts": 1716454223010635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223010635, "dur": 0, "args": { "External id": 111769, "cbid": 200, "correlation": 111769 } }, { "ph": "f", "id": 111769, "pid": 76337, "tid": -914061504, "ts": 1716454223010635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223010636, "dur": 0, "args": { "External id": 111770, "cbid": 200, "correlation": 111770 } }, { "ph": "f", "id": 111770, "pid": 76337, "tid": -914061504, "ts": 1716454223010636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454223010637, "dur": 4, "args": { "External id": 111771, "cbid": 15, "correlation": 111771 } }, { "ph": "f", "id": 111771, "pid": 76337, "tid": -914061504, "ts": 1716454223010637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223010641, "dur": 1, "args": { "External id": 111772, "cbid": 251, "correlation": 111772 } }, { "ph": "f", "id": 111772, "pid": 76337, "tid": -914061504, "ts": 1716454223010641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454223156665, "dur": 24, "args": { "External id": 111773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111773, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111773, "pid": 5, "tid": 7, "ts": 1716454223156665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010644, "dur": 8, "args": { "External id": 111773, "cbid": 211, "correlation": 111773 } }, { "ph": "s", "id": 111773, "pid": 76337, "tid": -914061504, "ts": 1716454223010644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223156690, "dur": 4, "args": { "External id": 111775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 111775, "pid": 5, "tid": 7, "ts": 1716454223156690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010658, "dur": 7, "args": { "External id": 111775, "cbid": 211, "correlation": 111775 } }, { "ph": "s", "id": 111775, "pid": 76337, "tid": -914061504, "ts": 1716454223010658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223010669, "dur": 0, "args": { "External id": 111776, "cbid": 51, "correlation": 111776 } }, { "ph": "s", "id": 111776, "pid": 76337, "tid": -914061504, "ts": 1716454223010669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223156696, "dur": 187, "args": { "External id": 111777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111777, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111777, "pid": 5, "tid": 7, "ts": 1716454223156696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010670, "dur": 202, "args": { "External id": 111777, "cbid": 211, "correlation": 111777 } }, { "ph": "s", "id": 111777, "pid": 76337, "tid": -914061504, "ts": 1716454223010670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223156884, "dur": 6, "args": { "External id": 111778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111778, "pid": 5, "tid": 7, "ts": 1716454223156884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010875, "dur": 6, "args": { "External id": 111778, "cbid": 211, "correlation": 111778 } }, { "ph": "s", "id": 111778, "pid": 76337, "tid": -914061504, "ts": 1716454223010875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223156891, "dur": 5, "args": { "External id": 111784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 111784, "pid": 5, "tid": 7, "ts": 1716454223156891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223010906, "dur": 9, "args": { "External id": 111784, "cbid": 211, "correlation": 111784 } }, { "ph": "s", "id": 111784, "pid": 76337, "tid": -914061504, "ts": 1716454223010906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156897, "dur": 3, "args": { "External id": 111792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111792, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111792, "pid": 5, "tid": 7, "ts": 1716454223156897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012585, "dur": 14, "args": { "External id": 111792, "cbid": 211, "correlation": 111792 } }, { "ph": "s", "id": 111792, "pid": 76337, "tid": -914061504, "ts": 1716454223012585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156901, "dur": 3, "args": { "External id": 111800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111800, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111800, "pid": 5, "tid": 7, "ts": 1716454223156901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012625, "dur": 10, "args": { "External id": 111800, "cbid": 211, "correlation": 111800 } }, { "ph": "s", "id": 111800, "pid": 76337, "tid": -914061504, "ts": 1716454223012625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156906, "dur": 3, "args": { "External id": 111808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111808, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111808, "pid": 5, "tid": 7, "ts": 1716454223156906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012651, "dur": 8, "args": { "External id": 111808, "cbid": 211, "correlation": 111808 } }, { "ph": "s", "id": 111808, "pid": 76337, "tid": -914061504, "ts": 1716454223012651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156910, "dur": 3, "args": { "External id": 111817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111817, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111817, "pid": 5, "tid": 7, "ts": 1716454223156910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012825, "dur": 14, "args": { "External id": 111817, "cbid": 211, "correlation": 111817 } }, { "ph": "s", "id": 111817, "pid": 76337, "tid": -914061504, "ts": 1716454223012825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156914, "dur": 3, "args": { "External id": 111826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111826, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111826, "pid": 5, "tid": 7, "ts": 1716454223156914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012856, "dur": 7, "args": { "External id": 111826, "cbid": 211, "correlation": 111826 } }, { "ph": "s", "id": 111826, "pid": 76337, "tid": -914061504, "ts": 1716454223012856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156919, "dur": 3, "args": { "External id": 111834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111834, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111834, "pid": 5, "tid": 7, "ts": 1716454223156919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223012881, "dur": 8, "args": { "External id": 111834, "cbid": 211, "correlation": 111834 } }, { "ph": "s", "id": 111834, "pid": 76337, "tid": -914061504, "ts": 1716454223012881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156923, "dur": 3, "args": { "External id": 111842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111842, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111842, "pid": 5, "tid": 7, "ts": 1716454223156923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223013152, "dur": 16, "args": { "External id": 111842, "cbid": 211, "correlation": 111842 } }, { "ph": "s", "id": 111842, "pid": 76337, "tid": -914061504, "ts": 1716454223013152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223156927, "dur": 3, "args": { "External id": 111850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111850, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 111850, "pid": 5, "tid": 7, "ts": 1716454223156927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223013184, "dur": 8, "args": { "External id": 111850, "cbid": 211, "correlation": 111850 } }, { "ph": "s", "id": 111850, "pid": 76337, "tid": -914061504, "ts": 1716454223013184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223156932, "dur": 1, "args": { "External id": 111860, "device": 5, "context": 1, "stream": 7, "correlation": 111860, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 111860, "pid": 5, "tid": 7, "ts": 1716454223156932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223013248, "dur": 35, "args": { "External id": 111860, "cbid": 41, "correlation": 111860 } }, { "ph": "s", "id": 111860, "pid": 76337, "tid": -914061504, "ts": 1716454223013248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223013284, "dur": 143664, "args": { "External id": 111861, "cbid": 131, "correlation": 111861 } }, { "ph": "f", "id": 111861, "pid": 76337, "tid": -914061504, "ts": 1716454223013284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223157100, "dur": 3, "args": { "External id": 111869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111869, "pid": 5, "tid": 7, "ts": 1716454223157100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157077, "dur": 24, "args": { "External id": 111869, "cbid": 211, "correlation": 111869 } }, { "ph": "s", "id": 111869, "pid": 76337, "tid": -914061504, "ts": 1716454223157077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157189, "dur": 3, "args": { "External id": 111878, "device": 5, "context": 1, "stream": 7, "correlation": 111878, "bytes": 8, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 111878, "pid": 5, "tid": 7, "ts": 1716454223157189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157161, "dur": 28, "args": { "External id": 111878, "cbid": 41, "correlation": 111878 } }, { "ph": "s", "id": 111878, "pid": 76337, "tid": -914061504, "ts": 1716454223157161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223157282, "dur": 4, "args": { "External id": 111888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111888, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 111888, "pid": 5, "tid": 7, "ts": 1716454223157282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157266, "dur": 17, "args": { "External id": 111888, "cbid": 211, "correlation": 111888 } }, { "ph": "s", "id": 111888, "pid": 76337, "tid": -914061504, "ts": 1716454223157266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157351, "dur": 1, "args": { "External id": 111898, "device": 5, "context": 1, "stream": 7, "correlation": 111898, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 111898, "pid": 5, "tid": 7, "ts": 1716454223157351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157333, "dur": 16, "args": { "External id": 111898, "cbid": 41, "correlation": 111898 } }, { "ph": "s", "id": 111898, "pid": 76337, "tid": -914061504, "ts": 1716454223157333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223157350, "dur": 8, "args": { "External id": 111899, "cbid": 131, "correlation": 111899 } }, { "ph": "f", "id": 111899, "pid": 76337, "tid": -914061504, "ts": 1716454223157350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157419, "dur": 3, "args": { "External id": 111906, "device": 5, "context": 1, "stream": 7, "correlation": 111906, "bytes": 98304, "memory bandwidth (GB/s)": 30.72 } }, { "ph": "f", "id": 111906, "pid": 5, "tid": 7, "ts": 1716454223157419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157397, "dur": 22, "args": { "External id": 111906, "cbid": 41, "correlation": 111906 } }, { "ph": "s", "id": 111906, "pid": 76337, "tid": -914061504, "ts": 1716454223157397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157508, "dur": 3, "args": { "External id": 111925, "device": 5, "context": 1, "stream": 7, "correlation": 111925, "bytes": 16, "memory bandwidth (GB/s)": 0.005320917858330562 } }, { "ph": "f", "id": 111925, "pid": 5, "tid": 7, "ts": 1716454223157508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157489, "dur": 17, "args": { "External id": 111925, "cbid": 41, "correlation": 111925 } }, { "ph": "s", "id": 111925, "pid": 76337, "tid": -914061504, "ts": 1716454223157489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223157547, "dur": 3, "args": { "External id": 111931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111931, "pid": 5, "tid": 7, "ts": 1716454223157547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157535, "dur": 12, "args": { "External id": 111931, "cbid": 211, "correlation": 111931 } }, { "ph": "s", "id": 111931, "pid": 76337, "tid": -914061504, "ts": 1716454223157535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454223157561, "dur": 6, "args": { "External id": 111933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111933, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 111933, "pid": 5, "tid": 7, "ts": 1716454223157561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157551, "dur": 9, "args": { "External id": 111933, "cbid": 211, "correlation": 111933 } }, { "ph": "s", "id": 111933, "pid": 76337, "tid": -914061504, "ts": 1716454223157551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454223157570, "dur": 3, "args": { "External id": 111935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111935, "pid": 5, "tid": 7, "ts": 1716454223157570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157562, "dur": 6, "args": { "External id": 111935, "cbid": 211, "correlation": 111935 } }, { "ph": "s", "id": 111935, "pid": 76337, "tid": -914061504, "ts": 1716454223157562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157605, "dur": 2, "args": { "External id": 111943, "device": 5, "context": 1, "stream": 7, "correlation": 111943, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 111943, "pid": 5, "tid": 7, "ts": 1716454223157605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157590, "dur": 14, "args": { "External id": 111943, "cbid": 41, "correlation": 111943 } }, { "ph": "s", "id": 111943, "pid": 76337, "tid": -914061504, "ts": 1716454223157590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223157653, "dur": 3, "args": { "External id": 111957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111957, "pid": 5, "tid": 7, "ts": 1716454223157653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157642, "dur": 12, "args": { "External id": 111957, "cbid": 211, "correlation": 111957 } }, { "ph": "s", "id": 111957, "pid": 76337, "tid": -914061504, "ts": 1716454223157642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223157673, "dur": 2, "args": { "External id": 111971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111971, "pid": 5, "tid": 7, "ts": 1716454223157673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157665, "dur": 7, "args": { "External id": 111971, "cbid": 211, "correlation": 111971 } }, { "ph": "s", "id": 111971, "pid": 76337, "tid": -914061504, "ts": 1716454223157665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223157707, "dur": 6, "args": { "External id": 111978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111978, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111978, "pid": 5, "tid": 7, "ts": 1716454223157707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157696, "dur": 11, "args": { "External id": 111978, "cbid": 211, "correlation": 111978 } }, { "ph": "s", "id": 111978, "pid": 76337, "tid": -914061504, "ts": 1716454223157696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223157718, "dur": 6, "args": { "External id": 111981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111981, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111981, "pid": 5, "tid": 7, "ts": 1716454223157718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157710, "dur": 7, "args": { "External id": 111981, "cbid": 211, "correlation": 111981 } }, { "ph": "s", "id": 111981, "pid": 76337, "tid": -914061504, "ts": 1716454223157710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454223157727, "dur": 3, "args": { "External id": 111983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 111983, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 111983, "pid": 5, "tid": 7, "ts": 1716454223157727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157720, "dur": 7, "args": { "External id": 111983, "cbid": 211, "correlation": 111983 } }, { "ph": "s", "id": 111983, "pid": 76337, "tid": -914061504, "ts": 1716454223157720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157747, "dur": 2, "args": { "External id": 111986, "device": 5, "context": 1, "stream": 7, "correlation": 111986, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 111986, "pid": 5, "tid": 7, "ts": 1716454223157747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157735, "dur": 12, "args": { "External id": 111986, "cbid": 41, "correlation": 111986 } }, { "ph": "s", "id": 111986, "pid": 76337, "tid": -914061504, "ts": 1716454223157735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223157799, "dur": 4, "args": { "External id": 112002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112002, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112002, "pid": 5, "tid": 7, "ts": 1716454223157799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157787, "dur": 13, "args": { "External id": 112002, "cbid": 211, "correlation": 112002 } }, { "ph": "s", "id": 112002, "pid": 76337, "tid": -914061504, "ts": 1716454223157787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223157821, "dur": 3, "args": { "External id": 112007, "device": 5, "context": 1, "stream": 7, "correlation": 112007, "bytes": 1, "memory bandwidth (GB/s)": 0.0003155569580309246 } }, { "ph": "f", "id": 112007, "pid": 5, "tid": 7, "ts": 1716454223157821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157805, "dur": 15, "args": { "External id": 112007, "cbid": 41, "correlation": 112007 } }, { "ph": "s", "id": 112007, "pid": 76337, "tid": -914061504, "ts": 1716454223157805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223157848, "dur": 1, "args": { "External id": 112013, "device": 5, "context": 1, "stream": 7, "correlation": 112013, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 112013, "pid": 5, "tid": 7, "ts": 1716454223157848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223157830, "dur": 27, "args": { "External id": 112013, "cbid": 41, "correlation": 112013 } }, { "ph": "s", "id": 112013, "pid": 76337, "tid": -914061504, "ts": 1716454223157830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223157858, "dur": 4, "args": { "External id": 112014, "cbid": 131, "correlation": 112014 } }, { "ph": "f", "id": 112014, "pid": 76337, "tid": -914061504, "ts": 1716454223157858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223157908, "dur": 3, "args": { "External id": 112022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112022, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112022, "pid": 5, "tid": 7, "ts": 1716454223157908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157896, "dur": 12, "args": { "External id": 112022, "cbid": 211, "correlation": 112022 } }, { "ph": "s", "id": 112022, "pid": 76337, "tid": -914061504, "ts": 1716454223157896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223157938, "dur": 3, "args": { "External id": 112032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112032, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112032, "pid": 5, "tid": 7, "ts": 1716454223157938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157929, "dur": 8, "args": { "External id": 112032, "cbid": 211, "correlation": 112032 } }, { "ph": "s", "id": 112032, "pid": 76337, "tid": -914061504, "ts": 1716454223157929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223157964, "dur": 3, "args": { "External id": 112041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112041, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112041, "pid": 5, "tid": 7, "ts": 1716454223157964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223157953, "dur": 11, "args": { "External id": 112041, "cbid": 211, "correlation": 112041 } }, { "ph": "s", "id": 112041, "pid": 76337, "tid": -914061504, "ts": 1716454223157953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223158086, "dur": 12, "args": { "External id": 112051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112051, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112051, "pid": 5, "tid": 7, "ts": 1716454223158086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158072, "dur": 15, "args": { "External id": 112051, "cbid": 211, "correlation": 112051 } }, { "ph": "s", "id": 112051, "pid": 76337, "tid": -914061504, "ts": 1716454223158072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223158128, "dur": 3, "args": { "External id": 112059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112059, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112059, "pid": 5, "tid": 7, "ts": 1716454223158128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158119, "dur": 8, "args": { "External id": 112059, "cbid": 211, "correlation": 112059 } }, { "ph": "s", "id": 112059, "pid": 76337, "tid": -914061504, "ts": 1716454223158119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223158172, "dur": 11, "args": { "External id": 112069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112069, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112069, "pid": 5, "tid": 7, "ts": 1716454223158172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158161, "dur": 11, "args": { "External id": 112069, "cbid": 211, "correlation": 112069 } }, { "ph": "s", "id": 112069, "pid": 76337, "tid": -914061504, "ts": 1716454223158161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223158202, "dur": 10, "args": { "External id": 112077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112077, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112077, "pid": 5, "tid": 7, "ts": 1716454223158202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158192, "dur": 9, "args": { "External id": 112077, "cbid": 211, "correlation": 112077 } }, { "ph": "s", "id": 112077, "pid": 76337, "tid": -914061504, "ts": 1716454223158192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223158229, "dur": 3, "args": { "External id": 112086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112086, "pid": 5, "tid": 7, "ts": 1716454223158229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158220, "dur": 9, "args": { "External id": 112086, "cbid": 211, "correlation": 112086 } }, { "ph": "s", "id": 112086, "pid": 76337, "tid": -914061504, "ts": 1716454223158220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223158253, "dur": 5, "args": { "External id": 112095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112095, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112095, "pid": 5, "tid": 7, "ts": 1716454223158253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158244, "dur": 8, "args": { "External id": 112095, "cbid": 211, "correlation": 112095 } }, { "ph": "s", "id": 112095, "pid": 76337, "tid": -914061504, "ts": 1716454223158244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223158291, "dur": 8, "args": { "External id": 112105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112105, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112105, "pid": 5, "tid": 7, "ts": 1716454223158291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158281, "dur": 10, "args": { "External id": 112105, "cbid": 211, "correlation": 112105 } }, { "ph": "s", "id": 112105, "pid": 76337, "tid": -914061504, "ts": 1716454223158281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223158606, "dur": 3, "args": { "External id": 112114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112114, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112114, "pid": 5, "tid": 7, "ts": 1716454223158606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158591, "dur": 16, "args": { "External id": 112114, "cbid": 211, "correlation": 112114 } }, { "ph": "s", "id": 112114, "pid": 76337, "tid": -914061504, "ts": 1716454223158591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223158635, "dur": 3, "args": { "External id": 112122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112122, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112122, "pid": 5, "tid": 7, "ts": 1716454223158635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158625, "dur": 10, "args": { "External id": 112122, "cbid": 211, "correlation": 112122 } }, { "ph": "s", "id": 112122, "pid": 76337, "tid": -914061504, "ts": 1716454223158625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223158686, "dur": 1, "args": { "External id": 112132, "device": 5, "context": 1, "stream": 7, "correlation": 112132, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 112132, "pid": 5, "tid": 7, "ts": 1716454223158686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223158671, "dur": 13, "args": { "External id": 112132, "cbid": 41, "correlation": 112132 } }, { "ph": "s", "id": 112132, "pid": 76337, "tid": -914061504, "ts": 1716454223158671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223158685, "dur": 8, "args": { "External id": 112133, "cbid": 131, "correlation": 112133 } }, { "ph": "f", "id": 112133, "pid": 76337, "tid": -914061504, "ts": 1716454223158685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223158775, "dur": 2, "args": { "External id": 112141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112141, "pid": 5, "tid": 7, "ts": 1716454223158775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158761, "dur": 14, "args": { "External id": 112141, "cbid": 211, "correlation": 112141 } }, { "ph": "s", "id": 112141, "pid": 76337, "tid": -914061504, "ts": 1716454223158761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223158847, "dur": 3, "args": { "External id": 112150, "device": 5, "context": 1, "stream": 7, "correlation": 112150, "bytes": 8, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 112150, "pid": 5, "tid": 7, "ts": 1716454223158847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223158830, "dur": 17, "args": { "External id": 112150, "cbid": 41, "correlation": 112150 } }, { "ph": "s", "id": 112150, "pid": 76337, "tid": -914061504, "ts": 1716454223158830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223158918, "dur": 3, "args": { "External id": 112160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112160, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112160, "pid": 5, "tid": 7, "ts": 1716454223158918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223158904, "dur": 15, "args": { "External id": 112160, "cbid": 211, "correlation": 112160 } }, { "ph": "s", "id": 112160, "pid": 76337, "tid": -914061504, "ts": 1716454223158904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223158970, "dur": 1, "args": { "External id": 112170, "device": 5, "context": 1, "stream": 7, "correlation": 112170, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 112170, "pid": 5, "tid": 7, "ts": 1716454223158970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223158956, "dur": 12, "args": { "External id": 112170, "cbid": 41, "correlation": 112170 } }, { "ph": "s", "id": 112170, "pid": 76337, "tid": -914061504, "ts": 1716454223158956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223158969, "dur": 14, "args": { "External id": 112171, "cbid": 131, "correlation": 112171 } }, { "ph": "f", "id": 112171, "pid": 76337, "tid": -914061504, "ts": 1716454223158969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223159038, "dur": 3, "args": { "External id": 112178, "device": 5, "context": 1, "stream": 7, "correlation": 112178, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 112178, "pid": 5, "tid": 7, "ts": 1716454223159038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159019, "dur": 19, "args": { "External id": 112178, "cbid": 41, "correlation": 112178 } }, { "ph": "s", "id": 112178, "pid": 76337, "tid": -914061504, "ts": 1716454223159019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223159087, "dur": 1, "args": { "External id": 112189, "device": 5, "context": 1, "stream": 7, "correlation": 112189, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 112189, "pid": 5, "tid": 7, "ts": 1716454223159087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159074, "dur": 10, "args": { "External id": 112189, "cbid": 41, "correlation": 112189 } }, { "ph": "s", "id": 112189, "pid": 76337, "tid": -914061504, "ts": 1716454223159074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159086, "dur": 8, "args": { "External id": 112190, "cbid": 131, "correlation": 112190 } }, { "ph": "f", "id": 112190, "pid": 76337, "tid": -914061504, "ts": 1716454223159086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159136, "dur": 3, "args": { "External id": 112198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112198, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112198, "pid": 5, "tid": 7, "ts": 1716454223159136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159123, "dur": 12, "args": { "External id": 112198, "cbid": 211, "correlation": 112198 } }, { "ph": "s", "id": 112198, "pid": 76337, "tid": -914061504, "ts": 1716454223159123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159164, "dur": 3, "args": { "External id": 112208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112208, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112208, "pid": 5, "tid": 7, "ts": 1716454223159164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159155, "dur": 8, "args": { "External id": 112208, "cbid": 211, "correlation": 112208 } }, { "ph": "s", "id": 112208, "pid": 76337, "tid": -914061504, "ts": 1716454223159155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159186, "dur": 3, "args": { "External id": 112217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112217, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112217, "pid": 5, "tid": 7, "ts": 1716454223159186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159178, "dur": 7, "args": { "External id": 112217, "cbid": 211, "correlation": 112217 } }, { "ph": "s", "id": 112217, "pid": 76337, "tid": -914061504, "ts": 1716454223159178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223159254, "dur": 5, "args": { "External id": 112225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112225, "pid": 5, "tid": 7, "ts": 1716454223159254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159241, "dur": 14, "args": { "External id": 112225, "cbid": 211, "correlation": 112225 } }, { "ph": "s", "id": 112225, "pid": 76337, "tid": -914061504, "ts": 1716454223159241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159294, "dur": 3, "args": { "External id": 112234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112234, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112234, "pid": 5, "tid": 7, "ts": 1716454223159294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159284, "dur": 9, "args": { "External id": 112234, "cbid": 211, "correlation": 112234 } }, { "ph": "s", "id": 112234, "pid": 76337, "tid": -914061504, "ts": 1716454223159284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159318, "dur": 3, "args": { "External id": 112243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112243, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112243, "pid": 5, "tid": 7, "ts": 1716454223159318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159309, "dur": 7, "args": { "External id": 112243, "cbid": 211, "correlation": 112243 } }, { "ph": "s", "id": 112243, "pid": 76337, "tid": -914061504, "ts": 1716454223159309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159379, "dur": 3, "args": { "External id": 112251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112251, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112251, "pid": 5, "tid": 7, "ts": 1716454223159379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159369, "dur": 10, "args": { "External id": 112251, "cbid": 211, "correlation": 112251 } }, { "ph": "s", "id": 112251, "pid": 76337, "tid": -914061504, "ts": 1716454223159369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223159438, "dur": 1, "args": { "External id": 112259, "device": 5, "context": 1, "stream": 7, "correlation": 112259, "bytes": 8, "memory bandwidth (GB/s)": 0.0043859649122807015 } }, { "ph": "f", "id": 112259, "pid": 5, "tid": 7, "ts": 1716454223159438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159422, "dur": 25, "args": { "External id": 112259, "cbid": 41, "correlation": 112259 } }, { "ph": "s", "id": 112259, "pid": 76337, "tid": -914061504, "ts": 1716454223159422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159448, "dur": 4, "args": { "External id": 112260, "cbid": 131, "correlation": 112260 } }, { "ph": "f", "id": 112260, "pid": 76337, "tid": -914061504, "ts": 1716454223159448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223159510, "dur": 1, "args": { "External id": 112270, "device": 5, "context": 1, "stream": 7, "correlation": 112270, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 112270, "pid": 5, "tid": 7, "ts": 1716454223159510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159497, "dur": 11, "args": { "External id": 112270, "cbid": 41, "correlation": 112270 } }, { "ph": "s", "id": 112270, "pid": 76337, "tid": -914061504, "ts": 1716454223159497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159509, "dur": 8, "args": { "External id": 112271, "cbid": 131, "correlation": 112271 } }, { "ph": "f", "id": 112271, "pid": 76337, "tid": -914061504, "ts": 1716454223159509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223159567, "dur": 1, "args": { "External id": 112280, "device": 5, "context": 1, "stream": 7, "correlation": 112280, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 112280, "pid": 5, "tid": 7, "ts": 1716454223159567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159556, "dur": 9, "args": { "External id": 112280, "cbid": 41, "correlation": 112280 } }, { "ph": "s", "id": 112280, "pid": 76337, "tid": -914061504, "ts": 1716454223159556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159565, "dur": 8, "args": { "External id": 112281, "cbid": 131, "correlation": 112281 } }, { "ph": "f", "id": 112281, "pid": 76337, "tid": -914061504, "ts": 1716454223159565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223159640, "dur": 4, "args": { "External id": 112288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112288, "pid": 5, "tid": 7, "ts": 1716454223159640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159623, "dur": 17, "args": { "External id": 112288, "cbid": 211, "correlation": 112288 } }, { "ph": "s", "id": 112288, "pid": 76337, "tid": -914061504, "ts": 1716454223159623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454223159676, "dur": 4, "args": { "External id": 112308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112308, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112308, "pid": 5, "tid": 7, "ts": 1716454223159676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159665, "dur": 11, "args": { "External id": 112308, "cbid": 211, "correlation": 112308 } }, { "ph": "s", "id": 112308, "pid": 76337, "tid": -914061504, "ts": 1716454223159665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159677, "dur": 0, "args": { "External id": 112309, "cbid": 11, "correlation": 112309 } }, { "ph": "f", "id": 112309, "pid": 76337, "tid": -914061504, "ts": 1716454223159677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159678, "dur": 0, "args": { "External id": 112310, "cbid": 11, "correlation": 112310 } }, { "ph": "f", "id": 112310, "pid": 76337, "tid": -914061504, "ts": 1716454223159678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223159691, "dur": 1, "args": { "External id": 112313, "device": 5, "context": 1, "stream": 7, "correlation": 112313, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 112313, "pid": 5, "tid": 7, "ts": 1716454223159691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159679, "dur": 21, "args": { "External id": 112313, "cbid": 41, "correlation": 112313 } }, { "ph": "s", "id": 112313, "pid": 76337, "tid": -914061504, "ts": 1716454223159679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159700, "dur": 3, "args": { "External id": 112314, "cbid": 131, "correlation": 112314 } }, { "ph": "f", "id": 112314, "pid": 76337, "tid": -914061504, "ts": 1716454223159700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223159728, "dur": 3, "args": { "External id": 112338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112338, "pid": 5, "tid": 7, "ts": 1716454223159728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159719, "dur": 9, "args": { "External id": 112338, "cbid": 211, "correlation": 112338 } }, { "ph": "s", "id": 112338, "pid": 76337, "tid": -914061504, "ts": 1716454223159719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159728, "dur": 0, "args": { "External id": 112339, "cbid": 11, "correlation": 112339 } }, { "ph": "f", "id": 112339, "pid": 76337, "tid": -914061504, "ts": 1716454223159728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159729, "dur": 0, "args": { "External id": 112340, "cbid": 11, "correlation": 112340 } }, { "ph": "f", "id": 112340, "pid": 76337, "tid": -914061504, "ts": 1716454223159729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223159730, "dur": 1, "args": { "External id": 112342, "cbid": 200, "correlation": 112342 } }, { "ph": "f", "id": 112342, "pid": 76337, "tid": -914061504, "ts": 1716454223159730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454223159741, "dur": 4, "args": { "External id": 112344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112344, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112344, "pid": 5, "tid": 7, "ts": 1716454223159741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159733, "dur": 8, "args": { "External id": 112344, "cbid": 211, "correlation": 112344 } }, { "ph": "s", "id": 112344, "pid": 76337, "tid": -914061504, "ts": 1716454223159733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159741, "dur": 0, "args": { "External id": 112345, "cbid": 11, "correlation": 112345 } }, { "ph": "f", "id": 112345, "pid": 76337, "tid": -914061504, "ts": 1716454223159741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223159742, "dur": 0, "args": { "External id": 112346, "cbid": 11, "correlation": 112346 } }, { "ph": "f", "id": 112346, "pid": 76337, "tid": -914061504, "ts": 1716454223159742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223159780, "dur": 1, "args": { "External id": 112353, "device": 5, "context": 1, "stream": 7, "correlation": 112353, "bytes": 8, "memory bandwidth (GB/s)": 0.004901960784313725 } }, { "ph": "f", "id": 112353, "pid": 5, "tid": 7, "ts": 1716454223159780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159768, "dur": 20, "args": { "External id": 112353, "cbid": 41, "correlation": 112353 } }, { "ph": "s", "id": 112353, "pid": 76337, "tid": -914061504, "ts": 1716454223159768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159789, "dur": 3, "args": { "External id": 112354, "cbid": 131, "correlation": 112354 } }, { "ph": "f", "id": 112354, "pid": 76337, "tid": -914061504, "ts": 1716454223159789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223159840, "dur": 1, "args": { "External id": 112364, "device": 5, "context": 1, "stream": 7, "correlation": 112364, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 112364, "pid": 5, "tid": 7, "ts": 1716454223159840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223159827, "dur": 11, "args": { "External id": 112364, "cbid": 41, "correlation": 112364 } }, { "ph": "s", "id": 112364, "pid": 76337, "tid": -914061504, "ts": 1716454223159827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223159839, "dur": 8, "args": { "External id": 112365, "cbid": 131, "correlation": 112365 } }, { "ph": "f", "id": 112365, "pid": 76337, "tid": -914061504, "ts": 1716454223159839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223159909, "dur": 5, "args": { "External id": 112372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112372, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112372, "pid": 5, "tid": 7, "ts": 1716454223159909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159894, "dur": 15, "args": { "External id": 112372, "cbid": 211, "correlation": 112372 } }, { "ph": "s", "id": 112372, "pid": 76337, "tid": -914061504, "ts": 1716454223159894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223159986, "dur": 3, "args": { "External id": 112381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112381, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112381, "pid": 5, "tid": 7, "ts": 1716454223159986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223159966, "dur": 21, "args": { "External id": 112381, "cbid": 211, "correlation": 112381 } }, { "ph": "s", "id": 112381, "pid": 76337, "tid": -914061504, "ts": 1716454223159966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160025, "dur": 3, "args": { "External id": 112389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112389, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112389, "pid": 5, "tid": 7, "ts": 1716454223160025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160014, "dur": 10, "args": { "External id": 112389, "cbid": 211, "correlation": 112389 } }, { "ph": "s", "id": 112389, "pid": 76337, "tid": -914061504, "ts": 1716454223160014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160059, "dur": 4, "args": { "External id": 112397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112397, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112397, "pid": 5, "tid": 7, "ts": 1716454223160059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160047, "dur": 12, "args": { "External id": 112397, "cbid": 211, "correlation": 112397 } }, { "ph": "s", "id": 112397, "pid": 76337, "tid": -914061504, "ts": 1716454223160047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160090, "dur": 4, "args": { "External id": 112405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112405, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112405, "pid": 5, "tid": 7, "ts": 1716454223160090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160078, "dur": 11, "args": { "External id": 112405, "cbid": 211, "correlation": 112405 } }, { "ph": "s", "id": 112405, "pid": 76337, "tid": -914061504, "ts": 1716454223160078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160116, "dur": 3, "args": { "External id": 112413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112413, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112413, "pid": 5, "tid": 7, "ts": 1716454223160116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160107, "dur": 8, "args": { "External id": 112413, "cbid": 211, "correlation": 112413 } }, { "ph": "s", "id": 112413, "pid": 76337, "tid": -914061504, "ts": 1716454223160107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160142, "dur": 3, "args": { "External id": 112421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112421, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112421, "pid": 5, "tid": 7, "ts": 1716454223160142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160132, "dur": 9, "args": { "External id": 112421, "cbid": 211, "correlation": 112421 } }, { "ph": "s", "id": 112421, "pid": 76337, "tid": -914061504, "ts": 1716454223160132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223160163, "dur": 4, "args": { "External id": 112429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112429, "pid": 5, "tid": 7, "ts": 1716454223160163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160155, "dur": 7, "args": { "External id": 112429, "cbid": 211, "correlation": 112429 } }, { "ph": "s", "id": 112429, "pid": 76337, "tid": -914061504, "ts": 1716454223160155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223160181, "dur": 4, "args": { "External id": 112437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112437, "pid": 5, "tid": 7, "ts": 1716454223160181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160173, "dur": 6, "args": { "External id": 112437, "cbid": 211, "correlation": 112437 } }, { "ph": "s", "id": 112437, "pid": 76337, "tid": -914061504, "ts": 1716454223160173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160201, "dur": 3, "args": { "External id": 112445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112445, "pid": 5, "tid": 7, "ts": 1716454223160201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160193, "dur": 7, "args": { "External id": 112445, "cbid": 211, "correlation": 112445 } }, { "ph": "s", "id": 112445, "pid": 76337, "tid": -914061504, "ts": 1716454223160193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160258, "dur": 3, "args": { "External id": 112453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112453, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112453, "pid": 5, "tid": 7, "ts": 1716454223160258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160247, "dur": 11, "args": { "External id": 112453, "cbid": 211, "correlation": 112453 } }, { "ph": "s", "id": 112453, "pid": 76337, "tid": -914061504, "ts": 1716454223160247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223160284, "dur": 4, "args": { "External id": 112461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112461, "pid": 5, "tid": 7, "ts": 1716454223160284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160274, "dur": 9, "args": { "External id": 112461, "cbid": 211, "correlation": 112461 } }, { "ph": "s", "id": 112461, "pid": 76337, "tid": -914061504, "ts": 1716454223160274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223160307, "dur": 4, "args": { "External id": 112469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112469, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112469, "pid": 5, "tid": 7, "ts": 1716454223160307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160299, "dur": 7, "args": { "External id": 112469, "cbid": 211, "correlation": 112469 } }, { "ph": "s", "id": 112469, "pid": 76337, "tid": -914061504, "ts": 1716454223160299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160325, "dur": 3, "args": { "External id": 112477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112477, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 112477, "pid": 5, "tid": 7, "ts": 1716454223160325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160318, "dur": 6, "args": { "External id": 112477, "cbid": 211, "correlation": 112477 } }, { "ph": "s", "id": 112477, "pid": 76337, "tid": -914061504, "ts": 1716454223160318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223160732, "dur": 5, "args": { "External id": 112486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112486, "pid": 5, "tid": 7, "ts": 1716454223160732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160715, "dur": 17, "args": { "External id": 112486, "cbid": 211, "correlation": 112486 } }, { "ph": "s", "id": 112486, "pid": 76337, "tid": -914061504, "ts": 1716454223160715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223160768, "dur": 5, "args": { "External id": 112495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112495, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112495, "pid": 5, "tid": 7, "ts": 1716454223160768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160758, "dur": 9, "args": { "External id": 112495, "cbid": 211, "correlation": 112495 } }, { "ph": "s", "id": 112495, "pid": 76337, "tid": -914061504, "ts": 1716454223160758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223160896, "dur": 3, "args": { "External id": 112511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112511, "pid": 5, "tid": 7, "ts": 1716454223160896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160882, "dur": 14, "args": { "External id": 112511, "cbid": 211, "correlation": 112511 } }, { "ph": "s", "id": 112511, "pid": 76337, "tid": -914061504, "ts": 1716454223160882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160930, "dur": 3, "args": { "External id": 112519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112519, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112519, "pid": 5, "tid": 7, "ts": 1716454223160930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160919, "dur": 10, "args": { "External id": 112519, "cbid": 211, "correlation": 112519 } }, { "ph": "s", "id": 112519, "pid": 76337, "tid": -914061504, "ts": 1716454223160919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223160961, "dur": 3, "args": { "External id": 112527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112527, "pid": 5, "tid": 7, "ts": 1716454223160961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160952, "dur": 8, "args": { "External id": 112527, "cbid": 211, "correlation": 112527 } }, { "ph": "s", "id": 112527, "pid": 76337, "tid": -914061504, "ts": 1716454223160952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223161003, "dur": 4, "args": { "External id": 112535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112535, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112535, "pid": 5, "tid": 7, "ts": 1716454223161003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223160993, "dur": 10, "args": { "External id": 112535, "cbid": 211, "correlation": 112535 } }, { "ph": "s", "id": 112535, "pid": 76337, "tid": -914061504, "ts": 1716454223160993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223161059, "dur": 4, "args": { "External id": 112547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112547, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112547, "pid": 5, "tid": 7, "ts": 1716454223161059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161046, "dur": 13, "args": { "External id": 112547, "cbid": 211, "correlation": 112547 } }, { "ph": "s", "id": 112547, "pid": 76337, "tid": -914061504, "ts": 1716454223161046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223161105, "dur": 4, "args": { "External id": 112558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112558, "pid": 5, "tid": 7, "ts": 1716454223161105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161093, "dur": 11, "args": { "External id": 112558, "cbid": 211, "correlation": 112558 } }, { "ph": "s", "id": 112558, "pid": 76337, "tid": -914061504, "ts": 1716454223161093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223161136, "dur": 3, "args": { "External id": 112566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112566, "pid": 5, "tid": 7, "ts": 1716454223161136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161127, "dur": 8, "args": { "External id": 112566, "cbid": 211, "correlation": 112566 } }, { "ph": "s", "id": 112566, "pid": 76337, "tid": -914061504, "ts": 1716454223161127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223161169, "dur": 5, "args": { "External id": 112574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112574, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112574, "pid": 5, "tid": 7, "ts": 1716454223161169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161158, "dur": 11, "args": { "External id": 112574, "cbid": 211, "correlation": 112574 } }, { "ph": "s", "id": 112574, "pid": 76337, "tid": -914061504, "ts": 1716454223161158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223161199, "dur": 5, "args": { "External id": 112582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112582, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112582, "pid": 5, "tid": 7, "ts": 1716454223161199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161189, "dur": 9, "args": { "External id": 112582, "cbid": 211, "correlation": 112582 } }, { "ph": "s", "id": 112582, "pid": 76337, "tid": -914061504, "ts": 1716454223161189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223161231, "dur": 4, "args": { "External id": 112591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112591, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112591, "pid": 5, "tid": 7, "ts": 1716454223161231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161220, "dur": 10, "args": { "External id": 112591, "cbid": 211, "correlation": 112591 } }, { "ph": "s", "id": 112591, "pid": 76337, "tid": -914061504, "ts": 1716454223161220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223161291, "dur": 5, "args": { "External id": 112604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112604, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112604, "pid": 5, "tid": 7, "ts": 1716454223161291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161279, "dur": 13, "args": { "External id": 112604, "cbid": 211, "correlation": 112604 } }, { "ph": "s", "id": 112604, "pid": 76337, "tid": -914061504, "ts": 1716454223161279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223161332, "dur": 5, "args": { "External id": 112614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112614, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 112614, "pid": 5, "tid": 7, "ts": 1716454223161332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161321, "dur": 10, "args": { "External id": 112614, "cbid": 211, "correlation": 112614 } }, { "ph": "s", "id": 112614, "pid": 76337, "tid": -914061504, "ts": 1716454223161321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223161460, "dur": 4, "args": { "External id": 112631, "cbid": 251, "correlation": 112631 } }, { "ph": "f", "id": 112631, "pid": 76337, "tid": -914061504, "ts": 1716454223161460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454223161489, "dur": 12, "args": { "External id": 112633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112633, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112633, "pid": 5, "tid": 7, "ts": 1716454223161489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161472, "dur": 17, "args": { "External id": 112633, "cbid": 211, "correlation": 112633 } }, { "ph": "s", "id": 112633, "pid": 76337, "tid": -914061504, "ts": 1716454223161472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223161550, "dur": 4, "args": { "External id": 112641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112641, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112641, "pid": 5, "tid": 7, "ts": 1716454223161550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161538, "dur": 12, "args": { "External id": 112641, "cbid": 211, "correlation": 112641 } }, { "ph": "s", "id": 112641, "pid": 76337, "tid": -914061504, "ts": 1716454223161538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223161609, "dur": 2, "args": { "External id": 112657, "cbid": 251, "correlation": 112657 } }, { "ph": "f", "id": 112657, "pid": 76337, "tid": -914061504, "ts": 1716454223161609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223161615, "dur": 0, "args": { "External id": 112659, "cbid": 251, "correlation": 112659 } }, { "ph": "f", "id": 112659, "pid": 76337, "tid": -914061504, "ts": 1716454223161615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223161631, "dur": 13, "args": { "External id": 112660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112660, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112660, "pid": 5, "tid": 7, "ts": 1716454223161631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161617, "dur": 13, "args": { "External id": 112660, "cbid": 211, "correlation": 112660 } }, { "ph": "s", "id": 112660, "pid": 76337, "tid": -914061504, "ts": 1716454223161617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223161645, "dur": 5, "args": { "External id": 112662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112662, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112662, "pid": 5, "tid": 7, "ts": 1716454223161645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161635, "dur": 9, "args": { "External id": 112662, "cbid": 211, "correlation": 112662 } }, { "ph": "s", "id": 112662, "pid": 76337, "tid": -914061504, "ts": 1716454223161635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223161742, "dur": 1, "args": { "External id": 112672, "cbid": 317, "correlation": 112672 } }, { "ph": "f", "id": 112672, "pid": 76337, "tid": -914061504, "ts": 1716454223161742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223161744, "dur": 1, "args": { "External id": 112673, "cbid": 203, "correlation": 112673 } }, { "ph": "f", "id": 112673, "pid": 76337, "tid": -914061504, "ts": 1716454223161744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223161746, "dur": 1, "args": { "External id": 112674, "cbid": 205, "correlation": 112674 } }, { "ph": "f", "id": 112674, "pid": 76337, "tid": -914061504, "ts": 1716454223161746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223161802, "dur": 7, "args": { "External id": 112678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112678, "pid": 5, "tid": 7, "ts": 1716454223161802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161787, "dur": 14, "args": { "External id": 112678, "cbid": 211, "correlation": 112678 } }, { "ph": "s", "id": 112678, "pid": 76337, "tid": -914061504, "ts": 1716454223161787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223161812, "dur": 4, "args": { "External id": 112680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112680, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 112680, "pid": 5, "tid": 7, "ts": 1716454223161812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161805, "dur": 6, "args": { "External id": 112680, "cbid": 211, "correlation": 112680 } }, { "ph": "s", "id": 112680, "pid": 76337, "tid": -914061504, "ts": 1716454223161805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223161832, "dur": 3, "args": { "External id": 112682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112682, "pid": 5, "tid": 7, "ts": 1716454223161832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161822, "dur": 8, "args": { "External id": 112682, "cbid": 211, "correlation": 112682 } }, { "ph": "s", "id": 112682, "pid": 76337, "tid": -914061504, "ts": 1716454223161822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223161837, "dur": 1, "args": { "External id": 112683, "cbid": 51, "correlation": 112683 } }, { "ph": "s", "id": 112683, "pid": 76337, "tid": -914061504, "ts": 1716454223161837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223161848, "dur": 85, "args": { "External id": 112684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112684, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112684, "pid": 5, "tid": 7, "ts": 1716454223161848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161839, "dur": 8, "args": { "External id": 112684, "cbid": 211, "correlation": 112684 } }, { "ph": "s", "id": 112684, "pid": 76337, "tid": -914061504, "ts": 1716454223161839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223161934, "dur": 60, "args": { "External id": 112689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112689, "pid": 5, "tid": 7, "ts": 1716454223161934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223161875, "dur": 11, "args": { "External id": 112689, "cbid": 211, "correlation": 112689 } }, { "ph": "s", "id": 112689, "pid": 76337, "tid": -914061504, "ts": 1716454223161875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223163670, "dur": 52, "args": { "External id": 112709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112709, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 112709, "pid": 5, "tid": 7, "ts": 1716454223163670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163654, "dur": 15, "args": { "External id": 112709, "cbid": 211, "correlation": 112709 } }, { "ph": "s", "id": 112709, "pid": 76337, "tid": -914061504, "ts": 1716454223163654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223163723, "dur": 4, "args": { "External id": 112721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112721, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112721, "pid": 5, "tid": 7, "ts": 1716454223163723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163682, "dur": 8, "args": { "External id": 112721, "cbid": 211, "correlation": 112721 } }, { "ph": "s", "id": 112721, "pid": 76337, "tid": -914061504, "ts": 1716454223163682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223163729, "dur": 57, "args": { "External id": 112724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112724, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112724, "pid": 5, "tid": 7, "ts": 1716454223163729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163706, "dur": 8, "args": { "External id": 112724, "cbid": 211, "correlation": 112724 } }, { "ph": "s", "id": 112724, "pid": 76337, "tid": -914061504, "ts": 1716454223163706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223163787, "dur": 37, "args": { "External id": 112733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112733, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112733, "pid": 5, "tid": 7, "ts": 1716454223163787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163751, "dur": 11, "args": { "External id": 112733, "cbid": 211, "correlation": 112733 } }, { "ph": "s", "id": 112733, "pid": 76337, "tid": -914061504, "ts": 1716454223163751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223163808, "dur": 0, "args": { "External id": 112743, "cbid": 317, "correlation": 112743 } }, { "ph": "f", "id": 112743, "pid": 76337, "tid": -914061504, "ts": 1716454223163808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223163809, "dur": 0, "args": { "External id": 112744, "cbid": 203, "correlation": 112744 } }, { "ph": "f", "id": 112744, "pid": 76337, "tid": -914061504, "ts": 1716454223163809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223163809, "dur": 0, "args": { "External id": 112745, "cbid": 205, "correlation": 112745 } }, { "ph": "f", "id": 112745, "pid": 76337, "tid": -914061504, "ts": 1716454223163809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223163842, "dur": 40, "args": { "External id": 112749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112749, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112749, "pid": 5, "tid": 7, "ts": 1716454223163842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163828, "dur": 13, "args": { "External id": 112749, "cbid": 211, "correlation": 112749 } }, { "ph": "s", "id": 112749, "pid": 76337, "tid": -914061504, "ts": 1716454223163828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223163883, "dur": 14, "args": { "External id": 112751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112751, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112751, "pid": 5, "tid": 7, "ts": 1716454223163883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163844, "dur": 6, "args": { "External id": 112751, "cbid": 211, "correlation": 112751 } }, { "ph": "s", "id": 112751, "pid": 76337, "tid": -914061504, "ts": 1716454223163844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223163899, "dur": 3, "args": { "External id": 112753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112753, "pid": 5, "tid": 7, "ts": 1716454223163899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163854, "dur": 6, "args": { "External id": 112753, "cbid": 211, "correlation": 112753 } }, { "ph": "s", "id": 112753, "pid": 76337, "tid": -914061504, "ts": 1716454223163854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223163864, "dur": 0, "args": { "External id": 112754, "cbid": 51, "correlation": 112754 } }, { "ph": "s", "id": 112754, "pid": 76337, "tid": -914061504, "ts": 1716454223163864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223163903, "dur": 699, "args": { "External id": 112755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112755, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112755, "pid": 5, "tid": 7, "ts": 1716454223163903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163865, "dur": 7, "args": { "External id": 112755, "cbid": 211, "correlation": 112755 } }, { "ph": "s", "id": 112755, "pid": 76337, "tid": -914061504, "ts": 1716454223163865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223164603, "dur": 59, "args": { "External id": 112760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112760, "pid": 5, "tid": 7, "ts": 1716454223164603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163896, "dur": 9, "args": { "External id": 112760, "cbid": 211, "correlation": 112760 } }, { "ph": "s", "id": 112760, "pid": 76337, "tid": -914061504, "ts": 1716454223163896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223164663, "dur": 5, "args": { "External id": 112768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112768, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112768, "pid": 5, "tid": 7, "ts": 1716454223164663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223163939, "dur": 9, "args": { "External id": 112768, "cbid": 211, "correlation": 112768 } }, { "ph": "s", "id": 112768, "pid": 76337, "tid": -914061504, "ts": 1716454223163939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164011, "dur": 2, "args": { "External id": 112784, "cbid": 251, "correlation": 112784 } }, { "ph": "f", "id": 112784, "pid": 76337, "tid": -914061504, "ts": 1716454223164011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164017, "dur": 0, "args": { "External id": 112786, "cbid": 251, "correlation": 112786 } }, { "ph": "f", "id": 112786, "pid": 76337, "tid": -914061504, "ts": 1716454223164017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223164669, "dur": 9, "args": { "External id": 112787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112787, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 112787, "pid": 5, "tid": 7, "ts": 1716454223164669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164019, "dur": 13, "args": { "External id": 112787, "cbid": 211, "correlation": 112787 } }, { "ph": "s", "id": 112787, "pid": 76337, "tid": -914061504, "ts": 1716454223164019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223164680, "dur": 4, "args": { "External id": 112789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112789, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 112789, "pid": 5, "tid": 7, "ts": 1716454223164680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164034, "dur": 6, "args": { "External id": 112789, "cbid": 211, "correlation": 112789 } }, { "ph": "s", "id": 112789, "pid": 76337, "tid": -914061504, "ts": 1716454223164034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223164685, "dur": 55, "args": { "External id": 112799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112799, "pid": 5, "tid": 7, "ts": 1716454223164685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164094, "dur": 12, "args": { "External id": 112799, "cbid": 211, "correlation": 112799 } }, { "ph": "s", "id": 112799, "pid": 76337, "tid": -914061504, "ts": 1716454223164094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223164741, "dur": 51, "args": { "External id": 112819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112819, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 112819, "pid": 5, "tid": 7, "ts": 1716454223164741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164160, "dur": 11, "args": { "External id": 112819, "cbid": 211, "correlation": 112819 } }, { "ph": "s", "id": 112819, "pid": 76337, "tid": -914061504, "ts": 1716454223164160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223164794, "dur": 4, "args": { "External id": 112831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112831, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112831, "pid": 5, "tid": 7, "ts": 1716454223164794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164181, "dur": 7, "args": { "External id": 112831, "cbid": 211, "correlation": 112831 } }, { "ph": "s", "id": 112831, "pid": 76337, "tid": -914061504, "ts": 1716454223164181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223164799, "dur": 55, "args": { "External id": 112834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112834, "pid": 5, "tid": 7, "ts": 1716454223164799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164201, "dur": 6, "args": { "External id": 112834, "cbid": 211, "correlation": 112834 } }, { "ph": "s", "id": 112834, "pid": 76337, "tid": -914061504, "ts": 1716454223164201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223164855, "dur": 36, "args": { "External id": 112843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112843, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112843, "pid": 5, "tid": 7, "ts": 1716454223164855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164241, "dur": 10, "args": { "External id": 112843, "cbid": 211, "correlation": 112843 } }, { "ph": "s", "id": 112843, "pid": 76337, "tid": -914061504, "ts": 1716454223164241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223164312, "dur": 0, "args": { "External id": 112853, "cbid": 317, "correlation": 112853 } }, { "ph": "f", "id": 112853, "pid": 76337, "tid": -914061504, "ts": 1716454223164312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223164312, "dur": 0, "args": { "External id": 112854, "cbid": 203, "correlation": 112854 } }, { "ph": "f", "id": 112854, "pid": 76337, "tid": -914061504, "ts": 1716454223164312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223164313, "dur": 0, "args": { "External id": 112855, "cbid": 205, "correlation": 112855 } }, { "ph": "f", "id": 112855, "pid": 76337, "tid": -914061504, "ts": 1716454223164313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223164893, "dur": 40, "args": { "External id": 112859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112859, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112859, "pid": 5, "tid": 7, "ts": 1716454223164893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164327, "dur": 13, "args": { "External id": 112859, "cbid": 211, "correlation": 112859 } }, { "ph": "s", "id": 112859, "pid": 76337, "tid": -914061504, "ts": 1716454223164327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223164935, "dur": 14, "args": { "External id": 112861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112861, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112861, "pid": 5, "tid": 7, "ts": 1716454223164935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164343, "dur": 5, "args": { "External id": 112861, "cbid": 211, "correlation": 112861 } }, { "ph": "s", "id": 112861, "pid": 76337, "tid": -914061504, "ts": 1716454223164343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223164950, "dur": 3, "args": { "External id": 112863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112863, "pid": 5, "tid": 7, "ts": 1716454223164950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164352, "dur": 5, "args": { "External id": 112863, "cbid": 211, "correlation": 112863 } }, { "ph": "s", "id": 112863, "pid": 76337, "tid": -914061504, "ts": 1716454223164352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223164360, "dur": 0, "args": { "External id": 112864, "cbid": 51, "correlation": 112864 } }, { "ph": "s", "id": 112864, "pid": 76337, "tid": -914061504, "ts": 1716454223164360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223164955, "dur": 693, "args": { "External id": 112865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112865, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112865, "pid": 5, "tid": 7, "ts": 1716454223164955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164361, "dur": 5, "args": { "External id": 112865, "cbid": 211, "correlation": 112865 } }, { "ph": "s", "id": 112865, "pid": 76337, "tid": -914061504, "ts": 1716454223164361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223165649, "dur": 58, "args": { "External id": 112870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112870, "pid": 5, "tid": 7, "ts": 1716454223165649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164389, "dur": 8, "args": { "External id": 112870, "cbid": 211, "correlation": 112870 } }, { "ph": "s", "id": 112870, "pid": 76337, "tid": -914061504, "ts": 1716454223164389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223165709, "dur": 51, "args": { "External id": 112878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112878, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112878, "pid": 5, "tid": 7, "ts": 1716454223165709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164421, "dur": 9, "args": { "External id": 112878, "cbid": 211, "correlation": 112878 } }, { "ph": "s", "id": 112878, "pid": 76337, "tid": -914061504, "ts": 1716454223164421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223165761, "dur": 35, "args": { "External id": 112886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112886, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112886, "pid": 5, "tid": 7, "ts": 1716454223165761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164452, "dur": 9, "args": { "External id": 112886, "cbid": 211, "correlation": 112886 } }, { "ph": "s", "id": 112886, "pid": 76337, "tid": -914061504, "ts": 1716454223164452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223165798, "dur": 52, "args": { "External id": 112906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112906, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 112906, "pid": 5, "tid": 7, "ts": 1716454223165798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164535, "dur": 12, "args": { "External id": 112906, "cbid": 211, "correlation": 112906 } }, { "ph": "s", "id": 112906, "pid": 76337, "tid": -914061504, "ts": 1716454223164535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223165851, "dur": 4, "args": { "External id": 112918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112918, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 112918, "pid": 5, "tid": 7, "ts": 1716454223165851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164557, "dur": 6, "args": { "External id": 112918, "cbid": 211, "correlation": 112918 } }, { "ph": "s", "id": 112918, "pid": 76337, "tid": -914061504, "ts": 1716454223164557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223165857, "dur": 55, "args": { "External id": 112921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112921, "pid": 5, "tid": 7, "ts": 1716454223165857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164574, "dur": 6, "args": { "External id": 112921, "cbid": 211, "correlation": 112921 } }, { "ph": "s", "id": 112921, "pid": 76337, "tid": -914061504, "ts": 1716454223164574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223164632, "dur": 0, "args": { "External id": 112932, "cbid": 317, "correlation": 112932 } }, { "ph": "f", "id": 112932, "pid": 76337, "tid": -914061504, "ts": 1716454223164632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223164633, "dur": 0, "args": { "External id": 112933, "cbid": 203, "correlation": 112933 } }, { "ph": "f", "id": 112933, "pid": 76337, "tid": -914061504, "ts": 1716454223164633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223164634, "dur": 0, "args": { "External id": 112934, "cbid": 205, "correlation": 112934 } }, { "ph": "f", "id": 112934, "pid": 76337, "tid": -914061504, "ts": 1716454223164634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164667, "dur": 2, "args": { "External id": 112938, "cbid": 251, "correlation": 112938 } }, { "ph": "f", "id": 112938, "pid": 76337, "tid": -914061504, "ts": 1716454223164667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164671, "dur": 1, "args": { "External id": 112939, "cbid": 251, "correlation": 112939 } }, { "ph": "f", "id": 112939, "pid": 76337, "tid": -914061504, "ts": 1716454223164671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164673, "dur": 1, "args": { "External id": 112940, "cbid": 251, "correlation": 112940 } }, { "ph": "f", "id": 112940, "pid": 76337, "tid": -914061504, "ts": 1716454223164673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164676, "dur": 1, "args": { "External id": 112941, "cbid": 251, "correlation": 112941 } }, { "ph": "f", "id": 112941, "pid": 76337, "tid": -914061504, "ts": 1716454223164676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164678, "dur": 1, "args": { "External id": 112942, "cbid": 251, "correlation": 112942 } }, { "ph": "f", "id": 112942, "pid": 76337, "tid": -914061504, "ts": 1716454223164678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164680, "dur": 1, "args": { "External id": 112943, "cbid": 251, "correlation": 112943 } }, { "ph": "f", "id": 112943, "pid": 76337, "tid": -914061504, "ts": 1716454223164680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164682, "dur": 1, "args": { "External id": 112944, "cbid": 251, "correlation": 112944 } }, { "ph": "f", "id": 112944, "pid": 76337, "tid": -914061504, "ts": 1716454223164682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164684, "dur": 1, "args": { "External id": 112945, "cbid": 251, "correlation": 112945 } }, { "ph": "f", "id": 112945, "pid": 76337, "tid": -914061504, "ts": 1716454223164684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223164687, "dur": 0, "args": { "External id": 112946, "cbid": 251, "correlation": 112946 } }, { "ph": "f", "id": 112946, "pid": 76337, "tid": -914061504, "ts": 1716454223164687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223165913, "dur": 116, "args": { "External id": 112947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112947, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 112947, "pid": 5, "tid": 7, "ts": 1716454223165913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164691, "dur": 15, "args": { "External id": 112947, "cbid": 211, "correlation": 112947 } }, { "ph": "s", "id": 112947, "pid": 76337, "tid": -914061504, "ts": 1716454223164691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223166030, "dur": 60, "args": { "External id": 112953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112953, "pid": 5, "tid": 7, "ts": 1716454223166030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164729, "dur": 9, "args": { "External id": 112953, "cbid": 211, "correlation": 112953 } }, { "ph": "s", "id": 112953, "pid": 76337, "tid": -914061504, "ts": 1716454223164729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223166092, "dur": 566, "args": { "External id": 112962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112962, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112962, "pid": 5, "tid": 7, "ts": 1716454223166092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164822, "dur": 16, "args": { "External id": 112962, "cbid": 211, "correlation": 112962 } }, { "ph": "s", "id": 112962, "pid": 76337, "tid": -914061504, "ts": 1716454223164822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223166659, "dur": 181, "args": { "External id": 112984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112984, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 112984, "pid": 5, "tid": 7, "ts": 1716454223166659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223164897, "dur": 12, "args": { "External id": 112984, "cbid": 211, "correlation": 112984 } }, { "ph": "s", "id": 112984, "pid": 76337, "tid": -914061504, "ts": 1716454223164897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165124, "dur": 2, "args": { "External id": 112995, "cbid": 251, "correlation": 112995 } }, { "ph": "f", "id": 112995, "pid": 76337, "tid": -914061504, "ts": 1716454223165124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223166841, "dur": 194, "args": { "External id": 112996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 112996, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 112996, "pid": 5, "tid": 7, "ts": 1716454223166841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165132, "dur": 15, "args": { "External id": 112996, "cbid": 211, "correlation": 112996 } }, { "ph": "s", "id": 112996, "pid": 76337, "tid": -914061504, "ts": 1716454223165132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165209, "dur": 1, "args": { "External id": 113007, "cbid": 251, "correlation": 113007 } }, { "ph": "f", "id": 113007, "pid": 76337, "tid": -914061504, "ts": 1716454223165209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223167037, "dur": 189, "args": { "External id": 113008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113008, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113008, "pid": 5, "tid": 7, "ts": 1716454223167037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165213, "dur": 12, "args": { "External id": 113008, "cbid": 211, "correlation": 113008 } }, { "ph": "s", "id": 113008, "pid": 76337, "tid": -914061504, "ts": 1716454223165213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165278, "dur": 1, "args": { "External id": 113019, "cbid": 251, "correlation": 113019 } }, { "ph": "f", "id": 113019, "pid": 76337, "tid": -914061504, "ts": 1716454223165278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223167227, "dur": 188, "args": { "External id": 113020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113020, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113020, "pid": 5, "tid": 7, "ts": 1716454223167227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165282, "dur": 11, "args": { "External id": 113020, "cbid": 211, "correlation": 113020 } }, { "ph": "s", "id": 113020, "pid": 76337, "tid": -914061504, "ts": 1716454223165282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223167417, "dur": 18597, "args": { "External id": 113041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113041, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113041, "pid": 5, "tid": 7, "ts": 1716454223167417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165391, "dur": 15, "args": { "External id": 113041, "cbid": 211, "correlation": 113041 } }, { "ph": "s", "id": 113041, "pid": 76337, "tid": -914061504, "ts": 1716454223165391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165506, "dur": 2, "args": { "External id": 113059, "cbid": 251, "correlation": 113059 } }, { "ph": "f", "id": 113059, "pid": 76337, "tid": -914061504, "ts": 1716454223165506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223186015, "dur": 202, "args": { "External id": 113061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113061, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113061, "pid": 5, "tid": 7, "ts": 1716454223186015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165513, "dur": 13, "args": { "External id": 113061, "cbid": 211, "correlation": 113061 } }, { "ph": "s", "id": 113061, "pid": 76337, "tid": -914061504, "ts": 1716454223165513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223186218, "dur": 66, "args": { "External id": 113069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113069, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113069, "pid": 5, "tid": 7, "ts": 1716454223186218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165586, "dur": 12, "args": { "External id": 113069, "cbid": 211, "correlation": 113069 } }, { "ph": "s", "id": 113069, "pid": 76337, "tid": -914061504, "ts": 1716454223165586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223186286, "dur": 97, "args": { "External id": 113077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113077, "pid": 5, "tid": 7, "ts": 1716454223186286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165627, "dur": 9, "args": { "External id": 113077, "cbid": 211, "correlation": 113077 } }, { "ph": "s", "id": 113077, "pid": 76337, "tid": -914061504, "ts": 1716454223165627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223186384, "dur": 54, "args": { "External id": 113088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113088, "pid": 5, "tid": 7, "ts": 1716454223186384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165714, "dur": 14, "args": { "External id": 113088, "cbid": 211, "correlation": 113088 } }, { "ph": "s", "id": 113088, "pid": 76337, "tid": -914061504, "ts": 1716454223165714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223186440, "dur": 92, "args": { "External id": 113110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113110, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113110, "pid": 5, "tid": 7, "ts": 1716454223186440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165747, "dur": 8, "args": { "External id": 113110, "cbid": 211, "correlation": 113110 } }, { "ph": "s", "id": 113110, "pid": 76337, "tid": -914061504, "ts": 1716454223165747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165833, "dur": 1, "args": { "External id": 113121, "cbid": 251, "correlation": 113121 } }, { "ph": "f", "id": 113121, "pid": 76337, "tid": -914061504, "ts": 1716454223165833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223186534, "dur": 103, "args": { "External id": 113122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113122, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113122, "pid": 5, "tid": 7, "ts": 1716454223186534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165838, "dur": 13, "args": { "External id": 113122, "cbid": 211, "correlation": 113122 } }, { "ph": "s", "id": 113122, "pid": 76337, "tid": -914061504, "ts": 1716454223165838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165922, "dur": 1, "args": { "External id": 113133, "cbid": 251, "correlation": 113133 } }, { "ph": "f", "id": 113133, "pid": 76337, "tid": -914061504, "ts": 1716454223165922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223165926, "dur": 0, "args": { "External id": 113134, "cbid": 251, "correlation": 113134 } }, { "ph": "f", "id": 113134, "pid": 76337, "tid": -914061504, "ts": 1716454223165926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223186638, "dur": 10, "args": { "External id": 113135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113135, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 113135, "pid": 5, "tid": 7, "ts": 1716454223186638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165928, "dur": 14, "args": { "External id": 113135, "cbid": 211, "correlation": 113135 } }, { "ph": "s", "id": 113135, "pid": 76337, "tid": -914061504, "ts": 1716454223165928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223186650, "dur": 5, "args": { "External id": 113137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113137, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 113137, "pid": 5, "tid": 7, "ts": 1716454223186650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223165946, "dur": 7, "args": { "External id": 113137, "cbid": 211, "correlation": 113137 } }, { "ph": "s", "id": 113137, "pid": 76337, "tid": -914061504, "ts": 1716454223165946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166019, "dur": 1, "args": { "External id": 113148, "cbid": 251, "correlation": 113148 } }, { "ph": "f", "id": 113148, "pid": 76337, "tid": -914061504, "ts": 1716454223166019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166023, "dur": 0, "args": { "External id": 113149, "cbid": 251, "correlation": 113149 } }, { "ph": "f", "id": 113149, "pid": 76337, "tid": -914061504, "ts": 1716454223166023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223186656, "dur": 6, "args": { "External id": 113150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113150, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 113150, "pid": 5, "tid": 7, "ts": 1716454223186656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166024, "dur": 12, "args": { "External id": 113150, "cbid": 211, "correlation": 113150 } }, { "ph": "s", "id": 113150, "pid": 76337, "tid": -914061504, "ts": 1716454223166024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223186663, "dur": 3, "args": { "External id": 113152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113152, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 113152, "pid": 5, "tid": 7, "ts": 1716454223186663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166038, "dur": 5, "args": { "External id": 113152, "cbid": 211, "correlation": 113152 } }, { "ph": "s", "id": 113152, "pid": 76337, "tid": -914061504, "ts": 1716454223166038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223186668, "dur": 154, "args": { "External id": 113173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113173, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113173, "pid": 5, "tid": 7, "ts": 1716454223186668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166113, "dur": 12, "args": { "External id": 113173, "cbid": 211, "correlation": 113173 } }, { "ph": "s", "id": 113173, "pid": 76337, "tid": -914061504, "ts": 1716454223166113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166212, "dur": 2, "args": { "External id": 113191, "cbid": 251, "correlation": 113191 } }, { "ph": "f", "id": 113191, "pid": 76337, "tid": -914061504, "ts": 1716454223166212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223186824, "dur": 107, "args": { "External id": 113193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113193, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113193, "pid": 5, "tid": 7, "ts": 1716454223186824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166218, "dur": 14, "args": { "External id": 113193, "cbid": 211, "correlation": 113193 } }, { "ph": "s", "id": 113193, "pid": 76337, "tid": -914061504, "ts": 1716454223166218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223186932, "dur": 35, "args": { "External id": 113201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113201, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113201, "pid": 5, "tid": 7, "ts": 1716454223186932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166289, "dur": 12, "args": { "External id": 113201, "cbid": 211, "correlation": 113201 } }, { "ph": "s", "id": 113201, "pid": 76337, "tid": -914061504, "ts": 1716454223166289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223186969, "dur": 68, "args": { "External id": 113209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113209, "pid": 5, "tid": 7, "ts": 1716454223186969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166330, "dur": 9, "args": { "External id": 113209, "cbid": 211, "correlation": 113209 } }, { "ph": "s", "id": 113209, "pid": 76337, "tid": -914061504, "ts": 1716454223166330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223187039, "dur": 92, "args": { "External id": 113231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113231, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113231, "pid": 5, "tid": 7, "ts": 1716454223187039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166382, "dur": 10, "args": { "External id": 113231, "cbid": 211, "correlation": 113231 } }, { "ph": "s", "id": 113231, "pid": 76337, "tid": -914061504, "ts": 1716454223166382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166472, "dur": 1, "args": { "External id": 113247, "cbid": 251, "correlation": 113247 } }, { "ph": "f", "id": 113247, "pid": 76337, "tid": -914061504, "ts": 1716454223166472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223187132, "dur": 574, "args": { "External id": 113249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113249, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113249, "pid": 5, "tid": 7, "ts": 1716454223187132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166477, "dur": 13, "args": { "External id": 113249, "cbid": 211, "correlation": 113249 } }, { "ph": "s", "id": 113249, "pid": 76337, "tid": -914061504, "ts": 1716454223166477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223187708, "dur": 244, "args": { "External id": 113257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113257, "pid": 5, "tid": 7, "ts": 1716454223187708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166559, "dur": 15, "args": { "External id": 113257, "cbid": 211, "correlation": 113257 } }, { "ph": "s", "id": 113257, "pid": 76337, "tid": -914061504, "ts": 1716454223166559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223187953, "dur": 251, "args": { "External id": 113265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113265, "pid": 5, "tid": 7, "ts": 1716454223187953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166595, "dur": 9, "args": { "External id": 113265, "cbid": 211, "correlation": 113265 } }, { "ph": "s", "id": 113265, "pid": 76337, "tid": -914061504, "ts": 1716454223166595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166681, "dur": 2, "args": { "External id": 113281, "cbid": 251, "correlation": 113281 } }, { "ph": "f", "id": 113281, "pid": 76337, "tid": -914061504, "ts": 1716454223166681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166686, "dur": 0, "args": { "External id": 113283, "cbid": 251, "correlation": 113283 } }, { "ph": "f", "id": 113283, "pid": 76337, "tid": -914061504, "ts": 1716454223166686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223188206, "dur": 362, "args": { "External id": 113284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113284, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 113284, "pid": 5, "tid": 7, "ts": 1716454223188206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166691, "dur": 14, "args": { "External id": 113284, "cbid": 211, "correlation": 113284 } }, { "ph": "s", "id": 113284, "pid": 76337, "tid": -914061504, "ts": 1716454223166691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223188569, "dur": 50, "args": { "External id": 113292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113292, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113292, "pid": 5, "tid": 7, "ts": 1716454223188569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166735, "dur": 10, "args": { "External id": 113292, "cbid": 211, "correlation": 113292 } }, { "ph": "s", "id": 113292, "pid": 76337, "tid": -914061504, "ts": 1716454223166735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223188620, "dur": 158, "args": { "External id": 113303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113303, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113303, "pid": 5, "tid": 7, "ts": 1716454223188620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166806, "dur": 13, "args": { "External id": 113303, "cbid": 211, "correlation": 113303 } }, { "ph": "s", "id": 113303, "pid": 76337, "tid": -914061504, "ts": 1716454223166806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223166872, "dur": 0, "args": { "External id": 113315, "cbid": 317, "correlation": 113315 } }, { "ph": "f", "id": 113315, "pid": 76337, "tid": -914061504, "ts": 1716454223166872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223166873, "dur": 0, "args": { "External id": 113316, "cbid": 203, "correlation": 113316 } }, { "ph": "f", "id": 113316, "pid": 76337, "tid": -914061504, "ts": 1716454223166873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223166874, "dur": 0, "args": { "External id": 113317, "cbid": 205, "correlation": 113317 } }, { "ph": "f", "id": 113317, "pid": 76337, "tid": -914061504, "ts": 1716454223166874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166899, "dur": 1, "args": { "External id": 113321, "cbid": 251, "correlation": 113321 } }, { "ph": "f", "id": 113321, "pid": 76337, "tid": -914061504, "ts": 1716454223166899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166901, "dur": 0, "args": { "External id": 113322, "cbid": 251, "correlation": 113322 } }, { "ph": "f", "id": 113322, "pid": 76337, "tid": -914061504, "ts": 1716454223166901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166902, "dur": 0, "args": { "External id": 113323, "cbid": 251, "correlation": 113323 } }, { "ph": "f", "id": 113323, "pid": 76337, "tid": -914061504, "ts": 1716454223166902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166903, "dur": 0, "args": { "External id": 113324, "cbid": 251, "correlation": 113324 } }, { "ph": "f", "id": 113324, "pid": 76337, "tid": -914061504, "ts": 1716454223166903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166903, "dur": 1, "args": { "External id": 113325, "cbid": 251, "correlation": 113325 } }, { "ph": "f", "id": 113325, "pid": 76337, "tid": -914061504, "ts": 1716454223166903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166905, "dur": 0, "args": { "External id": 113326, "cbid": 251, "correlation": 113326 } }, { "ph": "f", "id": 113326, "pid": 76337, "tid": -914061504, "ts": 1716454223166905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166906, "dur": 0, "args": { "External id": 113327, "cbid": 251, "correlation": 113327 } }, { "ph": "f", "id": 113327, "pid": 76337, "tid": -914061504, "ts": 1716454223166906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166907, "dur": 1, "args": { "External id": 113328, "cbid": 251, "correlation": 113328 } }, { "ph": "f", "id": 113328, "pid": 76337, "tid": -914061504, "ts": 1716454223166907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223166909, "dur": 0, "args": { "External id": 113329, "cbid": 251, "correlation": 113329 } }, { "ph": "f", "id": 113329, "pid": 76337, "tid": -914061504, "ts": 1716454223166909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223188780, "dur": 117, "args": { "External id": 113330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113330, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113330, "pid": 5, "tid": 7, "ts": 1716454223188780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166911, "dur": 12, "args": { "External id": 113330, "cbid": 211, "correlation": 113330 } }, { "ph": "s", "id": 113330, "pid": 76337, "tid": -914061504, "ts": 1716454223166911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223188898, "dur": 60, "args": { "External id": 113336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113336, "pid": 5, "tid": 7, "ts": 1716454223188898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166946, "dur": 9, "args": { "External id": 113336, "cbid": 211, "correlation": 113336 } }, { "ph": "s", "id": 113336, "pid": 76337, "tid": -914061504, "ts": 1716454223166946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223188959, "dur": 51, "args": { "External id": 113344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113344, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113344, "pid": 5, "tid": 7, "ts": 1716454223188959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223166987, "dur": 9, "args": { "External id": 113344, "cbid": 211, "correlation": 113344 } }, { "ph": "s", "id": 113344, "pid": 76337, "tid": -914061504, "ts": 1716454223166987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223189012, "dur": 51, "args": { "External id": 113364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113364, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 113364, "pid": 5, "tid": 7, "ts": 1716454223189012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167062, "dur": 12, "args": { "External id": 113364, "cbid": 211, "correlation": 113364 } }, { "ph": "s", "id": 113364, "pid": 76337, "tid": -914061504, "ts": 1716454223167062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223189064, "dur": 4, "args": { "External id": 113376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113376, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113376, "pid": 5, "tid": 7, "ts": 1716454223189064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167083, "dur": 6, "args": { "External id": 113376, "cbid": 211, "correlation": 113376 } }, { "ph": "s", "id": 113376, "pid": 76337, "tid": -914061504, "ts": 1716454223167083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223189070, "dur": 56, "args": { "External id": 113379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113379, "pid": 5, "tid": 7, "ts": 1716454223189070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167101, "dur": 7, "args": { "External id": 113379, "cbid": 211, "correlation": 113379 } }, { "ph": "s", "id": 113379, "pid": 76337, "tid": -914061504, "ts": 1716454223167101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223189127, "dur": 36, "args": { "External id": 113388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113388, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113388, "pid": 5, "tid": 7, "ts": 1716454223189127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167141, "dur": 10, "args": { "External id": 113388, "cbid": 211, "correlation": 113388 } }, { "ph": "s", "id": 113388, "pid": 76337, "tid": -914061504, "ts": 1716454223167141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223167192, "dur": 0, "args": { "External id": 113398, "cbid": 317, "correlation": 113398 } }, { "ph": "f", "id": 113398, "pid": 76337, "tid": -914061504, "ts": 1716454223167192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223167193, "dur": 0, "args": { "External id": 113399, "cbid": 203, "correlation": 113399 } }, { "ph": "f", "id": 113399, "pid": 76337, "tid": -914061504, "ts": 1716454223167193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223167194, "dur": 0, "args": { "External id": 113400, "cbid": 205, "correlation": 113400 } }, { "ph": "f", "id": 113400, "pid": 76337, "tid": -914061504, "ts": 1716454223167194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223189165, "dur": 41, "args": { "External id": 113404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113404, "pid": 5, "tid": 7, "ts": 1716454223189165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167209, "dur": 14, "args": { "External id": 113404, "cbid": 211, "correlation": 113404 } }, { "ph": "s", "id": 113404, "pid": 76337, "tid": -914061504, "ts": 1716454223167209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223189207, "dur": 15, "args": { "External id": 113406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113406, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113406, "pid": 5, "tid": 7, "ts": 1716454223189207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167225, "dur": 6, "args": { "External id": 113406, "cbid": 211, "correlation": 113406 } }, { "ph": "s", "id": 113406, "pid": 76337, "tid": -914061504, "ts": 1716454223167225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223189222, "dur": 3, "args": { "External id": 113408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113408, "pid": 5, "tid": 7, "ts": 1716454223189222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167235, "dur": 6, "args": { "External id": 113408, "cbid": 211, "correlation": 113408 } }, { "ph": "s", "id": 113408, "pid": 76337, "tid": -914061504, "ts": 1716454223167235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223167244, "dur": 0, "args": { "External id": 113409, "cbid": 51, "correlation": 113409 } }, { "ph": "s", "id": 113409, "pid": 76337, "tid": -914061504, "ts": 1716454223167244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223189227, "dur": 701, "args": { "External id": 113410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113410, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113410, "pid": 5, "tid": 7, "ts": 1716454223189227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167245, "dur": 5, "args": { "External id": 113410, "cbid": 211, "correlation": 113410 } }, { "ph": "s", "id": 113410, "pid": 76337, "tid": -914061504, "ts": 1716454223167245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223189930, "dur": 59, "args": { "External id": 113415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113415, "pid": 5, "tid": 7, "ts": 1716454223189930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167273, "dur": 9, "args": { "External id": 113415, "cbid": 211, "correlation": 113415 } }, { "ph": "s", "id": 113415, "pid": 76337, "tid": -914061504, "ts": 1716454223167273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223189990, "dur": 4, "args": { "External id": 113423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113423, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113423, "pid": 5, "tid": 7, "ts": 1716454223189990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167315, "dur": 9, "args": { "External id": 113423, "cbid": 211, "correlation": 113423 } }, { "ph": "s", "id": 113423, "pid": 76337, "tid": -914061504, "ts": 1716454223167315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223167382, "dur": 1, "args": { "External id": 113439, "cbid": 251, "correlation": 113439 } }, { "ph": "f", "id": 113439, "pid": 76337, "tid": -914061504, "ts": 1716454223167382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223167387, "dur": 0, "args": { "External id": 113441, "cbid": 251, "correlation": 113441 } }, { "ph": "f", "id": 113441, "pid": 76337, "tid": -914061504, "ts": 1716454223167387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223189996, "dur": 11, "args": { "External id": 113442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113442, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 113442, "pid": 5, "tid": 7, "ts": 1716454223189996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167389, "dur": 11, "args": { "External id": 113442, "cbid": 211, "correlation": 113442 } }, { "ph": "s", "id": 113442, "pid": 76337, "tid": -914061504, "ts": 1716454223167389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223190008, "dur": 5, "args": { "External id": 113444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113444, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 113444, "pid": 5, "tid": 7, "ts": 1716454223190008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167402, "dur": 6, "args": { "External id": 113444, "cbid": 211, "correlation": 113444 } }, { "ph": "s", "id": 113444, "pid": 76337, "tid": -914061504, "ts": 1716454223167402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223190015, "dur": 52, "args": { "External id": 113454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113454, "pid": 5, "tid": 7, "ts": 1716454223190015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167458, "dur": 13, "args": { "External id": 113454, "cbid": 211, "correlation": 113454 } }, { "ph": "s", "id": 113454, "pid": 76337, "tid": -914061504, "ts": 1716454223167458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223190068, "dur": 51, "args": { "External id": 113474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113474, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 113474, "pid": 5, "tid": 7, "ts": 1716454223190068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167525, "dur": 11, "args": { "External id": 113474, "cbid": 211, "correlation": 113474 } }, { "ph": "s", "id": 113474, "pid": 76337, "tid": -914061504, "ts": 1716454223167525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223190121, "dur": 4, "args": { "External id": 113486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113486, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113486, "pid": 5, "tid": 7, "ts": 1716454223190121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167545, "dur": 6, "args": { "External id": 113486, "cbid": 211, "correlation": 113486 } }, { "ph": "s", "id": 113486, "pid": 76337, "tid": -914061504, "ts": 1716454223167545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223190126, "dur": 56, "args": { "External id": 113489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113489, "pid": 5, "tid": 7, "ts": 1716454223190126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167565, "dur": 7, "args": { "External id": 113489, "cbid": 211, "correlation": 113489 } }, { "ph": "s", "id": 113489, "pid": 76337, "tid": -914061504, "ts": 1716454223167565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223190183, "dur": 37, "args": { "External id": 113498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113498, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113498, "pid": 5, "tid": 7, "ts": 1716454223190183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167605, "dur": 11, "args": { "External id": 113498, "cbid": 211, "correlation": 113498 } }, { "ph": "s", "id": 113498, "pid": 76337, "tid": -914061504, "ts": 1716454223167605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223167668, "dur": 0, "args": { "External id": 113508, "cbid": 317, "correlation": 113508 } }, { "ph": "f", "id": 113508, "pid": 76337, "tid": -914061504, "ts": 1716454223167668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223167669, "dur": 0, "args": { "External id": 113509, "cbid": 203, "correlation": 113509 } }, { "ph": "f", "id": 113509, "pid": 76337, "tid": -914061504, "ts": 1716454223167669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223167670, "dur": 0, "args": { "External id": 113510, "cbid": 205, "correlation": 113510 } }, { "ph": "f", "id": 113510, "pid": 76337, "tid": -914061504, "ts": 1716454223167670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223190221, "dur": 40, "args": { "External id": 113514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113514, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113514, "pid": 5, "tid": 7, "ts": 1716454223190221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167684, "dur": 12, "args": { "External id": 113514, "cbid": 211, "correlation": 113514 } }, { "ph": "s", "id": 113514, "pid": 76337, "tid": -914061504, "ts": 1716454223167684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223190262, "dur": 14, "args": { "External id": 113516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113516, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113516, "pid": 5, "tid": 7, "ts": 1716454223190262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167698, "dur": 5, "args": { "External id": 113516, "cbid": 211, "correlation": 113516 } }, { "ph": "s", "id": 113516, "pid": 76337, "tid": -914061504, "ts": 1716454223167698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223190278, "dur": 3, "args": { "External id": 113518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113518, "pid": 5, "tid": 7, "ts": 1716454223190278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167707, "dur": 6, "args": { "External id": 113518, "cbid": 211, "correlation": 113518 } }, { "ph": "s", "id": 113518, "pid": 76337, "tid": -914061504, "ts": 1716454223167707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223167716, "dur": 0, "args": { "External id": 113519, "cbid": 51, "correlation": 113519 } }, { "ph": "s", "id": 113519, "pid": 76337, "tid": -914061504, "ts": 1716454223167716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223190282, "dur": 693, "args": { "External id": 113520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113520, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113520, "pid": 5, "tid": 7, "ts": 1716454223190282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167717, "dur": 5, "args": { "External id": 113520, "cbid": 211, "correlation": 113520 } }, { "ph": "s", "id": 113520, "pid": 76337, "tid": -914061504, "ts": 1716454223167717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223190976, "dur": 59, "args": { "External id": 113525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113525, "pid": 5, "tid": 7, "ts": 1716454223190976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167744, "dur": 8, "args": { "External id": 113525, "cbid": 211, "correlation": 113525 } }, { "ph": "s", "id": 113525, "pid": 76337, "tid": -914061504, "ts": 1716454223167744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223191036, "dur": 50, "args": { "External id": 113533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113533, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113533, "pid": 5, "tid": 7, "ts": 1716454223191036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167776, "dur": 8, "args": { "External id": 113533, "cbid": 211, "correlation": 113533 } }, { "ph": "s", "id": 113533, "pid": 76337, "tid": -914061504, "ts": 1716454223167776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223191087, "dur": 35, "args": { "External id": 113541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113541, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113541, "pid": 5, "tid": 7, "ts": 1716454223191087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167806, "dur": 9, "args": { "External id": 113541, "cbid": 211, "correlation": 113541 } }, { "ph": "s", "id": 113541, "pid": 76337, "tid": -914061504, "ts": 1716454223167806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223191124, "dur": 51, "args": { "External id": 113561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113561, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 113561, "pid": 5, "tid": 7, "ts": 1716454223191124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167885, "dur": 12, "args": { "External id": 113561, "cbid": 211, "correlation": 113561 } }, { "ph": "s", "id": 113561, "pid": 76337, "tid": -914061504, "ts": 1716454223167885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223191177, "dur": 4, "args": { "External id": 113573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113573, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 113573, "pid": 5, "tid": 7, "ts": 1716454223191177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167906, "dur": 6, "args": { "External id": 113573, "cbid": 211, "correlation": 113573 } }, { "ph": "s", "id": 113573, "pid": 76337, "tid": -914061504, "ts": 1716454223167906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223191182, "dur": 56, "args": { "External id": 113576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113576, "pid": 5, "tid": 7, "ts": 1716454223191182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223167924, "dur": 7, "args": { "External id": 113576, "cbid": 211, "correlation": 113576 } }, { "ph": "s", "id": 113576, "pid": 76337, "tid": -914061504, "ts": 1716454223167924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223167989, "dur": 0, "args": { "External id": 113587, "cbid": 317, "correlation": 113587 } }, { "ph": "f", "id": 113587, "pid": 76337, "tid": -914061504, "ts": 1716454223167989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223167989, "dur": 0, "args": { "External id": 113588, "cbid": 203, "correlation": 113588 } }, { "ph": "f", "id": 113588, "pid": 76337, "tid": -914061504, "ts": 1716454223167989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223167990, "dur": 0, "args": { "External id": 113589, "cbid": 205, "correlation": 113589 } }, { "ph": "f", "id": 113589, "pid": 76337, "tid": -914061504, "ts": 1716454223167990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168011, "dur": 1, "args": { "External id": 113593, "cbid": 251, "correlation": 113593 } }, { "ph": "f", "id": 113593, "pid": 76337, "tid": -914061504, "ts": 1716454223168011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168014, "dur": 0, "args": { "External id": 113594, "cbid": 251, "correlation": 113594 } }, { "ph": "f", "id": 113594, "pid": 76337, "tid": -914061504, "ts": 1716454223168014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168015, "dur": 0, "args": { "External id": 113595, "cbid": 251, "correlation": 113595 } }, { "ph": "f", "id": 113595, "pid": 76337, "tid": -914061504, "ts": 1716454223168015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168016, "dur": 0, "args": { "External id": 113596, "cbid": 251, "correlation": 113596 } }, { "ph": "f", "id": 113596, "pid": 76337, "tid": -914061504, "ts": 1716454223168016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168016, "dur": 0, "args": { "External id": 113597, "cbid": 251, "correlation": 113597 } }, { "ph": "f", "id": 113597, "pid": 76337, "tid": -914061504, "ts": 1716454223168016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168017, "dur": 0, "args": { "External id": 113598, "cbid": 251, "correlation": 113598 } }, { "ph": "f", "id": 113598, "pid": 76337, "tid": -914061504, "ts": 1716454223168017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168018, "dur": 0, "args": { "External id": 113599, "cbid": 251, "correlation": 113599 } }, { "ph": "f", "id": 113599, "pid": 76337, "tid": -914061504, "ts": 1716454223168018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168018, "dur": 0, "args": { "External id": 113600, "cbid": 251, "correlation": 113600 } }, { "ph": "f", "id": 113600, "pid": 76337, "tid": -914061504, "ts": 1716454223168018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168020, "dur": 0, "args": { "External id": 113601, "cbid": 251, "correlation": 113601 } }, { "ph": "f", "id": 113601, "pid": 76337, "tid": -914061504, "ts": 1716454223168020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223191239, "dur": 111, "args": { "External id": 113602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113602, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113602, "pid": 5, "tid": 7, "ts": 1716454223191239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168022, "dur": 12, "args": { "External id": 113602, "cbid": 211, "correlation": 113602 } }, { "ph": "s", "id": 113602, "pid": 76337, "tid": -914061504, "ts": 1716454223168022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223191352, "dur": 59, "args": { "External id": 113608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113608, "pid": 5, "tid": 7, "ts": 1716454223191352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168057, "dur": 10, "args": { "External id": 113608, "cbid": 211, "correlation": 113608 } }, { "ph": "s", "id": 113608, "pid": 76337, "tid": -914061504, "ts": 1716454223168057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223191412, "dur": 670, "args": { "External id": 113617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113617, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113617, "pid": 5, "tid": 7, "ts": 1716454223191412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168140, "dur": 14, "args": { "External id": 113617, "cbid": 211, "correlation": 113617 } }, { "ph": "s", "id": 113617, "pid": 76337, "tid": -914061504, "ts": 1716454223168140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223192083, "dur": 181, "args": { "External id": 113639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113639, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113639, "pid": 5, "tid": 7, "ts": 1716454223192083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168198, "dur": 11, "args": { "External id": 113639, "cbid": 211, "correlation": 113639 } }, { "ph": "s", "id": 113639, "pid": 76337, "tid": -914061504, "ts": 1716454223168198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168284, "dur": 1, "args": { "External id": 113650, "cbid": 251, "correlation": 113650 } }, { "ph": "f", "id": 113650, "pid": 76337, "tid": -914061504, "ts": 1716454223168284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223192266, "dur": 195, "args": { "External id": 113651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113651, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113651, "pid": 5, "tid": 7, "ts": 1716454223192266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168289, "dur": 13, "args": { "External id": 113651, "cbid": 211, "correlation": 113651 } }, { "ph": "s", "id": 113651, "pid": 76337, "tid": -914061504, "ts": 1716454223168289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168357, "dur": 1, "args": { "External id": 113662, "cbid": 251, "correlation": 113662 } }, { "ph": "f", "id": 113662, "pid": 76337, "tid": -914061504, "ts": 1716454223168357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223192462, "dur": 185, "args": { "External id": 113663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113663, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113663, "pid": 5, "tid": 7, "ts": 1716454223192462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168362, "dur": 12, "args": { "External id": 113663, "cbid": 211, "correlation": 113663 } }, { "ph": "s", "id": 113663, "pid": 76337, "tid": -914061504, "ts": 1716454223168362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168425, "dur": 1, "args": { "External id": 113674, "cbid": 251, "correlation": 113674 } }, { "ph": "f", "id": 113674, "pid": 76337, "tid": -914061504, "ts": 1716454223168425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223192648, "dur": 188, "args": { "External id": 113675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113675, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113675, "pid": 5, "tid": 7, "ts": 1716454223192648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168429, "dur": 11, "args": { "External id": 113675, "cbid": 211, "correlation": 113675 } }, { "ph": "s", "id": 113675, "pid": 76337, "tid": -914061504, "ts": 1716454223168429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223192837, "dur": 18620, "args": { "External id": 113696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113696, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113696, "pid": 5, "tid": 7, "ts": 1716454223192837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168510, "dur": 13, "args": { "External id": 113696, "cbid": 211, "correlation": 113696 } }, { "ph": "s", "id": 113696, "pid": 76337, "tid": -914061504, "ts": 1716454223168510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168606, "dur": 1, "args": { "External id": 113714, "cbid": 251, "correlation": 113714 } }, { "ph": "f", "id": 113714, "pid": 76337, "tid": -914061504, "ts": 1716454223168606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223211458, "dur": 202, "args": { "External id": 113716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113716, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113716, "pid": 5, "tid": 7, "ts": 1716454223211458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168612, "dur": 14, "args": { "External id": 113716, "cbid": 211, "correlation": 113716 } }, { "ph": "s", "id": 113716, "pid": 76337, "tid": -914061504, "ts": 1716454223168612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223211662, "dur": 66, "args": { "External id": 113724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113724, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113724, "pid": 5, "tid": 7, "ts": 1716454223211662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168683, "dur": 12, "args": { "External id": 113724, "cbid": 211, "correlation": 113724 } }, { "ph": "s", "id": 113724, "pid": 76337, "tid": -914061504, "ts": 1716454223168683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223211729, "dur": 97, "args": { "External id": 113732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113732, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113732, "pid": 5, "tid": 7, "ts": 1716454223211729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168722, "dur": 9, "args": { "External id": 113732, "cbid": 211, "correlation": 113732 } }, { "ph": "s", "id": 113732, "pid": 76337, "tid": -914061504, "ts": 1716454223168722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223211827, "dur": 55, "args": { "External id": 113743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113743, "pid": 5, "tid": 7, "ts": 1716454223211827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168793, "dur": 12, "args": { "External id": 113743, "cbid": 211, "correlation": 113743 } }, { "ph": "s", "id": 113743, "pid": 76337, "tid": -914061504, "ts": 1716454223168793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223211884, "dur": 92, "args": { "External id": 113765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113765, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113765, "pid": 5, "tid": 7, "ts": 1716454223211884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168825, "dur": 8, "args": { "External id": 113765, "cbid": 211, "correlation": 113765 } }, { "ph": "s", "id": 113765, "pid": 76337, "tid": -914061504, "ts": 1716454223168825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168908, "dur": 1, "args": { "External id": 113776, "cbid": 251, "correlation": 113776 } }, { "ph": "f", "id": 113776, "pid": 76337, "tid": -914061504, "ts": 1716454223168908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223211977, "dur": 102, "args": { "External id": 113777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113777, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113777, "pid": 5, "tid": 7, "ts": 1716454223211977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168913, "dur": 13, "args": { "External id": 113777, "cbid": 211, "correlation": 113777 } }, { "ph": "s", "id": 113777, "pid": 76337, "tid": -914061504, "ts": 1716454223168913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168990, "dur": 1, "args": { "External id": 113788, "cbid": 251, "correlation": 113788 } }, { "ph": "f", "id": 113788, "pid": 76337, "tid": -914061504, "ts": 1716454223168990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223168994, "dur": 0, "args": { "External id": 113789, "cbid": 251, "correlation": 113789 } }, { "ph": "f", "id": 113789, "pid": 76337, "tid": -914061504, "ts": 1716454223168994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223212080, "dur": 10, "args": { "External id": 113790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113790, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 113790, "pid": 5, "tid": 7, "ts": 1716454223212080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223168996, "dur": 12, "args": { "External id": 113790, "cbid": 211, "correlation": 113790 } }, { "ph": "s", "id": 113790, "pid": 76337, "tid": -914061504, "ts": 1716454223168996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223212092, "dur": 5, "args": { "External id": 113792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113792, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 113792, "pid": 5, "tid": 7, "ts": 1716454223212092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169010, "dur": 6, "args": { "External id": 113792, "cbid": 211, "correlation": 113792 } }, { "ph": "s", "id": 113792, "pid": 76337, "tid": -914061504, "ts": 1716454223169010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169073, "dur": 1, "args": { "External id": 113803, "cbid": 251, "correlation": 113803 } }, { "ph": "f", "id": 113803, "pid": 76337, "tid": -914061504, "ts": 1716454223169073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169077, "dur": 0, "args": { "External id": 113804, "cbid": 251, "correlation": 113804 } }, { "ph": "f", "id": 113804, "pid": 76337, "tid": -914061504, "ts": 1716454223169077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223212098, "dur": 6, "args": { "External id": 113805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113805, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 113805, "pid": 5, "tid": 7, "ts": 1716454223212098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169079, "dur": 11, "args": { "External id": 113805, "cbid": 211, "correlation": 113805 } }, { "ph": "s", "id": 113805, "pid": 76337, "tid": -914061504, "ts": 1716454223169079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223212105, "dur": 3, "args": { "External id": 113807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113807, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 113807, "pid": 5, "tid": 7, "ts": 1716454223212105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169092, "dur": 5, "args": { "External id": 113807, "cbid": 211, "correlation": 113807 } }, { "ph": "s", "id": 113807, "pid": 76337, "tid": -914061504, "ts": 1716454223169092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223212110, "dur": 155, "args": { "External id": 113828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113828, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113828, "pid": 5, "tid": 7, "ts": 1716454223212110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169165, "dur": 13, "args": { "External id": 113828, "cbid": 211, "correlation": 113828 } }, { "ph": "s", "id": 113828, "pid": 76337, "tid": -914061504, "ts": 1716454223169165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169262, "dur": 1, "args": { "External id": 113846, "cbid": 251, "correlation": 113846 } }, { "ph": "f", "id": 113846, "pid": 76337, "tid": -914061504, "ts": 1716454223169262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223212267, "dur": 105, "args": { "External id": 113848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113848, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113848, "pid": 5, "tid": 7, "ts": 1716454223212267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169268, "dur": 14, "args": { "External id": 113848, "cbid": 211, "correlation": 113848 } }, { "ph": "s", "id": 113848, "pid": 76337, "tid": -914061504, "ts": 1716454223169268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223212374, "dur": 35, "args": { "External id": 113856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113856, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113856, "pid": 5, "tid": 7, "ts": 1716454223212374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169338, "dur": 12, "args": { "External id": 113856, "cbid": 211, "correlation": 113856 } }, { "ph": "s", "id": 113856, "pid": 76337, "tid": -914061504, "ts": 1716454223169338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223212410, "dur": 67, "args": { "External id": 113864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113864, "pid": 5, "tid": 7, "ts": 1716454223212410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169379, "dur": 9, "args": { "External id": 113864, "cbid": 211, "correlation": 113864 } }, { "ph": "s", "id": 113864, "pid": 76337, "tid": -914061504, "ts": 1716454223169379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223212478, "dur": 92, "args": { "External id": 113886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113886, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113886, "pid": 5, "tid": 7, "ts": 1716454223212478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169430, "dur": 10, "args": { "External id": 113886, "cbid": 211, "correlation": 113886 } }, { "ph": "s", "id": 113886, "pid": 76337, "tid": -914061504, "ts": 1716454223169430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169515, "dur": 1, "args": { "External id": 113902, "cbid": 251, "correlation": 113902 } }, { "ph": "f", "id": 113902, "pid": 76337, "tid": -914061504, "ts": 1716454223169515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223212572, "dur": 573, "args": { "External id": 113904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113904, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 113904, "pid": 5, "tid": 7, "ts": 1716454223212572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169520, "dur": 13, "args": { "External id": 113904, "cbid": 211, "correlation": 113904 } }, { "ph": "s", "id": 113904, "pid": 76337, "tid": -914061504, "ts": 1716454223169520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223213146, "dur": 240, "args": { "External id": 113912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113912, "pid": 5, "tid": 7, "ts": 1716454223213146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169586, "dur": 12, "args": { "External id": 113912, "cbid": 211, "correlation": 113912 } }, { "ph": "s", "id": 113912, "pid": 76337, "tid": -914061504, "ts": 1716454223169586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223213388, "dur": 253, "args": { "External id": 113920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113920, "pid": 5, "tid": 7, "ts": 1716454223213388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169616, "dur": 8, "args": { "External id": 113920, "cbid": 211, "correlation": 113920 } }, { "ph": "s", "id": 113920, "pid": 76337, "tid": -914061504, "ts": 1716454223169616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169697, "dur": 1, "args": { "External id": 113936, "cbid": 251, "correlation": 113936 } }, { "ph": "f", "id": 113936, "pid": 76337, "tid": -914061504, "ts": 1716454223169697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169702, "dur": 0, "args": { "External id": 113938, "cbid": 251, "correlation": 113938 } }, { "ph": "f", "id": 113938, "pid": 76337, "tid": -914061504, "ts": 1716454223169702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223213642, "dur": 358, "args": { "External id": 113939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113939, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 113939, "pid": 5, "tid": 7, "ts": 1716454223213642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169705, "dur": 13, "args": { "External id": 113939, "cbid": 211, "correlation": 113939 } }, { "ph": "s", "id": 113939, "pid": 76337, "tid": -914061504, "ts": 1716454223169705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223214001, "dur": 50, "args": { "External id": 113947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113947, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113947, "pid": 5, "tid": 7, "ts": 1716454223214001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169748, "dur": 9, "args": { "External id": 113947, "cbid": 211, "correlation": 113947 } }, { "ph": "s", "id": 113947, "pid": 76337, "tid": -914061504, "ts": 1716454223169748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223214052, "dur": 157, "args": { "External id": 113958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113958, "pid": 5, "tid": 7, "ts": 1716454223214052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169814, "dur": 12, "args": { "External id": 113958, "cbid": 211, "correlation": 113958 } }, { "ph": "s", "id": 113958, "pid": 76337, "tid": -914061504, "ts": 1716454223169814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223169878, "dur": 0, "args": { "External id": 113970, "cbid": 317, "correlation": 113970 } }, { "ph": "f", "id": 113970, "pid": 76337, "tid": -914061504, "ts": 1716454223169878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223169879, "dur": 0, "args": { "External id": 113971, "cbid": 203, "correlation": 113971 } }, { "ph": "f", "id": 113971, "pid": 76337, "tid": -914061504, "ts": 1716454223169879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223169880, "dur": 0, "args": { "External id": 113972, "cbid": 205, "correlation": 113972 } }, { "ph": "f", "id": 113972, "pid": 76337, "tid": -914061504, "ts": 1716454223169880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169902, "dur": 1, "args": { "External id": 113976, "cbid": 251, "correlation": 113976 } }, { "ph": "f", "id": 113976, "pid": 76337, "tid": -914061504, "ts": 1716454223169902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169904, "dur": 0, "args": { "External id": 113977, "cbid": 251, "correlation": 113977 } }, { "ph": "f", "id": 113977, "pid": 76337, "tid": -914061504, "ts": 1716454223169904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169904, "dur": 0, "args": { "External id": 113978, "cbid": 251, "correlation": 113978 } }, { "ph": "f", "id": 113978, "pid": 76337, "tid": -914061504, "ts": 1716454223169904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169905, "dur": 0, "args": { "External id": 113979, "cbid": 251, "correlation": 113979 } }, { "ph": "f", "id": 113979, "pid": 76337, "tid": -914061504, "ts": 1716454223169905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169906, "dur": 0, "args": { "External id": 113980, "cbid": 251, "correlation": 113980 } }, { "ph": "f", "id": 113980, "pid": 76337, "tid": -914061504, "ts": 1716454223169906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169906, "dur": 0, "args": { "External id": 113981, "cbid": 251, "correlation": 113981 } }, { "ph": "f", "id": 113981, "pid": 76337, "tid": -914061504, "ts": 1716454223169906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169907, "dur": 0, "args": { "External id": 113982, "cbid": 251, "correlation": 113982 } }, { "ph": "f", "id": 113982, "pid": 76337, "tid": -914061504, "ts": 1716454223169907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169908, "dur": 0, "args": { "External id": 113983, "cbid": 251, "correlation": 113983 } }, { "ph": "f", "id": 113983, "pid": 76337, "tid": -914061504, "ts": 1716454223169908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223169909, "dur": 0, "args": { "External id": 113984, "cbid": 251, "correlation": 113984 } }, { "ph": "f", "id": 113984, "pid": 76337, "tid": -914061504, "ts": 1716454223169909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223214211, "dur": 115, "args": { "External id": 113985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113985, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 113985, "pid": 5, "tid": 7, "ts": 1716454223214211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169911, "dur": 12, "args": { "External id": 113985, "cbid": 211, "correlation": 113985 } }, { "ph": "s", "id": 113985, "pid": 76337, "tid": -914061504, "ts": 1716454223169911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223214327, "dur": 59, "args": { "External id": 113991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113991, "pid": 5, "tid": 7, "ts": 1716454223214327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169946, "dur": 8, "args": { "External id": 113991, "cbid": 211, "correlation": 113991 } }, { "ph": "s", "id": 113991, "pid": 76337, "tid": -914061504, "ts": 1716454223169946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223214388, "dur": 50, "args": { "External id": 113999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 113999, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 113999, "pid": 5, "tid": 7, "ts": 1716454223214388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223169985, "dur": 9, "args": { "External id": 113999, "cbid": 211, "correlation": 113999 } }, { "ph": "s", "id": 113999, "pid": 76337, "tid": -914061504, "ts": 1716454223169985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223170060, "dur": 0, "args": { "External id": 114009, "cbid": 317, "correlation": 114009 } }, { "ph": "f", "id": 114009, "pid": 76337, "tid": -914061504, "ts": 1716454223170060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223170061, "dur": 0, "args": { "External id": 114010, "cbid": 203, "correlation": 114010 } }, { "ph": "f", "id": 114010, "pid": 76337, "tid": -914061504, "ts": 1716454223170061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223170061, "dur": 0, "args": { "External id": 114011, "cbid": 205, "correlation": 114011 } }, { "ph": "f", "id": 114011, "pid": 76337, "tid": -914061504, "ts": 1716454223170061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223214439, "dur": 41, "args": { "External id": 114015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114015, "pid": 5, "tid": 7, "ts": 1716454223214439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170078, "dur": 13, "args": { "External id": 114015, "cbid": 211, "correlation": 114015 } }, { "ph": "s", "id": 114015, "pid": 76337, "tid": -914061504, "ts": 1716454223170078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223214482, "dur": 14, "args": { "External id": 114017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114017, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114017, "pid": 5, "tid": 7, "ts": 1716454223214482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170093, "dur": 5, "args": { "External id": 114017, "cbid": 211, "correlation": 114017 } }, { "ph": "s", "id": 114017, "pid": 76337, "tid": -914061504, "ts": 1716454223170093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223214498, "dur": 1, "args": { "External id": 114019, "device": 5, "context": 1, "stream": 7, "correlation": 114019, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 114019, "pid": 5, "tid": 7, "ts": 1716454223214498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223170112, "dur": 17, "args": { "External id": 114019, "cbid": 51, "correlation": 114019 } }, { "ph": "s", "id": 114019, "pid": 76337, "tid": -914061504, "ts": 1716454223170112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223214502, "dur": 361, "args": { "External id": 114020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114020, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114020, "pid": 5, "tid": 7, "ts": 1716454223214502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170132, "dur": 9, "args": { "External id": 114020, "cbid": 211, "correlation": 114020 } }, { "ph": "s", "id": 114020, "pid": 76337, "tid": -914061504, "ts": 1716454223170132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223214865, "dur": 14, "args": { "External id": 114022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114022, "pid": 5, "tid": 7, "ts": 1716454223214865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170149, "dur": 7, "args": { "External id": 114022, "cbid": 211, "correlation": 114022 } }, { "ph": "s", "id": 114022, "pid": 76337, "tid": -914061504, "ts": 1716454223170149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223214880, "dur": 14, "args": { "External id": 114028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114028, "pid": 5, "tid": 7, "ts": 1716454223214880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170180, "dur": 8, "args": { "External id": 114028, "cbid": 211, "correlation": 114028 } }, { "ph": "s", "id": 114028, "pid": 76337, "tid": -914061504, "ts": 1716454223170180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223214896, "dur": 19, "args": { "External id": 114048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114048, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114048, "pid": 5, "tid": 7, "ts": 1716454223214896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170271, "dur": 13, "args": { "External id": 114048, "cbid": 211, "correlation": 114048 } }, { "ph": "s", "id": 114048, "pid": 76337, "tid": -914061504, "ts": 1716454223170271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223214916, "dur": 4, "args": { "External id": 114060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114060, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 114060, "pid": 5, "tid": 7, "ts": 1716454223214916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170293, "dur": 6, "args": { "External id": 114060, "cbid": 211, "correlation": 114060 } }, { "ph": "s", "id": 114060, "pid": 76337, "tid": -914061504, "ts": 1716454223170293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223214921, "dur": 18, "args": { "External id": 114063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114063, "pid": 5, "tid": 7, "ts": 1716454223214921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170311, "dur": 7, "args": { "External id": 114063, "cbid": 211, "correlation": 114063 } }, { "ph": "s", "id": 114063, "pid": 76337, "tid": -914061504, "ts": 1716454223170311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223214940, "dur": 11, "args": { "External id": 114072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114072, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114072, "pid": 5, "tid": 7, "ts": 1716454223214940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170351, "dur": 10, "args": { "External id": 114072, "cbid": 211, "correlation": 114072 } }, { "ph": "s", "id": 114072, "pid": 76337, "tid": -914061504, "ts": 1716454223170351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223170407, "dur": 0, "args": { "External id": 114082, "cbid": 317, "correlation": 114082 } }, { "ph": "f", "id": 114082, "pid": 76337, "tid": -914061504, "ts": 1716454223170407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223170408, "dur": 0, "args": { "External id": 114083, "cbid": 203, "correlation": 114083 } }, { "ph": "f", "id": 114083, "pid": 76337, "tid": -914061504, "ts": 1716454223170408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223170409, "dur": 0, "args": { "External id": 114084, "cbid": 205, "correlation": 114084 } }, { "ph": "f", "id": 114084, "pid": 76337, "tid": -914061504, "ts": 1716454223170409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223214953, "dur": 11, "args": { "External id": 114088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114088, "pid": 5, "tid": 7, "ts": 1716454223214953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170424, "dur": 11, "args": { "External id": 114088, "cbid": 211, "correlation": 114088 } }, { "ph": "s", "id": 114088, "pid": 76337, "tid": -914061504, "ts": 1716454223170424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223214965, "dur": 25, "args": { "External id": 114090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114090, "pid": 5, "tid": 7, "ts": 1716454223214965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170438, "dur": 5, "args": { "External id": 114090, "cbid": 211, "correlation": 114090 } }, { "ph": "s", "id": 114090, "pid": 76337, "tid": -914061504, "ts": 1716454223170438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223214991, "dur": 4, "args": { "External id": 114092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 114092, "pid": 5, "tid": 7, "ts": 1716454223214991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170449, "dur": 5, "args": { "External id": 114092, "cbid": 211, "correlation": 114092 } }, { "ph": "s", "id": 114092, "pid": 76337, "tid": -914061504, "ts": 1716454223170449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223170459, "dur": 0, "args": { "External id": 114093, "cbid": 51, "correlation": 114093 } }, { "ph": "s", "id": 114093, "pid": 76337, "tid": -914061504, "ts": 1716454223170459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223214996, "dur": 355, "args": { "External id": 114094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114094, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114094, "pid": 5, "tid": 7, "ts": 1716454223214996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170460, "dur": 8, "args": { "External id": 114094, "cbid": 211, "correlation": 114094 } }, { "ph": "s", "id": 114094, "pid": 76337, "tid": -914061504, "ts": 1716454223170460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223215353, "dur": 20, "args": { "External id": 114095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114095, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114095, "pid": 5, "tid": 7, "ts": 1716454223215353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170471, "dur": 5, "args": { "External id": 114095, "cbid": 211, "correlation": 114095 } }, { "ph": "s", "id": 114095, "pid": 76337, "tid": -914061504, "ts": 1716454223170471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223215374, "dur": 33, "args": { "External id": 114101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114101, "pid": 5, "tid": 7, "ts": 1716454223215374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170499, "dur": 9, "args": { "External id": 114101, "cbid": 211, "correlation": 114101 } }, { "ph": "s", "id": 114101, "pid": 76337, "tid": -914061504, "ts": 1716454223170499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223215408, "dur": 5, "args": { "External id": 114109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114109, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 114109, "pid": 5, "tid": 7, "ts": 1716454223215408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170543, "dur": 9, "args": { "External id": 114109, "cbid": 211, "correlation": 114109 } }, { "ph": "s", "id": 114109, "pid": 76337, "tid": -914061504, "ts": 1716454223170543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223170608, "dur": 1, "args": { "External id": 114125, "cbid": 251, "correlation": 114125 } }, { "ph": "f", "id": 114125, "pid": 76337, "tid": -914061504, "ts": 1716454223170608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223170613, "dur": 0, "args": { "External id": 114127, "cbid": 251, "correlation": 114127 } }, { "ph": "f", "id": 114127, "pid": 76337, "tid": -914061504, "ts": 1716454223170613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223215414, "dur": 13, "args": { "External id": 114128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114128, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 114128, "pid": 5, "tid": 7, "ts": 1716454223215414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170615, "dur": 12, "args": { "External id": 114128, "cbid": 211, "correlation": 114128 } }, { "ph": "s", "id": 114128, "pid": 76337, "tid": -914061504, "ts": 1716454223170615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223215428, "dur": 5, "args": { "External id": 114130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 114130, "pid": 5, "tid": 7, "ts": 1716454223215428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170629, "dur": 6, "args": { "External id": 114130, "cbid": 211, "correlation": 114130 } }, { "ph": "s", "id": 114130, "pid": 76337, "tid": -914061504, "ts": 1716454223170629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223215435, "dur": 30, "args": { "External id": 114140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114140, "pid": 5, "tid": 7, "ts": 1716454223215435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170687, "dur": 11, "args": { "External id": 114140, "cbid": 211, "correlation": 114140 } }, { "ph": "s", "id": 114140, "pid": 76337, "tid": -914061504, "ts": 1716454223170687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223215466, "dur": 31, "args": { "External id": 114160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114160, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114160, "pid": 5, "tid": 7, "ts": 1716454223215466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170752, "dur": 11, "args": { "External id": 114160, "cbid": 211, "correlation": 114160 } }, { "ph": "s", "id": 114160, "pid": 76337, "tid": -914061504, "ts": 1716454223170752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223215498, "dur": 4, "args": { "External id": 114172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114172, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 114172, "pid": 5, "tid": 7, "ts": 1716454223215498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170773, "dur": 7, "args": { "External id": 114172, "cbid": 211, "correlation": 114172 } }, { "ph": "s", "id": 114172, "pid": 76337, "tid": -914061504, "ts": 1716454223170773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223215504, "dur": 29, "args": { "External id": 114175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114175, "pid": 5, "tid": 7, "ts": 1716454223215504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170792, "dur": 7, "args": { "External id": 114175, "cbid": 211, "correlation": 114175 } }, { "ph": "s", "id": 114175, "pid": 76337, "tid": -914061504, "ts": 1716454223170792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223215534, "dur": 21, "args": { "External id": 114184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114184, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114184, "pid": 5, "tid": 7, "ts": 1716454223215534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170833, "dur": 9, "args": { "External id": 114184, "cbid": 211, "correlation": 114184 } }, { "ph": "s", "id": 114184, "pid": 76337, "tid": -914061504, "ts": 1716454223170833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223170896, "dur": 0, "args": { "External id": 114194, "cbid": 317, "correlation": 114194 } }, { "ph": "f", "id": 114194, "pid": 76337, "tid": -914061504, "ts": 1716454223170896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223170897, "dur": 0, "args": { "External id": 114195, "cbid": 203, "correlation": 114195 } }, { "ph": "f", "id": 114195, "pid": 76337, "tid": -914061504, "ts": 1716454223170897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223170898, "dur": 0, "args": { "External id": 114196, "cbid": 205, "correlation": 114196 } }, { "ph": "f", "id": 114196, "pid": 76337, "tid": -914061504, "ts": 1716454223170898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223215557, "dur": 23, "args": { "External id": 114200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114200, "pid": 5, "tid": 7, "ts": 1716454223215557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170917, "dur": 12, "args": { "External id": 114200, "cbid": 211, "correlation": 114200 } }, { "ph": "s", "id": 114200, "pid": 76337, "tid": -914061504, "ts": 1716454223170917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223215581, "dur": 44, "args": { "External id": 114202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114202, "pid": 5, "tid": 7, "ts": 1716454223215581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170932, "dur": 5, "args": { "External id": 114202, "cbid": 211, "correlation": 114202 } }, { "ph": "s", "id": 114202, "pid": 76337, "tid": -914061504, "ts": 1716454223170932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223215627, "dur": 651, "args": { "External id": 114204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114204, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114204, "pid": 5, "tid": 7, "ts": 1716454223215627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170946, "dur": 10, "args": { "External id": 114204, "cbid": 211, "correlation": 114204 } }, { "ph": "s", "id": 114204, "pid": 76337, "tid": -914061504, "ts": 1716454223170946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223216279, "dur": 21, "args": { "External id": 114206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114206, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114206, "pid": 5, "tid": 7, "ts": 1716454223216279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223170960, "dur": 5, "args": { "External id": 114206, "cbid": 211, "correlation": 114206 } }, { "ph": "s", "id": 114206, "pid": 76337, "tid": -914061504, "ts": 1716454223170960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223216301, "dur": 33, "args": { "External id": 114212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114212, "pid": 5, "tid": 7, "ts": 1716454223216301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171005, "dur": 9, "args": { "External id": 114212, "cbid": 211, "correlation": 114212 } }, { "ph": "s", "id": 114212, "pid": 76337, "tid": -914061504, "ts": 1716454223171005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223171066, "dur": 0, "args": { "External id": 114222, "cbid": 317, "correlation": 114222 } }, { "ph": "f", "id": 114222, "pid": 76337, "tid": -914061504, "ts": 1716454223171066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223171067, "dur": 0, "args": { "External id": 114223, "cbid": 203, "correlation": 114223 } }, { "ph": "f", "id": 114223, "pid": 76337, "tid": -914061504, "ts": 1716454223171067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223171068, "dur": 0, "args": { "External id": 114224, "cbid": 205, "correlation": 114224 } }, { "ph": "f", "id": 114224, "pid": 76337, "tid": -914061504, "ts": 1716454223171068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171090, "dur": 1, "args": { "External id": 114228, "cbid": 251, "correlation": 114228 } }, { "ph": "f", "id": 114228, "pid": 76337, "tid": -914061504, "ts": 1716454223171090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171092, "dur": 0, "args": { "External id": 114229, "cbid": 251, "correlation": 114229 } }, { "ph": "f", "id": 114229, "pid": 76337, "tid": -914061504, "ts": 1716454223171092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171093, "dur": 0, "args": { "External id": 114230, "cbid": 251, "correlation": 114230 } }, { "ph": "f", "id": 114230, "pid": 76337, "tid": -914061504, "ts": 1716454223171093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171093, "dur": 0, "args": { "External id": 114231, "cbid": 251, "correlation": 114231 } }, { "ph": "f", "id": 114231, "pid": 76337, "tid": -914061504, "ts": 1716454223171093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171094, "dur": 0, "args": { "External id": 114232, "cbid": 251, "correlation": 114232 } }, { "ph": "f", "id": 114232, "pid": 76337, "tid": -914061504, "ts": 1716454223171094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171095, "dur": 0, "args": { "External id": 114233, "cbid": 251, "correlation": 114233 } }, { "ph": "f", "id": 114233, "pid": 76337, "tid": -914061504, "ts": 1716454223171095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171095, "dur": 0, "args": { "External id": 114234, "cbid": 251, "correlation": 114234 } }, { "ph": "f", "id": 114234, "pid": 76337, "tid": -914061504, "ts": 1716454223171095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171096, "dur": 0, "args": { "External id": 114235, "cbid": 251, "correlation": 114235 } }, { "ph": "f", "id": 114235, "pid": 76337, "tid": -914061504, "ts": 1716454223171096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171097, "dur": 0, "args": { "External id": 114236, "cbid": 251, "correlation": 114236 } }, { "ph": "f", "id": 114236, "pid": 76337, "tid": -914061504, "ts": 1716454223171097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223216335, "dur": 52, "args": { "External id": 114237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114237, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 114237, "pid": 5, "tid": 7, "ts": 1716454223216335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171099, "dur": 12, "args": { "External id": 114237, "cbid": 211, "correlation": 114237 } }, { "ph": "s", "id": 114237, "pid": 76337, "tid": -914061504, "ts": 1716454223171099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223216388, "dur": 32, "args": { "External id": 114243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114243, "pid": 5, "tid": 7, "ts": 1716454223216388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171132, "dur": 8, "args": { "External id": 114243, "cbid": 211, "correlation": 114243 } }, { "ph": "s", "id": 114243, "pid": 76337, "tid": -914061504, "ts": 1716454223171132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223216422, "dur": 27, "args": { "External id": 114251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114251, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114251, "pid": 5, "tid": 7, "ts": 1716454223216422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171161, "dur": 8, "args": { "External id": 114251, "cbid": 211, "correlation": 114251 } }, { "ph": "s", "id": 114251, "pid": 76337, "tid": -914061504, "ts": 1716454223171161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223216450, "dur": 20, "args": { "External id": 114259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114259, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114259, "pid": 5, "tid": 7, "ts": 1716454223216450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171191, "dur": 8, "args": { "External id": 114259, "cbid": 211, "correlation": 114259 } }, { "ph": "s", "id": 114259, "pid": 76337, "tid": -914061504, "ts": 1716454223171191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223216470, "dur": 30, "args": { "External id": 114279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114279, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114279, "pid": 5, "tid": 7, "ts": 1716454223216470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171272, "dur": 13, "args": { "External id": 114279, "cbid": 211, "correlation": 114279 } }, { "ph": "s", "id": 114279, "pid": 76337, "tid": -914061504, "ts": 1716454223171272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223216502, "dur": 4, "args": { "External id": 114291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114291, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 114291, "pid": 5, "tid": 7, "ts": 1716454223216502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171295, "dur": 6, "args": { "External id": 114291, "cbid": 211, "correlation": 114291 } }, { "ph": "s", "id": 114291, "pid": 76337, "tid": -914061504, "ts": 1716454223171295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223216508, "dur": 30, "args": { "External id": 114294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114294, "pid": 5, "tid": 7, "ts": 1716454223216508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171313, "dur": 6, "args": { "External id": 114294, "cbid": 211, "correlation": 114294 } }, { "ph": "s", "id": 114294, "pid": 76337, "tid": -914061504, "ts": 1716454223171313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223171371, "dur": 0, "args": { "External id": 114305, "cbid": 317, "correlation": 114305 } }, { "ph": "f", "id": 114305, "pid": 76337, "tid": -914061504, "ts": 1716454223171371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223171372, "dur": 0, "args": { "External id": 114306, "cbid": 203, "correlation": 114306 } }, { "ph": "f", "id": 114306, "pid": 76337, "tid": -914061504, "ts": 1716454223171372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223171372, "dur": 0, "args": { "External id": 114307, "cbid": 205, "correlation": 114307 } }, { "ph": "f", "id": 114307, "pid": 76337, "tid": -914061504, "ts": 1716454223171372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223216539, "dur": 22, "args": { "External id": 114311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114311, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114311, "pid": 5, "tid": 7, "ts": 1716454223216539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171389, "dur": 12, "args": { "External id": 114311, "cbid": 211, "correlation": 114311 } }, { "ph": "s", "id": 114311, "pid": 76337, "tid": -914061504, "ts": 1716454223171389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223216562, "dur": 121, "args": { "External id": 114313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114313, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114313, "pid": 5, "tid": 7, "ts": 1716454223216562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171409, "dur": 8, "args": { "External id": 114313, "cbid": 211, "correlation": 114313 } }, { "ph": "s", "id": 114313, "pid": 76337, "tid": -914061504, "ts": 1716454223171409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223216684, "dur": 21, "args": { "External id": 114315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114315, "pid": 5, "tid": 7, "ts": 1716454223216684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171422, "dur": 6, "args": { "External id": 114315, "cbid": 211, "correlation": 114315 } }, { "ph": "s", "id": 114315, "pid": 76337, "tid": -914061504, "ts": 1716454223171422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223216706, "dur": 32, "args": { "External id": 114321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114321, "pid": 5, "tid": 7, "ts": 1716454223216706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171450, "dur": 8, "args": { "External id": 114321, "cbid": 211, "correlation": 114321 } }, { "ph": "s", "id": 114321, "pid": 76337, "tid": -914061504, "ts": 1716454223171450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223216740, "dur": 188, "args": { "External id": 114330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114330, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114330, "pid": 5, "tid": 7, "ts": 1716454223216740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171533, "dur": 14, "args": { "External id": 114330, "cbid": 211, "correlation": 114330 } }, { "ph": "s", "id": 114330, "pid": 76337, "tid": -914061504, "ts": 1716454223171533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223216929, "dur": 66, "args": { "External id": 114352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114352, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114352, "pid": 5, "tid": 7, "ts": 1716454223216929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171591, "dur": 10, "args": { "External id": 114352, "cbid": 211, "correlation": 114352 } }, { "ph": "s", "id": 114352, "pid": 76337, "tid": -914061504, "ts": 1716454223171591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171683, "dur": 1, "args": { "External id": 114363, "cbid": 251, "correlation": 114363 } }, { "ph": "f", "id": 114363, "pid": 76337, "tid": -914061504, "ts": 1716454223171683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223216996, "dur": 155, "args": { "External id": 114364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114364, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114364, "pid": 5, "tid": 7, "ts": 1716454223216996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171688, "dur": 13, "args": { "External id": 114364, "cbid": 211, "correlation": 114364 } }, { "ph": "s", "id": 114364, "pid": 76337, "tid": -914061504, "ts": 1716454223171688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171758, "dur": 1, "args": { "External id": 114375, "cbid": 251, "correlation": 114375 } }, { "ph": "f", "id": 114375, "pid": 76337, "tid": -914061504, "ts": 1716454223171758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223217152, "dur": 148, "args": { "External id": 114376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114376, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114376, "pid": 5, "tid": 7, "ts": 1716454223217152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171762, "dur": 11, "args": { "External id": 114376, "cbid": 211, "correlation": 114376 } }, { "ph": "s", "id": 114376, "pid": 76337, "tid": -914061504, "ts": 1716454223171762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223171827, "dur": 1, "args": { "External id": 114387, "cbid": 251, "correlation": 114387 } }, { "ph": "f", "id": 114387, "pid": 76337, "tid": -914061504, "ts": 1716454223171827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223217302, "dur": 149, "args": { "External id": 114388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114388, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114388, "pid": 5, "tid": 7, "ts": 1716454223217302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171831, "dur": 11, "args": { "External id": 114388, "cbid": 211, "correlation": 114388 } }, { "ph": "s", "id": 114388, "pid": 76337, "tid": -914061504, "ts": 1716454223171831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223217452, "dur": 1947, "args": { "External id": 114409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114409, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 114409, "pid": 5, "tid": 7, "ts": 1716454223217452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223171914, "dur": 13, "args": { "External id": 114409, "cbid": 211, "correlation": 114409 } }, { "ph": "s", "id": 114409, "pid": 76337, "tid": -914061504, "ts": 1716454223171914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172021, "dur": 1, "args": { "External id": 114427, "cbid": 251, "correlation": 114427 } }, { "ph": "f", "id": 114427, "pid": 76337, "tid": -914061504, "ts": 1716454223172021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223219400, "dur": 147, "args": { "External id": 114429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114429, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 114429, "pid": 5, "tid": 7, "ts": 1716454223219400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172026, "dur": 14, "args": { "External id": 114429, "cbid": 211, "correlation": 114429 } }, { "ph": "s", "id": 114429, "pid": 76337, "tid": -914061504, "ts": 1716454223172026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223219549, "dur": 35, "args": { "External id": 114437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114437, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114437, "pid": 5, "tid": 7, "ts": 1716454223219549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172099, "dur": 12, "args": { "External id": 114437, "cbid": 211, "correlation": 114437 } }, { "ph": "s", "id": 114437, "pid": 76337, "tid": -914061504, "ts": 1716454223172099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223219585, "dur": 50, "args": { "External id": 114445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114445, "pid": 5, "tid": 7, "ts": 1716454223219585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172138, "dur": 9, "args": { "External id": 114445, "cbid": 211, "correlation": 114445 } }, { "ph": "s", "id": 114445, "pid": 76337, "tid": -914061504, "ts": 1716454223172138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223219637, "dur": 31, "args": { "External id": 114456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114456, "pid": 5, "tid": 7, "ts": 1716454223219637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172211, "dur": 13, "args": { "External id": 114456, "cbid": 211, "correlation": 114456 } }, { "ph": "s", "id": 114456, "pid": 76337, "tid": -914061504, "ts": 1716454223172211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223219669, "dur": 35, "args": { "External id": 114478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114478, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114478, "pid": 5, "tid": 7, "ts": 1716454223219669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172244, "dur": 7, "args": { "External id": 114478, "cbid": 211, "correlation": 114478 } }, { "ph": "s", "id": 114478, "pid": 76337, "tid": -914061504, "ts": 1716454223172244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172327, "dur": 1, "args": { "External id": 114489, "cbid": 251, "correlation": 114489 } }, { "ph": "f", "id": 114489, "pid": 76337, "tid": -914061504, "ts": 1716454223172327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223219705, "dur": 90, "args": { "External id": 114490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114490, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114490, "pid": 5, "tid": 7, "ts": 1716454223219705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172332, "dur": 14, "args": { "External id": 114490, "cbid": 211, "correlation": 114490 } }, { "ph": "s", "id": 114490, "pid": 76337, "tid": -914061504, "ts": 1716454223172332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172403, "dur": 1, "args": { "External id": 114501, "cbid": 251, "correlation": 114501 } }, { "ph": "f", "id": 114501, "pid": 76337, "tid": -914061504, "ts": 1716454223172403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172406, "dur": 0, "args": { "External id": 114502, "cbid": 251, "correlation": 114502 } }, { "ph": "f", "id": 114502, "pid": 76337, "tid": -914061504, "ts": 1716454223172406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223219797, "dur": 11, "args": { "External id": 114503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114503, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 114503, "pid": 5, "tid": 7, "ts": 1716454223219797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172408, "dur": 12, "args": { "External id": 114503, "cbid": 211, "correlation": 114503 } }, { "ph": "s", "id": 114503, "pid": 76337, "tid": -914061504, "ts": 1716454223172408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223219809, "dur": 5, "args": { "External id": 114505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114505, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 114505, "pid": 5, "tid": 7, "ts": 1716454223219809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172423, "dur": 7, "args": { "External id": 114505, "cbid": 211, "correlation": 114505 } }, { "ph": "s", "id": 114505, "pid": 76337, "tid": -914061504, "ts": 1716454223172423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172482, "dur": 1, "args": { "External id": 114516, "cbid": 251, "correlation": 114516 } }, { "ph": "f", "id": 114516, "pid": 76337, "tid": -914061504, "ts": 1716454223172482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172486, "dur": 0, "args": { "External id": 114517, "cbid": 251, "correlation": 114517 } }, { "ph": "f", "id": 114517, "pid": 76337, "tid": -914061504, "ts": 1716454223172486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223219815, "dur": 7, "args": { "External id": 114518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114518, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 114518, "pid": 5, "tid": 7, "ts": 1716454223219815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172487, "dur": 12, "args": { "External id": 114518, "cbid": 211, "correlation": 114518 } }, { "ph": "s", "id": 114518, "pid": 76337, "tid": -914061504, "ts": 1716454223172487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223219824, "dur": 3, "args": { "External id": 114520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114520, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 114520, "pid": 5, "tid": 7, "ts": 1716454223219824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172501, "dur": 5, "args": { "External id": 114520, "cbid": 211, "correlation": 114520 } }, { "ph": "s", "id": 114520, "pid": 76337, "tid": -914061504, "ts": 1716454223172501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223219828, "dur": 91, "args": { "External id": 114541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114541, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 114541, "pid": 5, "tid": 7, "ts": 1716454223219828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172574, "dur": 13, "args": { "External id": 114541, "cbid": 211, "correlation": 114541 } }, { "ph": "s", "id": 114541, "pid": 76337, "tid": -914061504, "ts": 1716454223172574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172670, "dur": 1, "args": { "External id": 114559, "cbid": 251, "correlation": 114559 } }, { "ph": "f", "id": 114559, "pid": 76337, "tid": -914061504, "ts": 1716454223172670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223219920, "dur": 101, "args": { "External id": 114561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114561, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114561, "pid": 5, "tid": 7, "ts": 1716454223219920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172676, "dur": 13, "args": { "External id": 114561, "cbid": 211, "correlation": 114561 } }, { "ph": "s", "id": 114561, "pid": 76337, "tid": -914061504, "ts": 1716454223172676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223220023, "dur": 19, "args": { "External id": 114569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114569, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114569, "pid": 5, "tid": 7, "ts": 1716454223220023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172745, "dur": 12, "args": { "External id": 114569, "cbid": 211, "correlation": 114569 } }, { "ph": "s", "id": 114569, "pid": 76337, "tid": -914061504, "ts": 1716454223172745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223220043, "dur": 37, "args": { "External id": 114577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114577, "pid": 5, "tid": 7, "ts": 1716454223220043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172786, "dur": 10, "args": { "External id": 114577, "cbid": 211, "correlation": 114577 } }, { "ph": "s", "id": 114577, "pid": 76337, "tid": -914061504, "ts": 1716454223172786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223220082, "dur": 34, "args": { "External id": 114599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114599, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114599, "pid": 5, "tid": 7, "ts": 1716454223220082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172837, "dur": 11, "args": { "External id": 114599, "cbid": 211, "correlation": 114599 } }, { "ph": "s", "id": 114599, "pid": 76337, "tid": -914061504, "ts": 1716454223172837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172927, "dur": 1, "args": { "External id": 114615, "cbid": 251, "correlation": 114615 } }, { "ph": "f", "id": 114615, "pid": 76337, "tid": -914061504, "ts": 1716454223172927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223172932, "dur": 0, "args": { "External id": 114617, "cbid": 251, "correlation": 114617 } }, { "ph": "f", "id": 114617, "pid": 76337, "tid": -914061504, "ts": 1716454223172932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223220117, "dur": 542, "args": { "External id": 114618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114618, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 114618, "pid": 5, "tid": 7, "ts": 1716454223220117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223172935, "dur": 14, "args": { "External id": 114618, "cbid": 211, "correlation": 114618 } }, { "ph": "s", "id": 114618, "pid": 76337, "tid": -914061504, "ts": 1716454223172935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223220661, "dur": 125, "args": { "External id": 114626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114626, "pid": 5, "tid": 7, "ts": 1716454223220661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173010, "dur": 13, "args": { "External id": 114626, "cbid": 211, "correlation": 114626 } }, { "ph": "s", "id": 114626, "pid": 76337, "tid": -914061504, "ts": 1716454223173010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223220787, "dur": 127, "args": { "External id": 114634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114634, "pid": 5, "tid": 7, "ts": 1716454223220787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173040, "dur": 9, "args": { "External id": 114634, "cbid": 211, "correlation": 114634 } }, { "ph": "s", "id": 114634, "pid": 76337, "tid": -914061504, "ts": 1716454223173040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223173120, "dur": 1, "args": { "External id": 114650, "cbid": 251, "correlation": 114650 } }, { "ph": "f", "id": 114650, "pid": 76337, "tid": -914061504, "ts": 1716454223173120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223220915, "dur": 304, "args": { "External id": 114652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114652, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114652, "pid": 5, "tid": 7, "ts": 1716454223220915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173126, "dur": 12, "args": { "External id": 114652, "cbid": 211, "correlation": 114652 } }, { "ph": "s", "id": 114652, "pid": 76337, "tid": -914061504, "ts": 1716454223173126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223221221, "dur": 27, "args": { "External id": 114660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114660, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114660, "pid": 5, "tid": 7, "ts": 1716454223221221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173169, "dur": 10, "args": { "External id": 114660, "cbid": 211, "correlation": 114660 } }, { "ph": "s", "id": 114660, "pid": 76337, "tid": -914061504, "ts": 1716454223173169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223221249, "dur": 82, "args": { "External id": 114671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114671, "pid": 5, "tid": 7, "ts": 1716454223221249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173237, "dur": 13, "args": { "External id": 114671, "cbid": 211, "correlation": 114671 } }, { "ph": "s", "id": 114671, "pid": 76337, "tid": -914061504, "ts": 1716454223173237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223173301, "dur": 0, "args": { "External id": 114683, "cbid": 317, "correlation": 114683 } }, { "ph": "f", "id": 114683, "pid": 76337, "tid": -914061504, "ts": 1716454223173301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223173302, "dur": 0, "args": { "External id": 114684, "cbid": 203, "correlation": 114684 } }, { "ph": "f", "id": 114684, "pid": 76337, "tid": -914061504, "ts": 1716454223173302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223173303, "dur": 0, "args": { "External id": 114685, "cbid": 205, "correlation": 114685 } }, { "ph": "f", "id": 114685, "pid": 76337, "tid": -914061504, "ts": 1716454223173303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223221332, "dur": 23, "args": { "External id": 114689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114689, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114689, "pid": 5, "tid": 7, "ts": 1716454223221332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173318, "dur": 12, "args": { "External id": 114689, "cbid": 211, "correlation": 114689 } }, { "ph": "s", "id": 114689, "pid": 76337, "tid": -914061504, "ts": 1716454223173318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223221356, "dur": 120, "args": { "External id": 114691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114691, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114691, "pid": 5, "tid": 7, "ts": 1716454223221356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173337, "dur": 7, "args": { "External id": 114691, "cbid": 211, "correlation": 114691 } }, { "ph": "s", "id": 114691, "pid": 76337, "tid": -914061504, "ts": 1716454223173337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223221478, "dur": 22, "args": { "External id": 114693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114693, "pid": 5, "tid": 7, "ts": 1716454223221478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173349, "dur": 5, "args": { "External id": 114693, "cbid": 211, "correlation": 114693 } }, { "ph": "s", "id": 114693, "pid": 76337, "tid": -914061504, "ts": 1716454223173349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223221501, "dur": 32, "args": { "External id": 114699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114699, "pid": 5, "tid": 7, "ts": 1716454223221501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173376, "dur": 8, "args": { "External id": 114699, "cbid": 211, "correlation": 114699 } }, { "ph": "s", "id": 114699, "pid": 76337, "tid": -914061504, "ts": 1716454223173376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223221535, "dur": 27, "args": { "External id": 114707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114707, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114707, "pid": 5, "tid": 7, "ts": 1716454223221535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173408, "dur": 8, "args": { "External id": 114707, "cbid": 211, "correlation": 114707 } }, { "ph": "s", "id": 114707, "pid": 76337, "tid": -914061504, "ts": 1716454223173408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223221563, "dur": 32, "args": { "External id": 114727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114727, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114727, "pid": 5, "tid": 7, "ts": 1716454223221563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173480, "dur": 12, "args": { "External id": 114727, "cbid": 211, "correlation": 114727 } }, { "ph": "s", "id": 114727, "pid": 76337, "tid": -914061504, "ts": 1716454223173480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223221596, "dur": 5, "args": { "External id": 114739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114739, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 114739, "pid": 5, "tid": 7, "ts": 1716454223221596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173503, "dur": 6, "args": { "External id": 114739, "cbid": 211, "correlation": 114739 } }, { "ph": "s", "id": 114739, "pid": 76337, "tid": -914061504, "ts": 1716454223173503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223221602, "dur": 32, "args": { "External id": 114742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114742, "pid": 5, "tid": 7, "ts": 1716454223221602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173521, "dur": 6, "args": { "External id": 114742, "cbid": 211, "correlation": 114742 } }, { "ph": "s", "id": 114742, "pid": 76337, "tid": -914061504, "ts": 1716454223173521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223221635, "dur": 21, "args": { "External id": 114751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114751, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114751, "pid": 5, "tid": 7, "ts": 1716454223221635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173560, "dur": 10, "args": { "External id": 114751, "cbid": 211, "correlation": 114751 } }, { "ph": "s", "id": 114751, "pid": 76337, "tid": -914061504, "ts": 1716454223173560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223173611, "dur": 0, "args": { "External id": 114761, "cbid": 317, "correlation": 114761 } }, { "ph": "f", "id": 114761, "pid": 76337, "tid": -914061504, "ts": 1716454223173611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223173612, "dur": 0, "args": { "External id": 114762, "cbid": 203, "correlation": 114762 } }, { "ph": "f", "id": 114762, "pid": 76337, "tid": -914061504, "ts": 1716454223173612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223173613, "dur": 0, "args": { "External id": 114763, "cbid": 205, "correlation": 114763 } }, { "ph": "f", "id": 114763, "pid": 76337, "tid": -914061504, "ts": 1716454223173613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223221658, "dur": 24, "args": { "External id": 114767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114767, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114767, "pid": 5, "tid": 7, "ts": 1716454223221658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173625, "dur": 11, "args": { "External id": 114767, "cbid": 211, "correlation": 114767 } }, { "ph": "s", "id": 114767, "pid": 76337, "tid": -914061504, "ts": 1716454223173625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223221683, "dur": 44, "args": { "External id": 114769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114769, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114769, "pid": 5, "tid": 7, "ts": 1716454223221683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173639, "dur": 5, "args": { "External id": 114769, "cbid": 211, "correlation": 114769 } }, { "ph": "s", "id": 114769, "pid": 76337, "tid": -914061504, "ts": 1716454223173639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223221729, "dur": 653, "args": { "External id": 114771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114771, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114771, "pid": 5, "tid": 7, "ts": 1716454223221729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173651, "dur": 6, "args": { "External id": 114771, "cbid": 211, "correlation": 114771 } }, { "ph": "s", "id": 114771, "pid": 76337, "tid": -914061504, "ts": 1716454223173651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223222383, "dur": 22, "args": { "External id": 114773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114773, "pid": 5, "tid": 7, "ts": 1716454223222383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173661, "dur": 5, "args": { "External id": 114773, "cbid": 211, "correlation": 114773 } }, { "ph": "s", "id": 114773, "pid": 76337, "tid": -914061504, "ts": 1716454223173661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223222407, "dur": 33, "args": { "External id": 114779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114779, "pid": 5, "tid": 7, "ts": 1716454223222407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173687, "dur": 9, "args": { "External id": 114779, "cbid": 211, "correlation": 114779 } }, { "ph": "s", "id": 114779, "pid": 76337, "tid": -914061504, "ts": 1716454223173687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223222441, "dur": 4, "args": { "External id": 114787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114787, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 114787, "pid": 5, "tid": 7, "ts": 1716454223222441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173731, "dur": 9, "args": { "External id": 114787, "cbid": 211, "correlation": 114787 } }, { "ph": "s", "id": 114787, "pid": 76337, "tid": -914061504, "ts": 1716454223173731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223173795, "dur": 1, "args": { "External id": 114803, "cbid": 251, "correlation": 114803 } }, { "ph": "f", "id": 114803, "pid": 76337, "tid": -914061504, "ts": 1716454223173795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223173801, "dur": 0, "args": { "External id": 114805, "cbid": 251, "correlation": 114805 } }, { "ph": "f", "id": 114805, "pid": 76337, "tid": -914061504, "ts": 1716454223173801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223222446, "dur": 12, "args": { "External id": 114806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114806, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 114806, "pid": 5, "tid": 7, "ts": 1716454223222446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173803, "dur": 11, "args": { "External id": 114806, "cbid": 211, "correlation": 114806 } }, { "ph": "s", "id": 114806, "pid": 76337, "tid": -914061504, "ts": 1716454223173803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223222460, "dur": 5, "args": { "External id": 114808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 114808, "pid": 5, "tid": 7, "ts": 1716454223222460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173817, "dur": 5, "args": { "External id": 114808, "cbid": 211, "correlation": 114808 } }, { "ph": "s", "id": 114808, "pid": 76337, "tid": -914061504, "ts": 1716454223173817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223222466, "dur": 29, "args": { "External id": 114818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114818, "pid": 5, "tid": 7, "ts": 1716454223222466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173873, "dur": 12, "args": { "External id": 114818, "cbid": 211, "correlation": 114818 } }, { "ph": "s", "id": 114818, "pid": 76337, "tid": -914061504, "ts": 1716454223173873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223222496, "dur": 31, "args": { "External id": 114838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114838, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114838, "pid": 5, "tid": 7, "ts": 1716454223222496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173938, "dur": 11, "args": { "External id": 114838, "cbid": 211, "correlation": 114838 } }, { "ph": "s", "id": 114838, "pid": 76337, "tid": -914061504, "ts": 1716454223173938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223222529, "dur": 4, "args": { "External id": 114850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114850, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 114850, "pid": 5, "tid": 7, "ts": 1716454223222529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173960, "dur": 6, "args": { "External id": 114850, "cbid": 211, "correlation": 114850 } }, { "ph": "s", "id": 114850, "pid": 76337, "tid": -914061504, "ts": 1716454223173960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223222534, "dur": 30, "args": { "External id": 114853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114853, "pid": 5, "tid": 7, "ts": 1716454223222534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223173987, "dur": 7, "args": { "External id": 114853, "cbid": 211, "correlation": 114853 } }, { "ph": "s", "id": 114853, "pid": 76337, "tid": -914061504, "ts": 1716454223173987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223222565, "dur": 21, "args": { "External id": 114862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114862, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114862, "pid": 5, "tid": 7, "ts": 1716454223222565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174030, "dur": 9, "args": { "External id": 114862, "cbid": 211, "correlation": 114862 } }, { "ph": "s", "id": 114862, "pid": 76337, "tid": -914061504, "ts": 1716454223174030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223174092, "dur": 0, "args": { "External id": 114872, "cbid": 317, "correlation": 114872 } }, { "ph": "f", "id": 114872, "pid": 76337, "tid": -914061504, "ts": 1716454223174092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223174093, "dur": 0, "args": { "External id": 114873, "cbid": 203, "correlation": 114873 } }, { "ph": "f", "id": 114873, "pid": 76337, "tid": -914061504, "ts": 1716454223174093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223174093, "dur": 0, "args": { "External id": 114874, "cbid": 205, "correlation": 114874 } }, { "ph": "f", "id": 114874, "pid": 76337, "tid": -914061504, "ts": 1716454223174093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223222588, "dur": 24, "args": { "External id": 114878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114878, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114878, "pid": 5, "tid": 7, "ts": 1716454223222588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174107, "dur": 12, "args": { "External id": 114878, "cbid": 211, "correlation": 114878 } }, { "ph": "s", "id": 114878, "pid": 76337, "tid": -914061504, "ts": 1716454223174107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223222613, "dur": 44, "args": { "External id": 114880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114880, "pid": 5, "tid": 7, "ts": 1716454223222613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174122, "dur": 5, "args": { "External id": 114880, "cbid": 211, "correlation": 114880 } }, { "ph": "s", "id": 114880, "pid": 76337, "tid": -914061504, "ts": 1716454223174122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223222658, "dur": 643, "args": { "External id": 114882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114882, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114882, "pid": 5, "tid": 7, "ts": 1716454223222658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174133, "dur": 6, "args": { "External id": 114882, "cbid": 211, "correlation": 114882 } }, { "ph": "s", "id": 114882, "pid": 76337, "tid": -914061504, "ts": 1716454223174133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223223302, "dur": 22, "args": { "External id": 114884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114884, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114884, "pid": 5, "tid": 7, "ts": 1716454223223302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174142, "dur": 5, "args": { "External id": 114884, "cbid": 211, "correlation": 114884 } }, { "ph": "s", "id": 114884, "pid": 76337, "tid": -914061504, "ts": 1716454223174142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223223326, "dur": 34, "args": { "External id": 114890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114890, "pid": 5, "tid": 7, "ts": 1716454223223326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174170, "dur": 8, "args": { "External id": 114890, "cbid": 211, "correlation": 114890 } }, { "ph": "s", "id": 114890, "pid": 76337, "tid": -914061504, "ts": 1716454223174170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223223361, "dur": 27, "args": { "External id": 114898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114898, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114898, "pid": 5, "tid": 7, "ts": 1716454223223361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174202, "dur": 9, "args": { "External id": 114898, "cbid": 211, "correlation": 114898 } }, { "ph": "s", "id": 114898, "pid": 76337, "tid": -914061504, "ts": 1716454223174202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223223389, "dur": 20, "args": { "External id": 114906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114906, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114906, "pid": 5, "tid": 7, "ts": 1716454223223389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174231, "dur": 8, "args": { "External id": 114906, "cbid": 211, "correlation": 114906 } }, { "ph": "s", "id": 114906, "pid": 76337, "tid": -914061504, "ts": 1716454223174231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223223410, "dur": 29, "args": { "External id": 114926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114926, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 114926, "pid": 5, "tid": 7, "ts": 1716454223223410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174311, "dur": 12, "args": { "External id": 114926, "cbid": 211, "correlation": 114926 } }, { "ph": "s", "id": 114926, "pid": 76337, "tid": -914061504, "ts": 1716454223174311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223223441, "dur": 4, "args": { "External id": 114938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114938, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 114938, "pid": 5, "tid": 7, "ts": 1716454223223441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174333, "dur": 6, "args": { "External id": 114938, "cbid": 211, "correlation": 114938 } }, { "ph": "s", "id": 114938, "pid": 76337, "tid": -914061504, "ts": 1716454223174333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223223446, "dur": 29, "args": { "External id": 114941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114941, "pid": 5, "tid": 7, "ts": 1716454223223446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174350, "dur": 6, "args": { "External id": 114941, "cbid": 211, "correlation": 114941 } }, { "ph": "s", "id": 114941, "pid": 76337, "tid": -914061504, "ts": 1716454223174350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223174408, "dur": 0, "args": { "External id": 114952, "cbid": 317, "correlation": 114952 } }, { "ph": "f", "id": 114952, "pid": 76337, "tid": -914061504, "ts": 1716454223174408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223174409, "dur": 0, "args": { "External id": 114953, "cbid": 203, "correlation": 114953 } }, { "ph": "f", "id": 114953, "pid": 76337, "tid": -914061504, "ts": 1716454223174409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223174410, "dur": 0, "args": { "External id": 114954, "cbid": 205, "correlation": 114954 } }, { "ph": "f", "id": 114954, "pid": 76337, "tid": -914061504, "ts": 1716454223174410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223223477, "dur": 22, "args": { "External id": 114958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114958, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114958, "pid": 5, "tid": 7, "ts": 1716454223223477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174424, "dur": 12, "args": { "External id": 114958, "cbid": 211, "correlation": 114958 } }, { "ph": "s", "id": 114958, "pid": 76337, "tid": -914061504, "ts": 1716454223174424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223223500, "dur": 116, "args": { "External id": 114960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114960, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 114960, "pid": 5, "tid": 7, "ts": 1716454223223500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174441, "dur": 6, "args": { "External id": 114960, "cbid": 211, "correlation": 114960 } }, { "ph": "s", "id": 114960, "pid": 76337, "tid": -914061504, "ts": 1716454223174441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223223617, "dur": 21, "args": { "External id": 114962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114962, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114962, "pid": 5, "tid": 7, "ts": 1716454223223617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174451, "dur": 5, "args": { "External id": 114962, "cbid": 211, "correlation": 114962 } }, { "ph": "s", "id": 114962, "pid": 76337, "tid": -914061504, "ts": 1716454223174451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223223640, "dur": 32, "args": { "External id": 114968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114968, "pid": 5, "tid": 7, "ts": 1716454223223640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174478, "dur": 8, "args": { "External id": 114968, "cbid": 211, "correlation": 114968 } }, { "ph": "s", "id": 114968, "pid": 76337, "tid": -914061504, "ts": 1716454223174478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223223673, "dur": 169, "args": { "External id": 114977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114977, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114977, "pid": 5, "tid": 7, "ts": 1716454223223673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174560, "dur": 14, "args": { "External id": 114977, "cbid": 211, "correlation": 114977 } }, { "ph": "s", "id": 114977, "pid": 76337, "tid": -914061504, "ts": 1716454223174560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223223844, "dur": 65, "args": { "External id": 114999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 114999, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 114999, "pid": 5, "tid": 7, "ts": 1716454223223844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174617, "dur": 10, "args": { "External id": 114999, "cbid": 211, "correlation": 114999 } }, { "ph": "s", "id": 114999, "pid": 76337, "tid": -914061504, "ts": 1716454223174617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223174703, "dur": 1, "args": { "External id": 115010, "cbid": 251, "correlation": 115010 } }, { "ph": "f", "id": 115010, "pid": 76337, "tid": -914061504, "ts": 1716454223174703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223223910, "dur": 156, "args": { "External id": 115011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115011, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115011, "pid": 5, "tid": 7, "ts": 1716454223223910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174709, "dur": 14, "args": { "External id": 115011, "cbid": 211, "correlation": 115011 } }, { "ph": "s", "id": 115011, "pid": 76337, "tid": -914061504, "ts": 1716454223174709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223174779, "dur": 1, "args": { "External id": 115022, "cbid": 251, "correlation": 115022 } }, { "ph": "f", "id": 115022, "pid": 76337, "tid": -914061504, "ts": 1716454223174779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223224067, "dur": 146, "args": { "External id": 115023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115023, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115023, "pid": 5, "tid": 7, "ts": 1716454223224067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174783, "dur": 11, "args": { "External id": 115023, "cbid": 211, "correlation": 115023 } }, { "ph": "s", "id": 115023, "pid": 76337, "tid": -914061504, "ts": 1716454223174783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223174848, "dur": 1, "args": { "External id": 115034, "cbid": 251, "correlation": 115034 } }, { "ph": "f", "id": 115034, "pid": 76337, "tid": -914061504, "ts": 1716454223174848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223224215, "dur": 144, "args": { "External id": 115035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115035, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115035, "pid": 5, "tid": 7, "ts": 1716454223224215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174852, "dur": 11, "args": { "External id": 115035, "cbid": 211, "correlation": 115035 } }, { "ph": "s", "id": 115035, "pid": 76337, "tid": -914061504, "ts": 1716454223174852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223224360, "dur": 1946, "args": { "External id": 115056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115056, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 115056, "pid": 5, "tid": 7, "ts": 1716454223224360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223174934, "dur": 12, "args": { "External id": 115056, "cbid": 211, "correlation": 115056 } }, { "ph": "s", "id": 115056, "pid": 76337, "tid": -914061504, "ts": 1716454223174934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175040, "dur": 1, "args": { "External id": 115074, "cbid": 251, "correlation": 115074 } }, { "ph": "f", "id": 115074, "pid": 76337, "tid": -914061504, "ts": 1716454223175040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223226307, "dur": 147, "args": { "External id": 115076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115076, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115076, "pid": 5, "tid": 7, "ts": 1716454223226307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175045, "dur": 14, "args": { "External id": 115076, "cbid": 211, "correlation": 115076 } }, { "ph": "s", "id": 115076, "pid": 76337, "tid": -914061504, "ts": 1716454223175045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223226456, "dur": 35, "args": { "External id": 115084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115084, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115084, "pid": 5, "tid": 7, "ts": 1716454223226456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175116, "dur": 12, "args": { "External id": 115084, "cbid": 211, "correlation": 115084 } }, { "ph": "s", "id": 115084, "pid": 76337, "tid": -914061504, "ts": 1716454223175116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223226492, "dur": 50, "args": { "External id": 115092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115092, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115092, "pid": 5, "tid": 7, "ts": 1716454223226492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175155, "dur": 9, "args": { "External id": 115092, "cbid": 211, "correlation": 115092 } }, { "ph": "s", "id": 115092, "pid": 76337, "tid": -914061504, "ts": 1716454223175155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223226544, "dur": 30, "args": { "External id": 115103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115103, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115103, "pid": 5, "tid": 7, "ts": 1716454223226544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175227, "dur": 12, "args": { "External id": 115103, "cbid": 211, "correlation": 115103 } }, { "ph": "s", "id": 115103, "pid": 76337, "tid": -914061504, "ts": 1716454223175227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223226575, "dur": 35, "args": { "External id": 115125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115125, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115125, "pid": 5, "tid": 7, "ts": 1716454223226575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175256, "dur": 8, "args": { "External id": 115125, "cbid": 211, "correlation": 115125 } }, { "ph": "s", "id": 115125, "pid": 76337, "tid": -914061504, "ts": 1716454223175256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175342, "dur": 1, "args": { "External id": 115136, "cbid": 251, "correlation": 115136 } }, { "ph": "f", "id": 115136, "pid": 76337, "tid": -914061504, "ts": 1716454223175342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223226611, "dur": 89, "args": { "External id": 115137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115137, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115137, "pid": 5, "tid": 7, "ts": 1716454223226611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175347, "dur": 13, "args": { "External id": 115137, "cbid": 211, "correlation": 115137 } }, { "ph": "s", "id": 115137, "pid": 76337, "tid": -914061504, "ts": 1716454223175347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175417, "dur": 1, "args": { "External id": 115148, "cbid": 251, "correlation": 115148 } }, { "ph": "f", "id": 115148, "pid": 76337, "tid": -914061504, "ts": 1716454223175417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175420, "dur": 0, "args": { "External id": 115149, "cbid": 251, "correlation": 115149 } }, { "ph": "f", "id": 115149, "pid": 76337, "tid": -914061504, "ts": 1716454223175420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223226702, "dur": 12, "args": { "External id": 115150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115150, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 115150, "pid": 5, "tid": 7, "ts": 1716454223226702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175422, "dur": 12, "args": { "External id": 115150, "cbid": 211, "correlation": 115150 } }, { "ph": "s", "id": 115150, "pid": 76337, "tid": -914061504, "ts": 1716454223175422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223226715, "dur": 5, "args": { "External id": 115152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115152, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 115152, "pid": 5, "tid": 7, "ts": 1716454223226715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175435, "dur": 6, "args": { "External id": 115152, "cbid": 211, "correlation": 115152 } }, { "ph": "s", "id": 115152, "pid": 76337, "tid": -914061504, "ts": 1716454223175435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175493, "dur": 1, "args": { "External id": 115163, "cbid": 251, "correlation": 115163 } }, { "ph": "f", "id": 115163, "pid": 76337, "tid": -914061504, "ts": 1716454223175493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175497, "dur": 0, "args": { "External id": 115164, "cbid": 251, "correlation": 115164 } }, { "ph": "f", "id": 115164, "pid": 76337, "tid": -914061504, "ts": 1716454223175497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223226721, "dur": 7, "args": { "External id": 115165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115165, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 115165, "pid": 5, "tid": 7, "ts": 1716454223226721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175498, "dur": 12, "args": { "External id": 115165, "cbid": 211, "correlation": 115165 } }, { "ph": "s", "id": 115165, "pid": 76337, "tid": -914061504, "ts": 1716454223175498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223226730, "dur": 3, "args": { "External id": 115167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115167, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 115167, "pid": 5, "tid": 7, "ts": 1716454223226730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175511, "dur": 6, "args": { "External id": 115167, "cbid": 211, "correlation": 115167 } }, { "ph": "s", "id": 115167, "pid": 76337, "tid": -914061504, "ts": 1716454223175511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223226734, "dur": 92, "args": { "External id": 115188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115188, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 115188, "pid": 5, "tid": 7, "ts": 1716454223226734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175586, "dur": 12, "args": { "External id": 115188, "cbid": 211, "correlation": 115188 } }, { "ph": "s", "id": 115188, "pid": 76337, "tid": -914061504, "ts": 1716454223175586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175683, "dur": 1, "args": { "External id": 115206, "cbid": 251, "correlation": 115206 } }, { "ph": "f", "id": 115206, "pid": 76337, "tid": -914061504, "ts": 1716454223175683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223226827, "dur": 99, "args": { "External id": 115208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115208, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115208, "pid": 5, "tid": 7, "ts": 1716454223226827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175688, "dur": 13, "args": { "External id": 115208, "cbid": 211, "correlation": 115208 } }, { "ph": "s", "id": 115208, "pid": 76337, "tid": -914061504, "ts": 1716454223175688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223226928, "dur": 19, "args": { "External id": 115216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115216, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115216, "pid": 5, "tid": 7, "ts": 1716454223226928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175757, "dur": 13, "args": { "External id": 115216, "cbid": 211, "correlation": 115216 } }, { "ph": "s", "id": 115216, "pid": 76337, "tid": -914061504, "ts": 1716454223175757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223226948, "dur": 37, "args": { "External id": 115224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115224, "pid": 5, "tid": 7, "ts": 1716454223226948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175800, "dur": 9, "args": { "External id": 115224, "cbid": 211, "correlation": 115224 } }, { "ph": "s", "id": 115224, "pid": 76337, "tid": -914061504, "ts": 1716454223175800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223226986, "dur": 35, "args": { "External id": 115246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115246, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115246, "pid": 5, "tid": 7, "ts": 1716454223226986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175851, "dur": 10, "args": { "External id": 115246, "cbid": 211, "correlation": 115246 } }, { "ph": "s", "id": 115246, "pid": 76337, "tid": -914061504, "ts": 1716454223175851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175939, "dur": 1, "args": { "External id": 115262, "cbid": 251, "correlation": 115262 } }, { "ph": "f", "id": 115262, "pid": 76337, "tid": -914061504, "ts": 1716454223175939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223175944, "dur": 0, "args": { "External id": 115264, "cbid": 251, "correlation": 115264 } }, { "ph": "f", "id": 115264, "pid": 76337, "tid": -914061504, "ts": 1716454223175944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223227023, "dur": 541, "args": { "External id": 115265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115265, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115265, "pid": 5, "tid": 7, "ts": 1716454223227023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223175948, "dur": 12, "args": { "External id": 115265, "cbid": 211, "correlation": 115265 } }, { "ph": "s", "id": 115265, "pid": 76337, "tid": -914061504, "ts": 1716454223175948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223227565, "dur": 125, "args": { "External id": 115273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115273, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115273, "pid": 5, "tid": 7, "ts": 1716454223227565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176019, "dur": 14, "args": { "External id": 115273, "cbid": 211, "correlation": 115273 } }, { "ph": "s", "id": 115273, "pid": 76337, "tid": -914061504, "ts": 1716454223176019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223227691, "dur": 127, "args": { "External id": 115281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115281, "pid": 5, "tid": 7, "ts": 1716454223227691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176051, "dur": 8, "args": { "External id": 115281, "cbid": 211, "correlation": 115281 } }, { "ph": "s", "id": 115281, "pid": 76337, "tid": -914061504, "ts": 1716454223176051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223176130, "dur": 1, "args": { "External id": 115297, "cbid": 251, "correlation": 115297 } }, { "ph": "f", "id": 115297, "pid": 76337, "tid": -914061504, "ts": 1716454223176130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223227819, "dur": 302, "args": { "External id": 115299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115299, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115299, "pid": 5, "tid": 7, "ts": 1716454223227819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176136, "dur": 12, "args": { "External id": 115299, "cbid": 211, "correlation": 115299 } }, { "ph": "s", "id": 115299, "pid": 76337, "tid": -914061504, "ts": 1716454223176136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223228123, "dur": 27, "args": { "External id": 115307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115307, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115307, "pid": 5, "tid": 7, "ts": 1716454223228123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176177, "dur": 10, "args": { "External id": 115307, "cbid": 211, "correlation": 115307 } }, { "ph": "s", "id": 115307, "pid": 76337, "tid": -914061504, "ts": 1716454223176177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223228151, "dur": 81, "args": { "External id": 115318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115318, "pid": 5, "tid": 7, "ts": 1716454223228151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176244, "dur": 12, "args": { "External id": 115318, "cbid": 211, "correlation": 115318 } }, { "ph": "s", "id": 115318, "pid": 76337, "tid": -914061504, "ts": 1716454223176244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223176307, "dur": 0, "args": { "External id": 115330, "cbid": 317, "correlation": 115330 } }, { "ph": "f", "id": 115330, "pid": 76337, "tid": -914061504, "ts": 1716454223176307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223176308, "dur": 0, "args": { "External id": 115331, "cbid": 203, "correlation": 115331 } }, { "ph": "f", "id": 115331, "pid": 76337, "tid": -914061504, "ts": 1716454223176308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223176309, "dur": 0, "args": { "External id": 115332, "cbid": 205, "correlation": 115332 } }, { "ph": "f", "id": 115332, "pid": 76337, "tid": -914061504, "ts": 1716454223176309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228233, "dur": 24, "args": { "External id": 115336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115336, "pid": 5, "tid": 7, "ts": 1716454223228233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176324, "dur": 13, "args": { "External id": 115336, "cbid": 211, "correlation": 115336 } }, { "ph": "s", "id": 115336, "pid": 76337, "tid": -914061504, "ts": 1716454223176324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223228258, "dur": 122, "args": { "External id": 115338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115338, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115338, "pid": 5, "tid": 7, "ts": 1716454223228258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176343, "dur": 6, "args": { "External id": 115338, "cbid": 211, "correlation": 115338 } }, { "ph": "s", "id": 115338, "pid": 76337, "tid": -914061504, "ts": 1716454223176343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228382, "dur": 22, "args": { "External id": 115340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115340, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115340, "pid": 5, "tid": 7, "ts": 1716454223228382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176353, "dur": 5, "args": { "External id": 115340, "cbid": 211, "correlation": 115340 } }, { "ph": "s", "id": 115340, "pid": 76337, "tid": -914061504, "ts": 1716454223176353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223228405, "dur": 33, "args": { "External id": 115346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115346, "pid": 5, "tid": 7, "ts": 1716454223228405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176380, "dur": 8, "args": { "External id": 115346, "cbid": 211, "correlation": 115346 } }, { "ph": "s", "id": 115346, "pid": 76337, "tid": -914061504, "ts": 1716454223176380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223228439, "dur": 27, "args": { "External id": 115354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115354, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115354, "pid": 5, "tid": 7, "ts": 1716454223228439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176412, "dur": 8, "args": { "External id": 115354, "cbid": 211, "correlation": 115354 } }, { "ph": "s", "id": 115354, "pid": 76337, "tid": -914061504, "ts": 1716454223176412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223176483, "dur": 0, "args": { "External id": 115364, "cbid": 317, "correlation": 115364 } }, { "ph": "f", "id": 115364, "pid": 76337, "tid": -914061504, "ts": 1716454223176483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223176484, "dur": 0, "args": { "External id": 115365, "cbid": 203, "correlation": 115365 } }, { "ph": "f", "id": 115365, "pid": 76337, "tid": -914061504, "ts": 1716454223176484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223176485, "dur": 0, "args": { "External id": 115366, "cbid": 205, "correlation": 115366 } }, { "ph": "f", "id": 115366, "pid": 76337, "tid": -914061504, "ts": 1716454223176485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228467, "dur": 24, "args": { "External id": 115370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115370, "pid": 5, "tid": 7, "ts": 1716454223228467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176499, "dur": 12, "args": { "External id": 115370, "cbid": 211, "correlation": 115370 } }, { "ph": "s", "id": 115370, "pid": 76337, "tid": -914061504, "ts": 1716454223176499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228492, "dur": 44, "args": { "External id": 115372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115372, "pid": 5, "tid": 7, "ts": 1716454223228492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176514, "dur": 5, "args": { "External id": 115372, "cbid": 211, "correlation": 115372 } }, { "ph": "s", "id": 115372, "pid": 76337, "tid": -914061504, "ts": 1716454223176514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223228537, "dur": 234, "args": { "External id": 115374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115374, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 115374, "pid": 5, "tid": 7, "ts": 1716454223228537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176526, "dur": 7, "args": { "External id": 115374, "cbid": 211, "correlation": 115374 } }, { "ph": "s", "id": 115374, "pid": 76337, "tid": -914061504, "ts": 1716454223176526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228772, "dur": 6, "args": { "External id": 115376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115376, "pid": 5, "tid": 7, "ts": 1716454223228772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176536, "dur": 5, "args": { "External id": 115376, "cbid": 211, "correlation": 115376 } }, { "ph": "s", "id": 115376, "pid": 76337, "tid": -914061504, "ts": 1716454223176536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223228780, "dur": 9, "args": { "External id": 115382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115382, "pid": 5, "tid": 7, "ts": 1716454223228780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176562, "dur": 8, "args": { "External id": 115382, "cbid": 211, "correlation": 115382 } }, { "ph": "s", "id": 115382, "pid": 76337, "tid": -914061504, "ts": 1716454223176562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223228790, "dur": 12, "args": { "External id": 115402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115402, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 115402, "pid": 5, "tid": 7, "ts": 1716454223228790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176653, "dur": 12, "args": { "External id": 115402, "cbid": 211, "correlation": 115402 } }, { "ph": "s", "id": 115402, "pid": 76337, "tid": -914061504, "ts": 1716454223176653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223228803, "dur": 4, "args": { "External id": 115414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115414, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 115414, "pid": 5, "tid": 7, "ts": 1716454223228803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176675, "dur": 7, "args": { "External id": 115414, "cbid": 211, "correlation": 115414 } }, { "ph": "s", "id": 115414, "pid": 76337, "tid": -914061504, "ts": 1716454223176675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223228808, "dur": 12, "args": { "External id": 115417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115417, "pid": 5, "tid": 7, "ts": 1716454223228808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176693, "dur": 6, "args": { "External id": 115417, "cbid": 211, "correlation": 115417 } }, { "ph": "s", "id": 115417, "pid": 76337, "tid": -914061504, "ts": 1716454223176693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223228822, "dur": 7, "args": { "External id": 115426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115426, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115426, "pid": 5, "tid": 7, "ts": 1716454223228822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176732, "dur": 10, "args": { "External id": 115426, "cbid": 211, "correlation": 115426 } }, { "ph": "s", "id": 115426, "pid": 76337, "tid": -914061504, "ts": 1716454223176732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223176784, "dur": 0, "args": { "External id": 115436, "cbid": 317, "correlation": 115436 } }, { "ph": "f", "id": 115436, "pid": 76337, "tid": -914061504, "ts": 1716454223176784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223176785, "dur": 0, "args": { "External id": 115437, "cbid": 203, "correlation": 115437 } }, { "ph": "f", "id": 115437, "pid": 76337, "tid": -914061504, "ts": 1716454223176785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223176786, "dur": 0, "args": { "External id": 115438, "cbid": 205, "correlation": 115438 } }, { "ph": "f", "id": 115438, "pid": 76337, "tid": -914061504, "ts": 1716454223176786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228830, "dur": 6, "args": { "External id": 115442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115442, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115442, "pid": 5, "tid": 7, "ts": 1716454223228830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176801, "dur": 11, "args": { "External id": 115442, "cbid": 211, "correlation": 115442 } }, { "ph": "s", "id": 115442, "pid": 76337, "tid": -914061504, "ts": 1716454223176801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223228836, "dur": 83, "args": { "External id": 115444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115444, "pid": 5, "tid": 7, "ts": 1716454223228836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176815, "dur": 5, "args": { "External id": 115444, "cbid": 211, "correlation": 115444 } }, { "ph": "s", "id": 115444, "pid": 76337, "tid": -914061504, "ts": 1716454223176815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223228922, "dur": 1, "args": { "External id": 115446, "device": 5, "context": 1, "stream": 7, "correlation": 115446, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 115446, "pid": 5, "tid": 7, "ts": 1716454223228922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223176828, "dur": 8, "args": { "External id": 115446, "cbid": 51, "correlation": 115446 } }, { "ph": "s", "id": 115446, "pid": 76337, "tid": -914061504, "ts": 1716454223176828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223228926, "dur": 539, "args": { "External id": 115447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115447, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115447, "pid": 5, "tid": 7, "ts": 1716454223228926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176839, "dur": 8, "args": { "External id": 115447, "cbid": 211, "correlation": 115447 } }, { "ph": "s", "id": 115447, "pid": 76337, "tid": -914061504, "ts": 1716454223176839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223229466, "dur": 12, "args": { "External id": 115449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115449, "pid": 5, "tid": 7, "ts": 1716454223229466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176851, "dur": 5, "args": { "External id": 115449, "cbid": 211, "correlation": 115449 } }, { "ph": "s", "id": 115449, "pid": 76337, "tid": -914061504, "ts": 1716454223176851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223229479, "dur": 14, "args": { "External id": 115455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115455, "pid": 5, "tid": 7, "ts": 1716454223229479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176879, "dur": 9, "args": { "External id": 115455, "cbid": 211, "correlation": 115455 } }, { "ph": "s", "id": 115455, "pid": 76337, "tid": -914061504, "ts": 1716454223176879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223229495, "dur": 4, "args": { "External id": 115463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115463, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 115463, "pid": 5, "tid": 7, "ts": 1716454223229495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223176923, "dur": 10, "args": { "External id": 115463, "cbid": 211, "correlation": 115463 } }, { "ph": "s", "id": 115463, "pid": 76337, "tid": -914061504, "ts": 1716454223176923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223176995, "dur": 1, "args": { "External id": 115479, "cbid": 251, "correlation": 115479 } }, { "ph": "f", "id": 115479, "pid": 76337, "tid": -914061504, "ts": 1716454223176995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223177000, "dur": 0, "args": { "External id": 115481, "cbid": 251, "correlation": 115481 } }, { "ph": "f", "id": 115481, "pid": 76337, "tid": -914061504, "ts": 1716454223177000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223229500, "dur": 12, "args": { "External id": 115482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115482, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115482, "pid": 5, "tid": 7, "ts": 1716454223229500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177002, "dur": 11, "args": { "External id": 115482, "cbid": 211, "correlation": 115482 } }, { "ph": "s", "id": 115482, "pid": 76337, "tid": -914061504, "ts": 1716454223177002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223229514, "dur": 5, "args": { "External id": 115484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115484, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115484, "pid": 5, "tid": 7, "ts": 1716454223229514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177016, "dur": 5, "args": { "External id": 115484, "cbid": 211, "correlation": 115484 } }, { "ph": "s", "id": 115484, "pid": 76337, "tid": -914061504, "ts": 1716454223177016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223229521, "dur": 17, "args": { "External id": 115494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115494, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115494, "pid": 5, "tid": 7, "ts": 1716454223229521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177074, "dur": 13, "args": { "External id": 115494, "cbid": 211, "correlation": 115494 } }, { "ph": "s", "id": 115494, "pid": 76337, "tid": -914061504, "ts": 1716454223177074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223229539, "dur": 19, "args": { "External id": 115514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115514, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 115514, "pid": 5, "tid": 7, "ts": 1716454223229539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177142, "dur": 11, "args": { "External id": 115514, "cbid": 211, "correlation": 115514 } }, { "ph": "s", "id": 115514, "pid": 76337, "tid": -914061504, "ts": 1716454223177142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223229560, "dur": 5, "args": { "External id": 115526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115526, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 115526, "pid": 5, "tid": 7, "ts": 1716454223229560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177163, "dur": 6, "args": { "External id": 115526, "cbid": 211, "correlation": 115526 } }, { "ph": "s", "id": 115526, "pid": 76337, "tid": -914061504, "ts": 1716454223177163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223229566, "dur": 17, "args": { "External id": 115529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115529, "pid": 5, "tid": 7, "ts": 1716454223229566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177181, "dur": 7, "args": { "External id": 115529, "cbid": 211, "correlation": 115529 } }, { "ph": "s", "id": 115529, "pid": 76337, "tid": -914061504, "ts": 1716454223177181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223229584, "dur": 11, "args": { "External id": 115538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115538, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115538, "pid": 5, "tid": 7, "ts": 1716454223229584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177222, "dur": 10, "args": { "External id": 115538, "cbid": 211, "correlation": 115538 } }, { "ph": "s", "id": 115538, "pid": 76337, "tid": -914061504, "ts": 1716454223177222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223177284, "dur": 0, "args": { "External id": 115548, "cbid": 317, "correlation": 115548 } }, { "ph": "f", "id": 115548, "pid": 76337, "tid": -914061504, "ts": 1716454223177284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223177285, "dur": 0, "args": { "External id": 115549, "cbid": 203, "correlation": 115549 } }, { "ph": "f", "id": 115549, "pid": 76337, "tid": -914061504, "ts": 1716454223177285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223177286, "dur": 0, "args": { "External id": 115550, "cbid": 205, "correlation": 115550 } }, { "ph": "f", "id": 115550, "pid": 76337, "tid": -914061504, "ts": 1716454223177286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223229597, "dur": 11, "args": { "External id": 115554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115554, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115554, "pid": 5, "tid": 7, "ts": 1716454223229597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177303, "dur": 12, "args": { "External id": 115554, "cbid": 211, "correlation": 115554 } }, { "ph": "s", "id": 115554, "pid": 76337, "tid": -914061504, "ts": 1716454223177303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223229609, "dur": 163, "args": { "External id": 115556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115556, "pid": 5, "tid": 7, "ts": 1716454223229609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177318, "dur": 5, "args": { "External id": 115556, "cbid": 211, "correlation": 115556 } }, { "ph": "s", "id": 115556, "pid": 76337, "tid": -914061504, "ts": 1716454223177318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223229773, "dur": 1, "args": { "External id": 115558, "device": 5, "context": 1, "stream": 7, "correlation": 115558, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 115558, "pid": 5, "tid": 7, "ts": 1716454223229773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223177329, "dur": 7, "args": { "External id": 115558, "cbid": 51, "correlation": 115558 } }, { "ph": "s", "id": 115558, "pid": 76337, "tid": -914061504, "ts": 1716454223177329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223229777, "dur": 662, "args": { "External id": 115559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115559, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115559, "pid": 5, "tid": 7, "ts": 1716454223229777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177338, "dur": 6, "args": { "External id": 115559, "cbid": 211, "correlation": 115559 } }, { "ph": "s", "id": 115559, "pid": 76337, "tid": -914061504, "ts": 1716454223177338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223230441, "dur": 13, "args": { "External id": 115561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115561, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115561, "pid": 5, "tid": 7, "ts": 1716454223230441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177348, "dur": 5, "args": { "External id": 115561, "cbid": 211, "correlation": 115561 } }, { "ph": "s", "id": 115561, "pid": 76337, "tid": -914061504, "ts": 1716454223177348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223230455, "dur": 15, "args": { "External id": 115567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115567, "pid": 5, "tid": 7, "ts": 1716454223230455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177377, "dur": 9, "args": { "External id": 115567, "cbid": 211, "correlation": 115567 } }, { "ph": "s", "id": 115567, "pid": 76337, "tid": -914061504, "ts": 1716454223177377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223177435, "dur": 0, "args": { "External id": 115577, "cbid": 317, "correlation": 115577 } }, { "ph": "f", "id": 115577, "pid": 76337, "tid": -914061504, "ts": 1716454223177435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223177436, "dur": 0, "args": { "External id": 115578, "cbid": 203, "correlation": 115578 } }, { "ph": "f", "id": 115578, "pid": 76337, "tid": -914061504, "ts": 1716454223177436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223177437, "dur": 0, "args": { "External id": 115579, "cbid": 205, "correlation": 115579 } }, { "ph": "f", "id": 115579, "pid": 76337, "tid": -914061504, "ts": 1716454223177437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223230471, "dur": 8, "args": { "External id": 115583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115583, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115583, "pid": 5, "tid": 7, "ts": 1716454223230471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177451, "dur": 12, "args": { "External id": 115583, "cbid": 211, "correlation": 115583 } }, { "ph": "s", "id": 115583, "pid": 76337, "tid": -914061504, "ts": 1716454223177451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223230481, "dur": 3, "args": { "External id": 115585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115585, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115585, "pid": 5, "tid": 7, "ts": 1716454223230481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177468, "dur": 6, "args": { "External id": 115585, "cbid": 211, "correlation": 115585 } }, { "ph": "s", "id": 115585, "pid": 76337, "tid": -914061504, "ts": 1716454223177468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223177478, "dur": 0, "args": { "External id": 115586, "cbid": 51, "correlation": 115586 } }, { "ph": "s", "id": 115586, "pid": 76337, "tid": -914061504, "ts": 1716454223177478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223230486, "dur": 57, "args": { "External id": 115587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115587, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 115587, "pid": 5, "tid": 7, "ts": 1716454223230486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177479, "dur": 6, "args": { "External id": 115587, "cbid": 211, "correlation": 115587 } }, { "ph": "s", "id": 115587, "pid": 76337, "tid": -914061504, "ts": 1716454223177479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223230545, "dur": 14, "args": { "External id": 115592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115592, "pid": 5, "tid": 7, "ts": 1716454223230545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177504, "dur": 9, "args": { "External id": 115592, "cbid": 211, "correlation": 115592 } }, { "ph": "s", "id": 115592, "pid": 76337, "tid": -914061504, "ts": 1716454223177504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223230560, "dur": 11, "args": { "External id": 115600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115600, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115600, "pid": 5, "tid": 7, "ts": 1716454223230560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177535, "dur": 8, "args": { "External id": 115600, "cbid": 211, "correlation": 115600 } }, { "ph": "s", "id": 115600, "pid": 76337, "tid": -914061504, "ts": 1716454223177535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223230572, "dur": 10, "args": { "External id": 115608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115608, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115608, "pid": 5, "tid": 7, "ts": 1716454223230572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177564, "dur": 8, "args": { "External id": 115608, "cbid": 211, "correlation": 115608 } }, { "ph": "s", "id": 115608, "pid": 76337, "tid": -914061504, "ts": 1716454223177564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223230584, "dur": 18, "args": { "External id": 115628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115628, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 115628, "pid": 5, "tid": 7, "ts": 1716454223230584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177645, "dur": 12, "args": { "External id": 115628, "cbid": 211, "correlation": 115628 } }, { "ph": "s", "id": 115628, "pid": 76337, "tid": -914061504, "ts": 1716454223177645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223230603, "dur": 5, "args": { "External id": 115640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115640, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 115640, "pid": 5, "tid": 7, "ts": 1716454223230603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177667, "dur": 6, "args": { "External id": 115640, "cbid": 211, "correlation": 115640 } }, { "ph": "s", "id": 115640, "pid": 76337, "tid": -914061504, "ts": 1716454223177667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223230609, "dur": 17, "args": { "External id": 115643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115643, "pid": 5, "tid": 7, "ts": 1716454223230609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177686, "dur": 6, "args": { "External id": 115643, "cbid": 211, "correlation": 115643 } }, { "ph": "s", "id": 115643, "pid": 76337, "tid": -914061504, "ts": 1716454223177686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223177743, "dur": 0, "args": { "External id": 115654, "cbid": 317, "correlation": 115654 } }, { "ph": "f", "id": 115654, "pid": 76337, "tid": -914061504, "ts": 1716454223177743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223177744, "dur": 0, "args": { "External id": 115655, "cbid": 203, "correlation": 115655 } }, { "ph": "f", "id": 115655, "pid": 76337, "tid": -914061504, "ts": 1716454223177744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223177745, "dur": 0, "args": { "External id": 115656, "cbid": 205, "correlation": 115656 } }, { "ph": "f", "id": 115656, "pid": 76337, "tid": -914061504, "ts": 1716454223177745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223230627, "dur": 12, "args": { "External id": 115660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115660, "pid": 5, "tid": 7, "ts": 1716454223230627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177760, "dur": 11, "args": { "External id": 115660, "cbid": 211, "correlation": 115660 } }, { "ph": "s", "id": 115660, "pid": 76337, "tid": -914061504, "ts": 1716454223177760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223230640, "dur": 4, "args": { "External id": 115662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115662, "pid": 5, "tid": 7, "ts": 1716454223230640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177777, "dur": 7, "args": { "External id": 115662, "cbid": 211, "correlation": 115662 } }, { "ph": "s", "id": 115662, "pid": 76337, "tid": -914061504, "ts": 1716454223177777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223177788, "dur": 0, "args": { "External id": 115663, "cbid": 51, "correlation": 115663 } }, { "ph": "s", "id": 115663, "pid": 76337, "tid": -914061504, "ts": 1716454223177788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223230645, "dur": 97, "args": { "External id": 115664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115664, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 115664, "pid": 5, "tid": 7, "ts": 1716454223230645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177789, "dur": 6, "args": { "External id": 115664, "cbid": 211, "correlation": 115664 } }, { "ph": "s", "id": 115664, "pid": 76337, "tid": -914061504, "ts": 1716454223177789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223230744, "dur": 15, "args": { "External id": 115669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115669, "pid": 5, "tid": 7, "ts": 1716454223230744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177816, "dur": 8, "args": { "External id": 115669, "cbid": 211, "correlation": 115669 } }, { "ph": "s", "id": 115669, "pid": 76337, "tid": -914061504, "ts": 1716454223177816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223230761, "dur": 84, "args": { "External id": 115678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115678, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115678, "pid": 5, "tid": 7, "ts": 1716454223230761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177898, "dur": 13, "args": { "External id": 115678, "cbid": 211, "correlation": 115678 } }, { "ph": "s", "id": 115678, "pid": 76337, "tid": -914061504, "ts": 1716454223177898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223230846, "dur": 31, "args": { "External id": 115700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115700, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115700, "pid": 5, "tid": 7, "ts": 1716454223230846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223177953, "dur": 10, "args": { "External id": 115700, "cbid": 211, "correlation": 115700 } }, { "ph": "s", "id": 115700, "pid": 76337, "tid": -914061504, "ts": 1716454223177953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178049, "dur": 1, "args": { "External id": 115711, "cbid": 251, "correlation": 115711 } }, { "ph": "f", "id": 115711, "pid": 76337, "tid": -914061504, "ts": 1716454223178049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223230879, "dur": 165, "args": { "External id": 115712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115712, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115712, "pid": 5, "tid": 7, "ts": 1716454223230879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178054, "dur": 13, "args": { "External id": 115712, "cbid": 211, "correlation": 115712 } }, { "ph": "s", "id": 115712, "pid": 76337, "tid": -914061504, "ts": 1716454223178054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178126, "dur": 1, "args": { "External id": 115723, "cbid": 251, "correlation": 115723 } }, { "ph": "f", "id": 115723, "pid": 76337, "tid": -914061504, "ts": 1716454223178126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223231045, "dur": 159, "args": { "External id": 115724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115724, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115724, "pid": 5, "tid": 7, "ts": 1716454223231045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178130, "dur": 12, "args": { "External id": 115724, "cbid": 211, "correlation": 115724 } }, { "ph": "s", "id": 115724, "pid": 76337, "tid": -914061504, "ts": 1716454223178130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178196, "dur": 1, "args": { "External id": 115735, "cbid": 251, "correlation": 115735 } }, { "ph": "f", "id": 115735, "pid": 76337, "tid": -914061504, "ts": 1716454223178196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223231206, "dur": 160, "args": { "External id": 115736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115736, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115736, "pid": 5, "tid": 7, "ts": 1716454223231206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178200, "dur": 11, "args": { "External id": 115736, "cbid": 211, "correlation": 115736 } }, { "ph": "s", "id": 115736, "pid": 76337, "tid": -914061504, "ts": 1716454223178200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223231367, "dur": 337, "args": { "External id": 115761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115761, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115761, "pid": 5, "tid": 7, "ts": 1716454223231367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178288, "dur": 13, "args": { "External id": 115761, "cbid": 211, "correlation": 115761 } }, { "ph": "s", "id": 115761, "pid": 76337, "tid": -914061504, "ts": 1716454223178288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178389, "dur": 1, "args": { "External id": 115779, "cbid": 251, "correlation": 115779 } }, { "ph": "f", "id": 115779, "pid": 76337, "tid": -914061504, "ts": 1716454223178389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223231705, "dur": 166, "args": { "External id": 115781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115781, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115781, "pid": 5, "tid": 7, "ts": 1716454223231705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178395, "dur": 13, "args": { "External id": 115781, "cbid": 211, "correlation": 115781 } }, { "ph": "s", "id": 115781, "pid": 76337, "tid": -914061504, "ts": 1716454223178395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223231872, "dur": 19, "args": { "External id": 115789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115789, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115789, "pid": 5, "tid": 7, "ts": 1716454223231872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178465, "dur": 12, "args": { "External id": 115789, "cbid": 211, "correlation": 115789 } }, { "ph": "s", "id": 115789, "pid": 76337, "tid": -914061504, "ts": 1716454223178465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223231892, "dur": 27, "args": { "External id": 115797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115797, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115797, "pid": 5, "tid": 7, "ts": 1716454223231892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178504, "dur": 8, "args": { "External id": 115797, "cbid": 211, "correlation": 115797 } }, { "ph": "s", "id": 115797, "pid": 76337, "tid": -914061504, "ts": 1716454223178504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223231921, "dur": 17, "args": { "External id": 115808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115808, "pid": 5, "tid": 7, "ts": 1716454223231921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178575, "dur": 12, "args": { "External id": 115808, "cbid": 211, "correlation": 115808 } }, { "ph": "s", "id": 115808, "pid": 76337, "tid": -914061504, "ts": 1716454223178575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223231939, "dur": 16, "args": { "External id": 115830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115830, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115830, "pid": 5, "tid": 7, "ts": 1716454223231939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178606, "dur": 8, "args": { "External id": 115830, "cbid": 211, "correlation": 115830 } }, { "ph": "s", "id": 115830, "pid": 76337, "tid": -914061504, "ts": 1716454223178606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178691, "dur": 2, "args": { "External id": 115841, "cbid": 251, "correlation": 115841 } }, { "ph": "f", "id": 115841, "pid": 76337, "tid": -914061504, "ts": 1716454223178691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223231957, "dur": 90, "args": { "External id": 115842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115842, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115842, "pid": 5, "tid": 7, "ts": 1716454223231957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178699, "dur": 14, "args": { "External id": 115842, "cbid": 211, "correlation": 115842 } }, { "ph": "s", "id": 115842, "pid": 76337, "tid": -914061504, "ts": 1716454223178699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178769, "dur": 1, "args": { "External id": 115853, "cbid": 251, "correlation": 115853 } }, { "ph": "f", "id": 115853, "pid": 76337, "tid": -914061504, "ts": 1716454223178769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178772, "dur": 0, "args": { "External id": 115854, "cbid": 251, "correlation": 115854 } }, { "ph": "f", "id": 115854, "pid": 76337, "tid": -914061504, "ts": 1716454223178772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223232048, "dur": 13, "args": { "External id": 115855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115855, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115855, "pid": 5, "tid": 7, "ts": 1716454223232048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178774, "dur": 12, "args": { "External id": 115855, "cbid": 211, "correlation": 115855 } }, { "ph": "s", "id": 115855, "pid": 76337, "tid": -914061504, "ts": 1716454223178774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223232062, "dur": 6, "args": { "External id": 115857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115857, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115857, "pid": 5, "tid": 7, "ts": 1716454223232062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178787, "dur": 6, "args": { "External id": 115857, "cbid": 211, "correlation": 115857 } }, { "ph": "s", "id": 115857, "pid": 76337, "tid": -914061504, "ts": 1716454223178787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178846, "dur": 1, "args": { "External id": 115868, "cbid": 251, "correlation": 115868 } }, { "ph": "f", "id": 115868, "pid": 76337, "tid": -914061504, "ts": 1716454223178846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223178850, "dur": 0, "args": { "External id": 115869, "cbid": 251, "correlation": 115869 } }, { "ph": "f", "id": 115869, "pid": 76337, "tid": -914061504, "ts": 1716454223178850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223232069, "dur": 9, "args": { "External id": 115870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115870, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115870, "pid": 5, "tid": 7, "ts": 1716454223232069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178851, "dur": 11, "args": { "External id": 115870, "cbid": 211, "correlation": 115870 } }, { "ph": "s", "id": 115870, "pid": 76337, "tid": -914061504, "ts": 1716454223178851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223232079, "dur": 3, "args": { "External id": 115872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115872, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115872, "pid": 5, "tid": 7, "ts": 1716454223232079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178864, "dur": 5, "args": { "External id": 115872, "cbid": 211, "correlation": 115872 } }, { "ph": "s", "id": 115872, "pid": 76337, "tid": -914061504, "ts": 1716454223178864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223232083, "dur": 56, "args": { "External id": 115897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115897, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115897, "pid": 5, "tid": 7, "ts": 1716454223232083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223178940, "dur": 13, "args": { "External id": 115897, "cbid": 211, "correlation": 115897 } }, { "ph": "s", "id": 115897, "pid": 76337, "tid": -914061504, "ts": 1716454223178940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223179048, "dur": 2, "args": { "External id": 115915, "cbid": 251, "correlation": 115915 } }, { "ph": "f", "id": 115915, "pid": 76337, "tid": -914061504, "ts": 1716454223179048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223232140, "dur": 92, "args": { "External id": 115917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115917, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 115917, "pid": 5, "tid": 7, "ts": 1716454223232140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179055, "dur": 14, "args": { "External id": 115917, "cbid": 211, "correlation": 115917 } }, { "ph": "s", "id": 115917, "pid": 76337, "tid": -914061504, "ts": 1716454223179055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223232234, "dur": 10, "args": { "External id": 115925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115925, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115925, "pid": 5, "tid": 7, "ts": 1716454223232234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179126, "dur": 12, "args": { "External id": 115925, "cbid": 211, "correlation": 115925 } }, { "ph": "s", "id": 115925, "pid": 76337, "tid": -914061504, "ts": 1716454223179126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223232244, "dur": 21, "args": { "External id": 115933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115933, "pid": 5, "tid": 7, "ts": 1716454223232244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179168, "dur": 9, "args": { "External id": 115933, "cbid": 211, "correlation": 115933 } }, { "ph": "s", "id": 115933, "pid": 76337, "tid": -914061504, "ts": 1716454223179168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223232266, "dur": 17, "args": { "External id": 115955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115955, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115955, "pid": 5, "tid": 7, "ts": 1716454223232266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179220, "dur": 11, "args": { "External id": 115955, "cbid": 211, "correlation": 115955 } }, { "ph": "s", "id": 115955, "pid": 76337, "tid": -914061504, "ts": 1716454223179220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223179312, "dur": 2, "args": { "External id": 115971, "cbid": 251, "correlation": 115971 } }, { "ph": "f", "id": 115971, "pid": 76337, "tid": -914061504, "ts": 1716454223179312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223179318, "dur": 0, "args": { "External id": 115973, "cbid": 251, "correlation": 115973 } }, { "ph": "f", "id": 115973, "pid": 76337, "tid": -914061504, "ts": 1716454223179318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223232285, "dur": 495, "args": { "External id": 115974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115974, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 115974, "pid": 5, "tid": 7, "ts": 1716454223232285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179320, "dur": 14, "args": { "External id": 115974, "cbid": 211, "correlation": 115974 } }, { "ph": "s", "id": 115974, "pid": 76337, "tid": -914061504, "ts": 1716454223179320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223232782, "dur": 65, "args": { "External id": 115982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115982, "pid": 5, "tid": 7, "ts": 1716454223232782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179387, "dur": 13, "args": { "External id": 115982, "cbid": 211, "correlation": 115982 } }, { "ph": "s", "id": 115982, "pid": 76337, "tid": -914061504, "ts": 1716454223179387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223232848, "dur": 69, "args": { "External id": 115990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 115990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 115990, "pid": 5, "tid": 7, "ts": 1716454223232848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179418, "dur": 8, "args": { "External id": 115990, "cbid": 211, "correlation": 115990 } }, { "ph": "s", "id": 115990, "pid": 76337, "tid": -914061504, "ts": 1716454223179418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223179499, "dur": 1, "args": { "External id": 116006, "cbid": 251, "correlation": 116006 } }, { "ph": "f", "id": 116006, "pid": 76337, "tid": -914061504, "ts": 1716454223179499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223232919, "dur": 1, "args": { "External id": 116008, "device": 5, "context": 1, "stream": 7, "correlation": 116008, "bytes": 240, "memory bandwidth (GB/s)": 0.1563517915309446 } }, { "ph": "f", "id": 116008, "pid": 5, "tid": 7, "ts": 1716454223232919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223179505, "dur": 11, "args": { "External id": 116008, "cbid": 51, "correlation": 116008 } }, { "ph": "s", "id": 116008, "pid": 76337, "tid": -914061504, "ts": 1716454223179505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223232923, "dur": 268, "args": { "External id": 116009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116009, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116009, "pid": 5, "tid": 7, "ts": 1716454223232923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179518, "dur": 11, "args": { "External id": 116009, "cbid": 211, "correlation": 116009 } }, { "ph": "s", "id": 116009, "pid": 76337, "tid": -914061504, "ts": 1716454223179518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223233192, "dur": 14, "args": { "External id": 116017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116017, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116017, "pid": 5, "tid": 7, "ts": 1716454223233192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179560, "dur": 10, "args": { "External id": 116017, "cbid": 211, "correlation": 116017 } }, { "ph": "s", "id": 116017, "pid": 76337, "tid": -914061504, "ts": 1716454223179560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223233208, "dur": 38, "args": { "External id": 116028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116028, "pid": 5, "tid": 7, "ts": 1716454223233208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179627, "dur": 12, "args": { "External id": 116028, "cbid": 211, "correlation": 116028 } }, { "ph": "s", "id": 116028, "pid": 76337, "tid": -914061504, "ts": 1716454223179627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223179692, "dur": 0, "args": { "External id": 116040, "cbid": 317, "correlation": 116040 } }, { "ph": "f", "id": 116040, "pid": 76337, "tid": -914061504, "ts": 1716454223179692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223179692, "dur": 0, "args": { "External id": 116041, "cbid": 203, "correlation": 116041 } }, { "ph": "f", "id": 116041, "pid": 76337, "tid": -914061504, "ts": 1716454223179692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223179693, "dur": 0, "args": { "External id": 116042, "cbid": 205, "correlation": 116042 } }, { "ph": "f", "id": 116042, "pid": 76337, "tid": -914061504, "ts": 1716454223179693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223233247, "dur": 13, "args": { "External id": 116046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116046, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116046, "pid": 5, "tid": 7, "ts": 1716454223233247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179708, "dur": 12, "args": { "External id": 116046, "cbid": 211, "correlation": 116046 } }, { "ph": "s", "id": 116046, "pid": 76337, "tid": -914061504, "ts": 1716454223179708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223233261, "dur": 4, "args": { "External id": 116048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116048, "pid": 5, "tid": 7, "ts": 1716454223233261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179725, "dur": 6, "args": { "External id": 116048, "cbid": 211, "correlation": 116048 } }, { "ph": "s", "id": 116048, "pid": 76337, "tid": -914061504, "ts": 1716454223179725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223179734, "dur": 0, "args": { "External id": 116049, "cbid": 51, "correlation": 116049 } }, { "ph": "s", "id": 116049, "pid": 76337, "tid": -914061504, "ts": 1716454223179734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223233266, "dur": 98, "args": { "External id": 116050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116050, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 116050, "pid": 5, "tid": 7, "ts": 1716454223233266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179734, "dur": 5, "args": { "External id": 116050, "cbid": 211, "correlation": 116050 } }, { "ph": "s", "id": 116050, "pid": 76337, "tid": -914061504, "ts": 1716454223179734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223233365, "dur": 16, "args": { "External id": 116055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116055, "pid": 5, "tid": 7, "ts": 1716454223233365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179762, "dur": 8, "args": { "External id": 116055, "cbid": 211, "correlation": 116055 } }, { "ph": "s", "id": 116055, "pid": 76337, "tid": -914061504, "ts": 1716454223179762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223233383, "dur": 13, "args": { "External id": 116063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116063, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116063, "pid": 5, "tid": 7, "ts": 1716454223233383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179793, "dur": 8, "args": { "External id": 116063, "cbid": 211, "correlation": 116063 } }, { "ph": "s", "id": 116063, "pid": 76337, "tid": -914061504, "ts": 1716454223179793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223233398, "dur": 19, "args": { "External id": 116083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116083, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116083, "pid": 5, "tid": 7, "ts": 1716454223233398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179866, "dur": 11, "args": { "External id": 116083, "cbid": 211, "correlation": 116083 } }, { "ph": "s", "id": 116083, "pid": 76337, "tid": -914061504, "ts": 1716454223179866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223233418, "dur": 5, "args": { "External id": 116095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116095, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116095, "pid": 5, "tid": 7, "ts": 1716454223233418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179888, "dur": 6, "args": { "External id": 116095, "cbid": 211, "correlation": 116095 } }, { "ph": "s", "id": 116095, "pid": 76337, "tid": -914061504, "ts": 1716454223179888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223233424, "dur": 18, "args": { "External id": 116098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116098, "pid": 5, "tid": 7, "ts": 1716454223233424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179906, "dur": 7, "args": { "External id": 116098, "cbid": 211, "correlation": 116098 } }, { "ph": "s", "id": 116098, "pid": 76337, "tid": -914061504, "ts": 1716454223179906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223233444, "dur": 12, "args": { "External id": 116107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116107, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116107, "pid": 5, "tid": 7, "ts": 1716454223233444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223179945, "dur": 10, "args": { "External id": 116107, "cbid": 211, "correlation": 116107 } }, { "ph": "s", "id": 116107, "pid": 76337, "tid": -914061504, "ts": 1716454223179945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223180004, "dur": 0, "args": { "External id": 116117, "cbid": 317, "correlation": 116117 } }, { "ph": "f", "id": 116117, "pid": 76337, "tid": -914061504, "ts": 1716454223180004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223180005, "dur": 0, "args": { "External id": 116118, "cbid": 203, "correlation": 116118 } }, { "ph": "f", "id": 116118, "pid": 76337, "tid": -914061504, "ts": 1716454223180005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223180005, "dur": 0, "args": { "External id": 116119, "cbid": 205, "correlation": 116119 } }, { "ph": "f", "id": 116119, "pid": 76337, "tid": -914061504, "ts": 1716454223180005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223233456, "dur": 11, "args": { "External id": 116123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116123, "pid": 5, "tid": 7, "ts": 1716454223233456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180020, "dur": 12, "args": { "External id": 116123, "cbid": 211, "correlation": 116123 } }, { "ph": "s", "id": 116123, "pid": 76337, "tid": -914061504, "ts": 1716454223180020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223233469, "dur": 162, "args": { "External id": 116125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116125, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116125, "pid": 5, "tid": 7, "ts": 1716454223233469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180034, "dur": 5, "args": { "External id": 116125, "cbid": 211, "correlation": 116125 } }, { "ph": "s", "id": 116125, "pid": 76337, "tid": -914061504, "ts": 1716454223180034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223233633, "dur": 1, "args": { "External id": 116127, "device": 5, "context": 1, "stream": 7, "correlation": 116127, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 116127, "pid": 5, "tid": 7, "ts": 1716454223233633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223180045, "dur": 7, "args": { "External id": 116127, "cbid": 51, "correlation": 116127 } }, { "ph": "s", "id": 116127, "pid": 76337, "tid": -914061504, "ts": 1716454223180045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223233637, "dur": 661, "args": { "External id": 116128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116128, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116128, "pid": 5, "tid": 7, "ts": 1716454223233637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180053, "dur": 7, "args": { "External id": 116128, "cbid": 211, "correlation": 116128 } }, { "ph": "s", "id": 116128, "pid": 76337, "tid": -914061504, "ts": 1716454223180053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223234299, "dur": 14, "args": { "External id": 116130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116130, "pid": 5, "tid": 7, "ts": 1716454223234299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180065, "dur": 5, "args": { "External id": 116130, "cbid": 211, "correlation": 116130 } }, { "ph": "s", "id": 116130, "pid": 76337, "tid": -914061504, "ts": 1716454223180065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223234314, "dur": 14, "args": { "External id": 116136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116136, "pid": 5, "tid": 7, "ts": 1716454223234314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180093, "dur": 8, "args": { "External id": 116136, "cbid": 211, "correlation": 116136 } }, { "ph": "s", "id": 116136, "pid": 76337, "tid": -914061504, "ts": 1716454223180093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223234330, "dur": 4, "args": { "External id": 116144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116144, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 116144, "pid": 5, "tid": 7, "ts": 1716454223234330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180137, "dur": 9, "args": { "External id": 116144, "cbid": 211, "correlation": 116144 } }, { "ph": "s", "id": 116144, "pid": 76337, "tid": -914061504, "ts": 1716454223180137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223180202, "dur": 1, "args": { "External id": 116160, "cbid": 251, "correlation": 116160 } }, { "ph": "f", "id": 116160, "pid": 76337, "tid": -914061504, "ts": 1716454223180202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223180208, "dur": 0, "args": { "External id": 116162, "cbid": 251, "correlation": 116162 } }, { "ph": "f", "id": 116162, "pid": 76337, "tid": -914061504, "ts": 1716454223180208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223234335, "dur": 13, "args": { "External id": 116163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116163, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116163, "pid": 5, "tid": 7, "ts": 1716454223234335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180210, "dur": 11, "args": { "External id": 116163, "cbid": 211, "correlation": 116163 } }, { "ph": "s", "id": 116163, "pid": 76337, "tid": -914061504, "ts": 1716454223180210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223234350, "dur": 5, "args": { "External id": 116165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116165, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116165, "pid": 5, "tid": 7, "ts": 1716454223234350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180223, "dur": 5, "args": { "External id": 116165, "cbid": 211, "correlation": 116165 } }, { "ph": "s", "id": 116165, "pid": 76337, "tid": -914061504, "ts": 1716454223180223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223234356, "dur": 17, "args": { "External id": 116175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116175, "pid": 5, "tid": 7, "ts": 1716454223234356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180279, "dur": 12, "args": { "External id": 116175, "cbid": 211, "correlation": 116175 } }, { "ph": "s", "id": 116175, "pid": 76337, "tid": -914061504, "ts": 1716454223180279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223234375, "dur": 18, "args": { "External id": 116195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116195, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116195, "pid": 5, "tid": 7, "ts": 1716454223234375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180345, "dur": 10, "args": { "External id": 116195, "cbid": 211, "correlation": 116195 } }, { "ph": "s", "id": 116195, "pid": 76337, "tid": -914061504, "ts": 1716454223180345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223234394, "dur": 4, "args": { "External id": 116207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116207, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116207, "pid": 5, "tid": 7, "ts": 1716454223234394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180367, "dur": 6, "args": { "External id": 116207, "cbid": 211, "correlation": 116207 } }, { "ph": "s", "id": 116207, "pid": 76337, "tid": -914061504, "ts": 1716454223180367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223234400, "dur": 19, "args": { "External id": 116210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116210, "pid": 5, "tid": 7, "ts": 1716454223234400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180386, "dur": 6, "args": { "External id": 116210, "cbid": 211, "correlation": 116210 } }, { "ph": "s", "id": 116210, "pid": 76337, "tid": -914061504, "ts": 1716454223180386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223234420, "dur": 12, "args": { "External id": 116219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116219, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116219, "pid": 5, "tid": 7, "ts": 1716454223234420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180427, "dur": 9, "args": { "External id": 116219, "cbid": 211, "correlation": 116219 } }, { "ph": "s", "id": 116219, "pid": 76337, "tid": -914061504, "ts": 1716454223180427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223180489, "dur": 0, "args": { "External id": 116229, "cbid": 317, "correlation": 116229 } }, { "ph": "f", "id": 116229, "pid": 76337, "tid": -914061504, "ts": 1716454223180489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223180489, "dur": 0, "args": { "External id": 116230, "cbid": 203, "correlation": 116230 } }, { "ph": "f", "id": 116230, "pid": 76337, "tid": -914061504, "ts": 1716454223180489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223180490, "dur": 0, "args": { "External id": 116231, "cbid": 205, "correlation": 116231 } }, { "ph": "f", "id": 116231, "pid": 76337, "tid": -914061504, "ts": 1716454223180490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223234433, "dur": 11, "args": { "External id": 116235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116235, "pid": 5, "tid": 7, "ts": 1716454223234433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180504, "dur": 12, "args": { "External id": 116235, "cbid": 211, "correlation": 116235 } }, { "ph": "s", "id": 116235, "pid": 76337, "tid": -914061504, "ts": 1716454223180504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223234445, "dur": 163, "args": { "External id": 116237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116237, "pid": 5, "tid": 7, "ts": 1716454223234445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180519, "dur": 5, "args": { "External id": 116237, "cbid": 211, "correlation": 116237 } }, { "ph": "s", "id": 116237, "pid": 76337, "tid": -914061504, "ts": 1716454223180519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223234610, "dur": 1, "args": { "External id": 116239, "device": 5, "context": 1, "stream": 7, "correlation": 116239, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 116239, "pid": 5, "tid": 7, "ts": 1716454223234610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223180530, "dur": 6, "args": { "External id": 116239, "cbid": 51, "correlation": 116239 } }, { "ph": "s", "id": 116239, "pid": 76337, "tid": -914061504, "ts": 1716454223180530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223234614, "dur": 648, "args": { "External id": 116240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116240, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116240, "pid": 5, "tid": 7, "ts": 1716454223234614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180537, "dur": 6, "args": { "External id": 116240, "cbid": 211, "correlation": 116240 } }, { "ph": "s", "id": 116240, "pid": 76337, "tid": -914061504, "ts": 1716454223180537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223235263, "dur": 13, "args": { "External id": 116242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116242, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116242, "pid": 5, "tid": 7, "ts": 1716454223235263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180547, "dur": 5, "args": { "External id": 116242, "cbid": 211, "correlation": 116242 } }, { "ph": "s", "id": 116242, "pid": 76337, "tid": -914061504, "ts": 1716454223180547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223235278, "dur": 15, "args": { "External id": 116248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116248, "pid": 5, "tid": 7, "ts": 1716454223235278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180576, "dur": 9, "args": { "External id": 116248, "cbid": 211, "correlation": 116248 } }, { "ph": "s", "id": 116248, "pid": 76337, "tid": -914061504, "ts": 1716454223180576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223235294, "dur": 12, "args": { "External id": 116256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116256, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116256, "pid": 5, "tid": 7, "ts": 1716454223235294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180607, "dur": 9, "args": { "External id": 116256, "cbid": 211, "correlation": 116256 } }, { "ph": "s", "id": 116256, "pid": 76337, "tid": -914061504, "ts": 1716454223180607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223235307, "dur": 11, "args": { "External id": 116264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116264, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116264, "pid": 5, "tid": 7, "ts": 1716454223235307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180636, "dur": 8, "args": { "External id": 116264, "cbid": 211, "correlation": 116264 } }, { "ph": "s", "id": 116264, "pid": 76337, "tid": -914061504, "ts": 1716454223180636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223235319, "dur": 19, "args": { "External id": 116284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116284, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116284, "pid": 5, "tid": 7, "ts": 1716454223235319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180715, "dur": 13, "args": { "External id": 116284, "cbid": 211, "correlation": 116284 } }, { "ph": "s", "id": 116284, "pid": 76337, "tid": -914061504, "ts": 1716454223180715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223235339, "dur": 4, "args": { "External id": 116296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116296, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116296, "pid": 5, "tid": 7, "ts": 1716454223235339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180737, "dur": 6, "args": { "External id": 116296, "cbid": 211, "correlation": 116296 } }, { "ph": "s", "id": 116296, "pid": 76337, "tid": -914061504, "ts": 1716454223180737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223235345, "dur": 17, "args": { "External id": 116299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116299, "pid": 5, "tid": 7, "ts": 1716454223235345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180754, "dur": 7, "args": { "External id": 116299, "cbid": 211, "correlation": 116299 } }, { "ph": "s", "id": 116299, "pid": 76337, "tid": -914061504, "ts": 1716454223180754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223180812, "dur": 0, "args": { "External id": 116310, "cbid": 317, "correlation": 116310 } }, { "ph": "f", "id": 116310, "pid": 76337, "tid": -914061504, "ts": 1716454223180812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223180812, "dur": 0, "args": { "External id": 116311, "cbid": 203, "correlation": 116311 } }, { "ph": "f", "id": 116311, "pid": 76337, "tid": -914061504, "ts": 1716454223180812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223180813, "dur": 0, "args": { "External id": 116312, "cbid": 205, "correlation": 116312 } }, { "ph": "f", "id": 116312, "pid": 76337, "tid": -914061504, "ts": 1716454223180813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223235363, "dur": 12, "args": { "External id": 116316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116316, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116316, "pid": 5, "tid": 7, "ts": 1716454223235363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180827, "dur": 12, "args": { "External id": 116316, "cbid": 211, "correlation": 116316 } }, { "ph": "s", "id": 116316, "pid": 76337, "tid": -914061504, "ts": 1716454223180827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223235376, "dur": 4, "args": { "External id": 116318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116318, "pid": 5, "tid": 7, "ts": 1716454223235376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180843, "dur": 6, "args": { "External id": 116318, "cbid": 211, "correlation": 116318 } }, { "ph": "s", "id": 116318, "pid": 76337, "tid": -914061504, "ts": 1716454223180843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223180851, "dur": 0, "args": { "External id": 116319, "cbid": 51, "correlation": 116319 } }, { "ph": "s", "id": 116319, "pid": 76337, "tid": -914061504, "ts": 1716454223180851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223235381, "dur": 96, "args": { "External id": 116320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116320, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 116320, "pid": 5, "tid": 7, "ts": 1716454223235381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180852, "dur": 5, "args": { "External id": 116320, "cbid": 211, "correlation": 116320 } }, { "ph": "s", "id": 116320, "pid": 76337, "tid": -914061504, "ts": 1716454223180852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223235478, "dur": 15, "args": { "External id": 116325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116325, "pid": 5, "tid": 7, "ts": 1716454223235478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180879, "dur": 8, "args": { "External id": 116325, "cbid": 211, "correlation": 116325 } }, { "ph": "s", "id": 116325, "pid": 76337, "tid": -914061504, "ts": 1716454223180879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223235495, "dur": 83, "args": { "External id": 116334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116334, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116334, "pid": 5, "tid": 7, "ts": 1716454223235495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223180961, "dur": 23, "args": { "External id": 116334, "cbid": 211, "correlation": 116334 } }, { "ph": "s", "id": 116334, "pid": 76337, "tid": -914061504, "ts": 1716454223180961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223235579, "dur": 29, "args": { "External id": 116356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116356, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116356, "pid": 5, "tid": 7, "ts": 1716454223235579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181028, "dur": 11, "args": { "External id": 116356, "cbid": 211, "correlation": 116356 } }, { "ph": "s", "id": 116356, "pid": 76337, "tid": -914061504, "ts": 1716454223181028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181116, "dur": 1, "args": { "External id": 116367, "cbid": 251, "correlation": 116367 } }, { "ph": "f", "id": 116367, "pid": 76337, "tid": -914061504, "ts": 1716454223181116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223235610, "dur": 165, "args": { "External id": 116368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116368, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116368, "pid": 5, "tid": 7, "ts": 1716454223235610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181122, "dur": 13, "args": { "External id": 116368, "cbid": 211, "correlation": 116368 } }, { "ph": "s", "id": 116368, "pid": 76337, "tid": -914061504, "ts": 1716454223181122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181191, "dur": 1, "args": { "External id": 116379, "cbid": 251, "correlation": 116379 } }, { "ph": "f", "id": 116379, "pid": 76337, "tid": -914061504, "ts": 1716454223181191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223235777, "dur": 160, "args": { "External id": 116380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116380, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116380, "pid": 5, "tid": 7, "ts": 1716454223235777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181195, "dur": 11, "args": { "External id": 116380, "cbid": 211, "correlation": 116380 } }, { "ph": "s", "id": 116380, "pid": 76337, "tid": -914061504, "ts": 1716454223181195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181259, "dur": 1, "args": { "External id": 116391, "cbid": 251, "correlation": 116391 } }, { "ph": "f", "id": 116391, "pid": 76337, "tid": -914061504, "ts": 1716454223181259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223235938, "dur": 159, "args": { "External id": 116392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116392, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116392, "pid": 5, "tid": 7, "ts": 1716454223235938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181263, "dur": 12, "args": { "External id": 116392, "cbid": 211, "correlation": 116392 } }, { "ph": "s", "id": 116392, "pid": 76337, "tid": -914061504, "ts": 1716454223181263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223236098, "dur": 338, "args": { "External id": 116417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116417, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116417, "pid": 5, "tid": 7, "ts": 1716454223236098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181348, "dur": 13, "args": { "External id": 116417, "cbid": 211, "correlation": 116417 } }, { "ph": "s", "id": 116417, "pid": 76337, "tid": -914061504, "ts": 1716454223181348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181447, "dur": 1, "args": { "External id": 116435, "cbid": 251, "correlation": 116435 } }, { "ph": "f", "id": 116435, "pid": 76337, "tid": -914061504, "ts": 1716454223181447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223236438, "dur": 167, "args": { "External id": 116437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116437, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116437, "pid": 5, "tid": 7, "ts": 1716454223236438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181453, "dur": 13, "args": { "External id": 116437, "cbid": 211, "correlation": 116437 } }, { "ph": "s", "id": 116437, "pid": 76337, "tid": -914061504, "ts": 1716454223181453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223236606, "dur": 20, "args": { "External id": 116445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116445, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116445, "pid": 5, "tid": 7, "ts": 1716454223236606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181523, "dur": 12, "args": { "External id": 116445, "cbid": 211, "correlation": 116445 } }, { "ph": "s", "id": 116445, "pid": 76337, "tid": -914061504, "ts": 1716454223181523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223236627, "dur": 27, "args": { "External id": 116453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116453, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116453, "pid": 5, "tid": 7, "ts": 1716454223236627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181561, "dur": 9, "args": { "External id": 116453, "cbid": 211, "correlation": 116453 } }, { "ph": "s", "id": 116453, "pid": 76337, "tid": -914061504, "ts": 1716454223181561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223236656, "dur": 18, "args": { "External id": 116464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116464, "pid": 5, "tid": 7, "ts": 1716454223236656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181632, "dur": 12, "args": { "External id": 116464, "cbid": 211, "correlation": 116464 } }, { "ph": "s", "id": 116464, "pid": 76337, "tid": -914061504, "ts": 1716454223181632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223236675, "dur": 16, "args": { "External id": 116486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116486, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116486, "pid": 5, "tid": 7, "ts": 1716454223236675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181663, "dur": 7, "args": { "External id": 116486, "cbid": 211, "correlation": 116486 } }, { "ph": "s", "id": 116486, "pid": 76337, "tid": -914061504, "ts": 1716454223181663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181748, "dur": 1, "args": { "External id": 116497, "cbid": 251, "correlation": 116497 } }, { "ph": "f", "id": 116497, "pid": 76337, "tid": -914061504, "ts": 1716454223181748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223236693, "dur": 89, "args": { "External id": 116498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116498, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116498, "pid": 5, "tid": 7, "ts": 1716454223236693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181753, "dur": 13, "args": { "External id": 116498, "cbid": 211, "correlation": 116498 } }, { "ph": "s", "id": 116498, "pid": 76337, "tid": -914061504, "ts": 1716454223181753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181821, "dur": 1, "args": { "External id": 116509, "cbid": 251, "correlation": 116509 } }, { "ph": "f", "id": 116509, "pid": 76337, "tid": -914061504, "ts": 1716454223181821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181825, "dur": 0, "args": { "External id": 116510, "cbid": 251, "correlation": 116510 } }, { "ph": "f", "id": 116510, "pid": 76337, "tid": -914061504, "ts": 1716454223181825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223236784, "dur": 12, "args": { "External id": 116511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116511, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116511, "pid": 5, "tid": 7, "ts": 1716454223236784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181827, "dur": 12, "args": { "External id": 116511, "cbid": 211, "correlation": 116511 } }, { "ph": "s", "id": 116511, "pid": 76337, "tid": -914061504, "ts": 1716454223181827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223236797, "dur": 6, "args": { "External id": 116513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116513, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116513, "pid": 5, "tid": 7, "ts": 1716454223236797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181841, "dur": 6, "args": { "External id": 116513, "cbid": 211, "correlation": 116513 } }, { "ph": "s", "id": 116513, "pid": 76337, "tid": -914061504, "ts": 1716454223181841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181898, "dur": 1, "args": { "External id": 116524, "cbid": 251, "correlation": 116524 } }, { "ph": "f", "id": 116524, "pid": 76337, "tid": -914061504, "ts": 1716454223181898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223181902, "dur": 0, "args": { "External id": 116525, "cbid": 251, "correlation": 116525 } }, { "ph": "f", "id": 116525, "pid": 76337, "tid": -914061504, "ts": 1716454223181902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223236805, "dur": 8, "args": { "External id": 116526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116526, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116526, "pid": 5, "tid": 7, "ts": 1716454223236805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181903, "dur": 12, "args": { "External id": 116526, "cbid": 211, "correlation": 116526 } }, { "ph": "s", "id": 116526, "pid": 76337, "tid": -914061504, "ts": 1716454223181903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223236814, "dur": 4, "args": { "External id": 116528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116528, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116528, "pid": 5, "tid": 7, "ts": 1716454223236814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223181916, "dur": 6, "args": { "External id": 116528, "cbid": 211, "correlation": 116528 } }, { "ph": "s", "id": 116528, "pid": 76337, "tid": -914061504, "ts": 1716454223181916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223236819, "dur": 55, "args": { "External id": 116553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116553, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116553, "pid": 5, "tid": 7, "ts": 1716454223236819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182001, "dur": 12, "args": { "External id": 116553, "cbid": 211, "correlation": 116553 } }, { "ph": "s", "id": 116553, "pid": 76337, "tid": -914061504, "ts": 1716454223182001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223182100, "dur": 1, "args": { "External id": 116571, "cbid": 251, "correlation": 116571 } }, { "ph": "f", "id": 116571, "pid": 76337, "tid": -914061504, "ts": 1716454223182100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223236875, "dur": 91, "args": { "External id": 116573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116573, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116573, "pid": 5, "tid": 7, "ts": 1716454223236875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182105, "dur": 13, "args": { "External id": 116573, "cbid": 211, "correlation": 116573 } }, { "ph": "s", "id": 116573, "pid": 76337, "tid": -914061504, "ts": 1716454223182105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223236968, "dur": 9, "args": { "External id": 116581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116581, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116581, "pid": 5, "tid": 7, "ts": 1716454223236968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182175, "dur": 13, "args": { "External id": 116581, "cbid": 211, "correlation": 116581 } }, { "ph": "s", "id": 116581, "pid": 76337, "tid": -914061504, "ts": 1716454223182175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223236978, "dur": 21, "args": { "External id": 116589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116589, "pid": 5, "tid": 7, "ts": 1716454223236978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182217, "dur": 9, "args": { "External id": 116589, "cbid": 211, "correlation": 116589 } }, { "ph": "s", "id": 116589, "pid": 76337, "tid": -914061504, "ts": 1716454223182217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223237001, "dur": 18, "args": { "External id": 116611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116611, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116611, "pid": 5, "tid": 7, "ts": 1716454223237001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182267, "dur": 10, "args": { "External id": 116611, "cbid": 211, "correlation": 116611 } }, { "ph": "s", "id": 116611, "pid": 76337, "tid": -914061504, "ts": 1716454223182267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223182353, "dur": 1, "args": { "External id": 116627, "cbid": 251, "correlation": 116627 } }, { "ph": "f", "id": 116627, "pid": 76337, "tid": -914061504, "ts": 1716454223182353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223182357, "dur": 0, "args": { "External id": 116629, "cbid": 251, "correlation": 116629 } }, { "ph": "f", "id": 116629, "pid": 76337, "tid": -914061504, "ts": 1716454223182357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223237020, "dur": 495, "args": { "External id": 116630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116630, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116630, "pid": 5, "tid": 7, "ts": 1716454223237020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182359, "dur": 12, "args": { "External id": 116630, "cbid": 211, "correlation": 116630 } }, { "ph": "s", "id": 116630, "pid": 76337, "tid": -914061504, "ts": 1716454223182359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223237516, "dur": 66, "args": { "External id": 116638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116638, "pid": 5, "tid": 7, "ts": 1716454223237516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182423, "dur": 13, "args": { "External id": 116638, "cbid": 211, "correlation": 116638 } }, { "ph": "s", "id": 116638, "pid": 76337, "tid": -914061504, "ts": 1716454223182423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223237583, "dur": 68, "args": { "External id": 116646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116646, "pid": 5, "tid": 7, "ts": 1716454223237583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182454, "dur": 8, "args": { "External id": 116646, "cbid": 211, "correlation": 116646 } }, { "ph": "s", "id": 116646, "pid": 76337, "tid": -914061504, "ts": 1716454223182454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223182533, "dur": 1, "args": { "External id": 116662, "cbid": 251, "correlation": 116662 } }, { "ph": "f", "id": 116662, "pid": 76337, "tid": -914061504, "ts": 1716454223182533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223237653, "dur": 1, "args": { "External id": 116664, "device": 5, "context": 1, "stream": 7, "correlation": 116664, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 116664, "pid": 5, "tid": 7, "ts": 1716454223237653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223182538, "dur": 9, "args": { "External id": 116664, "cbid": 51, "correlation": 116664 } }, { "ph": "s", "id": 116664, "pid": 76337, "tid": -914061504, "ts": 1716454223182538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223237657, "dur": 272, "args": { "External id": 116665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116665, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116665, "pid": 5, "tid": 7, "ts": 1716454223237657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182548, "dur": 11, "args": { "External id": 116665, "cbid": 211, "correlation": 116665 } }, { "ph": "s", "id": 116665, "pid": 76337, "tid": -914061504, "ts": 1716454223182548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223237930, "dur": 14, "args": { "External id": 116673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116673, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116673, "pid": 5, "tid": 7, "ts": 1716454223237930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182590, "dur": 10, "args": { "External id": 116673, "cbid": 211, "correlation": 116673 } }, { "ph": "s", "id": 116673, "pid": 76337, "tid": -914061504, "ts": 1716454223182590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223237945, "dur": 38, "args": { "External id": 116684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116684, "pid": 5, "tid": 7, "ts": 1716454223237945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182658, "dur": 12, "args": { "External id": 116684, "cbid": 211, "correlation": 116684 } }, { "ph": "s", "id": 116684, "pid": 76337, "tid": -914061504, "ts": 1716454223182658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223182722, "dur": 0, "args": { "External id": 116696, "cbid": 317, "correlation": 116696 } }, { "ph": "f", "id": 116696, "pid": 76337, "tid": -914061504, "ts": 1716454223182722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223182723, "dur": 0, "args": { "External id": 116697, "cbid": 203, "correlation": 116697 } }, { "ph": "f", "id": 116697, "pid": 76337, "tid": -914061504, "ts": 1716454223182723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223182724, "dur": 0, "args": { "External id": 116698, "cbid": 205, "correlation": 116698 } }, { "ph": "f", "id": 116698, "pid": 76337, "tid": -914061504, "ts": 1716454223182724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223237984, "dur": 13, "args": { "External id": 116702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116702, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116702, "pid": 5, "tid": 7, "ts": 1716454223237984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182739, "dur": 12, "args": { "External id": 116702, "cbid": 211, "correlation": 116702 } }, { "ph": "s", "id": 116702, "pid": 76337, "tid": -914061504, "ts": 1716454223182739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223237998, "dur": 4, "args": { "External id": 116704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 116704, "pid": 5, "tid": 7, "ts": 1716454223237998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182756, "dur": 6, "args": { "External id": 116704, "cbid": 211, "correlation": 116704 } }, { "ph": "s", "id": 116704, "pid": 76337, "tid": -914061504, "ts": 1716454223182756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223182764, "dur": 0, "args": { "External id": 116705, "cbid": 51, "correlation": 116705 } }, { "ph": "s", "id": 116705, "pid": 76337, "tid": -914061504, "ts": 1716454223182764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223238004, "dur": 98, "args": { "External id": 116706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116706, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 116706, "pid": 5, "tid": 7, "ts": 1716454223238004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182765, "dur": 5, "args": { "External id": 116706, "cbid": 211, "correlation": 116706 } }, { "ph": "s", "id": 116706, "pid": 76337, "tid": -914061504, "ts": 1716454223182765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223238103, "dur": 17, "args": { "External id": 116711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116711, "pid": 5, "tid": 7, "ts": 1716454223238103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182792, "dur": 9, "args": { "External id": 116711, "cbid": 211, "correlation": 116711 } }, { "ph": "s", "id": 116711, "pid": 76337, "tid": -914061504, "ts": 1716454223182792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223238121, "dur": 13, "args": { "External id": 116719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116719, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116719, "pid": 5, "tid": 7, "ts": 1716454223238121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182824, "dur": 9, "args": { "External id": 116719, "cbid": 211, "correlation": 116719 } }, { "ph": "s", "id": 116719, "pid": 76337, "tid": -914061504, "ts": 1716454223182824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223182893, "dur": 0, "args": { "External id": 116729, "cbid": 317, "correlation": 116729 } }, { "ph": "f", "id": 116729, "pid": 76337, "tid": -914061504, "ts": 1716454223182893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223182894, "dur": 0, "args": { "External id": 116730, "cbid": 203, "correlation": 116730 } }, { "ph": "f", "id": 116730, "pid": 76337, "tid": -914061504, "ts": 1716454223182894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223182895, "dur": 0, "args": { "External id": 116731, "cbid": 205, "correlation": 116731 } }, { "ph": "f", "id": 116731, "pid": 76337, "tid": -914061504, "ts": 1716454223182895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223238135, "dur": 12, "args": { "External id": 116735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116735, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116735, "pid": 5, "tid": 7, "ts": 1716454223238135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182909, "dur": 12, "args": { "External id": 116735, "cbid": 211, "correlation": 116735 } }, { "ph": "s", "id": 116735, "pid": 76337, "tid": -914061504, "ts": 1716454223182909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223238149, "dur": 162, "args": { "External id": 116737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116737, "pid": 5, "tid": 7, "ts": 1716454223238149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182923, "dur": 5, "args": { "External id": 116737, "cbid": 211, "correlation": 116737 } }, { "ph": "s", "id": 116737, "pid": 76337, "tid": -914061504, "ts": 1716454223182923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223238313, "dur": 1, "args": { "External id": 116739, "device": 5, "context": 1, "stream": 7, "correlation": 116739, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 116739, "pid": 5, "tid": 7, "ts": 1716454223238313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223182935, "dur": 6, "args": { "External id": 116739, "cbid": 51, "correlation": 116739 } }, { "ph": "s", "id": 116739, "pid": 76337, "tid": -914061504, "ts": 1716454223182935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223238317, "dur": 198, "args": { "External id": 116740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116740, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 116740, "pid": 5, "tid": 7, "ts": 1716454223238317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182943, "dur": 8, "args": { "External id": 116740, "cbid": 211, "correlation": 116740 } }, { "ph": "s", "id": 116740, "pid": 76337, "tid": -914061504, "ts": 1716454223182943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223238516, "dur": 6, "args": { "External id": 116742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116742, "pid": 5, "tid": 7, "ts": 1716454223238516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182955, "dur": 5, "args": { "External id": 116742, "cbid": 211, "correlation": 116742 } }, { "ph": "s", "id": 116742, "pid": 76337, "tid": -914061504, "ts": 1716454223182955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223238523, "dur": 6, "args": { "External id": 116748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116748, "pid": 5, "tid": 7, "ts": 1716454223238523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223182992, "dur": 9, "args": { "External id": 116748, "cbid": 211, "correlation": 116748 } }, { "ph": "s", "id": 116748, "pid": 76337, "tid": -914061504, "ts": 1716454223182992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223238531, "dur": 11, "args": { "External id": 116768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116768, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116768, "pid": 5, "tid": 7, "ts": 1716454223238531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183087, "dur": 12, "args": { "External id": 116768, "cbid": 211, "correlation": 116768 } }, { "ph": "s", "id": 116768, "pid": 76337, "tid": -914061504, "ts": 1716454223183087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223238543, "dur": 4, "args": { "External id": 116780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116780, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116780, "pid": 5, "tid": 7, "ts": 1716454223238543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183110, "dur": 7, "args": { "External id": 116780, "cbid": 211, "correlation": 116780 } }, { "ph": "s", "id": 116780, "pid": 76337, "tid": -914061504, "ts": 1716454223183110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223238548, "dur": 8, "args": { "External id": 116783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116783, "pid": 5, "tid": 7, "ts": 1716454223238548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183129, "dur": 7, "args": { "External id": 116783, "cbid": 211, "correlation": 116783 } }, { "ph": "s", "id": 116783, "pid": 76337, "tid": -914061504, "ts": 1716454223183129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223238558, "dur": 6, "args": { "External id": 116792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116792, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116792, "pid": 5, "tid": 7, "ts": 1716454223238558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183169, "dur": 10, "args": { "External id": 116792, "cbid": 211, "correlation": 116792 } }, { "ph": "s", "id": 116792, "pid": 76337, "tid": -914061504, "ts": 1716454223183169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223183222, "dur": 0, "args": { "External id": 116802, "cbid": 317, "correlation": 116802 } }, { "ph": "f", "id": 116802, "pid": 76337, "tid": -914061504, "ts": 1716454223183222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223183223, "dur": 0, "args": { "External id": 116803, "cbid": 203, "correlation": 116803 } }, { "ph": "f", "id": 116803, "pid": 76337, "tid": -914061504, "ts": 1716454223183223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223183223, "dur": 0, "args": { "External id": 116804, "cbid": 205, "correlation": 116804 } }, { "ph": "f", "id": 116804, "pid": 76337, "tid": -914061504, "ts": 1716454223183223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223238565, "dur": 5, "args": { "External id": 116808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116808, "pid": 5, "tid": 7, "ts": 1716454223238565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183239, "dur": 11, "args": { "External id": 116808, "cbid": 211, "correlation": 116808 } }, { "ph": "s", "id": 116808, "pid": 76337, "tid": -914061504, "ts": 1716454223183239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223238571, "dur": 162, "args": { "External id": 116810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116810, "pid": 5, "tid": 7, "ts": 1716454223238571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183252, "dur": 5, "args": { "External id": 116810, "cbid": 211, "correlation": 116810 } }, { "ph": "s", "id": 116810, "pid": 76337, "tid": -914061504, "ts": 1716454223183252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223238736, "dur": 1, "args": { "External id": 116812, "device": 5, "context": 1, "stream": 7, "correlation": 116812, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 116812, "pid": 5, "tid": 7, "ts": 1716454223238736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223183263, "dur": 6, "args": { "External id": 116812, "cbid": 51, "correlation": 116812 } }, { "ph": "s", "id": 116812, "pid": 76337, "tid": -914061504, "ts": 1716454223183263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223238740, "dur": 270, "args": { "External id": 116813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116813, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116813, "pid": 5, "tid": 7, "ts": 1716454223238740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183270, "dur": 6, "args": { "External id": 116813, "cbid": 211, "correlation": 116813 } }, { "ph": "s", "id": 116813, "pid": 76337, "tid": -914061504, "ts": 1716454223183270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239011, "dur": 6, "args": { "External id": 116815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116815, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116815, "pid": 5, "tid": 7, "ts": 1716454223239011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183281, "dur": 6, "args": { "External id": 116815, "cbid": 211, "correlation": 116815 } }, { "ph": "s", "id": 116815, "pid": 76337, "tid": -914061504, "ts": 1716454223183281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223239018, "dur": 7, "args": { "External id": 116821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116821, "pid": 5, "tid": 7, "ts": 1716454223239018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183309, "dur": 8, "args": { "External id": 116821, "cbid": 211, "correlation": 116821 } }, { "ph": "s", "id": 116821, "pid": 76337, "tid": -914061504, "ts": 1716454223183309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223239026, "dur": 4, "args": { "External id": 116829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116829, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 116829, "pid": 5, "tid": 7, "ts": 1716454223239026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183353, "dur": 9, "args": { "External id": 116829, "cbid": 211, "correlation": 116829 } }, { "ph": "s", "id": 116829, "pid": 76337, "tid": -914061504, "ts": 1716454223183353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223183418, "dur": 1, "args": { "External id": 116845, "cbid": 251, "correlation": 116845 } }, { "ph": "f", "id": 116845, "pid": 76337, "tid": -914061504, "ts": 1716454223183418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223183423, "dur": 0, "args": { "External id": 116847, "cbid": 251, "correlation": 116847 } }, { "ph": "f", "id": 116847, "pid": 76337, "tid": -914061504, "ts": 1716454223183423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223239031, "dur": 13, "args": { "External id": 116848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116848, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116848, "pid": 5, "tid": 7, "ts": 1716454223239031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183425, "dur": 12, "args": { "External id": 116848, "cbid": 211, "correlation": 116848 } }, { "ph": "s", "id": 116848, "pid": 76337, "tid": -914061504, "ts": 1716454223183425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223239046, "dur": 5, "args": { "External id": 116850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116850, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116850, "pid": 5, "tid": 7, "ts": 1716454223239046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183439, "dur": 5, "args": { "External id": 116850, "cbid": 211, "correlation": 116850 } }, { "ph": "s", "id": 116850, "pid": 76337, "tid": -914061504, "ts": 1716454223183439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223239052, "dur": 6, "args": { "External id": 116860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116860, "pid": 5, "tid": 7, "ts": 1716454223239052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183496, "dur": 12, "args": { "External id": 116860, "cbid": 211, "correlation": 116860 } }, { "ph": "s", "id": 116860, "pid": 76337, "tid": -914061504, "ts": 1716454223183496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223239059, "dur": 10, "args": { "External id": 116880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116880, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116880, "pid": 5, "tid": 7, "ts": 1716454223239059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183562, "dur": 11, "args": { "External id": 116880, "cbid": 211, "correlation": 116880 } }, { "ph": "s", "id": 116880, "pid": 76337, "tid": -914061504, "ts": 1716454223183562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223239070, "dur": 4, "args": { "External id": 116892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116892, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116892, "pid": 5, "tid": 7, "ts": 1716454223239070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183583, "dur": 7, "args": { "External id": 116892, "cbid": 211, "correlation": 116892 } }, { "ph": "s", "id": 116892, "pid": 76337, "tid": -914061504, "ts": 1716454223183583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223239076, "dur": 7, "args": { "External id": 116895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116895, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116895, "pid": 5, "tid": 7, "ts": 1716454223239076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183602, "dur": 6, "args": { "External id": 116895, "cbid": 211, "correlation": 116895 } }, { "ph": "s", "id": 116895, "pid": 76337, "tid": -914061504, "ts": 1716454223183602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223239084, "dur": 5, "args": { "External id": 116904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116904, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116904, "pid": 5, "tid": 7, "ts": 1716454223239084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183643, "dur": 10, "args": { "External id": 116904, "cbid": 211, "correlation": 116904 } }, { "ph": "s", "id": 116904, "pid": 76337, "tid": -914061504, "ts": 1716454223183643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223183705, "dur": 0, "args": { "External id": 116914, "cbid": 317, "correlation": 116914 } }, { "ph": "f", "id": 116914, "pid": 76337, "tid": -914061504, "ts": 1716454223183705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223183706, "dur": 0, "args": { "External id": 116915, "cbid": 203, "correlation": 116915 } }, { "ph": "f", "id": 116915, "pid": 76337, "tid": -914061504, "ts": 1716454223183706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223183707, "dur": 0, "args": { "External id": 116916, "cbid": 205, "correlation": 116916 } }, { "ph": "f", "id": 116916, "pid": 76337, "tid": -914061504, "ts": 1716454223183707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239090, "dur": 5, "args": { "External id": 116920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116920, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116920, "pid": 5, "tid": 7, "ts": 1716454223239090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183721, "dur": 12, "args": { "External id": 116920, "cbid": 211, "correlation": 116920 } }, { "ph": "s", "id": 116920, "pid": 76337, "tid": -914061504, "ts": 1716454223183721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239096, "dur": 163, "args": { "External id": 116922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116922, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116922, "pid": 5, "tid": 7, "ts": 1716454223239096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183736, "dur": 6, "args": { "External id": 116922, "cbid": 211, "correlation": 116922 } }, { "ph": "s", "id": 116922, "pid": 76337, "tid": -914061504, "ts": 1716454223183736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223239261, "dur": 1, "args": { "External id": 116924, "device": 5, "context": 1, "stream": 7, "correlation": 116924, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 116924, "pid": 5, "tid": 7, "ts": 1716454223239261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223183747, "dur": 6, "args": { "External id": 116924, "cbid": 51, "correlation": 116924 } }, { "ph": "s", "id": 116924, "pid": 76337, "tid": -914061504, "ts": 1716454223183747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223239265, "dur": 259, "args": { "External id": 116925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116925, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 116925, "pid": 5, "tid": 7, "ts": 1716454223239265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183754, "dur": 6, "args": { "External id": 116925, "cbid": 211, "correlation": 116925 } }, { "ph": "s", "id": 116925, "pid": 76337, "tid": -914061504, "ts": 1716454223183754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239525, "dur": 6, "args": { "External id": 116927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116927, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 116927, "pid": 5, "tid": 7, "ts": 1716454223239525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183764, "dur": 5, "args": { "External id": 116927, "cbid": 211, "correlation": 116927 } }, { "ph": "s", "id": 116927, "pid": 76337, "tid": -914061504, "ts": 1716454223183764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223239532, "dur": 6, "args": { "External id": 116933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116933, "pid": 5, "tid": 7, "ts": 1716454223239532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183793, "dur": 9, "args": { "External id": 116933, "cbid": 211, "correlation": 116933 } }, { "ph": "s", "id": 116933, "pid": 76337, "tid": -914061504, "ts": 1716454223183793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223239540, "dur": 5, "args": { "External id": 116941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116941, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116941, "pid": 5, "tid": 7, "ts": 1716454223239540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183825, "dur": 8, "args": { "External id": 116941, "cbid": 211, "correlation": 116941 } }, { "ph": "s", "id": 116941, "pid": 76337, "tid": -914061504, "ts": 1716454223183825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223239546, "dur": 4, "args": { "External id": 116949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116949, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116949, "pid": 5, "tid": 7, "ts": 1716454223239546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183853, "dur": 8, "args": { "External id": 116949, "cbid": 211, "correlation": 116949 } }, { "ph": "s", "id": 116949, "pid": 76337, "tid": -914061504, "ts": 1716454223183853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223239552, "dur": 10, "args": { "External id": 116969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116969, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 116969, "pid": 5, "tid": 7, "ts": 1716454223239552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183927, "dur": 12, "args": { "External id": 116969, "cbid": 211, "correlation": 116969 } }, { "ph": "s", "id": 116969, "pid": 76337, "tid": -914061504, "ts": 1716454223183927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223239563, "dur": 4, "args": { "External id": 116981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116981, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 116981, "pid": 5, "tid": 7, "ts": 1716454223239563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183949, "dur": 6, "args": { "External id": 116981, "cbid": 211, "correlation": 116981 } }, { "ph": "s", "id": 116981, "pid": 76337, "tid": -914061504, "ts": 1716454223183949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223239568, "dur": 7, "args": { "External id": 116984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116984, "pid": 5, "tid": 7, "ts": 1716454223239568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223183967, "dur": 14, "args": { "External id": 116984, "cbid": 211, "correlation": 116984 } }, { "ph": "s", "id": 116984, "pid": 76337, "tid": -914061504, "ts": 1716454223183967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223239576, "dur": 5, "args": { "External id": 116993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 116993, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 116993, "pid": 5, "tid": 7, "ts": 1716454223239576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184014, "dur": 11, "args": { "External id": 116993, "cbid": 211, "correlation": 116993 } }, { "ph": "s", "id": 116993, "pid": 76337, "tid": -914061504, "ts": 1716454223184014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223184067, "dur": 0, "args": { "External id": 117003, "cbid": 317, "correlation": 117003 } }, { "ph": "f", "id": 117003, "pid": 76337, "tid": -914061504, "ts": 1716454223184067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223184068, "dur": 0, "args": { "External id": 117004, "cbid": 203, "correlation": 117004 } }, { "ph": "f", "id": 117004, "pid": 76337, "tid": -914061504, "ts": 1716454223184068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223184069, "dur": 0, "args": { "External id": 117005, "cbid": 205, "correlation": 117005 } }, { "ph": "f", "id": 117005, "pid": 76337, "tid": -914061504, "ts": 1716454223184069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239582, "dur": 5, "args": { "External id": 117009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117009, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117009, "pid": 5, "tid": 7, "ts": 1716454223239582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184082, "dur": 12, "args": { "External id": 117009, "cbid": 211, "correlation": 117009 } }, { "ph": "s", "id": 117009, "pid": 76337, "tid": -914061504, "ts": 1716454223184082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223239588, "dur": 161, "args": { "External id": 117011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117011, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117011, "pid": 5, "tid": 7, "ts": 1716454223239588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184096, "dur": 5, "args": { "External id": 117011, "cbid": 211, "correlation": 117011 } }, { "ph": "s", "id": 117011, "pid": 76337, "tid": -914061504, "ts": 1716454223184096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223239752, "dur": 1, "args": { "External id": 117013, "device": 5, "context": 1, "stream": 7, "correlation": 117013, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117013, "pid": 5, "tid": 7, "ts": 1716454223239752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223184107, "dur": 6, "args": { "External id": 117013, "cbid": 51, "correlation": 117013 } }, { "ph": "s", "id": 117013, "pid": 76337, "tid": -914061504, "ts": 1716454223184107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223239756, "dur": 257, "args": { "External id": 117014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117014, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117014, "pid": 5, "tid": 7, "ts": 1716454223239756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184114, "dur": 6, "args": { "External id": 117014, "cbid": 211, "correlation": 117014 } }, { "ph": "s", "id": 117014, "pid": 76337, "tid": -914061504, "ts": 1716454223184114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240014, "dur": 6, "args": { "External id": 117016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117016, "pid": 5, "tid": 7, "ts": 1716454223240014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184124, "dur": 5, "args": { "External id": 117016, "cbid": 211, "correlation": 117016 } }, { "ph": "s", "id": 117016, "pid": 76337, "tid": -914061504, "ts": 1716454223184124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223240022, "dur": 6, "args": { "External id": 117022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117022, "pid": 5, "tid": 7, "ts": 1716454223240022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184152, "dur": 8, "args": { "External id": 117022, "cbid": 211, "correlation": 117022 } }, { "ph": "s", "id": 117022, "pid": 76337, "tid": -914061504, "ts": 1716454223184152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223240029, "dur": 4, "args": { "External id": 117030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117030, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 117030, "pid": 5, "tid": 7, "ts": 1716454223240029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184196, "dur": 9, "args": { "External id": 117030, "cbid": 211, "correlation": 117030 } }, { "ph": "s", "id": 117030, "pid": 76337, "tid": -914061504, "ts": 1716454223184196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223184258, "dur": 1, "args": { "External id": 117046, "cbid": 251, "correlation": 117046 } }, { "ph": "f", "id": 117046, "pid": 76337, "tid": -914061504, "ts": 1716454223184258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223184263, "dur": 0, "args": { "External id": 117048, "cbid": 251, "correlation": 117048 } }, { "ph": "f", "id": 117048, "pid": 76337, "tid": -914061504, "ts": 1716454223184263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223240034, "dur": 10, "args": { "External id": 117049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117049, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117049, "pid": 5, "tid": 7, "ts": 1716454223240034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184265, "dur": 11, "args": { "External id": 117049, "cbid": 211, "correlation": 117049 } }, { "ph": "s", "id": 117049, "pid": 76337, "tid": -914061504, "ts": 1716454223184265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223240045, "dur": 4, "args": { "External id": 117051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117051, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117051, "pid": 5, "tid": 7, "ts": 1716454223240045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184278, "dur": 5, "args": { "External id": 117051, "cbid": 211, "correlation": 117051 } }, { "ph": "s", "id": 117051, "pid": 76337, "tid": -914061504, "ts": 1716454223184278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223240050, "dur": 6, "args": { "External id": 117061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117061, "pid": 5, "tid": 7, "ts": 1716454223240050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184334, "dur": 13, "args": { "External id": 117061, "cbid": 211, "correlation": 117061 } }, { "ph": "s", "id": 117061, "pid": 76337, "tid": -914061504, "ts": 1716454223184334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223240057, "dur": 10, "args": { "External id": 117081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117081, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117081, "pid": 5, "tid": 7, "ts": 1716454223240057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184401, "dur": 10, "args": { "External id": 117081, "cbid": 211, "correlation": 117081 } }, { "ph": "s", "id": 117081, "pid": 76337, "tid": -914061504, "ts": 1716454223184401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223240068, "dur": 4, "args": { "External id": 117093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117093, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117093, "pid": 5, "tid": 7, "ts": 1716454223240068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184421, "dur": 6, "args": { "External id": 117093, "cbid": 211, "correlation": 117093 } }, { "ph": "s", "id": 117093, "pid": 76337, "tid": -914061504, "ts": 1716454223184421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223240073, "dur": 7, "args": { "External id": 117096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117096, "pid": 5, "tid": 7, "ts": 1716454223240073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184439, "dur": 7, "args": { "External id": 117096, "cbid": 211, "correlation": 117096 } }, { "ph": "s", "id": 117096, "pid": 76337, "tid": -914061504, "ts": 1716454223184439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223240082, "dur": 5, "args": { "External id": 117105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117105, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117105, "pid": 5, "tid": 7, "ts": 1716454223240082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184479, "dur": 10, "args": { "External id": 117105, "cbid": 211, "correlation": 117105 } }, { "ph": "s", "id": 117105, "pid": 76337, "tid": -914061504, "ts": 1716454223184479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223184541, "dur": 0, "args": { "External id": 117115, "cbid": 317, "correlation": 117115 } }, { "ph": "f", "id": 117115, "pid": 76337, "tid": -914061504, "ts": 1716454223184541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223184541, "dur": 0, "args": { "External id": 117116, "cbid": 203, "correlation": 117116 } }, { "ph": "f", "id": 117116, "pid": 76337, "tid": -914061504, "ts": 1716454223184541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223184542, "dur": 0, "args": { "External id": 117117, "cbid": 205, "correlation": 117117 } }, { "ph": "f", "id": 117117, "pid": 76337, "tid": -914061504, "ts": 1716454223184542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240087, "dur": 5, "args": { "External id": 117121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117121, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117121, "pid": 5, "tid": 7, "ts": 1716454223240087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184557, "dur": 12, "args": { "External id": 117121, "cbid": 211, "correlation": 117121 } }, { "ph": "s", "id": 117121, "pid": 76337, "tid": -914061504, "ts": 1716454223184557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240094, "dur": 162, "args": { "External id": 117123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117123, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117123, "pid": 5, "tid": 7, "ts": 1716454223240094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184571, "dur": 5, "args": { "External id": 117123, "cbid": 211, "correlation": 117123 } }, { "ph": "s", "id": 117123, "pid": 76337, "tid": -914061504, "ts": 1716454223184571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223240258, "dur": 1, "args": { "External id": 117125, "device": 5, "context": 1, "stream": 7, "correlation": 117125, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117125, "pid": 5, "tid": 7, "ts": 1716454223240258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223184581, "dur": 6, "args": { "External id": 117125, "cbid": 51, "correlation": 117125 } }, { "ph": "s", "id": 117125, "pid": 76337, "tid": -914061504, "ts": 1716454223184581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223240261, "dur": 260, "args": { "External id": 117126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117126, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117126, "pid": 5, "tid": 7, "ts": 1716454223240261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184589, "dur": 6, "args": { "External id": 117126, "cbid": 211, "correlation": 117126 } }, { "ph": "s", "id": 117126, "pid": 76337, "tid": -914061504, "ts": 1716454223184589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240523, "dur": 6, "args": { "External id": 117128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117128, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117128, "pid": 5, "tid": 7, "ts": 1716454223240523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184600, "dur": 5, "args": { "External id": 117128, "cbid": 211, "correlation": 117128 } }, { "ph": "s", "id": 117128, "pid": 76337, "tid": -914061504, "ts": 1716454223184600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223240530, "dur": 6, "args": { "External id": 117134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117134, "pid": 5, "tid": 7, "ts": 1716454223240530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184627, "dur": 9, "args": { "External id": 117134, "cbid": 211, "correlation": 117134 } }, { "ph": "s", "id": 117134, "pid": 76337, "tid": -914061504, "ts": 1716454223184627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223240537, "dur": 5, "args": { "External id": 117142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117142, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117142, "pid": 5, "tid": 7, "ts": 1716454223240537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184661, "dur": 8, "args": { "External id": 117142, "cbid": 211, "correlation": 117142 } }, { "ph": "s", "id": 117142, "pid": 76337, "tid": -914061504, "ts": 1716454223184661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223240543, "dur": 4, "args": { "External id": 117150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117150, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117150, "pid": 5, "tid": 7, "ts": 1716454223240543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184690, "dur": 9, "args": { "External id": 117150, "cbid": 211, "correlation": 117150 } }, { "ph": "s", "id": 117150, "pid": 76337, "tid": -914061504, "ts": 1716454223184690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223240549, "dur": 9, "args": { "External id": 117170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117170, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117170, "pid": 5, "tid": 7, "ts": 1716454223240549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184794, "dur": 13, "args": { "External id": 117170, "cbid": 211, "correlation": 117170 } }, { "ph": "s", "id": 117170, "pid": 76337, "tid": -914061504, "ts": 1716454223184794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223240560, "dur": 4, "args": { "External id": 117182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117182, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117182, "pid": 5, "tid": 7, "ts": 1716454223240560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184817, "dur": 6, "args": { "External id": 117182, "cbid": 211, "correlation": 117182 } }, { "ph": "s", "id": 117182, "pid": 76337, "tid": -914061504, "ts": 1716454223184817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223240565, "dur": 7, "args": { "External id": 117185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117185, "pid": 5, "tid": 7, "ts": 1716454223240565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184834, "dur": 7, "args": { "External id": 117185, "cbid": 211, "correlation": 117185 } }, { "ph": "s", "id": 117185, "pid": 76337, "tid": -914061504, "ts": 1716454223184834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223240573, "dur": 5, "args": { "External id": 117194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117194, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117194, "pid": 5, "tid": 7, "ts": 1716454223240573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184872, "dur": 9, "args": { "External id": 117194, "cbid": 211, "correlation": 117194 } }, { "ph": "s", "id": 117194, "pid": 76337, "tid": -914061504, "ts": 1716454223184872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223184924, "dur": 0, "args": { "External id": 117204, "cbid": 317, "correlation": 117204 } }, { "ph": "f", "id": 117204, "pid": 76337, "tid": -914061504, "ts": 1716454223184924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223184925, "dur": 0, "args": { "External id": 117205, "cbid": 203, "correlation": 117205 } }, { "ph": "f", "id": 117205, "pid": 76337, "tid": -914061504, "ts": 1716454223184925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223184926, "dur": 0, "args": { "External id": 117206, "cbid": 205, "correlation": 117206 } }, { "ph": "f", "id": 117206, "pid": 76337, "tid": -914061504, "ts": 1716454223184926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240579, "dur": 5, "args": { "External id": 117210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117210, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117210, "pid": 5, "tid": 7, "ts": 1716454223240579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184939, "dur": 12, "args": { "External id": 117210, "cbid": 211, "correlation": 117210 } }, { "ph": "s", "id": 117210, "pid": 76337, "tid": -914061504, "ts": 1716454223184939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223240585, "dur": 162, "args": { "External id": 117212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117212, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117212, "pid": 5, "tid": 7, "ts": 1716454223240585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184953, "dur": 6, "args": { "External id": 117212, "cbid": 211, "correlation": 117212 } }, { "ph": "s", "id": 117212, "pid": 76337, "tid": -914061504, "ts": 1716454223184953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223240749, "dur": 1, "args": { "External id": 117214, "device": 5, "context": 1, "stream": 7, "correlation": 117214, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117214, "pid": 5, "tid": 7, "ts": 1716454223240749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223184964, "dur": 6, "args": { "External id": 117214, "cbid": 51, "correlation": 117214 } }, { "ph": "s", "id": 117214, "pid": 76337, "tid": -914061504, "ts": 1716454223184964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223240753, "dur": 258, "args": { "External id": 117215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117215, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117215, "pid": 5, "tid": 7, "ts": 1716454223240753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184971, "dur": 14, "args": { "External id": 117215, "cbid": 211, "correlation": 117215 } }, { "ph": "s", "id": 117215, "pid": 76337, "tid": -914061504, "ts": 1716454223184971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241012, "dur": 6, "args": { "External id": 117217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117217, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117217, "pid": 5, "tid": 7, "ts": 1716454223241012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223184990, "dur": 5, "args": { "External id": 117217, "cbid": 211, "correlation": 117217 } }, { "ph": "s", "id": 117217, "pid": 76337, "tid": -914061504, "ts": 1716454223184990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241019, "dur": 6, "args": { "External id": 117223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117223, "pid": 5, "tid": 7, "ts": 1716454223241019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185019, "dur": 8, "args": { "External id": 117223, "cbid": 211, "correlation": 117223 } }, { "ph": "s", "id": 117223, "pid": 76337, "tid": -914061504, "ts": 1716454223185019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241027, "dur": 4, "args": { "External id": 117231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117231, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 117231, "pid": 5, "tid": 7, "ts": 1716454223241027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185063, "dur": 9, "args": { "External id": 117231, "cbid": 211, "correlation": 117231 } }, { "ph": "s", "id": 117231, "pid": 76337, "tid": -914061504, "ts": 1716454223185063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223185126, "dur": 1, "args": { "External id": 117247, "cbid": 251, "correlation": 117247 } }, { "ph": "f", "id": 117247, "pid": 76337, "tid": -914061504, "ts": 1716454223185126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223185131, "dur": 0, "args": { "External id": 117249, "cbid": 251, "correlation": 117249 } }, { "ph": "f", "id": 117249, "pid": 76337, "tid": -914061504, "ts": 1716454223185131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223241032, "dur": 11, "args": { "External id": 117250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117250, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117250, "pid": 5, "tid": 7, "ts": 1716454223241032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185133, "dur": 11, "args": { "External id": 117250, "cbid": 211, "correlation": 117250 } }, { "ph": "s", "id": 117250, "pid": 76337, "tid": -914061504, "ts": 1716454223185133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223241044, "dur": 4, "args": { "External id": 117252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117252, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117252, "pid": 5, "tid": 7, "ts": 1716454223241044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185146, "dur": 5, "args": { "External id": 117252, "cbid": 211, "correlation": 117252 } }, { "ph": "s", "id": 117252, "pid": 76337, "tid": -914061504, "ts": 1716454223185146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241049, "dur": 6, "args": { "External id": 117262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117262, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117262, "pid": 5, "tid": 7, "ts": 1716454223241049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185201, "dur": 12, "args": { "External id": 117262, "cbid": 211, "correlation": 117262 } }, { "ph": "s", "id": 117262, "pid": 76337, "tid": -914061504, "ts": 1716454223185201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223241056, "dur": 10, "args": { "External id": 117282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117282, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117282, "pid": 5, "tid": 7, "ts": 1716454223241056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185268, "dur": 11, "args": { "External id": 117282, "cbid": 211, "correlation": 117282 } }, { "ph": "s", "id": 117282, "pid": 76337, "tid": -914061504, "ts": 1716454223185268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223241067, "dur": 4, "args": { "External id": 117294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117294, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117294, "pid": 5, "tid": 7, "ts": 1716454223241067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185288, "dur": 6, "args": { "External id": 117294, "cbid": 211, "correlation": 117294 } }, { "ph": "s", "id": 117294, "pid": 76337, "tid": -914061504, "ts": 1716454223185288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241072, "dur": 7, "args": { "External id": 117297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117297, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117297, "pid": 5, "tid": 7, "ts": 1716454223241072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185308, "dur": 6, "args": { "External id": 117297, "cbid": 211, "correlation": 117297 } }, { "ph": "s", "id": 117297, "pid": 76337, "tid": -914061504, "ts": 1716454223185308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241080, "dur": 5, "args": { "External id": 117306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117306, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117306, "pid": 5, "tid": 7, "ts": 1716454223241080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185348, "dur": 10, "args": { "External id": 117306, "cbid": 211, "correlation": 117306 } }, { "ph": "s", "id": 117306, "pid": 76337, "tid": -914061504, "ts": 1716454223185348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223185411, "dur": 0, "args": { "External id": 117316, "cbid": 317, "correlation": 117316 } }, { "ph": "f", "id": 117316, "pid": 76337, "tid": -914061504, "ts": 1716454223185411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223185412, "dur": 0, "args": { "External id": 117317, "cbid": 203, "correlation": 117317 } }, { "ph": "f", "id": 117317, "pid": 76337, "tid": -914061504, "ts": 1716454223185412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223185413, "dur": 0, "args": { "External id": 117318, "cbid": 205, "correlation": 117318 } }, { "ph": "f", "id": 117318, "pid": 76337, "tid": -914061504, "ts": 1716454223185413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241086, "dur": 5, "args": { "External id": 117322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117322, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117322, "pid": 5, "tid": 7, "ts": 1716454223241086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185427, "dur": 12, "args": { "External id": 117322, "cbid": 211, "correlation": 117322 } }, { "ph": "s", "id": 117322, "pid": 76337, "tid": -914061504, "ts": 1716454223185427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241092, "dur": 162, "args": { "External id": 117324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117324, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117324, "pid": 5, "tid": 7, "ts": 1716454223241092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185441, "dur": 5, "args": { "External id": 117324, "cbid": 211, "correlation": 117324 } }, { "ph": "s", "id": 117324, "pid": 76337, "tid": -914061504, "ts": 1716454223185441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223241256, "dur": 1, "args": { "External id": 117326, "device": 5, "context": 1, "stream": 7, "correlation": 117326, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117326, "pid": 5, "tid": 7, "ts": 1716454223241256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223185452, "dur": 7, "args": { "External id": 117326, "cbid": 51, "correlation": 117326 } }, { "ph": "s", "id": 117326, "pid": 76337, "tid": -914061504, "ts": 1716454223185452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223241260, "dur": 258, "args": { "External id": 117327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117327, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117327, "pid": 5, "tid": 7, "ts": 1716454223241260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185460, "dur": 6, "args": { "External id": 117327, "cbid": 211, "correlation": 117327 } }, { "ph": "s", "id": 117327, "pid": 76337, "tid": -914061504, "ts": 1716454223185460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241520, "dur": 6, "args": { "External id": 117329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117329, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117329, "pid": 5, "tid": 7, "ts": 1716454223241520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185469, "dur": 5, "args": { "External id": 117329, "cbid": 211, "correlation": 117329 } }, { "ph": "s", "id": 117329, "pid": 76337, "tid": -914061504, "ts": 1716454223185469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241527, "dur": 6, "args": { "External id": 117335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117335, "pid": 5, "tid": 7, "ts": 1716454223241527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185497, "dur": 9, "args": { "External id": 117335, "cbid": 211, "correlation": 117335 } }, { "ph": "s", "id": 117335, "pid": 76337, "tid": -914061504, "ts": 1716454223185497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241535, "dur": 5, "args": { "External id": 117343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117343, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117343, "pid": 5, "tid": 7, "ts": 1716454223241535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185530, "dur": 8, "args": { "External id": 117343, "cbid": 211, "correlation": 117343 } }, { "ph": "s", "id": 117343, "pid": 76337, "tid": -914061504, "ts": 1716454223185530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241541, "dur": 4, "args": { "External id": 117351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117351, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117351, "pid": 5, "tid": 7, "ts": 1716454223241541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185560, "dur": 8, "args": { "External id": 117351, "cbid": 211, "correlation": 117351 } }, { "ph": "s", "id": 117351, "pid": 76337, "tid": -914061504, "ts": 1716454223185560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223241547, "dur": 10, "args": { "External id": 117371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117371, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117371, "pid": 5, "tid": 7, "ts": 1716454223241547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185702, "dur": 14, "args": { "External id": 117371, "cbid": 211, "correlation": 117371 } }, { "ph": "s", "id": 117371, "pid": 76337, "tid": -914061504, "ts": 1716454223185702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223241558, "dur": 4, "args": { "External id": 117383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117383, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117383, "pid": 5, "tid": 7, "ts": 1716454223241558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185725, "dur": 6, "args": { "External id": 117383, "cbid": 211, "correlation": 117383 } }, { "ph": "s", "id": 117383, "pid": 76337, "tid": -914061504, "ts": 1716454223185725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241563, "dur": 7, "args": { "External id": 117386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117386, "pid": 5, "tid": 7, "ts": 1716454223241563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185744, "dur": 7, "args": { "External id": 117386, "cbid": 211, "correlation": 117386 } }, { "ph": "s", "id": 117386, "pid": 76337, "tid": -914061504, "ts": 1716454223185744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223185803, "dur": 0, "args": { "External id": 117397, "cbid": 317, "correlation": 117397 } }, { "ph": "f", "id": 117397, "pid": 76337, "tid": -914061504, "ts": 1716454223185803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223185804, "dur": 0, "args": { "External id": 117398, "cbid": 203, "correlation": 117398 } }, { "ph": "f", "id": 117398, "pid": 76337, "tid": -914061504, "ts": 1716454223185804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223185805, "dur": 0, "args": { "External id": 117399, "cbid": 205, "correlation": 117399 } }, { "ph": "f", "id": 117399, "pid": 76337, "tid": -914061504, "ts": 1716454223185805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241571, "dur": 5, "args": { "External id": 117403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117403, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117403, "pid": 5, "tid": 7, "ts": 1716454223241571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185823, "dur": 12, "args": { "External id": 117403, "cbid": 211, "correlation": 117403 } }, { "ph": "s", "id": 117403, "pid": 76337, "tid": -914061504, "ts": 1716454223185823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223241577, "dur": 37, "args": { "External id": 117405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117405, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 117405, "pid": 5, "tid": 7, "ts": 1716454223241577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185843, "dur": 10, "args": { "External id": 117405, "cbid": 211, "correlation": 117405 } }, { "ph": "s", "id": 117405, "pid": 76337, "tid": -914061504, "ts": 1716454223185843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223241615, "dur": 5, "args": { "External id": 117407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117407, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117407, "pid": 5, "tid": 7, "ts": 1716454223241615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185857, "dur": 5, "args": { "External id": 117407, "cbid": 211, "correlation": 117407 } }, { "ph": "s", "id": 117407, "pid": 76337, "tid": -914061504, "ts": 1716454223185857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241622, "dur": 6, "args": { "External id": 117413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117413, "pid": 5, "tid": 7, "ts": 1716454223241622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185884, "dur": 8, "args": { "External id": 117413, "cbid": 211, "correlation": 117413 } }, { "ph": "s", "id": 117413, "pid": 76337, "tid": -914061504, "ts": 1716454223185884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223241629, "dur": 21, "args": { "External id": 117422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117422, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117422, "pid": 5, "tid": 7, "ts": 1716454223241629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223185968, "dur": 22, "args": { "External id": 117422, "cbid": 211, "correlation": 117422 } }, { "ph": "s", "id": 117422, "pid": 76337, "tid": -914061504, "ts": 1716454223185968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223241651, "dur": 11, "args": { "External id": 117444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117444, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 117444, "pid": 5, "tid": 7, "ts": 1716454223241651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186034, "dur": 10, "args": { "External id": 117444, "cbid": 211, "correlation": 117444 } }, { "ph": "s", "id": 117444, "pid": 76337, "tid": -914061504, "ts": 1716454223186034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186125, "dur": 2, "args": { "External id": 117455, "cbid": 251, "correlation": 117455 } }, { "ph": "f", "id": 117455, "pid": 76337, "tid": -914061504, "ts": 1716454223186125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186129, "dur": 0, "args": { "External id": 117456, "cbid": 251, "correlation": 117456 } }, { "ph": "f", "id": 117456, "pid": 76337, "tid": -914061504, "ts": 1716454223186129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223241663, "dur": 55, "args": { "External id": 117457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117457, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 117457, "pid": 5, "tid": 7, "ts": 1716454223241663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186132, "dur": 13, "args": { "External id": 117457, "cbid": 211, "correlation": 117457 } }, { "ph": "s", "id": 117457, "pid": 76337, "tid": -914061504, "ts": 1716454223186132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186203, "dur": 1, "args": { "External id": 117468, "cbid": 251, "correlation": 117468 } }, { "ph": "f", "id": 117468, "pid": 76337, "tid": -914061504, "ts": 1716454223186203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186206, "dur": 0, "args": { "External id": 117469, "cbid": 251, "correlation": 117469 } }, { "ph": "f", "id": 117469, "pid": 76337, "tid": -914061504, "ts": 1716454223186206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223241719, "dur": 54, "args": { "External id": 117470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117470, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 117470, "pid": 5, "tid": 7, "ts": 1716454223241719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186209, "dur": 11, "args": { "External id": 117470, "cbid": 211, "correlation": 117470 } }, { "ph": "s", "id": 117470, "pid": 76337, "tid": -914061504, "ts": 1716454223186209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186273, "dur": 1, "args": { "External id": 117481, "cbid": 251, "correlation": 117481 } }, { "ph": "f", "id": 117481, "pid": 76337, "tid": -914061504, "ts": 1716454223186273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186276, "dur": 0, "args": { "External id": 117482, "cbid": 251, "correlation": 117482 } }, { "ph": "f", "id": 117482, "pid": 76337, "tid": -914061504, "ts": 1716454223186276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223241775, "dur": 53, "args": { "External id": 117483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117483, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 117483, "pid": 5, "tid": 7, "ts": 1716454223241775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186278, "dur": 11, "args": { "External id": 117483, "cbid": 211, "correlation": 117483 } }, { "ph": "s", "id": 117483, "pid": 76337, "tid": -914061504, "ts": 1716454223186278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223241829, "dur": 56, "args": { "External id": 117508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117508, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117508, "pid": 5, "tid": 7, "ts": 1716454223241829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186364, "dur": 13, "args": { "External id": 117508, "cbid": 211, "correlation": 117508 } }, { "ph": "s", "id": 117508, "pid": 76337, "tid": -914061504, "ts": 1716454223186364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186465, "dur": 1, "args": { "External id": 117526, "cbid": 251, "correlation": 117526 } }, { "ph": "f", "id": 117526, "pid": 76337, "tid": -914061504, "ts": 1716454223186465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223241887, "dur": 65, "args": { "External id": 117528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117528, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 117528, "pid": 5, "tid": 7, "ts": 1716454223241887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186471, "dur": 13, "args": { "External id": 117528, "cbid": 211, "correlation": 117528 } }, { "ph": "s", "id": 117528, "pid": 76337, "tid": -914061504, "ts": 1716454223186471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241953, "dur": 6, "args": { "External id": 117536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117536, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117536, "pid": 5, "tid": 7, "ts": 1716454223241953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186540, "dur": 12, "args": { "External id": 117536, "cbid": 211, "correlation": 117536 } }, { "ph": "s", "id": 117536, "pid": 76337, "tid": -914061504, "ts": 1716454223186540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223241960, "dur": 8, "args": { "External id": 117544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117544, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117544, "pid": 5, "tid": 7, "ts": 1716454223241960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186579, "dur": 8, "args": { "External id": 117544, "cbid": 211, "correlation": 117544 } }, { "ph": "s", "id": 117544, "pid": 76337, "tid": -914061504, "ts": 1716454223186579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223241969, "dur": 7, "args": { "External id": 117555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117555, "pid": 5, "tid": 7, "ts": 1716454223241969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186651, "dur": 14, "args": { "External id": 117555, "cbid": 211, "correlation": 117555 } }, { "ph": "s", "id": 117555, "pid": 76337, "tid": -914061504, "ts": 1716454223186651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223241977, "dur": 9, "args": { "External id": 117577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117577, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 117577, "pid": 5, "tid": 7, "ts": 1716454223241977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186684, "dur": 8, "args": { "External id": 117577, "cbid": 211, "correlation": 117577 } }, { "ph": "s", "id": 117577, "pid": 76337, "tid": -914061504, "ts": 1716454223186684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186770, "dur": 2, "args": { "External id": 117588, "cbid": 251, "correlation": 117588 } }, { "ph": "f", "id": 117588, "pid": 76337, "tid": -914061504, "ts": 1716454223186770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223241989, "dur": 1, "args": { "External id": 117589, "device": 5, "context": 1, "stream": 7, "correlation": 117589, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 117589, "pid": 5, "tid": 7, "ts": 1716454223241989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223186776, "dur": 10, "args": { "External id": 117589, "cbid": 51, "correlation": 117589 } }, { "ph": "s", "id": 117589, "pid": 76337, "tid": -914061504, "ts": 1716454223186776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223241992, "dur": 36, "args": { "External id": 117590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117590, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 117590, "pid": 5, "tid": 7, "ts": 1716454223241992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186788, "dur": 12, "args": { "External id": 117590, "cbid": 211, "correlation": 117590 } }, { "ph": "s", "id": 117590, "pid": 76337, "tid": -914061504, "ts": 1716454223186788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186858, "dur": 1, "args": { "External id": 117601, "cbid": 251, "correlation": 117601 } }, { "ph": "f", "id": 117601, "pid": 76337, "tid": -914061504, "ts": 1716454223186858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186863, "dur": 0, "args": { "External id": 117602, "cbid": 251, "correlation": 117602 } }, { "ph": "f", "id": 117602, "pid": 76337, "tid": -914061504, "ts": 1716454223186863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223242030, "dur": 11, "args": { "External id": 117603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117603, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117603, "pid": 5, "tid": 7, "ts": 1716454223242030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186864, "dur": 13, "args": { "External id": 117603, "cbid": 211, "correlation": 117603 } }, { "ph": "s", "id": 117603, "pid": 76337, "tid": -914061504, "ts": 1716454223186864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223242042, "dur": 5, "args": { "External id": 117605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117605, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117605, "pid": 5, "tid": 7, "ts": 1716454223242042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186879, "dur": 6, "args": { "External id": 117605, "cbid": 211, "correlation": 117605 } }, { "ph": "s", "id": 117605, "pid": 76337, "tid": -914061504, "ts": 1716454223186879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186936, "dur": 1, "args": { "External id": 117616, "cbid": 251, "correlation": 117616 } }, { "ph": "f", "id": 117616, "pid": 76337, "tid": -914061504, "ts": 1716454223186936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223186939, "dur": 0, "args": { "External id": 117617, "cbid": 251, "correlation": 117617 } }, { "ph": "f", "id": 117617, "pid": 76337, "tid": -914061504, "ts": 1716454223186939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223242049, "dur": 8, "args": { "External id": 117618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117618, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117618, "pid": 5, "tid": 7, "ts": 1716454223242049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186941, "dur": 12, "args": { "External id": 117618, "cbid": 211, "correlation": 117618 } }, { "ph": "s", "id": 117618, "pid": 76337, "tid": -914061504, "ts": 1716454223186941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223242059, "dur": 4, "args": { "External id": 117620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117620, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117620, "pid": 5, "tid": 7, "ts": 1716454223242059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223186954, "dur": 5, "args": { "External id": 117620, "cbid": 211, "correlation": 117620 } }, { "ph": "s", "id": 117620, "pid": 76337, "tid": -914061504, "ts": 1716454223186954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223242063, "dur": 19, "args": { "External id": 117645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117645, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 117645, "pid": 5, "tid": 7, "ts": 1716454223242063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187040, "dur": 13, "args": { "External id": 117645, "cbid": 211, "correlation": 117645 } }, { "ph": "s", "id": 117645, "pid": 76337, "tid": -914061504, "ts": 1716454223187040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223187141, "dur": 2, "args": { "External id": 117663, "cbid": 251, "correlation": 117663 } }, { "ph": "f", "id": 117663, "pid": 76337, "tid": -914061504, "ts": 1716454223187141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223242085, "dur": 1, "args": { "External id": 117665, "device": 5, "context": 1, "stream": 7, "correlation": 117665, "bytes": 480, "memory bandwidth (GB/s)": 0.3125 } }, { "ph": "f", "id": 117665, "pid": 5, "tid": 7, "ts": 1716454223242085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223187147, "dur": 10, "args": { "External id": 117665, "cbid": 51, "correlation": 117665 } }, { "ph": "s", "id": 117665, "pid": 76337, "tid": -914061504, "ts": 1716454223187147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223242089, "dur": 37, "args": { "External id": 117666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117666, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 117666, "pid": 5, "tid": 7, "ts": 1716454223242089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187159, "dur": 14, "args": { "External id": 117666, "cbid": 211, "correlation": 117666 } }, { "ph": "s", "id": 117666, "pid": 76337, "tid": -914061504, "ts": 1716454223187159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223242127, "dur": 4, "args": { "External id": 117674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117674, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117674, "pid": 5, "tid": 7, "ts": 1716454223242127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187232, "dur": 12, "args": { "External id": 117674, "cbid": 211, "correlation": 117674 } }, { "ph": "s", "id": 117674, "pid": 76337, "tid": -914061504, "ts": 1716454223187232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242132, "dur": 8, "args": { "External id": 117682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117682, "pid": 5, "tid": 7, "ts": 1716454223242132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187274, "dur": 9, "args": { "External id": 117682, "cbid": 211, "correlation": 117682 } }, { "ph": "s", "id": 117682, "pid": 76337, "tid": -914061504, "ts": 1716454223187274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223242142, "dur": 8, "args": { "External id": 117704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117704, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 117704, "pid": 5, "tid": 7, "ts": 1716454223242142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187325, "dur": 10, "args": { "External id": 117704, "cbid": 211, "correlation": 117704 } }, { "ph": "s", "id": 117704, "pid": 76337, "tid": -914061504, "ts": 1716454223187325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223187415, "dur": 1, "args": { "External id": 117720, "cbid": 251, "correlation": 117720 } }, { "ph": "f", "id": 117720, "pid": 76337, "tid": -914061504, "ts": 1716454223187415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223187420, "dur": 0, "args": { "External id": 117722, "cbid": 251, "correlation": 117722 } }, { "ph": "f", "id": 117722, "pid": 76337, "tid": -914061504, "ts": 1716454223187420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223242151, "dur": 187, "args": { "External id": 117723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117723, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117723, "pid": 5, "tid": 7, "ts": 1716454223242151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187422, "dur": 13, "args": { "External id": 117723, "cbid": 211, "correlation": 117723 } }, { "ph": "s", "id": 117723, "pid": 76337, "tid": -914061504, "ts": 1716454223187422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242340, "dur": 22, "args": { "External id": 117731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117731, "pid": 5, "tid": 7, "ts": 1716454223242340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187488, "dur": 13, "args": { "External id": 117731, "cbid": 211, "correlation": 117731 } }, { "ph": "s", "id": 117731, "pid": 76337, "tid": -914061504, "ts": 1716454223187488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242364, "dur": 23, "args": { "External id": 117739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117739, "pid": 5, "tid": 7, "ts": 1716454223242364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187519, "dur": 9, "args": { "External id": 117739, "cbid": 211, "correlation": 117739 } }, { "ph": "s", "id": 117739, "pid": 76337, "tid": -914061504, "ts": 1716454223187519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223187602, "dur": 1, "args": { "External id": 117755, "cbid": 251, "correlation": 117755 } }, { "ph": "f", "id": 117755, "pid": 76337, "tid": -914061504, "ts": 1716454223187602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223242389, "dur": 1, "args": { "External id": 117757, "device": 5, "context": 1, "stream": 7, "correlation": 117757, "bytes": 120, "memory bandwidth (GB/s)": 0.07653061224489796 } }, { "ph": "f", "id": 117757, "pid": 5, "tid": 7, "ts": 1716454223242389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223187607, "dur": 9, "args": { "External id": 117757, "cbid": 51, "correlation": 117757 } }, { "ph": "s", "id": 117757, "pid": 76337, "tid": -914061504, "ts": 1716454223187607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223242392, "dur": 109, "args": { "External id": 117758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117758, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 117758, "pid": 5, "tid": 7, "ts": 1716454223242392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187617, "dur": 13, "args": { "External id": 117758, "cbid": 211, "correlation": 117758 } }, { "ph": "s", "id": 117758, "pid": 76337, "tid": -914061504, "ts": 1716454223187617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223242503, "dur": 5, "args": { "External id": 117766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117766, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117766, "pid": 5, "tid": 7, "ts": 1716454223242503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187660, "dur": 11, "args": { "External id": 117766, "cbid": 211, "correlation": 117766 } }, { "ph": "s", "id": 117766, "pid": 76337, "tid": -914061504, "ts": 1716454223187660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242509, "dur": 10, "args": { "External id": 117777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117777, "pid": 5, "tid": 7, "ts": 1716454223242509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187729, "dur": 12, "args": { "External id": 117777, "cbid": 211, "correlation": 117777 } }, { "ph": "s", "id": 117777, "pid": 76337, "tid": -914061504, "ts": 1716454223187729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223187795, "dur": 0, "args": { "External id": 117789, "cbid": 317, "correlation": 117789 } }, { "ph": "f", "id": 117789, "pid": 76337, "tid": -914061504, "ts": 1716454223187795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223187796, "dur": 0, "args": { "External id": 117790, "cbid": 203, "correlation": 117790 } }, { "ph": "f", "id": 117790, "pid": 76337, "tid": -914061504, "ts": 1716454223187796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223187796, "dur": 0, "args": { "External id": 117791, "cbid": 205, "correlation": 117791 } }, { "ph": "f", "id": 117791, "pid": 76337, "tid": -914061504, "ts": 1716454223187796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223242520, "dur": 6, "args": { "External id": 117795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117795, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117795, "pid": 5, "tid": 7, "ts": 1716454223242520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187811, "dur": 13, "args": { "External id": 117795, "cbid": 211, "correlation": 117795 } }, { "ph": "s", "id": 117795, "pid": 76337, "tid": -914061504, "ts": 1716454223187811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223242527, "dur": 37, "args": { "External id": 117797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117797, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 117797, "pid": 5, "tid": 7, "ts": 1716454223242527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187831, "dur": 7, "args": { "External id": 117797, "cbid": 211, "correlation": 117797 } }, { "ph": "s", "id": 117797, "pid": 76337, "tid": -914061504, "ts": 1716454223187831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223242565, "dur": 6, "args": { "External id": 117799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117799, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117799, "pid": 5, "tid": 7, "ts": 1716454223242565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187841, "dur": 5, "args": { "External id": 117799, "cbid": 211, "correlation": 117799 } }, { "ph": "s", "id": 117799, "pid": 76337, "tid": -914061504, "ts": 1716454223187841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242573, "dur": 7, "args": { "External id": 117805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117805, "pid": 5, "tid": 7, "ts": 1716454223242573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187868, "dur": 10, "args": { "External id": 117805, "cbid": 211, "correlation": 117805 } }, { "ph": "s", "id": 117805, "pid": 76337, "tid": -914061504, "ts": 1716454223187868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223242581, "dur": 5, "args": { "External id": 117813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117813, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117813, "pid": 5, "tid": 7, "ts": 1716454223242581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187900, "dur": 8, "args": { "External id": 117813, "cbid": 211, "correlation": 117813 } }, { "ph": "s", "id": 117813, "pid": 76337, "tid": -914061504, "ts": 1716454223187900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223242587, "dur": 11, "args": { "External id": 117833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117833, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117833, "pid": 5, "tid": 7, "ts": 1716454223242587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223187972, "dur": 20, "args": { "External id": 117833, "cbid": 211, "correlation": 117833 } }, { "ph": "s", "id": 117833, "pid": 76337, "tid": -914061504, "ts": 1716454223187972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223242600, "dur": 4, "args": { "External id": 117845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117845, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117845, "pid": 5, "tid": 7, "ts": 1716454223242600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188002, "dur": 6, "args": { "External id": 117845, "cbid": 211, "correlation": 117845 } }, { "ph": "s", "id": 117845, "pid": 76337, "tid": -914061504, "ts": 1716454223188002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223242605, "dur": 8, "args": { "External id": 117848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117848, "pid": 5, "tid": 7, "ts": 1716454223242605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188021, "dur": 7, "args": { "External id": 117848, "cbid": 211, "correlation": 117848 } }, { "ph": "s", "id": 117848, "pid": 76337, "tid": -914061504, "ts": 1716454223188021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223242615, "dur": 5, "args": { "External id": 117857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117857, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117857, "pid": 5, "tid": 7, "ts": 1716454223242615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188061, "dur": 10, "args": { "External id": 117857, "cbid": 211, "correlation": 117857 } }, { "ph": "s", "id": 117857, "pid": 76337, "tid": -914061504, "ts": 1716454223188061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223188112, "dur": 0, "args": { "External id": 117867, "cbid": 317, "correlation": 117867 } }, { "ph": "f", "id": 117867, "pid": 76337, "tid": -914061504, "ts": 1716454223188112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223188113, "dur": 0, "args": { "External id": 117868, "cbid": 203, "correlation": 117868 } }, { "ph": "f", "id": 117868, "pid": 76337, "tid": -914061504, "ts": 1716454223188113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223188114, "dur": 0, "args": { "External id": 117869, "cbid": 205, "correlation": 117869 } }, { "ph": "f", "id": 117869, "pid": 76337, "tid": -914061504, "ts": 1716454223188114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223242621, "dur": 5, "args": { "External id": 117873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117873, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117873, "pid": 5, "tid": 7, "ts": 1716454223242621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188128, "dur": 11, "args": { "External id": 117873, "cbid": 211, "correlation": 117873 } }, { "ph": "s", "id": 117873, "pid": 76337, "tid": -914061504, "ts": 1716454223188128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223242628, "dur": 162, "args": { "External id": 117875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117875, "pid": 5, "tid": 7, "ts": 1716454223242628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188142, "dur": 5, "args": { "External id": 117875, "cbid": 211, "correlation": 117875 } }, { "ph": "s", "id": 117875, "pid": 76337, "tid": -914061504, "ts": 1716454223188142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223242791, "dur": 1, "args": { "External id": 117877, "device": 5, "context": 1, "stream": 7, "correlation": 117877, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117877, "pid": 5, "tid": 7, "ts": 1716454223242791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223188152, "dur": 6, "args": { "External id": 117877, "cbid": 51, "correlation": 117877 } }, { "ph": "s", "id": 117877, "pid": 76337, "tid": -914061504, "ts": 1716454223188152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223242795, "dur": 268, "args": { "External id": 117878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117878, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117878, "pid": 5, "tid": 7, "ts": 1716454223242795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188159, "dur": 6, "args": { "External id": 117878, "cbid": 211, "correlation": 117878 } }, { "ph": "s", "id": 117878, "pid": 76337, "tid": -914061504, "ts": 1716454223188159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243065, "dur": 6, "args": { "External id": 117880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117880, "pid": 5, "tid": 7, "ts": 1716454223243065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188169, "dur": 6, "args": { "External id": 117880, "cbid": 211, "correlation": 117880 } }, { "ph": "s", "id": 117880, "pid": 76337, "tid": -914061504, "ts": 1716454223188169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223243072, "dur": 6, "args": { "External id": 117886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117886, "pid": 5, "tid": 7, "ts": 1716454223243072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188198, "dur": 8, "args": { "External id": 117886, "cbid": 211, "correlation": 117886 } }, { "ph": "s", "id": 117886, "pid": 76337, "tid": -914061504, "ts": 1716454223188198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223243079, "dur": 4, "args": { "External id": 117894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117894, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 117894, "pid": 5, "tid": 7, "ts": 1716454223243079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188242, "dur": 9, "args": { "External id": 117894, "cbid": 211, "correlation": 117894 } }, { "ph": "s", "id": 117894, "pid": 76337, "tid": -914061504, "ts": 1716454223188242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223188308, "dur": 1, "args": { "External id": 117910, "cbid": 251, "correlation": 117910 } }, { "ph": "f", "id": 117910, "pid": 76337, "tid": -914061504, "ts": 1716454223188308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223188313, "dur": 0, "args": { "External id": 117912, "cbid": 251, "correlation": 117912 } }, { "ph": "f", "id": 117912, "pid": 76337, "tid": -914061504, "ts": 1716454223188313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223243085, "dur": 12, "args": { "External id": 117913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117913, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117913, "pid": 5, "tid": 7, "ts": 1716454223243085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188315, "dur": 11, "args": { "External id": 117913, "cbid": 211, "correlation": 117913 } }, { "ph": "s", "id": 117913, "pid": 76337, "tid": -914061504, "ts": 1716454223188315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223243099, "dur": 5, "args": { "External id": 117915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117915, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117915, "pid": 5, "tid": 7, "ts": 1716454223243099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188328, "dur": 5, "args": { "External id": 117915, "cbid": 211, "correlation": 117915 } }, { "ph": "s", "id": 117915, "pid": 76337, "tid": -914061504, "ts": 1716454223188328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223243105, "dur": 6, "args": { "External id": 117925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117925, "pid": 5, "tid": 7, "ts": 1716454223243105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188386, "dur": 12, "args": { "External id": 117925, "cbid": 211, "correlation": 117925 } }, { "ph": "s", "id": 117925, "pid": 76337, "tid": -914061504, "ts": 1716454223188386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223243112, "dur": 10, "args": { "External id": 117945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117945, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 117945, "pid": 5, "tid": 7, "ts": 1716454223243112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188452, "dur": 10, "args": { "External id": 117945, "cbid": 211, "correlation": 117945 } }, { "ph": "s", "id": 117945, "pid": 76337, "tid": -914061504, "ts": 1716454223188452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223243123, "dur": 4, "args": { "External id": 117957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117957, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 117957, "pid": 5, "tid": 7, "ts": 1716454223243123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188472, "dur": 7, "args": { "External id": 117957, "cbid": 211, "correlation": 117957 } }, { "ph": "s", "id": 117957, "pid": 76337, "tid": -914061504, "ts": 1716454223188472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223243128, "dur": 7, "args": { "External id": 117960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117960, "pid": 5, "tid": 7, "ts": 1716454223243128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188492, "dur": 6, "args": { "External id": 117960, "cbid": 211, "correlation": 117960 } }, { "ph": "s", "id": 117960, "pid": 76337, "tid": -914061504, "ts": 1716454223188492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223243136, "dur": 5, "args": { "External id": 117969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117969, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117969, "pid": 5, "tid": 7, "ts": 1716454223243136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188532, "dur": 10, "args": { "External id": 117969, "cbid": 211, "correlation": 117969 } }, { "ph": "s", "id": 117969, "pid": 76337, "tid": -914061504, "ts": 1716454223188532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223188595, "dur": 0, "args": { "External id": 117979, "cbid": 317, "correlation": 117979 } }, { "ph": "f", "id": 117979, "pid": 76337, "tid": -914061504, "ts": 1716454223188595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223188596, "dur": 0, "args": { "External id": 117980, "cbid": 203, "correlation": 117980 } }, { "ph": "f", "id": 117980, "pid": 76337, "tid": -914061504, "ts": 1716454223188596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223188597, "dur": 0, "args": { "External id": 117981, "cbid": 205, "correlation": 117981 } }, { "ph": "f", "id": 117981, "pid": 76337, "tid": -914061504, "ts": 1716454223188597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243142, "dur": 5, "args": { "External id": 117985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117985, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117985, "pid": 5, "tid": 7, "ts": 1716454223243142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188610, "dur": 12, "args": { "External id": 117985, "cbid": 211, "correlation": 117985 } }, { "ph": "s", "id": 117985, "pid": 76337, "tid": -914061504, "ts": 1716454223188610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243149, "dur": 163, "args": { "External id": 117987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117987, "pid": 5, "tid": 7, "ts": 1716454223243149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188624, "dur": 6, "args": { "External id": 117987, "cbid": 211, "correlation": 117987 } }, { "ph": "s", "id": 117987, "pid": 76337, "tid": -914061504, "ts": 1716454223188624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223243314, "dur": 1, "args": { "External id": 117989, "device": 5, "context": 1, "stream": 7, "correlation": 117989, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 117989, "pid": 5, "tid": 7, "ts": 1716454223243314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223188636, "dur": 6, "args": { "External id": 117989, "cbid": 51, "correlation": 117989 } }, { "ph": "s", "id": 117989, "pid": 76337, "tid": -914061504, "ts": 1716454223188636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223243317, "dur": 259, "args": { "External id": 117990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117990, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 117990, "pid": 5, "tid": 7, "ts": 1716454223243317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188643, "dur": 6, "args": { "External id": 117990, "cbid": 211, "correlation": 117990 } }, { "ph": "s", "id": 117990, "pid": 76337, "tid": -914061504, "ts": 1716454223188643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243578, "dur": 6, "args": { "External id": 117992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 117992, "pid": 5, "tid": 7, "ts": 1716454223243578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188653, "dur": 5, "args": { "External id": 117992, "cbid": 211, "correlation": 117992 } }, { "ph": "s", "id": 117992, "pid": 76337, "tid": -914061504, "ts": 1716454223188653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223243585, "dur": 6, "args": { "External id": 117998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 117998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 117998, "pid": 5, "tid": 7, "ts": 1716454223243585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188682, "dur": 9, "args": { "External id": 117998, "cbid": 211, "correlation": 117998 } }, { "ph": "s", "id": 117998, "pid": 76337, "tid": -914061504, "ts": 1716454223188682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223243592, "dur": 5, "args": { "External id": 118006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118006, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118006, "pid": 5, "tid": 7, "ts": 1716454223243592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188714, "dur": 8, "args": { "External id": 118006, "cbid": 211, "correlation": 118006 } }, { "ph": "s", "id": 118006, "pid": 76337, "tid": -914061504, "ts": 1716454223188714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223243599, "dur": 4, "args": { "External id": 118014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118014, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118014, "pid": 5, "tid": 7, "ts": 1716454223243599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188744, "dur": 8, "args": { "External id": 118014, "cbid": 211, "correlation": 118014 } }, { "ph": "s", "id": 118014, "pid": 76337, "tid": -914061504, "ts": 1716454223188744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223243605, "dur": 12, "args": { "External id": 118023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118023, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118023, "pid": 5, "tid": 7, "ts": 1716454223243605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188832, "dur": 13, "args": { "External id": 118023, "cbid": 211, "correlation": 118023 } }, { "ph": "s", "id": 118023, "pid": 76337, "tid": -914061504, "ts": 1716454223188832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223243618, "dur": 12, "args": { "External id": 118043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118043, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118043, "pid": 5, "tid": 7, "ts": 1716454223243618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188900, "dur": 11, "args": { "External id": 118043, "cbid": 211, "correlation": 118043 } }, { "ph": "s", "id": 118043, "pid": 76337, "tid": -914061504, "ts": 1716454223188900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223243631, "dur": 4, "args": { "External id": 118055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118055, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118055, "pid": 5, "tid": 7, "ts": 1716454223243631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188921, "dur": 6, "args": { "External id": 118055, "cbid": 211, "correlation": 118055 } }, { "ph": "s", "id": 118055, "pid": 76337, "tid": -914061504, "ts": 1716454223188921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223243637, "dur": 10, "args": { "External id": 118058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118058, "pid": 5, "tid": 7, "ts": 1716454223243637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188940, "dur": 7, "args": { "External id": 118058, "cbid": 211, "correlation": 118058 } }, { "ph": "s", "id": 118058, "pid": 76337, "tid": -914061504, "ts": 1716454223188940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223243648, "dur": 6, "args": { "External id": 118067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118067, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118067, "pid": 5, "tid": 7, "ts": 1716454223243648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223188986, "dur": 10, "args": { "External id": 118067, "cbid": 211, "correlation": 118067 } }, { "ph": "s", "id": 118067, "pid": 76337, "tid": -914061504, "ts": 1716454223188986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223189041, "dur": 0, "args": { "External id": 118077, "cbid": 317, "correlation": 118077 } }, { "ph": "f", "id": 118077, "pid": 76337, "tid": -914061504, "ts": 1716454223189041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223189042, "dur": 0, "args": { "External id": 118078, "cbid": 203, "correlation": 118078 } }, { "ph": "f", "id": 118078, "pid": 76337, "tid": -914061504, "ts": 1716454223189042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223189043, "dur": 0, "args": { "External id": 118079, "cbid": 205, "correlation": 118079 } }, { "ph": "f", "id": 118079, "pid": 76337, "tid": -914061504, "ts": 1716454223189043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243656, "dur": 7, "args": { "External id": 118083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118083, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118083, "pid": 5, "tid": 7, "ts": 1716454223243656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189058, "dur": 11, "args": { "External id": 118083, "cbid": 211, "correlation": 118083 } }, { "ph": "s", "id": 118083, "pid": 76337, "tid": -914061504, "ts": 1716454223189058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223243663, "dur": 321, "args": { "External id": 118085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118085, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118085, "pid": 5, "tid": 7, "ts": 1716454223243663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189072, "dur": 5, "args": { "External id": 118085, "cbid": 211, "correlation": 118085 } }, { "ph": "s", "id": 118085, "pid": 76337, "tid": -914061504, "ts": 1716454223189072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223243986, "dur": 1, "args": { "External id": 118087, "device": 5, "context": 1, "stream": 7, "correlation": 118087, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 118087, "pid": 5, "tid": 7, "ts": 1716454223243986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223189083, "dur": 8, "args": { "External id": 118087, "cbid": 51, "correlation": 118087 } }, { "ph": "s", "id": 118087, "pid": 76337, "tid": -914061504, "ts": 1716454223189083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223243990, "dur": 494, "args": { "External id": 118088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118088, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118088, "pid": 5, "tid": 7, "ts": 1716454223243990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189092, "dur": 6, "args": { "External id": 118088, "cbid": 211, "correlation": 118088 } }, { "ph": "s", "id": 118088, "pid": 76337, "tid": -914061504, "ts": 1716454223189092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223244486, "dur": 5, "args": { "External id": 118090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118090, "pid": 5, "tid": 7, "ts": 1716454223244486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189102, "dur": 5, "args": { "External id": 118090, "cbid": 211, "correlation": 118090 } }, { "ph": "s", "id": 118090, "pid": 76337, "tid": -914061504, "ts": 1716454223189102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223244492, "dur": 6, "args": { "External id": 118096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118096, "pid": 5, "tid": 7, "ts": 1716454223244492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189132, "dur": 9, "args": { "External id": 118096, "cbid": 211, "correlation": 118096 } }, { "ph": "s", "id": 118096, "pid": 76337, "tid": -914061504, "ts": 1716454223189132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223244500, "dur": 4, "args": { "External id": 118104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118104, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 118104, "pid": 5, "tid": 7, "ts": 1716454223244500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189175, "dur": 10, "args": { "External id": 118104, "cbid": 211, "correlation": 118104 } }, { "ph": "s", "id": 118104, "pid": 76337, "tid": -914061504, "ts": 1716454223189175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223189240, "dur": 1, "args": { "External id": 118120, "cbid": 251, "correlation": 118120 } }, { "ph": "f", "id": 118120, "pid": 76337, "tid": -914061504, "ts": 1716454223189240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223189245, "dur": 0, "args": { "External id": 118122, "cbid": 251, "correlation": 118122 } }, { "ph": "f", "id": 118122, "pid": 76337, "tid": -914061504, "ts": 1716454223189245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223244505, "dur": 12, "args": { "External id": 118123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118123, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118123, "pid": 5, "tid": 7, "ts": 1716454223244505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189247, "dur": 11, "args": { "External id": 118123, "cbid": 211, "correlation": 118123 } }, { "ph": "s", "id": 118123, "pid": 76337, "tid": -914061504, "ts": 1716454223189247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223244518, "dur": 5, "args": { "External id": 118125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118125, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118125, "pid": 5, "tid": 7, "ts": 1716454223244518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189259, "dur": 5, "args": { "External id": 118125, "cbid": 211, "correlation": 118125 } }, { "ph": "s", "id": 118125, "pid": 76337, "tid": -914061504, "ts": 1716454223189259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223244524, "dur": 6, "args": { "External id": 118135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118135, "pid": 5, "tid": 7, "ts": 1716454223244524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189316, "dur": 12, "args": { "External id": 118135, "cbid": 211, "correlation": 118135 } }, { "ph": "s", "id": 118135, "pid": 76337, "tid": -914061504, "ts": 1716454223189316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223244531, "dur": 10, "args": { "External id": 118155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118155, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118155, "pid": 5, "tid": 7, "ts": 1716454223244531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189381, "dur": 11, "args": { "External id": 118155, "cbid": 211, "correlation": 118155 } }, { "ph": "s", "id": 118155, "pid": 76337, "tid": -914061504, "ts": 1716454223189381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223244542, "dur": 4, "args": { "External id": 118167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118167, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 118167, "pid": 5, "tid": 7, "ts": 1716454223244542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189402, "dur": 6, "args": { "External id": 118167, "cbid": 211, "correlation": 118167 } }, { "ph": "s", "id": 118167, "pid": 76337, "tid": -914061504, "ts": 1716454223189402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223244547, "dur": 7, "args": { "External id": 118170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118170, "pid": 5, "tid": 7, "ts": 1716454223244547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189421, "dur": 6, "args": { "External id": 118170, "cbid": 211, "correlation": 118170 } }, { "ph": "s", "id": 118170, "pid": 76337, "tid": -914061504, "ts": 1716454223189421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223244555, "dur": 5, "args": { "External id": 118179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118179, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118179, "pid": 5, "tid": 7, "ts": 1716454223244555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189461, "dur": 10, "args": { "External id": 118179, "cbid": 211, "correlation": 118179 } }, { "ph": "s", "id": 118179, "pid": 76337, "tid": -914061504, "ts": 1716454223189461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223189523, "dur": 0, "args": { "External id": 118189, "cbid": 317, "correlation": 118189 } }, { "ph": "f", "id": 118189, "pid": 76337, "tid": -914061504, "ts": 1716454223189523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223189524, "dur": 0, "args": { "External id": 118190, "cbid": 203, "correlation": 118190 } }, { "ph": "f", "id": 118190, "pid": 76337, "tid": -914061504, "ts": 1716454223189524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223189525, "dur": 0, "args": { "External id": 118191, "cbid": 205, "correlation": 118191 } }, { "ph": "f", "id": 118191, "pid": 76337, "tid": -914061504, "ts": 1716454223189525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223244561, "dur": 5, "args": { "External id": 118195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118195, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118195, "pid": 5, "tid": 7, "ts": 1716454223244561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189539, "dur": 12, "args": { "External id": 118195, "cbid": 211, "correlation": 118195 } }, { "ph": "s", "id": 118195, "pid": 76337, "tid": -914061504, "ts": 1716454223189539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223244567, "dur": 163, "args": { "External id": 118197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118197, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118197, "pid": 5, "tid": 7, "ts": 1716454223244567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189553, "dur": 5, "args": { "External id": 118197, "cbid": 211, "correlation": 118197 } }, { "ph": "s", "id": 118197, "pid": 76337, "tid": -914061504, "ts": 1716454223189553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223244733, "dur": 1, "args": { "External id": 118199, "device": 5, "context": 1, "stream": 7, "correlation": 118199, "bytes": 240, "memory bandwidth (GB/s)": 0.14705882352941177 } }, { "ph": "f", "id": 118199, "pid": 5, "tid": 7, "ts": 1716454223244733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223189564, "dur": 6, "args": { "External id": 118199, "cbid": 51, "correlation": 118199 } }, { "ph": "s", "id": 118199, "pid": 76337, "tid": -914061504, "ts": 1716454223189564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223244737, "dur": 258, "args": { "External id": 118200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118200, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118200, "pid": 5, "tid": 7, "ts": 1716454223244737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189571, "dur": 6, "args": { "External id": 118200, "cbid": 211, "correlation": 118200 } }, { "ph": "s", "id": 118200, "pid": 76337, "tid": -914061504, "ts": 1716454223189571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223244996, "dur": 6, "args": { "External id": 118202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118202, "pid": 5, "tid": 7, "ts": 1716454223244996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189581, "dur": 6, "args": { "External id": 118202, "cbid": 211, "correlation": 118202 } }, { "ph": "s", "id": 118202, "pid": 76337, "tid": -914061504, "ts": 1716454223189581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223245003, "dur": 6, "args": { "External id": 118208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118208, "pid": 5, "tid": 7, "ts": 1716454223245003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189610, "dur": 9, "args": { "External id": 118208, "cbid": 211, "correlation": 118208 } }, { "ph": "s", "id": 118208, "pid": 76337, "tid": -914061504, "ts": 1716454223189610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223189669, "dur": 0, "args": { "External id": 118218, "cbid": 317, "correlation": 118218 } }, { "ph": "f", "id": 118218, "pid": 76337, "tid": -914061504, "ts": 1716454223189669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223189670, "dur": 0, "args": { "External id": 118219, "cbid": 203, "correlation": 118219 } }, { "ph": "f", "id": 118219, "pid": 76337, "tid": -914061504, "ts": 1716454223189670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223189671, "dur": 0, "args": { "External id": 118220, "cbid": 205, "correlation": 118220 } }, { "ph": "f", "id": 118220, "pid": 76337, "tid": -914061504, "ts": 1716454223189671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223245011, "dur": 7, "args": { "External id": 118224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118224, "pid": 5, "tid": 7, "ts": 1716454223245011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189686, "dur": 12, "args": { "External id": 118224, "cbid": 211, "correlation": 118224 } }, { "ph": "s", "id": 118224, "pid": 76337, "tid": -914061504, "ts": 1716454223189686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223245020, "dur": 3, "args": { "External id": 118226, "device": 5, "context": 1, "stream": 7, "correlation": 118226, "bytes": 4800, "memory bandwidth (GB/s)": 1.499531396438613 } }, { "ph": "f", "id": 118226, "pid": 5, "tid": 7, "ts": 1716454223245020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223189704, "dur": 14, "args": { "External id": 118226, "cbid": 51, "correlation": 118226 } }, { "ph": "s", "id": 118226, "pid": 76337, "tid": -914061504, "ts": 1716454223189704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223245024, "dur": 95, "args": { "External id": 118227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118227, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 118227, "pid": 5, "tid": 7, "ts": 1716454223245024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189720, "dur": 6, "args": { "External id": 118227, "cbid": 211, "correlation": 118227 } }, { "ph": "s", "id": 118227, "pid": 76337, "tid": -914061504, "ts": 1716454223189720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223245121, "dur": 5, "args": { "External id": 118229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118229, "pid": 5, "tid": 7, "ts": 1716454223245121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189730, "dur": 5, "args": { "External id": 118229, "cbid": 211, "correlation": 118229 } }, { "ph": "s", "id": 118229, "pid": 76337, "tid": -914061504, "ts": 1716454223189730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223245127, "dur": 6, "args": { "External id": 118235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118235, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118235, "pid": 5, "tid": 7, "ts": 1716454223245127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189757, "dur": 8, "args": { "External id": 118235, "cbid": 211, "correlation": 118235 } }, { "ph": "s", "id": 118235, "pid": 76337, "tid": -914061504, "ts": 1716454223189757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223245135, "dur": 5, "args": { "External id": 118243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118243, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118243, "pid": 5, "tid": 7, "ts": 1716454223245135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189786, "dur": 8, "args": { "External id": 118243, "cbid": 211, "correlation": 118243 } }, { "ph": "s", "id": 118243, "pid": 76337, "tid": -914061504, "ts": 1716454223189786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223245142, "dur": 4, "args": { "External id": 118251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118251, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118251, "pid": 5, "tid": 7, "ts": 1716454223245142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189814, "dur": 8, "args": { "External id": 118251, "cbid": 211, "correlation": 118251 } }, { "ph": "s", "id": 118251, "pid": 76337, "tid": -914061504, "ts": 1716454223189814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223245147, "dur": 11, "args": { "External id": 118260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118260, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118260, "pid": 5, "tid": 7, "ts": 1716454223245147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189858, "dur": 10, "args": { "External id": 118260, "cbid": 211, "correlation": 118260 } }, { "ph": "s", "id": 118260, "pid": 76337, "tid": -914061504, "ts": 1716454223189858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223245160, "dur": 12, "args": { "External id": 118280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118280, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118280, "pid": 5, "tid": 7, "ts": 1716454223245160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189928, "dur": 12, "args": { "External id": 118280, "cbid": 211, "correlation": 118280 } }, { "ph": "s", "id": 118280, "pid": 76337, "tid": -914061504, "ts": 1716454223189928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223245173, "dur": 4, "args": { "External id": 118292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118292, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118292, "pid": 5, "tid": 7, "ts": 1716454223245173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189950, "dur": 6, "args": { "External id": 118292, "cbid": 211, "correlation": 118292 } }, { "ph": "s", "id": 118292, "pid": 76337, "tid": -914061504, "ts": 1716454223189950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223245179, "dur": 10, "args": { "External id": 118295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118295, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118295, "pid": 5, "tid": 7, "ts": 1716454223245179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223189967, "dur": 15, "args": { "External id": 118295, "cbid": 211, "correlation": 118295 } }, { "ph": "s", "id": 118295, "pid": 76337, "tid": -914061504, "ts": 1716454223189967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223245190, "dur": 7, "args": { "External id": 118304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118304, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118304, "pid": 5, "tid": 7, "ts": 1716454223245190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190016, "dur": 10, "args": { "External id": 118304, "cbid": 211, "correlation": 118304 } }, { "ph": "s", "id": 118304, "pid": 76337, "tid": -914061504, "ts": 1716454223190016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223190070, "dur": 0, "args": { "External id": 118314, "cbid": 317, "correlation": 118314 } }, { "ph": "f", "id": 118314, "pid": 76337, "tid": -914061504, "ts": 1716454223190070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223190070, "dur": 0, "args": { "External id": 118315, "cbid": 203, "correlation": 118315 } }, { "ph": "f", "id": 118315, "pid": 76337, "tid": -914061504, "ts": 1716454223190070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223190071, "dur": 0, "args": { "External id": 118316, "cbid": 205, "correlation": 118316 } }, { "ph": "f", "id": 118316, "pid": 76337, "tid": -914061504, "ts": 1716454223190071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223245198, "dur": 6, "args": { "External id": 118320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118320, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118320, "pid": 5, "tid": 7, "ts": 1716454223245198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190084, "dur": 12, "args": { "External id": 118320, "cbid": 211, "correlation": 118320 } }, { "ph": "s", "id": 118320, "pid": 76337, "tid": -914061504, "ts": 1716454223190084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223245206, "dur": 320, "args": { "External id": 118322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118322, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118322, "pid": 5, "tid": 7, "ts": 1716454223245206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190099, "dur": 5, "args": { "External id": 118322, "cbid": 211, "correlation": 118322 } }, { "ph": "s", "id": 118322, "pid": 76337, "tid": -914061504, "ts": 1716454223190099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223245528, "dur": 1, "args": { "External id": 118324, "device": 5, "context": 1, "stream": 7, "correlation": 118324, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 118324, "pid": 5, "tid": 7, "ts": 1716454223245528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223190110, "dur": 6, "args": { "External id": 118324, "cbid": 51, "correlation": 118324 } }, { "ph": "s", "id": 118324, "pid": 76337, "tid": -914061504, "ts": 1716454223190110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223245532, "dur": 497, "args": { "External id": 118325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118325, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118325, "pid": 5, "tid": 7, "ts": 1716454223245532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190118, "dur": 6, "args": { "External id": 118325, "cbid": 211, "correlation": 118325 } }, { "ph": "s", "id": 118325, "pid": 76337, "tid": -914061504, "ts": 1716454223190118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246031, "dur": 5, "args": { "External id": 118327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118327, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118327, "pid": 5, "tid": 7, "ts": 1716454223246031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190128, "dur": 5, "args": { "External id": 118327, "cbid": 211, "correlation": 118327 } }, { "ph": "s", "id": 118327, "pid": 76337, "tid": -914061504, "ts": 1716454223190128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246037, "dur": 7, "args": { "External id": 118333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118333, "pid": 5, "tid": 7, "ts": 1716454223246037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190157, "dur": 9, "args": { "External id": 118333, "cbid": 211, "correlation": 118333 } }, { "ph": "s", "id": 118333, "pid": 76337, "tid": -914061504, "ts": 1716454223190157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223246045, "dur": 4, "args": { "External id": 118341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118341, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 118341, "pid": 5, "tid": 7, "ts": 1716454223246045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190201, "dur": 9, "args": { "External id": 118341, "cbid": 211, "correlation": 118341 } }, { "ph": "s", "id": 118341, "pid": 76337, "tid": -914061504, "ts": 1716454223190201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223190262, "dur": 1, "args": { "External id": 118357, "cbid": 251, "correlation": 118357 } }, { "ph": "f", "id": 118357, "pid": 76337, "tid": -914061504, "ts": 1716454223190262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223190268, "dur": 0, "args": { "External id": 118359, "cbid": 251, "correlation": 118359 } }, { "ph": "f", "id": 118359, "pid": 76337, "tid": -914061504, "ts": 1716454223190268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223246050, "dur": 13, "args": { "External id": 118360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118360, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118360, "pid": 5, "tid": 7, "ts": 1716454223246050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190270, "dur": 11, "args": { "External id": 118360, "cbid": 211, "correlation": 118360 } }, { "ph": "s", "id": 118360, "pid": 76337, "tid": -914061504, "ts": 1716454223190270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223246065, "dur": 5, "args": { "External id": 118362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118362, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118362, "pid": 5, "tid": 7, "ts": 1716454223246065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190282, "dur": 5, "args": { "External id": 118362, "cbid": 211, "correlation": 118362 } }, { "ph": "s", "id": 118362, "pid": 76337, "tid": -914061504, "ts": 1716454223190282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246071, "dur": 6, "args": { "External id": 118372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118372, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118372, "pid": 5, "tid": 7, "ts": 1716454223246071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190338, "dur": 13, "args": { "External id": 118372, "cbid": 211, "correlation": 118372 } }, { "ph": "s", "id": 118372, "pid": 76337, "tid": -914061504, "ts": 1716454223190338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223246078, "dur": 10, "args": { "External id": 118392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118392, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118392, "pid": 5, "tid": 7, "ts": 1716454223246078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190406, "dur": 11, "args": { "External id": 118392, "cbid": 211, "correlation": 118392 } }, { "ph": "s", "id": 118392, "pid": 76337, "tid": -914061504, "ts": 1716454223190406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223246089, "dur": 4, "args": { "External id": 118404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118404, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 118404, "pid": 5, "tid": 7, "ts": 1716454223246089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190426, "dur": 6, "args": { "External id": 118404, "cbid": 211, "correlation": 118404 } }, { "ph": "s", "id": 118404, "pid": 76337, "tid": -914061504, "ts": 1716454223190426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246095, "dur": 7, "args": { "External id": 118407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118407, "pid": 5, "tid": 7, "ts": 1716454223246095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190445, "dur": 7, "args": { "External id": 118407, "cbid": 211, "correlation": 118407 } }, { "ph": "s", "id": 118407, "pid": 76337, "tid": -914061504, "ts": 1716454223190445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223246103, "dur": 5, "args": { "External id": 118416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118416, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118416, "pid": 5, "tid": 7, "ts": 1716454223246103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190484, "dur": 10, "args": { "External id": 118416, "cbid": 211, "correlation": 118416 } }, { "ph": "s", "id": 118416, "pid": 76337, "tid": -914061504, "ts": 1716454223190484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223190547, "dur": 0, "args": { "External id": 118426, "cbid": 317, "correlation": 118426 } }, { "ph": "f", "id": 118426, "pid": 76337, "tid": -914061504, "ts": 1716454223190547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223190548, "dur": 0, "args": { "External id": 118427, "cbid": 203, "correlation": 118427 } }, { "ph": "f", "id": 118427, "pid": 76337, "tid": -914061504, "ts": 1716454223190548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223190549, "dur": 0, "args": { "External id": 118428, "cbid": 205, "correlation": 118428 } }, { "ph": "f", "id": 118428, "pid": 76337, "tid": -914061504, "ts": 1716454223190549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246109, "dur": 5, "args": { "External id": 118432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118432, "pid": 5, "tid": 7, "ts": 1716454223246109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190563, "dur": 12, "args": { "External id": 118432, "cbid": 211, "correlation": 118432 } }, { "ph": "s", "id": 118432, "pid": 76337, "tid": -914061504, "ts": 1716454223190563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246115, "dur": 161, "args": { "External id": 118434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118434, "pid": 5, "tid": 7, "ts": 1716454223246115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190578, "dur": 5, "args": { "External id": 118434, "cbid": 211, "correlation": 118434 } }, { "ph": "s", "id": 118434, "pid": 76337, "tid": -914061504, "ts": 1716454223190578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223246279, "dur": 1, "args": { "External id": 118436, "device": 5, "context": 1, "stream": 7, "correlation": 118436, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 118436, "pid": 5, "tid": 7, "ts": 1716454223246279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223190589, "dur": 6, "args": { "External id": 118436, "cbid": 51, "correlation": 118436 } }, { "ph": "s", "id": 118436, "pid": 76337, "tid": -914061504, "ts": 1716454223190589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223246282, "dur": 259, "args": { "External id": 118437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118437, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118437, "pid": 5, "tid": 7, "ts": 1716454223246282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190596, "dur": 7, "args": { "External id": 118437, "cbid": 211, "correlation": 118437 } }, { "ph": "s", "id": 118437, "pid": 76337, "tid": -914061504, "ts": 1716454223190596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246542, "dur": 6, "args": { "External id": 118439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118439, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118439, "pid": 5, "tid": 7, "ts": 1716454223246542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190607, "dur": 5, "args": { "External id": 118439, "cbid": 211, "correlation": 118439 } }, { "ph": "s", "id": 118439, "pid": 76337, "tid": -914061504, "ts": 1716454223190607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246550, "dur": 6, "args": { "External id": 118445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118445, "pid": 5, "tid": 7, "ts": 1716454223246550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190635, "dur": 8, "args": { "External id": 118445, "cbid": 211, "correlation": 118445 } }, { "ph": "s", "id": 118445, "pid": 76337, "tid": -914061504, "ts": 1716454223190635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223190693, "dur": 0, "args": { "External id": 118455, "cbid": 317, "correlation": 118455 } }, { "ph": "f", "id": 118455, "pid": 76337, "tid": -914061504, "ts": 1716454223190693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223190694, "dur": 0, "args": { "External id": 118456, "cbid": 203, "correlation": 118456 } }, { "ph": "f", "id": 118456, "pid": 76337, "tid": -914061504, "ts": 1716454223190694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223190694, "dur": 0, "args": { "External id": 118457, "cbid": 205, "correlation": 118457 } }, { "ph": "f", "id": 118457, "pid": 76337, "tid": -914061504, "ts": 1716454223190694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246557, "dur": 8, "args": { "External id": 118461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118461, "pid": 5, "tid": 7, "ts": 1716454223246557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190707, "dur": 12, "args": { "External id": 118461, "cbid": 211, "correlation": 118461 } }, { "ph": "s", "id": 118461, "pid": 76337, "tid": -914061504, "ts": 1716454223190707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223246566, "dur": 3, "args": { "External id": 118463, "device": 5, "context": 1, "stream": 7, "correlation": 118463, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 118463, "pid": 5, "tid": 7, "ts": 1716454223246566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223190724, "dur": 9, "args": { "External id": 118463, "cbid": 51, "correlation": 118463 } }, { "ph": "s", "id": 118463, "pid": 76337, "tid": -914061504, "ts": 1716454223190724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223246570, "dur": 95, "args": { "External id": 118464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118464, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 118464, "pid": 5, "tid": 7, "ts": 1716454223246570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190734, "dur": 5, "args": { "External id": 118464, "cbid": 211, "correlation": 118464 } }, { "ph": "s", "id": 118464, "pid": 76337, "tid": -914061504, "ts": 1716454223190734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246666, "dur": 5, "args": { "External id": 118466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118466, "pid": 5, "tid": 7, "ts": 1716454223246666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190743, "dur": 5, "args": { "External id": 118466, "cbid": 211, "correlation": 118466 } }, { "ph": "s", "id": 118466, "pid": 76337, "tid": -914061504, "ts": 1716454223190743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246673, "dur": 6, "args": { "External id": 118472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118472, "pid": 5, "tid": 7, "ts": 1716454223246673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190769, "dur": 8, "args": { "External id": 118472, "cbid": 211, "correlation": 118472 } }, { "ph": "s", "id": 118472, "pid": 76337, "tid": -914061504, "ts": 1716454223190769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223246681, "dur": 5, "args": { "External id": 118480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118480, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118480, "pid": 5, "tid": 7, "ts": 1716454223246681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190798, "dur": 9, "args": { "External id": 118480, "cbid": 211, "correlation": 118480 } }, { "ph": "s", "id": 118480, "pid": 76337, "tid": -914061504, "ts": 1716454223190798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223246687, "dur": 4, "args": { "External id": 118488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118488, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118488, "pid": 5, "tid": 7, "ts": 1716454223246687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190827, "dur": 8, "args": { "External id": 118488, "cbid": 211, "correlation": 118488 } }, { "ph": "s", "id": 118488, "pid": 76337, "tid": -914061504, "ts": 1716454223190827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223246693, "dur": 11, "args": { "External id": 118497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118497, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118497, "pid": 5, "tid": 7, "ts": 1716454223246693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190871, "dur": 10, "args": { "External id": 118497, "cbid": 211, "correlation": 118497 } }, { "ph": "s", "id": 118497, "pid": 76337, "tid": -914061504, "ts": 1716454223190871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223246705, "dur": 13, "args": { "External id": 118517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118517, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118517, "pid": 5, "tid": 7, "ts": 1716454223246705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190940, "dur": 11, "args": { "External id": 118517, "cbid": 211, "correlation": 118517 } }, { "ph": "s", "id": 118517, "pid": 76337, "tid": -914061504, "ts": 1716454223190940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223246719, "dur": 4, "args": { "External id": 118529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118529, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118529, "pid": 5, "tid": 7, "ts": 1716454223246719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190962, "dur": 6, "args": { "External id": 118529, "cbid": 211, "correlation": 118529 } }, { "ph": "s", "id": 118529, "pid": 76337, "tid": -914061504, "ts": 1716454223190962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223246724, "dur": 10, "args": { "External id": 118532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118532, "pid": 5, "tid": 7, "ts": 1716454223246724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223190987, "dur": 7, "args": { "External id": 118532, "cbid": 211, "correlation": 118532 } }, { "ph": "s", "id": 118532, "pid": 76337, "tid": -914061504, "ts": 1716454223190987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223246736, "dur": 7, "args": { "External id": 118541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118541, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118541, "pid": 5, "tid": 7, "ts": 1716454223246736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191027, "dur": 10, "args": { "External id": 118541, "cbid": 211, "correlation": 118541 } }, { "ph": "s", "id": 118541, "pid": 76337, "tid": -914061504, "ts": 1716454223191027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223191079, "dur": 0, "args": { "External id": 118551, "cbid": 317, "correlation": 118551 } }, { "ph": "f", "id": 118551, "pid": 76337, "tid": -914061504, "ts": 1716454223191079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223191080, "dur": 0, "args": { "External id": 118552, "cbid": 203, "correlation": 118552 } }, { "ph": "f", "id": 118552, "pid": 76337, "tid": -914061504, "ts": 1716454223191080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223191081, "dur": 0, "args": { "External id": 118553, "cbid": 205, "correlation": 118553 } }, { "ph": "f", "id": 118553, "pid": 76337, "tid": -914061504, "ts": 1716454223191081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246744, "dur": 7, "args": { "External id": 118557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118557, "pid": 5, "tid": 7, "ts": 1716454223246744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191094, "dur": 11, "args": { "External id": 118557, "cbid": 211, "correlation": 118557 } }, { "ph": "s", "id": 118557, "pid": 76337, "tid": -914061504, "ts": 1716454223191094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223246751, "dur": 320, "args": { "External id": 118559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118559, "pid": 5, "tid": 7, "ts": 1716454223246751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191108, "dur": 6, "args": { "External id": 118559, "cbid": 211, "correlation": 118559 } }, { "ph": "s", "id": 118559, "pid": 76337, "tid": -914061504, "ts": 1716454223191108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223247073, "dur": 1, "args": { "External id": 118561, "device": 5, "context": 1, "stream": 7, "correlation": 118561, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 118561, "pid": 5, "tid": 7, "ts": 1716454223247073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223191119, "dur": 6, "args": { "External id": 118561, "cbid": 51, "correlation": 118561 } }, { "ph": "s", "id": 118561, "pid": 76337, "tid": -914061504, "ts": 1716454223191119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223247077, "dur": 495, "args": { "External id": 118562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118562, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118562, "pid": 5, "tid": 7, "ts": 1716454223247077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191126, "dur": 6, "args": { "External id": 118562, "cbid": 211, "correlation": 118562 } }, { "ph": "s", "id": 118562, "pid": 76337, "tid": -914061504, "ts": 1716454223191126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223247573, "dur": 5, "args": { "External id": 118564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118564, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118564, "pid": 5, "tid": 7, "ts": 1716454223247573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191136, "dur": 5, "args": { "External id": 118564, "cbid": 211, "correlation": 118564 } }, { "ph": "s", "id": 118564, "pid": 76337, "tid": -914061504, "ts": 1716454223191136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223247580, "dur": 6, "args": { "External id": 118570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118570, "pid": 5, "tid": 7, "ts": 1716454223247580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191165, "dur": 9, "args": { "External id": 118570, "cbid": 211, "correlation": 118570 } }, { "ph": "s", "id": 118570, "pid": 76337, "tid": -914061504, "ts": 1716454223191165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223247588, "dur": 4, "args": { "External id": 118578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118578, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 118578, "pid": 5, "tid": 7, "ts": 1716454223247588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191208, "dur": 10, "args": { "External id": 118578, "cbid": 211, "correlation": 118578 } }, { "ph": "s", "id": 118578, "pid": 76337, "tid": -914061504, "ts": 1716454223191208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223191270, "dur": 1, "args": { "External id": 118594, "cbid": 251, "correlation": 118594 } }, { "ph": "f", "id": 118594, "pid": 76337, "tid": -914061504, "ts": 1716454223191270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223191275, "dur": 0, "args": { "External id": 118596, "cbid": 251, "correlation": 118596 } }, { "ph": "f", "id": 118596, "pid": 76337, "tid": -914061504, "ts": 1716454223191275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223247593, "dur": 13, "args": { "External id": 118597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118597, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118597, "pid": 5, "tid": 7, "ts": 1716454223247593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191277, "dur": 11, "args": { "External id": 118597, "cbid": 211, "correlation": 118597 } }, { "ph": "s", "id": 118597, "pid": 76337, "tid": -914061504, "ts": 1716454223191277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223247607, "dur": 5, "args": { "External id": 118599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118599, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118599, "pid": 5, "tid": 7, "ts": 1716454223247607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191290, "dur": 5, "args": { "External id": 118599, "cbid": 211, "correlation": 118599 } }, { "ph": "s", "id": 118599, "pid": 76337, "tid": -914061504, "ts": 1716454223191290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223247613, "dur": 6, "args": { "External id": 118609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118609, "pid": 5, "tid": 7, "ts": 1716454223247613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191347, "dur": 13, "args": { "External id": 118609, "cbid": 211, "correlation": 118609 } }, { "ph": "s", "id": 118609, "pid": 76337, "tid": -914061504, "ts": 1716454223191347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223247620, "dur": 9, "args": { "External id": 118629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118629, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118629, "pid": 5, "tid": 7, "ts": 1716454223247620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191413, "dur": 11, "args": { "External id": 118629, "cbid": 211, "correlation": 118629 } }, { "ph": "s", "id": 118629, "pid": 76337, "tid": -914061504, "ts": 1716454223191413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223247631, "dur": 4, "args": { "External id": 118641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118641, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 118641, "pid": 5, "tid": 7, "ts": 1716454223247631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191433, "dur": 6, "args": { "External id": 118641, "cbid": 211, "correlation": 118641 } }, { "ph": "s", "id": 118641, "pid": 76337, "tid": -914061504, "ts": 1716454223191433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223247636, "dur": 7, "args": { "External id": 118644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118644, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118644, "pid": 5, "tid": 7, "ts": 1716454223247636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191452, "dur": 6, "args": { "External id": 118644, "cbid": 211, "correlation": 118644 } }, { "ph": "s", "id": 118644, "pid": 76337, "tid": -914061504, "ts": 1716454223191452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223247644, "dur": 5, "args": { "External id": 118653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118653, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118653, "pid": 5, "tid": 7, "ts": 1716454223247644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191492, "dur": 10, "args": { "External id": 118653, "cbid": 211, "correlation": 118653 } }, { "ph": "s", "id": 118653, "pid": 76337, "tid": -914061504, "ts": 1716454223191492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223191555, "dur": 0, "args": { "External id": 118663, "cbid": 317, "correlation": 118663 } }, { "ph": "f", "id": 118663, "pid": 76337, "tid": -914061504, "ts": 1716454223191555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223191556, "dur": 0, "args": { "External id": 118664, "cbid": 203, "correlation": 118664 } }, { "ph": "f", "id": 118664, "pid": 76337, "tid": -914061504, "ts": 1716454223191556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223191556, "dur": 0, "args": { "External id": 118665, "cbid": 205, "correlation": 118665 } }, { "ph": "f", "id": 118665, "pid": 76337, "tid": -914061504, "ts": 1716454223191556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223247650, "dur": 5, "args": { "External id": 118669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118669, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118669, "pid": 5, "tid": 7, "ts": 1716454223247650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191571, "dur": 12, "args": { "External id": 118669, "cbid": 211, "correlation": 118669 } }, { "ph": "s", "id": 118669, "pid": 76337, "tid": -914061504, "ts": 1716454223191571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223247656, "dur": 162, "args": { "External id": 118671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118671, "pid": 5, "tid": 7, "ts": 1716454223247656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191585, "dur": 5, "args": { "External id": 118671, "cbid": 211, "correlation": 118671 } }, { "ph": "s", "id": 118671, "pid": 76337, "tid": -914061504, "ts": 1716454223191585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223247821, "dur": 1, "args": { "External id": 118673, "device": 5, "context": 1, "stream": 7, "correlation": 118673, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 118673, "pid": 5, "tid": 7, "ts": 1716454223247821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223191596, "dur": 8, "args": { "External id": 118673, "cbid": 51, "correlation": 118673 } }, { "ph": "s", "id": 118673, "pid": 76337, "tid": -914061504, "ts": 1716454223191596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223247824, "dur": 258, "args": { "External id": 118674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118674, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118674, "pid": 5, "tid": 7, "ts": 1716454223247824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191605, "dur": 6, "args": { "External id": 118674, "cbid": 211, "correlation": 118674 } }, { "ph": "s", "id": 118674, "pid": 76337, "tid": -914061504, "ts": 1716454223191605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223248084, "dur": 6, "args": { "External id": 118676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118676, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118676, "pid": 5, "tid": 7, "ts": 1716454223248084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191615, "dur": 5, "args": { "External id": 118676, "cbid": 211, "correlation": 118676 } }, { "ph": "s", "id": 118676, "pid": 76337, "tid": -914061504, "ts": 1716454223191615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223248091, "dur": 6, "args": { "External id": 118682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118682, "pid": 5, "tid": 7, "ts": 1716454223248091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191644, "dur": 8, "args": { "External id": 118682, "cbid": 211, "correlation": 118682 } }, { "ph": "s", "id": 118682, "pid": 76337, "tid": -914061504, "ts": 1716454223191644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223191702, "dur": 0, "args": { "External id": 118692, "cbid": 317, "correlation": 118692 } }, { "ph": "f", "id": 118692, "pid": 76337, "tid": -914061504, "ts": 1716454223191702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223191703, "dur": 0, "args": { "External id": 118693, "cbid": 203, "correlation": 118693 } }, { "ph": "f", "id": 118693, "pid": 76337, "tid": -914061504, "ts": 1716454223191703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223191704, "dur": 0, "args": { "External id": 118694, "cbid": 205, "correlation": 118694 } }, { "ph": "f", "id": 118694, "pid": 76337, "tid": -914061504, "ts": 1716454223191704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223248099, "dur": 8, "args": { "External id": 118698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118698, "pid": 5, "tid": 7, "ts": 1716454223248099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191716, "dur": 11, "args": { "External id": 118698, "cbid": 211, "correlation": 118698 } }, { "ph": "s", "id": 118698, "pid": 76337, "tid": -914061504, "ts": 1716454223191716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223248108, "dur": 3, "args": { "External id": 118700, "device": 5, "context": 1, "stream": 7, "correlation": 118700, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 118700, "pid": 5, "tid": 7, "ts": 1716454223248108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223191732, "dur": 9, "args": { "External id": 118700, "cbid": 51, "correlation": 118700 } }, { "ph": "s", "id": 118700, "pid": 76337, "tid": -914061504, "ts": 1716454223191732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223248112, "dur": 94, "args": { "External id": 118701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118701, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 118701, "pid": 5, "tid": 7, "ts": 1716454223248112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191742, "dur": 6, "args": { "External id": 118701, "cbid": 211, "correlation": 118701 } }, { "ph": "s", "id": 118701, "pid": 76337, "tid": -914061504, "ts": 1716454223191742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223248207, "dur": 6, "args": { "External id": 118703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118703, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118703, "pid": 5, "tid": 7, "ts": 1716454223248207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191751, "dur": 5, "args": { "External id": 118703, "cbid": 211, "correlation": 118703 } }, { "ph": "s", "id": 118703, "pid": 76337, "tid": -914061504, "ts": 1716454223191751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223248214, "dur": 6, "args": { "External id": 118709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118709, "pid": 5, "tid": 7, "ts": 1716454223248214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191778, "dur": 8, "args": { "External id": 118709, "cbid": 211, "correlation": 118709 } }, { "ph": "s", "id": 118709, "pid": 76337, "tid": -914061504, "ts": 1716454223191778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223248222, "dur": 5, "args": { "External id": 118717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118717, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118717, "pid": 5, "tid": 7, "ts": 1716454223248222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191807, "dur": 8, "args": { "External id": 118717, "cbid": 211, "correlation": 118717 } }, { "ph": "s", "id": 118717, "pid": 76337, "tid": -914061504, "ts": 1716454223191807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223248228, "dur": 4, "args": { "External id": 118725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118725, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 118725, "pid": 5, "tid": 7, "ts": 1716454223248228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191836, "dur": 8, "args": { "External id": 118725, "cbid": 211, "correlation": 118725 } }, { "ph": "s", "id": 118725, "pid": 76337, "tid": -914061504, "ts": 1716454223191836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223248234, "dur": 14, "args": { "External id": 118736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118736, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118736, "pid": 5, "tid": 7, "ts": 1716454223248234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191917, "dur": 13, "args": { "External id": 118736, "cbid": 211, "correlation": 118736 } }, { "ph": "s", "id": 118736, "pid": 76337, "tid": -914061504, "ts": 1716454223191917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223191982, "dur": 0, "args": { "External id": 118746, "cbid": 317, "correlation": 118746 } }, { "ph": "f", "id": 118746, "pid": 76337, "tid": -914061504, "ts": 1716454223191982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223191983, "dur": 0, "args": { "External id": 118747, "cbid": 203, "correlation": 118747 } }, { "ph": "f", "id": 118747, "pid": 76337, "tid": -914061504, "ts": 1716454223191983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223191984, "dur": 0, "args": { "External id": 118748, "cbid": 205, "correlation": 118748 } }, { "ph": "f", "id": 118748, "pid": 76337, "tid": -914061504, "ts": 1716454223191984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223248249, "dur": 9, "args": { "External id": 118752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118752, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118752, "pid": 5, "tid": 7, "ts": 1716454223248249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223191997, "dur": 12, "args": { "External id": 118752, "cbid": 211, "correlation": 118752 } }, { "ph": "s", "id": 118752, "pid": 76337, "tid": -914061504, "ts": 1716454223191997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223248260, "dur": 163, "args": { "External id": 118754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118754, "pid": 5, "tid": 7, "ts": 1716454223248260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192011, "dur": 5, "args": { "External id": 118754, "cbid": 211, "correlation": 118754 } }, { "ph": "s", "id": 118754, "pid": 76337, "tid": -914061504, "ts": 1716454223192011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223248425, "dur": 1, "args": { "External id": 118756, "device": 5, "context": 1, "stream": 7, "correlation": 118756, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 118756, "pid": 5, "tid": 7, "ts": 1716454223248425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223192022, "dur": 6, "args": { "External id": 118756, "cbid": 51, "correlation": 118756 } }, { "ph": "s", "id": 118756, "pid": 76337, "tid": -914061504, "ts": 1716454223192022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223248429, "dur": 649, "args": { "External id": 118757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118757, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118757, "pid": 5, "tid": 7, "ts": 1716454223248429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192029, "dur": 6, "args": { "External id": 118757, "cbid": 211, "correlation": 118757 } }, { "ph": "s", "id": 118757, "pid": 76337, "tid": -914061504, "ts": 1716454223192029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223249079, "dur": 13, "args": { "External id": 118759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118759, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118759, "pid": 5, "tid": 7, "ts": 1716454223249079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192039, "dur": 5, "args": { "External id": 118759, "cbid": 211, "correlation": 118759 } }, { "ph": "s", "id": 118759, "pid": 76337, "tid": -914061504, "ts": 1716454223192039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223249093, "dur": 15, "args": { "External id": 118765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118765, "pid": 5, "tid": 7, "ts": 1716454223249093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192068, "dur": 9, "args": { "External id": 118765, "cbid": 211, "correlation": 118765 } }, { "ph": "s", "id": 118765, "pid": 76337, "tid": -914061504, "ts": 1716454223192068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223249109, "dur": 30, "args": { "External id": 118774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118774, "pid": 5, "tid": 7, "ts": 1716454223249109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192158, "dur": 13, "args": { "External id": 118774, "cbid": 211, "correlation": 118774 } }, { "ph": "s", "id": 118774, "pid": 76337, "tid": -914061504, "ts": 1716454223192158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223249140, "dur": 30, "args": { "External id": 118794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118794, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118794, "pid": 5, "tid": 7, "ts": 1716454223249140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192228, "dur": 11, "args": { "External id": 118794, "cbid": 211, "correlation": 118794 } }, { "ph": "s", "id": 118794, "pid": 76337, "tid": -914061504, "ts": 1716454223192228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223249172, "dur": 4, "args": { "External id": 118806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118806, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118806, "pid": 5, "tid": 7, "ts": 1716454223249172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192248, "dur": 6, "args": { "External id": 118806, "cbid": 211, "correlation": 118806 } }, { "ph": "s", "id": 118806, "pid": 76337, "tid": -914061504, "ts": 1716454223192248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223249177, "dur": 31, "args": { "External id": 118809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118809, "pid": 5, "tid": 7, "ts": 1716454223249177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192266, "dur": 7, "args": { "External id": 118809, "cbid": 211, "correlation": 118809 } }, { "ph": "s", "id": 118809, "pid": 76337, "tid": -914061504, "ts": 1716454223192266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223249209, "dur": 22, "args": { "External id": 118818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118818, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118818, "pid": 5, "tid": 7, "ts": 1716454223249209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192305, "dur": 10, "args": { "External id": 118818, "cbid": 211, "correlation": 118818 } }, { "ph": "s", "id": 118818, "pid": 76337, "tid": -914061504, "ts": 1716454223192305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223192356, "dur": 0, "args": { "External id": 118828, "cbid": 317, "correlation": 118828 } }, { "ph": "f", "id": 118828, "pid": 76337, "tid": -914061504, "ts": 1716454223192356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223192357, "dur": 0, "args": { "External id": 118829, "cbid": 203, "correlation": 118829 } }, { "ph": "f", "id": 118829, "pid": 76337, "tid": -914061504, "ts": 1716454223192357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223192358, "dur": 0, "args": { "External id": 118830, "cbid": 205, "correlation": 118830 } }, { "ph": "f", "id": 118830, "pid": 76337, "tid": -914061504, "ts": 1716454223192358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223249232, "dur": 21, "args": { "External id": 118834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118834, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118834, "pid": 5, "tid": 7, "ts": 1716454223249232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192373, "dur": 11, "args": { "External id": 118834, "cbid": 211, "correlation": 118834 } }, { "ph": "s", "id": 118834, "pid": 76337, "tid": -914061504, "ts": 1716454223192373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223249255, "dur": 320, "args": { "External id": 118836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118836, "pid": 5, "tid": 7, "ts": 1716454223249255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192387, "dur": 5, "args": { "External id": 118836, "cbid": 211, "correlation": 118836 } }, { "ph": "s", "id": 118836, "pid": 76337, "tid": -914061504, "ts": 1716454223192387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223249577, "dur": 1, "args": { "External id": 118838, "device": 5, "context": 1, "stream": 7, "correlation": 118838, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 118838, "pid": 5, "tid": 7, "ts": 1716454223249577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223192398, "dur": 6, "args": { "External id": 118838, "cbid": 51, "correlation": 118838 } }, { "ph": "s", "id": 118838, "pid": 76337, "tid": -914061504, "ts": 1716454223192398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223249581, "dur": 1236, "args": { "External id": 118839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118839, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118839, "pid": 5, "tid": 7, "ts": 1716454223249581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192405, "dur": 6, "args": { "External id": 118839, "cbid": 211, "correlation": 118839 } }, { "ph": "s", "id": 118839, "pid": 76337, "tid": -914061504, "ts": 1716454223192405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223250819, "dur": 12, "args": { "External id": 118841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118841, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118841, "pid": 5, "tid": 7, "ts": 1716454223250819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192417, "dur": 6, "args": { "External id": 118841, "cbid": 211, "correlation": 118841 } }, { "ph": "s", "id": 118841, "pid": 76337, "tid": -914061504, "ts": 1716454223192417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223250832, "dur": 15, "args": { "External id": 118847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118847, "pid": 5, "tid": 7, "ts": 1716454223250832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192445, "dur": 8, "args": { "External id": 118847, "cbid": 211, "correlation": 118847 } }, { "ph": "s", "id": 118847, "pid": 76337, "tid": -914061504, "ts": 1716454223192445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223250849, "dur": 4, "args": { "External id": 118855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118855, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 118855, "pid": 5, "tid": 7, "ts": 1716454223250849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192490, "dur": 9, "args": { "External id": 118855, "cbid": 211, "correlation": 118855 } }, { "ph": "s", "id": 118855, "pid": 76337, "tid": -914061504, "ts": 1716454223192490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223192554, "dur": 1, "args": { "External id": 118871, "cbid": 251, "correlation": 118871 } }, { "ph": "f", "id": 118871, "pid": 76337, "tid": -914061504, "ts": 1716454223192554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223192559, "dur": 0, "args": { "External id": 118873, "cbid": 251, "correlation": 118873 } }, { "ph": "f", "id": 118873, "pid": 76337, "tid": -914061504, "ts": 1716454223192559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223250854, "dur": 12, "args": { "External id": 118874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118874, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118874, "pid": 5, "tid": 7, "ts": 1716454223250854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192560, "dur": 12, "args": { "External id": 118874, "cbid": 211, "correlation": 118874 } }, { "ph": "s", "id": 118874, "pid": 76337, "tid": -914061504, "ts": 1716454223192560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223250868, "dur": 5, "args": { "External id": 118876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118876, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118876, "pid": 5, "tid": 7, "ts": 1716454223250868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192574, "dur": 5, "args": { "External id": 118876, "cbid": 211, "correlation": 118876 } }, { "ph": "s", "id": 118876, "pid": 76337, "tid": -914061504, "ts": 1716454223192574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223250874, "dur": 16, "args": { "External id": 118886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118886, "pid": 5, "tid": 7, "ts": 1716454223250874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192630, "dur": 12, "args": { "External id": 118886, "cbid": 211, "correlation": 118886 } }, { "ph": "s", "id": 118886, "pid": 76337, "tid": -914061504, "ts": 1716454223192630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223250892, "dur": 17, "args": { "External id": 118906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118906, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 118906, "pid": 5, "tid": 7, "ts": 1716454223250892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192697, "dur": 10, "args": { "External id": 118906, "cbid": 211, "correlation": 118906 } }, { "ph": "s", "id": 118906, "pid": 76337, "tid": -914061504, "ts": 1716454223192697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223250911, "dur": 4, "args": { "External id": 118918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118918, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 118918, "pid": 5, "tid": 7, "ts": 1716454223250911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192717, "dur": 7, "args": { "External id": 118918, "cbid": 211, "correlation": 118918 } }, { "ph": "s", "id": 118918, "pid": 76337, "tid": -914061504, "ts": 1716454223192717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223250916, "dur": 17, "args": { "External id": 118921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118921, "pid": 5, "tid": 7, "ts": 1716454223250916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192736, "dur": 6, "args": { "External id": 118921, "cbid": 211, "correlation": 118921 } }, { "ph": "s", "id": 118921, "pid": 76337, "tid": -914061504, "ts": 1716454223192736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223250934, "dur": 11, "args": { "External id": 118930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118930, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118930, "pid": 5, "tid": 7, "ts": 1716454223250934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192776, "dur": 9, "args": { "External id": 118930, "cbid": 211, "correlation": 118930 } }, { "ph": "s", "id": 118930, "pid": 76337, "tid": -914061504, "ts": 1716454223192776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223192838, "dur": 0, "args": { "External id": 118940, "cbid": 317, "correlation": 118940 } }, { "ph": "f", "id": 118940, "pid": 76337, "tid": -914061504, "ts": 1716454223192838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223192839, "dur": 0, "args": { "External id": 118941, "cbid": 203, "correlation": 118941 } }, { "ph": "f", "id": 118941, "pid": 76337, "tid": -914061504, "ts": 1716454223192839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223192840, "dur": 0, "args": { "External id": 118942, "cbid": 205, "correlation": 118942 } }, { "ph": "f", "id": 118942, "pid": 76337, "tid": -914061504, "ts": 1716454223192840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223250946, "dur": 11, "args": { "External id": 118946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118946, "pid": 5, "tid": 7, "ts": 1716454223250946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192853, "dur": 12, "args": { "External id": 118946, "cbid": 211, "correlation": 118946 } }, { "ph": "s", "id": 118946, "pid": 76337, "tid": -914061504, "ts": 1716454223192853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223250958, "dur": 162, "args": { "External id": 118948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118948, "pid": 5, "tid": 7, "ts": 1716454223250958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192867, "dur": 5, "args": { "External id": 118948, "cbid": 211, "correlation": 118948 } }, { "ph": "s", "id": 118948, "pid": 76337, "tid": -914061504, "ts": 1716454223192867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223251123, "dur": 1, "args": { "External id": 118950, "device": 5, "context": 1, "stream": 7, "correlation": 118950, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 118950, "pid": 5, "tid": 7, "ts": 1716454223251123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223192879, "dur": 6, "args": { "External id": 118950, "cbid": 51, "correlation": 118950 } }, { "ph": "s", "id": 118950, "pid": 76337, "tid": -914061504, "ts": 1716454223192879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223251127, "dur": 648, "args": { "External id": 118951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118951, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 118951, "pid": 5, "tid": 7, "ts": 1716454223251127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192886, "dur": 6, "args": { "External id": 118951, "cbid": 211, "correlation": 118951 } }, { "ph": "s", "id": 118951, "pid": 76337, "tid": -914061504, "ts": 1716454223192886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223251776, "dur": 12, "args": { "External id": 118953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118953, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118953, "pid": 5, "tid": 7, "ts": 1716454223251776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192895, "dur": 5, "args": { "External id": 118953, "cbid": 211, "correlation": 118953 } }, { "ph": "s", "id": 118953, "pid": 76337, "tid": -914061504, "ts": 1716454223192895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223251789, "dur": 15, "args": { "External id": 118959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118959, "pid": 5, "tid": 7, "ts": 1716454223251789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223192923, "dur": 9, "args": { "External id": 118959, "cbid": 211, "correlation": 118959 } }, { "ph": "s", "id": 118959, "pid": 76337, "tid": -914061504, "ts": 1716454223192923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223192989, "dur": 0, "args": { "External id": 118969, "cbid": 317, "correlation": 118969 } }, { "ph": "f", "id": 118969, "pid": 76337, "tid": -914061504, "ts": 1716454223192989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223192990, "dur": 0, "args": { "External id": 118970, "cbid": 203, "correlation": 118970 } }, { "ph": "f", "id": 118970, "pid": 76337, "tid": -914061504, "ts": 1716454223192990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223192991, "dur": 0, "args": { "External id": 118971, "cbid": 205, "correlation": 118971 } }, { "ph": "f", "id": 118971, "pid": 76337, "tid": -914061504, "ts": 1716454223192991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223251805, "dur": 21, "args": { "External id": 118975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118975, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118975, "pid": 5, "tid": 7, "ts": 1716454223251805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193004, "dur": 12, "args": { "External id": 118975, "cbid": 211, "correlation": 118975 } }, { "ph": "s", "id": 118975, "pid": 76337, "tid": -914061504, "ts": 1716454223193004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223251827, "dur": 4, "args": { "External id": 118977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 118977, "pid": 5, "tid": 7, "ts": 1716454223251827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193022, "dur": 7, "args": { "External id": 118977, "cbid": 211, "correlation": 118977 } }, { "ph": "s", "id": 118977, "pid": 76337, "tid": -914061504, "ts": 1716454223193022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223193032, "dur": 0, "args": { "External id": 118978, "cbid": 51, "correlation": 118978 } }, { "ph": "s", "id": 118978, "pid": 76337, "tid": -914061504, "ts": 1716454223193032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223251832, "dur": 174, "args": { "External id": 118979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118979, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 118979, "pid": 5, "tid": 7, "ts": 1716454223251832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193032, "dur": 5, "args": { "External id": 118979, "cbid": 211, "correlation": 118979 } }, { "ph": "s", "id": 118979, "pid": 76337, "tid": -914061504, "ts": 1716454223193032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223252007, "dur": 16, "args": { "External id": 118984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118984, "pid": 5, "tid": 7, "ts": 1716454223252007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193058, "dur": 8, "args": { "External id": 118984, "cbid": 211, "correlation": 118984 } }, { "ph": "s", "id": 118984, "pid": 76337, "tid": -914061504, "ts": 1716454223193058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223252025, "dur": 11, "args": { "External id": 118992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 118992, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 118992, "pid": 5, "tid": 7, "ts": 1716454223252025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193087, "dur": 8, "args": { "External id": 118992, "cbid": 211, "correlation": 118992 } }, { "ph": "s", "id": 118992, "pid": 76337, "tid": -914061504, "ts": 1716454223193087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223252037, "dur": 10, "args": { "External id": 119000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119000, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119000, "pid": 5, "tid": 7, "ts": 1716454223252037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193116, "dur": 8, "args": { "External id": 119000, "cbid": 211, "correlation": 119000 } }, { "ph": "s", "id": 119000, "pid": 76337, "tid": -914061504, "ts": 1716454223193116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223252049, "dur": 19, "args": { "External id": 119020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119020, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 119020, "pid": 5, "tid": 7, "ts": 1716454223252049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193198, "dur": 12, "args": { "External id": 119020, "cbid": 211, "correlation": 119020 } }, { "ph": "s", "id": 119020, "pid": 76337, "tid": -914061504, "ts": 1716454223193198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223252070, "dur": 5, "args": { "External id": 119032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119032, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 119032, "pid": 5, "tid": 7, "ts": 1716454223252070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193220, "dur": 6, "args": { "External id": 119032, "cbid": 211, "correlation": 119032 } }, { "ph": "s", "id": 119032, "pid": 76337, "tid": -914061504, "ts": 1716454223193220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223252076, "dur": 17, "args": { "External id": 119035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119035, "pid": 5, "tid": 7, "ts": 1716454223252076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193238, "dur": 6, "args": { "External id": 119035, "cbid": 211, "correlation": 119035 } }, { "ph": "s", "id": 119035, "pid": 76337, "tid": -914061504, "ts": 1716454223193238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223193295, "dur": 0, "args": { "External id": 119046, "cbid": 317, "correlation": 119046 } }, { "ph": "f", "id": 119046, "pid": 76337, "tid": -914061504, "ts": 1716454223193295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223193296, "dur": 0, "args": { "External id": 119047, "cbid": 203, "correlation": 119047 } }, { "ph": "f", "id": 119047, "pid": 76337, "tid": -914061504, "ts": 1716454223193296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223193297, "dur": 0, "args": { "External id": 119048, "cbid": 205, "correlation": 119048 } }, { "ph": "f", "id": 119048, "pid": 76337, "tid": -914061504, "ts": 1716454223193297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223252094, "dur": 12, "args": { "External id": 119052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119052, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119052, "pid": 5, "tid": 7, "ts": 1716454223252094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193310, "dur": 12, "args": { "External id": 119052, "cbid": 211, "correlation": 119052 } }, { "ph": "s", "id": 119052, "pid": 76337, "tid": -914061504, "ts": 1716454223193310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223252107, "dur": 4, "args": { "External id": 119054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119054, "pid": 5, "tid": 7, "ts": 1716454223252107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193327, "dur": 6, "args": { "External id": 119054, "cbid": 211, "correlation": 119054 } }, { "ph": "s", "id": 119054, "pid": 76337, "tid": -914061504, "ts": 1716454223193327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223193336, "dur": 0, "args": { "External id": 119055, "cbid": 51, "correlation": 119055 } }, { "ph": "s", "id": 119055, "pid": 76337, "tid": -914061504, "ts": 1716454223193336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223252112, "dur": 91, "args": { "External id": 119056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119056, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 119056, "pid": 5, "tid": 7, "ts": 1716454223252112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193337, "dur": 5, "args": { "External id": 119056, "cbid": 211, "correlation": 119056 } }, { "ph": "s", "id": 119056, "pid": 76337, "tid": -914061504, "ts": 1716454223193337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223252204, "dur": 16, "args": { "External id": 119061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119061, "pid": 5, "tid": 7, "ts": 1716454223252204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193363, "dur": 9, "args": { "External id": 119061, "cbid": 211, "correlation": 119061 } }, { "ph": "s", "id": 119061, "pid": 76337, "tid": -914061504, "ts": 1716454223193363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223252221, "dur": 83, "args": { "External id": 119070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119070, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119070, "pid": 5, "tid": 7, "ts": 1716454223252221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193445, "dur": 14, "args": { "External id": 119070, "cbid": 211, "correlation": 119070 } }, { "ph": "s", "id": 119070, "pid": 76337, "tid": -914061504, "ts": 1716454223193445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223252306, "dur": 31, "args": { "External id": 119092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119092, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119092, "pid": 5, "tid": 7, "ts": 1716454223252306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193503, "dur": 10, "args": { "External id": 119092, "cbid": 211, "correlation": 119092 } }, { "ph": "s", "id": 119092, "pid": 76337, "tid": -914061504, "ts": 1716454223193503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223193597, "dur": 1, "args": { "External id": 119103, "cbid": 251, "correlation": 119103 } }, { "ph": "f", "id": 119103, "pid": 76337, "tid": -914061504, "ts": 1716454223193597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223252338, "dur": 140, "args": { "External id": 119104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119104, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119104, "pid": 5, "tid": 7, "ts": 1716454223252338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193603, "dur": 13, "args": { "External id": 119104, "cbid": 211, "correlation": 119104 } }, { "ph": "s", "id": 119104, "pid": 76337, "tid": -914061504, "ts": 1716454223193603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223193673, "dur": 1, "args": { "External id": 119115, "cbid": 251, "correlation": 119115 } }, { "ph": "f", "id": 119115, "pid": 76337, "tid": -914061504, "ts": 1716454223193673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223252479, "dur": 155, "args": { "External id": 119116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119116, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119116, "pid": 5, "tid": 7, "ts": 1716454223252479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193678, "dur": 12, "args": { "External id": 119116, "cbid": 211, "correlation": 119116 } }, { "ph": "s", "id": 119116, "pid": 76337, "tid": -914061504, "ts": 1716454223193678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223193742, "dur": 1, "args": { "External id": 119127, "cbid": 251, "correlation": 119127 } }, { "ph": "f", "id": 119127, "pid": 76337, "tid": -914061504, "ts": 1716454223193742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223252635, "dur": 157, "args": { "External id": 119128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119128, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119128, "pid": 5, "tid": 7, "ts": 1716454223252635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193746, "dur": 11, "args": { "External id": 119128, "cbid": 211, "correlation": 119128 } }, { "ph": "s", "id": 119128, "pid": 76337, "tid": -914061504, "ts": 1716454223193746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223252794, "dur": 334, "args": { "External id": 119153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119153, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119153, "pid": 5, "tid": 7, "ts": 1716454223252794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193832, "dur": 13, "args": { "External id": 119153, "cbid": 211, "correlation": 119153 } }, { "ph": "s", "id": 119153, "pid": 76337, "tid": -914061504, "ts": 1716454223193832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223193934, "dur": 1, "args": { "External id": 119171, "cbid": 251, "correlation": 119171 } }, { "ph": "f", "id": 119171, "pid": 76337, "tid": -914061504, "ts": 1716454223193934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223253129, "dur": 169, "args": { "External id": 119173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119173, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119173, "pid": 5, "tid": 7, "ts": 1716454223253129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223193940, "dur": 13, "args": { "External id": 119173, "cbid": 211, "correlation": 119173 } }, { "ph": "s", "id": 119173, "pid": 76337, "tid": -914061504, "ts": 1716454223193940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223253299, "dur": 19, "args": { "External id": 119181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119181, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119181, "pid": 5, "tid": 7, "ts": 1716454223253299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194019, "dur": 13, "args": { "External id": 119181, "cbid": 211, "correlation": 119181 } }, { "ph": "s", "id": 119181, "pid": 76337, "tid": -914061504, "ts": 1716454223194019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223253320, "dur": 28, "args": { "External id": 119189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119189, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119189, "pid": 5, "tid": 7, "ts": 1716454223253320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194060, "dur": 8, "args": { "External id": 119189, "cbid": 211, "correlation": 119189 } }, { "ph": "s", "id": 119189, "pid": 76337, "tid": -914061504, "ts": 1716454223194060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223253349, "dur": 19, "args": { "External id": 119200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119200, "pid": 5, "tid": 7, "ts": 1716454223253349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194133, "dur": 14, "args": { "External id": 119200, "cbid": 211, "correlation": 119200 } }, { "ph": "s", "id": 119200, "pid": 76337, "tid": -914061504, "ts": 1716454223194133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223253369, "dur": 16, "args": { "External id": 119222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119222, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119222, "pid": 5, "tid": 7, "ts": 1716454223253369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194165, "dur": 7, "args": { "External id": 119222, "cbid": 211, "correlation": 119222 } }, { "ph": "s", "id": 119222, "pid": 76337, "tid": -914061504, "ts": 1716454223194165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194251, "dur": 1, "args": { "External id": 119233, "cbid": 251, "correlation": 119233 } }, { "ph": "f", "id": 119233, "pid": 76337, "tid": -914061504, "ts": 1716454223194251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223253386, "dur": 89, "args": { "External id": 119234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119234, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119234, "pid": 5, "tid": 7, "ts": 1716454223253386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194256, "dur": 13, "args": { "External id": 119234, "cbid": 211, "correlation": 119234 } }, { "ph": "s", "id": 119234, "pid": 76337, "tid": -914061504, "ts": 1716454223194256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194328, "dur": 1, "args": { "External id": 119245, "cbid": 251, "correlation": 119245 } }, { "ph": "f", "id": 119245, "pid": 76337, "tid": -914061504, "ts": 1716454223194328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194332, "dur": 0, "args": { "External id": 119246, "cbid": 251, "correlation": 119246 } }, { "ph": "f", "id": 119246, "pid": 76337, "tid": -914061504, "ts": 1716454223194332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223253476, "dur": 13, "args": { "External id": 119247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119247, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119247, "pid": 5, "tid": 7, "ts": 1716454223253476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194334, "dur": 13, "args": { "External id": 119247, "cbid": 211, "correlation": 119247 } }, { "ph": "s", "id": 119247, "pid": 76337, "tid": -914061504, "ts": 1716454223194334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223253490, "dur": 6, "args": { "External id": 119249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119249, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119249, "pid": 5, "tid": 7, "ts": 1716454223253490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194348, "dur": 6, "args": { "External id": 119249, "cbid": 211, "correlation": 119249 } }, { "ph": "s", "id": 119249, "pid": 76337, "tid": -914061504, "ts": 1716454223194348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194406, "dur": 1, "args": { "External id": 119260, "cbid": 251, "correlation": 119260 } }, { "ph": "f", "id": 119260, "pid": 76337, "tid": -914061504, "ts": 1716454223194406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194410, "dur": 0, "args": { "External id": 119261, "cbid": 251, "correlation": 119261 } }, { "ph": "f", "id": 119261, "pid": 76337, "tid": -914061504, "ts": 1716454223194410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223253498, "dur": 8, "args": { "External id": 119262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119262, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119262, "pid": 5, "tid": 7, "ts": 1716454223253498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194411, "dur": 11, "args": { "External id": 119262, "cbid": 211, "correlation": 119262 } }, { "ph": "s", "id": 119262, "pid": 76337, "tid": -914061504, "ts": 1716454223194411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223253507, "dur": 4, "args": { "External id": 119264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119264, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119264, "pid": 5, "tid": 7, "ts": 1716454223253507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194424, "dur": 5, "args": { "External id": 119264, "cbid": 211, "correlation": 119264 } }, { "ph": "s", "id": 119264, "pid": 76337, "tid": -914061504, "ts": 1716454223194424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223253512, "dur": 56, "args": { "External id": 119289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119289, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119289, "pid": 5, "tid": 7, "ts": 1716454223253512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194501, "dur": 12, "args": { "External id": 119289, "cbid": 211, "correlation": 119289 } }, { "ph": "s", "id": 119289, "pid": 76337, "tid": -914061504, "ts": 1716454223194501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194599, "dur": 2, "args": { "External id": 119307, "cbid": 251, "correlation": 119307 } }, { "ph": "f", "id": 119307, "pid": 76337, "tid": -914061504, "ts": 1716454223194599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223253569, "dur": 93, "args": { "External id": 119309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119309, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119309, "pid": 5, "tid": 7, "ts": 1716454223253569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194605, "dur": 14, "args": { "External id": 119309, "cbid": 211, "correlation": 119309 } }, { "ph": "s", "id": 119309, "pid": 76337, "tid": -914061504, "ts": 1716454223194605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223253663, "dur": 9, "args": { "External id": 119317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119317, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119317, "pid": 5, "tid": 7, "ts": 1716454223253663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194675, "dur": 12, "args": { "External id": 119317, "cbid": 211, "correlation": 119317 } }, { "ph": "s", "id": 119317, "pid": 76337, "tid": -914061504, "ts": 1716454223194675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223253674, "dur": 21, "args": { "External id": 119325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119325, "pid": 5, "tid": 7, "ts": 1716454223253674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194717, "dur": 9, "args": { "External id": 119325, "cbid": 211, "correlation": 119325 } }, { "ph": "s", "id": 119325, "pid": 76337, "tid": -914061504, "ts": 1716454223194717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223253696, "dur": 17, "args": { "External id": 119347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119347, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119347, "pid": 5, "tid": 7, "ts": 1716454223253696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194768, "dur": 10, "args": { "External id": 119347, "cbid": 211, "correlation": 119347 } }, { "ph": "s", "id": 119347, "pid": 76337, "tid": -914061504, "ts": 1716454223194768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194856, "dur": 1, "args": { "External id": 119363, "cbid": 251, "correlation": 119363 } }, { "ph": "f", "id": 119363, "pid": 76337, "tid": -914061504, "ts": 1716454223194856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223194861, "dur": 0, "args": { "External id": 119365, "cbid": 251, "correlation": 119365 } }, { "ph": "f", "id": 119365, "pid": 76337, "tid": -914061504, "ts": 1716454223194861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223253715, "dur": 494, "args": { "External id": 119366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119366, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119366, "pid": 5, "tid": 7, "ts": 1716454223253715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194863, "dur": 13, "args": { "External id": 119366, "cbid": 211, "correlation": 119366 } }, { "ph": "s", "id": 119366, "pid": 76337, "tid": -914061504, "ts": 1716454223194863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223254210, "dur": 66, "args": { "External id": 119374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119374, "pid": 5, "tid": 7, "ts": 1716454223254210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194929, "dur": 13, "args": { "External id": 119374, "cbid": 211, "correlation": 119374 } }, { "ph": "s", "id": 119374, "pid": 76337, "tid": -914061504, "ts": 1716454223194929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223254277, "dur": 66, "args": { "External id": 119382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119382, "pid": 5, "tid": 7, "ts": 1716454223254277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223194961, "dur": 8, "args": { "External id": 119382, "cbid": 211, "correlation": 119382 } }, { "ph": "s", "id": 119382, "pid": 76337, "tid": -914061504, "ts": 1716454223194961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223195050, "dur": 1, "args": { "External id": 119398, "cbid": 251, "correlation": 119398 } }, { "ph": "f", "id": 119398, "pid": 76337, "tid": -914061504, "ts": 1716454223195050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223254345, "dur": 1, "args": { "External id": 119400, "device": 5, "context": 1, "stream": 7, "correlation": 119400, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 119400, "pid": 5, "tid": 7, "ts": 1716454223254345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223195055, "dur": 10, "args": { "External id": 119400, "cbid": 51, "correlation": 119400 } }, { "ph": "s", "id": 119400, "pid": 76337, "tid": -914061504, "ts": 1716454223195055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223254348, "dur": 274, "args": { "External id": 119401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119401, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119401, "pid": 5, "tid": 7, "ts": 1716454223254348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195066, "dur": 11, "args": { "External id": 119401, "cbid": 211, "correlation": 119401 } }, { "ph": "s", "id": 119401, "pid": 76337, "tid": -914061504, "ts": 1716454223195066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223254624, "dur": 14, "args": { "External id": 119409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119409, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119409, "pid": 5, "tid": 7, "ts": 1716454223254624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195109, "dur": 10, "args": { "External id": 119409, "cbid": 211, "correlation": 119409 } }, { "ph": "s", "id": 119409, "pid": 76337, "tid": -914061504, "ts": 1716454223195109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223254639, "dur": 38, "args": { "External id": 119420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119420, "pid": 5, "tid": 7, "ts": 1716454223254639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195177, "dur": 13, "args": { "External id": 119420, "cbid": 211, "correlation": 119420 } }, { "ph": "s", "id": 119420, "pid": 76337, "tid": -914061504, "ts": 1716454223195177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223195243, "dur": 0, "args": { "External id": 119432, "cbid": 317, "correlation": 119432 } }, { "ph": "f", "id": 119432, "pid": 76337, "tid": -914061504, "ts": 1716454223195243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223195244, "dur": 0, "args": { "External id": 119433, "cbid": 203, "correlation": 119433 } }, { "ph": "f", "id": 119433, "pid": 76337, "tid": -914061504, "ts": 1716454223195244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223195245, "dur": 0, "args": { "External id": 119434, "cbid": 205, "correlation": 119434 } }, { "ph": "f", "id": 119434, "pid": 76337, "tid": -914061504, "ts": 1716454223195245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223254678, "dur": 13, "args": { "External id": 119438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119438, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119438, "pid": 5, "tid": 7, "ts": 1716454223254678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195259, "dur": 12, "args": { "External id": 119438, "cbid": 211, "correlation": 119438 } }, { "ph": "s", "id": 119438, "pid": 76337, "tid": -914061504, "ts": 1716454223195259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223254692, "dur": 4, "args": { "External id": 119440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119440, "pid": 5, "tid": 7, "ts": 1716454223254692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195276, "dur": 6, "args": { "External id": 119440, "cbid": 211, "correlation": 119440 } }, { "ph": "s", "id": 119440, "pid": 76337, "tid": -914061504, "ts": 1716454223195276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223195284, "dur": 0, "args": { "External id": 119441, "cbid": 51, "correlation": 119441 } }, { "ph": "s", "id": 119441, "pid": 76337, "tid": -914061504, "ts": 1716454223195284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223254698, "dur": 98, "args": { "External id": 119442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119442, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 119442, "pid": 5, "tid": 7, "ts": 1716454223254698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195285, "dur": 6, "args": { "External id": 119442, "cbid": 211, "correlation": 119442 } }, { "ph": "s", "id": 119442, "pid": 76337, "tid": -914061504, "ts": 1716454223195285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223254797, "dur": 17, "args": { "External id": 119447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119447, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119447, "pid": 5, "tid": 7, "ts": 1716454223254797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195313, "dur": 8, "args": { "External id": 119447, "cbid": 211, "correlation": 119447 } }, { "ph": "s", "id": 119447, "pid": 76337, "tid": -914061504, "ts": 1716454223195313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223254815, "dur": 12, "args": { "External id": 119455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119455, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119455, "pid": 5, "tid": 7, "ts": 1716454223254815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195345, "dur": 8, "args": { "External id": 119455, "cbid": 211, "correlation": 119455 } }, { "ph": "s", "id": 119455, "pid": 76337, "tid": -914061504, "ts": 1716454223195345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223254828, "dur": 31, "args": { "External id": 119464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119464, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119464, "pid": 5, "tid": 7, "ts": 1716454223254828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195383, "dur": 11, "args": { "External id": 119464, "cbid": 211, "correlation": 119464 } }, { "ph": "s", "id": 119464, "pid": 76337, "tid": -914061504, "ts": 1716454223195383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223254860, "dur": 31, "args": { "External id": 119484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119484, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 119484, "pid": 5, "tid": 7, "ts": 1716454223254860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195455, "dur": 11, "args": { "External id": 119484, "cbid": 211, "correlation": 119484 } }, { "ph": "s", "id": 119484, "pid": 76337, "tid": -914061504, "ts": 1716454223195455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223254893, "dur": 5, "args": { "External id": 119496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119496, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119496, "pid": 5, "tid": 7, "ts": 1716454223254893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195476, "dur": 6, "args": { "External id": 119496, "cbid": 211, "correlation": 119496 } }, { "ph": "s", "id": 119496, "pid": 76337, "tid": -914061504, "ts": 1716454223195476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223254899, "dur": 31, "args": { "External id": 119499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119499, "pid": 5, "tid": 7, "ts": 1716454223254899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195495, "dur": 6, "args": { "External id": 119499, "cbid": 211, "correlation": 119499 } }, { "ph": "s", "id": 119499, "pid": 76337, "tid": -914061504, "ts": 1716454223195495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223254931, "dur": 21, "args": { "External id": 119508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119508, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119508, "pid": 5, "tid": 7, "ts": 1716454223254931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195534, "dur": 11, "args": { "External id": 119508, "cbid": 211, "correlation": 119508 } }, { "ph": "s", "id": 119508, "pid": 76337, "tid": -914061504, "ts": 1716454223195534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223195586, "dur": 0, "args": { "External id": 119518, "cbid": 317, "correlation": 119518 } }, { "ph": "f", "id": 119518, "pid": 76337, "tid": -914061504, "ts": 1716454223195586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223195587, "dur": 0, "args": { "External id": 119519, "cbid": 203, "correlation": 119519 } }, { "ph": "f", "id": 119519, "pid": 76337, "tid": -914061504, "ts": 1716454223195587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223195588, "dur": 0, "args": { "External id": 119520, "cbid": 205, "correlation": 119520 } }, { "ph": "f", "id": 119520, "pid": 76337, "tid": -914061504, "ts": 1716454223195588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223254954, "dur": 22, "args": { "External id": 119524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119524, "pid": 5, "tid": 7, "ts": 1716454223254954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195602, "dur": 11, "args": { "External id": 119524, "cbid": 211, "correlation": 119524 } }, { "ph": "s", "id": 119524, "pid": 76337, "tid": -914061504, "ts": 1716454223195602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223254977, "dur": 320, "args": { "External id": 119526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119526, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119526, "pid": 5, "tid": 7, "ts": 1716454223254977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195616, "dur": 5, "args": { "External id": 119526, "cbid": 211, "correlation": 119526 } }, { "ph": "s", "id": 119526, "pid": 76337, "tid": -914061504, "ts": 1716454223195616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223255299, "dur": 1, "args": { "External id": 119528, "device": 5, "context": 1, "stream": 7, "correlation": 119528, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 119528, "pid": 5, "tid": 7, "ts": 1716454223255299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223195627, "dur": 6, "args": { "External id": 119528, "cbid": 51, "correlation": 119528 } }, { "ph": "s", "id": 119528, "pid": 76337, "tid": -914061504, "ts": 1716454223195627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223255303, "dur": 1254, "args": { "External id": 119529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119529, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119529, "pid": 5, "tid": 7, "ts": 1716454223255303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195634, "dur": 6, "args": { "External id": 119529, "cbid": 211, "correlation": 119529 } }, { "ph": "s", "id": 119529, "pid": 76337, "tid": -914061504, "ts": 1716454223195634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223256559, "dur": 13, "args": { "External id": 119531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119531, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119531, "pid": 5, "tid": 7, "ts": 1716454223256559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195645, "dur": 5, "args": { "External id": 119531, "cbid": 211, "correlation": 119531 } }, { "ph": "s", "id": 119531, "pid": 76337, "tid": -914061504, "ts": 1716454223195645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223256573, "dur": 15, "args": { "External id": 119537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119537, "pid": 5, "tid": 7, "ts": 1716454223256573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195673, "dur": 8, "args": { "External id": 119537, "cbid": 211, "correlation": 119537 } }, { "ph": "s", "id": 119537, "pid": 76337, "tid": -914061504, "ts": 1716454223195673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223256589, "dur": 4, "args": { "External id": 119545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119545, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 119545, "pid": 5, "tid": 7, "ts": 1716454223256589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195717, "dur": 10, "args": { "External id": 119545, "cbid": 211, "correlation": 119545 } }, { "ph": "s", "id": 119545, "pid": 76337, "tid": -914061504, "ts": 1716454223195717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223195782, "dur": 1, "args": { "External id": 119561, "cbid": 251, "correlation": 119561 } }, { "ph": "f", "id": 119561, "pid": 76337, "tid": -914061504, "ts": 1716454223195782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223195787, "dur": 0, "args": { "External id": 119563, "cbid": 251, "correlation": 119563 } }, { "ph": "f", "id": 119563, "pid": 76337, "tid": -914061504, "ts": 1716454223195787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223256595, "dur": 13, "args": { "External id": 119564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119564, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119564, "pid": 5, "tid": 7, "ts": 1716454223256595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195789, "dur": 12, "args": { "External id": 119564, "cbid": 211, "correlation": 119564 } }, { "ph": "s", "id": 119564, "pid": 76337, "tid": -914061504, "ts": 1716454223195789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223256609, "dur": 5, "args": { "External id": 119566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119566, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119566, "pid": 5, "tid": 7, "ts": 1716454223256609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195803, "dur": 5, "args": { "External id": 119566, "cbid": 211, "correlation": 119566 } }, { "ph": "s", "id": 119566, "pid": 76337, "tid": -914061504, "ts": 1716454223195803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223256615, "dur": 17, "args": { "External id": 119576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119576, "pid": 5, "tid": 7, "ts": 1716454223256615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195861, "dur": 12, "args": { "External id": 119576, "cbid": 211, "correlation": 119576 } }, { "ph": "s", "id": 119576, "pid": 76337, "tid": -914061504, "ts": 1716454223195861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223256634, "dur": 17, "args": { "External id": 119596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119596, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 119596, "pid": 5, "tid": 7, "ts": 1716454223256634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195927, "dur": 11, "args": { "External id": 119596, "cbid": 211, "correlation": 119596 } }, { "ph": "s", "id": 119596, "pid": 76337, "tid": -914061504, "ts": 1716454223195927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223256653, "dur": 4, "args": { "External id": 119608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119608, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 119608, "pid": 5, "tid": 7, "ts": 1716454223256653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195949, "dur": 6, "args": { "External id": 119608, "cbid": 211, "correlation": 119608 } }, { "ph": "s", "id": 119608, "pid": 76337, "tid": -914061504, "ts": 1716454223195949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223256658, "dur": 17, "args": { "External id": 119611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119611, "pid": 5, "tid": 7, "ts": 1716454223256658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223195967, "dur": 14, "args": { "External id": 119611, "cbid": 211, "correlation": 119611 } }, { "ph": "s", "id": 119611, "pid": 76337, "tid": -914061504, "ts": 1716454223195967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223256677, "dur": 12, "args": { "External id": 119620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119620, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119620, "pid": 5, "tid": 7, "ts": 1716454223256677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196017, "dur": 10, "args": { "External id": 119620, "cbid": 211, "correlation": 119620 } }, { "ph": "s", "id": 119620, "pid": 76337, "tid": -914061504, "ts": 1716454223196017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223196080, "dur": 0, "args": { "External id": 119630, "cbid": 317, "correlation": 119630 } }, { "ph": "f", "id": 119630, "pid": 76337, "tid": -914061504, "ts": 1716454223196080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223196081, "dur": 0, "args": { "External id": 119631, "cbid": 203, "correlation": 119631 } }, { "ph": "f", "id": 119631, "pid": 76337, "tid": -914061504, "ts": 1716454223196081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223196081, "dur": 0, "args": { "External id": 119632, "cbid": 205, "correlation": 119632 } }, { "ph": "f", "id": 119632, "pid": 76337, "tid": -914061504, "ts": 1716454223196081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223256690, "dur": 12, "args": { "External id": 119636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119636, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119636, "pid": 5, "tid": 7, "ts": 1716454223256690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196095, "dur": 12, "args": { "External id": 119636, "cbid": 211, "correlation": 119636 } }, { "ph": "s", "id": 119636, "pid": 76337, "tid": -914061504, "ts": 1716454223196095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223256703, "dur": 162, "args": { "External id": 119638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119638, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119638, "pid": 5, "tid": 7, "ts": 1716454223256703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196110, "dur": 5, "args": { "External id": 119638, "cbid": 211, "correlation": 119638 } }, { "ph": "s", "id": 119638, "pid": 76337, "tid": -914061504, "ts": 1716454223196110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223256868, "dur": 1, "args": { "External id": 119640, "device": 5, "context": 1, "stream": 7, "correlation": 119640, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 119640, "pid": 5, "tid": 7, "ts": 1716454223256868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223196121, "dur": 6, "args": { "External id": 119640, "cbid": 51, "correlation": 119640 } }, { "ph": "s", "id": 119640, "pid": 76337, "tid": -914061504, "ts": 1716454223196121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223256871, "dur": 649, "args": { "External id": 119641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119641, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119641, "pid": 5, "tid": 7, "ts": 1716454223256871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196128, "dur": 6, "args": { "External id": 119641, "cbid": 211, "correlation": 119641 } }, { "ph": "s", "id": 119641, "pid": 76337, "tid": -914061504, "ts": 1716454223196128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223257521, "dur": 12, "args": { "External id": 119643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119643, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119643, "pid": 5, "tid": 7, "ts": 1716454223257521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196139, "dur": 5, "args": { "External id": 119643, "cbid": 211, "correlation": 119643 } }, { "ph": "s", "id": 119643, "pid": 76337, "tid": -914061504, "ts": 1716454223196139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223257535, "dur": 15, "args": { "External id": 119649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119649, "pid": 5, "tid": 7, "ts": 1716454223257535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196168, "dur": 8, "args": { "External id": 119649, "cbid": 211, "correlation": 119649 } }, { "ph": "s", "id": 119649, "pid": 76337, "tid": -914061504, "ts": 1716454223196168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223196226, "dur": 0, "args": { "External id": 119659, "cbid": 317, "correlation": 119659 } }, { "ph": "f", "id": 119659, "pid": 76337, "tid": -914061504, "ts": 1716454223196226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223196227, "dur": 0, "args": { "External id": 119660, "cbid": 203, "correlation": 119660 } }, { "ph": "f", "id": 119660, "pid": 76337, "tid": -914061504, "ts": 1716454223196227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223196228, "dur": 0, "args": { "External id": 119661, "cbid": 205, "correlation": 119661 } }, { "ph": "f", "id": 119661, "pid": 76337, "tid": -914061504, "ts": 1716454223196228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223257552, "dur": 21, "args": { "External id": 119665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119665, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119665, "pid": 5, "tid": 7, "ts": 1716454223257552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196239, "dur": 12, "args": { "External id": 119665, "cbid": 211, "correlation": 119665 } }, { "ph": "s", "id": 119665, "pid": 76337, "tid": -914061504, "ts": 1716454223196239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223257574, "dur": 4, "args": { "External id": 119667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119667, "pid": 5, "tid": 7, "ts": 1716454223257574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196256, "dur": 6, "args": { "External id": 119667, "cbid": 211, "correlation": 119667 } }, { "ph": "s", "id": 119667, "pid": 76337, "tid": -914061504, "ts": 1716454223196256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223196264, "dur": 0, "args": { "External id": 119668, "cbid": 51, "correlation": 119668 } }, { "ph": "s", "id": 119668, "pid": 76337, "tid": -914061504, "ts": 1716454223196264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223257579, "dur": 170, "args": { "External id": 119669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119669, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 119669, "pid": 5, "tid": 7, "ts": 1716454223257579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196265, "dur": 5, "args": { "External id": 119669, "cbid": 211, "correlation": 119669 } }, { "ph": "s", "id": 119669, "pid": 76337, "tid": -914061504, "ts": 1716454223196265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223257751, "dur": 15, "args": { "External id": 119674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119674, "pid": 5, "tid": 7, "ts": 1716454223257751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196291, "dur": 8, "args": { "External id": 119674, "cbid": 211, "correlation": 119674 } }, { "ph": "s", "id": 119674, "pid": 76337, "tid": -914061504, "ts": 1716454223196291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223257767, "dur": 13, "args": { "External id": 119682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119682, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119682, "pid": 5, "tid": 7, "ts": 1716454223257767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196320, "dur": 8, "args": { "External id": 119682, "cbid": 211, "correlation": 119682 } }, { "ph": "s", "id": 119682, "pid": 76337, "tid": -914061504, "ts": 1716454223196320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223257781, "dur": 10, "args": { "External id": 119690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119690, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119690, "pid": 5, "tid": 7, "ts": 1716454223257781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196347, "dur": 9, "args": { "External id": 119690, "cbid": 211, "correlation": 119690 } }, { "ph": "s", "id": 119690, "pid": 76337, "tid": -914061504, "ts": 1716454223196347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223257793, "dur": 19, "args": { "External id": 119710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119710, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 119710, "pid": 5, "tid": 7, "ts": 1716454223257793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196430, "dur": 12, "args": { "External id": 119710, "cbid": 211, "correlation": 119710 } }, { "ph": "s", "id": 119710, "pid": 76337, "tid": -914061504, "ts": 1716454223196430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223257813, "dur": 4, "args": { "External id": 119722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119722, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 119722, "pid": 5, "tid": 7, "ts": 1716454223257813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196452, "dur": 7, "args": { "External id": 119722, "cbid": 211, "correlation": 119722 } }, { "ph": "s", "id": 119722, "pid": 76337, "tid": -914061504, "ts": 1716454223196452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223257819, "dur": 17, "args": { "External id": 119725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119725, "pid": 5, "tid": 7, "ts": 1716454223257819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196470, "dur": 6, "args": { "External id": 119725, "cbid": 211, "correlation": 119725 } }, { "ph": "s", "id": 119725, "pid": 76337, "tid": -914061504, "ts": 1716454223196470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223196528, "dur": 0, "args": { "External id": 119736, "cbid": 317, "correlation": 119736 } }, { "ph": "f", "id": 119736, "pid": 76337, "tid": -914061504, "ts": 1716454223196528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223196528, "dur": 0, "args": { "External id": 119737, "cbid": 203, "correlation": 119737 } }, { "ph": "f", "id": 119737, "pid": 76337, "tid": -914061504, "ts": 1716454223196528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223196529, "dur": 0, "args": { "External id": 119738, "cbid": 205, "correlation": 119738 } }, { "ph": "f", "id": 119738, "pid": 76337, "tid": -914061504, "ts": 1716454223196529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223257837, "dur": 11, "args": { "External id": 119742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119742, "pid": 5, "tid": 7, "ts": 1716454223257837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196542, "dur": 12, "args": { "External id": 119742, "cbid": 211, "correlation": 119742 } }, { "ph": "s", "id": 119742, "pid": 76337, "tid": -914061504, "ts": 1716454223196542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223257850, "dur": 3, "args": { "External id": 119744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119744, "pid": 5, "tid": 7, "ts": 1716454223257850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196559, "dur": 6, "args": { "External id": 119744, "cbid": 211, "correlation": 119744 } }, { "ph": "s", "id": 119744, "pid": 76337, "tid": -914061504, "ts": 1716454223196559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223196567, "dur": 0, "args": { "External id": 119745, "cbid": 51, "correlation": 119745 } }, { "ph": "s", "id": 119745, "pid": 76337, "tid": -914061504, "ts": 1716454223196567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223257855, "dur": 92, "args": { "External id": 119746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119746, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 119746, "pid": 5, "tid": 7, "ts": 1716454223257855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196568, "dur": 5, "args": { "External id": 119746, "cbid": 211, "correlation": 119746 } }, { "ph": "s", "id": 119746, "pid": 76337, "tid": -914061504, "ts": 1716454223196568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223257948, "dur": 16, "args": { "External id": 119751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119751, "pid": 5, "tid": 7, "ts": 1716454223257948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196594, "dur": 9, "args": { "External id": 119751, "cbid": 211, "correlation": 119751 } }, { "ph": "s", "id": 119751, "pid": 76337, "tid": -914061504, "ts": 1716454223196594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223257964, "dur": 83, "args": { "External id": 119760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119760, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119760, "pid": 5, "tid": 7, "ts": 1716454223257964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196674, "dur": 14, "args": { "External id": 119760, "cbid": 211, "correlation": 119760 } }, { "ph": "s", "id": 119760, "pid": 76337, "tid": -914061504, "ts": 1716454223196674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223258049, "dur": 30, "args": { "External id": 119782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119782, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119782, "pid": 5, "tid": 7, "ts": 1716454223258049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196731, "dur": 10, "args": { "External id": 119782, "cbid": 211, "correlation": 119782 } }, { "ph": "s", "id": 119782, "pid": 76337, "tid": -914061504, "ts": 1716454223196731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223196819, "dur": 1, "args": { "External id": 119793, "cbid": 251, "correlation": 119793 } }, { "ph": "f", "id": 119793, "pid": 76337, "tid": -914061504, "ts": 1716454223196819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223258080, "dur": 164, "args": { "External id": 119794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119794, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119794, "pid": 5, "tid": 7, "ts": 1716454223258080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196824, "dur": 13, "args": { "External id": 119794, "cbid": 211, "correlation": 119794 } }, { "ph": "s", "id": 119794, "pid": 76337, "tid": -914061504, "ts": 1716454223196824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223196895, "dur": 1, "args": { "External id": 119805, "cbid": 251, "correlation": 119805 } }, { "ph": "f", "id": 119805, "pid": 76337, "tid": -914061504, "ts": 1716454223196895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223258245, "dur": 160, "args": { "External id": 119806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119806, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119806, "pid": 5, "tid": 7, "ts": 1716454223258245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196899, "dur": 12, "args": { "External id": 119806, "cbid": 211, "correlation": 119806 } }, { "ph": "s", "id": 119806, "pid": 76337, "tid": -914061504, "ts": 1716454223196899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223196966, "dur": 1, "args": { "External id": 119817, "cbid": 251, "correlation": 119817 } }, { "ph": "f", "id": 119817, "pid": 76337, "tid": -914061504, "ts": 1716454223196966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223258407, "dur": 157, "args": { "External id": 119818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119818, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119818, "pid": 5, "tid": 7, "ts": 1716454223258407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223196970, "dur": 18, "args": { "External id": 119818, "cbid": 211, "correlation": 119818 } }, { "ph": "s", "id": 119818, "pid": 76337, "tid": -914061504, "ts": 1716454223196970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223258565, "dur": 338, "args": { "External id": 119843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119843, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119843, "pid": 5, "tid": 7, "ts": 1716454223258565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197062, "dur": 12, "args": { "External id": 119843, "cbid": 211, "correlation": 119843 } }, { "ph": "s", "id": 119843, "pid": 76337, "tid": -914061504, "ts": 1716454223197062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197160, "dur": 1, "args": { "External id": 119861, "cbid": 251, "correlation": 119861 } }, { "ph": "f", "id": 119861, "pid": 76337, "tid": -914061504, "ts": 1716454223197160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223258904, "dur": 167, "args": { "External id": 119863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119863, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119863, "pid": 5, "tid": 7, "ts": 1716454223258904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197167, "dur": 13, "args": { "External id": 119863, "cbid": 211, "correlation": 119863 } }, { "ph": "s", "id": 119863, "pid": 76337, "tid": -914061504, "ts": 1716454223197167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223259073, "dur": 19, "args": { "External id": 119871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119871, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119871, "pid": 5, "tid": 7, "ts": 1716454223259073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197237, "dur": 12, "args": { "External id": 119871, "cbid": 211, "correlation": 119871 } }, { "ph": "s", "id": 119871, "pid": 76337, "tid": -914061504, "ts": 1716454223197237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223259094, "dur": 27, "args": { "External id": 119879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119879, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119879, "pid": 5, "tid": 7, "ts": 1716454223259094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197277, "dur": 8, "args": { "External id": 119879, "cbid": 211, "correlation": 119879 } }, { "ph": "s", "id": 119879, "pid": 76337, "tid": -914061504, "ts": 1716454223197277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223259122, "dur": 19, "args": { "External id": 119890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119890, "pid": 5, "tid": 7, "ts": 1716454223259122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197348, "dur": 12, "args": { "External id": 119890, "cbid": 211, "correlation": 119890 } }, { "ph": "s", "id": 119890, "pid": 76337, "tid": -914061504, "ts": 1716454223197348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223259143, "dur": 16, "args": { "External id": 119912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119912, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 119912, "pid": 5, "tid": 7, "ts": 1716454223259143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197380, "dur": 8, "args": { "External id": 119912, "cbid": 211, "correlation": 119912 } }, { "ph": "s", "id": 119912, "pid": 76337, "tid": -914061504, "ts": 1716454223197380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197465, "dur": 1, "args": { "External id": 119923, "cbid": 251, "correlation": 119923 } }, { "ph": "f", "id": 119923, "pid": 76337, "tid": -914061504, "ts": 1716454223197465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223259160, "dur": 89, "args": { "External id": 119924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119924, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119924, "pid": 5, "tid": 7, "ts": 1716454223259160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197470, "dur": 13, "args": { "External id": 119924, "cbid": 211, "correlation": 119924 } }, { "ph": "s", "id": 119924, "pid": 76337, "tid": -914061504, "ts": 1716454223197470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197539, "dur": 1, "args": { "External id": 119935, "cbid": 251, "correlation": 119935 } }, { "ph": "f", "id": 119935, "pid": 76337, "tid": -914061504, "ts": 1716454223197539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197543, "dur": 0, "args": { "External id": 119936, "cbid": 251, "correlation": 119936 } }, { "ph": "f", "id": 119936, "pid": 76337, "tid": -914061504, "ts": 1716454223197543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223259251, "dur": 12, "args": { "External id": 119937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119937, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119937, "pid": 5, "tid": 7, "ts": 1716454223259251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197545, "dur": 12, "args": { "External id": 119937, "cbid": 211, "correlation": 119937 } }, { "ph": "s", "id": 119937, "pid": 76337, "tid": -914061504, "ts": 1716454223197545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223259264, "dur": 6, "args": { "External id": 119939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119939, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119939, "pid": 5, "tid": 7, "ts": 1716454223259264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197558, "dur": 6, "args": { "External id": 119939, "cbid": 211, "correlation": 119939 } }, { "ph": "s", "id": 119939, "pid": 76337, "tid": -914061504, "ts": 1716454223197558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197615, "dur": 1, "args": { "External id": 119950, "cbid": 251, "correlation": 119950 } }, { "ph": "f", "id": 119950, "pid": 76337, "tid": -914061504, "ts": 1716454223197615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197619, "dur": 0, "args": { "External id": 119951, "cbid": 251, "correlation": 119951 } }, { "ph": "f", "id": 119951, "pid": 76337, "tid": -914061504, "ts": 1716454223197619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223259271, "dur": 9, "args": { "External id": 119952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119952, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119952, "pid": 5, "tid": 7, "ts": 1716454223259271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197621, "dur": 12, "args": { "External id": 119952, "cbid": 211, "correlation": 119952 } }, { "ph": "s", "id": 119952, "pid": 76337, "tid": -914061504, "ts": 1716454223197621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223259281, "dur": 4, "args": { "External id": 119954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119954, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119954, "pid": 5, "tid": 7, "ts": 1716454223259281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197634, "dur": 5, "args": { "External id": 119954, "cbid": 211, "correlation": 119954 } }, { "ph": "s", "id": 119954, "pid": 76337, "tid": -914061504, "ts": 1716454223197634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223259286, "dur": 55, "args": { "External id": 119979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119979, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 119979, "pid": 5, "tid": 7, "ts": 1716454223259286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197711, "dur": 13, "args": { "External id": 119979, "cbid": 211, "correlation": 119979 } }, { "ph": "s", "id": 119979, "pid": 76337, "tid": -914061504, "ts": 1716454223197711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223197809, "dur": 1, "args": { "External id": 119997, "cbid": 251, "correlation": 119997 } }, { "ph": "f", "id": 119997, "pid": 76337, "tid": -914061504, "ts": 1716454223197809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223259342, "dur": 91, "args": { "External id": 119999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 119999, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 119999, "pid": 5, "tid": 7, "ts": 1716454223259342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197815, "dur": 14, "args": { "External id": 119999, "cbid": 211, "correlation": 119999 } }, { "ph": "s", "id": 119999, "pid": 76337, "tid": -914061504, "ts": 1716454223197815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223259434, "dur": 9, "args": { "External id": 120007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120007, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120007, "pid": 5, "tid": 7, "ts": 1716454223259434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197886, "dur": 12, "args": { "External id": 120007, "cbid": 211, "correlation": 120007 } }, { "ph": "s", "id": 120007, "pid": 76337, "tid": -914061504, "ts": 1716454223197886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223259445, "dur": 20, "args": { "External id": 120015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120015, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120015, "pid": 5, "tid": 7, "ts": 1716454223259445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197926, "dur": 10, "args": { "External id": 120015, "cbid": 211, "correlation": 120015 } }, { "ph": "s", "id": 120015, "pid": 76337, "tid": -914061504, "ts": 1716454223197926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223259467, "dur": 18, "args": { "External id": 120037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120037, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120037, "pid": 5, "tid": 7, "ts": 1716454223259467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223197996, "dur": 11, "args": { "External id": 120037, "cbid": 211, "correlation": 120037 } }, { "ph": "s", "id": 120037, "pid": 76337, "tid": -914061504, "ts": 1716454223197996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223198086, "dur": 1, "args": { "External id": 120053, "cbid": 251, "correlation": 120053 } }, { "ph": "f", "id": 120053, "pid": 76337, "tid": -914061504, "ts": 1716454223198086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223198091, "dur": 0, "args": { "External id": 120055, "cbid": 251, "correlation": 120055 } }, { "ph": "f", "id": 120055, "pid": 76337, "tid": -914061504, "ts": 1716454223198091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223259486, "dur": 497, "args": { "External id": 120056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120056, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120056, "pid": 5, "tid": 7, "ts": 1716454223259486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198093, "dur": 12, "args": { "External id": 120056, "cbid": 211, "correlation": 120056 } }, { "ph": "s", "id": 120056, "pid": 76337, "tid": -914061504, "ts": 1716454223198093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223259984, "dur": 66, "args": { "External id": 120064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120064, "pid": 5, "tid": 7, "ts": 1716454223259984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198158, "dur": 12, "args": { "External id": 120064, "cbid": 211, "correlation": 120064 } }, { "ph": "s", "id": 120064, "pid": 76337, "tid": -914061504, "ts": 1716454223198158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223260052, "dur": 68, "args": { "External id": 120072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120072, "pid": 5, "tid": 7, "ts": 1716454223260052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198189, "dur": 8, "args": { "External id": 120072, "cbid": 211, "correlation": 120072 } }, { "ph": "s", "id": 120072, "pid": 76337, "tid": -914061504, "ts": 1716454223198189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223198268, "dur": 1, "args": { "External id": 120088, "cbid": 251, "correlation": 120088 } }, { "ph": "f", "id": 120088, "pid": 76337, "tid": -914061504, "ts": 1716454223198268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223260122, "dur": 1, "args": { "External id": 120090, "device": 5, "context": 1, "stream": 7, "correlation": 120090, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 120090, "pid": 5, "tid": 7, "ts": 1716454223260122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223198273, "dur": 10, "args": { "External id": 120090, "cbid": 51, "correlation": 120090 } }, { "ph": "s", "id": 120090, "pid": 76337, "tid": -914061504, "ts": 1716454223198273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223260126, "dur": 271, "args": { "External id": 120091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120091, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120091, "pid": 5, "tid": 7, "ts": 1716454223260126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198285, "dur": 11, "args": { "External id": 120091, "cbid": 211, "correlation": 120091 } }, { "ph": "s", "id": 120091, "pid": 76337, "tid": -914061504, "ts": 1716454223198285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223260398, "dur": 14, "args": { "External id": 120099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120099, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120099, "pid": 5, "tid": 7, "ts": 1716454223260398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198326, "dur": 11, "args": { "External id": 120099, "cbid": 211, "correlation": 120099 } }, { "ph": "s", "id": 120099, "pid": 76337, "tid": -914061504, "ts": 1716454223198326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223260413, "dur": 38, "args": { "External id": 120110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120110, "pid": 5, "tid": 7, "ts": 1716454223260413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198395, "dur": 12, "args": { "External id": 120110, "cbid": 211, "correlation": 120110 } }, { "ph": "s", "id": 120110, "pid": 76337, "tid": -914061504, "ts": 1716454223198395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223198459, "dur": 0, "args": { "External id": 120122, "cbid": 317, "correlation": 120122 } }, { "ph": "f", "id": 120122, "pid": 76337, "tid": -914061504, "ts": 1716454223198459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223198460, "dur": 0, "args": { "External id": 120123, "cbid": 203, "correlation": 120123 } }, { "ph": "f", "id": 120123, "pid": 76337, "tid": -914061504, "ts": 1716454223198460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223198460, "dur": 0, "args": { "External id": 120124, "cbid": 205, "correlation": 120124 } }, { "ph": "f", "id": 120124, "pid": 76337, "tid": -914061504, "ts": 1716454223198460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223260453, "dur": 15, "args": { "External id": 120128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120128, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120128, "pid": 5, "tid": 7, "ts": 1716454223260453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198475, "dur": 13, "args": { "External id": 120128, "cbid": 211, "correlation": 120128 } }, { "ph": "s", "id": 120128, "pid": 76337, "tid": -914061504, "ts": 1716454223198475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223260469, "dur": 4, "args": { "External id": 120130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120130, "pid": 5, "tid": 7, "ts": 1716454223260469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198493, "dur": 6, "args": { "External id": 120130, "cbid": 211, "correlation": 120130 } }, { "ph": "s", "id": 120130, "pid": 76337, "tid": -914061504, "ts": 1716454223198493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223198502, "dur": 0, "args": { "External id": 120131, "cbid": 51, "correlation": 120131 } }, { "ph": "s", "id": 120131, "pid": 76337, "tid": -914061504, "ts": 1716454223198502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223260474, "dur": 97, "args": { "External id": 120132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120132, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120132, "pid": 5, "tid": 7, "ts": 1716454223260474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198503, "dur": 5, "args": { "External id": 120132, "cbid": 211, "correlation": 120132 } }, { "ph": "s", "id": 120132, "pid": 76337, "tid": -914061504, "ts": 1716454223198503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223260572, "dur": 16, "args": { "External id": 120137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120137, "pid": 5, "tid": 7, "ts": 1716454223260572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198530, "dur": 9, "args": { "External id": 120137, "cbid": 211, "correlation": 120137 } }, { "ph": "s", "id": 120137, "pid": 76337, "tid": -914061504, "ts": 1716454223198530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223260590, "dur": 11, "args": { "External id": 120145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120145, "pid": 5, "tid": 7, "ts": 1716454223260590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198563, "dur": 8, "args": { "External id": 120145, "cbid": 211, "correlation": 120145 } }, { "ph": "s", "id": 120145, "pid": 76337, "tid": -914061504, "ts": 1716454223198563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223260603, "dur": 25, "args": { "External id": 120154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120154, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120154, "pid": 5, "tid": 7, "ts": 1716454223260603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198601, "dur": 10, "args": { "External id": 120154, "cbid": 211, "correlation": 120154 } }, { "ph": "s", "id": 120154, "pid": 76337, "tid": -914061504, "ts": 1716454223198601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223260629, "dur": 23, "args": { "External id": 120174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120174, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 120174, "pid": 5, "tid": 7, "ts": 1716454223260629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198672, "dur": 13, "args": { "External id": 120174, "cbid": 211, "correlation": 120174 } }, { "ph": "s", "id": 120174, "pid": 76337, "tid": -914061504, "ts": 1716454223198672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223260654, "dur": 5, "args": { "External id": 120186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120186, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120186, "pid": 5, "tid": 7, "ts": 1716454223260654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198694, "dur": 7, "args": { "External id": 120186, "cbid": 211, "correlation": 120186 } }, { "ph": "s", "id": 120186, "pid": 76337, "tid": -914061504, "ts": 1716454223198694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223260660, "dur": 25, "args": { "External id": 120189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120189, "pid": 5, "tid": 7, "ts": 1716454223260660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198712, "dur": 7, "args": { "External id": 120189, "cbid": 211, "correlation": 120189 } }, { "ph": "s", "id": 120189, "pid": 76337, "tid": -914061504, "ts": 1716454223198712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223260686, "dur": 17, "args": { "External id": 120198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120198, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120198, "pid": 5, "tid": 7, "ts": 1716454223260686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198752, "dur": 9, "args": { "External id": 120198, "cbid": 211, "correlation": 120198 } }, { "ph": "s", "id": 120198, "pid": 76337, "tid": -914061504, "ts": 1716454223198752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223198803, "dur": 0, "args": { "External id": 120208, "cbid": 317, "correlation": 120208 } }, { "ph": "f", "id": 120208, "pid": 76337, "tid": -914061504, "ts": 1716454223198803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223198803, "dur": 0, "args": { "External id": 120209, "cbid": 203, "correlation": 120209 } }, { "ph": "f", "id": 120209, "pid": 76337, "tid": -914061504, "ts": 1716454223198803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223198804, "dur": 0, "args": { "External id": 120210, "cbid": 205, "correlation": 120210 } }, { "ph": "f", "id": 120210, "pid": 76337, "tid": -914061504, "ts": 1716454223198804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223260704, "dur": 17, "args": { "External id": 120214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120214, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120214, "pid": 5, "tid": 7, "ts": 1716454223260704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198818, "dur": 11, "args": { "External id": 120214, "cbid": 211, "correlation": 120214 } }, { "ph": "s", "id": 120214, "pid": 76337, "tid": -914061504, "ts": 1716454223198818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223260723, "dur": 241, "args": { "External id": 120216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120216, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120216, "pid": 5, "tid": 7, "ts": 1716454223260723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198832, "dur": 6, "args": { "External id": 120216, "cbid": 211, "correlation": 120216 } }, { "ph": "s", "id": 120216, "pid": 76337, "tid": -914061504, "ts": 1716454223198832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223260967, "dur": 1, "args": { "External id": 120218, "device": 5, "context": 1, "stream": 7, "correlation": 120218, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 120218, "pid": 5, "tid": 7, "ts": 1716454223260967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223198844, "dur": 7, "args": { "External id": 120218, "cbid": 51, "correlation": 120218 } }, { "ph": "s", "id": 120218, "pid": 76337, "tid": -914061504, "ts": 1716454223198844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223260970, "dur": 808, "args": { "External id": 120219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120219, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120219, "pid": 5, "tid": 7, "ts": 1716454223260970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198853, "dur": 6, "args": { "External id": 120219, "cbid": 211, "correlation": 120219 } }, { "ph": "s", "id": 120219, "pid": 76337, "tid": -914061504, "ts": 1716454223198853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223261780, "dur": 14, "args": { "External id": 120221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120221, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120221, "pid": 5, "tid": 7, "ts": 1716454223261780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198863, "dur": 5, "args": { "External id": 120221, "cbid": 211, "correlation": 120221 } }, { "ph": "s", "id": 120221, "pid": 76337, "tid": -914061504, "ts": 1716454223198863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223261795, "dur": 14, "args": { "External id": 120227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120227, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120227, "pid": 5, "tid": 7, "ts": 1716454223261795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198892, "dur": 8, "args": { "External id": 120227, "cbid": 211, "correlation": 120227 } }, { "ph": "s", "id": 120227, "pid": 76337, "tid": -914061504, "ts": 1716454223198892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223261811, "dur": 5, "args": { "External id": 120235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120235, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 120235, "pid": 5, "tid": 7, "ts": 1716454223261811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223198935, "dur": 10, "args": { "External id": 120235, "cbid": 211, "correlation": 120235 } }, { "ph": "s", "id": 120235, "pid": 76337, "tid": -914061504, "ts": 1716454223198935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223199008, "dur": 1, "args": { "External id": 120251, "cbid": 251, "correlation": 120251 } }, { "ph": "f", "id": 120251, "pid": 76337, "tid": -914061504, "ts": 1716454223199008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223199013, "dur": 0, "args": { "External id": 120253, "cbid": 251, "correlation": 120253 } }, { "ph": "f", "id": 120253, "pid": 76337, "tid": -914061504, "ts": 1716454223199013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223261817, "dur": 13, "args": { "External id": 120254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120254, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120254, "pid": 5, "tid": 7, "ts": 1716454223261817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199014, "dur": 12, "args": { "External id": 120254, "cbid": 211, "correlation": 120254 } }, { "ph": "s", "id": 120254, "pid": 76337, "tid": -914061504, "ts": 1716454223199014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223261832, "dur": 5, "args": { "External id": 120256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120256, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120256, "pid": 5, "tid": 7, "ts": 1716454223261832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199028, "dur": 5, "args": { "External id": 120256, "cbid": 211, "correlation": 120256 } }, { "ph": "s", "id": 120256, "pid": 76337, "tid": -914061504, "ts": 1716454223199028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223261838, "dur": 19, "args": { "External id": 120266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120266, "pid": 5, "tid": 7, "ts": 1716454223261838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199086, "dur": 12, "args": { "External id": 120266, "cbid": 211, "correlation": 120266 } }, { "ph": "s", "id": 120266, "pid": 76337, "tid": -914061504, "ts": 1716454223199086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223261858, "dur": 18, "args": { "External id": 120286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120286, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 120286, "pid": 5, "tid": 7, "ts": 1716454223261858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199153, "dur": 10, "args": { "External id": 120286, "cbid": 211, "correlation": 120286 } }, { "ph": "s", "id": 120286, "pid": 76337, "tid": -914061504, "ts": 1716454223199153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223261877, "dur": 4, "args": { "External id": 120298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120298, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 120298, "pid": 5, "tid": 7, "ts": 1716454223261877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199173, "dur": 6, "args": { "External id": 120298, "cbid": 211, "correlation": 120298 } }, { "ph": "s", "id": 120298, "pid": 76337, "tid": -914061504, "ts": 1716454223199173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223261883, "dur": 16, "args": { "External id": 120301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120301, "pid": 5, "tid": 7, "ts": 1716454223261883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199192, "dur": 6, "args": { "External id": 120301, "cbid": 211, "correlation": 120301 } }, { "ph": "s", "id": 120301, "pid": 76337, "tid": -914061504, "ts": 1716454223199192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223261901, "dur": 11, "args": { "External id": 120310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120310, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120310, "pid": 5, "tid": 7, "ts": 1716454223261901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199231, "dur": 10, "args": { "External id": 120310, "cbid": 211, "correlation": 120310 } }, { "ph": "s", "id": 120310, "pid": 76337, "tid": -914061504, "ts": 1716454223199231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223199294, "dur": 0, "args": { "External id": 120320, "cbid": 317, "correlation": 120320 } }, { "ph": "f", "id": 120320, "pid": 76337, "tid": -914061504, "ts": 1716454223199294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223199294, "dur": 0, "args": { "External id": 120321, "cbid": 203, "correlation": 120321 } }, { "ph": "f", "id": 120321, "pid": 76337, "tid": -914061504, "ts": 1716454223199294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223199295, "dur": 0, "args": { "External id": 120322, "cbid": 205, "correlation": 120322 } }, { "ph": "f", "id": 120322, "pid": 76337, "tid": -914061504, "ts": 1716454223199295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223261913, "dur": 12, "args": { "External id": 120326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120326, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120326, "pid": 5, "tid": 7, "ts": 1716454223261913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199308, "dur": 12, "args": { "External id": 120326, "cbid": 211, "correlation": 120326 } }, { "ph": "s", "id": 120326, "pid": 76337, "tid": -914061504, "ts": 1716454223199308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223261926, "dur": 163, "args": { "External id": 120328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120328, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120328, "pid": 5, "tid": 7, "ts": 1716454223261926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199323, "dur": 5, "args": { "External id": 120328, "cbid": 211, "correlation": 120328 } }, { "ph": "s", "id": 120328, "pid": 76337, "tid": -914061504, "ts": 1716454223199323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223262091, "dur": 1, "args": { "External id": 120330, "device": 5, "context": 1, "stream": 7, "correlation": 120330, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 120330, "pid": 5, "tid": 7, "ts": 1716454223262091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223199333, "dur": 7, "args": { "External id": 120330, "cbid": 51, "correlation": 120330 } }, { "ph": "s", "id": 120330, "pid": 76337, "tid": -914061504, "ts": 1716454223199333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223262095, "dur": 649, "args": { "External id": 120331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120331, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120331, "pid": 5, "tid": 7, "ts": 1716454223262095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199342, "dur": 6, "args": { "External id": 120331, "cbid": 211, "correlation": 120331 } }, { "ph": "s", "id": 120331, "pid": 76337, "tid": -914061504, "ts": 1716454223199342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223262746, "dur": 12, "args": { "External id": 120333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120333, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120333, "pid": 5, "tid": 7, "ts": 1716454223262746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199352, "dur": 5, "args": { "External id": 120333, "cbid": 211, "correlation": 120333 } }, { "ph": "s", "id": 120333, "pid": 76337, "tid": -914061504, "ts": 1716454223199352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223262759, "dur": 15, "args": { "External id": 120339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120339, "pid": 5, "tid": 7, "ts": 1716454223262759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199381, "dur": 10, "args": { "External id": 120339, "cbid": 211, "correlation": 120339 } }, { "ph": "s", "id": 120339, "pid": 76337, "tid": -914061504, "ts": 1716454223199381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223199442, "dur": 0, "args": { "External id": 120349, "cbid": 317, "correlation": 120349 } }, { "ph": "f", "id": 120349, "pid": 76337, "tid": -914061504, "ts": 1716454223199442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223199442, "dur": 0, "args": { "External id": 120350, "cbid": 203, "correlation": 120350 } }, { "ph": "f", "id": 120350, "pid": 76337, "tid": -914061504, "ts": 1716454223199442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223199443, "dur": 0, "args": { "External id": 120351, "cbid": 205, "correlation": 120351 } }, { "ph": "f", "id": 120351, "pid": 76337, "tid": -914061504, "ts": 1716454223199443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223262775, "dur": 17, "args": { "External id": 120355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120355, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120355, "pid": 5, "tid": 7, "ts": 1716454223262775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199457, "dur": 12, "args": { "External id": 120355, "cbid": 211, "correlation": 120355 } }, { "ph": "s", "id": 120355, "pid": 76337, "tid": -914061504, "ts": 1716454223199457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223262794, "dur": 4, "args": { "External id": 120357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120357, "pid": 5, "tid": 7, "ts": 1716454223262794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199473, "dur": 6, "args": { "External id": 120357, "cbid": 211, "correlation": 120357 } }, { "ph": "s", "id": 120357, "pid": 76337, "tid": -914061504, "ts": 1716454223199473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223199482, "dur": 0, "args": { "External id": 120358, "cbid": 51, "correlation": 120358 } }, { "ph": "s", "id": 120358, "pid": 76337, "tid": -914061504, "ts": 1716454223199482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223262799, "dur": 132, "args": { "External id": 120359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120359, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120359, "pid": 5, "tid": 7, "ts": 1716454223262799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199483, "dur": 6, "args": { "External id": 120359, "cbid": 211, "correlation": 120359 } }, { "ph": "s", "id": 120359, "pid": 76337, "tid": -914061504, "ts": 1716454223199483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223262933, "dur": 15, "args": { "External id": 120364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120364, "pid": 5, "tid": 7, "ts": 1716454223262933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199509, "dur": 8, "args": { "External id": 120364, "cbid": 211, "correlation": 120364 } }, { "ph": "s", "id": 120364, "pid": 76337, "tid": -914061504, "ts": 1716454223199509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223262949, "dur": 12, "args": { "External id": 120372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120372, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120372, "pid": 5, "tid": 7, "ts": 1716454223262949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199538, "dur": 8, "args": { "External id": 120372, "cbid": 211, "correlation": 120372 } }, { "ph": "s", "id": 120372, "pid": 76337, "tid": -914061504, "ts": 1716454223199538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223262963, "dur": 11, "args": { "External id": 120380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120380, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120380, "pid": 5, "tid": 7, "ts": 1716454223262963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199566, "dur": 8, "args": { "External id": 120380, "cbid": 211, "correlation": 120380 } }, { "ph": "s", "id": 120380, "pid": 76337, "tid": -914061504, "ts": 1716454223199566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223262975, "dur": 19, "args": { "External id": 120400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120400, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 120400, "pid": 5, "tid": 7, "ts": 1716454223262975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199650, "dur": 12, "args": { "External id": 120400, "cbid": 211, "correlation": 120400 } }, { "ph": "s", "id": 120400, "pid": 76337, "tid": -914061504, "ts": 1716454223199650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223262995, "dur": 4, "args": { "External id": 120412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120412, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 120412, "pid": 5, "tid": 7, "ts": 1716454223262995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199671, "dur": 6, "args": { "External id": 120412, "cbid": 211, "correlation": 120412 } }, { "ph": "s", "id": 120412, "pid": 76337, "tid": -914061504, "ts": 1716454223199671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223263001, "dur": 17, "args": { "External id": 120415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120415, "pid": 5, "tid": 7, "ts": 1716454223263001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199689, "dur": 7, "args": { "External id": 120415, "cbid": 211, "correlation": 120415 } }, { "ph": "s", "id": 120415, "pid": 76337, "tid": -914061504, "ts": 1716454223199689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223199747, "dur": 0, "args": { "External id": 120426, "cbid": 317, "correlation": 120426 } }, { "ph": "f", "id": 120426, "pid": 76337, "tid": -914061504, "ts": 1716454223199747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223199747, "dur": 0, "args": { "External id": 120427, "cbid": 203, "correlation": 120427 } }, { "ph": "f", "id": 120427, "pid": 76337, "tid": -914061504, "ts": 1716454223199747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223199748, "dur": 0, "args": { "External id": 120428, "cbid": 205, "correlation": 120428 } }, { "ph": "f", "id": 120428, "pid": 76337, "tid": -914061504, "ts": 1716454223199748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223263019, "dur": 12, "args": { "External id": 120432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120432, "pid": 5, "tid": 7, "ts": 1716454223263019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199761, "dur": 11, "args": { "External id": 120432, "cbid": 211, "correlation": 120432 } }, { "ph": "s", "id": 120432, "pid": 76337, "tid": -914061504, "ts": 1716454223199761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223263032, "dur": 3, "args": { "External id": 120434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120434, "pid": 5, "tid": 7, "ts": 1716454223263032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199777, "dur": 6, "args": { "External id": 120434, "cbid": 211, "correlation": 120434 } }, { "ph": "s", "id": 120434, "pid": 76337, "tid": -914061504, "ts": 1716454223199777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223199785, "dur": 0, "args": { "External id": 120435, "cbid": 51, "correlation": 120435 } }, { "ph": "s", "id": 120435, "pid": 76337, "tid": -914061504, "ts": 1716454223199785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223263037, "dur": 91, "args": { "External id": 120436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120436, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120436, "pid": 5, "tid": 7, "ts": 1716454223263037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199786, "dur": 5, "args": { "External id": 120436, "cbid": 211, "correlation": 120436 } }, { "ph": "s", "id": 120436, "pid": 76337, "tid": -914061504, "ts": 1716454223199786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223263130, "dur": 15, "args": { "External id": 120441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120441, "pid": 5, "tid": 7, "ts": 1716454223263130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199812, "dur": 9, "args": { "External id": 120441, "cbid": 211, "correlation": 120441 } }, { "ph": "s", "id": 120441, "pid": 76337, "tid": -914061504, "ts": 1716454223199812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223263146, "dur": 85, "args": { "External id": 120450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120450, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120450, "pid": 5, "tid": 7, "ts": 1716454223263146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199892, "dur": 15, "args": { "External id": 120450, "cbid": 211, "correlation": 120450 } }, { "ph": "s", "id": 120450, "pid": 76337, "tid": -914061504, "ts": 1716454223199892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223263233, "dur": 31, "args": { "External id": 120472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120472, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120472, "pid": 5, "tid": 7, "ts": 1716454223263233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223199951, "dur": 10, "args": { "External id": 120472, "cbid": 211, "correlation": 120472 } }, { "ph": "s", "id": 120472, "pid": 76337, "tid": -914061504, "ts": 1716454223199951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200049, "dur": 1, "args": { "External id": 120483, "cbid": 251, "correlation": 120483 } }, { "ph": "f", "id": 120483, "pid": 76337, "tid": -914061504, "ts": 1716454223200049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223263265, "dur": 165, "args": { "External id": 120484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120484, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120484, "pid": 5, "tid": 7, "ts": 1716454223263265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200055, "dur": 14, "args": { "External id": 120484, "cbid": 211, "correlation": 120484 } }, { "ph": "s", "id": 120484, "pid": 76337, "tid": -914061504, "ts": 1716454223200055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200126, "dur": 1, "args": { "External id": 120495, "cbid": 251, "correlation": 120495 } }, { "ph": "f", "id": 120495, "pid": 76337, "tid": -914061504, "ts": 1716454223200126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223263432, "dur": 157, "args": { "External id": 120496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120496, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120496, "pid": 5, "tid": 7, "ts": 1716454223263432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200130, "dur": 11, "args": { "External id": 120496, "cbid": 211, "correlation": 120496 } }, { "ph": "s", "id": 120496, "pid": 76337, "tid": -914061504, "ts": 1716454223200130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200195, "dur": 1, "args": { "External id": 120507, "cbid": 251, "correlation": 120507 } }, { "ph": "f", "id": 120507, "pid": 76337, "tid": -914061504, "ts": 1716454223200195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223263589, "dur": 158, "args": { "External id": 120508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120508, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120508, "pid": 5, "tid": 7, "ts": 1716454223263589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200200, "dur": 11, "args": { "External id": 120508, "cbid": 211, "correlation": 120508 } }, { "ph": "s", "id": 120508, "pid": 76337, "tid": -914061504, "ts": 1716454223200200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223263749, "dur": 337, "args": { "External id": 120533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120533, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120533, "pid": 5, "tid": 7, "ts": 1716454223263749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200283, "dur": 12, "args": { "External id": 120533, "cbid": 211, "correlation": 120533 } }, { "ph": "s", "id": 120533, "pid": 76337, "tid": -914061504, "ts": 1716454223200283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200384, "dur": 1, "args": { "External id": 120551, "cbid": 251, "correlation": 120551 } }, { "ph": "f", "id": 120551, "pid": 76337, "tid": -914061504, "ts": 1716454223200384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223264088, "dur": 144, "args": { "External id": 120553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120553, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120553, "pid": 5, "tid": 7, "ts": 1716454223264088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200389, "dur": 14, "args": { "External id": 120553, "cbid": 211, "correlation": 120553 } }, { "ph": "s", "id": 120553, "pid": 76337, "tid": -914061504, "ts": 1716454223200389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223264233, "dur": 20, "args": { "External id": 120561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120561, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120561, "pid": 5, "tid": 7, "ts": 1716454223264233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200461, "dur": 12, "args": { "External id": 120561, "cbid": 211, "correlation": 120561 } }, { "ph": "s", "id": 120561, "pid": 76337, "tid": -914061504, "ts": 1716454223200461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223264254, "dur": 28, "args": { "External id": 120569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120569, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120569, "pid": 5, "tid": 7, "ts": 1716454223264254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200499, "dur": 9, "args": { "External id": 120569, "cbid": 211, "correlation": 120569 } }, { "ph": "s", "id": 120569, "pid": 76337, "tid": -914061504, "ts": 1716454223200499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223264283, "dur": 19, "args": { "External id": 120580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120580, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120580, "pid": 5, "tid": 7, "ts": 1716454223264283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200571, "dur": 12, "args": { "External id": 120580, "cbid": 211, "correlation": 120580 } }, { "ph": "s", "id": 120580, "pid": 76337, "tid": -914061504, "ts": 1716454223200571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223264303, "dur": 16, "args": { "External id": 120602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120602, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120602, "pid": 5, "tid": 7, "ts": 1716454223264303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200602, "dur": 8, "args": { "External id": 120602, "cbid": 211, "correlation": 120602 } }, { "ph": "s", "id": 120602, "pid": 76337, "tid": -914061504, "ts": 1716454223200602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200686, "dur": 1, "args": { "External id": 120613, "cbid": 251, "correlation": 120613 } }, { "ph": "f", "id": 120613, "pid": 76337, "tid": -914061504, "ts": 1716454223200686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223264321, "dur": 90, "args": { "External id": 120614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120614, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120614, "pid": 5, "tid": 7, "ts": 1716454223264321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200692, "dur": 14, "args": { "External id": 120614, "cbid": 211, "correlation": 120614 } }, { "ph": "s", "id": 120614, "pid": 76337, "tid": -914061504, "ts": 1716454223200692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200761, "dur": 1, "args": { "External id": 120625, "cbid": 251, "correlation": 120625 } }, { "ph": "f", "id": 120625, "pid": 76337, "tid": -914061504, "ts": 1716454223200761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200765, "dur": 0, "args": { "External id": 120626, "cbid": 251, "correlation": 120626 } }, { "ph": "f", "id": 120626, "pid": 76337, "tid": -914061504, "ts": 1716454223200765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223264412, "dur": 12, "args": { "External id": 120627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120627, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120627, "pid": 5, "tid": 7, "ts": 1716454223264412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200766, "dur": 12, "args": { "External id": 120627, "cbid": 211, "correlation": 120627 } }, { "ph": "s", "id": 120627, "pid": 76337, "tid": -914061504, "ts": 1716454223200766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223264426, "dur": 6, "args": { "External id": 120629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120629, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120629, "pid": 5, "tid": 7, "ts": 1716454223264426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200780, "dur": 6, "args": { "External id": 120629, "cbid": 211, "correlation": 120629 } }, { "ph": "s", "id": 120629, "pid": 76337, "tid": -914061504, "ts": 1716454223200780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200837, "dur": 1, "args": { "External id": 120640, "cbid": 251, "correlation": 120640 } }, { "ph": "f", "id": 120640, "pid": 76337, "tid": -914061504, "ts": 1716454223200837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223200840, "dur": 0, "args": { "External id": 120641, "cbid": 251, "correlation": 120641 } }, { "ph": "f", "id": 120641, "pid": 76337, "tid": -914061504, "ts": 1716454223200840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223264433, "dur": 8, "args": { "External id": 120642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120642, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120642, "pid": 5, "tid": 7, "ts": 1716454223264433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200841, "dur": 11, "args": { "External id": 120642, "cbid": 211, "correlation": 120642 } }, { "ph": "s", "id": 120642, "pid": 76337, "tid": -914061504, "ts": 1716454223200841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223264443, "dur": 4, "args": { "External id": 120644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120644, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120644, "pid": 5, "tid": 7, "ts": 1716454223264443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200855, "dur": 6, "args": { "External id": 120644, "cbid": 211, "correlation": 120644 } }, { "ph": "s", "id": 120644, "pid": 76337, "tid": -914061504, "ts": 1716454223200855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223264447, "dur": 55, "args": { "External id": 120669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120669, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120669, "pid": 5, "tid": 7, "ts": 1716454223264447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223200931, "dur": 12, "args": { "External id": 120669, "cbid": 211, "correlation": 120669 } }, { "ph": "s", "id": 120669, "pid": 76337, "tid": -914061504, "ts": 1716454223200931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223201038, "dur": 1, "args": { "External id": 120687, "cbid": 251, "correlation": 120687 } }, { "ph": "f", "id": 120687, "pid": 76337, "tid": -914061504, "ts": 1716454223201038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223264504, "dur": 93, "args": { "External id": 120689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120689, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120689, "pid": 5, "tid": 7, "ts": 1716454223264504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201044, "dur": 15, "args": { "External id": 120689, "cbid": 211, "correlation": 120689 } }, { "ph": "s", "id": 120689, "pid": 76337, "tid": -914061504, "ts": 1716454223201044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223264598, "dur": 10, "args": { "External id": 120697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120697, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120697, "pid": 5, "tid": 7, "ts": 1716454223264598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201116, "dur": 12, "args": { "External id": 120697, "cbid": 211, "correlation": 120697 } }, { "ph": "s", "id": 120697, "pid": 76337, "tid": -914061504, "ts": 1716454223201116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223264609, "dur": 20, "args": { "External id": 120705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120705, "pid": 5, "tid": 7, "ts": 1716454223264609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201156, "dur": 10, "args": { "External id": 120705, "cbid": 211, "correlation": 120705 } }, { "ph": "s", "id": 120705, "pid": 76337, "tid": -914061504, "ts": 1716454223201156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223264631, "dur": 17, "args": { "External id": 120727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120727, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120727, "pid": 5, "tid": 7, "ts": 1716454223264631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201207, "dur": 10, "args": { "External id": 120727, "cbid": 211, "correlation": 120727 } }, { "ph": "s", "id": 120727, "pid": 76337, "tid": -914061504, "ts": 1716454223201207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223201294, "dur": 1, "args": { "External id": 120743, "cbid": 251, "correlation": 120743 } }, { "ph": "f", "id": 120743, "pid": 76337, "tid": -914061504, "ts": 1716454223201294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223201298, "dur": 0, "args": { "External id": 120745, "cbid": 251, "correlation": 120745 } }, { "ph": "f", "id": 120745, "pid": 76337, "tid": -914061504, "ts": 1716454223201298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223264649, "dur": 497, "args": { "External id": 120746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120746, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120746, "pid": 5, "tid": 7, "ts": 1716454223264649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201300, "dur": 13, "args": { "External id": 120746, "cbid": 211, "correlation": 120746 } }, { "ph": "s", "id": 120746, "pid": 76337, "tid": -914061504, "ts": 1716454223201300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223265148, "dur": 66, "args": { "External id": 120754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120754, "pid": 5, "tid": 7, "ts": 1716454223265148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201367, "dur": 12, "args": { "External id": 120754, "cbid": 211, "correlation": 120754 } }, { "ph": "s", "id": 120754, "pid": 76337, "tid": -914061504, "ts": 1716454223201367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223265215, "dur": 67, "args": { "External id": 120762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120762, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120762, "pid": 5, "tid": 7, "ts": 1716454223265215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201397, "dur": 9, "args": { "External id": 120762, "cbid": 211, "correlation": 120762 } }, { "ph": "s", "id": 120762, "pid": 76337, "tid": -914061504, "ts": 1716454223201397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223201476, "dur": 1, "args": { "External id": 120778, "cbid": 251, "correlation": 120778 } }, { "ph": "f", "id": 120778, "pid": 76337, "tid": -914061504, "ts": 1716454223201476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223265284, "dur": 1, "args": { "External id": 120780, "device": 5, "context": 1, "stream": 7, "correlation": 120780, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 120780, "pid": 5, "tid": 7, "ts": 1716454223265284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223201481, "dur": 9, "args": { "External id": 120780, "cbid": 51, "correlation": 120780 } }, { "ph": "s", "id": 120780, "pid": 76337, "tid": -914061504, "ts": 1716454223201481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223265287, "dur": 271, "args": { "External id": 120781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120781, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120781, "pid": 5, "tid": 7, "ts": 1716454223265287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201492, "dur": 11, "args": { "External id": 120781, "cbid": 211, "correlation": 120781 } }, { "ph": "s", "id": 120781, "pid": 76337, "tid": -914061504, "ts": 1716454223201492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223265560, "dur": 13, "args": { "External id": 120789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120789, "pid": 5, "tid": 7, "ts": 1716454223265560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201534, "dur": 10, "args": { "External id": 120789, "cbid": 211, "correlation": 120789 } }, { "ph": "s", "id": 120789, "pid": 76337, "tid": -914061504, "ts": 1716454223201534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223265575, "dur": 38, "args": { "External id": 120800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120800, "pid": 5, "tid": 7, "ts": 1716454223265575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201601, "dur": 13, "args": { "External id": 120800, "cbid": 211, "correlation": 120800 } }, { "ph": "s", "id": 120800, "pid": 76337, "tid": -914061504, "ts": 1716454223201601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223201665, "dur": 0, "args": { "External id": 120812, "cbid": 317, "correlation": 120812 } }, { "ph": "f", "id": 120812, "pid": 76337, "tid": -914061504, "ts": 1716454223201665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223201666, "dur": 0, "args": { "External id": 120813, "cbid": 203, "correlation": 120813 } }, { "ph": "f", "id": 120813, "pid": 76337, "tid": -914061504, "ts": 1716454223201666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223201667, "dur": 0, "args": { "External id": 120814, "cbid": 205, "correlation": 120814 } }, { "ph": "f", "id": 120814, "pid": 76337, "tid": -914061504, "ts": 1716454223201667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223265614, "dur": 13, "args": { "External id": 120818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120818, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120818, "pid": 5, "tid": 7, "ts": 1716454223265614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201683, "dur": 12, "args": { "External id": 120818, "cbid": 211, "correlation": 120818 } }, { "ph": "s", "id": 120818, "pid": 76337, "tid": -914061504, "ts": 1716454223201683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223265628, "dur": 4, "args": { "External id": 120820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 120820, "pid": 5, "tid": 7, "ts": 1716454223265628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201699, "dur": 6, "args": { "External id": 120820, "cbid": 211, "correlation": 120820 } }, { "ph": "s", "id": 120820, "pid": 76337, "tid": -914061504, "ts": 1716454223201699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223201707, "dur": 0, "args": { "External id": 120821, "cbid": 51, "correlation": 120821 } }, { "ph": "s", "id": 120821, "pid": 76337, "tid": -914061504, "ts": 1716454223201707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223265633, "dur": 98, "args": { "External id": 120822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120822, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120822, "pid": 5, "tid": 7, "ts": 1716454223265633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201708, "dur": 6, "args": { "External id": 120822, "cbid": 211, "correlation": 120822 } }, { "ph": "s", "id": 120822, "pid": 76337, "tid": -914061504, "ts": 1716454223201708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223265733, "dur": 16, "args": { "External id": 120827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120827, "pid": 5, "tid": 7, "ts": 1716454223265733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201735, "dur": 8, "args": { "External id": 120827, "cbid": 211, "correlation": 120827 } }, { "ph": "s", "id": 120827, "pid": 76337, "tid": -914061504, "ts": 1716454223201735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223265750, "dur": 13, "args": { "External id": 120835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120835, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120835, "pid": 5, "tid": 7, "ts": 1716454223265750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201767, "dur": 8, "args": { "External id": 120835, "cbid": 211, "correlation": 120835 } }, { "ph": "s", "id": 120835, "pid": 76337, "tid": -914061504, "ts": 1716454223201767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223265764, "dur": 57, "args": { "External id": 120846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120846, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120846, "pid": 5, "tid": 7, "ts": 1716454223265764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201828, "dur": 12, "args": { "External id": 120846, "cbid": 211, "correlation": 120846 } }, { "ph": "s", "id": 120846, "pid": 76337, "tid": -914061504, "ts": 1716454223201828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223201882, "dur": 0, "args": { "External id": 120856, "cbid": 317, "correlation": 120856 } }, { "ph": "f", "id": 120856, "pid": 76337, "tid": -914061504, "ts": 1716454223201882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223201883, "dur": 0, "args": { "External id": 120857, "cbid": 203, "correlation": 120857 } }, { "ph": "f", "id": 120857, "pid": 76337, "tid": -914061504, "ts": 1716454223201883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223201883, "dur": 0, "args": { "External id": 120858, "cbid": 205, "correlation": 120858 } }, { "ph": "f", "id": 120858, "pid": 76337, "tid": -914061504, "ts": 1716454223201883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223265822, "dur": 39, "args": { "External id": 120862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120862, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120862, "pid": 5, "tid": 7, "ts": 1716454223265822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201899, "dur": 12, "args": { "External id": 120862, "cbid": 211, "correlation": 120862 } }, { "ph": "s", "id": 120862, "pid": 76337, "tid": -914061504, "ts": 1716454223201899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223265863, "dur": 163, "args": { "External id": 120864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120864, "pid": 5, "tid": 7, "ts": 1716454223265863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201914, "dur": 5, "args": { "External id": 120864, "cbid": 211, "correlation": 120864 } }, { "ph": "s", "id": 120864, "pid": 76337, "tid": -914061504, "ts": 1716454223201914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223266027, "dur": 1960, "args": { "External id": 120866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120866, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120866, "pid": 5, "tid": 7, "ts": 1716454223266027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201925, "dur": 8, "args": { "External id": 120866, "cbid": 211, "correlation": 120866 } }, { "ph": "s", "id": 120866, "pid": 76337, "tid": -914061504, "ts": 1716454223201925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223267988, "dur": 38, "args": { "External id": 120868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120868, "pid": 5, "tid": 7, "ts": 1716454223267988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201937, "dur": 5, "args": { "External id": 120868, "cbid": 211, "correlation": 120868 } }, { "ph": "s", "id": 120868, "pid": 76337, "tid": -914061504, "ts": 1716454223201937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223268028, "dur": 60, "args": { "External id": 120874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120874, "pid": 5, "tid": 7, "ts": 1716454223268028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223201965, "dur": 16, "args": { "External id": 120874, "cbid": 211, "correlation": 120874 } }, { "ph": "s", "id": 120874, "pid": 76337, "tid": -914061504, "ts": 1716454223201965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223268089, "dur": 89, "args": { "External id": 120883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120883, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120883, "pid": 5, "tid": 7, "ts": 1716454223268089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202062, "dur": 14, "args": { "External id": 120883, "cbid": 211, "correlation": 120883 } }, { "ph": "s", "id": 120883, "pid": 76337, "tid": -914061504, "ts": 1716454223202062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223268179, "dur": 72, "args": { "External id": 120903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120903, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 120903, "pid": 5, "tid": 7, "ts": 1716454223268179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202131, "dur": 11, "args": { "External id": 120903, "cbid": 211, "correlation": 120903 } }, { "ph": "s", "id": 120903, "pid": 76337, "tid": -914061504, "ts": 1716454223202131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223268253, "dur": 4, "args": { "External id": 120915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120915, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 120915, "pid": 5, "tid": 7, "ts": 1716454223268253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202152, "dur": 6, "args": { "External id": 120915, "cbid": 211, "correlation": 120915 } }, { "ph": "s", "id": 120915, "pid": 76337, "tid": -914061504, "ts": 1716454223202152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223268259, "dur": 82, "args": { "External id": 120918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120918, "pid": 5, "tid": 7, "ts": 1716454223268259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202171, "dur": 6, "args": { "External id": 120918, "cbid": 211, "correlation": 120918 } }, { "ph": "s", "id": 120918, "pid": 76337, "tid": -914061504, "ts": 1716454223202171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223268342, "dur": 53, "args": { "External id": 120927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120927, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120927, "pid": 5, "tid": 7, "ts": 1716454223268342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202211, "dur": 10, "args": { "External id": 120927, "cbid": 211, "correlation": 120927 } }, { "ph": "s", "id": 120927, "pid": 76337, "tid": -914061504, "ts": 1716454223202211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223202263, "dur": 0, "args": { "External id": 120937, "cbid": 317, "correlation": 120937 } }, { "ph": "f", "id": 120937, "pid": 76337, "tid": -914061504, "ts": 1716454223202263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223202264, "dur": 0, "args": { "External id": 120938, "cbid": 203, "correlation": 120938 } }, { "ph": "f", "id": 120938, "pid": 76337, "tid": -914061504, "ts": 1716454223202264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223202265, "dur": 0, "args": { "External id": 120939, "cbid": 205, "correlation": 120939 } }, { "ph": "f", "id": 120939, "pid": 76337, "tid": -914061504, "ts": 1716454223202265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223268396, "dur": 57, "args": { "External id": 120943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120943, "pid": 5, "tid": 7, "ts": 1716454223268396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202281, "dur": 11, "args": { "External id": 120943, "cbid": 211, "correlation": 120943 } }, { "ph": "s", "id": 120943, "pid": 76337, "tid": -914061504, "ts": 1716454223202281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223268454, "dur": 122, "args": { "External id": 120945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120945, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120945, "pid": 5, "tid": 7, "ts": 1716454223268454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202295, "dur": 5, "args": { "External id": 120945, "cbid": 211, "correlation": 120945 } }, { "ph": "s", "id": 120945, "pid": 76337, "tid": -914061504, "ts": 1716454223202295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223268577, "dur": 1893, "args": { "External id": 120947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120947, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 120947, "pid": 5, "tid": 7, "ts": 1716454223268577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202306, "dur": 6, "args": { "External id": 120947, "cbid": 211, "correlation": 120947 } }, { "ph": "s", "id": 120947, "pid": 76337, "tid": -914061504, "ts": 1716454223202306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223270471, "dur": 19, "args": { "External id": 120949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120949, "pid": 5, "tid": 7, "ts": 1716454223270471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202316, "dur": 5, "args": { "External id": 120949, "cbid": 211, "correlation": 120949 } }, { "ph": "s", "id": 120949, "pid": 76337, "tid": -914061504, "ts": 1716454223202316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223270492, "dur": 33, "args": { "External id": 120955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120955, "pid": 5, "tid": 7, "ts": 1716454223270492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202344, "dur": 8, "args": { "External id": 120955, "cbid": 211, "correlation": 120955 } }, { "ph": "s", "id": 120955, "pid": 76337, "tid": -914061504, "ts": 1716454223202344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223270526, "dur": 4, "args": { "External id": 120963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120963, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 120963, "pid": 5, "tid": 7, "ts": 1716454223270526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202388, "dur": 9, "args": { "External id": 120963, "cbid": 211, "correlation": 120963 } }, { "ph": "s", "id": 120963, "pid": 76337, "tid": -914061504, "ts": 1716454223202388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223202452, "dur": 1, "args": { "External id": 120979, "cbid": 251, "correlation": 120979 } }, { "ph": "f", "id": 120979, "pid": 76337, "tid": -914061504, "ts": 1716454223202452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223202457, "dur": 0, "args": { "External id": 120981, "cbid": 251, "correlation": 120981 } }, { "ph": "f", "id": 120981, "pid": 76337, "tid": -914061504, "ts": 1716454223202457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223270532, "dur": 12, "args": { "External id": 120982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120982, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 120982, "pid": 5, "tid": 7, "ts": 1716454223270532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202459, "dur": 12, "args": { "External id": 120982, "cbid": 211, "correlation": 120982 } }, { "ph": "s", "id": 120982, "pid": 76337, "tid": -914061504, "ts": 1716454223202459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223270545, "dur": 5, "args": { "External id": 120984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120984, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 120984, "pid": 5, "tid": 7, "ts": 1716454223270545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202473, "dur": 5, "args": { "External id": 120984, "cbid": 211, "correlation": 120984 } }, { "ph": "s", "id": 120984, "pid": 76337, "tid": -914061504, "ts": 1716454223202473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223270551, "dur": 29, "args": { "External id": 120994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 120994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 120994, "pid": 5, "tid": 7, "ts": 1716454223270551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202530, "dur": 12, "args": { "External id": 120994, "cbid": 211, "correlation": 120994 } }, { "ph": "s", "id": 120994, "pid": 76337, "tid": -914061504, "ts": 1716454223202530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223270582, "dur": 31, "args": { "External id": 121014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121014, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 121014, "pid": 5, "tid": 7, "ts": 1716454223270582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202595, "dur": 11, "args": { "External id": 121014, "cbid": 211, "correlation": 121014 } }, { "ph": "s", "id": 121014, "pid": 76337, "tid": -914061504, "ts": 1716454223202595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223270614, "dur": 4, "args": { "External id": 121026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121026, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 121026, "pid": 5, "tid": 7, "ts": 1716454223270614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202615, "dur": 7, "args": { "External id": 121026, "cbid": 211, "correlation": 121026 } }, { "ph": "s", "id": 121026, "pid": 76337, "tid": -914061504, "ts": 1716454223202615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223270619, "dur": 30, "args": { "External id": 121029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121029, "pid": 5, "tid": 7, "ts": 1716454223270619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202633, "dur": 6, "args": { "External id": 121029, "cbid": 211, "correlation": 121029 } }, { "ph": "s", "id": 121029, "pid": 76337, "tid": -914061504, "ts": 1716454223202633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223270651, "dur": 21, "args": { "External id": 121038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121038, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121038, "pid": 5, "tid": 7, "ts": 1716454223270651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202673, "dur": 10, "args": { "External id": 121038, "cbid": 211, "correlation": 121038 } }, { "ph": "s", "id": 121038, "pid": 76337, "tid": -914061504, "ts": 1716454223202673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223202736, "dur": 0, "args": { "External id": 121048, "cbid": 317, "correlation": 121048 } }, { "ph": "f", "id": 121048, "pid": 76337, "tid": -914061504, "ts": 1716454223202736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223202737, "dur": 0, "args": { "External id": 121049, "cbid": 203, "correlation": 121049 } }, { "ph": "f", "id": 121049, "pid": 76337, "tid": -914061504, "ts": 1716454223202737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223202738, "dur": 0, "args": { "External id": 121050, "cbid": 205, "correlation": 121050 } }, { "ph": "f", "id": 121050, "pid": 76337, "tid": -914061504, "ts": 1716454223202738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223270673, "dur": 23, "args": { "External id": 121054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121054, "pid": 5, "tid": 7, "ts": 1716454223270673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202753, "dur": 12, "args": { "External id": 121054, "cbid": 211, "correlation": 121054 } }, { "ph": "s", "id": 121054, "pid": 76337, "tid": -914061504, "ts": 1716454223202753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223270697, "dur": 44, "args": { "External id": 121056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121056, "pid": 5, "tid": 7, "ts": 1716454223270697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202768, "dur": 5, "args": { "External id": 121056, "cbid": 211, "correlation": 121056 } }, { "ph": "s", "id": 121056, "pid": 76337, "tid": -914061504, "ts": 1716454223202768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223270742, "dur": 647, "args": { "External id": 121058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121058, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121058, "pid": 5, "tid": 7, "ts": 1716454223270742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202781, "dur": 6, "args": { "External id": 121058, "cbid": 211, "correlation": 121058 } }, { "ph": "s", "id": 121058, "pid": 76337, "tid": -914061504, "ts": 1716454223202781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223271390, "dur": 24, "args": { "External id": 121060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121060, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121060, "pid": 5, "tid": 7, "ts": 1716454223271390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202790, "dur": 5, "args": { "External id": 121060, "cbid": 211, "correlation": 121060 } }, { "ph": "s", "id": 121060, "pid": 76337, "tid": -914061504, "ts": 1716454223202790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223271415, "dur": 33, "args": { "External id": 121066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121066, "pid": 5, "tid": 7, "ts": 1716454223271415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202818, "dur": 9, "args": { "External id": 121066, "cbid": 211, "correlation": 121066 } }, { "ph": "s", "id": 121066, "pid": 76337, "tid": -914061504, "ts": 1716454223202818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223202876, "dur": 0, "args": { "External id": 121076, "cbid": 317, "correlation": 121076 } }, { "ph": "f", "id": 121076, "pid": 76337, "tid": -914061504, "ts": 1716454223202876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223202877, "dur": 0, "args": { "External id": 121077, "cbid": 203, "correlation": 121077 } }, { "ph": "f", "id": 121077, "pid": 76337, "tid": -914061504, "ts": 1716454223202877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223202877, "dur": 0, "args": { "External id": 121078, "cbid": 205, "correlation": 121078 } }, { "ph": "f", "id": 121078, "pid": 76337, "tid": -914061504, "ts": 1716454223202877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223271449, "dur": 57, "args": { "External id": 121082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121082, "pid": 5, "tid": 7, "ts": 1716454223271449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202890, "dur": 12, "args": { "External id": 121082, "cbid": 211, "correlation": 121082 } }, { "ph": "s", "id": 121082, "pid": 76337, "tid": -914061504, "ts": 1716454223202890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223271507, "dur": 270, "args": { "External id": 121084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121084, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121084, "pid": 5, "tid": 7, "ts": 1716454223271507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202908, "dur": 9, "args": { "External id": 121084, "cbid": 211, "correlation": 121084 } }, { "ph": "s", "id": 121084, "pid": 76337, "tid": -914061504, "ts": 1716454223202908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223271778, "dur": 22, "args": { "External id": 121086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121086, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121086, "pid": 5, "tid": 7, "ts": 1716454223271778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202921, "dur": 5, "args": { "External id": 121086, "cbid": 211, "correlation": 121086 } }, { "ph": "s", "id": 121086, "pid": 76337, "tid": -914061504, "ts": 1716454223202921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223271801, "dur": 32, "args": { "External id": 121092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121092, "pid": 5, "tid": 7, "ts": 1716454223271801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202946, "dur": 8, "args": { "External id": 121092, "cbid": 211, "correlation": 121092 } }, { "ph": "s", "id": 121092, "pid": 76337, "tid": -914061504, "ts": 1716454223202946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223271835, "dur": 27, "args": { "External id": 121100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121100, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121100, "pid": 5, "tid": 7, "ts": 1716454223271835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223202983, "dur": 8, "args": { "External id": 121100, "cbid": 211, "correlation": 121100 } }, { "ph": "s", "id": 121100, "pid": 76337, "tid": -914061504, "ts": 1716454223202983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223271863, "dur": 20, "args": { "External id": 121108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121108, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121108, "pid": 5, "tid": 7, "ts": 1716454223271863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203013, "dur": 9, "args": { "External id": 121108, "cbid": 211, "correlation": 121108 } }, { "ph": "s", "id": 121108, "pid": 76337, "tid": -914061504, "ts": 1716454223203013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223271885, "dur": 30, "args": { "External id": 121128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121128, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 121128, "pid": 5, "tid": 7, "ts": 1716454223271885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203095, "dur": 12, "args": { "External id": 121128, "cbid": 211, "correlation": 121128 } }, { "ph": "s", "id": 121128, "pid": 76337, "tid": -914061504, "ts": 1716454223203095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223271916, "dur": 4, "args": { "External id": 121140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121140, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 121140, "pid": 5, "tid": 7, "ts": 1716454223271916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203117, "dur": 7, "args": { "External id": 121140, "cbid": 211, "correlation": 121140 } }, { "ph": "s", "id": 121140, "pid": 76337, "tid": -914061504, "ts": 1716454223203117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223271922, "dur": 31, "args": { "External id": 121143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121143, "pid": 5, "tid": 7, "ts": 1716454223271922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203135, "dur": 6, "args": { "External id": 121143, "cbid": 211, "correlation": 121143 } }, { "ph": "s", "id": 121143, "pid": 76337, "tid": -914061504, "ts": 1716454223203135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223203192, "dur": 0, "args": { "External id": 121154, "cbid": 317, "correlation": 121154 } }, { "ph": "f", "id": 121154, "pid": 76337, "tid": -914061504, "ts": 1716454223203192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223203193, "dur": 0, "args": { "External id": 121155, "cbid": 203, "correlation": 121155 } }, { "ph": "f", "id": 121155, "pid": 76337, "tid": -914061504, "ts": 1716454223203193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223203194, "dur": 0, "args": { "External id": 121156, "cbid": 205, "correlation": 121156 } }, { "ph": "f", "id": 121156, "pid": 76337, "tid": -914061504, "ts": 1716454223203194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223271954, "dur": 22, "args": { "External id": 121160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121160, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121160, "pid": 5, "tid": 7, "ts": 1716454223271954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203208, "dur": 12, "args": { "External id": 121160, "cbid": 211, "correlation": 121160 } }, { "ph": "s", "id": 121160, "pid": 76337, "tid": -914061504, "ts": 1716454223203208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223271978, "dur": 105, "args": { "External id": 121162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121162, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121162, "pid": 5, "tid": 7, "ts": 1716454223271978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203226, "dur": 6, "args": { "External id": 121162, "cbid": 211, "correlation": 121162 } }, { "ph": "s", "id": 121162, "pid": 76337, "tid": -914061504, "ts": 1716454223203226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223272084, "dur": 23, "args": { "External id": 121164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121164, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121164, "pid": 5, "tid": 7, "ts": 1716454223272084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203236, "dur": 5, "args": { "External id": 121164, "cbid": 211, "correlation": 121164 } }, { "ph": "s", "id": 121164, "pid": 76337, "tid": -914061504, "ts": 1716454223203236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223272107, "dur": 32, "args": { "External id": 121170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121170, "pid": 5, "tid": 7, "ts": 1716454223272107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203263, "dur": 9, "args": { "External id": 121170, "cbid": 211, "correlation": 121170 } }, { "ph": "s", "id": 121170, "pid": 76337, "tid": -914061504, "ts": 1716454223203263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223272141, "dur": 190, "args": { "External id": 121179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121179, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121179, "pid": 5, "tid": 7, "ts": 1716454223272141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203344, "dur": 14, "args": { "External id": 121179, "cbid": 211, "correlation": 121179 } }, { "ph": "s", "id": 121179, "pid": 76337, "tid": -914061504, "ts": 1716454223203344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223272333, "dur": 65, "args": { "External id": 121201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121201, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121201, "pid": 5, "tid": 7, "ts": 1716454223272333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203402, "dur": 10, "args": { "External id": 121201, "cbid": 211, "correlation": 121201 } }, { "ph": "s", "id": 121201, "pid": 76337, "tid": -914061504, "ts": 1716454223203402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223203491, "dur": 2, "args": { "External id": 121212, "cbid": 251, "correlation": 121212 } }, { "ph": "f", "id": 121212, "pid": 76337, "tid": -914061504, "ts": 1716454223203491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223272399, "dur": 154, "args": { "External id": 121213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121213, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121213, "pid": 5, "tid": 7, "ts": 1716454223272399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203496, "dur": 13, "args": { "External id": 121213, "cbid": 211, "correlation": 121213 } }, { "ph": "s", "id": 121213, "pid": 76337, "tid": -914061504, "ts": 1716454223203496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223203566, "dur": 1, "args": { "External id": 121224, "cbid": 251, "correlation": 121224 } }, { "ph": "f", "id": 121224, "pid": 76337, "tid": -914061504, "ts": 1716454223203566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223272555, "dur": 148, "args": { "External id": 121225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121225, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121225, "pid": 5, "tid": 7, "ts": 1716454223272555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203570, "dur": 12, "args": { "External id": 121225, "cbid": 211, "correlation": 121225 } }, { "ph": "s", "id": 121225, "pid": 76337, "tid": -914061504, "ts": 1716454223203570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223203635, "dur": 1, "args": { "External id": 121236, "cbid": 251, "correlation": 121236 } }, { "ph": "f", "id": 121236, "pid": 76337, "tid": -914061504, "ts": 1716454223203635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223272704, "dur": 147, "args": { "External id": 121237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121237, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121237, "pid": 5, "tid": 7, "ts": 1716454223272704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203639, "dur": 11, "args": { "External id": 121237, "cbid": 211, "correlation": 121237 } }, { "ph": "s", "id": 121237, "pid": 76337, "tid": -914061504, "ts": 1716454223203639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223272852, "dur": 1949, "args": { "External id": 121258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121258, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 121258, "pid": 5, "tid": 7, "ts": 1716454223272852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203721, "dur": 13, "args": { "External id": 121258, "cbid": 211, "correlation": 121258 } }, { "ph": "s", "id": 121258, "pid": 76337, "tid": -914061504, "ts": 1716454223203721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223203819, "dur": 2, "args": { "External id": 121276, "cbid": 251, "correlation": 121276 } }, { "ph": "f", "id": 121276, "pid": 76337, "tid": -914061504, "ts": 1716454223203819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223274802, "dur": 149, "args": { "External id": 121278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121278, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 121278, "pid": 5, "tid": 7, "ts": 1716454223274802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203826, "dur": 14, "args": { "External id": 121278, "cbid": 211, "correlation": 121278 } }, { "ph": "s", "id": 121278, "pid": 76337, "tid": -914061504, "ts": 1716454223203826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223274953, "dur": 36, "args": { "External id": 121286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121286, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121286, "pid": 5, "tid": 7, "ts": 1716454223274953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203897, "dur": 12, "args": { "External id": 121286, "cbid": 211, "correlation": 121286 } }, { "ph": "s", "id": 121286, "pid": 76337, "tid": -914061504, "ts": 1716454223203897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223274990, "dur": 50, "args": { "External id": 121294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121294, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121294, "pid": 5, "tid": 7, "ts": 1716454223274990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223203937, "dur": 9, "args": { "External id": 121294, "cbid": 211, "correlation": 121294 } }, { "ph": "s", "id": 121294, "pid": 76337, "tid": -914061504, "ts": 1716454223203937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223275041, "dur": 30, "args": { "External id": 121305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121305, "pid": 5, "tid": 7, "ts": 1716454223275041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204017, "dur": 14, "args": { "External id": 121305, "cbid": 211, "correlation": 121305 } }, { "ph": "s", "id": 121305, "pid": 76337, "tid": -914061504, "ts": 1716454223204017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223275073, "dur": 35, "args": { "External id": 121327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121327, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121327, "pid": 5, "tid": 7, "ts": 1716454223275073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204049, "dur": 8, "args": { "External id": 121327, "cbid": 211, "correlation": 121327 } }, { "ph": "s", "id": 121327, "pid": 76337, "tid": -914061504, "ts": 1716454223204049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204134, "dur": 1, "args": { "External id": 121338, "cbid": 251, "correlation": 121338 } }, { "ph": "f", "id": 121338, "pid": 76337, "tid": -914061504, "ts": 1716454223204134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223275109, "dur": 91, "args": { "External id": 121339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121339, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121339, "pid": 5, "tid": 7, "ts": 1716454223275109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204139, "dur": 13, "args": { "External id": 121339, "cbid": 211, "correlation": 121339 } }, { "ph": "s", "id": 121339, "pid": 76337, "tid": -914061504, "ts": 1716454223204139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204208, "dur": 1, "args": { "External id": 121350, "cbid": 251, "correlation": 121350 } }, { "ph": "f", "id": 121350, "pid": 76337, "tid": -914061504, "ts": 1716454223204208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204212, "dur": 0, "args": { "External id": 121351, "cbid": 251, "correlation": 121351 } }, { "ph": "f", "id": 121351, "pid": 76337, "tid": -914061504, "ts": 1716454223204212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223275201, "dur": 11, "args": { "External id": 121352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121352, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 121352, "pid": 5, "tid": 7, "ts": 1716454223275201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204214, "dur": 12, "args": { "External id": 121352, "cbid": 211, "correlation": 121352 } }, { "ph": "s", "id": 121352, "pid": 76337, "tid": -914061504, "ts": 1716454223204214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223275214, "dur": 5, "args": { "External id": 121354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121354, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 121354, "pid": 5, "tid": 7, "ts": 1716454223275214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204228, "dur": 6, "args": { "External id": 121354, "cbid": 211, "correlation": 121354 } }, { "ph": "s", "id": 121354, "pid": 76337, "tid": -914061504, "ts": 1716454223204228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204285, "dur": 1, "args": { "External id": 121365, "cbid": 251, "correlation": 121365 } }, { "ph": "f", "id": 121365, "pid": 76337, "tid": -914061504, "ts": 1716454223204285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204288, "dur": 0, "args": { "External id": 121366, "cbid": 251, "correlation": 121366 } }, { "ph": "f", "id": 121366, "pid": 76337, "tid": -914061504, "ts": 1716454223204288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223275220, "dur": 7, "args": { "External id": 121367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121367, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 121367, "pid": 5, "tid": 7, "ts": 1716454223275220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204290, "dur": 11, "args": { "External id": 121367, "cbid": 211, "correlation": 121367 } }, { "ph": "s", "id": 121367, "pid": 76337, "tid": -914061504, "ts": 1716454223204290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223275228, "dur": 3, "args": { "External id": 121369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121369, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 121369, "pid": 5, "tid": 7, "ts": 1716454223275228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204303, "dur": 6, "args": { "External id": 121369, "cbid": 211, "correlation": 121369 } }, { "ph": "s", "id": 121369, "pid": 76337, "tid": -914061504, "ts": 1716454223204303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223275233, "dur": 93, "args": { "External id": 121390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121390, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 121390, "pid": 5, "tid": 7, "ts": 1716454223275233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204376, "dur": 13, "args": { "External id": 121390, "cbid": 211, "correlation": 121390 } }, { "ph": "s", "id": 121390, "pid": 76337, "tid": -914061504, "ts": 1716454223204376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204472, "dur": 1, "args": { "External id": 121408, "cbid": 251, "correlation": 121408 } }, { "ph": "f", "id": 121408, "pid": 76337, "tid": -914061504, "ts": 1716454223204472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223275327, "dur": 99, "args": { "External id": 121410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121410, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121410, "pid": 5, "tid": 7, "ts": 1716454223275327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204478, "dur": 13, "args": { "External id": 121410, "cbid": 211, "correlation": 121410 } }, { "ph": "s", "id": 121410, "pid": 76337, "tid": -914061504, "ts": 1716454223204478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223275427, "dur": 19, "args": { "External id": 121418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121418, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121418, "pid": 5, "tid": 7, "ts": 1716454223275427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204547, "dur": 12, "args": { "External id": 121418, "cbid": 211, "correlation": 121418 } }, { "ph": "s", "id": 121418, "pid": 76337, "tid": -914061504, "ts": 1716454223204547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223275448, "dur": 37, "args": { "External id": 121426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121426, "pid": 5, "tid": 7, "ts": 1716454223275448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204588, "dur": 9, "args": { "External id": 121426, "cbid": 211, "correlation": 121426 } }, { "ph": "s", "id": 121426, "pid": 76337, "tid": -914061504, "ts": 1716454223204588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223275486, "dur": 34, "args": { "External id": 121448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121448, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121448, "pid": 5, "tid": 7, "ts": 1716454223275486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204641, "dur": 9, "args": { "External id": 121448, "cbid": 211, "correlation": 121448 } }, { "ph": "s", "id": 121448, "pid": 76337, "tid": -914061504, "ts": 1716454223204641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204728, "dur": 1, "args": { "External id": 121464, "cbid": 251, "correlation": 121464 } }, { "ph": "f", "id": 121464, "pid": 76337, "tid": -914061504, "ts": 1716454223204728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204733, "dur": 0, "args": { "External id": 121466, "cbid": 251, "correlation": 121466 } }, { "ph": "f", "id": 121466, "pid": 76337, "tid": -914061504, "ts": 1716454223204733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223275522, "dur": 541, "args": { "External id": 121467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121467, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 121467, "pid": 5, "tid": 7, "ts": 1716454223275522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204739, "dur": 12, "args": { "External id": 121467, "cbid": 211, "correlation": 121467 } }, { "ph": "s", "id": 121467, "pid": 76337, "tid": -914061504, "ts": 1716454223204739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223276064, "dur": 126, "args": { "External id": 121475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121475, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121475, "pid": 5, "tid": 7, "ts": 1716454223276064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204803, "dur": 12, "args": { "External id": 121475, "cbid": 211, "correlation": 121475 } }, { "ph": "s", "id": 121475, "pid": 76337, "tid": -914061504, "ts": 1716454223204803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223276191, "dur": 130, "args": { "External id": 121483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121483, "pid": 5, "tid": 7, "ts": 1716454223276191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204834, "dur": 9, "args": { "External id": 121483, "cbid": 211, "correlation": 121483 } }, { "ph": "s", "id": 121483, "pid": 76337, "tid": -914061504, "ts": 1716454223204834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223204911, "dur": 1, "args": { "External id": 121499, "cbid": 251, "correlation": 121499 } }, { "ph": "f", "id": 121499, "pid": 76337, "tid": -914061504, "ts": 1716454223204911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223276323, "dur": 305, "args": { "External id": 121501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121501, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121501, "pid": 5, "tid": 7, "ts": 1716454223276323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204917, "dur": 12, "args": { "External id": 121501, "cbid": 211, "correlation": 121501 } }, { "ph": "s", "id": 121501, "pid": 76337, "tid": -914061504, "ts": 1716454223204917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223276629, "dur": 27, "args": { "External id": 121509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121509, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121509, "pid": 5, "tid": 7, "ts": 1716454223276629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223204959, "dur": 9, "args": { "External id": 121509, "cbid": 211, "correlation": 121509 } }, { "ph": "s", "id": 121509, "pid": 76337, "tid": -914061504, "ts": 1716454223204959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223276657, "dur": 81, "args": { "External id": 121520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121520, "pid": 5, "tid": 7, "ts": 1716454223276657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205035, "dur": 13, "args": { "External id": 121520, "cbid": 211, "correlation": 121520 } }, { "ph": "s", "id": 121520, "pid": 76337, "tid": -914061504, "ts": 1716454223205035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223205101, "dur": 0, "args": { "External id": 121532, "cbid": 317, "correlation": 121532 } }, { "ph": "f", "id": 121532, "pid": 76337, "tid": -914061504, "ts": 1716454223205101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223205102, "dur": 0, "args": { "External id": 121533, "cbid": 203, "correlation": 121533 } }, { "ph": "f", "id": 121533, "pid": 76337, "tid": -914061504, "ts": 1716454223205102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223205102, "dur": 0, "args": { "External id": 121534, "cbid": 205, "correlation": 121534 } }, { "ph": "f", "id": 121534, "pid": 76337, "tid": -914061504, "ts": 1716454223205102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223276739, "dur": 24, "args": { "External id": 121538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121538, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121538, "pid": 5, "tid": 7, "ts": 1716454223276739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205117, "dur": 12, "args": { "External id": 121538, "cbid": 211, "correlation": 121538 } }, { "ph": "s", "id": 121538, "pid": 76337, "tid": -914061504, "ts": 1716454223205117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223276764, "dur": 120, "args": { "External id": 121540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121540, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121540, "pid": 5, "tid": 7, "ts": 1716454223276764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205136, "dur": 7, "args": { "External id": 121540, "cbid": 211, "correlation": 121540 } }, { "ph": "s", "id": 121540, "pid": 76337, "tid": -914061504, "ts": 1716454223205136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223276885, "dur": 24, "args": { "External id": 121542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121542, "pid": 5, "tid": 7, "ts": 1716454223276885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205146, "dur": 5, "args": { "External id": 121542, "cbid": 211, "correlation": 121542 } }, { "ph": "s", "id": 121542, "pid": 76337, "tid": -914061504, "ts": 1716454223205146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223276911, "dur": 32, "args": { "External id": 121548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121548, "pid": 5, "tid": 7, "ts": 1716454223276911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205174, "dur": 9, "args": { "External id": 121548, "cbid": 211, "correlation": 121548 } }, { "ph": "s", "id": 121548, "pid": 76337, "tid": -914061504, "ts": 1716454223205174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223276945, "dur": 26, "args": { "External id": 121556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121556, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121556, "pid": 5, "tid": 7, "ts": 1716454223276945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205206, "dur": 8, "args": { "External id": 121556, "cbid": 211, "correlation": 121556 } }, { "ph": "s", "id": 121556, "pid": 76337, "tid": -914061504, "ts": 1716454223205206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223276972, "dur": 54, "args": { "External id": 121565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121565, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121565, "pid": 5, "tid": 7, "ts": 1716454223276972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205245, "dur": 10, "args": { "External id": 121565, "cbid": 211, "correlation": 121565 } }, { "ph": "s", "id": 121565, "pid": 76337, "tid": -914061504, "ts": 1716454223205245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223277028, "dur": 52, "args": { "External id": 121585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121585, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 121585, "pid": 5, "tid": 7, "ts": 1716454223277028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205315, "dur": 11, "args": { "External id": 121585, "cbid": 211, "correlation": 121585 } }, { "ph": "s", "id": 121585, "pid": 76337, "tid": -914061504, "ts": 1716454223205315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223277081, "dur": 4, "args": { "External id": 121597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121597, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 121597, "pid": 5, "tid": 7, "ts": 1716454223277081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205336, "dur": 7, "args": { "External id": 121597, "cbid": 211, "correlation": 121597 } }, { "ph": "s", "id": 121597, "pid": 76337, "tid": -914061504, "ts": 1716454223205336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223277087, "dur": 56, "args": { "External id": 121600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121600, "pid": 5, "tid": 7, "ts": 1716454223277087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205355, "dur": 7, "args": { "External id": 121600, "cbid": 211, "correlation": 121600 } }, { "ph": "s", "id": 121600, "pid": 76337, "tid": -914061504, "ts": 1716454223205355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223277144, "dur": 37, "args": { "External id": 121609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121609, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121609, "pid": 5, "tid": 7, "ts": 1716454223277144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205394, "dur": 10, "args": { "External id": 121609, "cbid": 211, "correlation": 121609 } }, { "ph": "s", "id": 121609, "pid": 76337, "tid": -914061504, "ts": 1716454223205394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223205447, "dur": 0, "args": { "External id": 121619, "cbid": 317, "correlation": 121619 } }, { "ph": "f", "id": 121619, "pid": 76337, "tid": -914061504, "ts": 1716454223205447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223205448, "dur": 0, "args": { "External id": 121620, "cbid": 203, "correlation": 121620 } }, { "ph": "f", "id": 121620, "pid": 76337, "tid": -914061504, "ts": 1716454223205448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223205449, "dur": 0, "args": { "External id": 121621, "cbid": 205, "correlation": 121621 } }, { "ph": "f", "id": 121621, "pid": 76337, "tid": -914061504, "ts": 1716454223205449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223277182, "dur": 40, "args": { "External id": 121625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121625, "pid": 5, "tid": 7, "ts": 1716454223277182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205464, "dur": 11, "args": { "External id": 121625, "cbid": 211, "correlation": 121625 } }, { "ph": "s", "id": 121625, "pid": 76337, "tid": -914061504, "ts": 1716454223205464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223277223, "dur": 83, "args": { "External id": 121627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121627, "pid": 5, "tid": 7, "ts": 1716454223277223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205477, "dur": 5, "args": { "External id": 121627, "cbid": 211, "correlation": 121627 } }, { "ph": "s", "id": 121627, "pid": 76337, "tid": -914061504, "ts": 1716454223205477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223277308, "dur": 1279, "args": { "External id": 121629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121629, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121629, "pid": 5, "tid": 7, "ts": 1716454223277308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205489, "dur": 7, "args": { "External id": 121629, "cbid": 211, "correlation": 121629 } }, { "ph": "s", "id": 121629, "pid": 76337, "tid": -914061504, "ts": 1716454223205489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223278588, "dur": 22, "args": { "External id": 121631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121631, "pid": 5, "tid": 7, "ts": 1716454223278588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205500, "dur": 5, "args": { "External id": 121631, "cbid": 211, "correlation": 121631 } }, { "ph": "s", "id": 121631, "pid": 76337, "tid": -914061504, "ts": 1716454223205500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223278611, "dur": 33, "args": { "External id": 121637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121637, "pid": 5, "tid": 7, "ts": 1716454223278611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205527, "dur": 8, "args": { "External id": 121637, "cbid": 211, "correlation": 121637 } }, { "ph": "s", "id": 121637, "pid": 76337, "tid": -914061504, "ts": 1716454223205527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223278646, "dur": 4, "args": { "External id": 121645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121645, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 121645, "pid": 5, "tid": 7, "ts": 1716454223278646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205570, "dur": 9, "args": { "External id": 121645, "cbid": 211, "correlation": 121645 } }, { "ph": "s", "id": 121645, "pid": 76337, "tid": -914061504, "ts": 1716454223205570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223205634, "dur": 1, "args": { "External id": 121661, "cbid": 251, "correlation": 121661 } }, { "ph": "f", "id": 121661, "pid": 76337, "tid": -914061504, "ts": 1716454223205634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223205640, "dur": 0, "args": { "External id": 121663, "cbid": 251, "correlation": 121663 } }, { "ph": "f", "id": 121663, "pid": 76337, "tid": -914061504, "ts": 1716454223205640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223278652, "dur": 12, "args": { "External id": 121664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121664, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 121664, "pid": 5, "tid": 7, "ts": 1716454223278652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205642, "dur": 11, "args": { "External id": 121664, "cbid": 211, "correlation": 121664 } }, { "ph": "s", "id": 121664, "pid": 76337, "tid": -914061504, "ts": 1716454223205642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223278665, "dur": 5, "args": { "External id": 121666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121666, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 121666, "pid": 5, "tid": 7, "ts": 1716454223278665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205655, "dur": 5, "args": { "External id": 121666, "cbid": 211, "correlation": 121666 } }, { "ph": "s", "id": 121666, "pid": 76337, "tid": -914061504, "ts": 1716454223205655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223278672, "dur": 29, "args": { "External id": 121676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121676, "pid": 5, "tid": 7, "ts": 1716454223278672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205711, "dur": 12, "args": { "External id": 121676, "cbid": 211, "correlation": 121676 } }, { "ph": "s", "id": 121676, "pid": 76337, "tid": -914061504, "ts": 1716454223205711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223278702, "dur": 31, "args": { "External id": 121696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121696, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 121696, "pid": 5, "tid": 7, "ts": 1716454223278702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205777, "dur": 11, "args": { "External id": 121696, "cbid": 211, "correlation": 121696 } }, { "ph": "s", "id": 121696, "pid": 76337, "tid": -914061504, "ts": 1716454223205777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223278735, "dur": 4, "args": { "External id": 121708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121708, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 121708, "pid": 5, "tid": 7, "ts": 1716454223278735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205798, "dur": 6, "args": { "External id": 121708, "cbid": 211, "correlation": 121708 } }, { "ph": "s", "id": 121708, "pid": 76337, "tid": -914061504, "ts": 1716454223205798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223278741, "dur": 30, "args": { "External id": 121711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121711, "pid": 5, "tid": 7, "ts": 1716454223278741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205817, "dur": 6, "args": { "External id": 121711, "cbid": 211, "correlation": 121711 } }, { "ph": "s", "id": 121711, "pid": 76337, "tid": -914061504, "ts": 1716454223205817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223278772, "dur": 20, "args": { "External id": 121720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121720, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121720, "pid": 5, "tid": 7, "ts": 1716454223278772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205857, "dur": 12, "args": { "External id": 121720, "cbid": 211, "correlation": 121720 } }, { "ph": "s", "id": 121720, "pid": 76337, "tid": -914061504, "ts": 1716454223205857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223205920, "dur": 0, "args": { "External id": 121730, "cbid": 317, "correlation": 121730 } }, { "ph": "f", "id": 121730, "pid": 76337, "tid": -914061504, "ts": 1716454223205920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223205921, "dur": 0, "args": { "External id": 121731, "cbid": 203, "correlation": 121731 } }, { "ph": "f", "id": 121731, "pid": 76337, "tid": -914061504, "ts": 1716454223205921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223205922, "dur": 0, "args": { "External id": 121732, "cbid": 205, "correlation": 121732 } }, { "ph": "f", "id": 121732, "pid": 76337, "tid": -914061504, "ts": 1716454223205922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223278793, "dur": 23, "args": { "External id": 121736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121736, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121736, "pid": 5, "tid": 7, "ts": 1716454223278793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205936, "dur": 13, "args": { "External id": 121736, "cbid": 211, "correlation": 121736 } }, { "ph": "s", "id": 121736, "pid": 76337, "tid": -914061504, "ts": 1716454223205936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223278817, "dur": 44, "args": { "External id": 121738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121738, "pid": 5, "tid": 7, "ts": 1716454223278817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205952, "dur": 5, "args": { "External id": 121738, "cbid": 211, "correlation": 121738 } }, { "ph": "s", "id": 121738, "pid": 76337, "tid": -914061504, "ts": 1716454223205952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223278863, "dur": 643, "args": { "External id": 121740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121740, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121740, "pid": 5, "tid": 7, "ts": 1716454223278863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205963, "dur": 6, "args": { "External id": 121740, "cbid": 211, "correlation": 121740 } }, { "ph": "s", "id": 121740, "pid": 76337, "tid": -914061504, "ts": 1716454223205963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223279508, "dur": 21, "args": { "External id": 121742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121742, "pid": 5, "tid": 7, "ts": 1716454223279508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223205972, "dur": 13, "args": { "External id": 121742, "cbid": 211, "correlation": 121742 } }, { "ph": "s", "id": 121742, "pid": 76337, "tid": -914061504, "ts": 1716454223205972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223279530, "dur": 33, "args": { "External id": 121748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121748, "pid": 5, "tid": 7, "ts": 1716454223279530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206009, "dur": 9, "args": { "External id": 121748, "cbid": 211, "correlation": 121748 } }, { "ph": "s", "id": 121748, "pid": 76337, "tid": -914061504, "ts": 1716454223206009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223206067, "dur": 0, "args": { "External id": 121758, "cbid": 317, "correlation": 121758 } }, { "ph": "f", "id": 121758, "pid": 76337, "tid": -914061504, "ts": 1716454223206067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223206068, "dur": 0, "args": { "External id": 121759, "cbid": 203, "correlation": 121759 } }, { "ph": "f", "id": 121759, "pid": 76337, "tid": -914061504, "ts": 1716454223206068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223206069, "dur": 0, "args": { "External id": 121760, "cbid": 205, "correlation": 121760 } }, { "ph": "f", "id": 121760, "pid": 76337, "tid": -914061504, "ts": 1716454223206069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223279564, "dur": 38, "args": { "External id": 121764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121764, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121764, "pid": 5, "tid": 7, "ts": 1716454223279564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206081, "dur": 11, "args": { "External id": 121764, "cbid": 211, "correlation": 121764 } }, { "ph": "s", "id": 121764, "pid": 76337, "tid": -914061504, "ts": 1716454223206081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223279604, "dur": 191, "args": { "External id": 121766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121766, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121766, "pid": 5, "tid": 7, "ts": 1716454223279604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206099, "dur": 6, "args": { "External id": 121766, "cbid": 211, "correlation": 121766 } }, { "ph": "s", "id": 121766, "pid": 76337, "tid": -914061504, "ts": 1716454223206099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223279796, "dur": 22, "args": { "External id": 121768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121768, "pid": 5, "tid": 7, "ts": 1716454223279796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206108, "dur": 5, "args": { "External id": 121768, "cbid": 211, "correlation": 121768 } }, { "ph": "s", "id": 121768, "pid": 76337, "tid": -914061504, "ts": 1716454223206108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223279820, "dur": 33, "args": { "External id": 121774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121774, "pid": 5, "tid": 7, "ts": 1716454223279820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206133, "dur": 8, "args": { "External id": 121774, "cbid": 211, "correlation": 121774 } }, { "ph": "s", "id": 121774, "pid": 76337, "tid": -914061504, "ts": 1716454223206133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223279854, "dur": 27, "args": { "External id": 121782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121782, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121782, "pid": 5, "tid": 7, "ts": 1716454223279854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206163, "dur": 7, "args": { "External id": 121782, "cbid": 211, "correlation": 121782 } }, { "ph": "s", "id": 121782, "pid": 76337, "tid": -914061504, "ts": 1716454223206163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223279882, "dur": 20, "args": { "External id": 121790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121790, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121790, "pid": 5, "tid": 7, "ts": 1716454223279882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206191, "dur": 9, "args": { "External id": 121790, "cbid": 211, "correlation": 121790 } }, { "ph": "s", "id": 121790, "pid": 76337, "tid": -914061504, "ts": 1716454223206191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223279904, "dur": 30, "args": { "External id": 121810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121810, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 121810, "pid": 5, "tid": 7, "ts": 1716454223279904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206274, "dur": 12, "args": { "External id": 121810, "cbid": 211, "correlation": 121810 } }, { "ph": "s", "id": 121810, "pid": 76337, "tid": -914061504, "ts": 1716454223206274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223279935, "dur": 4, "args": { "External id": 121822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121822, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 121822, "pid": 5, "tid": 7, "ts": 1716454223279935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206296, "dur": 7, "args": { "External id": 121822, "cbid": 211, "correlation": 121822 } }, { "ph": "s", "id": 121822, "pid": 76337, "tid": -914061504, "ts": 1716454223206296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223279940, "dur": 30, "args": { "External id": 121825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121825, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121825, "pid": 5, "tid": 7, "ts": 1716454223279940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206315, "dur": 6, "args": { "External id": 121825, "cbid": 211, "correlation": 121825 } }, { "ph": "s", "id": 121825, "pid": 76337, "tid": -914061504, "ts": 1716454223206315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223206371, "dur": 0, "args": { "External id": 121836, "cbid": 317, "correlation": 121836 } }, { "ph": "f", "id": 121836, "pid": 76337, "tid": -914061504, "ts": 1716454223206371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223206372, "dur": 0, "args": { "External id": 121837, "cbid": 203, "correlation": 121837 } }, { "ph": "f", "id": 121837, "pid": 76337, "tid": -914061504, "ts": 1716454223206372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223206373, "dur": 0, "args": { "External id": 121838, "cbid": 205, "correlation": 121838 } }, { "ph": "f", "id": 121838, "pid": 76337, "tid": -914061504, "ts": 1716454223206373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223279971, "dur": 22, "args": { "External id": 121842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121842, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121842, "pid": 5, "tid": 7, "ts": 1716454223279971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206387, "dur": 11, "args": { "External id": 121842, "cbid": 211, "correlation": 121842 } }, { "ph": "s", "id": 121842, "pid": 76337, "tid": -914061504, "ts": 1716454223206387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223279995, "dur": 105, "args": { "External id": 121844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121844, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121844, "pid": 5, "tid": 7, "ts": 1716454223279995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206405, "dur": 6, "args": { "External id": 121844, "cbid": 211, "correlation": 121844 } }, { "ph": "s", "id": 121844, "pid": 76337, "tid": -914061504, "ts": 1716454223206405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223280101, "dur": 21, "args": { "External id": 121846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121846, "pid": 5, "tid": 7, "ts": 1716454223280101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206414, "dur": 5, "args": { "External id": 121846, "cbid": 211, "correlation": 121846 } }, { "ph": "s", "id": 121846, "pid": 76337, "tid": -914061504, "ts": 1716454223206414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223280124, "dur": 32, "args": { "External id": 121852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121852, "pid": 5, "tid": 7, "ts": 1716454223280124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206441, "dur": 8, "args": { "External id": 121852, "cbid": 211, "correlation": 121852 } }, { "ph": "s", "id": 121852, "pid": 76337, "tid": -914061504, "ts": 1716454223206441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223280158, "dur": 191, "args": { "External id": 121861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121861, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121861, "pid": 5, "tid": 7, "ts": 1716454223280158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206521, "dur": 14, "args": { "External id": 121861, "cbid": 211, "correlation": 121861 } }, { "ph": "s", "id": 121861, "pid": 76337, "tid": -914061504, "ts": 1716454223206521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223280350, "dur": 65, "args": { "External id": 121883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121883, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121883, "pid": 5, "tid": 7, "ts": 1716454223280350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206578, "dur": 10, "args": { "External id": 121883, "cbid": 211, "correlation": 121883 } }, { "ph": "s", "id": 121883, "pid": 76337, "tid": -914061504, "ts": 1716454223206578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223206666, "dur": 1, "args": { "External id": 121894, "cbid": 251, "correlation": 121894 } }, { "ph": "f", "id": 121894, "pid": 76337, "tid": -914061504, "ts": 1716454223206666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223280416, "dur": 154, "args": { "External id": 121895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121895, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121895, "pid": 5, "tid": 7, "ts": 1716454223280416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206671, "dur": 13, "args": { "External id": 121895, "cbid": 211, "correlation": 121895 } }, { "ph": "s", "id": 121895, "pid": 76337, "tid": -914061504, "ts": 1716454223206671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223206742, "dur": 1, "args": { "External id": 121906, "cbid": 251, "correlation": 121906 } }, { "ph": "f", "id": 121906, "pid": 76337, "tid": -914061504, "ts": 1716454223206742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223280571, "dur": 145, "args": { "External id": 121907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121907, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121907, "pid": 5, "tid": 7, "ts": 1716454223280571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206746, "dur": 12, "args": { "External id": 121907, "cbid": 211, "correlation": 121907 } }, { "ph": "s", "id": 121907, "pid": 76337, "tid": -914061504, "ts": 1716454223206746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223206813, "dur": 1, "args": { "External id": 121918, "cbid": 251, "correlation": 121918 } }, { "ph": "f", "id": 121918, "pid": 76337, "tid": -914061504, "ts": 1716454223206813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223280718, "dur": 147, "args": { "External id": 121919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121919, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 121919, "pid": 5, "tid": 7, "ts": 1716454223280718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206817, "dur": 11, "args": { "External id": 121919, "cbid": 211, "correlation": 121919 } }, { "ph": "s", "id": 121919, "pid": 76337, "tid": -914061504, "ts": 1716454223206817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223280866, "dur": 1946, "args": { "External id": 121940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121940, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 121940, "pid": 5, "tid": 7, "ts": 1716454223280866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223206896, "dur": 13, "args": { "External id": 121940, "cbid": 211, "correlation": 121940 } }, { "ph": "s", "id": 121940, "pid": 76337, "tid": -914061504, "ts": 1716454223206896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207001, "dur": 1, "args": { "External id": 121958, "cbid": 251, "correlation": 121958 } }, { "ph": "f", "id": 121958, "pid": 76337, "tid": -914061504, "ts": 1716454223207001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223282814, "dur": 149, "args": { "External id": 121960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121960, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 121960, "pid": 5, "tid": 7, "ts": 1716454223282814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207007, "dur": 14, "args": { "External id": 121960, "cbid": 211, "correlation": 121960 } }, { "ph": "s", "id": 121960, "pid": 76337, "tid": -914061504, "ts": 1716454223207007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223282964, "dur": 35, "args": { "External id": 121968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121968, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121968, "pid": 5, "tid": 7, "ts": 1716454223282964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207079, "dur": 12, "args": { "External id": 121968, "cbid": 211, "correlation": 121968 } }, { "ph": "s", "id": 121968, "pid": 76337, "tid": -914061504, "ts": 1716454223207079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223283001, "dur": 51, "args": { "External id": 121976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121976, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121976, "pid": 5, "tid": 7, "ts": 1716454223283001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207118, "dur": 9, "args": { "External id": 121976, "cbid": 211, "correlation": 121976 } }, { "ph": "s", "id": 121976, "pid": 76337, "tid": -914061504, "ts": 1716454223207118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223283053, "dur": 31, "args": { "External id": 121987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 121987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 121987, "pid": 5, "tid": 7, "ts": 1716454223283053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207190, "dur": 12, "args": { "External id": 121987, "cbid": 211, "correlation": 121987 } }, { "ph": "s", "id": 121987, "pid": 76337, "tid": -914061504, "ts": 1716454223207190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223283085, "dur": 35, "args": { "External id": 122009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122009, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122009, "pid": 5, "tid": 7, "ts": 1716454223283085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207222, "dur": 7, "args": { "External id": 122009, "cbid": 211, "correlation": 122009 } }, { "ph": "s", "id": 122009, "pid": 76337, "tid": -914061504, "ts": 1716454223207222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207306, "dur": 1, "args": { "External id": 122020, "cbid": 251, "correlation": 122020 } }, { "ph": "f", "id": 122020, "pid": 76337, "tid": -914061504, "ts": 1716454223207306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223283121, "dur": 92, "args": { "External id": 122021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122021, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122021, "pid": 5, "tid": 7, "ts": 1716454223283121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207311, "dur": 13, "args": { "External id": 122021, "cbid": 211, "correlation": 122021 } }, { "ph": "s", "id": 122021, "pid": 76337, "tid": -914061504, "ts": 1716454223207311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207379, "dur": 1, "args": { "External id": 122032, "cbid": 251, "correlation": 122032 } }, { "ph": "f", "id": 122032, "pid": 76337, "tid": -914061504, "ts": 1716454223207379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207382, "dur": 0, "args": { "External id": 122033, "cbid": 251, "correlation": 122033 } }, { "ph": "f", "id": 122033, "pid": 76337, "tid": -914061504, "ts": 1716454223207382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223283214, "dur": 11, "args": { "External id": 122034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122034, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 122034, "pid": 5, "tid": 7, "ts": 1716454223283214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207384, "dur": 12, "args": { "External id": 122034, "cbid": 211, "correlation": 122034 } }, { "ph": "s", "id": 122034, "pid": 76337, "tid": -914061504, "ts": 1716454223207384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223283227, "dur": 5, "args": { "External id": 122036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122036, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 122036, "pid": 5, "tid": 7, "ts": 1716454223283227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207397, "dur": 6, "args": { "External id": 122036, "cbid": 211, "correlation": 122036 } }, { "ph": "s", "id": 122036, "pid": 76337, "tid": -914061504, "ts": 1716454223207397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207454, "dur": 1, "args": { "External id": 122047, "cbid": 251, "correlation": 122047 } }, { "ph": "f", "id": 122047, "pid": 76337, "tid": -914061504, "ts": 1716454223207454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207457, "dur": 0, "args": { "External id": 122048, "cbid": 251, "correlation": 122048 } }, { "ph": "f", "id": 122048, "pid": 76337, "tid": -914061504, "ts": 1716454223207457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223283233, "dur": 7, "args": { "External id": 122049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122049, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 122049, "pid": 5, "tid": 7, "ts": 1716454223283233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207459, "dur": 12, "args": { "External id": 122049, "cbid": 211, "correlation": 122049 } }, { "ph": "s", "id": 122049, "pid": 76337, "tid": -914061504, "ts": 1716454223207459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223283242, "dur": 3, "args": { "External id": 122051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122051, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 122051, "pid": 5, "tid": 7, "ts": 1716454223283242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207472, "dur": 6, "args": { "External id": 122051, "cbid": 211, "correlation": 122051 } }, { "ph": "s", "id": 122051, "pid": 76337, "tid": -914061504, "ts": 1716454223207472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223283246, "dur": 91, "args": { "External id": 122072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122072, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 122072, "pid": 5, "tid": 7, "ts": 1716454223283246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207545, "dur": 12, "args": { "External id": 122072, "cbid": 211, "correlation": 122072 } }, { "ph": "s", "id": 122072, "pid": 76337, "tid": -914061504, "ts": 1716454223207545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207643, "dur": 1, "args": { "External id": 122090, "cbid": 251, "correlation": 122090 } }, { "ph": "f", "id": 122090, "pid": 76337, "tid": -914061504, "ts": 1716454223207643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223283339, "dur": 98, "args": { "External id": 122092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122092, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122092, "pid": 5, "tid": 7, "ts": 1716454223283339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207648, "dur": 13, "args": { "External id": 122092, "cbid": 211, "correlation": 122092 } }, { "ph": "s", "id": 122092, "pid": 76337, "tid": -914061504, "ts": 1716454223207648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223283438, "dur": 19, "args": { "External id": 122100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122100, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122100, "pid": 5, "tid": 7, "ts": 1716454223283438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207716, "dur": 13, "args": { "External id": 122100, "cbid": 211, "correlation": 122100 } }, { "ph": "s", "id": 122100, "pid": 76337, "tid": -914061504, "ts": 1716454223207716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223283458, "dur": 38, "args": { "External id": 122108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122108, "pid": 5, "tid": 7, "ts": 1716454223283458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207758, "dur": 9, "args": { "External id": 122108, "cbid": 211, "correlation": 122108 } }, { "ph": "s", "id": 122108, "pid": 76337, "tid": -914061504, "ts": 1716454223207758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223283497, "dur": 35, "args": { "External id": 122130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122130, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122130, "pid": 5, "tid": 7, "ts": 1716454223283497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207808, "dur": 10, "args": { "External id": 122130, "cbid": 211, "correlation": 122130 } }, { "ph": "s", "id": 122130, "pid": 76337, "tid": -914061504, "ts": 1716454223207808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207897, "dur": 1, "args": { "External id": 122146, "cbid": 251, "correlation": 122146 } }, { "ph": "f", "id": 122146, "pid": 76337, "tid": -914061504, "ts": 1716454223207897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223207902, "dur": 0, "args": { "External id": 122148, "cbid": 251, "correlation": 122148 } }, { "ph": "f", "id": 122148, "pid": 76337, "tid": -914061504, "ts": 1716454223207902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223283533, "dur": 539, "args": { "External id": 122149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122149, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 122149, "pid": 5, "tid": 7, "ts": 1716454223283533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207905, "dur": 13, "args": { "External id": 122149, "cbid": 211, "correlation": 122149 } }, { "ph": "s", "id": 122149, "pid": 76337, "tid": -914061504, "ts": 1716454223207905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223284073, "dur": 126, "args": { "External id": 122157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122157, "pid": 5, "tid": 7, "ts": 1716454223284073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223207969, "dur": 21, "args": { "External id": 122157, "cbid": 211, "correlation": 122157 } }, { "ph": "s", "id": 122157, "pid": 76337, "tid": -914061504, "ts": 1716454223207969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223284201, "dur": 130, "args": { "External id": 122165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122165, "pid": 5, "tid": 7, "ts": 1716454223284201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208008, "dur": 8, "args": { "External id": 122165, "cbid": 211, "correlation": 122165 } }, { "ph": "s", "id": 122165, "pid": 76337, "tid": -914061504, "ts": 1716454223208008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223208084, "dur": 1, "args": { "External id": 122181, "cbid": 251, "correlation": 122181 } }, { "ph": "f", "id": 122181, "pid": 76337, "tid": -914061504, "ts": 1716454223208084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223284332, "dur": 303, "args": { "External id": 122183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122183, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122183, "pid": 5, "tid": 7, "ts": 1716454223284332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208090, "dur": 12, "args": { "External id": 122183, "cbid": 211, "correlation": 122183 } }, { "ph": "s", "id": 122183, "pid": 76337, "tid": -914061504, "ts": 1716454223208090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223284636, "dur": 27, "args": { "External id": 122191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122191, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122191, "pid": 5, "tid": 7, "ts": 1716454223284636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208133, "dur": 10, "args": { "External id": 122191, "cbid": 211, "correlation": 122191 } }, { "ph": "s", "id": 122191, "pid": 76337, "tid": -914061504, "ts": 1716454223208133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223284664, "dur": 81, "args": { "External id": 122202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122202, "pid": 5, "tid": 7, "ts": 1716454223284664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208200, "dur": 12, "args": { "External id": 122202, "cbid": 211, "correlation": 122202 } }, { "ph": "s", "id": 122202, "pid": 76337, "tid": -914061504, "ts": 1716454223208200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223208262, "dur": 0, "args": { "External id": 122214, "cbid": 317, "correlation": 122214 } }, { "ph": "f", "id": 122214, "pid": 76337, "tid": -914061504, "ts": 1716454223208262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223208263, "dur": 0, "args": { "External id": 122215, "cbid": 203, "correlation": 122215 } }, { "ph": "f", "id": 122215, "pid": 76337, "tid": -914061504, "ts": 1716454223208263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223208264, "dur": 0, "args": { "External id": 122216, "cbid": 205, "correlation": 122216 } }, { "ph": "f", "id": 122216, "pid": 76337, "tid": -914061504, "ts": 1716454223208264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223284747, "dur": 24, "args": { "External id": 122220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122220, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122220, "pid": 5, "tid": 7, "ts": 1716454223284747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208279, "dur": 12, "args": { "External id": 122220, "cbid": 211, "correlation": 122220 } }, { "ph": "s", "id": 122220, "pid": 76337, "tid": -914061504, "ts": 1716454223208279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223284772, "dur": 120, "args": { "External id": 122222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122222, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122222, "pid": 5, "tid": 7, "ts": 1716454223284772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208298, "dur": 6, "args": { "External id": 122222, "cbid": 211, "correlation": 122222 } }, { "ph": "s", "id": 122222, "pid": 76337, "tid": -914061504, "ts": 1716454223208298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223284893, "dur": 23, "args": { "External id": 122224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122224, "pid": 5, "tid": 7, "ts": 1716454223284893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208307, "dur": 5, "args": { "External id": 122224, "cbid": 211, "correlation": 122224 } }, { "ph": "s", "id": 122224, "pid": 76337, "tid": -914061504, "ts": 1716454223208307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223284918, "dur": 32, "args": { "External id": 122230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122230, "pid": 5, "tid": 7, "ts": 1716454223284918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208335, "dur": 8, "args": { "External id": 122230, "cbid": 211, "correlation": 122230 } }, { "ph": "s", "id": 122230, "pid": 76337, "tid": -914061504, "ts": 1716454223208335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223284952, "dur": 27, "args": { "External id": 122238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122238, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122238, "pid": 5, "tid": 7, "ts": 1716454223284952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208366, "dur": 8, "args": { "External id": 122238, "cbid": 211, "correlation": 122238 } }, { "ph": "s", "id": 122238, "pid": 76337, "tid": -914061504, "ts": 1716454223208366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223284980, "dur": 46, "args": { "External id": 122247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122247, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122247, "pid": 5, "tid": 7, "ts": 1716454223284980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208404, "dur": 10, "args": { "External id": 122247, "cbid": 211, "correlation": 122247 } }, { "ph": "s", "id": 122247, "pid": 76337, "tid": -914061504, "ts": 1716454223208404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223285027, "dur": 42, "args": { "External id": 122267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122267, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 122267, "pid": 5, "tid": 7, "ts": 1716454223285027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208475, "dur": 12, "args": { "External id": 122267, "cbid": 211, "correlation": 122267 } }, { "ph": "s", "id": 122267, "pid": 76337, "tid": -914061504, "ts": 1716454223208475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223285070, "dur": 5, "args": { "External id": 122279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122279, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 122279, "pid": 5, "tid": 7, "ts": 1716454223285070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208496, "dur": 6, "args": { "External id": 122279, "cbid": 211, "correlation": 122279 } }, { "ph": "s", "id": 122279, "pid": 76337, "tid": -914061504, "ts": 1716454223208496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223285076, "dur": 43, "args": { "External id": 122282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122282, "pid": 5, "tid": 7, "ts": 1716454223285076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208515, "dur": 7, "args": { "External id": 122282, "cbid": 211, "correlation": 122282 } }, { "ph": "s", "id": 122282, "pid": 76337, "tid": -914061504, "ts": 1716454223208515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223285120, "dur": 29, "args": { "External id": 122291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122291, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122291, "pid": 5, "tid": 7, "ts": 1716454223285120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208555, "dur": 10, "args": { "External id": 122291, "cbid": 211, "correlation": 122291 } }, { "ph": "s", "id": 122291, "pid": 76337, "tid": -914061504, "ts": 1716454223208555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223208606, "dur": 0, "args": { "External id": 122301, "cbid": 317, "correlation": 122301 } }, { "ph": "f", "id": 122301, "pid": 76337, "tid": -914061504, "ts": 1716454223208606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223208607, "dur": 0, "args": { "External id": 122302, "cbid": 203, "correlation": 122302 } }, { "ph": "f", "id": 122302, "pid": 76337, "tid": -914061504, "ts": 1716454223208607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223208608, "dur": 0, "args": { "External id": 122303, "cbid": 205, "correlation": 122303 } }, { "ph": "f", "id": 122303, "pid": 76337, "tid": -914061504, "ts": 1716454223208608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223285151, "dur": 31, "args": { "External id": 122307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122307, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122307, "pid": 5, "tid": 7, "ts": 1716454223285151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208622, "dur": 12, "args": { "External id": 122307, "cbid": 211, "correlation": 122307 } }, { "ph": "s", "id": 122307, "pid": 76337, "tid": -914061504, "ts": 1716454223208622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223285183, "dur": 63, "args": { "External id": 122309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122309, "pid": 5, "tid": 7, "ts": 1716454223285183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208637, "dur": 5, "args": { "External id": 122309, "cbid": 211, "correlation": 122309 } }, { "ph": "s", "id": 122309, "pid": 76337, "tid": -914061504, "ts": 1716454223208637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223285247, "dur": 963, "args": { "External id": 122311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122311, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122311, "pid": 5, "tid": 7, "ts": 1716454223285247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208648, "dur": 6, "args": { "External id": 122311, "cbid": 211, "correlation": 122311 } }, { "ph": "s", "id": 122311, "pid": 76337, "tid": -914061504, "ts": 1716454223208648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223286212, "dur": 23, "args": { "External id": 122313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122313, "pid": 5, "tid": 7, "ts": 1716454223286212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208658, "dur": 5, "args": { "External id": 122313, "cbid": 211, "correlation": 122313 } }, { "ph": "s", "id": 122313, "pid": 76337, "tid": -914061504, "ts": 1716454223208658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223286236, "dur": 33, "args": { "External id": 122319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122319, "pid": 5, "tid": 7, "ts": 1716454223286236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208687, "dur": 8, "args": { "External id": 122319, "cbid": 211, "correlation": 122319 } }, { "ph": "s", "id": 122319, "pid": 76337, "tid": -914061504, "ts": 1716454223208687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223286270, "dur": 4, "args": { "External id": 122327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122327, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 122327, "pid": 5, "tid": 7, "ts": 1716454223286270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208730, "dur": 10, "args": { "External id": 122327, "cbid": 211, "correlation": 122327 } }, { "ph": "s", "id": 122327, "pid": 76337, "tid": -914061504, "ts": 1716454223208730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223208795, "dur": 1, "args": { "External id": 122343, "cbid": 251, "correlation": 122343 } }, { "ph": "f", "id": 122343, "pid": 76337, "tid": -914061504, "ts": 1716454223208795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223208800, "dur": 0, "args": { "External id": 122345, "cbid": 251, "correlation": 122345 } }, { "ph": "f", "id": 122345, "pid": 76337, "tid": -914061504, "ts": 1716454223208800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223286276, "dur": 11, "args": { "External id": 122346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122346, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 122346, "pid": 5, "tid": 7, "ts": 1716454223286276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208802, "dur": 11, "args": { "External id": 122346, "cbid": 211, "correlation": 122346 } }, { "ph": "s", "id": 122346, "pid": 76337, "tid": -914061504, "ts": 1716454223208802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223286289, "dur": 5, "args": { "External id": 122348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122348, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 122348, "pid": 5, "tid": 7, "ts": 1716454223286289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208814, "dur": 5, "args": { "External id": 122348, "cbid": 211, "correlation": 122348 } }, { "ph": "s", "id": 122348, "pid": 76337, "tid": -914061504, "ts": 1716454223208814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223286295, "dur": 29, "args": { "External id": 122358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122358, "pid": 5, "tid": 7, "ts": 1716454223286295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208872, "dur": 12, "args": { "External id": 122358, "cbid": 211, "correlation": 122358 } }, { "ph": "s", "id": 122358, "pid": 76337, "tid": -914061504, "ts": 1716454223208872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223286325, "dur": 31, "args": { "External id": 122378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122378, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 122378, "pid": 5, "tid": 7, "ts": 1716454223286325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208938, "dur": 11, "args": { "External id": 122378, "cbid": 211, "correlation": 122378 } }, { "ph": "s", "id": 122378, "pid": 76337, "tid": -914061504, "ts": 1716454223208938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223286357, "dur": 4, "args": { "External id": 122390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122390, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 122390, "pid": 5, "tid": 7, "ts": 1716454223286357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208957, "dur": 6, "args": { "External id": 122390, "cbid": 211, "correlation": 122390 } }, { "ph": "s", "id": 122390, "pid": 76337, "tid": -914061504, "ts": 1716454223208957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223286363, "dur": 30, "args": { "External id": 122393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122393, "pid": 5, "tid": 7, "ts": 1716454223286363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223208984, "dur": 8, "args": { "External id": 122393, "cbid": 211, "correlation": 122393 } }, { "ph": "s", "id": 122393, "pid": 76337, "tid": -914061504, "ts": 1716454223208984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223286394, "dur": 21, "args": { "External id": 122402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122402, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122402, "pid": 5, "tid": 7, "ts": 1716454223286394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209027, "dur": 10, "args": { "External id": 122402, "cbid": 211, "correlation": 122402 } }, { "ph": "s", "id": 122402, "pid": 76337, "tid": -914061504, "ts": 1716454223209027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223209090, "dur": 0, "args": { "External id": 122412, "cbid": 317, "correlation": 122412 } }, { "ph": "f", "id": 122412, "pid": 76337, "tid": -914061504, "ts": 1716454223209090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223209091, "dur": 0, "args": { "External id": 122413, "cbid": 203, "correlation": 122413 } }, { "ph": "f", "id": 122413, "pid": 76337, "tid": -914061504, "ts": 1716454223209091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223209092, "dur": 0, "args": { "External id": 122414, "cbid": 205, "correlation": 122414 } }, { "ph": "f", "id": 122414, "pid": 76337, "tid": -914061504, "ts": 1716454223209092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223286416, "dur": 22, "args": { "External id": 122418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122418, "pid": 5, "tid": 7, "ts": 1716454223286416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209105, "dur": 12, "args": { "External id": 122418, "cbid": 211, "correlation": 122418 } }, { "ph": "s", "id": 122418, "pid": 76337, "tid": -914061504, "ts": 1716454223209105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223286439, "dur": 44, "args": { "External id": 122420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122420, "pid": 5, "tid": 7, "ts": 1716454223286439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209120, "dur": 5, "args": { "External id": 122420, "cbid": 211, "correlation": 122420 } }, { "ph": "s", "id": 122420, "pid": 76337, "tid": -914061504, "ts": 1716454223209120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223286485, "dur": 644, "args": { "External id": 122422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122422, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122422, "pid": 5, "tid": 7, "ts": 1716454223286485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209131, "dur": 6, "args": { "External id": 122422, "cbid": 211, "correlation": 122422 } }, { "ph": "s", "id": 122422, "pid": 76337, "tid": -914061504, "ts": 1716454223209131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223287130, "dur": 21, "args": { "External id": 122424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122424, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122424, "pid": 5, "tid": 7, "ts": 1716454223287130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209142, "dur": 5, "args": { "External id": 122424, "cbid": 211, "correlation": 122424 } }, { "ph": "s", "id": 122424, "pid": 76337, "tid": -914061504, "ts": 1716454223209142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223287152, "dur": 32, "args": { "External id": 122430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122430, "pid": 5, "tid": 7, "ts": 1716454223287152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209169, "dur": 9, "args": { "External id": 122430, "cbid": 211, "correlation": 122430 } }, { "ph": "s", "id": 122430, "pid": 76337, "tid": -914061504, "ts": 1716454223209169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223209227, "dur": 0, "args": { "External id": 122440, "cbid": 317, "correlation": 122440 } }, { "ph": "f", "id": 122440, "pid": 76337, "tid": -914061504, "ts": 1716454223209227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223209227, "dur": 0, "args": { "External id": 122441, "cbid": 203, "correlation": 122441 } }, { "ph": "f", "id": 122441, "pid": 76337, "tid": -914061504, "ts": 1716454223209227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223209228, "dur": 0, "args": { "External id": 122442, "cbid": 205, "correlation": 122442 } }, { "ph": "f", "id": 122442, "pid": 76337, "tid": -914061504, "ts": 1716454223209228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223287186, "dur": 31, "args": { "External id": 122446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122446, "pid": 5, "tid": 7, "ts": 1716454223287186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209242, "dur": 11, "args": { "External id": 122446, "cbid": 211, "correlation": 122446 } }, { "ph": "s", "id": 122446, "pid": 76337, "tid": -914061504, "ts": 1716454223209242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223287218, "dur": 152, "args": { "External id": 122448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122448, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122448, "pid": 5, "tid": 7, "ts": 1716454223287218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209260, "dur": 6, "args": { "External id": 122448, "cbid": 211, "correlation": 122448 } }, { "ph": "s", "id": 122448, "pid": 76337, "tid": -914061504, "ts": 1716454223209260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223287372, "dur": 23, "args": { "External id": 122450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122450, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122450, "pid": 5, "tid": 7, "ts": 1716454223287372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209270, "dur": 5, "args": { "External id": 122450, "cbid": 211, "correlation": 122450 } }, { "ph": "s", "id": 122450, "pid": 76337, "tid": -914061504, "ts": 1716454223209270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223287396, "dur": 32, "args": { "External id": 122456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122456, "pid": 5, "tid": 7, "ts": 1716454223287396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209296, "dur": 8, "args": { "External id": 122456, "cbid": 211, "correlation": 122456 } }, { "ph": "s", "id": 122456, "pid": 76337, "tid": -914061504, "ts": 1716454223209296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223287430, "dur": 27, "args": { "External id": 122464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122464, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122464, "pid": 5, "tid": 7, "ts": 1716454223287430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209324, "dur": 8, "args": { "External id": 122464, "cbid": 211, "correlation": 122464 } }, { "ph": "s", "id": 122464, "pid": 76337, "tid": -914061504, "ts": 1716454223209324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223287458, "dur": 20, "args": { "External id": 122472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122472, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122472, "pid": 5, "tid": 7, "ts": 1716454223287458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209352, "dur": 8, "args": { "External id": 122472, "cbid": 211, "correlation": 122472 } }, { "ph": "s", "id": 122472, "pid": 76337, "tid": -914061504, "ts": 1716454223209352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223287479, "dur": 29, "args": { "External id": 122492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122492, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 122492, "pid": 5, "tid": 7, "ts": 1716454223287479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209434, "dur": 13, "args": { "External id": 122492, "cbid": 211, "correlation": 122492 } }, { "ph": "s", "id": 122492, "pid": 76337, "tid": -914061504, "ts": 1716454223209434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223287510, "dur": 4, "args": { "External id": 122504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122504, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 122504, "pid": 5, "tid": 7, "ts": 1716454223287510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209457, "dur": 6, "args": { "External id": 122504, "cbid": 211, "correlation": 122504 } }, { "ph": "s", "id": 122504, "pid": 76337, "tid": -914061504, "ts": 1716454223209457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223287516, "dur": 32, "args": { "External id": 122507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122507, "pid": 5, "tid": 7, "ts": 1716454223287516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209474, "dur": 6, "args": { "External id": 122507, "cbid": 211, "correlation": 122507 } }, { "ph": "s", "id": 122507, "pid": 76337, "tid": -914061504, "ts": 1716454223209474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223209530, "dur": 0, "args": { "External id": 122518, "cbid": 317, "correlation": 122518 } }, { "ph": "f", "id": 122518, "pid": 76337, "tid": -914061504, "ts": 1716454223209530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223209531, "dur": 0, "args": { "External id": 122519, "cbid": 203, "correlation": 122519 } }, { "ph": "f", "id": 122519, "pid": 76337, "tid": -914061504, "ts": 1716454223209531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223209531, "dur": 0, "args": { "External id": 122520, "cbid": 205, "correlation": 122520 } }, { "ph": "f", "id": 122520, "pid": 76337, "tid": -914061504, "ts": 1716454223209531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223287549, "dur": 21, "args": { "External id": 122524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122524, "pid": 5, "tid": 7, "ts": 1716454223287549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209545, "dur": 11, "args": { "External id": 122524, "cbid": 211, "correlation": 122524 } }, { "ph": "s", "id": 122524, "pid": 76337, "tid": -914061504, "ts": 1716454223209545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223287572, "dur": 105, "args": { "External id": 122526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122526, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122526, "pid": 5, "tid": 7, "ts": 1716454223287572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209562, "dur": 6, "args": { "External id": 122526, "cbid": 211, "correlation": 122526 } }, { "ph": "s", "id": 122526, "pid": 76337, "tid": -914061504, "ts": 1716454223209562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223287678, "dur": 22, "args": { "External id": 122528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122528, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122528, "pid": 5, "tid": 7, "ts": 1716454223287678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209571, "dur": 5, "args": { "External id": 122528, "cbid": 211, "correlation": 122528 } }, { "ph": "s", "id": 122528, "pid": 76337, "tid": -914061504, "ts": 1716454223209571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223287701, "dur": 33, "args": { "External id": 122534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122534, "pid": 5, "tid": 7, "ts": 1716454223287701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209599, "dur": 8, "args": { "External id": 122534, "cbid": 211, "correlation": 122534 } }, { "ph": "s", "id": 122534, "pid": 76337, "tid": -914061504, "ts": 1716454223209599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223287735, "dur": 194, "args": { "External id": 122543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122543, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122543, "pid": 5, "tid": 7, "ts": 1716454223287735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209679, "dur": 14, "args": { "External id": 122543, "cbid": 211, "correlation": 122543 } }, { "ph": "s", "id": 122543, "pid": 76337, "tid": -914061504, "ts": 1716454223209679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223287930, "dur": 65, "args": { "External id": 122565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122565, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122565, "pid": 5, "tid": 7, "ts": 1716454223287930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209736, "dur": 11, "args": { "External id": 122565, "cbid": 211, "correlation": 122565 } }, { "ph": "s", "id": 122565, "pid": 76337, "tid": -914061504, "ts": 1716454223209736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223209822, "dur": 1, "args": { "External id": 122576, "cbid": 251, "correlation": 122576 } }, { "ph": "f", "id": 122576, "pid": 76337, "tid": -914061504, "ts": 1716454223209822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223287997, "dur": 150, "args": { "External id": 122577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122577, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122577, "pid": 5, "tid": 7, "ts": 1716454223287997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209827, "dur": 13, "args": { "External id": 122577, "cbid": 211, "correlation": 122577 } }, { "ph": "s", "id": 122577, "pid": 76337, "tid": -914061504, "ts": 1716454223209827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223209898, "dur": 1, "args": { "External id": 122588, "cbid": 251, "correlation": 122588 } }, { "ph": "f", "id": 122588, "pid": 76337, "tid": -914061504, "ts": 1716454223209898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223288149, "dur": 145, "args": { "External id": 122589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122589, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122589, "pid": 5, "tid": 7, "ts": 1716454223288149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209902, "dur": 11, "args": { "External id": 122589, "cbid": 211, "correlation": 122589 } }, { "ph": "s", "id": 122589, "pid": 76337, "tid": -914061504, "ts": 1716454223209902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223209967, "dur": 1, "args": { "External id": 122600, "cbid": 251, "correlation": 122600 } }, { "ph": "f", "id": 122600, "pid": 76337, "tid": -914061504, "ts": 1716454223209967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223288295, "dur": 145, "args": { "External id": 122601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122601, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122601, "pid": 5, "tid": 7, "ts": 1716454223288295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223209971, "dur": 19, "args": { "External id": 122601, "cbid": 211, "correlation": 122601 } }, { "ph": "s", "id": 122601, "pid": 76337, "tid": -914061504, "ts": 1716454223209971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223288442, "dur": 1943, "args": { "External id": 122622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122622, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 122622, "pid": 5, "tid": 7, "ts": 1716454223288442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210061, "dur": 12, "args": { "External id": 122622, "cbid": 211, "correlation": 122622 } }, { "ph": "s", "id": 122622, "pid": 76337, "tid": -914061504, "ts": 1716454223210061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210158, "dur": 1, "args": { "External id": 122640, "cbid": 251, "correlation": 122640 } }, { "ph": "f", "id": 122640, "pid": 76337, "tid": -914061504, "ts": 1716454223210158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223290386, "dur": 148, "args": { "External id": 122642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122642, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 122642, "pid": 5, "tid": 7, "ts": 1716454223290386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210164, "dur": 13, "args": { "External id": 122642, "cbid": 211, "correlation": 122642 } }, { "ph": "s", "id": 122642, "pid": 76337, "tid": -914061504, "ts": 1716454223210164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223290535, "dur": 36, "args": { "External id": 122650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122650, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122650, "pid": 5, "tid": 7, "ts": 1716454223290535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210234, "dur": 13, "args": { "External id": 122650, "cbid": 211, "correlation": 122650 } }, { "ph": "s", "id": 122650, "pid": 76337, "tid": -914061504, "ts": 1716454223210234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223290573, "dur": 51, "args": { "External id": 122658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122658, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122658, "pid": 5, "tid": 7, "ts": 1716454223290573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210273, "dur": 9, "args": { "External id": 122658, "cbid": 211, "correlation": 122658 } }, { "ph": "s", "id": 122658, "pid": 76337, "tid": -914061504, "ts": 1716454223210273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223290625, "dur": 30, "args": { "External id": 122669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122669, "pid": 5, "tid": 7, "ts": 1716454223290625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210345, "dur": 12, "args": { "External id": 122669, "cbid": 211, "correlation": 122669 } }, { "ph": "s", "id": 122669, "pid": 76337, "tid": -914061504, "ts": 1716454223210345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223290657, "dur": 34, "args": { "External id": 122691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122691, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122691, "pid": 5, "tid": 7, "ts": 1716454223290657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210375, "dur": 8, "args": { "External id": 122691, "cbid": 211, "correlation": 122691 } }, { "ph": "s", "id": 122691, "pid": 76337, "tid": -914061504, "ts": 1716454223210375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210459, "dur": 1, "args": { "External id": 122702, "cbid": 251, "correlation": 122702 } }, { "ph": "f", "id": 122702, "pid": 76337, "tid": -914061504, "ts": 1716454223210459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223290692, "dur": 90, "args": { "External id": 122703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122703, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122703, "pid": 5, "tid": 7, "ts": 1716454223290692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210464, "dur": 13, "args": { "External id": 122703, "cbid": 211, "correlation": 122703 } }, { "ph": "s", "id": 122703, "pid": 76337, "tid": -914061504, "ts": 1716454223210464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210534, "dur": 1, "args": { "External id": 122714, "cbid": 251, "correlation": 122714 } }, { "ph": "f", "id": 122714, "pid": 76337, "tid": -914061504, "ts": 1716454223210534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210537, "dur": 0, "args": { "External id": 122715, "cbid": 251, "correlation": 122715 } }, { "ph": "f", "id": 122715, "pid": 76337, "tid": -914061504, "ts": 1716454223210537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223290784, "dur": 11, "args": { "External id": 122716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122716, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 122716, "pid": 5, "tid": 7, "ts": 1716454223290784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210539, "dur": 12, "args": { "External id": 122716, "cbid": 211, "correlation": 122716 } }, { "ph": "s", "id": 122716, "pid": 76337, "tid": -914061504, "ts": 1716454223210539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223290796, "dur": 5, "args": { "External id": 122718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122718, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 122718, "pid": 5, "tid": 7, "ts": 1716454223290796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210553, "dur": 6, "args": { "External id": 122718, "cbid": 211, "correlation": 122718 } }, { "ph": "s", "id": 122718, "pid": 76337, "tid": -914061504, "ts": 1716454223210553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210610, "dur": 1, "args": { "External id": 122729, "cbid": 251, "correlation": 122729 } }, { "ph": "f", "id": 122729, "pid": 76337, "tid": -914061504, "ts": 1716454223210610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210613, "dur": 0, "args": { "External id": 122730, "cbid": 251, "correlation": 122730 } }, { "ph": "f", "id": 122730, "pid": 76337, "tid": -914061504, "ts": 1716454223210613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223290802, "dur": 8, "args": { "External id": 122731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122731, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 122731, "pid": 5, "tid": 7, "ts": 1716454223290802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210615, "dur": 11, "args": { "External id": 122731, "cbid": 211, "correlation": 122731 } }, { "ph": "s", "id": 122731, "pid": 76337, "tid": -914061504, "ts": 1716454223210615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223290811, "dur": 3, "args": { "External id": 122733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122733, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 122733, "pid": 5, "tid": 7, "ts": 1716454223290811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210628, "dur": 5, "args": { "External id": 122733, "cbid": 211, "correlation": 122733 } }, { "ph": "s", "id": 122733, "pid": 76337, "tid": -914061504, "ts": 1716454223210628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223290816, "dur": 91, "args": { "External id": 122754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122754, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 122754, "pid": 5, "tid": 7, "ts": 1716454223290816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210702, "dur": 13, "args": { "External id": 122754, "cbid": 211, "correlation": 122754 } }, { "ph": "s", "id": 122754, "pid": 76337, "tid": -914061504, "ts": 1716454223210702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223210799, "dur": 1, "args": { "External id": 122772, "cbid": 251, "correlation": 122772 } }, { "ph": "f", "id": 122772, "pid": 76337, "tid": -914061504, "ts": 1716454223210799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223290908, "dur": 85, "args": { "External id": 122774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122774, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122774, "pid": 5, "tid": 7, "ts": 1716454223290908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210805, "dur": 13, "args": { "External id": 122774, "cbid": 211, "correlation": 122774 } }, { "ph": "s", "id": 122774, "pid": 76337, "tid": -914061504, "ts": 1716454223210805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223290994, "dur": 19, "args": { "External id": 122782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122782, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122782, "pid": 5, "tid": 7, "ts": 1716454223290994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210874, "dur": 12, "args": { "External id": 122782, "cbid": 211, "correlation": 122782 } }, { "ph": "s", "id": 122782, "pid": 76337, "tid": -914061504, "ts": 1716454223210874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223291015, "dur": 38, "args": { "External id": 122790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122790, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122790, "pid": 5, "tid": 7, "ts": 1716454223291015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210916, "dur": 9, "args": { "External id": 122790, "cbid": 211, "correlation": 122790 } }, { "ph": "s", "id": 122790, "pid": 76337, "tid": -914061504, "ts": 1716454223210916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223291054, "dur": 34, "args": { "External id": 122812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122812, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122812, "pid": 5, "tid": 7, "ts": 1716454223291054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223210967, "dur": 17, "args": { "External id": 122812, "cbid": 211, "correlation": 122812 } }, { "ph": "s", "id": 122812, "pid": 76337, "tid": -914061504, "ts": 1716454223210967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223211064, "dur": 1, "args": { "External id": 122828, "cbid": 251, "correlation": 122828 } }, { "ph": "f", "id": 122828, "pid": 76337, "tid": -914061504, "ts": 1716454223211064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223211069, "dur": 0, "args": { "External id": 122830, "cbid": 251, "correlation": 122830 } }, { "ph": "f", "id": 122830, "pid": 76337, "tid": -914061504, "ts": 1716454223211069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223291090, "dur": 540, "args": { "External id": 122831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122831, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 122831, "pid": 5, "tid": 7, "ts": 1716454223291090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211072, "dur": 13, "args": { "External id": 122831, "cbid": 211, "correlation": 122831 } }, { "ph": "s", "id": 122831, "pid": 76337, "tid": -914061504, "ts": 1716454223211072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223291631, "dur": 126, "args": { "External id": 122839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122839, "pid": 5, "tid": 7, "ts": 1716454223291631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211137, "dur": 12, "args": { "External id": 122839, "cbid": 211, "correlation": 122839 } }, { "ph": "s", "id": 122839, "pid": 76337, "tid": -914061504, "ts": 1716454223211137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223291758, "dur": 129, "args": { "External id": 122847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122847, "pid": 5, "tid": 7, "ts": 1716454223291758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211167, "dur": 8, "args": { "External id": 122847, "cbid": 211, "correlation": 122847 } }, { "ph": "s", "id": 122847, "pid": 76337, "tid": -914061504, "ts": 1716454223211167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223211243, "dur": 1, "args": { "External id": 122863, "cbid": 251, "correlation": 122863 } }, { "ph": "f", "id": 122863, "pid": 76337, "tid": -914061504, "ts": 1716454223211243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223291889, "dur": 308, "args": { "External id": 122865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122865, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122865, "pid": 5, "tid": 7, "ts": 1716454223291889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211249, "dur": 13, "args": { "External id": 122865, "cbid": 211, "correlation": 122865 } }, { "ph": "s", "id": 122865, "pid": 76337, "tid": -914061504, "ts": 1716454223211249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223292198, "dur": 27, "args": { "External id": 122873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122873, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122873, "pid": 5, "tid": 7, "ts": 1716454223292198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211291, "dur": 10, "args": { "External id": 122873, "cbid": 211, "correlation": 122873 } }, { "ph": "s", "id": 122873, "pid": 76337, "tid": -914061504, "ts": 1716454223211291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223292227, "dur": 81, "args": { "External id": 122884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122884, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122884, "pid": 5, "tid": 7, "ts": 1716454223292227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211357, "dur": 12, "args": { "External id": 122884, "cbid": 211, "correlation": 122884 } }, { "ph": "s", "id": 122884, "pid": 76337, "tid": -914061504, "ts": 1716454223211357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223211421, "dur": 0, "args": { "External id": 122896, "cbid": 317, "correlation": 122896 } }, { "ph": "f", "id": 122896, "pid": 76337, "tid": -914061504, "ts": 1716454223211421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223211421, "dur": 0, "args": { "External id": 122897, "cbid": 203, "correlation": 122897 } }, { "ph": "f", "id": 122897, "pid": 76337, "tid": -914061504, "ts": 1716454223211421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223211422, "dur": 0, "args": { "External id": 122898, "cbid": 205, "correlation": 122898 } }, { "ph": "f", "id": 122898, "pid": 76337, "tid": -914061504, "ts": 1716454223211422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223292309, "dur": 22, "args": { "External id": 122902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122902, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122902, "pid": 5, "tid": 7, "ts": 1716454223292309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211437, "dur": 12, "args": { "External id": 122902, "cbid": 211, "correlation": 122902 } }, { "ph": "s", "id": 122902, "pid": 76337, "tid": -914061504, "ts": 1716454223211437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223292333, "dur": 121, "args": { "External id": 122904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122904, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122904, "pid": 5, "tid": 7, "ts": 1716454223292333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211456, "dur": 7, "args": { "External id": 122904, "cbid": 211, "correlation": 122904 } }, { "ph": "s", "id": 122904, "pid": 76337, "tid": -914061504, "ts": 1716454223211456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223292456, "dur": 23, "args": { "External id": 122906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122906, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122906, "pid": 5, "tid": 7, "ts": 1716454223292456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211467, "dur": 5, "args": { "External id": 122906, "cbid": 211, "correlation": 122906 } }, { "ph": "s", "id": 122906, "pid": 76337, "tid": -914061504, "ts": 1716454223211467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223292480, "dur": 33, "args": { "External id": 122912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122912, "pid": 5, "tid": 7, "ts": 1716454223292480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211494, "dur": 8, "args": { "External id": 122912, "cbid": 211, "correlation": 122912 } }, { "ph": "s", "id": 122912, "pid": 76337, "tid": -914061504, "ts": 1716454223211494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223292514, "dur": 26, "args": { "External id": 122920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122920, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122920, "pid": 5, "tid": 7, "ts": 1716454223292514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211525, "dur": 8, "args": { "External id": 122920, "cbid": 211, "correlation": 122920 } }, { "ph": "s", "id": 122920, "pid": 76337, "tid": -914061504, "ts": 1716454223211525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223292542, "dur": 102, "args": { "External id": 122931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122931, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122931, "pid": 5, "tid": 7, "ts": 1716454223292542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211586, "dur": 11, "args": { "External id": 122931, "cbid": 211, "correlation": 122931 } }, { "ph": "s", "id": 122931, "pid": 76337, "tid": -914061504, "ts": 1716454223211586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223211641, "dur": 0, "args": { "External id": 122941, "cbid": 317, "correlation": 122941 } }, { "ph": "f", "id": 122941, "pid": 76337, "tid": -914061504, "ts": 1716454223211641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223211641, "dur": 0, "args": { "External id": 122942, "cbid": 203, "correlation": 122942 } }, { "ph": "f", "id": 122942, "pid": 76337, "tid": -914061504, "ts": 1716454223211641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223211642, "dur": 0, "args": { "External id": 122943, "cbid": 205, "correlation": 122943 } }, { "ph": "f", "id": 122943, "pid": 76337, "tid": -914061504, "ts": 1716454223211642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223292645, "dur": 76, "args": { "External id": 122947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122947, "pid": 5, "tid": 7, "ts": 1716454223292645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211656, "dur": 13, "args": { "External id": 122947, "cbid": 211, "correlation": 122947 } }, { "ph": "s", "id": 122947, "pid": 76337, "tid": -914061504, "ts": 1716454223211656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223292722, "dur": 44, "args": { "External id": 122949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122949, "pid": 5, "tid": 7, "ts": 1716454223292722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211671, "dur": 5, "args": { "External id": 122949, "cbid": 211, "correlation": 122949 } }, { "ph": "s", "id": 122949, "pid": 76337, "tid": -914061504, "ts": 1716454223211671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223292767, "dur": 4, "args": { "External id": 122951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122951, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 122951, "pid": 5, "tid": 7, "ts": 1716454223292767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211681, "dur": 6, "args": { "External id": 122951, "cbid": 211, "correlation": 122951 } }, { "ph": "s", "id": 122951, "pid": 76337, "tid": -914061504, "ts": 1716454223211681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223211689, "dur": 0, "args": { "External id": 122952, "cbid": 51, "correlation": 122952 } }, { "ph": "s", "id": 122952, "pid": 76337, "tid": -914061504, "ts": 1716454223211689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223292773, "dur": 2225, "args": { "External id": 122953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122953, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 122953, "pid": 5, "tid": 7, "ts": 1716454223292773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211690, "dur": 5, "args": { "External id": 122953, "cbid": 211, "correlation": 122953 } }, { "ph": "s", "id": 122953, "pid": 76337, "tid": -914061504, "ts": 1716454223211690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223294999, "dur": 113, "args": { "External id": 122958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122958, "pid": 5, "tid": 7, "ts": 1716454223294999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211718, "dur": 8, "args": { "External id": 122958, "cbid": 211, "correlation": 122958 } }, { "ph": "s", "id": 122958, "pid": 76337, "tid": -914061504, "ts": 1716454223211718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223295114, "dur": 165, "args": { "External id": 122967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122967, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 122967, "pid": 5, "tid": 7, "ts": 1716454223295114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211808, "dur": 13, "args": { "External id": 122967, "cbid": 211, "correlation": 122967 } }, { "ph": "s", "id": 122967, "pid": 76337, "tid": -914061504, "ts": 1716454223211808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223295281, "dur": 128, "args": { "External id": 122987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122987, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 122987, "pid": 5, "tid": 7, "ts": 1716454223295281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211879, "dur": 11, "args": { "External id": 122987, "cbid": 211, "correlation": 122987 } }, { "ph": "s", "id": 122987, "pid": 76337, "tid": -914061504, "ts": 1716454223211879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223295410, "dur": 4, "args": { "External id": 122999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 122999, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 122999, "pid": 5, "tid": 7, "ts": 1716454223295410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211899, "dur": 6, "args": { "External id": 122999, "cbid": 211, "correlation": 122999 } }, { "ph": "s", "id": 122999, "pid": 76337, "tid": -914061504, "ts": 1716454223211899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223295416, "dur": 160, "args": { "External id": 123002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123002, "pid": 5, "tid": 7, "ts": 1716454223295416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211919, "dur": 6, "args": { "External id": 123002, "cbid": 211, "correlation": 123002 } }, { "ph": "s", "id": 123002, "pid": 76337, "tid": -914061504, "ts": 1716454223211919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223295578, "dur": 101, "args": { "External id": 123011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123011, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123011, "pid": 5, "tid": 7, "ts": 1716454223295578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223211958, "dur": 11, "args": { "External id": 123011, "cbid": 211, "correlation": 123011 } }, { "ph": "s", "id": 123011, "pid": 76337, "tid": -914061504, "ts": 1716454223211958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223212021, "dur": 0, "args": { "External id": 123021, "cbid": 317, "correlation": 123021 } }, { "ph": "f", "id": 123021, "pid": 76337, "tid": -914061504, "ts": 1716454223212021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223212022, "dur": 0, "args": { "External id": 123022, "cbid": 203, "correlation": 123022 } }, { "ph": "f", "id": 123022, "pid": 76337, "tid": -914061504, "ts": 1716454223212022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223212023, "dur": 0, "args": { "External id": 123023, "cbid": 205, "correlation": 123023 } }, { "ph": "f", "id": 123023, "pid": 76337, "tid": -914061504, "ts": 1716454223212023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223295680, "dur": 111, "args": { "External id": 123027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123027, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123027, "pid": 5, "tid": 7, "ts": 1716454223295680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212039, "dur": 12, "args": { "External id": 123027, "cbid": 211, "correlation": 123027 } }, { "ph": "s", "id": 123027, "pid": 76337, "tid": -914061504, "ts": 1716454223212039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223295792, "dur": 34, "args": { "External id": 123029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123029, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123029, "pid": 5, "tid": 7, "ts": 1716454223295792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212053, "dur": 5, "args": { "External id": 123029, "cbid": 211, "correlation": 123029 } }, { "ph": "s", "id": 123029, "pid": 76337, "tid": -914061504, "ts": 1716454223212053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223295827, "dur": 4, "args": { "External id": 123031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123031, "pid": 5, "tid": 7, "ts": 1716454223295827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212064, "dur": 6, "args": { "External id": 123031, "cbid": 211, "correlation": 123031 } }, { "ph": "s", "id": 123031, "pid": 76337, "tid": -914061504, "ts": 1716454223212064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223212072, "dur": 0, "args": { "External id": 123032, "cbid": 51, "correlation": 123032 } }, { "ph": "s", "id": 123032, "pid": 76337, "tid": -914061504, "ts": 1716454223212072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223295832, "dur": 2007, "args": { "External id": 123033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123033, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123033, "pid": 5, "tid": 7, "ts": 1716454223295832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212073, "dur": 6, "args": { "External id": 123033, "cbid": 211, "correlation": 123033 } }, { "ph": "s", "id": 123033, "pid": 76337, "tid": -914061504, "ts": 1716454223212073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223297840, "dur": 59, "args": { "External id": 123038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123038, "pid": 5, "tid": 7, "ts": 1716454223297840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212102, "dur": 10, "args": { "External id": 123038, "cbid": 211, "correlation": 123038 } }, { "ph": "s", "id": 123038, "pid": 76337, "tid": -914061504, "ts": 1716454223212102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223297901, "dur": 4, "args": { "External id": 123046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123046, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123046, "pid": 5, "tid": 7, "ts": 1716454223297901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212146, "dur": 9, "args": { "External id": 123046, "cbid": 211, "correlation": 123046 } }, { "ph": "s", "id": 123046, "pid": 76337, "tid": -914061504, "ts": 1716454223212146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212213, "dur": 2, "args": { "External id": 123062, "cbid": 251, "correlation": 123062 } }, { "ph": "f", "id": 123062, "pid": 76337, "tid": -914061504, "ts": 1716454223212213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212219, "dur": 0, "args": { "External id": 123064, "cbid": 251, "correlation": 123064 } }, { "ph": "f", "id": 123064, "pid": 76337, "tid": -914061504, "ts": 1716454223212219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223297906, "dur": 11, "args": { "External id": 123065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123065, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 123065, "pid": 5, "tid": 7, "ts": 1716454223297906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212221, "dur": 11, "args": { "External id": 123065, "cbid": 211, "correlation": 123065 } }, { "ph": "s", "id": 123065, "pid": 76337, "tid": -914061504, "ts": 1716454223212221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223297919, "dur": 5, "args": { "External id": 123067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123067, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 123067, "pid": 5, "tid": 7, "ts": 1716454223297919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212234, "dur": 5, "args": { "External id": 123067, "cbid": 211, "correlation": 123067 } }, { "ph": "s", "id": 123067, "pid": 76337, "tid": -914061504, "ts": 1716454223212234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223297925, "dur": 54, "args": { "External id": 123077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123077, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123077, "pid": 5, "tid": 7, "ts": 1716454223297925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212291, "dur": 12, "args": { "External id": 123077, "cbid": 211, "correlation": 123077 } }, { "ph": "s", "id": 123077, "pid": 76337, "tid": -914061504, "ts": 1716454223212291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223297981, "dur": 50, "args": { "External id": 123097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123097, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 123097, "pid": 5, "tid": 7, "ts": 1716454223297981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212358, "dur": 11, "args": { "External id": 123097, "cbid": 211, "correlation": 123097 } }, { "ph": "s", "id": 123097, "pid": 76337, "tid": -914061504, "ts": 1716454223212358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223298033, "dur": 4, "args": { "External id": 123109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123109, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123109, "pid": 5, "tid": 7, "ts": 1716454223298033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212380, "dur": 6, "args": { "External id": 123109, "cbid": 211, "correlation": 123109 } }, { "ph": "s", "id": 123109, "pid": 76337, "tid": -914061504, "ts": 1716454223212380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223298038, "dur": 56, "args": { "External id": 123112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123112, "pid": 5, "tid": 7, "ts": 1716454223298038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212398, "dur": 6, "args": { "External id": 123112, "cbid": 211, "correlation": 123112 } }, { "ph": "s", "id": 123112, "pid": 76337, "tid": -914061504, "ts": 1716454223212398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223298095, "dur": 37, "args": { "External id": 123121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123121, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123121, "pid": 5, "tid": 7, "ts": 1716454223298095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212438, "dur": 10, "args": { "External id": 123121, "cbid": 211, "correlation": 123121 } }, { "ph": "s", "id": 123121, "pid": 76337, "tid": -914061504, "ts": 1716454223212438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223212502, "dur": 0, "args": { "External id": 123131, "cbid": 317, "correlation": 123131 } }, { "ph": "f", "id": 123131, "pid": 76337, "tid": -914061504, "ts": 1716454223212502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223212503, "dur": 0, "args": { "External id": 123132, "cbid": 203, "correlation": 123132 } }, { "ph": "f", "id": 123132, "pid": 76337, "tid": -914061504, "ts": 1716454223212503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223212503, "dur": 0, "args": { "External id": 123133, "cbid": 205, "correlation": 123133 } }, { "ph": "f", "id": 123133, "pid": 76337, "tid": -914061504, "ts": 1716454223212503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223298133, "dur": 41, "args": { "External id": 123137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123137, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123137, "pid": 5, "tid": 7, "ts": 1716454223298133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212520, "dur": 12, "args": { "External id": 123137, "cbid": 211, "correlation": 123137 } }, { "ph": "s", "id": 123137, "pid": 76337, "tid": -914061504, "ts": 1716454223212520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223298176, "dur": 14, "args": { "External id": 123139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123139, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123139, "pid": 5, "tid": 7, "ts": 1716454223298176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212534, "dur": 5, "args": { "External id": 123139, "cbid": 211, "correlation": 123139 } }, { "ph": "s", "id": 123139, "pid": 76337, "tid": -914061504, "ts": 1716454223212534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223298191, "dur": 3, "args": { "External id": 123141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123141, "pid": 5, "tid": 7, "ts": 1716454223298191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212543, "dur": 5, "args": { "External id": 123141, "cbid": 211, "correlation": 123141 } }, { "ph": "s", "id": 123141, "pid": 76337, "tid": -914061504, "ts": 1716454223212543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223212551, "dur": 0, "args": { "External id": 123142, "cbid": 51, "correlation": 123142 } }, { "ph": "s", "id": 123142, "pid": 76337, "tid": -914061504, "ts": 1716454223212551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223298196, "dur": 700, "args": { "External id": 123143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123143, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123143, "pid": 5, "tid": 7, "ts": 1716454223298196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212552, "dur": 5, "args": { "External id": 123143, "cbid": 211, "correlation": 123143 } }, { "ph": "s", "id": 123143, "pid": 76337, "tid": -914061504, "ts": 1716454223212552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223298897, "dur": 60, "args": { "External id": 123148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123148, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123148, "pid": 5, "tid": 7, "ts": 1716454223298897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212580, "dur": 9, "args": { "External id": 123148, "cbid": 211, "correlation": 123148 } }, { "ph": "s", "id": 123148, "pid": 76337, "tid": -914061504, "ts": 1716454223212580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223212638, "dur": 0, "args": { "External id": 123158, "cbid": 317, "correlation": 123158 } }, { "ph": "f", "id": 123158, "pid": 76337, "tid": -914061504, "ts": 1716454223212638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223212639, "dur": 0, "args": { "External id": 123159, "cbid": 203, "correlation": 123159 } }, { "ph": "f", "id": 123159, "pid": 76337, "tid": -914061504, "ts": 1716454223212639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223212639, "dur": 0, "args": { "External id": 123160, "cbid": 205, "correlation": 123160 } }, { "ph": "f", "id": 123160, "pid": 76337, "tid": -914061504, "ts": 1716454223212639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223298958, "dur": 3, "args": { "External id": 123164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123164, "pid": 5, "tid": 7, "ts": 1716454223298958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212655, "dur": 13, "args": { "External id": 123164, "cbid": 211, "correlation": 123164 } }, { "ph": "s", "id": 123164, "pid": 76337, "tid": -914061504, "ts": 1716454223212655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223212672, "dur": 0, "args": { "External id": 123165, "cbid": 51, "correlation": 123165 } }, { "ph": "s", "id": 123165, "pid": 76337, "tid": -914061504, "ts": 1716454223212672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454223298963, "dur": 266, "args": { "External id": 123166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123166, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123166, "pid": 5, "tid": 7, "ts": 1716454223298963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212673, "dur": 7, "args": { "External id": 123166, "cbid": 211, "correlation": 123166 } }, { "ph": "s", "id": 123166, "pid": 76337, "tid": -914061504, "ts": 1716454223212673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223299230, "dur": 60, "args": { "External id": 123171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123171, "pid": 5, "tid": 7, "ts": 1716454223299230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212700, "dur": 8, "args": { "External id": 123171, "cbid": 211, "correlation": 123171 } }, { "ph": "s", "id": 123171, "pid": 76337, "tid": -914061504, "ts": 1716454223212700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223299291, "dur": 50, "args": { "External id": 123179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123179, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123179, "pid": 5, "tid": 7, "ts": 1716454223299291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212729, "dur": 8, "args": { "External id": 123179, "cbid": 211, "correlation": 123179 } }, { "ph": "s", "id": 123179, "pid": 76337, "tid": -914061504, "ts": 1716454223212729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223299342, "dur": 35, "args": { "External id": 123187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123187, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123187, "pid": 5, "tid": 7, "ts": 1716454223299342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212756, "dur": 9, "args": { "External id": 123187, "cbid": 211, "correlation": 123187 } }, { "ph": "s", "id": 123187, "pid": 76337, "tid": -914061504, "ts": 1716454223212756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223299378, "dur": 51, "args": { "External id": 123207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123207, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 123207, "pid": 5, "tid": 7, "ts": 1716454223299378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212838, "dur": 12, "args": { "External id": 123207, "cbid": 211, "correlation": 123207 } }, { "ph": "s", "id": 123207, "pid": 76337, "tid": -914061504, "ts": 1716454223212838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223299431, "dur": 4, "args": { "External id": 123219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123219, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123219, "pid": 5, "tid": 7, "ts": 1716454223299431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212860, "dur": 6, "args": { "External id": 123219, "cbid": 211, "correlation": 123219 } }, { "ph": "s", "id": 123219, "pid": 76337, "tid": -914061504, "ts": 1716454223212860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223299437, "dur": 54, "args": { "External id": 123222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123222, "pid": 5, "tid": 7, "ts": 1716454223299437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212877, "dur": 7, "args": { "External id": 123222, "cbid": 211, "correlation": 123222 } }, { "ph": "s", "id": 123222, "pid": 76337, "tid": -914061504, "ts": 1716454223212877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223212935, "dur": 0, "args": { "External id": 123233, "cbid": 317, "correlation": 123233 } }, { "ph": "f", "id": 123233, "pid": 76337, "tid": -914061504, "ts": 1716454223212935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223212935, "dur": 0, "args": { "External id": 123234, "cbid": 203, "correlation": 123234 } }, { "ph": "f", "id": 123234, "pid": 76337, "tid": -914061504, "ts": 1716454223212935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223212936, "dur": 0, "args": { "External id": 123235, "cbid": 205, "correlation": 123235 } }, { "ph": "f", "id": 123235, "pid": 76337, "tid": -914061504, "ts": 1716454223212936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212966, "dur": 2, "args": { "External id": 123239, "cbid": 251, "correlation": 123239 } }, { "ph": "f", "id": 123239, "pid": 76337, "tid": -914061504, "ts": 1716454223212966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212969, "dur": 1, "args": { "External id": 123240, "cbid": 251, "correlation": 123240 } }, { "ph": "f", "id": 123240, "pid": 76337, "tid": -914061504, "ts": 1716454223212969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212971, "dur": 1, "args": { "External id": 123241, "cbid": 251, "correlation": 123241 } }, { "ph": "f", "id": 123241, "pid": 76337, "tid": -914061504, "ts": 1716454223212971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212973, "dur": 8, "args": { "External id": 123242, "cbid": 251, "correlation": 123242 } }, { "ph": "f", "id": 123242, "pid": 76337, "tid": -914061504, "ts": 1716454223212973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212982, "dur": 0, "args": { "External id": 123243, "cbid": 251, "correlation": 123243 } }, { "ph": "f", "id": 123243, "pid": 76337, "tid": -914061504, "ts": 1716454223212982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212983, "dur": 1, "args": { "External id": 123244, "cbid": 251, "correlation": 123244 } }, { "ph": "f", "id": 123244, "pid": 76337, "tid": -914061504, "ts": 1716454223212983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212985, "dur": 1, "args": { "External id": 123245, "cbid": 251, "correlation": 123245 } }, { "ph": "f", "id": 123245, "pid": 76337, "tid": -914061504, "ts": 1716454223212985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212986, "dur": 0, "args": { "External id": 123246, "cbid": 251, "correlation": 123246 } }, { "ph": "f", "id": 123246, "pid": 76337, "tid": -914061504, "ts": 1716454223212986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223212988, "dur": 0, "args": { "External id": 123247, "cbid": 251, "correlation": 123247 } }, { "ph": "f", "id": 123247, "pid": 76337, "tid": -914061504, "ts": 1716454223212988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223299492, "dur": 116, "args": { "External id": 123248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123248, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123248, "pid": 5, "tid": 7, "ts": 1716454223299492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223212991, "dur": 13, "args": { "External id": 123248, "cbid": 211, "correlation": 123248 } }, { "ph": "s", "id": 123248, "pid": 76337, "tid": -914061504, "ts": 1716454223212991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223299610, "dur": 60, "args": { "External id": 123254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123254, "pid": 5, "tid": 7, "ts": 1716454223299610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213028, "dur": 9, "args": { "External id": 123254, "cbid": 211, "correlation": 123254 } }, { "ph": "s", "id": 123254, "pid": 76337, "tid": -914061504, "ts": 1716454223213028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223299671, "dur": 532, "args": { "External id": 123263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123263, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123263, "pid": 5, "tid": 7, "ts": 1716454223299671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213109, "dur": 14, "args": { "External id": 123263, "cbid": 211, "correlation": 123263 } }, { "ph": "s", "id": 123263, "pid": 76337, "tid": -914061504, "ts": 1716454223213109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223300204, "dur": 181, "args": { "External id": 123285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123285, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123285, "pid": 5, "tid": 7, "ts": 1716454223300204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213168, "dur": 10, "args": { "External id": 123285, "cbid": 211, "correlation": 123285 } }, { "ph": "s", "id": 123285, "pid": 76337, "tid": -914061504, "ts": 1716454223213168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213255, "dur": 2, "args": { "External id": 123296, "cbid": 251, "correlation": 123296 } }, { "ph": "f", "id": 123296, "pid": 76337, "tid": -914061504, "ts": 1716454223213255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223300386, "dur": 200, "args": { "External id": 123297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123297, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123297, "pid": 5, "tid": 7, "ts": 1716454223300386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213260, "dur": 14, "args": { "External id": 123297, "cbid": 211, "correlation": 123297 } }, { "ph": "s", "id": 123297, "pid": 76337, "tid": -914061504, "ts": 1716454223213260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213330, "dur": 1, "args": { "External id": 123308, "cbid": 251, "correlation": 123308 } }, { "ph": "f", "id": 123308, "pid": 76337, "tid": -914061504, "ts": 1716454223213330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223300587, "dur": 185, "args": { "External id": 123309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123309, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123309, "pid": 5, "tid": 7, "ts": 1716454223300587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213334, "dur": 11, "args": { "External id": 123309, "cbid": 211, "correlation": 123309 } }, { "ph": "s", "id": 123309, "pid": 76337, "tid": -914061504, "ts": 1716454223213334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213397, "dur": 1, "args": { "External id": 123320, "cbid": 251, "correlation": 123320 } }, { "ph": "f", "id": 123320, "pid": 76337, "tid": -914061504, "ts": 1716454223213397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223300774, "dur": 183, "args": { "External id": 123321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123321, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123321, "pid": 5, "tid": 7, "ts": 1716454223300774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213401, "dur": 12, "args": { "External id": 123321, "cbid": 211, "correlation": 123321 } }, { "ph": "s", "id": 123321, "pid": 76337, "tid": -914061504, "ts": 1716454223213401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223300959, "dur": 18570, "args": { "External id": 123342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123342, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123342, "pid": 5, "tid": 7, "ts": 1716454223300959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213484, "dur": 13, "args": { "External id": 123342, "cbid": 211, "correlation": 123342 } }, { "ph": "s", "id": 123342, "pid": 76337, "tid": -914061504, "ts": 1716454223213484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213581, "dur": 1, "args": { "External id": 123360, "cbid": 251, "correlation": 123360 } }, { "ph": "f", "id": 123360, "pid": 76337, "tid": -914061504, "ts": 1716454223213581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223319530, "dur": 200, "args": { "External id": 123362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123362, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123362, "pid": 5, "tid": 7, "ts": 1716454223319530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213586, "dur": 13, "args": { "External id": 123362, "cbid": 211, "correlation": 123362 } }, { "ph": "s", "id": 123362, "pid": 76337, "tid": -914061504, "ts": 1716454223213586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223319731, "dur": 66, "args": { "External id": 123370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123370, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123370, "pid": 5, "tid": 7, "ts": 1716454223319731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213656, "dur": 13, "args": { "External id": 123370, "cbid": 211, "correlation": 123370 } }, { "ph": "s", "id": 123370, "pid": 76337, "tid": -914061504, "ts": 1716454223213656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223319799, "dur": 97, "args": { "External id": 123378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123378, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123378, "pid": 5, "tid": 7, "ts": 1716454223319799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213696, "dur": 8, "args": { "External id": 123378, "cbid": 211, "correlation": 123378 } }, { "ph": "s", "id": 123378, "pid": 76337, "tid": -914061504, "ts": 1716454223213696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223319897, "dur": 54, "args": { "External id": 123389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123389, "pid": 5, "tid": 7, "ts": 1716454223319897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213766, "dur": 13, "args": { "External id": 123389, "cbid": 211, "correlation": 123389 } }, { "ph": "s", "id": 123389, "pid": 76337, "tid": -914061504, "ts": 1716454223213766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223319952, "dur": 92, "args": { "External id": 123411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123411, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123411, "pid": 5, "tid": 7, "ts": 1716454223319952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213797, "dur": 7, "args": { "External id": 123411, "cbid": 211, "correlation": 123411 } }, { "ph": "s", "id": 123411, "pid": 76337, "tid": -914061504, "ts": 1716454223213797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213880, "dur": 1, "args": { "External id": 123422, "cbid": 251, "correlation": 123422 } }, { "ph": "f", "id": 123422, "pid": 76337, "tid": -914061504, "ts": 1716454223213880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223320045, "dur": 102, "args": { "External id": 123423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123423, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123423, "pid": 5, "tid": 7, "ts": 1716454223320045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213885, "dur": 13, "args": { "External id": 123423, "cbid": 211, "correlation": 123423 } }, { "ph": "s", "id": 123423, "pid": 76337, "tid": -914061504, "ts": 1716454223213885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213959, "dur": 1, "args": { "External id": 123434, "cbid": 251, "correlation": 123434 } }, { "ph": "f", "id": 123434, "pid": 76337, "tid": -914061504, "ts": 1716454223213959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223213963, "dur": 0, "args": { "External id": 123435, "cbid": 251, "correlation": 123435 } }, { "ph": "f", "id": 123435, "pid": 76337, "tid": -914061504, "ts": 1716454223213963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223320148, "dur": 10, "args": { "External id": 123436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123436, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 123436, "pid": 5, "tid": 7, "ts": 1716454223320148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213965, "dur": 21, "args": { "External id": 123436, "cbid": 211, "correlation": 123436 } }, { "ph": "s", "id": 123436, "pid": 76337, "tid": -914061504, "ts": 1716454223213965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223320160, "dur": 5, "args": { "External id": 123438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123438, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 123438, "pid": 5, "tid": 7, "ts": 1716454223320160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223213990, "dur": 7, "args": { "External id": 123438, "cbid": 211, "correlation": 123438 } }, { "ph": "s", "id": 123438, "pid": 76337, "tid": -914061504, "ts": 1716454223213990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214053, "dur": 1, "args": { "External id": 123449, "cbid": 251, "correlation": 123449 } }, { "ph": "f", "id": 123449, "pid": 76337, "tid": -914061504, "ts": 1716454223214053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214056, "dur": 0, "args": { "External id": 123450, "cbid": 251, "correlation": 123450 } }, { "ph": "f", "id": 123450, "pid": 76337, "tid": -914061504, "ts": 1716454223214056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223320166, "dur": 6, "args": { "External id": 123451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123451, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 123451, "pid": 5, "tid": 7, "ts": 1716454223320166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214058, "dur": 12, "args": { "External id": 123451, "cbid": 211, "correlation": 123451 } }, { "ph": "s", "id": 123451, "pid": 76337, "tid": -914061504, "ts": 1716454223214058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223320174, "dur": 3, "args": { "External id": 123453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123453, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 123453, "pid": 5, "tid": 7, "ts": 1716454223320174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214072, "dur": 6, "args": { "External id": 123453, "cbid": 211, "correlation": 123453 } }, { "ph": "s", "id": 123453, "pid": 76337, "tid": -914061504, "ts": 1716454223214072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223320179, "dur": 156, "args": { "External id": 123474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123474, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123474, "pid": 5, "tid": 7, "ts": 1716454223320179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214145, "dur": 12, "args": { "External id": 123474, "cbid": 211, "correlation": 123474 } }, { "ph": "s", "id": 123474, "pid": 76337, "tid": -914061504, "ts": 1716454223214145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214240, "dur": 1, "args": { "External id": 123492, "cbid": 251, "correlation": 123492 } }, { "ph": "f", "id": 123492, "pid": 76337, "tid": -914061504, "ts": 1716454223214240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223320336, "dur": 107, "args": { "External id": 123494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123494, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123494, "pid": 5, "tid": 7, "ts": 1716454223320336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214247, "dur": 13, "args": { "External id": 123494, "cbid": 211, "correlation": 123494 } }, { "ph": "s", "id": 123494, "pid": 76337, "tid": -914061504, "ts": 1716454223214247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223320444, "dur": 35, "args": { "External id": 123502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123502, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123502, "pid": 5, "tid": 7, "ts": 1716454223320444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214317, "dur": 12, "args": { "External id": 123502, "cbid": 211, "correlation": 123502 } }, { "ph": "s", "id": 123502, "pid": 76337, "tid": -914061504, "ts": 1716454223214317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223320480, "dur": 68, "args": { "External id": 123510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123510, "pid": 5, "tid": 7, "ts": 1716454223320480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214357, "dur": 9, "args": { "External id": 123510, "cbid": 211, "correlation": 123510 } }, { "ph": "s", "id": 123510, "pid": 76337, "tid": -914061504, "ts": 1716454223214357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223320550, "dur": 91, "args": { "External id": 123532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123532, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123532, "pid": 5, "tid": 7, "ts": 1716454223320550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214408, "dur": 10, "args": { "External id": 123532, "cbid": 211, "correlation": 123532 } }, { "ph": "s", "id": 123532, "pid": 76337, "tid": -914061504, "ts": 1716454223214408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214496, "dur": 1, "args": { "External id": 123548, "cbid": 251, "correlation": 123548 } }, { "ph": "f", "id": 123548, "pid": 76337, "tid": -914061504, "ts": 1716454223214496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223320643, "dur": 571, "args": { "External id": 123550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123550, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123550, "pid": 5, "tid": 7, "ts": 1716454223320643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214502, "dur": 12, "args": { "External id": 123550, "cbid": 211, "correlation": 123550 } }, { "ph": "s", "id": 123550, "pid": 76337, "tid": -914061504, "ts": 1716454223214502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223321215, "dur": 244, "args": { "External id": 123558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123558, "pid": 5, "tid": 7, "ts": 1716454223321215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214567, "dur": 13, "args": { "External id": 123558, "cbid": 211, "correlation": 123558 } }, { "ph": "s", "id": 123558, "pid": 76337, "tid": -914061504, "ts": 1716454223214567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223321460, "dur": 251, "args": { "External id": 123566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123566, "pid": 5, "tid": 7, "ts": 1716454223321460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214600, "dur": 8, "args": { "External id": 123566, "cbid": 211, "correlation": 123566 } }, { "ph": "s", "id": 123566, "pid": 76337, "tid": -914061504, "ts": 1716454223214600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214682, "dur": 1, "args": { "External id": 123582, "cbid": 251, "correlation": 123582 } }, { "ph": "f", "id": 123582, "pid": 76337, "tid": -914061504, "ts": 1716454223214682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214687, "dur": 0, "args": { "External id": 123584, "cbid": 251, "correlation": 123584 } }, { "ph": "f", "id": 123584, "pid": 76337, "tid": -914061504, "ts": 1716454223214687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223321713, "dur": 352, "args": { "External id": 123585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123585, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 123585, "pid": 5, "tid": 7, "ts": 1716454223321713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214690, "dur": 13, "args": { "External id": 123585, "cbid": 211, "correlation": 123585 } }, { "ph": "s", "id": 123585, "pid": 76337, "tid": -914061504, "ts": 1716454223214690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223322066, "dur": 50, "args": { "External id": 123593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123593, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123593, "pid": 5, "tid": 7, "ts": 1716454223322066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214732, "dur": 10, "args": { "External id": 123593, "cbid": 211, "correlation": 123593 } }, { "ph": "s", "id": 123593, "pid": 76337, "tid": -914061504, "ts": 1716454223214732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223322117, "dur": 157, "args": { "External id": 123604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123604, "pid": 5, "tid": 7, "ts": 1716454223322117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214800, "dur": 12, "args": { "External id": 123604, "cbid": 211, "correlation": 123604 } }, { "ph": "s", "id": 123604, "pid": 76337, "tid": -914061504, "ts": 1716454223214800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223214864, "dur": 0, "args": { "External id": 123616, "cbid": 317, "correlation": 123616 } }, { "ph": "f", "id": 123616, "pid": 76337, "tid": -914061504, "ts": 1716454223214864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223214865, "dur": 0, "args": { "External id": 123617, "cbid": 203, "correlation": 123617 } }, { "ph": "f", "id": 123617, "pid": 76337, "tid": -914061504, "ts": 1716454223214865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223214866, "dur": 0, "args": { "External id": 123618, "cbid": 205, "correlation": 123618 } }, { "ph": "f", "id": 123618, "pid": 76337, "tid": -914061504, "ts": 1716454223214866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214890, "dur": 1, "args": { "External id": 123622, "cbid": 251, "correlation": 123622 } }, { "ph": "f", "id": 123622, "pid": 76337, "tid": -914061504, "ts": 1716454223214890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214892, "dur": 0, "args": { "External id": 123623, "cbid": 251, "correlation": 123623 } }, { "ph": "f", "id": 123623, "pid": 76337, "tid": -914061504, "ts": 1716454223214892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214893, "dur": 0, "args": { "External id": 123624, "cbid": 251, "correlation": 123624 } }, { "ph": "f", "id": 123624, "pid": 76337, "tid": -914061504, "ts": 1716454223214893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214894, "dur": 0, "args": { "External id": 123625, "cbid": 251, "correlation": 123625 } }, { "ph": "f", "id": 123625, "pid": 76337, "tid": -914061504, "ts": 1716454223214894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214895, "dur": 0, "args": { "External id": 123626, "cbid": 251, "correlation": 123626 } }, { "ph": "f", "id": 123626, "pid": 76337, "tid": -914061504, "ts": 1716454223214895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214895, "dur": 0, "args": { "External id": 123627, "cbid": 251, "correlation": 123627 } }, { "ph": "f", "id": 123627, "pid": 76337, "tid": -914061504, "ts": 1716454223214895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214896, "dur": 0, "args": { "External id": 123628, "cbid": 251, "correlation": 123628 } }, { "ph": "f", "id": 123628, "pid": 76337, "tid": -914061504, "ts": 1716454223214896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214897, "dur": 0, "args": { "External id": 123629, "cbid": 251, "correlation": 123629 } }, { "ph": "f", "id": 123629, "pid": 76337, "tid": -914061504, "ts": 1716454223214897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223214898, "dur": 0, "args": { "External id": 123630, "cbid": 251, "correlation": 123630 } }, { "ph": "f", "id": 123630, "pid": 76337, "tid": -914061504, "ts": 1716454223214898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223322276, "dur": 115, "args": { "External id": 123631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123631, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123631, "pid": 5, "tid": 7, "ts": 1716454223322276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214900, "dur": 12, "args": { "External id": 123631, "cbid": 211, "correlation": 123631 } }, { "ph": "s", "id": 123631, "pid": 76337, "tid": -914061504, "ts": 1716454223214900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223322392, "dur": 60, "args": { "External id": 123637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123637, "pid": 5, "tid": 7, "ts": 1716454223322392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214936, "dur": 9, "args": { "External id": 123637, "cbid": 211, "correlation": 123637 } }, { "ph": "s", "id": 123637, "pid": 76337, "tid": -914061504, "ts": 1716454223214936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223322453, "dur": 50, "args": { "External id": 123645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123645, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123645, "pid": 5, "tid": 7, "ts": 1716454223322453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223214968, "dur": 18, "args": { "External id": 123645, "cbid": 211, "correlation": 123645 } }, { "ph": "s", "id": 123645, "pid": 76337, "tid": -914061504, "ts": 1716454223214968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223322505, "dur": 101, "args": { "External id": 123654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123654, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123654, "pid": 5, "tid": 7, "ts": 1716454223322505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215017, "dur": 11, "args": { "External id": 123654, "cbid": 211, "correlation": 123654 } }, { "ph": "s", "id": 123654, "pid": 76337, "tid": -914061504, "ts": 1716454223215017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223322606, "dur": 92, "args": { "External id": 123674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123674, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 123674, "pid": 5, "tid": 7, "ts": 1716454223322606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215090, "dur": 12, "args": { "External id": 123674, "cbid": 211, "correlation": 123674 } }, { "ph": "s", "id": 123674, "pid": 76337, "tid": -914061504, "ts": 1716454223215090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223322700, "dur": 4, "args": { "External id": 123686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123686, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 123686, "pid": 5, "tid": 7, "ts": 1716454223322700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215111, "dur": 7, "args": { "External id": 123686, "cbid": 211, "correlation": 123686 } }, { "ph": "s", "id": 123686, "pid": 76337, "tid": -914061504, "ts": 1716454223215111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223322706, "dur": 109, "args": { "External id": 123689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123689, "pid": 5, "tid": 7, "ts": 1716454223322706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215130, "dur": 6, "args": { "External id": 123689, "cbid": 211, "correlation": 123689 } }, { "ph": "s", "id": 123689, "pid": 76337, "tid": -914061504, "ts": 1716454223215130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223322816, "dur": 69, "args": { "External id": 123698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123698, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123698, "pid": 5, "tid": 7, "ts": 1716454223322816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215169, "dur": 11, "args": { "External id": 123698, "cbid": 211, "correlation": 123698 } }, { "ph": "s", "id": 123698, "pid": 76337, "tid": -914061504, "ts": 1716454223215169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223215221, "dur": 0, "args": { "External id": 123708, "cbid": 317, "correlation": 123708 } }, { "ph": "f", "id": 123708, "pid": 76337, "tid": -914061504, "ts": 1716454223215221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223215222, "dur": 0, "args": { "External id": 123709, "cbid": 203, "correlation": 123709 } }, { "ph": "f", "id": 123709, "pid": 76337, "tid": -914061504, "ts": 1716454223215222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223215223, "dur": 0, "args": { "External id": 123710, "cbid": 205, "correlation": 123710 } }, { "ph": "f", "id": 123710, "pid": 76337, "tid": -914061504, "ts": 1716454223215223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223322886, "dur": 76, "args": { "External id": 123714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123714, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123714, "pid": 5, "tid": 7, "ts": 1716454223322886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215238, "dur": 11, "args": { "External id": 123714, "cbid": 211, "correlation": 123714 } }, { "ph": "s", "id": 123714, "pid": 76337, "tid": -914061504, "ts": 1716454223215238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223322964, "dur": 24, "args": { "External id": 123716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123716, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123716, "pid": 5, "tid": 7, "ts": 1716454223322964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215252, "dur": 5, "args": { "External id": 123716, "cbid": 211, "correlation": 123716 } }, { "ph": "s", "id": 123716, "pid": 76337, "tid": -914061504, "ts": 1716454223215252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223322989, "dur": 4, "args": { "External id": 123718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123718, "pid": 5, "tid": 7, "ts": 1716454223322989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215262, "dur": 6, "args": { "External id": 123718, "cbid": 211, "correlation": 123718 } }, { "ph": "s", "id": 123718, "pid": 76337, "tid": -914061504, "ts": 1716454223215262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223215271, "dur": 0, "args": { "External id": 123719, "cbid": 51, "correlation": 123719 } }, { "ph": "s", "id": 123719, "pid": 76337, "tid": -914061504, "ts": 1716454223215271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223322995, "dur": 1364, "args": { "External id": 123720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123720, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123720, "pid": 5, "tid": 7, "ts": 1716454223322995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215272, "dur": 6, "args": { "External id": 123720, "cbid": 211, "correlation": 123720 } }, { "ph": "s", "id": 123720, "pid": 76337, "tid": -914061504, "ts": 1716454223215272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223324360, "dur": 60, "args": { "External id": 123725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123725, "pid": 5, "tid": 7, "ts": 1716454223324360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215300, "dur": 8, "args": { "External id": 123725, "cbid": 211, "correlation": 123725 } }, { "ph": "s", "id": 123725, "pid": 76337, "tid": -914061504, "ts": 1716454223215300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223324421, "dur": 5, "args": { "External id": 123733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123733, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123733, "pid": 5, "tid": 7, "ts": 1716454223324421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215343, "dur": 9, "args": { "External id": 123733, "cbid": 211, "correlation": 123733 } }, { "ph": "s", "id": 123733, "pid": 76337, "tid": -914061504, "ts": 1716454223215343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223215410, "dur": 1, "args": { "External id": 123749, "cbid": 251, "correlation": 123749 } }, { "ph": "f", "id": 123749, "pid": 76337, "tid": -914061504, "ts": 1716454223215410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223215415, "dur": 0, "args": { "External id": 123751, "cbid": 251, "correlation": 123751 } }, { "ph": "f", "id": 123751, "pid": 76337, "tid": -914061504, "ts": 1716454223215415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223324427, "dur": 11, "args": { "External id": 123752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123752, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 123752, "pid": 5, "tid": 7, "ts": 1716454223324427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215417, "dur": 12, "args": { "External id": 123752, "cbid": 211, "correlation": 123752 } }, { "ph": "s", "id": 123752, "pid": 76337, "tid": -914061504, "ts": 1716454223215417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223324440, "dur": 5, "args": { "External id": 123754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123754, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 123754, "pid": 5, "tid": 7, "ts": 1716454223324440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215431, "dur": 6, "args": { "External id": 123754, "cbid": 211, "correlation": 123754 } }, { "ph": "s", "id": 123754, "pid": 76337, "tid": -914061504, "ts": 1716454223215431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223324446, "dur": 53, "args": { "External id": 123764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123764, "pid": 5, "tid": 7, "ts": 1716454223324446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215489, "dur": 12, "args": { "External id": 123764, "cbid": 211, "correlation": 123764 } }, { "ph": "s", "id": 123764, "pid": 76337, "tid": -914061504, "ts": 1716454223215489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223324501, "dur": 53, "args": { "External id": 123784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123784, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 123784, "pid": 5, "tid": 7, "ts": 1716454223324501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215555, "dur": 11, "args": { "External id": 123784, "cbid": 211, "correlation": 123784 } }, { "ph": "s", "id": 123784, "pid": 76337, "tid": -914061504, "ts": 1716454223215555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223324555, "dur": 4, "args": { "External id": 123796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123796, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123796, "pid": 5, "tid": 7, "ts": 1716454223324555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215576, "dur": 7, "args": { "External id": 123796, "cbid": 211, "correlation": 123796 } }, { "ph": "s", "id": 123796, "pid": 76337, "tid": -914061504, "ts": 1716454223215576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223324560, "dur": 55, "args": { "External id": 123799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123799, "pid": 5, "tid": 7, "ts": 1716454223324560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215595, "dur": 6, "args": { "External id": 123799, "cbid": 211, "correlation": 123799 } }, { "ph": "s", "id": 123799, "pid": 76337, "tid": -914061504, "ts": 1716454223215595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223324617, "dur": 36, "args": { "External id": 123808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123808, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123808, "pid": 5, "tid": 7, "ts": 1716454223324617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215637, "dur": 10, "args": { "External id": 123808, "cbid": 211, "correlation": 123808 } }, { "ph": "s", "id": 123808, "pid": 76337, "tid": -914061504, "ts": 1716454223215637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223215700, "dur": 0, "args": { "External id": 123818, "cbid": 317, "correlation": 123818 } }, { "ph": "f", "id": 123818, "pid": 76337, "tid": -914061504, "ts": 1716454223215700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223215701, "dur": 0, "args": { "External id": 123819, "cbid": 203, "correlation": 123819 } }, { "ph": "f", "id": 123819, "pid": 76337, "tid": -914061504, "ts": 1716454223215701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223215702, "dur": 0, "args": { "External id": 123820, "cbid": 205, "correlation": 123820 } }, { "ph": "f", "id": 123820, "pid": 76337, "tid": -914061504, "ts": 1716454223215702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223324654, "dur": 41, "args": { "External id": 123824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123824, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123824, "pid": 5, "tid": 7, "ts": 1716454223324654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215716, "dur": 12, "args": { "External id": 123824, "cbid": 211, "correlation": 123824 } }, { "ph": "s", "id": 123824, "pid": 76337, "tid": -914061504, "ts": 1716454223215716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223324696, "dur": 14, "args": { "External id": 123826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123826, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123826, "pid": 5, "tid": 7, "ts": 1716454223324696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215731, "dur": 6, "args": { "External id": 123826, "cbid": 211, "correlation": 123826 } }, { "ph": "s", "id": 123826, "pid": 76337, "tid": -914061504, "ts": 1716454223215731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223324712, "dur": 3, "args": { "External id": 123828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123828, "pid": 5, "tid": 7, "ts": 1716454223324712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215741, "dur": 5, "args": { "External id": 123828, "cbid": 211, "correlation": 123828 } }, { "ph": "s", "id": 123828, "pid": 76337, "tid": -914061504, "ts": 1716454223215741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223215749, "dur": 0, "args": { "External id": 123829, "cbid": 51, "correlation": 123829 } }, { "ph": "s", "id": 123829, "pid": 76337, "tid": -914061504, "ts": 1716454223215749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223324717, "dur": 698, "args": { "External id": 123830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123830, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123830, "pid": 5, "tid": 7, "ts": 1716454223324717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215750, "dur": 5, "args": { "External id": 123830, "cbid": 211, "correlation": 123830 } }, { "ph": "s", "id": 123830, "pid": 76337, "tid": -914061504, "ts": 1716454223215750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223325416, "dur": 60, "args": { "External id": 123835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123835, "pid": 5, "tid": 7, "ts": 1716454223325416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215776, "dur": 10, "args": { "External id": 123835, "cbid": 211, "correlation": 123835 } }, { "ph": "s", "id": 123835, "pid": 76337, "tid": -914061504, "ts": 1716454223215776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223215834, "dur": 0, "args": { "External id": 123845, "cbid": 317, "correlation": 123845 } }, { "ph": "f", "id": 123845, "pid": 76337, "tid": -914061504, "ts": 1716454223215834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223215835, "dur": 0, "args": { "External id": 123846, "cbid": 203, "correlation": 123846 } }, { "ph": "f", "id": 123846, "pid": 76337, "tid": -914061504, "ts": 1716454223215835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223215836, "dur": 0, "args": { "External id": 123847, "cbid": 205, "correlation": 123847 } }, { "ph": "f", "id": 123847, "pid": 76337, "tid": -914061504, "ts": 1716454223215836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223325477, "dur": 75, "args": { "External id": 123851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123851, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123851, "pid": 5, "tid": 7, "ts": 1716454223325477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215849, "dur": 12, "args": { "External id": 123851, "cbid": 211, "correlation": 123851 } }, { "ph": "s", "id": 123851, "pid": 76337, "tid": -914061504, "ts": 1716454223215849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223325553, "dur": 208, "args": { "External id": 123853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123853, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123853, "pid": 5, "tid": 7, "ts": 1716454223325553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215868, "dur": 8, "args": { "External id": 123853, "cbid": 211, "correlation": 123853 } }, { "ph": "s", "id": 123853, "pid": 76337, "tid": -914061504, "ts": 1716454223215868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223325763, "dur": 40, "args": { "External id": 123855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123855, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123855, "pid": 5, "tid": 7, "ts": 1716454223325763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223215881, "dur": 406, "args": { "External id": 123855, "cbid": 211, "correlation": 123855 } }, { "ph": "s", "id": 123855, "pid": 76337, "tid": -914061504, "ts": 1716454223215881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223325804, "dur": 58, "args": { "External id": 123861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123861, "pid": 5, "tid": 7, "ts": 1716454223325804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216308, "dur": 9, "args": { "External id": 123861, "cbid": 211, "correlation": 123861 } }, { "ph": "s", "id": 123861, "pid": 76337, "tid": -914061504, "ts": 1716454223216308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223325864, "dur": 50, "args": { "External id": 123869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123869, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123869, "pid": 5, "tid": 7, "ts": 1716454223325864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216339, "dur": 8, "args": { "External id": 123869, "cbid": 211, "correlation": 123869 } }, { "ph": "s", "id": 123869, "pid": 76337, "tid": -914061504, "ts": 1716454223216339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223325915, "dur": 35, "args": { "External id": 123877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123877, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123877, "pid": 5, "tid": 7, "ts": 1716454223325915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216368, "dur": 28, "args": { "External id": 123877, "cbid": 211, "correlation": 123877 } }, { "ph": "s", "id": 123877, "pid": 76337, "tid": -914061504, "ts": 1716454223216368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223325951, "dur": 50, "args": { "External id": 123897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123897, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 123897, "pid": 5, "tid": 7, "ts": 1716454223325951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216473, "dur": 14, "args": { "External id": 123897, "cbid": 211, "correlation": 123897 } }, { "ph": "s", "id": 123897, "pid": 76337, "tid": -914061504, "ts": 1716454223216473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223326003, "dur": 4, "args": { "External id": 123909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123909, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 123909, "pid": 5, "tid": 7, "ts": 1716454223326003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216496, "dur": 6, "args": { "External id": 123909, "cbid": 211, "correlation": 123909 } }, { "ph": "s", "id": 123909, "pid": 76337, "tid": -914061504, "ts": 1716454223216496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223326009, "dur": 55, "args": { "External id": 123912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123912, "pid": 5, "tid": 7, "ts": 1716454223326009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216514, "dur": 6, "args": { "External id": 123912, "cbid": 211, "correlation": 123912 } }, { "ph": "s", "id": 123912, "pid": 76337, "tid": -914061504, "ts": 1716454223216514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223216571, "dur": 0, "args": { "External id": 123923, "cbid": 317, "correlation": 123923 } }, { "ph": "f", "id": 123923, "pid": 76337, "tid": -914061504, "ts": 1716454223216571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223216572, "dur": 0, "args": { "External id": 123924, "cbid": 203, "correlation": 123924 } }, { "ph": "f", "id": 123924, "pid": 76337, "tid": -914061504, "ts": 1716454223216572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223216573, "dur": 0, "args": { "External id": 123925, "cbid": 205, "correlation": 123925 } }, { "ph": "f", "id": 123925, "pid": 76337, "tid": -914061504, "ts": 1716454223216573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216595, "dur": 1, "args": { "External id": 123929, "cbid": 251, "correlation": 123929 } }, { "ph": "f", "id": 123929, "pid": 76337, "tid": -914061504, "ts": 1716454223216595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216597, "dur": 0, "args": { "External id": 123930, "cbid": 251, "correlation": 123930 } }, { "ph": "f", "id": 123930, "pid": 76337, "tid": -914061504, "ts": 1716454223216597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216598, "dur": 0, "args": { "External id": 123931, "cbid": 251, "correlation": 123931 } }, { "ph": "f", "id": 123931, "pid": 76337, "tid": -914061504, "ts": 1716454223216598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216599, "dur": 0, "args": { "External id": 123932, "cbid": 251, "correlation": 123932 } }, { "ph": "f", "id": 123932, "pid": 76337, "tid": -914061504, "ts": 1716454223216599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216599, "dur": 0, "args": { "External id": 123933, "cbid": 251, "correlation": 123933 } }, { "ph": "f", "id": 123933, "pid": 76337, "tid": -914061504, "ts": 1716454223216599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216600, "dur": 0, "args": { "External id": 123934, "cbid": 251, "correlation": 123934 } }, { "ph": "f", "id": 123934, "pid": 76337, "tid": -914061504, "ts": 1716454223216600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216601, "dur": 0, "args": { "External id": 123935, "cbid": 251, "correlation": 123935 } }, { "ph": "f", "id": 123935, "pid": 76337, "tid": -914061504, "ts": 1716454223216601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216602, "dur": 0, "args": { "External id": 123936, "cbid": 251, "correlation": 123936 } }, { "ph": "f", "id": 123936, "pid": 76337, "tid": -914061504, "ts": 1716454223216602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216603, "dur": 0, "args": { "External id": 123937, "cbid": 251, "correlation": 123937 } }, { "ph": "f", "id": 123937, "pid": 76337, "tid": -914061504, "ts": 1716454223216603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223326065, "dur": 113, "args": { "External id": 123938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123938, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 123938, "pid": 5, "tid": 7, "ts": 1716454223326065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216605, "dur": 13, "args": { "External id": 123938, "cbid": 211, "correlation": 123938 } }, { "ph": "s", "id": 123938, "pid": 76337, "tid": -914061504, "ts": 1716454223216605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223326179, "dur": 60, "args": { "External id": 123944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123944, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123944, "pid": 5, "tid": 7, "ts": 1716454223326179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216642, "dur": 9, "args": { "External id": 123944, "cbid": 211, "correlation": 123944 } }, { "ph": "s", "id": 123944, "pid": 76337, "tid": -914061504, "ts": 1716454223216642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223326240, "dur": 476, "args": { "External id": 123953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123953, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123953, "pid": 5, "tid": 7, "ts": 1716454223326240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216722, "dur": 15, "args": { "External id": 123953, "cbid": 211, "correlation": 123953 } }, { "ph": "s", "id": 123953, "pid": 76337, "tid": -914061504, "ts": 1716454223216722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223326718, "dur": 181, "args": { "External id": 123975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123975, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 123975, "pid": 5, "tid": 7, "ts": 1716454223326718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216780, "dur": 11, "args": { "External id": 123975, "cbid": 211, "correlation": 123975 } }, { "ph": "s", "id": 123975, "pid": 76337, "tid": -914061504, "ts": 1716454223216780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216867, "dur": 1, "args": { "External id": 123986, "cbid": 251, "correlation": 123986 } }, { "ph": "f", "id": 123986, "pid": 76337, "tid": -914061504, "ts": 1716454223216867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223326901, "dur": 199, "args": { "External id": 123987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123987, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123987, "pid": 5, "tid": 7, "ts": 1716454223326901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216872, "dur": 13, "args": { "External id": 123987, "cbid": 211, "correlation": 123987 } }, { "ph": "s", "id": 123987, "pid": 76337, "tid": -914061504, "ts": 1716454223216872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223216942, "dur": 1, "args": { "External id": 123998, "cbid": 251, "correlation": 123998 } }, { "ph": "f", "id": 123998, "pid": 76337, "tid": -914061504, "ts": 1716454223216942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223327101, "dur": 194, "args": { "External id": 123999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 123999, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 123999, "pid": 5, "tid": 7, "ts": 1716454223327101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223216946, "dur": 11, "args": { "External id": 123999, "cbid": 211, "correlation": 123999 } }, { "ph": "s", "id": 123999, "pid": 76337, "tid": -914061504, "ts": 1716454223216946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223217019, "dur": 1, "args": { "External id": 124010, "cbid": 251, "correlation": 124010 } }, { "ph": "f", "id": 124010, "pid": 76337, "tid": -914061504, "ts": 1716454223217019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223327296, "dur": 189, "args": { "External id": 124011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124011, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124011, "pid": 5, "tid": 7, "ts": 1716454223327296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217023, "dur": 11, "args": { "External id": 124011, "cbid": 211, "correlation": 124011 } }, { "ph": "s", "id": 124011, "pid": 76337, "tid": -914061504, "ts": 1716454223217023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223327487, "dur": 18589, "args": { "External id": 124032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124032, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124032, "pid": 5, "tid": 7, "ts": 1716454223327487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217104, "dur": 13, "args": { "External id": 124032, "cbid": 211, "correlation": 124032 } }, { "ph": "s", "id": 124032, "pid": 76337, "tid": -914061504, "ts": 1716454223217104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223217200, "dur": 1, "args": { "External id": 124050, "cbid": 251, "correlation": 124050 } }, { "ph": "f", "id": 124050, "pid": 76337, "tid": -914061504, "ts": 1716454223217200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223346078, "dur": 202, "args": { "External id": 124052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124052, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124052, "pid": 5, "tid": 7, "ts": 1716454223346078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217206, "dur": 13, "args": { "External id": 124052, "cbid": 211, "correlation": 124052 } }, { "ph": "s", "id": 124052, "pid": 76337, "tid": -914061504, "ts": 1716454223217206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223346281, "dur": 66, "args": { "External id": 124060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124060, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124060, "pid": 5, "tid": 7, "ts": 1716454223346281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217277, "dur": 12, "args": { "External id": 124060, "cbid": 211, "correlation": 124060 } }, { "ph": "s", "id": 124060, "pid": 76337, "tid": -914061504, "ts": 1716454223217277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223346348, "dur": 96, "args": { "External id": 124068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124068, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124068, "pid": 5, "tid": 7, "ts": 1716454223346348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217316, "dur": 8, "args": { "External id": 124068, "cbid": 211, "correlation": 124068 } }, { "ph": "s", "id": 124068, "pid": 76337, "tid": -914061504, "ts": 1716454223217316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223346446, "dur": 54, "args": { "External id": 124079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124079, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124079, "pid": 5, "tid": 7, "ts": 1716454223346446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217387, "dur": 74, "args": { "External id": 124079, "cbid": 211, "correlation": 124079 } }, { "ph": "s", "id": 124079, "pid": 76337, "tid": -914061504, "ts": 1716454223217387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223346501, "dur": 92, "args": { "External id": 124101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124101, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124101, "pid": 5, "tid": 7, "ts": 1716454223346501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223217480, "dur": 1929, "args": { "External id": 124101, "cbid": 211, "correlation": 124101 } }, { "ph": "s", "id": 124101, "pid": 76337, "tid": -914061504, "ts": 1716454223217480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219486, "dur": 1, "args": { "External id": 124112, "cbid": 251, "correlation": 124112 } }, { "ph": "f", "id": 124112, "pid": 76337, "tid": -914061504, "ts": 1716454223219486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223346595, "dur": 102, "args": { "External id": 124113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124113, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124113, "pid": 5, "tid": 7, "ts": 1716454223346595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219492, "dur": 67, "args": { "External id": 124113, "cbid": 211, "correlation": 124113 } }, { "ph": "s", "id": 124113, "pid": 76337, "tid": -914061504, "ts": 1716454223219492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219617, "dur": 1, "args": { "External id": 124124, "cbid": 251, "correlation": 124124 } }, { "ph": "f", "id": 124124, "pid": 76337, "tid": -914061504, "ts": 1716454223219617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219621, "dur": 0, "args": { "External id": 124125, "cbid": 251, "correlation": 124125 } }, { "ph": "f", "id": 124125, "pid": 76337, "tid": -914061504, "ts": 1716454223219621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223346698, "dur": 10, "args": { "External id": 124126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124126, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 124126, "pid": 5, "tid": 7, "ts": 1716454223346698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219623, "dur": 13, "args": { "External id": 124126, "cbid": 211, "correlation": 124126 } }, { "ph": "s", "id": 124126, "pid": 76337, "tid": -914061504, "ts": 1716454223219623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223346710, "dur": 5, "args": { "External id": 124128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124128, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124128, "pid": 5, "tid": 7, "ts": 1716454223346710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219637, "dur": 6, "args": { "External id": 124128, "cbid": 211, "correlation": 124128 } }, { "ph": "s", "id": 124128, "pid": 76337, "tid": -914061504, "ts": 1716454223219637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219698, "dur": 1, "args": { "External id": 124139, "cbid": 251, "correlation": 124139 } }, { "ph": "f", "id": 124139, "pid": 76337, "tid": -914061504, "ts": 1716454223219698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219701, "dur": 0, "args": { "External id": 124140, "cbid": 251, "correlation": 124140 } }, { "ph": "f", "id": 124140, "pid": 76337, "tid": -914061504, "ts": 1716454223219701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223346716, "dur": 6, "args": { "External id": 124141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124141, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 124141, "pid": 5, "tid": 7, "ts": 1716454223346716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219703, "dur": 12, "args": { "External id": 124141, "cbid": 211, "correlation": 124141 } }, { "ph": "s", "id": 124141, "pid": 76337, "tid": -914061504, "ts": 1716454223219703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223346724, "dur": 3, "args": { "External id": 124143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124143, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124143, "pid": 5, "tid": 7, "ts": 1716454223346724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219717, "dur": 5, "args": { "External id": 124143, "cbid": 211, "correlation": 124143 } }, { "ph": "s", "id": 124143, "pid": 76337, "tid": -914061504, "ts": 1716454223219717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223346729, "dur": 155, "args": { "External id": 124164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124164, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124164, "pid": 5, "tid": 7, "ts": 1716454223346729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219789, "dur": 16, "args": { "External id": 124164, "cbid": 211, "correlation": 124164 } }, { "ph": "s", "id": 124164, "pid": 76337, "tid": -914061504, "ts": 1716454223219789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223219891, "dur": 1, "args": { "External id": 124182, "cbid": 251, "correlation": 124182 } }, { "ph": "f", "id": 124182, "pid": 76337, "tid": -914061504, "ts": 1716454223219891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223346885, "dur": 108, "args": { "External id": 124184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124184, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124184, "pid": 5, "tid": 7, "ts": 1716454223346885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219897, "dur": 13, "args": { "External id": 124184, "cbid": 211, "correlation": 124184 } }, { "ph": "s", "id": 124184, "pid": 76337, "tid": -914061504, "ts": 1716454223219897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223346994, "dur": 35, "args": { "External id": 124192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124192, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124192, "pid": 5, "tid": 7, "ts": 1716454223346994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223219967, "dur": 19, "args": { "External id": 124192, "cbid": 211, "correlation": 124192 } }, { "ph": "s", "id": 124192, "pid": 76337, "tid": -914061504, "ts": 1716454223219967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223347030, "dur": 67, "args": { "External id": 124200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124200, "pid": 5, "tid": 7, "ts": 1716454223347030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220017, "dur": 10, "args": { "External id": 124200, "cbid": 211, "correlation": 124200 } }, { "ph": "s", "id": 124200, "pid": 76337, "tid": -914061504, "ts": 1716454223220017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223347098, "dur": 91, "args": { "External id": 124222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124222, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124222, "pid": 5, "tid": 7, "ts": 1716454223347098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220069, "dur": 10, "args": { "External id": 124222, "cbid": 211, "correlation": 124222 } }, { "ph": "s", "id": 124222, "pid": 76337, "tid": -914061504, "ts": 1716454223220069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220154, "dur": 1, "args": { "External id": 124238, "cbid": 251, "correlation": 124238 } }, { "ph": "f", "id": 124238, "pid": 76337, "tid": -914061504, "ts": 1716454223220154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223347191, "dur": 574, "args": { "External id": 124240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124240, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124240, "pid": 5, "tid": 7, "ts": 1716454223347191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220159, "dur": 14, "args": { "External id": 124240, "cbid": 211, "correlation": 124240 } }, { "ph": "s", "id": 124240, "pid": 76337, "tid": -914061504, "ts": 1716454223220159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223347766, "dur": 244, "args": { "External id": 124248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124248, "pid": 5, "tid": 7, "ts": 1716454223347766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220226, "dur": 12, "args": { "External id": 124248, "cbid": 211, "correlation": 124248 } }, { "ph": "s", "id": 124248, "pid": 76337, "tid": -914061504, "ts": 1716454223220226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223348011, "dur": 253, "args": { "External id": 124256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124256, "pid": 5, "tid": 7, "ts": 1716454223348011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220256, "dur": 9, "args": { "External id": 124256, "cbid": 211, "correlation": 124256 } }, { "ph": "s", "id": 124256, "pid": 76337, "tid": -914061504, "ts": 1716454223220256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220337, "dur": 1, "args": { "External id": 124272, "cbid": 251, "correlation": 124272 } }, { "ph": "f", "id": 124272, "pid": 76337, "tid": -914061504, "ts": 1716454223220337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220342, "dur": 0, "args": { "External id": 124274, "cbid": 251, "correlation": 124274 } }, { "ph": "f", "id": 124274, "pid": 76337, "tid": -914061504, "ts": 1716454223220342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223348265, "dur": 357, "args": { "External id": 124275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124275, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 124275, "pid": 5, "tid": 7, "ts": 1716454223348265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220345, "dur": 12, "args": { "External id": 124275, "cbid": 211, "correlation": 124275 } }, { "ph": "s", "id": 124275, "pid": 76337, "tid": -914061504, "ts": 1716454223220345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223348624, "dur": 50, "args": { "External id": 124283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124283, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124283, "pid": 5, "tid": 7, "ts": 1716454223348624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220387, "dur": 10, "args": { "External id": 124283, "cbid": 211, "correlation": 124283 } }, { "ph": "s", "id": 124283, "pid": 76337, "tid": -914061504, "ts": 1716454223220387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223348676, "dur": 157, "args": { "External id": 124294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124294, "pid": 5, "tid": 7, "ts": 1716454223348676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220452, "dur": 217, "args": { "External id": 124294, "cbid": 211, "correlation": 124294 } }, { "ph": "s", "id": 124294, "pid": 76337, "tid": -914061504, "ts": 1716454223220452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223220722, "dur": 0, "args": { "External id": 124306, "cbid": 317, "correlation": 124306 } }, { "ph": "f", "id": 124306, "pid": 76337, "tid": -914061504, "ts": 1716454223220722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223220723, "dur": 0, "args": { "External id": 124307, "cbid": 203, "correlation": 124307 } }, { "ph": "f", "id": 124307, "pid": 76337, "tid": -914061504, "ts": 1716454223220723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223220724, "dur": 0, "args": { "External id": 124308, "cbid": 205, "correlation": 124308 } }, { "ph": "f", "id": 124308, "pid": 76337, "tid": -914061504, "ts": 1716454223220724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220747, "dur": 1, "args": { "External id": 124312, "cbid": 251, "correlation": 124312 } }, { "ph": "f", "id": 124312, "pid": 76337, "tid": -914061504, "ts": 1716454223220747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220749, "dur": 0, "args": { "External id": 124313, "cbid": 251, "correlation": 124313 } }, { "ph": "f", "id": 124313, "pid": 76337, "tid": -914061504, "ts": 1716454223220749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220749, "dur": 0, "args": { "External id": 124314, "cbid": 251, "correlation": 124314 } }, { "ph": "f", "id": 124314, "pid": 76337, "tid": -914061504, "ts": 1716454223220749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220750, "dur": 0, "args": { "External id": 124315, "cbid": 251, "correlation": 124315 } }, { "ph": "f", "id": 124315, "pid": 76337, "tid": -914061504, "ts": 1716454223220750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220751, "dur": 0, "args": { "External id": 124316, "cbid": 251, "correlation": 124316 } }, { "ph": "f", "id": 124316, "pid": 76337, "tid": -914061504, "ts": 1716454223220751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220751, "dur": 0, "args": { "External id": 124317, "cbid": 251, "correlation": 124317 } }, { "ph": "f", "id": 124317, "pid": 76337, "tid": -914061504, "ts": 1716454223220751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220752, "dur": 0, "args": { "External id": 124318, "cbid": 251, "correlation": 124318 } }, { "ph": "f", "id": 124318, "pid": 76337, "tid": -914061504, "ts": 1716454223220752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220753, "dur": 0, "args": { "External id": 124319, "cbid": 251, "correlation": 124319 } }, { "ph": "f", "id": 124319, "pid": 76337, "tid": -914061504, "ts": 1716454223220753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223220754, "dur": 0, "args": { "External id": 124320, "cbid": 251, "correlation": 124320 } }, { "ph": "f", "id": 124320, "pid": 76337, "tid": -914061504, "ts": 1716454223220754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223348834, "dur": 115, "args": { "External id": 124321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124321, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124321, "pid": 5, "tid": 7, "ts": 1716454223348834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220756, "dur": 40, "args": { "External id": 124321, "cbid": 211, "correlation": 124321 } }, { "ph": "s", "id": 124321, "pid": 76337, "tid": -914061504, "ts": 1716454223220756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223348950, "dur": 59, "args": { "External id": 124327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124327, "pid": 5, "tid": 7, "ts": 1716454223348950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220820, "dur": 103, "args": { "External id": 124327, "cbid": 211, "correlation": 124327 } }, { "ph": "s", "id": 124327, "pid": 76337, "tid": -914061504, "ts": 1716454223220820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223349010, "dur": 49, "args": { "External id": 124335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124335, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124335, "pid": 5, "tid": 7, "ts": 1716454223349010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223220946, "dur": 284, "args": { "External id": 124335, "cbid": 211, "correlation": 124335 } }, { "ph": "s", "id": 124335, "pid": 76337, "tid": -914061504, "ts": 1716454223220946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223349061, "dur": 98, "args": { "External id": 124344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124344, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124344, "pid": 5, "tid": 7, "ts": 1716454223349061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221261, "dur": 11, "args": { "External id": 124344, "cbid": 211, "correlation": 124344 } }, { "ph": "s", "id": 124344, "pid": 76337, "tid": -914061504, "ts": 1716454223221261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223349161, "dur": 92, "args": { "External id": 124364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124364, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 124364, "pid": 5, "tid": 7, "ts": 1716454223349161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221334, "dur": 12, "args": { "External id": 124364, "cbid": 211, "correlation": 124364 } }, { "ph": "s", "id": 124364, "pid": 76337, "tid": -914061504, "ts": 1716454223221334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223349254, "dur": 4, "args": { "External id": 124376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124376, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124376, "pid": 5, "tid": 7, "ts": 1716454223349254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221356, "dur": 6, "args": { "External id": 124376, "cbid": 211, "correlation": 124376 } }, { "ph": "s", "id": 124376, "pid": 76337, "tid": -914061504, "ts": 1716454223221356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223349260, "dur": 107, "args": { "External id": 124379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124379, "pid": 5, "tid": 7, "ts": 1716454223349260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221374, "dur": 111, "args": { "External id": 124379, "cbid": 211, "correlation": 124379 } }, { "ph": "s", "id": 124379, "pid": 76337, "tid": -914061504, "ts": 1716454223221374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223349368, "dur": 69, "args": { "External id": 124388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124388, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124388, "pid": 5, "tid": 7, "ts": 1716454223349368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221519, "dur": 10, "args": { "External id": 124388, "cbid": 211, "correlation": 124388 } }, { "ph": "s", "id": 124388, "pid": 76337, "tid": -914061504, "ts": 1716454223221519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223221571, "dur": 0, "args": { "External id": 124398, "cbid": 317, "correlation": 124398 } }, { "ph": "f", "id": 124398, "pid": 76337, "tid": -914061504, "ts": 1716454223221571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223221572, "dur": 0, "args": { "External id": 124399, "cbid": 203, "correlation": 124399 } }, { "ph": "f", "id": 124399, "pid": 76337, "tid": -914061504, "ts": 1716454223221572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223221573, "dur": 0, "args": { "External id": 124400, "cbid": 205, "correlation": 124400 } }, { "ph": "f", "id": 124400, "pid": 76337, "tid": -914061504, "ts": 1716454223221573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223349438, "dur": 75, "args": { "External id": 124404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124404, "pid": 5, "tid": 7, "ts": 1716454223349438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221587, "dur": 12, "args": { "External id": 124404, "cbid": 211, "correlation": 124404 } }, { "ph": "s", "id": 124404, "pid": 76337, "tid": -914061504, "ts": 1716454223221587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223349515, "dur": 24, "args": { "External id": 124406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124406, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124406, "pid": 5, "tid": 7, "ts": 1716454223349515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221601, "dur": 5, "args": { "External id": 124406, "cbid": 211, "correlation": 124406 } }, { "ph": "s", "id": 124406, "pid": 76337, "tid": -914061504, "ts": 1716454223221601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223349540, "dur": 4, "args": { "External id": 124408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 124408, "pid": 5, "tid": 7, "ts": 1716454223349540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221611, "dur": 6, "args": { "External id": 124408, "cbid": 211, "correlation": 124408 } }, { "ph": "s", "id": 124408, "pid": 76337, "tid": -914061504, "ts": 1716454223221611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223221620, "dur": 0, "args": { "External id": 124409, "cbid": 51, "correlation": 124409 } }, { "ph": "s", "id": 124409, "pid": 76337, "tid": -914061504, "ts": 1716454223221620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223349545, "dur": 1359, "args": { "External id": 124410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124410, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124410, "pid": 5, "tid": 7, "ts": 1716454223349545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221621, "dur": 5, "args": { "External id": 124410, "cbid": 211, "correlation": 124410 } }, { "ph": "s", "id": 124410, "pid": 76337, "tid": -914061504, "ts": 1716454223221621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223350905, "dur": 59, "args": { "External id": 124415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124415, "pid": 5, "tid": 7, "ts": 1716454223350905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221648, "dur": 9, "args": { "External id": 124415, "cbid": 211, "correlation": 124415 } }, { "ph": "s", "id": 124415, "pid": 76337, "tid": -914061504, "ts": 1716454223221648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223350965, "dur": 4, "args": { "External id": 124423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124423, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 124423, "pid": 5, "tid": 7, "ts": 1716454223350965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221693, "dur": 10, "args": { "External id": 124423, "cbid": 211, "correlation": 124423 } }, { "ph": "s", "id": 124423, "pid": 76337, "tid": -914061504, "ts": 1716454223221693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223221760, "dur": 2, "args": { "External id": 124439, "cbid": 251, "correlation": 124439 } }, { "ph": "f", "id": 124439, "pid": 76337, "tid": -914061504, "ts": 1716454223221760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223221766, "dur": 0, "args": { "External id": 124441, "cbid": 251, "correlation": 124441 } }, { "ph": "f", "id": 124441, "pid": 76337, "tid": -914061504, "ts": 1716454223221766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223350971, "dur": 11, "args": { "External id": 124442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124442, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 124442, "pid": 5, "tid": 7, "ts": 1716454223350971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221768, "dur": 12, "args": { "External id": 124442, "cbid": 211, "correlation": 124442 } }, { "ph": "s", "id": 124442, "pid": 76337, "tid": -914061504, "ts": 1716454223221768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223350983, "dur": 5, "args": { "External id": 124444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124444, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124444, "pid": 5, "tid": 7, "ts": 1716454223350983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221782, "dur": 6, "args": { "External id": 124444, "cbid": 211, "correlation": 124444 } }, { "ph": "s", "id": 124444, "pid": 76337, "tid": -914061504, "ts": 1716454223221782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223350989, "dur": 54, "args": { "External id": 124454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124454, "pid": 5, "tid": 7, "ts": 1716454223350989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223221841, "dur": 553, "args": { "External id": 124454, "cbid": 211, "correlation": 124454 } }, { "ph": "s", "id": 124454, "pid": 76337, "tid": -914061504, "ts": 1716454223221841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223351045, "dur": 51, "args": { "External id": 124474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124474, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 124474, "pid": 5, "tid": 7, "ts": 1716454223351045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222450, "dur": 12, "args": { "External id": 124474, "cbid": 211, "correlation": 124474 } }, { "ph": "s", "id": 124474, "pid": 76337, "tid": -914061504, "ts": 1716454223222450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223351098, "dur": 4, "args": { "External id": 124486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124486, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 124486, "pid": 5, "tid": 7, "ts": 1716454223351098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222471, "dur": 7, "args": { "External id": 124486, "cbid": 211, "correlation": 124486 } }, { "ph": "s", "id": 124486, "pid": 76337, "tid": -914061504, "ts": 1716454223222471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223351103, "dur": 55, "args": { "External id": 124489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124489, "pid": 5, "tid": 7, "ts": 1716454223351103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222492, "dur": 7, "args": { "External id": 124489, "cbid": 211, "correlation": 124489 } }, { "ph": "s", "id": 124489, "pid": 76337, "tid": -914061504, "ts": 1716454223222492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223351159, "dur": 36, "args": { "External id": 124498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124498, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124498, "pid": 5, "tid": 7, "ts": 1716454223351159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222533, "dur": 10, "args": { "External id": 124498, "cbid": 211, "correlation": 124498 } }, { "ph": "s", "id": 124498, "pid": 76337, "tid": -914061504, "ts": 1716454223222533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223222595, "dur": 0, "args": { "External id": 124508, "cbid": 317, "correlation": 124508 } }, { "ph": "f", "id": 124508, "pid": 76337, "tid": -914061504, "ts": 1716454223222595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223222596, "dur": 0, "args": { "External id": 124509, "cbid": 203, "correlation": 124509 } }, { "ph": "f", "id": 124509, "pid": 76337, "tid": -914061504, "ts": 1716454223222596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223222597, "dur": 0, "args": { "External id": 124510, "cbid": 205, "correlation": 124510 } }, { "ph": "f", "id": 124510, "pid": 76337, "tid": -914061504, "ts": 1716454223222597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223351197, "dur": 39, "args": { "External id": 124514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124514, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124514, "pid": 5, "tid": 7, "ts": 1716454223351197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222611, "dur": 12, "args": { "External id": 124514, "cbid": 211, "correlation": 124514 } }, { "ph": "s", "id": 124514, "pid": 76337, "tid": -914061504, "ts": 1716454223222611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223351238, "dur": 14, "args": { "External id": 124516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124516, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124516, "pid": 5, "tid": 7, "ts": 1716454223351238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222625, "dur": 6, "args": { "External id": 124516, "cbid": 211, "correlation": 124516 } }, { "ph": "s", "id": 124516, "pid": 76337, "tid": -914061504, "ts": 1716454223222625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223351253, "dur": 3, "args": { "External id": 124518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 124518, "pid": 5, "tid": 7, "ts": 1716454223351253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222635, "dur": 5, "args": { "External id": 124518, "cbid": 211, "correlation": 124518 } }, { "ph": "s", "id": 124518, "pid": 76337, "tid": -914061504, "ts": 1716454223222635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223222643, "dur": 0, "args": { "External id": 124519, "cbid": 51, "correlation": 124519 } }, { "ph": "s", "id": 124519, "pid": 76337, "tid": -914061504, "ts": 1716454223222643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223351258, "dur": 694, "args": { "External id": 124520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124520, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124520, "pid": 5, "tid": 7, "ts": 1716454223351258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222644, "dur": 5, "args": { "External id": 124520, "cbid": 211, "correlation": 124520 } }, { "ph": "s", "id": 124520, "pid": 76337, "tid": -914061504, "ts": 1716454223222644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223351954, "dur": 59, "args": { "External id": 124525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124525, "pid": 5, "tid": 7, "ts": 1716454223351954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222671, "dur": 9, "args": { "External id": 124525, "cbid": 211, "correlation": 124525 } }, { "ph": "s", "id": 124525, "pid": 76337, "tid": -914061504, "ts": 1716454223222671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223222728, "dur": 0, "args": { "External id": 124535, "cbid": 317, "correlation": 124535 } }, { "ph": "f", "id": 124535, "pid": 76337, "tid": -914061504, "ts": 1716454223222728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223222729, "dur": 0, "args": { "External id": 124536, "cbid": 203, "correlation": 124536 } }, { "ph": "f", "id": 124536, "pid": 76337, "tid": -914061504, "ts": 1716454223222729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223222730, "dur": 0, "args": { "External id": 124537, "cbid": 205, "correlation": 124537 } }, { "ph": "f", "id": 124537, "pid": 76337, "tid": -914061504, "ts": 1716454223222730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223352014, "dur": 74, "args": { "External id": 124541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124541, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124541, "pid": 5, "tid": 7, "ts": 1716454223352014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222742, "dur": 12, "args": { "External id": 124541, "cbid": 211, "correlation": 124541 } }, { "ph": "s", "id": 124541, "pid": 76337, "tid": -914061504, "ts": 1716454223222742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223352090, "dur": 205, "args": { "External id": 124543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124543, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124543, "pid": 5, "tid": 7, "ts": 1716454223352090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222760, "dur": 6, "args": { "External id": 124543, "cbid": 211, "correlation": 124543 } }, { "ph": "s", "id": 124543, "pid": 76337, "tid": -914061504, "ts": 1716454223222760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223352296, "dur": 38, "args": { "External id": 124545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124545, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124545, "pid": 5, "tid": 7, "ts": 1716454223352296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222771, "dur": 5, "args": { "External id": 124545, "cbid": 211, "correlation": 124545 } }, { "ph": "s", "id": 124545, "pid": 76337, "tid": -914061504, "ts": 1716454223222771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223352335, "dur": 60, "args": { "External id": 124551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124551, "pid": 5, "tid": 7, "ts": 1716454223352335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223222798, "dur": 513, "args": { "External id": 124551, "cbid": 211, "correlation": 124551 } }, { "ph": "s", "id": 124551, "pid": 76337, "tid": -914061504, "ts": 1716454223222798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223352396, "dur": 50, "args": { "External id": 124559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124559, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124559, "pid": 5, "tid": 7, "ts": 1716454223352396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223332, "dur": 9, "args": { "External id": 124559, "cbid": 211, "correlation": 124559 } }, { "ph": "s", "id": 124559, "pid": 76337, "tid": -914061504, "ts": 1716454223223332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223352448, "dur": 35, "args": { "External id": 124567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124567, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124567, "pid": 5, "tid": 7, "ts": 1716454223352448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223363, "dur": 8, "args": { "External id": 124567, "cbid": 211, "correlation": 124567 } }, { "ph": "s", "id": 124567, "pid": 76337, "tid": -914061504, "ts": 1716454223223363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223352484, "dur": 53, "args": { "External id": 124587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124587, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 124587, "pid": 5, "tid": 7, "ts": 1716454223352484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223445, "dur": 13, "args": { "External id": 124587, "cbid": 211, "correlation": 124587 } }, { "ph": "s", "id": 124587, "pid": 76337, "tid": -914061504, "ts": 1716454223223445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223352539, "dur": 4, "args": { "External id": 124599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124599, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 124599, "pid": 5, "tid": 7, "ts": 1716454223352539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223468, "dur": 6, "args": { "External id": 124599, "cbid": 211, "correlation": 124599 } }, { "ph": "s", "id": 124599, "pid": 76337, "tid": -914061504, "ts": 1716454223223468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223352544, "dur": 54, "args": { "External id": 124602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124602, "pid": 5, "tid": 7, "ts": 1716454223352544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223486, "dur": 7, "args": { "External id": 124602, "cbid": 211, "correlation": 124602 } }, { "ph": "s", "id": 124602, "pid": 76337, "tid": -914061504, "ts": 1716454223223486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223223543, "dur": 0, "args": { "External id": 124613, "cbid": 317, "correlation": 124613 } }, { "ph": "f", "id": 124613, "pid": 76337, "tid": -914061504, "ts": 1716454223223543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223223544, "dur": 0, "args": { "External id": 124614, "cbid": 203, "correlation": 124614 } }, { "ph": "f", "id": 124614, "pid": 76337, "tid": -914061504, "ts": 1716454223223544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223223545, "dur": 0, "args": { "External id": 124615, "cbid": 205, "correlation": 124615 } }, { "ph": "f", "id": 124615, "pid": 76337, "tid": -914061504, "ts": 1716454223223545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223567, "dur": 1, "args": { "External id": 124619, "cbid": 251, "correlation": 124619 } }, { "ph": "f", "id": 124619, "pid": 76337, "tid": -914061504, "ts": 1716454223223567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223570, "dur": 0, "args": { "External id": 124620, "cbid": 251, "correlation": 124620 } }, { "ph": "f", "id": 124620, "pid": 76337, "tid": -914061504, "ts": 1716454223223570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223570, "dur": 0, "args": { "External id": 124621, "cbid": 251, "correlation": 124621 } }, { "ph": "f", "id": 124621, "pid": 76337, "tid": -914061504, "ts": 1716454223223570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223571, "dur": 0, "args": { "External id": 124622, "cbid": 251, "correlation": 124622 } }, { "ph": "f", "id": 124622, "pid": 76337, "tid": -914061504, "ts": 1716454223223571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223572, "dur": 0, "args": { "External id": 124623, "cbid": 251, "correlation": 124623 } }, { "ph": "f", "id": 124623, "pid": 76337, "tid": -914061504, "ts": 1716454223223572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223572, "dur": 0, "args": { "External id": 124624, "cbid": 251, "correlation": 124624 } }, { "ph": "f", "id": 124624, "pid": 76337, "tid": -914061504, "ts": 1716454223223572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223573, "dur": 0, "args": { "External id": 124625, "cbid": 251, "correlation": 124625 } }, { "ph": "f", "id": 124625, "pid": 76337, "tid": -914061504, "ts": 1716454223223573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223574, "dur": 0, "args": { "External id": 124626, "cbid": 251, "correlation": 124626 } }, { "ph": "f", "id": 124626, "pid": 76337, "tid": -914061504, "ts": 1716454223223574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223575, "dur": 0, "args": { "External id": 124627, "cbid": 251, "correlation": 124627 } }, { "ph": "f", "id": 124627, "pid": 76337, "tid": -914061504, "ts": 1716454223223575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223352600, "dur": 113, "args": { "External id": 124628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124628, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124628, "pid": 5, "tid": 7, "ts": 1716454223352600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223577, "dur": 12, "args": { "External id": 124628, "cbid": 211, "correlation": 124628 } }, { "ph": "s", "id": 124628, "pid": 76337, "tid": -914061504, "ts": 1716454223223577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223352714, "dur": 59, "args": { "External id": 124634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124634, "pid": 5, "tid": 7, "ts": 1716454223352714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223612, "dur": 9, "args": { "External id": 124634, "cbid": 211, "correlation": 124634 } }, { "ph": "s", "id": 124634, "pid": 76337, "tid": -914061504, "ts": 1716454223223612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223352775, "dur": 591, "args": { "External id": 124643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124643, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124643, "pid": 5, "tid": 7, "ts": 1716454223352775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223695, "dur": 14, "args": { "External id": 124643, "cbid": 211, "correlation": 124643 } }, { "ph": "s", "id": 124643, "pid": 76337, "tid": -914061504, "ts": 1716454223223695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223353367, "dur": 180, "args": { "External id": 124665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124665, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124665, "pid": 5, "tid": 7, "ts": 1716454223353367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223753, "dur": 11, "args": { "External id": 124665, "cbid": 211, "correlation": 124665 } }, { "ph": "s", "id": 124665, "pid": 76337, "tid": -914061504, "ts": 1716454223223753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223841, "dur": 1, "args": { "External id": 124676, "cbid": 251, "correlation": 124676 } }, { "ph": "f", "id": 124676, "pid": 76337, "tid": -914061504, "ts": 1716454223223841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223353549, "dur": 196, "args": { "External id": 124677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124677, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124677, "pid": 5, "tid": 7, "ts": 1716454223353549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223846, "dur": 14, "args": { "External id": 124677, "cbid": 211, "correlation": 124677 } }, { "ph": "s", "id": 124677, "pid": 76337, "tid": -914061504, "ts": 1716454223223846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223915, "dur": 1, "args": { "External id": 124688, "cbid": 251, "correlation": 124688 } }, { "ph": "f", "id": 124688, "pid": 76337, "tid": -914061504, "ts": 1716454223223915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223353746, "dur": 189, "args": { "External id": 124689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124689, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124689, "pid": 5, "tid": 7, "ts": 1716454223353746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223919, "dur": 11, "args": { "External id": 124689, "cbid": 211, "correlation": 124689 } }, { "ph": "s", "id": 124689, "pid": 76337, "tid": -914061504, "ts": 1716454223223919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223223989, "dur": 1, "args": { "External id": 124700, "cbid": 251, "correlation": 124700 } }, { "ph": "f", "id": 124700, "pid": 76337, "tid": -914061504, "ts": 1716454223223989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223353936, "dur": 187, "args": { "External id": 124701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124701, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124701, "pid": 5, "tid": 7, "ts": 1716454223353936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223223994, "dur": 13, "args": { "External id": 124701, "cbid": 211, "correlation": 124701 } }, { "ph": "s", "id": 124701, "pid": 76337, "tid": -914061504, "ts": 1716454223223994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223354125, "dur": 18569, "args": { "External id": 124722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124722, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124722, "pid": 5, "tid": 7, "ts": 1716454223354125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223224076, "dur": 12, "args": { "External id": 124722, "cbid": 211, "correlation": 124722 } }, { "ph": "s", "id": 124722, "pid": 76337, "tid": -914061504, "ts": 1716454223224076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223224174, "dur": 1, "args": { "External id": 124740, "cbid": 251, "correlation": 124740 } }, { "ph": "f", "id": 124740, "pid": 76337, "tid": -914061504, "ts": 1716454223224174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223372695, "dur": 205, "args": { "External id": 124742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124742, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124742, "pid": 5, "tid": 7, "ts": 1716454223372695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223224180, "dur": 13, "args": { "External id": 124742, "cbid": 211, "correlation": 124742 } }, { "ph": "s", "id": 124742, "pid": 76337, "tid": -914061504, "ts": 1716454223224180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223372902, "dur": 66, "args": { "External id": 124750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124750, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124750, "pid": 5, "tid": 7, "ts": 1716454223372902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223224250, "dur": 13, "args": { "External id": 124750, "cbid": 211, "correlation": 124750 } }, { "ph": "s", "id": 124750, "pid": 76337, "tid": -914061504, "ts": 1716454223224250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223372970, "dur": 97, "args": { "External id": 124758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124758, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124758, "pid": 5, "tid": 7, "ts": 1716454223372970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223224290, "dur": 78, "args": { "External id": 124758, "cbid": 211, "correlation": 124758 } }, { "ph": "s", "id": 124758, "pid": 76337, "tid": -914061504, "ts": 1716454223224290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223373068, "dur": 54, "args": { "External id": 124769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124769, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124769, "pid": 5, "tid": 7, "ts": 1716454223373068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223224431, "dur": 1887, "args": { "External id": 124769, "cbid": 211, "correlation": 124769 } }, { "ph": "s", "id": 124769, "pid": 76337, "tid": -914061504, "ts": 1716454223224431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223373123, "dur": 91, "args": { "External id": 124791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124791, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124791, "pid": 5, "tid": 7, "ts": 1716454223373123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226337, "dur": 126, "args": { "External id": 124791, "cbid": 211, "correlation": 124791 } }, { "ph": "s", "id": 124791, "pid": 76337, "tid": -914061504, "ts": 1716454223226337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226540, "dur": 1, "args": { "External id": 124802, "cbid": 251, "correlation": 124802 } }, { "ph": "f", "id": 124802, "pid": 76337, "tid": -914061504, "ts": 1716454223226540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223373215, "dur": 102, "args": { "External id": 124803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124803, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124803, "pid": 5, "tid": 7, "ts": 1716454223373215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226546, "dur": 13, "args": { "External id": 124803, "cbid": 211, "correlation": 124803 } }, { "ph": "s", "id": 124803, "pid": 76337, "tid": -914061504, "ts": 1716454223226546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226617, "dur": 1, "args": { "External id": 124814, "cbid": 251, "correlation": 124814 } }, { "ph": "f", "id": 124814, "pid": 76337, "tid": -914061504, "ts": 1716454223226617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226621, "dur": 0, "args": { "External id": 124815, "cbid": 251, "correlation": 124815 } }, { "ph": "f", "id": 124815, "pid": 76337, "tid": -914061504, "ts": 1716454223226621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223373319, "dur": 10, "args": { "External id": 124816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124816, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 124816, "pid": 5, "tid": 7, "ts": 1716454223373319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226622, "dur": 13, "args": { "External id": 124816, "cbid": 211, "correlation": 124816 } }, { "ph": "s", "id": 124816, "pid": 76337, "tid": -914061504, "ts": 1716454223226622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223373330, "dur": 5, "args": { "External id": 124818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124818, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124818, "pid": 5, "tid": 7, "ts": 1716454223373330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226637, "dur": 6, "args": { "External id": 124818, "cbid": 211, "correlation": 124818 } }, { "ph": "s", "id": 124818, "pid": 76337, "tid": -914061504, "ts": 1716454223226637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226699, "dur": 1, "args": { "External id": 124829, "cbid": 251, "correlation": 124829 } }, { "ph": "f", "id": 124829, "pid": 76337, "tid": -914061504, "ts": 1716454223226699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226702, "dur": 0, "args": { "External id": 124830, "cbid": 251, "correlation": 124830 } }, { "ph": "f", "id": 124830, "pid": 76337, "tid": -914061504, "ts": 1716454223226702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223373337, "dur": 6, "args": { "External id": 124831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124831, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 124831, "pid": 5, "tid": 7, "ts": 1716454223373337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226703, "dur": 12, "args": { "External id": 124831, "cbid": 211, "correlation": 124831 } }, { "ph": "s", "id": 124831, "pid": 76337, "tid": -914061504, "ts": 1716454223226703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223373344, "dur": 3, "args": { "External id": 124833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124833, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 124833, "pid": 5, "tid": 7, "ts": 1716454223373344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226717, "dur": 6, "args": { "External id": 124833, "cbid": 211, "correlation": 124833 } }, { "ph": "s", "id": 124833, "pid": 76337, "tid": -914061504, "ts": 1716454223226717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223373349, "dur": 155, "args": { "External id": 124854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124854, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124854, "pid": 5, "tid": 7, "ts": 1716454223373349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226791, "dur": 12, "args": { "External id": 124854, "cbid": 211, "correlation": 124854 } }, { "ph": "s", "id": 124854, "pid": 76337, "tid": -914061504, "ts": 1716454223226791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223226888, "dur": 1, "args": { "External id": 124872, "cbid": 251, "correlation": 124872 } }, { "ph": "f", "id": 124872, "pid": 76337, "tid": -914061504, "ts": 1716454223226888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223373506, "dur": 108, "args": { "External id": 124874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124874, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 124874, "pid": 5, "tid": 7, "ts": 1716454223373506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226894, "dur": 13, "args": { "External id": 124874, "cbid": 211, "correlation": 124874 } }, { "ph": "s", "id": 124874, "pid": 76337, "tid": -914061504, "ts": 1716454223226894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223373615, "dur": 35, "args": { "External id": 124882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124882, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124882, "pid": 5, "tid": 7, "ts": 1716454223373615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223226963, "dur": 22, "args": { "External id": 124882, "cbid": 211, "correlation": 124882 } }, { "ph": "s", "id": 124882, "pid": 76337, "tid": -914061504, "ts": 1716454223226963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223373651, "dur": 67, "args": { "External id": 124890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124890, "pid": 5, "tid": 7, "ts": 1716454223373651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227014, "dur": 10, "args": { "External id": 124890, "cbid": 211, "correlation": 124890 } }, { "ph": "s", "id": 124890, "pid": 76337, "tid": -914061504, "ts": 1716454223227014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223373719, "dur": 92, "args": { "External id": 124912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124912, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124912, "pid": 5, "tid": 7, "ts": 1716454223373719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227066, "dur": 10, "args": { "External id": 124912, "cbid": 211, "correlation": 124912 } }, { "ph": "s", "id": 124912, "pid": 76337, "tid": -914061504, "ts": 1716454223227066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227151, "dur": 1, "args": { "External id": 124928, "cbid": 251, "correlation": 124928 } }, { "ph": "f", "id": 124928, "pid": 76337, "tid": -914061504, "ts": 1716454223227151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223373812, "dur": 570, "args": { "External id": 124930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124930, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 124930, "pid": 5, "tid": 7, "ts": 1716454223373812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227157, "dur": 13, "args": { "External id": 124930, "cbid": 211, "correlation": 124930 } }, { "ph": "s", "id": 124930, "pid": 76337, "tid": -914061504, "ts": 1716454223227157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223374383, "dur": 242, "args": { "External id": 124938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124938, "pid": 5, "tid": 7, "ts": 1716454223374383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227222, "dur": 13, "args": { "External id": 124938, "cbid": 211, "correlation": 124938 } }, { "ph": "s", "id": 124938, "pid": 76337, "tid": -914061504, "ts": 1716454223227222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223374627, "dur": 251, "args": { "External id": 124946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124946, "pid": 5, "tid": 7, "ts": 1716454223374627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227254, "dur": 8, "args": { "External id": 124946, "cbid": 211, "correlation": 124946 } }, { "ph": "s", "id": 124946, "pid": 76337, "tid": -914061504, "ts": 1716454223227254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227335, "dur": 1, "args": { "External id": 124962, "cbid": 251, "correlation": 124962 } }, { "ph": "f", "id": 124962, "pid": 76337, "tid": -914061504, "ts": 1716454223227335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227340, "dur": 0, "args": { "External id": 124964, "cbid": 251, "correlation": 124964 } }, { "ph": "f", "id": 124964, "pid": 76337, "tid": -914061504, "ts": 1716454223227340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223374879, "dur": 358, "args": { "External id": 124965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124965, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 124965, "pid": 5, "tid": 7, "ts": 1716454223374879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227343, "dur": 12, "args": { "External id": 124965, "cbid": 211, "correlation": 124965 } }, { "ph": "s", "id": 124965, "pid": 76337, "tid": -914061504, "ts": 1716454223227343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223375238, "dur": 50, "args": { "External id": 124973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124973, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124973, "pid": 5, "tid": 7, "ts": 1716454223375238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227385, "dur": 188, "args": { "External id": 124973, "cbid": 211, "correlation": 124973 } }, { "ph": "s", "id": 124973, "pid": 76337, "tid": -914061504, "ts": 1716454223227385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223375289, "dur": 158, "args": { "External id": 124984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 124984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 124984, "pid": 5, "tid": 7, "ts": 1716454223375289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227631, "dur": 69, "args": { "External id": 124984, "cbid": 211, "correlation": 124984 } }, { "ph": "s", "id": 124984, "pid": 76337, "tid": -914061504, "ts": 1716454223227631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223227753, "dur": 0, "args": { "External id": 124996, "cbid": 317, "correlation": 124996 } }, { "ph": "f", "id": 124996, "pid": 76337, "tid": -914061504, "ts": 1716454223227753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223227754, "dur": 0, "args": { "External id": 124997, "cbid": 203, "correlation": 124997 } }, { "ph": "f", "id": 124997, "pid": 76337, "tid": -914061504, "ts": 1716454223227754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223227754, "dur": 0, "args": { "External id": 124998, "cbid": 205, "correlation": 124998 } }, { "ph": "f", "id": 124998, "pid": 76337, "tid": -914061504, "ts": 1716454223227754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227778, "dur": 1, "args": { "External id": 125002, "cbid": 251, "correlation": 125002 } }, { "ph": "f", "id": 125002, "pid": 76337, "tid": -914061504, "ts": 1716454223227778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227780, "dur": 0, "args": { "External id": 125003, "cbid": 251, "correlation": 125003 } }, { "ph": "f", "id": 125003, "pid": 76337, "tid": -914061504, "ts": 1716454223227780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227781, "dur": 0, "args": { "External id": 125004, "cbid": 251, "correlation": 125004 } }, { "ph": "f", "id": 125004, "pid": 76337, "tid": -914061504, "ts": 1716454223227781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227781, "dur": 0, "args": { "External id": 125005, "cbid": 251, "correlation": 125005 } }, { "ph": "f", "id": 125005, "pid": 76337, "tid": -914061504, "ts": 1716454223227781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227782, "dur": 0, "args": { "External id": 125006, "cbid": 251, "correlation": 125006 } }, { "ph": "f", "id": 125006, "pid": 76337, "tid": -914061504, "ts": 1716454223227782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227783, "dur": 0, "args": { "External id": 125007, "cbid": 251, "correlation": 125007 } }, { "ph": "f", "id": 125007, "pid": 76337, "tid": -914061504, "ts": 1716454223227783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227784, "dur": 0, "args": { "External id": 125008, "cbid": 251, "correlation": 125008 } }, { "ph": "f", "id": 125008, "pid": 76337, "tid": -914061504, "ts": 1716454223227784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227784, "dur": 0, "args": { "External id": 125009, "cbid": 251, "correlation": 125009 } }, { "ph": "f", "id": 125009, "pid": 76337, "tid": -914061504, "ts": 1716454223227784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223227785, "dur": 0, "args": { "External id": 125010, "cbid": 251, "correlation": 125010 } }, { "ph": "f", "id": 125010, "pid": 76337, "tid": -914061504, "ts": 1716454223227785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223375448, "dur": 114, "args": { "External id": 125011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125011, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 125011, "pid": 5, "tid": 7, "ts": 1716454223375448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227787, "dur": 41, "args": { "External id": 125011, "cbid": 211, "correlation": 125011 } }, { "ph": "s", "id": 125011, "pid": 76337, "tid": -914061504, "ts": 1716454223227787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223375564, "dur": 60, "args": { "External id": 125017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125017, "pid": 5, "tid": 7, "ts": 1716454223375564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223227851, "dur": 280, "args": { "External id": 125017, "cbid": 211, "correlation": 125017 } }, { "ph": "s", "id": 125017, "pid": 76337, "tid": -914061504, "ts": 1716454223227851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223375626, "dur": 50, "args": { "External id": 125025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125025, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125025, "pid": 5, "tid": 7, "ts": 1716454223375626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228155, "dur": 9, "args": { "External id": 125025, "cbid": 211, "correlation": 125025 } }, { "ph": "s", "id": 125025, "pid": 76337, "tid": -914061504, "ts": 1716454223228155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223375677, "dur": 53, "args": { "External id": 125045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125045, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 125045, "pid": 5, "tid": 7, "ts": 1716454223375677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228236, "dur": 12, "args": { "External id": 125045, "cbid": 211, "correlation": 125045 } }, { "ph": "s", "id": 125045, "pid": 76337, "tid": -914061504, "ts": 1716454223228236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223375731, "dur": 4, "args": { "External id": 125057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125057, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 125057, "pid": 5, "tid": 7, "ts": 1716454223375731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228258, "dur": 7, "args": { "External id": 125057, "cbid": 211, "correlation": 125057 } }, { "ph": "s", "id": 125057, "pid": 76337, "tid": -914061504, "ts": 1716454223228258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223375737, "dur": 56, "args": { "External id": 125060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125060, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125060, "pid": 5, "tid": 7, "ts": 1716454223375737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228277, "dur": 112, "args": { "External id": 125060, "cbid": 211, "correlation": 125060 } }, { "ph": "s", "id": 125060, "pid": 76337, "tid": -914061504, "ts": 1716454223228277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223375794, "dur": 37, "args": { "External id": 125069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125069, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125069, "pid": 5, "tid": 7, "ts": 1716454223375794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228426, "dur": 11, "args": { "External id": 125069, "cbid": 211, "correlation": 125069 } }, { "ph": "s", "id": 125069, "pid": 76337, "tid": -914061504, "ts": 1716454223228426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223228482, "dur": 0, "args": { "External id": 125079, "cbid": 317, "correlation": 125079 } }, { "ph": "f", "id": 125079, "pid": 76337, "tid": -914061504, "ts": 1716454223228482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223228483, "dur": 0, "args": { "External id": 125080, "cbid": 203, "correlation": 125080 } }, { "ph": "f", "id": 125080, "pid": 76337, "tid": -914061504, "ts": 1716454223228483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223228484, "dur": 0, "args": { "External id": 125081, "cbid": 205, "correlation": 125081 } }, { "ph": "f", "id": 125081, "pid": 76337, "tid": -914061504, "ts": 1716454223228484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223375832, "dur": 42, "args": { "External id": 125085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125085, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125085, "pid": 5, "tid": 7, "ts": 1716454223375832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228500, "dur": 12, "args": { "External id": 125085, "cbid": 211, "correlation": 125085 } }, { "ph": "s", "id": 125085, "pid": 76337, "tid": -914061504, "ts": 1716454223228500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223375875, "dur": 3, "args": { "External id": 125087, "device": 5, "context": 1, "stream": 7, "correlation": 125087, "bytes": 46080, "memory bandwidth (GB/s)": 11.80327868852459 } }, { "ph": "f", "id": 125087, "pid": 5, "tid": 7, "ts": 1716454223375875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223228515, "dur": 17, "args": { "External id": 125087, "cbid": 51, "correlation": 125087 } }, { "ph": "s", "id": 125087, "pid": 76337, "tid": -914061504, "ts": 1716454223228515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223228538, "dur": 2, "args": { "External id": 125089, "cbid": 200, "correlation": 125089 } }, { "ph": "f", "id": 125089, "pid": 76337, "tid": -914061504, "ts": 1716454223228538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223228540, "dur": 0, "args": { "External id": 125090, "cbid": 200, "correlation": 125090 } }, { "ph": "f", "id": 125090, "pid": 76337, "tid": -914061504, "ts": 1716454223228540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223228541, "dur": 0, "args": { "External id": 125091, "cbid": 200, "correlation": 125091 } }, { "ph": "f", "id": 125091, "pid": 76337, "tid": -914061504, "ts": 1716454223228541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223228541, "dur": 0, "args": { "External id": 125092, "cbid": 200, "correlation": 125092 } }, { "ph": "f", "id": 125092, "pid": 76337, "tid": -914061504, "ts": 1716454223228541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454223228543, "dur": 4, "args": { "External id": 125093, "cbid": 15, "correlation": 125093 } }, { "ph": "f", "id": 125093, "pid": 76337, "tid": -914061504, "ts": 1716454223228543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223228547, "dur": 1, "args": { "External id": 125094, "cbid": 251, "correlation": 125094 } }, { "ph": "f", "id": 125094, "pid": 76337, "tid": -914061504, "ts": 1716454223228547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454223375880, "dur": 25, "args": { "External id": 125095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125095, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125095, "pid": 5, "tid": 7, "ts": 1716454223375880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228550, "dur": 8, "args": { "External id": 125095, "cbid": 211, "correlation": 125095 } }, { "ph": "s", "id": 125095, "pid": 76337, "tid": -914061504, "ts": 1716454223228550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223375906, "dur": 4, "args": { "External id": 125097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 125097, "pid": 5, "tid": 7, "ts": 1716454223375906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228564, "dur": 6, "args": { "External id": 125097, "cbid": 211, "correlation": 125097 } }, { "ph": "s", "id": 125097, "pid": 76337, "tid": -914061504, "ts": 1716454223228564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223228575, "dur": 0, "args": { "External id": 125098, "cbid": 51, "correlation": 125098 } }, { "ph": "s", "id": 125098, "pid": 76337, "tid": -914061504, "ts": 1716454223228575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223375912, "dur": 187, "args": { "External id": 125099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125099, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125099, "pid": 5, "tid": 7, "ts": 1716454223375912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228576, "dur": 203, "args": { "External id": 125099, "cbid": 211, "correlation": 125099 } }, { "ph": "s", "id": 125099, "pid": 76337, "tid": -914061504, "ts": 1716454223228576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223376101, "dur": 6, "args": { "External id": 125100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125100, "pid": 5, "tid": 7, "ts": 1716454223376101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228782, "dur": 6, "args": { "External id": 125100, "cbid": 211, "correlation": 125100 } }, { "ph": "s", "id": 125100, "pid": 76337, "tid": -914061504, "ts": 1716454223228782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223376108, "dur": 5, "args": { "External id": 125106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 125106, "pid": 5, "tid": 7, "ts": 1716454223376108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223228812, "dur": 9, "args": { "External id": 125106, "cbid": 211, "correlation": 125106 } }, { "ph": "s", "id": 125106, "pid": 76337, "tid": -914061504, "ts": 1716454223228812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376114, "dur": 3, "args": { "External id": 125114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125114, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125114, "pid": 5, "tid": 7, "ts": 1716454223376114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230504, "dur": 15, "args": { "External id": 125114, "cbid": 211, "correlation": 125114 } }, { "ph": "s", "id": 125114, "pid": 76337, "tid": -914061504, "ts": 1716454223230504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376118, "dur": 3, "args": { "External id": 125122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125122, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125122, "pid": 5, "tid": 7, "ts": 1716454223376118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230545, "dur": 10, "args": { "External id": 125122, "cbid": 211, "correlation": 125122 } }, { "ph": "s", "id": 125122, "pid": 76337, "tid": -914061504, "ts": 1716454223230545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376123, "dur": 3, "args": { "External id": 125130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125130, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125130, "pid": 5, "tid": 7, "ts": 1716454223376123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230572, "dur": 9, "args": { "External id": 125130, "cbid": 211, "correlation": 125130 } }, { "ph": "s", "id": 125130, "pid": 76337, "tid": -914061504, "ts": 1716454223230572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376127, "dur": 3, "args": { "External id": 125139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125139, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125139, "pid": 5, "tid": 7, "ts": 1716454223376127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230745, "dur": 14, "args": { "External id": 125139, "cbid": 211, "correlation": 125139 } }, { "ph": "s", "id": 125139, "pid": 76337, "tid": -914061504, "ts": 1716454223230745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376131, "dur": 3, "args": { "External id": 125148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125148, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125148, "pid": 5, "tid": 7, "ts": 1716454223376131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230775, "dur": 7, "args": { "External id": 125148, "cbid": 211, "correlation": 125148 } }, { "ph": "s", "id": 125148, "pid": 76337, "tid": -914061504, "ts": 1716454223230775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376135, "dur": 3, "args": { "External id": 125156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125156, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125156, "pid": 5, "tid": 7, "ts": 1716454223376135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223230799, "dur": 8, "args": { "External id": 125156, "cbid": 211, "correlation": 125156 } }, { "ph": "s", "id": 125156, "pid": 76337, "tid": -914061504, "ts": 1716454223230799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376139, "dur": 3, "args": { "External id": 125164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125164, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125164, "pid": 5, "tid": 7, "ts": 1716454223376139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223231066, "dur": 16, "args": { "External id": 125164, "cbid": 211, "correlation": 125164 } }, { "ph": "s", "id": 125164, "pid": 76337, "tid": -914061504, "ts": 1716454223231066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376143, "dur": 3, "args": { "External id": 125172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125172, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125172, "pid": 5, "tid": 7, "ts": 1716454223376143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223231098, "dur": 8, "args": { "External id": 125172, "cbid": 211, "correlation": 125172 } }, { "ph": "s", "id": 125172, "pid": 76337, "tid": -914061504, "ts": 1716454223231098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376148, "dur": 1, "args": { "External id": 125182, "device": 5, "context": 1, "stream": 7, "correlation": 125182, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 125182, "pid": 5, "tid": 7, "ts": 1716454223376148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223231164, "dur": 32, "args": { "External id": 125182, "cbid": 41, "correlation": 125182 } }, { "ph": "s", "id": 125182, "pid": 76337, "tid": -914061504, "ts": 1716454223231164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223231197, "dur": 144968, "args": { "External id": 125183, "cbid": 131, "correlation": 125183 } }, { "ph": "f", "id": 125183, "pid": 76337, "tid": -914061504, "ts": 1716454223231197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223376322, "dur": 3, "args": { "External id": 125191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125191, "pid": 5, "tid": 7, "ts": 1716454223376322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376299, "dur": 24, "args": { "External id": 125191, "cbid": 211, "correlation": 125191 } }, { "ph": "s", "id": 125191, "pid": 76337, "tid": -914061504, "ts": 1716454223376299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376413, "dur": 3, "args": { "External id": 125200, "device": 5, "context": 1, "stream": 7, "correlation": 125200, "bytes": 8, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 125200, "pid": 5, "tid": 7, "ts": 1716454223376413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376384, "dur": 29, "args": { "External id": 125200, "cbid": 41, "correlation": 125200 } }, { "ph": "s", "id": 125200, "pid": 76337, "tid": -914061504, "ts": 1716454223376384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223376512, "dur": 4, "args": { "External id": 125210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125210, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125210, "pid": 5, "tid": 7, "ts": 1716454223376512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376496, "dur": 17, "args": { "External id": 125210, "cbid": 211, "correlation": 125210 } }, { "ph": "s", "id": 125210, "pid": 76337, "tid": -914061504, "ts": 1716454223376496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376585, "dur": 1, "args": { "External id": 125220, "device": 5, "context": 1, "stream": 7, "correlation": 125220, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 125220, "pid": 5, "tid": 7, "ts": 1716454223376585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376566, "dur": 17, "args": { "External id": 125220, "cbid": 41, "correlation": 125220 } }, { "ph": "s", "id": 125220, "pid": 76337, "tid": -914061504, "ts": 1716454223376566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223376584, "dur": 8, "args": { "External id": 125221, "cbid": 131, "correlation": 125221 } }, { "ph": "f", "id": 125221, "pid": 76337, "tid": -914061504, "ts": 1716454223376584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376654, "dur": 3, "args": { "External id": 125228, "device": 5, "context": 1, "stream": 7, "correlation": 125228, "bytes": 98304, "memory bandwidth (GB/s)": 30.72 } }, { "ph": "f", "id": 125228, "pid": 5, "tid": 7, "ts": 1716454223376654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376632, "dur": 22, "args": { "External id": 125228, "cbid": 41, "correlation": 125228 } }, { "ph": "s", "id": 125228, "pid": 76337, "tid": -914061504, "ts": 1716454223376632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376740, "dur": 3, "args": { "External id": 125247, "device": 5, "context": 1, "stream": 7, "correlation": 125247, "bytes": 16, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 125247, "pid": 5, "tid": 7, "ts": 1716454223376740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376722, "dur": 18, "args": { "External id": 125247, "cbid": 41, "correlation": 125247 } }, { "ph": "s", "id": 125247, "pid": 76337, "tid": -914061504, "ts": 1716454223376722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223376777, "dur": 3, "args": { "External id": 125253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125253, "pid": 5, "tid": 7, "ts": 1716454223376777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376766, "dur": 11, "args": { "External id": 125253, "cbid": 211, "correlation": 125253 } }, { "ph": "s", "id": 125253, "pid": 76337, "tid": -914061504, "ts": 1716454223376766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454223376791, "dur": 6, "args": { "External id": 125255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125255, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 125255, "pid": 5, "tid": 7, "ts": 1716454223376791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376781, "dur": 10, "args": { "External id": 125255, "cbid": 211, "correlation": 125255 } }, { "ph": "s", "id": 125255, "pid": 76337, "tid": -914061504, "ts": 1716454223376781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454223376800, "dur": 3, "args": { "External id": 125257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125257, "pid": 5, "tid": 7, "ts": 1716454223376800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376792, "dur": 7, "args": { "External id": 125257, "cbid": 211, "correlation": 125257 } }, { "ph": "s", "id": 125257, "pid": 76337, "tid": -914061504, "ts": 1716454223376792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376835, "dur": 2, "args": { "External id": 125265, "device": 5, "context": 1, "stream": 7, "correlation": 125265, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 125265, "pid": 5, "tid": 7, "ts": 1716454223376835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376821, "dur": 13, "args": { "External id": 125265, "cbid": 41, "correlation": 125265 } }, { "ph": "s", "id": 125265, "pid": 76337, "tid": -914061504, "ts": 1716454223376821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223376883, "dur": 3, "args": { "External id": 125279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125279, "pid": 5, "tid": 7, "ts": 1716454223376883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376872, "dur": 12, "args": { "External id": 125279, "cbid": 211, "correlation": 125279 } }, { "ph": "s", "id": 125279, "pid": 76337, "tid": -914061504, "ts": 1716454223376872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223376903, "dur": 2, "args": { "External id": 125293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125293, "pid": 5, "tid": 7, "ts": 1716454223376903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376895, "dur": 6, "args": { "External id": 125293, "cbid": 211, "correlation": 125293 } }, { "ph": "s", "id": 125293, "pid": 76337, "tid": -914061504, "ts": 1716454223376895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223376939, "dur": 6, "args": { "External id": 125300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125300, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125300, "pid": 5, "tid": 7, "ts": 1716454223376939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376929, "dur": 10, "args": { "External id": 125300, "cbid": 211, "correlation": 125300 } }, { "ph": "s", "id": 125300, "pid": 76337, "tid": -914061504, "ts": 1716454223376929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223376949, "dur": 6, "args": { "External id": 125303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125303, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125303, "pid": 5, "tid": 7, "ts": 1716454223376949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376942, "dur": 7, "args": { "External id": 125303, "cbid": 211, "correlation": 125303 } }, { "ph": "s", "id": 125303, "pid": 76337, "tid": -914061504, "ts": 1716454223376942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454223376958, "dur": 3, "args": { "External id": 125305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125305, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125305, "pid": 5, "tid": 7, "ts": 1716454223376958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223376950, "dur": 7, "args": { "External id": 125305, "cbid": 211, "correlation": 125305 } }, { "ph": "s", "id": 125305, "pid": 76337, "tid": -914061504, "ts": 1716454223376950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223376984, "dur": 2, "args": { "External id": 125308, "device": 5, "context": 1, "stream": 7, "correlation": 125308, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 125308, "pid": 5, "tid": 7, "ts": 1716454223376984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223376965, "dur": 19, "args": { "External id": 125308, "cbid": 41, "correlation": 125308 } }, { "ph": "s", "id": 125308, "pid": 76337, "tid": -914061504, "ts": 1716454223376965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223377039, "dur": 4, "args": { "External id": 125324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125324, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125324, "pid": 5, "tid": 7, "ts": 1716454223377039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377026, "dur": 14, "args": { "External id": 125324, "cbid": 211, "correlation": 125324 } }, { "ph": "s", "id": 125324, "pid": 76337, "tid": -914061504, "ts": 1716454223377026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223377060, "dur": 3, "args": { "External id": 125329, "device": 5, "context": 1, "stream": 7, "correlation": 125329, "bytes": 1, "memory bandwidth (GB/s)": 0.0003156565656565657 } }, { "ph": "f", "id": 125329, "pid": 5, "tid": 7, "ts": 1716454223377060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223377045, "dur": 15, "args": { "External id": 125329, "cbid": 41, "correlation": 125329 } }, { "ph": "s", "id": 125329, "pid": 76337, "tid": -914061504, "ts": 1716454223377045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223377088, "dur": 1, "args": { "External id": 125335, "device": 5, "context": 1, "stream": 7, "correlation": 125335, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 125335, "pid": 5, "tid": 7, "ts": 1716454223377088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223377068, "dur": 29, "args": { "External id": 125335, "cbid": 41, "correlation": 125335 } }, { "ph": "s", "id": 125335, "pid": 76337, "tid": -914061504, "ts": 1716454223377068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223377098, "dur": 4, "args": { "External id": 125336, "cbid": 131, "correlation": 125336 } }, { "ph": "f", "id": 125336, "pid": 76337, "tid": -914061504, "ts": 1716454223377098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377151, "dur": 3, "args": { "External id": 125344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125344, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125344, "pid": 5, "tid": 7, "ts": 1716454223377151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377137, "dur": 14, "args": { "External id": 125344, "cbid": 211, "correlation": 125344 } }, { "ph": "s", "id": 125344, "pid": 76337, "tid": -914061504, "ts": 1716454223377137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377182, "dur": 3, "args": { "External id": 125354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125354, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125354, "pid": 5, "tid": 7, "ts": 1716454223377182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377172, "dur": 9, "args": { "External id": 125354, "cbid": 211, "correlation": 125354 } }, { "ph": "s", "id": 125354, "pid": 76337, "tid": -914061504, "ts": 1716454223377172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377212, "dur": 3, "args": { "External id": 125363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125363, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125363, "pid": 5, "tid": 7, "ts": 1716454223377212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377198, "dur": 13, "args": { "External id": 125363, "cbid": 211, "correlation": 125363 } }, { "ph": "s", "id": 125363, "pid": 76337, "tid": -914061504, "ts": 1716454223377198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223377329, "dur": 12, "args": { "External id": 125373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125373, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125373, "pid": 5, "tid": 7, "ts": 1716454223377329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377315, "dur": 15, "args": { "External id": 125373, "cbid": 211, "correlation": 125373 } }, { "ph": "s", "id": 125373, "pid": 76337, "tid": -914061504, "ts": 1716454223377315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377368, "dur": 3, "args": { "External id": 125381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125381, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125381, "pid": 5, "tid": 7, "ts": 1716454223377368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377359, "dur": 8, "args": { "External id": 125381, "cbid": 211, "correlation": 125381 } }, { "ph": "s", "id": 125381, "pid": 76337, "tid": -914061504, "ts": 1716454223377359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223377413, "dur": 12, "args": { "External id": 125391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125391, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125391, "pid": 5, "tid": 7, "ts": 1716454223377413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377402, "dur": 11, "args": { "External id": 125391, "cbid": 211, "correlation": 125391 } }, { "ph": "s", "id": 125391, "pid": 76337, "tid": -914061504, "ts": 1716454223377402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223377445, "dur": 10, "args": { "External id": 125399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125399, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125399, "pid": 5, "tid": 7, "ts": 1716454223377445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377436, "dur": 9, "args": { "External id": 125399, "cbid": 211, "correlation": 125399 } }, { "ph": "s", "id": 125399, "pid": 76337, "tid": -914061504, "ts": 1716454223377436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377472, "dur": 3, "args": { "External id": 125408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125408, "pid": 5, "tid": 7, "ts": 1716454223377472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377463, "dur": 9, "args": { "External id": 125408, "cbid": 211, "correlation": 125408 } }, { "ph": "s", "id": 125408, "pid": 76337, "tid": -914061504, "ts": 1716454223377463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223377495, "dur": 5, "args": { "External id": 125417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125417, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125417, "pid": 5, "tid": 7, "ts": 1716454223377495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377487, "dur": 7, "args": { "External id": 125417, "cbid": 211, "correlation": 125417 } }, { "ph": "s", "id": 125417, "pid": 76337, "tid": -914061504, "ts": 1716454223377487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223377534, "dur": 8, "args": { "External id": 125427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125427, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125427, "pid": 5, "tid": 7, "ts": 1716454223377534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377523, "dur": 11, "args": { "External id": 125427, "cbid": 211, "correlation": 125427 } }, { "ph": "s", "id": 125427, "pid": 76337, "tid": -914061504, "ts": 1716454223377523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377849, "dur": 3, "args": { "External id": 125436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125436, "pid": 5, "tid": 7, "ts": 1716454223377849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377834, "dur": 15, "args": { "External id": 125436, "cbid": 211, "correlation": 125436 } }, { "ph": "s", "id": 125436, "pid": 76337, "tid": -914061504, "ts": 1716454223377834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223377878, "dur": 3, "args": { "External id": 125444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125444, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125444, "pid": 5, "tid": 7, "ts": 1716454223377878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223377867, "dur": 10, "args": { "External id": 125444, "cbid": 211, "correlation": 125444 } }, { "ph": "s", "id": 125444, "pid": 76337, "tid": -914061504, "ts": 1716454223377867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223377930, "dur": 1, "args": { "External id": 125454, "device": 5, "context": 1, "stream": 7, "correlation": 125454, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 125454, "pid": 5, "tid": 7, "ts": 1716454223377930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223377914, "dur": 14, "args": { "External id": 125454, "cbid": 41, "correlation": 125454 } }, { "ph": "s", "id": 125454, "pid": 76337, "tid": -914061504, "ts": 1716454223377914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223377929, "dur": 8, "args": { "External id": 125455, "cbid": 131, "correlation": 125455 } }, { "ph": "f", "id": 125455, "pid": 76337, "tid": -914061504, "ts": 1716454223377929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378028, "dur": 2, "args": { "External id": 125463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125463, "pid": 5, "tid": 7, "ts": 1716454223378028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378012, "dur": 16, "args": { "External id": 125463, "cbid": 211, "correlation": 125463 } }, { "ph": "s", "id": 125463, "pid": 76337, "tid": -914061504, "ts": 1716454223378012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378101, "dur": 3, "args": { "External id": 125472, "device": 5, "context": 1, "stream": 7, "correlation": 125472, "bytes": 8, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 125472, "pid": 5, "tid": 7, "ts": 1716454223378101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378083, "dur": 18, "args": { "External id": 125472, "cbid": 41, "correlation": 125472 } }, { "ph": "s", "id": 125472, "pid": 76337, "tid": -914061504, "ts": 1716454223378083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223378172, "dur": 3, "args": { "External id": 125482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125482, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125482, "pid": 5, "tid": 7, "ts": 1716454223378172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378158, "dur": 15, "args": { "External id": 125482, "cbid": 211, "correlation": 125482 } }, { "ph": "s", "id": 125482, "pid": 76337, "tid": -914061504, "ts": 1716454223378158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378225, "dur": 1, "args": { "External id": 125492, "device": 5, "context": 1, "stream": 7, "correlation": 125492, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 125492, "pid": 5, "tid": 7, "ts": 1716454223378225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378210, "dur": 13, "args": { "External id": 125492, "cbid": 41, "correlation": 125492 } }, { "ph": "s", "id": 125492, "pid": 76337, "tid": -914061504, "ts": 1716454223378210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378224, "dur": 7, "args": { "External id": 125493, "cbid": 131, "correlation": 125493 } }, { "ph": "f", "id": 125493, "pid": 76337, "tid": -914061504, "ts": 1716454223378224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378285, "dur": 3, "args": { "External id": 125500, "device": 5, "context": 1, "stream": 7, "correlation": 125500, "bytes": 98304, "memory bandwidth (GB/s)": 31.670103092783506 } }, { "ph": "f", "id": 125500, "pid": 5, "tid": 7, "ts": 1716454223378285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378264, "dur": 20, "args": { "External id": 125500, "cbid": 41, "correlation": 125500 } }, { "ph": "s", "id": 125500, "pid": 76337, "tid": -914061504, "ts": 1716454223378264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378332, "dur": 1, "args": { "External id": 125511, "device": 5, "context": 1, "stream": 7, "correlation": 125511, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 125511, "pid": 5, "tid": 7, "ts": 1716454223378332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378319, "dur": 11, "args": { "External id": 125511, "cbid": 41, "correlation": 125511 } }, { "ph": "s", "id": 125511, "pid": 76337, "tid": -914061504, "ts": 1716454223378319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378331, "dur": 8, "args": { "External id": 125512, "cbid": 131, "correlation": 125512 } }, { "ph": "f", "id": 125512, "pid": 76337, "tid": -914061504, "ts": 1716454223378331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378381, "dur": 3, "args": { "External id": 125520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125520, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125520, "pid": 5, "tid": 7, "ts": 1716454223378381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378367, "dur": 14, "args": { "External id": 125520, "cbid": 211, "correlation": 125520 } }, { "ph": "s", "id": 125520, "pid": 76337, "tid": -914061504, "ts": 1716454223378367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378410, "dur": 3, "args": { "External id": 125530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125530, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125530, "pid": 5, "tid": 7, "ts": 1716454223378410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378401, "dur": 8, "args": { "External id": 125530, "cbid": 211, "correlation": 125530 } }, { "ph": "s", "id": 125530, "pid": 76337, "tid": -914061504, "ts": 1716454223378401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378433, "dur": 3, "args": { "External id": 125539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125539, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125539, "pid": 5, "tid": 7, "ts": 1716454223378433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378424, "dur": 8, "args": { "External id": 125539, "cbid": 211, "correlation": 125539 } }, { "ph": "s", "id": 125539, "pid": 76337, "tid": -914061504, "ts": 1716454223378424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223378500, "dur": 5, "args": { "External id": 125547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125547, "pid": 5, "tid": 7, "ts": 1716454223378500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378487, "dur": 13, "args": { "External id": 125547, "cbid": 211, "correlation": 125547 } }, { "ph": "s", "id": 125547, "pid": 76337, "tid": -914061504, "ts": 1716454223378487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378541, "dur": 3, "args": { "External id": 125556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125556, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125556, "pid": 5, "tid": 7, "ts": 1716454223378541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378531, "dur": 9, "args": { "External id": 125556, "cbid": 211, "correlation": 125556 } }, { "ph": "s", "id": 125556, "pid": 76337, "tid": -914061504, "ts": 1716454223378531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378564, "dur": 3, "args": { "External id": 125565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125565, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125565, "pid": 5, "tid": 7, "ts": 1716454223378564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378555, "dur": 7, "args": { "External id": 125565, "cbid": 211, "correlation": 125565 } }, { "ph": "s", "id": 125565, "pid": 76337, "tid": -914061504, "ts": 1716454223378555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223378625, "dur": 3, "args": { "External id": 125573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125573, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125573, "pid": 5, "tid": 7, "ts": 1716454223378625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378614, "dur": 10, "args": { "External id": 125573, "cbid": 211, "correlation": 125573 } }, { "ph": "s", "id": 125573, "pid": 76337, "tid": -914061504, "ts": 1716454223378614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223378683, "dur": 1, "args": { "External id": 125581, "device": 5, "context": 1, "stream": 7, "correlation": 125581, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 125581, "pid": 5, "tid": 7, "ts": 1716454223378683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378667, "dur": 26, "args": { "External id": 125581, "cbid": 41, "correlation": 125581 } }, { "ph": "s", "id": 125581, "pid": 76337, "tid": -914061504, "ts": 1716454223378667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378694, "dur": 3, "args": { "External id": 125582, "cbid": 131, "correlation": 125582 } }, { "ph": "f", "id": 125582, "pid": 76337, "tid": -914061504, "ts": 1716454223378694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378755, "dur": 1, "args": { "External id": 125592, "device": 5, "context": 1, "stream": 7, "correlation": 125592, "bytes": 42, "memory bandwidth (GB/s)": 0.027925531914893616 } }, { "ph": "f", "id": 125592, "pid": 5, "tid": 7, "ts": 1716454223378755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378742, "dur": 10, "args": { "External id": 125592, "cbid": 41, "correlation": 125592 } }, { "ph": "s", "id": 125592, "pid": 76337, "tid": -914061504, "ts": 1716454223378742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378754, "dur": 8, "args": { "External id": 125593, "cbid": 131, "correlation": 125593 } }, { "ph": "f", "id": 125593, "pid": 76337, "tid": -914061504, "ts": 1716454223378754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223378809, "dur": 1, "args": { "External id": 125602, "device": 5, "context": 1, "stream": 7, "correlation": 125602, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 125602, "pid": 5, "tid": 7, "ts": 1716454223378809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378798, "dur": 8, "args": { "External id": 125602, "cbid": 41, "correlation": 125602 } }, { "ph": "s", "id": 125602, "pid": 76337, "tid": -914061504, "ts": 1716454223378798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378807, "dur": 8, "args": { "External id": 125603, "cbid": 131, "correlation": 125603 } }, { "ph": "f", "id": 125603, "pid": 76337, "tid": -914061504, "ts": 1716454223378807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223378880, "dur": 4, "args": { "External id": 125610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125610, "pid": 5, "tid": 7, "ts": 1716454223378880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378864, "dur": 17, "args": { "External id": 125610, "cbid": 211, "correlation": 125610 } }, { "ph": "s", "id": 125610, "pid": 76337, "tid": -914061504, "ts": 1716454223378864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454223378918, "dur": 4, "args": { "External id": 125630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125630, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125630, "pid": 5, "tid": 7, "ts": 1716454223378918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378906, "dur": 13, "args": { "External id": 125630, "cbid": 211, "correlation": 125630 } }, { "ph": "s", "id": 125630, "pid": 76337, "tid": -914061504, "ts": 1716454223378906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378920, "dur": 0, "args": { "External id": 125631, "cbid": 11, "correlation": 125631 } }, { "ph": "f", "id": 125631, "pid": 76337, "tid": -914061504, "ts": 1716454223378920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378921, "dur": 0, "args": { "External id": 125632, "cbid": 11, "correlation": 125632 } }, { "ph": "f", "id": 125632, "pid": 76337, "tid": -914061504, "ts": 1716454223378921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223378934, "dur": 1, "args": { "External id": 125635, "device": 5, "context": 1, "stream": 7, "correlation": 125635, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 125635, "pid": 5, "tid": 7, "ts": 1716454223378934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223378922, "dur": 21, "args": { "External id": 125635, "cbid": 41, "correlation": 125635 } }, { "ph": "s", "id": 125635, "pid": 76337, "tid": -914061504, "ts": 1716454223378922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223378944, "dur": 3, "args": { "External id": 125636, "cbid": 131, "correlation": 125636 } }, { "ph": "f", "id": 125636, "pid": 76337, "tid": -914061504, "ts": 1716454223378944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223378971, "dur": 3, "args": { "External id": 125660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125660, "pid": 5, "tid": 7, "ts": 1716454223378971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378961, "dur": 9, "args": { "External id": 125660, "cbid": 211, "correlation": 125660 } }, { "ph": "s", "id": 125660, "pid": 76337, "tid": -914061504, "ts": 1716454223378961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378971, "dur": 0, "args": { "External id": 125661, "cbid": 11, "correlation": 125661 } }, { "ph": "f", "id": 125661, "pid": 76337, "tid": -914061504, "ts": 1716454223378971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378972, "dur": 0, "args": { "External id": 125662, "cbid": 11, "correlation": 125662 } }, { "ph": "f", "id": 125662, "pid": 76337, "tid": -914061504, "ts": 1716454223378972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223378980, "dur": 1, "args": { "External id": 125664, "cbid": 200, "correlation": 125664 } }, { "ph": "f", "id": 125664, "pid": 76337, "tid": -914061504, "ts": 1716454223378980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454223378991, "dur": 4, "args": { "External id": 125666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125666, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125666, "pid": 5, "tid": 7, "ts": 1716454223378991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223378983, "dur": 8, "args": { "External id": 125666, "cbid": 211, "correlation": 125666 } }, { "ph": "s", "id": 125666, "pid": 76337, "tid": -914061504, "ts": 1716454223378983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378992, "dur": 0, "args": { "External id": 125667, "cbid": 11, "correlation": 125667 } }, { "ph": "f", "id": 125667, "pid": 76337, "tid": -914061504, "ts": 1716454223378992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223378993, "dur": 0, "args": { "External id": 125668, "cbid": 11, "correlation": 125668 } }, { "ph": "f", "id": 125668, "pid": 76337, "tid": -914061504, "ts": 1716454223378993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223379032, "dur": 1, "args": { "External id": 125675, "device": 5, "context": 1, "stream": 7, "correlation": 125675, "bytes": 8, "memory bandwidth (GB/s)": 0.004901960784313725 } }, { "ph": "f", "id": 125675, "pid": 5, "tid": 7, "ts": 1716454223379032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223379020, "dur": 20, "args": { "External id": 125675, "cbid": 41, "correlation": 125675 } }, { "ph": "s", "id": 125675, "pid": 76337, "tid": -914061504, "ts": 1716454223379020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223379041, "dur": 3, "args": { "External id": 125676, "cbid": 131, "correlation": 125676 } }, { "ph": "f", "id": 125676, "pid": 76337, "tid": -914061504, "ts": 1716454223379041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223379092, "dur": 1, "args": { "External id": 125686, "device": 5, "context": 1, "stream": 7, "correlation": 125686, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 125686, "pid": 5, "tid": 7, "ts": 1716454223379092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223379080, "dur": 10, "args": { "External id": 125686, "cbid": 41, "correlation": 125686 } }, { "ph": "s", "id": 125686, "pid": 76337, "tid": -914061504, "ts": 1716454223379080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223379090, "dur": 7, "args": { "External id": 125687, "cbid": 131, "correlation": 125687 } }, { "ph": "f", "id": 125687, "pid": 76337, "tid": -914061504, "ts": 1716454223379090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223379161, "dur": 5, "args": { "External id": 125694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125694, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125694, "pid": 5, "tid": 7, "ts": 1716454223379161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379145, "dur": 17, "args": { "External id": 125694, "cbid": 211, "correlation": 125694 } }, { "ph": "s", "id": 125694, "pid": 76337, "tid": -914061504, "ts": 1716454223379145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379231, "dur": 3, "args": { "External id": 125703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125703, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125703, "pid": 5, "tid": 7, "ts": 1716454223379231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379218, "dur": 13, "args": { "External id": 125703, "cbid": 211, "correlation": 125703 } }, { "ph": "s", "id": 125703, "pid": 76337, "tid": -914061504, "ts": 1716454223379218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379266, "dur": 3, "args": { "External id": 125711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125711, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125711, "pid": 5, "tid": 7, "ts": 1716454223379266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379256, "dur": 10, "args": { "External id": 125711, "cbid": 211, "correlation": 125711 } }, { "ph": "s", "id": 125711, "pid": 76337, "tid": -914061504, "ts": 1716454223379256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379300, "dur": 4, "args": { "External id": 125719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125719, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125719, "pid": 5, "tid": 7, "ts": 1716454223379300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379287, "dur": 13, "args": { "External id": 125719, "cbid": 211, "correlation": 125719 } }, { "ph": "s", "id": 125719, "pid": 76337, "tid": -914061504, "ts": 1716454223379287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379330, "dur": 4, "args": { "External id": 125727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125727, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125727, "pid": 5, "tid": 7, "ts": 1716454223379330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379319, "dur": 9, "args": { "External id": 125727, "cbid": 211, "correlation": 125727 } }, { "ph": "s", "id": 125727, "pid": 76337, "tid": -914061504, "ts": 1716454223379319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379355, "dur": 3, "args": { "External id": 125735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125735, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125735, "pid": 5, "tid": 7, "ts": 1716454223379355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379346, "dur": 8, "args": { "External id": 125735, "cbid": 211, "correlation": 125735 } }, { "ph": "s", "id": 125735, "pid": 76337, "tid": -914061504, "ts": 1716454223379346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379380, "dur": 3, "args": { "External id": 125743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125743, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125743, "pid": 5, "tid": 7, "ts": 1716454223379380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379371, "dur": 8, "args": { "External id": 125743, "cbid": 211, "correlation": 125743 } }, { "ph": "s", "id": 125743, "pid": 76337, "tid": -914061504, "ts": 1716454223379371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223379402, "dur": 4, "args": { "External id": 125751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125751, "pid": 5, "tid": 7, "ts": 1716454223379402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379393, "dur": 8, "args": { "External id": 125751, "cbid": 211, "correlation": 125751 } }, { "ph": "s", "id": 125751, "pid": 76337, "tid": -914061504, "ts": 1716454223379393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223379421, "dur": 5, "args": { "External id": 125759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125759, "pid": 5, "tid": 7, "ts": 1716454223379421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379413, "dur": 6, "args": { "External id": 125759, "cbid": 211, "correlation": 125759 } }, { "ph": "s", "id": 125759, "pid": 76337, "tid": -914061504, "ts": 1716454223379413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379440, "dur": 3, "args": { "External id": 125767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125767, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125767, "pid": 5, "tid": 7, "ts": 1716454223379440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379432, "dur": 6, "args": { "External id": 125767, "cbid": 211, "correlation": 125767 } }, { "ph": "s", "id": 125767, "pid": 76337, "tid": -914061504, "ts": 1716454223379432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379499, "dur": 3, "args": { "External id": 125775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125775, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 125775, "pid": 5, "tid": 7, "ts": 1716454223379499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379487, "dur": 12, "args": { "External id": 125775, "cbid": 211, "correlation": 125775 } }, { "ph": "s", "id": 125775, "pid": 76337, "tid": -914061504, "ts": 1716454223379487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223379524, "dur": 4, "args": { "External id": 125783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125783, "pid": 5, "tid": 7, "ts": 1716454223379524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379515, "dur": 8, "args": { "External id": 125783, "cbid": 211, "correlation": 125783 } }, { "ph": "s", "id": 125783, "pid": 76337, "tid": -914061504, "ts": 1716454223379515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223379546, "dur": 4, "args": { "External id": 125791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125791, "pid": 5, "tid": 7, "ts": 1716454223379546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379537, "dur": 8, "args": { "External id": 125791, "cbid": 211, "correlation": 125791 } }, { "ph": "s", "id": 125791, "pid": 76337, "tid": -914061504, "ts": 1716454223379537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223379566, "dur": 3, "args": { "External id": 125799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125799, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 125799, "pid": 5, "tid": 7, "ts": 1716454223379566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379558, "dur": 6, "args": { "External id": 125799, "cbid": 211, "correlation": 125799 } }, { "ph": "s", "id": 125799, "pid": 76337, "tid": -914061504, "ts": 1716454223379558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223379963, "dur": 5, "args": { "External id": 125808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125808, "pid": 5, "tid": 7, "ts": 1716454223379963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379947, "dur": 17, "args": { "External id": 125808, "cbid": 211, "correlation": 125808 } }, { "ph": "s", "id": 125808, "pid": 76337, "tid": -914061504, "ts": 1716454223379947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223380008, "dur": 5, "args": { "External id": 125817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125817, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125817, "pid": 5, "tid": 7, "ts": 1716454223380008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223379997, "dur": 10, "args": { "External id": 125817, "cbid": 211, "correlation": 125817 } }, { "ph": "s", "id": 125817, "pid": 76337, "tid": -914061504, "ts": 1716454223379997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223380135, "dur": 3, "args": { "External id": 125833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125833, "pid": 5, "tid": 7, "ts": 1716454223380135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380121, "dur": 14, "args": { "External id": 125833, "cbid": 211, "correlation": 125833 } }, { "ph": "s", "id": 125833, "pid": 76337, "tid": -914061504, "ts": 1716454223380121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380168, "dur": 3, "args": { "External id": 125841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125841, "pid": 5, "tid": 7, "ts": 1716454223380168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380159, "dur": 9, "args": { "External id": 125841, "cbid": 211, "correlation": 125841 } }, { "ph": "s", "id": 125841, "pid": 76337, "tid": -914061504, "ts": 1716454223380159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380200, "dur": 3, "args": { "External id": 125849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125849, "pid": 5, "tid": 7, "ts": 1716454223380200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380190, "dur": 8, "args": { "External id": 125849, "cbid": 211, "correlation": 125849 } }, { "ph": "s", "id": 125849, "pid": 76337, "tid": -914061504, "ts": 1716454223380190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380231, "dur": 4, "args": { "External id": 125857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125857, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125857, "pid": 5, "tid": 7, "ts": 1716454223380231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380221, "dur": 9, "args": { "External id": 125857, "cbid": 211, "correlation": 125857 } }, { "ph": "s", "id": 125857, "pid": 76337, "tid": -914061504, "ts": 1716454223380221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223380287, "dur": 4, "args": { "External id": 125869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125869, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125869, "pid": 5, "tid": 7, "ts": 1716454223380287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380274, "dur": 13, "args": { "External id": 125869, "cbid": 211, "correlation": 125869 } }, { "ph": "s", "id": 125869, "pid": 76337, "tid": -914061504, "ts": 1716454223380274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223380332, "dur": 4, "args": { "External id": 125880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125880, "pid": 5, "tid": 7, "ts": 1716454223380332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380320, "dur": 12, "args": { "External id": 125880, "cbid": 211, "correlation": 125880 } }, { "ph": "s", "id": 125880, "pid": 76337, "tid": -914061504, "ts": 1716454223380320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380363, "dur": 3, "args": { "External id": 125888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125888, "pid": 5, "tid": 7, "ts": 1716454223380363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380353, "dur": 8, "args": { "External id": 125888, "cbid": 211, "correlation": 125888 } }, { "ph": "s", "id": 125888, "pid": 76337, "tid": -914061504, "ts": 1716454223380353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380397, "dur": 5, "args": { "External id": 125896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125896, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125896, "pid": 5, "tid": 7, "ts": 1716454223380397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380387, "dur": 10, "args": { "External id": 125896, "cbid": 211, "correlation": 125896 } }, { "ph": "s", "id": 125896, "pid": 76337, "tid": -914061504, "ts": 1716454223380387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380427, "dur": 5, "args": { "External id": 125904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125904, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125904, "pid": 5, "tid": 7, "ts": 1716454223380427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380416, "dur": 10, "args": { "External id": 125904, "cbid": 211, "correlation": 125904 } }, { "ph": "s", "id": 125904, "pid": 76337, "tid": -914061504, "ts": 1716454223380416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223380457, "dur": 4, "args": { "External id": 125913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125913, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125913, "pid": 5, "tid": 7, "ts": 1716454223380457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380447, "dur": 9, "args": { "External id": 125913, "cbid": 211, "correlation": 125913 } }, { "ph": "s", "id": 125913, "pid": 76337, "tid": -914061504, "ts": 1716454223380447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223380518, "dur": 5, "args": { "External id": 125926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125926, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 125926, "pid": 5, "tid": 7, "ts": 1716454223380518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380505, "dur": 14, "args": { "External id": 125926, "cbid": 211, "correlation": 125926 } }, { "ph": "s", "id": 125926, "pid": 76337, "tid": -914061504, "ts": 1716454223380505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223380559, "dur": 5, "args": { "External id": 125936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125936, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 125936, "pid": 5, "tid": 7, "ts": 1716454223380559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380548, "dur": 10, "args": { "External id": 125936, "cbid": 211, "correlation": 125936 } }, { "ph": "s", "id": 125936, "pid": 76337, "tid": -914061504, "ts": 1716454223380548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223380686, "dur": 5, "args": { "External id": 125953, "cbid": 251, "correlation": 125953 } }, { "ph": "f", "id": 125953, "pid": 76337, "tid": -914061504, "ts": 1716454223380686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454223380717, "dur": 12, "args": { "External id": 125955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125955, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 125955, "pid": 5, "tid": 7, "ts": 1716454223380717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380700, "dur": 18, "args": { "External id": 125955, "cbid": 211, "correlation": 125955 } }, { "ph": "s", "id": 125955, "pid": 76337, "tid": -914061504, "ts": 1716454223380700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223380776, "dur": 3, "args": { "External id": 125963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125963, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 125963, "pid": 5, "tid": 7, "ts": 1716454223380776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380764, "dur": 12, "args": { "External id": 125963, "cbid": 211, "correlation": 125963 } }, { "ph": "s", "id": 125963, "pid": 76337, "tid": -914061504, "ts": 1716454223380764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223380835, "dur": 2, "args": { "External id": 125979, "cbid": 251, "correlation": 125979 } }, { "ph": "f", "id": 125979, "pid": 76337, "tid": -914061504, "ts": 1716454223380835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223380841, "dur": 0, "args": { "External id": 125981, "cbid": 251, "correlation": 125981 } }, { "ph": "f", "id": 125981, "pid": 76337, "tid": -914061504, "ts": 1716454223380841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223380857, "dur": 14, "args": { "External id": 125982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125982, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 125982, "pid": 5, "tid": 7, "ts": 1716454223380857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380844, "dur": 14, "args": { "External id": 125982, "cbid": 211, "correlation": 125982 } }, { "ph": "s", "id": 125982, "pid": 76337, "tid": -914061504, "ts": 1716454223380844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223380873, "dur": 5, "args": { "External id": 125984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 125984, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 125984, "pid": 5, "tid": 7, "ts": 1716454223380873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223380862, "dur": 9, "args": { "External id": 125984, "cbid": 211, "correlation": 125984 } }, { "ph": "s", "id": 125984, "pid": 76337, "tid": -914061504, "ts": 1716454223380862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223380982, "dur": 1, "args": { "External id": 125994, "cbid": 317, "correlation": 125994 } }, { "ph": "f", "id": 125994, "pid": 76337, "tid": -914061504, "ts": 1716454223380982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223380983, "dur": 1, "args": { "External id": 125995, "cbid": 203, "correlation": 125995 } }, { "ph": "f", "id": 125995, "pid": 76337, "tid": -914061504, "ts": 1716454223380983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223380985, "dur": 1, "args": { "External id": 125996, "cbid": 205, "correlation": 125996 } }, { "ph": "f", "id": 125996, "pid": 76337, "tid": -914061504, "ts": 1716454223380985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223381043, "dur": 6, "args": { "External id": 126000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126000, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126000, "pid": 5, "tid": 7, "ts": 1716454223381043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223381027, "dur": 16, "args": { "External id": 126000, "cbid": 211, "correlation": 126000 } }, { "ph": "s", "id": 126000, "pid": 76337, "tid": -914061504, "ts": 1716454223381027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223381055, "dur": 4, "args": { "External id": 126002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126002, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 126002, "pid": 5, "tid": 7, "ts": 1716454223381055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223381047, "dur": 6, "args": { "External id": 126002, "cbid": 211, "correlation": 126002 } }, { "ph": "s", "id": 126002, "pid": 76337, "tid": -914061504, "ts": 1716454223381047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223381074, "dur": 3, "args": { "External id": 126004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126004, "pid": 5, "tid": 7, "ts": 1716454223381074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223381065, "dur": 8, "args": { "External id": 126004, "cbid": 211, "correlation": 126004 } }, { "ph": "s", "id": 126004, "pid": 76337, "tid": -914061504, "ts": 1716454223381065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223381080, "dur": 1, "args": { "External id": 126005, "cbid": 51, "correlation": 126005 } }, { "ph": "s", "id": 126005, "pid": 76337, "tid": -914061504, "ts": 1716454223381080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223381091, "dur": 84, "args": { "External id": 126006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126006, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126006, "pid": 5, "tid": 7, "ts": 1716454223381091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223381082, "dur": 8, "args": { "External id": 126006, "cbid": 211, "correlation": 126006 } }, { "ph": "s", "id": 126006, "pid": 76337, "tid": -914061504, "ts": 1716454223381082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223381177, "dur": 60, "args": { "External id": 126011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126011, "pid": 5, "tid": 7, "ts": 1716454223381177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223381120, "dur": 11, "args": { "External id": 126011, "cbid": 211, "correlation": 126011 } }, { "ph": "s", "id": 126011, "pid": 76337, "tid": -914061504, "ts": 1716454223381120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223382959, "dur": 51, "args": { "External id": 126031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126031, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126031, "pid": 5, "tid": 7, "ts": 1716454223382959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223382943, "dur": 16, "args": { "External id": 126031, "cbid": 211, "correlation": 126031 } }, { "ph": "s", "id": 126031, "pid": 76337, "tid": -914061504, "ts": 1716454223382943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223383011, "dur": 5, "args": { "External id": 126043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126043, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126043, "pid": 5, "tid": 7, "ts": 1716454223383011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223382970, "dur": 16, "args": { "External id": 126043, "cbid": 211, "correlation": 126043 } }, { "ph": "s", "id": 126043, "pid": 76337, "tid": -914061504, "ts": 1716454223382970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223383017, "dur": 58, "args": { "External id": 126046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126046, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126046, "pid": 5, "tid": 7, "ts": 1716454223383017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383002, "dur": 7, "args": { "External id": 126046, "cbid": 211, "correlation": 126046 } }, { "ph": "s", "id": 126046, "pid": 76337, "tid": -914061504, "ts": 1716454223383002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223383076, "dur": 36, "args": { "External id": 126055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126055, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126055, "pid": 5, "tid": 7, "ts": 1716454223383076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383047, "dur": 10, "args": { "External id": 126055, "cbid": 211, "correlation": 126055 } }, { "ph": "s", "id": 126055, "pid": 76337, "tid": -914061504, "ts": 1716454223383047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223383105, "dur": 0, "args": { "External id": 126065, "cbid": 317, "correlation": 126065 } }, { "ph": "f", "id": 126065, "pid": 76337, "tid": -914061504, "ts": 1716454223383105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223383105, "dur": 0, "args": { "External id": 126066, "cbid": 203, "correlation": 126066 } }, { "ph": "f", "id": 126066, "pid": 76337, "tid": -914061504, "ts": 1716454223383105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223383106, "dur": 0, "args": { "External id": 126067, "cbid": 205, "correlation": 126067 } }, { "ph": "f", "id": 126067, "pid": 76337, "tid": -914061504, "ts": 1716454223383106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223383139, "dur": 40, "args": { "External id": 126071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126071, "pid": 5, "tid": 7, "ts": 1716454223383139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383124, "dur": 13, "args": { "External id": 126071, "cbid": 211, "correlation": 126071 } }, { "ph": "s", "id": 126071, "pid": 76337, "tid": -914061504, "ts": 1716454223383124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223383180, "dur": 14, "args": { "External id": 126073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126073, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126073, "pid": 5, "tid": 7, "ts": 1716454223383180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383140, "dur": 6, "args": { "External id": 126073, "cbid": 211, "correlation": 126073 } }, { "ph": "s", "id": 126073, "pid": 76337, "tid": -914061504, "ts": 1716454223383140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223383195, "dur": 3, "args": { "External id": 126075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126075, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126075, "pid": 5, "tid": 7, "ts": 1716454223383195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383152, "dur": 6, "args": { "External id": 126075, "cbid": 211, "correlation": 126075 } }, { "ph": "s", "id": 126075, "pid": 76337, "tid": -914061504, "ts": 1716454223383152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223383162, "dur": 0, "args": { "External id": 126076, "cbid": 51, "correlation": 126076 } }, { "ph": "s", "id": 126076, "pid": 76337, "tid": -914061504, "ts": 1716454223383162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223383200, "dur": 698, "args": { "External id": 126077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126077, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126077, "pid": 5, "tid": 7, "ts": 1716454223383200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383163, "dur": 7, "args": { "External id": 126077, "cbid": 211, "correlation": 126077 } }, { "ph": "s", "id": 126077, "pid": 76337, "tid": -914061504, "ts": 1716454223383163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223383900, "dur": 58, "args": { "External id": 126082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126082, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126082, "pid": 5, "tid": 7, "ts": 1716454223383900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383194, "dur": 8, "args": { "External id": 126082, "cbid": 211, "correlation": 126082 } }, { "ph": "s", "id": 126082, "pid": 76337, "tid": -914061504, "ts": 1716454223383194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223383960, "dur": 3, "args": { "External id": 126090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126090, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126090, "pid": 5, "tid": 7, "ts": 1716454223383960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383238, "dur": 10, "args": { "External id": 126090, "cbid": 211, "correlation": 126090 } }, { "ph": "s", "id": 126090, "pid": 76337, "tid": -914061504, "ts": 1716454223383238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383304, "dur": 2, "args": { "External id": 126106, "cbid": 251, "correlation": 126106 } }, { "ph": "f", "id": 126106, "pid": 76337, "tid": -914061504, "ts": 1716454223383304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383310, "dur": 0, "args": { "External id": 126108, "cbid": 251, "correlation": 126108 } }, { "ph": "f", "id": 126108, "pid": 76337, "tid": -914061504, "ts": 1716454223383310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223383964, "dur": 9, "args": { "External id": 126109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126109, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 126109, "pid": 5, "tid": 7, "ts": 1716454223383964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383312, "dur": 13, "args": { "External id": 126109, "cbid": 211, "correlation": 126109 } }, { "ph": "s", "id": 126109, "pid": 76337, "tid": -914061504, "ts": 1716454223383312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223383975, "dur": 4, "args": { "External id": 126111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126111, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 126111, "pid": 5, "tid": 7, "ts": 1716454223383975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383326, "dur": 6, "args": { "External id": 126111, "cbid": 211, "correlation": 126111 } }, { "ph": "s", "id": 126111, "pid": 76337, "tid": -914061504, "ts": 1716454223383326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223383980, "dur": 53, "args": { "External id": 126121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126121, "pid": 5, "tid": 7, "ts": 1716454223383980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383386, "dur": 11, "args": { "External id": 126121, "cbid": 211, "correlation": 126121 } }, { "ph": "s", "id": 126121, "pid": 76337, "tid": -914061504, "ts": 1716454223383386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223384035, "dur": 51, "args": { "External id": 126141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126141, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126141, "pid": 5, "tid": 7, "ts": 1716454223384035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383452, "dur": 11, "args": { "External id": 126141, "cbid": 211, "correlation": 126141 } }, { "ph": "s", "id": 126141, "pid": 76337, "tid": -914061504, "ts": 1716454223383452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223384087, "dur": 4, "args": { "External id": 126153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126153, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126153, "pid": 5, "tid": 7, "ts": 1716454223384087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383473, "dur": 7, "args": { "External id": 126153, "cbid": 211, "correlation": 126153 } }, { "ph": "s", "id": 126153, "pid": 76337, "tid": -914061504, "ts": 1716454223383473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223384092, "dur": 55, "args": { "External id": 126156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126156, "pid": 5, "tid": 7, "ts": 1716454223384092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383492, "dur": 6, "args": { "External id": 126156, "cbid": 211, "correlation": 126156 } }, { "ph": "s", "id": 126156, "pid": 76337, "tid": -914061504, "ts": 1716454223383492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223384149, "dur": 38, "args": { "External id": 126165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126165, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126165, "pid": 5, "tid": 7, "ts": 1716454223384149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383534, "dur": 10, "args": { "External id": 126165, "cbid": 211, "correlation": 126165 } }, { "ph": "s", "id": 126165, "pid": 76337, "tid": -914061504, "ts": 1716454223383534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223383604, "dur": 0, "args": { "External id": 126175, "cbid": 317, "correlation": 126175 } }, { "ph": "f", "id": 126175, "pid": 76337, "tid": -914061504, "ts": 1716454223383604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223383605, "dur": 0, "args": { "External id": 126176, "cbid": 203, "correlation": 126176 } }, { "ph": "f", "id": 126176, "pid": 76337, "tid": -914061504, "ts": 1716454223383605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223383606, "dur": 0, "args": { "External id": 126177, "cbid": 205, "correlation": 126177 } }, { "ph": "f", "id": 126177, "pid": 76337, "tid": -914061504, "ts": 1716454223383606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223384188, "dur": 40, "args": { "External id": 126181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126181, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126181, "pid": 5, "tid": 7, "ts": 1716454223384188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383619, "dur": 13, "args": { "External id": 126181, "cbid": 211, "correlation": 126181 } }, { "ph": "s", "id": 126181, "pid": 76337, "tid": -914061504, "ts": 1716454223383619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223384229, "dur": 14, "args": { "External id": 126183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126183, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126183, "pid": 5, "tid": 7, "ts": 1716454223384229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383634, "dur": 5, "args": { "External id": 126183, "cbid": 211, "correlation": 126183 } }, { "ph": "s", "id": 126183, "pid": 76337, "tid": -914061504, "ts": 1716454223383634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223384244, "dur": 3, "args": { "External id": 126185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126185, "pid": 5, "tid": 7, "ts": 1716454223384244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383643, "dur": 5, "args": { "External id": 126185, "cbid": 211, "correlation": 126185 } }, { "ph": "s", "id": 126185, "pid": 76337, "tid": -914061504, "ts": 1716454223383643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223383652, "dur": 0, "args": { "External id": 126186, "cbid": 51, "correlation": 126186 } }, { "ph": "s", "id": 126186, "pid": 76337, "tid": -914061504, "ts": 1716454223383652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223384249, "dur": 693, "args": { "External id": 126187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126187, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126187, "pid": 5, "tid": 7, "ts": 1716454223384249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383653, "dur": 5, "args": { "External id": 126187, "cbid": 211, "correlation": 126187 } }, { "ph": "s", "id": 126187, "pid": 76337, "tid": -914061504, "ts": 1716454223383653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223384944, "dur": 59, "args": { "External id": 126192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126192, "pid": 5, "tid": 7, "ts": 1716454223384944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383681, "dur": 9, "args": { "External id": 126192, "cbid": 211, "correlation": 126192 } }, { "ph": "s", "id": 126192, "pid": 76337, "tid": -914061504, "ts": 1716454223383681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223385004, "dur": 50, "args": { "External id": 126200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126200, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126200, "pid": 5, "tid": 7, "ts": 1716454223385004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383713, "dur": 8, "args": { "External id": 126200, "cbid": 211, "correlation": 126200 } }, { "ph": "s", "id": 126200, "pid": 76337, "tid": -914061504, "ts": 1716454223383713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223385055, "dur": 35, "args": { "External id": 126208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126208, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126208, "pid": 5, "tid": 7, "ts": 1716454223385055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383745, "dur": 10, "args": { "External id": 126208, "cbid": 211, "correlation": 126208 } }, { "ph": "s", "id": 126208, "pid": 76337, "tid": -914061504, "ts": 1716454223383745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223385092, "dur": 52, "args": { "External id": 126228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126228, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126228, "pid": 5, "tid": 7, "ts": 1716454223385092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383827, "dur": 13, "args": { "External id": 126228, "cbid": 211, "correlation": 126228 } }, { "ph": "s", "id": 126228, "pid": 76337, "tid": -914061504, "ts": 1716454223383827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223385145, "dur": 4, "args": { "External id": 126240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126240, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126240, "pid": 5, "tid": 7, "ts": 1716454223385145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383849, "dur": 6, "args": { "External id": 126240, "cbid": 211, "correlation": 126240 } }, { "ph": "s", "id": 126240, "pid": 76337, "tid": -914061504, "ts": 1716454223383849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223385150, "dur": 55, "args": { "External id": 126243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126243, "pid": 5, "tid": 7, "ts": 1716454223385150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383866, "dur": 6, "args": { "External id": 126243, "cbid": 211, "correlation": 126243 } }, { "ph": "s", "id": 126243, "pid": 76337, "tid": -914061504, "ts": 1716454223383866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223383924, "dur": 0, "args": { "External id": 126254, "cbid": 317, "correlation": 126254 } }, { "ph": "f", "id": 126254, "pid": 76337, "tid": -914061504, "ts": 1716454223383924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223383924, "dur": 0, "args": { "External id": 126255, "cbid": 203, "correlation": 126255 } }, { "ph": "f", "id": 126255, "pid": 76337, "tid": -914061504, "ts": 1716454223383924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223383925, "dur": 0, "args": { "External id": 126256, "cbid": 205, "correlation": 126256 } }, { "ph": "f", "id": 126256, "pid": 76337, "tid": -914061504, "ts": 1716454223383925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383961, "dur": 2, "args": { "External id": 126260, "cbid": 251, "correlation": 126260 } }, { "ph": "f", "id": 126260, "pid": 76337, "tid": -914061504, "ts": 1716454223383961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383964, "dur": 1, "args": { "External id": 126261, "cbid": 251, "correlation": 126261 } }, { "ph": "f", "id": 126261, "pid": 76337, "tid": -914061504, "ts": 1716454223383964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383966, "dur": 1, "args": { "External id": 126262, "cbid": 251, "correlation": 126262 } }, { "ph": "f", "id": 126262, "pid": 76337, "tid": -914061504, "ts": 1716454223383966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383969, "dur": 1, "args": { "External id": 126263, "cbid": 251, "correlation": 126263 } }, { "ph": "f", "id": 126263, "pid": 76337, "tid": -914061504, "ts": 1716454223383969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383971, "dur": 1, "args": { "External id": 126264, "cbid": 251, "correlation": 126264 } }, { "ph": "f", "id": 126264, "pid": 76337, "tid": -914061504, "ts": 1716454223383971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383982, "dur": 1, "args": { "External id": 126265, "cbid": 251, "correlation": 126265 } }, { "ph": "f", "id": 126265, "pid": 76337, "tid": -914061504, "ts": 1716454223383982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383983, "dur": 1, "args": { "External id": 126266, "cbid": 251, "correlation": 126266 } }, { "ph": "f", "id": 126266, "pid": 76337, "tid": -914061504, "ts": 1716454223383983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383985, "dur": 1, "args": { "External id": 126267, "cbid": 251, "correlation": 126267 } }, { "ph": "f", "id": 126267, "pid": 76337, "tid": -914061504, "ts": 1716454223383985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223383988, "dur": 0, "args": { "External id": 126268, "cbid": 251, "correlation": 126268 } }, { "ph": "f", "id": 126268, "pid": 76337, "tid": -914061504, "ts": 1716454223383988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223385206, "dur": 113, "args": { "External id": 126269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126269, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126269, "pid": 5, "tid": 7, "ts": 1716454223385206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223383992, "dur": 15, "args": { "External id": 126269, "cbid": 211, "correlation": 126269 } }, { "ph": "s", "id": 126269, "pid": 76337, "tid": -914061504, "ts": 1716454223383992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223385321, "dur": 60, "args": { "External id": 126275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126275, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126275, "pid": 5, "tid": 7, "ts": 1716454223385321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384031, "dur": 9, "args": { "External id": 126275, "cbid": 211, "correlation": 126275 } }, { "ph": "s", "id": 126275, "pid": 76337, "tid": -914061504, "ts": 1716454223384031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223385382, "dur": 623, "args": { "External id": 126284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126284, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126284, "pid": 5, "tid": 7, "ts": 1716454223385382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384124, "dur": 17, "args": { "External id": 126284, "cbid": 211, "correlation": 126284 } }, { "ph": "s", "id": 126284, "pid": 76337, "tid": -914061504, "ts": 1716454223384124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223386007, "dur": 181, "args": { "External id": 126306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126306, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126306, "pid": 5, "tid": 7, "ts": 1716454223386007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384197, "dur": 11, "args": { "External id": 126306, "cbid": 211, "correlation": 126306 } }, { "ph": "s", "id": 126306, "pid": 76337, "tid": -914061504, "ts": 1716454223384197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223384314, "dur": 2, "args": { "External id": 126317, "cbid": 251, "correlation": 126317 } }, { "ph": "f", "id": 126317, "pid": 76337, "tid": -914061504, "ts": 1716454223384314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223386189, "dur": 196, "args": { "External id": 126318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126318, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126318, "pid": 5, "tid": 7, "ts": 1716454223386189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384321, "dur": 14, "args": { "External id": 126318, "cbid": 211, "correlation": 126318 } }, { "ph": "s", "id": 126318, "pid": 76337, "tid": -914061504, "ts": 1716454223384321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223384394, "dur": 1, "args": { "External id": 126329, "cbid": 251, "correlation": 126329 } }, { "ph": "f", "id": 126329, "pid": 76337, "tid": -914061504, "ts": 1716454223384394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223386386, "dur": 188, "args": { "External id": 126330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126330, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126330, "pid": 5, "tid": 7, "ts": 1716454223386386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384399, "dur": 12, "args": { "External id": 126330, "cbid": 211, "correlation": 126330 } }, { "ph": "s", "id": 126330, "pid": 76337, "tid": -914061504, "ts": 1716454223384399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223384465, "dur": 1, "args": { "External id": 126341, "cbid": 251, "correlation": 126341 } }, { "ph": "f", "id": 126341, "pid": 76337, "tid": -914061504, "ts": 1716454223384465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223386576, "dur": 186, "args": { "External id": 126342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126342, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126342, "pid": 5, "tid": 7, "ts": 1716454223386576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384469, "dur": 11, "args": { "External id": 126342, "cbid": 211, "correlation": 126342 } }, { "ph": "s", "id": 126342, "pid": 76337, "tid": -914061504, "ts": 1716454223384469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223386763, "dur": 18594, "args": { "External id": 126363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126363, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126363, "pid": 5, "tid": 7, "ts": 1716454223386763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384578, "dur": 14, "args": { "External id": 126363, "cbid": 211, "correlation": 126363 } }, { "ph": "s", "id": 126363, "pid": 76337, "tid": -914061504, "ts": 1716454223384578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223384690, "dur": 2, "args": { "External id": 126381, "cbid": 251, "correlation": 126381 } }, { "ph": "f", "id": 126381, "pid": 76337, "tid": -914061504, "ts": 1716454223384690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223405358, "dur": 203, "args": { "External id": 126383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126383, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126383, "pid": 5, "tid": 7, "ts": 1716454223405358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384697, "dur": 14, "args": { "External id": 126383, "cbid": 211, "correlation": 126383 } }, { "ph": "s", "id": 126383, "pid": 76337, "tid": -914061504, "ts": 1716454223384697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223405562, "dur": 66, "args": { "External id": 126391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126391, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126391, "pid": 5, "tid": 7, "ts": 1716454223405562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384770, "dur": 12, "args": { "External id": 126391, "cbid": 211, "correlation": 126391 } }, { "ph": "s", "id": 126391, "pid": 76337, "tid": -914061504, "ts": 1716454223384770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223405630, "dur": 98, "args": { "External id": 126399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126399, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126399, "pid": 5, "tid": 7, "ts": 1716454223405630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384811, "dur": 8, "args": { "External id": 126399, "cbid": 211, "correlation": 126399 } }, { "ph": "s", "id": 126399, "pid": 76337, "tid": -914061504, "ts": 1716454223384811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223405729, "dur": 53, "args": { "External id": 126410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126410, "pid": 5, "tid": 7, "ts": 1716454223405729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384895, "dur": 15, "args": { "External id": 126410, "cbid": 211, "correlation": 126410 } }, { "ph": "s", "id": 126410, "pid": 76337, "tid": -914061504, "ts": 1716454223384895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223405784, "dur": 92, "args": { "External id": 126432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126432, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126432, "pid": 5, "tid": 7, "ts": 1716454223405784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223384928, "dur": 8, "args": { "External id": 126432, "cbid": 211, "correlation": 126432 } }, { "ph": "s", "id": 126432, "pid": 76337, "tid": -914061504, "ts": 1716454223384928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385020, "dur": 1, "args": { "External id": 126443, "cbid": 251, "correlation": 126443 } }, { "ph": "f", "id": 126443, "pid": 76337, "tid": -914061504, "ts": 1716454223385020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223405877, "dur": 102, "args": { "External id": 126444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126444, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126444, "pid": 5, "tid": 7, "ts": 1716454223405877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385026, "dur": 13, "args": { "External id": 126444, "cbid": 211, "correlation": 126444 } }, { "ph": "s", "id": 126444, "pid": 76337, "tid": -914061504, "ts": 1716454223385026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385112, "dur": 1, "args": { "External id": 126455, "cbid": 251, "correlation": 126455 } }, { "ph": "f", "id": 126455, "pid": 76337, "tid": -914061504, "ts": 1716454223385112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385116, "dur": 0, "args": { "External id": 126456, "cbid": 251, "correlation": 126456 } }, { "ph": "f", "id": 126456, "pid": 76337, "tid": -914061504, "ts": 1716454223385116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223405980, "dur": 11, "args": { "External id": 126457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126457, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 126457, "pid": 5, "tid": 7, "ts": 1716454223405980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385118, "dur": 13, "args": { "External id": 126457, "cbid": 211, "correlation": 126457 } }, { "ph": "s", "id": 126457, "pid": 76337, "tid": -914061504, "ts": 1716454223385118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223405992, "dur": 5, "args": { "External id": 126459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126459, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 126459, "pid": 5, "tid": 7, "ts": 1716454223405992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385134, "dur": 8, "args": { "External id": 126459, "cbid": 211, "correlation": 126459 } }, { "ph": "s", "id": 126459, "pid": 76337, "tid": -914061504, "ts": 1716454223385134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385198, "dur": 1, "args": { "External id": 126470, "cbid": 251, "correlation": 126470 } }, { "ph": "f", "id": 126470, "pid": 76337, "tid": -914061504, "ts": 1716454223385198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385201, "dur": 0, "args": { "External id": 126471, "cbid": 251, "correlation": 126471 } }, { "ph": "f", "id": 126471, "pid": 76337, "tid": -914061504, "ts": 1716454223385201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223405998, "dur": 6, "args": { "External id": 126472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126472, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 126472, "pid": 5, "tid": 7, "ts": 1716454223405998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385203, "dur": 12, "args": { "External id": 126472, "cbid": 211, "correlation": 126472 } }, { "ph": "s", "id": 126472, "pid": 76337, "tid": -914061504, "ts": 1716454223385203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223406006, "dur": 3, "args": { "External id": 126474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126474, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 126474, "pid": 5, "tid": 7, "ts": 1716454223406006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385216, "dur": 5, "args": { "External id": 126474, "cbid": 211, "correlation": 126474 } }, { "ph": "s", "id": 126474, "pid": 76337, "tid": -914061504, "ts": 1716454223385216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223406010, "dur": 156, "args": { "External id": 126495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126495, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126495, "pid": 5, "tid": 7, "ts": 1716454223406010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385289, "dur": 12, "args": { "External id": 126495, "cbid": 211, "correlation": 126495 } }, { "ph": "s", "id": 126495, "pid": 76337, "tid": -914061504, "ts": 1716454223385289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385386, "dur": 2, "args": { "External id": 126513, "cbid": 251, "correlation": 126513 } }, { "ph": "f", "id": 126513, "pid": 76337, "tid": -914061504, "ts": 1716454223385386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223406167, "dur": 105, "args": { "External id": 126515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126515, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126515, "pid": 5, "tid": 7, "ts": 1716454223406167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385394, "dur": 13, "args": { "External id": 126515, "cbid": 211, "correlation": 126515 } }, { "ph": "s", "id": 126515, "pid": 76337, "tid": -914061504, "ts": 1716454223385394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223406274, "dur": 35, "args": { "External id": 126523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126523, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126523, "pid": 5, "tid": 7, "ts": 1716454223406274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385464, "dur": 12, "args": { "External id": 126523, "cbid": 211, "correlation": 126523 } }, { "ph": "s", "id": 126523, "pid": 76337, "tid": -914061504, "ts": 1716454223385464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223406310, "dur": 66, "args": { "External id": 126531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126531, "pid": 5, "tid": 7, "ts": 1716454223406310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385504, "dur": 9, "args": { "External id": 126531, "cbid": 211, "correlation": 126531 } }, { "ph": "s", "id": 126531, "pid": 76337, "tid": -914061504, "ts": 1716454223385504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223406378, "dur": 92, "args": { "External id": 126553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126553, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126553, "pid": 5, "tid": 7, "ts": 1716454223406378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385555, "dur": 10, "args": { "External id": 126553, "cbid": 211, "correlation": 126553 } }, { "ph": "s", "id": 126553, "pid": 76337, "tid": -914061504, "ts": 1716454223385555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385645, "dur": 1, "args": { "External id": 126569, "cbid": 251, "correlation": 126569 } }, { "ph": "f", "id": 126569, "pid": 76337, "tid": -914061504, "ts": 1716454223385645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223406471, "dur": 576, "args": { "External id": 126571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126571, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126571, "pid": 5, "tid": 7, "ts": 1716454223406471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385651, "dur": 12, "args": { "External id": 126571, "cbid": 211, "correlation": 126571 } }, { "ph": "s", "id": 126571, "pid": 76337, "tid": -914061504, "ts": 1716454223385651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223407049, "dur": 246, "args": { "External id": 126579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126579, "pid": 5, "tid": 7, "ts": 1716454223407049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385728, "dur": 15, "args": { "External id": 126579, "cbid": 211, "correlation": 126579 } }, { "ph": "s", "id": 126579, "pid": 76337, "tid": -914061504, "ts": 1716454223385728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223407296, "dur": 252, "args": { "External id": 126587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126587, "pid": 5, "tid": 7, "ts": 1716454223407296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385763, "dur": 9, "args": { "External id": 126587, "cbid": 211, "correlation": 126587 } }, { "ph": "s", "id": 126587, "pid": 76337, "tid": -914061504, "ts": 1716454223385763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385849, "dur": 2, "args": { "External id": 126603, "cbid": 251, "correlation": 126603 } }, { "ph": "f", "id": 126603, "pid": 76337, "tid": -914061504, "ts": 1716454223385849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223385854, "dur": 0, "args": { "External id": 126605, "cbid": 251, "correlation": 126605 } }, { "ph": "f", "id": 126605, "pid": 76337, "tid": -914061504, "ts": 1716454223385854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223407550, "dur": 362, "args": { "External id": 126606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126606, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 126606, "pid": 5, "tid": 7, "ts": 1716454223407550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385860, "dur": 13, "args": { "External id": 126606, "cbid": 211, "correlation": 126606 } }, { "ph": "s", "id": 126606, "pid": 76337, "tid": -914061504, "ts": 1716454223385860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223407913, "dur": 50, "args": { "External id": 126614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126614, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126614, "pid": 5, "tid": 7, "ts": 1716454223407913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385903, "dur": 9, "args": { "External id": 126614, "cbid": 211, "correlation": 126614 } }, { "ph": "s", "id": 126614, "pid": 76337, "tid": -914061504, "ts": 1716454223385903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223407964, "dur": 158, "args": { "External id": 126625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126625, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126625, "pid": 5, "tid": 7, "ts": 1716454223407964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223385981, "dur": 14, "args": { "External id": 126625, "cbid": 211, "correlation": 126625 } }, { "ph": "s", "id": 126625, "pid": 76337, "tid": -914061504, "ts": 1716454223385981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223386049, "dur": 0, "args": { "External id": 126637, "cbid": 317, "correlation": 126637 } }, { "ph": "f", "id": 126637, "pid": 76337, "tid": -914061504, "ts": 1716454223386049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223386050, "dur": 0, "args": { "External id": 126638, "cbid": 203, "correlation": 126638 } }, { "ph": "f", "id": 126638, "pid": 76337, "tid": -914061504, "ts": 1716454223386050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223386051, "dur": 0, "args": { "External id": 126639, "cbid": 205, "correlation": 126639 } }, { "ph": "f", "id": 126639, "pid": 76337, "tid": -914061504, "ts": 1716454223386051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386075, "dur": 1, "args": { "External id": 126643, "cbid": 251, "correlation": 126643 } }, { "ph": "f", "id": 126643, "pid": 76337, "tid": -914061504, "ts": 1716454223386075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386077, "dur": 0, "args": { "External id": 126644, "cbid": 251, "correlation": 126644 } }, { "ph": "f", "id": 126644, "pid": 76337, "tid": -914061504, "ts": 1716454223386077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386078, "dur": 0, "args": { "External id": 126645, "cbid": 251, "correlation": 126645 } }, { "ph": "f", "id": 126645, "pid": 76337, "tid": -914061504, "ts": 1716454223386078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386078, "dur": 0, "args": { "External id": 126646, "cbid": 251, "correlation": 126646 } }, { "ph": "f", "id": 126646, "pid": 76337, "tid": -914061504, "ts": 1716454223386078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386079, "dur": 0, "args": { "External id": 126647, "cbid": 251, "correlation": 126647 } }, { "ph": "f", "id": 126647, "pid": 76337, "tid": -914061504, "ts": 1716454223386079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386080, "dur": 0, "args": { "External id": 126648, "cbid": 251, "correlation": 126648 } }, { "ph": "f", "id": 126648, "pid": 76337, "tid": -914061504, "ts": 1716454223386080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386081, "dur": 0, "args": { "External id": 126649, "cbid": 251, "correlation": 126649 } }, { "ph": "f", "id": 126649, "pid": 76337, "tid": -914061504, "ts": 1716454223386081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386081, "dur": 0, "args": { "External id": 126650, "cbid": 251, "correlation": 126650 } }, { "ph": "f", "id": 126650, "pid": 76337, "tid": -914061504, "ts": 1716454223386081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386083, "dur": 0, "args": { "External id": 126651, "cbid": 251, "correlation": 126651 } }, { "ph": "f", "id": 126651, "pid": 76337, "tid": -914061504, "ts": 1716454223386083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223408124, "dur": 114, "args": { "External id": 126652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126652, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126652, "pid": 5, "tid": 7, "ts": 1716454223408124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386085, "dur": 13, "args": { "External id": 126652, "cbid": 211, "correlation": 126652 } }, { "ph": "s", "id": 126652, "pid": 76337, "tid": -914061504, "ts": 1716454223386085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223408239, "dur": 59, "args": { "External id": 126658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126658, "pid": 5, "tid": 7, "ts": 1716454223408239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386120, "dur": 9, "args": { "External id": 126658, "cbid": 211, "correlation": 126658 } }, { "ph": "s", "id": 126658, "pid": 76337, "tid": -914061504, "ts": 1716454223386120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223408299, "dur": 50, "args": { "External id": 126666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126666, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126666, "pid": 5, "tid": 7, "ts": 1716454223408299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386153, "dur": 8, "args": { "External id": 126666, "cbid": 211, "correlation": 126666 } }, { "ph": "s", "id": 126666, "pid": 76337, "tid": -914061504, "ts": 1716454223386153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223408351, "dur": 52, "args": { "External id": 126686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126686, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126686, "pid": 5, "tid": 7, "ts": 1716454223408351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386227, "dur": 12, "args": { "External id": 126686, "cbid": 211, "correlation": 126686 } }, { "ph": "s", "id": 126686, "pid": 76337, "tid": -914061504, "ts": 1716454223386227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223408404, "dur": 5, "args": { "External id": 126698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126698, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126698, "pid": 5, "tid": 7, "ts": 1716454223408404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386250, "dur": 7, "args": { "External id": 126698, "cbid": 211, "correlation": 126698 } }, { "ph": "s", "id": 126698, "pid": 76337, "tid": -914061504, "ts": 1716454223386250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223408410, "dur": 56, "args": { "External id": 126701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126701, "pid": 5, "tid": 7, "ts": 1716454223408410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386268, "dur": 7, "args": { "External id": 126701, "cbid": 211, "correlation": 126701 } }, { "ph": "s", "id": 126701, "pid": 76337, "tid": -914061504, "ts": 1716454223386268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223408467, "dur": 36, "args": { "External id": 126710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126710, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126710, "pid": 5, "tid": 7, "ts": 1716454223408467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386308, "dur": 10, "args": { "External id": 126710, "cbid": 211, "correlation": 126710 } }, { "ph": "s", "id": 126710, "pid": 76337, "tid": -914061504, "ts": 1716454223386308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223386359, "dur": 0, "args": { "External id": 126720, "cbid": 317, "correlation": 126720 } }, { "ph": "f", "id": 126720, "pid": 76337, "tid": -914061504, "ts": 1716454223386359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223386360, "dur": 0, "args": { "External id": 126721, "cbid": 203, "correlation": 126721 } }, { "ph": "f", "id": 126721, "pid": 76337, "tid": -914061504, "ts": 1716454223386360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223386361, "dur": 0, "args": { "External id": 126722, "cbid": 205, "correlation": 126722 } }, { "ph": "f", "id": 126722, "pid": 76337, "tid": -914061504, "ts": 1716454223386361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223408505, "dur": 41, "args": { "External id": 126726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126726, "pid": 5, "tid": 7, "ts": 1716454223408505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386375, "dur": 11, "args": { "External id": 126726, "cbid": 211, "correlation": 126726 } }, { "ph": "s", "id": 126726, "pid": 76337, "tid": -914061504, "ts": 1716454223386375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223408548, "dur": 14, "args": { "External id": 126728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126728, "pid": 5, "tid": 7, "ts": 1716454223408548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386389, "dur": 6, "args": { "External id": 126728, "cbid": 211, "correlation": 126728 } }, { "ph": "s", "id": 126728, "pid": 76337, "tid": -914061504, "ts": 1716454223386389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223408564, "dur": 4, "args": { "External id": 126730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126730, "pid": 5, "tid": 7, "ts": 1716454223408564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386400, "dur": 5, "args": { "External id": 126730, "cbid": 211, "correlation": 126730 } }, { "ph": "s", "id": 126730, "pid": 76337, "tid": -914061504, "ts": 1716454223386400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223386408, "dur": 0, "args": { "External id": 126731, "cbid": 51, "correlation": 126731 } }, { "ph": "s", "id": 126731, "pid": 76337, "tid": -914061504, "ts": 1716454223386408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223408569, "dur": 699, "args": { "External id": 126732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126732, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126732, "pid": 5, "tid": 7, "ts": 1716454223408569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386409, "dur": 6, "args": { "External id": 126732, "cbid": 211, "correlation": 126732 } }, { "ph": "s", "id": 126732, "pid": 76337, "tid": -914061504, "ts": 1716454223386409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223409270, "dur": 59, "args": { "External id": 126737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126737, "pid": 5, "tid": 7, "ts": 1716454223409270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386436, "dur": 9, "args": { "External id": 126737, "cbid": 211, "correlation": 126737 } }, { "ph": "s", "id": 126737, "pid": 76337, "tid": -914061504, "ts": 1716454223386436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223409330, "dur": 4, "args": { "External id": 126745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126745, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126745, "pid": 5, "tid": 7, "ts": 1716454223409330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386480, "dur": 9, "args": { "External id": 126745, "cbid": 211, "correlation": 126745 } }, { "ph": "s", "id": 126745, "pid": 76337, "tid": -914061504, "ts": 1716454223386480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386546, "dur": 1, "args": { "External id": 126761, "cbid": 251, "correlation": 126761 } }, { "ph": "f", "id": 126761, "pid": 76337, "tid": -914061504, "ts": 1716454223386546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223386551, "dur": 0, "args": { "External id": 126763, "cbid": 251, "correlation": 126763 } }, { "ph": "f", "id": 126763, "pid": 76337, "tid": -914061504, "ts": 1716454223386551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223409335, "dur": 11, "args": { "External id": 126764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126764, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 126764, "pid": 5, "tid": 7, "ts": 1716454223409335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386553, "dur": 11, "args": { "External id": 126764, "cbid": 211, "correlation": 126764 } }, { "ph": "s", "id": 126764, "pid": 76337, "tid": -914061504, "ts": 1716454223386553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223409347, "dur": 5, "args": { "External id": 126766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126766, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 126766, "pid": 5, "tid": 7, "ts": 1716454223409347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386566, "dur": 5, "args": { "External id": 126766, "cbid": 211, "correlation": 126766 } }, { "ph": "s", "id": 126766, "pid": 76337, "tid": -914061504, "ts": 1716454223386566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223409354, "dur": 53, "args": { "External id": 126776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126776, "pid": 5, "tid": 7, "ts": 1716454223409354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386622, "dur": 12, "args": { "External id": 126776, "cbid": 211, "correlation": 126776 } }, { "ph": "s", "id": 126776, "pid": 76337, "tid": -914061504, "ts": 1716454223386622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223409409, "dur": 50, "args": { "External id": 126796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126796, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126796, "pid": 5, "tid": 7, "ts": 1716454223409409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386689, "dur": 11, "args": { "External id": 126796, "cbid": 211, "correlation": 126796 } }, { "ph": "s", "id": 126796, "pid": 76337, "tid": -914061504, "ts": 1716454223386689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223409460, "dur": 4, "args": { "External id": 126808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126808, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126808, "pid": 5, "tid": 7, "ts": 1716454223409460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386710, "dur": 6, "args": { "External id": 126808, "cbid": 211, "correlation": 126808 } }, { "ph": "s", "id": 126808, "pid": 76337, "tid": -914061504, "ts": 1716454223386710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223409465, "dur": 55, "args": { "External id": 126811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126811, "pid": 5, "tid": 7, "ts": 1716454223409465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386728, "dur": 6, "args": { "External id": 126811, "cbid": 211, "correlation": 126811 } }, { "ph": "s", "id": 126811, "pid": 76337, "tid": -914061504, "ts": 1716454223386728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223409522, "dur": 36, "args": { "External id": 126820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126820, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126820, "pid": 5, "tid": 7, "ts": 1716454223409522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386769, "dur": 9, "args": { "External id": 126820, "cbid": 211, "correlation": 126820 } }, { "ph": "s", "id": 126820, "pid": 76337, "tid": -914061504, "ts": 1716454223386769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223386831, "dur": 0, "args": { "External id": 126830, "cbid": 317, "correlation": 126830 } }, { "ph": "f", "id": 126830, "pid": 76337, "tid": -914061504, "ts": 1716454223386831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223386832, "dur": 0, "args": { "External id": 126831, "cbid": 203, "correlation": 126831 } }, { "ph": "f", "id": 126831, "pid": 76337, "tid": -914061504, "ts": 1716454223386832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223386833, "dur": 0, "args": { "External id": 126832, "cbid": 205, "correlation": 126832 } }, { "ph": "f", "id": 126832, "pid": 76337, "tid": -914061504, "ts": 1716454223386833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223409560, "dur": 41, "args": { "External id": 126836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126836, "pid": 5, "tid": 7, "ts": 1716454223409560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386846, "dur": 13, "args": { "External id": 126836, "cbid": 211, "correlation": 126836 } }, { "ph": "s", "id": 126836, "pid": 76337, "tid": -914061504, "ts": 1716454223386846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223409602, "dur": 14, "args": { "External id": 126838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126838, "pid": 5, "tid": 7, "ts": 1716454223409602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386861, "dur": 5, "args": { "External id": 126838, "cbid": 211, "correlation": 126838 } }, { "ph": "s", "id": 126838, "pid": 76337, "tid": -914061504, "ts": 1716454223386861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223409617, "dur": 3, "args": { "External id": 126840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126840, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126840, "pid": 5, "tid": 7, "ts": 1716454223409617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386870, "dur": 5, "args": { "External id": 126840, "cbid": 211, "correlation": 126840 } }, { "ph": "s", "id": 126840, "pid": 76337, "tid": -914061504, "ts": 1716454223386870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223386878, "dur": 0, "args": { "External id": 126841, "cbid": 51, "correlation": 126841 } }, { "ph": "s", "id": 126841, "pid": 76337, "tid": -914061504, "ts": 1716454223386878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223409622, "dur": 692, "args": { "External id": 126842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126842, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126842, "pid": 5, "tid": 7, "ts": 1716454223409622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386879, "dur": 5, "args": { "External id": 126842, "cbid": 211, "correlation": 126842 } }, { "ph": "s", "id": 126842, "pid": 76337, "tid": -914061504, "ts": 1716454223386879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223410315, "dur": 59, "args": { "External id": 126847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126847, "pid": 5, "tid": 7, "ts": 1716454223410315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386907, "dur": 8, "args": { "External id": 126847, "cbid": 211, "correlation": 126847 } }, { "ph": "s", "id": 126847, "pid": 76337, "tid": -914061504, "ts": 1716454223386907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223410375, "dur": 50, "args": { "External id": 126855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126855, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126855, "pid": 5, "tid": 7, "ts": 1716454223410375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386938, "dur": 9, "args": { "External id": 126855, "cbid": 211, "correlation": 126855 } }, { "ph": "s", "id": 126855, "pid": 76337, "tid": -914061504, "ts": 1716454223386938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223410427, "dur": 35, "args": { "External id": 126863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126863, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126863, "pid": 5, "tid": 7, "ts": 1716454223410427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223386968, "dur": 16, "args": { "External id": 126863, "cbid": 211, "correlation": 126863 } }, { "ph": "s", "id": 126863, "pid": 76337, "tid": -914061504, "ts": 1716454223386968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223410463, "dur": 52, "args": { "External id": 126883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126883, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 126883, "pid": 5, "tid": 7, "ts": 1716454223410463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387058, "dur": 12, "args": { "External id": 126883, "cbid": 211, "correlation": 126883 } }, { "ph": "s", "id": 126883, "pid": 76337, "tid": -914061504, "ts": 1716454223387058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223410516, "dur": 4, "args": { "External id": 126895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126895, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 126895, "pid": 5, "tid": 7, "ts": 1716454223410516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387079, "dur": 6, "args": { "External id": 126895, "cbid": 211, "correlation": 126895 } }, { "ph": "s", "id": 126895, "pid": 76337, "tid": -914061504, "ts": 1716454223387079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223410521, "dur": 55, "args": { "External id": 126898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126898, "pid": 5, "tid": 7, "ts": 1716454223410521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387097, "dur": 7, "args": { "External id": 126898, "cbid": 211, "correlation": 126898 } }, { "ph": "s", "id": 126898, "pid": 76337, "tid": -914061504, "ts": 1716454223387097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223387155, "dur": 0, "args": { "External id": 126909, "cbid": 317, "correlation": 126909 } }, { "ph": "f", "id": 126909, "pid": 76337, "tid": -914061504, "ts": 1716454223387155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223387155, "dur": 0, "args": { "External id": 126910, "cbid": 203, "correlation": 126910 } }, { "ph": "f", "id": 126910, "pid": 76337, "tid": -914061504, "ts": 1716454223387155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223387156, "dur": 0, "args": { "External id": 126911, "cbid": 205, "correlation": 126911 } }, { "ph": "f", "id": 126911, "pid": 76337, "tid": -914061504, "ts": 1716454223387156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387178, "dur": 1, "args": { "External id": 126915, "cbid": 251, "correlation": 126915 } }, { "ph": "f", "id": 126915, "pid": 76337, "tid": -914061504, "ts": 1716454223387178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387180, "dur": 0, "args": { "External id": 126916, "cbid": 251, "correlation": 126916 } }, { "ph": "f", "id": 126916, "pid": 76337, "tid": -914061504, "ts": 1716454223387180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387181, "dur": 0, "args": { "External id": 126917, "cbid": 251, "correlation": 126917 } }, { "ph": "f", "id": 126917, "pid": 76337, "tid": -914061504, "ts": 1716454223387181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387181, "dur": 0, "args": { "External id": 126918, "cbid": 251, "correlation": 126918 } }, { "ph": "f", "id": 126918, "pid": 76337, "tid": -914061504, "ts": 1716454223387181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387182, "dur": 0, "args": { "External id": 126919, "cbid": 251, "correlation": 126919 } }, { "ph": "f", "id": 126919, "pid": 76337, "tid": -914061504, "ts": 1716454223387182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387183, "dur": 0, "args": { "External id": 126920, "cbid": 251, "correlation": 126920 } }, { "ph": "f", "id": 126920, "pid": 76337, "tid": -914061504, "ts": 1716454223387183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387183, "dur": 0, "args": { "External id": 126921, "cbid": 251, "correlation": 126921 } }, { "ph": "f", "id": 126921, "pid": 76337, "tid": -914061504, "ts": 1716454223387183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387184, "dur": 0, "args": { "External id": 126922, "cbid": 251, "correlation": 126922 } }, { "ph": "f", "id": 126922, "pid": 76337, "tid": -914061504, "ts": 1716454223387184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387185, "dur": 0, "args": { "External id": 126923, "cbid": 251, "correlation": 126923 } }, { "ph": "f", "id": 126923, "pid": 76337, "tid": -914061504, "ts": 1716454223387185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223410577, "dur": 112, "args": { "External id": 126924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126924, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 126924, "pid": 5, "tid": 7, "ts": 1716454223410577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387187, "dur": 13, "args": { "External id": 126924, "cbid": 211, "correlation": 126924 } }, { "ph": "s", "id": 126924, "pid": 76337, "tid": -914061504, "ts": 1716454223387187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223410691, "dur": 60, "args": { "External id": 126930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126930, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126930, "pid": 5, "tid": 7, "ts": 1716454223410691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387223, "dur": 9, "args": { "External id": 126930, "cbid": 211, "correlation": 126930 } }, { "ph": "s", "id": 126930, "pid": 76337, "tid": -914061504, "ts": 1716454223387223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223410751, "dur": 659, "args": { "External id": 126939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126939, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126939, "pid": 5, "tid": 7, "ts": 1716454223410751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387307, "dur": 14, "args": { "External id": 126939, "cbid": 211, "correlation": 126939 } }, { "ph": "s", "id": 126939, "pid": 76337, "tid": -914061504, "ts": 1716454223387307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223411412, "dur": 180, "args": { "External id": 126961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126961, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 126961, "pid": 5, "tid": 7, "ts": 1716454223411412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387365, "dur": 10, "args": { "External id": 126961, "cbid": 211, "correlation": 126961 } }, { "ph": "s", "id": 126961, "pid": 76337, "tid": -914061504, "ts": 1716454223387365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387451, "dur": 1, "args": { "External id": 126972, "cbid": 251, "correlation": 126972 } }, { "ph": "f", "id": 126972, "pid": 76337, "tid": -914061504, "ts": 1716454223387451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223411593, "dur": 192, "args": { "External id": 126973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126973, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126973, "pid": 5, "tid": 7, "ts": 1716454223411593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387456, "dur": 13, "args": { "External id": 126973, "cbid": 211, "correlation": 126973 } }, { "ph": "s", "id": 126973, "pid": 76337, "tid": -914061504, "ts": 1716454223387456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387524, "dur": 1, "args": { "External id": 126984, "cbid": 251, "correlation": 126984 } }, { "ph": "f", "id": 126984, "pid": 76337, "tid": -914061504, "ts": 1716454223387524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223411787, "dur": 189, "args": { "External id": 126985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126985, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126985, "pid": 5, "tid": 7, "ts": 1716454223411787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387528, "dur": 12, "args": { "External id": 126985, "cbid": 211, "correlation": 126985 } }, { "ph": "s", "id": 126985, "pid": 76337, "tid": -914061504, "ts": 1716454223387528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387590, "dur": 1, "args": { "External id": 126996, "cbid": 251, "correlation": 126996 } }, { "ph": "f", "id": 126996, "pid": 76337, "tid": -914061504, "ts": 1716454223387590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223411977, "dur": 185, "args": { "External id": 126997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 126997, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 126997, "pid": 5, "tid": 7, "ts": 1716454223411977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387594, "dur": 12, "args": { "External id": 126997, "cbid": 211, "correlation": 126997 } }, { "ph": "s", "id": 126997, "pid": 76337, "tid": -914061504, "ts": 1716454223387594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223412164, "dur": 18627, "args": { "External id": 127018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127018, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127018, "pid": 5, "tid": 7, "ts": 1716454223412164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387674, "dur": 12, "args": { "External id": 127018, "cbid": 211, "correlation": 127018 } }, { "ph": "s", "id": 127018, "pid": 76337, "tid": -914061504, "ts": 1716454223387674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223387771, "dur": 1, "args": { "External id": 127036, "cbid": 251, "correlation": 127036 } }, { "ph": "f", "id": 127036, "pid": 76337, "tid": -914061504, "ts": 1716454223387771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223430792, "dur": 199, "args": { "External id": 127038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127038, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127038, "pid": 5, "tid": 7, "ts": 1716454223430792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387777, "dur": 13, "args": { "External id": 127038, "cbid": 211, "correlation": 127038 } }, { "ph": "s", "id": 127038, "pid": 76337, "tid": -914061504, "ts": 1716454223387777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223430993, "dur": 67, "args": { "External id": 127046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127046, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127046, "pid": 5, "tid": 7, "ts": 1716454223430993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387848, "dur": 13, "args": { "External id": 127046, "cbid": 211, "correlation": 127046 } }, { "ph": "s", "id": 127046, "pid": 76337, "tid": -914061504, "ts": 1716454223387848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223431061, "dur": 96, "args": { "External id": 127054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127054, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127054, "pid": 5, "tid": 7, "ts": 1716454223431061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387888, "dur": 8, "args": { "External id": 127054, "cbid": 211, "correlation": 127054 } }, { "ph": "s", "id": 127054, "pid": 76337, "tid": -914061504, "ts": 1716454223387888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223431159, "dur": 54, "args": { "External id": 127065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127065, "pid": 5, "tid": 7, "ts": 1716454223431159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387958, "dur": 13, "args": { "External id": 127065, "cbid": 211, "correlation": 127065 } }, { "ph": "s", "id": 127065, "pid": 76337, "tid": -914061504, "ts": 1716454223387958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223431215, "dur": 91, "args": { "External id": 127087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127087, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127087, "pid": 5, "tid": 7, "ts": 1716454223431215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223387998, "dur": 9, "args": { "External id": 127087, "cbid": 211, "correlation": 127087 } }, { "ph": "s", "id": 127087, "pid": 76337, "tid": -914061504, "ts": 1716454223387998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388084, "dur": 1, "args": { "External id": 127098, "cbid": 251, "correlation": 127098 } }, { "ph": "f", "id": 127098, "pid": 76337, "tid": -914061504, "ts": 1716454223388084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223431308, "dur": 103, "args": { "External id": 127099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127099, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127099, "pid": 5, "tid": 7, "ts": 1716454223431308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388089, "dur": 13, "args": { "External id": 127099, "cbid": 211, "correlation": 127099 } }, { "ph": "s", "id": 127099, "pid": 76337, "tid": -914061504, "ts": 1716454223388089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388161, "dur": 1, "args": { "External id": 127110, "cbid": 251, "correlation": 127110 } }, { "ph": "f", "id": 127110, "pid": 76337, "tid": -914061504, "ts": 1716454223388161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388165, "dur": 0, "args": { "External id": 127111, "cbid": 251, "correlation": 127111 } }, { "ph": "f", "id": 127111, "pid": 76337, "tid": -914061504, "ts": 1716454223388165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223431412, "dur": 10, "args": { "External id": 127112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127112, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 127112, "pid": 5, "tid": 7, "ts": 1716454223431412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388166, "dur": 12, "args": { "External id": 127112, "cbid": 211, "correlation": 127112 } }, { "ph": "s", "id": 127112, "pid": 76337, "tid": -914061504, "ts": 1716454223388166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223431423, "dur": 5, "args": { "External id": 127114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127114, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 127114, "pid": 5, "tid": 7, "ts": 1716454223431423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388180, "dur": 6, "args": { "External id": 127114, "cbid": 211, "correlation": 127114 } }, { "ph": "s", "id": 127114, "pid": 76337, "tid": -914061504, "ts": 1716454223388180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388241, "dur": 1, "args": { "External id": 127125, "cbid": 251, "correlation": 127125 } }, { "ph": "f", "id": 127125, "pid": 76337, "tid": -914061504, "ts": 1716454223388241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388244, "dur": 0, "args": { "External id": 127126, "cbid": 251, "correlation": 127126 } }, { "ph": "f", "id": 127126, "pid": 76337, "tid": -914061504, "ts": 1716454223388244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223431429, "dur": 6, "args": { "External id": 127127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127127, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 127127, "pid": 5, "tid": 7, "ts": 1716454223431429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388245, "dur": 12, "args": { "External id": 127127, "cbid": 211, "correlation": 127127 } }, { "ph": "s", "id": 127127, "pid": 76337, "tid": -914061504, "ts": 1716454223388245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223431437, "dur": 3, "args": { "External id": 127129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127129, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 127129, "pid": 5, "tid": 7, "ts": 1716454223431437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388259, "dur": 5, "args": { "External id": 127129, "cbid": 211, "correlation": 127129 } }, { "ph": "s", "id": 127129, "pid": 76337, "tid": -914061504, "ts": 1716454223388259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223431442, "dur": 152, "args": { "External id": 127150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127150, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127150, "pid": 5, "tid": 7, "ts": 1716454223431442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388332, "dur": 12, "args": { "External id": 127150, "cbid": 211, "correlation": 127150 } }, { "ph": "s", "id": 127150, "pid": 76337, "tid": -914061504, "ts": 1716454223388332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388428, "dur": 1, "args": { "External id": 127168, "cbid": 251, "correlation": 127168 } }, { "ph": "f", "id": 127168, "pid": 76337, "tid": -914061504, "ts": 1716454223388428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223431595, "dur": 107, "args": { "External id": 127170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127170, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127170, "pid": 5, "tid": 7, "ts": 1716454223431595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388434, "dur": 13, "args": { "External id": 127170, "cbid": 211, "correlation": 127170 } }, { "ph": "s", "id": 127170, "pid": 76337, "tid": -914061504, "ts": 1716454223388434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223431704, "dur": 35, "args": { "External id": 127178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127178, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127178, "pid": 5, "tid": 7, "ts": 1716454223431704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388502, "dur": 13, "args": { "External id": 127178, "cbid": 211, "correlation": 127178 } }, { "ph": "s", "id": 127178, "pid": 76337, "tid": -914061504, "ts": 1716454223388502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223431740, "dur": 66, "args": { "External id": 127186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127186, "pid": 5, "tid": 7, "ts": 1716454223431740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388543, "dur": 9, "args": { "External id": 127186, "cbid": 211, "correlation": 127186 } }, { "ph": "s", "id": 127186, "pid": 76337, "tid": -914061504, "ts": 1716454223388543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223431808, "dur": 91, "args": { "External id": 127208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127208, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127208, "pid": 5, "tid": 7, "ts": 1716454223431808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388593, "dur": 10, "args": { "External id": 127208, "cbid": 211, "correlation": 127208 } }, { "ph": "s", "id": 127208, "pid": 76337, "tid": -914061504, "ts": 1716454223388593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388679, "dur": 1, "args": { "External id": 127224, "cbid": 251, "correlation": 127224 } }, { "ph": "f", "id": 127224, "pid": 76337, "tid": -914061504, "ts": 1716454223388679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223431901, "dur": 572, "args": { "External id": 127226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127226, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127226, "pid": 5, "tid": 7, "ts": 1716454223431901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388685, "dur": 12, "args": { "External id": 127226, "cbid": 211, "correlation": 127226 } }, { "ph": "s", "id": 127226, "pid": 76337, "tid": -914061504, "ts": 1716454223388685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223432474, "dur": 244, "args": { "External id": 127234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127234, "pid": 5, "tid": 7, "ts": 1716454223432474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388749, "dur": 13, "args": { "External id": 127234, "cbid": 211, "correlation": 127234 } }, { "ph": "s", "id": 127234, "pid": 76337, "tid": -914061504, "ts": 1716454223388749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223432719, "dur": 250, "args": { "External id": 127242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127242, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127242, "pid": 5, "tid": 7, "ts": 1716454223432719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388780, "dur": 8, "args": { "External id": 127242, "cbid": 211, "correlation": 127242 } }, { "ph": "s", "id": 127242, "pid": 76337, "tid": -914061504, "ts": 1716454223388780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388862, "dur": 1, "args": { "External id": 127258, "cbid": 251, "correlation": 127258 } }, { "ph": "f", "id": 127258, "pid": 76337, "tid": -914061504, "ts": 1716454223388862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223388867, "dur": 0, "args": { "External id": 127260, "cbid": 251, "correlation": 127260 } }, { "ph": "f", "id": 127260, "pid": 76337, "tid": -914061504, "ts": 1716454223388867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223432971, "dur": 359, "args": { "External id": 127261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127261, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 127261, "pid": 5, "tid": 7, "ts": 1716454223432971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388870, "dur": 12, "args": { "External id": 127261, "cbid": 211, "correlation": 127261 } }, { "ph": "s", "id": 127261, "pid": 76337, "tid": -914061504, "ts": 1716454223388870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223433331, "dur": 50, "args": { "External id": 127269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127269, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127269, "pid": 5, "tid": 7, "ts": 1716454223433331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388913, "dur": 10, "args": { "External id": 127269, "cbid": 211, "correlation": 127269 } }, { "ph": "s", "id": 127269, "pid": 76337, "tid": -914061504, "ts": 1716454223388913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223433382, "dur": 157, "args": { "External id": 127280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127280, "pid": 5, "tid": 7, "ts": 1716454223433382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223388987, "dur": 13, "args": { "External id": 127280, "cbid": 211, "correlation": 127280 } }, { "ph": "s", "id": 127280, "pid": 76337, "tid": -914061504, "ts": 1716454223388987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223389052, "dur": 0, "args": { "External id": 127292, "cbid": 317, "correlation": 127292 } }, { "ph": "f", "id": 127292, "pid": 76337, "tid": -914061504, "ts": 1716454223389052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223389053, "dur": 0, "args": { "External id": 127293, "cbid": 203, "correlation": 127293 } }, { "ph": "f", "id": 127293, "pid": 76337, "tid": -914061504, "ts": 1716454223389053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223389054, "dur": 0, "args": { "External id": 127294, "cbid": 205, "correlation": 127294 } }, { "ph": "f", "id": 127294, "pid": 76337, "tid": -914061504, "ts": 1716454223389054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389076, "dur": 1, "args": { "External id": 127298, "cbid": 251, "correlation": 127298 } }, { "ph": "f", "id": 127298, "pid": 76337, "tid": -914061504, "ts": 1716454223389076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389078, "dur": 0, "args": { "External id": 127299, "cbid": 251, "correlation": 127299 } }, { "ph": "f", "id": 127299, "pid": 76337, "tid": -914061504, "ts": 1716454223389078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389079, "dur": 0, "args": { "External id": 127300, "cbid": 251, "correlation": 127300 } }, { "ph": "f", "id": 127300, "pid": 76337, "tid": -914061504, "ts": 1716454223389079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389080, "dur": 0, "args": { "External id": 127301, "cbid": 251, "correlation": 127301 } }, { "ph": "f", "id": 127301, "pid": 76337, "tid": -914061504, "ts": 1716454223389080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389080, "dur": 0, "args": { "External id": 127302, "cbid": 251, "correlation": 127302 } }, { "ph": "f", "id": 127302, "pid": 76337, "tid": -914061504, "ts": 1716454223389080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389081, "dur": 0, "args": { "External id": 127303, "cbid": 251, "correlation": 127303 } }, { "ph": "f", "id": 127303, "pid": 76337, "tid": -914061504, "ts": 1716454223389081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389082, "dur": 0, "args": { "External id": 127304, "cbid": 251, "correlation": 127304 } }, { "ph": "f", "id": 127304, "pid": 76337, "tid": -914061504, "ts": 1716454223389082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389083, "dur": 0, "args": { "External id": 127305, "cbid": 251, "correlation": 127305 } }, { "ph": "f", "id": 127305, "pid": 76337, "tid": -914061504, "ts": 1716454223389083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389084, "dur": 0, "args": { "External id": 127306, "cbid": 251, "correlation": 127306 } }, { "ph": "f", "id": 127306, "pid": 76337, "tid": -914061504, "ts": 1716454223389084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223433540, "dur": 115, "args": { "External id": 127307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127307, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127307, "pid": 5, "tid": 7, "ts": 1716454223433540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389086, "dur": 12, "args": { "External id": 127307, "cbid": 211, "correlation": 127307 } }, { "ph": "s", "id": 127307, "pid": 76337, "tid": -914061504, "ts": 1716454223389086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223433657, "dur": 59, "args": { "External id": 127313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127313, "pid": 5, "tid": 7, "ts": 1716454223433657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389121, "dur": 9, "args": { "External id": 127313, "cbid": 211, "correlation": 127313 } }, { "ph": "s", "id": 127313, "pid": 76337, "tid": -914061504, "ts": 1716454223389121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223433717, "dur": 50, "args": { "External id": 127321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127321, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127321, "pid": 5, "tid": 7, "ts": 1716454223433717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389152, "dur": 9, "args": { "External id": 127321, "cbid": 211, "correlation": 127321 } }, { "ph": "s", "id": 127321, "pid": 76337, "tid": -914061504, "ts": 1716454223389152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223389224, "dur": 0, "args": { "External id": 127331, "cbid": 317, "correlation": 127331 } }, { "ph": "f", "id": 127331, "pid": 76337, "tid": -914061504, "ts": 1716454223389224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223389225, "dur": 0, "args": { "External id": 127332, "cbid": 203, "correlation": 127332 } }, { "ph": "f", "id": 127332, "pid": 76337, "tid": -914061504, "ts": 1716454223389225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223389226, "dur": 0, "args": { "External id": 127333, "cbid": 205, "correlation": 127333 } }, { "ph": "f", "id": 127333, "pid": 76337, "tid": -914061504, "ts": 1716454223389226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223433768, "dur": 40, "args": { "External id": 127337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127337, "pid": 5, "tid": 7, "ts": 1716454223433768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389243, "dur": 13, "args": { "External id": 127337, "cbid": 211, "correlation": 127337 } }, { "ph": "s", "id": 127337, "pid": 76337, "tid": -914061504, "ts": 1716454223389243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223433810, "dur": 14, "args": { "External id": 127339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127339, "pid": 5, "tid": 7, "ts": 1716454223433810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389258, "dur": 5, "args": { "External id": 127339, "cbid": 211, "correlation": 127339 } }, { "ph": "s", "id": 127339, "pid": 76337, "tid": -914061504, "ts": 1716454223389258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223433827, "dur": 1, "args": { "External id": 127341, "device": 5, "context": 1, "stream": 7, "correlation": 127341, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 127341, "pid": 5, "tid": 7, "ts": 1716454223433827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223389277, "dur": 17, "args": { "External id": 127341, "cbid": 51, "correlation": 127341 } }, { "ph": "s", "id": 127341, "pid": 76337, "tid": -914061504, "ts": 1716454223389277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223433830, "dur": 360, "args": { "External id": 127342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127342, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127342, "pid": 5, "tid": 7, "ts": 1716454223433830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389296, "dur": 10, "args": { "External id": 127342, "cbid": 211, "correlation": 127342 } }, { "ph": "s", "id": 127342, "pid": 76337, "tid": -914061504, "ts": 1716454223389296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434192, "dur": 14, "args": { "External id": 127344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127344, "pid": 5, "tid": 7, "ts": 1716454223434192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389314, "dur": 7, "args": { "External id": 127344, "cbid": 211, "correlation": 127344 } }, { "ph": "s", "id": 127344, "pid": 76337, "tid": -914061504, "ts": 1716454223389314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223434206, "dur": 15, "args": { "External id": 127350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127350, "pid": 5, "tid": 7, "ts": 1716454223434206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389343, "dur": 9, "args": { "External id": 127350, "cbid": 211, "correlation": 127350 } }, { "ph": "s", "id": 127350, "pid": 76337, "tid": -914061504, "ts": 1716454223389343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223434223, "dur": 18, "args": { "External id": 127370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127370, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 127370, "pid": 5, "tid": 7, "ts": 1716454223434223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389435, "dur": 12, "args": { "External id": 127370, "cbid": 211, "correlation": 127370 } }, { "ph": "s", "id": 127370, "pid": 76337, "tid": -914061504, "ts": 1716454223389435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223434243, "dur": 4, "args": { "External id": 127382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127382, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 127382, "pid": 5, "tid": 7, "ts": 1716454223434243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389459, "dur": 6, "args": { "External id": 127382, "cbid": 211, "correlation": 127382 } }, { "ph": "s", "id": 127382, "pid": 76337, "tid": -914061504, "ts": 1716454223389459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223434249, "dur": 19, "args": { "External id": 127385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127385, "pid": 5, "tid": 7, "ts": 1716454223434249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389477, "dur": 6, "args": { "External id": 127385, "cbid": 211, "correlation": 127385 } }, { "ph": "s", "id": 127385, "pid": 76337, "tid": -914061504, "ts": 1716454223389477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223434269, "dur": 11, "args": { "External id": 127394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127394, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127394, "pid": 5, "tid": 7, "ts": 1716454223434269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389516, "dur": 9, "args": { "External id": 127394, "cbid": 211, "correlation": 127394 } }, { "ph": "s", "id": 127394, "pid": 76337, "tid": -914061504, "ts": 1716454223389516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223389571, "dur": 0, "args": { "External id": 127404, "cbid": 317, "correlation": 127404 } }, { "ph": "f", "id": 127404, "pid": 76337, "tid": -914061504, "ts": 1716454223389571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223389572, "dur": 0, "args": { "External id": 127405, "cbid": 203, "correlation": 127405 } }, { "ph": "f", "id": 127405, "pid": 76337, "tid": -914061504, "ts": 1716454223389572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223389573, "dur": 0, "args": { "External id": 127406, "cbid": 205, "correlation": 127406 } }, { "ph": "f", "id": 127406, "pid": 76337, "tid": -914061504, "ts": 1716454223389573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434281, "dur": 11, "args": { "External id": 127410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127410, "pid": 5, "tid": 7, "ts": 1716454223434281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389589, "dur": 11, "args": { "External id": 127410, "cbid": 211, "correlation": 127410 } }, { "ph": "s", "id": 127410, "pid": 76337, "tid": -914061504, "ts": 1716454223389589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434293, "dur": 24, "args": { "External id": 127412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127412, "pid": 5, "tid": 7, "ts": 1716454223434293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389603, "dur": 6, "args": { "External id": 127412, "cbid": 211, "correlation": 127412 } }, { "ph": "s", "id": 127412, "pid": 76337, "tid": -914061504, "ts": 1716454223389603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223434319, "dur": 4, "args": { "External id": 127414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127414, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 127414, "pid": 5, "tid": 7, "ts": 1716454223434319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389614, "dur": 5, "args": { "External id": 127414, "cbid": 211, "correlation": 127414 } }, { "ph": "s", "id": 127414, "pid": 76337, "tid": -914061504, "ts": 1716454223389614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223389623, "dur": 0, "args": { "External id": 127415, "cbid": 51, "correlation": 127415 } }, { "ph": "s", "id": 127415, "pid": 76337, "tid": -914061504, "ts": 1716454223389623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223434324, "dur": 355, "args": { "External id": 127416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127416, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127416, "pid": 5, "tid": 7, "ts": 1716454223434324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389624, "dur": 7, "args": { "External id": 127416, "cbid": 211, "correlation": 127416 } }, { "ph": "s", "id": 127416, "pid": 76337, "tid": -914061504, "ts": 1716454223389624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434680, "dur": 20, "args": { "External id": 127417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127417, "pid": 5, "tid": 7, "ts": 1716454223434680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389634, "dur": 5, "args": { "External id": 127417, "cbid": 211, "correlation": 127417 } }, { "ph": "s", "id": 127417, "pid": 76337, "tid": -914061504, "ts": 1716454223389634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223434701, "dur": 33, "args": { "External id": 127423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127423, "pid": 5, "tid": 7, "ts": 1716454223434701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389662, "dur": 8, "args": { "External id": 127423, "cbid": 211, "correlation": 127423 } }, { "ph": "s", "id": 127423, "pid": 76337, "tid": -914061504, "ts": 1716454223389662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223434735, "dur": 3, "args": { "External id": 127431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127431, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 127431, "pid": 5, "tid": 7, "ts": 1716454223434735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389704, "dur": 9, "args": { "External id": 127431, "cbid": 211, "correlation": 127431 } }, { "ph": "s", "id": 127431, "pid": 76337, "tid": -914061504, "ts": 1716454223389704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389768, "dur": 1, "args": { "External id": 127447, "cbid": 251, "correlation": 127447 } }, { "ph": "f", "id": 127447, "pid": 76337, "tid": -914061504, "ts": 1716454223389768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223389773, "dur": 0, "args": { "External id": 127449, "cbid": 251, "correlation": 127449 } }, { "ph": "f", "id": 127449, "pid": 76337, "tid": -914061504, "ts": 1716454223389773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223434740, "dur": 12, "args": { "External id": 127450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127450, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 127450, "pid": 5, "tid": 7, "ts": 1716454223434740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389775, "dur": 11, "args": { "External id": 127450, "cbid": 211, "correlation": 127450 } }, { "ph": "s", "id": 127450, "pid": 76337, "tid": -914061504, "ts": 1716454223389775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223434753, "dur": 5, "args": { "External id": 127452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 127452, "pid": 5, "tid": 7, "ts": 1716454223434753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389788, "dur": 5, "args": { "External id": 127452, "cbid": 211, "correlation": 127452 } }, { "ph": "s", "id": 127452, "pid": 76337, "tid": -914061504, "ts": 1716454223389788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223434759, "dur": 29, "args": { "External id": 127462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127462, "pid": 5, "tid": 7, "ts": 1716454223434759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389847, "dur": 13, "args": { "External id": 127462, "cbid": 211, "correlation": 127462 } }, { "ph": "s", "id": 127462, "pid": 76337, "tid": -914061504, "ts": 1716454223389847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223434790, "dur": 32, "args": { "External id": 127482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127482, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 127482, "pid": 5, "tid": 7, "ts": 1716454223434790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389914, "dur": 11, "args": { "External id": 127482, "cbid": 211, "correlation": 127482 } }, { "ph": "s", "id": 127482, "pid": 76337, "tid": -914061504, "ts": 1716454223389914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223434823, "dur": 4, "args": { "External id": 127494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127494, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 127494, "pid": 5, "tid": 7, "ts": 1716454223434823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389935, "dur": 6, "args": { "External id": 127494, "cbid": 211, "correlation": 127494 } }, { "ph": "s", "id": 127494, "pid": 76337, "tid": -914061504, "ts": 1716454223389935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223434828, "dur": 30, "args": { "External id": 127497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127497, "pid": 5, "tid": 7, "ts": 1716454223434828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223389954, "dur": 6, "args": { "External id": 127497, "cbid": 211, "correlation": 127497 } }, { "ph": "s", "id": 127497, "pid": 76337, "tid": -914061504, "ts": 1716454223389954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223434859, "dur": 20, "args": { "External id": 127506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127506, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127506, "pid": 5, "tid": 7, "ts": 1716454223434859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390005, "dur": 10, "args": { "External id": 127506, "cbid": 211, "correlation": 127506 } }, { "ph": "s", "id": 127506, "pid": 76337, "tid": -914061504, "ts": 1716454223390005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223390069, "dur": 0, "args": { "External id": 127516, "cbid": 317, "correlation": 127516 } }, { "ph": "f", "id": 127516, "pid": 76337, "tid": -914061504, "ts": 1716454223390069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223390070, "dur": 0, "args": { "External id": 127517, "cbid": 203, "correlation": 127517 } }, { "ph": "f", "id": 127517, "pid": 76337, "tid": -914061504, "ts": 1716454223390070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223390071, "dur": 0, "args": { "External id": 127518, "cbid": 205, "correlation": 127518 } }, { "ph": "f", "id": 127518, "pid": 76337, "tid": -914061504, "ts": 1716454223390071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434880, "dur": 22, "args": { "External id": 127522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127522, "pid": 5, "tid": 7, "ts": 1716454223434880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390088, "dur": 13, "args": { "External id": 127522, "cbid": 211, "correlation": 127522 } }, { "ph": "s", "id": 127522, "pid": 76337, "tid": -914061504, "ts": 1716454223390088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223434904, "dur": 43, "args": { "External id": 127524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127524, "pid": 5, "tid": 7, "ts": 1716454223434904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390104, "dur": 5, "args": { "External id": 127524, "cbid": 211, "correlation": 127524 } }, { "ph": "s", "id": 127524, "pid": 76337, "tid": -914061504, "ts": 1716454223390104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223434949, "dur": 649, "args": { "External id": 127526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127526, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127526, "pid": 5, "tid": 7, "ts": 1716454223434949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390118, "dur": 9, "args": { "External id": 127526, "cbid": 211, "correlation": 127526 } }, { "ph": "s", "id": 127526, "pid": 76337, "tid": -914061504, "ts": 1716454223390118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223435599, "dur": 22, "args": { "External id": 127528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127528, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127528, "pid": 5, "tid": 7, "ts": 1716454223435599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390131, "dur": 5, "args": { "External id": 127528, "cbid": 211, "correlation": 127528 } }, { "ph": "s", "id": 127528, "pid": 76337, "tid": -914061504, "ts": 1716454223390131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223435623, "dur": 33, "args": { "External id": 127534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127534, "pid": 5, "tid": 7, "ts": 1716454223435623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390159, "dur": 8, "args": { "External id": 127534, "cbid": 211, "correlation": 127534 } }, { "ph": "s", "id": 127534, "pid": 76337, "tid": -914061504, "ts": 1716454223390159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223390216, "dur": 0, "args": { "External id": 127544, "cbid": 317, "correlation": 127544 } }, { "ph": "f", "id": 127544, "pid": 76337, "tid": -914061504, "ts": 1716454223390216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223390217, "dur": 0, "args": { "External id": 127545, "cbid": 203, "correlation": 127545 } }, { "ph": "f", "id": 127545, "pid": 76337, "tid": -914061504, "ts": 1716454223390217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223390218, "dur": 0, "args": { "External id": 127546, "cbid": 205, "correlation": 127546 } }, { "ph": "f", "id": 127546, "pid": 76337, "tid": -914061504, "ts": 1716454223390218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390239, "dur": 1, "args": { "External id": 127550, "cbid": 251, "correlation": 127550 } }, { "ph": "f", "id": 127550, "pid": 76337, "tid": -914061504, "ts": 1716454223390239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390241, "dur": 0, "args": { "External id": 127551, "cbid": 251, "correlation": 127551 } }, { "ph": "f", "id": 127551, "pid": 76337, "tid": -914061504, "ts": 1716454223390241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390242, "dur": 0, "args": { "External id": 127552, "cbid": 251, "correlation": 127552 } }, { "ph": "f", "id": 127552, "pid": 76337, "tid": -914061504, "ts": 1716454223390242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390243, "dur": 0, "args": { "External id": 127553, "cbid": 251, "correlation": 127553 } }, { "ph": "f", "id": 127553, "pid": 76337, "tid": -914061504, "ts": 1716454223390243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390243, "dur": 0, "args": { "External id": 127554, "cbid": 251, "correlation": 127554 } }, { "ph": "f", "id": 127554, "pid": 76337, "tid": -914061504, "ts": 1716454223390243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390244, "dur": 0, "args": { "External id": 127555, "cbid": 251, "correlation": 127555 } }, { "ph": "f", "id": 127555, "pid": 76337, "tid": -914061504, "ts": 1716454223390244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390245, "dur": 0, "args": { "External id": 127556, "cbid": 251, "correlation": 127556 } }, { "ph": "f", "id": 127556, "pid": 76337, "tid": -914061504, "ts": 1716454223390245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390245, "dur": 0, "args": { "External id": 127557, "cbid": 251, "correlation": 127557 } }, { "ph": "f", "id": 127557, "pid": 76337, "tid": -914061504, "ts": 1716454223390245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390247, "dur": 0, "args": { "External id": 127558, "cbid": 251, "correlation": 127558 } }, { "ph": "f", "id": 127558, "pid": 76337, "tid": -914061504, "ts": 1716454223390247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223435658, "dur": 51, "args": { "External id": 127559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127559, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127559, "pid": 5, "tid": 7, "ts": 1716454223435658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390249, "dur": 12, "args": { "External id": 127559, "cbid": 211, "correlation": 127559 } }, { "ph": "s", "id": 127559, "pid": 76337, "tid": -914061504, "ts": 1716454223390249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223435710, "dur": 32, "args": { "External id": 127565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127565, "pid": 5, "tid": 7, "ts": 1716454223435710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390283, "dur": 9, "args": { "External id": 127565, "cbid": 211, "correlation": 127565 } }, { "ph": "s", "id": 127565, "pid": 76337, "tid": -914061504, "ts": 1716454223390283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223435743, "dur": 27, "args": { "External id": 127573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127573, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127573, "pid": 5, "tid": 7, "ts": 1716454223435743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390313, "dur": 8, "args": { "External id": 127573, "cbid": 211, "correlation": 127573 } }, { "ph": "s", "id": 127573, "pid": 76337, "tid": -914061504, "ts": 1716454223390313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223435771, "dur": 19, "args": { "External id": 127581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127581, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127581, "pid": 5, "tid": 7, "ts": 1716454223435771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390342, "dur": 9, "args": { "External id": 127581, "cbid": 211, "correlation": 127581 } }, { "ph": "s", "id": 127581, "pid": 76337, "tid": -914061504, "ts": 1716454223390342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223435792, "dur": 29, "args": { "External id": 127601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127601, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 127601, "pid": 5, "tid": 7, "ts": 1716454223435792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390424, "dur": 12, "args": { "External id": 127601, "cbid": 211, "correlation": 127601 } }, { "ph": "s", "id": 127601, "pid": 76337, "tid": -914061504, "ts": 1716454223390424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223435823, "dur": 4, "args": { "External id": 127613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127613, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 127613, "pid": 5, "tid": 7, "ts": 1716454223435823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390446, "dur": 7, "args": { "External id": 127613, "cbid": 211, "correlation": 127613 } }, { "ph": "s", "id": 127613, "pid": 76337, "tid": -914061504, "ts": 1716454223390446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223435828, "dur": 30, "args": { "External id": 127616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127616, "pid": 5, "tid": 7, "ts": 1716454223435828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390464, "dur": 7, "args": { "External id": 127616, "cbid": 211, "correlation": 127616 } }, { "ph": "s", "id": 127616, "pid": 76337, "tid": -914061504, "ts": 1716454223390464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223390521, "dur": 0, "args": { "External id": 127627, "cbid": 317, "correlation": 127627 } }, { "ph": "f", "id": 127627, "pid": 76337, "tid": -914061504, "ts": 1716454223390521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223390522, "dur": 0, "args": { "External id": 127628, "cbid": 203, "correlation": 127628 } }, { "ph": "f", "id": 127628, "pid": 76337, "tid": -914061504, "ts": 1716454223390522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223390523, "dur": 0, "args": { "External id": 127629, "cbid": 205, "correlation": 127629 } }, { "ph": "f", "id": 127629, "pid": 76337, "tid": -914061504, "ts": 1716454223390523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223435859, "dur": 21, "args": { "External id": 127633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127633, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127633, "pid": 5, "tid": 7, "ts": 1716454223435859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390538, "dur": 12, "args": { "External id": 127633, "cbid": 211, "correlation": 127633 } }, { "ph": "s", "id": 127633, "pid": 76337, "tid": -914061504, "ts": 1716454223390538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223435882, "dur": 120, "args": { "External id": 127635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127635, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127635, "pid": 5, "tid": 7, "ts": 1716454223435882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390558, "dur": 8, "args": { "External id": 127635, "cbid": 211, "correlation": 127635 } }, { "ph": "s", "id": 127635, "pid": 76337, "tid": -914061504, "ts": 1716454223390558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223436003, "dur": 21, "args": { "External id": 127637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127637, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127637, "pid": 5, "tid": 7, "ts": 1716454223436003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390570, "dur": 5, "args": { "External id": 127637, "cbid": 211, "correlation": 127637 } }, { "ph": "s", "id": 127637, "pid": 76337, "tid": -914061504, "ts": 1716454223390570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223436026, "dur": 33, "args": { "External id": 127643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127643, "pid": 5, "tid": 7, "ts": 1716454223436026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390597, "dur": 9, "args": { "External id": 127643, "cbid": 211, "correlation": 127643 } }, { "ph": "s", "id": 127643, "pid": 76337, "tid": -914061504, "ts": 1716454223390597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223436060, "dur": 165, "args": { "External id": 127652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127652, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127652, "pid": 5, "tid": 7, "ts": 1716454223436060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390679, "dur": 14, "args": { "External id": 127652, "cbid": 211, "correlation": 127652 } }, { "ph": "s", "id": 127652, "pid": 76337, "tid": -914061504, "ts": 1716454223390679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223436226, "dur": 64, "args": { "External id": 127674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127674, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127674, "pid": 5, "tid": 7, "ts": 1716454223436226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390737, "dur": 10, "args": { "External id": 127674, "cbid": 211, "correlation": 127674 } }, { "ph": "s", "id": 127674, "pid": 76337, "tid": -914061504, "ts": 1716454223390737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390827, "dur": 1, "args": { "External id": 127685, "cbid": 251, "correlation": 127685 } }, { "ph": "f", "id": 127685, "pid": 76337, "tid": -914061504, "ts": 1716454223390827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223436291, "dur": 155, "args": { "External id": 127686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127686, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127686, "pid": 5, "tid": 7, "ts": 1716454223436291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390832, "dur": 13, "args": { "External id": 127686, "cbid": 211, "correlation": 127686 } }, { "ph": "s", "id": 127686, "pid": 76337, "tid": -914061504, "ts": 1716454223390832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390902, "dur": 1, "args": { "External id": 127697, "cbid": 251, "correlation": 127697 } }, { "ph": "f", "id": 127697, "pid": 76337, "tid": -914061504, "ts": 1716454223390902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223436448, "dur": 147, "args": { "External id": 127698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127698, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127698, "pid": 5, "tid": 7, "ts": 1716454223436448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390907, "dur": 11, "args": { "External id": 127698, "cbid": 211, "correlation": 127698 } }, { "ph": "s", "id": 127698, "pid": 76337, "tid": -914061504, "ts": 1716454223390907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223390971, "dur": 1, "args": { "External id": 127709, "cbid": 251, "correlation": 127709 } }, { "ph": "f", "id": 127709, "pid": 76337, "tid": -914061504, "ts": 1716454223390971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223436596, "dur": 144, "args": { "External id": 127710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127710, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127710, "pid": 5, "tid": 7, "ts": 1716454223436596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223390982, "dur": 12, "args": { "External id": 127710, "cbid": 211, "correlation": 127710 } }, { "ph": "s", "id": 127710, "pid": 76337, "tid": -914061504, "ts": 1716454223390982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223436741, "dur": 1935, "args": { "External id": 127731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127731, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127731, "pid": 5, "tid": 7, "ts": 1716454223436741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391068, "dur": 13, "args": { "External id": 127731, "cbid": 211, "correlation": 127731 } }, { "ph": "s", "id": 127731, "pid": 76337, "tid": -914061504, "ts": 1716454223391068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391168, "dur": 1, "args": { "External id": 127749, "cbid": 251, "correlation": 127749 } }, { "ph": "f", "id": 127749, "pid": 76337, "tid": -914061504, "ts": 1716454223391168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223438677, "dur": 146, "args": { "External id": 127751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127751, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 127751, "pid": 5, "tid": 7, "ts": 1716454223438677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391174, "dur": 13, "args": { "External id": 127751, "cbid": 211, "correlation": 127751 } }, { "ph": "s", "id": 127751, "pid": 76337, "tid": -914061504, "ts": 1716454223391174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223438824, "dur": 35, "args": { "External id": 127759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127759, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127759, "pid": 5, "tid": 7, "ts": 1716454223438824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391245, "dur": 13, "args": { "External id": 127759, "cbid": 211, "correlation": 127759 } }, { "ph": "s", "id": 127759, "pid": 76337, "tid": -914061504, "ts": 1716454223391245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223438861, "dur": 51, "args": { "External id": 127767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127767, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127767, "pid": 5, "tid": 7, "ts": 1716454223438861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391285, "dur": 8, "args": { "External id": 127767, "cbid": 211, "correlation": 127767 } }, { "ph": "s", "id": 127767, "pid": 76337, "tid": -914061504, "ts": 1716454223391285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223438913, "dur": 30, "args": { "External id": 127778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127778, "pid": 5, "tid": 7, "ts": 1716454223438913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391358, "dur": 13, "args": { "External id": 127778, "cbid": 211, "correlation": 127778 } }, { "ph": "s", "id": 127778, "pid": 76337, "tid": -914061504, "ts": 1716454223391358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223438945, "dur": 34, "args": { "External id": 127800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127800, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127800, "pid": 5, "tid": 7, "ts": 1716454223438945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391389, "dur": 7, "args": { "External id": 127800, "cbid": 211, "correlation": 127800 } }, { "ph": "s", "id": 127800, "pid": 76337, "tid": -914061504, "ts": 1716454223391389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391474, "dur": 1, "args": { "External id": 127811, "cbid": 251, "correlation": 127811 } }, { "ph": "f", "id": 127811, "pid": 76337, "tid": -914061504, "ts": 1716454223391474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223438980, "dur": 90, "args": { "External id": 127812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127812, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127812, "pid": 5, "tid": 7, "ts": 1716454223438980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391479, "dur": 12, "args": { "External id": 127812, "cbid": 211, "correlation": 127812 } }, { "ph": "s", "id": 127812, "pid": 76337, "tid": -914061504, "ts": 1716454223391479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391550, "dur": 1, "args": { "External id": 127823, "cbid": 251, "correlation": 127823 } }, { "ph": "f", "id": 127823, "pid": 76337, "tid": -914061504, "ts": 1716454223391550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391554, "dur": 0, "args": { "External id": 127824, "cbid": 251, "correlation": 127824 } }, { "ph": "f", "id": 127824, "pid": 76337, "tid": -914061504, "ts": 1716454223391554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223439071, "dur": 11, "args": { "External id": 127825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127825, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 127825, "pid": 5, "tid": 7, "ts": 1716454223439071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391556, "dur": 12, "args": { "External id": 127825, "cbid": 211, "correlation": 127825 } }, { "ph": "s", "id": 127825, "pid": 76337, "tid": -914061504, "ts": 1716454223391556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223439084, "dur": 5, "args": { "External id": 127827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127827, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 127827, "pid": 5, "tid": 7, "ts": 1716454223439084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391570, "dur": 7, "args": { "External id": 127827, "cbid": 211, "correlation": 127827 } }, { "ph": "s", "id": 127827, "pid": 76337, "tid": -914061504, "ts": 1716454223391570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391629, "dur": 1, "args": { "External id": 127838, "cbid": 251, "correlation": 127838 } }, { "ph": "f", "id": 127838, "pid": 76337, "tid": -914061504, "ts": 1716454223391629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391632, "dur": 0, "args": { "External id": 127839, "cbid": 251, "correlation": 127839 } }, { "ph": "f", "id": 127839, "pid": 76337, "tid": -914061504, "ts": 1716454223391632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223439091, "dur": 7, "args": { "External id": 127840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127840, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 127840, "pid": 5, "tid": 7, "ts": 1716454223439091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391633, "dur": 11, "args": { "External id": 127840, "cbid": 211, "correlation": 127840 } }, { "ph": "s", "id": 127840, "pid": 76337, "tid": -914061504, "ts": 1716454223391633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223439099, "dur": 3, "args": { "External id": 127842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127842, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 127842, "pid": 5, "tid": 7, "ts": 1716454223439099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391646, "dur": 6, "args": { "External id": 127842, "cbid": 211, "correlation": 127842 } }, { "ph": "s", "id": 127842, "pid": 76337, "tid": -914061504, "ts": 1716454223391646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223439104, "dur": 90, "args": { "External id": 127863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127863, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 127863, "pid": 5, "tid": 7, "ts": 1716454223439104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391720, "dur": 12, "args": { "External id": 127863, "cbid": 211, "correlation": 127863 } }, { "ph": "s", "id": 127863, "pid": 76337, "tid": -914061504, "ts": 1716454223391720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223391817, "dur": 1, "args": { "External id": 127881, "cbid": 251, "correlation": 127881 } }, { "ph": "f", "id": 127881, "pid": 76337, "tid": -914061504, "ts": 1716454223391817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223439195, "dur": 96, "args": { "External id": 127883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127883, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127883, "pid": 5, "tid": 7, "ts": 1716454223439195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391823, "dur": 13, "args": { "External id": 127883, "cbid": 211, "correlation": 127883 } }, { "ph": "s", "id": 127883, "pid": 76337, "tid": -914061504, "ts": 1716454223391823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223439292, "dur": 19, "args": { "External id": 127891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127891, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127891, "pid": 5, "tid": 7, "ts": 1716454223439292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391891, "dur": 13, "args": { "External id": 127891, "cbid": 211, "correlation": 127891 } }, { "ph": "s", "id": 127891, "pid": 76337, "tid": -914061504, "ts": 1716454223391891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223439313, "dur": 37, "args": { "External id": 127899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127899, "pid": 5, "tid": 7, "ts": 1716454223439313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391933, "dur": 9, "args": { "External id": 127899, "cbid": 211, "correlation": 127899 } }, { "ph": "s", "id": 127899, "pid": 76337, "tid": -914061504, "ts": 1716454223391933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223439351, "dur": 35, "args": { "External id": 127921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127921, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127921, "pid": 5, "tid": 7, "ts": 1716454223439351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223391991, "dur": 11, "args": { "External id": 127921, "cbid": 211, "correlation": 127921 } }, { "ph": "s", "id": 127921, "pid": 76337, "tid": -914061504, "ts": 1716454223391991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223392081, "dur": 1, "args": { "External id": 127937, "cbid": 251, "correlation": 127937 } }, { "ph": "f", "id": 127937, "pid": 76337, "tid": -914061504, "ts": 1716454223392081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223392086, "dur": 0, "args": { "External id": 127939, "cbid": 251, "correlation": 127939 } }, { "ph": "f", "id": 127939, "pid": 76337, "tid": -914061504, "ts": 1716454223392086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223439387, "dur": 537, "args": { "External id": 127940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127940, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 127940, "pid": 5, "tid": 7, "ts": 1716454223439387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392090, "dur": 13, "args": { "External id": 127940, "cbid": 211, "correlation": 127940 } }, { "ph": "s", "id": 127940, "pid": 76337, "tid": -914061504, "ts": 1716454223392090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223439926, "dur": 126, "args": { "External id": 127948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127948, "pid": 5, "tid": 7, "ts": 1716454223439926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392157, "dur": 12, "args": { "External id": 127948, "cbid": 211, "correlation": 127948 } }, { "ph": "s", "id": 127948, "pid": 76337, "tid": -914061504, "ts": 1716454223392157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223440053, "dur": 126, "args": { "External id": 127956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127956, "pid": 5, "tid": 7, "ts": 1716454223440053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392186, "dur": 8, "args": { "External id": 127956, "cbid": 211, "correlation": 127956 } }, { "ph": "s", "id": 127956, "pid": 76337, "tid": -914061504, "ts": 1716454223392186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223392265, "dur": 1, "args": { "External id": 127972, "cbid": 251, "correlation": 127972 } }, { "ph": "f", "id": 127972, "pid": 76337, "tid": -914061504, "ts": 1716454223392265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223440181, "dur": 299, "args": { "External id": 127974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127974, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 127974, "pid": 5, "tid": 7, "ts": 1716454223440181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392271, "dur": 13, "args": { "External id": 127974, "cbid": 211, "correlation": 127974 } }, { "ph": "s", "id": 127974, "pid": 76337, "tid": -914061504, "ts": 1716454223392271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223440481, "dur": 28, "args": { "External id": 127982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127982, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127982, "pid": 5, "tid": 7, "ts": 1716454223440481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392314, "dur": 10, "args": { "External id": 127982, "cbid": 211, "correlation": 127982 } }, { "ph": "s", "id": 127982, "pid": 76337, "tid": -914061504, "ts": 1716454223392314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223440510, "dur": 81, "args": { "External id": 127993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 127993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 127993, "pid": 5, "tid": 7, "ts": 1716454223440510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392383, "dur": 12, "args": { "External id": 127993, "cbid": 211, "correlation": 127993 } }, { "ph": "s", "id": 127993, "pid": 76337, "tid": -914061504, "ts": 1716454223392383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223392446, "dur": 0, "args": { "External id": 128005, "cbid": 317, "correlation": 128005 } }, { "ph": "f", "id": 128005, "pid": 76337, "tid": -914061504, "ts": 1716454223392446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223392447, "dur": 0, "args": { "External id": 128006, "cbid": 203, "correlation": 128006 } }, { "ph": "f", "id": 128006, "pid": 76337, "tid": -914061504, "ts": 1716454223392447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223392448, "dur": 0, "args": { "External id": 128007, "cbid": 205, "correlation": 128007 } }, { "ph": "f", "id": 128007, "pid": 76337, "tid": -914061504, "ts": 1716454223392448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223440592, "dur": 24, "args": { "External id": 128011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128011, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128011, "pid": 5, "tid": 7, "ts": 1716454223440592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392463, "dur": 12, "args": { "External id": 128011, "cbid": 211, "correlation": 128011 } }, { "ph": "s", "id": 128011, "pid": 76337, "tid": -914061504, "ts": 1716454223392463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223440617, "dur": 119, "args": { "External id": 128013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128013, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128013, "pid": 5, "tid": 7, "ts": 1716454223440617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392481, "dur": 7, "args": { "External id": 128013, "cbid": 211, "correlation": 128013 } }, { "ph": "s", "id": 128013, "pid": 76337, "tid": -914061504, "ts": 1716454223392481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223440737, "dur": 22, "args": { "External id": 128015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128015, "pid": 5, "tid": 7, "ts": 1716454223440737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392492, "dur": 5, "args": { "External id": 128015, "cbid": 211, "correlation": 128015 } }, { "ph": "s", "id": 128015, "pid": 76337, "tid": -914061504, "ts": 1716454223392492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223440761, "dur": 32, "args": { "External id": 128021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128021, "pid": 5, "tid": 7, "ts": 1716454223440761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392520, "dur": 8, "args": { "External id": 128021, "cbid": 211, "correlation": 128021 } }, { "ph": "s", "id": 128021, "pid": 76337, "tid": -914061504, "ts": 1716454223392520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223440795, "dur": 27, "args": { "External id": 128029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128029, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128029, "pid": 5, "tid": 7, "ts": 1716454223440795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392551, "dur": 8, "args": { "External id": 128029, "cbid": 211, "correlation": 128029 } }, { "ph": "s", "id": 128029, "pid": 76337, "tid": -914061504, "ts": 1716454223392551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223440823, "dur": 30, "args": { "External id": 128049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128049, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128049, "pid": 5, "tid": 7, "ts": 1716454223440823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392623, "dur": 12, "args": { "External id": 128049, "cbid": 211, "correlation": 128049 } }, { "ph": "s", "id": 128049, "pid": 76337, "tid": -914061504, "ts": 1716454223392623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223440854, "dur": 5, "args": { "External id": 128061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128061, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 128061, "pid": 5, "tid": 7, "ts": 1716454223440854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392646, "dur": 7, "args": { "External id": 128061, "cbid": 211, "correlation": 128061 } }, { "ph": "s", "id": 128061, "pid": 76337, "tid": -914061504, "ts": 1716454223392646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223440860, "dur": 31, "args": { "External id": 128064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128064, "pid": 5, "tid": 7, "ts": 1716454223440860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392664, "dur": 8, "args": { "External id": 128064, "cbid": 211, "correlation": 128064 } }, { "ph": "s", "id": 128064, "pid": 76337, "tid": -914061504, "ts": 1716454223392664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223440893, "dur": 22, "args": { "External id": 128073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128073, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128073, "pid": 5, "tid": 7, "ts": 1716454223440893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392704, "dur": 9, "args": { "External id": 128073, "cbid": 211, "correlation": 128073 } }, { "ph": "s", "id": 128073, "pid": 76337, "tid": -914061504, "ts": 1716454223392704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223392755, "dur": 0, "args": { "External id": 128083, "cbid": 317, "correlation": 128083 } }, { "ph": "f", "id": 128083, "pid": 76337, "tid": -914061504, "ts": 1716454223392755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223392756, "dur": 0, "args": { "External id": 128084, "cbid": 203, "correlation": 128084 } }, { "ph": "f", "id": 128084, "pid": 76337, "tid": -914061504, "ts": 1716454223392756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223392757, "dur": 0, "args": { "External id": 128085, "cbid": 205, "correlation": 128085 } }, { "ph": "f", "id": 128085, "pid": 76337, "tid": -914061504, "ts": 1716454223392757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223440916, "dur": 23, "args": { "External id": 128089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128089, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128089, "pid": 5, "tid": 7, "ts": 1716454223440916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392770, "dur": 11, "args": { "External id": 128089, "cbid": 211, "correlation": 128089 } }, { "ph": "s", "id": 128089, "pid": 76337, "tid": -914061504, "ts": 1716454223392770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223440940, "dur": 43, "args": { "External id": 128091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128091, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128091, "pid": 5, "tid": 7, "ts": 1716454223440940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392784, "dur": 5, "args": { "External id": 128091, "cbid": 211, "correlation": 128091 } }, { "ph": "s", "id": 128091, "pid": 76337, "tid": -914061504, "ts": 1716454223392784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223440985, "dur": 654, "args": { "External id": 128093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128093, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128093, "pid": 5, "tid": 7, "ts": 1716454223440985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392795, "dur": 6, "args": { "External id": 128093, "cbid": 211, "correlation": 128093 } }, { "ph": "s", "id": 128093, "pid": 76337, "tid": -914061504, "ts": 1716454223392795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223441640, "dur": 23, "args": { "External id": 128095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128095, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128095, "pid": 5, "tid": 7, "ts": 1716454223441640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392805, "dur": 5, "args": { "External id": 128095, "cbid": 211, "correlation": 128095 } }, { "ph": "s", "id": 128095, "pid": 76337, "tid": -914061504, "ts": 1716454223392805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223441664, "dur": 33, "args": { "External id": 128101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128101, "pid": 5, "tid": 7, "ts": 1716454223441664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392832, "dur": 9, "args": { "External id": 128101, "cbid": 211, "correlation": 128101 } }, { "ph": "s", "id": 128101, "pid": 76337, "tid": -914061504, "ts": 1716454223392832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223441698, "dur": 3, "args": { "External id": 128109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128109, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 128109, "pid": 5, "tid": 7, "ts": 1716454223441698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392876, "dur": 9, "args": { "External id": 128109, "cbid": 211, "correlation": 128109 } }, { "ph": "s", "id": 128109, "pid": 76337, "tid": -914061504, "ts": 1716454223392876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223392941, "dur": 1, "args": { "External id": 128125, "cbid": 251, "correlation": 128125 } }, { "ph": "f", "id": 128125, "pid": 76337, "tid": -914061504, "ts": 1716454223392941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223392946, "dur": 0, "args": { "External id": 128127, "cbid": 251, "correlation": 128127 } }, { "ph": "f", "id": 128127, "pid": 76337, "tid": -914061504, "ts": 1716454223392946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223441703, "dur": 11, "args": { "External id": 128128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128128, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 128128, "pid": 5, "tid": 7, "ts": 1716454223441703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392948, "dur": 11, "args": { "External id": 128128, "cbid": 211, "correlation": 128128 } }, { "ph": "s", "id": 128128, "pid": 76337, "tid": -914061504, "ts": 1716454223392948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223441716, "dur": 5, "args": { "External id": 128130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 128130, "pid": 5, "tid": 7, "ts": 1716454223441716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223392961, "dur": 5, "args": { "External id": 128130, "cbid": 211, "correlation": 128130 } }, { "ph": "s", "id": 128130, "pid": 76337, "tid": -914061504, "ts": 1716454223392961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223441722, "dur": 29, "args": { "External id": 128140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128140, "pid": 5, "tid": 7, "ts": 1716454223441722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393027, "dur": 13, "args": { "External id": 128140, "cbid": 211, "correlation": 128140 } }, { "ph": "s", "id": 128140, "pid": 76337, "tid": -914061504, "ts": 1716454223393027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223441753, "dur": 31, "args": { "External id": 128160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128160, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128160, "pid": 5, "tid": 7, "ts": 1716454223441753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393093, "dur": 12, "args": { "External id": 128160, "cbid": 211, "correlation": 128160 } }, { "ph": "s", "id": 128160, "pid": 76337, "tid": -914061504, "ts": 1716454223393093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223441785, "dur": 4, "args": { "External id": 128172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128172, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 128172, "pid": 5, "tid": 7, "ts": 1716454223441785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393115, "dur": 6, "args": { "External id": 128172, "cbid": 211, "correlation": 128172 } }, { "ph": "s", "id": 128172, "pid": 76337, "tid": -914061504, "ts": 1716454223393115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223441790, "dur": 29, "args": { "External id": 128175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128175, "pid": 5, "tid": 7, "ts": 1716454223441790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393133, "dur": 7, "args": { "External id": 128175, "cbid": 211, "correlation": 128175 } }, { "ph": "s", "id": 128175, "pid": 76337, "tid": -914061504, "ts": 1716454223393133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223441820, "dur": 19, "args": { "External id": 128184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128184, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128184, "pid": 5, "tid": 7, "ts": 1716454223441820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393175, "dur": 10, "args": { "External id": 128184, "cbid": 211, "correlation": 128184 } }, { "ph": "s", "id": 128184, "pid": 76337, "tid": -914061504, "ts": 1716454223393175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223393237, "dur": 0, "args": { "External id": 128194, "cbid": 317, "correlation": 128194 } }, { "ph": "f", "id": 128194, "pid": 76337, "tid": -914061504, "ts": 1716454223393237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223393238, "dur": 0, "args": { "External id": 128195, "cbid": 203, "correlation": 128195 } }, { "ph": "f", "id": 128195, "pid": 76337, "tid": -914061504, "ts": 1716454223393238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223393239, "dur": 0, "args": { "External id": 128196, "cbid": 205, "correlation": 128196 } }, { "ph": "f", "id": 128196, "pid": 76337, "tid": -914061504, "ts": 1716454223393239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223441841, "dur": 24, "args": { "External id": 128200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128200, "pid": 5, "tid": 7, "ts": 1716454223441841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393252, "dur": 12, "args": { "External id": 128200, "cbid": 211, "correlation": 128200 } }, { "ph": "s", "id": 128200, "pid": 76337, "tid": -914061504, "ts": 1716454223393252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223441867, "dur": 44, "args": { "External id": 128202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128202, "pid": 5, "tid": 7, "ts": 1716454223441867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393267, "dur": 5, "args": { "External id": 128202, "cbid": 211, "correlation": 128202 } }, { "ph": "s", "id": 128202, "pid": 76337, "tid": -914061504, "ts": 1716454223393267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223441912, "dur": 644, "args": { "External id": 128204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128204, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128204, "pid": 5, "tid": 7, "ts": 1716454223441912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393278, "dur": 6, "args": { "External id": 128204, "cbid": 211, "correlation": 128204 } }, { "ph": "s", "id": 128204, "pid": 76337, "tid": -914061504, "ts": 1716454223393278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223442557, "dur": 22, "args": { "External id": 128206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128206, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128206, "pid": 5, "tid": 7, "ts": 1716454223442557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393288, "dur": 5, "args": { "External id": 128206, "cbid": 211, "correlation": 128206 } }, { "ph": "s", "id": 128206, "pid": 76337, "tid": -914061504, "ts": 1716454223393288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223442580, "dur": 33, "args": { "External id": 128212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128212, "pid": 5, "tid": 7, "ts": 1716454223442580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393316, "dur": 8, "args": { "External id": 128212, "cbid": 211, "correlation": 128212 } }, { "ph": "s", "id": 128212, "pid": 76337, "tid": -914061504, "ts": 1716454223393316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223442615, "dur": 28, "args": { "External id": 128220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128220, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128220, "pid": 5, "tid": 7, "ts": 1716454223442615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393348, "dur": 9, "args": { "External id": 128220, "cbid": 211, "correlation": 128220 } }, { "ph": "s", "id": 128220, "pid": 76337, "tid": -914061504, "ts": 1716454223393348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223442644, "dur": 20, "args": { "External id": 128228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128228, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128228, "pid": 5, "tid": 7, "ts": 1716454223442644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393377, "dur": 8, "args": { "External id": 128228, "cbid": 211, "correlation": 128228 } }, { "ph": "s", "id": 128228, "pid": 76337, "tid": -914061504, "ts": 1716454223393377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223442665, "dur": 30, "args": { "External id": 128248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128248, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128248, "pid": 5, "tid": 7, "ts": 1716454223442665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393456, "dur": 13, "args": { "External id": 128248, "cbid": 211, "correlation": 128248 } }, { "ph": "s", "id": 128248, "pid": 76337, "tid": -914061504, "ts": 1716454223393456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223442696, "dur": 4, "args": { "External id": 128260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128260, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 128260, "pid": 5, "tid": 7, "ts": 1716454223442696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393478, "dur": 6, "args": { "External id": 128260, "cbid": 211, "correlation": 128260 } }, { "ph": "s", "id": 128260, "pid": 76337, "tid": -914061504, "ts": 1716454223393478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223442702, "dur": 30, "args": { "External id": 128263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128263, "pid": 5, "tid": 7, "ts": 1716454223442702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393495, "dur": 7, "args": { "External id": 128263, "cbid": 211, "correlation": 128263 } }, { "ph": "s", "id": 128263, "pid": 76337, "tid": -914061504, "ts": 1716454223393495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223393553, "dur": 0, "args": { "External id": 128274, "cbid": 317, "correlation": 128274 } }, { "ph": "f", "id": 128274, "pid": 76337, "tid": -914061504, "ts": 1716454223393553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223393554, "dur": 0, "args": { "External id": 128275, "cbid": 203, "correlation": 128275 } }, { "ph": "f", "id": 128275, "pid": 76337, "tid": -914061504, "ts": 1716454223393554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223393555, "dur": 0, "args": { "External id": 128276, "cbid": 205, "correlation": 128276 } }, { "ph": "f", "id": 128276, "pid": 76337, "tid": -914061504, "ts": 1716454223393555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223442733, "dur": 22, "args": { "External id": 128280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128280, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128280, "pid": 5, "tid": 7, "ts": 1716454223442733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393568, "dur": 12, "args": { "External id": 128280, "cbid": 211, "correlation": 128280 } }, { "ph": "s", "id": 128280, "pid": 76337, "tid": -914061504, "ts": 1716454223393568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223442757, "dur": 116, "args": { "External id": 128282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128282, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128282, "pid": 5, "tid": 7, "ts": 1716454223442757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393586, "dur": 6, "args": { "External id": 128282, "cbid": 211, "correlation": 128282 } }, { "ph": "s", "id": 128282, "pid": 76337, "tid": -914061504, "ts": 1716454223393586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223442874, "dur": 23, "args": { "External id": 128284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128284, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128284, "pid": 5, "tid": 7, "ts": 1716454223442874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393595, "dur": 5, "args": { "External id": 128284, "cbid": 211, "correlation": 128284 } }, { "ph": "s", "id": 128284, "pid": 76337, "tid": -914061504, "ts": 1716454223393595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223442898, "dur": 33, "args": { "External id": 128290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128290, "pid": 5, "tid": 7, "ts": 1716454223442898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393623, "dur": 8, "args": { "External id": 128290, "cbid": 211, "correlation": 128290 } }, { "ph": "s", "id": 128290, "pid": 76337, "tid": -914061504, "ts": 1716454223393623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223442933, "dur": 163, "args": { "External id": 128299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128299, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128299, "pid": 5, "tid": 7, "ts": 1716454223442933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393706, "dur": 14, "args": { "External id": 128299, "cbid": 211, "correlation": 128299 } }, { "ph": "s", "id": 128299, "pid": 76337, "tid": -914061504, "ts": 1716454223393706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223443097, "dur": 64, "args": { "External id": 128321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128321, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128321, "pid": 5, "tid": 7, "ts": 1716454223443097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393764, "dur": 10, "args": { "External id": 128321, "cbid": 211, "correlation": 128321 } }, { "ph": "s", "id": 128321, "pid": 76337, "tid": -914061504, "ts": 1716454223393764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223393851, "dur": 1, "args": { "External id": 128332, "cbid": 251, "correlation": 128332 } }, { "ph": "f", "id": 128332, "pid": 76337, "tid": -914061504, "ts": 1716454223393851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223443163, "dur": 155, "args": { "External id": 128333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128333, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128333, "pid": 5, "tid": 7, "ts": 1716454223443163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393856, "dur": 14, "args": { "External id": 128333, "cbid": 211, "correlation": 128333 } }, { "ph": "s", "id": 128333, "pid": 76337, "tid": -914061504, "ts": 1716454223393856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223393927, "dur": 1, "args": { "External id": 128344, "cbid": 251, "correlation": 128344 } }, { "ph": "f", "id": 128344, "pid": 76337, "tid": -914061504, "ts": 1716454223393927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223443319, "dur": 145, "args": { "External id": 128345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128345, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128345, "pid": 5, "tid": 7, "ts": 1716454223443319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223393931, "dur": 11, "args": { "External id": 128345, "cbid": 211, "correlation": 128345 } }, { "ph": "s", "id": 128345, "pid": 76337, "tid": -914061504, "ts": 1716454223393931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394004, "dur": 1, "args": { "External id": 128356, "cbid": 251, "correlation": 128356 } }, { "ph": "f", "id": 128356, "pid": 76337, "tid": -914061504, "ts": 1716454223394004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223443465, "dur": 141, "args": { "External id": 128357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128357, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128357, "pid": 5, "tid": 7, "ts": 1716454223443465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394008, "dur": 12, "args": { "External id": 128357, "cbid": 211, "correlation": 128357 } }, { "ph": "s", "id": 128357, "pid": 76337, "tid": -914061504, "ts": 1716454223394008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223443607, "dur": 1937, "args": { "External id": 128378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128378, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 128378, "pid": 5, "tid": 7, "ts": 1716454223443607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394090, "dur": 13, "args": { "External id": 128378, "cbid": 211, "correlation": 128378 } }, { "ph": "s", "id": 128378, "pid": 76337, "tid": -914061504, "ts": 1716454223394090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394189, "dur": 1, "args": { "External id": 128396, "cbid": 251, "correlation": 128396 } }, { "ph": "f", "id": 128396, "pid": 76337, "tid": -914061504, "ts": 1716454223394189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223445546, "dur": 147, "args": { "External id": 128398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128398, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 128398, "pid": 5, "tid": 7, "ts": 1716454223445546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394194, "dur": 13, "args": { "External id": 128398, "cbid": 211, "correlation": 128398 } }, { "ph": "s", "id": 128398, "pid": 76337, "tid": -914061504, "ts": 1716454223394194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223445694, "dur": 36, "args": { "External id": 128406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128406, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128406, "pid": 5, "tid": 7, "ts": 1716454223445694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394266, "dur": 12, "args": { "External id": 128406, "cbid": 211, "correlation": 128406 } }, { "ph": "s", "id": 128406, "pid": 76337, "tid": -914061504, "ts": 1716454223394266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223445731, "dur": 50, "args": { "External id": 128414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128414, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128414, "pid": 5, "tid": 7, "ts": 1716454223445731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394304, "dur": 9, "args": { "External id": 128414, "cbid": 211, "correlation": 128414 } }, { "ph": "s", "id": 128414, "pid": 76337, "tid": -914061504, "ts": 1716454223394304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223445783, "dur": 30, "args": { "External id": 128425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128425, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128425, "pid": 5, "tid": 7, "ts": 1716454223445783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394376, "dur": 12, "args": { "External id": 128425, "cbid": 211, "correlation": 128425 } }, { "ph": "s", "id": 128425, "pid": 76337, "tid": -914061504, "ts": 1716454223394376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223445814, "dur": 34, "args": { "External id": 128447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128447, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128447, "pid": 5, "tid": 7, "ts": 1716454223445814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394406, "dur": 8, "args": { "External id": 128447, "cbid": 211, "correlation": 128447 } }, { "ph": "s", "id": 128447, "pid": 76337, "tid": -914061504, "ts": 1716454223394406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394491, "dur": 1, "args": { "External id": 128458, "cbid": 251, "correlation": 128458 } }, { "ph": "f", "id": 128458, "pid": 76337, "tid": -914061504, "ts": 1716454223394491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223445849, "dur": 91, "args": { "External id": 128459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128459, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128459, "pid": 5, "tid": 7, "ts": 1716454223445849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394496, "dur": 13, "args": { "External id": 128459, "cbid": 211, "correlation": 128459 } }, { "ph": "s", "id": 128459, "pid": 76337, "tid": -914061504, "ts": 1716454223394496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394565, "dur": 1, "args": { "External id": 128470, "cbid": 251, "correlation": 128470 } }, { "ph": "f", "id": 128470, "pid": 76337, "tid": -914061504, "ts": 1716454223394565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394568, "dur": 0, "args": { "External id": 128471, "cbid": 251, "correlation": 128471 } }, { "ph": "f", "id": 128471, "pid": 76337, "tid": -914061504, "ts": 1716454223394568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223445942, "dur": 11, "args": { "External id": 128472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128472, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 128472, "pid": 5, "tid": 7, "ts": 1716454223445942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394570, "dur": 12, "args": { "External id": 128472, "cbid": 211, "correlation": 128472 } }, { "ph": "s", "id": 128472, "pid": 76337, "tid": -914061504, "ts": 1716454223394570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223445954, "dur": 5, "args": { "External id": 128474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128474, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 128474, "pid": 5, "tid": 7, "ts": 1716454223445954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394583, "dur": 6, "args": { "External id": 128474, "cbid": 211, "correlation": 128474 } }, { "ph": "s", "id": 128474, "pid": 76337, "tid": -914061504, "ts": 1716454223394583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394641, "dur": 1, "args": { "External id": 128485, "cbid": 251, "correlation": 128485 } }, { "ph": "f", "id": 128485, "pid": 76337, "tid": -914061504, "ts": 1716454223394641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394645, "dur": 0, "args": { "External id": 128486, "cbid": 251, "correlation": 128486 } }, { "ph": "f", "id": 128486, "pid": 76337, "tid": -914061504, "ts": 1716454223394645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223445960, "dur": 7, "args": { "External id": 128487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128487, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 128487, "pid": 5, "tid": 7, "ts": 1716454223445960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394646, "dur": 12, "args": { "External id": 128487, "cbid": 211, "correlation": 128487 } }, { "ph": "s", "id": 128487, "pid": 76337, "tid": -914061504, "ts": 1716454223394646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223445969, "dur": 3, "args": { "External id": 128489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128489, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 128489, "pid": 5, "tid": 7, "ts": 1716454223445969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394660, "dur": 6, "args": { "External id": 128489, "cbid": 211, "correlation": 128489 } }, { "ph": "s", "id": 128489, "pid": 76337, "tid": -914061504, "ts": 1716454223394660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223445973, "dur": 92, "args": { "External id": 128510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128510, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 128510, "pid": 5, "tid": 7, "ts": 1716454223445973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394732, "dur": 12, "args": { "External id": 128510, "cbid": 211, "correlation": 128510 } }, { "ph": "s", "id": 128510, "pid": 76337, "tid": -914061504, "ts": 1716454223394732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223394828, "dur": 1, "args": { "External id": 128528, "cbid": 251, "correlation": 128528 } }, { "ph": "f", "id": 128528, "pid": 76337, "tid": -914061504, "ts": 1716454223394828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223446067, "dur": 98, "args": { "External id": 128530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128530, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128530, "pid": 5, "tid": 7, "ts": 1716454223446067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394833, "dur": 13, "args": { "External id": 128530, "cbid": 211, "correlation": 128530 } }, { "ph": "s", "id": 128530, "pid": 76337, "tid": -914061504, "ts": 1716454223394833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223446165, "dur": 20, "args": { "External id": 128538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128538, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128538, "pid": 5, "tid": 7, "ts": 1716454223446165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394903, "dur": 13, "args": { "External id": 128538, "cbid": 211, "correlation": 128538 } }, { "ph": "s", "id": 128538, "pid": 76337, "tid": -914061504, "ts": 1716454223394903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223446186, "dur": 38, "args": { "External id": 128546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128546, "pid": 5, "tid": 7, "ts": 1716454223446186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223394945, "dur": 9, "args": { "External id": 128546, "cbid": 211, "correlation": 128546 } }, { "ph": "s", "id": 128546, "pid": 76337, "tid": -914061504, "ts": 1716454223394945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223446226, "dur": 34, "args": { "External id": 128568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128568, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128568, "pid": 5, "tid": 7, "ts": 1716454223446226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395004, "dur": 11, "args": { "External id": 128568, "cbid": 211, "correlation": 128568 } }, { "ph": "s", "id": 128568, "pid": 76337, "tid": -914061504, "ts": 1716454223395004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223395094, "dur": 1, "args": { "External id": 128584, "cbid": 251, "correlation": 128584 } }, { "ph": "f", "id": 128584, "pid": 76337, "tid": -914061504, "ts": 1716454223395094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223395099, "dur": 0, "args": { "External id": 128586, "cbid": 251, "correlation": 128586 } }, { "ph": "f", "id": 128586, "pid": 76337, "tid": -914061504, "ts": 1716454223395099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223446261, "dur": 535, "args": { "External id": 128587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128587, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 128587, "pid": 5, "tid": 7, "ts": 1716454223446261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395102, "dur": 14, "args": { "External id": 128587, "cbid": 211, "correlation": 128587 } }, { "ph": "s", "id": 128587, "pid": 76337, "tid": -914061504, "ts": 1716454223395102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223446797, "dur": 125, "args": { "External id": 128595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128595, "pid": 5, "tid": 7, "ts": 1716454223446797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395168, "dur": 12, "args": { "External id": 128595, "cbid": 211, "correlation": 128595 } }, { "ph": "s", "id": 128595, "pid": 76337, "tid": -914061504, "ts": 1716454223395168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223446924, "dur": 128, "args": { "External id": 128603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128603, "pid": 5, "tid": 7, "ts": 1716454223446924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395198, "dur": 8, "args": { "External id": 128603, "cbid": 211, "correlation": 128603 } }, { "ph": "s", "id": 128603, "pid": 76337, "tid": -914061504, "ts": 1716454223395198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223395274, "dur": 1, "args": { "External id": 128619, "cbid": 251, "correlation": 128619 } }, { "ph": "f", "id": 128619, "pid": 76337, "tid": -914061504, "ts": 1716454223395274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223447053, "dur": 301, "args": { "External id": 128621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128621, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128621, "pid": 5, "tid": 7, "ts": 1716454223447053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395280, "dur": 12, "args": { "External id": 128621, "cbid": 211, "correlation": 128621 } }, { "ph": "s", "id": 128621, "pid": 76337, "tid": -914061504, "ts": 1716454223395280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223447356, "dur": 27, "args": { "External id": 128629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128629, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128629, "pid": 5, "tid": 7, "ts": 1716454223447356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395323, "dur": 9, "args": { "External id": 128629, "cbid": 211, "correlation": 128629 } }, { "ph": "s", "id": 128629, "pid": 76337, "tid": -914061504, "ts": 1716454223395323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223447385, "dur": 81, "args": { "External id": 128640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128640, "pid": 5, "tid": 7, "ts": 1716454223447385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395390, "dur": 12, "args": { "External id": 128640, "cbid": 211, "correlation": 128640 } }, { "ph": "s", "id": 128640, "pid": 76337, "tid": -914061504, "ts": 1716454223395390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223395454, "dur": 0, "args": { "External id": 128652, "cbid": 317, "correlation": 128652 } }, { "ph": "f", "id": 128652, "pid": 76337, "tid": -914061504, "ts": 1716454223395454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223395455, "dur": 0, "args": { "External id": 128653, "cbid": 203, "correlation": 128653 } }, { "ph": "f", "id": 128653, "pid": 76337, "tid": -914061504, "ts": 1716454223395455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223395455, "dur": 0, "args": { "External id": 128654, "cbid": 205, "correlation": 128654 } }, { "ph": "f", "id": 128654, "pid": 76337, "tid": -914061504, "ts": 1716454223395455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223447467, "dur": 23, "args": { "External id": 128658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128658, "pid": 5, "tid": 7, "ts": 1716454223447467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395471, "dur": 12, "args": { "External id": 128658, "cbid": 211, "correlation": 128658 } }, { "ph": "s", "id": 128658, "pid": 76337, "tid": -914061504, "ts": 1716454223395471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223447491, "dur": 121, "args": { "External id": 128660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128660, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128660, "pid": 5, "tid": 7, "ts": 1716454223447491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395489, "dur": 6, "args": { "External id": 128660, "cbid": 211, "correlation": 128660 } }, { "ph": "s", "id": 128660, "pid": 76337, "tid": -914061504, "ts": 1716454223395489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223447613, "dur": 23, "args": { "External id": 128662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128662, "pid": 5, "tid": 7, "ts": 1716454223447613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395499, "dur": 5, "args": { "External id": 128662, "cbid": 211, "correlation": 128662 } }, { "ph": "s", "id": 128662, "pid": 76337, "tid": -914061504, "ts": 1716454223395499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223447638, "dur": 32, "args": { "External id": 128668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128668, "pid": 5, "tid": 7, "ts": 1716454223447638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395527, "dur": 8, "args": { "External id": 128668, "cbid": 211, "correlation": 128668 } }, { "ph": "s", "id": 128668, "pid": 76337, "tid": -914061504, "ts": 1716454223395527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223447671, "dur": 27, "args": { "External id": 128676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128676, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128676, "pid": 5, "tid": 7, "ts": 1716454223447671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395558, "dur": 9, "args": { "External id": 128676, "cbid": 211, "correlation": 128676 } }, { "ph": "s", "id": 128676, "pid": 76337, "tid": -914061504, "ts": 1716454223395558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223395629, "dur": 0, "args": { "External id": 128686, "cbid": 317, "correlation": 128686 } }, { "ph": "f", "id": 128686, "pid": 76337, "tid": -914061504, "ts": 1716454223395629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223395630, "dur": 0, "args": { "External id": 128687, "cbid": 203, "correlation": 128687 } }, { "ph": "f", "id": 128687, "pid": 76337, "tid": -914061504, "ts": 1716454223395630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223395631, "dur": 0, "args": { "External id": 128688, "cbid": 205, "correlation": 128688 } }, { "ph": "f", "id": 128688, "pid": 76337, "tid": -914061504, "ts": 1716454223395631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223447700, "dur": 24, "args": { "External id": 128692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128692, "pid": 5, "tid": 7, "ts": 1716454223447700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395645, "dur": 12, "args": { "External id": 128692, "cbid": 211, "correlation": 128692 } }, { "ph": "s", "id": 128692, "pid": 76337, "tid": -914061504, "ts": 1716454223395645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223447724, "dur": 44, "args": { "External id": 128694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128694, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128694, "pid": 5, "tid": 7, "ts": 1716454223447724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395659, "dur": 6, "args": { "External id": 128694, "cbid": 211, "correlation": 128694 } }, { "ph": "s", "id": 128694, "pid": 76337, "tid": -914061504, "ts": 1716454223395659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223447770, "dur": 235, "args": { "External id": 128696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128696, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 128696, "pid": 5, "tid": 7, "ts": 1716454223447770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395672, "dur": 6, "args": { "External id": 128696, "cbid": 211, "correlation": 128696 } }, { "ph": "s", "id": 128696, "pid": 76337, "tid": -914061504, "ts": 1716454223395672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448006, "dur": 7, "args": { "External id": 128698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128698, "pid": 5, "tid": 7, "ts": 1716454223448006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395682, "dur": 5, "args": { "External id": 128698, "cbid": 211, "correlation": 128698 } }, { "ph": "s", "id": 128698, "pid": 76337, "tid": -914061504, "ts": 1716454223395682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223448014, "dur": 9, "args": { "External id": 128704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128704, "pid": 5, "tid": 7, "ts": 1716454223448014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395708, "dur": 9, "args": { "External id": 128704, "cbid": 211, "correlation": 128704 } }, { "ph": "s", "id": 128704, "pid": 76337, "tid": -914061504, "ts": 1716454223395708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223448024, "dur": 12, "args": { "External id": 128724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128724, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128724, "pid": 5, "tid": 7, "ts": 1716454223448024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395801, "dur": 12, "args": { "External id": 128724, "cbid": 211, "correlation": 128724 } }, { "ph": "s", "id": 128724, "pid": 76337, "tid": -914061504, "ts": 1716454223395801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223448037, "dur": 4, "args": { "External id": 128736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128736, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 128736, "pid": 5, "tid": 7, "ts": 1716454223448037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395824, "dur": 6, "args": { "External id": 128736, "cbid": 211, "correlation": 128736 } }, { "ph": "s", "id": 128736, "pid": 76337, "tid": -914061504, "ts": 1716454223395824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223448042, "dur": 12, "args": { "External id": 128739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128739, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128739, "pid": 5, "tid": 7, "ts": 1716454223448042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395842, "dur": 7, "args": { "External id": 128739, "cbid": 211, "correlation": 128739 } }, { "ph": "s", "id": 128739, "pid": 76337, "tid": -914061504, "ts": 1716454223395842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223448056, "dur": 7, "args": { "External id": 128748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128748, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128748, "pid": 5, "tid": 7, "ts": 1716454223448056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395881, "dur": 9, "args": { "External id": 128748, "cbid": 211, "correlation": 128748 } }, { "ph": "s", "id": 128748, "pid": 76337, "tid": -914061504, "ts": 1716454223395881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223395934, "dur": 0, "args": { "External id": 128758, "cbid": 317, "correlation": 128758 } }, { "ph": "f", "id": 128758, "pid": 76337, "tid": -914061504, "ts": 1716454223395934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223395935, "dur": 0, "args": { "External id": 128759, "cbid": 203, "correlation": 128759 } }, { "ph": "f", "id": 128759, "pid": 76337, "tid": -914061504, "ts": 1716454223395935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223395935, "dur": 0, "args": { "External id": 128760, "cbid": 205, "correlation": 128760 } }, { "ph": "f", "id": 128760, "pid": 76337, "tid": -914061504, "ts": 1716454223395935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448064, "dur": 6, "args": { "External id": 128764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128764, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128764, "pid": 5, "tid": 7, "ts": 1716454223448064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395951, "dur": 11, "args": { "External id": 128764, "cbid": 211, "correlation": 128764 } }, { "ph": "s", "id": 128764, "pid": 76337, "tid": -914061504, "ts": 1716454223395951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448071, "dur": 83, "args": { "External id": 128766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128766, "pid": 5, "tid": 7, "ts": 1716454223448071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395965, "dur": 6, "args": { "External id": 128766, "cbid": 211, "correlation": 128766 } }, { "ph": "s", "id": 128766, "pid": 76337, "tid": -914061504, "ts": 1716454223395965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223448156, "dur": 1, "args": { "External id": 128768, "device": 5, "context": 1, "stream": 7, "correlation": 128768, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 128768, "pid": 5, "tid": 7, "ts": 1716454223448156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223395986, "dur": 9, "args": { "External id": 128768, "cbid": 51, "correlation": 128768 } }, { "ph": "s", "id": 128768, "pid": 76337, "tid": -914061504, "ts": 1716454223395986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223448160, "dur": 537, "args": { "External id": 128769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128769, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128769, "pid": 5, "tid": 7, "ts": 1716454223448160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223395997, "dur": 8, "args": { "External id": 128769, "cbid": 211, "correlation": 128769 } }, { "ph": "s", "id": 128769, "pid": 76337, "tid": -914061504, "ts": 1716454223395997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448698, "dur": 11, "args": { "External id": 128771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128771, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128771, "pid": 5, "tid": 7, "ts": 1716454223448698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396009, "dur": 5, "args": { "External id": 128771, "cbid": 211, "correlation": 128771 } }, { "ph": "s", "id": 128771, "pid": 76337, "tid": -914061504, "ts": 1716454223396009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223448710, "dur": 14, "args": { "External id": 128777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128777, "pid": 5, "tid": 7, "ts": 1716454223448710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396038, "dur": 8, "args": { "External id": 128777, "cbid": 211, "correlation": 128777 } }, { "ph": "s", "id": 128777, "pid": 76337, "tid": -914061504, "ts": 1716454223396038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223448726, "dur": 3, "args": { "External id": 128785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128785, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 128785, "pid": 5, "tid": 7, "ts": 1716454223448726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396082, "dur": 9, "args": { "External id": 128785, "cbid": 211, "correlation": 128785 } }, { "ph": "s", "id": 128785, "pid": 76337, "tid": -914061504, "ts": 1716454223396082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223396146, "dur": 1, "args": { "External id": 128801, "cbid": 251, "correlation": 128801 } }, { "ph": "f", "id": 128801, "pid": 76337, "tid": -914061504, "ts": 1716454223396146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223396152, "dur": 0, "args": { "External id": 128803, "cbid": 251, "correlation": 128803 } }, { "ph": "f", "id": 128803, "pid": 76337, "tid": -914061504, "ts": 1716454223396152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223448731, "dur": 13, "args": { "External id": 128804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128804, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128804, "pid": 5, "tid": 7, "ts": 1716454223448731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396153, "dur": 11, "args": { "External id": 128804, "cbid": 211, "correlation": 128804 } }, { "ph": "s", "id": 128804, "pid": 76337, "tid": -914061504, "ts": 1716454223396153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223448745, "dur": 5, "args": { "External id": 128806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128806, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128806, "pid": 5, "tid": 7, "ts": 1716454223448745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396166, "dur": 7, "args": { "External id": 128806, "cbid": 211, "correlation": 128806 } }, { "ph": "s", "id": 128806, "pid": 76337, "tid": -914061504, "ts": 1716454223396166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223448751, "dur": 17, "args": { "External id": 128816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128816, "pid": 5, "tid": 7, "ts": 1716454223448751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396225, "dur": 12, "args": { "External id": 128816, "cbid": 211, "correlation": 128816 } }, { "ph": "s", "id": 128816, "pid": 76337, "tid": -914061504, "ts": 1716454223396225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223448770, "dur": 19, "args": { "External id": 128836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128836, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128836, "pid": 5, "tid": 7, "ts": 1716454223448770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396291, "dur": 11, "args": { "External id": 128836, "cbid": 211, "correlation": 128836 } }, { "ph": "s", "id": 128836, "pid": 76337, "tid": -914061504, "ts": 1716454223396291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223448790, "dur": 5, "args": { "External id": 128848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128848, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 128848, "pid": 5, "tid": 7, "ts": 1716454223448790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396312, "dur": 6, "args": { "External id": 128848, "cbid": 211, "correlation": 128848 } }, { "ph": "s", "id": 128848, "pid": 76337, "tid": -914061504, "ts": 1716454223396312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223448796, "dur": 17, "args": { "External id": 128851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128851, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128851, "pid": 5, "tid": 7, "ts": 1716454223448796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396332, "dur": 6, "args": { "External id": 128851, "cbid": 211, "correlation": 128851 } }, { "ph": "s", "id": 128851, "pid": 76337, "tid": -914061504, "ts": 1716454223396332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223448815, "dur": 10, "args": { "External id": 128860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128860, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128860, "pid": 5, "tid": 7, "ts": 1716454223448815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396372, "dur": 9, "args": { "External id": 128860, "cbid": 211, "correlation": 128860 } }, { "ph": "s", "id": 128860, "pid": 76337, "tid": -914061504, "ts": 1716454223396372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223396434, "dur": 0, "args": { "External id": 128870, "cbid": 317, "correlation": 128870 } }, { "ph": "f", "id": 128870, "pid": 76337, "tid": -914061504, "ts": 1716454223396434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223396435, "dur": 0, "args": { "External id": 128871, "cbid": 203, "correlation": 128871 } }, { "ph": "f", "id": 128871, "pid": 76337, "tid": -914061504, "ts": 1716454223396435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223396435, "dur": 0, "args": { "External id": 128872, "cbid": 205, "correlation": 128872 } }, { "ph": "f", "id": 128872, "pid": 76337, "tid": -914061504, "ts": 1716454223396435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448826, "dur": 11, "args": { "External id": 128876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128876, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128876, "pid": 5, "tid": 7, "ts": 1716454223448826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396453, "dur": 12, "args": { "External id": 128876, "cbid": 211, "correlation": 128876 } }, { "ph": "s", "id": 128876, "pid": 76337, "tid": -914061504, "ts": 1716454223396453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223448838, "dur": 162, "args": { "External id": 128878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128878, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128878, "pid": 5, "tid": 7, "ts": 1716454223448838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396467, "dur": 6, "args": { "External id": 128878, "cbid": 211, "correlation": 128878 } }, { "ph": "s", "id": 128878, "pid": 76337, "tid": -914061504, "ts": 1716454223396467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223449002, "dur": 1, "args": { "External id": 128880, "device": 5, "context": 1, "stream": 7, "correlation": 128880, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 128880, "pid": 5, "tid": 7, "ts": 1716454223449002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223396480, "dur": 7, "args": { "External id": 128880, "cbid": 51, "correlation": 128880 } }, { "ph": "s", "id": 128880, "pid": 76337, "tid": -914061504, "ts": 1716454223396480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223449006, "dur": 659, "args": { "External id": 128881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128881, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 128881, "pid": 5, "tid": 7, "ts": 1716454223449006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396488, "dur": 6, "args": { "External id": 128881, "cbid": 211, "correlation": 128881 } }, { "ph": "s", "id": 128881, "pid": 76337, "tid": -914061504, "ts": 1716454223396488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223449666, "dur": 12, "args": { "External id": 128883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128883, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128883, "pid": 5, "tid": 7, "ts": 1716454223449666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396498, "dur": 5, "args": { "External id": 128883, "cbid": 211, "correlation": 128883 } }, { "ph": "s", "id": 128883, "pid": 76337, "tid": -914061504, "ts": 1716454223396498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223449680, "dur": 15, "args": { "External id": 128889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128889, "pid": 5, "tid": 7, "ts": 1716454223449680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396528, "dur": 9, "args": { "External id": 128889, "cbid": 211, "correlation": 128889 } }, { "ph": "s", "id": 128889, "pid": 76337, "tid": -914061504, "ts": 1716454223396528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223396585, "dur": 0, "args": { "External id": 128899, "cbid": 317, "correlation": 128899 } }, { "ph": "f", "id": 128899, "pid": 76337, "tid": -914061504, "ts": 1716454223396585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223396586, "dur": 0, "args": { "External id": 128900, "cbid": 203, "correlation": 128900 } }, { "ph": "f", "id": 128900, "pid": 76337, "tid": -914061504, "ts": 1716454223396586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223396587, "dur": 0, "args": { "External id": 128901, "cbid": 205, "correlation": 128901 } }, { "ph": "f", "id": 128901, "pid": 76337, "tid": -914061504, "ts": 1716454223396587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223449696, "dur": 8, "args": { "External id": 128905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128905, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128905, "pid": 5, "tid": 7, "ts": 1716454223449696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396602, "dur": 12, "args": { "External id": 128905, "cbid": 211, "correlation": 128905 } }, { "ph": "s", "id": 128905, "pid": 76337, "tid": -914061504, "ts": 1716454223396602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223449705, "dur": 4, "args": { "External id": 128907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 128907, "pid": 5, "tid": 7, "ts": 1716454223449705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396619, "dur": 6, "args": { "External id": 128907, "cbid": 211, "correlation": 128907 } }, { "ph": "s", "id": 128907, "pid": 76337, "tid": -914061504, "ts": 1716454223396619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223396628, "dur": 0, "args": { "External id": 128908, "cbid": 51, "correlation": 128908 } }, { "ph": "s", "id": 128908, "pid": 76337, "tid": -914061504, "ts": 1716454223396628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223449710, "dur": 57, "args": { "External id": 128909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128909, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 128909, "pid": 5, "tid": 7, "ts": 1716454223449710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396629, "dur": 5, "args": { "External id": 128909, "cbid": 211, "correlation": 128909 } }, { "ph": "s", "id": 128909, "pid": 76337, "tid": -914061504, "ts": 1716454223396629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223449768, "dur": 14, "args": { "External id": 128914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128914, "pid": 5, "tid": 7, "ts": 1716454223449768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396654, "dur": 8, "args": { "External id": 128914, "cbid": 211, "correlation": 128914 } }, { "ph": "s", "id": 128914, "pid": 76337, "tid": -914061504, "ts": 1716454223396654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223449783, "dur": 12, "args": { "External id": 128922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128922, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128922, "pid": 5, "tid": 7, "ts": 1716454223449783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396683, "dur": 8, "args": { "External id": 128922, "cbid": 211, "correlation": 128922 } }, { "ph": "s", "id": 128922, "pid": 76337, "tid": -914061504, "ts": 1716454223396683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223449797, "dur": 10, "args": { "External id": 128930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128930, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128930, "pid": 5, "tid": 7, "ts": 1716454223449797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396712, "dur": 9, "args": { "External id": 128930, "cbid": 211, "correlation": 128930 } }, { "ph": "s", "id": 128930, "pid": 76337, "tid": -914061504, "ts": 1716454223396712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223449809, "dur": 18, "args": { "External id": 128950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128950, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 128950, "pid": 5, "tid": 7, "ts": 1716454223449809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396792, "dur": 12, "args": { "External id": 128950, "cbid": 211, "correlation": 128950 } }, { "ph": "s", "id": 128950, "pid": 76337, "tid": -914061504, "ts": 1716454223396792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223449829, "dur": 5, "args": { "External id": 128962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128962, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 128962, "pid": 5, "tid": 7, "ts": 1716454223449829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396814, "dur": 7, "args": { "External id": 128962, "cbid": 211, "correlation": 128962 } }, { "ph": "s", "id": 128962, "pid": 76337, "tid": -914061504, "ts": 1716454223396814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223449835, "dur": 17, "args": { "External id": 128965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128965, "pid": 5, "tid": 7, "ts": 1716454223449835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396832, "dur": 6, "args": { "External id": 128965, "cbid": 211, "correlation": 128965 } }, { "ph": "s", "id": 128965, "pid": 76337, "tid": -914061504, "ts": 1716454223396832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223396887, "dur": 0, "args": { "External id": 128976, "cbid": 317, "correlation": 128976 } }, { "ph": "f", "id": 128976, "pid": 76337, "tid": -914061504, "ts": 1716454223396887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223396888, "dur": 0, "args": { "External id": 128977, "cbid": 203, "correlation": 128977 } }, { "ph": "f", "id": 128977, "pid": 76337, "tid": -914061504, "ts": 1716454223396888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223396889, "dur": 0, "args": { "External id": 128978, "cbid": 205, "correlation": 128978 } }, { "ph": "f", "id": 128978, "pid": 76337, "tid": -914061504, "ts": 1716454223396889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223449853, "dur": 12, "args": { "External id": 128982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128982, "pid": 5, "tid": 7, "ts": 1716454223449853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396904, "dur": 12, "args": { "External id": 128982, "cbid": 211, "correlation": 128982 } }, { "ph": "s", "id": 128982, "pid": 76337, "tid": -914061504, "ts": 1716454223396904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223449866, "dur": 3, "args": { "External id": 128984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 128984, "pid": 5, "tid": 7, "ts": 1716454223449866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396921, "dur": 6, "args": { "External id": 128984, "cbid": 211, "correlation": 128984 } }, { "ph": "s", "id": 128984, "pid": 76337, "tid": -914061504, "ts": 1716454223396921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223396930, "dur": 0, "args": { "External id": 128985, "cbid": 51, "correlation": 128985 } }, { "ph": "s", "id": 128985, "pid": 76337, "tid": -914061504, "ts": 1716454223396930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223449871, "dur": 97, "args": { "External id": 128986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128986, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 128986, "pid": 5, "tid": 7, "ts": 1716454223449871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396931, "dur": 6, "args": { "External id": 128986, "cbid": 211, "correlation": 128986 } }, { "ph": "s", "id": 128986, "pid": 76337, "tid": -914061504, "ts": 1716454223396931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223449969, "dur": 15, "args": { "External id": 128991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 128991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 128991, "pid": 5, "tid": 7, "ts": 1716454223449969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223396959, "dur": 9, "args": { "External id": 128991, "cbid": 211, "correlation": 128991 } }, { "ph": "s", "id": 128991, "pid": 76337, "tid": -914061504, "ts": 1716454223396959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223449986, "dur": 84, "args": { "External id": 129000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129000, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129000, "pid": 5, "tid": 7, "ts": 1716454223449986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397049, "dur": 14, "args": { "External id": 129000, "cbid": 211, "correlation": 129000 } }, { "ph": "s", "id": 129000, "pid": 76337, "tid": -914061504, "ts": 1716454223397049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223450071, "dur": 30, "args": { "External id": 129022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129022, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129022, "pid": 5, "tid": 7, "ts": 1716454223450071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397105, "dur": 10, "args": { "External id": 129022, "cbid": 211, "correlation": 129022 } }, { "ph": "s", "id": 129022, "pid": 76337, "tid": -914061504, "ts": 1716454223397105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397197, "dur": 1, "args": { "External id": 129033, "cbid": 251, "correlation": 129033 } }, { "ph": "f", "id": 129033, "pid": 76337, "tid": -914061504, "ts": 1716454223397197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223450102, "dur": 162, "args": { "External id": 129034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129034, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129034, "pid": 5, "tid": 7, "ts": 1716454223450102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397202, "dur": 13, "args": { "External id": 129034, "cbid": 211, "correlation": 129034 } }, { "ph": "s", "id": 129034, "pid": 76337, "tid": -914061504, "ts": 1716454223397202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397273, "dur": 1, "args": { "External id": 129045, "cbid": 251, "correlation": 129045 } }, { "ph": "f", "id": 129045, "pid": 76337, "tid": -914061504, "ts": 1716454223397273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223450266, "dur": 159, "args": { "External id": 129046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129046, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129046, "pid": 5, "tid": 7, "ts": 1716454223450266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397277, "dur": 11, "args": { "External id": 129046, "cbid": 211, "correlation": 129046 } }, { "ph": "s", "id": 129046, "pid": 76337, "tid": -914061504, "ts": 1716454223397277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397341, "dur": 1, "args": { "External id": 129057, "cbid": 251, "correlation": 129057 } }, { "ph": "f", "id": 129057, "pid": 76337, "tid": -914061504, "ts": 1716454223397341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223450426, "dur": 159, "args": { "External id": 129058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129058, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129058, "pid": 5, "tid": 7, "ts": 1716454223450426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397345, "dur": 11, "args": { "External id": 129058, "cbid": 211, "correlation": 129058 } }, { "ph": "s", "id": 129058, "pid": 76337, "tid": -914061504, "ts": 1716454223397345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223450587, "dur": 338, "args": { "External id": 129083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129083, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129083, "pid": 5, "tid": 7, "ts": 1716454223450587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397430, "dur": 13, "args": { "External id": 129083, "cbid": 211, "correlation": 129083 } }, { "ph": "s", "id": 129083, "pid": 76337, "tid": -914061504, "ts": 1716454223397430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397532, "dur": 1, "args": { "External id": 129101, "cbid": 251, "correlation": 129101 } }, { "ph": "f", "id": 129101, "pid": 76337, "tid": -914061504, "ts": 1716454223397532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223450926, "dur": 166, "args": { "External id": 129103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129103, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129103, "pid": 5, "tid": 7, "ts": 1716454223450926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397538, "dur": 13, "args": { "External id": 129103, "cbid": 211, "correlation": 129103 } }, { "ph": "s", "id": 129103, "pid": 76337, "tid": -914061504, "ts": 1716454223397538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223451093, "dur": 19, "args": { "External id": 129111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129111, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129111, "pid": 5, "tid": 7, "ts": 1716454223451093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397608, "dur": 13, "args": { "External id": 129111, "cbid": 211, "correlation": 129111 } }, { "ph": "s", "id": 129111, "pid": 76337, "tid": -914061504, "ts": 1716454223397608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223451113, "dur": 28, "args": { "External id": 129119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129119, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129119, "pid": 5, "tid": 7, "ts": 1716454223451113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397648, "dur": 8, "args": { "External id": 129119, "cbid": 211, "correlation": 129119 } }, { "ph": "s", "id": 129119, "pid": 76337, "tid": -914061504, "ts": 1716454223397648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223451143, "dur": 18, "args": { "External id": 129130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129130, "pid": 5, "tid": 7, "ts": 1716454223451143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397720, "dur": 12, "args": { "External id": 129130, "cbid": 211, "correlation": 129130 } }, { "ph": "s", "id": 129130, "pid": 76337, "tid": -914061504, "ts": 1716454223397720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223451162, "dur": 16, "args": { "External id": 129152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129152, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129152, "pid": 5, "tid": 7, "ts": 1716454223451162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397751, "dur": 7, "args": { "External id": 129152, "cbid": 211, "correlation": 129152 } }, { "ph": "s", "id": 129152, "pid": 76337, "tid": -914061504, "ts": 1716454223397751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397837, "dur": 1, "args": { "External id": 129163, "cbid": 251, "correlation": 129163 } }, { "ph": "f", "id": 129163, "pid": 76337, "tid": -914061504, "ts": 1716454223397837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223451180, "dur": 89, "args": { "External id": 129164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129164, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129164, "pid": 5, "tid": 7, "ts": 1716454223451180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397844, "dur": 15, "args": { "External id": 129164, "cbid": 211, "correlation": 129164 } }, { "ph": "s", "id": 129164, "pid": 76337, "tid": -914061504, "ts": 1716454223397844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397917, "dur": 1, "args": { "External id": 129175, "cbid": 251, "correlation": 129175 } }, { "ph": "f", "id": 129175, "pid": 76337, "tid": -914061504, "ts": 1716454223397917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223397921, "dur": 0, "args": { "External id": 129176, "cbid": 251, "correlation": 129176 } }, { "ph": "f", "id": 129176, "pid": 76337, "tid": -914061504, "ts": 1716454223397921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223451270, "dur": 12, "args": { "External id": 129177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129177, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129177, "pid": 5, "tid": 7, "ts": 1716454223451270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397923, "dur": 12, "args": { "External id": 129177, "cbid": 211, "correlation": 129177 } }, { "ph": "s", "id": 129177, "pid": 76337, "tid": -914061504, "ts": 1716454223397923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223451283, "dur": 6, "args": { "External id": 129179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129179, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129179, "pid": 5, "tid": 7, "ts": 1716454223451283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223397936, "dur": 6, "args": { "External id": 129179, "cbid": 211, "correlation": 129179 } }, { "ph": "s", "id": 129179, "pid": 76337, "tid": -914061504, "ts": 1716454223397936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398003, "dur": 1, "args": { "External id": 129190, "cbid": 251, "correlation": 129190 } }, { "ph": "f", "id": 129190, "pid": 76337, "tid": -914061504, "ts": 1716454223398003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398006, "dur": 0, "args": { "External id": 129191, "cbid": 251, "correlation": 129191 } }, { "ph": "f", "id": 129191, "pid": 76337, "tid": -914061504, "ts": 1716454223398006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223451290, "dur": 8, "args": { "External id": 129192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129192, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129192, "pid": 5, "tid": 7, "ts": 1716454223451290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398008, "dur": 13, "args": { "External id": 129192, "cbid": 211, "correlation": 129192 } }, { "ph": "s", "id": 129192, "pid": 76337, "tid": -914061504, "ts": 1716454223398008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223451299, "dur": 3, "args": { "External id": 129194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129194, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129194, "pid": 5, "tid": 7, "ts": 1716454223451299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398022, "dur": 5, "args": { "External id": 129194, "cbid": 211, "correlation": 129194 } }, { "ph": "s", "id": 129194, "pid": 76337, "tid": -914061504, "ts": 1716454223398022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223451304, "dur": 55, "args": { "External id": 129219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129219, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129219, "pid": 5, "tid": 7, "ts": 1716454223451304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398099, "dur": 13, "args": { "External id": 129219, "cbid": 211, "correlation": 129219 } }, { "ph": "s", "id": 129219, "pid": 76337, "tid": -914061504, "ts": 1716454223398099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398200, "dur": 2, "args": { "External id": 129237, "cbid": 251, "correlation": 129237 } }, { "ph": "f", "id": 129237, "pid": 76337, "tid": -914061504, "ts": 1716454223398200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223451360, "dur": 91, "args": { "External id": 129239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129239, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129239, "pid": 5, "tid": 7, "ts": 1716454223451360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398206, "dur": 16, "args": { "External id": 129239, "cbid": 211, "correlation": 129239 } }, { "ph": "s", "id": 129239, "pid": 76337, "tid": -914061504, "ts": 1716454223398206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223451452, "dur": 10, "args": { "External id": 129247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129247, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129247, "pid": 5, "tid": 7, "ts": 1716454223451452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398278, "dur": 12, "args": { "External id": 129247, "cbid": 211, "correlation": 129247 } }, { "ph": "s", "id": 129247, "pid": 76337, "tid": -914061504, "ts": 1716454223398278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223451463, "dur": 21, "args": { "External id": 129255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129255, "pid": 5, "tid": 7, "ts": 1716454223451463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398320, "dur": 9, "args": { "External id": 129255, "cbid": 211, "correlation": 129255 } }, { "ph": "s", "id": 129255, "pid": 76337, "tid": -914061504, "ts": 1716454223398320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223451485, "dur": 19, "args": { "External id": 129277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129277, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129277, "pid": 5, "tid": 7, "ts": 1716454223451485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398371, "dur": 10, "args": { "External id": 129277, "cbid": 211, "correlation": 129277 } }, { "ph": "s", "id": 129277, "pid": 76337, "tid": -914061504, "ts": 1716454223398371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398460, "dur": 2, "args": { "External id": 129293, "cbid": 251, "correlation": 129293 } }, { "ph": "f", "id": 129293, "pid": 76337, "tid": -914061504, "ts": 1716454223398460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398466, "dur": 0, "args": { "External id": 129295, "cbid": 251, "correlation": 129295 } }, { "ph": "f", "id": 129295, "pid": 76337, "tid": -914061504, "ts": 1716454223398466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223451505, "dur": 495, "args": { "External id": 129296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129296, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129296, "pid": 5, "tid": 7, "ts": 1716454223451505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398469, "dur": 15, "args": { "External id": 129296, "cbid": 211, "correlation": 129296 } }, { "ph": "s", "id": 129296, "pid": 76337, "tid": -914061504, "ts": 1716454223398469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223452001, "dur": 65, "args": { "External id": 129304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129304, "pid": 5, "tid": 7, "ts": 1716454223452001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398537, "dur": 12, "args": { "External id": 129304, "cbid": 211, "correlation": 129304 } }, { "ph": "s", "id": 129304, "pid": 76337, "tid": -914061504, "ts": 1716454223398537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223452067, "dur": 69, "args": { "External id": 129312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129312, "pid": 5, "tid": 7, "ts": 1716454223452067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398568, "dur": 8, "args": { "External id": 129312, "cbid": 211, "correlation": 129312 } }, { "ph": "s", "id": 129312, "pid": 76337, "tid": -914061504, "ts": 1716454223398568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223398648, "dur": 1, "args": { "External id": 129328, "cbid": 251, "correlation": 129328 } }, { "ph": "f", "id": 129328, "pid": 76337, "tid": -914061504, "ts": 1716454223398648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223452139, "dur": 1, "args": { "External id": 129330, "device": 5, "context": 1, "stream": 7, "correlation": 129330, "bytes": 240, "memory bandwidth (GB/s)": 0.1388888888888889 } }, { "ph": "f", "id": 129330, "pid": 5, "tid": 7, "ts": 1716454223452139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223398653, "dur": 13, "args": { "External id": 129330, "cbid": 51, "correlation": 129330 } }, { "ph": "s", "id": 129330, "pid": 76337, "tid": -914061504, "ts": 1716454223398653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223452143, "dur": 269, "args": { "External id": 129331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129331, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129331, "pid": 5, "tid": 7, "ts": 1716454223452143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398667, "dur": 11, "args": { "External id": 129331, "cbid": 211, "correlation": 129331 } }, { "ph": "s", "id": 129331, "pid": 76337, "tid": -914061504, "ts": 1716454223398667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223452413, "dur": 14, "args": { "External id": 129339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129339, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129339, "pid": 5, "tid": 7, "ts": 1716454223452413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398708, "dur": 11, "args": { "External id": 129339, "cbid": 211, "correlation": 129339 } }, { "ph": "s", "id": 129339, "pid": 76337, "tid": -914061504, "ts": 1716454223398708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223452428, "dur": 38, "args": { "External id": 129350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129350, "pid": 5, "tid": 7, "ts": 1716454223452428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398777, "dur": 12, "args": { "External id": 129350, "cbid": 211, "correlation": 129350 } }, { "ph": "s", "id": 129350, "pid": 76337, "tid": -914061504, "ts": 1716454223398777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223398843, "dur": 0, "args": { "External id": 129362, "cbid": 317, "correlation": 129362 } }, { "ph": "f", "id": 129362, "pid": 76337, "tid": -914061504, "ts": 1716454223398843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223398844, "dur": 0, "args": { "External id": 129363, "cbid": 203, "correlation": 129363 } }, { "ph": "f", "id": 129363, "pid": 76337, "tid": -914061504, "ts": 1716454223398844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223398844, "dur": 0, "args": { "External id": 129364, "cbid": 205, "correlation": 129364 } }, { "ph": "f", "id": 129364, "pid": 76337, "tid": -914061504, "ts": 1716454223398844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223452467, "dur": 12, "args": { "External id": 129368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129368, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129368, "pid": 5, "tid": 7, "ts": 1716454223452467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398860, "dur": 13, "args": { "External id": 129368, "cbid": 211, "correlation": 129368 } }, { "ph": "s", "id": 129368, "pid": 76337, "tid": -914061504, "ts": 1716454223398860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223452480, "dur": 4, "args": { "External id": 129370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129370, "pid": 5, "tid": 7, "ts": 1716454223452480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398877, "dur": 6, "args": { "External id": 129370, "cbid": 211, "correlation": 129370 } }, { "ph": "s", "id": 129370, "pid": 76337, "tid": -914061504, "ts": 1716454223398877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223398886, "dur": 0, "args": { "External id": 129371, "cbid": 51, "correlation": 129371 } }, { "ph": "s", "id": 129371, "pid": 76337, "tid": -914061504, "ts": 1716454223398886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223452486, "dur": 97, "args": { "External id": 129372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129372, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 129372, "pid": 5, "tid": 7, "ts": 1716454223452486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398887, "dur": 5, "args": { "External id": 129372, "cbid": 211, "correlation": 129372 } }, { "ph": "s", "id": 129372, "pid": 76337, "tid": -914061504, "ts": 1716454223398887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223452585, "dur": 16, "args": { "External id": 129377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129377, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129377, "pid": 5, "tid": 7, "ts": 1716454223452585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398914, "dur": 8, "args": { "External id": 129377, "cbid": 211, "correlation": 129377 } }, { "ph": "s", "id": 129377, "pid": 76337, "tid": -914061504, "ts": 1716454223398914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223452602, "dur": 12, "args": { "External id": 129385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129385, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129385, "pid": 5, "tid": 7, "ts": 1716454223452602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223398945, "dur": 8, "args": { "External id": 129385, "cbid": 211, "correlation": 129385 } }, { "ph": "s", "id": 129385, "pid": 76337, "tid": -914061504, "ts": 1716454223398945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223452615, "dur": 18, "args": { "External id": 129405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129405, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 129405, "pid": 5, "tid": 7, "ts": 1716454223452615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399028, "dur": 12, "args": { "External id": 129405, "cbid": 211, "correlation": 129405 } }, { "ph": "s", "id": 129405, "pid": 76337, "tid": -914061504, "ts": 1716454223399028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223452635, "dur": 5, "args": { "External id": 129417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129417, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 129417, "pid": 5, "tid": 7, "ts": 1716454223452635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399051, "dur": 6, "args": { "External id": 129417, "cbid": 211, "correlation": 129417 } }, { "ph": "s", "id": 129417, "pid": 76337, "tid": -914061504, "ts": 1716454223399051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223452641, "dur": 18, "args": { "External id": 129420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129420, "pid": 5, "tid": 7, "ts": 1716454223452641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399069, "dur": 6, "args": { "External id": 129420, "cbid": 211, "correlation": 129420 } }, { "ph": "s", "id": 129420, "pid": 76337, "tid": -914061504, "ts": 1716454223399069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223452660, "dur": 11, "args": { "External id": 129429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129429, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129429, "pid": 5, "tid": 7, "ts": 1716454223452660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399107, "dur": 11, "args": { "External id": 129429, "cbid": 211, "correlation": 129429 } }, { "ph": "s", "id": 129429, "pid": 76337, "tid": -914061504, "ts": 1716454223399107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223399159, "dur": 0, "args": { "External id": 129439, "cbid": 317, "correlation": 129439 } }, { "ph": "f", "id": 129439, "pid": 76337, "tid": -914061504, "ts": 1716454223399159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223399160, "dur": 0, "args": { "External id": 129440, "cbid": 203, "correlation": 129440 } }, { "ph": "f", "id": 129440, "pid": 76337, "tid": -914061504, "ts": 1716454223399160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223399161, "dur": 0, "args": { "External id": 129441, "cbid": 205, "correlation": 129441 } }, { "ph": "f", "id": 129441, "pid": 76337, "tid": -914061504, "ts": 1716454223399161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223452672, "dur": 13, "args": { "External id": 129445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129445, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129445, "pid": 5, "tid": 7, "ts": 1716454223452672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399175, "dur": 11, "args": { "External id": 129445, "cbid": 211, "correlation": 129445 } }, { "ph": "s", "id": 129445, "pid": 76337, "tid": -914061504, "ts": 1716454223399175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223452687, "dur": 161, "args": { "External id": 129447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129447, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129447, "pid": 5, "tid": 7, "ts": 1716454223452687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399189, "dur": 5, "args": { "External id": 129447, "cbid": 211, "correlation": 129447 } }, { "ph": "s", "id": 129447, "pid": 76337, "tid": -914061504, "ts": 1716454223399189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223452850, "dur": 1, "args": { "External id": 129449, "device": 5, "context": 1, "stream": 7, "correlation": 129449, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 129449, "pid": 5, "tid": 7, "ts": 1716454223452850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223399200, "dur": 7, "args": { "External id": 129449, "cbid": 51, "correlation": 129449 } }, { "ph": "s", "id": 129449, "pid": 76337, "tid": -914061504, "ts": 1716454223399200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223452853, "dur": 660, "args": { "External id": 129450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129450, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129450, "pid": 5, "tid": 7, "ts": 1716454223452853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399207, "dur": 7, "args": { "External id": 129450, "cbid": 211, "correlation": 129450 } }, { "ph": "s", "id": 129450, "pid": 76337, "tid": -914061504, "ts": 1716454223399207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223453514, "dur": 13, "args": { "External id": 129452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129452, "pid": 5, "tid": 7, "ts": 1716454223453514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399218, "dur": 5, "args": { "External id": 129452, "cbid": 211, "correlation": 129452 } }, { "ph": "s", "id": 129452, "pid": 76337, "tid": -914061504, "ts": 1716454223399218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223453529, "dur": 15, "args": { "External id": 129458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129458, "pid": 5, "tid": 7, "ts": 1716454223453529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399246, "dur": 8, "args": { "External id": 129458, "cbid": 211, "correlation": 129458 } }, { "ph": "s", "id": 129458, "pid": 76337, "tid": -914061504, "ts": 1716454223399246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223453545, "dur": 3, "args": { "External id": 129466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129466, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 129466, "pid": 5, "tid": 7, "ts": 1716454223453545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399290, "dur": 10, "args": { "External id": 129466, "cbid": 211, "correlation": 129466 } }, { "ph": "s", "id": 129466, "pid": 76337, "tid": -914061504, "ts": 1716454223399290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223399358, "dur": 1, "args": { "External id": 129482, "cbid": 251, "correlation": 129482 } }, { "ph": "f", "id": 129482, "pid": 76337, "tid": -914061504, "ts": 1716454223399358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223399364, "dur": 0, "args": { "External id": 129484, "cbid": 251, "correlation": 129484 } }, { "ph": "f", "id": 129484, "pid": 76337, "tid": -914061504, "ts": 1716454223399364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223453550, "dur": 13, "args": { "External id": 129485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129485, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129485, "pid": 5, "tid": 7, "ts": 1716454223453550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399366, "dur": 11, "args": { "External id": 129485, "cbid": 211, "correlation": 129485 } }, { "ph": "s", "id": 129485, "pid": 76337, "tid": -914061504, "ts": 1716454223399366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223453565, "dur": 5, "args": { "External id": 129487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129487, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129487, "pid": 5, "tid": 7, "ts": 1716454223453565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399379, "dur": 5, "args": { "External id": 129487, "cbid": 211, "correlation": 129487 } }, { "ph": "s", "id": 129487, "pid": 76337, "tid": -914061504, "ts": 1716454223399379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223453571, "dur": 17, "args": { "External id": 129497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129497, "pid": 5, "tid": 7, "ts": 1716454223453571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399438, "dur": 12, "args": { "External id": 129497, "cbid": 211, "correlation": 129497 } }, { "ph": "s", "id": 129497, "pid": 76337, "tid": -914061504, "ts": 1716454223399438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223453589, "dur": 18, "args": { "External id": 129517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129517, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 129517, "pid": 5, "tid": 7, "ts": 1716454223453589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399504, "dur": 11, "args": { "External id": 129517, "cbid": 211, "correlation": 129517 } }, { "ph": "s", "id": 129517, "pid": 76337, "tid": -914061504, "ts": 1716454223399504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223453609, "dur": 4, "args": { "External id": 129529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129529, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 129529, "pid": 5, "tid": 7, "ts": 1716454223453609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399525, "dur": 6, "args": { "External id": 129529, "cbid": 211, "correlation": 129529 } }, { "ph": "s", "id": 129529, "pid": 76337, "tid": -914061504, "ts": 1716454223399525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223453614, "dur": 16, "args": { "External id": 129532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129532, "pid": 5, "tid": 7, "ts": 1716454223453614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399544, "dur": 6, "args": { "External id": 129532, "cbid": 211, "correlation": 129532 } }, { "ph": "s", "id": 129532, "pid": 76337, "tid": -914061504, "ts": 1716454223399544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223453632, "dur": 11, "args": { "External id": 129541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129541, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129541, "pid": 5, "tid": 7, "ts": 1716454223453632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399584, "dur": 10, "args": { "External id": 129541, "cbid": 211, "correlation": 129541 } }, { "ph": "s", "id": 129541, "pid": 76337, "tid": -914061504, "ts": 1716454223399584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223399646, "dur": 0, "args": { "External id": 129551, "cbid": 317, "correlation": 129551 } }, { "ph": "f", "id": 129551, "pid": 76337, "tid": -914061504, "ts": 1716454223399646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223399646, "dur": 0, "args": { "External id": 129552, "cbid": 203, "correlation": 129552 } }, { "ph": "f", "id": 129552, "pid": 76337, "tid": -914061504, "ts": 1716454223399646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223399647, "dur": 0, "args": { "External id": 129553, "cbid": 205, "correlation": 129553 } }, { "ph": "f", "id": 129553, "pid": 76337, "tid": -914061504, "ts": 1716454223399647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223453645, "dur": 11, "args": { "External id": 129557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129557, "pid": 5, "tid": 7, "ts": 1716454223453645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399662, "dur": 12, "args": { "External id": 129557, "cbid": 211, "correlation": 129557 } }, { "ph": "s", "id": 129557, "pid": 76337, "tid": -914061504, "ts": 1716454223399662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223453656, "dur": 162, "args": { "External id": 129559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129559, "pid": 5, "tid": 7, "ts": 1716454223453656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399676, "dur": 5, "args": { "External id": 129559, "cbid": 211, "correlation": 129559 } }, { "ph": "s", "id": 129559, "pid": 76337, "tid": -914061504, "ts": 1716454223399676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223453820, "dur": 1, "args": { "External id": 129561, "device": 5, "context": 1, "stream": 7, "correlation": 129561, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 129561, "pid": 5, "tid": 7, "ts": 1716454223453820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223399687, "dur": 6, "args": { "External id": 129561, "cbid": 51, "correlation": 129561 } }, { "ph": "s", "id": 129561, "pid": 76337, "tid": -914061504, "ts": 1716454223399687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223453824, "dur": 648, "args": { "External id": 129562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129562, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129562, "pid": 5, "tid": 7, "ts": 1716454223453824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399694, "dur": 6, "args": { "External id": 129562, "cbid": 211, "correlation": 129562 } }, { "ph": "s", "id": 129562, "pid": 76337, "tid": -914061504, "ts": 1716454223399694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223454473, "dur": 13, "args": { "External id": 129564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129564, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129564, "pid": 5, "tid": 7, "ts": 1716454223454473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399704, "dur": 5, "args": { "External id": 129564, "cbid": 211, "correlation": 129564 } }, { "ph": "s", "id": 129564, "pid": 76337, "tid": -914061504, "ts": 1716454223399704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223454487, "dur": 15, "args": { "External id": 129570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129570, "pid": 5, "tid": 7, "ts": 1716454223454487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399732, "dur": 9, "args": { "External id": 129570, "cbid": 211, "correlation": 129570 } }, { "ph": "s", "id": 129570, "pid": 76337, "tid": -914061504, "ts": 1716454223399732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223454503, "dur": 12, "args": { "External id": 129578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129578, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129578, "pid": 5, "tid": 7, "ts": 1716454223454503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399765, "dur": 8, "args": { "External id": 129578, "cbid": 211, "correlation": 129578 } }, { "ph": "s", "id": 129578, "pid": 76337, "tid": -914061504, "ts": 1716454223399765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223454516, "dur": 10, "args": { "External id": 129586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129586, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129586, "pid": 5, "tid": 7, "ts": 1716454223454516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399793, "dur": 8, "args": { "External id": 129586, "cbid": 211, "correlation": 129586 } }, { "ph": "s", "id": 129586, "pid": 76337, "tid": -914061504, "ts": 1716454223399793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223454527, "dur": 19, "args": { "External id": 129606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129606, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 129606, "pid": 5, "tid": 7, "ts": 1716454223454527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399873, "dur": 12, "args": { "External id": 129606, "cbid": 211, "correlation": 129606 } }, { "ph": "s", "id": 129606, "pid": 76337, "tid": -914061504, "ts": 1716454223399873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223454547, "dur": 4, "args": { "External id": 129618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129618, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 129618, "pid": 5, "tid": 7, "ts": 1716454223454547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399895, "dur": 6, "args": { "External id": 129618, "cbid": 211, "correlation": 129618 } }, { "ph": "s", "id": 129618, "pid": 76337, "tid": -914061504, "ts": 1716454223399895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223454552, "dur": 17, "args": { "External id": 129621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129621, "pid": 5, "tid": 7, "ts": 1716454223454552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399913, "dur": 7, "args": { "External id": 129621, "cbid": 211, "correlation": 129621 } }, { "ph": "s", "id": 129621, "pid": 76337, "tid": -914061504, "ts": 1716454223399913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223399969, "dur": 0, "args": { "External id": 129632, "cbid": 317, "correlation": 129632 } }, { "ph": "f", "id": 129632, "pid": 76337, "tid": -914061504, "ts": 1716454223399969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223399970, "dur": 0, "args": { "External id": 129633, "cbid": 203, "correlation": 129633 } }, { "ph": "f", "id": 129633, "pid": 76337, "tid": -914061504, "ts": 1716454223399970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223399971, "dur": 0, "args": { "External id": 129634, "cbid": 205, "correlation": 129634 } }, { "ph": "f", "id": 129634, "pid": 76337, "tid": -914061504, "ts": 1716454223399971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223454571, "dur": 11, "args": { "External id": 129638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129638, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129638, "pid": 5, "tid": 7, "ts": 1716454223454571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223399992, "dur": 12, "args": { "External id": 129638, "cbid": 211, "correlation": 129638 } }, { "ph": "s", "id": 129638, "pid": 76337, "tid": -914061504, "ts": 1716454223399992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223454583, "dur": 4, "args": { "External id": 129640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129640, "pid": 5, "tid": 7, "ts": 1716454223454583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400009, "dur": 6, "args": { "External id": 129640, "cbid": 211, "correlation": 129640 } }, { "ph": "s", "id": 129640, "pid": 76337, "tid": -914061504, "ts": 1716454223400009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223400018, "dur": 0, "args": { "External id": 129641, "cbid": 51, "correlation": 129641 } }, { "ph": "s", "id": 129641, "pid": 76337, "tid": -914061504, "ts": 1716454223400018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223454588, "dur": 94, "args": { "External id": 129642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129642, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 129642, "pid": 5, "tid": 7, "ts": 1716454223454588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400019, "dur": 5, "args": { "External id": 129642, "cbid": 211, "correlation": 129642 } }, { "ph": "s", "id": 129642, "pid": 76337, "tid": -914061504, "ts": 1716454223400019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223454684, "dur": 15, "args": { "External id": 129647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129647, "pid": 5, "tid": 7, "ts": 1716454223454684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400046, "dur": 8, "args": { "External id": 129647, "cbid": 211, "correlation": 129647 } }, { "ph": "s", "id": 129647, "pid": 76337, "tid": -914061504, "ts": 1716454223400046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223454700, "dur": 83, "args": { "External id": 129656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129656, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129656, "pid": 5, "tid": 7, "ts": 1716454223454700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400127, "dur": 14, "args": { "External id": 129656, "cbid": 211, "correlation": 129656 } }, { "ph": "s", "id": 129656, "pid": 76337, "tid": -914061504, "ts": 1716454223400127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223454785, "dur": 30, "args": { "External id": 129678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129678, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129678, "pid": 5, "tid": 7, "ts": 1716454223454785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400183, "dur": 10, "args": { "External id": 129678, "cbid": 211, "correlation": 129678 } }, { "ph": "s", "id": 129678, "pid": 76337, "tid": -914061504, "ts": 1716454223400183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400271, "dur": 1, "args": { "External id": 129689, "cbid": 251, "correlation": 129689 } }, { "ph": "f", "id": 129689, "pid": 76337, "tid": -914061504, "ts": 1716454223400271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223454816, "dur": 162, "args": { "External id": 129690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129690, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129690, "pid": 5, "tid": 7, "ts": 1716454223454816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400276, "dur": 13, "args": { "External id": 129690, "cbid": 211, "correlation": 129690 } }, { "ph": "s", "id": 129690, "pid": 76337, "tid": -914061504, "ts": 1716454223400276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400345, "dur": 1, "args": { "External id": 129701, "cbid": 251, "correlation": 129701 } }, { "ph": "f", "id": 129701, "pid": 76337, "tid": -914061504, "ts": 1716454223400345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223454979, "dur": 160, "args": { "External id": 129702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129702, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129702, "pid": 5, "tid": 7, "ts": 1716454223454979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400349, "dur": 11, "args": { "External id": 129702, "cbid": 211, "correlation": 129702 } }, { "ph": "s", "id": 129702, "pid": 76337, "tid": -914061504, "ts": 1716454223400349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400415, "dur": 1, "args": { "External id": 129713, "cbid": 251, "correlation": 129713 } }, { "ph": "f", "id": 129713, "pid": 76337, "tid": -914061504, "ts": 1716454223400415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223455140, "dur": 158, "args": { "External id": 129714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129714, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129714, "pid": 5, "tid": 7, "ts": 1716454223455140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400419, "dur": 11, "args": { "External id": 129714, "cbid": 211, "correlation": 129714 } }, { "ph": "s", "id": 129714, "pid": 76337, "tid": -914061504, "ts": 1716454223400419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223455299, "dur": 334, "args": { "External id": 129739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129739, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129739, "pid": 5, "tid": 7, "ts": 1716454223455299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400501, "dur": 13, "args": { "External id": 129739, "cbid": 211, "correlation": 129739 } }, { "ph": "s", "id": 129739, "pid": 76337, "tid": -914061504, "ts": 1716454223400501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400599, "dur": 1, "args": { "External id": 129757, "cbid": 251, "correlation": 129757 } }, { "ph": "f", "id": 129757, "pid": 76337, "tid": -914061504, "ts": 1716454223400599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223455635, "dur": 167, "args": { "External id": 129759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129759, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129759, "pid": 5, "tid": 7, "ts": 1716454223455635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400605, "dur": 14, "args": { "External id": 129759, "cbid": 211, "correlation": 129759 } }, { "ph": "s", "id": 129759, "pid": 76337, "tid": -914061504, "ts": 1716454223400605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223455803, "dur": 19, "args": { "External id": 129767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129767, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129767, "pid": 5, "tid": 7, "ts": 1716454223455803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400675, "dur": 12, "args": { "External id": 129767, "cbid": 211, "correlation": 129767 } }, { "ph": "s", "id": 129767, "pid": 76337, "tid": -914061504, "ts": 1716454223400675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223455823, "dur": 27, "args": { "External id": 129775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129775, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129775, "pid": 5, "tid": 7, "ts": 1716454223455823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400713, "dur": 8, "args": { "External id": 129775, "cbid": 211, "correlation": 129775 } }, { "ph": "s", "id": 129775, "pid": 76337, "tid": -914061504, "ts": 1716454223400713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223455852, "dur": 18, "args": { "External id": 129786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129786, "pid": 5, "tid": 7, "ts": 1716454223455852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400782, "dur": 13, "args": { "External id": 129786, "cbid": 211, "correlation": 129786 } }, { "ph": "s", "id": 129786, "pid": 76337, "tid": -914061504, "ts": 1716454223400782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223455871, "dur": 17, "args": { "External id": 129808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129808, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129808, "pid": 5, "tid": 7, "ts": 1716454223455871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400814, "dur": 8, "args": { "External id": 129808, "cbid": 211, "correlation": 129808 } }, { "ph": "s", "id": 129808, "pid": 76337, "tid": -914061504, "ts": 1716454223400814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400897, "dur": 1, "args": { "External id": 129819, "cbid": 251, "correlation": 129819 } }, { "ph": "f", "id": 129819, "pid": 76337, "tid": -914061504, "ts": 1716454223400897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223455889, "dur": 88, "args": { "External id": 129820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129820, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129820, "pid": 5, "tid": 7, "ts": 1716454223455889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400902, "dur": 13, "args": { "External id": 129820, "cbid": 211, "correlation": 129820 } }, { "ph": "s", "id": 129820, "pid": 76337, "tid": -914061504, "ts": 1716454223400902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400970, "dur": 1, "args": { "External id": 129831, "cbid": 251, "correlation": 129831 } }, { "ph": "f", "id": 129831, "pid": 76337, "tid": -914061504, "ts": 1716454223400970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223400982, "dur": 0, "args": { "External id": 129832, "cbid": 251, "correlation": 129832 } }, { "ph": "f", "id": 129832, "pid": 76337, "tid": -914061504, "ts": 1716454223400982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223455979, "dur": 12, "args": { "External id": 129833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129833, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129833, "pid": 5, "tid": 7, "ts": 1716454223455979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400984, "dur": 13, "args": { "External id": 129833, "cbid": 211, "correlation": 129833 } }, { "ph": "s", "id": 129833, "pid": 76337, "tid": -914061504, "ts": 1716454223400984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223455993, "dur": 6, "args": { "External id": 129835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129835, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129835, "pid": 5, "tid": 7, "ts": 1716454223455993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223400999, "dur": 6, "args": { "External id": 129835, "cbid": 211, "correlation": 129835 } }, { "ph": "s", "id": 129835, "pid": 76337, "tid": -914061504, "ts": 1716454223400999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401059, "dur": 1, "args": { "External id": 129846, "cbid": 251, "correlation": 129846 } }, { "ph": "f", "id": 129846, "pid": 76337, "tid": -914061504, "ts": 1716454223401059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401062, "dur": 0, "args": { "External id": 129847, "cbid": 251, "correlation": 129847 } }, { "ph": "f", "id": 129847, "pid": 76337, "tid": -914061504, "ts": 1716454223401062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223456000, "dur": 8, "args": { "External id": 129848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129848, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129848, "pid": 5, "tid": 7, "ts": 1716454223456000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401064, "dur": 11, "args": { "External id": 129848, "cbid": 211, "correlation": 129848 } }, { "ph": "s", "id": 129848, "pid": 76337, "tid": -914061504, "ts": 1716454223401064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223456009, "dur": 3, "args": { "External id": 129850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129850, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129850, "pid": 5, "tid": 7, "ts": 1716454223456009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401077, "dur": 6, "args": { "External id": 129850, "cbid": 211, "correlation": 129850 } }, { "ph": "s", "id": 129850, "pid": 76337, "tid": -914061504, "ts": 1716454223401077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223456013, "dur": 55, "args": { "External id": 129875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129875, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129875, "pid": 5, "tid": 7, "ts": 1716454223456013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401153, "dur": 13, "args": { "External id": 129875, "cbid": 211, "correlation": 129875 } }, { "ph": "s", "id": 129875, "pid": 76337, "tid": -914061504, "ts": 1716454223401153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401251, "dur": 1, "args": { "External id": 129893, "cbid": 251, "correlation": 129893 } }, { "ph": "f", "id": 129893, "pid": 76337, "tid": -914061504, "ts": 1716454223401251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223456069, "dur": 89, "args": { "External id": 129895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129895, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129895, "pid": 5, "tid": 7, "ts": 1716454223456069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401257, "dur": 15, "args": { "External id": 129895, "cbid": 211, "correlation": 129895 } }, { "ph": "s", "id": 129895, "pid": 76337, "tid": -914061504, "ts": 1716454223401257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223456160, "dur": 10, "args": { "External id": 129903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129903, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129903, "pid": 5, "tid": 7, "ts": 1716454223456160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401327, "dur": 12, "args": { "External id": 129903, "cbid": 211, "correlation": 129903 } }, { "ph": "s", "id": 129903, "pid": 76337, "tid": -914061504, "ts": 1716454223401327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223456171, "dur": 20, "args": { "External id": 129911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129911, "pid": 5, "tid": 7, "ts": 1716454223456171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401368, "dur": 9, "args": { "External id": 129911, "cbid": 211, "correlation": 129911 } }, { "ph": "s", "id": 129911, "pid": 76337, "tid": -914061504, "ts": 1716454223401368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223456193, "dur": 17, "args": { "External id": 129933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129933, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129933, "pid": 5, "tid": 7, "ts": 1716454223456193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401419, "dur": 10, "args": { "External id": 129933, "cbid": 211, "correlation": 129933 } }, { "ph": "s", "id": 129933, "pid": 76337, "tid": -914061504, "ts": 1716454223401419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401505, "dur": 1, "args": { "External id": 129949, "cbid": 251, "correlation": 129949 } }, { "ph": "f", "id": 129949, "pid": 76337, "tid": -914061504, "ts": 1716454223401505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401510, "dur": 0, "args": { "External id": 129951, "cbid": 251, "correlation": 129951 } }, { "ph": "f", "id": 129951, "pid": 76337, "tid": -914061504, "ts": 1716454223401510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223456211, "dur": 495, "args": { "External id": 129952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129952, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 129952, "pid": 5, "tid": 7, "ts": 1716454223456211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401512, "dur": 13, "args": { "External id": 129952, "cbid": 211, "correlation": 129952 } }, { "ph": "s", "id": 129952, "pid": 76337, "tid": -914061504, "ts": 1716454223401512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223456707, "dur": 65, "args": { "External id": 129960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129960, "pid": 5, "tid": 7, "ts": 1716454223456707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401577, "dur": 12, "args": { "External id": 129960, "cbid": 211, "correlation": 129960 } }, { "ph": "s", "id": 129960, "pid": 76337, "tid": -914061504, "ts": 1716454223401577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223456774, "dur": 68, "args": { "External id": 129968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129968, "pid": 5, "tid": 7, "ts": 1716454223456774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401606, "dur": 9, "args": { "External id": 129968, "cbid": 211, "correlation": 129968 } }, { "ph": "s", "id": 129968, "pid": 76337, "tid": -914061504, "ts": 1716454223401606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223401686, "dur": 1, "args": { "External id": 129984, "cbid": 251, "correlation": 129984 } }, { "ph": "f", "id": 129984, "pid": 76337, "tid": -914061504, "ts": 1716454223401686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223456845, "dur": 1, "args": { "External id": 129986, "device": 5, "context": 1, "stream": 7, "correlation": 129986, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 129986, "pid": 5, "tid": 7, "ts": 1716454223456845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223401691, "dur": 9, "args": { "External id": 129986, "cbid": 51, "correlation": 129986 } }, { "ph": "s", "id": 129986, "pid": 76337, "tid": -914061504, "ts": 1716454223401691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223456848, "dur": 269, "args": { "External id": 129987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129987, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 129987, "pid": 5, "tid": 7, "ts": 1716454223456848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401701, "dur": 11, "args": { "External id": 129987, "cbid": 211, "correlation": 129987 } }, { "ph": "s", "id": 129987, "pid": 76337, "tid": -914061504, "ts": 1716454223401701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223457119, "dur": 14, "args": { "External id": 129995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 129995, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 129995, "pid": 5, "tid": 7, "ts": 1716454223457119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401744, "dur": 10, "args": { "External id": 129995, "cbid": 211, "correlation": 129995 } }, { "ph": "s", "id": 129995, "pid": 76337, "tid": -914061504, "ts": 1716454223401744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223457134, "dur": 37, "args": { "External id": 130006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130006, "pid": 5, "tid": 7, "ts": 1716454223457134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401811, "dur": 13, "args": { "External id": 130006, "cbid": 211, "correlation": 130006 } }, { "ph": "s", "id": 130006, "pid": 76337, "tid": -914061504, "ts": 1716454223401811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223401875, "dur": 0, "args": { "External id": 130018, "cbid": 317, "correlation": 130018 } }, { "ph": "f", "id": 130018, "pid": 76337, "tid": -914061504, "ts": 1716454223401875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223401876, "dur": 0, "args": { "External id": 130019, "cbid": 203, "correlation": 130019 } }, { "ph": "f", "id": 130019, "pid": 76337, "tid": -914061504, "ts": 1716454223401876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223401877, "dur": 0, "args": { "External id": 130020, "cbid": 205, "correlation": 130020 } }, { "ph": "f", "id": 130020, "pid": 76337, "tid": -914061504, "ts": 1716454223401877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457172, "dur": 13, "args": { "External id": 130024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130024, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130024, "pid": 5, "tid": 7, "ts": 1716454223457172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401891, "dur": 12, "args": { "External id": 130024, "cbid": 211, "correlation": 130024 } }, { "ph": "s", "id": 130024, "pid": 76337, "tid": -914061504, "ts": 1716454223401891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223457187, "dur": 4, "args": { "External id": 130026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 130026, "pid": 5, "tid": 7, "ts": 1716454223457187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401907, "dur": 6, "args": { "External id": 130026, "cbid": 211, "correlation": 130026 } }, { "ph": "s", "id": 130026, "pid": 76337, "tid": -914061504, "ts": 1716454223401907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223401917, "dur": 0, "args": { "External id": 130027, "cbid": 51, "correlation": 130027 } }, { "ph": "s", "id": 130027, "pid": 76337, "tid": -914061504, "ts": 1716454223401917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223457192, "dur": 97, "args": { "External id": 130028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130028, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 130028, "pid": 5, "tid": 7, "ts": 1716454223457192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401917, "dur": 5, "args": { "External id": 130028, "cbid": 211, "correlation": 130028 } }, { "ph": "s", "id": 130028, "pid": 76337, "tid": -914061504, "ts": 1716454223401917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223457291, "dur": 17, "args": { "External id": 130033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130033, "pid": 5, "tid": 7, "ts": 1716454223457291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401944, "dur": 9, "args": { "External id": 130033, "cbid": 211, "correlation": 130033 } }, { "ph": "s", "id": 130033, "pid": 76337, "tid": -914061504, "ts": 1716454223401944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223457309, "dur": 12, "args": { "External id": 130041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130041, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130041, "pid": 5, "tid": 7, "ts": 1716454223457309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223401985, "dur": 9, "args": { "External id": 130041, "cbid": 211, "correlation": 130041 } }, { "ph": "s", "id": 130041, "pid": 76337, "tid": -914061504, "ts": 1716454223401985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223402055, "dur": 0, "args": { "External id": 130051, "cbid": 317, "correlation": 130051 } }, { "ph": "f", "id": 130051, "pid": 76337, "tid": -914061504, "ts": 1716454223402055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223402056, "dur": 0, "args": { "External id": 130052, "cbid": 203, "correlation": 130052 } }, { "ph": "f", "id": 130052, "pid": 76337, "tid": -914061504, "ts": 1716454223402056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223402056, "dur": 0, "args": { "External id": 130053, "cbid": 205, "correlation": 130053 } }, { "ph": "f", "id": 130053, "pid": 76337, "tid": -914061504, "ts": 1716454223402056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457322, "dur": 12, "args": { "External id": 130057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130057, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130057, "pid": 5, "tid": 7, "ts": 1716454223457322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402071, "dur": 12, "args": { "External id": 130057, "cbid": 211, "correlation": 130057 } }, { "ph": "s", "id": 130057, "pid": 76337, "tid": -914061504, "ts": 1716454223402071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457335, "dur": 162, "args": { "External id": 130059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130059, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130059, "pid": 5, "tid": 7, "ts": 1716454223457335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402086, "dur": 5, "args": { "External id": 130059, "cbid": 211, "correlation": 130059 } }, { "ph": "s", "id": 130059, "pid": 76337, "tid": -914061504, "ts": 1716454223402086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223457500, "dur": 1, "args": { "External id": 130061, "device": 5, "context": 1, "stream": 7, "correlation": 130061, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 130061, "pid": 5, "tid": 7, "ts": 1716454223457500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223402098, "dur": 6, "args": { "External id": 130061, "cbid": 51, "correlation": 130061 } }, { "ph": "s", "id": 130061, "pid": 76337, "tid": -914061504, "ts": 1716454223402098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223457504, "dur": 197, "args": { "External id": 130062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130062, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 130062, "pid": 5, "tid": 7, "ts": 1716454223457504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402105, "dur": 9, "args": { "External id": 130062, "cbid": 211, "correlation": 130062 } }, { "ph": "s", "id": 130062, "pid": 76337, "tid": -914061504, "ts": 1716454223402105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457702, "dur": 6, "args": { "External id": 130064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130064, "pid": 5, "tid": 7, "ts": 1716454223457702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402120, "dur": 5, "args": { "External id": 130064, "cbid": 211, "correlation": 130064 } }, { "ph": "s", "id": 130064, "pid": 76337, "tid": -914061504, "ts": 1716454223402120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223457709, "dur": 6, "args": { "External id": 130070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130070, "pid": 5, "tid": 7, "ts": 1716454223457709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402147, "dur": 8, "args": { "External id": 130070, "cbid": 211, "correlation": 130070 } }, { "ph": "s", "id": 130070, "pid": 76337, "tid": -914061504, "ts": 1716454223402147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223457716, "dur": 10, "args": { "External id": 130090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130090, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130090, "pid": 5, "tid": 7, "ts": 1716454223457716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402241, "dur": 12, "args": { "External id": 130090, "cbid": 211, "correlation": 130090 } }, { "ph": "s", "id": 130090, "pid": 76337, "tid": -914061504, "ts": 1716454223402241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223457728, "dur": 4, "args": { "External id": 130102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130102, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130102, "pid": 5, "tid": 7, "ts": 1716454223457728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402264, "dur": 6, "args": { "External id": 130102, "cbid": 211, "correlation": 130102 } }, { "ph": "s", "id": 130102, "pid": 76337, "tid": -914061504, "ts": 1716454223402264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223457733, "dur": 9, "args": { "External id": 130105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130105, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130105, "pid": 5, "tid": 7, "ts": 1716454223457733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402283, "dur": 6, "args": { "External id": 130105, "cbid": 211, "correlation": 130105 } }, { "ph": "s", "id": 130105, "pid": 76337, "tid": -914061504, "ts": 1716454223402283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223457743, "dur": 5, "args": { "External id": 130114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130114, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130114, "pid": 5, "tid": 7, "ts": 1716454223457743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402323, "dur": 9, "args": { "External id": 130114, "cbid": 211, "correlation": 130114 } }, { "ph": "s", "id": 130114, "pid": 76337, "tid": -914061504, "ts": 1716454223402323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223402374, "dur": 0, "args": { "External id": 130124, "cbid": 317, "correlation": 130124 } }, { "ph": "f", "id": 130124, "pid": 76337, "tid": -914061504, "ts": 1716454223402374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223402375, "dur": 0, "args": { "External id": 130125, "cbid": 203, "correlation": 130125 } }, { "ph": "f", "id": 130125, "pid": 76337, "tid": -914061504, "ts": 1716454223402375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223402376, "dur": 0, "args": { "External id": 130126, "cbid": 205, "correlation": 130126 } }, { "ph": "f", "id": 130126, "pid": 76337, "tid": -914061504, "ts": 1716454223402376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457750, "dur": 5, "args": { "External id": 130130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130130, "pid": 5, "tid": 7, "ts": 1716454223457750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402391, "dur": 11, "args": { "External id": 130130, "cbid": 211, "correlation": 130130 } }, { "ph": "s", "id": 130130, "pid": 76337, "tid": -914061504, "ts": 1716454223402391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223457756, "dur": 161, "args": { "External id": 130132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130132, "pid": 5, "tid": 7, "ts": 1716454223457756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402405, "dur": 5, "args": { "External id": 130132, "cbid": 211, "correlation": 130132 } }, { "ph": "s", "id": 130132, "pid": 76337, "tid": -914061504, "ts": 1716454223402405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223457919, "dur": 1, "args": { "External id": 130134, "device": 5, "context": 1, "stream": 7, "correlation": 130134, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 130134, "pid": 5, "tid": 7, "ts": 1716454223457919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223402416, "dur": 6, "args": { "External id": 130134, "cbid": 51, "correlation": 130134 } }, { "ph": "s", "id": 130134, "pid": 76337, "tid": -914061504, "ts": 1716454223402416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223457923, "dur": 268, "args": { "External id": 130135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130135, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130135, "pid": 5, "tid": 7, "ts": 1716454223457923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402423, "dur": 6, "args": { "External id": 130135, "cbid": 211, "correlation": 130135 } }, { "ph": "s", "id": 130135, "pid": 76337, "tid": -914061504, "ts": 1716454223402423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458192, "dur": 6, "args": { "External id": 130137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130137, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130137, "pid": 5, "tid": 7, "ts": 1716454223458192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402433, "dur": 5, "args": { "External id": 130137, "cbid": 211, "correlation": 130137 } }, { "ph": "s", "id": 130137, "pid": 76337, "tid": -914061504, "ts": 1716454223402433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223458199, "dur": 6, "args": { "External id": 130143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130143, "pid": 5, "tid": 7, "ts": 1716454223458199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402462, "dur": 9, "args": { "External id": 130143, "cbid": 211, "correlation": 130143 } }, { "ph": "s", "id": 130143, "pid": 76337, "tid": -914061504, "ts": 1716454223402462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223458207, "dur": 3, "args": { "External id": 130151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 130151, "pid": 5, "tid": 7, "ts": 1716454223458207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402505, "dur": 10, "args": { "External id": 130151, "cbid": 211, "correlation": 130151 } }, { "ph": "s", "id": 130151, "pid": 76337, "tid": -914061504, "ts": 1716454223402505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223402571, "dur": 1, "args": { "External id": 130167, "cbid": 251, "correlation": 130167 } }, { "ph": "f", "id": 130167, "pid": 76337, "tid": -914061504, "ts": 1716454223402571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223402576, "dur": 0, "args": { "External id": 130169, "cbid": 251, "correlation": 130169 } }, { "ph": "f", "id": 130169, "pid": 76337, "tid": -914061504, "ts": 1716454223402576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223458211, "dur": 13, "args": { "External id": 130170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130170, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130170, "pid": 5, "tid": 7, "ts": 1716454223458211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402578, "dur": 11, "args": { "External id": 130170, "cbid": 211, "correlation": 130170 } }, { "ph": "s", "id": 130170, "pid": 76337, "tid": -914061504, "ts": 1716454223402578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223458225, "dur": 5, "args": { "External id": 130172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130172, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130172, "pid": 5, "tid": 7, "ts": 1716454223458225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402591, "dur": 5, "args": { "External id": 130172, "cbid": 211, "correlation": 130172 } }, { "ph": "s", "id": 130172, "pid": 76337, "tid": -914061504, "ts": 1716454223402591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223458232, "dur": 6, "args": { "External id": 130182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130182, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130182, "pid": 5, "tid": 7, "ts": 1716454223458232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402648, "dur": 12, "args": { "External id": 130182, "cbid": 211, "correlation": 130182 } }, { "ph": "s", "id": 130182, "pid": 76337, "tid": -914061504, "ts": 1716454223402648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223458239, "dur": 10, "args": { "External id": 130202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130202, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130202, "pid": 5, "tid": 7, "ts": 1716454223458239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402716, "dur": 11, "args": { "External id": 130202, "cbid": 211, "correlation": 130202 } }, { "ph": "s", "id": 130202, "pid": 76337, "tid": -914061504, "ts": 1716454223402716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223458250, "dur": 4, "args": { "External id": 130214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130214, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130214, "pid": 5, "tid": 7, "ts": 1716454223458250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402736, "dur": 6, "args": { "External id": 130214, "cbid": 211, "correlation": 130214 } }, { "ph": "s", "id": 130214, "pid": 76337, "tid": -914061504, "ts": 1716454223402736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223458255, "dur": 7, "args": { "External id": 130217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130217, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130217, "pid": 5, "tid": 7, "ts": 1716454223458255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402756, "dur": 7, "args": { "External id": 130217, "cbid": 211, "correlation": 130217 } }, { "ph": "s", "id": 130217, "pid": 76337, "tid": -914061504, "ts": 1716454223402756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223458263, "dur": 5, "args": { "External id": 130226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130226, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130226, "pid": 5, "tid": 7, "ts": 1716454223458263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402796, "dur": 10, "args": { "External id": 130226, "cbid": 211, "correlation": 130226 } }, { "ph": "s", "id": 130226, "pid": 76337, "tid": -914061504, "ts": 1716454223402796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223402860, "dur": 0, "args": { "External id": 130236, "cbid": 317, "correlation": 130236 } }, { "ph": "f", "id": 130236, "pid": 76337, "tid": -914061504, "ts": 1716454223402860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223402860, "dur": 0, "args": { "External id": 130237, "cbid": 203, "correlation": 130237 } }, { "ph": "f", "id": 130237, "pid": 76337, "tid": -914061504, "ts": 1716454223402860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223402861, "dur": 0, "args": { "External id": 130238, "cbid": 205, "correlation": 130238 } }, { "ph": "f", "id": 130238, "pid": 76337, "tid": -914061504, "ts": 1716454223402861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458269, "dur": 5, "args": { "External id": 130242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130242, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130242, "pid": 5, "tid": 7, "ts": 1716454223458269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402876, "dur": 12, "args": { "External id": 130242, "cbid": 211, "correlation": 130242 } }, { "ph": "s", "id": 130242, "pid": 76337, "tid": -914061504, "ts": 1716454223402876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458276, "dur": 162, "args": { "External id": 130244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130244, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130244, "pid": 5, "tid": 7, "ts": 1716454223458276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402890, "dur": 5, "args": { "External id": 130244, "cbid": 211, "correlation": 130244 } }, { "ph": "s", "id": 130244, "pid": 76337, "tid": -914061504, "ts": 1716454223402890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223458440, "dur": 1, "args": { "External id": 130246, "device": 5, "context": 1, "stream": 7, "correlation": 130246, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 130246, "pid": 5, "tid": 7, "ts": 1716454223458440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223402900, "dur": 6, "args": { "External id": 130246, "cbid": 51, "correlation": 130246 } }, { "ph": "s", "id": 130246, "pid": 76337, "tid": -914061504, "ts": 1716454223402900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223458443, "dur": 259, "args": { "External id": 130247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130247, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130247, "pid": 5, "tid": 7, "ts": 1716454223458443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402908, "dur": 7, "args": { "External id": 130247, "cbid": 211, "correlation": 130247 } }, { "ph": "s", "id": 130247, "pid": 76337, "tid": -914061504, "ts": 1716454223402908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458704, "dur": 6, "args": { "External id": 130249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130249, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130249, "pid": 5, "tid": 7, "ts": 1716454223458704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402919, "dur": 5, "args": { "External id": 130249, "cbid": 211, "correlation": 130249 } }, { "ph": "s", "id": 130249, "pid": 76337, "tid": -914061504, "ts": 1716454223402919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223458711, "dur": 6, "args": { "External id": 130255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130255, "pid": 5, "tid": 7, "ts": 1716454223458711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402947, "dur": 9, "args": { "External id": 130255, "cbid": 211, "correlation": 130255 } }, { "ph": "s", "id": 130255, "pid": 76337, "tid": -914061504, "ts": 1716454223402947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223458718, "dur": 5, "args": { "External id": 130263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130263, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130263, "pid": 5, "tid": 7, "ts": 1716454223458718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223402988, "dur": 9, "args": { "External id": 130263, "cbid": 211, "correlation": 130263 } }, { "ph": "s", "id": 130263, "pid": 76337, "tid": -914061504, "ts": 1716454223402988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223458725, "dur": 4, "args": { "External id": 130271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130271, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130271, "pid": 5, "tid": 7, "ts": 1716454223458725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403020, "dur": 8, "args": { "External id": 130271, "cbid": 211, "correlation": 130271 } }, { "ph": "s", "id": 130271, "pid": 76337, "tid": -914061504, "ts": 1716454223403020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223458731, "dur": 9, "args": { "External id": 130291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130291, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130291, "pid": 5, "tid": 7, "ts": 1716454223458731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403094, "dur": 12, "args": { "External id": 130291, "cbid": 211, "correlation": 130291 } }, { "ph": "s", "id": 130291, "pid": 76337, "tid": -914061504, "ts": 1716454223403094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223458741, "dur": 4, "args": { "External id": 130303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130303, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130303, "pid": 5, "tid": 7, "ts": 1716454223458741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403117, "dur": 6, "args": { "External id": 130303, "cbid": 211, "correlation": 130303 } }, { "ph": "s", "id": 130303, "pid": 76337, "tid": -914061504, "ts": 1716454223403117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223458746, "dur": 7, "args": { "External id": 130306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130306, "pid": 5, "tid": 7, "ts": 1716454223458746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403134, "dur": 7, "args": { "External id": 130306, "cbid": 211, "correlation": 130306 } }, { "ph": "s", "id": 130306, "pid": 76337, "tid": -914061504, "ts": 1716454223403134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223458754, "dur": 5, "args": { "External id": 130315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130315, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130315, "pid": 5, "tid": 7, "ts": 1716454223458754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403172, "dur": 9, "args": { "External id": 130315, "cbid": 211, "correlation": 130315 } }, { "ph": "s", "id": 130315, "pid": 76337, "tid": -914061504, "ts": 1716454223403172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223403223, "dur": 0, "args": { "External id": 130325, "cbid": 317, "correlation": 130325 } }, { "ph": "f", "id": 130325, "pid": 76337, "tid": -914061504, "ts": 1716454223403223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223403224, "dur": 0, "args": { "External id": 130326, "cbid": 203, "correlation": 130326 } }, { "ph": "f", "id": 130326, "pid": 76337, "tid": -914061504, "ts": 1716454223403224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223403225, "dur": 0, "args": { "External id": 130327, "cbid": 205, "correlation": 130327 } }, { "ph": "f", "id": 130327, "pid": 76337, "tid": -914061504, "ts": 1716454223403225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458760, "dur": 5, "args": { "External id": 130331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130331, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130331, "pid": 5, "tid": 7, "ts": 1716454223458760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403238, "dur": 11, "args": { "External id": 130331, "cbid": 211, "correlation": 130331 } }, { "ph": "s", "id": 130331, "pid": 76337, "tid": -914061504, "ts": 1716454223403238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223458766, "dur": 160, "args": { "External id": 130333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130333, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130333, "pid": 5, "tid": 7, "ts": 1716454223458766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403252, "dur": 5, "args": { "External id": 130333, "cbid": 211, "correlation": 130333 } }, { "ph": "s", "id": 130333, "pid": 76337, "tid": -914061504, "ts": 1716454223403252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223458929, "dur": 1, "args": { "External id": 130335, "device": 5, "context": 1, "stream": 7, "correlation": 130335, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 130335, "pid": 5, "tid": 7, "ts": 1716454223458929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223403262, "dur": 7, "args": { "External id": 130335, "cbid": 51, "correlation": 130335 } }, { "ph": "s", "id": 130335, "pid": 76337, "tid": -914061504, "ts": 1716454223403262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223458932, "dur": 256, "args": { "External id": 130336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130336, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130336, "pid": 5, "tid": 7, "ts": 1716454223458932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403270, "dur": 6, "args": { "External id": 130336, "cbid": 211, "correlation": 130336 } }, { "ph": "s", "id": 130336, "pid": 76337, "tid": -914061504, "ts": 1716454223403270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459190, "dur": 6, "args": { "External id": 130338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130338, "pid": 5, "tid": 7, "ts": 1716454223459190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403279, "dur": 5, "args": { "External id": 130338, "cbid": 211, "correlation": 130338 } }, { "ph": "s", "id": 130338, "pid": 76337, "tid": -914061504, "ts": 1716454223403279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223459197, "dur": 6, "args": { "External id": 130344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130344, "pid": 5, "tid": 7, "ts": 1716454223459197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403307, "dur": 9, "args": { "External id": 130344, "cbid": 211, "correlation": 130344 } }, { "ph": "s", "id": 130344, "pid": 76337, "tid": -914061504, "ts": 1716454223403307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223459205, "dur": 3, "args": { "External id": 130352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130352, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 130352, "pid": 5, "tid": 7, "ts": 1716454223459205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403351, "dur": 9, "args": { "External id": 130352, "cbid": 211, "correlation": 130352 } }, { "ph": "s", "id": 130352, "pid": 76337, "tid": -914061504, "ts": 1716454223403351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223403414, "dur": 1, "args": { "External id": 130368, "cbid": 251, "correlation": 130368 } }, { "ph": "f", "id": 130368, "pid": 76337, "tid": -914061504, "ts": 1716454223403414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223403419, "dur": 0, "args": { "External id": 130370, "cbid": 251, "correlation": 130370 } }, { "ph": "f", "id": 130370, "pid": 76337, "tid": -914061504, "ts": 1716454223403419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223459209, "dur": 10, "args": { "External id": 130371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130371, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130371, "pid": 5, "tid": 7, "ts": 1716454223459209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403421, "dur": 11, "args": { "External id": 130371, "cbid": 211, "correlation": 130371 } }, { "ph": "s", "id": 130371, "pid": 76337, "tid": -914061504, "ts": 1716454223403421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223459220, "dur": 3, "args": { "External id": 130373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130373, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130373, "pid": 5, "tid": 7, "ts": 1716454223459220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403433, "dur": 5, "args": { "External id": 130373, "cbid": 211, "correlation": 130373 } }, { "ph": "s", "id": 130373, "pid": 76337, "tid": -914061504, "ts": 1716454223403433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223459225, "dur": 6, "args": { "External id": 130383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130383, "pid": 5, "tid": 7, "ts": 1716454223459225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403490, "dur": 12, "args": { "External id": 130383, "cbid": 211, "correlation": 130383 } }, { "ph": "s", "id": 130383, "pid": 76337, "tid": -914061504, "ts": 1716454223403490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223459232, "dur": 9, "args": { "External id": 130403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130403, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130403, "pid": 5, "tid": 7, "ts": 1716454223459232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403555, "dur": 11, "args": { "External id": 130403, "cbid": 211, "correlation": 130403 } }, { "ph": "s", "id": 130403, "pid": 76337, "tid": -914061504, "ts": 1716454223403555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223459243, "dur": 4, "args": { "External id": 130415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130415, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130415, "pid": 5, "tid": 7, "ts": 1716454223459243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403576, "dur": 6, "args": { "External id": 130415, "cbid": 211, "correlation": 130415 } }, { "ph": "s", "id": 130415, "pid": 76337, "tid": -914061504, "ts": 1716454223403576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223459248, "dur": 6, "args": { "External id": 130418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130418, "pid": 5, "tid": 7, "ts": 1716454223459248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403594, "dur": 6, "args": { "External id": 130418, "cbid": 211, "correlation": 130418 } }, { "ph": "s", "id": 130418, "pid": 76337, "tid": -914061504, "ts": 1716454223403594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223459256, "dur": 5, "args": { "External id": 130427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130427, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130427, "pid": 5, "tid": 7, "ts": 1716454223459256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403634, "dur": 10, "args": { "External id": 130427, "cbid": 211, "correlation": 130427 } }, { "ph": "s", "id": 130427, "pid": 76337, "tid": -914061504, "ts": 1716454223403634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223403696, "dur": 0, "args": { "External id": 130437, "cbid": 317, "correlation": 130437 } }, { "ph": "f", "id": 130437, "pid": 76337, "tid": -914061504, "ts": 1716454223403696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223403697, "dur": 0, "args": { "External id": 130438, "cbid": 203, "correlation": 130438 } }, { "ph": "f", "id": 130438, "pid": 76337, "tid": -914061504, "ts": 1716454223403697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223403698, "dur": 0, "args": { "External id": 130439, "cbid": 205, "correlation": 130439 } }, { "ph": "f", "id": 130439, "pid": 76337, "tid": -914061504, "ts": 1716454223403698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459262, "dur": 5, "args": { "External id": 130443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130443, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130443, "pid": 5, "tid": 7, "ts": 1716454223459262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403712, "dur": 13, "args": { "External id": 130443, "cbid": 211, "correlation": 130443 } }, { "ph": "s", "id": 130443, "pid": 76337, "tid": -914061504, "ts": 1716454223403712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459268, "dur": 161, "args": { "External id": 130445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130445, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130445, "pid": 5, "tid": 7, "ts": 1716454223459268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403727, "dur": 5, "args": { "External id": 130445, "cbid": 211, "correlation": 130445 } }, { "ph": "s", "id": 130445, "pid": 76337, "tid": -914061504, "ts": 1716454223403727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223459431, "dur": 1, "args": { "External id": 130447, "device": 5, "context": 1, "stream": 7, "correlation": 130447, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 130447, "pid": 5, "tid": 7, "ts": 1716454223459431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223403738, "dur": 7, "args": { "External id": 130447, "cbid": 51, "correlation": 130447 } }, { "ph": "s", "id": 130447, "pid": 76337, "tid": -914061504, "ts": 1716454223403738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223459435, "dur": 258, "args": { "External id": 130448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130448, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130448, "pid": 5, "tid": 7, "ts": 1716454223459435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403746, "dur": 6, "args": { "External id": 130448, "cbid": 211, "correlation": 130448 } }, { "ph": "s", "id": 130448, "pid": 76337, "tid": -914061504, "ts": 1716454223403746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459695, "dur": 6, "args": { "External id": 130450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130450, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130450, "pid": 5, "tid": 7, "ts": 1716454223459695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403755, "dur": 5, "args": { "External id": 130450, "cbid": 211, "correlation": 130450 } }, { "ph": "s", "id": 130450, "pid": 76337, "tid": -914061504, "ts": 1716454223403755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223459702, "dur": 6, "args": { "External id": 130456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130456, "pid": 5, "tid": 7, "ts": 1716454223459702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403784, "dur": 9, "args": { "External id": 130456, "cbid": 211, "correlation": 130456 } }, { "ph": "s", "id": 130456, "pid": 76337, "tid": -914061504, "ts": 1716454223403784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223459709, "dur": 5, "args": { "External id": 130464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130464, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130464, "pid": 5, "tid": 7, "ts": 1716454223459709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403818, "dur": 9, "args": { "External id": 130464, "cbid": 211, "correlation": 130464 } }, { "ph": "s", "id": 130464, "pid": 76337, "tid": -914061504, "ts": 1716454223403818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223459715, "dur": 4, "args": { "External id": 130472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130472, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130472, "pid": 5, "tid": 7, "ts": 1716454223459715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403847, "dur": 8, "args": { "External id": 130472, "cbid": 211, "correlation": 130472 } }, { "ph": "s", "id": 130472, "pid": 76337, "tid": -914061504, "ts": 1716454223403847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223459721, "dur": 9, "args": { "External id": 130492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130492, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130492, "pid": 5, "tid": 7, "ts": 1716454223459721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403950, "dur": 13, "args": { "External id": 130492, "cbid": 211, "correlation": 130492 } }, { "ph": "s", "id": 130492, "pid": 76337, "tid": -914061504, "ts": 1716454223403950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223459732, "dur": 4, "args": { "External id": 130504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130504, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130504, "pid": 5, "tid": 7, "ts": 1716454223459732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223403982, "dur": 7, "args": { "External id": 130504, "cbid": 211, "correlation": 130504 } }, { "ph": "s", "id": 130504, "pid": 76337, "tid": -914061504, "ts": 1716454223403982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223459737, "dur": 7, "args": { "External id": 130507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130507, "pid": 5, "tid": 7, "ts": 1716454223459737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404001, "dur": 7, "args": { "External id": 130507, "cbid": 211, "correlation": 130507 } }, { "ph": "s", "id": 130507, "pid": 76337, "tid": -914061504, "ts": 1716454223404001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223459744, "dur": 5, "args": { "External id": 130516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130516, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130516, "pid": 5, "tid": 7, "ts": 1716454223459744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404041, "dur": 9, "args": { "External id": 130516, "cbid": 211, "correlation": 130516 } }, { "ph": "s", "id": 130516, "pid": 76337, "tid": -914061504, "ts": 1716454223404041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223404094, "dur": 0, "args": { "External id": 130526, "cbid": 317, "correlation": 130526 } }, { "ph": "f", "id": 130526, "pid": 76337, "tid": -914061504, "ts": 1716454223404094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223404095, "dur": 0, "args": { "External id": 130527, "cbid": 203, "correlation": 130527 } }, { "ph": "f", "id": 130527, "pid": 76337, "tid": -914061504, "ts": 1716454223404095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223404096, "dur": 0, "args": { "External id": 130528, "cbid": 205, "correlation": 130528 } }, { "ph": "f", "id": 130528, "pid": 76337, "tid": -914061504, "ts": 1716454223404096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459750, "dur": 5, "args": { "External id": 130532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130532, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130532, "pid": 5, "tid": 7, "ts": 1716454223459750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404109, "dur": 12, "args": { "External id": 130532, "cbid": 211, "correlation": 130532 } }, { "ph": "s", "id": 130532, "pid": 76337, "tid": -914061504, "ts": 1716454223404109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223459756, "dur": 161, "args": { "External id": 130534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130534, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130534, "pid": 5, "tid": 7, "ts": 1716454223459756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404123, "dur": 5, "args": { "External id": 130534, "cbid": 211, "correlation": 130534 } }, { "ph": "s", "id": 130534, "pid": 76337, "tid": -914061504, "ts": 1716454223404123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223459920, "dur": 1, "args": { "External id": 130536, "device": 5, "context": 1, "stream": 7, "correlation": 130536, "bytes": 240, "memory bandwidth (GB/s)": 0.1388888888888889 } }, { "ph": "f", "id": 130536, "pid": 5, "tid": 7, "ts": 1716454223459920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223404134, "dur": 6, "args": { "External id": 130536, "cbid": 51, "correlation": 130536 } }, { "ph": "s", "id": 130536, "pid": 76337, "tid": -914061504, "ts": 1716454223404134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223459924, "dur": 256, "args": { "External id": 130537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130537, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130537, "pid": 5, "tid": 7, "ts": 1716454223459924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404141, "dur": 6, "args": { "External id": 130537, "cbid": 211, "correlation": 130537 } }, { "ph": "s", "id": 130537, "pid": 76337, "tid": -914061504, "ts": 1716454223404141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460181, "dur": 6, "args": { "External id": 130539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130539, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130539, "pid": 5, "tid": 7, "ts": 1716454223460181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404150, "dur": 5, "args": { "External id": 130539, "cbid": 211, "correlation": 130539 } }, { "ph": "s", "id": 130539, "pid": 76337, "tid": -914061504, "ts": 1716454223404150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460189, "dur": 6, "args": { "External id": 130545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130545, "pid": 5, "tid": 7, "ts": 1716454223460189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404179, "dur": 8, "args": { "External id": 130545, "cbid": 211, "correlation": 130545 } }, { "ph": "s", "id": 130545, "pid": 76337, "tid": -914061504, "ts": 1716454223404179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223460196, "dur": 3, "args": { "External id": 130553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130553, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 130553, "pid": 5, "tid": 7, "ts": 1716454223460196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404223, "dur": 9, "args": { "External id": 130553, "cbid": 211, "correlation": 130553 } }, { "ph": "s", "id": 130553, "pid": 76337, "tid": -914061504, "ts": 1716454223404223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223404285, "dur": 1, "args": { "External id": 130569, "cbid": 251, "correlation": 130569 } }, { "ph": "f", "id": 130569, "pid": 76337, "tid": -914061504, "ts": 1716454223404285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223404291, "dur": 0, "args": { "External id": 130571, "cbid": 251, "correlation": 130571 } }, { "ph": "f", "id": 130571, "pid": 76337, "tid": -914061504, "ts": 1716454223404291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223460200, "dur": 10, "args": { "External id": 130572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130572, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130572, "pid": 5, "tid": 7, "ts": 1716454223460200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404293, "dur": 11, "args": { "External id": 130572, "cbid": 211, "correlation": 130572 } }, { "ph": "s", "id": 130572, "pid": 76337, "tid": -914061504, "ts": 1716454223404293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223460212, "dur": 4, "args": { "External id": 130574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130574, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130574, "pid": 5, "tid": 7, "ts": 1716454223460212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404306, "dur": 5, "args": { "External id": 130574, "cbid": 211, "correlation": 130574 } }, { "ph": "s", "id": 130574, "pid": 76337, "tid": -914061504, "ts": 1716454223404306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460217, "dur": 6, "args": { "External id": 130584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130584, "pid": 5, "tid": 7, "ts": 1716454223460217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404362, "dur": 13, "args": { "External id": 130584, "cbid": 211, "correlation": 130584 } }, { "ph": "s", "id": 130584, "pid": 76337, "tid": -914061504, "ts": 1716454223404362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223460224, "dur": 10, "args": { "External id": 130604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130604, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130604, "pid": 5, "tid": 7, "ts": 1716454223460224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404429, "dur": 10, "args": { "External id": 130604, "cbid": 211, "correlation": 130604 } }, { "ph": "s", "id": 130604, "pid": 76337, "tid": -914061504, "ts": 1716454223404429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223460235, "dur": 4, "args": { "External id": 130616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130616, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130616, "pid": 5, "tid": 7, "ts": 1716454223460235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404448, "dur": 6, "args": { "External id": 130616, "cbid": 211, "correlation": 130616 } }, { "ph": "s", "id": 130616, "pid": 76337, "tid": -914061504, "ts": 1716454223404448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460240, "dur": 7, "args": { "External id": 130619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130619, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130619, "pid": 5, "tid": 7, "ts": 1716454223460240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404466, "dur": 7, "args": { "External id": 130619, "cbid": 211, "correlation": 130619 } }, { "ph": "s", "id": 130619, "pid": 76337, "tid": -914061504, "ts": 1716454223404466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223460248, "dur": 5, "args": { "External id": 130628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130628, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130628, "pid": 5, "tid": 7, "ts": 1716454223460248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404507, "dur": 10, "args": { "External id": 130628, "cbid": 211, "correlation": 130628 } }, { "ph": "s", "id": 130628, "pid": 76337, "tid": -914061504, "ts": 1716454223404507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223404570, "dur": 0, "args": { "External id": 130638, "cbid": 317, "correlation": 130638 } }, { "ph": "f", "id": 130638, "pid": 76337, "tid": -914061504, "ts": 1716454223404570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223404571, "dur": 0, "args": { "External id": 130639, "cbid": 203, "correlation": 130639 } }, { "ph": "f", "id": 130639, "pid": 76337, "tid": -914061504, "ts": 1716454223404571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223404572, "dur": 0, "args": { "External id": 130640, "cbid": 205, "correlation": 130640 } }, { "ph": "f", "id": 130640, "pid": 76337, "tid": -914061504, "ts": 1716454223404572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460254, "dur": 5, "args": { "External id": 130644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130644, "pid": 5, "tid": 7, "ts": 1716454223460254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404586, "dur": 12, "args": { "External id": 130644, "cbid": 211, "correlation": 130644 } }, { "ph": "s", "id": 130644, "pid": 76337, "tid": -914061504, "ts": 1716454223404586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460260, "dur": 161, "args": { "External id": 130646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130646, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130646, "pid": 5, "tid": 7, "ts": 1716454223460260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404600, "dur": 5, "args": { "External id": 130646, "cbid": 211, "correlation": 130646 } }, { "ph": "s", "id": 130646, "pid": 76337, "tid": -914061504, "ts": 1716454223404600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223460423, "dur": 1, "args": { "External id": 130648, "device": 5, "context": 1, "stream": 7, "correlation": 130648, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 130648, "pid": 5, "tid": 7, "ts": 1716454223460423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223404611, "dur": 6, "args": { "External id": 130648, "cbid": 51, "correlation": 130648 } }, { "ph": "s", "id": 130648, "pid": 76337, "tid": -914061504, "ts": 1716454223404611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223460427, "dur": 257, "args": { "External id": 130649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130649, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130649, "pid": 5, "tid": 7, "ts": 1716454223460427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404618, "dur": 7, "args": { "External id": 130649, "cbid": 211, "correlation": 130649 } }, { "ph": "s", "id": 130649, "pid": 76337, "tid": -914061504, "ts": 1716454223404618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460685, "dur": 6, "args": { "External id": 130651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130651, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130651, "pid": 5, "tid": 7, "ts": 1716454223460685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404628, "dur": 5, "args": { "External id": 130651, "cbid": 211, "correlation": 130651 } }, { "ph": "s", "id": 130651, "pid": 76337, "tid": -914061504, "ts": 1716454223404628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460692, "dur": 6, "args": { "External id": 130657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130657, "pid": 5, "tid": 7, "ts": 1716454223460692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404656, "dur": 8, "args": { "External id": 130657, "cbid": 211, "correlation": 130657 } }, { "ph": "s", "id": 130657, "pid": 76337, "tid": -914061504, "ts": 1716454223404656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223460699, "dur": 5, "args": { "External id": 130665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130665, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130665, "pid": 5, "tid": 7, "ts": 1716454223460699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404689, "dur": 8, "args": { "External id": 130665, "cbid": 211, "correlation": 130665 } }, { "ph": "s", "id": 130665, "pid": 76337, "tid": -914061504, "ts": 1716454223404689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223460706, "dur": 4, "args": { "External id": 130673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130673, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130673, "pid": 5, "tid": 7, "ts": 1716454223460706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404718, "dur": 9, "args": { "External id": 130673, "cbid": 211, "correlation": 130673 } }, { "ph": "s", "id": 130673, "pid": 76337, "tid": -914061504, "ts": 1716454223404718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223460712, "dur": 9, "args": { "External id": 130693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130693, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 130693, "pid": 5, "tid": 7, "ts": 1716454223460712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404861, "dur": 14, "args": { "External id": 130693, "cbid": 211, "correlation": 130693 } }, { "ph": "s", "id": 130693, "pid": 76337, "tid": -914061504, "ts": 1716454223404861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223460722, "dur": 4, "args": { "External id": 130705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130705, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 130705, "pid": 5, "tid": 7, "ts": 1716454223460722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404884, "dur": 6, "args": { "External id": 130705, "cbid": 211, "correlation": 130705 } }, { "ph": "s", "id": 130705, "pid": 76337, "tid": -914061504, "ts": 1716454223404884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460727, "dur": 7, "args": { "External id": 130708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130708, "pid": 5, "tid": 7, "ts": 1716454223460727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404903, "dur": 6, "args": { "External id": 130708, "cbid": 211, "correlation": 130708 } }, { "ph": "s", "id": 130708, "pid": 76337, "tid": -914061504, "ts": 1716454223404903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223404961, "dur": 0, "args": { "External id": 130719, "cbid": 317, "correlation": 130719 } }, { "ph": "f", "id": 130719, "pid": 76337, "tid": -914061504, "ts": 1716454223404961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223404961, "dur": 0, "args": { "External id": 130720, "cbid": 203, "correlation": 130720 } }, { "ph": "f", "id": 130720, "pid": 76337, "tid": -914061504, "ts": 1716454223404961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223404962, "dur": 0, "args": { "External id": 130721, "cbid": 205, "correlation": 130721 } }, { "ph": "f", "id": 130721, "pid": 76337, "tid": -914061504, "ts": 1716454223404962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460735, "dur": 5, "args": { "External id": 130725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130725, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130725, "pid": 5, "tid": 7, "ts": 1716454223460735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223404989, "dur": 12, "args": { "External id": 130725, "cbid": 211, "correlation": 130725 } }, { "ph": "s", "id": 130725, "pid": 76337, "tid": -914061504, "ts": 1716454223404989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223460742, "dur": 37, "args": { "External id": 130727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130727, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 130727, "pid": 5, "tid": 7, "ts": 1716454223460742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405008, "dur": 10, "args": { "External id": 130727, "cbid": 211, "correlation": 130727 } }, { "ph": "s", "id": 130727, "pid": 76337, "tid": -914061504, "ts": 1716454223405008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223460780, "dur": 5, "args": { "External id": 130729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130729, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130729, "pid": 5, "tid": 7, "ts": 1716454223460780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405022, "dur": 5, "args": { "External id": 130729, "cbid": 211, "correlation": 130729 } }, { "ph": "s", "id": 130729, "pid": 76337, "tid": -914061504, "ts": 1716454223405022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223460786, "dur": 6, "args": { "External id": 130735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130735, "pid": 5, "tid": 7, "ts": 1716454223460786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405050, "dur": 8, "args": { "External id": 130735, "cbid": 211, "correlation": 130735 } }, { "ph": "s", "id": 130735, "pid": 76337, "tid": -914061504, "ts": 1716454223405050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223460793, "dur": 20, "args": { "External id": 130744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130744, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130744, "pid": 5, "tid": 7, "ts": 1716454223460793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405133, "dur": 14, "args": { "External id": 130744, "cbid": 211, "correlation": 130744 } }, { "ph": "s", "id": 130744, "pid": 76337, "tid": -914061504, "ts": 1716454223405133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223460815, "dur": 11, "args": { "External id": 130766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130766, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 130766, "pid": 5, "tid": 7, "ts": 1716454223460815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405190, "dur": 10, "args": { "External id": 130766, "cbid": 211, "correlation": 130766 } }, { "ph": "s", "id": 130766, "pid": 76337, "tid": -914061504, "ts": 1716454223405190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405279, "dur": 2, "args": { "External id": 130777, "cbid": 251, "correlation": 130777 } }, { "ph": "f", "id": 130777, "pid": 76337, "tid": -914061504, "ts": 1716454223405279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405284, "dur": 0, "args": { "External id": 130778, "cbid": 251, "correlation": 130778 } }, { "ph": "f", "id": 130778, "pid": 76337, "tid": -914061504, "ts": 1716454223405284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223460827, "dur": 53, "args": { "External id": 130779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130779, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 130779, "pid": 5, "tid": 7, "ts": 1716454223460827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405287, "dur": 14, "args": { "External id": 130779, "cbid": 211, "correlation": 130779 } }, { "ph": "s", "id": 130779, "pid": 76337, "tid": -914061504, "ts": 1716454223405287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405358, "dur": 1, "args": { "External id": 130790, "cbid": 251, "correlation": 130790 } }, { "ph": "f", "id": 130790, "pid": 76337, "tid": -914061504, "ts": 1716454223405358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405362, "dur": 0, "args": { "External id": 130791, "cbid": 251, "correlation": 130791 } }, { "ph": "f", "id": 130791, "pid": 76337, "tid": -914061504, "ts": 1716454223405362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223460882, "dur": 52, "args": { "External id": 130792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130792, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 130792, "pid": 5, "tid": 7, "ts": 1716454223460882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405364, "dur": 12, "args": { "External id": 130792, "cbid": 211, "correlation": 130792 } }, { "ph": "s", "id": 130792, "pid": 76337, "tid": -914061504, "ts": 1716454223405364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405429, "dur": 1, "args": { "External id": 130803, "cbid": 251, "correlation": 130803 } }, { "ph": "f", "id": 130803, "pid": 76337, "tid": -914061504, "ts": 1716454223405429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405432, "dur": 0, "args": { "External id": 130804, "cbid": 251, "correlation": 130804 } }, { "ph": "f", "id": 130804, "pid": 76337, "tid": -914061504, "ts": 1716454223405432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223460935, "dur": 53, "args": { "External id": 130805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130805, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 130805, "pid": 5, "tid": 7, "ts": 1716454223460935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405434, "dur": 11, "args": { "External id": 130805, "cbid": 211, "correlation": 130805 } }, { "ph": "s", "id": 130805, "pid": 76337, "tid": -914061504, "ts": 1716454223405434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223460990, "dur": 55, "args": { "External id": 130830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130830, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130830, "pid": 5, "tid": 7, "ts": 1716454223460990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405517, "dur": 13, "args": { "External id": 130830, "cbid": 211, "correlation": 130830 } }, { "ph": "s", "id": 130830, "pid": 76337, "tid": -914061504, "ts": 1716454223405517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405616, "dur": 1, "args": { "External id": 130848, "cbid": 251, "correlation": 130848 } }, { "ph": "f", "id": 130848, "pid": 76337, "tid": -914061504, "ts": 1716454223405616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223461047, "dur": 62, "args": { "External id": 130850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130850, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 130850, "pid": 5, "tid": 7, "ts": 1716454223461047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405622, "dur": 13, "args": { "External id": 130850, "cbid": 211, "correlation": 130850 } }, { "ph": "s", "id": 130850, "pid": 76337, "tid": -914061504, "ts": 1716454223405622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461110, "dur": 6, "args": { "External id": 130858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130858, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130858, "pid": 5, "tid": 7, "ts": 1716454223461110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405692, "dur": 12, "args": { "External id": 130858, "cbid": 211, "correlation": 130858 } }, { "ph": "s", "id": 130858, "pid": 76337, "tid": -914061504, "ts": 1716454223405692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461118, "dur": 7, "args": { "External id": 130866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130866, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 130866, "pid": 5, "tid": 7, "ts": 1716454223461118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405730, "dur": 8, "args": { "External id": 130866, "cbid": 211, "correlation": 130866 } }, { "ph": "s", "id": 130866, "pid": 76337, "tid": -914061504, "ts": 1716454223405730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461126, "dur": 7, "args": { "External id": 130877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130877, "pid": 5, "tid": 7, "ts": 1716454223461126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405802, "dur": 12, "args": { "External id": 130877, "cbid": 211, "correlation": 130877 } }, { "ph": "s", "id": 130877, "pid": 76337, "tid": -914061504, "ts": 1716454223405802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223461135, "dur": 8, "args": { "External id": 130899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130899, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 130899, "pid": 5, "tid": 7, "ts": 1716454223461135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405835, "dur": 7, "args": { "External id": 130899, "cbid": 211, "correlation": 130899 } }, { "ph": "s", "id": 130899, "pid": 76337, "tid": -914061504, "ts": 1716454223405835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223405921, "dur": 2, "args": { "External id": 130910, "cbid": 251, "correlation": 130910 } }, { "ph": "f", "id": 130910, "pid": 76337, "tid": -914061504, "ts": 1716454223405921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223461145, "dur": 1, "args": { "External id": 130911, "device": 5, "context": 1, "stream": 7, "correlation": 130911, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 130911, "pid": 5, "tid": 7, "ts": 1716454223461145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223405927, "dur": 11, "args": { "External id": 130911, "cbid": 51, "correlation": 130911 } }, { "ph": "s", "id": 130911, "pid": 76337, "tid": -914061504, "ts": 1716454223405927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223461149, "dur": 36, "args": { "External id": 130912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130912, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 130912, "pid": 5, "tid": 7, "ts": 1716454223461149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223405939, "dur": 12, "args": { "External id": 130912, "cbid": 211, "correlation": 130912 } }, { "ph": "s", "id": 130912, "pid": 76337, "tid": -914061504, "ts": 1716454223405939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406019, "dur": 1, "args": { "External id": 130923, "cbid": 251, "correlation": 130923 } }, { "ph": "f", "id": 130923, "pid": 76337, "tid": -914061504, "ts": 1716454223406019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406023, "dur": 0, "args": { "External id": 130924, "cbid": 251, "correlation": 130924 } }, { "ph": "f", "id": 130924, "pid": 76337, "tid": -914061504, "ts": 1716454223406023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223461187, "dur": 12, "args": { "External id": 130925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130925, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130925, "pid": 5, "tid": 7, "ts": 1716454223461187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406025, "dur": 13, "args": { "External id": 130925, "cbid": 211, "correlation": 130925 } }, { "ph": "s", "id": 130925, "pid": 76337, "tid": -914061504, "ts": 1716454223406025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223461200, "dur": 5, "args": { "External id": 130927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130927, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130927, "pid": 5, "tid": 7, "ts": 1716454223461200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406041, "dur": 6, "args": { "External id": 130927, "cbid": 211, "correlation": 130927 } }, { "ph": "s", "id": 130927, "pid": 76337, "tid": -914061504, "ts": 1716454223406041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406099, "dur": 1, "args": { "External id": 130938, "cbid": 251, "correlation": 130938 } }, { "ph": "f", "id": 130938, "pid": 76337, "tid": -914061504, "ts": 1716454223406099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406102, "dur": 0, "args": { "External id": 130939, "cbid": 251, "correlation": 130939 } }, { "ph": "f", "id": 130939, "pid": 76337, "tid": -914061504, "ts": 1716454223406102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223461206, "dur": 8, "args": { "External id": 130940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130940, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130940, "pid": 5, "tid": 7, "ts": 1716454223461206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406104, "dur": 11, "args": { "External id": 130940, "cbid": 211, "correlation": 130940 } }, { "ph": "s", "id": 130940, "pid": 76337, "tid": -914061504, "ts": 1716454223406104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223461216, "dur": 4, "args": { "External id": 130942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130942, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 130942, "pid": 5, "tid": 7, "ts": 1716454223461216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406117, "dur": 5, "args": { "External id": 130942, "cbid": 211, "correlation": 130942 } }, { "ph": "s", "id": 130942, "pid": 76337, "tid": -914061504, "ts": 1716454223406117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223461221, "dur": 20, "args": { "External id": 130967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130967, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 130967, "pid": 5, "tid": 7, "ts": 1716454223461221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406196, "dur": 12, "args": { "External id": 130967, "cbid": 211, "correlation": 130967 } }, { "ph": "s", "id": 130967, "pid": 76337, "tid": -914061504, "ts": 1716454223406196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406296, "dur": 2, "args": { "External id": 130985, "cbid": 251, "correlation": 130985 } }, { "ph": "f", "id": 130985, "pid": 76337, "tid": -914061504, "ts": 1716454223406296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223461243, "dur": 1, "args": { "External id": 130987, "device": 5, "context": 1, "stream": 7, "correlation": 130987, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 130987, "pid": 5, "tid": 7, "ts": 1716454223461243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223406302, "dur": 10, "args": { "External id": 130987, "cbid": 51, "correlation": 130987 } }, { "ph": "s", "id": 130987, "pid": 76337, "tid": -914061504, "ts": 1716454223406302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223461246, "dur": 36, "args": { "External id": 130988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130988, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 130988, "pid": 5, "tid": 7, "ts": 1716454223461246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406314, "dur": 13, "args": { "External id": 130988, "cbid": 211, "correlation": 130988 } }, { "ph": "s", "id": 130988, "pid": 76337, "tid": -914061504, "ts": 1716454223406314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461284, "dur": 4, "args": { "External id": 130996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 130996, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 130996, "pid": 5, "tid": 7, "ts": 1716454223461284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406386, "dur": 12, "args": { "External id": 130996, "cbid": 211, "correlation": 130996 } }, { "ph": "s", "id": 130996, "pid": 76337, "tid": -914061504, "ts": 1716454223406386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461290, "dur": 8, "args": { "External id": 131004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131004, "pid": 5, "tid": 7, "ts": 1716454223461290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406428, "dur": 9, "args": { "External id": 131004, "cbid": 211, "correlation": 131004 } }, { "ph": "s", "id": 131004, "pid": 76337, "tid": -914061504, "ts": 1716454223406428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223461299, "dur": 8, "args": { "External id": 131026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131026, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 131026, "pid": 5, "tid": 7, "ts": 1716454223461299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406479, "dur": 10, "args": { "External id": 131026, "cbid": 211, "correlation": 131026 } }, { "ph": "s", "id": 131026, "pid": 76337, "tid": -914061504, "ts": 1716454223406479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406569, "dur": 1, "args": { "External id": 131042, "cbid": 251, "correlation": 131042 } }, { "ph": "f", "id": 131042, "pid": 76337, "tid": -914061504, "ts": 1716454223406569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406575, "dur": 0, "args": { "External id": 131044, "cbid": 251, "correlation": 131044 } }, { "ph": "f", "id": 131044, "pid": 76337, "tid": -914061504, "ts": 1716454223406575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223461308, "dur": 190, "args": { "External id": 131045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131045, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131045, "pid": 5, "tid": 7, "ts": 1716454223461308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406577, "dur": 13, "args": { "External id": 131045, "cbid": 211, "correlation": 131045 } }, { "ph": "s", "id": 131045, "pid": 76337, "tid": -914061504, "ts": 1716454223406577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461500, "dur": 21, "args": { "External id": 131053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131053, "pid": 5, "tid": 7, "ts": 1716454223461500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406644, "dur": 13, "args": { "External id": 131053, "cbid": 211, "correlation": 131053 } }, { "ph": "s", "id": 131053, "pid": 76337, "tid": -914061504, "ts": 1716454223406644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461522, "dur": 21, "args": { "External id": 131061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131061, "pid": 5, "tid": 7, "ts": 1716454223461522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406675, "dur": 9, "args": { "External id": 131061, "cbid": 211, "correlation": 131061 } }, { "ph": "s", "id": 131061, "pid": 76337, "tid": -914061504, "ts": 1716454223406675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223406756, "dur": 1, "args": { "External id": 131077, "cbid": 251, "correlation": 131077 } }, { "ph": "f", "id": 131077, "pid": 76337, "tid": -914061504, "ts": 1716454223406756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223461545, "dur": 1, "args": { "External id": 131079, "device": 5, "context": 1, "stream": 7, "correlation": 131079, "bytes": 120, "memory bandwidth (GB/s)": 0.0797872340425532 } }, { "ph": "f", "id": 131079, "pid": 5, "tid": 7, "ts": 1716454223461545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223406761, "dur": 8, "args": { "External id": 131079, "cbid": 51, "correlation": 131079 } }, { "ph": "s", "id": 131079, "pid": 76337, "tid": -914061504, "ts": 1716454223406761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223461549, "dur": 109, "args": { "External id": 131080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131080, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 131080, "pid": 5, "tid": 7, "ts": 1716454223461549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406771, "dur": 12, "args": { "External id": 131080, "cbid": 211, "correlation": 131080 } }, { "ph": "s", "id": 131080, "pid": 76337, "tid": -914061504, "ts": 1716454223406771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461659, "dur": 5, "args": { "External id": 131088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131088, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131088, "pid": 5, "tid": 7, "ts": 1716454223461659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406814, "dur": 10, "args": { "External id": 131088, "cbid": 211, "correlation": 131088 } }, { "ph": "s", "id": 131088, "pid": 76337, "tid": -914061504, "ts": 1716454223406814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461666, "dur": 9, "args": { "External id": 131099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131099, "pid": 5, "tid": 7, "ts": 1716454223461666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406883, "dur": 12, "args": { "External id": 131099, "cbid": 211, "correlation": 131099 } }, { "ph": "s", "id": 131099, "pid": 76337, "tid": -914061504, "ts": 1716454223406883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223406947, "dur": 0, "args": { "External id": 131111, "cbid": 317, "correlation": 131111 } }, { "ph": "f", "id": 131111, "pid": 76337, "tid": -914061504, "ts": 1716454223406947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223406947, "dur": 0, "args": { "External id": 131112, "cbid": 203, "correlation": 131112 } }, { "ph": "f", "id": 131112, "pid": 76337, "tid": -914061504, "ts": 1716454223406947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223406948, "dur": 0, "args": { "External id": 131113, "cbid": 205, "correlation": 131113 } }, { "ph": "f", "id": 131113, "pid": 76337, "tid": -914061504, "ts": 1716454223406948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223461676, "dur": 6, "args": { "External id": 131117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131117, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131117, "pid": 5, "tid": 7, "ts": 1716454223461676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406963, "dur": 20, "args": { "External id": 131117, "cbid": 211, "correlation": 131117 } }, { "ph": "s", "id": 131117, "pid": 76337, "tid": -914061504, "ts": 1716454223406963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223461683, "dur": 36, "args": { "External id": 131119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131119, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 131119, "pid": 5, "tid": 7, "ts": 1716454223461683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223406990, "dur": 7, "args": { "External id": 131119, "cbid": 211, "correlation": 131119 } }, { "ph": "s", "id": 131119, "pid": 76337, "tid": -914061504, "ts": 1716454223406990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223461721, "dur": 6, "args": { "External id": 131121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131121, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131121, "pid": 5, "tid": 7, "ts": 1716454223461721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407001, "dur": 5, "args": { "External id": 131121, "cbid": 211, "correlation": 131121 } }, { "ph": "s", "id": 131121, "pid": 76337, "tid": -914061504, "ts": 1716454223407001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461728, "dur": 7, "args": { "External id": 131127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131127, "pid": 5, "tid": 7, "ts": 1716454223461728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407030, "dur": 9, "args": { "External id": 131127, "cbid": 211, "correlation": 131127 } }, { "ph": "s", "id": 131127, "pid": 76337, "tid": -914061504, "ts": 1716454223407030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461737, "dur": 5, "args": { "External id": 131135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131135, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131135, "pid": 5, "tid": 7, "ts": 1716454223461737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407061, "dur": 8, "args": { "External id": 131135, "cbid": 211, "correlation": 131135 } }, { "ph": "s", "id": 131135, "pid": 76337, "tid": -914061504, "ts": 1716454223407061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223461743, "dur": 11, "args": { "External id": 131155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131155, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131155, "pid": 5, "tid": 7, "ts": 1716454223461743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407135, "dur": 11, "args": { "External id": 131155, "cbid": 211, "correlation": 131155 } }, { "ph": "s", "id": 131155, "pid": 76337, "tid": -914061504, "ts": 1716454223407135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223461755, "dur": 4, "args": { "External id": 131167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131167, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 131167, "pid": 5, "tid": 7, "ts": 1716454223461755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407156, "dur": 6, "args": { "External id": 131167, "cbid": 211, "correlation": 131167 } }, { "ph": "s", "id": 131167, "pid": 76337, "tid": -914061504, "ts": 1716454223407156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223461761, "dur": 9, "args": { "External id": 131170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131170, "pid": 5, "tid": 7, "ts": 1716454223461761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407175, "dur": 7, "args": { "External id": 131170, "cbid": 211, "correlation": 131170 } }, { "ph": "s", "id": 131170, "pid": 76337, "tid": -914061504, "ts": 1716454223407175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223461771, "dur": 5, "args": { "External id": 131179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131179, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131179, "pid": 5, "tid": 7, "ts": 1716454223461771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407214, "dur": 10, "args": { "External id": 131179, "cbid": 211, "correlation": 131179 } }, { "ph": "s", "id": 131179, "pid": 76337, "tid": -914061504, "ts": 1716454223407214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223407266, "dur": 0, "args": { "External id": 131189, "cbid": 317, "correlation": 131189 } }, { "ph": "f", "id": 131189, "pid": 76337, "tid": -914061504, "ts": 1716454223407266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223407266, "dur": 0, "args": { "External id": 131190, "cbid": 203, "correlation": 131190 } }, { "ph": "f", "id": 131190, "pid": 76337, "tid": -914061504, "ts": 1716454223407266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223407267, "dur": 0, "args": { "External id": 131191, "cbid": 205, "correlation": 131191 } }, { "ph": "f", "id": 131191, "pid": 76337, "tid": -914061504, "ts": 1716454223407267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223461777, "dur": 5, "args": { "External id": 131195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131195, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131195, "pid": 5, "tid": 7, "ts": 1716454223461777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407281, "dur": 12, "args": { "External id": 131195, "cbid": 211, "correlation": 131195 } }, { "ph": "s", "id": 131195, "pid": 76337, "tid": -914061504, "ts": 1716454223407281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223461784, "dur": 161, "args": { "External id": 131197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131197, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131197, "pid": 5, "tid": 7, "ts": 1716454223461784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407295, "dur": 5, "args": { "External id": 131197, "cbid": 211, "correlation": 131197 } }, { "ph": "s", "id": 131197, "pid": 76337, "tid": -914061504, "ts": 1716454223407295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223461947, "dur": 1, "args": { "External id": 131199, "device": 5, "context": 1, "stream": 7, "correlation": 131199, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 131199, "pid": 5, "tid": 7, "ts": 1716454223461947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223407307, "dur": 6, "args": { "External id": 131199, "cbid": 51, "correlation": 131199 } }, { "ph": "s", "id": 131199, "pid": 76337, "tid": -914061504, "ts": 1716454223407307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223461951, "dur": 268, "args": { "External id": 131200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131200, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131200, "pid": 5, "tid": 7, "ts": 1716454223461951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407314, "dur": 6, "args": { "External id": 131200, "cbid": 211, "correlation": 131200 } }, { "ph": "s", "id": 131200, "pid": 76337, "tid": -914061504, "ts": 1716454223407314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462220, "dur": 6, "args": { "External id": 131202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131202, "pid": 5, "tid": 7, "ts": 1716454223462220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407324, "dur": 5, "args": { "External id": 131202, "cbid": 211, "correlation": 131202 } }, { "ph": "s", "id": 131202, "pid": 76337, "tid": -914061504, "ts": 1716454223407324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223462227, "dur": 6, "args": { "External id": 131208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131208, "pid": 5, "tid": 7, "ts": 1716454223462227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407352, "dur": 8, "args": { "External id": 131208, "cbid": 211, "correlation": 131208 } }, { "ph": "s", "id": 131208, "pid": 76337, "tid": -914061504, "ts": 1716454223407352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223462235, "dur": 3, "args": { "External id": 131216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131216, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 131216, "pid": 5, "tid": 7, "ts": 1716454223462235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407396, "dur": 9, "args": { "External id": 131216, "cbid": 211, "correlation": 131216 } }, { "ph": "s", "id": 131216, "pid": 76337, "tid": -914061504, "ts": 1716454223407396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223407460, "dur": 1, "args": { "External id": 131232, "cbid": 251, "correlation": 131232 } }, { "ph": "f", "id": 131232, "pid": 76337, "tid": -914061504, "ts": 1716454223407460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223407465, "dur": 0, "args": { "External id": 131234, "cbid": 251, "correlation": 131234 } }, { "ph": "f", "id": 131234, "pid": 76337, "tid": -914061504, "ts": 1716454223407465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223462239, "dur": 13, "args": { "External id": 131235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131235, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131235, "pid": 5, "tid": 7, "ts": 1716454223462239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407467, "dur": 11, "args": { "External id": 131235, "cbid": 211, "correlation": 131235 } }, { "ph": "s", "id": 131235, "pid": 76337, "tid": -914061504, "ts": 1716454223407467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223462253, "dur": 5, "args": { "External id": 131237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131237, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131237, "pid": 5, "tid": 7, "ts": 1716454223462253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407480, "dur": 6, "args": { "External id": 131237, "cbid": 211, "correlation": 131237 } }, { "ph": "s", "id": 131237, "pid": 76337, "tid": -914061504, "ts": 1716454223407480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223462260, "dur": 6, "args": { "External id": 131247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131247, "pid": 5, "tid": 7, "ts": 1716454223462260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407538, "dur": 12, "args": { "External id": 131247, "cbid": 211, "correlation": 131247 } }, { "ph": "s", "id": 131247, "pid": 76337, "tid": -914061504, "ts": 1716454223407538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223462266, "dur": 10, "args": { "External id": 131267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131267, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131267, "pid": 5, "tid": 7, "ts": 1716454223462266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407603, "dur": 11, "args": { "External id": 131267, "cbid": 211, "correlation": 131267 } }, { "ph": "s", "id": 131267, "pid": 76337, "tid": -914061504, "ts": 1716454223407603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223462278, "dur": 4, "args": { "External id": 131279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131279, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 131279, "pid": 5, "tid": 7, "ts": 1716454223462278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407624, "dur": 6, "args": { "External id": 131279, "cbid": 211, "correlation": 131279 } }, { "ph": "s", "id": 131279, "pid": 76337, "tid": -914061504, "ts": 1716454223407624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223462283, "dur": 7, "args": { "External id": 131282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131282, "pid": 5, "tid": 7, "ts": 1716454223462283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407643, "dur": 6, "args": { "External id": 131282, "cbid": 211, "correlation": 131282 } }, { "ph": "s", "id": 131282, "pid": 76337, "tid": -914061504, "ts": 1716454223407643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223462291, "dur": 5, "args": { "External id": 131291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131291, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131291, "pid": 5, "tid": 7, "ts": 1716454223462291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407683, "dur": 10, "args": { "External id": 131291, "cbid": 211, "correlation": 131291 } }, { "ph": "s", "id": 131291, "pid": 76337, "tid": -914061504, "ts": 1716454223407683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223407745, "dur": 0, "args": { "External id": 131301, "cbid": 317, "correlation": 131301 } }, { "ph": "f", "id": 131301, "pid": 76337, "tid": -914061504, "ts": 1716454223407745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223407746, "dur": 0, "args": { "External id": 131302, "cbid": 203, "correlation": 131302 } }, { "ph": "f", "id": 131302, "pid": 76337, "tid": -914061504, "ts": 1716454223407746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223407747, "dur": 0, "args": { "External id": 131303, "cbid": 205, "correlation": 131303 } }, { "ph": "f", "id": 131303, "pid": 76337, "tid": -914061504, "ts": 1716454223407747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462296, "dur": 5, "args": { "External id": 131307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131307, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131307, "pid": 5, "tid": 7, "ts": 1716454223462296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407760, "dur": 12, "args": { "External id": 131307, "cbid": 211, "correlation": 131307 } }, { "ph": "s", "id": 131307, "pid": 76337, "tid": -914061504, "ts": 1716454223407760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462303, "dur": 161, "args": { "External id": 131309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131309, "pid": 5, "tid": 7, "ts": 1716454223462303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407775, "dur": 5, "args": { "External id": 131309, "cbid": 211, "correlation": 131309 } }, { "ph": "s", "id": 131309, "pid": 76337, "tid": -914061504, "ts": 1716454223407775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223462466, "dur": 1, "args": { "External id": 131311, "device": 5, "context": 1, "stream": 7, "correlation": 131311, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 131311, "pid": 5, "tid": 7, "ts": 1716454223462466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223407786, "dur": 6, "args": { "External id": 131311, "cbid": 51, "correlation": 131311 } }, { "ph": "s", "id": 131311, "pid": 76337, "tid": -914061504, "ts": 1716454223407786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223462470, "dur": 257, "args": { "External id": 131312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131312, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131312, "pid": 5, "tid": 7, "ts": 1716454223462470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407793, "dur": 6, "args": { "External id": 131312, "cbid": 211, "correlation": 131312 } }, { "ph": "s", "id": 131312, "pid": 76337, "tid": -914061504, "ts": 1716454223407793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462728, "dur": 6, "args": { "External id": 131314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131314, "pid": 5, "tid": 7, "ts": 1716454223462728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407803, "dur": 5, "args": { "External id": 131314, "cbid": 211, "correlation": 131314 } }, { "ph": "s", "id": 131314, "pid": 76337, "tid": -914061504, "ts": 1716454223407803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223462736, "dur": 6, "args": { "External id": 131320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131320, "pid": 5, "tid": 7, "ts": 1716454223462736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407831, "dur": 9, "args": { "External id": 131320, "cbid": 211, "correlation": 131320 } }, { "ph": "s", "id": 131320, "pid": 76337, "tid": -914061504, "ts": 1716454223407831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223462743, "dur": 5, "args": { "External id": 131328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131328, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131328, "pid": 5, "tid": 7, "ts": 1716454223462743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407864, "dur": 8, "args": { "External id": 131328, "cbid": 211, "correlation": 131328 } }, { "ph": "s", "id": 131328, "pid": 76337, "tid": -914061504, "ts": 1716454223407864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223462749, "dur": 4, "args": { "External id": 131336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131336, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131336, "pid": 5, "tid": 7, "ts": 1716454223462749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407894, "dur": 8, "args": { "External id": 131336, "cbid": 211, "correlation": 131336 } }, { "ph": "s", "id": 131336, "pid": 76337, "tid": -914061504, "ts": 1716454223407894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223462755, "dur": 12, "args": { "External id": 131345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131345, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131345, "pid": 5, "tid": 7, "ts": 1716454223462755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223407989, "dur": 14, "args": { "External id": 131345, "cbid": 211, "correlation": 131345 } }, { "ph": "s", "id": 131345, "pid": 76337, "tid": -914061504, "ts": 1716454223407989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223462768, "dur": 12, "args": { "External id": 131365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131365, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131365, "pid": 5, "tid": 7, "ts": 1716454223462768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408058, "dur": 11, "args": { "External id": 131365, "cbid": 211, "correlation": 131365 } }, { "ph": "s", "id": 131365, "pid": 76337, "tid": -914061504, "ts": 1716454223408058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223462781, "dur": 4, "args": { "External id": 131377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131377, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131377, "pid": 5, "tid": 7, "ts": 1716454223462781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408079, "dur": 7, "args": { "External id": 131377, "cbid": 211, "correlation": 131377 } }, { "ph": "s", "id": 131377, "pid": 76337, "tid": -914061504, "ts": 1716454223408079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223462786, "dur": 10, "args": { "External id": 131380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131380, "pid": 5, "tid": 7, "ts": 1716454223462786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408098, "dur": 7, "args": { "External id": 131380, "cbid": 211, "correlation": 131380 } }, { "ph": "s", "id": 131380, "pid": 76337, "tid": -914061504, "ts": 1716454223408098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223462797, "dur": 6, "args": { "External id": 131389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131389, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131389, "pid": 5, "tid": 7, "ts": 1716454223462797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408136, "dur": 9, "args": { "External id": 131389, "cbid": 211, "correlation": 131389 } }, { "ph": "s", "id": 131389, "pid": 76337, "tid": -914061504, "ts": 1716454223408136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223408189, "dur": 0, "args": { "External id": 131399, "cbid": 317, "correlation": 131399 } }, { "ph": "f", "id": 131399, "pid": 76337, "tid": -914061504, "ts": 1716454223408189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223408190, "dur": 0, "args": { "External id": 131400, "cbid": 203, "correlation": 131400 } }, { "ph": "f", "id": 131400, "pid": 76337, "tid": -914061504, "ts": 1716454223408190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223408191, "dur": 0, "args": { "External id": 131401, "cbid": 205, "correlation": 131401 } }, { "ph": "f", "id": 131401, "pid": 76337, "tid": -914061504, "ts": 1716454223408191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462804, "dur": 6, "args": { "External id": 131405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131405, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131405, "pid": 5, "tid": 7, "ts": 1716454223462804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408207, "dur": 11, "args": { "External id": 131405, "cbid": 211, "correlation": 131405 } }, { "ph": "s", "id": 131405, "pid": 76337, "tid": -914061504, "ts": 1716454223408207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223462812, "dur": 318, "args": { "External id": 131407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131407, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131407, "pid": 5, "tid": 7, "ts": 1716454223462812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408220, "dur": 5, "args": { "External id": 131407, "cbid": 211, "correlation": 131407 } }, { "ph": "s", "id": 131407, "pid": 76337, "tid": -914061504, "ts": 1716454223408220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223463133, "dur": 1, "args": { "External id": 131409, "device": 5, "context": 1, "stream": 7, "correlation": 131409, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 131409, "pid": 5, "tid": 7, "ts": 1716454223463133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223408231, "dur": 9, "args": { "External id": 131409, "cbid": 51, "correlation": 131409 } }, { "ph": "s", "id": 131409, "pid": 76337, "tid": -914061504, "ts": 1716454223408231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223463136, "dur": 492, "args": { "External id": 131410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131410, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131410, "pid": 5, "tid": 7, "ts": 1716454223463136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408241, "dur": 6, "args": { "External id": 131410, "cbid": 211, "correlation": 131410 } }, { "ph": "s", "id": 131410, "pid": 76337, "tid": -914061504, "ts": 1716454223408241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223463630, "dur": 6, "args": { "External id": 131412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131412, "pid": 5, "tid": 7, "ts": 1716454223463630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408251, "dur": 5, "args": { "External id": 131412, "cbid": 211, "correlation": 131412 } }, { "ph": "s", "id": 131412, "pid": 76337, "tid": -914061504, "ts": 1716454223408251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223463637, "dur": 6, "args": { "External id": 131418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131418, "pid": 5, "tid": 7, "ts": 1716454223463637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408280, "dur": 9, "args": { "External id": 131418, "cbid": 211, "correlation": 131418 } }, { "ph": "s", "id": 131418, "pid": 76337, "tid": -914061504, "ts": 1716454223408280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223463644, "dur": 3, "args": { "External id": 131426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131426, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 131426, "pid": 5, "tid": 7, "ts": 1716454223463644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408323, "dur": 11, "args": { "External id": 131426, "cbid": 211, "correlation": 131426 } }, { "ph": "s", "id": 131426, "pid": 76337, "tid": -914061504, "ts": 1716454223408323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223408388, "dur": 1, "args": { "External id": 131442, "cbid": 251, "correlation": 131442 } }, { "ph": "f", "id": 131442, "pid": 76337, "tid": -914061504, "ts": 1716454223408388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223408393, "dur": 0, "args": { "External id": 131444, "cbid": 251, "correlation": 131444 } }, { "ph": "f", "id": 131444, "pid": 76337, "tid": -914061504, "ts": 1716454223408393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223463649, "dur": 11, "args": { "External id": 131445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131445, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131445, "pid": 5, "tid": 7, "ts": 1716454223463649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408394, "dur": 11, "args": { "External id": 131445, "cbid": 211, "correlation": 131445 } }, { "ph": "s", "id": 131445, "pid": 76337, "tid": -914061504, "ts": 1716454223408394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223463661, "dur": 4, "args": { "External id": 131447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131447, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131447, "pid": 5, "tid": 7, "ts": 1716454223463661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408407, "dur": 6, "args": { "External id": 131447, "cbid": 211, "correlation": 131447 } }, { "ph": "s", "id": 131447, "pid": 76337, "tid": -914061504, "ts": 1716454223408407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223463667, "dur": 6, "args": { "External id": 131457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131457, "pid": 5, "tid": 7, "ts": 1716454223463667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408464, "dur": 12, "args": { "External id": 131457, "cbid": 211, "correlation": 131457 } }, { "ph": "s", "id": 131457, "pid": 76337, "tid": -914061504, "ts": 1716454223408464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223463674, "dur": 9, "args": { "External id": 131477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131477, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131477, "pid": 5, "tid": 7, "ts": 1716454223463674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408529, "dur": 11, "args": { "External id": 131477, "cbid": 211, "correlation": 131477 } }, { "ph": "s", "id": 131477, "pid": 76337, "tid": -914061504, "ts": 1716454223408529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223463685, "dur": 4, "args": { "External id": 131489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131489, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 131489, "pid": 5, "tid": 7, "ts": 1716454223463685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408551, "dur": 6, "args": { "External id": 131489, "cbid": 211, "correlation": 131489 } }, { "ph": "s", "id": 131489, "pid": 76337, "tid": -914061504, "ts": 1716454223408551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223463690, "dur": 6, "args": { "External id": 131492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131492, "pid": 5, "tid": 7, "ts": 1716454223463690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408569, "dur": 6, "args": { "External id": 131492, "cbid": 211, "correlation": 131492 } }, { "ph": "s", "id": 131492, "pid": 76337, "tid": -914061504, "ts": 1716454223408569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223463698, "dur": 5, "args": { "External id": 131501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131501, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131501, "pid": 5, "tid": 7, "ts": 1716454223463698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408610, "dur": 10, "args": { "External id": 131501, "cbid": 211, "correlation": 131501 } }, { "ph": "s", "id": 131501, "pid": 76337, "tid": -914061504, "ts": 1716454223408610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223408672, "dur": 0, "args": { "External id": 131511, "cbid": 317, "correlation": 131511 } }, { "ph": "f", "id": 131511, "pid": 76337, "tid": -914061504, "ts": 1716454223408672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223408673, "dur": 0, "args": { "External id": 131512, "cbid": 203, "correlation": 131512 } }, { "ph": "f", "id": 131512, "pid": 76337, "tid": -914061504, "ts": 1716454223408673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223408674, "dur": 0, "args": { "External id": 131513, "cbid": 205, "correlation": 131513 } }, { "ph": "f", "id": 131513, "pid": 76337, "tid": -914061504, "ts": 1716454223408674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223463704, "dur": 5, "args": { "External id": 131517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131517, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131517, "pid": 5, "tid": 7, "ts": 1716454223463704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408689, "dur": 12, "args": { "External id": 131517, "cbid": 211, "correlation": 131517 } }, { "ph": "s", "id": 131517, "pid": 76337, "tid": -914061504, "ts": 1716454223408689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223463710, "dur": 162, "args": { "External id": 131519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131519, "pid": 5, "tid": 7, "ts": 1716454223463710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408703, "dur": 6, "args": { "External id": 131519, "cbid": 211, "correlation": 131519 } }, { "ph": "s", "id": 131519, "pid": 76337, "tid": -914061504, "ts": 1716454223408703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223463874, "dur": 1, "args": { "External id": 131521, "device": 5, "context": 1, "stream": 7, "correlation": 131521, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 131521, "pid": 5, "tid": 7, "ts": 1716454223463874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223408714, "dur": 6, "args": { "External id": 131521, "cbid": 51, "correlation": 131521 } }, { "ph": "s", "id": 131521, "pid": 76337, "tid": -914061504, "ts": 1716454223408714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223463878, "dur": 257, "args": { "External id": 131522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131522, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131522, "pid": 5, "tid": 7, "ts": 1716454223463878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408721, "dur": 6, "args": { "External id": 131522, "cbid": 211, "correlation": 131522 } }, { "ph": "s", "id": 131522, "pid": 76337, "tid": -914061504, "ts": 1716454223408721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223464136, "dur": 6, "args": { "External id": 131524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131524, "pid": 5, "tid": 7, "ts": 1716454223464136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408731, "dur": 5, "args": { "External id": 131524, "cbid": 211, "correlation": 131524 } }, { "ph": "s", "id": 131524, "pid": 76337, "tid": -914061504, "ts": 1716454223408731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223464144, "dur": 6, "args": { "External id": 131530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131530, "pid": 5, "tid": 7, "ts": 1716454223464144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408759, "dur": 8, "args": { "External id": 131530, "cbid": 211, "correlation": 131530 } }, { "ph": "s", "id": 131530, "pid": 76337, "tid": -914061504, "ts": 1716454223408759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223408817, "dur": 0, "args": { "External id": 131540, "cbid": 317, "correlation": 131540 } }, { "ph": "f", "id": 131540, "pid": 76337, "tid": -914061504, "ts": 1716454223408817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223408818, "dur": 0, "args": { "External id": 131541, "cbid": 203, "correlation": 131541 } }, { "ph": "f", "id": 131541, "pid": 76337, "tid": -914061504, "ts": 1716454223408818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223408819, "dur": 0, "args": { "External id": 131542, "cbid": 205, "correlation": 131542 } }, { "ph": "f", "id": 131542, "pid": 76337, "tid": -914061504, "ts": 1716454223408819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223464151, "dur": 8, "args": { "External id": 131546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131546, "pid": 5, "tid": 7, "ts": 1716454223464151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408833, "dur": 13, "args": { "External id": 131546, "cbid": 211, "correlation": 131546 } }, { "ph": "s", "id": 131546, "pid": 76337, "tid": -914061504, "ts": 1716454223408833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223464160, "dur": 3, "args": { "External id": 131548, "device": 5, "context": 1, "stream": 7, "correlation": 131548, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 131548, "pid": 5, "tid": 7, "ts": 1716454223464160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223408852, "dur": 14, "args": { "External id": 131548, "cbid": 51, "correlation": 131548 } }, { "ph": "s", "id": 131548, "pid": 76337, "tid": -914061504, "ts": 1716454223408852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223464164, "dur": 97, "args": { "External id": 131549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131549, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 131549, "pid": 5, "tid": 7, "ts": 1716454223464164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408867, "dur": 6, "args": { "External id": 131549, "cbid": 211, "correlation": 131549 } }, { "ph": "s", "id": 131549, "pid": 76337, "tid": -914061504, "ts": 1716454223408867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223464263, "dur": 6, "args": { "External id": 131551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131551, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131551, "pid": 5, "tid": 7, "ts": 1716454223464263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408877, "dur": 5, "args": { "External id": 131551, "cbid": 211, "correlation": 131551 } }, { "ph": "s", "id": 131551, "pid": 76337, "tid": -914061504, "ts": 1716454223408877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223464270, "dur": 6, "args": { "External id": 131557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131557, "pid": 5, "tid": 7, "ts": 1716454223464270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408904, "dur": 8, "args": { "External id": 131557, "cbid": 211, "correlation": 131557 } }, { "ph": "s", "id": 131557, "pid": 76337, "tid": -914061504, "ts": 1716454223408904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223464277, "dur": 5, "args": { "External id": 131565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131565, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131565, "pid": 5, "tid": 7, "ts": 1716454223464277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408932, "dur": 9, "args": { "External id": 131565, "cbid": 211, "correlation": 131565 } }, { "ph": "s", "id": 131565, "pid": 76337, "tid": -914061504, "ts": 1716454223408932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223464283, "dur": 4, "args": { "External id": 131573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131573, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131573, "pid": 5, "tid": 7, "ts": 1716454223464283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223408961, "dur": 8, "args": { "External id": 131573, "cbid": 211, "correlation": 131573 } }, { "ph": "s", "id": 131573, "pid": 76337, "tid": -914061504, "ts": 1716454223408961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223464289, "dur": 11, "args": { "External id": 131582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131582, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131582, "pid": 5, "tid": 7, "ts": 1716454223464289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409015, "dur": 11, "args": { "External id": 131582, "cbid": 211, "correlation": 131582 } }, { "ph": "s", "id": 131582, "pid": 76337, "tid": -914061504, "ts": 1716454223409015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223464301, "dur": 12, "args": { "External id": 131602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131602, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131602, "pid": 5, "tid": 7, "ts": 1716454223464301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409085, "dur": 12, "args": { "External id": 131602, "cbid": 211, "correlation": 131602 } }, { "ph": "s", "id": 131602, "pid": 76337, "tid": -914061504, "ts": 1716454223409085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223464315, "dur": 4, "args": { "External id": 131614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131614, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131614, "pid": 5, "tid": 7, "ts": 1716454223464315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409107, "dur": 6, "args": { "External id": 131614, "cbid": 211, "correlation": 131614 } }, { "ph": "s", "id": 131614, "pid": 76337, "tid": -914061504, "ts": 1716454223409107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223464320, "dur": 10, "args": { "External id": 131617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131617, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131617, "pid": 5, "tid": 7, "ts": 1716454223464320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409125, "dur": 6, "args": { "External id": 131617, "cbid": 211, "correlation": 131617 } }, { "ph": "s", "id": 131617, "pid": 76337, "tid": -914061504, "ts": 1716454223409125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223464332, "dur": 6, "args": { "External id": 131626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131626, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131626, "pid": 5, "tid": 7, "ts": 1716454223464332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409163, "dur": 9, "args": { "External id": 131626, "cbid": 211, "correlation": 131626 } }, { "ph": "s", "id": 131626, "pid": 76337, "tid": -914061504, "ts": 1716454223409163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223409214, "dur": 0, "args": { "External id": 131636, "cbid": 317, "correlation": 131636 } }, { "ph": "f", "id": 131636, "pid": 76337, "tid": -914061504, "ts": 1716454223409214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223409215, "dur": 0, "args": { "External id": 131637, "cbid": 203, "correlation": 131637 } }, { "ph": "f", "id": 131637, "pid": 76337, "tid": -914061504, "ts": 1716454223409215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223409216, "dur": 0, "args": { "External id": 131638, "cbid": 205, "correlation": 131638 } }, { "ph": "f", "id": 131638, "pid": 76337, "tid": -914061504, "ts": 1716454223409216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223464340, "dur": 7, "args": { "External id": 131642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131642, "pid": 5, "tid": 7, "ts": 1716454223464340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409229, "dur": 12, "args": { "External id": 131642, "cbid": 211, "correlation": 131642 } }, { "ph": "s", "id": 131642, "pid": 76337, "tid": -914061504, "ts": 1716454223409229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223464348, "dur": 318, "args": { "External id": 131644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131644, "pid": 5, "tid": 7, "ts": 1716454223464348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409244, "dur": 6, "args": { "External id": 131644, "cbid": 211, "correlation": 131644 } }, { "ph": "s", "id": 131644, "pid": 76337, "tid": -914061504, "ts": 1716454223409244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223464667, "dur": 1, "args": { "External id": 131646, "device": 5, "context": 1, "stream": 7, "correlation": 131646, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 131646, "pid": 5, "tid": 7, "ts": 1716454223464667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223409255, "dur": 6, "args": { "External id": 131646, "cbid": 51, "correlation": 131646 } }, { "ph": "s", "id": 131646, "pid": 76337, "tid": -914061504, "ts": 1716454223409255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223464671, "dur": 492, "args": { "External id": 131647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131647, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131647, "pid": 5, "tid": 7, "ts": 1716454223464671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409262, "dur": 6, "args": { "External id": 131647, "cbid": 211, "correlation": 131647 } }, { "ph": "s", "id": 131647, "pid": 76337, "tid": -914061504, "ts": 1716454223409262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465165, "dur": 6, "args": { "External id": 131649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131649, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131649, "pid": 5, "tid": 7, "ts": 1716454223465165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409271, "dur": 5, "args": { "External id": 131649, "cbid": 211, "correlation": 131649 } }, { "ph": "s", "id": 131649, "pid": 76337, "tid": -914061504, "ts": 1716454223409271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465172, "dur": 6, "args": { "External id": 131655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131655, "pid": 5, "tid": 7, "ts": 1716454223465172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409300, "dur": 8, "args": { "External id": 131655, "cbid": 211, "correlation": 131655 } }, { "ph": "s", "id": 131655, "pid": 76337, "tid": -914061504, "ts": 1716454223409300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223465179, "dur": 3, "args": { "External id": 131663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131663, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 131663, "pid": 5, "tid": 7, "ts": 1716454223465179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409344, "dur": 9, "args": { "External id": 131663, "cbid": 211, "correlation": 131663 } }, { "ph": "s", "id": 131663, "pid": 76337, "tid": -914061504, "ts": 1716454223409344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223409405, "dur": 1, "args": { "External id": 131679, "cbid": 251, "correlation": 131679 } }, { "ph": "f", "id": 131679, "pid": 76337, "tid": -914061504, "ts": 1716454223409405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223409410, "dur": 0, "args": { "External id": 131681, "cbid": 251, "correlation": 131681 } }, { "ph": "f", "id": 131681, "pid": 76337, "tid": -914061504, "ts": 1716454223409410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223465184, "dur": 13, "args": { "External id": 131682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131682, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131682, "pid": 5, "tid": 7, "ts": 1716454223465184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409412, "dur": 11, "args": { "External id": 131682, "cbid": 211, "correlation": 131682 } }, { "ph": "s", "id": 131682, "pid": 76337, "tid": -914061504, "ts": 1716454223409412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223465198, "dur": 5, "args": { "External id": 131684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131684, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131684, "pid": 5, "tid": 7, "ts": 1716454223465198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409425, "dur": 5, "args": { "External id": 131684, "cbid": 211, "correlation": 131684 } }, { "ph": "s", "id": 131684, "pid": 76337, "tid": -914061504, "ts": 1716454223409425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465204, "dur": 6, "args": { "External id": 131694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131694, "pid": 5, "tid": 7, "ts": 1716454223465204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409482, "dur": 13, "args": { "External id": 131694, "cbid": 211, "correlation": 131694 } }, { "ph": "s", "id": 131694, "pid": 76337, "tid": -914061504, "ts": 1716454223409482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223465212, "dur": 9, "args": { "External id": 131714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131714, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131714, "pid": 5, "tid": 7, "ts": 1716454223465212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409549, "dur": 11, "args": { "External id": 131714, "cbid": 211, "correlation": 131714 } }, { "ph": "s", "id": 131714, "pid": 76337, "tid": -914061504, "ts": 1716454223409549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223465222, "dur": 3, "args": { "External id": 131726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131726, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 131726, "pid": 5, "tid": 7, "ts": 1716454223465222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409569, "dur": 6, "args": { "External id": 131726, "cbid": 211, "correlation": 131726 } }, { "ph": "s", "id": 131726, "pid": 76337, "tid": -914061504, "ts": 1716454223409569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465227, "dur": 7, "args": { "External id": 131729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131729, "pid": 5, "tid": 7, "ts": 1716454223465227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409588, "dur": 7, "args": { "External id": 131729, "cbid": 211, "correlation": 131729 } }, { "ph": "s", "id": 131729, "pid": 76337, "tid": -914061504, "ts": 1716454223409588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223465235, "dur": 5, "args": { "External id": 131738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131738, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131738, "pid": 5, "tid": 7, "ts": 1716454223465235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409628, "dur": 10, "args": { "External id": 131738, "cbid": 211, "correlation": 131738 } }, { "ph": "s", "id": 131738, "pid": 76337, "tid": -914061504, "ts": 1716454223409628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223409691, "dur": 0, "args": { "External id": 131748, "cbid": 317, "correlation": 131748 } }, { "ph": "f", "id": 131748, "pid": 76337, "tid": -914061504, "ts": 1716454223409691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223409692, "dur": 0, "args": { "External id": 131749, "cbid": 203, "correlation": 131749 } }, { "ph": "f", "id": 131749, "pid": 76337, "tid": -914061504, "ts": 1716454223409692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223409693, "dur": 0, "args": { "External id": 131750, "cbid": 205, "correlation": 131750 } }, { "ph": "f", "id": 131750, "pid": 76337, "tid": -914061504, "ts": 1716454223409693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465241, "dur": 5, "args": { "External id": 131754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131754, "pid": 5, "tid": 7, "ts": 1716454223465241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409707, "dur": 12, "args": { "External id": 131754, "cbid": 211, "correlation": 131754 } }, { "ph": "s", "id": 131754, "pid": 76337, "tid": -914061504, "ts": 1716454223409707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465247, "dur": 161, "args": { "External id": 131756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131756, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131756, "pid": 5, "tid": 7, "ts": 1716454223465247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409721, "dur": 5, "args": { "External id": 131756, "cbid": 211, "correlation": 131756 } }, { "ph": "s", "id": 131756, "pid": 76337, "tid": -914061504, "ts": 1716454223409721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223465410, "dur": 1, "args": { "External id": 131758, "device": 5, "context": 1, "stream": 7, "correlation": 131758, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 131758, "pid": 5, "tid": 7, "ts": 1716454223465410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223409731, "dur": 7, "args": { "External id": 131758, "cbid": 51, "correlation": 131758 } }, { "ph": "s", "id": 131758, "pid": 76337, "tid": -914061504, "ts": 1716454223409731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223465414, "dur": 257, "args": { "External id": 131759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131759, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131759, "pid": 5, "tid": 7, "ts": 1716454223465414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409739, "dur": 7, "args": { "External id": 131759, "cbid": 211, "correlation": 131759 } }, { "ph": "s", "id": 131759, "pid": 76337, "tid": -914061504, "ts": 1716454223409739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465673, "dur": 6, "args": { "External id": 131761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131761, "pid": 5, "tid": 7, "ts": 1716454223465673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409750, "dur": 5, "args": { "External id": 131761, "cbid": 211, "correlation": 131761 } }, { "ph": "s", "id": 131761, "pid": 76337, "tid": -914061504, "ts": 1716454223409750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465680, "dur": 6, "args": { "External id": 131767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131767, "pid": 5, "tid": 7, "ts": 1716454223465680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409778, "dur": 8, "args": { "External id": 131767, "cbid": 211, "correlation": 131767 } }, { "ph": "s", "id": 131767, "pid": 76337, "tid": -914061504, "ts": 1716454223409778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223409836, "dur": 0, "args": { "External id": 131777, "cbid": 317, "correlation": 131777 } }, { "ph": "f", "id": 131777, "pid": 76337, "tid": -914061504, "ts": 1716454223409836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223409837, "dur": 0, "args": { "External id": 131778, "cbid": 203, "correlation": 131778 } }, { "ph": "f", "id": 131778, "pid": 76337, "tid": -914061504, "ts": 1716454223409837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223409838, "dur": 0, "args": { "External id": 131779, "cbid": 205, "correlation": 131779 } }, { "ph": "f", "id": 131779, "pid": 76337, "tid": -914061504, "ts": 1716454223409838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465687, "dur": 8, "args": { "External id": 131783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131783, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131783, "pid": 5, "tid": 7, "ts": 1716454223465687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409850, "dur": 12, "args": { "External id": 131783, "cbid": 211, "correlation": 131783 } }, { "ph": "s", "id": 131783, "pid": 76337, "tid": -914061504, "ts": 1716454223409850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223465696, "dur": 3, "args": { "External id": 131785, "device": 5, "context": 1, "stream": 7, "correlation": 131785, "bytes": 4800, "memory bandwidth (GB/s)": 1.4705882352941178 } }, { "ph": "f", "id": 131785, "pid": 5, "tid": 7, "ts": 1716454223465696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223409867, "dur": 9, "args": { "External id": 131785, "cbid": 51, "correlation": 131785 } }, { "ph": "s", "id": 131785, "pid": 76337, "tid": -914061504, "ts": 1716454223409867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223465700, "dur": 95, "args": { "External id": 131786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131786, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 131786, "pid": 5, "tid": 7, "ts": 1716454223465700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409877, "dur": 6, "args": { "External id": 131786, "cbid": 211, "correlation": 131786 } }, { "ph": "s", "id": 131786, "pid": 76337, "tid": -914061504, "ts": 1716454223409877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465797, "dur": 5, "args": { "External id": 131788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131788, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131788, "pid": 5, "tid": 7, "ts": 1716454223465797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409886, "dur": 5, "args": { "External id": 131788, "cbid": 211, "correlation": 131788 } }, { "ph": "s", "id": 131788, "pid": 76337, "tid": -914061504, "ts": 1716454223409886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465804, "dur": 6, "args": { "External id": 131794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131794, "pid": 5, "tid": 7, "ts": 1716454223465804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409913, "dur": 8, "args": { "External id": 131794, "cbid": 211, "correlation": 131794 } }, { "ph": "s", "id": 131794, "pid": 76337, "tid": -914061504, "ts": 1716454223409913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223465811, "dur": 5, "args": { "External id": 131802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131802, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131802, "pid": 5, "tid": 7, "ts": 1716454223465811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409942, "dur": 9, "args": { "External id": 131802, "cbid": 211, "correlation": 131802 } }, { "ph": "s", "id": 131802, "pid": 76337, "tid": -914061504, "ts": 1716454223409942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223465817, "dur": 4, "args": { "External id": 131810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131810, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131810, "pid": 5, "tid": 7, "ts": 1716454223465817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223409971, "dur": 17, "args": { "External id": 131810, "cbid": 211, "correlation": 131810 } }, { "ph": "s", "id": 131810, "pid": 76337, "tid": -914061504, "ts": 1716454223409971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223465823, "dur": 11, "args": { "External id": 131819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131819, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131819, "pid": 5, "tid": 7, "ts": 1716454223465823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410025, "dur": 10, "args": { "External id": 131819, "cbid": 211, "correlation": 131819 } }, { "ph": "s", "id": 131819, "pid": 76337, "tid": -914061504, "ts": 1716454223410025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223465835, "dur": 13, "args": { "External id": 131839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131839, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131839, "pid": 5, "tid": 7, "ts": 1716454223465835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410095, "dur": 12, "args": { "External id": 131839, "cbid": 211, "correlation": 131839 } }, { "ph": "s", "id": 131839, "pid": 76337, "tid": -914061504, "ts": 1716454223410095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223465849, "dur": 4, "args": { "External id": 131851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131851, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131851, "pid": 5, "tid": 7, "ts": 1716454223465849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410117, "dur": 6, "args": { "External id": 131851, "cbid": 211, "correlation": 131851 } }, { "ph": "s", "id": 131851, "pid": 76337, "tid": -914061504, "ts": 1716454223410117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223465854, "dur": 11, "args": { "External id": 131854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131854, "pid": 5, "tid": 7, "ts": 1716454223465854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410135, "dur": 7, "args": { "External id": 131854, "cbid": 211, "correlation": 131854 } }, { "ph": "s", "id": 131854, "pid": 76337, "tid": -914061504, "ts": 1716454223410135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223465866, "dur": 6, "args": { "External id": 131863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131863, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131863, "pid": 5, "tid": 7, "ts": 1716454223465866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410173, "dur": 9, "args": { "External id": 131863, "cbid": 211, "correlation": 131863 } }, { "ph": "s", "id": 131863, "pid": 76337, "tid": -914061504, "ts": 1716454223410173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223410224, "dur": 0, "args": { "External id": 131873, "cbid": 317, "correlation": 131873 } }, { "ph": "f", "id": 131873, "pid": 76337, "tid": -914061504, "ts": 1716454223410224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223410225, "dur": 0, "args": { "External id": 131874, "cbid": 203, "correlation": 131874 } }, { "ph": "f", "id": 131874, "pid": 76337, "tid": -914061504, "ts": 1716454223410225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223410226, "dur": 0, "args": { "External id": 131875, "cbid": 205, "correlation": 131875 } }, { "ph": "f", "id": 131875, "pid": 76337, "tid": -914061504, "ts": 1716454223410226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465874, "dur": 7, "args": { "External id": 131879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131879, "pid": 5, "tid": 7, "ts": 1716454223465874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410239, "dur": 12, "args": { "External id": 131879, "cbid": 211, "correlation": 131879 } }, { "ph": "s", "id": 131879, "pid": 76337, "tid": -914061504, "ts": 1716454223410239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223465882, "dur": 318, "args": { "External id": 131881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131881, "pid": 5, "tid": 7, "ts": 1716454223465882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410254, "dur": 6, "args": { "External id": 131881, "cbid": 211, "correlation": 131881 } }, { "ph": "s", "id": 131881, "pid": 76337, "tid": -914061504, "ts": 1716454223410254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223466202, "dur": 1, "args": { "External id": 131883, "device": 5, "context": 1, "stream": 7, "correlation": 131883, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 131883, "pid": 5, "tid": 7, "ts": 1716454223466202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223410265, "dur": 6, "args": { "External id": 131883, "cbid": 51, "correlation": 131883 } }, { "ph": "s", "id": 131883, "pid": 76337, "tid": -914061504, "ts": 1716454223410265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223466205, "dur": 492, "args": { "External id": 131884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131884, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131884, "pid": 5, "tid": 7, "ts": 1716454223466205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410272, "dur": 6, "args": { "External id": 131884, "cbid": 211, "correlation": 131884 } }, { "ph": "s", "id": 131884, "pid": 76337, "tid": -914061504, "ts": 1716454223410272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223466699, "dur": 5, "args": { "External id": 131886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131886, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131886, "pid": 5, "tid": 7, "ts": 1716454223466699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410282, "dur": 5, "args": { "External id": 131886, "cbid": 211, "correlation": 131886 } }, { "ph": "s", "id": 131886, "pid": 76337, "tid": -914061504, "ts": 1716454223410282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223466705, "dur": 7, "args": { "External id": 131892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131892, "pid": 5, "tid": 7, "ts": 1716454223466705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410310, "dur": 9, "args": { "External id": 131892, "cbid": 211, "correlation": 131892 } }, { "ph": "s", "id": 131892, "pid": 76337, "tid": -914061504, "ts": 1716454223410310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223466713, "dur": 3, "args": { "External id": 131900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131900, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 131900, "pid": 5, "tid": 7, "ts": 1716454223466713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410354, "dur": 9, "args": { "External id": 131900, "cbid": 211, "correlation": 131900 } }, { "ph": "s", "id": 131900, "pid": 76337, "tid": -914061504, "ts": 1716454223410354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223410415, "dur": 1, "args": { "External id": 131916, "cbid": 251, "correlation": 131916 } }, { "ph": "f", "id": 131916, "pid": 76337, "tid": -914061504, "ts": 1716454223410415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223410421, "dur": 0, "args": { "External id": 131918, "cbid": 251, "correlation": 131918 } }, { "ph": "f", "id": 131918, "pid": 76337, "tid": -914061504, "ts": 1716454223410421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223466718, "dur": 12, "args": { "External id": 131919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131919, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131919, "pid": 5, "tid": 7, "ts": 1716454223466718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410422, "dur": 11, "args": { "External id": 131919, "cbid": 211, "correlation": 131919 } }, { "ph": "s", "id": 131919, "pid": 76337, "tid": -914061504, "ts": 1716454223410422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223466732, "dur": 5, "args": { "External id": 131921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131921, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131921, "pid": 5, "tid": 7, "ts": 1716454223466732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410435, "dur": 5, "args": { "External id": 131921, "cbid": 211, "correlation": 131921 } }, { "ph": "s", "id": 131921, "pid": 76337, "tid": -914061504, "ts": 1716454223410435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223466738, "dur": 6, "args": { "External id": 131931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131931, "pid": 5, "tid": 7, "ts": 1716454223466738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410491, "dur": 13, "args": { "External id": 131931, "cbid": 211, "correlation": 131931 } }, { "ph": "s", "id": 131931, "pid": 76337, "tid": -914061504, "ts": 1716454223410491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223466745, "dur": 9, "args": { "External id": 131951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131951, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 131951, "pid": 5, "tid": 7, "ts": 1716454223466745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410558, "dur": 11, "args": { "External id": 131951, "cbid": 211, "correlation": 131951 } }, { "ph": "s", "id": 131951, "pid": 76337, "tid": -914061504, "ts": 1716454223410558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223466756, "dur": 4, "args": { "External id": 131963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131963, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 131963, "pid": 5, "tid": 7, "ts": 1716454223466756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410578, "dur": 6, "args": { "External id": 131963, "cbid": 211, "correlation": 131963 } }, { "ph": "s", "id": 131963, "pid": 76337, "tid": -914061504, "ts": 1716454223410578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223466761, "dur": 7, "args": { "External id": 131966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131966, "pid": 5, "tid": 7, "ts": 1716454223466761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410598, "dur": 7, "args": { "External id": 131966, "cbid": 211, "correlation": 131966 } }, { "ph": "s", "id": 131966, "pid": 76337, "tid": -914061504, "ts": 1716454223410598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223466768, "dur": 5, "args": { "External id": 131975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131975, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 131975, "pid": 5, "tid": 7, "ts": 1716454223466768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410637, "dur": 10, "args": { "External id": 131975, "cbid": 211, "correlation": 131975 } }, { "ph": "s", "id": 131975, "pid": 76337, "tid": -914061504, "ts": 1716454223410637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223410700, "dur": 0, "args": { "External id": 131985, "cbid": 317, "correlation": 131985 } }, { "ph": "f", "id": 131985, "pid": 76337, "tid": -914061504, "ts": 1716454223410700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223410702, "dur": 0, "args": { "External id": 131986, "cbid": 203, "correlation": 131986 } }, { "ph": "f", "id": 131986, "pid": 76337, "tid": -914061504, "ts": 1716454223410702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223410702, "dur": 0, "args": { "External id": 131987, "cbid": 205, "correlation": 131987 } }, { "ph": "f", "id": 131987, "pid": 76337, "tid": -914061504, "ts": 1716454223410702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223466774, "dur": 5, "args": { "External id": 131991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131991, "pid": 5, "tid": 7, "ts": 1716454223466774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410716, "dur": 12, "args": { "External id": 131991, "cbid": 211, "correlation": 131991 } }, { "ph": "s", "id": 131991, "pid": 76337, "tid": -914061504, "ts": 1716454223410716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223466781, "dur": 162, "args": { "External id": 131993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131993, "pid": 5, "tid": 7, "ts": 1716454223466781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410731, "dur": 5, "args": { "External id": 131993, "cbid": 211, "correlation": 131993 } }, { "ph": "s", "id": 131993, "pid": 76337, "tid": -914061504, "ts": 1716454223410731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223466945, "dur": 1, "args": { "External id": 131995, "device": 5, "context": 1, "stream": 7, "correlation": 131995, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 131995, "pid": 5, "tid": 7, "ts": 1716454223466945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223410741, "dur": 7, "args": { "External id": 131995, "cbid": 51, "correlation": 131995 } }, { "ph": "s", "id": 131995, "pid": 76337, "tid": -914061504, "ts": 1716454223410741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223466948, "dur": 257, "args": { "External id": 131996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131996, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 131996, "pid": 5, "tid": 7, "ts": 1716454223466948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410750, "dur": 6, "args": { "External id": 131996, "cbid": 211, "correlation": 131996 } }, { "ph": "s", "id": 131996, "pid": 76337, "tid": -914061504, "ts": 1716454223410750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223467207, "dur": 6, "args": { "External id": 131998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 131998, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 131998, "pid": 5, "tid": 7, "ts": 1716454223467207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410759, "dur": 5, "args": { "External id": 131998, "cbid": 211, "correlation": 131998 } }, { "ph": "s", "id": 131998, "pid": 76337, "tid": -914061504, "ts": 1716454223410759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223467214, "dur": 6, "args": { "External id": 132004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 132004, "pid": 5, "tid": 7, "ts": 1716454223467214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410788, "dur": 9, "args": { "External id": 132004, "cbid": 211, "correlation": 132004 } }, { "ph": "s", "id": 132004, "pid": 76337, "tid": -914061504, "ts": 1716454223410788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223410846, "dur": 0, "args": { "External id": 132014, "cbid": 317, "correlation": 132014 } }, { "ph": "f", "id": 132014, "pid": 76337, "tid": -914061504, "ts": 1716454223410846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223410847, "dur": 0, "args": { "External id": 132015, "cbid": 203, "correlation": 132015 } }, { "ph": "f", "id": 132015, "pid": 76337, "tid": -914061504, "ts": 1716454223410847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223410848, "dur": 0, "args": { "External id": 132016, "cbid": 205, "correlation": 132016 } }, { "ph": "f", "id": 132016, "pid": 76337, "tid": -914061504, "ts": 1716454223410848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223467221, "dur": 8, "args": { "External id": 132020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132020, "pid": 5, "tid": 7, "ts": 1716454223467221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410860, "dur": 12, "args": { "External id": 132020, "cbid": 211, "correlation": 132020 } }, { "ph": "s", "id": 132020, "pid": 76337, "tid": -914061504, "ts": 1716454223410860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223467231, "dur": 3, "args": { "External id": 132022, "device": 5, "context": 1, "stream": 7, "correlation": 132022, "bytes": 4800, "memory bandwidth (GB/s)": 1.5151515151515151 } }, { "ph": "f", "id": 132022, "pid": 5, "tid": 7, "ts": 1716454223467231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223410877, "dur": 9, "args": { "External id": 132022, "cbid": 51, "correlation": 132022 } }, { "ph": "s", "id": 132022, "pid": 76337, "tid": -914061504, "ts": 1716454223410877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223467235, "dur": 94, "args": { "External id": 132023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132023, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 132023, "pid": 5, "tid": 7, "ts": 1716454223467235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410887, "dur": 6, "args": { "External id": 132023, "cbid": 211, "correlation": 132023 } }, { "ph": "s", "id": 132023, "pid": 76337, "tid": -914061504, "ts": 1716454223410887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223467330, "dur": 5, "args": { "External id": 132025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132025, "pid": 5, "tid": 7, "ts": 1716454223467330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410896, "dur": 6, "args": { "External id": 132025, "cbid": 211, "correlation": 132025 } }, { "ph": "s", "id": 132025, "pid": 76337, "tid": -914061504, "ts": 1716454223410896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223467337, "dur": 6, "args": { "External id": 132031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 132031, "pid": 5, "tid": 7, "ts": 1716454223467337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410924, "dur": 8, "args": { "External id": 132031, "cbid": 211, "correlation": 132031 } }, { "ph": "s", "id": 132031, "pid": 76337, "tid": -914061504, "ts": 1716454223410924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223467344, "dur": 5, "args": { "External id": 132039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132039, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 132039, "pid": 5, "tid": 7, "ts": 1716454223467344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410954, "dur": 7, "args": { "External id": 132039, "cbid": 211, "correlation": 132039 } }, { "ph": "s", "id": 132039, "pid": 76337, "tid": -914061504, "ts": 1716454223410954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223467350, "dur": 4, "args": { "External id": 132047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132047, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 132047, "pid": 5, "tid": 7, "ts": 1716454223467350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223410990, "dur": 9, "args": { "External id": 132047, "cbid": 211, "correlation": 132047 } }, { "ph": "s", "id": 132047, "pid": 76337, "tid": -914061504, "ts": 1716454223410990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223467356, "dur": 14, "args": { "External id": 132058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132058, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132058, "pid": 5, "tid": 7, "ts": 1716454223467356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411072, "dur": 13, "args": { "External id": 132058, "cbid": 211, "correlation": 132058 } }, { "ph": "s", "id": 132058, "pid": 76337, "tid": -914061504, "ts": 1716454223411072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223411127, "dur": 0, "args": { "External id": 132068, "cbid": 317, "correlation": 132068 } }, { "ph": "f", "id": 132068, "pid": 76337, "tid": -914061504, "ts": 1716454223411127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223411128, "dur": 0, "args": { "External id": 132069, "cbid": 203, "correlation": 132069 } }, { "ph": "f", "id": 132069, "pid": 76337, "tid": -914061504, "ts": 1716454223411128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223411129, "dur": 0, "args": { "External id": 132070, "cbid": 205, "correlation": 132070 } }, { "ph": "f", "id": 132070, "pid": 76337, "tid": -914061504, "ts": 1716454223411129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223467371, "dur": 9, "args": { "External id": 132074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132074, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132074, "pid": 5, "tid": 7, "ts": 1716454223467371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411142, "dur": 12, "args": { "External id": 132074, "cbid": 211, "correlation": 132074 } }, { "ph": "s", "id": 132074, "pid": 76337, "tid": -914061504, "ts": 1716454223411142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223467381, "dur": 162, "args": { "External id": 132076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132076, "pid": 5, "tid": 7, "ts": 1716454223467381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411157, "dur": 5, "args": { "External id": 132076, "cbid": 211, "correlation": 132076 } }, { "ph": "s", "id": 132076, "pid": 76337, "tid": -914061504, "ts": 1716454223411157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223467546, "dur": 1, "args": { "External id": 132078, "device": 5, "context": 1, "stream": 7, "correlation": 132078, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 132078, "pid": 5, "tid": 7, "ts": 1716454223467546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223411167, "dur": 6, "args": { "External id": 132078, "cbid": 51, "correlation": 132078 } }, { "ph": "s", "id": 132078, "pid": 76337, "tid": -914061504, "ts": 1716454223411167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223467550, "dur": 647, "args": { "External id": 132079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132079, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132079, "pid": 5, "tid": 7, "ts": 1716454223467550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411174, "dur": 6, "args": { "External id": 132079, "cbid": 211, "correlation": 132079 } }, { "ph": "s", "id": 132079, "pid": 76337, "tid": -914061504, "ts": 1716454223411174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223468198, "dur": 12, "args": { "External id": 132081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132081, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132081, "pid": 5, "tid": 7, "ts": 1716454223468198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411184, "dur": 5, "args": { "External id": 132081, "cbid": 211, "correlation": 132081 } }, { "ph": "s", "id": 132081, "pid": 76337, "tid": -914061504, "ts": 1716454223411184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223468211, "dur": 14, "args": { "External id": 132087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132087, "pid": 5, "tid": 7, "ts": 1716454223468211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411213, "dur": 8, "args": { "External id": 132087, "cbid": 211, "correlation": 132087 } }, { "ph": "s", "id": 132087, "pid": 76337, "tid": -914061504, "ts": 1716454223411213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223468227, "dur": 31, "args": { "External id": 132096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132096, "pid": 5, "tid": 7, "ts": 1716454223468227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411303, "dur": 13, "args": { "External id": 132096, "cbid": 211, "correlation": 132096 } }, { "ph": "s", "id": 132096, "pid": 76337, "tid": -914061504, "ts": 1716454223411303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223468259, "dur": 30, "args": { "External id": 132116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132116, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 132116, "pid": 5, "tid": 7, "ts": 1716454223468259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411370, "dur": 11, "args": { "External id": 132116, "cbid": 211, "correlation": 132116 } }, { "ph": "s", "id": 132116, "pid": 76337, "tid": -914061504, "ts": 1716454223411370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223468291, "dur": 4, "args": { "External id": 132128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132128, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132128, "pid": 5, "tid": 7, "ts": 1716454223468291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411391, "dur": 6, "args": { "External id": 132128, "cbid": 211, "correlation": 132128 } }, { "ph": "s", "id": 132128, "pid": 76337, "tid": -914061504, "ts": 1716454223411391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223468296, "dur": 30, "args": { "External id": 132131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132131, "pid": 5, "tid": 7, "ts": 1716454223468296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411409, "dur": 6, "args": { "External id": 132131, "cbid": 211, "correlation": 132131 } }, { "ph": "s", "id": 132131, "pid": 76337, "tid": -914061504, "ts": 1716454223411409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223468327, "dur": 21, "args": { "External id": 132140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132140, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132140, "pid": 5, "tid": 7, "ts": 1716454223468327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411447, "dur": 10, "args": { "External id": 132140, "cbid": 211, "correlation": 132140 } }, { "ph": "s", "id": 132140, "pid": 76337, "tid": -914061504, "ts": 1716454223411447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223411499, "dur": 0, "args": { "External id": 132150, "cbid": 317, "correlation": 132150 } }, { "ph": "f", "id": 132150, "pid": 76337, "tid": -914061504, "ts": 1716454223411499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223411500, "dur": 0, "args": { "External id": 132151, "cbid": 203, "correlation": 132151 } }, { "ph": "f", "id": 132151, "pid": 76337, "tid": -914061504, "ts": 1716454223411500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223411501, "dur": 0, "args": { "External id": 132152, "cbid": 205, "correlation": 132152 } }, { "ph": "f", "id": 132152, "pid": 76337, "tid": -914061504, "ts": 1716454223411501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223468350, "dur": 22, "args": { "External id": 132156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132156, "pid": 5, "tid": 7, "ts": 1716454223468350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411516, "dur": 11, "args": { "External id": 132156, "cbid": 211, "correlation": 132156 } }, { "ph": "s", "id": 132156, "pid": 76337, "tid": -914061504, "ts": 1716454223411516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223468373, "dur": 318, "args": { "External id": 132158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132158, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132158, "pid": 5, "tid": 7, "ts": 1716454223468373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411530, "dur": 5, "args": { "External id": 132158, "cbid": 211, "correlation": 132158 } }, { "ph": "s", "id": 132158, "pid": 76337, "tid": -914061504, "ts": 1716454223411530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223468693, "dur": 1, "args": { "External id": 132160, "device": 5, "context": 1, "stream": 7, "correlation": 132160, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 132160, "pid": 5, "tid": 7, "ts": 1716454223468693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223411541, "dur": 6, "args": { "External id": 132160, "cbid": 51, "correlation": 132160 } }, { "ph": "s", "id": 132160, "pid": 76337, "tid": -914061504, "ts": 1716454223411541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223468697, "dur": 1231, "args": { "External id": 132161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132161, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132161, "pid": 5, "tid": 7, "ts": 1716454223468697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411548, "dur": 7, "args": { "External id": 132161, "cbid": 211, "correlation": 132161 } }, { "ph": "s", "id": 132161, "pid": 76337, "tid": -914061504, "ts": 1716454223411548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223469929, "dur": 13, "args": { "External id": 132163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132163, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132163, "pid": 5, "tid": 7, "ts": 1716454223469929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411558, "dur": 5, "args": { "External id": 132163, "cbid": 211, "correlation": 132163 } }, { "ph": "s", "id": 132163, "pid": 76337, "tid": -914061504, "ts": 1716454223411558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223469943, "dur": 14, "args": { "External id": 132169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132169, "pid": 5, "tid": 7, "ts": 1716454223469943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411586, "dur": 8, "args": { "External id": 132169, "cbid": 211, "correlation": 132169 } }, { "ph": "s", "id": 132169, "pid": 76337, "tid": -914061504, "ts": 1716454223411586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223469959, "dur": 3, "args": { "External id": 132177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132177, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 132177, "pid": 5, "tid": 7, "ts": 1716454223469959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411631, "dur": 9, "args": { "External id": 132177, "cbid": 211, "correlation": 132177 } }, { "ph": "s", "id": 132177, "pid": 76337, "tid": -914061504, "ts": 1716454223411631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223411693, "dur": 1, "args": { "External id": 132193, "cbid": 251, "correlation": 132193 } }, { "ph": "f", "id": 132193, "pid": 76337, "tid": -914061504, "ts": 1716454223411693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223411698, "dur": 0, "args": { "External id": 132195, "cbid": 251, "correlation": 132195 } }, { "ph": "f", "id": 132195, "pid": 76337, "tid": -914061504, "ts": 1716454223411698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223469964, "dur": 12, "args": { "External id": 132196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132196, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132196, "pid": 5, "tid": 7, "ts": 1716454223469964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411700, "dur": 12, "args": { "External id": 132196, "cbid": 211, "correlation": 132196 } }, { "ph": "s", "id": 132196, "pid": 76337, "tid": -914061504, "ts": 1716454223411700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223469977, "dur": 5, "args": { "External id": 132198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132198, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132198, "pid": 5, "tid": 7, "ts": 1716454223469977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411714, "dur": 5, "args": { "External id": 132198, "cbid": 211, "correlation": 132198 } }, { "ph": "s", "id": 132198, "pid": 76337, "tid": -914061504, "ts": 1716454223411714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223469983, "dur": 17, "args": { "External id": 132208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132208, "pid": 5, "tid": 7, "ts": 1716454223469983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411770, "dur": 12, "args": { "External id": 132208, "cbid": 211, "correlation": 132208 } }, { "ph": "s", "id": 132208, "pid": 76337, "tid": -914061504, "ts": 1716454223411770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223470001, "dur": 17, "args": { "External id": 132228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132228, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 132228, "pid": 5, "tid": 7, "ts": 1716454223470001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411836, "dur": 10, "args": { "External id": 132228, "cbid": 211, "correlation": 132228 } }, { "ph": "s", "id": 132228, "pid": 76337, "tid": -914061504, "ts": 1716454223411836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223470020, "dur": 5, "args": { "External id": 132240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132240, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 132240, "pid": 5, "tid": 7, "ts": 1716454223470020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411856, "dur": 7, "args": { "External id": 132240, "cbid": 211, "correlation": 132240 } }, { "ph": "s", "id": 132240, "pid": 76337, "tid": -914061504, "ts": 1716454223411856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223470026, "dur": 16, "args": { "External id": 132243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132243, "pid": 5, "tid": 7, "ts": 1716454223470026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411875, "dur": 6, "args": { "External id": 132243, "cbid": 211, "correlation": 132243 } }, { "ph": "s", "id": 132243, "pid": 76337, "tid": -914061504, "ts": 1716454223411875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223470043, "dur": 12, "args": { "External id": 132252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132252, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132252, "pid": 5, "tid": 7, "ts": 1716454223470043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223411915, "dur": 9, "args": { "External id": 132252, "cbid": 211, "correlation": 132252 } }, { "ph": "s", "id": 132252, "pid": 76337, "tid": -914061504, "ts": 1716454223411915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223411985, "dur": 0, "args": { "External id": 132262, "cbid": 317, "correlation": 132262 } }, { "ph": "f", "id": 132262, "pid": 76337, "tid": -914061504, "ts": 1716454223411985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223411986, "dur": 0, "args": { "External id": 132263, "cbid": 203, "correlation": 132263 } }, { "ph": "f", "id": 132263, "pid": 76337, "tid": -914061504, "ts": 1716454223411986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223411986, "dur": 0, "args": { "External id": 132264, "cbid": 205, "correlation": 132264 } }, { "ph": "f", "id": 132264, "pid": 76337, "tid": -914061504, "ts": 1716454223411986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223470056, "dur": 11, "args": { "External id": 132268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132268, "pid": 5, "tid": 7, "ts": 1716454223470056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412000, "dur": 13, "args": { "External id": 132268, "cbid": 211, "correlation": 132268 } }, { "ph": "s", "id": 132268, "pid": 76337, "tid": -914061504, "ts": 1716454223412000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223470069, "dur": 162, "args": { "External id": 132270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132270, "pid": 5, "tid": 7, "ts": 1716454223470069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412015, "dur": 5, "args": { "External id": 132270, "cbid": 211, "correlation": 132270 } }, { "ph": "s", "id": 132270, "pid": 76337, "tid": -914061504, "ts": 1716454223412015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223470233, "dur": 1, "args": { "External id": 132272, "device": 5, "context": 1, "stream": 7, "correlation": 132272, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 132272, "pid": 5, "tid": 7, "ts": 1716454223470233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223412026, "dur": 8, "args": { "External id": 132272, "cbid": 51, "correlation": 132272 } }, { "ph": "s", "id": 132272, "pid": 76337, "tid": -914061504, "ts": 1716454223412026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223470237, "dur": 646, "args": { "External id": 132273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132273, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132273, "pid": 5, "tid": 7, "ts": 1716454223470237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412035, "dur": 6, "args": { "External id": 132273, "cbid": 211, "correlation": 132273 } }, { "ph": "s", "id": 132273, "pid": 76337, "tid": -914061504, "ts": 1716454223412035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223470884, "dur": 13, "args": { "External id": 132275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132275, "pid": 5, "tid": 7, "ts": 1716454223470884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412045, "dur": 5, "args": { "External id": 132275, "cbid": 211, "correlation": 132275 } }, { "ph": "s", "id": 132275, "pid": 76337, "tid": -914061504, "ts": 1716454223412045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223470898, "dur": 15, "args": { "External id": 132281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132281, "pid": 5, "tid": 7, "ts": 1716454223470898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412073, "dur": 8, "args": { "External id": 132281, "cbid": 211, "correlation": 132281 } }, { "ph": "s", "id": 132281, "pid": 76337, "tid": -914061504, "ts": 1716454223412073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223412132, "dur": 0, "args": { "External id": 132291, "cbid": 317, "correlation": 132291 } }, { "ph": "f", "id": 132291, "pid": 76337, "tid": -914061504, "ts": 1716454223412132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223412133, "dur": 0, "args": { "External id": 132292, "cbid": 203, "correlation": 132292 } }, { "ph": "f", "id": 132292, "pid": 76337, "tid": -914061504, "ts": 1716454223412133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223412134, "dur": 0, "args": { "External id": 132293, "cbid": 205, "correlation": 132293 } }, { "ph": "f", "id": 132293, "pid": 76337, "tid": -914061504, "ts": 1716454223412134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223470914, "dur": 20, "args": { "External id": 132297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132297, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132297, "pid": 5, "tid": 7, "ts": 1716454223470914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412147, "dur": 12, "args": { "External id": 132297, "cbid": 211, "correlation": 132297 } }, { "ph": "s", "id": 132297, "pid": 76337, "tid": -914061504, "ts": 1716454223412147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223470936, "dur": 4, "args": { "External id": 132299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132299, "pid": 5, "tid": 7, "ts": 1716454223470936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412165, "dur": 6, "args": { "External id": 132299, "cbid": 211, "correlation": 132299 } }, { "ph": "s", "id": 132299, "pid": 76337, "tid": -914061504, "ts": 1716454223412165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223412174, "dur": 0, "args": { "External id": 132300, "cbid": 51, "correlation": 132300 } }, { "ph": "s", "id": 132300, "pid": 76337, "tid": -914061504, "ts": 1716454223412174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223470941, "dur": 174, "args": { "External id": 132301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132301, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 132301, "pid": 5, "tid": 7, "ts": 1716454223470941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412175, "dur": 5, "args": { "External id": 132301, "cbid": 211, "correlation": 132301 } }, { "ph": "s", "id": 132301, "pid": 76337, "tid": -914061504, "ts": 1716454223412175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223471116, "dur": 17, "args": { "External id": 132306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132306, "pid": 5, "tid": 7, "ts": 1716454223471116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412200, "dur": 8, "args": { "External id": 132306, "cbid": 211, "correlation": 132306 } }, { "ph": "s", "id": 132306, "pid": 76337, "tid": -914061504, "ts": 1716454223412200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223471134, "dur": 12, "args": { "External id": 132314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132314, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132314, "pid": 5, "tid": 7, "ts": 1716454223471134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412228, "dur": 8, "args": { "External id": 132314, "cbid": 211, "correlation": 132314 } }, { "ph": "s", "id": 132314, "pid": 76337, "tid": -914061504, "ts": 1716454223412228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223471148, "dur": 10, "args": { "External id": 132322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132322, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132322, "pid": 5, "tid": 7, "ts": 1716454223471148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412256, "dur": 9, "args": { "External id": 132322, "cbid": 211, "correlation": 132322 } }, { "ph": "s", "id": 132322, "pid": 76337, "tid": -914061504, "ts": 1716454223412256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223471159, "dur": 18, "args": { "External id": 132342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132342, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 132342, "pid": 5, "tid": 7, "ts": 1716454223471159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412338, "dur": 12, "args": { "External id": 132342, "cbid": 211, "correlation": 132342 } }, { "ph": "s", "id": 132342, "pid": 76337, "tid": -914061504, "ts": 1716454223412338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223471178, "dur": 4, "args": { "External id": 132354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132354, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 132354, "pid": 5, "tid": 7, "ts": 1716454223471178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412361, "dur": 7, "args": { "External id": 132354, "cbid": 211, "correlation": 132354 } }, { "ph": "s", "id": 132354, "pid": 76337, "tid": -914061504, "ts": 1716454223412361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223471184, "dur": 17, "args": { "External id": 132357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132357, "pid": 5, "tid": 7, "ts": 1716454223471184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412379, "dur": 6, "args": { "External id": 132357, "cbid": 211, "correlation": 132357 } }, { "ph": "s", "id": 132357, "pid": 76337, "tid": -914061504, "ts": 1716454223412379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223412436, "dur": 0, "args": { "External id": 132368, "cbid": 317, "correlation": 132368 } }, { "ph": "f", "id": 132368, "pid": 76337, "tid": -914061504, "ts": 1716454223412436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223412437, "dur": 0, "args": { "External id": 132369, "cbid": 203, "correlation": 132369 } }, { "ph": "f", "id": 132369, "pid": 76337, "tid": -914061504, "ts": 1716454223412437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223412437, "dur": 0, "args": { "External id": 132370, "cbid": 205, "correlation": 132370 } }, { "ph": "f", "id": 132370, "pid": 76337, "tid": -914061504, "ts": 1716454223412437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223471202, "dur": 11, "args": { "External id": 132374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132374, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132374, "pid": 5, "tid": 7, "ts": 1716454223471202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412451, "dur": 12, "args": { "External id": 132374, "cbid": 211, "correlation": 132374 } }, { "ph": "s", "id": 132374, "pid": 76337, "tid": -914061504, "ts": 1716454223412451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223471214, "dur": 3, "args": { "External id": 132376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132376, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132376, "pid": 5, "tid": 7, "ts": 1716454223471214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412468, "dur": 6, "args": { "External id": 132376, "cbid": 211, "correlation": 132376 } }, { "ph": "s", "id": 132376, "pid": 76337, "tid": -914061504, "ts": 1716454223412468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223412476, "dur": 0, "args": { "External id": 132377, "cbid": 51, "correlation": 132377 } }, { "ph": "s", "id": 132377, "pid": 76337, "tid": -914061504, "ts": 1716454223412476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223471218, "dur": 90, "args": { "External id": 132378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132378, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 132378, "pid": 5, "tid": 7, "ts": 1716454223471218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412477, "dur": 5, "args": { "External id": 132378, "cbid": 211, "correlation": 132378 } }, { "ph": "s", "id": 132378, "pid": 76337, "tid": -914061504, "ts": 1716454223412477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223471309, "dur": 15, "args": { "External id": 132383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132383, "pid": 5, "tid": 7, "ts": 1716454223471309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412503, "dur": 9, "args": { "External id": 132383, "cbid": 211, "correlation": 132383 } }, { "ph": "s", "id": 132383, "pid": 76337, "tid": -914061504, "ts": 1716454223412503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223471326, "dur": 84, "args": { "External id": 132392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132392, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132392, "pid": 5, "tid": 7, "ts": 1716454223471326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412585, "dur": 15, "args": { "External id": 132392, "cbid": 211, "correlation": 132392 } }, { "ph": "s", "id": 132392, "pid": 76337, "tid": -914061504, "ts": 1716454223412585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223471411, "dur": 31, "args": { "External id": 132414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132414, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132414, "pid": 5, "tid": 7, "ts": 1716454223471411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412643, "dur": 10, "args": { "External id": 132414, "cbid": 211, "correlation": 132414 } }, { "ph": "s", "id": 132414, "pid": 76337, "tid": -914061504, "ts": 1716454223412643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223412736, "dur": 1, "args": { "External id": 132425, "cbid": 251, "correlation": 132425 } }, { "ph": "f", "id": 132425, "pid": 76337, "tid": -914061504, "ts": 1716454223412736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223471443, "dur": 162, "args": { "External id": 132426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132426, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132426, "pid": 5, "tid": 7, "ts": 1716454223471443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412742, "dur": 14, "args": { "External id": 132426, "cbid": 211, "correlation": 132426 } }, { "ph": "s", "id": 132426, "pid": 76337, "tid": -914061504, "ts": 1716454223412742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223412814, "dur": 1, "args": { "External id": 132437, "cbid": 251, "correlation": 132437 } }, { "ph": "f", "id": 132437, "pid": 76337, "tid": -914061504, "ts": 1716454223412814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223471606, "dur": 156, "args": { "External id": 132438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132438, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132438, "pid": 5, "tid": 7, "ts": 1716454223471606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412818, "dur": 11, "args": { "External id": 132438, "cbid": 211, "correlation": 132438 } }, { "ph": "s", "id": 132438, "pid": 76337, "tid": -914061504, "ts": 1716454223412818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223412884, "dur": 1, "args": { "External id": 132449, "cbid": 251, "correlation": 132449 } }, { "ph": "f", "id": 132449, "pid": 76337, "tid": -914061504, "ts": 1716454223412884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223471763, "dur": 157, "args": { "External id": 132450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132450, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132450, "pid": 5, "tid": 7, "ts": 1716454223471763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412888, "dur": 11, "args": { "External id": 132450, "cbid": 211, "correlation": 132450 } }, { "ph": "s", "id": 132450, "pid": 76337, "tid": -914061504, "ts": 1716454223412888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223471922, "dur": 336, "args": { "External id": 132475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132475, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132475, "pid": 5, "tid": 7, "ts": 1716454223471922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223412983, "dur": 13, "args": { "External id": 132475, "cbid": 211, "correlation": 132475 } }, { "ph": "s", "id": 132475, "pid": 76337, "tid": -914061504, "ts": 1716454223412983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413087, "dur": 1, "args": { "External id": 132493, "cbid": 251, "correlation": 132493 } }, { "ph": "f", "id": 132493, "pid": 76337, "tid": -914061504, "ts": 1716454223413087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223472259, "dur": 166, "args": { "External id": 132495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132495, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132495, "pid": 5, "tid": 7, "ts": 1716454223472259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413093, "dur": 13, "args": { "External id": 132495, "cbid": 211, "correlation": 132495 } }, { "ph": "s", "id": 132495, "pid": 76337, "tid": -914061504, "ts": 1716454223413093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223472427, "dur": 19, "args": { "External id": 132503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132503, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132503, "pid": 5, "tid": 7, "ts": 1716454223472427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413163, "dur": 13, "args": { "External id": 132503, "cbid": 211, "correlation": 132503 } }, { "ph": "s", "id": 132503, "pid": 76337, "tid": -914061504, "ts": 1716454223413163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223472447, "dur": 28, "args": { "External id": 132511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132511, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132511, "pid": 5, "tid": 7, "ts": 1716454223472447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413203, "dur": 8, "args": { "External id": 132511, "cbid": 211, "correlation": 132511 } }, { "ph": "s", "id": 132511, "pid": 76337, "tid": -914061504, "ts": 1716454223413203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223472476, "dur": 18, "args": { "External id": 132522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132522, "pid": 5, "tid": 7, "ts": 1716454223472476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413277, "dur": 13, "args": { "External id": 132522, "cbid": 211, "correlation": 132522 } }, { "ph": "s", "id": 132522, "pid": 76337, "tid": -914061504, "ts": 1716454223413277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223472496, "dur": 16, "args": { "External id": 132544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132544, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132544, "pid": 5, "tid": 7, "ts": 1716454223472496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413309, "dur": 8, "args": { "External id": 132544, "cbid": 211, "correlation": 132544 } }, { "ph": "s", "id": 132544, "pid": 76337, "tid": -914061504, "ts": 1716454223413309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413395, "dur": 2, "args": { "External id": 132555, "cbid": 251, "correlation": 132555 } }, { "ph": "f", "id": 132555, "pid": 76337, "tid": -914061504, "ts": 1716454223413395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223472513, "dur": 89, "args": { "External id": 132556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132556, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132556, "pid": 5, "tid": 7, "ts": 1716454223472513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413400, "dur": 13, "args": { "External id": 132556, "cbid": 211, "correlation": 132556 } }, { "ph": "s", "id": 132556, "pid": 76337, "tid": -914061504, "ts": 1716454223413400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413471, "dur": 1, "args": { "External id": 132567, "cbid": 251, "correlation": 132567 } }, { "ph": "f", "id": 132567, "pid": 76337, "tid": -914061504, "ts": 1716454223413471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413475, "dur": 0, "args": { "External id": 132568, "cbid": 251, "correlation": 132568 } }, { "ph": "f", "id": 132568, "pid": 76337, "tid": -914061504, "ts": 1716454223413475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223472604, "dur": 12, "args": { "External id": 132569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132569, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132569, "pid": 5, "tid": 7, "ts": 1716454223472604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413477, "dur": 12, "args": { "External id": 132569, "cbid": 211, "correlation": 132569 } }, { "ph": "s", "id": 132569, "pid": 76337, "tid": -914061504, "ts": 1716454223413477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223472617, "dur": 6, "args": { "External id": 132571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132571, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132571, "pid": 5, "tid": 7, "ts": 1716454223472617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413491, "dur": 6, "args": { "External id": 132571, "cbid": 211, "correlation": 132571 } }, { "ph": "s", "id": 132571, "pid": 76337, "tid": -914061504, "ts": 1716454223413491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413548, "dur": 1, "args": { "External id": 132582, "cbid": 251, "correlation": 132582 } }, { "ph": "f", "id": 132582, "pid": 76337, "tid": -914061504, "ts": 1716454223413548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413552, "dur": 0, "args": { "External id": 132583, "cbid": 251, "correlation": 132583 } }, { "ph": "f", "id": 132583, "pid": 76337, "tid": -914061504, "ts": 1716454223413552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223472624, "dur": 9, "args": { "External id": 132584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132584, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132584, "pid": 5, "tid": 7, "ts": 1716454223472624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413553, "dur": 12, "args": { "External id": 132584, "cbid": 211, "correlation": 132584 } }, { "ph": "s", "id": 132584, "pid": 76337, "tid": -914061504, "ts": 1716454223413553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223472634, "dur": 3, "args": { "External id": 132586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132586, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132586, "pid": 5, "tid": 7, "ts": 1716454223472634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413567, "dur": 5, "args": { "External id": 132586, "cbid": 211, "correlation": 132586 } }, { "ph": "s", "id": 132586, "pid": 76337, "tid": -914061504, "ts": 1716454223413567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223472638, "dur": 55, "args": { "External id": 132611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132611, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132611, "pid": 5, "tid": 7, "ts": 1716454223472638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413643, "dur": 12, "args": { "External id": 132611, "cbid": 211, "correlation": 132611 } }, { "ph": "s", "id": 132611, "pid": 76337, "tid": -914061504, "ts": 1716454223413643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223413742, "dur": 2, "args": { "External id": 132629, "cbid": 251, "correlation": 132629 } }, { "ph": "f", "id": 132629, "pid": 76337, "tid": -914061504, "ts": 1716454223413742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223472694, "dur": 90, "args": { "External id": 132631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132631, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132631, "pid": 5, "tid": 7, "ts": 1716454223472694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413748, "dur": 14, "args": { "External id": 132631, "cbid": 211, "correlation": 132631 } }, { "ph": "s", "id": 132631, "pid": 76337, "tid": -914061504, "ts": 1716454223413748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223472785, "dur": 9, "args": { "External id": 132639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132639, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132639, "pid": 5, "tid": 7, "ts": 1716454223472785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413818, "dur": 12, "args": { "External id": 132639, "cbid": 211, "correlation": 132639 } }, { "ph": "s", "id": 132639, "pid": 76337, "tid": -914061504, "ts": 1716454223413818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223472796, "dur": 20, "args": { "External id": 132647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132647, "pid": 5, "tid": 7, "ts": 1716454223472796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413858, "dur": 10, "args": { "External id": 132647, "cbid": 211, "correlation": 132647 } }, { "ph": "s", "id": 132647, "pid": 76337, "tid": -914061504, "ts": 1716454223413858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223472817, "dur": 17, "args": { "External id": 132669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132669, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132669, "pid": 5, "tid": 7, "ts": 1716454223472817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223413909, "dur": 11, "args": { "External id": 132669, "cbid": 211, "correlation": 132669 } }, { "ph": "s", "id": 132669, "pid": 76337, "tid": -914061504, "ts": 1716454223413909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223414006, "dur": 1, "args": { "External id": 132685, "cbid": 251, "correlation": 132685 } }, { "ph": "f", "id": 132685, "pid": 76337, "tid": -914061504, "ts": 1716454223414006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223414011, "dur": 0, "args": { "External id": 132687, "cbid": 251, "correlation": 132687 } }, { "ph": "f", "id": 132687, "pid": 76337, "tid": -914061504, "ts": 1716454223414011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223472835, "dur": 495, "args": { "External id": 132688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132688, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132688, "pid": 5, "tid": 7, "ts": 1716454223472835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414013, "dur": 13, "args": { "External id": 132688, "cbid": 211, "correlation": 132688 } }, { "ph": "s", "id": 132688, "pid": 76337, "tid": -914061504, "ts": 1716454223414013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223473331, "dur": 66, "args": { "External id": 132696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132696, "pid": 5, "tid": 7, "ts": 1716454223473331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414081, "dur": 13, "args": { "External id": 132696, "cbid": 211, "correlation": 132696 } }, { "ph": "s", "id": 132696, "pid": 76337, "tid": -914061504, "ts": 1716454223414081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223473399, "dur": 66, "args": { "External id": 132704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132704, "pid": 5, "tid": 7, "ts": 1716454223473399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414113, "dur": 8, "args": { "External id": 132704, "cbid": 211, "correlation": 132704 } }, { "ph": "s", "id": 132704, "pid": 76337, "tid": -914061504, "ts": 1716454223414113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223414193, "dur": 1, "args": { "External id": 132720, "cbid": 251, "correlation": 132720 } }, { "ph": "f", "id": 132720, "pid": 76337, "tid": -914061504, "ts": 1716454223414193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223473468, "dur": 1, "args": { "External id": 132722, "device": 5, "context": 1, "stream": 7, "correlation": 132722, "bytes": 240, "memory bandwidth (GB/s)": 0.1595744680851064 } }, { "ph": "f", "id": 132722, "pid": 5, "tid": 7, "ts": 1716454223473468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223414198, "dur": 11, "args": { "External id": 132722, "cbid": 51, "correlation": 132722 } }, { "ph": "s", "id": 132722, "pid": 76337, "tid": -914061504, "ts": 1716454223414198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223473471, "dur": 272, "args": { "External id": 132723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132723, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132723, "pid": 5, "tid": 7, "ts": 1716454223473471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414211, "dur": 11, "args": { "External id": 132723, "cbid": 211, "correlation": 132723 } }, { "ph": "s", "id": 132723, "pid": 76337, "tid": -914061504, "ts": 1716454223414211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223473745, "dur": 14, "args": { "External id": 132731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132731, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132731, "pid": 5, "tid": 7, "ts": 1716454223473745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414251, "dur": 11, "args": { "External id": 132731, "cbid": 211, "correlation": 132731 } }, { "ph": "s", "id": 132731, "pid": 76337, "tid": -914061504, "ts": 1716454223414251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223473760, "dur": 38, "args": { "External id": 132742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132742, "pid": 5, "tid": 7, "ts": 1716454223473760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414321, "dur": 12, "args": { "External id": 132742, "cbid": 211, "correlation": 132742 } }, { "ph": "s", "id": 132742, "pid": 76337, "tid": -914061504, "ts": 1716454223414321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223414385, "dur": 0, "args": { "External id": 132754, "cbid": 317, "correlation": 132754 } }, { "ph": "f", "id": 132754, "pid": 76337, "tid": -914061504, "ts": 1716454223414385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223414386, "dur": 0, "args": { "External id": 132755, "cbid": 203, "correlation": 132755 } }, { "ph": "f", "id": 132755, "pid": 76337, "tid": -914061504, "ts": 1716454223414386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223414387, "dur": 0, "args": { "External id": 132756, "cbid": 205, "correlation": 132756 } }, { "ph": "f", "id": 132756, "pid": 76337, "tid": -914061504, "ts": 1716454223414387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223473799, "dur": 13, "args": { "External id": 132760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132760, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132760, "pid": 5, "tid": 7, "ts": 1716454223473799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414403, "dur": 13, "args": { "External id": 132760, "cbid": 211, "correlation": 132760 } }, { "ph": "s", "id": 132760, "pid": 76337, "tid": -914061504, "ts": 1716454223414403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223473814, "dur": 4, "args": { "External id": 132762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132762, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132762, "pid": 5, "tid": 7, "ts": 1716454223473814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414420, "dur": 6, "args": { "External id": 132762, "cbid": 211, "correlation": 132762 } }, { "ph": "s", "id": 132762, "pid": 76337, "tid": -914061504, "ts": 1716454223414420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223414429, "dur": 0, "args": { "External id": 132763, "cbid": 51, "correlation": 132763 } }, { "ph": "s", "id": 132763, "pid": 76337, "tid": -914061504, "ts": 1716454223414429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223473819, "dur": 97, "args": { "External id": 132764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132764, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 132764, "pid": 5, "tid": 7, "ts": 1716454223473819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414429, "dur": 5, "args": { "External id": 132764, "cbid": 211, "correlation": 132764 } }, { "ph": "s", "id": 132764, "pid": 76337, "tid": -914061504, "ts": 1716454223414429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223473916, "dur": 16, "args": { "External id": 132769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132769, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132769, "pid": 5, "tid": 7, "ts": 1716454223473916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414457, "dur": 9, "args": { "External id": 132769, "cbid": 211, "correlation": 132769 } }, { "ph": "s", "id": 132769, "pid": 76337, "tid": -914061504, "ts": 1716454223414457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223473934, "dur": 12, "args": { "External id": 132777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132777, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132777, "pid": 5, "tid": 7, "ts": 1716454223473934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414489, "dur": 8, "args": { "External id": 132777, "cbid": 211, "correlation": 132777 } }, { "ph": "s", "id": 132777, "pid": 76337, "tid": -914061504, "ts": 1716454223414489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223473947, "dur": 30, "args": { "External id": 132786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132786, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132786, "pid": 5, "tid": 7, "ts": 1716454223473947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414528, "dur": 10, "args": { "External id": 132786, "cbid": 211, "correlation": 132786 } }, { "ph": "s", "id": 132786, "pid": 76337, "tid": -914061504, "ts": 1716454223414528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223473978, "dur": 30, "args": { "External id": 132806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132806, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 132806, "pid": 5, "tid": 7, "ts": 1716454223473978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414598, "dur": 11, "args": { "External id": 132806, "cbid": 211, "correlation": 132806 } }, { "ph": "s", "id": 132806, "pid": 76337, "tid": -914061504, "ts": 1716454223414598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223474010, "dur": 5, "args": { "External id": 132818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132818, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132818, "pid": 5, "tid": 7, "ts": 1716454223474010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414620, "dur": 6, "args": { "External id": 132818, "cbid": 211, "correlation": 132818 } }, { "ph": "s", "id": 132818, "pid": 76337, "tid": -914061504, "ts": 1716454223414620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223474016, "dur": 31, "args": { "External id": 132821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132821, "pid": 5, "tid": 7, "ts": 1716454223474016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414638, "dur": 6, "args": { "External id": 132821, "cbid": 211, "correlation": 132821 } }, { "ph": "s", "id": 132821, "pid": 76337, "tid": -914061504, "ts": 1716454223414638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223474048, "dur": 21, "args": { "External id": 132830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132830, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132830, "pid": 5, "tid": 7, "ts": 1716454223474048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414677, "dur": 9, "args": { "External id": 132830, "cbid": 211, "correlation": 132830 } }, { "ph": "s", "id": 132830, "pid": 76337, "tid": -914061504, "ts": 1716454223414677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223414729, "dur": 0, "args": { "External id": 132840, "cbid": 317, "correlation": 132840 } }, { "ph": "f", "id": 132840, "pid": 76337, "tid": -914061504, "ts": 1716454223414729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223414730, "dur": 0, "args": { "External id": 132841, "cbid": 203, "correlation": 132841 } }, { "ph": "f", "id": 132841, "pid": 76337, "tid": -914061504, "ts": 1716454223414730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223414730, "dur": 0, "args": { "External id": 132842, "cbid": 205, "correlation": 132842 } }, { "ph": "f", "id": 132842, "pid": 76337, "tid": -914061504, "ts": 1716454223414730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223474070, "dur": 22, "args": { "External id": 132846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132846, "pid": 5, "tid": 7, "ts": 1716454223474070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414743, "dur": 11, "args": { "External id": 132846, "cbid": 211, "correlation": 132846 } }, { "ph": "s", "id": 132846, "pid": 76337, "tid": -914061504, "ts": 1716454223414743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223474094, "dur": 318, "args": { "External id": 132848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132848, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132848, "pid": 5, "tid": 7, "ts": 1716454223474094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414757, "dur": 5, "args": { "External id": 132848, "cbid": 211, "correlation": 132848 } }, { "ph": "s", "id": 132848, "pid": 76337, "tid": -914061504, "ts": 1716454223414757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223474414, "dur": 1, "args": { "External id": 132850, "device": 5, "context": 1, "stream": 7, "correlation": 132850, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 132850, "pid": 5, "tid": 7, "ts": 1716454223474414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223414769, "dur": 6, "args": { "External id": 132850, "cbid": 51, "correlation": 132850 } }, { "ph": "s", "id": 132850, "pid": 76337, "tid": -914061504, "ts": 1716454223414769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223474418, "dur": 1250, "args": { "External id": 132851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132851, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132851, "pid": 5, "tid": 7, "ts": 1716454223474418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414776, "dur": 6, "args": { "External id": 132851, "cbid": 211, "correlation": 132851 } }, { "ph": "s", "id": 132851, "pid": 76337, "tid": -914061504, "ts": 1716454223414776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223475669, "dur": 13, "args": { "External id": 132853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132853, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132853, "pid": 5, "tid": 7, "ts": 1716454223475669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414786, "dur": 5, "args": { "External id": 132853, "cbid": 211, "correlation": 132853 } }, { "ph": "s", "id": 132853, "pid": 76337, "tid": -914061504, "ts": 1716454223414786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223475683, "dur": 15, "args": { "External id": 132859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132859, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132859, "pid": 5, "tid": 7, "ts": 1716454223475683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414815, "dur": 8, "args": { "External id": 132859, "cbid": 211, "correlation": 132859 } }, { "ph": "s", "id": 132859, "pid": 76337, "tid": -914061504, "ts": 1716454223414815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223475699, "dur": 4, "args": { "External id": 132867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132867, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 132867, "pid": 5, "tid": 7, "ts": 1716454223475699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414858, "dur": 10, "args": { "External id": 132867, "cbid": 211, "correlation": 132867 } }, { "ph": "s", "id": 132867, "pid": 76337, "tid": -914061504, "ts": 1716454223414858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223414924, "dur": 1, "args": { "External id": 132883, "cbid": 251, "correlation": 132883 } }, { "ph": "f", "id": 132883, "pid": 76337, "tid": -914061504, "ts": 1716454223414924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223414929, "dur": 0, "args": { "External id": 132885, "cbid": 251, "correlation": 132885 } }, { "ph": "f", "id": 132885, "pid": 76337, "tid": -914061504, "ts": 1716454223414929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223475704, "dur": 13, "args": { "External id": 132886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132886, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132886, "pid": 5, "tid": 7, "ts": 1716454223475704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414931, "dur": 11, "args": { "External id": 132886, "cbid": 211, "correlation": 132886 } }, { "ph": "s", "id": 132886, "pid": 76337, "tid": -914061504, "ts": 1716454223414931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223475719, "dur": 5, "args": { "External id": 132888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132888, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132888, "pid": 5, "tid": 7, "ts": 1716454223475719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223414944, "dur": 5, "args": { "External id": 132888, "cbid": 211, "correlation": 132888 } }, { "ph": "s", "id": 132888, "pid": 76337, "tid": -914061504, "ts": 1716454223414944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223475725, "dur": 18, "args": { "External id": 132898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132898, "pid": 5, "tid": 7, "ts": 1716454223475725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415010, "dur": 13, "args": { "External id": 132898, "cbid": 211, "correlation": 132898 } }, { "ph": "s", "id": 132898, "pid": 76337, "tid": -914061504, "ts": 1716454223415010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223475744, "dur": 17, "args": { "External id": 132918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132918, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 132918, "pid": 5, "tid": 7, "ts": 1716454223475744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415078, "dur": 11, "args": { "External id": 132918, "cbid": 211, "correlation": 132918 } }, { "ph": "s", "id": 132918, "pid": 76337, "tid": -914061504, "ts": 1716454223415078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223475762, "dur": 4, "args": { "External id": 132930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132930, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 132930, "pid": 5, "tid": 7, "ts": 1716454223475762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415099, "dur": 6, "args": { "External id": 132930, "cbid": 211, "correlation": 132930 } }, { "ph": "s", "id": 132930, "pid": 76337, "tid": -914061504, "ts": 1716454223415099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223475768, "dur": 16, "args": { "External id": 132933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132933, "pid": 5, "tid": 7, "ts": 1716454223475768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415117, "dur": 7, "args": { "External id": 132933, "cbid": 211, "correlation": 132933 } }, { "ph": "s", "id": 132933, "pid": 76337, "tid": -914061504, "ts": 1716454223415117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223475785, "dur": 11, "args": { "External id": 132942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132942, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132942, "pid": 5, "tid": 7, "ts": 1716454223475785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415157, "dur": 11, "args": { "External id": 132942, "cbid": 211, "correlation": 132942 } }, { "ph": "s", "id": 132942, "pid": 76337, "tid": -914061504, "ts": 1716454223415157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223415220, "dur": 0, "args": { "External id": 132952, "cbid": 317, "correlation": 132952 } }, { "ph": "f", "id": 132952, "pid": 76337, "tid": -914061504, "ts": 1716454223415220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223415221, "dur": 0, "args": { "External id": 132953, "cbid": 203, "correlation": 132953 } }, { "ph": "f", "id": 132953, "pid": 76337, "tid": -914061504, "ts": 1716454223415221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223415222, "dur": 0, "args": { "External id": 132954, "cbid": 205, "correlation": 132954 } }, { "ph": "f", "id": 132954, "pid": 76337, "tid": -914061504, "ts": 1716454223415222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223475798, "dur": 12, "args": { "External id": 132958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132958, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132958, "pid": 5, "tid": 7, "ts": 1716454223475798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415236, "dur": 12, "args": { "External id": 132958, "cbid": 211, "correlation": 132958 } }, { "ph": "s", "id": 132958, "pid": 76337, "tid": -914061504, "ts": 1716454223415236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223475811, "dur": 162, "args": { "External id": 132960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132960, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132960, "pid": 5, "tid": 7, "ts": 1716454223475811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415250, "dur": 5, "args": { "External id": 132960, "cbid": 211, "correlation": 132960 } }, { "ph": "s", "id": 132960, "pid": 76337, "tid": -914061504, "ts": 1716454223415250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223475976, "dur": 1, "args": { "External id": 132962, "device": 5, "context": 1, "stream": 7, "correlation": 132962, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 132962, "pid": 5, "tid": 7, "ts": 1716454223475976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223415261, "dur": 7, "args": { "External id": 132962, "cbid": 51, "correlation": 132962 } }, { "ph": "s", "id": 132962, "pid": 76337, "tid": -914061504, "ts": 1716454223415261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223475979, "dur": 646, "args": { "External id": 132963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132963, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 132963, "pid": 5, "tid": 7, "ts": 1716454223475979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415269, "dur": 6, "args": { "External id": 132963, "cbid": 211, "correlation": 132963 } }, { "ph": "s", "id": 132963, "pid": 76337, "tid": -914061504, "ts": 1716454223415269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223476627, "dur": 13, "args": { "External id": 132965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132965, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132965, "pid": 5, "tid": 7, "ts": 1716454223476627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415279, "dur": 5, "args": { "External id": 132965, "cbid": 211, "correlation": 132965 } }, { "ph": "s", "id": 132965, "pid": 76337, "tid": -914061504, "ts": 1716454223415279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223476641, "dur": 15, "args": { "External id": 132971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132971, "pid": 5, "tid": 7, "ts": 1716454223476641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415306, "dur": 9, "args": { "External id": 132971, "cbid": 211, "correlation": 132971 } }, { "ph": "s", "id": 132971, "pid": 76337, "tid": -914061504, "ts": 1716454223415306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223415366, "dur": 0, "args": { "External id": 132981, "cbid": 317, "correlation": 132981 } }, { "ph": "f", "id": 132981, "pid": 76337, "tid": -914061504, "ts": 1716454223415366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223415366, "dur": 0, "args": { "External id": 132982, "cbid": 203, "correlation": 132982 } }, { "ph": "f", "id": 132982, "pid": 76337, "tid": -914061504, "ts": 1716454223415366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223415367, "dur": 0, "args": { "External id": 132983, "cbid": 205, "correlation": 132983 } }, { "ph": "f", "id": 132983, "pid": 76337, "tid": -914061504, "ts": 1716454223415367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223476657, "dur": 21, "args": { "External id": 132987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132987, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132987, "pid": 5, "tid": 7, "ts": 1716454223476657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415378, "dur": 11, "args": { "External id": 132987, "cbid": 211, "correlation": 132987 } }, { "ph": "s", "id": 132987, "pid": 76337, "tid": -914061504, "ts": 1716454223415378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223476679, "dur": 4, "args": { "External id": 132989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 132989, "pid": 5, "tid": 7, "ts": 1716454223476679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415393, "dur": 6, "args": { "External id": 132989, "cbid": 211, "correlation": 132989 } }, { "ph": "s", "id": 132989, "pid": 76337, "tid": -914061504, "ts": 1716454223415393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223415402, "dur": 0, "args": { "External id": 132990, "cbid": 51, "correlation": 132990 } }, { "ph": "s", "id": 132990, "pid": 76337, "tid": -914061504, "ts": 1716454223415402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223476684, "dur": 170, "args": { "External id": 132991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132991, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 132991, "pid": 5, "tid": 7, "ts": 1716454223476684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415402, "dur": 5, "args": { "External id": 132991, "cbid": 211, "correlation": 132991 } }, { "ph": "s", "id": 132991, "pid": 76337, "tid": -914061504, "ts": 1716454223415402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223476855, "dur": 15, "args": { "External id": 132996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 132996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 132996, "pid": 5, "tid": 7, "ts": 1716454223476855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415428, "dur": 8, "args": { "External id": 132996, "cbid": 211, "correlation": 132996 } }, { "ph": "s", "id": 132996, "pid": 76337, "tid": -914061504, "ts": 1716454223415428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223476871, "dur": 13, "args": { "External id": 133004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133004, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133004, "pid": 5, "tid": 7, "ts": 1716454223476871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415457, "dur": 8, "args": { "External id": 133004, "cbid": 211, "correlation": 133004 } }, { "ph": "s", "id": 133004, "pid": 76337, "tid": -914061504, "ts": 1716454223415457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223476886, "dur": 10, "args": { "External id": 133012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133012, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133012, "pid": 5, "tid": 7, "ts": 1716454223476886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415486, "dur": 8, "args": { "External id": 133012, "cbid": 211, "correlation": 133012 } }, { "ph": "s", "id": 133012, "pid": 76337, "tid": -914061504, "ts": 1716454223415486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223476898, "dur": 18, "args": { "External id": 133032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133032, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 133032, "pid": 5, "tid": 7, "ts": 1716454223476898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415568, "dur": 12, "args": { "External id": 133032, "cbid": 211, "correlation": 133032 } }, { "ph": "s", "id": 133032, "pid": 76337, "tid": -914061504, "ts": 1716454223415568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223476917, "dur": 4, "args": { "External id": 133044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133044, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 133044, "pid": 5, "tid": 7, "ts": 1716454223476917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415589, "dur": 6, "args": { "External id": 133044, "cbid": 211, "correlation": 133044 } }, { "ph": "s", "id": 133044, "pid": 76337, "tid": -914061504, "ts": 1716454223415589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223476923, "dur": 17, "args": { "External id": 133047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133047, "pid": 5, "tid": 7, "ts": 1716454223476923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415607, "dur": 7, "args": { "External id": 133047, "cbid": 211, "correlation": 133047 } }, { "ph": "s", "id": 133047, "pid": 76337, "tid": -914061504, "ts": 1716454223415607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223415664, "dur": 0, "args": { "External id": 133058, "cbid": 317, "correlation": 133058 } }, { "ph": "f", "id": 133058, "pid": 76337, "tid": -914061504, "ts": 1716454223415664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223415665, "dur": 0, "args": { "External id": 133059, "cbid": 203, "correlation": 133059 } }, { "ph": "f", "id": 133059, "pid": 76337, "tid": -914061504, "ts": 1716454223415665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223415666, "dur": 0, "args": { "External id": 133060, "cbid": 205, "correlation": 133060 } }, { "ph": "f", "id": 133060, "pid": 76337, "tid": -914061504, "ts": 1716454223415666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223476941, "dur": 11, "args": { "External id": 133064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133064, "pid": 5, "tid": 7, "ts": 1716454223476941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415679, "dur": 12, "args": { "External id": 133064, "cbid": 211, "correlation": 133064 } }, { "ph": "s", "id": 133064, "pid": 76337, "tid": -914061504, "ts": 1716454223415679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223476953, "dur": 3, "args": { "External id": 133066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133066, "pid": 5, "tid": 7, "ts": 1716454223476953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415695, "dur": 5, "args": { "External id": 133066, "cbid": 211, "correlation": 133066 } }, { "ph": "s", "id": 133066, "pid": 76337, "tid": -914061504, "ts": 1716454223415695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223415703, "dur": 0, "args": { "External id": 133067, "cbid": 51, "correlation": 133067 } }, { "ph": "s", "id": 133067, "pid": 76337, "tid": -914061504, "ts": 1716454223415703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223476958, "dur": 90, "args": { "External id": 133068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133068, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 133068, "pid": 5, "tid": 7, "ts": 1716454223476958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415703, "dur": 5, "args": { "External id": 133068, "cbid": 211, "correlation": 133068 } }, { "ph": "s", "id": 133068, "pid": 76337, "tid": -914061504, "ts": 1716454223415703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223477049, "dur": 16, "args": { "External id": 133073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133073, "pid": 5, "tid": 7, "ts": 1716454223477049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415730, "dur": 8, "args": { "External id": 133073, "cbid": 211, "correlation": 133073 } }, { "ph": "s", "id": 133073, "pid": 76337, "tid": -914061504, "ts": 1716454223415730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223477066, "dur": 84, "args": { "External id": 133082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133082, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133082, "pid": 5, "tid": 7, "ts": 1716454223477066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415811, "dur": 14, "args": { "External id": 133082, "cbid": 211, "correlation": 133082 } }, { "ph": "s", "id": 133082, "pid": 76337, "tid": -914061504, "ts": 1716454223415811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223477151, "dur": 30, "args": { "External id": 133104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133104, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133104, "pid": 5, "tid": 7, "ts": 1716454223477151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415866, "dur": 11, "args": { "External id": 133104, "cbid": 211, "correlation": 133104 } }, { "ph": "s", "id": 133104, "pid": 76337, "tid": -914061504, "ts": 1716454223415866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223415954, "dur": 1, "args": { "External id": 133115, "cbid": 251, "correlation": 133115 } }, { "ph": "f", "id": 133115, "pid": 76337, "tid": -914061504, "ts": 1716454223415954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223477182, "dur": 163, "args": { "External id": 133116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133116, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133116, "pid": 5, "tid": 7, "ts": 1716454223477182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223415960, "dur": 22, "args": { "External id": 133116, "cbid": 211, "correlation": 133116 } }, { "ph": "s", "id": 133116, "pid": 76337, "tid": -914061504, "ts": 1716454223415960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416041, "dur": 1, "args": { "External id": 133127, "cbid": 251, "correlation": 133127 } }, { "ph": "f", "id": 133127, "pid": 76337, "tid": -914061504, "ts": 1716454223416041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223477347, "dur": 160, "args": { "External id": 133128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133128, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133128, "pid": 5, "tid": 7, "ts": 1716454223477347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416045, "dur": 12, "args": { "External id": 133128, "cbid": 211, "correlation": 133128 } }, { "ph": "s", "id": 133128, "pid": 76337, "tid": -914061504, "ts": 1716454223416045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416111, "dur": 1, "args": { "External id": 133139, "cbid": 251, "correlation": 133139 } }, { "ph": "f", "id": 133139, "pid": 76337, "tid": -914061504, "ts": 1716454223416111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223477508, "dur": 157, "args": { "External id": 133140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133140, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133140, "pid": 5, "tid": 7, "ts": 1716454223477508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416115, "dur": 12, "args": { "External id": 133140, "cbid": 211, "correlation": 133140 } }, { "ph": "s", "id": 133140, "pid": 76337, "tid": -914061504, "ts": 1716454223416115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223477667, "dur": 333, "args": { "External id": 133165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133165, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133165, "pid": 5, "tid": 7, "ts": 1716454223477667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416198, "dur": 12, "args": { "External id": 133165, "cbid": 211, "correlation": 133165 } }, { "ph": "s", "id": 133165, "pid": 76337, "tid": -914061504, "ts": 1716454223416198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416297, "dur": 1, "args": { "External id": 133183, "cbid": 251, "correlation": 133183 } }, { "ph": "f", "id": 133183, "pid": 76337, "tid": -914061504, "ts": 1716454223416297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223478001, "dur": 168, "args": { "External id": 133185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133185, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133185, "pid": 5, "tid": 7, "ts": 1716454223478001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416302, "dur": 13, "args": { "External id": 133185, "cbid": 211, "correlation": 133185 } }, { "ph": "s", "id": 133185, "pid": 76337, "tid": -914061504, "ts": 1716454223416302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223478171, "dur": 19, "args": { "External id": 133193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133193, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133193, "pid": 5, "tid": 7, "ts": 1716454223478171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416373, "dur": 12, "args": { "External id": 133193, "cbid": 211, "correlation": 133193 } }, { "ph": "s", "id": 133193, "pid": 76337, "tid": -914061504, "ts": 1716454223416373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223478191, "dur": 29, "args": { "External id": 133201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133201, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133201, "pid": 5, "tid": 7, "ts": 1716454223478191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416412, "dur": 9, "args": { "External id": 133201, "cbid": 211, "correlation": 133201 } }, { "ph": "s", "id": 133201, "pid": 76337, "tid": -914061504, "ts": 1716454223416412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223478221, "dur": 19, "args": { "External id": 133212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133212, "pid": 5, "tid": 7, "ts": 1716454223478221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416484, "dur": 12, "args": { "External id": 133212, "cbid": 211, "correlation": 133212 } }, { "ph": "s", "id": 133212, "pid": 76337, "tid": -914061504, "ts": 1716454223416484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223478241, "dur": 16, "args": { "External id": 133234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133234, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133234, "pid": 5, "tid": 7, "ts": 1716454223478241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416514, "dur": 8, "args": { "External id": 133234, "cbid": 211, "correlation": 133234 } }, { "ph": "s", "id": 133234, "pid": 76337, "tid": -914061504, "ts": 1716454223416514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416599, "dur": 1, "args": { "External id": 133245, "cbid": 251, "correlation": 133245 } }, { "ph": "f", "id": 133245, "pid": 76337, "tid": -914061504, "ts": 1716454223416599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223478258, "dur": 89, "args": { "External id": 133246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133246, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133246, "pid": 5, "tid": 7, "ts": 1716454223478258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416604, "dur": 13, "args": { "External id": 133246, "cbid": 211, "correlation": 133246 } }, { "ph": "s", "id": 133246, "pid": 76337, "tid": -914061504, "ts": 1716454223416604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416674, "dur": 1, "args": { "External id": 133257, "cbid": 251, "correlation": 133257 } }, { "ph": "f", "id": 133257, "pid": 76337, "tid": -914061504, "ts": 1716454223416674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416678, "dur": 0, "args": { "External id": 133258, "cbid": 251, "correlation": 133258 } }, { "ph": "f", "id": 133258, "pid": 76337, "tid": -914061504, "ts": 1716454223416678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223478349, "dur": 13, "args": { "External id": 133259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133259, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133259, "pid": 5, "tid": 7, "ts": 1716454223478349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416680, "dur": 11, "args": { "External id": 133259, "cbid": 211, "correlation": 133259 } }, { "ph": "s", "id": 133259, "pid": 76337, "tid": -914061504, "ts": 1716454223416680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223478363, "dur": 6, "args": { "External id": 133261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133261, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133261, "pid": 5, "tid": 7, "ts": 1716454223478363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416693, "dur": 6, "args": { "External id": 133261, "cbid": 211, "correlation": 133261 } }, { "ph": "s", "id": 133261, "pid": 76337, "tid": -914061504, "ts": 1716454223416693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416750, "dur": 1, "args": { "External id": 133272, "cbid": 251, "correlation": 133272 } }, { "ph": "f", "id": 133272, "pid": 76337, "tid": -914061504, "ts": 1716454223416750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416753, "dur": 0, "args": { "External id": 133273, "cbid": 251, "correlation": 133273 } }, { "ph": "f", "id": 133273, "pid": 76337, "tid": -914061504, "ts": 1716454223416753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223478369, "dur": 8, "args": { "External id": 133274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133274, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133274, "pid": 5, "tid": 7, "ts": 1716454223478369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416755, "dur": 12, "args": { "External id": 133274, "cbid": 211, "correlation": 133274 } }, { "ph": "s", "id": 133274, "pid": 76337, "tid": -914061504, "ts": 1716454223416755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223478379, "dur": 3, "args": { "External id": 133276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133276, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133276, "pid": 5, "tid": 7, "ts": 1716454223478379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416768, "dur": 6, "args": { "External id": 133276, "cbid": 211, "correlation": 133276 } }, { "ph": "s", "id": 133276, "pid": 76337, "tid": -914061504, "ts": 1716454223416768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223478383, "dur": 54, "args": { "External id": 133301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133301, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133301, "pid": 5, "tid": 7, "ts": 1716454223478383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416844, "dur": 12, "args": { "External id": 133301, "cbid": 211, "correlation": 133301 } }, { "ph": "s", "id": 133301, "pid": 76337, "tid": -914061504, "ts": 1716454223416844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223416944, "dur": 1, "args": { "External id": 133319, "cbid": 251, "correlation": 133319 } }, { "ph": "f", "id": 133319, "pid": 76337, "tid": -914061504, "ts": 1716454223416944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223478438, "dur": 91, "args": { "External id": 133321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133321, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133321, "pid": 5, "tid": 7, "ts": 1716454223478438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223416950, "dur": 14, "args": { "External id": 133321, "cbid": 211, "correlation": 133321 } }, { "ph": "s", "id": 133321, "pid": 76337, "tid": -914061504, "ts": 1716454223416950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223478530, "dur": 9, "args": { "External id": 133329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133329, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133329, "pid": 5, "tid": 7, "ts": 1716454223478530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417029, "dur": 13, "args": { "External id": 133329, "cbid": 211, "correlation": 133329 } }, { "ph": "s", "id": 133329, "pid": 76337, "tid": -914061504, "ts": 1716454223417029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223478541, "dur": 21, "args": { "External id": 133337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133337, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133337, "pid": 5, "tid": 7, "ts": 1716454223478541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417071, "dur": 10, "args": { "External id": 133337, "cbid": 211, "correlation": 133337 } }, { "ph": "s", "id": 133337, "pid": 76337, "tid": -914061504, "ts": 1716454223417071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223478564, "dur": 18, "args": { "External id": 133359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133359, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133359, "pid": 5, "tid": 7, "ts": 1716454223478564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417123, "dur": 10, "args": { "External id": 133359, "cbid": 211, "correlation": 133359 } }, { "ph": "s", "id": 133359, "pid": 76337, "tid": -914061504, "ts": 1716454223417123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223417208, "dur": 1, "args": { "External id": 133375, "cbid": 251, "correlation": 133375 } }, { "ph": "f", "id": 133375, "pid": 76337, "tid": -914061504, "ts": 1716454223417208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223417214, "dur": 0, "args": { "External id": 133377, "cbid": 251, "correlation": 133377 } }, { "ph": "f", "id": 133377, "pid": 76337, "tid": -914061504, "ts": 1716454223417214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223478583, "dur": 495, "args": { "External id": 133378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133378, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133378, "pid": 5, "tid": 7, "ts": 1716454223478583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417215, "dur": 14, "args": { "External id": 133378, "cbid": 211, "correlation": 133378 } }, { "ph": "s", "id": 133378, "pid": 76337, "tid": -914061504, "ts": 1716454223417215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223479079, "dur": 65, "args": { "External id": 133386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133386, "pid": 5, "tid": 7, "ts": 1716454223479079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417282, "dur": 12, "args": { "External id": 133386, "cbid": 211, "correlation": 133386 } }, { "ph": "s", "id": 133386, "pid": 76337, "tid": -914061504, "ts": 1716454223417282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223479146, "dur": 67, "args": { "External id": 133394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133394, "pid": 5, "tid": 7, "ts": 1716454223479146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417312, "dur": 9, "args": { "External id": 133394, "cbid": 211, "correlation": 133394 } }, { "ph": "s", "id": 133394, "pid": 76337, "tid": -914061504, "ts": 1716454223417312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223417392, "dur": 1, "args": { "External id": 133410, "cbid": 251, "correlation": 133410 } }, { "ph": "f", "id": 133410, "pid": 76337, "tid": -914061504, "ts": 1716454223417392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223479215, "dur": 1, "args": { "External id": 133412, "device": 5, "context": 1, "stream": 7, "correlation": 133412, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 133412, "pid": 5, "tid": 7, "ts": 1716454223479215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223417397, "dur": 9, "args": { "External id": 133412, "cbid": 51, "correlation": 133412 } }, { "ph": "s", "id": 133412, "pid": 76337, "tid": -914061504, "ts": 1716454223417397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223479218, "dur": 268, "args": { "External id": 133413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133413, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133413, "pid": 5, "tid": 7, "ts": 1716454223479218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417408, "dur": 11, "args": { "External id": 133413, "cbid": 211, "correlation": 133413 } }, { "ph": "s", "id": 133413, "pid": 76337, "tid": -914061504, "ts": 1716454223417408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223479488, "dur": 14, "args": { "External id": 133421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133421, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133421, "pid": 5, "tid": 7, "ts": 1716454223479488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417450, "dur": 10, "args": { "External id": 133421, "cbid": 211, "correlation": 133421 } }, { "ph": "s", "id": 133421, "pid": 76337, "tid": -914061504, "ts": 1716454223417450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223479503, "dur": 37, "args": { "External id": 133432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133432, "pid": 5, "tid": 7, "ts": 1716454223479503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417518, "dur": 13, "args": { "External id": 133432, "cbid": 211, "correlation": 133432 } }, { "ph": "s", "id": 133432, "pid": 76337, "tid": -914061504, "ts": 1716454223417518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223417583, "dur": 0, "args": { "External id": 133444, "cbid": 317, "correlation": 133444 } }, { "ph": "f", "id": 133444, "pid": 76337, "tid": -914061504, "ts": 1716454223417583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223417584, "dur": 0, "args": { "External id": 133445, "cbid": 203, "correlation": 133445 } }, { "ph": "f", "id": 133445, "pid": 76337, "tid": -914061504, "ts": 1716454223417584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223417585, "dur": 0, "args": { "External id": 133446, "cbid": 205, "correlation": 133446 } }, { "ph": "f", "id": 133446, "pid": 76337, "tid": -914061504, "ts": 1716454223417585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223479541, "dur": 12, "args": { "External id": 133450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133450, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133450, "pid": 5, "tid": 7, "ts": 1716454223479541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417599, "dur": 12, "args": { "External id": 133450, "cbid": 211, "correlation": 133450 } }, { "ph": "s", "id": 133450, "pid": 76337, "tid": -914061504, "ts": 1716454223417599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223479555, "dur": 4, "args": { "External id": 133452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133452, "pid": 5, "tid": 7, "ts": 1716454223479555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417615, "dur": 6, "args": { "External id": 133452, "cbid": 211, "correlation": 133452 } }, { "ph": "s", "id": 133452, "pid": 76337, "tid": -914061504, "ts": 1716454223417615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223417624, "dur": 0, "args": { "External id": 133453, "cbid": 51, "correlation": 133453 } }, { "ph": "s", "id": 133453, "pid": 76337, "tid": -914061504, "ts": 1716454223417624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223479560, "dur": 95, "args": { "External id": 133454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133454, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 133454, "pid": 5, "tid": 7, "ts": 1716454223479560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417625, "dur": 5, "args": { "External id": 133454, "cbid": 211, "correlation": 133454 } }, { "ph": "s", "id": 133454, "pid": 76337, "tid": -914061504, "ts": 1716454223417625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223479657, "dur": 17, "args": { "External id": 133459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133459, "pid": 5, "tid": 7, "ts": 1716454223479657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417652, "dur": 9, "args": { "External id": 133459, "cbid": 211, "correlation": 133459 } }, { "ph": "s", "id": 133459, "pid": 76337, "tid": -914061504, "ts": 1716454223417652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223479675, "dur": 12, "args": { "External id": 133467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133467, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133467, "pid": 5, "tid": 7, "ts": 1716454223479675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417684, "dur": 8, "args": { "External id": 133467, "cbid": 211, "correlation": 133467 } }, { "ph": "s", "id": 133467, "pid": 76337, "tid": -914061504, "ts": 1716454223417684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223479688, "dur": 25, "args": { "External id": 133476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133476, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133476, "pid": 5, "tid": 7, "ts": 1716454223479688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417723, "dur": 10, "args": { "External id": 133476, "cbid": 211, "correlation": 133476 } }, { "ph": "s", "id": 133476, "pid": 76337, "tid": -914061504, "ts": 1716454223417723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223479715, "dur": 24, "args": { "External id": 133496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133496, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 133496, "pid": 5, "tid": 7, "ts": 1716454223479715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417793, "dur": 11, "args": { "External id": 133496, "cbid": 211, "correlation": 133496 } }, { "ph": "s", "id": 133496, "pid": 76337, "tid": -914061504, "ts": 1716454223417793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223479740, "dur": 5, "args": { "External id": 133508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133508, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 133508, "pid": 5, "tid": 7, "ts": 1716454223479740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417814, "dur": 6, "args": { "External id": 133508, "cbid": 211, "correlation": 133508 } }, { "ph": "s", "id": 133508, "pid": 76337, "tid": -914061504, "ts": 1716454223417814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223479746, "dur": 26, "args": { "External id": 133511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133511, "pid": 5, "tid": 7, "ts": 1716454223479746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417833, "dur": 7, "args": { "External id": 133511, "cbid": 211, "correlation": 133511 } }, { "ph": "s", "id": 133511, "pid": 76337, "tid": -914061504, "ts": 1716454223417833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223479773, "dur": 17, "args": { "External id": 133520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133520, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133520, "pid": 5, "tid": 7, "ts": 1716454223479773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417872, "dur": 10, "args": { "External id": 133520, "cbid": 211, "correlation": 133520 } }, { "ph": "s", "id": 133520, "pid": 76337, "tid": -914061504, "ts": 1716454223417872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223417924, "dur": 0, "args": { "External id": 133530, "cbid": 317, "correlation": 133530 } }, { "ph": "f", "id": 133530, "pid": 76337, "tid": -914061504, "ts": 1716454223417924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223417925, "dur": 0, "args": { "External id": 133531, "cbid": 203, "correlation": 133531 } }, { "ph": "f", "id": 133531, "pid": 76337, "tid": -914061504, "ts": 1716454223417925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223417926, "dur": 0, "args": { "External id": 133532, "cbid": 205, "correlation": 133532 } }, { "ph": "f", "id": 133532, "pid": 76337, "tid": -914061504, "ts": 1716454223417926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223479791, "dur": 17, "args": { "External id": 133536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133536, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133536, "pid": 5, "tid": 7, "ts": 1716454223479791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417940, "dur": 11, "args": { "External id": 133536, "cbid": 211, "correlation": 133536 } }, { "ph": "s", "id": 133536, "pid": 76337, "tid": -914061504, "ts": 1716454223417940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223479809, "dur": 241, "args": { "External id": 133538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133538, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133538, "pid": 5, "tid": 7, "ts": 1716454223479809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417954, "dur": 5, "args": { "External id": 133538, "cbid": 211, "correlation": 133538 } }, { "ph": "s", "id": 133538, "pid": 76337, "tid": -914061504, "ts": 1716454223417954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223480053, "dur": 1, "args": { "External id": 133540, "device": 5, "context": 1, "stream": 7, "correlation": 133540, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 133540, "pid": 5, "tid": 7, "ts": 1716454223480053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223417966, "dur": 16, "args": { "External id": 133540, "cbid": 51, "correlation": 133540 } }, { "ph": "s", "id": 133540, "pid": 76337, "tid": -914061504, "ts": 1716454223417966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223480056, "dur": 808, "args": { "External id": 133541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133541, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133541, "pid": 5, "tid": 7, "ts": 1716454223480056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417983, "dur": 7, "args": { "External id": 133541, "cbid": 211, "correlation": 133541 } }, { "ph": "s", "id": 133541, "pid": 76337, "tid": -914061504, "ts": 1716454223417983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223480866, "dur": 13, "args": { "External id": 133543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133543, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133543, "pid": 5, "tid": 7, "ts": 1716454223480866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223417994, "dur": 5, "args": { "External id": 133543, "cbid": 211, "correlation": 133543 } }, { "ph": "s", "id": 133543, "pid": 76337, "tid": -914061504, "ts": 1716454223417994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223480880, "dur": 15, "args": { "External id": 133549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133549, "pid": 5, "tid": 7, "ts": 1716454223480880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418023, "dur": 8, "args": { "External id": 133549, "cbid": 211, "correlation": 133549 } }, { "ph": "s", "id": 133549, "pid": 76337, "tid": -914061504, "ts": 1716454223418023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223480897, "dur": 3, "args": { "External id": 133557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133557, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 133557, "pid": 5, "tid": 7, "ts": 1716454223480897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418067, "dur": 10, "args": { "External id": 133557, "cbid": 211, "correlation": 133557 } }, { "ph": "s", "id": 133557, "pid": 76337, "tid": -914061504, "ts": 1716454223418067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223418133, "dur": 1, "args": { "External id": 133573, "cbid": 251, "correlation": 133573 } }, { "ph": "f", "id": 133573, "pid": 76337, "tid": -914061504, "ts": 1716454223418133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223418138, "dur": 0, "args": { "External id": 133575, "cbid": 251, "correlation": 133575 } }, { "ph": "f", "id": 133575, "pid": 76337, "tid": -914061504, "ts": 1716454223418138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223480901, "dur": 13, "args": { "External id": 133576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133576, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133576, "pid": 5, "tid": 7, "ts": 1716454223480901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418140, "dur": 11, "args": { "External id": 133576, "cbid": 211, "correlation": 133576 } }, { "ph": "s", "id": 133576, "pid": 76337, "tid": -914061504, "ts": 1716454223418140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223480916, "dur": 5, "args": { "External id": 133578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133578, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133578, "pid": 5, "tid": 7, "ts": 1716454223480916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418153, "dur": 5, "args": { "External id": 133578, "cbid": 211, "correlation": 133578 } }, { "ph": "s", "id": 133578, "pid": 76337, "tid": -914061504, "ts": 1716454223418153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223480922, "dur": 17, "args": { "External id": 133588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133588, "pid": 5, "tid": 7, "ts": 1716454223480922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418211, "dur": 13, "args": { "External id": 133588, "cbid": 211, "correlation": 133588 } }, { "ph": "s", "id": 133588, "pid": 76337, "tid": -914061504, "ts": 1716454223418211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223480940, "dur": 18, "args": { "External id": 133608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133608, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 133608, "pid": 5, "tid": 7, "ts": 1716454223480940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418278, "dur": 10, "args": { "External id": 133608, "cbid": 211, "correlation": 133608 } }, { "ph": "s", "id": 133608, "pid": 76337, "tid": -914061504, "ts": 1716454223418278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223480959, "dur": 4, "args": { "External id": 133620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133620, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 133620, "pid": 5, "tid": 7, "ts": 1716454223480959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418298, "dur": 6, "args": { "External id": 133620, "cbid": 211, "correlation": 133620 } }, { "ph": "s", "id": 133620, "pid": 76337, "tid": -914061504, "ts": 1716454223418298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223480965, "dur": 16, "args": { "External id": 133623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133623, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133623, "pid": 5, "tid": 7, "ts": 1716454223480965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418317, "dur": 7, "args": { "External id": 133623, "cbid": 211, "correlation": 133623 } }, { "ph": "s", "id": 133623, "pid": 76337, "tid": -914061504, "ts": 1716454223418317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223480982, "dur": 11, "args": { "External id": 133632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133632, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133632, "pid": 5, "tid": 7, "ts": 1716454223480982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418357, "dur": 10, "args": { "External id": 133632, "cbid": 211, "correlation": 133632 } }, { "ph": "s", "id": 133632, "pid": 76337, "tid": -914061504, "ts": 1716454223418357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223418419, "dur": 0, "args": { "External id": 133642, "cbid": 317, "correlation": 133642 } }, { "ph": "f", "id": 133642, "pid": 76337, "tid": -914061504, "ts": 1716454223418419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223418420, "dur": 0, "args": { "External id": 133643, "cbid": 203, "correlation": 133643 } }, { "ph": "f", "id": 133643, "pid": 76337, "tid": -914061504, "ts": 1716454223418420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223418421, "dur": 0, "args": { "External id": 133644, "cbid": 205, "correlation": 133644 } }, { "ph": "f", "id": 133644, "pid": 76337, "tid": -914061504, "ts": 1716454223418421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223480994, "dur": 11, "args": { "External id": 133648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133648, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133648, "pid": 5, "tid": 7, "ts": 1716454223480994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418434, "dur": 12, "args": { "External id": 133648, "cbid": 211, "correlation": 133648 } }, { "ph": "s", "id": 133648, "pid": 76337, "tid": -914061504, "ts": 1716454223418434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223481007, "dur": 163, "args": { "External id": 133650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133650, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133650, "pid": 5, "tid": 7, "ts": 1716454223481007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418448, "dur": 5, "args": { "External id": 133650, "cbid": 211, "correlation": 133650 } }, { "ph": "s", "id": 133650, "pid": 76337, "tid": -914061504, "ts": 1716454223418448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223481172, "dur": 1, "args": { "External id": 133652, "device": 5, "context": 1, "stream": 7, "correlation": 133652, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 133652, "pid": 5, "tid": 7, "ts": 1716454223481172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223418459, "dur": 7, "args": { "External id": 133652, "cbid": 51, "correlation": 133652 } }, { "ph": "s", "id": 133652, "pid": 76337, "tid": -914061504, "ts": 1716454223418459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223481176, "dur": 645, "args": { "External id": 133653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133653, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133653, "pid": 5, "tid": 7, "ts": 1716454223481176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418467, "dur": 6, "args": { "External id": 133653, "cbid": 211, "correlation": 133653 } }, { "ph": "s", "id": 133653, "pid": 76337, "tid": -914061504, "ts": 1716454223418467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223481822, "dur": 13, "args": { "External id": 133655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133655, "pid": 5, "tid": 7, "ts": 1716454223481822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418478, "dur": 5, "args": { "External id": 133655, "cbid": 211, "correlation": 133655 } }, { "ph": "s", "id": 133655, "pid": 76337, "tid": -914061504, "ts": 1716454223418478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223481837, "dur": 14, "args": { "External id": 133661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133661, "pid": 5, "tid": 7, "ts": 1716454223481837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418506, "dur": 9, "args": { "External id": 133661, "cbid": 211, "correlation": 133661 } }, { "ph": "s", "id": 133661, "pid": 76337, "tid": -914061504, "ts": 1716454223418506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223418565, "dur": 0, "args": { "External id": 133671, "cbid": 317, "correlation": 133671 } }, { "ph": "f", "id": 133671, "pid": 76337, "tid": -914061504, "ts": 1716454223418565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223418566, "dur": 0, "args": { "External id": 133672, "cbid": 203, "correlation": 133672 } }, { "ph": "f", "id": 133672, "pid": 76337, "tid": -914061504, "ts": 1716454223418566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223418566, "dur": 0, "args": { "External id": 133673, "cbid": 205, "correlation": 133673 } }, { "ph": "f", "id": 133673, "pid": 76337, "tid": -914061504, "ts": 1716454223418566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223481852, "dur": 17, "args": { "External id": 133677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133677, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133677, "pid": 5, "tid": 7, "ts": 1716454223481852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418580, "dur": 12, "args": { "External id": 133677, "cbid": 211, "correlation": 133677 } }, { "ph": "s", "id": 133677, "pid": 76337, "tid": -914061504, "ts": 1716454223418580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223481871, "dur": 4, "args": { "External id": 133679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133679, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133679, "pid": 5, "tid": 7, "ts": 1716454223481871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418596, "dur": 6, "args": { "External id": 133679, "cbid": 211, "correlation": 133679 } }, { "ph": "s", "id": 133679, "pid": 76337, "tid": -914061504, "ts": 1716454223418596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223418605, "dur": 0, "args": { "External id": 133680, "cbid": 51, "correlation": 133680 } }, { "ph": "s", "id": 133680, "pid": 76337, "tid": -914061504, "ts": 1716454223418605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223481876, "dur": 131, "args": { "External id": 133681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133681, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 133681, "pid": 5, "tid": 7, "ts": 1716454223481876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418606, "dur": 5, "args": { "External id": 133681, "cbid": 211, "correlation": 133681 } }, { "ph": "s", "id": 133681, "pid": 76337, "tid": -914061504, "ts": 1716454223418606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223482008, "dur": 15, "args": { "External id": 133686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133686, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133686, "pid": 5, "tid": 7, "ts": 1716454223482008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418632, "dur": 8, "args": { "External id": 133686, "cbid": 211, "correlation": 133686 } }, { "ph": "s", "id": 133686, "pid": 76337, "tid": -914061504, "ts": 1716454223418632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223482024, "dur": 12, "args": { "External id": 133694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133694, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133694, "pid": 5, "tid": 7, "ts": 1716454223482024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418660, "dur": 8, "args": { "External id": 133694, "cbid": 211, "correlation": 133694 } }, { "ph": "s", "id": 133694, "pid": 76337, "tid": -914061504, "ts": 1716454223418660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223482037, "dur": 10, "args": { "External id": 133702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133702, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133702, "pid": 5, "tid": 7, "ts": 1716454223482037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418689, "dur": 8, "args": { "External id": 133702, "cbid": 211, "correlation": 133702 } }, { "ph": "s", "id": 133702, "pid": 76337, "tid": -914061504, "ts": 1716454223418689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223482049, "dur": 18, "args": { "External id": 133722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133722, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 133722, "pid": 5, "tid": 7, "ts": 1716454223482049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418770, "dur": 13, "args": { "External id": 133722, "cbid": 211, "correlation": 133722 } }, { "ph": "s", "id": 133722, "pid": 76337, "tid": -914061504, "ts": 1716454223418770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223482068, "dur": 4, "args": { "External id": 133734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133734, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 133734, "pid": 5, "tid": 7, "ts": 1716454223482068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418793, "dur": 6, "args": { "External id": 133734, "cbid": 211, "correlation": 133734 } }, { "ph": "s", "id": 133734, "pid": 76337, "tid": -914061504, "ts": 1716454223418793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223482074, "dur": 17, "args": { "External id": 133737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133737, "pid": 5, "tid": 7, "ts": 1716454223482074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418811, "dur": 7, "args": { "External id": 133737, "cbid": 211, "correlation": 133737 } }, { "ph": "s", "id": 133737, "pid": 76337, "tid": -914061504, "ts": 1716454223418811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223418867, "dur": 0, "args": { "External id": 133748, "cbid": 317, "correlation": 133748 } }, { "ph": "f", "id": 133748, "pid": 76337, "tid": -914061504, "ts": 1716454223418867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223418868, "dur": 0, "args": { "External id": 133749, "cbid": 203, "correlation": 133749 } }, { "ph": "f", "id": 133749, "pid": 76337, "tid": -914061504, "ts": 1716454223418868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223418869, "dur": 0, "args": { "External id": 133750, "cbid": 205, "correlation": 133750 } }, { "ph": "f", "id": 133750, "pid": 76337, "tid": -914061504, "ts": 1716454223418869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223482092, "dur": 11, "args": { "External id": 133754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133754, "pid": 5, "tid": 7, "ts": 1716454223482092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418882, "dur": 12, "args": { "External id": 133754, "cbid": 211, "correlation": 133754 } }, { "ph": "s", "id": 133754, "pid": 76337, "tid": -914061504, "ts": 1716454223418882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223482104, "dur": 3, "args": { "External id": 133756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133756, "pid": 5, "tid": 7, "ts": 1716454223482104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418898, "dur": 5, "args": { "External id": 133756, "cbid": 211, "correlation": 133756 } }, { "ph": "s", "id": 133756, "pid": 76337, "tid": -914061504, "ts": 1716454223418898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223418906, "dur": 0, "args": { "External id": 133757, "cbid": 51, "correlation": 133757 } }, { "ph": "s", "id": 133757, "pid": 76337, "tid": -914061504, "ts": 1716454223418906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223482109, "dur": 90, "args": { "External id": 133758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133758, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 133758, "pid": 5, "tid": 7, "ts": 1716454223482109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418907, "dur": 5, "args": { "External id": 133758, "cbid": 211, "correlation": 133758 } }, { "ph": "s", "id": 133758, "pid": 76337, "tid": -914061504, "ts": 1716454223418907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223482200, "dur": 15, "args": { "External id": 133763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133763, "pid": 5, "tid": 7, "ts": 1716454223482200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223418933, "dur": 8, "args": { "External id": 133763, "cbid": 211, "correlation": 133763 } }, { "ph": "s", "id": 133763, "pid": 76337, "tid": -914061504, "ts": 1716454223418933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223482217, "dur": 83, "args": { "External id": 133772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133772, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133772, "pid": 5, "tid": 7, "ts": 1716454223482217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419023, "dur": 14, "args": { "External id": 133772, "cbid": 211, "correlation": 133772 } }, { "ph": "s", "id": 133772, "pid": 76337, "tid": -914061504, "ts": 1716454223419023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223482301, "dur": 30, "args": { "External id": 133794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133794, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133794, "pid": 5, "tid": 7, "ts": 1716454223482301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419081, "dur": 10, "args": { "External id": 133794, "cbid": 211, "correlation": 133794 } }, { "ph": "s", "id": 133794, "pid": 76337, "tid": -914061504, "ts": 1716454223419081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419169, "dur": 1, "args": { "External id": 133805, "cbid": 251, "correlation": 133805 } }, { "ph": "f", "id": 133805, "pid": 76337, "tid": -914061504, "ts": 1716454223419169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223482332, "dur": 164, "args": { "External id": 133806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133806, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133806, "pid": 5, "tid": 7, "ts": 1716454223482332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419175, "dur": 13, "args": { "External id": 133806, "cbid": 211, "correlation": 133806 } }, { "ph": "s", "id": 133806, "pid": 76337, "tid": -914061504, "ts": 1716454223419175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419244, "dur": 1, "args": { "External id": 133817, "cbid": 251, "correlation": 133817 } }, { "ph": "f", "id": 133817, "pid": 76337, "tid": -914061504, "ts": 1716454223419244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223482498, "dur": 156, "args": { "External id": 133818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133818, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133818, "pid": 5, "tid": 7, "ts": 1716454223482498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419248, "dur": 11, "args": { "External id": 133818, "cbid": 211, "correlation": 133818 } }, { "ph": "s", "id": 133818, "pid": 76337, "tid": -914061504, "ts": 1716454223419248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419314, "dur": 1, "args": { "External id": 133829, "cbid": 251, "correlation": 133829 } }, { "ph": "f", "id": 133829, "pid": 76337, "tid": -914061504, "ts": 1716454223419314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223482656, "dur": 158, "args": { "External id": 133830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133830, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133830, "pid": 5, "tid": 7, "ts": 1716454223482656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419318, "dur": 12, "args": { "External id": 133830, "cbid": 211, "correlation": 133830 } }, { "ph": "s", "id": 133830, "pid": 76337, "tid": -914061504, "ts": 1716454223419318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223482815, "dur": 335, "args": { "External id": 133855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133855, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133855, "pid": 5, "tid": 7, "ts": 1716454223482815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419401, "dur": 12, "args": { "External id": 133855, "cbid": 211, "correlation": 133855 } }, { "ph": "s", "id": 133855, "pid": 76337, "tid": -914061504, "ts": 1716454223419401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419501, "dur": 1, "args": { "External id": 133873, "cbid": 251, "correlation": 133873 } }, { "ph": "f", "id": 133873, "pid": 76337, "tid": -914061504, "ts": 1716454223419501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223483152, "dur": 164, "args": { "External id": 133875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133875, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133875, "pid": 5, "tid": 7, "ts": 1716454223483152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419507, "dur": 13, "args": { "External id": 133875, "cbid": 211, "correlation": 133875 } }, { "ph": "s", "id": 133875, "pid": 76337, "tid": -914061504, "ts": 1716454223419507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223483318, "dur": 20, "args": { "External id": 133883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133883, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133883, "pid": 5, "tid": 7, "ts": 1716454223483318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419578, "dur": 12, "args": { "External id": 133883, "cbid": 211, "correlation": 133883 } }, { "ph": "s", "id": 133883, "pid": 76337, "tid": -914061504, "ts": 1716454223419578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223483338, "dur": 28, "args": { "External id": 133891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133891, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133891, "pid": 5, "tid": 7, "ts": 1716454223483338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419616, "dur": 9, "args": { "External id": 133891, "cbid": 211, "correlation": 133891 } }, { "ph": "s", "id": 133891, "pid": 76337, "tid": -914061504, "ts": 1716454223419616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223483368, "dur": 19, "args": { "External id": 133902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133902, "pid": 5, "tid": 7, "ts": 1716454223483368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419689, "dur": 13, "args": { "External id": 133902, "cbid": 211, "correlation": 133902 } }, { "ph": "s", "id": 133902, "pid": 76337, "tid": -914061504, "ts": 1716454223419689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223483388, "dur": 16, "args": { "External id": 133924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133924, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 133924, "pid": 5, "tid": 7, "ts": 1716454223483388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419719, "dur": 8, "args": { "External id": 133924, "cbid": 211, "correlation": 133924 } }, { "ph": "s", "id": 133924, "pid": 76337, "tid": -914061504, "ts": 1716454223419719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419805, "dur": 1, "args": { "External id": 133935, "cbid": 251, "correlation": 133935 } }, { "ph": "f", "id": 133935, "pid": 76337, "tid": -914061504, "ts": 1716454223419805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223483405, "dur": 88, "args": { "External id": 133936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133936, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 133936, "pid": 5, "tid": 7, "ts": 1716454223483405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419810, "dur": 13, "args": { "External id": 133936, "cbid": 211, "correlation": 133936 } }, { "ph": "s", "id": 133936, "pid": 76337, "tid": -914061504, "ts": 1716454223419810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419880, "dur": 1, "args": { "External id": 133947, "cbid": 251, "correlation": 133947 } }, { "ph": "f", "id": 133947, "pid": 76337, "tid": -914061504, "ts": 1716454223419880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419883, "dur": 0, "args": { "External id": 133948, "cbid": 251, "correlation": 133948 } }, { "ph": "f", "id": 133948, "pid": 76337, "tid": -914061504, "ts": 1716454223419883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223483495, "dur": 12, "args": { "External id": 133949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133949, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133949, "pid": 5, "tid": 7, "ts": 1716454223483495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419885, "dur": 12, "args": { "External id": 133949, "cbid": 211, "correlation": 133949 } }, { "ph": "s", "id": 133949, "pid": 76337, "tid": -914061504, "ts": 1716454223419885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223483508, "dur": 5, "args": { "External id": 133951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133951, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133951, "pid": 5, "tid": 7, "ts": 1716454223483508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419899, "dur": 6, "args": { "External id": 133951, "cbid": 211, "correlation": 133951 } }, { "ph": "s", "id": 133951, "pid": 76337, "tid": -914061504, "ts": 1716454223419899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419955, "dur": 1, "args": { "External id": 133962, "cbid": 251, "correlation": 133962 } }, { "ph": "f", "id": 133962, "pid": 76337, "tid": -914061504, "ts": 1716454223419955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223419958, "dur": 0, "args": { "External id": 133963, "cbid": 251, "correlation": 133963 } }, { "ph": "f", "id": 133963, "pid": 76337, "tid": -914061504, "ts": 1716454223419958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223483515, "dur": 8, "args": { "External id": 133964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133964, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133964, "pid": 5, "tid": 7, "ts": 1716454223483515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419960, "dur": 11, "args": { "External id": 133964, "cbid": 211, "correlation": 133964 } }, { "ph": "s", "id": 133964, "pid": 76337, "tid": -914061504, "ts": 1716454223419960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223483524, "dur": 3, "args": { "External id": 133966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133966, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133966, "pid": 5, "tid": 7, "ts": 1716454223483524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223419981, "dur": 6, "args": { "External id": 133966, "cbid": 211, "correlation": 133966 } }, { "ph": "s", "id": 133966, "pid": 76337, "tid": -914061504, "ts": 1716454223419981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223483529, "dur": 55, "args": { "External id": 133991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 133991, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 133991, "pid": 5, "tid": 7, "ts": 1716454223483529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420058, "dur": 12, "args": { "External id": 133991, "cbid": 211, "correlation": 133991 } }, { "ph": "s", "id": 133991, "pid": 76337, "tid": -914061504, "ts": 1716454223420058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223420157, "dur": 1, "args": { "External id": 134009, "cbid": 251, "correlation": 134009 } }, { "ph": "f", "id": 134009, "pid": 76337, "tid": -914061504, "ts": 1716454223420157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223483584, "dur": 90, "args": { "External id": 134011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134011, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 134011, "pid": 5, "tid": 7, "ts": 1716454223483584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420162, "dur": 13, "args": { "External id": 134011, "cbid": 211, "correlation": 134011 } }, { "ph": "s", "id": 134011, "pid": 76337, "tid": -914061504, "ts": 1716454223420162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223483675, "dur": 9, "args": { "External id": 134019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134019, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134019, "pid": 5, "tid": 7, "ts": 1716454223483675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420232, "dur": 12, "args": { "External id": 134019, "cbid": 211, "correlation": 134019 } }, { "ph": "s", "id": 134019, "pid": 76337, "tid": -914061504, "ts": 1716454223420232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223483686, "dur": 21, "args": { "External id": 134027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134027, "pid": 5, "tid": 7, "ts": 1716454223483686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420274, "dur": 10, "args": { "External id": 134027, "cbid": 211, "correlation": 134027 } }, { "ph": "s", "id": 134027, "pid": 76337, "tid": -914061504, "ts": 1716454223420274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223483708, "dur": 18, "args": { "External id": 134049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134049, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134049, "pid": 5, "tid": 7, "ts": 1716454223483708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420326, "dur": 10, "args": { "External id": 134049, "cbid": 211, "correlation": 134049 } }, { "ph": "s", "id": 134049, "pid": 76337, "tid": -914061504, "ts": 1716454223420326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223420413, "dur": 1, "args": { "External id": 134065, "cbid": 251, "correlation": 134065 } }, { "ph": "f", "id": 134065, "pid": 76337, "tid": -914061504, "ts": 1716454223420413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223420418, "dur": 0, "args": { "External id": 134067, "cbid": 251, "correlation": 134067 } }, { "ph": "f", "id": 134067, "pid": 76337, "tid": -914061504, "ts": 1716454223420418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223483727, "dur": 496, "args": { "External id": 134068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134068, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134068, "pid": 5, "tid": 7, "ts": 1716454223483727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420420, "dur": 13, "args": { "External id": 134068, "cbid": 211, "correlation": 134068 } }, { "ph": "s", "id": 134068, "pid": 76337, "tid": -914061504, "ts": 1716454223420420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223484224, "dur": 66, "args": { "External id": 134076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134076, "pid": 5, "tid": 7, "ts": 1716454223484224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420487, "dur": 12, "args": { "External id": 134076, "cbid": 211, "correlation": 134076 } }, { "ph": "s", "id": 134076, "pid": 76337, "tid": -914061504, "ts": 1716454223420487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223484291, "dur": 68, "args": { "External id": 134084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134084, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134084, "pid": 5, "tid": 7, "ts": 1716454223484291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420517, "dur": 8, "args": { "External id": 134084, "cbid": 211, "correlation": 134084 } }, { "ph": "s", "id": 134084, "pid": 76337, "tid": -914061504, "ts": 1716454223420517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223420597, "dur": 1, "args": { "External id": 134100, "cbid": 251, "correlation": 134100 } }, { "ph": "f", "id": 134100, "pid": 76337, "tid": -914061504, "ts": 1716454223420597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223484361, "dur": 1, "args": { "External id": 134102, "device": 5, "context": 1, "stream": 7, "correlation": 134102, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 134102, "pid": 5, "tid": 7, "ts": 1716454223484361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223420602, "dur": 9, "args": { "External id": 134102, "cbid": 51, "correlation": 134102 } }, { "ph": "s", "id": 134102, "pid": 76337, "tid": -914061504, "ts": 1716454223420602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223484365, "dur": 269, "args": { "External id": 134103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134103, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 134103, "pid": 5, "tid": 7, "ts": 1716454223484365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420612, "dur": 11, "args": { "External id": 134103, "cbid": 211, "correlation": 134103 } }, { "ph": "s", "id": 134103, "pid": 76337, "tid": -914061504, "ts": 1716454223420612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223484636, "dur": 14, "args": { "External id": 134111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134111, "pid": 5, "tid": 7, "ts": 1716454223484636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420656, "dur": 10, "args": { "External id": 134111, "cbid": 211, "correlation": 134111 } }, { "ph": "s", "id": 134111, "pid": 76337, "tid": -914061504, "ts": 1716454223420656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223484651, "dur": 37, "args": { "External id": 134122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134122, "pid": 5, "tid": 7, "ts": 1716454223484651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420724, "dur": 14, "args": { "External id": 134122, "cbid": 211, "correlation": 134122 } }, { "ph": "s", "id": 134122, "pid": 76337, "tid": -914061504, "ts": 1716454223420724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223420790, "dur": 0, "args": { "External id": 134134, "cbid": 317, "correlation": 134134 } }, { "ph": "f", "id": 134134, "pid": 76337, "tid": -914061504, "ts": 1716454223420790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223420791, "dur": 0, "args": { "External id": 134135, "cbid": 203, "correlation": 134135 } }, { "ph": "f", "id": 134135, "pid": 76337, "tid": -914061504, "ts": 1716454223420791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223420791, "dur": 0, "args": { "External id": 134136, "cbid": 205, "correlation": 134136 } }, { "ph": "f", "id": 134136, "pid": 76337, "tid": -914061504, "ts": 1716454223420791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223484689, "dur": 13, "args": { "External id": 134140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134140, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134140, "pid": 5, "tid": 7, "ts": 1716454223484689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420806, "dur": 12, "args": { "External id": 134140, "cbid": 211, "correlation": 134140 } }, { "ph": "s", "id": 134140, "pid": 76337, "tid": -914061504, "ts": 1716454223420806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223484704, "dur": 4, "args": { "External id": 134142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 134142, "pid": 5, "tid": 7, "ts": 1716454223484704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420823, "dur": 6, "args": { "External id": 134142, "cbid": 211, "correlation": 134142 } }, { "ph": "s", "id": 134142, "pid": 76337, "tid": -914061504, "ts": 1716454223420823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223420832, "dur": 0, "args": { "External id": 134143, "cbid": 51, "correlation": 134143 } }, { "ph": "s", "id": 134143, "pid": 76337, "tid": -914061504, "ts": 1716454223420832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223484709, "dur": 96, "args": { "External id": 134144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134144, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 134144, "pid": 5, "tid": 7, "ts": 1716454223484709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420833, "dur": 5, "args": { "External id": 134144, "cbid": 211, "correlation": 134144 } }, { "ph": "s", "id": 134144, "pid": 76337, "tid": -914061504, "ts": 1716454223420833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223484806, "dur": 16, "args": { "External id": 134149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134149, "pid": 5, "tid": 7, "ts": 1716454223484806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420860, "dur": 9, "args": { "External id": 134149, "cbid": 211, "correlation": 134149 } }, { "ph": "s", "id": 134149, "pid": 76337, "tid": -914061504, "ts": 1716454223420860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223484824, "dur": 12, "args": { "External id": 134157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134157, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134157, "pid": 5, "tid": 7, "ts": 1716454223484824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420892, "dur": 8, "args": { "External id": 134157, "cbid": 211, "correlation": 134157 } }, { "ph": "s", "id": 134157, "pid": 76337, "tid": -914061504, "ts": 1716454223420892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223484838, "dur": 56, "args": { "External id": 134168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134168, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134168, "pid": 5, "tid": 7, "ts": 1716454223484838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223420955, "dur": 12, "args": { "External id": 134168, "cbid": 211, "correlation": 134168 } }, { "ph": "s", "id": 134168, "pid": 76337, "tid": -914061504, "ts": 1716454223420955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223421019, "dur": 0, "args": { "External id": 134178, "cbid": 317, "correlation": 134178 } }, { "ph": "f", "id": 134178, "pid": 76337, "tid": -914061504, "ts": 1716454223421019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223421020, "dur": 0, "args": { "External id": 134179, "cbid": 203, "correlation": 134179 } }, { "ph": "f", "id": 134179, "pid": 76337, "tid": -914061504, "ts": 1716454223421020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223421020, "dur": 0, "args": { "External id": 134180, "cbid": 205, "correlation": 134180 } }, { "ph": "f", "id": 134180, "pid": 76337, "tid": -914061504, "ts": 1716454223421020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223484895, "dur": 38, "args": { "External id": 134184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134184, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134184, "pid": 5, "tid": 7, "ts": 1716454223484895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421037, "dur": 12, "args": { "External id": 134184, "cbid": 211, "correlation": 134184 } }, { "ph": "s", "id": 134184, "pid": 76337, "tid": -914061504, "ts": 1716454223421037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223484934, "dur": 162, "args": { "External id": 134186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134186, "pid": 5, "tid": 7, "ts": 1716454223484934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421051, "dur": 5, "args": { "External id": 134186, "cbid": 211, "correlation": 134186 } }, { "ph": "s", "id": 134186, "pid": 76337, "tid": -914061504, "ts": 1716454223421051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223485097, "dur": 1984, "args": { "External id": 134188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134188, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134188, "pid": 5, "tid": 7, "ts": 1716454223485097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421063, "dur": 8, "args": { "External id": 134188, "cbid": 211, "correlation": 134188 } }, { "ph": "s", "id": 134188, "pid": 76337, "tid": -914061504, "ts": 1716454223421063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223487082, "dur": 38, "args": { "External id": 134190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134190, "pid": 5, "tid": 7, "ts": 1716454223487082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421074, "dur": 6, "args": { "External id": 134190, "cbid": 211, "correlation": 134190 } }, { "ph": "s", "id": 134190, "pid": 76337, "tid": -914061504, "ts": 1716454223421074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223487122, "dur": 60, "args": { "External id": 134196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134196, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134196, "pid": 5, "tid": 7, "ts": 1716454223487122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421104, "dur": 8, "args": { "External id": 134196, "cbid": 211, "correlation": 134196 } }, { "ph": "s", "id": 134196, "pid": 76337, "tid": -914061504, "ts": 1716454223421104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223487183, "dur": 84, "args": { "External id": 134205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134205, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134205, "pid": 5, "tid": 7, "ts": 1716454223487183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421193, "dur": 13, "args": { "External id": 134205, "cbid": 211, "correlation": 134205 } }, { "ph": "s", "id": 134205, "pid": 76337, "tid": -914061504, "ts": 1716454223421193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223487268, "dur": 72, "args": { "External id": 134225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134225, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 134225, "pid": 5, "tid": 7, "ts": 1716454223487268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421263, "dur": 11, "args": { "External id": 134225, "cbid": 211, "correlation": 134225 } }, { "ph": "s", "id": 134225, "pid": 76337, "tid": -914061504, "ts": 1716454223421263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223487342, "dur": 5, "args": { "External id": 134237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134237, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 134237, "pid": 5, "tid": 7, "ts": 1716454223487342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421285, "dur": 6, "args": { "External id": 134237, "cbid": 211, "correlation": 134237 } }, { "ph": "s", "id": 134237, "pid": 76337, "tid": -914061504, "ts": 1716454223421285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223487348, "dur": 80, "args": { "External id": 134240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134240, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134240, "pid": 5, "tid": 7, "ts": 1716454223487348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421304, "dur": 7, "args": { "External id": 134240, "cbid": 211, "correlation": 134240 } }, { "ph": "s", "id": 134240, "pid": 76337, "tid": -914061504, "ts": 1716454223421304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223487429, "dur": 53, "args": { "External id": 134249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134249, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134249, "pid": 5, "tid": 7, "ts": 1716454223487429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421345, "dur": 10, "args": { "External id": 134249, "cbid": 211, "correlation": 134249 } }, { "ph": "s", "id": 134249, "pid": 76337, "tid": -914061504, "ts": 1716454223421345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223421396, "dur": 0, "args": { "External id": 134259, "cbid": 317, "correlation": 134259 } }, { "ph": "f", "id": 134259, "pid": 76337, "tid": -914061504, "ts": 1716454223421396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223421397, "dur": 0, "args": { "External id": 134260, "cbid": 203, "correlation": 134260 } }, { "ph": "f", "id": 134260, "pid": 76337, "tid": -914061504, "ts": 1716454223421397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223421397, "dur": 0, "args": { "External id": 134261, "cbid": 205, "correlation": 134261 } }, { "ph": "f", "id": 134261, "pid": 76337, "tid": -914061504, "ts": 1716454223421397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223487484, "dur": 57, "args": { "External id": 134265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134265, "pid": 5, "tid": 7, "ts": 1716454223487484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421413, "dur": 12, "args": { "External id": 134265, "cbid": 211, "correlation": 134265 } }, { "ph": "s", "id": 134265, "pid": 76337, "tid": -914061504, "ts": 1716454223421413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223487542, "dur": 122, "args": { "External id": 134267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134267, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134267, "pid": 5, "tid": 7, "ts": 1716454223487542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421428, "dur": 5, "args": { "External id": 134267, "cbid": 211, "correlation": 134267 } }, { "ph": "s", "id": 134267, "pid": 76337, "tid": -914061504, "ts": 1716454223421428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223487665, "dur": 1889, "args": { "External id": 134269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134269, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134269, "pid": 5, "tid": 7, "ts": 1716454223487665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421439, "dur": 6, "args": { "External id": 134269, "cbid": 211, "correlation": 134269 } }, { "ph": "s", "id": 134269, "pid": 76337, "tid": -914061504, "ts": 1716454223421439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223489555, "dur": 20, "args": { "External id": 134271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134271, "pid": 5, "tid": 7, "ts": 1716454223489555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421449, "dur": 5, "args": { "External id": 134271, "cbid": 211, "correlation": 134271 } }, { "ph": "s", "id": 134271, "pid": 76337, "tid": -914061504, "ts": 1716454223421449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223489577, "dur": 32, "args": { "External id": 134277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134277, "pid": 5, "tid": 7, "ts": 1716454223489577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421477, "dur": 8, "args": { "External id": 134277, "cbid": 211, "correlation": 134277 } }, { "ph": "s", "id": 134277, "pid": 76337, "tid": -914061504, "ts": 1716454223421477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223489611, "dur": 3, "args": { "External id": 134285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134285, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 134285, "pid": 5, "tid": 7, "ts": 1716454223489611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421520, "dur": 10, "args": { "External id": 134285, "cbid": 211, "correlation": 134285 } }, { "ph": "s", "id": 134285, "pid": 76337, "tid": -914061504, "ts": 1716454223421520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223421587, "dur": 1, "args": { "External id": 134301, "cbid": 251, "correlation": 134301 } }, { "ph": "f", "id": 134301, "pid": 76337, "tid": -914061504, "ts": 1716454223421587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223421592, "dur": 0, "args": { "External id": 134303, "cbid": 251, "correlation": 134303 } }, { "ph": "f", "id": 134303, "pid": 76337, "tid": -914061504, "ts": 1716454223421592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223489615, "dur": 12, "args": { "External id": 134304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134304, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 134304, "pid": 5, "tid": 7, "ts": 1716454223489615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421594, "dur": 11, "args": { "External id": 134304, "cbid": 211, "correlation": 134304 } }, { "ph": "s", "id": 134304, "pid": 76337, "tid": -914061504, "ts": 1716454223421594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223489629, "dur": 5, "args": { "External id": 134306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134306, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 134306, "pid": 5, "tid": 7, "ts": 1716454223489629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421607, "dur": 6, "args": { "External id": 134306, "cbid": 211, "correlation": 134306 } }, { "ph": "s", "id": 134306, "pid": 76337, "tid": -914061504, "ts": 1716454223421607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223489635, "dur": 28, "args": { "External id": 134316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134316, "pid": 5, "tid": 7, "ts": 1716454223489635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421665, "dur": 13, "args": { "External id": 134316, "cbid": 211, "correlation": 134316 } }, { "ph": "s", "id": 134316, "pid": 76337, "tid": -914061504, "ts": 1716454223421665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223489665, "dur": 30, "args": { "External id": 134336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134336, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 134336, "pid": 5, "tid": 7, "ts": 1716454223489665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421733, "dur": 10, "args": { "External id": 134336, "cbid": 211, "correlation": 134336 } }, { "ph": "s", "id": 134336, "pid": 76337, "tid": -914061504, "ts": 1716454223421733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223489696, "dur": 4, "args": { "External id": 134348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134348, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 134348, "pid": 5, "tid": 7, "ts": 1716454223489696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421753, "dur": 6, "args": { "External id": 134348, "cbid": 211, "correlation": 134348 } }, { "ph": "s", "id": 134348, "pid": 76337, "tid": -914061504, "ts": 1716454223421753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223489701, "dur": 30, "args": { "External id": 134351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134351, "pid": 5, "tid": 7, "ts": 1716454223489701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421771, "dur": 8, "args": { "External id": 134351, "cbid": 211, "correlation": 134351 } }, { "ph": "s", "id": 134351, "pid": 76337, "tid": -914061504, "ts": 1716454223421771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223489732, "dur": 20, "args": { "External id": 134360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134360, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134360, "pid": 5, "tid": 7, "ts": 1716454223489732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421812, "dur": 10, "args": { "External id": 134360, "cbid": 211, "correlation": 134360 } }, { "ph": "s", "id": 134360, "pid": 76337, "tid": -914061504, "ts": 1716454223421812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223421875, "dur": 0, "args": { "External id": 134370, "cbid": 317, "correlation": 134370 } }, { "ph": "f", "id": 134370, "pid": 76337, "tid": -914061504, "ts": 1716454223421875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223421876, "dur": 0, "args": { "External id": 134371, "cbid": 203, "correlation": 134371 } }, { "ph": "f", "id": 134371, "pid": 76337, "tid": -914061504, "ts": 1716454223421876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223421877, "dur": 0, "args": { "External id": 134372, "cbid": 205, "correlation": 134372 } }, { "ph": "f", "id": 134372, "pid": 76337, "tid": -914061504, "ts": 1716454223421877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223489754, "dur": 21, "args": { "External id": 134376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134376, "pid": 5, "tid": 7, "ts": 1716454223489754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421892, "dur": 12, "args": { "External id": 134376, "cbid": 211, "correlation": 134376 } }, { "ph": "s", "id": 134376, "pid": 76337, "tid": -914061504, "ts": 1716454223421892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223489777, "dur": 43, "args": { "External id": 134378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134378, "pid": 5, "tid": 7, "ts": 1716454223489777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421907, "dur": 5, "args": { "External id": 134378, "cbid": 211, "correlation": 134378 } }, { "ph": "s", "id": 134378, "pid": 76337, "tid": -914061504, "ts": 1716454223421907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223489821, "dur": 645, "args": { "External id": 134380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134380, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134380, "pid": 5, "tid": 7, "ts": 1716454223489821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421919, "dur": 6, "args": { "External id": 134380, "cbid": 211, "correlation": 134380 } }, { "ph": "s", "id": 134380, "pid": 76337, "tid": -914061504, "ts": 1716454223421919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223490468, "dur": 22, "args": { "External id": 134382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134382, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134382, "pid": 5, "tid": 7, "ts": 1716454223490468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421929, "dur": 6, "args": { "External id": 134382, "cbid": 211, "correlation": 134382 } }, { "ph": "s", "id": 134382, "pid": 76337, "tid": -914061504, "ts": 1716454223421929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223490492, "dur": 32, "args": { "External id": 134388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134388, "pid": 5, "tid": 7, "ts": 1716454223490492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223421957, "dur": 8, "args": { "External id": 134388, "cbid": 211, "correlation": 134388 } }, { "ph": "s", "id": 134388, "pid": 76337, "tid": -914061504, "ts": 1716454223421957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223422025, "dur": 0, "args": { "External id": 134398, "cbid": 317, "correlation": 134398 } }, { "ph": "f", "id": 134398, "pid": 76337, "tid": -914061504, "ts": 1716454223422025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223422026, "dur": 0, "args": { "External id": 134399, "cbid": 203, "correlation": 134399 } }, { "ph": "f", "id": 134399, "pid": 76337, "tid": -914061504, "ts": 1716454223422026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223422026, "dur": 0, "args": { "External id": 134400, "cbid": 205, "correlation": 134400 } }, { "ph": "f", "id": 134400, "pid": 76337, "tid": -914061504, "ts": 1716454223422026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223490525, "dur": 56, "args": { "External id": 134404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134404, "pid": 5, "tid": 7, "ts": 1716454223490525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422040, "dur": 12, "args": { "External id": 134404, "cbid": 211, "correlation": 134404 } }, { "ph": "s", "id": 134404, "pid": 76337, "tid": -914061504, "ts": 1716454223422040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223490583, "dur": 268, "args": { "External id": 134406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134406, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134406, "pid": 5, "tid": 7, "ts": 1716454223490583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422058, "dur": 7, "args": { "External id": 134406, "cbid": 211, "correlation": 134406 } }, { "ph": "s", "id": 134406, "pid": 76337, "tid": -914061504, "ts": 1716454223422058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223490852, "dur": 21, "args": { "External id": 134408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134408, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134408, "pid": 5, "tid": 7, "ts": 1716454223490852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422069, "dur": 5, "args": { "External id": 134408, "cbid": 211, "correlation": 134408 } }, { "ph": "s", "id": 134408, "pid": 76337, "tid": -914061504, "ts": 1716454223422069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223490874, "dur": 33, "args": { "External id": 134414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134414, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134414, "pid": 5, "tid": 7, "ts": 1716454223490874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422096, "dur": 8, "args": { "External id": 134414, "cbid": 211, "correlation": 134414 } }, { "ph": "s", "id": 134414, "pid": 76337, "tid": -914061504, "ts": 1716454223422096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223490909, "dur": 28, "args": { "External id": 134422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134422, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134422, "pid": 5, "tid": 7, "ts": 1716454223490909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422124, "dur": 8, "args": { "External id": 134422, "cbid": 211, "correlation": 134422 } }, { "ph": "s", "id": 134422, "pid": 76337, "tid": -914061504, "ts": 1716454223422124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223490937, "dur": 20, "args": { "External id": 134430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134430, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134430, "pid": 5, "tid": 7, "ts": 1716454223490937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422152, "dur": 8, "args": { "External id": 134430, "cbid": 211, "correlation": 134430 } }, { "ph": "s", "id": 134430, "pid": 76337, "tid": -914061504, "ts": 1716454223422152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223490959, "dur": 29, "args": { "External id": 134450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134450, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 134450, "pid": 5, "tid": 7, "ts": 1716454223490959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422234, "dur": 12, "args": { "External id": 134450, "cbid": 211, "correlation": 134450 } }, { "ph": "s", "id": 134450, "pid": 76337, "tid": -914061504, "ts": 1716454223422234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223490990, "dur": 4, "args": { "External id": 134462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134462, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 134462, "pid": 5, "tid": 7, "ts": 1716454223490990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422255, "dur": 6, "args": { "External id": 134462, "cbid": 211, "correlation": 134462 } }, { "ph": "s", "id": 134462, "pid": 76337, "tid": -914061504, "ts": 1716454223422255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223490995, "dur": 32, "args": { "External id": 134465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134465, "pid": 5, "tid": 7, "ts": 1716454223490995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422273, "dur": 7, "args": { "External id": 134465, "cbid": 211, "correlation": 134465 } }, { "ph": "s", "id": 134465, "pid": 76337, "tid": -914061504, "ts": 1716454223422273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223422330, "dur": 0, "args": { "External id": 134476, "cbid": 317, "correlation": 134476 } }, { "ph": "f", "id": 134476, "pid": 76337, "tid": -914061504, "ts": 1716454223422330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223422331, "dur": 0, "args": { "External id": 134477, "cbid": 203, "correlation": 134477 } }, { "ph": "f", "id": 134477, "pid": 76337, "tid": -914061504, "ts": 1716454223422331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223422332, "dur": 0, "args": { "External id": 134478, "cbid": 205, "correlation": 134478 } }, { "ph": "f", "id": 134478, "pid": 76337, "tid": -914061504, "ts": 1716454223422332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223491029, "dur": 21, "args": { "External id": 134482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134482, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134482, "pid": 5, "tid": 7, "ts": 1716454223491029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422346, "dur": 12, "args": { "External id": 134482, "cbid": 211, "correlation": 134482 } }, { "ph": "s", "id": 134482, "pid": 76337, "tid": -914061504, "ts": 1716454223422346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223491051, "dur": 105, "args": { "External id": 134484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134484, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134484, "pid": 5, "tid": 7, "ts": 1716454223491051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422364, "dur": 6, "args": { "External id": 134484, "cbid": 211, "correlation": 134484 } }, { "ph": "s", "id": 134484, "pid": 76337, "tid": -914061504, "ts": 1716454223422364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223491157, "dur": 22, "args": { "External id": 134486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134486, "pid": 5, "tid": 7, "ts": 1716454223491157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422374, "dur": 5, "args": { "External id": 134486, "cbid": 211, "correlation": 134486 } }, { "ph": "s", "id": 134486, "pid": 76337, "tid": -914061504, "ts": 1716454223422374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223491180, "dur": 32, "args": { "External id": 134492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134492, "pid": 5, "tid": 7, "ts": 1716454223491180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422401, "dur": 8, "args": { "External id": 134492, "cbid": 211, "correlation": 134492 } }, { "ph": "s", "id": 134492, "pid": 76337, "tid": -914061504, "ts": 1716454223422401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223491214, "dur": 178, "args": { "External id": 134501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134501, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134501, "pid": 5, "tid": 7, "ts": 1716454223491214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422483, "dur": 13, "args": { "External id": 134501, "cbid": 211, "correlation": 134501 } }, { "ph": "s", "id": 134501, "pid": 76337, "tid": -914061504, "ts": 1716454223422483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223491394, "dur": 64, "args": { "External id": 134523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134523, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134523, "pid": 5, "tid": 7, "ts": 1716454223491394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422541, "dur": 10, "args": { "External id": 134523, "cbid": 211, "correlation": 134523 } }, { "ph": "s", "id": 134523, "pid": 76337, "tid": -914061504, "ts": 1716454223422541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223422630, "dur": 2, "args": { "External id": 134534, "cbid": 251, "correlation": 134534 } }, { "ph": "f", "id": 134534, "pid": 76337, "tid": -914061504, "ts": 1716454223422630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223491459, "dur": 155, "args": { "External id": 134535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134535, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134535, "pid": 5, "tid": 7, "ts": 1716454223491459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422636, "dur": 13, "args": { "External id": 134535, "cbid": 211, "correlation": 134535 } }, { "ph": "s", "id": 134535, "pid": 76337, "tid": -914061504, "ts": 1716454223422636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223422705, "dur": 1, "args": { "External id": 134546, "cbid": 251, "correlation": 134546 } }, { "ph": "f", "id": 134546, "pid": 76337, "tid": -914061504, "ts": 1716454223422705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223491615, "dur": 148, "args": { "External id": 134547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134547, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134547, "pid": 5, "tid": 7, "ts": 1716454223491615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422709, "dur": 11, "args": { "External id": 134547, "cbid": 211, "correlation": 134547 } }, { "ph": "s", "id": 134547, "pid": 76337, "tid": -914061504, "ts": 1716454223422709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223422773, "dur": 1, "args": { "External id": 134558, "cbid": 251, "correlation": 134558 } }, { "ph": "f", "id": 134558, "pid": 76337, "tid": -914061504, "ts": 1716454223422773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223491765, "dur": 143, "args": { "External id": 134559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134559, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134559, "pid": 5, "tid": 7, "ts": 1716454223491765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422777, "dur": 12, "args": { "External id": 134559, "cbid": 211, "correlation": 134559 } }, { "ph": "s", "id": 134559, "pid": 76337, "tid": -914061504, "ts": 1716454223422777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223491909, "dur": 1919, "args": { "External id": 134580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134580, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 134580, "pid": 5, "tid": 7, "ts": 1716454223491909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422859, "dur": 13, "args": { "External id": 134580, "cbid": 211, "correlation": 134580 } }, { "ph": "s", "id": 134580, "pid": 76337, "tid": -914061504, "ts": 1716454223422859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223422959, "dur": 2, "args": { "External id": 134598, "cbid": 251, "correlation": 134598 } }, { "ph": "f", "id": 134598, "pid": 76337, "tid": -914061504, "ts": 1716454223422959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223493829, "dur": 147, "args": { "External id": 134600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134600, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 134600, "pid": 5, "tid": 7, "ts": 1716454223493829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223422965, "dur": 28, "args": { "External id": 134600, "cbid": 211, "correlation": 134600 } }, { "ph": "s", "id": 134600, "pid": 76337, "tid": -914061504, "ts": 1716454223422965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223493977, "dur": 36, "args": { "External id": 134608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134608, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134608, "pid": 5, "tid": 7, "ts": 1716454223493977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423051, "dur": 12, "args": { "External id": 134608, "cbid": 211, "correlation": 134608 } }, { "ph": "s", "id": 134608, "pid": 76337, "tid": -914061504, "ts": 1716454223423051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223494014, "dur": 51, "args": { "External id": 134616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134616, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134616, "pid": 5, "tid": 7, "ts": 1716454223494014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423091, "dur": 8, "args": { "External id": 134616, "cbid": 211, "correlation": 134616 } }, { "ph": "s", "id": 134616, "pid": 76337, "tid": -914061504, "ts": 1716454223423091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223494066, "dur": 30, "args": { "External id": 134627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134627, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134627, "pid": 5, "tid": 7, "ts": 1716454223494066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423162, "dur": 13, "args": { "External id": 134627, "cbid": 211, "correlation": 134627 } }, { "ph": "s", "id": 134627, "pid": 76337, "tid": -914061504, "ts": 1716454223423162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223494097, "dur": 33, "args": { "External id": 134649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134649, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134649, "pid": 5, "tid": 7, "ts": 1716454223494097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423194, "dur": 8, "args": { "External id": 134649, "cbid": 211, "correlation": 134649 } }, { "ph": "s", "id": 134649, "pid": 76337, "tid": -914061504, "ts": 1716454223423194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423279, "dur": 1, "args": { "External id": 134660, "cbid": 251, "correlation": 134660 } }, { "ph": "f", "id": 134660, "pid": 76337, "tid": -914061504, "ts": 1716454223423279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223494132, "dur": 75, "args": { "External id": 134661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134661, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134661, "pid": 5, "tid": 7, "ts": 1716454223494132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423284, "dur": 13, "args": { "External id": 134661, "cbid": 211, "correlation": 134661 } }, { "ph": "s", "id": 134661, "pid": 76337, "tid": -914061504, "ts": 1716454223423284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423355, "dur": 1, "args": { "External id": 134672, "cbid": 251, "correlation": 134672 } }, { "ph": "f", "id": 134672, "pid": 76337, "tid": -914061504, "ts": 1716454223423355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423359, "dur": 0, "args": { "External id": 134673, "cbid": 251, "correlation": 134673 } }, { "ph": "f", "id": 134673, "pid": 76337, "tid": -914061504, "ts": 1716454223423359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223494208, "dur": 12, "args": { "External id": 134674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134674, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 134674, "pid": 5, "tid": 7, "ts": 1716454223494208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423360, "dur": 12, "args": { "External id": 134674, "cbid": 211, "correlation": 134674 } }, { "ph": "s", "id": 134674, "pid": 76337, "tid": -914061504, "ts": 1716454223423360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223494222, "dur": 5, "args": { "External id": 134676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134676, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 134676, "pid": 5, "tid": 7, "ts": 1716454223494222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423374, "dur": 6, "args": { "External id": 134676, "cbid": 211, "correlation": 134676 } }, { "ph": "s", "id": 134676, "pid": 76337, "tid": -914061504, "ts": 1716454223423374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423431, "dur": 1, "args": { "External id": 134687, "cbid": 251, "correlation": 134687 } }, { "ph": "f", "id": 134687, "pid": 76337, "tid": -914061504, "ts": 1716454223423431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423434, "dur": 0, "args": { "External id": 134688, "cbid": 251, "correlation": 134688 } }, { "ph": "f", "id": 134688, "pid": 76337, "tid": -914061504, "ts": 1716454223423434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223494228, "dur": 7, "args": { "External id": 134689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134689, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 134689, "pid": 5, "tid": 7, "ts": 1716454223494228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423436, "dur": 12, "args": { "External id": 134689, "cbid": 211, "correlation": 134689 } }, { "ph": "s", "id": 134689, "pid": 76337, "tid": -914061504, "ts": 1716454223423436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223494237, "dur": 3, "args": { "External id": 134691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134691, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 134691, "pid": 5, "tid": 7, "ts": 1716454223494237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423450, "dur": 5, "args": { "External id": 134691, "cbid": 211, "correlation": 134691 } }, { "ph": "s", "id": 134691, "pid": 76337, "tid": -914061504, "ts": 1716454223423450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223494241, "dur": 90, "args": { "External id": 134712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134712, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 134712, "pid": 5, "tid": 7, "ts": 1716454223494241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423522, "dur": 13, "args": { "External id": 134712, "cbid": 211, "correlation": 134712 } }, { "ph": "s", "id": 134712, "pid": 76337, "tid": -914061504, "ts": 1716454223423522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423619, "dur": 1, "args": { "External id": 134730, "cbid": 251, "correlation": 134730 } }, { "ph": "f", "id": 134730, "pid": 76337, "tid": -914061504, "ts": 1716454223423619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223494333, "dur": 96, "args": { "External id": 134732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134732, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134732, "pid": 5, "tid": 7, "ts": 1716454223494333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423624, "dur": 13, "args": { "External id": 134732, "cbid": 211, "correlation": 134732 } }, { "ph": "s", "id": 134732, "pid": 76337, "tid": -914061504, "ts": 1716454223423624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223494430, "dur": 19, "args": { "External id": 134740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134740, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134740, "pid": 5, "tid": 7, "ts": 1716454223494430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423694, "dur": 13, "args": { "External id": 134740, "cbid": 211, "correlation": 134740 } }, { "ph": "s", "id": 134740, "pid": 76337, "tid": -914061504, "ts": 1716454223423694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223494450, "dur": 39, "args": { "External id": 134748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134748, "pid": 5, "tid": 7, "ts": 1716454223494450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423736, "dur": 10, "args": { "External id": 134748, "cbid": 211, "correlation": 134748 } }, { "ph": "s", "id": 134748, "pid": 76337, "tid": -914061504, "ts": 1716454223423736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223494489, "dur": 34, "args": { "External id": 134770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134770, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134770, "pid": 5, "tid": 7, "ts": 1716454223494489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423787, "dur": 11, "args": { "External id": 134770, "cbid": 211, "correlation": 134770 } }, { "ph": "s", "id": 134770, "pid": 76337, "tid": -914061504, "ts": 1716454223423787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423875, "dur": 1, "args": { "External id": 134786, "cbid": 251, "correlation": 134786 } }, { "ph": "f", "id": 134786, "pid": 76337, "tid": -914061504, "ts": 1716454223423875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223423880, "dur": 0, "args": { "External id": 134788, "cbid": 251, "correlation": 134788 } }, { "ph": "f", "id": 134788, "pid": 76337, "tid": -914061504, "ts": 1716454223423880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223494525, "dur": 536, "args": { "External id": 134789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134789, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 134789, "pid": 5, "tid": 7, "ts": 1716454223494525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423885, "dur": 14, "args": { "External id": 134789, "cbid": 211, "correlation": 134789 } }, { "ph": "s", "id": 134789, "pid": 76337, "tid": -914061504, "ts": 1716454223423885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223495063, "dur": 125, "args": { "External id": 134797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134797, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134797, "pid": 5, "tid": 7, "ts": 1716454223495063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423951, "dur": 12, "args": { "External id": 134797, "cbid": 211, "correlation": 134797 } }, { "ph": "s", "id": 134797, "pid": 76337, "tid": -914061504, "ts": 1716454223423951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223495189, "dur": 127, "args": { "External id": 134805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134805, "pid": 5, "tid": 7, "ts": 1716454223495189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223423990, "dur": 10, "args": { "External id": 134805, "cbid": 211, "correlation": 134805 } }, { "ph": "s", "id": 134805, "pid": 76337, "tid": -914061504, "ts": 1716454223423990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223424070, "dur": 1, "args": { "External id": 134821, "cbid": 251, "correlation": 134821 } }, { "ph": "f", "id": 134821, "pid": 76337, "tid": -914061504, "ts": 1716454223424070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223495317, "dur": 310, "args": { "External id": 134823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134823, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134823, "pid": 5, "tid": 7, "ts": 1716454223495317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424076, "dur": 12, "args": { "External id": 134823, "cbid": 211, "correlation": 134823 } }, { "ph": "s", "id": 134823, "pid": 76337, "tid": -914061504, "ts": 1716454223424076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223495629, "dur": 27, "args": { "External id": 134831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134831, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134831, "pid": 5, "tid": 7, "ts": 1716454223495629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424118, "dur": 10, "args": { "External id": 134831, "cbid": 211, "correlation": 134831 } }, { "ph": "s", "id": 134831, "pid": 76337, "tid": -914061504, "ts": 1716454223424118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223495658, "dur": 80, "args": { "External id": 134842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134842, "pid": 5, "tid": 7, "ts": 1716454223495658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424185, "dur": 13, "args": { "External id": 134842, "cbid": 211, "correlation": 134842 } }, { "ph": "s", "id": 134842, "pid": 76337, "tid": -914061504, "ts": 1716454223424185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223424251, "dur": 0, "args": { "External id": 134854, "cbid": 317, "correlation": 134854 } }, { "ph": "f", "id": 134854, "pid": 76337, "tid": -914061504, "ts": 1716454223424251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223424251, "dur": 0, "args": { "External id": 134855, "cbid": 203, "correlation": 134855 } }, { "ph": "f", "id": 134855, "pid": 76337, "tid": -914061504, "ts": 1716454223424251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223424252, "dur": 0, "args": { "External id": 134856, "cbid": 205, "correlation": 134856 } }, { "ph": "f", "id": 134856, "pid": 76337, "tid": -914061504, "ts": 1716454223424252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223495739, "dur": 22, "args": { "External id": 134860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134860, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134860, "pid": 5, "tid": 7, "ts": 1716454223495739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424268, "dur": 12, "args": { "External id": 134860, "cbid": 211, "correlation": 134860 } }, { "ph": "s", "id": 134860, "pid": 76337, "tid": -914061504, "ts": 1716454223424268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223495763, "dur": 119, "args": { "External id": 134862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134862, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134862, "pid": 5, "tid": 7, "ts": 1716454223495763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424286, "dur": 7, "args": { "External id": 134862, "cbid": 211, "correlation": 134862 } }, { "ph": "s", "id": 134862, "pid": 76337, "tid": -914061504, "ts": 1716454223424286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223495883, "dur": 23, "args": { "External id": 134864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134864, "pid": 5, "tid": 7, "ts": 1716454223495883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424298, "dur": 5, "args": { "External id": 134864, "cbid": 211, "correlation": 134864 } }, { "ph": "s", "id": 134864, "pid": 76337, "tid": -914061504, "ts": 1716454223424298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223495908, "dur": 33, "args": { "External id": 134870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134870, "pid": 5, "tid": 7, "ts": 1716454223495908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424325, "dur": 8, "args": { "External id": 134870, "cbid": 211, "correlation": 134870 } }, { "ph": "s", "id": 134870, "pid": 76337, "tid": -914061504, "ts": 1716454223424325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223495942, "dur": 27, "args": { "External id": 134878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134878, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134878, "pid": 5, "tid": 7, "ts": 1716454223495942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424357, "dur": 8, "args": { "External id": 134878, "cbid": 211, "correlation": 134878 } }, { "ph": "s", "id": 134878, "pid": 76337, "tid": -914061504, "ts": 1716454223424357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223495969, "dur": 53, "args": { "External id": 134887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134887, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134887, "pid": 5, "tid": 7, "ts": 1716454223495969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424397, "dur": 10, "args": { "External id": 134887, "cbid": 211, "correlation": 134887 } }, { "ph": "s", "id": 134887, "pid": 76337, "tid": -914061504, "ts": 1716454223424397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223496024, "dur": 52, "args": { "External id": 134907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134907, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 134907, "pid": 5, "tid": 7, "ts": 1716454223496024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424468, "dur": 11, "args": { "External id": 134907, "cbid": 211, "correlation": 134907 } }, { "ph": "s", "id": 134907, "pid": 76337, "tid": -914061504, "ts": 1716454223424468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223496077, "dur": 4, "args": { "External id": 134919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134919, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 134919, "pid": 5, "tid": 7, "ts": 1716454223496077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424489, "dur": 7, "args": { "External id": 134919, "cbid": 211, "correlation": 134919 } }, { "ph": "s", "id": 134919, "pid": 76337, "tid": -914061504, "ts": 1716454223424489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223496083, "dur": 55, "args": { "External id": 134922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134922, "pid": 5, "tid": 7, "ts": 1716454223496083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424507, "dur": 6, "args": { "External id": 134922, "cbid": 211, "correlation": 134922 } }, { "ph": "s", "id": 134922, "pid": 76337, "tid": -914061504, "ts": 1716454223424507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223496139, "dur": 37, "args": { "External id": 134931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134931, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134931, "pid": 5, "tid": 7, "ts": 1716454223496139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424547, "dur": 10, "args": { "External id": 134931, "cbid": 211, "correlation": 134931 } }, { "ph": "s", "id": 134931, "pid": 76337, "tid": -914061504, "ts": 1716454223424547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223424599, "dur": 0, "args": { "External id": 134941, "cbid": 317, "correlation": 134941 } }, { "ph": "f", "id": 134941, "pid": 76337, "tid": -914061504, "ts": 1716454223424599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223424600, "dur": 0, "args": { "External id": 134942, "cbid": 203, "correlation": 134942 } }, { "ph": "f", "id": 134942, "pid": 76337, "tid": -914061504, "ts": 1716454223424600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223424601, "dur": 0, "args": { "External id": 134943, "cbid": 205, "correlation": 134943 } }, { "ph": "f", "id": 134943, "pid": 76337, "tid": -914061504, "ts": 1716454223424601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223496178, "dur": 39, "args": { "External id": 134947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134947, "pid": 5, "tid": 7, "ts": 1716454223496178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424615, "dur": 12, "args": { "External id": 134947, "cbid": 211, "correlation": 134947 } }, { "ph": "s", "id": 134947, "pid": 76337, "tid": -914061504, "ts": 1716454223424615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223496218, "dur": 83, "args": { "External id": 134949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134949, "pid": 5, "tid": 7, "ts": 1716454223496218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424629, "dur": 5, "args": { "External id": 134949, "cbid": 211, "correlation": 134949 } }, { "ph": "s", "id": 134949, "pid": 76337, "tid": -914061504, "ts": 1716454223424629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223496302, "dur": 1266, "args": { "External id": 134951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134951, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 134951, "pid": 5, "tid": 7, "ts": 1716454223496302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424640, "dur": 7, "args": { "External id": 134951, "cbid": 211, "correlation": 134951 } }, { "ph": "s", "id": 134951, "pid": 76337, "tid": -914061504, "ts": 1716454223424640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223497569, "dur": 21, "args": { "External id": 134953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134953, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134953, "pid": 5, "tid": 7, "ts": 1716454223497569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424651, "dur": 5, "args": { "External id": 134953, "cbid": 211, "correlation": 134953 } }, { "ph": "s", "id": 134953, "pid": 76337, "tid": -914061504, "ts": 1716454223424651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223497592, "dur": 33, "args": { "External id": 134959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134959, "pid": 5, "tid": 7, "ts": 1716454223497592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424678, "dur": 8, "args": { "External id": 134959, "cbid": 211, "correlation": 134959 } }, { "ph": "s", "id": 134959, "pid": 76337, "tid": -914061504, "ts": 1716454223424678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223497626, "dur": 3, "args": { "External id": 134967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134967, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 134967, "pid": 5, "tid": 7, "ts": 1716454223497626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424721, "dur": 9, "args": { "External id": 134967, "cbid": 211, "correlation": 134967 } }, { "ph": "s", "id": 134967, "pid": 76337, "tid": -914061504, "ts": 1716454223424721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223424785, "dur": 1, "args": { "External id": 134983, "cbid": 251, "correlation": 134983 } }, { "ph": "f", "id": 134983, "pid": 76337, "tid": -914061504, "ts": 1716454223424785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223424790, "dur": 0, "args": { "External id": 134985, "cbid": 251, "correlation": 134985 } }, { "ph": "f", "id": 134985, "pid": 76337, "tid": -914061504, "ts": 1716454223424790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223497630, "dur": 12, "args": { "External id": 134986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134986, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 134986, "pid": 5, "tid": 7, "ts": 1716454223497630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424793, "dur": 11, "args": { "External id": 134986, "cbid": 211, "correlation": 134986 } }, { "ph": "s", "id": 134986, "pid": 76337, "tid": -914061504, "ts": 1716454223424793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223497643, "dur": 5, "args": { "External id": 134988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134988, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 134988, "pid": 5, "tid": 7, "ts": 1716454223497643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424806, "dur": 5, "args": { "External id": 134988, "cbid": 211, "correlation": 134988 } }, { "ph": "s", "id": 134988, "pid": 76337, "tid": -914061504, "ts": 1716454223424806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223497650, "dur": 28, "args": { "External id": 134998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 134998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 134998, "pid": 5, "tid": 7, "ts": 1716454223497650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424862, "dur": 12, "args": { "External id": 134998, "cbid": 211, "correlation": 134998 } }, { "ph": "s", "id": 134998, "pid": 76337, "tid": -914061504, "ts": 1716454223424862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223497679, "dur": 31, "args": { "External id": 135018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135018, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 135018, "pid": 5, "tid": 7, "ts": 1716454223497679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424928, "dur": 11, "args": { "External id": 135018, "cbid": 211, "correlation": 135018 } }, { "ph": "s", "id": 135018, "pid": 76337, "tid": -914061504, "ts": 1716454223424928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223497712, "dur": 4, "args": { "External id": 135030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135030, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 135030, "pid": 5, "tid": 7, "ts": 1716454223497712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424949, "dur": 6, "args": { "External id": 135030, "cbid": 211, "correlation": 135030 } }, { "ph": "s", "id": 135030, "pid": 76337, "tid": -914061504, "ts": 1716454223424949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223497717, "dur": 29, "args": { "External id": 135033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135033, "pid": 5, "tid": 7, "ts": 1716454223497717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223424968, "dur": 15, "args": { "External id": 135033, "cbid": 211, "correlation": 135033 } }, { "ph": "s", "id": 135033, "pid": 76337, "tid": -914061504, "ts": 1716454223424968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223497747, "dur": 20, "args": { "External id": 135042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135042, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135042, "pid": 5, "tid": 7, "ts": 1716454223497747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425019, "dur": 11, "args": { "External id": 135042, "cbid": 211, "correlation": 135042 } }, { "ph": "s", "id": 135042, "pid": 76337, "tid": -914061504, "ts": 1716454223425019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223425083, "dur": 0, "args": { "External id": 135052, "cbid": 317, "correlation": 135052 } }, { "ph": "f", "id": 135052, "pid": 76337, "tid": -914061504, "ts": 1716454223425083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223425083, "dur": 0, "args": { "External id": 135053, "cbid": 203, "correlation": 135053 } }, { "ph": "f", "id": 135053, "pid": 76337, "tid": -914061504, "ts": 1716454223425083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223425084, "dur": 0, "args": { "External id": 135054, "cbid": 205, "correlation": 135054 } }, { "ph": "f", "id": 135054, "pid": 76337, "tid": -914061504, "ts": 1716454223425084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223497769, "dur": 21, "args": { "External id": 135058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135058, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135058, "pid": 5, "tid": 7, "ts": 1716454223497769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425098, "dur": 12, "args": { "External id": 135058, "cbid": 211, "correlation": 135058 } }, { "ph": "s", "id": 135058, "pid": 76337, "tid": -914061504, "ts": 1716454223425098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223497791, "dur": 43, "args": { "External id": 135060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135060, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135060, "pid": 5, "tid": 7, "ts": 1716454223497791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425113, "dur": 5, "args": { "External id": 135060, "cbid": 211, "correlation": 135060 } }, { "ph": "s", "id": 135060, "pid": 76337, "tid": -914061504, "ts": 1716454223425113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223497836, "dur": 639, "args": { "External id": 135062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135062, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135062, "pid": 5, "tid": 7, "ts": 1716454223497836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425124, "dur": 6, "args": { "External id": 135062, "cbid": 211, "correlation": 135062 } }, { "ph": "s", "id": 135062, "pid": 76337, "tid": -914061504, "ts": 1716454223425124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223498476, "dur": 21, "args": { "External id": 135064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135064, "pid": 5, "tid": 7, "ts": 1716454223498476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425133, "dur": 5, "args": { "External id": 135064, "cbid": 211, "correlation": 135064 } }, { "ph": "s", "id": 135064, "pid": 76337, "tid": -914061504, "ts": 1716454223425133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223498498, "dur": 32, "args": { "External id": 135070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135070, "pid": 5, "tid": 7, "ts": 1716454223498498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425162, "dur": 8, "args": { "External id": 135070, "cbid": 211, "correlation": 135070 } }, { "ph": "s", "id": 135070, "pid": 76337, "tid": -914061504, "ts": 1716454223425162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223425220, "dur": 0, "args": { "External id": 135080, "cbid": 317, "correlation": 135080 } }, { "ph": "f", "id": 135080, "pid": 76337, "tid": -914061504, "ts": 1716454223425220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223425220, "dur": 0, "args": { "External id": 135081, "cbid": 203, "correlation": 135081 } }, { "ph": "f", "id": 135081, "pid": 76337, "tid": -914061504, "ts": 1716454223425220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223425221, "dur": 0, "args": { "External id": 135082, "cbid": 205, "correlation": 135082 } }, { "ph": "f", "id": 135082, "pid": 76337, "tid": -914061504, "ts": 1716454223425221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223498531, "dur": 38, "args": { "External id": 135086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135086, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135086, "pid": 5, "tid": 7, "ts": 1716454223498531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425233, "dur": 11, "args": { "External id": 135086, "cbid": 211, "correlation": 135086 } }, { "ph": "s", "id": 135086, "pid": 76337, "tid": -914061504, "ts": 1716454223425233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223498571, "dur": 187, "args": { "External id": 135088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135088, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135088, "pid": 5, "tid": 7, "ts": 1716454223498571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425250, "dur": 6, "args": { "External id": 135088, "cbid": 211, "correlation": 135088 } }, { "ph": "s", "id": 135088, "pid": 76337, "tid": -914061504, "ts": 1716454223425250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223498759, "dur": 22, "args": { "External id": 135090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135090, "pid": 5, "tid": 7, "ts": 1716454223498759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425260, "dur": 5, "args": { "External id": 135090, "cbid": 211, "correlation": 135090 } }, { "ph": "s", "id": 135090, "pid": 76337, "tid": -914061504, "ts": 1716454223425260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223498782, "dur": 32, "args": { "External id": 135096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135096, "pid": 5, "tid": 7, "ts": 1716454223498782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425285, "dur": 8, "args": { "External id": 135096, "cbid": 211, "correlation": 135096 } }, { "ph": "s", "id": 135096, "pid": 76337, "tid": -914061504, "ts": 1716454223425285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223498815, "dur": 27, "args": { "External id": 135104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135104, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135104, "pid": 5, "tid": 7, "ts": 1716454223498815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425313, "dur": 8, "args": { "External id": 135104, "cbid": 211, "correlation": 135104 } }, { "ph": "s", "id": 135104, "pid": 76337, "tid": -914061504, "ts": 1716454223425313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223498843, "dur": 20, "args": { "External id": 135112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135112, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135112, "pid": 5, "tid": 7, "ts": 1716454223498843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425341, "dur": 9, "args": { "External id": 135112, "cbid": 211, "correlation": 135112 } }, { "ph": "s", "id": 135112, "pid": 76337, "tid": -914061504, "ts": 1716454223425341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223498864, "dur": 30, "args": { "External id": 135132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135132, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 135132, "pid": 5, "tid": 7, "ts": 1716454223498864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425425, "dur": 12, "args": { "External id": 135132, "cbid": 211, "correlation": 135132 } }, { "ph": "s", "id": 135132, "pid": 76337, "tid": -914061504, "ts": 1716454223425425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223498895, "dur": 4, "args": { "External id": 135144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135144, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 135144, "pid": 5, "tid": 7, "ts": 1716454223498895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425447, "dur": 7, "args": { "External id": 135144, "cbid": 211, "correlation": 135144 } }, { "ph": "s", "id": 135144, "pid": 76337, "tid": -914061504, "ts": 1716454223425447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223498900, "dur": 31, "args": { "External id": 135147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135147, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135147, "pid": 5, "tid": 7, "ts": 1716454223498900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425465, "dur": 6, "args": { "External id": 135147, "cbid": 211, "correlation": 135147 } }, { "ph": "s", "id": 135147, "pid": 76337, "tid": -914061504, "ts": 1716454223425465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223425520, "dur": 0, "args": { "External id": 135158, "cbid": 317, "correlation": 135158 } }, { "ph": "f", "id": 135158, "pid": 76337, "tid": -914061504, "ts": 1716454223425520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223425521, "dur": 0, "args": { "External id": 135159, "cbid": 203, "correlation": 135159 } }, { "ph": "f", "id": 135159, "pid": 76337, "tid": -914061504, "ts": 1716454223425521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223425522, "dur": 0, "args": { "External id": 135160, "cbid": 205, "correlation": 135160 } }, { "ph": "f", "id": 135160, "pid": 76337, "tid": -914061504, "ts": 1716454223425522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223498932, "dur": 22, "args": { "External id": 135164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135164, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135164, "pid": 5, "tid": 7, "ts": 1716454223498932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425535, "dur": 12, "args": { "External id": 135164, "cbid": 211, "correlation": 135164 } }, { "ph": "s", "id": 135164, "pid": 76337, "tid": -914061504, "ts": 1716454223425535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223498956, "dur": 103, "args": { "External id": 135166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135166, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135166, "pid": 5, "tid": 7, "ts": 1716454223498956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425553, "dur": 6, "args": { "External id": 135166, "cbid": 211, "correlation": 135166 } }, { "ph": "s", "id": 135166, "pid": 76337, "tid": -914061504, "ts": 1716454223425553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223499060, "dur": 23, "args": { "External id": 135168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135168, "pid": 5, "tid": 7, "ts": 1716454223499060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425563, "dur": 5, "args": { "External id": 135168, "cbid": 211, "correlation": 135168 } }, { "ph": "s", "id": 135168, "pid": 76337, "tid": -914061504, "ts": 1716454223425563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223499084, "dur": 32, "args": { "External id": 135174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135174, "pid": 5, "tid": 7, "ts": 1716454223499084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425590, "dur": 9, "args": { "External id": 135174, "cbid": 211, "correlation": 135174 } }, { "ph": "s", "id": 135174, "pid": 76337, "tid": -914061504, "ts": 1716454223425590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223499118, "dur": 194, "args": { "External id": 135183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135183, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135183, "pid": 5, "tid": 7, "ts": 1716454223499118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425673, "dur": 14, "args": { "External id": 135183, "cbid": 211, "correlation": 135183 } }, { "ph": "s", "id": 135183, "pid": 76337, "tid": -914061504, "ts": 1716454223425673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223499313, "dur": 65, "args": { "External id": 135205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135205, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135205, "pid": 5, "tid": 7, "ts": 1716454223499313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425731, "dur": 10, "args": { "External id": 135205, "cbid": 211, "correlation": 135205 } }, { "ph": "s", "id": 135205, "pid": 76337, "tid": -914061504, "ts": 1716454223425731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223425819, "dur": 1, "args": { "External id": 135216, "cbid": 251, "correlation": 135216 } }, { "ph": "f", "id": 135216, "pid": 76337, "tid": -914061504, "ts": 1716454223425819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223499379, "dur": 152, "args": { "External id": 135217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135217, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135217, "pid": 5, "tid": 7, "ts": 1716454223499379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425825, "dur": 13, "args": { "External id": 135217, "cbid": 211, "correlation": 135217 } }, { "ph": "s", "id": 135217, "pid": 76337, "tid": -914061504, "ts": 1716454223425825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223425894, "dur": 1, "args": { "External id": 135228, "cbid": 251, "correlation": 135228 } }, { "ph": "f", "id": 135228, "pid": 76337, "tid": -914061504, "ts": 1716454223425894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223499532, "dur": 146, "args": { "External id": 135229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135229, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135229, "pid": 5, "tid": 7, "ts": 1716454223499532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425898, "dur": 12, "args": { "External id": 135229, "cbid": 211, "correlation": 135229 } }, { "ph": "s", "id": 135229, "pid": 76337, "tid": -914061504, "ts": 1716454223425898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223425964, "dur": 1, "args": { "External id": 135240, "cbid": 251, "correlation": 135240 } }, { "ph": "f", "id": 135240, "pid": 76337, "tid": -914061504, "ts": 1716454223425964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223499680, "dur": 145, "args": { "External id": 135241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135241, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135241, "pid": 5, "tid": 7, "ts": 1716454223499680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223425968, "dur": 20, "args": { "External id": 135241, "cbid": 211, "correlation": 135241 } }, { "ph": "s", "id": 135241, "pid": 76337, "tid": -914061504, "ts": 1716454223425968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223499826, "dur": 1910, "args": { "External id": 135262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135262, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 135262, "pid": 5, "tid": 7, "ts": 1716454223499826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426059, "dur": 12, "args": { "External id": 135262, "cbid": 211, "correlation": 135262 } }, { "ph": "s", "id": 135262, "pid": 76337, "tid": -914061504, "ts": 1716454223426059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426156, "dur": 1, "args": { "External id": 135280, "cbid": 251, "correlation": 135280 } }, { "ph": "f", "id": 135280, "pid": 76337, "tid": -914061504, "ts": 1716454223426156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223501737, "dur": 147, "args": { "External id": 135282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135282, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 135282, "pid": 5, "tid": 7, "ts": 1716454223501737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426162, "dur": 13, "args": { "External id": 135282, "cbid": 211, "correlation": 135282 } }, { "ph": "s", "id": 135282, "pid": 76337, "tid": -914061504, "ts": 1716454223426162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223501886, "dur": 35, "args": { "External id": 135290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135290, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135290, "pid": 5, "tid": 7, "ts": 1716454223501886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426232, "dur": 12, "args": { "External id": 135290, "cbid": 211, "correlation": 135290 } }, { "ph": "s", "id": 135290, "pid": 76337, "tid": -914061504, "ts": 1716454223426232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223501922, "dur": 51, "args": { "External id": 135298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135298, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135298, "pid": 5, "tid": 7, "ts": 1716454223501922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426271, "dur": 9, "args": { "External id": 135298, "cbid": 211, "correlation": 135298 } }, { "ph": "s", "id": 135298, "pid": 76337, "tid": -914061504, "ts": 1716454223426271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223501974, "dur": 30, "args": { "External id": 135309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135309, "pid": 5, "tid": 7, "ts": 1716454223501974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426342, "dur": 13, "args": { "External id": 135309, "cbid": 211, "correlation": 135309 } }, { "ph": "s", "id": 135309, "pid": 76337, "tid": -914061504, "ts": 1716454223426342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223502005, "dur": 34, "args": { "External id": 135331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135331, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135331, "pid": 5, "tid": 7, "ts": 1716454223502005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426374, "dur": 8, "args": { "External id": 135331, "cbid": 211, "correlation": 135331 } }, { "ph": "s", "id": 135331, "pid": 76337, "tid": -914061504, "ts": 1716454223426374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426458, "dur": 1, "args": { "External id": 135342, "cbid": 251, "correlation": 135342 } }, { "ph": "f", "id": 135342, "pid": 76337, "tid": -914061504, "ts": 1716454223426458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223502041, "dur": 89, "args": { "External id": 135343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135343, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135343, "pid": 5, "tid": 7, "ts": 1716454223502041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426463, "dur": 13, "args": { "External id": 135343, "cbid": 211, "correlation": 135343 } }, { "ph": "s", "id": 135343, "pid": 76337, "tid": -914061504, "ts": 1716454223426463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426532, "dur": 1, "args": { "External id": 135354, "cbid": 251, "correlation": 135354 } }, { "ph": "f", "id": 135354, "pid": 76337, "tid": -914061504, "ts": 1716454223426532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426536, "dur": 0, "args": { "External id": 135355, "cbid": 251, "correlation": 135355 } }, { "ph": "f", "id": 135355, "pid": 76337, "tid": -914061504, "ts": 1716454223426536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223502131, "dur": 11, "args": { "External id": 135356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135356, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 135356, "pid": 5, "tid": 7, "ts": 1716454223502131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426537, "dur": 12, "args": { "External id": 135356, "cbid": 211, "correlation": 135356 } }, { "ph": "s", "id": 135356, "pid": 76337, "tid": -914061504, "ts": 1716454223426537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223502143, "dur": 5, "args": { "External id": 135358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135358, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 135358, "pid": 5, "tid": 7, "ts": 1716454223502143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426551, "dur": 5, "args": { "External id": 135358, "cbid": 211, "correlation": 135358 } }, { "ph": "s", "id": 135358, "pid": 76337, "tid": -914061504, "ts": 1716454223426551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426608, "dur": 1, "args": { "External id": 135369, "cbid": 251, "correlation": 135369 } }, { "ph": "f", "id": 135369, "pid": 76337, "tid": -914061504, "ts": 1716454223426608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426612, "dur": 0, "args": { "External id": 135370, "cbid": 251, "correlation": 135370 } }, { "ph": "f", "id": 135370, "pid": 76337, "tid": -914061504, "ts": 1716454223426612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223502149, "dur": 7, "args": { "External id": 135371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135371, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 135371, "pid": 5, "tid": 7, "ts": 1716454223502149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426613, "dur": 12, "args": { "External id": 135371, "cbid": 211, "correlation": 135371 } }, { "ph": "s", "id": 135371, "pid": 76337, "tid": -914061504, "ts": 1716454223426613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223502157, "dur": 3, "args": { "External id": 135373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135373, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 135373, "pid": 5, "tid": 7, "ts": 1716454223502157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426627, "dur": 5, "args": { "External id": 135373, "cbid": 211, "correlation": 135373 } }, { "ph": "s", "id": 135373, "pid": 76337, "tid": -914061504, "ts": 1716454223426627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223502162, "dur": 89, "args": { "External id": 135394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135394, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 135394, "pid": 5, "tid": 7, "ts": 1716454223502162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426700, "dur": 13, "args": { "External id": 135394, "cbid": 211, "correlation": 135394 } }, { "ph": "s", "id": 135394, "pid": 76337, "tid": -914061504, "ts": 1716454223426700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223426797, "dur": 1, "args": { "External id": 135412, "cbid": 251, "correlation": 135412 } }, { "ph": "f", "id": 135412, "pid": 76337, "tid": -914061504, "ts": 1716454223426797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223502253, "dur": 97, "args": { "External id": 135414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135414, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135414, "pid": 5, "tid": 7, "ts": 1716454223502253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426803, "dur": 13, "args": { "External id": 135414, "cbid": 211, "correlation": 135414 } }, { "ph": "s", "id": 135414, "pid": 76337, "tid": -914061504, "ts": 1716454223426803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223502351, "dur": 19, "args": { "External id": 135422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135422, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135422, "pid": 5, "tid": 7, "ts": 1716454223502351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426872, "dur": 12, "args": { "External id": 135422, "cbid": 211, "correlation": 135422 } }, { "ph": "s", "id": 135422, "pid": 76337, "tid": -914061504, "ts": 1716454223426872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223502371, "dur": 37, "args": { "External id": 135430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135430, "pid": 5, "tid": 7, "ts": 1716454223502371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426913, "dur": 10, "args": { "External id": 135430, "cbid": 211, "correlation": 135430 } }, { "ph": "s", "id": 135430, "pid": 76337, "tid": -914061504, "ts": 1716454223426913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223502410, "dur": 34, "args": { "External id": 135452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135452, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135452, "pid": 5, "tid": 7, "ts": 1716454223502410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223426965, "dur": 19, "args": { "External id": 135452, "cbid": 211, "correlation": 135452 } }, { "ph": "s", "id": 135452, "pid": 76337, "tid": -914061504, "ts": 1716454223426965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223427065, "dur": 1, "args": { "External id": 135468, "cbid": 251, "correlation": 135468 } }, { "ph": "f", "id": 135468, "pid": 76337, "tid": -914061504, "ts": 1716454223427065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223427070, "dur": 0, "args": { "External id": 135470, "cbid": 251, "correlation": 135470 } }, { "ph": "f", "id": 135470, "pid": 76337, "tid": -914061504, "ts": 1716454223427070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223502445, "dur": 529, "args": { "External id": 135471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135471, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 135471, "pid": 5, "tid": 7, "ts": 1716454223502445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427074, "dur": 13, "args": { "External id": 135471, "cbid": 211, "correlation": 135471 } }, { "ph": "s", "id": 135471, "pid": 76337, "tid": -914061504, "ts": 1716454223427074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223502976, "dur": 123, "args": { "External id": 135479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135479, "pid": 5, "tid": 7, "ts": 1716454223502976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427140, "dur": 13, "args": { "External id": 135479, "cbid": 211, "correlation": 135479 } }, { "ph": "s", "id": 135479, "pid": 76337, "tid": -914061504, "ts": 1716454223427140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223503100, "dur": 128, "args": { "External id": 135487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135487, "pid": 5, "tid": 7, "ts": 1716454223503100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427169, "dur": 9, "args": { "External id": 135487, "cbid": 211, "correlation": 135487 } }, { "ph": "s", "id": 135487, "pid": 76337, "tid": -914061504, "ts": 1716454223427169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223427246, "dur": 1, "args": { "External id": 135503, "cbid": 251, "correlation": 135503 } }, { "ph": "f", "id": 135503, "pid": 76337, "tid": -914061504, "ts": 1716454223427246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223503229, "dur": 303, "args": { "External id": 135505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135505, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135505, "pid": 5, "tid": 7, "ts": 1716454223503229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427252, "dur": 12, "args": { "External id": 135505, "cbid": 211, "correlation": 135505 } }, { "ph": "s", "id": 135505, "pid": 76337, "tid": -914061504, "ts": 1716454223427252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223503533, "dur": 27, "args": { "External id": 135513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135513, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135513, "pid": 5, "tid": 7, "ts": 1716454223503533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427294, "dur": 9, "args": { "External id": 135513, "cbid": 211, "correlation": 135513 } }, { "ph": "s", "id": 135513, "pid": 76337, "tid": -914061504, "ts": 1716454223427294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223503561, "dur": 80, "args": { "External id": 135524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135524, "pid": 5, "tid": 7, "ts": 1716454223503561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427360, "dur": 13, "args": { "External id": 135524, "cbid": 211, "correlation": 135524 } }, { "ph": "s", "id": 135524, "pid": 76337, "tid": -914061504, "ts": 1716454223427360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223427425, "dur": 0, "args": { "External id": 135536, "cbid": 317, "correlation": 135536 } }, { "ph": "f", "id": 135536, "pid": 76337, "tid": -914061504, "ts": 1716454223427425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223427426, "dur": 0, "args": { "External id": 135537, "cbid": 203, "correlation": 135537 } }, { "ph": "f", "id": 135537, "pid": 76337, "tid": -914061504, "ts": 1716454223427426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223427427, "dur": 0, "args": { "External id": 135538, "cbid": 205, "correlation": 135538 } }, { "ph": "f", "id": 135538, "pid": 76337, "tid": -914061504, "ts": 1716454223427427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223503642, "dur": 23, "args": { "External id": 135542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135542, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135542, "pid": 5, "tid": 7, "ts": 1716454223503642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427441, "dur": 12, "args": { "External id": 135542, "cbid": 211, "correlation": 135542 } }, { "ph": "s", "id": 135542, "pid": 76337, "tid": -914061504, "ts": 1716454223427441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223503667, "dur": 119, "args": { "External id": 135544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135544, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135544, "pid": 5, "tid": 7, "ts": 1716454223503667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427459, "dur": 6, "args": { "External id": 135544, "cbid": 211, "correlation": 135544 } }, { "ph": "s", "id": 135544, "pid": 76337, "tid": -914061504, "ts": 1716454223427459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223503787, "dur": 23, "args": { "External id": 135546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135546, "pid": 5, "tid": 7, "ts": 1716454223503787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427470, "dur": 6, "args": { "External id": 135546, "cbid": 211, "correlation": 135546 } }, { "ph": "s", "id": 135546, "pid": 76337, "tid": -914061504, "ts": 1716454223427470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223503811, "dur": 33, "args": { "External id": 135552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135552, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135552, "pid": 5, "tid": 7, "ts": 1716454223503811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427498, "dur": 8, "args": { "External id": 135552, "cbid": 211, "correlation": 135552 } }, { "ph": "s", "id": 135552, "pid": 76337, "tid": -914061504, "ts": 1716454223427498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223503845, "dur": 27, "args": { "External id": 135560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135560, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135560, "pid": 5, "tid": 7, "ts": 1716454223503845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427530, "dur": 8, "args": { "External id": 135560, "cbid": 211, "correlation": 135560 } }, { "ph": "s", "id": 135560, "pid": 76337, "tid": -914061504, "ts": 1716454223427530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223503873, "dur": 45, "args": { "External id": 135569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135569, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135569, "pid": 5, "tid": 7, "ts": 1716454223503873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427567, "dur": 11, "args": { "External id": 135569, "cbid": 211, "correlation": 135569 } }, { "ph": "s", "id": 135569, "pid": 76337, "tid": -914061504, "ts": 1716454223427567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223503919, "dur": 42, "args": { "External id": 135589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135589, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 135589, "pid": 5, "tid": 7, "ts": 1716454223503919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427638, "dur": 11, "args": { "External id": 135589, "cbid": 211, "correlation": 135589 } }, { "ph": "s", "id": 135589, "pid": 76337, "tid": -914061504, "ts": 1716454223427638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223503962, "dur": 5, "args": { "External id": 135601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135601, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 135601, "pid": 5, "tid": 7, "ts": 1716454223503962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427659, "dur": 6, "args": { "External id": 135601, "cbid": 211, "correlation": 135601 } }, { "ph": "s", "id": 135601, "pid": 76337, "tid": -914061504, "ts": 1716454223427659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223503968, "dur": 43, "args": { "External id": 135604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135604, "pid": 5, "tid": 7, "ts": 1716454223503968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427679, "dur": 6, "args": { "External id": 135604, "cbid": 211, "correlation": 135604 } }, { "ph": "s", "id": 135604, "pid": 76337, "tid": -914061504, "ts": 1716454223427679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223504013, "dur": 29, "args": { "External id": 135613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135613, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135613, "pid": 5, "tid": 7, "ts": 1716454223504013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427717, "dur": 11, "args": { "External id": 135613, "cbid": 211, "correlation": 135613 } }, { "ph": "s", "id": 135613, "pid": 76337, "tid": -914061504, "ts": 1716454223427717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223427769, "dur": 0, "args": { "External id": 135623, "cbid": 317, "correlation": 135623 } }, { "ph": "f", "id": 135623, "pid": 76337, "tid": -914061504, "ts": 1716454223427769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223427770, "dur": 0, "args": { "External id": 135624, "cbid": 203, "correlation": 135624 } }, { "ph": "f", "id": 135624, "pid": 76337, "tid": -914061504, "ts": 1716454223427770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223427771, "dur": 0, "args": { "External id": 135625, "cbid": 205, "correlation": 135625 } }, { "ph": "f", "id": 135625, "pid": 76337, "tid": -914061504, "ts": 1716454223427771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223504043, "dur": 30, "args": { "External id": 135629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135629, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135629, "pid": 5, "tid": 7, "ts": 1716454223504043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427786, "dur": 12, "args": { "External id": 135629, "cbid": 211, "correlation": 135629 } }, { "ph": "s", "id": 135629, "pid": 76337, "tid": -914061504, "ts": 1716454223427786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223504074, "dur": 63, "args": { "External id": 135631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135631, "pid": 5, "tid": 7, "ts": 1716454223504074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427800, "dur": 5, "args": { "External id": 135631, "cbid": 211, "correlation": 135631 } }, { "ph": "s", "id": 135631, "pid": 76337, "tid": -914061504, "ts": 1716454223427800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223504138, "dur": 958, "args": { "External id": 135633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135633, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135633, "pid": 5, "tid": 7, "ts": 1716454223504138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427811, "dur": 6, "args": { "External id": 135633, "cbid": 211, "correlation": 135633 } }, { "ph": "s", "id": 135633, "pid": 76337, "tid": -914061504, "ts": 1716454223427811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223505097, "dur": 20, "args": { "External id": 135635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135635, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135635, "pid": 5, "tid": 7, "ts": 1716454223505097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427821, "dur": 6, "args": { "External id": 135635, "cbid": 211, "correlation": 135635 } }, { "ph": "s", "id": 135635, "pid": 76337, "tid": -914061504, "ts": 1716454223427821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223505119, "dur": 32, "args": { "External id": 135641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135641, "pid": 5, "tid": 7, "ts": 1716454223505119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427849, "dur": 8, "args": { "External id": 135641, "cbid": 211, "correlation": 135641 } }, { "ph": "s", "id": 135641, "pid": 76337, "tid": -914061504, "ts": 1716454223427849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223505153, "dur": 3, "args": { "External id": 135649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135649, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 135649, "pid": 5, "tid": 7, "ts": 1716454223505153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427892, "dur": 9, "args": { "External id": 135649, "cbid": 211, "correlation": 135649 } }, { "ph": "s", "id": 135649, "pid": 76337, "tid": -914061504, "ts": 1716454223427892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223427956, "dur": 1, "args": { "External id": 135665, "cbid": 251, "correlation": 135665 } }, { "ph": "f", "id": 135665, "pid": 76337, "tid": -914061504, "ts": 1716454223427956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223427962, "dur": 0, "args": { "External id": 135667, "cbid": 251, "correlation": 135667 } }, { "ph": "f", "id": 135667, "pid": 76337, "tid": -914061504, "ts": 1716454223427962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223505157, "dur": 12, "args": { "External id": 135668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135668, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 135668, "pid": 5, "tid": 7, "ts": 1716454223505157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427963, "dur": 19, "args": { "External id": 135668, "cbid": 211, "correlation": 135668 } }, { "ph": "s", "id": 135668, "pid": 76337, "tid": -914061504, "ts": 1716454223427963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223505171, "dur": 5, "args": { "External id": 135670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135670, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 135670, "pid": 5, "tid": 7, "ts": 1716454223505171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223427985, "dur": 6, "args": { "External id": 135670, "cbid": 211, "correlation": 135670 } }, { "ph": "s", "id": 135670, "pid": 76337, "tid": -914061504, "ts": 1716454223427985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223505177, "dur": 29, "args": { "External id": 135680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135680, "pid": 5, "tid": 7, "ts": 1716454223505177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428043, "dur": 12, "args": { "External id": 135680, "cbid": 211, "correlation": 135680 } }, { "ph": "s", "id": 135680, "pid": 76337, "tid": -914061504, "ts": 1716454223428043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223505207, "dur": 31, "args": { "External id": 135700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135700, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 135700, "pid": 5, "tid": 7, "ts": 1716454223505207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428108, "dur": 11, "args": { "External id": 135700, "cbid": 211, "correlation": 135700 } }, { "ph": "s", "id": 135700, "pid": 76337, "tid": -914061504, "ts": 1716454223428108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223505239, "dur": 4, "args": { "External id": 135712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135712, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 135712, "pid": 5, "tid": 7, "ts": 1716454223505239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428128, "dur": 7, "args": { "External id": 135712, "cbid": 211, "correlation": 135712 } }, { "ph": "s", "id": 135712, "pid": 76337, "tid": -914061504, "ts": 1716454223428128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223505244, "dur": 29, "args": { "External id": 135715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135715, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135715, "pid": 5, "tid": 7, "ts": 1716454223505244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428147, "dur": 6, "args": { "External id": 135715, "cbid": 211, "correlation": 135715 } }, { "ph": "s", "id": 135715, "pid": 76337, "tid": -914061504, "ts": 1716454223428147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223505275, "dur": 20, "args": { "External id": 135724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135724, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135724, "pid": 5, "tid": 7, "ts": 1716454223505275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428188, "dur": 9, "args": { "External id": 135724, "cbid": 211, "correlation": 135724 } }, { "ph": "s", "id": 135724, "pid": 76337, "tid": -914061504, "ts": 1716454223428188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223428250, "dur": 0, "args": { "External id": 135734, "cbid": 317, "correlation": 135734 } }, { "ph": "f", "id": 135734, "pid": 76337, "tid": -914061504, "ts": 1716454223428250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223428251, "dur": 0, "args": { "External id": 135735, "cbid": 203, "correlation": 135735 } }, { "ph": "f", "id": 135735, "pid": 76337, "tid": -914061504, "ts": 1716454223428251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223428251, "dur": 0, "args": { "External id": 135736, "cbid": 205, "correlation": 135736 } }, { "ph": "f", "id": 135736, "pid": 76337, "tid": -914061504, "ts": 1716454223428251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223505296, "dur": 23, "args": { "External id": 135740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135740, "pid": 5, "tid": 7, "ts": 1716454223505296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428265, "dur": 12, "args": { "External id": 135740, "cbid": 211, "correlation": 135740 } }, { "ph": "s", "id": 135740, "pid": 76337, "tid": -914061504, "ts": 1716454223428265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223505320, "dur": 43, "args": { "External id": 135742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135742, "pid": 5, "tid": 7, "ts": 1716454223505320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428279, "dur": 6, "args": { "External id": 135742, "cbid": 211, "correlation": 135742 } }, { "ph": "s", "id": 135742, "pid": 76337, "tid": -914061504, "ts": 1716454223428279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223505365, "dur": 638, "args": { "External id": 135744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135744, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135744, "pid": 5, "tid": 7, "ts": 1716454223505365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428291, "dur": 6, "args": { "External id": 135744, "cbid": 211, "correlation": 135744 } }, { "ph": "s", "id": 135744, "pid": 76337, "tid": -914061504, "ts": 1716454223428291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223506004, "dur": 21, "args": { "External id": 135746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135746, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135746, "pid": 5, "tid": 7, "ts": 1716454223506004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428301, "dur": 5, "args": { "External id": 135746, "cbid": 211, "correlation": 135746 } }, { "ph": "s", "id": 135746, "pid": 76337, "tid": -914061504, "ts": 1716454223428301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223506026, "dur": 33, "args": { "External id": 135752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135752, "pid": 5, "tid": 7, "ts": 1716454223506026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428329, "dur": 9, "args": { "External id": 135752, "cbid": 211, "correlation": 135752 } }, { "ph": "s", "id": 135752, "pid": 76337, "tid": -914061504, "ts": 1716454223428329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223428387, "dur": 0, "args": { "External id": 135762, "cbid": 317, "correlation": 135762 } }, { "ph": "f", "id": 135762, "pid": 76337, "tid": -914061504, "ts": 1716454223428387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223428387, "dur": 0, "args": { "External id": 135763, "cbid": 203, "correlation": 135763 } }, { "ph": "f", "id": 135763, "pid": 76337, "tid": -914061504, "ts": 1716454223428387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223428388, "dur": 0, "args": { "External id": 135764, "cbid": 205, "correlation": 135764 } }, { "ph": "f", "id": 135764, "pid": 76337, "tid": -914061504, "ts": 1716454223428388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223506061, "dur": 30, "args": { "External id": 135768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135768, "pid": 5, "tid": 7, "ts": 1716454223506061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428402, "dur": 12, "args": { "External id": 135768, "cbid": 211, "correlation": 135768 } }, { "ph": "s", "id": 135768, "pid": 76337, "tid": -914061504, "ts": 1716454223428402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223506092, "dur": 150, "args": { "External id": 135770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135770, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135770, "pid": 5, "tid": 7, "ts": 1716454223506092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428420, "dur": 6, "args": { "External id": 135770, "cbid": 211, "correlation": 135770 } }, { "ph": "s", "id": 135770, "pid": 76337, "tid": -914061504, "ts": 1716454223428420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223506243, "dur": 20, "args": { "External id": 135772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135772, "pid": 5, "tid": 7, "ts": 1716454223506243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428430, "dur": 6, "args": { "External id": 135772, "cbid": 211, "correlation": 135772 } }, { "ph": "s", "id": 135772, "pid": 76337, "tid": -914061504, "ts": 1716454223428430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223506264, "dur": 32, "args": { "External id": 135778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135778, "pid": 5, "tid": 7, "ts": 1716454223506264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428456, "dur": 8, "args": { "External id": 135778, "cbid": 211, "correlation": 135778 } }, { "ph": "s", "id": 135778, "pid": 76337, "tid": -914061504, "ts": 1716454223428456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223506297, "dur": 27, "args": { "External id": 135786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135786, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135786, "pid": 5, "tid": 7, "ts": 1716454223506297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428484, "dur": 8, "args": { "External id": 135786, "cbid": 211, "correlation": 135786 } }, { "ph": "s", "id": 135786, "pid": 76337, "tid": -914061504, "ts": 1716454223428484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223506326, "dur": 20, "args": { "External id": 135794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135794, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135794, "pid": 5, "tid": 7, "ts": 1716454223506326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428512, "dur": 8, "args": { "External id": 135794, "cbid": 211, "correlation": 135794 } }, { "ph": "s", "id": 135794, "pid": 76337, "tid": -914061504, "ts": 1716454223428512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223506347, "dur": 29, "args": { "External id": 135814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135814, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 135814, "pid": 5, "tid": 7, "ts": 1716454223506347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428595, "dur": 12, "args": { "External id": 135814, "cbid": 211, "correlation": 135814 } }, { "ph": "s", "id": 135814, "pid": 76337, "tid": -914061504, "ts": 1716454223428595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223506378, "dur": 5, "args": { "External id": 135826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135826, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 135826, "pid": 5, "tid": 7, "ts": 1716454223506378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428617, "dur": 6, "args": { "External id": 135826, "cbid": 211, "correlation": 135826 } }, { "ph": "s", "id": 135826, "pid": 76337, "tid": -914061504, "ts": 1716454223428617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223506384, "dur": 30, "args": { "External id": 135829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135829, "pid": 5, "tid": 7, "ts": 1716454223506384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428635, "dur": 6, "args": { "External id": 135829, "cbid": 211, "correlation": 135829 } }, { "ph": "s", "id": 135829, "pid": 76337, "tid": -914061504, "ts": 1716454223428635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223428691, "dur": 0, "args": { "External id": 135840, "cbid": 317, "correlation": 135840 } }, { "ph": "f", "id": 135840, "pid": 76337, "tid": -914061504, "ts": 1716454223428691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223428692, "dur": 0, "args": { "External id": 135841, "cbid": 203, "correlation": 135841 } }, { "ph": "f", "id": 135841, "pid": 76337, "tid": -914061504, "ts": 1716454223428692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223428693, "dur": 0, "args": { "External id": 135842, "cbid": 205, "correlation": 135842 } }, { "ph": "f", "id": 135842, "pid": 76337, "tid": -914061504, "ts": 1716454223428693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223506415, "dur": 22, "args": { "External id": 135846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135846, "pid": 5, "tid": 7, "ts": 1716454223506415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428706, "dur": 12, "args": { "External id": 135846, "cbid": 211, "correlation": 135846 } }, { "ph": "s", "id": 135846, "pid": 76337, "tid": -914061504, "ts": 1716454223428706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223506438, "dur": 103, "args": { "External id": 135848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135848, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135848, "pid": 5, "tid": 7, "ts": 1716454223506438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428724, "dur": 6, "args": { "External id": 135848, "cbid": 211, "correlation": 135848 } }, { "ph": "s", "id": 135848, "pid": 76337, "tid": -914061504, "ts": 1716454223428724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223506543, "dur": 21, "args": { "External id": 135850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135850, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135850, "pid": 5, "tid": 7, "ts": 1716454223506543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428733, "dur": 6, "args": { "External id": 135850, "cbid": 211, "correlation": 135850 } }, { "ph": "s", "id": 135850, "pid": 76337, "tid": -914061504, "ts": 1716454223428733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223506565, "dur": 32, "args": { "External id": 135856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135856, "pid": 5, "tid": 7, "ts": 1716454223506565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428761, "dur": 8, "args": { "External id": 135856, "cbid": 211, "correlation": 135856 } }, { "ph": "s", "id": 135856, "pid": 76337, "tid": -914061504, "ts": 1716454223428761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223506598, "dur": 184, "args": { "External id": 135865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135865, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135865, "pid": 5, "tid": 7, "ts": 1716454223506598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428844, "dur": 14, "args": { "External id": 135865, "cbid": 211, "correlation": 135865 } }, { "ph": "s", "id": 135865, "pid": 76337, "tid": -914061504, "ts": 1716454223428844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223506784, "dur": 64, "args": { "External id": 135887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135887, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135887, "pid": 5, "tid": 7, "ts": 1716454223506784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223428901, "dur": 10, "args": { "External id": 135887, "cbid": 211, "correlation": 135887 } }, { "ph": "s", "id": 135887, "pid": 76337, "tid": -914061504, "ts": 1716454223428901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223428995, "dur": 1, "args": { "External id": 135898, "cbid": 251, "correlation": 135898 } }, { "ph": "f", "id": 135898, "pid": 76337, "tid": -914061504, "ts": 1716454223428995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223506849, "dur": 151, "args": { "External id": 135899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135899, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135899, "pid": 5, "tid": 7, "ts": 1716454223506849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429001, "dur": 13, "args": { "External id": 135899, "cbid": 211, "correlation": 135899 } }, { "ph": "s", "id": 135899, "pid": 76337, "tid": -914061504, "ts": 1716454223429001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429072, "dur": 1, "args": { "External id": 135910, "cbid": 251, "correlation": 135910 } }, { "ph": "f", "id": 135910, "pid": 76337, "tid": -914061504, "ts": 1716454223429072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223507002, "dur": 142, "args": { "External id": 135911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135911, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135911, "pid": 5, "tid": 7, "ts": 1716454223507002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429077, "dur": 12, "args": { "External id": 135911, "cbid": 211, "correlation": 135911 } }, { "ph": "s", "id": 135911, "pid": 76337, "tid": -914061504, "ts": 1716454223429077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429143, "dur": 1, "args": { "External id": 135922, "cbid": 251, "correlation": 135922 } }, { "ph": "f", "id": 135922, "pid": 76337, "tid": -914061504, "ts": 1716454223429143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223507146, "dur": 144, "args": { "External id": 135923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135923, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 135923, "pid": 5, "tid": 7, "ts": 1716454223507146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429147, "dur": 11, "args": { "External id": 135923, "cbid": 211, "correlation": 135923 } }, { "ph": "s", "id": 135923, "pid": 76337, "tid": -914061504, "ts": 1716454223429147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223507292, "dur": 1908, "args": { "External id": 135944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135944, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 135944, "pid": 5, "tid": 7, "ts": 1716454223507292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429227, "dur": 13, "args": { "External id": 135944, "cbid": 211, "correlation": 135944 } }, { "ph": "s", "id": 135944, "pid": 76337, "tid": -914061504, "ts": 1716454223429227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429324, "dur": 1, "args": { "External id": 135962, "cbid": 251, "correlation": 135962 } }, { "ph": "f", "id": 135962, "pid": 76337, "tid": -914061504, "ts": 1716454223429324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223509200, "dur": 147, "args": { "External id": 135964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135964, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 135964, "pid": 5, "tid": 7, "ts": 1716454223509200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429330, "dur": 14, "args": { "External id": 135964, "cbid": 211, "correlation": 135964 } }, { "ph": "s", "id": 135964, "pid": 76337, "tid": -914061504, "ts": 1716454223429330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223509349, "dur": 35, "args": { "External id": 135972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135972, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135972, "pid": 5, "tid": 7, "ts": 1716454223509349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429401, "dur": 12, "args": { "External id": 135972, "cbid": 211, "correlation": 135972 } }, { "ph": "s", "id": 135972, "pid": 76337, "tid": -914061504, "ts": 1716454223429401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223509385, "dur": 50, "args": { "External id": 135980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135980, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135980, "pid": 5, "tid": 7, "ts": 1716454223509385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429440, "dur": 9, "args": { "External id": 135980, "cbid": 211, "correlation": 135980 } }, { "ph": "s", "id": 135980, "pid": 76337, "tid": -914061504, "ts": 1716454223429440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223509437, "dur": 29, "args": { "External id": 135991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 135991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 135991, "pid": 5, "tid": 7, "ts": 1716454223509437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429510, "dur": 13, "args": { "External id": 135991, "cbid": 211, "correlation": 135991 } }, { "ph": "s", "id": 135991, "pid": 76337, "tid": -914061504, "ts": 1716454223429510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223509468, "dur": 35, "args": { "External id": 136013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136013, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136013, "pid": 5, "tid": 7, "ts": 1716454223509468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429541, "dur": 8, "args": { "External id": 136013, "cbid": 211, "correlation": 136013 } }, { "ph": "s", "id": 136013, "pid": 76337, "tid": -914061504, "ts": 1716454223429541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429626, "dur": 1, "args": { "External id": 136024, "cbid": 251, "correlation": 136024 } }, { "ph": "f", "id": 136024, "pid": 76337, "tid": -914061504, "ts": 1716454223429626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223509504, "dur": 87, "args": { "External id": 136025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136025, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136025, "pid": 5, "tid": 7, "ts": 1716454223509504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429631, "dur": 13, "args": { "External id": 136025, "cbid": 211, "correlation": 136025 } }, { "ph": "s", "id": 136025, "pid": 76337, "tid": -914061504, "ts": 1716454223429631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429699, "dur": 1, "args": { "External id": 136036, "cbid": 251, "correlation": 136036 } }, { "ph": "f", "id": 136036, "pid": 76337, "tid": -914061504, "ts": 1716454223429699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429703, "dur": 0, "args": { "External id": 136037, "cbid": 251, "correlation": 136037 } }, { "ph": "f", "id": 136037, "pid": 76337, "tid": -914061504, "ts": 1716454223429703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223509592, "dur": 11, "args": { "External id": 136038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136038, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 136038, "pid": 5, "tid": 7, "ts": 1716454223509592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429705, "dur": 12, "args": { "External id": 136038, "cbid": 211, "correlation": 136038 } }, { "ph": "s", "id": 136038, "pid": 76337, "tid": -914061504, "ts": 1716454223429705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223509605, "dur": 5, "args": { "External id": 136040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136040, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 136040, "pid": 5, "tid": 7, "ts": 1716454223509605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429718, "dur": 6, "args": { "External id": 136040, "cbid": 211, "correlation": 136040 } }, { "ph": "s", "id": 136040, "pid": 76337, "tid": -914061504, "ts": 1716454223429718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429776, "dur": 1, "args": { "External id": 136051, "cbid": 251, "correlation": 136051 } }, { "ph": "f", "id": 136051, "pid": 76337, "tid": -914061504, "ts": 1716454223429776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429779, "dur": 0, "args": { "External id": 136052, "cbid": 251, "correlation": 136052 } }, { "ph": "f", "id": 136052, "pid": 76337, "tid": -914061504, "ts": 1716454223429779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223509611, "dur": 7, "args": { "External id": 136053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136053, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 136053, "pid": 5, "tid": 7, "ts": 1716454223509611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429780, "dur": 12, "args": { "External id": 136053, "cbid": 211, "correlation": 136053 } }, { "ph": "s", "id": 136053, "pid": 76337, "tid": -914061504, "ts": 1716454223429780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223509619, "dur": 4, "args": { "External id": 136055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136055, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 136055, "pid": 5, "tid": 7, "ts": 1716454223509619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429794, "dur": 5, "args": { "External id": 136055, "cbid": 211, "correlation": 136055 } }, { "ph": "s", "id": 136055, "pid": 76337, "tid": -914061504, "ts": 1716454223429794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223509624, "dur": 90, "args": { "External id": 136076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136076, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136076, "pid": 5, "tid": 7, "ts": 1716454223509624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429866, "dur": 12, "args": { "External id": 136076, "cbid": 211, "correlation": 136076 } }, { "ph": "s", "id": 136076, "pid": 76337, "tid": -914061504, "ts": 1716454223429866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223429963, "dur": 1, "args": { "External id": 136094, "cbid": 251, "correlation": 136094 } }, { "ph": "f", "id": 136094, "pid": 76337, "tid": -914061504, "ts": 1716454223429963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223509716, "dur": 97, "args": { "External id": 136096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136096, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136096, "pid": 5, "tid": 7, "ts": 1716454223509716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223429969, "dur": 20, "args": { "External id": 136096, "cbid": 211, "correlation": 136096 } }, { "ph": "s", "id": 136096, "pid": 76337, "tid": -914061504, "ts": 1716454223429969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223509814, "dur": 19, "args": { "External id": 136104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136104, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136104, "pid": 5, "tid": 7, "ts": 1716454223509814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430045, "dur": 12, "args": { "External id": 136104, "cbid": 211, "correlation": 136104 } }, { "ph": "s", "id": 136104, "pid": 76337, "tid": -914061504, "ts": 1716454223430045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223509834, "dur": 38, "args": { "External id": 136112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136112, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136112, "pid": 5, "tid": 7, "ts": 1716454223509834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430087, "dur": 10, "args": { "External id": 136112, "cbid": 211, "correlation": 136112 } }, { "ph": "s", "id": 136112, "pid": 76337, "tid": -914061504, "ts": 1716454223430087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223509874, "dur": 34, "args": { "External id": 136134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136134, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136134, "pid": 5, "tid": 7, "ts": 1716454223509874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430137, "dur": 11, "args": { "External id": 136134, "cbid": 211, "correlation": 136134 } }, { "ph": "s", "id": 136134, "pid": 76337, "tid": -914061504, "ts": 1716454223430137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223430226, "dur": 1, "args": { "External id": 136150, "cbid": 251, "correlation": 136150 } }, { "ph": "f", "id": 136150, "pid": 76337, "tid": -914061504, "ts": 1716454223430226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223430231, "dur": 0, "args": { "External id": 136152, "cbid": 251, "correlation": 136152 } }, { "ph": "f", "id": 136152, "pid": 76337, "tid": -914061504, "ts": 1716454223430231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223509909, "dur": 532, "args": { "External id": 136153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136153, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 136153, "pid": 5, "tid": 7, "ts": 1716454223509909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430234, "dur": 13, "args": { "External id": 136153, "cbid": 211, "correlation": 136153 } }, { "ph": "s", "id": 136153, "pid": 76337, "tid": -914061504, "ts": 1716454223430234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223510443, "dur": 124, "args": { "External id": 136161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136161, "pid": 5, "tid": 7, "ts": 1716454223510443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430299, "dur": 12, "args": { "External id": 136161, "cbid": 211, "correlation": 136161 } }, { "ph": "s", "id": 136161, "pid": 76337, "tid": -914061504, "ts": 1716454223430299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223510569, "dur": 129, "args": { "External id": 136169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136169, "pid": 5, "tid": 7, "ts": 1716454223510569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430328, "dur": 8, "args": { "External id": 136169, "cbid": 211, "correlation": 136169 } }, { "ph": "s", "id": 136169, "pid": 76337, "tid": -914061504, "ts": 1716454223430328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223430405, "dur": 1, "args": { "External id": 136185, "cbid": 251, "correlation": 136185 } }, { "ph": "f", "id": 136185, "pid": 76337, "tid": -914061504, "ts": 1716454223430405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223510700, "dur": 303, "args": { "External id": 136187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136187, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136187, "pid": 5, "tid": 7, "ts": 1716454223510700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430411, "dur": 12, "args": { "External id": 136187, "cbid": 211, "correlation": 136187 } }, { "ph": "s", "id": 136187, "pid": 76337, "tid": -914061504, "ts": 1716454223430411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223511004, "dur": 27, "args": { "External id": 136195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136195, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136195, "pid": 5, "tid": 7, "ts": 1716454223511004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430452, "dur": 10, "args": { "External id": 136195, "cbid": 211, "correlation": 136195 } }, { "ph": "s", "id": 136195, "pid": 76337, "tid": -914061504, "ts": 1716454223430452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223511032, "dur": 80, "args": { "External id": 136206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136206, "pid": 5, "tid": 7, "ts": 1716454223511032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430519, "dur": 12, "args": { "External id": 136206, "cbid": 211, "correlation": 136206 } }, { "ph": "s", "id": 136206, "pid": 76337, "tid": -914061504, "ts": 1716454223430519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223430582, "dur": 0, "args": { "External id": 136218, "cbid": 317, "correlation": 136218 } }, { "ph": "f", "id": 136218, "pid": 76337, "tid": -914061504, "ts": 1716454223430582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223430583, "dur": 0, "args": { "External id": 136219, "cbid": 203, "correlation": 136219 } }, { "ph": "f", "id": 136219, "pid": 76337, "tid": -914061504, "ts": 1716454223430583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223430584, "dur": 0, "args": { "External id": 136220, "cbid": 205, "correlation": 136220 } }, { "ph": "f", "id": 136220, "pid": 76337, "tid": -914061504, "ts": 1716454223430584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223511113, "dur": 24, "args": { "External id": 136224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136224, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136224, "pid": 5, "tid": 7, "ts": 1716454223511113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430599, "dur": 12, "args": { "External id": 136224, "cbid": 211, "correlation": 136224 } }, { "ph": "s", "id": 136224, "pid": 76337, "tid": -914061504, "ts": 1716454223430599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223511138, "dur": 119, "args": { "External id": 136226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136226, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136226, "pid": 5, "tid": 7, "ts": 1716454223511138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430618, "dur": 6, "args": { "External id": 136226, "cbid": 211, "correlation": 136226 } }, { "ph": "s", "id": 136226, "pid": 76337, "tid": -914061504, "ts": 1716454223430618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223511258, "dur": 23, "args": { "External id": 136228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136228, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136228, "pid": 5, "tid": 7, "ts": 1716454223511258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430628, "dur": 5, "args": { "External id": 136228, "cbid": 211, "correlation": 136228 } }, { "ph": "s", "id": 136228, "pid": 76337, "tid": -914061504, "ts": 1716454223430628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223511283, "dur": 33, "args": { "External id": 136234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136234, "pid": 5, "tid": 7, "ts": 1716454223511283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430656, "dur": 9, "args": { "External id": 136234, "cbid": 211, "correlation": 136234 } }, { "ph": "s", "id": 136234, "pid": 76337, "tid": -914061504, "ts": 1716454223430656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223511316, "dur": 27, "args": { "External id": 136242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136242, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136242, "pid": 5, "tid": 7, "ts": 1716454223511316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430687, "dur": 9, "args": { "External id": 136242, "cbid": 211, "correlation": 136242 } }, { "ph": "s", "id": 136242, "pid": 76337, "tid": -914061504, "ts": 1716454223430687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223511344, "dur": 101, "args": { "External id": 136253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136253, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136253, "pid": 5, "tid": 7, "ts": 1716454223511344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430749, "dur": 11, "args": { "External id": 136253, "cbid": 211, "correlation": 136253 } }, { "ph": "s", "id": 136253, "pid": 76337, "tid": -914061504, "ts": 1716454223430749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223430805, "dur": 0, "args": { "External id": 136263, "cbid": 317, "correlation": 136263 } }, { "ph": "f", "id": 136263, "pid": 76337, "tid": -914061504, "ts": 1716454223430805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223430805, "dur": 0, "args": { "External id": 136264, "cbid": 203, "correlation": 136264 } }, { "ph": "f", "id": 136264, "pid": 76337, "tid": -914061504, "ts": 1716454223430805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223430806, "dur": 0, "args": { "External id": 136265, "cbid": 205, "correlation": 136265 } }, { "ph": "f", "id": 136265, "pid": 76337, "tid": -914061504, "ts": 1716454223430806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223511446, "dur": 74, "args": { "External id": 136269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136269, "pid": 5, "tid": 7, "ts": 1716454223511446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430820, "dur": 11, "args": { "External id": 136269, "cbid": 211, "correlation": 136269 } }, { "ph": "s", "id": 136269, "pid": 76337, "tid": -914061504, "ts": 1716454223430820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223511522, "dur": 43, "args": { "External id": 136271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136271, "pid": 5, "tid": 7, "ts": 1716454223511522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430834, "dur": 5, "args": { "External id": 136271, "cbid": 211, "correlation": 136271 } }, { "ph": "s", "id": 136271, "pid": 76337, "tid": -914061504, "ts": 1716454223430834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223511567, "dur": 4, "args": { "External id": 136273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136273, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136273, "pid": 5, "tid": 7, "ts": 1716454223511567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430845, "dur": 6, "args": { "External id": 136273, "cbid": 211, "correlation": 136273 } }, { "ph": "s", "id": 136273, "pid": 76337, "tid": -914061504, "ts": 1716454223430845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223430854, "dur": 0, "args": { "External id": 136274, "cbid": 51, "correlation": 136274 } }, { "ph": "s", "id": 136274, "pid": 76337, "tid": -914061504, "ts": 1716454223430854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223511572, "dur": 2226, "args": { "External id": 136275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136275, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136275, "pid": 5, "tid": 7, "ts": 1716454223511572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430855, "dur": 5, "args": { "External id": 136275, "cbid": 211, "correlation": 136275 } }, { "ph": "s", "id": 136275, "pid": 76337, "tid": -914061504, "ts": 1716454223430855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223513799, "dur": 112, "args": { "External id": 136280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136280, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136280, "pid": 5, "tid": 7, "ts": 1716454223513799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430883, "dur": 8, "args": { "External id": 136280, "cbid": 211, "correlation": 136280 } }, { "ph": "s", "id": 136280, "pid": 76337, "tid": -914061504, "ts": 1716454223430883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223513912, "dur": 168, "args": { "External id": 136289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136289, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136289, "pid": 5, "tid": 7, "ts": 1716454223513912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223430984, "dur": 15, "args": { "External id": 136289, "cbid": 211, "correlation": 136289 } }, { "ph": "s", "id": 136289, "pid": 76337, "tid": -914061504, "ts": 1716454223430984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223514082, "dur": 129, "args": { "External id": 136309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136309, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 136309, "pid": 5, "tid": 7, "ts": 1716454223514082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431057, "dur": 11, "args": { "External id": 136309, "cbid": 211, "correlation": 136309 } }, { "ph": "s", "id": 136309, "pid": 76337, "tid": -914061504, "ts": 1716454223431057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223514212, "dur": 5, "args": { "External id": 136321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136321, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 136321, "pid": 5, "tid": 7, "ts": 1716454223514212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431078, "dur": 6, "args": { "External id": 136321, "cbid": 211, "correlation": 136321 } }, { "ph": "s", "id": 136321, "pid": 76337, "tid": -914061504, "ts": 1716454223431078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223514218, "dur": 156, "args": { "External id": 136324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136324, "pid": 5, "tid": 7, "ts": 1716454223514218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431097, "dur": 7, "args": { "External id": 136324, "cbid": 211, "correlation": 136324 } }, { "ph": "s", "id": 136324, "pid": 76337, "tid": -914061504, "ts": 1716454223431097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223514376, "dur": 101, "args": { "External id": 136333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136333, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136333, "pid": 5, "tid": 7, "ts": 1716454223514376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431136, "dur": 10, "args": { "External id": 136333, "cbid": 211, "correlation": 136333 } }, { "ph": "s", "id": 136333, "pid": 76337, "tid": -914061504, "ts": 1716454223431136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223431189, "dur": 0, "args": { "External id": 136343, "cbid": 317, "correlation": 136343 } }, { "ph": "f", "id": 136343, "pid": 76337, "tid": -914061504, "ts": 1716454223431189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223431190, "dur": 0, "args": { "External id": 136344, "cbid": 203, "correlation": 136344 } }, { "ph": "f", "id": 136344, "pid": 76337, "tid": -914061504, "ts": 1716454223431190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223431191, "dur": 0, "args": { "External id": 136345, "cbid": 205, "correlation": 136345 } }, { "ph": "f", "id": 136345, "pid": 76337, "tid": -914061504, "ts": 1716454223431191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223514478, "dur": 110, "args": { "External id": 136349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136349, "pid": 5, "tid": 7, "ts": 1716454223514478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431207, "dur": 11, "args": { "External id": 136349, "cbid": 211, "correlation": 136349 } }, { "ph": "s", "id": 136349, "pid": 76337, "tid": -914061504, "ts": 1716454223431207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223514589, "dur": 34, "args": { "External id": 136351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136351, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136351, "pid": 5, "tid": 7, "ts": 1716454223514589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431221, "dur": 5, "args": { "External id": 136351, "cbid": 211, "correlation": 136351 } }, { "ph": "s", "id": 136351, "pid": 76337, "tid": -914061504, "ts": 1716454223431221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223514624, "dur": 4, "args": { "External id": 136353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136353, "pid": 5, "tid": 7, "ts": 1716454223514624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431231, "dur": 6, "args": { "External id": 136353, "cbid": 211, "correlation": 136353 } }, { "ph": "s", "id": 136353, "pid": 76337, "tid": -914061504, "ts": 1716454223431231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223431240, "dur": 0, "args": { "External id": 136354, "cbid": 51, "correlation": 136354 } }, { "ph": "s", "id": 136354, "pid": 76337, "tid": -914061504, "ts": 1716454223431240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223514629, "dur": 1987, "args": { "External id": 136355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136355, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136355, "pid": 5, "tid": 7, "ts": 1716454223514629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431241, "dur": 6, "args": { "External id": 136355, "cbid": 211, "correlation": 136355 } }, { "ph": "s", "id": 136355, "pid": 76337, "tid": -914061504, "ts": 1716454223431241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223516617, "dur": 59, "args": { "External id": 136360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136360, "pid": 5, "tid": 7, "ts": 1716454223516617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431270, "dur": 8, "args": { "External id": 136360, "cbid": 211, "correlation": 136360 } }, { "ph": "s", "id": 136360, "pid": 76337, "tid": -914061504, "ts": 1716454223431270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223516677, "dur": 3, "args": { "External id": 136368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136368, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136368, "pid": 5, "tid": 7, "ts": 1716454223516677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431314, "dur": 10, "args": { "External id": 136368, "cbid": 211, "correlation": 136368 } }, { "ph": "s", "id": 136368, "pid": 76337, "tid": -914061504, "ts": 1716454223431314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223431379, "dur": 1, "args": { "External id": 136384, "cbid": 251, "correlation": 136384 } }, { "ph": "f", "id": 136384, "pid": 76337, "tid": -914061504, "ts": 1716454223431379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223431384, "dur": 0, "args": { "External id": 136386, "cbid": 251, "correlation": 136386 } }, { "ph": "f", "id": 136386, "pid": 76337, "tid": -914061504, "ts": 1716454223431384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223516682, "dur": 11, "args": { "External id": 136387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136387, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 136387, "pid": 5, "tid": 7, "ts": 1716454223516682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431386, "dur": 12, "args": { "External id": 136387, "cbid": 211, "correlation": 136387 } }, { "ph": "s", "id": 136387, "pid": 76337, "tid": -914061504, "ts": 1716454223431386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223516694, "dur": 5, "args": { "External id": 136389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136389, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 136389, "pid": 5, "tid": 7, "ts": 1716454223516694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431400, "dur": 6, "args": { "External id": 136389, "cbid": 211, "correlation": 136389 } }, { "ph": "s", "id": 136389, "pid": 76337, "tid": -914061504, "ts": 1716454223431400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223516701, "dur": 54, "args": { "External id": 136399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136399, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136399, "pid": 5, "tid": 7, "ts": 1716454223516701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431458, "dur": 12, "args": { "External id": 136399, "cbid": 211, "correlation": 136399 } }, { "ph": "s", "id": 136399, "pid": 76337, "tid": -914061504, "ts": 1716454223431458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223516756, "dur": 49, "args": { "External id": 136419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136419, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 136419, "pid": 5, "tid": 7, "ts": 1716454223516756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431524, "dur": 11, "args": { "External id": 136419, "cbid": 211, "correlation": 136419 } }, { "ph": "s", "id": 136419, "pid": 76337, "tid": -914061504, "ts": 1716454223431524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223516807, "dur": 4, "args": { "External id": 136431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136431, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136431, "pid": 5, "tid": 7, "ts": 1716454223516807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431544, "dur": 7, "args": { "External id": 136431, "cbid": 211, "correlation": 136431 } }, { "ph": "s", "id": 136431, "pid": 76337, "tid": -914061504, "ts": 1716454223431544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223516812, "dur": 55, "args": { "External id": 136434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136434, "pid": 5, "tid": 7, "ts": 1716454223516812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431563, "dur": 6, "args": { "External id": 136434, "cbid": 211, "correlation": 136434 } }, { "ph": "s", "id": 136434, "pid": 76337, "tid": -914061504, "ts": 1716454223431563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223516869, "dur": 36, "args": { "External id": 136443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136443, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136443, "pid": 5, "tid": 7, "ts": 1716454223516869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431604, "dur": 10, "args": { "External id": 136443, "cbid": 211, "correlation": 136443 } }, { "ph": "s", "id": 136443, "pid": 76337, "tid": -914061504, "ts": 1716454223431604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223431666, "dur": 0, "args": { "External id": 136453, "cbid": 317, "correlation": 136453 } }, { "ph": "f", "id": 136453, "pid": 76337, "tid": -914061504, "ts": 1716454223431666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223431667, "dur": 0, "args": { "External id": 136454, "cbid": 203, "correlation": 136454 } }, { "ph": "f", "id": 136454, "pid": 76337, "tid": -914061504, "ts": 1716454223431667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223431668, "dur": 0, "args": { "External id": 136455, "cbid": 205, "correlation": 136455 } }, { "ph": "f", "id": 136455, "pid": 76337, "tid": -914061504, "ts": 1716454223431668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223516906, "dur": 40, "args": { "External id": 136459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136459, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136459, "pid": 5, "tid": 7, "ts": 1716454223516906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431684, "dur": 12, "args": { "External id": 136459, "cbid": 211, "correlation": 136459 } }, { "ph": "s", "id": 136459, "pid": 76337, "tid": -914061504, "ts": 1716454223431684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223516947, "dur": 14, "args": { "External id": 136461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136461, "pid": 5, "tid": 7, "ts": 1716454223516947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431699, "dur": 6, "args": { "External id": 136461, "cbid": 211, "correlation": 136461 } }, { "ph": "s", "id": 136461, "pid": 76337, "tid": -914061504, "ts": 1716454223431699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223516963, "dur": 3, "args": { "External id": 136463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136463, "pid": 5, "tid": 7, "ts": 1716454223516963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431709, "dur": 5, "args": { "External id": 136463, "cbid": 211, "correlation": 136463 } }, { "ph": "s", "id": 136463, "pid": 76337, "tid": -914061504, "ts": 1716454223431709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223431718, "dur": 0, "args": { "External id": 136464, "cbid": 51, "correlation": 136464 } }, { "ph": "s", "id": 136464, "pid": 76337, "tid": -914061504, "ts": 1716454223431718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223516968, "dur": 692, "args": { "External id": 136465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136465, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136465, "pid": 5, "tid": 7, "ts": 1716454223516968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431719, "dur": 5, "args": { "External id": 136465, "cbid": 211, "correlation": 136465 } }, { "ph": "s", "id": 136465, "pid": 76337, "tid": -914061504, "ts": 1716454223431719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223517661, "dur": 59, "args": { "External id": 136470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136470, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136470, "pid": 5, "tid": 7, "ts": 1716454223517661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431746, "dur": 10, "args": { "External id": 136470, "cbid": 211, "correlation": 136470 } }, { "ph": "s", "id": 136470, "pid": 76337, "tid": -914061504, "ts": 1716454223431746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223431805, "dur": 0, "args": { "External id": 136480, "cbid": 317, "correlation": 136480 } }, { "ph": "f", "id": 136480, "pid": 76337, "tid": -914061504, "ts": 1716454223431805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223431806, "dur": 0, "args": { "External id": 136481, "cbid": 203, "correlation": 136481 } }, { "ph": "f", "id": 136481, "pid": 76337, "tid": -914061504, "ts": 1716454223431806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223431806, "dur": 0, "args": { "External id": 136482, "cbid": 205, "correlation": 136482 } }, { "ph": "f", "id": 136482, "pid": 76337, "tid": -914061504, "ts": 1716454223431806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223517721, "dur": 3, "args": { "External id": 136486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136486, "pid": 5, "tid": 7, "ts": 1716454223517721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431821, "dur": 12, "args": { "External id": 136486, "cbid": 211, "correlation": 136486 } }, { "ph": "s", "id": 136486, "pid": 76337, "tid": -914061504, "ts": 1716454223431821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223431837, "dur": 0, "args": { "External id": 136487, "cbid": 51, "correlation": 136487 } }, { "ph": "s", "id": 136487, "pid": 76337, "tid": -914061504, "ts": 1716454223431837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454223517725, "dur": 262, "args": { "External id": 136488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136488, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136488, "pid": 5, "tid": 7, "ts": 1716454223517725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431838, "dur": 7, "args": { "External id": 136488, "cbid": 211, "correlation": 136488 } }, { "ph": "s", "id": 136488, "pid": 76337, "tid": -914061504, "ts": 1716454223431838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223517989, "dur": 58, "args": { "External id": 136493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136493, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136493, "pid": 5, "tid": 7, "ts": 1716454223517989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431866, "dur": 8, "args": { "External id": 136493, "cbid": 211, "correlation": 136493 } }, { "ph": "s", "id": 136493, "pid": 76337, "tid": -914061504, "ts": 1716454223431866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223518049, "dur": 50, "args": { "External id": 136501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136501, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136501, "pid": 5, "tid": 7, "ts": 1716454223518049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431894, "dur": 8, "args": { "External id": 136501, "cbid": 211, "correlation": 136501 } }, { "ph": "s", "id": 136501, "pid": 76337, "tid": -914061504, "ts": 1716454223431894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223518099, "dur": 35, "args": { "External id": 136509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136509, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136509, "pid": 5, "tid": 7, "ts": 1716454223518099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223431923, "dur": 8, "args": { "External id": 136509, "cbid": 211, "correlation": 136509 } }, { "ph": "s", "id": 136509, "pid": 76337, "tid": -914061504, "ts": 1716454223431923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223518136, "dur": 53, "args": { "External id": 136529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136529, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 136529, "pid": 5, "tid": 7, "ts": 1716454223518136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432018, "dur": 13, "args": { "External id": 136529, "cbid": 211, "correlation": 136529 } }, { "ph": "s", "id": 136529, "pid": 76337, "tid": -914061504, "ts": 1716454223432018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223518190, "dur": 4, "args": { "External id": 136541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136541, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 136541, "pid": 5, "tid": 7, "ts": 1716454223518190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432040, "dur": 6, "args": { "External id": 136541, "cbid": 211, "correlation": 136541 } }, { "ph": "s", "id": 136541, "pid": 76337, "tid": -914061504, "ts": 1716454223432040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223518195, "dur": 54, "args": { "External id": 136544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136544, "pid": 5, "tid": 7, "ts": 1716454223518195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432059, "dur": 7, "args": { "External id": 136544, "cbid": 211, "correlation": 136544 } }, { "ph": "s", "id": 136544, "pid": 76337, "tid": -914061504, "ts": 1716454223432059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223432117, "dur": 0, "args": { "External id": 136555, "cbid": 317, "correlation": 136555 } }, { "ph": "f", "id": 136555, "pid": 76337, "tid": -914061504, "ts": 1716454223432117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223432117, "dur": 0, "args": { "External id": 136556, "cbid": 203, "correlation": 136556 } }, { "ph": "f", "id": 136556, "pid": 76337, "tid": -914061504, "ts": 1716454223432117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223432118, "dur": 0, "args": { "External id": 136557, "cbid": 205, "correlation": 136557 } }, { "ph": "f", "id": 136557, "pid": 76337, "tid": -914061504, "ts": 1716454223432118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432146, "dur": 2, "args": { "External id": 136561, "cbid": 251, "correlation": 136561 } }, { "ph": "f", "id": 136561, "pid": 76337, "tid": -914061504, "ts": 1716454223432146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432149, "dur": 1, "args": { "External id": 136562, "cbid": 251, "correlation": 136562 } }, { "ph": "f", "id": 136562, "pid": 76337, "tid": -914061504, "ts": 1716454223432149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432151, "dur": 1, "args": { "External id": 136563, "cbid": 251, "correlation": 136563 } }, { "ph": "f", "id": 136563, "pid": 76337, "tid": -914061504, "ts": 1716454223432151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432153, "dur": 1, "args": { "External id": 136564, "cbid": 251, "correlation": 136564 } }, { "ph": "f", "id": 136564, "pid": 76337, "tid": -914061504, "ts": 1716454223432153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432155, "dur": 1, "args": { "External id": 136565, "cbid": 251, "correlation": 136565 } }, { "ph": "f", "id": 136565, "pid": 76337, "tid": -914061504, "ts": 1716454223432155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432156, "dur": 1, "args": { "External id": 136566, "cbid": 251, "correlation": 136566 } }, { "ph": "f", "id": 136566, "pid": 76337, "tid": -914061504, "ts": 1716454223432156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432158, "dur": 0, "args": { "External id": 136567, "cbid": 251, "correlation": 136567 } }, { "ph": "f", "id": 136567, "pid": 76337, "tid": -914061504, "ts": 1716454223432158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432159, "dur": 1, "args": { "External id": 136568, "cbid": 251, "correlation": 136568 } }, { "ph": "f", "id": 136568, "pid": 76337, "tid": -914061504, "ts": 1716454223432159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432161, "dur": 0, "args": { "External id": 136569, "cbid": 251, "correlation": 136569 } }, { "ph": "f", "id": 136569, "pid": 76337, "tid": -914061504, "ts": 1716454223432161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223518251, "dur": 113, "args": { "External id": 136570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136570, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136570, "pid": 5, "tid": 7, "ts": 1716454223518251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432163, "dur": 12, "args": { "External id": 136570, "cbid": 211, "correlation": 136570 } }, { "ph": "s", "id": 136570, "pid": 76337, "tid": -914061504, "ts": 1716454223432163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223518365, "dur": 59, "args": { "External id": 136576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136576, "pid": 5, "tid": 7, "ts": 1716454223518365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432198, "dur": 10, "args": { "External id": 136576, "cbid": 211, "correlation": 136576 } }, { "ph": "s", "id": 136576, "pid": 76337, "tid": -914061504, "ts": 1716454223432198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223518425, "dur": 609, "args": { "External id": 136585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136585, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136585, "pid": 5, "tid": 7, "ts": 1716454223518425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432281, "dur": 13, "args": { "External id": 136585, "cbid": 211, "correlation": 136585 } }, { "ph": "s", "id": 136585, "pid": 76337, "tid": -914061504, "ts": 1716454223432281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223519035, "dur": 179, "args": { "External id": 136607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136607, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136607, "pid": 5, "tid": 7, "ts": 1716454223519035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432339, "dur": 10, "args": { "External id": 136607, "cbid": 211, "correlation": 136607 } }, { "ph": "s", "id": 136607, "pid": 76337, "tid": -914061504, "ts": 1716454223432339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432428, "dur": 1, "args": { "External id": 136618, "cbid": 251, "correlation": 136618 } }, { "ph": "f", "id": 136618, "pid": 76337, "tid": -914061504, "ts": 1716454223432428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223519215, "dur": 196, "args": { "External id": 136619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136619, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136619, "pid": 5, "tid": 7, "ts": 1716454223519215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432433, "dur": 12, "args": { "External id": 136619, "cbid": 211, "correlation": 136619 } }, { "ph": "s", "id": 136619, "pid": 76337, "tid": -914061504, "ts": 1716454223432433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432502, "dur": 1, "args": { "External id": 136630, "cbid": 251, "correlation": 136630 } }, { "ph": "f", "id": 136630, "pid": 76337, "tid": -914061504, "ts": 1716454223432502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223519412, "dur": 185, "args": { "External id": 136631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136631, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136631, "pid": 5, "tid": 7, "ts": 1716454223519412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432506, "dur": 12, "args": { "External id": 136631, "cbid": 211, "correlation": 136631 } }, { "ph": "s", "id": 136631, "pid": 76337, "tid": -914061504, "ts": 1716454223432506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432569, "dur": 1, "args": { "External id": 136642, "cbid": 251, "correlation": 136642 } }, { "ph": "f", "id": 136642, "pid": 76337, "tid": -914061504, "ts": 1716454223432569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223519599, "dur": 187, "args": { "External id": 136643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136643, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136643, "pid": 5, "tid": 7, "ts": 1716454223519599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432573, "dur": 12, "args": { "External id": 136643, "cbid": 211, "correlation": 136643 } }, { "ph": "s", "id": 136643, "pid": 76337, "tid": -914061504, "ts": 1716454223432573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223519787, "dur": 18362, "args": { "External id": 136664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136664, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136664, "pid": 5, "tid": 7, "ts": 1716454223519787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432655, "dur": 14, "args": { "External id": 136664, "cbid": 211, "correlation": 136664 } }, { "ph": "s", "id": 136664, "pid": 76337, "tid": -914061504, "ts": 1716454223432655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223432753, "dur": 1, "args": { "External id": 136682, "cbid": 251, "correlation": 136682 } }, { "ph": "f", "id": 136682, "pid": 76337, "tid": -914061504, "ts": 1716454223432753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223538150, "dur": 200, "args": { "External id": 136684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136684, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136684, "pid": 5, "tid": 7, "ts": 1716454223538150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432758, "dur": 14, "args": { "External id": 136684, "cbid": 211, "correlation": 136684 } }, { "ph": "s", "id": 136684, "pid": 76337, "tid": -914061504, "ts": 1716454223432758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223538352, "dur": 67, "args": { "External id": 136692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136692, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136692, "pid": 5, "tid": 7, "ts": 1716454223538352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432829, "dur": 12, "args": { "External id": 136692, "cbid": 211, "correlation": 136692 } }, { "ph": "s", "id": 136692, "pid": 76337, "tid": -914061504, "ts": 1716454223432829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223538420, "dur": 97, "args": { "External id": 136700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136700, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136700, "pid": 5, "tid": 7, "ts": 1716454223538420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432868, "dur": 8, "args": { "External id": 136700, "cbid": 211, "correlation": 136700 } }, { "ph": "s", "id": 136700, "pid": 76337, "tid": -914061504, "ts": 1716454223432868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223538518, "dur": 55, "args": { "External id": 136711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136711, "pid": 5, "tid": 7, "ts": 1716454223538518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432939, "dur": 12, "args": { "External id": 136711, "cbid": 211, "correlation": 136711 } }, { "ph": "s", "id": 136711, "pid": 76337, "tid": -914061504, "ts": 1716454223432939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223538574, "dur": 91, "args": { "External id": 136733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136733, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136733, "pid": 5, "tid": 7, "ts": 1716454223538574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223432971, "dur": 18, "args": { "External id": 136733, "cbid": 211, "correlation": 136733 } }, { "ph": "s", "id": 136733, "pid": 76337, "tid": -914061504, "ts": 1716454223432971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433065, "dur": 1, "args": { "External id": 136744, "cbid": 251, "correlation": 136744 } }, { "ph": "f", "id": 136744, "pid": 76337, "tid": -914061504, "ts": 1716454223433065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223538666, "dur": 105, "args": { "External id": 136745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136745, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136745, "pid": 5, "tid": 7, "ts": 1716454223538666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433070, "dur": 13, "args": { "External id": 136745, "cbid": 211, "correlation": 136745 } }, { "ph": "s", "id": 136745, "pid": 76337, "tid": -914061504, "ts": 1716454223433070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433144, "dur": 1, "args": { "External id": 136756, "cbid": 251, "correlation": 136756 } }, { "ph": "f", "id": 136756, "pid": 76337, "tid": -914061504, "ts": 1716454223433144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433148, "dur": 0, "args": { "External id": 136757, "cbid": 251, "correlation": 136757 } }, { "ph": "f", "id": 136757, "pid": 76337, "tid": -914061504, "ts": 1716454223433148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223538772, "dur": 11, "args": { "External id": 136758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136758, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 136758, "pid": 5, "tid": 7, "ts": 1716454223538772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433150, "dur": 13, "args": { "External id": 136758, "cbid": 211, "correlation": 136758 } }, { "ph": "s", "id": 136758, "pid": 76337, "tid": -914061504, "ts": 1716454223433150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223538784, "dur": 5, "args": { "External id": 136760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136760, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 136760, "pid": 5, "tid": 7, "ts": 1716454223538784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433166, "dur": 8, "args": { "External id": 136760, "cbid": 211, "correlation": 136760 } }, { "ph": "s", "id": 136760, "pid": 76337, "tid": -914061504, "ts": 1716454223433166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433228, "dur": 1, "args": { "External id": 136771, "cbid": 251, "correlation": 136771 } }, { "ph": "f", "id": 136771, "pid": 76337, "tid": -914061504, "ts": 1716454223433228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433232, "dur": 0, "args": { "External id": 136772, "cbid": 251, "correlation": 136772 } }, { "ph": "f", "id": 136772, "pid": 76337, "tid": -914061504, "ts": 1716454223433232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223538791, "dur": 6, "args": { "External id": 136773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136773, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 136773, "pid": 5, "tid": 7, "ts": 1716454223538791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433233, "dur": 11, "args": { "External id": 136773, "cbid": 211, "correlation": 136773 } }, { "ph": "s", "id": 136773, "pid": 76337, "tid": -914061504, "ts": 1716454223433233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223538798, "dur": 3, "args": { "External id": 136775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136775, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 136775, "pid": 5, "tid": 7, "ts": 1716454223538798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433246, "dur": 5, "args": { "External id": 136775, "cbid": 211, "correlation": 136775 } }, { "ph": "s", "id": 136775, "pid": 76337, "tid": -914061504, "ts": 1716454223433246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223538803, "dur": 155, "args": { "External id": 136796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136796, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136796, "pid": 5, "tid": 7, "ts": 1716454223538803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433320, "dur": 13, "args": { "External id": 136796, "cbid": 211, "correlation": 136796 } }, { "ph": "s", "id": 136796, "pid": 76337, "tid": -914061504, "ts": 1716454223433320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433414, "dur": 1, "args": { "External id": 136814, "cbid": 251, "correlation": 136814 } }, { "ph": "f", "id": 136814, "pid": 76337, "tid": -914061504, "ts": 1716454223433414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223538959, "dur": 104, "args": { "External id": 136816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136816, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136816, "pid": 5, "tid": 7, "ts": 1716454223538959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433420, "dur": 14, "args": { "External id": 136816, "cbid": 211, "correlation": 136816 } }, { "ph": "s", "id": 136816, "pid": 76337, "tid": -914061504, "ts": 1716454223433420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223539065, "dur": 35, "args": { "External id": 136824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136824, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136824, "pid": 5, "tid": 7, "ts": 1716454223539065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433490, "dur": 12, "args": { "External id": 136824, "cbid": 211, "correlation": 136824 } }, { "ph": "s", "id": 136824, "pid": 76337, "tid": -914061504, "ts": 1716454223433490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223539101, "dur": 68, "args": { "External id": 136832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136832, "pid": 5, "tid": 7, "ts": 1716454223539101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433530, "dur": 9, "args": { "External id": 136832, "cbid": 211, "correlation": 136832 } }, { "ph": "s", "id": 136832, "pid": 76337, "tid": -914061504, "ts": 1716454223433530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223539170, "dur": 90, "args": { "External id": 136854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136854, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136854, "pid": 5, "tid": 7, "ts": 1716454223539170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433582, "dur": 10, "args": { "External id": 136854, "cbid": 211, "correlation": 136854 } }, { "ph": "s", "id": 136854, "pid": 76337, "tid": -914061504, "ts": 1716454223433582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433668, "dur": 1, "args": { "External id": 136870, "cbid": 251, "correlation": 136870 } }, { "ph": "f", "id": 136870, "pid": 76337, "tid": -914061504, "ts": 1716454223433668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223539262, "dur": 567, "args": { "External id": 136872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136872, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 136872, "pid": 5, "tid": 7, "ts": 1716454223539262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433674, "dur": 13, "args": { "External id": 136872, "cbid": 211, "correlation": 136872 } }, { "ph": "s", "id": 136872, "pid": 76337, "tid": -914061504, "ts": 1716454223433674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223539830, "dur": 242, "args": { "External id": 136880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136880, "pid": 5, "tid": 7, "ts": 1716454223539830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433740, "dur": 13, "args": { "External id": 136880, "cbid": 211, "correlation": 136880 } }, { "ph": "s", "id": 136880, "pid": 76337, "tid": -914061504, "ts": 1716454223433740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223540074, "dur": 249, "args": { "External id": 136888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136888, "pid": 5, "tid": 7, "ts": 1716454223540074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433771, "dur": 9, "args": { "External id": 136888, "cbid": 211, "correlation": 136888 } }, { "ph": "s", "id": 136888, "pid": 76337, "tid": -914061504, "ts": 1716454223433771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433853, "dur": 1, "args": { "External id": 136904, "cbid": 251, "correlation": 136904 } }, { "ph": "f", "id": 136904, "pid": 76337, "tid": -914061504, "ts": 1716454223433853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223433858, "dur": 0, "args": { "External id": 136906, "cbid": 251, "correlation": 136906 } }, { "ph": "f", "id": 136906, "pid": 76337, "tid": -914061504, "ts": 1716454223433858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223540324, "dur": 356, "args": { "External id": 136907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136907, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 136907, "pid": 5, "tid": 7, "ts": 1716454223540324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433861, "dur": 13, "args": { "External id": 136907, "cbid": 211, "correlation": 136907 } }, { "ph": "s", "id": 136907, "pid": 76337, "tid": -914061504, "ts": 1716454223433861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223540682, "dur": 50, "args": { "External id": 136915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136915, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136915, "pid": 5, "tid": 7, "ts": 1716454223540682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433903, "dur": 10, "args": { "External id": 136915, "cbid": 211, "correlation": 136915 } }, { "ph": "s", "id": 136915, "pid": 76337, "tid": -914061504, "ts": 1716454223433903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223540733, "dur": 157, "args": { "External id": 136926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136926, "pid": 5, "tid": 7, "ts": 1716454223540733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223433970, "dur": 21, "args": { "External id": 136926, "cbid": 211, "correlation": 136926 } }, { "ph": "s", "id": 136926, "pid": 76337, "tid": -914061504, "ts": 1716454223433970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223434044, "dur": 0, "args": { "External id": 136938, "cbid": 317, "correlation": 136938 } }, { "ph": "f", "id": 136938, "pid": 76337, "tid": -914061504, "ts": 1716454223434044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223434045, "dur": 0, "args": { "External id": 136939, "cbid": 203, "correlation": 136939 } }, { "ph": "f", "id": 136939, "pid": 76337, "tid": -914061504, "ts": 1716454223434045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223434045, "dur": 0, "args": { "External id": 136940, "cbid": 205, "correlation": 136940 } }, { "ph": "f", "id": 136940, "pid": 76337, "tid": -914061504, "ts": 1716454223434045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434069, "dur": 1, "args": { "External id": 136944, "cbid": 251, "correlation": 136944 } }, { "ph": "f", "id": 136944, "pid": 76337, "tid": -914061504, "ts": 1716454223434069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434071, "dur": 0, "args": { "External id": 136945, "cbid": 251, "correlation": 136945 } }, { "ph": "f", "id": 136945, "pid": 76337, "tid": -914061504, "ts": 1716454223434071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434072, "dur": 0, "args": { "External id": 136946, "cbid": 251, "correlation": 136946 } }, { "ph": "f", "id": 136946, "pid": 76337, "tid": -914061504, "ts": 1716454223434072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434073, "dur": 0, "args": { "External id": 136947, "cbid": 251, "correlation": 136947 } }, { "ph": "f", "id": 136947, "pid": 76337, "tid": -914061504, "ts": 1716454223434073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434074, "dur": 0, "args": { "External id": 136948, "cbid": 251, "correlation": 136948 } }, { "ph": "f", "id": 136948, "pid": 76337, "tid": -914061504, "ts": 1716454223434074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434075, "dur": 0, "args": { "External id": 136949, "cbid": 251, "correlation": 136949 } }, { "ph": "f", "id": 136949, "pid": 76337, "tid": -914061504, "ts": 1716454223434075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434076, "dur": 0, "args": { "External id": 136950, "cbid": 251, "correlation": 136950 } }, { "ph": "f", "id": 136950, "pid": 76337, "tid": -914061504, "ts": 1716454223434076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434077, "dur": 0, "args": { "External id": 136951, "cbid": 251, "correlation": 136951 } }, { "ph": "f", "id": 136951, "pid": 76337, "tid": -914061504, "ts": 1716454223434077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434078, "dur": 0, "args": { "External id": 136952, "cbid": 251, "correlation": 136952 } }, { "ph": "f", "id": 136952, "pid": 76337, "tid": -914061504, "ts": 1716454223434078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223540891, "dur": 114, "args": { "External id": 136953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136953, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 136953, "pid": 5, "tid": 7, "ts": 1716454223540891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434081, "dur": 13, "args": { "External id": 136953, "cbid": 211, "correlation": 136953 } }, { "ph": "s", "id": 136953, "pid": 76337, "tid": -914061504, "ts": 1716454223434081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223541006, "dur": 59, "args": { "External id": 136959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136959, "pid": 5, "tid": 7, "ts": 1716454223541006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434116, "dur": 9, "args": { "External id": 136959, "cbid": 211, "correlation": 136959 } }, { "ph": "s", "id": 136959, "pid": 76337, "tid": -914061504, "ts": 1716454223434116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223541067, "dur": 49, "args": { "External id": 136967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136967, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136967, "pid": 5, "tid": 7, "ts": 1716454223541067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434147, "dur": 8, "args": { "External id": 136967, "cbid": 211, "correlation": 136967 } }, { "ph": "s", "id": 136967, "pid": 76337, "tid": -914061504, "ts": 1716454223434147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223541117, "dur": 97, "args": { "External id": 136976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136976, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 136976, "pid": 5, "tid": 7, "ts": 1716454223541117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434187, "dur": 10, "args": { "External id": 136976, "cbid": 211, "correlation": 136976 } }, { "ph": "s", "id": 136976, "pid": 76337, "tid": -914061504, "ts": 1716454223434187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223541216, "dur": 92, "args": { "External id": 136996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 136996, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 136996, "pid": 5, "tid": 7, "ts": 1716454223541216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434258, "dur": 11, "args": { "External id": 136996, "cbid": 211, "correlation": 136996 } }, { "ph": "s", "id": 136996, "pid": 76337, "tid": -914061504, "ts": 1716454223434258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223541309, "dur": 4, "args": { "External id": 137008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137008, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137008, "pid": 5, "tid": 7, "ts": 1716454223541309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434280, "dur": 7, "args": { "External id": 137008, "cbid": 211, "correlation": 137008 } }, { "ph": "s", "id": 137008, "pid": 76337, "tid": -914061504, "ts": 1716454223434280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223541315, "dur": 107, "args": { "External id": 137011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137011, "pid": 5, "tid": 7, "ts": 1716454223541315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434298, "dur": 6, "args": { "External id": 137011, "cbid": 211, "correlation": 137011 } }, { "ph": "s", "id": 137011, "pid": 76337, "tid": -914061504, "ts": 1716454223434298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223541423, "dur": 69, "args": { "External id": 137020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137020, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137020, "pid": 5, "tid": 7, "ts": 1716454223541423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434338, "dur": 10, "args": { "External id": 137020, "cbid": 211, "correlation": 137020 } }, { "ph": "s", "id": 137020, "pid": 76337, "tid": -914061504, "ts": 1716454223434338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223434390, "dur": 0, "args": { "External id": 137030, "cbid": 317, "correlation": 137030 } }, { "ph": "f", "id": 137030, "pid": 76337, "tid": -914061504, "ts": 1716454223434390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223434391, "dur": 0, "args": { "External id": 137031, "cbid": 203, "correlation": 137031 } }, { "ph": "f", "id": 137031, "pid": 76337, "tid": -914061504, "ts": 1716454223434391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223434391, "dur": 0, "args": { "External id": 137032, "cbid": 205, "correlation": 137032 } }, { "ph": "f", "id": 137032, "pid": 76337, "tid": -914061504, "ts": 1716454223434391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223541493, "dur": 75, "args": { "External id": 137036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137036, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137036, "pid": 5, "tid": 7, "ts": 1716454223541493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434407, "dur": 12, "args": { "External id": 137036, "cbid": 211, "correlation": 137036 } }, { "ph": "s", "id": 137036, "pid": 76337, "tid": -914061504, "ts": 1716454223434407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223541569, "dur": 23, "args": { "External id": 137038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137038, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137038, "pid": 5, "tid": 7, "ts": 1716454223541569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434421, "dur": 5, "args": { "External id": 137038, "cbid": 211, "correlation": 137038 } }, { "ph": "s", "id": 137038, "pid": 76337, "tid": -914061504, "ts": 1716454223434421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223541593, "dur": 4, "args": { "External id": 137040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137040, "pid": 5, "tid": 7, "ts": 1716454223541593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434431, "dur": 6, "args": { "External id": 137040, "cbid": 211, "correlation": 137040 } }, { "ph": "s", "id": 137040, "pid": 76337, "tid": -914061504, "ts": 1716454223434431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223434440, "dur": 0, "args": { "External id": 137041, "cbid": 51, "correlation": 137041 } }, { "ph": "s", "id": 137041, "pid": 76337, "tid": -914061504, "ts": 1716454223434440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223541599, "dur": 1349, "args": { "External id": 137042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137042, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137042, "pid": 5, "tid": 7, "ts": 1716454223541599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434441, "dur": 5, "args": { "External id": 137042, "cbid": 211, "correlation": 137042 } }, { "ph": "s", "id": 137042, "pid": 76337, "tid": -914061504, "ts": 1716454223434441, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223542949, "dur": 60, "args": { "External id": 137047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137047, "pid": 5, "tid": 7, "ts": 1716454223542949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434468, "dur": 8, "args": { "External id": 137047, "cbid": 211, "correlation": 137047 } }, { "ph": "s", "id": 137047, "pid": 76337, "tid": -914061504, "ts": 1716454223434468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223543010, "dur": 4, "args": { "External id": 137055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137055, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137055, "pid": 5, "tid": 7, "ts": 1716454223543010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434512, "dur": 10, "args": { "External id": 137055, "cbid": 211, "correlation": 137055 } }, { "ph": "s", "id": 137055, "pid": 76337, "tid": -914061504, "ts": 1716454223434512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434577, "dur": 1, "args": { "External id": 137071, "cbid": 251, "correlation": 137071 } }, { "ph": "f", "id": 137071, "pid": 76337, "tid": -914061504, "ts": 1716454223434577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223434582, "dur": 0, "args": { "External id": 137073, "cbid": 251, "correlation": 137073 } }, { "ph": "f", "id": 137073, "pid": 76337, "tid": -914061504, "ts": 1716454223434582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223543015, "dur": 10, "args": { "External id": 137074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137074, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 137074, "pid": 5, "tid": 7, "ts": 1716454223543015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434585, "dur": 12, "args": { "External id": 137074, "cbid": 211, "correlation": 137074 } }, { "ph": "s", "id": 137074, "pid": 76337, "tid": -914061504, "ts": 1716454223434585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223543026, "dur": 5, "args": { "External id": 137076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137076, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137076, "pid": 5, "tid": 7, "ts": 1716454223543026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434598, "dur": 88, "args": { "External id": 137076, "cbid": 211, "correlation": 137076 } }, { "ph": "s", "id": 137076, "pid": 76337, "tid": -914061504, "ts": 1716454223434598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223543033, "dur": 55, "args": { "External id": 137086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137086, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137086, "pid": 5, "tid": 7, "ts": 1716454223543033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434739, "dur": 13, "args": { "External id": 137086, "cbid": 211, "correlation": 137086 } }, { "ph": "s", "id": 137086, "pid": 76337, "tid": -914061504, "ts": 1716454223434739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223543089, "dur": 51, "args": { "External id": 137106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137106, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 137106, "pid": 5, "tid": 7, "ts": 1716454223543089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434806, "dur": 11, "args": { "External id": 137106, "cbid": 211, "correlation": 137106 } }, { "ph": "s", "id": 137106, "pid": 76337, "tid": -914061504, "ts": 1716454223434806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223543141, "dur": 4, "args": { "External id": 137118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137118, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137118, "pid": 5, "tid": 7, "ts": 1716454223543141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434827, "dur": 6, "args": { "External id": 137118, "cbid": 211, "correlation": 137118 } }, { "ph": "s", "id": 137118, "pid": 76337, "tid": -914061504, "ts": 1716454223434827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223543147, "dur": 56, "args": { "External id": 137121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137121, "pid": 5, "tid": 7, "ts": 1716454223543147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434845, "dur": 7, "args": { "External id": 137121, "cbid": 211, "correlation": 137121 } }, { "ph": "s", "id": 137121, "pid": 76337, "tid": -914061504, "ts": 1716454223434845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223543204, "dur": 37, "args": { "External id": 137130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137130, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137130, "pid": 5, "tid": 7, "ts": 1716454223543204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434886, "dur": 10, "args": { "External id": 137130, "cbid": 211, "correlation": 137130 } }, { "ph": "s", "id": 137130, "pid": 76337, "tid": -914061504, "ts": 1716454223434886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223434950, "dur": 0, "args": { "External id": 137140, "cbid": 317, "correlation": 137140 } }, { "ph": "f", "id": 137140, "pid": 76337, "tid": -914061504, "ts": 1716454223434950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223434951, "dur": 0, "args": { "External id": 137141, "cbid": 203, "correlation": 137141 } }, { "ph": "f", "id": 137141, "pid": 76337, "tid": -914061504, "ts": 1716454223434951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223434952, "dur": 0, "args": { "External id": 137142, "cbid": 205, "correlation": 137142 } }, { "ph": "f", "id": 137142, "pid": 76337, "tid": -914061504, "ts": 1716454223434952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223543242, "dur": 40, "args": { "External id": 137146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137146, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137146, "pid": 5, "tid": 7, "ts": 1716454223543242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434966, "dur": 20, "args": { "External id": 137146, "cbid": 211, "correlation": 137146 } }, { "ph": "s", "id": 137146, "pid": 76337, "tid": -914061504, "ts": 1716454223434966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223543283, "dur": 14, "args": { "External id": 137148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137148, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137148, "pid": 5, "tid": 7, "ts": 1716454223543283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434989, "dur": 6, "args": { "External id": 137148, "cbid": 211, "correlation": 137148 } }, { "ph": "s", "id": 137148, "pid": 76337, "tid": -914061504, "ts": 1716454223434989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223543299, "dur": 3, "args": { "External id": 137150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137150, "pid": 5, "tid": 7, "ts": 1716454223543299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223434999, "dur": 6, "args": { "External id": 137150, "cbid": 211, "correlation": 137150 } }, { "ph": "s", "id": 137150, "pid": 76337, "tid": -914061504, "ts": 1716454223434999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223435007, "dur": 0, "args": { "External id": 137151, "cbid": 51, "correlation": 137151 } }, { "ph": "s", "id": 137151, "pid": 76337, "tid": -914061504, "ts": 1716454223435007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223543303, "dur": 686, "args": { "External id": 137152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137152, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137152, "pid": 5, "tid": 7, "ts": 1716454223543303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435008, "dur": 5, "args": { "External id": 137152, "cbid": 211, "correlation": 137152 } }, { "ph": "s", "id": 137152, "pid": 76337, "tid": -914061504, "ts": 1716454223435008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223543991, "dur": 60, "args": { "External id": 137157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137157, "pid": 5, "tid": 7, "ts": 1716454223543991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435035, "dur": 8, "args": { "External id": 137157, "cbid": 211, "correlation": 137157 } }, { "ph": "s", "id": 137157, "pid": 76337, "tid": -914061504, "ts": 1716454223435035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223435093, "dur": 0, "args": { "External id": 137167, "cbid": 317, "correlation": 137167 } }, { "ph": "f", "id": 137167, "pid": 76337, "tid": -914061504, "ts": 1716454223435093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223435094, "dur": 0, "args": { "External id": 137168, "cbid": 203, "correlation": 137168 } }, { "ph": "f", "id": 137168, "pid": 76337, "tid": -914061504, "ts": 1716454223435094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223435094, "dur": 0, "args": { "External id": 137169, "cbid": 205, "correlation": 137169 } }, { "ph": "f", "id": 137169, "pid": 76337, "tid": -914061504, "ts": 1716454223435094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223544052, "dur": 75, "args": { "External id": 137173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137173, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137173, "pid": 5, "tid": 7, "ts": 1716454223544052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435108, "dur": 12, "args": { "External id": 137173, "cbid": 211, "correlation": 137173 } }, { "ph": "s", "id": 137173, "pid": 76337, "tid": -914061504, "ts": 1716454223435108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223544128, "dur": 207, "args": { "External id": 137175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137175, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137175, "pid": 5, "tid": 7, "ts": 1716454223544128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435128, "dur": 8, "args": { "External id": 137175, "cbid": 211, "correlation": 137175 } }, { "ph": "s", "id": 137175, "pid": 76337, "tid": -914061504, "ts": 1716454223435128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223544337, "dur": 39, "args": { "External id": 137177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137177, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137177, "pid": 5, "tid": 7, "ts": 1716454223544337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435140, "dur": 466, "args": { "External id": 137177, "cbid": 211, "correlation": 137177 } }, { "ph": "s", "id": 137177, "pid": 76337, "tid": -914061504, "ts": 1716454223435140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223544377, "dur": 58, "args": { "External id": 137183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137183, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137183, "pid": 5, "tid": 7, "ts": 1716454223544377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435627, "dur": 9, "args": { "External id": 137183, "cbid": 211, "correlation": 137183 } }, { "ph": "s", "id": 137183, "pid": 76337, "tid": -914061504, "ts": 1716454223435627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223544437, "dur": 50, "args": { "External id": 137191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137191, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137191, "pid": 5, "tid": 7, "ts": 1716454223544437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435657, "dur": 8, "args": { "External id": 137191, "cbid": 211, "correlation": 137191 } }, { "ph": "s", "id": 137191, "pid": 76337, "tid": -914061504, "ts": 1716454223435657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223544488, "dur": 35, "args": { "External id": 137199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137199, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137199, "pid": 5, "tid": 7, "ts": 1716454223544488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435686, "dur": 32, "args": { "External id": 137199, "cbid": 211, "correlation": 137199 } }, { "ph": "s", "id": 137199, "pid": 76337, "tid": -914061504, "ts": 1716454223435686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223544524, "dur": 51, "args": { "External id": 137219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137219, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 137219, "pid": 5, "tid": 7, "ts": 1716454223544524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435794, "dur": 12, "args": { "External id": 137219, "cbid": 211, "correlation": 137219 } }, { "ph": "s", "id": 137219, "pid": 76337, "tid": -914061504, "ts": 1716454223435794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223544576, "dur": 4, "args": { "External id": 137231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137231, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137231, "pid": 5, "tid": 7, "ts": 1716454223544576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435816, "dur": 6, "args": { "External id": 137231, "cbid": 211, "correlation": 137231 } }, { "ph": "s", "id": 137231, "pid": 76337, "tid": -914061504, "ts": 1716454223435816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223544582, "dur": 54, "args": { "External id": 137234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137234, "pid": 5, "tid": 7, "ts": 1716454223544582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435834, "dur": 6, "args": { "External id": 137234, "cbid": 211, "correlation": 137234 } }, { "ph": "s", "id": 137234, "pid": 76337, "tid": -914061504, "ts": 1716454223435834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223435891, "dur": 0, "args": { "External id": 137245, "cbid": 317, "correlation": 137245 } }, { "ph": "f", "id": 137245, "pid": 76337, "tid": -914061504, "ts": 1716454223435891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223435892, "dur": 0, "args": { "External id": 137246, "cbid": 203, "correlation": 137246 } }, { "ph": "f", "id": 137246, "pid": 76337, "tid": -914061504, "ts": 1716454223435892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223435892, "dur": 0, "args": { "External id": 137247, "cbid": 205, "correlation": 137247 } }, { "ph": "f", "id": 137247, "pid": 76337, "tid": -914061504, "ts": 1716454223435892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435914, "dur": 1, "args": { "External id": 137251, "cbid": 251, "correlation": 137251 } }, { "ph": "f", "id": 137251, "pid": 76337, "tid": -914061504, "ts": 1716454223435914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435916, "dur": 0, "args": { "External id": 137252, "cbid": 251, "correlation": 137252 } }, { "ph": "f", "id": 137252, "pid": 76337, "tid": -914061504, "ts": 1716454223435916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435917, "dur": 0, "args": { "External id": 137253, "cbid": 251, "correlation": 137253 } }, { "ph": "f", "id": 137253, "pid": 76337, "tid": -914061504, "ts": 1716454223435917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435918, "dur": 0, "args": { "External id": 137254, "cbid": 251, "correlation": 137254 } }, { "ph": "f", "id": 137254, "pid": 76337, "tid": -914061504, "ts": 1716454223435918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435919, "dur": 0, "args": { "External id": 137255, "cbid": 251, "correlation": 137255 } }, { "ph": "f", "id": 137255, "pid": 76337, "tid": -914061504, "ts": 1716454223435919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435920, "dur": 0, "args": { "External id": 137256, "cbid": 251, "correlation": 137256 } }, { "ph": "f", "id": 137256, "pid": 76337, "tid": -914061504, "ts": 1716454223435920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435920, "dur": 0, "args": { "External id": 137257, "cbid": 251, "correlation": 137257 } }, { "ph": "f", "id": 137257, "pid": 76337, "tid": -914061504, "ts": 1716454223435920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435921, "dur": 0, "args": { "External id": 137258, "cbid": 251, "correlation": 137258 } }, { "ph": "f", "id": 137258, "pid": 76337, "tid": -914061504, "ts": 1716454223435921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223435923, "dur": 0, "args": { "External id": 137259, "cbid": 251, "correlation": 137259 } }, { "ph": "f", "id": 137259, "pid": 76337, "tid": -914061504, "ts": 1716454223435923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223544637, "dur": 112, "args": { "External id": 137260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137260, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137260, "pid": 5, "tid": 7, "ts": 1716454223544637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435925, "dur": 13, "args": { "External id": 137260, "cbid": 211, "correlation": 137260 } }, { "ph": "s", "id": 137260, "pid": 76337, "tid": -914061504, "ts": 1716454223435925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223544750, "dur": 59, "args": { "External id": 137266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137266, "pid": 5, "tid": 7, "ts": 1716454223544750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223435960, "dur": 9, "args": { "External id": 137266, "cbid": 211, "correlation": 137266 } }, { "ph": "s", "id": 137266, "pid": 76337, "tid": -914061504, "ts": 1716454223435960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223544810, "dur": 662, "args": { "External id": 137275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137275, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137275, "pid": 5, "tid": 7, "ts": 1716454223544810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436050, "dur": 14, "args": { "External id": 137275, "cbid": 211, "correlation": 137275 } }, { "ph": "s", "id": 137275, "pid": 76337, "tid": -914061504, "ts": 1716454223436050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223545473, "dur": 179, "args": { "External id": 137297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137297, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137297, "pid": 5, "tid": 7, "ts": 1716454223545473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436107, "dur": 10, "args": { "External id": 137297, "cbid": 211, "correlation": 137297 } }, { "ph": "s", "id": 137297, "pid": 76337, "tid": -914061504, "ts": 1716454223436107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223436194, "dur": 1, "args": { "External id": 137308, "cbid": 251, "correlation": 137308 } }, { "ph": "f", "id": 137308, "pid": 76337, "tid": -914061504, "ts": 1716454223436194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223545654, "dur": 197, "args": { "External id": 137309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137309, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137309, "pid": 5, "tid": 7, "ts": 1716454223545654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436199, "dur": 13, "args": { "External id": 137309, "cbid": 211, "correlation": 137309 } }, { "ph": "s", "id": 137309, "pid": 76337, "tid": -914061504, "ts": 1716454223436199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223436266, "dur": 1, "args": { "External id": 137320, "cbid": 251, "correlation": 137320 } }, { "ph": "f", "id": 137320, "pid": 76337, "tid": -914061504, "ts": 1716454223436266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223545853, "dur": 184, "args": { "External id": 137321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137321, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137321, "pid": 5, "tid": 7, "ts": 1716454223545853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436271, "dur": 12, "args": { "External id": 137321, "cbid": 211, "correlation": 137321 } }, { "ph": "s", "id": 137321, "pid": 76337, "tid": -914061504, "ts": 1716454223436271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223436335, "dur": 1, "args": { "External id": 137332, "cbid": 251, "correlation": 137332 } }, { "ph": "f", "id": 137332, "pid": 76337, "tid": -914061504, "ts": 1716454223436335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223546038, "dur": 184, "args": { "External id": 137333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137333, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137333, "pid": 5, "tid": 7, "ts": 1716454223546038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436339, "dur": 11, "args": { "External id": 137333, "cbid": 211, "correlation": 137333 } }, { "ph": "s", "id": 137333, "pid": 76337, "tid": -914061504, "ts": 1716454223436339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223546224, "dur": 18358, "args": { "External id": 137354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137354, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137354, "pid": 5, "tid": 7, "ts": 1716454223546224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436418, "dur": 12, "args": { "External id": 137354, "cbid": 211, "correlation": 137354 } }, { "ph": "s", "id": 137354, "pid": 76337, "tid": -914061504, "ts": 1716454223436418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223436515, "dur": 1, "args": { "External id": 137372, "cbid": 251, "correlation": 137372 } }, { "ph": "f", "id": 137372, "pid": 76337, "tid": -914061504, "ts": 1716454223436515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223564583, "dur": 198, "args": { "External id": 137374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137374, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137374, "pid": 5, "tid": 7, "ts": 1716454223564583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436521, "dur": 13, "args": { "External id": 137374, "cbid": 211, "correlation": 137374 } }, { "ph": "s", "id": 137374, "pid": 76337, "tid": -914061504, "ts": 1716454223436521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223564783, "dur": 67, "args": { "External id": 137382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137382, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137382, "pid": 5, "tid": 7, "ts": 1716454223564783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436590, "dur": 12, "args": { "External id": 137382, "cbid": 211, "correlation": 137382 } }, { "ph": "s", "id": 137382, "pid": 76337, "tid": -914061504, "ts": 1716454223436590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223564851, "dur": 97, "args": { "External id": 137390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137390, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137390, "pid": 5, "tid": 7, "ts": 1716454223564851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436628, "dur": 8, "args": { "External id": 137390, "cbid": 211, "correlation": 137390 } }, { "ph": "s", "id": 137390, "pid": 76337, "tid": -914061504, "ts": 1716454223436628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223564949, "dur": 53, "args": { "External id": 137401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137401, "pid": 5, "tid": 7, "ts": 1716454223564949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436700, "dur": 51, "args": { "External id": 137401, "cbid": 211, "correlation": 137401 } }, { "ph": "s", "id": 137401, "pid": 76337, "tid": -914061504, "ts": 1716454223436700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223565003, "dur": 91, "args": { "External id": 137423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137423, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137423, "pid": 5, "tid": 7, "ts": 1716454223565003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223436769, "dur": 1917, "args": { "External id": 137423, "cbid": 211, "correlation": 137423 } }, { "ph": "s", "id": 137423, "pid": 76337, "tid": -914061504, "ts": 1716454223436769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223438762, "dur": 1, "args": { "External id": 137434, "cbid": 251, "correlation": 137434 } }, { "ph": "f", "id": 137434, "pid": 76337, "tid": -914061504, "ts": 1716454223438762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223565095, "dur": 103, "args": { "External id": 137435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137435, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137435, "pid": 5, "tid": 7, "ts": 1716454223565095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223438767, "dur": 67, "args": { "External id": 137435, "cbid": 211, "correlation": 137435 } }, { "ph": "s", "id": 137435, "pid": 76337, "tid": -914061504, "ts": 1716454223438767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223438893, "dur": 1, "args": { "External id": 137446, "cbid": 251, "correlation": 137446 } }, { "ph": "f", "id": 137446, "pid": 76337, "tid": -914061504, "ts": 1716454223438893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223438897, "dur": 0, "args": { "External id": 137447, "cbid": 251, "correlation": 137447 } }, { "ph": "f", "id": 137447, "pid": 76337, "tid": -914061504, "ts": 1716454223438897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223565199, "dur": 11, "args": { "External id": 137448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137448, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 137448, "pid": 5, "tid": 7, "ts": 1716454223565199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223438899, "dur": 13, "args": { "External id": 137448, "cbid": 211, "correlation": 137448 } }, { "ph": "s", "id": 137448, "pid": 76337, "tid": -914061504, "ts": 1716454223438899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223565211, "dur": 5, "args": { "External id": 137450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137450, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137450, "pid": 5, "tid": 7, "ts": 1716454223565211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223438914, "dur": 6, "args": { "External id": 137450, "cbid": 211, "correlation": 137450 } }, { "ph": "s", "id": 137450, "pid": 76337, "tid": -914061504, "ts": 1716454223438914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223438983, "dur": 1, "args": { "External id": 137461, "cbid": 251, "correlation": 137461 } }, { "ph": "f", "id": 137461, "pid": 76337, "tid": -914061504, "ts": 1716454223438983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223438988, "dur": 0, "args": { "External id": 137462, "cbid": 251, "correlation": 137462 } }, { "ph": "f", "id": 137462, "pid": 76337, "tid": -914061504, "ts": 1716454223438988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223565217, "dur": 6, "args": { "External id": 137463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137463, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 137463, "pid": 5, "tid": 7, "ts": 1716454223565217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223438989, "dur": 12, "args": { "External id": 137463, "cbid": 211, "correlation": 137463 } }, { "ph": "s", "id": 137463, "pid": 76337, "tid": -914061504, "ts": 1716454223438989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223565225, "dur": 3, "args": { "External id": 137465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137465, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137465, "pid": 5, "tid": 7, "ts": 1716454223565225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439003, "dur": 5, "args": { "External id": 137465, "cbid": 211, "correlation": 137465 } }, { "ph": "s", "id": 137465, "pid": 76337, "tid": -914061504, "ts": 1716454223439003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223565230, "dur": 156, "args": { "External id": 137486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137486, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137486, "pid": 5, "tid": 7, "ts": 1716454223565230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439076, "dur": 13, "args": { "External id": 137486, "cbid": 211, "correlation": 137486 } }, { "ph": "s", "id": 137486, "pid": 76337, "tid": -914061504, "ts": 1716454223439076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223439173, "dur": 1, "args": { "External id": 137504, "cbid": 251, "correlation": 137504 } }, { "ph": "f", "id": 137504, "pid": 76337, "tid": -914061504, "ts": 1716454223439173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223565387, "dur": 106, "args": { "External id": 137506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137506, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137506, "pid": 5, "tid": 7, "ts": 1716454223565387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439179, "dur": 14, "args": { "External id": 137506, "cbid": 211, "correlation": 137506 } }, { "ph": "s", "id": 137506, "pid": 76337, "tid": -914061504, "ts": 1716454223439179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223565495, "dur": 35, "args": { "External id": 137514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137514, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137514, "pid": 5, "tid": 7, "ts": 1716454223565495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439249, "dur": 12, "args": { "External id": 137514, "cbid": 211, "correlation": 137514 } }, { "ph": "s", "id": 137514, "pid": 76337, "tid": -914061504, "ts": 1716454223439249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223565531, "dur": 68, "args": { "External id": 137522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137522, "pid": 5, "tid": 7, "ts": 1716454223565531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439290, "dur": 9, "args": { "External id": 137522, "cbid": 211, "correlation": 137522 } }, { "ph": "s", "id": 137522, "pid": 76337, "tid": -914061504, "ts": 1716454223439290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223565600, "dur": 91, "args": { "External id": 137544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137544, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137544, "pid": 5, "tid": 7, "ts": 1716454223565600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439341, "dur": 10, "args": { "External id": 137544, "cbid": 211, "correlation": 137544 } }, { "ph": "s", "id": 137544, "pid": 76337, "tid": -914061504, "ts": 1716454223439341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223439425, "dur": 1, "args": { "External id": 137560, "cbid": 251, "correlation": 137560 } }, { "ph": "f", "id": 137560, "pid": 76337, "tid": -914061504, "ts": 1716454223439425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223565692, "dur": 566, "args": { "External id": 137562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137562, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137562, "pid": 5, "tid": 7, "ts": 1716454223565692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439431, "dur": 14, "args": { "External id": 137562, "cbid": 211, "correlation": 137562 } }, { "ph": "s", "id": 137562, "pid": 76337, "tid": -914061504, "ts": 1716454223439431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223566260, "dur": 242, "args": { "External id": 137570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137570, "pid": 5, "tid": 7, "ts": 1716454223566260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439498, "dur": 12, "args": { "External id": 137570, "cbid": 211, "correlation": 137570 } }, { "ph": "s", "id": 137570, "pid": 76337, "tid": -914061504, "ts": 1716454223439498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223566503, "dur": 252, "args": { "External id": 137578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137578, "pid": 5, "tid": 7, "ts": 1716454223566503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439528, "dur": 8, "args": { "External id": 137578, "cbid": 211, "correlation": 137578 } }, { "ph": "s", "id": 137578, "pid": 76337, "tid": -914061504, "ts": 1716454223439528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223439609, "dur": 1, "args": { "External id": 137594, "cbid": 251, "correlation": 137594 } }, { "ph": "f", "id": 137594, "pid": 76337, "tid": -914061504, "ts": 1716454223439609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223439614, "dur": 0, "args": { "External id": 137596, "cbid": 251, "correlation": 137596 } }, { "ph": "f", "id": 137596, "pid": 76337, "tid": -914061504, "ts": 1716454223439614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223566756, "dur": 356, "args": { "External id": 137597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137597, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 137597, "pid": 5, "tid": 7, "ts": 1716454223566756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439617, "dur": 13, "args": { "External id": 137597, "cbid": 211, "correlation": 137597 } }, { "ph": "s", "id": 137597, "pid": 76337, "tid": -914061504, "ts": 1716454223439617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223567114, "dur": 50, "args": { "External id": 137605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137605, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137605, "pid": 5, "tid": 7, "ts": 1716454223567114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439659, "dur": 10, "args": { "External id": 137605, "cbid": 211, "correlation": 137605 } }, { "ph": "s", "id": 137605, "pid": 76337, "tid": -914061504, "ts": 1716454223439659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223567165, "dur": 156, "args": { "External id": 137616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137616, "pid": 5, "tid": 7, "ts": 1716454223567165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223439727, "dur": 209, "args": { "External id": 137616, "cbid": 211, "correlation": 137616 } }, { "ph": "s", "id": 137616, "pid": 76337, "tid": -914061504, "ts": 1716454223439727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223439995, "dur": 0, "args": { "External id": 137628, "cbid": 317, "correlation": 137628 } }, { "ph": "f", "id": 137628, "pid": 76337, "tid": -914061504, "ts": 1716454223439995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223439996, "dur": 0, "args": { "External id": 137629, "cbid": 203, "correlation": 137629 } }, { "ph": "f", "id": 137629, "pid": 76337, "tid": -914061504, "ts": 1716454223439996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223439997, "dur": 0, "args": { "External id": 137630, "cbid": 205, "correlation": 137630 } }, { "ph": "f", "id": 137630, "pid": 76337, "tid": -914061504, "ts": 1716454223439997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440019, "dur": 1, "args": { "External id": 137634, "cbid": 251, "correlation": 137634 } }, { "ph": "f", "id": 137634, "pid": 76337, "tid": -914061504, "ts": 1716454223440019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440021, "dur": 0, "args": { "External id": 137635, "cbid": 251, "correlation": 137635 } }, { "ph": "f", "id": 137635, "pid": 76337, "tid": -914061504, "ts": 1716454223440021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440022, "dur": 0, "args": { "External id": 137636, "cbid": 251, "correlation": 137636 } }, { "ph": "f", "id": 137636, "pid": 76337, "tid": -914061504, "ts": 1716454223440022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440023, "dur": 0, "args": { "External id": 137637, "cbid": 251, "correlation": 137637 } }, { "ph": "f", "id": 137637, "pid": 76337, "tid": -914061504, "ts": 1716454223440023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440024, "dur": 0, "args": { "External id": 137638, "cbid": 251, "correlation": 137638 } }, { "ph": "f", "id": 137638, "pid": 76337, "tid": -914061504, "ts": 1716454223440024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440025, "dur": 0, "args": { "External id": 137639, "cbid": 251, "correlation": 137639 } }, { "ph": "f", "id": 137639, "pid": 76337, "tid": -914061504, "ts": 1716454223440025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440026, "dur": 0, "args": { "External id": 137640, "cbid": 251, "correlation": 137640 } }, { "ph": "f", "id": 137640, "pid": 76337, "tid": -914061504, "ts": 1716454223440026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440027, "dur": 0, "args": { "External id": 137641, "cbid": 251, "correlation": 137641 } }, { "ph": "f", "id": 137641, "pid": 76337, "tid": -914061504, "ts": 1716454223440027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223440029, "dur": 0, "args": { "External id": 137642, "cbid": 251, "correlation": 137642 } }, { "ph": "f", "id": 137642, "pid": 76337, "tid": -914061504, "ts": 1716454223440029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223567322, "dur": 113, "args": { "External id": 137643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137643, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137643, "pid": 5, "tid": 7, "ts": 1716454223567322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440031, "dur": 31, "args": { "External id": 137643, "cbid": 211, "correlation": 137643 } }, { "ph": "s", "id": 137643, "pid": 76337, "tid": -914061504, "ts": 1716454223440031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223567437, "dur": 58, "args": { "External id": 137649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137649, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137649, "pid": 5, "tid": 7, "ts": 1716454223567437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440086, "dur": 102, "args": { "External id": 137649, "cbid": 211, "correlation": 137649 } }, { "ph": "s", "id": 137649, "pid": 76337, "tid": -914061504, "ts": 1716454223440086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223567496, "dur": 50, "args": { "External id": 137657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137657, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137657, "pid": 5, "tid": 7, "ts": 1716454223567496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440212, "dur": 277, "args": { "External id": 137657, "cbid": 211, "correlation": 137657 } }, { "ph": "s", "id": 137657, "pid": 76337, "tid": -914061504, "ts": 1716454223440212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223567547, "dur": 97, "args": { "External id": 137666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137666, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137666, "pid": 5, "tid": 7, "ts": 1716454223567547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440518, "dur": 11, "args": { "External id": 137666, "cbid": 211, "correlation": 137666 } }, { "ph": "s", "id": 137666, "pid": 76337, "tid": -914061504, "ts": 1716454223440518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223567645, "dur": 91, "args": { "External id": 137686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137686, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 137686, "pid": 5, "tid": 7, "ts": 1716454223567645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440591, "dur": 11, "args": { "External id": 137686, "cbid": 211, "correlation": 137686 } }, { "ph": "s", "id": 137686, "pid": 76337, "tid": -914061504, "ts": 1716454223440591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223567738, "dur": 4, "args": { "External id": 137698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137698, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137698, "pid": 5, "tid": 7, "ts": 1716454223567738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440612, "dur": 12, "args": { "External id": 137698, "cbid": 211, "correlation": 137698 } }, { "ph": "s", "id": 137698, "pid": 76337, "tid": -914061504, "ts": 1716454223440612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223567743, "dur": 106, "args": { "External id": 137701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137701, "pid": 5, "tid": 7, "ts": 1716454223567743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440636, "dur": 108, "args": { "External id": 137701, "cbid": 211, "correlation": 137701 } }, { "ph": "s", "id": 137701, "pid": 76337, "tid": -914061504, "ts": 1716454223440636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223567851, "dur": 69, "args": { "External id": 137710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137710, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137710, "pid": 5, "tid": 7, "ts": 1716454223567851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440778, "dur": 10, "args": { "External id": 137710, "cbid": 211, "correlation": 137710 } }, { "ph": "s", "id": 137710, "pid": 76337, "tid": -914061504, "ts": 1716454223440778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223440830, "dur": 0, "args": { "External id": 137720, "cbid": 317, "correlation": 137720 } }, { "ph": "f", "id": 137720, "pid": 76337, "tid": -914061504, "ts": 1716454223440830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223440831, "dur": 0, "args": { "External id": 137721, "cbid": 203, "correlation": 137721 } }, { "ph": "f", "id": 137721, "pid": 76337, "tid": -914061504, "ts": 1716454223440831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223440832, "dur": 0, "args": { "External id": 137722, "cbid": 205, "correlation": 137722 } }, { "ph": "f", "id": 137722, "pid": 76337, "tid": -914061504, "ts": 1716454223440832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223567921, "dur": 77, "args": { "External id": 137726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137726, "pid": 5, "tid": 7, "ts": 1716454223567921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440846, "dur": 13, "args": { "External id": 137726, "cbid": 211, "correlation": 137726 } }, { "ph": "s", "id": 137726, "pid": 76337, "tid": -914061504, "ts": 1716454223440846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223567999, "dur": 24, "args": { "External id": 137728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137728, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137728, "pid": 5, "tid": 7, "ts": 1716454223567999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440861, "dur": 5, "args": { "External id": 137728, "cbid": 211, "correlation": 137728 } }, { "ph": "s", "id": 137728, "pid": 76337, "tid": -914061504, "ts": 1716454223440861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223568025, "dur": 3, "args": { "External id": 137730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137730, "pid": 5, "tid": 7, "ts": 1716454223568025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440871, "dur": 5, "args": { "External id": 137730, "cbid": 211, "correlation": 137730 } }, { "ph": "s", "id": 137730, "pid": 76337, "tid": -914061504, "ts": 1716454223440871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223440879, "dur": 0, "args": { "External id": 137731, "cbid": 51, "correlation": 137731 } }, { "ph": "s", "id": 137731, "pid": 76337, "tid": -914061504, "ts": 1716454223440879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223568029, "dur": 1346, "args": { "External id": 137732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137732, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137732, "pid": 5, "tid": 7, "ts": 1716454223568029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440880, "dur": 5, "args": { "External id": 137732, "cbid": 211, "correlation": 137732 } }, { "ph": "s", "id": 137732, "pid": 76337, "tid": -914061504, "ts": 1716454223440880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223569377, "dur": 59, "args": { "External id": 137737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137737, "pid": 5, "tid": 7, "ts": 1716454223569377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440907, "dur": 8, "args": { "External id": 137737, "cbid": 211, "correlation": 137737 } }, { "ph": "s", "id": 137737, "pid": 76337, "tid": -914061504, "ts": 1716454223440907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223569438, "dur": 4, "args": { "External id": 137745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137745, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137745, "pid": 5, "tid": 7, "ts": 1716454223569438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223440951, "dur": 10, "args": { "External id": 137745, "cbid": 211, "correlation": 137745 } }, { "ph": "s", "id": 137745, "pid": 76337, "tid": -914061504, "ts": 1716454223440951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223441023, "dur": 1, "args": { "External id": 137761, "cbid": 251, "correlation": 137761 } }, { "ph": "f", "id": 137761, "pid": 76337, "tid": -914061504, "ts": 1716454223441023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223441028, "dur": 0, "args": { "External id": 137763, "cbid": 251, "correlation": 137763 } }, { "ph": "f", "id": 137763, "pid": 76337, "tid": -914061504, "ts": 1716454223441028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223569443, "dur": 11, "args": { "External id": 137764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137764, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 137764, "pid": 5, "tid": 7, "ts": 1716454223569443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441030, "dur": 12, "args": { "External id": 137764, "cbid": 211, "correlation": 137764 } }, { "ph": "s", "id": 137764, "pid": 76337, "tid": -914061504, "ts": 1716454223441030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223569455, "dur": 5, "args": { "External id": 137766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137766, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 137766, "pid": 5, "tid": 7, "ts": 1716454223569455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441044, "dur": 6, "args": { "External id": 137766, "cbid": 211, "correlation": 137766 } }, { "ph": "s", "id": 137766, "pid": 76337, "tid": -914061504, "ts": 1716454223441044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223569461, "dur": 55, "args": { "External id": 137776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137776, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137776, "pid": 5, "tid": 7, "ts": 1716454223569461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441102, "dur": 547, "args": { "External id": 137776, "cbid": 211, "correlation": 137776 } }, { "ph": "s", "id": 137776, "pid": 76337, "tid": -914061504, "ts": 1716454223441102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223569518, "dur": 51, "args": { "External id": 137796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137796, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 137796, "pid": 5, "tid": 7, "ts": 1716454223569518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441704, "dur": 11, "args": { "External id": 137796, "cbid": 211, "correlation": 137796 } }, { "ph": "s", "id": 137796, "pid": 76337, "tid": -914061504, "ts": 1716454223441704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223569570, "dur": 4, "args": { "External id": 137808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137808, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137808, "pid": 5, "tid": 7, "ts": 1716454223569570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441724, "dur": 7, "args": { "External id": 137808, "cbid": 211, "correlation": 137808 } }, { "ph": "s", "id": 137808, "pid": 76337, "tid": -914061504, "ts": 1716454223441724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223569575, "dur": 55, "args": { "External id": 137811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137811, "pid": 5, "tid": 7, "ts": 1716454223569575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441743, "dur": 7, "args": { "External id": 137811, "cbid": 211, "correlation": 137811 } }, { "ph": "s", "id": 137811, "pid": 76337, "tid": -914061504, "ts": 1716454223441743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223569632, "dur": 37, "args": { "External id": 137820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137820, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137820, "pid": 5, "tid": 7, "ts": 1716454223569632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441784, "dur": 10, "args": { "External id": 137820, "cbid": 211, "correlation": 137820 } }, { "ph": "s", "id": 137820, "pid": 76337, "tid": -914061504, "ts": 1716454223441784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223441846, "dur": 0, "args": { "External id": 137830, "cbid": 317, "correlation": 137830 } }, { "ph": "f", "id": 137830, "pid": 76337, "tid": -914061504, "ts": 1716454223441846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223441847, "dur": 0, "args": { "External id": 137831, "cbid": 203, "correlation": 137831 } }, { "ph": "f", "id": 137831, "pid": 76337, "tid": -914061504, "ts": 1716454223441847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223441848, "dur": 0, "args": { "External id": 137832, "cbid": 205, "correlation": 137832 } }, { "ph": "f", "id": 137832, "pid": 76337, "tid": -914061504, "ts": 1716454223441848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223569671, "dur": 40, "args": { "External id": 137836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137836, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137836, "pid": 5, "tid": 7, "ts": 1716454223569671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441862, "dur": 12, "args": { "External id": 137836, "cbid": 211, "correlation": 137836 } }, { "ph": "s", "id": 137836, "pid": 76337, "tid": -914061504, "ts": 1716454223441862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223569712, "dur": 14, "args": { "External id": 137838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137838, "pid": 5, "tid": 7, "ts": 1716454223569712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441876, "dur": 5, "args": { "External id": 137838, "cbid": 211, "correlation": 137838 } }, { "ph": "s", "id": 137838, "pid": 76337, "tid": -914061504, "ts": 1716454223441876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223569727, "dur": 3, "args": { "External id": 137840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137840, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137840, "pid": 5, "tid": 7, "ts": 1716454223569727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441886, "dur": 6, "args": { "External id": 137840, "cbid": 211, "correlation": 137840 } }, { "ph": "s", "id": 137840, "pid": 76337, "tid": -914061504, "ts": 1716454223441886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223441894, "dur": 0, "args": { "External id": 137841, "cbid": 51, "correlation": 137841 } }, { "ph": "s", "id": 137841, "pid": 76337, "tid": -914061504, "ts": 1716454223441894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223569732, "dur": 703, "args": { "External id": 137842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137842, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137842, "pid": 5, "tid": 7, "ts": 1716454223569732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441895, "dur": 5, "args": { "External id": 137842, "cbid": 211, "correlation": 137842 } }, { "ph": "s", "id": 137842, "pid": 76337, "tid": -914061504, "ts": 1716454223441895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223570436, "dur": 60, "args": { "External id": 137847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137847, "pid": 5, "tid": 7, "ts": 1716454223570436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223441922, "dur": 9, "args": { "External id": 137847, "cbid": 211, "correlation": 137847 } }, { "ph": "s", "id": 137847, "pid": 76337, "tid": -914061504, "ts": 1716454223441922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223441987, "dur": 0, "args": { "External id": 137857, "cbid": 317, "correlation": 137857 } }, { "ph": "f", "id": 137857, "pid": 76337, "tid": -914061504, "ts": 1716454223441987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223441988, "dur": 0, "args": { "External id": 137858, "cbid": 203, "correlation": 137858 } }, { "ph": "f", "id": 137858, "pid": 76337, "tid": -914061504, "ts": 1716454223441988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223441988, "dur": 0, "args": { "External id": 137859, "cbid": 205, "correlation": 137859 } }, { "ph": "f", "id": 137859, "pid": 76337, "tid": -914061504, "ts": 1716454223441988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223570497, "dur": 75, "args": { "External id": 137863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137863, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137863, "pid": 5, "tid": 7, "ts": 1716454223570497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442000, "dur": 12, "args": { "External id": 137863, "cbid": 211, "correlation": 137863 } }, { "ph": "s", "id": 137863, "pid": 76337, "tid": -914061504, "ts": 1716454223442000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223570574, "dur": 207, "args": { "External id": 137865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137865, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137865, "pid": 5, "tid": 7, "ts": 1716454223570574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442018, "dur": 6, "args": { "External id": 137865, "cbid": 211, "correlation": 137865 } }, { "ph": "s", "id": 137865, "pid": 76337, "tid": -914061504, "ts": 1716454223442018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223570782, "dur": 39, "args": { "External id": 137867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137867, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137867, "pid": 5, "tid": 7, "ts": 1716454223570782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442029, "dur": 6, "args": { "External id": 137867, "cbid": 211, "correlation": 137867 } }, { "ph": "s", "id": 137867, "pid": 76337, "tid": -914061504, "ts": 1716454223442029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223570822, "dur": 60, "args": { "External id": 137873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137873, "pid": 5, "tid": 7, "ts": 1716454223570822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442056, "dur": 509, "args": { "External id": 137873, "cbid": 211, "correlation": 137873 } }, { "ph": "s", "id": 137873, "pid": 76337, "tid": -914061504, "ts": 1716454223442056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223570883, "dur": 50, "args": { "External id": 137881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137881, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137881, "pid": 5, "tid": 7, "ts": 1716454223570883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442586, "dur": 8, "args": { "External id": 137881, "cbid": 211, "correlation": 137881 } }, { "ph": "s", "id": 137881, "pid": 76337, "tid": -914061504, "ts": 1716454223442586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223570934, "dur": 35, "args": { "External id": 137889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137889, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137889, "pid": 5, "tid": 7, "ts": 1716454223570934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442616, "dur": 9, "args": { "External id": 137889, "cbid": 211, "correlation": 137889 } }, { "ph": "s", "id": 137889, "pid": 76337, "tid": -914061504, "ts": 1716454223442616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223570971, "dur": 53, "args": { "External id": 137909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137909, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 137909, "pid": 5, "tid": 7, "ts": 1716454223570971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442700, "dur": 13, "args": { "External id": 137909, "cbid": 211, "correlation": 137909 } }, { "ph": "s", "id": 137909, "pid": 76337, "tid": -914061504, "ts": 1716454223442700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223571025, "dur": 4, "args": { "External id": 137921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137921, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 137921, "pid": 5, "tid": 7, "ts": 1716454223571025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442723, "dur": 6, "args": { "External id": 137921, "cbid": 211, "correlation": 137921 } }, { "ph": "s", "id": 137921, "pid": 76337, "tid": -914061504, "ts": 1716454223442723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223571030, "dur": 55, "args": { "External id": 137924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137924, "pid": 5, "tid": 7, "ts": 1716454223571030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442740, "dur": 7, "args": { "External id": 137924, "cbid": 211, "correlation": 137924 } }, { "ph": "s", "id": 137924, "pid": 76337, "tid": -914061504, "ts": 1716454223442740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223442797, "dur": 0, "args": { "External id": 137935, "cbid": 317, "correlation": 137935 } }, { "ph": "f", "id": 137935, "pid": 76337, "tid": -914061504, "ts": 1716454223442797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223442798, "dur": 0, "args": { "External id": 137936, "cbid": 203, "correlation": 137936 } }, { "ph": "f", "id": 137936, "pid": 76337, "tid": -914061504, "ts": 1716454223442798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223442799, "dur": 0, "args": { "External id": 137937, "cbid": 205, "correlation": 137937 } }, { "ph": "f", "id": 137937, "pid": 76337, "tid": -914061504, "ts": 1716454223442799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442822, "dur": 1, "args": { "External id": 137941, "cbid": 251, "correlation": 137941 } }, { "ph": "f", "id": 137941, "pid": 76337, "tid": -914061504, "ts": 1716454223442822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442824, "dur": 0, "args": { "External id": 137942, "cbid": 251, "correlation": 137942 } }, { "ph": "f", "id": 137942, "pid": 76337, "tid": -914061504, "ts": 1716454223442824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442824, "dur": 0, "args": { "External id": 137943, "cbid": 251, "correlation": 137943 } }, { "ph": "f", "id": 137943, "pid": 76337, "tid": -914061504, "ts": 1716454223442824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442825, "dur": 0, "args": { "External id": 137944, "cbid": 251, "correlation": 137944 } }, { "ph": "f", "id": 137944, "pid": 76337, "tid": -914061504, "ts": 1716454223442825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442826, "dur": 0, "args": { "External id": 137945, "cbid": 251, "correlation": 137945 } }, { "ph": "f", "id": 137945, "pid": 76337, "tid": -914061504, "ts": 1716454223442826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442827, "dur": 0, "args": { "External id": 137946, "cbid": 251, "correlation": 137946 } }, { "ph": "f", "id": 137946, "pid": 76337, "tid": -914061504, "ts": 1716454223442827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442827, "dur": 0, "args": { "External id": 137947, "cbid": 251, "correlation": 137947 } }, { "ph": "f", "id": 137947, "pid": 76337, "tid": -914061504, "ts": 1716454223442827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442828, "dur": 0, "args": { "External id": 137948, "cbid": 251, "correlation": 137948 } }, { "ph": "f", "id": 137948, "pid": 76337, "tid": -914061504, "ts": 1716454223442828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223442829, "dur": 0, "args": { "External id": 137949, "cbid": 251, "correlation": 137949 } }, { "ph": "f", "id": 137949, "pid": 76337, "tid": -914061504, "ts": 1716454223442829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223571087, "dur": 116, "args": { "External id": 137950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137950, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 137950, "pid": 5, "tid": 7, "ts": 1716454223571087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442831, "dur": 12, "args": { "External id": 137950, "cbid": 211, "correlation": 137950 } }, { "ph": "s", "id": 137950, "pid": 76337, "tid": -914061504, "ts": 1716454223442831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223571204, "dur": 59, "args": { "External id": 137956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137956, "pid": 5, "tid": 7, "ts": 1716454223571204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442866, "dur": 9, "args": { "External id": 137956, "cbid": 211, "correlation": 137956 } }, { "ph": "s", "id": 137956, "pid": 76337, "tid": -914061504, "ts": 1716454223442866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223571265, "dur": 598, "args": { "External id": 137965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137965, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137965, "pid": 5, "tid": 7, "ts": 1716454223571265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223442949, "dur": 14, "args": { "External id": 137965, "cbid": 211, "correlation": 137965 } }, { "ph": "s", "id": 137965, "pid": 76337, "tid": -914061504, "ts": 1716454223442949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223571865, "dur": 184, "args": { "External id": 137987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137987, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 137987, "pid": 5, "tid": 7, "ts": 1716454223571865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443017, "dur": 12, "args": { "External id": 137987, "cbid": 211, "correlation": 137987 } }, { "ph": "s", "id": 137987, "pid": 76337, "tid": -914061504, "ts": 1716454223443017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223443103, "dur": 1, "args": { "External id": 137998, "cbid": 251, "correlation": 137998 } }, { "ph": "f", "id": 137998, "pid": 76337, "tid": -914061504, "ts": 1716454223443103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223572051, "dur": 198, "args": { "External id": 137999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 137999, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 137999, "pid": 5, "tid": 7, "ts": 1716454223572051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443108, "dur": 14, "args": { "External id": 137999, "cbid": 211, "correlation": 137999 } }, { "ph": "s", "id": 137999, "pid": 76337, "tid": -914061504, "ts": 1716454223443108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223443177, "dur": 1, "args": { "External id": 138010, "cbid": 251, "correlation": 138010 } }, { "ph": "f", "id": 138010, "pid": 76337, "tid": -914061504, "ts": 1716454223443177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223572250, "dur": 190, "args": { "External id": 138011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138011, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 138011, "pid": 5, "tid": 7, "ts": 1716454223572250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443181, "dur": 11, "args": { "External id": 138011, "cbid": 211, "correlation": 138011 } }, { "ph": "s", "id": 138011, "pid": 76337, "tid": -914061504, "ts": 1716454223443181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223443244, "dur": 1, "args": { "External id": 138022, "cbid": 251, "correlation": 138022 } }, { "ph": "f", "id": 138022, "pid": 76337, "tid": -914061504, "ts": 1716454223443244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223572442, "dur": 187, "args": { "External id": 138023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138023, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 138023, "pid": 5, "tid": 7, "ts": 1716454223572442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443247, "dur": 11, "args": { "External id": 138023, "cbid": 211, "correlation": 138023 } }, { "ph": "s", "id": 138023, "pid": 76337, "tid": -914061504, "ts": 1716454223443247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223572630, "dur": 18954, "args": { "External id": 138044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138044, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 138044, "pid": 5, "tid": 7, "ts": 1716454223572630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443328, "dur": 12, "args": { "External id": 138044, "cbid": 211, "correlation": 138044 } }, { "ph": "s", "id": 138044, "pid": 76337, "tid": -914061504, "ts": 1716454223443328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223443423, "dur": 1, "args": { "External id": 138062, "cbid": 251, "correlation": 138062 } }, { "ph": "f", "id": 138062, "pid": 76337, "tid": -914061504, "ts": 1716454223443423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223591586, "dur": 205, "args": { "External id": 138064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138064, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 138064, "pid": 5, "tid": 7, "ts": 1716454223591586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443429, "dur": 13, "args": { "External id": 138064, "cbid": 211, "correlation": 138064 } }, { "ph": "s", "id": 138064, "pid": 76337, "tid": -914061504, "ts": 1716454223443429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223591792, "dur": 66, "args": { "External id": 138072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138072, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138072, "pid": 5, "tid": 7, "ts": 1716454223591792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443498, "dur": 12, "args": { "External id": 138072, "cbid": 211, "correlation": 138072 } }, { "ph": "s", "id": 138072, "pid": 76337, "tid": -914061504, "ts": 1716454223443498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223591860, "dur": 97, "args": { "External id": 138080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138080, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138080, "pid": 5, "tid": 7, "ts": 1716454223591860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443537, "dur": 78, "args": { "External id": 138080, "cbid": 211, "correlation": 138080 } }, { "ph": "s", "id": 138080, "pid": 76337, "tid": -914061504, "ts": 1716454223443537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223591958, "dur": 55, "args": { "External id": 138091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138091, "pid": 5, "tid": 7, "ts": 1716454223591958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223443679, "dur": 1877, "args": { "External id": 138091, "cbid": 211, "correlation": 138091 } }, { "ph": "s", "id": 138091, "pid": 76337, "tid": -914061504, "ts": 1716454223443679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223592014, "dur": 94, "args": { "External id": 138113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138113, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138113, "pid": 5, "tid": 7, "ts": 1716454223592014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445576, "dur": 126, "args": { "External id": 138113, "cbid": 211, "correlation": 138113 } }, { "ph": "s", "id": 138113, "pid": 76337, "tid": -914061504, "ts": 1716454223445576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223445778, "dur": 1, "args": { "External id": 138124, "cbid": 251, "correlation": 138124 } }, { "ph": "f", "id": 138124, "pid": 76337, "tid": -914061504, "ts": 1716454223445778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223592109, "dur": 109, "args": { "External id": 138125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138125, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 138125, "pid": 5, "tid": 7, "ts": 1716454223592109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445783, "dur": 14, "args": { "External id": 138125, "cbid": 211, "correlation": 138125 } }, { "ph": "s", "id": 138125, "pid": 76337, "tid": -914061504, "ts": 1716454223445783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223445856, "dur": 1, "args": { "External id": 138136, "cbid": 251, "correlation": 138136 } }, { "ph": "f", "id": 138136, "pid": 76337, "tid": -914061504, "ts": 1716454223445856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223445859, "dur": 0, "args": { "External id": 138137, "cbid": 251, "correlation": 138137 } }, { "ph": "f", "id": 138137, "pid": 76337, "tid": -914061504, "ts": 1716454223445859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223592219, "dur": 11, "args": { "External id": 138138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138138, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138138, "pid": 5, "tid": 7, "ts": 1716454223592219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445861, "dur": 12, "args": { "External id": 138138, "cbid": 211, "correlation": 138138 } }, { "ph": "s", "id": 138138, "pid": 76337, "tid": -914061504, "ts": 1716454223445861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223592232, "dur": 5, "args": { "External id": 138140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138140, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 138140, "pid": 5, "tid": 7, "ts": 1716454223592232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445875, "dur": 6, "args": { "External id": 138140, "cbid": 211, "correlation": 138140 } }, { "ph": "s", "id": 138140, "pid": 76337, "tid": -914061504, "ts": 1716454223445875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223445936, "dur": 1, "args": { "External id": 138151, "cbid": 251, "correlation": 138151 } }, { "ph": "f", "id": 138151, "pid": 76337, "tid": -914061504, "ts": 1716454223445936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223445939, "dur": 0, "args": { "External id": 138152, "cbid": 251, "correlation": 138152 } }, { "ph": "f", "id": 138152, "pid": 76337, "tid": -914061504, "ts": 1716454223445939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223592238, "dur": 6, "args": { "External id": 138153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138153, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138153, "pid": 5, "tid": 7, "ts": 1716454223592238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445941, "dur": 12, "args": { "External id": 138153, "cbid": 211, "correlation": 138153 } }, { "ph": "s", "id": 138153, "pid": 76337, "tid": -914061504, "ts": 1716454223445941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223592246, "dur": 4, "args": { "External id": 138155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138155, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 138155, "pid": 5, "tid": 7, "ts": 1716454223592246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223445955, "dur": 6, "args": { "External id": 138155, "cbid": 211, "correlation": 138155 } }, { "ph": "s", "id": 138155, "pid": 76337, "tid": -914061504, "ts": 1716454223445955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223592251, "dur": 159, "args": { "External id": 138176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138176, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 138176, "pid": 5, "tid": 7, "ts": 1716454223592251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446036, "dur": 13, "args": { "External id": 138176, "cbid": 211, "correlation": 138176 } }, { "ph": "s", "id": 138176, "pid": 76337, "tid": -914061504, "ts": 1716454223446036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223446131, "dur": 1, "args": { "External id": 138194, "cbid": 251, "correlation": 138194 } }, { "ph": "f", "id": 138194, "pid": 76337, "tid": -914061504, "ts": 1716454223446131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223592411, "dur": 108, "args": { "External id": 138196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138196, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 138196, "pid": 5, "tid": 7, "ts": 1716454223592411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446137, "dur": 14, "args": { "External id": 138196, "cbid": 211, "correlation": 138196 } }, { "ph": "s", "id": 138196, "pid": 76337, "tid": -914061504, "ts": 1716454223446137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223592520, "dur": 34, "args": { "External id": 138204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138204, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138204, "pid": 5, "tid": 7, "ts": 1716454223592520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446207, "dur": 12, "args": { "External id": 138204, "cbid": 211, "correlation": 138204 } }, { "ph": "s", "id": 138204, "pid": 76337, "tid": -914061504, "ts": 1716454223446207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223592556, "dur": 68, "args": { "External id": 138212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138212, "pid": 5, "tid": 7, "ts": 1716454223592556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446248, "dur": 9, "args": { "External id": 138212, "cbid": 211, "correlation": 138212 } }, { "ph": "s", "id": 138212, "pid": 76337, "tid": -914061504, "ts": 1716454223446248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223592626, "dur": 94, "args": { "External id": 138234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138234, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138234, "pid": 5, "tid": 7, "ts": 1716454223592626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446299, "dur": 10, "args": { "External id": 138234, "cbid": 211, "correlation": 138234 } }, { "ph": "s", "id": 138234, "pid": 76337, "tid": -914061504, "ts": 1716454223446299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223446384, "dur": 1, "args": { "External id": 138250, "cbid": 251, "correlation": 138250 } }, { "ph": "f", "id": 138250, "pid": 76337, "tid": -914061504, "ts": 1716454223446384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223592721, "dur": 584, "args": { "External id": 138252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138252, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 138252, "pid": 5, "tid": 7, "ts": 1716454223592721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446390, "dur": 13, "args": { "External id": 138252, "cbid": 211, "correlation": 138252 } }, { "ph": "s", "id": 138252, "pid": 76337, "tid": -914061504, "ts": 1716454223446390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223593306, "dur": 247, "args": { "External id": 138260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138260, "pid": 5, "tid": 7, "ts": 1716454223593306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446456, "dur": 12, "args": { "External id": 138260, "cbid": 211, "correlation": 138260 } }, { "ph": "s", "id": 138260, "pid": 76337, "tid": -914061504, "ts": 1716454223446456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223593555, "dur": 253, "args": { "External id": 138268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138268, "pid": 5, "tid": 7, "ts": 1716454223593555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446485, "dur": 8, "args": { "External id": 138268, "cbid": 211, "correlation": 138268 } }, { "ph": "s", "id": 138268, "pid": 76337, "tid": -914061504, "ts": 1716454223446485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223446566, "dur": 1, "args": { "External id": 138284, "cbid": 251, "correlation": 138284 } }, { "ph": "f", "id": 138284, "pid": 76337, "tid": -914061504, "ts": 1716454223446566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223446571, "dur": 0, "args": { "External id": 138286, "cbid": 251, "correlation": 138286 } }, { "ph": "f", "id": 138286, "pid": 76337, "tid": -914061504, "ts": 1716454223446571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223593810, "dur": 362, "args": { "External id": 138287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138287, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138287, "pid": 5, "tid": 7, "ts": 1716454223593810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446574, "dur": 12, "args": { "External id": 138287, "cbid": 211, "correlation": 138287 } }, { "ph": "s", "id": 138287, "pid": 76337, "tid": -914061504, "ts": 1716454223446574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223594174, "dur": 50, "args": { "External id": 138295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138295, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138295, "pid": 5, "tid": 7, "ts": 1716454223594174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446616, "dur": 189, "args": { "External id": 138295, "cbid": 211, "correlation": 138295 } }, { "ph": "s", "id": 138295, "pid": 76337, "tid": -914061504, "ts": 1716454223446616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223594225, "dur": 161, "args": { "External id": 138306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138306, "pid": 5, "tid": 7, "ts": 1716454223594225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223446864, "dur": 69, "args": { "External id": 138306, "cbid": 211, "correlation": 138306 } }, { "ph": "s", "id": 138306, "pid": 76337, "tid": -914061504, "ts": 1716454223446864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223446996, "dur": 0, "args": { "External id": 138318, "cbid": 317, "correlation": 138318 } }, { "ph": "f", "id": 138318, "pid": 76337, "tid": -914061504, "ts": 1716454223446996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223446996, "dur": 0, "args": { "External id": 138319, "cbid": 203, "correlation": 138319 } }, { "ph": "f", "id": 138319, "pid": 76337, "tid": -914061504, "ts": 1716454223446996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223446997, "dur": 0, "args": { "External id": 138320, "cbid": 205, "correlation": 138320 } }, { "ph": "f", "id": 138320, "pid": 76337, "tid": -914061504, "ts": 1716454223446997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447020, "dur": 1, "args": { "External id": 138324, "cbid": 251, "correlation": 138324 } }, { "ph": "f", "id": 138324, "pid": 76337, "tid": -914061504, "ts": 1716454223447020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447022, "dur": 0, "args": { "External id": 138325, "cbid": 251, "correlation": 138325 } }, { "ph": "f", "id": 138325, "pid": 76337, "tid": -914061504, "ts": 1716454223447022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447022, "dur": 0, "args": { "External id": 138326, "cbid": 251, "correlation": 138326 } }, { "ph": "f", "id": 138326, "pid": 76337, "tid": -914061504, "ts": 1716454223447022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447023, "dur": 0, "args": { "External id": 138327, "cbid": 251, "correlation": 138327 } }, { "ph": "f", "id": 138327, "pid": 76337, "tid": -914061504, "ts": 1716454223447023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447024, "dur": 0, "args": { "External id": 138328, "cbid": 251, "correlation": 138328 } }, { "ph": "f", "id": 138328, "pid": 76337, "tid": -914061504, "ts": 1716454223447024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447025, "dur": 0, "args": { "External id": 138329, "cbid": 251, "correlation": 138329 } }, { "ph": "f", "id": 138329, "pid": 76337, "tid": -914061504, "ts": 1716454223447025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447026, "dur": 0, "args": { "External id": 138330, "cbid": 251, "correlation": 138330 } }, { "ph": "f", "id": 138330, "pid": 76337, "tid": -914061504, "ts": 1716454223447026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447027, "dur": 0, "args": { "External id": 138331, "cbid": 251, "correlation": 138331 } }, { "ph": "f", "id": 138331, "pid": 76337, "tid": -914061504, "ts": 1716454223447027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447028, "dur": 0, "args": { "External id": 138332, "cbid": 251, "correlation": 138332 } }, { "ph": "f", "id": 138332, "pid": 76337, "tid": -914061504, "ts": 1716454223447028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223594388, "dur": 118, "args": { "External id": 138333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138333, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 138333, "pid": 5, "tid": 7, "ts": 1716454223594388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447030, "dur": 33, "args": { "External id": 138333, "cbid": 211, "correlation": 138333 } }, { "ph": "s", "id": 138333, "pid": 76337, "tid": -914061504, "ts": 1716454223447030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223594508, "dur": 60, "args": { "External id": 138339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138339, "pid": 5, "tid": 7, "ts": 1716454223594508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447085, "dur": 279, "args": { "External id": 138339, "cbid": 211, "correlation": 138339 } }, { "ph": "s", "id": 138339, "pid": 76337, "tid": -914061504, "ts": 1716454223447085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223594569, "dur": 50, "args": { "External id": 138347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138347, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138347, "pid": 5, "tid": 7, "ts": 1716454223594569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447388, "dur": 8, "args": { "External id": 138347, "cbid": 211, "correlation": 138347 } }, { "ph": "s", "id": 138347, "pid": 76337, "tid": -914061504, "ts": 1716454223447388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223594620, "dur": 53, "args": { "External id": 138367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138367, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 138367, "pid": 5, "tid": 7, "ts": 1716454223594620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447467, "dur": 12, "args": { "External id": 138367, "cbid": 211, "correlation": 138367 } }, { "ph": "s", "id": 138367, "pid": 76337, "tid": -914061504, "ts": 1716454223447467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223594674, "dur": 4, "args": { "External id": 138379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138379, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 138379, "pid": 5, "tid": 7, "ts": 1716454223594674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447490, "dur": 11, "args": { "External id": 138379, "cbid": 211, "correlation": 138379 } }, { "ph": "s", "id": 138379, "pid": 76337, "tid": -914061504, "ts": 1716454223447490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223594680, "dur": 55, "args": { "External id": 138382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138382, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138382, "pid": 5, "tid": 7, "ts": 1716454223594680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447512, "dur": 108, "args": { "External id": 138382, "cbid": 211, "correlation": 138382 } }, { "ph": "s", "id": 138382, "pid": 76337, "tid": -914061504, "ts": 1716454223447512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223594737, "dur": 37, "args": { "External id": 138391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138391, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138391, "pid": 5, "tid": 7, "ts": 1716454223594737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447659, "dur": 10, "args": { "External id": 138391, "cbid": 211, "correlation": 138391 } }, { "ph": "s", "id": 138391, "pid": 76337, "tid": -914061504, "ts": 1716454223447659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223447714, "dur": 0, "args": { "External id": 138401, "cbid": 317, "correlation": 138401 } }, { "ph": "f", "id": 138401, "pid": 76337, "tid": -914061504, "ts": 1716454223447714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223447715, "dur": 0, "args": { "External id": 138402, "cbid": 203, "correlation": 138402 } }, { "ph": "f", "id": 138402, "pid": 76337, "tid": -914061504, "ts": 1716454223447715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223447716, "dur": 0, "args": { "External id": 138403, "cbid": 205, "correlation": 138403 } }, { "ph": "f", "id": 138403, "pid": 76337, "tid": -914061504, "ts": 1716454223447716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223594775, "dur": 41, "args": { "External id": 138407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138407, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138407, "pid": 5, "tid": 7, "ts": 1716454223594775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447731, "dur": 12, "args": { "External id": 138407, "cbid": 211, "correlation": 138407 } }, { "ph": "s", "id": 138407, "pid": 76337, "tid": -914061504, "ts": 1716454223447731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223594818, "dur": 3, "args": { "External id": 138409, "device": 5, "context": 1, "stream": 7, "correlation": 138409, "bytes": 46080, "memory bandwidth (GB/s)": 12.203389830508474 } }, { "ph": "f", "id": 138409, "pid": 5, "tid": 7, "ts": 1716454223594818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223447747, "dur": 17, "args": { "External id": 138409, "cbid": 51, "correlation": 138409 } }, { "ph": "s", "id": 138409, "pid": 76337, "tid": -914061504, "ts": 1716454223447747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223447769, "dur": 1, "args": { "External id": 138411, "cbid": 200, "correlation": 138411 } }, { "ph": "f", "id": 138411, "pid": 76337, "tid": -914061504, "ts": 1716454223447769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223447771, "dur": 0, "args": { "External id": 138412, "cbid": 200, "correlation": 138412 } }, { "ph": "f", "id": 138412, "pid": 76337, "tid": -914061504, "ts": 1716454223447771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223447772, "dur": 0, "args": { "External id": 138413, "cbid": 200, "correlation": 138413 } }, { "ph": "f", "id": 138413, "pid": 76337, "tid": -914061504, "ts": 1716454223447772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223447772, "dur": 0, "args": { "External id": 138414, "cbid": 200, "correlation": 138414 } }, { "ph": "f", "id": 138414, "pid": 76337, "tid": -914061504, "ts": 1716454223447772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454223447773, "dur": 4, "args": { "External id": 138415, "cbid": 15, "correlation": 138415 } }, { "ph": "f", "id": 138415, "pid": 76337, "tid": -914061504, "ts": 1716454223447773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223447778, "dur": 1, "args": { "External id": 138416, "cbid": 251, "correlation": 138416 } }, { "ph": "f", "id": 138416, "pid": 76337, "tid": -914061504, "ts": 1716454223447778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454223594823, "dur": 24, "args": { "External id": 138417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138417, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138417, "pid": 5, "tid": 7, "ts": 1716454223594823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447781, "dur": 8, "args": { "External id": 138417, "cbid": 211, "correlation": 138417 } }, { "ph": "s", "id": 138417, "pid": 76337, "tid": -914061504, "ts": 1716454223447781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223594848, "dur": 4, "args": { "External id": 138419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 138419, "pid": 5, "tid": 7, "ts": 1716454223594848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447795, "dur": 6, "args": { "External id": 138419, "cbid": 211, "correlation": 138419 } }, { "ph": "s", "id": 138419, "pid": 76337, "tid": -914061504, "ts": 1716454223447795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223447804, "dur": 0, "args": { "External id": 138420, "cbid": 51, "correlation": 138420 } }, { "ph": "s", "id": 138420, "pid": 76337, "tid": -914061504, "ts": 1716454223447804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223594853, "dur": 191, "args": { "External id": 138421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138421, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138421, "pid": 5, "tid": 7, "ts": 1716454223594853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223447805, "dur": 209, "args": { "External id": 138421, "cbid": 211, "correlation": 138421 } }, { "ph": "s", "id": 138421, "pid": 76337, "tid": -914061504, "ts": 1716454223447805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223595045, "dur": 6, "args": { "External id": 138422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138422, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138422, "pid": 5, "tid": 7, "ts": 1716454223595045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223448018, "dur": 6, "args": { "External id": 138422, "cbid": 211, "correlation": 138422 } }, { "ph": "s", "id": 138422, "pid": 76337, "tid": -914061504, "ts": 1716454223448018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223595053, "dur": 5, "args": { "External id": 138428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 138428, "pid": 5, "tid": 7, "ts": 1716454223595053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223448049, "dur": 9, "args": { "External id": 138428, "cbid": 211, "correlation": 138428 } }, { "ph": "s", "id": 138428, "pid": 76337, "tid": -914061504, "ts": 1716454223448049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595060, "dur": 3, "args": { "External id": 138436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138436, "pid": 5, "tid": 7, "ts": 1716454223595060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223449717, "dur": 21, "args": { "External id": 138436, "cbid": 211, "correlation": 138436 } }, { "ph": "s", "id": 138436, "pid": 76337, "tid": -914061504, "ts": 1716454223449717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595064, "dur": 3, "args": { "External id": 138444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138444, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138444, "pid": 5, "tid": 7, "ts": 1716454223595064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223449766, "dur": 12, "args": { "External id": 138444, "cbid": 211, "correlation": 138444 } }, { "ph": "s", "id": 138444, "pid": 76337, "tid": -914061504, "ts": 1716454223449766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595068, "dur": 3, "args": { "External id": 138452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138452, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138452, "pid": 5, "tid": 7, "ts": 1716454223595068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223449797, "dur": 8, "args": { "External id": 138452, "cbid": 211, "correlation": 138452 } }, { "ph": "s", "id": 138452, "pid": 76337, "tid": -914061504, "ts": 1716454223449797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595072, "dur": 3, "args": { "External id": 138461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138461, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138461, "pid": 5, "tid": 7, "ts": 1716454223595072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223449987, "dur": 14, "args": { "External id": 138461, "cbid": 211, "correlation": 138461 } }, { "ph": "s", "id": 138461, "pid": 76337, "tid": -914061504, "ts": 1716454223449987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595076, "dur": 3, "args": { "External id": 138470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138470, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138470, "pid": 5, "tid": 7, "ts": 1716454223595076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223450017, "dur": 7, "args": { "External id": 138470, "cbid": 211, "correlation": 138470 } }, { "ph": "s", "id": 138470, "pid": 76337, "tid": -914061504, "ts": 1716454223450017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595080, "dur": 3, "args": { "External id": 138478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138478, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138478, "pid": 5, "tid": 7, "ts": 1716454223595080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223450045, "dur": 9, "args": { "External id": 138478, "cbid": 211, "correlation": 138478 } }, { "ph": "s", "id": 138478, "pid": 76337, "tid": -914061504, "ts": 1716454223450045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595085, "dur": 3, "args": { "External id": 138486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138486, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138486, "pid": 5, "tid": 7, "ts": 1716454223595085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223450306, "dur": 15, "args": { "External id": 138486, "cbid": 211, "correlation": 138486 } }, { "ph": "s", "id": 138486, "pid": 76337, "tid": -914061504, "ts": 1716454223450306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595089, "dur": 3, "args": { "External id": 138494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138494, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138494, "pid": 5, "tid": 7, "ts": 1716454223595089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223450336, "dur": 8, "args": { "External id": 138494, "cbid": 211, "correlation": 138494 } }, { "ph": "s", "id": 138494, "pid": 76337, "tid": -914061504, "ts": 1716454223450336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595094, "dur": 1, "args": { "External id": 138504, "device": 5, "context": 1, "stream": 7, "correlation": 138504, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 138504, "pid": 5, "tid": 7, "ts": 1716454223595094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223450405, "dur": 36, "args": { "External id": 138504, "cbid": 41, "correlation": 138504 } }, { "ph": "s", "id": 138504, "pid": 76337, "tid": -914061504, "ts": 1716454223450405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223450442, "dur": 144670, "args": { "External id": 138505, "cbid": 131, "correlation": 138505 } }, { "ph": "f", "id": 138505, "pid": 76337, "tid": -914061504, "ts": 1716454223450442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223595305, "dur": 3, "args": { "External id": 138513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138513, "pid": 5, "tid": 7, "ts": 1716454223595305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595273, "dur": 35, "args": { "External id": 138513, "cbid": 211, "correlation": 138513 } }, { "ph": "s", "id": 138513, "pid": 76337, "tid": -914061504, "ts": 1716454223595273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595409, "dur": 3, "args": { "External id": 138522, "device": 5, "context": 1, "stream": 7, "correlation": 138522, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 138522, "pid": 5, "tid": 7, "ts": 1716454223595409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223595372, "dur": 38, "args": { "External id": 138522, "cbid": 41, "correlation": 138522 } }, { "ph": "s", "id": 138522, "pid": 76337, "tid": -914061504, "ts": 1716454223595372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223595511, "dur": 4, "args": { "External id": 138532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138532, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138532, "pid": 5, "tid": 7, "ts": 1716454223595511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595495, "dur": 17, "args": { "External id": 138532, "cbid": 211, "correlation": 138532 } }, { "ph": "s", "id": 138532, "pid": 76337, "tid": -914061504, "ts": 1716454223595495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595594, "dur": 1, "args": { "External id": 138542, "device": 5, "context": 1, "stream": 7, "correlation": 138542, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 138542, "pid": 5, "tid": 7, "ts": 1716454223595594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223595574, "dur": 18, "args": { "External id": 138542, "cbid": 41, "correlation": 138542 } }, { "ph": "s", "id": 138542, "pid": 76337, "tid": -914061504, "ts": 1716454223595574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223595593, "dur": 8, "args": { "External id": 138543, "cbid": 131, "correlation": 138543 } }, { "ph": "f", "id": 138543, "pid": 76337, "tid": -914061504, "ts": 1716454223595593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595672, "dur": 3, "args": { "External id": 138550, "device": 5, "context": 1, "stream": 7, "correlation": 138550, "bytes": 98304, "memory bandwidth (GB/s)": 30.11764705882353 } }, { "ph": "f", "id": 138550, "pid": 5, "tid": 7, "ts": 1716454223595672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223595645, "dur": 26, "args": { "External id": 138550, "cbid": 41, "correlation": 138550 } }, { "ph": "s", "id": 138550, "pid": 76337, "tid": -914061504, "ts": 1716454223595645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595764, "dur": 3, "args": { "External id": 138569, "device": 5, "context": 1, "stream": 7, "correlation": 138569, "bytes": 16, "memory bandwidth (GB/s)": 0.005263157894736842 } }, { "ph": "f", "id": 138569, "pid": 5, "tid": 7, "ts": 1716454223595764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223595745, "dur": 18, "args": { "External id": 138569, "cbid": 41, "correlation": 138569 } }, { "ph": "s", "id": 138569, "pid": 76337, "tid": -914061504, "ts": 1716454223595745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223595804, "dur": 3, "args": { "External id": 138575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138575, "pid": 5, "tid": 7, "ts": 1716454223595804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595793, "dur": 12, "args": { "External id": 138575, "cbid": 211, "correlation": 138575 } }, { "ph": "s", "id": 138575, "pid": 76337, "tid": -914061504, "ts": 1716454223595793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454223595819, "dur": 6, "args": { "External id": 138577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138577, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 138577, "pid": 5, "tid": 7, "ts": 1716454223595819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595809, "dur": 9, "args": { "External id": 138577, "cbid": 211, "correlation": 138577 } }, { "ph": "s", "id": 138577, "pid": 76337, "tid": -914061504, "ts": 1716454223595809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454223595827, "dur": 3, "args": { "External id": 138579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138579, "pid": 5, "tid": 7, "ts": 1716454223595827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595819, "dur": 7, "args": { "External id": 138579, "cbid": 211, "correlation": 138579 } }, { "ph": "s", "id": 138579, "pid": 76337, "tid": -914061504, "ts": 1716454223595819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223595862, "dur": 2, "args": { "External id": 138587, "device": 5, "context": 1, "stream": 7, "correlation": 138587, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 138587, "pid": 5, "tid": 7, "ts": 1716454223595862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223595848, "dur": 13, "args": { "External id": 138587, "cbid": 41, "correlation": 138587 } }, { "ph": "s", "id": 138587, "pid": 76337, "tid": -914061504, "ts": 1716454223595848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223595912, "dur": 3, "args": { "External id": 138601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138601, "pid": 5, "tid": 7, "ts": 1716454223595912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595901, "dur": 12, "args": { "External id": 138601, "cbid": 211, "correlation": 138601 } }, { "ph": "s", "id": 138601, "pid": 76337, "tid": -914061504, "ts": 1716454223595901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223595931, "dur": 2, "args": { "External id": 138615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138615, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138615, "pid": 5, "tid": 7, "ts": 1716454223595931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595924, "dur": 6, "args": { "External id": 138615, "cbid": 211, "correlation": 138615 } }, { "ph": "s", "id": 138615, "pid": 76337, "tid": -914061504, "ts": 1716454223595924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223595969, "dur": 6, "args": { "External id": 138622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138622, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138622, "pid": 5, "tid": 7, "ts": 1716454223595969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595959, "dur": 11, "args": { "External id": 138622, "cbid": 211, "correlation": 138622 } }, { "ph": "s", "id": 138622, "pid": 76337, "tid": -914061504, "ts": 1716454223595959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223595988, "dur": 6, "args": { "External id": 138625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138625, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138625, "pid": 5, "tid": 7, "ts": 1716454223595988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595980, "dur": 8, "args": { "External id": 138625, "cbid": 211, "correlation": 138625 } }, { "ph": "s", "id": 138625, "pid": 76337, "tid": -914061504, "ts": 1716454223595980, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454223595998, "dur": 3, "args": { "External id": 138627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138627, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138627, "pid": 5, "tid": 7, "ts": 1716454223595998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223595990, "dur": 7, "args": { "External id": 138627, "cbid": 211, "correlation": 138627 } }, { "ph": "s", "id": 138627, "pid": 76337, "tid": -914061504, "ts": 1716454223595990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223596019, "dur": 2, "args": { "External id": 138630, "device": 5, "context": 1, "stream": 7, "correlation": 138630, "bytes": 8, "memory bandwidth (GB/s)": 0.002717391304347826 } }, { "ph": "f", "id": 138630, "pid": 5, "tid": 7, "ts": 1716454223596019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223596006, "dur": 12, "args": { "External id": 138630, "cbid": 41, "correlation": 138630 } }, { "ph": "s", "id": 138630, "pid": 76337, "tid": -914061504, "ts": 1716454223596006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223596072, "dur": 4, "args": { "External id": 138646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138646, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138646, "pid": 5, "tid": 7, "ts": 1716454223596072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596059, "dur": 13, "args": { "External id": 138646, "cbid": 211, "correlation": 138646 } }, { "ph": "s", "id": 138646, "pid": 76337, "tid": -914061504, "ts": 1716454223596059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223596092, "dur": 3, "args": { "External id": 138651, "device": 5, "context": 1, "stream": 7, "correlation": 138651, "bytes": 1, "memory bandwidth (GB/s)": 0.0003094059405940594 } }, { "ph": "f", "id": 138651, "pid": 5, "tid": 7, "ts": 1716454223596092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223596077, "dur": 14, "args": { "External id": 138651, "cbid": 41, "correlation": 138651 } }, { "ph": "s", "id": 138651, "pid": 76337, "tid": -914061504, "ts": 1716454223596077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223596119, "dur": 1, "args": { "External id": 138657, "device": 5, "context": 1, "stream": 7, "correlation": 138657, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 138657, "pid": 5, "tid": 7, "ts": 1716454223596119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223596101, "dur": 27, "args": { "External id": 138657, "cbid": 41, "correlation": 138657 } }, { "ph": "s", "id": 138657, "pid": 76337, "tid": -914061504, "ts": 1716454223596101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223596129, "dur": 4, "args": { "External id": 138658, "cbid": 131, "correlation": 138658 } }, { "ph": "f", "id": 138658, "pid": 76337, "tid": -914061504, "ts": 1716454223596129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596181, "dur": 3, "args": { "External id": 138666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138666, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138666, "pid": 5, "tid": 7, "ts": 1716454223596181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596168, "dur": 13, "args": { "External id": 138666, "cbid": 211, "correlation": 138666 } }, { "ph": "s", "id": 138666, "pid": 76337, "tid": -914061504, "ts": 1716454223596168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596212, "dur": 3, "args": { "External id": 138676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138676, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138676, "pid": 5, "tid": 7, "ts": 1716454223596212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596203, "dur": 8, "args": { "External id": 138676, "cbid": 211, "correlation": 138676 } }, { "ph": "s", "id": 138676, "pid": 76337, "tid": -914061504, "ts": 1716454223596203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596243, "dur": 3, "args": { "External id": 138685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138685, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138685, "pid": 5, "tid": 7, "ts": 1716454223596243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596229, "dur": 13, "args": { "External id": 138685, "cbid": 211, "correlation": 138685 } }, { "ph": "s", "id": 138685, "pid": 76337, "tid": -914061504, "ts": 1716454223596229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223596363, "dur": 12, "args": { "External id": 138695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138695, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138695, "pid": 5, "tid": 7, "ts": 1716454223596363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596349, "dur": 14, "args": { "External id": 138695, "cbid": 211, "correlation": 138695 } }, { "ph": "s", "id": 138695, "pid": 76337, "tid": -914061504, "ts": 1716454223596349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596402, "dur": 3, "args": { "External id": 138703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138703, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138703, "pid": 5, "tid": 7, "ts": 1716454223596402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596392, "dur": 9, "args": { "External id": 138703, "cbid": 211, "correlation": 138703 } }, { "ph": "s", "id": 138703, "pid": 76337, "tid": -914061504, "ts": 1716454223596392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223596448, "dur": 11, "args": { "External id": 138713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138713, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138713, "pid": 5, "tid": 7, "ts": 1716454223596448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596437, "dur": 11, "args": { "External id": 138713, "cbid": 211, "correlation": 138713 } }, { "ph": "s", "id": 138713, "pid": 76337, "tid": -914061504, "ts": 1716454223596437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223596481, "dur": 10, "args": { "External id": 138721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138721, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138721, "pid": 5, "tid": 7, "ts": 1716454223596481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596472, "dur": 9, "args": { "External id": 138721, "cbid": 211, "correlation": 138721 } }, { "ph": "s", "id": 138721, "pid": 76337, "tid": -914061504, "ts": 1716454223596472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596508, "dur": 3, "args": { "External id": 138730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138730, "pid": 5, "tid": 7, "ts": 1716454223596508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596500, "dur": 9, "args": { "External id": 138730, "cbid": 211, "correlation": 138730 } }, { "ph": "s", "id": 138730, "pid": 76337, "tid": -914061504, "ts": 1716454223596500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223596533, "dur": 5, "args": { "External id": 138739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138739, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138739, "pid": 5, "tid": 7, "ts": 1716454223596533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596524, "dur": 8, "args": { "External id": 138739, "cbid": 211, "correlation": 138739 } }, { "ph": "s", "id": 138739, "pid": 76337, "tid": -914061504, "ts": 1716454223596524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223596571, "dur": 8, "args": { "External id": 138749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138749, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138749, "pid": 5, "tid": 7, "ts": 1716454223596571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596561, "dur": 10, "args": { "External id": 138749, "cbid": 211, "correlation": 138749 } }, { "ph": "s", "id": 138749, "pid": 76337, "tid": -914061504, "ts": 1716454223596561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596898, "dur": 3, "args": { "External id": 138758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138758, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138758, "pid": 5, "tid": 7, "ts": 1716454223596898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596883, "dur": 15, "args": { "External id": 138758, "cbid": 211, "correlation": 138758 } }, { "ph": "s", "id": 138758, "pid": 76337, "tid": -914061504, "ts": 1716454223596883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223596928, "dur": 3, "args": { "External id": 138766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138766, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138766, "pid": 5, "tid": 7, "ts": 1716454223596928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223596918, "dur": 9, "args": { "External id": 138766, "cbid": 211, "correlation": 138766 } }, { "ph": "s", "id": 138766, "pid": 76337, "tid": -914061504, "ts": 1716454223596918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223596987, "dur": 1, "args": { "External id": 138776, "device": 5, "context": 1, "stream": 7, "correlation": 138776, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 138776, "pid": 5, "tid": 7, "ts": 1716454223596987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223596964, "dur": 21, "args": { "External id": 138776, "cbid": 41, "correlation": 138776 } }, { "ph": "s", "id": 138776, "pid": 76337, "tid": -914061504, "ts": 1716454223596964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223596986, "dur": 7, "args": { "External id": 138777, "cbid": 131, "correlation": 138777 } }, { "ph": "f", "id": 138777, "pid": 76337, "tid": -914061504, "ts": 1716454223596986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597075, "dur": 2, "args": { "External id": 138785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138785, "pid": 5, "tid": 7, "ts": 1716454223597075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597061, "dur": 14, "args": { "External id": 138785, "cbid": 211, "correlation": 138785 } }, { "ph": "s", "id": 138785, "pid": 76337, "tid": -914061504, "ts": 1716454223597061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597146, "dur": 3, "args": { "External id": 138794, "device": 5, "context": 1, "stream": 7, "correlation": 138794, "bytes": 8, "memory bandwidth (GB/s)": 0.002577319587628866 } }, { "ph": "f", "id": 138794, "pid": 5, "tid": 7, "ts": 1716454223597146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597129, "dur": 17, "args": { "External id": 138794, "cbid": 41, "correlation": 138794 } }, { "ph": "s", "id": 138794, "pid": 76337, "tid": -914061504, "ts": 1716454223597129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223597216, "dur": 3, "args": { "External id": 138804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138804, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 138804, "pid": 5, "tid": 7, "ts": 1716454223597216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597202, "dur": 14, "args": { "External id": 138804, "cbid": 211, "correlation": 138804 } }, { "ph": "s", "id": 138804, "pid": 76337, "tid": -914061504, "ts": 1716454223597202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597267, "dur": 1, "args": { "External id": 138814, "device": 5, "context": 1, "stream": 7, "correlation": 138814, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 138814, "pid": 5, "tid": 7, "ts": 1716454223597267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597253, "dur": 12, "args": { "External id": 138814, "cbid": 41, "correlation": 138814 } }, { "ph": "s", "id": 138814, "pid": 76337, "tid": -914061504, "ts": 1716454223597253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223597266, "dur": 8, "args": { "External id": 138815, "cbid": 131, "correlation": 138815 } }, { "ph": "f", "id": 138815, "pid": 76337, "tid": -914061504, "ts": 1716454223597266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597367, "dur": 3, "args": { "External id": 138822, "device": 5, "context": 1, "stream": 7, "correlation": 138822, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 138822, "pid": 5, "tid": 7, "ts": 1716454223597367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597307, "dur": 60, "args": { "External id": 138822, "cbid": 41, "correlation": 138822 } }, { "ph": "s", "id": 138822, "pid": 76337, "tid": -914061504, "ts": 1716454223597307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597419, "dur": 1, "args": { "External id": 138833, "device": 5, "context": 1, "stream": 7, "correlation": 138833, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 138833, "pid": 5, "tid": 7, "ts": 1716454223597419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597406, "dur": 12, "args": { "External id": 138833, "cbid": 41, "correlation": 138833 } }, { "ph": "s", "id": 138833, "pid": 76337, "tid": -914061504, "ts": 1716454223597406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223597418, "dur": 8, "args": { "External id": 138834, "cbid": 131, "correlation": 138834 } }, { "ph": "f", "id": 138834, "pid": 76337, "tid": -914061504, "ts": 1716454223597418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597469, "dur": 3, "args": { "External id": 138842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138842, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138842, "pid": 5, "tid": 7, "ts": 1716454223597469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597455, "dur": 13, "args": { "External id": 138842, "cbid": 211, "correlation": 138842 } }, { "ph": "s", "id": 138842, "pid": 76337, "tid": -914061504, "ts": 1716454223597455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597497, "dur": 3, "args": { "External id": 138852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138852, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138852, "pid": 5, "tid": 7, "ts": 1716454223597497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597488, "dur": 8, "args": { "External id": 138852, "cbid": 211, "correlation": 138852 } }, { "ph": "s", "id": 138852, "pid": 76337, "tid": -914061504, "ts": 1716454223597488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597521, "dur": 3, "args": { "External id": 138861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138861, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138861, "pid": 5, "tid": 7, "ts": 1716454223597521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597512, "dur": 8, "args": { "External id": 138861, "cbid": 211, "correlation": 138861 } }, { "ph": "s", "id": 138861, "pid": 76337, "tid": -914061504, "ts": 1716454223597512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223597590, "dur": 6, "args": { "External id": 138869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138869, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138869, "pid": 5, "tid": 7, "ts": 1716454223597590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597577, "dur": 13, "args": { "External id": 138869, "cbid": 211, "correlation": 138869 } }, { "ph": "s", "id": 138869, "pid": 76337, "tid": -914061504, "ts": 1716454223597577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597630, "dur": 3, "args": { "External id": 138878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138878, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138878, "pid": 5, "tid": 7, "ts": 1716454223597630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597620, "dur": 9, "args": { "External id": 138878, "cbid": 211, "correlation": 138878 } }, { "ph": "s", "id": 138878, "pid": 76337, "tid": -914061504, "ts": 1716454223597620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597652, "dur": 3, "args": { "External id": 138887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138887, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138887, "pid": 5, "tid": 7, "ts": 1716454223597652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597644, "dur": 7, "args": { "External id": 138887, "cbid": 211, "correlation": 138887 } }, { "ph": "s", "id": 138887, "pid": 76337, "tid": -914061504, "ts": 1716454223597644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223597715, "dur": 3, "args": { "External id": 138895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138895, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 138895, "pid": 5, "tid": 7, "ts": 1716454223597715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597704, "dur": 10, "args": { "External id": 138895, "cbid": 211, "correlation": 138895 } }, { "ph": "s", "id": 138895, "pid": 76337, "tid": -914061504, "ts": 1716454223597704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223597775, "dur": 1, "args": { "External id": 138903, "device": 5, "context": 1, "stream": 7, "correlation": 138903, "bytes": 8, "memory bandwidth (GB/s)": 0.0043859649122807015 } }, { "ph": "f", "id": 138903, "pid": 5, "tid": 7, "ts": 1716454223597775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597759, "dur": 26, "args": { "External id": 138903, "cbid": 41, "correlation": 138903 } }, { "ph": "s", "id": 138903, "pid": 76337, "tid": -914061504, "ts": 1716454223597759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223597785, "dur": 3, "args": { "External id": 138904, "cbid": 131, "correlation": 138904 } }, { "ph": "f", "id": 138904, "pid": 76337, "tid": -914061504, "ts": 1716454223597785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597846, "dur": 1, "args": { "External id": 138914, "device": 5, "context": 1, "stream": 7, "correlation": 138914, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 138914, "pid": 5, "tid": 7, "ts": 1716454223597846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597834, "dur": 10, "args": { "External id": 138914, "cbid": 41, "correlation": 138914 } }, { "ph": "s", "id": 138914, "pid": 76337, "tid": -914061504, "ts": 1716454223597834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223597845, "dur": 8, "args": { "External id": 138915, "cbid": 131, "correlation": 138915 } }, { "ph": "f", "id": 138915, "pid": 76337, "tid": -914061504, "ts": 1716454223597845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223597904, "dur": 1, "args": { "External id": 138924, "device": 5, "context": 1, "stream": 7, "correlation": 138924, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 138924, "pid": 5, "tid": 7, "ts": 1716454223597904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223597893, "dur": 8, "args": { "External id": 138924, "cbid": 41, "correlation": 138924 } }, { "ph": "s", "id": 138924, "pid": 76337, "tid": -914061504, "ts": 1716454223597893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223597903, "dur": 7, "args": { "External id": 138925, "cbid": 131, "correlation": 138925 } }, { "ph": "f", "id": 138925, "pid": 76337, "tid": -914061504, "ts": 1716454223597903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223597984, "dur": 4, "args": { "External id": 138932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138932, "pid": 5, "tid": 7, "ts": 1716454223597984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223597961, "dur": 25, "args": { "External id": 138932, "cbid": 211, "correlation": 138932 } }, { "ph": "s", "id": 138932, "pid": 76337, "tid": -914061504, "ts": 1716454223597961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454223598023, "dur": 4, "args": { "External id": 138952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138952, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138952, "pid": 5, "tid": 7, "ts": 1716454223598023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598011, "dur": 13, "args": { "External id": 138952, "cbid": 211, "correlation": 138952 } }, { "ph": "s", "id": 138952, "pid": 76337, "tid": -914061504, "ts": 1716454223598011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598025, "dur": 0, "args": { "External id": 138953, "cbid": 11, "correlation": 138953 } }, { "ph": "f", "id": 138953, "pid": 76337, "tid": -914061504, "ts": 1716454223598025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598026, "dur": 0, "args": { "External id": 138954, "cbid": 11, "correlation": 138954 } }, { "ph": "f", "id": 138954, "pid": 76337, "tid": -914061504, "ts": 1716454223598026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223598040, "dur": 1, "args": { "External id": 138957, "device": 5, "context": 1, "stream": 7, "correlation": 138957, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 138957, "pid": 5, "tid": 7, "ts": 1716454223598040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223598027, "dur": 22, "args": { "External id": 138957, "cbid": 41, "correlation": 138957 } }, { "ph": "s", "id": 138957, "pid": 76337, "tid": -914061504, "ts": 1716454223598027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223598050, "dur": 3, "args": { "External id": 138958, "cbid": 131, "correlation": 138958 } }, { "ph": "f", "id": 138958, "pid": 76337, "tid": -914061504, "ts": 1716454223598050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223598077, "dur": 3, "args": { "External id": 138982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138982, "pid": 5, "tid": 7, "ts": 1716454223598077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598068, "dur": 9, "args": { "External id": 138982, "cbid": 211, "correlation": 138982 } }, { "ph": "s", "id": 138982, "pid": 76337, "tid": -914061504, "ts": 1716454223598068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598078, "dur": 0, "args": { "External id": 138983, "cbid": 11, "correlation": 138983 } }, { "ph": "f", "id": 138983, "pid": 76337, "tid": -914061504, "ts": 1716454223598078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598078, "dur": 0, "args": { "External id": 138984, "cbid": 11, "correlation": 138984 } }, { "ph": "f", "id": 138984, "pid": 76337, "tid": -914061504, "ts": 1716454223598078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223598080, "dur": 1, "args": { "External id": 138986, "cbid": 200, "correlation": 138986 } }, { "ph": "f", "id": 138986, "pid": 76337, "tid": -914061504, "ts": 1716454223598080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454223598090, "dur": 4, "args": { "External id": 138988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 138988, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 138988, "pid": 5, "tid": 7, "ts": 1716454223598090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598083, "dur": 8, "args": { "External id": 138988, "cbid": 211, "correlation": 138988 } }, { "ph": "s", "id": 138988, "pid": 76337, "tid": -914061504, "ts": 1716454223598083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598091, "dur": 0, "args": { "External id": 138989, "cbid": 11, "correlation": 138989 } }, { "ph": "f", "id": 138989, "pid": 76337, "tid": -914061504, "ts": 1716454223598091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223598092, "dur": 0, "args": { "External id": 138990, "cbid": 11, "correlation": 138990 } }, { "ph": "f", "id": 138990, "pid": 76337, "tid": -914061504, "ts": 1716454223598092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223598128, "dur": 1, "args": { "External id": 138997, "device": 5, "context": 1, "stream": 7, "correlation": 138997, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 138997, "pid": 5, "tid": 7, "ts": 1716454223598128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223598117, "dur": 20, "args": { "External id": 138997, "cbid": 41, "correlation": 138997 } }, { "ph": "s", "id": 138997, "pid": 76337, "tid": -914061504, "ts": 1716454223598117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223598137, "dur": 3, "args": { "External id": 138998, "cbid": 131, "correlation": 138998 } }, { "ph": "f", "id": 138998, "pid": 76337, "tid": -914061504, "ts": 1716454223598137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223598188, "dur": 1, "args": { "External id": 139008, "device": 5, "context": 1, "stream": 7, "correlation": 139008, "bytes": 8, "memory bandwidth (GB/s)": 0.00510204081632653 } }, { "ph": "f", "id": 139008, "pid": 5, "tid": 7, "ts": 1716454223598188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223598176, "dur": 9, "args": { "External id": 139008, "cbid": 41, "correlation": 139008 } }, { "ph": "s", "id": 139008, "pid": 76337, "tid": -914061504, "ts": 1716454223598176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223598186, "dur": 8, "args": { "External id": 139009, "cbid": 131, "correlation": 139009 } }, { "ph": "f", "id": 139009, "pid": 76337, "tid": -914061504, "ts": 1716454223598186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223598259, "dur": 5, "args": { "External id": 139016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139016, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139016, "pid": 5, "tid": 7, "ts": 1716454223598259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598243, "dur": 16, "args": { "External id": 139016, "cbid": 211, "correlation": 139016 } }, { "ph": "s", "id": 139016, "pid": 76337, "tid": -914061504, "ts": 1716454223598243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598329, "dur": 3, "args": { "External id": 139025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139025, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139025, "pid": 5, "tid": 7, "ts": 1716454223598329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598317, "dur": 12, "args": { "External id": 139025, "cbid": 211, "correlation": 139025 } }, { "ph": "s", "id": 139025, "pid": 76337, "tid": -914061504, "ts": 1716454223598317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598366, "dur": 3, "args": { "External id": 139033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139033, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139033, "pid": 5, "tid": 7, "ts": 1716454223598366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598356, "dur": 10, "args": { "External id": 139033, "cbid": 211, "correlation": 139033 } }, { "ph": "s", "id": 139033, "pid": 76337, "tid": -914061504, "ts": 1716454223598356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598399, "dur": 4, "args": { "External id": 139041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139041, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139041, "pid": 5, "tid": 7, "ts": 1716454223598399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598387, "dur": 12, "args": { "External id": 139041, "cbid": 211, "correlation": 139041 } }, { "ph": "s", "id": 139041, "pid": 76337, "tid": -914061504, "ts": 1716454223598387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598428, "dur": 4, "args": { "External id": 139049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139049, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139049, "pid": 5, "tid": 7, "ts": 1716454223598428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598418, "dur": 9, "args": { "External id": 139049, "cbid": 211, "correlation": 139049 } }, { "ph": "s", "id": 139049, "pid": 76337, "tid": -914061504, "ts": 1716454223598418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598455, "dur": 3, "args": { "External id": 139057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139057, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139057, "pid": 5, "tid": 7, "ts": 1716454223598455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598446, "dur": 8, "args": { "External id": 139057, "cbid": 211, "correlation": 139057 } }, { "ph": "s", "id": 139057, "pid": 76337, "tid": -914061504, "ts": 1716454223598446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598480, "dur": 3, "args": { "External id": 139065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139065, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139065, "pid": 5, "tid": 7, "ts": 1716454223598480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598471, "dur": 9, "args": { "External id": 139065, "cbid": 211, "correlation": 139065 } }, { "ph": "s", "id": 139065, "pid": 76337, "tid": -914061504, "ts": 1716454223598471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223598504, "dur": 4, "args": { "External id": 139073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139073, "pid": 5, "tid": 7, "ts": 1716454223598504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598495, "dur": 7, "args": { "External id": 139073, "cbid": 211, "correlation": 139073 } }, { "ph": "s", "id": 139073, "pid": 76337, "tid": -914061504, "ts": 1716454223598495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223598522, "dur": 5, "args": { "External id": 139081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139081, "pid": 5, "tid": 7, "ts": 1716454223598522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598514, "dur": 7, "args": { "External id": 139081, "cbid": 211, "correlation": 139081 } }, { "ph": "s", "id": 139081, "pid": 76337, "tid": -914061504, "ts": 1716454223598514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598542, "dur": 3, "args": { "External id": 139089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139089, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139089, "pid": 5, "tid": 7, "ts": 1716454223598542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598533, "dur": 7, "args": { "External id": 139089, "cbid": 211, "correlation": 139089 } }, { "ph": "s", "id": 139089, "pid": 76337, "tid": -914061504, "ts": 1716454223598533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598599, "dur": 3, "args": { "External id": 139097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139097, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139097, "pid": 5, "tid": 7, "ts": 1716454223598599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598587, "dur": 11, "args": { "External id": 139097, "cbid": 211, "correlation": 139097 } }, { "ph": "s", "id": 139097, "pid": 76337, "tid": -914061504, "ts": 1716454223598587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223598624, "dur": 4, "args": { "External id": 139105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139105, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139105, "pid": 5, "tid": 7, "ts": 1716454223598624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598615, "dur": 8, "args": { "External id": 139105, "cbid": 211, "correlation": 139105 } }, { "ph": "s", "id": 139105, "pid": 76337, "tid": -914061504, "ts": 1716454223598615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223598647, "dur": 4, "args": { "External id": 139113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139113, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139113, "pid": 5, "tid": 7, "ts": 1716454223598647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598638, "dur": 8, "args": { "External id": 139113, "cbid": 211, "correlation": 139113 } }, { "ph": "s", "id": 139113, "pid": 76337, "tid": -914061504, "ts": 1716454223598638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223598666, "dur": 3, "args": { "External id": 139121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139121, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139121, "pid": 5, "tid": 7, "ts": 1716454223598666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223598658, "dur": 6, "args": { "External id": 139121, "cbid": 211, "correlation": 139121 } }, { "ph": "s", "id": 139121, "pid": 76337, "tid": -914061504, "ts": 1716454223598658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223599069, "dur": 5, "args": { "External id": 139130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139130, "pid": 5, "tid": 7, "ts": 1716454223599069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599053, "dur": 16, "args": { "External id": 139130, "cbid": 211, "correlation": 139130 } }, { "ph": "s", "id": 139130, "pid": 76337, "tid": -914061504, "ts": 1716454223599053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223599107, "dur": 5, "args": { "External id": 139139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139139, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139139, "pid": 5, "tid": 7, "ts": 1716454223599107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599096, "dur": 10, "args": { "External id": 139139, "cbid": 211, "correlation": 139139 } }, { "ph": "s", "id": 139139, "pid": 76337, "tid": -914061504, "ts": 1716454223599096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223599237, "dur": 3, "args": { "External id": 139155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139155, "pid": 5, "tid": 7, "ts": 1716454223599237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599223, "dur": 14, "args": { "External id": 139155, "cbid": 211, "correlation": 139155 } }, { "ph": "s", "id": 139155, "pid": 76337, "tid": -914061504, "ts": 1716454223599223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599270, "dur": 3, "args": { "External id": 139163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139163, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139163, "pid": 5, "tid": 7, "ts": 1716454223599270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599260, "dur": 9, "args": { "External id": 139163, "cbid": 211, "correlation": 139163 } }, { "ph": "s", "id": 139163, "pid": 76337, "tid": -914061504, "ts": 1716454223599260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599301, "dur": 3, "args": { "External id": 139171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139171, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139171, "pid": 5, "tid": 7, "ts": 1716454223599301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599291, "dur": 8, "args": { "External id": 139171, "cbid": 211, "correlation": 139171 } }, { "ph": "s", "id": 139171, "pid": 76337, "tid": -914061504, "ts": 1716454223599291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599331, "dur": 4, "args": { "External id": 139179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139179, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139179, "pid": 5, "tid": 7, "ts": 1716454223599331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599321, "dur": 8, "args": { "External id": 139179, "cbid": 211, "correlation": 139179 } }, { "ph": "s", "id": 139179, "pid": 76337, "tid": -914061504, "ts": 1716454223599321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223599386, "dur": 4, "args": { "External id": 139191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139191, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139191, "pid": 5, "tid": 7, "ts": 1716454223599386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599373, "dur": 13, "args": { "External id": 139191, "cbid": 211, "correlation": 139191 } }, { "ph": "s", "id": 139191, "pid": 76337, "tid": -914061504, "ts": 1716454223599373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223599432, "dur": 4, "args": { "External id": 139202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139202, "pid": 5, "tid": 7, "ts": 1716454223599432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599420, "dur": 11, "args": { "External id": 139202, "cbid": 211, "correlation": 139202 } }, { "ph": "s", "id": 139202, "pid": 76337, "tid": -914061504, "ts": 1716454223599420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599464, "dur": 3, "args": { "External id": 139210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139210, "pid": 5, "tid": 7, "ts": 1716454223599464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599454, "dur": 9, "args": { "External id": 139210, "cbid": 211, "correlation": 139210 } }, { "ph": "s", "id": 139210, "pid": 76337, "tid": -914061504, "ts": 1716454223599454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599498, "dur": 5, "args": { "External id": 139218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139218, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139218, "pid": 5, "tid": 7, "ts": 1716454223599498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599488, "dur": 10, "args": { "External id": 139218, "cbid": 211, "correlation": 139218 } }, { "ph": "s", "id": 139218, "pid": 76337, "tid": -914061504, "ts": 1716454223599488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599527, "dur": 5, "args": { "External id": 139226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139226, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139226, "pid": 5, "tid": 7, "ts": 1716454223599527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599517, "dur": 9, "args": { "External id": 139226, "cbid": 211, "correlation": 139226 } }, { "ph": "s", "id": 139226, "pid": 76337, "tid": -914061504, "ts": 1716454223599517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223599558, "dur": 4, "args": { "External id": 139235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139235, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139235, "pid": 5, "tid": 7, "ts": 1716454223599558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599548, "dur": 10, "args": { "External id": 139235, "cbid": 211, "correlation": 139235 } }, { "ph": "s", "id": 139235, "pid": 76337, "tid": -914061504, "ts": 1716454223599548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223599619, "dur": 5, "args": { "External id": 139248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139248, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139248, "pid": 5, "tid": 7, "ts": 1716454223599619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599606, "dur": 13, "args": { "External id": 139248, "cbid": 211, "correlation": 139248 } }, { "ph": "s", "id": 139248, "pid": 76337, "tid": -914061504, "ts": 1716454223599606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223599659, "dur": 5, "args": { "External id": 139258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139258, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139258, "pid": 5, "tid": 7, "ts": 1716454223599659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599648, "dur": 11, "args": { "External id": 139258, "cbid": 211, "correlation": 139258 } }, { "ph": "s", "id": 139258, "pid": 76337, "tid": -914061504, "ts": 1716454223599648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223599788, "dur": 5, "args": { "External id": 139275, "cbid": 251, "correlation": 139275 } }, { "ph": "f", "id": 139275, "pid": 76337, "tid": -914061504, "ts": 1716454223599788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454223599817, "dur": 11, "args": { "External id": 139277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139277, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139277, "pid": 5, "tid": 7, "ts": 1716454223599817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599802, "dur": 17, "args": { "External id": 139277, "cbid": 211, "correlation": 139277 } }, { "ph": "s", "id": 139277, "pid": 76337, "tid": -914061504, "ts": 1716454223599802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223599880, "dur": 3, "args": { "External id": 139285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139285, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139285, "pid": 5, "tid": 7, "ts": 1716454223599880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599867, "dur": 12, "args": { "External id": 139285, "cbid": 211, "correlation": 139285 } }, { "ph": "s", "id": 139285, "pid": 76337, "tid": -914061504, "ts": 1716454223599867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223599938, "dur": 2, "args": { "External id": 139301, "cbid": 251, "correlation": 139301 } }, { "ph": "f", "id": 139301, "pid": 76337, "tid": -914061504, "ts": 1716454223599938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223599944, "dur": 0, "args": { "External id": 139303, "cbid": 251, "correlation": 139303 } }, { "ph": "f", "id": 139303, "pid": 76337, "tid": -914061504, "ts": 1716454223599944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223599960, "dur": 14, "args": { "External id": 139304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139304, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139304, "pid": 5, "tid": 7, "ts": 1716454223599960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599946, "dur": 14, "args": { "External id": 139304, "cbid": 211, "correlation": 139304 } }, { "ph": "s", "id": 139304, "pid": 76337, "tid": -914061504, "ts": 1716454223599946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223599975, "dur": 5, "args": { "External id": 139306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139306, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139306, "pid": 5, "tid": 7, "ts": 1716454223599975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223599965, "dur": 17, "args": { "External id": 139306, "cbid": 211, "correlation": 139306 } }, { "ph": "s", "id": 139306, "pid": 76337, "tid": -914061504, "ts": 1716454223599965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223600087, "dur": 1, "args": { "External id": 139316, "cbid": 317, "correlation": 139316 } }, { "ph": "f", "id": 139316, "pid": 76337, "tid": -914061504, "ts": 1716454223600087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223600089, "dur": 1, "args": { "External id": 139317, "cbid": 203, "correlation": 139317 } }, { "ph": "f", "id": 139317, "pid": 76337, "tid": -914061504, "ts": 1716454223600089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223600091, "dur": 1, "args": { "External id": 139318, "cbid": 205, "correlation": 139318 } }, { "ph": "f", "id": 139318, "pid": 76337, "tid": -914061504, "ts": 1716454223600091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223600149, "dur": 7, "args": { "External id": 139322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139322, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139322, "pid": 5, "tid": 7, "ts": 1716454223600149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223600134, "dur": 15, "args": { "External id": 139322, "cbid": 211, "correlation": 139322 } }, { "ph": "s", "id": 139322, "pid": 76337, "tid": -914061504, "ts": 1716454223600134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223600160, "dur": 4, "args": { "External id": 139324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139324, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 139324, "pid": 5, "tid": 7, "ts": 1716454223600160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223600153, "dur": 6, "args": { "External id": 139324, "cbid": 211, "correlation": 139324 } }, { "ph": "s", "id": 139324, "pid": 76337, "tid": -914061504, "ts": 1716454223600153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223600180, "dur": 4, "args": { "External id": 139326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139326, "pid": 5, "tid": 7, "ts": 1716454223600180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223600170, "dur": 9, "args": { "External id": 139326, "cbid": 211, "correlation": 139326 } }, { "ph": "s", "id": 139326, "pid": 76337, "tid": -914061504, "ts": 1716454223600170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223600186, "dur": 0, "args": { "External id": 139327, "cbid": 51, "correlation": 139327 } }, { "ph": "s", "id": 139327, "pid": 76337, "tid": -914061504, "ts": 1716454223600186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223600198, "dur": 87, "args": { "External id": 139328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139328, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139328, "pid": 5, "tid": 7, "ts": 1716454223600198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223600188, "dur": 9, "args": { "External id": 139328, "cbid": 211, "correlation": 139328 } }, { "ph": "s", "id": 139328, "pid": 76337, "tid": -914061504, "ts": 1716454223600188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223600286, "dur": 61, "args": { "External id": 139333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139333, "pid": 5, "tid": 7, "ts": 1716454223600286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223600228, "dur": 10, "args": { "External id": 139333, "cbid": 211, "correlation": 139333 } }, { "ph": "s", "id": 139333, "pid": 76337, "tid": -914061504, "ts": 1716454223600228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223602055, "dur": 51, "args": { "External id": 139353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139353, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 139353, "pid": 5, "tid": 7, "ts": 1716454223602055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602038, "dur": 17, "args": { "External id": 139353, "cbid": 211, "correlation": 139353 } }, { "ph": "s", "id": 139353, "pid": 76337, "tid": -914061504, "ts": 1716454223602038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223602108, "dur": 4, "args": { "External id": 139365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139365, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139365, "pid": 5, "tid": 7, "ts": 1716454223602108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602066, "dur": 8, "args": { "External id": 139365, "cbid": 211, "correlation": 139365 } }, { "ph": "s", "id": 139365, "pid": 76337, "tid": -914061504, "ts": 1716454223602066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223602114, "dur": 58, "args": { "External id": 139368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139368, "pid": 5, "tid": 7, "ts": 1716454223602114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602090, "dur": 8, "args": { "External id": 139368, "cbid": 211, "correlation": 139368 } }, { "ph": "s", "id": 139368, "pid": 76337, "tid": -914061504, "ts": 1716454223602090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223602173, "dur": 37, "args": { "External id": 139377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139377, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139377, "pid": 5, "tid": 7, "ts": 1716454223602173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602135, "dur": 11, "args": { "External id": 139377, "cbid": 211, "correlation": 139377 } }, { "ph": "s", "id": 139377, "pid": 76337, "tid": -914061504, "ts": 1716454223602135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223602194, "dur": 0, "args": { "External id": 139387, "cbid": 317, "correlation": 139387 } }, { "ph": "f", "id": 139387, "pid": 76337, "tid": -914061504, "ts": 1716454223602194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223602194, "dur": 0, "args": { "External id": 139388, "cbid": 203, "correlation": 139388 } }, { "ph": "f", "id": 139388, "pid": 76337, "tid": -914061504, "ts": 1716454223602194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223602195, "dur": 0, "args": { "External id": 139389, "cbid": 205, "correlation": 139389 } }, { "ph": "f", "id": 139389, "pid": 76337, "tid": -914061504, "ts": 1716454223602195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223602227, "dur": 40, "args": { "External id": 139393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139393, "pid": 5, "tid": 7, "ts": 1716454223602227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602214, "dur": 12, "args": { "External id": 139393, "cbid": 211, "correlation": 139393 } }, { "ph": "s", "id": 139393, "pid": 76337, "tid": -914061504, "ts": 1716454223602214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223602269, "dur": 14, "args": { "External id": 139395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139395, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139395, "pid": 5, "tid": 7, "ts": 1716454223602269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602229, "dur": 6, "args": { "External id": 139395, "cbid": 211, "correlation": 139395 } }, { "ph": "s", "id": 139395, "pid": 76337, "tid": -914061504, "ts": 1716454223602229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223602285, "dur": 3, "args": { "External id": 139397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139397, "pid": 5, "tid": 7, "ts": 1716454223602285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602240, "dur": 6, "args": { "External id": 139397, "cbid": 211, "correlation": 139397 } }, { "ph": "s", "id": 139397, "pid": 76337, "tid": -914061504, "ts": 1716454223602240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223602250, "dur": 0, "args": { "External id": 139398, "cbid": 51, "correlation": 139398 } }, { "ph": "s", "id": 139398, "pid": 76337, "tid": -914061504, "ts": 1716454223602250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223602289, "dur": 714, "args": { "External id": 139399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139399, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139399, "pid": 5, "tid": 7, "ts": 1716454223602289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602251, "dur": 7, "args": { "External id": 139399, "cbid": 211, "correlation": 139399 } }, { "ph": "s", "id": 139399, "pid": 76337, "tid": -914061504, "ts": 1716454223602251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223603004, "dur": 60, "args": { "External id": 139404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139404, "pid": 5, "tid": 7, "ts": 1716454223603004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602281, "dur": 10, "args": { "External id": 139404, "cbid": 211, "correlation": 139404 } }, { "ph": "s", "id": 139404, "pid": 76337, "tid": -914061504, "ts": 1716454223602281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223603066, "dur": 5, "args": { "External id": 139412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139412, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139412, "pid": 5, "tid": 7, "ts": 1716454223603066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602325, "dur": 10, "args": { "External id": 139412, "cbid": 211, "correlation": 139412 } }, { "ph": "s", "id": 139412, "pid": 76337, "tid": -914061504, "ts": 1716454223602325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223602393, "dur": 2, "args": { "External id": 139428, "cbid": 251, "correlation": 139428 } }, { "ph": "f", "id": 139428, "pid": 76337, "tid": -914061504, "ts": 1716454223602393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223602399, "dur": 0, "args": { "External id": 139430, "cbid": 251, "correlation": 139430 } }, { "ph": "f", "id": 139430, "pid": 76337, "tid": -914061504, "ts": 1716454223602399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223603072, "dur": 9, "args": { "External id": 139431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139431, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 139431, "pid": 5, "tid": 7, "ts": 1716454223603072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602400, "dur": 12, "args": { "External id": 139431, "cbid": 211, "correlation": 139431 } }, { "ph": "s", "id": 139431, "pid": 76337, "tid": -914061504, "ts": 1716454223602400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223603082, "dur": 4, "args": { "External id": 139433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139433, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 139433, "pid": 5, "tid": 7, "ts": 1716454223603082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602414, "dur": 6, "args": { "External id": 139433, "cbid": 211, "correlation": 139433 } }, { "ph": "s", "id": 139433, "pid": 76337, "tid": -914061504, "ts": 1716454223602414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223603087, "dur": 55, "args": { "External id": 139443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139443, "pid": 5, "tid": 7, "ts": 1716454223603087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602473, "dur": 13, "args": { "External id": 139443, "cbid": 211, "correlation": 139443 } }, { "ph": "s", "id": 139443, "pid": 76337, "tid": -914061504, "ts": 1716454223602473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223603144, "dur": 52, "args": { "External id": 139463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139463, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 139463, "pid": 5, "tid": 7, "ts": 1716454223603144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602542, "dur": 10, "args": { "External id": 139463, "cbid": 211, "correlation": 139463 } }, { "ph": "s", "id": 139463, "pid": 76337, "tid": -914061504, "ts": 1716454223602542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223603197, "dur": 4, "args": { "External id": 139475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139475, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139475, "pid": 5, "tid": 7, "ts": 1716454223603197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602561, "dur": 7, "args": { "External id": 139475, "cbid": 211, "correlation": 139475 } }, { "ph": "s", "id": 139475, "pid": 76337, "tid": -914061504, "ts": 1716454223602561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223603202, "dur": 56, "args": { "External id": 139478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139478, "pid": 5, "tid": 7, "ts": 1716454223603202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602579, "dur": 7, "args": { "External id": 139478, "cbid": 211, "correlation": 139478 } }, { "ph": "s", "id": 139478, "pid": 76337, "tid": -914061504, "ts": 1716454223602579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223603259, "dur": 37, "args": { "External id": 139487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139487, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139487, "pid": 5, "tid": 7, "ts": 1716454223603259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602620, "dur": 10, "args": { "External id": 139487, "cbid": 211, "correlation": 139487 } }, { "ph": "s", "id": 139487, "pid": 76337, "tid": -914061504, "ts": 1716454223602620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223602693, "dur": 0, "args": { "External id": 139497, "cbid": 317, "correlation": 139497 } }, { "ph": "f", "id": 139497, "pid": 76337, "tid": -914061504, "ts": 1716454223602693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223602694, "dur": 0, "args": { "External id": 139498, "cbid": 203, "correlation": 139498 } }, { "ph": "f", "id": 139498, "pid": 76337, "tid": -914061504, "ts": 1716454223602694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223602694, "dur": 0, "args": { "External id": 139499, "cbid": 205, "correlation": 139499 } }, { "ph": "f", "id": 139499, "pid": 76337, "tid": -914061504, "ts": 1716454223602694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223603297, "dur": 40, "args": { "External id": 139503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139503, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139503, "pid": 5, "tid": 7, "ts": 1716454223603297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602708, "dur": 12, "args": { "External id": 139503, "cbid": 211, "correlation": 139503 } }, { "ph": "s", "id": 139503, "pid": 76337, "tid": -914061504, "ts": 1716454223602708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223603339, "dur": 14, "args": { "External id": 139505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139505, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139505, "pid": 5, "tid": 7, "ts": 1716454223603339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602723, "dur": 5, "args": { "External id": 139505, "cbid": 211, "correlation": 139505 } }, { "ph": "s", "id": 139505, "pid": 76337, "tid": -914061504, "ts": 1716454223602723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223603355, "dur": 3, "args": { "External id": 139507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139507, "pid": 5, "tid": 7, "ts": 1716454223603355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602732, "dur": 6, "args": { "External id": 139507, "cbid": 211, "correlation": 139507 } }, { "ph": "s", "id": 139507, "pid": 76337, "tid": -914061504, "ts": 1716454223602732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223602742, "dur": 0, "args": { "External id": 139508, "cbid": 51, "correlation": 139508 } }, { "ph": "s", "id": 139508, "pid": 76337, "tid": -914061504, "ts": 1716454223602742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223603360, "dur": 707, "args": { "External id": 139509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139509, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139509, "pid": 5, "tid": 7, "ts": 1716454223603360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602742, "dur": 5, "args": { "External id": 139509, "cbid": 211, "correlation": 139509 } }, { "ph": "s", "id": 139509, "pid": 76337, "tid": -914061504, "ts": 1716454223602742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223604068, "dur": 60, "args": { "External id": 139514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139514, "pid": 5, "tid": 7, "ts": 1716454223604068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602770, "dur": 9, "args": { "External id": 139514, "cbid": 211, "correlation": 139514 } }, { "ph": "s", "id": 139514, "pid": 76337, "tid": -914061504, "ts": 1716454223602770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223604130, "dur": 50, "args": { "External id": 139522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139522, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139522, "pid": 5, "tid": 7, "ts": 1716454223604130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602803, "dur": 9, "args": { "External id": 139522, "cbid": 211, "correlation": 139522 } }, { "ph": "s", "id": 139522, "pid": 76337, "tid": -914061504, "ts": 1716454223602803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223604181, "dur": 35, "args": { "External id": 139530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139530, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139530, "pid": 5, "tid": 7, "ts": 1716454223604181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602834, "dur": 10, "args": { "External id": 139530, "cbid": 211, "correlation": 139530 } }, { "ph": "s", "id": 139530, "pid": 76337, "tid": -914061504, "ts": 1716454223602834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223604218, "dur": 52, "args": { "External id": 139550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139550, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 139550, "pid": 5, "tid": 7, "ts": 1716454223604218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602917, "dur": 12, "args": { "External id": 139550, "cbid": 211, "correlation": 139550 } }, { "ph": "s", "id": 139550, "pid": 76337, "tid": -914061504, "ts": 1716454223602917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223604271, "dur": 4, "args": { "External id": 139562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139562, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 139562, "pid": 5, "tid": 7, "ts": 1716454223604271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602939, "dur": 7, "args": { "External id": 139562, "cbid": 211, "correlation": 139562 } }, { "ph": "s", "id": 139562, "pid": 76337, "tid": -914061504, "ts": 1716454223602939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223604276, "dur": 58, "args": { "External id": 139565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139565, "pid": 5, "tid": 7, "ts": 1716454223604276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223602957, "dur": 8, "args": { "External id": 139565, "cbid": 211, "correlation": 139565 } }, { "ph": "s", "id": 139565, "pid": 76337, "tid": -914061504, "ts": 1716454223602957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223603024, "dur": 0, "args": { "External id": 139576, "cbid": 317, "correlation": 139576 } }, { "ph": "f", "id": 139576, "pid": 76337, "tid": -914061504, "ts": 1716454223603024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223603025, "dur": 0, "args": { "External id": 139577, "cbid": 203, "correlation": 139577 } }, { "ph": "f", "id": 139577, "pid": 76337, "tid": -914061504, "ts": 1716454223603025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223603026, "dur": 0, "args": { "External id": 139578, "cbid": 205, "correlation": 139578 } }, { "ph": "f", "id": 139578, "pid": 76337, "tid": -914061504, "ts": 1716454223603026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603060, "dur": 2, "args": { "External id": 139582, "cbid": 251, "correlation": 139582 } }, { "ph": "f", "id": 139582, "pid": 76337, "tid": -914061504, "ts": 1716454223603060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603063, "dur": 1, "args": { "External id": 139583, "cbid": 251, "correlation": 139583 } }, { "ph": "f", "id": 139583, "pid": 76337, "tid": -914061504, "ts": 1716454223603063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603065, "dur": 1, "args": { "External id": 139584, "cbid": 251, "correlation": 139584 } }, { "ph": "f", "id": 139584, "pid": 76337, "tid": -914061504, "ts": 1716454223603065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603068, "dur": 1, "args": { "External id": 139585, "cbid": 251, "correlation": 139585 } }, { "ph": "f", "id": 139585, "pid": 76337, "tid": -914061504, "ts": 1716454223603068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603069, "dur": 1, "args": { "External id": 139586, "cbid": 251, "correlation": 139586 } }, { "ph": "f", "id": 139586, "pid": 76337, "tid": -914061504, "ts": 1716454223603069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603071, "dur": 1, "args": { "External id": 139587, "cbid": 251, "correlation": 139587 } }, { "ph": "f", "id": 139587, "pid": 76337, "tid": -914061504, "ts": 1716454223603071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603074, "dur": 1, "args": { "External id": 139588, "cbid": 251, "correlation": 139588 } }, { "ph": "f", "id": 139588, "pid": 76337, "tid": -914061504, "ts": 1716454223603074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603076, "dur": 1, "args": { "External id": 139589, "cbid": 251, "correlation": 139589 } }, { "ph": "f", "id": 139589, "pid": 76337, "tid": -914061504, "ts": 1716454223603076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603079, "dur": 0, "args": { "External id": 139590, "cbid": 251, "correlation": 139590 } }, { "ph": "f", "id": 139590, "pid": 76337, "tid": -914061504, "ts": 1716454223603079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223604335, "dur": 119, "args": { "External id": 139591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139591, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 139591, "pid": 5, "tid": 7, "ts": 1716454223604335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603083, "dur": 15, "args": { "External id": 139591, "cbid": 211, "correlation": 139591 } }, { "ph": "s", "id": 139591, "pid": 76337, "tid": -914061504, "ts": 1716454223603083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223604455, "dur": 60, "args": { "External id": 139597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139597, "pid": 5, "tid": 7, "ts": 1716454223604455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603121, "dur": 9, "args": { "External id": 139597, "cbid": 211, "correlation": 139597 } }, { "ph": "s", "id": 139597, "pid": 76337, "tid": -914061504, "ts": 1716454223603121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223604517, "dur": 644, "args": { "External id": 139606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139606, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139606, "pid": 5, "tid": 7, "ts": 1716454223604517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603216, "dur": 16, "args": { "External id": 139606, "cbid": 211, "correlation": 139606 } }, { "ph": "s", "id": 139606, "pid": 76337, "tid": -914061504, "ts": 1716454223603216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223605162, "dur": 184, "args": { "External id": 139628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139628, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139628, "pid": 5, "tid": 7, "ts": 1716454223605162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603289, "dur": 12, "args": { "External id": 139628, "cbid": 211, "correlation": 139628 } }, { "ph": "s", "id": 139628, "pid": 76337, "tid": -914061504, "ts": 1716454223603289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603404, "dur": 2, "args": { "External id": 139639, "cbid": 251, "correlation": 139639 } }, { "ph": "f", "id": 139639, "pid": 76337, "tid": -914061504, "ts": 1716454223603404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223605348, "dur": 197, "args": { "External id": 139640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139640, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139640, "pid": 5, "tid": 7, "ts": 1716454223605348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603412, "dur": 14, "args": { "External id": 139640, "cbid": 211, "correlation": 139640 } }, { "ph": "s", "id": 139640, "pid": 76337, "tid": -914061504, "ts": 1716454223603412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603485, "dur": 1, "args": { "External id": 139651, "cbid": 251, "correlation": 139651 } }, { "ph": "f", "id": 139651, "pid": 76337, "tid": -914061504, "ts": 1716454223603485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223605546, "dur": 194, "args": { "External id": 139652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139652, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139652, "pid": 5, "tid": 7, "ts": 1716454223605546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603489, "dur": 11, "args": { "External id": 139652, "cbid": 211, "correlation": 139652 } }, { "ph": "s", "id": 139652, "pid": 76337, "tid": -914061504, "ts": 1716454223603489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603555, "dur": 1, "args": { "External id": 139663, "cbid": 251, "correlation": 139663 } }, { "ph": "f", "id": 139663, "pid": 76337, "tid": -914061504, "ts": 1716454223603555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223605742, "dur": 192, "args": { "External id": 139664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139664, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139664, "pid": 5, "tid": 7, "ts": 1716454223605742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603559, "dur": 11, "args": { "External id": 139664, "cbid": 211, "correlation": 139664 } }, { "ph": "s", "id": 139664, "pid": 76337, "tid": -914061504, "ts": 1716454223603559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223605935, "dur": 19011, "args": { "External id": 139685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139685, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 139685, "pid": 5, "tid": 7, "ts": 1716454223605935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603666, "dur": 16, "args": { "External id": 139685, "cbid": 211, "correlation": 139685 } }, { "ph": "s", "id": 139685, "pid": 76337, "tid": -914061504, "ts": 1716454223603666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223603780, "dur": 2, "args": { "External id": 139703, "cbid": 251, "correlation": 139703 } }, { "ph": "f", "id": 139703, "pid": 76337, "tid": -914061504, "ts": 1716454223603780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223624947, "dur": 206, "args": { "External id": 139705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139705, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139705, "pid": 5, "tid": 7, "ts": 1716454223624947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603787, "dur": 14, "args": { "External id": 139705, "cbid": 211, "correlation": 139705 } }, { "ph": "s", "id": 139705, "pid": 76337, "tid": -914061504, "ts": 1716454223603787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223625155, "dur": 66, "args": { "External id": 139713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139713, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139713, "pid": 5, "tid": 7, "ts": 1716454223625155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603860, "dur": 13, "args": { "External id": 139713, "cbid": 211, "correlation": 139713 } }, { "ph": "s", "id": 139713, "pid": 76337, "tid": -914061504, "ts": 1716454223603860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223625222, "dur": 98, "args": { "External id": 139721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139721, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139721, "pid": 5, "tid": 7, "ts": 1716454223625222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603901, "dur": 9, "args": { "External id": 139721, "cbid": 211, "correlation": 139721 } }, { "ph": "s", "id": 139721, "pid": 76337, "tid": -914061504, "ts": 1716454223603901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223625322, "dur": 55, "args": { "External id": 139732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139732, "pid": 5, "tid": 7, "ts": 1716454223625322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223603996, "dur": 15, "args": { "External id": 139732, "cbid": 211, "correlation": 139732 } }, { "ph": "s", "id": 139732, "pid": 76337, "tid": -914061504, "ts": 1716454223603996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223625378, "dur": 94, "args": { "External id": 139754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139754, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139754, "pid": 5, "tid": 7, "ts": 1716454223625378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604031, "dur": 7, "args": { "External id": 139754, "cbid": 211, "correlation": 139754 } }, { "ph": "s", "id": 139754, "pid": 76337, "tid": -914061504, "ts": 1716454223604031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604116, "dur": 1, "args": { "External id": 139765, "cbid": 251, "correlation": 139765 } }, { "ph": "f", "id": 139765, "pid": 76337, "tid": -914061504, "ts": 1716454223604116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223625474, "dur": 106, "args": { "External id": 139766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139766, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139766, "pid": 5, "tid": 7, "ts": 1716454223625474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604121, "dur": 13, "args": { "External id": 139766, "cbid": 211, "correlation": 139766 } }, { "ph": "s", "id": 139766, "pid": 76337, "tid": -914061504, "ts": 1716454223604121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604203, "dur": 1, "args": { "External id": 139777, "cbid": 251, "correlation": 139777 } }, { "ph": "f", "id": 139777, "pid": 76337, "tid": -914061504, "ts": 1716454223604203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604207, "dur": 0, "args": { "External id": 139778, "cbid": 251, "correlation": 139778 } }, { "ph": "f", "id": 139778, "pid": 76337, "tid": -914061504, "ts": 1716454223604207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223625582, "dur": 10, "args": { "External id": 139779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139779, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139779, "pid": 5, "tid": 7, "ts": 1716454223625582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604209, "dur": 14, "args": { "External id": 139779, "cbid": 211, "correlation": 139779 } }, { "ph": "s", "id": 139779, "pid": 76337, "tid": -914061504, "ts": 1716454223604209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223625593, "dur": 5, "args": { "External id": 139781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139781, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 139781, "pid": 5, "tid": 7, "ts": 1716454223625593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604226, "dur": 8, "args": { "External id": 139781, "cbid": 211, "correlation": 139781 } }, { "ph": "s", "id": 139781, "pid": 76337, "tid": -914061504, "ts": 1716454223604226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604289, "dur": 1, "args": { "External id": 139792, "cbid": 251, "correlation": 139792 } }, { "ph": "f", "id": 139792, "pid": 76337, "tid": -914061504, "ts": 1716454223604289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604293, "dur": 0, "args": { "External id": 139793, "cbid": 251, "correlation": 139793 } }, { "ph": "f", "id": 139793, "pid": 76337, "tid": -914061504, "ts": 1716454223604293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223625600, "dur": 6, "args": { "External id": 139794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139794, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 139794, "pid": 5, "tid": 7, "ts": 1716454223625600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604294, "dur": 12, "args": { "External id": 139794, "cbid": 211, "correlation": 139794 } }, { "ph": "s", "id": 139794, "pid": 76337, "tid": -914061504, "ts": 1716454223604294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223625607, "dur": 4, "args": { "External id": 139796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139796, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 139796, "pid": 5, "tid": 7, "ts": 1716454223625607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604308, "dur": 5, "args": { "External id": 139796, "cbid": 211, "correlation": 139796 } }, { "ph": "s", "id": 139796, "pid": 76337, "tid": -914061504, "ts": 1716454223604308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223625612, "dur": 158, "args": { "External id": 139817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139817, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 139817, "pid": 5, "tid": 7, "ts": 1716454223625612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604381, "dur": 13, "args": { "External id": 139817, "cbid": 211, "correlation": 139817 } }, { "ph": "s", "id": 139817, "pid": 76337, "tid": -914061504, "ts": 1716454223604381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604479, "dur": 2, "args": { "External id": 139835, "cbid": 251, "correlation": 139835 } }, { "ph": "f", "id": 139835, "pid": 76337, "tid": -914061504, "ts": 1716454223604479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223625771, "dur": 107, "args": { "External id": 139837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139837, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 139837, "pid": 5, "tid": 7, "ts": 1716454223625771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604486, "dur": 14, "args": { "External id": 139837, "cbid": 211, "correlation": 139837 } }, { "ph": "s", "id": 139837, "pid": 76337, "tid": -914061504, "ts": 1716454223604486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223625880, "dur": 35, "args": { "External id": 139845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139845, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139845, "pid": 5, "tid": 7, "ts": 1716454223625880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604557, "dur": 13, "args": { "External id": 139845, "cbid": 211, "correlation": 139845 } }, { "ph": "s", "id": 139845, "pid": 76337, "tid": -914061504, "ts": 1716454223604557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223625916, "dur": 69, "args": { "External id": 139853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139853, "pid": 5, "tid": 7, "ts": 1716454223625916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604598, "dur": 9, "args": { "External id": 139853, "cbid": 211, "correlation": 139853 } }, { "ph": "s", "id": 139853, "pid": 76337, "tid": -914061504, "ts": 1716454223604598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223625987, "dur": 94, "args": { "External id": 139875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139875, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139875, "pid": 5, "tid": 7, "ts": 1716454223625987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604650, "dur": 10, "args": { "External id": 139875, "cbid": 211, "correlation": 139875 } }, { "ph": "s", "id": 139875, "pid": 76337, "tid": -914061504, "ts": 1716454223604650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604742, "dur": 1, "args": { "External id": 139891, "cbid": 251, "correlation": 139891 } }, { "ph": "f", "id": 139891, "pid": 76337, "tid": -914061504, "ts": 1716454223604742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223626082, "dur": 587, "args": { "External id": 139893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139893, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 139893, "pid": 5, "tid": 7, "ts": 1716454223626082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604748, "dur": 13, "args": { "External id": 139893, "cbid": 211, "correlation": 139893 } }, { "ph": "s", "id": 139893, "pid": 76337, "tid": -914061504, "ts": 1716454223604748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223626671, "dur": 248, "args": { "External id": 139901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139901, "pid": 5, "tid": 7, "ts": 1716454223626671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604827, "dur": 15, "args": { "External id": 139901, "cbid": 211, "correlation": 139901 } }, { "ph": "s", "id": 139901, "pid": 76337, "tid": -914061504, "ts": 1716454223604827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223626920, "dur": 255, "args": { "External id": 139909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139909, "pid": 5, "tid": 7, "ts": 1716454223626920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604863, "dur": 9, "args": { "External id": 139909, "cbid": 211, "correlation": 139909 } }, { "ph": "s", "id": 139909, "pid": 76337, "tid": -914061504, "ts": 1716454223604863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604947, "dur": 2, "args": { "External id": 139925, "cbid": 251, "correlation": 139925 } }, { "ph": "f", "id": 139925, "pid": 76337, "tid": -914061504, "ts": 1716454223604947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223604953, "dur": 0, "args": { "External id": 139927, "cbid": 251, "correlation": 139927 } }, { "ph": "f", "id": 139927, "pid": 76337, "tid": -914061504, "ts": 1716454223604953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223627177, "dur": 363, "args": { "External id": 139928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139928, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 139928, "pid": 5, "tid": 7, "ts": 1716454223627177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223604957, "dur": 15, "args": { "External id": 139928, "cbid": 211, "correlation": 139928 } }, { "ph": "s", "id": 139928, "pid": 76337, "tid": -914061504, "ts": 1716454223604957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223627541, "dur": 50, "args": { "External id": 139936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139936, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139936, "pid": 5, "tid": 7, "ts": 1716454223627541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605009, "dur": 12, "args": { "External id": 139936, "cbid": 211, "correlation": 139936 } }, { "ph": "s", "id": 139936, "pid": 76337, "tid": -914061504, "ts": 1716454223605009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223627592, "dur": 160, "args": { "External id": 139947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139947, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139947, "pid": 5, "tid": 7, "ts": 1716454223627592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605082, "dur": 12, "args": { "External id": 139947, "cbid": 211, "correlation": 139947 } }, { "ph": "s", "id": 139947, "pid": 76337, "tid": -914061504, "ts": 1716454223605082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223605148, "dur": 0, "args": { "External id": 139959, "cbid": 317, "correlation": 139959 } }, { "ph": "f", "id": 139959, "pid": 76337, "tid": -914061504, "ts": 1716454223605148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223605149, "dur": 0, "args": { "External id": 139960, "cbid": 203, "correlation": 139960 } }, { "ph": "f", "id": 139960, "pid": 76337, "tid": -914061504, "ts": 1716454223605149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223605150, "dur": 0, "args": { "External id": 139961, "cbid": 205, "correlation": 139961 } }, { "ph": "f", "id": 139961, "pid": 76337, "tid": -914061504, "ts": 1716454223605150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605175, "dur": 1, "args": { "External id": 139965, "cbid": 251, "correlation": 139965 } }, { "ph": "f", "id": 139965, "pid": 76337, "tid": -914061504, "ts": 1716454223605175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605177, "dur": 0, "args": { "External id": 139966, "cbid": 251, "correlation": 139966 } }, { "ph": "f", "id": 139966, "pid": 76337, "tid": -914061504, "ts": 1716454223605177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605177, "dur": 0, "args": { "External id": 139967, "cbid": 251, "correlation": 139967 } }, { "ph": "f", "id": 139967, "pid": 76337, "tid": -914061504, "ts": 1716454223605177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605178, "dur": 0, "args": { "External id": 139968, "cbid": 251, "correlation": 139968 } }, { "ph": "f", "id": 139968, "pid": 76337, "tid": -914061504, "ts": 1716454223605178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605179, "dur": 1, "args": { "External id": 139969, "cbid": 251, "correlation": 139969 } }, { "ph": "f", "id": 139969, "pid": 76337, "tid": -914061504, "ts": 1716454223605179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605180, "dur": 0, "args": { "External id": 139970, "cbid": 251, "correlation": 139970 } }, { "ph": "f", "id": 139970, "pid": 76337, "tid": -914061504, "ts": 1716454223605180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605181, "dur": 0, "args": { "External id": 139971, "cbid": 251, "correlation": 139971 } }, { "ph": "f", "id": 139971, "pid": 76337, "tid": -914061504, "ts": 1716454223605181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605182, "dur": 0, "args": { "External id": 139972, "cbid": 251, "correlation": 139972 } }, { "ph": "f", "id": 139972, "pid": 76337, "tid": -914061504, "ts": 1716454223605182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605183, "dur": 0, "args": { "External id": 139973, "cbid": 251, "correlation": 139973 } }, { "ph": "f", "id": 139973, "pid": 76337, "tid": -914061504, "ts": 1716454223605183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223627754, "dur": 117, "args": { "External id": 139974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139974, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 139974, "pid": 5, "tid": 7, "ts": 1716454223627754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605185, "dur": 12, "args": { "External id": 139974, "cbid": 211, "correlation": 139974 } }, { "ph": "s", "id": 139974, "pid": 76337, "tid": -914061504, "ts": 1716454223605185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223627873, "dur": 61, "args": { "External id": 139980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139980, "pid": 5, "tid": 7, "ts": 1716454223627873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605220, "dur": 9, "args": { "External id": 139980, "cbid": 211, "correlation": 139980 } }, { "ph": "s", "id": 139980, "pid": 76337, "tid": -914061504, "ts": 1716454223605220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223627935, "dur": 50, "args": { "External id": 139988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 139988, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 139988, "pid": 5, "tid": 7, "ts": 1716454223627935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605253, "dur": 8, "args": { "External id": 139988, "cbid": 211, "correlation": 139988 } }, { "ph": "s", "id": 139988, "pid": 76337, "tid": -914061504, "ts": 1716454223605253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223627986, "dur": 52, "args": { "External id": 140008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140008, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140008, "pid": 5, "tid": 7, "ts": 1716454223627986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605328, "dur": 11, "args": { "External id": 140008, "cbid": 211, "correlation": 140008 } }, { "ph": "s", "id": 140008, "pid": 76337, "tid": -914061504, "ts": 1716454223605328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223628039, "dur": 5, "args": { "External id": 140020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140020, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140020, "pid": 5, "tid": 7, "ts": 1716454223628039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605349, "dur": 6, "args": { "External id": 140020, "cbid": 211, "correlation": 140020 } }, { "ph": "s", "id": 140020, "pid": 76337, "tid": -914061504, "ts": 1716454223605349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223628045, "dur": 58, "args": { "External id": 140023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140023, "pid": 5, "tid": 7, "ts": 1716454223628045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605368, "dur": 6, "args": { "External id": 140023, "cbid": 211, "correlation": 140023 } }, { "ph": "s", "id": 140023, "pid": 76337, "tid": -914061504, "ts": 1716454223605368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223628104, "dur": 37, "args": { "External id": 140032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140032, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140032, "pid": 5, "tid": 7, "ts": 1716454223628104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605407, "dur": 11, "args": { "External id": 140032, "cbid": 211, "correlation": 140032 } }, { "ph": "s", "id": 140032, "pid": 76337, "tid": -914061504, "ts": 1716454223605407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223605460, "dur": 0, "args": { "External id": 140042, "cbid": 317, "correlation": 140042 } }, { "ph": "f", "id": 140042, "pid": 76337, "tid": -914061504, "ts": 1716454223605460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223605460, "dur": 0, "args": { "External id": 140043, "cbid": 203, "correlation": 140043 } }, { "ph": "f", "id": 140043, "pid": 76337, "tid": -914061504, "ts": 1716454223605460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223605461, "dur": 0, "args": { "External id": 140044, "cbid": 205, "correlation": 140044 } }, { "ph": "f", "id": 140044, "pid": 76337, "tid": -914061504, "ts": 1716454223605461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223628142, "dur": 40, "args": { "External id": 140048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140048, "pid": 5, "tid": 7, "ts": 1716454223628142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605476, "dur": 12, "args": { "External id": 140048, "cbid": 211, "correlation": 140048 } }, { "ph": "s", "id": 140048, "pid": 76337, "tid": -914061504, "ts": 1716454223605476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223628184, "dur": 15, "args": { "External id": 140050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140050, "pid": 5, "tid": 7, "ts": 1716454223628184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605490, "dur": 5, "args": { "External id": 140050, "cbid": 211, "correlation": 140050 } }, { "ph": "s", "id": 140050, "pid": 76337, "tid": -914061504, "ts": 1716454223605490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223628200, "dur": 4, "args": { "External id": 140052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140052, "pid": 5, "tid": 7, "ts": 1716454223628200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605499, "dur": 5, "args": { "External id": 140052, "cbid": 211, "correlation": 140052 } }, { "ph": "s", "id": 140052, "pid": 76337, "tid": -914061504, "ts": 1716454223605499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223605508, "dur": 0, "args": { "External id": 140053, "cbid": 51, "correlation": 140053 } }, { "ph": "s", "id": 140053, "pid": 76337, "tid": -914061504, "ts": 1716454223605508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223628205, "dur": 715, "args": { "External id": 140054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140054, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140054, "pid": 5, "tid": 7, "ts": 1716454223628205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605509, "dur": 6, "args": { "External id": 140054, "cbid": 211, "correlation": 140054 } }, { "ph": "s", "id": 140054, "pid": 76337, "tid": -914061504, "ts": 1716454223605509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223628921, "dur": 60, "args": { "External id": 140059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140059, "pid": 5, "tid": 7, "ts": 1716454223628921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605537, "dur": 8, "args": { "External id": 140059, "cbid": 211, "correlation": 140059 } }, { "ph": "s", "id": 140059, "pid": 76337, "tid": -914061504, "ts": 1716454223605537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223628982, "dur": 4, "args": { "External id": 140067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140067, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140067, "pid": 5, "tid": 7, "ts": 1716454223628982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605580, "dur": 10, "args": { "External id": 140067, "cbid": 211, "correlation": 140067 } }, { "ph": "s", "id": 140067, "pid": 76337, "tid": -914061504, "ts": 1716454223605580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605645, "dur": 1, "args": { "External id": 140083, "cbid": 251, "correlation": 140083 } }, { "ph": "f", "id": 140083, "pid": 76337, "tid": -914061504, "ts": 1716454223605645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223605650, "dur": 0, "args": { "External id": 140085, "cbid": 251, "correlation": 140085 } }, { "ph": "f", "id": 140085, "pid": 76337, "tid": -914061504, "ts": 1716454223605650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223628987, "dur": 11, "args": { "External id": 140086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140086, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 140086, "pid": 5, "tid": 7, "ts": 1716454223628987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605652, "dur": 12, "args": { "External id": 140086, "cbid": 211, "correlation": 140086 } }, { "ph": "s", "id": 140086, "pid": 76337, "tid": -914061504, "ts": 1716454223605652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223629000, "dur": 5, "args": { "External id": 140088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140088, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 140088, "pid": 5, "tid": 7, "ts": 1716454223629000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605666, "dur": 6, "args": { "External id": 140088, "cbid": 211, "correlation": 140088 } }, { "ph": "s", "id": 140088, "pid": 76337, "tid": -914061504, "ts": 1716454223605666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223629007, "dur": 55, "args": { "External id": 140098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140098, "pid": 5, "tid": 7, "ts": 1716454223629007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605723, "dur": 12, "args": { "External id": 140098, "cbid": 211, "correlation": 140098 } }, { "ph": "s", "id": 140098, "pid": 76337, "tid": -914061504, "ts": 1716454223605723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223629063, "dur": 51, "args": { "External id": 140118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140118, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140118, "pid": 5, "tid": 7, "ts": 1716454223629063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605791, "dur": 11, "args": { "External id": 140118, "cbid": 211, "correlation": 140118 } }, { "ph": "s", "id": 140118, "pid": 76337, "tid": -914061504, "ts": 1716454223605791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223629115, "dur": 4, "args": { "External id": 140130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140130, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140130, "pid": 5, "tid": 7, "ts": 1716454223629115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605811, "dur": 7, "args": { "External id": 140130, "cbid": 211, "correlation": 140130 } }, { "ph": "s", "id": 140130, "pid": 76337, "tid": -914061504, "ts": 1716454223605811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223629120, "dur": 56, "args": { "External id": 140133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140133, "pid": 5, "tid": 7, "ts": 1716454223629120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605830, "dur": 7, "args": { "External id": 140133, "cbid": 211, "correlation": 140133 } }, { "ph": "s", "id": 140133, "pid": 76337, "tid": -914061504, "ts": 1716454223605830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223629177, "dur": 37, "args": { "External id": 140142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140142, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140142, "pid": 5, "tid": 7, "ts": 1716454223629177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605870, "dur": 10, "args": { "External id": 140142, "cbid": 211, "correlation": 140142 } }, { "ph": "s", "id": 140142, "pid": 76337, "tid": -914061504, "ts": 1716454223605870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223605934, "dur": 0, "args": { "External id": 140152, "cbid": 317, "correlation": 140152 } }, { "ph": "f", "id": 140152, "pid": 76337, "tid": -914061504, "ts": 1716454223605934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223605935, "dur": 0, "args": { "External id": 140153, "cbid": 203, "correlation": 140153 } }, { "ph": "f", "id": 140153, "pid": 76337, "tid": -914061504, "ts": 1716454223605935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223605936, "dur": 0, "args": { "External id": 140154, "cbid": 205, "correlation": 140154 } }, { "ph": "f", "id": 140154, "pid": 76337, "tid": -914061504, "ts": 1716454223605936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223629216, "dur": 40, "args": { "External id": 140158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140158, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140158, "pid": 5, "tid": 7, "ts": 1716454223629216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605949, "dur": 12, "args": { "External id": 140158, "cbid": 211, "correlation": 140158 } }, { "ph": "s", "id": 140158, "pid": 76337, "tid": -914061504, "ts": 1716454223605949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223629257, "dur": 15, "args": { "External id": 140160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140160, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140160, "pid": 5, "tid": 7, "ts": 1716454223629257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605963, "dur": 5, "args": { "External id": 140160, "cbid": 211, "correlation": 140160 } }, { "ph": "s", "id": 140160, "pid": 76337, "tid": -914061504, "ts": 1716454223605963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223629273, "dur": 3, "args": { "External id": 140162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140162, "pid": 5, "tid": 7, "ts": 1716454223629273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605981, "dur": 6, "args": { "External id": 140162, "cbid": 211, "correlation": 140162 } }, { "ph": "s", "id": 140162, "pid": 76337, "tid": -914061504, "ts": 1716454223605981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223605990, "dur": 0, "args": { "External id": 140163, "cbid": 51, "correlation": 140163 } }, { "ph": "s", "id": 140163, "pid": 76337, "tid": -914061504, "ts": 1716454223605990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223629278, "dur": 708, "args": { "External id": 140164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140164, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140164, "pid": 5, "tid": 7, "ts": 1716454223629278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223605991, "dur": 5, "args": { "External id": 140164, "cbid": 211, "correlation": 140164 } }, { "ph": "s", "id": 140164, "pid": 76337, "tid": -914061504, "ts": 1716454223605991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223629988, "dur": 60, "args": { "External id": 140169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140169, "pid": 5, "tid": 7, "ts": 1716454223629988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606019, "dur": 9, "args": { "External id": 140169, "cbid": 211, "correlation": 140169 } }, { "ph": "s", "id": 140169, "pid": 76337, "tid": -914061504, "ts": 1716454223606019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223630049, "dur": 50, "args": { "External id": 140177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140177, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140177, "pid": 5, "tid": 7, "ts": 1716454223630049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606052, "dur": 8, "args": { "External id": 140177, "cbid": 211, "correlation": 140177 } }, { "ph": "s", "id": 140177, "pid": 76337, "tid": -914061504, "ts": 1716454223606052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223630101, "dur": 36, "args": { "External id": 140185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140185, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140185, "pid": 5, "tid": 7, "ts": 1716454223630101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606081, "dur": 8, "args": { "External id": 140185, "cbid": 211, "correlation": 140185 } }, { "ph": "s", "id": 140185, "pid": 76337, "tid": -914061504, "ts": 1716454223606081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223630138, "dur": 51, "args": { "External id": 140205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140205, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140205, "pid": 5, "tid": 7, "ts": 1716454223630138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606159, "dur": 13, "args": { "External id": 140205, "cbid": 211, "correlation": 140205 } }, { "ph": "s", "id": 140205, "pid": 76337, "tid": -914061504, "ts": 1716454223606159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223630191, "dur": 4, "args": { "External id": 140217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140217, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140217, "pid": 5, "tid": 7, "ts": 1716454223630191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606182, "dur": 6, "args": { "External id": 140217, "cbid": 211, "correlation": 140217 } }, { "ph": "s", "id": 140217, "pid": 76337, "tid": -914061504, "ts": 1716454223606182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223630196, "dur": 56, "args": { "External id": 140220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140220, "pid": 5, "tid": 7, "ts": 1716454223630196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606199, "dur": 7, "args": { "External id": 140220, "cbid": 211, "correlation": 140220 } }, { "ph": "s", "id": 140220, "pid": 76337, "tid": -914061504, "ts": 1716454223606199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223606257, "dur": 0, "args": { "External id": 140231, "cbid": 317, "correlation": 140231 } }, { "ph": "f", "id": 140231, "pid": 76337, "tid": -914061504, "ts": 1716454223606257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223606258, "dur": 0, "args": { "External id": 140232, "cbid": 203, "correlation": 140232 } }, { "ph": "f", "id": 140232, "pid": 76337, "tid": -914061504, "ts": 1716454223606258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223606258, "dur": 0, "args": { "External id": 140233, "cbid": 205, "correlation": 140233 } }, { "ph": "f", "id": 140233, "pid": 76337, "tid": -914061504, "ts": 1716454223606258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606281, "dur": 1, "args": { "External id": 140237, "cbid": 251, "correlation": 140237 } }, { "ph": "f", "id": 140237, "pid": 76337, "tid": -914061504, "ts": 1716454223606281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606283, "dur": 0, "args": { "External id": 140238, "cbid": 251, "correlation": 140238 } }, { "ph": "f", "id": 140238, "pid": 76337, "tid": -914061504, "ts": 1716454223606283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606284, "dur": 0, "args": { "External id": 140239, "cbid": 251, "correlation": 140239 } }, { "ph": "f", "id": 140239, "pid": 76337, "tid": -914061504, "ts": 1716454223606284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606284, "dur": 0, "args": { "External id": 140240, "cbid": 251, "correlation": 140240 } }, { "ph": "f", "id": 140240, "pid": 76337, "tid": -914061504, "ts": 1716454223606284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606285, "dur": 0, "args": { "External id": 140241, "cbid": 251, "correlation": 140241 } }, { "ph": "f", "id": 140241, "pid": 76337, "tid": -914061504, "ts": 1716454223606285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606286, "dur": 0, "args": { "External id": 140242, "cbid": 251, "correlation": 140242 } }, { "ph": "f", "id": 140242, "pid": 76337, "tid": -914061504, "ts": 1716454223606286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606287, "dur": 0, "args": { "External id": 140243, "cbid": 251, "correlation": 140243 } }, { "ph": "f", "id": 140243, "pid": 76337, "tid": -914061504, "ts": 1716454223606287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606287, "dur": 0, "args": { "External id": 140244, "cbid": 251, "correlation": 140244 } }, { "ph": "f", "id": 140244, "pid": 76337, "tid": -914061504, "ts": 1716454223606287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606289, "dur": 0, "args": { "External id": 140245, "cbid": 251, "correlation": 140245 } }, { "ph": "f", "id": 140245, "pid": 76337, "tid": -914061504, "ts": 1716454223606289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223630253, "dur": 113, "args": { "External id": 140246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140246, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140246, "pid": 5, "tid": 7, "ts": 1716454223630253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606291, "dur": 12, "args": { "External id": 140246, "cbid": 211, "correlation": 140246 } }, { "ph": "s", "id": 140246, "pid": 76337, "tid": -914061504, "ts": 1716454223606291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223630368, "dur": 59, "args": { "External id": 140252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140252, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140252, "pid": 5, "tid": 7, "ts": 1716454223630368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606326, "dur": 9, "args": { "External id": 140252, "cbid": 211, "correlation": 140252 } }, { "ph": "s", "id": 140252, "pid": 76337, "tid": -914061504, "ts": 1716454223606326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223630429, "dur": 525, "args": { "External id": 140261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140261, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140261, "pid": 5, "tid": 7, "ts": 1716454223630429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606408, "dur": 15, "args": { "External id": 140261, "cbid": 211, "correlation": 140261 } }, { "ph": "s", "id": 140261, "pid": 76337, "tid": -914061504, "ts": 1716454223606408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223630955, "dur": 184, "args": { "External id": 140283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140283, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140283, "pid": 5, "tid": 7, "ts": 1716454223630955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606465, "dur": 11, "args": { "External id": 140283, "cbid": 211, "correlation": 140283 } }, { "ph": "s", "id": 140283, "pid": 76337, "tid": -914061504, "ts": 1716454223606465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606549, "dur": 1, "args": { "External id": 140294, "cbid": 251, "correlation": 140294 } }, { "ph": "f", "id": 140294, "pid": 76337, "tid": -914061504, "ts": 1716454223606549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223631141, "dur": 198, "args": { "External id": 140295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140295, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140295, "pid": 5, "tid": 7, "ts": 1716454223631141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606554, "dur": 13, "args": { "External id": 140295, "cbid": 211, "correlation": 140295 } }, { "ph": "s", "id": 140295, "pid": 76337, "tid": -914061504, "ts": 1716454223606554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606623, "dur": 1, "args": { "External id": 140306, "cbid": 251, "correlation": 140306 } }, { "ph": "f", "id": 140306, "pid": 76337, "tid": -914061504, "ts": 1716454223606623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223631339, "dur": 187, "args": { "External id": 140307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140307, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140307, "pid": 5, "tid": 7, "ts": 1716454223631339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606627, "dur": 11, "args": { "External id": 140307, "cbid": 211, "correlation": 140307 } }, { "ph": "s", "id": 140307, "pid": 76337, "tid": -914061504, "ts": 1716454223606627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606688, "dur": 1, "args": { "External id": 140318, "cbid": 251, "correlation": 140318 } }, { "ph": "f", "id": 140318, "pid": 76337, "tid": -914061504, "ts": 1716454223606688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223631527, "dur": 188, "args": { "External id": 140319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140319, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140319, "pid": 5, "tid": 7, "ts": 1716454223631527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606692, "dur": 11, "args": { "External id": 140319, "cbid": 211, "correlation": 140319 } }, { "ph": "s", "id": 140319, "pid": 76337, "tid": -914061504, "ts": 1716454223606692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223631717, "dur": 19029, "args": { "External id": 140340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140340, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140340, "pid": 5, "tid": 7, "ts": 1716454223631717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606774, "dur": 12, "args": { "External id": 140340, "cbid": 211, "correlation": 140340 } }, { "ph": "s", "id": 140340, "pid": 76337, "tid": -914061504, "ts": 1716454223606774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223606870, "dur": 1, "args": { "External id": 140358, "cbid": 251, "correlation": 140358 } }, { "ph": "f", "id": 140358, "pid": 76337, "tid": -914061504, "ts": 1716454223606870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223650747, "dur": 202, "args": { "External id": 140360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140360, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140360, "pid": 5, "tid": 7, "ts": 1716454223650747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606877, "dur": 13, "args": { "External id": 140360, "cbid": 211, "correlation": 140360 } }, { "ph": "s", "id": 140360, "pid": 76337, "tid": -914061504, "ts": 1716454223606877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223650950, "dur": 66, "args": { "External id": 140368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140368, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140368, "pid": 5, "tid": 7, "ts": 1716454223650950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606946, "dur": 12, "args": { "External id": 140368, "cbid": 211, "correlation": 140368 } }, { "ph": "s", "id": 140368, "pid": 76337, "tid": -914061504, "ts": 1716454223606946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223651018, "dur": 97, "args": { "External id": 140376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140376, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140376, "pid": 5, "tid": 7, "ts": 1716454223651018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223606993, "dur": 10, "args": { "External id": 140376, "cbid": 211, "correlation": 140376 } }, { "ph": "s", "id": 140376, "pid": 76337, "tid": -914061504, "ts": 1716454223606993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223651116, "dur": 55, "args": { "External id": 140387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140387, "pid": 5, "tid": 7, "ts": 1716454223651116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607066, "dur": 13, "args": { "External id": 140387, "cbid": 211, "correlation": 140387 } }, { "ph": "s", "id": 140387, "pid": 76337, "tid": -914061504, "ts": 1716454223607066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223651172, "dur": 93, "args": { "External id": 140409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140409, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140409, "pid": 5, "tid": 7, "ts": 1716454223651172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607098, "dur": 7, "args": { "External id": 140409, "cbid": 211, "correlation": 140409 } }, { "ph": "s", "id": 140409, "pid": 76337, "tid": -914061504, "ts": 1716454223607098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607182, "dur": 1, "args": { "External id": 140420, "cbid": 251, "correlation": 140420 } }, { "ph": "f", "id": 140420, "pid": 76337, "tid": -914061504, "ts": 1716454223607182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223651267, "dur": 106, "args": { "External id": 140421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140421, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140421, "pid": 5, "tid": 7, "ts": 1716454223651267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607187, "dur": 13, "args": { "External id": 140421, "cbid": 211, "correlation": 140421 } }, { "ph": "s", "id": 140421, "pid": 76337, "tid": -914061504, "ts": 1716454223607187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607257, "dur": 1, "args": { "External id": 140432, "cbid": 251, "correlation": 140432 } }, { "ph": "f", "id": 140432, "pid": 76337, "tid": -914061504, "ts": 1716454223607257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607260, "dur": 0, "args": { "External id": 140433, "cbid": 251, "correlation": 140433 } }, { "ph": "f", "id": 140433, "pid": 76337, "tid": -914061504, "ts": 1716454223607260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223651374, "dur": 10, "args": { "External id": 140434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140434, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 140434, "pid": 5, "tid": 7, "ts": 1716454223651374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607262, "dur": 12, "args": { "External id": 140434, "cbid": 211, "correlation": 140434 } }, { "ph": "s", "id": 140434, "pid": 76337, "tid": -914061504, "ts": 1716454223607262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223651385, "dur": 5, "args": { "External id": 140436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140436, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 140436, "pid": 5, "tid": 7, "ts": 1716454223651385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607276, "dur": 6, "args": { "External id": 140436, "cbid": 211, "correlation": 140436 } }, { "ph": "s", "id": 140436, "pid": 76337, "tid": -914061504, "ts": 1716454223607276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607336, "dur": 1, "args": { "External id": 140447, "cbid": 251, "correlation": 140447 } }, { "ph": "f", "id": 140447, "pid": 76337, "tid": -914061504, "ts": 1716454223607336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607339, "dur": 0, "args": { "External id": 140448, "cbid": 251, "correlation": 140448 } }, { "ph": "f", "id": 140448, "pid": 76337, "tid": -914061504, "ts": 1716454223607339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223651391, "dur": 6, "args": { "External id": 140449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140449, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 140449, "pid": 5, "tid": 7, "ts": 1716454223651391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607341, "dur": 12, "args": { "External id": 140449, "cbid": 211, "correlation": 140449 } }, { "ph": "s", "id": 140449, "pid": 76337, "tid": -914061504, "ts": 1716454223607341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223651399, "dur": 3, "args": { "External id": 140451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140451, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 140451, "pid": 5, "tid": 7, "ts": 1716454223651399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607354, "dur": 6, "args": { "External id": 140451, "cbid": 211, "correlation": 140451 } }, { "ph": "s", "id": 140451, "pid": 76337, "tid": -914061504, "ts": 1716454223607354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223651404, "dur": 158, "args": { "External id": 140472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140472, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140472, "pid": 5, "tid": 7, "ts": 1716454223651404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607429, "dur": 12, "args": { "External id": 140472, "cbid": 211, "correlation": 140472 } }, { "ph": "s", "id": 140472, "pid": 76337, "tid": -914061504, "ts": 1716454223607429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607525, "dur": 1, "args": { "External id": 140490, "cbid": 251, "correlation": 140490 } }, { "ph": "f", "id": 140490, "pid": 76337, "tid": -914061504, "ts": 1716454223607525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223651563, "dur": 107, "args": { "External id": 140492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140492, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140492, "pid": 5, "tid": 7, "ts": 1716454223651563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607531, "dur": 13, "args": { "External id": 140492, "cbid": 211, "correlation": 140492 } }, { "ph": "s", "id": 140492, "pid": 76337, "tid": -914061504, "ts": 1716454223607531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223651671, "dur": 35, "args": { "External id": 140500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140500, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140500, "pid": 5, "tid": 7, "ts": 1716454223651671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607600, "dur": 12, "args": { "External id": 140500, "cbid": 211, "correlation": 140500 } }, { "ph": "s", "id": 140500, "pid": 76337, "tid": -914061504, "ts": 1716454223607600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223651708, "dur": 67, "args": { "External id": 140508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140508, "pid": 5, "tid": 7, "ts": 1716454223651708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607641, "dur": 9, "args": { "External id": 140508, "cbid": 211, "correlation": 140508 } }, { "ph": "s", "id": 140508, "pid": 76337, "tid": -914061504, "ts": 1716454223607641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223651777, "dur": 94, "args": { "External id": 140530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140530, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140530, "pid": 5, "tid": 7, "ts": 1716454223651777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607693, "dur": 10, "args": { "External id": 140530, "cbid": 211, "correlation": 140530 } }, { "ph": "s", "id": 140530, "pid": 76337, "tid": -914061504, "ts": 1716454223607693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607777, "dur": 1, "args": { "External id": 140546, "cbid": 251, "correlation": 140546 } }, { "ph": "f", "id": 140546, "pid": 76337, "tid": -914061504, "ts": 1716454223607777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223651872, "dur": 580, "args": { "External id": 140548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140548, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140548, "pid": 5, "tid": 7, "ts": 1716454223651872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607784, "dur": 12, "args": { "External id": 140548, "cbid": 211, "correlation": 140548 } }, { "ph": "s", "id": 140548, "pid": 76337, "tid": -914061504, "ts": 1716454223607784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223652453, "dur": 246, "args": { "External id": 140556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140556, "pid": 5, "tid": 7, "ts": 1716454223652453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607848, "dur": 12, "args": { "External id": 140556, "cbid": 211, "correlation": 140556 } }, { "ph": "s", "id": 140556, "pid": 76337, "tid": -914061504, "ts": 1716454223607848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223652700, "dur": 252, "args": { "External id": 140564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140564, "pid": 5, "tid": 7, "ts": 1716454223652700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607877, "dur": 9, "args": { "External id": 140564, "cbid": 211, "correlation": 140564 } }, { "ph": "s", "id": 140564, "pid": 76337, "tid": -914061504, "ts": 1716454223607877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607958, "dur": 1, "args": { "External id": 140580, "cbid": 251, "correlation": 140580 } }, { "ph": "f", "id": 140580, "pid": 76337, "tid": -914061504, "ts": 1716454223607958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223607963, "dur": 0, "args": { "External id": 140582, "cbid": 251, "correlation": 140582 } }, { "ph": "f", "id": 140582, "pid": 76337, "tid": -914061504, "ts": 1716454223607963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223652953, "dur": 360, "args": { "External id": 140583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140583, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 140583, "pid": 5, "tid": 7, "ts": 1716454223652953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223607966, "dur": 21, "args": { "External id": 140583, "cbid": 211, "correlation": 140583 } }, { "ph": "s", "id": 140583, "pid": 76337, "tid": -914061504, "ts": 1716454223607966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223653315, "dur": 49, "args": { "External id": 140591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140591, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140591, "pid": 5, "tid": 7, "ts": 1716454223653315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608017, "dur": 10, "args": { "External id": 140591, "cbid": 211, "correlation": 140591 } }, { "ph": "s", "id": 140591, "pid": 76337, "tid": -914061504, "ts": 1716454223608017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223653365, "dur": 157, "args": { "External id": 140602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140602, "pid": 5, "tid": 7, "ts": 1716454223653365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608084, "dur": 12, "args": { "External id": 140602, "cbid": 211, "correlation": 140602 } }, { "ph": "s", "id": 140602, "pid": 76337, "tid": -914061504, "ts": 1716454223608084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223608148, "dur": 0, "args": { "External id": 140614, "cbid": 317, "correlation": 140614 } }, { "ph": "f", "id": 140614, "pid": 76337, "tid": -914061504, "ts": 1716454223608148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223608149, "dur": 0, "args": { "External id": 140615, "cbid": 203, "correlation": 140615 } }, { "ph": "f", "id": 140615, "pid": 76337, "tid": -914061504, "ts": 1716454223608149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223608149, "dur": 0, "args": { "External id": 140616, "cbid": 205, "correlation": 140616 } }, { "ph": "f", "id": 140616, "pid": 76337, "tid": -914061504, "ts": 1716454223608149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608172, "dur": 1, "args": { "External id": 140620, "cbid": 251, "correlation": 140620 } }, { "ph": "f", "id": 140620, "pid": 76337, "tid": -914061504, "ts": 1716454223608172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608173, "dur": 0, "args": { "External id": 140621, "cbid": 251, "correlation": 140621 } }, { "ph": "f", "id": 140621, "pid": 76337, "tid": -914061504, "ts": 1716454223608173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608174, "dur": 0, "args": { "External id": 140622, "cbid": 251, "correlation": 140622 } }, { "ph": "f", "id": 140622, "pid": 76337, "tid": -914061504, "ts": 1716454223608174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608175, "dur": 0, "args": { "External id": 140623, "cbid": 251, "correlation": 140623 } }, { "ph": "f", "id": 140623, "pid": 76337, "tid": -914061504, "ts": 1716454223608175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608176, "dur": 0, "args": { "External id": 140624, "cbid": 251, "correlation": 140624 } }, { "ph": "f", "id": 140624, "pid": 76337, "tid": -914061504, "ts": 1716454223608176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608176, "dur": 0, "args": { "External id": 140625, "cbid": 251, "correlation": 140625 } }, { "ph": "f", "id": 140625, "pid": 76337, "tid": -914061504, "ts": 1716454223608176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608177, "dur": 0, "args": { "External id": 140626, "cbid": 251, "correlation": 140626 } }, { "ph": "f", "id": 140626, "pid": 76337, "tid": -914061504, "ts": 1716454223608177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608178, "dur": 0, "args": { "External id": 140627, "cbid": 251, "correlation": 140627 } }, { "ph": "f", "id": 140627, "pid": 76337, "tid": -914061504, "ts": 1716454223608178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608179, "dur": 0, "args": { "External id": 140628, "cbid": 251, "correlation": 140628 } }, { "ph": "f", "id": 140628, "pid": 76337, "tid": -914061504, "ts": 1716454223608179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223653524, "dur": 115, "args": { "External id": 140629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140629, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140629, "pid": 5, "tid": 7, "ts": 1716454223653524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608181, "dur": 13, "args": { "External id": 140629, "cbid": 211, "correlation": 140629 } }, { "ph": "s", "id": 140629, "pid": 76337, "tid": -914061504, "ts": 1716454223608181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223653641, "dur": 59, "args": { "External id": 140635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140635, "pid": 5, "tid": 7, "ts": 1716454223653641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608216, "dur": 9, "args": { "External id": 140635, "cbid": 211, "correlation": 140635 } }, { "ph": "s", "id": 140635, "pid": 76337, "tid": -914061504, "ts": 1716454223608216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223653702, "dur": 50, "args": { "External id": 140643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140643, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140643, "pid": 5, "tid": 7, "ts": 1716454223653702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608248, "dur": 8, "args": { "External id": 140643, "cbid": 211, "correlation": 140643 } }, { "ph": "s", "id": 140643, "pid": 76337, "tid": -914061504, "ts": 1716454223608248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223608319, "dur": 0, "args": { "External id": 140653, "cbid": 317, "correlation": 140653 } }, { "ph": "f", "id": 140653, "pid": 76337, "tid": -914061504, "ts": 1716454223608319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223608320, "dur": 0, "args": { "External id": 140654, "cbid": 203, "correlation": 140654 } }, { "ph": "f", "id": 140654, "pid": 76337, "tid": -914061504, "ts": 1716454223608320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223608321, "dur": 0, "args": { "External id": 140655, "cbid": 205, "correlation": 140655 } }, { "ph": "f", "id": 140655, "pid": 76337, "tid": -914061504, "ts": 1716454223608321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223653753, "dur": 41, "args": { "External id": 140659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140659, "pid": 5, "tid": 7, "ts": 1716454223653753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608337, "dur": 12, "args": { "External id": 140659, "cbid": 211, "correlation": 140659 } }, { "ph": "s", "id": 140659, "pid": 76337, "tid": -914061504, "ts": 1716454223608337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223653795, "dur": 15, "args": { "External id": 140661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140661, "pid": 5, "tid": 7, "ts": 1716454223653795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608352, "dur": 5, "args": { "External id": 140661, "cbid": 211, "correlation": 140661 } }, { "ph": "s", "id": 140661, "pid": 76337, "tid": -914061504, "ts": 1716454223608352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223653812, "dur": 1, "args": { "External id": 140663, "device": 5, "context": 1, "stream": 7, "correlation": 140663, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 140663, "pid": 5, "tid": 7, "ts": 1716454223653812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223608372, "dur": 17, "args": { "External id": 140663, "cbid": 51, "correlation": 140663 } }, { "ph": "s", "id": 140663, "pid": 76337, "tid": -914061504, "ts": 1716454223608372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223653815, "dur": 364, "args": { "External id": 140664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140664, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140664, "pid": 5, "tid": 7, "ts": 1716454223653815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608390, "dur": 10, "args": { "External id": 140664, "cbid": 211, "correlation": 140664 } }, { "ph": "s", "id": 140664, "pid": 76337, "tid": -914061504, "ts": 1716454223608390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654181, "dur": 13, "args": { "External id": 140666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140666, "pid": 5, "tid": 7, "ts": 1716454223654181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608408, "dur": 7, "args": { "External id": 140666, "cbid": 211, "correlation": 140666 } }, { "ph": "s", "id": 140666, "pid": 76337, "tid": -914061504, "ts": 1716454223608408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223654195, "dur": 15, "args": { "External id": 140672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140672, "pid": 5, "tid": 7, "ts": 1716454223654195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608440, "dur": 9, "args": { "External id": 140672, "cbid": 211, "correlation": 140672 } }, { "ph": "s", "id": 140672, "pid": 76337, "tid": -914061504, "ts": 1716454223608440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223654211, "dur": 19, "args": { "External id": 140692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140692, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140692, "pid": 5, "tid": 7, "ts": 1716454223654211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608534, "dur": 12, "args": { "External id": 140692, "cbid": 211, "correlation": 140692 } }, { "ph": "s", "id": 140692, "pid": 76337, "tid": -914061504, "ts": 1716454223608534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223654231, "dur": 4, "args": { "External id": 140704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140704, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140704, "pid": 5, "tid": 7, "ts": 1716454223654231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608556, "dur": 6, "args": { "External id": 140704, "cbid": 211, "correlation": 140704 } }, { "ph": "s", "id": 140704, "pid": 76337, "tid": -914061504, "ts": 1716454223608556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223654237, "dur": 17, "args": { "External id": 140707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140707, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140707, "pid": 5, "tid": 7, "ts": 1716454223654237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608574, "dur": 8, "args": { "External id": 140707, "cbid": 211, "correlation": 140707 } }, { "ph": "s", "id": 140707, "pid": 76337, "tid": -914061504, "ts": 1716454223608574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223654256, "dur": 11, "args": { "External id": 140716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140716, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140716, "pid": 5, "tid": 7, "ts": 1716454223654256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608614, "dur": 10, "args": { "External id": 140716, "cbid": 211, "correlation": 140716 } }, { "ph": "s", "id": 140716, "pid": 76337, "tid": -914061504, "ts": 1716454223608614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223608668, "dur": 0, "args": { "External id": 140726, "cbid": 317, "correlation": 140726 } }, { "ph": "f", "id": 140726, "pid": 76337, "tid": -914061504, "ts": 1716454223608668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223608669, "dur": 0, "args": { "External id": 140727, "cbid": 203, "correlation": 140727 } }, { "ph": "f", "id": 140727, "pid": 76337, "tid": -914061504, "ts": 1716454223608669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223608670, "dur": 0, "args": { "External id": 140728, "cbid": 205, "correlation": 140728 } }, { "ph": "f", "id": 140728, "pid": 76337, "tid": -914061504, "ts": 1716454223608670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654268, "dur": 11, "args": { "External id": 140732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140732, "pid": 5, "tid": 7, "ts": 1716454223654268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608685, "dur": 11, "args": { "External id": 140732, "cbid": 211, "correlation": 140732 } }, { "ph": "s", "id": 140732, "pid": 76337, "tid": -914061504, "ts": 1716454223608685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654281, "dur": 25, "args": { "External id": 140734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140734, "pid": 5, "tid": 7, "ts": 1716454223654281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608699, "dur": 5, "args": { "External id": 140734, "cbid": 211, "correlation": 140734 } }, { "ph": "s", "id": 140734, "pid": 76337, "tid": -914061504, "ts": 1716454223608699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223654307, "dur": 3, "args": { "External id": 140736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 140736, "pid": 5, "tid": 7, "ts": 1716454223654307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608710, "dur": 5, "args": { "External id": 140736, "cbid": 211, "correlation": 140736 } }, { "ph": "s", "id": 140736, "pid": 76337, "tid": -914061504, "ts": 1716454223608710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223608719, "dur": 0, "args": { "External id": 140737, "cbid": 51, "correlation": 140737 } }, { "ph": "s", "id": 140737, "pid": 76337, "tid": -914061504, "ts": 1716454223608719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223654311, "dur": 357, "args": { "External id": 140738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140738, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140738, "pid": 5, "tid": 7, "ts": 1716454223654311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608720, "dur": 9, "args": { "External id": 140738, "cbid": 211, "correlation": 140738 } }, { "ph": "s", "id": 140738, "pid": 76337, "tid": -914061504, "ts": 1716454223608720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654669, "dur": 22, "args": { "External id": 140739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140739, "pid": 5, "tid": 7, "ts": 1716454223654669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608732, "dur": 5, "args": { "External id": 140739, "cbid": 211, "correlation": 140739 } }, { "ph": "s", "id": 140739, "pid": 76337, "tid": -914061504, "ts": 1716454223608732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223654692, "dur": 32, "args": { "External id": 140745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140745, "pid": 5, "tid": 7, "ts": 1716454223654692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608759, "dur": 8, "args": { "External id": 140745, "cbid": 211, "correlation": 140745 } }, { "ph": "s", "id": 140745, "pid": 76337, "tid": -914061504, "ts": 1716454223608759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223654726, "dur": 4, "args": { "External id": 140753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140753, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 140753, "pid": 5, "tid": 7, "ts": 1716454223654726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608802, "dur": 9, "args": { "External id": 140753, "cbid": 211, "correlation": 140753 } }, { "ph": "s", "id": 140753, "pid": 76337, "tid": -914061504, "ts": 1716454223608802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608867, "dur": 1, "args": { "External id": 140769, "cbid": 251, "correlation": 140769 } }, { "ph": "f", "id": 140769, "pid": 76337, "tid": -914061504, "ts": 1716454223608867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223608872, "dur": 0, "args": { "External id": 140771, "cbid": 251, "correlation": 140771 } }, { "ph": "f", "id": 140771, "pid": 76337, "tid": -914061504, "ts": 1716454223608872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223654732, "dur": 13, "args": { "External id": 140772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140772, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 140772, "pid": 5, "tid": 7, "ts": 1716454223654732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608874, "dur": 12, "args": { "External id": 140772, "cbid": 211, "correlation": 140772 } }, { "ph": "s", "id": 140772, "pid": 76337, "tid": -914061504, "ts": 1716454223608874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223654745, "dur": 5, "args": { "External id": 140774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 140774, "pid": 5, "tid": 7, "ts": 1716454223654745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608888, "dur": 5, "args": { "External id": 140774, "cbid": 211, "correlation": 140774 } }, { "ph": "s", "id": 140774, "pid": 76337, "tid": -914061504, "ts": 1716454223608888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223654752, "dur": 30, "args": { "External id": 140784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140784, "pid": 5, "tid": 7, "ts": 1716454223654752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223608946, "dur": 12, "args": { "External id": 140784, "cbid": 211, "correlation": 140784 } }, { "ph": "s", "id": 140784, "pid": 76337, "tid": -914061504, "ts": 1716454223608946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223654783, "dur": 31, "args": { "External id": 140804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140804, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140804, "pid": 5, "tid": 7, "ts": 1716454223654783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609021, "dur": 12, "args": { "External id": 140804, "cbid": 211, "correlation": 140804 } }, { "ph": "s", "id": 140804, "pid": 76337, "tid": -914061504, "ts": 1716454223609021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223654815, "dur": 4, "args": { "External id": 140816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140816, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 140816, "pid": 5, "tid": 7, "ts": 1716454223654815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609043, "dur": 6, "args": { "External id": 140816, "cbid": 211, "correlation": 140816 } }, { "ph": "s", "id": 140816, "pid": 76337, "tid": -914061504, "ts": 1716454223609043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223654820, "dur": 30, "args": { "External id": 140819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140819, "pid": 5, "tid": 7, "ts": 1716454223654820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609061, "dur": 7, "args": { "External id": 140819, "cbid": 211, "correlation": 140819 } }, { "ph": "s", "id": 140819, "pid": 76337, "tid": -914061504, "ts": 1716454223609061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223654852, "dur": 21, "args": { "External id": 140828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140828, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140828, "pid": 5, "tid": 7, "ts": 1716454223654852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609103, "dur": 9, "args": { "External id": 140828, "cbid": 211, "correlation": 140828 } }, { "ph": "s", "id": 140828, "pid": 76337, "tid": -914061504, "ts": 1716454223609103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223609166, "dur": 0, "args": { "External id": 140838, "cbid": 317, "correlation": 140838 } }, { "ph": "f", "id": 140838, "pid": 76337, "tid": -914061504, "ts": 1716454223609166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223609167, "dur": 0, "args": { "External id": 140839, "cbid": 203, "correlation": 140839 } }, { "ph": "f", "id": 140839, "pid": 76337, "tid": -914061504, "ts": 1716454223609167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223609167, "dur": 0, "args": { "External id": 140840, "cbid": 205, "correlation": 140840 } }, { "ph": "f", "id": 140840, "pid": 76337, "tid": -914061504, "ts": 1716454223609167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654874, "dur": 22, "args": { "External id": 140844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140844, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140844, "pid": 5, "tid": 7, "ts": 1716454223654874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609185, "dur": 12, "args": { "External id": 140844, "cbid": 211, "correlation": 140844 } }, { "ph": "s", "id": 140844, "pid": 76337, "tid": -914061504, "ts": 1716454223609185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223654897, "dur": 44, "args": { "External id": 140846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140846, "pid": 5, "tid": 7, "ts": 1716454223654897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609200, "dur": 5, "args": { "External id": 140846, "cbid": 211, "correlation": 140846 } }, { "ph": "s", "id": 140846, "pid": 76337, "tid": -914061504, "ts": 1716454223609200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223654942, "dur": 659, "args": { "External id": 140848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140848, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140848, "pid": 5, "tid": 7, "ts": 1716454223654942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609214, "dur": 10, "args": { "External id": 140848, "cbid": 211, "correlation": 140848 } }, { "ph": "s", "id": 140848, "pid": 76337, "tid": -914061504, "ts": 1716454223609214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223655603, "dur": 22, "args": { "External id": 140850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140850, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140850, "pid": 5, "tid": 7, "ts": 1716454223655603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609228, "dur": 5, "args": { "External id": 140850, "cbid": 211, "correlation": 140850 } }, { "ph": "s", "id": 140850, "pid": 76337, "tid": -914061504, "ts": 1716454223609228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223655626, "dur": 33, "args": { "External id": 140856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140856, "pid": 5, "tid": 7, "ts": 1716454223655626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609256, "dur": 8, "args": { "External id": 140856, "cbid": 211, "correlation": 140856 } }, { "ph": "s", "id": 140856, "pid": 76337, "tid": -914061504, "ts": 1716454223609256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223609315, "dur": 0, "args": { "External id": 140866, "cbid": 317, "correlation": 140866 } }, { "ph": "f", "id": 140866, "pid": 76337, "tid": -914061504, "ts": 1716454223609315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223609316, "dur": 0, "args": { "External id": 140867, "cbid": 203, "correlation": 140867 } }, { "ph": "f", "id": 140867, "pid": 76337, "tid": -914061504, "ts": 1716454223609316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223609316, "dur": 0, "args": { "External id": 140868, "cbid": 205, "correlation": 140868 } }, { "ph": "f", "id": 140868, "pid": 76337, "tid": -914061504, "ts": 1716454223609316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609339, "dur": 1, "args": { "External id": 140872, "cbid": 251, "correlation": 140872 } }, { "ph": "f", "id": 140872, "pid": 76337, "tid": -914061504, "ts": 1716454223609339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609341, "dur": 0, "args": { "External id": 140873, "cbid": 251, "correlation": 140873 } }, { "ph": "f", "id": 140873, "pid": 76337, "tid": -914061504, "ts": 1716454223609341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609342, "dur": 0, "args": { "External id": 140874, "cbid": 251, "correlation": 140874 } }, { "ph": "f", "id": 140874, "pid": 76337, "tid": -914061504, "ts": 1716454223609342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609343, "dur": 0, "args": { "External id": 140875, "cbid": 251, "correlation": 140875 } }, { "ph": "f", "id": 140875, "pid": 76337, "tid": -914061504, "ts": 1716454223609343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609343, "dur": 0, "args": { "External id": 140876, "cbid": 251, "correlation": 140876 } }, { "ph": "f", "id": 140876, "pid": 76337, "tid": -914061504, "ts": 1716454223609343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609344, "dur": 0, "args": { "External id": 140877, "cbid": 251, "correlation": 140877 } }, { "ph": "f", "id": 140877, "pid": 76337, "tid": -914061504, "ts": 1716454223609344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609345, "dur": 0, "args": { "External id": 140878, "cbid": 251, "correlation": 140878 } }, { "ph": "f", "id": 140878, "pid": 76337, "tid": -914061504, "ts": 1716454223609345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609345, "dur": 0, "args": { "External id": 140879, "cbid": 251, "correlation": 140879 } }, { "ph": "f", "id": 140879, "pid": 76337, "tid": -914061504, "ts": 1716454223609345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609347, "dur": 0, "args": { "External id": 140880, "cbid": 251, "correlation": 140880 } }, { "ph": "f", "id": 140880, "pid": 76337, "tid": -914061504, "ts": 1716454223609347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223655661, "dur": 52, "args": { "External id": 140881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140881, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 140881, "pid": 5, "tid": 7, "ts": 1716454223655661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609349, "dur": 12, "args": { "External id": 140881, "cbid": 211, "correlation": 140881 } }, { "ph": "s", "id": 140881, "pid": 76337, "tid": -914061504, "ts": 1716454223609349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223655714, "dur": 32, "args": { "External id": 140887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140887, "pid": 5, "tid": 7, "ts": 1716454223655714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609381, "dur": 8, "args": { "External id": 140887, "cbid": 211, "correlation": 140887 } }, { "ph": "s", "id": 140887, "pid": 76337, "tid": -914061504, "ts": 1716454223609381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223655748, "dur": 27, "args": { "External id": 140895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140895, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140895, "pid": 5, "tid": 7, "ts": 1716454223655748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609410, "dur": 8, "args": { "External id": 140895, "cbid": 211, "correlation": 140895 } }, { "ph": "s", "id": 140895, "pid": 76337, "tid": -914061504, "ts": 1716454223609410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223655776, "dur": 19, "args": { "External id": 140903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140903, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140903, "pid": 5, "tid": 7, "ts": 1716454223655776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609439, "dur": 8, "args": { "External id": 140903, "cbid": 211, "correlation": 140903 } }, { "ph": "s", "id": 140903, "pid": 76337, "tid": -914061504, "ts": 1716454223609439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223655796, "dur": 30, "args": { "External id": 140923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140923, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 140923, "pid": 5, "tid": 7, "ts": 1716454223655796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609520, "dur": 13, "args": { "External id": 140923, "cbid": 211, "correlation": 140923 } }, { "ph": "s", "id": 140923, "pid": 76337, "tid": -914061504, "ts": 1716454223609520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223655828, "dur": 4, "args": { "External id": 140935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140935, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 140935, "pid": 5, "tid": 7, "ts": 1716454223655828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609542, "dur": 6, "args": { "External id": 140935, "cbid": 211, "correlation": 140935 } }, { "ph": "s", "id": 140935, "pid": 76337, "tid": -914061504, "ts": 1716454223609542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223655833, "dur": 30, "args": { "External id": 140938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140938, "pid": 5, "tid": 7, "ts": 1716454223655833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609560, "dur": 6, "args": { "External id": 140938, "cbid": 211, "correlation": 140938 } }, { "ph": "s", "id": 140938, "pid": 76337, "tid": -914061504, "ts": 1716454223609560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223609618, "dur": 0, "args": { "External id": 140949, "cbid": 317, "correlation": 140949 } }, { "ph": "f", "id": 140949, "pid": 76337, "tid": -914061504, "ts": 1716454223609618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223609618, "dur": 0, "args": { "External id": 140950, "cbid": 203, "correlation": 140950 } }, { "ph": "f", "id": 140950, "pid": 76337, "tid": -914061504, "ts": 1716454223609618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223609619, "dur": 0, "args": { "External id": 140951, "cbid": 205, "correlation": 140951 } }, { "ph": "f", "id": 140951, "pid": 76337, "tid": -914061504, "ts": 1716454223609619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223655864, "dur": 22, "args": { "External id": 140955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140955, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140955, "pid": 5, "tid": 7, "ts": 1716454223655864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609635, "dur": 11, "args": { "External id": 140955, "cbid": 211, "correlation": 140955 } }, { "ph": "s", "id": 140955, "pid": 76337, "tid": -914061504, "ts": 1716454223609635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223655887, "dur": 121, "args": { "External id": 140957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140957, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 140957, "pid": 5, "tid": 7, "ts": 1716454223655887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609655, "dur": 8, "args": { "External id": 140957, "cbid": 211, "correlation": 140957 } }, { "ph": "s", "id": 140957, "pid": 76337, "tid": -914061504, "ts": 1716454223609655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223656010, "dur": 22, "args": { "External id": 140959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140959, "pid": 5, "tid": 7, "ts": 1716454223656010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609666, "dur": 5, "args": { "External id": 140959, "cbid": 211, "correlation": 140959 } }, { "ph": "s", "id": 140959, "pid": 76337, "tid": -914061504, "ts": 1716454223609666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223656033, "dur": 32, "args": { "External id": 140965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140965, "pid": 5, "tid": 7, "ts": 1716454223656033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609694, "dur": 8, "args": { "External id": 140965, "cbid": 211, "correlation": 140965 } }, { "ph": "s", "id": 140965, "pid": 76337, "tid": -914061504, "ts": 1716454223609694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223656066, "dur": 191, "args": { "External id": 140974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140974, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140974, "pid": 5, "tid": 7, "ts": 1716454223656066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609776, "dur": 14, "args": { "External id": 140974, "cbid": 211, "correlation": 140974 } }, { "ph": "s", "id": 140974, "pid": 76337, "tid": -914061504, "ts": 1716454223609776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223656259, "dur": 65, "args": { "External id": 140996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 140996, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 140996, "pid": 5, "tid": 7, "ts": 1716454223656259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609834, "dur": 10, "args": { "External id": 140996, "cbid": 211, "correlation": 140996 } }, { "ph": "s", "id": 140996, "pid": 76337, "tid": -914061504, "ts": 1716454223609834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223609925, "dur": 1, "args": { "External id": 141007, "cbid": 251, "correlation": 141007 } }, { "ph": "f", "id": 141007, "pid": 76337, "tid": -914061504, "ts": 1716454223609925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223656325, "dur": 156, "args": { "External id": 141008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141008, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141008, "pid": 5, "tid": 7, "ts": 1716454223656325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223609931, "dur": 13, "args": { "External id": 141008, "cbid": 211, "correlation": 141008 } }, { "ph": "s", "id": 141008, "pid": 76337, "tid": -914061504, "ts": 1716454223609931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610009, "dur": 1, "args": { "External id": 141019, "cbid": 251, "correlation": 141019 } }, { "ph": "f", "id": 141019, "pid": 76337, "tid": -914061504, "ts": 1716454223610009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223656483, "dur": 147, "args": { "External id": 141020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141020, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141020, "pid": 5, "tid": 7, "ts": 1716454223656483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610013, "dur": 12, "args": { "External id": 141020, "cbid": 211, "correlation": 141020 } }, { "ph": "s", "id": 141020, "pid": 76337, "tid": -914061504, "ts": 1716454223610013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610079, "dur": 1, "args": { "External id": 141031, "cbid": 251, "correlation": 141031 } }, { "ph": "f", "id": 141031, "pid": 76337, "tid": -914061504, "ts": 1716454223610079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223656631, "dur": 144, "args": { "External id": 141032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141032, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141032, "pid": 5, "tid": 7, "ts": 1716454223656631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610083, "dur": 11, "args": { "External id": 141032, "cbid": 211, "correlation": 141032 } }, { "ph": "s", "id": 141032, "pid": 76337, "tid": -914061504, "ts": 1716454223610083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223656777, "dur": 1954, "args": { "External id": 141053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141053, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 141053, "pid": 5, "tid": 7, "ts": 1716454223656777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610167, "dur": 14, "args": { "External id": 141053, "cbid": 211, "correlation": 141053 } }, { "ph": "s", "id": 141053, "pid": 76337, "tid": -914061504, "ts": 1716454223610167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610267, "dur": 1, "args": { "External id": 141071, "cbid": 251, "correlation": 141071 } }, { "ph": "f", "id": 141071, "pid": 76337, "tid": -914061504, "ts": 1716454223610267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223658732, "dur": 147, "args": { "External id": 141073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141073, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 141073, "pid": 5, "tid": 7, "ts": 1716454223658732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610273, "dur": 14, "args": { "External id": 141073, "cbid": 211, "correlation": 141073 } }, { "ph": "s", "id": 141073, "pid": 76337, "tid": -914061504, "ts": 1716454223610273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223658881, "dur": 36, "args": { "External id": 141081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141081, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141081, "pid": 5, "tid": 7, "ts": 1716454223658881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610345, "dur": 12, "args": { "External id": 141081, "cbid": 211, "correlation": 141081 } }, { "ph": "s", "id": 141081, "pid": 76337, "tid": -914061504, "ts": 1716454223610345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223658918, "dur": 51, "args": { "External id": 141089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141089, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141089, "pid": 5, "tid": 7, "ts": 1716454223658918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610385, "dur": 8, "args": { "External id": 141089, "cbid": 211, "correlation": 141089 } }, { "ph": "s", "id": 141089, "pid": 76337, "tid": -914061504, "ts": 1716454223610385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223658970, "dur": 30, "args": { "External id": 141100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141100, "pid": 5, "tid": 7, "ts": 1716454223658970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610457, "dur": 13, "args": { "External id": 141100, "cbid": 211, "correlation": 141100 } }, { "ph": "s", "id": 141100, "pid": 76337, "tid": -914061504, "ts": 1716454223610457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223659001, "dur": 34, "args": { "External id": 141122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141122, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141122, "pid": 5, "tid": 7, "ts": 1716454223659001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610489, "dur": 8, "args": { "External id": 141122, "cbid": 211, "correlation": 141122 } }, { "ph": "s", "id": 141122, "pid": 76337, "tid": -914061504, "ts": 1716454223610489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610573, "dur": 1, "args": { "External id": 141133, "cbid": 251, "correlation": 141133 } }, { "ph": "f", "id": 141133, "pid": 76337, "tid": -914061504, "ts": 1716454223610573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223659037, "dur": 91, "args": { "External id": 141134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141134, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141134, "pid": 5, "tid": 7, "ts": 1716454223659037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610579, "dur": 14, "args": { "External id": 141134, "cbid": 211, "correlation": 141134 } }, { "ph": "s", "id": 141134, "pid": 76337, "tid": -914061504, "ts": 1716454223610579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610648, "dur": 1, "args": { "External id": 141145, "cbid": 251, "correlation": 141145 } }, { "ph": "f", "id": 141145, "pid": 76337, "tid": -914061504, "ts": 1716454223610648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610652, "dur": 0, "args": { "External id": 141146, "cbid": 251, "correlation": 141146 } }, { "ph": "f", "id": 141146, "pid": 76337, "tid": -914061504, "ts": 1716454223610652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223659129, "dur": 11, "args": { "External id": 141147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141147, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 141147, "pid": 5, "tid": 7, "ts": 1716454223659129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610654, "dur": 12, "args": { "External id": 141147, "cbid": 211, "correlation": 141147 } }, { "ph": "s", "id": 141147, "pid": 76337, "tid": -914061504, "ts": 1716454223610654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223659141, "dur": 5, "args": { "External id": 141149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141149, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 141149, "pid": 5, "tid": 7, "ts": 1716454223659141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610669, "dur": 7, "args": { "External id": 141149, "cbid": 211, "correlation": 141149 } }, { "ph": "s", "id": 141149, "pid": 76337, "tid": -914061504, "ts": 1716454223610669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610727, "dur": 1, "args": { "External id": 141160, "cbid": 251, "correlation": 141160 } }, { "ph": "f", "id": 141160, "pid": 76337, "tid": -914061504, "ts": 1716454223610727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610731, "dur": 0, "args": { "External id": 141161, "cbid": 251, "correlation": 141161 } }, { "ph": "f", "id": 141161, "pid": 76337, "tid": -914061504, "ts": 1716454223610731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223659148, "dur": 7, "args": { "External id": 141162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141162, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 141162, "pid": 5, "tid": 7, "ts": 1716454223659148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610732, "dur": 12, "args": { "External id": 141162, "cbid": 211, "correlation": 141162 } }, { "ph": "s", "id": 141162, "pid": 76337, "tid": -914061504, "ts": 1716454223610732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223659156, "dur": 3, "args": { "External id": 141164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141164, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 141164, "pid": 5, "tid": 7, "ts": 1716454223659156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610746, "dur": 5, "args": { "External id": 141164, "cbid": 211, "correlation": 141164 } }, { "ph": "s", "id": 141164, "pid": 76337, "tid": -914061504, "ts": 1716454223610746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223659160, "dur": 92, "args": { "External id": 141185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141185, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 141185, "pid": 5, "tid": 7, "ts": 1716454223659160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610819, "dur": 12, "args": { "External id": 141185, "cbid": 211, "correlation": 141185 } }, { "ph": "s", "id": 141185, "pid": 76337, "tid": -914061504, "ts": 1716454223610819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223610914, "dur": 1, "args": { "External id": 141203, "cbid": 251, "correlation": 141203 } }, { "ph": "f", "id": 141203, "pid": 76337, "tid": -914061504, "ts": 1716454223610914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223659253, "dur": 99, "args": { "External id": 141205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141205, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141205, "pid": 5, "tid": 7, "ts": 1716454223659253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610919, "dur": 13, "args": { "External id": 141205, "cbid": 211, "correlation": 141205 } }, { "ph": "s", "id": 141205, "pid": 76337, "tid": -914061504, "ts": 1716454223610919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223659353, "dur": 19, "args": { "External id": 141213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141213, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141213, "pid": 5, "tid": 7, "ts": 1716454223659353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223610997, "dur": 12, "args": { "External id": 141213, "cbid": 211, "correlation": 141213 } }, { "ph": "s", "id": 141213, "pid": 76337, "tid": -914061504, "ts": 1716454223610997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223659373, "dur": 37, "args": { "External id": 141221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141221, "pid": 5, "tid": 7, "ts": 1716454223659373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611038, "dur": 10, "args": { "External id": 141221, "cbid": 211, "correlation": 141221 } }, { "ph": "s", "id": 141221, "pid": 76337, "tid": -914061504, "ts": 1716454223611038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223659412, "dur": 35, "args": { "External id": 141243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141243, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141243, "pid": 5, "tid": 7, "ts": 1716454223659412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611090, "dur": 10, "args": { "External id": 141243, "cbid": 211, "correlation": 141243 } }, { "ph": "s", "id": 141243, "pid": 76337, "tid": -914061504, "ts": 1716454223611090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223611178, "dur": 1, "args": { "External id": 141259, "cbid": 251, "correlation": 141259 } }, { "ph": "f", "id": 141259, "pid": 76337, "tid": -914061504, "ts": 1716454223611178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223611184, "dur": 0, "args": { "External id": 141261, "cbid": 251, "correlation": 141261 } }, { "ph": "f", "id": 141261, "pid": 76337, "tid": -914061504, "ts": 1716454223611184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223659448, "dur": 543, "args": { "External id": 141262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141262, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 141262, "pid": 5, "tid": 7, "ts": 1716454223659448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611187, "dur": 14, "args": { "External id": 141262, "cbid": 211, "correlation": 141262 } }, { "ph": "s", "id": 141262, "pid": 76337, "tid": -914061504, "ts": 1716454223611187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223659993, "dur": 125, "args": { "External id": 141270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141270, "pid": 5, "tid": 7, "ts": 1716454223659993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611254, "dur": 12, "args": { "External id": 141270, "cbid": 211, "correlation": 141270 } }, { "ph": "s", "id": 141270, "pid": 76337, "tid": -914061504, "ts": 1716454223611254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223660119, "dur": 127, "args": { "External id": 141278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141278, "pid": 5, "tid": 7, "ts": 1716454223660119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611284, "dur": 9, "args": { "External id": 141278, "cbid": 211, "correlation": 141278 } }, { "ph": "s", "id": 141278, "pid": 76337, "tid": -914061504, "ts": 1716454223611284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223611364, "dur": 1, "args": { "External id": 141294, "cbid": 251, "correlation": 141294 } }, { "ph": "f", "id": 141294, "pid": 76337, "tid": -914061504, "ts": 1716454223611364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223660248, "dur": 311, "args": { "External id": 141296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141296, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141296, "pid": 5, "tid": 7, "ts": 1716454223660248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611370, "dur": 13, "args": { "External id": 141296, "cbid": 211, "correlation": 141296 } }, { "ph": "s", "id": 141296, "pid": 76337, "tid": -914061504, "ts": 1716454223611370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223660560, "dur": 27, "args": { "External id": 141304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141304, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141304, "pid": 5, "tid": 7, "ts": 1716454223660560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611412, "dur": 10, "args": { "External id": 141304, "cbid": 211, "correlation": 141304 } }, { "ph": "s", "id": 141304, "pid": 76337, "tid": -914061504, "ts": 1716454223611412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223660588, "dur": 82, "args": { "External id": 141315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141315, "pid": 5, "tid": 7, "ts": 1716454223660588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611480, "dur": 13, "args": { "External id": 141315, "cbid": 211, "correlation": 141315 } }, { "ph": "s", "id": 141315, "pid": 76337, "tid": -914061504, "ts": 1716454223611480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223611543, "dur": 0, "args": { "External id": 141327, "cbid": 317, "correlation": 141327 } }, { "ph": "f", "id": 141327, "pid": 76337, "tid": -914061504, "ts": 1716454223611543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223611544, "dur": 0, "args": { "External id": 141328, "cbid": 203, "correlation": 141328 } }, { "ph": "f", "id": 141328, "pid": 76337, "tid": -914061504, "ts": 1716454223611544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223611545, "dur": 0, "args": { "External id": 141329, "cbid": 205, "correlation": 141329 } }, { "ph": "f", "id": 141329, "pid": 76337, "tid": -914061504, "ts": 1716454223611545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223660671, "dur": 22, "args": { "External id": 141333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141333, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141333, "pid": 5, "tid": 7, "ts": 1716454223660671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611560, "dur": 12, "args": { "External id": 141333, "cbid": 211, "correlation": 141333 } }, { "ph": "s", "id": 141333, "pid": 76337, "tid": -914061504, "ts": 1716454223611560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223660695, "dur": 121, "args": { "External id": 141335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141335, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141335, "pid": 5, "tid": 7, "ts": 1716454223660695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611579, "dur": 6, "args": { "External id": 141335, "cbid": 211, "correlation": 141335 } }, { "ph": "s", "id": 141335, "pid": 76337, "tid": -914061504, "ts": 1716454223611579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223660818, "dur": 23, "args": { "External id": 141337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141337, "pid": 5, "tid": 7, "ts": 1716454223660818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611590, "dur": 5, "args": { "External id": 141337, "cbid": 211, "correlation": 141337 } }, { "ph": "s", "id": 141337, "pid": 76337, "tid": -914061504, "ts": 1716454223611590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223660842, "dur": 32, "args": { "External id": 141343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141343, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141343, "pid": 5, "tid": 7, "ts": 1716454223660842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611617, "dur": 8, "args": { "External id": 141343, "cbid": 211, "correlation": 141343 } }, { "ph": "s", "id": 141343, "pid": 76337, "tid": -914061504, "ts": 1716454223611617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223660876, "dur": 27, "args": { "External id": 141351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141351, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141351, "pid": 5, "tid": 7, "ts": 1716454223660876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611650, "dur": 8, "args": { "External id": 141351, "cbid": 211, "correlation": 141351 } }, { "ph": "s", "id": 141351, "pid": 76337, "tid": -914061504, "ts": 1716454223611650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223660904, "dur": 30, "args": { "External id": 141371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141371, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 141371, "pid": 5, "tid": 7, "ts": 1716454223660904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611722, "dur": 11, "args": { "External id": 141371, "cbid": 211, "correlation": 141371 } }, { "ph": "s", "id": 141371, "pid": 76337, "tid": -914061504, "ts": 1716454223611722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223660935, "dur": 4, "args": { "External id": 141383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141383, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 141383, "pid": 5, "tid": 7, "ts": 1716454223660935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611743, "dur": 6, "args": { "External id": 141383, "cbid": 211, "correlation": 141383 } }, { "ph": "s", "id": 141383, "pid": 76337, "tid": -914061504, "ts": 1716454223611743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223660941, "dur": 31, "args": { "External id": 141386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141386, "pid": 5, "tid": 7, "ts": 1716454223660941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611761, "dur": 6, "args": { "External id": 141386, "cbid": 211, "correlation": 141386 } }, { "ph": "s", "id": 141386, "pid": 76337, "tid": -914061504, "ts": 1716454223611761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223660973, "dur": 21, "args": { "External id": 141395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141395, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141395, "pid": 5, "tid": 7, "ts": 1716454223660973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611800, "dur": 9, "args": { "External id": 141395, "cbid": 211, "correlation": 141395 } }, { "ph": "s", "id": 141395, "pid": 76337, "tid": -914061504, "ts": 1716454223611800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223611851, "dur": 0, "args": { "External id": 141405, "cbid": 317, "correlation": 141405 } }, { "ph": "f", "id": 141405, "pid": 76337, "tid": -914061504, "ts": 1716454223611851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223611852, "dur": 0, "args": { "External id": 141406, "cbid": 203, "correlation": 141406 } }, { "ph": "f", "id": 141406, "pid": 76337, "tid": -914061504, "ts": 1716454223611852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223611852, "dur": 0, "args": { "External id": 141407, "cbid": 205, "correlation": 141407 } }, { "ph": "f", "id": 141407, "pid": 76337, "tid": -914061504, "ts": 1716454223611852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223660995, "dur": 23, "args": { "External id": 141411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141411, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141411, "pid": 5, "tid": 7, "ts": 1716454223660995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611865, "dur": 11, "args": { "External id": 141411, "cbid": 211, "correlation": 141411 } }, { "ph": "s", "id": 141411, "pid": 76337, "tid": -914061504, "ts": 1716454223611865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223661020, "dur": 44, "args": { "External id": 141413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141413, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141413, "pid": 5, "tid": 7, "ts": 1716454223661020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611878, "dur": 5, "args": { "External id": 141413, "cbid": 211, "correlation": 141413 } }, { "ph": "s", "id": 141413, "pid": 76337, "tid": -914061504, "ts": 1716454223611878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223661065, "dur": 655, "args": { "External id": 141415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141415, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141415, "pid": 5, "tid": 7, "ts": 1716454223661065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611889, "dur": 6, "args": { "External id": 141415, "cbid": 211, "correlation": 141415 } }, { "ph": "s", "id": 141415, "pid": 76337, "tid": -914061504, "ts": 1716454223611889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223661722, "dur": 20, "args": { "External id": 141417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141417, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141417, "pid": 5, "tid": 7, "ts": 1716454223661722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611900, "dur": 5, "args": { "External id": 141417, "cbid": 211, "correlation": 141417 } }, { "ph": "s", "id": 141417, "pid": 76337, "tid": -914061504, "ts": 1716454223611900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223661743, "dur": 33, "args": { "External id": 141423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141423, "pid": 5, "tid": 7, "ts": 1716454223661743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611926, "dur": 8, "args": { "External id": 141423, "cbid": 211, "correlation": 141423 } }, { "ph": "s", "id": 141423, "pid": 76337, "tid": -914061504, "ts": 1716454223611926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223661777, "dur": 4, "args": { "External id": 141431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141431, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 141431, "pid": 5, "tid": 7, "ts": 1716454223661777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223611970, "dur": 23, "args": { "External id": 141431, "cbid": 211, "correlation": 141431 } }, { "ph": "s", "id": 141431, "pid": 76337, "tid": -914061504, "ts": 1716454223611970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223612049, "dur": 1, "args": { "External id": 141447, "cbid": 251, "correlation": 141447 } }, { "ph": "f", "id": 141447, "pid": 76337, "tid": -914061504, "ts": 1716454223612049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223612054, "dur": 0, "args": { "External id": 141449, "cbid": 251, "correlation": 141449 } }, { "ph": "f", "id": 141449, "pid": 76337, "tid": -914061504, "ts": 1716454223612054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223661783, "dur": 12, "args": { "External id": 141450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141450, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 141450, "pid": 5, "tid": 7, "ts": 1716454223661783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612056, "dur": 11, "args": { "External id": 141450, "cbid": 211, "correlation": 141450 } }, { "ph": "s", "id": 141450, "pid": 76337, "tid": -914061504, "ts": 1716454223612056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223661796, "dur": 5, "args": { "External id": 141452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 141452, "pid": 5, "tid": 7, "ts": 1716454223661796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612069, "dur": 5, "args": { "External id": 141452, "cbid": 211, "correlation": 141452 } }, { "ph": "s", "id": 141452, "pid": 76337, "tid": -914061504, "ts": 1716454223612069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223661803, "dur": 29, "args": { "External id": 141462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141462, "pid": 5, "tid": 7, "ts": 1716454223661803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612127, "dur": 12, "args": { "External id": 141462, "cbid": 211, "correlation": 141462 } }, { "ph": "s", "id": 141462, "pid": 76337, "tid": -914061504, "ts": 1716454223612127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223661833, "dur": 31, "args": { "External id": 141482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141482, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 141482, "pid": 5, "tid": 7, "ts": 1716454223661833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612193, "dur": 11, "args": { "External id": 141482, "cbid": 211, "correlation": 141482 } }, { "ph": "s", "id": 141482, "pid": 76337, "tid": -914061504, "ts": 1716454223612193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223661865, "dur": 4, "args": { "External id": 141494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141494, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 141494, "pid": 5, "tid": 7, "ts": 1716454223661865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612213, "dur": 6, "args": { "External id": 141494, "cbid": 211, "correlation": 141494 } }, { "ph": "s", "id": 141494, "pid": 76337, "tid": -914061504, "ts": 1716454223612213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223661871, "dur": 29, "args": { "External id": 141497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141497, "pid": 5, "tid": 7, "ts": 1716454223661871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612232, "dur": 6, "args": { "External id": 141497, "cbid": 211, "correlation": 141497 } }, { "ph": "s", "id": 141497, "pid": 76337, "tid": -914061504, "ts": 1716454223612232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223661901, "dur": 20, "args": { "External id": 141506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141506, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141506, "pid": 5, "tid": 7, "ts": 1716454223661901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612273, "dur": 10, "args": { "External id": 141506, "cbid": 211, "correlation": 141506 } }, { "ph": "s", "id": 141506, "pid": 76337, "tid": -914061504, "ts": 1716454223612273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223612336, "dur": 0, "args": { "External id": 141516, "cbid": 317, "correlation": 141516 } }, { "ph": "f", "id": 141516, "pid": 76337, "tid": -914061504, "ts": 1716454223612336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223612337, "dur": 0, "args": { "External id": 141517, "cbid": 203, "correlation": 141517 } }, { "ph": "f", "id": 141517, "pid": 76337, "tid": -914061504, "ts": 1716454223612337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223612337, "dur": 0, "args": { "External id": 141518, "cbid": 205, "correlation": 141518 } }, { "ph": "f", "id": 141518, "pid": 76337, "tid": -914061504, "ts": 1716454223612337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223661923, "dur": 23, "args": { "External id": 141522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141522, "pid": 5, "tid": 7, "ts": 1716454223661923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612351, "dur": 12, "args": { "External id": 141522, "cbid": 211, "correlation": 141522 } }, { "ph": "s", "id": 141522, "pid": 76337, "tid": -914061504, "ts": 1716454223612351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223661947, "dur": 44, "args": { "External id": 141524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141524, "pid": 5, "tid": 7, "ts": 1716454223661947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612366, "dur": 5, "args": { "External id": 141524, "cbid": 211, "correlation": 141524 } }, { "ph": "s", "id": 141524, "pid": 76337, "tid": -914061504, "ts": 1716454223612366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223661993, "dur": 645, "args": { "External id": 141526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141526, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141526, "pid": 5, "tid": 7, "ts": 1716454223661993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612377, "dur": 6, "args": { "External id": 141526, "cbid": 211, "correlation": 141526 } }, { "ph": "s", "id": 141526, "pid": 76337, "tid": -914061504, "ts": 1716454223612377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223662639, "dur": 22, "args": { "External id": 141528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141528, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141528, "pid": 5, "tid": 7, "ts": 1716454223662639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612386, "dur": 5, "args": { "External id": 141528, "cbid": 211, "correlation": 141528 } }, { "ph": "s", "id": 141528, "pid": 76337, "tid": -914061504, "ts": 1716454223612386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223662663, "dur": 33, "args": { "External id": 141534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141534, "pid": 5, "tid": 7, "ts": 1716454223662663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612414, "dur": 8, "args": { "External id": 141534, "cbid": 211, "correlation": 141534 } }, { "ph": "s", "id": 141534, "pid": 76337, "tid": -914061504, "ts": 1716454223612414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223662697, "dur": 27, "args": { "External id": 141542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141542, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141542, "pid": 5, "tid": 7, "ts": 1716454223662697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612447, "dur": 9, "args": { "External id": 141542, "cbid": 211, "correlation": 141542 } }, { "ph": "s", "id": 141542, "pid": 76337, "tid": -914061504, "ts": 1716454223612447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223662725, "dur": 20, "args": { "External id": 141550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141550, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141550, "pid": 5, "tid": 7, "ts": 1716454223662725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612476, "dur": 8, "args": { "External id": 141550, "cbid": 211, "correlation": 141550 } }, { "ph": "s", "id": 141550, "pid": 76337, "tid": -914061504, "ts": 1716454223612476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223662746, "dur": 30, "args": { "External id": 141570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141570, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 141570, "pid": 5, "tid": 7, "ts": 1716454223662746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612557, "dur": 13, "args": { "External id": 141570, "cbid": 211, "correlation": 141570 } }, { "ph": "s", "id": 141570, "pid": 76337, "tid": -914061504, "ts": 1716454223612557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223662777, "dur": 4, "args": { "External id": 141582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141582, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 141582, "pid": 5, "tid": 7, "ts": 1716454223662777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612579, "dur": 6, "args": { "External id": 141582, "cbid": 211, "correlation": 141582 } }, { "ph": "s", "id": 141582, "pid": 76337, "tid": -914061504, "ts": 1716454223612579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223662782, "dur": 30, "args": { "External id": 141585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141585, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141585, "pid": 5, "tid": 7, "ts": 1716454223662782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612597, "dur": 7, "args": { "External id": 141585, "cbid": 211, "correlation": 141585 } }, { "ph": "s", "id": 141585, "pid": 76337, "tid": -914061504, "ts": 1716454223612597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223612654, "dur": 0, "args": { "External id": 141596, "cbid": 317, "correlation": 141596 } }, { "ph": "f", "id": 141596, "pid": 76337, "tid": -914061504, "ts": 1716454223612654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223612655, "dur": 0, "args": { "External id": 141597, "cbid": 203, "correlation": 141597 } }, { "ph": "f", "id": 141597, "pid": 76337, "tid": -914061504, "ts": 1716454223612655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223612656, "dur": 0, "args": { "External id": 141598, "cbid": 205, "correlation": 141598 } }, { "ph": "f", "id": 141598, "pid": 76337, "tid": -914061504, "ts": 1716454223612656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223662814, "dur": 22, "args": { "External id": 141602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141602, "pid": 5, "tid": 7, "ts": 1716454223662814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612669, "dur": 12, "args": { "External id": 141602, "cbid": 211, "correlation": 141602 } }, { "ph": "s", "id": 141602, "pid": 76337, "tid": -914061504, "ts": 1716454223612669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223662837, "dur": 117, "args": { "External id": 141604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141604, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141604, "pid": 5, "tid": 7, "ts": 1716454223662837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612687, "dur": 6, "args": { "External id": 141604, "cbid": 211, "correlation": 141604 } }, { "ph": "s", "id": 141604, "pid": 76337, "tid": -914061504, "ts": 1716454223612687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223662955, "dur": 23, "args": { "External id": 141606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141606, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141606, "pid": 5, "tid": 7, "ts": 1716454223662955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612696, "dur": 6, "args": { "External id": 141606, "cbid": 211, "correlation": 141606 } }, { "ph": "s", "id": 141606, "pid": 76337, "tid": -914061504, "ts": 1716454223612696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223662980, "dur": 33, "args": { "External id": 141612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141612, "pid": 5, "tid": 7, "ts": 1716454223662980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612724, "dur": 8, "args": { "External id": 141612, "cbid": 211, "correlation": 141612 } }, { "ph": "s", "id": 141612, "pid": 76337, "tid": -914061504, "ts": 1716454223612724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223663014, "dur": 207, "args": { "External id": 141621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141621, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141621, "pid": 5, "tid": 7, "ts": 1716454223663014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612806, "dur": 13, "args": { "External id": 141621, "cbid": 211, "correlation": 141621 } }, { "ph": "s", "id": 141621, "pid": 76337, "tid": -914061504, "ts": 1716454223612806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223663222, "dur": 66, "args": { "External id": 141643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141643, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141643, "pid": 5, "tid": 7, "ts": 1716454223663222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612863, "dur": 10, "args": { "External id": 141643, "cbid": 211, "correlation": 141643 } }, { "ph": "s", "id": 141643, "pid": 76337, "tid": -914061504, "ts": 1716454223612863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223612951, "dur": 1, "args": { "External id": 141654, "cbid": 251, "correlation": 141654 } }, { "ph": "f", "id": 141654, "pid": 76337, "tid": -914061504, "ts": 1716454223612951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223663290, "dur": 154, "args": { "External id": 141655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141655, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141655, "pid": 5, "tid": 7, "ts": 1716454223663290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223612957, "dur": 13, "args": { "External id": 141655, "cbid": 211, "correlation": 141655 } }, { "ph": "s", "id": 141655, "pid": 76337, "tid": -914061504, "ts": 1716454223612957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613036, "dur": 1, "args": { "External id": 141666, "cbid": 251, "correlation": 141666 } }, { "ph": "f", "id": 141666, "pid": 76337, "tid": -914061504, "ts": 1716454223613036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223663445, "dur": 148, "args": { "External id": 141667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141667, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141667, "pid": 5, "tid": 7, "ts": 1716454223663445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613040, "dur": 12, "args": { "External id": 141667, "cbid": 211, "correlation": 141667 } }, { "ph": "s", "id": 141667, "pid": 76337, "tid": -914061504, "ts": 1716454223613040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613105, "dur": 1, "args": { "External id": 141678, "cbid": 251, "correlation": 141678 } }, { "ph": "f", "id": 141678, "pid": 76337, "tid": -914061504, "ts": 1716454223613105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223663595, "dur": 148, "args": { "External id": 141679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141679, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141679, "pid": 5, "tid": 7, "ts": 1716454223663595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613110, "dur": 11, "args": { "External id": 141679, "cbid": 211, "correlation": 141679 } }, { "ph": "s", "id": 141679, "pid": 76337, "tid": -914061504, "ts": 1716454223613110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223663744, "dur": 1956, "args": { "External id": 141700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141700, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 141700, "pid": 5, "tid": 7, "ts": 1716454223663744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613189, "dur": 12, "args": { "External id": 141700, "cbid": 211, "correlation": 141700 } }, { "ph": "s", "id": 141700, "pid": 76337, "tid": -914061504, "ts": 1716454223613189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613287, "dur": 1, "args": { "External id": 141718, "cbid": 251, "correlation": 141718 } }, { "ph": "f", "id": 141718, "pid": 76337, "tid": -914061504, "ts": 1716454223613287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223665701, "dur": 149, "args": { "External id": 141720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141720, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 141720, "pid": 5, "tid": 7, "ts": 1716454223665701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613293, "dur": 13, "args": { "External id": 141720, "cbid": 211, "correlation": 141720 } }, { "ph": "s", "id": 141720, "pid": 76337, "tid": -914061504, "ts": 1716454223613293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223665851, "dur": 35, "args": { "External id": 141728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141728, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141728, "pid": 5, "tid": 7, "ts": 1716454223665851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613364, "dur": 12, "args": { "External id": 141728, "cbid": 211, "correlation": 141728 } }, { "ph": "s", "id": 141728, "pid": 76337, "tid": -914061504, "ts": 1716454223613364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223665887, "dur": 50, "args": { "External id": 141736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141736, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141736, "pid": 5, "tid": 7, "ts": 1716454223665887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613402, "dur": 9, "args": { "External id": 141736, "cbid": 211, "correlation": 141736 } }, { "ph": "s", "id": 141736, "pid": 76337, "tid": -914061504, "ts": 1716454223613402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223665939, "dur": 30, "args": { "External id": 141747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141747, "pid": 5, "tid": 7, "ts": 1716454223665939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613473, "dur": 12, "args": { "External id": 141747, "cbid": 211, "correlation": 141747 } }, { "ph": "s", "id": 141747, "pid": 76337, "tid": -914061504, "ts": 1716454223613473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223665971, "dur": 35, "args": { "External id": 141769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141769, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141769, "pid": 5, "tid": 7, "ts": 1716454223665971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613503, "dur": 8, "args": { "External id": 141769, "cbid": 211, "correlation": 141769 } }, { "ph": "s", "id": 141769, "pid": 76337, "tid": -914061504, "ts": 1716454223613503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613588, "dur": 1, "args": { "External id": 141780, "cbid": 251, "correlation": 141780 } }, { "ph": "f", "id": 141780, "pid": 76337, "tid": -914061504, "ts": 1716454223613588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223666006, "dur": 90, "args": { "External id": 141781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141781, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141781, "pid": 5, "tid": 7, "ts": 1716454223666006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613594, "dur": 13, "args": { "External id": 141781, "cbid": 211, "correlation": 141781 } }, { "ph": "s", "id": 141781, "pid": 76337, "tid": -914061504, "ts": 1716454223613594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613664, "dur": 1, "args": { "External id": 141792, "cbid": 251, "correlation": 141792 } }, { "ph": "f", "id": 141792, "pid": 76337, "tid": -914061504, "ts": 1716454223613664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613668, "dur": 0, "args": { "External id": 141793, "cbid": 251, "correlation": 141793 } }, { "ph": "f", "id": 141793, "pid": 76337, "tid": -914061504, "ts": 1716454223613668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223666098, "dur": 11, "args": { "External id": 141794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141794, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 141794, "pid": 5, "tid": 7, "ts": 1716454223666098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613669, "dur": 12, "args": { "External id": 141794, "cbid": 211, "correlation": 141794 } }, { "ph": "s", "id": 141794, "pid": 76337, "tid": -914061504, "ts": 1716454223613669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223666110, "dur": 5, "args": { "External id": 141796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141796, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 141796, "pid": 5, "tid": 7, "ts": 1716454223666110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613683, "dur": 6, "args": { "External id": 141796, "cbid": 211, "correlation": 141796 } }, { "ph": "s", "id": 141796, "pid": 76337, "tid": -914061504, "ts": 1716454223613683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613740, "dur": 1, "args": { "External id": 141807, "cbid": 251, "correlation": 141807 } }, { "ph": "f", "id": 141807, "pid": 76337, "tid": -914061504, "ts": 1716454223613740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613743, "dur": 0, "args": { "External id": 141808, "cbid": 251, "correlation": 141808 } }, { "ph": "f", "id": 141808, "pid": 76337, "tid": -914061504, "ts": 1716454223613743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223666117, "dur": 7, "args": { "External id": 141809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141809, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 141809, "pid": 5, "tid": 7, "ts": 1716454223666117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613745, "dur": 12, "args": { "External id": 141809, "cbid": 211, "correlation": 141809 } }, { "ph": "s", "id": 141809, "pid": 76337, "tid": -914061504, "ts": 1716454223613745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223666125, "dur": 3, "args": { "External id": 141811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141811, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 141811, "pid": 5, "tid": 7, "ts": 1716454223666125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613758, "dur": 6, "args": { "External id": 141811, "cbid": 211, "correlation": 141811 } }, { "ph": "s", "id": 141811, "pid": 76337, "tid": -914061504, "ts": 1716454223613758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223666129, "dur": 92, "args": { "External id": 141832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141832, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 141832, "pid": 5, "tid": 7, "ts": 1716454223666129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613832, "dur": 12, "args": { "External id": 141832, "cbid": 211, "correlation": 141832 } }, { "ph": "s", "id": 141832, "pid": 76337, "tid": -914061504, "ts": 1716454223613832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223613928, "dur": 1, "args": { "External id": 141850, "cbid": 251, "correlation": 141850 } }, { "ph": "f", "id": 141850, "pid": 76337, "tid": -914061504, "ts": 1716454223613928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223666223, "dur": 100, "args": { "External id": 141852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141852, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141852, "pid": 5, "tid": 7, "ts": 1716454223666223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223613934, "dur": 13, "args": { "External id": 141852, "cbid": 211, "correlation": 141852 } }, { "ph": "s", "id": 141852, "pid": 76337, "tid": -914061504, "ts": 1716454223613934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223666324, "dur": 19, "args": { "External id": 141860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141860, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141860, "pid": 5, "tid": 7, "ts": 1716454223666324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614011, "dur": 13, "args": { "External id": 141860, "cbid": 211, "correlation": 141860 } }, { "ph": "s", "id": 141860, "pid": 76337, "tid": -914061504, "ts": 1716454223614011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223666344, "dur": 37, "args": { "External id": 141868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141868, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141868, "pid": 5, "tid": 7, "ts": 1716454223666344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614054, "dur": 10, "args": { "External id": 141868, "cbid": 211, "correlation": 141868 } }, { "ph": "s", "id": 141868, "pid": 76337, "tid": -914061504, "ts": 1716454223614054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223666383, "dur": 34, "args": { "External id": 141890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141890, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141890, "pid": 5, "tid": 7, "ts": 1716454223666383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614105, "dur": 11, "args": { "External id": 141890, "cbid": 211, "correlation": 141890 } }, { "ph": "s", "id": 141890, "pid": 76337, "tid": -914061504, "ts": 1716454223614105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223614194, "dur": 1, "args": { "External id": 141906, "cbid": 251, "correlation": 141906 } }, { "ph": "f", "id": 141906, "pid": 76337, "tid": -914061504, "ts": 1716454223614194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223614199, "dur": 0, "args": { "External id": 141908, "cbid": 251, "correlation": 141908 } }, { "ph": "f", "id": 141908, "pid": 76337, "tid": -914061504, "ts": 1716454223614199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223666418, "dur": 541, "args": { "External id": 141909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141909, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 141909, "pid": 5, "tid": 7, "ts": 1716454223666418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614202, "dur": 13, "args": { "External id": 141909, "cbid": 211, "correlation": 141909 } }, { "ph": "s", "id": 141909, "pid": 76337, "tid": -914061504, "ts": 1716454223614202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223666961, "dur": 127, "args": { "External id": 141917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141917, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141917, "pid": 5, "tid": 7, "ts": 1716454223666961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614267, "dur": 12, "args": { "External id": 141917, "cbid": 211, "correlation": 141917 } }, { "ph": "s", "id": 141917, "pid": 76337, "tid": -914061504, "ts": 1716454223614267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223667089, "dur": 128, "args": { "External id": 141925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141925, "pid": 5, "tid": 7, "ts": 1716454223667089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614296, "dur": 8, "args": { "External id": 141925, "cbid": 211, "correlation": 141925 } }, { "ph": "s", "id": 141925, "pid": 76337, "tid": -914061504, "ts": 1716454223614296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223614373, "dur": 1, "args": { "External id": 141941, "cbid": 251, "correlation": 141941 } }, { "ph": "f", "id": 141941, "pid": 76337, "tid": -914061504, "ts": 1716454223614373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223667218, "dur": 304, "args": { "External id": 141943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141943, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141943, "pid": 5, "tid": 7, "ts": 1716454223667218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614378, "dur": 12, "args": { "External id": 141943, "cbid": 211, "correlation": 141943 } }, { "ph": "s", "id": 141943, "pid": 76337, "tid": -914061504, "ts": 1716454223614378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223667524, "dur": 28, "args": { "External id": 141951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141951, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141951, "pid": 5, "tid": 7, "ts": 1716454223667524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614421, "dur": 9, "args": { "External id": 141951, "cbid": 211, "correlation": 141951 } }, { "ph": "s", "id": 141951, "pid": 76337, "tid": -914061504, "ts": 1716454223614421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223667553, "dur": 82, "args": { "External id": 141962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141962, "pid": 5, "tid": 7, "ts": 1716454223667553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614488, "dur": 12, "args": { "External id": 141962, "cbid": 211, "correlation": 141962 } }, { "ph": "s", "id": 141962, "pid": 76337, "tid": -914061504, "ts": 1716454223614488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223614550, "dur": 0, "args": { "External id": 141974, "cbid": 317, "correlation": 141974 } }, { "ph": "f", "id": 141974, "pid": 76337, "tid": -914061504, "ts": 1716454223614550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223614551, "dur": 0, "args": { "External id": 141975, "cbid": 203, "correlation": 141975 } }, { "ph": "f", "id": 141975, "pid": 76337, "tid": -914061504, "ts": 1716454223614551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223614552, "dur": 0, "args": { "External id": 141976, "cbid": 205, "correlation": 141976 } }, { "ph": "f", "id": 141976, "pid": 76337, "tid": -914061504, "ts": 1716454223614552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223667636, "dur": 24, "args": { "External id": 141980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141980, "pid": 5, "tid": 7, "ts": 1716454223667636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614567, "dur": 12, "args": { "External id": 141980, "cbid": 211, "correlation": 141980 } }, { "ph": "s", "id": 141980, "pid": 76337, "tid": -914061504, "ts": 1716454223614567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223667661, "dur": 122, "args": { "External id": 141982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141982, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 141982, "pid": 5, "tid": 7, "ts": 1716454223667661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614585, "dur": 6, "args": { "External id": 141982, "cbid": 211, "correlation": 141982 } }, { "ph": "s", "id": 141982, "pid": 76337, "tid": -914061504, "ts": 1716454223614585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223667784, "dur": 23, "args": { "External id": 141984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141984, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141984, "pid": 5, "tid": 7, "ts": 1716454223667784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614595, "dur": 5, "args": { "External id": 141984, "cbid": 211, "correlation": 141984 } }, { "ph": "s", "id": 141984, "pid": 76337, "tid": -914061504, "ts": 1716454223614595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223667809, "dur": 33, "args": { "External id": 141990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141990, "pid": 5, "tid": 7, "ts": 1716454223667809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614623, "dur": 8, "args": { "External id": 141990, "cbid": 211, "correlation": 141990 } }, { "ph": "s", "id": 141990, "pid": 76337, "tid": -914061504, "ts": 1716454223614623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223667843, "dur": 27, "args": { "External id": 141998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 141998, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 141998, "pid": 5, "tid": 7, "ts": 1716454223667843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614654, "dur": 8, "args": { "External id": 141998, "cbid": 211, "correlation": 141998 } }, { "ph": "s", "id": 141998, "pid": 76337, "tid": -914061504, "ts": 1716454223614654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223614726, "dur": 0, "args": { "External id": 142008, "cbid": 317, "correlation": 142008 } }, { "ph": "f", "id": 142008, "pid": 76337, "tid": -914061504, "ts": 1716454223614726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223614726, "dur": 0, "args": { "External id": 142009, "cbid": 203, "correlation": 142009 } }, { "ph": "f", "id": 142009, "pid": 76337, "tid": -914061504, "ts": 1716454223614726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223614727, "dur": 0, "args": { "External id": 142010, "cbid": 205, "correlation": 142010 } }, { "ph": "f", "id": 142010, "pid": 76337, "tid": -914061504, "ts": 1716454223614727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223667871, "dur": 22, "args": { "External id": 142014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142014, "pid": 5, "tid": 7, "ts": 1716454223667871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614742, "dur": 12, "args": { "External id": 142014, "cbid": 211, "correlation": 142014 } }, { "ph": "s", "id": 142014, "pid": 76337, "tid": -914061504, "ts": 1716454223614742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223667894, "dur": 44, "args": { "External id": 142016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142016, "pid": 5, "tid": 7, "ts": 1716454223667894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614757, "dur": 5, "args": { "External id": 142016, "cbid": 211, "correlation": 142016 } }, { "ph": "s", "id": 142016, "pid": 76337, "tid": -914061504, "ts": 1716454223614757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223667939, "dur": 234, "args": { "External id": 142018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142018, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 142018, "pid": 5, "tid": 7, "ts": 1716454223667939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614769, "dur": 7, "args": { "External id": 142018, "cbid": 211, "correlation": 142018 } }, { "ph": "s", "id": 142018, "pid": 76337, "tid": -914061504, "ts": 1716454223614769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223668174, "dur": 7, "args": { "External id": 142020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142020, "pid": 5, "tid": 7, "ts": 1716454223668174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614778, "dur": 5, "args": { "External id": 142020, "cbid": 211, "correlation": 142020 } }, { "ph": "s", "id": 142020, "pid": 76337, "tid": -914061504, "ts": 1716454223614778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223668182, "dur": 9, "args": { "External id": 142026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142026, "pid": 5, "tid": 7, "ts": 1716454223668182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614804, "dur": 8, "args": { "External id": 142026, "cbid": 211, "correlation": 142026 } }, { "ph": "s", "id": 142026, "pid": 76337, "tid": -914061504, "ts": 1716454223614804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223668192, "dur": 12, "args": { "External id": 142046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142046, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142046, "pid": 5, "tid": 7, "ts": 1716454223668192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614896, "dur": 12, "args": { "External id": 142046, "cbid": 211, "correlation": 142046 } }, { "ph": "s", "id": 142046, "pid": 76337, "tid": -914061504, "ts": 1716454223614896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223668205, "dur": 4, "args": { "External id": 142058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142058, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 142058, "pid": 5, "tid": 7, "ts": 1716454223668205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614918, "dur": 7, "args": { "External id": 142058, "cbid": 211, "correlation": 142058 } }, { "ph": "s", "id": 142058, "pid": 76337, "tid": -914061504, "ts": 1716454223614918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223668211, "dur": 12, "args": { "External id": 142061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142061, "pid": 5, "tid": 7, "ts": 1716454223668211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614936, "dur": 6, "args": { "External id": 142061, "cbid": 211, "correlation": 142061 } }, { "ph": "s", "id": 142061, "pid": 76337, "tid": -914061504, "ts": 1716454223614936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223668224, "dur": 7, "args": { "External id": 142070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142070, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142070, "pid": 5, "tid": 7, "ts": 1716454223668224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223614983, "dur": 10, "args": { "External id": 142070, "cbid": 211, "correlation": 142070 } }, { "ph": "s", "id": 142070, "pid": 76337, "tid": -914061504, "ts": 1716454223614983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223615036, "dur": 0, "args": { "External id": 142080, "cbid": 317, "correlation": 142080 } }, { "ph": "f", "id": 142080, "pid": 76337, "tid": -914061504, "ts": 1716454223615036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223615037, "dur": 0, "args": { "External id": 142081, "cbid": 203, "correlation": 142081 } }, { "ph": "f", "id": 142081, "pid": 76337, "tid": -914061504, "ts": 1716454223615037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223615038, "dur": 0, "args": { "External id": 142082, "cbid": 205, "correlation": 142082 } }, { "ph": "f", "id": 142082, "pid": 76337, "tid": -914061504, "ts": 1716454223615038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223668233, "dur": 6, "args": { "External id": 142086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142086, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142086, "pid": 5, "tid": 7, "ts": 1716454223668233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615053, "dur": 11, "args": { "External id": 142086, "cbid": 211, "correlation": 142086 } }, { "ph": "s", "id": 142086, "pid": 76337, "tid": -914061504, "ts": 1716454223615053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223668239, "dur": 84, "args": { "External id": 142088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142088, "pid": 5, "tid": 7, "ts": 1716454223668239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615067, "dur": 6, "args": { "External id": 142088, "cbid": 211, "correlation": 142088 } }, { "ph": "s", "id": 142088, "pid": 76337, "tid": -914061504, "ts": 1716454223615067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223668326, "dur": 1, "args": { "External id": 142090, "device": 5, "context": 1, "stream": 7, "correlation": 142090, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 142090, "pid": 5, "tid": 7, "ts": 1716454223668326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223615080, "dur": 9, "args": { "External id": 142090, "cbid": 51, "correlation": 142090 } }, { "ph": "s", "id": 142090, "pid": 76337, "tid": -914061504, "ts": 1716454223615080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223668329, "dur": 543, "args": { "External id": 142091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142091, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142091, "pid": 5, "tid": 7, "ts": 1716454223668329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615091, "dur": 8, "args": { "External id": 142091, "cbid": 211, "correlation": 142091 } }, { "ph": "s", "id": 142091, "pid": 76337, "tid": -914061504, "ts": 1716454223615091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223668873, "dur": 11, "args": { "External id": 142093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142093, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142093, "pid": 5, "tid": 7, "ts": 1716454223668873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615102, "dur": 5, "args": { "External id": 142093, "cbid": 211, "correlation": 142093 } }, { "ph": "s", "id": 142093, "pid": 76337, "tid": -914061504, "ts": 1716454223615102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223668886, "dur": 14, "args": { "External id": 142099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142099, "pid": 5, "tid": 7, "ts": 1716454223668886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615130, "dur": 8, "args": { "External id": 142099, "cbid": 211, "correlation": 142099 } }, { "ph": "s", "id": 142099, "pid": 76337, "tid": -914061504, "ts": 1716454223615130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223668902, "dur": 4, "args": { "External id": 142107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142107, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 142107, "pid": 5, "tid": 7, "ts": 1716454223668902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615174, "dur": 9, "args": { "External id": 142107, "cbid": 211, "correlation": 142107 } }, { "ph": "s", "id": 142107, "pid": 76337, "tid": -914061504, "ts": 1716454223615174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223615239, "dur": 1, "args": { "External id": 142123, "cbid": 251, "correlation": 142123 } }, { "ph": "f", "id": 142123, "pid": 76337, "tid": -914061504, "ts": 1716454223615239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223615244, "dur": 0, "args": { "External id": 142125, "cbid": 251, "correlation": 142125 } }, { "ph": "f", "id": 142125, "pid": 76337, "tid": -914061504, "ts": 1716454223615244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223668907, "dur": 13, "args": { "External id": 142126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142126, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142126, "pid": 5, "tid": 7, "ts": 1716454223668907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615246, "dur": 11, "args": { "External id": 142126, "cbid": 211, "correlation": 142126 } }, { "ph": "s", "id": 142126, "pid": 76337, "tid": -914061504, "ts": 1716454223615246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223668922, "dur": 5, "args": { "External id": 142128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142128, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142128, "pid": 5, "tid": 7, "ts": 1716454223668922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615258, "dur": 5, "args": { "External id": 142128, "cbid": 211, "correlation": 142128 } }, { "ph": "s", "id": 142128, "pid": 76337, "tid": -914061504, "ts": 1716454223615258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223668929, "dur": 16, "args": { "External id": 142138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142138, "pid": 5, "tid": 7, "ts": 1716454223668929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615315, "dur": 13, "args": { "External id": 142138, "cbid": 211, "correlation": 142138 } }, { "ph": "s", "id": 142138, "pid": 76337, "tid": -914061504, "ts": 1716454223615315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223668946, "dur": 19, "args": { "External id": 142158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142158, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142158, "pid": 5, "tid": 7, "ts": 1716454223668946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615383, "dur": 10, "args": { "External id": 142158, "cbid": 211, "correlation": 142158 } }, { "ph": "s", "id": 142158, "pid": 76337, "tid": -914061504, "ts": 1716454223615383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223668967, "dur": 5, "args": { "External id": 142170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142170, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 142170, "pid": 5, "tid": 7, "ts": 1716454223668967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615404, "dur": 6, "args": { "External id": 142170, "cbid": 211, "correlation": 142170 } }, { "ph": "s", "id": 142170, "pid": 76337, "tid": -914061504, "ts": 1716454223615404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223668973, "dur": 17, "args": { "External id": 142173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142173, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142173, "pid": 5, "tid": 7, "ts": 1716454223668973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615423, "dur": 7, "args": { "External id": 142173, "cbid": 211, "correlation": 142173 } }, { "ph": "s", "id": 142173, "pid": 76337, "tid": -914061504, "ts": 1716454223615423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223668991, "dur": 11, "args": { "External id": 142182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142182, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142182, "pid": 5, "tid": 7, "ts": 1716454223668991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615462, "dur": 10, "args": { "External id": 142182, "cbid": 211, "correlation": 142182 } }, { "ph": "s", "id": 142182, "pid": 76337, "tid": -914061504, "ts": 1716454223615462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223615524, "dur": 0, "args": { "External id": 142192, "cbid": 317, "correlation": 142192 } }, { "ph": "f", "id": 142192, "pid": 76337, "tid": -914061504, "ts": 1716454223615524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223615525, "dur": 0, "args": { "External id": 142193, "cbid": 203, "correlation": 142193 } }, { "ph": "f", "id": 142193, "pid": 76337, "tid": -914061504, "ts": 1716454223615525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223615526, "dur": 0, "args": { "External id": 142194, "cbid": 205, "correlation": 142194 } }, { "ph": "f", "id": 142194, "pid": 76337, "tid": -914061504, "ts": 1716454223615526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223669003, "dur": 11, "args": { "External id": 142198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142198, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142198, "pid": 5, "tid": 7, "ts": 1716454223669003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615543, "dur": 12, "args": { "External id": 142198, "cbid": 211, "correlation": 142198 } }, { "ph": "s", "id": 142198, "pid": 76337, "tid": -914061504, "ts": 1716454223615543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223669016, "dur": 163, "args": { "External id": 142200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142200, "pid": 5, "tid": 7, "ts": 1716454223669016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615557, "dur": 5, "args": { "External id": 142200, "cbid": 211, "correlation": 142200 } }, { "ph": "s", "id": 142200, "pid": 76337, "tid": -914061504, "ts": 1716454223615557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223669181, "dur": 1, "args": { "External id": 142202, "device": 5, "context": 1, "stream": 7, "correlation": 142202, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 142202, "pid": 5, "tid": 7, "ts": 1716454223669181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223615569, "dur": 8, "args": { "External id": 142202, "cbid": 51, "correlation": 142202 } }, { "ph": "s", "id": 142202, "pid": 76337, "tid": -914061504, "ts": 1716454223615569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223669184, "dur": 663, "args": { "External id": 142203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142203, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142203, "pid": 5, "tid": 7, "ts": 1716454223669184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615578, "dur": 6, "args": { "External id": 142203, "cbid": 211, "correlation": 142203 } }, { "ph": "s", "id": 142203, "pid": 76337, "tid": -914061504, "ts": 1716454223615578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223669848, "dur": 14, "args": { "External id": 142205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142205, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142205, "pid": 5, "tid": 7, "ts": 1716454223669848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615589, "dur": 5, "args": { "External id": 142205, "cbid": 211, "correlation": 142205 } }, { "ph": "s", "id": 142205, "pid": 76337, "tid": -914061504, "ts": 1716454223615589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223669863, "dur": 15, "args": { "External id": 142211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142211, "pid": 5, "tid": 7, "ts": 1716454223669863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615617, "dur": 10, "args": { "External id": 142211, "cbid": 211, "correlation": 142211 } }, { "ph": "s", "id": 142211, "pid": 76337, "tid": -914061504, "ts": 1716454223615617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223615676, "dur": 0, "args": { "External id": 142221, "cbid": 317, "correlation": 142221 } }, { "ph": "f", "id": 142221, "pid": 76337, "tid": -914061504, "ts": 1716454223615676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223615677, "dur": 0, "args": { "External id": 142222, "cbid": 203, "correlation": 142222 } }, { "ph": "f", "id": 142222, "pid": 76337, "tid": -914061504, "ts": 1716454223615677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223615677, "dur": 0, "args": { "External id": 142223, "cbid": 205, "correlation": 142223 } }, { "ph": "f", "id": 142223, "pid": 76337, "tid": -914061504, "ts": 1716454223615677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223669879, "dur": 8, "args": { "External id": 142227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142227, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142227, "pid": 5, "tid": 7, "ts": 1716454223669879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615692, "dur": 11, "args": { "External id": 142227, "cbid": 211, "correlation": 142227 } }, { "ph": "s", "id": 142227, "pid": 76337, "tid": -914061504, "ts": 1716454223615692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223669889, "dur": 3, "args": { "External id": 142229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142229, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142229, "pid": 5, "tid": 7, "ts": 1716454223669889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615709, "dur": 6, "args": { "External id": 142229, "cbid": 211, "correlation": 142229 } }, { "ph": "s", "id": 142229, "pid": 76337, "tid": -914061504, "ts": 1716454223615709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223615719, "dur": 0, "args": { "External id": 142230, "cbid": 51, "correlation": 142230 } }, { "ph": "s", "id": 142230, "pid": 76337, "tid": -914061504, "ts": 1716454223615719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223669894, "dur": 58, "args": { "External id": 142231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142231, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 142231, "pid": 5, "tid": 7, "ts": 1716454223669894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615720, "dur": 5, "args": { "External id": 142231, "cbid": 211, "correlation": 142231 } }, { "ph": "s", "id": 142231, "pid": 76337, "tid": -914061504, "ts": 1716454223615720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223669953, "dur": 14, "args": { "External id": 142236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142236, "pid": 5, "tid": 7, "ts": 1716454223669953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615746, "dur": 8, "args": { "External id": 142236, "cbid": 211, "correlation": 142236 } }, { "ph": "s", "id": 142236, "pid": 76337, "tid": -914061504, "ts": 1716454223615746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223669968, "dur": 11, "args": { "External id": 142244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142244, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142244, "pid": 5, "tid": 7, "ts": 1716454223669968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615775, "dur": 8, "args": { "External id": 142244, "cbid": 211, "correlation": 142244 } }, { "ph": "s", "id": 142244, "pid": 76337, "tid": -914061504, "ts": 1716454223615775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223669981, "dur": 10, "args": { "External id": 142252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142252, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142252, "pid": 5, "tid": 7, "ts": 1716454223669981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615803, "dur": 8, "args": { "External id": 142252, "cbid": 211, "correlation": 142252 } }, { "ph": "s", "id": 142252, "pid": 76337, "tid": -914061504, "ts": 1716454223615803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223669992, "dur": 19, "args": { "External id": 142272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142272, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142272, "pid": 5, "tid": 7, "ts": 1716454223669992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615885, "dur": 12, "args": { "External id": 142272, "cbid": 211, "correlation": 142272 } }, { "ph": "s", "id": 142272, "pid": 76337, "tid": -914061504, "ts": 1716454223615885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223670013, "dur": 5, "args": { "External id": 142284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142284, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 142284, "pid": 5, "tid": 7, "ts": 1716454223670013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615907, "dur": 6, "args": { "External id": 142284, "cbid": 211, "correlation": 142284 } }, { "ph": "s", "id": 142284, "pid": 76337, "tid": -914061504, "ts": 1716454223615907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223670019, "dur": 17, "args": { "External id": 142287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142287, "pid": 5, "tid": 7, "ts": 1716454223670019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223615925, "dur": 6, "args": { "External id": 142287, "cbid": 211, "correlation": 142287 } }, { "ph": "s", "id": 142287, "pid": 76337, "tid": -914061504, "ts": 1716454223615925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223615989, "dur": 0, "args": { "External id": 142298, "cbid": 317, "correlation": 142298 } }, { "ph": "f", "id": 142298, "pid": 76337, "tid": -914061504, "ts": 1716454223615989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223615990, "dur": 0, "args": { "External id": 142299, "cbid": 203, "correlation": 142299 } }, { "ph": "f", "id": 142299, "pid": 76337, "tid": -914061504, "ts": 1716454223615990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223615991, "dur": 0, "args": { "External id": 142300, "cbid": 205, "correlation": 142300 } }, { "ph": "f", "id": 142300, "pid": 76337, "tid": -914061504, "ts": 1716454223615991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223670037, "dur": 12, "args": { "External id": 142304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142304, "pid": 5, "tid": 7, "ts": 1716454223670037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616006, "dur": 12, "args": { "External id": 142304, "cbid": 211, "correlation": 142304 } }, { "ph": "s", "id": 142304, "pid": 76337, "tid": -914061504, "ts": 1716454223616006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223670050, "dur": 3, "args": { "External id": 142306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142306, "pid": 5, "tid": 7, "ts": 1716454223670050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616024, "dur": 5, "args": { "External id": 142306, "cbid": 211, "correlation": 142306 } }, { "ph": "s", "id": 142306, "pid": 76337, "tid": -914061504, "ts": 1716454223616024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223616033, "dur": 0, "args": { "External id": 142307, "cbid": 51, "correlation": 142307 } }, { "ph": "s", "id": 142307, "pid": 76337, "tid": -914061504, "ts": 1716454223616033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223670054, "dur": 97, "args": { "External id": 142308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142308, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 142308, "pid": 5, "tid": 7, "ts": 1716454223670054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616034, "dur": 5, "args": { "External id": 142308, "cbid": 211, "correlation": 142308 } }, { "ph": "s", "id": 142308, "pid": 76337, "tid": -914061504, "ts": 1716454223616034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223670153, "dur": 15, "args": { "External id": 142313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142313, "pid": 5, "tid": 7, "ts": 1716454223670153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616061, "dur": 9, "args": { "External id": 142313, "cbid": 211, "correlation": 142313 } }, { "ph": "s", "id": 142313, "pid": 76337, "tid": -914061504, "ts": 1716454223616061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223670170, "dur": 84, "args": { "External id": 142322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142322, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142322, "pid": 5, "tid": 7, "ts": 1716454223670170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616143, "dur": 14, "args": { "External id": 142322, "cbid": 211, "correlation": 142322 } }, { "ph": "s", "id": 142322, "pid": 76337, "tid": -914061504, "ts": 1716454223616143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223670255, "dur": 31, "args": { "External id": 142344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142344, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142344, "pid": 5, "tid": 7, "ts": 1716454223670255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616200, "dur": 10, "args": { "External id": 142344, "cbid": 211, "correlation": 142344 } }, { "ph": "s", "id": 142344, "pid": 76337, "tid": -914061504, "ts": 1716454223616200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223616289, "dur": 1, "args": { "External id": 142355, "cbid": 251, "correlation": 142355 } }, { "ph": "f", "id": 142355, "pid": 76337, "tid": -914061504, "ts": 1716454223616289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223670287, "dur": 165, "args": { "External id": 142356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142356, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142356, "pid": 5, "tid": 7, "ts": 1716454223670287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616295, "dur": 13, "args": { "External id": 142356, "cbid": 211, "correlation": 142356 } }, { "ph": "s", "id": 142356, "pid": 76337, "tid": -914061504, "ts": 1716454223616295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223616365, "dur": 1, "args": { "External id": 142367, "cbid": 251, "correlation": 142367 } }, { "ph": "f", "id": 142367, "pid": 76337, "tid": -914061504, "ts": 1716454223616365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223670453, "dur": 159, "args": { "External id": 142368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142368, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142368, "pid": 5, "tid": 7, "ts": 1716454223670453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616369, "dur": 12, "args": { "External id": 142368, "cbid": 211, "correlation": 142368 } }, { "ph": "s", "id": 142368, "pid": 76337, "tid": -914061504, "ts": 1716454223616369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223616434, "dur": 1, "args": { "External id": 142379, "cbid": 251, "correlation": 142379 } }, { "ph": "f", "id": 142379, "pid": 76337, "tid": -914061504, "ts": 1716454223616434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223670613, "dur": 158, "args": { "External id": 142380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142380, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142380, "pid": 5, "tid": 7, "ts": 1716454223670613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616438, "dur": 11, "args": { "External id": 142380, "cbid": 211, "correlation": 142380 } }, { "ph": "s", "id": 142380, "pid": 76337, "tid": -914061504, "ts": 1716454223616438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223670773, "dur": 337, "args": { "External id": 142405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142405, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142405, "pid": 5, "tid": 7, "ts": 1716454223670773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616524, "dur": 13, "args": { "External id": 142405, "cbid": 211, "correlation": 142405 } }, { "ph": "s", "id": 142405, "pid": 76337, "tid": -914061504, "ts": 1716454223616524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223616625, "dur": 1, "args": { "External id": 142423, "cbid": 251, "correlation": 142423 } }, { "ph": "f", "id": 142423, "pid": 76337, "tid": -914061504, "ts": 1716454223616625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223671111, "dur": 168, "args": { "External id": 142425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142425, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142425, "pid": 5, "tid": 7, "ts": 1716454223671111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616632, "dur": 13, "args": { "External id": 142425, "cbid": 211, "correlation": 142425 } }, { "ph": "s", "id": 142425, "pid": 76337, "tid": -914061504, "ts": 1716454223616632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223671281, "dur": 20, "args": { "External id": 142433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142433, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142433, "pid": 5, "tid": 7, "ts": 1716454223671281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616701, "dur": 12, "args": { "External id": 142433, "cbid": 211, "correlation": 142433 } }, { "ph": "s", "id": 142433, "pid": 76337, "tid": -914061504, "ts": 1716454223616701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223671302, "dur": 28, "args": { "External id": 142441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142441, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142441, "pid": 5, "tid": 7, "ts": 1716454223671302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616740, "dur": 9, "args": { "External id": 142441, "cbid": 211, "correlation": 142441 } }, { "ph": "s", "id": 142441, "pid": 76337, "tid": -914061504, "ts": 1716454223616740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223671332, "dur": 18, "args": { "External id": 142452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142452, "pid": 5, "tid": 7, "ts": 1716454223671332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616812, "dur": 12, "args": { "External id": 142452, "cbid": 211, "correlation": 142452 } }, { "ph": "s", "id": 142452, "pid": 76337, "tid": -914061504, "ts": 1716454223616812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223671351, "dur": 16, "args": { "External id": 142474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142474, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142474, "pid": 5, "tid": 7, "ts": 1716454223671351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616844, "dur": 7, "args": { "External id": 142474, "cbid": 211, "correlation": 142474 } }, { "ph": "s", "id": 142474, "pid": 76337, "tid": -914061504, "ts": 1716454223616844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223616929, "dur": 1, "args": { "External id": 142485, "cbid": 251, "correlation": 142485 } }, { "ph": "f", "id": 142485, "pid": 76337, "tid": -914061504, "ts": 1716454223616929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223671368, "dur": 89, "args": { "External id": 142486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142486, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142486, "pid": 5, "tid": 7, "ts": 1716454223671368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223616936, "dur": 15, "args": { "External id": 142486, "cbid": 211, "correlation": 142486 } }, { "ph": "s", "id": 142486, "pid": 76337, "tid": -914061504, "ts": 1716454223616936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617015, "dur": 1, "args": { "External id": 142497, "cbid": 251, "correlation": 142497 } }, { "ph": "f", "id": 142497, "pid": 76337, "tid": -914061504, "ts": 1716454223617015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617019, "dur": 0, "args": { "External id": 142498, "cbid": 251, "correlation": 142498 } }, { "ph": "f", "id": 142498, "pid": 76337, "tid": -914061504, "ts": 1716454223617019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223671459, "dur": 12, "args": { "External id": 142499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142499, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142499, "pid": 5, "tid": 7, "ts": 1716454223671459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617021, "dur": 13, "args": { "External id": 142499, "cbid": 211, "correlation": 142499 } }, { "ph": "s", "id": 142499, "pid": 76337, "tid": -914061504, "ts": 1716454223617021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223671472, "dur": 5, "args": { "External id": 142501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142501, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142501, "pid": 5, "tid": 7, "ts": 1716454223671472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617035, "dur": 6, "args": { "External id": 142501, "cbid": 211, "correlation": 142501 } }, { "ph": "s", "id": 142501, "pid": 76337, "tid": -914061504, "ts": 1716454223617035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617093, "dur": 1, "args": { "External id": 142512, "cbid": 251, "correlation": 142512 } }, { "ph": "f", "id": 142512, "pid": 76337, "tid": -914061504, "ts": 1716454223617093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617096, "dur": 0, "args": { "External id": 142513, "cbid": 251, "correlation": 142513 } }, { "ph": "f", "id": 142513, "pid": 76337, "tid": -914061504, "ts": 1716454223617096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223671479, "dur": 8, "args": { "External id": 142514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142514, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142514, "pid": 5, "tid": 7, "ts": 1716454223671479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617098, "dur": 11, "args": { "External id": 142514, "cbid": 211, "correlation": 142514 } }, { "ph": "s", "id": 142514, "pid": 76337, "tid": -914061504, "ts": 1716454223617098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223671488, "dur": 3, "args": { "External id": 142516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142516, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142516, "pid": 5, "tid": 7, "ts": 1716454223671488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617111, "dur": 5, "args": { "External id": 142516, "cbid": 211, "correlation": 142516 } }, { "ph": "s", "id": 142516, "pid": 76337, "tid": -914061504, "ts": 1716454223617111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223671493, "dur": 56, "args": { "External id": 142541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142541, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142541, "pid": 5, "tid": 7, "ts": 1716454223671493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617188, "dur": 12, "args": { "External id": 142541, "cbid": 211, "correlation": 142541 } }, { "ph": "s", "id": 142541, "pid": 76337, "tid": -914061504, "ts": 1716454223617188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617286, "dur": 2, "args": { "External id": 142559, "cbid": 251, "correlation": 142559 } }, { "ph": "f", "id": 142559, "pid": 76337, "tid": -914061504, "ts": 1716454223617286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223671550, "dur": 91, "args": { "External id": 142561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142561, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142561, "pid": 5, "tid": 7, "ts": 1716454223671550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617293, "dur": 14, "args": { "External id": 142561, "cbid": 211, "correlation": 142561 } }, { "ph": "s", "id": 142561, "pid": 76337, "tid": -914061504, "ts": 1716454223617293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223671643, "dur": 10, "args": { "External id": 142569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142569, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142569, "pid": 5, "tid": 7, "ts": 1716454223671643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617363, "dur": 12, "args": { "External id": 142569, "cbid": 211, "correlation": 142569 } }, { "ph": "s", "id": 142569, "pid": 76337, "tid": -914061504, "ts": 1716454223617363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223671654, "dur": 20, "args": { "External id": 142577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142577, "pid": 5, "tid": 7, "ts": 1716454223671654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617405, "dur": 9, "args": { "External id": 142577, "cbid": 211, "correlation": 142577 } }, { "ph": "s", "id": 142577, "pid": 76337, "tid": -914061504, "ts": 1716454223617405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223671675, "dur": 17, "args": { "External id": 142599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142599, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142599, "pid": 5, "tid": 7, "ts": 1716454223671675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617456, "dur": 10, "args": { "External id": 142599, "cbid": 211, "correlation": 142599 } }, { "ph": "s", "id": 142599, "pid": 76337, "tid": -914061504, "ts": 1716454223617456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617547, "dur": 2, "args": { "External id": 142615, "cbid": 251, "correlation": 142615 } }, { "ph": "f", "id": 142615, "pid": 76337, "tid": -914061504, "ts": 1716454223617547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617552, "dur": 0, "args": { "External id": 142617, "cbid": 251, "correlation": 142617 } }, { "ph": "f", "id": 142617, "pid": 76337, "tid": -914061504, "ts": 1716454223617552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223671694, "dur": 496, "args": { "External id": 142618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142618, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142618, "pid": 5, "tid": 7, "ts": 1716454223671694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617555, "dur": 15, "args": { "External id": 142618, "cbid": 211, "correlation": 142618 } }, { "ph": "s", "id": 142618, "pid": 76337, "tid": -914061504, "ts": 1716454223617555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223672191, "dur": 67, "args": { "External id": 142626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142626, "pid": 5, "tid": 7, "ts": 1716454223672191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617622, "dur": 12, "args": { "External id": 142626, "cbid": 211, "correlation": 142626 } }, { "ph": "s", "id": 142626, "pid": 76337, "tid": -914061504, "ts": 1716454223617622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223672259, "dur": 68, "args": { "External id": 142634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142634, "pid": 5, "tid": 7, "ts": 1716454223672259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617654, "dur": 9, "args": { "External id": 142634, "cbid": 211, "correlation": 142634 } }, { "ph": "s", "id": 142634, "pid": 76337, "tid": -914061504, "ts": 1716454223617654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223617734, "dur": 1, "args": { "External id": 142650, "cbid": 251, "correlation": 142650 } }, { "ph": "f", "id": 142650, "pid": 76337, "tid": -914061504, "ts": 1716454223617734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223672329, "dur": 1, "args": { "External id": 142652, "device": 5, "context": 1, "stream": 7, "correlation": 142652, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 142652, "pid": 5, "tid": 7, "ts": 1716454223672329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223617740, "dur": 12, "args": { "External id": 142652, "cbid": 51, "correlation": 142652 } }, { "ph": "s", "id": 142652, "pid": 76337, "tid": -914061504, "ts": 1716454223617740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223672333, "dur": 272, "args": { "External id": 142653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142653, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142653, "pid": 5, "tid": 7, "ts": 1716454223672333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617754, "dur": 11, "args": { "External id": 142653, "cbid": 211, "correlation": 142653 } }, { "ph": "s", "id": 142653, "pid": 76337, "tid": -914061504, "ts": 1716454223617754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223672606, "dur": 14, "args": { "External id": 142661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142661, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142661, "pid": 5, "tid": 7, "ts": 1716454223672606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617796, "dur": 10, "args": { "External id": 142661, "cbid": 211, "correlation": 142661 } }, { "ph": "s", "id": 142661, "pid": 76337, "tid": -914061504, "ts": 1716454223617796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223672621, "dur": 38, "args": { "External id": 142672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142672, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142672, "pid": 5, "tid": 7, "ts": 1716454223672621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617862, "dur": 12, "args": { "External id": 142672, "cbid": 211, "correlation": 142672 } }, { "ph": "s", "id": 142672, "pid": 76337, "tid": -914061504, "ts": 1716454223617862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223617928, "dur": 0, "args": { "External id": 142684, "cbid": 317, "correlation": 142684 } }, { "ph": "f", "id": 142684, "pid": 76337, "tid": -914061504, "ts": 1716454223617928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223617929, "dur": 0, "args": { "External id": 142685, "cbid": 203, "correlation": 142685 } }, { "ph": "f", "id": 142685, "pid": 76337, "tid": -914061504, "ts": 1716454223617929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223617930, "dur": 0, "args": { "External id": 142686, "cbid": 205, "correlation": 142686 } }, { "ph": "f", "id": 142686, "pid": 76337, "tid": -914061504, "ts": 1716454223617930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223672661, "dur": 13, "args": { "External id": 142690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142690, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142690, "pid": 5, "tid": 7, "ts": 1716454223672661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617946, "dur": 12, "args": { "External id": 142690, "cbid": 211, "correlation": 142690 } }, { "ph": "s", "id": 142690, "pid": 76337, "tid": -914061504, "ts": 1716454223617946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223672675, "dur": 4, "args": { "External id": 142692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142692, "pid": 5, "tid": 7, "ts": 1716454223672675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617962, "dur": 6, "args": { "External id": 142692, "cbid": 211, "correlation": 142692 } }, { "ph": "s", "id": 142692, "pid": 76337, "tid": -914061504, "ts": 1716454223617962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223617970, "dur": 0, "args": { "External id": 142693, "cbid": 51, "correlation": 142693 } }, { "ph": "s", "id": 142693, "pid": 76337, "tid": -914061504, "ts": 1716454223617970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223672680, "dur": 97, "args": { "External id": 142694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142694, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 142694, "pid": 5, "tid": 7, "ts": 1716454223672680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223617971, "dur": 14, "args": { "External id": 142694, "cbid": 211, "correlation": 142694 } }, { "ph": "s", "id": 142694, "pid": 76337, "tid": -914061504, "ts": 1716454223617971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223672778, "dur": 16, "args": { "External id": 142699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142699, "pid": 5, "tid": 7, "ts": 1716454223672778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618008, "dur": 9, "args": { "External id": 142699, "cbid": 211, "correlation": 142699 } }, { "ph": "s", "id": 142699, "pid": 76337, "tid": -914061504, "ts": 1716454223618008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223672796, "dur": 12, "args": { "External id": 142707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142707, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142707, "pid": 5, "tid": 7, "ts": 1716454223672796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618041, "dur": 9, "args": { "External id": 142707, "cbid": 211, "correlation": 142707 } }, { "ph": "s", "id": 142707, "pid": 76337, "tid": -914061504, "ts": 1716454223618041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223672809, "dur": 18, "args": { "External id": 142727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142727, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142727, "pid": 5, "tid": 7, "ts": 1716454223672809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618113, "dur": 11, "args": { "External id": 142727, "cbid": 211, "correlation": 142727 } }, { "ph": "s", "id": 142727, "pid": 76337, "tid": -914061504, "ts": 1716454223618113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223672828, "dur": 5, "args": { "External id": 142739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142739, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 142739, "pid": 5, "tid": 7, "ts": 1716454223672828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618135, "dur": 7, "args": { "External id": 142739, "cbid": 211, "correlation": 142739 } }, { "ph": "s", "id": 142739, "pid": 76337, "tid": -914061504, "ts": 1716454223618135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223672835, "dur": 18, "args": { "External id": 142742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142742, "pid": 5, "tid": 7, "ts": 1716454223672835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618153, "dur": 7, "args": { "External id": 142742, "cbid": 211, "correlation": 142742 } }, { "ph": "s", "id": 142742, "pid": 76337, "tid": -914061504, "ts": 1716454223618153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223672854, "dur": 13, "args": { "External id": 142751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142751, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142751, "pid": 5, "tid": 7, "ts": 1716454223672854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618193, "dur": 9, "args": { "External id": 142751, "cbid": 211, "correlation": 142751 } }, { "ph": "s", "id": 142751, "pid": 76337, "tid": -914061504, "ts": 1716454223618193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223618244, "dur": 0, "args": { "External id": 142761, "cbid": 317, "correlation": 142761 } }, { "ph": "f", "id": 142761, "pid": 76337, "tid": -914061504, "ts": 1716454223618244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223618245, "dur": 0, "args": { "External id": 142762, "cbid": 203, "correlation": 142762 } }, { "ph": "f", "id": 142762, "pid": 76337, "tid": -914061504, "ts": 1716454223618245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223618246, "dur": 0, "args": { "External id": 142763, "cbid": 205, "correlation": 142763 } }, { "ph": "f", "id": 142763, "pid": 76337, "tid": -914061504, "ts": 1716454223618246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223672868, "dur": 11, "args": { "External id": 142767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142767, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142767, "pid": 5, "tid": 7, "ts": 1716454223672868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618260, "dur": 11, "args": { "External id": 142767, "cbid": 211, "correlation": 142767 } }, { "ph": "s", "id": 142767, "pid": 76337, "tid": -914061504, "ts": 1716454223618260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223672881, "dur": 163, "args": { "External id": 142769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142769, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142769, "pid": 5, "tid": 7, "ts": 1716454223672881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618273, "dur": 5, "args": { "External id": 142769, "cbid": 211, "correlation": 142769 } }, { "ph": "s", "id": 142769, "pid": 76337, "tid": -914061504, "ts": 1716454223618273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223673046, "dur": 1, "args": { "External id": 142771, "device": 5, "context": 1, "stream": 7, "correlation": 142771, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 142771, "pid": 5, "tid": 7, "ts": 1716454223673046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223618284, "dur": 7, "args": { "External id": 142771, "cbid": 51, "correlation": 142771 } }, { "ph": "s", "id": 142771, "pid": 76337, "tid": -914061504, "ts": 1716454223618284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223673050, "dur": 664, "args": { "External id": 142772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142772, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142772, "pid": 5, "tid": 7, "ts": 1716454223673050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618292, "dur": 6, "args": { "External id": 142772, "cbid": 211, "correlation": 142772 } }, { "ph": "s", "id": 142772, "pid": 76337, "tid": -914061504, "ts": 1716454223618292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223673715, "dur": 13, "args": { "External id": 142774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142774, "pid": 5, "tid": 7, "ts": 1716454223673715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618302, "dur": 5, "args": { "External id": 142774, "cbid": 211, "correlation": 142774 } }, { "ph": "s", "id": 142774, "pid": 76337, "tid": -914061504, "ts": 1716454223618302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223673730, "dur": 14, "args": { "External id": 142780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142780, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142780, "pid": 5, "tid": 7, "ts": 1716454223673730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618330, "dur": 9, "args": { "External id": 142780, "cbid": 211, "correlation": 142780 } }, { "ph": "s", "id": 142780, "pid": 76337, "tid": -914061504, "ts": 1716454223618330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223673746, "dur": 4, "args": { "External id": 142788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142788, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 142788, "pid": 5, "tid": 7, "ts": 1716454223673746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618374, "dur": 9, "args": { "External id": 142788, "cbid": 211, "correlation": 142788 } }, { "ph": "s", "id": 142788, "pid": 76337, "tid": -914061504, "ts": 1716454223618374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223618438, "dur": 1, "args": { "External id": 142804, "cbid": 251, "correlation": 142804 } }, { "ph": "f", "id": 142804, "pid": 76337, "tid": -914061504, "ts": 1716454223618438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223618444, "dur": 0, "args": { "External id": 142806, "cbid": 251, "correlation": 142806 } }, { "ph": "f", "id": 142806, "pid": 76337, "tid": -914061504, "ts": 1716454223618444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223673751, "dur": 13, "args": { "External id": 142807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142807, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142807, "pid": 5, "tid": 7, "ts": 1716454223673751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618446, "dur": 11, "args": { "External id": 142807, "cbid": 211, "correlation": 142807 } }, { "ph": "s", "id": 142807, "pid": 76337, "tid": -914061504, "ts": 1716454223618446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223673766, "dur": 5, "args": { "External id": 142809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142809, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142809, "pid": 5, "tid": 7, "ts": 1716454223673766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618459, "dur": 5, "args": { "External id": 142809, "cbid": 211, "correlation": 142809 } }, { "ph": "s", "id": 142809, "pid": 76337, "tid": -914061504, "ts": 1716454223618459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223673772, "dur": 17, "args": { "External id": 142819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142819, "pid": 5, "tid": 7, "ts": 1716454223673772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618517, "dur": 12, "args": { "External id": 142819, "cbid": 211, "correlation": 142819 } }, { "ph": "s", "id": 142819, "pid": 76337, "tid": -914061504, "ts": 1716454223618517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223673790, "dur": 17, "args": { "External id": 142839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142839, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142839, "pid": 5, "tid": 7, "ts": 1716454223673790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618583, "dur": 11, "args": { "External id": 142839, "cbid": 211, "correlation": 142839 } }, { "ph": "s", "id": 142839, "pid": 76337, "tid": -914061504, "ts": 1716454223618583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223673809, "dur": 4, "args": { "External id": 142851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142851, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 142851, "pid": 5, "tid": 7, "ts": 1716454223673809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618604, "dur": 6, "args": { "External id": 142851, "cbid": 211, "correlation": 142851 } }, { "ph": "s", "id": 142851, "pid": 76337, "tid": -914061504, "ts": 1716454223618604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223673815, "dur": 17, "args": { "External id": 142854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142854, "pid": 5, "tid": 7, "ts": 1716454223673815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618622, "dur": 6, "args": { "External id": 142854, "cbid": 211, "correlation": 142854 } }, { "ph": "s", "id": 142854, "pid": 76337, "tid": -914061504, "ts": 1716454223618622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223673833, "dur": 11, "args": { "External id": 142863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142863, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142863, "pid": 5, "tid": 7, "ts": 1716454223673833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618663, "dur": 9, "args": { "External id": 142863, "cbid": 211, "correlation": 142863 } }, { "ph": "s", "id": 142863, "pid": 76337, "tid": -914061504, "ts": 1716454223618663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223618724, "dur": 0, "args": { "External id": 142873, "cbid": 317, "correlation": 142873 } }, { "ph": "f", "id": 142873, "pid": 76337, "tid": -914061504, "ts": 1716454223618724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223618725, "dur": 0, "args": { "External id": 142874, "cbid": 203, "correlation": 142874 } }, { "ph": "f", "id": 142874, "pid": 76337, "tid": -914061504, "ts": 1716454223618725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223618726, "dur": 0, "args": { "External id": 142875, "cbid": 205, "correlation": 142875 } }, { "ph": "f", "id": 142875, "pid": 76337, "tid": -914061504, "ts": 1716454223618726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223673846, "dur": 11, "args": { "External id": 142879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142879, "pid": 5, "tid": 7, "ts": 1716454223673846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618739, "dur": 13, "args": { "External id": 142879, "cbid": 211, "correlation": 142879 } }, { "ph": "s", "id": 142879, "pid": 76337, "tid": -914061504, "ts": 1716454223618739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223673858, "dur": 164, "args": { "External id": 142881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142881, "pid": 5, "tid": 7, "ts": 1716454223673858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618754, "dur": 5, "args": { "External id": 142881, "cbid": 211, "correlation": 142881 } }, { "ph": "s", "id": 142881, "pid": 76337, "tid": -914061504, "ts": 1716454223618754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223674024, "dur": 1, "args": { "External id": 142883, "device": 5, "context": 1, "stream": 7, "correlation": 142883, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 142883, "pid": 5, "tid": 7, "ts": 1716454223674024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223618765, "dur": 6, "args": { "External id": 142883, "cbid": 51, "correlation": 142883 } }, { "ph": "s", "id": 142883, "pid": 76337, "tid": -914061504, "ts": 1716454223618765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223674028, "dur": 650, "args": { "External id": 142884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142884, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 142884, "pid": 5, "tid": 7, "ts": 1716454223674028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618773, "dur": 6, "args": { "External id": 142884, "cbid": 211, "correlation": 142884 } }, { "ph": "s", "id": 142884, "pid": 76337, "tid": -914061504, "ts": 1716454223618773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223674679, "dur": 13, "args": { "External id": 142886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142886, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142886, "pid": 5, "tid": 7, "ts": 1716454223674679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618782, "dur": 5, "args": { "External id": 142886, "cbid": 211, "correlation": 142886 } }, { "ph": "s", "id": 142886, "pid": 76337, "tid": -914061504, "ts": 1716454223618782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223674694, "dur": 15, "args": { "External id": 142892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142892, "pid": 5, "tid": 7, "ts": 1716454223674694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618811, "dur": 8, "args": { "External id": 142892, "cbid": 211, "correlation": 142892 } }, { "ph": "s", "id": 142892, "pid": 76337, "tid": -914061504, "ts": 1716454223618811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223674710, "dur": 12, "args": { "External id": 142900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142900, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142900, "pid": 5, "tid": 7, "ts": 1716454223674710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618842, "dur": 9, "args": { "External id": 142900, "cbid": 211, "correlation": 142900 } }, { "ph": "s", "id": 142900, "pid": 76337, "tid": -914061504, "ts": 1716454223618842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223674723, "dur": 10, "args": { "External id": 142908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142908, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142908, "pid": 5, "tid": 7, "ts": 1716454223674723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618872, "dur": 8, "args": { "External id": 142908, "cbid": 211, "correlation": 142908 } }, { "ph": "s", "id": 142908, "pid": 76337, "tid": -914061504, "ts": 1716454223618872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223674735, "dur": 19, "args": { "External id": 142928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142928, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 142928, "pid": 5, "tid": 7, "ts": 1716454223674735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618951, "dur": 12, "args": { "External id": 142928, "cbid": 211, "correlation": 142928 } }, { "ph": "s", "id": 142928, "pid": 76337, "tid": -914061504, "ts": 1716454223618951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223674755, "dur": 4, "args": { "External id": 142940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142940, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 142940, "pid": 5, "tid": 7, "ts": 1716454223674755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223618981, "dur": 7, "args": { "External id": 142940, "cbid": 211, "correlation": 142940 } }, { "ph": "s", "id": 142940, "pid": 76337, "tid": -914061504, "ts": 1716454223618981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223674760, "dur": 16, "args": { "External id": 142943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142943, "pid": 5, "tid": 7, "ts": 1716454223674760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619000, "dur": 7, "args": { "External id": 142943, "cbid": 211, "correlation": 142943 } }, { "ph": "s", "id": 142943, "pid": 76337, "tid": -914061504, "ts": 1716454223619000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223619058, "dur": 0, "args": { "External id": 142954, "cbid": 317, "correlation": 142954 } }, { "ph": "f", "id": 142954, "pid": 76337, "tid": -914061504, "ts": 1716454223619058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223619058, "dur": 0, "args": { "External id": 142955, "cbid": 203, "correlation": 142955 } }, { "ph": "f", "id": 142955, "pid": 76337, "tid": -914061504, "ts": 1716454223619058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223619059, "dur": 0, "args": { "External id": 142956, "cbid": 205, "correlation": 142956 } }, { "ph": "f", "id": 142956, "pid": 76337, "tid": -914061504, "ts": 1716454223619059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223674778, "dur": 11, "args": { "External id": 142960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142960, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142960, "pid": 5, "tid": 7, "ts": 1716454223674778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619072, "dur": 12, "args": { "External id": 142960, "cbid": 211, "correlation": 142960 } }, { "ph": "s", "id": 142960, "pid": 76337, "tid": -914061504, "ts": 1716454223619072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223674790, "dur": 4, "args": { "External id": 142962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 142962, "pid": 5, "tid": 7, "ts": 1716454223674790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619088, "dur": 6, "args": { "External id": 142962, "cbid": 211, "correlation": 142962 } }, { "ph": "s", "id": 142962, "pid": 76337, "tid": -914061504, "ts": 1716454223619088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223619097, "dur": 0, "args": { "External id": 142963, "cbid": 51, "correlation": 142963 } }, { "ph": "s", "id": 142963, "pid": 76337, "tid": -914061504, "ts": 1716454223619097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223674795, "dur": 94, "args": { "External id": 142964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142964, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 142964, "pid": 5, "tid": 7, "ts": 1716454223674795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619098, "dur": 6, "args": { "External id": 142964, "cbid": 211, "correlation": 142964 } }, { "ph": "s", "id": 142964, "pid": 76337, "tid": -914061504, "ts": 1716454223619098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223674891, "dur": 15, "args": { "External id": 142969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142969, "pid": 5, "tid": 7, "ts": 1716454223674891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619125, "dur": 8, "args": { "External id": 142969, "cbid": 211, "correlation": 142969 } }, { "ph": "s", "id": 142969, "pid": 76337, "tid": -914061504, "ts": 1716454223619125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223674907, "dur": 85, "args": { "External id": 142978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 142978, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 142978, "pid": 5, "tid": 7, "ts": 1716454223674907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619207, "dur": 14, "args": { "External id": 142978, "cbid": 211, "correlation": 142978 } }, { "ph": "s", "id": 142978, "pid": 76337, "tid": -914061504, "ts": 1716454223619207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223674993, "dur": 31, "args": { "External id": 143000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143000, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143000, "pid": 5, "tid": 7, "ts": 1716454223674993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619264, "dur": 10, "args": { "External id": 143000, "cbid": 211, "correlation": 143000 } }, { "ph": "s", "id": 143000, "pid": 76337, "tid": -914061504, "ts": 1716454223619264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223619351, "dur": 1, "args": { "External id": 143011, "cbid": 251, "correlation": 143011 } }, { "ph": "f", "id": 143011, "pid": 76337, "tid": -914061504, "ts": 1716454223619351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223675025, "dur": 163, "args": { "External id": 143012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143012, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143012, "pid": 5, "tid": 7, "ts": 1716454223675025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619356, "dur": 13, "args": { "External id": 143012, "cbid": 211, "correlation": 143012 } }, { "ph": "s", "id": 143012, "pid": 76337, "tid": -914061504, "ts": 1716454223619356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223619425, "dur": 1, "args": { "External id": 143023, "cbid": 251, "correlation": 143023 } }, { "ph": "f", "id": 143023, "pid": 76337, "tid": -914061504, "ts": 1716454223619425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223675189, "dur": 159, "args": { "External id": 143024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143024, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143024, "pid": 5, "tid": 7, "ts": 1716454223675189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619429, "dur": 11, "args": { "External id": 143024, "cbid": 211, "correlation": 143024 } }, { "ph": "s", "id": 143024, "pid": 76337, "tid": -914061504, "ts": 1716454223619429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223619492, "dur": 1, "args": { "External id": 143035, "cbid": 251, "correlation": 143035 } }, { "ph": "f", "id": 143035, "pid": 76337, "tid": -914061504, "ts": 1716454223619492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223675350, "dur": 160, "args": { "External id": 143036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143036, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143036, "pid": 5, "tid": 7, "ts": 1716454223675350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619497, "dur": 12, "args": { "External id": 143036, "cbid": 211, "correlation": 143036 } }, { "ph": "s", "id": 143036, "pid": 76337, "tid": -914061504, "ts": 1716454223619497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223675511, "dur": 339, "args": { "External id": 143061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143061, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143061, "pid": 5, "tid": 7, "ts": 1716454223675511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619580, "dur": 12, "args": { "External id": 143061, "cbid": 211, "correlation": 143061 } }, { "ph": "s", "id": 143061, "pid": 76337, "tid": -914061504, "ts": 1716454223619580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223619680, "dur": 1, "args": { "External id": 143079, "cbid": 251, "correlation": 143079 } }, { "ph": "f", "id": 143079, "pid": 76337, "tid": -914061504, "ts": 1716454223619680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223675851, "dur": 166, "args": { "External id": 143081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143081, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143081, "pid": 5, "tid": 7, "ts": 1716454223675851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619685, "dur": 13, "args": { "External id": 143081, "cbid": 211, "correlation": 143081 } }, { "ph": "s", "id": 143081, "pid": 76337, "tid": -914061504, "ts": 1716454223619685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223676019, "dur": 19, "args": { "External id": 143089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143089, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143089, "pid": 5, "tid": 7, "ts": 1716454223676019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619757, "dur": 12, "args": { "External id": 143089, "cbid": 211, "correlation": 143089 } }, { "ph": "s", "id": 143089, "pid": 76337, "tid": -914061504, "ts": 1716454223619757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223676040, "dur": 28, "args": { "External id": 143097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143097, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143097, "pid": 5, "tid": 7, "ts": 1716454223676040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619795, "dur": 9, "args": { "External id": 143097, "cbid": 211, "correlation": 143097 } }, { "ph": "s", "id": 143097, "pid": 76337, "tid": -914061504, "ts": 1716454223619795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223676069, "dur": 18, "args": { "External id": 143108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143108, "pid": 5, "tid": 7, "ts": 1716454223676069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619866, "dur": 12, "args": { "External id": 143108, "cbid": 211, "correlation": 143108 } }, { "ph": "s", "id": 143108, "pid": 76337, "tid": -914061504, "ts": 1716454223619866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223676088, "dur": 16, "args": { "External id": 143130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143130, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143130, "pid": 5, "tid": 7, "ts": 1716454223676088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619897, "dur": 8, "args": { "External id": 143130, "cbid": 211, "correlation": 143130 } }, { "ph": "s", "id": 143130, "pid": 76337, "tid": -914061504, "ts": 1716454223619897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223619989, "dur": 1, "args": { "External id": 143141, "cbid": 251, "correlation": 143141 } }, { "ph": "f", "id": 143141, "pid": 76337, "tid": -914061504, "ts": 1716454223619989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223676105, "dur": 88, "args": { "External id": 143142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143142, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 143142, "pid": 5, "tid": 7, "ts": 1716454223676105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223619994, "dur": 14, "args": { "External id": 143142, "cbid": 211, "correlation": 143142 } }, { "ph": "s", "id": 143142, "pid": 76337, "tid": -914061504, "ts": 1716454223619994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620064, "dur": 1, "args": { "External id": 143153, "cbid": 251, "correlation": 143153 } }, { "ph": "f", "id": 143153, "pid": 76337, "tid": -914061504, "ts": 1716454223620064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620068, "dur": 0, "args": { "External id": 143154, "cbid": 251, "correlation": 143154 } }, { "ph": "f", "id": 143154, "pid": 76337, "tid": -914061504, "ts": 1716454223620068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223676195, "dur": 12, "args": { "External id": 143155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143155, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143155, "pid": 5, "tid": 7, "ts": 1716454223676195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620069, "dur": 12, "args": { "External id": 143155, "cbid": 211, "correlation": 143155 } }, { "ph": "s", "id": 143155, "pid": 76337, "tid": -914061504, "ts": 1716454223620069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223676209, "dur": 6, "args": { "External id": 143157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143157, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143157, "pid": 5, "tid": 7, "ts": 1716454223676209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620083, "dur": 6, "args": { "External id": 143157, "cbid": 211, "correlation": 143157 } }, { "ph": "s", "id": 143157, "pid": 76337, "tid": -914061504, "ts": 1716454223620083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620140, "dur": 1, "args": { "External id": 143168, "cbid": 251, "correlation": 143168 } }, { "ph": "f", "id": 143168, "pid": 76337, "tid": -914061504, "ts": 1716454223620140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620144, "dur": 0, "args": { "External id": 143169, "cbid": 251, "correlation": 143169 } }, { "ph": "f", "id": 143169, "pid": 76337, "tid": -914061504, "ts": 1716454223620144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223676215, "dur": 8, "args": { "External id": 143170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143170, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143170, "pid": 5, "tid": 7, "ts": 1716454223676215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620145, "dur": 12, "args": { "External id": 143170, "cbid": 211, "correlation": 143170 } }, { "ph": "s", "id": 143170, "pid": 76337, "tid": -914061504, "ts": 1716454223620145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223676225, "dur": 4, "args": { "External id": 143172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143172, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143172, "pid": 5, "tid": 7, "ts": 1716454223676225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620159, "dur": 5, "args": { "External id": 143172, "cbid": 211, "correlation": 143172 } }, { "ph": "s", "id": 143172, "pid": 76337, "tid": -914061504, "ts": 1716454223620159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223676230, "dur": 55, "args": { "External id": 143197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143197, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143197, "pid": 5, "tid": 7, "ts": 1716454223676230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620236, "dur": 12, "args": { "External id": 143197, "cbid": 211, "correlation": 143197 } }, { "ph": "s", "id": 143197, "pid": 76337, "tid": -914061504, "ts": 1716454223620236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620333, "dur": 1, "args": { "External id": 143215, "cbid": 251, "correlation": 143215 } }, { "ph": "f", "id": 143215, "pid": 76337, "tid": -914061504, "ts": 1716454223620333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223676287, "dur": 91, "args": { "External id": 143217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143217, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 143217, "pid": 5, "tid": 7, "ts": 1716454223676287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620339, "dur": 13, "args": { "External id": 143217, "cbid": 211, "correlation": 143217 } }, { "ph": "s", "id": 143217, "pid": 76337, "tid": -914061504, "ts": 1716454223620339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223676380, "dur": 10, "args": { "External id": 143225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143225, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143225, "pid": 5, "tid": 7, "ts": 1716454223676380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620408, "dur": 12, "args": { "External id": 143225, "cbid": 211, "correlation": 143225 } }, { "ph": "s", "id": 143225, "pid": 76337, "tid": -914061504, "ts": 1716454223620408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223676391, "dur": 21, "args": { "External id": 143233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143233, "pid": 5, "tid": 7, "ts": 1716454223676391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620449, "dur": 10, "args": { "External id": 143233, "cbid": 211, "correlation": 143233 } }, { "ph": "s", "id": 143233, "pid": 76337, "tid": -914061504, "ts": 1716454223620449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223676413, "dur": 17, "args": { "External id": 143255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143255, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143255, "pid": 5, "tid": 7, "ts": 1716454223676413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620500, "dur": 11, "args": { "External id": 143255, "cbid": 211, "correlation": 143255 } }, { "ph": "s", "id": 143255, "pid": 76337, "tid": -914061504, "ts": 1716454223620500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620587, "dur": 1, "args": { "External id": 143271, "cbid": 251, "correlation": 143271 } }, { "ph": "f", "id": 143271, "pid": 76337, "tid": -914061504, "ts": 1716454223620587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620592, "dur": 0, "args": { "External id": 143273, "cbid": 251, "correlation": 143273 } }, { "ph": "f", "id": 143273, "pid": 76337, "tid": -914061504, "ts": 1716454223620592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223676432, "dur": 494, "args": { "External id": 143274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143274, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143274, "pid": 5, "tid": 7, "ts": 1716454223676432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620594, "dur": 13, "args": { "External id": 143274, "cbid": 211, "correlation": 143274 } }, { "ph": "s", "id": 143274, "pid": 76337, "tid": -914061504, "ts": 1716454223620594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223676927, "dur": 67, "args": { "External id": 143282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143282, "pid": 5, "tid": 7, "ts": 1716454223676927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620658, "dur": 13, "args": { "External id": 143282, "cbid": 211, "correlation": 143282 } }, { "ph": "s", "id": 143282, "pid": 76337, "tid": -914061504, "ts": 1716454223620658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223676995, "dur": 67, "args": { "External id": 143290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143290, "pid": 5, "tid": 7, "ts": 1716454223676995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620688, "dur": 8, "args": { "External id": 143290, "cbid": 211, "correlation": 143290 } }, { "ph": "s", "id": 143290, "pid": 76337, "tid": -914061504, "ts": 1716454223620688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223620767, "dur": 1, "args": { "External id": 143306, "cbid": 251, "correlation": 143306 } }, { "ph": "f", "id": 143306, "pid": 76337, "tid": -914061504, "ts": 1716454223620767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223677064, "dur": 1, "args": { "External id": 143308, "device": 5, "context": 1, "stream": 7, "correlation": 143308, "bytes": 240, "memory bandwidth (GB/s)": 0.15315890236119975 } }, { "ph": "f", "id": 143308, "pid": 5, "tid": 7, "ts": 1716454223677064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223620772, "dur": 9, "args": { "External id": 143308, "cbid": 51, "correlation": 143308 } }, { "ph": "s", "id": 143308, "pid": 76337, "tid": -914061504, "ts": 1716454223620772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223677067, "dur": 272, "args": { "External id": 143309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143309, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 143309, "pid": 5, "tid": 7, "ts": 1716454223677067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620782, "dur": 11, "args": { "External id": 143309, "cbid": 211, "correlation": 143309 } }, { "ph": "s", "id": 143309, "pid": 76337, "tid": -914061504, "ts": 1716454223620782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223677340, "dur": 14, "args": { "External id": 143317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143317, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143317, "pid": 5, "tid": 7, "ts": 1716454223677340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620825, "dur": 10, "args": { "External id": 143317, "cbid": 211, "correlation": 143317 } }, { "ph": "s", "id": 143317, "pid": 76337, "tid": -914061504, "ts": 1716454223620825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223677356, "dur": 37, "args": { "External id": 143328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143328, "pid": 5, "tid": 7, "ts": 1716454223677356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620892, "dur": 12, "args": { "External id": 143328, "cbid": 211, "correlation": 143328 } }, { "ph": "s", "id": 143328, "pid": 76337, "tid": -914061504, "ts": 1716454223620892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223620956, "dur": 0, "args": { "External id": 143340, "cbid": 317, "correlation": 143340 } }, { "ph": "f", "id": 143340, "pid": 76337, "tid": -914061504, "ts": 1716454223620956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223620957, "dur": 0, "args": { "External id": 143341, "cbid": 203, "correlation": 143341 } }, { "ph": "f", "id": 143341, "pid": 76337, "tid": -914061504, "ts": 1716454223620957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223620958, "dur": 0, "args": { "External id": 143342, "cbid": 205, "correlation": 143342 } }, { "ph": "f", "id": 143342, "pid": 76337, "tid": -914061504, "ts": 1716454223620958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677394, "dur": 12, "args": { "External id": 143346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143346, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143346, "pid": 5, "tid": 7, "ts": 1716454223677394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620981, "dur": 13, "args": { "External id": 143346, "cbid": 211, "correlation": 143346 } }, { "ph": "s", "id": 143346, "pid": 76337, "tid": -914061504, "ts": 1716454223620981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223677408, "dur": 4, "args": { "External id": 143348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143348, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 143348, "pid": 5, "tid": 7, "ts": 1716454223677408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223620998, "dur": 6, "args": { "External id": 143348, "cbid": 211, "correlation": 143348 } }, { "ph": "s", "id": 143348, "pid": 76337, "tid": -914061504, "ts": 1716454223620998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223621007, "dur": 0, "args": { "External id": 143349, "cbid": 51, "correlation": 143349 } }, { "ph": "s", "id": 143349, "pid": 76337, "tid": -914061504, "ts": 1716454223621007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223677413, "dur": 98, "args": { "External id": 143350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143350, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 143350, "pid": 5, "tid": 7, "ts": 1716454223677413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621008, "dur": 6, "args": { "External id": 143350, "cbid": 211, "correlation": 143350 } }, { "ph": "s", "id": 143350, "pid": 76337, "tid": -914061504, "ts": 1716454223621008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223677512, "dur": 17, "args": { "External id": 143355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143355, "pid": 5, "tid": 7, "ts": 1716454223677512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621036, "dur": 8, "args": { "External id": 143355, "cbid": 211, "correlation": 143355 } }, { "ph": "s", "id": 143355, "pid": 76337, "tid": -914061504, "ts": 1716454223621036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223677531, "dur": 12, "args": { "External id": 143363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143363, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143363, "pid": 5, "tid": 7, "ts": 1716454223677531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621069, "dur": 8, "args": { "External id": 143363, "cbid": 211, "correlation": 143363 } }, { "ph": "s", "id": 143363, "pid": 76337, "tid": -914061504, "ts": 1716454223621069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223621138, "dur": 0, "args": { "External id": 143373, "cbid": 317, "correlation": 143373 } }, { "ph": "f", "id": 143373, "pid": 76337, "tid": -914061504, "ts": 1716454223621138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223621139, "dur": 0, "args": { "External id": 143374, "cbid": 203, "correlation": 143374 } }, { "ph": "f", "id": 143374, "pid": 76337, "tid": -914061504, "ts": 1716454223621139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223621140, "dur": 0, "args": { "External id": 143375, "cbid": 205, "correlation": 143375 } }, { "ph": "f", "id": 143375, "pid": 76337, "tid": -914061504, "ts": 1716454223621140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677544, "dur": 12, "args": { "External id": 143379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143379, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143379, "pid": 5, "tid": 7, "ts": 1716454223677544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621154, "dur": 14, "args": { "External id": 143379, "cbid": 211, "correlation": 143379 } }, { "ph": "s", "id": 143379, "pid": 76337, "tid": -914061504, "ts": 1716454223621154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677558, "dur": 163, "args": { "External id": 143381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143381, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143381, "pid": 5, "tid": 7, "ts": 1716454223677558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621170, "dur": 5, "args": { "External id": 143381, "cbid": 211, "correlation": 143381 } }, { "ph": "s", "id": 143381, "pid": 76337, "tid": -914061504, "ts": 1716454223621170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223677723, "dur": 1, "args": { "External id": 143383, "device": 5, "context": 1, "stream": 7, "correlation": 143383, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 143383, "pid": 5, "tid": 7, "ts": 1716454223677723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223621181, "dur": 6, "args": { "External id": 143383, "cbid": 51, "correlation": 143383 } }, { "ph": "s", "id": 143383, "pid": 76337, "tid": -914061504, "ts": 1716454223621181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223677726, "dur": 198, "args": { "External id": 143384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143384, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 143384, "pid": 5, "tid": 7, "ts": 1716454223677726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621189, "dur": 8, "args": { "External id": 143384, "cbid": 211, "correlation": 143384 } }, { "ph": "s", "id": 143384, "pid": 76337, "tid": -914061504, "ts": 1716454223621189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677925, "dur": 7, "args": { "External id": 143386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143386, "pid": 5, "tid": 7, "ts": 1716454223677925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621201, "dur": 5, "args": { "External id": 143386, "cbid": 211, "correlation": 143386 } }, { "ph": "s", "id": 143386, "pid": 76337, "tid": -914061504, "ts": 1716454223621201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223677933, "dur": 6, "args": { "External id": 143392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143392, "pid": 5, "tid": 7, "ts": 1716454223677933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621230, "dur": 8, "args": { "External id": 143392, "cbid": 211, "correlation": 143392 } }, { "ph": "s", "id": 143392, "pid": 76337, "tid": -914061504, "ts": 1716454223621230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223677941, "dur": 11, "args": { "External id": 143412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143412, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143412, "pid": 5, "tid": 7, "ts": 1716454223677941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621323, "dur": 12, "args": { "External id": 143412, "cbid": 211, "correlation": 143412 } }, { "ph": "s", "id": 143412, "pid": 76337, "tid": -914061504, "ts": 1716454223621323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223677953, "dur": 4, "args": { "External id": 143424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143424, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143424, "pid": 5, "tid": 7, "ts": 1716454223677953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621345, "dur": 6, "args": { "External id": 143424, "cbid": 211, "correlation": 143424 } }, { "ph": "s", "id": 143424, "pid": 76337, "tid": -914061504, "ts": 1716454223621345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223677959, "dur": 9, "args": { "External id": 143427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143427, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143427, "pid": 5, "tid": 7, "ts": 1716454223677959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621365, "dur": 6, "args": { "External id": 143427, "cbid": 211, "correlation": 143427 } }, { "ph": "s", "id": 143427, "pid": 76337, "tid": -914061504, "ts": 1716454223621365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223677969, "dur": 5, "args": { "External id": 143436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143436, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143436, "pid": 5, "tid": 7, "ts": 1716454223677969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621404, "dur": 10, "args": { "External id": 143436, "cbid": 211, "correlation": 143436 } }, { "ph": "s", "id": 143436, "pid": 76337, "tid": -914061504, "ts": 1716454223621404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223621455, "dur": 0, "args": { "External id": 143446, "cbid": 317, "correlation": 143446 } }, { "ph": "f", "id": 143446, "pid": 76337, "tid": -914061504, "ts": 1716454223621455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223621456, "dur": 0, "args": { "External id": 143447, "cbid": 203, "correlation": 143447 } }, { "ph": "f", "id": 143447, "pid": 76337, "tid": -914061504, "ts": 1716454223621456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223621457, "dur": 0, "args": { "External id": 143448, "cbid": 205, "correlation": 143448 } }, { "ph": "f", "id": 143448, "pid": 76337, "tid": -914061504, "ts": 1716454223621457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677975, "dur": 5, "args": { "External id": 143452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143452, "pid": 5, "tid": 7, "ts": 1716454223677975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621472, "dur": 11, "args": { "External id": 143452, "cbid": 211, "correlation": 143452 } }, { "ph": "s", "id": 143452, "pid": 76337, "tid": -914061504, "ts": 1716454223621472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223677982, "dur": 163, "args": { "External id": 143454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143454, "pid": 5, "tid": 7, "ts": 1716454223677982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621486, "dur": 5, "args": { "External id": 143454, "cbid": 211, "correlation": 143454 } }, { "ph": "s", "id": 143454, "pid": 76337, "tid": -914061504, "ts": 1716454223621486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223678147, "dur": 1, "args": { "External id": 143456, "device": 5, "context": 1, "stream": 7, "correlation": 143456, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 143456, "pid": 5, "tid": 7, "ts": 1716454223678147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223621497, "dur": 6, "args": { "External id": 143456, "cbid": 51, "correlation": 143456 } }, { "ph": "s", "id": 143456, "pid": 76337, "tid": -914061504, "ts": 1716454223621497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223678151, "dur": 269, "args": { "External id": 143457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143457, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143457, "pid": 5, "tid": 7, "ts": 1716454223678151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621504, "dur": 6, "args": { "External id": 143457, "cbid": 211, "correlation": 143457 } }, { "ph": "s", "id": 143457, "pid": 76337, "tid": -914061504, "ts": 1716454223621504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223678422, "dur": 6, "args": { "External id": 143459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143459, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143459, "pid": 5, "tid": 7, "ts": 1716454223678422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621516, "dur": 5, "args": { "External id": 143459, "cbid": 211, "correlation": 143459 } }, { "ph": "s", "id": 143459, "pid": 76337, "tid": -914061504, "ts": 1716454223621516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223678429, "dur": 6, "args": { "External id": 143465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143465, "pid": 5, "tid": 7, "ts": 1716454223678429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621544, "dur": 8, "args": { "External id": 143465, "cbid": 211, "correlation": 143465 } }, { "ph": "s", "id": 143465, "pid": 76337, "tid": -914061504, "ts": 1716454223621544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223678436, "dur": 4, "args": { "External id": 143473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143473, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 143473, "pid": 5, "tid": 7, "ts": 1716454223678436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621587, "dur": 9, "args": { "External id": 143473, "cbid": 211, "correlation": 143473 } }, { "ph": "s", "id": 143473, "pid": 76337, "tid": -914061504, "ts": 1716454223621587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223621653, "dur": 1, "args": { "External id": 143489, "cbid": 251, "correlation": 143489 } }, { "ph": "f", "id": 143489, "pid": 76337, "tid": -914061504, "ts": 1716454223621653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223621659, "dur": 0, "args": { "External id": 143491, "cbid": 251, "correlation": 143491 } }, { "ph": "f", "id": 143491, "pid": 76337, "tid": -914061504, "ts": 1716454223621659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223678441, "dur": 13, "args": { "External id": 143492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143492, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143492, "pid": 5, "tid": 7, "ts": 1716454223678441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621661, "dur": 11, "args": { "External id": 143492, "cbid": 211, "correlation": 143492 } }, { "ph": "s", "id": 143492, "pid": 76337, "tid": -914061504, "ts": 1716454223621661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223678456, "dur": 5, "args": { "External id": 143494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143494, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143494, "pid": 5, "tid": 7, "ts": 1716454223678456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621674, "dur": 5, "args": { "External id": 143494, "cbid": 211, "correlation": 143494 } }, { "ph": "s", "id": 143494, "pid": 76337, "tid": -914061504, "ts": 1716454223621674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223678462, "dur": 6, "args": { "External id": 143504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143504, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143504, "pid": 5, "tid": 7, "ts": 1716454223678462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621730, "dur": 12, "args": { "External id": 143504, "cbid": 211, "correlation": 143504 } }, { "ph": "s", "id": 143504, "pid": 76337, "tid": -914061504, "ts": 1716454223621730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223678469, "dur": 10, "args": { "External id": 143524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143524, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143524, "pid": 5, "tid": 7, "ts": 1716454223678469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621796, "dur": 11, "args": { "External id": 143524, "cbid": 211, "correlation": 143524 } }, { "ph": "s", "id": 143524, "pid": 76337, "tid": -914061504, "ts": 1716454223621796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223678480, "dur": 4, "args": { "External id": 143536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143536, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143536, "pid": 5, "tid": 7, "ts": 1716454223678480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621817, "dur": 6, "args": { "External id": 143536, "cbid": 211, "correlation": 143536 } }, { "ph": "s", "id": 143536, "pid": 76337, "tid": -914061504, "ts": 1716454223621817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223678485, "dur": 7, "args": { "External id": 143539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143539, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143539, "pid": 5, "tid": 7, "ts": 1716454223678485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621837, "dur": 6, "args": { "External id": 143539, "cbid": 211, "correlation": 143539 } }, { "ph": "s", "id": 143539, "pid": 76337, "tid": -914061504, "ts": 1716454223621837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223678494, "dur": 5, "args": { "External id": 143548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143548, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143548, "pid": 5, "tid": 7, "ts": 1716454223678494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621877, "dur": 10, "args": { "External id": 143548, "cbid": 211, "correlation": 143548 } }, { "ph": "s", "id": 143548, "pid": 76337, "tid": -914061504, "ts": 1716454223621877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223621940, "dur": 0, "args": { "External id": 143558, "cbid": 317, "correlation": 143558 } }, { "ph": "f", "id": 143558, "pid": 76337, "tid": -914061504, "ts": 1716454223621940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223621941, "dur": 0, "args": { "External id": 143559, "cbid": 203, "correlation": 143559 } }, { "ph": "f", "id": 143559, "pid": 76337, "tid": -914061504, "ts": 1716454223621941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223621942, "dur": 0, "args": { "External id": 143560, "cbid": 205, "correlation": 143560 } }, { "ph": "f", "id": 143560, "pid": 76337, "tid": -914061504, "ts": 1716454223621942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223678500, "dur": 5, "args": { "External id": 143564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143564, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143564, "pid": 5, "tid": 7, "ts": 1716454223678500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621955, "dur": 13, "args": { "External id": 143564, "cbid": 211, "correlation": 143564 } }, { "ph": "s", "id": 143564, "pid": 76337, "tid": -914061504, "ts": 1716454223621955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223678506, "dur": 163, "args": { "External id": 143566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143566, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143566, "pid": 5, "tid": 7, "ts": 1716454223678506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621971, "dur": 14, "args": { "External id": 143566, "cbid": 211, "correlation": 143566 } }, { "ph": "s", "id": 143566, "pid": 76337, "tid": -914061504, "ts": 1716454223621971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223678671, "dur": 1, "args": { "External id": 143568, "device": 5, "context": 1, "stream": 7, "correlation": 143568, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 143568, "pid": 5, "tid": 7, "ts": 1716454223678671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223621991, "dur": 6, "args": { "External id": 143568, "cbid": 51, "correlation": 143568 } }, { "ph": "s", "id": 143568, "pid": 76337, "tid": -914061504, "ts": 1716454223621991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223678675, "dur": 260, "args": { "External id": 143569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143569, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143569, "pid": 5, "tid": 7, "ts": 1716454223678675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223621998, "dur": 6, "args": { "External id": 143569, "cbid": 211, "correlation": 143569 } }, { "ph": "s", "id": 143569, "pid": 76337, "tid": -914061504, "ts": 1716454223621998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223678937, "dur": 6, "args": { "External id": 143571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143571, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143571, "pid": 5, "tid": 7, "ts": 1716454223678937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622008, "dur": 5, "args": { "External id": 143571, "cbid": 211, "correlation": 143571 } }, { "ph": "s", "id": 143571, "pid": 76337, "tid": -914061504, "ts": 1716454223622008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223678944, "dur": 6, "args": { "External id": 143577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143577, "pid": 5, "tid": 7, "ts": 1716454223678944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622037, "dur": 9, "args": { "External id": 143577, "cbid": 211, "correlation": 143577 } }, { "ph": "s", "id": 143577, "pid": 76337, "tid": -914061504, "ts": 1716454223622037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223678952, "dur": 5, "args": { "External id": 143585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143585, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143585, "pid": 5, "tid": 7, "ts": 1716454223678952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622072, "dur": 8, "args": { "External id": 143585, "cbid": 211, "correlation": 143585 } }, { "ph": "s", "id": 143585, "pid": 76337, "tid": -914061504, "ts": 1716454223622072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223678958, "dur": 4, "args": { "External id": 143593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143593, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143593, "pid": 5, "tid": 7, "ts": 1716454223678958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622101, "dur": 8, "args": { "External id": 143593, "cbid": 211, "correlation": 143593 } }, { "ph": "s", "id": 143593, "pid": 76337, "tid": -914061504, "ts": 1716454223622101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223678964, "dur": 10, "args": { "External id": 143613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143613, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143613, "pid": 5, "tid": 7, "ts": 1716454223678964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622178, "dur": 12, "args": { "External id": 143613, "cbid": 211, "correlation": 143613 } }, { "ph": "s", "id": 143613, "pid": 76337, "tid": -914061504, "ts": 1716454223622178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223678975, "dur": 3, "args": { "External id": 143625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143625, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143625, "pid": 5, "tid": 7, "ts": 1716454223678975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622199, "dur": 6, "args": { "External id": 143625, "cbid": 211, "correlation": 143625 } }, { "ph": "s", "id": 143625, "pid": 76337, "tid": -914061504, "ts": 1716454223622199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223678980, "dur": 6, "args": { "External id": 143628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143628, "pid": 5, "tid": 7, "ts": 1716454223678980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622217, "dur": 7, "args": { "External id": 143628, "cbid": 211, "correlation": 143628 } }, { "ph": "s", "id": 143628, "pid": 76337, "tid": -914061504, "ts": 1716454223622217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223678988, "dur": 5, "args": { "External id": 143637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143637, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143637, "pid": 5, "tid": 7, "ts": 1716454223678988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622256, "dur": 9, "args": { "External id": 143637, "cbid": 211, "correlation": 143637 } }, { "ph": "s", "id": 143637, "pid": 76337, "tid": -914061504, "ts": 1716454223622256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223622309, "dur": 0, "args": { "External id": 143647, "cbid": 317, "correlation": 143647 } }, { "ph": "f", "id": 143647, "pid": 76337, "tid": -914061504, "ts": 1716454223622309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223622310, "dur": 0, "args": { "External id": 143648, "cbid": 203, "correlation": 143648 } }, { "ph": "f", "id": 143648, "pid": 76337, "tid": -914061504, "ts": 1716454223622310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223622310, "dur": 0, "args": { "External id": 143649, "cbid": 205, "correlation": 143649 } }, { "ph": "f", "id": 143649, "pid": 76337, "tid": -914061504, "ts": 1716454223622310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223678993, "dur": 5, "args": { "External id": 143653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143653, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143653, "pid": 5, "tid": 7, "ts": 1716454223678993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622324, "dur": 11, "args": { "External id": 143653, "cbid": 211, "correlation": 143653 } }, { "ph": "s", "id": 143653, "pid": 76337, "tid": -914061504, "ts": 1716454223622324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679000, "dur": 163, "args": { "External id": 143655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143655, "pid": 5, "tid": 7, "ts": 1716454223679000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622338, "dur": 5, "args": { "External id": 143655, "cbid": 211, "correlation": 143655 } }, { "ph": "s", "id": 143655, "pid": 76337, "tid": -914061504, "ts": 1716454223622338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223679165, "dur": 1, "args": { "External id": 143657, "device": 5, "context": 1, "stream": 7, "correlation": 143657, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 143657, "pid": 5, "tid": 7, "ts": 1716454223679165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223622349, "dur": 6, "args": { "External id": 143657, "cbid": 51, "correlation": 143657 } }, { "ph": "s", "id": 143657, "pid": 76337, "tid": -914061504, "ts": 1716454223622349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223679169, "dur": 259, "args": { "External id": 143658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143658, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143658, "pid": 5, "tid": 7, "ts": 1716454223679169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622356, "dur": 6, "args": { "External id": 143658, "cbid": 211, "correlation": 143658 } }, { "ph": "s", "id": 143658, "pid": 76337, "tid": -914061504, "ts": 1716454223622356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679429, "dur": 6, "args": { "External id": 143660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143660, "pid": 5, "tid": 7, "ts": 1716454223679429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622365, "dur": 5, "args": { "External id": 143660, "cbid": 211, "correlation": 143660 } }, { "ph": "s", "id": 143660, "pid": 76337, "tid": -914061504, "ts": 1716454223622365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223679436, "dur": 6, "args": { "External id": 143666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143666, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143666, "pid": 5, "tid": 7, "ts": 1716454223679436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622393, "dur": 8, "args": { "External id": 143666, "cbid": 211, "correlation": 143666 } }, { "ph": "s", "id": 143666, "pid": 76337, "tid": -914061504, "ts": 1716454223622393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223679443, "dur": 4, "args": { "External id": 143674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143674, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 143674, "pid": 5, "tid": 7, "ts": 1716454223679443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622437, "dur": 9, "args": { "External id": 143674, "cbid": 211, "correlation": 143674 } }, { "ph": "s", "id": 143674, "pid": 76337, "tid": -914061504, "ts": 1716454223622437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223622499, "dur": 1, "args": { "External id": 143690, "cbid": 251, "correlation": 143690 } }, { "ph": "f", "id": 143690, "pid": 76337, "tid": -914061504, "ts": 1716454223622499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223622504, "dur": 0, "args": { "External id": 143692, "cbid": 251, "correlation": 143692 } }, { "ph": "f", "id": 143692, "pid": 76337, "tid": -914061504, "ts": 1716454223622504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223679448, "dur": 10, "args": { "External id": 143693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143693, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143693, "pid": 5, "tid": 7, "ts": 1716454223679448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622506, "dur": 11, "args": { "External id": 143693, "cbid": 211, "correlation": 143693 } }, { "ph": "s", "id": 143693, "pid": 76337, "tid": -914061504, "ts": 1716454223622506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223679459, "dur": 4, "args": { "External id": 143695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143695, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143695, "pid": 5, "tid": 7, "ts": 1716454223679459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622520, "dur": 6, "args": { "External id": 143695, "cbid": 211, "correlation": 143695 } }, { "ph": "s", "id": 143695, "pid": 76337, "tid": -914061504, "ts": 1716454223622520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223679464, "dur": 6, "args": { "External id": 143705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143705, "pid": 5, "tid": 7, "ts": 1716454223679464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622576, "dur": 12, "args": { "External id": 143705, "cbid": 211, "correlation": 143705 } }, { "ph": "s", "id": 143705, "pid": 76337, "tid": -914061504, "ts": 1716454223622576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223679471, "dur": 10, "args": { "External id": 143725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143725, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143725, "pid": 5, "tid": 7, "ts": 1716454223679471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622641, "dur": 10, "args": { "External id": 143725, "cbid": 211, "correlation": 143725 } }, { "ph": "s", "id": 143725, "pid": 76337, "tid": -914061504, "ts": 1716454223622641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223679482, "dur": 4, "args": { "External id": 143737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143737, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143737, "pid": 5, "tid": 7, "ts": 1716454223679482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622662, "dur": 6, "args": { "External id": 143737, "cbid": 211, "correlation": 143737 } }, { "ph": "s", "id": 143737, "pid": 76337, "tid": -914061504, "ts": 1716454223622662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223679488, "dur": 7, "args": { "External id": 143740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143740, "pid": 5, "tid": 7, "ts": 1716454223679488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622681, "dur": 6, "args": { "External id": 143740, "cbid": 211, "correlation": 143740 } }, { "ph": "s", "id": 143740, "pid": 76337, "tid": -914061504, "ts": 1716454223622681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223679496, "dur": 5, "args": { "External id": 143749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143749, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143749, "pid": 5, "tid": 7, "ts": 1716454223679496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622721, "dur": 11, "args": { "External id": 143749, "cbid": 211, "correlation": 143749 } }, { "ph": "s", "id": 143749, "pid": 76337, "tid": -914061504, "ts": 1716454223622721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223622785, "dur": 0, "args": { "External id": 143759, "cbid": 317, "correlation": 143759 } }, { "ph": "f", "id": 143759, "pid": 76337, "tid": -914061504, "ts": 1716454223622785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223622786, "dur": 0, "args": { "External id": 143760, "cbid": 203, "correlation": 143760 } }, { "ph": "f", "id": 143760, "pid": 76337, "tid": -914061504, "ts": 1716454223622786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223622787, "dur": 0, "args": { "External id": 143761, "cbid": 205, "correlation": 143761 } }, { "ph": "f", "id": 143761, "pid": 76337, "tid": -914061504, "ts": 1716454223622787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679502, "dur": 5, "args": { "External id": 143765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143765, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143765, "pid": 5, "tid": 7, "ts": 1716454223679502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622800, "dur": 12, "args": { "External id": 143765, "cbid": 211, "correlation": 143765 } }, { "ph": "s", "id": 143765, "pid": 76337, "tid": -914061504, "ts": 1716454223622800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679508, "dur": 163, "args": { "External id": 143767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143767, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143767, "pid": 5, "tid": 7, "ts": 1716454223679508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622814, "dur": 5, "args": { "External id": 143767, "cbid": 211, "correlation": 143767 } }, { "ph": "s", "id": 143767, "pid": 76337, "tid": -914061504, "ts": 1716454223622814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223679673, "dur": 1, "args": { "External id": 143769, "device": 5, "context": 1, "stream": 7, "correlation": 143769, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 143769, "pid": 5, "tid": 7, "ts": 1716454223679673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223622825, "dur": 7, "args": { "External id": 143769, "cbid": 51, "correlation": 143769 } }, { "ph": "s", "id": 143769, "pid": 76337, "tid": -914061504, "ts": 1716454223622825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223679677, "dur": 260, "args": { "External id": 143770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143770, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143770, "pid": 5, "tid": 7, "ts": 1716454223679677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622833, "dur": 6, "args": { "External id": 143770, "cbid": 211, "correlation": 143770 } }, { "ph": "s", "id": 143770, "pid": 76337, "tid": -914061504, "ts": 1716454223622833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679938, "dur": 6, "args": { "External id": 143772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143772, "pid": 5, "tid": 7, "ts": 1716454223679938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622843, "dur": 5, "args": { "External id": 143772, "cbid": 211, "correlation": 143772 } }, { "ph": "s", "id": 143772, "pid": 76337, "tid": -914061504, "ts": 1716454223622843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223679945, "dur": 6, "args": { "External id": 143778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143778, "pid": 5, "tid": 7, "ts": 1716454223679945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622871, "dur": 9, "args": { "External id": 143778, "cbid": 211, "correlation": 143778 } }, { "ph": "s", "id": 143778, "pid": 76337, "tid": -914061504, "ts": 1716454223622871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223679953, "dur": 5, "args": { "External id": 143786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143786, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143786, "pid": 5, "tid": 7, "ts": 1716454223679953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622904, "dur": 8, "args": { "External id": 143786, "cbid": 211, "correlation": 143786 } }, { "ph": "s", "id": 143786, "pid": 76337, "tid": -914061504, "ts": 1716454223622904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223679959, "dur": 4, "args": { "External id": 143794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143794, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143794, "pid": 5, "tid": 7, "ts": 1716454223679959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223622934, "dur": 8, "args": { "External id": 143794, "cbid": 211, "correlation": 143794 } }, { "ph": "s", "id": 143794, "pid": 76337, "tid": -914061504, "ts": 1716454223622934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223679965, "dur": 10, "args": { "External id": 143814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143814, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143814, "pid": 5, "tid": 7, "ts": 1716454223679965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623046, "dur": 13, "args": { "External id": 143814, "cbid": 211, "correlation": 143814 } }, { "ph": "s", "id": 143814, "pid": 76337, "tid": -914061504, "ts": 1716454223623046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223679976, "dur": 4, "args": { "External id": 143826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143826, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143826, "pid": 5, "tid": 7, "ts": 1716454223679976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623068, "dur": 6, "args": { "External id": 143826, "cbid": 211, "correlation": 143826 } }, { "ph": "s", "id": 143826, "pid": 76337, "tid": -914061504, "ts": 1716454223623068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223679981, "dur": 7, "args": { "External id": 143829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143829, "pid": 5, "tid": 7, "ts": 1716454223679981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623086, "dur": 6, "args": { "External id": 143829, "cbid": 211, "correlation": 143829 } }, { "ph": "s", "id": 143829, "pid": 76337, "tid": -914061504, "ts": 1716454223623086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223679989, "dur": 5, "args": { "External id": 143838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143838, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143838, "pid": 5, "tid": 7, "ts": 1716454223679989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623124, "dur": 10, "args": { "External id": 143838, "cbid": 211, "correlation": 143838 } }, { "ph": "s", "id": 143838, "pid": 76337, "tid": -914061504, "ts": 1716454223623124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223623177, "dur": 0, "args": { "External id": 143848, "cbid": 317, "correlation": 143848 } }, { "ph": "f", "id": 143848, "pid": 76337, "tid": -914061504, "ts": 1716454223623177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223623177, "dur": 0, "args": { "External id": 143849, "cbid": 203, "correlation": 143849 } }, { "ph": "f", "id": 143849, "pid": 76337, "tid": -914061504, "ts": 1716454223623177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223623178, "dur": 0, "args": { "External id": 143850, "cbid": 205, "correlation": 143850 } }, { "ph": "f", "id": 143850, "pid": 76337, "tid": -914061504, "ts": 1716454223623178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223679995, "dur": 5, "args": { "External id": 143854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143854, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143854, "pid": 5, "tid": 7, "ts": 1716454223679995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623192, "dur": 11, "args": { "External id": 143854, "cbid": 211, "correlation": 143854 } }, { "ph": "s", "id": 143854, "pid": 76337, "tid": -914061504, "ts": 1716454223623192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680001, "dur": 162, "args": { "External id": 143856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143856, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143856, "pid": 5, "tid": 7, "ts": 1716454223680001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623206, "dur": 5, "args": { "External id": 143856, "cbid": 211, "correlation": 143856 } }, { "ph": "s", "id": 143856, "pid": 76337, "tid": -914061504, "ts": 1716454223623206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223680165, "dur": 1, "args": { "External id": 143858, "device": 5, "context": 1, "stream": 7, "correlation": 143858, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 143858, "pid": 5, "tid": 7, "ts": 1716454223680165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223623216, "dur": 7, "args": { "External id": 143858, "cbid": 51, "correlation": 143858 } }, { "ph": "s", "id": 143858, "pid": 76337, "tid": -914061504, "ts": 1716454223623216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223680169, "dur": 260, "args": { "External id": 143859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143859, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143859, "pid": 5, "tid": 7, "ts": 1716454223680169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623224, "dur": 6, "args": { "External id": 143859, "cbid": 211, "correlation": 143859 } }, { "ph": "s", "id": 143859, "pid": 76337, "tid": -914061504, "ts": 1716454223623224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680430, "dur": 6, "args": { "External id": 143861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143861, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143861, "pid": 5, "tid": 7, "ts": 1716454223680430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623235, "dur": 5, "args": { "External id": 143861, "cbid": 211, "correlation": 143861 } }, { "ph": "s", "id": 143861, "pid": 76337, "tid": -914061504, "ts": 1716454223623235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223680438, "dur": 6, "args": { "External id": 143867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143867, "pid": 5, "tid": 7, "ts": 1716454223680438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623264, "dur": 9, "args": { "External id": 143867, "cbid": 211, "correlation": 143867 } }, { "ph": "s", "id": 143867, "pid": 76337, "tid": -914061504, "ts": 1716454223623264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223680445, "dur": 3, "args": { "External id": 143875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143875, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 143875, "pid": 5, "tid": 7, "ts": 1716454223680445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623307, "dur": 9, "args": { "External id": 143875, "cbid": 211, "correlation": 143875 } }, { "ph": "s", "id": 143875, "pid": 76337, "tid": -914061504, "ts": 1716454223623307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223623369, "dur": 1, "args": { "External id": 143891, "cbid": 251, "correlation": 143891 } }, { "ph": "f", "id": 143891, "pid": 76337, "tid": -914061504, "ts": 1716454223623369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223623374, "dur": 0, "args": { "External id": 143893, "cbid": 251, "correlation": 143893 } }, { "ph": "f", "id": 143893, "pid": 76337, "tid": -914061504, "ts": 1716454223623374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223680450, "dur": 10, "args": { "External id": 143894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143894, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143894, "pid": 5, "tid": 7, "ts": 1716454223680450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623376, "dur": 11, "args": { "External id": 143894, "cbid": 211, "correlation": 143894 } }, { "ph": "s", "id": 143894, "pid": 76337, "tid": -914061504, "ts": 1716454223623376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223680461, "dur": 4, "args": { "External id": 143896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143896, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143896, "pid": 5, "tid": 7, "ts": 1716454223680461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623389, "dur": 5, "args": { "External id": 143896, "cbid": 211, "correlation": 143896 } }, { "ph": "s", "id": 143896, "pid": 76337, "tid": -914061504, "ts": 1716454223623389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223680466, "dur": 6, "args": { "External id": 143906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143906, "pid": 5, "tid": 7, "ts": 1716454223680466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623445, "dur": 12, "args": { "External id": 143906, "cbid": 211, "correlation": 143906 } }, { "ph": "s", "id": 143906, "pid": 76337, "tid": -914061504, "ts": 1716454223623445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223680473, "dur": 10, "args": { "External id": 143926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143926, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 143926, "pid": 5, "tid": 7, "ts": 1716454223680473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623510, "dur": 11, "args": { "External id": 143926, "cbid": 211, "correlation": 143926 } }, { "ph": "s", "id": 143926, "pid": 76337, "tid": -914061504, "ts": 1716454223623510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223680484, "dur": 4, "args": { "External id": 143938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143938, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 143938, "pid": 5, "tid": 7, "ts": 1716454223680484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623531, "dur": 7, "args": { "External id": 143938, "cbid": 211, "correlation": 143938 } }, { "ph": "s", "id": 143938, "pid": 76337, "tid": -914061504, "ts": 1716454223623531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223680489, "dur": 7, "args": { "External id": 143941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143941, "pid": 5, "tid": 7, "ts": 1716454223680489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623551, "dur": 6, "args": { "External id": 143941, "cbid": 211, "correlation": 143941 } }, { "ph": "s", "id": 143941, "pid": 76337, "tid": -914061504, "ts": 1716454223623551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223680497, "dur": 5, "args": { "External id": 143950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143950, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143950, "pid": 5, "tid": 7, "ts": 1716454223680497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623591, "dur": 9, "args": { "External id": 143950, "cbid": 211, "correlation": 143950 } }, { "ph": "s", "id": 143950, "pid": 76337, "tid": -914061504, "ts": 1716454223623591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223623654, "dur": 0, "args": { "External id": 143960, "cbid": 317, "correlation": 143960 } }, { "ph": "f", "id": 143960, "pid": 76337, "tid": -914061504, "ts": 1716454223623654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223623654, "dur": 0, "args": { "External id": 143961, "cbid": 203, "correlation": 143961 } }, { "ph": "f", "id": 143961, "pid": 76337, "tid": -914061504, "ts": 1716454223623654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223623655, "dur": 0, "args": { "External id": 143962, "cbid": 205, "correlation": 143962 } }, { "ph": "f", "id": 143962, "pid": 76337, "tid": -914061504, "ts": 1716454223623655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680503, "dur": 5, "args": { "External id": 143966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143966, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143966, "pid": 5, "tid": 7, "ts": 1716454223680503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623669, "dur": 12, "args": { "External id": 143966, "cbid": 211, "correlation": 143966 } }, { "ph": "s", "id": 143966, "pid": 76337, "tid": -914061504, "ts": 1716454223623669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680509, "dur": 163, "args": { "External id": 143968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143968, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143968, "pid": 5, "tid": 7, "ts": 1716454223680509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623683, "dur": 6, "args": { "External id": 143968, "cbid": 211, "correlation": 143968 } }, { "ph": "s", "id": 143968, "pid": 76337, "tid": -914061504, "ts": 1716454223623683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223680675, "dur": 1, "args": { "External id": 143970, "device": 5, "context": 1, "stream": 7, "correlation": 143970, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 143970, "pid": 5, "tid": 7, "ts": 1716454223680675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223623694, "dur": 6, "args": { "External id": 143970, "cbid": 51, "correlation": 143970 } }, { "ph": "s", "id": 143970, "pid": 76337, "tid": -914061504, "ts": 1716454223623694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223680678, "dur": 260, "args": { "External id": 143971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143971, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 143971, "pid": 5, "tid": 7, "ts": 1716454223680678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623702, "dur": 6, "args": { "External id": 143971, "cbid": 211, "correlation": 143971 } }, { "ph": "s", "id": 143971, "pid": 76337, "tid": -914061504, "ts": 1716454223623702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680939, "dur": 6, "args": { "External id": 143973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143973, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 143973, "pid": 5, "tid": 7, "ts": 1716454223680939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623711, "dur": 5, "args": { "External id": 143973, "cbid": 211, "correlation": 143973 } }, { "ph": "s", "id": 143973, "pid": 76337, "tid": -914061504, "ts": 1716454223623711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223680947, "dur": 6, "args": { "External id": 143979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143979, "pid": 5, "tid": 7, "ts": 1716454223680947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623740, "dur": 8, "args": { "External id": 143979, "cbid": 211, "correlation": 143979 } }, { "ph": "s", "id": 143979, "pid": 76337, "tid": -914061504, "ts": 1716454223623740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223680955, "dur": 5, "args": { "External id": 143987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143987, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143987, "pid": 5, "tid": 7, "ts": 1716454223680955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623772, "dur": 8, "args": { "External id": 143987, "cbid": 211, "correlation": 143987 } }, { "ph": "s", "id": 143987, "pid": 76337, "tid": -914061504, "ts": 1716454223623772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223680961, "dur": 4, "args": { "External id": 143995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 143995, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 143995, "pid": 5, "tid": 7, "ts": 1716454223680961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623801, "dur": 8, "args": { "External id": 143995, "cbid": 211, "correlation": 143995 } }, { "ph": "s", "id": 143995, "pid": 76337, "tid": -914061504, "ts": 1716454223623801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223680967, "dur": 9, "args": { "External id": 144015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144015, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144015, "pid": 5, "tid": 7, "ts": 1716454223680967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623939, "dur": 14, "args": { "External id": 144015, "cbid": 211, "correlation": 144015 } }, { "ph": "s", "id": 144015, "pid": 76337, "tid": -914061504, "ts": 1716454223623939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223680978, "dur": 4, "args": { "External id": 144027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144027, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 144027, "pid": 5, "tid": 7, "ts": 1716454223680978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623962, "dur": 7, "args": { "External id": 144027, "cbid": 211, "correlation": 144027 } }, { "ph": "s", "id": 144027, "pid": 76337, "tid": -914061504, "ts": 1716454223623962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223680983, "dur": 7, "args": { "External id": 144030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144030, "pid": 5, "tid": 7, "ts": 1716454223680983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223623989, "dur": 8, "args": { "External id": 144030, "cbid": 211, "correlation": 144030 } }, { "ph": "s", "id": 144030, "pid": 76337, "tid": -914061504, "ts": 1716454223623989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223624050, "dur": 0, "args": { "External id": 144041, "cbid": 317, "correlation": 144041 } }, { "ph": "f", "id": 144041, "pid": 76337, "tid": -914061504, "ts": 1716454223624050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223624051, "dur": 0, "args": { "External id": 144042, "cbid": 203, "correlation": 144042 } }, { "ph": "f", "id": 144042, "pid": 76337, "tid": -914061504, "ts": 1716454223624051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223624052, "dur": 0, "args": { "External id": 144043, "cbid": 205, "correlation": 144043 } }, { "ph": "f", "id": 144043, "pid": 76337, "tid": -914061504, "ts": 1716454223624052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223680991, "dur": 5, "args": { "External id": 144047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144047, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144047, "pid": 5, "tid": 7, "ts": 1716454223680991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624069, "dur": 12, "args": { "External id": 144047, "cbid": 211, "correlation": 144047 } }, { "ph": "s", "id": 144047, "pid": 76337, "tid": -914061504, "ts": 1716454223624069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223680997, "dur": 37, "args": { "External id": 144049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144049, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 144049, "pid": 5, "tid": 7, "ts": 1716454223680997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624088, "dur": 9, "args": { "External id": 144049, "cbid": 211, "correlation": 144049 } }, { "ph": "s", "id": 144049, "pid": 76337, "tid": -914061504, "ts": 1716454223624088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223681036, "dur": 5, "args": { "External id": 144051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144051, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144051, "pid": 5, "tid": 7, "ts": 1716454223681036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624101, "dur": 5, "args": { "External id": 144051, "cbid": 211, "correlation": 144051 } }, { "ph": "s", "id": 144051, "pid": 76337, "tid": -914061504, "ts": 1716454223624101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681042, "dur": 6, "args": { "External id": 144057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144057, "pid": 5, "tid": 7, "ts": 1716454223681042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624128, "dur": 9, "args": { "External id": 144057, "cbid": 211, "correlation": 144057 } }, { "ph": "s", "id": 144057, "pid": 76337, "tid": -914061504, "ts": 1716454223624128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223681049, "dur": 20, "args": { "External id": 144066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144066, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144066, "pid": 5, "tid": 7, "ts": 1716454223681049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624211, "dur": 14, "args": { "External id": 144066, "cbid": 211, "correlation": 144066 } }, { "ph": "s", "id": 144066, "pid": 76337, "tid": -914061504, "ts": 1716454223624211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223681071, "dur": 11, "args": { "External id": 144088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144088, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 144088, "pid": 5, "tid": 7, "ts": 1716454223681071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624268, "dur": 10, "args": { "External id": 144088, "cbid": 211, "correlation": 144088 } }, { "ph": "s", "id": 144088, "pid": 76337, "tid": -914061504, "ts": 1716454223624268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624361, "dur": 2, "args": { "External id": 144099, "cbid": 251, "correlation": 144099 } }, { "ph": "f", "id": 144099, "pid": 76337, "tid": -914061504, "ts": 1716454223624361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624366, "dur": 0, "args": { "External id": 144100, "cbid": 251, "correlation": 144100 } }, { "ph": "f", "id": 144100, "pid": 76337, "tid": -914061504, "ts": 1716454223624366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681083, "dur": 54, "args": { "External id": 144101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144101, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 144101, "pid": 5, "tid": 7, "ts": 1716454223681083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624369, "dur": 14, "args": { "External id": 144101, "cbid": 211, "correlation": 144101 } }, { "ph": "s", "id": 144101, "pid": 76337, "tid": -914061504, "ts": 1716454223624369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624440, "dur": 1, "args": { "External id": 144112, "cbid": 251, "correlation": 144112 } }, { "ph": "f", "id": 144112, "pid": 76337, "tid": -914061504, "ts": 1716454223624440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624444, "dur": 0, "args": { "External id": 144113, "cbid": 251, "correlation": 144113 } }, { "ph": "f", "id": 144113, "pid": 76337, "tid": -914061504, "ts": 1716454223624444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681138, "dur": 54, "args": { "External id": 144114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144114, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 144114, "pid": 5, "tid": 7, "ts": 1716454223681138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624446, "dur": 11, "args": { "External id": 144114, "cbid": 211, "correlation": 144114 } }, { "ph": "s", "id": 144114, "pid": 76337, "tid": -914061504, "ts": 1716454223624446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624508, "dur": 1, "args": { "External id": 144125, "cbid": 251, "correlation": 144125 } }, { "ph": "f", "id": 144125, "pid": 76337, "tid": -914061504, "ts": 1716454223624508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624511, "dur": 0, "args": { "External id": 144126, "cbid": 251, "correlation": 144126 } }, { "ph": "f", "id": 144126, "pid": 76337, "tid": -914061504, "ts": 1716454223624511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681193, "dur": 54, "args": { "External id": 144127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144127, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 144127, "pid": 5, "tid": 7, "ts": 1716454223681193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624513, "dur": 11, "args": { "External id": 144127, "cbid": 211, "correlation": 144127 } }, { "ph": "s", "id": 144127, "pid": 76337, "tid": -914061504, "ts": 1716454223624513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223681248, "dur": 58, "args": { "External id": 144152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144152, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144152, "pid": 5, "tid": 7, "ts": 1716454223681248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624598, "dur": 13, "args": { "External id": 144152, "cbid": 211, "correlation": 144152 } }, { "ph": "s", "id": 144152, "pid": 76337, "tid": -914061504, "ts": 1716454223624598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223624698, "dur": 1, "args": { "External id": 144170, "cbid": 251, "correlation": 144170 } }, { "ph": "f", "id": 144170, "pid": 76337, "tid": -914061504, "ts": 1716454223624698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223681307, "dur": 64, "args": { "External id": 144172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144172, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 144172, "pid": 5, "tid": 7, "ts": 1716454223681307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624703, "dur": 13, "args": { "External id": 144172, "cbid": 211, "correlation": 144172 } }, { "ph": "s", "id": 144172, "pid": 76337, "tid": -914061504, "ts": 1716454223624703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223681372, "dur": 6, "args": { "External id": 144180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144180, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144180, "pid": 5, "tid": 7, "ts": 1716454223681372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624773, "dur": 13, "args": { "External id": 144180, "cbid": 211, "correlation": 144180 } }, { "ph": "s", "id": 144180, "pid": 76337, "tid": -914061504, "ts": 1716454223624773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223681379, "dur": 7, "args": { "External id": 144188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144188, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144188, "pid": 5, "tid": 7, "ts": 1716454223681379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624812, "dur": 8, "args": { "External id": 144188, "cbid": 211, "correlation": 144188 } }, { "ph": "s", "id": 144188, "pid": 76337, "tid": -914061504, "ts": 1716454223624812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681387, "dur": 8, "args": { "External id": 144199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144199, "pid": 5, "tid": 7, "ts": 1716454223681387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624883, "dur": 14, "args": { "External id": 144199, "cbid": 211, "correlation": 144199 } }, { "ph": "s", "id": 144199, "pid": 76337, "tid": -914061504, "ts": 1716454223624883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223681396, "dur": 8, "args": { "External id": 144221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144221, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 144221, "pid": 5, "tid": 7, "ts": 1716454223681396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223624915, "dur": 8, "args": { "External id": 144221, "cbid": 211, "correlation": 144221 } }, { "ph": "s", "id": 144221, "pid": 76337, "tid": -914061504, "ts": 1716454223624915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625011, "dur": 2, "args": { "External id": 144232, "cbid": 251, "correlation": 144232 } }, { "ph": "f", "id": 144232, "pid": 76337, "tid": -914061504, "ts": 1716454223625011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223681406, "dur": 1, "args": { "External id": 144233, "device": 5, "context": 1, "stream": 7, "correlation": 144233, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 144233, "pid": 5, "tid": 7, "ts": 1716454223681406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223625016, "dur": 11, "args": { "External id": 144233, "cbid": 51, "correlation": 144233 } }, { "ph": "s", "id": 144233, "pid": 76337, "tid": -914061504, "ts": 1716454223625016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681410, "dur": 37, "args": { "External id": 144234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144234, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 144234, "pid": 5, "tid": 7, "ts": 1716454223681410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625028, "dur": 13, "args": { "External id": 144234, "cbid": 211, "correlation": 144234 } }, { "ph": "s", "id": 144234, "pid": 76337, "tid": -914061504, "ts": 1716454223625028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625101, "dur": 1, "args": { "External id": 144245, "cbid": 251, "correlation": 144245 } }, { "ph": "f", "id": 144245, "pid": 76337, "tid": -914061504, "ts": 1716454223625101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625105, "dur": 0, "args": { "External id": 144246, "cbid": 251, "correlation": 144246 } }, { "ph": "f", "id": 144246, "pid": 76337, "tid": -914061504, "ts": 1716454223625105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223681448, "dur": 12, "args": { "External id": 144247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144247, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144247, "pid": 5, "tid": 7, "ts": 1716454223681448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625106, "dur": 12, "args": { "External id": 144247, "cbid": 211, "correlation": 144247 } }, { "ph": "s", "id": 144247, "pid": 76337, "tid": -914061504, "ts": 1716454223625106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223681461, "dur": 5, "args": { "External id": 144249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144249, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144249, "pid": 5, "tid": 7, "ts": 1716454223681461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625121, "dur": 6, "args": { "External id": 144249, "cbid": 211, "correlation": 144249 } }, { "ph": "s", "id": 144249, "pid": 76337, "tid": -914061504, "ts": 1716454223625121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625178, "dur": 1, "args": { "External id": 144260, "cbid": 251, "correlation": 144260 } }, { "ph": "f", "id": 144260, "pid": 76337, "tid": -914061504, "ts": 1716454223625178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625182, "dur": 0, "args": { "External id": 144261, "cbid": 251, "correlation": 144261 } }, { "ph": "f", "id": 144261, "pid": 76337, "tid": -914061504, "ts": 1716454223625182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223681467, "dur": 8, "args": { "External id": 144262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144262, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144262, "pid": 5, "tid": 7, "ts": 1716454223681467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625183, "dur": 12, "args": { "External id": 144262, "cbid": 211, "correlation": 144262 } }, { "ph": "s", "id": 144262, "pid": 76337, "tid": -914061504, "ts": 1716454223625183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223681476, "dur": 4, "args": { "External id": 144264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144264, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144264, "pid": 5, "tid": 7, "ts": 1716454223681476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625197, "dur": 5, "args": { "External id": 144264, "cbid": 211, "correlation": 144264 } }, { "ph": "s", "id": 144264, "pid": 76337, "tid": -914061504, "ts": 1716454223625197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223681481, "dur": 20, "args": { "External id": 144289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144289, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 144289, "pid": 5, "tid": 7, "ts": 1716454223681481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625274, "dur": 13, "args": { "External id": 144289, "cbid": 211, "correlation": 144289 } }, { "ph": "s", "id": 144289, "pid": 76337, "tid": -914061504, "ts": 1716454223625274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625374, "dur": 2, "args": { "External id": 144307, "cbid": 251, "correlation": 144307 } }, { "ph": "f", "id": 144307, "pid": 76337, "tid": -914061504, "ts": 1716454223625374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223681503, "dur": 1, "args": { "External id": 144309, "device": 5, "context": 1, "stream": 7, "correlation": 144309, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 144309, "pid": 5, "tid": 7, "ts": 1716454223681503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223625380, "dur": 12, "args": { "External id": 144309, "cbid": 51, "correlation": 144309 } }, { "ph": "s", "id": 144309, "pid": 76337, "tid": -914061504, "ts": 1716454223625380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681507, "dur": 36, "args": { "External id": 144310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144310, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 144310, "pid": 5, "tid": 7, "ts": 1716454223681507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625393, "dur": 12, "args": { "External id": 144310, "cbid": 211, "correlation": 144310 } }, { "ph": "s", "id": 144310, "pid": 76337, "tid": -914061504, "ts": 1716454223625393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223681545, "dur": 4, "args": { "External id": 144318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144318, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144318, "pid": 5, "tid": 7, "ts": 1716454223681545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625464, "dur": 12, "args": { "External id": 144318, "cbid": 211, "correlation": 144318 } }, { "ph": "s", "id": 144318, "pid": 76337, "tid": -914061504, "ts": 1716454223625464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681550, "dur": 8, "args": { "External id": 144326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144326, "pid": 5, "tid": 7, "ts": 1716454223681550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625505, "dur": 9, "args": { "External id": 144326, "cbid": 211, "correlation": 144326 } }, { "ph": "s", "id": 144326, "pid": 76337, "tid": -914061504, "ts": 1716454223625505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223681559, "dur": 8, "args": { "External id": 144348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144348, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 144348, "pid": 5, "tid": 7, "ts": 1716454223681559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625557, "dur": 10, "args": { "External id": 144348, "cbid": 211, "correlation": 144348 } }, { "ph": "s", "id": 144348, "pid": 76337, "tid": -914061504, "ts": 1716454223625557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625650, "dur": 1, "args": { "External id": 144364, "cbid": 251, "correlation": 144364 } }, { "ph": "f", "id": 144364, "pid": 76337, "tid": -914061504, "ts": 1716454223625650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625655, "dur": 0, "args": { "External id": 144366, "cbid": 251, "correlation": 144366 } }, { "ph": "f", "id": 144366, "pid": 76337, "tid": -914061504, "ts": 1716454223625655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223681569, "dur": 192, "args": { "External id": 144367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144367, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144367, "pid": 5, "tid": 7, "ts": 1716454223681569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625657, "dur": 13, "args": { "External id": 144367, "cbid": 211, "correlation": 144367 } }, { "ph": "s", "id": 144367, "pid": 76337, "tid": -914061504, "ts": 1716454223625657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681762, "dur": 20, "args": { "External id": 144375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144375, "pid": 5, "tid": 7, "ts": 1716454223681762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625723, "dur": 12, "args": { "External id": 144375, "cbid": 211, "correlation": 144375 } }, { "ph": "s", "id": 144375, "pid": 76337, "tid": -914061504, "ts": 1716454223625723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681784, "dur": 22, "args": { "External id": 144383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144383, "pid": 5, "tid": 7, "ts": 1716454223681784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625754, "dur": 8, "args": { "External id": 144383, "cbid": 211, "correlation": 144383 } }, { "ph": "s", "id": 144383, "pid": 76337, "tid": -914061504, "ts": 1716454223625754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223625835, "dur": 1, "args": { "External id": 144399, "cbid": 251, "correlation": 144399 } }, { "ph": "f", "id": 144399, "pid": 76337, "tid": -914061504, "ts": 1716454223625835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223681808, "dur": 1, "args": { "External id": 144401, "device": 5, "context": 1, "stream": 7, "correlation": 144401, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 144401, "pid": 5, "tid": 7, "ts": 1716454223681808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223625840, "dur": 10, "args": { "External id": 144401, "cbid": 51, "correlation": 144401 } }, { "ph": "s", "id": 144401, "pid": 76337, "tid": -914061504, "ts": 1716454223625840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223681812, "dur": 109, "args": { "External id": 144402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144402, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 144402, "pid": 5, "tid": 7, "ts": 1716454223681812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625851, "dur": 12, "args": { "External id": 144402, "cbid": 211, "correlation": 144402 } }, { "ph": "s", "id": 144402, "pid": 76337, "tid": -914061504, "ts": 1716454223625851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223681922, "dur": 5, "args": { "External id": 144410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144410, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144410, "pid": 5, "tid": 7, "ts": 1716454223681922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625894, "dur": 11, "args": { "External id": 144410, "cbid": 211, "correlation": 144410 } }, { "ph": "s", "id": 144410, "pid": 76337, "tid": -914061504, "ts": 1716454223625894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681929, "dur": 10, "args": { "External id": 144421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144421, "pid": 5, "tid": 7, "ts": 1716454223681929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223625964, "dur": 20, "args": { "External id": 144421, "cbid": 211, "correlation": 144421 } }, { "ph": "s", "id": 144421, "pid": 76337, "tid": -914061504, "ts": 1716454223625964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223626039, "dur": 0, "args": { "External id": 144433, "cbid": 317, "correlation": 144433 } }, { "ph": "f", "id": 144433, "pid": 76337, "tid": -914061504, "ts": 1716454223626039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223626040, "dur": 0, "args": { "External id": 144434, "cbid": 203, "correlation": 144434 } }, { "ph": "f", "id": 144434, "pid": 76337, "tid": -914061504, "ts": 1716454223626040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223626041, "dur": 0, "args": { "External id": 144435, "cbid": 205, "correlation": 144435 } }, { "ph": "f", "id": 144435, "pid": 76337, "tid": -914061504, "ts": 1716454223626041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223681939, "dur": 6, "args": { "External id": 144439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144439, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144439, "pid": 5, "tid": 7, "ts": 1716454223681939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626056, "dur": 12, "args": { "External id": 144439, "cbid": 211, "correlation": 144439 } }, { "ph": "s", "id": 144439, "pid": 76337, "tid": -914061504, "ts": 1716454223626056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223681947, "dur": 37, "args": { "External id": 144441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144441, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 144441, "pid": 5, "tid": 7, "ts": 1716454223681947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626075, "dur": 7, "args": { "External id": 144441, "cbid": 211, "correlation": 144441 } }, { "ph": "s", "id": 144441, "pid": 76337, "tid": -914061504, "ts": 1716454223626075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223681985, "dur": 6, "args": { "External id": 144443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144443, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144443, "pid": 5, "tid": 7, "ts": 1716454223681985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626086, "dur": 5, "args": { "External id": 144443, "cbid": 211, "correlation": 144443 } }, { "ph": "s", "id": 144443, "pid": 76337, "tid": -914061504, "ts": 1716454223626086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223681992, "dur": 7, "args": { "External id": 144449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144449, "pid": 5, "tid": 7, "ts": 1716454223681992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626114, "dur": 9, "args": { "External id": 144449, "cbid": 211, "correlation": 144449 } }, { "ph": "s", "id": 144449, "pid": 76337, "tid": -914061504, "ts": 1716454223626114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223682001, "dur": 5, "args": { "External id": 144457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144457, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144457, "pid": 5, "tid": 7, "ts": 1716454223682001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626145, "dur": 9, "args": { "External id": 144457, "cbid": 211, "correlation": 144457 } }, { "ph": "s", "id": 144457, "pid": 76337, "tid": -914061504, "ts": 1716454223626145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223682007, "dur": 11, "args": { "External id": 144477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144477, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144477, "pid": 5, "tid": 7, "ts": 1716454223682007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626217, "dur": 12, "args": { "External id": 144477, "cbid": 211, "correlation": 144477 } }, { "ph": "s", "id": 144477, "pid": 76337, "tid": -914061504, "ts": 1716454223626217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223682019, "dur": 4, "args": { "External id": 144489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144489, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 144489, "pid": 5, "tid": 7, "ts": 1716454223682019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626239, "dur": 6, "args": { "External id": 144489, "cbid": 211, "correlation": 144489 } }, { "ph": "s", "id": 144489, "pid": 76337, "tid": -914061504, "ts": 1716454223626239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223682025, "dur": 8, "args": { "External id": 144492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144492, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144492, "pid": 5, "tid": 7, "ts": 1716454223682025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626258, "dur": 6, "args": { "External id": 144492, "cbid": 211, "correlation": 144492 } }, { "ph": "s", "id": 144492, "pid": 76337, "tid": -914061504, "ts": 1716454223626258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223682035, "dur": 5, "args": { "External id": 144501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144501, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144501, "pid": 5, "tid": 7, "ts": 1716454223682035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626296, "dur": 11, "args": { "External id": 144501, "cbid": 211, "correlation": 144501 } }, { "ph": "s", "id": 144501, "pid": 76337, "tid": -914061504, "ts": 1716454223626296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223626349, "dur": 0, "args": { "External id": 144511, "cbid": 317, "correlation": 144511 } }, { "ph": "f", "id": 144511, "pid": 76337, "tid": -914061504, "ts": 1716454223626349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223626349, "dur": 0, "args": { "External id": 144512, "cbid": 203, "correlation": 144512 } }, { "ph": "f", "id": 144512, "pid": 76337, "tid": -914061504, "ts": 1716454223626349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223626350, "dur": 0, "args": { "External id": 144513, "cbid": 205, "correlation": 144513 } }, { "ph": "f", "id": 144513, "pid": 76337, "tid": -914061504, "ts": 1716454223626350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223682041, "dur": 5, "args": { "External id": 144517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144517, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144517, "pid": 5, "tid": 7, "ts": 1716454223682041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626364, "dur": 11, "args": { "External id": 144517, "cbid": 211, "correlation": 144517 } }, { "ph": "s", "id": 144517, "pid": 76337, "tid": -914061504, "ts": 1716454223626364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223682047, "dur": 163, "args": { "External id": 144519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144519, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144519, "pid": 5, "tid": 7, "ts": 1716454223682047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626378, "dur": 5, "args": { "External id": 144519, "cbid": 211, "correlation": 144519 } }, { "ph": "s", "id": 144519, "pid": 76337, "tid": -914061504, "ts": 1716454223626378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223682213, "dur": 1, "args": { "External id": 144521, "device": 5, "context": 1, "stream": 7, "correlation": 144521, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 144521, "pid": 5, "tid": 7, "ts": 1716454223682213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223626388, "dur": 6, "args": { "External id": 144521, "cbid": 51, "correlation": 144521 } }, { "ph": "s", "id": 144521, "pid": 76337, "tid": -914061504, "ts": 1716454223626388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223682217, "dur": 270, "args": { "External id": 144522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144522, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144522, "pid": 5, "tid": 7, "ts": 1716454223682217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626395, "dur": 7, "args": { "External id": 144522, "cbid": 211, "correlation": 144522 } }, { "ph": "s", "id": 144522, "pid": 76337, "tid": -914061504, "ts": 1716454223626395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223682488, "dur": 6, "args": { "External id": 144524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144524, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144524, "pid": 5, "tid": 7, "ts": 1716454223682488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626406, "dur": 5, "args": { "External id": 144524, "cbid": 211, "correlation": 144524 } }, { "ph": "s", "id": 144524, "pid": 76337, "tid": -914061504, "ts": 1716454223626406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223682495, "dur": 6, "args": { "External id": 144530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144530, "pid": 5, "tid": 7, "ts": 1716454223682495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626433, "dur": 8, "args": { "External id": 144530, "cbid": 211, "correlation": 144530 } }, { "ph": "s", "id": 144530, "pid": 76337, "tid": -914061504, "ts": 1716454223626433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223682503, "dur": 4, "args": { "External id": 144538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144538, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 144538, "pid": 5, "tid": 7, "ts": 1716454223682503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626478, "dur": 9, "args": { "External id": 144538, "cbid": 211, "correlation": 144538 } }, { "ph": "s", "id": 144538, "pid": 76337, "tid": -914061504, "ts": 1716454223626478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223626543, "dur": 1, "args": { "External id": 144554, "cbid": 251, "correlation": 144554 } }, { "ph": "f", "id": 144554, "pid": 76337, "tid": -914061504, "ts": 1716454223626543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223626549, "dur": 0, "args": { "External id": 144556, "cbid": 251, "correlation": 144556 } }, { "ph": "f", "id": 144556, "pid": 76337, "tid": -914061504, "ts": 1716454223626549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223682508, "dur": 13, "args": { "External id": 144557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144557, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144557, "pid": 5, "tid": 7, "ts": 1716454223682508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626551, "dur": 11, "args": { "External id": 144557, "cbid": 211, "correlation": 144557 } }, { "ph": "s", "id": 144557, "pid": 76337, "tid": -914061504, "ts": 1716454223626551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223682522, "dur": 5, "args": { "External id": 144559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144559, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144559, "pid": 5, "tid": 7, "ts": 1716454223682522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626564, "dur": 5, "args": { "External id": 144559, "cbid": 211, "correlation": 144559 } }, { "ph": "s", "id": 144559, "pid": 76337, "tid": -914061504, "ts": 1716454223626564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223682528, "dur": 6, "args": { "External id": 144569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144569, "pid": 5, "tid": 7, "ts": 1716454223682528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626621, "dur": 12, "args": { "External id": 144569, "cbid": 211, "correlation": 144569 } }, { "ph": "s", "id": 144569, "pid": 76337, "tid": -914061504, "ts": 1716454223626621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223682535, "dur": 10, "args": { "External id": 144589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144589, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144589, "pid": 5, "tid": 7, "ts": 1716454223682535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626687, "dur": 10, "args": { "External id": 144589, "cbid": 211, "correlation": 144589 } }, { "ph": "s", "id": 144589, "pid": 76337, "tid": -914061504, "ts": 1716454223626687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223682547, "dur": 4, "args": { "External id": 144601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144601, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 144601, "pid": 5, "tid": 7, "ts": 1716454223682547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626708, "dur": 6, "args": { "External id": 144601, "cbid": 211, "correlation": 144601 } }, { "ph": "s", "id": 144601, "pid": 76337, "tid": -914061504, "ts": 1716454223626708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223682552, "dur": 7, "args": { "External id": 144604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144604, "pid": 5, "tid": 7, "ts": 1716454223682552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626727, "dur": 6, "args": { "External id": 144604, "cbid": 211, "correlation": 144604 } }, { "ph": "s", "id": 144604, "pid": 76337, "tid": -914061504, "ts": 1716454223626727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223682560, "dur": 5, "args": { "External id": 144613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144613, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144613, "pid": 5, "tid": 7, "ts": 1716454223682560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626768, "dur": 9, "args": { "External id": 144613, "cbid": 211, "correlation": 144613 } }, { "ph": "s", "id": 144613, "pid": 76337, "tid": -914061504, "ts": 1716454223626768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223626829, "dur": 0, "args": { "External id": 144623, "cbid": 317, "correlation": 144623 } }, { "ph": "f", "id": 144623, "pid": 76337, "tid": -914061504, "ts": 1716454223626829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223626830, "dur": 0, "args": { "External id": 144624, "cbid": 203, "correlation": 144624 } }, { "ph": "f", "id": 144624, "pid": 76337, "tid": -914061504, "ts": 1716454223626830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223626831, "dur": 0, "args": { "External id": 144625, "cbid": 205, "correlation": 144625 } }, { "ph": "f", "id": 144625, "pid": 76337, "tid": -914061504, "ts": 1716454223626831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223682566, "dur": 5, "args": { "External id": 144629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144629, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144629, "pid": 5, "tid": 7, "ts": 1716454223682566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626845, "dur": 13, "args": { "External id": 144629, "cbid": 211, "correlation": 144629 } }, { "ph": "s", "id": 144629, "pid": 76337, "tid": -914061504, "ts": 1716454223626845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223682572, "dur": 164, "args": { "External id": 144631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144631, "pid": 5, "tid": 7, "ts": 1716454223682572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626860, "dur": 5, "args": { "External id": 144631, "cbid": 211, "correlation": 144631 } }, { "ph": "s", "id": 144631, "pid": 76337, "tid": -914061504, "ts": 1716454223626860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223682738, "dur": 1, "args": { "External id": 144633, "device": 5, "context": 1, "stream": 7, "correlation": 144633, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 144633, "pid": 5, "tid": 7, "ts": 1716454223682738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223626871, "dur": 6, "args": { "External id": 144633, "cbid": 51, "correlation": 144633 } }, { "ph": "s", "id": 144633, "pid": 76337, "tid": -914061504, "ts": 1716454223626871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223682742, "dur": 260, "args": { "External id": 144634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144634, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144634, "pid": 5, "tid": 7, "ts": 1716454223682742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626878, "dur": 6, "args": { "External id": 144634, "cbid": 211, "correlation": 144634 } }, { "ph": "s", "id": 144634, "pid": 76337, "tid": -914061504, "ts": 1716454223626878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683003, "dur": 6, "args": { "External id": 144636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144636, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144636, "pid": 5, "tid": 7, "ts": 1716454223683003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626888, "dur": 5, "args": { "External id": 144636, "cbid": 211, "correlation": 144636 } }, { "ph": "s", "id": 144636, "pid": 76337, "tid": -914061504, "ts": 1716454223626888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223683011, "dur": 6, "args": { "External id": 144642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144642, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144642, "pid": 5, "tid": 7, "ts": 1716454223683011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626917, "dur": 8, "args": { "External id": 144642, "cbid": 211, "correlation": 144642 } }, { "ph": "s", "id": 144642, "pid": 76337, "tid": -914061504, "ts": 1716454223626917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223683018, "dur": 5, "args": { "External id": 144650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144650, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144650, "pid": 5, "tid": 7, "ts": 1716454223683018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626949, "dur": 9, "args": { "External id": 144650, "cbid": 211, "correlation": 144650 } }, { "ph": "s", "id": 144650, "pid": 76337, "tid": -914061504, "ts": 1716454223626949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223683024, "dur": 4, "args": { "External id": 144658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144658, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144658, "pid": 5, "tid": 7, "ts": 1716454223683024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223626988, "dur": 9, "args": { "External id": 144658, "cbid": 211, "correlation": 144658 } }, { "ph": "s", "id": 144658, "pid": 76337, "tid": -914061504, "ts": 1716454223626988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223683030, "dur": 12, "args": { "External id": 144667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144667, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144667, "pid": 5, "tid": 7, "ts": 1716454223683030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223627079, "dur": 14, "args": { "External id": 144667, "cbid": 211, "correlation": 144667 } }, { "ph": "s", "id": 144667, "pid": 76337, "tid": -914061504, "ts": 1716454223627079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223683043, "dur": 12, "args": { "External id": 144687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144687, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144687, "pid": 5, "tid": 7, "ts": 1716454223683043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223627147, "dur": 12, "args": { "External id": 144687, "cbid": 211, "correlation": 144687 } }, { "ph": "s", "id": 144687, "pid": 76337, "tid": -914061504, "ts": 1716454223627147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223683057, "dur": 4, "args": { "External id": 144699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144699, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144699, "pid": 5, "tid": 7, "ts": 1716454223683057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223627169, "dur": 6, "args": { "External id": 144699, "cbid": 211, "correlation": 144699 } }, { "ph": "s", "id": 144699, "pid": 76337, "tid": -914061504, "ts": 1716454223627169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223683062, "dur": 10, "args": { "External id": 144702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144702, "pid": 5, "tid": 7, "ts": 1716454223683062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223627187, "dur": 7, "args": { "External id": 144702, "cbid": 211, "correlation": 144702 } }, { "ph": "s", "id": 144702, "pid": 76337, "tid": -914061504, "ts": 1716454223627187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223683074, "dur": 6, "args": { "External id": 144711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144711, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144711, "pid": 5, "tid": 7, "ts": 1716454223683074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630514, "dur": 37, "args": { "External id": 144711, "cbid": 211, "correlation": 144711 } }, { "ph": "s", "id": 144711, "pid": 76337, "tid": -914061504, "ts": 1716454223630514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223630680, "dur": 1, "args": { "External id": 144721, "cbid": 317, "correlation": 144721 } }, { "ph": "f", "id": 144721, "pid": 76337, "tid": -914061504, "ts": 1716454223630680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223630681, "dur": 1, "args": { "External id": 144722, "cbid": 203, "correlation": 144722 } }, { "ph": "f", "id": 144722, "pid": 76337, "tid": -914061504, "ts": 1716454223630681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223630683, "dur": 1, "args": { "External id": 144723, "cbid": 205, "correlation": 144723 } }, { "ph": "f", "id": 144723, "pid": 76337, "tid": -914061504, "ts": 1716454223630683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683081, "dur": 6, "args": { "External id": 144727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144727, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144727, "pid": 5, "tid": 7, "ts": 1716454223683081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630751, "dur": 16, "args": { "External id": 144727, "cbid": 211, "correlation": 144727 } }, { "ph": "s", "id": 144727, "pid": 76337, "tid": -914061504, "ts": 1716454223630751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683089, "dur": 320, "args": { "External id": 144729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144729, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144729, "pid": 5, "tid": 7, "ts": 1716454223683089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630770, "dur": 5, "args": { "External id": 144729, "cbid": 211, "correlation": 144729 } }, { "ph": "s", "id": 144729, "pid": 76337, "tid": -914061504, "ts": 1716454223630770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223683411, "dur": 1, "args": { "External id": 144731, "device": 5, "context": 1, "stream": 7, "correlation": 144731, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 144731, "pid": 5, "tid": 7, "ts": 1716454223683411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223630788, "dur": 16, "args": { "External id": 144731, "cbid": 51, "correlation": 144731 } }, { "ph": "s", "id": 144731, "pid": 76337, "tid": -914061504, "ts": 1716454223630788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223683415, "dur": 496, "args": { "External id": 144732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144732, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144732, "pid": 5, "tid": 7, "ts": 1716454223683415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630806, "dur": 9, "args": { "External id": 144732, "cbid": 211, "correlation": 144732 } }, { "ph": "s", "id": 144732, "pid": 76337, "tid": -914061504, "ts": 1716454223630806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683912, "dur": 5, "args": { "External id": 144734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144734, "pid": 5, "tid": 7, "ts": 1716454223683912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630822, "dur": 7, "args": { "External id": 144734, "cbid": 211, "correlation": 144734 } }, { "ph": "s", "id": 144734, "pid": 76337, "tid": -914061504, "ts": 1716454223630822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223683919, "dur": 6, "args": { "External id": 144740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144740, "pid": 5, "tid": 7, "ts": 1716454223683919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630912, "dur": 12, "args": { "External id": 144740, "cbid": 211, "correlation": 144740 } }, { "ph": "s", "id": 144740, "pid": 76337, "tid": -914061504, "ts": 1716454223630912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223683926, "dur": 3, "args": { "External id": 144748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144748, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 144748, "pid": 5, "tid": 7, "ts": 1716454223683926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223630982, "dur": 11, "args": { "External id": 144748, "cbid": 211, "correlation": 144748 } }, { "ph": "s", "id": 144748, "pid": 76337, "tid": -914061504, "ts": 1716454223630982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223631119, "dur": 4, "args": { "External id": 144764, "cbid": 251, "correlation": 144764 } }, { "ph": "f", "id": 144764, "pid": 76337, "tid": -914061504, "ts": 1716454223631119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223631128, "dur": 0, "args": { "External id": 144766, "cbid": 251, "correlation": 144766 } }, { "ph": "f", "id": 144766, "pid": 76337, "tid": -914061504, "ts": 1716454223631128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223683931, "dur": 10, "args": { "External id": 144767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144767, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144767, "pid": 5, "tid": 7, "ts": 1716454223683931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631132, "dur": 16, "args": { "External id": 144767, "cbid": 211, "correlation": 144767 } }, { "ph": "s", "id": 144767, "pid": 76337, "tid": -914061504, "ts": 1716454223631132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223683943, "dur": 5, "args": { "External id": 144769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144769, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144769, "pid": 5, "tid": 7, "ts": 1716454223683943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631153, "dur": 8, "args": { "External id": 144769, "cbid": 211, "correlation": 144769 } }, { "ph": "s", "id": 144769, "pid": 76337, "tid": -914061504, "ts": 1716454223631153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223683949, "dur": 6, "args": { "External id": 144779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144779, "pid": 5, "tid": 7, "ts": 1716454223683949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631241, "dur": 12, "args": { "External id": 144779, "cbid": 211, "correlation": 144779 } }, { "ph": "s", "id": 144779, "pid": 76337, "tid": -914061504, "ts": 1716454223631241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223683956, "dur": 9, "args": { "External id": 144799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144799, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144799, "pid": 5, "tid": 7, "ts": 1716454223683956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631349, "dur": 13, "args": { "External id": 144799, "cbid": 211, "correlation": 144799 } }, { "ph": "s", "id": 144799, "pid": 76337, "tid": -914061504, "ts": 1716454223631349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223683966, "dur": 4, "args": { "External id": 144811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144811, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 144811, "pid": 5, "tid": 7, "ts": 1716454223683966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631374, "dur": 8, "args": { "External id": 144811, "cbid": 211, "correlation": 144811 } }, { "ph": "s", "id": 144811, "pid": 76337, "tid": -914061504, "ts": 1716454223631374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223683972, "dur": 7, "args": { "External id": 144814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144814, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144814, "pid": 5, "tid": 7, "ts": 1716454223683972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631413, "dur": 7, "args": { "External id": 144814, "cbid": 211, "correlation": 144814 } }, { "ph": "s", "id": 144814, "pid": 76337, "tid": -914061504, "ts": 1716454223631413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223683980, "dur": 5, "args": { "External id": 144823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144823, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144823, "pid": 5, "tid": 7, "ts": 1716454223683980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631457, "dur": 9, "args": { "External id": 144823, "cbid": 211, "correlation": 144823 } }, { "ph": "s", "id": 144823, "pid": 76337, "tid": -914061504, "ts": 1716454223631457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223631535, "dur": 0, "args": { "External id": 144833, "cbid": 317, "correlation": 144833 } }, { "ph": "f", "id": 144833, "pid": 76337, "tid": -914061504, "ts": 1716454223631535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223631536, "dur": 1, "args": { "External id": 144834, "cbid": 203, "correlation": 144834 } }, { "ph": "f", "id": 144834, "pid": 76337, "tid": -914061504, "ts": 1716454223631536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223631538, "dur": 0, "args": { "External id": 144835, "cbid": 205, "correlation": 144835 } }, { "ph": "f", "id": 144835, "pid": 76337, "tid": -914061504, "ts": 1716454223631538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683986, "dur": 5, "args": { "External id": 144839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144839, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144839, "pid": 5, "tid": 7, "ts": 1716454223683986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631704, "dur": 13, "args": { "External id": 144839, "cbid": 211, "correlation": 144839 } }, { "ph": "s", "id": 144839, "pid": 76337, "tid": -914061504, "ts": 1716454223631704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223683992, "dur": 163, "args": { "External id": 144841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144841, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144841, "pid": 5, "tid": 7, "ts": 1716454223683992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631719, "dur": 5, "args": { "External id": 144841, "cbid": 211, "correlation": 144841 } }, { "ph": "s", "id": 144841, "pid": 76337, "tid": -914061504, "ts": 1716454223631719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223684157, "dur": 1, "args": { "External id": 144843, "device": 5, "context": 1, "stream": 7, "correlation": 144843, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 144843, "pid": 5, "tid": 7, "ts": 1716454223684157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223631733, "dur": 10, "args": { "External id": 144843, "cbid": 51, "correlation": 144843 } }, { "ph": "s", "id": 144843, "pid": 76337, "tid": -914061504, "ts": 1716454223631733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223684160, "dur": 260, "args": { "External id": 144844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144844, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144844, "pid": 5, "tid": 7, "ts": 1716454223684160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631744, "dur": 7, "args": { "External id": 144844, "cbid": 211, "correlation": 144844 } }, { "ph": "s", "id": 144844, "pid": 76337, "tid": -914061504, "ts": 1716454223631744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223684422, "dur": 5, "args": { "External id": 144846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144846, "pid": 5, "tid": 7, "ts": 1716454223684422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631756, "dur": 5, "args": { "External id": 144846, "cbid": 211, "correlation": 144846 } }, { "ph": "s", "id": 144846, "pid": 76337, "tid": -914061504, "ts": 1716454223631756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223684428, "dur": 6, "args": { "External id": 144852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144852, "pid": 5, "tid": 7, "ts": 1716454223684428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631787, "dur": 8, "args": { "External id": 144852, "cbid": 211, "correlation": 144852 } }, { "ph": "s", "id": 144852, "pid": 76337, "tid": -914061504, "ts": 1716454223631787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223631848, "dur": 0, "args": { "External id": 144862, "cbid": 317, "correlation": 144862 } }, { "ph": "f", "id": 144862, "pid": 76337, "tid": -914061504, "ts": 1716454223631848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223631849, "dur": 0, "args": { "External id": 144863, "cbid": 203, "correlation": 144863 } }, { "ph": "f", "id": 144863, "pid": 76337, "tid": -914061504, "ts": 1716454223631849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223631849, "dur": 0, "args": { "External id": 144864, "cbid": 205, "correlation": 144864 } }, { "ph": "f", "id": 144864, "pid": 76337, "tid": -914061504, "ts": 1716454223631849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223684436, "dur": 8, "args": { "External id": 144868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144868, "pid": 5, "tid": 7, "ts": 1716454223684436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631920, "dur": 12, "args": { "External id": 144868, "cbid": 211, "correlation": 144868 } }, { "ph": "s", "id": 144868, "pid": 76337, "tid": -914061504, "ts": 1716454223631920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223684445, "dur": 3, "args": { "External id": 144870, "device": 5, "context": 1, "stream": 7, "correlation": 144870, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 144870, "pid": 5, "tid": 7, "ts": 1716454223684445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223631939, "dur": 14, "args": { "External id": 144870, "cbid": 51, "correlation": 144870 } }, { "ph": "s", "id": 144870, "pid": 76337, "tid": -914061504, "ts": 1716454223631939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223684449, "dur": 97, "args": { "External id": 144871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144871, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 144871, "pid": 5, "tid": 7, "ts": 1716454223684449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631955, "dur": 9, "args": { "External id": 144871, "cbid": 211, "correlation": 144871 } }, { "ph": "s", "id": 144871, "pid": 76337, "tid": -914061504, "ts": 1716454223631955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223684548, "dur": 6, "args": { "External id": 144873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144873, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144873, "pid": 5, "tid": 7, "ts": 1716454223684548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223631967, "dur": 18, "args": { "External id": 144873, "cbid": 211, "correlation": 144873 } }, { "ph": "s", "id": 144873, "pid": 76337, "tid": -914061504, "ts": 1716454223631967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223684555, "dur": 6, "args": { "External id": 144879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144879, "pid": 5, "tid": 7, "ts": 1716454223684555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632009, "dur": 9, "args": { "External id": 144879, "cbid": 211, "correlation": 144879 } }, { "ph": "s", "id": 144879, "pid": 76337, "tid": -914061504, "ts": 1716454223632009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223684562, "dur": 5, "args": { "External id": 144887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144887, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144887, "pid": 5, "tid": 7, "ts": 1716454223684562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632042, "dur": 9, "args": { "External id": 144887, "cbid": 211, "correlation": 144887 } }, { "ph": "s", "id": 144887, "pid": 76337, "tid": -914061504, "ts": 1716454223632042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223684569, "dur": 4, "args": { "External id": 144895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144895, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144895, "pid": 5, "tid": 7, "ts": 1716454223684569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632092, "dur": 10, "args": { "External id": 144895, "cbid": 211, "correlation": 144895 } }, { "ph": "s", "id": 144895, "pid": 76337, "tid": -914061504, "ts": 1716454223632092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223684574, "dur": 11, "args": { "External id": 144904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144904, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144904, "pid": 5, "tid": 7, "ts": 1716454223684574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632166, "dur": 14, "args": { "External id": 144904, "cbid": 211, "correlation": 144904 } }, { "ph": "s", "id": 144904, "pid": 76337, "tid": -914061504, "ts": 1716454223632166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223684587, "dur": 12, "args": { "External id": 144924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144924, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 144924, "pid": 5, "tid": 7, "ts": 1716454223684587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632248, "dur": 11, "args": { "External id": 144924, "cbid": 211, "correlation": 144924 } }, { "ph": "s", "id": 144924, "pid": 76337, "tid": -914061504, "ts": 1716454223632248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223684600, "dur": 4, "args": { "External id": 144936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144936, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144936, "pid": 5, "tid": 7, "ts": 1716454223684600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632269, "dur": 7, "args": { "External id": 144936, "cbid": 211, "correlation": 144936 } }, { "ph": "s", "id": 144936, "pid": 76337, "tid": -914061504, "ts": 1716454223632269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223684606, "dur": 11, "args": { "External id": 144939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144939, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144939, "pid": 5, "tid": 7, "ts": 1716454223684606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632290, "dur": 6, "args": { "External id": 144939, "cbid": 211, "correlation": 144939 } }, { "ph": "s", "id": 144939, "pid": 76337, "tid": -914061504, "ts": 1716454223632290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223684617, "dur": 6, "args": { "External id": 144948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144948, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144948, "pid": 5, "tid": 7, "ts": 1716454223684617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632331, "dur": 9, "args": { "External id": 144948, "cbid": 211, "correlation": 144948 } }, { "ph": "s", "id": 144948, "pid": 76337, "tid": -914061504, "ts": 1716454223632331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223632385, "dur": 0, "args": { "External id": 144958, "cbid": 317, "correlation": 144958 } }, { "ph": "f", "id": 144958, "pid": 76337, "tid": -914061504, "ts": 1716454223632385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223632386, "dur": 0, "args": { "External id": 144959, "cbid": 203, "correlation": 144959 } }, { "ph": "f", "id": 144959, "pid": 76337, "tid": -914061504, "ts": 1716454223632386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223632387, "dur": 0, "args": { "External id": 144960, "cbid": 205, "correlation": 144960 } }, { "ph": "f", "id": 144960, "pid": 76337, "tid": -914061504, "ts": 1716454223632387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223684625, "dur": 7, "args": { "External id": 144964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144964, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144964, "pid": 5, "tid": 7, "ts": 1716454223684625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632410, "dur": 12, "args": { "External id": 144964, "cbid": 211, "correlation": 144964 } }, { "ph": "s", "id": 144964, "pid": 76337, "tid": -914061504, "ts": 1716454223632410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223684633, "dur": 321, "args": { "External id": 144966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144966, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144966, "pid": 5, "tid": 7, "ts": 1716454223684633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632425, "dur": 6, "args": { "External id": 144966, "cbid": 211, "correlation": 144966 } }, { "ph": "s", "id": 144966, "pid": 76337, "tid": -914061504, "ts": 1716454223632425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223684956, "dur": 1, "args": { "External id": 144968, "device": 5, "context": 1, "stream": 7, "correlation": 144968, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 144968, "pid": 5, "tid": 7, "ts": 1716454223684956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223632437, "dur": 7, "args": { "External id": 144968, "cbid": 51, "correlation": 144968 } }, { "ph": "s", "id": 144968, "pid": 76337, "tid": -914061504, "ts": 1716454223632437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223684960, "dur": 499, "args": { "External id": 144969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144969, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 144969, "pid": 5, "tid": 7, "ts": 1716454223684960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632445, "dur": 6, "args": { "External id": 144969, "cbid": 211, "correlation": 144969 } }, { "ph": "s", "id": 144969, "pid": 76337, "tid": -914061504, "ts": 1716454223632445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223685460, "dur": 5, "args": { "External id": 144971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144971, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 144971, "pid": 5, "tid": 7, "ts": 1716454223685460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632454, "dur": 5, "args": { "External id": 144971, "cbid": 211, "correlation": 144971 } }, { "ph": "s", "id": 144971, "pid": 76337, "tid": -914061504, "ts": 1716454223632454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223685467, "dur": 6, "args": { "External id": 144977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 144977, "pid": 5, "tid": 7, "ts": 1716454223685467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632483, "dur": 8, "args": { "External id": 144977, "cbid": 211, "correlation": 144977 } }, { "ph": "s", "id": 144977, "pid": 76337, "tid": -914061504, "ts": 1716454223632483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223685474, "dur": 4, "args": { "External id": 144985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 144985, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 144985, "pid": 5, "tid": 7, "ts": 1716454223685474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632526, "dur": 9, "args": { "External id": 144985, "cbid": 211, "correlation": 144985 } }, { "ph": "s", "id": 144985, "pid": 76337, "tid": -914061504, "ts": 1716454223632526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223632608, "dur": 2, "args": { "External id": 145001, "cbid": 251, "correlation": 145001 } }, { "ph": "f", "id": 145001, "pid": 76337, "tid": -914061504, "ts": 1716454223632608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223632614, "dur": 0, "args": { "External id": 145003, "cbid": 251, "correlation": 145003 } }, { "ph": "f", "id": 145003, "pid": 76337, "tid": -914061504, "ts": 1716454223632614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223685480, "dur": 12, "args": { "External id": 145004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145004, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145004, "pid": 5, "tid": 7, "ts": 1716454223685480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632616, "dur": 14, "args": { "External id": 145004, "cbid": 211, "correlation": 145004 } }, { "ph": "s", "id": 145004, "pid": 76337, "tid": -914061504, "ts": 1716454223632616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223685494, "dur": 5, "args": { "External id": 145006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145006, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145006, "pid": 5, "tid": 7, "ts": 1716454223685494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632633, "dur": 6, "args": { "External id": 145006, "cbid": 211, "correlation": 145006 } }, { "ph": "s", "id": 145006, "pid": 76337, "tid": -914061504, "ts": 1716454223632633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223685500, "dur": 6, "args": { "External id": 145016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145016, "pid": 5, "tid": 7, "ts": 1716454223685500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632693, "dur": 12, "args": { "External id": 145016, "cbid": 211, "correlation": 145016 } }, { "ph": "s", "id": 145016, "pid": 76337, "tid": -914061504, "ts": 1716454223632693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223685507, "dur": 9, "args": { "External id": 145036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145036, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145036, "pid": 5, "tid": 7, "ts": 1716454223685507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632759, "dur": 11, "args": { "External id": 145036, "cbid": 211, "correlation": 145036 } }, { "ph": "s", "id": 145036, "pid": 76337, "tid": -914061504, "ts": 1716454223632759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223685518, "dur": 4, "args": { "External id": 145048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145048, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 145048, "pid": 5, "tid": 7, "ts": 1716454223685518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632781, "dur": 6, "args": { "External id": 145048, "cbid": 211, "correlation": 145048 } }, { "ph": "s", "id": 145048, "pid": 76337, "tid": -914061504, "ts": 1716454223632781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223685524, "dur": 7, "args": { "External id": 145051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145051, "pid": 5, "tid": 7, "ts": 1716454223685524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632798, "dur": 6, "args": { "External id": 145051, "cbid": 211, "correlation": 145051 } }, { "ph": "s", "id": 145051, "pid": 76337, "tid": -914061504, "ts": 1716454223632798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223685532, "dur": 5, "args": { "External id": 145060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145060, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145060, "pid": 5, "tid": 7, "ts": 1716454223685532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632838, "dur": 9, "args": { "External id": 145060, "cbid": 211, "correlation": 145060 } }, { "ph": "s", "id": 145060, "pid": 76337, "tid": -914061504, "ts": 1716454223632838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223632903, "dur": 0, "args": { "External id": 145070, "cbid": 317, "correlation": 145070 } }, { "ph": "f", "id": 145070, "pid": 76337, "tid": -914061504, "ts": 1716454223632903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223632904, "dur": 0, "args": { "External id": 145071, "cbid": 203, "correlation": 145071 } }, { "ph": "f", "id": 145071, "pid": 76337, "tid": -914061504, "ts": 1716454223632904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223632905, "dur": 0, "args": { "External id": 145072, "cbid": 205, "correlation": 145072 } }, { "ph": "f", "id": 145072, "pid": 76337, "tid": -914061504, "ts": 1716454223632905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223685537, "dur": 5, "args": { "External id": 145076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145076, "pid": 5, "tid": 7, "ts": 1716454223685537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632920, "dur": 13, "args": { "External id": 145076, "cbid": 211, "correlation": 145076 } }, { "ph": "s", "id": 145076, "pid": 76337, "tid": -914061504, "ts": 1716454223632920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223685544, "dur": 162, "args": { "External id": 145078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145078, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145078, "pid": 5, "tid": 7, "ts": 1716454223685544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632935, "dur": 5, "args": { "External id": 145078, "cbid": 211, "correlation": 145078 } }, { "ph": "s", "id": 145078, "pid": 76337, "tid": -914061504, "ts": 1716454223632935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223685708, "dur": 1, "args": { "External id": 145080, "device": 5, "context": 1, "stream": 7, "correlation": 145080, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 145080, "pid": 5, "tid": 7, "ts": 1716454223685708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223632945, "dur": 6, "args": { "External id": 145080, "cbid": 51, "correlation": 145080 } }, { "ph": "s", "id": 145080, "pid": 76337, "tid": -914061504, "ts": 1716454223632945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223685712, "dur": 260, "args": { "External id": 145081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145081, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145081, "pid": 5, "tid": 7, "ts": 1716454223685712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632952, "dur": 6, "args": { "External id": 145081, "cbid": 211, "correlation": 145081 } }, { "ph": "s", "id": 145081, "pid": 76337, "tid": -914061504, "ts": 1716454223632952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223685973, "dur": 6, "args": { "External id": 145083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145083, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145083, "pid": 5, "tid": 7, "ts": 1716454223685973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223632962, "dur": 5, "args": { "External id": 145083, "cbid": 211, "correlation": 145083 } }, { "ph": "s", "id": 145083, "pid": 76337, "tid": -914061504, "ts": 1716454223632962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223685980, "dur": 6, "args": { "External id": 145089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145089, "pid": 5, "tid": 7, "ts": 1716454223685980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633000, "dur": 9, "args": { "External id": 145089, "cbid": 211, "correlation": 145089 } }, { "ph": "s", "id": 145089, "pid": 76337, "tid": -914061504, "ts": 1716454223633000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223633061, "dur": 0, "args": { "External id": 145099, "cbid": 317, "correlation": 145099 } }, { "ph": "f", "id": 145099, "pid": 76337, "tid": -914061504, "ts": 1716454223633061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223633061, "dur": 0, "args": { "External id": 145100, "cbid": 203, "correlation": 145100 } }, { "ph": "f", "id": 145100, "pid": 76337, "tid": -914061504, "ts": 1716454223633061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223633062, "dur": 0, "args": { "External id": 145101, "cbid": 205, "correlation": 145101 } }, { "ph": "f", "id": 145101, "pid": 76337, "tid": -914061504, "ts": 1716454223633062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223685987, "dur": 8, "args": { "External id": 145105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145105, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145105, "pid": 5, "tid": 7, "ts": 1716454223685987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633077, "dur": 11, "args": { "External id": 145105, "cbid": 211, "correlation": 145105 } }, { "ph": "s", "id": 145105, "pid": 76337, "tid": -914061504, "ts": 1716454223633077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223685997, "dur": 3, "args": { "External id": 145107, "device": 5, "context": 1, "stream": 7, "correlation": 145107, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 145107, "pid": 5, "tid": 7, "ts": 1716454223685997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223633093, "dur": 10, "args": { "External id": 145107, "cbid": 51, "correlation": 145107 } }, { "ph": "s", "id": 145107, "pid": 76337, "tid": -914061504, "ts": 1716454223633093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223686001, "dur": 95, "args": { "External id": 145108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145108, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 145108, "pid": 5, "tid": 7, "ts": 1716454223686001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633104, "dur": 6, "args": { "External id": 145108, "cbid": 211, "correlation": 145108 } }, { "ph": "s", "id": 145108, "pid": 76337, "tid": -914061504, "ts": 1716454223633104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223686098, "dur": 5, "args": { "External id": 145110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145110, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145110, "pid": 5, "tid": 7, "ts": 1716454223686098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633113, "dur": 5, "args": { "External id": 145110, "cbid": 211, "correlation": 145110 } }, { "ph": "s", "id": 145110, "pid": 76337, "tid": -914061504, "ts": 1716454223633113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223686105, "dur": 6, "args": { "External id": 145116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145116, "pid": 5, "tid": 7, "ts": 1716454223686105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633140, "dur": 8, "args": { "External id": 145116, "cbid": 211, "correlation": 145116 } }, { "ph": "s", "id": 145116, "pid": 76337, "tid": -914061504, "ts": 1716454223633140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223686112, "dur": 5, "args": { "External id": 145124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145124, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145124, "pid": 5, "tid": 7, "ts": 1716454223686112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633169, "dur": 9, "args": { "External id": 145124, "cbid": 211, "correlation": 145124 } }, { "ph": "s", "id": 145124, "pid": 76337, "tid": -914061504, "ts": 1716454223633169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223686118, "dur": 4, "args": { "External id": 145132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145132, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145132, "pid": 5, "tid": 7, "ts": 1716454223686118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633199, "dur": 8, "args": { "External id": 145132, "cbid": 211, "correlation": 145132 } }, { "ph": "s", "id": 145132, "pid": 76337, "tid": -914061504, "ts": 1716454223633199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223686124, "dur": 11, "args": { "External id": 145141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145141, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145141, "pid": 5, "tid": 7, "ts": 1716454223686124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633243, "dur": 10, "args": { "External id": 145141, "cbid": 211, "correlation": 145141 } }, { "ph": "s", "id": 145141, "pid": 76337, "tid": -914061504, "ts": 1716454223633243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223686137, "dur": 13, "args": { "External id": 145161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145161, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145161, "pid": 5, "tid": 7, "ts": 1716454223686137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633313, "dur": 11, "args": { "External id": 145161, "cbid": 211, "correlation": 145161 } }, { "ph": "s", "id": 145161, "pid": 76337, "tid": -914061504, "ts": 1716454223633313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223686150, "dur": 4, "args": { "External id": 145173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145173, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145173, "pid": 5, "tid": 7, "ts": 1716454223686150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633334, "dur": 6, "args": { "External id": 145173, "cbid": 211, "correlation": 145173 } }, { "ph": "s", "id": 145173, "pid": 76337, "tid": -914061504, "ts": 1716454223633334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223686156, "dur": 10, "args": { "External id": 145176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145176, "pid": 5, "tid": 7, "ts": 1716454223686156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633352, "dur": 7, "args": { "External id": 145176, "cbid": 211, "correlation": 145176 } }, { "ph": "s", "id": 145176, "pid": 76337, "tid": -914061504, "ts": 1716454223633352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223686168, "dur": 6, "args": { "External id": 145185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145185, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145185, "pid": 5, "tid": 7, "ts": 1716454223686168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633390, "dur": 9, "args": { "External id": 145185, "cbid": 211, "correlation": 145185 } }, { "ph": "s", "id": 145185, "pid": 76337, "tid": -914061504, "ts": 1716454223633390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223633442, "dur": 0, "args": { "External id": 145195, "cbid": 317, "correlation": 145195 } }, { "ph": "f", "id": 145195, "pid": 76337, "tid": -914061504, "ts": 1716454223633442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223633443, "dur": 0, "args": { "External id": 145196, "cbid": 203, "correlation": 145196 } }, { "ph": "f", "id": 145196, "pid": 76337, "tid": -914061504, "ts": 1716454223633443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223633443, "dur": 0, "args": { "External id": 145197, "cbid": 205, "correlation": 145197 } }, { "ph": "f", "id": 145197, "pid": 76337, "tid": -914061504, "ts": 1716454223633443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223686175, "dur": 7, "args": { "External id": 145201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145201, "pid": 5, "tid": 7, "ts": 1716454223686175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633458, "dur": 11, "args": { "External id": 145201, "cbid": 211, "correlation": 145201 } }, { "ph": "s", "id": 145201, "pid": 76337, "tid": -914061504, "ts": 1716454223633458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223686183, "dur": 319, "args": { "External id": 145203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145203, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145203, "pid": 5, "tid": 7, "ts": 1716454223686183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633471, "dur": 6, "args": { "External id": 145203, "cbid": 211, "correlation": 145203 } }, { "ph": "s", "id": 145203, "pid": 76337, "tid": -914061504, "ts": 1716454223633471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223686504, "dur": 1, "args": { "External id": 145205, "device": 5, "context": 1, "stream": 7, "correlation": 145205, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 145205, "pid": 5, "tid": 7, "ts": 1716454223686504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223633483, "dur": 6, "args": { "External id": 145205, "cbid": 51, "correlation": 145205 } }, { "ph": "s", "id": 145205, "pid": 76337, "tid": -914061504, "ts": 1716454223633483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223686508, "dur": 496, "args": { "External id": 145206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145206, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145206, "pid": 5, "tid": 7, "ts": 1716454223686508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633490, "dur": 6, "args": { "External id": 145206, "cbid": 211, "correlation": 145206 } }, { "ph": "s", "id": 145206, "pid": 76337, "tid": -914061504, "ts": 1716454223633490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687006, "dur": 5, "args": { "External id": 145208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145208, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145208, "pid": 5, "tid": 7, "ts": 1716454223687006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633500, "dur": 5, "args": { "External id": 145208, "cbid": 211, "correlation": 145208 } }, { "ph": "s", "id": 145208, "pid": 76337, "tid": -914061504, "ts": 1716454223633500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223687013, "dur": 7, "args": { "External id": 145214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145214, "pid": 5, "tid": 7, "ts": 1716454223687013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633528, "dur": 9, "args": { "External id": 145214, "cbid": 211, "correlation": 145214 } }, { "ph": "s", "id": 145214, "pid": 76337, "tid": -914061504, "ts": 1716454223633528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223687021, "dur": 4, "args": { "External id": 145222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145222, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 145222, "pid": 5, "tid": 7, "ts": 1716454223687021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633572, "dur": 9, "args": { "External id": 145222, "cbid": 211, "correlation": 145222 } }, { "ph": "s", "id": 145222, "pid": 76337, "tid": -914061504, "ts": 1716454223633572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223633635, "dur": 1, "args": { "External id": 145238, "cbid": 251, "correlation": 145238 } }, { "ph": "f", "id": 145238, "pid": 76337, "tid": -914061504, "ts": 1716454223633635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223633640, "dur": 0, "args": { "External id": 145240, "cbid": 251, "correlation": 145240 } }, { "ph": "f", "id": 145240, "pid": 76337, "tid": -914061504, "ts": 1716454223633640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223687026, "dur": 13, "args": { "External id": 145241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145241, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145241, "pid": 5, "tid": 7, "ts": 1716454223687026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633642, "dur": 12, "args": { "External id": 145241, "cbid": 211, "correlation": 145241 } }, { "ph": "s", "id": 145241, "pid": 76337, "tid": -914061504, "ts": 1716454223633642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223687040, "dur": 5, "args": { "External id": 145243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145243, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145243, "pid": 5, "tid": 7, "ts": 1716454223687040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633656, "dur": 5, "args": { "External id": 145243, "cbid": 211, "correlation": 145243 } }, { "ph": "s", "id": 145243, "pid": 76337, "tid": -914061504, "ts": 1716454223633656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223687047, "dur": 6, "args": { "External id": 145253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145253, "pid": 5, "tid": 7, "ts": 1716454223687047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633712, "dur": 12, "args": { "External id": 145253, "cbid": 211, "correlation": 145253 } }, { "ph": "s", "id": 145253, "pid": 76337, "tid": -914061504, "ts": 1716454223633712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223687054, "dur": 9, "args": { "External id": 145273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145273, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145273, "pid": 5, "tid": 7, "ts": 1716454223687054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633777, "dur": 11, "args": { "External id": 145273, "cbid": 211, "correlation": 145273 } }, { "ph": "s", "id": 145273, "pid": 76337, "tid": -914061504, "ts": 1716454223633777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223687064, "dur": 4, "args": { "External id": 145285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145285, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 145285, "pid": 5, "tid": 7, "ts": 1716454223687064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633798, "dur": 6, "args": { "External id": 145285, "cbid": 211, "correlation": 145285 } }, { "ph": "s", "id": 145285, "pid": 76337, "tid": -914061504, "ts": 1716454223633798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223687070, "dur": 7, "args": { "External id": 145288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145288, "pid": 5, "tid": 7, "ts": 1716454223687070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633816, "dur": 6, "args": { "External id": 145288, "cbid": 211, "correlation": 145288 } }, { "ph": "s", "id": 145288, "pid": 76337, "tid": -914061504, "ts": 1716454223633816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223687078, "dur": 5, "args": { "External id": 145297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145297, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145297, "pid": 5, "tid": 7, "ts": 1716454223687078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633856, "dur": 9, "args": { "External id": 145297, "cbid": 211, "correlation": 145297 } }, { "ph": "s", "id": 145297, "pid": 76337, "tid": -914061504, "ts": 1716454223633856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223633919, "dur": 0, "args": { "External id": 145307, "cbid": 317, "correlation": 145307 } }, { "ph": "f", "id": 145307, "pid": 76337, "tid": -914061504, "ts": 1716454223633919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223633920, "dur": 0, "args": { "External id": 145308, "cbid": 203, "correlation": 145308 } }, { "ph": "f", "id": 145308, "pid": 76337, "tid": -914061504, "ts": 1716454223633920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223633921, "dur": 0, "args": { "External id": 145309, "cbid": 205, "correlation": 145309 } }, { "ph": "f", "id": 145309, "pid": 76337, "tid": -914061504, "ts": 1716454223633921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687084, "dur": 5, "args": { "External id": 145313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145313, "pid": 5, "tid": 7, "ts": 1716454223687084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633934, "dur": 13, "args": { "External id": 145313, "cbid": 211, "correlation": 145313 } }, { "ph": "s", "id": 145313, "pid": 76337, "tid": -914061504, "ts": 1716454223633934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687090, "dur": 162, "args": { "External id": 145315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145315, "pid": 5, "tid": 7, "ts": 1716454223687090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633949, "dur": 5, "args": { "External id": 145315, "cbid": 211, "correlation": 145315 } }, { "ph": "s", "id": 145315, "pid": 76337, "tid": -914061504, "ts": 1716454223633949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223687255, "dur": 1, "args": { "External id": 145317, "device": 5, "context": 1, "stream": 7, "correlation": 145317, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 145317, "pid": 5, "tid": 7, "ts": 1716454223687255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223633960, "dur": 6, "args": { "External id": 145317, "cbid": 51, "correlation": 145317 } }, { "ph": "s", "id": 145317, "pid": 76337, "tid": -914061504, "ts": 1716454223633960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223687258, "dur": 260, "args": { "External id": 145318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145318, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145318, "pid": 5, "tid": 7, "ts": 1716454223687258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633967, "dur": 14, "args": { "External id": 145318, "cbid": 211, "correlation": 145318 } }, { "ph": "s", "id": 145318, "pid": 76337, "tid": -914061504, "ts": 1716454223633967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687520, "dur": 6, "args": { "External id": 145320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145320, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145320, "pid": 5, "tid": 7, "ts": 1716454223687520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223633985, "dur": 6, "args": { "External id": 145320, "cbid": 211, "correlation": 145320 } }, { "ph": "s", "id": 145320, "pid": 76337, "tid": -914061504, "ts": 1716454223633985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223687527, "dur": 6, "args": { "External id": 145326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145326, "pid": 5, "tid": 7, "ts": 1716454223687527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634015, "dur": 8, "args": { "External id": 145326, "cbid": 211, "correlation": 145326 } }, { "ph": "s", "id": 145326, "pid": 76337, "tid": -914061504, "ts": 1716454223634015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223634074, "dur": 0, "args": { "External id": 145336, "cbid": 317, "correlation": 145336 } }, { "ph": "f", "id": 145336, "pid": 76337, "tid": -914061504, "ts": 1716454223634074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223634074, "dur": 0, "args": { "External id": 145337, "cbid": 203, "correlation": 145337 } }, { "ph": "f", "id": 145337, "pid": 76337, "tid": -914061504, "ts": 1716454223634074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223634075, "dur": 0, "args": { "External id": 145338, "cbid": 205, "correlation": 145338 } }, { "ph": "f", "id": 145338, "pid": 76337, "tid": -914061504, "ts": 1716454223634075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687535, "dur": 8, "args": { "External id": 145342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145342, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145342, "pid": 5, "tid": 7, "ts": 1716454223687535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634087, "dur": 13, "args": { "External id": 145342, "cbid": 211, "correlation": 145342 } }, { "ph": "s", "id": 145342, "pid": 76337, "tid": -914061504, "ts": 1716454223634087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223687544, "dur": 3, "args": { "External id": 145344, "device": 5, "context": 1, "stream": 7, "correlation": 145344, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 145344, "pid": 5, "tid": 7, "ts": 1716454223687544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223634105, "dur": 9, "args": { "External id": 145344, "cbid": 51, "correlation": 145344 } }, { "ph": "s", "id": 145344, "pid": 76337, "tid": -914061504, "ts": 1716454223634105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223687548, "dur": 93, "args": { "External id": 145345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145345, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 145345, "pid": 5, "tid": 7, "ts": 1716454223687548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634115, "dur": 6, "args": { "External id": 145345, "cbid": 211, "correlation": 145345 } }, { "ph": "s", "id": 145345, "pid": 76337, "tid": -914061504, "ts": 1716454223634115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687642, "dur": 5, "args": { "External id": 145347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145347, "pid": 5, "tid": 7, "ts": 1716454223687642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634124, "dur": 5, "args": { "External id": 145347, "cbid": 211, "correlation": 145347 } }, { "ph": "s", "id": 145347, "pid": 76337, "tid": -914061504, "ts": 1716454223634124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223687649, "dur": 6, "args": { "External id": 145353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145353, "pid": 5, "tid": 7, "ts": 1716454223687649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634152, "dur": 8, "args": { "External id": 145353, "cbid": 211, "correlation": 145353 } }, { "ph": "s", "id": 145353, "pid": 76337, "tid": -914061504, "ts": 1716454223634152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223687656, "dur": 5, "args": { "External id": 145361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145361, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145361, "pid": 5, "tid": 7, "ts": 1716454223687656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634180, "dur": 9, "args": { "External id": 145361, "cbid": 211, "correlation": 145361 } }, { "ph": "s", "id": 145361, "pid": 76337, "tid": -914061504, "ts": 1716454223634180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223687663, "dur": 4, "args": { "External id": 145369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145369, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 145369, "pid": 5, "tid": 7, "ts": 1716454223687663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634209, "dur": 8, "args": { "External id": 145369, "cbid": 211, "correlation": 145369 } }, { "ph": "s", "id": 145369, "pid": 76337, "tid": -914061504, "ts": 1716454223634209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223687669, "dur": 14, "args": { "External id": 145380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145380, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145380, "pid": 5, "tid": 7, "ts": 1716454223687669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634300, "dur": 12, "args": { "External id": 145380, "cbid": 211, "correlation": 145380 } }, { "ph": "s", "id": 145380, "pid": 76337, "tid": -914061504, "ts": 1716454223634300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223634356, "dur": 0, "args": { "External id": 145390, "cbid": 317, "correlation": 145390 } }, { "ph": "f", "id": 145390, "pid": 76337, "tid": -914061504, "ts": 1716454223634356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223634357, "dur": 0, "args": { "External id": 145391, "cbid": 203, "correlation": 145391 } }, { "ph": "f", "id": 145391, "pid": 76337, "tid": -914061504, "ts": 1716454223634357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223634358, "dur": 0, "args": { "External id": 145392, "cbid": 205, "correlation": 145392 } }, { "ph": "f", "id": 145392, "pid": 76337, "tid": -914061504, "ts": 1716454223634358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687684, "dur": 9, "args": { "External id": 145396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145396, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145396, "pid": 5, "tid": 7, "ts": 1716454223687684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634373, "dur": 12, "args": { "External id": 145396, "cbid": 211, "correlation": 145396 } }, { "ph": "s", "id": 145396, "pid": 76337, "tid": -914061504, "ts": 1716454223634373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223687694, "dur": 164, "args": { "External id": 145398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145398, "pid": 5, "tid": 7, "ts": 1716454223687694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634388, "dur": 5, "args": { "External id": 145398, "cbid": 211, "correlation": 145398 } }, { "ph": "s", "id": 145398, "pid": 76337, "tid": -914061504, "ts": 1716454223634388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223687860, "dur": 1, "args": { "External id": 145400, "device": 5, "context": 1, "stream": 7, "correlation": 145400, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 145400, "pid": 5, "tid": 7, "ts": 1716454223687860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223634399, "dur": 6, "args": { "External id": 145400, "cbid": 51, "correlation": 145400 } }, { "ph": "s", "id": 145400, "pid": 76337, "tid": -914061504, "ts": 1716454223634399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223687863, "dur": 650, "args": { "External id": 145401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145401, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145401, "pid": 5, "tid": 7, "ts": 1716454223687863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634407, "dur": 6, "args": { "External id": 145401, "cbid": 211, "correlation": 145401 } }, { "ph": "s", "id": 145401, "pid": 76337, "tid": -914061504, "ts": 1716454223634407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223688514, "dur": 12, "args": { "External id": 145403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145403, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145403, "pid": 5, "tid": 7, "ts": 1716454223688514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634417, "dur": 5, "args": { "External id": 145403, "cbid": 211, "correlation": 145403 } }, { "ph": "s", "id": 145403, "pid": 76337, "tid": -914061504, "ts": 1716454223634417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223688528, "dur": 14, "args": { "External id": 145409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145409, "pid": 5, "tid": 7, "ts": 1716454223688528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634445, "dur": 8, "args": { "External id": 145409, "cbid": 211, "correlation": 145409 } }, { "ph": "s", "id": 145409, "pid": 76337, "tid": -914061504, "ts": 1716454223634445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223688543, "dur": 31, "args": { "External id": 145418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145418, "pid": 5, "tid": 7, "ts": 1716454223688543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634557, "dur": 12, "args": { "External id": 145418, "cbid": 211, "correlation": 145418 } }, { "ph": "s", "id": 145418, "pid": 76337, "tid": -914061504, "ts": 1716454223634557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223688575, "dur": 30, "args": { "External id": 145438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145438, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145438, "pid": 5, "tid": 7, "ts": 1716454223688575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634627, "dur": 13, "args": { "External id": 145438, "cbid": 211, "correlation": 145438 } }, { "ph": "s", "id": 145438, "pid": 76337, "tid": -914061504, "ts": 1716454223634627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223688607, "dur": 4, "args": { "External id": 145450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145450, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145450, "pid": 5, "tid": 7, "ts": 1716454223688607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634649, "dur": 6, "args": { "External id": 145450, "cbid": 211, "correlation": 145450 } }, { "ph": "s", "id": 145450, "pid": 76337, "tid": -914061504, "ts": 1716454223634649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223688612, "dur": 30, "args": { "External id": 145453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145453, "pid": 5, "tid": 7, "ts": 1716454223688612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634668, "dur": 6, "args": { "External id": 145453, "cbid": 211, "correlation": 145453 } }, { "ph": "s", "id": 145453, "pid": 76337, "tid": -914061504, "ts": 1716454223634668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223688644, "dur": 20, "args": { "External id": 145462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145462, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145462, "pid": 5, "tid": 7, "ts": 1716454223688644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634706, "dur": 9, "args": { "External id": 145462, "cbid": 211, "correlation": 145462 } }, { "ph": "s", "id": 145462, "pid": 76337, "tid": -914061504, "ts": 1716454223634706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223634758, "dur": 0, "args": { "External id": 145472, "cbid": 317, "correlation": 145472 } }, { "ph": "f", "id": 145472, "pid": 76337, "tid": -914061504, "ts": 1716454223634758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223634759, "dur": 0, "args": { "External id": 145473, "cbid": 203, "correlation": 145473 } }, { "ph": "f", "id": 145473, "pid": 76337, "tid": -914061504, "ts": 1716454223634759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223634760, "dur": 0, "args": { "External id": 145474, "cbid": 205, "correlation": 145474 } }, { "ph": "f", "id": 145474, "pid": 76337, "tid": -914061504, "ts": 1716454223634760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223688665, "dur": 22, "args": { "External id": 145478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145478, "pid": 5, "tid": 7, "ts": 1716454223688665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634774, "dur": 11, "args": { "External id": 145478, "cbid": 211, "correlation": 145478 } }, { "ph": "s", "id": 145478, "pid": 76337, "tid": -914061504, "ts": 1716454223634774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223688688, "dur": 321, "args": { "External id": 145480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145480, "pid": 5, "tid": 7, "ts": 1716454223688688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634788, "dur": 5, "args": { "External id": 145480, "cbid": 211, "correlation": 145480 } }, { "ph": "s", "id": 145480, "pid": 76337, "tid": -914061504, "ts": 1716454223634788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223689011, "dur": 1, "args": { "External id": 145482, "device": 5, "context": 1, "stream": 7, "correlation": 145482, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 145482, "pid": 5, "tid": 7, "ts": 1716454223689011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223634799, "dur": 6, "args": { "External id": 145482, "cbid": 51, "correlation": 145482 } }, { "ph": "s", "id": 145482, "pid": 76337, "tid": -914061504, "ts": 1716454223634799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223689015, "dur": 1244, "args": { "External id": 145483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145483, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145483, "pid": 5, "tid": 7, "ts": 1716454223689015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634806, "dur": 6, "args": { "External id": 145483, "cbid": 211, "correlation": 145483 } }, { "ph": "s", "id": 145483, "pid": 76337, "tid": -914061504, "ts": 1716454223634806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223690260, "dur": 12, "args": { "External id": 145485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145485, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145485, "pid": 5, "tid": 7, "ts": 1716454223690260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634815, "dur": 5, "args": { "External id": 145485, "cbid": 211, "correlation": 145485 } }, { "ph": "s", "id": 145485, "pid": 76337, "tid": -914061504, "ts": 1716454223634815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223690273, "dur": 15, "args": { "External id": 145491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145491, "pid": 5, "tid": 7, "ts": 1716454223690273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634844, "dur": 9, "args": { "External id": 145491, "cbid": 211, "correlation": 145491 } }, { "ph": "s", "id": 145491, "pid": 76337, "tid": -914061504, "ts": 1716454223634844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223690289, "dur": 4, "args": { "External id": 145499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145499, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 145499, "pid": 5, "tid": 7, "ts": 1716454223690289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634889, "dur": 10, "args": { "External id": 145499, "cbid": 211, "correlation": 145499 } }, { "ph": "s", "id": 145499, "pid": 76337, "tid": -914061504, "ts": 1716454223634889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223634951, "dur": 1, "args": { "External id": 145515, "cbid": 251, "correlation": 145515 } }, { "ph": "f", "id": 145515, "pid": 76337, "tid": -914061504, "ts": 1716454223634951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223634957, "dur": 0, "args": { "External id": 145517, "cbid": 251, "correlation": 145517 } }, { "ph": "f", "id": 145517, "pid": 76337, "tid": -914061504, "ts": 1716454223634957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223690295, "dur": 12, "args": { "External id": 145518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145518, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145518, "pid": 5, "tid": 7, "ts": 1716454223690295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634959, "dur": 11, "args": { "External id": 145518, "cbid": 211, "correlation": 145518 } }, { "ph": "s", "id": 145518, "pid": 76337, "tid": -914061504, "ts": 1716454223634959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223690308, "dur": 5, "args": { "External id": 145520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145520, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145520, "pid": 5, "tid": 7, "ts": 1716454223690308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223634971, "dur": 14, "args": { "External id": 145520, "cbid": 211, "correlation": 145520 } }, { "ph": "s", "id": 145520, "pid": 76337, "tid": -914061504, "ts": 1716454223634971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223690314, "dur": 17, "args": { "External id": 145530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145530, "pid": 5, "tid": 7, "ts": 1716454223690314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635037, "dur": 13, "args": { "External id": 145530, "cbid": 211, "correlation": 145530 } }, { "ph": "s", "id": 145530, "pid": 76337, "tid": -914061504, "ts": 1716454223635037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223690333, "dur": 19, "args": { "External id": 145550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145550, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145550, "pid": 5, "tid": 7, "ts": 1716454223690333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635103, "dur": 10, "args": { "External id": 145550, "cbid": 211, "correlation": 145550 } }, { "ph": "s", "id": 145550, "pid": 76337, "tid": -914061504, "ts": 1716454223635103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223690353, "dur": 4, "args": { "External id": 145562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145562, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 145562, "pid": 5, "tid": 7, "ts": 1716454223690353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635124, "dur": 6, "args": { "External id": 145562, "cbid": 211, "correlation": 145562 } }, { "ph": "s", "id": 145562, "pid": 76337, "tid": -914061504, "ts": 1716454223635124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223690358, "dur": 17, "args": { "External id": 145565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145565, "pid": 5, "tid": 7, "ts": 1716454223690358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635142, "dur": 6, "args": { "External id": 145565, "cbid": 211, "correlation": 145565 } }, { "ph": "s", "id": 145565, "pid": 76337, "tid": -914061504, "ts": 1716454223635142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223690377, "dur": 11, "args": { "External id": 145574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145574, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145574, "pid": 5, "tid": 7, "ts": 1716454223690377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635182, "dur": 11, "args": { "External id": 145574, "cbid": 211, "correlation": 145574 } }, { "ph": "s", "id": 145574, "pid": 76337, "tid": -914061504, "ts": 1716454223635182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223635246, "dur": 0, "args": { "External id": 145584, "cbid": 317, "correlation": 145584 } }, { "ph": "f", "id": 145584, "pid": 76337, "tid": -914061504, "ts": 1716454223635246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223635247, "dur": 0, "args": { "External id": 145585, "cbid": 203, "correlation": 145585 } }, { "ph": "f", "id": 145585, "pid": 76337, "tid": -914061504, "ts": 1716454223635247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223635248, "dur": 0, "args": { "External id": 145586, "cbid": 205, "correlation": 145586 } }, { "ph": "f", "id": 145586, "pid": 76337, "tid": -914061504, "ts": 1716454223635248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223690389, "dur": 12, "args": { "External id": 145590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145590, "pid": 5, "tid": 7, "ts": 1716454223690389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635262, "dur": 12, "args": { "External id": 145590, "cbid": 211, "correlation": 145590 } }, { "ph": "s", "id": 145590, "pid": 76337, "tid": -914061504, "ts": 1716454223635262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223690402, "dur": 163, "args": { "External id": 145592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145592, "pid": 5, "tid": 7, "ts": 1716454223690402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635276, "dur": 5, "args": { "External id": 145592, "cbid": 211, "correlation": 145592 } }, { "ph": "s", "id": 145592, "pid": 76337, "tid": -914061504, "ts": 1716454223635276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223690567, "dur": 1, "args": { "External id": 145594, "device": 5, "context": 1, "stream": 7, "correlation": 145594, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 145594, "pid": 5, "tid": 7, "ts": 1716454223690567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223635287, "dur": 7, "args": { "External id": 145594, "cbid": 51, "correlation": 145594 } }, { "ph": "s", "id": 145594, "pid": 76337, "tid": -914061504, "ts": 1716454223635287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223690571, "dur": 650, "args": { "External id": 145595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145595, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145595, "pid": 5, "tid": 7, "ts": 1716454223690571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635295, "dur": 6, "args": { "External id": 145595, "cbid": 211, "correlation": 145595 } }, { "ph": "s", "id": 145595, "pid": 76337, "tid": -914061504, "ts": 1716454223635295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223691222, "dur": 13, "args": { "External id": 145597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145597, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145597, "pid": 5, "tid": 7, "ts": 1716454223691222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635305, "dur": 5, "args": { "External id": 145597, "cbid": 211, "correlation": 145597 } }, { "ph": "s", "id": 145597, "pid": 76337, "tid": -914061504, "ts": 1716454223635305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223691236, "dur": 15, "args": { "External id": 145603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145603, "pid": 5, "tid": 7, "ts": 1716454223691236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635332, "dur": 9, "args": { "External id": 145603, "cbid": 211, "correlation": 145603 } }, { "ph": "s", "id": 145603, "pid": 76337, "tid": -914061504, "ts": 1716454223635332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223635391, "dur": 0, "args": { "External id": 145613, "cbid": 317, "correlation": 145613 } }, { "ph": "f", "id": 145613, "pid": 76337, "tid": -914061504, "ts": 1716454223635391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223635392, "dur": 0, "args": { "External id": 145614, "cbid": 203, "correlation": 145614 } }, { "ph": "f", "id": 145614, "pid": 76337, "tid": -914061504, "ts": 1716454223635392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223635393, "dur": 0, "args": { "External id": 145615, "cbid": 205, "correlation": 145615 } }, { "ph": "f", "id": 145615, "pid": 76337, "tid": -914061504, "ts": 1716454223635393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223691253, "dur": 22, "args": { "External id": 145619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145619, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145619, "pid": 5, "tid": 7, "ts": 1716454223691253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635408, "dur": 11, "args": { "External id": 145619, "cbid": 211, "correlation": 145619 } }, { "ph": "s", "id": 145619, "pid": 76337, "tid": -914061504, "ts": 1716454223635408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223691276, "dur": 4, "args": { "External id": 145621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 145621, "pid": 5, "tid": 7, "ts": 1716454223691276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635427, "dur": 9, "args": { "External id": 145621, "cbid": 211, "correlation": 145621 } }, { "ph": "s", "id": 145621, "pid": 76337, "tid": -914061504, "ts": 1716454223635427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223635442, "dur": 0, "args": { "External id": 145622, "cbid": 51, "correlation": 145622 } }, { "ph": "s", "id": 145622, "pid": 76337, "tid": -914061504, "ts": 1716454223635442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223691281, "dur": 176, "args": { "External id": 145623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145623, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 145623, "pid": 5, "tid": 7, "ts": 1716454223691281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635443, "dur": 8, "args": { "External id": 145623, "cbid": 211, "correlation": 145623 } }, { "ph": "s", "id": 145623, "pid": 76337, "tid": -914061504, "ts": 1716454223635443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223691458, "dur": 16, "args": { "External id": 145628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145628, "pid": 5, "tid": 7, "ts": 1716454223691458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635472, "dur": 8, "args": { "External id": 145628, "cbid": 211, "correlation": 145628 } }, { "ph": "s", "id": 145628, "pid": 76337, "tid": -914061504, "ts": 1716454223635472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223691476, "dur": 12, "args": { "External id": 145636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145636, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145636, "pid": 5, "tid": 7, "ts": 1716454223691476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635500, "dur": 7, "args": { "External id": 145636, "cbid": 211, "correlation": 145636 } }, { "ph": "s", "id": 145636, "pid": 76337, "tid": -914061504, "ts": 1716454223635500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223691490, "dur": 10, "args": { "External id": 145644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145644, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145644, "pid": 5, "tid": 7, "ts": 1716454223691490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635529, "dur": 9, "args": { "External id": 145644, "cbid": 211, "correlation": 145644 } }, { "ph": "s", "id": 145644, "pid": 76337, "tid": -914061504, "ts": 1716454223635529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223691501, "dur": 19, "args": { "External id": 145664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145664, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 145664, "pid": 5, "tid": 7, "ts": 1716454223691501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635615, "dur": 12, "args": { "External id": 145664, "cbid": 211, "correlation": 145664 } }, { "ph": "s", "id": 145664, "pid": 76337, "tid": -914061504, "ts": 1716454223635615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223691522, "dur": 5, "args": { "External id": 145676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145676, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 145676, "pid": 5, "tid": 7, "ts": 1716454223691522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635636, "dur": 7, "args": { "External id": 145676, "cbid": 211, "correlation": 145676 } }, { "ph": "s", "id": 145676, "pid": 76337, "tid": -914061504, "ts": 1716454223635636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223691527, "dur": 17, "args": { "External id": 145679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145679, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145679, "pid": 5, "tid": 7, "ts": 1716454223691527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635654, "dur": 6, "args": { "External id": 145679, "cbid": 211, "correlation": 145679 } }, { "ph": "s", "id": 145679, "pid": 76337, "tid": -914061504, "ts": 1716454223635654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223635712, "dur": 0, "args": { "External id": 145690, "cbid": 317, "correlation": 145690 } }, { "ph": "f", "id": 145690, "pid": 76337, "tid": -914061504, "ts": 1716454223635712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223635713, "dur": 0, "args": { "External id": 145691, "cbid": 203, "correlation": 145691 } }, { "ph": "f", "id": 145691, "pid": 76337, "tid": -914061504, "ts": 1716454223635713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223635714, "dur": 0, "args": { "External id": 145692, "cbid": 205, "correlation": 145692 } }, { "ph": "f", "id": 145692, "pid": 76337, "tid": -914061504, "ts": 1716454223635714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223691546, "dur": 12, "args": { "External id": 145696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145696, "pid": 5, "tid": 7, "ts": 1716454223691546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635730, "dur": 12, "args": { "External id": 145696, "cbid": 211, "correlation": 145696 } }, { "ph": "s", "id": 145696, "pid": 76337, "tid": -914061504, "ts": 1716454223635730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223691559, "dur": 3, "args": { "External id": 145698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 145698, "pid": 5, "tid": 7, "ts": 1716454223691559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635747, "dur": 7, "args": { "External id": 145698, "cbid": 211, "correlation": 145698 } }, { "ph": "s", "id": 145698, "pid": 76337, "tid": -914061504, "ts": 1716454223635747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223635756, "dur": 0, "args": { "External id": 145699, "cbid": 51, "correlation": 145699 } }, { "ph": "s", "id": 145699, "pid": 76337, "tid": -914061504, "ts": 1716454223635756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223691564, "dur": 91, "args": { "External id": 145700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145700, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 145700, "pid": 5, "tid": 7, "ts": 1716454223691564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635757, "dur": 5, "args": { "External id": 145700, "cbid": 211, "correlation": 145700 } }, { "ph": "s", "id": 145700, "pid": 76337, "tid": -914061504, "ts": 1716454223635757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223691656, "dur": 16, "args": { "External id": 145705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145705, "pid": 5, "tid": 7, "ts": 1716454223691656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635784, "dur": 9, "args": { "External id": 145705, "cbid": 211, "correlation": 145705 } }, { "ph": "s", "id": 145705, "pid": 76337, "tid": -914061504, "ts": 1716454223635784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223691673, "dur": 84, "args": { "External id": 145714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145714, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145714, "pid": 5, "tid": 7, "ts": 1716454223691673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635883, "dur": 16, "args": { "External id": 145714, "cbid": 211, "correlation": 145714 } }, { "ph": "s", "id": 145714, "pid": 76337, "tid": -914061504, "ts": 1716454223635883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223691759, "dur": 30, "args": { "External id": 145736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145736, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145736, "pid": 5, "tid": 7, "ts": 1716454223691759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223635956, "dur": 12, "args": { "External id": 145736, "cbid": 211, "correlation": 145736 } }, { "ph": "s", "id": 145736, "pid": 76337, "tid": -914061504, "ts": 1716454223635956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636091, "dur": 2, "args": { "External id": 145747, "cbid": 251, "correlation": 145747 } }, { "ph": "f", "id": 145747, "pid": 76337, "tid": -914061504, "ts": 1716454223636091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223691790, "dur": 164, "args": { "External id": 145748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145748, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145748, "pid": 5, "tid": 7, "ts": 1716454223691790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636100, "dur": 15, "args": { "External id": 145748, "cbid": 211, "correlation": 145748 } }, { "ph": "s", "id": 145748, "pid": 76337, "tid": -914061504, "ts": 1716454223636100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636179, "dur": 1, "args": { "External id": 145759, "cbid": 251, "correlation": 145759 } }, { "ph": "f", "id": 145759, "pid": 76337, "tid": -914061504, "ts": 1716454223636179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223691956, "dur": 160, "args": { "External id": 145760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145760, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145760, "pid": 5, "tid": 7, "ts": 1716454223691956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636183, "dur": 11, "args": { "External id": 145760, "cbid": 211, "correlation": 145760 } }, { "ph": "s", "id": 145760, "pid": 76337, "tid": -914061504, "ts": 1716454223636183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636249, "dur": 1, "args": { "External id": 145771, "cbid": 251, "correlation": 145771 } }, { "ph": "f", "id": 145771, "pid": 76337, "tid": -914061504, "ts": 1716454223636249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223692117, "dur": 161, "args": { "External id": 145772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145772, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145772, "pid": 5, "tid": 7, "ts": 1716454223692117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636253, "dur": 11, "args": { "External id": 145772, "cbid": 211, "correlation": 145772 } }, { "ph": "s", "id": 145772, "pid": 76337, "tid": -914061504, "ts": 1716454223636253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223692280, "dur": 339, "args": { "External id": 145797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145797, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145797, "pid": 5, "tid": 7, "ts": 1716454223692280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636365, "dur": 16, "args": { "External id": 145797, "cbid": 211, "correlation": 145797 } }, { "ph": "s", "id": 145797, "pid": 76337, "tid": -914061504, "ts": 1716454223636365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636486, "dur": 2, "args": { "External id": 145815, "cbid": 251, "correlation": 145815 } }, { "ph": "f", "id": 145815, "pid": 76337, "tid": -914061504, "ts": 1716454223636486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223692620, "dur": 167, "args": { "External id": 145817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145817, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145817, "pid": 5, "tid": 7, "ts": 1716454223692620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636494, "dur": 13, "args": { "External id": 145817, "cbid": 211, "correlation": 145817 } }, { "ph": "s", "id": 145817, "pid": 76337, "tid": -914061504, "ts": 1716454223636494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223692788, "dur": 19, "args": { "External id": 145825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145825, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145825, "pid": 5, "tid": 7, "ts": 1716454223692788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636564, "dur": 13, "args": { "External id": 145825, "cbid": 211, "correlation": 145825 } }, { "ph": "s", "id": 145825, "pid": 76337, "tid": -914061504, "ts": 1716454223636564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223692809, "dur": 28, "args": { "External id": 145833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145833, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145833, "pid": 5, "tid": 7, "ts": 1716454223692809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636603, "dur": 8, "args": { "External id": 145833, "cbid": 211, "correlation": 145833 } }, { "ph": "s", "id": 145833, "pid": 76337, "tid": -914061504, "ts": 1716454223636603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223692838, "dur": 18, "args": { "External id": 145844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145844, "pid": 5, "tid": 7, "ts": 1716454223692838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636698, "dur": 15, "args": { "External id": 145844, "cbid": 211, "correlation": 145844 } }, { "ph": "s", "id": 145844, "pid": 76337, "tid": -914061504, "ts": 1716454223636698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223692857, "dur": 16, "args": { "External id": 145866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145866, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145866, "pid": 5, "tid": 7, "ts": 1716454223692857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636732, "dur": 7, "args": { "External id": 145866, "cbid": 211, "correlation": 145866 } }, { "ph": "s", "id": 145866, "pid": 76337, "tid": -914061504, "ts": 1716454223636732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636819, "dur": 2, "args": { "External id": 145877, "cbid": 251, "correlation": 145877 } }, { "ph": "f", "id": 145877, "pid": 76337, "tid": -914061504, "ts": 1716454223636819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223692875, "dur": 90, "args": { "External id": 145878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145878, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 145878, "pid": 5, "tid": 7, "ts": 1716454223692875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636826, "dur": 13, "args": { "External id": 145878, "cbid": 211, "correlation": 145878 } }, { "ph": "s", "id": 145878, "pid": 76337, "tid": -914061504, "ts": 1716454223636826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636904, "dur": 1, "args": { "External id": 145889, "cbid": 251, "correlation": 145889 } }, { "ph": "f", "id": 145889, "pid": 76337, "tid": -914061504, "ts": 1716454223636904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636907, "dur": 0, "args": { "External id": 145890, "cbid": 251, "correlation": 145890 } }, { "ph": "f", "id": 145890, "pid": 76337, "tid": -914061504, "ts": 1716454223636907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223692966, "dur": 12, "args": { "External id": 145891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145891, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145891, "pid": 5, "tid": 7, "ts": 1716454223692966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636909, "dur": 12, "args": { "External id": 145891, "cbid": 211, "correlation": 145891 } }, { "ph": "s", "id": 145891, "pid": 76337, "tid": -914061504, "ts": 1716454223636909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223692980, "dur": 5, "args": { "External id": 145893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145893, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145893, "pid": 5, "tid": 7, "ts": 1716454223692980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636924, "dur": 6, "args": { "External id": 145893, "cbid": 211, "correlation": 145893 } }, { "ph": "s", "id": 145893, "pid": 76337, "tid": -914061504, "ts": 1716454223636924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636992, "dur": 1, "args": { "External id": 145904, "cbid": 251, "correlation": 145904 } }, { "ph": "f", "id": 145904, "pid": 76337, "tid": -914061504, "ts": 1716454223636992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223636996, "dur": 0, "args": { "External id": 145905, "cbid": 251, "correlation": 145905 } }, { "ph": "f", "id": 145905, "pid": 76337, "tid": -914061504, "ts": 1716454223636996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223692986, "dur": 9, "args": { "External id": 145906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145906, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145906, "pid": 5, "tid": 7, "ts": 1716454223692986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223636997, "dur": 13, "args": { "External id": 145906, "cbid": 211, "correlation": 145906 } }, { "ph": "s", "id": 145906, "pid": 76337, "tid": -914061504, "ts": 1716454223636997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223692996, "dur": 4, "args": { "External id": 145908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145908, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145908, "pid": 5, "tid": 7, "ts": 1716454223692996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637012, "dur": 6, "args": { "External id": 145908, "cbid": 211, "correlation": 145908 } }, { "ph": "s", "id": 145908, "pid": 76337, "tid": -914061504, "ts": 1716454223637012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223693001, "dur": 54, "args": { "External id": 145933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145933, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 145933, "pid": 5, "tid": 7, "ts": 1716454223693001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637090, "dur": 12, "args": { "External id": 145933, "cbid": 211, "correlation": 145933 } }, { "ph": "s", "id": 145933, "pid": 76337, "tid": -914061504, "ts": 1716454223637090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223637189, "dur": 2, "args": { "External id": 145951, "cbid": 251, "correlation": 145951 } }, { "ph": "f", "id": 145951, "pid": 76337, "tid": -914061504, "ts": 1716454223637189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223693057, "dur": 91, "args": { "External id": 145953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145953, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 145953, "pid": 5, "tid": 7, "ts": 1716454223693057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637196, "dur": 14, "args": { "External id": 145953, "cbid": 211, "correlation": 145953 } }, { "ph": "s", "id": 145953, "pid": 76337, "tid": -914061504, "ts": 1716454223637196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223693149, "dur": 10, "args": { "External id": 145961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145961, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145961, "pid": 5, "tid": 7, "ts": 1716454223693149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637267, "dur": 12, "args": { "External id": 145961, "cbid": 211, "correlation": 145961 } }, { "ph": "s", "id": 145961, "pid": 76337, "tid": -914061504, "ts": 1716454223637267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223693160, "dur": 20, "args": { "External id": 145969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145969, "pid": 5, "tid": 7, "ts": 1716454223693160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637309, "dur": 9, "args": { "External id": 145969, "cbid": 211, "correlation": 145969 } }, { "ph": "s", "id": 145969, "pid": 76337, "tid": -914061504, "ts": 1716454223637309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223693182, "dur": 17, "args": { "External id": 145991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 145991, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 145991, "pid": 5, "tid": 7, "ts": 1716454223693182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637361, "dur": 10, "args": { "External id": 145991, "cbid": 211, "correlation": 145991 } }, { "ph": "s", "id": 145991, "pid": 76337, "tid": -914061504, "ts": 1716454223637361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223637458, "dur": 2, "args": { "External id": 146007, "cbid": 251, "correlation": 146007 } }, { "ph": "f", "id": 146007, "pid": 76337, "tid": -914061504, "ts": 1716454223637458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223637465, "dur": 0, "args": { "External id": 146009, "cbid": 251, "correlation": 146009 } }, { "ph": "f", "id": 146009, "pid": 76337, "tid": -914061504, "ts": 1716454223637465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223693201, "dur": 494, "args": { "External id": 146010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146010, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146010, "pid": 5, "tid": 7, "ts": 1716454223693201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637468, "dur": 15, "args": { "External id": 146010, "cbid": 211, "correlation": 146010 } }, { "ph": "s", "id": 146010, "pid": 76337, "tid": -914061504, "ts": 1716454223637468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223693697, "dur": 66, "args": { "External id": 146018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146018, "pid": 5, "tid": 7, "ts": 1716454223693697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637556, "dur": 14, "args": { "External id": 146018, "cbid": 211, "correlation": 146018 } }, { "ph": "s", "id": 146018, "pid": 76337, "tid": -914061504, "ts": 1716454223637556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223693764, "dur": 67, "args": { "External id": 146026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146026, "pid": 5, "tid": 7, "ts": 1716454223693764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637597, "dur": 11, "args": { "External id": 146026, "cbid": 211, "correlation": 146026 } }, { "ph": "s", "id": 146026, "pid": 76337, "tid": -914061504, "ts": 1716454223637597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223637681, "dur": 1, "args": { "External id": 146042, "cbid": 251, "correlation": 146042 } }, { "ph": "f", "id": 146042, "pid": 76337, "tid": -914061504, "ts": 1716454223637681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223693833, "dur": 1, "args": { "External id": 146044, "device": 5, "context": 1, "stream": 7, "correlation": 146044, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 146044, "pid": 5, "tid": 7, "ts": 1716454223693833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223637686, "dur": 11, "args": { "External id": 146044, "cbid": 51, "correlation": 146044 } }, { "ph": "s", "id": 146044, "pid": 76337, "tid": -914061504, "ts": 1716454223637686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223693836, "dur": 271, "args": { "External id": 146045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146045, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146045, "pid": 5, "tid": 7, "ts": 1716454223693836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637699, "dur": 11, "args": { "External id": 146045, "cbid": 211, "correlation": 146045 } }, { "ph": "s", "id": 146045, "pid": 76337, "tid": -914061504, "ts": 1716454223637699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223694109, "dur": 14, "args": { "External id": 146053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146053, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146053, "pid": 5, "tid": 7, "ts": 1716454223694109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637740, "dur": 10, "args": { "External id": 146053, "cbid": 211, "correlation": 146053 } }, { "ph": "s", "id": 146053, "pid": 76337, "tid": -914061504, "ts": 1716454223637740, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223694125, "dur": 38, "args": { "External id": 146064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146064, "pid": 5, "tid": 7, "ts": 1716454223694125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637813, "dur": 12, "args": { "External id": 146064, "cbid": 211, "correlation": 146064 } }, { "ph": "s", "id": 146064, "pid": 76337, "tid": -914061504, "ts": 1716454223637813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223637878, "dur": 0, "args": { "External id": 146076, "cbid": 317, "correlation": 146076 } }, { "ph": "f", "id": 146076, "pid": 76337, "tid": -914061504, "ts": 1716454223637878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223637879, "dur": 0, "args": { "External id": 146077, "cbid": 203, "correlation": 146077 } }, { "ph": "f", "id": 146077, "pid": 76337, "tid": -914061504, "ts": 1716454223637879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223637880, "dur": 0, "args": { "External id": 146078, "cbid": 205, "correlation": 146078 } }, { "ph": "f", "id": 146078, "pid": 76337, "tid": -914061504, "ts": 1716454223637880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223694164, "dur": 13, "args": { "External id": 146082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146082, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146082, "pid": 5, "tid": 7, "ts": 1716454223694164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637895, "dur": 12, "args": { "External id": 146082, "cbid": 211, "correlation": 146082 } }, { "ph": "s", "id": 146082, "pid": 76337, "tid": -914061504, "ts": 1716454223637895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223694179, "dur": 4, "args": { "External id": 146084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146084, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146084, "pid": 5, "tid": 7, "ts": 1716454223694179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637912, "dur": 6, "args": { "External id": 146084, "cbid": 211, "correlation": 146084 } }, { "ph": "s", "id": 146084, "pid": 76337, "tid": -914061504, "ts": 1716454223637912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223637921, "dur": 0, "args": { "External id": 146085, "cbid": 51, "correlation": 146085 } }, { "ph": "s", "id": 146085, "pid": 76337, "tid": -914061504, "ts": 1716454223637921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223694184, "dur": 97, "args": { "External id": 146086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146086, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 146086, "pid": 5, "tid": 7, "ts": 1716454223694184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637921, "dur": 7, "args": { "External id": 146086, "cbid": 211, "correlation": 146086 } }, { "ph": "s", "id": 146086, "pid": 76337, "tid": -914061504, "ts": 1716454223637921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223694282, "dur": 17, "args": { "External id": 146091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146091, "pid": 5, "tid": 7, "ts": 1716454223694282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637950, "dur": 9, "args": { "External id": 146091, "cbid": 211, "correlation": 146091 } }, { "ph": "s", "id": 146091, "pid": 76337, "tid": -914061504, "ts": 1716454223637950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223694300, "dur": 12, "args": { "External id": 146099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146099, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146099, "pid": 5, "tid": 7, "ts": 1716454223694300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223637990, "dur": 9, "args": { "External id": 146099, "cbid": 211, "correlation": 146099 } }, { "ph": "s", "id": 146099, "pid": 76337, "tid": -914061504, "ts": 1716454223637990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223694313, "dur": 31, "args": { "External id": 146108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146108, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146108, "pid": 5, "tid": 7, "ts": 1716454223694313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638031, "dur": 10, "args": { "External id": 146108, "cbid": 211, "correlation": 146108 } }, { "ph": "s", "id": 146108, "pid": 76337, "tid": -914061504, "ts": 1716454223638031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223694345, "dur": 31, "args": { "External id": 146128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146128, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 146128, "pid": 5, "tid": 7, "ts": 1716454223694345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638105, "dur": 12, "args": { "External id": 146128, "cbid": 211, "correlation": 146128 } }, { "ph": "s", "id": 146128, "pid": 76337, "tid": -914061504, "ts": 1716454223638105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223694378, "dur": 5, "args": { "External id": 146140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146140, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146140, "pid": 5, "tid": 7, "ts": 1716454223694378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638127, "dur": 6, "args": { "External id": 146140, "cbid": 211, "correlation": 146140 } }, { "ph": "s", "id": 146140, "pid": 76337, "tid": -914061504, "ts": 1716454223638127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223694383, "dur": 31, "args": { "External id": 146143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146143, "pid": 5, "tid": 7, "ts": 1716454223694383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638145, "dur": 6, "args": { "External id": 146143, "cbid": 211, "correlation": 146143 } }, { "ph": "s", "id": 146143, "pid": 76337, "tid": -914061504, "ts": 1716454223638145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223694415, "dur": 20, "args": { "External id": 146152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146152, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146152, "pid": 5, "tid": 7, "ts": 1716454223694415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638184, "dur": 9, "args": { "External id": 146152, "cbid": 211, "correlation": 146152 } }, { "ph": "s", "id": 146152, "pid": 76337, "tid": -914061504, "ts": 1716454223638184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223638235, "dur": 0, "args": { "External id": 146162, "cbid": 317, "correlation": 146162 } }, { "ph": "f", "id": 146162, "pid": 76337, "tid": -914061504, "ts": 1716454223638235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223638236, "dur": 0, "args": { "External id": 146163, "cbid": 203, "correlation": 146163 } }, { "ph": "f", "id": 146163, "pid": 76337, "tid": -914061504, "ts": 1716454223638236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223638237, "dur": 0, "args": { "External id": 146164, "cbid": 205, "correlation": 146164 } }, { "ph": "f", "id": 146164, "pid": 76337, "tid": -914061504, "ts": 1716454223638237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223694437, "dur": 23, "args": { "External id": 146168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146168, "pid": 5, "tid": 7, "ts": 1716454223694437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638251, "dur": 13, "args": { "External id": 146168, "cbid": 211, "correlation": 146168 } }, { "ph": "s", "id": 146168, "pid": 76337, "tid": -914061504, "ts": 1716454223638251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223694461, "dur": 322, "args": { "External id": 146170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146170, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146170, "pid": 5, "tid": 7, "ts": 1716454223694461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638266, "dur": 5, "args": { "External id": 146170, "cbid": 211, "correlation": 146170 } }, { "ph": "s", "id": 146170, "pid": 76337, "tid": -914061504, "ts": 1716454223638266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223694785, "dur": 1, "args": { "External id": 146172, "device": 5, "context": 1, "stream": 7, "correlation": 146172, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 146172, "pid": 5, "tid": 7, "ts": 1716454223694785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223638277, "dur": 6, "args": { "External id": 146172, "cbid": 51, "correlation": 146172 } }, { "ph": "s", "id": 146172, "pid": 76337, "tid": -914061504, "ts": 1716454223638277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223694789, "dur": 1260, "args": { "External id": 146173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146173, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146173, "pid": 5, "tid": 7, "ts": 1716454223694789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638284, "dur": 6, "args": { "External id": 146173, "cbid": 211, "correlation": 146173 } }, { "ph": "s", "id": 146173, "pid": 76337, "tid": -914061504, "ts": 1716454223638284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223696051, "dur": 13, "args": { "External id": 146175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146175, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146175, "pid": 5, "tid": 7, "ts": 1716454223696051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638294, "dur": 5, "args": { "External id": 146175, "cbid": 211, "correlation": 146175 } }, { "ph": "s", "id": 146175, "pid": 76337, "tid": -914061504, "ts": 1716454223638294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223696065, "dur": 15, "args": { "External id": 146181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146181, "pid": 5, "tid": 7, "ts": 1716454223696065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638323, "dur": 8, "args": { "External id": 146181, "cbid": 211, "correlation": 146181 } }, { "ph": "s", "id": 146181, "pid": 76337, "tid": -914061504, "ts": 1716454223638323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223696081, "dur": 4, "args": { "External id": 146189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146189, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 146189, "pid": 5, "tid": 7, "ts": 1716454223696081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638367, "dur": 9, "args": { "External id": 146189, "cbid": 211, "correlation": 146189 } }, { "ph": "s", "id": 146189, "pid": 76337, "tid": -914061504, "ts": 1716454223638367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223638431, "dur": 1, "args": { "External id": 146205, "cbid": 251, "correlation": 146205 } }, { "ph": "f", "id": 146205, "pid": 76337, "tid": -914061504, "ts": 1716454223638431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223638436, "dur": 0, "args": { "External id": 146207, "cbid": 251, "correlation": 146207 } }, { "ph": "f", "id": 146207, "pid": 76337, "tid": -914061504, "ts": 1716454223638436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223696087, "dur": 13, "args": { "External id": 146208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146208, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146208, "pid": 5, "tid": 7, "ts": 1716454223696087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638438, "dur": 11, "args": { "External id": 146208, "cbid": 211, "correlation": 146208 } }, { "ph": "s", "id": 146208, "pid": 76337, "tid": -914061504, "ts": 1716454223638438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223696101, "dur": 5, "args": { "External id": 146210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146210, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146210, "pid": 5, "tid": 7, "ts": 1716454223696101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638451, "dur": 5, "args": { "External id": 146210, "cbid": 211, "correlation": 146210 } }, { "ph": "s", "id": 146210, "pid": 76337, "tid": -914061504, "ts": 1716454223638451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223696108, "dur": 17, "args": { "External id": 146220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146220, "pid": 5, "tid": 7, "ts": 1716454223696108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638510, "dur": 13, "args": { "External id": 146220, "cbid": 211, "correlation": 146220 } }, { "ph": "s", "id": 146220, "pid": 76337, "tid": -914061504, "ts": 1716454223638510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223696126, "dur": 18, "args": { "External id": 146240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146240, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 146240, "pid": 5, "tid": 7, "ts": 1716454223696126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638576, "dur": 11, "args": { "External id": 146240, "cbid": 211, "correlation": 146240 } }, { "ph": "s", "id": 146240, "pid": 76337, "tid": -914061504, "ts": 1716454223638576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223696146, "dur": 4, "args": { "External id": 146252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146252, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 146252, "pid": 5, "tid": 7, "ts": 1716454223696146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638596, "dur": 6, "args": { "External id": 146252, "cbid": 211, "correlation": 146252 } }, { "ph": "s", "id": 146252, "pid": 76337, "tid": -914061504, "ts": 1716454223638596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223696152, "dur": 16, "args": { "External id": 146255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146255, "pid": 5, "tid": 7, "ts": 1716454223696152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638615, "dur": 6, "args": { "External id": 146255, "cbid": 211, "correlation": 146255 } }, { "ph": "s", "id": 146255, "pid": 76337, "tid": -914061504, "ts": 1716454223638615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223696169, "dur": 11, "args": { "External id": 146264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146264, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146264, "pid": 5, "tid": 7, "ts": 1716454223696169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638656, "dur": 11, "args": { "External id": 146264, "cbid": 211, "correlation": 146264 } }, { "ph": "s", "id": 146264, "pid": 76337, "tid": -914061504, "ts": 1716454223638656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223638720, "dur": 0, "args": { "External id": 146274, "cbid": 317, "correlation": 146274 } }, { "ph": "f", "id": 146274, "pid": 76337, "tid": -914061504, "ts": 1716454223638720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223638721, "dur": 0, "args": { "External id": 146275, "cbid": 203, "correlation": 146275 } }, { "ph": "f", "id": 146275, "pid": 76337, "tid": -914061504, "ts": 1716454223638721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223638721, "dur": 0, "args": { "External id": 146276, "cbid": 205, "correlation": 146276 } }, { "ph": "f", "id": 146276, "pid": 76337, "tid": -914061504, "ts": 1716454223638721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223696182, "dur": 12, "args": { "External id": 146280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146280, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146280, "pid": 5, "tid": 7, "ts": 1716454223696182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638737, "dur": 12, "args": { "External id": 146280, "cbid": 211, "correlation": 146280 } }, { "ph": "s", "id": 146280, "pid": 76337, "tid": -914061504, "ts": 1716454223638737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223696195, "dur": 164, "args": { "External id": 146282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146282, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146282, "pid": 5, "tid": 7, "ts": 1716454223696195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638752, "dur": 5, "args": { "External id": 146282, "cbid": 211, "correlation": 146282 } }, { "ph": "s", "id": 146282, "pid": 76337, "tid": -914061504, "ts": 1716454223638752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223696361, "dur": 1, "args": { "External id": 146284, "device": 5, "context": 1, "stream": 7, "correlation": 146284, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 146284, "pid": 5, "tid": 7, "ts": 1716454223696361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223638763, "dur": 6, "args": { "External id": 146284, "cbid": 51, "correlation": 146284 } }, { "ph": "s", "id": 146284, "pid": 76337, "tid": -914061504, "ts": 1716454223638763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223696364, "dur": 651, "args": { "External id": 146285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146285, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146285, "pid": 5, "tid": 7, "ts": 1716454223696364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638770, "dur": 6, "args": { "External id": 146285, "cbid": 211, "correlation": 146285 } }, { "ph": "s", "id": 146285, "pid": 76337, "tid": -914061504, "ts": 1716454223638770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223697016, "dur": 13, "args": { "External id": 146287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146287, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146287, "pid": 5, "tid": 7, "ts": 1716454223697016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638781, "dur": 5, "args": { "External id": 146287, "cbid": 211, "correlation": 146287 } }, { "ph": "s", "id": 146287, "pid": 76337, "tid": -914061504, "ts": 1716454223638781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223697030, "dur": 15, "args": { "External id": 146293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146293, "pid": 5, "tid": 7, "ts": 1716454223697030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638809, "dur": 9, "args": { "External id": 146293, "cbid": 211, "correlation": 146293 } }, { "ph": "s", "id": 146293, "pid": 76337, "tid": -914061504, "ts": 1716454223638809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223638868, "dur": 0, "args": { "External id": 146303, "cbid": 317, "correlation": 146303 } }, { "ph": "f", "id": 146303, "pid": 76337, "tid": -914061504, "ts": 1716454223638868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223638869, "dur": 0, "args": { "External id": 146304, "cbid": 203, "correlation": 146304 } }, { "ph": "f", "id": 146304, "pid": 76337, "tid": -914061504, "ts": 1716454223638869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223638870, "dur": 0, "args": { "External id": 146305, "cbid": 205, "correlation": 146305 } }, { "ph": "f", "id": 146305, "pid": 76337, "tid": -914061504, "ts": 1716454223638870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223697046, "dur": 20, "args": { "External id": 146309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146309, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146309, "pid": 5, "tid": 7, "ts": 1716454223697046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638882, "dur": 11, "args": { "External id": 146309, "cbid": 211, "correlation": 146309 } }, { "ph": "s", "id": 146309, "pid": 76337, "tid": -914061504, "ts": 1716454223638882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223697068, "dur": 4, "args": { "External id": 146311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146311, "pid": 5, "tid": 7, "ts": 1716454223697068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638897, "dur": 5, "args": { "External id": 146311, "cbid": 211, "correlation": 146311 } }, { "ph": "s", "id": 146311, "pid": 76337, "tid": -914061504, "ts": 1716454223638897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223638905, "dur": 0, "args": { "External id": 146312, "cbid": 51, "correlation": 146312 } }, { "ph": "s", "id": 146312, "pid": 76337, "tid": -914061504, "ts": 1716454223638905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223697073, "dur": 172, "args": { "External id": 146313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146313, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 146313, "pid": 5, "tid": 7, "ts": 1716454223697073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638906, "dur": 5, "args": { "External id": 146313, "cbid": 211, "correlation": 146313 } }, { "ph": "s", "id": 146313, "pid": 76337, "tid": -914061504, "ts": 1716454223638906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223697247, "dur": 16, "args": { "External id": 146318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146318, "pid": 5, "tid": 7, "ts": 1716454223697247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638932, "dur": 8, "args": { "External id": 146318, "cbid": 211, "correlation": 146318 } }, { "ph": "s", "id": 146318, "pid": 76337, "tid": -914061504, "ts": 1716454223638932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223697264, "dur": 12, "args": { "External id": 146326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146326, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146326, "pid": 5, "tid": 7, "ts": 1716454223697264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638959, "dur": 9, "args": { "External id": 146326, "cbid": 211, "correlation": 146326 } }, { "ph": "s", "id": 146326, "pid": 76337, "tid": -914061504, "ts": 1716454223638959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223697277, "dur": 10, "args": { "External id": 146334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146334, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146334, "pid": 5, "tid": 7, "ts": 1716454223697277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223638998, "dur": 9, "args": { "External id": 146334, "cbid": 211, "correlation": 146334 } }, { "ph": "s", "id": 146334, "pid": 76337, "tid": -914061504, "ts": 1716454223638998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223697288, "dur": 20, "args": { "External id": 146354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146354, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 146354, "pid": 5, "tid": 7, "ts": 1716454223697288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639082, "dur": 12, "args": { "External id": 146354, "cbid": 211, "correlation": 146354 } }, { "ph": "s", "id": 146354, "pid": 76337, "tid": -914061504, "ts": 1716454223639082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223697309, "dur": 4, "args": { "External id": 146366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146366, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 146366, "pid": 5, "tid": 7, "ts": 1716454223697309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639104, "dur": 6, "args": { "External id": 146366, "cbid": 211, "correlation": 146366 } }, { "ph": "s", "id": 146366, "pid": 76337, "tid": -914061504, "ts": 1716454223639104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223697315, "dur": 17, "args": { "External id": 146369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146369, "pid": 5, "tid": 7, "ts": 1716454223697315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639122, "dur": 6, "args": { "External id": 146369, "cbid": 211, "correlation": 146369 } }, { "ph": "s", "id": 146369, "pid": 76337, "tid": -914061504, "ts": 1716454223639122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223639179, "dur": 0, "args": { "External id": 146380, "cbid": 317, "correlation": 146380 } }, { "ph": "f", "id": 146380, "pid": 76337, "tid": -914061504, "ts": 1716454223639179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223639180, "dur": 0, "args": { "External id": 146381, "cbid": 203, "correlation": 146381 } }, { "ph": "f", "id": 146381, "pid": 76337, "tid": -914061504, "ts": 1716454223639180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223639180, "dur": 0, "args": { "External id": 146382, "cbid": 205, "correlation": 146382 } }, { "ph": "f", "id": 146382, "pid": 76337, "tid": -914061504, "ts": 1716454223639180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223697333, "dur": 12, "args": { "External id": 146386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146386, "pid": 5, "tid": 7, "ts": 1716454223697333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639194, "dur": 11, "args": { "External id": 146386, "cbid": 211, "correlation": 146386 } }, { "ph": "s", "id": 146386, "pid": 76337, "tid": -914061504, "ts": 1716454223639194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223697346, "dur": 3, "args": { "External id": 146388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146388, "pid": 5, "tid": 7, "ts": 1716454223697346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639209, "dur": 6, "args": { "External id": 146388, "cbid": 211, "correlation": 146388 } }, { "ph": "s", "id": 146388, "pid": 76337, "tid": -914061504, "ts": 1716454223639209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223639219, "dur": 0, "args": { "External id": 146389, "cbid": 51, "correlation": 146389 } }, { "ph": "s", "id": 146389, "pid": 76337, "tid": -914061504, "ts": 1716454223639219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223697351, "dur": 90, "args": { "External id": 146390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146390, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 146390, "pid": 5, "tid": 7, "ts": 1716454223697351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639219, "dur": 5, "args": { "External id": 146390, "cbid": 211, "correlation": 146390 } }, { "ph": "s", "id": 146390, "pid": 76337, "tid": -914061504, "ts": 1716454223639219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223697443, "dur": 16, "args": { "External id": 146395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146395, "pid": 5, "tid": 7, "ts": 1716454223697443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639245, "dur": 8, "args": { "External id": 146395, "cbid": 211, "correlation": 146395 } }, { "ph": "s", "id": 146395, "pid": 76337, "tid": -914061504, "ts": 1716454223639245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223697460, "dur": 85, "args": { "External id": 146404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146404, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146404, "pid": 5, "tid": 7, "ts": 1716454223697460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639327, "dur": 14, "args": { "External id": 146404, "cbid": 211, "correlation": 146404 } }, { "ph": "s", "id": 146404, "pid": 76337, "tid": -914061504, "ts": 1716454223639327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223697546, "dur": 30, "args": { "External id": 146426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146426, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146426, "pid": 5, "tid": 7, "ts": 1716454223697546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639383, "dur": 10, "args": { "External id": 146426, "cbid": 211, "correlation": 146426 } }, { "ph": "s", "id": 146426, "pid": 76337, "tid": -914061504, "ts": 1716454223639383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223639473, "dur": 2, "args": { "External id": 146437, "cbid": 251, "correlation": 146437 } }, { "ph": "f", "id": 146437, "pid": 76337, "tid": -914061504, "ts": 1716454223639473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223697577, "dur": 164, "args": { "External id": 146438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146438, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146438, "pid": 5, "tid": 7, "ts": 1716454223697577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639479, "dur": 13, "args": { "External id": 146438, "cbid": 211, "correlation": 146438 } }, { "ph": "s", "id": 146438, "pid": 76337, "tid": -914061504, "ts": 1716454223639479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223639548, "dur": 1, "args": { "External id": 146449, "cbid": 251, "correlation": 146449 } }, { "ph": "f", "id": 146449, "pid": 76337, "tid": -914061504, "ts": 1716454223639548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223697742, "dur": 160, "args": { "External id": 146450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146450, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146450, "pid": 5, "tid": 7, "ts": 1716454223697742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639552, "dur": 11, "args": { "External id": 146450, "cbid": 211, "correlation": 146450 } }, { "ph": "s", "id": 146450, "pid": 76337, "tid": -914061504, "ts": 1716454223639552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223639619, "dur": 1, "args": { "External id": 146461, "cbid": 251, "correlation": 146461 } }, { "ph": "f", "id": 146461, "pid": 76337, "tid": -914061504, "ts": 1716454223639619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223697904, "dur": 161, "args": { "External id": 146462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146462, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146462, "pid": 5, "tid": 7, "ts": 1716454223697904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639623, "dur": 11, "args": { "External id": 146462, "cbid": 211, "correlation": 146462 } }, { "ph": "s", "id": 146462, "pid": 76337, "tid": -914061504, "ts": 1716454223639623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223698067, "dur": 335, "args": { "External id": 146487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146487, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146487, "pid": 5, "tid": 7, "ts": 1716454223698067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639708, "dur": 13, "args": { "External id": 146487, "cbid": 211, "correlation": 146487 } }, { "ph": "s", "id": 146487, "pid": 76337, "tid": -914061504, "ts": 1716454223639708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223639810, "dur": 1, "args": { "External id": 146505, "cbid": 251, "correlation": 146505 } }, { "ph": "f", "id": 146505, "pid": 76337, "tid": -914061504, "ts": 1716454223639810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223698403, "dur": 169, "args": { "External id": 146507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146507, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146507, "pid": 5, "tid": 7, "ts": 1716454223698403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639816, "dur": 14, "args": { "External id": 146507, "cbid": 211, "correlation": 146507 } }, { "ph": "s", "id": 146507, "pid": 76337, "tid": -914061504, "ts": 1716454223639816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223698574, "dur": 19, "args": { "External id": 146515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146515, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146515, "pid": 5, "tid": 7, "ts": 1716454223698574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639887, "dur": 12, "args": { "External id": 146515, "cbid": 211, "correlation": 146515 } }, { "ph": "s", "id": 146515, "pid": 76337, "tid": -914061504, "ts": 1716454223639887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223698594, "dur": 27, "args": { "External id": 146523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146523, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146523, "pid": 5, "tid": 7, "ts": 1716454223698594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223639927, "dur": 8, "args": { "External id": 146523, "cbid": 211, "correlation": 146523 } }, { "ph": "s", "id": 146523, "pid": 76337, "tid": -914061504, "ts": 1716454223639927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223698623, "dur": 18, "args": { "External id": 146534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146534, "pid": 5, "tid": 7, "ts": 1716454223698623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640007, "dur": 14, "args": { "External id": 146534, "cbid": 211, "correlation": 146534 } }, { "ph": "s", "id": 146534, "pid": 76337, "tid": -914061504, "ts": 1716454223640007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223698642, "dur": 16, "args": { "External id": 146556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146556, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146556, "pid": 5, "tid": 7, "ts": 1716454223698642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640039, "dur": 8, "args": { "External id": 146556, "cbid": 211, "correlation": 146556 } }, { "ph": "s", "id": 146556, "pid": 76337, "tid": -914061504, "ts": 1716454223640039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640127, "dur": 1, "args": { "External id": 146567, "cbid": 251, "correlation": 146567 } }, { "ph": "f", "id": 146567, "pid": 76337, "tid": -914061504, "ts": 1716454223640127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223698660, "dur": 89, "args": { "External id": 146568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146568, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146568, "pid": 5, "tid": 7, "ts": 1716454223698660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640132, "dur": 13, "args": { "External id": 146568, "cbid": 211, "correlation": 146568 } }, { "ph": "s", "id": 146568, "pid": 76337, "tid": -914061504, "ts": 1716454223640132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640200, "dur": 1, "args": { "External id": 146579, "cbid": 251, "correlation": 146579 } }, { "ph": "f", "id": 146579, "pid": 76337, "tid": -914061504, "ts": 1716454223640200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640204, "dur": 0, "args": { "External id": 146580, "cbid": 251, "correlation": 146580 } }, { "ph": "f", "id": 146580, "pid": 76337, "tid": -914061504, "ts": 1716454223640204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223698750, "dur": 12, "args": { "External id": 146581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146581, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146581, "pid": 5, "tid": 7, "ts": 1716454223698750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640206, "dur": 12, "args": { "External id": 146581, "cbid": 211, "correlation": 146581 } }, { "ph": "s", "id": 146581, "pid": 76337, "tid": -914061504, "ts": 1716454223640206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223698764, "dur": 5, "args": { "External id": 146583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146583, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146583, "pid": 5, "tid": 7, "ts": 1716454223698764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640219, "dur": 6, "args": { "External id": 146583, "cbid": 211, "correlation": 146583 } }, { "ph": "s", "id": 146583, "pid": 76337, "tid": -914061504, "ts": 1716454223640219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640278, "dur": 1, "args": { "External id": 146594, "cbid": 251, "correlation": 146594 } }, { "ph": "f", "id": 146594, "pid": 76337, "tid": -914061504, "ts": 1716454223640278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640282, "dur": 0, "args": { "External id": 146595, "cbid": 251, "correlation": 146595 } }, { "ph": "f", "id": 146595, "pid": 76337, "tid": -914061504, "ts": 1716454223640282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223698771, "dur": 8, "args": { "External id": 146596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146596, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146596, "pid": 5, "tid": 7, "ts": 1716454223698771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640283, "dur": 11, "args": { "External id": 146596, "cbid": 211, "correlation": 146596 } }, { "ph": "s", "id": 146596, "pid": 76337, "tid": -914061504, "ts": 1716454223640283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223698780, "dur": 4, "args": { "External id": 146598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146598, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146598, "pid": 5, "tid": 7, "ts": 1716454223698780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640296, "dur": 6, "args": { "External id": 146598, "cbid": 211, "correlation": 146598 } }, { "ph": "s", "id": 146598, "pid": 76337, "tid": -914061504, "ts": 1716454223640296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223698785, "dur": 55, "args": { "External id": 146623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146623, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146623, "pid": 5, "tid": 7, "ts": 1716454223698785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640373, "dur": 13, "args": { "External id": 146623, "cbid": 211, "correlation": 146623 } }, { "ph": "s", "id": 146623, "pid": 76337, "tid": -914061504, "ts": 1716454223640373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640472, "dur": 1, "args": { "External id": 146641, "cbid": 251, "correlation": 146641 } }, { "ph": "f", "id": 146641, "pid": 76337, "tid": -914061504, "ts": 1716454223640472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223698842, "dur": 91, "args": { "External id": 146643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146643, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146643, "pid": 5, "tid": 7, "ts": 1716454223698842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640478, "dur": 14, "args": { "External id": 146643, "cbid": 211, "correlation": 146643 } }, { "ph": "s", "id": 146643, "pid": 76337, "tid": -914061504, "ts": 1716454223640478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223698935, "dur": 10, "args": { "External id": 146651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146651, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146651, "pid": 5, "tid": 7, "ts": 1716454223698935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640549, "dur": 12, "args": { "External id": 146651, "cbid": 211, "correlation": 146651 } }, { "ph": "s", "id": 146651, "pid": 76337, "tid": -914061504, "ts": 1716454223640549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223698946, "dur": 21, "args": { "External id": 146659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146659, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146659, "pid": 5, "tid": 7, "ts": 1716454223698946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640591, "dur": 9, "args": { "External id": 146659, "cbid": 211, "correlation": 146659 } }, { "ph": "s", "id": 146659, "pid": 76337, "tid": -914061504, "ts": 1716454223640591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223698968, "dur": 18, "args": { "External id": 146681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146681, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146681, "pid": 5, "tid": 7, "ts": 1716454223698968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640643, "dur": 10, "args": { "External id": 146681, "cbid": 211, "correlation": 146681 } }, { "ph": "s", "id": 146681, "pid": 76337, "tid": -914061504, "ts": 1716454223640643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640729, "dur": 1, "args": { "External id": 146697, "cbid": 251, "correlation": 146697 } }, { "ph": "f", "id": 146697, "pid": 76337, "tid": -914061504, "ts": 1716454223640729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640735, "dur": 0, "args": { "External id": 146699, "cbid": 251, "correlation": 146699 } }, { "ph": "f", "id": 146699, "pid": 76337, "tid": -914061504, "ts": 1716454223640735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223698987, "dur": 496, "args": { "External id": 146700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146700, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146700, "pid": 5, "tid": 7, "ts": 1716454223698987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640737, "dur": 12, "args": { "External id": 146700, "cbid": 211, "correlation": 146700 } }, { "ph": "s", "id": 146700, "pid": 76337, "tid": -914061504, "ts": 1716454223640737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223699484, "dur": 67, "args": { "External id": 146708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146708, "pid": 5, "tid": 7, "ts": 1716454223699484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640801, "dur": 13, "args": { "External id": 146708, "cbid": 211, "correlation": 146708 } }, { "ph": "s", "id": 146708, "pid": 76337, "tid": -914061504, "ts": 1716454223640801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223699553, "dur": 67, "args": { "External id": 146716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146716, "pid": 5, "tid": 7, "ts": 1716454223699553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640833, "dur": 8, "args": { "External id": 146716, "cbid": 211, "correlation": 146716 } }, { "ph": "s", "id": 146716, "pid": 76337, "tid": -914061504, "ts": 1716454223640833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223640914, "dur": 1, "args": { "External id": 146732, "cbid": 251, "correlation": 146732 } }, { "ph": "f", "id": 146732, "pid": 76337, "tid": -914061504, "ts": 1716454223640914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223699622, "dur": 1, "args": { "External id": 146734, "device": 5, "context": 1, "stream": 7, "correlation": 146734, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 146734, "pid": 5, "tid": 7, "ts": 1716454223699622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223640918, "dur": 9, "args": { "External id": 146734, "cbid": 51, "correlation": 146734 } }, { "ph": "s", "id": 146734, "pid": 76337, "tid": -914061504, "ts": 1716454223640918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223699626, "dur": 271, "args": { "External id": 146735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146735, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146735, "pid": 5, "tid": 7, "ts": 1716454223699626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640929, "dur": 12, "args": { "External id": 146735, "cbid": 211, "correlation": 146735 } }, { "ph": "s", "id": 146735, "pid": 76337, "tid": -914061504, "ts": 1716454223640929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223699898, "dur": 14, "args": { "External id": 146743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146743, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146743, "pid": 5, "tid": 7, "ts": 1716454223699898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223640972, "dur": 20, "args": { "External id": 146743, "cbid": 211, "correlation": 146743 } }, { "ph": "s", "id": 146743, "pid": 76337, "tid": -914061504, "ts": 1716454223640972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223699913, "dur": 37, "args": { "External id": 146754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146754, "pid": 5, "tid": 7, "ts": 1716454223699913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641051, "dur": 12, "args": { "External id": 146754, "cbid": 211, "correlation": 146754 } }, { "ph": "s", "id": 146754, "pid": 76337, "tid": -914061504, "ts": 1716454223641051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223641116, "dur": 0, "args": { "External id": 146766, "cbid": 317, "correlation": 146766 } }, { "ph": "f", "id": 146766, "pid": 76337, "tid": -914061504, "ts": 1716454223641116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223641117, "dur": 0, "args": { "External id": 146767, "cbid": 203, "correlation": 146767 } }, { "ph": "f", "id": 146767, "pid": 76337, "tid": -914061504, "ts": 1716454223641117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223641117, "dur": 0, "args": { "External id": 146768, "cbid": 205, "correlation": 146768 } }, { "ph": "f", "id": 146768, "pid": 76337, "tid": -914061504, "ts": 1716454223641117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223699951, "dur": 14, "args": { "External id": 146772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146772, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146772, "pid": 5, "tid": 7, "ts": 1716454223699951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641133, "dur": 12, "args": { "External id": 146772, "cbid": 211, "correlation": 146772 } }, { "ph": "s", "id": 146772, "pid": 76337, "tid": -914061504, "ts": 1716454223641133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223699967, "dur": 4, "args": { "External id": 146774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 146774, "pid": 5, "tid": 7, "ts": 1716454223699967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641150, "dur": 6, "args": { "External id": 146774, "cbid": 211, "correlation": 146774 } }, { "ph": "s", "id": 146774, "pid": 76337, "tid": -914061504, "ts": 1716454223641150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223641158, "dur": 0, "args": { "External id": 146775, "cbid": 51, "correlation": 146775 } }, { "ph": "s", "id": 146775, "pid": 76337, "tid": -914061504, "ts": 1716454223641158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223699973, "dur": 98, "args": { "External id": 146776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146776, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 146776, "pid": 5, "tid": 7, "ts": 1716454223699973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641159, "dur": 5, "args": { "External id": 146776, "cbid": 211, "correlation": 146776 } }, { "ph": "s", "id": 146776, "pid": 76337, "tid": -914061504, "ts": 1716454223641159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223700072, "dur": 16, "args": { "External id": 146781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146781, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146781, "pid": 5, "tid": 7, "ts": 1716454223700072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641187, "dur": 8, "args": { "External id": 146781, "cbid": 211, "correlation": 146781 } }, { "ph": "s", "id": 146781, "pid": 76337, "tid": -914061504, "ts": 1716454223641187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223700090, "dur": 12, "args": { "External id": 146789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146789, "pid": 5, "tid": 7, "ts": 1716454223700090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641218, "dur": 8, "args": { "External id": 146789, "cbid": 211, "correlation": 146789 } }, { "ph": "s", "id": 146789, "pid": 76337, "tid": -914061504, "ts": 1716454223641218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223700103, "dur": 25, "args": { "External id": 146798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146798, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146798, "pid": 5, "tid": 7, "ts": 1716454223700103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641256, "dur": 10, "args": { "External id": 146798, "cbid": 211, "correlation": 146798 } }, { "ph": "s", "id": 146798, "pid": 76337, "tid": -914061504, "ts": 1716454223641256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223700129, "dur": 25, "args": { "External id": 146818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146818, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 146818, "pid": 5, "tid": 7, "ts": 1716454223700129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641327, "dur": 12, "args": { "External id": 146818, "cbid": 211, "correlation": 146818 } }, { "ph": "s", "id": 146818, "pid": 76337, "tid": -914061504, "ts": 1716454223641327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223700155, "dur": 5, "args": { "External id": 146830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146830, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 146830, "pid": 5, "tid": 7, "ts": 1716454223700155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641349, "dur": 6, "args": { "External id": 146830, "cbid": 211, "correlation": 146830 } }, { "ph": "s", "id": 146830, "pid": 76337, "tid": -914061504, "ts": 1716454223641349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223700161, "dur": 25, "args": { "External id": 146833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146833, "pid": 5, "tid": 7, "ts": 1716454223700161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641367, "dur": 6, "args": { "External id": 146833, "cbid": 211, "correlation": 146833 } }, { "ph": "s", "id": 146833, "pid": 76337, "tid": -914061504, "ts": 1716454223641367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223700187, "dur": 17, "args": { "External id": 146842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146842, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146842, "pid": 5, "tid": 7, "ts": 1716454223700187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641406, "dur": 9, "args": { "External id": 146842, "cbid": 211, "correlation": 146842 } }, { "ph": "s", "id": 146842, "pid": 76337, "tid": -914061504, "ts": 1716454223641406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223641457, "dur": 0, "args": { "External id": 146852, "cbid": 317, "correlation": 146852 } }, { "ph": "f", "id": 146852, "pid": 76337, "tid": -914061504, "ts": 1716454223641457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223641458, "dur": 0, "args": { "External id": 146853, "cbid": 203, "correlation": 146853 } }, { "ph": "f", "id": 146853, "pid": 76337, "tid": -914061504, "ts": 1716454223641458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223641459, "dur": 0, "args": { "External id": 146854, "cbid": 205, "correlation": 146854 } }, { "ph": "f", "id": 146854, "pid": 76337, "tid": -914061504, "ts": 1716454223641459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223700206, "dur": 18, "args": { "External id": 146858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146858, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146858, "pid": 5, "tid": 7, "ts": 1716454223700206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641474, "dur": 12, "args": { "External id": 146858, "cbid": 211, "correlation": 146858 } }, { "ph": "s", "id": 146858, "pid": 76337, "tid": -914061504, "ts": 1716454223641474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223700225, "dur": 243, "args": { "External id": 146860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146860, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146860, "pid": 5, "tid": 7, "ts": 1716454223700225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641489, "dur": 5, "args": { "External id": 146860, "cbid": 211, "correlation": 146860 } }, { "ph": "s", "id": 146860, "pid": 76337, "tid": -914061504, "ts": 1716454223641489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223700470, "dur": 1, "args": { "External id": 146862, "device": 5, "context": 1, "stream": 7, "correlation": 146862, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 146862, "pid": 5, "tid": 7, "ts": 1716454223700470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223641500, "dur": 8, "args": { "External id": 146862, "cbid": 51, "correlation": 146862 } }, { "ph": "s", "id": 146862, "pid": 76337, "tid": -914061504, "ts": 1716454223641500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223700474, "dur": 814, "args": { "External id": 146863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146863, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146863, "pid": 5, "tid": 7, "ts": 1716454223700474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641509, "dur": 6, "args": { "External id": 146863, "cbid": 211, "correlation": 146863 } }, { "ph": "s", "id": 146863, "pid": 76337, "tid": -914061504, "ts": 1716454223641509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223701289, "dur": 14, "args": { "External id": 146865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146865, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146865, "pid": 5, "tid": 7, "ts": 1716454223701289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641519, "dur": 6, "args": { "External id": 146865, "cbid": 211, "correlation": 146865 } }, { "ph": "s", "id": 146865, "pid": 76337, "tid": -914061504, "ts": 1716454223641519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223701303, "dur": 15, "args": { "External id": 146871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146871, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146871, "pid": 5, "tid": 7, "ts": 1716454223701303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641548, "dur": 8, "args": { "External id": 146871, "cbid": 211, "correlation": 146871 } }, { "ph": "s", "id": 146871, "pid": 76337, "tid": -914061504, "ts": 1716454223641548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223701319, "dur": 4, "args": { "External id": 146879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146879, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 146879, "pid": 5, "tid": 7, "ts": 1716454223701319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641592, "dur": 9, "args": { "External id": 146879, "cbid": 211, "correlation": 146879 } }, { "ph": "s", "id": 146879, "pid": 76337, "tid": -914061504, "ts": 1716454223641592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223641656, "dur": 1, "args": { "External id": 146895, "cbid": 251, "correlation": 146895 } }, { "ph": "f", "id": 146895, "pid": 76337, "tid": -914061504, "ts": 1716454223641656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223641661, "dur": 0, "args": { "External id": 146897, "cbid": 251, "correlation": 146897 } }, { "ph": "f", "id": 146897, "pid": 76337, "tid": -914061504, "ts": 1716454223641661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223701325, "dur": 13, "args": { "External id": 146898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146898, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146898, "pid": 5, "tid": 7, "ts": 1716454223701325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641663, "dur": 11, "args": { "External id": 146898, "cbid": 211, "correlation": 146898 } }, { "ph": "s", "id": 146898, "pid": 76337, "tid": -914061504, "ts": 1716454223641663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223701339, "dur": 5, "args": { "External id": 146900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146900, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146900, "pid": 5, "tid": 7, "ts": 1716454223701339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641676, "dur": 5, "args": { "External id": 146900, "cbid": 211, "correlation": 146900 } }, { "ph": "s", "id": 146900, "pid": 76337, "tid": -914061504, "ts": 1716454223641676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223701346, "dur": 17, "args": { "External id": 146910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146910, "pid": 5, "tid": 7, "ts": 1716454223701346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641734, "dur": 12, "args": { "External id": 146910, "cbid": 211, "correlation": 146910 } }, { "ph": "s", "id": 146910, "pid": 76337, "tid": -914061504, "ts": 1716454223641734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223701364, "dur": 17, "args": { "External id": 146930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146930, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 146930, "pid": 5, "tid": 7, "ts": 1716454223701364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641799, "dur": 11, "args": { "External id": 146930, "cbid": 211, "correlation": 146930 } }, { "ph": "s", "id": 146930, "pid": 76337, "tid": -914061504, "ts": 1716454223641799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223701383, "dur": 5, "args": { "External id": 146942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146942, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 146942, "pid": 5, "tid": 7, "ts": 1716454223701383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641820, "dur": 6, "args": { "External id": 146942, "cbid": 211, "correlation": 146942 } }, { "ph": "s", "id": 146942, "pid": 76337, "tid": -914061504, "ts": 1716454223641820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223701389, "dur": 17, "args": { "External id": 146945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146945, "pid": 5, "tid": 7, "ts": 1716454223701389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641838, "dur": 7, "args": { "External id": 146945, "cbid": 211, "correlation": 146945 } }, { "ph": "s", "id": 146945, "pid": 76337, "tid": -914061504, "ts": 1716454223641838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223701407, "dur": 11, "args": { "External id": 146954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146954, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146954, "pid": 5, "tid": 7, "ts": 1716454223701407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641879, "dur": 10, "args": { "External id": 146954, "cbid": 211, "correlation": 146954 } }, { "ph": "s", "id": 146954, "pid": 76337, "tid": -914061504, "ts": 1716454223641879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223641942, "dur": 0, "args": { "External id": 146964, "cbid": 317, "correlation": 146964 } }, { "ph": "f", "id": 146964, "pid": 76337, "tid": -914061504, "ts": 1716454223641942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223641943, "dur": 0, "args": { "External id": 146965, "cbid": 203, "correlation": 146965 } }, { "ph": "f", "id": 146965, "pid": 76337, "tid": -914061504, "ts": 1716454223641943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223641943, "dur": 0, "args": { "External id": 146966, "cbid": 205, "correlation": 146966 } }, { "ph": "f", "id": 146966, "pid": 76337, "tid": -914061504, "ts": 1716454223641943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223701419, "dur": 11, "args": { "External id": 146970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146970, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146970, "pid": 5, "tid": 7, "ts": 1716454223701419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641959, "dur": 12, "args": { "External id": 146970, "cbid": 211, "correlation": 146970 } }, { "ph": "s", "id": 146970, "pid": 76337, "tid": -914061504, "ts": 1716454223641959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223701432, "dur": 163, "args": { "External id": 146972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146972, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146972, "pid": 5, "tid": 7, "ts": 1716454223701432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223641983, "dur": 6, "args": { "External id": 146972, "cbid": 211, "correlation": 146972 } }, { "ph": "s", "id": 146972, "pid": 76337, "tid": -914061504, "ts": 1716454223641983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223701597, "dur": 1, "args": { "External id": 146974, "device": 5, "context": 1, "stream": 7, "correlation": 146974, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 146974, "pid": 5, "tid": 7, "ts": 1716454223701597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223641995, "dur": 6, "args": { "External id": 146974, "cbid": 51, "correlation": 146974 } }, { "ph": "s", "id": 146974, "pid": 76337, "tid": -914061504, "ts": 1716454223641995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223701600, "dur": 650, "args": { "External id": 146975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146975, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 146975, "pid": 5, "tid": 7, "ts": 1716454223701600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642003, "dur": 6, "args": { "External id": 146975, "cbid": 211, "correlation": 146975 } }, { "ph": "s", "id": 146975, "pid": 76337, "tid": -914061504, "ts": 1716454223642003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223702252, "dur": 12, "args": { "External id": 146977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146977, "pid": 5, "tid": 7, "ts": 1716454223702252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642013, "dur": 5, "args": { "External id": 146977, "cbid": 211, "correlation": 146977 } }, { "ph": "s", "id": 146977, "pid": 76337, "tid": -914061504, "ts": 1716454223642013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223702266, "dur": 15, "args": { "External id": 146983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146983, "pid": 5, "tid": 7, "ts": 1716454223702266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642042, "dur": 8, "args": { "External id": 146983, "cbid": 211, "correlation": 146983 } }, { "ph": "s", "id": 146983, "pid": 76337, "tid": -914061504, "ts": 1716454223642042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223642101, "dur": 0, "args": { "External id": 146993, "cbid": 317, "correlation": 146993 } }, { "ph": "f", "id": 146993, "pid": 76337, "tid": -914061504, "ts": 1716454223642101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223642102, "dur": 0, "args": { "External id": 146994, "cbid": 203, "correlation": 146994 } }, { "ph": "f", "id": 146994, "pid": 76337, "tid": -914061504, "ts": 1716454223642102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223642102, "dur": 0, "args": { "External id": 146995, "cbid": 205, "correlation": 146995 } }, { "ph": "f", "id": 146995, "pid": 76337, "tid": -914061504, "ts": 1716454223642102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223702282, "dur": 17, "args": { "External id": 146999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 146999, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 146999, "pid": 5, "tid": 7, "ts": 1716454223702282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642116, "dur": 11, "args": { "External id": 146999, "cbid": 211, "correlation": 146999 } }, { "ph": "s", "id": 146999, "pid": 76337, "tid": -914061504, "ts": 1716454223642116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223702300, "dur": 4, "args": { "External id": 147001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147001, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147001, "pid": 5, "tid": 7, "ts": 1716454223702300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642132, "dur": 7, "args": { "External id": 147001, "cbid": 211, "correlation": 147001 } }, { "ph": "s", "id": 147001, "pid": 76337, "tid": -914061504, "ts": 1716454223642132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223642142, "dur": 0, "args": { "External id": 147002, "cbid": 51, "correlation": 147002 } }, { "ph": "s", "id": 147002, "pid": 76337, "tid": -914061504, "ts": 1716454223642142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223702305, "dur": 132, "args": { "External id": 147003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147003, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 147003, "pid": 5, "tid": 7, "ts": 1716454223702305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642142, "dur": 5, "args": { "External id": 147003, "cbid": 211, "correlation": 147003 } }, { "ph": "s", "id": 147003, "pid": 76337, "tid": -914061504, "ts": 1716454223642142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223702439, "dur": 15, "args": { "External id": 147008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147008, "pid": 5, "tid": 7, "ts": 1716454223702439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642167, "dur": 8, "args": { "External id": 147008, "cbid": 211, "correlation": 147008 } }, { "ph": "s", "id": 147008, "pid": 76337, "tid": -914061504, "ts": 1716454223642167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223702455, "dur": 14, "args": { "External id": 147016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147016, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147016, "pid": 5, "tid": 7, "ts": 1716454223702455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642196, "dur": 8, "args": { "External id": 147016, "cbid": 211, "correlation": 147016 } }, { "ph": "s", "id": 147016, "pid": 76337, "tid": -914061504, "ts": 1716454223642196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223702470, "dur": 10, "args": { "External id": 147024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147024, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147024, "pid": 5, "tid": 7, "ts": 1716454223702470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642225, "dur": 8, "args": { "External id": 147024, "cbid": 211, "correlation": 147024 } }, { "ph": "s", "id": 147024, "pid": 76337, "tid": -914061504, "ts": 1716454223642225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223702482, "dur": 19, "args": { "External id": 147044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147044, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 147044, "pid": 5, "tid": 7, "ts": 1716454223702482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642308, "dur": 12, "args": { "External id": 147044, "cbid": 211, "correlation": 147044 } }, { "ph": "s", "id": 147044, "pid": 76337, "tid": -914061504, "ts": 1716454223642308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223702502, "dur": 4, "args": { "External id": 147056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147056, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 147056, "pid": 5, "tid": 7, "ts": 1716454223702502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642330, "dur": 6, "args": { "External id": 147056, "cbid": 211, "correlation": 147056 } }, { "ph": "s", "id": 147056, "pid": 76337, "tid": -914061504, "ts": 1716454223642330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223702507, "dur": 18, "args": { "External id": 147059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147059, "pid": 5, "tid": 7, "ts": 1716454223702507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642348, "dur": 6, "args": { "External id": 147059, "cbid": 211, "correlation": 147059 } }, { "ph": "s", "id": 147059, "pid": 76337, "tid": -914061504, "ts": 1716454223642348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223642404, "dur": 0, "args": { "External id": 147070, "cbid": 317, "correlation": 147070 } }, { "ph": "f", "id": 147070, "pid": 76337, "tid": -914061504, "ts": 1716454223642404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223642404, "dur": 0, "args": { "External id": 147071, "cbid": 203, "correlation": 147071 } }, { "ph": "f", "id": 147071, "pid": 76337, "tid": -914061504, "ts": 1716454223642404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223642405, "dur": 0, "args": { "External id": 147072, "cbid": 205, "correlation": 147072 } }, { "ph": "f", "id": 147072, "pid": 76337, "tid": -914061504, "ts": 1716454223642405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223702526, "dur": 11, "args": { "External id": 147076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147076, "pid": 5, "tid": 7, "ts": 1716454223702526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642418, "dur": 11, "args": { "External id": 147076, "cbid": 211, "correlation": 147076 } }, { "ph": "s", "id": 147076, "pid": 76337, "tid": -914061504, "ts": 1716454223642418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223702539, "dur": 3, "args": { "External id": 147078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147078, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147078, "pid": 5, "tid": 7, "ts": 1716454223702539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642434, "dur": 7, "args": { "External id": 147078, "cbid": 211, "correlation": 147078 } }, { "ph": "s", "id": 147078, "pid": 76337, "tid": -914061504, "ts": 1716454223642434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223642444, "dur": 0, "args": { "External id": 147079, "cbid": 51, "correlation": 147079 } }, { "ph": "s", "id": 147079, "pid": 76337, "tid": -914061504, "ts": 1716454223642444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223702544, "dur": 90, "args": { "External id": 147080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147080, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 147080, "pid": 5, "tid": 7, "ts": 1716454223702544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642444, "dur": 5, "args": { "External id": 147080, "cbid": 211, "correlation": 147080 } }, { "ph": "s", "id": 147080, "pid": 76337, "tid": -914061504, "ts": 1716454223642444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223702635, "dur": 16, "args": { "External id": 147085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147085, "pid": 5, "tid": 7, "ts": 1716454223702635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642470, "dur": 8, "args": { "External id": 147085, "cbid": 211, "correlation": 147085 } }, { "ph": "s", "id": 147085, "pid": 76337, "tid": -914061504, "ts": 1716454223642470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223702652, "dur": 85, "args": { "External id": 147094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147094, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147094, "pid": 5, "tid": 7, "ts": 1716454223702652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642550, "dur": 14, "args": { "External id": 147094, "cbid": 211, "correlation": 147094 } }, { "ph": "s", "id": 147094, "pid": 76337, "tid": -914061504, "ts": 1716454223642550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223702738, "dur": 30, "args": { "External id": 147116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147116, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147116, "pid": 5, "tid": 7, "ts": 1716454223702738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642606, "dur": 10, "args": { "External id": 147116, "cbid": 211, "correlation": 147116 } }, { "ph": "s", "id": 147116, "pid": 76337, "tid": -914061504, "ts": 1716454223642606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223642696, "dur": 1, "args": { "External id": 147127, "cbid": 251, "correlation": 147127 } }, { "ph": "f", "id": 147127, "pid": 76337, "tid": -914061504, "ts": 1716454223642696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223702770, "dur": 165, "args": { "External id": 147128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147128, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147128, "pid": 5, "tid": 7, "ts": 1716454223702770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642701, "dur": 13, "args": { "External id": 147128, "cbid": 211, "correlation": 147128 } }, { "ph": "s", "id": 147128, "pid": 76337, "tid": -914061504, "ts": 1716454223642701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223642771, "dur": 1, "args": { "External id": 147139, "cbid": 251, "correlation": 147139 } }, { "ph": "f", "id": 147139, "pid": 76337, "tid": -914061504, "ts": 1716454223642771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223702936, "dur": 158, "args": { "External id": 147140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147140, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147140, "pid": 5, "tid": 7, "ts": 1716454223702936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642775, "dur": 12, "args": { "External id": 147140, "cbid": 211, "correlation": 147140 } }, { "ph": "s", "id": 147140, "pid": 76337, "tid": -914061504, "ts": 1716454223642775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223642840, "dur": 1, "args": { "External id": 147151, "cbid": 251, "correlation": 147151 } }, { "ph": "f", "id": 147151, "pid": 76337, "tid": -914061504, "ts": 1716454223642840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223703095, "dur": 158, "args": { "External id": 147152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147152, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147152, "pid": 5, "tid": 7, "ts": 1716454223703095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642844, "dur": 11, "args": { "External id": 147152, "cbid": 211, "correlation": 147152 } }, { "ph": "s", "id": 147152, "pid": 76337, "tid": -914061504, "ts": 1716454223642844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223703255, "dur": 334, "args": { "External id": 147177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147177, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147177, "pid": 5, "tid": 7, "ts": 1716454223703255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223642926, "dur": 13, "args": { "External id": 147177, "cbid": 211, "correlation": 147177 } }, { "ph": "s", "id": 147177, "pid": 76337, "tid": -914061504, "ts": 1716454223642926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643035, "dur": 1, "args": { "External id": 147195, "cbid": 251, "correlation": 147195 } }, { "ph": "f", "id": 147195, "pid": 76337, "tid": -914061504, "ts": 1716454223643035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223703590, "dur": 164, "args": { "External id": 147197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147197, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147197, "pid": 5, "tid": 7, "ts": 1716454223703590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643041, "dur": 14, "args": { "External id": 147197, "cbid": 211, "correlation": 147197 } }, { "ph": "s", "id": 147197, "pid": 76337, "tid": -914061504, "ts": 1716454223643041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223703755, "dur": 20, "args": { "External id": 147205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147205, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147205, "pid": 5, "tid": 7, "ts": 1716454223703755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643113, "dur": 12, "args": { "External id": 147205, "cbid": 211, "correlation": 147205 } }, { "ph": "s", "id": 147205, "pid": 76337, "tid": -914061504, "ts": 1716454223643113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223703776, "dur": 28, "args": { "External id": 147213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147213, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147213, "pid": 5, "tid": 7, "ts": 1716454223703776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643153, "dur": 8, "args": { "External id": 147213, "cbid": 211, "correlation": 147213 } }, { "ph": "s", "id": 147213, "pid": 76337, "tid": -914061504, "ts": 1716454223643153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223703805, "dur": 19, "args": { "External id": 147224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147224, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147224, "pid": 5, "tid": 7, "ts": 1716454223703805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643224, "dur": 12, "args": { "External id": 147224, "cbid": 211, "correlation": 147224 } }, { "ph": "s", "id": 147224, "pid": 76337, "tid": -914061504, "ts": 1716454223643224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223703825, "dur": 16, "args": { "External id": 147246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147246, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147246, "pid": 5, "tid": 7, "ts": 1716454223703825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643256, "dur": 7, "args": { "External id": 147246, "cbid": 211, "correlation": 147246 } }, { "ph": "s", "id": 147246, "pid": 76337, "tid": -914061504, "ts": 1716454223643256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643341, "dur": 1, "args": { "External id": 147257, "cbid": 251, "correlation": 147257 } }, { "ph": "f", "id": 147257, "pid": 76337, "tid": -914061504, "ts": 1716454223643341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223703843, "dur": 90, "args": { "External id": 147258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147258, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147258, "pid": 5, "tid": 7, "ts": 1716454223703843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643346, "dur": 13, "args": { "External id": 147258, "cbid": 211, "correlation": 147258 } }, { "ph": "s", "id": 147258, "pid": 76337, "tid": -914061504, "ts": 1716454223643346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643415, "dur": 1, "args": { "External id": 147269, "cbid": 251, "correlation": 147269 } }, { "ph": "f", "id": 147269, "pid": 76337, "tid": -914061504, "ts": 1716454223643415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643419, "dur": 0, "args": { "External id": 147270, "cbid": 251, "correlation": 147270 } }, { "ph": "f", "id": 147270, "pid": 76337, "tid": -914061504, "ts": 1716454223643419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223703934, "dur": 12, "args": { "External id": 147271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147271, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147271, "pid": 5, "tid": 7, "ts": 1716454223703934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643420, "dur": 11, "args": { "External id": 147271, "cbid": 211, "correlation": 147271 } }, { "ph": "s", "id": 147271, "pid": 76337, "tid": -914061504, "ts": 1716454223643420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223703948, "dur": 5, "args": { "External id": 147273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147273, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147273, "pid": 5, "tid": 7, "ts": 1716454223703948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643433, "dur": 7, "args": { "External id": 147273, "cbid": 211, "correlation": 147273 } }, { "ph": "s", "id": 147273, "pid": 76337, "tid": -914061504, "ts": 1716454223643433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643492, "dur": 1, "args": { "External id": 147284, "cbid": 251, "correlation": 147284 } }, { "ph": "f", "id": 147284, "pid": 76337, "tid": -914061504, "ts": 1716454223643492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643495, "dur": 0, "args": { "External id": 147285, "cbid": 251, "correlation": 147285 } }, { "ph": "f", "id": 147285, "pid": 76337, "tid": -914061504, "ts": 1716454223643495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223703954, "dur": 8, "args": { "External id": 147286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147286, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147286, "pid": 5, "tid": 7, "ts": 1716454223703954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643497, "dur": 12, "args": { "External id": 147286, "cbid": 211, "correlation": 147286 } }, { "ph": "s", "id": 147286, "pid": 76337, "tid": -914061504, "ts": 1716454223643497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223703964, "dur": 4, "args": { "External id": 147288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147288, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147288, "pid": 5, "tid": 7, "ts": 1716454223703964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643510, "dur": 5, "args": { "External id": 147288, "cbid": 211, "correlation": 147288 } }, { "ph": "s", "id": 147288, "pid": 76337, "tid": -914061504, "ts": 1716454223643510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223703969, "dur": 56, "args": { "External id": 147313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147313, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147313, "pid": 5, "tid": 7, "ts": 1716454223703969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643585, "dur": 12, "args": { "External id": 147313, "cbid": 211, "correlation": 147313 } }, { "ph": "s", "id": 147313, "pid": 76337, "tid": -914061504, "ts": 1716454223643585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643683, "dur": 1, "args": { "External id": 147331, "cbid": 251, "correlation": 147331 } }, { "ph": "f", "id": 147331, "pid": 76337, "tid": -914061504, "ts": 1716454223643683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223704026, "dur": 91, "args": { "External id": 147333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147333, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147333, "pid": 5, "tid": 7, "ts": 1716454223704026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643689, "dur": 13, "args": { "External id": 147333, "cbid": 211, "correlation": 147333 } }, { "ph": "s", "id": 147333, "pid": 76337, "tid": -914061504, "ts": 1716454223643689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223704118, "dur": 9, "args": { "External id": 147341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147341, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147341, "pid": 5, "tid": 7, "ts": 1716454223704118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643760, "dur": 12, "args": { "External id": 147341, "cbid": 211, "correlation": 147341 } }, { "ph": "s", "id": 147341, "pid": 76337, "tid": -914061504, "ts": 1716454223643760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223704129, "dur": 22, "args": { "External id": 147349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147349, "pid": 5, "tid": 7, "ts": 1716454223704129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643801, "dur": 10, "args": { "External id": 147349, "cbid": 211, "correlation": 147349 } }, { "ph": "s", "id": 147349, "pid": 76337, "tid": -914061504, "ts": 1716454223643801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223704152, "dur": 18, "args": { "External id": 147371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147371, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147371, "pid": 5, "tid": 7, "ts": 1716454223704152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643853, "dur": 11, "args": { "External id": 147371, "cbid": 211, "correlation": 147371 } }, { "ph": "s", "id": 147371, "pid": 76337, "tid": -914061504, "ts": 1716454223643853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643940, "dur": 1, "args": { "External id": 147387, "cbid": 251, "correlation": 147387 } }, { "ph": "f", "id": 147387, "pid": 76337, "tid": -914061504, "ts": 1716454223643940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223643945, "dur": 0, "args": { "External id": 147389, "cbid": 251, "correlation": 147389 } }, { "ph": "f", "id": 147389, "pid": 76337, "tid": -914061504, "ts": 1716454223643945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223704171, "dur": 496, "args": { "External id": 147390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147390, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147390, "pid": 5, "tid": 7, "ts": 1716454223704171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223643947, "dur": 13, "args": { "External id": 147390, "cbid": 211, "correlation": 147390 } }, { "ph": "s", "id": 147390, "pid": 76337, "tid": -914061504, "ts": 1716454223643947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223704669, "dur": 67, "args": { "External id": 147398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147398, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147398, "pid": 5, "tid": 7, "ts": 1716454223704669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644022, "dur": 13, "args": { "External id": 147398, "cbid": 211, "correlation": 147398 } }, { "ph": "s", "id": 147398, "pid": 76337, "tid": -914061504, "ts": 1716454223644022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223704737, "dur": 66, "args": { "External id": 147406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147406, "pid": 5, "tid": 7, "ts": 1716454223704737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644053, "dur": 9, "args": { "External id": 147406, "cbid": 211, "correlation": 147406 } }, { "ph": "s", "id": 147406, "pid": 76337, "tid": -914061504, "ts": 1716454223644053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223644132, "dur": 1, "args": { "External id": 147422, "cbid": 251, "correlation": 147422 } }, { "ph": "f", "id": 147422, "pid": 76337, "tid": -914061504, "ts": 1716454223644132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223704806, "dur": 1, "args": { "External id": 147424, "device": 5, "context": 1, "stream": 7, "correlation": 147424, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 147424, "pid": 5, "tid": 7, "ts": 1716454223704806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223644137, "dur": 9, "args": { "External id": 147424, "cbid": 51, "correlation": 147424 } }, { "ph": "s", "id": 147424, "pid": 76337, "tid": -914061504, "ts": 1716454223644137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223704809, "dur": 270, "args": { "External id": 147425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147425, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147425, "pid": 5, "tid": 7, "ts": 1716454223704809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644148, "dur": 12, "args": { "External id": 147425, "cbid": 211, "correlation": 147425 } }, { "ph": "s", "id": 147425, "pid": 76337, "tid": -914061504, "ts": 1716454223644148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223705081, "dur": 14, "args": { "External id": 147433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147433, "pid": 5, "tid": 7, "ts": 1716454223705081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644191, "dur": 10, "args": { "External id": 147433, "cbid": 211, "correlation": 147433 } }, { "ph": "s", "id": 147433, "pid": 76337, "tid": -914061504, "ts": 1716454223644191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223705096, "dur": 38, "args": { "External id": 147444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147444, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147444, "pid": 5, "tid": 7, "ts": 1716454223705096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644259, "dur": 13, "args": { "External id": 147444, "cbid": 211, "correlation": 147444 } }, { "ph": "s", "id": 147444, "pid": 76337, "tid": -914061504, "ts": 1716454223644259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223644324, "dur": 0, "args": { "External id": 147456, "cbid": 317, "correlation": 147456 } }, { "ph": "f", "id": 147456, "pid": 76337, "tid": -914061504, "ts": 1716454223644324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223644324, "dur": 0, "args": { "External id": 147457, "cbid": 203, "correlation": 147457 } }, { "ph": "f", "id": 147457, "pid": 76337, "tid": -914061504, "ts": 1716454223644324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223644325, "dur": 0, "args": { "External id": 147458, "cbid": 205, "correlation": 147458 } }, { "ph": "f", "id": 147458, "pid": 76337, "tid": -914061504, "ts": 1716454223644325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223705136, "dur": 13, "args": { "External id": 147462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147462, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147462, "pid": 5, "tid": 7, "ts": 1716454223705136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644340, "dur": 12, "args": { "External id": 147462, "cbid": 211, "correlation": 147462 } }, { "ph": "s", "id": 147462, "pid": 76337, "tid": -914061504, "ts": 1716454223644340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223705150, "dur": 4, "args": { "External id": 147464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147464, "pid": 5, "tid": 7, "ts": 1716454223705150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644356, "dur": 6, "args": { "External id": 147464, "cbid": 211, "correlation": 147464 } }, { "ph": "s", "id": 147464, "pid": 76337, "tid": -914061504, "ts": 1716454223644356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223644365, "dur": 0, "args": { "External id": 147465, "cbid": 51, "correlation": 147465 } }, { "ph": "s", "id": 147465, "pid": 76337, "tid": -914061504, "ts": 1716454223644365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223705155, "dur": 98, "args": { "External id": 147466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147466, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 147466, "pid": 5, "tid": 7, "ts": 1716454223705155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644366, "dur": 5, "args": { "External id": 147466, "cbid": 211, "correlation": 147466 } }, { "ph": "s", "id": 147466, "pid": 76337, "tid": -914061504, "ts": 1716454223644366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223705254, "dur": 16, "args": { "External id": 147471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147471, "pid": 5, "tid": 7, "ts": 1716454223705254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644393, "dur": 8, "args": { "External id": 147471, "cbid": 211, "correlation": 147471 } }, { "ph": "s", "id": 147471, "pid": 76337, "tid": -914061504, "ts": 1716454223644393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223705272, "dur": 12, "args": { "External id": 147479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147479, "pid": 5, "tid": 7, "ts": 1716454223705272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644425, "dur": 8, "args": { "External id": 147479, "cbid": 211, "correlation": 147479 } }, { "ph": "s", "id": 147479, "pid": 76337, "tid": -914061504, "ts": 1716454223644425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223705285, "dur": 56, "args": { "External id": 147490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147490, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147490, "pid": 5, "tid": 7, "ts": 1716454223705285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644488, "dur": 12, "args": { "External id": 147490, "cbid": 211, "correlation": 147490 } }, { "ph": "s", "id": 147490, "pid": 76337, "tid": -914061504, "ts": 1716454223644488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223644543, "dur": 0, "args": { "External id": 147500, "cbid": 317, "correlation": 147500 } }, { "ph": "f", "id": 147500, "pid": 76337, "tid": -914061504, "ts": 1716454223644543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223644544, "dur": 0, "args": { "External id": 147501, "cbid": 203, "correlation": 147501 } }, { "ph": "f", "id": 147501, "pid": 76337, "tid": -914061504, "ts": 1716454223644544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223644544, "dur": 0, "args": { "External id": 147502, "cbid": 205, "correlation": 147502 } }, { "ph": "f", "id": 147502, "pid": 76337, "tid": -914061504, "ts": 1716454223644544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223705342, "dur": 40, "args": { "External id": 147506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147506, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147506, "pid": 5, "tid": 7, "ts": 1716454223705342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644562, "dur": 11, "args": { "External id": 147506, "cbid": 211, "correlation": 147506 } }, { "ph": "s", "id": 147506, "pid": 76337, "tid": -914061504, "ts": 1716454223644562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223705384, "dur": 164, "args": { "External id": 147508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147508, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147508, "pid": 5, "tid": 7, "ts": 1716454223705384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644575, "dur": 5, "args": { "External id": 147508, "cbid": 211, "correlation": 147508 } }, { "ph": "s", "id": 147508, "pid": 76337, "tid": -914061504, "ts": 1716454223644575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223705549, "dur": 1973, "args": { "External id": 147510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147510, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147510, "pid": 5, "tid": 7, "ts": 1716454223705549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644588, "dur": 10, "args": { "External id": 147510, "cbid": 211, "correlation": 147510 } }, { "ph": "s", "id": 147510, "pid": 76337, "tid": -914061504, "ts": 1716454223644588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223707524, "dur": 40, "args": { "External id": 147512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147512, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147512, "pid": 5, "tid": 7, "ts": 1716454223707524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644602, "dur": 6, "args": { "External id": 147512, "cbid": 211, "correlation": 147512 } }, { "ph": "s", "id": 147512, "pid": 76337, "tid": -914061504, "ts": 1716454223644602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223707565, "dur": 60, "args": { "External id": 147518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147518, "pid": 5, "tid": 7, "ts": 1716454223707565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644630, "dur": 9, "args": { "External id": 147518, "cbid": 211, "correlation": 147518 } }, { "ph": "s", "id": 147518, "pid": 76337, "tid": -914061504, "ts": 1716454223644630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223707626, "dur": 83, "args": { "External id": 147527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147527, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147527, "pid": 5, "tid": 7, "ts": 1716454223707626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644722, "dur": 13, "args": { "External id": 147527, "cbid": 211, "correlation": 147527 } }, { "ph": "s", "id": 147527, "pid": 76337, "tid": -914061504, "ts": 1716454223644722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223707711, "dur": 73, "args": { "External id": 147547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147547, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 147547, "pid": 5, "tid": 7, "ts": 1716454223707711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644792, "dur": 11, "args": { "External id": 147547, "cbid": 211, "correlation": 147547 } }, { "ph": "s", "id": 147547, "pid": 76337, "tid": -914061504, "ts": 1716454223644792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223707785, "dur": 5, "args": { "External id": 147559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147559, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 147559, "pid": 5, "tid": 7, "ts": 1716454223707785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644815, "dur": 6, "args": { "External id": 147559, "cbid": 211, "correlation": 147559 } }, { "ph": "s", "id": 147559, "pid": 76337, "tid": -914061504, "ts": 1716454223644815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223707791, "dur": 81, "args": { "External id": 147562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147562, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147562, "pid": 5, "tid": 7, "ts": 1716454223707791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644833, "dur": 6, "args": { "External id": 147562, "cbid": 211, "correlation": 147562 } }, { "ph": "s", "id": 147562, "pid": 76337, "tid": -914061504, "ts": 1716454223644833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223707873, "dur": 53, "args": { "External id": 147571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147571, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147571, "pid": 5, "tid": 7, "ts": 1716454223707873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644874, "dur": 9, "args": { "External id": 147571, "cbid": 211, "correlation": 147571 } }, { "ph": "s", "id": 147571, "pid": 76337, "tid": -914061504, "ts": 1716454223644874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223644925, "dur": 0, "args": { "External id": 147581, "cbid": 317, "correlation": 147581 } }, { "ph": "f", "id": 147581, "pid": 76337, "tid": -914061504, "ts": 1716454223644925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223644926, "dur": 0, "args": { "External id": 147582, "cbid": 203, "correlation": 147582 } }, { "ph": "f", "id": 147582, "pid": 76337, "tid": -914061504, "ts": 1716454223644926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223644926, "dur": 0, "args": { "External id": 147583, "cbid": 205, "correlation": 147583 } }, { "ph": "f", "id": 147583, "pid": 76337, "tid": -914061504, "ts": 1716454223644926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223707928, "dur": 58, "args": { "External id": 147587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147587, "pid": 5, "tid": 7, "ts": 1716454223707928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644944, "dur": 11, "args": { "External id": 147587, "cbid": 211, "correlation": 147587 } }, { "ph": "s", "id": 147587, "pid": 76337, "tid": -914061504, "ts": 1716454223644944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223707987, "dur": 123, "args": { "External id": 147589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147589, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147589, "pid": 5, "tid": 7, "ts": 1716454223707987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644958, "dur": 6, "args": { "External id": 147589, "cbid": 211, "correlation": 147589 } }, { "ph": "s", "id": 147589, "pid": 76337, "tid": -914061504, "ts": 1716454223644958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223708111, "dur": 1902, "args": { "External id": 147591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147591, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147591, "pid": 5, "tid": 7, "ts": 1716454223708111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644970, "dur": 14, "args": { "External id": 147591, "cbid": 211, "correlation": 147591 } }, { "ph": "s", "id": 147591, "pid": 76337, "tid": -914061504, "ts": 1716454223644970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223710015, "dur": 20, "args": { "External id": 147593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147593, "pid": 5, "tid": 7, "ts": 1716454223710015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223644988, "dur": 5, "args": { "External id": 147593, "cbid": 211, "correlation": 147593 } }, { "ph": "s", "id": 147593, "pid": 76337, "tid": -914061504, "ts": 1716454223644988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223710036, "dur": 33, "args": { "External id": 147599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147599, "pid": 5, "tid": 7, "ts": 1716454223710036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645017, "dur": 8, "args": { "External id": 147599, "cbid": 211, "correlation": 147599 } }, { "ph": "s", "id": 147599, "pid": 76337, "tid": -914061504, "ts": 1716454223645017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223710071, "dur": 4, "args": { "External id": 147607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147607, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 147607, "pid": 5, "tid": 7, "ts": 1716454223710071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645062, "dur": 9, "args": { "External id": 147607, "cbid": 211, "correlation": 147607 } }, { "ph": "s", "id": 147607, "pid": 76337, "tid": -914061504, "ts": 1716454223645062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223645125, "dur": 1, "args": { "External id": 147623, "cbid": 251, "correlation": 147623 } }, { "ph": "f", "id": 147623, "pid": 76337, "tid": -914061504, "ts": 1716454223645125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223645131, "dur": 0, "args": { "External id": 147625, "cbid": 251, "correlation": 147625 } }, { "ph": "f", "id": 147625, "pid": 76337, "tid": -914061504, "ts": 1716454223645131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223710076, "dur": 12, "args": { "External id": 147626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147626, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 147626, "pid": 5, "tid": 7, "ts": 1716454223710076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645133, "dur": 11, "args": { "External id": 147626, "cbid": 211, "correlation": 147626 } }, { "ph": "s", "id": 147626, "pid": 76337, "tid": -914061504, "ts": 1716454223645133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223710090, "dur": 5, "args": { "External id": 147628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147628, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 147628, "pid": 5, "tid": 7, "ts": 1716454223710090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645146, "dur": 6, "args": { "External id": 147628, "cbid": 211, "correlation": 147628 } }, { "ph": "s", "id": 147628, "pid": 76337, "tid": -914061504, "ts": 1716454223645146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223710096, "dur": 29, "args": { "External id": 147638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147638, "pid": 5, "tid": 7, "ts": 1716454223710096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645205, "dur": 13, "args": { "External id": 147638, "cbid": 211, "correlation": 147638 } }, { "ph": "s", "id": 147638, "pid": 76337, "tid": -914061504, "ts": 1716454223645205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223710127, "dur": 31, "args": { "External id": 147658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147658, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 147658, "pid": 5, "tid": 7, "ts": 1716454223710127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645271, "dur": 10, "args": { "External id": 147658, "cbid": 211, "correlation": 147658 } }, { "ph": "s", "id": 147658, "pid": 76337, "tid": -914061504, "ts": 1716454223645271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223710159, "dur": 4, "args": { "External id": 147670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147670, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 147670, "pid": 5, "tid": 7, "ts": 1716454223710159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645292, "dur": 6, "args": { "External id": 147670, "cbid": 211, "correlation": 147670 } }, { "ph": "s", "id": 147670, "pid": 76337, "tid": -914061504, "ts": 1716454223645292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223710165, "dur": 30, "args": { "External id": 147673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147673, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147673, "pid": 5, "tid": 7, "ts": 1716454223710165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645311, "dur": 6, "args": { "External id": 147673, "cbid": 211, "correlation": 147673 } }, { "ph": "s", "id": 147673, "pid": 76337, "tid": -914061504, "ts": 1716454223645311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223710197, "dur": 21, "args": { "External id": 147682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147682, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147682, "pid": 5, "tid": 7, "ts": 1716454223710197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645350, "dur": 10, "args": { "External id": 147682, "cbid": 211, "correlation": 147682 } }, { "ph": "s", "id": 147682, "pid": 76337, "tid": -914061504, "ts": 1716454223645350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223645412, "dur": 0, "args": { "External id": 147692, "cbid": 317, "correlation": 147692 } }, { "ph": "f", "id": 147692, "pid": 76337, "tid": -914061504, "ts": 1716454223645412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223645413, "dur": 0, "args": { "External id": 147693, "cbid": 203, "correlation": 147693 } }, { "ph": "f", "id": 147693, "pid": 76337, "tid": -914061504, "ts": 1716454223645413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223645414, "dur": 0, "args": { "External id": 147694, "cbid": 205, "correlation": 147694 } }, { "ph": "f", "id": 147694, "pid": 76337, "tid": -914061504, "ts": 1716454223645414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223710219, "dur": 22, "args": { "External id": 147698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147698, "pid": 5, "tid": 7, "ts": 1716454223710219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645431, "dur": 12, "args": { "External id": 147698, "cbid": 211, "correlation": 147698 } }, { "ph": "s", "id": 147698, "pid": 76337, "tid": -914061504, "ts": 1716454223645431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223710242, "dur": 45, "args": { "External id": 147700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147700, "pid": 5, "tid": 7, "ts": 1716454223710242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645445, "dur": 5, "args": { "External id": 147700, "cbid": 211, "correlation": 147700 } }, { "ph": "s", "id": 147700, "pid": 76337, "tid": -914061504, "ts": 1716454223645445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223710288, "dur": 648, "args": { "External id": 147702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147702, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147702, "pid": 5, "tid": 7, "ts": 1716454223710288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645457, "dur": 8, "args": { "External id": 147702, "cbid": 211, "correlation": 147702 } }, { "ph": "s", "id": 147702, "pid": 76337, "tid": -914061504, "ts": 1716454223645457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223710938, "dur": 23, "args": { "External id": 147704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147704, "pid": 5, "tid": 7, "ts": 1716454223710938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645468, "dur": 5, "args": { "External id": 147704, "cbid": 211, "correlation": 147704 } }, { "ph": "s", "id": 147704, "pid": 76337, "tid": -914061504, "ts": 1716454223645468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223710962, "dur": 33, "args": { "External id": 147710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147710, "pid": 5, "tid": 7, "ts": 1716454223710962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645495, "dur": 8, "args": { "External id": 147710, "cbid": 211, "correlation": 147710 } }, { "ph": "s", "id": 147710, "pid": 76337, "tid": -914061504, "ts": 1716454223645495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223645552, "dur": 0, "args": { "External id": 147720, "cbid": 317, "correlation": 147720 } }, { "ph": "f", "id": 147720, "pid": 76337, "tid": -914061504, "ts": 1716454223645552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223645553, "dur": 0, "args": { "External id": 147721, "cbid": 203, "correlation": 147721 } }, { "ph": "f", "id": 147721, "pid": 76337, "tid": -914061504, "ts": 1716454223645553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223645554, "dur": 0, "args": { "External id": 147722, "cbid": 205, "correlation": 147722 } }, { "ph": "f", "id": 147722, "pid": 76337, "tid": -914061504, "ts": 1716454223645554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223710996, "dur": 56, "args": { "External id": 147726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147726, "pid": 5, "tid": 7, "ts": 1716454223710996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645567, "dur": 11, "args": { "External id": 147726, "cbid": 211, "correlation": 147726 } }, { "ph": "s", "id": 147726, "pid": 76337, "tid": -914061504, "ts": 1716454223645567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223711054, "dur": 271, "args": { "External id": 147728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147728, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147728, "pid": 5, "tid": 7, "ts": 1716454223711054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645585, "dur": 8, "args": { "External id": 147728, "cbid": 211, "correlation": 147728 } }, { "ph": "s", "id": 147728, "pid": 76337, "tid": -914061504, "ts": 1716454223645585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223711326, "dur": 21, "args": { "External id": 147730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147730, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147730, "pid": 5, "tid": 7, "ts": 1716454223711326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645597, "dur": 5, "args": { "External id": 147730, "cbid": 211, "correlation": 147730 } }, { "ph": "s", "id": 147730, "pid": 76337, "tid": -914061504, "ts": 1716454223645597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223711349, "dur": 33, "args": { "External id": 147736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147736, "pid": 5, "tid": 7, "ts": 1716454223711349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645622, "dur": 8, "args": { "External id": 147736, "cbid": 211, "correlation": 147736 } }, { "ph": "s", "id": 147736, "pid": 76337, "tid": -914061504, "ts": 1716454223645622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223711383, "dur": 27, "args": { "External id": 147744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147744, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147744, "pid": 5, "tid": 7, "ts": 1716454223711383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645650, "dur": 8, "args": { "External id": 147744, "cbid": 211, "correlation": 147744 } }, { "ph": "s", "id": 147744, "pid": 76337, "tid": -914061504, "ts": 1716454223645650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223711411, "dur": 19, "args": { "External id": 147752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147752, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147752, "pid": 5, "tid": 7, "ts": 1716454223711411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645679, "dur": 9, "args": { "External id": 147752, "cbid": 211, "correlation": 147752 } }, { "ph": "s", "id": 147752, "pid": 76337, "tid": -914061504, "ts": 1716454223645679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223711432, "dur": 30, "args": { "External id": 147772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147772, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 147772, "pid": 5, "tid": 7, "ts": 1716454223711432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645761, "dur": 12, "args": { "External id": 147772, "cbid": 211, "correlation": 147772 } }, { "ph": "s", "id": 147772, "pid": 76337, "tid": -914061504, "ts": 1716454223645761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223711464, "dur": 4, "args": { "External id": 147784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147784, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 147784, "pid": 5, "tid": 7, "ts": 1716454223711464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645782, "dur": 6, "args": { "External id": 147784, "cbid": 211, "correlation": 147784 } }, { "ph": "s", "id": 147784, "pid": 76337, "tid": -914061504, "ts": 1716454223645782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223711469, "dur": 31, "args": { "External id": 147787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147787, "pid": 5, "tid": 7, "ts": 1716454223711469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645800, "dur": 6, "args": { "External id": 147787, "cbid": 211, "correlation": 147787 } }, { "ph": "s", "id": 147787, "pid": 76337, "tid": -914061504, "ts": 1716454223645800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223645855, "dur": 0, "args": { "External id": 147798, "cbid": 317, "correlation": 147798 } }, { "ph": "f", "id": 147798, "pid": 76337, "tid": -914061504, "ts": 1716454223645855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223645856, "dur": 0, "args": { "External id": 147799, "cbid": 203, "correlation": 147799 } }, { "ph": "f", "id": 147799, "pid": 76337, "tid": -914061504, "ts": 1716454223645856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223645857, "dur": 0, "args": { "External id": 147800, "cbid": 205, "correlation": 147800 } }, { "ph": "f", "id": 147800, "pid": 76337, "tid": -914061504, "ts": 1716454223645857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223711502, "dur": 22, "args": { "External id": 147804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147804, "pid": 5, "tid": 7, "ts": 1716454223711502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645873, "dur": 11, "args": { "External id": 147804, "cbid": 211, "correlation": 147804 } }, { "ph": "s", "id": 147804, "pid": 76337, "tid": -914061504, "ts": 1716454223645873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223711525, "dur": 106, "args": { "External id": 147806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147806, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147806, "pid": 5, "tid": 7, "ts": 1716454223711525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645890, "dur": 6, "args": { "External id": 147806, "cbid": 211, "correlation": 147806 } }, { "ph": "s", "id": 147806, "pid": 76337, "tid": -914061504, "ts": 1716454223645890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223711632, "dur": 23, "args": { "External id": 147808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147808, "pid": 5, "tid": 7, "ts": 1716454223711632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645900, "dur": 5, "args": { "External id": 147808, "cbid": 211, "correlation": 147808 } }, { "ph": "s", "id": 147808, "pid": 76337, "tid": -914061504, "ts": 1716454223645900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223711656, "dur": 32, "args": { "External id": 147814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147814, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147814, "pid": 5, "tid": 7, "ts": 1716454223711656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223645927, "dur": 8, "args": { "External id": 147814, "cbid": 211, "correlation": 147814 } }, { "ph": "s", "id": 147814, "pid": 76337, "tid": -914061504, "ts": 1716454223645927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223711690, "dur": 195, "args": { "External id": 147823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147823, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147823, "pid": 5, "tid": 7, "ts": 1716454223711690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646020, "dur": 15, "args": { "External id": 147823, "cbid": 211, "correlation": 147823 } }, { "ph": "s", "id": 147823, "pid": 76337, "tid": -914061504, "ts": 1716454223646020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223711886, "dur": 64, "args": { "External id": 147845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147845, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147845, "pid": 5, "tid": 7, "ts": 1716454223711886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646079, "dur": 10, "args": { "External id": 147845, "cbid": 211, "correlation": 147845 } }, { "ph": "s", "id": 147845, "pid": 76337, "tid": -914061504, "ts": 1716454223646079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646170, "dur": 1, "args": { "External id": 147856, "cbid": 251, "correlation": 147856 } }, { "ph": "f", "id": 147856, "pid": 76337, "tid": -914061504, "ts": 1716454223646170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223711951, "dur": 155, "args": { "External id": 147857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147857, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147857, "pid": 5, "tid": 7, "ts": 1716454223711951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646176, "dur": 13, "args": { "External id": 147857, "cbid": 211, "correlation": 147857 } }, { "ph": "s", "id": 147857, "pid": 76337, "tid": -914061504, "ts": 1716454223646176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646246, "dur": 1, "args": { "External id": 147868, "cbid": 251, "correlation": 147868 } }, { "ph": "f", "id": 147868, "pid": 76337, "tid": -914061504, "ts": 1716454223646246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223712108, "dur": 148, "args": { "External id": 147869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147869, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147869, "pid": 5, "tid": 7, "ts": 1716454223712108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646250, "dur": 12, "args": { "External id": 147869, "cbid": 211, "correlation": 147869 } }, { "ph": "s", "id": 147869, "pid": 76337, "tid": -914061504, "ts": 1716454223646250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646316, "dur": 1, "args": { "External id": 147880, "cbid": 251, "correlation": 147880 } }, { "ph": "f", "id": 147880, "pid": 76337, "tid": -914061504, "ts": 1716454223646316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223712258, "dur": 144, "args": { "External id": 147881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147881, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147881, "pid": 5, "tid": 7, "ts": 1716454223712258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646320, "dur": 11, "args": { "External id": 147881, "cbid": 211, "correlation": 147881 } }, { "ph": "s", "id": 147881, "pid": 76337, "tid": -914061504, "ts": 1716454223646320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223712403, "dur": 1955, "args": { "External id": 147902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147902, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 147902, "pid": 5, "tid": 7, "ts": 1716454223712403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646402, "dur": 14, "args": { "External id": 147902, "cbid": 211, "correlation": 147902 } }, { "ph": "s", "id": 147902, "pid": 76337, "tid": -914061504, "ts": 1716454223646402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646502, "dur": 2, "args": { "External id": 147920, "cbid": 251, "correlation": 147920 } }, { "ph": "f", "id": 147920, "pid": 76337, "tid": -914061504, "ts": 1716454223646502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223714360, "dur": 149, "args": { "External id": 147922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147922, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 147922, "pid": 5, "tid": 7, "ts": 1716454223714360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646510, "dur": 14, "args": { "External id": 147922, "cbid": 211, "correlation": 147922 } }, { "ph": "s", "id": 147922, "pid": 76337, "tid": -914061504, "ts": 1716454223646510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223714510, "dur": 36, "args": { "External id": 147930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147930, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147930, "pid": 5, "tid": 7, "ts": 1716454223714510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646582, "dur": 12, "args": { "External id": 147930, "cbid": 211, "correlation": 147930 } }, { "ph": "s", "id": 147930, "pid": 76337, "tid": -914061504, "ts": 1716454223646582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223714547, "dur": 51, "args": { "External id": 147938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147938, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147938, "pid": 5, "tid": 7, "ts": 1716454223714547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646621, "dur": 8, "args": { "External id": 147938, "cbid": 211, "correlation": 147938 } }, { "ph": "s", "id": 147938, "pid": 76337, "tid": -914061504, "ts": 1716454223646621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223714599, "dur": 30, "args": { "External id": 147949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147949, "pid": 5, "tid": 7, "ts": 1716454223714599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646692, "dur": 12, "args": { "External id": 147949, "cbid": 211, "correlation": 147949 } }, { "ph": "s", "id": 147949, "pid": 76337, "tid": -914061504, "ts": 1716454223646692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223714630, "dur": 35, "args": { "External id": 147971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147971, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 147971, "pid": 5, "tid": 7, "ts": 1716454223714630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646725, "dur": 7, "args": { "External id": 147971, "cbid": 211, "correlation": 147971 } }, { "ph": "s", "id": 147971, "pid": 76337, "tid": -914061504, "ts": 1716454223646725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646809, "dur": 1, "args": { "External id": 147982, "cbid": 251, "correlation": 147982 } }, { "ph": "f", "id": 147982, "pid": 76337, "tid": -914061504, "ts": 1716454223646809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223714666, "dur": 90, "args": { "External id": 147983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147983, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 147983, "pid": 5, "tid": 7, "ts": 1716454223714666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646815, "dur": 13, "args": { "External id": 147983, "cbid": 211, "correlation": 147983 } }, { "ph": "s", "id": 147983, "pid": 76337, "tid": -914061504, "ts": 1716454223646815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646884, "dur": 1, "args": { "External id": 147994, "cbid": 251, "correlation": 147994 } }, { "ph": "f", "id": 147994, "pid": 76337, "tid": -914061504, "ts": 1716454223646884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646888, "dur": 0, "args": { "External id": 147995, "cbid": 251, "correlation": 147995 } }, { "ph": "f", "id": 147995, "pid": 76337, "tid": -914061504, "ts": 1716454223646888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223714758, "dur": 11, "args": { "External id": 147996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147996, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 147996, "pid": 5, "tid": 7, "ts": 1716454223714758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646889, "dur": 12, "args": { "External id": 147996, "cbid": 211, "correlation": 147996 } }, { "ph": "s", "id": 147996, "pid": 76337, "tid": -914061504, "ts": 1716454223646889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223714770, "dur": 5, "args": { "External id": 147998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 147998, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 147998, "pid": 5, "tid": 7, "ts": 1716454223714770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646902, "dur": 5, "args": { "External id": 147998, "cbid": 211, "correlation": 147998 } }, { "ph": "s", "id": 147998, "pid": 76337, "tid": -914061504, "ts": 1716454223646902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646958, "dur": 1, "args": { "External id": 148009, "cbid": 251, "correlation": 148009 } }, { "ph": "f", "id": 148009, "pid": 76337, "tid": -914061504, "ts": 1716454223646958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223646962, "dur": 0, "args": { "External id": 148010, "cbid": 251, "correlation": 148010 } }, { "ph": "f", "id": 148010, "pid": 76337, "tid": -914061504, "ts": 1716454223646962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223714776, "dur": 8, "args": { "External id": 148011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148011, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 148011, "pid": 5, "tid": 7, "ts": 1716454223714776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646963, "dur": 24, "args": { "External id": 148011, "cbid": 211, "correlation": 148011 } }, { "ph": "s", "id": 148011, "pid": 76337, "tid": -914061504, "ts": 1716454223646963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223714785, "dur": 3, "args": { "External id": 148013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148013, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148013, "pid": 5, "tid": 7, "ts": 1716454223714785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223646989, "dur": 6, "args": { "External id": 148013, "cbid": 211, "correlation": 148013 } }, { "ph": "s", "id": 148013, "pid": 76337, "tid": -914061504, "ts": 1716454223646989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223714790, "dur": 93, "args": { "External id": 148034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148034, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 148034, "pid": 5, "tid": 7, "ts": 1716454223714790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647064, "dur": 13, "args": { "External id": 148034, "cbid": 211, "correlation": 148034 } }, { "ph": "s", "id": 148034, "pid": 76337, "tid": -914061504, "ts": 1716454223647064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223647161, "dur": 1, "args": { "External id": 148052, "cbid": 251, "correlation": 148052 } }, { "ph": "f", "id": 148052, "pid": 76337, "tid": -914061504, "ts": 1716454223647161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223714884, "dur": 98, "args": { "External id": 148054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148054, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148054, "pid": 5, "tid": 7, "ts": 1716454223714884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647167, "dur": 14, "args": { "External id": 148054, "cbid": 211, "correlation": 148054 } }, { "ph": "s", "id": 148054, "pid": 76337, "tid": -914061504, "ts": 1716454223647167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223714984, "dur": 19, "args": { "External id": 148062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148062, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148062, "pid": 5, "tid": 7, "ts": 1716454223714984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647237, "dur": 12, "args": { "External id": 148062, "cbid": 211, "correlation": 148062 } }, { "ph": "s", "id": 148062, "pid": 76337, "tid": -914061504, "ts": 1716454223647237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223715004, "dur": 37, "args": { "External id": 148070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148070, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148070, "pid": 5, "tid": 7, "ts": 1716454223715004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647278, "dur": 9, "args": { "External id": 148070, "cbid": 211, "correlation": 148070 } }, { "ph": "s", "id": 148070, "pid": 76337, "tid": -914061504, "ts": 1716454223647278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223715042, "dur": 35, "args": { "External id": 148092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148092, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148092, "pid": 5, "tid": 7, "ts": 1716454223715042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647330, "dur": 10, "args": { "External id": 148092, "cbid": 211, "correlation": 148092 } }, { "ph": "s", "id": 148092, "pid": 76337, "tid": -914061504, "ts": 1716454223647330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223647418, "dur": 1, "args": { "External id": 148108, "cbid": 251, "correlation": 148108 } }, { "ph": "f", "id": 148108, "pid": 76337, "tid": -914061504, "ts": 1716454223647418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223647424, "dur": 0, "args": { "External id": 148110, "cbid": 251, "correlation": 148110 } }, { "ph": "f", "id": 148110, "pid": 76337, "tid": -914061504, "ts": 1716454223647424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223715078, "dur": 543, "args": { "External id": 148111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148111, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 148111, "pid": 5, "tid": 7, "ts": 1716454223715078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647429, "dur": 12, "args": { "External id": 148111, "cbid": 211, "correlation": 148111 } }, { "ph": "s", "id": 148111, "pid": 76337, "tid": -914061504, "ts": 1716454223647429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223715623, "dur": 127, "args": { "External id": 148119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148119, "pid": 5, "tid": 7, "ts": 1716454223715623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647493, "dur": 12, "args": { "External id": 148119, "cbid": 211, "correlation": 148119 } }, { "ph": "s", "id": 148119, "pid": 76337, "tid": -914061504, "ts": 1716454223647493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223715751, "dur": 129, "args": { "External id": 148127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148127, "pid": 5, "tid": 7, "ts": 1716454223715751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647526, "dur": 8, "args": { "External id": 148127, "cbid": 211, "correlation": 148127 } }, { "ph": "s", "id": 148127, "pid": 76337, "tid": -914061504, "ts": 1716454223647526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223647603, "dur": 1, "args": { "External id": 148143, "cbid": 251, "correlation": 148143 } }, { "ph": "f", "id": 148143, "pid": 76337, "tid": -914061504, "ts": 1716454223647603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223715882, "dur": 314, "args": { "External id": 148145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148145, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148145, "pid": 5, "tid": 7, "ts": 1716454223715882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647608, "dur": 13, "args": { "External id": 148145, "cbid": 211, "correlation": 148145 } }, { "ph": "s", "id": 148145, "pid": 76337, "tid": -914061504, "ts": 1716454223647608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223716197, "dur": 27, "args": { "External id": 148153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148153, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148153, "pid": 5, "tid": 7, "ts": 1716454223716197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647650, "dur": 10, "args": { "External id": 148153, "cbid": 211, "correlation": 148153 } }, { "ph": "s", "id": 148153, "pid": 76337, "tid": -914061504, "ts": 1716454223647650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223716226, "dur": 81, "args": { "External id": 148164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148164, "pid": 5, "tid": 7, "ts": 1716454223716226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647718, "dur": 13, "args": { "External id": 148164, "cbid": 211, "correlation": 148164 } }, { "ph": "s", "id": 148164, "pid": 76337, "tid": -914061504, "ts": 1716454223647718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223647782, "dur": 0, "args": { "External id": 148176, "cbid": 317, "correlation": 148176 } }, { "ph": "f", "id": 148176, "pid": 76337, "tid": -914061504, "ts": 1716454223647782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223647783, "dur": 0, "args": { "External id": 148177, "cbid": 203, "correlation": 148177 } }, { "ph": "f", "id": 148177, "pid": 76337, "tid": -914061504, "ts": 1716454223647783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223647784, "dur": 0, "args": { "External id": 148178, "cbid": 205, "correlation": 148178 } }, { "ph": "f", "id": 148178, "pid": 76337, "tid": -914061504, "ts": 1716454223647784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223716308, "dur": 23, "args": { "External id": 148182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148182, "pid": 5, "tid": 7, "ts": 1716454223716308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647799, "dur": 12, "args": { "External id": 148182, "cbid": 211, "correlation": 148182 } }, { "ph": "s", "id": 148182, "pid": 76337, "tid": -914061504, "ts": 1716454223647799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223716332, "dur": 120, "args": { "External id": 148184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148184, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148184, "pid": 5, "tid": 7, "ts": 1716454223716332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647817, "dur": 7, "args": { "External id": 148184, "cbid": 211, "correlation": 148184 } }, { "ph": "s", "id": 148184, "pid": 76337, "tid": -914061504, "ts": 1716454223647817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223716454, "dur": 22, "args": { "External id": 148186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148186, "pid": 5, "tid": 7, "ts": 1716454223716454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647828, "dur": 5, "args": { "External id": 148186, "cbid": 211, "correlation": 148186 } }, { "ph": "s", "id": 148186, "pid": 76337, "tid": -914061504, "ts": 1716454223647828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223716477, "dur": 33, "args": { "External id": 148192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148192, "pid": 5, "tid": 7, "ts": 1716454223716477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647855, "dur": 8, "args": { "External id": 148192, "cbid": 211, "correlation": 148192 } }, { "ph": "s", "id": 148192, "pid": 76337, "tid": -914061504, "ts": 1716454223647855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223716512, "dur": 26, "args": { "External id": 148200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148200, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148200, "pid": 5, "tid": 7, "ts": 1716454223716512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647887, "dur": 8, "args": { "External id": 148200, "cbid": 211, "correlation": 148200 } }, { "ph": "s", "id": 148200, "pid": 76337, "tid": -914061504, "ts": 1716454223647887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223716540, "dur": 53, "args": { "External id": 148209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148209, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148209, "pid": 5, "tid": 7, "ts": 1716454223716540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223647925, "dur": 11, "args": { "External id": 148209, "cbid": 211, "correlation": 148209 } }, { "ph": "s", "id": 148209, "pid": 76337, "tid": -914061504, "ts": 1716454223647925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223716594, "dur": 51, "args": { "External id": 148229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148229, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 148229, "pid": 5, "tid": 7, "ts": 1716454223716594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648006, "dur": 12, "args": { "External id": 148229, "cbid": 211, "correlation": 148229 } }, { "ph": "s", "id": 148229, "pid": 76337, "tid": -914061504, "ts": 1716454223648006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223716646, "dur": 5, "args": { "External id": 148241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148241, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148241, "pid": 5, "tid": 7, "ts": 1716454223716646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648029, "dur": 6, "args": { "External id": 148241, "cbid": 211, "correlation": 148241 } }, { "ph": "s", "id": 148241, "pid": 76337, "tid": -914061504, "ts": 1716454223648029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223716652, "dur": 56, "args": { "External id": 148244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148244, "pid": 5, "tid": 7, "ts": 1716454223716652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648047, "dur": 6, "args": { "External id": 148244, "cbid": 211, "correlation": 148244 } }, { "ph": "s", "id": 148244, "pid": 76337, "tid": -914061504, "ts": 1716454223648047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223716709, "dur": 37, "args": { "External id": 148253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148253, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148253, "pid": 5, "tid": 7, "ts": 1716454223716709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648087, "dur": 9, "args": { "External id": 148253, "cbid": 211, "correlation": 148253 } }, { "ph": "s", "id": 148253, "pid": 76337, "tid": -914061504, "ts": 1716454223648087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223648138, "dur": 0, "args": { "External id": 148263, "cbid": 317, "correlation": 148263 } }, { "ph": "f", "id": 148263, "pid": 76337, "tid": -914061504, "ts": 1716454223648138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223648138, "dur": 0, "args": { "External id": 148264, "cbid": 203, "correlation": 148264 } }, { "ph": "f", "id": 148264, "pid": 76337, "tid": -914061504, "ts": 1716454223648138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223648139, "dur": 0, "args": { "External id": 148265, "cbid": 205, "correlation": 148265 } }, { "ph": "f", "id": 148265, "pid": 76337, "tid": -914061504, "ts": 1716454223648139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223716748, "dur": 40, "args": { "External id": 148269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148269, "pid": 5, "tid": 7, "ts": 1716454223716748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648154, "dur": 11, "args": { "External id": 148269, "cbid": 211, "correlation": 148269 } }, { "ph": "s", "id": 148269, "pid": 76337, "tid": -914061504, "ts": 1716454223648154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223716789, "dur": 84, "args": { "External id": 148271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148271, "pid": 5, "tid": 7, "ts": 1716454223716789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648168, "dur": 5, "args": { "External id": 148271, "cbid": 211, "correlation": 148271 } }, { "ph": "s", "id": 148271, "pid": 76337, "tid": -914061504, "ts": 1716454223648168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223716874, "dur": 1282, "args": { "External id": 148273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148273, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148273, "pid": 5, "tid": 7, "ts": 1716454223716874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648179, "dur": 6, "args": { "External id": 148273, "cbid": 211, "correlation": 148273 } }, { "ph": "s", "id": 148273, "pid": 76337, "tid": -914061504, "ts": 1716454223648179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223718157, "dur": 20, "args": { "External id": 148275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148275, "pid": 5, "tid": 7, "ts": 1716454223718157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648189, "dur": 6, "args": { "External id": 148275, "cbid": 211, "correlation": 148275 } }, { "ph": "s", "id": 148275, "pid": 76337, "tid": -914061504, "ts": 1716454223648189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223718179, "dur": 33, "args": { "External id": 148281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148281, "pid": 5, "tid": 7, "ts": 1716454223718179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648218, "dur": 8, "args": { "External id": 148281, "cbid": 211, "correlation": 148281 } }, { "ph": "s", "id": 148281, "pid": 76337, "tid": -914061504, "ts": 1716454223648218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223718213, "dur": 4, "args": { "External id": 148289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148289, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 148289, "pid": 5, "tid": 7, "ts": 1716454223718213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648260, "dur": 9, "args": { "External id": 148289, "cbid": 211, "correlation": 148289 } }, { "ph": "s", "id": 148289, "pid": 76337, "tid": -914061504, "ts": 1716454223648260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223648323, "dur": 1, "args": { "External id": 148305, "cbid": 251, "correlation": 148305 } }, { "ph": "f", "id": 148305, "pid": 76337, "tid": -914061504, "ts": 1716454223648323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223648329, "dur": 0, "args": { "External id": 148307, "cbid": 251, "correlation": 148307 } }, { "ph": "f", "id": 148307, "pid": 76337, "tid": -914061504, "ts": 1716454223648329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223718218, "dur": 12, "args": { "External id": 148308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148308, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 148308, "pid": 5, "tid": 7, "ts": 1716454223718218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648331, "dur": 11, "args": { "External id": 148308, "cbid": 211, "correlation": 148308 } }, { "ph": "s", "id": 148308, "pid": 76337, "tid": -914061504, "ts": 1716454223648331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223718232, "dur": 5, "args": { "External id": 148310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148310, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148310, "pid": 5, "tid": 7, "ts": 1716454223718232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648344, "dur": 5, "args": { "External id": 148310, "cbid": 211, "correlation": 148310 } }, { "ph": "s", "id": 148310, "pid": 76337, "tid": -914061504, "ts": 1716454223648344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223718238, "dur": 30, "args": { "External id": 148320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148320, "pid": 5, "tid": 7, "ts": 1716454223718238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648402, "dur": 12, "args": { "External id": 148320, "cbid": 211, "correlation": 148320 } }, { "ph": "s", "id": 148320, "pid": 76337, "tid": -914061504, "ts": 1716454223648402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223718269, "dur": 31, "args": { "External id": 148340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148340, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 148340, "pid": 5, "tid": 7, "ts": 1716454223718269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648468, "dur": 12, "args": { "External id": 148340, "cbid": 211, "correlation": 148340 } }, { "ph": "s", "id": 148340, "pid": 76337, "tid": -914061504, "ts": 1716454223648468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223718302, "dur": 4, "args": { "External id": 148352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148352, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 148352, "pid": 5, "tid": 7, "ts": 1716454223718302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648489, "dur": 6, "args": { "External id": 148352, "cbid": 211, "correlation": 148352 } }, { "ph": "s", "id": 148352, "pid": 76337, "tid": -914061504, "ts": 1716454223648489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223718307, "dur": 30, "args": { "External id": 148355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148355, "pid": 5, "tid": 7, "ts": 1716454223718307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648506, "dur": 7, "args": { "External id": 148355, "cbid": 211, "correlation": 148355 } }, { "ph": "s", "id": 148355, "pid": 76337, "tid": -914061504, "ts": 1716454223648506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223718338, "dur": 20, "args": { "External id": 148364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148364, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148364, "pid": 5, "tid": 7, "ts": 1716454223718338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648548, "dur": 11, "args": { "External id": 148364, "cbid": 211, "correlation": 148364 } }, { "ph": "s", "id": 148364, "pid": 76337, "tid": -914061504, "ts": 1716454223648548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223648610, "dur": 0, "args": { "External id": 148374, "cbid": 317, "correlation": 148374 } }, { "ph": "f", "id": 148374, "pid": 76337, "tid": -914061504, "ts": 1716454223648610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223648611, "dur": 0, "args": { "External id": 148375, "cbid": 203, "correlation": 148375 } }, { "ph": "f", "id": 148375, "pid": 76337, "tid": -914061504, "ts": 1716454223648611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223648612, "dur": 0, "args": { "External id": 148376, "cbid": 205, "correlation": 148376 } }, { "ph": "f", "id": 148376, "pid": 76337, "tid": -914061504, "ts": 1716454223648612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223718360, "dur": 22, "args": { "External id": 148380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148380, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148380, "pid": 5, "tid": 7, "ts": 1716454223718360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648629, "dur": 12, "args": { "External id": 148380, "cbid": 211, "correlation": 148380 } }, { "ph": "s", "id": 148380, "pid": 76337, "tid": -914061504, "ts": 1716454223648629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223718383, "dur": 44, "args": { "External id": 148382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148382, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148382, "pid": 5, "tid": 7, "ts": 1716454223718383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648644, "dur": 5, "args": { "External id": 148382, "cbid": 211, "correlation": 148382 } }, { "ph": "s", "id": 148382, "pid": 76337, "tid": -914061504, "ts": 1716454223648644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223718428, "dur": 645, "args": { "External id": 148384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148384, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148384, "pid": 5, "tid": 7, "ts": 1716454223718428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648654, "dur": 6, "args": { "External id": 148384, "cbid": 211, "correlation": 148384 } }, { "ph": "s", "id": 148384, "pid": 76337, "tid": -914061504, "ts": 1716454223648654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223719075, "dur": 22, "args": { "External id": 148386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148386, "pid": 5, "tid": 7, "ts": 1716454223719075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648664, "dur": 5, "args": { "External id": 148386, "cbid": 211, "correlation": 148386 } }, { "ph": "s", "id": 148386, "pid": 76337, "tid": -914061504, "ts": 1716454223648664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223719098, "dur": 33, "args": { "External id": 148392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148392, "pid": 5, "tid": 7, "ts": 1716454223719098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648692, "dur": 8, "args": { "External id": 148392, "cbid": 211, "correlation": 148392 } }, { "ph": "s", "id": 148392, "pid": 76337, "tid": -914061504, "ts": 1716454223648692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223648750, "dur": 0, "args": { "External id": 148402, "cbid": 317, "correlation": 148402 } }, { "ph": "f", "id": 148402, "pid": 76337, "tid": -914061504, "ts": 1716454223648750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223648751, "dur": 0, "args": { "External id": 148403, "cbid": 203, "correlation": 148403 } }, { "ph": "f", "id": 148403, "pid": 76337, "tid": -914061504, "ts": 1716454223648751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223648752, "dur": 0, "args": { "External id": 148404, "cbid": 205, "correlation": 148404 } }, { "ph": "f", "id": 148404, "pid": 76337, "tid": -914061504, "ts": 1716454223648752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223719132, "dur": 38, "args": { "External id": 148408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148408, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148408, "pid": 5, "tid": 7, "ts": 1716454223719132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648764, "dur": 12, "args": { "External id": 148408, "cbid": 211, "correlation": 148408 } }, { "ph": "s", "id": 148408, "pid": 76337, "tid": -914061504, "ts": 1716454223648764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223719172, "dur": 189, "args": { "External id": 148410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148410, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148410, "pid": 5, "tid": 7, "ts": 1716454223719172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648782, "dur": 7, "args": { "External id": 148410, "cbid": 211, "correlation": 148410 } }, { "ph": "s", "id": 148410, "pid": 76337, "tid": -914061504, "ts": 1716454223648782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223719363, "dur": 22, "args": { "External id": 148412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148412, "pid": 5, "tid": 7, "ts": 1716454223719363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648793, "dur": 5, "args": { "External id": 148412, "cbid": 211, "correlation": 148412 } }, { "ph": "s", "id": 148412, "pid": 76337, "tid": -914061504, "ts": 1716454223648793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223719386, "dur": 32, "args": { "External id": 148418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148418, "pid": 5, "tid": 7, "ts": 1716454223719386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648818, "dur": 8, "args": { "External id": 148418, "cbid": 211, "correlation": 148418 } }, { "ph": "s", "id": 148418, "pid": 76337, "tid": -914061504, "ts": 1716454223648818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223719420, "dur": 27, "args": { "External id": 148426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148426, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148426, "pid": 5, "tid": 7, "ts": 1716454223719420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648847, "dur": 8, "args": { "External id": 148426, "cbid": 211, "correlation": 148426 } }, { "ph": "s", "id": 148426, "pid": 76337, "tid": -914061504, "ts": 1716454223648847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223719449, "dur": 20, "args": { "External id": 148434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148434, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148434, "pid": 5, "tid": 7, "ts": 1716454223719449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648875, "dur": 9, "args": { "External id": 148434, "cbid": 211, "correlation": 148434 } }, { "ph": "s", "id": 148434, "pid": 76337, "tid": -914061504, "ts": 1716454223648875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223719470, "dur": 31, "args": { "External id": 148454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148454, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 148454, "pid": 5, "tid": 7, "ts": 1716454223719470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648958, "dur": 12, "args": { "External id": 148454, "cbid": 211, "correlation": 148454 } }, { "ph": "s", "id": 148454, "pid": 76337, "tid": -914061504, "ts": 1716454223648958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223719502, "dur": 5, "args": { "External id": 148466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148466, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 148466, "pid": 5, "tid": 7, "ts": 1716454223719502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223648988, "dur": 7, "args": { "External id": 148466, "cbid": 211, "correlation": 148466 } }, { "ph": "s", "id": 148466, "pid": 76337, "tid": -914061504, "ts": 1716454223648988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223719508, "dur": 30, "args": { "External id": 148469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148469, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148469, "pid": 5, "tid": 7, "ts": 1716454223719508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649007, "dur": 7, "args": { "External id": 148469, "cbid": 211, "correlation": 148469 } }, { "ph": "s", "id": 148469, "pid": 76337, "tid": -914061504, "ts": 1716454223649007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223649065, "dur": 0, "args": { "External id": 148480, "cbid": 317, "correlation": 148480 } }, { "ph": "f", "id": 148480, "pid": 76337, "tid": -914061504, "ts": 1716454223649065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223649066, "dur": 0, "args": { "External id": 148481, "cbid": 203, "correlation": 148481 } }, { "ph": "f", "id": 148481, "pid": 76337, "tid": -914061504, "ts": 1716454223649066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223649066, "dur": 0, "args": { "External id": 148482, "cbid": 205, "correlation": 148482 } }, { "ph": "f", "id": 148482, "pid": 76337, "tid": -914061504, "ts": 1716454223649066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223719539, "dur": 24, "args": { "External id": 148486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148486, "pid": 5, "tid": 7, "ts": 1716454223719539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649079, "dur": 12, "args": { "External id": 148486, "cbid": 211, "correlation": 148486 } }, { "ph": "s", "id": 148486, "pid": 76337, "tid": -914061504, "ts": 1716454223649079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223719564, "dur": 104, "args": { "External id": 148488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148488, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148488, "pid": 5, "tid": 7, "ts": 1716454223719564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649097, "dur": 7, "args": { "External id": 148488, "cbid": 211, "correlation": 148488 } }, { "ph": "s", "id": 148488, "pid": 76337, "tid": -914061504, "ts": 1716454223649097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223719670, "dur": 23, "args": { "External id": 148490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148490, "pid": 5, "tid": 7, "ts": 1716454223719670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649108, "dur": 5, "args": { "External id": 148490, "cbid": 211, "correlation": 148490 } }, { "ph": "s", "id": 148490, "pid": 76337, "tid": -914061504, "ts": 1716454223649108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223719694, "dur": 33, "args": { "External id": 148496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148496, "pid": 5, "tid": 7, "ts": 1716454223719694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649135, "dur": 8, "args": { "External id": 148496, "cbid": 211, "correlation": 148496 } }, { "ph": "s", "id": 148496, "pid": 76337, "tid": -914061504, "ts": 1716454223649135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223719728, "dur": 179, "args": { "External id": 148505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148505, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148505, "pid": 5, "tid": 7, "ts": 1716454223719728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649216, "dur": 15, "args": { "External id": 148505, "cbid": 211, "correlation": 148505 } }, { "ph": "s", "id": 148505, "pid": 76337, "tid": -914061504, "ts": 1716454223649216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223719909, "dur": 65, "args": { "External id": 148527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148527, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148527, "pid": 5, "tid": 7, "ts": 1716454223719909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649274, "dur": 11, "args": { "External id": 148527, "cbid": 211, "correlation": 148527 } }, { "ph": "s", "id": 148527, "pid": 76337, "tid": -914061504, "ts": 1716454223649274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223649363, "dur": 2, "args": { "External id": 148538, "cbid": 251, "correlation": 148538 } }, { "ph": "f", "id": 148538, "pid": 76337, "tid": -914061504, "ts": 1716454223649363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223719975, "dur": 156, "args": { "External id": 148539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148539, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148539, "pid": 5, "tid": 7, "ts": 1716454223719975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649368, "dur": 13, "args": { "External id": 148539, "cbid": 211, "correlation": 148539 } }, { "ph": "s", "id": 148539, "pid": 76337, "tid": -914061504, "ts": 1716454223649368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223649438, "dur": 1, "args": { "External id": 148550, "cbid": 251, "correlation": 148550 } }, { "ph": "f", "id": 148550, "pid": 76337, "tid": -914061504, "ts": 1716454223649438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223720132, "dur": 146, "args": { "External id": 148551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148551, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148551, "pid": 5, "tid": 7, "ts": 1716454223720132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649442, "dur": 11, "args": { "External id": 148551, "cbid": 211, "correlation": 148551 } }, { "ph": "s", "id": 148551, "pid": 76337, "tid": -914061504, "ts": 1716454223649442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223649508, "dur": 1, "args": { "External id": 148562, "cbid": 251, "correlation": 148562 } }, { "ph": "f", "id": 148562, "pid": 76337, "tid": -914061504, "ts": 1716454223649508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223720279, "dur": 146, "args": { "External id": 148563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148563, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148563, "pid": 5, "tid": 7, "ts": 1716454223720279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649512, "dur": 11, "args": { "External id": 148563, "cbid": 211, "correlation": 148563 } }, { "ph": "s", "id": 148563, "pid": 76337, "tid": -914061504, "ts": 1716454223649512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223720426, "dur": 1944, "args": { "External id": 148584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148584, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 148584, "pid": 5, "tid": 7, "ts": 1716454223720426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649593, "dur": 13, "args": { "External id": 148584, "cbid": 211, "correlation": 148584 } }, { "ph": "s", "id": 148584, "pid": 76337, "tid": -914061504, "ts": 1716454223649593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223649697, "dur": 1, "args": { "External id": 148602, "cbid": 251, "correlation": 148602 } }, { "ph": "f", "id": 148602, "pid": 76337, "tid": -914061504, "ts": 1716454223649697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223722371, "dur": 148, "args": { "External id": 148604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148604, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 148604, "pid": 5, "tid": 7, "ts": 1716454223722371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649703, "dur": 13, "args": { "External id": 148604, "cbid": 211, "correlation": 148604 } }, { "ph": "s", "id": 148604, "pid": 76337, "tid": -914061504, "ts": 1716454223649703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223722521, "dur": 36, "args": { "External id": 148612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148612, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148612, "pid": 5, "tid": 7, "ts": 1716454223722521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649776, "dur": 13, "args": { "External id": 148612, "cbid": 211, "correlation": 148612 } }, { "ph": "s", "id": 148612, "pid": 76337, "tid": -914061504, "ts": 1716454223649776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223722558, "dur": 51, "args": { "External id": 148620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148620, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148620, "pid": 5, "tid": 7, "ts": 1716454223722558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649816, "dur": 9, "args": { "External id": 148620, "cbid": 211, "correlation": 148620 } }, { "ph": "s", "id": 148620, "pid": 76337, "tid": -914061504, "ts": 1716454223649816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223722610, "dur": 29, "args": { "External id": 148631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148631, "pid": 5, "tid": 7, "ts": 1716454223722610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649896, "dur": 13, "args": { "External id": 148631, "cbid": 211, "correlation": 148631 } }, { "ph": "s", "id": 148631, "pid": 76337, "tid": -914061504, "ts": 1716454223649896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223722641, "dur": 34, "args": { "External id": 148653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148653, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148653, "pid": 5, "tid": 7, "ts": 1716454223722641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223649928, "dur": 9, "args": { "External id": 148653, "cbid": 211, "correlation": 148653 } }, { "ph": "s", "id": 148653, "pid": 76337, "tid": -914061504, "ts": 1716454223649928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650028, "dur": 1, "args": { "External id": 148664, "cbid": 251, "correlation": 148664 } }, { "ph": "f", "id": 148664, "pid": 76337, "tid": -914061504, "ts": 1716454223650028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223722676, "dur": 89, "args": { "External id": 148665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148665, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148665, "pid": 5, "tid": 7, "ts": 1716454223722676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650033, "dur": 15, "args": { "External id": 148665, "cbid": 211, "correlation": 148665 } }, { "ph": "s", "id": 148665, "pid": 76337, "tid": -914061504, "ts": 1716454223650033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650109, "dur": 1, "args": { "External id": 148676, "cbid": 251, "correlation": 148676 } }, { "ph": "f", "id": 148676, "pid": 76337, "tid": -914061504, "ts": 1716454223650109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650113, "dur": 0, "args": { "External id": 148677, "cbid": 251, "correlation": 148677 } }, { "ph": "f", "id": 148677, "pid": 76337, "tid": -914061504, "ts": 1716454223650113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223722767, "dur": 11, "args": { "External id": 148678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148678, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 148678, "pid": 5, "tid": 7, "ts": 1716454223722767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650115, "dur": 12, "args": { "External id": 148678, "cbid": 211, "correlation": 148678 } }, { "ph": "s", "id": 148678, "pid": 76337, "tid": -914061504, "ts": 1716454223650115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223722780, "dur": 5, "args": { "External id": 148680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148680, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148680, "pid": 5, "tid": 7, "ts": 1716454223722780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650131, "dur": 7, "args": { "External id": 148680, "cbid": 211, "correlation": 148680 } }, { "ph": "s", "id": 148680, "pid": 76337, "tid": -914061504, "ts": 1716454223650131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650189, "dur": 1, "args": { "External id": 148691, "cbid": 251, "correlation": 148691 } }, { "ph": "f", "id": 148691, "pid": 76337, "tid": -914061504, "ts": 1716454223650189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650192, "dur": 0, "args": { "External id": 148692, "cbid": 251, "correlation": 148692 } }, { "ph": "f", "id": 148692, "pid": 76337, "tid": -914061504, "ts": 1716454223650192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223722786, "dur": 8, "args": { "External id": 148693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148693, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 148693, "pid": 5, "tid": 7, "ts": 1716454223722786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650194, "dur": 12, "args": { "External id": 148693, "cbid": 211, "correlation": 148693 } }, { "ph": "s", "id": 148693, "pid": 76337, "tid": -914061504, "ts": 1716454223650194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223722795, "dur": 3, "args": { "External id": 148695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148695, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148695, "pid": 5, "tid": 7, "ts": 1716454223722795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650207, "dur": 5, "args": { "External id": 148695, "cbid": 211, "correlation": 148695 } }, { "ph": "s", "id": 148695, "pid": 76337, "tid": -914061504, "ts": 1716454223650207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223722800, "dur": 91, "args": { "External id": 148716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148716, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 148716, "pid": 5, "tid": 7, "ts": 1716454223722800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650285, "dur": 13, "args": { "External id": 148716, "cbid": 211, "correlation": 148716 } }, { "ph": "s", "id": 148716, "pid": 76337, "tid": -914061504, "ts": 1716454223650285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650386, "dur": 1, "args": { "External id": 148734, "cbid": 251, "correlation": 148734 } }, { "ph": "f", "id": 148734, "pid": 76337, "tid": -914061504, "ts": 1716454223650386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223722892, "dur": 97, "args": { "External id": 148736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148736, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148736, "pid": 5, "tid": 7, "ts": 1716454223722892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650392, "dur": 13, "args": { "External id": 148736, "cbid": 211, "correlation": 148736 } }, { "ph": "s", "id": 148736, "pid": 76337, "tid": -914061504, "ts": 1716454223650392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223722991, "dur": 19, "args": { "External id": 148744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148744, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148744, "pid": 5, "tid": 7, "ts": 1716454223722991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650461, "dur": 12, "args": { "External id": 148744, "cbid": 211, "correlation": 148744 } }, { "ph": "s", "id": 148744, "pid": 76337, "tid": -914061504, "ts": 1716454223650461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223723011, "dur": 38, "args": { "External id": 148752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148752, "pid": 5, "tid": 7, "ts": 1716454223723011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650504, "dur": 9, "args": { "External id": 148752, "cbid": 211, "correlation": 148752 } }, { "ph": "s", "id": 148752, "pid": 76337, "tid": -914061504, "ts": 1716454223650504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223723050, "dur": 35, "args": { "External id": 148774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148774, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148774, "pid": 5, "tid": 7, "ts": 1716454223723050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650555, "dur": 10, "args": { "External id": 148774, "cbid": 211, "correlation": 148774 } }, { "ph": "s", "id": 148774, "pid": 76337, "tid": -914061504, "ts": 1716454223650555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650650, "dur": 1, "args": { "External id": 148790, "cbid": 251, "correlation": 148790 } }, { "ph": "f", "id": 148790, "pid": 76337, "tid": -914061504, "ts": 1716454223650650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650655, "dur": 0, "args": { "External id": 148792, "cbid": 251, "correlation": 148792 } }, { "ph": "f", "id": 148792, "pid": 76337, "tid": -914061504, "ts": 1716454223650655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223723086, "dur": 540, "args": { "External id": 148793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148793, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 148793, "pid": 5, "tid": 7, "ts": 1716454223723086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650659, "dur": 12, "args": { "External id": 148793, "cbid": 211, "correlation": 148793 } }, { "ph": "s", "id": 148793, "pid": 76337, "tid": -914061504, "ts": 1716454223650659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223723627, "dur": 124, "args": { "External id": 148801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148801, "pid": 5, "tid": 7, "ts": 1716454223723627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650733, "dur": 14, "args": { "External id": 148801, "cbid": 211, "correlation": 148801 } }, { "ph": "s", "id": 148801, "pid": 76337, "tid": -914061504, "ts": 1716454223650733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223723753, "dur": 128, "args": { "External id": 148809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148809, "pid": 5, "tid": 7, "ts": 1716454223723753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650768, "dur": 10, "args": { "External id": 148809, "cbid": 211, "correlation": 148809 } }, { "ph": "s", "id": 148809, "pid": 76337, "tid": -914061504, "ts": 1716454223650768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223650847, "dur": 1, "args": { "External id": 148825, "cbid": 251, "correlation": 148825 } }, { "ph": "f", "id": 148825, "pid": 76337, "tid": -914061504, "ts": 1716454223650847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223723882, "dur": 306, "args": { "External id": 148827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148827, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148827, "pid": 5, "tid": 7, "ts": 1716454223723882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650853, "dur": 12, "args": { "External id": 148827, "cbid": 211, "correlation": 148827 } }, { "ph": "s", "id": 148827, "pid": 76337, "tid": -914061504, "ts": 1716454223650853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223724190, "dur": 27, "args": { "External id": 148835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148835, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148835, "pid": 5, "tid": 7, "ts": 1716454223724190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650895, "dur": 10, "args": { "External id": 148835, "cbid": 211, "correlation": 148835 } }, { "ph": "s", "id": 148835, "pid": 76337, "tid": -914061504, "ts": 1716454223650895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223724218, "dur": 81, "args": { "External id": 148846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148846, "pid": 5, "tid": 7, "ts": 1716454223724218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223650967, "dur": 21, "args": { "External id": 148846, "cbid": 211, "correlation": 148846 } }, { "ph": "s", "id": 148846, "pid": 76337, "tid": -914061504, "ts": 1716454223650967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223651051, "dur": 0, "args": { "External id": 148858, "cbid": 317, "correlation": 148858 } }, { "ph": "f", "id": 148858, "pid": 76337, "tid": -914061504, "ts": 1716454223651051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223651052, "dur": 1, "args": { "External id": 148859, "cbid": 203, "correlation": 148859 } }, { "ph": "f", "id": 148859, "pid": 76337, "tid": -914061504, "ts": 1716454223651052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223651054, "dur": 0, "args": { "External id": 148860, "cbid": 205, "correlation": 148860 } }, { "ph": "f", "id": 148860, "pid": 76337, "tid": -914061504, "ts": 1716454223651054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223724301, "dur": 24, "args": { "External id": 148864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148864, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148864, "pid": 5, "tid": 7, "ts": 1716454223724301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651077, "dur": 13, "args": { "External id": 148864, "cbid": 211, "correlation": 148864 } }, { "ph": "s", "id": 148864, "pid": 76337, "tid": -914061504, "ts": 1716454223651077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223724326, "dur": 121, "args": { "External id": 148866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148866, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148866, "pid": 5, "tid": 7, "ts": 1716454223724326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651100, "dur": 7, "args": { "External id": 148866, "cbid": 211, "correlation": 148866 } }, { "ph": "s", "id": 148866, "pid": 76337, "tid": -914061504, "ts": 1716454223651100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223724448, "dur": 24, "args": { "External id": 148868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148868, "pid": 5, "tid": 7, "ts": 1716454223724448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651112, "dur": 9, "args": { "External id": 148868, "cbid": 211, "correlation": 148868 } }, { "ph": "s", "id": 148868, "pid": 76337, "tid": -914061504, "ts": 1716454223651112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223724473, "dur": 33, "args": { "External id": 148874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148874, "pid": 5, "tid": 7, "ts": 1716454223724473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651145, "dur": 8, "args": { "External id": 148874, "cbid": 211, "correlation": 148874 } }, { "ph": "s", "id": 148874, "pid": 76337, "tid": -914061504, "ts": 1716454223651145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223724507, "dur": 27, "args": { "External id": 148882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148882, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148882, "pid": 5, "tid": 7, "ts": 1716454223724507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651177, "dur": 8, "args": { "External id": 148882, "cbid": 211, "correlation": 148882 } }, { "ph": "s", "id": 148882, "pid": 76337, "tid": -914061504, "ts": 1716454223651177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223724535, "dur": 45, "args": { "External id": 148891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148891, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148891, "pid": 5, "tid": 7, "ts": 1716454223724535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651224, "dur": 11, "args": { "External id": 148891, "cbid": 211, "correlation": 148891 } }, { "ph": "s", "id": 148891, "pid": 76337, "tid": -914061504, "ts": 1716454223651224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223724581, "dur": 42, "args": { "External id": 148911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148911, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 148911, "pid": 5, "tid": 7, "ts": 1716454223724581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651309, "dur": 12, "args": { "External id": 148911, "cbid": 211, "correlation": 148911 } }, { "ph": "s", "id": 148911, "pid": 76337, "tid": -914061504, "ts": 1716454223651309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223724625, "dur": 5, "args": { "External id": 148923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148923, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 148923, "pid": 5, "tid": 7, "ts": 1716454223724625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651332, "dur": 7, "args": { "External id": 148923, "cbid": 211, "correlation": 148923 } }, { "ph": "s", "id": 148923, "pid": 76337, "tid": -914061504, "ts": 1716454223651332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223724631, "dur": 44, "args": { "External id": 148926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148926, "pid": 5, "tid": 7, "ts": 1716454223724631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651353, "dur": 6, "args": { "External id": 148926, "cbid": 211, "correlation": 148926 } }, { "ph": "s", "id": 148926, "pid": 76337, "tid": -914061504, "ts": 1716454223651353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223724676, "dur": 30, "args": { "External id": 148935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148935, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148935, "pid": 5, "tid": 7, "ts": 1716454223724676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651398, "dur": 10, "args": { "External id": 148935, "cbid": 211, "correlation": 148935 } }, { "ph": "s", "id": 148935, "pid": 76337, "tid": -914061504, "ts": 1716454223651398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223651452, "dur": 0, "args": { "External id": 148945, "cbid": 317, "correlation": 148945 } }, { "ph": "f", "id": 148945, "pid": 76337, "tid": -914061504, "ts": 1716454223651452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223651452, "dur": 0, "args": { "External id": 148946, "cbid": 203, "correlation": 148946 } }, { "ph": "f", "id": 148946, "pid": 76337, "tid": -914061504, "ts": 1716454223651452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223651453, "dur": 0, "args": { "External id": 148947, "cbid": 205, "correlation": 148947 } }, { "ph": "f", "id": 148947, "pid": 76337, "tid": -914061504, "ts": 1716454223651453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223724706, "dur": 30, "args": { "External id": 148951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148951, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148951, "pid": 5, "tid": 7, "ts": 1716454223724706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651471, "dur": 12, "args": { "External id": 148951, "cbid": 211, "correlation": 148951 } }, { "ph": "s", "id": 148951, "pid": 76337, "tid": -914061504, "ts": 1716454223651471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223724738, "dur": 64, "args": { "External id": 148953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148953, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148953, "pid": 5, "tid": 7, "ts": 1716454223724738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651485, "dur": 5, "args": { "External id": 148953, "cbid": 211, "correlation": 148953 } }, { "ph": "s", "id": 148953, "pid": 76337, "tid": -914061504, "ts": 1716454223651485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223724803, "dur": 967, "args": { "External id": 148955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148955, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 148955, "pid": 5, "tid": 7, "ts": 1716454223724803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651496, "dur": 8, "args": { "External id": 148955, "cbid": 211, "correlation": 148955 } }, { "ph": "s", "id": 148955, "pid": 76337, "tid": -914061504, "ts": 1716454223651496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223725771, "dur": 21, "args": { "External id": 148957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148957, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148957, "pid": 5, "tid": 7, "ts": 1716454223725771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651508, "dur": 5, "args": { "External id": 148957, "cbid": 211, "correlation": 148957 } }, { "ph": "s", "id": 148957, "pid": 76337, "tid": -914061504, "ts": 1716454223651508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223725794, "dur": 33, "args": { "External id": 148963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 148963, "pid": 5, "tid": 7, "ts": 1716454223725794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651536, "dur": 8, "args": { "External id": 148963, "cbid": 211, "correlation": 148963 } }, { "ph": "s", "id": 148963, "pid": 76337, "tid": -914061504, "ts": 1716454223651536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223725828, "dur": 4, "args": { "External id": 148971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148971, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 148971, "pid": 5, "tid": 7, "ts": 1716454223725828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651579, "dur": 9, "args": { "External id": 148971, "cbid": 211, "correlation": 148971 } }, { "ph": "s", "id": 148971, "pid": 76337, "tid": -914061504, "ts": 1716454223651579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223651647, "dur": 1, "args": { "External id": 148987, "cbid": 251, "correlation": 148987 } }, { "ph": "f", "id": 148987, "pid": 76337, "tid": -914061504, "ts": 1716454223651647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223651652, "dur": 0, "args": { "External id": 148989, "cbid": 251, "correlation": 148989 } }, { "ph": "f", "id": 148989, "pid": 76337, "tid": -914061504, "ts": 1716454223651652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223725834, "dur": 12, "args": { "External id": 148990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148990, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 148990, "pid": 5, "tid": 7, "ts": 1716454223725834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651654, "dur": 12, "args": { "External id": 148990, "cbid": 211, "correlation": 148990 } }, { "ph": "s", "id": 148990, "pid": 76337, "tid": -914061504, "ts": 1716454223651654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223725847, "dur": 5, "args": { "External id": 148992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 148992, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 148992, "pid": 5, "tid": 7, "ts": 1716454223725847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651668, "dur": 6, "args": { "External id": 148992, "cbid": 211, "correlation": 148992 } }, { "ph": "s", "id": 148992, "pid": 76337, "tid": -914061504, "ts": 1716454223651668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223725854, "dur": 29, "args": { "External id": 149002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149002, "pid": 5, "tid": 7, "ts": 1716454223725854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651730, "dur": 12, "args": { "External id": 149002, "cbid": 211, "correlation": 149002 } }, { "ph": "s", "id": 149002, "pid": 76337, "tid": -914061504, "ts": 1716454223651730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223725884, "dur": 30, "args": { "External id": 149022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149022, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 149022, "pid": 5, "tid": 7, "ts": 1716454223725884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651796, "dur": 10, "args": { "External id": 149022, "cbid": 211, "correlation": 149022 } }, { "ph": "s", "id": 149022, "pid": 76337, "tid": -914061504, "ts": 1716454223651796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223725915, "dur": 4, "args": { "External id": 149034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149034, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 149034, "pid": 5, "tid": 7, "ts": 1716454223725915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651816, "dur": 7, "args": { "External id": 149034, "cbid": 211, "correlation": 149034 } }, { "ph": "s", "id": 149034, "pid": 76337, "tid": -914061504, "ts": 1716454223651816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223725921, "dur": 30, "args": { "External id": 149037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149037, "pid": 5, "tid": 7, "ts": 1716454223725921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651835, "dur": 6, "args": { "External id": 149037, "cbid": 211, "correlation": 149037 } }, { "ph": "s", "id": 149037, "pid": 76337, "tid": -914061504, "ts": 1716454223651835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223725952, "dur": 20, "args": { "External id": 149046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149046, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149046, "pid": 5, "tid": 7, "ts": 1716454223725952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651876, "dur": 9, "args": { "External id": 149046, "cbid": 211, "correlation": 149046 } }, { "ph": "s", "id": 149046, "pid": 76337, "tid": -914061504, "ts": 1716454223651876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223651938, "dur": 0, "args": { "External id": 149056, "cbid": 317, "correlation": 149056 } }, { "ph": "f", "id": 149056, "pid": 76337, "tid": -914061504, "ts": 1716454223651938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223651939, "dur": 0, "args": { "External id": 149057, "cbid": 203, "correlation": 149057 } }, { "ph": "f", "id": 149057, "pid": 76337, "tid": -914061504, "ts": 1716454223651939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223651940, "dur": 0, "args": { "External id": 149058, "cbid": 205, "correlation": 149058 } }, { "ph": "f", "id": 149058, "pid": 76337, "tid": -914061504, "ts": 1716454223651940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223725973, "dur": 22, "args": { "External id": 149062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149062, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149062, "pid": 5, "tid": 7, "ts": 1716454223725973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651957, "dur": 12, "args": { "External id": 149062, "cbid": 211, "correlation": 149062 } }, { "ph": "s", "id": 149062, "pid": 76337, "tid": -914061504, "ts": 1716454223651957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223725997, "dur": 44, "args": { "External id": 149064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149064, "pid": 5, "tid": 7, "ts": 1716454223725997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651972, "dur": 14, "args": { "External id": 149064, "cbid": 211, "correlation": 149064 } }, { "ph": "s", "id": 149064, "pid": 76337, "tid": -914061504, "ts": 1716454223651972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223726042, "dur": 644, "args": { "External id": 149066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149066, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149066, "pid": 5, "tid": 7, "ts": 1716454223726042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223651992, "dur": 7, "args": { "External id": 149066, "cbid": 211, "correlation": 149066 } }, { "ph": "s", "id": 149066, "pid": 76337, "tid": -914061504, "ts": 1716454223651992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223726687, "dur": 22, "args": { "External id": 149068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149068, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149068, "pid": 5, "tid": 7, "ts": 1716454223726687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652003, "dur": 5, "args": { "External id": 149068, "cbid": 211, "correlation": 149068 } }, { "ph": "s", "id": 149068, "pid": 76337, "tid": -914061504, "ts": 1716454223652003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223726711, "dur": 33, "args": { "External id": 149074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149074, "pid": 5, "tid": 7, "ts": 1716454223726711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652032, "dur": 8, "args": { "External id": 149074, "cbid": 211, "correlation": 149074 } }, { "ph": "s", "id": 149074, "pid": 76337, "tid": -914061504, "ts": 1716454223652032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223652090, "dur": 0, "args": { "External id": 149084, "cbid": 317, "correlation": 149084 } }, { "ph": "f", "id": 149084, "pid": 76337, "tid": -914061504, "ts": 1716454223652090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223652090, "dur": 0, "args": { "External id": 149085, "cbid": 203, "correlation": 149085 } }, { "ph": "f", "id": 149085, "pid": 76337, "tid": -914061504, "ts": 1716454223652090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223652091, "dur": 0, "args": { "External id": 149086, "cbid": 205, "correlation": 149086 } }, { "ph": "f", "id": 149086, "pid": 76337, "tid": -914061504, "ts": 1716454223652091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223726745, "dur": 30, "args": { "External id": 149090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149090, "pid": 5, "tid": 7, "ts": 1716454223726745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652105, "dur": 12, "args": { "External id": 149090, "cbid": 211, "correlation": 149090 } }, { "ph": "s", "id": 149090, "pid": 76337, "tid": -914061504, "ts": 1716454223652105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223726776, "dur": 154, "args": { "External id": 149092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149092, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149092, "pid": 5, "tid": 7, "ts": 1716454223726776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652123, "dur": 6, "args": { "External id": 149092, "cbid": 211, "correlation": 149092 } }, { "ph": "s", "id": 149092, "pid": 76337, "tid": -914061504, "ts": 1716454223652123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223726931, "dur": 24, "args": { "External id": 149094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149094, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149094, "pid": 5, "tid": 7, "ts": 1716454223726931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652133, "dur": 5, "args": { "External id": 149094, "cbid": 211, "correlation": 149094 } }, { "ph": "s", "id": 149094, "pid": 76337, "tid": -914061504, "ts": 1716454223652133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223726956, "dur": 32, "args": { "External id": 149100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149100, "pid": 5, "tid": 7, "ts": 1716454223726956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652158, "dur": 8, "args": { "External id": 149100, "cbid": 211, "correlation": 149100 } }, { "ph": "s", "id": 149100, "pid": 76337, "tid": -914061504, "ts": 1716454223652158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223726990, "dur": 27, "args": { "External id": 149108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149108, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149108, "pid": 5, "tid": 7, "ts": 1716454223726990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652186, "dur": 7, "args": { "External id": 149108, "cbid": 211, "correlation": 149108 } }, { "ph": "s", "id": 149108, "pid": 76337, "tid": -914061504, "ts": 1716454223652186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223727018, "dur": 20, "args": { "External id": 149116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149116, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149116, "pid": 5, "tid": 7, "ts": 1716454223727018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652215, "dur": 11, "args": { "External id": 149116, "cbid": 211, "correlation": 149116 } }, { "ph": "s", "id": 149116, "pid": 76337, "tid": -914061504, "ts": 1716454223652215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223727039, "dur": 29, "args": { "External id": 149136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149136, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 149136, "pid": 5, "tid": 7, "ts": 1716454223727039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652300, "dur": 12, "args": { "External id": 149136, "cbid": 211, "correlation": 149136 } }, { "ph": "s", "id": 149136, "pid": 76337, "tid": -914061504, "ts": 1716454223652300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223727070, "dur": 5, "args": { "External id": 149148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149148, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 149148, "pid": 5, "tid": 7, "ts": 1716454223727070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652322, "dur": 6, "args": { "External id": 149148, "cbid": 211, "correlation": 149148 } }, { "ph": "s", "id": 149148, "pid": 76337, "tid": -914061504, "ts": 1716454223652322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223727076, "dur": 31, "args": { "External id": 149151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149151, "pid": 5, "tid": 7, "ts": 1716454223727076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652340, "dur": 6, "args": { "External id": 149151, "cbid": 211, "correlation": 149151 } }, { "ph": "s", "id": 149151, "pid": 76337, "tid": -914061504, "ts": 1716454223652340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223652395, "dur": 0, "args": { "External id": 149162, "cbid": 317, "correlation": 149162 } }, { "ph": "f", "id": 149162, "pid": 76337, "tid": -914061504, "ts": 1716454223652395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223652396, "dur": 0, "args": { "External id": 149163, "cbid": 203, "correlation": 149163 } }, { "ph": "f", "id": 149163, "pid": 76337, "tid": -914061504, "ts": 1716454223652396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223652397, "dur": 0, "args": { "External id": 149164, "cbid": 205, "correlation": 149164 } }, { "ph": "f", "id": 149164, "pid": 76337, "tid": -914061504, "ts": 1716454223652397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223727108, "dur": 22, "args": { "External id": 149168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149168, "pid": 5, "tid": 7, "ts": 1716454223727108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652410, "dur": 12, "args": { "External id": 149168, "cbid": 211, "correlation": 149168 } }, { "ph": "s", "id": 149168, "pid": 76337, "tid": -914061504, "ts": 1716454223652410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223727131, "dur": 105, "args": { "External id": 149170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149170, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149170, "pid": 5, "tid": 7, "ts": 1716454223727131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652428, "dur": 6, "args": { "External id": 149170, "cbid": 211, "correlation": 149170 } }, { "ph": "s", "id": 149170, "pid": 76337, "tid": -914061504, "ts": 1716454223652428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223727237, "dur": 22, "args": { "External id": 149172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149172, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149172, "pid": 5, "tid": 7, "ts": 1716454223727237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652437, "dur": 5, "args": { "External id": 149172, "cbid": 211, "correlation": 149172 } }, { "ph": "s", "id": 149172, "pid": 76337, "tid": -914061504, "ts": 1716454223652437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223727260, "dur": 32, "args": { "External id": 149178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149178, "pid": 5, "tid": 7, "ts": 1716454223727260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652464, "dur": 8, "args": { "External id": 149178, "cbid": 211, "correlation": 149178 } }, { "ph": "s", "id": 149178, "pid": 76337, "tid": -914061504, "ts": 1716454223652464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223727293, "dur": 193, "args": { "External id": 149187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149187, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149187, "pid": 5, "tid": 7, "ts": 1716454223727293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652549, "dur": 16, "args": { "External id": 149187, "cbid": 211, "correlation": 149187 } }, { "ph": "s", "id": 149187, "pid": 76337, "tid": -914061504, "ts": 1716454223652549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223727487, "dur": 65, "args": { "External id": 149209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149209, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149209, "pid": 5, "tid": 7, "ts": 1716454223727487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652610, "dur": 10, "args": { "External id": 149209, "cbid": 211, "correlation": 149209 } }, { "ph": "s", "id": 149209, "pid": 76337, "tid": -914061504, "ts": 1716454223652610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223652701, "dur": 2, "args": { "External id": 149220, "cbid": 251, "correlation": 149220 } }, { "ph": "f", "id": 149220, "pid": 76337, "tid": -914061504, "ts": 1716454223652701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223727553, "dur": 149, "args": { "External id": 149221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149221, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149221, "pid": 5, "tid": 7, "ts": 1716454223727553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652708, "dur": 13, "args": { "External id": 149221, "cbid": 211, "correlation": 149221 } }, { "ph": "s", "id": 149221, "pid": 76337, "tid": -914061504, "ts": 1716454223652708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223652780, "dur": 1, "args": { "External id": 149232, "cbid": 251, "correlation": 149232 } }, { "ph": "f", "id": 149232, "pid": 76337, "tid": -914061504, "ts": 1716454223652780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223727704, "dur": 146, "args": { "External id": 149233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149233, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149233, "pid": 5, "tid": 7, "ts": 1716454223727704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652784, "dur": 11, "args": { "External id": 149233, "cbid": 211, "correlation": 149233 } }, { "ph": "s", "id": 149233, "pid": 76337, "tid": -914061504, "ts": 1716454223652784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223652850, "dur": 1, "args": { "External id": 149244, "cbid": 251, "correlation": 149244 } }, { "ph": "f", "id": 149244, "pid": 76337, "tid": -914061504, "ts": 1716454223652850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223727851, "dur": 144, "args": { "External id": 149245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149245, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149245, "pid": 5, "tid": 7, "ts": 1716454223727851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652854, "dur": 11, "args": { "External id": 149245, "cbid": 211, "correlation": 149245 } }, { "ph": "s", "id": 149245, "pid": 76337, "tid": -914061504, "ts": 1716454223652854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223727996, "dur": 1948, "args": { "External id": 149266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149266, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 149266, "pid": 5, "tid": 7, "ts": 1716454223727996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223652937, "dur": 13, "args": { "External id": 149266, "cbid": 211, "correlation": 149266 } }, { "ph": "s", "id": 149266, "pid": 76337, "tid": -914061504, "ts": 1716454223652937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653047, "dur": 1, "args": { "External id": 149284, "cbid": 251, "correlation": 149284 } }, { "ph": "f", "id": 149284, "pid": 76337, "tid": -914061504, "ts": 1716454223653047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223729945, "dur": 151, "args": { "External id": 149286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149286, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 149286, "pid": 5, "tid": 7, "ts": 1716454223729945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653052, "dur": 13, "args": { "External id": 149286, "cbid": 211, "correlation": 149286 } }, { "ph": "s", "id": 149286, "pid": 76337, "tid": -914061504, "ts": 1716454223653052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223730097, "dur": 36, "args": { "External id": 149294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149294, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149294, "pid": 5, "tid": 7, "ts": 1716454223730097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653123, "dur": 13, "args": { "External id": 149294, "cbid": 211, "correlation": 149294 } }, { "ph": "s", "id": 149294, "pid": 76337, "tid": -914061504, "ts": 1716454223653123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223730134, "dur": 51, "args": { "External id": 149302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149302, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149302, "pid": 5, "tid": 7, "ts": 1716454223730134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653162, "dur": 9, "args": { "External id": 149302, "cbid": 211, "correlation": 149302 } }, { "ph": "s", "id": 149302, "pid": 76337, "tid": -914061504, "ts": 1716454223653162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223730186, "dur": 29, "args": { "External id": 149313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149313, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149313, "pid": 5, "tid": 7, "ts": 1716454223730186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653237, "dur": 13, "args": { "External id": 149313, "cbid": 211, "correlation": 149313 } }, { "ph": "s", "id": 149313, "pid": 76337, "tid": -914061504, "ts": 1716454223653237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223730216, "dur": 35, "args": { "External id": 149335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149335, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149335, "pid": 5, "tid": 7, "ts": 1716454223730216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653268, "dur": 7, "args": { "External id": 149335, "cbid": 211, "correlation": 149335 } }, { "ph": "s", "id": 149335, "pid": 76337, "tid": -914061504, "ts": 1716454223653268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653352, "dur": 1, "args": { "External id": 149346, "cbid": 251, "correlation": 149346 } }, { "ph": "f", "id": 149346, "pid": 76337, "tid": -914061504, "ts": 1716454223653352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223730252, "dur": 89, "args": { "External id": 149347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149347, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149347, "pid": 5, "tid": 7, "ts": 1716454223730252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653357, "dur": 12, "args": { "External id": 149347, "cbid": 211, "correlation": 149347 } }, { "ph": "s", "id": 149347, "pid": 76337, "tid": -914061504, "ts": 1716454223653357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653427, "dur": 1, "args": { "External id": 149358, "cbid": 251, "correlation": 149358 } }, { "ph": "f", "id": 149358, "pid": 76337, "tid": -914061504, "ts": 1716454223653427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653431, "dur": 0, "args": { "External id": 149359, "cbid": 251, "correlation": 149359 } }, { "ph": "f", "id": 149359, "pid": 76337, "tid": -914061504, "ts": 1716454223653431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223730342, "dur": 12, "args": { "External id": 149360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149360, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 149360, "pid": 5, "tid": 7, "ts": 1716454223730342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653433, "dur": 11, "args": { "External id": 149360, "cbid": 211, "correlation": 149360 } }, { "ph": "s", "id": 149360, "pid": 76337, "tid": -914061504, "ts": 1716454223653433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223730355, "dur": 5, "args": { "External id": 149362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149362, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 149362, "pid": 5, "tid": 7, "ts": 1716454223730355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653446, "dur": 5, "args": { "External id": 149362, "cbid": 211, "correlation": 149362 } }, { "ph": "s", "id": 149362, "pid": 76337, "tid": -914061504, "ts": 1716454223653446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653504, "dur": 1, "args": { "External id": 149373, "cbid": 251, "correlation": 149373 } }, { "ph": "f", "id": 149373, "pid": 76337, "tid": -914061504, "ts": 1716454223653504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653507, "dur": 0, "args": { "External id": 149374, "cbid": 251, "correlation": 149374 } }, { "ph": "f", "id": 149374, "pid": 76337, "tid": -914061504, "ts": 1716454223653507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223730362, "dur": 7, "args": { "External id": 149375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149375, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 149375, "pid": 5, "tid": 7, "ts": 1716454223730362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653508, "dur": 11, "args": { "External id": 149375, "cbid": 211, "correlation": 149375 } }, { "ph": "s", "id": 149375, "pid": 76337, "tid": -914061504, "ts": 1716454223653508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223730371, "dur": 3, "args": { "External id": 149377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149377, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 149377, "pid": 5, "tid": 7, "ts": 1716454223730371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653521, "dur": 6, "args": { "External id": 149377, "cbid": 211, "correlation": 149377 } }, { "ph": "s", "id": 149377, "pid": 76337, "tid": -914061504, "ts": 1716454223653521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223730375, "dur": 91, "args": { "External id": 149398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149398, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 149398, "pid": 5, "tid": 7, "ts": 1716454223730375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653595, "dur": 12, "args": { "External id": 149398, "cbid": 211, "correlation": 149398 } }, { "ph": "s", "id": 149398, "pid": 76337, "tid": -914061504, "ts": 1716454223653595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653694, "dur": 1, "args": { "External id": 149416, "cbid": 251, "correlation": 149416 } }, { "ph": "f", "id": 149416, "pid": 76337, "tid": -914061504, "ts": 1716454223653694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223730467, "dur": 98, "args": { "External id": 149418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149418, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149418, "pid": 5, "tid": 7, "ts": 1716454223730467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653699, "dur": 13, "args": { "External id": 149418, "cbid": 211, "correlation": 149418 } }, { "ph": "s", "id": 149418, "pid": 76337, "tid": -914061504, "ts": 1716454223653699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223730567, "dur": 20, "args": { "External id": 149426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149426, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149426, "pid": 5, "tid": 7, "ts": 1716454223730567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653767, "dur": 13, "args": { "External id": 149426, "cbid": 211, "correlation": 149426 } }, { "ph": "s", "id": 149426, "pid": 76337, "tid": -914061504, "ts": 1716454223653767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223730588, "dur": 38, "args": { "External id": 149434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149434, "pid": 5, "tid": 7, "ts": 1716454223730588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653809, "dur": 9, "args": { "External id": 149434, "cbid": 211, "correlation": 149434 } }, { "ph": "s", "id": 149434, "pid": 76337, "tid": -914061504, "ts": 1716454223653809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223730627, "dur": 34, "args": { "External id": 149456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149456, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149456, "pid": 5, "tid": 7, "ts": 1716454223730627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653861, "dur": 10, "args": { "External id": 149456, "cbid": 211, "correlation": 149456 } }, { "ph": "s", "id": 149456, "pid": 76337, "tid": -914061504, "ts": 1716454223653861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653951, "dur": 1, "args": { "External id": 149472, "cbid": 251, "correlation": 149472 } }, { "ph": "f", "id": 149472, "pid": 76337, "tid": -914061504, "ts": 1716454223653951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223653956, "dur": 0, "args": { "External id": 149474, "cbid": 251, "correlation": 149474 } }, { "ph": "f", "id": 149474, "pid": 76337, "tid": -914061504, "ts": 1716454223653956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223730663, "dur": 538, "args": { "External id": 149475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149475, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 149475, "pid": 5, "tid": 7, "ts": 1716454223730663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223653959, "dur": 12, "args": { "External id": 149475, "cbid": 211, "correlation": 149475 } }, { "ph": "s", "id": 149475, "pid": 76337, "tid": -914061504, "ts": 1716454223653959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223731202, "dur": 127, "args": { "External id": 149483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149483, "pid": 5, "tid": 7, "ts": 1716454223731202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654037, "dur": 14, "args": { "External id": 149483, "cbid": 211, "correlation": 149483 } }, { "ph": "s", "id": 149483, "pid": 76337, "tid": -914061504, "ts": 1716454223654037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223731331, "dur": 128, "args": { "External id": 149491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149491, "pid": 5, "tid": 7, "ts": 1716454223731331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654070, "dur": 9, "args": { "External id": 149491, "cbid": 211, "correlation": 149491 } }, { "ph": "s", "id": 149491, "pid": 76337, "tid": -914061504, "ts": 1716454223654070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223654148, "dur": 1, "args": { "External id": 149507, "cbid": 251, "correlation": 149507 } }, { "ph": "f", "id": 149507, "pid": 76337, "tid": -914061504, "ts": 1716454223654148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223731460, "dur": 307, "args": { "External id": 149509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149509, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149509, "pid": 5, "tid": 7, "ts": 1716454223731460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654154, "dur": 12, "args": { "External id": 149509, "cbid": 211, "correlation": 149509 } }, { "ph": "s", "id": 149509, "pid": 76337, "tid": -914061504, "ts": 1716454223654154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223731768, "dur": 27, "args": { "External id": 149517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149517, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149517, "pid": 5, "tid": 7, "ts": 1716454223731768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654196, "dur": 10, "args": { "External id": 149517, "cbid": 211, "correlation": 149517 } }, { "ph": "s", "id": 149517, "pid": 76337, "tid": -914061504, "ts": 1716454223654196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223731797, "dur": 81, "args": { "External id": 149528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149528, "pid": 5, "tid": 7, "ts": 1716454223731797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654263, "dur": 13, "args": { "External id": 149528, "cbid": 211, "correlation": 149528 } }, { "ph": "s", "id": 149528, "pid": 76337, "tid": -914061504, "ts": 1716454223654263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223654327, "dur": 0, "args": { "External id": 149540, "cbid": 317, "correlation": 149540 } }, { "ph": "f", "id": 149540, "pid": 76337, "tid": -914061504, "ts": 1716454223654327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223654328, "dur": 0, "args": { "External id": 149541, "cbid": 203, "correlation": 149541 } }, { "ph": "f", "id": 149541, "pid": 76337, "tid": -914061504, "ts": 1716454223654328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223654329, "dur": 0, "args": { "External id": 149542, "cbid": 205, "correlation": 149542 } }, { "ph": "f", "id": 149542, "pid": 76337, "tid": -914061504, "ts": 1716454223654329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223731880, "dur": 22, "args": { "External id": 149546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149546, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149546, "pid": 5, "tid": 7, "ts": 1716454223731880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654344, "dur": 12, "args": { "External id": 149546, "cbid": 211, "correlation": 149546 } }, { "ph": "s", "id": 149546, "pid": 76337, "tid": -914061504, "ts": 1716454223654344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223731903, "dur": 119, "args": { "External id": 149548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149548, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149548, "pid": 5, "tid": 7, "ts": 1716454223731903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654362, "dur": 7, "args": { "External id": 149548, "cbid": 211, "correlation": 149548 } }, { "ph": "s", "id": 149548, "pid": 76337, "tid": -914061504, "ts": 1716454223654362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223732023, "dur": 23, "args": { "External id": 149550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149550, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149550, "pid": 5, "tid": 7, "ts": 1716454223732023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654373, "dur": 5, "args": { "External id": 149550, "cbid": 211, "correlation": 149550 } }, { "ph": "s", "id": 149550, "pid": 76337, "tid": -914061504, "ts": 1716454223654373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223732047, "dur": 33, "args": { "External id": 149556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149556, "pid": 5, "tid": 7, "ts": 1716454223732047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654401, "dur": 8, "args": { "External id": 149556, "cbid": 211, "correlation": 149556 } }, { "ph": "s", "id": 149556, "pid": 76337, "tid": -914061504, "ts": 1716454223654401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223732082, "dur": 27, "args": { "External id": 149564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149564, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149564, "pid": 5, "tid": 7, "ts": 1716454223732082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654432, "dur": 8, "args": { "External id": 149564, "cbid": 211, "correlation": 149564 } }, { "ph": "s", "id": 149564, "pid": 76337, "tid": -914061504, "ts": 1716454223654432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223732109, "dur": 101, "args": { "External id": 149575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149575, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149575, "pid": 5, "tid": 7, "ts": 1716454223732109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654499, "dur": 12, "args": { "External id": 149575, "cbid": 211, "correlation": 149575 } }, { "ph": "s", "id": 149575, "pid": 76337, "tid": -914061504, "ts": 1716454223654499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223654555, "dur": 0, "args": { "External id": 149585, "cbid": 317, "correlation": 149585 } }, { "ph": "f", "id": 149585, "pid": 76337, "tid": -914061504, "ts": 1716454223654555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223654556, "dur": 0, "args": { "External id": 149586, "cbid": 203, "correlation": 149586 } }, { "ph": "f", "id": 149586, "pid": 76337, "tid": -914061504, "ts": 1716454223654556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223654557, "dur": 0, "args": { "External id": 149587, "cbid": 205, "correlation": 149587 } }, { "ph": "f", "id": 149587, "pid": 76337, "tid": -914061504, "ts": 1716454223654557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223732212, "dur": 74, "args": { "External id": 149591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149591, "pid": 5, "tid": 7, "ts": 1716454223732212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654573, "dur": 11, "args": { "External id": 149591, "cbid": 211, "correlation": 149591 } }, { "ph": "s", "id": 149591, "pid": 76337, "tid": -914061504, "ts": 1716454223654573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223732287, "dur": 44, "args": { "External id": 149593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149593, "pid": 5, "tid": 7, "ts": 1716454223732287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654587, "dur": 5, "args": { "External id": 149593, "cbid": 211, "correlation": 149593 } }, { "ph": "s", "id": 149593, "pid": 76337, "tid": -914061504, "ts": 1716454223654587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223732333, "dur": 4, "args": { "External id": 149595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149595, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149595, "pid": 5, "tid": 7, "ts": 1716454223732333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654598, "dur": 7, "args": { "External id": 149595, "cbid": 211, "correlation": 149595 } }, { "ph": "s", "id": 149595, "pid": 76337, "tid": -914061504, "ts": 1716454223654598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223654609, "dur": 0, "args": { "External id": 149596, "cbid": 51, "correlation": 149596 } }, { "ph": "s", "id": 149596, "pid": 76337, "tid": -914061504, "ts": 1716454223654609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223732338, "dur": 2236, "args": { "External id": 149597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149597, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149597, "pid": 5, "tid": 7, "ts": 1716454223732338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654611, "dur": 7, "args": { "External id": 149597, "cbid": 211, "correlation": 149597 } }, { "ph": "s", "id": 149597, "pid": 76337, "tid": -914061504, "ts": 1716454223654611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223734576, "dur": 113, "args": { "External id": 149602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149602, "pid": 5, "tid": 7, "ts": 1716454223734576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654640, "dur": 8, "args": { "External id": 149602, "cbid": 211, "correlation": 149602 } }, { "ph": "s", "id": 149602, "pid": 76337, "tid": -914061504, "ts": 1716454223654640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223734690, "dur": 165, "args": { "External id": 149611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149611, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149611, "pid": 5, "tid": 7, "ts": 1716454223734690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654738, "dur": 13, "args": { "External id": 149611, "cbid": 211, "correlation": 149611 } }, { "ph": "s", "id": 149611, "pid": 76337, "tid": -914061504, "ts": 1716454223654738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223734857, "dur": 127, "args": { "External id": 149631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149631, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 149631, "pid": 5, "tid": 7, "ts": 1716454223734857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654809, "dur": 12, "args": { "External id": 149631, "cbid": 211, "correlation": 149631 } }, { "ph": "s", "id": 149631, "pid": 76337, "tid": -914061504, "ts": 1716454223654809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223734985, "dur": 4, "args": { "External id": 149643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149643, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 149643, "pid": 5, "tid": 7, "ts": 1716454223734985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654831, "dur": 8, "args": { "External id": 149643, "cbid": 211, "correlation": 149643 } }, { "ph": "s", "id": 149643, "pid": 76337, "tid": -914061504, "ts": 1716454223654831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223734991, "dur": 158, "args": { "External id": 149646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149646, "pid": 5, "tid": 7, "ts": 1716454223734991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654851, "dur": 6, "args": { "External id": 149646, "cbid": 211, "correlation": 149646 } }, { "ph": "s", "id": 149646, "pid": 76337, "tid": -914061504, "ts": 1716454223654851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223735151, "dur": 101, "args": { "External id": 149655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149655, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149655, "pid": 5, "tid": 7, "ts": 1716454223735151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654891, "dur": 9, "args": { "External id": 149655, "cbid": 211, "correlation": 149655 } }, { "ph": "s", "id": 149655, "pid": 76337, "tid": -914061504, "ts": 1716454223654891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223654943, "dur": 0, "args": { "External id": 149665, "cbid": 317, "correlation": 149665 } }, { "ph": "f", "id": 149665, "pid": 76337, "tid": -914061504, "ts": 1716454223654943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223654944, "dur": 0, "args": { "External id": 149666, "cbid": 203, "correlation": 149666 } }, { "ph": "f", "id": 149666, "pid": 76337, "tid": -914061504, "ts": 1716454223654944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223654945, "dur": 0, "args": { "External id": 149667, "cbid": 205, "correlation": 149667 } }, { "ph": "f", "id": 149667, "pid": 76337, "tid": -914061504, "ts": 1716454223654945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223735253, "dur": 111, "args": { "External id": 149671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149671, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149671, "pid": 5, "tid": 7, "ts": 1716454223735253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654965, "dur": 22, "args": { "External id": 149671, "cbid": 211, "correlation": 149671 } }, { "ph": "s", "id": 149671, "pid": 76337, "tid": -914061504, "ts": 1716454223654965, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223735365, "dur": 34, "args": { "External id": 149673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149673, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149673, "pid": 5, "tid": 7, "ts": 1716454223735365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223654989, "dur": 5, "args": { "External id": 149673, "cbid": 211, "correlation": 149673 } }, { "ph": "s", "id": 149673, "pid": 76337, "tid": -914061504, "ts": 1716454223654989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223735400, "dur": 4, "args": { "External id": 149675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149675, "pid": 5, "tid": 7, "ts": 1716454223735400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655000, "dur": 5, "args": { "External id": 149675, "cbid": 211, "correlation": 149675 } }, { "ph": "s", "id": 149675, "pid": 76337, "tid": -914061504, "ts": 1716454223655000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223655009, "dur": 0, "args": { "External id": 149676, "cbid": 51, "correlation": 149676 } }, { "ph": "s", "id": 149676, "pid": 76337, "tid": -914061504, "ts": 1716454223655009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223735405, "dur": 2003, "args": { "External id": 149677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149677, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149677, "pid": 5, "tid": 7, "ts": 1716454223735405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655010, "dur": 7, "args": { "External id": 149677, "cbid": 211, "correlation": 149677 } }, { "ph": "s", "id": 149677, "pid": 76337, "tid": -914061504, "ts": 1716454223655010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223737409, "dur": 59, "args": { "External id": 149682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149682, "pid": 5, "tid": 7, "ts": 1716454223737409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655041, "dur": 8, "args": { "External id": 149682, "cbid": 211, "correlation": 149682 } }, { "ph": "s", "id": 149682, "pid": 76337, "tid": -914061504, "ts": 1716454223655041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223737470, "dur": 4, "args": { "External id": 149690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149690, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149690, "pid": 5, "tid": 7, "ts": 1716454223737470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655084, "dur": 10, "args": { "External id": 149690, "cbid": 211, "correlation": 149690 } }, { "ph": "s", "id": 149690, "pid": 76337, "tid": -914061504, "ts": 1716454223655084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655150, "dur": 1, "args": { "External id": 149706, "cbid": 251, "correlation": 149706 } }, { "ph": "f", "id": 149706, "pid": 76337, "tid": -914061504, "ts": 1716454223655150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655155, "dur": 0, "args": { "External id": 149708, "cbid": 251, "correlation": 149708 } }, { "ph": "f", "id": 149708, "pid": 76337, "tid": -914061504, "ts": 1716454223655155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223737475, "dur": 11, "args": { "External id": 149709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149709, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 149709, "pid": 5, "tid": 7, "ts": 1716454223737475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655157, "dur": 12, "args": { "External id": 149709, "cbid": 211, "correlation": 149709 } }, { "ph": "s", "id": 149709, "pid": 76337, "tid": -914061504, "ts": 1716454223655157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223737488, "dur": 5, "args": { "External id": 149711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149711, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 149711, "pid": 5, "tid": 7, "ts": 1716454223737488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655171, "dur": 6, "args": { "External id": 149711, "cbid": 211, "correlation": 149711 } }, { "ph": "s", "id": 149711, "pid": 76337, "tid": -914061504, "ts": 1716454223655171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223737494, "dur": 53, "args": { "External id": 149721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149721, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149721, "pid": 5, "tid": 7, "ts": 1716454223737494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655229, "dur": 12, "args": { "External id": 149721, "cbid": 211, "correlation": 149721 } }, { "ph": "s", "id": 149721, "pid": 76337, "tid": -914061504, "ts": 1716454223655229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223737549, "dur": 51, "args": { "External id": 149741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149741, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 149741, "pid": 5, "tid": 7, "ts": 1716454223737549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655296, "dur": 10, "args": { "External id": 149741, "cbid": 211, "correlation": 149741 } }, { "ph": "s", "id": 149741, "pid": 76337, "tid": -914061504, "ts": 1716454223655296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223737601, "dur": 4, "args": { "External id": 149753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149753, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149753, "pid": 5, "tid": 7, "ts": 1716454223737601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655317, "dur": 7, "args": { "External id": 149753, "cbid": 211, "correlation": 149753 } }, { "ph": "s", "id": 149753, "pid": 76337, "tid": -914061504, "ts": 1716454223655317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223737606, "dur": 55, "args": { "External id": 149756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149756, "pid": 5, "tid": 7, "ts": 1716454223737606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655335, "dur": 6, "args": { "External id": 149756, "cbid": 211, "correlation": 149756 } }, { "ph": "s", "id": 149756, "pid": 76337, "tid": -914061504, "ts": 1716454223655335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223737663, "dur": 36, "args": { "External id": 149765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149765, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149765, "pid": 5, "tid": 7, "ts": 1716454223737663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655376, "dur": 9, "args": { "External id": 149765, "cbid": 211, "correlation": 149765 } }, { "ph": "s", "id": 149765, "pid": 76337, "tid": -914061504, "ts": 1716454223655376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223655439, "dur": 0, "args": { "External id": 149775, "cbid": 317, "correlation": 149775 } }, { "ph": "f", "id": 149775, "pid": 76337, "tid": -914061504, "ts": 1716454223655439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223655440, "dur": 0, "args": { "External id": 149776, "cbid": 203, "correlation": 149776 } }, { "ph": "f", "id": 149776, "pid": 76337, "tid": -914061504, "ts": 1716454223655440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223655441, "dur": 0, "args": { "External id": 149777, "cbid": 205, "correlation": 149777 } }, { "ph": "f", "id": 149777, "pid": 76337, "tid": -914061504, "ts": 1716454223655441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223737699, "dur": 39, "args": { "External id": 149781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149781, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149781, "pid": 5, "tid": 7, "ts": 1716454223737699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655458, "dur": 14, "args": { "External id": 149781, "cbid": 211, "correlation": 149781 } }, { "ph": "s", "id": 149781, "pid": 76337, "tid": -914061504, "ts": 1716454223655458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223737740, "dur": 14, "args": { "External id": 149783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149783, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149783, "pid": 5, "tid": 7, "ts": 1716454223737740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655475, "dur": 5, "args": { "External id": 149783, "cbid": 211, "correlation": 149783 } }, { "ph": "s", "id": 149783, "pid": 76337, "tid": -914061504, "ts": 1716454223655475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223737755, "dur": 4, "args": { "External id": 149785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149785, "pid": 5, "tid": 7, "ts": 1716454223737755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655485, "dur": 5, "args": { "External id": 149785, "cbid": 211, "correlation": 149785 } }, { "ph": "s", "id": 149785, "pid": 76337, "tid": -914061504, "ts": 1716454223655485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223655493, "dur": 0, "args": { "External id": 149786, "cbid": 51, "correlation": 149786 } }, { "ph": "s", "id": 149786, "pid": 76337, "tid": -914061504, "ts": 1716454223655493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223737760, "dur": 696, "args": { "External id": 149787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149787, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149787, "pid": 5, "tid": 7, "ts": 1716454223737760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655494, "dur": 5, "args": { "External id": 149787, "cbid": 211, "correlation": 149787 } }, { "ph": "s", "id": 149787, "pid": 76337, "tid": -914061504, "ts": 1716454223655494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223738457, "dur": 59, "args": { "External id": 149792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149792, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149792, "pid": 5, "tid": 7, "ts": 1716454223738457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655522, "dur": 8, "args": { "External id": 149792, "cbid": 211, "correlation": 149792 } }, { "ph": "s", "id": 149792, "pid": 76337, "tid": -914061504, "ts": 1716454223655522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223655579, "dur": 0, "args": { "External id": 149802, "cbid": 317, "correlation": 149802 } }, { "ph": "f", "id": 149802, "pid": 76337, "tid": -914061504, "ts": 1716454223655579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223655580, "dur": 0, "args": { "External id": 149803, "cbid": 203, "correlation": 149803 } }, { "ph": "f", "id": 149803, "pid": 76337, "tid": -914061504, "ts": 1716454223655580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223655581, "dur": 0, "args": { "External id": 149804, "cbid": 205, "correlation": 149804 } }, { "ph": "f", "id": 149804, "pid": 76337, "tid": -914061504, "ts": 1716454223655581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223738518, "dur": 4, "args": { "External id": 149808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149808, "pid": 5, "tid": 7, "ts": 1716454223738518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655598, "dur": 11, "args": { "External id": 149808, "cbid": 211, "correlation": 149808 } }, { "ph": "s", "id": 149808, "pid": 76337, "tid": -914061504, "ts": 1716454223655598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223655613, "dur": 0, "args": { "External id": 149809, "cbid": 51, "correlation": 149809 } }, { "ph": "s", "id": 149809, "pid": 76337, "tid": -914061504, "ts": 1716454223655613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454223738523, "dur": 265, "args": { "External id": 149810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149810, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149810, "pid": 5, "tid": 7, "ts": 1716454223738523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655614, "dur": 8, "args": { "External id": 149810, "cbid": 211, "correlation": 149810 } }, { "ph": "s", "id": 149810, "pid": 76337, "tid": -914061504, "ts": 1716454223655614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223738789, "dur": 59, "args": { "External id": 149815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149815, "pid": 5, "tid": 7, "ts": 1716454223738789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655642, "dur": 8, "args": { "External id": 149815, "cbid": 211, "correlation": 149815 } }, { "ph": "s", "id": 149815, "pid": 76337, "tid": -914061504, "ts": 1716454223655642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223738849, "dur": 50, "args": { "External id": 149823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149823, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149823, "pid": 5, "tid": 7, "ts": 1716454223738849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655671, "dur": 9, "args": { "External id": 149823, "cbid": 211, "correlation": 149823 } }, { "ph": "s", "id": 149823, "pid": 76337, "tid": -914061504, "ts": 1716454223655671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223738900, "dur": 35, "args": { "External id": 149831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149831, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149831, "pid": 5, "tid": 7, "ts": 1716454223738900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655701, "dur": 8, "args": { "External id": 149831, "cbid": 211, "correlation": 149831 } }, { "ph": "s", "id": 149831, "pid": 76337, "tid": -914061504, "ts": 1716454223655701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223738936, "dur": 50, "args": { "External id": 149851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149851, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 149851, "pid": 5, "tid": 7, "ts": 1716454223738936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655782, "dur": 12, "args": { "External id": 149851, "cbid": 211, "correlation": 149851 } }, { "ph": "s", "id": 149851, "pid": 76337, "tid": -914061504, "ts": 1716454223655782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223738988, "dur": 4, "args": { "External id": 149863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149863, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 149863, "pid": 5, "tid": 7, "ts": 1716454223738988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655804, "dur": 6, "args": { "External id": 149863, "cbid": 211, "correlation": 149863 } }, { "ph": "s", "id": 149863, "pid": 76337, "tid": -914061504, "ts": 1716454223655804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223738993, "dur": 55, "args": { "External id": 149866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149866, "pid": 5, "tid": 7, "ts": 1716454223738993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655822, "dur": 6, "args": { "External id": 149866, "cbid": 211, "correlation": 149866 } }, { "ph": "s", "id": 149866, "pid": 76337, "tid": -914061504, "ts": 1716454223655822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223655878, "dur": 0, "args": { "External id": 149877, "cbid": 317, "correlation": 149877 } }, { "ph": "f", "id": 149877, "pid": 76337, "tid": -914061504, "ts": 1716454223655878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223655879, "dur": 0, "args": { "External id": 149878, "cbid": 203, "correlation": 149878 } }, { "ph": "f", "id": 149878, "pid": 76337, "tid": -914061504, "ts": 1716454223655879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223655880, "dur": 0, "args": { "External id": 149879, "cbid": 205, "correlation": 149879 } }, { "ph": "f", "id": 149879, "pid": 76337, "tid": -914061504, "ts": 1716454223655880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655910, "dur": 2, "args": { "External id": 149883, "cbid": 251, "correlation": 149883 } }, { "ph": "f", "id": 149883, "pid": 76337, "tid": -914061504, "ts": 1716454223655910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655914, "dur": 1, "args": { "External id": 149884, "cbid": 251, "correlation": 149884 } }, { "ph": "f", "id": 149884, "pid": 76337, "tid": -914061504, "ts": 1716454223655914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655916, "dur": 1, "args": { "External id": 149885, "cbid": 251, "correlation": 149885 } }, { "ph": "f", "id": 149885, "pid": 76337, "tid": -914061504, "ts": 1716454223655916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655918, "dur": 1, "args": { "External id": 149886, "cbid": 251, "correlation": 149886 } }, { "ph": "f", "id": 149886, "pid": 76337, "tid": -914061504, "ts": 1716454223655918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655920, "dur": 1, "args": { "External id": 149887, "cbid": 251, "correlation": 149887 } }, { "ph": "f", "id": 149887, "pid": 76337, "tid": -914061504, "ts": 1716454223655920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655922, "dur": 1, "args": { "External id": 149888, "cbid": 251, "correlation": 149888 } }, { "ph": "f", "id": 149888, "pid": 76337, "tid": -914061504, "ts": 1716454223655922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655924, "dur": 1, "args": { "External id": 149889, "cbid": 251, "correlation": 149889 } }, { "ph": "f", "id": 149889, "pid": 76337, "tid": -914061504, "ts": 1716454223655924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655926, "dur": 1, "args": { "External id": 149890, "cbid": 251, "correlation": 149890 } }, { "ph": "f", "id": 149890, "pid": 76337, "tid": -914061504, "ts": 1716454223655926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223655929, "dur": 0, "args": { "External id": 149891, "cbid": 251, "correlation": 149891 } }, { "ph": "f", "id": 149891, "pid": 76337, "tid": -914061504, "ts": 1716454223655929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223739049, "dur": 115, "args": { "External id": 149892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149892, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 149892, "pid": 5, "tid": 7, "ts": 1716454223739049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655931, "dur": 14, "args": { "External id": 149892, "cbid": 211, "correlation": 149892 } }, { "ph": "s", "id": 149892, "pid": 76337, "tid": -914061504, "ts": 1716454223655931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223739165, "dur": 59, "args": { "External id": 149898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149898, "pid": 5, "tid": 7, "ts": 1716454223739165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223655968, "dur": 17, "args": { "External id": 149898, "cbid": 211, "correlation": 149898 } }, { "ph": "s", "id": 149898, "pid": 76337, "tid": -914061504, "ts": 1716454223655968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223739226, "dur": 608, "args": { "External id": 149907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149907, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149907, "pid": 5, "tid": 7, "ts": 1716454223739226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656059, "dur": 15, "args": { "External id": 149907, "cbid": 211, "correlation": 149907 } }, { "ph": "s", "id": 149907, "pid": 76337, "tid": -914061504, "ts": 1716454223656059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223739835, "dur": 181, "args": { "External id": 149929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149929, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 149929, "pid": 5, "tid": 7, "ts": 1716454223739835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656117, "dur": 10, "args": { "External id": 149929, "cbid": 211, "correlation": 149929 } }, { "ph": "s", "id": 149929, "pid": 76337, "tid": -914061504, "ts": 1716454223656117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656203, "dur": 1, "args": { "External id": 149940, "cbid": 251, "correlation": 149940 } }, { "ph": "f", "id": 149940, "pid": 76337, "tid": -914061504, "ts": 1716454223656203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223740017, "dur": 199, "args": { "External id": 149941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149941, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149941, "pid": 5, "tid": 7, "ts": 1716454223740017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656208, "dur": 14, "args": { "External id": 149941, "cbid": 211, "correlation": 149941 } }, { "ph": "s", "id": 149941, "pid": 76337, "tid": -914061504, "ts": 1716454223656208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656277, "dur": 1, "args": { "External id": 149952, "cbid": 251, "correlation": 149952 } }, { "ph": "f", "id": 149952, "pid": 76337, "tid": -914061504, "ts": 1716454223656277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223740217, "dur": 191, "args": { "External id": 149953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149953, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149953, "pid": 5, "tid": 7, "ts": 1716454223740217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656281, "dur": 11, "args": { "External id": 149953, "cbid": 211, "correlation": 149953 } }, { "ph": "s", "id": 149953, "pid": 76337, "tid": -914061504, "ts": 1716454223656281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656344, "dur": 1, "args": { "External id": 149964, "cbid": 251, "correlation": 149964 } }, { "ph": "f", "id": 149964, "pid": 76337, "tid": -914061504, "ts": 1716454223656344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223740409, "dur": 187, "args": { "External id": 149965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149965, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 149965, "pid": 5, "tid": 7, "ts": 1716454223740409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656348, "dur": 12, "args": { "External id": 149965, "cbid": 211, "correlation": 149965 } }, { "ph": "s", "id": 149965, "pid": 76337, "tid": -914061504, "ts": 1716454223656348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223740598, "dur": 18557, "args": { "External id": 149986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 149986, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 149986, "pid": 5, "tid": 7, "ts": 1716454223740598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656430, "dur": 14, "args": { "External id": 149986, "cbid": 211, "correlation": 149986 } }, { "ph": "s", "id": 149986, "pid": 76337, "tid": -914061504, "ts": 1716454223656430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656529, "dur": 1, "args": { "External id": 150004, "cbid": 251, "correlation": 150004 } }, { "ph": "f", "id": 150004, "pid": 76337, "tid": -914061504, "ts": 1716454223656529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223759156, "dur": 202, "args": { "External id": 150006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150006, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150006, "pid": 5, "tid": 7, "ts": 1716454223759156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656535, "dur": 13, "args": { "External id": 150006, "cbid": 211, "correlation": 150006 } }, { "ph": "s", "id": 150006, "pid": 76337, "tid": -914061504, "ts": 1716454223656535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223759360, "dur": 67, "args": { "External id": 150014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150014, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150014, "pid": 5, "tid": 7, "ts": 1716454223759360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656604, "dur": 13, "args": { "External id": 150014, "cbid": 211, "correlation": 150014 } }, { "ph": "s", "id": 150014, "pid": 76337, "tid": -914061504, "ts": 1716454223656604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223759428, "dur": 96, "args": { "External id": 150022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150022, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150022, "pid": 5, "tid": 7, "ts": 1716454223759428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656643, "dur": 9, "args": { "External id": 150022, "cbid": 211, "correlation": 150022 } }, { "ph": "s", "id": 150022, "pid": 76337, "tid": -914061504, "ts": 1716454223656643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223759526, "dur": 55, "args": { "External id": 150033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150033, "pid": 5, "tid": 7, "ts": 1716454223759526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656716, "dur": 12, "args": { "External id": 150033, "cbid": 211, "correlation": 150033 } }, { "ph": "s", "id": 150033, "pid": 76337, "tid": -914061504, "ts": 1716454223656716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223759582, "dur": 91, "args": { "External id": 150055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150055, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150055, "pid": 5, "tid": 7, "ts": 1716454223759582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656747, "dur": 7, "args": { "External id": 150055, "cbid": 211, "correlation": 150055 } }, { "ph": "s", "id": 150055, "pid": 76337, "tid": -914061504, "ts": 1716454223656747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656830, "dur": 1, "args": { "External id": 150066, "cbid": 251, "correlation": 150066 } }, { "ph": "f", "id": 150066, "pid": 76337, "tid": -914061504, "ts": 1716454223656830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223759675, "dur": 103, "args": { "External id": 150067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150067, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150067, "pid": 5, "tid": 7, "ts": 1716454223759675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656835, "dur": 12, "args": { "External id": 150067, "cbid": 211, "correlation": 150067 } }, { "ph": "s", "id": 150067, "pid": 76337, "tid": -914061504, "ts": 1716454223656835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656908, "dur": 2, "args": { "External id": 150078, "cbid": 251, "correlation": 150078 } }, { "ph": "f", "id": 150078, "pid": 76337, "tid": -914061504, "ts": 1716454223656908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223656912, "dur": 0, "args": { "External id": 150079, "cbid": 251, "correlation": 150079 } }, { "ph": "f", "id": 150079, "pid": 76337, "tid": -914061504, "ts": 1716454223656912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223759779, "dur": 10, "args": { "External id": 150080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150080, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 150080, "pid": 5, "tid": 7, "ts": 1716454223759779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656915, "dur": 13, "args": { "External id": 150080, "cbid": 211, "correlation": 150080 } }, { "ph": "s", "id": 150080, "pid": 76337, "tid": -914061504, "ts": 1716454223656915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223759790, "dur": 5, "args": { "External id": 150082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150082, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150082, "pid": 5, "tid": 7, "ts": 1716454223759790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223656931, "dur": 7, "args": { "External id": 150082, "cbid": 211, "correlation": 150082 } }, { "ph": "s", "id": 150082, "pid": 76337, "tid": -914061504, "ts": 1716454223656931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657002, "dur": 1, "args": { "External id": 150093, "cbid": 251, "correlation": 150093 } }, { "ph": "f", "id": 150093, "pid": 76337, "tid": -914061504, "ts": 1716454223657002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657006, "dur": 0, "args": { "External id": 150094, "cbid": 251, "correlation": 150094 } }, { "ph": "f", "id": 150094, "pid": 76337, "tid": -914061504, "ts": 1716454223657006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223759796, "dur": 6, "args": { "External id": 150095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150095, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 150095, "pid": 5, "tid": 7, "ts": 1716454223759796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657007, "dur": 13, "args": { "External id": 150095, "cbid": 211, "correlation": 150095 } }, { "ph": "s", "id": 150095, "pid": 76337, "tid": -914061504, "ts": 1716454223657007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223759804, "dur": 3, "args": { "External id": 150097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150097, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150097, "pid": 5, "tid": 7, "ts": 1716454223759804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657022, "dur": 5, "args": { "External id": 150097, "cbid": 211, "correlation": 150097 } }, { "ph": "s", "id": 150097, "pid": 76337, "tid": -914061504, "ts": 1716454223657022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223759808, "dur": 155, "args": { "External id": 150118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150118, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150118, "pid": 5, "tid": 7, "ts": 1716454223759808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657096, "dur": 12, "args": { "External id": 150118, "cbid": 211, "correlation": 150118 } }, { "ph": "s", "id": 150118, "pid": 76337, "tid": -914061504, "ts": 1716454223657096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657193, "dur": 2, "args": { "External id": 150136, "cbid": 251, "correlation": 150136 } }, { "ph": "f", "id": 150136, "pid": 76337, "tid": -914061504, "ts": 1716454223657193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223759965, "dur": 107, "args": { "External id": 150138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150138, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150138, "pid": 5, "tid": 7, "ts": 1716454223759965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657199, "dur": 14, "args": { "External id": 150138, "cbid": 211, "correlation": 150138 } }, { "ph": "s", "id": 150138, "pid": 76337, "tid": -914061504, "ts": 1716454223657199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223760074, "dur": 35, "args": { "External id": 150146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150146, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150146, "pid": 5, "tid": 7, "ts": 1716454223760074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657271, "dur": 12, "args": { "External id": 150146, "cbid": 211, "correlation": 150146 } }, { "ph": "s", "id": 150146, "pid": 76337, "tid": -914061504, "ts": 1716454223657271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223760110, "dur": 66, "args": { "External id": 150154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150154, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150154, "pid": 5, "tid": 7, "ts": 1716454223760110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657311, "dur": 10, "args": { "External id": 150154, "cbid": 211, "correlation": 150154 } }, { "ph": "s", "id": 150154, "pid": 76337, "tid": -914061504, "ts": 1716454223657311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223760177, "dur": 92, "args": { "External id": 150176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150176, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150176, "pid": 5, "tid": 7, "ts": 1716454223760177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657362, "dur": 10, "args": { "External id": 150176, "cbid": 211, "correlation": 150176 } }, { "ph": "s", "id": 150176, "pid": 76337, "tid": -914061504, "ts": 1716454223657362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657449, "dur": 1, "args": { "External id": 150192, "cbid": 251, "correlation": 150192 } }, { "ph": "f", "id": 150192, "pid": 76337, "tid": -914061504, "ts": 1716454223657449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223760270, "dur": 572, "args": { "External id": 150194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150194, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150194, "pid": 5, "tid": 7, "ts": 1716454223760270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657455, "dur": 12, "args": { "External id": 150194, "cbid": 211, "correlation": 150194 } }, { "ph": "s", "id": 150194, "pid": 76337, "tid": -914061504, "ts": 1716454223657455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223760844, "dur": 244, "args": { "External id": 150202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150202, "pid": 5, "tid": 7, "ts": 1716454223760844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657520, "dur": 12, "args": { "External id": 150202, "cbid": 211, "correlation": 150202 } }, { "ph": "s", "id": 150202, "pid": 76337, "tid": -914061504, "ts": 1716454223657520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223761089, "dur": 252, "args": { "External id": 150210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150210, "pid": 5, "tid": 7, "ts": 1716454223761089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657552, "dur": 8, "args": { "External id": 150210, "cbid": 211, "correlation": 150210 } }, { "ph": "s", "id": 150210, "pid": 76337, "tid": -914061504, "ts": 1716454223657552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657634, "dur": 1, "args": { "External id": 150226, "cbid": 251, "correlation": 150226 } }, { "ph": "f", "id": 150226, "pid": 76337, "tid": -914061504, "ts": 1716454223657634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657638, "dur": 0, "args": { "External id": 150228, "cbid": 251, "correlation": 150228 } }, { "ph": "f", "id": 150228, "pid": 76337, "tid": -914061504, "ts": 1716454223657638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223761342, "dur": 357, "args": { "External id": 150229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150229, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 150229, "pid": 5, "tid": 7, "ts": 1716454223761342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657641, "dur": 12, "args": { "External id": 150229, "cbid": 211, "correlation": 150229 } }, { "ph": "s", "id": 150229, "pid": 76337, "tid": -914061504, "ts": 1716454223657641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223761700, "dur": 50, "args": { "External id": 150237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150237, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150237, "pid": 5, "tid": 7, "ts": 1716454223761700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657684, "dur": 10, "args": { "External id": 150237, "cbid": 211, "correlation": 150237 } }, { "ph": "s", "id": 150237, "pid": 76337, "tid": -914061504, "ts": 1716454223657684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223761752, "dur": 157, "args": { "External id": 150248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150248, "pid": 5, "tid": 7, "ts": 1716454223761752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657751, "dur": 12, "args": { "External id": 150248, "cbid": 211, "correlation": 150248 } }, { "ph": "s", "id": 150248, "pid": 76337, "tid": -914061504, "ts": 1716454223657751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223657815, "dur": 0, "args": { "External id": 150260, "cbid": 317, "correlation": 150260 } }, { "ph": "f", "id": 150260, "pid": 76337, "tid": -914061504, "ts": 1716454223657815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223657816, "dur": 0, "args": { "External id": 150261, "cbid": 203, "correlation": 150261 } }, { "ph": "f", "id": 150261, "pid": 76337, "tid": -914061504, "ts": 1716454223657816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223657817, "dur": 0, "args": { "External id": 150262, "cbid": 205, "correlation": 150262 } }, { "ph": "f", "id": 150262, "pid": 76337, "tid": -914061504, "ts": 1716454223657817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657842, "dur": 1, "args": { "External id": 150266, "cbid": 251, "correlation": 150266 } }, { "ph": "f", "id": 150266, "pid": 76337, "tid": -914061504, "ts": 1716454223657842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657844, "dur": 0, "args": { "External id": 150267, "cbid": 251, "correlation": 150267 } }, { "ph": "f", "id": 150267, "pid": 76337, "tid": -914061504, "ts": 1716454223657844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657845, "dur": 0, "args": { "External id": 150268, "cbid": 251, "correlation": 150268 } }, { "ph": "f", "id": 150268, "pid": 76337, "tid": -914061504, "ts": 1716454223657845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657846, "dur": 0, "args": { "External id": 150269, "cbid": 251, "correlation": 150269 } }, { "ph": "f", "id": 150269, "pid": 76337, "tid": -914061504, "ts": 1716454223657846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657846, "dur": 0, "args": { "External id": 150270, "cbid": 251, "correlation": 150270 } }, { "ph": "f", "id": 150270, "pid": 76337, "tid": -914061504, "ts": 1716454223657846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657847, "dur": 0, "args": { "External id": 150271, "cbid": 251, "correlation": 150271 } }, { "ph": "f", "id": 150271, "pid": 76337, "tid": -914061504, "ts": 1716454223657847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657848, "dur": 0, "args": { "External id": 150272, "cbid": 251, "correlation": 150272 } }, { "ph": "f", "id": 150272, "pid": 76337, "tid": -914061504, "ts": 1716454223657848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657849, "dur": 0, "args": { "External id": 150273, "cbid": 251, "correlation": 150273 } }, { "ph": "f", "id": 150273, "pid": 76337, "tid": -914061504, "ts": 1716454223657849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223657850, "dur": 0, "args": { "External id": 150274, "cbid": 251, "correlation": 150274 } }, { "ph": "f", "id": 150274, "pid": 76337, "tid": -914061504, "ts": 1716454223657850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223761910, "dur": 114, "args": { "External id": 150275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150275, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150275, "pid": 5, "tid": 7, "ts": 1716454223761910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657852, "dur": 12, "args": { "External id": 150275, "cbid": 211, "correlation": 150275 } }, { "ph": "s", "id": 150275, "pid": 76337, "tid": -914061504, "ts": 1716454223657852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223762026, "dur": 60, "args": { "External id": 150281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150281, "pid": 5, "tid": 7, "ts": 1716454223762026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657887, "dur": 9, "args": { "External id": 150281, "cbid": 211, "correlation": 150281 } }, { "ph": "s", "id": 150281, "pid": 76337, "tid": -914061504, "ts": 1716454223657887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223762087, "dur": 50, "args": { "External id": 150289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150289, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150289, "pid": 5, "tid": 7, "ts": 1716454223762087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657918, "dur": 9, "args": { "External id": 150289, "cbid": 211, "correlation": 150289 } }, { "ph": "s", "id": 150289, "pid": 76337, "tid": -914061504, "ts": 1716454223657918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223762138, "dur": 98, "args": { "External id": 150298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150298, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150298, "pid": 5, "tid": 7, "ts": 1716454223762138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223657958, "dur": 10, "args": { "External id": 150298, "cbid": 211, "correlation": 150298 } }, { "ph": "s", "id": 150298, "pid": 76337, "tid": -914061504, "ts": 1716454223657958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223762237, "dur": 92, "args": { "External id": 150318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150318, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 150318, "pid": 5, "tid": 7, "ts": 1716454223762237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658039, "dur": 12, "args": { "External id": 150318, "cbid": 211, "correlation": 150318 } }, { "ph": "s", "id": 150318, "pid": 76337, "tid": -914061504, "ts": 1716454223658039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223762330, "dur": 5, "args": { "External id": 150330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150330, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150330, "pid": 5, "tid": 7, "ts": 1716454223762330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658061, "dur": 7, "args": { "External id": 150330, "cbid": 211, "correlation": 150330 } }, { "ph": "s", "id": 150330, "pid": 76337, "tid": -914061504, "ts": 1716454223658061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223762337, "dur": 111, "args": { "External id": 150333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150333, "pid": 5, "tid": 7, "ts": 1716454223762337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658081, "dur": 6, "args": { "External id": 150333, "cbid": 211, "correlation": 150333 } }, { "ph": "s", "id": 150333, "pid": 76337, "tid": -914061504, "ts": 1716454223658081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223762449, "dur": 69, "args": { "External id": 150342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150342, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150342, "pid": 5, "tid": 7, "ts": 1716454223762449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658120, "dur": 10, "args": { "External id": 150342, "cbid": 211, "correlation": 150342 } }, { "ph": "s", "id": 150342, "pid": 76337, "tid": -914061504, "ts": 1716454223658120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223658170, "dur": 0, "args": { "External id": 150352, "cbid": 317, "correlation": 150352 } }, { "ph": "f", "id": 150352, "pid": 76337, "tid": -914061504, "ts": 1716454223658170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223658171, "dur": 0, "args": { "External id": 150353, "cbid": 203, "correlation": 150353 } }, { "ph": "f", "id": 150353, "pid": 76337, "tid": -914061504, "ts": 1716454223658171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223658172, "dur": 0, "args": { "External id": 150354, "cbid": 205, "correlation": 150354 } }, { "ph": "f", "id": 150354, "pid": 76337, "tid": -914061504, "ts": 1716454223658172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223762519, "dur": 77, "args": { "External id": 150358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150358, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150358, "pid": 5, "tid": 7, "ts": 1716454223762519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658188, "dur": 11, "args": { "External id": 150358, "cbid": 211, "correlation": 150358 } }, { "ph": "s", "id": 150358, "pid": 76337, "tid": -914061504, "ts": 1716454223658188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223762598, "dur": 24, "args": { "External id": 150360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150360, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150360, "pid": 5, "tid": 7, "ts": 1716454223762598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658202, "dur": 5, "args": { "External id": 150360, "cbid": 211, "correlation": 150360 } }, { "ph": "s", "id": 150360, "pid": 76337, "tid": -914061504, "ts": 1716454223658202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223762623, "dur": 4, "args": { "External id": 150362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 150362, "pid": 5, "tid": 7, "ts": 1716454223762623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658213, "dur": 5, "args": { "External id": 150362, "cbid": 211, "correlation": 150362 } }, { "ph": "s", "id": 150362, "pid": 76337, "tid": -914061504, "ts": 1716454223658213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223658222, "dur": 0, "args": { "External id": 150363, "cbid": 51, "correlation": 150363 } }, { "ph": "s", "id": 150363, "pid": 76337, "tid": -914061504, "ts": 1716454223658222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223762629, "dur": 1361, "args": { "External id": 150364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150364, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150364, "pid": 5, "tid": 7, "ts": 1716454223762629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658223, "dur": 5, "args": { "External id": 150364, "cbid": 211, "correlation": 150364 } }, { "ph": "s", "id": 150364, "pid": 76337, "tid": -914061504, "ts": 1716454223658223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223763991, "dur": 59, "args": { "External id": 150369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150369, "pid": 5, "tid": 7, "ts": 1716454223763991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658249, "dur": 8, "args": { "External id": 150369, "cbid": 211, "correlation": 150369 } }, { "ph": "s", "id": 150369, "pid": 76337, "tid": -914061504, "ts": 1716454223658249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223764051, "dur": 4, "args": { "External id": 150377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150377, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 150377, "pid": 5, "tid": 7, "ts": 1716454223764051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658293, "dur": 10, "args": { "External id": 150377, "cbid": 211, "correlation": 150377 } }, { "ph": "s", "id": 150377, "pid": 76337, "tid": -914061504, "ts": 1716454223658293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223658358, "dur": 1, "args": { "External id": 150393, "cbid": 251, "correlation": 150393 } }, { "ph": "f", "id": 150393, "pid": 76337, "tid": -914061504, "ts": 1716454223658358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223658363, "dur": 0, "args": { "External id": 150395, "cbid": 251, "correlation": 150395 } }, { "ph": "f", "id": 150395, "pid": 76337, "tid": -914061504, "ts": 1716454223658363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223764057, "dur": 11, "args": { "External id": 150396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150396, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 150396, "pid": 5, "tid": 7, "ts": 1716454223764057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658365, "dur": 12, "args": { "External id": 150396, "cbid": 211, "correlation": 150396 } }, { "ph": "s", "id": 150396, "pid": 76337, "tid": -914061504, "ts": 1716454223658365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223764069, "dur": 5, "args": { "External id": 150398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150398, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150398, "pid": 5, "tid": 7, "ts": 1716454223764069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658379, "dur": 5, "args": { "External id": 150398, "cbid": 211, "correlation": 150398 } }, { "ph": "s", "id": 150398, "pid": 76337, "tid": -914061504, "ts": 1716454223658379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223764076, "dur": 55, "args": { "External id": 150408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150408, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150408, "pid": 5, "tid": 7, "ts": 1716454223764076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658437, "dur": 12, "args": { "External id": 150408, "cbid": 211, "correlation": 150408 } }, { "ph": "s", "id": 150408, "pid": 76337, "tid": -914061504, "ts": 1716454223658437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223764132, "dur": 52, "args": { "External id": 150428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150428, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 150428, "pid": 5, "tid": 7, "ts": 1716454223764132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658503, "dur": 10, "args": { "External id": 150428, "cbid": 211, "correlation": 150428 } }, { "ph": "s", "id": 150428, "pid": 76337, "tid": -914061504, "ts": 1716454223658503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223764186, "dur": 4, "args": { "External id": 150440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150440, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 150440, "pid": 5, "tid": 7, "ts": 1716454223764186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658523, "dur": 6, "args": { "External id": 150440, "cbid": 211, "correlation": 150440 } }, { "ph": "s", "id": 150440, "pid": 76337, "tid": -914061504, "ts": 1716454223658523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223764191, "dur": 55, "args": { "External id": 150443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150443, "pid": 5, "tid": 7, "ts": 1716454223764191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658541, "dur": 6, "args": { "External id": 150443, "cbid": 211, "correlation": 150443 } }, { "ph": "s", "id": 150443, "pid": 76337, "tid": -914061504, "ts": 1716454223658541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223764247, "dur": 36, "args": { "External id": 150452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150452, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150452, "pid": 5, "tid": 7, "ts": 1716454223764247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658581, "dur": 10, "args": { "External id": 150452, "cbid": 211, "correlation": 150452 } }, { "ph": "s", "id": 150452, "pid": 76337, "tid": -914061504, "ts": 1716454223658581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223658642, "dur": 0, "args": { "External id": 150462, "cbid": 317, "correlation": 150462 } }, { "ph": "f", "id": 150462, "pid": 76337, "tid": -914061504, "ts": 1716454223658642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223658643, "dur": 0, "args": { "External id": 150463, "cbid": 203, "correlation": 150463 } }, { "ph": "f", "id": 150463, "pid": 76337, "tid": -914061504, "ts": 1716454223658643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223658644, "dur": 0, "args": { "External id": 150464, "cbid": 205, "correlation": 150464 } }, { "ph": "f", "id": 150464, "pid": 76337, "tid": -914061504, "ts": 1716454223658644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223764285, "dur": 40, "args": { "External id": 150468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150468, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150468, "pid": 5, "tid": 7, "ts": 1716454223764285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658660, "dur": 12, "args": { "External id": 150468, "cbid": 211, "correlation": 150468 } }, { "ph": "s", "id": 150468, "pid": 76337, "tid": -914061504, "ts": 1716454223658660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223764327, "dur": 14, "args": { "External id": 150470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150470, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150470, "pid": 5, "tid": 7, "ts": 1716454223764327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658675, "dur": 6, "args": { "External id": 150470, "cbid": 211, "correlation": 150470 } }, { "ph": "s", "id": 150470, "pid": 76337, "tid": -914061504, "ts": 1716454223658675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223764342, "dur": 3, "args": { "External id": 150472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 150472, "pid": 5, "tid": 7, "ts": 1716454223764342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658685, "dur": 5, "args": { "External id": 150472, "cbid": 211, "correlation": 150472 } }, { "ph": "s", "id": 150472, "pid": 76337, "tid": -914061504, "ts": 1716454223658685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223658693, "dur": 0, "args": { "External id": 150473, "cbid": 51, "correlation": 150473 } }, { "ph": "s", "id": 150473, "pid": 76337, "tid": -914061504, "ts": 1716454223658693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223764347, "dur": 696, "args": { "External id": 150474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150474, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150474, "pid": 5, "tid": 7, "ts": 1716454223764347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658693, "dur": 5, "args": { "External id": 150474, "cbid": 211, "correlation": 150474 } }, { "ph": "s", "id": 150474, "pid": 76337, "tid": -914061504, "ts": 1716454223658693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223765045, "dur": 59, "args": { "External id": 150479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150479, "pid": 5, "tid": 7, "ts": 1716454223765045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658720, "dur": 9, "args": { "External id": 150479, "cbid": 211, "correlation": 150479 } }, { "ph": "s", "id": 150479, "pid": 76337, "tid": -914061504, "ts": 1716454223658720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223658778, "dur": 0, "args": { "External id": 150489, "cbid": 317, "correlation": 150489 } }, { "ph": "f", "id": 150489, "pid": 76337, "tid": -914061504, "ts": 1716454223658778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223658779, "dur": 0, "args": { "External id": 150490, "cbid": 203, "correlation": 150490 } }, { "ph": "f", "id": 150490, "pid": 76337, "tid": -914061504, "ts": 1716454223658779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223658780, "dur": 0, "args": { "External id": 150491, "cbid": 205, "correlation": 150491 } }, { "ph": "f", "id": 150491, "pid": 76337, "tid": -914061504, "ts": 1716454223658780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223765106, "dur": 76, "args": { "External id": 150495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150495, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150495, "pid": 5, "tid": 7, "ts": 1716454223765106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658793, "dur": 11, "args": { "External id": 150495, "cbid": 211, "correlation": 150495 } }, { "ph": "s", "id": 150495, "pid": 76337, "tid": -914061504, "ts": 1716454223658793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223765183, "dur": 207, "args": { "External id": 150497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150497, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150497, "pid": 5, "tid": 7, "ts": 1716454223765183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658812, "dur": 8, "args": { "External id": 150497, "cbid": 211, "correlation": 150497 } }, { "ph": "s", "id": 150497, "pid": 76337, "tid": -914061504, "ts": 1716454223658812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223765392, "dur": 39, "args": { "External id": 150499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150499, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150499, "pid": 5, "tid": 7, "ts": 1716454223765392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658824, "dur": 6, "args": { "External id": 150499, "cbid": 211, "correlation": 150499 } }, { "ph": "s", "id": 150499, "pid": 76337, "tid": -914061504, "ts": 1716454223658824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223765433, "dur": 59, "args": { "External id": 150505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150505, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150505, "pid": 5, "tid": 7, "ts": 1716454223765433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658851, "dur": 9, "args": { "External id": 150505, "cbid": 211, "correlation": 150505 } }, { "ph": "s", "id": 150505, "pid": 76337, "tid": -914061504, "ts": 1716454223658851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223765493, "dur": 50, "args": { "External id": 150513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150513, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150513, "pid": 5, "tid": 7, "ts": 1716454223765493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658881, "dur": 8, "args": { "External id": 150513, "cbid": 211, "correlation": 150513 } }, { "ph": "s", "id": 150513, "pid": 76337, "tid": -914061504, "ts": 1716454223658881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223765545, "dur": 35, "args": { "External id": 150521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150521, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150521, "pid": 5, "tid": 7, "ts": 1716454223765545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223658910, "dur": 8, "args": { "External id": 150521, "cbid": 211, "correlation": 150521 } }, { "ph": "s", "id": 150521, "pid": 76337, "tid": -914061504, "ts": 1716454223658910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223765581, "dur": 52, "args": { "External id": 150541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150541, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 150541, "pid": 5, "tid": 7, "ts": 1716454223765581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659000, "dur": 12, "args": { "External id": 150541, "cbid": 211, "correlation": 150541 } }, { "ph": "s", "id": 150541, "pid": 76337, "tid": -914061504, "ts": 1716454223659000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223765635, "dur": 5, "args": { "External id": 150553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150553, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 150553, "pid": 5, "tid": 7, "ts": 1716454223765635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659022, "dur": 6, "args": { "External id": 150553, "cbid": 211, "correlation": 150553 } }, { "ph": "s", "id": 150553, "pid": 76337, "tid": -914061504, "ts": 1716454223659022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223765641, "dur": 55, "args": { "External id": 150556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150556, "pid": 5, "tid": 7, "ts": 1716454223765641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659041, "dur": 6, "args": { "External id": 150556, "cbid": 211, "correlation": 150556 } }, { "ph": "s", "id": 150556, "pid": 76337, "tid": -914061504, "ts": 1716454223659041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223659098, "dur": 0, "args": { "External id": 150567, "cbid": 317, "correlation": 150567 } }, { "ph": "f", "id": 150567, "pid": 76337, "tid": -914061504, "ts": 1716454223659098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223659099, "dur": 0, "args": { "External id": 150568, "cbid": 203, "correlation": 150568 } }, { "ph": "f", "id": 150568, "pid": 76337, "tid": -914061504, "ts": 1716454223659099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223659100, "dur": 0, "args": { "External id": 150569, "cbid": 205, "correlation": 150569 } }, { "ph": "f", "id": 150569, "pid": 76337, "tid": -914061504, "ts": 1716454223659100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659122, "dur": 1, "args": { "External id": 150573, "cbid": 251, "correlation": 150573 } }, { "ph": "f", "id": 150573, "pid": 76337, "tid": -914061504, "ts": 1716454223659122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659124, "dur": 0, "args": { "External id": 150574, "cbid": 251, "correlation": 150574 } }, { "ph": "f", "id": 150574, "pid": 76337, "tid": -914061504, "ts": 1716454223659124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659124, "dur": 0, "args": { "External id": 150575, "cbid": 251, "correlation": 150575 } }, { "ph": "f", "id": 150575, "pid": 76337, "tid": -914061504, "ts": 1716454223659124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659125, "dur": 0, "args": { "External id": 150576, "cbid": 251, "correlation": 150576 } }, { "ph": "f", "id": 150576, "pid": 76337, "tid": -914061504, "ts": 1716454223659125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659126, "dur": 0, "args": { "External id": 150577, "cbid": 251, "correlation": 150577 } }, { "ph": "f", "id": 150577, "pid": 76337, "tid": -914061504, "ts": 1716454223659126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659127, "dur": 0, "args": { "External id": 150578, "cbid": 251, "correlation": 150578 } }, { "ph": "f", "id": 150578, "pid": 76337, "tid": -914061504, "ts": 1716454223659127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659128, "dur": 0, "args": { "External id": 150579, "cbid": 251, "correlation": 150579 } }, { "ph": "f", "id": 150579, "pid": 76337, "tid": -914061504, "ts": 1716454223659128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659129, "dur": 0, "args": { "External id": 150580, "cbid": 251, "correlation": 150580 } }, { "ph": "f", "id": 150580, "pid": 76337, "tid": -914061504, "ts": 1716454223659129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659130, "dur": 0, "args": { "External id": 150581, "cbid": 251, "correlation": 150581 } }, { "ph": "f", "id": 150581, "pid": 76337, "tid": -914061504, "ts": 1716454223659130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223765697, "dur": 114, "args": { "External id": 150582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150582, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150582, "pid": 5, "tid": 7, "ts": 1716454223765697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659133, "dur": 13, "args": { "External id": 150582, "cbid": 211, "correlation": 150582 } }, { "ph": "s", "id": 150582, "pid": 76337, "tid": -914061504, "ts": 1716454223659133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223765813, "dur": 60, "args": { "External id": 150588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150588, "pid": 5, "tid": 7, "ts": 1716454223765813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659168, "dur": 9, "args": { "External id": 150588, "cbid": 211, "correlation": 150588 } }, { "ph": "s", "id": 150588, "pid": 76337, "tid": -914061504, "ts": 1716454223659168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223765874, "dur": 506, "args": { "External id": 150597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150597, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150597, "pid": 5, "tid": 7, "ts": 1716454223765874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659251, "dur": 13, "args": { "External id": 150597, "cbid": 211, "correlation": 150597 } }, { "ph": "s", "id": 150597, "pid": 76337, "tid": -914061504, "ts": 1716454223659251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223766380, "dur": 180, "args": { "External id": 150619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150619, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150619, "pid": 5, "tid": 7, "ts": 1716454223766380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659309, "dur": 9, "args": { "External id": 150619, "cbid": 211, "correlation": 150619 } }, { "ph": "s", "id": 150619, "pid": 76337, "tid": -914061504, "ts": 1716454223659309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659397, "dur": 1, "args": { "External id": 150630, "cbid": 251, "correlation": 150630 } }, { "ph": "f", "id": 150630, "pid": 76337, "tid": -914061504, "ts": 1716454223659397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223766562, "dur": 198, "args": { "External id": 150631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150631, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150631, "pid": 5, "tid": 7, "ts": 1716454223766562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659402, "dur": 12, "args": { "External id": 150631, "cbid": 211, "correlation": 150631 } }, { "ph": "s", "id": 150631, "pid": 76337, "tid": -914061504, "ts": 1716454223659402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659469, "dur": 1, "args": { "External id": 150642, "cbid": 251, "correlation": 150642 } }, { "ph": "f", "id": 150642, "pid": 76337, "tid": -914061504, "ts": 1716454223659469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223766761, "dur": 189, "args": { "External id": 150643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150643, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150643, "pid": 5, "tid": 7, "ts": 1716454223766761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659473, "dur": 12, "args": { "External id": 150643, "cbid": 211, "correlation": 150643 } }, { "ph": "s", "id": 150643, "pid": 76337, "tid": -914061504, "ts": 1716454223659473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659537, "dur": 1, "args": { "External id": 150654, "cbid": 251, "correlation": 150654 } }, { "ph": "f", "id": 150654, "pid": 76337, "tid": -914061504, "ts": 1716454223659537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223766951, "dur": 192, "args": { "External id": 150655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150655, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150655, "pid": 5, "tid": 7, "ts": 1716454223766951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659541, "dur": 11, "args": { "External id": 150655, "cbid": 211, "correlation": 150655 } }, { "ph": "s", "id": 150655, "pid": 76337, "tid": -914061504, "ts": 1716454223659541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223767145, "dur": 18618, "args": { "External id": 150676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150676, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150676, "pid": 5, "tid": 7, "ts": 1716454223767145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659621, "dur": 12, "args": { "External id": 150676, "cbid": 211, "correlation": 150676 } }, { "ph": "s", "id": 150676, "pid": 76337, "tid": -914061504, "ts": 1716454223659621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223659717, "dur": 1, "args": { "External id": 150694, "cbid": 251, "correlation": 150694 } }, { "ph": "f", "id": 150694, "pid": 76337, "tid": -914061504, "ts": 1716454223659717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223785764, "dur": 199, "args": { "External id": 150696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150696, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150696, "pid": 5, "tid": 7, "ts": 1716454223785764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659723, "dur": 14, "args": { "External id": 150696, "cbid": 211, "correlation": 150696 } }, { "ph": "s", "id": 150696, "pid": 76337, "tid": -914061504, "ts": 1716454223659723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223785965, "dur": 66, "args": { "External id": 150704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150704, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150704, "pid": 5, "tid": 7, "ts": 1716454223785965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659795, "dur": 12, "args": { "External id": 150704, "cbid": 211, "correlation": 150704 } }, { "ph": "s", "id": 150704, "pid": 76337, "tid": -914061504, "ts": 1716454223659795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223786033, "dur": 96, "args": { "External id": 150712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150712, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150712, "pid": 5, "tid": 7, "ts": 1716454223786033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659833, "dur": 9, "args": { "External id": 150712, "cbid": 211, "correlation": 150712 } }, { "ph": "s", "id": 150712, "pid": 76337, "tid": -914061504, "ts": 1716454223659833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223786130, "dur": 55, "args": { "External id": 150723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150723, "pid": 5, "tid": 7, "ts": 1716454223786130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659905, "dur": 12, "args": { "External id": 150723, "cbid": 211, "correlation": 150723 } }, { "ph": "s", "id": 150723, "pid": 76337, "tid": -914061504, "ts": 1716454223659905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223786186, "dur": 92, "args": { "External id": 150745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150745, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150745, "pid": 5, "tid": 7, "ts": 1716454223786186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223659935, "dur": 8, "args": { "External id": 150745, "cbid": 211, "correlation": 150745 } }, { "ph": "s", "id": 150745, "pid": 76337, "tid": -914061504, "ts": 1716454223659935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660028, "dur": 1, "args": { "External id": 150756, "cbid": 251, "correlation": 150756 } }, { "ph": "f", "id": 150756, "pid": 76337, "tid": -914061504, "ts": 1716454223660028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223786279, "dur": 105, "args": { "External id": 150757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150757, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150757, "pid": 5, "tid": 7, "ts": 1716454223786279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660033, "dur": 14, "args": { "External id": 150757, "cbid": 211, "correlation": 150757 } }, { "ph": "s", "id": 150757, "pid": 76337, "tid": -914061504, "ts": 1716454223660033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660104, "dur": 1, "args": { "External id": 150768, "cbid": 251, "correlation": 150768 } }, { "ph": "f", "id": 150768, "pid": 76337, "tid": -914061504, "ts": 1716454223660104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660107, "dur": 0, "args": { "External id": 150769, "cbid": 251, "correlation": 150769 } }, { "ph": "f", "id": 150769, "pid": 76337, "tid": -914061504, "ts": 1716454223660107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223786386, "dur": 11, "args": { "External id": 150770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150770, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 150770, "pid": 5, "tid": 7, "ts": 1716454223786386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660109, "dur": 12, "args": { "External id": 150770, "cbid": 211, "correlation": 150770 } }, { "ph": "s", "id": 150770, "pid": 76337, "tid": -914061504, "ts": 1716454223660109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223786398, "dur": 5, "args": { "External id": 150772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150772, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150772, "pid": 5, "tid": 7, "ts": 1716454223786398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660123, "dur": 6, "args": { "External id": 150772, "cbid": 211, "correlation": 150772 } }, { "ph": "s", "id": 150772, "pid": 76337, "tid": -914061504, "ts": 1716454223660123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660184, "dur": 1, "args": { "External id": 150783, "cbid": 251, "correlation": 150783 } }, { "ph": "f", "id": 150783, "pid": 76337, "tid": -914061504, "ts": 1716454223660184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660187, "dur": 0, "args": { "External id": 150784, "cbid": 251, "correlation": 150784 } }, { "ph": "f", "id": 150784, "pid": 76337, "tid": -914061504, "ts": 1716454223660187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223786404, "dur": 6, "args": { "External id": 150785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150785, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 150785, "pid": 5, "tid": 7, "ts": 1716454223786404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660189, "dur": 12, "args": { "External id": 150785, "cbid": 211, "correlation": 150785 } }, { "ph": "s", "id": 150785, "pid": 76337, "tid": -914061504, "ts": 1716454223660189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223786411, "dur": 3, "args": { "External id": 150787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150787, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 150787, "pid": 5, "tid": 7, "ts": 1716454223786411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660202, "dur": 7, "args": { "External id": 150787, "cbid": 211, "correlation": 150787 } }, { "ph": "s", "id": 150787, "pid": 76337, "tid": -914061504, "ts": 1716454223660202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223786416, "dur": 155, "args": { "External id": 150808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150808, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150808, "pid": 5, "tid": 7, "ts": 1716454223786416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660277, "dur": 13, "args": { "External id": 150808, "cbid": 211, "correlation": 150808 } }, { "ph": "s", "id": 150808, "pid": 76337, "tid": -914061504, "ts": 1716454223660277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660374, "dur": 1, "args": { "External id": 150826, "cbid": 251, "correlation": 150826 } }, { "ph": "f", "id": 150826, "pid": 76337, "tid": -914061504, "ts": 1716454223660374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223786572, "dur": 107, "args": { "External id": 150828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150828, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150828, "pid": 5, "tid": 7, "ts": 1716454223786572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660380, "dur": 14, "args": { "External id": 150828, "cbid": 211, "correlation": 150828 } }, { "ph": "s", "id": 150828, "pid": 76337, "tid": -914061504, "ts": 1716454223660380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223786680, "dur": 35, "args": { "External id": 150836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150836, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150836, "pid": 5, "tid": 7, "ts": 1716454223786680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660450, "dur": 12, "args": { "External id": 150836, "cbid": 211, "correlation": 150836 } }, { "ph": "s", "id": 150836, "pid": 76337, "tid": -914061504, "ts": 1716454223660450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223786717, "dur": 67, "args": { "External id": 150844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150844, "pid": 5, "tid": 7, "ts": 1716454223786717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660490, "dur": 9, "args": { "External id": 150844, "cbid": 211, "correlation": 150844 } }, { "ph": "s", "id": 150844, "pid": 76337, "tid": -914061504, "ts": 1716454223660490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223786785, "dur": 92, "args": { "External id": 150866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150866, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150866, "pid": 5, "tid": 7, "ts": 1716454223786785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660542, "dur": 10, "args": { "External id": 150866, "cbid": 211, "correlation": 150866 } }, { "ph": "s", "id": 150866, "pid": 76337, "tid": -914061504, "ts": 1716454223660542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660629, "dur": 1, "args": { "External id": 150882, "cbid": 251, "correlation": 150882 } }, { "ph": "f", "id": 150882, "pid": 76337, "tid": -914061504, "ts": 1716454223660629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223786878, "dur": 569, "args": { "External id": 150884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150884, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 150884, "pid": 5, "tid": 7, "ts": 1716454223786878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660635, "dur": 13, "args": { "External id": 150884, "cbid": 211, "correlation": 150884 } }, { "ph": "s", "id": 150884, "pid": 76337, "tid": -914061504, "ts": 1716454223660635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223787449, "dur": 243, "args": { "External id": 150892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150892, "pid": 5, "tid": 7, "ts": 1716454223787449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660700, "dur": 13, "args": { "External id": 150892, "cbid": 211, "correlation": 150892 } }, { "ph": "s", "id": 150892, "pid": 76337, "tid": -914061504, "ts": 1716454223660700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223787694, "dur": 252, "args": { "External id": 150900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150900, "pid": 5, "tid": 7, "ts": 1716454223787694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660732, "dur": 9, "args": { "External id": 150900, "cbid": 211, "correlation": 150900 } }, { "ph": "s", "id": 150900, "pid": 76337, "tid": -914061504, "ts": 1716454223660732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660814, "dur": 1, "args": { "External id": 150916, "cbid": 251, "correlation": 150916 } }, { "ph": "f", "id": 150916, "pid": 76337, "tid": -914061504, "ts": 1716454223660814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223660820, "dur": 0, "args": { "External id": 150918, "cbid": 251, "correlation": 150918 } }, { "ph": "f", "id": 150918, "pid": 76337, "tid": -914061504, "ts": 1716454223660820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223787947, "dur": 357, "args": { "External id": 150919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150919, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 150919, "pid": 5, "tid": 7, "ts": 1716454223787947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660822, "dur": 13, "args": { "External id": 150919, "cbid": 211, "correlation": 150919 } }, { "ph": "s", "id": 150919, "pid": 76337, "tid": -914061504, "ts": 1716454223660822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223788306, "dur": 50, "args": { "External id": 150927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150927, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150927, "pid": 5, "tid": 7, "ts": 1716454223788306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660865, "dur": 10, "args": { "External id": 150927, "cbid": 211, "correlation": 150927 } }, { "ph": "s", "id": 150927, "pid": 76337, "tid": -914061504, "ts": 1716454223660865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223788357, "dur": 158, "args": { "External id": 150938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150938, "pid": 5, "tid": 7, "ts": 1716454223788357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223660932, "dur": 14, "args": { "External id": 150938, "cbid": 211, "correlation": 150938 } }, { "ph": "s", "id": 150938, "pid": 76337, "tid": -914061504, "ts": 1716454223660932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223661009, "dur": 0, "args": { "External id": 150950, "cbid": 317, "correlation": 150950 } }, { "ph": "f", "id": 150950, "pid": 76337, "tid": -914061504, "ts": 1716454223661009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223661010, "dur": 0, "args": { "External id": 150951, "cbid": 203, "correlation": 150951 } }, { "ph": "f", "id": 150951, "pid": 76337, "tid": -914061504, "ts": 1716454223661010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223661011, "dur": 0, "args": { "External id": 150952, "cbid": 205, "correlation": 150952 } }, { "ph": "f", "id": 150952, "pid": 76337, "tid": -914061504, "ts": 1716454223661011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661035, "dur": 1, "args": { "External id": 150956, "cbid": 251, "correlation": 150956 } }, { "ph": "f", "id": 150956, "pid": 76337, "tid": -914061504, "ts": 1716454223661035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661037, "dur": 0, "args": { "External id": 150957, "cbid": 251, "correlation": 150957 } }, { "ph": "f", "id": 150957, "pid": 76337, "tid": -914061504, "ts": 1716454223661037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661038, "dur": 0, "args": { "External id": 150958, "cbid": 251, "correlation": 150958 } }, { "ph": "f", "id": 150958, "pid": 76337, "tid": -914061504, "ts": 1716454223661038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661039, "dur": 0, "args": { "External id": 150959, "cbid": 251, "correlation": 150959 } }, { "ph": "f", "id": 150959, "pid": 76337, "tid": -914061504, "ts": 1716454223661039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661040, "dur": 0, "args": { "External id": 150960, "cbid": 251, "correlation": 150960 } }, { "ph": "f", "id": 150960, "pid": 76337, "tid": -914061504, "ts": 1716454223661040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661040, "dur": 0, "args": { "External id": 150961, "cbid": 251, "correlation": 150961 } }, { "ph": "f", "id": 150961, "pid": 76337, "tid": -914061504, "ts": 1716454223661040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661041, "dur": 0, "args": { "External id": 150962, "cbid": 251, "correlation": 150962 } }, { "ph": "f", "id": 150962, "pid": 76337, "tid": -914061504, "ts": 1716454223661041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661042, "dur": 0, "args": { "External id": 150963, "cbid": 251, "correlation": 150963 } }, { "ph": "f", "id": 150963, "pid": 76337, "tid": -914061504, "ts": 1716454223661042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661044, "dur": 0, "args": { "External id": 150964, "cbid": 251, "correlation": 150964 } }, { "ph": "f", "id": 150964, "pid": 76337, "tid": -914061504, "ts": 1716454223661044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223788517, "dur": 115, "args": { "External id": 150965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150965, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 150965, "pid": 5, "tid": 7, "ts": 1716454223788517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661046, "dur": 12, "args": { "External id": 150965, "cbid": 211, "correlation": 150965 } }, { "ph": "s", "id": 150965, "pid": 76337, "tid": -914061504, "ts": 1716454223661046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223788633, "dur": 59, "args": { "External id": 150971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150971, "pid": 5, "tid": 7, "ts": 1716454223788633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661081, "dur": 9, "args": { "External id": 150971, "cbid": 211, "correlation": 150971 } }, { "ph": "s", "id": 150971, "pid": 76337, "tid": -914061504, "ts": 1716454223661081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223788693, "dur": 50, "args": { "External id": 150979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150979, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150979, "pid": 5, "tid": 7, "ts": 1716454223788693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661114, "dur": 8, "args": { "External id": 150979, "cbid": 211, "correlation": 150979 } }, { "ph": "s", "id": 150979, "pid": 76337, "tid": -914061504, "ts": 1716454223661114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223788744, "dur": 98, "args": { "External id": 150988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 150988, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 150988, "pid": 5, "tid": 7, "ts": 1716454223788744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661152, "dur": 10, "args": { "External id": 150988, "cbid": 211, "correlation": 150988 } }, { "ph": "s", "id": 150988, "pid": 76337, "tid": -914061504, "ts": 1716454223661152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223788843, "dur": 92, "args": { "External id": 151008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151008, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 151008, "pid": 5, "tid": 7, "ts": 1716454223788843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661223, "dur": 11, "args": { "External id": 151008, "cbid": 211, "correlation": 151008 } }, { "ph": "s", "id": 151008, "pid": 76337, "tid": -914061504, "ts": 1716454223661223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223788937, "dur": 5, "args": { "External id": 151020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151020, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 151020, "pid": 5, "tid": 7, "ts": 1716454223788937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661244, "dur": 7, "args": { "External id": 151020, "cbid": 211, "correlation": 151020 } }, { "ph": "s", "id": 151020, "pid": 76337, "tid": -914061504, "ts": 1716454223661244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223788943, "dur": 109, "args": { "External id": 151023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151023, "pid": 5, "tid": 7, "ts": 1716454223788943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661263, "dur": 6, "args": { "External id": 151023, "cbid": 211, "correlation": 151023 } }, { "ph": "s", "id": 151023, "pid": 76337, "tid": -914061504, "ts": 1716454223661263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223789053, "dur": 69, "args": { "External id": 151032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151032, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151032, "pid": 5, "tid": 7, "ts": 1716454223789053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661303, "dur": 10, "args": { "External id": 151032, "cbid": 211, "correlation": 151032 } }, { "ph": "s", "id": 151032, "pid": 76337, "tid": -914061504, "ts": 1716454223661303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223661354, "dur": 0, "args": { "External id": 151042, "cbid": 317, "correlation": 151042 } }, { "ph": "f", "id": 151042, "pid": 76337, "tid": -914061504, "ts": 1716454223661354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223661355, "dur": 0, "args": { "External id": 151043, "cbid": 203, "correlation": 151043 } }, { "ph": "f", "id": 151043, "pid": 76337, "tid": -914061504, "ts": 1716454223661355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223661356, "dur": 0, "args": { "External id": 151044, "cbid": 205, "correlation": 151044 } }, { "ph": "f", "id": 151044, "pid": 76337, "tid": -914061504, "ts": 1716454223661356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223789123, "dur": 77, "args": { "External id": 151048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151048, "pid": 5, "tid": 7, "ts": 1716454223789123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661370, "dur": 11, "args": { "External id": 151048, "cbid": 211, "correlation": 151048 } }, { "ph": "s", "id": 151048, "pid": 76337, "tid": -914061504, "ts": 1716454223661370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223789201, "dur": 24, "args": { "External id": 151050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151050, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151050, "pid": 5, "tid": 7, "ts": 1716454223789201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661384, "dur": 5, "args": { "External id": 151050, "cbid": 211, "correlation": 151050 } }, { "ph": "s", "id": 151050, "pid": 76337, "tid": -914061504, "ts": 1716454223661384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223789227, "dur": 4, "args": { "External id": 151052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151052, "pid": 5, "tid": 7, "ts": 1716454223789227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661394, "dur": 6, "args": { "External id": 151052, "cbid": 211, "correlation": 151052 } }, { "ph": "s", "id": 151052, "pid": 76337, "tid": -914061504, "ts": 1716454223661394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223661403, "dur": 0, "args": { "External id": 151053, "cbid": 51, "correlation": 151053 } }, { "ph": "s", "id": 151053, "pid": 76337, "tid": -914061504, "ts": 1716454223661403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223789232, "dur": 1357, "args": { "External id": 151054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151054, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151054, "pid": 5, "tid": 7, "ts": 1716454223789232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661404, "dur": 5, "args": { "External id": 151054, "cbid": 211, "correlation": 151054 } }, { "ph": "s", "id": 151054, "pid": 76337, "tid": -914061504, "ts": 1716454223661404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223790590, "dur": 59, "args": { "External id": 151059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151059, "pid": 5, "tid": 7, "ts": 1716454223790590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661430, "dur": 9, "args": { "External id": 151059, "cbid": 211, "correlation": 151059 } }, { "ph": "s", "id": 151059, "pid": 76337, "tid": -914061504, "ts": 1716454223661430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223790651, "dur": 4, "args": { "External id": 151067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151067, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151067, "pid": 5, "tid": 7, "ts": 1716454223790651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661475, "dur": 9, "args": { "External id": 151067, "cbid": 211, "correlation": 151067 } }, { "ph": "s", "id": 151067, "pid": 76337, "tid": -914061504, "ts": 1716454223661475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661540, "dur": 1, "args": { "External id": 151083, "cbid": 251, "correlation": 151083 } }, { "ph": "f", "id": 151083, "pid": 76337, "tid": -914061504, "ts": 1716454223661540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223661546, "dur": 0, "args": { "External id": 151085, "cbid": 251, "correlation": 151085 } }, { "ph": "f", "id": 151085, "pid": 76337, "tid": -914061504, "ts": 1716454223661546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223790656, "dur": 11, "args": { "External id": 151086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151086, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 151086, "pid": 5, "tid": 7, "ts": 1716454223790656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661548, "dur": 11, "args": { "External id": 151086, "cbid": 211, "correlation": 151086 } }, { "ph": "s", "id": 151086, "pid": 76337, "tid": -914061504, "ts": 1716454223661548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223790668, "dur": 5, "args": { "External id": 151088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151088, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 151088, "pid": 5, "tid": 7, "ts": 1716454223790668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661561, "dur": 5, "args": { "External id": 151088, "cbid": 211, "correlation": 151088 } }, { "ph": "s", "id": 151088, "pid": 76337, "tid": -914061504, "ts": 1716454223661561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223790675, "dur": 53, "args": { "External id": 151098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151098, "pid": 5, "tid": 7, "ts": 1716454223790675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661619, "dur": 113, "args": { "External id": 151098, "cbid": 211, "correlation": 151098 } }, { "ph": "s", "id": 151098, "pid": 76337, "tid": -914061504, "ts": 1716454223661619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223790730, "dur": 50, "args": { "External id": 151118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151118, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 151118, "pid": 5, "tid": 7, "ts": 1716454223790730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661786, "dur": 11, "args": { "External id": 151118, "cbid": 211, "correlation": 151118 } }, { "ph": "s", "id": 151118, "pid": 76337, "tid": -914061504, "ts": 1716454223661786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223790781, "dur": 4, "args": { "External id": 151130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151130, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151130, "pid": 5, "tid": 7, "ts": 1716454223790781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661806, "dur": 6, "args": { "External id": 151130, "cbid": 211, "correlation": 151130 } }, { "ph": "s", "id": 151130, "pid": 76337, "tid": -914061504, "ts": 1716454223661806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223790786, "dur": 56, "args": { "External id": 151133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151133, "pid": 5, "tid": 7, "ts": 1716454223790786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661825, "dur": 7, "args": { "External id": 151133, "cbid": 211, "correlation": 151133 } }, { "ph": "s", "id": 151133, "pid": 76337, "tid": -914061504, "ts": 1716454223661825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223790843, "dur": 37, "args": { "External id": 151142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151142, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151142, "pid": 5, "tid": 7, "ts": 1716454223790843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661865, "dur": 10, "args": { "External id": 151142, "cbid": 211, "correlation": 151142 } }, { "ph": "s", "id": 151142, "pid": 76337, "tid": -914061504, "ts": 1716454223661865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223661929, "dur": 0, "args": { "External id": 151152, "cbid": 317, "correlation": 151152 } }, { "ph": "f", "id": 151152, "pid": 76337, "tid": -914061504, "ts": 1716454223661929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223661930, "dur": 0, "args": { "External id": 151153, "cbid": 203, "correlation": 151153 } }, { "ph": "f", "id": 151153, "pid": 76337, "tid": -914061504, "ts": 1716454223661930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223661931, "dur": 0, "args": { "External id": 151154, "cbid": 205, "correlation": 151154 } }, { "ph": "f", "id": 151154, "pid": 76337, "tid": -914061504, "ts": 1716454223661931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223790881, "dur": 40, "args": { "External id": 151158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151158, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151158, "pid": 5, "tid": 7, "ts": 1716454223790881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661945, "dur": 12, "args": { "External id": 151158, "cbid": 211, "correlation": 151158 } }, { "ph": "s", "id": 151158, "pid": 76337, "tid": -914061504, "ts": 1716454223661945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223790923, "dur": 14, "args": { "External id": 151160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151160, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151160, "pid": 5, "tid": 7, "ts": 1716454223790923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661959, "dur": 5, "args": { "External id": 151160, "cbid": 211, "correlation": 151160 } }, { "ph": "s", "id": 151160, "pid": 76337, "tid": -914061504, "ts": 1716454223661959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223790939, "dur": 4, "args": { "External id": 151162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151162, "pid": 5, "tid": 7, "ts": 1716454223790939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661969, "dur": 15, "args": { "External id": 151162, "cbid": 211, "correlation": 151162 } }, { "ph": "s", "id": 151162, "pid": 76337, "tid": -914061504, "ts": 1716454223661969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223661987, "dur": 0, "args": { "External id": 151163, "cbid": 51, "correlation": 151163 } }, { "ph": "s", "id": 151163, "pid": 76337, "tid": -914061504, "ts": 1716454223661987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223790944, "dur": 694, "args": { "External id": 151164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151164, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151164, "pid": 5, "tid": 7, "ts": 1716454223790944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223661988, "dur": 5, "args": { "External id": 151164, "cbid": 211, "correlation": 151164 } }, { "ph": "s", "id": 151164, "pid": 76337, "tid": -914061504, "ts": 1716454223661988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223791639, "dur": 59, "args": { "External id": 151169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151169, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151169, "pid": 5, "tid": 7, "ts": 1716454223791639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662017, "dur": 9, "args": { "External id": 151169, "cbid": 211, "correlation": 151169 } }, { "ph": "s", "id": 151169, "pid": 76337, "tid": -914061504, "ts": 1716454223662017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223662075, "dur": 0, "args": { "External id": 151179, "cbid": 317, "correlation": 151179 } }, { "ph": "f", "id": 151179, "pid": 76337, "tid": -914061504, "ts": 1716454223662075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223662076, "dur": 0, "args": { "External id": 151180, "cbid": 203, "correlation": 151180 } }, { "ph": "f", "id": 151180, "pid": 76337, "tid": -914061504, "ts": 1716454223662076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223662076, "dur": 0, "args": { "External id": 151181, "cbid": 205, "correlation": 151181 } }, { "ph": "f", "id": 151181, "pid": 76337, "tid": -914061504, "ts": 1716454223662076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223791699, "dur": 75, "args": { "External id": 151185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151185, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151185, "pid": 5, "tid": 7, "ts": 1716454223791699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662088, "dur": 12, "args": { "External id": 151185, "cbid": 211, "correlation": 151185 } }, { "ph": "s", "id": 151185, "pid": 76337, "tid": -914061504, "ts": 1716454223662088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223791776, "dur": 205, "args": { "External id": 151187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151187, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151187, "pid": 5, "tid": 7, "ts": 1716454223791776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662106, "dur": 7, "args": { "External id": 151187, "cbid": 211, "correlation": 151187 } }, { "ph": "s", "id": 151187, "pid": 76337, "tid": -914061504, "ts": 1716454223662106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223791983, "dur": 39, "args": { "External id": 151189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151189, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151189, "pid": 5, "tid": 7, "ts": 1716454223791983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662118, "dur": 5, "args": { "External id": 151189, "cbid": 211, "correlation": 151189 } }, { "ph": "s", "id": 151189, "pid": 76337, "tid": -914061504, "ts": 1716454223662118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223792023, "dur": 59, "args": { "External id": 151195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151195, "pid": 5, "tid": 7, "ts": 1716454223792023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662143, "dur": 503, "args": { "External id": 151195, "cbid": 211, "correlation": 151195 } }, { "ph": "s", "id": 151195, "pid": 76337, "tid": -914061504, "ts": 1716454223662143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223792084, "dur": 50, "args": { "External id": 151203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151203, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151203, "pid": 5, "tid": 7, "ts": 1716454223792084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662667, "dur": 8, "args": { "External id": 151203, "cbid": 211, "correlation": 151203 } }, { "ph": "s", "id": 151203, "pid": 76337, "tid": -914061504, "ts": 1716454223662667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223792136, "dur": 35, "args": { "External id": 151211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151211, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151211, "pid": 5, "tid": 7, "ts": 1716454223792136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662696, "dur": 8, "args": { "External id": 151211, "cbid": 211, "correlation": 151211 } }, { "ph": "s", "id": 151211, "pid": 76337, "tid": -914061504, "ts": 1716454223662696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223792172, "dur": 52, "args": { "External id": 151231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151231, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 151231, "pid": 5, "tid": 7, "ts": 1716454223792172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662778, "dur": 13, "args": { "External id": 151231, "cbid": 211, "correlation": 151231 } }, { "ph": "s", "id": 151231, "pid": 76337, "tid": -914061504, "ts": 1716454223662778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223792226, "dur": 4, "args": { "External id": 151243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151243, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151243, "pid": 5, "tid": 7, "ts": 1716454223792226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662801, "dur": 6, "args": { "External id": 151243, "cbid": 211, "correlation": 151243 } }, { "ph": "s", "id": 151243, "pid": 76337, "tid": -914061504, "ts": 1716454223662801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223792231, "dur": 55, "args": { "External id": 151246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151246, "pid": 5, "tid": 7, "ts": 1716454223792231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662818, "dur": 6, "args": { "External id": 151246, "cbid": 211, "correlation": 151246 } }, { "ph": "s", "id": 151246, "pid": 76337, "tid": -914061504, "ts": 1716454223662818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223662876, "dur": 0, "args": { "External id": 151257, "cbid": 317, "correlation": 151257 } }, { "ph": "f", "id": 151257, "pid": 76337, "tid": -914061504, "ts": 1716454223662876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223662876, "dur": 0, "args": { "External id": 151258, "cbid": 203, "correlation": 151258 } }, { "ph": "f", "id": 151258, "pid": 76337, "tid": -914061504, "ts": 1716454223662876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223662877, "dur": 0, "args": { "External id": 151259, "cbid": 205, "correlation": 151259 } }, { "ph": "f", "id": 151259, "pid": 76337, "tid": -914061504, "ts": 1716454223662877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662900, "dur": 1, "args": { "External id": 151263, "cbid": 251, "correlation": 151263 } }, { "ph": "f", "id": 151263, "pid": 76337, "tid": -914061504, "ts": 1716454223662900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662902, "dur": 0, "args": { "External id": 151264, "cbid": 251, "correlation": 151264 } }, { "ph": "f", "id": 151264, "pid": 76337, "tid": -914061504, "ts": 1716454223662902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662902, "dur": 0, "args": { "External id": 151265, "cbid": 251, "correlation": 151265 } }, { "ph": "f", "id": 151265, "pid": 76337, "tid": -914061504, "ts": 1716454223662902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662903, "dur": 0, "args": { "External id": 151266, "cbid": 251, "correlation": 151266 } }, { "ph": "f", "id": 151266, "pid": 76337, "tid": -914061504, "ts": 1716454223662903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662904, "dur": 0, "args": { "External id": 151267, "cbid": 251, "correlation": 151267 } }, { "ph": "f", "id": 151267, "pid": 76337, "tid": -914061504, "ts": 1716454223662904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662905, "dur": 0, "args": { "External id": 151268, "cbid": 251, "correlation": 151268 } }, { "ph": "f", "id": 151268, "pid": 76337, "tid": -914061504, "ts": 1716454223662905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662906, "dur": 0, "args": { "External id": 151269, "cbid": 251, "correlation": 151269 } }, { "ph": "f", "id": 151269, "pid": 76337, "tid": -914061504, "ts": 1716454223662906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662907, "dur": 0, "args": { "External id": 151270, "cbid": 251, "correlation": 151270 } }, { "ph": "f", "id": 151270, "pid": 76337, "tid": -914061504, "ts": 1716454223662907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223662908, "dur": 0, "args": { "External id": 151271, "cbid": 251, "correlation": 151271 } }, { "ph": "f", "id": 151271, "pid": 76337, "tid": -914061504, "ts": 1716454223662908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223792287, "dur": 115, "args": { "External id": 151272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151272, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 151272, "pid": 5, "tid": 7, "ts": 1716454223792287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662910, "dur": 13, "args": { "External id": 151272, "cbid": 211, "correlation": 151272 } }, { "ph": "s", "id": 151272, "pid": 76337, "tid": -914061504, "ts": 1716454223662910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223792404, "dur": 60, "args": { "External id": 151278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151278, "pid": 5, "tid": 7, "ts": 1716454223792404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223662946, "dur": 9, "args": { "External id": 151278, "cbid": 211, "correlation": 151278 } }, { "ph": "s", "id": 151278, "pid": 76337, "tid": -914061504, "ts": 1716454223662946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223792465, "dur": 553, "args": { "External id": 151287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151287, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151287, "pid": 5, "tid": 7, "ts": 1716454223792465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663038, "dur": 14, "args": { "External id": 151287, "cbid": 211, "correlation": 151287 } }, { "ph": "s", "id": 151287, "pid": 76337, "tid": -914061504, "ts": 1716454223663038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223793019, "dur": 181, "args": { "External id": 151309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151309, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151309, "pid": 5, "tid": 7, "ts": 1716454223793019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663096, "dur": 10, "args": { "External id": 151309, "cbid": 211, "correlation": 151309 } }, { "ph": "s", "id": 151309, "pid": 76337, "tid": -914061504, "ts": 1716454223663096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223663182, "dur": 1, "args": { "External id": 151320, "cbid": 251, "correlation": 151320 } }, { "ph": "f", "id": 151320, "pid": 76337, "tid": -914061504, "ts": 1716454223663182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223793201, "dur": 196, "args": { "External id": 151321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151321, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151321, "pid": 5, "tid": 7, "ts": 1716454223793201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663187, "dur": 13, "args": { "External id": 151321, "cbid": 211, "correlation": 151321 } }, { "ph": "s", "id": 151321, "pid": 76337, "tid": -914061504, "ts": 1716454223663187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223663256, "dur": 1, "args": { "External id": 151332, "cbid": 251, "correlation": 151332 } }, { "ph": "f", "id": 151332, "pid": 76337, "tid": -914061504, "ts": 1716454223663256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223793399, "dur": 188, "args": { "External id": 151333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151333, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151333, "pid": 5, "tid": 7, "ts": 1716454223793399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663260, "dur": 11, "args": { "External id": 151333, "cbid": 211, "correlation": 151333 } }, { "ph": "s", "id": 151333, "pid": 76337, "tid": -914061504, "ts": 1716454223663260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223663322, "dur": 1, "args": { "External id": 151344, "cbid": 251, "correlation": 151344 } }, { "ph": "f", "id": 151344, "pid": 76337, "tid": -914061504, "ts": 1716454223663322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223793588, "dur": 186, "args": { "External id": 151345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151345, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151345, "pid": 5, "tid": 7, "ts": 1716454223793588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663326, "dur": 12, "args": { "External id": 151345, "cbid": 211, "correlation": 151345 } }, { "ph": "s", "id": 151345, "pid": 76337, "tid": -914061504, "ts": 1716454223663326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223793776, "dur": 18554, "args": { "External id": 151366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151366, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 151366, "pid": 5, "tid": 7, "ts": 1716454223793776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663408, "dur": 12, "args": { "External id": 151366, "cbid": 211, "correlation": 151366 } }, { "ph": "s", "id": 151366, "pid": 76337, "tid": -914061504, "ts": 1716454223663408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223663505, "dur": 1, "args": { "External id": 151384, "cbid": 251, "correlation": 151384 } }, { "ph": "f", "id": 151384, "pid": 76337, "tid": -914061504, "ts": 1716454223663505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223812331, "dur": 202, "args": { "External id": 151386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151386, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151386, "pid": 5, "tid": 7, "ts": 1716454223812331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663511, "dur": 13, "args": { "External id": 151386, "cbid": 211, "correlation": 151386 } }, { "ph": "s", "id": 151386, "pid": 76337, "tid": -914061504, "ts": 1716454223663511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223812535, "dur": 66, "args": { "External id": 151394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151394, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151394, "pid": 5, "tid": 7, "ts": 1716454223812535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663582, "dur": 22, "args": { "External id": 151394, "cbid": 211, "correlation": 151394 } }, { "ph": "s", "id": 151394, "pid": 76337, "tid": -914061504, "ts": 1716454223663582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223812602, "dur": 97, "args": { "External id": 151402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151402, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151402, "pid": 5, "tid": 7, "ts": 1716454223812602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663631, "dur": 120, "args": { "External id": 151402, "cbid": 211, "correlation": 151402 } }, { "ph": "s", "id": 151402, "pid": 76337, "tid": -914061504, "ts": 1716454223663631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223812700, "dur": 53, "args": { "External id": 151413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151413, "pid": 5, "tid": 7, "ts": 1716454223812700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223663815, "dur": 1896, "args": { "External id": 151413, "cbid": 211, "correlation": 151413 } }, { "ph": "s", "id": 151413, "pid": 76337, "tid": -914061504, "ts": 1716454223663815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223812755, "dur": 92, "args": { "External id": 151435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151435, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151435, "pid": 5, "tid": 7, "ts": 1716454223812755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223665730, "dur": 128, "args": { "External id": 151435, "cbid": 211, "correlation": 151435 } }, { "ph": "s", "id": 151435, "pid": 76337, "tid": -914061504, "ts": 1716454223665730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223665937, "dur": 1, "args": { "External id": 151446, "cbid": 251, "correlation": 151446 } }, { "ph": "f", "id": 151446, "pid": 76337, "tid": -914061504, "ts": 1716454223665937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223812848, "dur": 102, "args": { "External id": 151447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151447, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151447, "pid": 5, "tid": 7, "ts": 1716454223812848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223665942, "dur": 13, "args": { "External id": 151447, "cbid": 211, "correlation": 151447 } }, { "ph": "s", "id": 151447, "pid": 76337, "tid": -914061504, "ts": 1716454223665942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666024, "dur": 1, "args": { "External id": 151458, "cbid": 251, "correlation": 151458 } }, { "ph": "f", "id": 151458, "pid": 76337, "tid": -914061504, "ts": 1716454223666024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666028, "dur": 0, "args": { "External id": 151459, "cbid": 251, "correlation": 151459 } }, { "ph": "f", "id": 151459, "pid": 76337, "tid": -914061504, "ts": 1716454223666028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223812952, "dur": 10, "args": { "External id": 151460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151460, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151460, "pid": 5, "tid": 7, "ts": 1716454223812952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666030, "dur": 13, "args": { "External id": 151460, "cbid": 211, "correlation": 151460 } }, { "ph": "s", "id": 151460, "pid": 76337, "tid": -914061504, "ts": 1716454223666030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223812963, "dur": 5, "args": { "External id": 151462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151462, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 151462, "pid": 5, "tid": 7, "ts": 1716454223812963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666044, "dur": 6, "args": { "External id": 151462, "cbid": 211, "correlation": 151462 } }, { "ph": "s", "id": 151462, "pid": 76337, "tid": -914061504, "ts": 1716454223666044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666106, "dur": 1, "args": { "External id": 151473, "cbid": 251, "correlation": 151473 } }, { "ph": "f", "id": 151473, "pid": 76337, "tid": -914061504, "ts": 1716454223666106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666109, "dur": 0, "args": { "External id": 151474, "cbid": 251, "correlation": 151474 } }, { "ph": "f", "id": 151474, "pid": 76337, "tid": -914061504, "ts": 1716454223666109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223812970, "dur": 6, "args": { "External id": 151475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151475, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151475, "pid": 5, "tid": 7, "ts": 1716454223812970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666111, "dur": 13, "args": { "External id": 151475, "cbid": 211, "correlation": 151475 } }, { "ph": "s", "id": 151475, "pid": 76337, "tid": -914061504, "ts": 1716454223666111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223812977, "dur": 3, "args": { "External id": 151477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151477, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 151477, "pid": 5, "tid": 7, "ts": 1716454223812977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666125, "dur": 6, "args": { "External id": 151477, "cbid": 211, "correlation": 151477 } }, { "ph": "s", "id": 151477, "pid": 76337, "tid": -914061504, "ts": 1716454223666125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223812982, "dur": 156, "args": { "External id": 151498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151498, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 151498, "pid": 5, "tid": 7, "ts": 1716454223812982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666200, "dur": 12, "args": { "External id": 151498, "cbid": 211, "correlation": 151498 } }, { "ph": "s", "id": 151498, "pid": 76337, "tid": -914061504, "ts": 1716454223666200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666297, "dur": 1, "args": { "External id": 151516, "cbid": 251, "correlation": 151516 } }, { "ph": "f", "id": 151516, "pid": 76337, "tid": -914061504, "ts": 1716454223666297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223813139, "dur": 107, "args": { "External id": 151518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151518, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 151518, "pid": 5, "tid": 7, "ts": 1716454223813139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666302, "dur": 13, "args": { "External id": 151518, "cbid": 211, "correlation": 151518 } }, { "ph": "s", "id": 151518, "pid": 76337, "tid": -914061504, "ts": 1716454223666302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223813248, "dur": 34, "args": { "External id": 151526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151526, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151526, "pid": 5, "tid": 7, "ts": 1716454223813248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666371, "dur": 13, "args": { "External id": 151526, "cbid": 211, "correlation": 151526 } }, { "ph": "s", "id": 151526, "pid": 76337, "tid": -914061504, "ts": 1716454223666371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223813284, "dur": 68, "args": { "External id": 151534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151534, "pid": 5, "tid": 7, "ts": 1716454223813284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666412, "dur": 9, "args": { "External id": 151534, "cbid": 211, "correlation": 151534 } }, { "ph": "s", "id": 151534, "pid": 76337, "tid": -914061504, "ts": 1716454223666412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223813353, "dur": 91, "args": { "External id": 151556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151556, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151556, "pid": 5, "tid": 7, "ts": 1716454223813353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666463, "dur": 10, "args": { "External id": 151556, "cbid": 211, "correlation": 151556 } }, { "ph": "s", "id": 151556, "pid": 76337, "tid": -914061504, "ts": 1716454223666463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666550, "dur": 1, "args": { "External id": 151572, "cbid": 251, "correlation": 151572 } }, { "ph": "f", "id": 151572, "pid": 76337, "tid": -914061504, "ts": 1716454223666550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223813446, "dur": 575, "args": { "External id": 151574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151574, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 151574, "pid": 5, "tid": 7, "ts": 1716454223813446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666556, "dur": 12, "args": { "External id": 151574, "cbid": 211, "correlation": 151574 } }, { "ph": "s", "id": 151574, "pid": 76337, "tid": -914061504, "ts": 1716454223666556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223814022, "dur": 242, "args": { "External id": 151582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151582, "pid": 5, "tid": 7, "ts": 1716454223814022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666620, "dur": 13, "args": { "External id": 151582, "cbid": 211, "correlation": 151582 } }, { "ph": "s", "id": 151582, "pid": 76337, "tid": -914061504, "ts": 1716454223666620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223814265, "dur": 252, "args": { "External id": 151590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151590, "pid": 5, "tid": 7, "ts": 1716454223814265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666651, "dur": 8, "args": { "External id": 151590, "cbid": 211, "correlation": 151590 } }, { "ph": "s", "id": 151590, "pid": 76337, "tid": -914061504, "ts": 1716454223666651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666733, "dur": 1, "args": { "External id": 151606, "cbid": 251, "correlation": 151606 } }, { "ph": "f", "id": 151606, "pid": 76337, "tid": -914061504, "ts": 1716454223666733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223666738, "dur": 0, "args": { "External id": 151608, "cbid": 251, "correlation": 151608 } }, { "ph": "f", "id": 151608, "pid": 76337, "tid": -914061504, "ts": 1716454223666738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223814518, "dur": 361, "args": { "External id": 151609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151609, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151609, "pid": 5, "tid": 7, "ts": 1716454223814518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666741, "dur": 13, "args": { "External id": 151609, "cbid": 211, "correlation": 151609 } }, { "ph": "s", "id": 151609, "pid": 76337, "tid": -914061504, "ts": 1716454223666741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223814880, "dur": 50, "args": { "External id": 151617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151617, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151617, "pid": 5, "tid": 7, "ts": 1716454223814880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223666783, "dur": 186, "args": { "External id": 151617, "cbid": 211, "correlation": 151617 } }, { "ph": "s", "id": 151617, "pid": 76337, "tid": -914061504, "ts": 1716454223666783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223814932, "dur": 158, "args": { "External id": 151628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151628, "pid": 5, "tid": 7, "ts": 1716454223814932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667035, "dur": 64, "args": { "External id": 151628, "cbid": 211, "correlation": 151628 } }, { "ph": "s", "id": 151628, "pid": 76337, "tid": -914061504, "ts": 1716454223667035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223667151, "dur": 0, "args": { "External id": 151640, "cbid": 317, "correlation": 151640 } }, { "ph": "f", "id": 151640, "pid": 76337, "tid": -914061504, "ts": 1716454223667151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223667152, "dur": 0, "args": { "External id": 151641, "cbid": 203, "correlation": 151641 } }, { "ph": "f", "id": 151641, "pid": 76337, "tid": -914061504, "ts": 1716454223667152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223667152, "dur": 0, "args": { "External id": 151642, "cbid": 205, "correlation": 151642 } }, { "ph": "f", "id": 151642, "pid": 76337, "tid": -914061504, "ts": 1716454223667152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667176, "dur": 1, "args": { "External id": 151646, "cbid": 251, "correlation": 151646 } }, { "ph": "f", "id": 151646, "pid": 76337, "tid": -914061504, "ts": 1716454223667176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667178, "dur": 0, "args": { "External id": 151647, "cbid": 251, "correlation": 151647 } }, { "ph": "f", "id": 151647, "pid": 76337, "tid": -914061504, "ts": 1716454223667178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667179, "dur": 0, "args": { "External id": 151648, "cbid": 251, "correlation": 151648 } }, { "ph": "f", "id": 151648, "pid": 76337, "tid": -914061504, "ts": 1716454223667179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667179, "dur": 0, "args": { "External id": 151649, "cbid": 251, "correlation": 151649 } }, { "ph": "f", "id": 151649, "pid": 76337, "tid": -914061504, "ts": 1716454223667179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667180, "dur": 0, "args": { "External id": 151650, "cbid": 251, "correlation": 151650 } }, { "ph": "f", "id": 151650, "pid": 76337, "tid": -914061504, "ts": 1716454223667180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667181, "dur": 0, "args": { "External id": 151651, "cbid": 251, "correlation": 151651 } }, { "ph": "f", "id": 151651, "pid": 76337, "tid": -914061504, "ts": 1716454223667181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667182, "dur": 0, "args": { "External id": 151652, "cbid": 251, "correlation": 151652 } }, { "ph": "f", "id": 151652, "pid": 76337, "tid": -914061504, "ts": 1716454223667182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667183, "dur": 0, "args": { "External id": 151653, "cbid": 251, "correlation": 151653 } }, { "ph": "f", "id": 151653, "pid": 76337, "tid": -914061504, "ts": 1716454223667183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667184, "dur": 0, "args": { "External id": 151654, "cbid": 251, "correlation": 151654 } }, { "ph": "f", "id": 151654, "pid": 76337, "tid": -914061504, "ts": 1716454223667184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223815091, "dur": 114, "args": { "External id": 151655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151655, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 151655, "pid": 5, "tid": 7, "ts": 1716454223815091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667186, "dur": 42, "args": { "External id": 151655, "cbid": 211, "correlation": 151655 } }, { "ph": "s", "id": 151655, "pid": 76337, "tid": -914061504, "ts": 1716454223667186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223815206, "dur": 61, "args": { "External id": 151661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151661, "pid": 5, "tid": 7, "ts": 1716454223815206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667251, "dur": 281, "args": { "External id": 151661, "cbid": 211, "correlation": 151661 } }, { "ph": "s", "id": 151661, "pid": 76337, "tid": -914061504, "ts": 1716454223667251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815269, "dur": 50, "args": { "External id": 151669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151669, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151669, "pid": 5, "tid": 7, "ts": 1716454223815269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667555, "dur": 9, "args": { "External id": 151669, "cbid": 211, "correlation": 151669 } }, { "ph": "s", "id": 151669, "pid": 76337, "tid": -914061504, "ts": 1716454223667555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223815319, "dur": 52, "args": { "External id": 151689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151689, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 151689, "pid": 5, "tid": 7, "ts": 1716454223815319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667635, "dur": 12, "args": { "External id": 151689, "cbid": 211, "correlation": 151689 } }, { "ph": "s", "id": 151689, "pid": 76337, "tid": -914061504, "ts": 1716454223667635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223815372, "dur": 5, "args": { "External id": 151701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151701, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151701, "pid": 5, "tid": 7, "ts": 1716454223815372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667657, "dur": 11, "args": { "External id": 151701, "cbid": 211, "correlation": 151701 } }, { "ph": "s", "id": 151701, "pid": 76337, "tid": -914061504, "ts": 1716454223667657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223815378, "dur": 56, "args": { "External id": 151704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151704, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151704, "pid": 5, "tid": 7, "ts": 1716454223815378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667680, "dur": 110, "args": { "External id": 151704, "cbid": 211, "correlation": 151704 } }, { "ph": "s", "id": 151704, "pid": 76337, "tid": -914061504, "ts": 1716454223667680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815435, "dur": 37, "args": { "External id": 151713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151713, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151713, "pid": 5, "tid": 7, "ts": 1716454223815435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667830, "dur": 11, "args": { "External id": 151713, "cbid": 211, "correlation": 151713 } }, { "ph": "s", "id": 151713, "pid": 76337, "tid": -914061504, "ts": 1716454223667830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223667887, "dur": 0, "args": { "External id": 151723, "cbid": 317, "correlation": 151723 } }, { "ph": "f", "id": 151723, "pid": 76337, "tid": -914061504, "ts": 1716454223667887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223667888, "dur": 0, "args": { "External id": 151724, "cbid": 203, "correlation": 151724 } }, { "ph": "f", "id": 151724, "pid": 76337, "tid": -914061504, "ts": 1716454223667888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223667889, "dur": 0, "args": { "External id": 151725, "cbid": 205, "correlation": 151725 } }, { "ph": "f", "id": 151725, "pid": 76337, "tid": -914061504, "ts": 1716454223667889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223815473, "dur": 41, "args": { "External id": 151729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151729, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151729, "pid": 5, "tid": 7, "ts": 1716454223815473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667905, "dur": 12, "args": { "External id": 151729, "cbid": 211, "correlation": 151729 } }, { "ph": "s", "id": 151729, "pid": 76337, "tid": -914061504, "ts": 1716454223667905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223815516, "dur": 4, "args": { "External id": 151731, "device": 5, "context": 1, "stream": 7, "correlation": 151731, "bytes": 46080, "memory bandwidth (GB/s)": 11.428571428571429 } }, { "ph": "f", "id": 151731, "pid": 5, "tid": 7, "ts": 1716454223815516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223667920, "dur": 23, "args": { "External id": 151731, "cbid": 51, "correlation": 151731 } }, { "ph": "s", "id": 151731, "pid": 76337, "tid": -914061504, "ts": 1716454223667920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223667949, "dur": 1, "args": { "External id": 151733, "cbid": 200, "correlation": 151733 } }, { "ph": "f", "id": 151733, "pid": 76337, "tid": -914061504, "ts": 1716454223667949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223667951, "dur": 0, "args": { "External id": 151734, "cbid": 200, "correlation": 151734 } }, { "ph": "f", "id": 151734, "pid": 76337, "tid": -914061504, "ts": 1716454223667951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223667951, "dur": 0, "args": { "External id": 151735, "cbid": 200, "correlation": 151735 } }, { "ph": "f", "id": 151735, "pid": 76337, "tid": -914061504, "ts": 1716454223667951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223667952, "dur": 0, "args": { "External id": 151736, "cbid": 200, "correlation": 151736 } }, { "ph": "f", "id": 151736, "pid": 76337, "tid": -914061504, "ts": 1716454223667952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454223667953, "dur": 4, "args": { "External id": 151737, "cbid": 15, "correlation": 151737 } }, { "ph": "f", "id": 151737, "pid": 76337, "tid": -914061504, "ts": 1716454223667953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223667958, "dur": 1, "args": { "External id": 151738, "cbid": 251, "correlation": 151738 } }, { "ph": "f", "id": 151738, "pid": 76337, "tid": -914061504, "ts": 1716454223667958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454223815521, "dur": 25, "args": { "External id": 151739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151739, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151739, "pid": 5, "tid": 7, "ts": 1716454223815521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667961, "dur": 9, "args": { "External id": 151739, "cbid": 211, "correlation": 151739 } }, { "ph": "s", "id": 151739, "pid": 76337, "tid": -914061504, "ts": 1716454223667961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223815548, "dur": 4, "args": { "External id": 151741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 151741, "pid": 5, "tid": 7, "ts": 1716454223815548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667983, "dur": 6, "args": { "External id": 151741, "cbid": 211, "correlation": 151741 } }, { "ph": "s", "id": 151741, "pid": 76337, "tid": -914061504, "ts": 1716454223667983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223667994, "dur": 0, "args": { "External id": 151742, "cbid": 51, "correlation": 151742 } }, { "ph": "s", "id": 151742, "pid": 76337, "tid": -914061504, "ts": 1716454223667994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223815553, "dur": 187, "args": { "External id": 151743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151743, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151743, "pid": 5, "tid": 7, "ts": 1716454223815553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223667995, "dur": 187, "args": { "External id": 151743, "cbid": 211, "correlation": 151743 } }, { "ph": "s", "id": 151743, "pid": 76337, "tid": -914061504, "ts": 1716454223667995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223815741, "dur": 6, "args": { "External id": 151744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151744, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151744, "pid": 5, "tid": 7, "ts": 1716454223815741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223668185, "dur": 6, "args": { "External id": 151744, "cbid": 211, "correlation": 151744 } }, { "ph": "s", "id": 151744, "pid": 76337, "tid": -914061504, "ts": 1716454223668185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223815749, "dur": 5, "args": { "External id": 151750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 151750, "pid": 5, "tid": 7, "ts": 1716454223815749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223668216, "dur": 9, "args": { "External id": 151750, "cbid": 211, "correlation": 151750 } }, { "ph": "s", "id": 151750, "pid": 76337, "tid": -914061504, "ts": 1716454223668216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815755, "dur": 3, "args": { "External id": 151758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151758, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151758, "pid": 5, "tid": 7, "ts": 1716454223815755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223669997, "dur": 17, "args": { "External id": 151758, "cbid": 211, "correlation": 151758 } }, { "ph": "s", "id": 151758, "pid": 76337, "tid": -914061504, "ts": 1716454223669997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815759, "dur": 3, "args": { "External id": 151766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151766, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151766, "pid": 5, "tid": 7, "ts": 1716454223815759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670040, "dur": 10, "args": { "External id": 151766, "cbid": 211, "correlation": 151766 } }, { "ph": "s", "id": 151766, "pid": 76337, "tid": -914061504, "ts": 1716454223670040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815763, "dur": 3, "args": { "External id": 151774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151774, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151774, "pid": 5, "tid": 7, "ts": 1716454223815763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670067, "dur": 8, "args": { "External id": 151774, "cbid": 211, "correlation": 151774 } }, { "ph": "s", "id": 151774, "pid": 76337, "tid": -914061504, "ts": 1716454223670067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815767, "dur": 3, "args": { "External id": 151783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151783, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151783, "pid": 5, "tid": 7, "ts": 1716454223815767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670246, "dur": 14, "args": { "External id": 151783, "cbid": 211, "correlation": 151783 } }, { "ph": "s", "id": 151783, "pid": 76337, "tid": -914061504, "ts": 1716454223670246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815771, "dur": 3, "args": { "External id": 151792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151792, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151792, "pid": 5, "tid": 7, "ts": 1716454223815771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670276, "dur": 8, "args": { "External id": 151792, "cbid": 211, "correlation": 151792 } }, { "ph": "s", "id": 151792, "pid": 76337, "tid": -914061504, "ts": 1716454223670276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815775, "dur": 3, "args": { "External id": 151800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151800, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151800, "pid": 5, "tid": 7, "ts": 1716454223815775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670301, "dur": 8, "args": { "External id": 151800, "cbid": 211, "correlation": 151800 } }, { "ph": "s", "id": 151800, "pid": 76337, "tid": -914061504, "ts": 1716454223670301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815780, "dur": 3, "args": { "External id": 151808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151808, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151808, "pid": 5, "tid": 7, "ts": 1716454223815780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670561, "dur": 15, "args": { "External id": 151808, "cbid": 211, "correlation": 151808 } }, { "ph": "s", "id": 151808, "pid": 76337, "tid": -914061504, "ts": 1716454223670561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815784, "dur": 3, "args": { "External id": 151816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151816, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151816, "pid": 5, "tid": 7, "ts": 1716454223815784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223670591, "dur": 7, "args": { "External id": 151816, "cbid": 211, "correlation": 151816 } }, { "ph": "s", "id": 151816, "pid": 76337, "tid": -914061504, "ts": 1716454223670591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223815789, "dur": 1, "args": { "External id": 151826, "device": 5, "context": 1, "stream": 7, "correlation": 151826, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 151826, "pid": 5, "tid": 7, "ts": 1716454223815789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223670657, "dur": 35, "args": { "External id": 151826, "cbid": 41, "correlation": 151826 } }, { "ph": "s", "id": 151826, "pid": 76337, "tid": -914061504, "ts": 1716454223670657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223670693, "dur": 145114, "args": { "External id": 151827, "cbid": 131, "correlation": 151827 } }, { "ph": "f", "id": 151827, "pid": 76337, "tid": -914061504, "ts": 1716454223670693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223815961, "dur": 3, "args": { "External id": 151835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151835, "pid": 5, "tid": 7, "ts": 1716454223815961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223815939, "dur": 24, "args": { "External id": 151835, "cbid": 211, "correlation": 151835 } }, { "ph": "s", "id": 151835, "pid": 76337, "tid": -914061504, "ts": 1716454223815939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816066, "dur": 3, "args": { "External id": 151844, "device": 5, "context": 1, "stream": 7, "correlation": 151844, "bytes": 8, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 151844, "pid": 5, "tid": 7, "ts": 1716454223816066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816035, "dur": 31, "args": { "External id": 151844, "cbid": 41, "correlation": 151844 } }, { "ph": "s", "id": 151844, "pid": 76337, "tid": -914061504, "ts": 1716454223816035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223816165, "dur": 4, "args": { "External id": 151854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151854, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 151854, "pid": 5, "tid": 7, "ts": 1716454223816165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816149, "dur": 18, "args": { "External id": 151854, "cbid": 211, "correlation": 151854 } }, { "ph": "s", "id": 151854, "pid": 76337, "tid": -914061504, "ts": 1716454223816149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816241, "dur": 1, "args": { "External id": 151864, "device": 5, "context": 1, "stream": 7, "correlation": 151864, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 151864, "pid": 5, "tid": 7, "ts": 1716454223816241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816222, "dur": 17, "args": { "External id": 151864, "cbid": 41, "correlation": 151864 } }, { "ph": "s", "id": 151864, "pid": 76337, "tid": -914061504, "ts": 1716454223816222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223816240, "dur": 8, "args": { "External id": 151865, "cbid": 131, "correlation": 151865 } }, { "ph": "f", "id": 151865, "pid": 76337, "tid": -914061504, "ts": 1716454223816240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816312, "dur": 3, "args": { "External id": 151872, "device": 5, "context": 1, "stream": 7, "correlation": 151872, "bytes": 98304, "memory bandwidth (GB/s)": 30.72 } }, { "ph": "f", "id": 151872, "pid": 5, "tid": 7, "ts": 1716454223816312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816287, "dur": 25, "args": { "External id": 151872, "cbid": 41, "correlation": 151872 } }, { "ph": "s", "id": 151872, "pid": 76337, "tid": -914061504, "ts": 1716454223816287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816401, "dur": 3, "args": { "External id": 151891, "device": 5, "context": 1, "stream": 7, "correlation": 151891, "bytes": 16, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 151891, "pid": 5, "tid": 7, "ts": 1716454223816401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816383, "dur": 18, "args": { "External id": 151891, "cbid": 41, "correlation": 151891 } }, { "ph": "s", "id": 151891, "pid": 76337, "tid": -914061504, "ts": 1716454223816383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223816440, "dur": 3, "args": { "External id": 151897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151897, "pid": 5, "tid": 7, "ts": 1716454223816440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816428, "dur": 11, "args": { "External id": 151897, "cbid": 211, "correlation": 151897 } }, { "ph": "s", "id": 151897, "pid": 76337, "tid": -914061504, "ts": 1716454223816428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454223816454, "dur": 6, "args": { "External id": 151899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151899, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 151899, "pid": 5, "tid": 7, "ts": 1716454223816454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816443, "dur": 10, "args": { "External id": 151899, "cbid": 211, "correlation": 151899 } }, { "ph": "s", "id": 151899, "pid": 76337, "tid": -914061504, "ts": 1716454223816443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454223816463, "dur": 3, "args": { "External id": 151901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151901, "pid": 5, "tid": 7, "ts": 1716454223816463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816455, "dur": 7, "args": { "External id": 151901, "cbid": 211, "correlation": 151901 } }, { "ph": "s", "id": 151901, "pid": 76337, "tid": -914061504, "ts": 1716454223816455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816497, "dur": 2, "args": { "External id": 151909, "device": 5, "context": 1, "stream": 7, "correlation": 151909, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 151909, "pid": 5, "tid": 7, "ts": 1716454223816497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816483, "dur": 13, "args": { "External id": 151909, "cbid": 41, "correlation": 151909 } }, { "ph": "s", "id": 151909, "pid": 76337, "tid": -914061504, "ts": 1716454223816483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223816545, "dur": 3, "args": { "External id": 151923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151923, "pid": 5, "tid": 7, "ts": 1716454223816545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816534, "dur": 12, "args": { "External id": 151923, "cbid": 211, "correlation": 151923 } }, { "ph": "s", "id": 151923, "pid": 76337, "tid": -914061504, "ts": 1716454223816534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223816564, "dur": 3, "args": { "External id": 151937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151937, "pid": 5, "tid": 7, "ts": 1716454223816564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816557, "dur": 6, "args": { "External id": 151937, "cbid": 211, "correlation": 151937 } }, { "ph": "s", "id": 151937, "pid": 76337, "tid": -914061504, "ts": 1716454223816557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223816600, "dur": 6, "args": { "External id": 151944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151944, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151944, "pid": 5, "tid": 7, "ts": 1716454223816600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816588, "dur": 12, "args": { "External id": 151944, "cbid": 211, "correlation": 151944 } }, { "ph": "s", "id": 151944, "pid": 76337, "tid": -914061504, "ts": 1716454223816588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454223816610, "dur": 6, "args": { "External id": 151947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151947, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151947, "pid": 5, "tid": 7, "ts": 1716454223816610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816603, "dur": 6, "args": { "External id": 151947, "cbid": 211, "correlation": 151947 } }, { "ph": "s", "id": 151947, "pid": 76337, "tid": -914061504, "ts": 1716454223816603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454223816619, "dur": 3, "args": { "External id": 151949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151949, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151949, "pid": 5, "tid": 7, "ts": 1716454223816619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816611, "dur": 6, "args": { "External id": 151949, "cbid": 211, "correlation": 151949 } }, { "ph": "s", "id": 151949, "pid": 76337, "tid": -914061504, "ts": 1716454223816611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816638, "dur": 2, "args": { "External id": 151952, "device": 5, "context": 1, "stream": 7, "correlation": 151952, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 151952, "pid": 5, "tid": 7, "ts": 1716454223816638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816626, "dur": 11, "args": { "External id": 151952, "cbid": 41, "correlation": 151952 } }, { "ph": "s", "id": 151952, "pid": 76337, "tid": -914061504, "ts": 1716454223816626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223816691, "dur": 4, "args": { "External id": 151968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151968, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 151968, "pid": 5, "tid": 7, "ts": 1716454223816691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816678, "dur": 13, "args": { "External id": 151968, "cbid": 211, "correlation": 151968 } }, { "ph": "s", "id": 151968, "pid": 76337, "tid": -914061504, "ts": 1716454223816678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223816713, "dur": 3, "args": { "External id": 151973, "device": 5, "context": 1, "stream": 7, "correlation": 151973, "bytes": 1, "memory bandwidth (GB/s)": 0.0003156565656565657 } }, { "ph": "f", "id": 151973, "pid": 5, "tid": 7, "ts": 1716454223816713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816697, "dur": 14, "args": { "External id": 151973, "cbid": 41, "correlation": 151973 } }, { "ph": "s", "id": 151973, "pid": 76337, "tid": -914061504, "ts": 1716454223816697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223816740, "dur": 1, "args": { "External id": 151979, "device": 5, "context": 1, "stream": 7, "correlation": 151979, "bytes": 1, "memory bandwidth (GB/s)": 0.0005896226415094339 } }, { "ph": "f", "id": 151979, "pid": 5, "tid": 7, "ts": 1716454223816740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223816721, "dur": 28, "args": { "External id": 151979, "cbid": 41, "correlation": 151979 } }, { "ph": "s", "id": 151979, "pid": 76337, "tid": -914061504, "ts": 1716454223816721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223816750, "dur": 3, "args": { "External id": 151980, "cbid": 131, "correlation": 151980 } }, { "ph": "f", "id": 151980, "pid": 76337, "tid": -914061504, "ts": 1716454223816750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223816800, "dur": 3, "args": { "External id": 151988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151988, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151988, "pid": 5, "tid": 7, "ts": 1716454223816800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816786, "dur": 13, "args": { "External id": 151988, "cbid": 211, "correlation": 151988 } }, { "ph": "s", "id": 151988, "pid": 76337, "tid": -914061504, "ts": 1716454223816786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223816830, "dur": 3, "args": { "External id": 151998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 151998, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 151998, "pid": 5, "tid": 7, "ts": 1716454223816830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816821, "dur": 8, "args": { "External id": 151998, "cbid": 211, "correlation": 151998 } }, { "ph": "s", "id": 151998, "pid": 76337, "tid": -914061504, "ts": 1716454223816821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223816855, "dur": 3, "args": { "External id": 152007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152007, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152007, "pid": 5, "tid": 7, "ts": 1716454223816855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816845, "dur": 9, "args": { "External id": 152007, "cbid": 211, "correlation": 152007 } }, { "ph": "s", "id": 152007, "pid": 76337, "tid": -914061504, "ts": 1716454223816845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223816968, "dur": 12, "args": { "External id": 152017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152017, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152017, "pid": 5, "tid": 7, "ts": 1716454223816968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223816953, "dur": 16, "args": { "External id": 152017, "cbid": 211, "correlation": 152017 } }, { "ph": "s", "id": 152017, "pid": 76337, "tid": -914061504, "ts": 1716454223816953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223817019, "dur": 3, "args": { "External id": 152025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152025, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152025, "pid": 5, "tid": 7, "ts": 1716454223817019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817008, "dur": 9, "args": { "External id": 152025, "cbid": 211, "correlation": 152025 } }, { "ph": "s", "id": 152025, "pid": 76337, "tid": -914061504, "ts": 1716454223817008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223817064, "dur": 12, "args": { "External id": 152035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152035, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152035, "pid": 5, "tid": 7, "ts": 1716454223817064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817052, "dur": 11, "args": { "External id": 152035, "cbid": 211, "correlation": 152035 } }, { "ph": "s", "id": 152035, "pid": 76337, "tid": -914061504, "ts": 1716454223817052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223817095, "dur": 10, "args": { "External id": 152043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152043, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152043, "pid": 5, "tid": 7, "ts": 1716454223817095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817086, "dur": 9, "args": { "External id": 152043, "cbid": 211, "correlation": 152043 } }, { "ph": "s", "id": 152043, "pid": 76337, "tid": -914061504, "ts": 1716454223817086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223817121, "dur": 3, "args": { "External id": 152052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152052, "pid": 5, "tid": 7, "ts": 1716454223817121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817112, "dur": 9, "args": { "External id": 152052, "cbid": 211, "correlation": 152052 } }, { "ph": "s", "id": 152052, "pid": 76337, "tid": -914061504, "ts": 1716454223817112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223817148, "dur": 5, "args": { "External id": 152061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152061, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152061, "pid": 5, "tid": 7, "ts": 1716454223817148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817136, "dur": 10, "args": { "External id": 152061, "cbid": 211, "correlation": 152061 } }, { "ph": "s", "id": 152061, "pid": 76337, "tid": -914061504, "ts": 1716454223817136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223817187, "dur": 8, "args": { "External id": 152071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152071, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152071, "pid": 5, "tid": 7, "ts": 1716454223817187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817175, "dur": 12, "args": { "External id": 152071, "cbid": 211, "correlation": 152071 } }, { "ph": "s", "id": 152071, "pid": 76337, "tid": -914061504, "ts": 1716454223817175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223817514, "dur": 3, "args": { "External id": 152080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152080, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152080, "pid": 5, "tid": 7, "ts": 1716454223817514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817498, "dur": 15, "args": { "External id": 152080, "cbid": 211, "correlation": 152080 } }, { "ph": "s", "id": 152080, "pid": 76337, "tid": -914061504, "ts": 1716454223817498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223817542, "dur": 3, "args": { "External id": 152088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152088, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152088, "pid": 5, "tid": 7, "ts": 1716454223817542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817532, "dur": 9, "args": { "External id": 152088, "cbid": 211, "correlation": 152088 } }, { "ph": "s", "id": 152088, "pid": 76337, "tid": -914061504, "ts": 1716454223817532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223817593, "dur": 1, "args": { "External id": 152098, "device": 5, "context": 1, "stream": 7, "correlation": 152098, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 152098, "pid": 5, "tid": 7, "ts": 1716454223817593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223817578, "dur": 13, "args": { "External id": 152098, "cbid": 41, "correlation": 152098 } }, { "ph": "s", "id": 152098, "pid": 76337, "tid": -914061504, "ts": 1716454223817578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223817592, "dur": 8, "args": { "External id": 152099, "cbid": 131, "correlation": 152099 } }, { "ph": "f", "id": 152099, "pid": 76337, "tid": -914061504, "ts": 1716454223817592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223817683, "dur": 2, "args": { "External id": 152107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152107, "pid": 5, "tid": 7, "ts": 1716454223817683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817669, "dur": 14, "args": { "External id": 152107, "cbid": 211, "correlation": 152107 } }, { "ph": "s", "id": 152107, "pid": 76337, "tid": -914061504, "ts": 1716454223817669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223817755, "dur": 3, "args": { "External id": 152116, "device": 5, "context": 1, "stream": 7, "correlation": 152116, "bytes": 8, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 152116, "pid": 5, "tid": 7, "ts": 1716454223817755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223817737, "dur": 17, "args": { "External id": 152116, "cbid": 41, "correlation": 152116 } }, { "ph": "s", "id": 152116, "pid": 76337, "tid": -914061504, "ts": 1716454223817737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223817828, "dur": 3, "args": { "External id": 152126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152126, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152126, "pid": 5, "tid": 7, "ts": 1716454223817828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223817814, "dur": 14, "args": { "External id": 152126, "cbid": 211, "correlation": 152126 } }, { "ph": "s", "id": 152126, "pid": 76337, "tid": -914061504, "ts": 1716454223817814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223817880, "dur": 1, "args": { "External id": 152136, "device": 5, "context": 1, "stream": 7, "correlation": 152136, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 152136, "pid": 5, "tid": 7, "ts": 1716454223817880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223817865, "dur": 12, "args": { "External id": 152136, "cbid": 41, "correlation": 152136 } }, { "ph": "s", "id": 152136, "pid": 76337, "tid": -914061504, "ts": 1716454223817865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223817879, "dur": 8, "args": { "External id": 152137, "cbid": 131, "correlation": 152137 } }, { "ph": "f", "id": 152137, "pid": 76337, "tid": -914061504, "ts": 1716454223817879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454223817939, "dur": 3, "args": { "External id": 152144, "device": 5, "context": 1, "stream": 7, "correlation": 152144, "bytes": 98304, "memory bandwidth (GB/s)": 32 } }, { "ph": "f", "id": 152144, "pid": 5, "tid": 7, "ts": 1716454223817939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223817920, "dur": 19, "args": { "External id": 152144, "cbid": 41, "correlation": 152144 } }, { "ph": "s", "id": 152144, "pid": 76337, "tid": -914061504, "ts": 1716454223817920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223817996, "dur": 1, "args": { "External id": 152155, "device": 5, "context": 1, "stream": 7, "correlation": 152155, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 152155, "pid": 5, "tid": 7, "ts": 1716454223817996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223817983, "dur": 10, "args": { "External id": 152155, "cbid": 41, "correlation": 152155 } }, { "ph": "s", "id": 152155, "pid": 76337, "tid": -914061504, "ts": 1716454223817983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223817995, "dur": 8, "args": { "External id": 152156, "cbid": 131, "correlation": 152156 } }, { "ph": "f", "id": 152156, "pid": 76337, "tid": -914061504, "ts": 1716454223817995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818046, "dur": 3, "args": { "External id": 152164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152164, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152164, "pid": 5, "tid": 7, "ts": 1716454223818046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818033, "dur": 13, "args": { "External id": 152164, "cbid": 211, "correlation": 152164 } }, { "ph": "s", "id": 152164, "pid": 76337, "tid": -914061504, "ts": 1716454223818033, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818075, "dur": 3, "args": { "External id": 152174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152174, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152174, "pid": 5, "tid": 7, "ts": 1716454223818075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818066, "dur": 8, "args": { "External id": 152174, "cbid": 211, "correlation": 152174 } }, { "ph": "s", "id": 152174, "pid": 76337, "tid": -914061504, "ts": 1716454223818066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818097, "dur": 3, "args": { "External id": 152183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152183, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152183, "pid": 5, "tid": 7, "ts": 1716454223818097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818089, "dur": 7, "args": { "External id": 152183, "cbid": 211, "correlation": 152183 } }, { "ph": "s", "id": 152183, "pid": 76337, "tid": -914061504, "ts": 1716454223818089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223818167, "dur": 6, "args": { "External id": 152191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152191, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152191, "pid": 5, "tid": 7, "ts": 1716454223818167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818153, "dur": 15, "args": { "External id": 152191, "cbid": 211, "correlation": 152191 } }, { "ph": "s", "id": 152191, "pid": 76337, "tid": -914061504, "ts": 1716454223818153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818207, "dur": 3, "args": { "External id": 152200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152200, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152200, "pid": 5, "tid": 7, "ts": 1716454223818207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818196, "dur": 10, "args": { "External id": 152200, "cbid": 211, "correlation": 152200 } }, { "ph": "s", "id": 152200, "pid": 76337, "tid": -914061504, "ts": 1716454223818196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818230, "dur": 3, "args": { "External id": 152209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152209, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152209, "pid": 5, "tid": 7, "ts": 1716454223818230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818222, "dur": 7, "args": { "External id": 152209, "cbid": 211, "correlation": 152209 } }, { "ph": "s", "id": 152209, "pid": 76337, "tid": -914061504, "ts": 1716454223818222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818291, "dur": 3, "args": { "External id": 152217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152217, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152217, "pid": 5, "tid": 7, "ts": 1716454223818291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818280, "dur": 9, "args": { "External id": 152217, "cbid": 211, "correlation": 152217 } }, { "ph": "s", "id": 152217, "pid": 76337, "tid": -914061504, "ts": 1716454223818280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223818350, "dur": 1, "args": { "External id": 152225, "device": 5, "context": 1, "stream": 7, "correlation": 152225, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 152225, "pid": 5, "tid": 7, "ts": 1716454223818350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818333, "dur": 27, "args": { "External id": 152225, "cbid": 41, "correlation": 152225 } }, { "ph": "s", "id": 152225, "pid": 76337, "tid": -914061504, "ts": 1716454223818333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818361, "dur": 3, "args": { "External id": 152226, "cbid": 131, "correlation": 152226 } }, { "ph": "f", "id": 152226, "pid": 76337, "tid": -914061504, "ts": 1716454223818361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223818422, "dur": 1, "args": { "External id": 152236, "device": 5, "context": 1, "stream": 7, "correlation": 152236, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 152236, "pid": 5, "tid": 7, "ts": 1716454223818422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818410, "dur": 10, "args": { "External id": 152236, "cbid": 41, "correlation": 152236 } }, { "ph": "s", "id": 152236, "pid": 76337, "tid": -914061504, "ts": 1716454223818410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818421, "dur": 8, "args": { "External id": 152237, "cbid": 131, "correlation": 152237 } }, { "ph": "f", "id": 152237, "pid": 76337, "tid": -914061504, "ts": 1716454223818421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223818478, "dur": 1, "args": { "External id": 152246, "device": 5, "context": 1, "stream": 7, "correlation": 152246, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 152246, "pid": 5, "tid": 7, "ts": 1716454223818478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818467, "dur": 8, "args": { "External id": 152246, "cbid": 41, "correlation": 152246 } }, { "ph": "s", "id": 152246, "pid": 76337, "tid": -914061504, "ts": 1716454223818467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818476, "dur": 8, "args": { "External id": 152247, "cbid": 131, "correlation": 152247 } }, { "ph": "f", "id": 152247, "pid": 76337, "tid": -914061504, "ts": 1716454223818476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223818551, "dur": 4, "args": { "External id": 152254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152254, "pid": 5, "tid": 7, "ts": 1716454223818551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818533, "dur": 18, "args": { "External id": 152254, "cbid": 211, "correlation": 152254 } }, { "ph": "s", "id": 152254, "pid": 76337, "tid": -914061504, "ts": 1716454223818533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454223818586, "dur": 4, "args": { "External id": 152274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152274, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152274, "pid": 5, "tid": 7, "ts": 1716454223818586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818575, "dur": 12, "args": { "External id": 152274, "cbid": 211, "correlation": 152274 } }, { "ph": "s", "id": 152274, "pid": 76337, "tid": -914061504, "ts": 1716454223818575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818588, "dur": 0, "args": { "External id": 152275, "cbid": 11, "correlation": 152275 } }, { "ph": "f", "id": 152275, "pid": 76337, "tid": -914061504, "ts": 1716454223818588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818588, "dur": 0, "args": { "External id": 152276, "cbid": 11, "correlation": 152276 } }, { "ph": "f", "id": 152276, "pid": 76337, "tid": -914061504, "ts": 1716454223818588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223818603, "dur": 1, "args": { "External id": 152279, "device": 5, "context": 1, "stream": 7, "correlation": 152279, "bytes": 4, "memory bandwidth (GB/s)": 0.002403846153846154 } }, { "ph": "f", "id": 152279, "pid": 5, "tid": 7, "ts": 1716454223818603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818590, "dur": 22, "args": { "External id": 152279, "cbid": 41, "correlation": 152279 } }, { "ph": "s", "id": 152279, "pid": 76337, "tid": -914061504, "ts": 1716454223818590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818612, "dur": 3, "args": { "External id": 152280, "cbid": 131, "correlation": 152280 } }, { "ph": "f", "id": 152280, "pid": 76337, "tid": -914061504, "ts": 1716454223818612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454223818640, "dur": 3, "args": { "External id": 152304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152304, "pid": 5, "tid": 7, "ts": 1716454223818640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818631, "dur": 9, "args": { "External id": 152304, "cbid": 211, "correlation": 152304 } }, { "ph": "s", "id": 152304, "pid": 76337, "tid": -914061504, "ts": 1716454223818631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818641, "dur": 0, "args": { "External id": 152305, "cbid": 11, "correlation": 152305 } }, { "ph": "f", "id": 152305, "pid": 76337, "tid": -914061504, "ts": 1716454223818641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818642, "dur": 0, "args": { "External id": 152306, "cbid": 11, "correlation": 152306 } }, { "ph": "f", "id": 152306, "pid": 76337, "tid": -914061504, "ts": 1716454223818642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223818643, "dur": 1, "args": { "External id": 152308, "cbid": 200, "correlation": 152308 } }, { "ph": "f", "id": 152308, "pid": 76337, "tid": -914061504, "ts": 1716454223818643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454223818654, "dur": 4, "args": { "External id": 152310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152310, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152310, "pid": 5, "tid": 7, "ts": 1716454223818654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818646, "dur": 8, "args": { "External id": 152310, "cbid": 211, "correlation": 152310 } }, { "ph": "s", "id": 152310, "pid": 76337, "tid": -914061504, "ts": 1716454223818646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818655, "dur": 0, "args": { "External id": 152311, "cbid": 11, "correlation": 152311 } }, { "ph": "f", "id": 152311, "pid": 76337, "tid": -914061504, "ts": 1716454223818655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454223818655, "dur": 0, "args": { "External id": 152312, "cbid": 11, "correlation": 152312 } }, { "ph": "f", "id": 152312, "pid": 76337, "tid": -914061504, "ts": 1716454223818655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454223818692, "dur": 1, "args": { "External id": 152319, "device": 5, "context": 1, "stream": 7, "correlation": 152319, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 152319, "pid": 5, "tid": 7, "ts": 1716454223818692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818681, "dur": 20, "args": { "External id": 152319, "cbid": 41, "correlation": 152319 } }, { "ph": "s", "id": 152319, "pid": 76337, "tid": -914061504, "ts": 1716454223818681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818701, "dur": 3, "args": { "External id": 152320, "cbid": 131, "correlation": 152320 } }, { "ph": "f", "id": 152320, "pid": 76337, "tid": -914061504, "ts": 1716454223818701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454223818752, "dur": 1, "args": { "External id": 152330, "device": 5, "context": 1, "stream": 7, "correlation": 152330, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 152330, "pid": 5, "tid": 7, "ts": 1716454223818752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223818739, "dur": 11, "args": { "External id": 152330, "cbid": 41, "correlation": 152330 } }, { "ph": "s", "id": 152330, "pid": 76337, "tid": -914061504, "ts": 1716454223818739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223818750, "dur": 8, "args": { "External id": 152331, "cbid": 131, "correlation": 152331 } }, { "ph": "f", "id": 152331, "pid": 76337, "tid": -914061504, "ts": 1716454223818750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223818821, "dur": 5, "args": { "External id": 152338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152338, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152338, "pid": 5, "tid": 7, "ts": 1716454223818821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818806, "dur": 15, "args": { "External id": 152338, "cbid": 211, "correlation": 152338 } }, { "ph": "s", "id": 152338, "pid": 76337, "tid": -914061504, "ts": 1716454223818806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818894, "dur": 3, "args": { "External id": 152347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152347, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152347, "pid": 5, "tid": 7, "ts": 1716454223818894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818880, "dur": 13, "args": { "External id": 152347, "cbid": 211, "correlation": 152347 } }, { "ph": "s", "id": 152347, "pid": 76337, "tid": -914061504, "ts": 1716454223818880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818931, "dur": 3, "args": { "External id": 152355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152355, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152355, "pid": 5, "tid": 7, "ts": 1716454223818931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818921, "dur": 10, "args": { "External id": 152355, "cbid": 211, "correlation": 152355 } }, { "ph": "s", "id": 152355, "pid": 76337, "tid": -914061504, "ts": 1716454223818921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223818963, "dur": 4, "args": { "External id": 152363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152363, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152363, "pid": 5, "tid": 7, "ts": 1716454223818963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818953, "dur": 11, "args": { "External id": 152363, "cbid": 211, "correlation": 152363 } }, { "ph": "s", "id": 152363, "pid": 76337, "tid": -914061504, "ts": 1716454223818953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819002, "dur": 4, "args": { "External id": 152371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152371, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152371, "pid": 5, "tid": 7, "ts": 1716454223819002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223818991, "dur": 11, "args": { "External id": 152371, "cbid": 211, "correlation": 152371 } }, { "ph": "s", "id": 152371, "pid": 76337, "tid": -914061504, "ts": 1716454223818991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819030, "dur": 3, "args": { "External id": 152379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152379, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152379, "pid": 5, "tid": 7, "ts": 1716454223819030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819020, "dur": 9, "args": { "External id": 152379, "cbid": 211, "correlation": 152379 } }, { "ph": "s", "id": 152379, "pid": 76337, "tid": -914061504, "ts": 1716454223819020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819056, "dur": 3, "args": { "External id": 152387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152387, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152387, "pid": 5, "tid": 7, "ts": 1716454223819056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819047, "dur": 8, "args": { "External id": 152387, "cbid": 211, "correlation": 152387 } }, { "ph": "s", "id": 152387, "pid": 76337, "tid": -914061504, "ts": 1716454223819047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223819077, "dur": 4, "args": { "External id": 152395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152395, "pid": 5, "tid": 7, "ts": 1716454223819077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819069, "dur": 7, "args": { "External id": 152395, "cbid": 211, "correlation": 152395 } }, { "ph": "s", "id": 152395, "pid": 76337, "tid": -914061504, "ts": 1716454223819069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223819097, "dur": 4, "args": { "External id": 152403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152403, "pid": 5, "tid": 7, "ts": 1716454223819097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819089, "dur": 6, "args": { "External id": 152403, "cbid": 211, "correlation": 152403 } }, { "ph": "s", "id": 152403, "pid": 76337, "tid": -914061504, "ts": 1716454223819089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819116, "dur": 3, "args": { "External id": 152411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152411, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152411, "pid": 5, "tid": 7, "ts": 1716454223819116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819108, "dur": 7, "args": { "External id": 152411, "cbid": 211, "correlation": 152411 } }, { "ph": "s", "id": 152411, "pid": 76337, "tid": -914061504, "ts": 1716454223819108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819172, "dur": 3, "args": { "External id": 152419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152419, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152419, "pid": 5, "tid": 7, "ts": 1716454223819172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819162, "dur": 10, "args": { "External id": 152419, "cbid": 211, "correlation": 152419 } }, { "ph": "s", "id": 152419, "pid": 76337, "tid": -914061504, "ts": 1716454223819162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223819199, "dur": 4, "args": { "External id": 152427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152427, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152427, "pid": 5, "tid": 7, "ts": 1716454223819199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819190, "dur": 8, "args": { "External id": 152427, "cbid": 211, "correlation": 152427 } }, { "ph": "s", "id": 152427, "pid": 76337, "tid": -914061504, "ts": 1716454223819190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223819221, "dur": 4, "args": { "External id": 152435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152435, "pid": 5, "tid": 7, "ts": 1716454223819221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819213, "dur": 7, "args": { "External id": 152435, "cbid": 211, "correlation": 152435 } }, { "ph": "s", "id": 152435, "pid": 76337, "tid": -914061504, "ts": 1716454223819213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819241, "dur": 3, "args": { "External id": 152443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152443, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 152443, "pid": 5, "tid": 7, "ts": 1716454223819241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819233, "dur": 7, "args": { "External id": 152443, "cbid": 211, "correlation": 152443 } }, { "ph": "s", "id": 152443, "pid": 76337, "tid": -914061504, "ts": 1716454223819233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223819653, "dur": 5, "args": { "External id": 152452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152452, "pid": 5, "tid": 7, "ts": 1716454223819653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819637, "dur": 17, "args": { "External id": 152452, "cbid": 211, "correlation": 152452 } }, { "ph": "s", "id": 152452, "pid": 76337, "tid": -914061504, "ts": 1716454223819637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223819689, "dur": 5, "args": { "External id": 152461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152461, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152461, "pid": 5, "tid": 7, "ts": 1716454223819689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819678, "dur": 10, "args": { "External id": 152461, "cbid": 211, "correlation": 152461 } }, { "ph": "s", "id": 152461, "pid": 76337, "tid": -914061504, "ts": 1716454223819678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454223819818, "dur": 3, "args": { "External id": 152477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152477, "pid": 5, "tid": 7, "ts": 1716454223819818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819804, "dur": 15, "args": { "External id": 152477, "cbid": 211, "correlation": 152477 } }, { "ph": "s", "id": 152477, "pid": 76337, "tid": -914061504, "ts": 1716454223819804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819852, "dur": 3, "args": { "External id": 152485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152485, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152485, "pid": 5, "tid": 7, "ts": 1716454223819852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819842, "dur": 8, "args": { "External id": 152485, "cbid": 211, "correlation": 152485 } }, { "ph": "s", "id": 152485, "pid": 76337, "tid": -914061504, "ts": 1716454223819842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819883, "dur": 3, "args": { "External id": 152493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152493, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152493, "pid": 5, "tid": 7, "ts": 1716454223819883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819872, "dur": 10, "args": { "External id": 152493, "cbid": 211, "correlation": 152493 } }, { "ph": "s", "id": 152493, "pid": 76337, "tid": -914061504, "ts": 1716454223819872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223819913, "dur": 4, "args": { "External id": 152501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152501, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152501, "pid": 5, "tid": 7, "ts": 1716454223819913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819904, "dur": 8, "args": { "External id": 152501, "cbid": 211, "correlation": 152501 } }, { "ph": "s", "id": 152501, "pid": 76337, "tid": -914061504, "ts": 1716454223819904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454223819967, "dur": 4, "args": { "External id": 152513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152513, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152513, "pid": 5, "tid": 7, "ts": 1716454223819967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223819955, "dur": 13, "args": { "External id": 152513, "cbid": 211, "correlation": 152513 } }, { "ph": "s", "id": 152513, "pid": 76337, "tid": -914061504, "ts": 1716454223819955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223820023, "dur": 4, "args": { "External id": 152524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152524, "pid": 5, "tid": 7, "ts": 1716454223820023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820011, "dur": 12, "args": { "External id": 152524, "cbid": 211, "correlation": 152524 } }, { "ph": "s", "id": 152524, "pid": 76337, "tid": -914061504, "ts": 1716454223820011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223820054, "dur": 3, "args": { "External id": 152532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152532, "pid": 5, "tid": 7, "ts": 1716454223820054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820045, "dur": 8, "args": { "External id": 152532, "cbid": 211, "correlation": 152532 } }, { "ph": "s", "id": 152532, "pid": 76337, "tid": -914061504, "ts": 1716454223820045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223820088, "dur": 5, "args": { "External id": 152540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152540, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152540, "pid": 5, "tid": 7, "ts": 1716454223820088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820078, "dur": 9, "args": { "External id": 152540, "cbid": 211, "correlation": 152540 } }, { "ph": "s", "id": 152540, "pid": 76337, "tid": -914061504, "ts": 1716454223820078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223820117, "dur": 5, "args": { "External id": 152548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152548, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152548, "pid": 5, "tid": 7, "ts": 1716454223820117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820107, "dur": 9, "args": { "External id": 152548, "cbid": 211, "correlation": 152548 } }, { "ph": "s", "id": 152548, "pid": 76337, "tid": -914061504, "ts": 1716454223820107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223820147, "dur": 4, "args": { "External id": 152557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152557, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152557, "pid": 5, "tid": 7, "ts": 1716454223820147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820137, "dur": 9, "args": { "External id": 152557, "cbid": 211, "correlation": 152557 } }, { "ph": "s", "id": 152557, "pid": 76337, "tid": -914061504, "ts": 1716454223820137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223820207, "dur": 5, "args": { "External id": 152570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152570, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152570, "pid": 5, "tid": 7, "ts": 1716454223820207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820194, "dur": 14, "args": { "External id": 152570, "cbid": 211, "correlation": 152570 } }, { "ph": "s", "id": 152570, "pid": 76337, "tid": -914061504, "ts": 1716454223820194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454223820248, "dur": 5, "args": { "External id": 152580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152580, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 152580, "pid": 5, "tid": 7, "ts": 1716454223820248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820237, "dur": 10, "args": { "External id": 152580, "cbid": 211, "correlation": 152580 } }, { "ph": "s", "id": 152580, "pid": 76337, "tid": -914061504, "ts": 1716454223820237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223820376, "dur": 4, "args": { "External id": 152597, "cbid": 251, "correlation": 152597 } }, { "ph": "f", "id": 152597, "pid": 76337, "tid": -914061504, "ts": 1716454223820376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454223820406, "dur": 12, "args": { "External id": 152599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152599, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152599, "pid": 5, "tid": 7, "ts": 1716454223820406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820389, "dur": 17, "args": { "External id": 152599, "cbid": 211, "correlation": 152599 } }, { "ph": "s", "id": 152599, "pid": 76337, "tid": -914061504, "ts": 1716454223820389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223820467, "dur": 3, "args": { "External id": 152607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152607, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152607, "pid": 5, "tid": 7, "ts": 1716454223820467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820455, "dur": 12, "args": { "External id": 152607, "cbid": 211, "correlation": 152607 } }, { "ph": "s", "id": 152607, "pid": 76337, "tid": -914061504, "ts": 1716454223820455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223820523, "dur": 2, "args": { "External id": 152623, "cbid": 251, "correlation": 152623 } }, { "ph": "f", "id": 152623, "pid": 76337, "tid": -914061504, "ts": 1716454223820523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223820529, "dur": 0, "args": { "External id": 152625, "cbid": 251, "correlation": 152625 } }, { "ph": "f", "id": 152625, "pid": 76337, "tid": -914061504, "ts": 1716454223820529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223820546, "dur": 14, "args": { "External id": 152626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152626, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152626, "pid": 5, "tid": 7, "ts": 1716454223820546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820532, "dur": 14, "args": { "External id": 152626, "cbid": 211, "correlation": 152626 } }, { "ph": "s", "id": 152626, "pid": 76337, "tid": -914061504, "ts": 1716454223820532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223820561, "dur": 5, "args": { "External id": 152628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152628, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152628, "pid": 5, "tid": 7, "ts": 1716454223820561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820550, "dur": 9, "args": { "External id": 152628, "cbid": 211, "correlation": 152628 } }, { "ph": "s", "id": 152628, "pid": 76337, "tid": -914061504, "ts": 1716454223820550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223820659, "dur": 1, "args": { "External id": 152638, "cbid": 317, "correlation": 152638 } }, { "ph": "f", "id": 152638, "pid": 76337, "tid": -914061504, "ts": 1716454223820659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223820661, "dur": 1, "args": { "External id": 152639, "cbid": 203, "correlation": 152639 } }, { "ph": "f", "id": 152639, "pid": 76337, "tid": -914061504, "ts": 1716454223820661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223820663, "dur": 1, "args": { "External id": 152640, "cbid": 205, "correlation": 152640 } }, { "ph": "f", "id": 152640, "pid": 76337, "tid": -914061504, "ts": 1716454223820663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223820717, "dur": 7, "args": { "External id": 152644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152644, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152644, "pid": 5, "tid": 7, "ts": 1716454223820717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820703, "dur": 14, "args": { "External id": 152644, "cbid": 211, "correlation": 152644 } }, { "ph": "s", "id": 152644, "pid": 76337, "tid": -914061504, "ts": 1716454223820703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223820728, "dur": 4, "args": { "External id": 152646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152646, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 152646, "pid": 5, "tid": 7, "ts": 1716454223820728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820720, "dur": 6, "args": { "External id": 152646, "cbid": 211, "correlation": 152646 } }, { "ph": "s", "id": 152646, "pid": 76337, "tid": -914061504, "ts": 1716454223820720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223820746, "dur": 3, "args": { "External id": 152648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152648, "pid": 5, "tid": 7, "ts": 1716454223820746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820737, "dur": 8, "args": { "External id": 152648, "cbid": 211, "correlation": 152648 } }, { "ph": "s", "id": 152648, "pid": 76337, "tid": -914061504, "ts": 1716454223820737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223820752, "dur": 0, "args": { "External id": 152649, "cbid": 51, "correlation": 152649 } }, { "ph": "s", "id": 152649, "pid": 76337, "tid": -914061504, "ts": 1716454223820752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223820761, "dur": 85, "args": { "External id": 152650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152650, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152650, "pid": 5, "tid": 7, "ts": 1716454223820761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820753, "dur": 7, "args": { "External id": 152650, "cbid": 211, "correlation": 152650 } }, { "ph": "s", "id": 152650, "pid": 76337, "tid": -914061504, "ts": 1716454223820753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223820848, "dur": 60, "args": { "External id": 152655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152655, "pid": 5, "tid": 7, "ts": 1716454223820848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223820790, "dur": 10, "args": { "External id": 152655, "cbid": 211, "correlation": 152655 } }, { "ph": "s", "id": 152655, "pid": 76337, "tid": -914061504, "ts": 1716454223820790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223822614, "dur": 53, "args": { "External id": 152675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152675, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 152675, "pid": 5, "tid": 7, "ts": 1716454223822614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822598, "dur": 16, "args": { "External id": 152675, "cbid": 211, "correlation": 152675 } }, { "ph": "s", "id": 152675, "pid": 76337, "tid": -914061504, "ts": 1716454223822598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223822669, "dur": 5, "args": { "External id": 152687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152687, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152687, "pid": 5, "tid": 7, "ts": 1716454223822669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822628, "dur": 8, "args": { "External id": 152687, "cbid": 211, "correlation": 152687 } }, { "ph": "s", "id": 152687, "pid": 76337, "tid": -914061504, "ts": 1716454223822628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223822675, "dur": 57, "args": { "External id": 152690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152690, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152690, "pid": 5, "tid": 7, "ts": 1716454223822675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822652, "dur": 7, "args": { "External id": 152690, "cbid": 211, "correlation": 152690 } }, { "ph": "s", "id": 152690, "pid": 76337, "tid": -914061504, "ts": 1716454223822652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223822734, "dur": 36, "args": { "External id": 152699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152699, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152699, "pid": 5, "tid": 7, "ts": 1716454223822734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822696, "dur": 10, "args": { "External id": 152699, "cbid": 211, "correlation": 152699 } }, { "ph": "s", "id": 152699, "pid": 76337, "tid": -914061504, "ts": 1716454223822696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223822754, "dur": 0, "args": { "External id": 152709, "cbid": 317, "correlation": 152709 } }, { "ph": "f", "id": 152709, "pid": 76337, "tid": -914061504, "ts": 1716454223822754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223822755, "dur": 0, "args": { "External id": 152710, "cbid": 203, "correlation": 152710 } }, { "ph": "f", "id": 152710, "pid": 76337, "tid": -914061504, "ts": 1716454223822755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223822756, "dur": 0, "args": { "External id": 152711, "cbid": 205, "correlation": 152711 } }, { "ph": "f", "id": 152711, "pid": 76337, "tid": -914061504, "ts": 1716454223822756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223822788, "dur": 39, "args": { "External id": 152715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152715, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152715, "pid": 5, "tid": 7, "ts": 1716454223822788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822774, "dur": 13, "args": { "External id": 152715, "cbid": 211, "correlation": 152715 } }, { "ph": "s", "id": 152715, "pid": 76337, "tid": -914061504, "ts": 1716454223822774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223822828, "dur": 14, "args": { "External id": 152717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152717, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152717, "pid": 5, "tid": 7, "ts": 1716454223822828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822790, "dur": 6, "args": { "External id": 152717, "cbid": 211, "correlation": 152717 } }, { "ph": "s", "id": 152717, "pid": 76337, "tid": -914061504, "ts": 1716454223822790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223822844, "dur": 3, "args": { "External id": 152719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152719, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152719, "pid": 5, "tid": 7, "ts": 1716454223822844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822801, "dur": 6, "args": { "External id": 152719, "cbid": 211, "correlation": 152719 } }, { "ph": "s", "id": 152719, "pid": 76337, "tid": -914061504, "ts": 1716454223822801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223822811, "dur": 0, "args": { "External id": 152720, "cbid": 51, "correlation": 152720 } }, { "ph": "s", "id": 152720, "pid": 76337, "tid": -914061504, "ts": 1716454223822811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223822848, "dur": 699, "args": { "External id": 152721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152721, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152721, "pid": 5, "tid": 7, "ts": 1716454223822848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822812, "dur": 6, "args": { "External id": 152721, "cbid": 211, "correlation": 152721 } }, { "ph": "s", "id": 152721, "pid": 76337, "tid": -914061504, "ts": 1716454223822812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223823548, "dur": 58, "args": { "External id": 152726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152726, "pid": 5, "tid": 7, "ts": 1716454223823548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822843, "dur": 8, "args": { "External id": 152726, "cbid": 211, "correlation": 152726 } }, { "ph": "s", "id": 152726, "pid": 76337, "tid": -914061504, "ts": 1716454223822843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223823608, "dur": 4, "args": { "External id": 152734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152734, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152734, "pid": 5, "tid": 7, "ts": 1716454223823608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822886, "dur": 9, "args": { "External id": 152734, "cbid": 211, "correlation": 152734 } }, { "ph": "s", "id": 152734, "pid": 76337, "tid": -914061504, "ts": 1716454223822886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223822952, "dur": 1, "args": { "External id": 152750, "cbid": 251, "correlation": 152750 } }, { "ph": "f", "id": 152750, "pid": 76337, "tid": -914061504, "ts": 1716454223822952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223822957, "dur": 0, "args": { "External id": 152752, "cbid": 251, "correlation": 152752 } }, { "ph": "f", "id": 152752, "pid": 76337, "tid": -914061504, "ts": 1716454223822957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223823613, "dur": 9, "args": { "External id": 152753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152753, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 152753, "pid": 5, "tid": 7, "ts": 1716454223823613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822960, "dur": 11, "args": { "External id": 152753, "cbid": 211, "correlation": 152753 } }, { "ph": "s", "id": 152753, "pid": 76337, "tid": -914061504, "ts": 1716454223822960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223823624, "dur": 4, "args": { "External id": 152755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152755, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 152755, "pid": 5, "tid": 7, "ts": 1716454223823624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223822981, "dur": 6, "args": { "External id": 152755, "cbid": 211, "correlation": 152755 } }, { "ph": "s", "id": 152755, "pid": 76337, "tid": -914061504, "ts": 1716454223822981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223823629, "dur": 55, "args": { "External id": 152765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152765, "pid": 5, "tid": 7, "ts": 1716454223823629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823042, "dur": 12, "args": { "External id": 152765, "cbid": 211, "correlation": 152765 } }, { "ph": "s", "id": 152765, "pid": 76337, "tid": -914061504, "ts": 1716454223823042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223823685, "dur": 51, "args": { "External id": 152785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152785, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 152785, "pid": 5, "tid": 7, "ts": 1716454223823685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823109, "dur": 11, "args": { "External id": 152785, "cbid": 211, "correlation": 152785 } }, { "ph": "s", "id": 152785, "pid": 76337, "tid": -914061504, "ts": 1716454223823109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223823738, "dur": 4, "args": { "External id": 152797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152797, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152797, "pid": 5, "tid": 7, "ts": 1716454223823738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823132, "dur": 6, "args": { "External id": 152797, "cbid": 211, "correlation": 152797 } }, { "ph": "s", "id": 152797, "pid": 76337, "tid": -914061504, "ts": 1716454223823132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223823742, "dur": 55, "args": { "External id": 152800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152800, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152800, "pid": 5, "tid": 7, "ts": 1716454223823742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823151, "dur": 6, "args": { "External id": 152800, "cbid": 211, "correlation": 152800 } }, { "ph": "s", "id": 152800, "pid": 76337, "tid": -914061504, "ts": 1716454223823151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223823799, "dur": 37, "args": { "External id": 152809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152809, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152809, "pid": 5, "tid": 7, "ts": 1716454223823799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823192, "dur": 9, "args": { "External id": 152809, "cbid": 211, "correlation": 152809 } }, { "ph": "s", "id": 152809, "pid": 76337, "tid": -914061504, "ts": 1716454223823192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223823261, "dur": 0, "args": { "External id": 152819, "cbid": 317, "correlation": 152819 } }, { "ph": "f", "id": 152819, "pid": 76337, "tid": -914061504, "ts": 1716454223823261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223823262, "dur": 0, "args": { "External id": 152820, "cbid": 203, "correlation": 152820 } }, { "ph": "f", "id": 152820, "pid": 76337, "tid": -914061504, "ts": 1716454223823262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223823263, "dur": 0, "args": { "External id": 152821, "cbid": 205, "correlation": 152821 } }, { "ph": "f", "id": 152821, "pid": 76337, "tid": -914061504, "ts": 1716454223823263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223823837, "dur": 40, "args": { "External id": 152825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152825, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152825, "pid": 5, "tid": 7, "ts": 1716454223823837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823279, "dur": 13, "args": { "External id": 152825, "cbid": 211, "correlation": 152825 } }, { "ph": "s", "id": 152825, "pid": 76337, "tid": -914061504, "ts": 1716454223823279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223823879, "dur": 14, "args": { "External id": 152827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152827, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152827, "pid": 5, "tid": 7, "ts": 1716454223823879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823294, "dur": 5, "args": { "External id": 152827, "cbid": 211, "correlation": 152827 } }, { "ph": "s", "id": 152827, "pid": 76337, "tid": -914061504, "ts": 1716454223823294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223823894, "dur": 3, "args": { "External id": 152829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152829, "pid": 5, "tid": 7, "ts": 1716454223823894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823303, "dur": 6, "args": { "External id": 152829, "cbid": 211, "correlation": 152829 } }, { "ph": "s", "id": 152829, "pid": 76337, "tid": -914061504, "ts": 1716454223823303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223823312, "dur": 0, "args": { "External id": 152830, "cbid": 51, "correlation": 152830 } }, { "ph": "s", "id": 152830, "pid": 76337, "tid": -914061504, "ts": 1716454223823312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223823898, "dur": 694, "args": { "External id": 152831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152831, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152831, "pid": 5, "tid": 7, "ts": 1716454223823898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823312, "dur": 5, "args": { "External id": 152831, "cbid": 211, "correlation": 152831 } }, { "ph": "s", "id": 152831, "pid": 76337, "tid": -914061504, "ts": 1716454223823312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223824594, "dur": 59, "args": { "External id": 152836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152836, "pid": 5, "tid": 7, "ts": 1716454223824594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823341, "dur": 9, "args": { "External id": 152836, "cbid": 211, "correlation": 152836 } }, { "ph": "s", "id": 152836, "pid": 76337, "tid": -914061504, "ts": 1716454223823341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223824654, "dur": 50, "args": { "External id": 152844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152844, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152844, "pid": 5, "tid": 7, "ts": 1716454223824654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823373, "dur": 8, "args": { "External id": 152844, "cbid": 211, "correlation": 152844 } }, { "ph": "s", "id": 152844, "pid": 76337, "tid": -914061504, "ts": 1716454223823373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223824705, "dur": 35, "args": { "External id": 152852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152852, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152852, "pid": 5, "tid": 7, "ts": 1716454223824705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823403, "dur": 9, "args": { "External id": 152852, "cbid": 211, "correlation": 152852 } }, { "ph": "s", "id": 152852, "pid": 76337, "tid": -914061504, "ts": 1716454223823403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223824742, "dur": 50, "args": { "External id": 152872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152872, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 152872, "pid": 5, "tid": 7, "ts": 1716454223824742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823488, "dur": 12, "args": { "External id": 152872, "cbid": 211, "correlation": 152872 } }, { "ph": "s", "id": 152872, "pid": 76337, "tid": -914061504, "ts": 1716454223823488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223824793, "dur": 4, "args": { "External id": 152884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152884, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 152884, "pid": 5, "tid": 7, "ts": 1716454223824793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823510, "dur": 6, "args": { "External id": 152884, "cbid": 211, "correlation": 152884 } }, { "ph": "s", "id": 152884, "pid": 76337, "tid": -914061504, "ts": 1716454223823510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223824798, "dur": 56, "args": { "External id": 152887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152887, "pid": 5, "tid": 7, "ts": 1716454223824798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823528, "dur": 7, "args": { "External id": 152887, "cbid": 211, "correlation": 152887 } }, { "ph": "s", "id": 152887, "pid": 76337, "tid": -914061504, "ts": 1716454223823528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223823586, "dur": 0, "args": { "External id": 152898, "cbid": 317, "correlation": 152898 } }, { "ph": "f", "id": 152898, "pid": 76337, "tid": -914061504, "ts": 1716454223823586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223823587, "dur": 0, "args": { "External id": 152899, "cbid": 203, "correlation": 152899 } }, { "ph": "f", "id": 152899, "pid": 76337, "tid": -914061504, "ts": 1716454223823587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223823588, "dur": 0, "args": { "External id": 152900, "cbid": 205, "correlation": 152900 } }, { "ph": "f", "id": 152900, "pid": 76337, "tid": -914061504, "ts": 1716454223823588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823622, "dur": 3, "args": { "External id": 152904, "cbid": 251, "correlation": 152904 } }, { "ph": "f", "id": 152904, "pid": 76337, "tid": -914061504, "ts": 1716454223823622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823626, "dur": 1, "args": { "External id": 152905, "cbid": 251, "correlation": 152905 } }, { "ph": "f", "id": 152905, "pid": 76337, "tid": -914061504, "ts": 1716454223823626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823628, "dur": 1, "args": { "External id": 152906, "cbid": 251, "correlation": 152906 } }, { "ph": "f", "id": 152906, "pid": 76337, "tid": -914061504, "ts": 1716454223823628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823630, "dur": 1, "args": { "External id": 152907, "cbid": 251, "correlation": 152907 } }, { "ph": "f", "id": 152907, "pid": 76337, "tid": -914061504, "ts": 1716454223823630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823632, "dur": 1, "args": { "External id": 152908, "cbid": 251, "correlation": 152908 } }, { "ph": "f", "id": 152908, "pid": 76337, "tid": -914061504, "ts": 1716454223823632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823635, "dur": 1, "args": { "External id": 152909, "cbid": 251, "correlation": 152909 } }, { "ph": "f", "id": 152909, "pid": 76337, "tid": -914061504, "ts": 1716454223823635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823637, "dur": 1, "args": { "External id": 152910, "cbid": 251, "correlation": 152910 } }, { "ph": "f", "id": 152910, "pid": 76337, "tid": -914061504, "ts": 1716454223823637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823639, "dur": 1, "args": { "External id": 152911, "cbid": 251, "correlation": 152911 } }, { "ph": "f", "id": 152911, "pid": 76337, "tid": -914061504, "ts": 1716454223823639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823641, "dur": 0, "args": { "External id": 152912, "cbid": 251, "correlation": 152912 } }, { "ph": "f", "id": 152912, "pid": 76337, "tid": -914061504, "ts": 1716454223823641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223824856, "dur": 115, "args": { "External id": 152913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152913, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 152913, "pid": 5, "tid": 7, "ts": 1716454223824856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823645, "dur": 14, "args": { "External id": 152913, "cbid": 211, "correlation": 152913 } }, { "ph": "s", "id": 152913, "pid": 76337, "tid": -914061504, "ts": 1716454223823645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223824972, "dur": 60, "args": { "External id": 152919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152919, "pid": 5, "tid": 7, "ts": 1716454223824972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823682, "dur": 9, "args": { "External id": 152919, "cbid": 211, "correlation": 152919 } }, { "ph": "s", "id": 152919, "pid": 76337, "tid": -914061504, "ts": 1716454223823682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223825033, "dur": 654, "args": { "External id": 152928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152928, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152928, "pid": 5, "tid": 7, "ts": 1716454223825033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823777, "dur": 15, "args": { "External id": 152928, "cbid": 211, "correlation": 152928 } }, { "ph": "s", "id": 152928, "pid": 76337, "tid": -914061504, "ts": 1716454223823777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223825689, "dur": 180, "args": { "External id": 152950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152950, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 152950, "pid": 5, "tid": 7, "ts": 1716454223825689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823846, "dur": 12, "args": { "External id": 152950, "cbid": 211, "correlation": 152950 } }, { "ph": "s", "id": 152950, "pid": 76337, "tid": -914061504, "ts": 1716454223823846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223823960, "dur": 2, "args": { "External id": 152961, "cbid": 251, "correlation": 152961 } }, { "ph": "f", "id": 152961, "pid": 76337, "tid": -914061504, "ts": 1716454223823960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223825870, "dur": 194, "args": { "External id": 152962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152962, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152962, "pid": 5, "tid": 7, "ts": 1716454223825870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223823967, "dur": 23, "args": { "External id": 152962, "cbid": 211, "correlation": 152962 } }, { "ph": "s", "id": 152962, "pid": 76337, "tid": -914061504, "ts": 1716454223823967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824051, "dur": 1, "args": { "External id": 152973, "cbid": 251, "correlation": 152973 } }, { "ph": "f", "id": 152973, "pid": 76337, "tid": -914061504, "ts": 1716454223824051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223826066, "dur": 188, "args": { "External id": 152974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152974, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152974, "pid": 5, "tid": 7, "ts": 1716454223826066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824055, "dur": 13, "args": { "External id": 152974, "cbid": 211, "correlation": 152974 } }, { "ph": "s", "id": 152974, "pid": 76337, "tid": -914061504, "ts": 1716454223824055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824121, "dur": 1, "args": { "External id": 152985, "cbid": 251, "correlation": 152985 } }, { "ph": "f", "id": 152985, "pid": 76337, "tid": -914061504, "ts": 1716454223824121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223826255, "dur": 191, "args": { "External id": 152986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 152986, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 152986, "pid": 5, "tid": 7, "ts": 1716454223826255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824124, "dur": 11, "args": { "External id": 152986, "cbid": 211, "correlation": 152986 } }, { "ph": "s", "id": 152986, "pid": 76337, "tid": -914061504, "ts": 1716454223824124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223826447, "dur": 18561, "args": { "External id": 153007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153007, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153007, "pid": 5, "tid": 7, "ts": 1716454223826447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824232, "dur": 14, "args": { "External id": 153007, "cbid": 211, "correlation": 153007 } }, { "ph": "s", "id": 153007, "pid": 76337, "tid": -914061504, "ts": 1716454223824232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824343, "dur": 2, "args": { "External id": 153025, "cbid": 251, "correlation": 153025 } }, { "ph": "f", "id": 153025, "pid": 76337, "tid": -914061504, "ts": 1716454223824343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223845009, "dur": 203, "args": { "External id": 153027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153027, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153027, "pid": 5, "tid": 7, "ts": 1716454223845009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824349, "dur": 14, "args": { "External id": 153027, "cbid": 211, "correlation": 153027 } }, { "ph": "s", "id": 153027, "pid": 76337, "tid": -914061504, "ts": 1716454223824349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223845213, "dur": 66, "args": { "External id": 153035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153035, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153035, "pid": 5, "tid": 7, "ts": 1716454223845213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824422, "dur": 12, "args": { "External id": 153035, "cbid": 211, "correlation": 153035 } }, { "ph": "s", "id": 153035, "pid": 76337, "tid": -914061504, "ts": 1716454223824422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223845281, "dur": 97, "args": { "External id": 153043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153043, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153043, "pid": 5, "tid": 7, "ts": 1716454223845281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824463, "dur": 8, "args": { "External id": 153043, "cbid": 211, "correlation": 153043 } }, { "ph": "s", "id": 153043, "pid": 76337, "tid": -914061504, "ts": 1716454223824463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223845379, "dur": 55, "args": { "External id": 153054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153054, "pid": 5, "tid": 7, "ts": 1716454223845379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824548, "dur": 16, "args": { "External id": 153054, "cbid": 211, "correlation": 153054 } }, { "ph": "s", "id": 153054, "pid": 76337, "tid": -914061504, "ts": 1716454223824548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223845435, "dur": 91, "args": { "External id": 153076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153076, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153076, "pid": 5, "tid": 7, "ts": 1716454223845435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824583, "dur": 7, "args": { "External id": 153076, "cbid": 211, "correlation": 153076 } }, { "ph": "s", "id": 153076, "pid": 76337, "tid": -914061504, "ts": 1716454223824583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824666, "dur": 1, "args": { "External id": 153087, "cbid": 251, "correlation": 153087 } }, { "ph": "f", "id": 153087, "pid": 76337, "tid": -914061504, "ts": 1716454223824666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223845528, "dur": 103, "args": { "External id": 153088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153088, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153088, "pid": 5, "tid": 7, "ts": 1716454223845528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824671, "dur": 13, "args": { "External id": 153088, "cbid": 211, "correlation": 153088 } }, { "ph": "s", "id": 153088, "pid": 76337, "tid": -914061504, "ts": 1716454223824671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824749, "dur": 2, "args": { "External id": 153099, "cbid": 251, "correlation": 153099 } }, { "ph": "f", "id": 153099, "pid": 76337, "tid": -914061504, "ts": 1716454223824749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824754, "dur": 0, "args": { "External id": 153100, "cbid": 251, "correlation": 153100 } }, { "ph": "f", "id": 153100, "pid": 76337, "tid": -914061504, "ts": 1716454223824754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223845633, "dur": 10, "args": { "External id": 153101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153101, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 153101, "pid": 5, "tid": 7, "ts": 1716454223845633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824756, "dur": 13, "args": { "External id": 153101, "cbid": 211, "correlation": 153101 } }, { "ph": "s", "id": 153101, "pid": 76337, "tid": -914061504, "ts": 1716454223824756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223845644, "dur": 5, "args": { "External id": 153103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153103, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 153103, "pid": 5, "tid": 7, "ts": 1716454223845644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824773, "dur": 7, "args": { "External id": 153103, "cbid": 211, "correlation": 153103 } }, { "ph": "s", "id": 153103, "pid": 76337, "tid": -914061504, "ts": 1716454223824773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824834, "dur": 1, "args": { "External id": 153114, "cbid": 251, "correlation": 153114 } }, { "ph": "f", "id": 153114, "pid": 76337, "tid": -914061504, "ts": 1716454223824834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223824838, "dur": 0, "args": { "External id": 153115, "cbid": 251, "correlation": 153115 } }, { "ph": "f", "id": 153115, "pid": 76337, "tid": -914061504, "ts": 1716454223824838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223845650, "dur": 6, "args": { "External id": 153116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153116, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 153116, "pid": 5, "tid": 7, "ts": 1716454223845650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824839, "dur": 12, "args": { "External id": 153116, "cbid": 211, "correlation": 153116 } }, { "ph": "s", "id": 153116, "pid": 76337, "tid": -914061504, "ts": 1716454223824839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223845658, "dur": 3, "args": { "External id": 153118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153118, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 153118, "pid": 5, "tid": 7, "ts": 1716454223845658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824853, "dur": 7, "args": { "External id": 153118, "cbid": 211, "correlation": 153118 } }, { "ph": "s", "id": 153118, "pid": 76337, "tid": -914061504, "ts": 1716454223824853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223845662, "dur": 154, "args": { "External id": 153139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153139, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153139, "pid": 5, "tid": 7, "ts": 1716454223845662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223824929, "dur": 12, "args": { "External id": 153139, "cbid": 211, "correlation": 153139 } }, { "ph": "s", "id": 153139, "pid": 76337, "tid": -914061504, "ts": 1716454223824929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825036, "dur": 2, "args": { "External id": 153157, "cbid": 251, "correlation": 153157 } }, { "ph": "f", "id": 153157, "pid": 76337, "tid": -914061504, "ts": 1716454223825036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223845818, "dur": 105, "args": { "External id": 153159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153159, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153159, "pid": 5, "tid": 7, "ts": 1716454223845818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825043, "dur": 15, "args": { "External id": 153159, "cbid": 211, "correlation": 153159 } }, { "ph": "s", "id": 153159, "pid": 76337, "tid": -914061504, "ts": 1716454223825043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223845924, "dur": 35, "args": { "External id": 153167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153167, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153167, "pid": 5, "tid": 7, "ts": 1716454223845924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825115, "dur": 12, "args": { "External id": 153167, "cbid": 211, "correlation": 153167 } }, { "ph": "s", "id": 153167, "pid": 76337, "tid": -914061504, "ts": 1716454223825115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223845960, "dur": 68, "args": { "External id": 153175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153175, "pid": 5, "tid": 7, "ts": 1716454223845960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825155, "dur": 9, "args": { "External id": 153175, "cbid": 211, "correlation": 153175 } }, { "ph": "s", "id": 153175, "pid": 76337, "tid": -914061504, "ts": 1716454223825155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223846029, "dur": 92, "args": { "External id": 153197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153197, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153197, "pid": 5, "tid": 7, "ts": 1716454223846029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825206, "dur": 10, "args": { "External id": 153197, "cbid": 211, "correlation": 153197 } }, { "ph": "s", "id": 153197, "pid": 76337, "tid": -914061504, "ts": 1716454223825206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825297, "dur": 1, "args": { "External id": 153213, "cbid": 251, "correlation": 153213 } }, { "ph": "f", "id": 153213, "pid": 76337, "tid": -914061504, "ts": 1716454223825297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223846122, "dur": 582, "args": { "External id": 153215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153215, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153215, "pid": 5, "tid": 7, "ts": 1716454223846122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825302, "dur": 13, "args": { "External id": 153215, "cbid": 211, "correlation": 153215 } }, { "ph": "s", "id": 153215, "pid": 76337, "tid": -914061504, "ts": 1716454223825302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223846705, "dur": 244, "args": { "External id": 153223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153223, "pid": 5, "tid": 7, "ts": 1716454223846705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825380, "dur": 14, "args": { "External id": 153223, "cbid": 211, "correlation": 153223 } }, { "ph": "s", "id": 153223, "pid": 76337, "tid": -914061504, "ts": 1716454223825380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223846950, "dur": 251, "args": { "External id": 153231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153231, "pid": 5, "tid": 7, "ts": 1716454223846950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825415, "dur": 8, "args": { "External id": 153231, "cbid": 211, "correlation": 153231 } }, { "ph": "s", "id": 153231, "pid": 76337, "tid": -914061504, "ts": 1716454223825415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825497, "dur": 2, "args": { "External id": 153247, "cbid": 251, "correlation": 153247 } }, { "ph": "f", "id": 153247, "pid": 76337, "tid": -914061504, "ts": 1716454223825497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825503, "dur": 0, "args": { "External id": 153249, "cbid": 251, "correlation": 153249 } }, { "ph": "f", "id": 153249, "pid": 76337, "tid": -914061504, "ts": 1716454223825503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223847203, "dur": 360, "args": { "External id": 153250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153250, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 153250, "pid": 5, "tid": 7, "ts": 1716454223847203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825507, "dur": 13, "args": { "External id": 153250, "cbid": 211, "correlation": 153250 } }, { "ph": "s", "id": 153250, "pid": 76337, "tid": -914061504, "ts": 1716454223825507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223847565, "dur": 50, "args": { "External id": 153258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153258, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153258, "pid": 5, "tid": 7, "ts": 1716454223847565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825551, "dur": 10, "args": { "External id": 153258, "cbid": 211, "correlation": 153258 } }, { "ph": "s", "id": 153258, "pid": 76337, "tid": -914061504, "ts": 1716454223825551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223847616, "dur": 158, "args": { "External id": 153269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153269, "pid": 5, "tid": 7, "ts": 1716454223847616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825620, "dur": 12, "args": { "External id": 153269, "cbid": 211, "correlation": 153269 } }, { "ph": "s", "id": 153269, "pid": 76337, "tid": -914061504, "ts": 1716454223825620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223825684, "dur": 0, "args": { "External id": 153281, "cbid": 317, "correlation": 153281 } }, { "ph": "f", "id": 153281, "pid": 76337, "tid": -914061504, "ts": 1716454223825684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223825685, "dur": 0, "args": { "External id": 153282, "cbid": 203, "correlation": 153282 } }, { "ph": "f", "id": 153282, "pid": 76337, "tid": -914061504, "ts": 1716454223825685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223825686, "dur": 0, "args": { "External id": 153283, "cbid": 205, "correlation": 153283 } }, { "ph": "f", "id": 153283, "pid": 76337, "tid": -914061504, "ts": 1716454223825686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825710, "dur": 1, "args": { "External id": 153287, "cbid": 251, "correlation": 153287 } }, { "ph": "f", "id": 153287, "pid": 76337, "tid": -914061504, "ts": 1716454223825710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825712, "dur": 0, "args": { "External id": 153288, "cbid": 251, "correlation": 153288 } }, { "ph": "f", "id": 153288, "pid": 76337, "tid": -914061504, "ts": 1716454223825712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825712, "dur": 0, "args": { "External id": 153289, "cbid": 251, "correlation": 153289 } }, { "ph": "f", "id": 153289, "pid": 76337, "tid": -914061504, "ts": 1716454223825712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825713, "dur": 0, "args": { "External id": 153290, "cbid": 251, "correlation": 153290 } }, { "ph": "f", "id": 153290, "pid": 76337, "tid": -914061504, "ts": 1716454223825713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825714, "dur": 0, "args": { "External id": 153291, "cbid": 251, "correlation": 153291 } }, { "ph": "f", "id": 153291, "pid": 76337, "tid": -914061504, "ts": 1716454223825714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825715, "dur": 0, "args": { "External id": 153292, "cbid": 251, "correlation": 153292 } }, { "ph": "f", "id": 153292, "pid": 76337, "tid": -914061504, "ts": 1716454223825715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825716, "dur": 0, "args": { "External id": 153293, "cbid": 251, "correlation": 153293 } }, { "ph": "f", "id": 153293, "pid": 76337, "tid": -914061504, "ts": 1716454223825716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825717, "dur": 0, "args": { "External id": 153294, "cbid": 251, "correlation": 153294 } }, { "ph": "f", "id": 153294, "pid": 76337, "tid": -914061504, "ts": 1716454223825717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223825718, "dur": 0, "args": { "External id": 153295, "cbid": 251, "correlation": 153295 } }, { "ph": "f", "id": 153295, "pid": 76337, "tid": -914061504, "ts": 1716454223825718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223847775, "dur": 114, "args": { "External id": 153296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153296, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153296, "pid": 5, "tid": 7, "ts": 1716454223847775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825720, "dur": 12, "args": { "External id": 153296, "cbid": 211, "correlation": 153296 } }, { "ph": "s", "id": 153296, "pid": 76337, "tid": -914061504, "ts": 1716454223825720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223847890, "dur": 60, "args": { "External id": 153302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153302, "pid": 5, "tid": 7, "ts": 1716454223847890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825755, "dur": 9, "args": { "External id": 153302, "cbid": 211, "correlation": 153302 } }, { "ph": "s", "id": 153302, "pid": 76337, "tid": -914061504, "ts": 1716454223825755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223847951, "dur": 49, "args": { "External id": 153310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153310, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153310, "pid": 5, "tid": 7, "ts": 1716454223847951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825788, "dur": 8, "args": { "External id": 153310, "cbid": 211, "correlation": 153310 } }, { "ph": "s", "id": 153310, "pid": 76337, "tid": -914061504, "ts": 1716454223825788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223848001, "dur": 51, "args": { "External id": 153330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153330, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 153330, "pid": 5, "tid": 7, "ts": 1716454223848001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825862, "dur": 12, "args": { "External id": 153330, "cbid": 211, "correlation": 153330 } }, { "ph": "s", "id": 153330, "pid": 76337, "tid": -914061504, "ts": 1716454223825862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223848054, "dur": 4, "args": { "External id": 153342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153342, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153342, "pid": 5, "tid": 7, "ts": 1716454223848054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825883, "dur": 6, "args": { "External id": 153342, "cbid": 211, "correlation": 153342 } }, { "ph": "s", "id": 153342, "pid": 76337, "tid": -914061504, "ts": 1716454223825883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223848060, "dur": 55, "args": { "External id": 153345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153345, "pid": 5, "tid": 7, "ts": 1716454223848060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825901, "dur": 8, "args": { "External id": 153345, "cbid": 211, "correlation": 153345 } }, { "ph": "s", "id": 153345, "pid": 76337, "tid": -914061504, "ts": 1716454223825901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223848116, "dur": 36, "args": { "External id": 153354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153354, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153354, "pid": 5, "tid": 7, "ts": 1716454223848116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223825941, "dur": 10, "args": { "External id": 153354, "cbid": 211, "correlation": 153354 } }, { "ph": "s", "id": 153354, "pid": 76337, "tid": -914061504, "ts": 1716454223825941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223826001, "dur": 0, "args": { "External id": 153364, "cbid": 317, "correlation": 153364 } }, { "ph": "f", "id": 153364, "pid": 76337, "tid": -914061504, "ts": 1716454223826001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223826002, "dur": 0, "args": { "External id": 153365, "cbid": 203, "correlation": 153365 } }, { "ph": "f", "id": 153365, "pid": 76337, "tid": -914061504, "ts": 1716454223826002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223826002, "dur": 0, "args": { "External id": 153366, "cbid": 205, "correlation": 153366 } }, { "ph": "f", "id": 153366, "pid": 76337, "tid": -914061504, "ts": 1716454223826002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223848154, "dur": 41, "args": { "External id": 153370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153370, "pid": 5, "tid": 7, "ts": 1716454223848154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826017, "dur": 12, "args": { "External id": 153370, "cbid": 211, "correlation": 153370 } }, { "ph": "s", "id": 153370, "pid": 76337, "tid": -914061504, "ts": 1716454223826017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223848196, "dur": 14, "args": { "External id": 153372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153372, "pid": 5, "tid": 7, "ts": 1716454223848196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826031, "dur": 5, "args": { "External id": 153372, "cbid": 211, "correlation": 153372 } }, { "ph": "s", "id": 153372, "pid": 76337, "tid": -914061504, "ts": 1716454223826031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223848212, "dur": 3, "args": { "External id": 153374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153374, "pid": 5, "tid": 7, "ts": 1716454223848212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826041, "dur": 6, "args": { "External id": 153374, "cbid": 211, "correlation": 153374 } }, { "ph": "s", "id": 153374, "pid": 76337, "tid": -914061504, "ts": 1716454223826041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223826050, "dur": 0, "args": { "External id": 153375, "cbid": 51, "correlation": 153375 } }, { "ph": "s", "id": 153375, "pid": 76337, "tid": -914061504, "ts": 1716454223826050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223848216, "dur": 699, "args": { "External id": 153376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153376, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153376, "pid": 5, "tid": 7, "ts": 1716454223848216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826050, "dur": 6, "args": { "External id": 153376, "cbid": 211, "correlation": 153376 } }, { "ph": "s", "id": 153376, "pid": 76337, "tid": -914061504, "ts": 1716454223826050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223848916, "dur": 59, "args": { "External id": 153381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153381, "pid": 5, "tid": 7, "ts": 1716454223848916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826079, "dur": 9, "args": { "External id": 153381, "cbid": 211, "correlation": 153381 } }, { "ph": "s", "id": 153381, "pid": 76337, "tid": -914061504, "ts": 1716454223826079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223848976, "dur": 4, "args": { "External id": 153389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153389, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153389, "pid": 5, "tid": 7, "ts": 1716454223848976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826123, "dur": 10, "args": { "External id": 153389, "cbid": 211, "correlation": 153389 } }, { "ph": "s", "id": 153389, "pid": 76337, "tid": -914061504, "ts": 1716454223826123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826188, "dur": 1, "args": { "External id": 153405, "cbid": 251, "correlation": 153405 } }, { "ph": "f", "id": 153405, "pid": 76337, "tid": -914061504, "ts": 1716454223826188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826192, "dur": 0, "args": { "External id": 153407, "cbid": 251, "correlation": 153407 } }, { "ph": "f", "id": 153407, "pid": 76337, "tid": -914061504, "ts": 1716454223826192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223848981, "dur": 11, "args": { "External id": 153408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153408, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 153408, "pid": 5, "tid": 7, "ts": 1716454223848981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826194, "dur": 12, "args": { "External id": 153408, "cbid": 211, "correlation": 153408 } }, { "ph": "s", "id": 153408, "pid": 76337, "tid": -914061504, "ts": 1716454223826194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223848993, "dur": 5, "args": { "External id": 153410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153410, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 153410, "pid": 5, "tid": 7, "ts": 1716454223848993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826208, "dur": 6, "args": { "External id": 153410, "cbid": 211, "correlation": 153410 } }, { "ph": "s", "id": 153410, "pid": 76337, "tid": -914061504, "ts": 1716454223826208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223849000, "dur": 51, "args": { "External id": 153420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153420, "pid": 5, "tid": 7, "ts": 1716454223849000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826265, "dur": 12, "args": { "External id": 153420, "cbid": 211, "correlation": 153420 } }, { "ph": "s", "id": 153420, "pid": 76337, "tid": -914061504, "ts": 1716454223826265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223849053, "dur": 51, "args": { "External id": 153440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153440, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 153440, "pid": 5, "tid": 7, "ts": 1716454223849053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826331, "dur": 10, "args": { "External id": 153440, "cbid": 211, "correlation": 153440 } }, { "ph": "s", "id": 153440, "pid": 76337, "tid": -914061504, "ts": 1716454223826331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223849105, "dur": 4, "args": { "External id": 153452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153452, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153452, "pid": 5, "tid": 7, "ts": 1716454223849105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826352, "dur": 6, "args": { "External id": 153452, "cbid": 211, "correlation": 153452 } }, { "ph": "s", "id": 153452, "pid": 76337, "tid": -914061504, "ts": 1716454223826352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223849111, "dur": 55, "args": { "External id": 153455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153455, "pid": 5, "tid": 7, "ts": 1716454223849111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826371, "dur": 7, "args": { "External id": 153455, "cbid": 211, "correlation": 153455 } }, { "ph": "s", "id": 153455, "pid": 76337, "tid": -914061504, "ts": 1716454223826371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223849167, "dur": 36, "args": { "External id": 153464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153464, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153464, "pid": 5, "tid": 7, "ts": 1716454223849167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826411, "dur": 10, "args": { "External id": 153464, "cbid": 211, "correlation": 153464 } }, { "ph": "s", "id": 153464, "pid": 76337, "tid": -914061504, "ts": 1716454223826411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223826474, "dur": 0, "args": { "External id": 153474, "cbid": 317, "correlation": 153474 } }, { "ph": "f", "id": 153474, "pid": 76337, "tid": -914061504, "ts": 1716454223826474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223826474, "dur": 0, "args": { "External id": 153475, "cbid": 203, "correlation": 153475 } }, { "ph": "f", "id": 153475, "pid": 76337, "tid": -914061504, "ts": 1716454223826474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223826475, "dur": 0, "args": { "External id": 153476, "cbid": 205, "correlation": 153476 } }, { "ph": "f", "id": 153476, "pid": 76337, "tid": -914061504, "ts": 1716454223826475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223849204, "dur": 40, "args": { "External id": 153480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153480, "pid": 5, "tid": 7, "ts": 1716454223849204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826489, "dur": 12, "args": { "External id": 153480, "cbid": 211, "correlation": 153480 } }, { "ph": "s", "id": 153480, "pid": 76337, "tid": -914061504, "ts": 1716454223826489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223849245, "dur": 14, "args": { "External id": 153482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153482, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153482, "pid": 5, "tid": 7, "ts": 1716454223849245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826503, "dur": 6, "args": { "External id": 153482, "cbid": 211, "correlation": 153482 } }, { "ph": "s", "id": 153482, "pid": 76337, "tid": -914061504, "ts": 1716454223826503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223849260, "dur": 3, "args": { "External id": 153484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153484, "pid": 5, "tid": 7, "ts": 1716454223849260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826513, "dur": 5, "args": { "External id": 153484, "cbid": 211, "correlation": 153484 } }, { "ph": "s", "id": 153484, "pid": 76337, "tid": -914061504, "ts": 1716454223826513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223826522, "dur": 0, "args": { "External id": 153485, "cbid": 51, "correlation": 153485 } }, { "ph": "s", "id": 153485, "pid": 76337, "tid": -914061504, "ts": 1716454223826522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223849265, "dur": 693, "args": { "External id": 153486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153486, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153486, "pid": 5, "tid": 7, "ts": 1716454223849265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826523, "dur": 5, "args": { "External id": 153486, "cbid": 211, "correlation": 153486 } }, { "ph": "s", "id": 153486, "pid": 76337, "tid": -914061504, "ts": 1716454223826523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223849960, "dur": 58, "args": { "External id": 153491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153491, "pid": 5, "tid": 7, "ts": 1716454223849960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826550, "dur": 9, "args": { "External id": 153491, "cbid": 211, "correlation": 153491 } }, { "ph": "s", "id": 153491, "pid": 76337, "tid": -914061504, "ts": 1716454223826550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223850019, "dur": 50, "args": { "External id": 153499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153499, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153499, "pid": 5, "tid": 7, "ts": 1716454223850019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826583, "dur": 8, "args": { "External id": 153499, "cbid": 211, "correlation": 153499 } }, { "ph": "s", "id": 153499, "pid": 76337, "tid": -914061504, "ts": 1716454223826583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223850070, "dur": 35, "args": { "External id": 153507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153507, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153507, "pid": 5, "tid": 7, "ts": 1716454223850070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826612, "dur": 8, "args": { "External id": 153507, "cbid": 211, "correlation": 153507 } }, { "ph": "s", "id": 153507, "pid": 76337, "tid": -914061504, "ts": 1716454223826612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223850106, "dur": 51, "args": { "External id": 153527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153527, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 153527, "pid": 5, "tid": 7, "ts": 1716454223850106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826690, "dur": 12, "args": { "External id": 153527, "cbid": 211, "correlation": 153527 } }, { "ph": "s", "id": 153527, "pid": 76337, "tid": -914061504, "ts": 1716454223826690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223850158, "dur": 4, "args": { "External id": 153539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153539, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 153539, "pid": 5, "tid": 7, "ts": 1716454223850158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826712, "dur": 7, "args": { "External id": 153539, "cbid": 211, "correlation": 153539 } }, { "ph": "s", "id": 153539, "pid": 76337, "tid": -914061504, "ts": 1716454223826712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223850163, "dur": 55, "args": { "External id": 153542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153542, "pid": 5, "tid": 7, "ts": 1716454223850163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826731, "dur": 6, "args": { "External id": 153542, "cbid": 211, "correlation": 153542 } }, { "ph": "s", "id": 153542, "pid": 76337, "tid": -914061504, "ts": 1716454223826731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223826788, "dur": 0, "args": { "External id": 153553, "cbid": 317, "correlation": 153553 } }, { "ph": "f", "id": 153553, "pid": 76337, "tid": -914061504, "ts": 1716454223826788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223826789, "dur": 0, "args": { "External id": 153554, "cbid": 203, "correlation": 153554 } }, { "ph": "f", "id": 153554, "pid": 76337, "tid": -914061504, "ts": 1716454223826789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223826789, "dur": 0, "args": { "External id": 153555, "cbid": 205, "correlation": 153555 } }, { "ph": "f", "id": 153555, "pid": 76337, "tid": -914061504, "ts": 1716454223826789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826811, "dur": 1, "args": { "External id": 153559, "cbid": 251, "correlation": 153559 } }, { "ph": "f", "id": 153559, "pid": 76337, "tid": -914061504, "ts": 1716454223826811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826813, "dur": 0, "args": { "External id": 153560, "cbid": 251, "correlation": 153560 } }, { "ph": "f", "id": 153560, "pid": 76337, "tid": -914061504, "ts": 1716454223826813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826814, "dur": 0, "args": { "External id": 153561, "cbid": 251, "correlation": 153561 } }, { "ph": "f", "id": 153561, "pid": 76337, "tid": -914061504, "ts": 1716454223826814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826815, "dur": 0, "args": { "External id": 153562, "cbid": 251, "correlation": 153562 } }, { "ph": "f", "id": 153562, "pid": 76337, "tid": -914061504, "ts": 1716454223826815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826815, "dur": 0, "args": { "External id": 153563, "cbid": 251, "correlation": 153563 } }, { "ph": "f", "id": 153563, "pid": 76337, "tid": -914061504, "ts": 1716454223826815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826816, "dur": 0, "args": { "External id": 153564, "cbid": 251, "correlation": 153564 } }, { "ph": "f", "id": 153564, "pid": 76337, "tid": -914061504, "ts": 1716454223826816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826817, "dur": 0, "args": { "External id": 153565, "cbid": 251, "correlation": 153565 } }, { "ph": "f", "id": 153565, "pid": 76337, "tid": -914061504, "ts": 1716454223826817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826818, "dur": 0, "args": { "External id": 153566, "cbid": 251, "correlation": 153566 } }, { "ph": "f", "id": 153566, "pid": 76337, "tid": -914061504, "ts": 1716454223826818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223826819, "dur": 0, "args": { "External id": 153567, "cbid": 251, "correlation": 153567 } }, { "ph": "f", "id": 153567, "pid": 76337, "tid": -914061504, "ts": 1716454223826819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223850220, "dur": 114, "args": { "External id": 153568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153568, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153568, "pid": 5, "tid": 7, "ts": 1716454223850220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826822, "dur": 12, "args": { "External id": 153568, "cbid": 211, "correlation": 153568 } }, { "ph": "s", "id": 153568, "pid": 76337, "tid": -914061504, "ts": 1716454223826822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223850335, "dur": 60, "args": { "External id": 153574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153574, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153574, "pid": 5, "tid": 7, "ts": 1716454223850335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826856, "dur": 9, "args": { "External id": 153574, "cbid": 211, "correlation": 153574 } }, { "ph": "s", "id": 153574, "pid": 76337, "tid": -914061504, "ts": 1716454223826856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223850396, "dur": 676, "args": { "External id": 153583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153583, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153583, "pid": 5, "tid": 7, "ts": 1716454223850396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223826939, "dur": 13, "args": { "External id": 153583, "cbid": 211, "correlation": 153583 } }, { "ph": "s", "id": 153583, "pid": 76337, "tid": -914061504, "ts": 1716454223826939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223851073, "dur": 180, "args": { "External id": 153605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153605, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153605, "pid": 5, "tid": 7, "ts": 1716454223851073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827003, "dur": 10, "args": { "External id": 153605, "cbid": 211, "correlation": 153605 } }, { "ph": "s", "id": 153605, "pid": 76337, "tid": -914061504, "ts": 1716454223827003, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827090, "dur": 1, "args": { "External id": 153616, "cbid": 251, "correlation": 153616 } }, { "ph": "f", "id": 153616, "pid": 76337, "tid": -914061504, "ts": 1716454223827090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223851255, "dur": 196, "args": { "External id": 153617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153617, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153617, "pid": 5, "tid": 7, "ts": 1716454223851255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827095, "dur": 14, "args": { "External id": 153617, "cbid": 211, "correlation": 153617 } }, { "ph": "s", "id": 153617, "pid": 76337, "tid": -914061504, "ts": 1716454223827095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827163, "dur": 1, "args": { "External id": 153628, "cbid": 251, "correlation": 153628 } }, { "ph": "f", "id": 153628, "pid": 76337, "tid": -914061504, "ts": 1716454223827163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223851453, "dur": 188, "args": { "External id": 153629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153629, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153629, "pid": 5, "tid": 7, "ts": 1716454223851453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827168, "dur": 12, "args": { "External id": 153629, "cbid": 211, "correlation": 153629 } }, { "ph": "s", "id": 153629, "pid": 76337, "tid": -914061504, "ts": 1716454223827168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827232, "dur": 1, "args": { "External id": 153640, "cbid": 251, "correlation": 153640 } }, { "ph": "f", "id": 153640, "pid": 76337, "tid": -914061504, "ts": 1716454223827232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223851642, "dur": 190, "args": { "External id": 153641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153641, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153641, "pid": 5, "tid": 7, "ts": 1716454223851642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827236, "dur": 11, "args": { "External id": 153641, "cbid": 211, "correlation": 153641 } }, { "ph": "s", "id": 153641, "pid": 76337, "tid": -914061504, "ts": 1716454223827236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223851833, "dur": 18577, "args": { "External id": 153662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153662, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153662, "pid": 5, "tid": 7, "ts": 1716454223851833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827316, "dur": 13, "args": { "External id": 153662, "cbid": 211, "correlation": 153662 } }, { "ph": "s", "id": 153662, "pid": 76337, "tid": -914061504, "ts": 1716454223827316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827413, "dur": 1, "args": { "External id": 153680, "cbid": 251, "correlation": 153680 } }, { "ph": "f", "id": 153680, "pid": 76337, "tid": -914061504, "ts": 1716454223827413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223870411, "dur": 198, "args": { "External id": 153682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153682, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153682, "pid": 5, "tid": 7, "ts": 1716454223870411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827419, "dur": 13, "args": { "External id": 153682, "cbid": 211, "correlation": 153682 } }, { "ph": "s", "id": 153682, "pid": 76337, "tid": -914061504, "ts": 1716454223827419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223870611, "dur": 66, "args": { "External id": 153690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153690, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153690, "pid": 5, "tid": 7, "ts": 1716454223870611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827489, "dur": 12, "args": { "External id": 153690, "cbid": 211, "correlation": 153690 } }, { "ph": "s", "id": 153690, "pid": 76337, "tid": -914061504, "ts": 1716454223827489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223870678, "dur": 96, "args": { "External id": 153698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153698, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153698, "pid": 5, "tid": 7, "ts": 1716454223870678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827528, "dur": 9, "args": { "External id": 153698, "cbid": 211, "correlation": 153698 } }, { "ph": "s", "id": 153698, "pid": 76337, "tid": -914061504, "ts": 1716454223827528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223870776, "dur": 54, "args": { "External id": 153709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153709, "pid": 5, "tid": 7, "ts": 1716454223870776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827600, "dur": 12, "args": { "External id": 153709, "cbid": 211, "correlation": 153709 } }, { "ph": "s", "id": 153709, "pid": 76337, "tid": -914061504, "ts": 1716454223827600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223870831, "dur": 92, "args": { "External id": 153731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153731, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153731, "pid": 5, "tid": 7, "ts": 1716454223870831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827632, "dur": 8, "args": { "External id": 153731, "cbid": 211, "correlation": 153731 } }, { "ph": "s", "id": 153731, "pid": 76337, "tid": -914061504, "ts": 1716454223827632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827714, "dur": 1, "args": { "External id": 153742, "cbid": 251, "correlation": 153742 } }, { "ph": "f", "id": 153742, "pid": 76337, "tid": -914061504, "ts": 1716454223827714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223870924, "dur": 106, "args": { "External id": 153743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153743, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153743, "pid": 5, "tid": 7, "ts": 1716454223870924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827720, "dur": 13, "args": { "External id": 153743, "cbid": 211, "correlation": 153743 } }, { "ph": "s", "id": 153743, "pid": 76337, "tid": -914061504, "ts": 1716454223827720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827791, "dur": 1, "args": { "External id": 153754, "cbid": 251, "correlation": 153754 } }, { "ph": "f", "id": 153754, "pid": 76337, "tid": -914061504, "ts": 1716454223827791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827795, "dur": 0, "args": { "External id": 153755, "cbid": 251, "correlation": 153755 } }, { "ph": "f", "id": 153755, "pid": 76337, "tid": -914061504, "ts": 1716454223827795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223871031, "dur": 10, "args": { "External id": 153756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153756, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 153756, "pid": 5, "tid": 7, "ts": 1716454223871031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827797, "dur": 12, "args": { "External id": 153756, "cbid": 211, "correlation": 153756 } }, { "ph": "s", "id": 153756, "pid": 76337, "tid": -914061504, "ts": 1716454223827797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223871043, "dur": 5, "args": { "External id": 153758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153758, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 153758, "pid": 5, "tid": 7, "ts": 1716454223871043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827811, "dur": 5, "args": { "External id": 153758, "cbid": 211, "correlation": 153758 } }, { "ph": "s", "id": 153758, "pid": 76337, "tid": -914061504, "ts": 1716454223827811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827871, "dur": 1, "args": { "External id": 153769, "cbid": 251, "correlation": 153769 } }, { "ph": "f", "id": 153769, "pid": 76337, "tid": -914061504, "ts": 1716454223827871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223827874, "dur": 0, "args": { "External id": 153770, "cbid": 251, "correlation": 153770 } }, { "ph": "f", "id": 153770, "pid": 76337, "tid": -914061504, "ts": 1716454223827874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223871049, "dur": 6, "args": { "External id": 153771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153771, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 153771, "pid": 5, "tid": 7, "ts": 1716454223871049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827876, "dur": 12, "args": { "External id": 153771, "cbid": 211, "correlation": 153771 } }, { "ph": "s", "id": 153771, "pid": 76337, "tid": -914061504, "ts": 1716454223827876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223871057, "dur": 3, "args": { "External id": 153773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153773, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 153773, "pid": 5, "tid": 7, "ts": 1716454223871057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827890, "dur": 5, "args": { "External id": 153773, "cbid": 211, "correlation": 153773 } }, { "ph": "s", "id": 153773, "pid": 76337, "tid": -914061504, "ts": 1716454223827890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223871062, "dur": 155, "args": { "External id": 153794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153794, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153794, "pid": 5, "tid": 7, "ts": 1716454223871062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223827963, "dur": 21, "args": { "External id": 153794, "cbid": 211, "correlation": 153794 } }, { "ph": "s", "id": 153794, "pid": 76337, "tid": -914061504, "ts": 1716454223827963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828070, "dur": 1, "args": { "External id": 153812, "cbid": 251, "correlation": 153812 } }, { "ph": "f", "id": 153812, "pid": 76337, "tid": -914061504, "ts": 1716454223828070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223871218, "dur": 107, "args": { "External id": 153814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153814, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153814, "pid": 5, "tid": 7, "ts": 1716454223871218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828076, "dur": 13, "args": { "External id": 153814, "cbid": 211, "correlation": 153814 } }, { "ph": "s", "id": 153814, "pid": 76337, "tid": -914061504, "ts": 1716454223828076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223871327, "dur": 35, "args": { "External id": 153822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153822, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153822, "pid": 5, "tid": 7, "ts": 1716454223871327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828145, "dur": 12, "args": { "External id": 153822, "cbid": 211, "correlation": 153822 } }, { "ph": "s", "id": 153822, "pid": 76337, "tid": -914061504, "ts": 1716454223828145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223871363, "dur": 68, "args": { "External id": 153830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153830, "pid": 5, "tid": 7, "ts": 1716454223871363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828186, "dur": 9, "args": { "External id": 153830, "cbid": 211, "correlation": 153830 } }, { "ph": "s", "id": 153830, "pid": 76337, "tid": -914061504, "ts": 1716454223828186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223871433, "dur": 92, "args": { "External id": 153852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153852, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153852, "pid": 5, "tid": 7, "ts": 1716454223871433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828237, "dur": 10, "args": { "External id": 153852, "cbid": 211, "correlation": 153852 } }, { "ph": "s", "id": 153852, "pid": 76337, "tid": -914061504, "ts": 1716454223828237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828324, "dur": 1, "args": { "External id": 153868, "cbid": 251, "correlation": 153868 } }, { "ph": "f", "id": 153868, "pid": 76337, "tid": -914061504, "ts": 1716454223828324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223871526, "dur": 569, "args": { "External id": 153870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153870, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153870, "pid": 5, "tid": 7, "ts": 1716454223871526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828330, "dur": 12, "args": { "External id": 153870, "cbid": 211, "correlation": 153870 } }, { "ph": "s", "id": 153870, "pid": 76337, "tid": -914061504, "ts": 1716454223828330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223872096, "dur": 244, "args": { "External id": 153878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153878, "pid": 5, "tid": 7, "ts": 1716454223872096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828394, "dur": 12, "args": { "External id": 153878, "cbid": 211, "correlation": 153878 } }, { "ph": "s", "id": 153878, "pid": 76337, "tid": -914061504, "ts": 1716454223828394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223872341, "dur": 253, "args": { "External id": 153886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153886, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153886, "pid": 5, "tid": 7, "ts": 1716454223872341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828424, "dur": 9, "args": { "External id": 153886, "cbid": 211, "correlation": 153886 } }, { "ph": "s", "id": 153886, "pid": 76337, "tid": -914061504, "ts": 1716454223828424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828506, "dur": 1, "args": { "External id": 153902, "cbid": 251, "correlation": 153902 } }, { "ph": "f", "id": 153902, "pid": 76337, "tid": -914061504, "ts": 1716454223828506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828510, "dur": 0, "args": { "External id": 153904, "cbid": 251, "correlation": 153904 } }, { "ph": "f", "id": 153904, "pid": 76337, "tid": -914061504, "ts": 1716454223828510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223872595, "dur": 362, "args": { "External id": 153905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153905, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 153905, "pid": 5, "tid": 7, "ts": 1716454223872595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828513, "dur": 13, "args": { "External id": 153905, "cbid": 211, "correlation": 153905 } }, { "ph": "s", "id": 153905, "pid": 76337, "tid": -914061504, "ts": 1716454223828513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223872958, "dur": 49, "args": { "External id": 153913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153913, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153913, "pid": 5, "tid": 7, "ts": 1716454223872958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828555, "dur": 9, "args": { "External id": 153913, "cbid": 211, "correlation": 153913 } }, { "ph": "s", "id": 153913, "pid": 76337, "tid": -914061504, "ts": 1716454223828555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223873008, "dur": 158, "args": { "External id": 153924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153924, "pid": 5, "tid": 7, "ts": 1716454223873008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828623, "dur": 12, "args": { "External id": 153924, "cbid": 211, "correlation": 153924 } }, { "ph": "s", "id": 153924, "pid": 76337, "tid": -914061504, "ts": 1716454223828623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223828686, "dur": 0, "args": { "External id": 153936, "cbid": 317, "correlation": 153936 } }, { "ph": "f", "id": 153936, "pid": 76337, "tid": -914061504, "ts": 1716454223828686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223828687, "dur": 0, "args": { "External id": 153937, "cbid": 203, "correlation": 153937 } }, { "ph": "f", "id": 153937, "pid": 76337, "tid": -914061504, "ts": 1716454223828687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223828687, "dur": 0, "args": { "External id": 153938, "cbid": 205, "correlation": 153938 } }, { "ph": "f", "id": 153938, "pid": 76337, "tid": -914061504, "ts": 1716454223828687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828710, "dur": 1, "args": { "External id": 153942, "cbid": 251, "correlation": 153942 } }, { "ph": "f", "id": 153942, "pid": 76337, "tid": -914061504, "ts": 1716454223828710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828712, "dur": 0, "args": { "External id": 153943, "cbid": 251, "correlation": 153943 } }, { "ph": "f", "id": 153943, "pid": 76337, "tid": -914061504, "ts": 1716454223828712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828712, "dur": 0, "args": { "External id": 153944, "cbid": 251, "correlation": 153944 } }, { "ph": "f", "id": 153944, "pid": 76337, "tid": -914061504, "ts": 1716454223828712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828713, "dur": 0, "args": { "External id": 153945, "cbid": 251, "correlation": 153945 } }, { "ph": "f", "id": 153945, "pid": 76337, "tid": -914061504, "ts": 1716454223828713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828714, "dur": 0, "args": { "External id": 153946, "cbid": 251, "correlation": 153946 } }, { "ph": "f", "id": 153946, "pid": 76337, "tid": -914061504, "ts": 1716454223828714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828715, "dur": 0, "args": { "External id": 153947, "cbid": 251, "correlation": 153947 } }, { "ph": "f", "id": 153947, "pid": 76337, "tid": -914061504, "ts": 1716454223828715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828715, "dur": 0, "args": { "External id": 153948, "cbid": 251, "correlation": 153948 } }, { "ph": "f", "id": 153948, "pid": 76337, "tid": -914061504, "ts": 1716454223828715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828716, "dur": 0, "args": { "External id": 153949, "cbid": 251, "correlation": 153949 } }, { "ph": "f", "id": 153949, "pid": 76337, "tid": -914061504, "ts": 1716454223828716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223828718, "dur": 0, "args": { "External id": 153950, "cbid": 251, "correlation": 153950 } }, { "ph": "f", "id": 153950, "pid": 76337, "tid": -914061504, "ts": 1716454223828718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223873167, "dur": 115, "args": { "External id": 153951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153951, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 153951, "pid": 5, "tid": 7, "ts": 1716454223873167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828720, "dur": 13, "args": { "External id": 153951, "cbid": 211, "correlation": 153951 } }, { "ph": "s", "id": 153951, "pid": 76337, "tid": -914061504, "ts": 1716454223828720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223873284, "dur": 59, "args": { "External id": 153957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153957, "pid": 5, "tid": 7, "ts": 1716454223873284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828755, "dur": 9, "args": { "External id": 153957, "cbid": 211, "correlation": 153957 } }, { "ph": "s", "id": 153957, "pid": 76337, "tid": -914061504, "ts": 1716454223828755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223873345, "dur": 49, "args": { "External id": 153965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153965, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153965, "pid": 5, "tid": 7, "ts": 1716454223873345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828787, "dur": 8, "args": { "External id": 153965, "cbid": 211, "correlation": 153965 } }, { "ph": "s", "id": 153965, "pid": 76337, "tid": -914061504, "ts": 1716454223828787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223828859, "dur": 0, "args": { "External id": 153975, "cbid": 317, "correlation": 153975 } }, { "ph": "f", "id": 153975, "pid": 76337, "tid": -914061504, "ts": 1716454223828859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223828860, "dur": 0, "args": { "External id": 153976, "cbid": 203, "correlation": 153976 } }, { "ph": "f", "id": 153976, "pid": 76337, "tid": -914061504, "ts": 1716454223828860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223828860, "dur": 0, "args": { "External id": 153977, "cbid": 205, "correlation": 153977 } }, { "ph": "f", "id": 153977, "pid": 76337, "tid": -914061504, "ts": 1716454223828860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223873396, "dur": 41, "args": { "External id": 153981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153981, "pid": 5, "tid": 7, "ts": 1716454223873396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828877, "dur": 12, "args": { "External id": 153981, "cbid": 211, "correlation": 153981 } }, { "ph": "s", "id": 153981, "pid": 76337, "tid": -914061504, "ts": 1716454223828877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223873438, "dur": 14, "args": { "External id": 153983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153983, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153983, "pid": 5, "tid": 7, "ts": 1716454223873438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828892, "dur": 5, "args": { "External id": 153983, "cbid": 211, "correlation": 153983 } }, { "ph": "s", "id": 153983, "pid": 76337, "tid": -914061504, "ts": 1716454223828892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223873454, "dur": 1, "args": { "External id": 153985, "device": 5, "context": 1, "stream": 7, "correlation": 153985, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 153985, "pid": 5, "tid": 7, "ts": 1716454223873454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223828911, "dur": 19, "args": { "External id": 153985, "cbid": 51, "correlation": 153985 } }, { "ph": "s", "id": 153985, "pid": 76337, "tid": -914061504, "ts": 1716454223828911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223873458, "dur": 361, "args": { "External id": 153986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153986, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 153986, "pid": 5, "tid": 7, "ts": 1716454223873458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828932, "dur": 9, "args": { "External id": 153986, "cbid": 211, "correlation": 153986 } }, { "ph": "s", "id": 153986, "pid": 76337, "tid": -914061504, "ts": 1716454223828932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223873820, "dur": 14, "args": { "External id": 153988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153988, "pid": 5, "tid": 7, "ts": 1716454223873820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828950, "dur": 8, "args": { "External id": 153988, "cbid": 211, "correlation": 153988 } }, { "ph": "s", "id": 153988, "pid": 76337, "tid": -914061504, "ts": 1716454223828950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223873836, "dur": 15, "args": { "External id": 153994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 153994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 153994, "pid": 5, "tid": 7, "ts": 1716454223873836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223828989, "dur": 9, "args": { "External id": 153994, "cbid": 211, "correlation": 153994 } }, { "ph": "s", "id": 153994, "pid": 76337, "tid": -914061504, "ts": 1716454223828989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223873852, "dur": 19, "args": { "External id": 154014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154014, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154014, "pid": 5, "tid": 7, "ts": 1716454223873852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829083, "dur": 12, "args": { "External id": 154014, "cbid": 211, "correlation": 154014 } }, { "ph": "s", "id": 154014, "pid": 76337, "tid": -914061504, "ts": 1716454223829083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223873871, "dur": 4, "args": { "External id": 154026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154026, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 154026, "pid": 5, "tid": 7, "ts": 1716454223873871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829105, "dur": 7, "args": { "External id": 154026, "cbid": 211, "correlation": 154026 } }, { "ph": "s", "id": 154026, "pid": 76337, "tid": -914061504, "ts": 1716454223829105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223873877, "dur": 18, "args": { "External id": 154029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154029, "pid": 5, "tid": 7, "ts": 1716454223873877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829124, "dur": 7, "args": { "External id": 154029, "cbid": 211, "correlation": 154029 } }, { "ph": "s", "id": 154029, "pid": 76337, "tid": -914061504, "ts": 1716454223829124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223873897, "dur": 13, "args": { "External id": 154038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154038, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154038, "pid": 5, "tid": 7, "ts": 1716454223873897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829163, "dur": 10, "args": { "External id": 154038, "cbid": 211, "correlation": 154038 } }, { "ph": "s", "id": 154038, "pid": 76337, "tid": -914061504, "ts": 1716454223829163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223829218, "dur": 0, "args": { "External id": 154048, "cbid": 317, "correlation": 154048 } }, { "ph": "f", "id": 154048, "pid": 76337, "tid": -914061504, "ts": 1716454223829218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223829219, "dur": 0, "args": { "External id": 154049, "cbid": 203, "correlation": 154049 } }, { "ph": "f", "id": 154049, "pid": 76337, "tid": -914061504, "ts": 1716454223829219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223829220, "dur": 0, "args": { "External id": 154050, "cbid": 205, "correlation": 154050 } }, { "ph": "f", "id": 154050, "pid": 76337, "tid": -914061504, "ts": 1716454223829220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223873911, "dur": 11, "args": { "External id": 154054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154054, "pid": 5, "tid": 7, "ts": 1716454223873911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829235, "dur": 11, "args": { "External id": 154054, "cbid": 211, "correlation": 154054 } }, { "ph": "s", "id": 154054, "pid": 76337, "tid": -914061504, "ts": 1716454223829235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223873923, "dur": 24, "args": { "External id": 154056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154056, "pid": 5, "tid": 7, "ts": 1716454223873923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829249, "dur": 6, "args": { "External id": 154056, "cbid": 211, "correlation": 154056 } }, { "ph": "s", "id": 154056, "pid": 76337, "tid": -914061504, "ts": 1716454223829249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223873949, "dur": 3, "args": { "External id": 154058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 154058, "pid": 5, "tid": 7, "ts": 1716454223873949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829261, "dur": 5, "args": { "External id": 154058, "cbid": 211, "correlation": 154058 } }, { "ph": "s", "id": 154058, "pid": 76337, "tid": -914061504, "ts": 1716454223829261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223829270, "dur": 0, "args": { "External id": 154059, "cbid": 51, "correlation": 154059 } }, { "ph": "s", "id": 154059, "pid": 76337, "tid": -914061504, "ts": 1716454223829270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223873953, "dur": 357, "args": { "External id": 154060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154060, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154060, "pid": 5, "tid": 7, "ts": 1716454223873953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829271, "dur": 7, "args": { "External id": 154060, "cbid": 211, "correlation": 154060 } }, { "ph": "s", "id": 154060, "pid": 76337, "tid": -914061504, "ts": 1716454223829271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223874312, "dur": 20, "args": { "External id": 154061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154061, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154061, "pid": 5, "tid": 7, "ts": 1716454223874312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829281, "dur": 5, "args": { "External id": 154061, "cbid": 211, "correlation": 154061 } }, { "ph": "s", "id": 154061, "pid": 76337, "tid": -914061504, "ts": 1716454223829281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223874333, "dur": 33, "args": { "External id": 154067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154067, "pid": 5, "tid": 7, "ts": 1716454223874333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829308, "dur": 9, "args": { "External id": 154067, "cbid": 211, "correlation": 154067 } }, { "ph": "s", "id": 154067, "pid": 76337, "tid": -914061504, "ts": 1716454223829308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223874368, "dur": 3, "args": { "External id": 154075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154075, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 154075, "pid": 5, "tid": 7, "ts": 1716454223874368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829351, "dur": 10, "args": { "External id": 154075, "cbid": 211, "correlation": 154075 } }, { "ph": "s", "id": 154075, "pid": 76337, "tid": -914061504, "ts": 1716454223829351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829416, "dur": 1, "args": { "External id": 154091, "cbid": 251, "correlation": 154091 } }, { "ph": "f", "id": 154091, "pid": 76337, "tid": -914061504, "ts": 1716454223829416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829421, "dur": 0, "args": { "External id": 154093, "cbid": 251, "correlation": 154093 } }, { "ph": "f", "id": 154093, "pid": 76337, "tid": -914061504, "ts": 1716454223829421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223874372, "dur": 13, "args": { "External id": 154094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154094, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 154094, "pid": 5, "tid": 7, "ts": 1716454223874372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829423, "dur": 11, "args": { "External id": 154094, "cbid": 211, "correlation": 154094 } }, { "ph": "s", "id": 154094, "pid": 76337, "tid": -914061504, "ts": 1716454223829423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223874386, "dur": 5, "args": { "External id": 154096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 154096, "pid": 5, "tid": 7, "ts": 1716454223874386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829436, "dur": 6, "args": { "External id": 154096, "cbid": 211, "correlation": 154096 } }, { "ph": "s", "id": 154096, "pid": 76337, "tid": -914061504, "ts": 1716454223829436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223874392, "dur": 29, "args": { "External id": 154106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154106, "pid": 5, "tid": 7, "ts": 1716454223874392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829494, "dur": 13, "args": { "External id": 154106, "cbid": 211, "correlation": 154106 } }, { "ph": "s", "id": 154106, "pid": 76337, "tid": -914061504, "ts": 1716454223829494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223874423, "dur": 30, "args": { "External id": 154126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154126, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154126, "pid": 5, "tid": 7, "ts": 1716454223874423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829563, "dur": 11, "args": { "External id": 154126, "cbid": 211, "correlation": 154126 } }, { "ph": "s", "id": 154126, "pid": 76337, "tid": -914061504, "ts": 1716454223829563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223874454, "dur": 4, "args": { "External id": 154138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154138, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 154138, "pid": 5, "tid": 7, "ts": 1716454223874454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829584, "dur": 6, "args": { "External id": 154138, "cbid": 211, "correlation": 154138 } }, { "ph": "s", "id": 154138, "pid": 76337, "tid": -914061504, "ts": 1716454223829584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223874460, "dur": 29, "args": { "External id": 154141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154141, "pid": 5, "tid": 7, "ts": 1716454223874460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829603, "dur": 8, "args": { "External id": 154141, "cbid": 211, "correlation": 154141 } }, { "ph": "s", "id": 154141, "pid": 76337, "tid": -914061504, "ts": 1716454223829603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223874490, "dur": 20, "args": { "External id": 154150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154150, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154150, "pid": 5, "tid": 7, "ts": 1716454223874490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829644, "dur": 10, "args": { "External id": 154150, "cbid": 211, "correlation": 154150 } }, { "ph": "s", "id": 154150, "pid": 76337, "tid": -914061504, "ts": 1716454223829644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223829708, "dur": 0, "args": { "External id": 154160, "cbid": 317, "correlation": 154160 } }, { "ph": "f", "id": 154160, "pid": 76337, "tid": -914061504, "ts": 1716454223829708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223829709, "dur": 0, "args": { "External id": 154161, "cbid": 203, "correlation": 154161 } }, { "ph": "f", "id": 154161, "pid": 76337, "tid": -914061504, "ts": 1716454223829709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223829710, "dur": 0, "args": { "External id": 154162, "cbid": 205, "correlation": 154162 } }, { "ph": "f", "id": 154162, "pid": 76337, "tid": -914061504, "ts": 1716454223829710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223874512, "dur": 23, "args": { "External id": 154166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154166, "pid": 5, "tid": 7, "ts": 1716454223874512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829727, "dur": 12, "args": { "External id": 154166, "cbid": 211, "correlation": 154166 } }, { "ph": "s", "id": 154166, "pid": 76337, "tid": -914061504, "ts": 1716454223829727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223874536, "dur": 44, "args": { "External id": 154168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154168, "pid": 5, "tid": 7, "ts": 1716454223874536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829742, "dur": 5, "args": { "External id": 154168, "cbid": 211, "correlation": 154168 } }, { "ph": "s", "id": 154168, "pid": 76337, "tid": -914061504, "ts": 1716454223829742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223874581, "dur": 655, "args": { "External id": 154170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154170, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154170, "pid": 5, "tid": 7, "ts": 1716454223874581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829756, "dur": 9, "args": { "External id": 154170, "cbid": 211, "correlation": 154170 } }, { "ph": "s", "id": 154170, "pid": 76337, "tid": -914061504, "ts": 1716454223829756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223875237, "dur": 21, "args": { "External id": 154172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154172, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154172, "pid": 5, "tid": 7, "ts": 1716454223875237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829769, "dur": 5, "args": { "External id": 154172, "cbid": 211, "correlation": 154172 } }, { "ph": "s", "id": 154172, "pid": 76337, "tid": -914061504, "ts": 1716454223829769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223875259, "dur": 33, "args": { "External id": 154178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154178, "pid": 5, "tid": 7, "ts": 1716454223875259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829797, "dur": 9, "args": { "External id": 154178, "cbid": 211, "correlation": 154178 } }, { "ph": "s", "id": 154178, "pid": 76337, "tid": -914061504, "ts": 1716454223829797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223829856, "dur": 0, "args": { "External id": 154188, "cbid": 317, "correlation": 154188 } }, { "ph": "f", "id": 154188, "pid": 76337, "tid": -914061504, "ts": 1716454223829856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223829857, "dur": 0, "args": { "External id": 154189, "cbid": 203, "correlation": 154189 } }, { "ph": "f", "id": 154189, "pid": 76337, "tid": -914061504, "ts": 1716454223829857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223829858, "dur": 0, "args": { "External id": 154190, "cbid": 205, "correlation": 154190 } }, { "ph": "f", "id": 154190, "pid": 76337, "tid": -914061504, "ts": 1716454223829858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829880, "dur": 1, "args": { "External id": 154194, "cbid": 251, "correlation": 154194 } }, { "ph": "f", "id": 154194, "pid": 76337, "tid": -914061504, "ts": 1716454223829880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829882, "dur": 0, "args": { "External id": 154195, "cbid": 251, "correlation": 154195 } }, { "ph": "f", "id": 154195, "pid": 76337, "tid": -914061504, "ts": 1716454223829882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829882, "dur": 0, "args": { "External id": 154196, "cbid": 251, "correlation": 154196 } }, { "ph": "f", "id": 154196, "pid": 76337, "tid": -914061504, "ts": 1716454223829882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829883, "dur": 0, "args": { "External id": 154197, "cbid": 251, "correlation": 154197 } }, { "ph": "f", "id": 154197, "pid": 76337, "tid": -914061504, "ts": 1716454223829883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829884, "dur": 0, "args": { "External id": 154198, "cbid": 251, "correlation": 154198 } }, { "ph": "f", "id": 154198, "pid": 76337, "tid": -914061504, "ts": 1716454223829884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829884, "dur": 0, "args": { "External id": 154199, "cbid": 251, "correlation": 154199 } }, { "ph": "f", "id": 154199, "pid": 76337, "tid": -914061504, "ts": 1716454223829884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829885, "dur": 0, "args": { "External id": 154200, "cbid": 251, "correlation": 154200 } }, { "ph": "f", "id": 154200, "pid": 76337, "tid": -914061504, "ts": 1716454223829885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829886, "dur": 0, "args": { "External id": 154201, "cbid": 251, "correlation": 154201 } }, { "ph": "f", "id": 154201, "pid": 76337, "tid": -914061504, "ts": 1716454223829886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223829887, "dur": 0, "args": { "External id": 154202, "cbid": 251, "correlation": 154202 } }, { "ph": "f", "id": 154202, "pid": 76337, "tid": -914061504, "ts": 1716454223829887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223875293, "dur": 52, "args": { "External id": 154203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154203, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 154203, "pid": 5, "tid": 7, "ts": 1716454223875293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829889, "dur": 12, "args": { "External id": 154203, "cbid": 211, "correlation": 154203 } }, { "ph": "s", "id": 154203, "pid": 76337, "tid": -914061504, "ts": 1716454223829889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223875347, "dur": 32, "args": { "External id": 154209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154209, "pid": 5, "tid": 7, "ts": 1716454223875347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829922, "dur": 8, "args": { "External id": 154209, "cbid": 211, "correlation": 154209 } }, { "ph": "s", "id": 154209, "pid": 76337, "tid": -914061504, "ts": 1716454223829922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223875380, "dur": 27, "args": { "External id": 154217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154217, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154217, "pid": 5, "tid": 7, "ts": 1716454223875380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829951, "dur": 8, "args": { "External id": 154217, "cbid": 211, "correlation": 154217 } }, { "ph": "s", "id": 154217, "pid": 76337, "tid": -914061504, "ts": 1716454223829951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223875408, "dur": 20, "args": { "External id": 154225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154225, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154225, "pid": 5, "tid": 7, "ts": 1716454223875408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223829988, "dur": 9, "args": { "External id": 154225, "cbid": 211, "correlation": 154225 } }, { "ph": "s", "id": 154225, "pid": 76337, "tid": -914061504, "ts": 1716454223829988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223875429, "dur": 31, "args": { "External id": 154245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154245, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154245, "pid": 5, "tid": 7, "ts": 1716454223875429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830071, "dur": 12, "args": { "External id": 154245, "cbid": 211, "correlation": 154245 } }, { "ph": "s", "id": 154245, "pid": 76337, "tid": -914061504, "ts": 1716454223830071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223875461, "dur": 4, "args": { "External id": 154257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154257, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 154257, "pid": 5, "tid": 7, "ts": 1716454223875461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830093, "dur": 6, "args": { "External id": 154257, "cbid": 211, "correlation": 154257 } }, { "ph": "s", "id": 154257, "pid": 76337, "tid": -914061504, "ts": 1716454223830093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223875466, "dur": 31, "args": { "External id": 154260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154260, "pid": 5, "tid": 7, "ts": 1716454223875466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830112, "dur": 6, "args": { "External id": 154260, "cbid": 211, "correlation": 154260 } }, { "ph": "s", "id": 154260, "pid": 76337, "tid": -914061504, "ts": 1716454223830112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223830169, "dur": 0, "args": { "External id": 154271, "cbid": 317, "correlation": 154271 } }, { "ph": "f", "id": 154271, "pid": 76337, "tid": -914061504, "ts": 1716454223830169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223830170, "dur": 0, "args": { "External id": 154272, "cbid": 203, "correlation": 154272 } }, { "ph": "f", "id": 154272, "pid": 76337, "tid": -914061504, "ts": 1716454223830170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223830171, "dur": 0, "args": { "External id": 154273, "cbid": 205, "correlation": 154273 } }, { "ph": "f", "id": 154273, "pid": 76337, "tid": -914061504, "ts": 1716454223830171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223875499, "dur": 21, "args": { "External id": 154277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154277, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154277, "pid": 5, "tid": 7, "ts": 1716454223875499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830186, "dur": 11, "args": { "External id": 154277, "cbid": 211, "correlation": 154277 } }, { "ph": "s", "id": 154277, "pid": 76337, "tid": -914061504, "ts": 1716454223830186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223875521, "dur": 120, "args": { "External id": 154279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154279, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154279, "pid": 5, "tid": 7, "ts": 1716454223875521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830206, "dur": 8, "args": { "External id": 154279, "cbid": 211, "correlation": 154279 } }, { "ph": "s", "id": 154279, "pid": 76337, "tid": -914061504, "ts": 1716454223830206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223875642, "dur": 21, "args": { "External id": 154281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154281, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154281, "pid": 5, "tid": 7, "ts": 1716454223875642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830218, "dur": 5, "args": { "External id": 154281, "cbid": 211, "correlation": 154281 } }, { "ph": "s", "id": 154281, "pid": 76337, "tid": -914061504, "ts": 1716454223830218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223875664, "dur": 32, "args": { "External id": 154287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154287, "pid": 5, "tid": 7, "ts": 1716454223875664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830245, "dur": 9, "args": { "External id": 154287, "cbid": 211, "correlation": 154287 } }, { "ph": "s", "id": 154287, "pid": 76337, "tid": -914061504, "ts": 1716454223830245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223875698, "dur": 180, "args": { "External id": 154296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154296, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154296, "pid": 5, "tid": 7, "ts": 1716454223875698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830327, "dur": 13, "args": { "External id": 154296, "cbid": 211, "correlation": 154296 } }, { "ph": "s", "id": 154296, "pid": 76337, "tid": -914061504, "ts": 1716454223830327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223875879, "dur": 65, "args": { "External id": 154318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154318, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154318, "pid": 5, "tid": 7, "ts": 1716454223875879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830384, "dur": 10, "args": { "External id": 154318, "cbid": 211, "correlation": 154318 } }, { "ph": "s", "id": 154318, "pid": 76337, "tid": -914061504, "ts": 1716454223830384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223830476, "dur": 1, "args": { "External id": 154329, "cbid": 251, "correlation": 154329 } }, { "ph": "f", "id": 154329, "pid": 76337, "tid": -914061504, "ts": 1716454223830476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223875946, "dur": 156, "args": { "External id": 154330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154330, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154330, "pid": 5, "tid": 7, "ts": 1716454223875946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830481, "dur": 13, "args": { "External id": 154330, "cbid": 211, "correlation": 154330 } }, { "ph": "s", "id": 154330, "pid": 76337, "tid": -914061504, "ts": 1716454223830481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223830551, "dur": 1, "args": { "External id": 154341, "cbid": 251, "correlation": 154341 } }, { "ph": "f", "id": 154341, "pid": 76337, "tid": -914061504, "ts": 1716454223830551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223876103, "dur": 146, "args": { "External id": 154342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154342, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154342, "pid": 5, "tid": 7, "ts": 1716454223876103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830555, "dur": 11, "args": { "External id": 154342, "cbid": 211, "correlation": 154342 } }, { "ph": "s", "id": 154342, "pid": 76337, "tid": -914061504, "ts": 1716454223830555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223830620, "dur": 1, "args": { "External id": 154353, "cbid": 251, "correlation": 154353 } }, { "ph": "f", "id": 154353, "pid": 76337, "tid": -914061504, "ts": 1716454223830620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223876250, "dur": 144, "args": { "External id": 154354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154354, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154354, "pid": 5, "tid": 7, "ts": 1716454223876250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830624, "dur": 11, "args": { "External id": 154354, "cbid": 211, "correlation": 154354 } }, { "ph": "s", "id": 154354, "pid": 76337, "tid": -914061504, "ts": 1716454223830624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223876395, "dur": 1945, "args": { "External id": 154375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154375, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 154375, "pid": 5, "tid": 7, "ts": 1716454223876395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830707, "dur": 13, "args": { "External id": 154375, "cbid": 211, "correlation": 154375 } }, { "ph": "s", "id": 154375, "pid": 76337, "tid": -914061504, "ts": 1716454223830707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223830809, "dur": 1, "args": { "External id": 154393, "cbid": 251, "correlation": 154393 } }, { "ph": "f", "id": 154393, "pid": 76337, "tid": -914061504, "ts": 1716454223830809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223878342, "dur": 146, "args": { "External id": 154395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154395, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 154395, "pid": 5, "tid": 7, "ts": 1716454223878342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830815, "dur": 13, "args": { "External id": 154395, "cbid": 211, "correlation": 154395 } }, { "ph": "s", "id": 154395, "pid": 76337, "tid": -914061504, "ts": 1716454223830815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223878489, "dur": 35, "args": { "External id": 154403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154403, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154403, "pid": 5, "tid": 7, "ts": 1716454223878489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830886, "dur": 12, "args": { "External id": 154403, "cbid": 211, "correlation": 154403 } }, { "ph": "s", "id": 154403, "pid": 76337, "tid": -914061504, "ts": 1716454223830886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223878526, "dur": 50, "args": { "External id": 154411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154411, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154411, "pid": 5, "tid": 7, "ts": 1716454223878526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223830926, "dur": 9, "args": { "External id": 154411, "cbid": 211, "correlation": 154411 } }, { "ph": "s", "id": 154411, "pid": 76337, "tid": -914061504, "ts": 1716454223830926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223878577, "dur": 30, "args": { "External id": 154422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154422, "pid": 5, "tid": 7, "ts": 1716454223878577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831008, "dur": 13, "args": { "External id": 154422, "cbid": 211, "correlation": 154422 } }, { "ph": "s", "id": 154422, "pid": 76337, "tid": -914061504, "ts": 1716454223831008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223878608, "dur": 34, "args": { "External id": 154444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154444, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154444, "pid": 5, "tid": 7, "ts": 1716454223878608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831039, "dur": 8, "args": { "External id": 154444, "cbid": 211, "correlation": 154444 } }, { "ph": "s", "id": 154444, "pid": 76337, "tid": -914061504, "ts": 1716454223831039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831124, "dur": 1, "args": { "External id": 154455, "cbid": 251, "correlation": 154455 } }, { "ph": "f", "id": 154455, "pid": 76337, "tid": -914061504, "ts": 1716454223831124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223878644, "dur": 90, "args": { "External id": 154456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154456, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154456, "pid": 5, "tid": 7, "ts": 1716454223878644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831129, "dur": 13, "args": { "External id": 154456, "cbid": 211, "correlation": 154456 } }, { "ph": "s", "id": 154456, "pid": 76337, "tid": -914061504, "ts": 1716454223831129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831199, "dur": 1, "args": { "External id": 154467, "cbid": 251, "correlation": 154467 } }, { "ph": "f", "id": 154467, "pid": 76337, "tid": -914061504, "ts": 1716454223831199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831203, "dur": 0, "args": { "External id": 154468, "cbid": 251, "correlation": 154468 } }, { "ph": "f", "id": 154468, "pid": 76337, "tid": -914061504, "ts": 1716454223831203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223878736, "dur": 11, "args": { "External id": 154469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154469, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 154469, "pid": 5, "tid": 7, "ts": 1716454223878736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831204, "dur": 12, "args": { "External id": 154469, "cbid": 211, "correlation": 154469 } }, { "ph": "s", "id": 154469, "pid": 76337, "tid": -914061504, "ts": 1716454223831204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223878748, "dur": 5, "args": { "External id": 154471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154471, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 154471, "pid": 5, "tid": 7, "ts": 1716454223878748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831219, "dur": 8, "args": { "External id": 154471, "cbid": 211, "correlation": 154471 } }, { "ph": "s", "id": 154471, "pid": 76337, "tid": -914061504, "ts": 1716454223831219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831278, "dur": 1, "args": { "External id": 154482, "cbid": 251, "correlation": 154482 } }, { "ph": "f", "id": 154482, "pid": 76337, "tid": -914061504, "ts": 1716454223831278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831282, "dur": 0, "args": { "External id": 154483, "cbid": 251, "correlation": 154483 } }, { "ph": "f", "id": 154483, "pid": 76337, "tid": -914061504, "ts": 1716454223831282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223878754, "dur": 7, "args": { "External id": 154484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154484, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 154484, "pid": 5, "tid": 7, "ts": 1716454223878754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831283, "dur": 12, "args": { "External id": 154484, "cbid": 211, "correlation": 154484 } }, { "ph": "s", "id": 154484, "pid": 76337, "tid": -914061504, "ts": 1716454223831283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223878763, "dur": 3, "args": { "External id": 154486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 154486, "pid": 5, "tid": 7, "ts": 1716454223878763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831297, "dur": 6, "args": { "External id": 154486, "cbid": 211, "correlation": 154486 } }, { "ph": "s", "id": 154486, "pid": 76337, "tid": -914061504, "ts": 1716454223831297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223878767, "dur": 92, "args": { "External id": 154507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154507, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 154507, "pid": 5, "tid": 7, "ts": 1716454223878767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831370, "dur": 13, "args": { "External id": 154507, "cbid": 211, "correlation": 154507 } }, { "ph": "s", "id": 154507, "pid": 76337, "tid": -914061504, "ts": 1716454223831370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831469, "dur": 1, "args": { "External id": 154525, "cbid": 251, "correlation": 154525 } }, { "ph": "f", "id": 154525, "pid": 76337, "tid": -914061504, "ts": 1716454223831469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223878861, "dur": 100, "args": { "External id": 154527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154527, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154527, "pid": 5, "tid": 7, "ts": 1716454223878861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831475, "dur": 13, "args": { "External id": 154527, "cbid": 211, "correlation": 154527 } }, { "ph": "s", "id": 154527, "pid": 76337, "tid": -914061504, "ts": 1716454223831475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223878962, "dur": 19, "args": { "External id": 154535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154535, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154535, "pid": 5, "tid": 7, "ts": 1716454223878962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831542, "dur": 12, "args": { "External id": 154535, "cbid": 211, "correlation": 154535 } }, { "ph": "s", "id": 154535, "pid": 76337, "tid": -914061504, "ts": 1716454223831542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223878982, "dur": 37, "args": { "External id": 154543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154543, "pid": 5, "tid": 7, "ts": 1716454223878982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831584, "dur": 9, "args": { "External id": 154543, "cbid": 211, "correlation": 154543 } }, { "ph": "s", "id": 154543, "pid": 76337, "tid": -914061504, "ts": 1716454223831584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223879020, "dur": 34, "args": { "External id": 154565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154565, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154565, "pid": 5, "tid": 7, "ts": 1716454223879020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831635, "dur": 11, "args": { "External id": 154565, "cbid": 211, "correlation": 154565 } }, { "ph": "s", "id": 154565, "pid": 76337, "tid": -914061504, "ts": 1716454223831635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831725, "dur": 1, "args": { "External id": 154581, "cbid": 251, "correlation": 154581 } }, { "ph": "f", "id": 154581, "pid": 76337, "tid": -914061504, "ts": 1716454223831725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831730, "dur": 0, "args": { "External id": 154583, "cbid": 251, "correlation": 154583 } }, { "ph": "f", "id": 154583, "pid": 76337, "tid": -914061504, "ts": 1716454223831730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223879055, "dur": 540, "args": { "External id": 154584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154584, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 154584, "pid": 5, "tid": 7, "ts": 1716454223879055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831733, "dur": 13, "args": { "External id": 154584, "cbid": 211, "correlation": 154584 } }, { "ph": "s", "id": 154584, "pid": 76337, "tid": -914061504, "ts": 1716454223831733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223879596, "dur": 125, "args": { "External id": 154592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154592, "pid": 5, "tid": 7, "ts": 1716454223879596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831799, "dur": 12, "args": { "External id": 154592, "cbid": 211, "correlation": 154592 } }, { "ph": "s", "id": 154592, "pid": 76337, "tid": -914061504, "ts": 1716454223831799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223879723, "dur": 127, "args": { "External id": 154600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154600, "pid": 5, "tid": 7, "ts": 1716454223879723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831829, "dur": 8, "args": { "External id": 154600, "cbid": 211, "correlation": 154600 } }, { "ph": "s", "id": 154600, "pid": 76337, "tid": -914061504, "ts": 1716454223831829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223831907, "dur": 1, "args": { "External id": 154616, "cbid": 251, "correlation": 154616 } }, { "ph": "f", "id": 154616, "pid": 76337, "tid": -914061504, "ts": 1716454223831907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223879852, "dur": 305, "args": { "External id": 154618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154618, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154618, "pid": 5, "tid": 7, "ts": 1716454223879852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831913, "dur": 12, "args": { "External id": 154618, "cbid": 211, "correlation": 154618 } }, { "ph": "s", "id": 154618, "pid": 76337, "tid": -914061504, "ts": 1716454223831913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223880158, "dur": 27, "args": { "External id": 154626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154626, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154626, "pid": 5, "tid": 7, "ts": 1716454223880158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223831955, "dur": 10, "args": { "External id": 154626, "cbid": 211, "correlation": 154626 } }, { "ph": "s", "id": 154626, "pid": 76337, "tid": -914061504, "ts": 1716454223831955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223880186, "dur": 82, "args": { "External id": 154637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154637, "pid": 5, "tid": 7, "ts": 1716454223880186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832034, "dur": 13, "args": { "External id": 154637, "cbid": 211, "correlation": 154637 } }, { "ph": "s", "id": 154637, "pid": 76337, "tid": -914061504, "ts": 1716454223832034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223832099, "dur": 0, "args": { "External id": 154649, "cbid": 317, "correlation": 154649 } }, { "ph": "f", "id": 154649, "pid": 76337, "tid": -914061504, "ts": 1716454223832099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223832100, "dur": 0, "args": { "External id": 154650, "cbid": 203, "correlation": 154650 } }, { "ph": "f", "id": 154650, "pid": 76337, "tid": -914061504, "ts": 1716454223832100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223832101, "dur": 0, "args": { "External id": 154651, "cbid": 205, "correlation": 154651 } }, { "ph": "f", "id": 154651, "pid": 76337, "tid": -914061504, "ts": 1716454223832101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223880269, "dur": 23, "args": { "External id": 154655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154655, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154655, "pid": 5, "tid": 7, "ts": 1716454223880269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832116, "dur": 12, "args": { "External id": 154655, "cbid": 211, "correlation": 154655 } }, { "ph": "s", "id": 154655, "pid": 76337, "tid": -914061504, "ts": 1716454223832116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223880294, "dur": 120, "args": { "External id": 154657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154657, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154657, "pid": 5, "tid": 7, "ts": 1716454223880294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832135, "dur": 7, "args": { "External id": 154657, "cbid": 211, "correlation": 154657 } }, { "ph": "s", "id": 154657, "pid": 76337, "tid": -914061504, "ts": 1716454223832135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223880415, "dur": 22, "args": { "External id": 154659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154659, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154659, "pid": 5, "tid": 7, "ts": 1716454223880415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832146, "dur": 5, "args": { "External id": 154659, "cbid": 211, "correlation": 154659 } }, { "ph": "s", "id": 154659, "pid": 76337, "tid": -914061504, "ts": 1716454223832146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223880438, "dur": 33, "args": { "External id": 154665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154665, "pid": 5, "tid": 7, "ts": 1716454223880438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832173, "dur": 8, "args": { "External id": 154665, "cbid": 211, "correlation": 154665 } }, { "ph": "s", "id": 154665, "pid": 76337, "tid": -914061504, "ts": 1716454223832173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223880473, "dur": 27, "args": { "External id": 154673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154673, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154673, "pid": 5, "tid": 7, "ts": 1716454223880473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832205, "dur": 8, "args": { "External id": 154673, "cbid": 211, "correlation": 154673 } }, { "ph": "s", "id": 154673, "pid": 76337, "tid": -914061504, "ts": 1716454223832205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223880501, "dur": 31, "args": { "External id": 154693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154693, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154693, "pid": 5, "tid": 7, "ts": 1716454223880501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832277, "dur": 12, "args": { "External id": 154693, "cbid": 211, "correlation": 154693 } }, { "ph": "s", "id": 154693, "pid": 76337, "tid": -914061504, "ts": 1716454223832277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223880533, "dur": 4, "args": { "External id": 154705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154705, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 154705, "pid": 5, "tid": 7, "ts": 1716454223880533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832300, "dur": 6, "args": { "External id": 154705, "cbid": 211, "correlation": 154705 } }, { "ph": "s", "id": 154705, "pid": 76337, "tid": -914061504, "ts": 1716454223832300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223880539, "dur": 31, "args": { "External id": 154708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154708, "pid": 5, "tid": 7, "ts": 1716454223880539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832318, "dur": 7, "args": { "External id": 154708, "cbid": 211, "correlation": 154708 } }, { "ph": "s", "id": 154708, "pid": 76337, "tid": -914061504, "ts": 1716454223832318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223880571, "dur": 21, "args": { "External id": 154717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154717, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154717, "pid": 5, "tid": 7, "ts": 1716454223880571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832357, "dur": 9, "args": { "External id": 154717, "cbid": 211, "correlation": 154717 } }, { "ph": "s", "id": 154717, "pid": 76337, "tid": -914061504, "ts": 1716454223832357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223832408, "dur": 0, "args": { "External id": 154727, "cbid": 317, "correlation": 154727 } }, { "ph": "f", "id": 154727, "pid": 76337, "tid": -914061504, "ts": 1716454223832408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223832408, "dur": 0, "args": { "External id": 154728, "cbid": 203, "correlation": 154728 } }, { "ph": "f", "id": 154728, "pid": 76337, "tid": -914061504, "ts": 1716454223832408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223832409, "dur": 0, "args": { "External id": 154729, "cbid": 205, "correlation": 154729 } }, { "ph": "f", "id": 154729, "pid": 76337, "tid": -914061504, "ts": 1716454223832409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223880594, "dur": 22, "args": { "External id": 154733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154733, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154733, "pid": 5, "tid": 7, "ts": 1716454223880594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832423, "dur": 11, "args": { "External id": 154733, "cbid": 211, "correlation": 154733 } }, { "ph": "s", "id": 154733, "pid": 76337, "tid": -914061504, "ts": 1716454223832423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223880618, "dur": 44, "args": { "External id": 154735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154735, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154735, "pid": 5, "tid": 7, "ts": 1716454223880618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832437, "dur": 5, "args": { "External id": 154735, "cbid": 211, "correlation": 154735 } }, { "ph": "s", "id": 154735, "pid": 76337, "tid": -914061504, "ts": 1716454223832437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223880663, "dur": 654, "args": { "External id": 154737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154737, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154737, "pid": 5, "tid": 7, "ts": 1716454223880663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832448, "dur": 7, "args": { "External id": 154737, "cbid": 211, "correlation": 154737 } }, { "ph": "s", "id": 154737, "pid": 76337, "tid": -914061504, "ts": 1716454223832448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223881318, "dur": 22, "args": { "External id": 154739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154739, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154739, "pid": 5, "tid": 7, "ts": 1716454223881318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832458, "dur": 5, "args": { "External id": 154739, "cbid": 211, "correlation": 154739 } }, { "ph": "s", "id": 154739, "pid": 76337, "tid": -914061504, "ts": 1716454223832458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223881341, "dur": 33, "args": { "External id": 154745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154745, "pid": 5, "tid": 7, "ts": 1716454223881341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832485, "dur": 8, "args": { "External id": 154745, "cbid": 211, "correlation": 154745 } }, { "ph": "s", "id": 154745, "pid": 76337, "tid": -914061504, "ts": 1716454223832485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223881375, "dur": 3, "args": { "External id": 154753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154753, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 154753, "pid": 5, "tid": 7, "ts": 1716454223881375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832528, "dur": 9, "args": { "External id": 154753, "cbid": 211, "correlation": 154753 } }, { "ph": "s", "id": 154753, "pid": 76337, "tid": -914061504, "ts": 1716454223832528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223832594, "dur": 2, "args": { "External id": 154769, "cbid": 251, "correlation": 154769 } }, { "ph": "f", "id": 154769, "pid": 76337, "tid": -914061504, "ts": 1716454223832594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223832600, "dur": 0, "args": { "External id": 154771, "cbid": 251, "correlation": 154771 } }, { "ph": "f", "id": 154771, "pid": 76337, "tid": -914061504, "ts": 1716454223832600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223881380, "dur": 12, "args": { "External id": 154772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154772, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 154772, "pid": 5, "tid": 7, "ts": 1716454223881380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832602, "dur": 11, "args": { "External id": 154772, "cbid": 211, "correlation": 154772 } }, { "ph": "s", "id": 154772, "pid": 76337, "tid": -914061504, "ts": 1716454223832602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223881393, "dur": 5, "args": { "External id": 154774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 154774, "pid": 5, "tid": 7, "ts": 1716454223881393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832615, "dur": 5, "args": { "External id": 154774, "cbid": 211, "correlation": 154774 } }, { "ph": "s", "id": 154774, "pid": 76337, "tid": -914061504, "ts": 1716454223832615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223881399, "dur": 30, "args": { "External id": 154784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154784, "pid": 5, "tid": 7, "ts": 1716454223881399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832672, "dur": 12, "args": { "External id": 154784, "cbid": 211, "correlation": 154784 } }, { "ph": "s", "id": 154784, "pid": 76337, "tid": -914061504, "ts": 1716454223832672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223881430, "dur": 30, "args": { "External id": 154804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154804, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154804, "pid": 5, "tid": 7, "ts": 1716454223881430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832738, "dur": 11, "args": { "External id": 154804, "cbid": 211, "correlation": 154804 } }, { "ph": "s", "id": 154804, "pid": 76337, "tid": -914061504, "ts": 1716454223832738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223881461, "dur": 4, "args": { "External id": 154816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154816, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 154816, "pid": 5, "tid": 7, "ts": 1716454223881461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832758, "dur": 6, "args": { "External id": 154816, "cbid": 211, "correlation": 154816 } }, { "ph": "s", "id": 154816, "pid": 76337, "tid": -914061504, "ts": 1716454223832758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223881466, "dur": 30, "args": { "External id": 154819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154819, "pid": 5, "tid": 7, "ts": 1716454223881466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832776, "dur": 6, "args": { "External id": 154819, "cbid": 211, "correlation": 154819 } }, { "ph": "s", "id": 154819, "pid": 76337, "tid": -914061504, "ts": 1716454223832776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223881498, "dur": 20, "args": { "External id": 154828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154828, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154828, "pid": 5, "tid": 7, "ts": 1716454223881498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832817, "dur": 9, "args": { "External id": 154828, "cbid": 211, "correlation": 154828 } }, { "ph": "s", "id": 154828, "pid": 76337, "tid": -914061504, "ts": 1716454223832817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223832879, "dur": 0, "args": { "External id": 154838, "cbid": 317, "correlation": 154838 } }, { "ph": "f", "id": 154838, "pid": 76337, "tid": -914061504, "ts": 1716454223832879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223832880, "dur": 0, "args": { "External id": 154839, "cbid": 203, "correlation": 154839 } }, { "ph": "f", "id": 154839, "pid": 76337, "tid": -914061504, "ts": 1716454223832880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223832881, "dur": 0, "args": { "External id": 154840, "cbid": 205, "correlation": 154840 } }, { "ph": "f", "id": 154840, "pid": 76337, "tid": -914061504, "ts": 1716454223832881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223881520, "dur": 22, "args": { "External id": 154844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154844, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154844, "pid": 5, "tid": 7, "ts": 1716454223881520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832894, "dur": 13, "args": { "External id": 154844, "cbid": 211, "correlation": 154844 } }, { "ph": "s", "id": 154844, "pid": 76337, "tid": -914061504, "ts": 1716454223832894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223881543, "dur": 44, "args": { "External id": 154846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154846, "pid": 5, "tid": 7, "ts": 1716454223881543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832909, "dur": 5, "args": { "External id": 154846, "cbid": 211, "correlation": 154846 } }, { "ph": "s", "id": 154846, "pid": 76337, "tid": -914061504, "ts": 1716454223832909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223881588, "dur": 645, "args": { "External id": 154848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154848, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154848, "pid": 5, "tid": 7, "ts": 1716454223881588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832920, "dur": 6, "args": { "External id": 154848, "cbid": 211, "correlation": 154848 } }, { "ph": "s", "id": 154848, "pid": 76337, "tid": -914061504, "ts": 1716454223832920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223882234, "dur": 22, "args": { "External id": 154850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154850, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154850, "pid": 5, "tid": 7, "ts": 1716454223882234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832929, "dur": 5, "args": { "External id": 154850, "cbid": 211, "correlation": 154850 } }, { "ph": "s", "id": 154850, "pid": 76337, "tid": -914061504, "ts": 1716454223832929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223882257, "dur": 35, "args": { "External id": 154856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154856, "pid": 5, "tid": 7, "ts": 1716454223882257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832957, "dur": 8, "args": { "External id": 154856, "cbid": 211, "correlation": 154856 } }, { "ph": "s", "id": 154856, "pid": 76337, "tid": -914061504, "ts": 1716454223832957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223882293, "dur": 26, "args": { "External id": 154864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154864, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154864, "pid": 5, "tid": 7, "ts": 1716454223882293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223832999, "dur": 9, "args": { "External id": 154864, "cbid": 211, "correlation": 154864 } }, { "ph": "s", "id": 154864, "pid": 76337, "tid": -914061504, "ts": 1716454223832999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223882321, "dur": 20, "args": { "External id": 154872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154872, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154872, "pid": 5, "tid": 7, "ts": 1716454223882321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833029, "dur": 9, "args": { "External id": 154872, "cbid": 211, "correlation": 154872 } }, { "ph": "s", "id": 154872, "pid": 76337, "tid": -914061504, "ts": 1716454223833029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223882342, "dur": 30, "args": { "External id": 154892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154892, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 154892, "pid": 5, "tid": 7, "ts": 1716454223882342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833109, "dur": 12, "args": { "External id": 154892, "cbid": 211, "correlation": 154892 } }, { "ph": "s", "id": 154892, "pid": 76337, "tid": -914061504, "ts": 1716454223833109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223882374, "dur": 4, "args": { "External id": 154904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154904, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 154904, "pid": 5, "tid": 7, "ts": 1716454223882374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833132, "dur": 6, "args": { "External id": 154904, "cbid": 211, "correlation": 154904 } }, { "ph": "s", "id": 154904, "pid": 76337, "tid": -914061504, "ts": 1716454223833132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223882379, "dur": 29, "args": { "External id": 154907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154907, "pid": 5, "tid": 7, "ts": 1716454223882379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833149, "dur": 7, "args": { "External id": 154907, "cbid": 211, "correlation": 154907 } }, { "ph": "s", "id": 154907, "pid": 76337, "tid": -914061504, "ts": 1716454223833149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223833207, "dur": 0, "args": { "External id": 154918, "cbid": 317, "correlation": 154918 } }, { "ph": "f", "id": 154918, "pid": 76337, "tid": -914061504, "ts": 1716454223833207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223833208, "dur": 0, "args": { "External id": 154919, "cbid": 203, "correlation": 154919 } }, { "ph": "f", "id": 154919, "pid": 76337, "tid": -914061504, "ts": 1716454223833208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223833209, "dur": 0, "args": { "External id": 154920, "cbid": 205, "correlation": 154920 } }, { "ph": "f", "id": 154920, "pid": 76337, "tid": -914061504, "ts": 1716454223833209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223882410, "dur": 22, "args": { "External id": 154924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154924, "pid": 5, "tid": 7, "ts": 1716454223882410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833222, "dur": 12, "args": { "External id": 154924, "cbid": 211, "correlation": 154924 } }, { "ph": "s", "id": 154924, "pid": 76337, "tid": -914061504, "ts": 1716454223833222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223882433, "dur": 118, "args": { "External id": 154926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154926, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154926, "pid": 5, "tid": 7, "ts": 1716454223882433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833239, "dur": 7, "args": { "External id": 154926, "cbid": 211, "correlation": 154926 } }, { "ph": "s", "id": 154926, "pid": 76337, "tid": -914061504, "ts": 1716454223833239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223882552, "dur": 22, "args": { "External id": 154928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154928, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154928, "pid": 5, "tid": 7, "ts": 1716454223882552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833250, "dur": 5, "args": { "External id": 154928, "cbid": 211, "correlation": 154928 } }, { "ph": "s", "id": 154928, "pid": 76337, "tid": -914061504, "ts": 1716454223833250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223882575, "dur": 32, "args": { "External id": 154934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154934, "pid": 5, "tid": 7, "ts": 1716454223882575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833277, "dur": 8, "args": { "External id": 154934, "cbid": 211, "correlation": 154934 } }, { "ph": "s", "id": 154934, "pid": 76337, "tid": -914061504, "ts": 1716454223833277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223882609, "dur": 200, "args": { "External id": 154943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154943, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154943, "pid": 5, "tid": 7, "ts": 1716454223882609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833358, "dur": 14, "args": { "External id": 154943, "cbid": 211, "correlation": 154943 } }, { "ph": "s", "id": 154943, "pid": 76337, "tid": -914061504, "ts": 1716454223833358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223882810, "dur": 65, "args": { "External id": 154965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154965, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 154965, "pid": 5, "tid": 7, "ts": 1716454223882810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833413, "dur": 10, "args": { "External id": 154965, "cbid": 211, "correlation": 154965 } }, { "ph": "s", "id": 154965, "pid": 76337, "tid": -914061504, "ts": 1716454223833413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223833499, "dur": 1, "args": { "External id": 154976, "cbid": 251, "correlation": 154976 } }, { "ph": "f", "id": 154976, "pid": 76337, "tid": -914061504, "ts": 1716454223833499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223882876, "dur": 154, "args": { "External id": 154977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154977, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154977, "pid": 5, "tid": 7, "ts": 1716454223882876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833504, "dur": 13, "args": { "External id": 154977, "cbid": 211, "correlation": 154977 } }, { "ph": "s", "id": 154977, "pid": 76337, "tid": -914061504, "ts": 1716454223833504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223833574, "dur": 1, "args": { "External id": 154988, "cbid": 251, "correlation": 154988 } }, { "ph": "f", "id": 154988, "pid": 76337, "tid": -914061504, "ts": 1716454223833574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223883032, "dur": 146, "args": { "External id": 154989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 154989, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 154989, "pid": 5, "tid": 7, "ts": 1716454223883032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833578, "dur": 12, "args": { "External id": 154989, "cbid": 211, "correlation": 154989 } }, { "ph": "s", "id": 154989, "pid": 76337, "tid": -914061504, "ts": 1716454223833578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223833645, "dur": 1, "args": { "External id": 155000, "cbid": 251, "correlation": 155000 } }, { "ph": "f", "id": 155000, "pid": 76337, "tid": -914061504, "ts": 1716454223833645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223883179, "dur": 149, "args": { "External id": 155001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155001, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155001, "pid": 5, "tid": 7, "ts": 1716454223883179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833649, "dur": 11, "args": { "External id": 155001, "cbid": 211, "correlation": 155001 } }, { "ph": "s", "id": 155001, "pid": 76337, "tid": -914061504, "ts": 1716454223833649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223883329, "dur": 1945, "args": { "External id": 155022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155022, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 155022, "pid": 5, "tid": 7, "ts": 1716454223883329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833729, "dur": 14, "args": { "External id": 155022, "cbid": 211, "correlation": 155022 } }, { "ph": "s", "id": 155022, "pid": 76337, "tid": -914061504, "ts": 1716454223833729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223833829, "dur": 1, "args": { "External id": 155040, "cbid": 251, "correlation": 155040 } }, { "ph": "f", "id": 155040, "pid": 76337, "tid": -914061504, "ts": 1716454223833829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223885276, "dur": 147, "args": { "External id": 155042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155042, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155042, "pid": 5, "tid": 7, "ts": 1716454223885276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833835, "dur": 14, "args": { "External id": 155042, "cbid": 211, "correlation": 155042 } }, { "ph": "s", "id": 155042, "pid": 76337, "tid": -914061504, "ts": 1716454223833835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223885425, "dur": 36, "args": { "External id": 155050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155050, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155050, "pid": 5, "tid": 7, "ts": 1716454223885425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833905, "dur": 12, "args": { "External id": 155050, "cbid": 211, "correlation": 155050 } }, { "ph": "s", "id": 155050, "pid": 76337, "tid": -914061504, "ts": 1716454223833905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223885461, "dur": 51, "args": { "External id": 155058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155058, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155058, "pid": 5, "tid": 7, "ts": 1716454223885461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223833945, "dur": 8, "args": { "External id": 155058, "cbid": 211, "correlation": 155058 } }, { "ph": "s", "id": 155058, "pid": 76337, "tid": -914061504, "ts": 1716454223833945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223885514, "dur": 31, "args": { "External id": 155069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155069, "pid": 5, "tid": 7, "ts": 1716454223885514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834027, "dur": 14, "args": { "External id": 155069, "cbid": 211, "correlation": 155069 } }, { "ph": "s", "id": 155069, "pid": 76337, "tid": -914061504, "ts": 1716454223834027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223885545, "dur": 35, "args": { "External id": 155091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155091, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155091, "pid": 5, "tid": 7, "ts": 1716454223885545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834060, "dur": 7, "args": { "External id": 155091, "cbid": 211, "correlation": 155091 } }, { "ph": "s", "id": 155091, "pid": 76337, "tid": -914061504, "ts": 1716454223834060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834144, "dur": 1, "args": { "External id": 155102, "cbid": 251, "correlation": 155102 } }, { "ph": "f", "id": 155102, "pid": 76337, "tid": -914061504, "ts": 1716454223834144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223885582, "dur": 87, "args": { "External id": 155103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155103, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155103, "pid": 5, "tid": 7, "ts": 1716454223885582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834149, "dur": 13, "args": { "External id": 155103, "cbid": 211, "correlation": 155103 } }, { "ph": "s", "id": 155103, "pid": 76337, "tid": -914061504, "ts": 1716454223834149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834218, "dur": 1, "args": { "External id": 155114, "cbid": 251, "correlation": 155114 } }, { "ph": "f", "id": 155114, "pid": 76337, "tid": -914061504, "ts": 1716454223834218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834221, "dur": 0, "args": { "External id": 155115, "cbid": 251, "correlation": 155115 } }, { "ph": "f", "id": 155115, "pid": 76337, "tid": -914061504, "ts": 1716454223834221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223885670, "dur": 11, "args": { "External id": 155116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155116, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 155116, "pid": 5, "tid": 7, "ts": 1716454223885670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834223, "dur": 11, "args": { "External id": 155116, "cbid": 211, "correlation": 155116 } }, { "ph": "s", "id": 155116, "pid": 76337, "tid": -914061504, "ts": 1716454223834223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223885683, "dur": 5, "args": { "External id": 155118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155118, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 155118, "pid": 5, "tid": 7, "ts": 1716454223885683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834236, "dur": 6, "args": { "External id": 155118, "cbid": 211, "correlation": 155118 } }, { "ph": "s", "id": 155118, "pid": 76337, "tid": -914061504, "ts": 1716454223834236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834294, "dur": 1, "args": { "External id": 155129, "cbid": 251, "correlation": 155129 } }, { "ph": "f", "id": 155129, "pid": 76337, "tid": -914061504, "ts": 1716454223834294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834298, "dur": 0, "args": { "External id": 155130, "cbid": 251, "correlation": 155130 } }, { "ph": "f", "id": 155130, "pid": 76337, "tid": -914061504, "ts": 1716454223834298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223885690, "dur": 7, "args": { "External id": 155131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155131, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 155131, "pid": 5, "tid": 7, "ts": 1716454223885690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834300, "dur": 11, "args": { "External id": 155131, "cbid": 211, "correlation": 155131 } }, { "ph": "s", "id": 155131, "pid": 76337, "tid": -914061504, "ts": 1716454223834300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223885698, "dur": 3, "args": { "External id": 155133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155133, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 155133, "pid": 5, "tid": 7, "ts": 1716454223885698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834313, "dur": 5, "args": { "External id": 155133, "cbid": 211, "correlation": 155133 } }, { "ph": "s", "id": 155133, "pid": 76337, "tid": -914061504, "ts": 1716454223834313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223885702, "dur": 91, "args": { "External id": 155154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155154, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 155154, "pid": 5, "tid": 7, "ts": 1716454223885702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834386, "dur": 13, "args": { "External id": 155154, "cbid": 211, "correlation": 155154 } }, { "ph": "s", "id": 155154, "pid": 76337, "tid": -914061504, "ts": 1716454223834386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834484, "dur": 1, "args": { "External id": 155172, "cbid": 251, "correlation": 155172 } }, { "ph": "f", "id": 155172, "pid": 76337, "tid": -914061504, "ts": 1716454223834484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223885795, "dur": 99, "args": { "External id": 155174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155174, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155174, "pid": 5, "tid": 7, "ts": 1716454223885795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834489, "dur": 14, "args": { "External id": 155174, "cbid": 211, "correlation": 155174 } }, { "ph": "s", "id": 155174, "pid": 76337, "tid": -914061504, "ts": 1716454223834489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223885895, "dur": 19, "args": { "External id": 155182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155182, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155182, "pid": 5, "tid": 7, "ts": 1716454223885895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834559, "dur": 12, "args": { "External id": 155182, "cbid": 211, "correlation": 155182 } }, { "ph": "s", "id": 155182, "pid": 76337, "tid": -914061504, "ts": 1716454223834559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223885915, "dur": 37, "args": { "External id": 155190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155190, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155190, "pid": 5, "tid": 7, "ts": 1716454223885915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834601, "dur": 9, "args": { "External id": 155190, "cbid": 211, "correlation": 155190 } }, { "ph": "s", "id": 155190, "pid": 76337, "tid": -914061504, "ts": 1716454223834601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223885953, "dur": 34, "args": { "External id": 155212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155212, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155212, "pid": 5, "tid": 7, "ts": 1716454223885953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834653, "dur": 10, "args": { "External id": 155212, "cbid": 211, "correlation": 155212 } }, { "ph": "s", "id": 155212, "pid": 76337, "tid": -914061504, "ts": 1716454223834653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834741, "dur": 1, "args": { "External id": 155228, "cbid": 251, "correlation": 155228 } }, { "ph": "f", "id": 155228, "pid": 76337, "tid": -914061504, "ts": 1716454223834741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834746, "dur": 0, "args": { "External id": 155230, "cbid": 251, "correlation": 155230 } }, { "ph": "f", "id": 155230, "pid": 76337, "tid": -914061504, "ts": 1716454223834746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223885989, "dur": 540, "args": { "External id": 155231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155231, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155231, "pid": 5, "tid": 7, "ts": 1716454223885989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834750, "dur": 13, "args": { "External id": 155231, "cbid": 211, "correlation": 155231 } }, { "ph": "s", "id": 155231, "pid": 76337, "tid": -914061504, "ts": 1716454223834750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223886531, "dur": 126, "args": { "External id": 155239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155239, "pid": 5, "tid": 7, "ts": 1716454223886531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834815, "dur": 12, "args": { "External id": 155239, "cbid": 211, "correlation": 155239 } }, { "ph": "s", "id": 155239, "pid": 76337, "tid": -914061504, "ts": 1716454223834815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223886658, "dur": 126, "args": { "External id": 155247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155247, "pid": 5, "tid": 7, "ts": 1716454223886658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834846, "dur": 8, "args": { "External id": 155247, "cbid": 211, "correlation": 155247 } }, { "ph": "s", "id": 155247, "pid": 76337, "tid": -914061504, "ts": 1716454223834846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223834924, "dur": 1, "args": { "External id": 155263, "cbid": 251, "correlation": 155263 } }, { "ph": "f", "id": 155263, "pid": 76337, "tid": -914061504, "ts": 1716454223834924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223886785, "dur": 305, "args": { "External id": 155265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155265, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155265, "pid": 5, "tid": 7, "ts": 1716454223886785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834929, "dur": 12, "args": { "External id": 155265, "cbid": 211, "correlation": 155265 } }, { "ph": "s", "id": 155265, "pid": 76337, "tid": -914061504, "ts": 1716454223834929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223887091, "dur": 27, "args": { "External id": 155273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155273, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155273, "pid": 5, "tid": 7, "ts": 1716454223887091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223834972, "dur": 22, "args": { "External id": 155273, "cbid": 211, "correlation": 155273 } }, { "ph": "s", "id": 155273, "pid": 76337, "tid": -914061504, "ts": 1716454223834972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223887120, "dur": 81, "args": { "External id": 155284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155284, "pid": 5, "tid": 7, "ts": 1716454223887120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835054, "dur": 12, "args": { "External id": 155284, "cbid": 211, "correlation": 155284 } }, { "ph": "s", "id": 155284, "pid": 76337, "tid": -914061504, "ts": 1716454223835054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223835118, "dur": 0, "args": { "External id": 155296, "cbid": 317, "correlation": 155296 } }, { "ph": "f", "id": 155296, "pid": 76337, "tid": -914061504, "ts": 1716454223835118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223835119, "dur": 0, "args": { "External id": 155297, "cbid": 203, "correlation": 155297 } }, { "ph": "f", "id": 155297, "pid": 76337, "tid": -914061504, "ts": 1716454223835119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223835120, "dur": 0, "args": { "External id": 155298, "cbid": 205, "correlation": 155298 } }, { "ph": "f", "id": 155298, "pid": 76337, "tid": -914061504, "ts": 1716454223835120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887203, "dur": 23, "args": { "External id": 155302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155302, "pid": 5, "tid": 7, "ts": 1716454223887203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835136, "dur": 13, "args": { "External id": 155302, "cbid": 211, "correlation": 155302 } }, { "ph": "s", "id": 155302, "pid": 76337, "tid": -914061504, "ts": 1716454223835136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223887227, "dur": 121, "args": { "External id": 155304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155304, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155304, "pid": 5, "tid": 7, "ts": 1716454223887227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835155, "dur": 6, "args": { "External id": 155304, "cbid": 211, "correlation": 155304 } }, { "ph": "s", "id": 155304, "pid": 76337, "tid": -914061504, "ts": 1716454223835155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887349, "dur": 24, "args": { "External id": 155306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155306, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155306, "pid": 5, "tid": 7, "ts": 1716454223887349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835165, "dur": 5, "args": { "External id": 155306, "cbid": 211, "correlation": 155306 } }, { "ph": "s", "id": 155306, "pid": 76337, "tid": -914061504, "ts": 1716454223835165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223887374, "dur": 32, "args": { "External id": 155312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155312, "pid": 5, "tid": 7, "ts": 1716454223887374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835193, "dur": 9, "args": { "External id": 155312, "cbid": 211, "correlation": 155312 } }, { "ph": "s", "id": 155312, "pid": 76337, "tid": -914061504, "ts": 1716454223835193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223887408, "dur": 27, "args": { "External id": 155320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155320, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155320, "pid": 5, "tid": 7, "ts": 1716454223887408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835225, "dur": 8, "args": { "External id": 155320, "cbid": 211, "correlation": 155320 } }, { "ph": "s", "id": 155320, "pid": 76337, "tid": -914061504, "ts": 1716454223835225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223835296, "dur": 0, "args": { "External id": 155330, "cbid": 317, "correlation": 155330 } }, { "ph": "f", "id": 155330, "pid": 76337, "tid": -914061504, "ts": 1716454223835296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223835297, "dur": 0, "args": { "External id": 155331, "cbid": 203, "correlation": 155331 } }, { "ph": "f", "id": 155331, "pid": 76337, "tid": -914061504, "ts": 1716454223835297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223835298, "dur": 0, "args": { "External id": 155332, "cbid": 205, "correlation": 155332 } }, { "ph": "f", "id": 155332, "pid": 76337, "tid": -914061504, "ts": 1716454223835298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887436, "dur": 23, "args": { "External id": 155336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155336, "pid": 5, "tid": 7, "ts": 1716454223887436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835312, "dur": 12, "args": { "External id": 155336, "cbid": 211, "correlation": 155336 } }, { "ph": "s", "id": 155336, "pid": 76337, "tid": -914061504, "ts": 1716454223835312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887460, "dur": 44, "args": { "External id": 155338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155338, "pid": 5, "tid": 7, "ts": 1716454223887460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835327, "dur": 5, "args": { "External id": 155338, "cbid": 211, "correlation": 155338 } }, { "ph": "s", "id": 155338, "pid": 76337, "tid": -914061504, "ts": 1716454223835327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223887506, "dur": 235, "args": { "External id": 155340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155340, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 155340, "pid": 5, "tid": 7, "ts": 1716454223887506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835338, "dur": 8, "args": { "External id": 155340, "cbid": 211, "correlation": 155340 } }, { "ph": "s", "id": 155340, "pid": 76337, "tid": -914061504, "ts": 1716454223835338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887742, "dur": 6, "args": { "External id": 155342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155342, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155342, "pid": 5, "tid": 7, "ts": 1716454223887742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835350, "dur": 5, "args": { "External id": 155342, "cbid": 211, "correlation": 155342 } }, { "ph": "s", "id": 155342, "pid": 76337, "tid": -914061504, "ts": 1716454223835350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223887750, "dur": 9, "args": { "External id": 155348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155348, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155348, "pid": 5, "tid": 7, "ts": 1716454223887750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835376, "dur": 8, "args": { "External id": 155348, "cbid": 211, "correlation": 155348 } }, { "ph": "s", "id": 155348, "pid": 76337, "tid": -914061504, "ts": 1716454223835376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223887760, "dur": 12, "args": { "External id": 155368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155368, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 155368, "pid": 5, "tid": 7, "ts": 1716454223887760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835468, "dur": 12, "args": { "External id": 155368, "cbid": 211, "correlation": 155368 } }, { "ph": "s", "id": 155368, "pid": 76337, "tid": -914061504, "ts": 1716454223835468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223887773, "dur": 4, "args": { "External id": 155380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155380, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 155380, "pid": 5, "tid": 7, "ts": 1716454223887773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835491, "dur": 7, "args": { "External id": 155380, "cbid": 211, "correlation": 155380 } }, { "ph": "s", "id": 155380, "pid": 76337, "tid": -914061504, "ts": 1716454223835491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223887779, "dur": 12, "args": { "External id": 155383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155383, "pid": 5, "tid": 7, "ts": 1716454223887779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835509, "dur": 6, "args": { "External id": 155383, "cbid": 211, "correlation": 155383 } }, { "ph": "s", "id": 155383, "pid": 76337, "tid": -914061504, "ts": 1716454223835509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223887792, "dur": 7, "args": { "External id": 155392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155392, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155392, "pid": 5, "tid": 7, "ts": 1716454223887792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835549, "dur": 9, "args": { "External id": 155392, "cbid": 211, "correlation": 155392 } }, { "ph": "s", "id": 155392, "pid": 76337, "tid": -914061504, "ts": 1716454223835549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223835601, "dur": 0, "args": { "External id": 155402, "cbid": 317, "correlation": 155402 } }, { "ph": "f", "id": 155402, "pid": 76337, "tid": -914061504, "ts": 1716454223835601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223835602, "dur": 0, "args": { "External id": 155403, "cbid": 203, "correlation": 155403 } }, { "ph": "f", "id": 155403, "pid": 76337, "tid": -914061504, "ts": 1716454223835602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223835603, "dur": 0, "args": { "External id": 155404, "cbid": 205, "correlation": 155404 } }, { "ph": "f", "id": 155404, "pid": 76337, "tid": -914061504, "ts": 1716454223835603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887801, "dur": 6, "args": { "External id": 155408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155408, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155408, "pid": 5, "tid": 7, "ts": 1716454223887801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835618, "dur": 11, "args": { "External id": 155408, "cbid": 211, "correlation": 155408 } }, { "ph": "s", "id": 155408, "pid": 76337, "tid": -914061504, "ts": 1716454223835618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223887807, "dur": 84, "args": { "External id": 155410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155410, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155410, "pid": 5, "tid": 7, "ts": 1716454223887807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835632, "dur": 5, "args": { "External id": 155410, "cbid": 211, "correlation": 155410 } }, { "ph": "s", "id": 155410, "pid": 76337, "tid": -914061504, "ts": 1716454223835632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223887893, "dur": 1, "args": { "External id": 155412, "device": 5, "context": 1, "stream": 7, "correlation": 155412, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 155412, "pid": 5, "tid": 7, "ts": 1716454223887893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223835645, "dur": 9, "args": { "External id": 155412, "cbid": 51, "correlation": 155412 } }, { "ph": "s", "id": 155412, "pid": 76337, "tid": -914061504, "ts": 1716454223835645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223887897, "dur": 537, "args": { "External id": 155413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155413, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155413, "pid": 5, "tid": 7, "ts": 1716454223887897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835656, "dur": 9, "args": { "External id": 155413, "cbid": 211, "correlation": 155413 } }, { "ph": "s", "id": 155413, "pid": 76337, "tid": -914061504, "ts": 1716454223835656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223888435, "dur": 12, "args": { "External id": 155415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155415, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155415, "pid": 5, "tid": 7, "ts": 1716454223888435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835668, "dur": 5, "args": { "External id": 155415, "cbid": 211, "correlation": 155415 } }, { "ph": "s", "id": 155415, "pid": 76337, "tid": -914061504, "ts": 1716454223835668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223888449, "dur": 14, "args": { "External id": 155421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155421, "pid": 5, "tid": 7, "ts": 1716454223888449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835697, "dur": 8, "args": { "External id": 155421, "cbid": 211, "correlation": 155421 } }, { "ph": "s", "id": 155421, "pid": 76337, "tid": -914061504, "ts": 1716454223835697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223888465, "dur": 3, "args": { "External id": 155429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155429, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 155429, "pid": 5, "tid": 7, "ts": 1716454223888465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835741, "dur": 9, "args": { "External id": 155429, "cbid": 211, "correlation": 155429 } }, { "ph": "s", "id": 155429, "pid": 76337, "tid": -914061504, "ts": 1716454223835741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223835805, "dur": 1, "args": { "External id": 155445, "cbid": 251, "correlation": 155445 } }, { "ph": "f", "id": 155445, "pid": 76337, "tid": -914061504, "ts": 1716454223835805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223835810, "dur": 0, "args": { "External id": 155447, "cbid": 251, "correlation": 155447 } }, { "ph": "f", "id": 155447, "pid": 76337, "tid": -914061504, "ts": 1716454223835810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223888470, "dur": 13, "args": { "External id": 155448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155448, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155448, "pid": 5, "tid": 7, "ts": 1716454223888470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835812, "dur": 11, "args": { "External id": 155448, "cbid": 211, "correlation": 155448 } }, { "ph": "s", "id": 155448, "pid": 76337, "tid": -914061504, "ts": 1716454223835812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223888484, "dur": 5, "args": { "External id": 155450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155450, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155450, "pid": 5, "tid": 7, "ts": 1716454223888484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835825, "dur": 5, "args": { "External id": 155450, "cbid": 211, "correlation": 155450 } }, { "ph": "s", "id": 155450, "pid": 76337, "tid": -914061504, "ts": 1716454223835825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223888490, "dur": 17, "args": { "External id": 155460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155460, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155460, "pid": 5, "tid": 7, "ts": 1716454223888490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835883, "dur": 13, "args": { "External id": 155460, "cbid": 211, "correlation": 155460 } }, { "ph": "s", "id": 155460, "pid": 76337, "tid": -914061504, "ts": 1716454223835883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223888509, "dur": 18, "args": { "External id": 155480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155480, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 155480, "pid": 5, "tid": 7, "ts": 1716454223888509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835951, "dur": 10, "args": { "External id": 155480, "cbid": 211, "correlation": 155480 } }, { "ph": "s", "id": 155480, "pid": 76337, "tid": -914061504, "ts": 1716454223835951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223888528, "dur": 5, "args": { "External id": 155492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155492, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 155492, "pid": 5, "tid": 7, "ts": 1716454223888528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223835972, "dur": 16, "args": { "External id": 155492, "cbid": 211, "correlation": 155492 } }, { "ph": "s", "id": 155492, "pid": 76337, "tid": -914061504, "ts": 1716454223835972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223888534, "dur": 16, "args": { "External id": 155495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155495, "pid": 5, "tid": 7, "ts": 1716454223888534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836001, "dur": 7, "args": { "External id": 155495, "cbid": 211, "correlation": 155495 } }, { "ph": "s", "id": 155495, "pid": 76337, "tid": -914061504, "ts": 1716454223836001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223888552, "dur": 11, "args": { "External id": 155504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155504, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155504, "pid": 5, "tid": 7, "ts": 1716454223888552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836043, "dur": 10, "args": { "External id": 155504, "cbid": 211, "correlation": 155504 } }, { "ph": "s", "id": 155504, "pid": 76337, "tid": -914061504, "ts": 1716454223836043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223836105, "dur": 0, "args": { "External id": 155514, "cbid": 317, "correlation": 155514 } }, { "ph": "f", "id": 155514, "pid": 76337, "tid": -914061504, "ts": 1716454223836105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223836105, "dur": 0, "args": { "External id": 155515, "cbid": 203, "correlation": 155515 } }, { "ph": "f", "id": 155515, "pid": 76337, "tid": -914061504, "ts": 1716454223836105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223836106, "dur": 0, "args": { "External id": 155516, "cbid": 205, "correlation": 155516 } }, { "ph": "f", "id": 155516, "pid": 76337, "tid": -914061504, "ts": 1716454223836106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223888564, "dur": 11, "args": { "External id": 155520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155520, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155520, "pid": 5, "tid": 7, "ts": 1716454223888564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836123, "dur": 12, "args": { "External id": 155520, "cbid": 211, "correlation": 155520 } }, { "ph": "s", "id": 155520, "pid": 76337, "tid": -914061504, "ts": 1716454223836123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223888576, "dur": 163, "args": { "External id": 155522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155522, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155522, "pid": 5, "tid": 7, "ts": 1716454223888576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836138, "dur": 5, "args": { "External id": 155522, "cbid": 211, "correlation": 155522 } }, { "ph": "s", "id": 155522, "pid": 76337, "tid": -914061504, "ts": 1716454223836138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223888741, "dur": 1, "args": { "External id": 155524, "device": 5, "context": 1, "stream": 7, "correlation": 155524, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 155524, "pid": 5, "tid": 7, "ts": 1716454223888741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223836151, "dur": 6, "args": { "External id": 155524, "cbid": 51, "correlation": 155524 } }, { "ph": "s", "id": 155524, "pid": 76337, "tid": -914061504, "ts": 1716454223836151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223888745, "dur": 662, "args": { "External id": 155525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155525, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155525, "pid": 5, "tid": 7, "ts": 1716454223888745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836159, "dur": 7, "args": { "External id": 155525, "cbid": 211, "correlation": 155525 } }, { "ph": "s", "id": 155525, "pid": 76337, "tid": -914061504, "ts": 1716454223836159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223889409, "dur": 13, "args": { "External id": 155527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155527, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155527, "pid": 5, "tid": 7, "ts": 1716454223889409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836169, "dur": 5, "args": { "External id": 155527, "cbid": 211, "correlation": 155527 } }, { "ph": "s", "id": 155527, "pid": 76337, "tid": -914061504, "ts": 1716454223836169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223889423, "dur": 15, "args": { "External id": 155533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155533, "pid": 5, "tid": 7, "ts": 1716454223889423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836198, "dur": 8, "args": { "External id": 155533, "cbid": 211, "correlation": 155533 } }, { "ph": "s", "id": 155533, "pid": 76337, "tid": -914061504, "ts": 1716454223836198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223836256, "dur": 0, "args": { "External id": 155543, "cbid": 317, "correlation": 155543 } }, { "ph": "f", "id": 155543, "pid": 76337, "tid": -914061504, "ts": 1716454223836256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223836257, "dur": 0, "args": { "External id": 155544, "cbid": 203, "correlation": 155544 } }, { "ph": "f", "id": 155544, "pid": 76337, "tid": -914061504, "ts": 1716454223836257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223836257, "dur": 0, "args": { "External id": 155545, "cbid": 205, "correlation": 155545 } }, { "ph": "f", "id": 155545, "pid": 76337, "tid": -914061504, "ts": 1716454223836257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223889439, "dur": 9, "args": { "External id": 155549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155549, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155549, "pid": 5, "tid": 7, "ts": 1716454223889439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836272, "dur": 11, "args": { "External id": 155549, "cbid": 211, "correlation": 155549 } }, { "ph": "s", "id": 155549, "pid": 76337, "tid": -914061504, "ts": 1716454223836272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223889449, "dur": 4, "args": { "External id": 155551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155551, "pid": 5, "tid": 7, "ts": 1716454223889449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836289, "dur": 6, "args": { "External id": 155551, "cbid": 211, "correlation": 155551 } }, { "ph": "s", "id": 155551, "pid": 76337, "tid": -914061504, "ts": 1716454223836289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223836298, "dur": 0, "args": { "External id": 155552, "cbid": 51, "correlation": 155552 } }, { "ph": "s", "id": 155552, "pid": 76337, "tid": -914061504, "ts": 1716454223836298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223889454, "dur": 56, "args": { "External id": 155553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155553, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 155553, "pid": 5, "tid": 7, "ts": 1716454223889454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836299, "dur": 5, "args": { "External id": 155553, "cbid": 211, "correlation": 155553 } }, { "ph": "s", "id": 155553, "pid": 76337, "tid": -914061504, "ts": 1716454223836299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223889511, "dur": 14, "args": { "External id": 155558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155558, "pid": 5, "tid": 7, "ts": 1716454223889511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836325, "dur": 9, "args": { "External id": 155558, "cbid": 211, "correlation": 155558 } }, { "ph": "s", "id": 155558, "pid": 76337, "tid": -914061504, "ts": 1716454223836325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223889527, "dur": 12, "args": { "External id": 155566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155566, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155566, "pid": 5, "tid": 7, "ts": 1716454223889527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836354, "dur": 8, "args": { "External id": 155566, "cbid": 211, "correlation": 155566 } }, { "ph": "s", "id": 155566, "pid": 76337, "tid": -914061504, "ts": 1716454223836354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223889540, "dur": 10, "args": { "External id": 155574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155574, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155574, "pid": 5, "tid": 7, "ts": 1716454223889540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836383, "dur": 9, "args": { "External id": 155574, "cbid": 211, "correlation": 155574 } }, { "ph": "s", "id": 155574, "pid": 76337, "tid": -914061504, "ts": 1716454223836383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223889552, "dur": 18, "args": { "External id": 155594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155594, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 155594, "pid": 5, "tid": 7, "ts": 1716454223889552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836463, "dur": 12, "args": { "External id": 155594, "cbid": 211, "correlation": 155594 } }, { "ph": "s", "id": 155594, "pid": 76337, "tid": -914061504, "ts": 1716454223836463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223889571, "dur": 4, "args": { "External id": 155606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155606, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 155606, "pid": 5, "tid": 7, "ts": 1716454223889571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836485, "dur": 7, "args": { "External id": 155606, "cbid": 211, "correlation": 155606 } }, { "ph": "s", "id": 155606, "pid": 76337, "tid": -914061504, "ts": 1716454223836485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223889577, "dur": 17, "args": { "External id": 155609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155609, "pid": 5, "tid": 7, "ts": 1716454223889577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836504, "dur": 6, "args": { "External id": 155609, "cbid": 211, "correlation": 155609 } }, { "ph": "s", "id": 155609, "pid": 76337, "tid": -914061504, "ts": 1716454223836504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223836561, "dur": 0, "args": { "External id": 155620, "cbid": 317, "correlation": 155620 } }, { "ph": "f", "id": 155620, "pid": 76337, "tid": -914061504, "ts": 1716454223836561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223836562, "dur": 0, "args": { "External id": 155621, "cbid": 203, "correlation": 155621 } }, { "ph": "f", "id": 155621, "pid": 76337, "tid": -914061504, "ts": 1716454223836562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223836563, "dur": 0, "args": { "External id": 155622, "cbid": 205, "correlation": 155622 } }, { "ph": "f", "id": 155622, "pid": 76337, "tid": -914061504, "ts": 1716454223836563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223889595, "dur": 11, "args": { "External id": 155626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155626, "pid": 5, "tid": 7, "ts": 1716454223889595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836578, "dur": 13, "args": { "External id": 155626, "cbid": 211, "correlation": 155626 } }, { "ph": "s", "id": 155626, "pid": 76337, "tid": -914061504, "ts": 1716454223836578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223889607, "dur": 3, "args": { "External id": 155628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155628, "pid": 5, "tid": 7, "ts": 1716454223889607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836596, "dur": 6, "args": { "External id": 155628, "cbid": 211, "correlation": 155628 } }, { "ph": "s", "id": 155628, "pid": 76337, "tid": -914061504, "ts": 1716454223836596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223836604, "dur": 0, "args": { "External id": 155629, "cbid": 51, "correlation": 155629 } }, { "ph": "s", "id": 155629, "pid": 76337, "tid": -914061504, "ts": 1716454223836604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223889612, "dur": 98, "args": { "External id": 155630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155630, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 155630, "pid": 5, "tid": 7, "ts": 1716454223889612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836605, "dur": 6, "args": { "External id": 155630, "cbid": 211, "correlation": 155630 } }, { "ph": "s", "id": 155630, "pid": 76337, "tid": -914061504, "ts": 1716454223836605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223889711, "dur": 16, "args": { "External id": 155635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155635, "pid": 5, "tid": 7, "ts": 1716454223889711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836634, "dur": 9, "args": { "External id": 155635, "cbid": 211, "correlation": 155635 } }, { "ph": "s", "id": 155635, "pid": 76337, "tid": -914061504, "ts": 1716454223836634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223889728, "dur": 84, "args": { "External id": 155644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155644, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155644, "pid": 5, "tid": 7, "ts": 1716454223889728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836717, "dur": 14, "args": { "External id": 155644, "cbid": 211, "correlation": 155644 } }, { "ph": "s", "id": 155644, "pid": 76337, "tid": -914061504, "ts": 1716454223836717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223889814, "dur": 30, "args": { "External id": 155666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155666, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155666, "pid": 5, "tid": 7, "ts": 1716454223889814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836774, "dur": 10, "args": { "External id": 155666, "cbid": 211, "correlation": 155666 } }, { "ph": "s", "id": 155666, "pid": 76337, "tid": -914061504, "ts": 1716454223836774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223836865, "dur": 1, "args": { "External id": 155677, "cbid": 251, "correlation": 155677 } }, { "ph": "f", "id": 155677, "pid": 76337, "tid": -914061504, "ts": 1716454223836865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223889845, "dur": 163, "args": { "External id": 155678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155678, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155678, "pid": 5, "tid": 7, "ts": 1716454223889845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836871, "dur": 13, "args": { "External id": 155678, "cbid": 211, "correlation": 155678 } }, { "ph": "s", "id": 155678, "pid": 76337, "tid": -914061504, "ts": 1716454223836871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223836944, "dur": 1, "args": { "External id": 155689, "cbid": 251, "correlation": 155689 } }, { "ph": "f", "id": 155689, "pid": 76337, "tid": -914061504, "ts": 1716454223836944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223890009, "dur": 157, "args": { "External id": 155690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155690, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155690, "pid": 5, "tid": 7, "ts": 1716454223890009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223836948, "dur": 11, "args": { "External id": 155690, "cbid": 211, "correlation": 155690 } }, { "ph": "s", "id": 155690, "pid": 76337, "tid": -914061504, "ts": 1716454223836948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837021, "dur": 1, "args": { "External id": 155701, "cbid": 251, "correlation": 155701 } }, { "ph": "f", "id": 155701, "pid": 76337, "tid": -914061504, "ts": 1716454223837021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223890167, "dur": 160, "args": { "External id": 155702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155702, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155702, "pid": 5, "tid": 7, "ts": 1716454223890167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837026, "dur": 13, "args": { "External id": 155702, "cbid": 211, "correlation": 155702 } }, { "ph": "s", "id": 155702, "pid": 76337, "tid": -914061504, "ts": 1716454223837026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223890329, "dur": 340, "args": { "External id": 155727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155727, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155727, "pid": 5, "tid": 7, "ts": 1716454223890329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837114, "dur": 13, "args": { "External id": 155727, "cbid": 211, "correlation": 155727 } }, { "ph": "s", "id": 155727, "pid": 76337, "tid": -914061504, "ts": 1716454223837114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837217, "dur": 1, "args": { "External id": 155745, "cbid": 251, "correlation": 155745 } }, { "ph": "f", "id": 155745, "pid": 76337, "tid": -914061504, "ts": 1716454223837217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223890670, "dur": 166, "args": { "External id": 155747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155747, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155747, "pid": 5, "tid": 7, "ts": 1716454223890670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837223, "dur": 14, "args": { "External id": 155747, "cbid": 211, "correlation": 155747 } }, { "ph": "s", "id": 155747, "pid": 76337, "tid": -914061504, "ts": 1716454223837223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223890837, "dur": 19, "args": { "External id": 155755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155755, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155755, "pid": 5, "tid": 7, "ts": 1716454223890837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837293, "dur": 12, "args": { "External id": 155755, "cbid": 211, "correlation": 155755 } }, { "ph": "s", "id": 155755, "pid": 76337, "tid": -914061504, "ts": 1716454223837293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223890858, "dur": 28, "args": { "External id": 155763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155763, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155763, "pid": 5, "tid": 7, "ts": 1716454223890858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837332, "dur": 9, "args": { "External id": 155763, "cbid": 211, "correlation": 155763 } }, { "ph": "s", "id": 155763, "pid": 76337, "tid": -914061504, "ts": 1716454223837332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223890887, "dur": 20, "args": { "External id": 155774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155774, "pid": 5, "tid": 7, "ts": 1716454223890887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837403, "dur": 12, "args": { "External id": 155774, "cbid": 211, "correlation": 155774 } }, { "ph": "s", "id": 155774, "pid": 76337, "tid": -914061504, "ts": 1716454223837403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223890908, "dur": 16, "args": { "External id": 155796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155796, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155796, "pid": 5, "tid": 7, "ts": 1716454223890908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837434, "dur": 8, "args": { "External id": 155796, "cbid": 211, "correlation": 155796 } }, { "ph": "s", "id": 155796, "pid": 76337, "tid": -914061504, "ts": 1716454223837434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837519, "dur": 1, "args": { "External id": 155807, "cbid": 251, "correlation": 155807 } }, { "ph": "f", "id": 155807, "pid": 76337, "tid": -914061504, "ts": 1716454223837519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223890925, "dur": 91, "args": { "External id": 155808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155808, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155808, "pid": 5, "tid": 7, "ts": 1716454223890925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837526, "dur": 15, "args": { "External id": 155808, "cbid": 211, "correlation": 155808 } }, { "ph": "s", "id": 155808, "pid": 76337, "tid": -914061504, "ts": 1716454223837526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837599, "dur": 1, "args": { "External id": 155819, "cbid": 251, "correlation": 155819 } }, { "ph": "f", "id": 155819, "pid": 76337, "tid": -914061504, "ts": 1716454223837599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837603, "dur": 0, "args": { "External id": 155820, "cbid": 251, "correlation": 155820 } }, { "ph": "f", "id": 155820, "pid": 76337, "tid": -914061504, "ts": 1716454223837603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223891017, "dur": 12, "args": { "External id": 155821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155821, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155821, "pid": 5, "tid": 7, "ts": 1716454223891017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837605, "dur": 12, "args": { "External id": 155821, "cbid": 211, "correlation": 155821 } }, { "ph": "s", "id": 155821, "pid": 76337, "tid": -914061504, "ts": 1716454223837605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223891031, "dur": 6, "args": { "External id": 155823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155823, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155823, "pid": 5, "tid": 7, "ts": 1716454223891031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837618, "dur": 6, "args": { "External id": 155823, "cbid": 211, "correlation": 155823 } }, { "ph": "s", "id": 155823, "pid": 76337, "tid": -914061504, "ts": 1716454223837618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837675, "dur": 1, "args": { "External id": 155834, "cbid": 251, "correlation": 155834 } }, { "ph": "f", "id": 155834, "pid": 76337, "tid": -914061504, "ts": 1716454223837675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837678, "dur": 0, "args": { "External id": 155835, "cbid": 251, "correlation": 155835 } }, { "ph": "f", "id": 155835, "pid": 76337, "tid": -914061504, "ts": 1716454223837678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223891038, "dur": 8, "args": { "External id": 155836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155836, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155836, "pid": 5, "tid": 7, "ts": 1716454223891038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837680, "dur": 12, "args": { "External id": 155836, "cbid": 211, "correlation": 155836 } }, { "ph": "s", "id": 155836, "pid": 76337, "tid": -914061504, "ts": 1716454223837680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223891047, "dur": 3, "args": { "External id": 155838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155838, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155838, "pid": 5, "tid": 7, "ts": 1716454223891047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837693, "dur": 5, "args": { "External id": 155838, "cbid": 211, "correlation": 155838 } }, { "ph": "s", "id": 155838, "pid": 76337, "tid": -914061504, "ts": 1716454223837693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223891052, "dur": 54, "args": { "External id": 155863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155863, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155863, "pid": 5, "tid": 7, "ts": 1716454223891052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837770, "dur": 12, "args": { "External id": 155863, "cbid": 211, "correlation": 155863 } }, { "ph": "s", "id": 155863, "pid": 76337, "tid": -914061504, "ts": 1716454223837770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223837869, "dur": 2, "args": { "External id": 155881, "cbid": 251, "correlation": 155881 } }, { "ph": "f", "id": 155881, "pid": 76337, "tid": -914061504, "ts": 1716454223837869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223891107, "dur": 92, "args": { "External id": 155883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155883, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155883, "pid": 5, "tid": 7, "ts": 1716454223891107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837876, "dur": 16, "args": { "External id": 155883, "cbid": 211, "correlation": 155883 } }, { "ph": "s", "id": 155883, "pid": 76337, "tid": -914061504, "ts": 1716454223837876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223891201, "dur": 10, "args": { "External id": 155891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155891, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155891, "pid": 5, "tid": 7, "ts": 1716454223891201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223837948, "dur": 12, "args": { "External id": 155891, "cbid": 211, "correlation": 155891 } }, { "ph": "s", "id": 155891, "pid": 76337, "tid": -914061504, "ts": 1716454223837948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223891212, "dur": 21, "args": { "External id": 155899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155899, "pid": 5, "tid": 7, "ts": 1716454223891212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838001, "dur": 10, "args": { "External id": 155899, "cbid": 211, "correlation": 155899 } }, { "ph": "s", "id": 155899, "pid": 76337, "tid": -914061504, "ts": 1716454223838001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223891234, "dur": 17, "args": { "External id": 155921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155921, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155921, "pid": 5, "tid": 7, "ts": 1716454223891234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838054, "dur": 10, "args": { "External id": 155921, "cbid": 211, "correlation": 155921 } }, { "ph": "s", "id": 155921, "pid": 76337, "tid": -914061504, "ts": 1716454223838054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223838143, "dur": 2, "args": { "External id": 155937, "cbid": 251, "correlation": 155937 } }, { "ph": "f", "id": 155937, "pid": 76337, "tid": -914061504, "ts": 1716454223838143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223838148, "dur": 0, "args": { "External id": 155939, "cbid": 251, "correlation": 155939 } }, { "ph": "f", "id": 155939, "pid": 76337, "tid": -914061504, "ts": 1716454223838148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223891252, "dur": 494, "args": { "External id": 155940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155940, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 155940, "pid": 5, "tid": 7, "ts": 1716454223891252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838151, "dur": 14, "args": { "External id": 155940, "cbid": 211, "correlation": 155940 } }, { "ph": "s", "id": 155940, "pid": 76337, "tid": -914061504, "ts": 1716454223838151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223891747, "dur": 66, "args": { "External id": 155948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155948, "pid": 5, "tid": 7, "ts": 1716454223891747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838218, "dur": 12, "args": { "External id": 155948, "cbid": 211, "correlation": 155948 } }, { "ph": "s", "id": 155948, "pid": 76337, "tid": -914061504, "ts": 1716454223838218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223891815, "dur": 68, "args": { "External id": 155956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155956, "pid": 5, "tid": 7, "ts": 1716454223891815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838249, "dur": 8, "args": { "External id": 155956, "cbid": 211, "correlation": 155956 } }, { "ph": "s", "id": 155956, "pid": 76337, "tid": -914061504, "ts": 1716454223838249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223838330, "dur": 1, "args": { "External id": 155972, "cbid": 251, "correlation": 155972 } }, { "ph": "f", "id": 155972, "pid": 76337, "tid": -914061504, "ts": 1716454223838330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223891885, "dur": 1, "args": { "External id": 155974, "device": 5, "context": 1, "stream": 7, "correlation": 155974, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 155974, "pid": 5, "tid": 7, "ts": 1716454223891885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223838336, "dur": 11, "args": { "External id": 155974, "cbid": 51, "correlation": 155974 } }, { "ph": "s", "id": 155974, "pid": 76337, "tid": -914061504, "ts": 1716454223838336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223891888, "dur": 270, "args": { "External id": 155975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155975, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 155975, "pid": 5, "tid": 7, "ts": 1716454223891888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838349, "dur": 11, "args": { "External id": 155975, "cbid": 211, "correlation": 155975 } }, { "ph": "s", "id": 155975, "pid": 76337, "tid": -914061504, "ts": 1716454223838349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223892159, "dur": 14, "args": { "External id": 155983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155983, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155983, "pid": 5, "tid": 7, "ts": 1716454223892159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838390, "dur": 10, "args": { "External id": 155983, "cbid": 211, "correlation": 155983 } }, { "ph": "s", "id": 155983, "pid": 76337, "tid": -914061504, "ts": 1716454223838390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223892174, "dur": 37, "args": { "External id": 155994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 155994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 155994, "pid": 5, "tid": 7, "ts": 1716454223892174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838458, "dur": 12, "args": { "External id": 155994, "cbid": 211, "correlation": 155994 } }, { "ph": "s", "id": 155994, "pid": 76337, "tid": -914061504, "ts": 1716454223838458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223838522, "dur": 0, "args": { "External id": 156006, "cbid": 317, "correlation": 156006 } }, { "ph": "f", "id": 156006, "pid": 76337, "tid": -914061504, "ts": 1716454223838522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223838522, "dur": 0, "args": { "External id": 156007, "cbid": 203, "correlation": 156007 } }, { "ph": "f", "id": 156007, "pid": 76337, "tid": -914061504, "ts": 1716454223838522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223838523, "dur": 0, "args": { "External id": 156008, "cbid": 205, "correlation": 156008 } }, { "ph": "f", "id": 156008, "pid": 76337, "tid": -914061504, "ts": 1716454223838523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223892213, "dur": 13, "args": { "External id": 156012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156012, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156012, "pid": 5, "tid": 7, "ts": 1716454223892213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838540, "dur": 12, "args": { "External id": 156012, "cbid": 211, "correlation": 156012 } }, { "ph": "s", "id": 156012, "pid": 76337, "tid": -914061504, "ts": 1716454223838540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223892227, "dur": 4, "args": { "External id": 156014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156014, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156014, "pid": 5, "tid": 7, "ts": 1716454223892227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838556, "dur": 6, "args": { "External id": 156014, "cbid": 211, "correlation": 156014 } }, { "ph": "s", "id": 156014, "pid": 76337, "tid": -914061504, "ts": 1716454223838556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223838565, "dur": 0, "args": { "External id": 156015, "cbid": 51, "correlation": 156015 } }, { "ph": "s", "id": 156015, "pid": 76337, "tid": -914061504, "ts": 1716454223838565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223892233, "dur": 97, "args": { "External id": 156016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156016, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 156016, "pid": 5, "tid": 7, "ts": 1716454223892233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838566, "dur": 5, "args": { "External id": 156016, "cbid": 211, "correlation": 156016 } }, { "ph": "s", "id": 156016, "pid": 76337, "tid": -914061504, "ts": 1716454223838566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223892332, "dur": 16, "args": { "External id": 156021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156021, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156021, "pid": 5, "tid": 7, "ts": 1716454223892332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838593, "dur": 8, "args": { "External id": 156021, "cbid": 211, "correlation": 156021 } }, { "ph": "s", "id": 156021, "pid": 76337, "tid": -914061504, "ts": 1716454223838593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223892350, "dur": 13, "args": { "External id": 156029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156029, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156029, "pid": 5, "tid": 7, "ts": 1716454223892350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838624, "dur": 9, "args": { "External id": 156029, "cbid": 211, "correlation": 156029 } }, { "ph": "s", "id": 156029, "pid": 76337, "tid": -914061504, "ts": 1716454223838624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223892363, "dur": 18, "args": { "External id": 156049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156049, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156049, "pid": 5, "tid": 7, "ts": 1716454223892363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838697, "dur": 11, "args": { "External id": 156049, "cbid": 211, "correlation": 156049 } }, { "ph": "s", "id": 156049, "pid": 76337, "tid": -914061504, "ts": 1716454223838697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223892382, "dur": 5, "args": { "External id": 156061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156061, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156061, "pid": 5, "tid": 7, "ts": 1716454223892382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838718, "dur": 6, "args": { "External id": 156061, "cbid": 211, "correlation": 156061 } }, { "ph": "s", "id": 156061, "pid": 76337, "tid": -914061504, "ts": 1716454223838718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223892388, "dur": 17, "args": { "External id": 156064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156064, "pid": 5, "tid": 7, "ts": 1716454223892388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838737, "dur": 6, "args": { "External id": 156064, "cbid": 211, "correlation": 156064 } }, { "ph": "s", "id": 156064, "pid": 76337, "tid": -914061504, "ts": 1716454223838737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223892407, "dur": 13, "args": { "External id": 156073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156073, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156073, "pid": 5, "tid": 7, "ts": 1716454223892407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838775, "dur": 10, "args": { "External id": 156073, "cbid": 211, "correlation": 156073 } }, { "ph": "s", "id": 156073, "pid": 76337, "tid": -914061504, "ts": 1716454223838775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223838825, "dur": 0, "args": { "External id": 156083, "cbid": 317, "correlation": 156083 } }, { "ph": "f", "id": 156083, "pid": 76337, "tid": -914061504, "ts": 1716454223838825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223838826, "dur": 0, "args": { "External id": 156084, "cbid": 203, "correlation": 156084 } }, { "ph": "f", "id": 156084, "pid": 76337, "tid": -914061504, "ts": 1716454223838826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223838827, "dur": 0, "args": { "External id": 156085, "cbid": 205, "correlation": 156085 } }, { "ph": "f", "id": 156085, "pid": 76337, "tid": -914061504, "ts": 1716454223838827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223892421, "dur": 11, "args": { "External id": 156089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156089, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156089, "pid": 5, "tid": 7, "ts": 1716454223892421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838841, "dur": 11, "args": { "External id": 156089, "cbid": 211, "correlation": 156089 } }, { "ph": "s", "id": 156089, "pid": 76337, "tid": -914061504, "ts": 1716454223838841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223892434, "dur": 162, "args": { "External id": 156091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156091, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156091, "pid": 5, "tid": 7, "ts": 1716454223892434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838855, "dur": 5, "args": { "External id": 156091, "cbid": 211, "correlation": 156091 } }, { "ph": "s", "id": 156091, "pid": 76337, "tid": -914061504, "ts": 1716454223838855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223892598, "dur": 1, "args": { "External id": 156093, "device": 5, "context": 1, "stream": 7, "correlation": 156093, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 156093, "pid": 5, "tid": 7, "ts": 1716454223892598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223838866, "dur": 6, "args": { "External id": 156093, "cbid": 51, "correlation": 156093 } }, { "ph": "s", "id": 156093, "pid": 76337, "tid": -914061504, "ts": 1716454223838866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223892602, "dur": 661, "args": { "External id": 156094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156094, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156094, "pid": 5, "tid": 7, "ts": 1716454223892602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838873, "dur": 6, "args": { "External id": 156094, "cbid": 211, "correlation": 156094 } }, { "ph": "s", "id": 156094, "pid": 76337, "tid": -914061504, "ts": 1716454223838873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223893264, "dur": 13, "args": { "External id": 156096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156096, "pid": 5, "tid": 7, "ts": 1716454223893264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838884, "dur": 5, "args": { "External id": 156096, "cbid": 211, "correlation": 156096 } }, { "ph": "s", "id": 156096, "pid": 76337, "tid": -914061504, "ts": 1716454223838884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223893278, "dur": 15, "args": { "External id": 156102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156102, "pid": 5, "tid": 7, "ts": 1716454223893278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838912, "dur": 8, "args": { "External id": 156102, "cbid": 211, "correlation": 156102 } }, { "ph": "s", "id": 156102, "pid": 76337, "tid": -914061504, "ts": 1716454223838912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223893294, "dur": 3, "args": { "External id": 156110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156110, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 156110, "pid": 5, "tid": 7, "ts": 1716454223893294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223838956, "dur": 9, "args": { "External id": 156110, "cbid": 211, "correlation": 156110 } }, { "ph": "s", "id": 156110, "pid": 76337, "tid": -914061504, "ts": 1716454223838956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223839028, "dur": 1, "args": { "External id": 156126, "cbid": 251, "correlation": 156126 } }, { "ph": "f", "id": 156126, "pid": 76337, "tid": -914061504, "ts": 1716454223839028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223839033, "dur": 0, "args": { "External id": 156128, "cbid": 251, "correlation": 156128 } }, { "ph": "f", "id": 156128, "pid": 76337, "tid": -914061504, "ts": 1716454223839033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223893299, "dur": 14, "args": { "External id": 156129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156129, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156129, "pid": 5, "tid": 7, "ts": 1716454223893299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839035, "dur": 12, "args": { "External id": 156129, "cbid": 211, "correlation": 156129 } }, { "ph": "s", "id": 156129, "pid": 76337, "tid": -914061504, "ts": 1716454223839035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223893314, "dur": 5, "args": { "External id": 156131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156131, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156131, "pid": 5, "tid": 7, "ts": 1716454223893314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839049, "dur": 5, "args": { "External id": 156131, "cbid": 211, "correlation": 156131 } }, { "ph": "s", "id": 156131, "pid": 76337, "tid": -914061504, "ts": 1716454223839049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223893321, "dur": 17, "args": { "External id": 156141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156141, "pid": 5, "tid": 7, "ts": 1716454223893321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839107, "dur": 12, "args": { "External id": 156141, "cbid": 211, "correlation": 156141 } }, { "ph": "s", "id": 156141, "pid": 76337, "tid": -914061504, "ts": 1716454223839107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223893338, "dur": 17, "args": { "External id": 156161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156161, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156161, "pid": 5, "tid": 7, "ts": 1716454223893338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839172, "dur": 11, "args": { "External id": 156161, "cbid": 211, "correlation": 156161 } }, { "ph": "s", "id": 156161, "pid": 76337, "tid": -914061504, "ts": 1716454223839172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223893357, "dur": 4, "args": { "External id": 156173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156173, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156173, "pid": 5, "tid": 7, "ts": 1716454223893357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839193, "dur": 6, "args": { "External id": 156173, "cbid": 211, "correlation": 156173 } }, { "ph": "s", "id": 156173, "pid": 76337, "tid": -914061504, "ts": 1716454223839193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223893362, "dur": 17, "args": { "External id": 156176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156176, "pid": 5, "tid": 7, "ts": 1716454223893362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839211, "dur": 7, "args": { "External id": 156176, "cbid": 211, "correlation": 156176 } }, { "ph": "s", "id": 156176, "pid": 76337, "tid": -914061504, "ts": 1716454223839211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223893380, "dur": 12, "args": { "External id": 156185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156185, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156185, "pid": 5, "tid": 7, "ts": 1716454223893380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839252, "dur": 10, "args": { "External id": 156185, "cbid": 211, "correlation": 156185 } }, { "ph": "s", "id": 156185, "pid": 76337, "tid": -914061504, "ts": 1716454223839252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223839313, "dur": 0, "args": { "External id": 156195, "cbid": 317, "correlation": 156195 } }, { "ph": "f", "id": 156195, "pid": 76337, "tid": -914061504, "ts": 1716454223839313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223839314, "dur": 0, "args": { "External id": 156196, "cbid": 203, "correlation": 156196 } }, { "ph": "f", "id": 156196, "pid": 76337, "tid": -914061504, "ts": 1716454223839314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223839315, "dur": 0, "args": { "External id": 156197, "cbid": 205, "correlation": 156197 } }, { "ph": "f", "id": 156197, "pid": 76337, "tid": -914061504, "ts": 1716454223839315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223893394, "dur": 11, "args": { "External id": 156201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156201, "pid": 5, "tid": 7, "ts": 1716454223893394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839329, "dur": 12, "args": { "External id": 156201, "cbid": 211, "correlation": 156201 } }, { "ph": "s", "id": 156201, "pid": 76337, "tid": -914061504, "ts": 1716454223839329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223893406, "dur": 163, "args": { "External id": 156203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156203, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156203, "pid": 5, "tid": 7, "ts": 1716454223893406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839343, "dur": 5, "args": { "External id": 156203, "cbid": 211, "correlation": 156203 } }, { "ph": "s", "id": 156203, "pid": 76337, "tid": -914061504, "ts": 1716454223839343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223893571, "dur": 1, "args": { "External id": 156205, "device": 5, "context": 1, "stream": 7, "correlation": 156205, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 156205, "pid": 5, "tid": 7, "ts": 1716454223893571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223839354, "dur": 6, "args": { "External id": 156205, "cbid": 51, "correlation": 156205 } }, { "ph": "s", "id": 156205, "pid": 76337, "tid": -914061504, "ts": 1716454223839354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223893575, "dur": 646, "args": { "External id": 156206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156206, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156206, "pid": 5, "tid": 7, "ts": 1716454223893575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839361, "dur": 6, "args": { "External id": 156206, "cbid": 211, "correlation": 156206 } }, { "ph": "s", "id": 156206, "pid": 76337, "tid": -914061504, "ts": 1716454223839361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223894222, "dur": 12, "args": { "External id": 156208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156208, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156208, "pid": 5, "tid": 7, "ts": 1716454223894222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839373, "dur": 6, "args": { "External id": 156208, "cbid": 211, "correlation": 156208 } }, { "ph": "s", "id": 156208, "pid": 76337, "tid": -914061504, "ts": 1716454223839373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223894235, "dur": 15, "args": { "External id": 156214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156214, "pid": 5, "tid": 7, "ts": 1716454223894235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839402, "dur": 9, "args": { "External id": 156214, "cbid": 211, "correlation": 156214 } }, { "ph": "s", "id": 156214, "pid": 76337, "tid": -914061504, "ts": 1716454223839402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223894251, "dur": 12, "args": { "External id": 156222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156222, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156222, "pid": 5, "tid": 7, "ts": 1716454223894251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839434, "dur": 8, "args": { "External id": 156222, "cbid": 211, "correlation": 156222 } }, { "ph": "s", "id": 156222, "pid": 76337, "tid": -914061504, "ts": 1716454223839434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223894264, "dur": 10, "args": { "External id": 156230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156230, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156230, "pid": 5, "tid": 7, "ts": 1716454223894264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839462, "dur": 9, "args": { "External id": 156230, "cbid": 211, "correlation": 156230 } }, { "ph": "s", "id": 156230, "pid": 76337, "tid": -914061504, "ts": 1716454223839462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223894276, "dur": 20, "args": { "External id": 156250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156250, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156250, "pid": 5, "tid": 7, "ts": 1716454223894276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839540, "dur": 12, "args": { "External id": 156250, "cbid": 211, "correlation": 156250 } }, { "ph": "s", "id": 156250, "pid": 76337, "tid": -914061504, "ts": 1716454223839540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223894297, "dur": 4, "args": { "External id": 156262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156262, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156262, "pid": 5, "tid": 7, "ts": 1716454223894297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839562, "dur": 6, "args": { "External id": 156262, "cbid": 211, "correlation": 156262 } }, { "ph": "s", "id": 156262, "pid": 76337, "tid": -914061504, "ts": 1716454223839562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223894303, "dur": 16, "args": { "External id": 156265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156265, "pid": 5, "tid": 7, "ts": 1716454223894303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839581, "dur": 6, "args": { "External id": 156265, "cbid": 211, "correlation": 156265 } }, { "ph": "s", "id": 156265, "pid": 76337, "tid": -914061504, "ts": 1716454223839581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223839637, "dur": 0, "args": { "External id": 156276, "cbid": 317, "correlation": 156276 } }, { "ph": "f", "id": 156276, "pid": 76337, "tid": -914061504, "ts": 1716454223839637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223839638, "dur": 0, "args": { "External id": 156277, "cbid": 203, "correlation": 156277 } }, { "ph": "f", "id": 156277, "pid": 76337, "tid": -914061504, "ts": 1716454223839638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223839639, "dur": 0, "args": { "External id": 156278, "cbid": 205, "correlation": 156278 } }, { "ph": "f", "id": 156278, "pid": 76337, "tid": -914061504, "ts": 1716454223839639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223894320, "dur": 12, "args": { "External id": 156282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156282, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156282, "pid": 5, "tid": 7, "ts": 1716454223894320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839652, "dur": 11, "args": { "External id": 156282, "cbid": 211, "correlation": 156282 } }, { "ph": "s", "id": 156282, "pid": 76337, "tid": -914061504, "ts": 1716454223839652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223894334, "dur": 4, "args": { "External id": 156284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156284, "pid": 5, "tid": 7, "ts": 1716454223894334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839668, "dur": 7, "args": { "External id": 156284, "cbid": 211, "correlation": 156284 } }, { "ph": "s", "id": 156284, "pid": 76337, "tid": -914061504, "ts": 1716454223839668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223839677, "dur": 0, "args": { "External id": 156285, "cbid": 51, "correlation": 156285 } }, { "ph": "s", "id": 156285, "pid": 76337, "tid": -914061504, "ts": 1716454223839677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223894339, "dur": 95, "args": { "External id": 156286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156286, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 156286, "pid": 5, "tid": 7, "ts": 1716454223894339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839678, "dur": 5, "args": { "External id": 156286, "cbid": 211, "correlation": 156286 } }, { "ph": "s", "id": 156286, "pid": 76337, "tid": -914061504, "ts": 1716454223839678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223894434, "dur": 16, "args": { "External id": 156291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156291, "pid": 5, "tid": 7, "ts": 1716454223894434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839704, "dur": 8, "args": { "External id": 156291, "cbid": 211, "correlation": 156291 } }, { "ph": "s", "id": 156291, "pid": 76337, "tid": -914061504, "ts": 1716454223839704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223894451, "dur": 84, "args": { "External id": 156300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156300, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156300, "pid": 5, "tid": 7, "ts": 1716454223894451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839787, "dur": 13, "args": { "External id": 156300, "cbid": 211, "correlation": 156300 } }, { "ph": "s", "id": 156300, "pid": 76337, "tid": -914061504, "ts": 1716454223839787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223894536, "dur": 30, "args": { "External id": 156322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156322, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156322, "pid": 5, "tid": 7, "ts": 1716454223894536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839842, "dur": 10, "args": { "External id": 156322, "cbid": 211, "correlation": 156322 } }, { "ph": "s", "id": 156322, "pid": 76337, "tid": -914061504, "ts": 1716454223839842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223839930, "dur": 1, "args": { "External id": 156333, "cbid": 251, "correlation": 156333 } }, { "ph": "f", "id": 156333, "pid": 76337, "tid": -914061504, "ts": 1716454223839930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223894567, "dur": 142, "args": { "External id": 156334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156334, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156334, "pid": 5, "tid": 7, "ts": 1716454223894567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223839935, "dur": 13, "args": { "External id": 156334, "cbid": 211, "correlation": 156334 } }, { "ph": "s", "id": 156334, "pid": 76337, "tid": -914061504, "ts": 1716454223839935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840013, "dur": 1, "args": { "External id": 156345, "cbid": 251, "correlation": 156345 } }, { "ph": "f", "id": 156345, "pid": 76337, "tid": -914061504, "ts": 1716454223840013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223894711, "dur": 159, "args": { "External id": 156346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156346, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156346, "pid": 5, "tid": 7, "ts": 1716454223894711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840017, "dur": 13, "args": { "External id": 156346, "cbid": 211, "correlation": 156346 } }, { "ph": "s", "id": 156346, "pid": 76337, "tid": -914061504, "ts": 1716454223840017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840085, "dur": 1, "args": { "External id": 156357, "cbid": 251, "correlation": 156357 } }, { "ph": "f", "id": 156357, "pid": 76337, "tid": -914061504, "ts": 1716454223840085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223894872, "dur": 158, "args": { "External id": 156358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156358, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156358, "pid": 5, "tid": 7, "ts": 1716454223894872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840088, "dur": 11, "args": { "External id": 156358, "cbid": 211, "correlation": 156358 } }, { "ph": "s", "id": 156358, "pid": 76337, "tid": -914061504, "ts": 1716454223840088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223895031, "dur": 339, "args": { "External id": 156383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156383, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156383, "pid": 5, "tid": 7, "ts": 1716454223895031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840171, "dur": 13, "args": { "External id": 156383, "cbid": 211, "correlation": 156383 } }, { "ph": "s", "id": 156383, "pid": 76337, "tid": -914061504, "ts": 1716454223840171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840271, "dur": 1, "args": { "External id": 156401, "cbid": 251, "correlation": 156401 } }, { "ph": "f", "id": 156401, "pid": 76337, "tid": -914061504, "ts": 1716454223840271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223895371, "dur": 167, "args": { "External id": 156403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156403, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156403, "pid": 5, "tid": 7, "ts": 1716454223895371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840277, "dur": 14, "args": { "External id": 156403, "cbid": 211, "correlation": 156403 } }, { "ph": "s", "id": 156403, "pid": 76337, "tid": -914061504, "ts": 1716454223840277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223895540, "dur": 19, "args": { "External id": 156411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156411, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156411, "pid": 5, "tid": 7, "ts": 1716454223895540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840347, "dur": 12, "args": { "External id": 156411, "cbid": 211, "correlation": 156411 } }, { "ph": "s", "id": 156411, "pid": 76337, "tid": -914061504, "ts": 1716454223840347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223895560, "dur": 28, "args": { "External id": 156419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156419, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156419, "pid": 5, "tid": 7, "ts": 1716454223895560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840385, "dur": 8, "args": { "External id": 156419, "cbid": 211, "correlation": 156419 } }, { "ph": "s", "id": 156419, "pid": 76337, "tid": -914061504, "ts": 1716454223840385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223895589, "dur": 18, "args": { "External id": 156430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156430, "pid": 5, "tid": 7, "ts": 1716454223895589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840455, "dur": 12, "args": { "External id": 156430, "cbid": 211, "correlation": 156430 } }, { "ph": "s", "id": 156430, "pid": 76337, "tid": -914061504, "ts": 1716454223840455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223895609, "dur": 16, "args": { "External id": 156452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156452, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156452, "pid": 5, "tid": 7, "ts": 1716454223895609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840487, "dur": 7, "args": { "External id": 156452, "cbid": 211, "correlation": 156452 } }, { "ph": "s", "id": 156452, "pid": 76337, "tid": -914061504, "ts": 1716454223840487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840572, "dur": 1, "args": { "External id": 156463, "cbid": 251, "correlation": 156463 } }, { "ph": "f", "id": 156463, "pid": 76337, "tid": -914061504, "ts": 1716454223840572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223895626, "dur": 89, "args": { "External id": 156464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156464, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156464, "pid": 5, "tid": 7, "ts": 1716454223895626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840578, "dur": 14, "args": { "External id": 156464, "cbid": 211, "correlation": 156464 } }, { "ph": "s", "id": 156464, "pid": 76337, "tid": -914061504, "ts": 1716454223840578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840648, "dur": 1, "args": { "External id": 156475, "cbid": 251, "correlation": 156475 } }, { "ph": "f", "id": 156475, "pid": 76337, "tid": -914061504, "ts": 1716454223840648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840651, "dur": 0, "args": { "External id": 156476, "cbid": 251, "correlation": 156476 } }, { "ph": "f", "id": 156476, "pid": 76337, "tid": -914061504, "ts": 1716454223840651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223895716, "dur": 13, "args": { "External id": 156477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156477, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156477, "pid": 5, "tid": 7, "ts": 1716454223895716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840653, "dur": 12, "args": { "External id": 156477, "cbid": 211, "correlation": 156477 } }, { "ph": "s", "id": 156477, "pid": 76337, "tid": -914061504, "ts": 1716454223840653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223895730, "dur": 5, "args": { "External id": 156479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156479, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156479, "pid": 5, "tid": 7, "ts": 1716454223895730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840667, "dur": 6, "args": { "External id": 156479, "cbid": 211, "correlation": 156479 } }, { "ph": "s", "id": 156479, "pid": 76337, "tid": -914061504, "ts": 1716454223840667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840724, "dur": 1, "args": { "External id": 156490, "cbid": 251, "correlation": 156490 } }, { "ph": "f", "id": 156490, "pid": 76337, "tid": -914061504, "ts": 1716454223840724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840728, "dur": 0, "args": { "External id": 156491, "cbid": 251, "correlation": 156491 } }, { "ph": "f", "id": 156491, "pid": 76337, "tid": -914061504, "ts": 1716454223840728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223895737, "dur": 8, "args": { "External id": 156492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156492, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156492, "pid": 5, "tid": 7, "ts": 1716454223895737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840730, "dur": 12, "args": { "External id": 156492, "cbid": 211, "correlation": 156492 } }, { "ph": "s", "id": 156492, "pid": 76337, "tid": -914061504, "ts": 1716454223840730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223895746, "dur": 3, "args": { "External id": 156494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156494, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156494, "pid": 5, "tid": 7, "ts": 1716454223895746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840743, "dur": 5, "args": { "External id": 156494, "cbid": 211, "correlation": 156494 } }, { "ph": "s", "id": 156494, "pid": 76337, "tid": -914061504, "ts": 1716454223840743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223895751, "dur": 54, "args": { "External id": 156519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156519, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156519, "pid": 5, "tid": 7, "ts": 1716454223895751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840819, "dur": 13, "args": { "External id": 156519, "cbid": 211, "correlation": 156519 } }, { "ph": "s", "id": 156519, "pid": 76337, "tid": -914061504, "ts": 1716454223840819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223840918, "dur": 1, "args": { "External id": 156537, "cbid": 251, "correlation": 156537 } }, { "ph": "f", "id": 156537, "pid": 76337, "tid": -914061504, "ts": 1716454223840918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223895806, "dur": 91, "args": { "External id": 156539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156539, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156539, "pid": 5, "tid": 7, "ts": 1716454223895806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223840924, "dur": 14, "args": { "External id": 156539, "cbid": 211, "correlation": 156539 } }, { "ph": "s", "id": 156539, "pid": 76337, "tid": -914061504, "ts": 1716454223840924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223895898, "dur": 10, "args": { "External id": 156547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156547, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156547, "pid": 5, "tid": 7, "ts": 1716454223895898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841004, "dur": 12, "args": { "External id": 156547, "cbid": 211, "correlation": 156547 } }, { "ph": "s", "id": 156547, "pid": 76337, "tid": -914061504, "ts": 1716454223841004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223895909, "dur": 21, "args": { "External id": 156555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156555, "pid": 5, "tid": 7, "ts": 1716454223895909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841047, "dur": 9, "args": { "External id": 156555, "cbid": 211, "correlation": 156555 } }, { "ph": "s", "id": 156555, "pid": 76337, "tid": -914061504, "ts": 1716454223841047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223895932, "dur": 18, "args": { "External id": 156577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156577, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156577, "pid": 5, "tid": 7, "ts": 1716454223895932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841098, "dur": 10, "args": { "External id": 156577, "cbid": 211, "correlation": 156577 } }, { "ph": "s", "id": 156577, "pid": 76337, "tid": -914061504, "ts": 1716454223841098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223841186, "dur": 1, "args": { "External id": 156593, "cbid": 251, "correlation": 156593 } }, { "ph": "f", "id": 156593, "pid": 76337, "tid": -914061504, "ts": 1716454223841186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223841191, "dur": 0, "args": { "External id": 156595, "cbid": 251, "correlation": 156595 } }, { "ph": "f", "id": 156595, "pid": 76337, "tid": -914061504, "ts": 1716454223841191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223895951, "dur": 495, "args": { "External id": 156596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156596, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156596, "pid": 5, "tid": 7, "ts": 1716454223895951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841193, "dur": 12, "args": { "External id": 156596, "cbid": 211, "correlation": 156596 } }, { "ph": "s", "id": 156596, "pid": 76337, "tid": -914061504, "ts": 1716454223841193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223896447, "dur": 66, "args": { "External id": 156604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156604, "pid": 5, "tid": 7, "ts": 1716454223896447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841257, "dur": 12, "args": { "External id": 156604, "cbid": 211, "correlation": 156604 } }, { "ph": "s", "id": 156604, "pid": 76337, "tid": -914061504, "ts": 1716454223841257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223896514, "dur": 69, "args": { "External id": 156612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156612, "pid": 5, "tid": 7, "ts": 1716454223896514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841288, "dur": 8, "args": { "External id": 156612, "cbid": 211, "correlation": 156612 } }, { "ph": "s", "id": 156612, "pid": 76337, "tid": -914061504, "ts": 1716454223841288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223841366, "dur": 1, "args": { "External id": 156628, "cbid": 251, "correlation": 156628 } }, { "ph": "f", "id": 156628, "pid": 76337, "tid": -914061504, "ts": 1716454223841366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223896585, "dur": 1, "args": { "External id": 156630, "device": 5, "context": 1, "stream": 7, "correlation": 156630, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 156630, "pid": 5, "tid": 7, "ts": 1716454223896585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223841371, "dur": 10, "args": { "External id": 156630, "cbid": 51, "correlation": 156630 } }, { "ph": "s", "id": 156630, "pid": 76337, "tid": -914061504, "ts": 1716454223841371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223896589, "dur": 271, "args": { "External id": 156631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156631, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156631, "pid": 5, "tid": 7, "ts": 1716454223896589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841384, "dur": 11, "args": { "External id": 156631, "cbid": 211, "correlation": 156631 } }, { "ph": "s", "id": 156631, "pid": 76337, "tid": -914061504, "ts": 1716454223841384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223896862, "dur": 14, "args": { "External id": 156639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156639, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156639, "pid": 5, "tid": 7, "ts": 1716454223896862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841426, "dur": 11, "args": { "External id": 156639, "cbid": 211, "correlation": 156639 } }, { "ph": "s", "id": 156639, "pid": 76337, "tid": -914061504, "ts": 1716454223841426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223896877, "dur": 37, "args": { "External id": 156650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156650, "pid": 5, "tid": 7, "ts": 1716454223896877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841494, "dur": 12, "args": { "External id": 156650, "cbid": 211, "correlation": 156650 } }, { "ph": "s", "id": 156650, "pid": 76337, "tid": -914061504, "ts": 1716454223841494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223841558, "dur": 0, "args": { "External id": 156662, "cbid": 317, "correlation": 156662 } }, { "ph": "f", "id": 156662, "pid": 76337, "tid": -914061504, "ts": 1716454223841558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223841559, "dur": 0, "args": { "External id": 156663, "cbid": 203, "correlation": 156663 } }, { "ph": "f", "id": 156663, "pid": 76337, "tid": -914061504, "ts": 1716454223841559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223841559, "dur": 0, "args": { "External id": 156664, "cbid": 205, "correlation": 156664 } }, { "ph": "f", "id": 156664, "pid": 76337, "tid": -914061504, "ts": 1716454223841559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223896915, "dur": 13, "args": { "External id": 156668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156668, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156668, "pid": 5, "tid": 7, "ts": 1716454223896915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841575, "dur": 13, "args": { "External id": 156668, "cbid": 211, "correlation": 156668 } }, { "ph": "s", "id": 156668, "pid": 76337, "tid": -914061504, "ts": 1716454223841575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223896929, "dur": 4, "args": { "External id": 156670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 156670, "pid": 5, "tid": 7, "ts": 1716454223896929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841593, "dur": 5, "args": { "External id": 156670, "cbid": 211, "correlation": 156670 } }, { "ph": "s", "id": 156670, "pid": 76337, "tid": -914061504, "ts": 1716454223841593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223841601, "dur": 0, "args": { "External id": 156671, "cbid": 51, "correlation": 156671 } }, { "ph": "s", "id": 156671, "pid": 76337, "tid": -914061504, "ts": 1716454223841601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223896935, "dur": 97, "args": { "External id": 156672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156672, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 156672, "pid": 5, "tid": 7, "ts": 1716454223896935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841602, "dur": 5, "args": { "External id": 156672, "cbid": 211, "correlation": 156672 } }, { "ph": "s", "id": 156672, "pid": 76337, "tid": -914061504, "ts": 1716454223841602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223897033, "dur": 17, "args": { "External id": 156677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156677, "pid": 5, "tid": 7, "ts": 1716454223897033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841629, "dur": 9, "args": { "External id": 156677, "cbid": 211, "correlation": 156677 } }, { "ph": "s", "id": 156677, "pid": 76337, "tid": -914061504, "ts": 1716454223841629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223897051, "dur": 12, "args": { "External id": 156685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156685, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156685, "pid": 5, "tid": 7, "ts": 1716454223897051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841660, "dur": 8, "args": { "External id": 156685, "cbid": 211, "correlation": 156685 } }, { "ph": "s", "id": 156685, "pid": 76337, "tid": -914061504, "ts": 1716454223841660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223841730, "dur": 0, "args": { "External id": 156695, "cbid": 317, "correlation": 156695 } }, { "ph": "f", "id": 156695, "pid": 76337, "tid": -914061504, "ts": 1716454223841730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223841731, "dur": 0, "args": { "External id": 156696, "cbid": 203, "correlation": 156696 } }, { "ph": "f", "id": 156696, "pid": 76337, "tid": -914061504, "ts": 1716454223841731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223841732, "dur": 0, "args": { "External id": 156697, "cbid": 205, "correlation": 156697 } }, { "ph": "f", "id": 156697, "pid": 76337, "tid": -914061504, "ts": 1716454223841732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897064, "dur": 11, "args": { "External id": 156701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156701, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156701, "pid": 5, "tid": 7, "ts": 1716454223897064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841746, "dur": 12, "args": { "External id": 156701, "cbid": 211, "correlation": 156701 } }, { "ph": "s", "id": 156701, "pid": 76337, "tid": -914061504, "ts": 1716454223841746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897077, "dur": 162, "args": { "External id": 156703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156703, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156703, "pid": 5, "tid": 7, "ts": 1716454223897077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841761, "dur": 5, "args": { "External id": 156703, "cbid": 211, "correlation": 156703 } }, { "ph": "s", "id": 156703, "pid": 76337, "tid": -914061504, "ts": 1716454223841761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223897241, "dur": 1, "args": { "External id": 156705, "device": 5, "context": 1, "stream": 7, "correlation": 156705, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 156705, "pid": 5, "tid": 7, "ts": 1716454223897241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223841772, "dur": 7, "args": { "External id": 156705, "cbid": 51, "correlation": 156705 } }, { "ph": "s", "id": 156705, "pid": 76337, "tid": -914061504, "ts": 1716454223841772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223897245, "dur": 197, "args": { "External id": 156706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156706, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 156706, "pid": 5, "tid": 7, "ts": 1716454223897245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841781, "dur": 8, "args": { "External id": 156706, "cbid": 211, "correlation": 156706 } }, { "ph": "s", "id": 156706, "pid": 76337, "tid": -914061504, "ts": 1716454223841781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897444, "dur": 6, "args": { "External id": 156708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156708, "pid": 5, "tid": 7, "ts": 1716454223897444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841793, "dur": 5, "args": { "External id": 156708, "cbid": 211, "correlation": 156708 } }, { "ph": "s", "id": 156708, "pid": 76337, "tid": -914061504, "ts": 1716454223841793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223897451, "dur": 6, "args": { "External id": 156714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156714, "pid": 5, "tid": 7, "ts": 1716454223897451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841820, "dur": 9, "args": { "External id": 156714, "cbid": 211, "correlation": 156714 } }, { "ph": "s", "id": 156714, "pid": 76337, "tid": -914061504, "ts": 1716454223841820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223897459, "dur": 11, "args": { "External id": 156734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156734, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156734, "pid": 5, "tid": 7, "ts": 1716454223897459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841913, "dur": 12, "args": { "External id": 156734, "cbid": 211, "correlation": 156734 } }, { "ph": "s", "id": 156734, "pid": 76337, "tid": -914061504, "ts": 1716454223841913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223897471, "dur": 4, "args": { "External id": 156746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156746, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156746, "pid": 5, "tid": 7, "ts": 1716454223897471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841937, "dur": 6, "args": { "External id": 156746, "cbid": 211, "correlation": 156746 } }, { "ph": "s", "id": 156746, "pid": 76337, "tid": -914061504, "ts": 1716454223841937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223897477, "dur": 9, "args": { "External id": 156749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156749, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156749, "pid": 5, "tid": 7, "ts": 1716454223897477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223841956, "dur": 7, "args": { "External id": 156749, "cbid": 211, "correlation": 156749 } }, { "ph": "s", "id": 156749, "pid": 76337, "tid": -914061504, "ts": 1716454223841956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223897487, "dur": 5, "args": { "External id": 156758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156758, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156758, "pid": 5, "tid": 7, "ts": 1716454223897487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842006, "dur": 10, "args": { "External id": 156758, "cbid": 211, "correlation": 156758 } }, { "ph": "s", "id": 156758, "pid": 76337, "tid": -914061504, "ts": 1716454223842006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223842059, "dur": 0, "args": { "External id": 156768, "cbid": 317, "correlation": 156768 } }, { "ph": "f", "id": 156768, "pid": 76337, "tid": -914061504, "ts": 1716454223842059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223842060, "dur": 0, "args": { "External id": 156769, "cbid": 203, "correlation": 156769 } }, { "ph": "f", "id": 156769, "pid": 76337, "tid": -914061504, "ts": 1716454223842060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223842061, "dur": 0, "args": { "External id": 156770, "cbid": 205, "correlation": 156770 } }, { "ph": "f", "id": 156770, "pid": 76337, "tid": -914061504, "ts": 1716454223842061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897493, "dur": 5, "args": { "External id": 156774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156774, "pid": 5, "tid": 7, "ts": 1716454223897493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842076, "dur": 12, "args": { "External id": 156774, "cbid": 211, "correlation": 156774 } }, { "ph": "s", "id": 156774, "pid": 76337, "tid": -914061504, "ts": 1716454223842076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897499, "dur": 162, "args": { "External id": 156776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156776, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156776, "pid": 5, "tid": 7, "ts": 1716454223897499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842091, "dur": 5, "args": { "External id": 156776, "cbid": 211, "correlation": 156776 } }, { "ph": "s", "id": 156776, "pid": 76337, "tid": -914061504, "ts": 1716454223842091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223897663, "dur": 1, "args": { "External id": 156778, "device": 5, "context": 1, "stream": 7, "correlation": 156778, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 156778, "pid": 5, "tid": 7, "ts": 1716454223897663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223842102, "dur": 6, "args": { "External id": 156778, "cbid": 51, "correlation": 156778 } }, { "ph": "s", "id": 156778, "pid": 76337, "tid": -914061504, "ts": 1716454223842102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223897667, "dur": 270, "args": { "External id": 156779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156779, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156779, "pid": 5, "tid": 7, "ts": 1716454223897667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842109, "dur": 6, "args": { "External id": 156779, "cbid": 211, "correlation": 156779 } }, { "ph": "s", "id": 156779, "pid": 76337, "tid": -914061504, "ts": 1716454223842109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223897938, "dur": 6, "args": { "External id": 156781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156781, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156781, "pid": 5, "tid": 7, "ts": 1716454223897938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842120, "dur": 5, "args": { "External id": 156781, "cbid": 211, "correlation": 156781 } }, { "ph": "s", "id": 156781, "pid": 76337, "tid": -914061504, "ts": 1716454223842120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223897946, "dur": 6, "args": { "External id": 156787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156787, "pid": 5, "tid": 7, "ts": 1716454223897946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842149, "dur": 8, "args": { "External id": 156787, "cbid": 211, "correlation": 156787 } }, { "ph": "s", "id": 156787, "pid": 76337, "tid": -914061504, "ts": 1716454223842149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223897953, "dur": 3, "args": { "External id": 156795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156795, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 156795, "pid": 5, "tid": 7, "ts": 1716454223897953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842192, "dur": 9, "args": { "External id": 156795, "cbid": 211, "correlation": 156795 } }, { "ph": "s", "id": 156795, "pid": 76337, "tid": -914061504, "ts": 1716454223842192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223842257, "dur": 1, "args": { "External id": 156811, "cbid": 251, "correlation": 156811 } }, { "ph": "f", "id": 156811, "pid": 76337, "tid": -914061504, "ts": 1716454223842257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223842262, "dur": 0, "args": { "External id": 156813, "cbid": 251, "correlation": 156813 } }, { "ph": "f", "id": 156813, "pid": 76337, "tid": -914061504, "ts": 1716454223842262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223897958, "dur": 14, "args": { "External id": 156814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156814, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156814, "pid": 5, "tid": 7, "ts": 1716454223897958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842264, "dur": 12, "args": { "External id": 156814, "cbid": 211, "correlation": 156814 } }, { "ph": "s", "id": 156814, "pid": 76337, "tid": -914061504, "ts": 1716454223842264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223897972, "dur": 5, "args": { "External id": 156816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156816, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156816, "pid": 5, "tid": 7, "ts": 1716454223897972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842278, "dur": 5, "args": { "External id": 156816, "cbid": 211, "correlation": 156816 } }, { "ph": "s", "id": 156816, "pid": 76337, "tid": -914061504, "ts": 1716454223842278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223897979, "dur": 6, "args": { "External id": 156826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156826, "pid": 5, "tid": 7, "ts": 1716454223897979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842338, "dur": 12, "args": { "External id": 156826, "cbid": 211, "correlation": 156826 } }, { "ph": "s", "id": 156826, "pid": 76337, "tid": -914061504, "ts": 1716454223842338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223897986, "dur": 10, "args": { "External id": 156846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156846, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156846, "pid": 5, "tid": 7, "ts": 1716454223897986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842405, "dur": 11, "args": { "External id": 156846, "cbid": 211, "correlation": 156846 } }, { "ph": "s", "id": 156846, "pid": 76337, "tid": -914061504, "ts": 1716454223842405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223897997, "dur": 4, "args": { "External id": 156858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156858, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156858, "pid": 5, "tid": 7, "ts": 1716454223897997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842426, "dur": 6, "args": { "External id": 156858, "cbid": 211, "correlation": 156858 } }, { "ph": "s", "id": 156858, "pid": 76337, "tid": -914061504, "ts": 1716454223842426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223898002, "dur": 7, "args": { "External id": 156861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156861, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156861, "pid": 5, "tid": 7, "ts": 1716454223898002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842445, "dur": 6, "args": { "External id": 156861, "cbid": 211, "correlation": 156861 } }, { "ph": "s", "id": 156861, "pid": 76337, "tid": -914061504, "ts": 1716454223842445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223898011, "dur": 5, "args": { "External id": 156870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156870, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156870, "pid": 5, "tid": 7, "ts": 1716454223898011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842485, "dur": 10, "args": { "External id": 156870, "cbid": 211, "correlation": 156870 } }, { "ph": "s", "id": 156870, "pid": 76337, "tid": -914061504, "ts": 1716454223842485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223842548, "dur": 0, "args": { "External id": 156880, "cbid": 317, "correlation": 156880 } }, { "ph": "f", "id": 156880, "pid": 76337, "tid": -914061504, "ts": 1716454223842548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223842549, "dur": 0, "args": { "External id": 156881, "cbid": 203, "correlation": 156881 } }, { "ph": "f", "id": 156881, "pid": 76337, "tid": -914061504, "ts": 1716454223842549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223842550, "dur": 0, "args": { "External id": 156882, "cbid": 205, "correlation": 156882 } }, { "ph": "f", "id": 156882, "pid": 76337, "tid": -914061504, "ts": 1716454223842550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898017, "dur": 5, "args": { "External id": 156886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156886, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156886, "pid": 5, "tid": 7, "ts": 1716454223898017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842564, "dur": 13, "args": { "External id": 156886, "cbid": 211, "correlation": 156886 } }, { "ph": "s", "id": 156886, "pid": 76337, "tid": -914061504, "ts": 1716454223842564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898023, "dur": 163, "args": { "External id": 156888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156888, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156888, "pid": 5, "tid": 7, "ts": 1716454223898023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842579, "dur": 5, "args": { "External id": 156888, "cbid": 211, "correlation": 156888 } }, { "ph": "s", "id": 156888, "pid": 76337, "tid": -914061504, "ts": 1716454223842579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223898188, "dur": 1, "args": { "External id": 156890, "device": 5, "context": 1, "stream": 7, "correlation": 156890, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 156890, "pid": 5, "tid": 7, "ts": 1716454223898188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223842590, "dur": 6, "args": { "External id": 156890, "cbid": 51, "correlation": 156890 } }, { "ph": "s", "id": 156890, "pid": 76337, "tid": -914061504, "ts": 1716454223842590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223898192, "dur": 259, "args": { "External id": 156891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156891, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156891, "pid": 5, "tid": 7, "ts": 1716454223898192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842597, "dur": 6, "args": { "External id": 156891, "cbid": 211, "correlation": 156891 } }, { "ph": "s", "id": 156891, "pid": 76337, "tid": -914061504, "ts": 1716454223842597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898452, "dur": 6, "args": { "External id": 156893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156893, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156893, "pid": 5, "tid": 7, "ts": 1716454223898452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842607, "dur": 5, "args": { "External id": 156893, "cbid": 211, "correlation": 156893 } }, { "ph": "s", "id": 156893, "pid": 76337, "tid": -914061504, "ts": 1716454223842607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223898459, "dur": 6, "args": { "External id": 156899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156899, "pid": 5, "tid": 7, "ts": 1716454223898459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842636, "dur": 8, "args": { "External id": 156899, "cbid": 211, "correlation": 156899 } }, { "ph": "s", "id": 156899, "pid": 76337, "tid": -914061504, "ts": 1716454223842636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223898467, "dur": 5, "args": { "External id": 156907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156907, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156907, "pid": 5, "tid": 7, "ts": 1716454223898467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842668, "dur": 9, "args": { "External id": 156907, "cbid": 211, "correlation": 156907 } }, { "ph": "s", "id": 156907, "pid": 76337, "tid": -914061504, "ts": 1716454223842668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223898473, "dur": 4, "args": { "External id": 156915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156915, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156915, "pid": 5, "tid": 7, "ts": 1716454223898473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842697, "dur": 8, "args": { "External id": 156915, "cbid": 211, "correlation": 156915 } }, { "ph": "s", "id": 156915, "pid": 76337, "tid": -914061504, "ts": 1716454223842697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223898479, "dur": 10, "args": { "External id": 156935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156935, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 156935, "pid": 5, "tid": 7, "ts": 1716454223898479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842770, "dur": 12, "args": { "External id": 156935, "cbid": 211, "correlation": 156935 } }, { "ph": "s", "id": 156935, "pid": 76337, "tid": -914061504, "ts": 1716454223842770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223898490, "dur": 4, "args": { "External id": 156947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156947, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 156947, "pid": 5, "tid": 7, "ts": 1716454223898490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842792, "dur": 6, "args": { "External id": 156947, "cbid": 211, "correlation": 156947 } }, { "ph": "s", "id": 156947, "pid": 76337, "tid": -914061504, "ts": 1716454223842792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223898495, "dur": 6, "args": { "External id": 156950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156950, "pid": 5, "tid": 7, "ts": 1716454223898495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842809, "dur": 7, "args": { "External id": 156950, "cbid": 211, "correlation": 156950 } }, { "ph": "s", "id": 156950, "pid": 76337, "tid": -914061504, "ts": 1716454223842809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223898502, "dur": 5, "args": { "External id": 156959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156959, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156959, "pid": 5, "tid": 7, "ts": 1716454223898502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842848, "dur": 9, "args": { "External id": 156959, "cbid": 211, "correlation": 156959 } }, { "ph": "s", "id": 156959, "pid": 76337, "tid": -914061504, "ts": 1716454223842848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223842900, "dur": 0, "args": { "External id": 156969, "cbid": 317, "correlation": 156969 } }, { "ph": "f", "id": 156969, "pid": 76337, "tid": -914061504, "ts": 1716454223842900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223842901, "dur": 0, "args": { "External id": 156970, "cbid": 203, "correlation": 156970 } }, { "ph": "f", "id": 156970, "pid": 76337, "tid": -914061504, "ts": 1716454223842901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223842902, "dur": 0, "args": { "External id": 156971, "cbid": 205, "correlation": 156971 } }, { "ph": "f", "id": 156971, "pid": 76337, "tid": -914061504, "ts": 1716454223842902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898508, "dur": 5, "args": { "External id": 156975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156975, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156975, "pid": 5, "tid": 7, "ts": 1716454223898508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842915, "dur": 11, "args": { "External id": 156975, "cbid": 211, "correlation": 156975 } }, { "ph": "s", "id": 156975, "pid": 76337, "tid": -914061504, "ts": 1716454223842915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898514, "dur": 162, "args": { "External id": 156977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156977, "pid": 5, "tid": 7, "ts": 1716454223898514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842929, "dur": 6, "args": { "External id": 156977, "cbid": 211, "correlation": 156977 } }, { "ph": "s", "id": 156977, "pid": 76337, "tid": -914061504, "ts": 1716454223842929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223898679, "dur": 1, "args": { "External id": 156979, "device": 5, "context": 1, "stream": 7, "correlation": 156979, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 156979, "pid": 5, "tid": 7, "ts": 1716454223898679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223842940, "dur": 6, "args": { "External id": 156979, "cbid": 51, "correlation": 156979 } }, { "ph": "s", "id": 156979, "pid": 76337, "tid": -914061504, "ts": 1716454223842940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223898683, "dur": 258, "args": { "External id": 156980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156980, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 156980, "pid": 5, "tid": 7, "ts": 1716454223898683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842947, "dur": 6, "args": { "External id": 156980, "cbid": 211, "correlation": 156980 } }, { "ph": "s", "id": 156980, "pid": 76337, "tid": -914061504, "ts": 1716454223842947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223898942, "dur": 6, "args": { "External id": 156982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 156982, "pid": 5, "tid": 7, "ts": 1716454223898942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842956, "dur": 5, "args": { "External id": 156982, "cbid": 211, "correlation": 156982 } }, { "ph": "s", "id": 156982, "pid": 76337, "tid": -914061504, "ts": 1716454223842956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223898949, "dur": 6, "args": { "External id": 156988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156988, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 156988, "pid": 5, "tid": 7, "ts": 1716454223898949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223842993, "dur": 9, "args": { "External id": 156988, "cbid": 211, "correlation": 156988 } }, { "ph": "s", "id": 156988, "pid": 76337, "tid": -914061504, "ts": 1716454223842993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223898956, "dur": 3, "args": { "External id": 156996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 156996, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 156996, "pid": 5, "tid": 7, "ts": 1716454223898956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843039, "dur": 9, "args": { "External id": 156996, "cbid": 211, "correlation": 156996 } }, { "ph": "s", "id": 156996, "pid": 76337, "tid": -914061504, "ts": 1716454223843039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223843102, "dur": 1, "args": { "External id": 157012, "cbid": 251, "correlation": 157012 } }, { "ph": "f", "id": 157012, "pid": 76337, "tid": -914061504, "ts": 1716454223843102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223843107, "dur": 0, "args": { "External id": 157014, "cbid": 251, "correlation": 157014 } }, { "ph": "f", "id": 157014, "pid": 76337, "tid": -914061504, "ts": 1716454223843107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223898961, "dur": 10, "args": { "External id": 157015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157015, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157015, "pid": 5, "tid": 7, "ts": 1716454223898961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843108, "dur": 11, "args": { "External id": 157015, "cbid": 211, "correlation": 157015 } }, { "ph": "s", "id": 157015, "pid": 76337, "tid": -914061504, "ts": 1716454223843108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223898972, "dur": 4, "args": { "External id": 157017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157017, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157017, "pid": 5, "tid": 7, "ts": 1716454223898972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843121, "dur": 6, "args": { "External id": 157017, "cbid": 211, "correlation": 157017 } }, { "ph": "s", "id": 157017, "pid": 76337, "tid": -914061504, "ts": 1716454223843121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223898977, "dur": 6, "args": { "External id": 157027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157027, "pid": 5, "tid": 7, "ts": 1716454223898977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843178, "dur": 12, "args": { "External id": 157027, "cbid": 211, "correlation": 157027 } }, { "ph": "s", "id": 157027, "pid": 76337, "tid": -914061504, "ts": 1716454223843178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223898984, "dur": 9, "args": { "External id": 157047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157047, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157047, "pid": 5, "tid": 7, "ts": 1716454223898984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843243, "dur": 10, "args": { "External id": 157047, "cbid": 211, "correlation": 157047 } }, { "ph": "s", "id": 157047, "pid": 76337, "tid": -914061504, "ts": 1716454223843243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223898995, "dur": 4, "args": { "External id": 157059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157059, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157059, "pid": 5, "tid": 7, "ts": 1716454223898995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843264, "dur": 6, "args": { "External id": 157059, "cbid": 211, "correlation": 157059 } }, { "ph": "s", "id": 157059, "pid": 76337, "tid": -914061504, "ts": 1716454223843264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899000, "dur": 7, "args": { "External id": 157062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157062, "pid": 5, "tid": 7, "ts": 1716454223899000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843283, "dur": 6, "args": { "External id": 157062, "cbid": 211, "correlation": 157062 } }, { "ph": "s", "id": 157062, "pid": 76337, "tid": -914061504, "ts": 1716454223843283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223899008, "dur": 5, "args": { "External id": 157071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157071, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157071, "pid": 5, "tid": 7, "ts": 1716454223899008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843322, "dur": 10, "args": { "External id": 157071, "cbid": 211, "correlation": 157071 } }, { "ph": "s", "id": 157071, "pid": 76337, "tid": -914061504, "ts": 1716454223843322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223843385, "dur": 0, "args": { "External id": 157081, "cbid": 317, "correlation": 157081 } }, { "ph": "f", "id": 157081, "pid": 76337, "tid": -914061504, "ts": 1716454223843385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223843385, "dur": 0, "args": { "External id": 157082, "cbid": 203, "correlation": 157082 } }, { "ph": "f", "id": 157082, "pid": 76337, "tid": -914061504, "ts": 1716454223843385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223843386, "dur": 0, "args": { "External id": 157083, "cbid": 205, "correlation": 157083 } }, { "ph": "f", "id": 157083, "pid": 76337, "tid": -914061504, "ts": 1716454223843386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899014, "dur": 5, "args": { "External id": 157087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157087, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157087, "pid": 5, "tid": 7, "ts": 1716454223899014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843400, "dur": 12, "args": { "External id": 157087, "cbid": 211, "correlation": 157087 } }, { "ph": "s", "id": 157087, "pid": 76337, "tid": -914061504, "ts": 1716454223843400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899021, "dur": 162, "args": { "External id": 157089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157089, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157089, "pid": 5, "tid": 7, "ts": 1716454223899021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843414, "dur": 5, "args": { "External id": 157089, "cbid": 211, "correlation": 157089 } }, { "ph": "s", "id": 157089, "pid": 76337, "tid": -914061504, "ts": 1716454223843414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223899185, "dur": 1, "args": { "External id": 157091, "device": 5, "context": 1, "stream": 7, "correlation": 157091, "bytes": 240, "memory bandwidth (GB/s)": 0.15 } }, { "ph": "f", "id": 157091, "pid": 5, "tid": 7, "ts": 1716454223899185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223843425, "dur": 6, "args": { "External id": 157091, "cbid": 51, "correlation": 157091 } }, { "ph": "s", "id": 157091, "pid": 76337, "tid": -914061504, "ts": 1716454223843425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223899188, "dur": 258, "args": { "External id": 157092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157092, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157092, "pid": 5, "tid": 7, "ts": 1716454223899188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843432, "dur": 6, "args": { "External id": 157092, "cbid": 211, "correlation": 157092 } }, { "ph": "s", "id": 157092, "pid": 76337, "tid": -914061504, "ts": 1716454223843432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899447, "dur": 6, "args": { "External id": 157094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157094, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157094, "pid": 5, "tid": 7, "ts": 1716454223899447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843442, "dur": 5, "args": { "External id": 157094, "cbid": 211, "correlation": 157094 } }, { "ph": "s", "id": 157094, "pid": 76337, "tid": -914061504, "ts": 1716454223843442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899455, "dur": 6, "args": { "External id": 157100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157100, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157100, "pid": 5, "tid": 7, "ts": 1716454223899455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843470, "dur": 10, "args": { "External id": 157100, "cbid": 211, "correlation": 157100 } }, { "ph": "s", "id": 157100, "pid": 76337, "tid": -914061504, "ts": 1716454223843470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223899462, "dur": 5, "args": { "External id": 157108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157108, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157108, "pid": 5, "tid": 7, "ts": 1716454223899462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843503, "dur": 8, "args": { "External id": 157108, "cbid": 211, "correlation": 157108 } }, { "ph": "s", "id": 157108, "pid": 76337, "tid": -914061504, "ts": 1716454223843503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223899468, "dur": 4, "args": { "External id": 157116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157116, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157116, "pid": 5, "tid": 7, "ts": 1716454223899468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843532, "dur": 8, "args": { "External id": 157116, "cbid": 211, "correlation": 157116 } }, { "ph": "s", "id": 157116, "pid": 76337, "tid": -914061504, "ts": 1716454223843532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223899474, "dur": 10, "args": { "External id": 157136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157136, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157136, "pid": 5, "tid": 7, "ts": 1716454223899474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843637, "dur": 13, "args": { "External id": 157136, "cbid": 211, "correlation": 157136 } }, { "ph": "s", "id": 157136, "pid": 76337, "tid": -914061504, "ts": 1716454223843637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223899485, "dur": 4, "args": { "External id": 157148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157148, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157148, "pid": 5, "tid": 7, "ts": 1716454223899485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843659, "dur": 6, "args": { "External id": 157148, "cbid": 211, "correlation": 157148 } }, { "ph": "s", "id": 157148, "pid": 76337, "tid": -914061504, "ts": 1716454223843659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899490, "dur": 7, "args": { "External id": 157151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157151, "pid": 5, "tid": 7, "ts": 1716454223899490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843678, "dur": 7, "args": { "External id": 157151, "cbid": 211, "correlation": 157151 } }, { "ph": "s", "id": 157151, "pid": 76337, "tid": -914061504, "ts": 1716454223843678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223899498, "dur": 5, "args": { "External id": 157160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157160, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157160, "pid": 5, "tid": 7, "ts": 1716454223899498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843715, "dur": 10, "args": { "External id": 157160, "cbid": 211, "correlation": 157160 } }, { "ph": "s", "id": 157160, "pid": 76337, "tid": -914061504, "ts": 1716454223843715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223843768, "dur": 0, "args": { "External id": 157170, "cbid": 317, "correlation": 157170 } }, { "ph": "f", "id": 157170, "pid": 76337, "tid": -914061504, "ts": 1716454223843768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223843769, "dur": 0, "args": { "External id": 157171, "cbid": 203, "correlation": 157171 } }, { "ph": "f", "id": 157171, "pid": 76337, "tid": -914061504, "ts": 1716454223843769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223843770, "dur": 0, "args": { "External id": 157172, "cbid": 205, "correlation": 157172 } }, { "ph": "f", "id": 157172, "pid": 76337, "tid": -914061504, "ts": 1716454223843770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899504, "dur": 5, "args": { "External id": 157176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157176, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157176, "pid": 5, "tid": 7, "ts": 1716454223899504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843783, "dur": 11, "args": { "External id": 157176, "cbid": 211, "correlation": 157176 } }, { "ph": "s", "id": 157176, "pid": 76337, "tid": -914061504, "ts": 1716454223843783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899510, "dur": 162, "args": { "External id": 157178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157178, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157178, "pid": 5, "tid": 7, "ts": 1716454223899510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843797, "dur": 5, "args": { "External id": 157178, "cbid": 211, "correlation": 157178 } }, { "ph": "s", "id": 157178, "pid": 76337, "tid": -914061504, "ts": 1716454223843797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223899674, "dur": 1, "args": { "External id": 157180, "device": 5, "context": 1, "stream": 7, "correlation": 157180, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 157180, "pid": 5, "tid": 7, "ts": 1716454223899674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223843808, "dur": 6, "args": { "External id": 157180, "cbid": 51, "correlation": 157180 } }, { "ph": "s", "id": 157180, "pid": 76337, "tid": -914061504, "ts": 1716454223843808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223899678, "dur": 258, "args": { "External id": 157181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157181, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157181, "pid": 5, "tid": 7, "ts": 1716454223899678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843815, "dur": 6, "args": { "External id": 157181, "cbid": 211, "correlation": 157181 } }, { "ph": "s", "id": 157181, "pid": 76337, "tid": -914061504, "ts": 1716454223843815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223899937, "dur": 6, "args": { "External id": 157183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157183, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157183, "pid": 5, "tid": 7, "ts": 1716454223899937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843826, "dur": 5, "args": { "External id": 157183, "cbid": 211, "correlation": 157183 } }, { "ph": "s", "id": 157183, "pid": 76337, "tid": -914061504, "ts": 1716454223843826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899944, "dur": 6, "args": { "External id": 157189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157189, "pid": 5, "tid": 7, "ts": 1716454223899944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843854, "dur": 9, "args": { "External id": 157189, "cbid": 211, "correlation": 157189 } }, { "ph": "s", "id": 157189, "pid": 76337, "tid": -914061504, "ts": 1716454223843854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223899952, "dur": 3, "args": { "External id": 157197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157197, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 157197, "pid": 5, "tid": 7, "ts": 1716454223899952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843898, "dur": 9, "args": { "External id": 157197, "cbid": 211, "correlation": 157197 } }, { "ph": "s", "id": 157197, "pid": 76337, "tid": -914061504, "ts": 1716454223843898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223843959, "dur": 1, "args": { "External id": 157213, "cbid": 251, "correlation": 157213 } }, { "ph": "f", "id": 157213, "pid": 76337, "tid": -914061504, "ts": 1716454223843959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223843965, "dur": 0, "args": { "External id": 157215, "cbid": 251, "correlation": 157215 } }, { "ph": "f", "id": 157215, "pid": 76337, "tid": -914061504, "ts": 1716454223843965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223899956, "dur": 11, "args": { "External id": 157216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157216, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157216, "pid": 5, "tid": 7, "ts": 1716454223899956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843967, "dur": 20, "args": { "External id": 157216, "cbid": 211, "correlation": 157216 } }, { "ph": "s", "id": 157216, "pid": 76337, "tid": -914061504, "ts": 1716454223843967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223899968, "dur": 4, "args": { "External id": 157218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157218, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157218, "pid": 5, "tid": 7, "ts": 1716454223899968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223843989, "dur": 6, "args": { "External id": 157218, "cbid": 211, "correlation": 157218 } }, { "ph": "s", "id": 157218, "pid": 76337, "tid": -914061504, "ts": 1716454223843989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899973, "dur": 6, "args": { "External id": 157228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157228, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157228, "pid": 5, "tid": 7, "ts": 1716454223899973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844047, "dur": 12, "args": { "External id": 157228, "cbid": 211, "correlation": 157228 } }, { "ph": "s", "id": 157228, "pid": 76337, "tid": -914061504, "ts": 1716454223844047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223899980, "dur": 10, "args": { "External id": 157248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157248, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157248, "pid": 5, "tid": 7, "ts": 1716454223899980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844114, "dur": 12, "args": { "External id": 157248, "cbid": 211, "correlation": 157248 } }, { "ph": "s", "id": 157248, "pid": 76337, "tid": -914061504, "ts": 1716454223844114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223899991, "dur": 4, "args": { "External id": 157260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157260, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157260, "pid": 5, "tid": 7, "ts": 1716454223899991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844136, "dur": 6, "args": { "External id": 157260, "cbid": 211, "correlation": 157260 } }, { "ph": "s", "id": 157260, "pid": 76337, "tid": -914061504, "ts": 1716454223844136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223899996, "dur": 7, "args": { "External id": 157263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157263, "pid": 5, "tid": 7, "ts": 1716454223899996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844153, "dur": 6, "args": { "External id": 157263, "cbid": 211, "correlation": 157263 } }, { "ph": "s", "id": 157263, "pid": 76337, "tid": -914061504, "ts": 1716454223844153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223900004, "dur": 5, "args": { "External id": 157272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157272, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157272, "pid": 5, "tid": 7, "ts": 1716454223900004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844194, "dur": 10, "args": { "External id": 157272, "cbid": 211, "correlation": 157272 } }, { "ph": "s", "id": 157272, "pid": 76337, "tid": -914061504, "ts": 1716454223844194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223844257, "dur": 0, "args": { "External id": 157282, "cbid": 317, "correlation": 157282 } }, { "ph": "f", "id": 157282, "pid": 76337, "tid": -914061504, "ts": 1716454223844257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223844257, "dur": 0, "args": { "External id": 157283, "cbid": 203, "correlation": 157283 } }, { "ph": "f", "id": 157283, "pid": 76337, "tid": -914061504, "ts": 1716454223844257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223844258, "dur": 0, "args": { "External id": 157284, "cbid": 205, "correlation": 157284 } }, { "ph": "f", "id": 157284, "pid": 76337, "tid": -914061504, "ts": 1716454223844258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223900010, "dur": 5, "args": { "External id": 157288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157288, "pid": 5, "tid": 7, "ts": 1716454223900010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844273, "dur": 12, "args": { "External id": 157288, "cbid": 211, "correlation": 157288 } }, { "ph": "s", "id": 157288, "pid": 76337, "tid": -914061504, "ts": 1716454223844273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223900016, "dur": 162, "args": { "External id": 157290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157290, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157290, "pid": 5, "tid": 7, "ts": 1716454223900016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844288, "dur": 5, "args": { "External id": 157290, "cbid": 211, "correlation": 157290 } }, { "ph": "s", "id": 157290, "pid": 76337, "tid": -914061504, "ts": 1716454223844288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223900180, "dur": 1, "args": { "External id": 157292, "device": 5, "context": 1, "stream": 7, "correlation": 157292, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 157292, "pid": 5, "tid": 7, "ts": 1716454223900180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223844298, "dur": 6, "args": { "External id": 157292, "cbid": 51, "correlation": 157292 } }, { "ph": "s", "id": 157292, "pid": 76337, "tid": -914061504, "ts": 1716454223844298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223900184, "dur": 258, "args": { "External id": 157293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157293, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157293, "pid": 5, "tid": 7, "ts": 1716454223900184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844305, "dur": 6, "args": { "External id": 157293, "cbid": 211, "correlation": 157293 } }, { "ph": "s", "id": 157293, "pid": 76337, "tid": -914061504, "ts": 1716454223844305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223900443, "dur": 6, "args": { "External id": 157295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157295, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157295, "pid": 5, "tid": 7, "ts": 1716454223900443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844315, "dur": 5, "args": { "External id": 157295, "cbid": 211, "correlation": 157295 } }, { "ph": "s", "id": 157295, "pid": 76337, "tid": -914061504, "ts": 1716454223844315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223900450, "dur": 6, "args": { "External id": 157301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157301, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157301, "pid": 5, "tid": 7, "ts": 1716454223900450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844343, "dur": 9, "args": { "External id": 157301, "cbid": 211, "correlation": 157301 } }, { "ph": "s", "id": 157301, "pid": 76337, "tid": -914061504, "ts": 1716454223844343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223900458, "dur": 5, "args": { "External id": 157309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157309, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157309, "pid": 5, "tid": 7, "ts": 1716454223900458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844376, "dur": 8, "args": { "External id": 157309, "cbid": 211, "correlation": 157309 } }, { "ph": "s", "id": 157309, "pid": 76337, "tid": -914061504, "ts": 1716454223844376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223900464, "dur": 4, "args": { "External id": 157317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157317, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157317, "pid": 5, "tid": 7, "ts": 1716454223900464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844406, "dur": 8, "args": { "External id": 157317, "cbid": 211, "correlation": 157317 } }, { "ph": "s", "id": 157317, "pid": 76337, "tid": -914061504, "ts": 1716454223844406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223900470, "dur": 10, "args": { "External id": 157337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157337, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157337, "pid": 5, "tid": 7, "ts": 1716454223900470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844548, "dur": 13, "args": { "External id": 157337, "cbid": 211, "correlation": 157337 } }, { "ph": "s", "id": 157337, "pid": 76337, "tid": -914061504, "ts": 1716454223844548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223900481, "dur": 4, "args": { "External id": 157349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157349, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157349, "pid": 5, "tid": 7, "ts": 1716454223900481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844571, "dur": 7, "args": { "External id": 157349, "cbid": 211, "correlation": 157349 } }, { "ph": "s", "id": 157349, "pid": 76337, "tid": -914061504, "ts": 1716454223844571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223900486, "dur": 6, "args": { "External id": 157352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157352, "pid": 5, "tid": 7, "ts": 1716454223900486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844590, "dur": 7, "args": { "External id": 157352, "cbid": 211, "correlation": 157352 } }, { "ph": "s", "id": 157352, "pid": 76337, "tid": -914061504, "ts": 1716454223844590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223844649, "dur": 0, "args": { "External id": 157363, "cbid": 317, "correlation": 157363 } }, { "ph": "f", "id": 157363, "pid": 76337, "tid": -914061504, "ts": 1716454223844649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223844650, "dur": 0, "args": { "External id": 157364, "cbid": 203, "correlation": 157364 } }, { "ph": "f", "id": 157364, "pid": 76337, "tid": -914061504, "ts": 1716454223844650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223844651, "dur": 0, "args": { "External id": 157365, "cbid": 205, "correlation": 157365 } }, { "ph": "f", "id": 157365, "pid": 76337, "tid": -914061504, "ts": 1716454223844651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223900494, "dur": 5, "args": { "External id": 157369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157369, "pid": 5, "tid": 7, "ts": 1716454223900494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844669, "dur": 13, "args": { "External id": 157369, "cbid": 211, "correlation": 157369 } }, { "ph": "s", "id": 157369, "pid": 76337, "tid": -914061504, "ts": 1716454223844669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223900500, "dur": 37, "args": { "External id": 157371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157371, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 157371, "pid": 5, "tid": 7, "ts": 1716454223900500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844688, "dur": 9, "args": { "External id": 157371, "cbid": 211, "correlation": 157371 } }, { "ph": "s", "id": 157371, "pid": 76337, "tid": -914061504, "ts": 1716454223844688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223900538, "dur": 5, "args": { "External id": 157373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157373, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157373, "pid": 5, "tid": 7, "ts": 1716454223900538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844701, "dur": 5, "args": { "External id": 157373, "cbid": 211, "correlation": 157373 } }, { "ph": "s", "id": 157373, "pid": 76337, "tid": -914061504, "ts": 1716454223844701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223900545, "dur": 6, "args": { "External id": 157379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157379, "pid": 5, "tid": 7, "ts": 1716454223900545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844728, "dur": 9, "args": { "External id": 157379, "cbid": 211, "correlation": 157379 } }, { "ph": "s", "id": 157379, "pid": 76337, "tid": -914061504, "ts": 1716454223844728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223900552, "dur": 20, "args": { "External id": 157388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157388, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157388, "pid": 5, "tid": 7, "ts": 1716454223900552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844811, "dur": 14, "args": { "External id": 157388, "cbid": 211, "correlation": 157388 } }, { "ph": "s", "id": 157388, "pid": 76337, "tid": -914061504, "ts": 1716454223844811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223900574, "dur": 11, "args": { "External id": 157410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157410, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 157410, "pid": 5, "tid": 7, "ts": 1716454223900574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844867, "dur": 11, "args": { "External id": 157410, "cbid": 211, "correlation": 157410 } }, { "ph": "s", "id": 157410, "pid": 76337, "tid": -914061504, "ts": 1716454223844867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223844960, "dur": 2, "args": { "External id": 157421, "cbid": 251, "correlation": 157421 } }, { "ph": "f", "id": 157421, "pid": 76337, "tid": -914061504, "ts": 1716454223844960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223844965, "dur": 0, "args": { "External id": 157422, "cbid": 251, "correlation": 157422 } }, { "ph": "f", "id": 157422, "pid": 76337, "tid": -914061504, "ts": 1716454223844965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223900586, "dur": 54, "args": { "External id": 157423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157423, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 157423, "pid": 5, "tid": 7, "ts": 1716454223900586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223844969, "dur": 22, "args": { "External id": 157423, "cbid": 211, "correlation": 157423 } }, { "ph": "s", "id": 157423, "pid": 76337, "tid": -914061504, "ts": 1716454223844969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845050, "dur": 1, "args": { "External id": 157434, "cbid": 251, "correlation": 157434 } }, { "ph": "f", "id": 157434, "pid": 76337, "tid": -914061504, "ts": 1716454223845050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845054, "dur": 0, "args": { "External id": 157435, "cbid": 251, "correlation": 157435 } }, { "ph": "f", "id": 157435, "pid": 76337, "tid": -914061504, "ts": 1716454223845054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223900641, "dur": 53, "args": { "External id": 157436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157436, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 157436, "pid": 5, "tid": 7, "ts": 1716454223900641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845056, "dur": 12, "args": { "External id": 157436, "cbid": 211, "correlation": 157436 } }, { "ph": "s", "id": 157436, "pid": 76337, "tid": -914061504, "ts": 1716454223845056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845123, "dur": 1, "args": { "External id": 157447, "cbid": 251, "correlation": 157447 } }, { "ph": "f", "id": 157447, "pid": 76337, "tid": -914061504, "ts": 1716454223845123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845126, "dur": 0, "args": { "External id": 157448, "cbid": 251, "correlation": 157448 } }, { "ph": "f", "id": 157448, "pid": 76337, "tid": -914061504, "ts": 1716454223845126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223900695, "dur": 53, "args": { "External id": 157449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157449, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 157449, "pid": 5, "tid": 7, "ts": 1716454223900695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845128, "dur": 11, "args": { "External id": 157449, "cbid": 211, "correlation": 157449 } }, { "ph": "s", "id": 157449, "pid": 76337, "tid": -914061504, "ts": 1716454223845128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223900750, "dur": 55, "args": { "External id": 157474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157474, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157474, "pid": 5, "tid": 7, "ts": 1716454223900750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845213, "dur": 14, "args": { "External id": 157474, "cbid": 211, "correlation": 157474 } }, { "ph": "s", "id": 157474, "pid": 76337, "tid": -914061504, "ts": 1716454223845213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845313, "dur": 1, "args": { "External id": 157492, "cbid": 251, "correlation": 157492 } }, { "ph": "f", "id": 157492, "pid": 76337, "tid": -914061504, "ts": 1716454223845313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223900806, "dur": 63, "args": { "External id": 157494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157494, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 157494, "pid": 5, "tid": 7, "ts": 1716454223900806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845320, "dur": 14, "args": { "External id": 157494, "cbid": 211, "correlation": 157494 } }, { "ph": "s", "id": 157494, "pid": 76337, "tid": -914061504, "ts": 1716454223845320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223900871, "dur": 6, "args": { "External id": 157502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157502, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157502, "pid": 5, "tid": 7, "ts": 1716454223900871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845390, "dur": 12, "args": { "External id": 157502, "cbid": 211, "correlation": 157502 } }, { "ph": "s", "id": 157502, "pid": 76337, "tid": -914061504, "ts": 1716454223845390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223900878, "dur": 7, "args": { "External id": 157510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157510, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157510, "pid": 5, "tid": 7, "ts": 1716454223900878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845428, "dur": 8, "args": { "External id": 157510, "cbid": 211, "correlation": 157510 } }, { "ph": "s", "id": 157510, "pid": 76337, "tid": -914061504, "ts": 1716454223845428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223900886, "dur": 8, "args": { "External id": 157521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157521, "pid": 5, "tid": 7, "ts": 1716454223900886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845500, "dur": 13, "args": { "External id": 157521, "cbid": 211, "correlation": 157521 } }, { "ph": "s", "id": 157521, "pid": 76337, "tid": -914061504, "ts": 1716454223845500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223900895, "dur": 9, "args": { "External id": 157543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157543, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 157543, "pid": 5, "tid": 7, "ts": 1716454223900895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845533, "dur": 7, "args": { "External id": 157543, "cbid": 211, "correlation": 157543 } }, { "ph": "s", "id": 157543, "pid": 76337, "tid": -914061504, "ts": 1716454223845533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845618, "dur": 3, "args": { "External id": 157554, "cbid": 251, "correlation": 157554 } }, { "ph": "f", "id": 157554, "pid": 76337, "tid": -914061504, "ts": 1716454223845618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223900906, "dur": 1, "args": { "External id": 157555, "device": 5, "context": 1, "stream": 7, "correlation": 157555, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 157555, "pid": 5, "tid": 7, "ts": 1716454223900906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223845625, "dur": 10, "args": { "External id": 157555, "cbid": 51, "correlation": 157555 } }, { "ph": "s", "id": 157555, "pid": 76337, "tid": -914061504, "ts": 1716454223845625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223900910, "dur": 36, "args": { "External id": 157556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157556, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 157556, "pid": 5, "tid": 7, "ts": 1716454223900910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845636, "dur": 12, "args": { "External id": 157556, "cbid": 211, "correlation": 157556 } }, { "ph": "s", "id": 157556, "pid": 76337, "tid": -914061504, "ts": 1716454223845636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845707, "dur": 1, "args": { "External id": 157567, "cbid": 251, "correlation": 157567 } }, { "ph": "f", "id": 157567, "pid": 76337, "tid": -914061504, "ts": 1716454223845707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845711, "dur": 0, "args": { "External id": 157568, "cbid": 251, "correlation": 157568 } }, { "ph": "f", "id": 157568, "pid": 76337, "tid": -914061504, "ts": 1716454223845711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223900947, "dur": 12, "args": { "External id": 157569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157569, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157569, "pid": 5, "tid": 7, "ts": 1716454223900947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845713, "dur": 13, "args": { "External id": 157569, "cbid": 211, "correlation": 157569 } }, { "ph": "s", "id": 157569, "pid": 76337, "tid": -914061504, "ts": 1716454223845713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223900960, "dur": 5, "args": { "External id": 157571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157571, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157571, "pid": 5, "tid": 7, "ts": 1716454223900960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845727, "dur": 6, "args": { "External id": 157571, "cbid": 211, "correlation": 157571 } }, { "ph": "s", "id": 157571, "pid": 76337, "tid": -914061504, "ts": 1716454223845727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845783, "dur": 1, "args": { "External id": 157582, "cbid": 251, "correlation": 157582 } }, { "ph": "f", "id": 157582, "pid": 76337, "tid": -914061504, "ts": 1716454223845783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845787, "dur": 0, "args": { "External id": 157583, "cbid": 251, "correlation": 157583 } }, { "ph": "f", "id": 157583, "pid": 76337, "tid": -914061504, "ts": 1716454223845787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223900967, "dur": 8, "args": { "External id": 157584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157584, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157584, "pid": 5, "tid": 7, "ts": 1716454223900967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845788, "dur": 11, "args": { "External id": 157584, "cbid": 211, "correlation": 157584 } }, { "ph": "s", "id": 157584, "pid": 76337, "tid": -914061504, "ts": 1716454223845788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223900976, "dur": 4, "args": { "External id": 157586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157586, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157586, "pid": 5, "tid": 7, "ts": 1716454223900976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845801, "dur": 5, "args": { "External id": 157586, "cbid": 211, "correlation": 157586 } }, { "ph": "s", "id": 157586, "pid": 76337, "tid": -914061504, "ts": 1716454223845801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223900981, "dur": 20, "args": { "External id": 157611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157611, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 157611, "pid": 5, "tid": 7, "ts": 1716454223900981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223845880, "dur": 12, "args": { "External id": 157611, "cbid": 211, "correlation": 157611 } }, { "ph": "s", "id": 157611, "pid": 76337, "tid": -914061504, "ts": 1716454223845880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223845987, "dur": 2, "args": { "External id": 157629, "cbid": 251, "correlation": 157629 } }, { "ph": "f", "id": 157629, "pid": 76337, "tid": -914061504, "ts": 1716454223845987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223901003, "dur": 1, "args": { "External id": 157631, "device": 5, "context": 1, "stream": 7, "correlation": 157631, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 157631, "pid": 5, "tid": 7, "ts": 1716454223901003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223845993, "dur": 10, "args": { "External id": 157631, "cbid": 51, "correlation": 157631 } }, { "ph": "s", "id": 157631, "pid": 76337, "tid": -914061504, "ts": 1716454223845993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223901007, "dur": 36, "args": { "External id": 157632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157632, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 157632, "pid": 5, "tid": 7, "ts": 1716454223901007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846004, "dur": 13, "args": { "External id": 157632, "cbid": 211, "correlation": 157632 } }, { "ph": "s", "id": 157632, "pid": 76337, "tid": -914061504, "ts": 1716454223846004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223901045, "dur": 4, "args": { "External id": 157640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157640, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157640, "pid": 5, "tid": 7, "ts": 1716454223901045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846077, "dur": 12, "args": { "External id": 157640, "cbid": 211, "correlation": 157640 } }, { "ph": "s", "id": 157640, "pid": 76337, "tid": -914061504, "ts": 1716454223846077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901050, "dur": 8, "args": { "External id": 157648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157648, "pid": 5, "tid": 7, "ts": 1716454223901050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846118, "dur": 10, "args": { "External id": 157648, "cbid": 211, "correlation": 157648 } }, { "ph": "s", "id": 157648, "pid": 76337, "tid": -914061504, "ts": 1716454223846118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223901060, "dur": 8, "args": { "External id": 157670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157670, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 157670, "pid": 5, "tid": 7, "ts": 1716454223901060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846170, "dur": 10, "args": { "External id": 157670, "cbid": 211, "correlation": 157670 } }, { "ph": "s", "id": 157670, "pid": 76337, "tid": -914061504, "ts": 1716454223846170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223846261, "dur": 1, "args": { "External id": 157686, "cbid": 251, "correlation": 157686 } }, { "ph": "f", "id": 157686, "pid": 76337, "tid": -914061504, "ts": 1716454223846261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223846266, "dur": 0, "args": { "External id": 157688, "cbid": 251, "correlation": 157688 } }, { "ph": "f", "id": 157688, "pid": 76337, "tid": -914061504, "ts": 1716454223846266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223901069, "dur": 190, "args": { "External id": 157689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157689, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157689, "pid": 5, "tid": 7, "ts": 1716454223901069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846268, "dur": 14, "args": { "External id": 157689, "cbid": 211, "correlation": 157689 } }, { "ph": "s", "id": 157689, "pid": 76337, "tid": -914061504, "ts": 1716454223846268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901260, "dur": 21, "args": { "External id": 157697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157697, "pid": 5, "tid": 7, "ts": 1716454223901260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846335, "dur": 13, "args": { "External id": 157697, "cbid": 211, "correlation": 157697 } }, { "ph": "s", "id": 157697, "pid": 76337, "tid": -914061504, "ts": 1716454223846335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901282, "dur": 22, "args": { "External id": 157705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157705, "pid": 5, "tid": 7, "ts": 1716454223901282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846366, "dur": 9, "args": { "External id": 157705, "cbid": 211, "correlation": 157705 } }, { "ph": "s", "id": 157705, "pid": 76337, "tid": -914061504, "ts": 1716454223846366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223846449, "dur": 1, "args": { "External id": 157721, "cbid": 251, "correlation": 157721 } }, { "ph": "f", "id": 157721, "pid": 76337, "tid": -914061504, "ts": 1716454223846449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223901306, "dur": 1, "args": { "External id": 157723, "device": 5, "context": 1, "stream": 7, "correlation": 157723, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 157723, "pid": 5, "tid": 7, "ts": 1716454223901306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223846454, "dur": 8, "args": { "External id": 157723, "cbid": 51, "correlation": 157723 } }, { "ph": "s", "id": 157723, "pid": 76337, "tid": -914061504, "ts": 1716454223846454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223901310, "dur": 108, "args": { "External id": 157724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157724, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 157724, "pid": 5, "tid": 7, "ts": 1716454223901310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846464, "dur": 13, "args": { "External id": 157724, "cbid": 211, "correlation": 157724 } }, { "ph": "s", "id": 157724, "pid": 76337, "tid": -914061504, "ts": 1716454223846464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223901420, "dur": 5, "args": { "External id": 157732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157732, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157732, "pid": 5, "tid": 7, "ts": 1716454223901420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846506, "dur": 10, "args": { "External id": 157732, "cbid": 211, "correlation": 157732 } }, { "ph": "s", "id": 157732, "pid": 76337, "tid": -914061504, "ts": 1716454223846506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901426, "dur": 10, "args": { "External id": 157743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157743, "pid": 5, "tid": 7, "ts": 1716454223901426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846577, "dur": 12, "args": { "External id": 157743, "cbid": 211, "correlation": 157743 } }, { "ph": "s", "id": 157743, "pid": 76337, "tid": -914061504, "ts": 1716454223846577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223846640, "dur": 0, "args": { "External id": 157755, "cbid": 317, "correlation": 157755 } }, { "ph": "f", "id": 157755, "pid": 76337, "tid": -914061504, "ts": 1716454223846640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223846641, "dur": 0, "args": { "External id": 157756, "cbid": 203, "correlation": 157756 } }, { "ph": "f", "id": 157756, "pid": 76337, "tid": -914061504, "ts": 1716454223846641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223846642, "dur": 0, "args": { "External id": 157757, "cbid": 205, "correlation": 157757 } }, { "ph": "f", "id": 157757, "pid": 76337, "tid": -914061504, "ts": 1716454223846642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223901437, "dur": 6, "args": { "External id": 157761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157761, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157761, "pid": 5, "tid": 7, "ts": 1716454223901437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846658, "dur": 13, "args": { "External id": 157761, "cbid": 211, "correlation": 157761 } }, { "ph": "s", "id": 157761, "pid": 76337, "tid": -914061504, "ts": 1716454223846658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223901444, "dur": 38, "args": { "External id": 157763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157763, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 157763, "pid": 5, "tid": 7, "ts": 1716454223901444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846677, "dur": 7, "args": { "External id": 157763, "cbid": 211, "correlation": 157763 } }, { "ph": "s", "id": 157763, "pid": 76337, "tid": -914061504, "ts": 1716454223846677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223901483, "dur": 6, "args": { "External id": 157765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157765, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157765, "pid": 5, "tid": 7, "ts": 1716454223901483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846688, "dur": 5, "args": { "External id": 157765, "cbid": 211, "correlation": 157765 } }, { "ph": "s", "id": 157765, "pid": 76337, "tid": -914061504, "ts": 1716454223846688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901490, "dur": 7, "args": { "External id": 157771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157771, "pid": 5, "tid": 7, "ts": 1716454223901490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846715, "dur": 10, "args": { "External id": 157771, "cbid": 211, "correlation": 157771 } }, { "ph": "s", "id": 157771, "pid": 76337, "tid": -914061504, "ts": 1716454223846715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223901499, "dur": 5, "args": { "External id": 157779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157779, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157779, "pid": 5, "tid": 7, "ts": 1716454223901499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846747, "dur": 8, "args": { "External id": 157779, "cbid": 211, "correlation": 157779 } }, { "ph": "s", "id": 157779, "pid": 76337, "tid": -914061504, "ts": 1716454223846747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223901505, "dur": 11, "args": { "External id": 157799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157799, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157799, "pid": 5, "tid": 7, "ts": 1716454223901505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846820, "dur": 12, "args": { "External id": 157799, "cbid": 211, "correlation": 157799 } }, { "ph": "s", "id": 157799, "pid": 76337, "tid": -914061504, "ts": 1716454223846820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223901518, "dur": 4, "args": { "External id": 157811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157811, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157811, "pid": 5, "tid": 7, "ts": 1716454223901518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846842, "dur": 6, "args": { "External id": 157811, "cbid": 211, "correlation": 157811 } }, { "ph": "s", "id": 157811, "pid": 76337, "tid": -914061504, "ts": 1716454223846842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901524, "dur": 9, "args": { "External id": 157814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157814, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157814, "pid": 5, "tid": 7, "ts": 1716454223901524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846860, "dur": 7, "args": { "External id": 157814, "cbid": 211, "correlation": 157814 } }, { "ph": "s", "id": 157814, "pid": 76337, "tid": -914061504, "ts": 1716454223846860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223901534, "dur": 5, "args": { "External id": 157823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157823, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157823, "pid": 5, "tid": 7, "ts": 1716454223901534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846898, "dur": 10, "args": { "External id": 157823, "cbid": 211, "correlation": 157823 } }, { "ph": "s", "id": 157823, "pid": 76337, "tid": -914061504, "ts": 1716454223846898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223846949, "dur": 0, "args": { "External id": 157833, "cbid": 317, "correlation": 157833 } }, { "ph": "f", "id": 157833, "pid": 76337, "tid": -914061504, "ts": 1716454223846949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223846950, "dur": 0, "args": { "External id": 157834, "cbid": 203, "correlation": 157834 } }, { "ph": "f", "id": 157834, "pid": 76337, "tid": -914061504, "ts": 1716454223846950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223846951, "dur": 0, "args": { "External id": 157835, "cbid": 205, "correlation": 157835 } }, { "ph": "f", "id": 157835, "pid": 76337, "tid": -914061504, "ts": 1716454223846951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223901541, "dur": 5, "args": { "External id": 157839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157839, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157839, "pid": 5, "tid": 7, "ts": 1716454223901541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846964, "dur": 20, "args": { "External id": 157839, "cbid": 211, "correlation": 157839 } }, { "ph": "s", "id": 157839, "pid": 76337, "tid": -914061504, "ts": 1716454223846964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223901547, "dur": 162, "args": { "External id": 157841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157841, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157841, "pid": 5, "tid": 7, "ts": 1716454223901547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223846987, "dur": 5, "args": { "External id": 157841, "cbid": 211, "correlation": 157841 } }, { "ph": "s", "id": 157841, "pid": 76337, "tid": -914061504, "ts": 1716454223846987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223901711, "dur": 1, "args": { "External id": 157843, "device": 5, "context": 1, "stream": 7, "correlation": 157843, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 157843, "pid": 5, "tid": 7, "ts": 1716454223901711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223846998, "dur": 7, "args": { "External id": 157843, "cbid": 51, "correlation": 157843 } }, { "ph": "s", "id": 157843, "pid": 76337, "tid": -914061504, "ts": 1716454223846998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223901715, "dur": 269, "args": { "External id": 157844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157844, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157844, "pid": 5, "tid": 7, "ts": 1716454223901715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847007, "dur": 6, "args": { "External id": 157844, "cbid": 211, "correlation": 157844 } }, { "ph": "s", "id": 157844, "pid": 76337, "tid": -914061504, "ts": 1716454223847007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223901985, "dur": 6, "args": { "External id": 157846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157846, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157846, "pid": 5, "tid": 7, "ts": 1716454223901985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847017, "dur": 6, "args": { "External id": 157846, "cbid": 211, "correlation": 157846 } }, { "ph": "s", "id": 157846, "pid": 76337, "tid": -914061504, "ts": 1716454223847017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223901992, "dur": 6, "args": { "External id": 157852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157852, "pid": 5, "tid": 7, "ts": 1716454223901992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847046, "dur": 8, "args": { "External id": 157852, "cbid": 211, "correlation": 157852 } }, { "ph": "s", "id": 157852, "pid": 76337, "tid": -914061504, "ts": 1716454223847046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223901999, "dur": 3, "args": { "External id": 157860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157860, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 157860, "pid": 5, "tid": 7, "ts": 1716454223901999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847091, "dur": 9, "args": { "External id": 157860, "cbid": 211, "correlation": 157860 } }, { "ph": "s", "id": 157860, "pid": 76337, "tid": -914061504, "ts": 1716454223847091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223847156, "dur": 1, "args": { "External id": 157876, "cbid": 251, "correlation": 157876 } }, { "ph": "f", "id": 157876, "pid": 76337, "tid": -914061504, "ts": 1716454223847156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223847161, "dur": 0, "args": { "External id": 157878, "cbid": 251, "correlation": 157878 } }, { "ph": "f", "id": 157878, "pid": 76337, "tid": -914061504, "ts": 1716454223847161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223902004, "dur": 12, "args": { "External id": 157879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157879, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157879, "pid": 5, "tid": 7, "ts": 1716454223902004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847163, "dur": 12, "args": { "External id": 157879, "cbid": 211, "correlation": 157879 } }, { "ph": "s", "id": 157879, "pid": 76337, "tid": -914061504, "ts": 1716454223847163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223902017, "dur": 5, "args": { "External id": 157881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157881, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157881, "pid": 5, "tid": 7, "ts": 1716454223902017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847177, "dur": 6, "args": { "External id": 157881, "cbid": 211, "correlation": 157881 } }, { "ph": "s", "id": 157881, "pid": 76337, "tid": -914061504, "ts": 1716454223847177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223902024, "dur": 6, "args": { "External id": 157891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157891, "pid": 5, "tid": 7, "ts": 1716454223902024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847234, "dur": 12, "args": { "External id": 157891, "cbid": 211, "correlation": 157891 } }, { "ph": "s", "id": 157891, "pid": 76337, "tid": -914061504, "ts": 1716454223847234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223902031, "dur": 10, "args": { "External id": 157911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157911, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 157911, "pid": 5, "tid": 7, "ts": 1716454223902031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847299, "dur": 11, "args": { "External id": 157911, "cbid": 211, "correlation": 157911 } }, { "ph": "s", "id": 157911, "pid": 76337, "tid": -914061504, "ts": 1716454223847299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223902042, "dur": 4, "args": { "External id": 157923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157923, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 157923, "pid": 5, "tid": 7, "ts": 1716454223902042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847320, "dur": 7, "args": { "External id": 157923, "cbid": 211, "correlation": 157923 } }, { "ph": "s", "id": 157923, "pid": 76337, "tid": -914061504, "ts": 1716454223847320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223902047, "dur": 7, "args": { "External id": 157926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157926, "pid": 5, "tid": 7, "ts": 1716454223902047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847339, "dur": 6, "args": { "External id": 157926, "cbid": 211, "correlation": 157926 } }, { "ph": "s", "id": 157926, "pid": 76337, "tid": -914061504, "ts": 1716454223847339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223902055, "dur": 5, "args": { "External id": 157935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157935, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157935, "pid": 5, "tid": 7, "ts": 1716454223902055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847379, "dur": 10, "args": { "External id": 157935, "cbid": 211, "correlation": 157935 } }, { "ph": "s", "id": 157935, "pid": 76337, "tid": -914061504, "ts": 1716454223847379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223847442, "dur": 0, "args": { "External id": 157945, "cbid": 317, "correlation": 157945 } }, { "ph": "f", "id": 157945, "pid": 76337, "tid": -914061504, "ts": 1716454223847442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223847443, "dur": 0, "args": { "External id": 157946, "cbid": 203, "correlation": 157946 } }, { "ph": "f", "id": 157946, "pid": 76337, "tid": -914061504, "ts": 1716454223847443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223847444, "dur": 0, "args": { "External id": 157947, "cbid": 205, "correlation": 157947 } }, { "ph": "f", "id": 157947, "pid": 76337, "tid": -914061504, "ts": 1716454223847444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223902061, "dur": 5, "args": { "External id": 157951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157951, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157951, "pid": 5, "tid": 7, "ts": 1716454223902061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847457, "dur": 12, "args": { "External id": 157951, "cbid": 211, "correlation": 157951 } }, { "ph": "s", "id": 157951, "pid": 76337, "tid": -914061504, "ts": 1716454223847457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223902067, "dur": 161, "args": { "External id": 157953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157953, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157953, "pid": 5, "tid": 7, "ts": 1716454223902067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847472, "dur": 5, "args": { "External id": 157953, "cbid": 211, "correlation": 157953 } }, { "ph": "s", "id": 157953, "pid": 76337, "tid": -914061504, "ts": 1716454223847472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223902230, "dur": 1, "args": { "External id": 157955, "device": 5, "context": 1, "stream": 7, "correlation": 157955, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 157955, "pid": 5, "tid": 7, "ts": 1716454223902230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223847483, "dur": 6, "args": { "External id": 157955, "cbid": 51, "correlation": 157955 } }, { "ph": "s", "id": 157955, "pid": 76337, "tid": -914061504, "ts": 1716454223847483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223902234, "dur": 258, "args": { "External id": 157956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157956, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 157956, "pid": 5, "tid": 7, "ts": 1716454223902234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847490, "dur": 6, "args": { "External id": 157956, "cbid": 211, "correlation": 157956 } }, { "ph": "s", "id": 157956, "pid": 76337, "tid": -914061504, "ts": 1716454223847490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223902494, "dur": 6, "args": { "External id": 157958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157958, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157958, "pid": 5, "tid": 7, "ts": 1716454223902494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847499, "dur": 5, "args": { "External id": 157958, "cbid": 211, "correlation": 157958 } }, { "ph": "s", "id": 157958, "pid": 76337, "tid": -914061504, "ts": 1716454223847499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223902501, "dur": 6, "args": { "External id": 157964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157964, "pid": 5, "tid": 7, "ts": 1716454223902501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847528, "dur": 8, "args": { "External id": 157964, "cbid": 211, "correlation": 157964 } }, { "ph": "s", "id": 157964, "pid": 76337, "tid": -914061504, "ts": 1716454223847528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223902509, "dur": 5, "args": { "External id": 157972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157972, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157972, "pid": 5, "tid": 7, "ts": 1716454223902509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847560, "dur": 9, "args": { "External id": 157972, "cbid": 211, "correlation": 157972 } }, { "ph": "s", "id": 157972, "pid": 76337, "tid": -914061504, "ts": 1716454223847560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223902515, "dur": 4, "args": { "External id": 157980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157980, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 157980, "pid": 5, "tid": 7, "ts": 1716454223902515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847590, "dur": 8, "args": { "External id": 157980, "cbid": 211, "correlation": 157980 } }, { "ph": "s", "id": 157980, "pid": 76337, "tid": -914061504, "ts": 1716454223847590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223902521, "dur": 11, "args": { "External id": 157989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 157989, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 157989, "pid": 5, "tid": 7, "ts": 1716454223902521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847678, "dur": 13, "args": { "External id": 157989, "cbid": 211, "correlation": 157989 } }, { "ph": "s", "id": 157989, "pid": 76337, "tid": -914061504, "ts": 1716454223847678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223902533, "dur": 12, "args": { "External id": 158009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158009, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158009, "pid": 5, "tid": 7, "ts": 1716454223902533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847746, "dur": 11, "args": { "External id": 158009, "cbid": 211, "correlation": 158009 } }, { "ph": "s", "id": 158009, "pid": 76337, "tid": -914061504, "ts": 1716454223847746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223902547, "dur": 4, "args": { "External id": 158021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158021, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158021, "pid": 5, "tid": 7, "ts": 1716454223902547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847767, "dur": 7, "args": { "External id": 158021, "cbid": 211, "correlation": 158021 } }, { "ph": "s", "id": 158021, "pid": 76337, "tid": -914061504, "ts": 1716454223847767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223902552, "dur": 10, "args": { "External id": 158024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158024, "pid": 5, "tid": 7, "ts": 1716454223902552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847786, "dur": 6, "args": { "External id": 158024, "cbid": 211, "correlation": 158024 } }, { "ph": "s", "id": 158024, "pid": 76337, "tid": -914061504, "ts": 1716454223847786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223902563, "dur": 6, "args": { "External id": 158033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158033, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158033, "pid": 5, "tid": 7, "ts": 1716454223902563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847824, "dur": 10, "args": { "External id": 158033, "cbid": 211, "correlation": 158033 } }, { "ph": "s", "id": 158033, "pid": 76337, "tid": -914061504, "ts": 1716454223847824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223847877, "dur": 0, "args": { "External id": 158043, "cbid": 317, "correlation": 158043 } }, { "ph": "f", "id": 158043, "pid": 76337, "tid": -914061504, "ts": 1716454223847877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223847878, "dur": 0, "args": { "External id": 158044, "cbid": 203, "correlation": 158044 } }, { "ph": "f", "id": 158044, "pid": 76337, "tid": -914061504, "ts": 1716454223847878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223847879, "dur": 0, "args": { "External id": 158045, "cbid": 205, "correlation": 158045 } }, { "ph": "f", "id": 158045, "pid": 76337, "tid": -914061504, "ts": 1716454223847879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223902571, "dur": 7, "args": { "External id": 158049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158049, "pid": 5, "tid": 7, "ts": 1716454223902571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847894, "dur": 11, "args": { "External id": 158049, "cbid": 211, "correlation": 158049 } }, { "ph": "s", "id": 158049, "pid": 76337, "tid": -914061504, "ts": 1716454223847894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223902579, "dur": 319, "args": { "External id": 158051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158051, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158051, "pid": 5, "tid": 7, "ts": 1716454223902579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847908, "dur": 5, "args": { "External id": 158051, "cbid": 211, "correlation": 158051 } }, { "ph": "s", "id": 158051, "pid": 76337, "tid": -914061504, "ts": 1716454223847908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223902900, "dur": 1, "args": { "External id": 158053, "device": 5, "context": 1, "stream": 7, "correlation": 158053, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 158053, "pid": 5, "tid": 7, "ts": 1716454223902900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223847919, "dur": 8, "args": { "External id": 158053, "cbid": 51, "correlation": 158053 } }, { "ph": "s", "id": 158053, "pid": 76337, "tid": -914061504, "ts": 1716454223847919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223902903, "dur": 494, "args": { "External id": 158054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158054, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158054, "pid": 5, "tid": 7, "ts": 1716454223902903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847928, "dur": 6, "args": { "External id": 158054, "cbid": 211, "correlation": 158054 } }, { "ph": "s", "id": 158054, "pid": 76337, "tid": -914061504, "ts": 1716454223847928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223903398, "dur": 5, "args": { "External id": 158056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158056, "pid": 5, "tid": 7, "ts": 1716454223903398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847938, "dur": 5, "args": { "External id": 158056, "cbid": 211, "correlation": 158056 } }, { "ph": "s", "id": 158056, "pid": 76337, "tid": -914061504, "ts": 1716454223847938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223903405, "dur": 6, "args": { "External id": 158062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158062, "pid": 5, "tid": 7, "ts": 1716454223903405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223847966, "dur": 18, "args": { "External id": 158062, "cbid": 211, "correlation": 158062 } }, { "ph": "s", "id": 158062, "pid": 76337, "tid": -914061504, "ts": 1716454223847966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223903413, "dur": 3, "args": { "External id": 158070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158070, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 158070, "pid": 5, "tid": 7, "ts": 1716454223903413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848019, "dur": 11, "args": { "External id": 158070, "cbid": 211, "correlation": 158070 } }, { "ph": "s", "id": 158070, "pid": 76337, "tid": -914061504, "ts": 1716454223848019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223848084, "dur": 1, "args": { "External id": 158086, "cbid": 251, "correlation": 158086 } }, { "ph": "f", "id": 158086, "pid": 76337, "tid": -914061504, "ts": 1716454223848084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223848089, "dur": 0, "args": { "External id": 158088, "cbid": 251, "correlation": 158088 } }, { "ph": "f", "id": 158088, "pid": 76337, "tid": -914061504, "ts": 1716454223848089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223903417, "dur": 11, "args": { "External id": 158089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158089, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158089, "pid": 5, "tid": 7, "ts": 1716454223903417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848091, "dur": 11, "args": { "External id": 158089, "cbid": 211, "correlation": 158089 } }, { "ph": "s", "id": 158089, "pid": 76337, "tid": -914061504, "ts": 1716454223848091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223903430, "dur": 5, "args": { "External id": 158091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158091, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158091, "pid": 5, "tid": 7, "ts": 1716454223903430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848104, "dur": 6, "args": { "External id": 158091, "cbid": 211, "correlation": 158091 } }, { "ph": "s", "id": 158091, "pid": 76337, "tid": -914061504, "ts": 1716454223848104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223903436, "dur": 6, "args": { "External id": 158101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158101, "pid": 5, "tid": 7, "ts": 1716454223903436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848161, "dur": 13, "args": { "External id": 158101, "cbid": 211, "correlation": 158101 } }, { "ph": "s", "id": 158101, "pid": 76337, "tid": -914061504, "ts": 1716454223848161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223903443, "dur": 9, "args": { "External id": 158121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158121, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158121, "pid": 5, "tid": 7, "ts": 1716454223903443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848229, "dur": 11, "args": { "External id": 158121, "cbid": 211, "correlation": 158121 } }, { "ph": "s", "id": 158121, "pid": 76337, "tid": -914061504, "ts": 1716454223848229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223903454, "dur": 4, "args": { "External id": 158133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158133, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 158133, "pid": 5, "tid": 7, "ts": 1716454223903454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848250, "dur": 6, "args": { "External id": 158133, "cbid": 211, "correlation": 158133 } }, { "ph": "s", "id": 158133, "pid": 76337, "tid": -914061504, "ts": 1716454223848250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223903459, "dur": 7, "args": { "External id": 158136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158136, "pid": 5, "tid": 7, "ts": 1716454223903459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848268, "dur": 7, "args": { "External id": 158136, "cbid": 211, "correlation": 158136 } }, { "ph": "s", "id": 158136, "pid": 76337, "tid": -914061504, "ts": 1716454223848268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223903467, "dur": 5, "args": { "External id": 158145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158145, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158145, "pid": 5, "tid": 7, "ts": 1716454223903467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848308, "dur": 10, "args": { "External id": 158145, "cbid": 211, "correlation": 158145 } }, { "ph": "s", "id": 158145, "pid": 76337, "tid": -914061504, "ts": 1716454223848308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223848371, "dur": 0, "args": { "External id": 158155, "cbid": 317, "correlation": 158155 } }, { "ph": "f", "id": 158155, "pid": 76337, "tid": -914061504, "ts": 1716454223848371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223848372, "dur": 0, "args": { "External id": 158156, "cbid": 203, "correlation": 158156 } }, { "ph": "f", "id": 158156, "pid": 76337, "tid": -914061504, "ts": 1716454223848372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223848372, "dur": 0, "args": { "External id": 158157, "cbid": 205, "correlation": 158157 } }, { "ph": "f", "id": 158157, "pid": 76337, "tid": -914061504, "ts": 1716454223848372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223903473, "dur": 5, "args": { "External id": 158161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158161, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158161, "pid": 5, "tid": 7, "ts": 1716454223903473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848387, "dur": 12, "args": { "External id": 158161, "cbid": 211, "correlation": 158161 } }, { "ph": "s", "id": 158161, "pid": 76337, "tid": -914061504, "ts": 1716454223848387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223903480, "dur": 162, "args": { "External id": 158163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158163, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158163, "pid": 5, "tid": 7, "ts": 1716454223903480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848401, "dur": 5, "args": { "External id": 158163, "cbid": 211, "correlation": 158163 } }, { "ph": "s", "id": 158163, "pid": 76337, "tid": -914061504, "ts": 1716454223848401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223903644, "dur": 1, "args": { "External id": 158165, "device": 5, "context": 1, "stream": 7, "correlation": 158165, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 158165, "pid": 5, "tid": 7, "ts": 1716454223903644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223848412, "dur": 6, "args": { "External id": 158165, "cbid": 51, "correlation": 158165 } }, { "ph": "s", "id": 158165, "pid": 76337, "tid": -914061504, "ts": 1716454223848412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223903648, "dur": 258, "args": { "External id": 158166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158166, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158166, "pid": 5, "tid": 7, "ts": 1716454223903648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848419, "dur": 6, "args": { "External id": 158166, "cbid": 211, "correlation": 158166 } }, { "ph": "s", "id": 158166, "pid": 76337, "tid": -914061504, "ts": 1716454223848419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223903907, "dur": 6, "args": { "External id": 158168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158168, "pid": 5, "tid": 7, "ts": 1716454223903907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848430, "dur": 5, "args": { "External id": 158168, "cbid": 211, "correlation": 158168 } }, { "ph": "s", "id": 158168, "pid": 76337, "tid": -914061504, "ts": 1716454223848430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223903915, "dur": 6, "args": { "External id": 158174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158174, "pid": 5, "tid": 7, "ts": 1716454223903915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848458, "dur": 8, "args": { "External id": 158174, "cbid": 211, "correlation": 158174 } }, { "ph": "s", "id": 158174, "pid": 76337, "tid": -914061504, "ts": 1716454223848458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223848516, "dur": 0, "args": { "External id": 158184, "cbid": 317, "correlation": 158184 } }, { "ph": "f", "id": 158184, "pid": 76337, "tid": -914061504, "ts": 1716454223848516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223848517, "dur": 0, "args": { "External id": 158185, "cbid": 203, "correlation": 158185 } }, { "ph": "f", "id": 158185, "pid": 76337, "tid": -914061504, "ts": 1716454223848517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223848518, "dur": 0, "args": { "External id": 158186, "cbid": 205, "correlation": 158186 } }, { "ph": "f", "id": 158186, "pid": 76337, "tid": -914061504, "ts": 1716454223848518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223903922, "dur": 8, "args": { "External id": 158190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158190, "pid": 5, "tid": 7, "ts": 1716454223903922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848534, "dur": 11, "args": { "External id": 158190, "cbid": 211, "correlation": 158190 } }, { "ph": "s", "id": 158190, "pid": 76337, "tid": -914061504, "ts": 1716454223848534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223903931, "dur": 3, "args": { "External id": 158192, "device": 5, "context": 1, "stream": 7, "correlation": 158192, "bytes": 4800, "memory bandwidth (GB/s)": 1.4150943396226414 } }, { "ph": "f", "id": 158192, "pid": 5, "tid": 7, "ts": 1716454223903931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223848551, "dur": 14, "args": { "External id": 158192, "cbid": 51, "correlation": 158192 } }, { "ph": "s", "id": 158192, "pid": 76337, "tid": -914061504, "ts": 1716454223848551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223903936, "dur": 96, "args": { "External id": 158193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158193, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 158193, "pid": 5, "tid": 7, "ts": 1716454223903936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848567, "dur": 7, "args": { "External id": 158193, "cbid": 211, "correlation": 158193 } }, { "ph": "s", "id": 158193, "pid": 76337, "tid": -914061504, "ts": 1716454223848567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223904033, "dur": 5, "args": { "External id": 158195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158195, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158195, "pid": 5, "tid": 7, "ts": 1716454223904033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848578, "dur": 5, "args": { "External id": 158195, "cbid": 211, "correlation": 158195 } }, { "ph": "s", "id": 158195, "pid": 76337, "tid": -914061504, "ts": 1716454223848578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223904040, "dur": 6, "args": { "External id": 158201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158201, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158201, "pid": 5, "tid": 7, "ts": 1716454223904040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848605, "dur": 8, "args": { "External id": 158201, "cbid": 211, "correlation": 158201 } }, { "ph": "s", "id": 158201, "pid": 76337, "tid": -914061504, "ts": 1716454223848605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223904047, "dur": 5, "args": { "External id": 158209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158209, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158209, "pid": 5, "tid": 7, "ts": 1716454223904047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848635, "dur": 7, "args": { "External id": 158209, "cbid": 211, "correlation": 158209 } }, { "ph": "s", "id": 158209, "pid": 76337, "tid": -914061504, "ts": 1716454223848635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223904053, "dur": 4, "args": { "External id": 158217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158217, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158217, "pid": 5, "tid": 7, "ts": 1716454223904053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848663, "dur": 8, "args": { "External id": 158217, "cbid": 211, "correlation": 158217 } }, { "ph": "s", "id": 158217, "pid": 76337, "tid": -914061504, "ts": 1716454223848663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223904059, "dur": 11, "args": { "External id": 158226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158226, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158226, "pid": 5, "tid": 7, "ts": 1716454223904059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848707, "dur": 10, "args": { "External id": 158226, "cbid": 211, "correlation": 158226 } }, { "ph": "s", "id": 158226, "pid": 76337, "tid": -914061504, "ts": 1716454223848707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223904072, "dur": 12, "args": { "External id": 158246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158246, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158246, "pid": 5, "tid": 7, "ts": 1716454223904072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848775, "dur": 12, "args": { "External id": 158246, "cbid": 211, "correlation": 158246 } }, { "ph": "s", "id": 158246, "pid": 76337, "tid": -914061504, "ts": 1716454223848775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223904085, "dur": 4, "args": { "External id": 158258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158258, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158258, "pid": 5, "tid": 7, "ts": 1716454223904085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848796, "dur": 6, "args": { "External id": 158258, "cbid": 211, "correlation": 158258 } }, { "ph": "s", "id": 158258, "pid": 76337, "tid": -914061504, "ts": 1716454223848796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223904090, "dur": 10, "args": { "External id": 158261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158261, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158261, "pid": 5, "tid": 7, "ts": 1716454223904090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848814, "dur": 6, "args": { "External id": 158261, "cbid": 211, "correlation": 158261 } }, { "ph": "s", "id": 158261, "pid": 76337, "tid": -914061504, "ts": 1716454223848814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223904102, "dur": 7, "args": { "External id": 158270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158270, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158270, "pid": 5, "tid": 7, "ts": 1716454223904102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848852, "dur": 9, "args": { "External id": 158270, "cbid": 211, "correlation": 158270 } }, { "ph": "s", "id": 158270, "pid": 76337, "tid": -914061504, "ts": 1716454223848852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223848904, "dur": 0, "args": { "External id": 158280, "cbid": 317, "correlation": 158280 } }, { "ph": "f", "id": 158280, "pid": 76337, "tid": -914061504, "ts": 1716454223848904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223848905, "dur": 0, "args": { "External id": 158281, "cbid": 203, "correlation": 158281 } }, { "ph": "f", "id": 158281, "pid": 76337, "tid": -914061504, "ts": 1716454223848905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223848906, "dur": 0, "args": { "External id": 158282, "cbid": 205, "correlation": 158282 } }, { "ph": "f", "id": 158282, "pid": 76337, "tid": -914061504, "ts": 1716454223848906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223904110, "dur": 7, "args": { "External id": 158286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158286, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158286, "pid": 5, "tid": 7, "ts": 1716454223904110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848920, "dur": 12, "args": { "External id": 158286, "cbid": 211, "correlation": 158286 } }, { "ph": "s", "id": 158286, "pid": 76337, "tid": -914061504, "ts": 1716454223848920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223904118, "dur": 319, "args": { "External id": 158288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158288, "pid": 5, "tid": 7, "ts": 1716454223904118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848934, "dur": 5, "args": { "External id": 158288, "cbid": 211, "correlation": 158288 } }, { "ph": "s", "id": 158288, "pid": 76337, "tid": -914061504, "ts": 1716454223848934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223904440, "dur": 1, "args": { "External id": 158290, "device": 5, "context": 1, "stream": 7, "correlation": 158290, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 158290, "pid": 5, "tid": 7, "ts": 1716454223904440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223848945, "dur": 6, "args": { "External id": 158290, "cbid": 51, "correlation": 158290 } }, { "ph": "s", "id": 158290, "pid": 76337, "tid": -914061504, "ts": 1716454223848945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223904443, "dur": 495, "args": { "External id": 158291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158291, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158291, "pid": 5, "tid": 7, "ts": 1716454223904443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848952, "dur": 6, "args": { "External id": 158291, "cbid": 211, "correlation": 158291 } }, { "ph": "s", "id": 158291, "pid": 76337, "tid": -914061504, "ts": 1716454223848952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223904940, "dur": 6, "args": { "External id": 158293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158293, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158293, "pid": 5, "tid": 7, "ts": 1716454223904940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848961, "dur": 5, "args": { "External id": 158293, "cbid": 211, "correlation": 158293 } }, { "ph": "s", "id": 158293, "pid": 76337, "tid": -914061504, "ts": 1716454223848961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223904947, "dur": 6, "args": { "External id": 158299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158299, "pid": 5, "tid": 7, "ts": 1716454223904947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223848997, "dur": 9, "args": { "External id": 158299, "cbid": 211, "correlation": 158299 } }, { "ph": "s", "id": 158299, "pid": 76337, "tid": -914061504, "ts": 1716454223848997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223904955, "dur": 3, "args": { "External id": 158307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158307, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 158307, "pid": 5, "tid": 7, "ts": 1716454223904955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849042, "dur": 9, "args": { "External id": 158307, "cbid": 211, "correlation": 158307 } }, { "ph": "s", "id": 158307, "pid": 76337, "tid": -914061504, "ts": 1716454223849042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223849104, "dur": 1, "args": { "External id": 158323, "cbid": 251, "correlation": 158323 } }, { "ph": "f", "id": 158323, "pid": 76337, "tid": -914061504, "ts": 1716454223849104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223849108, "dur": 0, "args": { "External id": 158325, "cbid": 251, "correlation": 158325 } }, { "ph": "f", "id": 158325, "pid": 76337, "tid": -914061504, "ts": 1716454223849108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223904959, "dur": 12, "args": { "External id": 158326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158326, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158326, "pid": 5, "tid": 7, "ts": 1716454223904959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849110, "dur": 11, "args": { "External id": 158326, "cbid": 211, "correlation": 158326 } }, { "ph": "s", "id": 158326, "pid": 76337, "tid": -914061504, "ts": 1716454223849110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223904972, "dur": 5, "args": { "External id": 158328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158328, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158328, "pid": 5, "tid": 7, "ts": 1716454223904972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849123, "dur": 6, "args": { "External id": 158328, "cbid": 211, "correlation": 158328 } }, { "ph": "s", "id": 158328, "pid": 76337, "tid": -914061504, "ts": 1716454223849123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223904979, "dur": 6, "args": { "External id": 158338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158338, "pid": 5, "tid": 7, "ts": 1716454223904979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849181, "dur": 12, "args": { "External id": 158338, "cbid": 211, "correlation": 158338 } }, { "ph": "s", "id": 158338, "pid": 76337, "tid": -914061504, "ts": 1716454223849181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223904986, "dur": 9, "args": { "External id": 158358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158358, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158358, "pid": 5, "tid": 7, "ts": 1716454223904986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849246, "dur": 11, "args": { "External id": 158358, "cbid": 211, "correlation": 158358 } }, { "ph": "s", "id": 158358, "pid": 76337, "tid": -914061504, "ts": 1716454223849246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223904997, "dur": 4, "args": { "External id": 158370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158370, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 158370, "pid": 5, "tid": 7, "ts": 1716454223904997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849267, "dur": 6, "args": { "External id": 158370, "cbid": 211, "correlation": 158370 } }, { "ph": "s", "id": 158370, "pid": 76337, "tid": -914061504, "ts": 1716454223849267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223905002, "dur": 7, "args": { "External id": 158373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158373, "pid": 5, "tid": 7, "ts": 1716454223905002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849287, "dur": 6, "args": { "External id": 158373, "cbid": 211, "correlation": 158373 } }, { "ph": "s", "id": 158373, "pid": 76337, "tid": -914061504, "ts": 1716454223849287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223905010, "dur": 5, "args": { "External id": 158382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158382, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158382, "pid": 5, "tid": 7, "ts": 1716454223905010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849328, "dur": 10, "args": { "External id": 158382, "cbid": 211, "correlation": 158382 } }, { "ph": "s", "id": 158382, "pid": 76337, "tid": -914061504, "ts": 1716454223849328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223849392, "dur": 0, "args": { "External id": 158392, "cbid": 317, "correlation": 158392 } }, { "ph": "f", "id": 158392, "pid": 76337, "tid": -914061504, "ts": 1716454223849392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223849393, "dur": 0, "args": { "External id": 158393, "cbid": 203, "correlation": 158393 } }, { "ph": "f", "id": 158393, "pid": 76337, "tid": -914061504, "ts": 1716454223849393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223849394, "dur": 0, "args": { "External id": 158394, "cbid": 205, "correlation": 158394 } }, { "ph": "f", "id": 158394, "pid": 76337, "tid": -914061504, "ts": 1716454223849394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905016, "dur": 5, "args": { "External id": 158398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158398, "pid": 5, "tid": 7, "ts": 1716454223905016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849409, "dur": 12, "args": { "External id": 158398, "cbid": 211, "correlation": 158398 } }, { "ph": "s", "id": 158398, "pid": 76337, "tid": -914061504, "ts": 1716454223849409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905022, "dur": 162, "args": { "External id": 158400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158400, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158400, "pid": 5, "tid": 7, "ts": 1716454223905022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849425, "dur": 6, "args": { "External id": 158400, "cbid": 211, "correlation": 158400 } }, { "ph": "s", "id": 158400, "pid": 76337, "tid": -914061504, "ts": 1716454223849425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223905186, "dur": 1, "args": { "External id": 158402, "device": 5, "context": 1, "stream": 7, "correlation": 158402, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 158402, "pid": 5, "tid": 7, "ts": 1716454223905186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223849436, "dur": 7, "args": { "External id": 158402, "cbid": 51, "correlation": 158402 } }, { "ph": "s", "id": 158402, "pid": 76337, "tid": -914061504, "ts": 1716454223849436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223905190, "dur": 258, "args": { "External id": 158403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158403, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158403, "pid": 5, "tid": 7, "ts": 1716454223905190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849445, "dur": 6, "args": { "External id": 158403, "cbid": 211, "correlation": 158403 } }, { "ph": "s", "id": 158403, "pid": 76337, "tid": -914061504, "ts": 1716454223849445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905449, "dur": 6, "args": { "External id": 158405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158405, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158405, "pid": 5, "tid": 7, "ts": 1716454223905449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849456, "dur": 5, "args": { "External id": 158405, "cbid": 211, "correlation": 158405 } }, { "ph": "s", "id": 158405, "pid": 76337, "tid": -914061504, "ts": 1716454223849456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223905456, "dur": 6, "args": { "External id": 158411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158411, "pid": 5, "tid": 7, "ts": 1716454223905456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849486, "dur": 9, "args": { "External id": 158411, "cbid": 211, "correlation": 158411 } }, { "ph": "s", "id": 158411, "pid": 76337, "tid": -914061504, "ts": 1716454223849486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223849545, "dur": 0, "args": { "External id": 158421, "cbid": 317, "correlation": 158421 } }, { "ph": "f", "id": 158421, "pid": 76337, "tid": -914061504, "ts": 1716454223849545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223849546, "dur": 0, "args": { "External id": 158422, "cbid": 203, "correlation": 158422 } }, { "ph": "f", "id": 158422, "pid": 76337, "tid": -914061504, "ts": 1716454223849546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223849546, "dur": 0, "args": { "External id": 158423, "cbid": 205, "correlation": 158423 } }, { "ph": "f", "id": 158423, "pid": 76337, "tid": -914061504, "ts": 1716454223849546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905464, "dur": 8, "args": { "External id": 158427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158427, "pid": 5, "tid": 7, "ts": 1716454223905464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849559, "dur": 12, "args": { "External id": 158427, "cbid": 211, "correlation": 158427 } }, { "ph": "s", "id": 158427, "pid": 76337, "tid": -914061504, "ts": 1716454223849559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223905473, "dur": 3, "args": { "External id": 158429, "device": 5, "context": 1, "stream": 7, "correlation": 158429, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 158429, "pid": 5, "tid": 7, "ts": 1716454223905473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223849576, "dur": 11, "args": { "External id": 158429, "cbid": 51, "correlation": 158429 } }, { "ph": "s", "id": 158429, "pid": 76337, "tid": -914061504, "ts": 1716454223849576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223905477, "dur": 96, "args": { "External id": 158430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158430, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 158430, "pid": 5, "tid": 7, "ts": 1716454223905477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849589, "dur": 7, "args": { "External id": 158430, "cbid": 211, "correlation": 158430 } }, { "ph": "s", "id": 158430, "pid": 76337, "tid": -914061504, "ts": 1716454223849589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905575, "dur": 6, "args": { "External id": 158432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158432, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158432, "pid": 5, "tid": 7, "ts": 1716454223905575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849599, "dur": 5, "args": { "External id": 158432, "cbid": 211, "correlation": 158432 } }, { "ph": "s", "id": 158432, "pid": 76337, "tid": -914061504, "ts": 1716454223849599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223905582, "dur": 6, "args": { "External id": 158438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158438, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158438, "pid": 5, "tid": 7, "ts": 1716454223905582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849626, "dur": 9, "args": { "External id": 158438, "cbid": 211, "correlation": 158438 } }, { "ph": "s", "id": 158438, "pid": 76337, "tid": -914061504, "ts": 1716454223849626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223905589, "dur": 5, "args": { "External id": 158446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158446, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158446, "pid": 5, "tid": 7, "ts": 1716454223905589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849656, "dur": 8, "args": { "External id": 158446, "cbid": 211, "correlation": 158446 } }, { "ph": "s", "id": 158446, "pid": 76337, "tid": -914061504, "ts": 1716454223849656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223905595, "dur": 4, "args": { "External id": 158454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158454, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158454, "pid": 5, "tid": 7, "ts": 1716454223905595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849688, "dur": 8, "args": { "External id": 158454, "cbid": 211, "correlation": 158454 } }, { "ph": "s", "id": 158454, "pid": 76337, "tid": -914061504, "ts": 1716454223849688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223905601, "dur": 11, "args": { "External id": 158463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158463, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158463, "pid": 5, "tid": 7, "ts": 1716454223905601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849738, "dur": 11, "args": { "External id": 158463, "cbid": 211, "correlation": 158463 } }, { "ph": "s", "id": 158463, "pid": 76337, "tid": -914061504, "ts": 1716454223849738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223905614, "dur": 12, "args": { "External id": 158483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158483, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158483, "pid": 5, "tid": 7, "ts": 1716454223905614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849814, "dur": 12, "args": { "External id": 158483, "cbid": 211, "correlation": 158483 } }, { "ph": "s", "id": 158483, "pid": 76337, "tid": -914061504, "ts": 1716454223849814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223905628, "dur": 4, "args": { "External id": 158495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158495, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158495, "pid": 5, "tid": 7, "ts": 1716454223905628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849836, "dur": 6, "args": { "External id": 158495, "cbid": 211, "correlation": 158495 } }, { "ph": "s", "id": 158495, "pid": 76337, "tid": -914061504, "ts": 1716454223849836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223905633, "dur": 11, "args": { "External id": 158498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158498, "pid": 5, "tid": 7, "ts": 1716454223905633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849855, "dur": 7, "args": { "External id": 158498, "cbid": 211, "correlation": 158498 } }, { "ph": "s", "id": 158498, "pid": 76337, "tid": -914061504, "ts": 1716454223849855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223905645, "dur": 6, "args": { "External id": 158507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158507, "pid": 5, "tid": 7, "ts": 1716454223905645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849895, "dur": 10, "args": { "External id": 158507, "cbid": 211, "correlation": 158507 } }, { "ph": "s", "id": 158507, "pid": 76337, "tid": -914061504, "ts": 1716454223849895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223849947, "dur": 0, "args": { "External id": 158517, "cbid": 317, "correlation": 158517 } }, { "ph": "f", "id": 158517, "pid": 76337, "tid": -914061504, "ts": 1716454223849947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223849948, "dur": 0, "args": { "External id": 158518, "cbid": 203, "correlation": 158518 } }, { "ph": "f", "id": 158518, "pid": 76337, "tid": -914061504, "ts": 1716454223849948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223849949, "dur": 0, "args": { "External id": 158519, "cbid": 205, "correlation": 158519 } }, { "ph": "f", "id": 158519, "pid": 76337, "tid": -914061504, "ts": 1716454223849949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905653, "dur": 7, "args": { "External id": 158523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158523, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158523, "pid": 5, "tid": 7, "ts": 1716454223905653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849963, "dur": 22, "args": { "External id": 158523, "cbid": 211, "correlation": 158523 } }, { "ph": "s", "id": 158523, "pid": 76337, "tid": -914061504, "ts": 1716454223849963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223905661, "dur": 319, "args": { "External id": 158525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158525, "pid": 5, "tid": 7, "ts": 1716454223905661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223849987, "dur": 5, "args": { "External id": 158525, "cbid": 211, "correlation": 158525 } }, { "ph": "s", "id": 158525, "pid": 76337, "tid": -914061504, "ts": 1716454223849987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223905981, "dur": 1, "args": { "External id": 158527, "device": 5, "context": 1, "stream": 7, "correlation": 158527, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 158527, "pid": 5, "tid": 7, "ts": 1716454223905981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223849998, "dur": 7, "args": { "External id": 158527, "cbid": 51, "correlation": 158527 } }, { "ph": "s", "id": 158527, "pid": 76337, "tid": -914061504, "ts": 1716454223849998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223905985, "dur": 496, "args": { "External id": 158528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158528, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158528, "pid": 5, "tid": 7, "ts": 1716454223905985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850006, "dur": 6, "args": { "External id": 158528, "cbid": 211, "correlation": 158528 } }, { "ph": "s", "id": 158528, "pid": 76337, "tid": -914061504, "ts": 1716454223850006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223906482, "dur": 5, "args": { "External id": 158530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158530, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158530, "pid": 5, "tid": 7, "ts": 1716454223906482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850016, "dur": 5, "args": { "External id": 158530, "cbid": 211, "correlation": 158530 } }, { "ph": "s", "id": 158530, "pid": 76337, "tid": -914061504, "ts": 1716454223850016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223906489, "dur": 6, "args": { "External id": 158536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158536, "pid": 5, "tid": 7, "ts": 1716454223906489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850046, "dur": 9, "args": { "External id": 158536, "cbid": 211, "correlation": 158536 } }, { "ph": "s", "id": 158536, "pid": 76337, "tid": -914061504, "ts": 1716454223850046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223906496, "dur": 3, "args": { "External id": 158544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158544, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 158544, "pid": 5, "tid": 7, "ts": 1716454223906496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850090, "dur": 9, "args": { "External id": 158544, "cbid": 211, "correlation": 158544 } }, { "ph": "s", "id": 158544, "pid": 76337, "tid": -914061504, "ts": 1716454223850090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223850167, "dur": 1, "args": { "External id": 158560, "cbid": 251, "correlation": 158560 } }, { "ph": "f", "id": 158560, "pid": 76337, "tid": -914061504, "ts": 1716454223850167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223850174, "dur": 0, "args": { "External id": 158562, "cbid": 251, "correlation": 158562 } }, { "ph": "f", "id": 158562, "pid": 76337, "tid": -914061504, "ts": 1716454223850174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223906500, "dur": 12, "args": { "External id": 158563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158563, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158563, "pid": 5, "tid": 7, "ts": 1716454223906500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850176, "dur": 12, "args": { "External id": 158563, "cbid": 211, "correlation": 158563 } }, { "ph": "s", "id": 158563, "pid": 76337, "tid": -914061504, "ts": 1716454223850176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223906514, "dur": 5, "args": { "External id": 158565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158565, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158565, "pid": 5, "tid": 7, "ts": 1716454223906514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850191, "dur": 6, "args": { "External id": 158565, "cbid": 211, "correlation": 158565 } }, { "ph": "s", "id": 158565, "pid": 76337, "tid": -914061504, "ts": 1716454223850191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223906521, "dur": 6, "args": { "External id": 158575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158575, "pid": 5, "tid": 7, "ts": 1716454223906521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850249, "dur": 12, "args": { "External id": 158575, "cbid": 211, "correlation": 158575 } }, { "ph": "s", "id": 158575, "pid": 76337, "tid": -914061504, "ts": 1716454223850249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223906528, "dur": 9, "args": { "External id": 158595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158595, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158595, "pid": 5, "tid": 7, "ts": 1716454223906528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850315, "dur": 11, "args": { "External id": 158595, "cbid": 211, "correlation": 158595 } }, { "ph": "s", "id": 158595, "pid": 76337, "tid": -914061504, "ts": 1716454223850315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223906538, "dur": 4, "args": { "External id": 158607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158607, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 158607, "pid": 5, "tid": 7, "ts": 1716454223906538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850336, "dur": 6, "args": { "External id": 158607, "cbid": 211, "correlation": 158607 } }, { "ph": "s", "id": 158607, "pid": 76337, "tid": -914061504, "ts": 1716454223850336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223906543, "dur": 7, "args": { "External id": 158610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158610, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158610, "pid": 5, "tid": 7, "ts": 1716454223906543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850355, "dur": 6, "args": { "External id": 158610, "cbid": 211, "correlation": 158610 } }, { "ph": "s", "id": 158610, "pid": 76337, "tid": -914061504, "ts": 1716454223850355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223906552, "dur": 5, "args": { "External id": 158619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158619, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158619, "pid": 5, "tid": 7, "ts": 1716454223906552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850396, "dur": 10, "args": { "External id": 158619, "cbid": 211, "correlation": 158619 } }, { "ph": "s", "id": 158619, "pid": 76337, "tid": -914061504, "ts": 1716454223850396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223850459, "dur": 0, "args": { "External id": 158629, "cbid": 317, "correlation": 158629 } }, { "ph": "f", "id": 158629, "pid": 76337, "tid": -914061504, "ts": 1716454223850459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223850459, "dur": 0, "args": { "External id": 158630, "cbid": 203, "correlation": 158630 } }, { "ph": "f", "id": 158630, "pid": 76337, "tid": -914061504, "ts": 1716454223850459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223850460, "dur": 0, "args": { "External id": 158631, "cbid": 205, "correlation": 158631 } }, { "ph": "f", "id": 158631, "pid": 76337, "tid": -914061504, "ts": 1716454223850460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223906558, "dur": 5, "args": { "External id": 158635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158635, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158635, "pid": 5, "tid": 7, "ts": 1716454223906558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850475, "dur": 12, "args": { "External id": 158635, "cbid": 211, "correlation": 158635 } }, { "ph": "s", "id": 158635, "pid": 76337, "tid": -914061504, "ts": 1716454223850475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223906564, "dur": 162, "args": { "External id": 158637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158637, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158637, "pid": 5, "tid": 7, "ts": 1716454223906564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850489, "dur": 5, "args": { "External id": 158637, "cbid": 211, "correlation": 158637 } }, { "ph": "s", "id": 158637, "pid": 76337, "tid": -914061504, "ts": 1716454223850489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223906729, "dur": 1, "args": { "External id": 158639, "device": 5, "context": 1, "stream": 7, "correlation": 158639, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 158639, "pid": 5, "tid": 7, "ts": 1716454223906729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223850500, "dur": 6, "args": { "External id": 158639, "cbid": 51, "correlation": 158639 } }, { "ph": "s", "id": 158639, "pid": 76337, "tid": -914061504, "ts": 1716454223850500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223906732, "dur": 258, "args": { "External id": 158640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158640, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158640, "pid": 5, "tid": 7, "ts": 1716454223906732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850507, "dur": 6, "args": { "External id": 158640, "cbid": 211, "correlation": 158640 } }, { "ph": "s", "id": 158640, "pid": 76337, "tid": -914061504, "ts": 1716454223850507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223906992, "dur": 6, "args": { "External id": 158642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158642, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158642, "pid": 5, "tid": 7, "ts": 1716454223906992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850518, "dur": 5, "args": { "External id": 158642, "cbid": 211, "correlation": 158642 } }, { "ph": "s", "id": 158642, "pid": 76337, "tid": -914061504, "ts": 1716454223850518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223906999, "dur": 6, "args": { "External id": 158648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158648, "pid": 5, "tid": 7, "ts": 1716454223906999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850547, "dur": 8, "args": { "External id": 158648, "cbid": 211, "correlation": 158648 } }, { "ph": "s", "id": 158648, "pid": 76337, "tid": -914061504, "ts": 1716454223850547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223850605, "dur": 0, "args": { "External id": 158658, "cbid": 317, "correlation": 158658 } }, { "ph": "f", "id": 158658, "pid": 76337, "tid": -914061504, "ts": 1716454223850605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223850606, "dur": 0, "args": { "External id": 158659, "cbid": 203, "correlation": 158659 } }, { "ph": "f", "id": 158659, "pid": 76337, "tid": -914061504, "ts": 1716454223850606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223850606, "dur": 0, "args": { "External id": 158660, "cbid": 205, "correlation": 158660 } }, { "ph": "f", "id": 158660, "pid": 76337, "tid": -914061504, "ts": 1716454223850606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223907007, "dur": 8, "args": { "External id": 158664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158664, "pid": 5, "tid": 7, "ts": 1716454223907007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850618, "dur": 13, "args": { "External id": 158664, "cbid": 211, "correlation": 158664 } }, { "ph": "s", "id": 158664, "pid": 76337, "tid": -914061504, "ts": 1716454223850618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223907016, "dur": 3, "args": { "External id": 158666, "device": 5, "context": 1, "stream": 7, "correlation": 158666, "bytes": 4800, "memory bandwidth (GB/s)": 1.5151515151515151 } }, { "ph": "f", "id": 158666, "pid": 5, "tid": 7, "ts": 1716454223907016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223850636, "dur": 9, "args": { "External id": 158666, "cbid": 51, "correlation": 158666 } }, { "ph": "s", "id": 158666, "pid": 76337, "tid": -914061504, "ts": 1716454223850636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223907020, "dur": 94, "args": { "External id": 158667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158667, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 158667, "pid": 5, "tid": 7, "ts": 1716454223907020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850646, "dur": 6, "args": { "External id": 158667, "cbid": 211, "correlation": 158667 } }, { "ph": "s", "id": 158667, "pid": 76337, "tid": -914061504, "ts": 1716454223850646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223907115, "dur": 5, "args": { "External id": 158669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158669, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158669, "pid": 5, "tid": 7, "ts": 1716454223907115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850655, "dur": 5, "args": { "External id": 158669, "cbid": 211, "correlation": 158669 } }, { "ph": "s", "id": 158669, "pid": 76337, "tid": -914061504, "ts": 1716454223850655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223907122, "dur": 6, "args": { "External id": 158675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158675, "pid": 5, "tid": 7, "ts": 1716454223907122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850683, "dur": 8, "args": { "External id": 158675, "cbid": 211, "correlation": 158675 } }, { "ph": "s", "id": 158675, "pid": 76337, "tid": -914061504, "ts": 1716454223850683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223907129, "dur": 5, "args": { "External id": 158683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158683, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158683, "pid": 5, "tid": 7, "ts": 1716454223907129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850712, "dur": 8, "args": { "External id": 158683, "cbid": 211, "correlation": 158683 } }, { "ph": "s", "id": 158683, "pid": 76337, "tid": -914061504, "ts": 1716454223850712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223907135, "dur": 4, "args": { "External id": 158691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158691, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 158691, "pid": 5, "tid": 7, "ts": 1716454223907135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850741, "dur": 8, "args": { "External id": 158691, "cbid": 211, "correlation": 158691 } }, { "ph": "s", "id": 158691, "pid": 76337, "tid": -914061504, "ts": 1716454223850741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223907141, "dur": 14, "args": { "External id": 158702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158702, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158702, "pid": 5, "tid": 7, "ts": 1716454223907141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850827, "dur": 12, "args": { "External id": 158702, "cbid": 211, "correlation": 158702 } }, { "ph": "s", "id": 158702, "pid": 76337, "tid": -914061504, "ts": 1716454223850827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223850883, "dur": 0, "args": { "External id": 158712, "cbid": 317, "correlation": 158712 } }, { "ph": "f", "id": 158712, "pid": 76337, "tid": -914061504, "ts": 1716454223850883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223850884, "dur": 0, "args": { "External id": 158713, "cbid": 203, "correlation": 158713 } }, { "ph": "f", "id": 158713, "pid": 76337, "tid": -914061504, "ts": 1716454223850884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223850885, "dur": 0, "args": { "External id": 158714, "cbid": 205, "correlation": 158714 } }, { "ph": "f", "id": 158714, "pid": 76337, "tid": -914061504, "ts": 1716454223850885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223907157, "dur": 8, "args": { "External id": 158718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158718, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158718, "pid": 5, "tid": 7, "ts": 1716454223907157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850899, "dur": 12, "args": { "External id": 158718, "cbid": 211, "correlation": 158718 } }, { "ph": "s", "id": 158718, "pid": 76337, "tid": -914061504, "ts": 1716454223850899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223907166, "dur": 163, "args": { "External id": 158720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158720, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158720, "pid": 5, "tid": 7, "ts": 1716454223907166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850913, "dur": 5, "args": { "External id": 158720, "cbid": 211, "correlation": 158720 } }, { "ph": "s", "id": 158720, "pid": 76337, "tid": -914061504, "ts": 1716454223850913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223907331, "dur": 1, "args": { "External id": 158722, "device": 5, "context": 1, "stream": 7, "correlation": 158722, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 158722, "pid": 5, "tid": 7, "ts": 1716454223907331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223850924, "dur": 7, "args": { "External id": 158722, "cbid": 51, "correlation": 158722 } }, { "ph": "s", "id": 158722, "pid": 76337, "tid": -914061504, "ts": 1716454223850924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223907335, "dur": 648, "args": { "External id": 158723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158723, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158723, "pid": 5, "tid": 7, "ts": 1716454223907335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850932, "dur": 6, "args": { "External id": 158723, "cbid": 211, "correlation": 158723 } }, { "ph": "s", "id": 158723, "pid": 76337, "tid": -914061504, "ts": 1716454223850932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223907984, "dur": 12, "args": { "External id": 158725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158725, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158725, "pid": 5, "tid": 7, "ts": 1716454223907984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850941, "dur": 5, "args": { "External id": 158725, "cbid": 211, "correlation": 158725 } }, { "ph": "s", "id": 158725, "pid": 76337, "tid": -914061504, "ts": 1716454223850941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223907997, "dur": 15, "args": { "External id": 158731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158731, "pid": 5, "tid": 7, "ts": 1716454223907997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223850970, "dur": 20, "args": { "External id": 158731, "cbid": 211, "correlation": 158731 } }, { "ph": "s", "id": 158731, "pid": 76337, "tid": -914061504, "ts": 1716454223850970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223908014, "dur": 32, "args": { "External id": 158740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158740, "pid": 5, "tid": 7, "ts": 1716454223908014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851081, "dur": 13, "args": { "External id": 158740, "cbid": 211, "correlation": 158740 } }, { "ph": "s", "id": 158740, "pid": 76337, "tid": -914061504, "ts": 1716454223851081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223908047, "dur": 31, "args": { "External id": 158760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158760, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158760, "pid": 5, "tid": 7, "ts": 1716454223908047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851150, "dur": 11, "args": { "External id": 158760, "cbid": 211, "correlation": 158760 } }, { "ph": "s", "id": 158760, "pid": 76337, "tid": -914061504, "ts": 1716454223851150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223908079, "dur": 5, "args": { "External id": 158772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158772, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158772, "pid": 5, "tid": 7, "ts": 1716454223908079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851170, "dur": 6, "args": { "External id": 158772, "cbid": 211, "correlation": 158772 } }, { "ph": "s", "id": 158772, "pid": 76337, "tid": -914061504, "ts": 1716454223851170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223908085, "dur": 30, "args": { "External id": 158775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158775, "pid": 5, "tid": 7, "ts": 1716454223908085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851189, "dur": 7, "args": { "External id": 158775, "cbid": 211, "correlation": 158775 } }, { "ph": "s", "id": 158775, "pid": 76337, "tid": -914061504, "ts": 1716454223851189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223908117, "dur": 22, "args": { "External id": 158784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158784, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158784, "pid": 5, "tid": 7, "ts": 1716454223908117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851229, "dur": 10, "args": { "External id": 158784, "cbid": 211, "correlation": 158784 } }, { "ph": "s", "id": 158784, "pid": 76337, "tid": -914061504, "ts": 1716454223851229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223851281, "dur": 0, "args": { "External id": 158794, "cbid": 317, "correlation": 158794 } }, { "ph": "f", "id": 158794, "pid": 76337, "tid": -914061504, "ts": 1716454223851281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223851282, "dur": 0, "args": { "External id": 158795, "cbid": 203, "correlation": 158795 } }, { "ph": "f", "id": 158795, "pid": 76337, "tid": -914061504, "ts": 1716454223851282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223851283, "dur": 0, "args": { "External id": 158796, "cbid": 205, "correlation": 158796 } }, { "ph": "f", "id": 158796, "pid": 76337, "tid": -914061504, "ts": 1716454223851283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223908140, "dur": 22, "args": { "External id": 158800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158800, "pid": 5, "tid": 7, "ts": 1716454223908140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851299, "dur": 11, "args": { "External id": 158800, "cbid": 211, "correlation": 158800 } }, { "ph": "s", "id": 158800, "pid": 76337, "tid": -914061504, "ts": 1716454223851299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223908163, "dur": 320, "args": { "External id": 158802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158802, "pid": 5, "tid": 7, "ts": 1716454223908163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851312, "dur": 5, "args": { "External id": 158802, "cbid": 211, "correlation": 158802 } }, { "ph": "s", "id": 158802, "pid": 76337, "tid": -914061504, "ts": 1716454223851312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223908486, "dur": 1, "args": { "External id": 158804, "device": 5, "context": 1, "stream": 7, "correlation": 158804, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 158804, "pid": 5, "tid": 7, "ts": 1716454223908486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223851323, "dur": 8, "args": { "External id": 158804, "cbid": 51, "correlation": 158804 } }, { "ph": "s", "id": 158804, "pid": 76337, "tid": -914061504, "ts": 1716454223851323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223908489, "dur": 1237, "args": { "External id": 158805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158805, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158805, "pid": 5, "tid": 7, "ts": 1716454223908489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851332, "dur": 6, "args": { "External id": 158805, "cbid": 211, "correlation": 158805 } }, { "ph": "s", "id": 158805, "pid": 76337, "tid": -914061504, "ts": 1716454223851332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223909728, "dur": 12, "args": { "External id": 158807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158807, "pid": 5, "tid": 7, "ts": 1716454223909728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851342, "dur": 5, "args": { "External id": 158807, "cbid": 211, "correlation": 158807 } }, { "ph": "s", "id": 158807, "pid": 76337, "tid": -914061504, "ts": 1716454223851342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223909741, "dur": 14, "args": { "External id": 158813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158813, "pid": 5, "tid": 7, "ts": 1716454223909741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851370, "dur": 10, "args": { "External id": 158813, "cbid": 211, "correlation": 158813 } }, { "ph": "s", "id": 158813, "pid": 76337, "tid": -914061504, "ts": 1716454223851370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223909757, "dur": 3, "args": { "External id": 158821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158821, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 158821, "pid": 5, "tid": 7, "ts": 1716454223909757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851415, "dur": 10, "args": { "External id": 158821, "cbid": 211, "correlation": 158821 } }, { "ph": "s", "id": 158821, "pid": 76337, "tid": -914061504, "ts": 1716454223851415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223851483, "dur": 1, "args": { "External id": 158837, "cbid": 251, "correlation": 158837 } }, { "ph": "f", "id": 158837, "pid": 76337, "tid": -914061504, "ts": 1716454223851483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223851488, "dur": 0, "args": { "External id": 158839, "cbid": 251, "correlation": 158839 } }, { "ph": "f", "id": 158839, "pid": 76337, "tid": -914061504, "ts": 1716454223851488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223909762, "dur": 12, "args": { "External id": 158840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158840, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158840, "pid": 5, "tid": 7, "ts": 1716454223909762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851490, "dur": 11, "args": { "External id": 158840, "cbid": 211, "correlation": 158840 } }, { "ph": "s", "id": 158840, "pid": 76337, "tid": -914061504, "ts": 1716454223851490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223909775, "dur": 5, "args": { "External id": 158842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158842, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158842, "pid": 5, "tid": 7, "ts": 1716454223909775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851503, "dur": 6, "args": { "External id": 158842, "cbid": 211, "correlation": 158842 } }, { "ph": "s", "id": 158842, "pid": 76337, "tid": -914061504, "ts": 1716454223851503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223909782, "dur": 17, "args": { "External id": 158852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158852, "pid": 5, "tid": 7, "ts": 1716454223909782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851561, "dur": 13, "args": { "External id": 158852, "cbid": 211, "correlation": 158852 } }, { "ph": "s", "id": 158852, "pid": 76337, "tid": -914061504, "ts": 1716454223851561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223909800, "dur": 17, "args": { "External id": 158872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158872, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158872, "pid": 5, "tid": 7, "ts": 1716454223909800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851628, "dur": 11, "args": { "External id": 158872, "cbid": 211, "correlation": 158872 } }, { "ph": "s", "id": 158872, "pid": 76337, "tid": -914061504, "ts": 1716454223851628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223909818, "dur": 4, "args": { "External id": 158884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158884, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 158884, "pid": 5, "tid": 7, "ts": 1716454223909818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851649, "dur": 6, "args": { "External id": 158884, "cbid": 211, "correlation": 158884 } }, { "ph": "s", "id": 158884, "pid": 76337, "tid": -914061504, "ts": 1716454223851649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223909824, "dur": 16, "args": { "External id": 158887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158887, "pid": 5, "tid": 7, "ts": 1716454223909824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851668, "dur": 7, "args": { "External id": 158887, "cbid": 211, "correlation": 158887 } }, { "ph": "s", "id": 158887, "pid": 76337, "tid": -914061504, "ts": 1716454223851668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223909842, "dur": 11, "args": { "External id": 158896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158896, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158896, "pid": 5, "tid": 7, "ts": 1716454223909842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851708, "dur": 10, "args": { "External id": 158896, "cbid": 211, "correlation": 158896 } }, { "ph": "s", "id": 158896, "pid": 76337, "tid": -914061504, "ts": 1716454223851708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223851770, "dur": 0, "args": { "External id": 158906, "cbid": 317, "correlation": 158906 } }, { "ph": "f", "id": 158906, "pid": 76337, "tid": -914061504, "ts": 1716454223851770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223851771, "dur": 0, "args": { "External id": 158907, "cbid": 203, "correlation": 158907 } }, { "ph": "f", "id": 158907, "pid": 76337, "tid": -914061504, "ts": 1716454223851771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223851771, "dur": 0, "args": { "External id": 158908, "cbid": 205, "correlation": 158908 } }, { "ph": "f", "id": 158908, "pid": 76337, "tid": -914061504, "ts": 1716454223851771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223909854, "dur": 11, "args": { "External id": 158912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158912, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158912, "pid": 5, "tid": 7, "ts": 1716454223909854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851786, "dur": 12, "args": { "External id": 158912, "cbid": 211, "correlation": 158912 } }, { "ph": "s", "id": 158912, "pid": 76337, "tid": -914061504, "ts": 1716454223851786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223909867, "dur": 162, "args": { "External id": 158914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158914, "pid": 5, "tid": 7, "ts": 1716454223909867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851800, "dur": 5, "args": { "External id": 158914, "cbid": 211, "correlation": 158914 } }, { "ph": "s", "id": 158914, "pid": 76337, "tid": -914061504, "ts": 1716454223851800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223910031, "dur": 1, "args": { "External id": 158916, "device": 5, "context": 1, "stream": 7, "correlation": 158916, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 158916, "pid": 5, "tid": 7, "ts": 1716454223910031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223851811, "dur": 6, "args": { "External id": 158916, "cbid": 51, "correlation": 158916 } }, { "ph": "s", "id": 158916, "pid": 76337, "tid": -914061504, "ts": 1716454223851811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223910034, "dur": 646, "args": { "External id": 158917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158917, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 158917, "pid": 5, "tid": 7, "ts": 1716454223910034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851818, "dur": 6, "args": { "External id": 158917, "cbid": 211, "correlation": 158917 } }, { "ph": "s", "id": 158917, "pid": 76337, "tid": -914061504, "ts": 1716454223851818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223910682, "dur": 12, "args": { "External id": 158919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158919, "pid": 5, "tid": 7, "ts": 1716454223910682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851829, "dur": 5, "args": { "External id": 158919, "cbid": 211, "correlation": 158919 } }, { "ph": "s", "id": 158919, "pid": 76337, "tid": -914061504, "ts": 1716454223851829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223910696, "dur": 15, "args": { "External id": 158925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158925, "pid": 5, "tid": 7, "ts": 1716454223910696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851857, "dur": 9, "args": { "External id": 158925, "cbid": 211, "correlation": 158925 } }, { "ph": "s", "id": 158925, "pid": 76337, "tid": -914061504, "ts": 1716454223851857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223851916, "dur": 0, "args": { "External id": 158935, "cbid": 317, "correlation": 158935 } }, { "ph": "f", "id": 158935, "pid": 76337, "tid": -914061504, "ts": 1716454223851916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223851917, "dur": 0, "args": { "External id": 158936, "cbid": 203, "correlation": 158936 } }, { "ph": "f", "id": 158936, "pid": 76337, "tid": -914061504, "ts": 1716454223851917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223851917, "dur": 0, "args": { "External id": 158937, "cbid": 205, "correlation": 158937 } }, { "ph": "f", "id": 158937, "pid": 76337, "tid": -914061504, "ts": 1716454223851917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223910712, "dur": 21, "args": { "External id": 158941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158941, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158941, "pid": 5, "tid": 7, "ts": 1716454223910712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851932, "dur": 11, "args": { "External id": 158941, "cbid": 211, "correlation": 158941 } }, { "ph": "s", "id": 158941, "pid": 76337, "tid": -914061504, "ts": 1716454223851932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223910734, "dur": 4, "args": { "External id": 158943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 158943, "pid": 5, "tid": 7, "ts": 1716454223910734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851950, "dur": 7, "args": { "External id": 158943, "cbid": 211, "correlation": 158943 } }, { "ph": "s", "id": 158943, "pid": 76337, "tid": -914061504, "ts": 1716454223851950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223851962, "dur": 0, "args": { "External id": 158944, "cbid": 51, "correlation": 158944 } }, { "ph": "s", "id": 158944, "pid": 76337, "tid": -914061504, "ts": 1716454223851962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223910739, "dur": 174, "args": { "External id": 158945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158945, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 158945, "pid": 5, "tid": 7, "ts": 1716454223910739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223851963, "dur": 7, "args": { "External id": 158945, "cbid": 211, "correlation": 158945 } }, { "ph": "s", "id": 158945, "pid": 76337, "tid": -914061504, "ts": 1716454223851963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223910914, "dur": 16, "args": { "External id": 158950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158950, "pid": 5, "tid": 7, "ts": 1716454223910914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852000, "dur": 9, "args": { "External id": 158950, "cbid": 211, "correlation": 158950 } }, { "ph": "s", "id": 158950, "pid": 76337, "tid": -914061504, "ts": 1716454223852000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223910931, "dur": 12, "args": { "External id": 158958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158958, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158958, "pid": 5, "tid": 7, "ts": 1716454223910931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852030, "dur": 8, "args": { "External id": 158958, "cbid": 211, "correlation": 158958 } }, { "ph": "s", "id": 158958, "pid": 76337, "tid": -914061504, "ts": 1716454223852030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223910945, "dur": 11, "args": { "External id": 158966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158966, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 158966, "pid": 5, "tid": 7, "ts": 1716454223910945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852059, "dur": 9, "args": { "External id": 158966, "cbid": 211, "correlation": 158966 } }, { "ph": "s", "id": 158966, "pid": 76337, "tid": -914061504, "ts": 1716454223852059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223910957, "dur": 18, "args": { "External id": 158986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158986, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 158986, "pid": 5, "tid": 7, "ts": 1716454223910957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852144, "dur": 12, "args": { "External id": 158986, "cbid": 211, "correlation": 158986 } }, { "ph": "s", "id": 158986, "pid": 76337, "tid": -914061504, "ts": 1716454223852144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223910976, "dur": 4, "args": { "External id": 158998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 158998, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 158998, "pid": 5, "tid": 7, "ts": 1716454223910976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852166, "dur": 6, "args": { "External id": 158998, "cbid": 211, "correlation": 158998 } }, { "ph": "s", "id": 158998, "pid": 76337, "tid": -914061504, "ts": 1716454223852166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223910982, "dur": 17, "args": { "External id": 159001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159001, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159001, "pid": 5, "tid": 7, "ts": 1716454223910982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852184, "dur": 6, "args": { "External id": 159001, "cbid": 211, "correlation": 159001 } }, { "ph": "s", "id": 159001, "pid": 76337, "tid": -914061504, "ts": 1716454223852184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223852242, "dur": 0, "args": { "External id": 159012, "cbid": 317, "correlation": 159012 } }, { "ph": "f", "id": 159012, "pid": 76337, "tid": -914061504, "ts": 1716454223852242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223852243, "dur": 0, "args": { "External id": 159013, "cbid": 203, "correlation": 159013 } }, { "ph": "f", "id": 159013, "pid": 76337, "tid": -914061504, "ts": 1716454223852243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223852243, "dur": 0, "args": { "External id": 159014, "cbid": 205, "correlation": 159014 } }, { "ph": "f", "id": 159014, "pid": 76337, "tid": -914061504, "ts": 1716454223852243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223911000, "dur": 11, "args": { "External id": 159018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159018, "pid": 5, "tid": 7, "ts": 1716454223911000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852258, "dur": 12, "args": { "External id": 159018, "cbid": 211, "correlation": 159018 } }, { "ph": "s", "id": 159018, "pid": 76337, "tid": -914061504, "ts": 1716454223852258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223911012, "dur": 3, "args": { "External id": 159020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159020, "pid": 5, "tid": 7, "ts": 1716454223911012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852275, "dur": 6, "args": { "External id": 159020, "cbid": 211, "correlation": 159020 } }, { "ph": "s", "id": 159020, "pid": 76337, "tid": -914061504, "ts": 1716454223852275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223852284, "dur": 0, "args": { "External id": 159021, "cbid": 51, "correlation": 159021 } }, { "ph": "s", "id": 159021, "pid": 76337, "tid": -914061504, "ts": 1716454223852284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223911017, "dur": 90, "args": { "External id": 159022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159022, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 159022, "pid": 5, "tid": 7, "ts": 1716454223911017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852285, "dur": 5, "args": { "External id": 159022, "cbid": 211, "correlation": 159022 } }, { "ph": "s", "id": 159022, "pid": 76337, "tid": -914061504, "ts": 1716454223852285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223911109, "dur": 16, "args": { "External id": 159027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159027, "pid": 5, "tid": 7, "ts": 1716454223911109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852311, "dur": 8, "args": { "External id": 159027, "cbid": 211, "correlation": 159027 } }, { "ph": "s", "id": 159027, "pid": 76337, "tid": -914061504, "ts": 1716454223852311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223911126, "dur": 83, "args": { "External id": 159036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159036, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159036, "pid": 5, "tid": 7, "ts": 1716454223911126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852401, "dur": 16, "args": { "External id": 159036, "cbid": 211, "correlation": 159036 } }, { "ph": "s", "id": 159036, "pid": 76337, "tid": -914061504, "ts": 1716454223852401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223911210, "dur": 31, "args": { "External id": 159058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159058, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159058, "pid": 5, "tid": 7, "ts": 1716454223911210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852471, "dur": 11, "args": { "External id": 159058, "cbid": 211, "correlation": 159058 } }, { "ph": "s", "id": 159058, "pid": 76337, "tid": -914061504, "ts": 1716454223852471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223852583, "dur": 2, "args": { "External id": 159069, "cbid": 251, "correlation": 159069 } }, { "ph": "f", "id": 159069, "pid": 76337, "tid": -914061504, "ts": 1716454223852583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223911242, "dur": 161, "args": { "External id": 159070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159070, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159070, "pid": 5, "tid": 7, "ts": 1716454223911242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852591, "dur": 14, "args": { "External id": 159070, "cbid": 211, "correlation": 159070 } }, { "ph": "s", "id": 159070, "pid": 76337, "tid": -914061504, "ts": 1716454223852591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223852666, "dur": 1, "args": { "External id": 159081, "cbid": 251, "correlation": 159081 } }, { "ph": "f", "id": 159081, "pid": 76337, "tid": -914061504, "ts": 1716454223852666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223911405, "dur": 159, "args": { "External id": 159082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159082, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159082, "pid": 5, "tid": 7, "ts": 1716454223911405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852670, "dur": 12, "args": { "External id": 159082, "cbid": 211, "correlation": 159082 } }, { "ph": "s", "id": 159082, "pid": 76337, "tid": -914061504, "ts": 1716454223852670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223852738, "dur": 1, "args": { "External id": 159093, "cbid": 251, "correlation": 159093 } }, { "ph": "f", "id": 159093, "pid": 76337, "tid": -914061504, "ts": 1716454223852738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223911565, "dur": 135, "args": { "External id": 159094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159094, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159094, "pid": 5, "tid": 7, "ts": 1716454223911565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852742, "dur": 11, "args": { "External id": 159094, "cbid": 211, "correlation": 159094 } }, { "ph": "s", "id": 159094, "pid": 76337, "tid": -914061504, "ts": 1716454223852742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223911701, "dur": 334, "args": { "External id": 159119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159119, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159119, "pid": 5, "tid": 7, "ts": 1716454223911701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852848, "dur": 16, "args": { "External id": 159119, "cbid": 211, "correlation": 159119 } }, { "ph": "s", "id": 159119, "pid": 76337, "tid": -914061504, "ts": 1716454223852848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223852965, "dur": 2, "args": { "External id": 159137, "cbid": 251, "correlation": 159137 } }, { "ph": "f", "id": 159137, "pid": 76337, "tid": -914061504, "ts": 1716454223852965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223912037, "dur": 165, "args": { "External id": 159139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159139, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159139, "pid": 5, "tid": 7, "ts": 1716454223912037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223852971, "dur": 22, "args": { "External id": 159139, "cbid": 211, "correlation": 159139 } }, { "ph": "s", "id": 159139, "pid": 76337, "tid": -914061504, "ts": 1716454223852971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223912202, "dur": 19, "args": { "External id": 159147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159147, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159147, "pid": 5, "tid": 7, "ts": 1716454223912202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853051, "dur": 13, "args": { "External id": 159147, "cbid": 211, "correlation": 159147 } }, { "ph": "s", "id": 159147, "pid": 76337, "tid": -914061504, "ts": 1716454223853051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223912223, "dur": 28, "args": { "External id": 159155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159155, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159155, "pid": 5, "tid": 7, "ts": 1716454223912223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853091, "dur": 9, "args": { "External id": 159155, "cbid": 211, "correlation": 159155 } }, { "ph": "s", "id": 159155, "pid": 76337, "tid": -914061504, "ts": 1716454223853091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223912252, "dur": 18, "args": { "External id": 159166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159166, "pid": 5, "tid": 7, "ts": 1716454223912252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853179, "dur": 14, "args": { "External id": 159166, "cbid": 211, "correlation": 159166 } }, { "ph": "s", "id": 159166, "pid": 76337, "tid": -914061504, "ts": 1716454223853179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223912272, "dur": 16, "args": { "External id": 159188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159188, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159188, "pid": 5, "tid": 7, "ts": 1716454223912272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853212, "dur": 8, "args": { "External id": 159188, "cbid": 211, "correlation": 159188 } }, { "ph": "s", "id": 159188, "pid": 76337, "tid": -914061504, "ts": 1716454223853212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853298, "dur": 2, "args": { "External id": 159199, "cbid": 251, "correlation": 159199 } }, { "ph": "f", "id": 159199, "pid": 76337, "tid": -914061504, "ts": 1716454223853298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223912289, "dur": 89, "args": { "External id": 159200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159200, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159200, "pid": 5, "tid": 7, "ts": 1716454223912289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853305, "dur": 13, "args": { "External id": 159200, "cbid": 211, "correlation": 159200 } }, { "ph": "s", "id": 159200, "pid": 76337, "tid": -914061504, "ts": 1716454223853305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853381, "dur": 1, "args": { "External id": 159211, "cbid": 251, "correlation": 159211 } }, { "ph": "f", "id": 159211, "pid": 76337, "tid": -914061504, "ts": 1716454223853381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853384, "dur": 0, "args": { "External id": 159212, "cbid": 251, "correlation": 159212 } }, { "ph": "f", "id": 159212, "pid": 76337, "tid": -914061504, "ts": 1716454223853384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223912379, "dur": 12, "args": { "External id": 159213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159213, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159213, "pid": 5, "tid": 7, "ts": 1716454223912379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853386, "dur": 12, "args": { "External id": 159213, "cbid": 211, "correlation": 159213 } }, { "ph": "s", "id": 159213, "pid": 76337, "tid": -914061504, "ts": 1716454223853386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223912393, "dur": 6, "args": { "External id": 159215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159215, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159215, "pid": 5, "tid": 7, "ts": 1716454223912393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853400, "dur": 8, "args": { "External id": 159215, "cbid": 211, "correlation": 159215 } }, { "ph": "s", "id": 159215, "pid": 76337, "tid": -914061504, "ts": 1716454223853400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853460, "dur": 1, "args": { "External id": 159226, "cbid": 251, "correlation": 159226 } }, { "ph": "f", "id": 159226, "pid": 76337, "tid": -914061504, "ts": 1716454223853460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853463, "dur": 0, "args": { "External id": 159227, "cbid": 251, "correlation": 159227 } }, { "ph": "f", "id": 159227, "pid": 76337, "tid": -914061504, "ts": 1716454223853463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223912400, "dur": 9, "args": { "External id": 159228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159228, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159228, "pid": 5, "tid": 7, "ts": 1716454223912400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853464, "dur": 11, "args": { "External id": 159228, "cbid": 211, "correlation": 159228 } }, { "ph": "s", "id": 159228, "pid": 76337, "tid": -914061504, "ts": 1716454223853464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223912410, "dur": 3, "args": { "External id": 159230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159230, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159230, "pid": 5, "tid": 7, "ts": 1716454223912410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853477, "dur": 6, "args": { "External id": 159230, "cbid": 211, "correlation": 159230 } }, { "ph": "s", "id": 159230, "pid": 76337, "tid": -914061504, "ts": 1716454223853477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223912414, "dur": 55, "args": { "External id": 159255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159255, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159255, "pid": 5, "tid": 7, "ts": 1716454223912414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853557, "dur": 12, "args": { "External id": 159255, "cbid": 211, "correlation": 159255 } }, { "ph": "s", "id": 159255, "pid": 76337, "tid": -914061504, "ts": 1716454223853557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853658, "dur": 2, "args": { "External id": 159273, "cbid": 251, "correlation": 159273 } }, { "ph": "f", "id": 159273, "pid": 76337, "tid": -914061504, "ts": 1716454223853658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223912471, "dur": 90, "args": { "External id": 159275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159275, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159275, "pid": 5, "tid": 7, "ts": 1716454223912471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853665, "dur": 14, "args": { "External id": 159275, "cbid": 211, "correlation": 159275 } }, { "ph": "s", "id": 159275, "pid": 76337, "tid": -914061504, "ts": 1716454223853665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223912562, "dur": 10, "args": { "External id": 159283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159283, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159283, "pid": 5, "tid": 7, "ts": 1716454223912562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853735, "dur": 12, "args": { "External id": 159283, "cbid": 211, "correlation": 159283 } }, { "ph": "s", "id": 159283, "pid": 76337, "tid": -914061504, "ts": 1716454223853735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223912573, "dur": 21, "args": { "External id": 159291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159291, "pid": 5, "tid": 7, "ts": 1716454223912573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853777, "dur": 9, "args": { "External id": 159291, "cbid": 211, "correlation": 159291 } }, { "ph": "s", "id": 159291, "pid": 76337, "tid": -914061504, "ts": 1716454223853777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223912595, "dur": 17, "args": { "External id": 159313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159313, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159313, "pid": 5, "tid": 7, "ts": 1716454223912595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853829, "dur": 10, "args": { "External id": 159313, "cbid": 211, "correlation": 159313 } }, { "ph": "s", "id": 159313, "pid": 76337, "tid": -914061504, "ts": 1716454223853829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853926, "dur": 1, "args": { "External id": 159329, "cbid": 251, "correlation": 159329 } }, { "ph": "f", "id": 159329, "pid": 76337, "tid": -914061504, "ts": 1716454223853926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223853931, "dur": 0, "args": { "External id": 159331, "cbid": 251, "correlation": 159331 } }, { "ph": "f", "id": 159331, "pid": 76337, "tid": -914061504, "ts": 1716454223853931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223912614, "dur": 493, "args": { "External id": 159332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159332, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159332, "pid": 5, "tid": 7, "ts": 1716454223912614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223853934, "dur": 15, "args": { "External id": 159332, "cbid": 211, "correlation": 159332 } }, { "ph": "s", "id": 159332, "pid": 76337, "tid": -914061504, "ts": 1716454223853934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223913108, "dur": 65, "args": { "External id": 159340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159340, "pid": 5, "tid": 7, "ts": 1716454223913108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854023, "dur": 13, "args": { "External id": 159340, "cbid": 211, "correlation": 159340 } }, { "ph": "s", "id": 159340, "pid": 76337, "tid": -914061504, "ts": 1716454223854023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223913174, "dur": 67, "args": { "External id": 159348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159348, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159348, "pid": 5, "tid": 7, "ts": 1716454223913174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854061, "dur": 9, "args": { "External id": 159348, "cbid": 211, "correlation": 159348 } }, { "ph": "s", "id": 159348, "pid": 76337, "tid": -914061504, "ts": 1716454223854061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223854144, "dur": 1, "args": { "External id": 159364, "cbid": 251, "correlation": 159364 } }, { "ph": "f", "id": 159364, "pid": 76337, "tid": -914061504, "ts": 1716454223854144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223913244, "dur": 1, "args": { "External id": 159366, "device": 5, "context": 1, "stream": 7, "correlation": 159366, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 159366, "pid": 5, "tid": 7, "ts": 1716454223913244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223854149, "dur": 11, "args": { "External id": 159366, "cbid": 51, "correlation": 159366 } }, { "ph": "s", "id": 159366, "pid": 76337, "tid": -914061504, "ts": 1716454223854149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223913247, "dur": 273, "args": { "External id": 159367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159367, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159367, "pid": 5, "tid": 7, "ts": 1716454223913247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854162, "dur": 11, "args": { "External id": 159367, "cbid": 211, "correlation": 159367 } }, { "ph": "s", "id": 159367, "pid": 76337, "tid": -914061504, "ts": 1716454223854162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223913522, "dur": 13, "args": { "External id": 159375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159375, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159375, "pid": 5, "tid": 7, "ts": 1716454223913522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854205, "dur": 10, "args": { "External id": 159375, "cbid": 211, "correlation": 159375 } }, { "ph": "s", "id": 159375, "pid": 76337, "tid": -914061504, "ts": 1716454223854205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223913537, "dur": 37, "args": { "External id": 159386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159386, "pid": 5, "tid": 7, "ts": 1716454223913537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854277, "dur": 12, "args": { "External id": 159386, "cbid": 211, "correlation": 159386 } }, { "ph": "s", "id": 159386, "pid": 76337, "tid": -914061504, "ts": 1716454223854277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223854341, "dur": 0, "args": { "External id": 159398, "cbid": 317, "correlation": 159398 } }, { "ph": "f", "id": 159398, "pid": 76337, "tid": -914061504, "ts": 1716454223854341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223854343, "dur": 0, "args": { "External id": 159399, "cbid": 203, "correlation": 159399 } }, { "ph": "f", "id": 159399, "pid": 76337, "tid": -914061504, "ts": 1716454223854343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223854343, "dur": 0, "args": { "External id": 159400, "cbid": 205, "correlation": 159400 } }, { "ph": "f", "id": 159400, "pid": 76337, "tid": -914061504, "ts": 1716454223854343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223913575, "dur": 13, "args": { "External id": 159404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159404, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159404, "pid": 5, "tid": 7, "ts": 1716454223913575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854360, "dur": 12, "args": { "External id": 159404, "cbid": 211, "correlation": 159404 } }, { "ph": "s", "id": 159404, "pid": 76337, "tid": -914061504, "ts": 1716454223854360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223913590, "dur": 4, "args": { "External id": 159406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159406, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159406, "pid": 5, "tid": 7, "ts": 1716454223913590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854376, "dur": 6, "args": { "External id": 159406, "cbid": 211, "correlation": 159406 } }, { "ph": "s", "id": 159406, "pid": 76337, "tid": -914061504, "ts": 1716454223854376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223854385, "dur": 0, "args": { "External id": 159407, "cbid": 51, "correlation": 159407 } }, { "ph": "s", "id": 159407, "pid": 76337, "tid": -914061504, "ts": 1716454223854385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223913595, "dur": 97, "args": { "External id": 159408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159408, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 159408, "pid": 5, "tid": 7, "ts": 1716454223913595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854385, "dur": 5, "args": { "External id": 159408, "cbid": 211, "correlation": 159408 } }, { "ph": "s", "id": 159408, "pid": 76337, "tid": -914061504, "ts": 1716454223854385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223913694, "dur": 16, "args": { "External id": 159413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159413, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159413, "pid": 5, "tid": 7, "ts": 1716454223913694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854413, "dur": 9, "args": { "External id": 159413, "cbid": 211, "correlation": 159413 } }, { "ph": "s", "id": 159413, "pid": 76337, "tid": -914061504, "ts": 1716454223854413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223913712, "dur": 12, "args": { "External id": 159421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159421, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159421, "pid": 5, "tid": 7, "ts": 1716454223913712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854445, "dur": 9, "args": { "External id": 159421, "cbid": 211, "correlation": 159421 } }, { "ph": "s", "id": 159421, "pid": 76337, "tid": -914061504, "ts": 1716454223854445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223913725, "dur": 29, "args": { "External id": 159430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159430, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159430, "pid": 5, "tid": 7, "ts": 1716454223913725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854485, "dur": 10, "args": { "External id": 159430, "cbid": 211, "correlation": 159430 } }, { "ph": "s", "id": 159430, "pid": 76337, "tid": -914061504, "ts": 1716454223854485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223913755, "dur": 31, "args": { "External id": 159450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159450, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 159450, "pid": 5, "tid": 7, "ts": 1716454223913755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854559, "dur": 12, "args": { "External id": 159450, "cbid": 211, "correlation": 159450 } }, { "ph": "s", "id": 159450, "pid": 76337, "tid": -914061504, "ts": 1716454223854559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223913787, "dur": 6, "args": { "External id": 159462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159462, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159462, "pid": 5, "tid": 7, "ts": 1716454223913787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854581, "dur": 6, "args": { "External id": 159462, "cbid": 211, "correlation": 159462 } }, { "ph": "s", "id": 159462, "pid": 76337, "tid": -914061504, "ts": 1716454223854581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223913795, "dur": 31, "args": { "External id": 159465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159465, "pid": 5, "tid": 7, "ts": 1716454223913795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854599, "dur": 6, "args": { "External id": 159465, "cbid": 211, "correlation": 159465 } }, { "ph": "s", "id": 159465, "pid": 76337, "tid": -914061504, "ts": 1716454223854599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223913826, "dur": 21, "args": { "External id": 159474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159474, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159474, "pid": 5, "tid": 7, "ts": 1716454223913826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854638, "dur": 10, "args": { "External id": 159474, "cbid": 211, "correlation": 159474 } }, { "ph": "s", "id": 159474, "pid": 76337, "tid": -914061504, "ts": 1716454223854638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223854689, "dur": 0, "args": { "External id": 159484, "cbid": 317, "correlation": 159484 } }, { "ph": "f", "id": 159484, "pid": 76337, "tid": -914061504, "ts": 1716454223854689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223854690, "dur": 0, "args": { "External id": 159485, "cbid": 203, "correlation": 159485 } }, { "ph": "f", "id": 159485, "pid": 76337, "tid": -914061504, "ts": 1716454223854690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223854691, "dur": 0, "args": { "External id": 159486, "cbid": 205, "correlation": 159486 } }, { "ph": "f", "id": 159486, "pid": 76337, "tid": -914061504, "ts": 1716454223854691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223913849, "dur": 21, "args": { "External id": 159490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159490, "pid": 5, "tid": 7, "ts": 1716454223913849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854705, "dur": 11, "args": { "External id": 159490, "cbid": 211, "correlation": 159490 } }, { "ph": "s", "id": 159490, "pid": 76337, "tid": -914061504, "ts": 1716454223854705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223913872, "dur": 319, "args": { "External id": 159492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159492, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159492, "pid": 5, "tid": 7, "ts": 1716454223913872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854719, "dur": 5, "args": { "External id": 159492, "cbid": 211, "correlation": 159492 } }, { "ph": "s", "id": 159492, "pid": 76337, "tid": -914061504, "ts": 1716454223854719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223914193, "dur": 1, "args": { "External id": 159494, "device": 5, "context": 1, "stream": 7, "correlation": 159494, "bytes": 960, "memory bandwidth (GB/s)": 0.5885959534028203 } }, { "ph": "f", "id": 159494, "pid": 5, "tid": 7, "ts": 1716454223914193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223854730, "dur": 7, "args": { "External id": 159494, "cbid": 51, "correlation": 159494 } }, { "ph": "s", "id": 159494, "pid": 76337, "tid": -914061504, "ts": 1716454223854730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223914197, "dur": 1254, "args": { "External id": 159495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159495, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159495, "pid": 5, "tid": 7, "ts": 1716454223914197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854738, "dur": 7, "args": { "External id": 159495, "cbid": 211, "correlation": 159495 } }, { "ph": "s", "id": 159495, "pid": 76337, "tid": -914061504, "ts": 1716454223854738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223915452, "dur": 13, "args": { "External id": 159497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159497, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159497, "pid": 5, "tid": 7, "ts": 1716454223915452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854750, "dur": 5, "args": { "External id": 159497, "cbid": 211, "correlation": 159497 } }, { "ph": "s", "id": 159497, "pid": 76337, "tid": -914061504, "ts": 1716454223854750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223915466, "dur": 15, "args": { "External id": 159503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159503, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159503, "pid": 5, "tid": 7, "ts": 1716454223915466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854777, "dur": 8, "args": { "External id": 159503, "cbid": 211, "correlation": 159503 } }, { "ph": "s", "id": 159503, "pid": 76337, "tid": -914061504, "ts": 1716454223854777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223915482, "dur": 4, "args": { "External id": 159511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159511, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 159511, "pid": 5, "tid": 7, "ts": 1716454223915482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854821, "dur": 9, "args": { "External id": 159511, "cbid": 211, "correlation": 159511 } }, { "ph": "s", "id": 159511, "pid": 76337, "tid": -914061504, "ts": 1716454223854821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223854887, "dur": 1, "args": { "External id": 159527, "cbid": 251, "correlation": 159527 } }, { "ph": "f", "id": 159527, "pid": 76337, "tid": -914061504, "ts": 1716454223854887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223854893, "dur": 0, "args": { "External id": 159529, "cbid": 251, "correlation": 159529 } }, { "ph": "f", "id": 159529, "pid": 76337, "tid": -914061504, "ts": 1716454223854893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223915487, "dur": 13, "args": { "External id": 159530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159530, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159530, "pid": 5, "tid": 7, "ts": 1716454223915487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854895, "dur": 12, "args": { "External id": 159530, "cbid": 211, "correlation": 159530 } }, { "ph": "s", "id": 159530, "pid": 76337, "tid": -914061504, "ts": 1716454223854895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223915501, "dur": 5, "args": { "External id": 159532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159532, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159532, "pid": 5, "tid": 7, "ts": 1716454223915501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854908, "dur": 5, "args": { "External id": 159532, "cbid": 211, "correlation": 159532 } }, { "ph": "s", "id": 159532, "pid": 76337, "tid": -914061504, "ts": 1716454223854908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223915508, "dur": 17, "args": { "External id": 159542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159542, "pid": 5, "tid": 7, "ts": 1716454223915508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223854966, "dur": 21, "args": { "External id": 159542, "cbid": 211, "correlation": 159542 } }, { "ph": "s", "id": 159542, "pid": 76337, "tid": -914061504, "ts": 1716454223854966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223915526, "dur": 17, "args": { "External id": 159562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159562, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 159562, "pid": 5, "tid": 7, "ts": 1716454223915526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855041, "dur": 12, "args": { "External id": 159562, "cbid": 211, "correlation": 159562 } }, { "ph": "s", "id": 159562, "pid": 76337, "tid": -914061504, "ts": 1716454223855041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223915544, "dur": 4, "args": { "External id": 159574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159574, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 159574, "pid": 5, "tid": 7, "ts": 1716454223915544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855064, "dur": 6, "args": { "External id": 159574, "cbid": 211, "correlation": 159574 } }, { "ph": "s", "id": 159574, "pid": 76337, "tid": -914061504, "ts": 1716454223855064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223915550, "dur": 16, "args": { "External id": 159577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159577, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159577, "pid": 5, "tid": 7, "ts": 1716454223915550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855082, "dur": 6, "args": { "External id": 159577, "cbid": 211, "correlation": 159577 } }, { "ph": "s", "id": 159577, "pid": 76337, "tid": -914061504, "ts": 1716454223855082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223915568, "dur": 11, "args": { "External id": 159586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159586, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159586, "pid": 5, "tid": 7, "ts": 1716454223915568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855123, "dur": 9, "args": { "External id": 159586, "cbid": 211, "correlation": 159586 } }, { "ph": "s", "id": 159586, "pid": 76337, "tid": -914061504, "ts": 1716454223855123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223855184, "dur": 0, "args": { "External id": 159596, "cbid": 317, "correlation": 159596 } }, { "ph": "f", "id": 159596, "pid": 76337, "tid": -914061504, "ts": 1716454223855184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223855185, "dur": 0, "args": { "External id": 159597, "cbid": 203, "correlation": 159597 } }, { "ph": "f", "id": 159597, "pid": 76337, "tid": -914061504, "ts": 1716454223855185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223855186, "dur": 0, "args": { "External id": 159598, "cbid": 205, "correlation": 159598 } }, { "ph": "f", "id": 159598, "pid": 76337, "tid": -914061504, "ts": 1716454223855186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223915581, "dur": 12, "args": { "External id": 159602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159602, "pid": 5, "tid": 7, "ts": 1716454223915581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855200, "dur": 12, "args": { "External id": 159602, "cbid": 211, "correlation": 159602 } }, { "ph": "s", "id": 159602, "pid": 76337, "tid": -914061504, "ts": 1716454223855200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223915594, "dur": 162, "args": { "External id": 159604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159604, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159604, "pid": 5, "tid": 7, "ts": 1716454223915594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855214, "dur": 5, "args": { "External id": 159604, "cbid": 211, "correlation": 159604 } }, { "ph": "s", "id": 159604, "pid": 76337, "tid": -914061504, "ts": 1716454223855214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223915758, "dur": 1, "args": { "External id": 159606, "device": 5, "context": 1, "stream": 7, "correlation": 159606, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 159606, "pid": 5, "tid": 7, "ts": 1716454223915758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223855226, "dur": 6, "args": { "External id": 159606, "cbid": 51, "correlation": 159606 } }, { "ph": "s", "id": 159606, "pid": 76337, "tid": -914061504, "ts": 1716454223855226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223915762, "dur": 648, "args": { "External id": 159607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159607, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159607, "pid": 5, "tid": 7, "ts": 1716454223915762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855233, "dur": 6, "args": { "External id": 159607, "cbid": 211, "correlation": 159607 } }, { "ph": "s", "id": 159607, "pid": 76337, "tid": -914061504, "ts": 1716454223855233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223916411, "dur": 12, "args": { "External id": 159609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159609, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159609, "pid": 5, "tid": 7, "ts": 1716454223916411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855244, "dur": 6, "args": { "External id": 159609, "cbid": 211, "correlation": 159609 } }, { "ph": "s", "id": 159609, "pid": 76337, "tid": -914061504, "ts": 1716454223855244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223916424, "dur": 15, "args": { "External id": 159615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159615, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159615, "pid": 5, "tid": 7, "ts": 1716454223916424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855273, "dur": 9, "args": { "External id": 159615, "cbid": 211, "correlation": 159615 } }, { "ph": "s", "id": 159615, "pid": 76337, "tid": -914061504, "ts": 1716454223855273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223855331, "dur": 0, "args": { "External id": 159625, "cbid": 317, "correlation": 159625 } }, { "ph": "f", "id": 159625, "pid": 76337, "tid": -914061504, "ts": 1716454223855331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223855332, "dur": 0, "args": { "External id": 159626, "cbid": 203, "correlation": 159626 } }, { "ph": "f", "id": 159626, "pid": 76337, "tid": -914061504, "ts": 1716454223855332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223855332, "dur": 0, "args": { "External id": 159627, "cbid": 205, "correlation": 159627 } }, { "ph": "f", "id": 159627, "pid": 76337, "tid": -914061504, "ts": 1716454223855332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223916441, "dur": 21, "args": { "External id": 159631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159631, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159631, "pid": 5, "tid": 7, "ts": 1716454223916441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855346, "dur": 11, "args": { "External id": 159631, "cbid": 211, "correlation": 159631 } }, { "ph": "s", "id": 159631, "pid": 76337, "tid": -914061504, "ts": 1716454223855346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223916463, "dur": 4, "args": { "External id": 159633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159633, "pid": 5, "tid": 7, "ts": 1716454223916463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855361, "dur": 5, "args": { "External id": 159633, "cbid": 211, "correlation": 159633 } }, { "ph": "s", "id": 159633, "pid": 76337, "tid": -914061504, "ts": 1716454223855361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223855369, "dur": 0, "args": { "External id": 159634, "cbid": 51, "correlation": 159634 } }, { "ph": "s", "id": 159634, "pid": 76337, "tid": -914061504, "ts": 1716454223855369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223916468, "dur": 171, "args": { "External id": 159635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159635, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 159635, "pid": 5, "tid": 7, "ts": 1716454223916468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855370, "dur": 5, "args": { "External id": 159635, "cbid": 211, "correlation": 159635 } }, { "ph": "s", "id": 159635, "pid": 76337, "tid": -914061504, "ts": 1716454223855370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223916640, "dur": 16, "args": { "External id": 159640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159640, "pid": 5, "tid": 7, "ts": 1716454223916640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855396, "dur": 8, "args": { "External id": 159640, "cbid": 211, "correlation": 159640 } }, { "ph": "s", "id": 159640, "pid": 76337, "tid": -914061504, "ts": 1716454223855396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223916658, "dur": 12, "args": { "External id": 159648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159648, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159648, "pid": 5, "tid": 7, "ts": 1716454223916658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855424, "dur": 8, "args": { "External id": 159648, "cbid": 211, "correlation": 159648 } }, { "ph": "s", "id": 159648, "pid": 76337, "tid": -914061504, "ts": 1716454223855424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223916671, "dur": 10, "args": { "External id": 159656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159656, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159656, "pid": 5, "tid": 7, "ts": 1716454223916671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855454, "dur": 8, "args": { "External id": 159656, "cbid": 211, "correlation": 159656 } }, { "ph": "s", "id": 159656, "pid": 76337, "tid": -914061504, "ts": 1716454223855454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223916682, "dur": 18, "args": { "External id": 159676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159676, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 159676, "pid": 5, "tid": 7, "ts": 1716454223916682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855534, "dur": 13, "args": { "External id": 159676, "cbid": 211, "correlation": 159676 } }, { "ph": "s", "id": 159676, "pid": 76337, "tid": -914061504, "ts": 1716454223855534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223916702, "dur": 5, "args": { "External id": 159688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159688, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 159688, "pid": 5, "tid": 7, "ts": 1716454223916702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855557, "dur": 7, "args": { "External id": 159688, "cbid": 211, "correlation": 159688 } }, { "ph": "s", "id": 159688, "pid": 76337, "tid": -914061504, "ts": 1716454223855557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223916707, "dur": 17, "args": { "External id": 159691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159691, "pid": 5, "tid": 7, "ts": 1716454223916707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855575, "dur": 6, "args": { "External id": 159691, "cbid": 211, "correlation": 159691 } }, { "ph": "s", "id": 159691, "pid": 76337, "tid": -914061504, "ts": 1716454223855575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223855633, "dur": 0, "args": { "External id": 159702, "cbid": 317, "correlation": 159702 } }, { "ph": "f", "id": 159702, "pid": 76337, "tid": -914061504, "ts": 1716454223855633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223855634, "dur": 0, "args": { "External id": 159703, "cbid": 203, "correlation": 159703 } }, { "ph": "f", "id": 159703, "pid": 76337, "tid": -914061504, "ts": 1716454223855634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223855635, "dur": 0, "args": { "External id": 159704, "cbid": 205, "correlation": 159704 } }, { "ph": "f", "id": 159704, "pid": 76337, "tid": -914061504, "ts": 1716454223855635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223916726, "dur": 11, "args": { "External id": 159708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159708, "pid": 5, "tid": 7, "ts": 1716454223916726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855648, "dur": 11, "args": { "External id": 159708, "cbid": 211, "correlation": 159708 } }, { "ph": "s", "id": 159708, "pid": 76337, "tid": -914061504, "ts": 1716454223855648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223916738, "dur": 3, "args": { "External id": 159710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159710, "pid": 5, "tid": 7, "ts": 1716454223916738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855664, "dur": 5, "args": { "External id": 159710, "cbid": 211, "correlation": 159710 } }, { "ph": "s", "id": 159710, "pid": 76337, "tid": -914061504, "ts": 1716454223855664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223855672, "dur": 0, "args": { "External id": 159711, "cbid": 51, "correlation": 159711 } }, { "ph": "s", "id": 159711, "pid": 76337, "tid": -914061504, "ts": 1716454223855672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223916742, "dur": 91, "args": { "External id": 159712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159712, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 159712, "pid": 5, "tid": 7, "ts": 1716454223916742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855673, "dur": 5, "args": { "External id": 159712, "cbid": 211, "correlation": 159712 } }, { "ph": "s", "id": 159712, "pid": 76337, "tid": -914061504, "ts": 1716454223855673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223916835, "dur": 16, "args": { "External id": 159717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159717, "pid": 5, "tid": 7, "ts": 1716454223916835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855700, "dur": 8, "args": { "External id": 159717, "cbid": 211, "correlation": 159717 } }, { "ph": "s", "id": 159717, "pid": 76337, "tid": -914061504, "ts": 1716454223855700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223916852, "dur": 84, "args": { "External id": 159726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159726, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159726, "pid": 5, "tid": 7, "ts": 1716454223916852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855780, "dur": 16, "args": { "External id": 159726, "cbid": 211, "correlation": 159726 } }, { "ph": "s", "id": 159726, "pid": 76337, "tid": -914061504, "ts": 1716454223855780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223916938, "dur": 30, "args": { "External id": 159748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159748, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159748, "pid": 5, "tid": 7, "ts": 1716454223916938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855839, "dur": 11, "args": { "External id": 159748, "cbid": 211, "correlation": 159748 } }, { "ph": "s", "id": 159748, "pid": 76337, "tid": -914061504, "ts": 1716454223855839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223855927, "dur": 2, "args": { "External id": 159759, "cbid": 251, "correlation": 159759 } }, { "ph": "f", "id": 159759, "pid": 76337, "tid": -914061504, "ts": 1716454223855927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223916969, "dur": 164, "args": { "External id": 159760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159760, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159760, "pid": 5, "tid": 7, "ts": 1716454223916969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223855932, "dur": 14, "args": { "External id": 159760, "cbid": 211, "correlation": 159760 } }, { "ph": "s", "id": 159760, "pid": 76337, "tid": -914061504, "ts": 1716454223855932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856013, "dur": 1, "args": { "External id": 159771, "cbid": 251, "correlation": 159771 } }, { "ph": "f", "id": 159771, "pid": 76337, "tid": -914061504, "ts": 1716454223856013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223917135, "dur": 160, "args": { "External id": 159772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159772, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159772, "pid": 5, "tid": 7, "ts": 1716454223917135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856017, "dur": 12, "args": { "External id": 159772, "cbid": 211, "correlation": 159772 } }, { "ph": "s", "id": 159772, "pid": 76337, "tid": -914061504, "ts": 1716454223856017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856085, "dur": 1, "args": { "External id": 159783, "cbid": 251, "correlation": 159783 } }, { "ph": "f", "id": 159783, "pid": 76337, "tid": -914061504, "ts": 1716454223856085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223917296, "dur": 160, "args": { "External id": 159784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159784, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159784, "pid": 5, "tid": 7, "ts": 1716454223917296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856089, "dur": 12, "args": { "External id": 159784, "cbid": 211, "correlation": 159784 } }, { "ph": "s", "id": 159784, "pid": 76337, "tid": -914061504, "ts": 1716454223856089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223917458, "dur": 336, "args": { "External id": 159809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159809, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159809, "pid": 5, "tid": 7, "ts": 1716454223917458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856174, "dur": 13, "args": { "External id": 159809, "cbid": 211, "correlation": 159809 } }, { "ph": "s", "id": 159809, "pid": 76337, "tid": -914061504, "ts": 1716454223856174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856275, "dur": 1, "args": { "External id": 159827, "cbid": 251, "correlation": 159827 } }, { "ph": "f", "id": 159827, "pid": 76337, "tid": -914061504, "ts": 1716454223856275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223917795, "dur": 168, "args": { "External id": 159829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159829, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159829, "pid": 5, "tid": 7, "ts": 1716454223917795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856281, "dur": 14, "args": { "External id": 159829, "cbid": 211, "correlation": 159829 } }, { "ph": "s", "id": 159829, "pid": 76337, "tid": -914061504, "ts": 1716454223856281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223917964, "dur": 19, "args": { "External id": 159837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159837, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159837, "pid": 5, "tid": 7, "ts": 1716454223917964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856351, "dur": 12, "args": { "External id": 159837, "cbid": 211, "correlation": 159837 } }, { "ph": "s", "id": 159837, "pid": 76337, "tid": -914061504, "ts": 1716454223856351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223917985, "dur": 27, "args": { "External id": 159845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159845, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159845, "pid": 5, "tid": 7, "ts": 1716454223917985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856390, "dur": 9, "args": { "External id": 159845, "cbid": 211, "correlation": 159845 } }, { "ph": "s", "id": 159845, "pid": 76337, "tid": -914061504, "ts": 1716454223856390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223918013, "dur": 19, "args": { "External id": 159856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159856, "pid": 5, "tid": 7, "ts": 1716454223918013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856462, "dur": 12, "args": { "External id": 159856, "cbid": 211, "correlation": 159856 } }, { "ph": "s", "id": 159856, "pid": 76337, "tid": -914061504, "ts": 1716454223856462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223918033, "dur": 16, "args": { "External id": 159878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159878, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159878, "pid": 5, "tid": 7, "ts": 1716454223918033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856493, "dur": 8, "args": { "External id": 159878, "cbid": 211, "correlation": 159878 } }, { "ph": "s", "id": 159878, "pid": 76337, "tid": -914061504, "ts": 1716454223856493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856579, "dur": 1, "args": { "External id": 159889, "cbid": 251, "correlation": 159889 } }, { "ph": "f", "id": 159889, "pid": 76337, "tid": -914061504, "ts": 1716454223856579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223918051, "dur": 89, "args": { "External id": 159890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159890, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159890, "pid": 5, "tid": 7, "ts": 1716454223918051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856584, "dur": 14, "args": { "External id": 159890, "cbid": 211, "correlation": 159890 } }, { "ph": "s", "id": 159890, "pid": 76337, "tid": -914061504, "ts": 1716454223856584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856653, "dur": 1, "args": { "External id": 159901, "cbid": 251, "correlation": 159901 } }, { "ph": "f", "id": 159901, "pid": 76337, "tid": -914061504, "ts": 1716454223856653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856657, "dur": 0, "args": { "External id": 159902, "cbid": 251, "correlation": 159902 } }, { "ph": "f", "id": 159902, "pid": 76337, "tid": -914061504, "ts": 1716454223856657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223918141, "dur": 13, "args": { "External id": 159903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159903, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159903, "pid": 5, "tid": 7, "ts": 1716454223918141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856659, "dur": 12, "args": { "External id": 159903, "cbid": 211, "correlation": 159903 } }, { "ph": "s", "id": 159903, "pid": 76337, "tid": -914061504, "ts": 1716454223856659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223918155, "dur": 5, "args": { "External id": 159905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159905, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159905, "pid": 5, "tid": 7, "ts": 1716454223918155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856672, "dur": 6, "args": { "External id": 159905, "cbid": 211, "correlation": 159905 } }, { "ph": "s", "id": 159905, "pid": 76337, "tid": -914061504, "ts": 1716454223856672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856730, "dur": 1, "args": { "External id": 159916, "cbid": 251, "correlation": 159916 } }, { "ph": "f", "id": 159916, "pid": 76337, "tid": -914061504, "ts": 1716454223856730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856733, "dur": 0, "args": { "External id": 159917, "cbid": 251, "correlation": 159917 } }, { "ph": "f", "id": 159917, "pid": 76337, "tid": -914061504, "ts": 1716454223856733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223918162, "dur": 8, "args": { "External id": 159918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159918, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159918, "pid": 5, "tid": 7, "ts": 1716454223918162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856734, "dur": 13, "args": { "External id": 159918, "cbid": 211, "correlation": 159918 } }, { "ph": "s", "id": 159918, "pid": 76337, "tid": -914061504, "ts": 1716454223856734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223918171, "dur": 3, "args": { "External id": 159920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159920, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159920, "pid": 5, "tid": 7, "ts": 1716454223918171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856749, "dur": 6, "args": { "External id": 159920, "cbid": 211, "correlation": 159920 } }, { "ph": "s", "id": 159920, "pid": 76337, "tid": -914061504, "ts": 1716454223856749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223918176, "dur": 56, "args": { "External id": 159945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159945, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 159945, "pid": 5, "tid": 7, "ts": 1716454223918176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856825, "dur": 12, "args": { "External id": 159945, "cbid": 211, "correlation": 159945 } }, { "ph": "s", "id": 159945, "pid": 76337, "tid": -914061504, "ts": 1716454223856825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223856923, "dur": 1, "args": { "External id": 159963, "cbid": 251, "correlation": 159963 } }, { "ph": "f", "id": 159963, "pid": 76337, "tid": -914061504, "ts": 1716454223856923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223918233, "dur": 91, "args": { "External id": 159965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159965, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 159965, "pid": 5, "tid": 7, "ts": 1716454223918233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223856929, "dur": 14, "args": { "External id": 159965, "cbid": 211, "correlation": 159965 } }, { "ph": "s", "id": 159965, "pid": 76337, "tid": -914061504, "ts": 1716454223856929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223918325, "dur": 9, "args": { "External id": 159973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159973, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159973, "pid": 5, "tid": 7, "ts": 1716454223918325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857009, "dur": 12, "args": { "External id": 159973, "cbid": 211, "correlation": 159973 } }, { "ph": "s", "id": 159973, "pid": 76337, "tid": -914061504, "ts": 1716454223857009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223918336, "dur": 20, "args": { "External id": 159981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 159981, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 159981, "pid": 5, "tid": 7, "ts": 1716454223918336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857052, "dur": 9, "args": { "External id": 159981, "cbid": 211, "correlation": 159981 } }, { "ph": "s", "id": 159981, "pid": 76337, "tid": -914061504, "ts": 1716454223857052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223918358, "dur": 17, "args": { "External id": 160003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160003, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160003, "pid": 5, "tid": 7, "ts": 1716454223918358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857104, "dur": 10, "args": { "External id": 160003, "cbid": 211, "correlation": 160003 } }, { "ph": "s", "id": 160003, "pid": 76337, "tid": -914061504, "ts": 1716454223857104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223857191, "dur": 1, "args": { "External id": 160019, "cbid": 251, "correlation": 160019 } }, { "ph": "f", "id": 160019, "pid": 76337, "tid": -914061504, "ts": 1716454223857191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223857197, "dur": 0, "args": { "External id": 160021, "cbid": 251, "correlation": 160021 } }, { "ph": "f", "id": 160021, "pid": 76337, "tid": -914061504, "ts": 1716454223857197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223918376, "dur": 498, "args": { "External id": 160022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160022, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160022, "pid": 5, "tid": 7, "ts": 1716454223918376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857199, "dur": 13, "args": { "External id": 160022, "cbid": 211, "correlation": 160022 } }, { "ph": "s", "id": 160022, "pid": 76337, "tid": -914061504, "ts": 1716454223857199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223918876, "dur": 67, "args": { "External id": 160030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160030, "pid": 5, "tid": 7, "ts": 1716454223918876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857266, "dur": 12, "args": { "External id": 160030, "cbid": 211, "correlation": 160030 } }, { "ph": "s", "id": 160030, "pid": 76337, "tid": -914061504, "ts": 1716454223857266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223918944, "dur": 66, "args": { "External id": 160038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160038, "pid": 5, "tid": 7, "ts": 1716454223918944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857297, "dur": 9, "args": { "External id": 160038, "cbid": 211, "correlation": 160038 } }, { "ph": "s", "id": 160038, "pid": 76337, "tid": -914061504, "ts": 1716454223857297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223857376, "dur": 1, "args": { "External id": 160054, "cbid": 251, "correlation": 160054 } }, { "ph": "f", "id": 160054, "pid": 76337, "tid": -914061504, "ts": 1716454223857376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223919013, "dur": 1, "args": { "External id": 160056, "device": 5, "context": 1, "stream": 7, "correlation": 160056, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 160056, "pid": 5, "tid": 7, "ts": 1716454223919013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223857381, "dur": 9, "args": { "External id": 160056, "cbid": 51, "correlation": 160056 } }, { "ph": "s", "id": 160056, "pid": 76337, "tid": -914061504, "ts": 1716454223857381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223919016, "dur": 271, "args": { "External id": 160057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160057, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160057, "pid": 5, "tid": 7, "ts": 1716454223919016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857392, "dur": 11, "args": { "External id": 160057, "cbid": 211, "correlation": 160057 } }, { "ph": "s", "id": 160057, "pid": 76337, "tid": -914061504, "ts": 1716454223857392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223919289, "dur": 14, "args": { "External id": 160065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160065, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160065, "pid": 5, "tid": 7, "ts": 1716454223919289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857434, "dur": 11, "args": { "External id": 160065, "cbid": 211, "correlation": 160065 } }, { "ph": "s", "id": 160065, "pid": 76337, "tid": -914061504, "ts": 1716454223857434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223919303, "dur": 38, "args": { "External id": 160076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160076, "pid": 5, "tid": 7, "ts": 1716454223919303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857503, "dur": 12, "args": { "External id": 160076, "cbid": 211, "correlation": 160076 } }, { "ph": "s", "id": 160076, "pid": 76337, "tid": -914061504, "ts": 1716454223857503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223857568, "dur": 0, "args": { "External id": 160088, "cbid": 317, "correlation": 160088 } }, { "ph": "f", "id": 160088, "pid": 76337, "tid": -914061504, "ts": 1716454223857568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223857568, "dur": 0, "args": { "External id": 160089, "cbid": 203, "correlation": 160089 } }, { "ph": "f", "id": 160089, "pid": 76337, "tid": -914061504, "ts": 1716454223857568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223857569, "dur": 0, "args": { "External id": 160090, "cbid": 205, "correlation": 160090 } }, { "ph": "f", "id": 160090, "pid": 76337, "tid": -914061504, "ts": 1716454223857569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223919342, "dur": 13, "args": { "External id": 160094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160094, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160094, "pid": 5, "tid": 7, "ts": 1716454223919342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857584, "dur": 13, "args": { "External id": 160094, "cbid": 211, "correlation": 160094 } }, { "ph": "s", "id": 160094, "pid": 76337, "tid": -914061504, "ts": 1716454223857584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223919357, "dur": 4, "args": { "External id": 160096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160096, "pid": 5, "tid": 7, "ts": 1716454223919357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857601, "dur": 6, "args": { "External id": 160096, "cbid": 211, "correlation": 160096 } }, { "ph": "s", "id": 160096, "pid": 76337, "tid": -914061504, "ts": 1716454223857601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223857610, "dur": 0, "args": { "External id": 160097, "cbid": 51, "correlation": 160097 } }, { "ph": "s", "id": 160097, "pid": 76337, "tid": -914061504, "ts": 1716454223857610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223919362, "dur": 96, "args": { "External id": 160098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160098, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160098, "pid": 5, "tid": 7, "ts": 1716454223919362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857611, "dur": 5, "args": { "External id": 160098, "cbid": 211, "correlation": 160098 } }, { "ph": "s", "id": 160098, "pid": 76337, "tid": -914061504, "ts": 1716454223857611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223919460, "dur": 16, "args": { "External id": 160103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160103, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160103, "pid": 5, "tid": 7, "ts": 1716454223919460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857637, "dur": 10, "args": { "External id": 160103, "cbid": 211, "correlation": 160103 } }, { "ph": "s", "id": 160103, "pid": 76337, "tid": -914061504, "ts": 1716454223857637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223919477, "dur": 11, "args": { "External id": 160111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160111, "pid": 5, "tid": 7, "ts": 1716454223919477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857670, "dur": 8, "args": { "External id": 160111, "cbid": 211, "correlation": 160111 } }, { "ph": "s", "id": 160111, "pid": 76337, "tid": -914061504, "ts": 1716454223857670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223919490, "dur": 25, "args": { "External id": 160120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160120, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160120, "pid": 5, "tid": 7, "ts": 1716454223919490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857709, "dur": 10, "args": { "External id": 160120, "cbid": 211, "correlation": 160120 } }, { "ph": "s", "id": 160120, "pid": 76337, "tid": -914061504, "ts": 1716454223857709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223919516, "dur": 24, "args": { "External id": 160140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160140, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 160140, "pid": 5, "tid": 7, "ts": 1716454223919516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857780, "dur": 12, "args": { "External id": 160140, "cbid": 211, "correlation": 160140 } }, { "ph": "s", "id": 160140, "pid": 76337, "tid": -914061504, "ts": 1716454223857780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223919541, "dur": 5, "args": { "External id": 160152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160152, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160152, "pid": 5, "tid": 7, "ts": 1716454223919541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857803, "dur": 6, "args": { "External id": 160152, "cbid": 211, "correlation": 160152 } }, { "ph": "s", "id": 160152, "pid": 76337, "tid": -914061504, "ts": 1716454223857803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223919547, "dur": 24, "args": { "External id": 160155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160155, "pid": 5, "tid": 7, "ts": 1716454223919547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857821, "dur": 6, "args": { "External id": 160155, "cbid": 211, "correlation": 160155 } }, { "ph": "s", "id": 160155, "pid": 76337, "tid": -914061504, "ts": 1716454223857821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223919572, "dur": 17, "args": { "External id": 160164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160164, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160164, "pid": 5, "tid": 7, "ts": 1716454223919572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857860, "dur": 9, "args": { "External id": 160164, "cbid": 211, "correlation": 160164 } }, { "ph": "s", "id": 160164, "pid": 76337, "tid": -914061504, "ts": 1716454223857860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223857911, "dur": 0, "args": { "External id": 160174, "cbid": 317, "correlation": 160174 } }, { "ph": "f", "id": 160174, "pid": 76337, "tid": -914061504, "ts": 1716454223857911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223857912, "dur": 0, "args": { "External id": 160175, "cbid": 203, "correlation": 160175 } }, { "ph": "f", "id": 160175, "pid": 76337, "tid": -914061504, "ts": 1716454223857912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223857912, "dur": 0, "args": { "External id": 160176, "cbid": 205, "correlation": 160176 } }, { "ph": "f", "id": 160176, "pid": 76337, "tid": -914061504, "ts": 1716454223857912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223919591, "dur": 18, "args": { "External id": 160180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160180, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160180, "pid": 5, "tid": 7, "ts": 1716454223919591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857927, "dur": 11, "args": { "External id": 160180, "cbid": 211, "correlation": 160180 } }, { "ph": "s", "id": 160180, "pid": 76337, "tid": -914061504, "ts": 1716454223857927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223919610, "dur": 241, "args": { "External id": 160182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160182, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160182, "pid": 5, "tid": 7, "ts": 1716454223919610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857940, "dur": 5, "args": { "External id": 160182, "cbid": 211, "correlation": 160182 } }, { "ph": "s", "id": 160182, "pid": 76337, "tid": -914061504, "ts": 1716454223857940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223919852, "dur": 1, "args": { "External id": 160184, "device": 5, "context": 1, "stream": 7, "correlation": 160184, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 160184, "pid": 5, "tid": 7, "ts": 1716454223919852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223857952, "dur": 8, "args": { "External id": 160184, "cbid": 51, "correlation": 160184 } }, { "ph": "s", "id": 160184, "pid": 76337, "tid": -914061504, "ts": 1716454223857952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223919856, "dur": 810, "args": { "External id": 160185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160185, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160185, "pid": 5, "tid": 7, "ts": 1716454223919856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857961, "dur": 6, "args": { "External id": 160185, "cbid": 211, "correlation": 160185 } }, { "ph": "s", "id": 160185, "pid": 76337, "tid": -914061504, "ts": 1716454223857961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223920667, "dur": 14, "args": { "External id": 160187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160187, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160187, "pid": 5, "tid": 7, "ts": 1716454223920667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223857972, "dur": 14, "args": { "External id": 160187, "cbid": 211, "correlation": 160187 } }, { "ph": "s", "id": 160187, "pid": 76337, "tid": -914061504, "ts": 1716454223857972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223920682, "dur": 15, "args": { "External id": 160193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160193, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160193, "pid": 5, "tid": 7, "ts": 1716454223920682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858009, "dur": 9, "args": { "External id": 160193, "cbid": 211, "correlation": 160193 } }, { "ph": "s", "id": 160193, "pid": 76337, "tid": -914061504, "ts": 1716454223858009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223920699, "dur": 3, "args": { "External id": 160201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160201, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 160201, "pid": 5, "tid": 7, "ts": 1716454223920699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858054, "dur": 9, "args": { "External id": 160201, "cbid": 211, "correlation": 160201 } }, { "ph": "s", "id": 160201, "pid": 76337, "tid": -914061504, "ts": 1716454223858054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223858118, "dur": 1, "args": { "External id": 160217, "cbid": 251, "correlation": 160217 } }, { "ph": "f", "id": 160217, "pid": 76337, "tid": -914061504, "ts": 1716454223858118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223858123, "dur": 0, "args": { "External id": 160219, "cbid": 251, "correlation": 160219 } }, { "ph": "f", "id": 160219, "pid": 76337, "tid": -914061504, "ts": 1716454223858123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223920704, "dur": 14, "args": { "External id": 160220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160220, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160220, "pid": 5, "tid": 7, "ts": 1716454223920704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858125, "dur": 11, "args": { "External id": 160220, "cbid": 211, "correlation": 160220 } }, { "ph": "s", "id": 160220, "pid": 76337, "tid": -914061504, "ts": 1716454223858125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223920719, "dur": 5, "args": { "External id": 160222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160222, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160222, "pid": 5, "tid": 7, "ts": 1716454223920719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858138, "dur": 6, "args": { "External id": 160222, "cbid": 211, "correlation": 160222 } }, { "ph": "s", "id": 160222, "pid": 76337, "tid": -914061504, "ts": 1716454223858138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223920725, "dur": 16, "args": { "External id": 160232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160232, "pid": 5, "tid": 7, "ts": 1716454223920725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858197, "dur": 12, "args": { "External id": 160232, "cbid": 211, "correlation": 160232 } }, { "ph": "s", "id": 160232, "pid": 76337, "tid": -914061504, "ts": 1716454223858197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223920743, "dur": 17, "args": { "External id": 160252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160252, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 160252, "pid": 5, "tid": 7, "ts": 1716454223920743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858262, "dur": 10, "args": { "External id": 160252, "cbid": 211, "correlation": 160252 } }, { "ph": "s", "id": 160252, "pid": 76337, "tid": -914061504, "ts": 1716454223858262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223920762, "dur": 4, "args": { "External id": 160264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160264, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 160264, "pid": 5, "tid": 7, "ts": 1716454223920762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858282, "dur": 6, "args": { "External id": 160264, "cbid": 211, "correlation": 160264 } }, { "ph": "s", "id": 160264, "pid": 76337, "tid": -914061504, "ts": 1716454223858282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223920767, "dur": 17, "args": { "External id": 160267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160267, "pid": 5, "tid": 7, "ts": 1716454223920767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858301, "dur": 6, "args": { "External id": 160267, "cbid": 211, "correlation": 160267 } }, { "ph": "s", "id": 160267, "pid": 76337, "tid": -914061504, "ts": 1716454223858301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223920785, "dur": 10, "args": { "External id": 160276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160276, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160276, "pid": 5, "tid": 7, "ts": 1716454223920785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858341, "dur": 11, "args": { "External id": 160276, "cbid": 211, "correlation": 160276 } }, { "ph": "s", "id": 160276, "pid": 76337, "tid": -914061504, "ts": 1716454223858341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223858404, "dur": 0, "args": { "External id": 160286, "cbid": 317, "correlation": 160286 } }, { "ph": "f", "id": 160286, "pid": 76337, "tid": -914061504, "ts": 1716454223858404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223858405, "dur": 0, "args": { "External id": 160287, "cbid": 203, "correlation": 160287 } }, { "ph": "f", "id": 160287, "pid": 76337, "tid": -914061504, "ts": 1716454223858405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223858405, "dur": 0, "args": { "External id": 160288, "cbid": 205, "correlation": 160288 } }, { "ph": "f", "id": 160288, "pid": 76337, "tid": -914061504, "ts": 1716454223858405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223920797, "dur": 12, "args": { "External id": 160292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160292, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160292, "pid": 5, "tid": 7, "ts": 1716454223920797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858419, "dur": 12, "args": { "External id": 160292, "cbid": 211, "correlation": 160292 } }, { "ph": "s", "id": 160292, "pid": 76337, "tid": -914061504, "ts": 1716454223858419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223920810, "dur": 162, "args": { "External id": 160294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160294, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160294, "pid": 5, "tid": 7, "ts": 1716454223920810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858433, "dur": 5, "args": { "External id": 160294, "cbid": 211, "correlation": 160294 } }, { "ph": "s", "id": 160294, "pid": 76337, "tid": -914061504, "ts": 1716454223858433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223920974, "dur": 1, "args": { "External id": 160296, "device": 5, "context": 1, "stream": 7, "correlation": 160296, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 160296, "pid": 5, "tid": 7, "ts": 1716454223920974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223858444, "dur": 6, "args": { "External id": 160296, "cbid": 51, "correlation": 160296 } }, { "ph": "s", "id": 160296, "pid": 76337, "tid": -914061504, "ts": 1716454223858444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223920978, "dur": 646, "args": { "External id": 160297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160297, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160297, "pid": 5, "tid": 7, "ts": 1716454223920978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858452, "dur": 6, "args": { "External id": 160297, "cbid": 211, "correlation": 160297 } }, { "ph": "s", "id": 160297, "pid": 76337, "tid": -914061504, "ts": 1716454223858452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223921626, "dur": 12, "args": { "External id": 160299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160299, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160299, "pid": 5, "tid": 7, "ts": 1716454223921626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858463, "dur": 5, "args": { "External id": 160299, "cbid": 211, "correlation": 160299 } }, { "ph": "s", "id": 160299, "pid": 76337, "tid": -914061504, "ts": 1716454223858463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223921639, "dur": 15, "args": { "External id": 160305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160305, "pid": 5, "tid": 7, "ts": 1716454223921639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858491, "dur": 9, "args": { "External id": 160305, "cbid": 211, "correlation": 160305 } }, { "ph": "s", "id": 160305, "pid": 76337, "tid": -914061504, "ts": 1716454223858491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223858550, "dur": 0, "args": { "External id": 160315, "cbid": 317, "correlation": 160315 } }, { "ph": "f", "id": 160315, "pid": 76337, "tid": -914061504, "ts": 1716454223858550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223858551, "dur": 0, "args": { "External id": 160316, "cbid": 203, "correlation": 160316 } }, { "ph": "f", "id": 160316, "pid": 76337, "tid": -914061504, "ts": 1716454223858551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223858552, "dur": 0, "args": { "External id": 160317, "cbid": 205, "correlation": 160317 } }, { "ph": "f", "id": 160317, "pid": 76337, "tid": -914061504, "ts": 1716454223858552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223921655, "dur": 17, "args": { "External id": 160321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160321, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160321, "pid": 5, "tid": 7, "ts": 1716454223921655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858566, "dur": 12, "args": { "External id": 160321, "cbid": 211, "correlation": 160321 } }, { "ph": "s", "id": 160321, "pid": 76337, "tid": -914061504, "ts": 1716454223858566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223921673, "dur": 4, "args": { "External id": 160323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160323, "pid": 5, "tid": 7, "ts": 1716454223921673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858582, "dur": 6, "args": { "External id": 160323, "cbid": 211, "correlation": 160323 } }, { "ph": "s", "id": 160323, "pid": 76337, "tid": -914061504, "ts": 1716454223858582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223858591, "dur": 0, "args": { "External id": 160324, "cbid": 51, "correlation": 160324 } }, { "ph": "s", "id": 160324, "pid": 76337, "tid": -914061504, "ts": 1716454223858591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223921679, "dur": 131, "args": { "External id": 160325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160325, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160325, "pid": 5, "tid": 7, "ts": 1716454223921679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858592, "dur": 5, "args": { "External id": 160325, "cbid": 211, "correlation": 160325 } }, { "ph": "s", "id": 160325, "pid": 76337, "tid": -914061504, "ts": 1716454223858592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223921811, "dur": 15, "args": { "External id": 160330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160330, "pid": 5, "tid": 7, "ts": 1716454223921811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858618, "dur": 8, "args": { "External id": 160330, "cbid": 211, "correlation": 160330 } }, { "ph": "s", "id": 160330, "pid": 76337, "tid": -914061504, "ts": 1716454223858618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223921828, "dur": 11, "args": { "External id": 160338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160338, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160338, "pid": 5, "tid": 7, "ts": 1716454223921828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858647, "dur": 8, "args": { "External id": 160338, "cbid": 211, "correlation": 160338 } }, { "ph": "s", "id": 160338, "pid": 76337, "tid": -914061504, "ts": 1716454223858647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223921841, "dur": 10, "args": { "External id": 160346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160346, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160346, "pid": 5, "tid": 7, "ts": 1716454223921841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858676, "dur": 8, "args": { "External id": 160346, "cbid": 211, "correlation": 160346 } }, { "ph": "s", "id": 160346, "pid": 76337, "tid": -914061504, "ts": 1716454223858676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223921852, "dur": 18, "args": { "External id": 160366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160366, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 160366, "pid": 5, "tid": 7, "ts": 1716454223921852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858758, "dur": 12, "args": { "External id": 160366, "cbid": 211, "correlation": 160366 } }, { "ph": "s", "id": 160366, "pid": 76337, "tid": -914061504, "ts": 1716454223858758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223921871, "dur": 5, "args": { "External id": 160378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160378, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 160378, "pid": 5, "tid": 7, "ts": 1716454223921871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858780, "dur": 6, "args": { "External id": 160378, "cbid": 211, "correlation": 160378 } }, { "ph": "s", "id": 160378, "pid": 76337, "tid": -914061504, "ts": 1716454223858780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223921877, "dur": 17, "args": { "External id": 160381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160381, "pid": 5, "tid": 7, "ts": 1716454223921877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858798, "dur": 7, "args": { "External id": 160381, "cbid": 211, "correlation": 160381 } }, { "ph": "s", "id": 160381, "pid": 76337, "tid": -914061504, "ts": 1716454223858798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223858856, "dur": 0, "args": { "External id": 160392, "cbid": 317, "correlation": 160392 } }, { "ph": "f", "id": 160392, "pid": 76337, "tid": -914061504, "ts": 1716454223858856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223858857, "dur": 0, "args": { "External id": 160393, "cbid": 203, "correlation": 160393 } }, { "ph": "f", "id": 160393, "pid": 76337, "tid": -914061504, "ts": 1716454223858857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223858858, "dur": 0, "args": { "External id": 160394, "cbid": 205, "correlation": 160394 } }, { "ph": "f", "id": 160394, "pid": 76337, "tid": -914061504, "ts": 1716454223858858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223921896, "dur": 12, "args": { "External id": 160398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160398, "pid": 5, "tid": 7, "ts": 1716454223921896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858871, "dur": 11, "args": { "External id": 160398, "cbid": 211, "correlation": 160398 } }, { "ph": "s", "id": 160398, "pid": 76337, "tid": -914061504, "ts": 1716454223858871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223921909, "dur": 3, "args": { "External id": 160400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160400, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160400, "pid": 5, "tid": 7, "ts": 1716454223921909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858887, "dur": 7, "args": { "External id": 160400, "cbid": 211, "correlation": 160400 } }, { "ph": "s", "id": 160400, "pid": 76337, "tid": -914061504, "ts": 1716454223858887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223858896, "dur": 0, "args": { "External id": 160401, "cbid": 51, "correlation": 160401 } }, { "ph": "s", "id": 160401, "pid": 76337, "tid": -914061504, "ts": 1716454223858896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223921914, "dur": 91, "args": { "External id": 160402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160402, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160402, "pid": 5, "tid": 7, "ts": 1716454223921914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858897, "dur": 5, "args": { "External id": 160402, "cbid": 211, "correlation": 160402 } }, { "ph": "s", "id": 160402, "pid": 76337, "tid": -914061504, "ts": 1716454223858897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223922006, "dur": 16, "args": { "External id": 160407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160407, "pid": 5, "tid": 7, "ts": 1716454223922006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223858923, "dur": 9, "args": { "External id": 160407, "cbid": 211, "correlation": 160407 } }, { "ph": "s", "id": 160407, "pid": 76337, "tid": -914061504, "ts": 1716454223858923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223922023, "dur": 84, "args": { "External id": 160416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160416, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160416, "pid": 5, "tid": 7, "ts": 1716454223922023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859014, "dur": 14, "args": { "External id": 160416, "cbid": 211, "correlation": 160416 } }, { "ph": "s", "id": 160416, "pid": 76337, "tid": -914061504, "ts": 1716454223859014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223922109, "dur": 30, "args": { "External id": 160438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160438, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160438, "pid": 5, "tid": 7, "ts": 1716454223922109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859071, "dur": 10, "args": { "External id": 160438, "cbid": 211, "correlation": 160438 } }, { "ph": "s", "id": 160438, "pid": 76337, "tid": -914061504, "ts": 1716454223859071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859159, "dur": 1, "args": { "External id": 160449, "cbid": 251, "correlation": 160449 } }, { "ph": "f", "id": 160449, "pid": 76337, "tid": -914061504, "ts": 1716454223859159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223922140, "dur": 164, "args": { "External id": 160450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160450, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160450, "pid": 5, "tid": 7, "ts": 1716454223922140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859164, "dur": 13, "args": { "External id": 160450, "cbid": 211, "correlation": 160450 } }, { "ph": "s", "id": 160450, "pid": 76337, "tid": -914061504, "ts": 1716454223859164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859234, "dur": 1, "args": { "External id": 160461, "cbid": 251, "correlation": 160461 } }, { "ph": "f", "id": 160461, "pid": 76337, "tid": -914061504, "ts": 1716454223859234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223922305, "dur": 157, "args": { "External id": 160462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160462, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160462, "pid": 5, "tid": 7, "ts": 1716454223922305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859238, "dur": 12, "args": { "External id": 160462, "cbid": 211, "correlation": 160462 } }, { "ph": "s", "id": 160462, "pid": 76337, "tid": -914061504, "ts": 1716454223859238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859304, "dur": 1, "args": { "External id": 160473, "cbid": 251, "correlation": 160473 } }, { "ph": "f", "id": 160473, "pid": 76337, "tid": -914061504, "ts": 1716454223859304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223922463, "dur": 160, "args": { "External id": 160474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160474, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160474, "pid": 5, "tid": 7, "ts": 1716454223922463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859308, "dur": 11, "args": { "External id": 160474, "cbid": 211, "correlation": 160474 } }, { "ph": "s", "id": 160474, "pid": 76337, "tid": -914061504, "ts": 1716454223859308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223922624, "dur": 338, "args": { "External id": 160499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160499, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160499, "pid": 5, "tid": 7, "ts": 1716454223922624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859392, "dur": 12, "args": { "External id": 160499, "cbid": 211, "correlation": 160499 } }, { "ph": "s", "id": 160499, "pid": 76337, "tid": -914061504, "ts": 1716454223859392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859491, "dur": 1, "args": { "External id": 160517, "cbid": 251, "correlation": 160517 } }, { "ph": "f", "id": 160517, "pid": 76337, "tid": -914061504, "ts": 1716454223859491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223922964, "dur": 166, "args": { "External id": 160519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160519, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160519, "pid": 5, "tid": 7, "ts": 1716454223922964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859497, "dur": 13, "args": { "External id": 160519, "cbid": 211, "correlation": 160519 } }, { "ph": "s", "id": 160519, "pid": 76337, "tid": -914061504, "ts": 1716454223859497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223923132, "dur": 19, "args": { "External id": 160527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160527, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160527, "pid": 5, "tid": 7, "ts": 1716454223923132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859568, "dur": 12, "args": { "External id": 160527, "cbid": 211, "correlation": 160527 } }, { "ph": "s", "id": 160527, "pid": 76337, "tid": -914061504, "ts": 1716454223859568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223923152, "dur": 28, "args": { "External id": 160535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160535, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160535, "pid": 5, "tid": 7, "ts": 1716454223923152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859607, "dur": 8, "args": { "External id": 160535, "cbid": 211, "correlation": 160535 } }, { "ph": "s", "id": 160535, "pid": 76337, "tid": -914061504, "ts": 1716454223859607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223923181, "dur": 18, "args": { "External id": 160546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160546, "pid": 5, "tid": 7, "ts": 1716454223923181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859677, "dur": 13, "args": { "External id": 160546, "cbid": 211, "correlation": 160546 } }, { "ph": "s", "id": 160546, "pid": 76337, "tid": -914061504, "ts": 1716454223859677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223923200, "dur": 16, "args": { "External id": 160568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160568, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160568, "pid": 5, "tid": 7, "ts": 1716454223923200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859709, "dur": 7, "args": { "External id": 160568, "cbid": 211, "correlation": 160568 } }, { "ph": "s", "id": 160568, "pid": 76337, "tid": -914061504, "ts": 1716454223859709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859794, "dur": 1, "args": { "External id": 160579, "cbid": 251, "correlation": 160579 } }, { "ph": "f", "id": 160579, "pid": 76337, "tid": -914061504, "ts": 1716454223859794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223923218, "dur": 89, "args": { "External id": 160580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160580, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160580, "pid": 5, "tid": 7, "ts": 1716454223923218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859799, "dur": 12, "args": { "External id": 160580, "cbid": 211, "correlation": 160580 } }, { "ph": "s", "id": 160580, "pid": 76337, "tid": -914061504, "ts": 1716454223859799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859868, "dur": 1, "args": { "External id": 160591, "cbid": 251, "correlation": 160591 } }, { "ph": "f", "id": 160591, "pid": 76337, "tid": -914061504, "ts": 1716454223859868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859871, "dur": 0, "args": { "External id": 160592, "cbid": 251, "correlation": 160592 } }, { "ph": "f", "id": 160592, "pid": 76337, "tid": -914061504, "ts": 1716454223859871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223923309, "dur": 12, "args": { "External id": 160593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160593, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160593, "pid": 5, "tid": 7, "ts": 1716454223923309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859873, "dur": 12, "args": { "External id": 160593, "cbid": 211, "correlation": 160593 } }, { "ph": "s", "id": 160593, "pid": 76337, "tid": -914061504, "ts": 1716454223859873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223923322, "dur": 5, "args": { "External id": 160595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160595, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160595, "pid": 5, "tid": 7, "ts": 1716454223923322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859887, "dur": 5, "args": { "External id": 160595, "cbid": 211, "correlation": 160595 } }, { "ph": "s", "id": 160595, "pid": 76337, "tid": -914061504, "ts": 1716454223859887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859943, "dur": 1, "args": { "External id": 160606, "cbid": 251, "correlation": 160606 } }, { "ph": "f", "id": 160606, "pid": 76337, "tid": -914061504, "ts": 1716454223859943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223859947, "dur": 0, "args": { "External id": 160607, "cbid": 251, "correlation": 160607 } }, { "ph": "f", "id": 160607, "pid": 76337, "tid": -914061504, "ts": 1716454223859947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223923329, "dur": 9, "args": { "External id": 160608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160608, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160608, "pid": 5, "tid": 7, "ts": 1716454223923329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859949, "dur": 11, "args": { "External id": 160608, "cbid": 211, "correlation": 160608 } }, { "ph": "s", "id": 160608, "pid": 76337, "tid": -914061504, "ts": 1716454223859949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223923339, "dur": 3, "args": { "External id": 160610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160610, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160610, "pid": 5, "tid": 7, "ts": 1716454223923339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223859962, "dur": 5, "args": { "External id": 160610, "cbid": 211, "correlation": 160610 } }, { "ph": "s", "id": 160610, "pid": 76337, "tid": -914061504, "ts": 1716454223859962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223923344, "dur": 56, "args": { "External id": 160635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160635, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160635, "pid": 5, "tid": 7, "ts": 1716454223923344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860049, "dur": 14, "args": { "External id": 160635, "cbid": 211, "correlation": 160635 } }, { "ph": "s", "id": 160635, "pid": 76337, "tid": -914061504, "ts": 1716454223860049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223860149, "dur": 1, "args": { "External id": 160653, "cbid": 251, "correlation": 160653 } }, { "ph": "f", "id": 160653, "pid": 76337, "tid": -914061504, "ts": 1716454223860149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223923401, "dur": 92, "args": { "External id": 160655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160655, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160655, "pid": 5, "tid": 7, "ts": 1716454223923401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860156, "dur": 13, "args": { "External id": 160655, "cbid": 211, "correlation": 160655 } }, { "ph": "s", "id": 160655, "pid": 76337, "tid": -914061504, "ts": 1716454223860156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223923494, "dur": 10, "args": { "External id": 160663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160663, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160663, "pid": 5, "tid": 7, "ts": 1716454223923494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860225, "dur": 12, "args": { "External id": 160663, "cbid": 211, "correlation": 160663 } }, { "ph": "s", "id": 160663, "pid": 76337, "tid": -914061504, "ts": 1716454223860225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223923505, "dur": 21, "args": { "External id": 160671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160671, "pid": 5, "tid": 7, "ts": 1716454223923505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860267, "dur": 9, "args": { "External id": 160671, "cbid": 211, "correlation": 160671 } }, { "ph": "s", "id": 160671, "pid": 76337, "tid": -914061504, "ts": 1716454223860267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223923528, "dur": 18, "args": { "External id": 160693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160693, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160693, "pid": 5, "tid": 7, "ts": 1716454223923528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860319, "dur": 10, "args": { "External id": 160693, "cbid": 211, "correlation": 160693 } }, { "ph": "s", "id": 160693, "pid": 76337, "tid": -914061504, "ts": 1716454223860319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223860406, "dur": 1, "args": { "External id": 160709, "cbid": 251, "correlation": 160709 } }, { "ph": "f", "id": 160709, "pid": 76337, "tid": -914061504, "ts": 1716454223860406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223860411, "dur": 0, "args": { "External id": 160711, "cbid": 251, "correlation": 160711 } }, { "ph": "f", "id": 160711, "pid": 76337, "tid": -914061504, "ts": 1716454223860411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223923547, "dur": 498, "args": { "External id": 160712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160712, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160712, "pid": 5, "tid": 7, "ts": 1716454223923547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860413, "dur": 12, "args": { "External id": 160712, "cbid": 211, "correlation": 160712 } }, { "ph": "s", "id": 160712, "pid": 76337, "tid": -914061504, "ts": 1716454223860413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223924047, "dur": 66, "args": { "External id": 160720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160720, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160720, "pid": 5, "tid": 7, "ts": 1716454223924047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860478, "dur": 12, "args": { "External id": 160720, "cbid": 211, "correlation": 160720 } }, { "ph": "s", "id": 160720, "pid": 76337, "tid": -914061504, "ts": 1716454223860478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223924114, "dur": 67, "args": { "External id": 160728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160728, "pid": 5, "tid": 7, "ts": 1716454223924114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860509, "dur": 8, "args": { "External id": 160728, "cbid": 211, "correlation": 160728 } }, { "ph": "s", "id": 160728, "pid": 76337, "tid": -914061504, "ts": 1716454223860509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223860587, "dur": 1, "args": { "External id": 160744, "cbid": 251, "correlation": 160744 } }, { "ph": "f", "id": 160744, "pid": 76337, "tid": -914061504, "ts": 1716454223860587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454223924183, "dur": 1, "args": { "External id": 160746, "device": 5, "context": 1, "stream": 7, "correlation": 160746, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 160746, "pid": 5, "tid": 7, "ts": 1716454223924183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223860592, "dur": 9, "args": { "External id": 160746, "cbid": 51, "correlation": 160746 } }, { "ph": "s", "id": 160746, "pid": 76337, "tid": -914061504, "ts": 1716454223860592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223924187, "dur": 269, "args": { "External id": 160747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160747, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160747, "pid": 5, "tid": 7, "ts": 1716454223924187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860604, "dur": 11, "args": { "External id": 160747, "cbid": 211, "correlation": 160747 } }, { "ph": "s", "id": 160747, "pid": 76337, "tid": -914061504, "ts": 1716454223860604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223924458, "dur": 13, "args": { "External id": 160755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160755, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160755, "pid": 5, "tid": 7, "ts": 1716454223924458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860645, "dur": 12, "args": { "External id": 160755, "cbid": 211, "correlation": 160755 } }, { "ph": "s", "id": 160755, "pid": 76337, "tid": -914061504, "ts": 1716454223860645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223924473, "dur": 38, "args": { "External id": 160766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160766, "pid": 5, "tid": 7, "ts": 1716454223924473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860715, "dur": 12, "args": { "External id": 160766, "cbid": 211, "correlation": 160766 } }, { "ph": "s", "id": 160766, "pid": 76337, "tid": -914061504, "ts": 1716454223860715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223860779, "dur": 0, "args": { "External id": 160778, "cbid": 317, "correlation": 160778 } }, { "ph": "f", "id": 160778, "pid": 76337, "tid": -914061504, "ts": 1716454223860779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223860779, "dur": 0, "args": { "External id": 160779, "cbid": 203, "correlation": 160779 } }, { "ph": "f", "id": 160779, "pid": 76337, "tid": -914061504, "ts": 1716454223860779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223860780, "dur": 0, "args": { "External id": 160780, "cbid": 205, "correlation": 160780 } }, { "ph": "f", "id": 160780, "pid": 76337, "tid": -914061504, "ts": 1716454223860780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223924511, "dur": 13, "args": { "External id": 160784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160784, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160784, "pid": 5, "tid": 7, "ts": 1716454223924511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860796, "dur": 13, "args": { "External id": 160784, "cbid": 211, "correlation": 160784 } }, { "ph": "s", "id": 160784, "pid": 76337, "tid": -914061504, "ts": 1716454223860796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223924526, "dur": 4, "args": { "External id": 160786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 160786, "pid": 5, "tid": 7, "ts": 1716454223924526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860813, "dur": 6, "args": { "External id": 160786, "cbid": 211, "correlation": 160786 } }, { "ph": "s", "id": 160786, "pid": 76337, "tid": -914061504, "ts": 1716454223860813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223860821, "dur": 0, "args": { "External id": 160787, "cbid": 51, "correlation": 160787 } }, { "ph": "s", "id": 160787, "pid": 76337, "tid": -914061504, "ts": 1716454223860821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223924531, "dur": 97, "args": { "External id": 160788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160788, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160788, "pid": 5, "tid": 7, "ts": 1716454223924531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860822, "dur": 5, "args": { "External id": 160788, "cbid": 211, "correlation": 160788 } }, { "ph": "s", "id": 160788, "pid": 76337, "tid": -914061504, "ts": 1716454223860822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223924629, "dur": 17, "args": { "External id": 160793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160793, "pid": 5, "tid": 7, "ts": 1716454223924629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860849, "dur": 9, "args": { "External id": 160793, "cbid": 211, "correlation": 160793 } }, { "ph": "s", "id": 160793, "pid": 76337, "tid": -914061504, "ts": 1716454223860849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223924648, "dur": 12, "args": { "External id": 160801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160801, "pid": 5, "tid": 7, "ts": 1716454223924648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860881, "dur": 8, "args": { "External id": 160801, "cbid": 211, "correlation": 160801 } }, { "ph": "s", "id": 160801, "pid": 76337, "tid": -914061504, "ts": 1716454223860881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223924661, "dur": 57, "args": { "External id": 160812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160812, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160812, "pid": 5, "tid": 7, "ts": 1716454223924661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223860945, "dur": 13, "args": { "External id": 160812, "cbid": 211, "correlation": 160812 } }, { "ph": "s", "id": 160812, "pid": 76337, "tid": -914061504, "ts": 1716454223860945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223861013, "dur": 0, "args": { "External id": 160822, "cbid": 317, "correlation": 160822 } }, { "ph": "f", "id": 160822, "pid": 76337, "tid": -914061504, "ts": 1716454223861013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223861013, "dur": 0, "args": { "External id": 160823, "cbid": 203, "correlation": 160823 } }, { "ph": "f", "id": 160823, "pid": 76337, "tid": -914061504, "ts": 1716454223861013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223861014, "dur": 0, "args": { "External id": 160824, "cbid": 205, "correlation": 160824 } }, { "ph": "f", "id": 160824, "pid": 76337, "tid": -914061504, "ts": 1716454223861014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223924720, "dur": 38, "args": { "External id": 160828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160828, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160828, "pid": 5, "tid": 7, "ts": 1716454223924720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861030, "dur": 12, "args": { "External id": 160828, "cbid": 211, "correlation": 160828 } }, { "ph": "s", "id": 160828, "pid": 76337, "tid": -914061504, "ts": 1716454223861030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223924760, "dur": 162, "args": { "External id": 160830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160830, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160830, "pid": 5, "tid": 7, "ts": 1716454223924760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861045, "dur": 5, "args": { "External id": 160830, "cbid": 211, "correlation": 160830 } }, { "ph": "s", "id": 160830, "pid": 76337, "tid": -914061504, "ts": 1716454223861045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223924923, "dur": 1965, "args": { "External id": 160832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160832, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160832, "pid": 5, "tid": 7, "ts": 1716454223924923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861057, "dur": 9, "args": { "External id": 160832, "cbid": 211, "correlation": 160832 } }, { "ph": "s", "id": 160832, "pid": 76337, "tid": -914061504, "ts": 1716454223861057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223926890, "dur": 39, "args": { "External id": 160834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160834, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160834, "pid": 5, "tid": 7, "ts": 1716454223926890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861070, "dur": 5, "args": { "External id": 160834, "cbid": 211, "correlation": 160834 } }, { "ph": "s", "id": 160834, "pid": 76337, "tid": -914061504, "ts": 1716454223861070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223926930, "dur": 59, "args": { "External id": 160840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160840, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160840, "pid": 5, "tid": 7, "ts": 1716454223926930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861098, "dur": 9, "args": { "External id": 160840, "cbid": 211, "correlation": 160840 } }, { "ph": "s", "id": 160840, "pid": 76337, "tid": -914061504, "ts": 1716454223861098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223926990, "dur": 87, "args": { "External id": 160849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160849, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160849, "pid": 5, "tid": 7, "ts": 1716454223926990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861190, "dur": 13, "args": { "External id": 160849, "cbid": 211, "correlation": 160849 } }, { "ph": "s", "id": 160849, "pid": 76337, "tid": -914061504, "ts": 1716454223861190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223927078, "dur": 73, "args": { "External id": 160869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160869, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 160869, "pid": 5, "tid": 7, "ts": 1716454223927078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861262, "dur": 11, "args": { "External id": 160869, "cbid": 211, "correlation": 160869 } }, { "ph": "s", "id": 160869, "pid": 76337, "tid": -914061504, "ts": 1716454223861262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223927153, "dur": 5, "args": { "External id": 160881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160881, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 160881, "pid": 5, "tid": 7, "ts": 1716454223927153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861283, "dur": 7, "args": { "External id": 160881, "cbid": 211, "correlation": 160881 } }, { "ph": "s", "id": 160881, "pid": 76337, "tid": -914061504, "ts": 1716454223861283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223927159, "dur": 82, "args": { "External id": 160884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160884, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160884, "pid": 5, "tid": 7, "ts": 1716454223927159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861302, "dur": 8, "args": { "External id": 160884, "cbid": 211, "correlation": 160884 } }, { "ph": "s", "id": 160884, "pid": 76337, "tid": -914061504, "ts": 1716454223861302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223927243, "dur": 53, "args": { "External id": 160893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160893, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160893, "pid": 5, "tid": 7, "ts": 1716454223927243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861342, "dur": 10, "args": { "External id": 160893, "cbid": 211, "correlation": 160893 } }, { "ph": "s", "id": 160893, "pid": 76337, "tid": -914061504, "ts": 1716454223861342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223861393, "dur": 0, "args": { "External id": 160903, "cbid": 317, "correlation": 160903 } }, { "ph": "f", "id": 160903, "pid": 76337, "tid": -914061504, "ts": 1716454223861393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223861394, "dur": 0, "args": { "External id": 160904, "cbid": 203, "correlation": 160904 } }, { "ph": "f", "id": 160904, "pid": 76337, "tid": -914061504, "ts": 1716454223861394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223861394, "dur": 0, "args": { "External id": 160905, "cbid": 205, "correlation": 160905 } }, { "ph": "f", "id": 160905, "pid": 76337, "tid": -914061504, "ts": 1716454223861394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223927296, "dur": 57, "args": { "External id": 160909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160909, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160909, "pid": 5, "tid": 7, "ts": 1716454223927296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861411, "dur": 11, "args": { "External id": 160909, "cbid": 211, "correlation": 160909 } }, { "ph": "s", "id": 160909, "pid": 76337, "tid": -914061504, "ts": 1716454223861411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223927355, "dur": 122, "args": { "External id": 160911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160911, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160911, "pid": 5, "tid": 7, "ts": 1716454223927355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861425, "dur": 5, "args": { "External id": 160911, "cbid": 211, "correlation": 160911 } }, { "ph": "s", "id": 160911, "pid": 76337, "tid": -914061504, "ts": 1716454223861425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223927478, "dur": 1894, "args": { "External id": 160913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160913, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 160913, "pid": 5, "tid": 7, "ts": 1716454223927478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861436, "dur": 6, "args": { "External id": 160913, "cbid": 211, "correlation": 160913 } }, { "ph": "s", "id": 160913, "pid": 76337, "tid": -914061504, "ts": 1716454223861436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223929373, "dur": 22, "args": { "External id": 160915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160915, "pid": 5, "tid": 7, "ts": 1716454223929373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861446, "dur": 5, "args": { "External id": 160915, "cbid": 211, "correlation": 160915 } }, { "ph": "s", "id": 160915, "pid": 76337, "tid": -914061504, "ts": 1716454223861446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223929396, "dur": 32, "args": { "External id": 160921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160921, "pid": 5, "tid": 7, "ts": 1716454223929396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861474, "dur": 9, "args": { "External id": 160921, "cbid": 211, "correlation": 160921 } }, { "ph": "s", "id": 160921, "pid": 76337, "tid": -914061504, "ts": 1716454223861474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223929429, "dur": 3, "args": { "External id": 160929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160929, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 160929, "pid": 5, "tid": 7, "ts": 1716454223929429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861518, "dur": 10, "args": { "External id": 160929, "cbid": 211, "correlation": 160929 } }, { "ph": "s", "id": 160929, "pid": 76337, "tid": -914061504, "ts": 1716454223861518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223861584, "dur": 1, "args": { "External id": 160945, "cbid": 251, "correlation": 160945 } }, { "ph": "f", "id": 160945, "pid": 76337, "tid": -914061504, "ts": 1716454223861584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223861589, "dur": 0, "args": { "External id": 160947, "cbid": 251, "correlation": 160947 } }, { "ph": "f", "id": 160947, "pid": 76337, "tid": -914061504, "ts": 1716454223861589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223929434, "dur": 12, "args": { "External id": 160948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160948, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 160948, "pid": 5, "tid": 7, "ts": 1716454223929434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861591, "dur": 11, "args": { "External id": 160948, "cbid": 211, "correlation": 160948 } }, { "ph": "s", "id": 160948, "pid": 76337, "tid": -914061504, "ts": 1716454223861591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223929447, "dur": 5, "args": { "External id": 160950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160950, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 160950, "pid": 5, "tid": 7, "ts": 1716454223929447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861604, "dur": 6, "args": { "External id": 160950, "cbid": 211, "correlation": 160950 } }, { "ph": "s", "id": 160950, "pid": 76337, "tid": -914061504, "ts": 1716454223861604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223929454, "dur": 29, "args": { "External id": 160960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160960, "pid": 5, "tid": 7, "ts": 1716454223929454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861662, "dur": 12, "args": { "External id": 160960, "cbid": 211, "correlation": 160960 } }, { "ph": "s", "id": 160960, "pid": 76337, "tid": -914061504, "ts": 1716454223861662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223929484, "dur": 31, "args": { "External id": 160980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160980, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 160980, "pid": 5, "tid": 7, "ts": 1716454223929484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861727, "dur": 11, "args": { "External id": 160980, "cbid": 211, "correlation": 160980 } }, { "ph": "s", "id": 160980, "pid": 76337, "tid": -914061504, "ts": 1716454223861727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223929516, "dur": 4, "args": { "External id": 160992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160992, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 160992, "pid": 5, "tid": 7, "ts": 1716454223929516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861748, "dur": 6, "args": { "External id": 160992, "cbid": 211, "correlation": 160992 } }, { "ph": "s", "id": 160992, "pid": 76337, "tid": -914061504, "ts": 1716454223861748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223929522, "dur": 30, "args": { "External id": 160995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 160995, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 160995, "pid": 5, "tid": 7, "ts": 1716454223929522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861766, "dur": 6, "args": { "External id": 160995, "cbid": 211, "correlation": 160995 } }, { "ph": "s", "id": 160995, "pid": 76337, "tid": -914061504, "ts": 1716454223861766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223929554, "dur": 20, "args": { "External id": 161004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161004, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161004, "pid": 5, "tid": 7, "ts": 1716454223929554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861805, "dur": 10, "args": { "External id": 161004, "cbid": 211, "correlation": 161004 } }, { "ph": "s", "id": 161004, "pid": 76337, "tid": -914061504, "ts": 1716454223861805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223861868, "dur": 0, "args": { "External id": 161014, "cbid": 317, "correlation": 161014 } }, { "ph": "f", "id": 161014, "pid": 76337, "tid": -914061504, "ts": 1716454223861868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223861869, "dur": 0, "args": { "External id": 161015, "cbid": 203, "correlation": 161015 } }, { "ph": "f", "id": 161015, "pid": 76337, "tid": -914061504, "ts": 1716454223861869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223861869, "dur": 0, "args": { "External id": 161016, "cbid": 205, "correlation": 161016 } }, { "ph": "f", "id": 161016, "pid": 76337, "tid": -914061504, "ts": 1716454223861869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223929575, "dur": 22, "args": { "External id": 161020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161020, "pid": 5, "tid": 7, "ts": 1716454223929575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861885, "dur": 12, "args": { "External id": 161020, "cbid": 211, "correlation": 161020 } }, { "ph": "s", "id": 161020, "pid": 76337, "tid": -914061504, "ts": 1716454223861885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223929598, "dur": 44, "args": { "External id": 161022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161022, "pid": 5, "tid": 7, "ts": 1716454223929598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861899, "dur": 5, "args": { "External id": 161022, "cbid": 211, "correlation": 161022 } }, { "ph": "s", "id": 161022, "pid": 76337, "tid": -914061504, "ts": 1716454223861899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223929643, "dur": 645, "args": { "External id": 161024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161024, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161024, "pid": 5, "tid": 7, "ts": 1716454223929643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861912, "dur": 6, "args": { "External id": 161024, "cbid": 211, "correlation": 161024 } }, { "ph": "s", "id": 161024, "pid": 76337, "tid": -914061504, "ts": 1716454223861912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223930290, "dur": 21, "args": { "External id": 161026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161026, "pid": 5, "tid": 7, "ts": 1716454223930290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861921, "dur": 5, "args": { "External id": 161026, "cbid": 211, "correlation": 161026 } }, { "ph": "s", "id": 161026, "pid": 76337, "tid": -914061504, "ts": 1716454223861921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223930312, "dur": 33, "args": { "External id": 161032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161032, "pid": 5, "tid": 7, "ts": 1716454223930312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223861949, "dur": 9, "args": { "External id": 161032, "cbid": 211, "correlation": 161032 } }, { "ph": "s", "id": 161032, "pid": 76337, "tid": -914061504, "ts": 1716454223861949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223862018, "dur": 0, "args": { "External id": 161042, "cbid": 317, "correlation": 161042 } }, { "ph": "f", "id": 161042, "pid": 76337, "tid": -914061504, "ts": 1716454223862018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223862018, "dur": 0, "args": { "External id": 161043, "cbid": 203, "correlation": 161043 } }, { "ph": "f", "id": 161043, "pid": 76337, "tid": -914061504, "ts": 1716454223862018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223862019, "dur": 0, "args": { "External id": 161044, "cbid": 205, "correlation": 161044 } }, { "ph": "f", "id": 161044, "pid": 76337, "tid": -914061504, "ts": 1716454223862019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223930347, "dur": 57, "args": { "External id": 161048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161048, "pid": 5, "tid": 7, "ts": 1716454223930347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862031, "dur": 12, "args": { "External id": 161048, "cbid": 211, "correlation": 161048 } }, { "ph": "s", "id": 161048, "pid": 76337, "tid": -914061504, "ts": 1716454223862031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223930405, "dur": 268, "args": { "External id": 161050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161050, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161050, "pid": 5, "tid": 7, "ts": 1716454223930405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862050, "dur": 9, "args": { "External id": 161050, "cbid": 211, "correlation": 161050 } }, { "ph": "s", "id": 161050, "pid": 76337, "tid": -914061504, "ts": 1716454223862050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223930674, "dur": 22, "args": { "External id": 161052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161052, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161052, "pid": 5, "tid": 7, "ts": 1716454223930674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862062, "dur": 5, "args": { "External id": 161052, "cbid": 211, "correlation": 161052 } }, { "ph": "s", "id": 161052, "pid": 76337, "tid": -914061504, "ts": 1716454223862062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223930697, "dur": 32, "args": { "External id": 161058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161058, "pid": 5, "tid": 7, "ts": 1716454223930697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862088, "dur": 8, "args": { "External id": 161058, "cbid": 211, "correlation": 161058 } }, { "ph": "s", "id": 161058, "pid": 76337, "tid": -914061504, "ts": 1716454223862088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223930731, "dur": 27, "args": { "External id": 161066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161066, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161066, "pid": 5, "tid": 7, "ts": 1716454223930731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862117, "dur": 8, "args": { "External id": 161066, "cbid": 211, "correlation": 161066 } }, { "ph": "s", "id": 161066, "pid": 76337, "tid": -914061504, "ts": 1716454223862117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223930759, "dur": 20, "args": { "External id": 161074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161074, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161074, "pid": 5, "tid": 7, "ts": 1716454223930759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862145, "dur": 9, "args": { "External id": 161074, "cbid": 211, "correlation": 161074 } }, { "ph": "s", "id": 161074, "pid": 76337, "tid": -914061504, "ts": 1716454223862145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223930780, "dur": 30, "args": { "External id": 161094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161094, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 161094, "pid": 5, "tid": 7, "ts": 1716454223930780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862228, "dur": 12, "args": { "External id": 161094, "cbid": 211, "correlation": 161094 } }, { "ph": "s", "id": 161094, "pid": 76337, "tid": -914061504, "ts": 1716454223862228, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223930812, "dur": 4, "args": { "External id": 161106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161106, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 161106, "pid": 5, "tid": 7, "ts": 1716454223930812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862249, "dur": 7, "args": { "External id": 161106, "cbid": 211, "correlation": 161106 } }, { "ph": "s", "id": 161106, "pid": 76337, "tid": -914061504, "ts": 1716454223862249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223930817, "dur": 31, "args": { "External id": 161109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161109, "pid": 5, "tid": 7, "ts": 1716454223930817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862267, "dur": 8, "args": { "External id": 161109, "cbid": 211, "correlation": 161109 } }, { "ph": "s", "id": 161109, "pid": 76337, "tid": -914061504, "ts": 1716454223862267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223862326, "dur": 0, "args": { "External id": 161120, "cbid": 317, "correlation": 161120 } }, { "ph": "f", "id": 161120, "pid": 76337, "tid": -914061504, "ts": 1716454223862326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223862327, "dur": 0, "args": { "External id": 161121, "cbid": 203, "correlation": 161121 } }, { "ph": "f", "id": 161121, "pid": 76337, "tid": -914061504, "ts": 1716454223862327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223862327, "dur": 0, "args": { "External id": 161122, "cbid": 205, "correlation": 161122 } }, { "ph": "f", "id": 161122, "pid": 76337, "tid": -914061504, "ts": 1716454223862327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223930850, "dur": 22, "args": { "External id": 161126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161126, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161126, "pid": 5, "tid": 7, "ts": 1716454223930850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862341, "dur": 12, "args": { "External id": 161126, "cbid": 211, "correlation": 161126 } }, { "ph": "s", "id": 161126, "pid": 76337, "tid": -914061504, "ts": 1716454223862341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223930873, "dur": 104, "args": { "External id": 161128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161128, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161128, "pid": 5, "tid": 7, "ts": 1716454223930873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862360, "dur": 6, "args": { "External id": 161128, "cbid": 211, "correlation": 161128 } }, { "ph": "s", "id": 161128, "pid": 76337, "tid": -914061504, "ts": 1716454223862360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223930978, "dur": 23, "args": { "External id": 161130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161130, "pid": 5, "tid": 7, "ts": 1716454223930978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862369, "dur": 5, "args": { "External id": 161130, "cbid": 211, "correlation": 161130 } }, { "ph": "s", "id": 161130, "pid": 76337, "tid": -914061504, "ts": 1716454223862369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223931003, "dur": 33, "args": { "External id": 161136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161136, "pid": 5, "tid": 7, "ts": 1716454223931003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862396, "dur": 9, "args": { "External id": 161136, "cbid": 211, "correlation": 161136 } }, { "ph": "s", "id": 161136, "pid": 76337, "tid": -914061504, "ts": 1716454223862396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223931037, "dur": 204, "args": { "External id": 161145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161145, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161145, "pid": 5, "tid": 7, "ts": 1716454223931037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862478, "dur": 13, "args": { "External id": 161145, "cbid": 211, "correlation": 161145 } }, { "ph": "s", "id": 161145, "pid": 76337, "tid": -914061504, "ts": 1716454223862478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223931242, "dur": 65, "args": { "External id": 161167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161167, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161167, "pid": 5, "tid": 7, "ts": 1716454223931242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862536, "dur": 10, "args": { "External id": 161167, "cbid": 211, "correlation": 161167 } }, { "ph": "s", "id": 161167, "pid": 76337, "tid": -914061504, "ts": 1716454223862536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223862626, "dur": 1, "args": { "External id": 161178, "cbid": 251, "correlation": 161178 } }, { "ph": "f", "id": 161178, "pid": 76337, "tid": -914061504, "ts": 1716454223862626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223931309, "dur": 154, "args": { "External id": 161179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161179, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161179, "pid": 5, "tid": 7, "ts": 1716454223931309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862631, "dur": 13, "args": { "External id": 161179, "cbid": 211, "correlation": 161179 } }, { "ph": "s", "id": 161179, "pid": 76337, "tid": -914061504, "ts": 1716454223862631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223862704, "dur": 1, "args": { "External id": 161190, "cbid": 251, "correlation": 161190 } }, { "ph": "f", "id": 161190, "pid": 76337, "tid": -914061504, "ts": 1716454223862704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223931465, "dur": 147, "args": { "External id": 161191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161191, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161191, "pid": 5, "tid": 7, "ts": 1716454223931465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862708, "dur": 11, "args": { "External id": 161191, "cbid": 211, "correlation": 161191 } }, { "ph": "s", "id": 161191, "pid": 76337, "tid": -914061504, "ts": 1716454223862708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223862772, "dur": 1, "args": { "External id": 161202, "cbid": 251, "correlation": 161202 } }, { "ph": "f", "id": 161202, "pid": 76337, "tid": -914061504, "ts": 1716454223862772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223931613, "dur": 145, "args": { "External id": 161203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161203, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161203, "pid": 5, "tid": 7, "ts": 1716454223931613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862776, "dur": 11, "args": { "External id": 161203, "cbid": 211, "correlation": 161203 } }, { "ph": "s", "id": 161203, "pid": 76337, "tid": -914061504, "ts": 1716454223862776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223931760, "dur": 1948, "args": { "External id": 161224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161224, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 161224, "pid": 5, "tid": 7, "ts": 1716454223931760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862860, "dur": 14, "args": { "External id": 161224, "cbid": 211, "correlation": 161224 } }, { "ph": "s", "id": 161224, "pid": 76337, "tid": -914061504, "ts": 1716454223862860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223862964, "dur": 2, "args": { "External id": 161242, "cbid": 251, "correlation": 161242 } }, { "ph": "f", "id": 161242, "pid": 76337, "tid": -914061504, "ts": 1716454223862964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223933709, "dur": 148, "args": { "External id": 161244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161244, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 161244, "pid": 5, "tid": 7, "ts": 1716454223933709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223862970, "dur": 23, "args": { "External id": 161244, "cbid": 211, "correlation": 161244 } }, { "ph": "s", "id": 161244, "pid": 76337, "tid": -914061504, "ts": 1716454223862970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223933858, "dur": 35, "args": { "External id": 161252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161252, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161252, "pid": 5, "tid": 7, "ts": 1716454223933858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863052, "dur": 12, "args": { "External id": 161252, "cbid": 211, "correlation": 161252 } }, { "ph": "s", "id": 161252, "pid": 76337, "tid": -914061504, "ts": 1716454223863052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223933895, "dur": 50, "args": { "External id": 161260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161260, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161260, "pid": 5, "tid": 7, "ts": 1716454223933895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863090, "dur": 9, "args": { "External id": 161260, "cbid": 211, "correlation": 161260 } }, { "ph": "s", "id": 161260, "pid": 76337, "tid": -914061504, "ts": 1716454223863090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223933947, "dur": 30, "args": { "External id": 161271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161271, "pid": 5, "tid": 7, "ts": 1716454223933947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863163, "dur": 12, "args": { "External id": 161271, "cbid": 211, "correlation": 161271 } }, { "ph": "s", "id": 161271, "pid": 76337, "tid": -914061504, "ts": 1716454223863163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223933979, "dur": 35, "args": { "External id": 161293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161293, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161293, "pid": 5, "tid": 7, "ts": 1716454223933979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863194, "dur": 8, "args": { "External id": 161293, "cbid": 211, "correlation": 161293 } }, { "ph": "s", "id": 161293, "pid": 76337, "tid": -914061504, "ts": 1716454223863194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863279, "dur": 1, "args": { "External id": 161304, "cbid": 251, "correlation": 161304 } }, { "ph": "f", "id": 161304, "pid": 76337, "tid": -914061504, "ts": 1716454223863279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223934015, "dur": 91, "args": { "External id": 161305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161305, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161305, "pid": 5, "tid": 7, "ts": 1716454223934015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863284, "dur": 13, "args": { "External id": 161305, "cbid": 211, "correlation": 161305 } }, { "ph": "s", "id": 161305, "pid": 76337, "tid": -914061504, "ts": 1716454223863284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863354, "dur": 1, "args": { "External id": 161316, "cbid": 251, "correlation": 161316 } }, { "ph": "f", "id": 161316, "pid": 76337, "tid": -914061504, "ts": 1716454223863354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863358, "dur": 0, "args": { "External id": 161317, "cbid": 251, "correlation": 161317 } }, { "ph": "f", "id": 161317, "pid": 76337, "tid": -914061504, "ts": 1716454223863358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223934107, "dur": 11, "args": { "External id": 161318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161318, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 161318, "pid": 5, "tid": 7, "ts": 1716454223934107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863359, "dur": 11, "args": { "External id": 161318, "cbid": 211, "correlation": 161318 } }, { "ph": "s", "id": 161318, "pid": 76337, "tid": -914061504, "ts": 1716454223863359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223934119, "dur": 5, "args": { "External id": 161320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161320, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 161320, "pid": 5, "tid": 7, "ts": 1716454223934119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863373, "dur": 6, "args": { "External id": 161320, "cbid": 211, "correlation": 161320 } }, { "ph": "s", "id": 161320, "pid": 76337, "tid": -914061504, "ts": 1716454223863373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863430, "dur": 1, "args": { "External id": 161331, "cbid": 251, "correlation": 161331 } }, { "ph": "f", "id": 161331, "pid": 76337, "tid": -914061504, "ts": 1716454223863430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863434, "dur": 0, "args": { "External id": 161332, "cbid": 251, "correlation": 161332 } }, { "ph": "f", "id": 161332, "pid": 76337, "tid": -914061504, "ts": 1716454223863434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223934126, "dur": 7, "args": { "External id": 161333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161333, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 161333, "pid": 5, "tid": 7, "ts": 1716454223934126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863435, "dur": 12, "args": { "External id": 161333, "cbid": 211, "correlation": 161333 } }, { "ph": "s", "id": 161333, "pid": 76337, "tid": -914061504, "ts": 1716454223863435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223934135, "dur": 4, "args": { "External id": 161335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161335, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 161335, "pid": 5, "tid": 7, "ts": 1716454223934135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863449, "dur": 6, "args": { "External id": 161335, "cbid": 211, "correlation": 161335 } }, { "ph": "s", "id": 161335, "pid": 76337, "tid": -914061504, "ts": 1716454223863449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223934139, "dur": 91, "args": { "External id": 161356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161356, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 161356, "pid": 5, "tid": 7, "ts": 1716454223934139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863523, "dur": 12, "args": { "External id": 161356, "cbid": 211, "correlation": 161356 } }, { "ph": "s", "id": 161356, "pid": 76337, "tid": -914061504, "ts": 1716454223863523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863620, "dur": 1, "args": { "External id": 161374, "cbid": 251, "correlation": 161374 } }, { "ph": "f", "id": 161374, "pid": 76337, "tid": -914061504, "ts": 1716454223863620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223934232, "dur": 98, "args": { "External id": 161376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161376, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161376, "pid": 5, "tid": 7, "ts": 1716454223934232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863626, "dur": 13, "args": { "External id": 161376, "cbid": 211, "correlation": 161376 } }, { "ph": "s", "id": 161376, "pid": 76337, "tid": -914061504, "ts": 1716454223863626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223934332, "dur": 19, "args": { "External id": 161384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161384, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161384, "pid": 5, "tid": 7, "ts": 1716454223934332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863695, "dur": 13, "args": { "External id": 161384, "cbid": 211, "correlation": 161384 } }, { "ph": "s", "id": 161384, "pid": 76337, "tid": -914061504, "ts": 1716454223863695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223934352, "dur": 38, "args": { "External id": 161392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161392, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161392, "pid": 5, "tid": 7, "ts": 1716454223934352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863736, "dur": 10, "args": { "External id": 161392, "cbid": 211, "correlation": 161392 } }, { "ph": "s", "id": 161392, "pid": 76337, "tid": -914061504, "ts": 1716454223863736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223934392, "dur": 34, "args": { "External id": 161414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161414, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161414, "pid": 5, "tid": 7, "ts": 1716454223934392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863788, "dur": 11, "args": { "External id": 161414, "cbid": 211, "correlation": 161414 } }, { "ph": "s", "id": 161414, "pid": 76337, "tid": -914061504, "ts": 1716454223863788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863877, "dur": 1, "args": { "External id": 161430, "cbid": 251, "correlation": 161430 } }, { "ph": "f", "id": 161430, "pid": 76337, "tid": -914061504, "ts": 1716454223863877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223863882, "dur": 0, "args": { "External id": 161432, "cbid": 251, "correlation": 161432 } }, { "ph": "f", "id": 161432, "pid": 76337, "tid": -914061504, "ts": 1716454223863882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223934427, "dur": 541, "args": { "External id": 161433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161433, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 161433, "pid": 5, "tid": 7, "ts": 1716454223934427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863886, "dur": 13, "args": { "External id": 161433, "cbid": 211, "correlation": 161433 } }, { "ph": "s", "id": 161433, "pid": 76337, "tid": -914061504, "ts": 1716454223863886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223934970, "dur": 125, "args": { "External id": 161441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161441, "pid": 5, "tid": 7, "ts": 1716454223934970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863953, "dur": 12, "args": { "External id": 161441, "cbid": 211, "correlation": 161441 } }, { "ph": "s", "id": 161441, "pid": 76337, "tid": -914061504, "ts": 1716454223863953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223935096, "dur": 131, "args": { "External id": 161449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161449, "pid": 5, "tid": 7, "ts": 1716454223935096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223863998, "dur": 10, "args": { "External id": 161449, "cbid": 211, "correlation": 161449 } }, { "ph": "s", "id": 161449, "pid": 76337, "tid": -914061504, "ts": 1716454223863998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223864077, "dur": 1, "args": { "External id": 161465, "cbid": 251, "correlation": 161465 } }, { "ph": "f", "id": 161465, "pid": 76337, "tid": -914061504, "ts": 1716454223864077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223935228, "dur": 307, "args": { "External id": 161467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161467, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161467, "pid": 5, "tid": 7, "ts": 1716454223935228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864083, "dur": 13, "args": { "External id": 161467, "cbid": 211, "correlation": 161467 } }, { "ph": "s", "id": 161467, "pid": 76337, "tid": -914061504, "ts": 1716454223864083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223935537, "dur": 27, "args": { "External id": 161475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161475, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161475, "pid": 5, "tid": 7, "ts": 1716454223935537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864126, "dur": 9, "args": { "External id": 161475, "cbid": 211, "correlation": 161475 } }, { "ph": "s", "id": 161475, "pid": 76337, "tid": -914061504, "ts": 1716454223864126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223935566, "dur": 81, "args": { "External id": 161486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161486, "pid": 5, "tid": 7, "ts": 1716454223935566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864193, "dur": 13, "args": { "External id": 161486, "cbid": 211, "correlation": 161486 } }, { "ph": "s", "id": 161486, "pid": 76337, "tid": -914061504, "ts": 1716454223864193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223864258, "dur": 0, "args": { "External id": 161498, "cbid": 317, "correlation": 161498 } }, { "ph": "f", "id": 161498, "pid": 76337, "tid": -914061504, "ts": 1716454223864258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223864259, "dur": 0, "args": { "External id": 161499, "cbid": 203, "correlation": 161499 } }, { "ph": "f", "id": 161499, "pid": 76337, "tid": -914061504, "ts": 1716454223864259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223864260, "dur": 0, "args": { "External id": 161500, "cbid": 205, "correlation": 161500 } }, { "ph": "f", "id": 161500, "pid": 76337, "tid": -914061504, "ts": 1716454223864260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223935648, "dur": 23, "args": { "External id": 161504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161504, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161504, "pid": 5, "tid": 7, "ts": 1716454223935648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864275, "dur": 12, "args": { "External id": 161504, "cbid": 211, "correlation": 161504 } }, { "ph": "s", "id": 161504, "pid": 76337, "tid": -914061504, "ts": 1716454223864275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223935672, "dur": 120, "args": { "External id": 161506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161506, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161506, "pid": 5, "tid": 7, "ts": 1716454223935672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864294, "dur": 7, "args": { "External id": 161506, "cbid": 211, "correlation": 161506 } }, { "ph": "s", "id": 161506, "pid": 76337, "tid": -914061504, "ts": 1716454223864294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223935794, "dur": 23, "args": { "External id": 161508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161508, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161508, "pid": 5, "tid": 7, "ts": 1716454223935794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864304, "dur": 5, "args": { "External id": 161508, "cbid": 211, "correlation": 161508 } }, { "ph": "s", "id": 161508, "pid": 76337, "tid": -914061504, "ts": 1716454223864304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223935818, "dur": 33, "args": { "External id": 161514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161514, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161514, "pid": 5, "tid": 7, "ts": 1716454223935818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864332, "dur": 8, "args": { "External id": 161514, "cbid": 211, "correlation": 161514 } }, { "ph": "s", "id": 161514, "pid": 76337, "tid": -914061504, "ts": 1716454223864332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223935852, "dur": 27, "args": { "External id": 161522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161522, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161522, "pid": 5, "tid": 7, "ts": 1716454223935852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864363, "dur": 8, "args": { "External id": 161522, "cbid": 211, "correlation": 161522 } }, { "ph": "s", "id": 161522, "pid": 76337, "tid": -914061504, "ts": 1716454223864363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223935880, "dur": 53, "args": { "External id": 161531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161531, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161531, "pid": 5, "tid": 7, "ts": 1716454223935880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864403, "dur": 10, "args": { "External id": 161531, "cbid": 211, "correlation": 161531 } }, { "ph": "s", "id": 161531, "pid": 76337, "tid": -914061504, "ts": 1716454223864403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223935935, "dur": 54, "args": { "External id": 161551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161551, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 161551, "pid": 5, "tid": 7, "ts": 1716454223935935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864473, "dur": 11, "args": { "External id": 161551, "cbid": 211, "correlation": 161551 } }, { "ph": "s", "id": 161551, "pid": 76337, "tid": -914061504, "ts": 1716454223864473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223935990, "dur": 4, "args": { "External id": 161563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161563, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 161563, "pid": 5, "tid": 7, "ts": 1716454223935990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864495, "dur": 6, "args": { "External id": 161563, "cbid": 211, "correlation": 161563 } }, { "ph": "s", "id": 161563, "pid": 76337, "tid": -914061504, "ts": 1716454223864495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223935996, "dur": 57, "args": { "External id": 161566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161566, "pid": 5, "tid": 7, "ts": 1716454223935996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864513, "dur": 6, "args": { "External id": 161566, "cbid": 211, "correlation": 161566 } }, { "ph": "s", "id": 161566, "pid": 76337, "tid": -914061504, "ts": 1716454223864513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223936055, "dur": 38, "args": { "External id": 161575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161575, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161575, "pid": 5, "tid": 7, "ts": 1716454223936055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864554, "dur": 9, "args": { "External id": 161575, "cbid": 211, "correlation": 161575 } }, { "ph": "s", "id": 161575, "pid": 76337, "tid": -914061504, "ts": 1716454223864554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223864607, "dur": 0, "args": { "External id": 161585, "cbid": 317, "correlation": 161585 } }, { "ph": "f", "id": 161585, "pid": 76337, "tid": -914061504, "ts": 1716454223864607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223864608, "dur": 0, "args": { "External id": 161586, "cbid": 203, "correlation": 161586 } }, { "ph": "f", "id": 161586, "pid": 76337, "tid": -914061504, "ts": 1716454223864608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223864608, "dur": 0, "args": { "External id": 161587, "cbid": 205, "correlation": 161587 } }, { "ph": "f", "id": 161587, "pid": 76337, "tid": -914061504, "ts": 1716454223864608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223936094, "dur": 39, "args": { "External id": 161591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161591, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161591, "pid": 5, "tid": 7, "ts": 1716454223936094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864622, "dur": 11, "args": { "External id": 161591, "cbid": 211, "correlation": 161591 } }, { "ph": "s", "id": 161591, "pid": 76337, "tid": -914061504, "ts": 1716454223864622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223936134, "dur": 83, "args": { "External id": 161593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161593, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161593, "pid": 5, "tid": 7, "ts": 1716454223936134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864636, "dur": 5, "args": { "External id": 161593, "cbid": 211, "correlation": 161593 } }, { "ph": "s", "id": 161593, "pid": 76337, "tid": -914061504, "ts": 1716454223864636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223936218, "dur": 1279, "args": { "External id": 161595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161595, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161595, "pid": 5, "tid": 7, "ts": 1716454223936218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864648, "dur": 7, "args": { "External id": 161595, "cbid": 211, "correlation": 161595 } }, { "ph": "s", "id": 161595, "pid": 76337, "tid": -914061504, "ts": 1716454223864648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223937498, "dur": 23, "args": { "External id": 161597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161597, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161597, "pid": 5, "tid": 7, "ts": 1716454223937498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864658, "dur": 5, "args": { "External id": 161597, "cbid": 211, "correlation": 161597 } }, { "ph": "s", "id": 161597, "pid": 76337, "tid": -914061504, "ts": 1716454223864658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223937523, "dur": 33, "args": { "External id": 161603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161603, "pid": 5, "tid": 7, "ts": 1716454223937523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864685, "dur": 9, "args": { "External id": 161603, "cbid": 211, "correlation": 161603 } }, { "ph": "s", "id": 161603, "pid": 76337, "tid": -914061504, "ts": 1716454223864685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223937557, "dur": 3, "args": { "External id": 161611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161611, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 161611, "pid": 5, "tid": 7, "ts": 1716454223937557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864729, "dur": 9, "args": { "External id": 161611, "cbid": 211, "correlation": 161611 } }, { "ph": "s", "id": 161611, "pid": 76337, "tid": -914061504, "ts": 1716454223864729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223864797, "dur": 1, "args": { "External id": 161627, "cbid": 251, "correlation": 161627 } }, { "ph": "f", "id": 161627, "pid": 76337, "tid": -914061504, "ts": 1716454223864797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223864802, "dur": 0, "args": { "External id": 161629, "cbid": 251, "correlation": 161629 } }, { "ph": "f", "id": 161629, "pid": 76337, "tid": -914061504, "ts": 1716454223864802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223937562, "dur": 12, "args": { "External id": 161630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161630, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 161630, "pid": 5, "tid": 7, "ts": 1716454223937562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864804, "dur": 11, "args": { "External id": 161630, "cbid": 211, "correlation": 161630 } }, { "ph": "s", "id": 161630, "pid": 76337, "tid": -914061504, "ts": 1716454223864804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223937575, "dur": 5, "args": { "External id": 161632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161632, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 161632, "pid": 5, "tid": 7, "ts": 1716454223937575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864817, "dur": 5, "args": { "External id": 161632, "cbid": 211, "correlation": 161632 } }, { "ph": "s", "id": 161632, "pid": 76337, "tid": -914061504, "ts": 1716454223864817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223937581, "dur": 29, "args": { "External id": 161642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161642, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161642, "pid": 5, "tid": 7, "ts": 1716454223937581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864875, "dur": 12, "args": { "External id": 161642, "cbid": 211, "correlation": 161642 } }, { "ph": "s", "id": 161642, "pid": 76337, "tid": -914061504, "ts": 1716454223864875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223937612, "dur": 31, "args": { "External id": 161662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161662, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 161662, "pid": 5, "tid": 7, "ts": 1716454223937612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864941, "dur": 11, "args": { "External id": 161662, "cbid": 211, "correlation": 161662 } }, { "ph": "s", "id": 161662, "pid": 76337, "tid": -914061504, "ts": 1716454223864941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223937644, "dur": 4, "args": { "External id": 161674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161674, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 161674, "pid": 5, "tid": 7, "ts": 1716454223937644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864962, "dur": 6, "args": { "External id": 161674, "cbid": 211, "correlation": 161674 } }, { "ph": "s", "id": 161674, "pid": 76337, "tid": -914061504, "ts": 1716454223864962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223937649, "dur": 30, "args": { "External id": 161677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161677, "pid": 5, "tid": 7, "ts": 1716454223937649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223864988, "dur": 8, "args": { "External id": 161677, "cbid": 211, "correlation": 161677 } }, { "ph": "s", "id": 161677, "pid": 76337, "tid": -914061504, "ts": 1716454223864988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223937680, "dur": 20, "args": { "External id": 161686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161686, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161686, "pid": 5, "tid": 7, "ts": 1716454223937680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865031, "dur": 10, "args": { "External id": 161686, "cbid": 211, "correlation": 161686 } }, { "ph": "s", "id": 161686, "pid": 76337, "tid": -914061504, "ts": 1716454223865031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223865093, "dur": 0, "args": { "External id": 161696, "cbid": 317, "correlation": 161696 } }, { "ph": "f", "id": 161696, "pid": 76337, "tid": -914061504, "ts": 1716454223865093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223865094, "dur": 0, "args": { "External id": 161697, "cbid": 203, "correlation": 161697 } }, { "ph": "f", "id": 161697, "pid": 76337, "tid": -914061504, "ts": 1716454223865094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223865095, "dur": 0, "args": { "External id": 161698, "cbid": 205, "correlation": 161698 } }, { "ph": "f", "id": 161698, "pid": 76337, "tid": -914061504, "ts": 1716454223865095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223937701, "dur": 23, "args": { "External id": 161702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161702, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161702, "pid": 5, "tid": 7, "ts": 1716454223937701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865110, "dur": 12, "args": { "External id": 161702, "cbid": 211, "correlation": 161702 } }, { "ph": "s", "id": 161702, "pid": 76337, "tid": -914061504, "ts": 1716454223865110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223937725, "dur": 44, "args": { "External id": 161704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161704, "pid": 5, "tid": 7, "ts": 1716454223937725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865124, "dur": 5, "args": { "External id": 161704, "cbid": 211, "correlation": 161704 } }, { "ph": "s", "id": 161704, "pid": 76337, "tid": -914061504, "ts": 1716454223865124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223937770, "dur": 644, "args": { "External id": 161706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161706, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161706, "pid": 5, "tid": 7, "ts": 1716454223937770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865135, "dur": 6, "args": { "External id": 161706, "cbid": 211, "correlation": 161706 } }, { "ph": "s", "id": 161706, "pid": 76337, "tid": -914061504, "ts": 1716454223865135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223938416, "dur": 21, "args": { "External id": 161708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161708, "pid": 5, "tid": 7, "ts": 1716454223938416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865145, "dur": 6, "args": { "External id": 161708, "cbid": 211, "correlation": 161708 } }, { "ph": "s", "id": 161708, "pid": 76337, "tid": -914061504, "ts": 1716454223865145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223938438, "dur": 33, "args": { "External id": 161714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161714, "pid": 5, "tid": 7, "ts": 1716454223938438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865173, "dur": 9, "args": { "External id": 161714, "cbid": 211, "correlation": 161714 } }, { "ph": "s", "id": 161714, "pid": 76337, "tid": -914061504, "ts": 1716454223865173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223865233, "dur": 0, "args": { "External id": 161724, "cbid": 317, "correlation": 161724 } }, { "ph": "f", "id": 161724, "pid": 76337, "tid": -914061504, "ts": 1716454223865233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223865233, "dur": 0, "args": { "External id": 161725, "cbid": 203, "correlation": 161725 } }, { "ph": "f", "id": 161725, "pid": 76337, "tid": -914061504, "ts": 1716454223865233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223865234, "dur": 0, "args": { "External id": 161726, "cbid": 205, "correlation": 161726 } }, { "ph": "f", "id": 161726, "pid": 76337, "tid": -914061504, "ts": 1716454223865234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223938473, "dur": 39, "args": { "External id": 161730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161730, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161730, "pid": 5, "tid": 7, "ts": 1716454223938473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865247, "dur": 14, "args": { "External id": 161730, "cbid": 211, "correlation": 161730 } }, { "ph": "s", "id": 161730, "pid": 76337, "tid": -914061504, "ts": 1716454223865247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223938513, "dur": 192, "args": { "External id": 161732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161732, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161732, "pid": 5, "tid": 7, "ts": 1716454223938513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865266, "dur": 6, "args": { "External id": 161732, "cbid": 211, "correlation": 161732 } }, { "ph": "s", "id": 161732, "pid": 76337, "tid": -914061504, "ts": 1716454223865266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223938707, "dur": 22, "args": { "External id": 161734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161734, "pid": 5, "tid": 7, "ts": 1716454223938707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865277, "dur": 5, "args": { "External id": 161734, "cbid": 211, "correlation": 161734 } }, { "ph": "s", "id": 161734, "pid": 76337, "tid": -914061504, "ts": 1716454223865277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223938730, "dur": 33, "args": { "External id": 161740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161740, "pid": 5, "tid": 7, "ts": 1716454223938730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865303, "dur": 8, "args": { "External id": 161740, "cbid": 211, "correlation": 161740 } }, { "ph": "s", "id": 161740, "pid": 76337, "tid": -914061504, "ts": 1716454223865303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223938764, "dur": 27, "args": { "External id": 161748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161748, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161748, "pid": 5, "tid": 7, "ts": 1716454223938764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865331, "dur": 8, "args": { "External id": 161748, "cbid": 211, "correlation": 161748 } }, { "ph": "s", "id": 161748, "pid": 76337, "tid": -914061504, "ts": 1716454223865331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223938792, "dur": 19, "args": { "External id": 161756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161756, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161756, "pid": 5, "tid": 7, "ts": 1716454223938792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865361, "dur": 8, "args": { "External id": 161756, "cbid": 211, "correlation": 161756 } }, { "ph": "s", "id": 161756, "pid": 76337, "tid": -914061504, "ts": 1716454223865361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223938813, "dur": 31, "args": { "External id": 161776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161776, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 161776, "pid": 5, "tid": 7, "ts": 1716454223938813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865443, "dur": 13, "args": { "External id": 161776, "cbid": 211, "correlation": 161776 } }, { "ph": "s", "id": 161776, "pid": 76337, "tid": -914061504, "ts": 1716454223865443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223938845, "dur": 4, "args": { "External id": 161788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161788, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 161788, "pid": 5, "tid": 7, "ts": 1716454223938845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865466, "dur": 6, "args": { "External id": 161788, "cbid": 211, "correlation": 161788 } }, { "ph": "s", "id": 161788, "pid": 76337, "tid": -914061504, "ts": 1716454223865466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223938850, "dur": 30, "args": { "External id": 161791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161791, "pid": 5, "tid": 7, "ts": 1716454223938850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865483, "dur": 7, "args": { "External id": 161791, "cbid": 211, "correlation": 161791 } }, { "ph": "s", "id": 161791, "pid": 76337, "tid": -914061504, "ts": 1716454223865483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223865540, "dur": 0, "args": { "External id": 161802, "cbid": 317, "correlation": 161802 } }, { "ph": "f", "id": 161802, "pid": 76337, "tid": -914061504, "ts": 1716454223865540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223865541, "dur": 0, "args": { "External id": 161803, "cbid": 203, "correlation": 161803 } }, { "ph": "f", "id": 161803, "pid": 76337, "tid": -914061504, "ts": 1716454223865541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223865541, "dur": 0, "args": { "External id": 161804, "cbid": 205, "correlation": 161804 } }, { "ph": "f", "id": 161804, "pid": 76337, "tid": -914061504, "ts": 1716454223865541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223938882, "dur": 24, "args": { "External id": 161808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161808, "pid": 5, "tid": 7, "ts": 1716454223938882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865555, "dur": 12, "args": { "External id": 161808, "cbid": 211, "correlation": 161808 } }, { "ph": "s", "id": 161808, "pid": 76337, "tid": -914061504, "ts": 1716454223865555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223938907, "dur": 105, "args": { "External id": 161810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161810, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161810, "pid": 5, "tid": 7, "ts": 1716454223938907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865573, "dur": 6, "args": { "External id": 161810, "cbid": 211, "correlation": 161810 } }, { "ph": "s", "id": 161810, "pid": 76337, "tid": -914061504, "ts": 1716454223865573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223939013, "dur": 22, "args": { "External id": 161812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161812, "pid": 5, "tid": 7, "ts": 1716454223939013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865582, "dur": 5, "args": { "External id": 161812, "cbid": 211, "correlation": 161812 } }, { "ph": "s", "id": 161812, "pid": 76337, "tid": -914061504, "ts": 1716454223865582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223939037, "dur": 33, "args": { "External id": 161818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161818, "pid": 5, "tid": 7, "ts": 1716454223939037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865609, "dur": 8, "args": { "External id": 161818, "cbid": 211, "correlation": 161818 } }, { "ph": "s", "id": 161818, "pid": 76337, "tid": -914061504, "ts": 1716454223865609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223939071, "dur": 200, "args": { "External id": 161827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161827, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161827, "pid": 5, "tid": 7, "ts": 1716454223939071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865690, "dur": 15, "args": { "External id": 161827, "cbid": 211, "correlation": 161827 } }, { "ph": "s", "id": 161827, "pid": 76337, "tid": -914061504, "ts": 1716454223865690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223939273, "dur": 65, "args": { "External id": 161849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161849, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161849, "pid": 5, "tid": 7, "ts": 1716454223939273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865748, "dur": 10, "args": { "External id": 161849, "cbid": 211, "correlation": 161849 } }, { "ph": "s", "id": 161849, "pid": 76337, "tid": -914061504, "ts": 1716454223865748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223865836, "dur": 1, "args": { "External id": 161860, "cbid": 251, "correlation": 161860 } }, { "ph": "f", "id": 161860, "pid": 76337, "tid": -914061504, "ts": 1716454223865836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223939339, "dur": 155, "args": { "External id": 161861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161861, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161861, "pid": 5, "tid": 7, "ts": 1716454223939339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865840, "dur": 14, "args": { "External id": 161861, "cbid": 211, "correlation": 161861 } }, { "ph": "s", "id": 161861, "pid": 76337, "tid": -914061504, "ts": 1716454223865840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223865911, "dur": 1, "args": { "External id": 161872, "cbid": 251, "correlation": 161872 } }, { "ph": "f", "id": 161872, "pid": 76337, "tid": -914061504, "ts": 1716454223865911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223939496, "dur": 145, "args": { "External id": 161873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161873, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161873, "pid": 5, "tid": 7, "ts": 1716454223939496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865915, "dur": 11, "args": { "External id": 161873, "cbid": 211, "correlation": 161873 } }, { "ph": "s", "id": 161873, "pid": 76337, "tid": -914061504, "ts": 1716454223865915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223865988, "dur": 1, "args": { "External id": 161884, "cbid": 251, "correlation": 161884 } }, { "ph": "f", "id": 161884, "pid": 76337, "tid": -914061504, "ts": 1716454223865988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223939642, "dur": 145, "args": { "External id": 161885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161885, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161885, "pid": 5, "tid": 7, "ts": 1716454223939642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223865992, "dur": 13, "args": { "External id": 161885, "cbid": 211, "correlation": 161885 } }, { "ph": "s", "id": 161885, "pid": 76337, "tid": -914061504, "ts": 1716454223865992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223939789, "dur": 1948, "args": { "External id": 161906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161906, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 161906, "pid": 5, "tid": 7, "ts": 1716454223939789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866075, "dur": 13, "args": { "External id": 161906, "cbid": 211, "correlation": 161906 } }, { "ph": "s", "id": 161906, "pid": 76337, "tid": -914061504, "ts": 1716454223866075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866176, "dur": 1, "args": { "External id": 161924, "cbid": 251, "correlation": 161924 } }, { "ph": "f", "id": 161924, "pid": 76337, "tid": -914061504, "ts": 1716454223866176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223941739, "dur": 148, "args": { "External id": 161926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161926, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 161926, "pid": 5, "tid": 7, "ts": 1716454223941739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866182, "dur": 13, "args": { "External id": 161926, "cbid": 211, "correlation": 161926 } }, { "ph": "s", "id": 161926, "pid": 76337, "tid": -914061504, "ts": 1716454223866182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223941888, "dur": 35, "args": { "External id": 161934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161934, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161934, "pid": 5, "tid": 7, "ts": 1716454223941888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866253, "dur": 12, "args": { "External id": 161934, "cbid": 211, "correlation": 161934 } }, { "ph": "s", "id": 161934, "pid": 76337, "tid": -914061504, "ts": 1716454223866253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223941925, "dur": 51, "args": { "External id": 161942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161942, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161942, "pid": 5, "tid": 7, "ts": 1716454223941925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866291, "dur": 9, "args": { "External id": 161942, "cbid": 211, "correlation": 161942 } }, { "ph": "s", "id": 161942, "pid": 76337, "tid": -914061504, "ts": 1716454223866291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223941978, "dur": 30, "args": { "External id": 161953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161953, "pid": 5, "tid": 7, "ts": 1716454223941978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866364, "dur": 13, "args": { "External id": 161953, "cbid": 211, "correlation": 161953 } }, { "ph": "s", "id": 161953, "pid": 76337, "tid": -914061504, "ts": 1716454223866364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223942009, "dur": 34, "args": { "External id": 161975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161975, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 161975, "pid": 5, "tid": 7, "ts": 1716454223942009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866395, "dur": 8, "args": { "External id": 161975, "cbid": 211, "correlation": 161975 } }, { "ph": "s", "id": 161975, "pid": 76337, "tid": -914061504, "ts": 1716454223866395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866479, "dur": 1, "args": { "External id": 161986, "cbid": 251, "correlation": 161986 } }, { "ph": "f", "id": 161986, "pid": 76337, "tid": -914061504, "ts": 1716454223866479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223942044, "dur": 77, "args": { "External id": 161987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 161987, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 161987, "pid": 5, "tid": 7, "ts": 1716454223942044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866484, "dur": 13, "args": { "External id": 161987, "cbid": 211, "correlation": 161987 } }, { "ph": "s", "id": 161987, "pid": 76337, "tid": -914061504, "ts": 1716454223866484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866553, "dur": 1, "args": { "External id": 161998, "cbid": 251, "correlation": 161998 } }, { "ph": "f", "id": 161998, "pid": 76337, "tid": -914061504, "ts": 1716454223866553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866557, "dur": 0, "args": { "External id": 161999, "cbid": 251, "correlation": 161999 } }, { "ph": "f", "id": 161999, "pid": 76337, "tid": -914061504, "ts": 1716454223866557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223942122, "dur": 12, "args": { "External id": 162000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162000, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 162000, "pid": 5, "tid": 7, "ts": 1716454223942122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866558, "dur": 11, "args": { "External id": 162000, "cbid": 211, "correlation": 162000 } }, { "ph": "s", "id": 162000, "pid": 76337, "tid": -914061504, "ts": 1716454223866558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223942135, "dur": 5, "args": { "External id": 162002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162002, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 162002, "pid": 5, "tid": 7, "ts": 1716454223942135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866571, "dur": 6, "args": { "External id": 162002, "cbid": 211, "correlation": 162002 } }, { "ph": "s", "id": 162002, "pid": 76337, "tid": -914061504, "ts": 1716454223866571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866630, "dur": 1, "args": { "External id": 162013, "cbid": 251, "correlation": 162013 } }, { "ph": "f", "id": 162013, "pid": 76337, "tid": -914061504, "ts": 1716454223866630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866634, "dur": 0, "args": { "External id": 162014, "cbid": 251, "correlation": 162014 } }, { "ph": "f", "id": 162014, "pid": 76337, "tid": -914061504, "ts": 1716454223866634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223942142, "dur": 7, "args": { "External id": 162015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162015, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 162015, "pid": 5, "tid": 7, "ts": 1716454223942142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866635, "dur": 13, "args": { "External id": 162015, "cbid": 211, "correlation": 162015 } }, { "ph": "s", "id": 162015, "pid": 76337, "tid": -914061504, "ts": 1716454223866635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223942150, "dur": 4, "args": { "External id": 162017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162017, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 162017, "pid": 5, "tid": 7, "ts": 1716454223942150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866649, "dur": 5, "args": { "External id": 162017, "cbid": 211, "correlation": 162017 } }, { "ph": "s", "id": 162017, "pid": 76337, "tid": -914061504, "ts": 1716454223866649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223942155, "dur": 91, "args": { "External id": 162038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162038, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 162038, "pid": 5, "tid": 7, "ts": 1716454223942155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866722, "dur": 12, "args": { "External id": 162038, "cbid": 211, "correlation": 162038 } }, { "ph": "s", "id": 162038, "pid": 76337, "tid": -914061504, "ts": 1716454223866722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223866820, "dur": 1, "args": { "External id": 162056, "cbid": 251, "correlation": 162056 } }, { "ph": "f", "id": 162056, "pid": 76337, "tid": -914061504, "ts": 1716454223866820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223942248, "dur": 99, "args": { "External id": 162058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162058, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162058, "pid": 5, "tid": 7, "ts": 1716454223942248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866825, "dur": 13, "args": { "External id": 162058, "cbid": 211, "correlation": 162058 } }, { "ph": "s", "id": 162058, "pid": 76337, "tid": -914061504, "ts": 1716454223866825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223942348, "dur": 19, "args": { "External id": 162066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162066, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162066, "pid": 5, "tid": 7, "ts": 1716454223942348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866895, "dur": 12, "args": { "External id": 162066, "cbid": 211, "correlation": 162066 } }, { "ph": "s", "id": 162066, "pid": 76337, "tid": -914061504, "ts": 1716454223866895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223942369, "dur": 37, "args": { "External id": 162074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162074, "pid": 5, "tid": 7, "ts": 1716454223942369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866936, "dur": 10, "args": { "External id": 162074, "cbid": 211, "correlation": 162074 } }, { "ph": "s", "id": 162074, "pid": 76337, "tid": -914061504, "ts": 1716454223866936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223942407, "dur": 35, "args": { "External id": 162096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162096, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162096, "pid": 5, "tid": 7, "ts": 1716454223942407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223866997, "dur": 11, "args": { "External id": 162096, "cbid": 211, "correlation": 162096 } }, { "ph": "s", "id": 162096, "pid": 76337, "tid": -914061504, "ts": 1716454223866997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223867088, "dur": 1, "args": { "External id": 162112, "cbid": 251, "correlation": 162112 } }, { "ph": "f", "id": 162112, "pid": 76337, "tid": -914061504, "ts": 1716454223867088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223867093, "dur": 0, "args": { "External id": 162114, "cbid": 251, "correlation": 162114 } }, { "ph": "f", "id": 162114, "pid": 76337, "tid": -914061504, "ts": 1716454223867093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223942443, "dur": 536, "args": { "External id": 162115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162115, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 162115, "pid": 5, "tid": 7, "ts": 1716454223942443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867097, "dur": 13, "args": { "External id": 162115, "cbid": 211, "correlation": 162115 } }, { "ph": "s", "id": 162115, "pid": 76337, "tid": -914061504, "ts": 1716454223867097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223942981, "dur": 125, "args": { "External id": 162123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162123, "pid": 5, "tid": 7, "ts": 1716454223942981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867163, "dur": 12, "args": { "External id": 162123, "cbid": 211, "correlation": 162123 } }, { "ph": "s", "id": 162123, "pid": 76337, "tid": -914061504, "ts": 1716454223867163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223943107, "dur": 129, "args": { "External id": 162131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162131, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162131, "pid": 5, "tid": 7, "ts": 1716454223943107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867194, "dur": 8, "args": { "External id": 162131, "cbid": 211, "correlation": 162131 } }, { "ph": "s", "id": 162131, "pid": 76337, "tid": -914061504, "ts": 1716454223867194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223867270, "dur": 1, "args": { "External id": 162147, "cbid": 251, "correlation": 162147 } }, { "ph": "f", "id": 162147, "pid": 76337, "tid": -914061504, "ts": 1716454223867270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223943237, "dur": 306, "args": { "External id": 162149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162149, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162149, "pid": 5, "tid": 7, "ts": 1716454223943237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867276, "dur": 12, "args": { "External id": 162149, "cbid": 211, "correlation": 162149 } }, { "ph": "s", "id": 162149, "pid": 76337, "tid": -914061504, "ts": 1716454223867276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223943543, "dur": 27, "args": { "External id": 162157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162157, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162157, "pid": 5, "tid": 7, "ts": 1716454223943543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867318, "dur": 10, "args": { "External id": 162157, "cbid": 211, "correlation": 162157 } }, { "ph": "s", "id": 162157, "pid": 76337, "tid": -914061504, "ts": 1716454223867318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223943572, "dur": 81, "args": { "External id": 162168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162168, "pid": 5, "tid": 7, "ts": 1716454223943572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867385, "dur": 13, "args": { "External id": 162168, "cbid": 211, "correlation": 162168 } }, { "ph": "s", "id": 162168, "pid": 76337, "tid": -914061504, "ts": 1716454223867385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223867449, "dur": 0, "args": { "External id": 162180, "cbid": 317, "correlation": 162180 } }, { "ph": "f", "id": 162180, "pid": 76337, "tid": -914061504, "ts": 1716454223867449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223867450, "dur": 0, "args": { "External id": 162181, "cbid": 203, "correlation": 162181 } }, { "ph": "f", "id": 162181, "pid": 76337, "tid": -914061504, "ts": 1716454223867450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223867451, "dur": 0, "args": { "External id": 162182, "cbid": 205, "correlation": 162182 } }, { "ph": "f", "id": 162182, "pid": 76337, "tid": -914061504, "ts": 1716454223867451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223943654, "dur": 24, "args": { "External id": 162186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162186, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162186, "pid": 5, "tid": 7, "ts": 1716454223943654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867465, "dur": 12, "args": { "External id": 162186, "cbid": 211, "correlation": 162186 } }, { "ph": "s", "id": 162186, "pid": 76337, "tid": -914061504, "ts": 1716454223867465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223943680, "dur": 120, "args": { "External id": 162188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162188, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162188, "pid": 5, "tid": 7, "ts": 1716454223943680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867484, "dur": 7, "args": { "External id": 162188, "cbid": 211, "correlation": 162188 } }, { "ph": "s", "id": 162188, "pid": 76337, "tid": -914061504, "ts": 1716454223867484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223943802, "dur": 23, "args": { "External id": 162190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162190, "pid": 5, "tid": 7, "ts": 1716454223943802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867495, "dur": 5, "args": { "External id": 162190, "cbid": 211, "correlation": 162190 } }, { "ph": "s", "id": 162190, "pid": 76337, "tid": -914061504, "ts": 1716454223867495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223943826, "dur": 33, "args": { "External id": 162196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162196, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162196, "pid": 5, "tid": 7, "ts": 1716454223943826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867522, "dur": 8, "args": { "External id": 162196, "cbid": 211, "correlation": 162196 } }, { "ph": "s", "id": 162196, "pid": 76337, "tid": -914061504, "ts": 1716454223867522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223943861, "dur": 26, "args": { "External id": 162204, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162204, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162204, "pid": 5, "tid": 7, "ts": 1716454223943861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867554, "dur": 8, "args": { "External id": 162204, "cbid": 211, "correlation": 162204 } }, { "ph": "s", "id": 162204, "pid": 76337, "tid": -914061504, "ts": 1716454223867554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223943888, "dur": 48, "args": { "External id": 162213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162213, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162213, "pid": 5, "tid": 7, "ts": 1716454223943888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867592, "dur": 10, "args": { "External id": 162213, "cbid": 211, "correlation": 162213 } }, { "ph": "s", "id": 162213, "pid": 76337, "tid": -914061504, "ts": 1716454223867592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223943938, "dur": 43, "args": { "External id": 162233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162233, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 162233, "pid": 5, "tid": 7, "ts": 1716454223943938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867663, "dur": 11, "args": { "External id": 162233, "cbid": 211, "correlation": 162233 } }, { "ph": "s", "id": 162233, "pid": 76337, "tid": -914061504, "ts": 1716454223867663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223943982, "dur": 5, "args": { "External id": 162245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162245, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 162245, "pid": 5, "tid": 7, "ts": 1716454223943982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867683, "dur": 6, "args": { "External id": 162245, "cbid": 211, "correlation": 162245 } }, { "ph": "s", "id": 162245, "pid": 76337, "tid": -914061504, "ts": 1716454223867683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223943989, "dur": 43, "args": { "External id": 162248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162248, "pid": 5, "tid": 7, "ts": 1716454223943989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867702, "dur": 6, "args": { "External id": 162248, "cbid": 211, "correlation": 162248 } }, { "ph": "s", "id": 162248, "pid": 76337, "tid": -914061504, "ts": 1716454223867702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223944034, "dur": 29, "args": { "External id": 162257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162257, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162257, "pid": 5, "tid": 7, "ts": 1716454223944034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867741, "dur": 10, "args": { "External id": 162257, "cbid": 211, "correlation": 162257 } }, { "ph": "s", "id": 162257, "pid": 76337, "tid": -914061504, "ts": 1716454223867741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223867792, "dur": 0, "args": { "External id": 162267, "cbid": 317, "correlation": 162267 } }, { "ph": "f", "id": 162267, "pid": 76337, "tid": -914061504, "ts": 1716454223867792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223867793, "dur": 0, "args": { "External id": 162268, "cbid": 203, "correlation": 162268 } }, { "ph": "f", "id": 162268, "pid": 76337, "tid": -914061504, "ts": 1716454223867793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223867794, "dur": 0, "args": { "External id": 162269, "cbid": 205, "correlation": 162269 } }, { "ph": "f", "id": 162269, "pid": 76337, "tid": -914061504, "ts": 1716454223867794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223944064, "dur": 32, "args": { "External id": 162273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162273, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162273, "pid": 5, "tid": 7, "ts": 1716454223944064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867808, "dur": 11, "args": { "External id": 162273, "cbid": 211, "correlation": 162273 } }, { "ph": "s", "id": 162273, "pid": 76337, "tid": -914061504, "ts": 1716454223867808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223944097, "dur": 64, "args": { "External id": 162275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162275, "pid": 5, "tid": 7, "ts": 1716454223944097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867822, "dur": 5, "args": { "External id": 162275, "cbid": 211, "correlation": 162275 } }, { "ph": "s", "id": 162275, "pid": 76337, "tid": -914061504, "ts": 1716454223867822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223944162, "dur": 968, "args": { "External id": 162277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162277, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162277, "pid": 5, "tid": 7, "ts": 1716454223944162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867833, "dur": 7, "args": { "External id": 162277, "cbid": 211, "correlation": 162277 } }, { "ph": "s", "id": 162277, "pid": 76337, "tid": -914061504, "ts": 1716454223867833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223945131, "dur": 21, "args": { "External id": 162279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162279, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162279, "pid": 5, "tid": 7, "ts": 1716454223945131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867845, "dur": 5, "args": { "External id": 162279, "cbid": 211, "correlation": 162279 } }, { "ph": "s", "id": 162279, "pid": 76337, "tid": -914061504, "ts": 1716454223867845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223945154, "dur": 33, "args": { "External id": 162285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162285, "pid": 5, "tid": 7, "ts": 1716454223945154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867871, "dur": 8, "args": { "External id": 162285, "cbid": 211, "correlation": 162285 } }, { "ph": "s", "id": 162285, "pid": 76337, "tid": -914061504, "ts": 1716454223867871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223945188, "dur": 3, "args": { "External id": 162293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162293, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 162293, "pid": 5, "tid": 7, "ts": 1716454223945188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867914, "dur": 9, "args": { "External id": 162293, "cbid": 211, "correlation": 162293 } }, { "ph": "s", "id": 162293, "pid": 76337, "tid": -914061504, "ts": 1716454223867914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223867987, "dur": 1, "args": { "External id": 162309, "cbid": 251, "correlation": 162309 } }, { "ph": "f", "id": 162309, "pid": 76337, "tid": -914061504, "ts": 1716454223867987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223867993, "dur": 0, "args": { "External id": 162311, "cbid": 251, "correlation": 162311 } }, { "ph": "f", "id": 162311, "pid": 76337, "tid": -914061504, "ts": 1716454223867993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223945193, "dur": 12, "args": { "External id": 162312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162312, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 162312, "pid": 5, "tid": 7, "ts": 1716454223945193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223867995, "dur": 12, "args": { "External id": 162312, "cbid": 211, "correlation": 162312 } }, { "ph": "s", "id": 162312, "pid": 76337, "tid": -914061504, "ts": 1716454223867995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223945206, "dur": 5, "args": { "External id": 162314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162314, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 162314, "pid": 5, "tid": 7, "ts": 1716454223945206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868008, "dur": 5, "args": { "External id": 162314, "cbid": 211, "correlation": 162314 } }, { "ph": "s", "id": 162314, "pid": 76337, "tid": -914061504, "ts": 1716454223868008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223945213, "dur": 29, "args": { "External id": 162324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162324, "pid": 5, "tid": 7, "ts": 1716454223945213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868066, "dur": 13, "args": { "External id": 162324, "cbid": 211, "correlation": 162324 } }, { "ph": "s", "id": 162324, "pid": 76337, "tid": -914061504, "ts": 1716454223868066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223945243, "dur": 32, "args": { "External id": 162344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162344, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 162344, "pid": 5, "tid": 7, "ts": 1716454223945243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868133, "dur": 11, "args": { "External id": 162344, "cbid": 211, "correlation": 162344 } }, { "ph": "s", "id": 162344, "pid": 76337, "tid": -914061504, "ts": 1716454223868133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223945276, "dur": 4, "args": { "External id": 162356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162356, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 162356, "pid": 5, "tid": 7, "ts": 1716454223945276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868153, "dur": 6, "args": { "External id": 162356, "cbid": 211, "correlation": 162356 } }, { "ph": "s", "id": 162356, "pid": 76337, "tid": -914061504, "ts": 1716454223868153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223945281, "dur": 30, "args": { "External id": 162359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162359, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162359, "pid": 5, "tid": 7, "ts": 1716454223945281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868171, "dur": 7, "args": { "External id": 162359, "cbid": 211, "correlation": 162359 } }, { "ph": "s", "id": 162359, "pid": 76337, "tid": -914061504, "ts": 1716454223868171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223945313, "dur": 22, "args": { "External id": 162368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162368, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162368, "pid": 5, "tid": 7, "ts": 1716454223945313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868211, "dur": 10, "args": { "External id": 162368, "cbid": 211, "correlation": 162368 } }, { "ph": "s", "id": 162368, "pid": 76337, "tid": -914061504, "ts": 1716454223868211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223868274, "dur": 0, "args": { "External id": 162378, "cbid": 317, "correlation": 162378 } }, { "ph": "f", "id": 162378, "pid": 76337, "tid": -914061504, "ts": 1716454223868274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223868275, "dur": 0, "args": { "External id": 162379, "cbid": 203, "correlation": 162379 } }, { "ph": "f", "id": 162379, "pid": 76337, "tid": -914061504, "ts": 1716454223868275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223868275, "dur": 0, "args": { "External id": 162380, "cbid": 205, "correlation": 162380 } }, { "ph": "f", "id": 162380, "pid": 76337, "tid": -914061504, "ts": 1716454223868275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223945336, "dur": 22, "args": { "External id": 162384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162384, "pid": 5, "tid": 7, "ts": 1716454223945336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868290, "dur": 12, "args": { "External id": 162384, "cbid": 211, "correlation": 162384 } }, { "ph": "s", "id": 162384, "pid": 76337, "tid": -914061504, "ts": 1716454223868290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223945359, "dur": 45, "args": { "External id": 162386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162386, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162386, "pid": 5, "tid": 7, "ts": 1716454223945359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868304, "dur": 5, "args": { "External id": 162386, "cbid": 211, "correlation": 162386 } }, { "ph": "s", "id": 162386, "pid": 76337, "tid": -914061504, "ts": 1716454223868304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223945405, "dur": 646, "args": { "External id": 162388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162388, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162388, "pid": 5, "tid": 7, "ts": 1716454223945405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868315, "dur": 6, "args": { "External id": 162388, "cbid": 211, "correlation": 162388 } }, { "ph": "s", "id": 162388, "pid": 76337, "tid": -914061504, "ts": 1716454223868315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223946053, "dur": 23, "args": { "External id": 162390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162390, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162390, "pid": 5, "tid": 7, "ts": 1716454223946053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868325, "dur": 5, "args": { "External id": 162390, "cbid": 211, "correlation": 162390 } }, { "ph": "s", "id": 162390, "pid": 76337, "tid": -914061504, "ts": 1716454223868325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223946077, "dur": 32, "args": { "External id": 162396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162396, "pid": 5, "tid": 7, "ts": 1716454223946077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868353, "dur": 8, "args": { "External id": 162396, "cbid": 211, "correlation": 162396 } }, { "ph": "s", "id": 162396, "pid": 76337, "tid": -914061504, "ts": 1716454223868353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223868411, "dur": 0, "args": { "External id": 162406, "cbid": 317, "correlation": 162406 } }, { "ph": "f", "id": 162406, "pid": 76337, "tid": -914061504, "ts": 1716454223868411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223868411, "dur": 0, "args": { "External id": 162407, "cbid": 203, "correlation": 162407 } }, { "ph": "f", "id": 162407, "pid": 76337, "tid": -914061504, "ts": 1716454223868411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223868412, "dur": 0, "args": { "External id": 162408, "cbid": 205, "correlation": 162408 } }, { "ph": "f", "id": 162408, "pid": 76337, "tid": -914061504, "ts": 1716454223868412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223946110, "dur": 30, "args": { "External id": 162412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162412, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162412, "pid": 5, "tid": 7, "ts": 1716454223946110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868426, "dur": 12, "args": { "External id": 162412, "cbid": 211, "correlation": 162412 } }, { "ph": "s", "id": 162412, "pid": 76337, "tid": -914061504, "ts": 1716454223868426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223946141, "dur": 153, "args": { "External id": 162414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162414, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162414, "pid": 5, "tid": 7, "ts": 1716454223946141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868445, "dur": 7, "args": { "External id": 162414, "cbid": 211, "correlation": 162414 } }, { "ph": "s", "id": 162414, "pid": 76337, "tid": -914061504, "ts": 1716454223868445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223946296, "dur": 22, "args": { "External id": 162416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162416, "pid": 5, "tid": 7, "ts": 1716454223946296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868455, "dur": 5, "args": { "External id": 162416, "cbid": 211, "correlation": 162416 } }, { "ph": "s", "id": 162416, "pid": 76337, "tid": -914061504, "ts": 1716454223868455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223946319, "dur": 32, "args": { "External id": 162422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162422, "pid": 5, "tid": 7, "ts": 1716454223946319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868480, "dur": 9, "args": { "External id": 162422, "cbid": 211, "correlation": 162422 } }, { "ph": "s", "id": 162422, "pid": 76337, "tid": -914061504, "ts": 1716454223868480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223946352, "dur": 27, "args": { "External id": 162430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162430, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162430, "pid": 5, "tid": 7, "ts": 1716454223946352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868509, "dur": 8, "args": { "External id": 162430, "cbid": 211, "correlation": 162430 } }, { "ph": "s", "id": 162430, "pid": 76337, "tid": -914061504, "ts": 1716454223868509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223946381, "dur": 20, "args": { "External id": 162438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162438, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162438, "pid": 5, "tid": 7, "ts": 1716454223946381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868537, "dur": 8, "args": { "External id": 162438, "cbid": 211, "correlation": 162438 } }, { "ph": "s", "id": 162438, "pid": 76337, "tid": -914061504, "ts": 1716454223868537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223946402, "dur": 30, "args": { "External id": 162458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162458, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 162458, "pid": 5, "tid": 7, "ts": 1716454223946402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868618, "dur": 12, "args": { "External id": 162458, "cbid": 211, "correlation": 162458 } }, { "ph": "s", "id": 162458, "pid": 76337, "tid": -914061504, "ts": 1716454223868618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223946433, "dur": 4, "args": { "External id": 162470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162470, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 162470, "pid": 5, "tid": 7, "ts": 1716454223946433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868640, "dur": 7, "args": { "External id": 162470, "cbid": 211, "correlation": 162470 } }, { "ph": "s", "id": 162470, "pid": 76337, "tid": -914061504, "ts": 1716454223868640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223946439, "dur": 31, "args": { "External id": 162473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162473, "pid": 5, "tid": 7, "ts": 1716454223946439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868659, "dur": 6, "args": { "External id": 162473, "cbid": 211, "correlation": 162473 } }, { "ph": "s", "id": 162473, "pid": 76337, "tid": -914061504, "ts": 1716454223868659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223868715, "dur": 0, "args": { "External id": 162484, "cbid": 317, "correlation": 162484 } }, { "ph": "f", "id": 162484, "pid": 76337, "tid": -914061504, "ts": 1716454223868715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223868716, "dur": 0, "args": { "External id": 162485, "cbid": 203, "correlation": 162485 } }, { "ph": "f", "id": 162485, "pid": 76337, "tid": -914061504, "ts": 1716454223868716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223868717, "dur": 0, "args": { "External id": 162486, "cbid": 205, "correlation": 162486 } }, { "ph": "f", "id": 162486, "pid": 76337, "tid": -914061504, "ts": 1716454223868717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223946471, "dur": 21, "args": { "External id": 162490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162490, "pid": 5, "tid": 7, "ts": 1716454223946471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868730, "dur": 12, "args": { "External id": 162490, "cbid": 211, "correlation": 162490 } }, { "ph": "s", "id": 162490, "pid": 76337, "tid": -914061504, "ts": 1716454223868730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223946493, "dur": 105, "args": { "External id": 162492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162492, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162492, "pid": 5, "tid": 7, "ts": 1716454223946493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868748, "dur": 6, "args": { "External id": 162492, "cbid": 211, "correlation": 162492 } }, { "ph": "s", "id": 162492, "pid": 76337, "tid": -914061504, "ts": 1716454223868748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223946600, "dur": 22, "args": { "External id": 162494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162494, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162494, "pid": 5, "tid": 7, "ts": 1716454223946600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868757, "dur": 5, "args": { "External id": 162494, "cbid": 211, "correlation": 162494 } }, { "ph": "s", "id": 162494, "pid": 76337, "tid": -914061504, "ts": 1716454223868757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223946624, "dur": 33, "args": { "External id": 162500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162500, "pid": 5, "tid": 7, "ts": 1716454223946624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868784, "dur": 9, "args": { "External id": 162500, "cbid": 211, "correlation": 162500 } }, { "ph": "s", "id": 162500, "pid": 76337, "tid": -914061504, "ts": 1716454223868784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223946658, "dur": 183, "args": { "External id": 162509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162509, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162509, "pid": 5, "tid": 7, "ts": 1716454223946658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868866, "dur": 14, "args": { "External id": 162509, "cbid": 211, "correlation": 162509 } }, { "ph": "s", "id": 162509, "pid": 76337, "tid": -914061504, "ts": 1716454223868866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223946842, "dur": 65, "args": { "External id": 162531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162531, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162531, "pid": 5, "tid": 7, "ts": 1716454223946842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223868922, "dur": 10, "args": { "External id": 162531, "cbid": 211, "correlation": 162531 } }, { "ph": "s", "id": 162531, "pid": 76337, "tid": -914061504, "ts": 1716454223868922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869019, "dur": 1, "args": { "External id": 162542, "cbid": 251, "correlation": 162542 } }, { "ph": "f", "id": 162542, "pid": 76337, "tid": -914061504, "ts": 1716454223869019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223946909, "dur": 151, "args": { "External id": 162543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162543, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162543, "pid": 5, "tid": 7, "ts": 1716454223946909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869024, "dur": 13, "args": { "External id": 162543, "cbid": 211, "correlation": 162543 } }, { "ph": "s", "id": 162543, "pid": 76337, "tid": -914061504, "ts": 1716454223869024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869095, "dur": 1, "args": { "External id": 162554, "cbid": 251, "correlation": 162554 } }, { "ph": "f", "id": 162554, "pid": 76337, "tid": -914061504, "ts": 1716454223869095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223947060, "dur": 143, "args": { "External id": 162555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162555, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162555, "pid": 5, "tid": 7, "ts": 1716454223947060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869099, "dur": 11, "args": { "External id": 162555, "cbid": 211, "correlation": 162555 } }, { "ph": "s", "id": 162555, "pid": 76337, "tid": -914061504, "ts": 1716454223869099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869163, "dur": 1, "args": { "External id": 162566, "cbid": 251, "correlation": 162566 } }, { "ph": "f", "id": 162566, "pid": 76337, "tid": -914061504, "ts": 1716454223869163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223947205, "dur": 145, "args": { "External id": 162567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162567, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162567, "pid": 5, "tid": 7, "ts": 1716454223947205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869167, "dur": 11, "args": { "External id": 162567, "cbid": 211, "correlation": 162567 } }, { "ph": "s", "id": 162567, "pid": 76337, "tid": -914061504, "ts": 1716454223869167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223947352, "dur": 1950, "args": { "External id": 162588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162588, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 162588, "pid": 5, "tid": 7, "ts": 1716454223947352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869248, "dur": 13, "args": { "External id": 162588, "cbid": 211, "correlation": 162588 } }, { "ph": "s", "id": 162588, "pid": 76337, "tid": -914061504, "ts": 1716454223869248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869349, "dur": 1, "args": { "External id": 162606, "cbid": 251, "correlation": 162606 } }, { "ph": "f", "id": 162606, "pid": 76337, "tid": -914061504, "ts": 1716454223869349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223949303, "dur": 146, "args": { "External id": 162608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162608, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 162608, "pid": 5, "tid": 7, "ts": 1716454223949303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869355, "dur": 13, "args": { "External id": 162608, "cbid": 211, "correlation": 162608 } }, { "ph": "s", "id": 162608, "pid": 76337, "tid": -914061504, "ts": 1716454223869355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223949451, "dur": 36, "args": { "External id": 162616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162616, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162616, "pid": 5, "tid": 7, "ts": 1716454223949451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869425, "dur": 12, "args": { "External id": 162616, "cbid": 211, "correlation": 162616 } }, { "ph": "s", "id": 162616, "pid": 76337, "tid": -914061504, "ts": 1716454223869425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223949488, "dur": 50, "args": { "External id": 162624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162624, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162624, "pid": 5, "tid": 7, "ts": 1716454223949488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869465, "dur": 8, "args": { "External id": 162624, "cbid": 211, "correlation": 162624 } }, { "ph": "s", "id": 162624, "pid": 76337, "tid": -914061504, "ts": 1716454223869465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223949540, "dur": 30, "args": { "External id": 162635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162635, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162635, "pid": 5, "tid": 7, "ts": 1716454223949540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869535, "dur": 13, "args": { "External id": 162635, "cbid": 211, "correlation": 162635 } }, { "ph": "s", "id": 162635, "pid": 76337, "tid": -914061504, "ts": 1716454223869535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223949571, "dur": 34, "args": { "External id": 162657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162657, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162657, "pid": 5, "tid": 7, "ts": 1716454223949571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869566, "dur": 7, "args": { "External id": 162657, "cbid": 211, "correlation": 162657 } }, { "ph": "s", "id": 162657, "pid": 76337, "tid": -914061504, "ts": 1716454223869566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869651, "dur": 1, "args": { "External id": 162668, "cbid": 251, "correlation": 162668 } }, { "ph": "f", "id": 162668, "pid": 76337, "tid": -914061504, "ts": 1716454223869651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223949607, "dur": 76, "args": { "External id": 162669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162669, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162669, "pid": 5, "tid": 7, "ts": 1716454223949607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869656, "dur": 13, "args": { "External id": 162669, "cbid": 211, "correlation": 162669 } }, { "ph": "s", "id": 162669, "pid": 76337, "tid": -914061504, "ts": 1716454223869656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869725, "dur": 1, "args": { "External id": 162680, "cbid": 251, "correlation": 162680 } }, { "ph": "f", "id": 162680, "pid": 76337, "tid": -914061504, "ts": 1716454223869725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869728, "dur": 0, "args": { "External id": 162681, "cbid": 251, "correlation": 162681 } }, { "ph": "f", "id": 162681, "pid": 76337, "tid": -914061504, "ts": 1716454223869728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223949684, "dur": 11, "args": { "External id": 162682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162682, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 162682, "pid": 5, "tid": 7, "ts": 1716454223949684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869730, "dur": 12, "args": { "External id": 162682, "cbid": 211, "correlation": 162682 } }, { "ph": "s", "id": 162682, "pid": 76337, "tid": -914061504, "ts": 1716454223869730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223949697, "dur": 5, "args": { "External id": 162684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162684, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 162684, "pid": 5, "tid": 7, "ts": 1716454223949697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869744, "dur": 5, "args": { "External id": 162684, "cbid": 211, "correlation": 162684 } }, { "ph": "s", "id": 162684, "pid": 76337, "tid": -914061504, "ts": 1716454223869744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869801, "dur": 1, "args": { "External id": 162695, "cbid": 251, "correlation": 162695 } }, { "ph": "f", "id": 162695, "pid": 76337, "tid": -914061504, "ts": 1716454223869801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869805, "dur": 0, "args": { "External id": 162696, "cbid": 251, "correlation": 162696 } }, { "ph": "f", "id": 162696, "pid": 76337, "tid": -914061504, "ts": 1716454223869805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223949703, "dur": 7, "args": { "External id": 162697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162697, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 162697, "pid": 5, "tid": 7, "ts": 1716454223949703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869807, "dur": 12, "args": { "External id": 162697, "cbid": 211, "correlation": 162697 } }, { "ph": "s", "id": 162697, "pid": 76337, "tid": -914061504, "ts": 1716454223869807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223949712, "dur": 4, "args": { "External id": 162699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162699, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 162699, "pid": 5, "tid": 7, "ts": 1716454223949712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869820, "dur": 5, "args": { "External id": 162699, "cbid": 211, "correlation": 162699 } }, { "ph": "s", "id": 162699, "pid": 76337, "tid": -914061504, "ts": 1716454223869820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223949717, "dur": 92, "args": { "External id": 162720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162720, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 162720, "pid": 5, "tid": 7, "ts": 1716454223949717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223869893, "dur": 13, "args": { "External id": 162720, "cbid": 211, "correlation": 162720 } }, { "ph": "s", "id": 162720, "pid": 76337, "tid": -914061504, "ts": 1716454223869893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223869999, "dur": 1, "args": { "External id": 162738, "cbid": 251, "correlation": 162738 } }, { "ph": "f", "id": 162738, "pid": 76337, "tid": -914061504, "ts": 1716454223869999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223949810, "dur": 100, "args": { "External id": 162740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162740, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162740, "pid": 5, "tid": 7, "ts": 1716454223949810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870005, "dur": 14, "args": { "External id": 162740, "cbid": 211, "correlation": 162740 } }, { "ph": "s", "id": 162740, "pid": 76337, "tid": -914061504, "ts": 1716454223870005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223949912, "dur": 19, "args": { "External id": 162748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162748, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162748, "pid": 5, "tid": 7, "ts": 1716454223949912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870076, "dur": 12, "args": { "External id": 162748, "cbid": 211, "correlation": 162748 } }, { "ph": "s", "id": 162748, "pid": 76337, "tid": -914061504, "ts": 1716454223870076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223949932, "dur": 38, "args": { "External id": 162756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162756, "pid": 5, "tid": 7, "ts": 1716454223949932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870117, "dur": 9, "args": { "External id": 162756, "cbid": 211, "correlation": 162756 } }, { "ph": "s", "id": 162756, "pid": 76337, "tid": -914061504, "ts": 1716454223870117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223949972, "dur": 34, "args": { "External id": 162778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162778, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162778, "pid": 5, "tid": 7, "ts": 1716454223949972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870168, "dur": 10, "args": { "External id": 162778, "cbid": 211, "correlation": 162778 } }, { "ph": "s", "id": 162778, "pid": 76337, "tid": -914061504, "ts": 1716454223870168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223870258, "dur": 1, "args": { "External id": 162794, "cbid": 251, "correlation": 162794 } }, { "ph": "f", "id": 162794, "pid": 76337, "tid": -914061504, "ts": 1716454223870258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223870263, "dur": 0, "args": { "External id": 162796, "cbid": 251, "correlation": 162796 } }, { "ph": "f", "id": 162796, "pid": 76337, "tid": -914061504, "ts": 1716454223870263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223950007, "dur": 539, "args": { "External id": 162797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162797, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 162797, "pid": 5, "tid": 7, "ts": 1716454223950007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870267, "dur": 13, "args": { "External id": 162797, "cbid": 211, "correlation": 162797 } }, { "ph": "s", "id": 162797, "pid": 76337, "tid": -914061504, "ts": 1716454223870267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223950547, "dur": 127, "args": { "External id": 162805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162805, "pid": 5, "tid": 7, "ts": 1716454223950547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870332, "dur": 12, "args": { "External id": 162805, "cbid": 211, "correlation": 162805 } }, { "ph": "s", "id": 162805, "pid": 76337, "tid": -914061504, "ts": 1716454223870332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223950675, "dur": 130, "args": { "External id": 162813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162813, "pid": 5, "tid": 7, "ts": 1716454223950675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870363, "dur": 8, "args": { "External id": 162813, "cbid": 211, "correlation": 162813 } }, { "ph": "s", "id": 162813, "pid": 76337, "tid": -914061504, "ts": 1716454223870363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223870439, "dur": 1, "args": { "External id": 162829, "cbid": 251, "correlation": 162829 } }, { "ph": "f", "id": 162829, "pid": 76337, "tid": -914061504, "ts": 1716454223870439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223950807, "dur": 302, "args": { "External id": 162831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162831, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162831, "pid": 5, "tid": 7, "ts": 1716454223950807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870444, "dur": 13, "args": { "External id": 162831, "cbid": 211, "correlation": 162831 } }, { "ph": "s", "id": 162831, "pid": 76337, "tid": -914061504, "ts": 1716454223870444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223951110, "dur": 27, "args": { "External id": 162839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162839, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162839, "pid": 5, "tid": 7, "ts": 1716454223951110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870487, "dur": 10, "args": { "External id": 162839, "cbid": 211, "correlation": 162839 } }, { "ph": "s", "id": 162839, "pid": 76337, "tid": -914061504, "ts": 1716454223870487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223951139, "dur": 81, "args": { "External id": 162850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162850, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162850, "pid": 5, "tid": 7, "ts": 1716454223951139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870553, "dur": 12, "args": { "External id": 162850, "cbid": 211, "correlation": 162850 } }, { "ph": "s", "id": 162850, "pid": 76337, "tid": -914061504, "ts": 1716454223870553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223870616, "dur": 0, "args": { "External id": 162862, "cbid": 317, "correlation": 162862 } }, { "ph": "f", "id": 162862, "pid": 76337, "tid": -914061504, "ts": 1716454223870616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223870617, "dur": 0, "args": { "External id": 162863, "cbid": 203, "correlation": 162863 } }, { "ph": "f", "id": 162863, "pid": 76337, "tid": -914061504, "ts": 1716454223870617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223870618, "dur": 0, "args": { "External id": 162864, "cbid": 205, "correlation": 162864 } }, { "ph": "f", "id": 162864, "pid": 76337, "tid": -914061504, "ts": 1716454223870618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223951221, "dur": 23, "args": { "External id": 162868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162868, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162868, "pid": 5, "tid": 7, "ts": 1716454223951221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870632, "dur": 12, "args": { "External id": 162868, "cbid": 211, "correlation": 162868 } }, { "ph": "s", "id": 162868, "pid": 76337, "tid": -914061504, "ts": 1716454223870632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223951246, "dur": 120, "args": { "External id": 162870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162870, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162870, "pid": 5, "tid": 7, "ts": 1716454223951246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870651, "dur": 7, "args": { "External id": 162870, "cbid": 211, "correlation": 162870 } }, { "ph": "s", "id": 162870, "pid": 76337, "tid": -914061504, "ts": 1716454223870651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223951367, "dur": 24, "args": { "External id": 162872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162872, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162872, "pid": 5, "tid": 7, "ts": 1716454223951367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870662, "dur": 5, "args": { "External id": 162872, "cbid": 211, "correlation": 162872 } }, { "ph": "s", "id": 162872, "pid": 76337, "tid": -914061504, "ts": 1716454223870662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223951393, "dur": 33, "args": { "External id": 162878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162878, "pid": 5, "tid": 7, "ts": 1716454223951393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870689, "dur": 9, "args": { "External id": 162878, "cbid": 211, "correlation": 162878 } }, { "ph": "s", "id": 162878, "pid": 76337, "tid": -914061504, "ts": 1716454223870689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223951427, "dur": 27, "args": { "External id": 162886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162886, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162886, "pid": 5, "tid": 7, "ts": 1716454223951427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870721, "dur": 8, "args": { "External id": 162886, "cbid": 211, "correlation": 162886 } }, { "ph": "s", "id": 162886, "pid": 76337, "tid": -914061504, "ts": 1716454223870721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454223951455, "dur": 101, "args": { "External id": 162897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162897, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162897, "pid": 5, "tid": 7, "ts": 1716454223951455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870783, "dur": 11, "args": { "External id": 162897, "cbid": 211, "correlation": 162897 } }, { "ph": "s", "id": 162897, "pid": 76337, "tid": -914061504, "ts": 1716454223870783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223870837, "dur": 0, "args": { "External id": 162907, "cbid": 317, "correlation": 162907 } }, { "ph": "f", "id": 162907, "pid": 76337, "tid": -914061504, "ts": 1716454223870837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223870838, "dur": 0, "args": { "External id": 162908, "cbid": 203, "correlation": 162908 } }, { "ph": "f", "id": 162908, "pid": 76337, "tid": -914061504, "ts": 1716454223870838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223870839, "dur": 0, "args": { "External id": 162909, "cbid": 205, "correlation": 162909 } }, { "ph": "f", "id": 162909, "pid": 76337, "tid": -914061504, "ts": 1716454223870839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223951558, "dur": 75, "args": { "External id": 162913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162913, "pid": 5, "tid": 7, "ts": 1716454223951558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870853, "dur": 12, "args": { "External id": 162913, "cbid": 211, "correlation": 162913 } }, { "ph": "s", "id": 162913, "pid": 76337, "tid": -914061504, "ts": 1716454223870853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223951635, "dur": 43, "args": { "External id": 162915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162915, "pid": 5, "tid": 7, "ts": 1716454223951635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870867, "dur": 5, "args": { "External id": 162915, "cbid": 211, "correlation": 162915 } }, { "ph": "s", "id": 162915, "pid": 76337, "tid": -914061504, "ts": 1716454223870867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223951679, "dur": 4, "args": { "External id": 162917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162917, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 162917, "pid": 5, "tid": 7, "ts": 1716454223951679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870878, "dur": 6, "args": { "External id": 162917, "cbid": 211, "correlation": 162917 } }, { "ph": "s", "id": 162917, "pid": 76337, "tid": -914061504, "ts": 1716454223870878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223870887, "dur": 0, "args": { "External id": 162918, "cbid": 51, "correlation": 162918 } }, { "ph": "s", "id": 162918, "pid": 76337, "tid": -914061504, "ts": 1716454223870887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223951684, "dur": 2231, "args": { "External id": 162919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162919, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162919, "pid": 5, "tid": 7, "ts": 1716454223951684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870888, "dur": 6, "args": { "External id": 162919, "cbid": 211, "correlation": 162919 } }, { "ph": "s", "id": 162919, "pid": 76337, "tid": -914061504, "ts": 1716454223870888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223953917, "dur": 114, "args": { "External id": 162924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162924, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162924, "pid": 5, "tid": 7, "ts": 1716454223953917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223870916, "dur": 8, "args": { "External id": 162924, "cbid": 211, "correlation": 162924 } }, { "ph": "s", "id": 162924, "pid": 76337, "tid": -914061504, "ts": 1716454223870916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223954032, "dur": 164, "args": { "External id": 162933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162933, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162933, "pid": 5, "tid": 7, "ts": 1716454223954032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871016, "dur": 13, "args": { "External id": 162933, "cbid": 211, "correlation": 162933 } }, { "ph": "s", "id": 162933, "pid": 76337, "tid": -914061504, "ts": 1716454223871016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223954198, "dur": 128, "args": { "External id": 162953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162953, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 162953, "pid": 5, "tid": 7, "ts": 1716454223954198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871088, "dur": 11, "args": { "External id": 162953, "cbid": 211, "correlation": 162953 } }, { "ph": "s", "id": 162953, "pid": 76337, "tid": -914061504, "ts": 1716454223871088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223954327, "dur": 5, "args": { "External id": 162965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162965, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 162965, "pid": 5, "tid": 7, "ts": 1716454223954327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871110, "dur": 6, "args": { "External id": 162965, "cbid": 211, "correlation": 162965 } }, { "ph": "s", "id": 162965, "pid": 76337, "tid": -914061504, "ts": 1716454223871110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223954333, "dur": 160, "args": { "External id": 162968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162968, "pid": 5, "tid": 7, "ts": 1716454223954333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871128, "dur": 6, "args": { "External id": 162968, "cbid": 211, "correlation": 162968 } }, { "ph": "s", "id": 162968, "pid": 76337, "tid": -914061504, "ts": 1716454223871128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223954494, "dur": 102, "args": { "External id": 162977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162977, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162977, "pid": 5, "tid": 7, "ts": 1716454223954494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871169, "dur": 10, "args": { "External id": 162977, "cbid": 211, "correlation": 162977 } }, { "ph": "s", "id": 162977, "pid": 76337, "tid": -914061504, "ts": 1716454223871169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223871221, "dur": 0, "args": { "External id": 162987, "cbid": 317, "correlation": 162987 } }, { "ph": "f", "id": 162987, "pid": 76337, "tid": -914061504, "ts": 1716454223871221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223871222, "dur": 0, "args": { "External id": 162988, "cbid": 203, "correlation": 162988 } }, { "ph": "f", "id": 162988, "pid": 76337, "tid": -914061504, "ts": 1716454223871222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223871223, "dur": 0, "args": { "External id": 162989, "cbid": 205, "correlation": 162989 } }, { "ph": "f", "id": 162989, "pid": 76337, "tid": -914061504, "ts": 1716454223871223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223954597, "dur": 112, "args": { "External id": 162993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162993, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162993, "pid": 5, "tid": 7, "ts": 1716454223954597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871239, "dur": 11, "args": { "External id": 162993, "cbid": 211, "correlation": 162993 } }, { "ph": "s", "id": 162993, "pid": 76337, "tid": -914061504, "ts": 1716454223871239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223954710, "dur": 34, "args": { "External id": 162995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162995, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 162995, "pid": 5, "tid": 7, "ts": 1716454223954710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871253, "dur": 6, "args": { "External id": 162995, "cbid": 211, "correlation": 162995 } }, { "ph": "s", "id": 162995, "pid": 76337, "tid": -914061504, "ts": 1716454223871253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223954746, "dur": 4, "args": { "External id": 162997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 162997, "pid": 5, "tid": 7, "ts": 1716454223954746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871264, "dur": 6, "args": { "External id": 162997, "cbid": 211, "correlation": 162997 } }, { "ph": "s", "id": 162997, "pid": 76337, "tid": -914061504, "ts": 1716454223871264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223871273, "dur": 0, "args": { "External id": 162998, "cbid": 51, "correlation": 162998 } }, { "ph": "s", "id": 162998, "pid": 76337, "tid": -914061504, "ts": 1716454223871273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223954751, "dur": 2008, "args": { "External id": 162999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 162999, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 162999, "pid": 5, "tid": 7, "ts": 1716454223954751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871274, "dur": 6, "args": { "External id": 162999, "cbid": 211, "correlation": 162999 } }, { "ph": "s", "id": 162999, "pid": 76337, "tid": -914061504, "ts": 1716454223871274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223956760, "dur": 59, "args": { "External id": 163004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163004, "pid": 5, "tid": 7, "ts": 1716454223956760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871304, "dur": 9, "args": { "External id": 163004, "cbid": 211, "correlation": 163004 } }, { "ph": "s", "id": 163004, "pid": 76337, "tid": -914061504, "ts": 1716454223871304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223956820, "dur": 3, "args": { "External id": 163012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163012, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163012, "pid": 5, "tid": 7, "ts": 1716454223956820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871347, "dur": 10, "args": { "External id": 163012, "cbid": 211, "correlation": 163012 } }, { "ph": "s", "id": 163012, "pid": 76337, "tid": -914061504, "ts": 1716454223871347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223871412, "dur": 1, "args": { "External id": 163028, "cbid": 251, "correlation": 163028 } }, { "ph": "f", "id": 163028, "pid": 76337, "tid": -914061504, "ts": 1716454223871412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223871418, "dur": 0, "args": { "External id": 163030, "cbid": 251, "correlation": 163030 } }, { "ph": "f", "id": 163030, "pid": 76337, "tid": -914061504, "ts": 1716454223871418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223956825, "dur": 11, "args": { "External id": 163031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163031, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 163031, "pid": 5, "tid": 7, "ts": 1716454223956825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871419, "dur": 11, "args": { "External id": 163031, "cbid": 211, "correlation": 163031 } }, { "ph": "s", "id": 163031, "pid": 76337, "tid": -914061504, "ts": 1716454223871419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223956838, "dur": 5, "args": { "External id": 163033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163033, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 163033, "pid": 5, "tid": 7, "ts": 1716454223956838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871433, "dur": 6, "args": { "External id": 163033, "cbid": 211, "correlation": 163033 } }, { "ph": "s", "id": 163033, "pid": 76337, "tid": -914061504, "ts": 1716454223871433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223956844, "dur": 55, "args": { "External id": 163043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163043, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163043, "pid": 5, "tid": 7, "ts": 1716454223956844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871491, "dur": 12, "args": { "External id": 163043, "cbid": 211, "correlation": 163043 } }, { "ph": "s", "id": 163043, "pid": 76337, "tid": -914061504, "ts": 1716454223871491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223956901, "dur": 52, "args": { "External id": 163063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163063, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 163063, "pid": 5, "tid": 7, "ts": 1716454223956901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871558, "dur": 11, "args": { "External id": 163063, "cbid": 211, "correlation": 163063 } }, { "ph": "s", "id": 163063, "pid": 76337, "tid": -914061504, "ts": 1716454223871558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223956954, "dur": 4, "args": { "External id": 163075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163075, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163075, "pid": 5, "tid": 7, "ts": 1716454223956954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871579, "dur": 6, "args": { "External id": 163075, "cbid": 211, "correlation": 163075 } }, { "ph": "s", "id": 163075, "pid": 76337, "tid": -914061504, "ts": 1716454223871579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223956959, "dur": 55, "args": { "External id": 163078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163078, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163078, "pid": 5, "tid": 7, "ts": 1716454223956959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871597, "dur": 7, "args": { "External id": 163078, "cbid": 211, "correlation": 163078 } }, { "ph": "s", "id": 163078, "pid": 76337, "tid": -914061504, "ts": 1716454223871597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223957015, "dur": 37, "args": { "External id": 163087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163087, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163087, "pid": 5, "tid": 7, "ts": 1716454223957015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871637, "dur": 10, "args": { "External id": 163087, "cbid": 211, "correlation": 163087 } }, { "ph": "s", "id": 163087, "pid": 76337, "tid": -914061504, "ts": 1716454223871637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223871701, "dur": 0, "args": { "External id": 163097, "cbid": 317, "correlation": 163097 } }, { "ph": "f", "id": 163097, "pid": 76337, "tid": -914061504, "ts": 1716454223871701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223871702, "dur": 0, "args": { "External id": 163098, "cbid": 203, "correlation": 163098 } }, { "ph": "f", "id": 163098, "pid": 76337, "tid": -914061504, "ts": 1716454223871702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223871703, "dur": 0, "args": { "External id": 163099, "cbid": 205, "correlation": 163099 } }, { "ph": "f", "id": 163099, "pid": 76337, "tid": -914061504, "ts": 1716454223871703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223957054, "dur": 40, "args": { "External id": 163103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163103, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163103, "pid": 5, "tid": 7, "ts": 1716454223957054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871721, "dur": 12, "args": { "External id": 163103, "cbid": 211, "correlation": 163103 } }, { "ph": "s", "id": 163103, "pid": 76337, "tid": -914061504, "ts": 1716454223871721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223957095, "dur": 14, "args": { "External id": 163105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163105, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163105, "pid": 5, "tid": 7, "ts": 1716454223957095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871735, "dur": 5, "args": { "External id": 163105, "cbid": 211, "correlation": 163105 } }, { "ph": "s", "id": 163105, "pid": 76337, "tid": -914061504, "ts": 1716454223871735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223957110, "dur": 3, "args": { "External id": 163107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163107, "pid": 5, "tid": 7, "ts": 1716454223957110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871745, "dur": 6, "args": { "External id": 163107, "cbid": 211, "correlation": 163107 } }, { "ph": "s", "id": 163107, "pid": 76337, "tid": -914061504, "ts": 1716454223871745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223871755, "dur": 0, "args": { "External id": 163108, "cbid": 51, "correlation": 163108 } }, { "ph": "s", "id": 163108, "pid": 76337, "tid": -914061504, "ts": 1716454223871755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223957115, "dur": 699, "args": { "External id": 163109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163109, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163109, "pid": 5, "tid": 7, "ts": 1716454223957115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871755, "dur": 5, "args": { "External id": 163109, "cbid": 211, "correlation": 163109 } }, { "ph": "s", "id": 163109, "pid": 76337, "tid": -914061504, "ts": 1716454223871755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223957816, "dur": 59, "args": { "External id": 163114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163114, "pid": 5, "tid": 7, "ts": 1716454223957816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871783, "dur": 8, "args": { "External id": 163114, "cbid": 211, "correlation": 163114 } }, { "ph": "s", "id": 163114, "pid": 76337, "tid": -914061504, "ts": 1716454223871783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223871840, "dur": 0, "args": { "External id": 163124, "cbid": 317, "correlation": 163124 } }, { "ph": "f", "id": 163124, "pid": 76337, "tid": -914061504, "ts": 1716454223871840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223871841, "dur": 0, "args": { "External id": 163125, "cbid": 203, "correlation": 163125 } }, { "ph": "f", "id": 163125, "pid": 76337, "tid": -914061504, "ts": 1716454223871841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223871841, "dur": 0, "args": { "External id": 163126, "cbid": 205, "correlation": 163126 } }, { "ph": "f", "id": 163126, "pid": 76337, "tid": -914061504, "ts": 1716454223871841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223957876, "dur": 4, "args": { "External id": 163130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163130, "pid": 5, "tid": 7, "ts": 1716454223957876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871858, "dur": 12, "args": { "External id": 163130, "cbid": 211, "correlation": 163130 } }, { "ph": "s", "id": 163130, "pid": 76337, "tid": -914061504, "ts": 1716454223871858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223871873, "dur": 0, "args": { "External id": 163131, "cbid": 51, "correlation": 163131 } }, { "ph": "s", "id": 163131, "pid": 76337, "tid": -914061504, "ts": 1716454223871873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454223957881, "dur": 266, "args": { "External id": 163132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163132, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163132, "pid": 5, "tid": 7, "ts": 1716454223957881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871874, "dur": 7, "args": { "External id": 163132, "cbid": 211, "correlation": 163132 } }, { "ph": "s", "id": 163132, "pid": 76337, "tid": -914061504, "ts": 1716454223871874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223958149, "dur": 59, "args": { "External id": 163137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163137, "pid": 5, "tid": 7, "ts": 1716454223958149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871903, "dur": 8, "args": { "External id": 163137, "cbid": 211, "correlation": 163137 } }, { "ph": "s", "id": 163137, "pid": 76337, "tid": -914061504, "ts": 1716454223871903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223958209, "dur": 50, "args": { "External id": 163145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163145, "pid": 5, "tid": 7, "ts": 1716454223958209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871931, "dur": 8, "args": { "External id": 163145, "cbid": 211, "correlation": 163145 } }, { "ph": "s", "id": 163145, "pid": 76337, "tid": -914061504, "ts": 1716454223871931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223958260, "dur": 35, "args": { "External id": 163153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163153, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163153, "pid": 5, "tid": 7, "ts": 1716454223958260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223871960, "dur": 8, "args": { "External id": 163153, "cbid": 211, "correlation": 163153 } }, { "ph": "s", "id": 163153, "pid": 76337, "tid": -914061504, "ts": 1716454223871960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223958297, "dur": 53, "args": { "External id": 163173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163173, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 163173, "pid": 5, "tid": 7, "ts": 1716454223958297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872050, "dur": 13, "args": { "External id": 163173, "cbid": 211, "correlation": 163173 } }, { "ph": "s", "id": 163173, "pid": 76337, "tid": -914061504, "ts": 1716454223872050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223958351, "dur": 4, "args": { "External id": 163185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163185, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163185, "pid": 5, "tid": 7, "ts": 1716454223958351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872073, "dur": 7, "args": { "External id": 163185, "cbid": 211, "correlation": 163185 } }, { "ph": "s", "id": 163185, "pid": 76337, "tid": -914061504, "ts": 1716454223872073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223958356, "dur": 55, "args": { "External id": 163188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163188, "pid": 5, "tid": 7, "ts": 1716454223958356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872091, "dur": 7, "args": { "External id": 163188, "cbid": 211, "correlation": 163188 } }, { "ph": "s", "id": 163188, "pid": 76337, "tid": -914061504, "ts": 1716454223872091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223872149, "dur": 0, "args": { "External id": 163199, "cbid": 317, "correlation": 163199 } }, { "ph": "f", "id": 163199, "pid": 76337, "tid": -914061504, "ts": 1716454223872149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223872150, "dur": 0, "args": { "External id": 163200, "cbid": 203, "correlation": 163200 } }, { "ph": "f", "id": 163200, "pid": 76337, "tid": -914061504, "ts": 1716454223872150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223872151, "dur": 0, "args": { "External id": 163201, "cbid": 205, "correlation": 163201 } }, { "ph": "f", "id": 163201, "pid": 76337, "tid": -914061504, "ts": 1716454223872151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872183, "dur": 2, "args": { "External id": 163205, "cbid": 251, "correlation": 163205 } }, { "ph": "f", "id": 163205, "pid": 76337, "tid": -914061504, "ts": 1716454223872183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872187, "dur": 1, "args": { "External id": 163206, "cbid": 251, "correlation": 163206 } }, { "ph": "f", "id": 163206, "pid": 76337, "tid": -914061504, "ts": 1716454223872187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872189, "dur": 1, "args": { "External id": 163207, "cbid": 251, "correlation": 163207 } }, { "ph": "f", "id": 163207, "pid": 76337, "tid": -914061504, "ts": 1716454223872189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872191, "dur": 1, "args": { "External id": 163208, "cbid": 251, "correlation": 163208 } }, { "ph": "f", "id": 163208, "pid": 76337, "tid": -914061504, "ts": 1716454223872191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872193, "dur": 1, "args": { "External id": 163209, "cbid": 251, "correlation": 163209 } }, { "ph": "f", "id": 163209, "pid": 76337, "tid": -914061504, "ts": 1716454223872193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872195, "dur": 1, "args": { "External id": 163210, "cbid": 251, "correlation": 163210 } }, { "ph": "f", "id": 163210, "pid": 76337, "tid": -914061504, "ts": 1716454223872195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872197, "dur": 1, "args": { "External id": 163211, "cbid": 251, "correlation": 163211 } }, { "ph": "f", "id": 163211, "pid": 76337, "tid": -914061504, "ts": 1716454223872197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872199, "dur": 1, "args": { "External id": 163212, "cbid": 251, "correlation": 163212 } }, { "ph": "f", "id": 163212, "pid": 76337, "tid": -914061504, "ts": 1716454223872199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872202, "dur": 0, "args": { "External id": 163213, "cbid": 251, "correlation": 163213 } }, { "ph": "f", "id": 163213, "pid": 76337, "tid": -914061504, "ts": 1716454223872202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223958412, "dur": 116, "args": { "External id": 163214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163214, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163214, "pid": 5, "tid": 7, "ts": 1716454223958412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872204, "dur": 13, "args": { "External id": 163214, "cbid": 211, "correlation": 163214 } }, { "ph": "s", "id": 163214, "pid": 76337, "tid": -914061504, "ts": 1716454223872204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223958529, "dur": 60, "args": { "External id": 163220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163220, "pid": 5, "tid": 7, "ts": 1716454223958529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872241, "dur": 10, "args": { "External id": 163220, "cbid": 211, "correlation": 163220 } }, { "ph": "s", "id": 163220, "pid": 76337, "tid": -914061504, "ts": 1716454223872241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223958591, "dur": 556, "args": { "External id": 163229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163229, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163229, "pid": 5, "tid": 7, "ts": 1716454223958591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872324, "dur": 14, "args": { "External id": 163229, "cbid": 211, "correlation": 163229 } }, { "ph": "s", "id": 163229, "pid": 76337, "tid": -914061504, "ts": 1716454223872324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223959149, "dur": 183, "args": { "External id": 163251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163251, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163251, "pid": 5, "tid": 7, "ts": 1716454223959149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872382, "dur": 10, "args": { "External id": 163251, "cbid": 211, "correlation": 163251 } }, { "ph": "s", "id": 163251, "pid": 76337, "tid": -914061504, "ts": 1716454223872382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872471, "dur": 1, "args": { "External id": 163262, "cbid": 251, "correlation": 163262 } }, { "ph": "f", "id": 163262, "pid": 76337, "tid": -914061504, "ts": 1716454223872471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223959334, "dur": 197, "args": { "External id": 163263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163263, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163263, "pid": 5, "tid": 7, "ts": 1716454223959334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872476, "dur": 13, "args": { "External id": 163263, "cbid": 211, "correlation": 163263 } }, { "ph": "s", "id": 163263, "pid": 76337, "tid": -914061504, "ts": 1716454223872476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872546, "dur": 1, "args": { "External id": 163274, "cbid": 251, "correlation": 163274 } }, { "ph": "f", "id": 163274, "pid": 76337, "tid": -914061504, "ts": 1716454223872546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223959532, "dur": 189, "args": { "External id": 163275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163275, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163275, "pid": 5, "tid": 7, "ts": 1716454223959532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872551, "dur": 12, "args": { "External id": 163275, "cbid": 211, "correlation": 163275 } }, { "ph": "s", "id": 163275, "pid": 76337, "tid": -914061504, "ts": 1716454223872551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872614, "dur": 1, "args": { "External id": 163286, "cbid": 251, "correlation": 163286 } }, { "ph": "f", "id": 163286, "pid": 76337, "tid": -914061504, "ts": 1716454223872614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223959723, "dur": 187, "args": { "External id": 163287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163287, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163287, "pid": 5, "tid": 7, "ts": 1716454223959723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872618, "dur": 12, "args": { "External id": 163287, "cbid": 211, "correlation": 163287 } }, { "ph": "s", "id": 163287, "pid": 76337, "tid": -914061504, "ts": 1716454223872618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223959911, "dur": 18741, "args": { "External id": 163308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163308, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163308, "pid": 5, "tid": 7, "ts": 1716454223959911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872703, "dur": 13, "args": { "External id": 163308, "cbid": 211, "correlation": 163308 } }, { "ph": "s", "id": 163308, "pid": 76337, "tid": -914061504, "ts": 1716454223872703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223872803, "dur": 1, "args": { "External id": 163326, "cbid": 251, "correlation": 163326 } }, { "ph": "f", "id": 163326, "pid": 76337, "tid": -914061504, "ts": 1716454223872803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223978653, "dur": 203, "args": { "External id": 163328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163328, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163328, "pid": 5, "tid": 7, "ts": 1716454223978653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872808, "dur": 13, "args": { "External id": 163328, "cbid": 211, "correlation": 163328 } }, { "ph": "s", "id": 163328, "pid": 76337, "tid": -914061504, "ts": 1716454223872808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223978858, "dur": 67, "args": { "External id": 163336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163336, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163336, "pid": 5, "tid": 7, "ts": 1716454223978858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872878, "dur": 12, "args": { "External id": 163336, "cbid": 211, "correlation": 163336 } }, { "ph": "s", "id": 163336, "pid": 76337, "tid": -914061504, "ts": 1716454223872878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223978926, "dur": 97, "args": { "External id": 163344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163344, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163344, "pid": 5, "tid": 7, "ts": 1716454223978926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223872917, "dur": 9, "args": { "External id": 163344, "cbid": 211, "correlation": 163344 } }, { "ph": "s", "id": 163344, "pid": 76337, "tid": -914061504, "ts": 1716454223872917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223979025, "dur": 55, "args": { "External id": 163355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163355, "pid": 5, "tid": 7, "ts": 1716454223979025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873002, "dur": 13, "args": { "External id": 163355, "cbid": 211, "correlation": 163355 } }, { "ph": "s", "id": 163355, "pid": 76337, "tid": -914061504, "ts": 1716454223873002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223979081, "dur": 93, "args": { "External id": 163377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163377, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163377, "pid": 5, "tid": 7, "ts": 1716454223979081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873034, "dur": 7, "args": { "External id": 163377, "cbid": 211, "correlation": 163377 } }, { "ph": "s", "id": 163377, "pid": 76337, "tid": -914061504, "ts": 1716454223873034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873119, "dur": 1, "args": { "External id": 163388, "cbid": 251, "correlation": 163388 } }, { "ph": "f", "id": 163388, "pid": 76337, "tid": -914061504, "ts": 1716454223873119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223979176, "dur": 105, "args": { "External id": 163389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163389, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163389, "pid": 5, "tid": 7, "ts": 1716454223979176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873124, "dur": 13, "args": { "External id": 163389, "cbid": 211, "correlation": 163389 } }, { "ph": "s", "id": 163389, "pid": 76337, "tid": -914061504, "ts": 1716454223873124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873199, "dur": 2, "args": { "External id": 163400, "cbid": 251, "correlation": 163400 } }, { "ph": "f", "id": 163400, "pid": 76337, "tid": -914061504, "ts": 1716454223873199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873204, "dur": 0, "args": { "External id": 163401, "cbid": 251, "correlation": 163401 } }, { "ph": "f", "id": 163401, "pid": 76337, "tid": -914061504, "ts": 1716454223873204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223979282, "dur": 10, "args": { "External id": 163402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163402, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 163402, "pid": 5, "tid": 7, "ts": 1716454223979282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873206, "dur": 13, "args": { "External id": 163402, "cbid": 211, "correlation": 163402 } }, { "ph": "s", "id": 163402, "pid": 76337, "tid": -914061504, "ts": 1716454223873206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223979294, "dur": 5, "args": { "External id": 163404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163404, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 163404, "pid": 5, "tid": 7, "ts": 1716454223979294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873223, "dur": 7, "args": { "External id": 163404, "cbid": 211, "correlation": 163404 } }, { "ph": "s", "id": 163404, "pid": 76337, "tid": -914061504, "ts": 1716454223873223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873285, "dur": 1, "args": { "External id": 163415, "cbid": 251, "correlation": 163415 } }, { "ph": "f", "id": 163415, "pid": 76337, "tid": -914061504, "ts": 1716454223873285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873289, "dur": 0, "args": { "External id": 163416, "cbid": 251, "correlation": 163416 } }, { "ph": "f", "id": 163416, "pid": 76337, "tid": -914061504, "ts": 1716454223873289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223979300, "dur": 6, "args": { "External id": 163417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163417, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 163417, "pid": 5, "tid": 7, "ts": 1716454223979300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873290, "dur": 13, "args": { "External id": 163417, "cbid": 211, "correlation": 163417 } }, { "ph": "s", "id": 163417, "pid": 76337, "tid": -914061504, "ts": 1716454223873290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223979308, "dur": 3, "args": { "External id": 163419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163419, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 163419, "pid": 5, "tid": 7, "ts": 1716454223979308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873304, "dur": 5, "args": { "External id": 163419, "cbid": 211, "correlation": 163419 } }, { "ph": "s", "id": 163419, "pid": 76337, "tid": -914061504, "ts": 1716454223873304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223979313, "dur": 158, "args": { "External id": 163440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163440, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163440, "pid": 5, "tid": 7, "ts": 1716454223979313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873379, "dur": 12, "args": { "External id": 163440, "cbid": 211, "correlation": 163440 } }, { "ph": "s", "id": 163440, "pid": 76337, "tid": -914061504, "ts": 1716454223873379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873475, "dur": 1, "args": { "External id": 163458, "cbid": 251, "correlation": 163458 } }, { "ph": "f", "id": 163458, "pid": 76337, "tid": -914061504, "ts": 1716454223873475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223979472, "dur": 106, "args": { "External id": 163460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163460, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163460, "pid": 5, "tid": 7, "ts": 1716454223979472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873481, "dur": 14, "args": { "External id": 163460, "cbid": 211, "correlation": 163460 } }, { "ph": "s", "id": 163460, "pid": 76337, "tid": -914061504, "ts": 1716454223873481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223979579, "dur": 35, "args": { "External id": 163468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163468, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163468, "pid": 5, "tid": 7, "ts": 1716454223979579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873552, "dur": 12, "args": { "External id": 163468, "cbid": 211, "correlation": 163468 } }, { "ph": "s", "id": 163468, "pid": 76337, "tid": -914061504, "ts": 1716454223873552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223979616, "dur": 67, "args": { "External id": 163476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163476, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163476, "pid": 5, "tid": 7, "ts": 1716454223979616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873593, "dur": 10, "args": { "External id": 163476, "cbid": 211, "correlation": 163476 } }, { "ph": "s", "id": 163476, "pid": 76337, "tid": -914061504, "ts": 1716454223873593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223979684, "dur": 93, "args": { "External id": 163498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163498, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163498, "pid": 5, "tid": 7, "ts": 1716454223979684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873644, "dur": 10, "args": { "External id": 163498, "cbid": 211, "correlation": 163498 } }, { "ph": "s", "id": 163498, "pid": 76337, "tid": -914061504, "ts": 1716454223873644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873729, "dur": 1, "args": { "External id": 163514, "cbid": 251, "correlation": 163514 } }, { "ph": "f", "id": 163514, "pid": 76337, "tid": -914061504, "ts": 1716454223873729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223979779, "dur": 582, "args": { "External id": 163516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163516, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163516, "pid": 5, "tid": 7, "ts": 1716454223979779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873734, "dur": 13, "args": { "External id": 163516, "cbid": 211, "correlation": 163516 } }, { "ph": "s", "id": 163516, "pid": 76337, "tid": -914061504, "ts": 1716454223873734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223980363, "dur": 247, "args": { "External id": 163524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163524, "pid": 5, "tid": 7, "ts": 1716454223980363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873800, "dur": 12, "args": { "External id": 163524, "cbid": 211, "correlation": 163524 } }, { "ph": "s", "id": 163524, "pid": 76337, "tid": -914061504, "ts": 1716454223873800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223980611, "dur": 251, "args": { "External id": 163532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163532, "pid": 5, "tid": 7, "ts": 1716454223980611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873831, "dur": 9, "args": { "External id": 163532, "cbid": 211, "correlation": 163532 } }, { "ph": "s", "id": 163532, "pid": 76337, "tid": -914061504, "ts": 1716454223873831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873914, "dur": 1, "args": { "External id": 163548, "cbid": 251, "correlation": 163548 } }, { "ph": "f", "id": 163548, "pid": 76337, "tid": -914061504, "ts": 1716454223873914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223873918, "dur": 0, "args": { "External id": 163550, "cbid": 251, "correlation": 163550 } }, { "ph": "f", "id": 163550, "pid": 76337, "tid": -914061504, "ts": 1716454223873918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454223980863, "dur": 357, "args": { "External id": 163551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163551, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 163551, "pid": 5, "tid": 7, "ts": 1716454223980863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873921, "dur": 13, "args": { "External id": 163551, "cbid": 211, "correlation": 163551 } }, { "ph": "s", "id": 163551, "pid": 76337, "tid": -914061504, "ts": 1716454223873921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223981222, "dur": 50, "args": { "External id": 163559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163559, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163559, "pid": 5, "tid": 7, "ts": 1716454223981222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223873964, "dur": 17, "args": { "External id": 163559, "cbid": 211, "correlation": 163559 } }, { "ph": "s", "id": 163559, "pid": 76337, "tid": -914061504, "ts": 1716454223873964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223981273, "dur": 160, "args": { "External id": 163570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163570, "pid": 5, "tid": 7, "ts": 1716454223981273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874041, "dur": 14, "args": { "External id": 163570, "cbid": 211, "correlation": 163570 } }, { "ph": "s", "id": 163570, "pid": 76337, "tid": -914061504, "ts": 1716454223874041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223874107, "dur": 0, "args": { "External id": 163582, "cbid": 317, "correlation": 163582 } }, { "ph": "f", "id": 163582, "pid": 76337, "tid": -914061504, "ts": 1716454223874107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223874108, "dur": 0, "args": { "External id": 163583, "cbid": 203, "correlation": 163583 } }, { "ph": "f", "id": 163583, "pid": 76337, "tid": -914061504, "ts": 1716454223874108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223874108, "dur": 0, "args": { "External id": 163584, "cbid": 205, "correlation": 163584 } }, { "ph": "f", "id": 163584, "pid": 76337, "tid": -914061504, "ts": 1716454223874108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874134, "dur": 1, "args": { "External id": 163588, "cbid": 251, "correlation": 163588 } }, { "ph": "f", "id": 163588, "pid": 76337, "tid": -914061504, "ts": 1716454223874134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874136, "dur": 0, "args": { "External id": 163589, "cbid": 251, "correlation": 163589 } }, { "ph": "f", "id": 163589, "pid": 76337, "tid": -914061504, "ts": 1716454223874136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874137, "dur": 0, "args": { "External id": 163590, "cbid": 251, "correlation": 163590 } }, { "ph": "f", "id": 163590, "pid": 76337, "tid": -914061504, "ts": 1716454223874137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874138, "dur": 0, "args": { "External id": 163591, "cbid": 251, "correlation": 163591 } }, { "ph": "f", "id": 163591, "pid": 76337, "tid": -914061504, "ts": 1716454223874138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874139, "dur": 0, "args": { "External id": 163592, "cbid": 251, "correlation": 163592 } }, { "ph": "f", "id": 163592, "pid": 76337, "tid": -914061504, "ts": 1716454223874139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874140, "dur": 0, "args": { "External id": 163593, "cbid": 251, "correlation": 163593 } }, { "ph": "f", "id": 163593, "pid": 76337, "tid": -914061504, "ts": 1716454223874140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874141, "dur": 0, "args": { "External id": 163594, "cbid": 251, "correlation": 163594 } }, { "ph": "f", "id": 163594, "pid": 76337, "tid": -914061504, "ts": 1716454223874141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874142, "dur": 0, "args": { "External id": 163595, "cbid": 251, "correlation": 163595 } }, { "ph": "f", "id": 163595, "pid": 76337, "tid": -914061504, "ts": 1716454223874142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874144, "dur": 0, "args": { "External id": 163596, "cbid": 251, "correlation": 163596 } }, { "ph": "f", "id": 163596, "pid": 76337, "tid": -914061504, "ts": 1716454223874144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223981435, "dur": 116, "args": { "External id": 163597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163597, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163597, "pid": 5, "tid": 7, "ts": 1716454223981435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874146, "dur": 13, "args": { "External id": 163597, "cbid": 211, "correlation": 163597 } }, { "ph": "s", "id": 163597, "pid": 76337, "tid": -914061504, "ts": 1716454223874146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223981552, "dur": 60, "args": { "External id": 163603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163603, "pid": 5, "tid": 7, "ts": 1716454223981552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874181, "dur": 9, "args": { "External id": 163603, "cbid": 211, "correlation": 163603 } }, { "ph": "s", "id": 163603, "pid": 76337, "tid": -914061504, "ts": 1716454223874181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223981614, "dur": 50, "args": { "External id": 163611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163611, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163611, "pid": 5, "tid": 7, "ts": 1716454223981614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874212, "dur": 8, "args": { "External id": 163611, "cbid": 211, "correlation": 163611 } }, { "ph": "s", "id": 163611, "pid": 76337, "tid": -914061504, "ts": 1716454223874212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223981665, "dur": 98, "args": { "External id": 163620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163620, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163620, "pid": 5, "tid": 7, "ts": 1716454223981665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874251, "dur": 10, "args": { "External id": 163620, "cbid": 211, "correlation": 163620 } }, { "ph": "s", "id": 163620, "pid": 76337, "tid": -914061504, "ts": 1716454223874251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223981764, "dur": 92, "args": { "External id": 163640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163640, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 163640, "pid": 5, "tid": 7, "ts": 1716454223981764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874323, "dur": 11, "args": { "External id": 163640, "cbid": 211, "correlation": 163640 } }, { "ph": "s", "id": 163640, "pid": 76337, "tid": -914061504, "ts": 1716454223874323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223981858, "dur": 5, "args": { "External id": 163652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163652, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 163652, "pid": 5, "tid": 7, "ts": 1716454223981858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874344, "dur": 7, "args": { "External id": 163652, "cbid": 211, "correlation": 163652 } }, { "ph": "s", "id": 163652, "pid": 76337, "tid": -914061504, "ts": 1716454223874344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223981864, "dur": 109, "args": { "External id": 163655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163655, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163655, "pid": 5, "tid": 7, "ts": 1716454223981864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874362, "dur": 7, "args": { "External id": 163655, "cbid": 211, "correlation": 163655 } }, { "ph": "s", "id": 163655, "pid": 76337, "tid": -914061504, "ts": 1716454223874362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223981974, "dur": 69, "args": { "External id": 163664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163664, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163664, "pid": 5, "tid": 7, "ts": 1716454223981974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874402, "dur": 9, "args": { "External id": 163664, "cbid": 211, "correlation": 163664 } }, { "ph": "s", "id": 163664, "pid": 76337, "tid": -914061504, "ts": 1716454223874402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223874454, "dur": 0, "args": { "External id": 163674, "cbid": 317, "correlation": 163674 } }, { "ph": "f", "id": 163674, "pid": 76337, "tid": -914061504, "ts": 1716454223874454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223874454, "dur": 0, "args": { "External id": 163675, "cbid": 203, "correlation": 163675 } }, { "ph": "f", "id": 163675, "pid": 76337, "tid": -914061504, "ts": 1716454223874454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223874455, "dur": 0, "args": { "External id": 163676, "cbid": 205, "correlation": 163676 } }, { "ph": "f", "id": 163676, "pid": 76337, "tid": -914061504, "ts": 1716454223874455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223982045, "dur": 76, "args": { "External id": 163680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163680, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163680, "pid": 5, "tid": 7, "ts": 1716454223982045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874470, "dur": 11, "args": { "External id": 163680, "cbid": 211, "correlation": 163680 } }, { "ph": "s", "id": 163680, "pid": 76337, "tid": -914061504, "ts": 1716454223874470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223982122, "dur": 24, "args": { "External id": 163682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163682, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163682, "pid": 5, "tid": 7, "ts": 1716454223982122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874484, "dur": 5, "args": { "External id": 163682, "cbid": 211, "correlation": 163682 } }, { "ph": "s", "id": 163682, "pid": 76337, "tid": -914061504, "ts": 1716454223874484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223982148, "dur": 4, "args": { "External id": 163684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163684, "pid": 5, "tid": 7, "ts": 1716454223982148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874494, "dur": 6, "args": { "External id": 163684, "cbid": 211, "correlation": 163684 } }, { "ph": "s", "id": 163684, "pid": 76337, "tid": -914061504, "ts": 1716454223874494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223874503, "dur": 0, "args": { "External id": 163685, "cbid": 51, "correlation": 163685 } }, { "ph": "s", "id": 163685, "pid": 76337, "tid": -914061504, "ts": 1716454223874503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223982153, "dur": 1379, "args": { "External id": 163686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163686, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163686, "pid": 5, "tid": 7, "ts": 1716454223982153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874504, "dur": 5, "args": { "External id": 163686, "cbid": 211, "correlation": 163686 } }, { "ph": "s", "id": 163686, "pid": 76337, "tid": -914061504, "ts": 1716454223874504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223983534, "dur": 59, "args": { "External id": 163691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163691, "pid": 5, "tid": 7, "ts": 1716454223983534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874531, "dur": 9, "args": { "External id": 163691, "cbid": 211, "correlation": 163691 } }, { "ph": "s", "id": 163691, "pid": 76337, "tid": -914061504, "ts": 1716454223874531, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223983595, "dur": 3, "args": { "External id": 163699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163699, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163699, "pid": 5, "tid": 7, "ts": 1716454223983595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874575, "dur": 9, "args": { "External id": 163699, "cbid": 211, "correlation": 163699 } }, { "ph": "s", "id": 163699, "pid": 76337, "tid": -914061504, "ts": 1716454223874575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874640, "dur": 2, "args": { "External id": 163715, "cbid": 251, "correlation": 163715 } }, { "ph": "f", "id": 163715, "pid": 76337, "tid": -914061504, "ts": 1716454223874640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223874646, "dur": 0, "args": { "External id": 163717, "cbid": 251, "correlation": 163717 } }, { "ph": "f", "id": 163717, "pid": 76337, "tid": -914061504, "ts": 1716454223874646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454223983599, "dur": 11, "args": { "External id": 163718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163718, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 163718, "pid": 5, "tid": 7, "ts": 1716454223983599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874648, "dur": 11, "args": { "External id": 163718, "cbid": 211, "correlation": 163718 } }, { "ph": "s", "id": 163718, "pid": 76337, "tid": -914061504, "ts": 1716454223874648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454223983612, "dur": 5, "args": { "External id": 163720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163720, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 163720, "pid": 5, "tid": 7, "ts": 1716454223983612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874661, "dur": 5, "args": { "External id": 163720, "cbid": 211, "correlation": 163720 } }, { "ph": "s", "id": 163720, "pid": 76337, "tid": -914061504, "ts": 1716454223874661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223983618, "dur": 55, "args": { "External id": 163730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163730, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163730, "pid": 5, "tid": 7, "ts": 1716454223983618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874719, "dur": 12, "args": { "External id": 163730, "cbid": 211, "correlation": 163730 } }, { "ph": "s", "id": 163730, "pid": 76337, "tid": -914061504, "ts": 1716454223874719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223983675, "dur": 53, "args": { "External id": 163750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163750, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 163750, "pid": 5, "tid": 7, "ts": 1716454223983675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874784, "dur": 12, "args": { "External id": 163750, "cbid": 211, "correlation": 163750 } }, { "ph": "s", "id": 163750, "pid": 76337, "tid": -914061504, "ts": 1716454223874784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223983729, "dur": 4, "args": { "External id": 163762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163762, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163762, "pid": 5, "tid": 7, "ts": 1716454223983729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874805, "dur": 6, "args": { "External id": 163762, "cbid": 211, "correlation": 163762 } }, { "ph": "s", "id": 163762, "pid": 76337, "tid": -914061504, "ts": 1716454223874805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223983734, "dur": 56, "args": { "External id": 163765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163765, "pid": 5, "tid": 7, "ts": 1716454223983734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874823, "dur": 7, "args": { "External id": 163765, "cbid": 211, "correlation": 163765 } }, { "ph": "s", "id": 163765, "pid": 76337, "tid": -914061504, "ts": 1716454223874823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223983792, "dur": 36, "args": { "External id": 163774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163774, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163774, "pid": 5, "tid": 7, "ts": 1716454223983792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874864, "dur": 10, "args": { "External id": 163774, "cbid": 211, "correlation": 163774 } }, { "ph": "s", "id": 163774, "pid": 76337, "tid": -914061504, "ts": 1716454223874864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223874927, "dur": 0, "args": { "External id": 163784, "cbid": 317, "correlation": 163784 } }, { "ph": "f", "id": 163784, "pid": 76337, "tid": -914061504, "ts": 1716454223874927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223874928, "dur": 0, "args": { "External id": 163785, "cbid": 203, "correlation": 163785 } }, { "ph": "f", "id": 163785, "pid": 76337, "tid": -914061504, "ts": 1716454223874928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223874929, "dur": 0, "args": { "External id": 163786, "cbid": 205, "correlation": 163786 } }, { "ph": "f", "id": 163786, "pid": 76337, "tid": -914061504, "ts": 1716454223874929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223983829, "dur": 40, "args": { "External id": 163790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163790, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163790, "pid": 5, "tid": 7, "ts": 1716454223983829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874944, "dur": 13, "args": { "External id": 163790, "cbid": 211, "correlation": 163790 } }, { "ph": "s", "id": 163790, "pid": 76337, "tid": -914061504, "ts": 1716454223874944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223983871, "dur": 14, "args": { "External id": 163792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163792, "pid": 5, "tid": 7, "ts": 1716454223983871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874959, "dur": 5, "args": { "External id": 163792, "cbid": 211, "correlation": 163792 } }, { "ph": "s", "id": 163792, "pid": 76337, "tid": -914061504, "ts": 1716454223874959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454223983886, "dur": 3, "args": { "External id": 163794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163794, "pid": 5, "tid": 7, "ts": 1716454223983886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874968, "dur": 16, "args": { "External id": 163794, "cbid": 211, "correlation": 163794 } }, { "ph": "s", "id": 163794, "pid": 76337, "tid": -914061504, "ts": 1716454223874968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223874987, "dur": 0, "args": { "External id": 163795, "cbid": 51, "correlation": 163795 } }, { "ph": "s", "id": 163795, "pid": 76337, "tid": -914061504, "ts": 1716454223874987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454223983891, "dur": 706, "args": { "External id": 163796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163796, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163796, "pid": 5, "tid": 7, "ts": 1716454223983891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223874988, "dur": 5, "args": { "External id": 163796, "cbid": 211, "correlation": 163796 } }, { "ph": "s", "id": 163796, "pid": 76337, "tid": -914061504, "ts": 1716454223874988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223984598, "dur": 60, "args": { "External id": 163801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163801, "pid": 5, "tid": 7, "ts": 1716454223984598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875017, "dur": 9, "args": { "External id": 163801, "cbid": 211, "correlation": 163801 } }, { "ph": "s", "id": 163801, "pid": 76337, "tid": -914061504, "ts": 1716454223875017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223875075, "dur": 0, "args": { "External id": 163811, "cbid": 317, "correlation": 163811 } }, { "ph": "f", "id": 163811, "pid": 76337, "tid": -914061504, "ts": 1716454223875075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223875075, "dur": 0, "args": { "External id": 163812, "cbid": 203, "correlation": 163812 } }, { "ph": "f", "id": 163812, "pid": 76337, "tid": -914061504, "ts": 1716454223875075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223875076, "dur": 0, "args": { "External id": 163813, "cbid": 205, "correlation": 163813 } }, { "ph": "f", "id": 163813, "pid": 76337, "tid": -914061504, "ts": 1716454223875076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223984659, "dur": 74, "args": { "External id": 163817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163817, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163817, "pid": 5, "tid": 7, "ts": 1716454223984659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875089, "dur": 12, "args": { "External id": 163817, "cbid": 211, "correlation": 163817 } }, { "ph": "s", "id": 163817, "pid": 76337, "tid": -914061504, "ts": 1716454223875089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454223984734, "dur": 208, "args": { "External id": 163819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163819, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163819, "pid": 5, "tid": 7, "ts": 1716454223984734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875109, "dur": 8, "args": { "External id": 163819, "cbid": 211, "correlation": 163819 } }, { "ph": "s", "id": 163819, "pid": 76337, "tid": -914061504, "ts": 1716454223875109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454223984944, "dur": 39, "args": { "External id": 163821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163821, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163821, "pid": 5, "tid": 7, "ts": 1716454223984944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875121, "dur": 122, "args": { "External id": 163821, "cbid": 211, "correlation": 163821 } }, { "ph": "s", "id": 163821, "pid": 76337, "tid": -914061504, "ts": 1716454223875121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223984983, "dur": 59, "args": { "External id": 163827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163827, "pid": 5, "tid": 7, "ts": 1716454223984983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875267, "dur": 9, "args": { "External id": 163827, "cbid": 211, "correlation": 163827 } }, { "ph": "s", "id": 163827, "pid": 76337, "tid": -914061504, "ts": 1716454223875267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223985044, "dur": 50, "args": { "External id": 163835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163835, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163835, "pid": 5, "tid": 7, "ts": 1716454223985044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875297, "dur": 8, "args": { "External id": 163835, "cbid": 211, "correlation": 163835 } }, { "ph": "s", "id": 163835, "pid": 76337, "tid": -914061504, "ts": 1716454223875297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454223985095, "dur": 35, "args": { "External id": 163843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163843, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163843, "pid": 5, "tid": 7, "ts": 1716454223985095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875326, "dur": 28, "args": { "External id": 163843, "cbid": 211, "correlation": 163843 } }, { "ph": "s", "id": 163843, "pid": 76337, "tid": -914061504, "ts": 1716454223875326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223985131, "dur": 51, "args": { "External id": 163863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163863, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 163863, "pid": 5, "tid": 7, "ts": 1716454223985131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875430, "dur": 13, "args": { "External id": 163863, "cbid": 211, "correlation": 163863 } }, { "ph": "s", "id": 163863, "pid": 76337, "tid": -914061504, "ts": 1716454223875430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454223985184, "dur": 4, "args": { "External id": 163875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163875, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 163875, "pid": 5, "tid": 7, "ts": 1716454223985184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875453, "dur": 6, "args": { "External id": 163875, "cbid": 211, "correlation": 163875 } }, { "ph": "s", "id": 163875, "pid": 76337, "tid": -914061504, "ts": 1716454223875453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223985190, "dur": 56, "args": { "External id": 163878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163878, "pid": 5, "tid": 7, "ts": 1716454223985190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875471, "dur": 6, "args": { "External id": 163878, "cbid": 211, "correlation": 163878 } }, { "ph": "s", "id": 163878, "pid": 76337, "tid": -914061504, "ts": 1716454223875471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223875528, "dur": 0, "args": { "External id": 163889, "cbid": 317, "correlation": 163889 } }, { "ph": "f", "id": 163889, "pid": 76337, "tid": -914061504, "ts": 1716454223875528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223875529, "dur": 0, "args": { "External id": 163890, "cbid": 203, "correlation": 163890 } }, { "ph": "f", "id": 163890, "pid": 76337, "tid": -914061504, "ts": 1716454223875529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223875529, "dur": 0, "args": { "External id": 163891, "cbid": 205, "correlation": 163891 } }, { "ph": "f", "id": 163891, "pid": 76337, "tid": -914061504, "ts": 1716454223875529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875552, "dur": 1, "args": { "External id": 163895, "cbid": 251, "correlation": 163895 } }, { "ph": "f", "id": 163895, "pid": 76337, "tid": -914061504, "ts": 1716454223875552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875555, "dur": 0, "args": { "External id": 163896, "cbid": 251, "correlation": 163896 } }, { "ph": "f", "id": 163896, "pid": 76337, "tid": -914061504, "ts": 1716454223875555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875555, "dur": 0, "args": { "External id": 163897, "cbid": 251, "correlation": 163897 } }, { "ph": "f", "id": 163897, "pid": 76337, "tid": -914061504, "ts": 1716454223875555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875557, "dur": 0, "args": { "External id": 163898, "cbid": 251, "correlation": 163898 } }, { "ph": "f", "id": 163898, "pid": 76337, "tid": -914061504, "ts": 1716454223875557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875557, "dur": 0, "args": { "External id": 163899, "cbid": 251, "correlation": 163899 } }, { "ph": "f", "id": 163899, "pid": 76337, "tid": -914061504, "ts": 1716454223875557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875558, "dur": 0, "args": { "External id": 163900, "cbid": 251, "correlation": 163900 } }, { "ph": "f", "id": 163900, "pid": 76337, "tid": -914061504, "ts": 1716454223875558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875559, "dur": 0, "args": { "External id": 163901, "cbid": 251, "correlation": 163901 } }, { "ph": "f", "id": 163901, "pid": 76337, "tid": -914061504, "ts": 1716454223875559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875560, "dur": 0, "args": { "External id": 163902, "cbid": 251, "correlation": 163902 } }, { "ph": "f", "id": 163902, "pid": 76337, "tid": -914061504, "ts": 1716454223875560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875562, "dur": 0, "args": { "External id": 163903, "cbid": 251, "correlation": 163903 } }, { "ph": "f", "id": 163903, "pid": 76337, "tid": -914061504, "ts": 1716454223875562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454223985247, "dur": 114, "args": { "External id": 163904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163904, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163904, "pid": 5, "tid": 7, "ts": 1716454223985247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875564, "dur": 12, "args": { "External id": 163904, "cbid": 211, "correlation": 163904 } }, { "ph": "s", "id": 163904, "pid": 76337, "tid": -914061504, "ts": 1716454223875564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454223985362, "dur": 59, "args": { "External id": 163910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163910, "pid": 5, "tid": 7, "ts": 1716454223985362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875599, "dur": 9, "args": { "External id": 163910, "cbid": 211, "correlation": 163910 } }, { "ph": "s", "id": 163910, "pid": 76337, "tid": -914061504, "ts": 1716454223875599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454223985423, "dur": 631, "args": { "External id": 163919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163919, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163919, "pid": 5, "tid": 7, "ts": 1716454223985423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875682, "dur": 14, "args": { "External id": 163919, "cbid": 211, "correlation": 163919 } }, { "ph": "s", "id": 163919, "pid": 76337, "tid": -914061504, "ts": 1716454223875682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454223986056, "dur": 184, "args": { "External id": 163941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163941, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 163941, "pid": 5, "tid": 7, "ts": 1716454223986056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875739, "dur": 10, "args": { "External id": 163941, "cbid": 211, "correlation": 163941 } }, { "ph": "s", "id": 163941, "pid": 76337, "tid": -914061504, "ts": 1716454223875739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875826, "dur": 1, "args": { "External id": 163952, "cbid": 251, "correlation": 163952 } }, { "ph": "f", "id": 163952, "pid": 76337, "tid": -914061504, "ts": 1716454223875826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223986241, "dur": 199, "args": { "External id": 163953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163953, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163953, "pid": 5, "tid": 7, "ts": 1716454223986241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875831, "dur": 14, "args": { "External id": 163953, "cbid": 211, "correlation": 163953 } }, { "ph": "s", "id": 163953, "pid": 76337, "tid": -914061504, "ts": 1716454223875831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875901, "dur": 1, "args": { "External id": 163964, "cbid": 251, "correlation": 163964 } }, { "ph": "f", "id": 163964, "pid": 76337, "tid": -914061504, "ts": 1716454223875901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223986441, "dur": 189, "args": { "External id": 163965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163965, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163965, "pid": 5, "tid": 7, "ts": 1716454223986441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223875906, "dur": 12, "args": { "External id": 163965, "cbid": 211, "correlation": 163965 } }, { "ph": "s", "id": 163965, "pid": 76337, "tid": -914061504, "ts": 1716454223875906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223875969, "dur": 1, "args": { "External id": 163976, "cbid": 251, "correlation": 163976 } }, { "ph": "f", "id": 163976, "pid": 76337, "tid": -914061504, "ts": 1716454223875969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454223986632, "dur": 192, "args": { "External id": 163977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163977, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 163977, "pid": 5, "tid": 7, "ts": 1716454223986632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876080, "dur": 13, "args": { "External id": 163977, "cbid": 211, "correlation": 163977 } }, { "ph": "s", "id": 163977, "pid": 76337, "tid": -914061504, "ts": 1716454223876080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454223986825, "dur": 18867, "args": { "External id": 163998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 163998, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 163998, "pid": 5, "tid": 7, "ts": 1716454223986825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876173, "dur": 12, "args": { "External id": 163998, "cbid": 211, "correlation": 163998 } }, { "ph": "s", "id": 163998, "pid": 76337, "tid": -914061504, "ts": 1716454223876173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223876275, "dur": 1, "args": { "External id": 164016, "cbid": 251, "correlation": 164016 } }, { "ph": "f", "id": 164016, "pid": 76337, "tid": -914061504, "ts": 1716454223876275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224005694, "dur": 203, "args": { "External id": 164018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164018, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164018, "pid": 5, "tid": 7, "ts": 1716454224005694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876281, "dur": 13, "args": { "External id": 164018, "cbid": 211, "correlation": 164018 } }, { "ph": "s", "id": 164018, "pid": 76337, "tid": -914061504, "ts": 1716454223876281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224005898, "dur": 66, "args": { "External id": 164026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164026, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164026, "pid": 5, "tid": 7, "ts": 1716454224005898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876352, "dur": 12, "args": { "External id": 164026, "cbid": 211, "correlation": 164026 } }, { "ph": "s", "id": 164026, "pid": 76337, "tid": -914061504, "ts": 1716454223876352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224005966, "dur": 96, "args": { "External id": 164034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164034, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164034, "pid": 5, "tid": 7, "ts": 1716454224005966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876391, "dur": 10, "args": { "External id": 164034, "cbid": 211, "correlation": 164034 } }, { "ph": "s", "id": 164034, "pid": 76337, "tid": -914061504, "ts": 1716454223876391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224006064, "dur": 55, "args": { "External id": 164045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164045, "pid": 5, "tid": 7, "ts": 1716454224006064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876464, "dur": 13, "args": { "External id": 164045, "cbid": 211, "correlation": 164045 } }, { "ph": "s", "id": 164045, "pid": 76337, "tid": -914061504, "ts": 1716454223876464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224006120, "dur": 93, "args": { "External id": 164067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164067, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164067, "pid": 5, "tid": 7, "ts": 1716454224006120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223876496, "dur": 1854, "args": { "External id": 164067, "cbid": 211, "correlation": 164067 } }, { "ph": "s", "id": 164067, "pid": 76337, "tid": -914061504, "ts": 1716454223876496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878429, "dur": 1, "args": { "External id": 164078, "cbid": 251, "correlation": 164078 } }, { "ph": "f", "id": 164078, "pid": 76337, "tid": -914061504, "ts": 1716454223878429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224006215, "dur": 107, "args": { "External id": 164079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164079, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164079, "pid": 5, "tid": 7, "ts": 1716454224006215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878434, "dur": 64, "args": { "External id": 164079, "cbid": 211, "correlation": 164079 } }, { "ph": "s", "id": 164079, "pid": 76337, "tid": -914061504, "ts": 1716454223878434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878560, "dur": 1, "args": { "External id": 164090, "cbid": 251, "correlation": 164090 } }, { "ph": "f", "id": 164090, "pid": 76337, "tid": -914061504, "ts": 1716454223878560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878564, "dur": 0, "args": { "External id": 164091, "cbid": 251, "correlation": 164091 } }, { "ph": "f", "id": 164091, "pid": 76337, "tid": -914061504, "ts": 1716454223878564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224006322, "dur": 10, "args": { "External id": 164092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164092, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 164092, "pid": 5, "tid": 7, "ts": 1716454224006322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878566, "dur": 13, "args": { "External id": 164092, "cbid": 211, "correlation": 164092 } }, { "ph": "s", "id": 164092, "pid": 76337, "tid": -914061504, "ts": 1716454223878566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224006334, "dur": 5, "args": { "External id": 164094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164094, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164094, "pid": 5, "tid": 7, "ts": 1716454224006334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878580, "dur": 6, "args": { "External id": 164094, "cbid": 211, "correlation": 164094 } }, { "ph": "s", "id": 164094, "pid": 76337, "tid": -914061504, "ts": 1716454223878580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878641, "dur": 1, "args": { "External id": 164105, "cbid": 251, "correlation": 164105 } }, { "ph": "f", "id": 164105, "pid": 76337, "tid": -914061504, "ts": 1716454223878641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878645, "dur": 0, "args": { "External id": 164106, "cbid": 251, "correlation": 164106 } }, { "ph": "f", "id": 164106, "pid": 76337, "tid": -914061504, "ts": 1716454223878645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224006340, "dur": 6, "args": { "External id": 164107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164107, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 164107, "pid": 5, "tid": 7, "ts": 1716454224006340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878646, "dur": 12, "args": { "External id": 164107, "cbid": 211, "correlation": 164107 } }, { "ph": "s", "id": 164107, "pid": 76337, "tid": -914061504, "ts": 1716454223878646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224006348, "dur": 4, "args": { "External id": 164109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164109, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164109, "pid": 5, "tid": 7, "ts": 1716454224006348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878659, "dur": 6, "args": { "External id": 164109, "cbid": 211, "correlation": 164109 } }, { "ph": "s", "id": 164109, "pid": 76337, "tid": -914061504, "ts": 1716454223878659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224006353, "dur": 158, "args": { "External id": 164130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164130, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164130, "pid": 5, "tid": 7, "ts": 1716454224006353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878734, "dur": 12, "args": { "External id": 164130, "cbid": 211, "correlation": 164130 } }, { "ph": "s", "id": 164130, "pid": 76337, "tid": -914061504, "ts": 1716454223878734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223878831, "dur": 2, "args": { "External id": 164148, "cbid": 251, "correlation": 164148 } }, { "ph": "f", "id": 164148, "pid": 76337, "tid": -914061504, "ts": 1716454223878831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224006512, "dur": 107, "args": { "External id": 164150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164150, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164150, "pid": 5, "tid": 7, "ts": 1716454224006512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878837, "dur": 13, "args": { "External id": 164150, "cbid": 211, "correlation": 164150 } }, { "ph": "s", "id": 164150, "pid": 76337, "tid": -914061504, "ts": 1716454223878837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224006621, "dur": 35, "args": { "External id": 164158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164158, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164158, "pid": 5, "tid": 7, "ts": 1716454224006621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878907, "dur": 13, "args": { "External id": 164158, "cbid": 211, "correlation": 164158 } }, { "ph": "s", "id": 164158, "pid": 76337, "tid": -914061504, "ts": 1716454223878907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224006657, "dur": 68, "args": { "External id": 164166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164166, "pid": 5, "tid": 7, "ts": 1716454224006657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223878949, "dur": 9, "args": { "External id": 164166, "cbid": 211, "correlation": 164166 } }, { "ph": "s", "id": 164166, "pid": 76337, "tid": -914061504, "ts": 1716454223878949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224006726, "dur": 93, "args": { "External id": 164188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164188, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164188, "pid": 5, "tid": 7, "ts": 1716454224006726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879009, "dur": 11, "args": { "External id": 164188, "cbid": 211, "correlation": 164188 } }, { "ph": "s", "id": 164188, "pid": 76337, "tid": -914061504, "ts": 1716454223879009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879097, "dur": 1, "args": { "External id": 164204, "cbid": 251, "correlation": 164204 } }, { "ph": "f", "id": 164204, "pid": 76337, "tid": -914061504, "ts": 1716454223879097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224006821, "dur": 579, "args": { "External id": 164206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164206, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164206, "pid": 5, "tid": 7, "ts": 1716454224006821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879103, "dur": 13, "args": { "External id": 164206, "cbid": 211, "correlation": 164206 } }, { "ph": "s", "id": 164206, "pid": 76337, "tid": -914061504, "ts": 1716454223879103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224007401, "dur": 244, "args": { "External id": 164214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164214, "pid": 5, "tid": 7, "ts": 1716454224007401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879169, "dur": 12, "args": { "External id": 164214, "cbid": 211, "correlation": 164214 } }, { "ph": "s", "id": 164214, "pid": 76337, "tid": -914061504, "ts": 1716454223879169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224007647, "dur": 252, "args": { "External id": 164222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164222, "pid": 5, "tid": 7, "ts": 1716454224007647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879199, "dur": 9, "args": { "External id": 164222, "cbid": 211, "correlation": 164222 } }, { "ph": "s", "id": 164222, "pid": 76337, "tid": -914061504, "ts": 1716454223879199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879281, "dur": 1, "args": { "External id": 164238, "cbid": 251, "correlation": 164238 } }, { "ph": "f", "id": 164238, "pid": 76337, "tid": -914061504, "ts": 1716454223879281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879286, "dur": 0, "args": { "External id": 164240, "cbid": 251, "correlation": 164240 } }, { "ph": "f", "id": 164240, "pid": 76337, "tid": -914061504, "ts": 1716454223879286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224007900, "dur": 362, "args": { "External id": 164241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164241, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 164241, "pid": 5, "tid": 7, "ts": 1716454224007900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879289, "dur": 12, "args": { "External id": 164241, "cbid": 211, "correlation": 164241 } }, { "ph": "s", "id": 164241, "pid": 76337, "tid": -914061504, "ts": 1716454223879289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224008263, "dur": 50, "args": { "External id": 164249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164249, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164249, "pid": 5, "tid": 7, "ts": 1716454224008263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879330, "dur": 10, "args": { "External id": 164249, "cbid": 211, "correlation": 164249 } }, { "ph": "s", "id": 164249, "pid": 76337, "tid": -914061504, "ts": 1716454223879330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224008315, "dur": 159, "args": { "External id": 164260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164260, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164260, "pid": 5, "tid": 7, "ts": 1716454224008315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879397, "dur": 208, "args": { "External id": 164260, "cbid": 211, "correlation": 164260 } }, { "ph": "s", "id": 164260, "pid": 76337, "tid": -914061504, "ts": 1716454223879397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223879659, "dur": 0, "args": { "External id": 164272, "cbid": 317, "correlation": 164272 } }, { "ph": "f", "id": 164272, "pid": 76337, "tid": -914061504, "ts": 1716454223879659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223879660, "dur": 0, "args": { "External id": 164273, "cbid": 203, "correlation": 164273 } }, { "ph": "f", "id": 164273, "pid": 76337, "tid": -914061504, "ts": 1716454223879660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223879661, "dur": 0, "args": { "External id": 164274, "cbid": 205, "correlation": 164274 } }, { "ph": "f", "id": 164274, "pid": 76337, "tid": -914061504, "ts": 1716454223879661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879684, "dur": 1, "args": { "External id": 164278, "cbid": 251, "correlation": 164278 } }, { "ph": "f", "id": 164278, "pid": 76337, "tid": -914061504, "ts": 1716454223879684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879686, "dur": 0, "args": { "External id": 164279, "cbid": 251, "correlation": 164279 } }, { "ph": "f", "id": 164279, "pid": 76337, "tid": -914061504, "ts": 1716454223879686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879687, "dur": 0, "args": { "External id": 164280, "cbid": 251, "correlation": 164280 } }, { "ph": "f", "id": 164280, "pid": 76337, "tid": -914061504, "ts": 1716454223879687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879688, "dur": 0, "args": { "External id": 164281, "cbid": 251, "correlation": 164281 } }, { "ph": "f", "id": 164281, "pid": 76337, "tid": -914061504, "ts": 1716454223879688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879688, "dur": 0, "args": { "External id": 164282, "cbid": 251, "correlation": 164282 } }, { "ph": "f", "id": 164282, "pid": 76337, "tid": -914061504, "ts": 1716454223879688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879689, "dur": 0, "args": { "External id": 164283, "cbid": 251, "correlation": 164283 } }, { "ph": "f", "id": 164283, "pid": 76337, "tid": -914061504, "ts": 1716454223879689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879690, "dur": 0, "args": { "External id": 164284, "cbid": 251, "correlation": 164284 } }, { "ph": "f", "id": 164284, "pid": 76337, "tid": -914061504, "ts": 1716454223879690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879691, "dur": 0, "args": { "External id": 164285, "cbid": 251, "correlation": 164285 } }, { "ph": "f", "id": 164285, "pid": 76337, "tid": -914061504, "ts": 1716454223879691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223879692, "dur": 0, "args": { "External id": 164286, "cbid": 251, "correlation": 164286 } }, { "ph": "f", "id": 164286, "pid": 76337, "tid": -914061504, "ts": 1716454223879692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224008475, "dur": 115, "args": { "External id": 164287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164287, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164287, "pid": 5, "tid": 7, "ts": 1716454224008475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879694, "dur": 38, "args": { "External id": 164287, "cbid": 211, "correlation": 164287 } }, { "ph": "s", "id": 164287, "pid": 76337, "tid": -914061504, "ts": 1716454223879694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224008592, "dur": 60, "args": { "External id": 164293, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164293, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164293, "pid": 5, "tid": 7, "ts": 1716454224008592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879755, "dur": 104, "args": { "External id": 164293, "cbid": 211, "correlation": 164293 } }, { "ph": "s", "id": 164293, "pid": 76337, "tid": -914061504, "ts": 1716454223879755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224008653, "dur": 50, "args": { "External id": 164301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164301, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164301, "pid": 5, "tid": 7, "ts": 1716454224008653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223879883, "dur": 283, "args": { "External id": 164301, "cbid": 211, "correlation": 164301 } }, { "ph": "s", "id": 164301, "pid": 76337, "tid": -914061504, "ts": 1716454223879883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224008704, "dur": 99, "args": { "External id": 164310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164310, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164310, "pid": 5, "tid": 7, "ts": 1716454224008704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880198, "dur": 10, "args": { "External id": 164310, "cbid": 211, "correlation": 164310 } }, { "ph": "s", "id": 164310, "pid": 76337, "tid": -914061504, "ts": 1716454223880198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224008805, "dur": 92, "args": { "External id": 164330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164330, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 164330, "pid": 5, "tid": 7, "ts": 1716454224008805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880269, "dur": 12, "args": { "External id": 164330, "cbid": 211, "correlation": 164330 } }, { "ph": "s", "id": 164330, "pid": 76337, "tid": -914061504, "ts": 1716454223880269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224008898, "dur": 5, "args": { "External id": 164342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164342, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164342, "pid": 5, "tid": 7, "ts": 1716454224008898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880291, "dur": 10, "args": { "External id": 164342, "cbid": 211, "correlation": 164342 } }, { "ph": "s", "id": 164342, "pid": 76337, "tid": -914061504, "ts": 1716454223880291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224008904, "dur": 112, "args": { "External id": 164345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164345, "pid": 5, "tid": 7, "ts": 1716454224008904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880313, "dur": 108, "args": { "External id": 164345, "cbid": 211, "correlation": 164345 } }, { "ph": "s", "id": 164345, "pid": 76337, "tid": -914061504, "ts": 1716454223880313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224009017, "dur": 69, "args": { "External id": 164354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164354, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164354, "pid": 5, "tid": 7, "ts": 1716454224009017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880456, "dur": 10, "args": { "External id": 164354, "cbid": 211, "correlation": 164354 } }, { "ph": "s", "id": 164354, "pid": 76337, "tid": -914061504, "ts": 1716454223880456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223880508, "dur": 0, "args": { "External id": 164364, "cbid": 317, "correlation": 164364 } }, { "ph": "f", "id": 164364, "pid": 76337, "tid": -914061504, "ts": 1716454223880508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223880509, "dur": 0, "args": { "External id": 164365, "cbid": 203, "correlation": 164365 } }, { "ph": "f", "id": 164365, "pid": 76337, "tid": -914061504, "ts": 1716454223880509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223880509, "dur": 0, "args": { "External id": 164366, "cbid": 205, "correlation": 164366 } }, { "ph": "f", "id": 164366, "pid": 76337, "tid": -914061504, "ts": 1716454223880509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224009088, "dur": 76, "args": { "External id": 164370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164370, "pid": 5, "tid": 7, "ts": 1716454224009088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880525, "dur": 12, "args": { "External id": 164370, "cbid": 211, "correlation": 164370 } }, { "ph": "s", "id": 164370, "pid": 76337, "tid": -914061504, "ts": 1716454223880525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224009165, "dur": 25, "args": { "External id": 164372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164372, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164372, "pid": 5, "tid": 7, "ts": 1716454224009165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880539, "dur": 5, "args": { "External id": 164372, "cbid": 211, "correlation": 164372 } }, { "ph": "s", "id": 164372, "pid": 76337, "tid": -914061504, "ts": 1716454223880539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224009191, "dur": 4, "args": { "External id": 164374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 164374, "pid": 5, "tid": 7, "ts": 1716454224009191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880549, "dur": 5, "args": { "External id": 164374, "cbid": 211, "correlation": 164374 } }, { "ph": "s", "id": 164374, "pid": 76337, "tid": -914061504, "ts": 1716454223880549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223880558, "dur": 0, "args": { "External id": 164375, "cbid": 51, "correlation": 164375 } }, { "ph": "s", "id": 164375, "pid": 76337, "tid": -914061504, "ts": 1716454223880558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224009196, "dur": 1379, "args": { "External id": 164376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164376, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164376, "pid": 5, "tid": 7, "ts": 1716454224009196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880559, "dur": 5, "args": { "External id": 164376, "cbid": 211, "correlation": 164376 } }, { "ph": "s", "id": 164376, "pid": 76337, "tid": -914061504, "ts": 1716454223880559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224010576, "dur": 60, "args": { "External id": 164381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164381, "pid": 5, "tid": 7, "ts": 1716454224010576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880586, "dur": 9, "args": { "External id": 164381, "cbid": 211, "correlation": 164381 } }, { "ph": "s", "id": 164381, "pid": 76337, "tid": -914061504, "ts": 1716454223880586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224010638, "dur": 3, "args": { "External id": 164389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164389, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 164389, "pid": 5, "tid": 7, "ts": 1716454224010638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880630, "dur": 10, "args": { "External id": 164389, "cbid": 211, "correlation": 164389 } }, { "ph": "s", "id": 164389, "pid": 76337, "tid": -914061504, "ts": 1716454223880630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223880697, "dur": 1, "args": { "External id": 164405, "cbid": 251, "correlation": 164405 } }, { "ph": "f", "id": 164405, "pid": 76337, "tid": -914061504, "ts": 1716454223880697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223880703, "dur": 0, "args": { "External id": 164407, "cbid": 251, "correlation": 164407 } }, { "ph": "f", "id": 164407, "pid": 76337, "tid": -914061504, "ts": 1716454223880703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224010642, "dur": 11, "args": { "External id": 164408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164408, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 164408, "pid": 5, "tid": 7, "ts": 1716454224010642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880705, "dur": 12, "args": { "External id": 164408, "cbid": 211, "correlation": 164408 } }, { "ph": "s", "id": 164408, "pid": 76337, "tid": -914061504, "ts": 1716454223880705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224010655, "dur": 5, "args": { "External id": 164410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164410, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164410, "pid": 5, "tid": 7, "ts": 1716454224010655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880718, "dur": 6, "args": { "External id": 164410, "cbid": 211, "correlation": 164410 } }, { "ph": "s", "id": 164410, "pid": 76337, "tid": -914061504, "ts": 1716454223880718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224010662, "dur": 54, "args": { "External id": 164420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164420, "pid": 5, "tid": 7, "ts": 1716454224010662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223880777, "dur": 551, "args": { "External id": 164420, "cbid": 211, "correlation": 164420 } }, { "ph": "s", "id": 164420, "pid": 76337, "tid": -914061504, "ts": 1716454223880777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224010717, "dur": 53, "args": { "External id": 164440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164440, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 164440, "pid": 5, "tid": 7, "ts": 1716454224010717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881383, "dur": 11, "args": { "External id": 164440, "cbid": 211, "correlation": 164440 } }, { "ph": "s", "id": 164440, "pid": 76337, "tid": -914061504, "ts": 1716454223881383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224010771, "dur": 4, "args": { "External id": 164452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164452, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 164452, "pid": 5, "tid": 7, "ts": 1716454224010771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881403, "dur": 7, "args": { "External id": 164452, "cbid": 211, "correlation": 164452 } }, { "ph": "s", "id": 164452, "pid": 76337, "tid": -914061504, "ts": 1716454223881403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224010776, "dur": 56, "args": { "External id": 164455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164455, "pid": 5, "tid": 7, "ts": 1716454224010776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881423, "dur": 7, "args": { "External id": 164455, "cbid": 211, "correlation": 164455 } }, { "ph": "s", "id": 164455, "pid": 76337, "tid": -914061504, "ts": 1716454223881423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224010833, "dur": 37, "args": { "External id": 164464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164464, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164464, "pid": 5, "tid": 7, "ts": 1716454224010833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881463, "dur": 10, "args": { "External id": 164464, "cbid": 211, "correlation": 164464 } }, { "ph": "s", "id": 164464, "pid": 76337, "tid": -914061504, "ts": 1716454223881463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223881527, "dur": 0, "args": { "External id": 164474, "cbid": 317, "correlation": 164474 } }, { "ph": "f", "id": 164474, "pid": 76337, "tid": -914061504, "ts": 1716454223881527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223881528, "dur": 0, "args": { "External id": 164475, "cbid": 203, "correlation": 164475 } }, { "ph": "f", "id": 164475, "pid": 76337, "tid": -914061504, "ts": 1716454223881528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223881529, "dur": 0, "args": { "External id": 164476, "cbid": 205, "correlation": 164476 } }, { "ph": "f", "id": 164476, "pid": 76337, "tid": -914061504, "ts": 1716454223881529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224010871, "dur": 41, "args": { "External id": 164480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164480, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164480, "pid": 5, "tid": 7, "ts": 1716454224010871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881543, "dur": 12, "args": { "External id": 164480, "cbid": 211, "correlation": 164480 } }, { "ph": "s", "id": 164480, "pid": 76337, "tid": -914061504, "ts": 1716454223881543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224010914, "dur": 14, "args": { "External id": 164482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164482, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164482, "pid": 5, "tid": 7, "ts": 1716454224010914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881557, "dur": 6, "args": { "External id": 164482, "cbid": 211, "correlation": 164482 } }, { "ph": "s", "id": 164482, "pid": 76337, "tid": -914061504, "ts": 1716454223881557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224010929, "dur": 3, "args": { "External id": 164484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 164484, "pid": 5, "tid": 7, "ts": 1716454224010929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881567, "dur": 5, "args": { "External id": 164484, "cbid": 211, "correlation": 164484 } }, { "ph": "s", "id": 164484, "pid": 76337, "tid": -914061504, "ts": 1716454223881567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223881576, "dur": 0, "args": { "External id": 164485, "cbid": 51, "correlation": 164485 } }, { "ph": "s", "id": 164485, "pid": 76337, "tid": -914061504, "ts": 1716454223881576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224010934, "dur": 703, "args": { "External id": 164486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164486, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164486, "pid": 5, "tid": 7, "ts": 1716454224010934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881577, "dur": 5, "args": { "External id": 164486, "cbid": 211, "correlation": 164486 } }, { "ph": "s", "id": 164486, "pid": 76337, "tid": -914061504, "ts": 1716454223881577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224011639, "dur": 60, "args": { "External id": 164491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164491, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164491, "pid": 5, "tid": 7, "ts": 1716454224011639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881604, "dur": 9, "args": { "External id": 164491, "cbid": 211, "correlation": 164491 } }, { "ph": "s", "id": 164491, "pid": 76337, "tid": -914061504, "ts": 1716454223881604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223881662, "dur": 0, "args": { "External id": 164501, "cbid": 317, "correlation": 164501 } }, { "ph": "f", "id": 164501, "pid": 76337, "tid": -914061504, "ts": 1716454223881662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223881663, "dur": 0, "args": { "External id": 164502, "cbid": 203, "correlation": 164502 } }, { "ph": "f", "id": 164502, "pid": 76337, "tid": -914061504, "ts": 1716454223881663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223881663, "dur": 0, "args": { "External id": 164503, "cbid": 205, "correlation": 164503 } }, { "ph": "f", "id": 164503, "pid": 76337, "tid": -914061504, "ts": 1716454223881663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224011700, "dur": 75, "args": { "External id": 164507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164507, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164507, "pid": 5, "tid": 7, "ts": 1716454224011700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881676, "dur": 12, "args": { "External id": 164507, "cbid": 211, "correlation": 164507 } }, { "ph": "s", "id": 164507, "pid": 76337, "tid": -914061504, "ts": 1716454223881676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224011776, "dur": 211, "args": { "External id": 164509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164509, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164509, "pid": 5, "tid": 7, "ts": 1716454224011776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881694, "dur": 6, "args": { "External id": 164509, "cbid": 211, "correlation": 164509 } }, { "ph": "s", "id": 164509, "pid": 76337, "tid": -914061504, "ts": 1716454223881694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224011988, "dur": 39, "args": { "External id": 164511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164511, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164511, "pid": 5, "tid": 7, "ts": 1716454224011988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881705, "dur": 6, "args": { "External id": 164511, "cbid": 211, "correlation": 164511 } }, { "ph": "s", "id": 164511, "pid": 76337, "tid": -914061504, "ts": 1716454223881705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224012029, "dur": 60, "args": { "External id": 164517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164517, "pid": 5, "tid": 7, "ts": 1716454224012029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223881733, "dur": 510, "args": { "External id": 164517, "cbid": 211, "correlation": 164517 } }, { "ph": "s", "id": 164517, "pid": 76337, "tid": -914061504, "ts": 1716454223881733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224012090, "dur": 50, "args": { "External id": 164525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164525, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164525, "pid": 5, "tid": 7, "ts": 1716454224012090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882264, "dur": 8, "args": { "External id": 164525, "cbid": 211, "correlation": 164525 } }, { "ph": "s", "id": 164525, "pid": 76337, "tid": -914061504, "ts": 1716454223882264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224012142, "dur": 36, "args": { "External id": 164533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164533, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164533, "pid": 5, "tid": 7, "ts": 1716454224012142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882294, "dur": 8, "args": { "External id": 164533, "cbid": 211, "correlation": 164533 } }, { "ph": "s", "id": 164533, "pid": 76337, "tid": -914061504, "ts": 1716454223882294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224012179, "dur": 50, "args": { "External id": 164553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164553, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 164553, "pid": 5, "tid": 7, "ts": 1716454224012179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882377, "dur": 13, "args": { "External id": 164553, "cbid": 211, "correlation": 164553 } }, { "ph": "s", "id": 164553, "pid": 76337, "tid": -914061504, "ts": 1716454223882377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224012230, "dur": 4, "args": { "External id": 164565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164565, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 164565, "pid": 5, "tid": 7, "ts": 1716454224012230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882400, "dur": 6, "args": { "External id": 164565, "cbid": 211, "correlation": 164565 } }, { "ph": "s", "id": 164565, "pid": 76337, "tid": -914061504, "ts": 1716454223882400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224012236, "dur": 55, "args": { "External id": 164568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164568, "pid": 5, "tid": 7, "ts": 1716454224012236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882417, "dur": 7, "args": { "External id": 164568, "cbid": 211, "correlation": 164568 } }, { "ph": "s", "id": 164568, "pid": 76337, "tid": -914061504, "ts": 1716454223882417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223882474, "dur": 0, "args": { "External id": 164579, "cbid": 317, "correlation": 164579 } }, { "ph": "f", "id": 164579, "pid": 76337, "tid": -914061504, "ts": 1716454223882474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223882475, "dur": 0, "args": { "External id": 164580, "cbid": 203, "correlation": 164580 } }, { "ph": "f", "id": 164580, "pid": 76337, "tid": -914061504, "ts": 1716454223882475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223882476, "dur": 0, "args": { "External id": 164581, "cbid": 205, "correlation": 164581 } }, { "ph": "f", "id": 164581, "pid": 76337, "tid": -914061504, "ts": 1716454223882476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882498, "dur": 1, "args": { "External id": 164585, "cbid": 251, "correlation": 164585 } }, { "ph": "f", "id": 164585, "pid": 76337, "tid": -914061504, "ts": 1716454223882498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882500, "dur": 0, "args": { "External id": 164586, "cbid": 251, "correlation": 164586 } }, { "ph": "f", "id": 164586, "pid": 76337, "tid": -914061504, "ts": 1716454223882500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882501, "dur": 0, "args": { "External id": 164587, "cbid": 251, "correlation": 164587 } }, { "ph": "f", "id": 164587, "pid": 76337, "tid": -914061504, "ts": 1716454223882501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882502, "dur": 0, "args": { "External id": 164588, "cbid": 251, "correlation": 164588 } }, { "ph": "f", "id": 164588, "pid": 76337, "tid": -914061504, "ts": 1716454223882502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882502, "dur": 0, "args": { "External id": 164589, "cbid": 251, "correlation": 164589 } }, { "ph": "f", "id": 164589, "pid": 76337, "tid": -914061504, "ts": 1716454223882502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882503, "dur": 0, "args": { "External id": 164590, "cbid": 251, "correlation": 164590 } }, { "ph": "f", "id": 164590, "pid": 76337, "tid": -914061504, "ts": 1716454223882503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882504, "dur": 0, "args": { "External id": 164591, "cbid": 251, "correlation": 164591 } }, { "ph": "f", "id": 164591, "pid": 76337, "tid": -914061504, "ts": 1716454223882504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882505, "dur": 0, "args": { "External id": 164592, "cbid": 251, "correlation": 164592 } }, { "ph": "f", "id": 164592, "pid": 76337, "tid": -914061504, "ts": 1716454223882505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882506, "dur": 0, "args": { "External id": 164593, "cbid": 251, "correlation": 164593 } }, { "ph": "f", "id": 164593, "pid": 76337, "tid": -914061504, "ts": 1716454223882506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224012293, "dur": 113, "args": { "External id": 164594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164594, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164594, "pid": 5, "tid": 7, "ts": 1716454224012293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882508, "dur": 12, "args": { "External id": 164594, "cbid": 211, "correlation": 164594 } }, { "ph": "s", "id": 164594, "pid": 76337, "tid": -914061504, "ts": 1716454223882508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224012407, "dur": 61, "args": { "External id": 164600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164600, "pid": 5, "tid": 7, "ts": 1716454224012407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882543, "dur": 9, "args": { "External id": 164600, "cbid": 211, "correlation": 164600 } }, { "ph": "s", "id": 164600, "pid": 76337, "tid": -914061504, "ts": 1716454223882543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224012469, "dur": 598, "args": { "External id": 164609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164609, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164609, "pid": 5, "tid": 7, "ts": 1716454224012469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882625, "dur": 14, "args": { "External id": 164609, "cbid": 211, "correlation": 164609 } }, { "ph": "s", "id": 164609, "pid": 76337, "tid": -914061504, "ts": 1716454223882625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224013068, "dur": 183, "args": { "External id": 164631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164631, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164631, "pid": 5, "tid": 7, "ts": 1716454224013068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882682, "dur": 10, "args": { "External id": 164631, "cbid": 211, "correlation": 164631 } }, { "ph": "s", "id": 164631, "pid": 76337, "tid": -914061504, "ts": 1716454223882682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882768, "dur": 1, "args": { "External id": 164642, "cbid": 251, "correlation": 164642 } }, { "ph": "f", "id": 164642, "pid": 76337, "tid": -914061504, "ts": 1716454223882768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224013252, "dur": 197, "args": { "External id": 164643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164643, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164643, "pid": 5, "tid": 7, "ts": 1716454224013252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882773, "dur": 14, "args": { "External id": 164643, "cbid": 211, "correlation": 164643 } }, { "ph": "s", "id": 164643, "pid": 76337, "tid": -914061504, "ts": 1716454223882773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882841, "dur": 1, "args": { "External id": 164654, "cbid": 251, "correlation": 164654 } }, { "ph": "f", "id": 164654, "pid": 76337, "tid": -914061504, "ts": 1716454223882841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224013450, "dur": 191, "args": { "External id": 164655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164655, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164655, "pid": 5, "tid": 7, "ts": 1716454224013450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882845, "dur": 11, "args": { "External id": 164655, "cbid": 211, "correlation": 164655 } }, { "ph": "s", "id": 164655, "pid": 76337, "tid": -914061504, "ts": 1716454223882845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223882908, "dur": 1, "args": { "External id": 164666, "cbid": 251, "correlation": 164666 } }, { "ph": "f", "id": 164666, "pid": 76337, "tid": -914061504, "ts": 1716454223882908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224013643, "dur": 188, "args": { "External id": 164667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164667, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164667, "pid": 5, "tid": 7, "ts": 1716454224013643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223882912, "dur": 12, "args": { "External id": 164667, "cbid": 211, "correlation": 164667 } }, { "ph": "s", "id": 164667, "pid": 76337, "tid": -914061504, "ts": 1716454223882912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224013832, "dur": 18753, "args": { "External id": 164688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164688, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164688, "pid": 5, "tid": 7, "ts": 1716454224013832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223883001, "dur": 13, "args": { "External id": 164688, "cbid": 211, "correlation": 164688 } }, { "ph": "s", "id": 164688, "pid": 76337, "tid": -914061504, "ts": 1716454223883001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223883099, "dur": 1, "args": { "External id": 164706, "cbid": 251, "correlation": 164706 } }, { "ph": "f", "id": 164706, "pid": 76337, "tid": -914061504, "ts": 1716454223883099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224032586, "dur": 203, "args": { "External id": 164708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164708, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164708, "pid": 5, "tid": 7, "ts": 1716454224032586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223883105, "dur": 13, "args": { "External id": 164708, "cbid": 211, "correlation": 164708 } }, { "ph": "s", "id": 164708, "pid": 76337, "tid": -914061504, "ts": 1716454223883105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224032791, "dur": 67, "args": { "External id": 164716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164716, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164716, "pid": 5, "tid": 7, "ts": 1716454224032791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223883175, "dur": 12, "args": { "External id": 164716, "cbid": 211, "correlation": 164716 } }, { "ph": "s", "id": 164716, "pid": 76337, "tid": -914061504, "ts": 1716454223883175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224032859, "dur": 98, "args": { "External id": 164724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164724, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164724, "pid": 5, "tid": 7, "ts": 1716454224032859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223883214, "dur": 124, "args": { "External id": 164724, "cbid": 211, "correlation": 164724 } }, { "ph": "s", "id": 164724, "pid": 76337, "tid": -914061504, "ts": 1716454223883214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224032958, "dur": 55, "args": { "External id": 164735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164735, "pid": 5, "tid": 7, "ts": 1716454224032958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223883401, "dur": 1885, "args": { "External id": 164735, "cbid": 211, "correlation": 164735 } }, { "ph": "s", "id": 164735, "pid": 76337, "tid": -914061504, "ts": 1716454223883401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224033015, "dur": 93, "args": { "External id": 164757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164757, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164757, "pid": 5, "tid": 7, "ts": 1716454224033015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885306, "dur": 125, "args": { "External id": 164757, "cbid": 211, "correlation": 164757 } }, { "ph": "s", "id": 164757, "pid": 76337, "tid": -914061504, "ts": 1716454223885306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885509, "dur": 1, "args": { "External id": 164768, "cbid": 251, "correlation": 164768 } }, { "ph": "f", "id": 164768, "pid": 76337, "tid": -914061504, "ts": 1716454223885509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224033108, "dur": 104, "args": { "External id": 164769, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164769, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164769, "pid": 5, "tid": 7, "ts": 1716454224033108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885514, "dur": 13, "args": { "External id": 164769, "cbid": 211, "correlation": 164769 } }, { "ph": "s", "id": 164769, "pid": 76337, "tid": -914061504, "ts": 1716454223885514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885588, "dur": 1, "args": { "External id": 164780, "cbid": 251, "correlation": 164780 } }, { "ph": "f", "id": 164780, "pid": 76337, "tid": -914061504, "ts": 1716454223885588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885592, "dur": 0, "args": { "External id": 164781, "cbid": 251, "correlation": 164781 } }, { "ph": "f", "id": 164781, "pid": 76337, "tid": -914061504, "ts": 1716454223885592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224033214, "dur": 11, "args": { "External id": 164782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164782, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 164782, "pid": 5, "tid": 7, "ts": 1716454224033214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885593, "dur": 12, "args": { "External id": 164782, "cbid": 211, "correlation": 164782 } }, { "ph": "s", "id": 164782, "pid": 76337, "tid": -914061504, "ts": 1716454223885593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224033226, "dur": 5, "args": { "External id": 164784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164784, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164784, "pid": 5, "tid": 7, "ts": 1716454224033226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885607, "dur": 6, "args": { "External id": 164784, "cbid": 211, "correlation": 164784 } }, { "ph": "s", "id": 164784, "pid": 76337, "tid": -914061504, "ts": 1716454223885607, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885668, "dur": 1, "args": { "External id": 164795, "cbid": 251, "correlation": 164795 } }, { "ph": "f", "id": 164795, "pid": 76337, "tid": -914061504, "ts": 1716454223885668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885671, "dur": 0, "args": { "External id": 164796, "cbid": 251, "correlation": 164796 } }, { "ph": "f", "id": 164796, "pid": 76337, "tid": -914061504, "ts": 1716454223885671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224033233, "dur": 6, "args": { "External id": 164797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164797, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 164797, "pid": 5, "tid": 7, "ts": 1716454224033233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885673, "dur": 12, "args": { "External id": 164797, "cbid": 211, "correlation": 164797 } }, { "ph": "s", "id": 164797, "pid": 76337, "tid": -914061504, "ts": 1716454223885673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224033241, "dur": 3, "args": { "External id": 164799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164799, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 164799, "pid": 5, "tid": 7, "ts": 1716454224033241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885687, "dur": 7, "args": { "External id": 164799, "cbid": 211, "correlation": 164799 } }, { "ph": "s", "id": 164799, "pid": 76337, "tid": -914061504, "ts": 1716454223885687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224033245, "dur": 157, "args": { "External id": 164820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164820, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164820, "pid": 5, "tid": 7, "ts": 1716454224033245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885761, "dur": 12, "args": { "External id": 164820, "cbid": 211, "correlation": 164820 } }, { "ph": "s", "id": 164820, "pid": 76337, "tid": -914061504, "ts": 1716454223885761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223885857, "dur": 1, "args": { "External id": 164838, "cbid": 251, "correlation": 164838 } }, { "ph": "f", "id": 164838, "pid": 76337, "tid": -914061504, "ts": 1716454223885857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224033403, "dur": 109, "args": { "External id": 164840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164840, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164840, "pid": 5, "tid": 7, "ts": 1716454224033403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885863, "dur": 13, "args": { "External id": 164840, "cbid": 211, "correlation": 164840 } }, { "ph": "s", "id": 164840, "pid": 76337, "tid": -914061504, "ts": 1716454223885863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224033513, "dur": 35, "args": { "External id": 164848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164848, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164848, "pid": 5, "tid": 7, "ts": 1716454224033513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885932, "dur": 13, "args": { "External id": 164848, "cbid": 211, "correlation": 164848 } }, { "ph": "s", "id": 164848, "pid": 76337, "tid": -914061504, "ts": 1716454223885932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224033549, "dur": 67, "args": { "External id": 164856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164856, "pid": 5, "tid": 7, "ts": 1716454224033549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223885981, "dur": 11, "args": { "External id": 164856, "cbid": 211, "correlation": 164856 } }, { "ph": "s", "id": 164856, "pid": 76337, "tid": -914061504, "ts": 1716454223885981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224033618, "dur": 93, "args": { "External id": 164878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164878, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164878, "pid": 5, "tid": 7, "ts": 1716454224033618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886034, "dur": 11, "args": { "External id": 164878, "cbid": 211, "correlation": 164878 } }, { "ph": "s", "id": 164878, "pid": 76337, "tid": -914061504, "ts": 1716454223886034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886121, "dur": 1, "args": { "External id": 164894, "cbid": 251, "correlation": 164894 } }, { "ph": "f", "id": 164894, "pid": 76337, "tid": -914061504, "ts": 1716454223886121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224033712, "dur": 578, "args": { "External id": 164896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164896, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 164896, "pid": 5, "tid": 7, "ts": 1716454224033712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886127, "dur": 13, "args": { "External id": 164896, "cbid": 211, "correlation": 164896 } }, { "ph": "s", "id": 164896, "pid": 76337, "tid": -914061504, "ts": 1716454223886127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224034291, "dur": 243, "args": { "External id": 164904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164904, "pid": 5, "tid": 7, "ts": 1716454224034291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886192, "dur": 12, "args": { "External id": 164904, "cbid": 211, "correlation": 164904 } }, { "ph": "s", "id": 164904, "pid": 76337, "tid": -914061504, "ts": 1716454223886192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224034536, "dur": 254, "args": { "External id": 164912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164912, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164912, "pid": 5, "tid": 7, "ts": 1716454224034536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886222, "dur": 8, "args": { "External id": 164912, "cbid": 211, "correlation": 164912 } }, { "ph": "s", "id": 164912, "pid": 76337, "tid": -914061504, "ts": 1716454223886222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886304, "dur": 1, "args": { "External id": 164928, "cbid": 251, "correlation": 164928 } }, { "ph": "f", "id": 164928, "pid": 76337, "tid": -914061504, "ts": 1716454223886304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886309, "dur": 0, "args": { "External id": 164930, "cbid": 251, "correlation": 164930 } }, { "ph": "f", "id": 164930, "pid": 76337, "tid": -914061504, "ts": 1716454223886309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224034791, "dur": 361, "args": { "External id": 164931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164931, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 164931, "pid": 5, "tid": 7, "ts": 1716454224034791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886311, "dur": 12, "args": { "External id": 164931, "cbid": 211, "correlation": 164931 } }, { "ph": "s", "id": 164931, "pid": 76337, "tid": -914061504, "ts": 1716454223886311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224035153, "dur": 50, "args": { "External id": 164939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164939, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164939, "pid": 5, "tid": 7, "ts": 1716454224035153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886354, "dur": 185, "args": { "External id": 164939, "cbid": 211, "correlation": 164939 } }, { "ph": "s", "id": 164939, "pid": 76337, "tid": -914061504, "ts": 1716454223886354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224035204, "dur": 158, "args": { "External id": 164950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164950, "pid": 5, "tid": 7, "ts": 1716454224035204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886597, "dur": 71, "args": { "External id": 164950, "cbid": 211, "correlation": 164950 } }, { "ph": "s", "id": 164950, "pid": 76337, "tid": -914061504, "ts": 1716454223886597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223886722, "dur": 0, "args": { "External id": 164962, "cbid": 317, "correlation": 164962 } }, { "ph": "f", "id": 164962, "pid": 76337, "tid": -914061504, "ts": 1716454223886722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223886723, "dur": 0, "args": { "External id": 164963, "cbid": 203, "correlation": 164963 } }, { "ph": "f", "id": 164963, "pid": 76337, "tid": -914061504, "ts": 1716454223886723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223886724, "dur": 0, "args": { "External id": 164964, "cbid": 205, "correlation": 164964 } }, { "ph": "f", "id": 164964, "pid": 76337, "tid": -914061504, "ts": 1716454223886724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886747, "dur": 1, "args": { "External id": 164968, "cbid": 251, "correlation": 164968 } }, { "ph": "f", "id": 164968, "pid": 76337, "tid": -914061504, "ts": 1716454223886747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886749, "dur": 0, "args": { "External id": 164969, "cbid": 251, "correlation": 164969 } }, { "ph": "f", "id": 164969, "pid": 76337, "tid": -914061504, "ts": 1716454223886749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886750, "dur": 0, "args": { "External id": 164970, "cbid": 251, "correlation": 164970 } }, { "ph": "f", "id": 164970, "pid": 76337, "tid": -914061504, "ts": 1716454223886750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886750, "dur": 0, "args": { "External id": 164971, "cbid": 251, "correlation": 164971 } }, { "ph": "f", "id": 164971, "pid": 76337, "tid": -914061504, "ts": 1716454223886750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886751, "dur": 0, "args": { "External id": 164972, "cbid": 251, "correlation": 164972 } }, { "ph": "f", "id": 164972, "pid": 76337, "tid": -914061504, "ts": 1716454223886751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886752, "dur": 0, "args": { "External id": 164973, "cbid": 251, "correlation": 164973 } }, { "ph": "f", "id": 164973, "pid": 76337, "tid": -914061504, "ts": 1716454223886752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886753, "dur": 0, "args": { "External id": 164974, "cbid": 251, "correlation": 164974 } }, { "ph": "f", "id": 164974, "pid": 76337, "tid": -914061504, "ts": 1716454223886753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886753, "dur": 0, "args": { "External id": 164975, "cbid": 251, "correlation": 164975 } }, { "ph": "f", "id": 164975, "pid": 76337, "tid": -914061504, "ts": 1716454223886753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223886754, "dur": 0, "args": { "External id": 164976, "cbid": 251, "correlation": 164976 } }, { "ph": "f", "id": 164976, "pid": 76337, "tid": -914061504, "ts": 1716454223886754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224035364, "dur": 116, "args": { "External id": 164977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164977, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 164977, "pid": 5, "tid": 7, "ts": 1716454224035364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886757, "dur": 38, "args": { "External id": 164977, "cbid": 211, "correlation": 164977 } }, { "ph": "s", "id": 164977, "pid": 76337, "tid": -914061504, "ts": 1716454223886757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224035481, "dur": 59, "args": { "External id": 164983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164983, "pid": 5, "tid": 7, "ts": 1716454224035481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223886817, "dur": 283, "args": { "External id": 164983, "cbid": 211, "correlation": 164983 } }, { "ph": "s", "id": 164983, "pid": 76337, "tid": -914061504, "ts": 1716454223886817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224035542, "dur": 49, "args": { "External id": 164991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 164991, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 164991, "pid": 5, "tid": 7, "ts": 1716454224035542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887124, "dur": 9, "args": { "External id": 164991, "cbid": 211, "correlation": 164991 } }, { "ph": "s", "id": 164991, "pid": 76337, "tid": -914061504, "ts": 1716454223887124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224035593, "dur": 51, "args": { "External id": 165011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165011, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 165011, "pid": 5, "tid": 7, "ts": 1716454224035593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887206, "dur": 11, "args": { "External id": 165011, "cbid": 211, "correlation": 165011 } }, { "ph": "s", "id": 165011, "pid": 76337, "tid": -914061504, "ts": 1716454223887206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224035645, "dur": 4, "args": { "External id": 165023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165023, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 165023, "pid": 5, "tid": 7, "ts": 1716454224035645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887227, "dur": 6, "args": { "External id": 165023, "cbid": 211, "correlation": 165023 } }, { "ph": "s", "id": 165023, "pid": 76337, "tid": -914061504, "ts": 1716454223887227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224035651, "dur": 56, "args": { "External id": 165026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165026, "pid": 5, "tid": 7, "ts": 1716454224035651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887245, "dur": 111, "args": { "External id": 165026, "cbid": 211, "correlation": 165026 } }, { "ph": "s", "id": 165026, "pid": 76337, "tid": -914061504, "ts": 1716454223887245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224035708, "dur": 37, "args": { "External id": 165035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165035, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165035, "pid": 5, "tid": 7, "ts": 1716454224035708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887395, "dur": 11, "args": { "External id": 165035, "cbid": 211, "correlation": 165035 } }, { "ph": "s", "id": 165035, "pid": 76337, "tid": -914061504, "ts": 1716454223887395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454223887452, "dur": 0, "args": { "External id": 165045, "cbid": 317, "correlation": 165045 } }, { "ph": "f", "id": 165045, "pid": 76337, "tid": -914061504, "ts": 1716454223887452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454223887453, "dur": 0, "args": { "External id": 165046, "cbid": 203, "correlation": 165046 } }, { "ph": "f", "id": 165046, "pid": 76337, "tid": -914061504, "ts": 1716454223887453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454223887454, "dur": 0, "args": { "External id": 165047, "cbid": 205, "correlation": 165047 } }, { "ph": "f", "id": 165047, "pid": 76337, "tid": -914061504, "ts": 1716454223887454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224035747, "dur": 41, "args": { "External id": 165051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165051, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165051, "pid": 5, "tid": 7, "ts": 1716454224035747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887470, "dur": 12, "args": { "External id": 165051, "cbid": 211, "correlation": 165051 } }, { "ph": "s", "id": 165051, "pid": 76337, "tid": -914061504, "ts": 1716454223887470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224035789, "dur": 3, "args": { "External id": 165053, "device": 5, "context": 1, "stream": 7, "correlation": 165053, "bytes": 46080, "memory bandwidth (GB/s)": 11.80327868852459 } }, { "ph": "f", "id": 165053, "pid": 5, "tid": 7, "ts": 1716454224035789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223887486, "dur": 20, "args": { "External id": 165053, "cbid": 51, "correlation": 165053 } }, { "ph": "s", "id": 165053, "pid": 76337, "tid": -914061504, "ts": 1716454223887486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223887511, "dur": 2, "args": { "External id": 165055, "cbid": 200, "correlation": 165055 } }, { "ph": "f", "id": 165055, "pid": 76337, "tid": -914061504, "ts": 1716454223887511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223887514, "dur": 0, "args": { "External id": 165056, "cbid": 200, "correlation": 165056 } }, { "ph": "f", "id": 165056, "pid": 76337, "tid": -914061504, "ts": 1716454223887514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223887515, "dur": 0, "args": { "External id": 165057, "cbid": 200, "correlation": 165057 } }, { "ph": "f", "id": 165057, "pid": 76337, "tid": -914061504, "ts": 1716454223887515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454223887515, "dur": 0, "args": { "External id": 165058, "cbid": 200, "correlation": 165058 } }, { "ph": "f", "id": 165058, "pid": 76337, "tid": -914061504, "ts": 1716454223887515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454223887516, "dur": 4, "args": { "External id": 165059, "cbid": 15, "correlation": 165059 } }, { "ph": "f", "id": 165059, "pid": 76337, "tid": -914061504, "ts": 1716454223887516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454223887521, "dur": 1, "args": { "External id": 165060, "cbid": 251, "correlation": 165060 } }, { "ph": "f", "id": 165060, "pid": 76337, "tid": -914061504, "ts": 1716454223887521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454224035794, "dur": 24, "args": { "External id": 165061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165061, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165061, "pid": 5, "tid": 7, "ts": 1716454224035794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887524, "dur": 8, "args": { "External id": 165061, "cbid": 211, "correlation": 165061 } }, { "ph": "s", "id": 165061, "pid": 76337, "tid": -914061504, "ts": 1716454223887524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224035819, "dur": 4, "args": { "External id": 165063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 165063, "pid": 5, "tid": 7, "ts": 1716454224035819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887537, "dur": 6, "args": { "External id": 165063, "cbid": 211, "correlation": 165063 } }, { "ph": "s", "id": 165063, "pid": 76337, "tid": -914061504, "ts": 1716454223887537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223887546, "dur": 0, "args": { "External id": 165064, "cbid": 51, "correlation": 165064 } }, { "ph": "s", "id": 165064, "pid": 76337, "tid": -914061504, "ts": 1716454223887546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224035824, "dur": 190, "args": { "External id": 165065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165065, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165065, "pid": 5, "tid": 7, "ts": 1716454224035824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887547, "dur": 202, "args": { "External id": 165065, "cbid": 211, "correlation": 165065 } }, { "ph": "s", "id": 165065, "pid": 76337, "tid": -914061504, "ts": 1716454223887547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224036015, "dur": 6, "args": { "External id": 165066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165066, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165066, "pid": 5, "tid": 7, "ts": 1716454224036015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887753, "dur": 6, "args": { "External id": 165066, "cbid": 211, "correlation": 165066 } }, { "ph": "s", "id": 165066, "pid": 76337, "tid": -914061504, "ts": 1716454223887753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224036023, "dur": 5, "args": { "External id": 165072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 165072, "pid": 5, "tid": 7, "ts": 1716454224036023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223887784, "dur": 9, "args": { "External id": 165072, "cbid": 211, "correlation": 165072 } }, { "ph": "s", "id": 165072, "pid": 76337, "tid": -914061504, "ts": 1716454223887784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036029, "dur": 3, "args": { "External id": 165080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165080, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165080, "pid": 5, "tid": 7, "ts": 1716454224036029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889484, "dur": 15, "args": { "External id": 165080, "cbid": 211, "correlation": 165080 } }, { "ph": "s", "id": 165080, "pid": 76337, "tid": -914061504, "ts": 1716454223889484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036033, "dur": 3, "args": { "External id": 165088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165088, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165088, "pid": 5, "tid": 7, "ts": 1716454224036033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889524, "dur": 11, "args": { "External id": 165088, "cbid": 211, "correlation": 165088 } }, { "ph": "s", "id": 165088, "pid": 76337, "tid": -914061504, "ts": 1716454223889524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036038, "dur": 3, "args": { "External id": 165096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165096, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165096, "pid": 5, "tid": 7, "ts": 1716454224036038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889551, "dur": 8, "args": { "External id": 165096, "cbid": 211, "correlation": 165096 } }, { "ph": "s", "id": 165096, "pid": 76337, "tid": -914061504, "ts": 1716454223889551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036042, "dur": 3, "args": { "External id": 165105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165105, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165105, "pid": 5, "tid": 7, "ts": 1716454224036042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889730, "dur": 14, "args": { "External id": 165105, "cbid": 211, "correlation": 165105 } }, { "ph": "s", "id": 165105, "pid": 76337, "tid": -914061504, "ts": 1716454223889730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036047, "dur": 3, "args": { "External id": 165114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165114, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165114, "pid": 5, "tid": 7, "ts": 1716454224036047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889760, "dur": 7, "args": { "External id": 165114, "cbid": 211, "correlation": 165114 } }, { "ph": "s", "id": 165114, "pid": 76337, "tid": -914061504, "ts": 1716454223889760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036051, "dur": 3, "args": { "External id": 165122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165122, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165122, "pid": 5, "tid": 7, "ts": 1716454224036051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223889784, "dur": 8, "args": { "External id": 165122, "cbid": 211, "correlation": 165122 } }, { "ph": "s", "id": 165122, "pid": 76337, "tid": -914061504, "ts": 1716454223889784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036055, "dur": 3, "args": { "External id": 165130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165130, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165130, "pid": 5, "tid": 7, "ts": 1716454224036055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223890055, "dur": 16, "args": { "External id": 165130, "cbid": 211, "correlation": 165130 } }, { "ph": "s", "id": 165130, "pid": 76337, "tid": -914061504, "ts": 1716454223890055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036060, "dur": 3, "args": { "External id": 165138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165138, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165138, "pid": 5, "tid": 7, "ts": 1716454224036060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454223890087, "dur": 7, "args": { "External id": 165138, "cbid": 211, "correlation": 165138 } }, { "ph": "s", "id": 165138, "pid": 76337, "tid": -914061504, "ts": 1716454223890087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036065, "dur": 1, "args": { "External id": 165148, "device": 5, "context": 1, "stream": 7, "correlation": 165148, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 165148, "pid": 5, "tid": 7, "ts": 1716454224036065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454223890154, "dur": 35, "args": { "External id": 165148, "cbid": 41, "correlation": 165148 } }, { "ph": "s", "id": 165148, "pid": 76337, "tid": -914061504, "ts": 1716454223890154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454223890191, "dur": 145891, "args": { "External id": 165149, "cbid": 131, "correlation": 165149 } }, { "ph": "f", "id": 165149, "pid": 76337, "tid": -914061504, "ts": 1716454223890191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224036228, "dur": 3, "args": { "External id": 165157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165157, "pid": 5, "tid": 7, "ts": 1716454224036228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036206, "dur": 23, "args": { "External id": 165157, "cbid": 211, "correlation": 165157 } }, { "ph": "s", "id": 165157, "pid": 76337, "tid": -914061504, "ts": 1716454224036206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036317, "dur": 3, "args": { "External id": 165166, "device": 5, "context": 1, "stream": 7, "correlation": 165166, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 165166, "pid": 5, "tid": 7, "ts": 1716454224036317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036288, "dur": 28, "args": { "External id": 165166, "cbid": 41, "correlation": 165166 } }, { "ph": "s", "id": 165166, "pid": 76337, "tid": -914061504, "ts": 1716454224036288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224036406, "dur": 4, "args": { "External id": 165176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165176, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165176, "pid": 5, "tid": 7, "ts": 1716454224036406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036390, "dur": 16, "args": { "External id": 165176, "cbid": 211, "correlation": 165176 } }, { "ph": "s", "id": 165176, "pid": 76337, "tid": -914061504, "ts": 1716454224036390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036509, "dur": 1, "args": { "External id": 165186, "device": 5, "context": 1, "stream": 7, "correlation": 165186, "bytes": 8, "memory bandwidth (GB/s)": 0.004545454545454545 } }, { "ph": "f", "id": 165186, "pid": 5, "tid": 7, "ts": 1716454224036509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036491, "dur": 16, "args": { "External id": 165186, "cbid": 41, "correlation": 165186 } }, { "ph": "s", "id": 165186, "pid": 76337, "tid": -914061504, "ts": 1716454224036491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224036508, "dur": 9, "args": { "External id": 165187, "cbid": 131, "correlation": 165187 } }, { "ph": "f", "id": 165187, "pid": 76337, "tid": -914061504, "ts": 1716454224036508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036578, "dur": 3, "args": { "External id": 165194, "device": 5, "context": 1, "stream": 7, "correlation": 165194, "bytes": 98304, "memory bandwidth (GB/s)": 30.415841584158414 } }, { "ph": "f", "id": 165194, "pid": 5, "tid": 7, "ts": 1716454224036578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036555, "dur": 23, "args": { "External id": 165194, "cbid": 41, "correlation": 165194 } }, { "ph": "s", "id": 165194, "pid": 76337, "tid": -914061504, "ts": 1716454224036555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036663, "dur": 3, "args": { "External id": 165213, "device": 5, "context": 1, "stream": 7, "correlation": 165213, "bytes": 16, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 165213, "pid": 5, "tid": 7, "ts": 1716454224036663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036644, "dur": 18, "args": { "External id": 165213, "cbid": 41, "correlation": 165213 } }, { "ph": "s", "id": 165213, "pid": 76337, "tid": -914061504, "ts": 1716454224036644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224036702, "dur": 3, "args": { "External id": 165219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165219, "pid": 5, "tid": 7, "ts": 1716454224036702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036690, "dur": 12, "args": { "External id": 165219, "cbid": 211, "correlation": 165219 } }, { "ph": "s", "id": 165219, "pid": 76337, "tid": -914061504, "ts": 1716454224036690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454224036716, "dur": 6, "args": { "External id": 165221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165221, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 165221, "pid": 5, "tid": 7, "ts": 1716454224036716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036706, "dur": 9, "args": { "External id": 165221, "cbid": 211, "correlation": 165221 } }, { "ph": "s", "id": 165221, "pid": 76337, "tid": -914061504, "ts": 1716454224036706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454224036724, "dur": 3, "args": { "External id": 165223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165223, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165223, "pid": 5, "tid": 7, "ts": 1716454224036724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036716, "dur": 6, "args": { "External id": 165223, "cbid": 211, "correlation": 165223 } }, { "ph": "s", "id": 165223, "pid": 76337, "tid": -914061504, "ts": 1716454224036716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036757, "dur": 2, "args": { "External id": 165231, "device": 5, "context": 1, "stream": 7, "correlation": 165231, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 165231, "pid": 5, "tid": 7, "ts": 1716454224036757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036743, "dur": 13, "args": { "External id": 165231, "cbid": 41, "correlation": 165231 } }, { "ph": "s", "id": 165231, "pid": 76337, "tid": -914061504, "ts": 1716454224036743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224036802, "dur": 3, "args": { "External id": 165245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165245, "pid": 5, "tid": 7, "ts": 1716454224036802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036792, "dur": 12, "args": { "External id": 165245, "cbid": 211, "correlation": 165245 } }, { "ph": "s", "id": 165245, "pid": 76337, "tid": -914061504, "ts": 1716454224036792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224036825, "dur": 2, "args": { "External id": 165259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165259, "pid": 5, "tid": 7, "ts": 1716454224036825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036817, "dur": 6, "args": { "External id": 165259, "cbid": 211, "correlation": 165259 } }, { "ph": "s", "id": 165259, "pid": 76337, "tid": -914061504, "ts": 1716454224036817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224036874, "dur": 6, "args": { "External id": 165266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165266, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165266, "pid": 5, "tid": 7, "ts": 1716454224036874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036863, "dur": 12, "args": { "External id": 165266, "cbid": 211, "correlation": 165266 } }, { "ph": "s", "id": 165266, "pid": 76337, "tid": -914061504, "ts": 1716454224036863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224036885, "dur": 6, "args": { "External id": 165269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165269, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165269, "pid": 5, "tid": 7, "ts": 1716454224036885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036878, "dur": 7, "args": { "External id": 165269, "cbid": 211, "correlation": 165269 } }, { "ph": "s", "id": 165269, "pid": 76337, "tid": -914061504, "ts": 1716454224036878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454224036894, "dur": 3, "args": { "External id": 165271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165271, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165271, "pid": 5, "tid": 7, "ts": 1716454224036894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036886, "dur": 7, "args": { "External id": 165271, "cbid": 211, "correlation": 165271 } }, { "ph": "s", "id": 165271, "pid": 76337, "tid": -914061504, "ts": 1716454224036886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036918, "dur": 2, "args": { "External id": 165274, "device": 5, "context": 1, "stream": 7, "correlation": 165274, "bytes": 8, "memory bandwidth (GB/s)": 0.002777777777777778 } }, { "ph": "f", "id": 165274, "pid": 5, "tid": 7, "ts": 1716454224036918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036904, "dur": 13, "args": { "External id": 165274, "cbid": 41, "correlation": 165274 } }, { "ph": "s", "id": 165274, "pid": 76337, "tid": -914061504, "ts": 1716454224036904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224036969, "dur": 4, "args": { "External id": 165290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165290, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165290, "pid": 5, "tid": 7, "ts": 1716454224036969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224036957, "dur": 12, "args": { "External id": 165290, "cbid": 211, "correlation": 165290 } }, { "ph": "s", "id": 165290, "pid": 76337, "tid": -914061504, "ts": 1716454224036957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224036997, "dur": 3, "args": { "External id": 165295, "device": 5, "context": 1, "stream": 7, "correlation": 165295, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 165295, "pid": 5, "tid": 7, "ts": 1716454224036997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224036982, "dur": 14, "args": { "External id": 165295, "cbid": 41, "correlation": 165295 } }, { "ph": "s", "id": 165295, "pid": 76337, "tid": -914061504, "ts": 1716454224036982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224037023, "dur": 1, "args": { "External id": 165301, "device": 5, "context": 1, "stream": 7, "correlation": 165301, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 165301, "pid": 5, "tid": 7, "ts": 1716454224037023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224037006, "dur": 26, "args": { "External id": 165301, "cbid": 41, "correlation": 165301 } }, { "ph": "s", "id": 165301, "pid": 76337, "tid": -914061504, "ts": 1716454224037006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224037032, "dur": 3, "args": { "External id": 165302, "cbid": 131, "correlation": 165302 } }, { "ph": "f", "id": 165302, "pid": 76337, "tid": -914061504, "ts": 1716454224037032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037082, "dur": 3, "args": { "External id": 165310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165310, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165310, "pid": 5, "tid": 7, "ts": 1716454224037082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037069, "dur": 12, "args": { "External id": 165310, "cbid": 211, "correlation": 165310 } }, { "ph": "s", "id": 165310, "pid": 76337, "tid": -914061504, "ts": 1716454224037069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037113, "dur": 3, "args": { "External id": 165320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165320, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165320, "pid": 5, "tid": 7, "ts": 1716454224037113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037103, "dur": 9, "args": { "External id": 165320, "cbid": 211, "correlation": 165320 } }, { "ph": "s", "id": 165320, "pid": 76337, "tid": -914061504, "ts": 1716454224037103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037136, "dur": 3, "args": { "External id": 165329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165329, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165329, "pid": 5, "tid": 7, "ts": 1716454224037136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037127, "dur": 7, "args": { "External id": 165329, "cbid": 211, "correlation": 165329 } }, { "ph": "s", "id": 165329, "pid": 76337, "tid": -914061504, "ts": 1716454224037127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224037244, "dur": 12, "args": { "External id": 165339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165339, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165339, "pid": 5, "tid": 7, "ts": 1716454224037244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037227, "dur": 19, "args": { "External id": 165339, "cbid": 211, "correlation": 165339 } }, { "ph": "s", "id": 165339, "pid": 76337, "tid": -914061504, "ts": 1716454224037227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037283, "dur": 3, "args": { "External id": 165347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165347, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165347, "pid": 5, "tid": 7, "ts": 1716454224037283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037274, "dur": 8, "args": { "External id": 165347, "cbid": 211, "correlation": 165347 } }, { "ph": "s", "id": 165347, "pid": 76337, "tid": -914061504, "ts": 1716454224037274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224037327, "dur": 12, "args": { "External id": 165357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165357, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165357, "pid": 5, "tid": 7, "ts": 1716454224037327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037315, "dur": 11, "args": { "External id": 165357, "cbid": 211, "correlation": 165357 } }, { "ph": "s", "id": 165357, "pid": 76337, "tid": -914061504, "ts": 1716454224037315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224037356, "dur": 10, "args": { "External id": 165365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165365, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165365, "pid": 5, "tid": 7, "ts": 1716454224037356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037346, "dur": 8, "args": { "External id": 165365, "cbid": 211, "correlation": 165365 } }, { "ph": "s", "id": 165365, "pid": 76337, "tid": -914061504, "ts": 1716454224037346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037382, "dur": 3, "args": { "External id": 165374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165374, "pid": 5, "tid": 7, "ts": 1716454224037382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037373, "dur": 9, "args": { "External id": 165374, "cbid": 211, "correlation": 165374 } }, { "ph": "s", "id": 165374, "pid": 76337, "tid": -914061504, "ts": 1716454224037373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224037406, "dur": 5, "args": { "External id": 165383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165383, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165383, "pid": 5, "tid": 7, "ts": 1716454224037406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037397, "dur": 7, "args": { "External id": 165383, "cbid": 211, "correlation": 165383 } }, { "ph": "s", "id": 165383, "pid": 76337, "tid": -914061504, "ts": 1716454224037397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224037442, "dur": 8, "args": { "External id": 165393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165393, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165393, "pid": 5, "tid": 7, "ts": 1716454224037442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037432, "dur": 11, "args": { "External id": 165393, "cbid": 211, "correlation": 165393 } }, { "ph": "s", "id": 165393, "pid": 76337, "tid": -914061504, "ts": 1716454224037432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037754, "dur": 3, "args": { "External id": 165402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165402, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165402, "pid": 5, "tid": 7, "ts": 1716454224037754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037738, "dur": 16, "args": { "External id": 165402, "cbid": 211, "correlation": 165402 } }, { "ph": "s", "id": 165402, "pid": 76337, "tid": -914061504, "ts": 1716454224037738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037780, "dur": 3, "args": { "External id": 165410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165410, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165410, "pid": 5, "tid": 7, "ts": 1716454224037780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037771, "dur": 8, "args": { "External id": 165410, "cbid": 211, "correlation": 165410 } }, { "ph": "s", "id": 165410, "pid": 76337, "tid": -914061504, "ts": 1716454224037771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224037832, "dur": 1, "args": { "External id": 165420, "device": 5, "context": 1, "stream": 7, "correlation": 165420, "bytes": 8, "memory bandwidth (GB/s)": 0.0053226879574184965 } }, { "ph": "f", "id": 165420, "pid": 5, "tid": 7, "ts": 1716454224037832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224037817, "dur": 13, "args": { "External id": 165420, "cbid": 41, "correlation": 165420 } }, { "ph": "s", "id": 165420, "pid": 76337, "tid": -914061504, "ts": 1716454224037817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224037831, "dur": 8, "args": { "External id": 165421, "cbid": 131, "correlation": 165421 } }, { "ph": "f", "id": 165421, "pid": 76337, "tid": -914061504, "ts": 1716454224037831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224037921, "dur": 2, "args": { "External id": 165429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165429, "pid": 5, "tid": 7, "ts": 1716454224037921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224037907, "dur": 14, "args": { "External id": 165429, "cbid": 211, "correlation": 165429 } }, { "ph": "s", "id": 165429, "pid": 76337, "tid": -914061504, "ts": 1716454224037907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038003, "dur": 3, "args": { "External id": 165438, "device": 5, "context": 1, "stream": 7, "correlation": 165438, "bytes": 8, "memory bandwidth (GB/s)": 0.0026041666666666665 } }, { "ph": "f", "id": 165438, "pid": 5, "tid": 7, "ts": 1716454224038003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224037984, "dur": 19, "args": { "External id": 165438, "cbid": 41, "correlation": 165438 } }, { "ph": "s", "id": 165438, "pid": 76337, "tid": -914061504, "ts": 1716454224037984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224038075, "dur": 3, "args": { "External id": 165448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165448, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165448, "pid": 5, "tid": 7, "ts": 1716454224038075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038061, "dur": 13, "args": { "External id": 165448, "cbid": 211, "correlation": 165448 } }, { "ph": "s", "id": 165448, "pid": 76337, "tid": -914061504, "ts": 1716454224038061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038128, "dur": 1, "args": { "External id": 165458, "device": 5, "context": 1, "stream": 7, "correlation": 165458, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 165458, "pid": 5, "tid": 7, "ts": 1716454224038128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038113, "dur": 13, "args": { "External id": 165458, "cbid": 41, "correlation": 165458 } }, { "ph": "s", "id": 165458, "pid": 76337, "tid": -914061504, "ts": 1716454224038113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038127, "dur": 8, "args": { "External id": 165459, "cbid": 131, "correlation": 165459 } }, { "ph": "f", "id": 165459, "pid": 76337, "tid": -914061504, "ts": 1716454224038127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038188, "dur": 3, "args": { "External id": 165466, "device": 5, "context": 1, "stream": 7, "correlation": 165466, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 165466, "pid": 5, "tid": 7, "ts": 1716454224038188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038169, "dur": 18, "args": { "External id": 165466, "cbid": 41, "correlation": 165466 } }, { "ph": "s", "id": 165466, "pid": 76337, "tid": -914061504, "ts": 1716454224038169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038235, "dur": 1, "args": { "External id": 165477, "device": 5, "context": 1, "stream": 7, "correlation": 165477, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 165477, "pid": 5, "tid": 7, "ts": 1716454224038235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038223, "dur": 10, "args": { "External id": 165477, "cbid": 41, "correlation": 165477 } }, { "ph": "s", "id": 165477, "pid": 76337, "tid": -914061504, "ts": 1716454224038223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038234, "dur": 8, "args": { "External id": 165478, "cbid": 131, "correlation": 165478 } }, { "ph": "f", "id": 165478, "pid": 76337, "tid": -914061504, "ts": 1716454224038234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038283, "dur": 3, "args": { "External id": 165486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165486, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165486, "pid": 5, "tid": 7, "ts": 1716454224038283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038270, "dur": 13, "args": { "External id": 165486, "cbid": 211, "correlation": 165486 } }, { "ph": "s", "id": 165486, "pid": 76337, "tid": -914061504, "ts": 1716454224038270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038313, "dur": 3, "args": { "External id": 165496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165496, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165496, "pid": 5, "tid": 7, "ts": 1716454224038313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038303, "dur": 8, "args": { "External id": 165496, "cbid": 211, "correlation": 165496 } }, { "ph": "s", "id": 165496, "pid": 76337, "tid": -914061504, "ts": 1716454224038303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038334, "dur": 3, "args": { "External id": 165505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165505, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165505, "pid": 5, "tid": 7, "ts": 1716454224038334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038326, "dur": 7, "args": { "External id": 165505, "cbid": 211, "correlation": 165505 } }, { "ph": "s", "id": 165505, "pid": 76337, "tid": -914061504, "ts": 1716454224038326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224038402, "dur": 5, "args": { "External id": 165513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165513, "pid": 5, "tid": 7, "ts": 1716454224038402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038388, "dur": 14, "args": { "External id": 165513, "cbid": 211, "correlation": 165513 } }, { "ph": "s", "id": 165513, "pid": 76337, "tid": -914061504, "ts": 1716454224038388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038442, "dur": 3, "args": { "External id": 165522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165522, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165522, "pid": 5, "tid": 7, "ts": 1716454224038442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038432, "dur": 9, "args": { "External id": 165522, "cbid": 211, "correlation": 165522 } }, { "ph": "s", "id": 165522, "pid": 76337, "tid": -914061504, "ts": 1716454224038432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038465, "dur": 3, "args": { "External id": 165531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165531, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165531, "pid": 5, "tid": 7, "ts": 1716454224038465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038457, "dur": 7, "args": { "External id": 165531, "cbid": 211, "correlation": 165531 } }, { "ph": "s", "id": 165531, "pid": 76337, "tid": -914061504, "ts": 1716454224038457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224038527, "dur": 3, "args": { "External id": 165539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165539, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165539, "pid": 5, "tid": 7, "ts": 1716454224038527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038516, "dur": 10, "args": { "External id": 165539, "cbid": 211, "correlation": 165539 } }, { "ph": "s", "id": 165539, "pid": 76337, "tid": -914061504, "ts": 1716454224038516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224038585, "dur": 1, "args": { "External id": 165547, "device": 5, "context": 1, "stream": 7, "correlation": 165547, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 165547, "pid": 5, "tid": 7, "ts": 1716454224038585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038569, "dur": 26, "args": { "External id": 165547, "cbid": 41, "correlation": 165547 } }, { "ph": "s", "id": 165547, "pid": 76337, "tid": -914061504, "ts": 1716454224038569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038596, "dur": 4, "args": { "External id": 165548, "cbid": 131, "correlation": 165548 } }, { "ph": "f", "id": 165548, "pid": 76337, "tid": -914061504, "ts": 1716454224038596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038658, "dur": 1, "args": { "External id": 165558, "device": 5, "context": 1, "stream": 7, "correlation": 165558, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 165558, "pid": 5, "tid": 7, "ts": 1716454224038658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038645, "dur": 10, "args": { "External id": 165558, "cbid": 41, "correlation": 165558 } }, { "ph": "s", "id": 165558, "pid": 76337, "tid": -914061504, "ts": 1716454224038645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038656, "dur": 7, "args": { "External id": 165559, "cbid": 131, "correlation": 165559 } }, { "ph": "f", "id": 165559, "pid": 76337, "tid": -914061504, "ts": 1716454224038656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038710, "dur": 1, "args": { "External id": 165568, "device": 5, "context": 1, "stream": 7, "correlation": 165568, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 165568, "pid": 5, "tid": 7, "ts": 1716454224038710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038700, "dur": 8, "args": { "External id": 165568, "cbid": 41, "correlation": 165568 } }, { "ph": "s", "id": 165568, "pid": 76337, "tid": -914061504, "ts": 1716454224038700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038709, "dur": 8, "args": { "External id": 165569, "cbid": 131, "correlation": 165569 } }, { "ph": "f", "id": 165569, "pid": 76337, "tid": -914061504, "ts": 1716454224038709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224038783, "dur": 4, "args": { "External id": 165576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165576, "pid": 5, "tid": 7, "ts": 1716454224038783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038767, "dur": 17, "args": { "External id": 165576, "cbid": 211, "correlation": 165576 } }, { "ph": "s", "id": 165576, "pid": 76337, "tid": -914061504, "ts": 1716454224038767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454224038821, "dur": 4, "args": { "External id": 165596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165596, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165596, "pid": 5, "tid": 7, "ts": 1716454224038821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038809, "dur": 12, "args": { "External id": 165596, "cbid": 211, "correlation": 165596 } }, { "ph": "s", "id": 165596, "pid": 76337, "tid": -914061504, "ts": 1716454224038809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038822, "dur": 0, "args": { "External id": 165597, "cbid": 11, "correlation": 165597 } }, { "ph": "f", "id": 165597, "pid": 76337, "tid": -914061504, "ts": 1716454224038822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038823, "dur": 0, "args": { "External id": 165598, "cbid": 11, "correlation": 165598 } }, { "ph": "f", "id": 165598, "pid": 76337, "tid": -914061504, "ts": 1716454224038823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224038836, "dur": 1, "args": { "External id": 165601, "device": 5, "context": 1, "stream": 7, "correlation": 165601, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 165601, "pid": 5, "tid": 7, "ts": 1716454224038836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038824, "dur": 21, "args": { "External id": 165601, "cbid": 41, "correlation": 165601 } }, { "ph": "s", "id": 165601, "pid": 76337, "tid": -914061504, "ts": 1716454224038824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038846, "dur": 3, "args": { "External id": 165602, "cbid": 131, "correlation": 165602 } }, { "ph": "f", "id": 165602, "pid": 76337, "tid": -914061504, "ts": 1716454224038846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224038874, "dur": 3, "args": { "External id": 165626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165626, "pid": 5, "tid": 7, "ts": 1716454224038874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038864, "dur": 9, "args": { "External id": 165626, "cbid": 211, "correlation": 165626 } }, { "ph": "s", "id": 165626, "pid": 76337, "tid": -914061504, "ts": 1716454224038864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038874, "dur": 0, "args": { "External id": 165627, "cbid": 11, "correlation": 165627 } }, { "ph": "f", "id": 165627, "pid": 76337, "tid": -914061504, "ts": 1716454224038874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038874, "dur": 0, "args": { "External id": 165628, "cbid": 11, "correlation": 165628 } }, { "ph": "f", "id": 165628, "pid": 76337, "tid": -914061504, "ts": 1716454224038874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224038876, "dur": 1, "args": { "External id": 165630, "cbid": 200, "correlation": 165630 } }, { "ph": "f", "id": 165630, "pid": 76337, "tid": -914061504, "ts": 1716454224038876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454224038886, "dur": 4, "args": { "External id": 165632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165632, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165632, "pid": 5, "tid": 7, "ts": 1716454224038886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224038878, "dur": 7, "args": { "External id": 165632, "cbid": 211, "correlation": 165632 } }, { "ph": "s", "id": 165632, "pid": 76337, "tid": -914061504, "ts": 1716454224038878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038887, "dur": 0, "args": { "External id": 165633, "cbid": 11, "correlation": 165633 } }, { "ph": "f", "id": 165633, "pid": 76337, "tid": -914061504, "ts": 1716454224038887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224038887, "dur": 0, "args": { "External id": 165634, "cbid": 11, "correlation": 165634 } }, { "ph": "f", "id": 165634, "pid": 76337, "tid": -914061504, "ts": 1716454224038887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224038925, "dur": 1, "args": { "External id": 165641, "device": 5, "context": 1, "stream": 7, "correlation": 165641, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 165641, "pid": 5, "tid": 7, "ts": 1716454224038925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038914, "dur": 19, "args": { "External id": 165641, "cbid": 41, "correlation": 165641 } }, { "ph": "s", "id": 165641, "pid": 76337, "tid": -914061504, "ts": 1716454224038914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038934, "dur": 3, "args": { "External id": 165642, "cbid": 131, "correlation": 165642 } }, { "ph": "f", "id": 165642, "pid": 76337, "tid": -914061504, "ts": 1716454224038934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224038992, "dur": 1, "args": { "External id": 165652, "device": 5, "context": 1, "stream": 7, "correlation": 165652, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 165652, "pid": 5, "tid": 7, "ts": 1716454224038992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224038972, "dur": 18, "args": { "External id": 165652, "cbid": 41, "correlation": 165652 } }, { "ph": "s", "id": 165652, "pid": 76337, "tid": -914061504, "ts": 1716454224038972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224038991, "dur": 7, "args": { "External id": 165653, "cbid": 131, "correlation": 165653 } }, { "ph": "f", "id": 165653, "pid": 76337, "tid": -914061504, "ts": 1716454224038991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224039062, "dur": 5, "args": { "External id": 165660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165660, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165660, "pid": 5, "tid": 7, "ts": 1716454224039062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039047, "dur": 16, "args": { "External id": 165660, "cbid": 211, "correlation": 165660 } }, { "ph": "s", "id": 165660, "pid": 76337, "tid": -914061504, "ts": 1716454224039047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039131, "dur": 3, "args": { "External id": 165669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165669, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165669, "pid": 5, "tid": 7, "ts": 1716454224039131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039118, "dur": 13, "args": { "External id": 165669, "cbid": 211, "correlation": 165669 } }, { "ph": "s", "id": 165669, "pid": 76337, "tid": -914061504, "ts": 1716454224039118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039167, "dur": 3, "args": { "External id": 165677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165677, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165677, "pid": 5, "tid": 7, "ts": 1716454224039167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039158, "dur": 9, "args": { "External id": 165677, "cbid": 211, "correlation": 165677 } }, { "ph": "s", "id": 165677, "pid": 76337, "tid": -914061504, "ts": 1716454224039158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039200, "dur": 4, "args": { "External id": 165685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165685, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165685, "pid": 5, "tid": 7, "ts": 1716454224039200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039189, "dur": 11, "args": { "External id": 165685, "cbid": 211, "correlation": 165685 } }, { "ph": "s", "id": 165685, "pid": 76337, "tid": -914061504, "ts": 1716454224039189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039228, "dur": 4, "args": { "External id": 165693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165693, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165693, "pid": 5, "tid": 7, "ts": 1716454224039228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039219, "dur": 9, "args": { "External id": 165693, "cbid": 211, "correlation": 165693 } }, { "ph": "s", "id": 165693, "pid": 76337, "tid": -914061504, "ts": 1716454224039219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039255, "dur": 3, "args": { "External id": 165701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165701, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165701, "pid": 5, "tid": 7, "ts": 1716454224039255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039246, "dur": 8, "args": { "External id": 165701, "cbid": 211, "correlation": 165701 } }, { "ph": "s", "id": 165701, "pid": 76337, "tid": -914061504, "ts": 1716454224039246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039280, "dur": 3, "args": { "External id": 165709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165709, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165709, "pid": 5, "tid": 7, "ts": 1716454224039280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039271, "dur": 8, "args": { "External id": 165709, "cbid": 211, "correlation": 165709 } }, { "ph": "s", "id": 165709, "pid": 76337, "tid": -914061504, "ts": 1716454224039271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224039302, "dur": 4, "args": { "External id": 165717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165717, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165717, "pid": 5, "tid": 7, "ts": 1716454224039302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039294, "dur": 7, "args": { "External id": 165717, "cbid": 211, "correlation": 165717 } }, { "ph": "s", "id": 165717, "pid": 76337, "tid": -914061504, "ts": 1716454224039294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224039321, "dur": 5, "args": { "External id": 165725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165725, "pid": 5, "tid": 7, "ts": 1716454224039321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039313, "dur": 7, "args": { "External id": 165725, "cbid": 211, "correlation": 165725 } }, { "ph": "s", "id": 165725, "pid": 76337, "tid": -914061504, "ts": 1716454224039313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039342, "dur": 3, "args": { "External id": 165733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165733, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165733, "pid": 5, "tid": 7, "ts": 1716454224039342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039333, "dur": 7, "args": { "External id": 165733, "cbid": 211, "correlation": 165733 } }, { "ph": "s", "id": 165733, "pid": 76337, "tid": -914061504, "ts": 1716454224039333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039398, "dur": 3, "args": { "External id": 165741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165741, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 165741, "pid": 5, "tid": 7, "ts": 1716454224039398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039387, "dur": 10, "args": { "External id": 165741, "cbid": 211, "correlation": 165741 } }, { "ph": "s", "id": 165741, "pid": 76337, "tid": -914061504, "ts": 1716454224039387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224039423, "dur": 4, "args": { "External id": 165749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165749, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165749, "pid": 5, "tid": 7, "ts": 1716454224039423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039414, "dur": 8, "args": { "External id": 165749, "cbid": 211, "correlation": 165749 } }, { "ph": "s", "id": 165749, "pid": 76337, "tid": -914061504, "ts": 1716454224039414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224039447, "dur": 4, "args": { "External id": 165757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165757, "pid": 5, "tid": 7, "ts": 1716454224039447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039439, "dur": 8, "args": { "External id": 165757, "cbid": 211, "correlation": 165757 } }, { "ph": "s", "id": 165757, "pid": 76337, "tid": -914061504, "ts": 1716454224039439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224039466, "dur": 3, "args": { "External id": 165765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165765, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 165765, "pid": 5, "tid": 7, "ts": 1716454224039466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039458, "dur": 7, "args": { "External id": 165765, "cbid": 211, "correlation": 165765 } }, { "ph": "s", "id": 165765, "pid": 76337, "tid": -914061504, "ts": 1716454224039458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224039867, "dur": 5, "args": { "External id": 165774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165774, "pid": 5, "tid": 7, "ts": 1716454224039867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039851, "dur": 17, "args": { "External id": 165774, "cbid": 211, "correlation": 165774 } }, { "ph": "s", "id": 165774, "pid": 76337, "tid": -914061504, "ts": 1716454224039851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224039904, "dur": 5, "args": { "External id": 165783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165783, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165783, "pid": 5, "tid": 7, "ts": 1716454224039904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224039894, "dur": 9, "args": { "External id": 165783, "cbid": 211, "correlation": 165783 } }, { "ph": "s", "id": 165783, "pid": 76337, "tid": -914061504, "ts": 1716454224039894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224040040, "dur": 3, "args": { "External id": 165799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165799, "pid": 5, "tid": 7, "ts": 1716454224040040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040025, "dur": 15, "args": { "External id": 165799, "cbid": 211, "correlation": 165799 } }, { "ph": "s", "id": 165799, "pid": 76337, "tid": -914061504, "ts": 1716454224040025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040073, "dur": 3, "args": { "External id": 165807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165807, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165807, "pid": 5, "tid": 7, "ts": 1716454224040073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040063, "dur": 10, "args": { "External id": 165807, "cbid": 211, "correlation": 165807 } }, { "ph": "s", "id": 165807, "pid": 76337, "tid": -914061504, "ts": 1716454224040063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040105, "dur": 3, "args": { "External id": 165815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165815, "pid": 5, "tid": 7, "ts": 1716454224040105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040095, "dur": 8, "args": { "External id": 165815, "cbid": 211, "correlation": 165815 } }, { "ph": "s", "id": 165815, "pid": 76337, "tid": -914061504, "ts": 1716454224040095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040137, "dur": 4, "args": { "External id": 165823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165823, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165823, "pid": 5, "tid": 7, "ts": 1716454224040137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040127, "dur": 8, "args": { "External id": 165823, "cbid": 211, "correlation": 165823 } }, { "ph": "s", "id": 165823, "pid": 76337, "tid": -914061504, "ts": 1716454224040127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224040192, "dur": 4, "args": { "External id": 165835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165835, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165835, "pid": 5, "tid": 7, "ts": 1716454224040192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040180, "dur": 12, "args": { "External id": 165835, "cbid": 211, "correlation": 165835 } }, { "ph": "s", "id": 165835, "pid": 76337, "tid": -914061504, "ts": 1716454224040180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224040238, "dur": 4, "args": { "External id": 165846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165846, "pid": 5, "tid": 7, "ts": 1716454224040238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040226, "dur": 11, "args": { "External id": 165846, "cbid": 211, "correlation": 165846 } }, { "ph": "s", "id": 165846, "pid": 76337, "tid": -914061504, "ts": 1716454224040226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040268, "dur": 3, "args": { "External id": 165854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165854, "pid": 5, "tid": 7, "ts": 1716454224040268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040258, "dur": 8, "args": { "External id": 165854, "cbid": 211, "correlation": 165854 } }, { "ph": "s", "id": 165854, "pid": 76337, "tid": -914061504, "ts": 1716454224040258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040301, "dur": 5, "args": { "External id": 165862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165862, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165862, "pid": 5, "tid": 7, "ts": 1716454224040301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040291, "dur": 10, "args": { "External id": 165862, "cbid": 211, "correlation": 165862 } }, { "ph": "s", "id": 165862, "pid": 76337, "tid": -914061504, "ts": 1716454224040291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040331, "dur": 5, "args": { "External id": 165870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165870, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165870, "pid": 5, "tid": 7, "ts": 1716454224040331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040321, "dur": 9, "args": { "External id": 165870, "cbid": 211, "correlation": 165870 } }, { "ph": "s", "id": 165870, "pid": 76337, "tid": -914061504, "ts": 1716454224040321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224040361, "dur": 4, "args": { "External id": 165879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165879, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165879, "pid": 5, "tid": 7, "ts": 1716454224040361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040351, "dur": 9, "args": { "External id": 165879, "cbid": 211, "correlation": 165879 } }, { "ph": "s", "id": 165879, "pid": 76337, "tid": -914061504, "ts": 1716454224040351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224040422, "dur": 4, "args": { "External id": 165892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165892, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165892, "pid": 5, "tid": 7, "ts": 1716454224040422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040408, "dur": 13, "args": { "External id": 165892, "cbid": 211, "correlation": 165892 } }, { "ph": "s", "id": 165892, "pid": 76337, "tid": -914061504, "ts": 1716454224040408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224040461, "dur": 5, "args": { "External id": 165902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165902, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 165902, "pid": 5, "tid": 7, "ts": 1716454224040461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040450, "dur": 10, "args": { "External id": 165902, "cbid": 211, "correlation": 165902 } }, { "ph": "s", "id": 165902, "pid": 76337, "tid": -914061504, "ts": 1716454224040450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224040577, "dur": 4, "args": { "External id": 165919, "cbid": 251, "correlation": 165919 } }, { "ph": "f", "id": 165919, "pid": 76337, "tid": -914061504, "ts": 1716454224040577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454224040604, "dur": 11, "args": { "External id": 165921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165921, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 165921, "pid": 5, "tid": 7, "ts": 1716454224040604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040588, "dur": 16, "args": { "External id": 165921, "cbid": 211, "correlation": 165921 } }, { "ph": "s", "id": 165921, "pid": 76337, "tid": -914061504, "ts": 1716454224040588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224040660, "dur": 3, "args": { "External id": 165929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165929, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 165929, "pid": 5, "tid": 7, "ts": 1716454224040660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040648, "dur": 12, "args": { "External id": 165929, "cbid": 211, "correlation": 165929 } }, { "ph": "s", "id": 165929, "pid": 76337, "tid": -914061504, "ts": 1716454224040648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224040717, "dur": 1, "args": { "External id": 165945, "cbid": 251, "correlation": 165945 } }, { "ph": "f", "id": 165945, "pid": 76337, "tid": -914061504, "ts": 1716454224040717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224040723, "dur": 0, "args": { "External id": 165947, "cbid": 251, "correlation": 165947 } }, { "ph": "f", "id": 165947, "pid": 76337, "tid": -914061504, "ts": 1716454224040723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224040738, "dur": 13, "args": { "External id": 165948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165948, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 165948, "pid": 5, "tid": 7, "ts": 1716454224040738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040725, "dur": 13, "args": { "External id": 165948, "cbid": 211, "correlation": 165948 } }, { "ph": "s", "id": 165948, "pid": 76337, "tid": -914061504, "ts": 1716454224040725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224040754, "dur": 5, "args": { "External id": 165950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165950, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 165950, "pid": 5, "tid": 7, "ts": 1716454224040754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040743, "dur": 10, "args": { "External id": 165950, "cbid": 211, "correlation": 165950 } }, { "ph": "s", "id": 165950, "pid": 76337, "tid": -914061504, "ts": 1716454224040743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224040848, "dur": 1, "args": { "External id": 165960, "cbid": 317, "correlation": 165960 } }, { "ph": "f", "id": 165960, "pid": 76337, "tid": -914061504, "ts": 1716454224040848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224040850, "dur": 1, "args": { "External id": 165961, "cbid": 203, "correlation": 165961 } }, { "ph": "f", "id": 165961, "pid": 76337, "tid": -914061504, "ts": 1716454224040850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224040852, "dur": 1, "args": { "External id": 165962, "cbid": 205, "correlation": 165962 } }, { "ph": "f", "id": 165962, "pid": 76337, "tid": -914061504, "ts": 1716454224040852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224040903, "dur": 7, "args": { "External id": 165966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165966, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165966, "pid": 5, "tid": 7, "ts": 1716454224040903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040889, "dur": 14, "args": { "External id": 165966, "cbid": 211, "correlation": 165966 } }, { "ph": "s", "id": 165966, "pid": 76337, "tid": -914061504, "ts": 1716454224040889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224040914, "dur": 4, "args": { "External id": 165968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165968, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 165968, "pid": 5, "tid": 7, "ts": 1716454224040914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040906, "dur": 6, "args": { "External id": 165968, "cbid": 211, "correlation": 165968 } }, { "ph": "s", "id": 165968, "pid": 76337, "tid": -914061504, "ts": 1716454224040906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224040932, "dur": 3, "args": { "External id": 165970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 165970, "pid": 5, "tid": 7, "ts": 1716454224040932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040922, "dur": 9, "args": { "External id": 165970, "cbid": 211, "correlation": 165970 } }, { "ph": "s", "id": 165970, "pid": 76337, "tid": -914061504, "ts": 1716454224040922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224040937, "dur": 0, "args": { "External id": 165971, "cbid": 51, "correlation": 165971 } }, { "ph": "s", "id": 165971, "pid": 76337, "tid": -914061504, "ts": 1716454224040937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224040948, "dur": 85, "args": { "External id": 165972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165972, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 165972, "pid": 5, "tid": 7, "ts": 1716454224040948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040939, "dur": 8, "args": { "External id": 165972, "cbid": 211, "correlation": 165972 } }, { "ph": "s", "id": 165972, "pid": 76337, "tid": -914061504, "ts": 1716454224040939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224041034, "dur": 60, "args": { "External id": 165977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 165977, "pid": 5, "tid": 7, "ts": 1716454224041034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224040983, "dur": 10, "args": { "External id": 165977, "cbid": 211, "correlation": 165977 } }, { "ph": "s", "id": 165977, "pid": 76337, "tid": -914061504, "ts": 1716454224040983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224042806, "dur": 51, "args": { "External id": 165997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 165997, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 165997, "pid": 5, "tid": 7, "ts": 1716454224042806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042789, "dur": 17, "args": { "External id": 165997, "cbid": 211, "correlation": 165997 } }, { "ph": "s", "id": 165997, "pid": 76337, "tid": -914061504, "ts": 1716454224042789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224042858, "dur": 4, "args": { "External id": 166009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166009, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166009, "pid": 5, "tid": 7, "ts": 1716454224042858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042818, "dur": 9, "args": { "External id": 166009, "cbid": 211, "correlation": 166009 } }, { "ph": "s", "id": 166009, "pid": 76337, "tid": -914061504, "ts": 1716454224042818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224042864, "dur": 56, "args": { "External id": 166012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166012, "pid": 5, "tid": 7, "ts": 1716454224042864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042842, "dur": 7, "args": { "External id": 166012, "cbid": 211, "correlation": 166012 } }, { "ph": "s", "id": 166012, "pid": 76337, "tid": -914061504, "ts": 1716454224042842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224042921, "dur": 36, "args": { "External id": 166021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166021, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166021, "pid": 5, "tid": 7, "ts": 1716454224042921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042888, "dur": 10, "args": { "External id": 166021, "cbid": 211, "correlation": 166021 } }, { "ph": "s", "id": 166021, "pid": 76337, "tid": -914061504, "ts": 1716454224042888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224042945, "dur": 0, "args": { "External id": 166031, "cbid": 317, "correlation": 166031 } }, { "ph": "f", "id": 166031, "pid": 76337, "tid": -914061504, "ts": 1716454224042945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224042946, "dur": 0, "args": { "External id": 166032, "cbid": 203, "correlation": 166032 } }, { "ph": "f", "id": 166032, "pid": 76337, "tid": -914061504, "ts": 1716454224042946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224042947, "dur": 0, "args": { "External id": 166033, "cbid": 205, "correlation": 166033 } }, { "ph": "f", "id": 166033, "pid": 76337, "tid": -914061504, "ts": 1716454224042947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224042987, "dur": 40, "args": { "External id": 166037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166037, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166037, "pid": 5, "tid": 7, "ts": 1716454224042987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042966, "dur": 20, "args": { "External id": 166037, "cbid": 211, "correlation": 166037 } }, { "ph": "s", "id": 166037, "pid": 76337, "tid": -914061504, "ts": 1716454224042966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224043028, "dur": 14, "args": { "External id": 166039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166039, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166039, "pid": 5, "tid": 7, "ts": 1716454224043028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224042989, "dur": 6, "args": { "External id": 166039, "cbid": 211, "correlation": 166039 } }, { "ph": "s", "id": 166039, "pid": 76337, "tid": -914061504, "ts": 1716454224042989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224043043, "dur": 3, "args": { "External id": 166041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166041, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166041, "pid": 5, "tid": 7, "ts": 1716454224043043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043001, "dur": 6, "args": { "External id": 166041, "cbid": 211, "correlation": 166041 } }, { "ph": "s", "id": 166041, "pid": 76337, "tid": -914061504, "ts": 1716454224043001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224043011, "dur": 0, "args": { "External id": 166042, "cbid": 51, "correlation": 166042 } }, { "ph": "s", "id": 166042, "pid": 76337, "tid": -914061504, "ts": 1716454224043011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224043048, "dur": 705, "args": { "External id": 166043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166043, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166043, "pid": 5, "tid": 7, "ts": 1716454224043048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043012, "dur": 7, "args": { "External id": 166043, "cbid": 211, "correlation": 166043 } }, { "ph": "s", "id": 166043, "pid": 76337, "tid": -914061504, "ts": 1716454224043012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224043754, "dur": 60, "args": { "External id": 166048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166048, "pid": 5, "tid": 7, "ts": 1716454224043754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043043, "dur": 9, "args": { "External id": 166048, "cbid": 211, "correlation": 166048 } }, { "ph": "s", "id": 166048, "pid": 76337, "tid": -914061504, "ts": 1716454224043043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224043816, "dur": 5, "args": { "External id": 166056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166056, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166056, "pid": 5, "tid": 7, "ts": 1716454224043816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043088, "dur": 9, "args": { "External id": 166056, "cbid": 211, "correlation": 166056 } }, { "ph": "s", "id": 166056, "pid": 76337, "tid": -914061504, "ts": 1716454224043088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043153, "dur": 1, "args": { "External id": 166072, "cbid": 251, "correlation": 166072 } }, { "ph": "f", "id": 166072, "pid": 76337, "tid": -914061504, "ts": 1716454224043153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043158, "dur": 0, "args": { "External id": 166074, "cbid": 251, "correlation": 166074 } }, { "ph": "f", "id": 166074, "pid": 76337, "tid": -914061504, "ts": 1716454224043158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224043822, "dur": 9, "args": { "External id": 166075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166075, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 166075, "pid": 5, "tid": 7, "ts": 1716454224043822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043160, "dur": 12, "args": { "External id": 166075, "cbid": 211, "correlation": 166075 } }, { "ph": "s", "id": 166075, "pid": 76337, "tid": -914061504, "ts": 1716454224043160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224043832, "dur": 4, "args": { "External id": 166077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166077, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 166077, "pid": 5, "tid": 7, "ts": 1716454224043832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043174, "dur": 6, "args": { "External id": 166077, "cbid": 211, "correlation": 166077 } }, { "ph": "s", "id": 166077, "pid": 76337, "tid": -914061504, "ts": 1716454224043174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224043837, "dur": 55, "args": { "External id": 166087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166087, "pid": 5, "tid": 7, "ts": 1716454224043837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043234, "dur": 12, "args": { "External id": 166087, "cbid": 211, "correlation": 166087 } }, { "ph": "s", "id": 166087, "pid": 76337, "tid": -914061504, "ts": 1716454224043234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224043893, "dur": 51, "args": { "External id": 166107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166107, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 166107, "pid": 5, "tid": 7, "ts": 1716454224043893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043299, "dur": 10, "args": { "External id": 166107, "cbid": 211, "correlation": 166107 } }, { "ph": "s", "id": 166107, "pid": 76337, "tid": -914061504, "ts": 1716454224043299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224043946, "dur": 4, "args": { "External id": 166119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166119, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166119, "pid": 5, "tid": 7, "ts": 1716454224043946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043320, "dur": 7, "args": { "External id": 166119, "cbid": 211, "correlation": 166119 } }, { "ph": "s", "id": 166119, "pid": 76337, "tid": -914061504, "ts": 1716454224043320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224043951, "dur": 57, "args": { "External id": 166122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166122, "pid": 5, "tid": 7, "ts": 1716454224043951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043339, "dur": 7, "args": { "External id": 166122, "cbid": 211, "correlation": 166122 } }, { "ph": "s", "id": 166122, "pid": 76337, "tid": -914061504, "ts": 1716454224043339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224044010, "dur": 36, "args": { "External id": 166131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166131, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166131, "pid": 5, "tid": 7, "ts": 1716454224044010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043381, "dur": 10, "args": { "External id": 166131, "cbid": 211, "correlation": 166131 } }, { "ph": "s", "id": 166131, "pid": 76337, "tid": -914061504, "ts": 1716454224043381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224043450, "dur": 0, "args": { "External id": 166141, "cbid": 317, "correlation": 166141 } }, { "ph": "f", "id": 166141, "pid": 76337, "tid": -914061504, "ts": 1716454224043450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224043451, "dur": 0, "args": { "External id": 166142, "cbid": 203, "correlation": 166142 } }, { "ph": "f", "id": 166142, "pid": 76337, "tid": -914061504, "ts": 1716454224043451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224043452, "dur": 0, "args": { "External id": 166143, "cbid": 205, "correlation": 166143 } }, { "ph": "f", "id": 166143, "pid": 76337, "tid": -914061504, "ts": 1716454224043452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224044047, "dur": 40, "args": { "External id": 166147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166147, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166147, "pid": 5, "tid": 7, "ts": 1716454224044047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043467, "dur": 13, "args": { "External id": 166147, "cbid": 211, "correlation": 166147 } }, { "ph": "s", "id": 166147, "pid": 76337, "tid": -914061504, "ts": 1716454224043467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224044088, "dur": 14, "args": { "External id": 166149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166149, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166149, "pid": 5, "tid": 7, "ts": 1716454224044088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043482, "dur": 5, "args": { "External id": 166149, "cbid": 211, "correlation": 166149 } }, { "ph": "s", "id": 166149, "pid": 76337, "tid": -914061504, "ts": 1716454224043482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224044104, "dur": 3, "args": { "External id": 166151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166151, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166151, "pid": 5, "tid": 7, "ts": 1716454224044104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043491, "dur": 5, "args": { "External id": 166151, "cbid": 211, "correlation": 166151 } }, { "ph": "s", "id": 166151, "pid": 76337, "tid": -914061504, "ts": 1716454224043491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224043499, "dur": 0, "args": { "External id": 166152, "cbid": 51, "correlation": 166152 } }, { "ph": "s", "id": 166152, "pid": 76337, "tid": -914061504, "ts": 1716454224043499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224044109, "dur": 702, "args": { "External id": 166153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166153, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166153, "pid": 5, "tid": 7, "ts": 1716454224044109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043500, "dur": 5, "args": { "External id": 166153, "cbid": 211, "correlation": 166153 } }, { "ph": "s", "id": 166153, "pid": 76337, "tid": -914061504, "ts": 1716454224043500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224044813, "dur": 59, "args": { "External id": 166158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166158, "pid": 5, "tid": 7, "ts": 1716454224044813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043528, "dur": 8, "args": { "External id": 166158, "cbid": 211, "correlation": 166158 } }, { "ph": "s", "id": 166158, "pid": 76337, "tid": -914061504, "ts": 1716454224043528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224044873, "dur": 50, "args": { "External id": 166166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166166, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166166, "pid": 5, "tid": 7, "ts": 1716454224044873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043559, "dur": 9, "args": { "External id": 166166, "cbid": 211, "correlation": 166166 } }, { "ph": "s", "id": 166166, "pid": 76337, "tid": -914061504, "ts": 1716454224043559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224044924, "dur": 35, "args": { "External id": 166174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166174, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166174, "pid": 5, "tid": 7, "ts": 1716454224044924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043590, "dur": 9, "args": { "External id": 166174, "cbid": 211, "correlation": 166174 } }, { "ph": "s", "id": 166174, "pid": 76337, "tid": -914061504, "ts": 1716454224043590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224044961, "dur": 51, "args": { "External id": 166194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166194, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 166194, "pid": 5, "tid": 7, "ts": 1716454224044961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043674, "dur": 12, "args": { "External id": 166194, "cbid": 211, "correlation": 166194 } }, { "ph": "s", "id": 166194, "pid": 76337, "tid": -914061504, "ts": 1716454224043674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224045014, "dur": 4, "args": { "External id": 166206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166206, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166206, "pid": 5, "tid": 7, "ts": 1716454224045014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043695, "dur": 6, "args": { "External id": 166206, "cbid": 211, "correlation": 166206 } }, { "ph": "s", "id": 166206, "pid": 76337, "tid": -914061504, "ts": 1716454224043695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224045019, "dur": 56, "args": { "External id": 166209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166209, "pid": 5, "tid": 7, "ts": 1716454224045019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043713, "dur": 7, "args": { "External id": 166209, "cbid": 211, "correlation": 166209 } }, { "ph": "s", "id": 166209, "pid": 76337, "tid": -914061504, "ts": 1716454224043713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224043772, "dur": 0, "args": { "External id": 166220, "cbid": 317, "correlation": 166220 } }, { "ph": "f", "id": 166220, "pid": 76337, "tid": -914061504, "ts": 1716454224043772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224043773, "dur": 0, "args": { "External id": 166221, "cbid": 203, "correlation": 166221 } }, { "ph": "f", "id": 166221, "pid": 76337, "tid": -914061504, "ts": 1716454224043773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224043774, "dur": 0, "args": { "External id": 166222, "cbid": 205, "correlation": 166222 } }, { "ph": "f", "id": 166222, "pid": 76337, "tid": -914061504, "ts": 1716454224043774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043810, "dur": 2, "args": { "External id": 166226, "cbid": 251, "correlation": 166226 } }, { "ph": "f", "id": 166226, "pid": 76337, "tid": -914061504, "ts": 1716454224043810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043813, "dur": 1, "args": { "External id": 166227, "cbid": 251, "correlation": 166227 } }, { "ph": "f", "id": 166227, "pid": 76337, "tid": -914061504, "ts": 1716454224043813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043815, "dur": 1, "args": { "External id": 166228, "cbid": 251, "correlation": 166228 } }, { "ph": "f", "id": 166228, "pid": 76337, "tid": -914061504, "ts": 1716454224043815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043818, "dur": 1, "args": { "External id": 166229, "cbid": 251, "correlation": 166229 } }, { "ph": "f", "id": 166229, "pid": 76337, "tid": -914061504, "ts": 1716454224043818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043820, "dur": 1, "args": { "External id": 166230, "cbid": 251, "correlation": 166230 } }, { "ph": "f", "id": 166230, "pid": 76337, "tid": -914061504, "ts": 1716454224043820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043821, "dur": 1, "args": { "External id": 166231, "cbid": 251, "correlation": 166231 } }, { "ph": "f", "id": 166231, "pid": 76337, "tid": -914061504, "ts": 1716454224043821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043823, "dur": 1, "args": { "External id": 166232, "cbid": 251, "correlation": 166232 } }, { "ph": "f", "id": 166232, "pid": 76337, "tid": -914061504, "ts": 1716454224043823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043825, "dur": 1, "args": { "External id": 166233, "cbid": 251, "correlation": 166233 } }, { "ph": "f", "id": 166233, "pid": 76337, "tid": -914061504, "ts": 1716454224043825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224043828, "dur": 0, "args": { "External id": 166234, "cbid": 251, "correlation": 166234 } }, { "ph": "f", "id": 166234, "pid": 76337, "tid": -914061504, "ts": 1716454224043828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224045076, "dur": 114, "args": { "External id": 166235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166235, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166235, "pid": 5, "tid": 7, "ts": 1716454224045076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043831, "dur": 15, "args": { "External id": 166235, "cbid": 211, "correlation": 166235 } }, { "ph": "s", "id": 166235, "pid": 76337, "tid": -914061504, "ts": 1716454224043831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224045192, "dur": 60, "args": { "External id": 166241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166241, "pid": 5, "tid": 7, "ts": 1716454224045192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043870, "dur": 8, "args": { "External id": 166241, "cbid": 211, "correlation": 166241 } }, { "ph": "s", "id": 166241, "pid": 76337, "tid": -914061504, "ts": 1716454224043870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224045253, "dur": 562, "args": { "External id": 166250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166250, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166250, "pid": 5, "tid": 7, "ts": 1716454224045253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224043963, "dur": 24, "args": { "External id": 166250, "cbid": 211, "correlation": 166250 } }, { "ph": "s", "id": 166250, "pid": 76337, "tid": -914061504, "ts": 1716454224043963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224045816, "dur": 183, "args": { "External id": 166272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166272, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166272, "pid": 5, "tid": 7, "ts": 1716454224045816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044045, "dur": 13, "args": { "External id": 166272, "cbid": 211, "correlation": 166272 } }, { "ph": "s", "id": 166272, "pid": 76337, "tid": -914061504, "ts": 1716454224044045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044161, "dur": 2, "args": { "External id": 166283, "cbid": 251, "correlation": 166283 } }, { "ph": "f", "id": 166283, "pid": 76337, "tid": -914061504, "ts": 1716454224044161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224046000, "dur": 197, "args": { "External id": 166284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166284, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166284, "pid": 5, "tid": 7, "ts": 1716454224046000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044168, "dur": 14, "args": { "External id": 166284, "cbid": 211, "correlation": 166284 } }, { "ph": "s", "id": 166284, "pid": 76337, "tid": -914061504, "ts": 1716454224044168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044242, "dur": 1, "args": { "External id": 166295, "cbid": 251, "correlation": 166295 } }, { "ph": "f", "id": 166295, "pid": 76337, "tid": -914061504, "ts": 1716454224044242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224046199, "dur": 188, "args": { "External id": 166296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166296, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166296, "pid": 5, "tid": 7, "ts": 1716454224046199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044246, "dur": 12, "args": { "External id": 166296, "cbid": 211, "correlation": 166296 } }, { "ph": "s", "id": 166296, "pid": 76337, "tid": -914061504, "ts": 1716454224044246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044311, "dur": 1, "args": { "External id": 166307, "cbid": 251, "correlation": 166307 } }, { "ph": "f", "id": 166307, "pid": 76337, "tid": -914061504, "ts": 1716454224044311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224046389, "dur": 191, "args": { "External id": 166308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166308, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166308, "pid": 5, "tid": 7, "ts": 1716454224046389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044315, "dur": 11, "args": { "External id": 166308, "cbid": 211, "correlation": 166308 } }, { "ph": "s", "id": 166308, "pid": 76337, "tid": -914061504, "ts": 1716454224044315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224046581, "dur": 18828, "args": { "External id": 166329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166329, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166329, "pid": 5, "tid": 7, "ts": 1716454224046581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044423, "dur": 15, "args": { "External id": 166329, "cbid": 211, "correlation": 166329 } }, { "ph": "s", "id": 166329, "pid": 76337, "tid": -914061504, "ts": 1716454224044423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044534, "dur": 3, "args": { "External id": 166347, "cbid": 251, "correlation": 166347 } }, { "ph": "f", "id": 166347, "pid": 76337, "tid": -914061504, "ts": 1716454224044534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224065410, "dur": 203, "args": { "External id": 166349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166349, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166349, "pid": 5, "tid": 7, "ts": 1716454224065410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044542, "dur": 13, "args": { "External id": 166349, "cbid": 211, "correlation": 166349 } }, { "ph": "s", "id": 166349, "pid": 76337, "tid": -914061504, "ts": 1716454224044542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224065615, "dur": 67, "args": { "External id": 166357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166357, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166357, "pid": 5, "tid": 7, "ts": 1716454224065615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044614, "dur": 12, "args": { "External id": 166357, "cbid": 211, "correlation": 166357 } }, { "ph": "s", "id": 166357, "pid": 76337, "tid": -914061504, "ts": 1716454224044614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224065683, "dur": 97, "args": { "External id": 166365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166365, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166365, "pid": 5, "tid": 7, "ts": 1716454224065683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044653, "dur": 9, "args": { "External id": 166365, "cbid": 211, "correlation": 166365 } }, { "ph": "s", "id": 166365, "pid": 76337, "tid": -914061504, "ts": 1716454224044653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224065781, "dur": 55, "args": { "External id": 166376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166376, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166376, "pid": 5, "tid": 7, "ts": 1716454224065781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044735, "dur": 14, "args": { "External id": 166376, "cbid": 211, "correlation": 166376 } }, { "ph": "s", "id": 166376, "pid": 76337, "tid": -914061504, "ts": 1716454224044735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224065837, "dur": 92, "args": { "External id": 166398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166398, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166398, "pid": 5, "tid": 7, "ts": 1716454224065837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044769, "dur": 7, "args": { "External id": 166398, "cbid": 211, "correlation": 166398 } }, { "ph": "s", "id": 166398, "pid": 76337, "tid": -914061504, "ts": 1716454224044769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044850, "dur": 1, "args": { "External id": 166409, "cbid": 251, "correlation": 166409 } }, { "ph": "f", "id": 166409, "pid": 76337, "tid": -914061504, "ts": 1716454224044850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224065931, "dur": 104, "args": { "External id": 166410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166410, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166410, "pid": 5, "tid": 7, "ts": 1716454224065931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044856, "dur": 12, "args": { "External id": 166410, "cbid": 211, "correlation": 166410 } }, { "ph": "s", "id": 166410, "pid": 76337, "tid": -914061504, "ts": 1716454224044856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044941, "dur": 1, "args": { "External id": 166421, "cbid": 251, "correlation": 166421 } }, { "ph": "f", "id": 166421, "pid": 76337, "tid": -914061504, "ts": 1716454224044941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224044945, "dur": 0, "args": { "External id": 166422, "cbid": 251, "correlation": 166422 } }, { "ph": "f", "id": 166422, "pid": 76337, "tid": -914061504, "ts": 1716454224044945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224066036, "dur": 10, "args": { "External id": 166423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166423, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 166423, "pid": 5, "tid": 7, "ts": 1716454224066036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044947, "dur": 14, "args": { "External id": 166423, "cbid": 211, "correlation": 166423 } }, { "ph": "s", "id": 166423, "pid": 76337, "tid": -914061504, "ts": 1716454224044947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224066047, "dur": 5, "args": { "External id": 166425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166425, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 166425, "pid": 5, "tid": 7, "ts": 1716454224066047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224044964, "dur": 7, "args": { "External id": 166425, "cbid": 211, "correlation": 166425 } }, { "ph": "s", "id": 166425, "pid": 76337, "tid": -914061504, "ts": 1716454224044964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045037, "dur": 1, "args": { "External id": 166436, "cbid": 251, "correlation": 166436 } }, { "ph": "f", "id": 166436, "pid": 76337, "tid": -914061504, "ts": 1716454224045037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045041, "dur": 0, "args": { "External id": 166437, "cbid": 251, "correlation": 166437 } }, { "ph": "f", "id": 166437, "pid": 76337, "tid": -914061504, "ts": 1716454224045041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224066054, "dur": 6, "args": { "External id": 166438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166438, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 166438, "pid": 5, "tid": 7, "ts": 1716454224066054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045042, "dur": 12, "args": { "External id": 166438, "cbid": 211, "correlation": 166438 } }, { "ph": "s", "id": 166438, "pid": 76337, "tid": -914061504, "ts": 1716454224045042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224066061, "dur": 3, "args": { "External id": 166440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166440, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 166440, "pid": 5, "tid": 7, "ts": 1716454224066061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045057, "dur": 5, "args": { "External id": 166440, "cbid": 211, "correlation": 166440 } }, { "ph": "s", "id": 166440, "pid": 76337, "tid": -914061504, "ts": 1716454224045057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224066066, "dur": 156, "args": { "External id": 166461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166461, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166461, "pid": 5, "tid": 7, "ts": 1716454224066066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045130, "dur": 13, "args": { "External id": 166461, "cbid": 211, "correlation": 166461 } }, { "ph": "s", "id": 166461, "pid": 76337, "tid": -914061504, "ts": 1716454224045130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045228, "dur": 2, "args": { "External id": 166479, "cbid": 251, "correlation": 166479 } }, { "ph": "f", "id": 166479, "pid": 76337, "tid": -914061504, "ts": 1716454224045228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224066224, "dur": 107, "args": { "External id": 166481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166481, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166481, "pid": 5, "tid": 7, "ts": 1716454224066224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045234, "dur": 14, "args": { "External id": 166481, "cbid": 211, "correlation": 166481 } }, { "ph": "s", "id": 166481, "pid": 76337, "tid": -914061504, "ts": 1716454224045234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224066332, "dur": 35, "args": { "External id": 166489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166489, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166489, "pid": 5, "tid": 7, "ts": 1716454224066332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045305, "dur": 12, "args": { "External id": 166489, "cbid": 211, "correlation": 166489 } }, { "ph": "s", "id": 166489, "pid": 76337, "tid": -914061504, "ts": 1716454224045305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224066368, "dur": 67, "args": { "External id": 166497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166497, "pid": 5, "tid": 7, "ts": 1716454224066368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045346, "dur": 9, "args": { "External id": 166497, "cbid": 211, "correlation": 166497 } }, { "ph": "s", "id": 166497, "pid": 76337, "tid": -914061504, "ts": 1716454224045346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224066436, "dur": 93, "args": { "External id": 166519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166519, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166519, "pid": 5, "tid": 7, "ts": 1716454224066436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045399, "dur": 10, "args": { "External id": 166519, "cbid": 211, "correlation": 166519 } }, { "ph": "s", "id": 166519, "pid": 76337, "tid": -914061504, "ts": 1716454224045399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045493, "dur": 1, "args": { "External id": 166535, "cbid": 251, "correlation": 166535 } }, { "ph": "f", "id": 166535, "pid": 76337, "tid": -914061504, "ts": 1716454224045493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224066530, "dur": 580, "args": { "External id": 166537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166537, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166537, "pid": 5, "tid": 7, "ts": 1716454224066530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045498, "dur": 13, "args": { "External id": 166537, "cbid": 211, "correlation": 166537 } }, { "ph": "s", "id": 166537, "pid": 76337, "tid": -914061504, "ts": 1716454224045498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224067111, "dur": 243, "args": { "External id": 166545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166545, "pid": 5, "tid": 7, "ts": 1716454224067111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045577, "dur": 15, "args": { "External id": 166545, "cbid": 211, "correlation": 166545 } }, { "ph": "s", "id": 166545, "pid": 76337, "tid": -914061504, "ts": 1716454224045577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224067356, "dur": 253, "args": { "External id": 166553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166553, "pid": 5, "tid": 7, "ts": 1716454224067356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045612, "dur": 8, "args": { "External id": 166553, "cbid": 211, "correlation": 166553 } }, { "ph": "s", "id": 166553, "pid": 76337, "tid": -914061504, "ts": 1716454224045612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045695, "dur": 2, "args": { "External id": 166569, "cbid": 251, "correlation": 166569 } }, { "ph": "f", "id": 166569, "pid": 76337, "tid": -914061504, "ts": 1716454224045695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045701, "dur": 0, "args": { "External id": 166571, "cbid": 251, "correlation": 166571 } }, { "ph": "f", "id": 166571, "pid": 76337, "tid": -914061504, "ts": 1716454224045701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224067611, "dur": 361, "args": { "External id": 166572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166572, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 166572, "pid": 5, "tid": 7, "ts": 1716454224067611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045707, "dur": 13, "args": { "External id": 166572, "cbid": 211, "correlation": 166572 } }, { "ph": "s", "id": 166572, "pid": 76337, "tid": -914061504, "ts": 1716454224045707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224067973, "dur": 50, "args": { "External id": 166580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166580, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166580, "pid": 5, "tid": 7, "ts": 1716454224067973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045750, "dur": 10, "args": { "External id": 166580, "cbid": 211, "correlation": 166580 } }, { "ph": "s", "id": 166580, "pid": 76337, "tid": -914061504, "ts": 1716454224045750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224068024, "dur": 159, "args": { "External id": 166591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166591, "pid": 5, "tid": 7, "ts": 1716454224068024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045820, "dur": 13, "args": { "External id": 166591, "cbid": 211, "correlation": 166591 } }, { "ph": "s", "id": 166591, "pid": 76337, "tid": -914061504, "ts": 1716454224045820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224045885, "dur": 0, "args": { "External id": 166603, "cbid": 317, "correlation": 166603 } }, { "ph": "f", "id": 166603, "pid": 76337, "tid": -914061504, "ts": 1716454224045885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224045886, "dur": 0, "args": { "External id": 166604, "cbid": 203, "correlation": 166604 } }, { "ph": "f", "id": 166604, "pid": 76337, "tid": -914061504, "ts": 1716454224045886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224045887, "dur": 0, "args": { "External id": 166605, "cbid": 205, "correlation": 166605 } }, { "ph": "f", "id": 166605, "pid": 76337, "tid": -914061504, "ts": 1716454224045887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045910, "dur": 1, "args": { "External id": 166609, "cbid": 251, "correlation": 166609 } }, { "ph": "f", "id": 166609, "pid": 76337, "tid": -914061504, "ts": 1716454224045910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045912, "dur": 0, "args": { "External id": 166610, "cbid": 251, "correlation": 166610 } }, { "ph": "f", "id": 166610, "pid": 76337, "tid": -914061504, "ts": 1716454224045912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045913, "dur": 0, "args": { "External id": 166611, "cbid": 251, "correlation": 166611 } }, { "ph": "f", "id": 166611, "pid": 76337, "tid": -914061504, "ts": 1716454224045913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045914, "dur": 0, "args": { "External id": 166612, "cbid": 251, "correlation": 166612 } }, { "ph": "f", "id": 166612, "pid": 76337, "tid": -914061504, "ts": 1716454224045914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045915, "dur": 0, "args": { "External id": 166613, "cbid": 251, "correlation": 166613 } }, { "ph": "f", "id": 166613, "pid": 76337, "tid": -914061504, "ts": 1716454224045915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045915, "dur": 0, "args": { "External id": 166614, "cbid": 251, "correlation": 166614 } }, { "ph": "f", "id": 166614, "pid": 76337, "tid": -914061504, "ts": 1716454224045915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045916, "dur": 0, "args": { "External id": 166615, "cbid": 251, "correlation": 166615 } }, { "ph": "f", "id": 166615, "pid": 76337, "tid": -914061504, "ts": 1716454224045916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045917, "dur": 0, "args": { "External id": 166616, "cbid": 251, "correlation": 166616 } }, { "ph": "f", "id": 166616, "pid": 76337, "tid": -914061504, "ts": 1716454224045917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224045918, "dur": 0, "args": { "External id": 166617, "cbid": 251, "correlation": 166617 } }, { "ph": "f", "id": 166617, "pid": 76337, "tid": -914061504, "ts": 1716454224045918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224068184, "dur": 116, "args": { "External id": 166618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166618, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166618, "pid": 5, "tid": 7, "ts": 1716454224068184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045920, "dur": 13, "args": { "External id": 166618, "cbid": 211, "correlation": 166618 } }, { "ph": "s", "id": 166618, "pid": 76337, "tid": -914061504, "ts": 1716454224045920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224068302, "dur": 60, "args": { "External id": 166624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166624, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166624, "pid": 5, "tid": 7, "ts": 1716454224068302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045955, "dur": 9, "args": { "External id": 166624, "cbid": 211, "correlation": 166624 } }, { "ph": "s", "id": 166624, "pid": 76337, "tid": -914061504, "ts": 1716454224045955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224068363, "dur": 50, "args": { "External id": 166632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166632, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166632, "pid": 5, "tid": 7, "ts": 1716454224068363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224045997, "dur": 10, "args": { "External id": 166632, "cbid": 211, "correlation": 166632 } }, { "ph": "s", "id": 166632, "pid": 76337, "tid": -914061504, "ts": 1716454224045997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224068414, "dur": 52, "args": { "External id": 166652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166652, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 166652, "pid": 5, "tid": 7, "ts": 1716454224068414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046071, "dur": 12, "args": { "External id": 166652, "cbid": 211, "correlation": 166652 } }, { "ph": "s", "id": 166652, "pid": 76337, "tid": -914061504, "ts": 1716454224046071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224068467, "dur": 5, "args": { "External id": 166664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166664, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166664, "pid": 5, "tid": 7, "ts": 1716454224068467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046093, "dur": 6, "args": { "External id": 166664, "cbid": 211, "correlation": 166664 } }, { "ph": "s", "id": 166664, "pid": 76337, "tid": -914061504, "ts": 1716454224046093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224068473, "dur": 57, "args": { "External id": 166667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166667, "pid": 5, "tid": 7, "ts": 1716454224068473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046111, "dur": 6, "args": { "External id": 166667, "cbid": 211, "correlation": 166667 } }, { "ph": "s", "id": 166667, "pid": 76337, "tid": -914061504, "ts": 1716454224046111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224068531, "dur": 37, "args": { "External id": 166676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166676, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166676, "pid": 5, "tid": 7, "ts": 1716454224068531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046150, "dur": 10, "args": { "External id": 166676, "cbid": 211, "correlation": 166676 } }, { "ph": "s", "id": 166676, "pid": 76337, "tid": -914061504, "ts": 1716454224046150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224046203, "dur": 0, "args": { "External id": 166686, "cbid": 317, "correlation": 166686 } }, { "ph": "f", "id": 166686, "pid": 76337, "tid": -914061504, "ts": 1716454224046203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224046203, "dur": 0, "args": { "External id": 166687, "cbid": 203, "correlation": 166687 } }, { "ph": "f", "id": 166687, "pid": 76337, "tid": -914061504, "ts": 1716454224046203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224046204, "dur": 0, "args": { "External id": 166688, "cbid": 205, "correlation": 166688 } }, { "ph": "f", "id": 166688, "pid": 76337, "tid": -914061504, "ts": 1716454224046204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224068570, "dur": 41, "args": { "External id": 166692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166692, "pid": 5, "tid": 7, "ts": 1716454224068570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046218, "dur": 12, "args": { "External id": 166692, "cbid": 211, "correlation": 166692 } }, { "ph": "s", "id": 166692, "pid": 76337, "tid": -914061504, "ts": 1716454224046218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224068612, "dur": 14, "args": { "External id": 166694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166694, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166694, "pid": 5, "tid": 7, "ts": 1716454224068612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046233, "dur": 5, "args": { "External id": 166694, "cbid": 211, "correlation": 166694 } }, { "ph": "s", "id": 166694, "pid": 76337, "tid": -914061504, "ts": 1716454224046233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224068627, "dur": 4, "args": { "External id": 166696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166696, "pid": 5, "tid": 7, "ts": 1716454224068627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046242, "dur": 6, "args": { "External id": 166696, "cbid": 211, "correlation": 166696 } }, { "ph": "s", "id": 166696, "pid": 76337, "tid": -914061504, "ts": 1716454224046242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224046251, "dur": 0, "args": { "External id": 166697, "cbid": 51, "correlation": 166697 } }, { "ph": "s", "id": 166697, "pid": 76337, "tid": -914061504, "ts": 1716454224046251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224068633, "dur": 707, "args": { "External id": 166698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166698, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166698, "pid": 5, "tid": 7, "ts": 1716454224068633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046252, "dur": 5, "args": { "External id": 166698, "cbid": 211, "correlation": 166698 } }, { "ph": "s", "id": 166698, "pid": 76337, "tid": -914061504, "ts": 1716454224046252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224069341, "dur": 59, "args": { "External id": 166703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166703, "pid": 5, "tid": 7, "ts": 1716454224069341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046279, "dur": 9, "args": { "External id": 166703, "cbid": 211, "correlation": 166703 } }, { "ph": "s", "id": 166703, "pid": 76337, "tid": -914061504, "ts": 1716454224046279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224069401, "dur": 4, "args": { "External id": 166711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166711, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166711, "pid": 5, "tid": 7, "ts": 1716454224069401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046321, "dur": 10, "args": { "External id": 166711, "cbid": 211, "correlation": 166711 } }, { "ph": "s", "id": 166711, "pid": 76337, "tid": -914061504, "ts": 1716454224046321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224046387, "dur": 1, "args": { "External id": 166727, "cbid": 251, "correlation": 166727 } }, { "ph": "f", "id": 166727, "pid": 76337, "tid": -914061504, "ts": 1716454224046387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224046392, "dur": 0, "args": { "External id": 166729, "cbid": 251, "correlation": 166729 } }, { "ph": "f", "id": 166729, "pid": 76337, "tid": -914061504, "ts": 1716454224046392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224069407, "dur": 11, "args": { "External id": 166730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166730, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 166730, "pid": 5, "tid": 7, "ts": 1716454224069407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046394, "dur": 11, "args": { "External id": 166730, "cbid": 211, "correlation": 166730 } }, { "ph": "s", "id": 166730, "pid": 76337, "tid": -914061504, "ts": 1716454224046394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224069420, "dur": 5, "args": { "External id": 166732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166732, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 166732, "pid": 5, "tid": 7, "ts": 1716454224069420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046407, "dur": 5, "args": { "External id": 166732, "cbid": 211, "correlation": 166732 } }, { "ph": "s", "id": 166732, "pid": 76337, "tid": -914061504, "ts": 1716454224046407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224069426, "dur": 53, "args": { "External id": 166742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166742, "pid": 5, "tid": 7, "ts": 1716454224069426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046464, "dur": 13, "args": { "External id": 166742, "cbid": 211, "correlation": 166742 } }, { "ph": "s", "id": 166742, "pid": 76337, "tid": -914061504, "ts": 1716454224046464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224069480, "dur": 52, "args": { "External id": 166762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166762, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 166762, "pid": 5, "tid": 7, "ts": 1716454224069480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046532, "dur": 11, "args": { "External id": 166762, "cbid": 211, "correlation": 166762 } }, { "ph": "s", "id": 166762, "pid": 76337, "tid": -914061504, "ts": 1716454224046532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224069534, "dur": 4, "args": { "External id": 166774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166774, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166774, "pid": 5, "tid": 7, "ts": 1716454224069534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046553, "dur": 6, "args": { "External id": 166774, "cbid": 211, "correlation": 166774 } }, { "ph": "s", "id": 166774, "pid": 76337, "tid": -914061504, "ts": 1716454224046553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224069539, "dur": 57, "args": { "External id": 166777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166777, "pid": 5, "tid": 7, "ts": 1716454224069539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046571, "dur": 7, "args": { "External id": 166777, "cbid": 211, "correlation": 166777 } }, { "ph": "s", "id": 166777, "pid": 76337, "tid": -914061504, "ts": 1716454224046571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224069597, "dur": 36, "args": { "External id": 166786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166786, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166786, "pid": 5, "tid": 7, "ts": 1716454224069597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046612, "dur": 10, "args": { "External id": 166786, "cbid": 211, "correlation": 166786 } }, { "ph": "s", "id": 166786, "pid": 76337, "tid": -914061504, "ts": 1716454224046612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224046675, "dur": 0, "args": { "External id": 166796, "cbid": 317, "correlation": 166796 } }, { "ph": "f", "id": 166796, "pid": 76337, "tid": -914061504, "ts": 1716454224046675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224046675, "dur": 0, "args": { "External id": 166797, "cbid": 203, "correlation": 166797 } }, { "ph": "f", "id": 166797, "pid": 76337, "tid": -914061504, "ts": 1716454224046675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224046676, "dur": 0, "args": { "External id": 166798, "cbid": 205, "correlation": 166798 } }, { "ph": "f", "id": 166798, "pid": 76337, "tid": -914061504, "ts": 1716454224046676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224069634, "dur": 40, "args": { "External id": 166802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166802, "pid": 5, "tid": 7, "ts": 1716454224069634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046691, "dur": 12, "args": { "External id": 166802, "cbid": 211, "correlation": 166802 } }, { "ph": "s", "id": 166802, "pid": 76337, "tid": -914061504, "ts": 1716454224046691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224069676, "dur": 14, "args": { "External id": 166804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166804, "pid": 5, "tid": 7, "ts": 1716454224069676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046705, "dur": 5, "args": { "External id": 166804, "cbid": 211, "correlation": 166804 } }, { "ph": "s", "id": 166804, "pid": 76337, "tid": -914061504, "ts": 1716454224046705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224069691, "dur": 3, "args": { "External id": 166806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166806, "pid": 5, "tid": 7, "ts": 1716454224069691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046714, "dur": 5, "args": { "External id": 166806, "cbid": 211, "correlation": 166806 } }, { "ph": "s", "id": 166806, "pid": 76337, "tid": -914061504, "ts": 1716454224046714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224046722, "dur": 0, "args": { "External id": 166807, "cbid": 51, "correlation": 166807 } }, { "ph": "s", "id": 166807, "pid": 76337, "tid": -914061504, "ts": 1716454224046722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224069696, "dur": 699, "args": { "External id": 166808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166808, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166808, "pid": 5, "tid": 7, "ts": 1716454224069696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046723, "dur": 5, "args": { "External id": 166808, "cbid": 211, "correlation": 166808 } }, { "ph": "s", "id": 166808, "pid": 76337, "tid": -914061504, "ts": 1716454224046723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224070396, "dur": 60, "args": { "External id": 166813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166813, "pid": 5, "tid": 7, "ts": 1716454224070396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046751, "dur": 8, "args": { "External id": 166813, "cbid": 211, "correlation": 166813 } }, { "ph": "s", "id": 166813, "pid": 76337, "tid": -914061504, "ts": 1716454224046751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224070457, "dur": 49, "args": { "External id": 166821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166821, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166821, "pid": 5, "tid": 7, "ts": 1716454224070457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046784, "dur": 8, "args": { "External id": 166821, "cbid": 211, "correlation": 166821 } }, { "ph": "s", "id": 166821, "pid": 76337, "tid": -914061504, "ts": 1716454224046784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224070508, "dur": 35, "args": { "External id": 166829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166829, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166829, "pid": 5, "tid": 7, "ts": 1716454224070508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046813, "dur": 8, "args": { "External id": 166829, "cbid": 211, "correlation": 166829 } }, { "ph": "s", "id": 166829, "pid": 76337, "tid": -914061504, "ts": 1716454224046813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224070544, "dur": 51, "args": { "External id": 166849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166849, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 166849, "pid": 5, "tid": 7, "ts": 1716454224070544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046893, "dur": 12, "args": { "External id": 166849, "cbid": 211, "correlation": 166849 } }, { "ph": "s", "id": 166849, "pid": 76337, "tid": -914061504, "ts": 1716454224046893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224070597, "dur": 4, "args": { "External id": 166861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166861, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 166861, "pid": 5, "tid": 7, "ts": 1716454224070597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046915, "dur": 6, "args": { "External id": 166861, "cbid": 211, "correlation": 166861 } }, { "ph": "s", "id": 166861, "pid": 76337, "tid": -914061504, "ts": 1716454224046915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224070602, "dur": 56, "args": { "External id": 166864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166864, "pid": 5, "tid": 7, "ts": 1716454224070602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224046932, "dur": 8, "args": { "External id": 166864, "cbid": 211, "correlation": 166864 } }, { "ph": "s", "id": 166864, "pid": 76337, "tid": -914061504, "ts": 1716454224046932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224046998, "dur": 0, "args": { "External id": 166875, "cbid": 317, "correlation": 166875 } }, { "ph": "f", "id": 166875, "pid": 76337, "tid": -914061504, "ts": 1716454224046998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224046999, "dur": 0, "args": { "External id": 166876, "cbid": 203, "correlation": 166876 } }, { "ph": "f", "id": 166876, "pid": 76337, "tid": -914061504, "ts": 1716454224046999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224047000, "dur": 0, "args": { "External id": 166877, "cbid": 205, "correlation": 166877 } }, { "ph": "f", "id": 166877, "pid": 76337, "tid": -914061504, "ts": 1716454224047000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047021, "dur": 1, "args": { "External id": 166881, "cbid": 251, "correlation": 166881 } }, { "ph": "f", "id": 166881, "pid": 76337, "tid": -914061504, "ts": 1716454224047021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047023, "dur": 0, "args": { "External id": 166882, "cbid": 251, "correlation": 166882 } }, { "ph": "f", "id": 166882, "pid": 76337, "tid": -914061504, "ts": 1716454224047023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047024, "dur": 0, "args": { "External id": 166883, "cbid": 251, "correlation": 166883 } }, { "ph": "f", "id": 166883, "pid": 76337, "tid": -914061504, "ts": 1716454224047024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047025, "dur": 0, "args": { "External id": 166884, "cbid": 251, "correlation": 166884 } }, { "ph": "f", "id": 166884, "pid": 76337, "tid": -914061504, "ts": 1716454224047025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047026, "dur": 0, "args": { "External id": 166885, "cbid": 251, "correlation": 166885 } }, { "ph": "f", "id": 166885, "pid": 76337, "tid": -914061504, "ts": 1716454224047026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047026, "dur": 0, "args": { "External id": 166886, "cbid": 251, "correlation": 166886 } }, { "ph": "f", "id": 166886, "pid": 76337, "tid": -914061504, "ts": 1716454224047026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047028, "dur": 0, "args": { "External id": 166887, "cbid": 251, "correlation": 166887 } }, { "ph": "f", "id": 166887, "pid": 76337, "tid": -914061504, "ts": 1716454224047028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047028, "dur": 0, "args": { "External id": 166888, "cbid": 251, "correlation": 166888 } }, { "ph": "f", "id": 166888, "pid": 76337, "tid": -914061504, "ts": 1716454224047028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047030, "dur": 0, "args": { "External id": 166889, "cbid": 251, "correlation": 166889 } }, { "ph": "f", "id": 166889, "pid": 76337, "tid": -914061504, "ts": 1716454224047030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224070659, "dur": 110, "args": { "External id": 166890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166890, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166890, "pid": 5, "tid": 7, "ts": 1716454224070659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047032, "dur": 13, "args": { "External id": 166890, "cbid": 211, "correlation": 166890 } }, { "ph": "s", "id": 166890, "pid": 76337, "tid": -914061504, "ts": 1716454224047032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224070770, "dur": 60, "args": { "External id": 166896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166896, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166896, "pid": 5, "tid": 7, "ts": 1716454224070770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047068, "dur": 8, "args": { "External id": 166896, "cbid": 211, "correlation": 166896 } }, { "ph": "s", "id": 166896, "pid": 76337, "tid": -914061504, "ts": 1716454224047068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224070832, "dur": 600, "args": { "External id": 166905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166905, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166905, "pid": 5, "tid": 7, "ts": 1716454224070832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047151, "dur": 14, "args": { "External id": 166905, "cbid": 211, "correlation": 166905 } }, { "ph": "s", "id": 166905, "pid": 76337, "tid": -914061504, "ts": 1716454224047151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224071433, "dur": 182, "args": { "External id": 166927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166927, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 166927, "pid": 5, "tid": 7, "ts": 1716454224071433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047208, "dur": 10, "args": { "External id": 166927, "cbid": 211, "correlation": 166927 } }, { "ph": "s", "id": 166927, "pid": 76337, "tid": -914061504, "ts": 1716454224047208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047293, "dur": 1, "args": { "External id": 166938, "cbid": 251, "correlation": 166938 } }, { "ph": "f", "id": 166938, "pid": 76337, "tid": -914061504, "ts": 1716454224047293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224071617, "dur": 197, "args": { "External id": 166939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166939, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166939, "pid": 5, "tid": 7, "ts": 1716454224071617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047298, "dur": 13, "args": { "External id": 166939, "cbid": 211, "correlation": 166939 } }, { "ph": "s", "id": 166939, "pid": 76337, "tid": -914061504, "ts": 1716454224047298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047366, "dur": 1, "args": { "External id": 166950, "cbid": 251, "correlation": 166950 } }, { "ph": "f", "id": 166950, "pid": 76337, "tid": -914061504, "ts": 1716454224047366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224071815, "dur": 190, "args": { "External id": 166951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166951, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166951, "pid": 5, "tid": 7, "ts": 1716454224071815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047370, "dur": 11, "args": { "External id": 166951, "cbid": 211, "correlation": 166951 } }, { "ph": "s", "id": 166951, "pid": 76337, "tid": -914061504, "ts": 1716454224047370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047433, "dur": 1, "args": { "External id": 166962, "cbid": 251, "correlation": 166962 } }, { "ph": "f", "id": 166962, "pid": 76337, "tid": -914061504, "ts": 1716454224047433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224072006, "dur": 190, "args": { "External id": 166963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166963, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 166963, "pid": 5, "tid": 7, "ts": 1716454224072006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047437, "dur": 12, "args": { "External id": 166963, "cbid": 211, "correlation": 166963 } }, { "ph": "s", "id": 166963, "pid": 76337, "tid": -914061504, "ts": 1716454224047437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224072197, "dur": 18772, "args": { "External id": 166984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 166984, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 166984, "pid": 5, "tid": 7, "ts": 1716454224072197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047516, "dur": 12, "args": { "External id": 166984, "cbid": 211, "correlation": 166984 } }, { "ph": "s", "id": 166984, "pid": 76337, "tid": -914061504, "ts": 1716454224047516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047613, "dur": 1, "args": { "External id": 167002, "cbid": 251, "correlation": 167002 } }, { "ph": "f", "id": 167002, "pid": 76337, "tid": -914061504, "ts": 1716454224047613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224090970, "dur": 203, "args": { "External id": 167004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167004, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167004, "pid": 5, "tid": 7, "ts": 1716454224090970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047619, "dur": 13, "args": { "External id": 167004, "cbid": 211, "correlation": 167004 } }, { "ph": "s", "id": 167004, "pid": 76337, "tid": -914061504, "ts": 1716454224047619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224091175, "dur": 66, "args": { "External id": 167012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167012, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167012, "pid": 5, "tid": 7, "ts": 1716454224091175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047689, "dur": 12, "args": { "External id": 167012, "cbid": 211, "correlation": 167012 } }, { "ph": "s", "id": 167012, "pid": 76337, "tid": -914061504, "ts": 1716454224047689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224091243, "dur": 97, "args": { "External id": 167020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167020, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167020, "pid": 5, "tid": 7, "ts": 1716454224091243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047727, "dur": 9, "args": { "External id": 167020, "cbid": 211, "correlation": 167020 } }, { "ph": "s", "id": 167020, "pid": 76337, "tid": -914061504, "ts": 1716454224047727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224091341, "dur": 56, "args": { "External id": 167031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167031, "pid": 5, "tid": 7, "ts": 1716454224091341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047798, "dur": 12, "args": { "External id": 167031, "cbid": 211, "correlation": 167031 } }, { "ph": "s", "id": 167031, "pid": 76337, "tid": -914061504, "ts": 1716454224047798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224091398, "dur": 92, "args": { "External id": 167053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167053, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167053, "pid": 5, "tid": 7, "ts": 1716454224091398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047829, "dur": 8, "args": { "External id": 167053, "cbid": 211, "correlation": 167053 } }, { "ph": "s", "id": 167053, "pid": 76337, "tid": -914061504, "ts": 1716454224047829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047913, "dur": 1, "args": { "External id": 167064, "cbid": 251, "correlation": 167064 } }, { "ph": "f", "id": 167064, "pid": 76337, "tid": -914061504, "ts": 1716454224047913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224091492, "dur": 106, "args": { "External id": 167065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167065, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167065, "pid": 5, "tid": 7, "ts": 1716454224091492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224047918, "dur": 12, "args": { "External id": 167065, "cbid": 211, "correlation": 167065 } }, { "ph": "s", "id": 167065, "pid": 76337, "tid": -914061504, "ts": 1716454224047918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224047997, "dur": 1, "args": { "External id": 167076, "cbid": 251, "correlation": 167076 } }, { "ph": "f", "id": 167076, "pid": 76337, "tid": -914061504, "ts": 1716454224047997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048001, "dur": 0, "args": { "External id": 167077, "cbid": 251, "correlation": 167077 } }, { "ph": "f", "id": 167077, "pid": 76337, "tid": -914061504, "ts": 1716454224048001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224091599, "dur": 10, "args": { "External id": 167078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167078, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 167078, "pid": 5, "tid": 7, "ts": 1716454224091599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048002, "dur": 13, "args": { "External id": 167078, "cbid": 211, "correlation": 167078 } }, { "ph": "s", "id": 167078, "pid": 76337, "tid": -914061504, "ts": 1716454224048002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224091610, "dur": 5, "args": { "External id": 167080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167080, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 167080, "pid": 5, "tid": 7, "ts": 1716454224091610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048017, "dur": 6, "args": { "External id": 167080, "cbid": 211, "correlation": 167080 } }, { "ph": "s", "id": 167080, "pid": 76337, "tid": -914061504, "ts": 1716454224048017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048079, "dur": 1, "args": { "External id": 167091, "cbid": 251, "correlation": 167091 } }, { "ph": "f", "id": 167091, "pid": 76337, "tid": -914061504, "ts": 1716454224048079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048082, "dur": 0, "args": { "External id": 167092, "cbid": 251, "correlation": 167092 } }, { "ph": "f", "id": 167092, "pid": 76337, "tid": -914061504, "ts": 1716454224048082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224091617, "dur": 6, "args": { "External id": 167093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167093, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 167093, "pid": 5, "tid": 7, "ts": 1716454224091617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048084, "dur": 12, "args": { "External id": 167093, "cbid": 211, "correlation": 167093 } }, { "ph": "s", "id": 167093, "pid": 76337, "tid": -914061504, "ts": 1716454224048084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224091624, "dur": 3, "args": { "External id": 167095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167095, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 167095, "pid": 5, "tid": 7, "ts": 1716454224091624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048098, "dur": 5, "args": { "External id": 167095, "cbid": 211, "correlation": 167095 } }, { "ph": "s", "id": 167095, "pid": 76337, "tid": -914061504, "ts": 1716454224048098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224091629, "dur": 156, "args": { "External id": 167116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167116, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167116, "pid": 5, "tid": 7, "ts": 1716454224091629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048171, "dur": 12, "args": { "External id": 167116, "cbid": 211, "correlation": 167116 } }, { "ph": "s", "id": 167116, "pid": 76337, "tid": -914061504, "ts": 1716454224048171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048266, "dur": 1, "args": { "External id": 167134, "cbid": 251, "correlation": 167134 } }, { "ph": "f", "id": 167134, "pid": 76337, "tid": -914061504, "ts": 1716454224048266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224091787, "dur": 106, "args": { "External id": 167136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167136, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167136, "pid": 5, "tid": 7, "ts": 1716454224091787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048272, "dur": 13, "args": { "External id": 167136, "cbid": 211, "correlation": 167136 } }, { "ph": "s", "id": 167136, "pid": 76337, "tid": -914061504, "ts": 1716454224048272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224091894, "dur": 35, "args": { "External id": 167144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167144, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167144, "pid": 5, "tid": 7, "ts": 1716454224091894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048342, "dur": 12, "args": { "External id": 167144, "cbid": 211, "correlation": 167144 } }, { "ph": "s", "id": 167144, "pid": 76337, "tid": -914061504, "ts": 1716454224048342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224091930, "dur": 67, "args": { "External id": 167152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167152, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167152, "pid": 5, "tid": 7, "ts": 1716454224091930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048382, "dur": 9, "args": { "External id": 167152, "cbid": 211, "correlation": 167152 } }, { "ph": "s", "id": 167152, "pid": 76337, "tid": -914061504, "ts": 1716454224048382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224091999, "dur": 92, "args": { "External id": 167174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167174, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167174, "pid": 5, "tid": 7, "ts": 1716454224091999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048433, "dur": 10, "args": { "External id": 167174, "cbid": 211, "correlation": 167174 } }, { "ph": "s", "id": 167174, "pid": 76337, "tid": -914061504, "ts": 1716454224048433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048520, "dur": 1, "args": { "External id": 167190, "cbid": 251, "correlation": 167190 } }, { "ph": "f", "id": 167190, "pid": 76337, "tid": -914061504, "ts": 1716454224048520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224092092, "dur": 577, "args": { "External id": 167192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167192, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167192, "pid": 5, "tid": 7, "ts": 1716454224092092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048525, "dur": 12, "args": { "External id": 167192, "cbid": 211, "correlation": 167192 } }, { "ph": "s", "id": 167192, "pid": 76337, "tid": -914061504, "ts": 1716454224048525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224092671, "dur": 243, "args": { "External id": 167200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167200, "pid": 5, "tid": 7, "ts": 1716454224092671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048590, "dur": 13, "args": { "External id": 167200, "cbid": 211, "correlation": 167200 } }, { "ph": "s", "id": 167200, "pid": 76337, "tid": -914061504, "ts": 1716454224048590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224092916, "dur": 249, "args": { "External id": 167208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167208, "pid": 5, "tid": 7, "ts": 1716454224092916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048620, "dur": 8, "args": { "External id": 167208, "cbid": 211, "correlation": 167208 } }, { "ph": "s", "id": 167208, "pid": 76337, "tid": -914061504, "ts": 1716454224048620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048702, "dur": 1, "args": { "External id": 167224, "cbid": 251, "correlation": 167224 } }, { "ph": "f", "id": 167224, "pid": 76337, "tid": -914061504, "ts": 1716454224048702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048707, "dur": 0, "args": { "External id": 167226, "cbid": 251, "correlation": 167226 } }, { "ph": "f", "id": 167226, "pid": 76337, "tid": -914061504, "ts": 1716454224048707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224093166, "dur": 362, "args": { "External id": 167227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167227, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 167227, "pid": 5, "tid": 7, "ts": 1716454224093166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048710, "dur": 13, "args": { "External id": 167227, "cbid": 211, "correlation": 167227 } }, { "ph": "s", "id": 167227, "pid": 76337, "tid": -914061504, "ts": 1716454224048710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224093529, "dur": 50, "args": { "External id": 167235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167235, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167235, "pid": 5, "tid": 7, "ts": 1716454224093529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048752, "dur": 10, "args": { "External id": 167235, "cbid": 211, "correlation": 167235 } }, { "ph": "s", "id": 167235, "pid": 76337, "tid": -914061504, "ts": 1716454224048752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224093580, "dur": 158, "args": { "External id": 167246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167246, "pid": 5, "tid": 7, "ts": 1716454224093580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048819, "dur": 12, "args": { "External id": 167246, "cbid": 211, "correlation": 167246 } }, { "ph": "s", "id": 167246, "pid": 76337, "tid": -914061504, "ts": 1716454224048819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224048881, "dur": 0, "args": { "External id": 167258, "cbid": 317, "correlation": 167258 } }, { "ph": "f", "id": 167258, "pid": 76337, "tid": -914061504, "ts": 1716454224048881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224048882, "dur": 0, "args": { "External id": 167259, "cbid": 203, "correlation": 167259 } }, { "ph": "f", "id": 167259, "pid": 76337, "tid": -914061504, "ts": 1716454224048882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224048883, "dur": 0, "args": { "External id": 167260, "cbid": 205, "correlation": 167260 } }, { "ph": "f", "id": 167260, "pid": 76337, "tid": -914061504, "ts": 1716454224048883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048906, "dur": 1, "args": { "External id": 167264, "cbid": 251, "correlation": 167264 } }, { "ph": "f", "id": 167264, "pid": 76337, "tid": -914061504, "ts": 1716454224048906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048908, "dur": 0, "args": { "External id": 167265, "cbid": 251, "correlation": 167265 } }, { "ph": "f", "id": 167265, "pid": 76337, "tid": -914061504, "ts": 1716454224048908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048908, "dur": 0, "args": { "External id": 167266, "cbid": 251, "correlation": 167266 } }, { "ph": "f", "id": 167266, "pid": 76337, "tid": -914061504, "ts": 1716454224048908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048909, "dur": 0, "args": { "External id": 167267, "cbid": 251, "correlation": 167267 } }, { "ph": "f", "id": 167267, "pid": 76337, "tid": -914061504, "ts": 1716454224048909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048910, "dur": 0, "args": { "External id": 167268, "cbid": 251, "correlation": 167268 } }, { "ph": "f", "id": 167268, "pid": 76337, "tid": -914061504, "ts": 1716454224048910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048911, "dur": 0, "args": { "External id": 167269, "cbid": 251, "correlation": 167269 } }, { "ph": "f", "id": 167269, "pid": 76337, "tid": -914061504, "ts": 1716454224048911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048912, "dur": 0, "args": { "External id": 167270, "cbid": 251, "correlation": 167270 } }, { "ph": "f", "id": 167270, "pid": 76337, "tid": -914061504, "ts": 1716454224048912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048912, "dur": 0, "args": { "External id": 167271, "cbid": 251, "correlation": 167271 } }, { "ph": "f", "id": 167271, "pid": 76337, "tid": -914061504, "ts": 1716454224048912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224048914, "dur": 0, "args": { "External id": 167272, "cbid": 251, "correlation": 167272 } }, { "ph": "f", "id": 167272, "pid": 76337, "tid": -914061504, "ts": 1716454224048914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224093740, "dur": 115, "args": { "External id": 167273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167273, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167273, "pid": 5, "tid": 7, "ts": 1716454224093740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048916, "dur": 12, "args": { "External id": 167273, "cbid": 211, "correlation": 167273 } }, { "ph": "s", "id": 167273, "pid": 76337, "tid": -914061504, "ts": 1716454224048916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224093856, "dur": 60, "args": { "External id": 167279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167279, "pid": 5, "tid": 7, "ts": 1716454224093856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048950, "dur": 9, "args": { "External id": 167279, "cbid": 211, "correlation": 167279 } }, { "ph": "s", "id": 167279, "pid": 76337, "tid": -914061504, "ts": 1716454224048950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224093917, "dur": 50, "args": { "External id": 167287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167287, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167287, "pid": 5, "tid": 7, "ts": 1716454224093917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224048989, "dur": 9, "args": { "External id": 167287, "cbid": 211, "correlation": 167287 } }, { "ph": "s", "id": 167287, "pid": 76337, "tid": -914061504, "ts": 1716454224048989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224049064, "dur": 0, "args": { "External id": 167297, "cbid": 317, "correlation": 167297 } }, { "ph": "f", "id": 167297, "pid": 76337, "tid": -914061504, "ts": 1716454224049064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224049065, "dur": 0, "args": { "External id": 167298, "cbid": 203, "correlation": 167298 } }, { "ph": "f", "id": 167298, "pid": 76337, "tid": -914061504, "ts": 1716454224049065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224049065, "dur": 0, "args": { "External id": 167299, "cbid": 205, "correlation": 167299 } }, { "ph": "f", "id": 167299, "pid": 76337, "tid": -914061504, "ts": 1716454224049065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224093968, "dur": 40, "args": { "External id": 167303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167303, "pid": 5, "tid": 7, "ts": 1716454224093968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049081, "dur": 13, "args": { "External id": 167303, "cbid": 211, "correlation": 167303 } }, { "ph": "s", "id": 167303, "pid": 76337, "tid": -914061504, "ts": 1716454224049081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224094010, "dur": 15, "args": { "External id": 167305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167305, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167305, "pid": 5, "tid": 7, "ts": 1716454224094010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049097, "dur": 5, "args": { "External id": 167305, "cbid": 211, "correlation": 167305 } }, { "ph": "s", "id": 167305, "pid": 76337, "tid": -914061504, "ts": 1716454224049097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224094027, "dur": 1, "args": { "External id": 167307, "device": 5, "context": 1, "stream": 7, "correlation": 167307, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 167307, "pid": 5, "tid": 7, "ts": 1716454224094027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224049115, "dur": 16, "args": { "External id": 167307, "cbid": 51, "correlation": 167307 } }, { "ph": "s", "id": 167307, "pid": 76337, "tid": -914061504, "ts": 1716454224049115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224094030, "dur": 366, "args": { "External id": 167308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167308, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167308, "pid": 5, "tid": 7, "ts": 1716454224094030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049132, "dur": 10, "args": { "External id": 167308, "cbid": 211, "correlation": 167308 } }, { "ph": "s", "id": 167308, "pid": 76337, "tid": -914061504, "ts": 1716454224049132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224094398, "dur": 13, "args": { "External id": 167310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167310, "pid": 5, "tid": 7, "ts": 1716454224094398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049150, "dur": 8, "args": { "External id": 167310, "cbid": 211, "correlation": 167310 } }, { "ph": "s", "id": 167310, "pid": 76337, "tid": -914061504, "ts": 1716454224049150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224094412, "dur": 15, "args": { "External id": 167316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167316, "pid": 5, "tid": 7, "ts": 1716454224094412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049180, "dur": 10, "args": { "External id": 167316, "cbid": 211, "correlation": 167316 } }, { "ph": "s", "id": 167316, "pid": 76337, "tid": -914061504, "ts": 1716454224049180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224094428, "dur": 19, "args": { "External id": 167336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167336, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 167336, "pid": 5, "tid": 7, "ts": 1716454224094428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049273, "dur": 12, "args": { "External id": 167336, "cbid": 211, "correlation": 167336 } }, { "ph": "s", "id": 167336, "pid": 76337, "tid": -914061504, "ts": 1716454224049273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224094448, "dur": 4, "args": { "External id": 167348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167348, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 167348, "pid": 5, "tid": 7, "ts": 1716454224094448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049296, "dur": 7, "args": { "External id": 167348, "cbid": 211, "correlation": 167348 } }, { "ph": "s", "id": 167348, "pid": 76337, "tid": -914061504, "ts": 1716454224049296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224094454, "dur": 17, "args": { "External id": 167351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167351, "pid": 5, "tid": 7, "ts": 1716454224094454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049314, "dur": 7, "args": { "External id": 167351, "cbid": 211, "correlation": 167351 } }, { "ph": "s", "id": 167351, "pid": 76337, "tid": -914061504, "ts": 1716454224049314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224094472, "dur": 12, "args": { "External id": 167360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167360, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167360, "pid": 5, "tid": 7, "ts": 1716454224094472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049354, "dur": 10, "args": { "External id": 167360, "cbid": 211, "correlation": 167360 } }, { "ph": "s", "id": 167360, "pid": 76337, "tid": -914061504, "ts": 1716454224049354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224049410, "dur": 0, "args": { "External id": 167370, "cbid": 317, "correlation": 167370 } }, { "ph": "f", "id": 167370, "pid": 76337, "tid": -914061504, "ts": 1716454224049410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224049411, "dur": 0, "args": { "External id": 167371, "cbid": 203, "correlation": 167371 } }, { "ph": "f", "id": 167371, "pid": 76337, "tid": -914061504, "ts": 1716454224049411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224049412, "dur": 0, "args": { "External id": 167372, "cbid": 205, "correlation": 167372 } }, { "ph": "f", "id": 167372, "pid": 76337, "tid": -914061504, "ts": 1716454224049412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224094485, "dur": 12, "args": { "External id": 167376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167376, "pid": 5, "tid": 7, "ts": 1716454224094485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049426, "dur": 12, "args": { "External id": 167376, "cbid": 211, "correlation": 167376 } }, { "ph": "s", "id": 167376, "pid": 76337, "tid": -914061504, "ts": 1716454224049426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224094499, "dur": 25, "args": { "External id": 167378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167378, "pid": 5, "tid": 7, "ts": 1716454224094499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049440, "dur": 6, "args": { "External id": 167378, "cbid": 211, "correlation": 167378 } }, { "ph": "s", "id": 167378, "pid": 76337, "tid": -914061504, "ts": 1716454224049440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224094525, "dur": 4, "args": { "External id": 167380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 167380, "pid": 5, "tid": 7, "ts": 1716454224094525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049452, "dur": 5, "args": { "External id": 167380, "cbid": 211, "correlation": 167380 } }, { "ph": "s", "id": 167380, "pid": 76337, "tid": -914061504, "ts": 1716454224049452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224049461, "dur": 0, "args": { "External id": 167381, "cbid": 51, "correlation": 167381 } }, { "ph": "s", "id": 167381, "pid": 76337, "tid": -914061504, "ts": 1716454224049461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224094530, "dur": 358, "args": { "External id": 167382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167382, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167382, "pid": 5, "tid": 7, "ts": 1716454224094530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049462, "dur": 7, "args": { "External id": 167382, "cbid": 211, "correlation": 167382 } }, { "ph": "s", "id": 167382, "pid": 76337, "tid": -914061504, "ts": 1716454224049462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224094889, "dur": 20, "args": { "External id": 167383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167383, "pid": 5, "tid": 7, "ts": 1716454224094889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049472, "dur": 5, "args": { "External id": 167383, "cbid": 211, "correlation": 167383 } }, { "ph": "s", "id": 167383, "pid": 76337, "tid": -914061504, "ts": 1716454224049472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224094911, "dur": 32, "args": { "External id": 167389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167389, "pid": 5, "tid": 7, "ts": 1716454224094911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049500, "dur": 8, "args": { "External id": 167389, "cbid": 211, "correlation": 167389 } }, { "ph": "s", "id": 167389, "pid": 76337, "tid": -914061504, "ts": 1716454224049500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224094944, "dur": 4, "args": { "External id": 167397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167397, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 167397, "pid": 5, "tid": 7, "ts": 1716454224094944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049543, "dur": 10, "args": { "External id": 167397, "cbid": 211, "correlation": 167397 } }, { "ph": "s", "id": 167397, "pid": 76337, "tid": -914061504, "ts": 1716454224049543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224049609, "dur": 1, "args": { "External id": 167413, "cbid": 251, "correlation": 167413 } }, { "ph": "f", "id": 167413, "pid": 76337, "tid": -914061504, "ts": 1716454224049609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224049614, "dur": 0, "args": { "External id": 167415, "cbid": 251, "correlation": 167415 } }, { "ph": "f", "id": 167415, "pid": 76337, "tid": -914061504, "ts": 1716454224049614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224094950, "dur": 13, "args": { "External id": 167416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167416, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 167416, "pid": 5, "tid": 7, "ts": 1716454224094950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049616, "dur": 12, "args": { "External id": 167416, "cbid": 211, "correlation": 167416 } }, { "ph": "s", "id": 167416, "pid": 76337, "tid": -914061504, "ts": 1716454224049616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224094964, "dur": 5, "args": { "External id": 167418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 167418, "pid": 5, "tid": 7, "ts": 1716454224094964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049630, "dur": 5, "args": { "External id": 167418, "cbid": 211, "correlation": 167418 } }, { "ph": "s", "id": 167418, "pid": 76337, "tid": -914061504, "ts": 1716454224049630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224094970, "dur": 29, "args": { "External id": 167428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167428, "pid": 5, "tid": 7, "ts": 1716454224094970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049689, "dur": 13, "args": { "External id": 167428, "cbid": 211, "correlation": 167428 } }, { "ph": "s", "id": 167428, "pid": 76337, "tid": -914061504, "ts": 1716454224049689, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224095000, "dur": 30, "args": { "External id": 167448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167448, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 167448, "pid": 5, "tid": 7, "ts": 1716454224095000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049758, "dur": 11, "args": { "External id": 167448, "cbid": 211, "correlation": 167448 } }, { "ph": "s", "id": 167448, "pid": 76337, "tid": -914061504, "ts": 1716454224049758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224095031, "dur": 4, "args": { "External id": 167460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167460, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 167460, "pid": 5, "tid": 7, "ts": 1716454224095031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049779, "dur": 6, "args": { "External id": 167460, "cbid": 211, "correlation": 167460 } }, { "ph": "s", "id": 167460, "pid": 76337, "tid": -914061504, "ts": 1716454224049779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224095037, "dur": 30, "args": { "External id": 167463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167463, "pid": 5, "tid": 7, "ts": 1716454224095037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049799, "dur": 7, "args": { "External id": 167463, "cbid": 211, "correlation": 167463 } }, { "ph": "s", "id": 167463, "pid": 76337, "tid": -914061504, "ts": 1716454224049799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224095068, "dur": 21, "args": { "External id": 167472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167472, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167472, "pid": 5, "tid": 7, "ts": 1716454224095068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049840, "dur": 11, "args": { "External id": 167472, "cbid": 211, "correlation": 167472 } }, { "ph": "s", "id": 167472, "pid": 76337, "tid": -914061504, "ts": 1716454224049840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224049910, "dur": 0, "args": { "External id": 167482, "cbid": 317, "correlation": 167482 } }, { "ph": "f", "id": 167482, "pid": 76337, "tid": -914061504, "ts": 1716454224049910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224049911, "dur": 0, "args": { "External id": 167483, "cbid": 203, "correlation": 167483 } }, { "ph": "f", "id": 167483, "pid": 76337, "tid": -914061504, "ts": 1716454224049911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224049912, "dur": 0, "args": { "External id": 167484, "cbid": 205, "correlation": 167484 } }, { "ph": "f", "id": 167484, "pid": 76337, "tid": -914061504, "ts": 1716454224049912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224095090, "dur": 25, "args": { "External id": 167488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167488, "pid": 5, "tid": 7, "ts": 1716454224095090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049931, "dur": 12, "args": { "External id": 167488, "cbid": 211, "correlation": 167488 } }, { "ph": "s", "id": 167488, "pid": 76337, "tid": -914061504, "ts": 1716454224049931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224095117, "dur": 45, "args": { "External id": 167490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167490, "pid": 5, "tid": 7, "ts": 1716454224095117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049946, "dur": 6, "args": { "External id": 167490, "cbid": 211, "correlation": 167490 } }, { "ph": "s", "id": 167490, "pid": 76337, "tid": -914061504, "ts": 1716454224049946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224095163, "dur": 656, "args": { "External id": 167492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167492, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167492, "pid": 5, "tid": 7, "ts": 1716454224095163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049963, "dur": 10, "args": { "External id": 167492, "cbid": 211, "correlation": 167492 } }, { "ph": "s", "id": 167492, "pid": 76337, "tid": -914061504, "ts": 1716454224049963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224095820, "dur": 21, "args": { "External id": 167494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167494, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167494, "pid": 5, "tid": 7, "ts": 1716454224095820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224049985, "dur": 6, "args": { "External id": 167494, "cbid": 211, "correlation": 167494 } }, { "ph": "s", "id": 167494, "pid": 76337, "tid": -914061504, "ts": 1716454224049985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224095842, "dur": 32, "args": { "External id": 167500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167500, "pid": 5, "tid": 7, "ts": 1716454224095842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050015, "dur": 9, "args": { "External id": 167500, "cbid": 211, "correlation": 167500 } }, { "ph": "s", "id": 167500, "pid": 76337, "tid": -914061504, "ts": 1716454224050015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224050075, "dur": 0, "args": { "External id": 167510, "cbid": 317, "correlation": 167510 } }, { "ph": "f", "id": 167510, "pid": 76337, "tid": -914061504, "ts": 1716454224050075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224050076, "dur": 0, "args": { "External id": 167511, "cbid": 203, "correlation": 167511 } }, { "ph": "f", "id": 167511, "pid": 76337, "tid": -914061504, "ts": 1716454224050076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224050077, "dur": 0, "args": { "External id": 167512, "cbid": 205, "correlation": 167512 } }, { "ph": "f", "id": 167512, "pid": 76337, "tid": -914061504, "ts": 1716454224050077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050103, "dur": 1, "args": { "External id": 167516, "cbid": 251, "correlation": 167516 } }, { "ph": "f", "id": 167516, "pid": 76337, "tid": -914061504, "ts": 1716454224050103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050105, "dur": 0, "args": { "External id": 167517, "cbid": 251, "correlation": 167517 } }, { "ph": "f", "id": 167517, "pid": 76337, "tid": -914061504, "ts": 1716454224050105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050106, "dur": 0, "args": { "External id": 167518, "cbid": 251, "correlation": 167518 } }, { "ph": "f", "id": 167518, "pid": 76337, "tid": -914061504, "ts": 1716454224050106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050107, "dur": 0, "args": { "External id": 167519, "cbid": 251, "correlation": 167519 } }, { "ph": "f", "id": 167519, "pid": 76337, "tid": -914061504, "ts": 1716454224050107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050108, "dur": 0, "args": { "External id": 167520, "cbid": 251, "correlation": 167520 } }, { "ph": "f", "id": 167520, "pid": 76337, "tid": -914061504, "ts": 1716454224050108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050109, "dur": 0, "args": { "External id": 167521, "cbid": 251, "correlation": 167521 } }, { "ph": "f", "id": 167521, "pid": 76337, "tid": -914061504, "ts": 1716454224050109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050110, "dur": 0, "args": { "External id": 167522, "cbid": 251, "correlation": 167522 } }, { "ph": "f", "id": 167522, "pid": 76337, "tid": -914061504, "ts": 1716454224050110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050111, "dur": 1, "args": { "External id": 167523, "cbid": 251, "correlation": 167523 } }, { "ph": "f", "id": 167523, "pid": 76337, "tid": -914061504, "ts": 1716454224050111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050113, "dur": 0, "args": { "External id": 167524, "cbid": 251, "correlation": 167524 } }, { "ph": "f", "id": 167524, "pid": 76337, "tid": -914061504, "ts": 1716454224050113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224095876, "dur": 51, "args": { "External id": 167525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167525, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167525, "pid": 5, "tid": 7, "ts": 1716454224095876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050116, "dur": 13, "args": { "External id": 167525, "cbid": 211, "correlation": 167525 } }, { "ph": "s", "id": 167525, "pid": 76337, "tid": -914061504, "ts": 1716454224050116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224095928, "dur": 32, "args": { "External id": 167531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167531, "pid": 5, "tid": 7, "ts": 1716454224095928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050150, "dur": 8, "args": { "External id": 167531, "cbid": 211, "correlation": 167531 } }, { "ph": "s", "id": 167531, "pid": 76337, "tid": -914061504, "ts": 1716454224050150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224095962, "dur": 27, "args": { "External id": 167539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167539, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167539, "pid": 5, "tid": 7, "ts": 1716454224095962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050179, "dur": 8, "args": { "External id": 167539, "cbid": 211, "correlation": 167539 } }, { "ph": "s", "id": 167539, "pid": 76337, "tid": -914061504, "ts": 1716454224050179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224095990, "dur": 19, "args": { "External id": 167547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167547, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167547, "pid": 5, "tid": 7, "ts": 1716454224095990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050213, "dur": 9, "args": { "External id": 167547, "cbid": 211, "correlation": 167547 } }, { "ph": "s", "id": 167547, "pid": 76337, "tid": -914061504, "ts": 1716454224050213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224096010, "dur": 31, "args": { "External id": 167567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167567, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 167567, "pid": 5, "tid": 7, "ts": 1716454224096010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050303, "dur": 12, "args": { "External id": 167567, "cbid": 211, "correlation": 167567 } }, { "ph": "s", "id": 167567, "pid": 76337, "tid": -914061504, "ts": 1716454224050303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224096042, "dur": 4, "args": { "External id": 167579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167579, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 167579, "pid": 5, "tid": 7, "ts": 1716454224096042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050325, "dur": 6, "args": { "External id": 167579, "cbid": 211, "correlation": 167579 } }, { "ph": "s", "id": 167579, "pid": 76337, "tid": -914061504, "ts": 1716454224050325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224096048, "dur": 30, "args": { "External id": 167582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167582, "pid": 5, "tid": 7, "ts": 1716454224096048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050343, "dur": 7, "args": { "External id": 167582, "cbid": 211, "correlation": 167582 } }, { "ph": "s", "id": 167582, "pid": 76337, "tid": -914061504, "ts": 1716454224050343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224050401, "dur": 0, "args": { "External id": 167593, "cbid": 317, "correlation": 167593 } }, { "ph": "f", "id": 167593, "pid": 76337, "tid": -914061504, "ts": 1716454224050401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224050402, "dur": 0, "args": { "External id": 167594, "cbid": 203, "correlation": 167594 } }, { "ph": "f", "id": 167594, "pid": 76337, "tid": -914061504, "ts": 1716454224050402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224050403, "dur": 0, "args": { "External id": 167595, "cbid": 205, "correlation": 167595 } }, { "ph": "f", "id": 167595, "pid": 76337, "tid": -914061504, "ts": 1716454224050403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224096079, "dur": 22, "args": { "External id": 167599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167599, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167599, "pid": 5, "tid": 7, "ts": 1716454224096079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050417, "dur": 11, "args": { "External id": 167599, "cbid": 211, "correlation": 167599 } }, { "ph": "s", "id": 167599, "pid": 76337, "tid": -914061504, "ts": 1716454224050417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224096103, "dur": 121, "args": { "External id": 167601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167601, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167601, "pid": 5, "tid": 7, "ts": 1716454224096103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050437, "dur": 10, "args": { "External id": 167601, "cbid": 211, "correlation": 167601 } }, { "ph": "s", "id": 167601, "pid": 76337, "tid": -914061504, "ts": 1716454224050437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224096225, "dur": 22, "args": { "External id": 167603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167603, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167603, "pid": 5, "tid": 7, "ts": 1716454224096225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050451, "dur": 5, "args": { "External id": 167603, "cbid": 211, "correlation": 167603 } }, { "ph": "s", "id": 167603, "pid": 76337, "tid": -914061504, "ts": 1716454224050451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224096248, "dur": 32, "args": { "External id": 167609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167609, "pid": 5, "tid": 7, "ts": 1716454224096248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050478, "dur": 8, "args": { "External id": 167609, "cbid": 211, "correlation": 167609 } }, { "ph": "s", "id": 167609, "pid": 76337, "tid": -914061504, "ts": 1716454224050478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224096282, "dur": 178, "args": { "External id": 167618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167618, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167618, "pid": 5, "tid": 7, "ts": 1716454224096282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050572, "dur": 16, "args": { "External id": 167618, "cbid": 211, "correlation": 167618 } }, { "ph": "s", "id": 167618, "pid": 76337, "tid": -914061504, "ts": 1716454224050572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224096462, "dur": 66, "args": { "External id": 167640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167640, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167640, "pid": 5, "tid": 7, "ts": 1716454224096462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050636, "dur": 12, "args": { "External id": 167640, "cbid": 211, "correlation": 167640 } }, { "ph": "s", "id": 167640, "pid": 76337, "tid": -914061504, "ts": 1716454224050636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050741, "dur": 2, "args": { "External id": 167651, "cbid": 251, "correlation": 167651 } }, { "ph": "f", "id": 167651, "pid": 76337, "tid": -914061504, "ts": 1716454224050741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224096529, "dur": 159, "args": { "External id": 167652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167652, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167652, "pid": 5, "tid": 7, "ts": 1716454224096529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050747, "dur": 13, "args": { "External id": 167652, "cbid": 211, "correlation": 167652 } }, { "ph": "s", "id": 167652, "pid": 76337, "tid": -914061504, "ts": 1716454224050747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050817, "dur": 1, "args": { "External id": 167663, "cbid": 251, "correlation": 167663 } }, { "ph": "f", "id": 167663, "pid": 76337, "tid": -914061504, "ts": 1716454224050817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224096689, "dur": 147, "args": { "External id": 167664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167664, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167664, "pid": 5, "tid": 7, "ts": 1716454224096689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050821, "dur": 11, "args": { "External id": 167664, "cbid": 211, "correlation": 167664 } }, { "ph": "s", "id": 167664, "pid": 76337, "tid": -914061504, "ts": 1716454224050821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224050885, "dur": 1, "args": { "External id": 167675, "cbid": 251, "correlation": 167675 } }, { "ph": "f", "id": 167675, "pid": 76337, "tid": -914061504, "ts": 1716454224050885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224096837, "dur": 144, "args": { "External id": 167676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167676, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167676, "pid": 5, "tid": 7, "ts": 1716454224096837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050889, "dur": 12, "args": { "External id": 167676, "cbid": 211, "correlation": 167676 } }, { "ph": "s", "id": 167676, "pid": 76337, "tid": -914061504, "ts": 1716454224050889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224096983, "dur": 1963, "args": { "External id": 167697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167697, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167697, "pid": 5, "tid": 7, "ts": 1716454224096983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224050997, "dur": 14, "args": { "External id": 167697, "cbid": 211, "correlation": 167697 } }, { "ph": "s", "id": 167697, "pid": 76337, "tid": -914061504, "ts": 1716454224050997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051111, "dur": 1, "args": { "External id": 167715, "cbid": 251, "correlation": 167715 } }, { "ph": "f", "id": 167715, "pid": 76337, "tid": -914061504, "ts": 1716454224051111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224098947, "dur": 149, "args": { "External id": 167717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167717, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 167717, "pid": 5, "tid": 7, "ts": 1716454224098947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051117, "dur": 13, "args": { "External id": 167717, "cbid": 211, "correlation": 167717 } }, { "ph": "s", "id": 167717, "pid": 76337, "tid": -914061504, "ts": 1716454224051117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224099097, "dur": 35, "args": { "External id": 167725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167725, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167725, "pid": 5, "tid": 7, "ts": 1716454224099097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051190, "dur": 12, "args": { "External id": 167725, "cbid": 211, "correlation": 167725 } }, { "ph": "s", "id": 167725, "pid": 76337, "tid": -914061504, "ts": 1716454224051190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224099134, "dur": 50, "args": { "External id": 167733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167733, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167733, "pid": 5, "tid": 7, "ts": 1716454224099134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051229, "dur": 9, "args": { "External id": 167733, "cbid": 211, "correlation": 167733 } }, { "ph": "s", "id": 167733, "pid": 76337, "tid": -914061504, "ts": 1716454224051229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224099185, "dur": 29, "args": { "External id": 167744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167744, "pid": 5, "tid": 7, "ts": 1716454224099185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051310, "dur": 13, "args": { "External id": 167744, "cbid": 211, "correlation": 167744 } }, { "ph": "s", "id": 167744, "pid": 76337, "tid": -914061504, "ts": 1716454224051310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224099216, "dur": 35, "args": { "External id": 167766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167766, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167766, "pid": 5, "tid": 7, "ts": 1716454224099216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051343, "dur": 7, "args": { "External id": 167766, "cbid": 211, "correlation": 167766 } }, { "ph": "s", "id": 167766, "pid": 76337, "tid": -914061504, "ts": 1716454224051343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051427, "dur": 1, "args": { "External id": 167777, "cbid": 251, "correlation": 167777 } }, { "ph": "f", "id": 167777, "pid": 76337, "tid": -914061504, "ts": 1716454224051427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224099252, "dur": 90, "args": { "External id": 167778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167778, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167778, "pid": 5, "tid": 7, "ts": 1716454224099252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051432, "dur": 14, "args": { "External id": 167778, "cbid": 211, "correlation": 167778 } }, { "ph": "s", "id": 167778, "pid": 76337, "tid": -914061504, "ts": 1716454224051432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051505, "dur": 1, "args": { "External id": 167789, "cbid": 251, "correlation": 167789 } }, { "ph": "f", "id": 167789, "pid": 76337, "tid": -914061504, "ts": 1716454224051505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051509, "dur": 0, "args": { "External id": 167790, "cbid": 251, "correlation": 167790 } }, { "ph": "f", "id": 167790, "pid": 76337, "tid": -914061504, "ts": 1716454224051509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224099343, "dur": 12, "args": { "External id": 167791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167791, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 167791, "pid": 5, "tid": 7, "ts": 1716454224099343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051511, "dur": 12, "args": { "External id": 167791, "cbid": 211, "correlation": 167791 } }, { "ph": "s", "id": 167791, "pid": 76337, "tid": -914061504, "ts": 1716454224051511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224099357, "dur": 5, "args": { "External id": 167793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167793, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 167793, "pid": 5, "tid": 7, "ts": 1716454224099357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051526, "dur": 8, "args": { "External id": 167793, "cbid": 211, "correlation": 167793 } }, { "ph": "s", "id": 167793, "pid": 76337, "tid": -914061504, "ts": 1716454224051526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051587, "dur": 1, "args": { "External id": 167804, "cbid": 251, "correlation": 167804 } }, { "ph": "f", "id": 167804, "pid": 76337, "tid": -914061504, "ts": 1716454224051587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051590, "dur": 0, "args": { "External id": 167805, "cbid": 251, "correlation": 167805 } }, { "ph": "f", "id": 167805, "pid": 76337, "tid": -914061504, "ts": 1716454224051590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224099363, "dur": 7, "args": { "External id": 167806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167806, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 167806, "pid": 5, "tid": 7, "ts": 1716454224099363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051591, "dur": 11, "args": { "External id": 167806, "cbid": 211, "correlation": 167806 } }, { "ph": "s", "id": 167806, "pid": 76337, "tid": -914061504, "ts": 1716454224051591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224099371, "dur": 3, "args": { "External id": 167808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 167808, "pid": 5, "tid": 7, "ts": 1716454224099371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051604, "dur": 5, "args": { "External id": 167808, "cbid": 211, "correlation": 167808 } }, { "ph": "s", "id": 167808, "pid": 76337, "tid": -914061504, "ts": 1716454224051604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224099376, "dur": 92, "args": { "External id": 167829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167829, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 167829, "pid": 5, "tid": 7, "ts": 1716454224099376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051677, "dur": 13, "args": { "External id": 167829, "cbid": 211, "correlation": 167829 } }, { "ph": "s", "id": 167829, "pid": 76337, "tid": -914061504, "ts": 1716454224051677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224051774, "dur": 1, "args": { "External id": 167847, "cbid": 251, "correlation": 167847 } }, { "ph": "f", "id": 167847, "pid": 76337, "tid": -914061504, "ts": 1716454224051774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224099469, "dur": 102, "args": { "External id": 167849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167849, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167849, "pid": 5, "tid": 7, "ts": 1716454224099469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051780, "dur": 14, "args": { "External id": 167849, "cbid": 211, "correlation": 167849 } }, { "ph": "s", "id": 167849, "pid": 76337, "tid": -914061504, "ts": 1716454224051780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224099573, "dur": 19, "args": { "External id": 167857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167857, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167857, "pid": 5, "tid": 7, "ts": 1716454224099573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051850, "dur": 12, "args": { "External id": 167857, "cbid": 211, "correlation": 167857 } }, { "ph": "s", "id": 167857, "pid": 76337, "tid": -914061504, "ts": 1716454224051850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224099593, "dur": 38, "args": { "External id": 167865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167865, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167865, "pid": 5, "tid": 7, "ts": 1716454224099593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051892, "dur": 9, "args": { "External id": 167865, "cbid": 211, "correlation": 167865 } }, { "ph": "s", "id": 167865, "pid": 76337, "tid": -914061504, "ts": 1716454224051892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224099631, "dur": 35, "args": { "External id": 167887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167887, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167887, "pid": 5, "tid": 7, "ts": 1716454224099631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224051945, "dur": 10, "args": { "External id": 167887, "cbid": 211, "correlation": 167887 } }, { "ph": "s", "id": 167887, "pid": 76337, "tid": -914061504, "ts": 1716454224051945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224052046, "dur": 1, "args": { "External id": 167903, "cbid": 251, "correlation": 167903 } }, { "ph": "f", "id": 167903, "pid": 76337, "tid": -914061504, "ts": 1716454224052046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224052051, "dur": 0, "args": { "External id": 167905, "cbid": 251, "correlation": 167905 } }, { "ph": "f", "id": 167905, "pid": 76337, "tid": -914061504, "ts": 1716454224052051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224099667, "dur": 544, "args": { "External id": 167906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167906, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 167906, "pid": 5, "tid": 7, "ts": 1716454224099667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052056, "dur": 13, "args": { "External id": 167906, "cbid": 211, "correlation": 167906 } }, { "ph": "s", "id": 167906, "pid": 76337, "tid": -914061504, "ts": 1716454224052056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224100213, "dur": 126, "args": { "External id": 167914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167914, "pid": 5, "tid": 7, "ts": 1716454224100213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052129, "dur": 14, "args": { "External id": 167914, "cbid": 211, "correlation": 167914 } }, { "ph": "s", "id": 167914, "pid": 76337, "tid": -914061504, "ts": 1716454224052129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224100341, "dur": 128, "args": { "External id": 167922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167922, "pid": 5, "tid": 7, "ts": 1716454224100341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052165, "dur": 9, "args": { "External id": 167922, "cbid": 211, "correlation": 167922 } }, { "ph": "s", "id": 167922, "pid": 76337, "tid": -914061504, "ts": 1716454224052165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224052244, "dur": 1, "args": { "External id": 167938, "cbid": 251, "correlation": 167938 } }, { "ph": "f", "id": 167938, "pid": 76337, "tid": -914061504, "ts": 1716454224052244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224100470, "dur": 260, "args": { "External id": 167940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167940, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167940, "pid": 5, "tid": 7, "ts": 1716454224100470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052250, "dur": 12, "args": { "External id": 167940, "cbid": 211, "correlation": 167940 } }, { "ph": "s", "id": 167940, "pid": 76337, "tid": -914061504, "ts": 1716454224052250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224100731, "dur": 27, "args": { "External id": 167948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167948, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167948, "pid": 5, "tid": 7, "ts": 1716454224100731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052292, "dur": 9, "args": { "External id": 167948, "cbid": 211, "correlation": 167948 } }, { "ph": "s", "id": 167948, "pid": 76337, "tid": -914061504, "ts": 1716454224052292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224100760, "dur": 82, "args": { "External id": 167959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167959, "pid": 5, "tid": 7, "ts": 1716454224100760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052362, "dur": 12, "args": { "External id": 167959, "cbid": 211, "correlation": 167959 } }, { "ph": "s", "id": 167959, "pid": 76337, "tid": -914061504, "ts": 1716454224052362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224052429, "dur": 0, "args": { "External id": 167971, "cbid": 317, "correlation": 167971 } }, { "ph": "f", "id": 167971, "pid": 76337, "tid": -914061504, "ts": 1716454224052429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224052430, "dur": 0, "args": { "External id": 167972, "cbid": 203, "correlation": 167972 } }, { "ph": "f", "id": 167972, "pid": 76337, "tid": -914061504, "ts": 1716454224052430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224052431, "dur": 0, "args": { "External id": 167973, "cbid": 205, "correlation": 167973 } }, { "ph": "f", "id": 167973, "pid": 76337, "tid": -914061504, "ts": 1716454224052431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224100843, "dur": 23, "args": { "External id": 167977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167977, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167977, "pid": 5, "tid": 7, "ts": 1716454224100843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052449, "dur": 12, "args": { "External id": 167977, "cbid": 211, "correlation": 167977 } }, { "ph": "s", "id": 167977, "pid": 76337, "tid": -914061504, "ts": 1716454224052449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224100867, "dur": 121, "args": { "External id": 167979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167979, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 167979, "pid": 5, "tid": 7, "ts": 1716454224100867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052468, "dur": 7, "args": { "External id": 167979, "cbid": 211, "correlation": 167979 } }, { "ph": "s", "id": 167979, "pid": 76337, "tid": -914061504, "ts": 1716454224052468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224100990, "dur": 23, "args": { "External id": 167981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167981, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167981, "pid": 5, "tid": 7, "ts": 1716454224100990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052479, "dur": 6, "args": { "External id": 167981, "cbid": 211, "correlation": 167981 } }, { "ph": "s", "id": 167981, "pid": 76337, "tid": -914061504, "ts": 1716454224052479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224101015, "dur": 33, "args": { "External id": 167987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167987, "pid": 5, "tid": 7, "ts": 1716454224101015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052507, "dur": 8, "args": { "External id": 167987, "cbid": 211, "correlation": 167987 } }, { "ph": "s", "id": 167987, "pid": 76337, "tid": -914061504, "ts": 1716454224052507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224101049, "dur": 27, "args": { "External id": 167995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 167995, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 167995, "pid": 5, "tid": 7, "ts": 1716454224101049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052539, "dur": 8, "args": { "External id": 167995, "cbid": 211, "correlation": 167995 } }, { "ph": "s", "id": 167995, "pid": 76337, "tid": -914061504, "ts": 1716454224052539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224101077, "dur": 30, "args": { "External id": 168015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168015, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168015, "pid": 5, "tid": 7, "ts": 1716454224101077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052615, "dur": 12, "args": { "External id": 168015, "cbid": 211, "correlation": 168015 } }, { "ph": "s", "id": 168015, "pid": 76337, "tid": -914061504, "ts": 1716454224052615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224101108, "dur": 4, "args": { "External id": 168027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168027, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 168027, "pid": 5, "tid": 7, "ts": 1716454224101108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052638, "dur": 6, "args": { "External id": 168027, "cbid": 211, "correlation": 168027 } }, { "ph": "s", "id": 168027, "pid": 76337, "tid": -914061504, "ts": 1716454224052638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224101114, "dur": 30, "args": { "External id": 168030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168030, "pid": 5, "tid": 7, "ts": 1716454224101114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052656, "dur": 7, "args": { "External id": 168030, "cbid": 211, "correlation": 168030 } }, { "ph": "s", "id": 168030, "pid": 76337, "tid": -914061504, "ts": 1716454224052656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224101146, "dur": 22, "args": { "External id": 168039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168039, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168039, "pid": 5, "tid": 7, "ts": 1716454224101146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052697, "dur": 10, "args": { "External id": 168039, "cbid": 211, "correlation": 168039 } }, { "ph": "s", "id": 168039, "pid": 76337, "tid": -914061504, "ts": 1716454224052697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224052749, "dur": 0, "args": { "External id": 168049, "cbid": 317, "correlation": 168049 } }, { "ph": "f", "id": 168049, "pid": 76337, "tid": -914061504, "ts": 1716454224052749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224052750, "dur": 0, "args": { "External id": 168050, "cbid": 203, "correlation": 168050 } }, { "ph": "f", "id": 168050, "pid": 76337, "tid": -914061504, "ts": 1716454224052750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224052751, "dur": 0, "args": { "External id": 168051, "cbid": 205, "correlation": 168051 } }, { "ph": "f", "id": 168051, "pid": 76337, "tid": -914061504, "ts": 1716454224052751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224101169, "dur": 21, "args": { "External id": 168055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168055, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168055, "pid": 5, "tid": 7, "ts": 1716454224101169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052764, "dur": 11, "args": { "External id": 168055, "cbid": 211, "correlation": 168055 } }, { "ph": "s", "id": 168055, "pid": 76337, "tid": -914061504, "ts": 1716454224052764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224101192, "dur": 44, "args": { "External id": 168057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168057, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168057, "pid": 5, "tid": 7, "ts": 1716454224101192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052777, "dur": 5, "args": { "External id": 168057, "cbid": 211, "correlation": 168057 } }, { "ph": "s", "id": 168057, "pid": 76337, "tid": -914061504, "ts": 1716454224052777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224101237, "dur": 657, "args": { "External id": 168059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168059, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168059, "pid": 5, "tid": 7, "ts": 1716454224101237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052789, "dur": 7, "args": { "External id": 168059, "cbid": 211, "correlation": 168059 } }, { "ph": "s", "id": 168059, "pid": 76337, "tid": -914061504, "ts": 1716454224052789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224101895, "dur": 21, "args": { "External id": 168061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168061, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168061, "pid": 5, "tid": 7, "ts": 1716454224101895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052799, "dur": 5, "args": { "External id": 168061, "cbid": 211, "correlation": 168061 } }, { "ph": "s", "id": 168061, "pid": 76337, "tid": -914061504, "ts": 1716454224052799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224101918, "dur": 33, "args": { "External id": 168067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168067, "pid": 5, "tid": 7, "ts": 1716454224101918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052826, "dur": 9, "args": { "External id": 168067, "cbid": 211, "correlation": 168067 } }, { "ph": "s", "id": 168067, "pid": 76337, "tid": -914061504, "ts": 1716454224052826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224101952, "dur": 4, "args": { "External id": 168075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168075, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 168075, "pid": 5, "tid": 7, "ts": 1716454224101952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052870, "dur": 9, "args": { "External id": 168075, "cbid": 211, "correlation": 168075 } }, { "ph": "s", "id": 168075, "pid": 76337, "tid": -914061504, "ts": 1716454224052870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224052939, "dur": 1, "args": { "External id": 168091, "cbid": 251, "correlation": 168091 } }, { "ph": "f", "id": 168091, "pid": 76337, "tid": -914061504, "ts": 1716454224052939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224052944, "dur": 0, "args": { "External id": 168093, "cbid": 251, "correlation": 168093 } }, { "ph": "f", "id": 168093, "pid": 76337, "tid": -914061504, "ts": 1716454224052944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224101958, "dur": 13, "args": { "External id": 168094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168094, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 168094, "pid": 5, "tid": 7, "ts": 1716454224101958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052946, "dur": 11, "args": { "External id": 168094, "cbid": 211, "correlation": 168094 } }, { "ph": "s", "id": 168094, "pid": 76337, "tid": -914061504, "ts": 1716454224052946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224101972, "dur": 5, "args": { "External id": 168096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 168096, "pid": 5, "tid": 7, "ts": 1716454224101972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224052959, "dur": 5, "args": { "External id": 168096, "cbid": 211, "correlation": 168096 } }, { "ph": "s", "id": 168096, "pid": 76337, "tid": -914061504, "ts": 1716454224052959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224101978, "dur": 29, "args": { "External id": 168106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168106, "pid": 5, "tid": 7, "ts": 1716454224101978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053025, "dur": 14, "args": { "External id": 168106, "cbid": 211, "correlation": 168106 } }, { "ph": "s", "id": 168106, "pid": 76337, "tid": -914061504, "ts": 1716454224053025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224102009, "dur": 31, "args": { "External id": 168126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168126, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168126, "pid": 5, "tid": 7, "ts": 1716454224102009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053094, "dur": 11, "args": { "External id": 168126, "cbid": 211, "correlation": 168126 } }, { "ph": "s", "id": 168126, "pid": 76337, "tid": -914061504, "ts": 1716454224053094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224102041, "dur": 4, "args": { "External id": 168138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168138, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 168138, "pid": 5, "tid": 7, "ts": 1716454224102041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053114, "dur": 6, "args": { "External id": 168138, "cbid": 211, "correlation": 168138 } }, { "ph": "s", "id": 168138, "pid": 76337, "tid": -914061504, "ts": 1716454224053114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224102047, "dur": 31, "args": { "External id": 168141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168141, "pid": 5, "tid": 7, "ts": 1716454224102047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053133, "dur": 7, "args": { "External id": 168141, "cbid": 211, "correlation": 168141 } }, { "ph": "s", "id": 168141, "pid": 76337, "tid": -914061504, "ts": 1716454224053133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224102079, "dur": 20, "args": { "External id": 168150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168150, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168150, "pid": 5, "tid": 7, "ts": 1716454224102079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053174, "dur": 10, "args": { "External id": 168150, "cbid": 211, "correlation": 168150 } }, { "ph": "s", "id": 168150, "pid": 76337, "tid": -914061504, "ts": 1716454224053174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224053237, "dur": 0, "args": { "External id": 168160, "cbid": 317, "correlation": 168160 } }, { "ph": "f", "id": 168160, "pid": 76337, "tid": -914061504, "ts": 1716454224053237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224053238, "dur": 0, "args": { "External id": 168161, "cbid": 203, "correlation": 168161 } }, { "ph": "f", "id": 168161, "pid": 76337, "tid": -914061504, "ts": 1716454224053238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224053238, "dur": 0, "args": { "External id": 168162, "cbid": 205, "correlation": 168162 } }, { "ph": "f", "id": 168162, "pid": 76337, "tid": -914061504, "ts": 1716454224053238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224102100, "dur": 22, "args": { "External id": 168166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168166, "pid": 5, "tid": 7, "ts": 1716454224102100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053252, "dur": 12, "args": { "External id": 168166, "cbid": 211, "correlation": 168166 } }, { "ph": "s", "id": 168166, "pid": 76337, "tid": -914061504, "ts": 1716454224053252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224102123, "dur": 44, "args": { "External id": 168168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168168, "pid": 5, "tid": 7, "ts": 1716454224102123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053266, "dur": 5, "args": { "External id": 168168, "cbid": 211, "correlation": 168168 } }, { "ph": "s", "id": 168168, "pid": 76337, "tid": -914061504, "ts": 1716454224053266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224102169, "dur": 651, "args": { "External id": 168170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168170, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168170, "pid": 5, "tid": 7, "ts": 1716454224102169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053277, "dur": 6, "args": { "External id": 168170, "cbid": 211, "correlation": 168170 } }, { "ph": "s", "id": 168170, "pid": 76337, "tid": -914061504, "ts": 1716454224053277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224102821, "dur": 22, "args": { "External id": 168172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168172, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168172, "pid": 5, "tid": 7, "ts": 1716454224102821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053287, "dur": 5, "args": { "External id": 168172, "cbid": 211, "correlation": 168172 } }, { "ph": "s", "id": 168172, "pid": 76337, "tid": -914061504, "ts": 1716454224053287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224102844, "dur": 33, "args": { "External id": 168178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168178, "pid": 5, "tid": 7, "ts": 1716454224102844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053314, "dur": 8, "args": { "External id": 168178, "cbid": 211, "correlation": 168178 } }, { "ph": "s", "id": 168178, "pid": 76337, "tid": -914061504, "ts": 1716454224053314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224102878, "dur": 27, "args": { "External id": 168186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168186, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168186, "pid": 5, "tid": 7, "ts": 1716454224102878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053347, "dur": 9, "args": { "External id": 168186, "cbid": 211, "correlation": 168186 } }, { "ph": "s", "id": 168186, "pid": 76337, "tid": -914061504, "ts": 1716454224053347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224102906, "dur": 20, "args": { "External id": 168194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168194, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168194, "pid": 5, "tid": 7, "ts": 1716454224102906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053376, "dur": 10, "args": { "External id": 168194, "cbid": 211, "correlation": 168194 } }, { "ph": "s", "id": 168194, "pid": 76337, "tid": -914061504, "ts": 1716454224053376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224102928, "dur": 30, "args": { "External id": 168214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168214, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168214, "pid": 5, "tid": 7, "ts": 1716454224102928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053455, "dur": 12, "args": { "External id": 168214, "cbid": 211, "correlation": 168214 } }, { "ph": "s", "id": 168214, "pid": 76337, "tid": -914061504, "ts": 1716454224053455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224102959, "dur": 4, "args": { "External id": 168226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168226, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 168226, "pid": 5, "tid": 7, "ts": 1716454224102959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053477, "dur": 6, "args": { "External id": 168226, "cbid": 211, "correlation": 168226 } }, { "ph": "s", "id": 168226, "pid": 76337, "tid": -914061504, "ts": 1716454224053477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224102964, "dur": 30, "args": { "External id": 168229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168229, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168229, "pid": 5, "tid": 7, "ts": 1716454224102964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053495, "dur": 6, "args": { "External id": 168229, "cbid": 211, "correlation": 168229 } }, { "ph": "s", "id": 168229, "pid": 76337, "tid": -914061504, "ts": 1716454224053495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224053553, "dur": 0, "args": { "External id": 168240, "cbid": 317, "correlation": 168240 } }, { "ph": "f", "id": 168240, "pid": 76337, "tid": -914061504, "ts": 1716454224053553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224053554, "dur": 0, "args": { "External id": 168241, "cbid": 203, "correlation": 168241 } }, { "ph": "f", "id": 168241, "pid": 76337, "tid": -914061504, "ts": 1716454224053554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224053554, "dur": 0, "args": { "External id": 168242, "cbid": 205, "correlation": 168242 } }, { "ph": "f", "id": 168242, "pid": 76337, "tid": -914061504, "ts": 1716454224053554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224102995, "dur": 22, "args": { "External id": 168246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168246, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168246, "pid": 5, "tid": 7, "ts": 1716454224102995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053567, "dur": 12, "args": { "External id": 168246, "cbid": 211, "correlation": 168246 } }, { "ph": "s", "id": 168246, "pid": 76337, "tid": -914061504, "ts": 1716454224053567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224103019, "dur": 117, "args": { "External id": 168248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168248, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168248, "pid": 5, "tid": 7, "ts": 1716454224103019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053586, "dur": 6, "args": { "External id": 168248, "cbid": 211, "correlation": 168248 } }, { "ph": "s", "id": 168248, "pid": 76337, "tid": -914061504, "ts": 1716454224053586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224103137, "dur": 21, "args": { "External id": 168250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168250, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168250, "pid": 5, "tid": 7, "ts": 1716454224103137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053595, "dur": 5, "args": { "External id": 168250, "cbid": 211, "correlation": 168250 } }, { "ph": "s", "id": 168250, "pid": 76337, "tid": -914061504, "ts": 1716454224053595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224103159, "dur": 33, "args": { "External id": 168256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168256, "pid": 5, "tid": 7, "ts": 1716454224103159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053622, "dur": 8, "args": { "External id": 168256, "cbid": 211, "correlation": 168256 } }, { "ph": "s", "id": 168256, "pid": 76337, "tid": -914061504, "ts": 1716454224053622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224103194, "dur": 181, "args": { "External id": 168265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168265, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168265, "pid": 5, "tid": 7, "ts": 1716454224103194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053705, "dur": 14, "args": { "External id": 168265, "cbid": 211, "correlation": 168265 } }, { "ph": "s", "id": 168265, "pid": 76337, "tid": -914061504, "ts": 1716454224053705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224103376, "dur": 65, "args": { "External id": 168287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168287, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168287, "pid": 5, "tid": 7, "ts": 1716454224103376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053763, "dur": 10, "args": { "External id": 168287, "cbid": 211, "correlation": 168287 } }, { "ph": "s", "id": 168287, "pid": 76337, "tid": -914061504, "ts": 1716454224053763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224053850, "dur": 2, "args": { "External id": 168298, "cbid": 251, "correlation": 168298 } }, { "ph": "f", "id": 168298, "pid": 76337, "tid": -914061504, "ts": 1716454224053850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224103442, "dur": 152, "args": { "External id": 168299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168299, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168299, "pid": 5, "tid": 7, "ts": 1716454224103442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053856, "dur": 13, "args": { "External id": 168299, "cbid": 211, "correlation": 168299 } }, { "ph": "s", "id": 168299, "pid": 76337, "tid": -914061504, "ts": 1716454224053856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224053925, "dur": 1, "args": { "External id": 168310, "cbid": 251, "correlation": 168310 } }, { "ph": "f", "id": 168310, "pid": 76337, "tid": -914061504, "ts": 1716454224053925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224103596, "dur": 144, "args": { "External id": 168311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168311, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168311, "pid": 5, "tid": 7, "ts": 1716454224103596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224053929, "dur": 12, "args": { "External id": 168311, "cbid": 211, "correlation": 168311 } }, { "ph": "s", "id": 168311, "pid": 76337, "tid": -914061504, "ts": 1716454224053929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054009, "dur": 1, "args": { "External id": 168322, "cbid": 251, "correlation": 168322 } }, { "ph": "f", "id": 168322, "pid": 76337, "tid": -914061504, "ts": 1716454224054009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224103742, "dur": 143, "args": { "External id": 168323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168323, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168323, "pid": 5, "tid": 7, "ts": 1716454224103742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054013, "dur": 12, "args": { "External id": 168323, "cbid": 211, "correlation": 168323 } }, { "ph": "s", "id": 168323, "pid": 76337, "tid": -914061504, "ts": 1716454224054013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224103886, "dur": 1963, "args": { "External id": 168344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168344, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 168344, "pid": 5, "tid": 7, "ts": 1716454224103886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054096, "dur": 13, "args": { "External id": 168344, "cbid": 211, "correlation": 168344 } }, { "ph": "s", "id": 168344, "pid": 76337, "tid": -914061504, "ts": 1716454224054096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054196, "dur": 1, "args": { "External id": 168362, "cbid": 251, "correlation": 168362 } }, { "ph": "f", "id": 168362, "pid": 76337, "tid": -914061504, "ts": 1716454224054196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224105851, "dur": 148, "args": { "External id": 168364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168364, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 168364, "pid": 5, "tid": 7, "ts": 1716454224105851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054202, "dur": 13, "args": { "External id": 168364, "cbid": 211, "correlation": 168364 } }, { "ph": "s", "id": 168364, "pid": 76337, "tid": -914061504, "ts": 1716454224054202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224106000, "dur": 35, "args": { "External id": 168372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168372, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168372, "pid": 5, "tid": 7, "ts": 1716454224106000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054271, "dur": 12, "args": { "External id": 168372, "cbid": 211, "correlation": 168372 } }, { "ph": "s", "id": 168372, "pid": 76337, "tid": -914061504, "ts": 1716454224054271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224106036, "dur": 50, "args": { "External id": 168380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168380, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168380, "pid": 5, "tid": 7, "ts": 1716454224106036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054309, "dur": 8, "args": { "External id": 168380, "cbid": 211, "correlation": 168380 } }, { "ph": "s", "id": 168380, "pid": 76337, "tid": -914061504, "ts": 1716454224054309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224106088, "dur": 31, "args": { "External id": 168391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168391, "pid": 5, "tid": 7, "ts": 1716454224106088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054381, "dur": 13, "args": { "External id": 168391, "cbid": 211, "correlation": 168391 } }, { "ph": "s", "id": 168391, "pid": 76337, "tid": -914061504, "ts": 1716454224054381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224106120, "dur": 35, "args": { "External id": 168413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168413, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168413, "pid": 5, "tid": 7, "ts": 1716454224106120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054412, "dur": 8, "args": { "External id": 168413, "cbid": 211, "correlation": 168413 } }, { "ph": "s", "id": 168413, "pid": 76337, "tid": -914061504, "ts": 1716454224054412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054497, "dur": 1, "args": { "External id": 168424, "cbid": 251, "correlation": 168424 } }, { "ph": "f", "id": 168424, "pid": 76337, "tid": -914061504, "ts": 1716454224054497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224106156, "dur": 90, "args": { "External id": 168425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168425, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168425, "pid": 5, "tid": 7, "ts": 1716454224106156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054502, "dur": 12, "args": { "External id": 168425, "cbid": 211, "correlation": 168425 } }, { "ph": "s", "id": 168425, "pid": 76337, "tid": -914061504, "ts": 1716454224054502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054571, "dur": 1, "args": { "External id": 168436, "cbid": 251, "correlation": 168436 } }, { "ph": "f", "id": 168436, "pid": 76337, "tid": -914061504, "ts": 1716454224054571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054574, "dur": 0, "args": { "External id": 168437, "cbid": 251, "correlation": 168437 } }, { "ph": "f", "id": 168437, "pid": 76337, "tid": -914061504, "ts": 1716454224054574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224106248, "dur": 11, "args": { "External id": 168438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168438, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 168438, "pid": 5, "tid": 7, "ts": 1716454224106248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054576, "dur": 12, "args": { "External id": 168438, "cbid": 211, "correlation": 168438 } }, { "ph": "s", "id": 168438, "pid": 76337, "tid": -914061504, "ts": 1716454224054576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224106260, "dur": 5, "args": { "External id": 168440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168440, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 168440, "pid": 5, "tid": 7, "ts": 1716454224106260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054590, "dur": 6, "args": { "External id": 168440, "cbid": 211, "correlation": 168440 } }, { "ph": "s", "id": 168440, "pid": 76337, "tid": -914061504, "ts": 1716454224054590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054648, "dur": 1, "args": { "External id": 168451, "cbid": 251, "correlation": 168451 } }, { "ph": "f", "id": 168451, "pid": 76337, "tid": -914061504, "ts": 1716454224054648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054651, "dur": 0, "args": { "External id": 168452, "cbid": 251, "correlation": 168452 } }, { "ph": "f", "id": 168452, "pid": 76337, "tid": -914061504, "ts": 1716454224054651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224106266, "dur": 7, "args": { "External id": 168453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168453, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 168453, "pid": 5, "tid": 7, "ts": 1716454224106266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054653, "dur": 12, "args": { "External id": 168453, "cbid": 211, "correlation": 168453 } }, { "ph": "s", "id": 168453, "pid": 76337, "tid": -914061504, "ts": 1716454224054653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224106275, "dur": 3, "args": { "External id": 168455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168455, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 168455, "pid": 5, "tid": 7, "ts": 1716454224106275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054666, "dur": 6, "args": { "External id": 168455, "cbid": 211, "correlation": 168455 } }, { "ph": "s", "id": 168455, "pid": 76337, "tid": -914061504, "ts": 1716454224054666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224106279, "dur": 92, "args": { "External id": 168476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168476, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 168476, "pid": 5, "tid": 7, "ts": 1716454224106279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054742, "dur": 13, "args": { "External id": 168476, "cbid": 211, "correlation": 168476 } }, { "ph": "s", "id": 168476, "pid": 76337, "tid": -914061504, "ts": 1716454224054742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224054841, "dur": 1, "args": { "External id": 168494, "cbid": 251, "correlation": 168494 } }, { "ph": "f", "id": 168494, "pid": 76337, "tid": -914061504, "ts": 1716454224054841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224106373, "dur": 87, "args": { "External id": 168496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168496, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168496, "pid": 5, "tid": 7, "ts": 1716454224106373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054847, "dur": 13, "args": { "External id": 168496, "cbid": 211, "correlation": 168496 } }, { "ph": "s", "id": 168496, "pid": 76337, "tid": -914061504, "ts": 1716454224054847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224106461, "dur": 19, "args": { "External id": 168504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168504, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168504, "pid": 5, "tid": 7, "ts": 1716454224106461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054916, "dur": 11, "args": { "External id": 168504, "cbid": 211, "correlation": 168504 } }, { "ph": "s", "id": 168504, "pid": 76337, "tid": -914061504, "ts": 1716454224054916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224106482, "dur": 37, "args": { "External id": 168512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168512, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168512, "pid": 5, "tid": 7, "ts": 1716454224106482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224054957, "dur": 9, "args": { "External id": 168512, "cbid": 211, "correlation": 168512 } }, { "ph": "s", "id": 168512, "pid": 76337, "tid": -914061504, "ts": 1716454224054957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224106519, "dur": 35, "args": { "External id": 168534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168534, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168534, "pid": 5, "tid": 7, "ts": 1716454224106519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055018, "dur": 10, "args": { "External id": 168534, "cbid": 211, "correlation": 168534 } }, { "ph": "s", "id": 168534, "pid": 76337, "tid": -914061504, "ts": 1716454224055018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224055109, "dur": 1, "args": { "External id": 168550, "cbid": 251, "correlation": 168550 } }, { "ph": "f", "id": 168550, "pid": 76337, "tid": -914061504, "ts": 1716454224055109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224055114, "dur": 0, "args": { "External id": 168552, "cbid": 251, "correlation": 168552 } }, { "ph": "f", "id": 168552, "pid": 76337, "tid": -914061504, "ts": 1716454224055114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224106556, "dur": 543, "args": { "External id": 168553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168553, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 168553, "pid": 5, "tid": 7, "ts": 1716454224106556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055117, "dur": 13, "args": { "External id": 168553, "cbid": 211, "correlation": 168553 } }, { "ph": "s", "id": 168553, "pid": 76337, "tid": -914061504, "ts": 1716454224055117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224107100, "dur": 127, "args": { "External id": 168561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168561, "pid": 5, "tid": 7, "ts": 1716454224107100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055182, "dur": 13, "args": { "External id": 168561, "cbid": 211, "correlation": 168561 } }, { "ph": "s", "id": 168561, "pid": 76337, "tid": -914061504, "ts": 1716454224055182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224107228, "dur": 127, "args": { "External id": 168569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168569, "pid": 5, "tid": 7, "ts": 1716454224107228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055213, "dur": 8, "args": { "External id": 168569, "cbid": 211, "correlation": 168569 } }, { "ph": "s", "id": 168569, "pid": 76337, "tid": -914061504, "ts": 1716454224055213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224055290, "dur": 1, "args": { "External id": 168585, "cbid": 251, "correlation": 168585 } }, { "ph": "f", "id": 168585, "pid": 76337, "tid": -914061504, "ts": 1716454224055290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224107357, "dur": 305, "args": { "External id": 168587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168587, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168587, "pid": 5, "tid": 7, "ts": 1716454224107357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055296, "dur": 12, "args": { "External id": 168587, "cbid": 211, "correlation": 168587 } }, { "ph": "s", "id": 168587, "pid": 76337, "tid": -914061504, "ts": 1716454224055296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224107663, "dur": 27, "args": { "External id": 168595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168595, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168595, "pid": 5, "tid": 7, "ts": 1716454224107663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055337, "dur": 10, "args": { "External id": 168595, "cbid": 211, "correlation": 168595 } }, { "ph": "s", "id": 168595, "pid": 76337, "tid": -914061504, "ts": 1716454224055337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224107691, "dur": 81, "args": { "External id": 168606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168606, "pid": 5, "tid": 7, "ts": 1716454224107691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055406, "dur": 12, "args": { "External id": 168606, "cbid": 211, "correlation": 168606 } }, { "ph": "s", "id": 168606, "pid": 76337, "tid": -914061504, "ts": 1716454224055406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224055468, "dur": 0, "args": { "External id": 168618, "cbid": 317, "correlation": 168618 } }, { "ph": "f", "id": 168618, "pid": 76337, "tid": -914061504, "ts": 1716454224055468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224055469, "dur": 0, "args": { "External id": 168619, "cbid": 203, "correlation": 168619 } }, { "ph": "f", "id": 168619, "pid": 76337, "tid": -914061504, "ts": 1716454224055469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224055470, "dur": 0, "args": { "External id": 168620, "cbid": 205, "correlation": 168620 } }, { "ph": "f", "id": 168620, "pid": 76337, "tid": -914061504, "ts": 1716454224055470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224107774, "dur": 22, "args": { "External id": 168624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168624, "pid": 5, "tid": 7, "ts": 1716454224107774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055486, "dur": 13, "args": { "External id": 168624, "cbid": 211, "correlation": 168624 } }, { "ph": "s", "id": 168624, "pid": 76337, "tid": -914061504, "ts": 1716454224055486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224107798, "dur": 123, "args": { "External id": 168626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168626, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168626, "pid": 5, "tid": 7, "ts": 1716454224107798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055505, "dur": 6, "args": { "External id": 168626, "cbid": 211, "correlation": 168626 } }, { "ph": "s", "id": 168626, "pid": 76337, "tid": -914061504, "ts": 1716454224055505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224107922, "dur": 23, "args": { "External id": 168628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168628, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168628, "pid": 5, "tid": 7, "ts": 1716454224107922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055516, "dur": 5, "args": { "External id": 168628, "cbid": 211, "correlation": 168628 } }, { "ph": "s", "id": 168628, "pid": 76337, "tid": -914061504, "ts": 1716454224055516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224107946, "dur": 33, "args": { "External id": 168634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168634, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168634, "pid": 5, "tid": 7, "ts": 1716454224107946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055543, "dur": 9, "args": { "External id": 168634, "cbid": 211, "correlation": 168634 } }, { "ph": "s", "id": 168634, "pid": 76337, "tid": -914061504, "ts": 1716454224055543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224107980, "dur": 27, "args": { "External id": 168642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168642, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168642, "pid": 5, "tid": 7, "ts": 1716454224107980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055575, "dur": 8, "args": { "External id": 168642, "cbid": 211, "correlation": 168642 } }, { "ph": "s", "id": 168642, "pid": 76337, "tid": -914061504, "ts": 1716454224055575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224055649, "dur": 0, "args": { "External id": 168652, "cbid": 317, "correlation": 168652 } }, { "ph": "f", "id": 168652, "pid": 76337, "tid": -914061504, "ts": 1716454224055649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224055650, "dur": 0, "args": { "External id": 168653, "cbid": 203, "correlation": 168653 } }, { "ph": "f", "id": 168653, "pid": 76337, "tid": -914061504, "ts": 1716454224055650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224055650, "dur": 0, "args": { "External id": 168654, "cbid": 205, "correlation": 168654 } }, { "ph": "f", "id": 168654, "pid": 76337, "tid": -914061504, "ts": 1716454224055650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224108008, "dur": 23, "args": { "External id": 168658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168658, "pid": 5, "tid": 7, "ts": 1716454224108008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055665, "dur": 12, "args": { "External id": 168658, "cbid": 211, "correlation": 168658 } }, { "ph": "s", "id": 168658, "pid": 76337, "tid": -914061504, "ts": 1716454224055665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224108032, "dur": 44, "args": { "External id": 168660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168660, "pid": 5, "tid": 7, "ts": 1716454224108032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055679, "dur": 5, "args": { "External id": 168660, "cbid": 211, "correlation": 168660 } }, { "ph": "s", "id": 168660, "pid": 76337, "tid": -914061504, "ts": 1716454224055679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224108077, "dur": 236, "args": { "External id": 168662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168662, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 168662, "pid": 5, "tid": 7, "ts": 1716454224108077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055691, "dur": 8, "args": { "External id": 168662, "cbid": 211, "correlation": 168662 } }, { "ph": "s", "id": 168662, "pid": 76337, "tid": -914061504, "ts": 1716454224055691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224108314, "dur": 6, "args": { "External id": 168664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168664, "pid": 5, "tid": 7, "ts": 1716454224108314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055702, "dur": 5, "args": { "External id": 168664, "cbid": 211, "correlation": 168664 } }, { "ph": "s", "id": 168664, "pid": 76337, "tid": -914061504, "ts": 1716454224055702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224108322, "dur": 9, "args": { "External id": 168670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168670, "pid": 5, "tid": 7, "ts": 1716454224108322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055728, "dur": 8, "args": { "External id": 168670, "cbid": 211, "correlation": 168670 } }, { "ph": "s", "id": 168670, "pid": 76337, "tid": -914061504, "ts": 1716454224055728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224108333, "dur": 12, "args": { "External id": 168690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168690, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168690, "pid": 5, "tid": 7, "ts": 1716454224108333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055825, "dur": 12, "args": { "External id": 168690, "cbid": 211, "correlation": 168690 } }, { "ph": "s", "id": 168690, "pid": 76337, "tid": -914061504, "ts": 1716454224055825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224108346, "dur": 4, "args": { "External id": 168702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168702, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 168702, "pid": 5, "tid": 7, "ts": 1716454224108346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055848, "dur": 7, "args": { "External id": 168702, "cbid": 211, "correlation": 168702 } }, { "ph": "s", "id": 168702, "pid": 76337, "tid": -914061504, "ts": 1716454224055848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224108351, "dur": 12, "args": { "External id": 168705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168705, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168705, "pid": 5, "tid": 7, "ts": 1716454224108351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055866, "dur": 6, "args": { "External id": 168705, "cbid": 211, "correlation": 168705 } }, { "ph": "s", "id": 168705, "pid": 76337, "tid": -914061504, "ts": 1716454224055866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224108365, "dur": 7, "args": { "External id": 168714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168714, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168714, "pid": 5, "tid": 7, "ts": 1716454224108365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055906, "dur": 10, "args": { "External id": 168714, "cbid": 211, "correlation": 168714 } }, { "ph": "s", "id": 168714, "pid": 76337, "tid": -914061504, "ts": 1716454224055906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224055959, "dur": 0, "args": { "External id": 168724, "cbid": 317, "correlation": 168724 } }, { "ph": "f", "id": 168724, "pid": 76337, "tid": -914061504, "ts": 1716454224055959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224055960, "dur": 0, "args": { "External id": 168725, "cbid": 203, "correlation": 168725 } }, { "ph": "f", "id": 168725, "pid": 76337, "tid": -914061504, "ts": 1716454224055960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224055961, "dur": 0, "args": { "External id": 168726, "cbid": 205, "correlation": 168726 } }, { "ph": "f", "id": 168726, "pid": 76337, "tid": -914061504, "ts": 1716454224055961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224108373, "dur": 6, "args": { "External id": 168730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168730, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168730, "pid": 5, "tid": 7, "ts": 1716454224108373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224055985, "dur": 12, "args": { "External id": 168730, "cbid": 211, "correlation": 168730 } }, { "ph": "s", "id": 168730, "pid": 76337, "tid": -914061504, "ts": 1716454224055985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224108380, "dur": 84, "args": { "External id": 168732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168732, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168732, "pid": 5, "tid": 7, "ts": 1716454224108380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056000, "dur": 5, "args": { "External id": 168732, "cbid": 211, "correlation": 168732 } }, { "ph": "s", "id": 168732, "pid": 76337, "tid": -914061504, "ts": 1716454224056000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224108466, "dur": 1, "args": { "External id": 168734, "device": 5, "context": 1, "stream": 7, "correlation": 168734, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 168734, "pid": 5, "tid": 7, "ts": 1716454224108466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224056013, "dur": 14, "args": { "External id": 168734, "cbid": 51, "correlation": 168734 } }, { "ph": "s", "id": 168734, "pid": 76337, "tid": -914061504, "ts": 1716454224056013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224108470, "dur": 541, "args": { "External id": 168735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168735, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168735, "pid": 5, "tid": 7, "ts": 1716454224108470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056028, "dur": 9, "args": { "External id": 168735, "cbid": 211, "correlation": 168735 } }, { "ph": "s", "id": 168735, "pid": 76337, "tid": -914061504, "ts": 1716454224056028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224109012, "dur": 12, "args": { "External id": 168737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168737, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168737, "pid": 5, "tid": 7, "ts": 1716454224109012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056041, "dur": 6, "args": { "External id": 168737, "cbid": 211, "correlation": 168737 } }, { "ph": "s", "id": 168737, "pid": 76337, "tid": -914061504, "ts": 1716454224056041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224109025, "dur": 15, "args": { "External id": 168743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168743, "pid": 5, "tid": 7, "ts": 1716454224109025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056070, "dur": 9, "args": { "External id": 168743, "cbid": 211, "correlation": 168743 } }, { "ph": "s", "id": 168743, "pid": 76337, "tid": -914061504, "ts": 1716454224056070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224109041, "dur": 4, "args": { "External id": 168751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168751, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 168751, "pid": 5, "tid": 7, "ts": 1716454224109041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056115, "dur": 9, "args": { "External id": 168751, "cbid": 211, "correlation": 168751 } }, { "ph": "s", "id": 168751, "pid": 76337, "tid": -914061504, "ts": 1716454224056115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224056179, "dur": 1, "args": { "External id": 168767, "cbid": 251, "correlation": 168767 } }, { "ph": "f", "id": 168767, "pid": 76337, "tid": -914061504, "ts": 1716454224056179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224056184, "dur": 0, "args": { "External id": 168769, "cbid": 251, "correlation": 168769 } }, { "ph": "f", "id": 168769, "pid": 76337, "tid": -914061504, "ts": 1716454224056184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224109047, "dur": 13, "args": { "External id": 168770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168770, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168770, "pid": 5, "tid": 7, "ts": 1716454224109047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056186, "dur": 12, "args": { "External id": 168770, "cbid": 211, "correlation": 168770 } }, { "ph": "s", "id": 168770, "pid": 76337, "tid": -914061504, "ts": 1716454224056186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224109061, "dur": 5, "args": { "External id": 168772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168772, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168772, "pid": 5, "tid": 7, "ts": 1716454224109061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056200, "dur": 6, "args": { "External id": 168772, "cbid": 211, "correlation": 168772 } }, { "ph": "s", "id": 168772, "pid": 76337, "tid": -914061504, "ts": 1716454224056200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224109068, "dur": 18, "args": { "External id": 168782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168782, "pid": 5, "tid": 7, "ts": 1716454224109068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056258, "dur": 12, "args": { "External id": 168782, "cbid": 211, "correlation": 168782 } }, { "ph": "s", "id": 168782, "pid": 76337, "tid": -914061504, "ts": 1716454224056258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224109087, "dur": 18, "args": { "External id": 168802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168802, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168802, "pid": 5, "tid": 7, "ts": 1716454224109087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056324, "dur": 10, "args": { "External id": 168802, "cbid": 211, "correlation": 168802 } }, { "ph": "s", "id": 168802, "pid": 76337, "tid": -914061504, "ts": 1716454224056324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224109106, "dur": 5, "args": { "External id": 168814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168814, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 168814, "pid": 5, "tid": 7, "ts": 1716454224109106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056345, "dur": 7, "args": { "External id": 168814, "cbid": 211, "correlation": 168814 } }, { "ph": "s", "id": 168814, "pid": 76337, "tid": -914061504, "ts": 1716454224056345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224109112, "dur": 17, "args": { "External id": 168817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168817, "pid": 5, "tid": 7, "ts": 1716454224109112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056364, "dur": 6, "args": { "External id": 168817, "cbid": 211, "correlation": 168817 } }, { "ph": "s", "id": 168817, "pid": 76337, "tid": -914061504, "ts": 1716454224056364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224109130, "dur": 11, "args": { "External id": 168826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168826, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168826, "pid": 5, "tid": 7, "ts": 1716454224109130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056405, "dur": 9, "args": { "External id": 168826, "cbid": 211, "correlation": 168826 } }, { "ph": "s", "id": 168826, "pid": 76337, "tid": -914061504, "ts": 1716454224056405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224056467, "dur": 0, "args": { "External id": 168836, "cbid": 317, "correlation": 168836 } }, { "ph": "f", "id": 168836, "pid": 76337, "tid": -914061504, "ts": 1716454224056467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224056468, "dur": 0, "args": { "External id": 168837, "cbid": 203, "correlation": 168837 } }, { "ph": "f", "id": 168837, "pid": 76337, "tid": -914061504, "ts": 1716454224056468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224056469, "dur": 0, "args": { "External id": 168838, "cbid": 205, "correlation": 168838 } }, { "ph": "f", "id": 168838, "pid": 76337, "tid": -914061504, "ts": 1716454224056469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224109142, "dur": 12, "args": { "External id": 168842, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168842, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168842, "pid": 5, "tid": 7, "ts": 1716454224109142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056486, "dur": 12, "args": { "External id": 168842, "cbid": 211, "correlation": 168842 } }, { "ph": "s", "id": 168842, "pid": 76337, "tid": -914061504, "ts": 1716454224056486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224109155, "dur": 164, "args": { "External id": 168844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168844, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168844, "pid": 5, "tid": 7, "ts": 1716454224109155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056502, "dur": 5, "args": { "External id": 168844, "cbid": 211, "correlation": 168844 } }, { "ph": "s", "id": 168844, "pid": 76337, "tid": -914061504, "ts": 1716454224056502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224109321, "dur": 1, "args": { "External id": 168846, "device": 5, "context": 1, "stream": 7, "correlation": 168846, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 168846, "pid": 5, "tid": 7, "ts": 1716454224109321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224056514, "dur": 9, "args": { "External id": 168846, "cbid": 51, "correlation": 168846 } }, { "ph": "s", "id": 168846, "pid": 76337, "tid": -914061504, "ts": 1716454224056514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224109325, "dur": 666, "args": { "External id": 168847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168847, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 168847, "pid": 5, "tid": 7, "ts": 1716454224109325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056525, "dur": 8, "args": { "External id": 168847, "cbid": 211, "correlation": 168847 } }, { "ph": "s", "id": 168847, "pid": 76337, "tid": -914061504, "ts": 1716454224056525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224109992, "dur": 13, "args": { "External id": 168849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168849, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168849, "pid": 5, "tid": 7, "ts": 1716454224109992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056537, "dur": 5, "args": { "External id": 168849, "cbid": 211, "correlation": 168849 } }, { "ph": "s", "id": 168849, "pid": 76337, "tid": -914061504, "ts": 1716454224056537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224110007, "dur": 15, "args": { "External id": 168855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168855, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168855, "pid": 5, "tid": 7, "ts": 1716454224110007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056566, "dur": 8, "args": { "External id": 168855, "cbid": 211, "correlation": 168855 } }, { "ph": "s", "id": 168855, "pid": 76337, "tid": -914061504, "ts": 1716454224056566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224056624, "dur": 0, "args": { "External id": 168865, "cbid": 317, "correlation": 168865 } }, { "ph": "f", "id": 168865, "pid": 76337, "tid": -914061504, "ts": 1716454224056624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224056625, "dur": 0, "args": { "External id": 168866, "cbid": 203, "correlation": 168866 } }, { "ph": "f", "id": 168866, "pid": 76337, "tid": -914061504, "ts": 1716454224056625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224056626, "dur": 0, "args": { "External id": 168867, "cbid": 205, "correlation": 168867 } }, { "ph": "f", "id": 168867, "pid": 76337, "tid": -914061504, "ts": 1716454224056626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224110023, "dur": 9, "args": { "External id": 168871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168871, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168871, "pid": 5, "tid": 7, "ts": 1716454224110023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056641, "dur": 12, "args": { "External id": 168871, "cbid": 211, "correlation": 168871 } }, { "ph": "s", "id": 168871, "pid": 76337, "tid": -914061504, "ts": 1716454224056641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224110033, "dur": 3, "args": { "External id": 168873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 168873, "pid": 5, "tid": 7, "ts": 1716454224110033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056661, "dur": 6, "args": { "External id": 168873, "cbid": 211, "correlation": 168873 } }, { "ph": "s", "id": 168873, "pid": 76337, "tid": -914061504, "ts": 1716454224056661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224056672, "dur": 0, "args": { "External id": 168874, "cbid": 51, "correlation": 168874 } }, { "ph": "s", "id": 168874, "pid": 76337, "tid": -914061504, "ts": 1716454224056672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224110038, "dur": 58, "args": { "External id": 168875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168875, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 168875, "pid": 5, "tid": 7, "ts": 1716454224110038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056673, "dur": 7, "args": { "External id": 168875, "cbid": 211, "correlation": 168875 } }, { "ph": "s", "id": 168875, "pid": 76337, "tid": -914061504, "ts": 1716454224056673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224110097, "dur": 14, "args": { "External id": 168880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168880, "pid": 5, "tid": 7, "ts": 1716454224110097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056701, "dur": 9, "args": { "External id": 168880, "cbid": 211, "correlation": 168880 } }, { "ph": "s", "id": 168880, "pid": 76337, "tid": -914061504, "ts": 1716454224056701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224110113, "dur": 12, "args": { "External id": 168888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168888, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168888, "pid": 5, "tid": 7, "ts": 1716454224110113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056730, "dur": 8, "args": { "External id": 168888, "cbid": 211, "correlation": 168888 } }, { "ph": "s", "id": 168888, "pid": 76337, "tid": -914061504, "ts": 1716454224056730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224110126, "dur": 11, "args": { "External id": 168896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168896, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168896, "pid": 5, "tid": 7, "ts": 1716454224110126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056759, "dur": 8, "args": { "External id": 168896, "cbid": 211, "correlation": 168896 } }, { "ph": "s", "id": 168896, "pid": 76337, "tid": -914061504, "ts": 1716454224056759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224110138, "dur": 18, "args": { "External id": 168916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168916, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 168916, "pid": 5, "tid": 7, "ts": 1716454224110138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056840, "dur": 13, "args": { "External id": 168916, "cbid": 211, "correlation": 168916 } }, { "ph": "s", "id": 168916, "pid": 76337, "tid": -914061504, "ts": 1716454224056840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224110158, "dur": 5, "args": { "External id": 168928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168928, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 168928, "pid": 5, "tid": 7, "ts": 1716454224110158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056863, "dur": 6, "args": { "External id": 168928, "cbid": 211, "correlation": 168928 } }, { "ph": "s", "id": 168928, "pid": 76337, "tid": -914061504, "ts": 1716454224056863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224110163, "dur": 17, "args": { "External id": 168931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168931, "pid": 5, "tid": 7, "ts": 1716454224110163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056880, "dur": 6, "args": { "External id": 168931, "cbid": 211, "correlation": 168931 } }, { "ph": "s", "id": 168931, "pid": 76337, "tid": -914061504, "ts": 1716454224056880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224056938, "dur": 0, "args": { "External id": 168942, "cbid": 317, "correlation": 168942 } }, { "ph": "f", "id": 168942, "pid": 76337, "tid": -914061504, "ts": 1716454224056938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224056938, "dur": 0, "args": { "External id": 168943, "cbid": 203, "correlation": 168943 } }, { "ph": "f", "id": 168943, "pid": 76337, "tid": -914061504, "ts": 1716454224056938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224056939, "dur": 0, "args": { "External id": 168944, "cbid": 205, "correlation": 168944 } }, { "ph": "f", "id": 168944, "pid": 76337, "tid": -914061504, "ts": 1716454224056939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224110182, "dur": 11, "args": { "External id": 168948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168948, "pid": 5, "tid": 7, "ts": 1716454224110182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056955, "dur": 11, "args": { "External id": 168948, "cbid": 211, "correlation": 168948 } }, { "ph": "s", "id": 168948, "pid": 76337, "tid": -914061504, "ts": 1716454224056955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224110194, "dur": 4, "args": { "External id": 168950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168950, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 168950, "pid": 5, "tid": 7, "ts": 1716454224110194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056972, "dur": 15, "args": { "External id": 168950, "cbid": 211, "correlation": 168950 } }, { "ph": "s", "id": 168950, "pid": 76337, "tid": -914061504, "ts": 1716454224056972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224056991, "dur": 0, "args": { "External id": 168951, "cbid": 51, "correlation": 168951 } }, { "ph": "s", "id": 168951, "pid": 76337, "tid": -914061504, "ts": 1716454224056991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224110199, "dur": 99, "args": { "External id": 168952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168952, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 168952, "pid": 5, "tid": 7, "ts": 1716454224110199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224056992, "dur": 7, "args": { "External id": 168952, "cbid": 211, "correlation": 168952 } }, { "ph": "s", "id": 168952, "pid": 76337, "tid": -914061504, "ts": 1716454224056992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224110300, "dur": 16, "args": { "External id": 168957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168957, "pid": 5, "tid": 7, "ts": 1716454224110300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057021, "dur": 9, "args": { "External id": 168957, "cbid": 211, "correlation": 168957 } }, { "ph": "s", "id": 168957, "pid": 76337, "tid": -914061504, "ts": 1716454224057021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224110317, "dur": 84, "args": { "External id": 168966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168966, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168966, "pid": 5, "tid": 7, "ts": 1716454224110317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057106, "dur": 14, "args": { "External id": 168966, "cbid": 211, "correlation": 168966 } }, { "ph": "s", "id": 168966, "pid": 76337, "tid": -914061504, "ts": 1716454224057106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224110403, "dur": 31, "args": { "External id": 168988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 168988, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 168988, "pid": 5, "tid": 7, "ts": 1716454224110403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057162, "dur": 10, "args": { "External id": 168988, "cbid": 211, "correlation": 168988 } }, { "ph": "s", "id": 168988, "pid": 76337, "tid": -914061504, "ts": 1716454224057162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057253, "dur": 2, "args": { "External id": 168999, "cbid": 251, "correlation": 168999 } }, { "ph": "f", "id": 168999, "pid": 76337, "tid": -914061504, "ts": 1716454224057253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224110435, "dur": 165, "args": { "External id": 169000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169000, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169000, "pid": 5, "tid": 7, "ts": 1716454224110435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057258, "dur": 13, "args": { "External id": 169000, "cbid": 211, "correlation": 169000 } }, { "ph": "s", "id": 169000, "pid": 76337, "tid": -914061504, "ts": 1716454224057258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057328, "dur": 1, "args": { "External id": 169011, "cbid": 251, "correlation": 169011 } }, { "ph": "f", "id": 169011, "pid": 76337, "tid": -914061504, "ts": 1716454224057328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224110601, "dur": 160, "args": { "External id": 169012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169012, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169012, "pid": 5, "tid": 7, "ts": 1716454224110601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057332, "dur": 11, "args": { "External id": 169012, "cbid": 211, "correlation": 169012 } }, { "ph": "s", "id": 169012, "pid": 76337, "tid": -914061504, "ts": 1716454224057332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057396, "dur": 1, "args": { "External id": 169023, "cbid": 251, "correlation": 169023 } }, { "ph": "f", "id": 169023, "pid": 76337, "tid": -914061504, "ts": 1716454224057396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224110762, "dur": 158, "args": { "External id": 169024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169024, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169024, "pid": 5, "tid": 7, "ts": 1716454224110762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057400, "dur": 11, "args": { "External id": 169024, "cbid": 211, "correlation": 169024 } }, { "ph": "s", "id": 169024, "pid": 76337, "tid": -914061504, "ts": 1716454224057400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224110922, "dur": 341, "args": { "External id": 169049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169049, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169049, "pid": 5, "tid": 7, "ts": 1716454224110922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057488, "dur": 14, "args": { "External id": 169049, "cbid": 211, "correlation": 169049 } }, { "ph": "s", "id": 169049, "pid": 76337, "tid": -914061504, "ts": 1716454224057488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057588, "dur": 1, "args": { "External id": 169067, "cbid": 251, "correlation": 169067 } }, { "ph": "f", "id": 169067, "pid": 76337, "tid": -914061504, "ts": 1716454224057588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224111264, "dur": 167, "args": { "External id": 169069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169069, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169069, "pid": 5, "tid": 7, "ts": 1716454224111264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057594, "dur": 14, "args": { "External id": 169069, "cbid": 211, "correlation": 169069 } }, { "ph": "s", "id": 169069, "pid": 76337, "tid": -914061504, "ts": 1716454224057594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224111432, "dur": 19, "args": { "External id": 169077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169077, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169077, "pid": 5, "tid": 7, "ts": 1716454224111432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057664, "dur": 12, "args": { "External id": 169077, "cbid": 211, "correlation": 169077 } }, { "ph": "s", "id": 169077, "pid": 76337, "tid": -914061504, "ts": 1716454224057664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224111452, "dur": 27, "args": { "External id": 169085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169085, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169085, "pid": 5, "tid": 7, "ts": 1716454224111452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057703, "dur": 9, "args": { "External id": 169085, "cbid": 211, "correlation": 169085 } }, { "ph": "s", "id": 169085, "pid": 76337, "tid": -914061504, "ts": 1716454224057703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224111481, "dur": 19, "args": { "External id": 169096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169096, "pid": 5, "tid": 7, "ts": 1716454224111481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057775, "dur": 12, "args": { "External id": 169096, "cbid": 211, "correlation": 169096 } }, { "ph": "s", "id": 169096, "pid": 76337, "tid": -914061504, "ts": 1716454224057775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224111502, "dur": 17, "args": { "External id": 169118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169118, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169118, "pid": 5, "tid": 7, "ts": 1716454224111502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057805, "dur": 8, "args": { "External id": 169118, "cbid": 211, "correlation": 169118 } }, { "ph": "s", "id": 169118, "pid": 76337, "tid": -914061504, "ts": 1716454224057805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057890, "dur": 1, "args": { "External id": 169129, "cbid": 251, "correlation": 169129 } }, { "ph": "f", "id": 169129, "pid": 76337, "tid": -914061504, "ts": 1716454224057890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224111520, "dur": 90, "args": { "External id": 169130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169130, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169130, "pid": 5, "tid": 7, "ts": 1716454224111520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057897, "dur": 15, "args": { "External id": 169130, "cbid": 211, "correlation": 169130 } }, { "ph": "s", "id": 169130, "pid": 76337, "tid": -914061504, "ts": 1716454224057897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057968, "dur": 1, "args": { "External id": 169141, "cbid": 251, "correlation": 169141 } }, { "ph": "f", "id": 169141, "pid": 76337, "tid": -914061504, "ts": 1716454224057968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224057971, "dur": 0, "args": { "External id": 169142, "cbid": 251, "correlation": 169142 } }, { "ph": "f", "id": 169142, "pid": 76337, "tid": -914061504, "ts": 1716454224057971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224111611, "dur": 13, "args": { "External id": 169143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169143, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169143, "pid": 5, "tid": 7, "ts": 1716454224111611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057981, "dur": 13, "args": { "External id": 169143, "cbid": 211, "correlation": 169143 } }, { "ph": "s", "id": 169143, "pid": 76337, "tid": -914061504, "ts": 1716454224057981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224111625, "dur": 6, "args": { "External id": 169145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169145, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169145, "pid": 5, "tid": 7, "ts": 1716454224111625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224057996, "dur": 6, "args": { "External id": 169145, "cbid": 211, "correlation": 169145 } }, { "ph": "s", "id": 169145, "pid": 76337, "tid": -914061504, "ts": 1716454224057996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058056, "dur": 1, "args": { "External id": 169156, "cbid": 251, "correlation": 169156 } }, { "ph": "f", "id": 169156, "pid": 76337, "tid": -914061504, "ts": 1716454224058056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058060, "dur": 0, "args": { "External id": 169157, "cbid": 251, "correlation": 169157 } }, { "ph": "f", "id": 169157, "pid": 76337, "tid": -914061504, "ts": 1716454224058060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224111632, "dur": 9, "args": { "External id": 169158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169158, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169158, "pid": 5, "tid": 7, "ts": 1716454224111632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058062, "dur": 11, "args": { "External id": 169158, "cbid": 211, "correlation": 169158 } }, { "ph": "s", "id": 169158, "pid": 76337, "tid": -914061504, "ts": 1716454224058062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224111642, "dur": 3, "args": { "External id": 169160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169160, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169160, "pid": 5, "tid": 7, "ts": 1716454224111642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058075, "dur": 5, "args": { "External id": 169160, "cbid": 211, "correlation": 169160 } }, { "ph": "s", "id": 169160, "pid": 76337, "tid": -914061504, "ts": 1716454224058075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224111647, "dur": 55, "args": { "External id": 169185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169185, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169185, "pid": 5, "tid": 7, "ts": 1716454224111647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058152, "dur": 13, "args": { "External id": 169185, "cbid": 211, "correlation": 169185 } }, { "ph": "s", "id": 169185, "pid": 76337, "tid": -914061504, "ts": 1716454224058152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058252, "dur": 2, "args": { "External id": 169203, "cbid": 251, "correlation": 169203 } }, { "ph": "f", "id": 169203, "pid": 76337, "tid": -914061504, "ts": 1716454224058252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224111703, "dur": 92, "args": { "External id": 169205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169205, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169205, "pid": 5, "tid": 7, "ts": 1716454224111703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058260, "dur": 15, "args": { "External id": 169205, "cbid": 211, "correlation": 169205 } }, { "ph": "s", "id": 169205, "pid": 76337, "tid": -914061504, "ts": 1716454224058260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224111796, "dur": 9, "args": { "External id": 169213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169213, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169213, "pid": 5, "tid": 7, "ts": 1716454224111796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058331, "dur": 12, "args": { "External id": 169213, "cbid": 211, "correlation": 169213 } }, { "ph": "s", "id": 169213, "pid": 76337, "tid": -914061504, "ts": 1716454224058331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224111807, "dur": 21, "args": { "External id": 169221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169221, "pid": 5, "tid": 7, "ts": 1716454224111807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058373, "dur": 9, "args": { "External id": 169221, "cbid": 211, "correlation": 169221 } }, { "ph": "s", "id": 169221, "pid": 76337, "tid": -914061504, "ts": 1716454224058373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224111830, "dur": 18, "args": { "External id": 169243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169243, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169243, "pid": 5, "tid": 7, "ts": 1716454224111830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058424, "dur": 10, "args": { "External id": 169243, "cbid": 211, "correlation": 169243 } }, { "ph": "s", "id": 169243, "pid": 76337, "tid": -914061504, "ts": 1716454224058424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058514, "dur": 2, "args": { "External id": 169259, "cbid": 251, "correlation": 169259 } }, { "ph": "f", "id": 169259, "pid": 76337, "tid": -914061504, "ts": 1716454224058514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058520, "dur": 0, "args": { "External id": 169261, "cbid": 251, "correlation": 169261 } }, { "ph": "f", "id": 169261, "pid": 76337, "tid": -914061504, "ts": 1716454224058520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224111849, "dur": 495, "args": { "External id": 169262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169262, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169262, "pid": 5, "tid": 7, "ts": 1716454224111849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058522, "dur": 15, "args": { "External id": 169262, "cbid": 211, "correlation": 169262 } }, { "ph": "s", "id": 169262, "pid": 76337, "tid": -914061504, "ts": 1716454224058522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224112345, "dur": 66, "args": { "External id": 169270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169270, "pid": 5, "tid": 7, "ts": 1716454224112345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058590, "dur": 12, "args": { "External id": 169270, "cbid": 211, "correlation": 169270 } }, { "ph": "s", "id": 169270, "pid": 76337, "tid": -914061504, "ts": 1716454224058590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224112413, "dur": 69, "args": { "External id": 169278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169278, "pid": 5, "tid": 7, "ts": 1716454224112413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058622, "dur": 8, "args": { "External id": 169278, "cbid": 211, "correlation": 169278 } }, { "ph": "s", "id": 169278, "pid": 76337, "tid": -914061504, "ts": 1716454224058622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224058701, "dur": 1, "args": { "External id": 169294, "cbid": 251, "correlation": 169294 } }, { "ph": "f", "id": 169294, "pid": 76337, "tid": -914061504, "ts": 1716454224058701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224112484, "dur": 1, "args": { "External id": 169296, "device": 5, "context": 1, "stream": 7, "correlation": 169296, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 169296, "pid": 5, "tid": 7, "ts": 1716454224112484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224058707, "dur": 12, "args": { "External id": 169296, "cbid": 51, "correlation": 169296 } }, { "ph": "s", "id": 169296, "pid": 76337, "tid": -914061504, "ts": 1716454224058707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224112487, "dur": 269, "args": { "External id": 169297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169297, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169297, "pid": 5, "tid": 7, "ts": 1716454224112487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058720, "dur": 11, "args": { "External id": 169297, "cbid": 211, "correlation": 169297 } }, { "ph": "s", "id": 169297, "pid": 76337, "tid": -914061504, "ts": 1716454224058720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224112757, "dur": 15, "args": { "External id": 169305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169305, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169305, "pid": 5, "tid": 7, "ts": 1716454224112757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058762, "dur": 10, "args": { "External id": 169305, "cbid": 211, "correlation": 169305 } }, { "ph": "s", "id": 169305, "pid": 76337, "tid": -914061504, "ts": 1716454224058762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224112773, "dur": 38, "args": { "External id": 169316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169316, "pid": 5, "tid": 7, "ts": 1716454224112773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058829, "dur": 12, "args": { "External id": 169316, "cbid": 211, "correlation": 169316 } }, { "ph": "s", "id": 169316, "pid": 76337, "tid": -914061504, "ts": 1716454224058829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224058893, "dur": 0, "args": { "External id": 169328, "cbid": 317, "correlation": 169328 } }, { "ph": "f", "id": 169328, "pid": 76337, "tid": -914061504, "ts": 1716454224058893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224058894, "dur": 0, "args": { "External id": 169329, "cbid": 203, "correlation": 169329 } }, { "ph": "f", "id": 169329, "pid": 76337, "tid": -914061504, "ts": 1716454224058894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224058895, "dur": 0, "args": { "External id": 169330, "cbid": 205, "correlation": 169330 } }, { "ph": "f", "id": 169330, "pid": 76337, "tid": -914061504, "ts": 1716454224058895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224112813, "dur": 13, "args": { "External id": 169334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169334, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169334, "pid": 5, "tid": 7, "ts": 1716454224112813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058910, "dur": 13, "args": { "External id": 169334, "cbid": 211, "correlation": 169334 } }, { "ph": "s", "id": 169334, "pid": 76337, "tid": -914061504, "ts": 1716454224058910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224112827, "dur": 4, "args": { "External id": 169336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169336, "pid": 5, "tid": 7, "ts": 1716454224112827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058927, "dur": 6, "args": { "External id": 169336, "cbid": 211, "correlation": 169336 } }, { "ph": "s", "id": 169336, "pid": 76337, "tid": -914061504, "ts": 1716454224058927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224058936, "dur": 0, "args": { "External id": 169337, "cbid": 51, "correlation": 169337 } }, { "ph": "s", "id": 169337, "pid": 76337, "tid": -914061504, "ts": 1716454224058936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224112833, "dur": 97, "args": { "External id": 169338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169338, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 169338, "pid": 5, "tid": 7, "ts": 1716454224112833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058937, "dur": 5, "args": { "External id": 169338, "cbid": 211, "correlation": 169338 } }, { "ph": "s", "id": 169338, "pid": 76337, "tid": -914061504, "ts": 1716454224058937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224112931, "dur": 17, "args": { "External id": 169343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169343, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169343, "pid": 5, "tid": 7, "ts": 1716454224112931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224058964, "dur": 18, "args": { "External id": 169343, "cbid": 211, "correlation": 169343 } }, { "ph": "s", "id": 169343, "pid": 76337, "tid": -914061504, "ts": 1716454224058964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224112949, "dur": 11, "args": { "External id": 169351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169351, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169351, "pid": 5, "tid": 7, "ts": 1716454224112949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059005, "dur": 10, "args": { "External id": 169351, "cbid": 211, "correlation": 169351 } }, { "ph": "s", "id": 169351, "pid": 76337, "tid": -914061504, "ts": 1716454224059005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224112962, "dur": 18, "args": { "External id": 169371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169371, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 169371, "pid": 5, "tid": 7, "ts": 1716454224112962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059080, "dur": 12, "args": { "External id": 169371, "cbid": 211, "correlation": 169371 } }, { "ph": "s", "id": 169371, "pid": 76337, "tid": -914061504, "ts": 1716454224059080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224112982, "dur": 5, "args": { "External id": 169383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169383, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 169383, "pid": 5, "tid": 7, "ts": 1716454224112982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059102, "dur": 6, "args": { "External id": 169383, "cbid": 211, "correlation": 169383 } }, { "ph": "s", "id": 169383, "pid": 76337, "tid": -914061504, "ts": 1716454224059102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224112988, "dur": 18, "args": { "External id": 169386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169386, "pid": 5, "tid": 7, "ts": 1716454224112988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059121, "dur": 6, "args": { "External id": 169386, "cbid": 211, "correlation": 169386 } }, { "ph": "s", "id": 169386, "pid": 76337, "tid": -914061504, "ts": 1716454224059121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224113007, "dur": 12, "args": { "External id": 169395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169395, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169395, "pid": 5, "tid": 7, "ts": 1716454224113007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059158, "dur": 10, "args": { "External id": 169395, "cbid": 211, "correlation": 169395 } }, { "ph": "s", "id": 169395, "pid": 76337, "tid": -914061504, "ts": 1716454224059158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224059209, "dur": 0, "args": { "External id": 169405, "cbid": 317, "correlation": 169405 } }, { "ph": "f", "id": 169405, "pid": 76337, "tid": -914061504, "ts": 1716454224059209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224059210, "dur": 0, "args": { "External id": 169406, "cbid": 203, "correlation": 169406 } }, { "ph": "f", "id": 169406, "pid": 76337, "tid": -914061504, "ts": 1716454224059210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224059211, "dur": 0, "args": { "External id": 169407, "cbid": 205, "correlation": 169407 } }, { "ph": "f", "id": 169407, "pid": 76337, "tid": -914061504, "ts": 1716454224059211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224113020, "dur": 11, "args": { "External id": 169411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169411, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169411, "pid": 5, "tid": 7, "ts": 1716454224113020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059226, "dur": 12, "args": { "External id": 169411, "cbid": 211, "correlation": 169411 } }, { "ph": "s", "id": 169411, "pid": 76337, "tid": -914061504, "ts": 1716454224059226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224113033, "dur": 165, "args": { "External id": 169413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169413, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169413, "pid": 5, "tid": 7, "ts": 1716454224113033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059240, "dur": 5, "args": { "External id": 169413, "cbid": 211, "correlation": 169413 } }, { "ph": "s", "id": 169413, "pid": 76337, "tid": -914061504, "ts": 1716454224059240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224113201, "dur": 1, "args": { "External id": 169415, "device": 5, "context": 1, "stream": 7, "correlation": 169415, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 169415, "pid": 5, "tid": 7, "ts": 1716454224113201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224059251, "dur": 8, "args": { "External id": 169415, "cbid": 51, "correlation": 169415 } }, { "ph": "s", "id": 169415, "pid": 76337, "tid": -914061504, "ts": 1716454224059251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224113204, "dur": 669, "args": { "External id": 169416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169416, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169416, "pid": 5, "tid": 7, "ts": 1716454224113204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059260, "dur": 7, "args": { "External id": 169416, "cbid": 211, "correlation": 169416 } }, { "ph": "s", "id": 169416, "pid": 76337, "tid": -914061504, "ts": 1716454224059260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224113875, "dur": 15, "args": { "External id": 169418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169418, "pid": 5, "tid": 7, "ts": 1716454224113875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059271, "dur": 5, "args": { "External id": 169418, "cbid": 211, "correlation": 169418 } }, { "ph": "s", "id": 169418, "pid": 76337, "tid": -914061504, "ts": 1716454224059271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224113891, "dur": 15, "args": { "External id": 169424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169424, "pid": 5, "tid": 7, "ts": 1716454224113891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059299, "dur": 9, "args": { "External id": 169424, "cbid": 211, "correlation": 169424 } }, { "ph": "s", "id": 169424, "pid": 76337, "tid": -914061504, "ts": 1716454224059299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224113907, "dur": 4, "args": { "External id": 169432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169432, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 169432, "pid": 5, "tid": 7, "ts": 1716454224113907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059342, "dur": 9, "args": { "External id": 169432, "cbid": 211, "correlation": 169432 } }, { "ph": "s", "id": 169432, "pid": 76337, "tid": -914061504, "ts": 1716454224059342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224059407, "dur": 1, "args": { "External id": 169448, "cbid": 251, "correlation": 169448 } }, { "ph": "f", "id": 169448, "pid": 76337, "tid": -914061504, "ts": 1716454224059407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224059412, "dur": 0, "args": { "External id": 169450, "cbid": 251, "correlation": 169450 } }, { "ph": "f", "id": 169450, "pid": 76337, "tid": -914061504, "ts": 1716454224059412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224113912, "dur": 13, "args": { "External id": 169451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169451, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169451, "pid": 5, "tid": 7, "ts": 1716454224113912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059414, "dur": 12, "args": { "External id": 169451, "cbid": 211, "correlation": 169451 } }, { "ph": "s", "id": 169451, "pid": 76337, "tid": -914061504, "ts": 1716454224059414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224113927, "dur": 5, "args": { "External id": 169453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169453, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169453, "pid": 5, "tid": 7, "ts": 1716454224113927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059428, "dur": 6, "args": { "External id": 169453, "cbid": 211, "correlation": 169453 } }, { "ph": "s", "id": 169453, "pid": 76337, "tid": -914061504, "ts": 1716454224059428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224113933, "dur": 17, "args": { "External id": 169463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169463, "pid": 5, "tid": 7, "ts": 1716454224113933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059486, "dur": 12, "args": { "External id": 169463, "cbid": 211, "correlation": 169463 } }, { "ph": "s", "id": 169463, "pid": 76337, "tid": -914061504, "ts": 1716454224059486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224113951, "dur": 18, "args": { "External id": 169483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169483, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 169483, "pid": 5, "tid": 7, "ts": 1716454224113951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059552, "dur": 11, "args": { "External id": 169483, "cbid": 211, "correlation": 169483 } }, { "ph": "s", "id": 169483, "pid": 76337, "tid": -914061504, "ts": 1716454224059552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224113970, "dur": 5, "args": { "External id": 169495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169495, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 169495, "pid": 5, "tid": 7, "ts": 1716454224113970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059574, "dur": 7, "args": { "External id": 169495, "cbid": 211, "correlation": 169495 } }, { "ph": "s", "id": 169495, "pid": 76337, "tid": -914061504, "ts": 1716454224059574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224113976, "dur": 17, "args": { "External id": 169498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169498, "pid": 5, "tid": 7, "ts": 1716454224113976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059593, "dur": 6, "args": { "External id": 169498, "cbid": 211, "correlation": 169498 } }, { "ph": "s", "id": 169498, "pid": 76337, "tid": -914061504, "ts": 1716454224059593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224113995, "dur": 11, "args": { "External id": 169507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169507, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169507, "pid": 5, "tid": 7, "ts": 1716454224113995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059633, "dur": 9, "args": { "External id": 169507, "cbid": 211, "correlation": 169507 } }, { "ph": "s", "id": 169507, "pid": 76337, "tid": -914061504, "ts": 1716454224059633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224059695, "dur": 0, "args": { "External id": 169517, "cbid": 317, "correlation": 169517 } }, { "ph": "f", "id": 169517, "pid": 76337, "tid": -914061504, "ts": 1716454224059695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224059695, "dur": 0, "args": { "External id": 169518, "cbid": 203, "correlation": 169518 } }, { "ph": "f", "id": 169518, "pid": 76337, "tid": -914061504, "ts": 1716454224059695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224059696, "dur": 0, "args": { "External id": 169519, "cbid": 205, "correlation": 169519 } }, { "ph": "f", "id": 169519, "pid": 76337, "tid": -914061504, "ts": 1716454224059696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224114007, "dur": 11, "args": { "External id": 169523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169523, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169523, "pid": 5, "tid": 7, "ts": 1716454224114007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059710, "dur": 13, "args": { "External id": 169523, "cbid": 211, "correlation": 169523 } }, { "ph": "s", "id": 169523, "pid": 76337, "tid": -914061504, "ts": 1716454224059710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224114020, "dur": 164, "args": { "External id": 169525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169525, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169525, "pid": 5, "tid": 7, "ts": 1716454224114020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059726, "dur": 5, "args": { "External id": 169525, "cbid": 211, "correlation": 169525 } }, { "ph": "s", "id": 169525, "pid": 76337, "tid": -914061504, "ts": 1716454224059726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224114186, "dur": 1, "args": { "External id": 169527, "device": 5, "context": 1, "stream": 7, "correlation": 169527, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 169527, "pid": 5, "tid": 7, "ts": 1716454224114186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224059737, "dur": 6, "args": { "External id": 169527, "cbid": 51, "correlation": 169527 } }, { "ph": "s", "id": 169527, "pid": 76337, "tid": -914061504, "ts": 1716454224059737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224114190, "dur": 652, "args": { "External id": 169528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169528, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169528, "pid": 5, "tid": 7, "ts": 1716454224114190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059744, "dur": 6, "args": { "External id": 169528, "cbid": 211, "correlation": 169528 } }, { "ph": "s", "id": 169528, "pid": 76337, "tid": -914061504, "ts": 1716454224059744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224114844, "dur": 13, "args": { "External id": 169530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169530, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169530, "pid": 5, "tid": 7, "ts": 1716454224114844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059753, "dur": 5, "args": { "External id": 169530, "cbid": 211, "correlation": 169530 } }, { "ph": "s", "id": 169530, "pid": 76337, "tid": -914061504, "ts": 1716454224059753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224114858, "dur": 15, "args": { "External id": 169536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169536, "pid": 5, "tid": 7, "ts": 1716454224114858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059782, "dur": 8, "args": { "External id": 169536, "cbid": 211, "correlation": 169536 } }, { "ph": "s", "id": 169536, "pid": 76337, "tid": -914061504, "ts": 1716454224059782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224114874, "dur": 12, "args": { "External id": 169544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169544, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169544, "pid": 5, "tid": 7, "ts": 1716454224114874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059813, "dur": 8, "args": { "External id": 169544, "cbid": 211, "correlation": 169544 } }, { "ph": "s", "id": 169544, "pid": 76337, "tid": -914061504, "ts": 1716454224059813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224114888, "dur": 10, "args": { "External id": 169552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169552, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169552, "pid": 5, "tid": 7, "ts": 1716454224114888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059842, "dur": 8, "args": { "External id": 169552, "cbid": 211, "correlation": 169552 } }, { "ph": "s", "id": 169552, "pid": 76337, "tid": -914061504, "ts": 1716454224059842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224114899, "dur": 18, "args": { "External id": 169572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169572, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 169572, "pid": 5, "tid": 7, "ts": 1716454224114899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059920, "dur": 13, "args": { "External id": 169572, "cbid": 211, "correlation": 169572 } }, { "ph": "s", "id": 169572, "pid": 76337, "tid": -914061504, "ts": 1716454224059920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224114919, "dur": 4, "args": { "External id": 169584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169584, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 169584, "pid": 5, "tid": 7, "ts": 1716454224114919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059942, "dur": 6, "args": { "External id": 169584, "cbid": 211, "correlation": 169584 } }, { "ph": "s", "id": 169584, "pid": 76337, "tid": -914061504, "ts": 1716454224059942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224114924, "dur": 16, "args": { "External id": 169587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169587, "pid": 5, "tid": 7, "ts": 1716454224114924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224059960, "dur": 7, "args": { "External id": 169587, "cbid": 211, "correlation": 169587 } }, { "ph": "s", "id": 169587, "pid": 76337, "tid": -914061504, "ts": 1716454224059960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224060031, "dur": 0, "args": { "External id": 169598, "cbid": 317, "correlation": 169598 } }, { "ph": "f", "id": 169598, "pid": 76337, "tid": -914061504, "ts": 1716454224060031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224060032, "dur": 0, "args": { "External id": 169599, "cbid": 203, "correlation": 169599 } }, { "ph": "f", "id": 169599, "pid": 76337, "tid": -914061504, "ts": 1716454224060032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224060032, "dur": 0, "args": { "External id": 169600, "cbid": 205, "correlation": 169600 } }, { "ph": "f", "id": 169600, "pid": 76337, "tid": -914061504, "ts": 1716454224060032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224114942, "dur": 11, "args": { "External id": 169604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169604, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169604, "pid": 5, "tid": 7, "ts": 1716454224114942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060045, "dur": 12, "args": { "External id": 169604, "cbid": 211, "correlation": 169604 } }, { "ph": "s", "id": 169604, "pid": 76337, "tid": -914061504, "ts": 1716454224060045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224114954, "dur": 4, "args": { "External id": 169606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169606, "pid": 5, "tid": 7, "ts": 1716454224114954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060062, "dur": 6, "args": { "External id": 169606, "cbid": 211, "correlation": 169606 } }, { "ph": "s", "id": 169606, "pid": 76337, "tid": -914061504, "ts": 1716454224060062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224060070, "dur": 0, "args": { "External id": 169607, "cbid": 51, "correlation": 169607 } }, { "ph": "s", "id": 169607, "pid": 76337, "tid": -914061504, "ts": 1716454224060070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224114960, "dur": 96, "args": { "External id": 169608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169608, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 169608, "pid": 5, "tid": 7, "ts": 1716454224114960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060071, "dur": 5, "args": { "External id": 169608, "cbid": 211, "correlation": 169608 } }, { "ph": "s", "id": 169608, "pid": 76337, "tid": -914061504, "ts": 1716454224060071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224115057, "dur": 15, "args": { "External id": 169613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169613, "pid": 5, "tid": 7, "ts": 1716454224115057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060098, "dur": 8, "args": { "External id": 169613, "cbid": 211, "correlation": 169613 } }, { "ph": "s", "id": 169613, "pid": 76337, "tid": -914061504, "ts": 1716454224060098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224115074, "dur": 85, "args": { "External id": 169622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169622, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169622, "pid": 5, "tid": 7, "ts": 1716454224115074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060180, "dur": 14, "args": { "External id": 169622, "cbid": 211, "correlation": 169622 } }, { "ph": "s", "id": 169622, "pid": 76337, "tid": -914061504, "ts": 1716454224060180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224115160, "dur": 30, "args": { "External id": 169644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169644, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169644, "pid": 5, "tid": 7, "ts": 1716454224115160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060238, "dur": 10, "args": { "External id": 169644, "cbid": 211, "correlation": 169644 } }, { "ph": "s", "id": 169644, "pid": 76337, "tid": -914061504, "ts": 1716454224060238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224060324, "dur": 2, "args": { "External id": 169655, "cbid": 251, "correlation": 169655 } }, { "ph": "f", "id": 169655, "pid": 76337, "tid": -914061504, "ts": 1716454224060324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224115191, "dur": 162, "args": { "External id": 169656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169656, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169656, "pid": 5, "tid": 7, "ts": 1716454224115191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060331, "dur": 13, "args": { "External id": 169656, "cbid": 211, "correlation": 169656 } }, { "ph": "s", "id": 169656, "pid": 76337, "tid": -914061504, "ts": 1716454224060331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224060400, "dur": 1, "args": { "External id": 169667, "cbid": 251, "correlation": 169667 } }, { "ph": "f", "id": 169667, "pid": 76337, "tid": -914061504, "ts": 1716454224060400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224115354, "dur": 160, "args": { "External id": 169668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169668, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169668, "pid": 5, "tid": 7, "ts": 1716454224115354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060404, "dur": 12, "args": { "External id": 169668, "cbid": 211, "correlation": 169668 } }, { "ph": "s", "id": 169668, "pid": 76337, "tid": -914061504, "ts": 1716454224060404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224060468, "dur": 1, "args": { "External id": 169679, "cbid": 251, "correlation": 169679 } }, { "ph": "f", "id": 169679, "pid": 76337, "tid": -914061504, "ts": 1716454224060468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224115516, "dur": 161, "args": { "External id": 169680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169680, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169680, "pid": 5, "tid": 7, "ts": 1716454224115516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060472, "dur": 12, "args": { "External id": 169680, "cbid": 211, "correlation": 169680 } }, { "ph": "s", "id": 169680, "pid": 76337, "tid": -914061504, "ts": 1716454224060472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224115678, "dur": 342, "args": { "External id": 169705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169705, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169705, "pid": 5, "tid": 7, "ts": 1716454224115678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060556, "dur": 12, "args": { "External id": 169705, "cbid": 211, "correlation": 169705 } }, { "ph": "s", "id": 169705, "pid": 76337, "tid": -914061504, "ts": 1716454224060556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224060655, "dur": 1, "args": { "External id": 169723, "cbid": 251, "correlation": 169723 } }, { "ph": "f", "id": 169723, "pid": 76337, "tid": -914061504, "ts": 1716454224060655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224116022, "dur": 167, "args": { "External id": 169725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169725, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169725, "pid": 5, "tid": 7, "ts": 1716454224116022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060661, "dur": 14, "args": { "External id": 169725, "cbid": 211, "correlation": 169725 } }, { "ph": "s", "id": 169725, "pid": 76337, "tid": -914061504, "ts": 1716454224060661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224116190, "dur": 19, "args": { "External id": 169733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169733, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169733, "pid": 5, "tid": 7, "ts": 1716454224116190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060731, "dur": 13, "args": { "External id": 169733, "cbid": 211, "correlation": 169733 } }, { "ph": "s", "id": 169733, "pid": 76337, "tid": -914061504, "ts": 1716454224060731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224116211, "dur": 28, "args": { "External id": 169741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169741, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169741, "pid": 5, "tid": 7, "ts": 1716454224116211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060770, "dur": 8, "args": { "External id": 169741, "cbid": 211, "correlation": 169741 } }, { "ph": "s", "id": 169741, "pid": 76337, "tid": -914061504, "ts": 1716454224060770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224116240, "dur": 19, "args": { "External id": 169752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169752, "pid": 5, "tid": 7, "ts": 1716454224116240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060841, "dur": 13, "args": { "External id": 169752, "cbid": 211, "correlation": 169752 } }, { "ph": "s", "id": 169752, "pid": 76337, "tid": -914061504, "ts": 1716454224060841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224116260, "dur": 16, "args": { "External id": 169774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169774, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169774, "pid": 5, "tid": 7, "ts": 1716454224116260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060872, "dur": 7, "args": { "External id": 169774, "cbid": 211, "correlation": 169774 } }, { "ph": "s", "id": 169774, "pid": 76337, "tid": -914061504, "ts": 1716454224060872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224060957, "dur": 1, "args": { "External id": 169785, "cbid": 251, "correlation": 169785 } }, { "ph": "f", "id": 169785, "pid": 76337, "tid": -914061504, "ts": 1716454224060957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224116278, "dur": 90, "args": { "External id": 169786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169786, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169786, "pid": 5, "tid": 7, "ts": 1716454224116278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224060963, "dur": 25, "args": { "External id": 169786, "cbid": 211, "correlation": 169786 } }, { "ph": "s", "id": 169786, "pid": 76337, "tid": -914061504, "ts": 1716454224060963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061043, "dur": 1, "args": { "External id": 169797, "cbid": 251, "correlation": 169797 } }, { "ph": "f", "id": 169797, "pid": 76337, "tid": -914061504, "ts": 1716454224061043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061047, "dur": 0, "args": { "External id": 169798, "cbid": 251, "correlation": 169798 } }, { "ph": "f", "id": 169798, "pid": 76337, "tid": -914061504, "ts": 1716454224061047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224116369, "dur": 12, "args": { "External id": 169799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169799, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169799, "pid": 5, "tid": 7, "ts": 1716454224116369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061049, "dur": 12, "args": { "External id": 169799, "cbid": 211, "correlation": 169799 } }, { "ph": "s", "id": 169799, "pid": 76337, "tid": -914061504, "ts": 1716454224061049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224116382, "dur": 6, "args": { "External id": 169801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169801, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169801, "pid": 5, "tid": 7, "ts": 1716454224116382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061063, "dur": 6, "args": { "External id": 169801, "cbid": 211, "correlation": 169801 } }, { "ph": "s", "id": 169801, "pid": 76337, "tid": -914061504, "ts": 1716454224061063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061119, "dur": 1, "args": { "External id": 169812, "cbid": 251, "correlation": 169812 } }, { "ph": "f", "id": 169812, "pid": 76337, "tid": -914061504, "ts": 1716454224061119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061123, "dur": 0, "args": { "External id": 169813, "cbid": 251, "correlation": 169813 } }, { "ph": "f", "id": 169813, "pid": 76337, "tid": -914061504, "ts": 1716454224061123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224116390, "dur": 8, "args": { "External id": 169814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169814, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169814, "pid": 5, "tid": 7, "ts": 1716454224116390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061124, "dur": 12, "args": { "External id": 169814, "cbid": 211, "correlation": 169814 } }, { "ph": "s", "id": 169814, "pid": 76337, "tid": -914061504, "ts": 1716454224061124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224116399, "dur": 4, "args": { "External id": 169816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169816, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169816, "pid": 5, "tid": 7, "ts": 1716454224116399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061138, "dur": 5, "args": { "External id": 169816, "cbid": 211, "correlation": 169816 } }, { "ph": "s", "id": 169816, "pid": 76337, "tid": -914061504, "ts": 1716454224061138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224116404, "dur": 55, "args": { "External id": 169841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169841, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169841, "pid": 5, "tid": 7, "ts": 1716454224116404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061214, "dur": 12, "args": { "External id": 169841, "cbid": 211, "correlation": 169841 } }, { "ph": "s", "id": 169841, "pid": 76337, "tid": -914061504, "ts": 1716454224061214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061312, "dur": 1, "args": { "External id": 169859, "cbid": 251, "correlation": 169859 } }, { "ph": "f", "id": 169859, "pid": 76337, "tid": -914061504, "ts": 1716454224061312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224116460, "dur": 91, "args": { "External id": 169861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169861, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169861, "pid": 5, "tid": 7, "ts": 1716454224116460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061317, "dur": 14, "args": { "External id": 169861, "cbid": 211, "correlation": 169861 } }, { "ph": "s", "id": 169861, "pid": 76337, "tid": -914061504, "ts": 1716454224061317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224116553, "dur": 10, "args": { "External id": 169869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169869, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169869, "pid": 5, "tid": 7, "ts": 1716454224116553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061388, "dur": 12, "args": { "External id": 169869, "cbid": 211, "correlation": 169869 } }, { "ph": "s", "id": 169869, "pid": 76337, "tid": -914061504, "ts": 1716454224061388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224116564, "dur": 21, "args": { "External id": 169877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169877, "pid": 5, "tid": 7, "ts": 1716454224116564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061428, "dur": 10, "args": { "External id": 169877, "cbid": 211, "correlation": 169877 } }, { "ph": "s", "id": 169877, "pid": 76337, "tid": -914061504, "ts": 1716454224061428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224116586, "dur": 18, "args": { "External id": 169899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169899, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169899, "pid": 5, "tid": 7, "ts": 1716454224116586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061480, "dur": 11, "args": { "External id": 169899, "cbid": 211, "correlation": 169899 } }, { "ph": "s", "id": 169899, "pid": 76337, "tid": -914061504, "ts": 1716454224061480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061567, "dur": 1, "args": { "External id": 169915, "cbid": 251, "correlation": 169915 } }, { "ph": "f", "id": 169915, "pid": 76337, "tid": -914061504, "ts": 1716454224061567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061572, "dur": 0, "args": { "External id": 169917, "cbid": 251, "correlation": 169917 } }, { "ph": "f", "id": 169917, "pid": 76337, "tid": -914061504, "ts": 1716454224061572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224116605, "dur": 495, "args": { "External id": 169918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169918, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 169918, "pid": 5, "tid": 7, "ts": 1716454224116605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061574, "dur": 13, "args": { "External id": 169918, "cbid": 211, "correlation": 169918 } }, { "ph": "s", "id": 169918, "pid": 76337, "tid": -914061504, "ts": 1716454224061574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224117102, "dur": 66, "args": { "External id": 169926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169926, "pid": 5, "tid": 7, "ts": 1716454224117102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061638, "dur": 13, "args": { "External id": 169926, "cbid": 211, "correlation": 169926 } }, { "ph": "s", "id": 169926, "pid": 76337, "tid": -914061504, "ts": 1716454224061638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224117169, "dur": 68, "args": { "External id": 169934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169934, "pid": 5, "tid": 7, "ts": 1716454224117169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061669, "dur": 8, "args": { "External id": 169934, "cbid": 211, "correlation": 169934 } }, { "ph": "s", "id": 169934, "pid": 76337, "tid": -914061504, "ts": 1716454224061669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224061748, "dur": 1, "args": { "External id": 169950, "cbid": 251, "correlation": 169950 } }, { "ph": "f", "id": 169950, "pid": 76337, "tid": -914061504, "ts": 1716454224061748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224117239, "dur": 1, "args": { "External id": 169952, "device": 5, "context": 1, "stream": 7, "correlation": 169952, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 169952, "pid": 5, "tid": 7, "ts": 1716454224117239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224061753, "dur": 9, "args": { "External id": 169952, "cbid": 51, "correlation": 169952 } }, { "ph": "s", "id": 169952, "pid": 76337, "tid": -914061504, "ts": 1716454224061753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224117243, "dur": 272, "args": { "External id": 169953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169953, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169953, "pid": 5, "tid": 7, "ts": 1716454224117243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061763, "dur": 11, "args": { "External id": 169953, "cbid": 211, "correlation": 169953 } }, { "ph": "s", "id": 169953, "pid": 76337, "tid": -914061504, "ts": 1716454224061763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224117516, "dur": 14, "args": { "External id": 169961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169961, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169961, "pid": 5, "tid": 7, "ts": 1716454224117516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061805, "dur": 10, "args": { "External id": 169961, "cbid": 211, "correlation": 169961 } }, { "ph": "s", "id": 169961, "pid": 76337, "tid": -914061504, "ts": 1716454224061805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224117532, "dur": 38, "args": { "External id": 169972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169972, "pid": 5, "tid": 7, "ts": 1716454224117532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061873, "dur": 12, "args": { "External id": 169972, "cbid": 211, "correlation": 169972 } }, { "ph": "s", "id": 169972, "pid": 76337, "tid": -914061504, "ts": 1716454224061873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224061937, "dur": 0, "args": { "External id": 169984, "cbid": 317, "correlation": 169984 } }, { "ph": "f", "id": 169984, "pid": 76337, "tid": -914061504, "ts": 1716454224061937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224061937, "dur": 0, "args": { "External id": 169985, "cbid": 203, "correlation": 169985 } }, { "ph": "f", "id": 169985, "pid": 76337, "tid": -914061504, "ts": 1716454224061937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224061938, "dur": 0, "args": { "External id": 169986, "cbid": 205, "correlation": 169986 } }, { "ph": "f", "id": 169986, "pid": 76337, "tid": -914061504, "ts": 1716454224061938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224117571, "dur": 13, "args": { "External id": 169990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169990, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169990, "pid": 5, "tid": 7, "ts": 1716454224117571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061954, "dur": 12, "args": { "External id": 169990, "cbid": 211, "correlation": 169990 } }, { "ph": "s", "id": 169990, "pid": 76337, "tid": -914061504, "ts": 1716454224061954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224117585, "dur": 4, "args": { "External id": 169992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 169992, "pid": 5, "tid": 7, "ts": 1716454224117585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061971, "dur": 15, "args": { "External id": 169992, "cbid": 211, "correlation": 169992 } }, { "ph": "s", "id": 169992, "pid": 76337, "tid": -914061504, "ts": 1716454224061971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224061989, "dur": 0, "args": { "External id": 169993, "cbid": 51, "correlation": 169993 } }, { "ph": "s", "id": 169993, "pid": 76337, "tid": -914061504, "ts": 1716454224061989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224117591, "dur": 99, "args": { "External id": 169994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169994, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 169994, "pid": 5, "tid": 7, "ts": 1716454224117591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224061990, "dur": 6, "args": { "External id": 169994, "cbid": 211, "correlation": 169994 } }, { "ph": "s", "id": 169994, "pid": 76337, "tid": -914061504, "ts": 1716454224061990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224117691, "dur": 16, "args": { "External id": 169999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 169999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 169999, "pid": 5, "tid": 7, "ts": 1716454224117691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062018, "dur": 9, "args": { "External id": 169999, "cbid": 211, "correlation": 169999 } }, { "ph": "s", "id": 169999, "pid": 76337, "tid": -914061504, "ts": 1716454224062018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224117709, "dur": 14, "args": { "External id": 170007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170007, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170007, "pid": 5, "tid": 7, "ts": 1716454224117709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062051, "dur": 8, "args": { "External id": 170007, "cbid": 211, "correlation": 170007 } }, { "ph": "s", "id": 170007, "pid": 76337, "tid": -914061504, "ts": 1716454224062051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224062122, "dur": 0, "args": { "External id": 170017, "cbid": 317, "correlation": 170017 } }, { "ph": "f", "id": 170017, "pid": 76337, "tid": -914061504, "ts": 1716454224062122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224062123, "dur": 0, "args": { "External id": 170018, "cbid": 203, "correlation": 170018 } }, { "ph": "f", "id": 170018, "pid": 76337, "tid": -914061504, "ts": 1716454224062123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224062124, "dur": 0, "args": { "External id": 170019, "cbid": 205, "correlation": 170019 } }, { "ph": "f", "id": 170019, "pid": 76337, "tid": -914061504, "ts": 1716454224062124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224117724, "dur": 13, "args": { "External id": 170023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170023, "pid": 5, "tid": 7, "ts": 1716454224117724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062138, "dur": 13, "args": { "External id": 170023, "cbid": 211, "correlation": 170023 } }, { "ph": "s", "id": 170023, "pid": 76337, "tid": -914061504, "ts": 1716454224062138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224117738, "dur": 164, "args": { "External id": 170025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170025, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170025, "pid": 5, "tid": 7, "ts": 1716454224117738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062153, "dur": 5, "args": { "External id": 170025, "cbid": 211, "correlation": 170025 } }, { "ph": "s", "id": 170025, "pid": 76337, "tid": -914061504, "ts": 1716454224062153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224117904, "dur": 1, "args": { "External id": 170027, "device": 5, "context": 1, "stream": 7, "correlation": 170027, "bytes": 960, "memory bandwidth (GB/s)": 0.5765765765765766 } }, { "ph": "f", "id": 170027, "pid": 5, "tid": 7, "ts": 1716454224117904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224062165, "dur": 6, "args": { "External id": 170027, "cbid": 51, "correlation": 170027 } }, { "ph": "s", "id": 170027, "pid": 76337, "tid": -914061504, "ts": 1716454224062165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224117908, "dur": 199, "args": { "External id": 170028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170028, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 170028, "pid": 5, "tid": 7, "ts": 1716454224117908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062172, "dur": 8, "args": { "External id": 170028, "cbid": 211, "correlation": 170028 } }, { "ph": "s", "id": 170028, "pid": 76337, "tid": -914061504, "ts": 1716454224062172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118109, "dur": 6, "args": { "External id": 170030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170030, "pid": 5, "tid": 7, "ts": 1716454224118109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062185, "dur": 5, "args": { "External id": 170030, "cbid": 211, "correlation": 170030 } }, { "ph": "s", "id": 170030, "pid": 76337, "tid": -914061504, "ts": 1716454224062185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224118117, "dur": 6, "args": { "External id": 170036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170036, "pid": 5, "tid": 7, "ts": 1716454224118117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062213, "dur": 9, "args": { "External id": 170036, "cbid": 211, "correlation": 170036 } }, { "ph": "s", "id": 170036, "pid": 76337, "tid": -914061504, "ts": 1716454224062213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224118124, "dur": 11, "args": { "External id": 170056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170056, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170056, "pid": 5, "tid": 7, "ts": 1716454224118124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062309, "dur": 12, "args": { "External id": 170056, "cbid": 211, "correlation": 170056 } }, { "ph": "s", "id": 170056, "pid": 76337, "tid": -914061504, "ts": 1716454224062309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224118136, "dur": 4, "args": { "External id": 170068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170068, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170068, "pid": 5, "tid": 7, "ts": 1716454224118136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062331, "dur": 6, "args": { "External id": 170068, "cbid": 211, "correlation": 170068 } }, { "ph": "s", "id": 170068, "pid": 76337, "tid": -914061504, "ts": 1716454224062331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224118142, "dur": 9, "args": { "External id": 170071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170071, "pid": 5, "tid": 7, "ts": 1716454224118142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062351, "dur": 6, "args": { "External id": 170071, "cbid": 211, "correlation": 170071 } }, { "ph": "s", "id": 170071, "pid": 76337, "tid": -914061504, "ts": 1716454224062351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224118152, "dur": 5, "args": { "External id": 170080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170080, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170080, "pid": 5, "tid": 7, "ts": 1716454224118152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062390, "dur": 10, "args": { "External id": 170080, "cbid": 211, "correlation": 170080 } }, { "ph": "s", "id": 170080, "pid": 76337, "tid": -914061504, "ts": 1716454224062390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224062441, "dur": 0, "args": { "External id": 170090, "cbid": 317, "correlation": 170090 } }, { "ph": "f", "id": 170090, "pid": 76337, "tid": -914061504, "ts": 1716454224062441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224062442, "dur": 0, "args": { "External id": 170091, "cbid": 203, "correlation": 170091 } }, { "ph": "f", "id": 170091, "pid": 76337, "tid": -914061504, "ts": 1716454224062442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224062442, "dur": 0, "args": { "External id": 170092, "cbid": 205, "correlation": 170092 } }, { "ph": "f", "id": 170092, "pid": 76337, "tid": -914061504, "ts": 1716454224062442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118159, "dur": 5, "args": { "External id": 170096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170096, "pid": 5, "tid": 7, "ts": 1716454224118159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062457, "dur": 11, "args": { "External id": 170096, "cbid": 211, "correlation": 170096 } }, { "ph": "s", "id": 170096, "pid": 76337, "tid": -914061504, "ts": 1716454224062457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118165, "dur": 163, "args": { "External id": 170098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170098, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170098, "pid": 5, "tid": 7, "ts": 1716454224118165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062471, "dur": 5, "args": { "External id": 170098, "cbid": 211, "correlation": 170098 } }, { "ph": "s", "id": 170098, "pid": 76337, "tid": -914061504, "ts": 1716454224062471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224118330, "dur": 1, "args": { "External id": 170100, "device": 5, "context": 1, "stream": 7, "correlation": 170100, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 170100, "pid": 5, "tid": 7, "ts": 1716454224118330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224062483, "dur": 7, "args": { "External id": 170100, "cbid": 51, "correlation": 170100 } }, { "ph": "s", "id": 170100, "pid": 76337, "tid": -914061504, "ts": 1716454224062483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224118334, "dur": 272, "args": { "External id": 170101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170101, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170101, "pid": 5, "tid": 7, "ts": 1716454224118334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062491, "dur": 7, "args": { "External id": 170101, "cbid": 211, "correlation": 170101 } }, { "ph": "s", "id": 170101, "pid": 76337, "tid": -914061504, "ts": 1716454224062491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118607, "dur": 6, "args": { "External id": 170103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170103, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170103, "pid": 5, "tid": 7, "ts": 1716454224118607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062502, "dur": 5, "args": { "External id": 170103, "cbid": 211, "correlation": 170103 } }, { "ph": "s", "id": 170103, "pid": 76337, "tid": -914061504, "ts": 1716454224062502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224118614, "dur": 6, "args": { "External id": 170109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170109, "pid": 5, "tid": 7, "ts": 1716454224118614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062529, "dur": 8, "args": { "External id": 170109, "cbid": 211, "correlation": 170109 } }, { "ph": "s", "id": 170109, "pid": 76337, "tid": -914061504, "ts": 1716454224062529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224118622, "dur": 4, "args": { "External id": 170117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170117, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 170117, "pid": 5, "tid": 7, "ts": 1716454224118622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062573, "dur": 9, "args": { "External id": 170117, "cbid": 211, "correlation": 170117 } }, { "ph": "s", "id": 170117, "pid": 76337, "tid": -914061504, "ts": 1716454224062573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224062636, "dur": 1, "args": { "External id": 170133, "cbid": 251, "correlation": 170133 } }, { "ph": "f", "id": 170133, "pid": 76337, "tid": -914061504, "ts": 1716454224062636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224062641, "dur": 0, "args": { "External id": 170135, "cbid": 251, "correlation": 170135 } }, { "ph": "f", "id": 170135, "pid": 76337, "tid": -914061504, "ts": 1716454224062641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224118627, "dur": 13, "args": { "External id": 170136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170136, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170136, "pid": 5, "tid": 7, "ts": 1716454224118627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062644, "dur": 12, "args": { "External id": 170136, "cbid": 211, "correlation": 170136 } }, { "ph": "s", "id": 170136, "pid": 76337, "tid": -914061504, "ts": 1716454224062644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224118642, "dur": 5, "args": { "External id": 170138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170138, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170138, "pid": 5, "tid": 7, "ts": 1716454224118642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062658, "dur": 5, "args": { "External id": 170138, "cbid": 211, "correlation": 170138 } }, { "ph": "s", "id": 170138, "pid": 76337, "tid": -914061504, "ts": 1716454224062658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224118649, "dur": 6, "args": { "External id": 170148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170148, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170148, "pid": 5, "tid": 7, "ts": 1716454224118649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062715, "dur": 12, "args": { "External id": 170148, "cbid": 211, "correlation": 170148 } }, { "ph": "s", "id": 170148, "pid": 76337, "tid": -914061504, "ts": 1716454224062715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224118656, "dur": 10, "args": { "External id": 170168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170168, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170168, "pid": 5, "tid": 7, "ts": 1716454224118656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062781, "dur": 11, "args": { "External id": 170168, "cbid": 211, "correlation": 170168 } }, { "ph": "s", "id": 170168, "pid": 76337, "tid": -914061504, "ts": 1716454224062781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224118667, "dur": 4, "args": { "External id": 170180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170180, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170180, "pid": 5, "tid": 7, "ts": 1716454224118667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062803, "dur": 6, "args": { "External id": 170180, "cbid": 211, "correlation": 170180 } }, { "ph": "s", "id": 170180, "pid": 76337, "tid": -914061504, "ts": 1716454224062803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224118672, "dur": 7, "args": { "External id": 170183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170183, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170183, "pid": 5, "tid": 7, "ts": 1716454224118672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062822, "dur": 6, "args": { "External id": 170183, "cbid": 211, "correlation": 170183 } }, { "ph": "s", "id": 170183, "pid": 76337, "tid": -914061504, "ts": 1716454224062822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224118680, "dur": 5, "args": { "External id": 170192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170192, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170192, "pid": 5, "tid": 7, "ts": 1716454224118680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062862, "dur": 9, "args": { "External id": 170192, "cbid": 211, "correlation": 170192 } }, { "ph": "s", "id": 170192, "pid": 76337, "tid": -914061504, "ts": 1716454224062862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224062925, "dur": 0, "args": { "External id": 170202, "cbid": 317, "correlation": 170202 } }, { "ph": "f", "id": 170202, "pid": 76337, "tid": -914061504, "ts": 1716454224062925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224062926, "dur": 0, "args": { "External id": 170203, "cbid": 203, "correlation": 170203 } }, { "ph": "f", "id": 170203, "pid": 76337, "tid": -914061504, "ts": 1716454224062926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224062926, "dur": 0, "args": { "External id": 170204, "cbid": 205, "correlation": 170204 } }, { "ph": "f", "id": 170204, "pid": 76337, "tid": -914061504, "ts": 1716454224062926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118686, "dur": 5, "args": { "External id": 170208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170208, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170208, "pid": 5, "tid": 7, "ts": 1716454224118686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062941, "dur": 13, "args": { "External id": 170208, "cbid": 211, "correlation": 170208 } }, { "ph": "s", "id": 170208, "pid": 76337, "tid": -914061504, "ts": 1716454224062941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224118693, "dur": 164, "args": { "External id": 170210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170210, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170210, "pid": 5, "tid": 7, "ts": 1716454224118693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062956, "dur": 5, "args": { "External id": 170210, "cbid": 211, "correlation": 170210 } }, { "ph": "s", "id": 170210, "pid": 76337, "tid": -914061504, "ts": 1716454224062956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224118858, "dur": 1, "args": { "External id": 170212, "device": 5, "context": 1, "stream": 7, "correlation": 170212, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 170212, "pid": 5, "tid": 7, "ts": 1716454224118858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224062967, "dur": 15, "args": { "External id": 170212, "cbid": 51, "correlation": 170212 } }, { "ph": "s", "id": 170212, "pid": 76337, "tid": -914061504, "ts": 1716454224062967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224118862, "dur": 261, "args": { "External id": 170213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170213, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170213, "pid": 5, "tid": 7, "ts": 1716454224118862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062983, "dur": 7, "args": { "External id": 170213, "cbid": 211, "correlation": 170213 } }, { "ph": "s", "id": 170213, "pid": 76337, "tid": -914061504, "ts": 1716454224062983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119125, "dur": 6, "args": { "External id": 170215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170215, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170215, "pid": 5, "tid": 7, "ts": 1716454224119125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224062993, "dur": 5, "args": { "External id": 170215, "cbid": 211, "correlation": 170215 } }, { "ph": "s", "id": 170215, "pid": 76337, "tid": -914061504, "ts": 1716454224062993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224119132, "dur": 6, "args": { "External id": 170221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170221, "pid": 5, "tid": 7, "ts": 1716454224119132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063023, "dur": 8, "args": { "External id": 170221, "cbid": 211, "correlation": 170221 } }, { "ph": "s", "id": 170221, "pid": 76337, "tid": -914061504, "ts": 1716454224063023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224119139, "dur": 5, "args": { "External id": 170229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170229, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170229, "pid": 5, "tid": 7, "ts": 1716454224119139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063057, "dur": 9, "args": { "External id": 170229, "cbid": 211, "correlation": 170229 } }, { "ph": "s", "id": 170229, "pid": 76337, "tid": -914061504, "ts": 1716454224063057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224119146, "dur": 4, "args": { "External id": 170237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170237, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170237, "pid": 5, "tid": 7, "ts": 1716454224119146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063086, "dur": 8, "args": { "External id": 170237, "cbid": 211, "correlation": 170237 } }, { "ph": "s", "id": 170237, "pid": 76337, "tid": -914061504, "ts": 1716454224063086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224119152, "dur": 10, "args": { "External id": 170257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170257, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170257, "pid": 5, "tid": 7, "ts": 1716454224119152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063161, "dur": 12, "args": { "External id": 170257, "cbid": 211, "correlation": 170257 } }, { "ph": "s", "id": 170257, "pid": 76337, "tid": -914061504, "ts": 1716454224063161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224119163, "dur": 4, "args": { "External id": 170269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170269, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170269, "pid": 5, "tid": 7, "ts": 1716454224119163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063182, "dur": 6, "args": { "External id": 170269, "cbid": 211, "correlation": 170269 } }, { "ph": "s", "id": 170269, "pid": 76337, "tid": -914061504, "ts": 1716454224063182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224119168, "dur": 7, "args": { "External id": 170272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170272, "pid": 5, "tid": 7, "ts": 1716454224119168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063200, "dur": 8, "args": { "External id": 170272, "cbid": 211, "correlation": 170272 } }, { "ph": "s", "id": 170272, "pid": 76337, "tid": -914061504, "ts": 1716454224063200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224119176, "dur": 5, "args": { "External id": 170281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170281, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170281, "pid": 5, "tid": 7, "ts": 1716454224119176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063239, "dur": 9, "args": { "External id": 170281, "cbid": 211, "correlation": 170281 } }, { "ph": "s", "id": 170281, "pid": 76337, "tid": -914061504, "ts": 1716454224063239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224063290, "dur": 0, "args": { "External id": 170291, "cbid": 317, "correlation": 170291 } }, { "ph": "f", "id": 170291, "pid": 76337, "tid": -914061504, "ts": 1716454224063290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224063291, "dur": 0, "args": { "External id": 170292, "cbid": 203, "correlation": 170292 } }, { "ph": "f", "id": 170292, "pid": 76337, "tid": -914061504, "ts": 1716454224063291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224063292, "dur": 0, "args": { "External id": 170293, "cbid": 205, "correlation": 170293 } }, { "ph": "f", "id": 170293, "pid": 76337, "tid": -914061504, "ts": 1716454224063292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119182, "dur": 5, "args": { "External id": 170297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170297, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170297, "pid": 5, "tid": 7, "ts": 1716454224119182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063305, "dur": 12, "args": { "External id": 170297, "cbid": 211, "correlation": 170297 } }, { "ph": "s", "id": 170297, "pid": 76337, "tid": -914061504, "ts": 1716454224063305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119188, "dur": 163, "args": { "External id": 170299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170299, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170299, "pid": 5, "tid": 7, "ts": 1716454224119188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063319, "dur": 5, "args": { "External id": 170299, "cbid": 211, "correlation": 170299 } }, { "ph": "s", "id": 170299, "pid": 76337, "tid": -914061504, "ts": 1716454224063319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224119353, "dur": 1, "args": { "External id": 170301, "device": 5, "context": 1, "stream": 7, "correlation": 170301, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 170301, "pid": 5, "tid": 7, "ts": 1716454224119353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224063330, "dur": 6, "args": { "External id": 170301, "cbid": 51, "correlation": 170301 } }, { "ph": "s", "id": 170301, "pid": 76337, "tid": -914061504, "ts": 1716454224063330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224119356, "dur": 260, "args": { "External id": 170302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170302, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170302, "pid": 5, "tid": 7, "ts": 1716454224119356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063337, "dur": 6, "args": { "External id": 170302, "cbid": 211, "correlation": 170302 } }, { "ph": "s", "id": 170302, "pid": 76337, "tid": -914061504, "ts": 1716454224063337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119618, "dur": 6, "args": { "External id": 170304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170304, "pid": 5, "tid": 7, "ts": 1716454224119618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063346, "dur": 5, "args": { "External id": 170304, "cbid": 211, "correlation": 170304 } }, { "ph": "s", "id": 170304, "pid": 76337, "tid": -914061504, "ts": 1716454224063346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224119625, "dur": 6, "args": { "External id": 170310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170310, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170310, "pid": 5, "tid": 7, "ts": 1716454224119625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063375, "dur": 9, "args": { "External id": 170310, "cbid": 211, "correlation": 170310 } }, { "ph": "s", "id": 170310, "pid": 76337, "tid": -914061504, "ts": 1716454224063375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224119633, "dur": 4, "args": { "External id": 170318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170318, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 170318, "pid": 5, "tid": 7, "ts": 1716454224119633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063419, "dur": 9, "args": { "External id": 170318, "cbid": 211, "correlation": 170318 } }, { "ph": "s", "id": 170318, "pid": 76337, "tid": -914061504, "ts": 1716454224063419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224063482, "dur": 1, "args": { "External id": 170334, "cbid": 251, "correlation": 170334 } }, { "ph": "f", "id": 170334, "pid": 76337, "tid": -914061504, "ts": 1716454224063482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224063487, "dur": 0, "args": { "External id": 170336, "cbid": 251, "correlation": 170336 } }, { "ph": "f", "id": 170336, "pid": 76337, "tid": -914061504, "ts": 1716454224063487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224119638, "dur": 11, "args": { "External id": 170337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170337, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170337, "pid": 5, "tid": 7, "ts": 1716454224119638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063489, "dur": 11, "args": { "External id": 170337, "cbid": 211, "correlation": 170337 } }, { "ph": "s", "id": 170337, "pid": 76337, "tid": -914061504, "ts": 1716454224063489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224119650, "dur": 4, "args": { "External id": 170339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170339, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170339, "pid": 5, "tid": 7, "ts": 1716454224119650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063501, "dur": 5, "args": { "External id": 170339, "cbid": 211, "correlation": 170339 } }, { "ph": "s", "id": 170339, "pid": 76337, "tid": -914061504, "ts": 1716454224063501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224119654, "dur": 6, "args": { "External id": 170349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170349, "pid": 5, "tid": 7, "ts": 1716454224119654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063557, "dur": 13, "args": { "External id": 170349, "cbid": 211, "correlation": 170349 } }, { "ph": "s", "id": 170349, "pid": 76337, "tid": -914061504, "ts": 1716454224063557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224119662, "dur": 10, "args": { "External id": 170369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170369, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170369, "pid": 5, "tid": 7, "ts": 1716454224119662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063624, "dur": 11, "args": { "External id": 170369, "cbid": 211, "correlation": 170369 } }, { "ph": "s", "id": 170369, "pid": 76337, "tid": -914061504, "ts": 1716454224063624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224119673, "dur": 4, "args": { "External id": 170381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170381, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170381, "pid": 5, "tid": 7, "ts": 1716454224119673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063645, "dur": 6, "args": { "External id": 170381, "cbid": 211, "correlation": 170381 } }, { "ph": "s", "id": 170381, "pid": 76337, "tid": -914061504, "ts": 1716454224063645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224119678, "dur": 7, "args": { "External id": 170384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170384, "pid": 5, "tid": 7, "ts": 1716454224119678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063665, "dur": 6, "args": { "External id": 170384, "cbid": 211, "correlation": 170384 } }, { "ph": "s", "id": 170384, "pid": 76337, "tid": -914061504, "ts": 1716454224063665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224119686, "dur": 5, "args": { "External id": 170393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170393, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170393, "pid": 5, "tid": 7, "ts": 1716454224119686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063705, "dur": 10, "args": { "External id": 170393, "cbid": 211, "correlation": 170393 } }, { "ph": "s", "id": 170393, "pid": 76337, "tid": -914061504, "ts": 1716454224063705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224063767, "dur": 0, "args": { "External id": 170403, "cbid": 317, "correlation": 170403 } }, { "ph": "f", "id": 170403, "pid": 76337, "tid": -914061504, "ts": 1716454224063767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224063768, "dur": 0, "args": { "External id": 170404, "cbid": 203, "correlation": 170404 } }, { "ph": "f", "id": 170404, "pid": 76337, "tid": -914061504, "ts": 1716454224063768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224063769, "dur": 0, "args": { "External id": 170405, "cbid": 205, "correlation": 170405 } }, { "ph": "f", "id": 170405, "pid": 76337, "tid": -914061504, "ts": 1716454224063769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119692, "dur": 5, "args": { "External id": 170409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170409, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170409, "pid": 5, "tid": 7, "ts": 1716454224119692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063782, "dur": 12, "args": { "External id": 170409, "cbid": 211, "correlation": 170409 } }, { "ph": "s", "id": 170409, "pid": 76337, "tid": -914061504, "ts": 1716454224063782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224119699, "dur": 164, "args": { "External id": 170411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170411, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170411, "pid": 5, "tid": 7, "ts": 1716454224119699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063796, "dur": 5, "args": { "External id": 170411, "cbid": 211, "correlation": 170411 } }, { "ph": "s", "id": 170411, "pid": 76337, "tid": -914061504, "ts": 1716454224063796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224119865, "dur": 1, "args": { "External id": 170413, "device": 5, "context": 1, "stream": 7, "correlation": 170413, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 170413, "pid": 5, "tid": 7, "ts": 1716454224119865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224063807, "dur": 6, "args": { "External id": 170413, "cbid": 51, "correlation": 170413 } }, { "ph": "s", "id": 170413, "pid": 76337, "tid": -914061504, "ts": 1716454224063807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224119869, "dur": 261, "args": { "External id": 170414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170414, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170414, "pid": 5, "tid": 7, "ts": 1716454224119869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063814, "dur": 7, "args": { "External id": 170414, "cbid": 211, "correlation": 170414 } }, { "ph": "s", "id": 170414, "pid": 76337, "tid": -914061504, "ts": 1716454224063814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120131, "dur": 6, "args": { "External id": 170416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170416, "pid": 5, "tid": 7, "ts": 1716454224120131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063827, "dur": 5, "args": { "External id": 170416, "cbid": 211, "correlation": 170416 } }, { "ph": "s", "id": 170416, "pid": 76337, "tid": -914061504, "ts": 1716454224063827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224120139, "dur": 6, "args": { "External id": 170422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170422, "pid": 5, "tid": 7, "ts": 1716454224120139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063854, "dur": 8, "args": { "External id": 170422, "cbid": 211, "correlation": 170422 } }, { "ph": "s", "id": 170422, "pid": 76337, "tid": -914061504, "ts": 1716454224063854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224120146, "dur": 5, "args": { "External id": 170430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170430, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170430, "pid": 5, "tid": 7, "ts": 1716454224120146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063886, "dur": 8, "args": { "External id": 170430, "cbid": 211, "correlation": 170430 } }, { "ph": "s", "id": 170430, "pid": 76337, "tid": -914061504, "ts": 1716454224063886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224120153, "dur": 4, "args": { "External id": 170438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170438, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170438, "pid": 5, "tid": 7, "ts": 1716454224120153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224063915, "dur": 9, "args": { "External id": 170438, "cbid": 211, "correlation": 170438 } }, { "ph": "s", "id": 170438, "pid": 76337, "tid": -914061504, "ts": 1716454224063915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224120158, "dur": 10, "args": { "External id": 170458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170458, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170458, "pid": 5, "tid": 7, "ts": 1716454224120158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064029, "dur": 13, "args": { "External id": 170458, "cbid": 211, "correlation": 170458 } }, { "ph": "s", "id": 170458, "pid": 76337, "tid": -914061504, "ts": 1716454224064029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224120170, "dur": 4, "args": { "External id": 170470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170470, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170470, "pid": 5, "tid": 7, "ts": 1716454224120170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064052, "dur": 6, "args": { "External id": 170470, "cbid": 211, "correlation": 170470 } }, { "ph": "s", "id": 170470, "pid": 76337, "tid": -914061504, "ts": 1716454224064052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224120174, "dur": 7, "args": { "External id": 170473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170473, "pid": 5, "tid": 7, "ts": 1716454224120174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064071, "dur": 6, "args": { "External id": 170473, "cbid": 211, "correlation": 170473 } }, { "ph": "s", "id": 170473, "pid": 76337, "tid": -914061504, "ts": 1716454224064071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224120182, "dur": 5, "args": { "External id": 170482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170482, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170482, "pid": 5, "tid": 7, "ts": 1716454224120182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064108, "dur": 10, "args": { "External id": 170482, "cbid": 211, "correlation": 170482 } }, { "ph": "s", "id": 170482, "pid": 76337, "tid": -914061504, "ts": 1716454224064108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224064160, "dur": 0, "args": { "External id": 170492, "cbid": 317, "correlation": 170492 } }, { "ph": "f", "id": 170492, "pid": 76337, "tid": -914061504, "ts": 1716454224064160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224064161, "dur": 0, "args": { "External id": 170493, "cbid": 203, "correlation": 170493 } }, { "ph": "f", "id": 170493, "pid": 76337, "tid": -914061504, "ts": 1716454224064161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224064162, "dur": 0, "args": { "External id": 170494, "cbid": 205, "correlation": 170494 } }, { "ph": "f", "id": 170494, "pid": 76337, "tid": -914061504, "ts": 1716454224064162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120188, "dur": 5, "args": { "External id": 170498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170498, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170498, "pid": 5, "tid": 7, "ts": 1716454224120188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064176, "dur": 11, "args": { "External id": 170498, "cbid": 211, "correlation": 170498 } }, { "ph": "s", "id": 170498, "pid": 76337, "tid": -914061504, "ts": 1716454224064176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120195, "dur": 164, "args": { "External id": 170500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170500, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170500, "pid": 5, "tid": 7, "ts": 1716454224120195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064190, "dur": 5, "args": { "External id": 170500, "cbid": 211, "correlation": 170500 } }, { "ph": "s", "id": 170500, "pid": 76337, "tid": -914061504, "ts": 1716454224064190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224120360, "dur": 1, "args": { "External id": 170502, "device": 5, "context": 1, "stream": 7, "correlation": 170502, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 170502, "pid": 5, "tid": 7, "ts": 1716454224120360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224064200, "dur": 6, "args": { "External id": 170502, "cbid": 51, "correlation": 170502 } }, { "ph": "s", "id": 170502, "pid": 76337, "tid": -914061504, "ts": 1716454224064200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224120364, "dur": 260, "args": { "External id": 170503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170503, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170503, "pid": 5, "tid": 7, "ts": 1716454224120364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064207, "dur": 6, "args": { "External id": 170503, "cbid": 211, "correlation": 170503 } }, { "ph": "s", "id": 170503, "pid": 76337, "tid": -914061504, "ts": 1716454224064207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120626, "dur": 6, "args": { "External id": 170505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170505, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170505, "pid": 5, "tid": 7, "ts": 1716454224120626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064217, "dur": 6, "args": { "External id": 170505, "cbid": 211, "correlation": 170505 } }, { "ph": "s", "id": 170505, "pid": 76337, "tid": -914061504, "ts": 1716454224064217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224120633, "dur": 6, "args": { "External id": 170511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170511, "pid": 5, "tid": 7, "ts": 1716454224120633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064246, "dur": 8, "args": { "External id": 170511, "cbid": 211, "correlation": 170511 } }, { "ph": "s", "id": 170511, "pid": 76337, "tid": -914061504, "ts": 1716454224064246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224120640, "dur": 4, "args": { "External id": 170519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170519, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 170519, "pid": 5, "tid": 7, "ts": 1716454224120640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064291, "dur": 9, "args": { "External id": 170519, "cbid": 211, "correlation": 170519 } }, { "ph": "s", "id": 170519, "pid": 76337, "tid": -914061504, "ts": 1716454224064291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224064354, "dur": 1, "args": { "External id": 170535, "cbid": 251, "correlation": 170535 } }, { "ph": "f", "id": 170535, "pid": 76337, "tid": -914061504, "ts": 1716454224064354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224064359, "dur": 0, "args": { "External id": 170537, "cbid": 251, "correlation": 170537 } }, { "ph": "f", "id": 170537, "pid": 76337, "tid": -914061504, "ts": 1716454224064359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224120645, "dur": 10, "args": { "External id": 170538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170538, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170538, "pid": 5, "tid": 7, "ts": 1716454224120645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064361, "dur": 12, "args": { "External id": 170538, "cbid": 211, "correlation": 170538 } }, { "ph": "s", "id": 170538, "pid": 76337, "tid": -914061504, "ts": 1716454224064361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224120657, "dur": 4, "args": { "External id": 170540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170540, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170540, "pid": 5, "tid": 7, "ts": 1716454224120657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064375, "dur": 5, "args": { "External id": 170540, "cbid": 211, "correlation": 170540 } }, { "ph": "s", "id": 170540, "pid": 76337, "tid": -914061504, "ts": 1716454224064375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224120662, "dur": 6, "args": { "External id": 170550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170550, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170550, "pid": 5, "tid": 7, "ts": 1716454224120662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064431, "dur": 12, "args": { "External id": 170550, "cbid": 211, "correlation": 170550 } }, { "ph": "s", "id": 170550, "pid": 76337, "tid": -914061504, "ts": 1716454224064431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224120669, "dur": 10, "args": { "External id": 170570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170570, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170570, "pid": 5, "tid": 7, "ts": 1716454224120669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064497, "dur": 10, "args": { "External id": 170570, "cbid": 211, "correlation": 170570 } }, { "ph": "s", "id": 170570, "pid": 76337, "tid": -914061504, "ts": 1716454224064497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224120680, "dur": 4, "args": { "External id": 170582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170582, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170582, "pid": 5, "tid": 7, "ts": 1716454224120680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064517, "dur": 6, "args": { "External id": 170582, "cbid": 211, "correlation": 170582 } }, { "ph": "s", "id": 170582, "pid": 76337, "tid": -914061504, "ts": 1716454224064517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224120685, "dur": 7, "args": { "External id": 170585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170585, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170585, "pid": 5, "tid": 7, "ts": 1716454224120685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064537, "dur": 6, "args": { "External id": 170585, "cbid": 211, "correlation": 170585 } }, { "ph": "s", "id": 170585, "pid": 76337, "tid": -914061504, "ts": 1716454224064537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224120693, "dur": 5, "args": { "External id": 170594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170594, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170594, "pid": 5, "tid": 7, "ts": 1716454224120693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064577, "dur": 10, "args": { "External id": 170594, "cbid": 211, "correlation": 170594 } }, { "ph": "s", "id": 170594, "pid": 76337, "tid": -914061504, "ts": 1716454224064577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224064639, "dur": 0, "args": { "External id": 170604, "cbid": 317, "correlation": 170604 } }, { "ph": "f", "id": 170604, "pid": 76337, "tid": -914061504, "ts": 1716454224064639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224064640, "dur": 0, "args": { "External id": 170605, "cbid": 203, "correlation": 170605 } }, { "ph": "f", "id": 170605, "pid": 76337, "tid": -914061504, "ts": 1716454224064640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224064641, "dur": 0, "args": { "External id": 170606, "cbid": 205, "correlation": 170606 } }, { "ph": "f", "id": 170606, "pid": 76337, "tid": -914061504, "ts": 1716454224064641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120699, "dur": 5, "args": { "External id": 170610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170610, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170610, "pid": 5, "tid": 7, "ts": 1716454224120699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064654, "dur": 12, "args": { "External id": 170610, "cbid": 211, "correlation": 170610 } }, { "ph": "s", "id": 170610, "pid": 76337, "tid": -914061504, "ts": 1716454224064654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224120706, "dur": 163, "args": { "External id": 170612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170612, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170612, "pid": 5, "tid": 7, "ts": 1716454224120706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064669, "dur": 6, "args": { "External id": 170612, "cbid": 211, "correlation": 170612 } }, { "ph": "s", "id": 170612, "pid": 76337, "tid": -914061504, "ts": 1716454224064669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224120870, "dur": 1, "args": { "External id": 170614, "device": 5, "context": 1, "stream": 7, "correlation": 170614, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 170614, "pid": 5, "tid": 7, "ts": 1716454224120870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224064680, "dur": 6, "args": { "External id": 170614, "cbid": 51, "correlation": 170614 } }, { "ph": "s", "id": 170614, "pid": 76337, "tid": -914061504, "ts": 1716454224064680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224120874, "dur": 261, "args": { "External id": 170615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170615, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170615, "pid": 5, "tid": 7, "ts": 1716454224120874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064687, "dur": 6, "args": { "External id": 170615, "cbid": 211, "correlation": 170615 } }, { "ph": "s", "id": 170615, "pid": 76337, "tid": -914061504, "ts": 1716454224064687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224121136, "dur": 6, "args": { "External id": 170617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170617, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170617, "pid": 5, "tid": 7, "ts": 1716454224121136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064697, "dur": 5, "args": { "External id": 170617, "cbid": 211, "correlation": 170617 } }, { "ph": "s", "id": 170617, "pid": 76337, "tid": -914061504, "ts": 1716454224064697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121143, "dur": 6, "args": { "External id": 170623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170623, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170623, "pid": 5, "tid": 7, "ts": 1716454224121143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064725, "dur": 9, "args": { "External id": 170623, "cbid": 211, "correlation": 170623 } }, { "ph": "s", "id": 170623, "pid": 76337, "tid": -914061504, "ts": 1716454224064725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224121151, "dur": 5, "args": { "External id": 170631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170631, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170631, "pid": 5, "tid": 7, "ts": 1716454224121151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064757, "dur": 8, "args": { "External id": 170631, "cbid": 211, "correlation": 170631 } }, { "ph": "s", "id": 170631, "pid": 76337, "tid": -914061504, "ts": 1716454224064757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224121157, "dur": 4, "args": { "External id": 170639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170639, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170639, "pid": 5, "tid": 7, "ts": 1716454224121157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064786, "dur": 8, "args": { "External id": 170639, "cbid": 211, "correlation": 170639 } }, { "ph": "s", "id": 170639, "pid": 76337, "tid": -914061504, "ts": 1716454224064786, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224121163, "dur": 10, "args": { "External id": 170659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170659, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 170659, "pid": 5, "tid": 7, "ts": 1716454224121163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064933, "dur": 13, "args": { "External id": 170659, "cbid": 211, "correlation": 170659 } }, { "ph": "s", "id": 170659, "pid": 76337, "tid": -914061504, "ts": 1716454224064933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224121174, "dur": 4, "args": { "External id": 170671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170671, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 170671, "pid": 5, "tid": 7, "ts": 1716454224121174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064956, "dur": 6, "args": { "External id": 170671, "cbid": 211, "correlation": 170671 } }, { "ph": "s", "id": 170671, "pid": 76337, "tid": -914061504, "ts": 1716454224064956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121179, "dur": 7, "args": { "External id": 170674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170674, "pid": 5, "tid": 7, "ts": 1716454224121179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224064982, "dur": 8, "args": { "External id": 170674, "cbid": 211, "correlation": 170674 } }, { "ph": "s", "id": 170674, "pid": 76337, "tid": -914061504, "ts": 1716454224064982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224065043, "dur": 0, "args": { "External id": 170685, "cbid": 317, "correlation": 170685 } }, { "ph": "f", "id": 170685, "pid": 76337, "tid": -914061504, "ts": 1716454224065043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224065043, "dur": 0, "args": { "External id": 170686, "cbid": 203, "correlation": 170686 } }, { "ph": "f", "id": 170686, "pid": 76337, "tid": -914061504, "ts": 1716454224065043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224065044, "dur": 0, "args": { "External id": 170687, "cbid": 205, "correlation": 170687 } }, { "ph": "f", "id": 170687, "pid": 76337, "tid": -914061504, "ts": 1716454224065044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224121187, "dur": 5, "args": { "External id": 170691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170691, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170691, "pid": 5, "tid": 7, "ts": 1716454224121187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065060, "dur": 13, "args": { "External id": 170691, "cbid": 211, "correlation": 170691 } }, { "ph": "s", "id": 170691, "pid": 76337, "tid": -914061504, "ts": 1716454224065060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224121193, "dur": 38, "args": { "External id": 170693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170693, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 170693, "pid": 5, "tid": 7, "ts": 1716454224121193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065080, "dur": 8, "args": { "External id": 170693, "cbid": 211, "correlation": 170693 } }, { "ph": "s", "id": 170693, "pid": 76337, "tid": -914061504, "ts": 1716454224065080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224121232, "dur": 5, "args": { "External id": 170695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170695, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170695, "pid": 5, "tid": 7, "ts": 1716454224121232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065092, "dur": 5, "args": { "External id": 170695, "cbid": 211, "correlation": 170695 } }, { "ph": "s", "id": 170695, "pid": 76337, "tid": -914061504, "ts": 1716454224065092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121239, "dur": 6, "args": { "External id": 170701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170701, "pid": 5, "tid": 7, "ts": 1716454224121239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065119, "dur": 9, "args": { "External id": 170701, "cbid": 211, "correlation": 170701 } }, { "ph": "s", "id": 170701, "pid": 76337, "tid": -914061504, "ts": 1716454224065119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224121246, "dur": 20, "args": { "External id": 170710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170710, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170710, "pid": 5, "tid": 7, "ts": 1716454224121246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065202, "dur": 14, "args": { "External id": 170710, "cbid": 211, "correlation": 170710 } }, { "ph": "s", "id": 170710, "pid": 76337, "tid": -914061504, "ts": 1716454224065202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224121267, "dur": 10, "args": { "External id": 170732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170732, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 170732, "pid": 5, "tid": 7, "ts": 1716454224121267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065258, "dur": 10, "args": { "External id": 170732, "cbid": 211, "correlation": 170732 } }, { "ph": "s", "id": 170732, "pid": 76337, "tid": -914061504, "ts": 1716454224065258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065349, "dur": 2, "args": { "External id": 170743, "cbid": 251, "correlation": 170743 } }, { "ph": "f", "id": 170743, "pid": 76337, "tid": -914061504, "ts": 1716454224065349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065353, "dur": 0, "args": { "External id": 170744, "cbid": 251, "correlation": 170744 } }, { "ph": "f", "id": 170744, "pid": 76337, "tid": -914061504, "ts": 1716454224065353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224121279, "dur": 55, "args": { "External id": 170745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170745, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 170745, "pid": 5, "tid": 7, "ts": 1716454224121279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065356, "dur": 15, "args": { "External id": 170745, "cbid": 211, "correlation": 170745 } }, { "ph": "s", "id": 170745, "pid": 76337, "tid": -914061504, "ts": 1716454224065356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065428, "dur": 1, "args": { "External id": 170756, "cbid": 251, "correlation": 170756 } }, { "ph": "f", "id": 170756, "pid": 76337, "tid": -914061504, "ts": 1716454224065428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065431, "dur": 0, "args": { "External id": 170757, "cbid": 251, "correlation": 170757 } }, { "ph": "f", "id": 170757, "pid": 76337, "tid": -914061504, "ts": 1716454224065431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224121335, "dur": 53, "args": { "External id": 170758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170758, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 170758, "pid": 5, "tid": 7, "ts": 1716454224121335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065433, "dur": 11, "args": { "External id": 170758, "cbid": 211, "correlation": 170758 } }, { "ph": "s", "id": 170758, "pid": 76337, "tid": -914061504, "ts": 1716454224065433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065498, "dur": 1, "args": { "External id": 170769, "cbid": 251, "correlation": 170769 } }, { "ph": "f", "id": 170769, "pid": 76337, "tid": -914061504, "ts": 1716454224065498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065501, "dur": 0, "args": { "External id": 170770, "cbid": 251, "correlation": 170770 } }, { "ph": "f", "id": 170770, "pid": 76337, "tid": -914061504, "ts": 1716454224065501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224121390, "dur": 54, "args": { "External id": 170771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170771, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 170771, "pid": 5, "tid": 7, "ts": 1716454224121390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065503, "dur": 12, "args": { "External id": 170771, "cbid": 211, "correlation": 170771 } }, { "ph": "s", "id": 170771, "pid": 76337, "tid": -914061504, "ts": 1716454224065503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224121445, "dur": 55, "args": { "External id": 170796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170796, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170796, "pid": 5, "tid": 7, "ts": 1716454224121445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065590, "dur": 13, "args": { "External id": 170796, "cbid": 211, "correlation": 170796 } }, { "ph": "s", "id": 170796, "pid": 76337, "tid": -914061504, "ts": 1716454224065590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224065689, "dur": 1, "args": { "External id": 170814, "cbid": 251, "correlation": 170814 } }, { "ph": "f", "id": 170814, "pid": 76337, "tid": -914061504, "ts": 1716454224065689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224121501, "dur": 64, "args": { "External id": 170816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170816, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 170816, "pid": 5, "tid": 7, "ts": 1716454224121501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065695, "dur": 13, "args": { "External id": 170816, "cbid": 211, "correlation": 170816 } }, { "ph": "s", "id": 170816, "pid": 76337, "tid": -914061504, "ts": 1716454224065695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224121566, "dur": 6, "args": { "External id": 170824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170824, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170824, "pid": 5, "tid": 7, "ts": 1716454224121566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065766, "dur": 12, "args": { "External id": 170824, "cbid": 211, "correlation": 170824 } }, { "ph": "s", "id": 170824, "pid": 76337, "tid": -914061504, "ts": 1716454224065766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224121573, "dur": 7, "args": { "External id": 170832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170832, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 170832, "pid": 5, "tid": 7, "ts": 1716454224121573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065803, "dur": 9, "args": { "External id": 170832, "cbid": 211, "correlation": 170832 } }, { "ph": "s", "id": 170832, "pid": 76337, "tid": -914061504, "ts": 1716454224065803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121582, "dur": 8, "args": { "External id": 170843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170843, "pid": 5, "tid": 7, "ts": 1716454224121582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065876, "dur": 13, "args": { "External id": 170843, "cbid": 211, "correlation": 170843 } }, { "ph": "s", "id": 170843, "pid": 76337, "tid": -914061504, "ts": 1716454224065876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224121591, "dur": 9, "args": { "External id": 170865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170865, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 170865, "pid": 5, "tid": 7, "ts": 1716454224121591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224065908, "dur": 7, "args": { "External id": 170865, "cbid": 211, "correlation": 170865 } }, { "ph": "s", "id": 170865, "pid": 76337, "tid": -914061504, "ts": 1716454224065908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066002, "dur": 2, "args": { "External id": 170876, "cbid": 251, "correlation": 170876 } }, { "ph": "f", "id": 170876, "pid": 76337, "tid": -914061504, "ts": 1716454224066002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224121602, "dur": 1, "args": { "External id": 170877, "device": 5, "context": 1, "stream": 7, "correlation": 170877, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 170877, "pid": 5, "tid": 7, "ts": 1716454224121602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224066007, "dur": 12, "args": { "External id": 170877, "cbid": 51, "correlation": 170877 } }, { "ph": "s", "id": 170877, "pid": 76337, "tid": -914061504, "ts": 1716454224066007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224121605, "dur": 36, "args": { "External id": 170878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170878, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 170878, "pid": 5, "tid": 7, "ts": 1716454224121605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066021, "dur": 13, "args": { "External id": 170878, "cbid": 211, "correlation": 170878 } }, { "ph": "s", "id": 170878, "pid": 76337, "tid": -914061504, "ts": 1716454224066021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066094, "dur": 1, "args": { "External id": 170889, "cbid": 251, "correlation": 170889 } }, { "ph": "f", "id": 170889, "pid": 76337, "tid": -914061504, "ts": 1716454224066094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066098, "dur": 0, "args": { "External id": 170890, "cbid": 251, "correlation": 170890 } }, { "ph": "f", "id": 170890, "pid": 76337, "tid": -914061504, "ts": 1716454224066098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224121643, "dur": 12, "args": { "External id": 170891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170891, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170891, "pid": 5, "tid": 7, "ts": 1716454224121643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066099, "dur": 12, "args": { "External id": 170891, "cbid": 211, "correlation": 170891 } }, { "ph": "s", "id": 170891, "pid": 76337, "tid": -914061504, "ts": 1716454224066099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224121657, "dur": 6, "args": { "External id": 170893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170893, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170893, "pid": 5, "tid": 7, "ts": 1716454224121657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066113, "dur": 6, "args": { "External id": 170893, "cbid": 211, "correlation": 170893 } }, { "ph": "s", "id": 170893, "pid": 76337, "tid": -914061504, "ts": 1716454224066113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066171, "dur": 1, "args": { "External id": 170904, "cbid": 251, "correlation": 170904 } }, { "ph": "f", "id": 170904, "pid": 76337, "tid": -914061504, "ts": 1716454224066171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066174, "dur": 0, "args": { "External id": 170905, "cbid": 251, "correlation": 170905 } }, { "ph": "f", "id": 170905, "pid": 76337, "tid": -914061504, "ts": 1716454224066174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224121663, "dur": 8, "args": { "External id": 170906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170906, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170906, "pid": 5, "tid": 7, "ts": 1716454224121663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066176, "dur": 11, "args": { "External id": 170906, "cbid": 211, "correlation": 170906 } }, { "ph": "s", "id": 170906, "pid": 76337, "tid": -914061504, "ts": 1716454224066176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224121673, "dur": 4, "args": { "External id": 170908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170908, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 170908, "pid": 5, "tid": 7, "ts": 1716454224121673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066188, "dur": 5, "args": { "External id": 170908, "cbid": 211, "correlation": 170908 } }, { "ph": "s", "id": 170908, "pid": 76337, "tid": -914061504, "ts": 1716454224066188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224121677, "dur": 20, "args": { "External id": 170933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170933, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 170933, "pid": 5, "tid": 7, "ts": 1716454224121677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066268, "dur": 12, "args": { "External id": 170933, "cbid": 211, "correlation": 170933 } }, { "ph": "s", "id": 170933, "pid": 76337, "tid": -914061504, "ts": 1716454224066268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066368, "dur": 2, "args": { "External id": 170951, "cbid": 251, "correlation": 170951 } }, { "ph": "f", "id": 170951, "pid": 76337, "tid": -914061504, "ts": 1716454224066368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224121700, "dur": 1, "args": { "External id": 170953, "device": 5, "context": 1, "stream": 7, "correlation": 170953, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 170953, "pid": 5, "tid": 7, "ts": 1716454224121700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224066373, "dur": 10, "args": { "External id": 170953, "cbid": 51, "correlation": 170953 } }, { "ph": "s", "id": 170953, "pid": 76337, "tid": -914061504, "ts": 1716454224066373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224121703, "dur": 37, "args": { "External id": 170954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170954, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 170954, "pid": 5, "tid": 7, "ts": 1716454224121703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066385, "dur": 13, "args": { "External id": 170954, "cbid": 211, "correlation": 170954 } }, { "ph": "s", "id": 170954, "pid": 76337, "tid": -914061504, "ts": 1716454224066385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224121742, "dur": 4, "args": { "External id": 170962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170962, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170962, "pid": 5, "tid": 7, "ts": 1716454224121742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066457, "dur": 13, "args": { "External id": 170962, "cbid": 211, "correlation": 170962 } }, { "ph": "s", "id": 170962, "pid": 76337, "tid": -914061504, "ts": 1716454224066457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121747, "dur": 8, "args": { "External id": 170970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 170970, "pid": 5, "tid": 7, "ts": 1716454224121747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066498, "dur": 9, "args": { "External id": 170970, "cbid": 211, "correlation": 170970 } }, { "ph": "s", "id": 170970, "pid": 76337, "tid": -914061504, "ts": 1716454224066498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224121757, "dur": 8, "args": { "External id": 170992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 170992, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 170992, "pid": 5, "tid": 7, "ts": 1716454224121757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066550, "dur": 10, "args": { "External id": 170992, "cbid": 211, "correlation": 170992 } }, { "ph": "s", "id": 170992, "pid": 76337, "tid": -914061504, "ts": 1716454224066550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066642, "dur": 1, "args": { "External id": 171008, "cbid": 251, "correlation": 171008 } }, { "ph": "f", "id": 171008, "pid": 76337, "tid": -914061504, "ts": 1716454224066642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066647, "dur": 0, "args": { "External id": 171010, "cbid": 251, "correlation": 171010 } }, { "ph": "f", "id": 171010, "pid": 76337, "tid": -914061504, "ts": 1716454224066647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224121767, "dur": 189, "args": { "External id": 171011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171011, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171011, "pid": 5, "tid": 7, "ts": 1716454224121767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066649, "dur": 13, "args": { "External id": 171011, "cbid": 211, "correlation": 171011 } }, { "ph": "s", "id": 171011, "pid": 76337, "tid": -914061504, "ts": 1716454224066649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121957, "dur": 21, "args": { "External id": 171019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171019, "pid": 5, "tid": 7, "ts": 1716454224121957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066714, "dur": 13, "args": { "External id": 171019, "cbid": 211, "correlation": 171019 } }, { "ph": "s", "id": 171019, "pid": 76337, "tid": -914061504, "ts": 1716454224066714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224121979, "dur": 21, "args": { "External id": 171027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171027, "pid": 5, "tid": 7, "ts": 1716454224121979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066746, "dur": 8, "args": { "External id": 171027, "cbid": 211, "correlation": 171027 } }, { "ph": "s", "id": 171027, "pid": 76337, "tid": -914061504, "ts": 1716454224066746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224066828, "dur": 1, "args": { "External id": 171043, "cbid": 251, "correlation": 171043 } }, { "ph": "f", "id": 171043, "pid": 76337, "tid": -914061504, "ts": 1716454224066828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224122003, "dur": 1, "args": { "External id": 171045, "device": 5, "context": 1, "stream": 7, "correlation": 171045, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 171045, "pid": 5, "tid": 7, "ts": 1716454224122003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224066833, "dur": 8, "args": { "External id": 171045, "cbid": 51, "correlation": 171045 } }, { "ph": "s", "id": 171045, "pid": 76337, "tid": -914061504, "ts": 1716454224066833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224122006, "dur": 109, "args": { "External id": 171046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171046, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 171046, "pid": 5, "tid": 7, "ts": 1716454224122006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066843, "dur": 11, "args": { "External id": 171046, "cbid": 211, "correlation": 171046 } }, { "ph": "s", "id": 171046, "pid": 76337, "tid": -914061504, "ts": 1716454224066843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224122117, "dur": 6, "args": { "External id": 171054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171054, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171054, "pid": 5, "tid": 7, "ts": 1716454224122117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066885, "dur": 10, "args": { "External id": 171054, "cbid": 211, "correlation": 171054 } }, { "ph": "s", "id": 171054, "pid": 76337, "tid": -914061504, "ts": 1716454224066885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122124, "dur": 10, "args": { "External id": 171065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171065, "pid": 5, "tid": 7, "ts": 1716454224122124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224066953, "dur": 12, "args": { "External id": 171065, "cbid": 211, "correlation": 171065 } }, { "ph": "s", "id": 171065, "pid": 76337, "tid": -914061504, "ts": 1716454224066953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224067029, "dur": 0, "args": { "External id": 171077, "cbid": 317, "correlation": 171077 } }, { "ph": "f", "id": 171077, "pid": 76337, "tid": -914061504, "ts": 1716454224067029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224067030, "dur": 0, "args": { "External id": 171078, "cbid": 203, "correlation": 171078 } }, { "ph": "f", "id": 171078, "pid": 76337, "tid": -914061504, "ts": 1716454224067030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224067031, "dur": 0, "args": { "External id": 171079, "cbid": 205, "correlation": 171079 } }, { "ph": "f", "id": 171079, "pid": 76337, "tid": -914061504, "ts": 1716454224067031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122136, "dur": 5, "args": { "External id": 171083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171083, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171083, "pid": 5, "tid": 7, "ts": 1716454224122136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067046, "dur": 12, "args": { "External id": 171083, "cbid": 211, "correlation": 171083 } }, { "ph": "s", "id": 171083, "pid": 76337, "tid": -914061504, "ts": 1716454224067046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224122142, "dur": 38, "args": { "External id": 171085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171085, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 171085, "pid": 5, "tid": 7, "ts": 1716454224122142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067065, "dur": 8, "args": { "External id": 171085, "cbid": 211, "correlation": 171085 } }, { "ph": "s", "id": 171085, "pid": 76337, "tid": -914061504, "ts": 1716454224067065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122182, "dur": 6, "args": { "External id": 171087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171087, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171087, "pid": 5, "tid": 7, "ts": 1716454224122182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067077, "dur": 6, "args": { "External id": 171087, "cbid": 211, "correlation": 171087 } }, { "ph": "s", "id": 171087, "pid": 76337, "tid": -914061504, "ts": 1716454224067077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122189, "dur": 7, "args": { "External id": 171093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171093, "pid": 5, "tid": 7, "ts": 1716454224122189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067105, "dur": 9, "args": { "External id": 171093, "cbid": 211, "correlation": 171093 } }, { "ph": "s", "id": 171093, "pid": 76337, "tid": -914061504, "ts": 1716454224067105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224122197, "dur": 5, "args": { "External id": 171101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171101, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171101, "pid": 5, "tid": 7, "ts": 1716454224122197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067137, "dur": 8, "args": { "External id": 171101, "cbid": 211, "correlation": 171101 } }, { "ph": "s", "id": 171101, "pid": 76337, "tid": -914061504, "ts": 1716454224067137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224122203, "dur": 11, "args": { "External id": 171121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171121, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171121, "pid": 5, "tid": 7, "ts": 1716454224122203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067210, "dur": 12, "args": { "External id": 171121, "cbid": 211, "correlation": 171121 } }, { "ph": "s", "id": 171121, "pid": 76337, "tid": -914061504, "ts": 1716454224067210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224122216, "dur": 4, "args": { "External id": 171133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171133, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 171133, "pid": 5, "tid": 7, "ts": 1716454224122216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067232, "dur": 6, "args": { "External id": 171133, "cbid": 211, "correlation": 171133 } }, { "ph": "s", "id": 171133, "pid": 76337, "tid": -914061504, "ts": 1716454224067232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122222, "dur": 8, "args": { "External id": 171136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171136, "pid": 5, "tid": 7, "ts": 1716454224122222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067249, "dur": 7, "args": { "External id": 171136, "cbid": 211, "correlation": 171136 } }, { "ph": "s", "id": 171136, "pid": 76337, "tid": -914061504, "ts": 1716454224067249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224122232, "dur": 5, "args": { "External id": 171145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171145, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171145, "pid": 5, "tid": 7, "ts": 1716454224122232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067288, "dur": 9, "args": { "External id": 171145, "cbid": 211, "correlation": 171145 } }, { "ph": "s", "id": 171145, "pid": 76337, "tid": -914061504, "ts": 1716454224067288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224067339, "dur": 0, "args": { "External id": 171155, "cbid": 317, "correlation": 171155 } }, { "ph": "f", "id": 171155, "pid": 76337, "tid": -914061504, "ts": 1716454224067339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224067340, "dur": 0, "args": { "External id": 171156, "cbid": 203, "correlation": 171156 } }, { "ph": "f", "id": 171156, "pid": 76337, "tid": -914061504, "ts": 1716454224067340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224067340, "dur": 0, "args": { "External id": 171157, "cbid": 205, "correlation": 171157 } }, { "ph": "f", "id": 171157, "pid": 76337, "tid": -914061504, "ts": 1716454224067340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122238, "dur": 5, "args": { "External id": 171161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171161, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171161, "pid": 5, "tid": 7, "ts": 1716454224122238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067354, "dur": 11, "args": { "External id": 171161, "cbid": 211, "correlation": 171161 } }, { "ph": "s", "id": 171161, "pid": 76337, "tid": -914061504, "ts": 1716454224067354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122245, "dur": 163, "args": { "External id": 171163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171163, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171163, "pid": 5, "tid": 7, "ts": 1716454224122245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067368, "dur": 6, "args": { "External id": 171163, "cbid": 211, "correlation": 171163 } }, { "ph": "s", "id": 171163, "pid": 76337, "tid": -914061504, "ts": 1716454224067368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224122410, "dur": 1, "args": { "External id": 171165, "device": 5, "context": 1, "stream": 7, "correlation": 171165, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 171165, "pid": 5, "tid": 7, "ts": 1716454224122410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224067379, "dur": 6, "args": { "External id": 171165, "cbid": 51, "correlation": 171165 } }, { "ph": "s", "id": 171165, "pid": 76337, "tid": -914061504, "ts": 1716454224067379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224122414, "dur": 272, "args": { "External id": 171166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171166, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171166, "pid": 5, "tid": 7, "ts": 1716454224122414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067386, "dur": 6, "args": { "External id": 171166, "cbid": 211, "correlation": 171166 } }, { "ph": "s", "id": 171166, "pid": 76337, "tid": -914061504, "ts": 1716454224067386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122687, "dur": 6, "args": { "External id": 171168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171168, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171168, "pid": 5, "tid": 7, "ts": 1716454224122687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067396, "dur": 5, "args": { "External id": 171168, "cbid": 211, "correlation": 171168 } }, { "ph": "s", "id": 171168, "pid": 76337, "tid": -914061504, "ts": 1716454224067396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122694, "dur": 7, "args": { "External id": 171174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171174, "pid": 5, "tid": 7, "ts": 1716454224122694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067424, "dur": 8, "args": { "External id": 171174, "cbid": 211, "correlation": 171174 } }, { "ph": "s", "id": 171174, "pid": 76337, "tid": -914061504, "ts": 1716454224067424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224122702, "dur": 4, "args": { "External id": 171182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171182, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 171182, "pid": 5, "tid": 7, "ts": 1716454224122702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067468, "dur": 10, "args": { "External id": 171182, "cbid": 211, "correlation": 171182 } }, { "ph": "s", "id": 171182, "pid": 76337, "tid": -914061504, "ts": 1716454224067468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224067534, "dur": 1, "args": { "External id": 171198, "cbid": 251, "correlation": 171198 } }, { "ph": "f", "id": 171198, "pid": 76337, "tid": -914061504, "ts": 1716454224067534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224067539, "dur": 0, "args": { "External id": 171200, "cbid": 251, "correlation": 171200 } }, { "ph": "f", "id": 171200, "pid": 76337, "tid": -914061504, "ts": 1716454224067539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224122707, "dur": 12, "args": { "External id": 171201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171201, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171201, "pid": 5, "tid": 7, "ts": 1716454224122707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067541, "dur": 11, "args": { "External id": 171201, "cbid": 211, "correlation": 171201 } }, { "ph": "s", "id": 171201, "pid": 76337, "tid": -914061504, "ts": 1716454224067541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224122721, "dur": 5, "args": { "External id": 171203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171203, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171203, "pid": 5, "tid": 7, "ts": 1716454224122721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067554, "dur": 5, "args": { "External id": 171203, "cbid": 211, "correlation": 171203 } }, { "ph": "s", "id": 171203, "pid": 76337, "tid": -914061504, "ts": 1716454224067554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122728, "dur": 6, "args": { "External id": 171213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171213, "pid": 5, "tid": 7, "ts": 1716454224122728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067610, "dur": 13, "args": { "External id": 171213, "cbid": 211, "correlation": 171213 } }, { "ph": "s", "id": 171213, "pid": 76337, "tid": -914061504, "ts": 1716454224067610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224122735, "dur": 10, "args": { "External id": 171233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171233, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171233, "pid": 5, "tid": 7, "ts": 1716454224122735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067678, "dur": 10, "args": { "External id": 171233, "cbid": 211, "correlation": 171233 } }, { "ph": "s", "id": 171233, "pid": 76337, "tid": -914061504, "ts": 1716454224067678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224122746, "dur": 4, "args": { "External id": 171245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171245, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 171245, "pid": 5, "tid": 7, "ts": 1716454224122746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067698, "dur": 6, "args": { "External id": 171245, "cbid": 211, "correlation": 171245 } }, { "ph": "s", "id": 171245, "pid": 76337, "tid": -914061504, "ts": 1716454224067698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224122751, "dur": 7, "args": { "External id": 171248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171248, "pid": 5, "tid": 7, "ts": 1716454224122751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067716, "dur": 7, "args": { "External id": 171248, "cbid": 211, "correlation": 171248 } }, { "ph": "s", "id": 171248, "pid": 76337, "tid": -914061504, "ts": 1716454224067716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224122760, "dur": 5, "args": { "External id": 171257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171257, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171257, "pid": 5, "tid": 7, "ts": 1716454224122760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067757, "dur": 9, "args": { "External id": 171257, "cbid": 211, "correlation": 171257 } }, { "ph": "s", "id": 171257, "pid": 76337, "tid": -914061504, "ts": 1716454224067757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224067819, "dur": 0, "args": { "External id": 171267, "cbid": 317, "correlation": 171267 } }, { "ph": "f", "id": 171267, "pid": 76337, "tid": -914061504, "ts": 1716454224067819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224067819, "dur": 0, "args": { "External id": 171268, "cbid": 203, "correlation": 171268 } }, { "ph": "f", "id": 171268, "pid": 76337, "tid": -914061504, "ts": 1716454224067819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224067820, "dur": 0, "args": { "External id": 171269, "cbid": 205, "correlation": 171269 } }, { "ph": "f", "id": 171269, "pid": 76337, "tid": -914061504, "ts": 1716454224067820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122766, "dur": 5, "args": { "External id": 171273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171273, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171273, "pid": 5, "tid": 7, "ts": 1716454224122766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067835, "dur": 12, "args": { "External id": 171273, "cbid": 211, "correlation": 171273 } }, { "ph": "s", "id": 171273, "pid": 76337, "tid": -914061504, "ts": 1716454224067835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224122772, "dur": 163, "args": { "External id": 171275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171275, "pid": 5, "tid": 7, "ts": 1716454224122772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067849, "dur": 5, "args": { "External id": 171275, "cbid": 211, "correlation": 171275 } }, { "ph": "s", "id": 171275, "pid": 76337, "tid": -914061504, "ts": 1716454224067849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224122938, "dur": 1, "args": { "External id": 171277, "device": 5, "context": 1, "stream": 7, "correlation": 171277, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 171277, "pid": 5, "tid": 7, "ts": 1716454224122938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224067859, "dur": 6, "args": { "External id": 171277, "cbid": 51, "correlation": 171277 } }, { "ph": "s", "id": 171277, "pid": 76337, "tid": -914061504, "ts": 1716454224067859, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224122942, "dur": 261, "args": { "External id": 171278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171278, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171278, "pid": 5, "tid": 7, "ts": 1716454224122942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067867, "dur": 6, "args": { "External id": 171278, "cbid": 211, "correlation": 171278 } }, { "ph": "s", "id": 171278, "pid": 76337, "tid": -914061504, "ts": 1716454224067867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224123204, "dur": 6, "args": { "External id": 171280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171280, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171280, "pid": 5, "tid": 7, "ts": 1716454224123204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067876, "dur": 6, "args": { "External id": 171280, "cbid": 211, "correlation": 171280 } }, { "ph": "s", "id": 171280, "pid": 76337, "tid": -914061504, "ts": 1716454224067876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224123211, "dur": 6, "args": { "External id": 171286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171286, "pid": 5, "tid": 7, "ts": 1716454224123211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067905, "dur": 8, "args": { "External id": 171286, "cbid": 211, "correlation": 171286 } }, { "ph": "s", "id": 171286, "pid": 76337, "tid": -914061504, "ts": 1716454224067905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224123219, "dur": 5, "args": { "External id": 171294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171294, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171294, "pid": 5, "tid": 7, "ts": 1716454224123219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067938, "dur": 8, "args": { "External id": 171294, "cbid": 211, "correlation": 171294 } }, { "ph": "s", "id": 171294, "pid": 76337, "tid": -914061504, "ts": 1716454224067938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224123225, "dur": 4, "args": { "External id": 171302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171302, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171302, "pid": 5, "tid": 7, "ts": 1716454224123225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224067967, "dur": 21, "args": { "External id": 171302, "cbid": 211, "correlation": 171302 } }, { "ph": "s", "id": 171302, "pid": 76337, "tid": -914061504, "ts": 1716454224067967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224123231, "dur": 12, "args": { "External id": 171311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171311, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171311, "pid": 5, "tid": 7, "ts": 1716454224123231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068069, "dur": 16, "args": { "External id": 171311, "cbid": 211, "correlation": 171311 } }, { "ph": "s", "id": 171311, "pid": 76337, "tid": -914061504, "ts": 1716454224068069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224123244, "dur": 12, "args": { "External id": 171331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171331, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171331, "pid": 5, "tid": 7, "ts": 1716454224123244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068139, "dur": 10, "args": { "External id": 171331, "cbid": 211, "correlation": 171331 } }, { "ph": "s", "id": 171331, "pid": 76337, "tid": -914061504, "ts": 1716454224068139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224123258, "dur": 4, "args": { "External id": 171343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171343, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171343, "pid": 5, "tid": 7, "ts": 1716454224123258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068160, "dur": 6, "args": { "External id": 171343, "cbid": 211, "correlation": 171343 } }, { "ph": "s", "id": 171343, "pid": 76337, "tid": -914061504, "ts": 1716454224068160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224123263, "dur": 10, "args": { "External id": 171346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171346, "pid": 5, "tid": 7, "ts": 1716454224123263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068178, "dur": 7, "args": { "External id": 171346, "cbid": 211, "correlation": 171346 } }, { "ph": "s", "id": 171346, "pid": 76337, "tid": -914061504, "ts": 1716454224068178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224123275, "dur": 6, "args": { "External id": 171355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171355, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171355, "pid": 5, "tid": 7, "ts": 1716454224123275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068217, "dur": 10, "args": { "External id": 171355, "cbid": 211, "correlation": 171355 } }, { "ph": "s", "id": 171355, "pid": 76337, "tid": -914061504, "ts": 1716454224068217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224068271, "dur": 0, "args": { "External id": 171365, "cbid": 317, "correlation": 171365 } }, { "ph": "f", "id": 171365, "pid": 76337, "tid": -914061504, "ts": 1716454224068271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224068271, "dur": 0, "args": { "External id": 171366, "cbid": 203, "correlation": 171366 } }, { "ph": "f", "id": 171366, "pid": 76337, "tid": -914061504, "ts": 1716454224068271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224068272, "dur": 0, "args": { "External id": 171367, "cbid": 205, "correlation": 171367 } }, { "ph": "f", "id": 171367, "pid": 76337, "tid": -914061504, "ts": 1716454224068272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224123282, "dur": 7, "args": { "External id": 171371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171371, "pid": 5, "tid": 7, "ts": 1716454224123282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068288, "dur": 12, "args": { "External id": 171371, "cbid": 211, "correlation": 171371 } }, { "ph": "s", "id": 171371, "pid": 76337, "tid": -914061504, "ts": 1716454224068288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224123290, "dur": 322, "args": { "External id": 171373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171373, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171373, "pid": 5, "tid": 7, "ts": 1716454224123290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068303, "dur": 5, "args": { "External id": 171373, "cbid": 211, "correlation": 171373 } }, { "ph": "s", "id": 171373, "pid": 76337, "tid": -914061504, "ts": 1716454224068303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224123614, "dur": 1, "args": { "External id": 171375, "device": 5, "context": 1, "stream": 7, "correlation": 171375, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 171375, "pid": 5, "tid": 7, "ts": 1716454224123614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224068314, "dur": 7, "args": { "External id": 171375, "cbid": 51, "correlation": 171375 } }, { "ph": "s", "id": 171375, "pid": 76337, "tid": -914061504, "ts": 1716454224068314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224123617, "dur": 498, "args": { "External id": 171376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171376, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171376, "pid": 5, "tid": 7, "ts": 1716454224123617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068322, "dur": 6, "args": { "External id": 171376, "cbid": 211, "correlation": 171376 } }, { "ph": "s", "id": 171376, "pid": 76337, "tid": -914061504, "ts": 1716454224068322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124117, "dur": 5, "args": { "External id": 171378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171378, "pid": 5, "tid": 7, "ts": 1716454224124117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068333, "dur": 6, "args": { "External id": 171378, "cbid": 211, "correlation": 171378 } }, { "ph": "s", "id": 171378, "pid": 76337, "tid": -914061504, "ts": 1716454224068333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124124, "dur": 6, "args": { "External id": 171384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171384, "pid": 5, "tid": 7, "ts": 1716454224124124, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068363, "dur": 9, "args": { "External id": 171384, "cbid": 211, "correlation": 171384 } }, { "ph": "s", "id": 171384, "pid": 76337, "tid": -914061504, "ts": 1716454224068363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224124131, "dur": 4, "args": { "External id": 171392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171392, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 171392, "pid": 5, "tid": 7, "ts": 1716454224124131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068407, "dur": 9, "args": { "External id": 171392, "cbid": 211, "correlation": 171392 } }, { "ph": "s", "id": 171392, "pid": 76337, "tid": -914061504, "ts": 1716454224068407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224068470, "dur": 1, "args": { "External id": 171408, "cbid": 251, "correlation": 171408 } }, { "ph": "f", "id": 171408, "pid": 76337, "tid": -914061504, "ts": 1716454224068470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224068475, "dur": 0, "args": { "External id": 171410, "cbid": 251, "correlation": 171410 } }, { "ph": "f", "id": 171410, "pid": 76337, "tid": -914061504, "ts": 1716454224068475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224124136, "dur": 11, "args": { "External id": 171411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171411, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171411, "pid": 5, "tid": 7, "ts": 1716454224124136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068477, "dur": 12, "args": { "External id": 171411, "cbid": 211, "correlation": 171411 } }, { "ph": "s", "id": 171411, "pid": 76337, "tid": -914061504, "ts": 1716454224068477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224124149, "dur": 5, "args": { "External id": 171413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171413, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171413, "pid": 5, "tid": 7, "ts": 1716454224124149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068491, "dur": 6, "args": { "External id": 171413, "cbid": 211, "correlation": 171413 } }, { "ph": "s", "id": 171413, "pid": 76337, "tid": -914061504, "ts": 1716454224068491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124155, "dur": 6, "args": { "External id": 171423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171423, "pid": 5, "tid": 7, "ts": 1716454224124155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068548, "dur": 12, "args": { "External id": 171423, "cbid": 211, "correlation": 171423 } }, { "ph": "s", "id": 171423, "pid": 76337, "tid": -914061504, "ts": 1716454224068548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224124162, "dur": 9, "args": { "External id": 171443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171443, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171443, "pid": 5, "tid": 7, "ts": 1716454224124162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068613, "dur": 11, "args": { "External id": 171443, "cbid": 211, "correlation": 171443 } }, { "ph": "s", "id": 171443, "pid": 76337, "tid": -914061504, "ts": 1716454224068613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224124173, "dur": 4, "args": { "External id": 171455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171455, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 171455, "pid": 5, "tid": 7, "ts": 1716454224124173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068634, "dur": 6, "args": { "External id": 171455, "cbid": 211, "correlation": 171455 } }, { "ph": "s", "id": 171455, "pid": 76337, "tid": -914061504, "ts": 1716454224068634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124178, "dur": 7, "args": { "External id": 171458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171458, "pid": 5, "tid": 7, "ts": 1716454224124178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068653, "dur": 6, "args": { "External id": 171458, "cbid": 211, "correlation": 171458 } }, { "ph": "s", "id": 171458, "pid": 76337, "tid": -914061504, "ts": 1716454224068653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224124187, "dur": 5, "args": { "External id": 171467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171467, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171467, "pid": 5, "tid": 7, "ts": 1716454224124187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068693, "dur": 10, "args": { "External id": 171467, "cbid": 211, "correlation": 171467 } }, { "ph": "s", "id": 171467, "pid": 76337, "tid": -914061504, "ts": 1716454224068693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224068755, "dur": 0, "args": { "External id": 171477, "cbid": 317, "correlation": 171477 } }, { "ph": "f", "id": 171477, "pid": 76337, "tid": -914061504, "ts": 1716454224068755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224068755, "dur": 0, "args": { "External id": 171478, "cbid": 203, "correlation": 171478 } }, { "ph": "f", "id": 171478, "pid": 76337, "tid": -914061504, "ts": 1716454224068755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224068756, "dur": 0, "args": { "External id": 171479, "cbid": 205, "correlation": 171479 } }, { "ph": "f", "id": 171479, "pid": 76337, "tid": -914061504, "ts": 1716454224068756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124192, "dur": 5, "args": { "External id": 171483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171483, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171483, "pid": 5, "tid": 7, "ts": 1716454224124192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068770, "dur": 12, "args": { "External id": 171483, "cbid": 211, "correlation": 171483 } }, { "ph": "s", "id": 171483, "pid": 76337, "tid": -914061504, "ts": 1716454224068770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124199, "dur": 163, "args": { "External id": 171485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171485, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171485, "pid": 5, "tid": 7, "ts": 1716454224124199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068784, "dur": 5, "args": { "External id": 171485, "cbid": 211, "correlation": 171485 } }, { "ph": "s", "id": 171485, "pid": 76337, "tid": -914061504, "ts": 1716454224068784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224124364, "dur": 1, "args": { "External id": 171487, "device": 5, "context": 1, "stream": 7, "correlation": 171487, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 171487, "pid": 5, "tid": 7, "ts": 1716454224124364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224068796, "dur": 6, "args": { "External id": 171487, "cbid": 51, "correlation": 171487 } }, { "ph": "s", "id": 171487, "pid": 76337, "tid": -914061504, "ts": 1716454224068796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224124368, "dur": 262, "args": { "External id": 171488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171488, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171488, "pid": 5, "tid": 7, "ts": 1716454224124368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068803, "dur": 6, "args": { "External id": 171488, "cbid": 211, "correlation": 171488 } }, { "ph": "s", "id": 171488, "pid": 76337, "tid": -914061504, "ts": 1716454224068803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124631, "dur": 6, "args": { "External id": 171490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171490, "pid": 5, "tid": 7, "ts": 1716454224124631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068813, "dur": 5, "args": { "External id": 171490, "cbid": 211, "correlation": 171490 } }, { "ph": "s", "id": 171490, "pid": 76337, "tid": -914061504, "ts": 1716454224068813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124638, "dur": 6, "args": { "External id": 171496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171496, "pid": 5, "tid": 7, "ts": 1716454224124638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068841, "dur": 9, "args": { "External id": 171496, "cbid": 211, "correlation": 171496 } }, { "ph": "s", "id": 171496, "pid": 76337, "tid": -914061504, "ts": 1716454224068841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224068901, "dur": 0, "args": { "External id": 171506, "cbid": 317, "correlation": 171506 } }, { "ph": "f", "id": 171506, "pid": 76337, "tid": -914061504, "ts": 1716454224068901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224068902, "dur": 0, "args": { "External id": 171507, "cbid": 203, "correlation": 171507 } }, { "ph": "f", "id": 171507, "pid": 76337, "tid": -914061504, "ts": 1716454224068902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224068902, "dur": 0, "args": { "External id": 171508, "cbid": 205, "correlation": 171508 } }, { "ph": "f", "id": 171508, "pid": 76337, "tid": -914061504, "ts": 1716454224068902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124645, "dur": 8, "args": { "External id": 171512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171512, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171512, "pid": 5, "tid": 7, "ts": 1716454224124645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068917, "dur": 11, "args": { "External id": 171512, "cbid": 211, "correlation": 171512 } }, { "ph": "s", "id": 171512, "pid": 76337, "tid": -914061504, "ts": 1716454224068917, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224124654, "dur": 3, "args": { "External id": 171514, "device": 5, "context": 1, "stream": 7, "correlation": 171514, "bytes": 4800, "memory bandwidth (GB/s)": 1.4563106796116505 } }, { "ph": "f", "id": 171514, "pid": 5, "tid": 7, "ts": 1716454224124654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224068934, "dur": 15, "args": { "External id": 171514, "cbid": 51, "correlation": 171514 } }, { "ph": "s", "id": 171514, "pid": 76337, "tid": -914061504, "ts": 1716454224068934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224124659, "dur": 95, "args": { "External id": 171515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171515, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 171515, "pid": 5, "tid": 7, "ts": 1716454224124659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068950, "dur": 6, "args": { "External id": 171515, "cbid": 211, "correlation": 171515 } }, { "ph": "s", "id": 171515, "pid": 76337, "tid": -914061504, "ts": 1716454224068950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124755, "dur": 5, "args": { "External id": 171517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171517, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171517, "pid": 5, "tid": 7, "ts": 1716454224124755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068960, "dur": 5, "args": { "External id": 171517, "cbid": 211, "correlation": 171517 } }, { "ph": "s", "id": 171517, "pid": 76337, "tid": -914061504, "ts": 1716454224068960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124762, "dur": 6, "args": { "External id": 171523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171523, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171523, "pid": 5, "tid": 7, "ts": 1716454224124762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224068996, "dur": 10, "args": { "External id": 171523, "cbid": 211, "correlation": 171523 } }, { "ph": "s", "id": 171523, "pid": 76337, "tid": -914061504, "ts": 1716454224068996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224124769, "dur": 5, "args": { "External id": 171531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171531, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171531, "pid": 5, "tid": 7, "ts": 1716454224124769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069027, "dur": 8, "args": { "External id": 171531, "cbid": 211, "correlation": 171531 } }, { "ph": "s", "id": 171531, "pid": 76337, "tid": -914061504, "ts": 1716454224069027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224124776, "dur": 5, "args": { "External id": 171539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171539, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171539, "pid": 5, "tid": 7, "ts": 1716454224124776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069056, "dur": 8, "args": { "External id": 171539, "cbid": 211, "correlation": 171539 } }, { "ph": "s", "id": 171539, "pid": 76337, "tid": -914061504, "ts": 1716454224069056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224124782, "dur": 11, "args": { "External id": 171548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171548, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171548, "pid": 5, "tid": 7, "ts": 1716454224124782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069099, "dur": 10, "args": { "External id": 171548, "cbid": 211, "correlation": 171548 } }, { "ph": "s", "id": 171548, "pid": 76337, "tid": -914061504, "ts": 1716454224069099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224124794, "dur": 12, "args": { "External id": 171568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171568, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171568, "pid": 5, "tid": 7, "ts": 1716454224124794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069169, "dur": 11, "args": { "External id": 171568, "cbid": 211, "correlation": 171568 } }, { "ph": "s", "id": 171568, "pid": 76337, "tid": -914061504, "ts": 1716454224069169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224124808, "dur": 4, "args": { "External id": 171580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171580, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171580, "pid": 5, "tid": 7, "ts": 1716454224124808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069190, "dur": 7, "args": { "External id": 171580, "cbid": 211, "correlation": 171580 } }, { "ph": "s", "id": 171580, "pid": 76337, "tid": -914061504, "ts": 1716454224069190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224124813, "dur": 11, "args": { "External id": 171583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171583, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171583, "pid": 5, "tid": 7, "ts": 1716454224124813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069208, "dur": 6, "args": { "External id": 171583, "cbid": 211, "correlation": 171583 } }, { "ph": "s", "id": 171583, "pid": 76337, "tid": -914061504, "ts": 1716454224069208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224124826, "dur": 6, "args": { "External id": 171592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171592, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171592, "pid": 5, "tid": 7, "ts": 1716454224124826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069247, "dur": 10, "args": { "External id": 171592, "cbid": 211, "correlation": 171592 } }, { "ph": "s", "id": 171592, "pid": 76337, "tid": -914061504, "ts": 1716454224069247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224069298, "dur": 0, "args": { "External id": 171602, "cbid": 317, "correlation": 171602 } }, { "ph": "f", "id": 171602, "pid": 76337, "tid": -914061504, "ts": 1716454224069298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224069299, "dur": 0, "args": { "External id": 171603, "cbid": 203, "correlation": 171603 } }, { "ph": "f", "id": 171603, "pid": 76337, "tid": -914061504, "ts": 1716454224069299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224069300, "dur": 0, "args": { "External id": 171604, "cbid": 205, "correlation": 171604 } }, { "ph": "f", "id": 171604, "pid": 76337, "tid": -914061504, "ts": 1716454224069300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124834, "dur": 7, "args": { "External id": 171608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171608, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171608, "pid": 5, "tid": 7, "ts": 1716454224124834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069313, "dur": 11, "args": { "External id": 171608, "cbid": 211, "correlation": 171608 } }, { "ph": "s", "id": 171608, "pid": 76337, "tid": -914061504, "ts": 1716454224069313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224124842, "dur": 321, "args": { "External id": 171610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171610, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171610, "pid": 5, "tid": 7, "ts": 1716454224124842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069327, "dur": 5, "args": { "External id": 171610, "cbid": 211, "correlation": 171610 } }, { "ph": "s", "id": 171610, "pid": 76337, "tid": -914061504, "ts": 1716454224069327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224125165, "dur": 1, "args": { "External id": 171612, "device": 5, "context": 1, "stream": 7, "correlation": 171612, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 171612, "pid": 5, "tid": 7, "ts": 1716454224125165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224069337, "dur": 6, "args": { "External id": 171612, "cbid": 51, "correlation": 171612 } }, { "ph": "s", "id": 171612, "pid": 76337, "tid": -914061504, "ts": 1716454224069337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224125169, "dur": 501, "args": { "External id": 171613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171613, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171613, "pid": 5, "tid": 7, "ts": 1716454224125169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069345, "dur": 6, "args": { "External id": 171613, "cbid": 211, "correlation": 171613 } }, { "ph": "s", "id": 171613, "pid": 76337, "tid": -914061504, "ts": 1716454224069345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224125672, "dur": 6, "args": { "External id": 171615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171615, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171615, "pid": 5, "tid": 7, "ts": 1716454224125672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069356, "dur": 5, "args": { "External id": 171615, "cbid": 211, "correlation": 171615 } }, { "ph": "s", "id": 171615, "pid": 76337, "tid": -914061504, "ts": 1716454224069356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224125678, "dur": 6, "args": { "External id": 171621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171621, "pid": 5, "tid": 7, "ts": 1716454224125678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069384, "dur": 9, "args": { "External id": 171621, "cbid": 211, "correlation": 171621 } }, { "ph": "s", "id": 171621, "pid": 76337, "tid": -914061504, "ts": 1716454224069384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224125686, "dur": 4, "args": { "External id": 171629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171629, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 171629, "pid": 5, "tid": 7, "ts": 1716454224125686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069427, "dur": 9, "args": { "External id": 171629, "cbid": 211, "correlation": 171629 } }, { "ph": "s", "id": 171629, "pid": 76337, "tid": -914061504, "ts": 1716454224069427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224069489, "dur": 1, "args": { "External id": 171645, "cbid": 251, "correlation": 171645 } }, { "ph": "f", "id": 171645, "pid": 76337, "tid": -914061504, "ts": 1716454224069489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224069495, "dur": 0, "args": { "External id": 171647, "cbid": 251, "correlation": 171647 } }, { "ph": "f", "id": 171647, "pid": 76337, "tid": -914061504, "ts": 1716454224069495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224125691, "dur": 12, "args": { "External id": 171648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171648, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171648, "pid": 5, "tid": 7, "ts": 1716454224125691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069497, "dur": 11, "args": { "External id": 171648, "cbid": 211, "correlation": 171648 } }, { "ph": "s", "id": 171648, "pid": 76337, "tid": -914061504, "ts": 1716454224069497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224125704, "dur": 5, "args": { "External id": 171650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171650, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171650, "pid": 5, "tid": 7, "ts": 1716454224125704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069510, "dur": 5, "args": { "External id": 171650, "cbid": 211, "correlation": 171650 } }, { "ph": "s", "id": 171650, "pid": 76337, "tid": -914061504, "ts": 1716454224069510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224125711, "dur": 6, "args": { "External id": 171660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171660, "pid": 5, "tid": 7, "ts": 1716454224125711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069567, "dur": 13, "args": { "External id": 171660, "cbid": 211, "correlation": 171660 } }, { "ph": "s", "id": 171660, "pid": 76337, "tid": -914061504, "ts": 1716454224069567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224125718, "dur": 10, "args": { "External id": 171680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171680, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171680, "pid": 5, "tid": 7, "ts": 1716454224125718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069633, "dur": 11, "args": { "External id": 171680, "cbid": 211, "correlation": 171680 } }, { "ph": "s", "id": 171680, "pid": 76337, "tid": -914061504, "ts": 1716454224069633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224125729, "dur": 4, "args": { "External id": 171692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171692, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 171692, "pid": 5, "tid": 7, "ts": 1716454224125729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069655, "dur": 6, "args": { "External id": 171692, "cbid": 211, "correlation": 171692 } }, { "ph": "s", "id": 171692, "pid": 76337, "tid": -914061504, "ts": 1716454224069655, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224125734, "dur": 7, "args": { "External id": 171695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171695, "pid": 5, "tid": 7, "ts": 1716454224125734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069674, "dur": 6, "args": { "External id": 171695, "cbid": 211, "correlation": 171695 } }, { "ph": "s", "id": 171695, "pid": 76337, "tid": -914061504, "ts": 1716454224069674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224125742, "dur": 5, "args": { "External id": 171704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171704, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171704, "pid": 5, "tid": 7, "ts": 1716454224125742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069715, "dur": 10, "args": { "External id": 171704, "cbid": 211, "correlation": 171704 } }, { "ph": "s", "id": 171704, "pid": 76337, "tid": -914061504, "ts": 1716454224069715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224069776, "dur": 0, "args": { "External id": 171714, "cbid": 317, "correlation": 171714 } }, { "ph": "f", "id": 171714, "pid": 76337, "tid": -914061504, "ts": 1716454224069776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224069777, "dur": 0, "args": { "External id": 171715, "cbid": 203, "correlation": 171715 } }, { "ph": "f", "id": 171715, "pid": 76337, "tid": -914061504, "ts": 1716454224069777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224069778, "dur": 0, "args": { "External id": 171716, "cbid": 205, "correlation": 171716 } }, { "ph": "f", "id": 171716, "pid": 76337, "tid": -914061504, "ts": 1716454224069778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224125749, "dur": 5, "args": { "External id": 171720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171720, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171720, "pid": 5, "tid": 7, "ts": 1716454224125749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069791, "dur": 12, "args": { "External id": 171720, "cbid": 211, "correlation": 171720 } }, { "ph": "s", "id": 171720, "pid": 76337, "tid": -914061504, "ts": 1716454224069791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224125755, "dur": 164, "args": { "External id": 171722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171722, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171722, "pid": 5, "tid": 7, "ts": 1716454224125755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069806, "dur": 6, "args": { "External id": 171722, "cbid": 211, "correlation": 171722 } }, { "ph": "s", "id": 171722, "pid": 76337, "tid": -914061504, "ts": 1716454224069806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224125921, "dur": 1, "args": { "External id": 171724, "device": 5, "context": 1, "stream": 7, "correlation": 171724, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 171724, "pid": 5, "tid": 7, "ts": 1716454224125921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224069817, "dur": 6, "args": { "External id": 171724, "cbid": 51, "correlation": 171724 } }, { "ph": "s", "id": 171724, "pid": 76337, "tid": -914061504, "ts": 1716454224069817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224125925, "dur": 261, "args": { "External id": 171725, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171725, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171725, "pid": 5, "tid": 7, "ts": 1716454224125925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069824, "dur": 6, "args": { "External id": 171725, "cbid": 211, "correlation": 171725 } }, { "ph": "s", "id": 171725, "pid": 76337, "tid": -914061504, "ts": 1716454224069824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224126187, "dur": 6, "args": { "External id": 171727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171727, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171727, "pid": 5, "tid": 7, "ts": 1716454224126187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069834, "dur": 5, "args": { "External id": 171727, "cbid": 211, "correlation": 171727 } }, { "ph": "s", "id": 171727, "pid": 76337, "tid": -914061504, "ts": 1716454224069834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224126194, "dur": 6, "args": { "External id": 171733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171733, "pid": 5, "tid": 7, "ts": 1716454224126194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069862, "dur": 8, "args": { "External id": 171733, "cbid": 211, "correlation": 171733 } }, { "ph": "s", "id": 171733, "pid": 76337, "tid": -914061504, "ts": 1716454224069862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224069921, "dur": 0, "args": { "External id": 171743, "cbid": 317, "correlation": 171743 } }, { "ph": "f", "id": 171743, "pid": 76337, "tid": -914061504, "ts": 1716454224069921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224069922, "dur": 0, "args": { "External id": 171744, "cbid": 203, "correlation": 171744 } }, { "ph": "f", "id": 171744, "pid": 76337, "tid": -914061504, "ts": 1716454224069922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224069923, "dur": 0, "args": { "External id": 171745, "cbid": 205, "correlation": 171745 } }, { "ph": "f", "id": 171745, "pid": 76337, "tid": -914061504, "ts": 1716454224069923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224126202, "dur": 8, "args": { "External id": 171749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171749, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171749, "pid": 5, "tid": 7, "ts": 1716454224126202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069934, "dur": 12, "args": { "External id": 171749, "cbid": 211, "correlation": 171749 } }, { "ph": "s", "id": 171749, "pid": 76337, "tid": -914061504, "ts": 1716454224069934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224126211, "dur": 3, "args": { "External id": 171751, "device": 5, "context": 1, "stream": 7, "correlation": 171751, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 171751, "pid": 5, "tid": 7, "ts": 1716454224126211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224069951, "dur": 10, "args": { "External id": 171751, "cbid": 51, "correlation": 171751 } }, { "ph": "s", "id": 171751, "pid": 76337, "tid": -914061504, "ts": 1716454224069951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224126215, "dur": 96, "args": { "External id": 171752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171752, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 171752, "pid": 5, "tid": 7, "ts": 1716454224126215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069962, "dur": 6, "args": { "External id": 171752, "cbid": 211, "correlation": 171752 } }, { "ph": "s", "id": 171752, "pid": 76337, "tid": -914061504, "ts": 1716454224069962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224126313, "dur": 5, "args": { "External id": 171754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171754, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171754, "pid": 5, "tid": 7, "ts": 1716454224126313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224069971, "dur": 12, "args": { "External id": 171754, "cbid": 211, "correlation": 171754 } }, { "ph": "s", "id": 171754, "pid": 76337, "tid": -914061504, "ts": 1716454224069971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224126320, "dur": 6, "args": { "External id": 171760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171760, "pid": 5, "tid": 7, "ts": 1716454224126320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070006, "dur": 10, "args": { "External id": 171760, "cbid": 211, "correlation": 171760 } }, { "ph": "s", "id": 171760, "pid": 76337, "tid": -914061504, "ts": 1716454224070006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224126327, "dur": 5, "args": { "External id": 171768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171768, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171768, "pid": 5, "tid": 7, "ts": 1716454224126327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070036, "dur": 8, "args": { "External id": 171768, "cbid": 211, "correlation": 171768 } }, { "ph": "s", "id": 171768, "pid": 76337, "tid": -914061504, "ts": 1716454224070036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224126334, "dur": 4, "args": { "External id": 171776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171776, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171776, "pid": 5, "tid": 7, "ts": 1716454224126334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070066, "dur": 8, "args": { "External id": 171776, "cbid": 211, "correlation": 171776 } }, { "ph": "s", "id": 171776, "pid": 76337, "tid": -914061504, "ts": 1716454224070066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224126340, "dur": 11, "args": { "External id": 171785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171785, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171785, "pid": 5, "tid": 7, "ts": 1716454224126340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070108, "dur": 10, "args": { "External id": 171785, "cbid": 211, "correlation": 171785 } }, { "ph": "s", "id": 171785, "pid": 76337, "tid": -914061504, "ts": 1716454224070108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224126352, "dur": 12, "args": { "External id": 171805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171805, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171805, "pid": 5, "tid": 7, "ts": 1716454224126352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070177, "dur": 11, "args": { "External id": 171805, "cbid": 211, "correlation": 171805 } }, { "ph": "s", "id": 171805, "pid": 76337, "tid": -914061504, "ts": 1716454224070177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224126366, "dur": 4, "args": { "External id": 171817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171817, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171817, "pid": 5, "tid": 7, "ts": 1716454224126366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070198, "dur": 6, "args": { "External id": 171817, "cbid": 211, "correlation": 171817 } }, { "ph": "s", "id": 171817, "pid": 76337, "tid": -914061504, "ts": 1716454224070198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224126371, "dur": 11, "args": { "External id": 171820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171820, "pid": 5, "tid": 7, "ts": 1716454224126371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070217, "dur": 6, "args": { "External id": 171820, "cbid": 211, "correlation": 171820 } }, { "ph": "s", "id": 171820, "pid": 76337, "tid": -914061504, "ts": 1716454224070217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224126384, "dur": 6, "args": { "External id": 171829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171829, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171829, "pid": 5, "tid": 7, "ts": 1716454224126384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070254, "dur": 10, "args": { "External id": 171829, "cbid": 211, "correlation": 171829 } }, { "ph": "s", "id": 171829, "pid": 76337, "tid": -914061504, "ts": 1716454224070254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224070305, "dur": 0, "args": { "External id": 171839, "cbid": 317, "correlation": 171839 } }, { "ph": "f", "id": 171839, "pid": 76337, "tid": -914061504, "ts": 1716454224070305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224070306, "dur": 0, "args": { "External id": 171840, "cbid": 203, "correlation": 171840 } }, { "ph": "f", "id": 171840, "pid": 76337, "tid": -914061504, "ts": 1716454224070306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224070307, "dur": 0, "args": { "External id": 171841, "cbid": 205, "correlation": 171841 } }, { "ph": "f", "id": 171841, "pid": 76337, "tid": -914061504, "ts": 1716454224070307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224126391, "dur": 7, "args": { "External id": 171845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171845, "pid": 5, "tid": 7, "ts": 1716454224126391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070321, "dur": 11, "args": { "External id": 171845, "cbid": 211, "correlation": 171845 } }, { "ph": "s", "id": 171845, "pid": 76337, "tid": -914061504, "ts": 1716454224070321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224126399, "dur": 322, "args": { "External id": 171847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171847, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171847, "pid": 5, "tid": 7, "ts": 1716454224126399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070334, "dur": 5, "args": { "External id": 171847, "cbid": 211, "correlation": 171847 } }, { "ph": "s", "id": 171847, "pid": 76337, "tid": -914061504, "ts": 1716454224070334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224126723, "dur": 1, "args": { "External id": 171849, "device": 5, "context": 1, "stream": 7, "correlation": 171849, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 171849, "pid": 5, "tid": 7, "ts": 1716454224126723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224070345, "dur": 6, "args": { "External id": 171849, "cbid": 51, "correlation": 171849 } }, { "ph": "s", "id": 171849, "pid": 76337, "tid": -914061504, "ts": 1716454224070345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224126727, "dur": 500, "args": { "External id": 171850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171850, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171850, "pid": 5, "tid": 7, "ts": 1716454224126727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070353, "dur": 6, "args": { "External id": 171850, "cbid": 211, "correlation": 171850 } }, { "ph": "s", "id": 171850, "pid": 76337, "tid": -914061504, "ts": 1716454224070353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127228, "dur": 5, "args": { "External id": 171852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171852, "pid": 5, "tid": 7, "ts": 1716454224127228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070363, "dur": 5, "args": { "External id": 171852, "cbid": 211, "correlation": 171852 } }, { "ph": "s", "id": 171852, "pid": 76337, "tid": -914061504, "ts": 1716454224070363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224127235, "dur": 6, "args": { "External id": 171858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171858, "pid": 5, "tid": 7, "ts": 1716454224127235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070390, "dur": 9, "args": { "External id": 171858, "cbid": 211, "correlation": 171858 } }, { "ph": "s", "id": 171858, "pid": 76337, "tid": -914061504, "ts": 1716454224070390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224127242, "dur": 4, "args": { "External id": 171866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171866, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 171866, "pid": 5, "tid": 7, "ts": 1716454224127242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070434, "dur": 9, "args": { "External id": 171866, "cbid": 211, "correlation": 171866 } }, { "ph": "s", "id": 171866, "pid": 76337, "tid": -914061504, "ts": 1716454224070434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224070496, "dur": 1, "args": { "External id": 171882, "cbid": 251, "correlation": 171882 } }, { "ph": "f", "id": 171882, "pid": 76337, "tid": -914061504, "ts": 1716454224070496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224070501, "dur": 0, "args": { "External id": 171884, "cbid": 251, "correlation": 171884 } }, { "ph": "f", "id": 171884, "pid": 76337, "tid": -914061504, "ts": 1716454224070501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224127248, "dur": 12, "args": { "External id": 171885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171885, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171885, "pid": 5, "tid": 7, "ts": 1716454224127248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070503, "dur": 12, "args": { "External id": 171885, "cbid": 211, "correlation": 171885 } }, { "ph": "s", "id": 171885, "pid": 76337, "tid": -914061504, "ts": 1716454224070503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224127261, "dur": 5, "args": { "External id": 171887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171887, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171887, "pid": 5, "tid": 7, "ts": 1716454224127261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070517, "dur": 6, "args": { "External id": 171887, "cbid": 211, "correlation": 171887 } }, { "ph": "s", "id": 171887, "pid": 76337, "tid": -914061504, "ts": 1716454224070517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224127268, "dur": 6, "args": { "External id": 171897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171897, "pid": 5, "tid": 7, "ts": 1716454224127268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070574, "dur": 12, "args": { "External id": 171897, "cbid": 211, "correlation": 171897 } }, { "ph": "s", "id": 171897, "pid": 76337, "tid": -914061504, "ts": 1716454224070574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224127275, "dur": 9, "args": { "External id": 171917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171917, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 171917, "pid": 5, "tid": 7, "ts": 1716454224127275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070639, "dur": 11, "args": { "External id": 171917, "cbid": 211, "correlation": 171917 } }, { "ph": "s", "id": 171917, "pid": 76337, "tid": -914061504, "ts": 1716454224070639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224127286, "dur": 4, "args": { "External id": 171929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171929, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 171929, "pid": 5, "tid": 7, "ts": 1716454224127286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070659, "dur": 7, "args": { "External id": 171929, "cbid": 211, "correlation": 171929 } }, { "ph": "s", "id": 171929, "pid": 76337, "tid": -914061504, "ts": 1716454224070659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224127291, "dur": 7, "args": { "External id": 171932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171932, "pid": 5, "tid": 7, "ts": 1716454224127291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070680, "dur": 6, "args": { "External id": 171932, "cbid": 211, "correlation": 171932 } }, { "ph": "s", "id": 171932, "pid": 76337, "tid": -914061504, "ts": 1716454224070680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224127299, "dur": 5, "args": { "External id": 171941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171941, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171941, "pid": 5, "tid": 7, "ts": 1716454224127299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070720, "dur": 10, "args": { "External id": 171941, "cbid": 211, "correlation": 171941 } }, { "ph": "s", "id": 171941, "pid": 76337, "tid": -914061504, "ts": 1716454224070720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224070782, "dur": 0, "args": { "External id": 171951, "cbid": 317, "correlation": 171951 } }, { "ph": "f", "id": 171951, "pid": 76337, "tid": -914061504, "ts": 1716454224070782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224070783, "dur": 0, "args": { "External id": 171952, "cbid": 203, "correlation": 171952 } }, { "ph": "f", "id": 171952, "pid": 76337, "tid": -914061504, "ts": 1716454224070783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224070783, "dur": 0, "args": { "External id": 171953, "cbid": 205, "correlation": 171953 } }, { "ph": "f", "id": 171953, "pid": 76337, "tid": -914061504, "ts": 1716454224070783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127304, "dur": 5, "args": { "External id": 171957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171957, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171957, "pid": 5, "tid": 7, "ts": 1716454224127304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070797, "dur": 12, "args": { "External id": 171957, "cbid": 211, "correlation": 171957 } }, { "ph": "s", "id": 171957, "pid": 76337, "tid": -914061504, "ts": 1716454224070797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127311, "dur": 164, "args": { "External id": 171959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171959, "pid": 5, "tid": 7, "ts": 1716454224127311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070811, "dur": 5, "args": { "External id": 171959, "cbid": 211, "correlation": 171959 } }, { "ph": "s", "id": 171959, "pid": 76337, "tid": -914061504, "ts": 1716454224070811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224127477, "dur": 1, "args": { "External id": 171961, "device": 5, "context": 1, "stream": 7, "correlation": 171961, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 171961, "pid": 5, "tid": 7, "ts": 1716454224127477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224070822, "dur": 6, "args": { "External id": 171961, "cbid": 51, "correlation": 171961 } }, { "ph": "s", "id": 171961, "pid": 76337, "tid": -914061504, "ts": 1716454224070822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224127480, "dur": 262, "args": { "External id": 171962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171962, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 171962, "pid": 5, "tid": 7, "ts": 1716454224127480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070830, "dur": 6, "args": { "External id": 171962, "cbid": 211, "correlation": 171962 } }, { "ph": "s", "id": 171962, "pid": 76337, "tid": -914061504, "ts": 1716454224070830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127744, "dur": 6, "args": { "External id": 171964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171964, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171964, "pid": 5, "tid": 7, "ts": 1716454224127744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070839, "dur": 5, "args": { "External id": 171964, "cbid": 211, "correlation": 171964 } }, { "ph": "s", "id": 171964, "pid": 76337, "tid": -914061504, "ts": 1716454224070839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224127751, "dur": 6, "args": { "External id": 171970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171970, "pid": 5, "tid": 7, "ts": 1716454224127751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070867, "dur": 9, "args": { "External id": 171970, "cbid": 211, "correlation": 171970 } }, { "ph": "s", "id": 171970, "pid": 76337, "tid": -914061504, "ts": 1716454224070867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224070926, "dur": 0, "args": { "External id": 171980, "cbid": 317, "correlation": 171980 } }, { "ph": "f", "id": 171980, "pid": 76337, "tid": -914061504, "ts": 1716454224070926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224070927, "dur": 0, "args": { "External id": 171981, "cbid": 203, "correlation": 171981 } }, { "ph": "f", "id": 171981, "pid": 76337, "tid": -914061504, "ts": 1716454224070927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224070928, "dur": 0, "args": { "External id": 171982, "cbid": 205, "correlation": 171982 } }, { "ph": "f", "id": 171982, "pid": 76337, "tid": -914061504, "ts": 1716454224070928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127758, "dur": 8, "args": { "External id": 171986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171986, "pid": 5, "tid": 7, "ts": 1716454224127758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070939, "dur": 11, "args": { "External id": 171986, "cbid": 211, "correlation": 171986 } }, { "ph": "s", "id": 171986, "pid": 76337, "tid": -914061504, "ts": 1716454224070939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224127768, "dur": 3, "args": { "External id": 171988, "device": 5, "context": 1, "stream": 7, "correlation": 171988, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 171988, "pid": 5, "tid": 7, "ts": 1716454224127768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224070956, "dur": 9, "args": { "External id": 171988, "cbid": 51, "correlation": 171988 } }, { "ph": "s", "id": 171988, "pid": 76337, "tid": -914061504, "ts": 1716454224070956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224127772, "dur": 96, "args": { "External id": 171989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171989, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 171989, "pid": 5, "tid": 7, "ts": 1716454224127772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070966, "dur": 14, "args": { "External id": 171989, "cbid": 211, "correlation": 171989 } }, { "ph": "s", "id": 171989, "pid": 76337, "tid": -914061504, "ts": 1716454224070966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127869, "dur": 6, "args": { "External id": 171991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 171991, "pid": 5, "tid": 7, "ts": 1716454224127869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224070984, "dur": 6, "args": { "External id": 171991, "cbid": 211, "correlation": 171991 } }, { "ph": "s", "id": 171991, "pid": 76337, "tid": -914061504, "ts": 1716454224070984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224127877, "dur": 6, "args": { "External id": 171997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 171997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 171997, "pid": 5, "tid": 7, "ts": 1716454224127877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071011, "dur": 9, "args": { "External id": 171997, "cbid": 211, "correlation": 171997 } }, { "ph": "s", "id": 171997, "pid": 76337, "tid": -914061504, "ts": 1716454224071011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224127884, "dur": 5, "args": { "External id": 172005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172005, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 172005, "pid": 5, "tid": 7, "ts": 1716454224127884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071041, "dur": 8, "args": { "External id": 172005, "cbid": 211, "correlation": 172005 } }, { "ph": "s", "id": 172005, "pid": 76337, "tid": -914061504, "ts": 1716454224071041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224127891, "dur": 5, "args": { "External id": 172013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172013, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 172013, "pid": 5, "tid": 7, "ts": 1716454224127891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071070, "dur": 9, "args": { "External id": 172013, "cbid": 211, "correlation": 172013 } }, { "ph": "s", "id": 172013, "pid": 76337, "tid": -914061504, "ts": 1716454224071070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224127897, "dur": 14, "args": { "External id": 172024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172024, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172024, "pid": 5, "tid": 7, "ts": 1716454224127897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071150, "dur": 14, "args": { "External id": 172024, "cbid": 211, "correlation": 172024 } }, { "ph": "s", "id": 172024, "pid": 76337, "tid": -914061504, "ts": 1716454224071150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224071207, "dur": 0, "args": { "External id": 172034, "cbid": 317, "correlation": 172034 } }, { "ph": "f", "id": 172034, "pid": 76337, "tid": -914061504, "ts": 1716454224071207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224071208, "dur": 0, "args": { "External id": 172035, "cbid": 203, "correlation": 172035 } }, { "ph": "f", "id": 172035, "pid": 76337, "tid": -914061504, "ts": 1716454224071208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224071208, "dur": 0, "args": { "External id": 172036, "cbid": 205, "correlation": 172036 } }, { "ph": "f", "id": 172036, "pid": 76337, "tid": -914061504, "ts": 1716454224071208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127912, "dur": 9, "args": { "External id": 172040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172040, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172040, "pid": 5, "tid": 7, "ts": 1716454224127912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071223, "dur": 11, "args": { "External id": 172040, "cbid": 211, "correlation": 172040 } }, { "ph": "s", "id": 172040, "pid": 76337, "tid": -914061504, "ts": 1716454224071223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224127922, "dur": 164, "args": { "External id": 172042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172042, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172042, "pid": 5, "tid": 7, "ts": 1716454224127922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071236, "dur": 5, "args": { "External id": 172042, "cbid": 211, "correlation": 172042 } }, { "ph": "s", "id": 172042, "pid": 76337, "tid": -914061504, "ts": 1716454224071236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224128089, "dur": 1, "args": { "External id": 172044, "device": 5, "context": 1, "stream": 7, "correlation": 172044, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 172044, "pid": 5, "tid": 7, "ts": 1716454224128089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224071247, "dur": 6, "args": { "External id": 172044, "cbid": 51, "correlation": 172044 } }, { "ph": "s", "id": 172044, "pid": 76337, "tid": -914061504, "ts": 1716454224071247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224128092, "dur": 654, "args": { "External id": 172045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172045, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172045, "pid": 5, "tid": 7, "ts": 1716454224128092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071254, "dur": 6, "args": { "External id": 172045, "cbid": 211, "correlation": 172045 } }, { "ph": "s", "id": 172045, "pid": 76337, "tid": -914061504, "ts": 1716454224071254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224128747, "dur": 12, "args": { "External id": 172047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172047, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172047, "pid": 5, "tid": 7, "ts": 1716454224128747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071265, "dur": 6, "args": { "External id": 172047, "cbid": 211, "correlation": 172047 } }, { "ph": "s", "id": 172047, "pid": 76337, "tid": -914061504, "ts": 1716454224071265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224128760, "dur": 15, "args": { "External id": 172053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172053, "pid": 5, "tid": 7, "ts": 1716454224128760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071295, "dur": 9, "args": { "External id": 172053, "cbid": 211, "correlation": 172053 } }, { "ph": "s", "id": 172053, "pid": 76337, "tid": -914061504, "ts": 1716454224071295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224128776, "dur": 30, "args": { "External id": 172062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172062, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172062, "pid": 5, "tid": 7, "ts": 1716454224128776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071388, "dur": 13, "args": { "External id": 172062, "cbid": 211, "correlation": 172062 } }, { "ph": "s", "id": 172062, "pid": 76337, "tid": -914061504, "ts": 1716454224071388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224128808, "dur": 30, "args": { "External id": 172082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172082, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172082, "pid": 5, "tid": 7, "ts": 1716454224128808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071458, "dur": 11, "args": { "External id": 172082, "cbid": 211, "correlation": 172082 } }, { "ph": "s", "id": 172082, "pid": 76337, "tid": -914061504, "ts": 1716454224071458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224128839, "dur": 4, "args": { "External id": 172094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172094, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172094, "pid": 5, "tid": 7, "ts": 1716454224128839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071479, "dur": 6, "args": { "External id": 172094, "cbid": 211, "correlation": 172094 } }, { "ph": "s", "id": 172094, "pid": 76337, "tid": -914061504, "ts": 1716454224071479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224128845, "dur": 31, "args": { "External id": 172097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172097, "pid": 5, "tid": 7, "ts": 1716454224128845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071497, "dur": 6, "args": { "External id": 172097, "cbid": 211, "correlation": 172097 } }, { "ph": "s", "id": 172097, "pid": 76337, "tid": -914061504, "ts": 1716454224071497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224128877, "dur": 22, "args": { "External id": 172106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172106, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172106, "pid": 5, "tid": 7, "ts": 1716454224128877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071536, "dur": 9, "args": { "External id": 172106, "cbid": 211, "correlation": 172106 } }, { "ph": "s", "id": 172106, "pid": 76337, "tid": -914061504, "ts": 1716454224071536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224071588, "dur": 0, "args": { "External id": 172116, "cbid": 317, "correlation": 172116 } }, { "ph": "f", "id": 172116, "pid": 76337, "tid": -914061504, "ts": 1716454224071588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224071589, "dur": 0, "args": { "External id": 172117, "cbid": 203, "correlation": 172117 } }, { "ph": "f", "id": 172117, "pid": 76337, "tid": -914061504, "ts": 1716454224071589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224071590, "dur": 0, "args": { "External id": 172118, "cbid": 205, "correlation": 172118 } }, { "ph": "f", "id": 172118, "pid": 76337, "tid": -914061504, "ts": 1716454224071590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224128900, "dur": 22, "args": { "External id": 172122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172122, "pid": 5, "tid": 7, "ts": 1716454224128900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071605, "dur": 11, "args": { "External id": 172122, "cbid": 211, "correlation": 172122 } }, { "ph": "s", "id": 172122, "pid": 76337, "tid": -914061504, "ts": 1716454224071605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224128923, "dur": 324, "args": { "External id": 172124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172124, "pid": 5, "tid": 7, "ts": 1716454224128923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071619, "dur": 6, "args": { "External id": 172124, "cbid": 211, "correlation": 172124 } }, { "ph": "s", "id": 172124, "pid": 76337, "tid": -914061504, "ts": 1716454224071619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224129249, "dur": 1, "args": { "External id": 172126, "device": 5, "context": 1, "stream": 7, "correlation": 172126, "bytes": 960, "memory bandwidth (GB/s)": 0.5878750765462339 } }, { "ph": "f", "id": 172126, "pid": 5, "tid": 7, "ts": 1716454224129249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224071631, "dur": 6, "args": { "External id": 172126, "cbid": 51, "correlation": 172126 } }, { "ph": "s", "id": 172126, "pid": 76337, "tid": -914061504, "ts": 1716454224071631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224129253, "dur": 1246, "args": { "External id": 172127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172127, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172127, "pid": 5, "tid": 7, "ts": 1716454224129253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071638, "dur": 6, "args": { "External id": 172127, "cbid": 211, "correlation": 172127 } }, { "ph": "s", "id": 172127, "pid": 76337, "tid": -914061504, "ts": 1716454224071638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224130500, "dur": 12, "args": { "External id": 172129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172129, "pid": 5, "tid": 7, "ts": 1716454224130500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071648, "dur": 5, "args": { "External id": 172129, "cbid": 211, "correlation": 172129 } }, { "ph": "s", "id": 172129, "pid": 76337, "tid": -914061504, "ts": 1716454224071648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224130514, "dur": 15, "args": { "External id": 172135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172135, "pid": 5, "tid": 7, "ts": 1716454224130514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071676, "dur": 8, "args": { "External id": 172135, "cbid": 211, "correlation": 172135 } }, { "ph": "s", "id": 172135, "pid": 76337, "tid": -914061504, "ts": 1716454224071676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224130530, "dur": 4, "args": { "External id": 172143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172143, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 172143, "pid": 5, "tid": 7, "ts": 1716454224130530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071719, "dur": 10, "args": { "External id": 172143, "cbid": 211, "correlation": 172143 } }, { "ph": "s", "id": 172143, "pid": 76337, "tid": -914061504, "ts": 1716454224071719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224071784, "dur": 1, "args": { "External id": 172159, "cbid": 251, "correlation": 172159 } }, { "ph": "f", "id": 172159, "pid": 76337, "tid": -914061504, "ts": 1716454224071784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224071789, "dur": 0, "args": { "External id": 172161, "cbid": 251, "correlation": 172161 } }, { "ph": "f", "id": 172161, "pid": 76337, "tid": -914061504, "ts": 1716454224071789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224130535, "dur": 13, "args": { "External id": 172162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172162, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172162, "pid": 5, "tid": 7, "ts": 1716454224130535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071791, "dur": 12, "args": { "External id": 172162, "cbid": 211, "correlation": 172162 } }, { "ph": "s", "id": 172162, "pid": 76337, "tid": -914061504, "ts": 1716454224071791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224130549, "dur": 5, "args": { "External id": 172164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172164, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172164, "pid": 5, "tid": 7, "ts": 1716454224130549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071804, "dur": 5, "args": { "External id": 172164, "cbid": 211, "correlation": 172164 } }, { "ph": "s", "id": 172164, "pid": 76337, "tid": -914061504, "ts": 1716454224071804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224130555, "dur": 17, "args": { "External id": 172174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172174, "pid": 5, "tid": 7, "ts": 1716454224130555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071862, "dur": 12, "args": { "External id": 172174, "cbid": 211, "correlation": 172174 } }, { "ph": "s", "id": 172174, "pid": 76337, "tid": -914061504, "ts": 1716454224071862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224130574, "dur": 17, "args": { "External id": 172194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172194, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172194, "pid": 5, "tid": 7, "ts": 1716454224130574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071928, "dur": 11, "args": { "External id": 172194, "cbid": 211, "correlation": 172194 } }, { "ph": "s", "id": 172194, "pid": 76337, "tid": -914061504, "ts": 1716454224071928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224130592, "dur": 5, "args": { "External id": 172206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172206, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 172206, "pid": 5, "tid": 7, "ts": 1716454224130592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071949, "dur": 6, "args": { "External id": 172206, "cbid": 211, "correlation": 172206 } }, { "ph": "s", "id": 172206, "pid": 76337, "tid": -914061504, "ts": 1716454224071949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224130598, "dur": 17, "args": { "External id": 172209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172209, "pid": 5, "tid": 7, "ts": 1716454224130598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224071967, "dur": 15, "args": { "External id": 172209, "cbid": 211, "correlation": 172209 } }, { "ph": "s", "id": 172209, "pid": 76337, "tid": -914061504, "ts": 1716454224071967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224130616, "dur": 11, "args": { "External id": 172218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172218, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172218, "pid": 5, "tid": 7, "ts": 1716454224130616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072016, "dur": 11, "args": { "External id": 172218, "cbid": 211, "correlation": 172218 } }, { "ph": "s", "id": 172218, "pid": 76337, "tid": -914061504, "ts": 1716454224072016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224072080, "dur": 0, "args": { "External id": 172228, "cbid": 317, "correlation": 172228 } }, { "ph": "f", "id": 172228, "pid": 76337, "tid": -914061504, "ts": 1716454224072080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224072081, "dur": 0, "args": { "External id": 172229, "cbid": 203, "correlation": 172229 } }, { "ph": "f", "id": 172229, "pid": 76337, "tid": -914061504, "ts": 1716454224072081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224072082, "dur": 0, "args": { "External id": 172230, "cbid": 205, "correlation": 172230 } }, { "ph": "f", "id": 172230, "pid": 76337, "tid": -914061504, "ts": 1716454224072082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224130629, "dur": 11, "args": { "External id": 172234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172234, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172234, "pid": 5, "tid": 7, "ts": 1716454224130629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072095, "dur": 12, "args": { "External id": 172234, "cbid": 211, "correlation": 172234 } }, { "ph": "s", "id": 172234, "pid": 76337, "tid": -914061504, "ts": 1716454224072095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224130641, "dur": 163, "args": { "External id": 172236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172236, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172236, "pid": 5, "tid": 7, "ts": 1716454224130641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072109, "dur": 5, "args": { "External id": 172236, "cbid": 211, "correlation": 172236 } }, { "ph": "s", "id": 172236, "pid": 76337, "tid": -914061504, "ts": 1716454224072109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224130807, "dur": 1, "args": { "External id": 172238, "device": 5, "context": 1, "stream": 7, "correlation": 172238, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 172238, "pid": 5, "tid": 7, "ts": 1716454224130807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224072120, "dur": 7, "args": { "External id": 172238, "cbid": 51, "correlation": 172238 } }, { "ph": "s", "id": 172238, "pid": 76337, "tid": -914061504, "ts": 1716454224072120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224130810, "dur": 652, "args": { "External id": 172239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172239, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172239, "pid": 5, "tid": 7, "ts": 1716454224130810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072128, "dur": 7, "args": { "External id": 172239, "cbid": 211, "correlation": 172239 } }, { "ph": "s", "id": 172239, "pid": 76337, "tid": -914061504, "ts": 1716454224072128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224131463, "dur": 13, "args": { "External id": 172241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172241, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172241, "pid": 5, "tid": 7, "ts": 1716454224131463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072138, "dur": 5, "args": { "External id": 172241, "cbid": 211, "correlation": 172241 } }, { "ph": "s", "id": 172241, "pid": 76337, "tid": -914061504, "ts": 1716454224072138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224131478, "dur": 15, "args": { "External id": 172247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172247, "pid": 5, "tid": 7, "ts": 1716454224131478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072166, "dur": 9, "args": { "External id": 172247, "cbid": 211, "correlation": 172247 } }, { "ph": "s", "id": 172247, "pid": 76337, "tid": -914061504, "ts": 1716454224072166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224072224, "dur": 0, "args": { "External id": 172257, "cbid": 317, "correlation": 172257 } }, { "ph": "f", "id": 172257, "pid": 76337, "tid": -914061504, "ts": 1716454224072224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224072224, "dur": 0, "args": { "External id": 172258, "cbid": 203, "correlation": 172258 } }, { "ph": "f", "id": 172258, "pid": 76337, "tid": -914061504, "ts": 1716454224072224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224072225, "dur": 0, "args": { "External id": 172259, "cbid": 205, "correlation": 172259 } }, { "ph": "f", "id": 172259, "pid": 76337, "tid": -914061504, "ts": 1716454224072225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224131493, "dur": 21, "args": { "External id": 172263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172263, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172263, "pid": 5, "tid": 7, "ts": 1716454224131493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072239, "dur": 11, "args": { "External id": 172263, "cbid": 211, "correlation": 172263 } }, { "ph": "s", "id": 172263, "pid": 76337, "tid": -914061504, "ts": 1716454224072239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224131516, "dur": 4, "args": { "External id": 172265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172265, "pid": 5, "tid": 7, "ts": 1716454224131516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072256, "dur": 6, "args": { "External id": 172265, "cbid": 211, "correlation": 172265 } }, { "ph": "s", "id": 172265, "pid": 76337, "tid": -914061504, "ts": 1716454224072256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224072265, "dur": 0, "args": { "External id": 172266, "cbid": 51, "correlation": 172266 } }, { "ph": "s", "id": 172266, "pid": 76337, "tid": -914061504, "ts": 1716454224072265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224131521, "dur": 176, "args": { "External id": 172267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172267, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 172267, "pid": 5, "tid": 7, "ts": 1716454224131521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072266, "dur": 5, "args": { "External id": 172267, "cbid": 211, "correlation": 172267 } }, { "ph": "s", "id": 172267, "pid": 76337, "tid": -914061504, "ts": 1716454224072266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224131699, "dur": 16, "args": { "External id": 172272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172272, "pid": 5, "tid": 7, "ts": 1716454224131699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072293, "dur": 8, "args": { "External id": 172272, "cbid": 211, "correlation": 172272 } }, { "ph": "s", "id": 172272, "pid": 76337, "tid": -914061504, "ts": 1716454224072293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224131716, "dur": 12, "args": { "External id": 172280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172280, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172280, "pid": 5, "tid": 7, "ts": 1716454224131716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072321, "dur": 8, "args": { "External id": 172280, "cbid": 211, "correlation": 172280 } }, { "ph": "s", "id": 172280, "pid": 76337, "tid": -914061504, "ts": 1716454224072321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224131729, "dur": 11, "args": { "External id": 172288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172288, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172288, "pid": 5, "tid": 7, "ts": 1716454224131729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072350, "dur": 8, "args": { "External id": 172288, "cbid": 211, "correlation": 172288 } }, { "ph": "s", "id": 172288, "pid": 76337, "tid": -914061504, "ts": 1716454224072350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224131741, "dur": 19, "args": { "External id": 172308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172308, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172308, "pid": 5, "tid": 7, "ts": 1716454224131741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072432, "dur": 12, "args": { "External id": 172308, "cbid": 211, "correlation": 172308 } }, { "ph": "s", "id": 172308, "pid": 76337, "tid": -914061504, "ts": 1716454224072432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224131761, "dur": 4, "args": { "External id": 172320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172320, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 172320, "pid": 5, "tid": 7, "ts": 1716454224131761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072454, "dur": 6, "args": { "External id": 172320, "cbid": 211, "correlation": 172320 } }, { "ph": "s", "id": 172320, "pid": 76337, "tid": -914061504, "ts": 1716454224072454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224131767, "dur": 17, "args": { "External id": 172323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172323, "pid": 5, "tid": 7, "ts": 1716454224131767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072471, "dur": 7, "args": { "External id": 172323, "cbid": 211, "correlation": 172323 } }, { "ph": "s", "id": 172323, "pid": 76337, "tid": -914061504, "ts": 1716454224072471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224072528, "dur": 0, "args": { "External id": 172334, "cbid": 317, "correlation": 172334 } }, { "ph": "f", "id": 172334, "pid": 76337, "tid": -914061504, "ts": 1716454224072528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224072529, "dur": 0, "args": { "External id": 172335, "cbid": 203, "correlation": 172335 } }, { "ph": "f", "id": 172335, "pid": 76337, "tid": -914061504, "ts": 1716454224072529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224072530, "dur": 0, "args": { "External id": 172336, "cbid": 205, "correlation": 172336 } }, { "ph": "f", "id": 172336, "pid": 76337, "tid": -914061504, "ts": 1716454224072530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224131785, "dur": 12, "args": { "External id": 172340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172340, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172340, "pid": 5, "tid": 7, "ts": 1716454224131785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072543, "dur": 11, "args": { "External id": 172340, "cbid": 211, "correlation": 172340 } }, { "ph": "s", "id": 172340, "pid": 76337, "tid": -914061504, "ts": 1716454224072543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224131798, "dur": 3, "args": { "External id": 172342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172342, "pid": 5, "tid": 7, "ts": 1716454224131798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072559, "dur": 6, "args": { "External id": 172342, "cbid": 211, "correlation": 172342 } }, { "ph": "s", "id": 172342, "pid": 76337, "tid": -914061504, "ts": 1716454224072559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224072567, "dur": 0, "args": { "External id": 172343, "cbid": 51, "correlation": 172343 } }, { "ph": "s", "id": 172343, "pid": 76337, "tid": -914061504, "ts": 1716454224072567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224131803, "dur": 90, "args": { "External id": 172344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172344, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 172344, "pid": 5, "tid": 7, "ts": 1716454224131803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072568, "dur": 5, "args": { "External id": 172344, "cbid": 211, "correlation": 172344 } }, { "ph": "s", "id": 172344, "pid": 76337, "tid": -914061504, "ts": 1716454224072568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224131895, "dur": 16, "args": { "External id": 172349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172349, "pid": 5, "tid": 7, "ts": 1716454224131895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072595, "dur": 9, "args": { "External id": 172349, "cbid": 211, "correlation": 172349 } }, { "ph": "s", "id": 172349, "pid": 76337, "tid": -914061504, "ts": 1716454224072595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224131912, "dur": 85, "args": { "External id": 172358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172358, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172358, "pid": 5, "tid": 7, "ts": 1716454224131912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072678, "dur": 15, "args": { "External id": 172358, "cbid": 211, "correlation": 172358 } }, { "ph": "s", "id": 172358, "pid": 76337, "tid": -914061504, "ts": 1716454224072678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224131999, "dur": 30, "args": { "External id": 172380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172380, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172380, "pid": 5, "tid": 7, "ts": 1716454224131999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072737, "dur": 10, "args": { "External id": 172380, "cbid": 211, "correlation": 172380 } }, { "ph": "s", "id": 172380, "pid": 76337, "tid": -914061504, "ts": 1716454224072737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224072830, "dur": 1, "args": { "External id": 172391, "cbid": 251, "correlation": 172391 } }, { "ph": "f", "id": 172391, "pid": 76337, "tid": -914061504, "ts": 1716454224072830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224132031, "dur": 167, "args": { "External id": 172392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172392, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172392, "pid": 5, "tid": 7, "ts": 1716454224132031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072836, "dur": 13, "args": { "External id": 172392, "cbid": 211, "correlation": 172392 } }, { "ph": "s", "id": 172392, "pid": 76337, "tid": -914061504, "ts": 1716454224072836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224072907, "dur": 1, "args": { "External id": 172403, "cbid": 251, "correlation": 172403 } }, { "ph": "f", "id": 172403, "pid": 76337, "tid": -914061504, "ts": 1716454224072907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224132199, "dur": 160, "args": { "External id": 172404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172404, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172404, "pid": 5, "tid": 7, "ts": 1716454224132199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072911, "dur": 11, "args": { "External id": 172404, "cbid": 211, "correlation": 172404 } }, { "ph": "s", "id": 172404, "pid": 76337, "tid": -914061504, "ts": 1716454224072911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224072984, "dur": 1, "args": { "External id": 172415, "cbid": 251, "correlation": 172415 } }, { "ph": "f", "id": 172415, "pid": 76337, "tid": -914061504, "ts": 1716454224072984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224132360, "dur": 159, "args": { "External id": 172416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172416, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172416, "pid": 5, "tid": 7, "ts": 1716454224132360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224072989, "dur": 12, "args": { "External id": 172416, "cbid": 211, "correlation": 172416 } }, { "ph": "s", "id": 172416, "pid": 76337, "tid": -914061504, "ts": 1716454224072989, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224132520, "dur": 338, "args": { "External id": 172441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172441, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172441, "pid": 5, "tid": 7, "ts": 1716454224132520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073077, "dur": 14, "args": { "External id": 172441, "cbid": 211, "correlation": 172441 } }, { "ph": "s", "id": 172441, "pid": 76337, "tid": -914061504, "ts": 1716454224073077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073179, "dur": 1, "args": { "External id": 172459, "cbid": 251, "correlation": 172459 } }, { "ph": "f", "id": 172459, "pid": 76337, "tid": -914061504, "ts": 1716454224073179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224132860, "dur": 167, "args": { "External id": 172461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172461, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172461, "pid": 5, "tid": 7, "ts": 1716454224132860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073185, "dur": 14, "args": { "External id": 172461, "cbid": 211, "correlation": 172461 } }, { "ph": "s", "id": 172461, "pid": 76337, "tid": -914061504, "ts": 1716454224073185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224133028, "dur": 20, "args": { "External id": 172469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172469, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172469, "pid": 5, "tid": 7, "ts": 1716454224133028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073256, "dur": 12, "args": { "External id": 172469, "cbid": 211, "correlation": 172469 } }, { "ph": "s", "id": 172469, "pid": 76337, "tid": -914061504, "ts": 1716454224073256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224133049, "dur": 28, "args": { "External id": 172477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172477, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172477, "pid": 5, "tid": 7, "ts": 1716454224133049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073295, "dur": 8, "args": { "External id": 172477, "cbid": 211, "correlation": 172477 } }, { "ph": "s", "id": 172477, "pid": 76337, "tid": -914061504, "ts": 1716454224073295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224133078, "dur": 18, "args": { "External id": 172488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172488, "pid": 5, "tid": 7, "ts": 1716454224133078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073368, "dur": 12, "args": { "External id": 172488, "cbid": 211, "correlation": 172488 } }, { "ph": "s", "id": 172488, "pid": 76337, "tid": -914061504, "ts": 1716454224073368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224133098, "dur": 16, "args": { "External id": 172510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172510, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172510, "pid": 5, "tid": 7, "ts": 1716454224133098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073399, "dur": 8, "args": { "External id": 172510, "cbid": 211, "correlation": 172510 } }, { "ph": "s", "id": 172510, "pid": 76337, "tid": -914061504, "ts": 1716454224073399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073486, "dur": 1, "args": { "External id": 172521, "cbid": 251, "correlation": 172521 } }, { "ph": "f", "id": 172521, "pid": 76337, "tid": -914061504, "ts": 1716454224073486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224133115, "dur": 89, "args": { "External id": 172522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172522, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172522, "pid": 5, "tid": 7, "ts": 1716454224133115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073491, "dur": 14, "args": { "External id": 172522, "cbid": 211, "correlation": 172522 } }, { "ph": "s", "id": 172522, "pid": 76337, "tid": -914061504, "ts": 1716454224073491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073563, "dur": 1, "args": { "External id": 172533, "cbid": 251, "correlation": 172533 } }, { "ph": "f", "id": 172533, "pid": 76337, "tid": -914061504, "ts": 1716454224073563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073567, "dur": 0, "args": { "External id": 172534, "cbid": 251, "correlation": 172534 } }, { "ph": "f", "id": 172534, "pid": 76337, "tid": -914061504, "ts": 1716454224073567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224133206, "dur": 12, "args": { "External id": 172535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172535, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172535, "pid": 5, "tid": 7, "ts": 1716454224133206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073569, "dur": 12, "args": { "External id": 172535, "cbid": 211, "correlation": 172535 } }, { "ph": "s", "id": 172535, "pid": 76337, "tid": -914061504, "ts": 1716454224073569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224133220, "dur": 5, "args": { "External id": 172537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172537, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172537, "pid": 5, "tid": 7, "ts": 1716454224133220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073583, "dur": 6, "args": { "External id": 172537, "cbid": 211, "correlation": 172537 } }, { "ph": "s", "id": 172537, "pid": 76337, "tid": -914061504, "ts": 1716454224073583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073640, "dur": 1, "args": { "External id": 172548, "cbid": 251, "correlation": 172548 } }, { "ph": "f", "id": 172548, "pid": 76337, "tid": -914061504, "ts": 1716454224073640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073643, "dur": 0, "args": { "External id": 172549, "cbid": 251, "correlation": 172549 } }, { "ph": "f", "id": 172549, "pid": 76337, "tid": -914061504, "ts": 1716454224073643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224133226, "dur": 9, "args": { "External id": 172550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172550, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172550, "pid": 5, "tid": 7, "ts": 1716454224133226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073645, "dur": 12, "args": { "External id": 172550, "cbid": 211, "correlation": 172550 } }, { "ph": "s", "id": 172550, "pid": 76337, "tid": -914061504, "ts": 1716454224073645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224133236, "dur": 4, "args": { "External id": 172552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172552, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172552, "pid": 5, "tid": 7, "ts": 1716454224133236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073658, "dur": 6, "args": { "External id": 172552, "cbid": 211, "correlation": 172552 } }, { "ph": "s", "id": 172552, "pid": 76337, "tid": -914061504, "ts": 1716454224073658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224133241, "dur": 56, "args": { "External id": 172577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172577, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172577, "pid": 5, "tid": 7, "ts": 1716454224133241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073735, "dur": 13, "args": { "External id": 172577, "cbid": 211, "correlation": 172577 } }, { "ph": "s", "id": 172577, "pid": 76337, "tid": -914061504, "ts": 1716454224073735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224073833, "dur": 2, "args": { "External id": 172595, "cbid": 251, "correlation": 172595 } }, { "ph": "f", "id": 172595, "pid": 76337, "tid": -914061504, "ts": 1716454224073833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224133298, "dur": 93, "args": { "External id": 172597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172597, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172597, "pid": 5, "tid": 7, "ts": 1716454224133298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073840, "dur": 14, "args": { "External id": 172597, "cbid": 211, "correlation": 172597 } }, { "ph": "s", "id": 172597, "pid": 76337, "tid": -914061504, "ts": 1716454224073840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224133392, "dur": 9, "args": { "External id": 172605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172605, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172605, "pid": 5, "tid": 7, "ts": 1716454224133392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073910, "dur": 12, "args": { "External id": 172605, "cbid": 211, "correlation": 172605 } }, { "ph": "s", "id": 172605, "pid": 76337, "tid": -914061504, "ts": 1716454224073910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224133402, "dur": 21, "args": { "External id": 172613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172613, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172613, "pid": 5, "tid": 7, "ts": 1716454224133402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224073951, "dur": 9, "args": { "External id": 172613, "cbid": 211, "correlation": 172613 } }, { "ph": "s", "id": 172613, "pid": 76337, "tid": -914061504, "ts": 1716454224073951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224133425, "dur": 18, "args": { "External id": 172635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172635, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172635, "pid": 5, "tid": 7, "ts": 1716454224133425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074011, "dur": 11, "args": { "External id": 172635, "cbid": 211, "correlation": 172635 } }, { "ph": "s", "id": 172635, "pid": 76337, "tid": -914061504, "ts": 1716454224074011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224074100, "dur": 1, "args": { "External id": 172651, "cbid": 251, "correlation": 172651 } }, { "ph": "f", "id": 172651, "pid": 76337, "tid": -914061504, "ts": 1716454224074100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224074105, "dur": 0, "args": { "External id": 172653, "cbid": 251, "correlation": 172653 } }, { "ph": "f", "id": 172653, "pid": 76337, "tid": -914061504, "ts": 1716454224074105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224133444, "dur": 496, "args": { "External id": 172654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172654, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172654, "pid": 5, "tid": 7, "ts": 1716454224133444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074107, "dur": 13, "args": { "External id": 172654, "cbid": 211, "correlation": 172654 } }, { "ph": "s", "id": 172654, "pid": 76337, "tid": -914061504, "ts": 1716454224074107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224133941, "dur": 67, "args": { "External id": 172662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172662, "pid": 5, "tid": 7, "ts": 1716454224133941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074174, "dur": 13, "args": { "External id": 172662, "cbid": 211, "correlation": 172662 } }, { "ph": "s", "id": 172662, "pid": 76337, "tid": -914061504, "ts": 1716454224074174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224134010, "dur": 66, "args": { "External id": 172670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172670, "pid": 5, "tid": 7, "ts": 1716454224134010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074207, "dur": 8, "args": { "External id": 172670, "cbid": 211, "correlation": 172670 } }, { "ph": "s", "id": 172670, "pid": 76337, "tid": -914061504, "ts": 1716454224074207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224074286, "dur": 1, "args": { "External id": 172686, "cbid": 251, "correlation": 172686 } }, { "ph": "f", "id": 172686, "pid": 76337, "tid": -914061504, "ts": 1716454224074286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224134078, "dur": 1, "args": { "External id": 172688, "device": 5, "context": 1, "stream": 7, "correlation": 172688, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 172688, "pid": 5, "tid": 7, "ts": 1716454224134078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224074291, "dur": 11, "args": { "External id": 172688, "cbid": 51, "correlation": 172688 } }, { "ph": "s", "id": 172688, "pid": 76337, "tid": -914061504, "ts": 1716454224074291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224134082, "dur": 272, "args": { "External id": 172689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172689, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172689, "pid": 5, "tid": 7, "ts": 1716454224134082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074304, "dur": 11, "args": { "External id": 172689, "cbid": 211, "correlation": 172689 } }, { "ph": "s", "id": 172689, "pid": 76337, "tid": -914061504, "ts": 1716454224074304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224134355, "dur": 14, "args": { "External id": 172697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172697, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172697, "pid": 5, "tid": 7, "ts": 1716454224134355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074346, "dur": 11, "args": { "External id": 172697, "cbid": 211, "correlation": 172697 } }, { "ph": "s", "id": 172697, "pid": 76337, "tid": -914061504, "ts": 1716454224074346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224134370, "dur": 38, "args": { "External id": 172708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172708, "pid": 5, "tid": 7, "ts": 1716454224134370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074415, "dur": 12, "args": { "External id": 172708, "cbid": 211, "correlation": 172708 } }, { "ph": "s", "id": 172708, "pid": 76337, "tid": -914061504, "ts": 1716454224074415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224074480, "dur": 0, "args": { "External id": 172720, "cbid": 317, "correlation": 172720 } }, { "ph": "f", "id": 172720, "pid": 76337, "tid": -914061504, "ts": 1716454224074480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224074481, "dur": 0, "args": { "External id": 172721, "cbid": 203, "correlation": 172721 } }, { "ph": "f", "id": 172721, "pid": 76337, "tid": -914061504, "ts": 1716454224074481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224074482, "dur": 0, "args": { "External id": 172722, "cbid": 205, "correlation": 172722 } }, { "ph": "f", "id": 172722, "pid": 76337, "tid": -914061504, "ts": 1716454224074482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224134410, "dur": 15, "args": { "External id": 172726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172726, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172726, "pid": 5, "tid": 7, "ts": 1716454224134410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074497, "dur": 12, "args": { "External id": 172726, "cbid": 211, "correlation": 172726 } }, { "ph": "s", "id": 172726, "pid": 76337, "tid": -914061504, "ts": 1716454224074497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224134426, "dur": 4, "args": { "External id": 172728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172728, "pid": 5, "tid": 7, "ts": 1716454224134426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074514, "dur": 6, "args": { "External id": 172728, "cbid": 211, "correlation": 172728 } }, { "ph": "s", "id": 172728, "pid": 76337, "tid": -914061504, "ts": 1716454224074514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224074522, "dur": 0, "args": { "External id": 172729, "cbid": 51, "correlation": 172729 } }, { "ph": "s", "id": 172729, "pid": 76337, "tid": -914061504, "ts": 1716454224074522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224134431, "dur": 98, "args": { "External id": 172730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172730, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 172730, "pid": 5, "tid": 7, "ts": 1716454224134431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074523, "dur": 5, "args": { "External id": 172730, "cbid": 211, "correlation": 172730 } }, { "ph": "s", "id": 172730, "pid": 76337, "tid": -914061504, "ts": 1716454224074523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224134531, "dur": 16, "args": { "External id": 172735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172735, "pid": 5, "tid": 7, "ts": 1716454224134531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074551, "dur": 8, "args": { "External id": 172735, "cbid": 211, "correlation": 172735 } }, { "ph": "s", "id": 172735, "pid": 76337, "tid": -914061504, "ts": 1716454224074551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224134548, "dur": 12, "args": { "External id": 172743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172743, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172743, "pid": 5, "tid": 7, "ts": 1716454224134548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074582, "dur": 8, "args": { "External id": 172743, "cbid": 211, "correlation": 172743 } }, { "ph": "s", "id": 172743, "pid": 76337, "tid": -914061504, "ts": 1716454224074582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224134561, "dur": 30, "args": { "External id": 172752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172752, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172752, "pid": 5, "tid": 7, "ts": 1716454224134561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074622, "dur": 10, "args": { "External id": 172752, "cbid": 211, "correlation": 172752 } }, { "ph": "s", "id": 172752, "pid": 76337, "tid": -914061504, "ts": 1716454224074622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224134592, "dur": 30, "args": { "External id": 172772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172772, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172772, "pid": 5, "tid": 7, "ts": 1716454224134592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074693, "dur": 12, "args": { "External id": 172772, "cbid": 211, "correlation": 172772 } }, { "ph": "s", "id": 172772, "pid": 76337, "tid": -914061504, "ts": 1716454224074693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224134624, "dur": 5, "args": { "External id": 172784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172784, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172784, "pid": 5, "tid": 7, "ts": 1716454224134624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074715, "dur": 6, "args": { "External id": 172784, "cbid": 211, "correlation": 172784 } }, { "ph": "s", "id": 172784, "pid": 76337, "tid": -914061504, "ts": 1716454224074715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224134631, "dur": 31, "args": { "External id": 172787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172787, "pid": 5, "tid": 7, "ts": 1716454224134631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074733, "dur": 7, "args": { "External id": 172787, "cbid": 211, "correlation": 172787 } }, { "ph": "s", "id": 172787, "pid": 76337, "tid": -914061504, "ts": 1716454224074733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224134663, "dur": 22, "args": { "External id": 172796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172796, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172796, "pid": 5, "tid": 7, "ts": 1716454224134663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074772, "dur": 10, "args": { "External id": 172796, "cbid": 211, "correlation": 172796 } }, { "ph": "s", "id": 172796, "pid": 76337, "tid": -914061504, "ts": 1716454224074772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224074825, "dur": 0, "args": { "External id": 172806, "cbid": 317, "correlation": 172806 } }, { "ph": "f", "id": 172806, "pid": 76337, "tid": -914061504, "ts": 1716454224074825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224074826, "dur": 0, "args": { "External id": 172807, "cbid": 203, "correlation": 172807 } }, { "ph": "f", "id": 172807, "pid": 76337, "tid": -914061504, "ts": 1716454224074826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224074827, "dur": 0, "args": { "External id": 172808, "cbid": 205, "correlation": 172808 } }, { "ph": "f", "id": 172808, "pid": 76337, "tid": -914061504, "ts": 1716454224074827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224134686, "dur": 22, "args": { "External id": 172812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172812, "pid": 5, "tid": 7, "ts": 1716454224134686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074840, "dur": 12, "args": { "External id": 172812, "cbid": 211, "correlation": 172812 } }, { "ph": "s", "id": 172812, "pid": 76337, "tid": -914061504, "ts": 1716454224074840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224134710, "dur": 323, "args": { "External id": 172814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172814, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172814, "pid": 5, "tid": 7, "ts": 1716454224134710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074855, "dur": 5, "args": { "External id": 172814, "cbid": 211, "correlation": 172814 } }, { "ph": "s", "id": 172814, "pid": 76337, "tid": -914061504, "ts": 1716454224074855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224135035, "dur": 1, "args": { "External id": 172816, "device": 5, "context": 1, "stream": 7, "correlation": 172816, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 172816, "pid": 5, "tid": 7, "ts": 1716454224135035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224074866, "dur": 7, "args": { "External id": 172816, "cbid": 51, "correlation": 172816 } }, { "ph": "s", "id": 172816, "pid": 76337, "tid": -914061504, "ts": 1716454224074866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224135039, "dur": 1265, "args": { "External id": 172817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172817, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172817, "pid": 5, "tid": 7, "ts": 1716454224135039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074874, "dur": 6, "args": { "External id": 172817, "cbid": 211, "correlation": 172817 } }, { "ph": "s", "id": 172817, "pid": 76337, "tid": -914061504, "ts": 1716454224074874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224136305, "dur": 13, "args": { "External id": 172819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172819, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172819, "pid": 5, "tid": 7, "ts": 1716454224136305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074884, "dur": 5, "args": { "External id": 172819, "cbid": 211, "correlation": 172819 } }, { "ph": "s", "id": 172819, "pid": 76337, "tid": -914061504, "ts": 1716454224074884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224136319, "dur": 15, "args": { "External id": 172825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172825, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172825, "pid": 5, "tid": 7, "ts": 1716454224136319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074913, "dur": 8, "args": { "External id": 172825, "cbid": 211, "correlation": 172825 } }, { "ph": "s", "id": 172825, "pid": 76337, "tid": -914061504, "ts": 1716454224074913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224136336, "dur": 4, "args": { "External id": 172833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172833, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 172833, "pid": 5, "tid": 7, "ts": 1716454224136336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224074958, "dur": 9, "args": { "External id": 172833, "cbid": 211, "correlation": 172833 } }, { "ph": "s", "id": 172833, "pid": 76337, "tid": -914061504, "ts": 1716454224074958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224075031, "dur": 1, "args": { "External id": 172849, "cbid": 251, "correlation": 172849 } }, { "ph": "f", "id": 172849, "pid": 76337, "tid": -914061504, "ts": 1716454224075031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224075036, "dur": 0, "args": { "External id": 172851, "cbid": 251, "correlation": 172851 } }, { "ph": "f", "id": 172851, "pid": 76337, "tid": -914061504, "ts": 1716454224075036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224136341, "dur": 13, "args": { "External id": 172852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172852, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172852, "pid": 5, "tid": 7, "ts": 1716454224136341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075038, "dur": 12, "args": { "External id": 172852, "cbid": 211, "correlation": 172852 } }, { "ph": "s", "id": 172852, "pid": 76337, "tid": -914061504, "ts": 1716454224075038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224136356, "dur": 6, "args": { "External id": 172854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172854, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172854, "pid": 5, "tid": 7, "ts": 1716454224136356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075051, "dur": 6, "args": { "External id": 172854, "cbid": 211, "correlation": 172854 } }, { "ph": "s", "id": 172854, "pid": 76337, "tid": -914061504, "ts": 1716454224075051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224136362, "dur": 17, "args": { "External id": 172864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172864, "pid": 5, "tid": 7, "ts": 1716454224136362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075111, "dur": 11, "args": { "External id": 172864, "cbid": 211, "correlation": 172864 } }, { "ph": "s", "id": 172864, "pid": 76337, "tid": -914061504, "ts": 1716454224075111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224136381, "dur": 17, "args": { "External id": 172884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172884, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172884, "pid": 5, "tid": 7, "ts": 1716454224136381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075177, "dur": 10, "args": { "External id": 172884, "cbid": 211, "correlation": 172884 } }, { "ph": "s", "id": 172884, "pid": 76337, "tid": -914061504, "ts": 1716454224075177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224136399, "dur": 5, "args": { "External id": 172896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172896, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 172896, "pid": 5, "tid": 7, "ts": 1716454224136399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075197, "dur": 6, "args": { "External id": 172896, "cbid": 211, "correlation": 172896 } }, { "ph": "s", "id": 172896, "pid": 76337, "tid": -914061504, "ts": 1716454224075197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224136405, "dur": 17, "args": { "External id": 172899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172899, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172899, "pid": 5, "tid": 7, "ts": 1716454224136405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075216, "dur": 6, "args": { "External id": 172899, "cbid": 211, "correlation": 172899 } }, { "ph": "s", "id": 172899, "pid": 76337, "tid": -914061504, "ts": 1716454224075216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224136424, "dur": 11, "args": { "External id": 172908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172908, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172908, "pid": 5, "tid": 7, "ts": 1716454224136424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075256, "dur": 10, "args": { "External id": 172908, "cbid": 211, "correlation": 172908 } }, { "ph": "s", "id": 172908, "pid": 76337, "tid": -914061504, "ts": 1716454224075256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224075318, "dur": 0, "args": { "External id": 172918, "cbid": 317, "correlation": 172918 } }, { "ph": "f", "id": 172918, "pid": 76337, "tid": -914061504, "ts": 1716454224075318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224075319, "dur": 0, "args": { "External id": 172919, "cbid": 203, "correlation": 172919 } }, { "ph": "f", "id": 172919, "pid": 76337, "tid": -914061504, "ts": 1716454224075319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224075319, "dur": 0, "args": { "External id": 172920, "cbid": 205, "correlation": 172920 } }, { "ph": "f", "id": 172920, "pid": 76337, "tid": -914061504, "ts": 1716454224075319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224136436, "dur": 12, "args": { "External id": 172924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172924, "pid": 5, "tid": 7, "ts": 1716454224136436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075333, "dur": 12, "args": { "External id": 172924, "cbid": 211, "correlation": 172924 } }, { "ph": "s", "id": 172924, "pid": 76337, "tid": -914061504, "ts": 1716454224075333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224136449, "dur": 164, "args": { "External id": 172926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172926, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172926, "pid": 5, "tid": 7, "ts": 1716454224136449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075347, "dur": 5, "args": { "External id": 172926, "cbid": 211, "correlation": 172926 } }, { "ph": "s", "id": 172926, "pid": 76337, "tid": -914061504, "ts": 1716454224075347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224136615, "dur": 1, "args": { "External id": 172928, "device": 5, "context": 1, "stream": 7, "correlation": 172928, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 172928, "pid": 5, "tid": 7, "ts": 1716454224136615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224075358, "dur": 7, "args": { "External id": 172928, "cbid": 51, "correlation": 172928 } }, { "ph": "s", "id": 172928, "pid": 76337, "tid": -914061504, "ts": 1716454224075358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224136619, "dur": 653, "args": { "External id": 172929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172929, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 172929, "pid": 5, "tid": 7, "ts": 1716454224136619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075366, "dur": 6, "args": { "External id": 172929, "cbid": 211, "correlation": 172929 } }, { "ph": "s", "id": 172929, "pid": 76337, "tid": -914061504, "ts": 1716454224075366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224137273, "dur": 13, "args": { "External id": 172931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172931, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172931, "pid": 5, "tid": 7, "ts": 1716454224137273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075375, "dur": 5, "args": { "External id": 172931, "cbid": 211, "correlation": 172931 } }, { "ph": "s", "id": 172931, "pid": 76337, "tid": -914061504, "ts": 1716454224075375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224137287, "dur": 15, "args": { "External id": 172937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172937, "pid": 5, "tid": 7, "ts": 1716454224137287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075404, "dur": 10, "args": { "External id": 172937, "cbid": 211, "correlation": 172937 } }, { "ph": "s", "id": 172937, "pid": 76337, "tid": -914061504, "ts": 1716454224075404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224075464, "dur": 0, "args": { "External id": 172947, "cbid": 317, "correlation": 172947 } }, { "ph": "f", "id": 172947, "pid": 76337, "tid": -914061504, "ts": 1716454224075464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224075465, "dur": 0, "args": { "External id": 172948, "cbid": 203, "correlation": 172948 } }, { "ph": "f", "id": 172948, "pid": 76337, "tid": -914061504, "ts": 1716454224075465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224075465, "dur": 0, "args": { "External id": 172949, "cbid": 205, "correlation": 172949 } }, { "ph": "f", "id": 172949, "pid": 76337, "tid": -914061504, "ts": 1716454224075465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224137303, "dur": 21, "args": { "External id": 172953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172953, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172953, "pid": 5, "tid": 7, "ts": 1716454224137303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075477, "dur": 12, "args": { "External id": 172953, "cbid": 211, "correlation": 172953 } }, { "ph": "s", "id": 172953, "pid": 76337, "tid": -914061504, "ts": 1716454224075477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224137326, "dur": 4, "args": { "External id": 172955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 172955, "pid": 5, "tid": 7, "ts": 1716454224137326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075492, "dur": 6, "args": { "External id": 172955, "cbid": 211, "correlation": 172955 } }, { "ph": "s", "id": 172955, "pid": 76337, "tid": -914061504, "ts": 1716454224075492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224075501, "dur": 0, "args": { "External id": 172956, "cbid": 51, "correlation": 172956 } }, { "ph": "s", "id": 172956, "pid": 76337, "tid": -914061504, "ts": 1716454224075501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224137331, "dur": 173, "args": { "External id": 172957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172957, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 172957, "pid": 5, "tid": 7, "ts": 1716454224137331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075502, "dur": 5, "args": { "External id": 172957, "cbid": 211, "correlation": 172957 } }, { "ph": "s", "id": 172957, "pid": 76337, "tid": -914061504, "ts": 1716454224075502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224137505, "dur": 16, "args": { "External id": 172962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172962, "pid": 5, "tid": 7, "ts": 1716454224137505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075527, "dur": 8, "args": { "External id": 172962, "cbid": 211, "correlation": 172962 } }, { "ph": "s", "id": 172962, "pid": 76337, "tid": -914061504, "ts": 1716454224075527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224137522, "dur": 12, "args": { "External id": 172970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172970, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172970, "pid": 5, "tid": 7, "ts": 1716454224137522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075556, "dur": 8, "args": { "External id": 172970, "cbid": 211, "correlation": 172970 } }, { "ph": "s", "id": 172970, "pid": 76337, "tid": -914061504, "ts": 1716454224075556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224137535, "dur": 10, "args": { "External id": 172978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172978, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 172978, "pid": 5, "tid": 7, "ts": 1716454224137535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075584, "dur": 8, "args": { "External id": 172978, "cbid": 211, "correlation": 172978 } }, { "ph": "s", "id": 172978, "pid": 76337, "tid": -914061504, "ts": 1716454224075584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224137547, "dur": 18, "args": { "External id": 172998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 172998, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 172998, "pid": 5, "tid": 7, "ts": 1716454224137547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075669, "dur": 12, "args": { "External id": 172998, "cbid": 211, "correlation": 172998 } }, { "ph": "s", "id": 172998, "pid": 76337, "tid": -914061504, "ts": 1716454224075669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224137566, "dur": 5, "args": { "External id": 173010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173010, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 173010, "pid": 5, "tid": 7, "ts": 1716454224137566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075691, "dur": 6, "args": { "External id": 173010, "cbid": 211, "correlation": 173010 } }, { "ph": "s", "id": 173010, "pid": 76337, "tid": -914061504, "ts": 1716454224075691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224137572, "dur": 17, "args": { "External id": 173013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173013, "pid": 5, "tid": 7, "ts": 1716454224137572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075709, "dur": 7, "args": { "External id": 173013, "cbid": 211, "correlation": 173013 } }, { "ph": "s", "id": 173013, "pid": 76337, "tid": -914061504, "ts": 1716454224075709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224075767, "dur": 0, "args": { "External id": 173024, "cbid": 317, "correlation": 173024 } }, { "ph": "f", "id": 173024, "pid": 76337, "tid": -914061504, "ts": 1716454224075767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224075768, "dur": 0, "args": { "External id": 173025, "cbid": 203, "correlation": 173025 } }, { "ph": "f", "id": 173025, "pid": 76337, "tid": -914061504, "ts": 1716454224075768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224075769, "dur": 0, "args": { "External id": 173026, "cbid": 205, "correlation": 173026 } }, { "ph": "f", "id": 173026, "pid": 76337, "tid": -914061504, "ts": 1716454224075769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224137591, "dur": 12, "args": { "External id": 173030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173030, "pid": 5, "tid": 7, "ts": 1716454224137591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075782, "dur": 12, "args": { "External id": 173030, "cbid": 211, "correlation": 173030 } }, { "ph": "s", "id": 173030, "pid": 76337, "tid": -914061504, "ts": 1716454224075782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224137604, "dur": 3, "args": { "External id": 173032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173032, "pid": 5, "tid": 7, "ts": 1716454224137604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075797, "dur": 6, "args": { "External id": 173032, "cbid": 211, "correlation": 173032 } }, { "ph": "s", "id": 173032, "pid": 76337, "tid": -914061504, "ts": 1716454224075797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224075805, "dur": 0, "args": { "External id": 173033, "cbid": 51, "correlation": 173033 } }, { "ph": "s", "id": 173033, "pid": 76337, "tid": -914061504, "ts": 1716454224075805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224137609, "dur": 91, "args": { "External id": 173034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173034, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 173034, "pid": 5, "tid": 7, "ts": 1716454224137609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075806, "dur": 5, "args": { "External id": 173034, "cbid": 211, "correlation": 173034 } }, { "ph": "s", "id": 173034, "pid": 76337, "tid": -914061504, "ts": 1716454224075806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224137701, "dur": 16, "args": { "External id": 173039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173039, "pid": 5, "tid": 7, "ts": 1716454224137701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075833, "dur": 8, "args": { "External id": 173039, "cbid": 211, "correlation": 173039 } }, { "ph": "s", "id": 173039, "pid": 76337, "tid": -914061504, "ts": 1716454224075833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224137718, "dur": 84, "args": { "External id": 173048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173048, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173048, "pid": 5, "tid": 7, "ts": 1716454224137718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075914, "dur": 15, "args": { "External id": 173048, "cbid": 211, "correlation": 173048 } }, { "ph": "s", "id": 173048, "pid": 76337, "tid": -914061504, "ts": 1716454224075914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224137803, "dur": 31, "args": { "External id": 173070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173070, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173070, "pid": 5, "tid": 7, "ts": 1716454224137803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224075971, "dur": 20, "args": { "External id": 173070, "cbid": 211, "correlation": 173070 } }, { "ph": "s", "id": 173070, "pid": 76337, "tid": -914061504, "ts": 1716454224075971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076070, "dur": 1, "args": { "External id": 173081, "cbid": 251, "correlation": 173081 } }, { "ph": "f", "id": 173081, "pid": 76337, "tid": -914061504, "ts": 1716454224076070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224137835, "dur": 166, "args": { "External id": 173082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173082, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173082, "pid": 5, "tid": 7, "ts": 1716454224137835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076076, "dur": 13, "args": { "External id": 173082, "cbid": 211, "correlation": 173082 } }, { "ph": "s", "id": 173082, "pid": 76337, "tid": -914061504, "ts": 1716454224076076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076145, "dur": 1, "args": { "External id": 173093, "cbid": 251, "correlation": 173093 } }, { "ph": "f", "id": 173093, "pid": 76337, "tid": -914061504, "ts": 1716454224076145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224138003, "dur": 160, "args": { "External id": 173094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173094, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173094, "pid": 5, "tid": 7, "ts": 1716454224138003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076150, "dur": 11, "args": { "External id": 173094, "cbid": 211, "correlation": 173094 } }, { "ph": "s", "id": 173094, "pid": 76337, "tid": -914061504, "ts": 1716454224076150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076214, "dur": 1, "args": { "External id": 173105, "cbid": 251, "correlation": 173105 } }, { "ph": "f", "id": 173105, "pid": 76337, "tid": -914061504, "ts": 1716454224076214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224138164, "dur": 160, "args": { "External id": 173106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173106, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173106, "pid": 5, "tid": 7, "ts": 1716454224138164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076219, "dur": 12, "args": { "External id": 173106, "cbid": 211, "correlation": 173106 } }, { "ph": "s", "id": 173106, "pid": 76337, "tid": -914061504, "ts": 1716454224076219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224138325, "dur": 339, "args": { "External id": 173131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173131, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173131, "pid": 5, "tid": 7, "ts": 1716454224138325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076304, "dur": 13, "args": { "External id": 173131, "cbid": 211, "correlation": 173131 } }, { "ph": "s", "id": 173131, "pid": 76337, "tid": -914061504, "ts": 1716454224076304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076404, "dur": 1, "args": { "External id": 173149, "cbid": 251, "correlation": 173149 } }, { "ph": "f", "id": 173149, "pid": 76337, "tid": -914061504, "ts": 1716454224076404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224138665, "dur": 166, "args": { "External id": 173151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173151, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173151, "pid": 5, "tid": 7, "ts": 1716454224138665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076410, "dur": 13, "args": { "External id": 173151, "cbid": 211, "correlation": 173151 } }, { "ph": "s", "id": 173151, "pid": 76337, "tid": -914061504, "ts": 1716454224076410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224138833, "dur": 19, "args": { "External id": 173159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173159, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173159, "pid": 5, "tid": 7, "ts": 1716454224138833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076482, "dur": 12, "args": { "External id": 173159, "cbid": 211, "correlation": 173159 } }, { "ph": "s", "id": 173159, "pid": 76337, "tid": -914061504, "ts": 1716454224076482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224138853, "dur": 27, "args": { "External id": 173167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173167, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173167, "pid": 5, "tid": 7, "ts": 1716454224138853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076520, "dur": 9, "args": { "External id": 173167, "cbid": 211, "correlation": 173167 } }, { "ph": "s", "id": 173167, "pid": 76337, "tid": -914061504, "ts": 1716454224076520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224138882, "dur": 18, "args": { "External id": 173178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173178, "pid": 5, "tid": 7, "ts": 1716454224138882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076592, "dur": 12, "args": { "External id": 173178, "cbid": 211, "correlation": 173178 } }, { "ph": "s", "id": 173178, "pid": 76337, "tid": -914061504, "ts": 1716454224076592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224138901, "dur": 16, "args": { "External id": 173200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173200, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173200, "pid": 5, "tid": 7, "ts": 1716454224138901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076622, "dur": 9, "args": { "External id": 173200, "cbid": 211, "correlation": 173200 } }, { "ph": "s", "id": 173200, "pid": 76337, "tid": -914061504, "ts": 1716454224076622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076709, "dur": 1, "args": { "External id": 173211, "cbid": 251, "correlation": 173211 } }, { "ph": "f", "id": 173211, "pid": 76337, "tid": -914061504, "ts": 1716454224076709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224138919, "dur": 90, "args": { "External id": 173212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173212, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173212, "pid": 5, "tid": 7, "ts": 1716454224138919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076714, "dur": 14, "args": { "External id": 173212, "cbid": 211, "correlation": 173212 } }, { "ph": "s", "id": 173212, "pid": 76337, "tid": -914061504, "ts": 1716454224076714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076784, "dur": 1, "args": { "External id": 173223, "cbid": 251, "correlation": 173223 } }, { "ph": "f", "id": 173223, "pid": 76337, "tid": -914061504, "ts": 1716454224076784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076788, "dur": 0, "args": { "External id": 173224, "cbid": 251, "correlation": 173224 } }, { "ph": "f", "id": 173224, "pid": 76337, "tid": -914061504, "ts": 1716454224076788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224139010, "dur": 12, "args": { "External id": 173225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173225, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173225, "pid": 5, "tid": 7, "ts": 1716454224139010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076789, "dur": 12, "args": { "External id": 173225, "cbid": 211, "correlation": 173225 } }, { "ph": "s", "id": 173225, "pid": 76337, "tid": -914061504, "ts": 1716454224076789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224139023, "dur": 6, "args": { "External id": 173227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173227, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173227, "pid": 5, "tid": 7, "ts": 1716454224139023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076803, "dur": 6, "args": { "External id": 173227, "cbid": 211, "correlation": 173227 } }, { "ph": "s", "id": 173227, "pid": 76337, "tid": -914061504, "ts": 1716454224076803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076860, "dur": 1, "args": { "External id": 173238, "cbid": 251, "correlation": 173238 } }, { "ph": "f", "id": 173238, "pid": 76337, "tid": -914061504, "ts": 1716454224076860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224076863, "dur": 0, "args": { "External id": 173239, "cbid": 251, "correlation": 173239 } }, { "ph": "f", "id": 173239, "pid": 76337, "tid": -914061504, "ts": 1716454224076863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224139030, "dur": 9, "args": { "External id": 173240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173240, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173240, "pid": 5, "tid": 7, "ts": 1716454224139030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076865, "dur": 12, "args": { "External id": 173240, "cbid": 211, "correlation": 173240 } }, { "ph": "s", "id": 173240, "pid": 76337, "tid": -914061504, "ts": 1716454224076865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224139040, "dur": 4, "args": { "External id": 173242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173242, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173242, "pid": 5, "tid": 7, "ts": 1716454224139040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076878, "dur": 6, "args": { "External id": 173242, "cbid": 211, "correlation": 173242 } }, { "ph": "s", "id": 173242, "pid": 76337, "tid": -914061504, "ts": 1716454224076878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224139045, "dur": 56, "args": { "External id": 173267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173267, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173267, "pid": 5, "tid": 7, "ts": 1716454224139045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224076956, "dur": 12, "args": { "External id": 173267, "cbid": 211, "correlation": 173267 } }, { "ph": "s", "id": 173267, "pid": 76337, "tid": -914061504, "ts": 1716454224076956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224077065, "dur": 1, "args": { "External id": 173285, "cbid": 251, "correlation": 173285 } }, { "ph": "f", "id": 173285, "pid": 76337, "tid": -914061504, "ts": 1716454224077065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224139102, "dur": 91, "args": { "External id": 173287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173287, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173287, "pid": 5, "tid": 7, "ts": 1716454224139102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077071, "dur": 15, "args": { "External id": 173287, "cbid": 211, "correlation": 173287 } }, { "ph": "s", "id": 173287, "pid": 76337, "tid": -914061504, "ts": 1716454224077071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224139194, "dur": 10, "args": { "External id": 173295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173295, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173295, "pid": 5, "tid": 7, "ts": 1716454224139194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077142, "dur": 12, "args": { "External id": 173295, "cbid": 211, "correlation": 173295 } }, { "ph": "s", "id": 173295, "pid": 76337, "tid": -914061504, "ts": 1716454224077142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224139205, "dur": 22, "args": { "External id": 173303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173303, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173303, "pid": 5, "tid": 7, "ts": 1716454224139205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077184, "dur": 9, "args": { "External id": 173303, "cbid": 211, "correlation": 173303 } }, { "ph": "s", "id": 173303, "pid": 76337, "tid": -914061504, "ts": 1716454224077184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224139228, "dur": 17, "args": { "External id": 173325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173325, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173325, "pid": 5, "tid": 7, "ts": 1716454224139228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077235, "dur": 10, "args": { "External id": 173325, "cbid": 211, "correlation": 173325 } }, { "ph": "s", "id": 173325, "pid": 76337, "tid": -914061504, "ts": 1716454224077235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224077320, "dur": 1, "args": { "External id": 173341, "cbid": 251, "correlation": 173341 } }, { "ph": "f", "id": 173341, "pid": 76337, "tid": -914061504, "ts": 1716454224077320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224077325, "dur": 0, "args": { "External id": 173343, "cbid": 251, "correlation": 173343 } }, { "ph": "f", "id": 173343, "pid": 76337, "tid": -914061504, "ts": 1716454224077325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224139247, "dur": 497, "args": { "External id": 173344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173344, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173344, "pid": 5, "tid": 7, "ts": 1716454224139247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077327, "dur": 14, "args": { "External id": 173344, "cbid": 211, "correlation": 173344 } }, { "ph": "s", "id": 173344, "pid": 76337, "tid": -914061504, "ts": 1716454224077327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224139745, "dur": 66, "args": { "External id": 173352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173352, "pid": 5, "tid": 7, "ts": 1716454224139745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077395, "dur": 12, "args": { "External id": 173352, "cbid": 211, "correlation": 173352 } }, { "ph": "s", "id": 173352, "pid": 76337, "tid": -914061504, "ts": 1716454224077395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224139813, "dur": 67, "args": { "External id": 173360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173360, "pid": 5, "tid": 7, "ts": 1716454224139813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077425, "dur": 9, "args": { "External id": 173360, "cbid": 211, "correlation": 173360 } }, { "ph": "s", "id": 173360, "pid": 76337, "tid": -914061504, "ts": 1716454224077425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224077506, "dur": 1, "args": { "External id": 173376, "cbid": 251, "correlation": 173376 } }, { "ph": "f", "id": 173376, "pid": 76337, "tid": -914061504, "ts": 1716454224077506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224139882, "dur": 1, "args": { "External id": 173378, "device": 5, "context": 1, "stream": 7, "correlation": 173378, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 173378, "pid": 5, "tid": 7, "ts": 1716454224139882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224077511, "dur": 9, "args": { "External id": 173378, "cbid": 51, "correlation": 173378 } }, { "ph": "s", "id": 173378, "pid": 76337, "tid": -914061504, "ts": 1716454224077511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224139886, "dur": 271, "args": { "External id": 173379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173379, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173379, "pid": 5, "tid": 7, "ts": 1716454224139886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077522, "dur": 12, "args": { "External id": 173379, "cbid": 211, "correlation": 173379 } }, { "ph": "s", "id": 173379, "pid": 76337, "tid": -914061504, "ts": 1716454224077522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224140158, "dur": 14, "args": { "External id": 173387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173387, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173387, "pid": 5, "tid": 7, "ts": 1716454224140158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077564, "dur": 10, "args": { "External id": 173387, "cbid": 211, "correlation": 173387 } }, { "ph": "s", "id": 173387, "pid": 76337, "tid": -914061504, "ts": 1716454224077564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224140173, "dur": 38, "args": { "External id": 173398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173398, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173398, "pid": 5, "tid": 7, "ts": 1716454224140173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077633, "dur": 12, "args": { "External id": 173398, "cbid": 211, "correlation": 173398 } }, { "ph": "s", "id": 173398, "pid": 76337, "tid": -914061504, "ts": 1716454224077633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224077695, "dur": 0, "args": { "External id": 173410, "cbid": 317, "correlation": 173410 } }, { "ph": "f", "id": 173410, "pid": 76337, "tid": -914061504, "ts": 1716454224077695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224077696, "dur": 0, "args": { "External id": 173411, "cbid": 203, "correlation": 173411 } }, { "ph": "f", "id": 173411, "pid": 76337, "tid": -914061504, "ts": 1716454224077696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224077697, "dur": 0, "args": { "External id": 173412, "cbid": 205, "correlation": 173412 } }, { "ph": "f", "id": 173412, "pid": 76337, "tid": -914061504, "ts": 1716454224077697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224140212, "dur": 15, "args": { "External id": 173416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173416, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173416, "pid": 5, "tid": 7, "ts": 1716454224140212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077712, "dur": 12, "args": { "External id": 173416, "cbid": 211, "correlation": 173416 } }, { "ph": "s", "id": 173416, "pid": 76337, "tid": -914061504, "ts": 1716454224077712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224140228, "dur": 4, "args": { "External id": 173418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173418, "pid": 5, "tid": 7, "ts": 1716454224140228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077728, "dur": 6, "args": { "External id": 173418, "cbid": 211, "correlation": 173418 } }, { "ph": "s", "id": 173418, "pid": 76337, "tid": -914061504, "ts": 1716454224077728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224077737, "dur": 0, "args": { "External id": 173419, "cbid": 51, "correlation": 173419 } }, { "ph": "s", "id": 173419, "pid": 76337, "tid": -914061504, "ts": 1716454224077737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224140233, "dur": 96, "args": { "External id": 173420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173420, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 173420, "pid": 5, "tid": 7, "ts": 1716454224140233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077738, "dur": 5, "args": { "External id": 173420, "cbid": 211, "correlation": 173420 } }, { "ph": "s", "id": 173420, "pid": 76337, "tid": -914061504, "ts": 1716454224077738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224140331, "dur": 17, "args": { "External id": 173425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173425, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173425, "pid": 5, "tid": 7, "ts": 1716454224140331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077765, "dur": 9, "args": { "External id": 173425, "cbid": 211, "correlation": 173425 } }, { "ph": "s", "id": 173425, "pid": 76337, "tid": -914061504, "ts": 1716454224077765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224140348, "dur": 12, "args": { "External id": 173433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173433, "pid": 5, "tid": 7, "ts": 1716454224140348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077797, "dur": 8, "args": { "External id": 173433, "cbid": 211, "correlation": 173433 } }, { "ph": "s", "id": 173433, "pid": 76337, "tid": -914061504, "ts": 1716454224077797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224140361, "dur": 25, "args": { "External id": 173442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173442, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173442, "pid": 5, "tid": 7, "ts": 1716454224140361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077837, "dur": 10, "args": { "External id": 173442, "cbid": 211, "correlation": 173442 } }, { "ph": "s", "id": 173442, "pid": 76337, "tid": -914061504, "ts": 1716454224077837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224140388, "dur": 25, "args": { "External id": 173462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173462, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 173462, "pid": 5, "tid": 7, "ts": 1716454224140388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077907, "dur": 12, "args": { "External id": 173462, "cbid": 211, "correlation": 173462 } }, { "ph": "s", "id": 173462, "pid": 76337, "tid": -914061504, "ts": 1716454224077907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224140414, "dur": 5, "args": { "External id": 173474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173474, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 173474, "pid": 5, "tid": 7, "ts": 1716454224140414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077929, "dur": 7, "args": { "External id": 173474, "cbid": 211, "correlation": 173474 } }, { "ph": "s", "id": 173474, "pid": 76337, "tid": -914061504, "ts": 1716454224077929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224140420, "dur": 24, "args": { "External id": 173477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173477, "pid": 5, "tid": 7, "ts": 1716454224140420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077948, "dur": 6, "args": { "External id": 173477, "cbid": 211, "correlation": 173477 } }, { "ph": "s", "id": 173477, "pid": 76337, "tid": -914061504, "ts": 1716454224077948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224140446, "dur": 17, "args": { "External id": 173486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173486, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173486, "pid": 5, "tid": 7, "ts": 1716454224140446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224077995, "dur": 11, "args": { "External id": 173486, "cbid": 211, "correlation": 173486 } }, { "ph": "s", "id": 173486, "pid": 76337, "tid": -914061504, "ts": 1716454224077995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224078048, "dur": 0, "args": { "External id": 173496, "cbid": 317, "correlation": 173496 } }, { "ph": "f", "id": 173496, "pid": 76337, "tid": -914061504, "ts": 1716454224078048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224078049, "dur": 0, "args": { "External id": 173497, "cbid": 203, "correlation": 173497 } }, { "ph": "f", "id": 173497, "pid": 76337, "tid": -914061504, "ts": 1716454224078049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224078050, "dur": 0, "args": { "External id": 173498, "cbid": 205, "correlation": 173498 } }, { "ph": "f", "id": 173498, "pid": 76337, "tid": -914061504, "ts": 1716454224078050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224140464, "dur": 18, "args": { "External id": 173502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173502, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173502, "pid": 5, "tid": 7, "ts": 1716454224140464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078065, "dur": 11, "args": { "External id": 173502, "cbid": 211, "correlation": 173502 } }, { "ph": "s", "id": 173502, "pid": 76337, "tid": -914061504, "ts": 1716454224078065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224140483, "dur": 241, "args": { "External id": 173504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173504, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173504, "pid": 5, "tid": 7, "ts": 1716454224140483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078079, "dur": 6, "args": { "External id": 173504, "cbid": 211, "correlation": 173504 } }, { "ph": "s", "id": 173504, "pid": 76337, "tid": -914061504, "ts": 1716454224078079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224140726, "dur": 1, "args": { "External id": 173506, "device": 5, "context": 1, "stream": 7, "correlation": 173506, "bytes": 960, "memory bandwidth (GB/s)": 0.5263157894736842 } }, { "ph": "f", "id": 173506, "pid": 5, "tid": 7, "ts": 1716454224140726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224078091, "dur": 8, "args": { "External id": 173506, "cbid": 51, "correlation": 173506 } }, { "ph": "s", "id": 173506, "pid": 76337, "tid": -914061504, "ts": 1716454224078091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224140730, "dur": 810, "args": { "External id": 173507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173507, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173507, "pid": 5, "tid": 7, "ts": 1716454224140730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078100, "dur": 6, "args": { "External id": 173507, "cbid": 211, "correlation": 173507 } }, { "ph": "s", "id": 173507, "pid": 76337, "tid": -914061504, "ts": 1716454224078100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224141541, "dur": 13, "args": { "External id": 173509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173509, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173509, "pid": 5, "tid": 7, "ts": 1716454224141541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078111, "dur": 5, "args": { "External id": 173509, "cbid": 211, "correlation": 173509 } }, { "ph": "s", "id": 173509, "pid": 76337, "tid": -914061504, "ts": 1716454224078111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224141556, "dur": 15, "args": { "External id": 173515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173515, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173515, "pid": 5, "tid": 7, "ts": 1716454224141556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078139, "dur": 9, "args": { "External id": 173515, "cbid": 211, "correlation": 173515 } }, { "ph": "s", "id": 173515, "pid": 76337, "tid": -914061504, "ts": 1716454224078139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224141572, "dur": 4, "args": { "External id": 173523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173523, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 173523, "pid": 5, "tid": 7, "ts": 1716454224141572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078184, "dur": 9, "args": { "External id": 173523, "cbid": 211, "correlation": 173523 } }, { "ph": "s", "id": 173523, "pid": 76337, "tid": -914061504, "ts": 1716454224078184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224078250, "dur": 1, "args": { "External id": 173539, "cbid": 251, "correlation": 173539 } }, { "ph": "f", "id": 173539, "pid": 76337, "tid": -914061504, "ts": 1716454224078250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224078255, "dur": 0, "args": { "External id": 173541, "cbid": 251, "correlation": 173541 } }, { "ph": "f", "id": 173541, "pid": 76337, "tid": -914061504, "ts": 1716454224078255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224141577, "dur": 14, "args": { "External id": 173542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173542, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173542, "pid": 5, "tid": 7, "ts": 1716454224141577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078256, "dur": 11, "args": { "External id": 173542, "cbid": 211, "correlation": 173542 } }, { "ph": "s", "id": 173542, "pid": 76337, "tid": -914061504, "ts": 1716454224078256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224141592, "dur": 5, "args": { "External id": 173544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173544, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173544, "pid": 5, "tid": 7, "ts": 1716454224141592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078269, "dur": 5, "args": { "External id": 173544, "cbid": 211, "correlation": 173544 } }, { "ph": "s", "id": 173544, "pid": 76337, "tid": -914061504, "ts": 1716454224078269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224141599, "dur": 17, "args": { "External id": 173554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173554, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173554, "pid": 5, "tid": 7, "ts": 1716454224141599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078327, "dur": 12, "args": { "External id": 173554, "cbid": 211, "correlation": 173554 } }, { "ph": "s", "id": 173554, "pid": 76337, "tid": -914061504, "ts": 1716454224078327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224141616, "dur": 19, "args": { "External id": 173574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173574, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 173574, "pid": 5, "tid": 7, "ts": 1716454224141616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078392, "dur": 11, "args": { "External id": 173574, "cbid": 211, "correlation": 173574 } }, { "ph": "s", "id": 173574, "pid": 76337, "tid": -914061504, "ts": 1716454224078392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224141636, "dur": 4, "args": { "External id": 173586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173586, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 173586, "pid": 5, "tid": 7, "ts": 1716454224141636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078413, "dur": 6, "args": { "External id": 173586, "cbid": 211, "correlation": 173586 } }, { "ph": "s", "id": 173586, "pid": 76337, "tid": -914061504, "ts": 1716454224078413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224141642, "dur": 17, "args": { "External id": 173589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173589, "pid": 5, "tid": 7, "ts": 1716454224141642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078431, "dur": 6, "args": { "External id": 173589, "cbid": 211, "correlation": 173589 } }, { "ph": "s", "id": 173589, "pid": 76337, "tid": -914061504, "ts": 1716454224078431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224141661, "dur": 10, "args": { "External id": 173598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173598, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173598, "pid": 5, "tid": 7, "ts": 1716454224141661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078471, "dur": 10, "args": { "External id": 173598, "cbid": 211, "correlation": 173598 } }, { "ph": "s", "id": 173598, "pid": 76337, "tid": -914061504, "ts": 1716454224078471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224078534, "dur": 0, "args": { "External id": 173608, "cbid": 317, "correlation": 173608 } }, { "ph": "f", "id": 173608, "pid": 76337, "tid": -914061504, "ts": 1716454224078534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224078535, "dur": 0, "args": { "External id": 173609, "cbid": 203, "correlation": 173609 } }, { "ph": "f", "id": 173609, "pid": 76337, "tid": -914061504, "ts": 1716454224078535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224078536, "dur": 0, "args": { "External id": 173610, "cbid": 205, "correlation": 173610 } }, { "ph": "f", "id": 173610, "pid": 76337, "tid": -914061504, "ts": 1716454224078536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224141672, "dur": 11, "args": { "External id": 173614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173614, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173614, "pid": 5, "tid": 7, "ts": 1716454224141672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078549, "dur": 12, "args": { "External id": 173614, "cbid": 211, "correlation": 173614 } }, { "ph": "s", "id": 173614, "pid": 76337, "tid": -914061504, "ts": 1716454224078549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224141684, "dur": 163, "args": { "External id": 173616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173616, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173616, "pid": 5, "tid": 7, "ts": 1716454224141684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078563, "dur": 5, "args": { "External id": 173616, "cbid": 211, "correlation": 173616 } }, { "ph": "s", "id": 173616, "pid": 76337, "tid": -914061504, "ts": 1716454224078563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224141850, "dur": 1, "args": { "External id": 173618, "device": 5, "context": 1, "stream": 7, "correlation": 173618, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 173618, "pid": 5, "tid": 7, "ts": 1716454224141850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224078575, "dur": 7, "args": { "External id": 173618, "cbid": 51, "correlation": 173618 } }, { "ph": "s", "id": 173618, "pid": 76337, "tid": -914061504, "ts": 1716454224078575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224141854, "dur": 649, "args": { "External id": 173619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173619, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173619, "pid": 5, "tid": 7, "ts": 1716454224141854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078583, "dur": 6, "args": { "External id": 173619, "cbid": 211, "correlation": 173619 } }, { "ph": "s", "id": 173619, "pid": 76337, "tid": -914061504, "ts": 1716454224078583, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224142504, "dur": 13, "args": { "External id": 173621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173621, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173621, "pid": 5, "tid": 7, "ts": 1716454224142504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078594, "dur": 5, "args": { "External id": 173621, "cbid": 211, "correlation": 173621 } }, { "ph": "s", "id": 173621, "pid": 76337, "tid": -914061504, "ts": 1716454224078594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224142519, "dur": 15, "args": { "External id": 173627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173627, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173627, "pid": 5, "tid": 7, "ts": 1716454224142519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078622, "dur": 10, "args": { "External id": 173627, "cbid": 211, "correlation": 173627 } }, { "ph": "s", "id": 173627, "pid": 76337, "tid": -914061504, "ts": 1716454224078622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224078681, "dur": 0, "args": { "External id": 173637, "cbid": 317, "correlation": 173637 } }, { "ph": "f", "id": 173637, "pid": 76337, "tid": -914061504, "ts": 1716454224078681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224078682, "dur": 0, "args": { "External id": 173638, "cbid": 203, "correlation": 173638 } }, { "ph": "f", "id": 173638, "pid": 76337, "tid": -914061504, "ts": 1716454224078682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224078683, "dur": 0, "args": { "External id": 173639, "cbid": 205, "correlation": 173639 } }, { "ph": "f", "id": 173639, "pid": 76337, "tid": -914061504, "ts": 1716454224078683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224142535, "dur": 17, "args": { "External id": 173643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173643, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173643, "pid": 5, "tid": 7, "ts": 1716454224142535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078697, "dur": 11, "args": { "External id": 173643, "cbid": 211, "correlation": 173643 } }, { "ph": "s", "id": 173643, "pid": 76337, "tid": -914061504, "ts": 1716454224078697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224142553, "dur": 4, "args": { "External id": 173645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173645, "pid": 5, "tid": 7, "ts": 1716454224142553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078713, "dur": 6, "args": { "External id": 173645, "cbid": 211, "correlation": 173645 } }, { "ph": "s", "id": 173645, "pid": 76337, "tid": -914061504, "ts": 1716454224078713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224078721, "dur": 0, "args": { "External id": 173646, "cbid": 51, "correlation": 173646 } }, { "ph": "s", "id": 173646, "pid": 76337, "tid": -914061504, "ts": 1716454224078721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224142559, "dur": 132, "args": { "External id": 173647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173647, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 173647, "pid": 5, "tid": 7, "ts": 1716454224142559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078722, "dur": 5, "args": { "External id": 173647, "cbid": 211, "correlation": 173647 } }, { "ph": "s", "id": 173647, "pid": 76337, "tid": -914061504, "ts": 1716454224078722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224142692, "dur": 16, "args": { "External id": 173652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173652, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173652, "pid": 5, "tid": 7, "ts": 1716454224142692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078748, "dur": 8, "args": { "External id": 173652, "cbid": 211, "correlation": 173652 } }, { "ph": "s", "id": 173652, "pid": 76337, "tid": -914061504, "ts": 1716454224078748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224142709, "dur": 11, "args": { "External id": 173660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173660, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173660, "pid": 5, "tid": 7, "ts": 1716454224142709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078777, "dur": 8, "args": { "External id": 173660, "cbid": 211, "correlation": 173660 } }, { "ph": "s", "id": 173660, "pid": 76337, "tid": -914061504, "ts": 1716454224078777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224142721, "dur": 11, "args": { "External id": 173668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173668, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173668, "pid": 5, "tid": 7, "ts": 1716454224142721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078805, "dur": 9, "args": { "External id": 173668, "cbid": 211, "correlation": 173668 } }, { "ph": "s", "id": 173668, "pid": 76337, "tid": -914061504, "ts": 1716454224078805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224142733, "dur": 19, "args": { "External id": 173688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173688, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 173688, "pid": 5, "tid": 7, "ts": 1716454224142733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078890, "dur": 12, "args": { "External id": 173688, "cbid": 211, "correlation": 173688 } }, { "ph": "s", "id": 173688, "pid": 76337, "tid": -914061504, "ts": 1716454224078890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224142753, "dur": 4, "args": { "External id": 173700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173700, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 173700, "pid": 5, "tid": 7, "ts": 1716454224142753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078912, "dur": 6, "args": { "External id": 173700, "cbid": 211, "correlation": 173700 } }, { "ph": "s", "id": 173700, "pid": 76337, "tid": -914061504, "ts": 1716454224078912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224142759, "dur": 17, "args": { "External id": 173703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173703, "pid": 5, "tid": 7, "ts": 1716454224142759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224078929, "dur": 7, "args": { "External id": 173703, "cbid": 211, "correlation": 173703 } }, { "ph": "s", "id": 173703, "pid": 76337, "tid": -914061504, "ts": 1716454224078929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224078994, "dur": 0, "args": { "External id": 173714, "cbid": 317, "correlation": 173714 } }, { "ph": "f", "id": 173714, "pid": 76337, "tid": -914061504, "ts": 1716454224078994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224078995, "dur": 0, "args": { "External id": 173715, "cbid": 203, "correlation": 173715 } }, { "ph": "f", "id": 173715, "pid": 76337, "tid": -914061504, "ts": 1716454224078995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224078995, "dur": 0, "args": { "External id": 173716, "cbid": 205, "correlation": 173716 } }, { "ph": "f", "id": 173716, "pid": 76337, "tid": -914061504, "ts": 1716454224078995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224142777, "dur": 11, "args": { "External id": 173720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173720, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173720, "pid": 5, "tid": 7, "ts": 1716454224142777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079009, "dur": 12, "args": { "External id": 173720, "cbid": 211, "correlation": 173720 } }, { "ph": "s", "id": 173720, "pid": 76337, "tid": -914061504, "ts": 1716454224079009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224142789, "dur": 3, "args": { "External id": 173722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173722, "pid": 5, "tid": 7, "ts": 1716454224142789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079025, "dur": 6, "args": { "External id": 173722, "cbid": 211, "correlation": 173722 } }, { "ph": "s", "id": 173722, "pid": 76337, "tid": -914061504, "ts": 1716454224079025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224079034, "dur": 0, "args": { "External id": 173723, "cbid": 51, "correlation": 173723 } }, { "ph": "s", "id": 173723, "pid": 76337, "tid": -914061504, "ts": 1716454224079034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224142794, "dur": 92, "args": { "External id": 173724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173724, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 173724, "pid": 5, "tid": 7, "ts": 1716454224142794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079035, "dur": 5, "args": { "External id": 173724, "cbid": 211, "correlation": 173724 } }, { "ph": "s", "id": 173724, "pid": 76337, "tid": -914061504, "ts": 1716454224079035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224142887, "dur": 16, "args": { "External id": 173729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173729, "pid": 5, "tid": 7, "ts": 1716454224142887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079061, "dur": 9, "args": { "External id": 173729, "cbid": 211, "correlation": 173729 } }, { "ph": "s", "id": 173729, "pid": 76337, "tid": -914061504, "ts": 1716454224079061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224142904, "dur": 85, "args": { "External id": 173738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173738, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173738, "pid": 5, "tid": 7, "ts": 1716454224142904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079142, "dur": 13, "args": { "External id": 173738, "cbid": 211, "correlation": 173738 } }, { "ph": "s", "id": 173738, "pid": 76337, "tid": -914061504, "ts": 1716454224079142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224142990, "dur": 30, "args": { "External id": 173760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173760, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173760, "pid": 5, "tid": 7, "ts": 1716454224142990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079200, "dur": 10, "args": { "External id": 173760, "cbid": 211, "correlation": 173760 } }, { "ph": "s", "id": 173760, "pid": 76337, "tid": -914061504, "ts": 1716454224079200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224079288, "dur": 1, "args": { "External id": 173771, "cbid": 251, "correlation": 173771 } }, { "ph": "f", "id": 173771, "pid": 76337, "tid": -914061504, "ts": 1716454224079288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224143021, "dur": 165, "args": { "External id": 173772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173772, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173772, "pid": 5, "tid": 7, "ts": 1716454224143021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079294, "dur": 14, "args": { "External id": 173772, "cbid": 211, "correlation": 173772 } }, { "ph": "s", "id": 173772, "pid": 76337, "tid": -914061504, "ts": 1716454224079294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224079365, "dur": 1, "args": { "External id": 173783, "cbid": 251, "correlation": 173783 } }, { "ph": "f", "id": 173783, "pid": 76337, "tid": -914061504, "ts": 1716454224079365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224143187, "dur": 159, "args": { "External id": 173784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173784, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173784, "pid": 5, "tid": 7, "ts": 1716454224143187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079369, "dur": 13, "args": { "External id": 173784, "cbid": 211, "correlation": 173784 } }, { "ph": "s", "id": 173784, "pid": 76337, "tid": -914061504, "ts": 1716454224079369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224079435, "dur": 1, "args": { "External id": 173795, "cbid": 251, "correlation": 173795 } }, { "ph": "f", "id": 173795, "pid": 76337, "tid": -914061504, "ts": 1716454224079435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224143348, "dur": 158, "args": { "External id": 173796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173796, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173796, "pid": 5, "tid": 7, "ts": 1716454224143348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079439, "dur": 11, "args": { "External id": 173796, "cbid": 211, "correlation": 173796 } }, { "ph": "s", "id": 173796, "pid": 76337, "tid": -914061504, "ts": 1716454224079439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224143508, "dur": 336, "args": { "External id": 173821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173821, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173821, "pid": 5, "tid": 7, "ts": 1716454224143508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079524, "dur": 13, "args": { "External id": 173821, "cbid": 211, "correlation": 173821 } }, { "ph": "s", "id": 173821, "pid": 76337, "tid": -914061504, "ts": 1716454224079524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224079625, "dur": 1, "args": { "External id": 173839, "cbid": 251, "correlation": 173839 } }, { "ph": "f", "id": 173839, "pid": 76337, "tid": -914061504, "ts": 1716454224079625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224143845, "dur": 167, "args": { "External id": 173841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173841, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173841, "pid": 5, "tid": 7, "ts": 1716454224143845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079631, "dur": 13, "args": { "External id": 173841, "cbid": 211, "correlation": 173841 } }, { "ph": "s", "id": 173841, "pid": 76337, "tid": -914061504, "ts": 1716454224079631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224144014, "dur": 19, "args": { "External id": 173849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173849, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173849, "pid": 5, "tid": 7, "ts": 1716454224144014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079701, "dur": 12, "args": { "External id": 173849, "cbid": 211, "correlation": 173849 } }, { "ph": "s", "id": 173849, "pid": 76337, "tid": -914061504, "ts": 1716454224079701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224144034, "dur": 28, "args": { "External id": 173857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173857, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173857, "pid": 5, "tid": 7, "ts": 1716454224144034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079739, "dur": 8, "args": { "External id": 173857, "cbid": 211, "correlation": 173857 } }, { "ph": "s", "id": 173857, "pid": 76337, "tid": -914061504, "ts": 1716454224079739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224144062, "dur": 19, "args": { "External id": 173868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173868, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173868, "pid": 5, "tid": 7, "ts": 1716454224144062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079810, "dur": 12, "args": { "External id": 173868, "cbid": 211, "correlation": 173868 } }, { "ph": "s", "id": 173868, "pid": 76337, "tid": -914061504, "ts": 1716454224079810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224144082, "dur": 16, "args": { "External id": 173890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173890, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173890, "pid": 5, "tid": 7, "ts": 1716454224144082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079842, "dur": 7, "args": { "External id": 173890, "cbid": 211, "correlation": 173890 } }, { "ph": "s", "id": 173890, "pid": 76337, "tid": -914061504, "ts": 1716454224079842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224079927, "dur": 1, "args": { "External id": 173901, "cbid": 251, "correlation": 173901 } }, { "ph": "f", "id": 173901, "pid": 76337, "tid": -914061504, "ts": 1716454224079927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224144100, "dur": 90, "args": { "External id": 173902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173902, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173902, "pid": 5, "tid": 7, "ts": 1716454224144100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224079932, "dur": 13, "args": { "External id": 173902, "cbid": 211, "correlation": 173902 } }, { "ph": "s", "id": 173902, "pid": 76337, "tid": -914061504, "ts": 1716454224079932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080009, "dur": 1, "args": { "External id": 173913, "cbid": 251, "correlation": 173913 } }, { "ph": "f", "id": 173913, "pid": 76337, "tid": -914061504, "ts": 1716454224080009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080012, "dur": 0, "args": { "External id": 173914, "cbid": 251, "correlation": 173914 } }, { "ph": "f", "id": 173914, "pid": 76337, "tid": -914061504, "ts": 1716454224080012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224144191, "dur": 13, "args": { "External id": 173915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173915, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173915, "pid": 5, "tid": 7, "ts": 1716454224144191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080014, "dur": 12, "args": { "External id": 173915, "cbid": 211, "correlation": 173915 } }, { "ph": "s", "id": 173915, "pid": 76337, "tid": -914061504, "ts": 1716454224080014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224144205, "dur": 6, "args": { "External id": 173917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173917, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173917, "pid": 5, "tid": 7, "ts": 1716454224144205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080028, "dur": 6, "args": { "External id": 173917, "cbid": 211, "correlation": 173917 } }, { "ph": "s", "id": 173917, "pid": 76337, "tid": -914061504, "ts": 1716454224080028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080087, "dur": 1, "args": { "External id": 173928, "cbid": 251, "correlation": 173928 } }, { "ph": "f", "id": 173928, "pid": 76337, "tid": -914061504, "ts": 1716454224080087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080091, "dur": 0, "args": { "External id": 173929, "cbid": 251, "correlation": 173929 } }, { "ph": "f", "id": 173929, "pid": 76337, "tid": -914061504, "ts": 1716454224080091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224144212, "dur": 9, "args": { "External id": 173930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173930, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173930, "pid": 5, "tid": 7, "ts": 1716454224144212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080092, "dur": 12, "args": { "External id": 173930, "cbid": 211, "correlation": 173930 } }, { "ph": "s", "id": 173930, "pid": 76337, "tid": -914061504, "ts": 1716454224080092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224144222, "dur": 4, "args": { "External id": 173932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173932, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173932, "pid": 5, "tid": 7, "ts": 1716454224144222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080105, "dur": 5, "args": { "External id": 173932, "cbid": 211, "correlation": 173932 } }, { "ph": "s", "id": 173932, "pid": 76337, "tid": -914061504, "ts": 1716454224080105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224144227, "dur": 55, "args": { "External id": 173957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173957, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 173957, "pid": 5, "tid": 7, "ts": 1716454224144227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080183, "dur": 13, "args": { "External id": 173957, "cbid": 211, "correlation": 173957 } }, { "ph": "s", "id": 173957, "pid": 76337, "tid": -914061504, "ts": 1716454224080183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080282, "dur": 1, "args": { "External id": 173975, "cbid": 251, "correlation": 173975 } }, { "ph": "f", "id": 173975, "pid": 76337, "tid": -914061504, "ts": 1716454224080282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224144284, "dur": 91, "args": { "External id": 173977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173977, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 173977, "pid": 5, "tid": 7, "ts": 1716454224144284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080288, "dur": 14, "args": { "External id": 173977, "cbid": 211, "correlation": 173977 } }, { "ph": "s", "id": 173977, "pid": 76337, "tid": -914061504, "ts": 1716454224080288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224144376, "dur": 10, "args": { "External id": 173985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173985, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173985, "pid": 5, "tid": 7, "ts": 1716454224144376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080360, "dur": 12, "args": { "External id": 173985, "cbid": 211, "correlation": 173985 } }, { "ph": "s", "id": 173985, "pid": 76337, "tid": -914061504, "ts": 1716454224080360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224144387, "dur": 21, "args": { "External id": 173993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 173993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 173993, "pid": 5, "tid": 7, "ts": 1716454224144387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080401, "dur": 9, "args": { "External id": 173993, "cbid": 211, "correlation": 173993 } }, { "ph": "s", "id": 173993, "pid": 76337, "tid": -914061504, "ts": 1716454224080401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224144410, "dur": 18, "args": { "External id": 174015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174015, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174015, "pid": 5, "tid": 7, "ts": 1716454224144410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080452, "dur": 10, "args": { "External id": 174015, "cbid": 211, "correlation": 174015 } }, { "ph": "s", "id": 174015, "pid": 76337, "tid": -914061504, "ts": 1716454224080452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080541, "dur": 1, "args": { "External id": 174031, "cbid": 251, "correlation": 174031 } }, { "ph": "f", "id": 174031, "pid": 76337, "tid": -914061504, "ts": 1716454224080541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080545, "dur": 0, "args": { "External id": 174033, "cbid": 251, "correlation": 174033 } }, { "ph": "f", "id": 174033, "pid": 76337, "tid": -914061504, "ts": 1716454224080545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224144429, "dur": 499, "args": { "External id": 174034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174034, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174034, "pid": 5, "tid": 7, "ts": 1716454224144429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080547, "dur": 12, "args": { "External id": 174034, "cbid": 211, "correlation": 174034 } }, { "ph": "s", "id": 174034, "pid": 76337, "tid": -914061504, "ts": 1716454224080547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224144929, "dur": 67, "args": { "External id": 174042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174042, "pid": 5, "tid": 7, "ts": 1716454224144929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080612, "dur": 12, "args": { "External id": 174042, "cbid": 211, "correlation": 174042 } }, { "ph": "s", "id": 174042, "pid": 76337, "tid": -914061504, "ts": 1716454224080612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224144997, "dur": 67, "args": { "External id": 174050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174050, "pid": 5, "tid": 7, "ts": 1716454224144997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080644, "dur": 8, "args": { "External id": 174050, "cbid": 211, "correlation": 174050 } }, { "ph": "s", "id": 174050, "pid": 76337, "tid": -914061504, "ts": 1716454224080644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224080722, "dur": 1, "args": { "External id": 174066, "cbid": 251, "correlation": 174066 } }, { "ph": "f", "id": 174066, "pid": 76337, "tid": -914061504, "ts": 1716454224080722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224145066, "dur": 1, "args": { "External id": 174068, "device": 5, "context": 1, "stream": 7, "correlation": 174068, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 174068, "pid": 5, "tid": 7, "ts": 1716454224145066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224080727, "dur": 10, "args": { "External id": 174068, "cbid": 51, "correlation": 174068 } }, { "ph": "s", "id": 174068, "pid": 76337, "tid": -914061504, "ts": 1716454224080727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224145069, "dur": 271, "args": { "External id": 174069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174069, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 174069, "pid": 5, "tid": 7, "ts": 1716454224145069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080739, "dur": 11, "args": { "External id": 174069, "cbid": 211, "correlation": 174069 } }, { "ph": "s", "id": 174069, "pid": 76337, "tid": -914061504, "ts": 1716454224080739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224145342, "dur": 15, "args": { "External id": 174077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174077, "pid": 5, "tid": 7, "ts": 1716454224145342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080781, "dur": 11, "args": { "External id": 174077, "cbid": 211, "correlation": 174077 } }, { "ph": "s", "id": 174077, "pid": 76337, "tid": -914061504, "ts": 1716454224080781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224145358, "dur": 38, "args": { "External id": 174088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174088, "pid": 5, "tid": 7, "ts": 1716454224145358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080849, "dur": 12, "args": { "External id": 174088, "cbid": 211, "correlation": 174088 } }, { "ph": "s", "id": 174088, "pid": 76337, "tid": -914061504, "ts": 1716454224080849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224080912, "dur": 0, "args": { "External id": 174100, "cbid": 317, "correlation": 174100 } }, { "ph": "f", "id": 174100, "pid": 76337, "tid": -914061504, "ts": 1716454224080912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224080913, "dur": 0, "args": { "External id": 174101, "cbid": 203, "correlation": 174101 } }, { "ph": "f", "id": 174101, "pid": 76337, "tid": -914061504, "ts": 1716454224080913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224080914, "dur": 0, "args": { "External id": 174102, "cbid": 205, "correlation": 174102 } }, { "ph": "f", "id": 174102, "pid": 76337, "tid": -914061504, "ts": 1716454224080914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224145397, "dur": 13, "args": { "External id": 174106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174106, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174106, "pid": 5, "tid": 7, "ts": 1716454224145397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080928, "dur": 12, "args": { "External id": 174106, "cbid": 211, "correlation": 174106 } }, { "ph": "s", "id": 174106, "pid": 76337, "tid": -914061504, "ts": 1716454224080928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224145411, "dur": 4, "args": { "External id": 174108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 174108, "pid": 5, "tid": 7, "ts": 1716454224145411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080946, "dur": 6, "args": { "External id": 174108, "cbid": 211, "correlation": 174108 } }, { "ph": "s", "id": 174108, "pid": 76337, "tid": -914061504, "ts": 1716454224080946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224080954, "dur": 0, "args": { "External id": 174109, "cbid": 51, "correlation": 174109 } }, { "ph": "s", "id": 174109, "pid": 76337, "tid": -914061504, "ts": 1716454224080954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224145416, "dur": 98, "args": { "External id": 174110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174110, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 174110, "pid": 5, "tid": 7, "ts": 1716454224145416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224080955, "dur": 5, "args": { "External id": 174110, "cbid": 211, "correlation": 174110 } }, { "ph": "s", "id": 174110, "pid": 76337, "tid": -914061504, "ts": 1716454224080955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224145516, "dur": 16, "args": { "External id": 174115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174115, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174115, "pid": 5, "tid": 7, "ts": 1716454224145516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081613, "dur": 12, "args": { "External id": 174115, "cbid": 211, "correlation": 174115 } }, { "ph": "s", "id": 174115, "pid": 76337, "tid": -914061504, "ts": 1716454224081613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224145533, "dur": 12, "args": { "External id": 174123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174123, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174123, "pid": 5, "tid": 7, "ts": 1716454224145533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081712, "dur": 11, "args": { "External id": 174123, "cbid": 211, "correlation": 174123 } }, { "ph": "s", "id": 174123, "pid": 76337, "tid": -914061504, "ts": 1716454224081712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224145546, "dur": 57, "args": { "External id": 174134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174134, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174134, "pid": 5, "tid": 7, "ts": 1716454224145546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081863, "dur": 15, "args": { "External id": 174134, "cbid": 211, "correlation": 174134 } }, { "ph": "s", "id": 174134, "pid": 76337, "tid": -914061504, "ts": 1716454224081863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224081931, "dur": 0, "args": { "External id": 174144, "cbid": 317, "correlation": 174144 } }, { "ph": "f", "id": 174144, "pid": 76337, "tid": -914061504, "ts": 1716454224081931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224081932, "dur": 0, "args": { "External id": 174145, "cbid": 203, "correlation": 174145 } }, { "ph": "f", "id": 174145, "pid": 76337, "tid": -914061504, "ts": 1716454224081932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224081933, "dur": 0, "args": { "External id": 174146, "cbid": 205, "correlation": 174146 } }, { "ph": "f", "id": 174146, "pid": 76337, "tid": -914061504, "ts": 1716454224081933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224145605, "dur": 41, "args": { "External id": 174150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174150, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174150, "pid": 5, "tid": 7, "ts": 1716454224145605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081950, "dur": 12, "args": { "External id": 174150, "cbid": 211, "correlation": 174150 } }, { "ph": "s", "id": 174150, "pid": 76337, "tid": -914061504, "ts": 1716454224081950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224145648, "dur": 163, "args": { "External id": 174152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174152, "pid": 5, "tid": 7, "ts": 1716454224145648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081964, "dur": 6, "args": { "External id": 174152, "cbid": 211, "correlation": 174152 } }, { "ph": "s", "id": 174152, "pid": 76337, "tid": -914061504, "ts": 1716454224081964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224145811, "dur": 1968, "args": { "External id": 174154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174154, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174154, "pid": 5, "tid": 7, "ts": 1716454224145811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224081987, "dur": 9, "args": { "External id": 174154, "cbid": 211, "correlation": 174154 } }, { "ph": "s", "id": 174154, "pid": 76337, "tid": -914061504, "ts": 1716454224081987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224147781, "dur": 39, "args": { "External id": 174156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174156, "pid": 5, "tid": 7, "ts": 1716454224147781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082001, "dur": 5, "args": { "External id": 174156, "cbid": 211, "correlation": 174156 } }, { "ph": "s", "id": 174156, "pid": 76337, "tid": -914061504, "ts": 1716454224082001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224147821, "dur": 59, "args": { "External id": 174162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174162, "pid": 5, "tid": 7, "ts": 1716454224147821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082032, "dur": 9, "args": { "External id": 174162, "cbid": 211, "correlation": 174162 } }, { "ph": "s", "id": 174162, "pid": 76337, "tid": -914061504, "ts": 1716454224082032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224147881, "dur": 85, "args": { "External id": 174171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174171, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174171, "pid": 5, "tid": 7, "ts": 1716454224147881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082191, "dur": 15, "args": { "External id": 174171, "cbid": 211, "correlation": 174171 } }, { "ph": "s", "id": 174171, "pid": 76337, "tid": -914061504, "ts": 1716454224082191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224147968, "dur": 74, "args": { "External id": 174191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174191, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 174191, "pid": 5, "tid": 7, "ts": 1716454224147968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082289, "dur": 13, "args": { "External id": 174191, "cbid": 211, "correlation": 174191 } }, { "ph": "s", "id": 174191, "pid": 76337, "tid": -914061504, "ts": 1716454224082289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224148043, "dur": 5, "args": { "External id": 174203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174203, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 174203, "pid": 5, "tid": 7, "ts": 1716454224148043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082312, "dur": 6, "args": { "External id": 174203, "cbid": 211, "correlation": 174203 } }, { "ph": "s", "id": 174203, "pid": 76337, "tid": -914061504, "ts": 1716454224082312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224148049, "dur": 82, "args": { "External id": 174206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174206, "pid": 5, "tid": 7, "ts": 1716454224148049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082332, "dur": 7, "args": { "External id": 174206, "cbid": 211, "correlation": 174206 } }, { "ph": "s", "id": 174206, "pid": 76337, "tid": -914061504, "ts": 1716454224082332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224148133, "dur": 54, "args": { "External id": 174215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174215, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174215, "pid": 5, "tid": 7, "ts": 1716454224148133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082378, "dur": 10, "args": { "External id": 174215, "cbid": 211, "correlation": 174215 } }, { "ph": "s", "id": 174215, "pid": 76337, "tid": -914061504, "ts": 1716454224082378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224082434, "dur": 0, "args": { "External id": 174225, "cbid": 317, "correlation": 174225 } }, { "ph": "f", "id": 174225, "pid": 76337, "tid": -914061504, "ts": 1716454224082434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224082435, "dur": 0, "args": { "External id": 174226, "cbid": 203, "correlation": 174226 } }, { "ph": "f", "id": 174226, "pid": 76337, "tid": -914061504, "ts": 1716454224082435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224082436, "dur": 0, "args": { "External id": 174227, "cbid": 205, "correlation": 174227 } }, { "ph": "f", "id": 174227, "pid": 76337, "tid": -914061504, "ts": 1716454224082436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224148188, "dur": 58, "args": { "External id": 174231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174231, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174231, "pid": 5, "tid": 7, "ts": 1716454224148188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082452, "dur": 12, "args": { "External id": 174231, "cbid": 211, "correlation": 174231 } }, { "ph": "s", "id": 174231, "pid": 76337, "tid": -914061504, "ts": 1716454224082452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224148247, "dur": 122, "args": { "External id": 174233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174233, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174233, "pid": 5, "tid": 7, "ts": 1716454224148247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082466, "dur": 5, "args": { "External id": 174233, "cbid": 211, "correlation": 174233 } }, { "ph": "s", "id": 174233, "pid": 76337, "tid": -914061504, "ts": 1716454224082466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224148370, "dur": 1889, "args": { "External id": 174235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174235, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174235, "pid": 5, "tid": 7, "ts": 1716454224148370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082478, "dur": 6, "args": { "External id": 174235, "cbid": 211, "correlation": 174235 } }, { "ph": "s", "id": 174235, "pid": 76337, "tid": -914061504, "ts": 1716454224082478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224150261, "dur": 21, "args": { "External id": 174237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174237, "pid": 5, "tid": 7, "ts": 1716454224150261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082488, "dur": 5, "args": { "External id": 174237, "cbid": 211, "correlation": 174237 } }, { "ph": "s", "id": 174237, "pid": 76337, "tid": -914061504, "ts": 1716454224082488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224150283, "dur": 33, "args": { "External id": 174243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174243, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174243, "pid": 5, "tid": 7, "ts": 1716454224150283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082516, "dur": 9, "args": { "External id": 174243, "cbid": 211, "correlation": 174243 } }, { "ph": "s", "id": 174243, "pid": 76337, "tid": -914061504, "ts": 1716454224082516, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224150318, "dur": 4, "args": { "External id": 174251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174251, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 174251, "pid": 5, "tid": 7, "ts": 1716454224150318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082560, "dur": 10, "args": { "External id": 174251, "cbid": 211, "correlation": 174251 } }, { "ph": "s", "id": 174251, "pid": 76337, "tid": -914061504, "ts": 1716454224082560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224082647, "dur": 2, "args": { "External id": 174267, "cbid": 251, "correlation": 174267 } }, { "ph": "f", "id": 174267, "pid": 76337, "tid": -914061504, "ts": 1716454224082647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224082654, "dur": 0, "args": { "External id": 174269, "cbid": 251, "correlation": 174269 } }, { "ph": "f", "id": 174269, "pid": 76337, "tid": -914061504, "ts": 1716454224082654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224150323, "dur": 12, "args": { "External id": 174270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174270, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 174270, "pid": 5, "tid": 7, "ts": 1716454224150323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082656, "dur": 13, "args": { "External id": 174270, "cbid": 211, "correlation": 174270 } }, { "ph": "s", "id": 174270, "pid": 76337, "tid": -914061504, "ts": 1716454224082656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224150337, "dur": 5, "args": { "External id": 174272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174272, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 174272, "pid": 5, "tid": 7, "ts": 1716454224150337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082671, "dur": 6, "args": { "External id": 174272, "cbid": 211, "correlation": 174272 } }, { "ph": "s", "id": 174272, "pid": 76337, "tid": -914061504, "ts": 1716454224082671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224150343, "dur": 29, "args": { "External id": 174282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174282, "pid": 5, "tid": 7, "ts": 1716454224150343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082743, "dur": 14, "args": { "External id": 174282, "cbid": 211, "correlation": 174282 } }, { "ph": "s", "id": 174282, "pid": 76337, "tid": -914061504, "ts": 1716454224082743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224150373, "dur": 30, "args": { "External id": 174302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174302, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 174302, "pid": 5, "tid": 7, "ts": 1716454224150373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082814, "dur": 12, "args": { "External id": 174302, "cbid": 211, "correlation": 174302 } }, { "ph": "s", "id": 174302, "pid": 76337, "tid": -914061504, "ts": 1716454224082814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224150404, "dur": 5, "args": { "External id": 174314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174314, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 174314, "pid": 5, "tid": 7, "ts": 1716454224150404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082836, "dur": 6, "args": { "External id": 174314, "cbid": 211, "correlation": 174314 } }, { "ph": "s", "id": 174314, "pid": 76337, "tid": -914061504, "ts": 1716454224082836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224150410, "dur": 30, "args": { "External id": 174317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174317, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174317, "pid": 5, "tid": 7, "ts": 1716454224150410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082853, "dur": 6, "args": { "External id": 174317, "cbid": 211, "correlation": 174317 } }, { "ph": "s", "id": 174317, "pid": 76337, "tid": -914061504, "ts": 1716454224082853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224150442, "dur": 20, "args": { "External id": 174326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174326, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174326, "pid": 5, "tid": 7, "ts": 1716454224150442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082894, "dur": 9, "args": { "External id": 174326, "cbid": 211, "correlation": 174326 } }, { "ph": "s", "id": 174326, "pid": 76337, "tid": -914061504, "ts": 1716454224082894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224082964, "dur": 0, "args": { "External id": 174336, "cbid": 317, "correlation": 174336 } }, { "ph": "f", "id": 174336, "pid": 76337, "tid": -914061504, "ts": 1716454224082964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224082965, "dur": 0, "args": { "External id": 174337, "cbid": 203, "correlation": 174337 } }, { "ph": "f", "id": 174337, "pid": 76337, "tid": -914061504, "ts": 1716454224082965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224082965, "dur": 0, "args": { "External id": 174338, "cbid": 205, "correlation": 174338 } }, { "ph": "f", "id": 174338, "pid": 76337, "tid": -914061504, "ts": 1716454224082965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224150464, "dur": 24, "args": { "External id": 174342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174342, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174342, "pid": 5, "tid": 7, "ts": 1716454224150464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224082990, "dur": 13, "args": { "External id": 174342, "cbid": 211, "correlation": 174342 } }, { "ph": "s", "id": 174342, "pid": 76337, "tid": -914061504, "ts": 1716454224082990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224150488, "dur": 44, "args": { "External id": 174344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174344, "pid": 5, "tid": 7, "ts": 1716454224150488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083005, "dur": 5, "args": { "External id": 174344, "cbid": 211, "correlation": 174344 } }, { "ph": "s", "id": 174344, "pid": 76337, "tid": -914061504, "ts": 1716454224083005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224150534, "dur": 648, "args": { "External id": 174346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174346, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174346, "pid": 5, "tid": 7, "ts": 1716454224150534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083017, "dur": 7, "args": { "External id": 174346, "cbid": 211, "correlation": 174346 } }, { "ph": "s", "id": 174346, "pid": 76337, "tid": -914061504, "ts": 1716454224083017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224151183, "dur": 23, "args": { "External id": 174348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174348, "pid": 5, "tid": 7, "ts": 1716454224151183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083028, "dur": 5, "args": { "External id": 174348, "cbid": 211, "correlation": 174348 } }, { "ph": "s", "id": 174348, "pid": 76337, "tid": -914061504, "ts": 1716454224083028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224151207, "dur": 32, "args": { "External id": 174354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174354, "pid": 5, "tid": 7, "ts": 1716454224151207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083056, "dur": 9, "args": { "External id": 174354, "cbid": 211, "correlation": 174354 } }, { "ph": "s", "id": 174354, "pid": 76337, "tid": -914061504, "ts": 1716454224083056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224083115, "dur": 0, "args": { "External id": 174364, "cbid": 317, "correlation": 174364 } }, { "ph": "f", "id": 174364, "pid": 76337, "tid": -914061504, "ts": 1716454224083115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224083115, "dur": 0, "args": { "External id": 174365, "cbid": 203, "correlation": 174365 } }, { "ph": "f", "id": 174365, "pid": 76337, "tid": -914061504, "ts": 1716454224083115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224083116, "dur": 0, "args": { "External id": 174366, "cbid": 205, "correlation": 174366 } }, { "ph": "f", "id": 174366, "pid": 76337, "tid": -914061504, "ts": 1716454224083116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224151240, "dur": 57, "args": { "External id": 174370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174370, "pid": 5, "tid": 7, "ts": 1716454224151240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083129, "dur": 11, "args": { "External id": 174370, "cbid": 211, "correlation": 174370 } }, { "ph": "s", "id": 174370, "pid": 76337, "tid": -914061504, "ts": 1716454224083129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224151298, "dur": 270, "args": { "External id": 174372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174372, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174372, "pid": 5, "tid": 7, "ts": 1716454224151298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083147, "dur": 8, "args": { "External id": 174372, "cbid": 211, "correlation": 174372 } }, { "ph": "s", "id": 174372, "pid": 76337, "tid": -914061504, "ts": 1716454224083147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224151569, "dur": 23, "args": { "External id": 174374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174374, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174374, "pid": 5, "tid": 7, "ts": 1716454224151569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083159, "dur": 5, "args": { "External id": 174374, "cbid": 211, "correlation": 174374 } }, { "ph": "s", "id": 174374, "pid": 76337, "tid": -914061504, "ts": 1716454224083159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224151593, "dur": 32, "args": { "External id": 174380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174380, "pid": 5, "tid": 7, "ts": 1716454224151593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083185, "dur": 8, "args": { "External id": 174380, "cbid": 211, "correlation": 174380 } }, { "ph": "s", "id": 174380, "pid": 76337, "tid": -914061504, "ts": 1716454224083185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224151627, "dur": 27, "args": { "External id": 174388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174388, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174388, "pid": 5, "tid": 7, "ts": 1716454224151627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083213, "dur": 8, "args": { "External id": 174388, "cbid": 211, "correlation": 174388 } }, { "ph": "s", "id": 174388, "pid": 76337, "tid": -914061504, "ts": 1716454224083213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224151655, "dur": 20, "args": { "External id": 174396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174396, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174396, "pid": 5, "tid": 7, "ts": 1716454224151655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083243, "dur": 9, "args": { "External id": 174396, "cbid": 211, "correlation": 174396 } }, { "ph": "s", "id": 174396, "pid": 76337, "tid": -914061504, "ts": 1716454224083243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224151675, "dur": 31, "args": { "External id": 174416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174416, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 174416, "pid": 5, "tid": 7, "ts": 1716454224151675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083332, "dur": 13, "args": { "External id": 174416, "cbid": 211, "correlation": 174416 } }, { "ph": "s", "id": 174416, "pid": 76337, "tid": -914061504, "ts": 1716454224083332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224151708, "dur": 4, "args": { "External id": 174428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174428, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 174428, "pid": 5, "tid": 7, "ts": 1716454224151708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083355, "dur": 6, "args": { "External id": 174428, "cbid": 211, "correlation": 174428 } }, { "ph": "s", "id": 174428, "pid": 76337, "tid": -914061504, "ts": 1716454224083355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224151714, "dur": 31, "args": { "External id": 174431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174431, "pid": 5, "tid": 7, "ts": 1716454224151714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083373, "dur": 7, "args": { "External id": 174431, "cbid": 211, "correlation": 174431 } }, { "ph": "s", "id": 174431, "pid": 76337, "tid": -914061504, "ts": 1716454224083373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224083434, "dur": 0, "args": { "External id": 174442, "cbid": 317, "correlation": 174442 } }, { "ph": "f", "id": 174442, "pid": 76337, "tid": -914061504, "ts": 1716454224083434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224083435, "dur": 0, "args": { "External id": 174443, "cbid": 203, "correlation": 174443 } }, { "ph": "f", "id": 174443, "pid": 76337, "tid": -914061504, "ts": 1716454224083435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224083436, "dur": 0, "args": { "External id": 174444, "cbid": 205, "correlation": 174444 } }, { "ph": "f", "id": 174444, "pid": 76337, "tid": -914061504, "ts": 1716454224083436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224151745, "dur": 22, "args": { "External id": 174448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174448, "pid": 5, "tid": 7, "ts": 1716454224151745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083450, "dur": 12, "args": { "External id": 174448, "cbid": 211, "correlation": 174448 } }, { "ph": "s", "id": 174448, "pid": 76337, "tid": -914061504, "ts": 1716454224083450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224151769, "dur": 105, "args": { "External id": 174450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174450, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174450, "pid": 5, "tid": 7, "ts": 1716454224151769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083468, "dur": 7, "args": { "External id": 174450, "cbid": 211, "correlation": 174450 } }, { "ph": "s", "id": 174450, "pid": 76337, "tid": -914061504, "ts": 1716454224083468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224151875, "dur": 22, "args": { "External id": 174452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174452, "pid": 5, "tid": 7, "ts": 1716454224151875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083479, "dur": 5, "args": { "External id": 174452, "cbid": 211, "correlation": 174452 } }, { "ph": "s", "id": 174452, "pid": 76337, "tid": -914061504, "ts": 1716454224083479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224151899, "dur": 33, "args": { "External id": 174458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174458, "pid": 5, "tid": 7, "ts": 1716454224151899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083506, "dur": 8, "args": { "External id": 174458, "cbid": 211, "correlation": 174458 } }, { "ph": "s", "id": 174458, "pid": 76337, "tid": -914061504, "ts": 1716454224083506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224151933, "dur": 163, "args": { "External id": 174467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174467, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174467, "pid": 5, "tid": 7, "ts": 1716454224151933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083597, "dur": 15, "args": { "External id": 174467, "cbid": 211, "correlation": 174467 } }, { "ph": "s", "id": 174467, "pid": 76337, "tid": -914061504, "ts": 1716454224083597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224152098, "dur": 65, "args": { "External id": 174489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174489, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174489, "pid": 5, "tid": 7, "ts": 1716454224152098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083672, "dur": 13, "args": { "External id": 174489, "cbid": 211, "correlation": 174489 } }, { "ph": "s", "id": 174489, "pid": 76337, "tid": -914061504, "ts": 1716454224083672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224083775, "dur": 2, "args": { "External id": 174500, "cbid": 251, "correlation": 174500 } }, { "ph": "f", "id": 174500, "pid": 76337, "tid": -914061504, "ts": 1716454224083775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224152164, "dur": 155, "args": { "External id": 174501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174501, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174501, "pid": 5, "tid": 7, "ts": 1716454224152164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083781, "dur": 13, "args": { "External id": 174501, "cbid": 211, "correlation": 174501 } }, { "ph": "s", "id": 174501, "pid": 76337, "tid": -914061504, "ts": 1716454224083781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224083854, "dur": 1, "args": { "External id": 174512, "cbid": 251, "correlation": 174512 } }, { "ph": "f", "id": 174512, "pid": 76337, "tid": -914061504, "ts": 1716454224083854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224152321, "dur": 147, "args": { "External id": 174513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174513, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174513, "pid": 5, "tid": 7, "ts": 1716454224152321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083858, "dur": 11, "args": { "External id": 174513, "cbid": 211, "correlation": 174513 } }, { "ph": "s", "id": 174513, "pid": 76337, "tid": -914061504, "ts": 1716454224083858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224083927, "dur": 1, "args": { "External id": 174524, "cbid": 251, "correlation": 174524 } }, { "ph": "f", "id": 174524, "pid": 76337, "tid": -914061504, "ts": 1716454224083927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224152470, "dur": 145, "args": { "External id": 174525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174525, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174525, "pid": 5, "tid": 7, "ts": 1716454224152470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224083931, "dur": 11, "args": { "External id": 174525, "cbid": 211, "correlation": 174525 } }, { "ph": "s", "id": 174525, "pid": 76337, "tid": -914061504, "ts": 1716454224083931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224152616, "dur": 1949, "args": { "External id": 174546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174546, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 174546, "pid": 5, "tid": 7, "ts": 1716454224152616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084031, "dur": 14, "args": { "External id": 174546, "cbid": 211, "correlation": 174546 } }, { "ph": "s", "id": 174546, "pid": 76337, "tid": -914061504, "ts": 1716454224084031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084141, "dur": 2, "args": { "External id": 174564, "cbid": 251, "correlation": 174564 } }, { "ph": "f", "id": 174564, "pid": 76337, "tid": -914061504, "ts": 1716454224084141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224154566, "dur": 148, "args": { "External id": 174566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174566, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 174566, "pid": 5, "tid": 7, "ts": 1716454224154566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084147, "dur": 14, "args": { "External id": 174566, "cbid": 211, "correlation": 174566 } }, { "ph": "s", "id": 174566, "pid": 76337, "tid": -914061504, "ts": 1716454224084147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224154716, "dur": 35, "args": { "External id": 174574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174574, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174574, "pid": 5, "tid": 7, "ts": 1716454224154716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084221, "dur": 13, "args": { "External id": 174574, "cbid": 211, "correlation": 174574 } }, { "ph": "s", "id": 174574, "pid": 76337, "tid": -914061504, "ts": 1716454224084221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224154753, "dur": 51, "args": { "External id": 174582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174582, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174582, "pid": 5, "tid": 7, "ts": 1716454224154753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084261, "dur": 8, "args": { "External id": 174582, "cbid": 211, "correlation": 174582 } }, { "ph": "s", "id": 174582, "pid": 76337, "tid": -914061504, "ts": 1716454224084261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224154805, "dur": 30, "args": { "External id": 174593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174593, "pid": 5, "tid": 7, "ts": 1716454224154805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084341, "dur": 14, "args": { "External id": 174593, "cbid": 211, "correlation": 174593 } }, { "ph": "s", "id": 174593, "pid": 76337, "tid": -914061504, "ts": 1716454224084341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224154836, "dur": 34, "args": { "External id": 174615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174615, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174615, "pid": 5, "tid": 7, "ts": 1716454224154836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084373, "dur": 8, "args": { "External id": 174615, "cbid": 211, "correlation": 174615 } }, { "ph": "s", "id": 174615, "pid": 76337, "tid": -914061504, "ts": 1716454224084373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084462, "dur": 1, "args": { "External id": 174626, "cbid": 251, "correlation": 174626 } }, { "ph": "f", "id": 174626, "pid": 76337, "tid": -914061504, "ts": 1716454224084462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224154872, "dur": 92, "args": { "External id": 174627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174627, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174627, "pid": 5, "tid": 7, "ts": 1716454224154872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084467, "dur": 14, "args": { "External id": 174627, "cbid": 211, "correlation": 174627 } }, { "ph": "s", "id": 174627, "pid": 76337, "tid": -914061504, "ts": 1716454224084467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084542, "dur": 1, "args": { "External id": 174638, "cbid": 251, "correlation": 174638 } }, { "ph": "f", "id": 174638, "pid": 76337, "tid": -914061504, "ts": 1716454224084542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084546, "dur": 0, "args": { "External id": 174639, "cbid": 251, "correlation": 174639 } }, { "ph": "f", "id": 174639, "pid": 76337, "tid": -914061504, "ts": 1716454224084546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224154966, "dur": 10, "args": { "External id": 174640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174640, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 174640, "pid": 5, "tid": 7, "ts": 1716454224154966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084548, "dur": 12, "args": { "External id": 174640, "cbid": 211, "correlation": 174640 } }, { "ph": "s", "id": 174640, "pid": 76337, "tid": -914061504, "ts": 1716454224084548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224154977, "dur": 5, "args": { "External id": 174642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174642, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 174642, "pid": 5, "tid": 7, "ts": 1716454224154977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084561, "dur": 6, "args": { "External id": 174642, "cbid": 211, "correlation": 174642 } }, { "ph": "s", "id": 174642, "pid": 76337, "tid": -914061504, "ts": 1716454224084561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084621, "dur": 1, "args": { "External id": 174653, "cbid": 251, "correlation": 174653 } }, { "ph": "f", "id": 174653, "pid": 76337, "tid": -914061504, "ts": 1716454224084621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084625, "dur": 0, "args": { "External id": 174654, "cbid": 251, "correlation": 174654 } }, { "ph": "f", "id": 174654, "pid": 76337, "tid": -914061504, "ts": 1716454224084625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224154983, "dur": 7, "args": { "External id": 174655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174655, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 174655, "pid": 5, "tid": 7, "ts": 1716454224154983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084627, "dur": 12, "args": { "External id": 174655, "cbid": 211, "correlation": 174655 } }, { "ph": "s", "id": 174655, "pid": 76337, "tid": -914061504, "ts": 1716454224084627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224154992, "dur": 3, "args": { "External id": 174657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174657, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 174657, "pid": 5, "tid": 7, "ts": 1716454224154992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084640, "dur": 5, "args": { "External id": 174657, "cbid": 211, "correlation": 174657 } }, { "ph": "s", "id": 174657, "pid": 76337, "tid": -914061504, "ts": 1716454224084640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224154996, "dur": 91, "args": { "External id": 174678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174678, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 174678, "pid": 5, "tid": 7, "ts": 1716454224154996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084714, "dur": 13, "args": { "External id": 174678, "cbid": 211, "correlation": 174678 } }, { "ph": "s", "id": 174678, "pid": 76337, "tid": -914061504, "ts": 1716454224084714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224084816, "dur": 2, "args": { "External id": 174696, "cbid": 251, "correlation": 174696 } }, { "ph": "f", "id": 174696, "pid": 76337, "tid": -914061504, "ts": 1716454224084816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224155089, "dur": 100, "args": { "External id": 174698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174698, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174698, "pid": 5, "tid": 7, "ts": 1716454224155089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084822, "dur": 14, "args": { "External id": 174698, "cbid": 211, "correlation": 174698 } }, { "ph": "s", "id": 174698, "pid": 76337, "tid": -914061504, "ts": 1716454224084822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224155190, "dur": 20, "args": { "External id": 174706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174706, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174706, "pid": 5, "tid": 7, "ts": 1716454224155190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084895, "dur": 12, "args": { "External id": 174706, "cbid": 211, "correlation": 174706 } }, { "ph": "s", "id": 174706, "pid": 76337, "tid": -914061504, "ts": 1716454224084895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224155211, "dur": 38, "args": { "External id": 174714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174714, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174714, "pid": 5, "tid": 7, "ts": 1716454224155211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084938, "dur": 9, "args": { "External id": 174714, "cbid": 211, "correlation": 174714 } }, { "ph": "s", "id": 174714, "pid": 76337, "tid": -914061504, "ts": 1716454224084938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224155250, "dur": 35, "args": { "External id": 174736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174736, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174736, "pid": 5, "tid": 7, "ts": 1716454224155250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224084998, "dur": 11, "args": { "External id": 174736, "cbid": 211, "correlation": 174736 } }, { "ph": "s", "id": 174736, "pid": 76337, "tid": -914061504, "ts": 1716454224084998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224085096, "dur": 1, "args": { "External id": 174752, "cbid": 251, "correlation": 174752 } }, { "ph": "f", "id": 174752, "pid": 76337, "tid": -914061504, "ts": 1716454224085096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224085101, "dur": 0, "args": { "External id": 174754, "cbid": 251, "correlation": 174754 } }, { "ph": "f", "id": 174754, "pid": 76337, "tid": -914061504, "ts": 1716454224085101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224155286, "dur": 541, "args": { "External id": 174755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174755, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 174755, "pid": 5, "tid": 7, "ts": 1716454224155286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085106, "dur": 12, "args": { "External id": 174755, "cbid": 211, "correlation": 174755 } }, { "ph": "s", "id": 174755, "pid": 76337, "tid": -914061504, "ts": 1716454224085106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224155829, "dur": 125, "args": { "External id": 174763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174763, "pid": 5, "tid": 7, "ts": 1716454224155829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085175, "dur": 14, "args": { "External id": 174763, "cbid": 211, "correlation": 174763 } }, { "ph": "s", "id": 174763, "pid": 76337, "tid": -914061504, "ts": 1716454224085175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224155956, "dur": 129, "args": { "External id": 174771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174771, "pid": 5, "tid": 7, "ts": 1716454224155956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085210, "dur": 8, "args": { "External id": 174771, "cbid": 211, "correlation": 174771 } }, { "ph": "s", "id": 174771, "pid": 76337, "tid": -914061504, "ts": 1716454224085210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224085289, "dur": 1, "args": { "External id": 174787, "cbid": 251, "correlation": 174787 } }, { "ph": "f", "id": 174787, "pid": 76337, "tid": -914061504, "ts": 1716454224085289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224156085, "dur": 307, "args": { "External id": 174789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174789, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174789, "pid": 5, "tid": 7, "ts": 1716454224156085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085294, "dur": 12, "args": { "External id": 174789, "cbid": 211, "correlation": 174789 } }, { "ph": "s", "id": 174789, "pid": 76337, "tid": -914061504, "ts": 1716454224085294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224156394, "dur": 27, "args": { "External id": 174797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174797, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174797, "pid": 5, "tid": 7, "ts": 1716454224156394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085337, "dur": 9, "args": { "External id": 174797, "cbid": 211, "correlation": 174797 } }, { "ph": "s", "id": 174797, "pid": 76337, "tid": -914061504, "ts": 1716454224085337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224156422, "dur": 82, "args": { "External id": 174808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174808, "pid": 5, "tid": 7, "ts": 1716454224156422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085406, "dur": 12, "args": { "External id": 174808, "cbid": 211, "correlation": 174808 } }, { "ph": "s", "id": 174808, "pid": 76337, "tid": -914061504, "ts": 1716454224085406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224085473, "dur": 0, "args": { "External id": 174820, "cbid": 317, "correlation": 174820 } }, { "ph": "f", "id": 174820, "pid": 76337, "tid": -914061504, "ts": 1716454224085473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224085473, "dur": 0, "args": { "External id": 174821, "cbid": 203, "correlation": 174821 } }, { "ph": "f", "id": 174821, "pid": 76337, "tid": -914061504, "ts": 1716454224085473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224085474, "dur": 0, "args": { "External id": 174822, "cbid": 205, "correlation": 174822 } }, { "ph": "f", "id": 174822, "pid": 76337, "tid": -914061504, "ts": 1716454224085474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224156505, "dur": 23, "args": { "External id": 174826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174826, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174826, "pid": 5, "tid": 7, "ts": 1716454224156505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085490, "dur": 12, "args": { "External id": 174826, "cbid": 211, "correlation": 174826 } }, { "ph": "s", "id": 174826, "pid": 76337, "tid": -914061504, "ts": 1716454224085490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224156529, "dur": 121, "args": { "External id": 174828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174828, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174828, "pid": 5, "tid": 7, "ts": 1716454224156529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085508, "dur": 6, "args": { "External id": 174828, "cbid": 211, "correlation": 174828 } }, { "ph": "s", "id": 174828, "pid": 76337, "tid": -914061504, "ts": 1716454224085508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224156652, "dur": 23, "args": { "External id": 174830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174830, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174830, "pid": 5, "tid": 7, "ts": 1716454224156652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085518, "dur": 5, "args": { "External id": 174830, "cbid": 211, "correlation": 174830 } }, { "ph": "s", "id": 174830, "pid": 76337, "tid": -914061504, "ts": 1716454224085518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224156676, "dur": 33, "args": { "External id": 174836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174836, "pid": 5, "tid": 7, "ts": 1716454224156676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085547, "dur": 9, "args": { "External id": 174836, "cbid": 211, "correlation": 174836 } }, { "ph": "s", "id": 174836, "pid": 76337, "tid": -914061504, "ts": 1716454224085547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224156711, "dur": 26, "args": { "External id": 174844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174844, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174844, "pid": 5, "tid": 7, "ts": 1716454224156711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085579, "dur": 8, "args": { "External id": 174844, "cbid": 211, "correlation": 174844 } }, { "ph": "s", "id": 174844, "pid": 76337, "tid": -914061504, "ts": 1716454224085579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224156738, "dur": 54, "args": { "External id": 174853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174853, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174853, "pid": 5, "tid": 7, "ts": 1716454224156738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085616, "dur": 10, "args": { "External id": 174853, "cbid": 211, "correlation": 174853 } }, { "ph": "s", "id": 174853, "pid": 76337, "tid": -914061504, "ts": 1716454224085616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224156793, "dur": 53, "args": { "External id": 174873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174873, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 174873, "pid": 5, "tid": 7, "ts": 1716454224156793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085692, "dur": 11, "args": { "External id": 174873, "cbid": 211, "correlation": 174873 } }, { "ph": "s", "id": 174873, "pid": 76337, "tid": -914061504, "ts": 1716454224085692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224156848, "dur": 5, "args": { "External id": 174885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174885, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 174885, "pid": 5, "tid": 7, "ts": 1716454224156848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085713, "dur": 6, "args": { "External id": 174885, "cbid": 211, "correlation": 174885 } }, { "ph": "s", "id": 174885, "pid": 76337, "tid": -914061504, "ts": 1716454224085713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224156854, "dur": 57, "args": { "External id": 174888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174888, "pid": 5, "tid": 7, "ts": 1716454224156854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085732, "dur": 6, "args": { "External id": 174888, "cbid": 211, "correlation": 174888 } }, { "ph": "s", "id": 174888, "pid": 76337, "tid": -914061504, "ts": 1716454224085732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224156912, "dur": 37, "args": { "External id": 174897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174897, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174897, "pid": 5, "tid": 7, "ts": 1716454224156912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085770, "dur": 11, "args": { "External id": 174897, "cbid": 211, "correlation": 174897 } }, { "ph": "s", "id": 174897, "pid": 76337, "tid": -914061504, "ts": 1716454224085770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224085825, "dur": 0, "args": { "External id": 174907, "cbid": 317, "correlation": 174907 } }, { "ph": "f", "id": 174907, "pid": 76337, "tid": -914061504, "ts": 1716454224085825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224085826, "dur": 0, "args": { "External id": 174908, "cbid": 203, "correlation": 174908 } }, { "ph": "f", "id": 174908, "pid": 76337, "tid": -914061504, "ts": 1716454224085826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224085827, "dur": 0, "args": { "External id": 174909, "cbid": 205, "correlation": 174909 } }, { "ph": "f", "id": 174909, "pid": 76337, "tid": -914061504, "ts": 1716454224085827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224156950, "dur": 39, "args": { "External id": 174913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174913, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174913, "pid": 5, "tid": 7, "ts": 1716454224156950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085842, "dur": 11, "args": { "External id": 174913, "cbid": 211, "correlation": 174913 } }, { "ph": "s", "id": 174913, "pid": 76337, "tid": -914061504, "ts": 1716454224085842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224156990, "dur": 83, "args": { "External id": 174915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174915, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174915, "pid": 5, "tid": 7, "ts": 1716454224156990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085856, "dur": 5, "args": { "External id": 174915, "cbid": 211, "correlation": 174915 } }, { "ph": "s", "id": 174915, "pid": 76337, "tid": -914061504, "ts": 1716454224085856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224157074, "dur": 1279, "args": { "External id": 174917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174917, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 174917, "pid": 5, "tid": 7, "ts": 1716454224157074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085867, "dur": 6, "args": { "External id": 174917, "cbid": 211, "correlation": 174917 } }, { "ph": "s", "id": 174917, "pid": 76337, "tid": -914061504, "ts": 1716454224085867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224158354, "dur": 21, "args": { "External id": 174919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174919, "pid": 5, "tid": 7, "ts": 1716454224158354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085878, "dur": 6, "args": { "External id": 174919, "cbid": 211, "correlation": 174919 } }, { "ph": "s", "id": 174919, "pid": 76337, "tid": -914061504, "ts": 1716454224085878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224158377, "dur": 33, "args": { "External id": 174925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174925, "pid": 5, "tid": 7, "ts": 1716454224158377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085906, "dur": 8, "args": { "External id": 174925, "cbid": 211, "correlation": 174925 } }, { "ph": "s", "id": 174925, "pid": 76337, "tid": -914061504, "ts": 1716454224085906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224158411, "dur": 4, "args": { "External id": 174933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174933, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 174933, "pid": 5, "tid": 7, "ts": 1716454224158411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224085950, "dur": 9, "args": { "External id": 174933, "cbid": 211, "correlation": 174933 } }, { "ph": "s", "id": 174933, "pid": 76337, "tid": -914061504, "ts": 1716454224085950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224086024, "dur": 1, "args": { "External id": 174949, "cbid": 251, "correlation": 174949 } }, { "ph": "f", "id": 174949, "pid": 76337, "tid": -914061504, "ts": 1716454224086024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224086029, "dur": 0, "args": { "External id": 174951, "cbid": 251, "correlation": 174951 } }, { "ph": "f", "id": 174951, "pid": 76337, "tid": -914061504, "ts": 1716454224086029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224158416, "dur": 12, "args": { "External id": 174952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174952, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 174952, "pid": 5, "tid": 7, "ts": 1716454224158416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086031, "dur": 12, "args": { "External id": 174952, "cbid": 211, "correlation": 174952 } }, { "ph": "s", "id": 174952, "pid": 76337, "tid": -914061504, "ts": 1716454224086031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224158430, "dur": 5, "args": { "External id": 174954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174954, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 174954, "pid": 5, "tid": 7, "ts": 1716454224158430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086045, "dur": 5, "args": { "External id": 174954, "cbid": 211, "correlation": 174954 } }, { "ph": "s", "id": 174954, "pid": 76337, "tid": -914061504, "ts": 1716454224086045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224158436, "dur": 29, "args": { "External id": 174964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174964, "pid": 5, "tid": 7, "ts": 1716454224158436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086103, "dur": 12, "args": { "External id": 174964, "cbid": 211, "correlation": 174964 } }, { "ph": "s", "id": 174964, "pid": 76337, "tid": -914061504, "ts": 1716454224086103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224158467, "dur": 31, "args": { "External id": 174984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174984, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 174984, "pid": 5, "tid": 7, "ts": 1716454224158467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086171, "dur": 11, "args": { "External id": 174984, "cbid": 211, "correlation": 174984 } }, { "ph": "s", "id": 174984, "pid": 76337, "tid": -914061504, "ts": 1716454224086171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224158499, "dur": 4, "args": { "External id": 174996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174996, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 174996, "pid": 5, "tid": 7, "ts": 1716454224158499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086192, "dur": 6, "args": { "External id": 174996, "cbid": 211, "correlation": 174996 } }, { "ph": "s", "id": 174996, "pid": 76337, "tid": -914061504, "ts": 1716454224086192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224158504, "dur": 30, "args": { "External id": 174999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 174999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 174999, "pid": 5, "tid": 7, "ts": 1716454224158504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086211, "dur": 7, "args": { "External id": 174999, "cbid": 211, "correlation": 174999 } }, { "ph": "s", "id": 174999, "pid": 76337, "tid": -914061504, "ts": 1716454224086211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224158535, "dur": 21, "args": { "External id": 175008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175008, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175008, "pid": 5, "tid": 7, "ts": 1716454224158535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086252, "dur": 10, "args": { "External id": 175008, "cbid": 211, "correlation": 175008 } }, { "ph": "s", "id": 175008, "pid": 76337, "tid": -914061504, "ts": 1716454224086252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224086318, "dur": 0, "args": { "External id": 175018, "cbid": 317, "correlation": 175018 } }, { "ph": "f", "id": 175018, "pid": 76337, "tid": -914061504, "ts": 1716454224086318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224086319, "dur": 0, "args": { "External id": 175019, "cbid": 203, "correlation": 175019 } }, { "ph": "f", "id": 175019, "pid": 76337, "tid": -914061504, "ts": 1716454224086319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224086320, "dur": 0, "args": { "External id": 175020, "cbid": 205, "correlation": 175020 } }, { "ph": "f", "id": 175020, "pid": 76337, "tid": -914061504, "ts": 1716454224086320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224158558, "dur": 22, "args": { "External id": 175024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175024, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175024, "pid": 5, "tid": 7, "ts": 1716454224158558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086334, "dur": 15, "args": { "External id": 175024, "cbid": 211, "correlation": 175024 } }, { "ph": "s", "id": 175024, "pid": 76337, "tid": -914061504, "ts": 1716454224086334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224158581, "dur": 44, "args": { "External id": 175026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175026, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175026, "pid": 5, "tid": 7, "ts": 1716454224158581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086351, "dur": 5, "args": { "External id": 175026, "cbid": 211, "correlation": 175026 } }, { "ph": "s", "id": 175026, "pid": 76337, "tid": -914061504, "ts": 1716454224086351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224158625, "dur": 646, "args": { "External id": 175028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175028, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175028, "pid": 5, "tid": 7, "ts": 1716454224158625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086362, "dur": 6, "args": { "External id": 175028, "cbid": 211, "correlation": 175028 } }, { "ph": "s", "id": 175028, "pid": 76337, "tid": -914061504, "ts": 1716454224086362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224159272, "dur": 21, "args": { "External id": 175030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175030, "pid": 5, "tid": 7, "ts": 1716454224159272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086372, "dur": 5, "args": { "External id": 175030, "cbid": 211, "correlation": 175030 } }, { "ph": "s", "id": 175030, "pid": 76337, "tid": -914061504, "ts": 1716454224086372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224159295, "dur": 33, "args": { "External id": 175036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175036, "pid": 5, "tid": 7, "ts": 1716454224159295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086400, "dur": 8, "args": { "External id": 175036, "cbid": 211, "correlation": 175036 } }, { "ph": "s", "id": 175036, "pid": 76337, "tid": -914061504, "ts": 1716454224086400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224086461, "dur": 0, "args": { "External id": 175046, "cbid": 317, "correlation": 175046 } }, { "ph": "f", "id": 175046, "pid": 76337, "tid": -914061504, "ts": 1716454224086461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224086462, "dur": 0, "args": { "External id": 175047, "cbid": 203, "correlation": 175047 } }, { "ph": "f", "id": 175047, "pid": 76337, "tid": -914061504, "ts": 1716454224086462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224086462, "dur": 0, "args": { "External id": 175048, "cbid": 205, "correlation": 175048 } }, { "ph": "f", "id": 175048, "pid": 76337, "tid": -914061504, "ts": 1716454224086462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224159329, "dur": 39, "args": { "External id": 175052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175052, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175052, "pid": 5, "tid": 7, "ts": 1716454224159329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086475, "dur": 13, "args": { "External id": 175052, "cbid": 211, "correlation": 175052 } }, { "ph": "s", "id": 175052, "pid": 76337, "tid": -914061504, "ts": 1716454224086475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224159370, "dur": 191, "args": { "External id": 175054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175054, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175054, "pid": 5, "tid": 7, "ts": 1716454224159370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086493, "dur": 6, "args": { "External id": 175054, "cbid": 211, "correlation": 175054 } }, { "ph": "s", "id": 175054, "pid": 76337, "tid": -914061504, "ts": 1716454224086493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224159562, "dur": 22, "args": { "External id": 175056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175056, "pid": 5, "tid": 7, "ts": 1716454224159562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086502, "dur": 5, "args": { "External id": 175056, "cbid": 211, "correlation": 175056 } }, { "ph": "s", "id": 175056, "pid": 76337, "tid": -914061504, "ts": 1716454224086502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224159585, "dur": 32, "args": { "External id": 175062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175062, "pid": 5, "tid": 7, "ts": 1716454224159585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086528, "dur": 9, "args": { "External id": 175062, "cbid": 211, "correlation": 175062 } }, { "ph": "s", "id": 175062, "pid": 76337, "tid": -914061504, "ts": 1716454224086528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224159619, "dur": 27, "args": { "External id": 175070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175070, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175070, "pid": 5, "tid": 7, "ts": 1716454224159619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086558, "dur": 8, "args": { "External id": 175070, "cbid": 211, "correlation": 175070 } }, { "ph": "s", "id": 175070, "pid": 76337, "tid": -914061504, "ts": 1716454224086558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224159647, "dur": 20, "args": { "External id": 175078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175078, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175078, "pid": 5, "tid": 7, "ts": 1716454224159647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086587, "dur": 8, "args": { "External id": 175078, "cbid": 211, "correlation": 175078 } }, { "ph": "s", "id": 175078, "pid": 76337, "tid": -914061504, "ts": 1716454224086587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224159668, "dur": 30, "args": { "External id": 175098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175098, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 175098, "pid": 5, "tid": 7, "ts": 1716454224159668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086673, "dur": 13, "args": { "External id": 175098, "cbid": 211, "correlation": 175098 } }, { "ph": "s", "id": 175098, "pid": 76337, "tid": -914061504, "ts": 1716454224086673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224159700, "dur": 4, "args": { "External id": 175110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175110, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 175110, "pid": 5, "tid": 7, "ts": 1716454224159700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086696, "dur": 6, "args": { "External id": 175110, "cbid": 211, "correlation": 175110 } }, { "ph": "s", "id": 175110, "pid": 76337, "tid": -914061504, "ts": 1716454224086696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224159705, "dur": 30, "args": { "External id": 175113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175113, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175113, "pid": 5, "tid": 7, "ts": 1716454224159705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086713, "dur": 6, "args": { "External id": 175113, "cbid": 211, "correlation": 175113 } }, { "ph": "s", "id": 175113, "pid": 76337, "tid": -914061504, "ts": 1716454224086713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224086773, "dur": 0, "args": { "External id": 175124, "cbid": 317, "correlation": 175124 } }, { "ph": "f", "id": 175124, "pid": 76337, "tid": -914061504, "ts": 1716454224086773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224086773, "dur": 0, "args": { "External id": 175125, "cbid": 203, "correlation": 175125 } }, { "ph": "f", "id": 175125, "pid": 76337, "tid": -914061504, "ts": 1716454224086773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224086774, "dur": 0, "args": { "External id": 175126, "cbid": 205, "correlation": 175126 } }, { "ph": "f", "id": 175126, "pid": 76337, "tid": -914061504, "ts": 1716454224086774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224159737, "dur": 23, "args": { "External id": 175130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175130, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175130, "pid": 5, "tid": 7, "ts": 1716454224159737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086789, "dur": 12, "args": { "External id": 175130, "cbid": 211, "correlation": 175130 } }, { "ph": "s", "id": 175130, "pid": 76337, "tid": -914061504, "ts": 1716454224086789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224159761, "dur": 106, "args": { "External id": 175132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175132, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175132, "pid": 5, "tid": 7, "ts": 1716454224159761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086807, "dur": 6, "args": { "External id": 175132, "cbid": 211, "correlation": 175132 } }, { "ph": "s", "id": 175132, "pid": 76337, "tid": -914061504, "ts": 1716454224086807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224159868, "dur": 23, "args": { "External id": 175134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175134, "pid": 5, "tid": 7, "ts": 1716454224159868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086817, "dur": 5, "args": { "External id": 175134, "cbid": 211, "correlation": 175134 } }, { "ph": "s", "id": 175134, "pid": 76337, "tid": -914061504, "ts": 1716454224086817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224159892, "dur": 33, "args": { "External id": 175140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175140, "pid": 5, "tid": 7, "ts": 1716454224159892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086844, "dur": 8, "args": { "External id": 175140, "cbid": 211, "correlation": 175140 } }, { "ph": "s", "id": 175140, "pid": 76337, "tid": -914061504, "ts": 1716454224086844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224159926, "dur": 164, "args": { "External id": 175149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175149, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175149, "pid": 5, "tid": 7, "ts": 1716454224159926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086931, "dur": 15, "args": { "External id": 175149, "cbid": 211, "correlation": 175149 } }, { "ph": "s", "id": 175149, "pid": 76337, "tid": -914061504, "ts": 1716454224086931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224160091, "dur": 64, "args": { "External id": 175171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175171, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175171, "pid": 5, "tid": 7, "ts": 1716454224160091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224086999, "dur": 11, "args": { "External id": 175171, "cbid": 211, "correlation": 175171 } }, { "ph": "s", "id": 175171, "pid": 76337, "tid": -914061504, "ts": 1716454224086999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087093, "dur": 1, "args": { "External id": 175182, "cbid": 251, "correlation": 175182 } }, { "ph": "f", "id": 175182, "pid": 76337, "tid": -914061504, "ts": 1716454224087093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224160157, "dur": 152, "args": { "External id": 175183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175183, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175183, "pid": 5, "tid": 7, "ts": 1716454224160157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087098, "dur": 13, "args": { "External id": 175183, "cbid": 211, "correlation": 175183 } }, { "ph": "s", "id": 175183, "pid": 76337, "tid": -914061504, "ts": 1716454224087098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087170, "dur": 1, "args": { "External id": 175194, "cbid": 251, "correlation": 175194 } }, { "ph": "f", "id": 175194, "pid": 76337, "tid": -914061504, "ts": 1716454224087170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224160310, "dur": 144, "args": { "External id": 175195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175195, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175195, "pid": 5, "tid": 7, "ts": 1716454224160310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087174, "dur": 12, "args": { "External id": 175195, "cbid": 211, "correlation": 175195 } }, { "ph": "s", "id": 175195, "pid": 76337, "tid": -914061504, "ts": 1716454224087174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087244, "dur": 1, "args": { "External id": 175206, "cbid": 251, "correlation": 175206 } }, { "ph": "f", "id": 175206, "pid": 76337, "tid": -914061504, "ts": 1716454224087244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224160456, "dur": 146, "args": { "External id": 175207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175207, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175207, "pid": 5, "tid": 7, "ts": 1716454224160456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087248, "dur": 12, "args": { "External id": 175207, "cbid": 211, "correlation": 175207 } }, { "ph": "s", "id": 175207, "pid": 76337, "tid": -914061504, "ts": 1716454224087248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224160603, "dur": 1946, "args": { "External id": 175228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175228, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 175228, "pid": 5, "tid": 7, "ts": 1716454224160603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087330, "dur": 13, "args": { "External id": 175228, "cbid": 211, "correlation": 175228 } }, { "ph": "s", "id": 175228, "pid": 76337, "tid": -914061504, "ts": 1716454224087330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087432, "dur": 1, "args": { "External id": 175246, "cbid": 251, "correlation": 175246 } }, { "ph": "f", "id": 175246, "pid": 76337, "tid": -914061504, "ts": 1716454224087432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224162551, "dur": 148, "args": { "External id": 175248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175248, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 175248, "pid": 5, "tid": 7, "ts": 1716454224162551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087438, "dur": 14, "args": { "External id": 175248, "cbid": 211, "correlation": 175248 } }, { "ph": "s", "id": 175248, "pid": 76337, "tid": -914061504, "ts": 1716454224087438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224162700, "dur": 36, "args": { "External id": 175256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175256, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175256, "pid": 5, "tid": 7, "ts": 1716454224162700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087512, "dur": 12, "args": { "External id": 175256, "cbid": 211, "correlation": 175256 } }, { "ph": "s", "id": 175256, "pid": 76337, "tid": -914061504, "ts": 1716454224087512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224162737, "dur": 50, "args": { "External id": 175264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175264, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175264, "pid": 5, "tid": 7, "ts": 1716454224162737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087551, "dur": 8, "args": { "External id": 175264, "cbid": 211, "correlation": 175264 } }, { "ph": "s", "id": 175264, "pid": 76337, "tid": -914061504, "ts": 1716454224087551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224162789, "dur": 29, "args": { "External id": 175275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175275, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175275, "pid": 5, "tid": 7, "ts": 1716454224162789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087622, "dur": 12, "args": { "External id": 175275, "cbid": 211, "correlation": 175275 } }, { "ph": "s", "id": 175275, "pid": 76337, "tid": -914061504, "ts": 1716454224087622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224162819, "dur": 34, "args": { "External id": 175297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175297, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175297, "pid": 5, "tid": 7, "ts": 1716454224162819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087654, "dur": 8, "args": { "External id": 175297, "cbid": 211, "correlation": 175297 } }, { "ph": "s", "id": 175297, "pid": 76337, "tid": -914061504, "ts": 1716454224087654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087744, "dur": 1, "args": { "External id": 175308, "cbid": 251, "correlation": 175308 } }, { "ph": "f", "id": 175308, "pid": 76337, "tid": -914061504, "ts": 1716454224087744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224162855, "dur": 91, "args": { "External id": 175309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175309, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175309, "pid": 5, "tid": 7, "ts": 1716454224162855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087750, "dur": 13, "args": { "External id": 175309, "cbid": 211, "correlation": 175309 } }, { "ph": "s", "id": 175309, "pid": 76337, "tid": -914061504, "ts": 1716454224087750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087822, "dur": 1, "args": { "External id": 175320, "cbid": 251, "correlation": 175320 } }, { "ph": "f", "id": 175320, "pid": 76337, "tid": -914061504, "ts": 1716454224087822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087825, "dur": 0, "args": { "External id": 175321, "cbid": 251, "correlation": 175321 } }, { "ph": "f", "id": 175321, "pid": 76337, "tid": -914061504, "ts": 1716454224087825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224162947, "dur": 11, "args": { "External id": 175322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175322, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 175322, "pid": 5, "tid": 7, "ts": 1716454224162947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087827, "dur": 12, "args": { "External id": 175322, "cbid": 211, "correlation": 175322 } }, { "ph": "s", "id": 175322, "pid": 76337, "tid": -914061504, "ts": 1716454224087827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224162959, "dur": 5, "args": { "External id": 175324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175324, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 175324, "pid": 5, "tid": 7, "ts": 1716454224162959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087841, "dur": 6, "args": { "External id": 175324, "cbid": 211, "correlation": 175324 } }, { "ph": "s", "id": 175324, "pid": 76337, "tid": -914061504, "ts": 1716454224087841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087902, "dur": 1, "args": { "External id": 175335, "cbid": 251, "correlation": 175335 } }, { "ph": "f", "id": 175335, "pid": 76337, "tid": -914061504, "ts": 1716454224087902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224087906, "dur": 0, "args": { "External id": 175336, "cbid": 251, "correlation": 175336 } }, { "ph": "f", "id": 175336, "pid": 76337, "tid": -914061504, "ts": 1716454224087906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224162966, "dur": 7, "args": { "External id": 175337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175337, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 175337, "pid": 5, "tid": 7, "ts": 1716454224162966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087907, "dur": 11, "args": { "External id": 175337, "cbid": 211, "correlation": 175337 } }, { "ph": "s", "id": 175337, "pid": 76337, "tid": -914061504, "ts": 1716454224087907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224162974, "dur": 3, "args": { "External id": 175339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175339, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 175339, "pid": 5, "tid": 7, "ts": 1716454224162974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224087920, "dur": 5, "args": { "External id": 175339, "cbid": 211, "correlation": 175339 } }, { "ph": "s", "id": 175339, "pid": 76337, "tid": -914061504, "ts": 1716454224087920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224162979, "dur": 92, "args": { "External id": 175360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175360, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 175360, "pid": 5, "tid": 7, "ts": 1716454224162979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088002, "dur": 13, "args": { "External id": 175360, "cbid": 211, "correlation": 175360 } }, { "ph": "s", "id": 175360, "pid": 76337, "tid": -914061504, "ts": 1716454224088002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224088104, "dur": 1, "args": { "External id": 175378, "cbid": 251, "correlation": 175378 } }, { "ph": "f", "id": 175378, "pid": 76337, "tid": -914061504, "ts": 1716454224088104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224163072, "dur": 85, "args": { "External id": 175380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175380, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175380, "pid": 5, "tid": 7, "ts": 1716454224163072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088110, "dur": 13, "args": { "External id": 175380, "cbid": 211, "correlation": 175380 } }, { "ph": "s", "id": 175380, "pid": 76337, "tid": -914061504, "ts": 1716454224088110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224163158, "dur": 19, "args": { "External id": 175388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175388, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175388, "pid": 5, "tid": 7, "ts": 1716454224163158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088182, "dur": 12, "args": { "External id": 175388, "cbid": 211, "correlation": 175388 } }, { "ph": "s", "id": 175388, "pid": 76337, "tid": -914061504, "ts": 1716454224088182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224163178, "dur": 38, "args": { "External id": 175396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175396, "pid": 5, "tid": 7, "ts": 1716454224163178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088224, "dur": 9, "args": { "External id": 175396, "cbid": 211, "correlation": 175396 } }, { "ph": "s", "id": 175396, "pid": 76337, "tid": -914061504, "ts": 1716454224088224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224163218, "dur": 35, "args": { "External id": 175418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175418, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175418, "pid": 5, "tid": 7, "ts": 1716454224163218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088278, "dur": 10, "args": { "External id": 175418, "cbid": 211, "correlation": 175418 } }, { "ph": "s", "id": 175418, "pid": 76337, "tid": -914061504, "ts": 1716454224088278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224088372, "dur": 1, "args": { "External id": 175434, "cbid": 251, "correlation": 175434 } }, { "ph": "f", "id": 175434, "pid": 76337, "tid": -914061504, "ts": 1716454224088372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224088377, "dur": 0, "args": { "External id": 175436, "cbid": 251, "correlation": 175436 } }, { "ph": "f", "id": 175436, "pid": 76337, "tid": -914061504, "ts": 1716454224088377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224163254, "dur": 540, "args": { "External id": 175437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175437, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 175437, "pid": 5, "tid": 7, "ts": 1716454224163254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088381, "dur": 12, "args": { "External id": 175437, "cbid": 211, "correlation": 175437 } }, { "ph": "s", "id": 175437, "pid": 76337, "tid": -914061504, "ts": 1716454224088381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224163795, "dur": 124, "args": { "External id": 175445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175445, "pid": 5, "tid": 7, "ts": 1716454224163795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088445, "dur": 13, "args": { "External id": 175445, "cbid": 211, "correlation": 175445 } }, { "ph": "s", "id": 175445, "pid": 76337, "tid": -914061504, "ts": 1716454224088445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224163920, "dur": 128, "args": { "External id": 175453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175453, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175453, "pid": 5, "tid": 7, "ts": 1716454224163920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088476, "dur": 8, "args": { "External id": 175453, "cbid": 211, "correlation": 175453 } }, { "ph": "s", "id": 175453, "pid": 76337, "tid": -914061504, "ts": 1716454224088476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224088553, "dur": 1, "args": { "External id": 175469, "cbid": 251, "correlation": 175469 } }, { "ph": "f", "id": 175469, "pid": 76337, "tid": -914061504, "ts": 1716454224088553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224164049, "dur": 305, "args": { "External id": 175471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175471, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175471, "pid": 5, "tid": 7, "ts": 1716454224164049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088559, "dur": 12, "args": { "External id": 175471, "cbid": 211, "correlation": 175471 } }, { "ph": "s", "id": 175471, "pid": 76337, "tid": -914061504, "ts": 1716454224088559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224164356, "dur": 27, "args": { "External id": 175479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175479, "pid": 5, "tid": 7, "ts": 1716454224164356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088600, "dur": 10, "args": { "External id": 175479, "cbid": 211, "correlation": 175479 } }, { "ph": "s", "id": 175479, "pid": 76337, "tid": -914061504, "ts": 1716454224088600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224164383, "dur": 81, "args": { "External id": 175490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175490, "pid": 5, "tid": 7, "ts": 1716454224164383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088669, "dur": 12, "args": { "External id": 175490, "cbid": 211, "correlation": 175490 } }, { "ph": "s", "id": 175490, "pid": 76337, "tid": -914061504, "ts": 1716454224088669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224088732, "dur": 0, "args": { "External id": 175502, "cbid": 317, "correlation": 175502 } }, { "ph": "f", "id": 175502, "pid": 76337, "tid": -914061504, "ts": 1716454224088732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224088733, "dur": 0, "args": { "External id": 175503, "cbid": 203, "correlation": 175503 } }, { "ph": "f", "id": 175503, "pid": 76337, "tid": -914061504, "ts": 1716454224088733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224088733, "dur": 0, "args": { "External id": 175504, "cbid": 205, "correlation": 175504 } }, { "ph": "f", "id": 175504, "pid": 76337, "tid": -914061504, "ts": 1716454224088733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224164466, "dur": 23, "args": { "External id": 175508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175508, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175508, "pid": 5, "tid": 7, "ts": 1716454224164466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088748, "dur": 13, "args": { "External id": 175508, "cbid": 211, "correlation": 175508 } }, { "ph": "s", "id": 175508, "pid": 76337, "tid": -914061504, "ts": 1716454224088748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224164490, "dur": 121, "args": { "External id": 175510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175510, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175510, "pid": 5, "tid": 7, "ts": 1716454224164490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088767, "dur": 7, "args": { "External id": 175510, "cbid": 211, "correlation": 175510 } }, { "ph": "s", "id": 175510, "pid": 76337, "tid": -914061504, "ts": 1716454224088767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224164612, "dur": 23, "args": { "External id": 175512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175512, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175512, "pid": 5, "tid": 7, "ts": 1716454224164612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088777, "dur": 5, "args": { "External id": 175512, "cbid": 211, "correlation": 175512 } }, { "ph": "s", "id": 175512, "pid": 76337, "tid": -914061504, "ts": 1716454224088777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224164637, "dur": 33, "args": { "External id": 175518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175518, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175518, "pid": 5, "tid": 7, "ts": 1716454224164637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088804, "dur": 9, "args": { "External id": 175518, "cbid": 211, "correlation": 175518 } }, { "ph": "s", "id": 175518, "pid": 76337, "tid": -914061504, "ts": 1716454224088804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224164671, "dur": 26, "args": { "External id": 175526, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175526, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175526, "pid": 5, "tid": 7, "ts": 1716454224164671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088836, "dur": 8, "args": { "External id": 175526, "cbid": 211, "correlation": 175526 } }, { "ph": "s", "id": 175526, "pid": 76337, "tid": -914061504, "ts": 1716454224088836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224164699, "dur": 47, "args": { "External id": 175535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175535, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175535, "pid": 5, "tid": 7, "ts": 1716454224164699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088875, "dur": 10, "args": { "External id": 175535, "cbid": 211, "correlation": 175535 } }, { "ph": "s", "id": 175535, "pid": 76337, "tid": -914061504, "ts": 1716454224088875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224164747, "dur": 41, "args": { "External id": 175555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175555, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 175555, "pid": 5, "tid": 7, "ts": 1716454224164747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088950, "dur": 13, "args": { "External id": 175555, "cbid": 211, "correlation": 175555 } }, { "ph": "s", "id": 175555, "pid": 76337, "tid": -914061504, "ts": 1716454224088950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224164790, "dur": 5, "args": { "External id": 175567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175567, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 175567, "pid": 5, "tid": 7, "ts": 1716454224164790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224088972, "dur": 15, "args": { "External id": 175567, "cbid": 211, "correlation": 175567 } }, { "ph": "s", "id": 175567, "pid": 76337, "tid": -914061504, "ts": 1716454224088972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224164797, "dur": 44, "args": { "External id": 175570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175570, "pid": 5, "tid": 7, "ts": 1716454224164797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089000, "dur": 7, "args": { "External id": 175570, "cbid": 211, "correlation": 175570 } }, { "ph": "s", "id": 175570, "pid": 76337, "tid": -914061504, "ts": 1716454224089000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224164843, "dur": 29, "args": { "External id": 175579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175579, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175579, "pid": 5, "tid": 7, "ts": 1716454224164843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089041, "dur": 10, "args": { "External id": 175579, "cbid": 211, "correlation": 175579 } }, { "ph": "s", "id": 175579, "pid": 76337, "tid": -914061504, "ts": 1716454224089041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224089093, "dur": 0, "args": { "External id": 175589, "cbid": 317, "correlation": 175589 } }, { "ph": "f", "id": 175589, "pid": 76337, "tid": -914061504, "ts": 1716454224089093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224089094, "dur": 0, "args": { "External id": 175590, "cbid": 203, "correlation": 175590 } }, { "ph": "f", "id": 175590, "pid": 76337, "tid": -914061504, "ts": 1716454224089094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224089094, "dur": 0, "args": { "External id": 175591, "cbid": 205, "correlation": 175591 } }, { "ph": "f", "id": 175591, "pid": 76337, "tid": -914061504, "ts": 1716454224089094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224164873, "dur": 31, "args": { "External id": 175595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175595, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175595, "pid": 5, "tid": 7, "ts": 1716454224164873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089110, "dur": 11, "args": { "External id": 175595, "cbid": 211, "correlation": 175595 } }, { "ph": "s", "id": 175595, "pid": 76337, "tid": -914061504, "ts": 1716454224089110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224164905, "dur": 63, "args": { "External id": 175597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175597, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175597, "pid": 5, "tid": 7, "ts": 1716454224164905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089123, "dur": 5, "args": { "External id": 175597, "cbid": 211, "correlation": 175597 } }, { "ph": "s", "id": 175597, "pid": 76337, "tid": -914061504, "ts": 1716454224089123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224164970, "dur": 964, "args": { "External id": 175599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175599, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175599, "pid": 5, "tid": 7, "ts": 1716454224164970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089134, "dur": 6, "args": { "External id": 175599, "cbid": 211, "correlation": 175599 } }, { "ph": "s", "id": 175599, "pid": 76337, "tid": -914061504, "ts": 1716454224089134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224165936, "dur": 21, "args": { "External id": 175601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175601, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175601, "pid": 5, "tid": 7, "ts": 1716454224165936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089144, "dur": 5, "args": { "External id": 175601, "cbid": 211, "correlation": 175601 } }, { "ph": "s", "id": 175601, "pid": 76337, "tid": -914061504, "ts": 1716454224089144, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224165958, "dur": 33, "args": { "External id": 175607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175607, "pid": 5, "tid": 7, "ts": 1716454224165958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089172, "dur": 8, "args": { "External id": 175607, "cbid": 211, "correlation": 175607 } }, { "ph": "s", "id": 175607, "pid": 76337, "tid": -914061504, "ts": 1716454224089172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224165992, "dur": 4, "args": { "External id": 175615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175615, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 175615, "pid": 5, "tid": 7, "ts": 1716454224165992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089215, "dur": 9, "args": { "External id": 175615, "cbid": 211, "correlation": 175615 } }, { "ph": "s", "id": 175615, "pid": 76337, "tid": -914061504, "ts": 1716454224089215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224089284, "dur": 1, "args": { "External id": 175631, "cbid": 251, "correlation": 175631 } }, { "ph": "f", "id": 175631, "pid": 76337, "tid": -914061504, "ts": 1716454224089284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224089289, "dur": 0, "args": { "External id": 175633, "cbid": 251, "correlation": 175633 } }, { "ph": "f", "id": 175633, "pid": 76337, "tid": -914061504, "ts": 1716454224089289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224165998, "dur": 12, "args": { "External id": 175634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175634, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 175634, "pid": 5, "tid": 7, "ts": 1716454224165998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089291, "dur": 11, "args": { "External id": 175634, "cbid": 211, "correlation": 175634 } }, { "ph": "s", "id": 175634, "pid": 76337, "tid": -914061504, "ts": 1716454224089291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224166012, "dur": 5, "args": { "External id": 175636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175636, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 175636, "pid": 5, "tid": 7, "ts": 1716454224166012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089304, "dur": 6, "args": { "External id": 175636, "cbid": 211, "correlation": 175636 } }, { "ph": "s", "id": 175636, "pid": 76337, "tid": -914061504, "ts": 1716454224089304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224166018, "dur": 30, "args": { "External id": 175646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175646, "pid": 5, "tid": 7, "ts": 1716454224166018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089362, "dur": 12, "args": { "External id": 175646, "cbid": 211, "correlation": 175646 } }, { "ph": "s", "id": 175646, "pid": 76337, "tid": -914061504, "ts": 1716454224089362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224166049, "dur": 31, "args": { "External id": 175666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175666, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 175666, "pid": 5, "tid": 7, "ts": 1716454224166049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089430, "dur": 11, "args": { "External id": 175666, "cbid": 211, "correlation": 175666 } }, { "ph": "s", "id": 175666, "pid": 76337, "tid": -914061504, "ts": 1716454224089430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224166082, "dur": 4, "args": { "External id": 175678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175678, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 175678, "pid": 5, "tid": 7, "ts": 1716454224166082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089450, "dur": 6, "args": { "External id": 175678, "cbid": 211, "correlation": 175678 } }, { "ph": "s", "id": 175678, "pid": 76337, "tid": -914061504, "ts": 1716454224089450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224166087, "dur": 30, "args": { "External id": 175681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175681, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175681, "pid": 5, "tid": 7, "ts": 1716454224166087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089470, "dur": 7, "args": { "External id": 175681, "cbid": 211, "correlation": 175681 } }, { "ph": "s", "id": 175681, "pid": 76337, "tid": -914061504, "ts": 1716454224089470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224166119, "dur": 21, "args": { "External id": 175690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175690, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175690, "pid": 5, "tid": 7, "ts": 1716454224166119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089510, "dur": 10, "args": { "External id": 175690, "cbid": 211, "correlation": 175690 } }, { "ph": "s", "id": 175690, "pid": 76337, "tid": -914061504, "ts": 1716454224089510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224089576, "dur": 0, "args": { "External id": 175700, "cbid": 317, "correlation": 175700 } }, { "ph": "f", "id": 175700, "pid": 76337, "tid": -914061504, "ts": 1716454224089576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224089577, "dur": 0, "args": { "External id": 175701, "cbid": 203, "correlation": 175701 } }, { "ph": "f", "id": 175701, "pid": 76337, "tid": -914061504, "ts": 1716454224089577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224089578, "dur": 0, "args": { "External id": 175702, "cbid": 205, "correlation": 175702 } }, { "ph": "f", "id": 175702, "pid": 76337, "tid": -914061504, "ts": 1716454224089578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224166140, "dur": 22, "args": { "External id": 175706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175706, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175706, "pid": 5, "tid": 7, "ts": 1716454224166140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089592, "dur": 12, "args": { "External id": 175706, "cbid": 211, "correlation": 175706 } }, { "ph": "s", "id": 175706, "pid": 76337, "tid": -914061504, "ts": 1716454224089592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224166164, "dur": 45, "args": { "External id": 175708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175708, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175708, "pid": 5, "tid": 7, "ts": 1716454224166164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089606, "dur": 6, "args": { "External id": 175708, "cbid": 211, "correlation": 175708 } }, { "ph": "s", "id": 175708, "pid": 76337, "tid": -914061504, "ts": 1716454224089606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224166210, "dur": 644, "args": { "External id": 175710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175710, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175710, "pid": 5, "tid": 7, "ts": 1716454224166210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089619, "dur": 6, "args": { "External id": 175710, "cbid": 211, "correlation": 175710 } }, { "ph": "s", "id": 175710, "pid": 76337, "tid": -914061504, "ts": 1716454224089619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224166856, "dur": 23, "args": { "External id": 175712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175712, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175712, "pid": 5, "tid": 7, "ts": 1716454224166856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089628, "dur": 5, "args": { "External id": 175712, "cbid": 211, "correlation": 175712 } }, { "ph": "s", "id": 175712, "pid": 76337, "tid": -914061504, "ts": 1716454224089628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224166880, "dur": 34, "args": { "External id": 175718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175718, "pid": 5, "tid": 7, "ts": 1716454224166880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089656, "dur": 9, "args": { "External id": 175718, "cbid": 211, "correlation": 175718 } }, { "ph": "s", "id": 175718, "pid": 76337, "tid": -914061504, "ts": 1716454224089656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224089718, "dur": 0, "args": { "External id": 175728, "cbid": 317, "correlation": 175728 } }, { "ph": "f", "id": 175728, "pid": 76337, "tid": -914061504, "ts": 1716454224089718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224089719, "dur": 0, "args": { "External id": 175729, "cbid": 203, "correlation": 175729 } }, { "ph": "f", "id": 175729, "pid": 76337, "tid": -914061504, "ts": 1716454224089719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224089720, "dur": 0, "args": { "External id": 175730, "cbid": 205, "correlation": 175730 } }, { "ph": "f", "id": 175730, "pid": 76337, "tid": -914061504, "ts": 1716454224089720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224166915, "dur": 31, "args": { "External id": 175734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175734, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175734, "pid": 5, "tid": 7, "ts": 1716454224166915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089733, "dur": 12, "args": { "External id": 175734, "cbid": 211, "correlation": 175734 } }, { "ph": "s", "id": 175734, "pid": 76337, "tid": -914061504, "ts": 1716454224089733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224166948, "dur": 154, "args": { "External id": 175736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175736, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175736, "pid": 5, "tid": 7, "ts": 1716454224166948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089751, "dur": 6, "args": { "External id": 175736, "cbid": 211, "correlation": 175736 } }, { "ph": "s", "id": 175736, "pid": 76337, "tid": -914061504, "ts": 1716454224089751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224167103, "dur": 22, "args": { "External id": 175738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175738, "pid": 5, "tid": 7, "ts": 1716454224167103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089761, "dur": 6, "args": { "External id": 175738, "cbid": 211, "correlation": 175738 } }, { "ph": "s", "id": 175738, "pid": 76337, "tid": -914061504, "ts": 1716454224089761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224167126, "dur": 33, "args": { "External id": 175744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175744, "pid": 5, "tid": 7, "ts": 1716454224167126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089787, "dur": 8, "args": { "External id": 175744, "cbid": 211, "correlation": 175744 } }, { "ph": "s", "id": 175744, "pid": 76337, "tid": -914061504, "ts": 1716454224089787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224167160, "dur": 27, "args": { "External id": 175752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175752, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175752, "pid": 5, "tid": 7, "ts": 1716454224167160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089818, "dur": 8, "args": { "External id": 175752, "cbid": 211, "correlation": 175752 } }, { "ph": "s", "id": 175752, "pid": 76337, "tid": -914061504, "ts": 1716454224089818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224167188, "dur": 20, "args": { "External id": 175760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175760, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175760, "pid": 5, "tid": 7, "ts": 1716454224167188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089846, "dur": 8, "args": { "External id": 175760, "cbid": 211, "correlation": 175760 } }, { "ph": "s", "id": 175760, "pid": 76337, "tid": -914061504, "ts": 1716454224089846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224167210, "dur": 30, "args": { "External id": 175780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175780, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 175780, "pid": 5, "tid": 7, "ts": 1716454224167210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089931, "dur": 13, "args": { "External id": 175780, "cbid": 211, "correlation": 175780 } }, { "ph": "s", "id": 175780, "pid": 76337, "tid": -914061504, "ts": 1716454224089931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224167241, "dur": 4, "args": { "External id": 175792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175792, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 175792, "pid": 5, "tid": 7, "ts": 1716454224167241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089953, "dur": 6, "args": { "External id": 175792, "cbid": 211, "correlation": 175792 } }, { "ph": "s", "id": 175792, "pid": 76337, "tid": -914061504, "ts": 1716454224089953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224167247, "dur": 31, "args": { "External id": 175795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175795, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175795, "pid": 5, "tid": 7, "ts": 1716454224167247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224089971, "dur": 16, "args": { "External id": 175795, "cbid": 211, "correlation": 175795 } }, { "ph": "s", "id": 175795, "pid": 76337, "tid": -914061504, "ts": 1716454224089971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224090043, "dur": 0, "args": { "External id": 175806, "cbid": 317, "correlation": 175806 } }, { "ph": "f", "id": 175806, "pid": 76337, "tid": -914061504, "ts": 1716454224090043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224090043, "dur": 0, "args": { "External id": 175807, "cbid": 203, "correlation": 175807 } }, { "ph": "f", "id": 175807, "pid": 76337, "tid": -914061504, "ts": 1716454224090043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224090044, "dur": 0, "args": { "External id": 175808, "cbid": 205, "correlation": 175808 } }, { "ph": "f", "id": 175808, "pid": 76337, "tid": -914061504, "ts": 1716454224090044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224167280, "dur": 22, "args": { "External id": 175812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175812, "pid": 5, "tid": 7, "ts": 1716454224167280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090057, "dur": 13, "args": { "External id": 175812, "cbid": 211, "correlation": 175812 } }, { "ph": "s", "id": 175812, "pid": 76337, "tid": -914061504, "ts": 1716454224090057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224167303, "dur": 105, "args": { "External id": 175814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175814, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175814, "pid": 5, "tid": 7, "ts": 1716454224167303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090076, "dur": 6, "args": { "External id": 175814, "cbid": 211, "correlation": 175814 } }, { "ph": "s", "id": 175814, "pid": 76337, "tid": -914061504, "ts": 1716454224090076, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224167410, "dur": 21, "args": { "External id": 175816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175816, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175816, "pid": 5, "tid": 7, "ts": 1716454224167410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090085, "dur": 5, "args": { "External id": 175816, "cbid": 211, "correlation": 175816 } }, { "ph": "s", "id": 175816, "pid": 76337, "tid": -914061504, "ts": 1716454224090085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224167432, "dur": 32, "args": { "External id": 175822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175822, "pid": 5, "tid": 7, "ts": 1716454224167432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090112, "dur": 9, "args": { "External id": 175822, "cbid": 211, "correlation": 175822 } }, { "ph": "s", "id": 175822, "pid": 76337, "tid": -914061504, "ts": 1716454224090112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224167466, "dur": 195, "args": { "External id": 175831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175831, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175831, "pid": 5, "tid": 7, "ts": 1716454224167466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090208, "dur": 16, "args": { "External id": 175831, "cbid": 211, "correlation": 175831 } }, { "ph": "s", "id": 175831, "pid": 76337, "tid": -914061504, "ts": 1716454224090208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224167662, "dur": 65, "args": { "External id": 175853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175853, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175853, "pid": 5, "tid": 7, "ts": 1716454224167662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090269, "dur": 10, "args": { "External id": 175853, "cbid": 211, "correlation": 175853 } }, { "ph": "s", "id": 175853, "pid": 76337, "tid": -914061504, "ts": 1716454224090269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224090359, "dur": 1, "args": { "External id": 175864, "cbid": 251, "correlation": 175864 } }, { "ph": "f", "id": 175864, "pid": 76337, "tid": -914061504, "ts": 1716454224090359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224167728, "dur": 151, "args": { "External id": 175865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175865, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175865, "pid": 5, "tid": 7, "ts": 1716454224167728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090364, "dur": 14, "args": { "External id": 175865, "cbid": 211, "correlation": 175865 } }, { "ph": "s", "id": 175865, "pid": 76337, "tid": -914061504, "ts": 1716454224090364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224090438, "dur": 1, "args": { "External id": 175876, "cbid": 251, "correlation": 175876 } }, { "ph": "f", "id": 175876, "pid": 76337, "tid": -914061504, "ts": 1716454224090438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224167881, "dur": 144, "args": { "External id": 175877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175877, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175877, "pid": 5, "tid": 7, "ts": 1716454224167881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090442, "dur": 12, "args": { "External id": 175877, "cbid": 211, "correlation": 175877 } }, { "ph": "s", "id": 175877, "pid": 76337, "tid": -914061504, "ts": 1716454224090442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224090511, "dur": 1, "args": { "External id": 175888, "cbid": 251, "correlation": 175888 } }, { "ph": "f", "id": 175888, "pid": 76337, "tid": -914061504, "ts": 1716454224090511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224168027, "dur": 143, "args": { "External id": 175889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175889, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175889, "pid": 5, "tid": 7, "ts": 1716454224168027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090515, "dur": 12, "args": { "External id": 175889, "cbid": 211, "correlation": 175889 } }, { "ph": "s", "id": 175889, "pid": 76337, "tid": -914061504, "ts": 1716454224090515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224168171, "dur": 1950, "args": { "External id": 175910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175910, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 175910, "pid": 5, "tid": 7, "ts": 1716454224168171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090598, "dur": 12, "args": { "External id": 175910, "cbid": 211, "correlation": 175910 } }, { "ph": "s", "id": 175910, "pid": 76337, "tid": -914061504, "ts": 1716454224090598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224090700, "dur": 1, "args": { "External id": 175928, "cbid": 251, "correlation": 175928 } }, { "ph": "f", "id": 175928, "pid": 76337, "tid": -914061504, "ts": 1716454224090700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224170123, "dur": 148, "args": { "External id": 175930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175930, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 175930, "pid": 5, "tid": 7, "ts": 1716454224170123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090705, "dur": 14, "args": { "External id": 175930, "cbid": 211, "correlation": 175930 } }, { "ph": "s", "id": 175930, "pid": 76337, "tid": -914061504, "ts": 1716454224090705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224170272, "dur": 36, "args": { "External id": 175938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175938, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175938, "pid": 5, "tid": 7, "ts": 1716454224170272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090780, "dur": 12, "args": { "External id": 175938, "cbid": 211, "correlation": 175938 } }, { "ph": "s", "id": 175938, "pid": 76337, "tid": -914061504, "ts": 1716454224090780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224170309, "dur": 50, "args": { "External id": 175946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175946, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175946, "pid": 5, "tid": 7, "ts": 1716454224170309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090819, "dur": 10, "args": { "External id": 175946, "cbid": 211, "correlation": 175946 } }, { "ph": "s", "id": 175946, "pid": 76337, "tid": -914061504, "ts": 1716454224090819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224170360, "dur": 30, "args": { "External id": 175957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175957, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175957, "pid": 5, "tid": 7, "ts": 1716454224170360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090891, "dur": 13, "args": { "External id": 175957, "cbid": 211, "correlation": 175957 } }, { "ph": "s", "id": 175957, "pid": 76337, "tid": -914061504, "ts": 1716454224090891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224170392, "dur": 35, "args": { "External id": 175979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175979, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 175979, "pid": 5, "tid": 7, "ts": 1716454224170392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224090922, "dur": 8, "args": { "External id": 175979, "cbid": 211, "correlation": 175979 } }, { "ph": "s", "id": 175979, "pid": 76337, "tid": -914061504, "ts": 1716454224090922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091018, "dur": 1, "args": { "External id": 175990, "cbid": 251, "correlation": 175990 } }, { "ph": "f", "id": 175990, "pid": 76337, "tid": -914061504, "ts": 1716454224091018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224170428, "dur": 91, "args": { "External id": 175991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 175991, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 175991, "pid": 5, "tid": 7, "ts": 1716454224170428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091023, "dur": 14, "args": { "External id": 175991, "cbid": 211, "correlation": 175991 } }, { "ph": "s", "id": 175991, "pid": 76337, "tid": -914061504, "ts": 1716454224091023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091097, "dur": 1, "args": { "External id": 176002, "cbid": 251, "correlation": 176002 } }, { "ph": "f", "id": 176002, "pid": 76337, "tid": -914061504, "ts": 1716454224091097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091101, "dur": 0, "args": { "External id": 176003, "cbid": 251, "correlation": 176003 } }, { "ph": "f", "id": 176003, "pid": 76337, "tid": -914061504, "ts": 1716454224091101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224170520, "dur": 11, "args": { "External id": 176004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176004, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 176004, "pid": 5, "tid": 7, "ts": 1716454224170520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091102, "dur": 12, "args": { "External id": 176004, "cbid": 211, "correlation": 176004 } }, { "ph": "s", "id": 176004, "pid": 76337, "tid": -914061504, "ts": 1716454224091102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224170532, "dur": 5, "args": { "External id": 176006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176006, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 176006, "pid": 5, "tid": 7, "ts": 1716454224170532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091116, "dur": 6, "args": { "External id": 176006, "cbid": 211, "correlation": 176006 } }, { "ph": "s", "id": 176006, "pid": 76337, "tid": -914061504, "ts": 1716454224091116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091177, "dur": 1, "args": { "External id": 176017, "cbid": 251, "correlation": 176017 } }, { "ph": "f", "id": 176017, "pid": 76337, "tid": -914061504, "ts": 1716454224091177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091181, "dur": 0, "args": { "External id": 176018, "cbid": 251, "correlation": 176018 } }, { "ph": "f", "id": 176018, "pid": 76337, "tid": -914061504, "ts": 1716454224091181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224170538, "dur": 7, "args": { "External id": 176019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176019, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 176019, "pid": 5, "tid": 7, "ts": 1716454224170538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091183, "dur": 12, "args": { "External id": 176019, "cbid": 211, "correlation": 176019 } }, { "ph": "s", "id": 176019, "pid": 76337, "tid": -914061504, "ts": 1716454224091183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224170547, "dur": 3, "args": { "External id": 176021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176021, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 176021, "pid": 5, "tid": 7, "ts": 1716454224170547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091196, "dur": 5, "args": { "External id": 176021, "cbid": 211, "correlation": 176021 } }, { "ph": "s", "id": 176021, "pid": 76337, "tid": -914061504, "ts": 1716454224091196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224170551, "dur": 91, "args": { "External id": 176042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176042, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176042, "pid": 5, "tid": 7, "ts": 1716454224170551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091269, "dur": 14, "args": { "External id": 176042, "cbid": 211, "correlation": 176042 } }, { "ph": "s", "id": 176042, "pid": 76337, "tid": -914061504, "ts": 1716454224091269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091369, "dur": 1, "args": { "External id": 176060, "cbid": 251, "correlation": 176060 } }, { "ph": "f", "id": 176060, "pid": 76337, "tid": -914061504, "ts": 1716454224091369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224170644, "dur": 84, "args": { "External id": 176062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176062, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176062, "pid": 5, "tid": 7, "ts": 1716454224170644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091375, "dur": 14, "args": { "External id": 176062, "cbid": 211, "correlation": 176062 } }, { "ph": "s", "id": 176062, "pid": 76337, "tid": -914061504, "ts": 1716454224091375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224170729, "dur": 20, "args": { "External id": 176070, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176070, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176070, "pid": 5, "tid": 7, "ts": 1716454224170729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091448, "dur": 12, "args": { "External id": 176070, "cbid": 211, "correlation": 176070 } }, { "ph": "s", "id": 176070, "pid": 76337, "tid": -914061504, "ts": 1716454224091448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224170750, "dur": 38, "args": { "External id": 176078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176078, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176078, "pid": 5, "tid": 7, "ts": 1716454224170750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091490, "dur": 9, "args": { "External id": 176078, "cbid": 211, "correlation": 176078 } }, { "ph": "s", "id": 176078, "pid": 76337, "tid": -914061504, "ts": 1716454224091490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224170789, "dur": 35, "args": { "External id": 176100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176100, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176100, "pid": 5, "tid": 7, "ts": 1716454224170789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091542, "dur": 10, "args": { "External id": 176100, "cbid": 211, "correlation": 176100 } }, { "ph": "s", "id": 176100, "pid": 76337, "tid": -914061504, "ts": 1716454224091542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091638, "dur": 1, "args": { "External id": 176116, "cbid": 251, "correlation": 176116 } }, { "ph": "f", "id": 176116, "pid": 76337, "tid": -914061504, "ts": 1716454224091638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091643, "dur": 0, "args": { "External id": 176118, "cbid": 251, "correlation": 176118 } }, { "ph": "f", "id": 176118, "pid": 76337, "tid": -914061504, "ts": 1716454224091643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224170825, "dur": 539, "args": { "External id": 176119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176119, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 176119, "pid": 5, "tid": 7, "ts": 1716454224170825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091647, "dur": 13, "args": { "External id": 176119, "cbid": 211, "correlation": 176119 } }, { "ph": "s", "id": 176119, "pid": 76337, "tid": -914061504, "ts": 1716454224091647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224171366, "dur": 126, "args": { "External id": 176127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176127, "pid": 5, "tid": 7, "ts": 1716454224171366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091711, "dur": 13, "args": { "External id": 176127, "cbid": 211, "correlation": 176127 } }, { "ph": "s", "id": 176127, "pid": 76337, "tid": -914061504, "ts": 1716454224091711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224171493, "dur": 128, "args": { "External id": 176135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176135, "pid": 5, "tid": 7, "ts": 1716454224171493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091742, "dur": 8, "args": { "External id": 176135, "cbid": 211, "correlation": 176135 } }, { "ph": "s", "id": 176135, "pid": 76337, "tid": -914061504, "ts": 1716454224091742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224091820, "dur": 1, "args": { "External id": 176151, "cbid": 251, "correlation": 176151 } }, { "ph": "f", "id": 176151, "pid": 76337, "tid": -914061504, "ts": 1716454224091820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224171623, "dur": 307, "args": { "External id": 176153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176153, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176153, "pid": 5, "tid": 7, "ts": 1716454224171623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091826, "dur": 13, "args": { "External id": 176153, "cbid": 211, "correlation": 176153 } }, { "ph": "s", "id": 176153, "pid": 76337, "tid": -914061504, "ts": 1716454224091826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224171931, "dur": 27, "args": { "External id": 176161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176161, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176161, "pid": 5, "tid": 7, "ts": 1716454224171931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091869, "dur": 10, "args": { "External id": 176161, "cbid": 211, "correlation": 176161 } }, { "ph": "s", "id": 176161, "pid": 76337, "tid": -914061504, "ts": 1716454224091869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224171959, "dur": 81, "args": { "External id": 176172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176172, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176172, "pid": 5, "tid": 7, "ts": 1716454224171959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224091934, "dur": 13, "args": { "External id": 176172, "cbid": 211, "correlation": 176172 } }, { "ph": "s", "id": 176172, "pid": 76337, "tid": -914061504, "ts": 1716454224091934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224092008, "dur": 0, "args": { "External id": 176184, "cbid": 317, "correlation": 176184 } }, { "ph": "f", "id": 176184, "pid": 76337, "tid": -914061504, "ts": 1716454224092008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224092009, "dur": 0, "args": { "External id": 176185, "cbid": 203, "correlation": 176185 } }, { "ph": "f", "id": 176185, "pid": 76337, "tid": -914061504, "ts": 1716454224092009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224092009, "dur": 0, "args": { "External id": 176186, "cbid": 205, "correlation": 176186 } }, { "ph": "f", "id": 176186, "pid": 76337, "tid": -914061504, "ts": 1716454224092009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224172042, "dur": 22, "args": { "External id": 176190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176190, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176190, "pid": 5, "tid": 7, "ts": 1716454224172042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092024, "dur": 12, "args": { "External id": 176190, "cbid": 211, "correlation": 176190 } }, { "ph": "s", "id": 176190, "pid": 76337, "tid": -914061504, "ts": 1716454224092024, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224172065, "dur": 119, "args": { "External id": 176192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176192, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176192, "pid": 5, "tid": 7, "ts": 1716454224172065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092044, "dur": 6, "args": { "External id": 176192, "cbid": 211, "correlation": 176192 } }, { "ph": "s", "id": 176192, "pid": 76337, "tid": -914061504, "ts": 1716454224092044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224172186, "dur": 22, "args": { "External id": 176194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176194, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176194, "pid": 5, "tid": 7, "ts": 1716454224172186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092054, "dur": 5, "args": { "External id": 176194, "cbid": 211, "correlation": 176194 } }, { "ph": "s", "id": 176194, "pid": 76337, "tid": -914061504, "ts": 1716454224092054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224172210, "dur": 33, "args": { "External id": 176200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176200, "pid": 5, "tid": 7, "ts": 1716454224172210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092081, "dur": 10, "args": { "External id": 176200, "cbid": 211, "correlation": 176200 } }, { "ph": "s", "id": 176200, "pid": 76337, "tid": -914061504, "ts": 1716454224092081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224172244, "dur": 27, "args": { "External id": 176208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176208, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176208, "pid": 5, "tid": 7, "ts": 1716454224172244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092113, "dur": 8, "args": { "External id": 176208, "cbid": 211, "correlation": 176208 } }, { "ph": "s", "id": 176208, "pid": 76337, "tid": -914061504, "ts": 1716454224092113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224172272, "dur": 102, "args": { "External id": 176219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176219, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176219, "pid": 5, "tid": 7, "ts": 1716454224172272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092181, "dur": 12, "args": { "External id": 176219, "cbid": 211, "correlation": 176219 } }, { "ph": "s", "id": 176219, "pid": 76337, "tid": -914061504, "ts": 1716454224092181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224092241, "dur": 0, "args": { "External id": 176229, "cbid": 317, "correlation": 176229 } }, { "ph": "f", "id": 176229, "pid": 76337, "tid": -914061504, "ts": 1716454224092241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224092242, "dur": 0, "args": { "External id": 176230, "cbid": 203, "correlation": 176230 } }, { "ph": "f", "id": 176230, "pid": 76337, "tid": -914061504, "ts": 1716454224092242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224092243, "dur": 0, "args": { "External id": 176231, "cbid": 205, "correlation": 176231 } }, { "ph": "f", "id": 176231, "pid": 76337, "tid": -914061504, "ts": 1716454224092243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224172375, "dur": 74, "args": { "External id": 176235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176235, "pid": 5, "tid": 7, "ts": 1716454224172375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092258, "dur": 12, "args": { "External id": 176235, "cbid": 211, "correlation": 176235 } }, { "ph": "s", "id": 176235, "pid": 76337, "tid": -914061504, "ts": 1716454224092258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224172450, "dur": 44, "args": { "External id": 176237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176237, "pid": 5, "tid": 7, "ts": 1716454224172450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092272, "dur": 5, "args": { "External id": 176237, "cbid": 211, "correlation": 176237 } }, { "ph": "s", "id": 176237, "pid": 76337, "tid": -914061504, "ts": 1716454224092272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224172496, "dur": 4, "args": { "External id": 176239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176239, "pid": 5, "tid": 7, "ts": 1716454224172496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092283, "dur": 7, "args": { "External id": 176239, "cbid": 211, "correlation": 176239 } }, { "ph": "s", "id": 176239, "pid": 76337, "tid": -914061504, "ts": 1716454224092283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224092294, "dur": 0, "args": { "External id": 176240, "cbid": 51, "correlation": 176240 } }, { "ph": "s", "id": 176240, "pid": 76337, "tid": -914061504, "ts": 1716454224092294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224172501, "dur": 2230, "args": { "External id": 176241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176241, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176241, "pid": 5, "tid": 7, "ts": 1716454224172501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092296, "dur": 5, "args": { "External id": 176241, "cbid": 211, "correlation": 176241 } }, { "ph": "s", "id": 176241, "pid": 76337, "tid": -914061504, "ts": 1716454224092296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224174732, "dur": 112, "args": { "External id": 176246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176246, "pid": 5, "tid": 7, "ts": 1716454224174732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092324, "dur": 8, "args": { "External id": 176246, "cbid": 211, "correlation": 176246 } }, { "ph": "s", "id": 176246, "pid": 76337, "tid": -914061504, "ts": 1716454224092324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224174846, "dur": 169, "args": { "External id": 176255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176255, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176255, "pid": 5, "tid": 7, "ts": 1716454224174846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092442, "dur": 14, "args": { "External id": 176255, "cbid": 211, "correlation": 176255 } }, { "ph": "s", "id": 176255, "pid": 76337, "tid": -914061504, "ts": 1716454224092442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224175016, "dur": 129, "args": { "External id": 176275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176275, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 176275, "pid": 5, "tid": 7, "ts": 1716454224175016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092518, "dur": 12, "args": { "External id": 176275, "cbid": 211, "correlation": 176275 } }, { "ph": "s", "id": 176275, "pid": 76337, "tid": -914061504, "ts": 1716454224092518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224175147, "dur": 4, "args": { "External id": 176287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176287, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 176287, "pid": 5, "tid": 7, "ts": 1716454224175147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092539, "dur": 7, "args": { "External id": 176287, "cbid": 211, "correlation": 176287 } }, { "ph": "s", "id": 176287, "pid": 76337, "tid": -914061504, "ts": 1716454224092539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224175152, "dur": 159, "args": { "External id": 176290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176290, "pid": 5, "tid": 7, "ts": 1716454224175152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092559, "dur": 7, "args": { "External id": 176290, "cbid": 211, "correlation": 176290 } }, { "ph": "s", "id": 176290, "pid": 76337, "tid": -914061504, "ts": 1716454224092559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224175312, "dur": 102, "args": { "External id": 176299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176299, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176299, "pid": 5, "tid": 7, "ts": 1716454224175312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092600, "dur": 9, "args": { "External id": 176299, "cbid": 211, "correlation": 176299 } }, { "ph": "s", "id": 176299, "pid": 76337, "tid": -914061504, "ts": 1716454224092600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224092657, "dur": 0, "args": { "External id": 176309, "cbid": 317, "correlation": 176309 } }, { "ph": "f", "id": 176309, "pid": 76337, "tid": -914061504, "ts": 1716454224092657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224092657, "dur": 0, "args": { "External id": 176310, "cbid": 203, "correlation": 176310 } }, { "ph": "f", "id": 176310, "pid": 76337, "tid": -914061504, "ts": 1716454224092657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224092658, "dur": 0, "args": { "External id": 176311, "cbid": 205, "correlation": 176311 } }, { "ph": "f", "id": 176311, "pid": 76337, "tid": -914061504, "ts": 1716454224092658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224175416, "dur": 110, "args": { "External id": 176315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176315, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176315, "pid": 5, "tid": 7, "ts": 1716454224175416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092674, "dur": 11, "args": { "External id": 176315, "cbid": 211, "correlation": 176315 } }, { "ph": "s", "id": 176315, "pid": 76337, "tid": -914061504, "ts": 1716454224092674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224175527, "dur": 34, "args": { "External id": 176317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176317, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176317, "pid": 5, "tid": 7, "ts": 1716454224175527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092688, "dur": 6, "args": { "External id": 176317, "cbid": 211, "correlation": 176317 } }, { "ph": "s", "id": 176317, "pid": 76337, "tid": -914061504, "ts": 1716454224092688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224175562, "dur": 4, "args": { "External id": 176319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176319, "pid": 5, "tid": 7, "ts": 1716454224175562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092699, "dur": 5, "args": { "External id": 176319, "cbid": 211, "correlation": 176319 } }, { "ph": "s", "id": 176319, "pid": 76337, "tid": -914061504, "ts": 1716454224092699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224092708, "dur": 0, "args": { "External id": 176320, "cbid": 51, "correlation": 176320 } }, { "ph": "s", "id": 176320, "pid": 76337, "tid": -914061504, "ts": 1716454224092708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224175567, "dur": 2009, "args": { "External id": 176321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176321, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176321, "pid": 5, "tid": 7, "ts": 1716454224175567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092709, "dur": 6, "args": { "External id": 176321, "cbid": 211, "correlation": 176321 } }, { "ph": "s", "id": 176321, "pid": 76337, "tid": -914061504, "ts": 1716454224092709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224177577, "dur": 58, "args": { "External id": 176326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176326, "pid": 5, "tid": 7, "ts": 1716454224177577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092739, "dur": 9, "args": { "External id": 176326, "cbid": 211, "correlation": 176326 } }, { "ph": "s", "id": 176326, "pid": 76337, "tid": -914061504, "ts": 1716454224092739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224177637, "dur": 4, "args": { "External id": 176334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176334, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176334, "pid": 5, "tid": 7, "ts": 1716454224177637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092782, "dur": 10, "args": { "External id": 176334, "cbid": 211, "correlation": 176334 } }, { "ph": "s", "id": 176334, "pid": 76337, "tid": -914061504, "ts": 1716454224092782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224092850, "dur": 1, "args": { "External id": 176350, "cbid": 251, "correlation": 176350 } }, { "ph": "f", "id": 176350, "pid": 76337, "tid": -914061504, "ts": 1716454224092850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224092855, "dur": 0, "args": { "External id": 176352, "cbid": 251, "correlation": 176352 } }, { "ph": "f", "id": 176352, "pid": 76337, "tid": -914061504, "ts": 1716454224092855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224177643, "dur": 11, "args": { "External id": 176353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176353, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 176353, "pid": 5, "tid": 7, "ts": 1716454224177643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092857, "dur": 11, "args": { "External id": 176353, "cbid": 211, "correlation": 176353 } }, { "ph": "s", "id": 176353, "pid": 76337, "tid": -914061504, "ts": 1716454224092857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224177655, "dur": 5, "args": { "External id": 176355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176355, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 176355, "pid": 5, "tid": 7, "ts": 1716454224177655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092870, "dur": 5, "args": { "External id": 176355, "cbid": 211, "correlation": 176355 } }, { "ph": "s", "id": 176355, "pid": 76337, "tid": -914061504, "ts": 1716454224092870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224177662, "dur": 55, "args": { "External id": 176365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176365, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176365, "pid": 5, "tid": 7, "ts": 1716454224177662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224092927, "dur": 13, "args": { "External id": 176365, "cbid": 211, "correlation": 176365 } }, { "ph": "s", "id": 176365, "pid": 76337, "tid": -914061504, "ts": 1716454224092927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224177718, "dur": 53, "args": { "External id": 176385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176385, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 176385, "pid": 5, "tid": 7, "ts": 1716454224177718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093009, "dur": 12, "args": { "External id": 176385, "cbid": 211, "correlation": 176385 } }, { "ph": "s", "id": 176385, "pid": 76337, "tid": -914061504, "ts": 1716454224093009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224177772, "dur": 4, "args": { "External id": 176397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176397, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176397, "pid": 5, "tid": 7, "ts": 1716454224177772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093030, "dur": 6, "args": { "External id": 176397, "cbid": 211, "correlation": 176397 } }, { "ph": "s", "id": 176397, "pid": 76337, "tid": -914061504, "ts": 1716454224093030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224177777, "dur": 55, "args": { "External id": 176400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176400, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176400, "pid": 5, "tid": 7, "ts": 1716454224177777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093049, "dur": 6, "args": { "External id": 176400, "cbid": 211, "correlation": 176400 } }, { "ph": "s", "id": 176400, "pid": 76337, "tid": -914061504, "ts": 1716454224093049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224177834, "dur": 36, "args": { "External id": 176409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176409, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176409, "pid": 5, "tid": 7, "ts": 1716454224177834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093091, "dur": 10, "args": { "External id": 176409, "cbid": 211, "correlation": 176409 } }, { "ph": "s", "id": 176409, "pid": 76337, "tid": -914061504, "ts": 1716454224093091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224093157, "dur": 0, "args": { "External id": 176419, "cbid": 317, "correlation": 176419 } }, { "ph": "f", "id": 176419, "pid": 76337, "tid": -914061504, "ts": 1716454224093157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224093158, "dur": 0, "args": { "External id": 176420, "cbid": 203, "correlation": 176420 } }, { "ph": "f", "id": 176420, "pid": 76337, "tid": -914061504, "ts": 1716454224093158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224093159, "dur": 0, "args": { "External id": 176421, "cbid": 205, "correlation": 176421 } }, { "ph": "f", "id": 176421, "pid": 76337, "tid": -914061504, "ts": 1716454224093159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224177871, "dur": 40, "args": { "External id": 176425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176425, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176425, "pid": 5, "tid": 7, "ts": 1716454224177871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093174, "dur": 12, "args": { "External id": 176425, "cbid": 211, "correlation": 176425 } }, { "ph": "s", "id": 176425, "pid": 76337, "tid": -914061504, "ts": 1716454224093174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224177912, "dur": 14, "args": { "External id": 176427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176427, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176427, "pid": 5, "tid": 7, "ts": 1716454224177912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093189, "dur": 5, "args": { "External id": 176427, "cbid": 211, "correlation": 176427 } }, { "ph": "s", "id": 176427, "pid": 76337, "tid": -914061504, "ts": 1716454224093189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224177928, "dur": 4, "args": { "External id": 176429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176429, "pid": 5, "tid": 7, "ts": 1716454224177928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093199, "dur": 5, "args": { "External id": 176429, "cbid": 211, "correlation": 176429 } }, { "ph": "s", "id": 176429, "pid": 76337, "tid": -914061504, "ts": 1716454224093199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224093208, "dur": 0, "args": { "External id": 176430, "cbid": 51, "correlation": 176430 } }, { "ph": "s", "id": 176430, "pid": 76337, "tid": -914061504, "ts": 1716454224093208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224177932, "dur": 698, "args": { "External id": 176431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176431, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176431, "pid": 5, "tid": 7, "ts": 1716454224177932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093208, "dur": 5, "args": { "External id": 176431, "cbid": 211, "correlation": 176431 } }, { "ph": "s", "id": 176431, "pid": 76337, "tid": -914061504, "ts": 1716454224093208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224178632, "dur": 60, "args": { "External id": 176436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176436, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176436, "pid": 5, "tid": 7, "ts": 1716454224178632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093236, "dur": 9, "args": { "External id": 176436, "cbid": 211, "correlation": 176436 } }, { "ph": "s", "id": 176436, "pid": 76337, "tid": -914061504, "ts": 1716454224093236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224093297, "dur": 0, "args": { "External id": 176446, "cbid": 317, "correlation": 176446 } }, { "ph": "f", "id": 176446, "pid": 76337, "tid": -914061504, "ts": 1716454224093297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224093298, "dur": 0, "args": { "External id": 176447, "cbid": 203, "correlation": 176447 } }, { "ph": "f", "id": 176447, "pid": 76337, "tid": -914061504, "ts": 1716454224093298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224093299, "dur": 0, "args": { "External id": 176448, "cbid": 205, "correlation": 176448 } }, { "ph": "f", "id": 176448, "pid": 76337, "tid": -914061504, "ts": 1716454224093299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224178693, "dur": 4, "args": { "External id": 176452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176452, "pid": 5, "tid": 7, "ts": 1716454224178693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093314, "dur": 12, "args": { "External id": 176452, "cbid": 211, "correlation": 176452 } }, { "ph": "s", "id": 176452, "pid": 76337, "tid": -914061504, "ts": 1716454224093314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224093330, "dur": 0, "args": { "External id": 176453, "cbid": 51, "correlation": 176453 } }, { "ph": "s", "id": 176453, "pid": 76337, "tid": -914061504, "ts": 1716454224093330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454224178698, "dur": 266, "args": { "External id": 176454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176454, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176454, "pid": 5, "tid": 7, "ts": 1716454224178698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093331, "dur": 7, "args": { "External id": 176454, "cbid": 211, "correlation": 176454 } }, { "ph": "s", "id": 176454, "pid": 76337, "tid": -914061504, "ts": 1716454224093331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224178965, "dur": 59, "args": { "External id": 176459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176459, "pid": 5, "tid": 7, "ts": 1716454224178965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093359, "dur": 8, "args": { "External id": 176459, "cbid": 211, "correlation": 176459 } }, { "ph": "s", "id": 176459, "pid": 76337, "tid": -914061504, "ts": 1716454224093359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224179025, "dur": 50, "args": { "External id": 176467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176467, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176467, "pid": 5, "tid": 7, "ts": 1716454224179025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093387, "dur": 8, "args": { "External id": 176467, "cbid": 211, "correlation": 176467 } }, { "ph": "s", "id": 176467, "pid": 76337, "tid": -914061504, "ts": 1716454224093387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224179076, "dur": 36, "args": { "External id": 176475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176475, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176475, "pid": 5, "tid": 7, "ts": 1716454224179076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093416, "dur": 9, "args": { "External id": 176475, "cbid": 211, "correlation": 176475 } }, { "ph": "s", "id": 176475, "pid": 76337, "tid": -914061504, "ts": 1716454224093416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224179113, "dur": 52, "args": { "External id": 176495, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176495, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 176495, "pid": 5, "tid": 7, "ts": 1716454224179113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093504, "dur": 13, "args": { "External id": 176495, "cbid": 211, "correlation": 176495 } }, { "ph": "s", "id": 176495, "pid": 76337, "tid": -914061504, "ts": 1716454224093504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224179166, "dur": 4, "args": { "External id": 176507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176507, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 176507, "pid": 5, "tid": 7, "ts": 1716454224179166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093526, "dur": 6, "args": { "External id": 176507, "cbid": 211, "correlation": 176507 } }, { "ph": "s", "id": 176507, "pid": 76337, "tid": -914061504, "ts": 1716454224093526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224179171, "dur": 55, "args": { "External id": 176510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176510, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176510, "pid": 5, "tid": 7, "ts": 1716454224179171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093544, "dur": 7, "args": { "External id": 176510, "cbid": 211, "correlation": 176510 } }, { "ph": "s", "id": 176510, "pid": 76337, "tid": -914061504, "ts": 1716454224093544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224093604, "dur": 0, "args": { "External id": 176521, "cbid": 317, "correlation": 176521 } }, { "ph": "f", "id": 176521, "pid": 76337, "tid": -914061504, "ts": 1716454224093604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224093605, "dur": 0, "args": { "External id": 176522, "cbid": 203, "correlation": 176522 } }, { "ph": "f", "id": 176522, "pid": 76337, "tid": -914061504, "ts": 1716454224093605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224093606, "dur": 0, "args": { "External id": 176523, "cbid": 205, "correlation": 176523 } }, { "ph": "f", "id": 176523, "pid": 76337, "tid": -914061504, "ts": 1716454224093606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093636, "dur": 2, "args": { "External id": 176527, "cbid": 251, "correlation": 176527 } }, { "ph": "f", "id": 176527, "pid": 76337, "tid": -914061504, "ts": 1716454224093636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093640, "dur": 1, "args": { "External id": 176528, "cbid": 251, "correlation": 176528 } }, { "ph": "f", "id": 176528, "pid": 76337, "tid": -914061504, "ts": 1716454224093640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093642, "dur": 1, "args": { "External id": 176529, "cbid": 251, "correlation": 176529 } }, { "ph": "f", "id": 176529, "pid": 76337, "tid": -914061504, "ts": 1716454224093642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093644, "dur": 1, "args": { "External id": 176530, "cbid": 251, "correlation": 176530 } }, { "ph": "f", "id": 176530, "pid": 76337, "tid": -914061504, "ts": 1716454224093644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093646, "dur": 1, "args": { "External id": 176531, "cbid": 251, "correlation": 176531 } }, { "ph": "f", "id": 176531, "pid": 76337, "tid": -914061504, "ts": 1716454224093646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093648, "dur": 1, "args": { "External id": 176532, "cbid": 251, "correlation": 176532 } }, { "ph": "f", "id": 176532, "pid": 76337, "tid": -914061504, "ts": 1716454224093648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093650, "dur": 1, "args": { "External id": 176533, "cbid": 251, "correlation": 176533 } }, { "ph": "f", "id": 176533, "pid": 76337, "tid": -914061504, "ts": 1716454224093650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093652, "dur": 1, "args": { "External id": 176534, "cbid": 251, "correlation": 176534 } }, { "ph": "f", "id": 176534, "pid": 76337, "tid": -914061504, "ts": 1716454224093652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093654, "dur": 0, "args": { "External id": 176535, "cbid": 251, "correlation": 176535 } }, { "ph": "f", "id": 176535, "pid": 76337, "tid": -914061504, "ts": 1716454224093654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224179227, "dur": 114, "args": { "External id": 176536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176536, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176536, "pid": 5, "tid": 7, "ts": 1716454224179227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093657, "dur": 13, "args": { "External id": 176536, "cbid": 211, "correlation": 176536 } }, { "ph": "s", "id": 176536, "pid": 76337, "tid": -914061504, "ts": 1716454224093657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224179342, "dur": 60, "args": { "External id": 176542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176542, "pid": 5, "tid": 7, "ts": 1716454224179342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093693, "dur": 10, "args": { "External id": 176542, "cbid": 211, "correlation": 176542 } }, { "ph": "s", "id": 176542, "pid": 76337, "tid": -914061504, "ts": 1716454224093693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224179403, "dur": 587, "args": { "External id": 176551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176551, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176551, "pid": 5, "tid": 7, "ts": 1716454224179403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093780, "dur": 14, "args": { "External id": 176551, "cbid": 211, "correlation": 176551 } }, { "ph": "s", "id": 176551, "pid": 76337, "tid": -914061504, "ts": 1716454224093780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224179991, "dur": 181, "args": { "External id": 176573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176573, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176573, "pid": 5, "tid": 7, "ts": 1716454224179991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093842, "dur": 12, "args": { "External id": 176573, "cbid": 211, "correlation": 176573 } }, { "ph": "s", "id": 176573, "pid": 76337, "tid": -914061504, "ts": 1716454224093842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224093935, "dur": 1, "args": { "External id": 176584, "cbid": 251, "correlation": 176584 } }, { "ph": "f", "id": 176584, "pid": 76337, "tid": -914061504, "ts": 1716454224093935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224180174, "dur": 197, "args": { "External id": 176585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176585, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176585, "pid": 5, "tid": 7, "ts": 1716454224180174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224093940, "dur": 14, "args": { "External id": 176585, "cbid": 211, "correlation": 176585 } }, { "ph": "s", "id": 176585, "pid": 76337, "tid": -914061504, "ts": 1716454224093940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094022, "dur": 1, "args": { "External id": 176596, "cbid": 251, "correlation": 176596 } }, { "ph": "f", "id": 176596, "pid": 76337, "tid": -914061504, "ts": 1716454224094022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224180372, "dur": 187, "args": { "External id": 176597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176597, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176597, "pid": 5, "tid": 7, "ts": 1716454224180372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094027, "dur": 12, "args": { "External id": 176597, "cbid": 211, "correlation": 176597 } }, { "ph": "s", "id": 176597, "pid": 76337, "tid": -914061504, "ts": 1716454224094027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094095, "dur": 1, "args": { "External id": 176608, "cbid": 251, "correlation": 176608 } }, { "ph": "f", "id": 176608, "pid": 76337, "tid": -914061504, "ts": 1716454224094095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224180561, "dur": 185, "args": { "External id": 176609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176609, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176609, "pid": 5, "tid": 7, "ts": 1716454224180561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094100, "dur": 12, "args": { "External id": 176609, "cbid": 211, "correlation": 176609 } }, { "ph": "s", "id": 176609, "pid": 76337, "tid": -914061504, "ts": 1716454224094100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224180747, "dur": 18572, "args": { "External id": 176630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176630, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176630, "pid": 5, "tid": 7, "ts": 1716454224180747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094183, "dur": 14, "args": { "External id": 176630, "cbid": 211, "correlation": 176630 } }, { "ph": "s", "id": 176630, "pid": 76337, "tid": -914061504, "ts": 1716454224094183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094282, "dur": 1, "args": { "External id": 176648, "cbid": 251, "correlation": 176648 } }, { "ph": "f", "id": 176648, "pid": 76337, "tid": -914061504, "ts": 1716454224094282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224199321, "dur": 207, "args": { "External id": 176650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176650, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176650, "pid": 5, "tid": 7, "ts": 1716454224199321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094288, "dur": 14, "args": { "External id": 176650, "cbid": 211, "correlation": 176650 } }, { "ph": "s", "id": 176650, "pid": 76337, "tid": -914061504, "ts": 1716454224094288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224199529, "dur": 66, "args": { "External id": 176658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176658, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176658, "pid": 5, "tid": 7, "ts": 1716454224199529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094363, "dur": 12, "args": { "External id": 176658, "cbid": 211, "correlation": 176658 } }, { "ph": "s", "id": 176658, "pid": 76337, "tid": -914061504, "ts": 1716454224094363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224199597, "dur": 96, "args": { "External id": 176666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176666, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176666, "pid": 5, "tid": 7, "ts": 1716454224199597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094404, "dur": 8, "args": { "External id": 176666, "cbid": 211, "correlation": 176666 } }, { "ph": "s", "id": 176666, "pid": 76337, "tid": -914061504, "ts": 1716454224094404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224199695, "dur": 54, "args": { "External id": 176677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176677, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176677, "pid": 5, "tid": 7, "ts": 1716454224199695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094479, "dur": 13, "args": { "External id": 176677, "cbid": 211, "correlation": 176677 } }, { "ph": "s", "id": 176677, "pid": 76337, "tid": -914061504, "ts": 1716454224094479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224199750, "dur": 93, "args": { "External id": 176699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176699, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176699, "pid": 5, "tid": 7, "ts": 1716454224199750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094512, "dur": 8, "args": { "External id": 176699, "cbid": 211, "correlation": 176699 } }, { "ph": "s", "id": 176699, "pid": 76337, "tid": -914061504, "ts": 1716454224094512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094598, "dur": 1, "args": { "External id": 176710, "cbid": 251, "correlation": 176710 } }, { "ph": "f", "id": 176710, "pid": 76337, "tid": -914061504, "ts": 1716454224094598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224199845, "dur": 103, "args": { "External id": 176711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176711, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176711, "pid": 5, "tid": 7, "ts": 1716454224199845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094604, "dur": 13, "args": { "External id": 176711, "cbid": 211, "correlation": 176711 } }, { "ph": "s", "id": 176711, "pid": 76337, "tid": -914061504, "ts": 1716454224094604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094681, "dur": 1, "args": { "External id": 176722, "cbid": 251, "correlation": 176722 } }, { "ph": "f", "id": 176722, "pid": 76337, "tid": -914061504, "ts": 1716454224094681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094686, "dur": 0, "args": { "External id": 176723, "cbid": 251, "correlation": 176723 } }, { "ph": "f", "id": 176723, "pid": 76337, "tid": -914061504, "ts": 1716454224094686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224199950, "dur": 10, "args": { "External id": 176724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176724, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 176724, "pid": 5, "tid": 7, "ts": 1716454224199950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094688, "dur": 14, "args": { "External id": 176724, "cbid": 211, "correlation": 176724 } }, { "ph": "s", "id": 176724, "pid": 76337, "tid": -914061504, "ts": 1716454224094688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224199961, "dur": 5, "args": { "External id": 176726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176726, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 176726, "pid": 5, "tid": 7, "ts": 1716454224199961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094706, "dur": 8, "args": { "External id": 176726, "cbid": 211, "correlation": 176726 } }, { "ph": "s", "id": 176726, "pid": 76337, "tid": -914061504, "ts": 1716454224094706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094771, "dur": 1, "args": { "External id": 176737, "cbid": 251, "correlation": 176737 } }, { "ph": "f", "id": 176737, "pid": 76337, "tid": -914061504, "ts": 1716454224094771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094774, "dur": 0, "args": { "External id": 176738, "cbid": 251, "correlation": 176738 } }, { "ph": "f", "id": 176738, "pid": 76337, "tid": -914061504, "ts": 1716454224094774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224199968, "dur": 6, "args": { "External id": 176739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176739, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 176739, "pid": 5, "tid": 7, "ts": 1716454224199968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094776, "dur": 12, "args": { "External id": 176739, "cbid": 211, "correlation": 176739 } }, { "ph": "s", "id": 176739, "pid": 76337, "tid": -914061504, "ts": 1716454224094776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224199975, "dur": 3, "args": { "External id": 176741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176741, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 176741, "pid": 5, "tid": 7, "ts": 1716454224199975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094789, "dur": 5, "args": { "External id": 176741, "cbid": 211, "correlation": 176741 } }, { "ph": "s", "id": 176741, "pid": 76337, "tid": -914061504, "ts": 1716454224094789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224199980, "dur": 156, "args": { "External id": 176762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176762, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176762, "pid": 5, "tid": 7, "ts": 1716454224199980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094863, "dur": 12, "args": { "External id": 176762, "cbid": 211, "correlation": 176762 } }, { "ph": "s", "id": 176762, "pid": 76337, "tid": -914061504, "ts": 1716454224094863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224094964, "dur": 2, "args": { "External id": 176780, "cbid": 251, "correlation": 176780 } }, { "ph": "f", "id": 176780, "pid": 76337, "tid": -914061504, "ts": 1716454224094964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224200138, "dur": 109, "args": { "External id": 176782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176782, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176782, "pid": 5, "tid": 7, "ts": 1716454224200138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224094970, "dur": 21, "args": { "External id": 176782, "cbid": 211, "correlation": 176782 } }, { "ph": "s", "id": 176782, "pid": 76337, "tid": -914061504, "ts": 1716454224094970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224200248, "dur": 34, "args": { "External id": 176790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176790, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176790, "pid": 5, "tid": 7, "ts": 1716454224200248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095052, "dur": 12, "args": { "External id": 176790, "cbid": 211, "correlation": 176790 } }, { "ph": "s", "id": 176790, "pid": 76337, "tid": -914061504, "ts": 1716454224095052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224200283, "dur": 68, "args": { "External id": 176798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176798, "pid": 5, "tid": 7, "ts": 1716454224200283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095093, "dur": 10, "args": { "External id": 176798, "cbid": 211, "correlation": 176798 } }, { "ph": "s", "id": 176798, "pid": 76337, "tid": -914061504, "ts": 1716454224095093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224200353, "dur": 93, "args": { "External id": 176820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176820, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176820, "pid": 5, "tid": 7, "ts": 1716454224200353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095147, "dur": 11, "args": { "External id": 176820, "cbid": 211, "correlation": 176820 } }, { "ph": "s", "id": 176820, "pid": 76337, "tid": -914061504, "ts": 1716454224095147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095239, "dur": 1, "args": { "External id": 176836, "cbid": 251, "correlation": 176836 } }, { "ph": "f", "id": 176836, "pid": 76337, "tid": -914061504, "ts": 1716454224095239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224200446, "dur": 575, "args": { "External id": 176838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176838, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 176838, "pid": 5, "tid": 7, "ts": 1716454224200446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095245, "dur": 13, "args": { "External id": 176838, "cbid": 211, "correlation": 176838 } }, { "ph": "s", "id": 176838, "pid": 76337, "tid": -914061504, "ts": 1716454224095245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224201023, "dur": 244, "args": { "External id": 176846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176846, "pid": 5, "tid": 7, "ts": 1716454224201023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095311, "dur": 13, "args": { "External id": 176846, "cbid": 211, "correlation": 176846 } }, { "ph": "s", "id": 176846, "pid": 76337, "tid": -914061504, "ts": 1716454224095311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224201269, "dur": 253, "args": { "External id": 176854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176854, "pid": 5, "tid": 7, "ts": 1716454224201269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095342, "dur": 9, "args": { "External id": 176854, "cbid": 211, "correlation": 176854 } }, { "ph": "s", "id": 176854, "pid": 76337, "tid": -914061504, "ts": 1716454224095342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095429, "dur": 1, "args": { "External id": 176870, "cbid": 251, "correlation": 176870 } }, { "ph": "f", "id": 176870, "pid": 76337, "tid": -914061504, "ts": 1716454224095429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095434, "dur": 0, "args": { "External id": 176872, "cbid": 251, "correlation": 176872 } }, { "ph": "f", "id": 176872, "pid": 76337, "tid": -914061504, "ts": 1716454224095434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224201523, "dur": 357, "args": { "External id": 176873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176873, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 176873, "pid": 5, "tid": 7, "ts": 1716454224201523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095437, "dur": 13, "args": { "External id": 176873, "cbid": 211, "correlation": 176873 } }, { "ph": "s", "id": 176873, "pid": 76337, "tid": -914061504, "ts": 1716454224095437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224201881, "dur": 50, "args": { "External id": 176881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176881, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176881, "pid": 5, "tid": 7, "ts": 1716454224201881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095479, "dur": 10, "args": { "External id": 176881, "cbid": 211, "correlation": 176881 } }, { "ph": "s", "id": 176881, "pid": 76337, "tid": -914061504, "ts": 1716454224095479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224201933, "dur": 160, "args": { "External id": 176892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176892, "pid": 5, "tid": 7, "ts": 1716454224201933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095546, "dur": 13, "args": { "External id": 176892, "cbid": 211, "correlation": 176892 } }, { "ph": "s", "id": 176892, "pid": 76337, "tid": -914061504, "ts": 1716454224095546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224095613, "dur": 0, "args": { "External id": 176904, "cbid": 317, "correlation": 176904 } }, { "ph": "f", "id": 176904, "pid": 76337, "tid": -914061504, "ts": 1716454224095613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224095614, "dur": 0, "args": { "External id": 176905, "cbid": 203, "correlation": 176905 } }, { "ph": "f", "id": 176905, "pid": 76337, "tid": -914061504, "ts": 1716454224095614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224095615, "dur": 0, "args": { "External id": 176906, "cbid": 205, "correlation": 176906 } }, { "ph": "f", "id": 176906, "pid": 76337, "tid": -914061504, "ts": 1716454224095615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095639, "dur": 1, "args": { "External id": 176910, "cbid": 251, "correlation": 176910 } }, { "ph": "f", "id": 176910, "pid": 76337, "tid": -914061504, "ts": 1716454224095639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095641, "dur": 0, "args": { "External id": 176911, "cbid": 251, "correlation": 176911 } }, { "ph": "f", "id": 176911, "pid": 76337, "tid": -914061504, "ts": 1716454224095641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095641, "dur": 0, "args": { "External id": 176912, "cbid": 251, "correlation": 176912 } }, { "ph": "f", "id": 176912, "pid": 76337, "tid": -914061504, "ts": 1716454224095641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095643, "dur": 0, "args": { "External id": 176913, "cbid": 251, "correlation": 176913 } }, { "ph": "f", "id": 176913, "pid": 76337, "tid": -914061504, "ts": 1716454224095643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095644, "dur": 0, "args": { "External id": 176914, "cbid": 251, "correlation": 176914 } }, { "ph": "f", "id": 176914, "pid": 76337, "tid": -914061504, "ts": 1716454224095644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095645, "dur": 0, "args": { "External id": 176915, "cbid": 251, "correlation": 176915 } }, { "ph": "f", "id": 176915, "pid": 76337, "tid": -914061504, "ts": 1716454224095645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095646, "dur": 0, "args": { "External id": 176916, "cbid": 251, "correlation": 176916 } }, { "ph": "f", "id": 176916, "pid": 76337, "tid": -914061504, "ts": 1716454224095646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095647, "dur": 0, "args": { "External id": 176917, "cbid": 251, "correlation": 176917 } }, { "ph": "f", "id": 176917, "pid": 76337, "tid": -914061504, "ts": 1716454224095647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224095649, "dur": 0, "args": { "External id": 176918, "cbid": 251, "correlation": 176918 } }, { "ph": "f", "id": 176918, "pid": 76337, "tid": -914061504, "ts": 1716454224095649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224202094, "dur": 115, "args": { "External id": 176919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176919, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 176919, "pid": 5, "tid": 7, "ts": 1716454224202094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095651, "dur": 12, "args": { "External id": 176919, "cbid": 211, "correlation": 176919 } }, { "ph": "s", "id": 176919, "pid": 76337, "tid": -914061504, "ts": 1716454224095651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224202210, "dur": 60, "args": { "External id": 176925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176925, "pid": 5, "tid": 7, "ts": 1716454224202210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095685, "dur": 9, "args": { "External id": 176925, "cbid": 211, "correlation": 176925 } }, { "ph": "s", "id": 176925, "pid": 76337, "tid": -914061504, "ts": 1716454224095685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224202271, "dur": 50, "args": { "External id": 176933, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176933, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176933, "pid": 5, "tid": 7, "ts": 1716454224202271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095716, "dur": 8, "args": { "External id": 176933, "cbid": 211, "correlation": 176933 } }, { "ph": "s", "id": 176933, "pid": 76337, "tid": -914061504, "ts": 1716454224095716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224202322, "dur": 98, "args": { "External id": 176942, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176942, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176942, "pid": 5, "tid": 7, "ts": 1716454224202322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095755, "dur": 11, "args": { "External id": 176942, "cbid": 211, "correlation": 176942 } }, { "ph": "s", "id": 176942, "pid": 76337, "tid": -914061504, "ts": 1716454224095755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224202421, "dur": 92, "args": { "External id": 176962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176962, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 176962, "pid": 5, "tid": 7, "ts": 1716454224202421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095832, "dur": 12, "args": { "External id": 176962, "cbid": 211, "correlation": 176962 } }, { "ph": "s", "id": 176962, "pid": 76337, "tid": -914061504, "ts": 1716454224095832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224202514, "dur": 4, "args": { "External id": 176974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176974, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 176974, "pid": 5, "tid": 7, "ts": 1716454224202514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095855, "dur": 7, "args": { "External id": 176974, "cbid": 211, "correlation": 176974 } }, { "ph": "s", "id": 176974, "pid": 76337, "tid": -914061504, "ts": 1716454224095855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224202520, "dur": 108, "args": { "External id": 176977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176977, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176977, "pid": 5, "tid": 7, "ts": 1716454224202520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095874, "dur": 7, "args": { "External id": 176977, "cbid": 211, "correlation": 176977 } }, { "ph": "s", "id": 176977, "pid": 76337, "tid": -914061504, "ts": 1716454224095874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224202630, "dur": 69, "args": { "External id": 176986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 176986, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 176986, "pid": 5, "tid": 7, "ts": 1716454224202630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095914, "dur": 9, "args": { "External id": 176986, "cbid": 211, "correlation": 176986 } }, { "ph": "s", "id": 176986, "pid": 76337, "tid": -914061504, "ts": 1716454224095914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224095967, "dur": 0, "args": { "External id": 176996, "cbid": 317, "correlation": 176996 } }, { "ph": "f", "id": 176996, "pid": 76337, "tid": -914061504, "ts": 1716454224095967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224095968, "dur": 0, "args": { "External id": 176997, "cbid": 203, "correlation": 176997 } }, { "ph": "f", "id": 176997, "pid": 76337, "tid": -914061504, "ts": 1716454224095968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224095969, "dur": 0, "args": { "External id": 176998, "cbid": 205, "correlation": 176998 } }, { "ph": "f", "id": 176998, "pid": 76337, "tid": -914061504, "ts": 1716454224095969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224202700, "dur": 76, "args": { "External id": 177002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177002, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177002, "pid": 5, "tid": 7, "ts": 1716454224202700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224095996, "dur": 13, "args": { "External id": 177002, "cbid": 211, "correlation": 177002 } }, { "ph": "s", "id": 177002, "pid": 76337, "tid": -914061504, "ts": 1716454224095996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224202777, "dur": 24, "args": { "External id": 177004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177004, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177004, "pid": 5, "tid": 7, "ts": 1716454224202777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096012, "dur": 5, "args": { "External id": 177004, "cbid": 211, "correlation": 177004 } }, { "ph": "s", "id": 177004, "pid": 76337, "tid": -914061504, "ts": 1716454224096012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224202803, "dur": 4, "args": { "External id": 177006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177006, "pid": 5, "tid": 7, "ts": 1716454224202803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096021, "dur": 6, "args": { "External id": 177006, "cbid": 211, "correlation": 177006 } }, { "ph": "s", "id": 177006, "pid": 76337, "tid": -914061504, "ts": 1716454224096021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224096030, "dur": 0, "args": { "External id": 177007, "cbid": 51, "correlation": 177007 } }, { "ph": "s", "id": 177007, "pid": 76337, "tid": -914061504, "ts": 1716454224096030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224202809, "dur": 1371, "args": { "External id": 177008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177008, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177008, "pid": 5, "tid": 7, "ts": 1716454224202809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096031, "dur": 5, "args": { "External id": 177008, "cbid": 211, "correlation": 177008 } }, { "ph": "s", "id": 177008, "pid": 76337, "tid": -914061504, "ts": 1716454224096031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224204181, "dur": 59, "args": { "External id": 177013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177013, "pid": 5, "tid": 7, "ts": 1716454224204181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096059, "dur": 9, "args": { "External id": 177013, "cbid": 211, "correlation": 177013 } }, { "ph": "s", "id": 177013, "pid": 76337, "tid": -914061504, "ts": 1716454224096059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224204241, "dur": 4, "args": { "External id": 177021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177021, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177021, "pid": 5, "tid": 7, "ts": 1716454224204241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096105, "dur": 9, "args": { "External id": 177021, "cbid": 211, "correlation": 177021 } }, { "ph": "s", "id": 177021, "pid": 76337, "tid": -914061504, "ts": 1716454224096105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096173, "dur": 1, "args": { "External id": 177037, "cbid": 251, "correlation": 177037 } }, { "ph": "f", "id": 177037, "pid": 76337, "tid": -914061504, "ts": 1716454224096173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096178, "dur": 0, "args": { "External id": 177039, "cbid": 251, "correlation": 177039 } }, { "ph": "f", "id": 177039, "pid": 76337, "tid": -914061504, "ts": 1716454224096178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224204247, "dur": 11, "args": { "External id": 177040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177040, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 177040, "pid": 5, "tid": 7, "ts": 1716454224204247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096180, "dur": 12, "args": { "External id": 177040, "cbid": 211, "correlation": 177040 } }, { "ph": "s", "id": 177040, "pid": 76337, "tid": -914061504, "ts": 1716454224096180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224204259, "dur": 5, "args": { "External id": 177042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177042, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 177042, "pid": 5, "tid": 7, "ts": 1716454224204259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096193, "dur": 5, "args": { "External id": 177042, "cbid": 211, "correlation": 177042 } }, { "ph": "s", "id": 177042, "pid": 76337, "tid": -914061504, "ts": 1716454224096193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224204266, "dur": 55, "args": { "External id": 177052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177052, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177052, "pid": 5, "tid": 7, "ts": 1716454224204266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096250, "dur": 12, "args": { "External id": 177052, "cbid": 211, "correlation": 177052 } }, { "ph": "s", "id": 177052, "pid": 76337, "tid": -914061504, "ts": 1716454224096250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224204322, "dur": 51, "args": { "External id": 177072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177072, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 177072, "pid": 5, "tid": 7, "ts": 1716454224204322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096319, "dur": 11, "args": { "External id": 177072, "cbid": 211, "correlation": 177072 } }, { "ph": "s", "id": 177072, "pid": 76337, "tid": -914061504, "ts": 1716454224096319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224204375, "dur": 4, "args": { "External id": 177084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177084, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177084, "pid": 5, "tid": 7, "ts": 1716454224204375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096340, "dur": 6, "args": { "External id": 177084, "cbid": 211, "correlation": 177084 } }, { "ph": "s", "id": 177084, "pid": 76337, "tid": -914061504, "ts": 1716454224096340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224204380, "dur": 55, "args": { "External id": 177087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177087, "pid": 5, "tid": 7, "ts": 1716454224204380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096359, "dur": 6, "args": { "External id": 177087, "cbid": 211, "correlation": 177087 } }, { "ph": "s", "id": 177087, "pid": 76337, "tid": -914061504, "ts": 1716454224096359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224204436, "dur": 36, "args": { "External id": 177096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177096, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177096, "pid": 5, "tid": 7, "ts": 1716454224204436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096398, "dur": 11, "args": { "External id": 177096, "cbid": 211, "correlation": 177096 } }, { "ph": "s", "id": 177096, "pid": 76337, "tid": -914061504, "ts": 1716454224096398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224096465, "dur": 0, "args": { "External id": 177106, "cbid": 317, "correlation": 177106 } }, { "ph": "f", "id": 177106, "pid": 76337, "tid": -914061504, "ts": 1716454224096465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224096466, "dur": 0, "args": { "External id": 177107, "cbid": 203, "correlation": 177107 } }, { "ph": "f", "id": 177107, "pid": 76337, "tid": -914061504, "ts": 1716454224096466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224096467, "dur": 0, "args": { "External id": 177108, "cbid": 205, "correlation": 177108 } }, { "ph": "f", "id": 177108, "pid": 76337, "tid": -914061504, "ts": 1716454224096467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224204474, "dur": 40, "args": { "External id": 177112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177112, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177112, "pid": 5, "tid": 7, "ts": 1716454224204474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096481, "dur": 12, "args": { "External id": 177112, "cbid": 211, "correlation": 177112 } }, { "ph": "s", "id": 177112, "pid": 76337, "tid": -914061504, "ts": 1716454224096481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224204515, "dur": 14, "args": { "External id": 177114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177114, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177114, "pid": 5, "tid": 7, "ts": 1716454224204515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096496, "dur": 5, "args": { "External id": 177114, "cbid": 211, "correlation": 177114 } }, { "ph": "s", "id": 177114, "pid": 76337, "tid": -914061504, "ts": 1716454224096496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224204531, "dur": 3, "args": { "External id": 177116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177116, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177116, "pid": 5, "tid": 7, "ts": 1716454224204531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096505, "dur": 5, "args": { "External id": 177116, "cbid": 211, "correlation": 177116 } }, { "ph": "s", "id": 177116, "pid": 76337, "tid": -914061504, "ts": 1716454224096505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224096513, "dur": 0, "args": { "External id": 177117, "cbid": 51, "correlation": 177117 } }, { "ph": "s", "id": 177117, "pid": 76337, "tid": -914061504, "ts": 1716454224096513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224204535, "dur": 702, "args": { "External id": 177118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177118, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177118, "pid": 5, "tid": 7, "ts": 1716454224204535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096514, "dur": 5, "args": { "External id": 177118, "cbid": 211, "correlation": 177118 } }, { "ph": "s", "id": 177118, "pid": 76337, "tid": -914061504, "ts": 1716454224096514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224205238, "dur": 60, "args": { "External id": 177123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177123, "pid": 5, "tid": 7, "ts": 1716454224205238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096541, "dur": 8, "args": { "External id": 177123, "cbid": 211, "correlation": 177123 } }, { "ph": "s", "id": 177123, "pid": 76337, "tid": -914061504, "ts": 1716454224096541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224096598, "dur": 0, "args": { "External id": 177133, "cbid": 317, "correlation": 177133 } }, { "ph": "f", "id": 177133, "pid": 76337, "tid": -914061504, "ts": 1716454224096598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224096599, "dur": 0, "args": { "External id": 177134, "cbid": 203, "correlation": 177134 } }, { "ph": "f", "id": 177134, "pid": 76337, "tid": -914061504, "ts": 1716454224096599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224096599, "dur": 0, "args": { "External id": 177135, "cbid": 205, "correlation": 177135 } }, { "ph": "f", "id": 177135, "pid": 76337, "tid": -914061504, "ts": 1716454224096599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224205299, "dur": 75, "args": { "External id": 177139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177139, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177139, "pid": 5, "tid": 7, "ts": 1716454224205299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096613, "dur": 11, "args": { "External id": 177139, "cbid": 211, "correlation": 177139 } }, { "ph": "s", "id": 177139, "pid": 76337, "tid": -914061504, "ts": 1716454224096613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224205375, "dur": 209, "args": { "External id": 177141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177141, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177141, "pid": 5, "tid": 7, "ts": 1716454224205375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096632, "dur": 8, "args": { "External id": 177141, "cbid": 211, "correlation": 177141 } }, { "ph": "s", "id": 177141, "pid": 76337, "tid": -914061504, "ts": 1716454224096632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224205586, "dur": 39, "args": { "External id": 177143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177143, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177143, "pid": 5, "tid": 7, "ts": 1716454224205586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096644, "dur": 6, "args": { "External id": 177143, "cbid": 211, "correlation": 177143 } }, { "ph": "s", "id": 177143, "pid": 76337, "tid": -914061504, "ts": 1716454224096644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224205626, "dur": 59, "args": { "External id": 177149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177149, "pid": 5, "tid": 7, "ts": 1716454224205626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096671, "dur": 8, "args": { "External id": 177149, "cbid": 211, "correlation": 177149 } }, { "ph": "s", "id": 177149, "pid": 76337, "tid": -914061504, "ts": 1716454224096671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224205687, "dur": 50, "args": { "External id": 177157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177157, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177157, "pid": 5, "tid": 7, "ts": 1716454224205687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096700, "dur": 9, "args": { "External id": 177157, "cbid": 211, "correlation": 177157 } }, { "ph": "s", "id": 177157, "pid": 76337, "tid": -914061504, "ts": 1716454224096700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224205738, "dur": 35, "args": { "External id": 177165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177165, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177165, "pid": 5, "tid": 7, "ts": 1716454224205738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096729, "dur": 8, "args": { "External id": 177165, "cbid": 211, "correlation": 177165 } }, { "ph": "s", "id": 177165, "pid": 76337, "tid": -914061504, "ts": 1716454224096729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224205775, "dur": 53, "args": { "External id": 177185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177185, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 177185, "pid": 5, "tid": 7, "ts": 1716454224205775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096817, "dur": 13, "args": { "External id": 177185, "cbid": 211, "correlation": 177185 } }, { "ph": "s", "id": 177185, "pid": 76337, "tid": -914061504, "ts": 1716454224096817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224205829, "dur": 5, "args": { "External id": 177197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177197, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177197, "pid": 5, "tid": 7, "ts": 1716454224205829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096839, "dur": 7, "args": { "External id": 177197, "cbid": 211, "correlation": 177197 } }, { "ph": "s", "id": 177197, "pid": 76337, "tid": -914061504, "ts": 1716454224096839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224205835, "dur": 56, "args": { "External id": 177200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177200, "pid": 5, "tid": 7, "ts": 1716454224205835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096857, "dur": 7, "args": { "External id": 177200, "cbid": 211, "correlation": 177200 } }, { "ph": "s", "id": 177200, "pid": 76337, "tid": -914061504, "ts": 1716454224096857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224096919, "dur": 0, "args": { "External id": 177211, "cbid": 317, "correlation": 177211 } }, { "ph": "f", "id": 177211, "pid": 76337, "tid": -914061504, "ts": 1716454224096919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224096920, "dur": 0, "args": { "External id": 177212, "cbid": 203, "correlation": 177212 } }, { "ph": "f", "id": 177212, "pid": 76337, "tid": -914061504, "ts": 1716454224096920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224096920, "dur": 0, "args": { "External id": 177213, "cbid": 205, "correlation": 177213 } }, { "ph": "f", "id": 177213, "pid": 76337, "tid": -914061504, "ts": 1716454224096920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096942, "dur": 1, "args": { "External id": 177217, "cbid": 251, "correlation": 177217 } }, { "ph": "f", "id": 177217, "pid": 76337, "tid": -914061504, "ts": 1716454224096942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096944, "dur": 0, "args": { "External id": 177218, "cbid": 251, "correlation": 177218 } }, { "ph": "f", "id": 177218, "pid": 76337, "tid": -914061504, "ts": 1716454224096944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096945, "dur": 0, "args": { "External id": 177219, "cbid": 251, "correlation": 177219 } }, { "ph": "f", "id": 177219, "pid": 76337, "tid": -914061504, "ts": 1716454224096945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096946, "dur": 0, "args": { "External id": 177220, "cbid": 251, "correlation": 177220 } }, { "ph": "f", "id": 177220, "pid": 76337, "tid": -914061504, "ts": 1716454224096946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096946, "dur": 0, "args": { "External id": 177221, "cbid": 251, "correlation": 177221 } }, { "ph": "f", "id": 177221, "pid": 76337, "tid": -914061504, "ts": 1716454224096946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096947, "dur": 0, "args": { "External id": 177222, "cbid": 251, "correlation": 177222 } }, { "ph": "f", "id": 177222, "pid": 76337, "tid": -914061504, "ts": 1716454224096947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096948, "dur": 0, "args": { "External id": 177223, "cbid": 251, "correlation": 177223 } }, { "ph": "f", "id": 177223, "pid": 76337, "tid": -914061504, "ts": 1716454224096948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096949, "dur": 0, "args": { "External id": 177224, "cbid": 251, "correlation": 177224 } }, { "ph": "f", "id": 177224, "pid": 76337, "tid": -914061504, "ts": 1716454224096949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224096950, "dur": 0, "args": { "External id": 177225, "cbid": 251, "correlation": 177225 } }, { "ph": "f", "id": 177225, "pid": 76337, "tid": -914061504, "ts": 1716454224096950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224205892, "dur": 113, "args": { "External id": 177226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177226, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177226, "pid": 5, "tid": 7, "ts": 1716454224205892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096952, "dur": 13, "args": { "External id": 177226, "cbid": 211, "correlation": 177226 } }, { "ph": "s", "id": 177226, "pid": 76337, "tid": -914061504, "ts": 1716454224096952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224206006, "dur": 60, "args": { "External id": 177232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177232, "pid": 5, "tid": 7, "ts": 1716454224206006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224096996, "dur": 9, "args": { "External id": 177232, "cbid": 211, "correlation": 177232 } }, { "ph": "s", "id": 177232, "pid": 76337, "tid": -914061504, "ts": 1716454224096996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224206067, "dur": 582, "args": { "External id": 177241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177241, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177241, "pid": 5, "tid": 7, "ts": 1716454224206067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097082, "dur": 14, "args": { "External id": 177241, "cbid": 211, "correlation": 177241 } }, { "ph": "s", "id": 177241, "pid": 76337, "tid": -914061504, "ts": 1716454224097082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224206651, "dur": 183, "args": { "External id": 177263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177263, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177263, "pid": 5, "tid": 7, "ts": 1716454224206651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097142, "dur": 11, "args": { "External id": 177263, "cbid": 211, "correlation": 177263 } }, { "ph": "s", "id": 177263, "pid": 76337, "tid": -914061504, "ts": 1716454224097142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224097232, "dur": 1, "args": { "External id": 177274, "cbid": 251, "correlation": 177274 } }, { "ph": "f", "id": 177274, "pid": 76337, "tid": -914061504, "ts": 1716454224097232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224206835, "dur": 199, "args": { "External id": 177275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177275, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177275, "pid": 5, "tid": 7, "ts": 1716454224206835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097237, "dur": 13, "args": { "External id": 177275, "cbid": 211, "correlation": 177275 } }, { "ph": "s", "id": 177275, "pid": 76337, "tid": -914061504, "ts": 1716454224097237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224097309, "dur": 1, "args": { "External id": 177286, "cbid": 251, "correlation": 177286 } }, { "ph": "f", "id": 177286, "pid": 76337, "tid": -914061504, "ts": 1716454224097309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224207036, "dur": 191, "args": { "External id": 177287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177287, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177287, "pid": 5, "tid": 7, "ts": 1716454224207036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097313, "dur": 12, "args": { "External id": 177287, "cbid": 211, "correlation": 177287 } }, { "ph": "s", "id": 177287, "pid": 76337, "tid": -914061504, "ts": 1716454224097313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224097379, "dur": 1, "args": { "External id": 177298, "cbid": 251, "correlation": 177298 } }, { "ph": "f", "id": 177298, "pid": 76337, "tid": -914061504, "ts": 1716454224097379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224207228, "dur": 192, "args": { "External id": 177299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177299, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177299, "pid": 5, "tid": 7, "ts": 1716454224207228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097383, "dur": 11, "args": { "External id": 177299, "cbid": 211, "correlation": 177299 } }, { "ph": "s", "id": 177299, "pid": 76337, "tid": -914061504, "ts": 1716454224097383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224207421, "dur": 18792, "args": { "External id": 177320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177320, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177320, "pid": 5, "tid": 7, "ts": 1716454224207421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097465, "dur": 12, "args": { "External id": 177320, "cbid": 211, "correlation": 177320 } }, { "ph": "s", "id": 177320, "pid": 76337, "tid": -914061504, "ts": 1716454224097465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224097565, "dur": 1, "args": { "External id": 177338, "cbid": 251, "correlation": 177338 } }, { "ph": "f", "id": 177338, "pid": 76337, "tid": -914061504, "ts": 1716454224097565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224226214, "dur": 203, "args": { "External id": 177340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177340, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177340, "pid": 5, "tid": 7, "ts": 1716454224226214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097571, "dur": 13, "args": { "External id": 177340, "cbid": 211, "correlation": 177340 } }, { "ph": "s", "id": 177340, "pid": 76337, "tid": -914061504, "ts": 1716454224097571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224226418, "dur": 66, "args": { "External id": 177348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177348, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177348, "pid": 5, "tid": 7, "ts": 1716454224226418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097643, "dur": 12, "args": { "External id": 177348, "cbid": 211, "correlation": 177348 } }, { "ph": "s", "id": 177348, "pid": 76337, "tid": -914061504, "ts": 1716454224097643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224226485, "dur": 97, "args": { "External id": 177356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177356, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177356, "pid": 5, "tid": 7, "ts": 1716454224226485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097683, "dur": 9, "args": { "External id": 177356, "cbid": 211, "correlation": 177356 } }, { "ph": "s", "id": 177356, "pid": 76337, "tid": -914061504, "ts": 1716454224097683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224226584, "dur": 55, "args": { "External id": 177367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177367, "pid": 5, "tid": 7, "ts": 1716454224226584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097757, "dur": 13, "args": { "External id": 177367, "cbid": 211, "correlation": 177367 } }, { "ph": "s", "id": 177367, "pid": 76337, "tid": -914061504, "ts": 1716454224097757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224226640, "dur": 92, "args": { "External id": 177389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177389, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177389, "pid": 5, "tid": 7, "ts": 1716454224226640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224097789, "dur": 1166, "args": { "External id": 177389, "cbid": 211, "correlation": 177389 } }, { "ph": "s", "id": 177389, "pid": 76337, "tid": -914061504, "ts": 1716454224097789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099044, "dur": 1, "args": { "External id": 177400, "cbid": 251, "correlation": 177400 } }, { "ph": "f", "id": 177400, "pid": 76337, "tid": -914061504, "ts": 1716454224099044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224226733, "dur": 106, "args": { "External id": 177401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177401, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177401, "pid": 5, "tid": 7, "ts": 1716454224226733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099049, "dur": 58, "args": { "External id": 177401, "cbid": 211, "correlation": 177401 } }, { "ph": "s", "id": 177401, "pid": 76337, "tid": -914061504, "ts": 1716454224099049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099168, "dur": 1, "args": { "External id": 177412, "cbid": 251, "correlation": 177412 } }, { "ph": "f", "id": 177412, "pid": 76337, "tid": -914061504, "ts": 1716454224099168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099172, "dur": 0, "args": { "External id": 177413, "cbid": 251, "correlation": 177413 } }, { "ph": "f", "id": 177413, "pid": 76337, "tid": -914061504, "ts": 1716454224099172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224226841, "dur": 10, "args": { "External id": 177414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177414, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 177414, "pid": 5, "tid": 7, "ts": 1716454224226841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099174, "dur": 13, "args": { "External id": 177414, "cbid": 211, "correlation": 177414 } }, { "ph": "s", "id": 177414, "pid": 76337, "tid": -914061504, "ts": 1716454224099174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224226852, "dur": 5, "args": { "External id": 177416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177416, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 177416, "pid": 5, "tid": 7, "ts": 1716454224226852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099188, "dur": 6, "args": { "External id": 177416, "cbid": 211, "correlation": 177416 } }, { "ph": "s", "id": 177416, "pid": 76337, "tid": -914061504, "ts": 1716454224099188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099251, "dur": 1, "args": { "External id": 177427, "cbid": 251, "correlation": 177427 } }, { "ph": "f", "id": 177427, "pid": 76337, "tid": -914061504, "ts": 1716454224099251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099254, "dur": 0, "args": { "External id": 177428, "cbid": 251, "correlation": 177428 } }, { "ph": "f", "id": 177428, "pid": 76337, "tid": -914061504, "ts": 1716454224099254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224226859, "dur": 6, "args": { "External id": 177429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177429, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 177429, "pid": 5, "tid": 7, "ts": 1716454224226859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099256, "dur": 12, "args": { "External id": 177429, "cbid": 211, "correlation": 177429 } }, { "ph": "s", "id": 177429, "pid": 76337, "tid": -914061504, "ts": 1716454224099256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224226866, "dur": 3, "args": { "External id": 177431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177431, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 177431, "pid": 5, "tid": 7, "ts": 1716454224226866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099269, "dur": 5, "args": { "External id": 177431, "cbid": 211, "correlation": 177431 } }, { "ph": "s", "id": 177431, "pid": 76337, "tid": -914061504, "ts": 1716454224099269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224226871, "dur": 156, "args": { "External id": 177452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177452, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177452, "pid": 5, "tid": 7, "ts": 1716454224226871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099344, "dur": 12, "args": { "External id": 177452, "cbid": 211, "correlation": 177452 } }, { "ph": "s", "id": 177452, "pid": 76337, "tid": -914061504, "ts": 1716454224099344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099445, "dur": 1, "args": { "External id": 177470, "cbid": 251, "correlation": 177470 } }, { "ph": "f", "id": 177470, "pid": 76337, "tid": -914061504, "ts": 1716454224099445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224227029, "dur": 106, "args": { "External id": 177472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177472, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177472, "pid": 5, "tid": 7, "ts": 1716454224227029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099451, "dur": 14, "args": { "External id": 177472, "cbid": 211, "correlation": 177472 } }, { "ph": "s", "id": 177472, "pid": 76337, "tid": -914061504, "ts": 1716454224099451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224227136, "dur": 35, "args": { "External id": 177480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177480, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177480, "pid": 5, "tid": 7, "ts": 1716454224227136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099526, "dur": 12, "args": { "External id": 177480, "cbid": 211, "correlation": 177480 } }, { "ph": "s", "id": 177480, "pid": 76337, "tid": -914061504, "ts": 1716454224099526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224227172, "dur": 67, "args": { "External id": 177488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177488, "pid": 5, "tid": 7, "ts": 1716454224227172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099567, "dur": 10, "args": { "External id": 177488, "cbid": 211, "correlation": 177488 } }, { "ph": "s", "id": 177488, "pid": 76337, "tid": -914061504, "ts": 1716454224099567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224227240, "dur": 92, "args": { "External id": 177510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177510, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177510, "pid": 5, "tid": 7, "ts": 1716454224227240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099619, "dur": 11, "args": { "External id": 177510, "cbid": 211, "correlation": 177510 } }, { "ph": "s", "id": 177510, "pid": 76337, "tid": -914061504, "ts": 1716454224099619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099712, "dur": 1, "args": { "External id": 177526, "cbid": 251, "correlation": 177526 } }, { "ph": "f", "id": 177526, "pid": 76337, "tid": -914061504, "ts": 1716454224099712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224227334, "dur": 580, "args": { "External id": 177528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177528, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177528, "pid": 5, "tid": 7, "ts": 1716454224227334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099718, "dur": 14, "args": { "External id": 177528, "cbid": 211, "correlation": 177528 } }, { "ph": "s", "id": 177528, "pid": 76337, "tid": -914061504, "ts": 1716454224099718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224227915, "dur": 246, "args": { "External id": 177536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177536, "pid": 5, "tid": 7, "ts": 1716454224227915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099785, "dur": 12, "args": { "External id": 177536, "cbid": 211, "correlation": 177536 } }, { "ph": "s", "id": 177536, "pid": 76337, "tid": -914061504, "ts": 1716454224099785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224228162, "dur": 254, "args": { "External id": 177544, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177544, "pid": 5, "tid": 7, "ts": 1716454224228162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099817, "dur": 8, "args": { "External id": 177544, "cbid": 211, "correlation": 177544 } }, { "ph": "s", "id": 177544, "pid": 76337, "tid": -914061504, "ts": 1716454224099817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099902, "dur": 1, "args": { "External id": 177560, "cbid": 251, "correlation": 177560 } }, { "ph": "f", "id": 177560, "pid": 76337, "tid": -914061504, "ts": 1716454224099902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224099907, "dur": 0, "args": { "External id": 177562, "cbid": 251, "correlation": 177562 } }, { "ph": "f", "id": 177562, "pid": 76337, "tid": -914061504, "ts": 1716454224099907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224228418, "dur": 361, "args": { "External id": 177563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177563, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 177563, "pid": 5, "tid": 7, "ts": 1716454224228418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099910, "dur": 13, "args": { "External id": 177563, "cbid": 211, "correlation": 177563 } }, { "ph": "s", "id": 177563, "pid": 76337, "tid": -914061504, "ts": 1716454224099910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224228780, "dur": 50, "args": { "External id": 177571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177571, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177571, "pid": 5, "tid": 7, "ts": 1716454224228780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224099952, "dur": 10, "args": { "External id": 177571, "cbid": 211, "correlation": 177571 } }, { "ph": "s", "id": 177571, "pid": 76337, "tid": -914061504, "ts": 1716454224099952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224228831, "dur": 159, "args": { "External id": 177582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177582, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177582, "pid": 5, "tid": 7, "ts": 1716454224228831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100032, "dur": 190, "args": { "External id": 177582, "cbid": 211, "correlation": 177582 } }, { "ph": "s", "id": 177582, "pid": 76337, "tid": -914061504, "ts": 1716454224100032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224100281, "dur": 0, "args": { "External id": 177594, "cbid": 317, "correlation": 177594 } }, { "ph": "f", "id": 177594, "pid": 76337, "tid": -914061504, "ts": 1716454224100281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224100282, "dur": 0, "args": { "External id": 177595, "cbid": 203, "correlation": 177595 } }, { "ph": "f", "id": 177595, "pid": 76337, "tid": -914061504, "ts": 1716454224100282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224100282, "dur": 1, "args": { "External id": 177596, "cbid": 205, "correlation": 177596 } }, { "ph": "f", "id": 177596, "pid": 76337, "tid": -914061504, "ts": 1716454224100282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100309, "dur": 1, "args": { "External id": 177600, "cbid": 251, "correlation": 177600 } }, { "ph": "f", "id": 177600, "pid": 76337, "tid": -914061504, "ts": 1716454224100309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100311, "dur": 0, "args": { "External id": 177601, "cbid": 251, "correlation": 177601 } }, { "ph": "f", "id": 177601, "pid": 76337, "tid": -914061504, "ts": 1716454224100311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100312, "dur": 0, "args": { "External id": 177602, "cbid": 251, "correlation": 177602 } }, { "ph": "f", "id": 177602, "pid": 76337, "tid": -914061504, "ts": 1716454224100312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100313, "dur": 0, "args": { "External id": 177603, "cbid": 251, "correlation": 177603 } }, { "ph": "f", "id": 177603, "pid": 76337, "tid": -914061504, "ts": 1716454224100313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100314, "dur": 0, "args": { "External id": 177604, "cbid": 251, "correlation": 177604 } }, { "ph": "f", "id": 177604, "pid": 76337, "tid": -914061504, "ts": 1716454224100314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100314, "dur": 0, "args": { "External id": 177605, "cbid": 251, "correlation": 177605 } }, { "ph": "f", "id": 177605, "pid": 76337, "tid": -914061504, "ts": 1716454224100314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100315, "dur": 0, "args": { "External id": 177606, "cbid": 251, "correlation": 177606 } }, { "ph": "f", "id": 177606, "pid": 76337, "tid": -914061504, "ts": 1716454224100315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100316, "dur": 0, "args": { "External id": 177607, "cbid": 251, "correlation": 177607 } }, { "ph": "f", "id": 177607, "pid": 76337, "tid": -914061504, "ts": 1716454224100316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224100317, "dur": 0, "args": { "External id": 177608, "cbid": 251, "correlation": 177608 } }, { "ph": "f", "id": 177608, "pid": 76337, "tid": -914061504, "ts": 1716454224100317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224228992, "dur": 116, "args": { "External id": 177609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177609, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177609, "pid": 5, "tid": 7, "ts": 1716454224228992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100319, "dur": 31, "args": { "External id": 177609, "cbid": 211, "correlation": 177609 } }, { "ph": "s", "id": 177609, "pid": 76337, "tid": -914061504, "ts": 1716454224100319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224229109, "dur": 60, "args": { "External id": 177615, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177615, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177615, "pid": 5, "tid": 7, "ts": 1716454224229109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100373, "dur": 104, "args": { "External id": 177615, "cbid": 211, "correlation": 177615 } }, { "ph": "s", "id": 177615, "pid": 76337, "tid": -914061504, "ts": 1716454224100373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224229170, "dur": 50, "args": { "External id": 177623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177623, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177623, "pid": 5, "tid": 7, "ts": 1716454224229170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100501, "dur": 238, "args": { "External id": 177623, "cbid": 211, "correlation": 177623 } }, { "ph": "s", "id": 177623, "pid": 76337, "tid": -914061504, "ts": 1716454224100501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224229221, "dur": 99, "args": { "External id": 177632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177632, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177632, "pid": 5, "tid": 7, "ts": 1716454224229221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100773, "dur": 11, "args": { "External id": 177632, "cbid": 211, "correlation": 177632 } }, { "ph": "s", "id": 177632, "pid": 76337, "tid": -914061504, "ts": 1716454224100773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224229322, "dur": 92, "args": { "External id": 177652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177652, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 177652, "pid": 5, "tid": 7, "ts": 1716454224229322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100853, "dur": 12, "args": { "External id": 177652, "cbid": 211, "correlation": 177652 } }, { "ph": "s", "id": 177652, "pid": 76337, "tid": -914061504, "ts": 1716454224100853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224229415, "dur": 5, "args": { "External id": 177664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177664, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 177664, "pid": 5, "tid": 7, "ts": 1716454224229415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100876, "dur": 7, "args": { "External id": 177664, "cbid": 211, "correlation": 177664 } }, { "ph": "s", "id": 177664, "pid": 76337, "tid": -914061504, "ts": 1716454224100876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224229421, "dur": 109, "args": { "External id": 177667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177667, "pid": 5, "tid": 7, "ts": 1716454224229421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224100895, "dur": 103, "args": { "External id": 177667, "cbid": 211, "correlation": 177667 } }, { "ph": "s", "id": 177667, "pid": 76337, "tid": -914061504, "ts": 1716454224100895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224229531, "dur": 69, "args": { "External id": 177676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177676, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177676, "pid": 5, "tid": 7, "ts": 1716454224229531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101034, "dur": 10, "args": { "External id": 177676, "cbid": 211, "correlation": 177676 } }, { "ph": "s", "id": 177676, "pid": 76337, "tid": -914061504, "ts": 1716454224101034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224101091, "dur": 0, "args": { "External id": 177686, "cbid": 317, "correlation": 177686 } }, { "ph": "f", "id": 177686, "pid": 76337, "tid": -914061504, "ts": 1716454224101091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224101092, "dur": 0, "args": { "External id": 177687, "cbid": 203, "correlation": 177687 } }, { "ph": "f", "id": 177687, "pid": 76337, "tid": -914061504, "ts": 1716454224101092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224101093, "dur": 0, "args": { "External id": 177688, "cbid": 205, "correlation": 177688 } }, { "ph": "f", "id": 177688, "pid": 76337, "tid": -914061504, "ts": 1716454224101093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224229601, "dur": 77, "args": { "External id": 177692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177692, "pid": 5, "tid": 7, "ts": 1716454224229601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101110, "dur": 12, "args": { "External id": 177692, "cbid": 211, "correlation": 177692 } }, { "ph": "s", "id": 177692, "pid": 76337, "tid": -914061504, "ts": 1716454224101110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224229679, "dur": 24, "args": { "External id": 177694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177694, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177694, "pid": 5, "tid": 7, "ts": 1716454224229679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101125, "dur": 5, "args": { "External id": 177694, "cbid": 211, "correlation": 177694 } }, { "ph": "s", "id": 177694, "pid": 76337, "tid": -914061504, "ts": 1716454224101125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224229704, "dur": 4, "args": { "External id": 177696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177696, "pid": 5, "tid": 7, "ts": 1716454224229704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101136, "dur": 6, "args": { "External id": 177696, "cbid": 211, "correlation": 177696 } }, { "ph": "s", "id": 177696, "pid": 76337, "tid": -914061504, "ts": 1716454224101136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224101146, "dur": 0, "args": { "External id": 177697, "cbid": 51, "correlation": 177697 } }, { "ph": "s", "id": 177697, "pid": 76337, "tid": -914061504, "ts": 1716454224101146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224229709, "dur": 1375, "args": { "External id": 177698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177698, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177698, "pid": 5, "tid": 7, "ts": 1716454224229709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101146, "dur": 6, "args": { "External id": 177698, "cbid": 211, "correlation": 177698 } }, { "ph": "s", "id": 177698, "pid": 76337, "tid": -914061504, "ts": 1716454224101146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224231085, "dur": 59, "args": { "External id": 177703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177703, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177703, "pid": 5, "tid": 7, "ts": 1716454224231085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101175, "dur": 9, "args": { "External id": 177703, "cbid": 211, "correlation": 177703 } }, { "ph": "s", "id": 177703, "pid": 76337, "tid": -914061504, "ts": 1716454224101175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224231145, "dur": 5, "args": { "External id": 177711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177711, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177711, "pid": 5, "tid": 7, "ts": 1716454224231145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101220, "dur": 10, "args": { "External id": 177711, "cbid": 211, "correlation": 177711 } }, { "ph": "s", "id": 177711, "pid": 76337, "tid": -914061504, "ts": 1716454224101220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224101292, "dur": 1, "args": { "External id": 177727, "cbid": 251, "correlation": 177727 } }, { "ph": "f", "id": 177727, "pid": 76337, "tid": -914061504, "ts": 1716454224101292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224101297, "dur": 0, "args": { "External id": 177729, "cbid": 251, "correlation": 177729 } }, { "ph": "f", "id": 177729, "pid": 76337, "tid": -914061504, "ts": 1716454224101297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224231152, "dur": 12, "args": { "External id": 177730, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177730, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 177730, "pid": 5, "tid": 7, "ts": 1716454224231152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101299, "dur": 12, "args": { "External id": 177730, "cbid": 211, "correlation": 177730 } }, { "ph": "s", "id": 177730, "pid": 76337, "tid": -914061504, "ts": 1716454224101299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224231165, "dur": 5, "args": { "External id": 177732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177732, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 177732, "pid": 5, "tid": 7, "ts": 1716454224231165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101314, "dur": 5, "args": { "External id": 177732, "cbid": 211, "correlation": 177732 } }, { "ph": "s", "id": 177732, "pid": 76337, "tid": -914061504, "ts": 1716454224101314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224231171, "dur": 54, "args": { "External id": 177742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177742, "pid": 5, "tid": 7, "ts": 1716454224231171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101372, "dur": 532, "args": { "External id": 177742, "cbid": 211, "correlation": 177742 } }, { "ph": "s", "id": 177742, "pid": 76337, "tid": -914061504, "ts": 1716454224101372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224231227, "dur": 52, "args": { "External id": 177762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177762, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 177762, "pid": 5, "tid": 7, "ts": 1716454224231227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101961, "dur": 19, "args": { "External id": 177762, "cbid": 211, "correlation": 177762 } }, { "ph": "s", "id": 177762, "pid": 76337, "tid": -914061504, "ts": 1716454224101961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224231280, "dur": 4, "args": { "External id": 177774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177774, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177774, "pid": 5, "tid": 7, "ts": 1716454224231280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224101991, "dur": 7, "args": { "External id": 177774, "cbid": 211, "correlation": 177774 } }, { "ph": "s", "id": 177774, "pid": 76337, "tid": -914061504, "ts": 1716454224101991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224231285, "dur": 55, "args": { "External id": 177777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177777, "pid": 5, "tid": 7, "ts": 1716454224231285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102013, "dur": 7, "args": { "External id": 177777, "cbid": 211, "correlation": 177777 } }, { "ph": "s", "id": 177777, "pid": 76337, "tid": -914061504, "ts": 1716454224102013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224231342, "dur": 37, "args": { "External id": 177786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177786, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177786, "pid": 5, "tid": 7, "ts": 1716454224231342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102055, "dur": 10, "args": { "External id": 177786, "cbid": 211, "correlation": 177786 } }, { "ph": "s", "id": 177786, "pid": 76337, "tid": -914061504, "ts": 1716454224102055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224102121, "dur": 0, "args": { "External id": 177796, "cbid": 317, "correlation": 177796 } }, { "ph": "f", "id": 177796, "pid": 76337, "tid": -914061504, "ts": 1716454224102121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224102122, "dur": 0, "args": { "External id": 177797, "cbid": 203, "correlation": 177797 } }, { "ph": "f", "id": 177797, "pid": 76337, "tid": -914061504, "ts": 1716454224102122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224102123, "dur": 0, "args": { "External id": 177798, "cbid": 205, "correlation": 177798 } }, { "ph": "f", "id": 177798, "pid": 76337, "tid": -914061504, "ts": 1716454224102123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224231380, "dur": 40, "args": { "External id": 177802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177802, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177802, "pid": 5, "tid": 7, "ts": 1716454224231380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102137, "dur": 12, "args": { "External id": 177802, "cbid": 211, "correlation": 177802 } }, { "ph": "s", "id": 177802, "pid": 76337, "tid": -914061504, "ts": 1716454224102137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224231422, "dur": 14, "args": { "External id": 177804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177804, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177804, "pid": 5, "tid": 7, "ts": 1716454224231422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102152, "dur": 5, "args": { "External id": 177804, "cbid": 211, "correlation": 177804 } }, { "ph": "s", "id": 177804, "pid": 76337, "tid": -914061504, "ts": 1716454224102152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224231437, "dur": 3, "args": { "External id": 177806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177806, "pid": 5, "tid": 7, "ts": 1716454224231437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102161, "dur": 5, "args": { "External id": 177806, "cbid": 211, "correlation": 177806 } }, { "ph": "s", "id": 177806, "pid": 76337, "tid": -914061504, "ts": 1716454224102161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224102170, "dur": 0, "args": { "External id": 177807, "cbid": 51, "correlation": 177807 } }, { "ph": "s", "id": 177807, "pid": 76337, "tid": -914061504, "ts": 1716454224102170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224231442, "dur": 701, "args": { "External id": 177808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177808, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177808, "pid": 5, "tid": 7, "ts": 1716454224231442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102171, "dur": 5, "args": { "External id": 177808, "cbid": 211, "correlation": 177808 } }, { "ph": "s", "id": 177808, "pid": 76337, "tid": -914061504, "ts": 1716454224102171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224232145, "dur": 60, "args": { "External id": 177813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177813, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177813, "pid": 5, "tid": 7, "ts": 1716454224232145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102199, "dur": 9, "args": { "External id": 177813, "cbid": 211, "correlation": 177813 } }, { "ph": "s", "id": 177813, "pid": 76337, "tid": -914061504, "ts": 1716454224102199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224102258, "dur": 0, "args": { "External id": 177823, "cbid": 317, "correlation": 177823 } }, { "ph": "f", "id": 177823, "pid": 76337, "tid": -914061504, "ts": 1716454224102258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224102259, "dur": 0, "args": { "External id": 177824, "cbid": 203, "correlation": 177824 } }, { "ph": "f", "id": 177824, "pid": 76337, "tid": -914061504, "ts": 1716454224102259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224102259, "dur": 0, "args": { "External id": 177825, "cbid": 205, "correlation": 177825 } }, { "ph": "f", "id": 177825, "pid": 76337, "tid": -914061504, "ts": 1716454224102259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224232206, "dur": 75, "args": { "External id": 177829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177829, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177829, "pid": 5, "tid": 7, "ts": 1716454224232206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102271, "dur": 11, "args": { "External id": 177829, "cbid": 211, "correlation": 177829 } }, { "ph": "s", "id": 177829, "pid": 76337, "tid": -914061504, "ts": 1716454224102271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224232282, "dur": 209, "args": { "External id": 177831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177831, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177831, "pid": 5, "tid": 7, "ts": 1716454224232282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102291, "dur": 7, "args": { "External id": 177831, "cbid": 211, "correlation": 177831 } }, { "ph": "s", "id": 177831, "pid": 76337, "tid": -914061504, "ts": 1716454224102291, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224232493, "dur": 38, "args": { "External id": 177833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177833, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177833, "pid": 5, "tid": 7, "ts": 1716454224232493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102303, "dur": 5, "args": { "External id": 177833, "cbid": 211, "correlation": 177833 } }, { "ph": "s", "id": 177833, "pid": 76337, "tid": -914061504, "ts": 1716454224102303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224232532, "dur": 60, "args": { "External id": 177839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177839, "pid": 5, "tid": 7, "ts": 1716454224232532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102329, "dur": 500, "args": { "External id": 177839, "cbid": 211, "correlation": 177839 } }, { "ph": "s", "id": 177839, "pid": 76337, "tid": -914061504, "ts": 1716454224102329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224232593, "dur": 50, "args": { "External id": 177847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177847, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177847, "pid": 5, "tid": 7, "ts": 1716454224232593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102849, "dur": 9, "args": { "External id": 177847, "cbid": 211, "correlation": 177847 } }, { "ph": "s", "id": 177847, "pid": 76337, "tid": -914061504, "ts": 1716454224102849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224232644, "dur": 35, "args": { "External id": 177855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177855, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177855, "pid": 5, "tid": 7, "ts": 1716454224232644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102879, "dur": 9, "args": { "External id": 177855, "cbid": 211, "correlation": 177855 } }, { "ph": "s", "id": 177855, "pid": 76337, "tid": -914061504, "ts": 1716454224102879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224232680, "dur": 52, "args": { "External id": 177875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177875, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 177875, "pid": 5, "tid": 7, "ts": 1716454224232680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102967, "dur": 20, "args": { "External id": 177875, "cbid": 211, "correlation": 177875 } }, { "ph": "s", "id": 177875, "pid": 76337, "tid": -914061504, "ts": 1716454224102967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224232734, "dur": 4, "args": { "External id": 177887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177887, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 177887, "pid": 5, "tid": 7, "ts": 1716454224232734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224102997, "dur": 7, "args": { "External id": 177887, "cbid": 211, "correlation": 177887 } }, { "ph": "s", "id": 177887, "pid": 76337, "tid": -914061504, "ts": 1716454224102997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224232740, "dur": 55, "args": { "External id": 177890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177890, "pid": 5, "tid": 7, "ts": 1716454224232740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103016, "dur": 7, "args": { "External id": 177890, "cbid": 211, "correlation": 177890 } }, { "ph": "s", "id": 177890, "pid": 76337, "tid": -914061504, "ts": 1716454224103016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224103078, "dur": 0, "args": { "External id": 177901, "cbid": 317, "correlation": 177901 } }, { "ph": "f", "id": 177901, "pid": 76337, "tid": -914061504, "ts": 1716454224103078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224103078, "dur": 0, "args": { "External id": 177902, "cbid": 203, "correlation": 177902 } }, { "ph": "f", "id": 177902, "pid": 76337, "tid": -914061504, "ts": 1716454224103078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224103079, "dur": 0, "args": { "External id": 177903, "cbid": 205, "correlation": 177903 } }, { "ph": "f", "id": 177903, "pid": 76337, "tid": -914061504, "ts": 1716454224103079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103101, "dur": 1, "args": { "External id": 177907, "cbid": 251, "correlation": 177907 } }, { "ph": "f", "id": 177907, "pid": 76337, "tid": -914061504, "ts": 1716454224103101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103104, "dur": 0, "args": { "External id": 177908, "cbid": 251, "correlation": 177908 } }, { "ph": "f", "id": 177908, "pid": 76337, "tid": -914061504, "ts": 1716454224103104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103105, "dur": 0, "args": { "External id": 177909, "cbid": 251, "correlation": 177909 } }, { "ph": "f", "id": 177909, "pid": 76337, "tid": -914061504, "ts": 1716454224103105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103106, "dur": 0, "args": { "External id": 177910, "cbid": 251, "correlation": 177910 } }, { "ph": "f", "id": 177910, "pid": 76337, "tid": -914061504, "ts": 1716454224103106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103106, "dur": 0, "args": { "External id": 177911, "cbid": 251, "correlation": 177911 } }, { "ph": "f", "id": 177911, "pid": 76337, "tid": -914061504, "ts": 1716454224103106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103107, "dur": 0, "args": { "External id": 177912, "cbid": 251, "correlation": 177912 } }, { "ph": "f", "id": 177912, "pid": 76337, "tid": -914061504, "ts": 1716454224103107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103108, "dur": 0, "args": { "External id": 177913, "cbid": 251, "correlation": 177913 } }, { "ph": "f", "id": 177913, "pid": 76337, "tid": -914061504, "ts": 1716454224103108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103109, "dur": 0, "args": { "External id": 177914, "cbid": 251, "correlation": 177914 } }, { "ph": "f", "id": 177914, "pid": 76337, "tid": -914061504, "ts": 1716454224103109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103111, "dur": 0, "args": { "External id": 177915, "cbid": 251, "correlation": 177915 } }, { "ph": "f", "id": 177915, "pid": 76337, "tid": -914061504, "ts": 1716454224103111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224232796, "dur": 115, "args": { "External id": 177916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177916, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 177916, "pid": 5, "tid": 7, "ts": 1716454224232796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103113, "dur": 12, "args": { "External id": 177916, "cbid": 211, "correlation": 177916 } }, { "ph": "s", "id": 177916, "pid": 76337, "tid": -914061504, "ts": 1716454224103113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224232912, "dur": 60, "args": { "External id": 177922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177922, "pid": 5, "tid": 7, "ts": 1716454224232912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103148, "dur": 9, "args": { "External id": 177922, "cbid": 211, "correlation": 177922 } }, { "ph": "s", "id": 177922, "pid": 76337, "tid": -914061504, "ts": 1716454224103148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224232973, "dur": 645, "args": { "External id": 177931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177931, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177931, "pid": 5, "tid": 7, "ts": 1716454224232973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103254, "dur": 15, "args": { "External id": 177931, "cbid": 211, "correlation": 177931 } }, { "ph": "s", "id": 177931, "pid": 76337, "tid": -914061504, "ts": 1716454224103254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224233620, "dur": 182, "args": { "External id": 177953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177953, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 177953, "pid": 5, "tid": 7, "ts": 1716454224233620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103317, "dur": 10, "args": { "External id": 177953, "cbid": 211, "correlation": 177953 } }, { "ph": "s", "id": 177953, "pid": 76337, "tid": -914061504, "ts": 1716454224103317, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103413, "dur": 2, "args": { "External id": 177964, "cbid": 251, "correlation": 177964 } }, { "ph": "f", "id": 177964, "pid": 76337, "tid": -914061504, "ts": 1716454224103413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224233804, "dur": 195, "args": { "External id": 177965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177965, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177965, "pid": 5, "tid": 7, "ts": 1716454224233804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103419, "dur": 14, "args": { "External id": 177965, "cbid": 211, "correlation": 177965 } }, { "ph": "s", "id": 177965, "pid": 76337, "tid": -914061504, "ts": 1716454224103419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103491, "dur": 1, "args": { "External id": 177976, "cbid": 251, "correlation": 177976 } }, { "ph": "f", "id": 177976, "pid": 76337, "tid": -914061504, "ts": 1716454224103491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224234000, "dur": 188, "args": { "External id": 177977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177977, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177977, "pid": 5, "tid": 7, "ts": 1716454224234000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103495, "dur": 12, "args": { "External id": 177977, "cbid": 211, "correlation": 177977 } }, { "ph": "s", "id": 177977, "pid": 76337, "tid": -914061504, "ts": 1716454224103495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103563, "dur": 1, "args": { "External id": 177988, "cbid": 251, "correlation": 177988 } }, { "ph": "f", "id": 177988, "pid": 76337, "tid": -914061504, "ts": 1716454224103563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224234190, "dur": 189, "args": { "External id": 177989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 177989, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 177989, "pid": 5, "tid": 7, "ts": 1716454224234190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103568, "dur": 11, "args": { "External id": 177989, "cbid": 211, "correlation": 177989 } }, { "ph": "s", "id": 177989, "pid": 76337, "tid": -914061504, "ts": 1716454224103568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224234380, "dur": 18769, "args": { "External id": 178010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178010, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 178010, "pid": 5, "tid": 7, "ts": 1716454224234380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103649, "dur": 12, "args": { "External id": 178010, "cbid": 211, "correlation": 178010 } }, { "ph": "s", "id": 178010, "pid": 76337, "tid": -914061504, "ts": 1716454224103649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224103753, "dur": 1, "args": { "External id": 178028, "cbid": 251, "correlation": 178028 } }, { "ph": "f", "id": 178028, "pid": 76337, "tid": -914061504, "ts": 1716454224103753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224253150, "dur": 201, "args": { "External id": 178030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178030, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 178030, "pid": 5, "tid": 7, "ts": 1716454224253150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103758, "dur": 14, "args": { "External id": 178030, "cbid": 211, "correlation": 178030 } }, { "ph": "s", "id": 178030, "pid": 76337, "tid": -914061504, "ts": 1716454224103758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224253353, "dur": 67, "args": { "External id": 178038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178038, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178038, "pid": 5, "tid": 7, "ts": 1716454224253353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103833, "dur": 12, "args": { "External id": 178038, "cbid": 211, "correlation": 178038 } }, { "ph": "s", "id": 178038, "pid": 76337, "tid": -914061504, "ts": 1716454224103833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224253421, "dur": 97, "args": { "External id": 178046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178046, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178046, "pid": 5, "tid": 7, "ts": 1716454224253421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103872, "dur": 21, "args": { "External id": 178046, "cbid": 211, "correlation": 178046 } }, { "ph": "s", "id": 178046, "pid": 76337, "tid": -914061504, "ts": 1716454224103872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224253519, "dur": 55, "args": { "External id": 178057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178057, "pid": 5, "tid": 7, "ts": 1716454224253519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224103962, "dur": 1898, "args": { "External id": 178057, "cbid": 211, "correlation": 178057 } }, { "ph": "s", "id": 178057, "pid": 76337, "tid": -914061504, "ts": 1716454224103962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224253575, "dur": 92, "args": { "External id": 178079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178079, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178079, "pid": 5, "tid": 7, "ts": 1716454224253575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224105880, "dur": 127, "args": { "External id": 178079, "cbid": 211, "correlation": 178079 } }, { "ph": "s", "id": 178079, "pid": 76337, "tid": -914061504, "ts": 1716454224105880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106087, "dur": 1, "args": { "External id": 178090, "cbid": 251, "correlation": 178090 } }, { "ph": "f", "id": 178090, "pid": 76337, "tid": -914061504, "ts": 1716454224106087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224253669, "dur": 103, "args": { "External id": 178091, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178091, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 178091, "pid": 5, "tid": 7, "ts": 1716454224253669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106092, "dur": 14, "args": { "External id": 178091, "cbid": 211, "correlation": 178091 } }, { "ph": "s", "id": 178091, "pid": 76337, "tid": -914061504, "ts": 1716454224106092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106170, "dur": 1, "args": { "External id": 178102, "cbid": 251, "correlation": 178102 } }, { "ph": "f", "id": 178102, "pid": 76337, "tid": -914061504, "ts": 1716454224106170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106173, "dur": 0, "args": { "External id": 178103, "cbid": 251, "correlation": 178103 } }, { "ph": "f", "id": 178103, "pid": 76337, "tid": -914061504, "ts": 1716454224106173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224253774, "dur": 10, "args": { "External id": 178104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178104, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178104, "pid": 5, "tid": 7, "ts": 1716454224253774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106175, "dur": 13, "args": { "External id": 178104, "cbid": 211, "correlation": 178104 } }, { "ph": "s", "id": 178104, "pid": 76337, "tid": -914061504, "ts": 1716454224106175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224253785, "dur": 5, "args": { "External id": 178106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178106, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 178106, "pid": 5, "tid": 7, "ts": 1716454224253785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106190, "dur": 6, "args": { "External id": 178106, "cbid": 211, "correlation": 178106 } }, { "ph": "s", "id": 178106, "pid": 76337, "tid": -914061504, "ts": 1716454224106190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106251, "dur": 1, "args": { "External id": 178117, "cbid": 251, "correlation": 178117 } }, { "ph": "f", "id": 178117, "pid": 76337, "tid": -914061504, "ts": 1716454224106251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106255, "dur": 0, "args": { "External id": 178118, "cbid": 251, "correlation": 178118 } }, { "ph": "f", "id": 178118, "pid": 76337, "tid": -914061504, "ts": 1716454224106255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224253792, "dur": 6, "args": { "External id": 178119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178119, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178119, "pid": 5, "tid": 7, "ts": 1716454224253792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106257, "dur": 12, "args": { "External id": 178119, "cbid": 211, "correlation": 178119 } }, { "ph": "s", "id": 178119, "pid": 76337, "tid": -914061504, "ts": 1716454224106257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224253799, "dur": 3, "args": { "External id": 178121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178121, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 178121, "pid": 5, "tid": 7, "ts": 1716454224253799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106270, "dur": 6, "args": { "External id": 178121, "cbid": 211, "correlation": 178121 } }, { "ph": "s", "id": 178121, "pid": 76337, "tid": -914061504, "ts": 1716454224106270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224253804, "dur": 158, "args": { "External id": 178142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178142, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 178142, "pid": 5, "tid": 7, "ts": 1716454224253804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106344, "dur": 13, "args": { "External id": 178142, "cbid": 211, "correlation": 178142 } }, { "ph": "s", "id": 178142, "pid": 76337, "tid": -914061504, "ts": 1716454224106344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106444, "dur": 1, "args": { "External id": 178160, "cbid": 251, "correlation": 178160 } }, { "ph": "f", "id": 178160, "pid": 76337, "tid": -914061504, "ts": 1716454224106444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224253964, "dur": 109, "args": { "External id": 178162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178162, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 178162, "pid": 5, "tid": 7, "ts": 1716454224253964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106450, "dur": 14, "args": { "External id": 178162, "cbid": 211, "correlation": 178162 } }, { "ph": "s", "id": 178162, "pid": 76337, "tid": -914061504, "ts": 1716454224106450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224254074, "dur": 34, "args": { "External id": 178170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178170, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178170, "pid": 5, "tid": 7, "ts": 1716454224254074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106522, "dur": 12, "args": { "External id": 178170, "cbid": 211, "correlation": 178170 } }, { "ph": "s", "id": 178170, "pid": 76337, "tid": -914061504, "ts": 1716454224106522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224254109, "dur": 66, "args": { "External id": 178178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178178, "pid": 5, "tid": 7, "ts": 1716454224254109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106563, "dur": 9, "args": { "External id": 178178, "cbid": 211, "correlation": 178178 } }, { "ph": "s", "id": 178178, "pid": 76337, "tid": -914061504, "ts": 1716454224106563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224254177, "dur": 92, "args": { "External id": 178200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178200, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178200, "pid": 5, "tid": 7, "ts": 1716454224254177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106616, "dur": 10, "args": { "External id": 178200, "cbid": 211, "correlation": 178200 } }, { "ph": "s", "id": 178200, "pid": 76337, "tid": -914061504, "ts": 1716454224106616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106707, "dur": 1, "args": { "External id": 178216, "cbid": 251, "correlation": 178216 } }, { "ph": "f", "id": 178216, "pid": 76337, "tid": -914061504, "ts": 1716454224106707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224254271, "dur": 578, "args": { "External id": 178218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178218, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 178218, "pid": 5, "tid": 7, "ts": 1716454224254271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106713, "dur": 13, "args": { "External id": 178218, "cbid": 211, "correlation": 178218 } }, { "ph": "s", "id": 178218, "pid": 76337, "tid": -914061504, "ts": 1716454224106713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224254850, "dur": 247, "args": { "External id": 178226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178226, "pid": 5, "tid": 7, "ts": 1716454224254850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106780, "dur": 13, "args": { "External id": 178226, "cbid": 211, "correlation": 178226 } }, { "ph": "s", "id": 178226, "pid": 76337, "tid": -914061504, "ts": 1716454224106780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224255098, "dur": 253, "args": { "External id": 178234, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178234, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178234, "pid": 5, "tid": 7, "ts": 1716454224255098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106812, "dur": 9, "args": { "External id": 178234, "cbid": 211, "correlation": 178234 } }, { "ph": "s", "id": 178234, "pid": 76337, "tid": -914061504, "ts": 1716454224106812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106897, "dur": 1, "args": { "External id": 178250, "cbid": 251, "correlation": 178250 } }, { "ph": "f", "id": 178250, "pid": 76337, "tid": -914061504, "ts": 1716454224106897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224106902, "dur": 0, "args": { "External id": 178252, "cbid": 251, "correlation": 178252 } }, { "ph": "f", "id": 178252, "pid": 76337, "tid": -914061504, "ts": 1716454224106902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224255352, "dur": 358, "args": { "External id": 178253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178253, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178253, "pid": 5, "tid": 7, "ts": 1716454224255352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106905, "dur": 14, "args": { "External id": 178253, "cbid": 211, "correlation": 178253 } }, { "ph": "s", "id": 178253, "pid": 76337, "tid": -914061504, "ts": 1716454224106905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224255712, "dur": 50, "args": { "External id": 178261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178261, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178261, "pid": 5, "tid": 7, "ts": 1716454224255712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224106948, "dur": 161, "args": { "External id": 178261, "cbid": 211, "correlation": 178261 } }, { "ph": "s", "id": 178261, "pid": 76337, "tid": -914061504, "ts": 1716454224106948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224255763, "dur": 158, "args": { "External id": 178272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178272, "pid": 5, "tid": 7, "ts": 1716454224255763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107168, "dur": 69, "args": { "External id": 178272, "cbid": 211, "correlation": 178272 } }, { "ph": "s", "id": 178272, "pid": 76337, "tid": -914061504, "ts": 1716454224107168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224107294, "dur": 0, "args": { "External id": 178284, "cbid": 317, "correlation": 178284 } }, { "ph": "f", "id": 178284, "pid": 76337, "tid": -914061504, "ts": 1716454224107294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224107295, "dur": 0, "args": { "External id": 178285, "cbid": 203, "correlation": 178285 } }, { "ph": "f", "id": 178285, "pid": 76337, "tid": -914061504, "ts": 1716454224107295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224107295, "dur": 0, "args": { "External id": 178286, "cbid": 205, "correlation": 178286 } }, { "ph": "f", "id": 178286, "pid": 76337, "tid": -914061504, "ts": 1716454224107295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107318, "dur": 1, "args": { "External id": 178290, "cbid": 251, "correlation": 178290 } }, { "ph": "f", "id": 178290, "pid": 76337, "tid": -914061504, "ts": 1716454224107318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107320, "dur": 0, "args": { "External id": 178291, "cbid": 251, "correlation": 178291 } }, { "ph": "f", "id": 178291, "pid": 76337, "tid": -914061504, "ts": 1716454224107320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107321, "dur": 0, "args": { "External id": 178292, "cbid": 251, "correlation": 178292 } }, { "ph": "f", "id": 178292, "pid": 76337, "tid": -914061504, "ts": 1716454224107321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107322, "dur": 0, "args": { "External id": 178293, "cbid": 251, "correlation": 178293 } }, { "ph": "f", "id": 178293, "pid": 76337, "tid": -914061504, "ts": 1716454224107322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107323, "dur": 0, "args": { "External id": 178294, "cbid": 251, "correlation": 178294 } }, { "ph": "f", "id": 178294, "pid": 76337, "tid": -914061504, "ts": 1716454224107323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107323, "dur": 0, "args": { "External id": 178295, "cbid": 251, "correlation": 178295 } }, { "ph": "f", "id": 178295, "pid": 76337, "tid": -914061504, "ts": 1716454224107323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107324, "dur": 0, "args": { "External id": 178296, "cbid": 251, "correlation": 178296 } }, { "ph": "f", "id": 178296, "pid": 76337, "tid": -914061504, "ts": 1716454224107324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107325, "dur": 0, "args": { "External id": 178297, "cbid": 251, "correlation": 178297 } }, { "ph": "f", "id": 178297, "pid": 76337, "tid": -914061504, "ts": 1716454224107325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224107326, "dur": 0, "args": { "External id": 178298, "cbid": 251, "correlation": 178298 } }, { "ph": "f", "id": 178298, "pid": 76337, "tid": -914061504, "ts": 1716454224107326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224255922, "dur": 116, "args": { "External id": 178299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178299, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 178299, "pid": 5, "tid": 7, "ts": 1716454224255922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107328, "dur": 37, "args": { "External id": 178299, "cbid": 211, "correlation": 178299 } }, { "ph": "s", "id": 178299, "pid": 76337, "tid": -914061504, "ts": 1716454224107328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224256040, "dur": 60, "args": { "External id": 178305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178305, "pid": 5, "tid": 7, "ts": 1716454224256040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107388, "dur": 282, "args": { "External id": 178305, "cbid": 211, "correlation": 178305 } }, { "ph": "s", "id": 178305, "pid": 76337, "tid": -914061504, "ts": 1716454224107388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256101, "dur": 50, "args": { "External id": 178313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178313, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178313, "pid": 5, "tid": 7, "ts": 1716454224256101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107694, "dur": 9, "args": { "External id": 178313, "cbid": 211, "correlation": 178313 } }, { "ph": "s", "id": 178313, "pid": 76337, "tid": -914061504, "ts": 1716454224107694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224256152, "dur": 53, "args": { "External id": 178333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178333, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 178333, "pid": 5, "tid": 7, "ts": 1716454224256152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107779, "dur": 12, "args": { "External id": 178333, "cbid": 211, "correlation": 178333 } }, { "ph": "s", "id": 178333, "pid": 76337, "tid": -914061504, "ts": 1716454224107779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224256207, "dur": 5, "args": { "External id": 178345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178345, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 178345, "pid": 5, "tid": 7, "ts": 1716454224256207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107802, "dur": 7, "args": { "External id": 178345, "cbid": 211, "correlation": 178345 } }, { "ph": "s", "id": 178345, "pid": 76337, "tid": -914061504, "ts": 1716454224107802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224256213, "dur": 56, "args": { "External id": 178348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178348, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178348, "pid": 5, "tid": 7, "ts": 1716454224256213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107820, "dur": 108, "args": { "External id": 178348, "cbid": 211, "correlation": 178348 } }, { "ph": "s", "id": 178348, "pid": 76337, "tid": -914061504, "ts": 1716454224107820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256270, "dur": 36, "args": { "External id": 178357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178357, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178357, "pid": 5, "tid": 7, "ts": 1716454224256270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224107967, "dur": 19, "args": { "External id": 178357, "cbid": 211, "correlation": 178357 } }, { "ph": "s", "id": 178357, "pid": 76337, "tid": -914061504, "ts": 1716454224107967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224108035, "dur": 0, "args": { "External id": 178367, "cbid": 317, "correlation": 178367 } }, { "ph": "f", "id": 178367, "pid": 76337, "tid": -914061504, "ts": 1716454224108035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224108036, "dur": 0, "args": { "External id": 178368, "cbid": 203, "correlation": 178368 } }, { "ph": "f", "id": 178368, "pid": 76337, "tid": -914061504, "ts": 1716454224108036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224108037, "dur": 0, "args": { "External id": 178369, "cbid": 205, "correlation": 178369 } }, { "ph": "f", "id": 178369, "pid": 76337, "tid": -914061504, "ts": 1716454224108037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224256307, "dur": 40, "args": { "External id": 178373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178373, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178373, "pid": 5, "tid": 7, "ts": 1716454224256307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108052, "dur": 12, "args": { "External id": 178373, "cbid": 211, "correlation": 178373 } }, { "ph": "s", "id": 178373, "pid": 76337, "tid": -914061504, "ts": 1716454224108052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224256349, "dur": 4, "args": { "External id": 178375, "device": 5, "context": 1, "stream": 7, "correlation": 178375, "bytes": 46080, "memory bandwidth (GB/s)": 11.428571428571429 } }, { "ph": "f", "id": 178375, "pid": 5, "tid": 7, "ts": 1716454224256349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224108068, "dur": 21, "args": { "External id": 178375, "cbid": 51, "correlation": 178375 } }, { "ph": "s", "id": 178375, "pid": 76337, "tid": -914061504, "ts": 1716454224108068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224108095, "dur": 1, "args": { "External id": 178377, "cbid": 200, "correlation": 178377 } }, { "ph": "f", "id": 178377, "pid": 76337, "tid": -914061504, "ts": 1716454224108095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224108096, "dur": 0, "args": { "External id": 178378, "cbid": 200, "correlation": 178378 } }, { "ph": "f", "id": 178378, "pid": 76337, "tid": -914061504, "ts": 1716454224108096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224108097, "dur": 0, "args": { "External id": 178379, "cbid": 200, "correlation": 178379 } }, { "ph": "f", "id": 178379, "pid": 76337, "tid": -914061504, "ts": 1716454224108097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224108097, "dur": 0, "args": { "External id": 178380, "cbid": 200, "correlation": 178380 } }, { "ph": "f", "id": 178380, "pid": 76337, "tid": -914061504, "ts": 1716454224108097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454224108099, "dur": 3, "args": { "External id": 178381, "cbid": 15, "correlation": 178381 } }, { "ph": "f", "id": 178381, "pid": 76337, "tid": -914061504, "ts": 1716454224108099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224108103, "dur": 1, "args": { "External id": 178382, "cbid": 251, "correlation": 178382 } }, { "ph": "f", "id": 178382, "pid": 76337, "tid": -914061504, "ts": 1716454224108103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454224256354, "dur": 23, "args": { "External id": 178383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178383, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178383, "pid": 5, "tid": 7, "ts": 1716454224256354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108106, "dur": 8, "args": { "External id": 178383, "cbid": 211, "correlation": 178383 } }, { "ph": "s", "id": 178383, "pid": 76337, "tid": -914061504, "ts": 1716454224108106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224256378, "dur": 4, "args": { "External id": 178385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 178385, "pid": 5, "tid": 7, "ts": 1716454224256378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108121, "dur": 5, "args": { "External id": 178385, "cbid": 211, "correlation": 178385 } }, { "ph": "s", "id": 178385, "pid": 76337, "tid": -914061504, "ts": 1716454224108121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224108130, "dur": 0, "args": { "External id": 178386, "cbid": 51, "correlation": 178386 } }, { "ph": "s", "id": 178386, "pid": 76337, "tid": -914061504, "ts": 1716454224108130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224256384, "dur": 190, "args": { "External id": 178387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178387, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178387, "pid": 5, "tid": 7, "ts": 1716454224256384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108131, "dur": 191, "args": { "External id": 178387, "cbid": 211, "correlation": 178387 } }, { "ph": "s", "id": 178387, "pid": 76337, "tid": -914061504, "ts": 1716454224108131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224256575, "dur": 6, "args": { "External id": 178388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178388, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178388, "pid": 5, "tid": 7, "ts": 1716454224256575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108325, "dur": 6, "args": { "External id": 178388, "cbid": 211, "correlation": 178388 } }, { "ph": "s", "id": 178388, "pid": 76337, "tid": -914061504, "ts": 1716454224108325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224256582, "dur": 5, "args": { "External id": 178394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 178394, "pid": 5, "tid": 7, "ts": 1716454224256582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224108355, "dur": 9, "args": { "External id": 178394, "cbid": 211, "correlation": 178394 } }, { "ph": "s", "id": 178394, "pid": 76337, "tid": -914061504, "ts": 1716454224108355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256588, "dur": 3, "args": { "External id": 178402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178402, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178402, "pid": 5, "tid": 7, "ts": 1716454224256588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110508, "dur": 17, "args": { "External id": 178402, "cbid": 211, "correlation": 178402 } }, { "ph": "s", "id": 178402, "pid": 76337, "tid": -914061504, "ts": 1716454224110508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256593, "dur": 3, "args": { "External id": 178410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178410, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178410, "pid": 5, "tid": 7, "ts": 1716454224256593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110552, "dur": 11, "args": { "External id": 178410, "cbid": 211, "correlation": 178410 } }, { "ph": "s", "id": 178410, "pid": 76337, "tid": -914061504, "ts": 1716454224110552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256597, "dur": 3, "args": { "External id": 178418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178418, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178418, "pid": 5, "tid": 7, "ts": 1716454224256597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110579, "dur": 9, "args": { "External id": 178418, "cbid": 211, "correlation": 178418 } }, { "ph": "s", "id": 178418, "pid": 76337, "tid": -914061504, "ts": 1716454224110579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256601, "dur": 3, "args": { "External id": 178427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178427, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178427, "pid": 5, "tid": 7, "ts": 1716454224256601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110788, "dur": 15, "args": { "External id": 178427, "cbid": 211, "correlation": 178427 } }, { "ph": "s", "id": 178427, "pid": 76337, "tid": -914061504, "ts": 1716454224110788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256605, "dur": 3, "args": { "External id": 178436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178436, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178436, "pid": 5, "tid": 7, "ts": 1716454224256605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110818, "dur": 8, "args": { "External id": 178436, "cbid": 211, "correlation": 178436 } }, { "ph": "s", "id": 178436, "pid": 76337, "tid": -914061504, "ts": 1716454224110818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256609, "dur": 3, "args": { "External id": 178444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178444, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178444, "pid": 5, "tid": 7, "ts": 1716454224256609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224110845, "dur": 8, "args": { "External id": 178444, "cbid": 211, "correlation": 178444 } }, { "ph": "s", "id": 178444, "pid": 76337, "tid": -914061504, "ts": 1716454224110845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256614, "dur": 3, "args": { "External id": 178452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178452, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178452, "pid": 5, "tid": 7, "ts": 1716454224256614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224111115, "dur": 16, "args": { "External id": 178452, "cbid": 211, "correlation": 178452 } }, { "ph": "s", "id": 178452, "pid": 76337, "tid": -914061504, "ts": 1716454224111115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256618, "dur": 3, "args": { "External id": 178460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178460, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178460, "pid": 5, "tid": 7, "ts": 1716454224256618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224111147, "dur": 8, "args": { "External id": 178460, "cbid": 211, "correlation": 178460 } }, { "ph": "s", "id": 178460, "pid": 76337, "tid": -914061504, "ts": 1716454224111147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224256623, "dur": 1, "args": { "External id": 178470, "device": 5, "context": 1, "stream": 7, "correlation": 178470, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 178470, "pid": 5, "tid": 7, "ts": 1716454224256623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224111230, "dur": 36, "args": { "External id": 178470, "cbid": 41, "correlation": 178470 } }, { "ph": "s", "id": 178470, "pid": 76337, "tid": -914061504, "ts": 1716454224111230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224111267, "dur": 145383, "args": { "External id": 178471, "cbid": 131, "correlation": 178471 } }, { "ph": "f", "id": 178471, "pid": 76337, "tid": -914061504, "ts": 1716454224111267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224256988, "dur": 3, "args": { "External id": 178479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178479, "pid": 5, "tid": 7, "ts": 1716454224256988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224256951, "dur": 39, "args": { "External id": 178479, "cbid": 211, "correlation": 178479 } }, { "ph": "s", "id": 178479, "pid": 76337, "tid": -914061504, "ts": 1716454224256951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257100, "dur": 3, "args": { "External id": 178488, "device": 5, "context": 1, "stream": 7, "correlation": 178488, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 178488, "pid": 5, "tid": 7, "ts": 1716454224257100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257067, "dur": 32, "args": { "External id": 178488, "cbid": 41, "correlation": 178488 } }, { "ph": "s", "id": 178488, "pid": 76337, "tid": -914061504, "ts": 1716454224257067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224257212, "dur": 4, "args": { "External id": 178498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178498, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178498, "pid": 5, "tid": 7, "ts": 1716454224257212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257194, "dur": 19, "args": { "External id": 178498, "cbid": 211, "correlation": 178498 } }, { "ph": "s", "id": 178498, "pid": 76337, "tid": -914061504, "ts": 1716454224257194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257309, "dur": 1, "args": { "External id": 178508, "device": 5, "context": 1, "stream": 7, "correlation": 178508, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 178508, "pid": 5, "tid": 7, "ts": 1716454224257309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257288, "dur": 20, "args": { "External id": 178508, "cbid": 41, "correlation": 178508 } }, { "ph": "s", "id": 178508, "pid": 76337, "tid": -914061504, "ts": 1716454224257288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224257309, "dur": 8, "args": { "External id": 178509, "cbid": 131, "correlation": 178509 } }, { "ph": "f", "id": 178509, "pid": 76337, "tid": -914061504, "ts": 1716454224257309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257388, "dur": 3, "args": { "External id": 178516, "device": 5, "context": 1, "stream": 7, "correlation": 178516, "bytes": 98304, "memory bandwidth (GB/s)": 30.425255338904364 } }, { "ph": "f", "id": 178516, "pid": 5, "tid": 7, "ts": 1716454224257388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257364, "dur": 23, "args": { "External id": 178516, "cbid": 41, "correlation": 178516 } }, { "ph": "s", "id": 178516, "pid": 76337, "tid": -914061504, "ts": 1716454224257364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257501, "dur": 3, "args": { "External id": 178535, "device": 5, "context": 1, "stream": 7, "correlation": 178535, "bytes": 16, "memory bandwidth (GB/s)": 0.005263157894736842 } }, { "ph": "f", "id": 178535, "pid": 5, "tid": 7, "ts": 1716454224257501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257479, "dur": 21, "args": { "External id": 178535, "cbid": 41, "correlation": 178535 } }, { "ph": "s", "id": 178535, "pid": 76337, "tid": -914061504, "ts": 1716454224257479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224257546, "dur": 3, "args": { "External id": 178541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178541, "pid": 5, "tid": 7, "ts": 1716454224257546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257534, "dur": 12, "args": { "External id": 178541, "cbid": 211, "correlation": 178541 } }, { "ph": "s", "id": 178541, "pid": 76337, "tid": -914061504, "ts": 1716454224257534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454224257559, "dur": 6, "args": { "External id": 178543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178543, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 178543, "pid": 5, "tid": 7, "ts": 1716454224257559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257549, "dur": 9, "args": { "External id": 178543, "cbid": 211, "correlation": 178543 } }, { "ph": "s", "id": 178543, "pid": 76337, "tid": -914061504, "ts": 1716454224257549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454224257568, "dur": 3, "args": { "External id": 178545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178545, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178545, "pid": 5, "tid": 7, "ts": 1716454224257568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257560, "dur": 6, "args": { "External id": 178545, "cbid": 211, "correlation": 178545 } }, { "ph": "s", "id": 178545, "pid": 76337, "tid": -914061504, "ts": 1716454224257560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257606, "dur": 2, "args": { "External id": 178553, "device": 5, "context": 1, "stream": 7, "correlation": 178553, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 178553, "pid": 5, "tid": 7, "ts": 1716454224257606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257592, "dur": 14, "args": { "External id": 178553, "cbid": 41, "correlation": 178553 } }, { "ph": "s", "id": 178553, "pid": 76337, "tid": -914061504, "ts": 1716454224257592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224257657, "dur": 3, "args": { "External id": 178567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178567, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178567, "pid": 5, "tid": 7, "ts": 1716454224257657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257646, "dur": 13, "args": { "External id": 178567, "cbid": 211, "correlation": 178567 } }, { "ph": "s", "id": 178567, "pid": 76337, "tid": -914061504, "ts": 1716454224257646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224257677, "dur": 2, "args": { "External id": 178581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178581, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178581, "pid": 5, "tid": 7, "ts": 1716454224257677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257670, "dur": 6, "args": { "External id": 178581, "cbid": 211, "correlation": 178581 } }, { "ph": "s", "id": 178581, "pid": 76337, "tid": -914061504, "ts": 1716454224257670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224257720, "dur": 6, "args": { "External id": 178588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178588, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178588, "pid": 5, "tid": 7, "ts": 1716454224257720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257709, "dur": 11, "args": { "External id": 178588, "cbid": 211, "correlation": 178588 } }, { "ph": "s", "id": 178588, "pid": 76337, "tid": -914061504, "ts": 1716454224257709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224257731, "dur": 6, "args": { "External id": 178591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178591, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178591, "pid": 5, "tid": 7, "ts": 1716454224257731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257723, "dur": 7, "args": { "External id": 178591, "cbid": 211, "correlation": 178591 } }, { "ph": "s", "id": 178591, "pid": 76337, "tid": -914061504, "ts": 1716454224257723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454224257740, "dur": 3, "args": { "External id": 178593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178593, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178593, "pid": 5, "tid": 7, "ts": 1716454224257740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257732, "dur": 7, "args": { "External id": 178593, "cbid": 211, "correlation": 178593 } }, { "ph": "s", "id": 178593, "pid": 76337, "tid": -914061504, "ts": 1716454224257732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257760, "dur": 2, "args": { "External id": 178596, "device": 5, "context": 1, "stream": 7, "correlation": 178596, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 178596, "pid": 5, "tid": 7, "ts": 1716454224257760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257748, "dur": 12, "args": { "External id": 178596, "cbid": 41, "correlation": 178596 } }, { "ph": "s", "id": 178596, "pid": 76337, "tid": -914061504, "ts": 1716454224257748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224257819, "dur": 4, "args": { "External id": 178612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178612, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178612, "pid": 5, "tid": 7, "ts": 1716454224257819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257806, "dur": 13, "args": { "External id": 178612, "cbid": 211, "correlation": 178612 } }, { "ph": "s", "id": 178612, "pid": 76337, "tid": -914061504, "ts": 1716454224257806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224257841, "dur": 3, "args": { "External id": 178617, "device": 5, "context": 1, "stream": 7, "correlation": 178617, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 178617, "pid": 5, "tid": 7, "ts": 1716454224257841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257825, "dur": 14, "args": { "External id": 178617, "cbid": 41, "correlation": 178617 } }, { "ph": "s", "id": 178617, "pid": 76337, "tid": -914061504, "ts": 1716454224257825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224257877, "dur": 1, "args": { "External id": 178623, "device": 5, "context": 1, "stream": 7, "correlation": 178623, "bytes": 1, "memory bandwidth (GB/s)": 0.0005787037037037037 } }, { "ph": "f", "id": 178623, "pid": 5, "tid": 7, "ts": 1716454224257877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224257849, "dur": 37, "args": { "External id": 178623, "cbid": 41, "correlation": 178623 } }, { "ph": "s", "id": 178623, "pid": 76337, "tid": -914061504, "ts": 1716454224257849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224257887, "dur": 4, "args": { "External id": 178624, "cbid": 131, "correlation": 178624 } }, { "ph": "f", "id": 178624, "pid": 76337, "tid": -914061504, "ts": 1716454224257887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224257942, "dur": 3, "args": { "External id": 178632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178632, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178632, "pid": 5, "tid": 7, "ts": 1716454224257942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257928, "dur": 14, "args": { "External id": 178632, "cbid": 211, "correlation": 178632 } }, { "ph": "s", "id": 178632, "pid": 76337, "tid": -914061504, "ts": 1716454224257928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224257972, "dur": 3, "args": { "External id": 178642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178642, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178642, "pid": 5, "tid": 7, "ts": 1716454224257972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257963, "dur": 9, "args": { "External id": 178642, "cbid": 211, "correlation": 178642 } }, { "ph": "s", "id": 178642, "pid": 76337, "tid": -914061504, "ts": 1716454224257963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258007, "dur": 3, "args": { "External id": 178651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178651, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178651, "pid": 5, "tid": 7, "ts": 1716454224258007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224257997, "dur": 9, "args": { "External id": 178651, "cbid": 211, "correlation": 178651 } }, { "ph": "s", "id": 178651, "pid": 76337, "tid": -914061504, "ts": 1716454224257997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224258155, "dur": 12, "args": { "External id": 178661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178661, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178661, "pid": 5, "tid": 7, "ts": 1716454224258155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258141, "dur": 15, "args": { "External id": 178661, "cbid": 211, "correlation": 178661 } }, { "ph": "s", "id": 178661, "pid": 76337, "tid": -914061504, "ts": 1716454224258141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258195, "dur": 3, "args": { "External id": 178669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178669, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178669, "pid": 5, "tid": 7, "ts": 1716454224258195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258186, "dur": 8, "args": { "External id": 178669, "cbid": 211, "correlation": 178669 } }, { "ph": "s", "id": 178669, "pid": 76337, "tid": -914061504, "ts": 1716454224258186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224258242, "dur": 11, "args": { "External id": 178679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178679, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178679, "pid": 5, "tid": 7, "ts": 1716454224258242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258230, "dur": 12, "args": { "External id": 178679, "cbid": 211, "correlation": 178679 } }, { "ph": "s", "id": 178679, "pid": 76337, "tid": -914061504, "ts": 1716454224258230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224258279, "dur": 10, "args": { "External id": 178687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178687, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178687, "pid": 5, "tid": 7, "ts": 1716454224258279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258269, "dur": 10, "args": { "External id": 178687, "cbid": 211, "correlation": 178687 } }, { "ph": "s", "id": 178687, "pid": 76337, "tid": -914061504, "ts": 1716454224258269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258306, "dur": 3, "args": { "External id": 178696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178696, "pid": 5, "tid": 7, "ts": 1716454224258306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258297, "dur": 9, "args": { "External id": 178696, "cbid": 211, "correlation": 178696 } }, { "ph": "s", "id": 178696, "pid": 76337, "tid": -914061504, "ts": 1716454224258297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224258332, "dur": 5, "args": { "External id": 178705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178705, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178705, "pid": 5, "tid": 7, "ts": 1716454224258332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258323, "dur": 8, "args": { "External id": 178705, "cbid": 211, "correlation": 178705 } }, { "ph": "s", "id": 178705, "pid": 76337, "tid": -914061504, "ts": 1716454224258323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224258377, "dur": 8, "args": { "External id": 178715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178715, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178715, "pid": 5, "tid": 7, "ts": 1716454224258377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258366, "dur": 11, "args": { "External id": 178715, "cbid": 211, "correlation": 178715 } }, { "ph": "s", "id": 178715, "pid": 76337, "tid": -914061504, "ts": 1716454224258366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258765, "dur": 3, "args": { "External id": 178724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178724, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178724, "pid": 5, "tid": 7, "ts": 1716454224258765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258748, "dur": 17, "args": { "External id": 178724, "cbid": 211, "correlation": 178724 } }, { "ph": "s", "id": 178724, "pid": 76337, "tid": -914061504, "ts": 1716454224258748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258793, "dur": 3, "args": { "External id": 178732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178732, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178732, "pid": 5, "tid": 7, "ts": 1716454224258793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258783, "dur": 10, "args": { "External id": 178732, "cbid": 211, "correlation": 178732 } }, { "ph": "s", "id": 178732, "pid": 76337, "tid": -914061504, "ts": 1716454224258783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224258845, "dur": 1, "args": { "External id": 178742, "device": 5, "context": 1, "stream": 7, "correlation": 178742, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 178742, "pid": 5, "tid": 7, "ts": 1716454224258845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224258830, "dur": 13, "args": { "External id": 178742, "cbid": 41, "correlation": 178742 } }, { "ph": "s", "id": 178742, "pid": 76337, "tid": -914061504, "ts": 1716454224258830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224258844, "dur": 8, "args": { "External id": 178743, "cbid": 131, "correlation": 178743 } }, { "ph": "f", "id": 178743, "pid": 76337, "tid": -914061504, "ts": 1716454224258844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224258935, "dur": 2, "args": { "External id": 178751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178751, "pid": 5, "tid": 7, "ts": 1716454224258935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224258921, "dur": 14, "args": { "External id": 178751, "cbid": 211, "correlation": 178751 } }, { "ph": "s", "id": 178751, "pid": 76337, "tid": -914061504, "ts": 1716454224258921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259017, "dur": 3, "args": { "External id": 178760, "device": 5, "context": 1, "stream": 7, "correlation": 178760, "bytes": 8, "memory bandwidth (GB/s)": 0.0025252525252525255 } }, { "ph": "f", "id": 178760, "pid": 5, "tid": 7, "ts": 1716454224259017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224258998, "dur": 19, "args": { "External id": 178760, "cbid": 41, "correlation": 178760 } }, { "ph": "s", "id": 178760, "pid": 76337, "tid": -914061504, "ts": 1716454224258998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224259089, "dur": 3, "args": { "External id": 178770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178770, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 178770, "pid": 5, "tid": 7, "ts": 1716454224259089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259075, "dur": 14, "args": { "External id": 178770, "cbid": 211, "correlation": 178770 } }, { "ph": "s", "id": 178770, "pid": 76337, "tid": -914061504, "ts": 1716454224259075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259140, "dur": 1, "args": { "External id": 178780, "device": 5, "context": 1, "stream": 7, "correlation": 178780, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 178780, "pid": 5, "tid": 7, "ts": 1716454224259140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259127, "dur": 12, "args": { "External id": 178780, "cbid": 41, "correlation": 178780 } }, { "ph": "s", "id": 178780, "pid": 76337, "tid": -914061504, "ts": 1716454224259127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259139, "dur": 8, "args": { "External id": 178781, "cbid": 131, "correlation": 178781 } }, { "ph": "f", "id": 178781, "pid": 76337, "tid": -914061504, "ts": 1716454224259139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259200, "dur": 3, "args": { "External id": 178788, "device": 5, "context": 1, "stream": 7, "correlation": 178788, "bytes": 98304, "memory bandwidth (GB/s)": 31.346938775510203 } }, { "ph": "f", "id": 178788, "pid": 5, "tid": 7, "ts": 1716454224259200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259181, "dur": 18, "args": { "External id": 178788, "cbid": 41, "correlation": 178788 } }, { "ph": "s", "id": 178788, "pid": 76337, "tid": -914061504, "ts": 1716454224259181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259254, "dur": 1, "args": { "External id": 178799, "device": 5, "context": 1, "stream": 7, "correlation": 178799, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 178799, "pid": 5, "tid": 7, "ts": 1716454224259254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259240, "dur": 11, "args": { "External id": 178799, "cbid": 41, "correlation": 178799 } }, { "ph": "s", "id": 178799, "pid": 76337, "tid": -914061504, "ts": 1716454224259240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259252, "dur": 8, "args": { "External id": 178800, "cbid": 131, "correlation": 178800 } }, { "ph": "f", "id": 178800, "pid": 76337, "tid": -914061504, "ts": 1716454224259252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259302, "dur": 3, "args": { "External id": 178808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178808, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178808, "pid": 5, "tid": 7, "ts": 1716454224259302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259288, "dur": 13, "args": { "External id": 178808, "cbid": 211, "correlation": 178808 } }, { "ph": "s", "id": 178808, "pid": 76337, "tid": -914061504, "ts": 1716454224259288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259331, "dur": 3, "args": { "External id": 178818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178818, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178818, "pid": 5, "tid": 7, "ts": 1716454224259331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259321, "dur": 8, "args": { "External id": 178818, "cbid": 211, "correlation": 178818 } }, { "ph": "s", "id": 178818, "pid": 76337, "tid": -914061504, "ts": 1716454224259321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259353, "dur": 3, "args": { "External id": 178827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178827, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178827, "pid": 5, "tid": 7, "ts": 1716454224259353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259344, "dur": 7, "args": { "External id": 178827, "cbid": 211, "correlation": 178827 } }, { "ph": "s", "id": 178827, "pid": 76337, "tid": -914061504, "ts": 1716454224259344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224259422, "dur": 5, "args": { "External id": 178835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178835, "pid": 5, "tid": 7, "ts": 1716454224259422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259409, "dur": 14, "args": { "External id": 178835, "cbid": 211, "correlation": 178835 } }, { "ph": "s", "id": 178835, "pid": 76337, "tid": -914061504, "ts": 1716454224259409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259463, "dur": 3, "args": { "External id": 178844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178844, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178844, "pid": 5, "tid": 7, "ts": 1716454224259463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259452, "dur": 10, "args": { "External id": 178844, "cbid": 211, "correlation": 178844 } }, { "ph": "s", "id": 178844, "pid": 76337, "tid": -914061504, "ts": 1716454224259452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259486, "dur": 3, "args": { "External id": 178853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178853, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178853, "pid": 5, "tid": 7, "ts": 1716454224259486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259477, "dur": 7, "args": { "External id": 178853, "cbid": 211, "correlation": 178853 } }, { "ph": "s", "id": 178853, "pid": 76337, "tid": -914061504, "ts": 1716454224259477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224259568, "dur": 3, "args": { "External id": 178861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178861, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 178861, "pid": 5, "tid": 7, "ts": 1716454224259568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259557, "dur": 10, "args": { "External id": 178861, "cbid": 211, "correlation": 178861 } }, { "ph": "s", "id": 178861, "pid": 76337, "tid": -914061504, "ts": 1716454224259557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224259630, "dur": 1, "args": { "External id": 178869, "device": 5, "context": 1, "stream": 7, "correlation": 178869, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 178869, "pid": 5, "tid": 7, "ts": 1716454224259630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259614, "dur": 26, "args": { "External id": 178869, "cbid": 41, "correlation": 178869 } }, { "ph": "s", "id": 178869, "pid": 76337, "tid": -914061504, "ts": 1716454224259614, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259640, "dur": 3, "args": { "External id": 178870, "cbid": 131, "correlation": 178870 } }, { "ph": "f", "id": 178870, "pid": 76337, "tid": -914061504, "ts": 1716454224259640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259713, "dur": 1, "args": { "External id": 178880, "device": 5, "context": 1, "stream": 7, "correlation": 178880, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 178880, "pid": 5, "tid": 7, "ts": 1716454224259713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259698, "dur": 12, "args": { "External id": 178880, "cbid": 41, "correlation": 178880 } }, { "ph": "s", "id": 178880, "pid": 76337, "tid": -914061504, "ts": 1716454224259698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259711, "dur": 8, "args": { "External id": 178881, "cbid": 131, "correlation": 178881 } }, { "ph": "f", "id": 178881, "pid": 76337, "tid": -914061504, "ts": 1716454224259711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224259775, "dur": 1, "args": { "External id": 178890, "device": 5, "context": 1, "stream": 7, "correlation": 178890, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 178890, "pid": 5, "tid": 7, "ts": 1716454224259775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259763, "dur": 9, "args": { "External id": 178890, "cbid": 41, "correlation": 178890 } }, { "ph": "s", "id": 178890, "pid": 76337, "tid": -914061504, "ts": 1716454224259763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259773, "dur": 8, "args": { "External id": 178891, "cbid": 131, "correlation": 178891 } }, { "ph": "f", "id": 178891, "pid": 76337, "tid": -914061504, "ts": 1716454224259773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224259854, "dur": 4, "args": { "External id": 178898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178898, "pid": 5, "tid": 7, "ts": 1716454224259854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259837, "dur": 18, "args": { "External id": 178898, "cbid": 211, "correlation": 178898 } }, { "ph": "s", "id": 178898, "pid": 76337, "tid": -914061504, "ts": 1716454224259837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454224259897, "dur": 4, "args": { "External id": 178918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178918, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178918, "pid": 5, "tid": 7, "ts": 1716454224259897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259884, "dur": 13, "args": { "External id": 178918, "cbid": 211, "correlation": 178918 } }, { "ph": "s", "id": 178918, "pid": 76337, "tid": -914061504, "ts": 1716454224259884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259898, "dur": 0, "args": { "External id": 178919, "cbid": 11, "correlation": 178919 } }, { "ph": "f", "id": 178919, "pid": 76337, "tid": -914061504, "ts": 1716454224259898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259899, "dur": 0, "args": { "External id": 178920, "cbid": 11, "correlation": 178920 } }, { "ph": "f", "id": 178920, "pid": 76337, "tid": -914061504, "ts": 1716454224259899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224259912, "dur": 1, "args": { "External id": 178923, "device": 5, "context": 1, "stream": 7, "correlation": 178923, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 178923, "pid": 5, "tid": 7, "ts": 1716454224259912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224259900, "dur": 21, "args": { "External id": 178923, "cbid": 41, "correlation": 178923 } }, { "ph": "s", "id": 178923, "pid": 76337, "tid": -914061504, "ts": 1716454224259900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224259922, "dur": 3, "args": { "External id": 178924, "cbid": 131, "correlation": 178924 } }, { "ph": "f", "id": 178924, "pid": 76337, "tid": -914061504, "ts": 1716454224259922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224259951, "dur": 3, "args": { "External id": 178948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178948, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178948, "pid": 5, "tid": 7, "ts": 1716454224259951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259940, "dur": 10, "args": { "External id": 178948, "cbid": 211, "correlation": 178948 } }, { "ph": "s", "id": 178948, "pid": 76337, "tid": -914061504, "ts": 1716454224259940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259951, "dur": 0, "args": { "External id": 178949, "cbid": 11, "correlation": 178949 } }, { "ph": "f", "id": 178949, "pid": 76337, "tid": -914061504, "ts": 1716454224259951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259952, "dur": 0, "args": { "External id": 178950, "cbid": 11, "correlation": 178950 } }, { "ph": "f", "id": 178950, "pid": 76337, "tid": -914061504, "ts": 1716454224259952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224259953, "dur": 1, "args": { "External id": 178952, "cbid": 200, "correlation": 178952 } }, { "ph": "f", "id": 178952, "pid": 76337, "tid": -914061504, "ts": 1716454224259953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454224259964, "dur": 4, "args": { "External id": 178954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178954, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178954, "pid": 5, "tid": 7, "ts": 1716454224259964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224259956, "dur": 8, "args": { "External id": 178954, "cbid": 211, "correlation": 178954 } }, { "ph": "s", "id": 178954, "pid": 76337, "tid": -914061504, "ts": 1716454224259956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259965, "dur": 0, "args": { "External id": 178955, "cbid": 11, "correlation": 178955 } }, { "ph": "f", "id": 178955, "pid": 76337, "tid": -914061504, "ts": 1716454224259965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224259965, "dur": 0, "args": { "External id": 178956, "cbid": 11, "correlation": 178956 } }, { "ph": "f", "id": 178956, "pid": 76337, "tid": -914061504, "ts": 1716454224259965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224260014, "dur": 1, "args": { "External id": 178963, "device": 5, "context": 1, "stream": 7, "correlation": 178963, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 178963, "pid": 5, "tid": 7, "ts": 1716454224260014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224260002, "dur": 21, "args": { "External id": 178963, "cbid": 41, "correlation": 178963 } }, { "ph": "s", "id": 178963, "pid": 76337, "tid": -914061504, "ts": 1716454224260002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224260024, "dur": 3, "args": { "External id": 178964, "cbid": 131, "correlation": 178964 } }, { "ph": "f", "id": 178964, "pid": 76337, "tid": -914061504, "ts": 1716454224260024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224260075, "dur": 1, "args": { "External id": 178974, "device": 5, "context": 1, "stream": 7, "correlation": 178974, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 178974, "pid": 5, "tid": 7, "ts": 1716454224260075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224260063, "dur": 10, "args": { "External id": 178974, "cbid": 41, "correlation": 178974 } }, { "ph": "s", "id": 178974, "pid": 76337, "tid": -914061504, "ts": 1716454224260063, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224260073, "dur": 8, "args": { "External id": 178975, "cbid": 131, "correlation": 178975 } }, { "ph": "f", "id": 178975, "pid": 76337, "tid": -914061504, "ts": 1716454224260073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224260147, "dur": 5, "args": { "External id": 178982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178982, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178982, "pid": 5, "tid": 7, "ts": 1716454224260147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260131, "dur": 16, "args": { "External id": 178982, "cbid": 211, "correlation": 178982 } }, { "ph": "s", "id": 178982, "pid": 76337, "tid": -914061504, "ts": 1716454224260131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260225, "dur": 3, "args": { "External id": 178991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178991, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178991, "pid": 5, "tid": 7, "ts": 1716454224260225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260212, "dur": 13, "args": { "External id": 178991, "cbid": 211, "correlation": 178991 } }, { "ph": "s", "id": 178991, "pid": 76337, "tid": -914061504, "ts": 1716454224260212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260261, "dur": 3, "args": { "External id": 178999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 178999, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 178999, "pid": 5, "tid": 7, "ts": 1716454224260261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260251, "dur": 9, "args": { "External id": 178999, "cbid": 211, "correlation": 178999 } }, { "ph": "s", "id": 178999, "pid": 76337, "tid": -914061504, "ts": 1716454224260251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260295, "dur": 4, "args": { "External id": 179007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179007, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179007, "pid": 5, "tid": 7, "ts": 1716454224260295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260284, "dur": 12, "args": { "External id": 179007, "cbid": 211, "correlation": 179007 } }, { "ph": "s", "id": 179007, "pid": 76337, "tid": -914061504, "ts": 1716454224260284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260327, "dur": 4, "args": { "External id": 179015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179015, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179015, "pid": 5, "tid": 7, "ts": 1716454224260327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260316, "dur": 10, "args": { "External id": 179015, "cbid": 211, "correlation": 179015 } }, { "ph": "s", "id": 179015, "pid": 76337, "tid": -914061504, "ts": 1716454224260316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260354, "dur": 3, "args": { "External id": 179023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179023, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179023, "pid": 5, "tid": 7, "ts": 1716454224260354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260345, "dur": 8, "args": { "External id": 179023, "cbid": 211, "correlation": 179023 } }, { "ph": "s", "id": 179023, "pid": 76337, "tid": -914061504, "ts": 1716454224260345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260381, "dur": 3, "args": { "External id": 179031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179031, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179031, "pid": 5, "tid": 7, "ts": 1716454224260381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260371, "dur": 10, "args": { "External id": 179031, "cbid": 211, "correlation": 179031 } }, { "ph": "s", "id": 179031, "pid": 76337, "tid": -914061504, "ts": 1716454224260371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224260402, "dur": 4, "args": { "External id": 179039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179039, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179039, "pid": 5, "tid": 7, "ts": 1716454224260402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260393, "dur": 7, "args": { "External id": 179039, "cbid": 211, "correlation": 179039 } }, { "ph": "s", "id": 179039, "pid": 76337, "tid": -914061504, "ts": 1716454224260393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224260420, "dur": 4, "args": { "External id": 179047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179047, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179047, "pid": 5, "tid": 7, "ts": 1716454224260420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260412, "dur": 7, "args": { "External id": 179047, "cbid": 211, "correlation": 179047 } }, { "ph": "s", "id": 179047, "pid": 76337, "tid": -914061504, "ts": 1716454224260412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260440, "dur": 3, "args": { "External id": 179055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179055, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179055, "pid": 5, "tid": 7, "ts": 1716454224260440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260433, "dur": 7, "args": { "External id": 179055, "cbid": 211, "correlation": 179055 } }, { "ph": "s", "id": 179055, "pid": 76337, "tid": -914061504, "ts": 1716454224260433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260507, "dur": 3, "args": { "External id": 179063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179063, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179063, "pid": 5, "tid": 7, "ts": 1716454224260507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260495, "dur": 11, "args": { "External id": 179063, "cbid": 211, "correlation": 179063 } }, { "ph": "s", "id": 179063, "pid": 76337, "tid": -914061504, "ts": 1716454224260495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224260533, "dur": 4, "args": { "External id": 179071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179071, "pid": 5, "tid": 7, "ts": 1716454224260533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260524, "dur": 8, "args": { "External id": 179071, "cbid": 211, "correlation": 179071 } }, { "ph": "s", "id": 179071, "pid": 76337, "tid": -914061504, "ts": 1716454224260524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224260555, "dur": 4, "args": { "External id": 179079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179079, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179079, "pid": 5, "tid": 7, "ts": 1716454224260555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260546, "dur": 8, "args": { "External id": 179079, "cbid": 211, "correlation": 179079 } }, { "ph": "s", "id": 179079, "pid": 76337, "tid": -914061504, "ts": 1716454224260546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224260574, "dur": 3, "args": { "External id": 179087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179087, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179087, "pid": 5, "tid": 7, "ts": 1716454224260574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224260566, "dur": 7, "args": { "External id": 179087, "cbid": 211, "correlation": 179087 } }, { "ph": "s", "id": 179087, "pid": 76337, "tid": -914061504, "ts": 1716454224260566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224261141, "dur": 5, "args": { "External id": 179096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179096, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179096, "pid": 5, "tid": 7, "ts": 1716454224261141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261125, "dur": 16, "args": { "External id": 179096, "cbid": 211, "correlation": 179096 } }, { "ph": "s", "id": 179096, "pid": 76337, "tid": -914061504, "ts": 1716454224261125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224261179, "dur": 5, "args": { "External id": 179105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179105, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179105, "pid": 5, "tid": 7, "ts": 1716454224261179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261169, "dur": 9, "args": { "External id": 179105, "cbid": 211, "correlation": 179105 } }, { "ph": "s", "id": 179105, "pid": 76337, "tid": -914061504, "ts": 1716454224261169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224261374, "dur": 3, "args": { "External id": 179121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179121, "pid": 5, "tid": 7, "ts": 1716454224261374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261360, "dur": 15, "args": { "External id": 179121, "cbid": 211, "correlation": 179121 } }, { "ph": "s", "id": 179121, "pid": 76337, "tid": -914061504, "ts": 1716454224261360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261409, "dur": 3, "args": { "External id": 179129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179129, "pid": 5, "tid": 7, "ts": 1716454224261409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261399, "dur": 9, "args": { "External id": 179129, "cbid": 211, "correlation": 179129 } }, { "ph": "s", "id": 179129, "pid": 76337, "tid": -914061504, "ts": 1716454224261399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261439, "dur": 3, "args": { "External id": 179137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179137, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179137, "pid": 5, "tid": 7, "ts": 1716454224261439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261429, "dur": 8, "args": { "External id": 179137, "cbid": 211, "correlation": 179137 } }, { "ph": "s", "id": 179137, "pid": 76337, "tid": -914061504, "ts": 1716454224261429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261470, "dur": 4, "args": { "External id": 179145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179145, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179145, "pid": 5, "tid": 7, "ts": 1716454224261470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261461, "dur": 8, "args": { "External id": 179145, "cbid": 211, "correlation": 179145 } }, { "ph": "s", "id": 179145, "pid": 76337, "tid": -914061504, "ts": 1716454224261461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224261524, "dur": 4, "args": { "External id": 179157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179157, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179157, "pid": 5, "tid": 7, "ts": 1716454224261524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261512, "dur": 12, "args": { "External id": 179157, "cbid": 211, "correlation": 179157 } }, { "ph": "s", "id": 179157, "pid": 76337, "tid": -914061504, "ts": 1716454224261512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224261569, "dur": 4, "args": { "External id": 179168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179168, "pid": 5, "tid": 7, "ts": 1716454224261569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261557, "dur": 11, "args": { "External id": 179168, "cbid": 211, "correlation": 179168 } }, { "ph": "s", "id": 179168, "pid": 76337, "tid": -914061504, "ts": 1716454224261557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261600, "dur": 3, "args": { "External id": 179176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179176, "pid": 5, "tid": 7, "ts": 1716454224261600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261590, "dur": 9, "args": { "External id": 179176, "cbid": 211, "correlation": 179176 } }, { "ph": "s", "id": 179176, "pid": 76337, "tid": -914061504, "ts": 1716454224261590, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261637, "dur": 5, "args": { "External id": 179184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179184, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179184, "pid": 5, "tid": 7, "ts": 1716454224261637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261626, "dur": 11, "args": { "External id": 179184, "cbid": 211, "correlation": 179184 } }, { "ph": "s", "id": 179184, "pid": 76337, "tid": -914061504, "ts": 1716454224261626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224261668, "dur": 5, "args": { "External id": 179192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179192, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179192, "pid": 5, "tid": 7, "ts": 1716454224261668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261658, "dur": 10, "args": { "External id": 179192, "cbid": 211, "correlation": 179192 } }, { "ph": "s", "id": 179192, "pid": 76337, "tid": -914061504, "ts": 1716454224261658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224261705, "dur": 4, "args": { "External id": 179201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179201, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179201, "pid": 5, "tid": 7, "ts": 1716454224261705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261694, "dur": 10, "args": { "External id": 179201, "cbid": 211, "correlation": 179201 } }, { "ph": "s", "id": 179201, "pid": 76337, "tid": -914061504, "ts": 1716454224261694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224261765, "dur": 4, "args": { "External id": 179214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179214, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179214, "pid": 5, "tid": 7, "ts": 1716454224261765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261753, "dur": 13, "args": { "External id": 179214, "cbid": 211, "correlation": 179214 } }, { "ph": "s", "id": 179214, "pid": 76337, "tid": -914061504, "ts": 1716454224261753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224261808, "dur": 5, "args": { "External id": 179224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179224, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179224, "pid": 5, "tid": 7, "ts": 1716454224261808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224261797, "dur": 11, "args": { "External id": 179224, "cbid": 211, "correlation": 179224 } }, { "ph": "s", "id": 179224, "pid": 76337, "tid": -914061504, "ts": 1716454224261797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224261989, "dur": 5, "args": { "External id": 179241, "cbid": 251, "correlation": 179241 } }, { "ph": "f", "id": 179241, "pid": 76337, "tid": -914061504, "ts": 1716454224261989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454224262020, "dur": 12, "args": { "External id": 179243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179243, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179243, "pid": 5, "tid": 7, "ts": 1716454224262020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262004, "dur": 17, "args": { "External id": 179243, "cbid": 211, "correlation": 179243 } }, { "ph": "s", "id": 179243, "pid": 76337, "tid": -914061504, "ts": 1716454224262004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224262105, "dur": 3, "args": { "External id": 179251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179251, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179251, "pid": 5, "tid": 7, "ts": 1716454224262105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262092, "dur": 13, "args": { "External id": 179251, "cbid": 211, "correlation": 179251 } }, { "ph": "s", "id": 179251, "pid": 76337, "tid": -914061504, "ts": 1716454224262092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224262170, "dur": 2, "args": { "External id": 179267, "cbid": 251, "correlation": 179267 } }, { "ph": "f", "id": 179267, "pid": 76337, "tid": -914061504, "ts": 1716454224262170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224262177, "dur": 0, "args": { "External id": 179269, "cbid": 251, "correlation": 179269 } }, { "ph": "f", "id": 179269, "pid": 76337, "tid": -914061504, "ts": 1716454224262177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224262194, "dur": 13, "args": { "External id": 179270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179270, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179270, "pid": 5, "tid": 7, "ts": 1716454224262194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262180, "dur": 15, "args": { "External id": 179270, "cbid": 211, "correlation": 179270 } }, { "ph": "s", "id": 179270, "pid": 76337, "tid": -914061504, "ts": 1716454224262180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224262210, "dur": 5, "args": { "External id": 179272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179272, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179272, "pid": 5, "tid": 7, "ts": 1716454224262210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262200, "dur": 9, "args": { "External id": 179272, "cbid": 211, "correlation": 179272 } }, { "ph": "s", "id": 179272, "pid": 76337, "tid": -914061504, "ts": 1716454224262200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224262339, "dur": 1, "args": { "External id": 179282, "cbid": 317, "correlation": 179282 } }, { "ph": "f", "id": 179282, "pid": 76337, "tid": -914061504, "ts": 1716454224262339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224262341, "dur": 1, "args": { "External id": 179283, "cbid": 203, "correlation": 179283 } }, { "ph": "f", "id": 179283, "pid": 76337, "tid": -914061504, "ts": 1716454224262341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224262343, "dur": 1, "args": { "External id": 179284, "cbid": 205, "correlation": 179284 } }, { "ph": "f", "id": 179284, "pid": 76337, "tid": -914061504, "ts": 1716454224262343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224262403, "dur": 6, "args": { "External id": 179288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179288, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179288, "pid": 5, "tid": 7, "ts": 1716454224262403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262386, "dur": 17, "args": { "External id": 179288, "cbid": 211, "correlation": 179288 } }, { "ph": "s", "id": 179288, "pid": 76337, "tid": -914061504, "ts": 1716454224262386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224262415, "dur": 4, "args": { "External id": 179290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179290, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 179290, "pid": 5, "tid": 7, "ts": 1716454224262415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262407, "dur": 7, "args": { "External id": 179290, "cbid": 211, "correlation": 179290 } }, { "ph": "s", "id": 179290, "pid": 76337, "tid": -914061504, "ts": 1716454224262407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224262436, "dur": 3, "args": { "External id": 179292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179292, "pid": 5, "tid": 7, "ts": 1716454224262436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262427, "dur": 8, "args": { "External id": 179292, "cbid": 211, "correlation": 179292 } }, { "ph": "s", "id": 179292, "pid": 76337, "tid": -914061504, "ts": 1716454224262427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224262440, "dur": 0, "args": { "External id": 179293, "cbid": 51, "correlation": 179293 } }, { "ph": "s", "id": 179293, "pid": 76337, "tid": -914061504, "ts": 1716454224262440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224262450, "dur": 87, "args": { "External id": 179294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179294, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179294, "pid": 5, "tid": 7, "ts": 1716454224262450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262442, "dur": 7, "args": { "External id": 179294, "cbid": 211, "correlation": 179294 } }, { "ph": "s", "id": 179294, "pid": 76337, "tid": -914061504, "ts": 1716454224262442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224262539, "dur": 60, "args": { "External id": 179299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179299, "pid": 5, "tid": 7, "ts": 1716454224262539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224262481, "dur": 11, "args": { "External id": 179299, "cbid": 211, "correlation": 179299 } }, { "ph": "s", "id": 179299, "pid": 76337, "tid": -914061504, "ts": 1716454224262481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224264426, "dur": 52, "args": { "External id": 179319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179319, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 179319, "pid": 5, "tid": 7, "ts": 1716454224264426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264409, "dur": 17, "args": { "External id": 179319, "cbid": 211, "correlation": 179319 } }, { "ph": "s", "id": 179319, "pid": 76337, "tid": -914061504, "ts": 1716454224264409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224264479, "dur": 4, "args": { "External id": 179331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179331, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179331, "pid": 5, "tid": 7, "ts": 1716454224264479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264438, "dur": 8, "args": { "External id": 179331, "cbid": 211, "correlation": 179331 } }, { "ph": "s", "id": 179331, "pid": 76337, "tid": -914061504, "ts": 1716454224264438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224264485, "dur": 58, "args": { "External id": 179334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179334, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179334, "pid": 5, "tid": 7, "ts": 1716454224264485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264464, "dur": 8, "args": { "External id": 179334, "cbid": 211, "correlation": 179334 } }, { "ph": "s", "id": 179334, "pid": 76337, "tid": -914061504, "ts": 1716454224264464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224264544, "dur": 36, "args": { "External id": 179343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179343, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179343, "pid": 5, "tid": 7, "ts": 1716454224264544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264513, "dur": 11, "args": { "External id": 179343, "cbid": 211, "correlation": 179343 } }, { "ph": "s", "id": 179343, "pid": 76337, "tid": -914061504, "ts": 1716454224264513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224264576, "dur": 0, "args": { "External id": 179353, "cbid": 317, "correlation": 179353 } }, { "ph": "f", "id": 179353, "pid": 76337, "tid": -914061504, "ts": 1716454224264576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224264577, "dur": 0, "args": { "External id": 179354, "cbid": 203, "correlation": 179354 } }, { "ph": "f", "id": 179354, "pid": 76337, "tid": -914061504, "ts": 1716454224264577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224264578, "dur": 0, "args": { "External id": 179355, "cbid": 205, "correlation": 179355 } }, { "ph": "f", "id": 179355, "pid": 76337, "tid": -914061504, "ts": 1716454224264578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224264613, "dur": 40, "args": { "External id": 179359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179359, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179359, "pid": 5, "tid": 7, "ts": 1716454224264613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264595, "dur": 17, "args": { "External id": 179359, "cbid": 211, "correlation": 179359 } }, { "ph": "s", "id": 179359, "pid": 76337, "tid": -914061504, "ts": 1716454224264595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224264654, "dur": 14, "args": { "External id": 179361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179361, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179361, "pid": 5, "tid": 7, "ts": 1716454224264654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264615, "dur": 6, "args": { "External id": 179361, "cbid": 211, "correlation": 179361 } }, { "ph": "s", "id": 179361, "pid": 76337, "tid": -914061504, "ts": 1716454224264615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224264669, "dur": 3, "args": { "External id": 179363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179363, "pid": 5, "tid": 7, "ts": 1716454224264669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264626, "dur": 6, "args": { "External id": 179363, "cbid": 211, "correlation": 179363 } }, { "ph": "s", "id": 179363, "pid": 76337, "tid": -914061504, "ts": 1716454224264626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224264636, "dur": 0, "args": { "External id": 179364, "cbid": 51, "correlation": 179364 } }, { "ph": "s", "id": 179364, "pid": 76337, "tid": -914061504, "ts": 1716454224264636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224264674, "dur": 704, "args": { "External id": 179365, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179365, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179365, "pid": 5, "tid": 7, "ts": 1716454224264674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264638, "dur": 8, "args": { "External id": 179365, "cbid": 211, "correlation": 179365 } }, { "ph": "s", "id": 179365, "pid": 76337, "tid": -914061504, "ts": 1716454224264638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224265379, "dur": 60, "args": { "External id": 179370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179370, "pid": 5, "tid": 7, "ts": 1716454224265379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264669, "dur": 9, "args": { "External id": 179370, "cbid": 211, "correlation": 179370 } }, { "ph": "s", "id": 179370, "pid": 76337, "tid": -914061504, "ts": 1716454224264669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224265440, "dur": 4, "args": { "External id": 179378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179378, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179378, "pid": 5, "tid": 7, "ts": 1716454224265440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264715, "dur": 9, "args": { "External id": 179378, "cbid": 211, "correlation": 179378 } }, { "ph": "s", "id": 179378, "pid": 76337, "tid": -914061504, "ts": 1716454224264715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224264784, "dur": 2, "args": { "External id": 179394, "cbid": 251, "correlation": 179394 } }, { "ph": "f", "id": 179394, "pid": 76337, "tid": -914061504, "ts": 1716454224264784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224264790, "dur": 0, "args": { "External id": 179396, "cbid": 251, "correlation": 179396 } }, { "ph": "f", "id": 179396, "pid": 76337, "tid": -914061504, "ts": 1716454224264790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224265445, "dur": 9, "args": { "External id": 179397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179397, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 179397, "pid": 5, "tid": 7, "ts": 1716454224265445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264792, "dur": 13, "args": { "External id": 179397, "cbid": 211, "correlation": 179397 } }, { "ph": "s", "id": 179397, "pid": 76337, "tid": -914061504, "ts": 1716454224264792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224265455, "dur": 4, "args": { "External id": 179399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179399, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 179399, "pid": 5, "tid": 7, "ts": 1716454224265455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264807, "dur": 6, "args": { "External id": 179399, "cbid": 211, "correlation": 179399 } }, { "ph": "s", "id": 179399, "pid": 76337, "tid": -914061504, "ts": 1716454224264807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224265461, "dur": 55, "args": { "External id": 179409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179409, "pid": 5, "tid": 7, "ts": 1716454224265461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264868, "dur": 12, "args": { "External id": 179409, "cbid": 211, "correlation": 179409 } }, { "ph": "s", "id": 179409, "pid": 76337, "tid": -914061504, "ts": 1716454224264868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224265517, "dur": 52, "args": { "External id": 179429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179429, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 179429, "pid": 5, "tid": 7, "ts": 1716454224265517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264936, "dur": 11, "args": { "External id": 179429, "cbid": 211, "correlation": 179429 } }, { "ph": "s", "id": 179429, "pid": 76337, "tid": -914061504, "ts": 1716454224264936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224265571, "dur": 4, "args": { "External id": 179441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179441, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179441, "pid": 5, "tid": 7, "ts": 1716454224265571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264958, "dur": 6, "args": { "External id": 179441, "cbid": 211, "correlation": 179441 } }, { "ph": "s", "id": 179441, "pid": 76337, "tid": -914061504, "ts": 1716454224264958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224265576, "dur": 56, "args": { "External id": 179444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179444, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179444, "pid": 5, "tid": 7, "ts": 1716454224265576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224264986, "dur": 7, "args": { "External id": 179444, "cbid": 211, "correlation": 179444 } }, { "ph": "s", "id": 179444, "pid": 76337, "tid": -914061504, "ts": 1716454224264986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224265633, "dur": 37, "args": { "External id": 179453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179453, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179453, "pid": 5, "tid": 7, "ts": 1716454224265633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265029, "dur": 10, "args": { "External id": 179453, "cbid": 211, "correlation": 179453 } }, { "ph": "s", "id": 179453, "pid": 76337, "tid": -914061504, "ts": 1716454224265029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224265109, "dur": 0, "args": { "External id": 179463, "cbid": 317, "correlation": 179463 } }, { "ph": "f", "id": 179463, "pid": 76337, "tid": -914061504, "ts": 1716454224265109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224265110, "dur": 0, "args": { "External id": 179464, "cbid": 203, "correlation": 179464 } }, { "ph": "f", "id": 179464, "pid": 76337, "tid": -914061504, "ts": 1716454224265110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224265110, "dur": 0, "args": { "External id": 179465, "cbid": 205, "correlation": 179465 } }, { "ph": "f", "id": 179465, "pid": 76337, "tid": -914061504, "ts": 1716454224265110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224265672, "dur": 39, "args": { "External id": 179469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179469, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179469, "pid": 5, "tid": 7, "ts": 1716454224265672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265125, "dur": 12, "args": { "External id": 179469, "cbid": 211, "correlation": 179469 } }, { "ph": "s", "id": 179469, "pid": 76337, "tid": -914061504, "ts": 1716454224265125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224265712, "dur": 14, "args": { "External id": 179471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179471, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179471, "pid": 5, "tid": 7, "ts": 1716454224265712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265139, "dur": 6, "args": { "External id": 179471, "cbid": 211, "correlation": 179471 } }, { "ph": "s", "id": 179471, "pid": 76337, "tid": -914061504, "ts": 1716454224265139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224265728, "dur": 3, "args": { "External id": 179473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179473, "pid": 5, "tid": 7, "ts": 1716454224265728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265149, "dur": 6, "args": { "External id": 179473, "cbid": 211, "correlation": 179473 } }, { "ph": "s", "id": 179473, "pid": 76337, "tid": -914061504, "ts": 1716454224265149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224265159, "dur": 0, "args": { "External id": 179474, "cbid": 51, "correlation": 179474 } }, { "ph": "s", "id": 179474, "pid": 76337, "tid": -914061504, "ts": 1716454224265159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224265733, "dur": 700, "args": { "External id": 179475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179475, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179475, "pid": 5, "tid": 7, "ts": 1716454224265733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265160, "dur": 5, "args": { "External id": 179475, "cbid": 211, "correlation": 179475 } }, { "ph": "s", "id": 179475, "pid": 76337, "tid": -914061504, "ts": 1716454224265160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224266435, "dur": 60, "args": { "External id": 179480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179480, "pid": 5, "tid": 7, "ts": 1716454224266435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265188, "dur": 8, "args": { "External id": 179480, "cbid": 211, "correlation": 179480 } }, { "ph": "s", "id": 179480, "pid": 76337, "tid": -914061504, "ts": 1716454224265188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224266496, "dur": 50, "args": { "External id": 179488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179488, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179488, "pid": 5, "tid": 7, "ts": 1716454224266496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265221, "dur": 8, "args": { "External id": 179488, "cbid": 211, "correlation": 179488 } }, { "ph": "s", "id": 179488, "pid": 76337, "tid": -914061504, "ts": 1716454224265221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224266547, "dur": 35, "args": { "External id": 179496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179496, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179496, "pid": 5, "tid": 7, "ts": 1716454224266547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265252, "dur": 10, "args": { "External id": 179496, "cbid": 211, "correlation": 179496 } }, { "ph": "s", "id": 179496, "pid": 76337, "tid": -914061504, "ts": 1716454224265252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224266583, "dur": 53, "args": { "External id": 179516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179516, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 179516, "pid": 5, "tid": 7, "ts": 1716454224266583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265346, "dur": 13, "args": { "External id": 179516, "cbid": 211, "correlation": 179516 } }, { "ph": "s", "id": 179516, "pid": 76337, "tid": -914061504, "ts": 1716454224265346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224266638, "dur": 4, "args": { "External id": 179528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179528, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179528, "pid": 5, "tid": 7, "ts": 1716454224266638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265370, "dur": 6, "args": { "External id": 179528, "cbid": 211, "correlation": 179528 } }, { "ph": "s", "id": 179528, "pid": 76337, "tid": -914061504, "ts": 1716454224265370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224266643, "dur": 56, "args": { "External id": 179531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179531, "pid": 5, "tid": 7, "ts": 1716454224266643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265387, "dur": 6, "args": { "External id": 179531, "cbid": 211, "correlation": 179531 } }, { "ph": "s", "id": 179531, "pid": 76337, "tid": -914061504, "ts": 1716454224265387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224265449, "dur": 0, "args": { "External id": 179542, "cbid": 317, "correlation": 179542 } }, { "ph": "f", "id": 179542, "pid": 76337, "tid": -914061504, "ts": 1716454224265449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224265450, "dur": 0, "args": { "External id": 179543, "cbid": 203, "correlation": 179543 } }, { "ph": "f", "id": 179543, "pid": 76337, "tid": -914061504, "ts": 1716454224265450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224265451, "dur": 0, "args": { "External id": 179544, "cbid": 205, "correlation": 179544 } }, { "ph": "f", "id": 179544, "pid": 76337, "tid": -914061504, "ts": 1716454224265451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265491, "dur": 3, "args": { "External id": 179548, "cbid": 251, "correlation": 179548 } }, { "ph": "f", "id": 179548, "pid": 76337, "tid": -914061504, "ts": 1716454224265491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265495, "dur": 1, "args": { "External id": 179549, "cbid": 251, "correlation": 179549 } }, { "ph": "f", "id": 179549, "pid": 76337, "tid": -914061504, "ts": 1716454224265495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265497, "dur": 1, "args": { "External id": 179550, "cbid": 251, "correlation": 179550 } }, { "ph": "f", "id": 179550, "pid": 76337, "tid": -914061504, "ts": 1716454224265497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265499, "dur": 1, "args": { "External id": 179551, "cbid": 251, "correlation": 179551 } }, { "ph": "f", "id": 179551, "pid": 76337, "tid": -914061504, "ts": 1716454224265499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265501, "dur": 1, "args": { "External id": 179552, "cbid": 251, "correlation": 179552 } }, { "ph": "f", "id": 179552, "pid": 76337, "tid": -914061504, "ts": 1716454224265501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265503, "dur": 1, "args": { "External id": 179553, "cbid": 251, "correlation": 179553 } }, { "ph": "f", "id": 179553, "pid": 76337, "tid": -914061504, "ts": 1716454224265503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265505, "dur": 1, "args": { "External id": 179554, "cbid": 251, "correlation": 179554 } }, { "ph": "f", "id": 179554, "pid": 76337, "tid": -914061504, "ts": 1716454224265505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265507, "dur": 1, "args": { "External id": 179555, "cbid": 251, "correlation": 179555 } }, { "ph": "f", "id": 179555, "pid": 76337, "tid": -914061504, "ts": 1716454224265507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265510, "dur": 0, "args": { "External id": 179556, "cbid": 251, "correlation": 179556 } }, { "ph": "f", "id": 179556, "pid": 76337, "tid": -914061504, "ts": 1716454224265510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224266701, "dur": 114, "args": { "External id": 179557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179557, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 179557, "pid": 5, "tid": 7, "ts": 1716454224266701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265514, "dur": 15, "args": { "External id": 179557, "cbid": 211, "correlation": 179557 } }, { "ph": "s", "id": 179557, "pid": 76337, "tid": -914061504, "ts": 1716454224265514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224266816, "dur": 60, "args": { "External id": 179563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179563, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179563, "pid": 5, "tid": 7, "ts": 1716454224266816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265552, "dur": 9, "args": { "External id": 179563, "cbid": 211, "correlation": 179563 } }, { "ph": "s", "id": 179563, "pid": 76337, "tid": -914061504, "ts": 1716454224265552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224266877, "dur": 575, "args": { "External id": 179572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179572, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179572, "pid": 5, "tid": 7, "ts": 1716454224266877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265658, "dur": 16, "args": { "External id": 179572, "cbid": 211, "correlation": 179572 } }, { "ph": "s", "id": 179572, "pid": 76337, "tid": -914061504, "ts": 1716454224265658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224267453, "dur": 183, "args": { "External id": 179594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179594, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179594, "pid": 5, "tid": 7, "ts": 1716454224267453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265735, "dur": 12, "args": { "External id": 179594, "cbid": 211, "correlation": 179594 } }, { "ph": "s", "id": 179594, "pid": 76337, "tid": -914061504, "ts": 1716454224265735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265878, "dur": 2, "args": { "External id": 179605, "cbid": 251, "correlation": 179605 } }, { "ph": "f", "id": 179605, "pid": 76337, "tid": -914061504, "ts": 1716454224265878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224267637, "dur": 196, "args": { "External id": 179606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179606, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179606, "pid": 5, "tid": 7, "ts": 1716454224267637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265885, "dur": 15, "args": { "External id": 179606, "cbid": 211, "correlation": 179606 } }, { "ph": "s", "id": 179606, "pid": 76337, "tid": -914061504, "ts": 1716454224265885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224265960, "dur": 1, "args": { "External id": 179617, "cbid": 251, "correlation": 179617 } }, { "ph": "f", "id": 179617, "pid": 76337, "tid": -914061504, "ts": 1716454224265960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224267835, "dur": 190, "args": { "External id": 179618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179618, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179618, "pid": 5, "tid": 7, "ts": 1716454224267835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224265964, "dur": 19, "args": { "External id": 179618, "cbid": 211, "correlation": 179618 } }, { "ph": "s", "id": 179618, "pid": 76337, "tid": -914061504, "ts": 1716454224265964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266039, "dur": 1, "args": { "External id": 179629, "cbid": 251, "correlation": 179629 } }, { "ph": "f", "id": 179629, "pid": 76337, "tid": -914061504, "ts": 1716454224266039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224268026, "dur": 186, "args": { "External id": 179630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179630, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179630, "pid": 5, "tid": 7, "ts": 1716454224268026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266043, "dur": 13, "args": { "External id": 179630, "cbid": 211, "correlation": 179630 } }, { "ph": "s", "id": 179630, "pid": 76337, "tid": -914061504, "ts": 1716454224266043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224268213, "dur": 18802, "args": { "External id": 179651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179651, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 179651, "pid": 5, "tid": 7, "ts": 1716454224268213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266166, "dur": 15, "args": { "External id": 179651, "cbid": 211, "correlation": 179651 } }, { "ph": "s", "id": 179651, "pid": 76337, "tid": -914061504, "ts": 1716454224266166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266306, "dur": 2, "args": { "External id": 179669, "cbid": 251, "correlation": 179669 } }, { "ph": "f", "id": 179669, "pid": 76337, "tid": -914061504, "ts": 1716454224266306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224287017, "dur": 204, "args": { "External id": 179671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179671, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179671, "pid": 5, "tid": 7, "ts": 1716454224287017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266313, "dur": 13, "args": { "External id": 179671, "cbid": 211, "correlation": 179671 } }, { "ph": "s", "id": 179671, "pid": 76337, "tid": -914061504, "ts": 1716454224266313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224287222, "dur": 66, "args": { "External id": 179679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179679, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179679, "pid": 5, "tid": 7, "ts": 1716454224287222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266387, "dur": 13, "args": { "External id": 179679, "cbid": 211, "correlation": 179679 } }, { "ph": "s", "id": 179679, "pid": 76337, "tid": -914061504, "ts": 1716454224266387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224287289, "dur": 96, "args": { "External id": 179687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179687, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179687, "pid": 5, "tid": 7, "ts": 1716454224287289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266428, "dur": 9, "args": { "External id": 179687, "cbid": 211, "correlation": 179687 } }, { "ph": "s", "id": 179687, "pid": 76337, "tid": -914061504, "ts": 1716454224266428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224287387, "dur": 55, "args": { "External id": 179698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179698, "pid": 5, "tid": 7, "ts": 1716454224287387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266520, "dur": 16, "args": { "External id": 179698, "cbid": 211, "correlation": 179698 } }, { "ph": "s", "id": 179698, "pid": 76337, "tid": -914061504, "ts": 1716454224266520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224287443, "dur": 93, "args": { "External id": 179720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179720, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179720, "pid": 5, "tid": 7, "ts": 1716454224287443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266556, "dur": 7, "args": { "External id": 179720, "cbid": 211, "correlation": 179720 } }, { "ph": "s", "id": 179720, "pid": 76337, "tid": -914061504, "ts": 1716454224266556, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266640, "dur": 1, "args": { "External id": 179731, "cbid": 251, "correlation": 179731 } }, { "ph": "f", "id": 179731, "pid": 76337, "tid": -914061504, "ts": 1716454224266640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224287537, "dur": 107, "args": { "External id": 179732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179732, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179732, "pid": 5, "tid": 7, "ts": 1716454224287537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266646, "dur": 13, "args": { "External id": 179732, "cbid": 211, "correlation": 179732 } }, { "ph": "s", "id": 179732, "pid": 76337, "tid": -914061504, "ts": 1716454224266646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266741, "dur": 2, "args": { "External id": 179743, "cbid": 251, "correlation": 179743 } }, { "ph": "f", "id": 179743, "pid": 76337, "tid": -914061504, "ts": 1716454224266741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266747, "dur": 0, "args": { "External id": 179744, "cbid": 251, "correlation": 179744 } }, { "ph": "f", "id": 179744, "pid": 76337, "tid": -914061504, "ts": 1716454224266747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224287646, "dur": 11, "args": { "External id": 179745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179745, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179745, "pid": 5, "tid": 7, "ts": 1716454224287646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266749, "dur": 15, "args": { "External id": 179745, "cbid": 211, "correlation": 179745 } }, { "ph": "s", "id": 179745, "pid": 76337, "tid": -914061504, "ts": 1716454224266749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224287658, "dur": 5, "args": { "External id": 179747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179747, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 179747, "pid": 5, "tid": 7, "ts": 1716454224287658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266768, "dur": 7, "args": { "External id": 179747, "cbid": 211, "correlation": 179747 } }, { "ph": "s", "id": 179747, "pid": 76337, "tid": -914061504, "ts": 1716454224266768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266831, "dur": 1, "args": { "External id": 179758, "cbid": 251, "correlation": 179758 } }, { "ph": "f", "id": 179758, "pid": 76337, "tid": -914061504, "ts": 1716454224266831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224266835, "dur": 0, "args": { "External id": 179759, "cbid": 251, "correlation": 179759 } }, { "ph": "f", "id": 179759, "pid": 76337, "tid": -914061504, "ts": 1716454224266835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224287664, "dur": 6, "args": { "External id": 179760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179760, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 179760, "pid": 5, "tid": 7, "ts": 1716454224287664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266836, "dur": 11, "args": { "External id": 179760, "cbid": 211, "correlation": 179760 } }, { "ph": "s", "id": 179760, "pid": 76337, "tid": -914061504, "ts": 1716454224266836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224287672, "dur": 3, "args": { "External id": 179762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179762, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 179762, "pid": 5, "tid": 7, "ts": 1716454224287672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266849, "dur": 6, "args": { "External id": 179762, "cbid": 211, "correlation": 179762 } }, { "ph": "s", "id": 179762, "pid": 76337, "tid": -914061504, "ts": 1716454224266849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224287677, "dur": 157, "args": { "External id": 179783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179783, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 179783, "pid": 5, "tid": 7, "ts": 1716454224287677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224266923, "dur": 12, "args": { "External id": 179783, "cbid": 211, "correlation": 179783 } }, { "ph": "s", "id": 179783, "pid": 76337, "tid": -914061504, "ts": 1716454224266923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267029, "dur": 2, "args": { "External id": 179801, "cbid": 251, "correlation": 179801 } }, { "ph": "f", "id": 179801, "pid": 76337, "tid": -914061504, "ts": 1716454224267029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224287835, "dur": 107, "args": { "External id": 179803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179803, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 179803, "pid": 5, "tid": 7, "ts": 1716454224287835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267035, "dur": 14, "args": { "External id": 179803, "cbid": 211, "correlation": 179803 } }, { "ph": "s", "id": 179803, "pid": 76337, "tid": -914061504, "ts": 1716454224267035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224287943, "dur": 34, "args": { "External id": 179811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179811, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179811, "pid": 5, "tid": 7, "ts": 1716454224287943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267108, "dur": 12, "args": { "External id": 179811, "cbid": 211, "correlation": 179811 } }, { "ph": "s", "id": 179811, "pid": 76337, "tid": -914061504, "ts": 1716454224267108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224287978, "dur": 68, "args": { "External id": 179819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179819, "pid": 5, "tid": 7, "ts": 1716454224287978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267148, "dur": 10, "args": { "External id": 179819, "cbid": 211, "correlation": 179819 } }, { "ph": "s", "id": 179819, "pid": 76337, "tid": -914061504, "ts": 1716454224267148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224288048, "dur": 92, "args": { "External id": 179841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179841, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179841, "pid": 5, "tid": 7, "ts": 1716454224288048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267200, "dur": 11, "args": { "External id": 179841, "cbid": 211, "correlation": 179841 } }, { "ph": "s", "id": 179841, "pid": 76337, "tid": -914061504, "ts": 1716454224267200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267301, "dur": 1, "args": { "External id": 179857, "cbid": 251, "correlation": 179857 } }, { "ph": "f", "id": 179857, "pid": 76337, "tid": -914061504, "ts": 1716454224267301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224288142, "dur": 578, "args": { "External id": 179859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179859, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 179859, "pid": 5, "tid": 7, "ts": 1716454224288142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267307, "dur": 14, "args": { "External id": 179859, "cbid": 211, "correlation": 179859 } }, { "ph": "s", "id": 179859, "pid": 76337, "tid": -914061504, "ts": 1716454224267307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224288721, "dur": 246, "args": { "External id": 179867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179867, "pid": 5, "tid": 7, "ts": 1716454224288721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267392, "dur": 15, "args": { "External id": 179867, "cbid": 211, "correlation": 179867 } }, { "ph": "s", "id": 179867, "pid": 76337, "tid": -914061504, "ts": 1716454224267392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224288968, "dur": 252, "args": { "External id": 179875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179875, "pid": 5, "tid": 7, "ts": 1716454224288968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267428, "dur": 9, "args": { "External id": 179875, "cbid": 211, "correlation": 179875 } }, { "ph": "s", "id": 179875, "pid": 76337, "tid": -914061504, "ts": 1716454224267428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267516, "dur": 2, "args": { "External id": 179891, "cbid": 251, "correlation": 179891 } }, { "ph": "f", "id": 179891, "pid": 76337, "tid": -914061504, "ts": 1716454224267516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267522, "dur": 0, "args": { "External id": 179893, "cbid": 251, "correlation": 179893 } }, { "ph": "f", "id": 179893, "pid": 76337, "tid": -914061504, "ts": 1716454224267522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224289222, "dur": 360, "args": { "External id": 179894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179894, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 179894, "pid": 5, "tid": 7, "ts": 1716454224289222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267527, "dur": 14, "args": { "External id": 179894, "cbid": 211, "correlation": 179894 } }, { "ph": "s", "id": 179894, "pid": 76337, "tid": -914061504, "ts": 1716454224267527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224289583, "dur": 50, "args": { "External id": 179902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179902, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179902, "pid": 5, "tid": 7, "ts": 1716454224289583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267571, "dur": 10, "args": { "External id": 179902, "cbid": 211, "correlation": 179902 } }, { "ph": "s", "id": 179902, "pid": 76337, "tid": -914061504, "ts": 1716454224267571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224289635, "dur": 159, "args": { "External id": 179913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179913, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179913, "pid": 5, "tid": 7, "ts": 1716454224289635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267642, "dur": 12, "args": { "External id": 179913, "cbid": 211, "correlation": 179913 } }, { "ph": "s", "id": 179913, "pid": 76337, "tid": -914061504, "ts": 1716454224267642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224267712, "dur": 0, "args": { "External id": 179925, "cbid": 317, "correlation": 179925 } }, { "ph": "f", "id": 179925, "pid": 76337, "tid": -914061504, "ts": 1716454224267712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224267713, "dur": 0, "args": { "External id": 179926, "cbid": 203, "correlation": 179926 } }, { "ph": "f", "id": 179926, "pid": 76337, "tid": -914061504, "ts": 1716454224267713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224267714, "dur": 0, "args": { "External id": 179927, "cbid": 205, "correlation": 179927 } }, { "ph": "f", "id": 179927, "pid": 76337, "tid": -914061504, "ts": 1716454224267714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267737, "dur": 1, "args": { "External id": 179931, "cbid": 251, "correlation": 179931 } }, { "ph": "f", "id": 179931, "pid": 76337, "tid": -914061504, "ts": 1716454224267737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267739, "dur": 0, "args": { "External id": 179932, "cbid": 251, "correlation": 179932 } }, { "ph": "f", "id": 179932, "pid": 76337, "tid": -914061504, "ts": 1716454224267739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267740, "dur": 0, "args": { "External id": 179933, "cbid": 251, "correlation": 179933 } }, { "ph": "f", "id": 179933, "pid": 76337, "tid": -914061504, "ts": 1716454224267740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267740, "dur": 0, "args": { "External id": 179934, "cbid": 251, "correlation": 179934 } }, { "ph": "f", "id": 179934, "pid": 76337, "tid": -914061504, "ts": 1716454224267740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267741, "dur": 0, "args": { "External id": 179935, "cbid": 251, "correlation": 179935 } }, { "ph": "f", "id": 179935, "pid": 76337, "tid": -914061504, "ts": 1716454224267741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267742, "dur": 0, "args": { "External id": 179936, "cbid": 251, "correlation": 179936 } }, { "ph": "f", "id": 179936, "pid": 76337, "tid": -914061504, "ts": 1716454224267742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267743, "dur": 0, "args": { "External id": 179937, "cbid": 251, "correlation": 179937 } }, { "ph": "f", "id": 179937, "pid": 76337, "tid": -914061504, "ts": 1716454224267743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267744, "dur": 0, "args": { "External id": 179938, "cbid": 251, "correlation": 179938 } }, { "ph": "f", "id": 179938, "pid": 76337, "tid": -914061504, "ts": 1716454224267744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224267745, "dur": 0, "args": { "External id": 179939, "cbid": 251, "correlation": 179939 } }, { "ph": "f", "id": 179939, "pid": 76337, "tid": -914061504, "ts": 1716454224267745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224289794, "dur": 114, "args": { "External id": 179940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179940, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 179940, "pid": 5, "tid": 7, "ts": 1716454224289794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267747, "dur": 13, "args": { "External id": 179940, "cbid": 211, "correlation": 179940 } }, { "ph": "s", "id": 179940, "pid": 76337, "tid": -914061504, "ts": 1716454224267747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224289910, "dur": 60, "args": { "External id": 179946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179946, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179946, "pid": 5, "tid": 7, "ts": 1716454224289910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267783, "dur": 9, "args": { "External id": 179946, "cbid": 211, "correlation": 179946 } }, { "ph": "s", "id": 179946, "pid": 76337, "tid": -914061504, "ts": 1716454224267783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224289971, "dur": 50, "args": { "External id": 179954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179954, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179954, "pid": 5, "tid": 7, "ts": 1716454224289971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267817, "dur": 8, "args": { "External id": 179954, "cbid": 211, "correlation": 179954 } }, { "ph": "s", "id": 179954, "pid": 76337, "tid": -914061504, "ts": 1716454224267817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224290023, "dur": 52, "args": { "External id": 179974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179974, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 179974, "pid": 5, "tid": 7, "ts": 1716454224290023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267890, "dur": 12, "args": { "External id": 179974, "cbid": 211, "correlation": 179974 } }, { "ph": "s", "id": 179974, "pid": 76337, "tid": -914061504, "ts": 1716454224267890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224290076, "dur": 4, "args": { "External id": 179986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179986, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 179986, "pid": 5, "tid": 7, "ts": 1716454224290076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267911, "dur": 7, "args": { "External id": 179986, "cbid": 211, "correlation": 179986 } }, { "ph": "s", "id": 179986, "pid": 76337, "tid": -914061504, "ts": 1716454224267911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224290082, "dur": 55, "args": { "External id": 179989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179989, "pid": 5, "tid": 7, "ts": 1716454224290082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267930, "dur": 7, "args": { "External id": 179989, "cbid": 211, "correlation": 179989 } }, { "ph": "s", "id": 179989, "pid": 76337, "tid": -914061504, "ts": 1716454224267930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224290139, "dur": 37, "args": { "External id": 179998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 179998, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 179998, "pid": 5, "tid": 7, "ts": 1716454224290139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224267969, "dur": 18, "args": { "External id": 179998, "cbid": 211, "correlation": 179998 } }, { "ph": "s", "id": 179998, "pid": 76337, "tid": -914061504, "ts": 1716454224267969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224268031, "dur": 0, "args": { "External id": 180008, "cbid": 317, "correlation": 180008 } }, { "ph": "f", "id": 180008, "pid": 76337, "tid": -914061504, "ts": 1716454224268031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224268031, "dur": 0, "args": { "External id": 180009, "cbid": 203, "correlation": 180009 } }, { "ph": "f", "id": 180009, "pid": 76337, "tid": -914061504, "ts": 1716454224268031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224268032, "dur": 0, "args": { "External id": 180010, "cbid": 205, "correlation": 180010 } }, { "ph": "f", "id": 180010, "pid": 76337, "tid": -914061504, "ts": 1716454224268032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224290177, "dur": 40, "args": { "External id": 180014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180014, "pid": 5, "tid": 7, "ts": 1716454224290177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268046, "dur": 11, "args": { "External id": 180014, "cbid": 211, "correlation": 180014 } }, { "ph": "s", "id": 180014, "pid": 76337, "tid": -914061504, "ts": 1716454224268046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224290218, "dur": 14, "args": { "External id": 180016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180016, "pid": 5, "tid": 7, "ts": 1716454224290218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268060, "dur": 5, "args": { "External id": 180016, "cbid": 211, "correlation": 180016 } }, { "ph": "s", "id": 180016, "pid": 76337, "tid": -914061504, "ts": 1716454224268060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224290234, "dur": 4, "args": { "External id": 180018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180018, "pid": 5, "tid": 7, "ts": 1716454224290234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268070, "dur": 6, "args": { "External id": 180018, "cbid": 211, "correlation": 180018 } }, { "ph": "s", "id": 180018, "pid": 76337, "tid": -914061504, "ts": 1716454224268070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224268079, "dur": 0, "args": { "External id": 180019, "cbid": 51, "correlation": 180019 } }, { "ph": "s", "id": 180019, "pid": 76337, "tid": -914061504, "ts": 1716454224268079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224290239, "dur": 706, "args": { "External id": 180020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180020, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180020, "pid": 5, "tid": 7, "ts": 1716454224290239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268080, "dur": 5, "args": { "External id": 180020, "cbid": 211, "correlation": 180020 } }, { "ph": "s", "id": 180020, "pid": 76337, "tid": -914061504, "ts": 1716454224268080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224290947, "dur": 60, "args": { "External id": 180025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180025, "pid": 5, "tid": 7, "ts": 1716454224290947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268107, "dur": 9, "args": { "External id": 180025, "cbid": 211, "correlation": 180025 } }, { "ph": "s", "id": 180025, "pid": 76337, "tid": -914061504, "ts": 1716454224268107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224291008, "dur": 3, "args": { "External id": 180033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180033, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180033, "pid": 5, "tid": 7, "ts": 1716454224291008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268150, "dur": 9, "args": { "External id": 180033, "cbid": 211, "correlation": 180033 } }, { "ph": "s", "id": 180033, "pid": 76337, "tid": -914061504, "ts": 1716454224268150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268215, "dur": 2, "args": { "External id": 180049, "cbid": 251, "correlation": 180049 } }, { "ph": "f", "id": 180049, "pid": 76337, "tid": -914061504, "ts": 1716454224268215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268221, "dur": 0, "args": { "External id": 180051, "cbid": 251, "correlation": 180051 } }, { "ph": "f", "id": 180051, "pid": 76337, "tid": -914061504, "ts": 1716454224268221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224291012, "dur": 11, "args": { "External id": 180052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180052, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 180052, "pid": 5, "tid": 7, "ts": 1716454224291012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268223, "dur": 11, "args": { "External id": 180052, "cbid": 211, "correlation": 180052 } }, { "ph": "s", "id": 180052, "pid": 76337, "tid": -914061504, "ts": 1716454224268223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224291025, "dur": 5, "args": { "External id": 180054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180054, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 180054, "pid": 5, "tid": 7, "ts": 1716454224291025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268236, "dur": 6, "args": { "External id": 180054, "cbid": 211, "correlation": 180054 } }, { "ph": "s", "id": 180054, "pid": 76337, "tid": -914061504, "ts": 1716454224268236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224291031, "dur": 53, "args": { "External id": 180064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180064, "pid": 5, "tid": 7, "ts": 1716454224291031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268293, "dur": 12, "args": { "External id": 180064, "cbid": 211, "correlation": 180064 } }, { "ph": "s", "id": 180064, "pid": 76337, "tid": -914061504, "ts": 1716454224268293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224291086, "dur": 52, "args": { "External id": 180084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180084, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 180084, "pid": 5, "tid": 7, "ts": 1716454224291086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268360, "dur": 11, "args": { "External id": 180084, "cbid": 211, "correlation": 180084 } }, { "ph": "s", "id": 180084, "pid": 76337, "tid": -914061504, "ts": 1716454224268360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224291139, "dur": 4, "args": { "External id": 180096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180096, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180096, "pid": 5, "tid": 7, "ts": 1716454224291139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268380, "dur": 6, "args": { "External id": 180096, "cbid": 211, "correlation": 180096 } }, { "ph": "s", "id": 180096, "pid": 76337, "tid": -914061504, "ts": 1716454224268380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224291144, "dur": 54, "args": { "External id": 180099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180099, "pid": 5, "tid": 7, "ts": 1716454224291144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268399, "dur": 6, "args": { "External id": 180099, "cbid": 211, "correlation": 180099 } }, { "ph": "s", "id": 180099, "pid": 76337, "tid": -914061504, "ts": 1716454224268399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224291200, "dur": 37, "args": { "External id": 180108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180108, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180108, "pid": 5, "tid": 7, "ts": 1716454224291200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268439, "dur": 10, "args": { "External id": 180108, "cbid": 211, "correlation": 180108 } }, { "ph": "s", "id": 180108, "pid": 76337, "tid": -914061504, "ts": 1716454224268439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224268503, "dur": 0, "args": { "External id": 180118, "cbid": 317, "correlation": 180118 } }, { "ph": "f", "id": 180118, "pid": 76337, "tid": -914061504, "ts": 1716454224268503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224268503, "dur": 0, "args": { "External id": 180119, "cbid": 203, "correlation": 180119 } }, { "ph": "f", "id": 180119, "pid": 76337, "tid": -914061504, "ts": 1716454224268503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224268504, "dur": 0, "args": { "External id": 180120, "cbid": 205, "correlation": 180120 } }, { "ph": "f", "id": 180120, "pid": 76337, "tid": -914061504, "ts": 1716454224268504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224291238, "dur": 39, "args": { "External id": 180124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180124, "pid": 5, "tid": 7, "ts": 1716454224291238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268518, "dur": 13, "args": { "External id": 180124, "cbid": 211, "correlation": 180124 } }, { "ph": "s", "id": 180124, "pid": 76337, "tid": -914061504, "ts": 1716454224268518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224291278, "dur": 14, "args": { "External id": 180126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180126, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180126, "pid": 5, "tid": 7, "ts": 1716454224291278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268533, "dur": 5, "args": { "External id": 180126, "cbid": 211, "correlation": 180126 } }, { "ph": "s", "id": 180126, "pid": 76337, "tid": -914061504, "ts": 1716454224268533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224291294, "dur": 3, "args": { "External id": 180128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180128, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180128, "pid": 5, "tid": 7, "ts": 1716454224291294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268542, "dur": 5, "args": { "External id": 180128, "cbid": 211, "correlation": 180128 } }, { "ph": "s", "id": 180128, "pid": 76337, "tid": -914061504, "ts": 1716454224268542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224268550, "dur": 0, "args": { "External id": 180129, "cbid": 51, "correlation": 180129 } }, { "ph": "s", "id": 180129, "pid": 76337, "tid": -914061504, "ts": 1716454224268550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224291298, "dur": 697, "args": { "External id": 180130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180130, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180130, "pid": 5, "tid": 7, "ts": 1716454224291298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268551, "dur": 5, "args": { "External id": 180130, "cbid": 211, "correlation": 180130 } }, { "ph": "s", "id": 180130, "pid": 76337, "tid": -914061504, "ts": 1716454224268551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224291997, "dur": 59, "args": { "External id": 180135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180135, "pid": 5, "tid": 7, "ts": 1716454224291997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268578, "dur": 10, "args": { "External id": 180135, "cbid": 211, "correlation": 180135 } }, { "ph": "s", "id": 180135, "pid": 76337, "tid": -914061504, "ts": 1716454224268578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224292057, "dur": 50, "args": { "External id": 180143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180143, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180143, "pid": 5, "tid": 7, "ts": 1716454224292057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268611, "dur": 9, "args": { "External id": 180143, "cbid": 211, "correlation": 180143 } }, { "ph": "s", "id": 180143, "pid": 76337, "tid": -914061504, "ts": 1716454224268611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224292108, "dur": 35, "args": { "External id": 180151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180151, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180151, "pid": 5, "tid": 7, "ts": 1716454224292108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268641, "dur": 9, "args": { "External id": 180151, "cbid": 211, "correlation": 180151 } }, { "ph": "s", "id": 180151, "pid": 76337, "tid": -914061504, "ts": 1716454224268641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224292144, "dur": 51, "args": { "External id": 180171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180171, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 180171, "pid": 5, "tid": 7, "ts": 1716454224292144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268725, "dur": 13, "args": { "External id": 180171, "cbid": 211, "correlation": 180171 } }, { "ph": "s", "id": 180171, "pid": 76337, "tid": -914061504, "ts": 1716454224268725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224292197, "dur": 4, "args": { "External id": 180183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180183, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180183, "pid": 5, "tid": 7, "ts": 1716454224292197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268747, "dur": 6, "args": { "External id": 180183, "cbid": 211, "correlation": 180183 } }, { "ph": "s", "id": 180183, "pid": 76337, "tid": -914061504, "ts": 1716454224268747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224292202, "dur": 55, "args": { "External id": 180186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180186, "pid": 5, "tid": 7, "ts": 1716454224292202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268764, "dur": 6, "args": { "External id": 180186, "cbid": 211, "correlation": 180186 } }, { "ph": "s", "id": 180186, "pid": 76337, "tid": -914061504, "ts": 1716454224268764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224268821, "dur": 0, "args": { "External id": 180197, "cbid": 317, "correlation": 180197 } }, { "ph": "f", "id": 180197, "pid": 76337, "tid": -914061504, "ts": 1716454224268821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224268821, "dur": 0, "args": { "External id": 180198, "cbid": 203, "correlation": 180198 } }, { "ph": "f", "id": 180198, "pid": 76337, "tid": -914061504, "ts": 1716454224268821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224268822, "dur": 0, "args": { "External id": 180199, "cbid": 205, "correlation": 180199 } }, { "ph": "f", "id": 180199, "pid": 76337, "tid": -914061504, "ts": 1716454224268822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268844, "dur": 1, "args": { "External id": 180203, "cbid": 251, "correlation": 180203 } }, { "ph": "f", "id": 180203, "pid": 76337, "tid": -914061504, "ts": 1716454224268844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268846, "dur": 0, "args": { "External id": 180204, "cbid": 251, "correlation": 180204 } }, { "ph": "f", "id": 180204, "pid": 76337, "tid": -914061504, "ts": 1716454224268846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268847, "dur": 0, "args": { "External id": 180205, "cbid": 251, "correlation": 180205 } }, { "ph": "f", "id": 180205, "pid": 76337, "tid": -914061504, "ts": 1716454224268847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268847, "dur": 0, "args": { "External id": 180206, "cbid": 251, "correlation": 180206 } }, { "ph": "f", "id": 180206, "pid": 76337, "tid": -914061504, "ts": 1716454224268847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268848, "dur": 0, "args": { "External id": 180207, "cbid": 251, "correlation": 180207 } }, { "ph": "f", "id": 180207, "pid": 76337, "tid": -914061504, "ts": 1716454224268848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268849, "dur": 0, "args": { "External id": 180208, "cbid": 251, "correlation": 180208 } }, { "ph": "f", "id": 180208, "pid": 76337, "tid": -914061504, "ts": 1716454224268849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268850, "dur": 0, "args": { "External id": 180209, "cbid": 251, "correlation": 180209 } }, { "ph": "f", "id": 180209, "pid": 76337, "tid": -914061504, "ts": 1716454224268850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268850, "dur": 0, "args": { "External id": 180210, "cbid": 251, "correlation": 180210 } }, { "ph": "f", "id": 180210, "pid": 76337, "tid": -914061504, "ts": 1716454224268850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224268852, "dur": 0, "args": { "External id": 180211, "cbid": 251, "correlation": 180211 } }, { "ph": "f", "id": 180211, "pid": 76337, "tid": -914061504, "ts": 1716454224268852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224292258, "dur": 111, "args": { "External id": 180212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180212, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180212, "pid": 5, "tid": 7, "ts": 1716454224292258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268854, "dur": 12, "args": { "External id": 180212, "cbid": 211, "correlation": 180212 } }, { "ph": "s", "id": 180212, "pid": 76337, "tid": -914061504, "ts": 1716454224268854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224292371, "dur": 60, "args": { "External id": 180218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180218, "pid": 5, "tid": 7, "ts": 1716454224292371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268889, "dur": 9, "args": { "External id": 180218, "cbid": 211, "correlation": 180218 } }, { "ph": "s", "id": 180218, "pid": 76337, "tid": -914061504, "ts": 1716454224268889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224292432, "dur": 600, "args": { "External id": 180227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180227, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180227, "pid": 5, "tid": 7, "ts": 1716454224292432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224268972, "dur": 25, "args": { "External id": 180227, "cbid": 211, "correlation": 180227 } }, { "ph": "s", "id": 180227, "pid": 76337, "tid": -914061504, "ts": 1716454224268972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224293033, "dur": 181, "args": { "External id": 180249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180249, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180249, "pid": 5, "tid": 7, "ts": 1716454224293033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269040, "dur": 10, "args": { "External id": 180249, "cbid": 211, "correlation": 180249 } }, { "ph": "s", "id": 180249, "pid": 76337, "tid": -914061504, "ts": 1716454224269040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269125, "dur": 1, "args": { "External id": 180260, "cbid": 251, "correlation": 180260 } }, { "ph": "f", "id": 180260, "pid": 76337, "tid": -914061504, "ts": 1716454224269125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224293216, "dur": 196, "args": { "External id": 180261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180261, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180261, "pid": 5, "tid": 7, "ts": 1716454224293216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269131, "dur": 13, "args": { "External id": 180261, "cbid": 211, "correlation": 180261 } }, { "ph": "s", "id": 180261, "pid": 76337, "tid": -914061504, "ts": 1716454224269131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269199, "dur": 1, "args": { "External id": 180272, "cbid": 251, "correlation": 180272 } }, { "ph": "f", "id": 180272, "pid": 76337, "tid": -914061504, "ts": 1716454224269199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224293414, "dur": 190, "args": { "External id": 180273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180273, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180273, "pid": 5, "tid": 7, "ts": 1716454224293414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269203, "dur": 11, "args": { "External id": 180273, "cbid": 211, "correlation": 180273 } }, { "ph": "s", "id": 180273, "pid": 76337, "tid": -914061504, "ts": 1716454224269203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269265, "dur": 1, "args": { "External id": 180284, "cbid": 251, "correlation": 180284 } }, { "ph": "f", "id": 180284, "pid": 76337, "tid": -914061504, "ts": 1716454224269265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224293605, "dur": 189, "args": { "External id": 180285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180285, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180285, "pid": 5, "tid": 7, "ts": 1716454224293605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269269, "dur": 12, "args": { "External id": 180285, "cbid": 211, "correlation": 180285 } }, { "ph": "s", "id": 180285, "pid": 76337, "tid": -914061504, "ts": 1716454224269269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224293796, "dur": 18777, "args": { "External id": 180306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180306, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180306, "pid": 5, "tid": 7, "ts": 1716454224293796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269349, "dur": 12, "args": { "External id": 180306, "cbid": 211, "correlation": 180306 } }, { "ph": "s", "id": 180306, "pid": 76337, "tid": -914061504, "ts": 1716454224269349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269447, "dur": 1, "args": { "External id": 180324, "cbid": 251, "correlation": 180324 } }, { "ph": "f", "id": 180324, "pid": 76337, "tid": -914061504, "ts": 1716454224269447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224312574, "dur": 203, "args": { "External id": 180326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180326, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180326, "pid": 5, "tid": 7, "ts": 1716454224312574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269452, "dur": 13, "args": { "External id": 180326, "cbid": 211, "correlation": 180326 } }, { "ph": "s", "id": 180326, "pid": 76337, "tid": -914061504, "ts": 1716454224269452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224312779, "dur": 67, "args": { "External id": 180334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180334, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180334, "pid": 5, "tid": 7, "ts": 1716454224312779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269522, "dur": 13, "args": { "External id": 180334, "cbid": 211, "correlation": 180334 } }, { "ph": "s", "id": 180334, "pid": 76337, "tid": -914061504, "ts": 1716454224269522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224312847, "dur": 96, "args": { "External id": 180342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180342, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180342, "pid": 5, "tid": 7, "ts": 1716454224312847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269561, "dur": 8, "args": { "External id": 180342, "cbid": 211, "correlation": 180342 } }, { "ph": "s", "id": 180342, "pid": 76337, "tid": -914061504, "ts": 1716454224269561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224312944, "dur": 54, "args": { "External id": 180353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180353, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180353, "pid": 5, "tid": 7, "ts": 1716454224312944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269631, "dur": 12, "args": { "External id": 180353, "cbid": 211, "correlation": 180353 } }, { "ph": "s", "id": 180353, "pid": 76337, "tid": -914061504, "ts": 1716454224269631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224313000, "dur": 92, "args": { "External id": 180375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180375, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180375, "pid": 5, "tid": 7, "ts": 1716454224313000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269662, "dur": 8, "args": { "External id": 180375, "cbid": 211, "correlation": 180375 } }, { "ph": "s", "id": 180375, "pid": 76337, "tid": -914061504, "ts": 1716454224269662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269744, "dur": 1, "args": { "External id": 180386, "cbid": 251, "correlation": 180386 } }, { "ph": "f", "id": 180386, "pid": 76337, "tid": -914061504, "ts": 1716454224269744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224313094, "dur": 104, "args": { "External id": 180387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180387, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180387, "pid": 5, "tid": 7, "ts": 1716454224313094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269749, "dur": 13, "args": { "External id": 180387, "cbid": 211, "correlation": 180387 } }, { "ph": "s", "id": 180387, "pid": 76337, "tid": -914061504, "ts": 1716454224269749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269820, "dur": 1, "args": { "External id": 180398, "cbid": 251, "correlation": 180398 } }, { "ph": "f", "id": 180398, "pid": 76337, "tid": -914061504, "ts": 1716454224269820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269824, "dur": 0, "args": { "External id": 180399, "cbid": 251, "correlation": 180399 } }, { "ph": "f", "id": 180399, "pid": 76337, "tid": -914061504, "ts": 1716454224269824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224313199, "dur": 10, "args": { "External id": 180400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180400, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 180400, "pid": 5, "tid": 7, "ts": 1716454224313199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269825, "dur": 13, "args": { "External id": 180400, "cbid": 211, "correlation": 180400 } }, { "ph": "s", "id": 180400, "pid": 76337, "tid": -914061504, "ts": 1716454224269825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224313210, "dur": 5, "args": { "External id": 180402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180402, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 180402, "pid": 5, "tid": 7, "ts": 1716454224313210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269840, "dur": 6, "args": { "External id": 180402, "cbid": 211, "correlation": 180402 } }, { "ph": "s", "id": 180402, "pid": 76337, "tid": -914061504, "ts": 1716454224269840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269899, "dur": 1, "args": { "External id": 180413, "cbid": 251, "correlation": 180413 } }, { "ph": "f", "id": 180413, "pid": 76337, "tid": -914061504, "ts": 1716454224269899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224269903, "dur": 0, "args": { "External id": 180414, "cbid": 251, "correlation": 180414 } }, { "ph": "f", "id": 180414, "pid": 76337, "tid": -914061504, "ts": 1716454224269903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224313217, "dur": 6, "args": { "External id": 180415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180415, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 180415, "pid": 5, "tid": 7, "ts": 1716454224313217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269904, "dur": 12, "args": { "External id": 180415, "cbid": 211, "correlation": 180415 } }, { "ph": "s", "id": 180415, "pid": 76337, "tid": -914061504, "ts": 1716454224269904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224313224, "dur": 3, "args": { "External id": 180417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180417, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 180417, "pid": 5, "tid": 7, "ts": 1716454224313224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269918, "dur": 5, "args": { "External id": 180417, "cbid": 211, "correlation": 180417 } }, { "ph": "s", "id": 180417, "pid": 76337, "tid": -914061504, "ts": 1716454224269918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224313229, "dur": 156, "args": { "External id": 180438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180438, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180438, "pid": 5, "tid": 7, "ts": 1716454224313229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224269999, "dur": 12, "args": { "External id": 180438, "cbid": 211, "correlation": 180438 } }, { "ph": "s", "id": 180438, "pid": 76337, "tid": -914061504, "ts": 1716454224269999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270097, "dur": 1, "args": { "External id": 180456, "cbid": 251, "correlation": 180456 } }, { "ph": "f", "id": 180456, "pid": 76337, "tid": -914061504, "ts": 1716454224270097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224313387, "dur": 107, "args": { "External id": 180458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180458, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180458, "pid": 5, "tid": 7, "ts": 1716454224313387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270103, "dur": 13, "args": { "External id": 180458, "cbid": 211, "correlation": 180458 } }, { "ph": "s", "id": 180458, "pid": 76337, "tid": -914061504, "ts": 1716454224270103, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224313495, "dur": 34, "args": { "External id": 180466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180466, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180466, "pid": 5, "tid": 7, "ts": 1716454224313495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270172, "dur": 12, "args": { "External id": 180466, "cbid": 211, "correlation": 180466 } }, { "ph": "s", "id": 180466, "pid": 76337, "tid": -914061504, "ts": 1716454224270172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224313531, "dur": 67, "args": { "External id": 180474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180474, "pid": 5, "tid": 7, "ts": 1716454224313531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270213, "dur": 9, "args": { "External id": 180474, "cbid": 211, "correlation": 180474 } }, { "ph": "s", "id": 180474, "pid": 76337, "tid": -914061504, "ts": 1716454224270213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224313600, "dur": 92, "args": { "External id": 180496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180496, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180496, "pid": 5, "tid": 7, "ts": 1716454224313600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270264, "dur": 10, "args": { "External id": 180496, "cbid": 211, "correlation": 180496 } }, { "ph": "s", "id": 180496, "pid": 76337, "tid": -914061504, "ts": 1716454224270264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270350, "dur": 1, "args": { "External id": 180512, "cbid": 251, "correlation": 180512 } }, { "ph": "f", "id": 180512, "pid": 76337, "tid": -914061504, "ts": 1716454224270350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224313694, "dur": 581, "args": { "External id": 180514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180514, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180514, "pid": 5, "tid": 7, "ts": 1716454224313694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270356, "dur": 12, "args": { "External id": 180514, "cbid": 211, "correlation": 180514 } }, { "ph": "s", "id": 180514, "pid": 76337, "tid": -914061504, "ts": 1716454224270356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224314276, "dur": 246, "args": { "External id": 180522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180522, "pid": 5, "tid": 7, "ts": 1716454224314276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270421, "dur": 12, "args": { "External id": 180522, "cbid": 211, "correlation": 180522 } }, { "ph": "s", "id": 180522, "pid": 76337, "tid": -914061504, "ts": 1716454224270421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224314523, "dur": 254, "args": { "External id": 180530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180530, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180530, "pid": 5, "tid": 7, "ts": 1716454224314523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270453, "dur": 8, "args": { "External id": 180530, "cbid": 211, "correlation": 180530 } }, { "ph": "s", "id": 180530, "pid": 76337, "tid": -914061504, "ts": 1716454224270453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270543, "dur": 1, "args": { "External id": 180546, "cbid": 251, "correlation": 180546 } }, { "ph": "f", "id": 180546, "pid": 76337, "tid": -914061504, "ts": 1716454224270543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270549, "dur": 0, "args": { "External id": 180548, "cbid": 251, "correlation": 180548 } }, { "ph": "f", "id": 180548, "pid": 76337, "tid": -914061504, "ts": 1716454224270549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224314779, "dur": 360, "args": { "External id": 180549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180549, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 180549, "pid": 5, "tid": 7, "ts": 1716454224314779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270551, "dur": 13, "args": { "External id": 180549, "cbid": 211, "correlation": 180549 } }, { "ph": "s", "id": 180549, "pid": 76337, "tid": -914061504, "ts": 1716454224270551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224315140, "dur": 50, "args": { "External id": 180557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180557, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180557, "pid": 5, "tid": 7, "ts": 1716454224315140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270594, "dur": 10, "args": { "External id": 180557, "cbid": 211, "correlation": 180557 } }, { "ph": "s", "id": 180557, "pid": 76337, "tid": -914061504, "ts": 1716454224270594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224315191, "dur": 159, "args": { "External id": 180568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180568, "pid": 5, "tid": 7, "ts": 1716454224315191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270660, "dur": 12, "args": { "External id": 180568, "cbid": 211, "correlation": 180568 } }, { "ph": "s", "id": 180568, "pid": 76337, "tid": -914061504, "ts": 1716454224270660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224270723, "dur": 0, "args": { "External id": 180580, "cbid": 317, "correlation": 180580 } }, { "ph": "f", "id": 180580, "pid": 76337, "tid": -914061504, "ts": 1716454224270723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224270724, "dur": 0, "args": { "External id": 180581, "cbid": 203, "correlation": 180581 } }, { "ph": "f", "id": 180581, "pid": 76337, "tid": -914061504, "ts": 1716454224270724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224270725, "dur": 0, "args": { "External id": 180582, "cbid": 205, "correlation": 180582 } }, { "ph": "f", "id": 180582, "pid": 76337, "tid": -914061504, "ts": 1716454224270725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270747, "dur": 1, "args": { "External id": 180586, "cbid": 251, "correlation": 180586 } }, { "ph": "f", "id": 180586, "pid": 76337, "tid": -914061504, "ts": 1716454224270747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270749, "dur": 0, "args": { "External id": 180587, "cbid": 251, "correlation": 180587 } }, { "ph": "f", "id": 180587, "pid": 76337, "tid": -914061504, "ts": 1716454224270749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270750, "dur": 0, "args": { "External id": 180588, "cbid": 251, "correlation": 180588 } }, { "ph": "f", "id": 180588, "pid": 76337, "tid": -914061504, "ts": 1716454224270750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270750, "dur": 0, "args": { "External id": 180589, "cbid": 251, "correlation": 180589 } }, { "ph": "f", "id": 180589, "pid": 76337, "tid": -914061504, "ts": 1716454224270750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270751, "dur": 0, "args": { "External id": 180590, "cbid": 251, "correlation": 180590 } }, { "ph": "f", "id": 180590, "pid": 76337, "tid": -914061504, "ts": 1716454224270751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270752, "dur": 0, "args": { "External id": 180591, "cbid": 251, "correlation": 180591 } }, { "ph": "f", "id": 180591, "pid": 76337, "tid": -914061504, "ts": 1716454224270752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270753, "dur": 0, "args": { "External id": 180592, "cbid": 251, "correlation": 180592 } }, { "ph": "f", "id": 180592, "pid": 76337, "tid": -914061504, "ts": 1716454224270753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270753, "dur": 0, "args": { "External id": 180593, "cbid": 251, "correlation": 180593 } }, { "ph": "f", "id": 180593, "pid": 76337, "tid": -914061504, "ts": 1716454224270753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224270755, "dur": 0, "args": { "External id": 180594, "cbid": 251, "correlation": 180594 } }, { "ph": "f", "id": 180594, "pid": 76337, "tid": -914061504, "ts": 1716454224270755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224315352, "dur": 116, "args": { "External id": 180595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180595, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180595, "pid": 5, "tid": 7, "ts": 1716454224315352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270757, "dur": 12, "args": { "External id": 180595, "cbid": 211, "correlation": 180595 } }, { "ph": "s", "id": 180595, "pid": 76337, "tid": -914061504, "ts": 1716454224270757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224315469, "dur": 60, "args": { "External id": 180601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180601, "pid": 5, "tid": 7, "ts": 1716454224315469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270791, "dur": 10, "args": { "External id": 180601, "cbid": 211, "correlation": 180601 } }, { "ph": "s", "id": 180601, "pid": 76337, "tid": -914061504, "ts": 1716454224270791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224315531, "dur": 49, "args": { "External id": 180609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180609, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180609, "pid": 5, "tid": 7, "ts": 1716454224315531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270824, "dur": 8, "args": { "External id": 180609, "cbid": 211, "correlation": 180609 } }, { "ph": "s", "id": 180609, "pid": 76337, "tid": -914061504, "ts": 1716454224270824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224270907, "dur": 0, "args": { "External id": 180619, "cbid": 317, "correlation": 180619 } }, { "ph": "f", "id": 180619, "pid": 76337, "tid": -914061504, "ts": 1716454224270907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224270907, "dur": 0, "args": { "External id": 180620, "cbid": 203, "correlation": 180620 } }, { "ph": "f", "id": 180620, "pid": 76337, "tid": -914061504, "ts": 1716454224270907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224270908, "dur": 0, "args": { "External id": 180621, "cbid": 205, "correlation": 180621 } }, { "ph": "f", "id": 180621, "pid": 76337, "tid": -914061504, "ts": 1716454224270908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224315581, "dur": 41, "args": { "External id": 180625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180625, "pid": 5, "tid": 7, "ts": 1716454224315581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270924, "dur": 12, "args": { "External id": 180625, "cbid": 211, "correlation": 180625 } }, { "ph": "s", "id": 180625, "pid": 76337, "tid": -914061504, "ts": 1716454224270924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224315624, "dur": 15, "args": { "External id": 180627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180627, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180627, "pid": 5, "tid": 7, "ts": 1716454224315624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270939, "dur": 6, "args": { "External id": 180627, "cbid": 211, "correlation": 180627 } }, { "ph": "s", "id": 180627, "pid": 76337, "tid": -914061504, "ts": 1716454224270939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224315641, "dur": 1, "args": { "External id": 180629, "device": 5, "context": 1, "stream": 7, "correlation": 180629, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 180629, "pid": 5, "tid": 7, "ts": 1716454224315641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224270959, "dur": 25, "args": { "External id": 180629, "cbid": 51, "correlation": 180629 } }, { "ph": "s", "id": 180629, "pid": 76337, "tid": -914061504, "ts": 1716454224270959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224315645, "dur": 364, "args": { "External id": 180630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180630, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180630, "pid": 5, "tid": 7, "ts": 1716454224315645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224270986, "dur": 11, "args": { "External id": 180630, "cbid": 211, "correlation": 180630 } }, { "ph": "s", "id": 180630, "pid": 76337, "tid": -914061504, "ts": 1716454224270986, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316010, "dur": 14, "args": { "External id": 180632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180632, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180632, "pid": 5, "tid": 7, "ts": 1716454224316010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271005, "dur": 7, "args": { "External id": 180632, "cbid": 211, "correlation": 180632 } }, { "ph": "s", "id": 180632, "pid": 76337, "tid": -914061504, "ts": 1716454224271005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224316025, "dur": 15, "args": { "External id": 180638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180638, "pid": 5, "tid": 7, "ts": 1716454224316025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271035, "dur": 9, "args": { "External id": 180638, "cbid": 211, "correlation": 180638 } }, { "ph": "s", "id": 180638, "pid": 76337, "tid": -914061504, "ts": 1716454224271035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224316042, "dur": 19, "args": { "External id": 180658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180658, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 180658, "pid": 5, "tid": 7, "ts": 1716454224316042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271134, "dur": 14, "args": { "External id": 180658, "cbid": 211, "correlation": 180658 } }, { "ph": "s", "id": 180658, "pid": 76337, "tid": -914061504, "ts": 1716454224271134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224316062, "dur": 4, "args": { "External id": 180670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180670, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180670, "pid": 5, "tid": 7, "ts": 1716454224316062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271158, "dur": 6, "args": { "External id": 180670, "cbid": 211, "correlation": 180670 } }, { "ph": "s", "id": 180670, "pid": 76337, "tid": -914061504, "ts": 1716454224271158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224316068, "dur": 18, "args": { "External id": 180673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180673, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180673, "pid": 5, "tid": 7, "ts": 1716454224316068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271175, "dur": 7, "args": { "External id": 180673, "cbid": 211, "correlation": 180673 } }, { "ph": "s", "id": 180673, "pid": 76337, "tid": -914061504, "ts": 1716454224271175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224316087, "dur": 11, "args": { "External id": 180682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180682, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180682, "pid": 5, "tid": 7, "ts": 1716454224316087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271216, "dur": 9, "args": { "External id": 180682, "cbid": 211, "correlation": 180682 } }, { "ph": "s", "id": 180682, "pid": 76337, "tid": -914061504, "ts": 1716454224271216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224271274, "dur": 0, "args": { "External id": 180692, "cbid": 317, "correlation": 180692 } }, { "ph": "f", "id": 180692, "pid": 76337, "tid": -914061504, "ts": 1716454224271274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224271274, "dur": 0, "args": { "External id": 180693, "cbid": 203, "correlation": 180693 } }, { "ph": "f", "id": 180693, "pid": 76337, "tid": -914061504, "ts": 1716454224271274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224271275, "dur": 0, "args": { "External id": 180694, "cbid": 205, "correlation": 180694 } }, { "ph": "f", "id": 180694, "pid": 76337, "tid": -914061504, "ts": 1716454224271275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316100, "dur": 11, "args": { "External id": 180698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180698, "pid": 5, "tid": 7, "ts": 1716454224316100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271290, "dur": 13, "args": { "External id": 180698, "cbid": 211, "correlation": 180698 } }, { "ph": "s", "id": 180698, "pid": 76337, "tid": -914061504, "ts": 1716454224271290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316112, "dur": 25, "args": { "External id": 180700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180700, "pid": 5, "tid": 7, "ts": 1716454224316112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271305, "dur": 5, "args": { "External id": 180700, "cbid": 211, "correlation": 180700 } }, { "ph": "s", "id": 180700, "pid": 76337, "tid": -914061504, "ts": 1716454224271305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224316138, "dur": 4, "args": { "External id": 180702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 180702, "pid": 5, "tid": 7, "ts": 1716454224316138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271316, "dur": 5, "args": { "External id": 180702, "cbid": 211, "correlation": 180702 } }, { "ph": "s", "id": 180702, "pid": 76337, "tid": -914061504, "ts": 1716454224271316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224271325, "dur": 0, "args": { "External id": 180703, "cbid": 51, "correlation": 180703 } }, { "ph": "s", "id": 180703, "pid": 76337, "tid": -914061504, "ts": 1716454224271325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224316143, "dur": 360, "args": { "External id": 180704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180704, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180704, "pid": 5, "tid": 7, "ts": 1716454224316143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271326, "dur": 7, "args": { "External id": 180704, "cbid": 211, "correlation": 180704 } }, { "ph": "s", "id": 180704, "pid": 76337, "tid": -914061504, "ts": 1716454224271326, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316504, "dur": 19, "args": { "External id": 180705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180705, "pid": 5, "tid": 7, "ts": 1716454224316504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271336, "dur": 5, "args": { "External id": 180705, "cbid": 211, "correlation": 180705 } }, { "ph": "s", "id": 180705, "pid": 76337, "tid": -914061504, "ts": 1716454224271336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224316525, "dur": 34, "args": { "External id": 180711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180711, "pid": 5, "tid": 7, "ts": 1716454224316525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271364, "dur": 9, "args": { "External id": 180711, "cbid": 211, "correlation": 180711 } }, { "ph": "s", "id": 180711, "pid": 76337, "tid": -914061504, "ts": 1716454224271364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224316560, "dur": 4, "args": { "External id": 180719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180719, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 180719, "pid": 5, "tid": 7, "ts": 1716454224316560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271407, "dur": 9, "args": { "External id": 180719, "cbid": 211, "correlation": 180719 } }, { "ph": "s", "id": 180719, "pid": 76337, "tid": -914061504, "ts": 1716454224271407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271473, "dur": 1, "args": { "External id": 180735, "cbid": 251, "correlation": 180735 } }, { "ph": "f", "id": 180735, "pid": 76337, "tid": -914061504, "ts": 1716454224271473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271478, "dur": 0, "args": { "External id": 180737, "cbid": 251, "correlation": 180737 } }, { "ph": "f", "id": 180737, "pid": 76337, "tid": -914061504, "ts": 1716454224271478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224316565, "dur": 13, "args": { "External id": 180738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180738, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 180738, "pid": 5, "tid": 7, "ts": 1716454224316565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271480, "dur": 12, "args": { "External id": 180738, "cbid": 211, "correlation": 180738 } }, { "ph": "s", "id": 180738, "pid": 76337, "tid": -914061504, "ts": 1716454224271480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224316579, "dur": 5, "args": { "External id": 180740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 180740, "pid": 5, "tid": 7, "ts": 1716454224316579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271493, "dur": 5, "args": { "External id": 180740, "cbid": 211, "correlation": 180740 } }, { "ph": "s", "id": 180740, "pid": 76337, "tid": -914061504, "ts": 1716454224271493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224316585, "dur": 30, "args": { "External id": 180750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180750, "pid": 5, "tid": 7, "ts": 1716454224316585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271552, "dur": 13, "args": { "External id": 180750, "cbid": 211, "correlation": 180750 } }, { "ph": "s", "id": 180750, "pid": 76337, "tid": -914061504, "ts": 1716454224271552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224316616, "dur": 30, "args": { "External id": 180770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180770, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 180770, "pid": 5, "tid": 7, "ts": 1716454224316616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271619, "dur": 11, "args": { "External id": 180770, "cbid": 211, "correlation": 180770 } }, { "ph": "s", "id": 180770, "pid": 76337, "tid": -914061504, "ts": 1716454224271619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224316648, "dur": 4, "args": { "External id": 180782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180782, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 180782, "pid": 5, "tid": 7, "ts": 1716454224316648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271640, "dur": 6, "args": { "External id": 180782, "cbid": 211, "correlation": 180782 } }, { "ph": "s", "id": 180782, "pid": 76337, "tid": -914061504, "ts": 1716454224271640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224316653, "dur": 30, "args": { "External id": 180785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180785, "pid": 5, "tid": 7, "ts": 1716454224316653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271659, "dur": 6, "args": { "External id": 180785, "cbid": 211, "correlation": 180785 } }, { "ph": "s", "id": 180785, "pid": 76337, "tid": -914061504, "ts": 1716454224271659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224316684, "dur": 20, "args": { "External id": 180794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180794, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180794, "pid": 5, "tid": 7, "ts": 1716454224316684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271699, "dur": 10, "args": { "External id": 180794, "cbid": 211, "correlation": 180794 } }, { "ph": "s", "id": 180794, "pid": 76337, "tid": -914061504, "ts": 1716454224271699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224271766, "dur": 0, "args": { "External id": 180804, "cbid": 317, "correlation": 180804 } }, { "ph": "f", "id": 180804, "pid": 76337, "tid": -914061504, "ts": 1716454224271766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224271767, "dur": 0, "args": { "External id": 180805, "cbid": 203, "correlation": 180805 } }, { "ph": "f", "id": 180805, "pid": 76337, "tid": -914061504, "ts": 1716454224271767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224271768, "dur": 0, "args": { "External id": 180806, "cbid": 205, "correlation": 180806 } }, { "ph": "f", "id": 180806, "pid": 76337, "tid": -914061504, "ts": 1716454224271768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316706, "dur": 21, "args": { "External id": 180810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180810, "pid": 5, "tid": 7, "ts": 1716454224316706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271787, "dur": 12, "args": { "External id": 180810, "cbid": 211, "correlation": 180810 } }, { "ph": "s", "id": 180810, "pid": 76337, "tid": -914061504, "ts": 1716454224271787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224316729, "dur": 44, "args": { "External id": 180812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180812, "pid": 5, "tid": 7, "ts": 1716454224316729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271802, "dur": 6, "args": { "External id": 180812, "cbid": 211, "correlation": 180812 } }, { "ph": "s", "id": 180812, "pid": 76337, "tid": -914061504, "ts": 1716454224271802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224316774, "dur": 657, "args": { "External id": 180814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180814, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180814, "pid": 5, "tid": 7, "ts": 1716454224316774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271816, "dur": 9, "args": { "External id": 180814, "cbid": 211, "correlation": 180814 } }, { "ph": "s", "id": 180814, "pid": 76337, "tid": -914061504, "ts": 1716454224271816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224317433, "dur": 20, "args": { "External id": 180816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180816, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180816, "pid": 5, "tid": 7, "ts": 1716454224317433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271829, "dur": 5, "args": { "External id": 180816, "cbid": 211, "correlation": 180816 } }, { "ph": "s", "id": 180816, "pid": 76337, "tid": -914061504, "ts": 1716454224271829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224317454, "dur": 34, "args": { "External id": 180822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180822, "pid": 5, "tid": 7, "ts": 1716454224317454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271858, "dur": 9, "args": { "External id": 180822, "cbid": 211, "correlation": 180822 } }, { "ph": "s", "id": 180822, "pid": 76337, "tid": -914061504, "ts": 1716454224271858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224271916, "dur": 0, "args": { "External id": 180832, "cbid": 317, "correlation": 180832 } }, { "ph": "f", "id": 180832, "pid": 76337, "tid": -914061504, "ts": 1716454224271916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224271917, "dur": 0, "args": { "External id": 180833, "cbid": 203, "correlation": 180833 } }, { "ph": "f", "id": 180833, "pid": 76337, "tid": -914061504, "ts": 1716454224271917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224271918, "dur": 0, "args": { "External id": 180834, "cbid": 205, "correlation": 180834 } }, { "ph": "f", "id": 180834, "pid": 76337, "tid": -914061504, "ts": 1716454224271918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271940, "dur": 1, "args": { "External id": 180838, "cbid": 251, "correlation": 180838 } }, { "ph": "f", "id": 180838, "pid": 76337, "tid": -914061504, "ts": 1716454224271940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271941, "dur": 0, "args": { "External id": 180839, "cbid": 251, "correlation": 180839 } }, { "ph": "f", "id": 180839, "pid": 76337, "tid": -914061504, "ts": 1716454224271941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271942, "dur": 0, "args": { "External id": 180840, "cbid": 251, "correlation": 180840 } }, { "ph": "f", "id": 180840, "pid": 76337, "tid": -914061504, "ts": 1716454224271942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271943, "dur": 0, "args": { "External id": 180841, "cbid": 251, "correlation": 180841 } }, { "ph": "f", "id": 180841, "pid": 76337, "tid": -914061504, "ts": 1716454224271943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271944, "dur": 0, "args": { "External id": 180842, "cbid": 251, "correlation": 180842 } }, { "ph": "f", "id": 180842, "pid": 76337, "tid": -914061504, "ts": 1716454224271944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271944, "dur": 0, "args": { "External id": 180843, "cbid": 251, "correlation": 180843 } }, { "ph": "f", "id": 180843, "pid": 76337, "tid": -914061504, "ts": 1716454224271944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271945, "dur": 0, "args": { "External id": 180844, "cbid": 251, "correlation": 180844 } }, { "ph": "f", "id": 180844, "pid": 76337, "tid": -914061504, "ts": 1716454224271945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271946, "dur": 0, "args": { "External id": 180845, "cbid": 251, "correlation": 180845 } }, { "ph": "f", "id": 180845, "pid": 76337, "tid": -914061504, "ts": 1716454224271946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224271947, "dur": 0, "args": { "External id": 180846, "cbid": 251, "correlation": 180846 } }, { "ph": "f", "id": 180846, "pid": 76337, "tid": -914061504, "ts": 1716454224271947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224317489, "dur": 53, "args": { "External id": 180847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180847, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 180847, "pid": 5, "tid": 7, "ts": 1716454224317489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271949, "dur": 13, "args": { "External id": 180847, "cbid": 211, "correlation": 180847 } }, { "ph": "s", "id": 180847, "pid": 76337, "tid": -914061504, "ts": 1716454224271949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224317543, "dur": 33, "args": { "External id": 180853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180853, "pid": 5, "tid": 7, "ts": 1716454224317543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224271990, "dur": 9, "args": { "External id": 180853, "cbid": 211, "correlation": 180853 } }, { "ph": "s", "id": 180853, "pid": 76337, "tid": -914061504, "ts": 1716454224271990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224317577, "dur": 27, "args": { "External id": 180861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180861, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180861, "pid": 5, "tid": 7, "ts": 1716454224317577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272021, "dur": 8, "args": { "External id": 180861, "cbid": 211, "correlation": 180861 } }, { "ph": "s", "id": 180861, "pid": 76337, "tid": -914061504, "ts": 1716454224272021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224317605, "dur": 20, "args": { "External id": 180869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180869, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180869, "pid": 5, "tid": 7, "ts": 1716454224317605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272050, "dur": 9, "args": { "External id": 180869, "cbid": 211, "correlation": 180869 } }, { "ph": "s", "id": 180869, "pid": 76337, "tid": -914061504, "ts": 1716454224272050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224317626, "dur": 30, "args": { "External id": 180889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180889, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 180889, "pid": 5, "tid": 7, "ts": 1716454224317626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272133, "dur": 13, "args": { "External id": 180889, "cbid": 211, "correlation": 180889 } }, { "ph": "s", "id": 180889, "pid": 76337, "tid": -914061504, "ts": 1716454224272133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224317657, "dur": 4, "args": { "External id": 180901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180901, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 180901, "pid": 5, "tid": 7, "ts": 1716454224317657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272155, "dur": 7, "args": { "External id": 180901, "cbid": 211, "correlation": 180901 } }, { "ph": "s", "id": 180901, "pid": 76337, "tid": -914061504, "ts": 1716454224272155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224317662, "dur": 30, "args": { "External id": 180904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180904, "pid": 5, "tid": 7, "ts": 1716454224317662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272174, "dur": 7, "args": { "External id": 180904, "cbid": 211, "correlation": 180904 } }, { "ph": "s", "id": 180904, "pid": 76337, "tid": -914061504, "ts": 1716454224272174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224272234, "dur": 0, "args": { "External id": 180915, "cbid": 317, "correlation": 180915 } }, { "ph": "f", "id": 180915, "pid": 76337, "tid": -914061504, "ts": 1716454224272234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224272235, "dur": 0, "args": { "External id": 180916, "cbid": 203, "correlation": 180916 } }, { "ph": "f", "id": 180916, "pid": 76337, "tid": -914061504, "ts": 1716454224272235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224272236, "dur": 0, "args": { "External id": 180917, "cbid": 205, "correlation": 180917 } }, { "ph": "f", "id": 180917, "pid": 76337, "tid": -914061504, "ts": 1716454224272236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224317693, "dur": 21, "args": { "External id": 180921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180921, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180921, "pid": 5, "tid": 7, "ts": 1716454224317693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272251, "dur": 12, "args": { "External id": 180921, "cbid": 211, "correlation": 180921 } }, { "ph": "s", "id": 180921, "pid": 76337, "tid": -914061504, "ts": 1716454224272251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224317716, "dur": 121, "args": { "External id": 180923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180923, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180923, "pid": 5, "tid": 7, "ts": 1716454224317716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272272, "dur": 8, "args": { "External id": 180923, "cbid": 211, "correlation": 180923 } }, { "ph": "s", "id": 180923, "pid": 76337, "tid": -914061504, "ts": 1716454224272272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224317839, "dur": 21, "args": { "External id": 180925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180925, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180925, "pid": 5, "tid": 7, "ts": 1716454224317839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272284, "dur": 5, "args": { "External id": 180925, "cbid": 211, "correlation": 180925 } }, { "ph": "s", "id": 180925, "pid": 76337, "tid": -914061504, "ts": 1716454224272284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224317861, "dur": 33, "args": { "External id": 180931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180931, "pid": 5, "tid": 7, "ts": 1716454224317861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272312, "dur": 9, "args": { "External id": 180931, "cbid": 211, "correlation": 180931 } }, { "ph": "s", "id": 180931, "pid": 76337, "tid": -914061504, "ts": 1716454224272312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224317895, "dur": 201, "args": { "External id": 180940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180940, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180940, "pid": 5, "tid": 7, "ts": 1716454224317895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272394, "dur": 14, "args": { "External id": 180940, "cbid": 211, "correlation": 180940 } }, { "ph": "s", "id": 180940, "pid": 76337, "tid": -914061504, "ts": 1716454224272394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224318098, "dur": 66, "args": { "External id": 180962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180962, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 180962, "pid": 5, "tid": 7, "ts": 1716454224318098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272452, "dur": 10, "args": { "External id": 180962, "cbid": 211, "correlation": 180962 } }, { "ph": "s", "id": 180962, "pid": 76337, "tid": -914061504, "ts": 1716454224272452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224272547, "dur": 1, "args": { "External id": 180973, "cbid": 251, "correlation": 180973 } }, { "ph": "f", "id": 180973, "pid": 76337, "tid": -914061504, "ts": 1716454224272547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224318165, "dur": 156, "args": { "External id": 180974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180974, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180974, "pid": 5, "tid": 7, "ts": 1716454224318165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272552, "dur": 14, "args": { "External id": 180974, "cbid": 211, "correlation": 180974 } }, { "ph": "s", "id": 180974, "pid": 76337, "tid": -914061504, "ts": 1716454224272552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224272623, "dur": 1, "args": { "External id": 180985, "cbid": 251, "correlation": 180985 } }, { "ph": "f", "id": 180985, "pid": 76337, "tid": -914061504, "ts": 1716454224272623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224318323, "dur": 147, "args": { "External id": 180986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180986, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180986, "pid": 5, "tid": 7, "ts": 1716454224318323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272627, "dur": 11, "args": { "External id": 180986, "cbid": 211, "correlation": 180986 } }, { "ph": "s", "id": 180986, "pid": 76337, "tid": -914061504, "ts": 1716454224272627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224272693, "dur": 1, "args": { "External id": 180997, "cbid": 251, "correlation": 180997 } }, { "ph": "f", "id": 180997, "pid": 76337, "tid": -914061504, "ts": 1716454224272693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224318471, "dur": 145, "args": { "External id": 180998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 180998, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 180998, "pid": 5, "tid": 7, "ts": 1716454224318471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272697, "dur": 11, "args": { "External id": 180998, "cbid": 211, "correlation": 180998 } }, { "ph": "s", "id": 180998, "pid": 76337, "tid": -914061504, "ts": 1716454224272697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224318618, "dur": 1965, "args": { "External id": 181019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181019, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 181019, "pid": 5, "tid": 7, "ts": 1716454224318618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272781, "dur": 14, "args": { "External id": 181019, "cbid": 211, "correlation": 181019 } }, { "ph": "s", "id": 181019, "pid": 76337, "tid": -914061504, "ts": 1716454224272781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224272885, "dur": 1, "args": { "External id": 181037, "cbid": 251, "correlation": 181037 } }, { "ph": "f", "id": 181037, "pid": 76337, "tid": -914061504, "ts": 1716454224272885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224320583, "dur": 148, "args": { "External id": 181039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181039, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 181039, "pid": 5, "tid": 7, "ts": 1716454224320583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272891, "dur": 13, "args": { "External id": 181039, "cbid": 211, "correlation": 181039 } }, { "ph": "s", "id": 181039, "pid": 76337, "tid": -914061504, "ts": 1716454224272891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224320733, "dur": 36, "args": { "External id": 181047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181047, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181047, "pid": 5, "tid": 7, "ts": 1716454224320733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224272963, "dur": 21, "args": { "External id": 181047, "cbid": 211, "correlation": 181047 } }, { "ph": "s", "id": 181047, "pid": 76337, "tid": -914061504, "ts": 1716454224272963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224320770, "dur": 50, "args": { "External id": 181055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181055, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181055, "pid": 5, "tid": 7, "ts": 1716454224320770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273012, "dur": 9, "args": { "External id": 181055, "cbid": 211, "correlation": 181055 } }, { "ph": "s", "id": 181055, "pid": 76337, "tid": -914061504, "ts": 1716454224273012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224320821, "dur": 31, "args": { "External id": 181066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181066, "pid": 5, "tid": 7, "ts": 1716454224320821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273086, "dur": 13, "args": { "External id": 181066, "cbid": 211, "correlation": 181066 } }, { "ph": "s", "id": 181066, "pid": 76337, "tid": -914061504, "ts": 1716454224273086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224320853, "dur": 35, "args": { "External id": 181088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181088, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181088, "pid": 5, "tid": 7, "ts": 1716454224320853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273118, "dur": 8, "args": { "External id": 181088, "cbid": 211, "correlation": 181088 } }, { "ph": "s", "id": 181088, "pid": 76337, "tid": -914061504, "ts": 1716454224273118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273201, "dur": 1, "args": { "External id": 181099, "cbid": 251, "correlation": 181099 } }, { "ph": "f", "id": 181099, "pid": 76337, "tid": -914061504, "ts": 1716454224273201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224320889, "dur": 90, "args": { "External id": 181100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181100, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181100, "pid": 5, "tid": 7, "ts": 1716454224320889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273207, "dur": 14, "args": { "External id": 181100, "cbid": 211, "correlation": 181100 } }, { "ph": "s", "id": 181100, "pid": 76337, "tid": -914061504, "ts": 1716454224273207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273277, "dur": 1, "args": { "External id": 181111, "cbid": 251, "correlation": 181111 } }, { "ph": "f", "id": 181111, "pid": 76337, "tid": -914061504, "ts": 1716454224273277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273281, "dur": 0, "args": { "External id": 181112, "cbid": 251, "correlation": 181112 } }, { "ph": "f", "id": 181112, "pid": 76337, "tid": -914061504, "ts": 1716454224273281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224320981, "dur": 12, "args": { "External id": 181113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181113, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 181113, "pid": 5, "tid": 7, "ts": 1716454224320981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273282, "dur": 12, "args": { "External id": 181113, "cbid": 211, "correlation": 181113 } }, { "ph": "s", "id": 181113, "pid": 76337, "tid": -914061504, "ts": 1716454224273282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224320994, "dur": 5, "args": { "External id": 181115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181115, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 181115, "pid": 5, "tid": 7, "ts": 1716454224320994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273297, "dur": 7, "args": { "External id": 181115, "cbid": 211, "correlation": 181115 } }, { "ph": "s", "id": 181115, "pid": 76337, "tid": -914061504, "ts": 1716454224273297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273355, "dur": 1, "args": { "External id": 181126, "cbid": 251, "correlation": 181126 } }, { "ph": "f", "id": 181126, "pid": 76337, "tid": -914061504, "ts": 1716454224273355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273359, "dur": 0, "args": { "External id": 181127, "cbid": 251, "correlation": 181127 } }, { "ph": "f", "id": 181127, "pid": 76337, "tid": -914061504, "ts": 1716454224273359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224321000, "dur": 7, "args": { "External id": 181128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181128, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 181128, "pid": 5, "tid": 7, "ts": 1716454224321000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273360, "dur": 12, "args": { "External id": 181128, "cbid": 211, "correlation": 181128 } }, { "ph": "s", "id": 181128, "pid": 76337, "tid": -914061504, "ts": 1716454224273360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224321009, "dur": 4, "args": { "External id": 181130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 181130, "pid": 5, "tid": 7, "ts": 1716454224321009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273374, "dur": 5, "args": { "External id": 181130, "cbid": 211, "correlation": 181130 } }, { "ph": "s", "id": 181130, "pid": 76337, "tid": -914061504, "ts": 1716454224273374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224321014, "dur": 92, "args": { "External id": 181151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181151, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 181151, "pid": 5, "tid": 7, "ts": 1716454224321014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273447, "dur": 12, "args": { "External id": 181151, "cbid": 211, "correlation": 181151 } }, { "ph": "s", "id": 181151, "pid": 76337, "tid": -914061504, "ts": 1716454224273447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273544, "dur": 1, "args": { "External id": 181169, "cbid": 251, "correlation": 181169 } }, { "ph": "f", "id": 181169, "pid": 76337, "tid": -914061504, "ts": 1716454224273544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224321107, "dur": 98, "args": { "External id": 181171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181171, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181171, "pid": 5, "tid": 7, "ts": 1716454224321107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273550, "dur": 13, "args": { "External id": 181171, "cbid": 211, "correlation": 181171 } }, { "ph": "s", "id": 181171, "pid": 76337, "tid": -914061504, "ts": 1716454224273550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224321207, "dur": 19, "args": { "External id": 181179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181179, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181179, "pid": 5, "tid": 7, "ts": 1716454224321207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273620, "dur": 12, "args": { "External id": 181179, "cbid": 211, "correlation": 181179 } }, { "ph": "s", "id": 181179, "pid": 76337, "tid": -914061504, "ts": 1716454224273620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224321227, "dur": 37, "args": { "External id": 181187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181187, "pid": 5, "tid": 7, "ts": 1716454224321227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273661, "dur": 10, "args": { "External id": 181187, "cbid": 211, "correlation": 181187 } }, { "ph": "s", "id": 181187, "pid": 76337, "tid": -914061504, "ts": 1716454224273661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224321265, "dur": 35, "args": { "External id": 181209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181209, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181209, "pid": 5, "tid": 7, "ts": 1716454224321265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273712, "dur": 11, "args": { "External id": 181209, "cbid": 211, "correlation": 181209 } }, { "ph": "s", "id": 181209, "pid": 76337, "tid": -914061504, "ts": 1716454224273712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273807, "dur": 1, "args": { "External id": 181225, "cbid": 251, "correlation": 181225 } }, { "ph": "f", "id": 181225, "pid": 76337, "tid": -914061504, "ts": 1716454224273807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224273812, "dur": 0, "args": { "External id": 181227, "cbid": 251, "correlation": 181227 } }, { "ph": "f", "id": 181227, "pid": 76337, "tid": -914061504, "ts": 1716454224273812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224321302, "dur": 547, "args": { "External id": 181228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181228, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 181228, "pid": 5, "tid": 7, "ts": 1716454224321302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273815, "dur": 13, "args": { "External id": 181228, "cbid": 211, "correlation": 181228 } }, { "ph": "s", "id": 181228, "pid": 76337, "tid": -914061504, "ts": 1716454224273815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224321850, "dur": 126, "args": { "External id": 181236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181236, "pid": 5, "tid": 7, "ts": 1716454224321850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273881, "dur": 12, "args": { "External id": 181236, "cbid": 211, "correlation": 181236 } }, { "ph": "s", "id": 181236, "pid": 76337, "tid": -914061504, "ts": 1716454224273881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224321978, "dur": 129, "args": { "External id": 181244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181244, "pid": 5, "tid": 7, "ts": 1716454224321978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224273912, "dur": 9, "args": { "External id": 181244, "cbid": 211, "correlation": 181244 } }, { "ph": "s", "id": 181244, "pid": 76337, "tid": -914061504, "ts": 1716454224273912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224274002, "dur": 1, "args": { "External id": 181260, "cbid": 251, "correlation": 181260 } }, { "ph": "f", "id": 181260, "pid": 76337, "tid": -914061504, "ts": 1716454224274002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224322108, "dur": 308, "args": { "External id": 181262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181262, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181262, "pid": 5, "tid": 7, "ts": 1716454224322108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274008, "dur": 14, "args": { "External id": 181262, "cbid": 211, "correlation": 181262 } }, { "ph": "s", "id": 181262, "pid": 76337, "tid": -914061504, "ts": 1716454224274008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224322417, "dur": 27, "args": { "External id": 181270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181270, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181270, "pid": 5, "tid": 7, "ts": 1716454224322417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274052, "dur": 10, "args": { "External id": 181270, "cbid": 211, "correlation": 181270 } }, { "ph": "s", "id": 181270, "pid": 76337, "tid": -914061504, "ts": 1716454224274052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224322445, "dur": 82, "args": { "External id": 181281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181281, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181281, "pid": 5, "tid": 7, "ts": 1716454224322445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274121, "dur": 12, "args": { "External id": 181281, "cbid": 211, "correlation": 181281 } }, { "ph": "s", "id": 181281, "pid": 76337, "tid": -914061504, "ts": 1716454224274121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224274189, "dur": 0, "args": { "External id": 181293, "cbid": 317, "correlation": 181293 } }, { "ph": "f", "id": 181293, "pid": 76337, "tid": -914061504, "ts": 1716454224274189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224274190, "dur": 0, "args": { "External id": 181294, "cbid": 203, "correlation": 181294 } }, { "ph": "f", "id": 181294, "pid": 76337, "tid": -914061504, "ts": 1716454224274190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224274191, "dur": 0, "args": { "External id": 181295, "cbid": 205, "correlation": 181295 } }, { "ph": "f", "id": 181295, "pid": 76337, "tid": -914061504, "ts": 1716454224274191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224322529, "dur": 22, "args": { "External id": 181299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181299, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181299, "pid": 5, "tid": 7, "ts": 1716454224322529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274206, "dur": 13, "args": { "External id": 181299, "cbid": 211, "correlation": 181299 } }, { "ph": "s", "id": 181299, "pid": 76337, "tid": -914061504, "ts": 1716454224274206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224322552, "dur": 122, "args": { "External id": 181301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181301, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181301, "pid": 5, "tid": 7, "ts": 1716454224322552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274225, "dur": 7, "args": { "External id": 181301, "cbid": 211, "correlation": 181301 } }, { "ph": "s", "id": 181301, "pid": 76337, "tid": -914061504, "ts": 1716454224274225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224322676, "dur": 22, "args": { "External id": 181303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181303, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181303, "pid": 5, "tid": 7, "ts": 1716454224322676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274236, "dur": 5, "args": { "External id": 181303, "cbid": 211, "correlation": 181303 } }, { "ph": "s", "id": 181303, "pid": 76337, "tid": -914061504, "ts": 1716454224274236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224322699, "dur": 33, "args": { "External id": 181309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181309, "pid": 5, "tid": 7, "ts": 1716454224322699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274263, "dur": 9, "args": { "External id": 181309, "cbid": 211, "correlation": 181309 } }, { "ph": "s", "id": 181309, "pid": 76337, "tid": -914061504, "ts": 1716454224274263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224322733, "dur": 27, "args": { "External id": 181317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181317, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181317, "pid": 5, "tid": 7, "ts": 1716454224322733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274295, "dur": 8, "args": { "External id": 181317, "cbid": 211, "correlation": 181317 } }, { "ph": "s", "id": 181317, "pid": 76337, "tid": -914061504, "ts": 1716454224274295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224322761, "dur": 31, "args": { "External id": 181337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181337, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 181337, "pid": 5, "tid": 7, "ts": 1716454224322761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274368, "dur": 12, "args": { "External id": 181337, "cbid": 211, "correlation": 181337 } }, { "ph": "s", "id": 181337, "pid": 76337, "tid": -914061504, "ts": 1716454224274368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224322793, "dur": 4, "args": { "External id": 181349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181349, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 181349, "pid": 5, "tid": 7, "ts": 1716454224322793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274390, "dur": 6, "args": { "External id": 181349, "cbid": 211, "correlation": 181349 } }, { "ph": "s", "id": 181349, "pid": 76337, "tid": -914061504, "ts": 1716454224274390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224322799, "dur": 33, "args": { "External id": 181352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181352, "pid": 5, "tid": 7, "ts": 1716454224322799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274407, "dur": 7, "args": { "External id": 181352, "cbid": 211, "correlation": 181352 } }, { "ph": "s", "id": 181352, "pid": 76337, "tid": -914061504, "ts": 1716454224274407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224322833, "dur": 21, "args": { "External id": 181361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181361, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181361, "pid": 5, "tid": 7, "ts": 1716454224322833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274446, "dur": 9, "args": { "External id": 181361, "cbid": 211, "correlation": 181361 } }, { "ph": "s", "id": 181361, "pid": 76337, "tid": -914061504, "ts": 1716454224274446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224274498, "dur": 0, "args": { "External id": 181371, "cbid": 317, "correlation": 181371 } }, { "ph": "f", "id": 181371, "pid": 76337, "tid": -914061504, "ts": 1716454224274498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224274499, "dur": 0, "args": { "External id": 181372, "cbid": 203, "correlation": 181372 } }, { "ph": "f", "id": 181372, "pid": 76337, "tid": -914061504, "ts": 1716454224274499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224274499, "dur": 0, "args": { "External id": 181373, "cbid": 205, "correlation": 181373 } }, { "ph": "f", "id": 181373, "pid": 76337, "tid": -914061504, "ts": 1716454224274499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224322856, "dur": 23, "args": { "External id": 181377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181377, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181377, "pid": 5, "tid": 7, "ts": 1716454224322856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274512, "dur": 12, "args": { "External id": 181377, "cbid": 211, "correlation": 181377 } }, { "ph": "s", "id": 181377, "pid": 76337, "tid": -914061504, "ts": 1716454224274512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224322880, "dur": 45, "args": { "External id": 181379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181379, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181379, "pid": 5, "tid": 7, "ts": 1716454224322880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274527, "dur": 5, "args": { "External id": 181379, "cbid": 211, "correlation": 181379 } }, { "ph": "s", "id": 181379, "pid": 76337, "tid": -914061504, "ts": 1716454224274527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224322926, "dur": 659, "args": { "External id": 181381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181381, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181381, "pid": 5, "tid": 7, "ts": 1716454224322926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274538, "dur": 6, "args": { "External id": 181381, "cbid": 211, "correlation": 181381 } }, { "ph": "s", "id": 181381, "pid": 76337, "tid": -914061504, "ts": 1716454224274538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224323587, "dur": 24, "args": { "External id": 181383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181383, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181383, "pid": 5, "tid": 7, "ts": 1716454224323587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274548, "dur": 5, "args": { "External id": 181383, "cbid": 211, "correlation": 181383 } }, { "ph": "s", "id": 181383, "pid": 76337, "tid": -914061504, "ts": 1716454224274548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224323612, "dur": 34, "args": { "External id": 181389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181389, "pid": 5, "tid": 7, "ts": 1716454224323612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274576, "dur": 8, "args": { "External id": 181389, "cbid": 211, "correlation": 181389 } }, { "ph": "s", "id": 181389, "pid": 76337, "tid": -914061504, "ts": 1716454224274576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224323647, "dur": 3, "args": { "External id": 181397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181397, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 181397, "pid": 5, "tid": 7, "ts": 1716454224323647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274619, "dur": 10, "args": { "External id": 181397, "cbid": 211, "correlation": 181397 } }, { "ph": "s", "id": 181397, "pid": 76337, "tid": -914061504, "ts": 1716454224274619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224274684, "dur": 1, "args": { "External id": 181413, "cbid": 251, "correlation": 181413 } }, { "ph": "f", "id": 181413, "pid": 76337, "tid": -914061504, "ts": 1716454224274684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224274689, "dur": 0, "args": { "External id": 181415, "cbid": 251, "correlation": 181415 } }, { "ph": "f", "id": 181415, "pid": 76337, "tid": -914061504, "ts": 1716454224274689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224323652, "dur": 13, "args": { "External id": 181416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181416, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 181416, "pid": 5, "tid": 7, "ts": 1716454224323652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274691, "dur": 11, "args": { "External id": 181416, "cbid": 211, "correlation": 181416 } }, { "ph": "s", "id": 181416, "pid": 76337, "tid": -914061504, "ts": 1716454224274691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224323666, "dur": 5, "args": { "External id": 181418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 181418, "pid": 5, "tid": 7, "ts": 1716454224323666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274704, "dur": 5, "args": { "External id": 181418, "cbid": 211, "correlation": 181418 } }, { "ph": "s", "id": 181418, "pid": 76337, "tid": -914061504, "ts": 1716454224274704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224323672, "dur": 29, "args": { "External id": 181428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181428, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181428, "pid": 5, "tid": 7, "ts": 1716454224323672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274763, "dur": 14, "args": { "External id": 181428, "cbid": 211, "correlation": 181428 } }, { "ph": "s", "id": 181428, "pid": 76337, "tid": -914061504, "ts": 1716454224274763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224323703, "dur": 31, "args": { "External id": 181448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181448, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 181448, "pid": 5, "tid": 7, "ts": 1716454224323703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274831, "dur": 11, "args": { "External id": 181448, "cbid": 211, "correlation": 181448 } }, { "ph": "s", "id": 181448, "pid": 76337, "tid": -914061504, "ts": 1716454224274831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224323735, "dur": 4, "args": { "External id": 181460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181460, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 181460, "pid": 5, "tid": 7, "ts": 1716454224323735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274851, "dur": 7, "args": { "External id": 181460, "cbid": 211, "correlation": 181460 } }, { "ph": "s", "id": 181460, "pid": 76337, "tid": -914061504, "ts": 1716454224274851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224323740, "dur": 30, "args": { "External id": 181463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181463, "pid": 5, "tid": 7, "ts": 1716454224323740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274871, "dur": 7, "args": { "External id": 181463, "cbid": 211, "correlation": 181463 } }, { "ph": "s", "id": 181463, "pid": 76337, "tid": -914061504, "ts": 1716454224274871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224323771, "dur": 20, "args": { "External id": 181472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181472, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181472, "pid": 5, "tid": 7, "ts": 1716454224323771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224274912, "dur": 10, "args": { "External id": 181472, "cbid": 211, "correlation": 181472 } }, { "ph": "s", "id": 181472, "pid": 76337, "tid": -914061504, "ts": 1716454224274912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224274984, "dur": 0, "args": { "External id": 181482, "cbid": 317, "correlation": 181482 } }, { "ph": "f", "id": 181482, "pid": 76337, "tid": -914061504, "ts": 1716454224274984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224274985, "dur": 0, "args": { "External id": 181483, "cbid": 203, "correlation": 181483 } }, { "ph": "f", "id": 181483, "pid": 76337, "tid": -914061504, "ts": 1716454224274985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224274986, "dur": 0, "args": { "External id": 181484, "cbid": 205, "correlation": 181484 } }, { "ph": "f", "id": 181484, "pid": 76337, "tid": -914061504, "ts": 1716454224274986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224323792, "dur": 23, "args": { "External id": 181488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181488, "pid": 5, "tid": 7, "ts": 1716454224323792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275000, "dur": 12, "args": { "External id": 181488, "cbid": 211, "correlation": 181488 } }, { "ph": "s", "id": 181488, "pid": 76337, "tid": -914061504, "ts": 1716454224275000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224323816, "dur": 45, "args": { "External id": 181490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181490, "pid": 5, "tid": 7, "ts": 1716454224323816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275014, "dur": 5, "args": { "External id": 181490, "cbid": 211, "correlation": 181490 } }, { "ph": "s", "id": 181490, "pid": 76337, "tid": -914061504, "ts": 1716454224275014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224323862, "dur": 651, "args": { "External id": 181492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181492, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181492, "pid": 5, "tid": 7, "ts": 1716454224323862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275025, "dur": 7, "args": { "External id": 181492, "cbid": 211, "correlation": 181492 } }, { "ph": "s", "id": 181492, "pid": 76337, "tid": -914061504, "ts": 1716454224275025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224324515, "dur": 21, "args": { "External id": 181494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181494, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181494, "pid": 5, "tid": 7, "ts": 1716454224324515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275036, "dur": 5, "args": { "External id": 181494, "cbid": 211, "correlation": 181494 } }, { "ph": "s", "id": 181494, "pid": 76337, "tid": -914061504, "ts": 1716454224275036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224324537, "dur": 34, "args": { "External id": 181500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181500, "pid": 5, "tid": 7, "ts": 1716454224324537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275064, "dur": 8, "args": { "External id": 181500, "cbid": 211, "correlation": 181500 } }, { "ph": "s", "id": 181500, "pid": 76337, "tid": -914061504, "ts": 1716454224275064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224324572, "dur": 27, "args": { "External id": 181508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181508, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181508, "pid": 5, "tid": 7, "ts": 1716454224324572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275097, "dur": 8, "args": { "External id": 181508, "cbid": 211, "correlation": 181508 } }, { "ph": "s", "id": 181508, "pid": 76337, "tid": -914061504, "ts": 1716454224275097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224324600, "dur": 19, "args": { "External id": 181516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181516, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181516, "pid": 5, "tid": 7, "ts": 1716454224324600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275126, "dur": 9, "args": { "External id": 181516, "cbid": 211, "correlation": 181516 } }, { "ph": "s", "id": 181516, "pid": 76337, "tid": -914061504, "ts": 1716454224275126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224324621, "dur": 31, "args": { "External id": 181536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181536, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 181536, "pid": 5, "tid": 7, "ts": 1716454224324621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275206, "dur": 13, "args": { "External id": 181536, "cbid": 211, "correlation": 181536 } }, { "ph": "s", "id": 181536, "pid": 76337, "tid": -914061504, "ts": 1716454224275206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224324652, "dur": 4, "args": { "External id": 181548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181548, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 181548, "pid": 5, "tid": 7, "ts": 1716454224324652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275229, "dur": 7, "args": { "External id": 181548, "cbid": 211, "correlation": 181548 } }, { "ph": "s", "id": 181548, "pid": 76337, "tid": -914061504, "ts": 1716454224275229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224324658, "dur": 30, "args": { "External id": 181551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181551, "pid": 5, "tid": 7, "ts": 1716454224324658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275247, "dur": 6, "args": { "External id": 181551, "cbid": 211, "correlation": 181551 } }, { "ph": "s", "id": 181551, "pid": 76337, "tid": -914061504, "ts": 1716454224275247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224275306, "dur": 0, "args": { "External id": 181562, "cbid": 317, "correlation": 181562 } }, { "ph": "f", "id": 181562, "pid": 76337, "tid": -914061504, "ts": 1716454224275306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224275306, "dur": 0, "args": { "External id": 181563, "cbid": 203, "correlation": 181563 } }, { "ph": "f", "id": 181563, "pid": 76337, "tid": -914061504, "ts": 1716454224275306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224275307, "dur": 0, "args": { "External id": 181564, "cbid": 205, "correlation": 181564 } }, { "ph": "f", "id": 181564, "pid": 76337, "tid": -914061504, "ts": 1716454224275307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224324689, "dur": 23, "args": { "External id": 181568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181568, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181568, "pid": 5, "tid": 7, "ts": 1716454224324689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275320, "dur": 12, "args": { "External id": 181568, "cbid": 211, "correlation": 181568 } }, { "ph": "s", "id": 181568, "pid": 76337, "tid": -914061504, "ts": 1716454224275320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224324713, "dur": 117, "args": { "External id": 181570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181570, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181570, "pid": 5, "tid": 7, "ts": 1716454224324713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275338, "dur": 7, "args": { "External id": 181570, "cbid": 211, "correlation": 181570 } }, { "ph": "s", "id": 181570, "pid": 76337, "tid": -914061504, "ts": 1716454224275338, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224324832, "dur": 22, "args": { "External id": 181572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181572, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181572, "pid": 5, "tid": 7, "ts": 1716454224324832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275348, "dur": 5, "args": { "External id": 181572, "cbid": 211, "correlation": 181572 } }, { "ph": "s", "id": 181572, "pid": 76337, "tid": -914061504, "ts": 1716454224275348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224324855, "dur": 32, "args": { "External id": 181578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181578, "pid": 5, "tid": 7, "ts": 1716454224324855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275376, "dur": 9, "args": { "External id": 181578, "cbid": 211, "correlation": 181578 } }, { "ph": "s", "id": 181578, "pid": 76337, "tid": -914061504, "ts": 1716454224275376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224324889, "dur": 199, "args": { "External id": 181587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181587, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181587, "pid": 5, "tid": 7, "ts": 1716454224324889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275459, "dur": 14, "args": { "External id": 181587, "cbid": 211, "correlation": 181587 } }, { "ph": "s", "id": 181587, "pid": 76337, "tid": -914061504, "ts": 1716454224275459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224325089, "dur": 65, "args": { "External id": 181609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181609, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181609, "pid": 5, "tid": 7, "ts": 1716454224325089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275517, "dur": 10, "args": { "External id": 181609, "cbid": 211, "correlation": 181609 } }, { "ph": "s", "id": 181609, "pid": 76337, "tid": -914061504, "ts": 1716454224275517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224275606, "dur": 2, "args": { "External id": 181620, "cbid": 251, "correlation": 181620 } }, { "ph": "f", "id": 181620, "pid": 76337, "tid": -914061504, "ts": 1716454224275606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224325156, "dur": 154, "args": { "External id": 181621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181621, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181621, "pid": 5, "tid": 7, "ts": 1716454224325156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275612, "dur": 14, "args": { "External id": 181621, "cbid": 211, "correlation": 181621 } }, { "ph": "s", "id": 181621, "pid": 76337, "tid": -914061504, "ts": 1716454224275612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224275683, "dur": 1, "args": { "External id": 181632, "cbid": 251, "correlation": 181632 } }, { "ph": "f", "id": 181632, "pid": 76337, "tid": -914061504, "ts": 1716454224275683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224325311, "dur": 145, "args": { "External id": 181633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181633, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181633, "pid": 5, "tid": 7, "ts": 1716454224325311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275687, "dur": 12, "args": { "External id": 181633, "cbid": 211, "correlation": 181633 } }, { "ph": "s", "id": 181633, "pid": 76337, "tid": -914061504, "ts": 1716454224275687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224275752, "dur": 1, "args": { "External id": 181644, "cbid": 251, "correlation": 181644 } }, { "ph": "f", "id": 181644, "pid": 76337, "tid": -914061504, "ts": 1716454224275752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224325457, "dur": 141, "args": { "External id": 181645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181645, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181645, "pid": 5, "tid": 7, "ts": 1716454224325457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275756, "dur": 11, "args": { "External id": 181645, "cbid": 211, "correlation": 181645 } }, { "ph": "s", "id": 181645, "pid": 76337, "tid": -914061504, "ts": 1716454224275756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224325600, "dur": 1870, "args": { "External id": 181666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181666, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 181666, "pid": 5, "tid": 7, "ts": 1716454224325600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275837, "dur": 13, "args": { "External id": 181666, "cbid": 211, "correlation": 181666 } }, { "ph": "s", "id": 181666, "pid": 76337, "tid": -914061504, "ts": 1716454224275837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224275938, "dur": 1, "args": { "External id": 181684, "cbid": 251, "correlation": 181684 } }, { "ph": "f", "id": 181684, "pid": 76337, "tid": -914061504, "ts": 1716454224275938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224327471, "dur": 143, "args": { "External id": 181686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181686, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 181686, "pid": 5, "tid": 7, "ts": 1716454224327471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224275944, "dur": 13, "args": { "External id": 181686, "cbid": 211, "correlation": 181686 } }, { "ph": "s", "id": 181686, "pid": 76337, "tid": -914061504, "ts": 1716454224275944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224327615, "dur": 35, "args": { "External id": 181694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181694, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181694, "pid": 5, "tid": 7, "ts": 1716454224327615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276026, "dur": 13, "args": { "External id": 181694, "cbid": 211, "correlation": 181694 } }, { "ph": "s", "id": 181694, "pid": 76337, "tid": -914061504, "ts": 1716454224276026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224327652, "dur": 51, "args": { "External id": 181702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181702, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181702, "pid": 5, "tid": 7, "ts": 1716454224327652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276066, "dur": 8, "args": { "External id": 181702, "cbid": 211, "correlation": 181702 } }, { "ph": "s", "id": 181702, "pid": 76337, "tid": -914061504, "ts": 1716454224276066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224327704, "dur": 31, "args": { "External id": 181713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181713, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181713, "pid": 5, "tid": 7, "ts": 1716454224327704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276137, "dur": 14, "args": { "External id": 181713, "cbid": 211, "correlation": 181713 } }, { "ph": "s", "id": 181713, "pid": 76337, "tid": -914061504, "ts": 1716454224276137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224327735, "dur": 33, "args": { "External id": 181735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181735, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181735, "pid": 5, "tid": 7, "ts": 1716454224327735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276168, "dur": 7, "args": { "External id": 181735, "cbid": 211, "correlation": 181735 } }, { "ph": "s", "id": 181735, "pid": 76337, "tid": -914061504, "ts": 1716454224276168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276253, "dur": 1, "args": { "External id": 181746, "cbid": 251, "correlation": 181746 } }, { "ph": "f", "id": 181746, "pid": 76337, "tid": -914061504, "ts": 1716454224276253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224327770, "dur": 74, "args": { "External id": 181747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181747, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181747, "pid": 5, "tid": 7, "ts": 1716454224327770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276259, "dur": 13, "args": { "External id": 181747, "cbid": 211, "correlation": 181747 } }, { "ph": "s", "id": 181747, "pid": 76337, "tid": -914061504, "ts": 1716454224276259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276327, "dur": 1, "args": { "External id": 181758, "cbid": 251, "correlation": 181758 } }, { "ph": "f", "id": 181758, "pid": 76337, "tid": -914061504, "ts": 1716454224276327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276331, "dur": 0, "args": { "External id": 181759, "cbid": 251, "correlation": 181759 } }, { "ph": "f", "id": 181759, "pid": 76337, "tid": -914061504, "ts": 1716454224276331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224327845, "dur": 10, "args": { "External id": 181760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181760, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 181760, "pid": 5, "tid": 7, "ts": 1716454224327845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276332, "dur": 12, "args": { "External id": 181760, "cbid": 211, "correlation": 181760 } }, { "ph": "s", "id": 181760, "pid": 76337, "tid": -914061504, "ts": 1716454224276332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224327856, "dur": 5, "args": { "External id": 181762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181762, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 181762, "pid": 5, "tid": 7, "ts": 1716454224327856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276346, "dur": 7, "args": { "External id": 181762, "cbid": 211, "correlation": 181762 } }, { "ph": "s", "id": 181762, "pid": 76337, "tid": -914061504, "ts": 1716454224276346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276405, "dur": 1, "args": { "External id": 181773, "cbid": 251, "correlation": 181773 } }, { "ph": "f", "id": 181773, "pid": 76337, "tid": -914061504, "ts": 1716454224276405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276409, "dur": 0, "args": { "External id": 181774, "cbid": 251, "correlation": 181774 } }, { "ph": "f", "id": 181774, "pid": 76337, "tid": -914061504, "ts": 1716454224276409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224327862, "dur": 7, "args": { "External id": 181775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181775, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 181775, "pid": 5, "tid": 7, "ts": 1716454224327862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276410, "dur": 11, "args": { "External id": 181775, "cbid": 211, "correlation": 181775 } }, { "ph": "s", "id": 181775, "pid": 76337, "tid": -914061504, "ts": 1716454224276410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224327870, "dur": 3, "args": { "External id": 181777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181777, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 181777, "pid": 5, "tid": 7, "ts": 1716454224327870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276423, "dur": 6, "args": { "External id": 181777, "cbid": 211, "correlation": 181777 } }, { "ph": "s", "id": 181777, "pid": 76337, "tid": -914061504, "ts": 1716454224276423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224327874, "dur": 88, "args": { "External id": 181798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181798, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 181798, "pid": 5, "tid": 7, "ts": 1716454224327874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276496, "dur": 13, "args": { "External id": 181798, "cbid": 211, "correlation": 181798 } }, { "ph": "s", "id": 181798, "pid": 76337, "tid": -914061504, "ts": 1716454224276496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276595, "dur": 1, "args": { "External id": 181816, "cbid": 251, "correlation": 181816 } }, { "ph": "f", "id": 181816, "pid": 76337, "tid": -914061504, "ts": 1716454224276595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224327963, "dur": 95, "args": { "External id": 181818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181818, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181818, "pid": 5, "tid": 7, "ts": 1716454224327963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276601, "dur": 14, "args": { "External id": 181818, "cbid": 211, "correlation": 181818 } }, { "ph": "s", "id": 181818, "pid": 76337, "tid": -914061504, "ts": 1716454224276601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224328060, "dur": 19, "args": { "External id": 181826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181826, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181826, "pid": 5, "tid": 7, "ts": 1716454224328060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276671, "dur": 12, "args": { "External id": 181826, "cbid": 211, "correlation": 181826 } }, { "ph": "s", "id": 181826, "pid": 76337, "tid": -914061504, "ts": 1716454224276671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224328081, "dur": 37, "args": { "External id": 181834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181834, "pid": 5, "tid": 7, "ts": 1716454224328081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276713, "dur": 9, "args": { "External id": 181834, "cbid": 211, "correlation": 181834 } }, { "ph": "s", "id": 181834, "pid": 76337, "tid": -914061504, "ts": 1716454224276713, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224328119, "dur": 33, "args": { "External id": 181856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181856, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181856, "pid": 5, "tid": 7, "ts": 1716454224328119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276764, "dur": 10, "args": { "External id": 181856, "cbid": 211, "correlation": 181856 } }, { "ph": "s", "id": 181856, "pid": 76337, "tid": -914061504, "ts": 1716454224276764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276853, "dur": 1, "args": { "External id": 181872, "cbid": 251, "correlation": 181872 } }, { "ph": "f", "id": 181872, "pid": 76337, "tid": -914061504, "ts": 1716454224276853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224276858, "dur": 0, "args": { "External id": 181874, "cbid": 251, "correlation": 181874 } }, { "ph": "f", "id": 181874, "pid": 76337, "tid": -914061504, "ts": 1716454224276858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224328153, "dur": 525, "args": { "External id": 181875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181875, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 181875, "pid": 5, "tid": 7, "ts": 1716454224328153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276861, "dur": 13, "args": { "External id": 181875, "cbid": 211, "correlation": 181875 } }, { "ph": "s", "id": 181875, "pid": 76337, "tid": -914061504, "ts": 1716454224276861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224328679, "dur": 121, "args": { "External id": 181883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181883, "pid": 5, "tid": 7, "ts": 1716454224328679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276930, "dur": 12, "args": { "External id": 181883, "cbid": 211, "correlation": 181883 } }, { "ph": "s", "id": 181883, "pid": 76337, "tid": -914061504, "ts": 1716454224276930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224328802, "dur": 128, "args": { "External id": 181891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181891, "pid": 5, "tid": 7, "ts": 1716454224328802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224276960, "dur": 9, "args": { "External id": 181891, "cbid": 211, "correlation": 181891 } }, { "ph": "s", "id": 181891, "pid": 76337, "tid": -914061504, "ts": 1716454224276960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224277046, "dur": 1, "args": { "External id": 181907, "cbid": 251, "correlation": 181907 } }, { "ph": "f", "id": 181907, "pid": 76337, "tid": -914061504, "ts": 1716454224277046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224328932, "dur": 298, "args": { "External id": 181909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181909, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181909, "pid": 5, "tid": 7, "ts": 1716454224328932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277051, "dur": 13, "args": { "External id": 181909, "cbid": 211, "correlation": 181909 } }, { "ph": "s", "id": 181909, "pid": 76337, "tid": -914061504, "ts": 1716454224277051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224329231, "dur": 27, "args": { "External id": 181917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181917, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181917, "pid": 5, "tid": 7, "ts": 1716454224329231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277094, "dur": 10, "args": { "External id": 181917, "cbid": 211, "correlation": 181917 } }, { "ph": "s", "id": 181917, "pid": 76337, "tid": -914061504, "ts": 1716454224277094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224329259, "dur": 78, "args": { "External id": 181928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181928, "pid": 5, "tid": 7, "ts": 1716454224329259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277161, "dur": 13, "args": { "External id": 181928, "cbid": 211, "correlation": 181928 } }, { "ph": "s", "id": 181928, "pid": 76337, "tid": -914061504, "ts": 1716454224277161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224277228, "dur": 0, "args": { "External id": 181940, "cbid": 317, "correlation": 181940 } }, { "ph": "f", "id": 181940, "pid": 76337, "tid": -914061504, "ts": 1716454224277228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224277228, "dur": 0, "args": { "External id": 181941, "cbid": 203, "correlation": 181941 } }, { "ph": "f", "id": 181941, "pid": 76337, "tid": -914061504, "ts": 1716454224277228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224277229, "dur": 0, "args": { "External id": 181942, "cbid": 205, "correlation": 181942 } }, { "ph": "f", "id": 181942, "pid": 76337, "tid": -914061504, "ts": 1716454224277229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329339, "dur": 23, "args": { "External id": 181946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181946, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181946, "pid": 5, "tid": 7, "ts": 1716454224329339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277244, "dur": 12, "args": { "External id": 181946, "cbid": 211, "correlation": 181946 } }, { "ph": "s", "id": 181946, "pid": 76337, "tid": -914061504, "ts": 1716454224277244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224329363, "dur": 116, "args": { "External id": 181948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181948, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 181948, "pid": 5, "tid": 7, "ts": 1716454224329363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277262, "dur": 6, "args": { "External id": 181948, "cbid": 211, "correlation": 181948 } }, { "ph": "s", "id": 181948, "pid": 76337, "tid": -914061504, "ts": 1716454224277262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329481, "dur": 23, "args": { "External id": 181950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181950, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181950, "pid": 5, "tid": 7, "ts": 1716454224329481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277273, "dur": 5, "args": { "External id": 181950, "cbid": 211, "correlation": 181950 } }, { "ph": "s", "id": 181950, "pid": 76337, "tid": -914061504, "ts": 1716454224277273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224329505, "dur": 32, "args": { "External id": 181956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181956, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181956, "pid": 5, "tid": 7, "ts": 1716454224329505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277301, "dur": 8, "args": { "External id": 181956, "cbid": 211, "correlation": 181956 } }, { "ph": "s", "id": 181956, "pid": 76337, "tid": -914061504, "ts": 1716454224277301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224329538, "dur": 27, "args": { "External id": 181964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181964, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181964, "pid": 5, "tid": 7, "ts": 1716454224329538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277333, "dur": 8, "args": { "External id": 181964, "cbid": 211, "correlation": 181964 } }, { "ph": "s", "id": 181964, "pid": 76337, "tid": -914061504, "ts": 1716454224277333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224277405, "dur": 0, "args": { "External id": 181974, "cbid": 317, "correlation": 181974 } }, { "ph": "f", "id": 181974, "pid": 76337, "tid": -914061504, "ts": 1716454224277405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224277406, "dur": 0, "args": { "External id": 181975, "cbid": 203, "correlation": 181975 } }, { "ph": "f", "id": 181975, "pid": 76337, "tid": -914061504, "ts": 1716454224277406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224277406, "dur": 0, "args": { "External id": 181976, "cbid": 205, "correlation": 181976 } }, { "ph": "f", "id": 181976, "pid": 76337, "tid": -914061504, "ts": 1716454224277406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329567, "dur": 21, "args": { "External id": 181980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181980, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181980, "pid": 5, "tid": 7, "ts": 1716454224329567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277421, "dur": 13, "args": { "External id": 181980, "cbid": 211, "correlation": 181980 } }, { "ph": "s", "id": 181980, "pid": 76337, "tid": -914061504, "ts": 1716454224277421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329589, "dur": 43, "args": { "External id": 181982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181982, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181982, "pid": 5, "tid": 7, "ts": 1716454224329589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277436, "dur": 5, "args": { "External id": 181982, "cbid": 211, "correlation": 181982 } }, { "ph": "s", "id": 181982, "pid": 76337, "tid": -914061504, "ts": 1716454224277436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224329633, "dur": 229, "args": { "External id": 181984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181984, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 181984, "pid": 5, "tid": 7, "ts": 1716454224329633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277448, "dur": 7, "args": { "External id": 181984, "cbid": 211, "correlation": 181984 } }, { "ph": "s", "id": 181984, "pid": 76337, "tid": -914061504, "ts": 1716454224277448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329864, "dur": 6, "args": { "External id": 181986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181986, "pid": 5, "tid": 7, "ts": 1716454224329864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277458, "dur": 5, "args": { "External id": 181986, "cbid": 211, "correlation": 181986 } }, { "ph": "s", "id": 181986, "pid": 76337, "tid": -914061504, "ts": 1716454224277458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224329871, "dur": 9, "args": { "External id": 181992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 181992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 181992, "pid": 5, "tid": 7, "ts": 1716454224329871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277485, "dur": 8, "args": { "External id": 181992, "cbid": 211, "correlation": 181992 } }, { "ph": "s", "id": 181992, "pid": 76337, "tid": -914061504, "ts": 1716454224277485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224329882, "dur": 12, "args": { "External id": 182012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182012, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182012, "pid": 5, "tid": 7, "ts": 1716454224329882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277577, "dur": 13, "args": { "External id": 182012, "cbid": 211, "correlation": 182012 } }, { "ph": "s", "id": 182012, "pid": 76337, "tid": -914061504, "ts": 1716454224277577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224329894, "dur": 4, "args": { "External id": 182024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182024, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 182024, "pid": 5, "tid": 7, "ts": 1716454224329894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277600, "dur": 6, "args": { "External id": 182024, "cbid": 211, "correlation": 182024 } }, { "ph": "s", "id": 182024, "pid": 76337, "tid": -914061504, "ts": 1716454224277600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224329900, "dur": 12, "args": { "External id": 182027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182027, "pid": 5, "tid": 7, "ts": 1716454224329900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277618, "dur": 6, "args": { "External id": 182027, "cbid": 211, "correlation": 182027 } }, { "ph": "s", "id": 182027, "pid": 76337, "tid": -914061504, "ts": 1716454224277618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224329912, "dur": 6, "args": { "External id": 182036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182036, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182036, "pid": 5, "tid": 7, "ts": 1716454224329912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277656, "dur": 9, "args": { "External id": 182036, "cbid": 211, "correlation": 182036 } }, { "ph": "s", "id": 182036, "pid": 76337, "tid": -914061504, "ts": 1716454224277656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224277713, "dur": 0, "args": { "External id": 182046, "cbid": 317, "correlation": 182046 } }, { "ph": "f", "id": 182046, "pid": 76337, "tid": -914061504, "ts": 1716454224277713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224277714, "dur": 0, "args": { "External id": 182047, "cbid": 203, "correlation": 182047 } }, { "ph": "f", "id": 182047, "pid": 76337, "tid": -914061504, "ts": 1716454224277714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224277714, "dur": 0, "args": { "External id": 182048, "cbid": 205, "correlation": 182048 } }, { "ph": "f", "id": 182048, "pid": 76337, "tid": -914061504, "ts": 1716454224277714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329920, "dur": 5, "args": { "External id": 182052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182052, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182052, "pid": 5, "tid": 7, "ts": 1716454224329920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277730, "dur": 12, "args": { "External id": 182052, "cbid": 211, "correlation": 182052 } }, { "ph": "s", "id": 182052, "pid": 76337, "tid": -914061504, "ts": 1716454224277730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224329927, "dur": 81, "args": { "External id": 182054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182054, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182054, "pid": 5, "tid": 7, "ts": 1716454224329927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277745, "dur": 5, "args": { "External id": 182054, "cbid": 211, "correlation": 182054 } }, { "ph": "s", "id": 182054, "pid": 76337, "tid": -914061504, "ts": 1716454224277745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224330010, "dur": 1, "args": { "External id": 182056, "device": 5, "context": 1, "stream": 7, "correlation": 182056, "bytes": 960, "memory bandwidth (GB/s)": 0.6122448979591837 } }, { "ph": "f", "id": 182056, "pid": 5, "tid": 7, "ts": 1716454224330010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224277758, "dur": 9, "args": { "External id": 182056, "cbid": 51, "correlation": 182056 } }, { "ph": "s", "id": 182056, "pid": 76337, "tid": -914061504, "ts": 1716454224277758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224330014, "dur": 530, "args": { "External id": 182057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182057, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182057, "pid": 5, "tid": 7, "ts": 1716454224330014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277769, "dur": 9, "args": { "External id": 182057, "cbid": 211, "correlation": 182057 } }, { "ph": "s", "id": 182057, "pid": 76337, "tid": -914061504, "ts": 1716454224277769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224330545, "dur": 11, "args": { "External id": 182059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182059, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182059, "pid": 5, "tid": 7, "ts": 1716454224330545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277781, "dur": 5, "args": { "External id": 182059, "cbid": 211, "correlation": 182059 } }, { "ph": "s", "id": 182059, "pid": 76337, "tid": -914061504, "ts": 1716454224277781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224330558, "dur": 14, "args": { "External id": 182065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182065, "pid": 5, "tid": 7, "ts": 1716454224330558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277809, "dur": 8, "args": { "External id": 182065, "cbid": 211, "correlation": 182065 } }, { "ph": "s", "id": 182065, "pid": 76337, "tid": -914061504, "ts": 1716454224277809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224330573, "dur": 3, "args": { "External id": 182073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182073, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 182073, "pid": 5, "tid": 7, "ts": 1716454224330573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277853, "dur": 9, "args": { "External id": 182073, "cbid": 211, "correlation": 182073 } }, { "ph": "s", "id": 182073, "pid": 76337, "tid": -914061504, "ts": 1716454224277853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224277917, "dur": 1, "args": { "External id": 182089, "cbid": 251, "correlation": 182089 } }, { "ph": "f", "id": 182089, "pid": 76337, "tid": -914061504, "ts": 1716454224277917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224277922, "dur": 0, "args": { "External id": 182091, "cbid": 251, "correlation": 182091 } }, { "ph": "f", "id": 182091, "pid": 76337, "tid": -914061504, "ts": 1716454224277922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224330578, "dur": 13, "args": { "External id": 182092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182092, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182092, "pid": 5, "tid": 7, "ts": 1716454224330578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277924, "dur": 12, "args": { "External id": 182092, "cbid": 211, "correlation": 182092 } }, { "ph": "s", "id": 182092, "pid": 76337, "tid": -914061504, "ts": 1716454224277924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224330592, "dur": 5, "args": { "External id": 182094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182094, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182094, "pid": 5, "tid": 7, "ts": 1716454224330592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224277937, "dur": 6, "args": { "External id": 182094, "cbid": 211, "correlation": 182094 } }, { "ph": "s", "id": 182094, "pid": 76337, "tid": -914061504, "ts": 1716454224277937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224330599, "dur": 16, "args": { "External id": 182104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182104, "pid": 5, "tid": 7, "ts": 1716454224330599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278005, "dur": 13, "args": { "External id": 182104, "cbid": 211, "correlation": 182104 } }, { "ph": "s", "id": 182104, "pid": 76337, "tid": -914061504, "ts": 1716454224278005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224330616, "dur": 19, "args": { "External id": 182124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182124, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182124, "pid": 5, "tid": 7, "ts": 1716454224330616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278071, "dur": 12, "args": { "External id": 182124, "cbid": 211, "correlation": 182124 } }, { "ph": "s", "id": 182124, "pid": 76337, "tid": -914061504, "ts": 1716454224278071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224330636, "dur": 5, "args": { "External id": 182136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182136, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 182136, "pid": 5, "tid": 7, "ts": 1716454224330636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278093, "dur": 6, "args": { "External id": 182136, "cbid": 211, "correlation": 182136 } }, { "ph": "s", "id": 182136, "pid": 76337, "tid": -914061504, "ts": 1716454224278093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224330642, "dur": 17, "args": { "External id": 182139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182139, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182139, "pid": 5, "tid": 7, "ts": 1716454224330642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278112, "dur": 7, "args": { "External id": 182139, "cbid": 211, "correlation": 182139 } }, { "ph": "s", "id": 182139, "pid": 76337, "tid": -914061504, "ts": 1716454224278112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224330660, "dur": 11, "args": { "External id": 182148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182148, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182148, "pid": 5, "tid": 7, "ts": 1716454224330660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278153, "dur": 10, "args": { "External id": 182148, "cbid": 211, "correlation": 182148 } }, { "ph": "s", "id": 182148, "pid": 76337, "tid": -914061504, "ts": 1716454224278153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224278219, "dur": 0, "args": { "External id": 182158, "cbid": 317, "correlation": 182158 } }, { "ph": "f", "id": 182158, "pid": 76337, "tid": -914061504, "ts": 1716454224278219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224278220, "dur": 0, "args": { "External id": 182159, "cbid": 203, "correlation": 182159 } }, { "ph": "f", "id": 182159, "pid": 76337, "tid": -914061504, "ts": 1716454224278220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224278221, "dur": 0, "args": { "External id": 182160, "cbid": 205, "correlation": 182160 } }, { "ph": "f", "id": 182160, "pid": 76337, "tid": -914061504, "ts": 1716454224278221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224330672, "dur": 11, "args": { "External id": 182164, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182164, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182164, "pid": 5, "tid": 7, "ts": 1716454224330672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278239, "dur": 12, "args": { "External id": 182164, "cbid": 211, "correlation": 182164 } }, { "ph": "s", "id": 182164, "pid": 76337, "tid": -914061504, "ts": 1716454224278239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224330685, "dur": 160, "args": { "External id": 182166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182166, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182166, "pid": 5, "tid": 7, "ts": 1716454224330685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278253, "dur": 5, "args": { "External id": 182166, "cbid": 211, "correlation": 182166 } }, { "ph": "s", "id": 182166, "pid": 76337, "tid": -914061504, "ts": 1716454224278253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224330847, "dur": 1, "args": { "External id": 182168, "device": 5, "context": 1, "stream": 7, "correlation": 182168, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 182168, "pid": 5, "tid": 7, "ts": 1716454224330847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224278265, "dur": 7, "args": { "External id": 182168, "cbid": 51, "correlation": 182168 } }, { "ph": "s", "id": 182168, "pid": 76337, "tid": -914061504, "ts": 1716454224278265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224330851, "dur": 658, "args": { "External id": 182169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182169, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182169, "pid": 5, "tid": 7, "ts": 1716454224330851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278273, "dur": 6, "args": { "External id": 182169, "cbid": 211, "correlation": 182169 } }, { "ph": "s", "id": 182169, "pid": 76337, "tid": -914061504, "ts": 1716454224278273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224331510, "dur": 12, "args": { "External id": 182171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182171, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182171, "pid": 5, "tid": 7, "ts": 1716454224331510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278284, "dur": 6, "args": { "External id": 182171, "cbid": 211, "correlation": 182171 } }, { "ph": "s", "id": 182171, "pid": 76337, "tid": -914061504, "ts": 1716454224278284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224331524, "dur": 14, "args": { "External id": 182177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182177, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182177, "pid": 5, "tid": 7, "ts": 1716454224331524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278313, "dur": 8, "args": { "External id": 182177, "cbid": 211, "correlation": 182177 } }, { "ph": "s", "id": 182177, "pid": 76337, "tid": -914061504, "ts": 1716454224278313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224278371, "dur": 0, "args": { "External id": 182187, "cbid": 317, "correlation": 182187 } }, { "ph": "f", "id": 182187, "pid": 76337, "tid": -914061504, "ts": 1716454224278371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224278372, "dur": 0, "args": { "External id": 182188, "cbid": 203, "correlation": 182188 } }, { "ph": "f", "id": 182188, "pid": 76337, "tid": -914061504, "ts": 1716454224278372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224278372, "dur": 0, "args": { "External id": 182189, "cbid": 205, "correlation": 182189 } }, { "ph": "f", "id": 182189, "pid": 76337, "tid": -914061504, "ts": 1716454224278372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224331540, "dur": 8, "args": { "External id": 182193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182193, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182193, "pid": 5, "tid": 7, "ts": 1716454224331540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278388, "dur": 12, "args": { "External id": 182193, "cbid": 211, "correlation": 182193 } }, { "ph": "s", "id": 182193, "pid": 76337, "tid": -914061504, "ts": 1716454224278388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224331549, "dur": 3, "args": { "External id": 182195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182195, "pid": 5, "tid": 7, "ts": 1716454224331549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278406, "dur": 6, "args": { "External id": 182195, "cbid": 211, "correlation": 182195 } }, { "ph": "s", "id": 182195, "pid": 76337, "tid": -914061504, "ts": 1716454224278406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224278415, "dur": 0, "args": { "External id": 182196, "cbid": 51, "correlation": 182196 } }, { "ph": "s", "id": 182196, "pid": 76337, "tid": -914061504, "ts": 1716454224278415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224331554, "dur": 56, "args": { "External id": 182197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182197, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 182197, "pid": 5, "tid": 7, "ts": 1716454224331554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278416, "dur": 5, "args": { "External id": 182197, "cbid": 211, "correlation": 182197 } }, { "ph": "s", "id": 182197, "pid": 76337, "tid": -914061504, "ts": 1716454224278416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224331611, "dur": 14, "args": { "External id": 182202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182202, "pid": 5, "tid": 7, "ts": 1716454224331611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278442, "dur": 8, "args": { "External id": 182202, "cbid": 211, "correlation": 182202 } }, { "ph": "s", "id": 182202, "pid": 76337, "tid": -914061504, "ts": 1716454224278442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224331626, "dur": 12, "args": { "External id": 182210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182210, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182210, "pid": 5, "tid": 7, "ts": 1716454224331626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278470, "dur": 8, "args": { "External id": 182210, "cbid": 211, "correlation": 182210 } }, { "ph": "s", "id": 182210, "pid": 76337, "tid": -914061504, "ts": 1716454224278470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224331639, "dur": 11, "args": { "External id": 182218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182218, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182218, "pid": 5, "tid": 7, "ts": 1716454224331639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278500, "dur": 8, "args": { "External id": 182218, "cbid": 211, "correlation": 182218 } }, { "ph": "s", "id": 182218, "pid": 76337, "tid": -914061504, "ts": 1716454224278500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224331651, "dur": 19, "args": { "External id": 182238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182238, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182238, "pid": 5, "tid": 7, "ts": 1716454224331651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278580, "dur": 13, "args": { "External id": 182238, "cbid": 211, "correlation": 182238 } }, { "ph": "s", "id": 182238, "pid": 76337, "tid": -914061504, "ts": 1716454224278580, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224331671, "dur": 4, "args": { "External id": 182250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182250, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 182250, "pid": 5, "tid": 7, "ts": 1716454224331671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278603, "dur": 6, "args": { "External id": 182250, "cbid": 211, "correlation": 182250 } }, { "ph": "s", "id": 182250, "pid": 76337, "tid": -914061504, "ts": 1716454224278603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224331677, "dur": 16, "args": { "External id": 182253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182253, "pid": 5, "tid": 7, "ts": 1716454224331677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278620, "dur": 7, "args": { "External id": 182253, "cbid": 211, "correlation": 182253 } }, { "ph": "s", "id": 182253, "pid": 76337, "tid": -914061504, "ts": 1716454224278620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224278680, "dur": 0, "args": { "External id": 182264, "cbid": 317, "correlation": 182264 } }, { "ph": "f", "id": 182264, "pid": 76337, "tid": -914061504, "ts": 1716454224278680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224278681, "dur": 0, "args": { "External id": 182265, "cbid": 203, "correlation": 182265 } }, { "ph": "f", "id": 182265, "pid": 76337, "tid": -914061504, "ts": 1716454224278681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224278682, "dur": 0, "args": { "External id": 182266, "cbid": 205, "correlation": 182266 } }, { "ph": "f", "id": 182266, "pid": 76337, "tid": -914061504, "ts": 1716454224278682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224331694, "dur": 11, "args": { "External id": 182270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182270, "pid": 5, "tid": 7, "ts": 1716454224331694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278698, "dur": 12, "args": { "External id": 182270, "cbid": 211, "correlation": 182270 } }, { "ph": "s", "id": 182270, "pid": 76337, "tid": -914061504, "ts": 1716454224278698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224331707, "dur": 3, "args": { "External id": 182272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182272, "pid": 5, "tid": 7, "ts": 1716454224331707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278714, "dur": 6, "args": { "External id": 182272, "cbid": 211, "correlation": 182272 } }, { "ph": "s", "id": 182272, "pid": 76337, "tid": -914061504, "ts": 1716454224278714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224278723, "dur": 0, "args": { "External id": 182273, "cbid": 51, "correlation": 182273 } }, { "ph": "s", "id": 182273, "pid": 76337, "tid": -914061504, "ts": 1716454224278723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224331711, "dur": 95, "args": { "External id": 182274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182274, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 182274, "pid": 5, "tid": 7, "ts": 1716454224331711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278724, "dur": 6, "args": { "External id": 182274, "cbid": 211, "correlation": 182274 } }, { "ph": "s", "id": 182274, "pid": 76337, "tid": -914061504, "ts": 1716454224278724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224331808, "dur": 15, "args": { "External id": 182279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182279, "pid": 5, "tid": 7, "ts": 1716454224331808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278753, "dur": 8, "args": { "External id": 182279, "cbid": 211, "correlation": 182279 } }, { "ph": "s", "id": 182279, "pid": 76337, "tid": -914061504, "ts": 1716454224278753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224331824, "dur": 81, "args": { "External id": 182288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182288, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182288, "pid": 5, "tid": 7, "ts": 1716454224331824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278835, "dur": 14, "args": { "External id": 182288, "cbid": 211, "correlation": 182288 } }, { "ph": "s", "id": 182288, "pid": 76337, "tid": -914061504, "ts": 1716454224278835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224331906, "dur": 30, "args": { "External id": 182310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182310, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182310, "pid": 5, "tid": 7, "ts": 1716454224331906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224278896, "dur": 11, "args": { "External id": 182310, "cbid": 211, "correlation": 182310 } }, { "ph": "s", "id": 182310, "pid": 76337, "tid": -914061504, "ts": 1716454224278896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279002, "dur": 2, "args": { "External id": 182321, "cbid": 251, "correlation": 182321 } }, { "ph": "f", "id": 182321, "pid": 76337, "tid": -914061504, "ts": 1716454224279002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224331937, "dur": 163, "args": { "External id": 182322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182322, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182322, "pid": 5, "tid": 7, "ts": 1716454224331937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279008, "dur": 13, "args": { "External id": 182322, "cbid": 211, "correlation": 182322 } }, { "ph": "s", "id": 182322, "pid": 76337, "tid": -914061504, "ts": 1716454224279008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279081, "dur": 1, "args": { "External id": 182333, "cbid": 251, "correlation": 182333 } }, { "ph": "f", "id": 182333, "pid": 76337, "tid": -914061504, "ts": 1716454224279081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224332101, "dur": 135, "args": { "External id": 182334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182334, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182334, "pid": 5, "tid": 7, "ts": 1716454224332101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279085, "dur": 11, "args": { "External id": 182334, "cbid": 211, "correlation": 182334 } }, { "ph": "s", "id": 182334, "pid": 76337, "tid": -914061504, "ts": 1716454224279085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279151, "dur": 1, "args": { "External id": 182345, "cbid": 251, "correlation": 182345 } }, { "ph": "f", "id": 182345, "pid": 76337, "tid": -914061504, "ts": 1716454224279151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224332238, "dur": 154, "args": { "External id": 182346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182346, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182346, "pid": 5, "tid": 7, "ts": 1716454224332238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279156, "dur": 11, "args": { "External id": 182346, "cbid": 211, "correlation": 182346 } }, { "ph": "s", "id": 182346, "pid": 76337, "tid": -914061504, "ts": 1716454224279156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224332393, "dur": 331, "args": { "External id": 182371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182371, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182371, "pid": 5, "tid": 7, "ts": 1716454224332393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279242, "dur": 14, "args": { "External id": 182371, "cbid": 211, "correlation": 182371 } }, { "ph": "s", "id": 182371, "pid": 76337, "tid": -914061504, "ts": 1716454224279242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279347, "dur": 1, "args": { "External id": 182389, "cbid": 251, "correlation": 182389 } }, { "ph": "f", "id": 182389, "pid": 76337, "tid": -914061504, "ts": 1716454224279347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224332725, "dur": 164, "args": { "External id": 182391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182391, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182391, "pid": 5, "tid": 7, "ts": 1716454224332725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279353, "dur": 14, "args": { "External id": 182391, "cbid": 211, "correlation": 182391 } }, { "ph": "s", "id": 182391, "pid": 76337, "tid": -914061504, "ts": 1716454224279353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224332891, "dur": 19, "args": { "External id": 182399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182399, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182399, "pid": 5, "tid": 7, "ts": 1716454224332891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279425, "dur": 12, "args": { "External id": 182399, "cbid": 211, "correlation": 182399 } }, { "ph": "s", "id": 182399, "pid": 76337, "tid": -914061504, "ts": 1716454224279425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224332912, "dur": 27, "args": { "External id": 182407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182407, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182407, "pid": 5, "tid": 7, "ts": 1716454224332912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279465, "dur": 9, "args": { "External id": 182407, "cbid": 211, "correlation": 182407 } }, { "ph": "s", "id": 182407, "pid": 76337, "tid": -914061504, "ts": 1716454224279465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224332940, "dur": 18, "args": { "External id": 182418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182418, "pid": 5, "tid": 7, "ts": 1716454224332940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279537, "dur": 12, "args": { "External id": 182418, "cbid": 211, "correlation": 182418 } }, { "ph": "s", "id": 182418, "pid": 76337, "tid": -914061504, "ts": 1716454224279537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224332960, "dur": 16, "args": { "External id": 182440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182440, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182440, "pid": 5, "tid": 7, "ts": 1716454224332960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279569, "dur": 8, "args": { "External id": 182440, "cbid": 211, "correlation": 182440 } }, { "ph": "s", "id": 182440, "pid": 76337, "tid": -914061504, "ts": 1716454224279569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279653, "dur": 1, "args": { "External id": 182451, "cbid": 251, "correlation": 182451 } }, { "ph": "f", "id": 182451, "pid": 76337, "tid": -914061504, "ts": 1716454224279653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224332977, "dur": 89, "args": { "External id": 182452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182452, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182452, "pid": 5, "tid": 7, "ts": 1716454224332977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279660, "dur": 16, "args": { "External id": 182452, "cbid": 211, "correlation": 182452 } }, { "ph": "s", "id": 182452, "pid": 76337, "tid": -914061504, "ts": 1716454224279660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279732, "dur": 1, "args": { "External id": 182463, "cbid": 251, "correlation": 182463 } }, { "ph": "f", "id": 182463, "pid": 76337, "tid": -914061504, "ts": 1716454224279732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279736, "dur": 0, "args": { "External id": 182464, "cbid": 251, "correlation": 182464 } }, { "ph": "f", "id": 182464, "pid": 76337, "tid": -914061504, "ts": 1716454224279736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224333068, "dur": 12, "args": { "External id": 182465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182465, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182465, "pid": 5, "tid": 7, "ts": 1716454224333068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279738, "dur": 12, "args": { "External id": 182465, "cbid": 211, "correlation": 182465 } }, { "ph": "s", "id": 182465, "pid": 76337, "tid": -914061504, "ts": 1716454224279738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224333081, "dur": 6, "args": { "External id": 182467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182467, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182467, "pid": 5, "tid": 7, "ts": 1716454224333081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279751, "dur": 6, "args": { "External id": 182467, "cbid": 211, "correlation": 182467 } }, { "ph": "s", "id": 182467, "pid": 76337, "tid": -914061504, "ts": 1716454224279751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279809, "dur": 1, "args": { "External id": 182478, "cbid": 251, "correlation": 182478 } }, { "ph": "f", "id": 182478, "pid": 76337, "tid": -914061504, "ts": 1716454224279809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224279812, "dur": 0, "args": { "External id": 182479, "cbid": 251, "correlation": 182479 } }, { "ph": "f", "id": 182479, "pid": 76337, "tid": -914061504, "ts": 1716454224279812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224333088, "dur": 8, "args": { "External id": 182480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182480, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182480, "pid": 5, "tid": 7, "ts": 1716454224333088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279814, "dur": 12, "args": { "External id": 182480, "cbid": 211, "correlation": 182480 } }, { "ph": "s", "id": 182480, "pid": 76337, "tid": -914061504, "ts": 1716454224279814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224333097, "dur": 3, "args": { "External id": 182482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182482, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182482, "pid": 5, "tid": 7, "ts": 1716454224333097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279827, "dur": 5, "args": { "External id": 182482, "cbid": 211, "correlation": 182482 } }, { "ph": "s", "id": 182482, "pid": 76337, "tid": -914061504, "ts": 1716454224279827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224333101, "dur": 54, "args": { "External id": 182507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182507, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182507, "pid": 5, "tid": 7, "ts": 1716454224333101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224279903, "dur": 12, "args": { "External id": 182507, "cbid": 211, "correlation": 182507 } }, { "ph": "s", "id": 182507, "pid": 76337, "tid": -914061504, "ts": 1716454224279903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224280015, "dur": 2, "args": { "External id": 182525, "cbid": 251, "correlation": 182525 } }, { "ph": "f", "id": 182525, "pid": 76337, "tid": -914061504, "ts": 1716454224280015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224333157, "dur": 90, "args": { "External id": 182527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182527, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182527, "pid": 5, "tid": 7, "ts": 1716454224333157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280021, "dur": 15, "args": { "External id": 182527, "cbid": 211, "correlation": 182527 } }, { "ph": "s", "id": 182527, "pid": 76337, "tid": -914061504, "ts": 1716454224280021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224333248, "dur": 9, "args": { "External id": 182535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182535, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182535, "pid": 5, "tid": 7, "ts": 1716454224333248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280094, "dur": 12, "args": { "External id": 182535, "cbid": 211, "correlation": 182535 } }, { "ph": "s", "id": 182535, "pid": 76337, "tid": -914061504, "ts": 1716454224280094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224333259, "dur": 21, "args": { "External id": 182543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182543, "pid": 5, "tid": 7, "ts": 1716454224333259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280135, "dur": 9, "args": { "External id": 182543, "cbid": 211, "correlation": 182543 } }, { "ph": "s", "id": 182543, "pid": 76337, "tid": -914061504, "ts": 1716454224280135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224333281, "dur": 18, "args": { "External id": 182565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182565, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182565, "pid": 5, "tid": 7, "ts": 1716454224333281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280187, "dur": 9, "args": { "External id": 182565, "cbid": 211, "correlation": 182565 } }, { "ph": "s", "id": 182565, "pid": 76337, "tid": -914061504, "ts": 1716454224280187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224280276, "dur": 2, "args": { "External id": 182581, "cbid": 251, "correlation": 182581 } }, { "ph": "f", "id": 182581, "pid": 76337, "tid": -914061504, "ts": 1716454224280276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224280283, "dur": 0, "args": { "External id": 182583, "cbid": 251, "correlation": 182583 } }, { "ph": "f", "id": 182583, "pid": 76337, "tid": -914061504, "ts": 1716454224280283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224333300, "dur": 494, "args": { "External id": 182584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182584, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182584, "pid": 5, "tid": 7, "ts": 1716454224333300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280285, "dur": 15, "args": { "External id": 182584, "cbid": 211, "correlation": 182584 } }, { "ph": "s", "id": 182584, "pid": 76337, "tid": -914061504, "ts": 1716454224280285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224333795, "dur": 65, "args": { "External id": 182592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182592, "pid": 5, "tid": 7, "ts": 1716454224333795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280352, "dur": 12, "args": { "External id": 182592, "cbid": 211, "correlation": 182592 } }, { "ph": "s", "id": 182592, "pid": 76337, "tid": -914061504, "ts": 1716454224280352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224333862, "dur": 68, "args": { "External id": 182600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182600, "pid": 5, "tid": 7, "ts": 1716454224333862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280384, "dur": 8, "args": { "External id": 182600, "cbid": 211, "correlation": 182600 } }, { "ph": "s", "id": 182600, "pid": 76337, "tid": -914061504, "ts": 1716454224280384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224280464, "dur": 1, "args": { "External id": 182616, "cbid": 251, "correlation": 182616 } }, { "ph": "f", "id": 182616, "pid": 76337, "tid": -914061504, "ts": 1716454224280464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224333932, "dur": 1, "args": { "External id": 182618, "device": 5, "context": 1, "stream": 7, "correlation": 182618, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 182618, "pid": 5, "tid": 7, "ts": 1716454224333932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224280469, "dur": 12, "args": { "External id": 182618, "cbid": 51, "correlation": 182618 } }, { "ph": "s", "id": 182618, "pid": 76337, "tid": -914061504, "ts": 1716454224280469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224333936, "dur": 266, "args": { "External id": 182619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182619, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182619, "pid": 5, "tid": 7, "ts": 1716454224333936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280484, "dur": 11, "args": { "External id": 182619, "cbid": 211, "correlation": 182619 } }, { "ph": "s", "id": 182619, "pid": 76337, "tid": -914061504, "ts": 1716454224280484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224334204, "dur": 14, "args": { "External id": 182627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182627, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182627, "pid": 5, "tid": 7, "ts": 1716454224334204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280525, "dur": 11, "args": { "External id": 182627, "cbid": 211, "correlation": 182627 } }, { "ph": "s", "id": 182627, "pid": 76337, "tid": -914061504, "ts": 1716454224280525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224334219, "dur": 37, "args": { "External id": 182638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182638, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182638, "pid": 5, "tid": 7, "ts": 1716454224334219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280593, "dur": 12, "args": { "External id": 182638, "cbid": 211, "correlation": 182638 } }, { "ph": "s", "id": 182638, "pid": 76337, "tid": -914061504, "ts": 1716454224280593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224280662, "dur": 0, "args": { "External id": 182650, "cbid": 317, "correlation": 182650 } }, { "ph": "f", "id": 182650, "pid": 76337, "tid": -914061504, "ts": 1716454224280662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224280662, "dur": 0, "args": { "External id": 182651, "cbid": 203, "correlation": 182651 } }, { "ph": "f", "id": 182651, "pid": 76337, "tid": -914061504, "ts": 1716454224280662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224280663, "dur": 0, "args": { "External id": 182652, "cbid": 205, "correlation": 182652 } }, { "ph": "f", "id": 182652, "pid": 76337, "tid": -914061504, "ts": 1716454224280663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224334258, "dur": 13, "args": { "External id": 182656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182656, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182656, "pid": 5, "tid": 7, "ts": 1716454224334258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280679, "dur": 13, "args": { "External id": 182656, "cbid": 211, "correlation": 182656 } }, { "ph": "s", "id": 182656, "pid": 76337, "tid": -914061504, "ts": 1716454224280679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224334272, "dur": 4, "args": { "External id": 182658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182658, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182658, "pid": 5, "tid": 7, "ts": 1716454224334272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280696, "dur": 6, "args": { "External id": 182658, "cbid": 211, "correlation": 182658 } }, { "ph": "s", "id": 182658, "pid": 76337, "tid": -914061504, "ts": 1716454224280696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224280705, "dur": 0, "args": { "External id": 182659, "cbid": 51, "correlation": 182659 } }, { "ph": "s", "id": 182659, "pid": 76337, "tid": -914061504, "ts": 1716454224280705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224334277, "dur": 94, "args": { "External id": 182660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182660, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 182660, "pid": 5, "tid": 7, "ts": 1716454224334277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280706, "dur": 5, "args": { "External id": 182660, "cbid": 211, "correlation": 182660 } }, { "ph": "s", "id": 182660, "pid": 76337, "tid": -914061504, "ts": 1716454224280706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224334373, "dur": 16, "args": { "External id": 182665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182665, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182665, "pid": 5, "tid": 7, "ts": 1716454224334373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280733, "dur": 10, "args": { "External id": 182665, "cbid": 211, "correlation": 182665 } }, { "ph": "s", "id": 182665, "pid": 76337, "tid": -914061504, "ts": 1716454224280733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224334390, "dur": 11, "args": { "External id": 182673, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182673, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182673, "pid": 5, "tid": 7, "ts": 1716454224334390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280766, "dur": 8, "args": { "External id": 182673, "cbid": 211, "correlation": 182673 } }, { "ph": "s", "id": 182673, "pid": 76337, "tid": -914061504, "ts": 1716454224280766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224334403, "dur": 17, "args": { "External id": 182693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182693, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182693, "pid": 5, "tid": 7, "ts": 1716454224334403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280838, "dur": 12, "args": { "External id": 182693, "cbid": 211, "correlation": 182693 } }, { "ph": "s", "id": 182693, "pid": 76337, "tid": -914061504, "ts": 1716454224280838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224334421, "dur": 5, "args": { "External id": 182705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182705, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 182705, "pid": 5, "tid": 7, "ts": 1716454224334421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280861, "dur": 6, "args": { "External id": 182705, "cbid": 211, "correlation": 182705 } }, { "ph": "s", "id": 182705, "pid": 76337, "tid": -914061504, "ts": 1716454224280861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224334427, "dur": 18, "args": { "External id": 182708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182708, "pid": 5, "tid": 7, "ts": 1716454224334427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280879, "dur": 7, "args": { "External id": 182708, "cbid": 211, "correlation": 182708 } }, { "ph": "s", "id": 182708, "pid": 76337, "tid": -914061504, "ts": 1716454224280879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224334447, "dur": 12, "args": { "External id": 182717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182717, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182717, "pid": 5, "tid": 7, "ts": 1716454224334447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280918, "dur": 9, "args": { "External id": 182717, "cbid": 211, "correlation": 182717 } }, { "ph": "s", "id": 182717, "pid": 76337, "tid": -914061504, "ts": 1716454224280918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224280969, "dur": 0, "args": { "External id": 182727, "cbid": 317, "correlation": 182727 } }, { "ph": "f", "id": 182727, "pid": 76337, "tid": -914061504, "ts": 1716454224280969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224280970, "dur": 0, "args": { "External id": 182728, "cbid": 203, "correlation": 182728 } }, { "ph": "f", "id": 182728, "pid": 76337, "tid": -914061504, "ts": 1716454224280970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224280970, "dur": 0, "args": { "External id": 182729, "cbid": 205, "correlation": 182729 } }, { "ph": "f", "id": 182729, "pid": 76337, "tid": -914061504, "ts": 1716454224280970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224334460, "dur": 11, "args": { "External id": 182733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182733, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182733, "pid": 5, "tid": 7, "ts": 1716454224334460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224280993, "dur": 12, "args": { "External id": 182733, "cbid": 211, "correlation": 182733 } }, { "ph": "s", "id": 182733, "pid": 76337, "tid": -914061504, "ts": 1716454224280993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224334472, "dur": 160, "args": { "External id": 182735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182735, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182735, "pid": 5, "tid": 7, "ts": 1716454224334472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281008, "dur": 5, "args": { "External id": 182735, "cbid": 211, "correlation": 182735 } }, { "ph": "s", "id": 182735, "pid": 76337, "tid": -914061504, "ts": 1716454224281008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224334634, "dur": 1, "args": { "External id": 182737, "device": 5, "context": 1, "stream": 7, "correlation": 182737, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 182737, "pid": 5, "tid": 7, "ts": 1716454224334634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224281019, "dur": 6, "args": { "External id": 182737, "cbid": 51, "correlation": 182737 } }, { "ph": "s", "id": 182737, "pid": 76337, "tid": -914061504, "ts": 1716454224281019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224334638, "dur": 657, "args": { "External id": 182738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182738, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182738, "pid": 5, "tid": 7, "ts": 1716454224334638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281027, "dur": 6, "args": { "External id": 182738, "cbid": 211, "correlation": 182738 } }, { "ph": "s", "id": 182738, "pid": 76337, "tid": -914061504, "ts": 1716454224281027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224335296, "dur": 13, "args": { "External id": 182740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182740, "pid": 5, "tid": 7, "ts": 1716454224335296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281037, "dur": 5, "args": { "External id": 182740, "cbid": 211, "correlation": 182740 } }, { "ph": "s", "id": 182740, "pid": 76337, "tid": -914061504, "ts": 1716454224281037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224335311, "dur": 14, "args": { "External id": 182746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182746, "pid": 5, "tid": 7, "ts": 1716454224335311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281065, "dur": 8, "args": { "External id": 182746, "cbid": 211, "correlation": 182746 } }, { "ph": "s", "id": 182746, "pid": 76337, "tid": -914061504, "ts": 1716454224281065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224335326, "dur": 3, "args": { "External id": 182754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182754, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 182754, "pid": 5, "tid": 7, "ts": 1716454224335326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281108, "dur": 9, "args": { "External id": 182754, "cbid": 211, "correlation": 182754 } }, { "ph": "s", "id": 182754, "pid": 76337, "tid": -914061504, "ts": 1716454224281108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224281172, "dur": 1, "args": { "External id": 182770, "cbid": 251, "correlation": 182770 } }, { "ph": "f", "id": 182770, "pid": 76337, "tid": -914061504, "ts": 1716454224281172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224281177, "dur": 0, "args": { "External id": 182772, "cbid": 251, "correlation": 182772 } }, { "ph": "f", "id": 182772, "pid": 76337, "tid": -914061504, "ts": 1716454224281177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224335331, "dur": 13, "args": { "External id": 182773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182773, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182773, "pid": 5, "tid": 7, "ts": 1716454224335331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281179, "dur": 11, "args": { "External id": 182773, "cbid": 211, "correlation": 182773 } }, { "ph": "s", "id": 182773, "pid": 76337, "tid": -914061504, "ts": 1716454224281179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224335345, "dur": 5, "args": { "External id": 182775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182775, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182775, "pid": 5, "tid": 7, "ts": 1716454224335345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281192, "dur": 5, "args": { "External id": 182775, "cbid": 211, "correlation": 182775 } }, { "ph": "s", "id": 182775, "pid": 76337, "tid": -914061504, "ts": 1716454224281192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224335351, "dur": 16, "args": { "External id": 182785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182785, "pid": 5, "tid": 7, "ts": 1716454224335351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281250, "dur": 12, "args": { "External id": 182785, "cbid": 211, "correlation": 182785 } }, { "ph": "s", "id": 182785, "pid": 76337, "tid": -914061504, "ts": 1716454224281250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224335369, "dur": 18, "args": { "External id": 182805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182805, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182805, "pid": 5, "tid": 7, "ts": 1716454224335369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281316, "dur": 10, "args": { "External id": 182805, "cbid": 211, "correlation": 182805 } }, { "ph": "s", "id": 182805, "pid": 76337, "tid": -914061504, "ts": 1716454224281316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224335389, "dur": 4, "args": { "External id": 182817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182817, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 182817, "pid": 5, "tid": 7, "ts": 1716454224335389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281336, "dur": 6, "args": { "External id": 182817, "cbid": 211, "correlation": 182817 } }, { "ph": "s", "id": 182817, "pid": 76337, "tid": -914061504, "ts": 1716454224281336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224335394, "dur": 17, "args": { "External id": 182820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182820, "pid": 5, "tid": 7, "ts": 1716454224335394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281357, "dur": 6, "args": { "External id": 182820, "cbid": 211, "correlation": 182820 } }, { "ph": "s", "id": 182820, "pid": 76337, "tid": -914061504, "ts": 1716454224281357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224335412, "dur": 11, "args": { "External id": 182829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182829, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182829, "pid": 5, "tid": 7, "ts": 1716454224335412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281396, "dur": 10, "args": { "External id": 182829, "cbid": 211, "correlation": 182829 } }, { "ph": "s", "id": 182829, "pid": 76337, "tid": -914061504, "ts": 1716454224281396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224281458, "dur": 0, "args": { "External id": 182839, "cbid": 317, "correlation": 182839 } }, { "ph": "f", "id": 182839, "pid": 76337, "tid": -914061504, "ts": 1716454224281458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224281459, "dur": 0, "args": { "External id": 182840, "cbid": 203, "correlation": 182840 } }, { "ph": "f", "id": 182840, "pid": 76337, "tid": -914061504, "ts": 1716454224281459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224281459, "dur": 0, "args": { "External id": 182841, "cbid": 205, "correlation": 182841 } }, { "ph": "f", "id": 182841, "pid": 76337, "tid": -914061504, "ts": 1716454224281459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224335425, "dur": 11, "args": { "External id": 182845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182845, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182845, "pid": 5, "tid": 7, "ts": 1716454224335425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281473, "dur": 12, "args": { "External id": 182845, "cbid": 211, "correlation": 182845 } }, { "ph": "s", "id": 182845, "pid": 76337, "tid": -914061504, "ts": 1716454224281473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224335437, "dur": 160, "args": { "External id": 182847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182847, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182847, "pid": 5, "tid": 7, "ts": 1716454224335437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281487, "dur": 5, "args": { "External id": 182847, "cbid": 211, "correlation": 182847 } }, { "ph": "s", "id": 182847, "pid": 76337, "tid": -914061504, "ts": 1716454224281487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224335599, "dur": 1, "args": { "External id": 182849, "device": 5, "context": 1, "stream": 7, "correlation": 182849, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 182849, "pid": 5, "tid": 7, "ts": 1716454224335599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224281498, "dur": 8, "args": { "External id": 182849, "cbid": 51, "correlation": 182849 } }, { "ph": "s", "id": 182849, "pid": 76337, "tid": -914061504, "ts": 1716454224281498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224335603, "dur": 640, "args": { "External id": 182850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182850, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182850, "pid": 5, "tid": 7, "ts": 1716454224335603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281507, "dur": 6, "args": { "External id": 182850, "cbid": 211, "correlation": 182850 } }, { "ph": "s", "id": 182850, "pid": 76337, "tid": -914061504, "ts": 1716454224281507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224336245, "dur": 12, "args": { "External id": 182852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182852, "pid": 5, "tid": 7, "ts": 1716454224336245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281517, "dur": 5, "args": { "External id": 182852, "cbid": 211, "correlation": 182852 } }, { "ph": "s", "id": 182852, "pid": 76337, "tid": -914061504, "ts": 1716454224281517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224336258, "dur": 14, "args": { "External id": 182858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182858, "pid": 5, "tid": 7, "ts": 1716454224336258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281544, "dur": 9, "args": { "External id": 182858, "cbid": 211, "correlation": 182858 } }, { "ph": "s", "id": 182858, "pid": 76337, "tid": -914061504, "ts": 1716454224281544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224336273, "dur": 12, "args": { "External id": 182866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182866, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182866, "pid": 5, "tid": 7, "ts": 1716454224336273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281577, "dur": 8, "args": { "External id": 182866, "cbid": 211, "correlation": 182866 } }, { "ph": "s", "id": 182866, "pid": 76337, "tid": -914061504, "ts": 1716454224281577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224336286, "dur": 11, "args": { "External id": 182874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182874, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182874, "pid": 5, "tid": 7, "ts": 1716454224336286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281605, "dur": 8, "args": { "External id": 182874, "cbid": 211, "correlation": 182874 } }, { "ph": "s", "id": 182874, "pid": 76337, "tid": -914061504, "ts": 1716454224281605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224336299, "dur": 18, "args": { "External id": 182894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182894, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 182894, "pid": 5, "tid": 7, "ts": 1716454224336299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281682, "dur": 12, "args": { "External id": 182894, "cbid": 211, "correlation": 182894 } }, { "ph": "s", "id": 182894, "pid": 76337, "tid": -914061504, "ts": 1716454224281682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224336318, "dur": 4, "args": { "External id": 182906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182906, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 182906, "pid": 5, "tid": 7, "ts": 1716454224336318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281704, "dur": 7, "args": { "External id": 182906, "cbid": 211, "correlation": 182906 } }, { "ph": "s", "id": 182906, "pid": 76337, "tid": -914061504, "ts": 1716454224281704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224336323, "dur": 16, "args": { "External id": 182909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182909, "pid": 5, "tid": 7, "ts": 1716454224336323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281723, "dur": 6, "args": { "External id": 182909, "cbid": 211, "correlation": 182909 } }, { "ph": "s", "id": 182909, "pid": 76337, "tid": -914061504, "ts": 1716454224281723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224281779, "dur": 0, "args": { "External id": 182920, "cbid": 317, "correlation": 182920 } }, { "ph": "f", "id": 182920, "pid": 76337, "tid": -914061504, "ts": 1716454224281779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224281780, "dur": 0, "args": { "External id": 182921, "cbid": 203, "correlation": 182921 } }, { "ph": "f", "id": 182921, "pid": 76337, "tid": -914061504, "ts": 1716454224281780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224281781, "dur": 0, "args": { "External id": 182922, "cbid": 205, "correlation": 182922 } }, { "ph": "f", "id": 182922, "pid": 76337, "tid": -914061504, "ts": 1716454224281781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224336340, "dur": 11, "args": { "External id": 182926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182926, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182926, "pid": 5, "tid": 7, "ts": 1716454224336340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281795, "dur": 12, "args": { "External id": 182926, "cbid": 211, "correlation": 182926 } }, { "ph": "s", "id": 182926, "pid": 76337, "tid": -914061504, "ts": 1716454224281795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224336353, "dur": 4, "args": { "External id": 182928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 182928, "pid": 5, "tid": 7, "ts": 1716454224336353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281812, "dur": 6, "args": { "External id": 182928, "cbid": 211, "correlation": 182928 } }, { "ph": "s", "id": 182928, "pid": 76337, "tid": -914061504, "ts": 1716454224281812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224281820, "dur": 0, "args": { "External id": 182929, "cbid": 51, "correlation": 182929 } }, { "ph": "s", "id": 182929, "pid": 76337, "tid": -914061504, "ts": 1716454224281820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224336358, "dur": 93, "args": { "External id": 182930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182930, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 182930, "pid": 5, "tid": 7, "ts": 1716454224336358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281821, "dur": 5, "args": { "External id": 182930, "cbid": 211, "correlation": 182930 } }, { "ph": "s", "id": 182930, "pid": 76337, "tid": -914061504, "ts": 1716454224281821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224336452, "dur": 16, "args": { "External id": 182935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182935, "pid": 5, "tid": 7, "ts": 1716454224336452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281848, "dur": 8, "args": { "External id": 182935, "cbid": 211, "correlation": 182935 } }, { "ph": "s", "id": 182935, "pid": 76337, "tid": -914061504, "ts": 1716454224281848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224336469, "dur": 83, "args": { "External id": 182944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182944, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182944, "pid": 5, "tid": 7, "ts": 1716454224336469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224281930, "dur": 14, "args": { "External id": 182944, "cbid": 211, "correlation": 182944 } }, { "ph": "s", "id": 182944, "pid": 76337, "tid": -914061504, "ts": 1716454224281930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224336553, "dur": 29, "args": { "External id": 182966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182966, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 182966, "pid": 5, "tid": 7, "ts": 1716454224336553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282000, "dur": 11, "args": { "External id": 182966, "cbid": 211, "correlation": 182966 } }, { "ph": "s", "id": 182966, "pid": 76337, "tid": -914061504, "ts": 1716454224282000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282089, "dur": 1, "args": { "External id": 182977, "cbid": 251, "correlation": 182977 } }, { "ph": "f", "id": 182977, "pid": 76337, "tid": -914061504, "ts": 1716454224282089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224336584, "dur": 161, "args": { "External id": 182978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182978, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182978, "pid": 5, "tid": 7, "ts": 1716454224336584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282094, "dur": 12, "args": { "External id": 182978, "cbid": 211, "correlation": 182978 } }, { "ph": "s", "id": 182978, "pid": 76337, "tid": -914061504, "ts": 1716454224282094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282162, "dur": 1, "args": { "External id": 182989, "cbid": 251, "correlation": 182989 } }, { "ph": "f", "id": 182989, "pid": 76337, "tid": -914061504, "ts": 1716454224282162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224336747, "dur": 157, "args": { "External id": 182990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 182990, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 182990, "pid": 5, "tid": 7, "ts": 1716454224336747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282167, "dur": 12, "args": { "External id": 182990, "cbid": 211, "correlation": 182990 } }, { "ph": "s", "id": 182990, "pid": 76337, "tid": -914061504, "ts": 1716454224282167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282231, "dur": 1, "args": { "External id": 183001, "cbid": 251, "correlation": 183001 } }, { "ph": "f", "id": 183001, "pid": 76337, "tid": -914061504, "ts": 1716454224282231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224336906, "dur": 156, "args": { "External id": 183002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183002, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183002, "pid": 5, "tid": 7, "ts": 1716454224336906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282235, "dur": 11, "args": { "External id": 183002, "cbid": 211, "correlation": 183002 } }, { "ph": "s", "id": 183002, "pid": 76337, "tid": -914061504, "ts": 1716454224282235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224337063, "dur": 336, "args": { "External id": 183027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183027, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183027, "pid": 5, "tid": 7, "ts": 1716454224337063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282318, "dur": 13, "args": { "External id": 183027, "cbid": 211, "correlation": 183027 } }, { "ph": "s", "id": 183027, "pid": 76337, "tid": -914061504, "ts": 1716454224282318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282415, "dur": 1, "args": { "External id": 183045, "cbid": 251, "correlation": 183045 } }, { "ph": "f", "id": 183045, "pid": 76337, "tid": -914061504, "ts": 1716454224282415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224337400, "dur": 164, "args": { "External id": 183047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183047, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183047, "pid": 5, "tid": 7, "ts": 1716454224337400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282421, "dur": 14, "args": { "External id": 183047, "cbid": 211, "correlation": 183047 } }, { "ph": "s", "id": 183047, "pid": 76337, "tid": -914061504, "ts": 1716454224282421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224337566, "dur": 19, "args": { "External id": 183055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183055, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183055, "pid": 5, "tid": 7, "ts": 1716454224337566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282491, "dur": 12, "args": { "External id": 183055, "cbid": 211, "correlation": 183055 } }, { "ph": "s", "id": 183055, "pid": 76337, "tid": -914061504, "ts": 1716454224282491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224337586, "dur": 27, "args": { "External id": 183063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183063, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183063, "pid": 5, "tid": 7, "ts": 1716454224337586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282530, "dur": 8, "args": { "External id": 183063, "cbid": 211, "correlation": 183063 } }, { "ph": "s", "id": 183063, "pid": 76337, "tid": -914061504, "ts": 1716454224282530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224337615, "dur": 18, "args": { "External id": 183074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183074, "pid": 5, "tid": 7, "ts": 1716454224337615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282600, "dur": 13, "args": { "External id": 183074, "cbid": 211, "correlation": 183074 } }, { "ph": "s", "id": 183074, "pid": 76337, "tid": -914061504, "ts": 1716454224282600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224337634, "dur": 16, "args": { "External id": 183096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183096, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183096, "pid": 5, "tid": 7, "ts": 1716454224337634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282631, "dur": 8, "args": { "External id": 183096, "cbid": 211, "correlation": 183096 } }, { "ph": "s", "id": 183096, "pid": 76337, "tid": -914061504, "ts": 1716454224282631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282714, "dur": 1, "args": { "External id": 183107, "cbid": 251, "correlation": 183107 } }, { "ph": "f", "id": 183107, "pid": 76337, "tid": -914061504, "ts": 1716454224282714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224337651, "dur": 87, "args": { "External id": 183108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183108, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 183108, "pid": 5, "tid": 7, "ts": 1716454224337651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282719, "dur": 14, "args": { "External id": 183108, "cbid": 211, "correlation": 183108 } }, { "ph": "s", "id": 183108, "pid": 76337, "tid": -914061504, "ts": 1716454224282719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282788, "dur": 1, "args": { "External id": 183119, "cbid": 251, "correlation": 183119 } }, { "ph": "f", "id": 183119, "pid": 76337, "tid": -914061504, "ts": 1716454224282788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282792, "dur": 0, "args": { "External id": 183120, "cbid": 251, "correlation": 183120 } }, { "ph": "f", "id": 183120, "pid": 76337, "tid": -914061504, "ts": 1716454224282792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224337739, "dur": 12, "args": { "External id": 183121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183121, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183121, "pid": 5, "tid": 7, "ts": 1716454224337739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282794, "dur": 12, "args": { "External id": 183121, "cbid": 211, "correlation": 183121 } }, { "ph": "s", "id": 183121, "pid": 76337, "tid": -914061504, "ts": 1716454224282794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224337753, "dur": 5, "args": { "External id": 183123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183123, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183123, "pid": 5, "tid": 7, "ts": 1716454224337753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282807, "dur": 6, "args": { "External id": 183123, "cbid": 211, "correlation": 183123 } }, { "ph": "s", "id": 183123, "pid": 76337, "tid": -914061504, "ts": 1716454224282807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282865, "dur": 1, "args": { "External id": 183134, "cbid": 251, "correlation": 183134 } }, { "ph": "f", "id": 183134, "pid": 76337, "tid": -914061504, "ts": 1716454224282865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224282868, "dur": 0, "args": { "External id": 183135, "cbid": 251, "correlation": 183135 } }, { "ph": "f", "id": 183135, "pid": 76337, "tid": -914061504, "ts": 1716454224282868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224337760, "dur": 8, "args": { "External id": 183136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183136, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183136, "pid": 5, "tid": 7, "ts": 1716454224337760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282870, "dur": 12, "args": { "External id": 183136, "cbid": 211, "correlation": 183136 } }, { "ph": "s", "id": 183136, "pid": 76337, "tid": -914061504, "ts": 1716454224282870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224337769, "dur": 3, "args": { "External id": 183138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183138, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183138, "pid": 5, "tid": 7, "ts": 1716454224337769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282883, "dur": 6, "args": { "External id": 183138, "cbid": 211, "correlation": 183138 } }, { "ph": "s", "id": 183138, "pid": 76337, "tid": -914061504, "ts": 1716454224282883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224337773, "dur": 53, "args": { "External id": 183163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183163, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183163, "pid": 5, "tid": 7, "ts": 1716454224337773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224282960, "dur": 12, "args": { "External id": 183163, "cbid": 211, "correlation": 183163 } }, { "ph": "s", "id": 183163, "pid": 76337, "tid": -914061504, "ts": 1716454224282960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224283068, "dur": 1, "args": { "External id": 183181, "cbid": 251, "correlation": 183181 } }, { "ph": "f", "id": 183181, "pid": 76337, "tid": -914061504, "ts": 1716454224283068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224337828, "dur": 90, "args": { "External id": 183183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183183, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 183183, "pid": 5, "tid": 7, "ts": 1716454224337828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283074, "dur": 14, "args": { "External id": 183183, "cbid": 211, "correlation": 183183 } }, { "ph": "s", "id": 183183, "pid": 76337, "tid": -914061504, "ts": 1716454224283074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224337919, "dur": 9, "args": { "External id": 183191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183191, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183191, "pid": 5, "tid": 7, "ts": 1716454224337919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283143, "dur": 12, "args": { "External id": 183191, "cbid": 211, "correlation": 183191 } }, { "ph": "s", "id": 183191, "pid": 76337, "tid": -914061504, "ts": 1716454224283143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224337929, "dur": 20, "args": { "External id": 183199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183199, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183199, "pid": 5, "tid": 7, "ts": 1716454224337929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283184, "dur": 10, "args": { "External id": 183199, "cbid": 211, "correlation": 183199 } }, { "ph": "s", "id": 183199, "pid": 76337, "tid": -914061504, "ts": 1716454224283184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224337951, "dur": 17, "args": { "External id": 183221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183221, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183221, "pid": 5, "tid": 7, "ts": 1716454224337951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283236, "dur": 11, "args": { "External id": 183221, "cbid": 211, "correlation": 183221 } }, { "ph": "s", "id": 183221, "pid": 76337, "tid": -914061504, "ts": 1716454224283236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224283323, "dur": 1, "args": { "External id": 183237, "cbid": 251, "correlation": 183237 } }, { "ph": "f", "id": 183237, "pid": 76337, "tid": -914061504, "ts": 1716454224283323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224283328, "dur": 0, "args": { "External id": 183239, "cbid": 251, "correlation": 183239 } }, { "ph": "f", "id": 183239, "pid": 76337, "tid": -914061504, "ts": 1716454224283328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224337969, "dur": 493, "args": { "External id": 183240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183240, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183240, "pid": 5, "tid": 7, "ts": 1716454224337969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283330, "dur": 13, "args": { "External id": 183240, "cbid": 211, "correlation": 183240 } }, { "ph": "s", "id": 183240, "pid": 76337, "tid": -914061504, "ts": 1716454224283330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224338463, "dur": 65, "args": { "External id": 183248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183248, "pid": 5, "tid": 7, "ts": 1716454224338463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283394, "dur": 13, "args": { "External id": 183248, "cbid": 211, "correlation": 183248 } }, { "ph": "s", "id": 183248, "pid": 76337, "tid": -914061504, "ts": 1716454224283394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224338529, "dur": 67, "args": { "External id": 183256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183256, "pid": 5, "tid": 7, "ts": 1716454224338529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283425, "dur": 8, "args": { "External id": 183256, "cbid": 211, "correlation": 183256 } }, { "ph": "s", "id": 183256, "pid": 76337, "tid": -914061504, "ts": 1716454224283425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224283505, "dur": 1, "args": { "External id": 183272, "cbid": 251, "correlation": 183272 } }, { "ph": "f", "id": 183272, "pid": 76337, "tid": -914061504, "ts": 1716454224283505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224338598, "dur": 1, "args": { "External id": 183274, "device": 5, "context": 1, "stream": 7, "correlation": 183274, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 183274, "pid": 5, "tid": 7, "ts": 1716454224338598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224283510, "dur": 9, "args": { "External id": 183274, "cbid": 51, "correlation": 183274 } }, { "ph": "s", "id": 183274, "pid": 76337, "tid": -914061504, "ts": 1716454224283510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224338602, "dur": 268, "args": { "External id": 183275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183275, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 183275, "pid": 5, "tid": 7, "ts": 1716454224338602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283521, "dur": 11, "args": { "External id": 183275, "cbid": 211, "correlation": 183275 } }, { "ph": "s", "id": 183275, "pid": 76337, "tid": -914061504, "ts": 1716454224283521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224338872, "dur": 14, "args": { "External id": 183283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183283, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183283, "pid": 5, "tid": 7, "ts": 1716454224338872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283563, "dur": 10, "args": { "External id": 183283, "cbid": 211, "correlation": 183283 } }, { "ph": "s", "id": 183283, "pid": 76337, "tid": -914061504, "ts": 1716454224283563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224338886, "dur": 37, "args": { "External id": 183294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183294, "pid": 5, "tid": 7, "ts": 1716454224338886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283631, "dur": 12, "args": { "External id": 183294, "cbid": 211, "correlation": 183294 } }, { "ph": "s", "id": 183294, "pid": 76337, "tid": -914061504, "ts": 1716454224283631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224283694, "dur": 0, "args": { "External id": 183306, "cbid": 317, "correlation": 183306 } }, { "ph": "f", "id": 183306, "pid": 76337, "tid": -914061504, "ts": 1716454224283694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224283695, "dur": 0, "args": { "External id": 183307, "cbid": 203, "correlation": 183307 } }, { "ph": "f", "id": 183307, "pid": 76337, "tid": -914061504, "ts": 1716454224283695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224283695, "dur": 0, "args": { "External id": 183308, "cbid": 205, "correlation": 183308 } }, { "ph": "f", "id": 183308, "pid": 76337, "tid": -914061504, "ts": 1716454224283695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224338925, "dur": 13, "args": { "External id": 183312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183312, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183312, "pid": 5, "tid": 7, "ts": 1716454224338925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283711, "dur": 12, "args": { "External id": 183312, "cbid": 211, "correlation": 183312 } }, { "ph": "s", "id": 183312, "pid": 76337, "tid": -914061504, "ts": 1716454224283711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224338939, "dur": 4, "args": { "External id": 183314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 183314, "pid": 5, "tid": 7, "ts": 1716454224338939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283728, "dur": 6, "args": { "External id": 183314, "cbid": 211, "correlation": 183314 } }, { "ph": "s", "id": 183314, "pid": 76337, "tid": -914061504, "ts": 1716454224283728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224283736, "dur": 0, "args": { "External id": 183315, "cbid": 51, "correlation": 183315 } }, { "ph": "s", "id": 183315, "pid": 76337, "tid": -914061504, "ts": 1716454224283736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224338945, "dur": 95, "args": { "External id": 183316, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183316, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 183316, "pid": 5, "tid": 7, "ts": 1716454224338945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283737, "dur": 5, "args": { "External id": 183316, "cbid": 211, "correlation": 183316 } }, { "ph": "s", "id": 183316, "pid": 76337, "tid": -914061504, "ts": 1716454224283737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339041, "dur": 16, "args": { "External id": 183321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183321, "pid": 5, "tid": 7, "ts": 1716454224339041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283765, "dur": 9, "args": { "External id": 183321, "cbid": 211, "correlation": 183321 } }, { "ph": "s", "id": 183321, "pid": 76337, "tid": -914061504, "ts": 1716454224283765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224339058, "dur": 11, "args": { "External id": 183329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183329, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183329, "pid": 5, "tid": 7, "ts": 1716454224339058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283796, "dur": 8, "args": { "External id": 183329, "cbid": 211, "correlation": 183329 } }, { "ph": "s", "id": 183329, "pid": 76337, "tid": -914061504, "ts": 1716454224283796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224283867, "dur": 0, "args": { "External id": 183339, "cbid": 317, "correlation": 183339 } }, { "ph": "f", "id": 183339, "pid": 76337, "tid": -914061504, "ts": 1716454224283867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224283868, "dur": 0, "args": { "External id": 183340, "cbid": 203, "correlation": 183340 } }, { "ph": "f", "id": 183340, "pid": 76337, "tid": -914061504, "ts": 1716454224283868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224283869, "dur": 0, "args": { "External id": 183341, "cbid": 205, "correlation": 183341 } }, { "ph": "f", "id": 183341, "pid": 76337, "tid": -914061504, "ts": 1716454224283869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339071, "dur": 11, "args": { "External id": 183345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183345, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183345, "pid": 5, "tid": 7, "ts": 1716454224339071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283884, "dur": 12, "args": { "External id": 183345, "cbid": 211, "correlation": 183345 } }, { "ph": "s", "id": 183345, "pid": 76337, "tid": -914061504, "ts": 1716454224283884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339083, "dur": 160, "args": { "External id": 183347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183347, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183347, "pid": 5, "tid": 7, "ts": 1716454224339083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283898, "dur": 5, "args": { "External id": 183347, "cbid": 211, "correlation": 183347 } }, { "ph": "s", "id": 183347, "pid": 76337, "tid": -914061504, "ts": 1716454224283898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224339245, "dur": 1, "args": { "External id": 183349, "device": 5, "context": 1, "stream": 7, "correlation": 183349, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 183349, "pid": 5, "tid": 7, "ts": 1716454224339245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224283910, "dur": 7, "args": { "External id": 183349, "cbid": 51, "correlation": 183349 } }, { "ph": "s", "id": 183349, "pid": 76337, "tid": -914061504, "ts": 1716454224283910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224339249, "dur": 198, "args": { "External id": 183350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183350, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 183350, "pid": 5, "tid": 7, "ts": 1716454224339249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283918, "dur": 8, "args": { "External id": 183350, "cbid": 211, "correlation": 183350 } }, { "ph": "s", "id": 183350, "pid": 76337, "tid": -914061504, "ts": 1716454224283918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339448, "dur": 6, "args": { "External id": 183352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183352, "pid": 5, "tid": 7, "ts": 1716454224339448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283931, "dur": 5, "args": { "External id": 183352, "cbid": 211, "correlation": 183352 } }, { "ph": "s", "id": 183352, "pid": 76337, "tid": -914061504, "ts": 1716454224283931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339455, "dur": 6, "args": { "External id": 183358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183358, "pid": 5, "tid": 7, "ts": 1716454224339455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224283958, "dur": 9, "args": { "External id": 183358, "cbid": 211, "correlation": 183358 } }, { "ph": "s", "id": 183358, "pid": 76337, "tid": -914061504, "ts": 1716454224283958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224339462, "dur": 11, "args": { "External id": 183378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183378, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183378, "pid": 5, "tid": 7, "ts": 1716454224339462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284073, "dur": 13, "args": { "External id": 183378, "cbid": 211, "correlation": 183378 } }, { "ph": "s", "id": 183378, "pid": 76337, "tid": -914061504, "ts": 1716454224284073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224339475, "dur": 4, "args": { "External id": 183390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183390, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183390, "pid": 5, "tid": 7, "ts": 1716454224339475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284096, "dur": 6, "args": { "External id": 183390, "cbid": 211, "correlation": 183390 } }, { "ph": "s", "id": 183390, "pid": 76337, "tid": -914061504, "ts": 1716454224284096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339480, "dur": 8, "args": { "External id": 183393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183393, "pid": 5, "tid": 7, "ts": 1716454224339480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284116, "dur": 7, "args": { "External id": 183393, "cbid": 211, "correlation": 183393 } }, { "ph": "s", "id": 183393, "pid": 76337, "tid": -914061504, "ts": 1716454224284116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224339489, "dur": 5, "args": { "External id": 183402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183402, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183402, "pid": 5, "tid": 7, "ts": 1716454224339489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284155, "dur": 11, "args": { "External id": 183402, "cbid": 211, "correlation": 183402 } }, { "ph": "s", "id": 183402, "pid": 76337, "tid": -914061504, "ts": 1716454224284155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224284208, "dur": 0, "args": { "External id": 183412, "cbid": 317, "correlation": 183412 } }, { "ph": "f", "id": 183412, "pid": 76337, "tid": -914061504, "ts": 1716454224284208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224284208, "dur": 0, "args": { "External id": 183413, "cbid": 203, "correlation": 183413 } }, { "ph": "f", "id": 183413, "pid": 76337, "tid": -914061504, "ts": 1716454224284208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224284209, "dur": 0, "args": { "External id": 183414, "cbid": 205, "correlation": 183414 } }, { "ph": "f", "id": 183414, "pid": 76337, "tid": -914061504, "ts": 1716454224284209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339496, "dur": 5, "args": { "External id": 183418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183418, "pid": 5, "tid": 7, "ts": 1716454224339496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284224, "dur": 11, "args": { "External id": 183418, "cbid": 211, "correlation": 183418 } }, { "ph": "s", "id": 183418, "pid": 76337, "tid": -914061504, "ts": 1716454224284224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339502, "dur": 160, "args": { "External id": 183420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183420, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183420, "pid": 5, "tid": 7, "ts": 1716454224339502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284238, "dur": 5, "args": { "External id": 183420, "cbid": 211, "correlation": 183420 } }, { "ph": "s", "id": 183420, "pid": 76337, "tid": -914061504, "ts": 1716454224284238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224339664, "dur": 1, "args": { "External id": 183422, "device": 5, "context": 1, "stream": 7, "correlation": 183422, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 183422, "pid": 5, "tid": 7, "ts": 1716454224339664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224284249, "dur": 6, "args": { "External id": 183422, "cbid": 51, "correlation": 183422 } }, { "ph": "s", "id": 183422, "pid": 76337, "tid": -914061504, "ts": 1716454224284249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224339668, "dur": 266, "args": { "External id": 183423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183423, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183423, "pid": 5, "tid": 7, "ts": 1716454224339668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284257, "dur": 6, "args": { "External id": 183423, "cbid": 211, "correlation": 183423 } }, { "ph": "s", "id": 183423, "pid": 76337, "tid": -914061504, "ts": 1716454224284257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224339934, "dur": 5, "args": { "External id": 183425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183425, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183425, "pid": 5, "tid": 7, "ts": 1716454224339934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284268, "dur": 5, "args": { "External id": 183425, "cbid": 211, "correlation": 183425 } }, { "ph": "s", "id": 183425, "pid": 76337, "tid": -914061504, "ts": 1716454224284268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339941, "dur": 6, "args": { "External id": 183431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183431, "pid": 5, "tid": 7, "ts": 1716454224339941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284296, "dur": 8, "args": { "External id": 183431, "cbid": 211, "correlation": 183431 } }, { "ph": "s", "id": 183431, "pid": 76337, "tid": -914061504, "ts": 1716454224284296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224339948, "dur": 3, "args": { "External id": 183439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183439, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 183439, "pid": 5, "tid": 7, "ts": 1716454224339948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284340, "dur": 9, "args": { "External id": 183439, "cbid": 211, "correlation": 183439 } }, { "ph": "s", "id": 183439, "pid": 76337, "tid": -914061504, "ts": 1716454224284340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224284406, "dur": 1, "args": { "External id": 183455, "cbid": 251, "correlation": 183455 } }, { "ph": "f", "id": 183455, "pid": 76337, "tid": -914061504, "ts": 1716454224284406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224284411, "dur": 0, "args": { "External id": 183457, "cbid": 251, "correlation": 183457 } }, { "ph": "f", "id": 183457, "pid": 76337, "tid": -914061504, "ts": 1716454224284411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224339953, "dur": 13, "args": { "External id": 183458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183458, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183458, "pid": 5, "tid": 7, "ts": 1716454224339953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284414, "dur": 12, "args": { "External id": 183458, "cbid": 211, "correlation": 183458 } }, { "ph": "s", "id": 183458, "pid": 76337, "tid": -914061504, "ts": 1716454224284414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224339967, "dur": 5, "args": { "External id": 183460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183460, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183460, "pid": 5, "tid": 7, "ts": 1716454224339967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284427, "dur": 6, "args": { "External id": 183460, "cbid": 211, "correlation": 183460 } }, { "ph": "s", "id": 183460, "pid": 76337, "tid": -914061504, "ts": 1716454224284427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339973, "dur": 6, "args": { "External id": 183470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183470, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183470, "pid": 5, "tid": 7, "ts": 1716454224339973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284485, "dur": 12, "args": { "External id": 183470, "cbid": 211, "correlation": 183470 } }, { "ph": "s", "id": 183470, "pid": 76337, "tid": -914061504, "ts": 1716454224284485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224339980, "dur": 9, "args": { "External id": 183490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183490, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183490, "pid": 5, "tid": 7, "ts": 1716454224339980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284551, "dur": 10, "args": { "External id": 183490, "cbid": 211, "correlation": 183490 } }, { "ph": "s", "id": 183490, "pid": 76337, "tid": -914061504, "ts": 1716454224284551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224339991, "dur": 3, "args": { "External id": 183502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183502, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183502, "pid": 5, "tid": 7, "ts": 1716454224339991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284572, "dur": 7, "args": { "External id": 183502, "cbid": 211, "correlation": 183502 } }, { "ph": "s", "id": 183502, "pid": 76337, "tid": -914061504, "ts": 1716454224284572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224339995, "dur": 7, "args": { "External id": 183505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183505, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183505, "pid": 5, "tid": 7, "ts": 1716454224339995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284592, "dur": 6, "args": { "External id": 183505, "cbid": 211, "correlation": 183505 } }, { "ph": "s", "id": 183505, "pid": 76337, "tid": -914061504, "ts": 1716454224284592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340004, "dur": 4, "args": { "External id": 183514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183514, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183514, "pid": 5, "tid": 7, "ts": 1716454224340004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284632, "dur": 10, "args": { "External id": 183514, "cbid": 211, "correlation": 183514 } }, { "ph": "s", "id": 183514, "pid": 76337, "tid": -914061504, "ts": 1716454224284632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224284699, "dur": 0, "args": { "External id": 183524, "cbid": 317, "correlation": 183524 } }, { "ph": "f", "id": 183524, "pid": 76337, "tid": -914061504, "ts": 1716454224284699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224284700, "dur": 0, "args": { "External id": 183525, "cbid": 203, "correlation": 183525 } }, { "ph": "f", "id": 183525, "pid": 76337, "tid": -914061504, "ts": 1716454224284700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224284701, "dur": 0, "args": { "External id": 183526, "cbid": 205, "correlation": 183526 } }, { "ph": "f", "id": 183526, "pid": 76337, "tid": -914061504, "ts": 1716454224284701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340010, "dur": 5, "args": { "External id": 183530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183530, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183530, "pid": 5, "tid": 7, "ts": 1716454224340010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284715, "dur": 12, "args": { "External id": 183530, "cbid": 211, "correlation": 183530 } }, { "ph": "s", "id": 183530, "pid": 76337, "tid": -914061504, "ts": 1716454224284715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340016, "dur": 160, "args": { "External id": 183532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183532, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183532, "pid": 5, "tid": 7, "ts": 1716454224340016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284730, "dur": 5, "args": { "External id": 183532, "cbid": 211, "correlation": 183532 } }, { "ph": "s", "id": 183532, "pid": 76337, "tid": -914061504, "ts": 1716454224284730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224340178, "dur": 1, "args": { "External id": 183534, "device": 5, "context": 1, "stream": 7, "correlation": 183534, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 183534, "pid": 5, "tid": 7, "ts": 1716454224340178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224284741, "dur": 6, "args": { "External id": 183534, "cbid": 51, "correlation": 183534 } }, { "ph": "s", "id": 183534, "pid": 76337, "tid": -914061504, "ts": 1716454224284741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224340182, "dur": 255, "args": { "External id": 183535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183535, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183535, "pid": 5, "tid": 7, "ts": 1716454224340182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284748, "dur": 6, "args": { "External id": 183535, "cbid": 211, "correlation": 183535 } }, { "ph": "s", "id": 183535, "pid": 76337, "tid": -914061504, "ts": 1716454224284748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340438, "dur": 6, "args": { "External id": 183537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183537, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183537, "pid": 5, "tid": 7, "ts": 1716454224340438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284757, "dur": 5, "args": { "External id": 183537, "cbid": 211, "correlation": 183537 } }, { "ph": "s", "id": 183537, "pid": 76337, "tid": -914061504, "ts": 1716454224284757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224340446, "dur": 6, "args": { "External id": 183543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183543, "pid": 5, "tid": 7, "ts": 1716454224340446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284787, "dur": 8, "args": { "External id": 183543, "cbid": 211, "correlation": 183543 } }, { "ph": "s", "id": 183543, "pid": 76337, "tid": -914061504, "ts": 1716454224284787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340453, "dur": 5, "args": { "External id": 183551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183551, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183551, "pid": 5, "tid": 7, "ts": 1716454224340453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284819, "dur": 8, "args": { "External id": 183551, "cbid": 211, "correlation": 183551 } }, { "ph": "s", "id": 183551, "pid": 76337, "tid": -914061504, "ts": 1716454224284819, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340460, "dur": 4, "args": { "External id": 183559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183559, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183559, "pid": 5, "tid": 7, "ts": 1716454224340460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284849, "dur": 8, "args": { "External id": 183559, "cbid": 211, "correlation": 183559 } }, { "ph": "s", "id": 183559, "pid": 76337, "tid": -914061504, "ts": 1716454224284849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224340465, "dur": 9, "args": { "External id": 183579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183579, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183579, "pid": 5, "tid": 7, "ts": 1716454224340465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284922, "dur": 13, "args": { "External id": 183579, "cbid": 211, "correlation": 183579 } }, { "ph": "s", "id": 183579, "pid": 76337, "tid": -914061504, "ts": 1716454224284922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224340476, "dur": 4, "args": { "External id": 183591, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183591, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183591, "pid": 5, "tid": 7, "ts": 1716454224340476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284945, "dur": 6, "args": { "External id": 183591, "cbid": 211, "correlation": 183591 } }, { "ph": "s", "id": 183591, "pid": 76337, "tid": -914061504, "ts": 1716454224284945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224340481, "dur": 6, "args": { "External id": 183594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183594, "pid": 5, "tid": 7, "ts": 1716454224340481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224284963, "dur": 6, "args": { "External id": 183594, "cbid": 211, "correlation": 183594 } }, { "ph": "s", "id": 183594, "pid": 76337, "tid": -914061504, "ts": 1716454224284963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340489, "dur": 4, "args": { "External id": 183603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183603, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183603, "pid": 5, "tid": 7, "ts": 1716454224340489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285010, "dur": 10, "args": { "External id": 183603, "cbid": 211, "correlation": 183603 } }, { "ph": "s", "id": 183603, "pid": 76337, "tid": -914061504, "ts": 1716454224285010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224285062, "dur": 0, "args": { "External id": 183613, "cbid": 317, "correlation": 183613 } }, { "ph": "f", "id": 183613, "pid": 76337, "tid": -914061504, "ts": 1716454224285062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224285063, "dur": 0, "args": { "External id": 183614, "cbid": 203, "correlation": 183614 } }, { "ph": "f", "id": 183614, "pid": 76337, "tid": -914061504, "ts": 1716454224285063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224285064, "dur": 0, "args": { "External id": 183615, "cbid": 205, "correlation": 183615 } }, { "ph": "f", "id": 183615, "pid": 76337, "tid": -914061504, "ts": 1716454224285064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340495, "dur": 5, "args": { "External id": 183619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183619, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183619, "pid": 5, "tid": 7, "ts": 1716454224340495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285077, "dur": 12, "args": { "External id": 183619, "cbid": 211, "correlation": 183619 } }, { "ph": "s", "id": 183619, "pid": 76337, "tid": -914061504, "ts": 1716454224285077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340501, "dur": 160, "args": { "External id": 183621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183621, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183621, "pid": 5, "tid": 7, "ts": 1716454224340501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285091, "dur": 5, "args": { "External id": 183621, "cbid": 211, "correlation": 183621 } }, { "ph": "s", "id": 183621, "pid": 76337, "tid": -914061504, "ts": 1716454224285091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224340663, "dur": 1, "args": { "External id": 183623, "device": 5, "context": 1, "stream": 7, "correlation": 183623, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 183623, "pid": 5, "tid": 7, "ts": 1716454224340663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224285102, "dur": 6, "args": { "External id": 183623, "cbid": 51, "correlation": 183623 } }, { "ph": "s", "id": 183623, "pid": 76337, "tid": -914061504, "ts": 1716454224285102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224340666, "dur": 254, "args": { "External id": 183624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183624, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183624, "pid": 5, "tid": 7, "ts": 1716454224340666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285109, "dur": 6, "args": { "External id": 183624, "cbid": 211, "correlation": 183624 } }, { "ph": "s", "id": 183624, "pid": 76337, "tid": -914061504, "ts": 1716454224285109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340922, "dur": 6, "args": { "External id": 183626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183626, "pid": 5, "tid": 7, "ts": 1716454224340922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285118, "dur": 5, "args": { "External id": 183626, "cbid": 211, "correlation": 183626 } }, { "ph": "s", "id": 183626, "pid": 76337, "tid": -914061504, "ts": 1716454224285118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224340929, "dur": 6, "args": { "External id": 183632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183632, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183632, "pid": 5, "tid": 7, "ts": 1716454224340929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285147, "dur": 8, "args": { "External id": 183632, "cbid": 211, "correlation": 183632 } }, { "ph": "s", "id": 183632, "pid": 76337, "tid": -914061504, "ts": 1716454224285147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340936, "dur": 3, "args": { "External id": 183640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183640, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 183640, "pid": 5, "tid": 7, "ts": 1716454224340936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285190, "dur": 9, "args": { "External id": 183640, "cbid": 211, "correlation": 183640 } }, { "ph": "s", "id": 183640, "pid": 76337, "tid": -914061504, "ts": 1716454224285190, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224285253, "dur": 1, "args": { "External id": 183656, "cbid": 251, "correlation": 183656 } }, { "ph": "f", "id": 183656, "pid": 76337, "tid": -914061504, "ts": 1716454224285253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224285258, "dur": 0, "args": { "External id": 183658, "cbid": 251, "correlation": 183658 } }, { "ph": "f", "id": 183658, "pid": 76337, "tid": -914061504, "ts": 1716454224285258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224340940, "dur": 10, "args": { "External id": 183659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183659, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183659, "pid": 5, "tid": 7, "ts": 1716454224340940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285260, "dur": 11, "args": { "External id": 183659, "cbid": 211, "correlation": 183659 } }, { "ph": "s", "id": 183659, "pid": 76337, "tid": -914061504, "ts": 1716454224285260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224340952, "dur": 4, "args": { "External id": 183661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183661, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183661, "pid": 5, "tid": 7, "ts": 1716454224340952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285273, "dur": 5, "args": { "External id": 183661, "cbid": 211, "correlation": 183661 } }, { "ph": "s", "id": 183661, "pid": 76337, "tid": -914061504, "ts": 1716454224285273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224340957, "dur": 6, "args": { "External id": 183671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183671, "pid": 5, "tid": 7, "ts": 1716454224340957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285329, "dur": 13, "args": { "External id": 183671, "cbid": 211, "correlation": 183671 } }, { "ph": "s", "id": 183671, "pid": 76337, "tid": -914061504, "ts": 1716454224285329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224340964, "dur": 9, "args": { "External id": 183691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183691, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183691, "pid": 5, "tid": 7, "ts": 1716454224340964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285394, "dur": 11, "args": { "External id": 183691, "cbid": 211, "correlation": 183691 } }, { "ph": "s", "id": 183691, "pid": 76337, "tid": -914061504, "ts": 1716454224285394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224340974, "dur": 4, "args": { "External id": 183703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183703, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183703, "pid": 5, "tid": 7, "ts": 1716454224340974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285415, "dur": 6, "args": { "External id": 183703, "cbid": 211, "correlation": 183703 } }, { "ph": "s", "id": 183703, "pid": 76337, "tid": -914061504, "ts": 1716454224285415, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224340980, "dur": 6, "args": { "External id": 183706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183706, "pid": 5, "tid": 7, "ts": 1716454224340980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285434, "dur": 6, "args": { "External id": 183706, "cbid": 211, "correlation": 183706 } }, { "ph": "s", "id": 183706, "pid": 76337, "tid": -914061504, "ts": 1716454224285434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224340987, "dur": 5, "args": { "External id": 183715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183715, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183715, "pid": 5, "tid": 7, "ts": 1716454224340987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285474, "dur": 10, "args": { "External id": 183715, "cbid": 211, "correlation": 183715 } }, { "ph": "s", "id": 183715, "pid": 76337, "tid": -914061504, "ts": 1716454224285474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224285536, "dur": 0, "args": { "External id": 183725, "cbid": 317, "correlation": 183725 } }, { "ph": "f", "id": 183725, "pid": 76337, "tid": -914061504, "ts": 1716454224285536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224285536, "dur": 0, "args": { "External id": 183726, "cbid": 203, "correlation": 183726 } }, { "ph": "f", "id": 183726, "pid": 76337, "tid": -914061504, "ts": 1716454224285536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224285537, "dur": 0, "args": { "External id": 183727, "cbid": 205, "correlation": 183727 } }, { "ph": "f", "id": 183727, "pid": 76337, "tid": -914061504, "ts": 1716454224285537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224340993, "dur": 5, "args": { "External id": 183731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183731, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183731, "pid": 5, "tid": 7, "ts": 1716454224340993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285551, "dur": 12, "args": { "External id": 183731, "cbid": 211, "correlation": 183731 } }, { "ph": "s", "id": 183731, "pid": 76337, "tid": -914061504, "ts": 1716454224285551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341000, "dur": 160, "args": { "External id": 183733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183733, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183733, "pid": 5, "tid": 7, "ts": 1716454224341000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285566, "dur": 5, "args": { "External id": 183733, "cbid": 211, "correlation": 183733 } }, { "ph": "s", "id": 183733, "pid": 76337, "tid": -914061504, "ts": 1716454224285566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224341162, "dur": 1, "args": { "External id": 183735, "device": 5, "context": 1, "stream": 7, "correlation": 183735, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 183735, "pid": 5, "tid": 7, "ts": 1716454224341162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224285576, "dur": 6, "args": { "External id": 183735, "cbid": 51, "correlation": 183735 } }, { "ph": "s", "id": 183735, "pid": 76337, "tid": -914061504, "ts": 1716454224285576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224341166, "dur": 254, "args": { "External id": 183736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183736, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183736, "pid": 5, "tid": 7, "ts": 1716454224341166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285584, "dur": 6, "args": { "External id": 183736, "cbid": 211, "correlation": 183736 } }, { "ph": "s", "id": 183736, "pid": 76337, "tid": -914061504, "ts": 1716454224285584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341421, "dur": 6, "args": { "External id": 183738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183738, "pid": 5, "tid": 7, "ts": 1716454224341421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285595, "dur": 6, "args": { "External id": 183738, "cbid": 211, "correlation": 183738 } }, { "ph": "s", "id": 183738, "pid": 76337, "tid": -914061504, "ts": 1716454224285595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224341428, "dur": 6, "args": { "External id": 183744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183744, "pid": 5, "tid": 7, "ts": 1716454224341428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285624, "dur": 9, "args": { "External id": 183744, "cbid": 211, "correlation": 183744 } }, { "ph": "s", "id": 183744, "pid": 76337, "tid": -914061504, "ts": 1716454224285624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224341435, "dur": 5, "args": { "External id": 183752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183752, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183752, "pid": 5, "tid": 7, "ts": 1716454224341435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285657, "dur": 8, "args": { "External id": 183752, "cbid": 211, "correlation": 183752 } }, { "ph": "s", "id": 183752, "pid": 76337, "tid": -914061504, "ts": 1716454224285657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224341441, "dur": 4, "args": { "External id": 183760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183760, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183760, "pid": 5, "tid": 7, "ts": 1716454224341441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285686, "dur": 8, "args": { "External id": 183760, "cbid": 211, "correlation": 183760 } }, { "ph": "s", "id": 183760, "pid": 76337, "tid": -914061504, "ts": 1716454224285686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224341447, "dur": 10, "args": { "External id": 183780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183780, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183780, "pid": 5, "tid": 7, "ts": 1716454224341447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285797, "dur": 14, "args": { "External id": 183780, "cbid": 211, "correlation": 183780 } }, { "ph": "s", "id": 183780, "pid": 76337, "tid": -914061504, "ts": 1716454224285797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224341458, "dur": 4, "args": { "External id": 183792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183792, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183792, "pid": 5, "tid": 7, "ts": 1716454224341458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285820, "dur": 6, "args": { "External id": 183792, "cbid": 211, "correlation": 183792 } }, { "ph": "s", "id": 183792, "pid": 76337, "tid": -914061504, "ts": 1716454224285820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224341463, "dur": 7, "args": { "External id": 183795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183795, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183795, "pid": 5, "tid": 7, "ts": 1716454224341463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285838, "dur": 6, "args": { "External id": 183795, "cbid": 211, "correlation": 183795 } }, { "ph": "s", "id": 183795, "pid": 76337, "tid": -914061504, "ts": 1716454224285838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224341471, "dur": 4, "args": { "External id": 183804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183804, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183804, "pid": 5, "tid": 7, "ts": 1716454224341471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285876, "dur": 9, "args": { "External id": 183804, "cbid": 211, "correlation": 183804 } }, { "ph": "s", "id": 183804, "pid": 76337, "tid": -914061504, "ts": 1716454224285876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224285932, "dur": 0, "args": { "External id": 183814, "cbid": 317, "correlation": 183814 } }, { "ph": "f", "id": 183814, "pid": 76337, "tid": -914061504, "ts": 1716454224285932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224285933, "dur": 0, "args": { "External id": 183815, "cbid": 203, "correlation": 183815 } }, { "ph": "f", "id": 183815, "pid": 76337, "tid": -914061504, "ts": 1716454224285933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224285934, "dur": 0, "args": { "External id": 183816, "cbid": 205, "correlation": 183816 } }, { "ph": "f", "id": 183816, "pid": 76337, "tid": -914061504, "ts": 1716454224285934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341477, "dur": 5, "args": { "External id": 183820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183820, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183820, "pid": 5, "tid": 7, "ts": 1716454224341477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285947, "dur": 12, "args": { "External id": 183820, "cbid": 211, "correlation": 183820 } }, { "ph": "s", "id": 183820, "pid": 76337, "tid": -914061504, "ts": 1716454224285947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341483, "dur": 159, "args": { "External id": 183822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183822, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183822, "pid": 5, "tid": 7, "ts": 1716454224341483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285961, "dur": 6, "args": { "External id": 183822, "cbid": 211, "correlation": 183822 } }, { "ph": "s", "id": 183822, "pid": 76337, "tid": -914061504, "ts": 1716454224285961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224341644, "dur": 1, "args": { "External id": 183824, "device": 5, "context": 1, "stream": 7, "correlation": 183824, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 183824, "pid": 5, "tid": 7, "ts": 1716454224341644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224285972, "dur": 15, "args": { "External id": 183824, "cbid": 51, "correlation": 183824 } }, { "ph": "s", "id": 183824, "pid": 76337, "tid": -914061504, "ts": 1716454224285972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224341648, "dur": 254, "args": { "External id": 183825, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183825, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183825, "pid": 5, "tid": 7, "ts": 1716454224341648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285988, "dur": 7, "args": { "External id": 183825, "cbid": 211, "correlation": 183825 } }, { "ph": "s", "id": 183825, "pid": 76337, "tid": -914061504, "ts": 1716454224285988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341903, "dur": 6, "args": { "External id": 183827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183827, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183827, "pid": 5, "tid": 7, "ts": 1716454224341903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224285998, "dur": 5, "args": { "External id": 183827, "cbid": 211, "correlation": 183827 } }, { "ph": "s", "id": 183827, "pid": 76337, "tid": -914061504, "ts": 1716454224285998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224341910, "dur": 6, "args": { "External id": 183833, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183833, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183833, "pid": 5, "tid": 7, "ts": 1716454224341910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286027, "dur": 8, "args": { "External id": 183833, "cbid": 211, "correlation": 183833 } }, { "ph": "s", "id": 183833, "pid": 76337, "tid": -914061504, "ts": 1716454224286027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224341918, "dur": 3, "args": { "External id": 183841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183841, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 183841, "pid": 5, "tid": 7, "ts": 1716454224341918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286071, "dur": 10, "args": { "External id": 183841, "cbid": 211, "correlation": 183841 } }, { "ph": "s", "id": 183841, "pid": 76337, "tid": -914061504, "ts": 1716454224286071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224286133, "dur": 1, "args": { "External id": 183857, "cbid": 251, "correlation": 183857 } }, { "ph": "f", "id": 183857, "pid": 76337, "tid": -914061504, "ts": 1716454224286133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224286138, "dur": 0, "args": { "External id": 183859, "cbid": 251, "correlation": 183859 } }, { "ph": "f", "id": 183859, "pid": 76337, "tid": -914061504, "ts": 1716454224286138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224341922, "dur": 10, "args": { "External id": 183860, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183860, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183860, "pid": 5, "tid": 7, "ts": 1716454224341922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286140, "dur": 11, "args": { "External id": 183860, "cbid": 211, "correlation": 183860 } }, { "ph": "s", "id": 183860, "pid": 76337, "tid": -914061504, "ts": 1716454224286140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224341933, "dur": 4, "args": { "External id": 183862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183862, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183862, "pid": 5, "tid": 7, "ts": 1716454224341933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286153, "dur": 5, "args": { "External id": 183862, "cbid": 211, "correlation": 183862 } }, { "ph": "s", "id": 183862, "pid": 76337, "tid": -914061504, "ts": 1716454224286153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224341938, "dur": 6, "args": { "External id": 183872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183872, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183872, "pid": 5, "tid": 7, "ts": 1716454224341938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286209, "dur": 13, "args": { "External id": 183872, "cbid": 211, "correlation": 183872 } }, { "ph": "s", "id": 183872, "pid": 76337, "tid": -914061504, "ts": 1716454224286209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224341945, "dur": 9, "args": { "External id": 183892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183892, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183892, "pid": 5, "tid": 7, "ts": 1716454224341945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286276, "dur": 10, "args": { "External id": 183892, "cbid": 211, "correlation": 183892 } }, { "ph": "s", "id": 183892, "pid": 76337, "tid": -914061504, "ts": 1716454224286276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224341956, "dur": 4, "args": { "External id": 183904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183904, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183904, "pid": 5, "tid": 7, "ts": 1716454224341956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286295, "dur": 6, "args": { "External id": 183904, "cbid": 211, "correlation": 183904 } }, { "ph": "s", "id": 183904, "pid": 76337, "tid": -914061504, "ts": 1716454224286295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224341961, "dur": 6, "args": { "External id": 183907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183907, "pid": 5, "tid": 7, "ts": 1716454224341961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286314, "dur": 7, "args": { "External id": 183907, "cbid": 211, "correlation": 183907 } }, { "ph": "s", "id": 183907, "pid": 76337, "tid": -914061504, "ts": 1716454224286314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224341969, "dur": 4, "args": { "External id": 183916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183916, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183916, "pid": 5, "tid": 7, "ts": 1716454224341969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286354, "dur": 9, "args": { "External id": 183916, "cbid": 211, "correlation": 183916 } }, { "ph": "s", "id": 183916, "pid": 76337, "tid": -914061504, "ts": 1716454224286354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224286420, "dur": 0, "args": { "External id": 183926, "cbid": 317, "correlation": 183926 } }, { "ph": "f", "id": 183926, "pid": 76337, "tid": -914061504, "ts": 1716454224286420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224286421, "dur": 0, "args": { "External id": 183927, "cbid": 203, "correlation": 183927 } }, { "ph": "f", "id": 183927, "pid": 76337, "tid": -914061504, "ts": 1716454224286421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224286421, "dur": 0, "args": { "External id": 183928, "cbid": 205, "correlation": 183928 } }, { "ph": "f", "id": 183928, "pid": 76337, "tid": -914061504, "ts": 1716454224286421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341974, "dur": 5, "args": { "External id": 183932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183932, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183932, "pid": 5, "tid": 7, "ts": 1716454224341974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286436, "dur": 12, "args": { "External id": 183932, "cbid": 211, "correlation": 183932 } }, { "ph": "s", "id": 183932, "pid": 76337, "tid": -914061504, "ts": 1716454224286436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224341981, "dur": 159, "args": { "External id": 183934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183934, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183934, "pid": 5, "tid": 7, "ts": 1716454224341981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286450, "dur": 5, "args": { "External id": 183934, "cbid": 211, "correlation": 183934 } }, { "ph": "s", "id": 183934, "pid": 76337, "tid": -914061504, "ts": 1716454224286450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224342141, "dur": 1, "args": { "External id": 183936, "device": 5, "context": 1, "stream": 7, "correlation": 183936, "bytes": 240, "memory bandwidth (GB/s)": 0.1563517915309446 } }, { "ph": "f", "id": 183936, "pid": 5, "tid": 7, "ts": 1716454224342141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224286461, "dur": 6, "args": { "External id": 183936, "cbid": 51, "correlation": 183936 } }, { "ph": "s", "id": 183936, "pid": 76337, "tid": -914061504, "ts": 1716454224286461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224342145, "dur": 254, "args": { "External id": 183937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183937, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 183937, "pid": 5, "tid": 7, "ts": 1716454224342145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286468, "dur": 6, "args": { "External id": 183937, "cbid": 211, "correlation": 183937 } }, { "ph": "s", "id": 183937, "pid": 76337, "tid": -914061504, "ts": 1716454224286468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224342401, "dur": 6, "args": { "External id": 183939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183939, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 183939, "pid": 5, "tid": 7, "ts": 1716454224342401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286479, "dur": 5, "args": { "External id": 183939, "cbid": 211, "correlation": 183939 } }, { "ph": "s", "id": 183939, "pid": 76337, "tid": -914061504, "ts": 1716454224286479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224342407, "dur": 6, "args": { "External id": 183945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183945, "pid": 5, "tid": 7, "ts": 1716454224342407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286506, "dur": 8, "args": { "External id": 183945, "cbid": 211, "correlation": 183945 } }, { "ph": "s", "id": 183945, "pid": 76337, "tid": -914061504, "ts": 1716454224286506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224342415, "dur": 5, "args": { "External id": 183953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183953, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183953, "pid": 5, "tid": 7, "ts": 1716454224342415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286540, "dur": 8, "args": { "External id": 183953, "cbid": 211, "correlation": 183953 } }, { "ph": "s", "id": 183953, "pid": 76337, "tid": -914061504, "ts": 1716454224286540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224342421, "dur": 4, "args": { "External id": 183961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183961, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183961, "pid": 5, "tid": 7, "ts": 1716454224342421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286568, "dur": 9, "args": { "External id": 183961, "cbid": 211, "correlation": 183961 } }, { "ph": "s", "id": 183961, "pid": 76337, "tid": -914061504, "ts": 1716454224286568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224342427, "dur": 9, "args": { "External id": 183981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 183981, "pid": 5, "tid": 7, "ts": 1716454224342427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286765, "dur": 15, "args": { "External id": 183981, "cbid": 211, "correlation": 183981 } }, { "ph": "s", "id": 183981, "pid": 76337, "tid": -914061504, "ts": 1716454224286765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224342437, "dur": 4, "args": { "External id": 183993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 183993, "pid": 5, "tid": 7, "ts": 1716454224342437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286790, "dur": 6, "args": { "External id": 183993, "cbid": 211, "correlation": 183993 } }, { "ph": "s", "id": 183993, "pid": 76337, "tid": -914061504, "ts": 1716454224286790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224342442, "dur": 6, "args": { "External id": 183996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 183996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 183996, "pid": 5, "tid": 7, "ts": 1716454224342442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286808, "dur": 7, "args": { "External id": 183996, "cbid": 211, "correlation": 183996 } }, { "ph": "s", "id": 183996, "pid": 76337, "tid": -914061504, "ts": 1716454224286808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224286870, "dur": 0, "args": { "External id": 184007, "cbid": 317, "correlation": 184007 } }, { "ph": "f", "id": 184007, "pid": 76337, "tid": -914061504, "ts": 1716454224286870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224286871, "dur": 0, "args": { "External id": 184008, "cbid": 203, "correlation": 184008 } }, { "ph": "f", "id": 184008, "pid": 76337, "tid": -914061504, "ts": 1716454224286871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224286872, "dur": 0, "args": { "External id": 184009, "cbid": 205, "correlation": 184009 } }, { "ph": "f", "id": 184009, "pid": 76337, "tid": -914061504, "ts": 1716454224286872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224342450, "dur": 5, "args": { "External id": 184013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184013, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184013, "pid": 5, "tid": 7, "ts": 1716454224342450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286888, "dur": 12, "args": { "External id": 184013, "cbid": 211, "correlation": 184013 } }, { "ph": "s", "id": 184013, "pid": 76337, "tid": -914061504, "ts": 1716454224286888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224342456, "dur": 37, "args": { "External id": 184015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184015, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 184015, "pid": 5, "tid": 7, "ts": 1716454224342456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286907, "dur": 9, "args": { "External id": 184015, "cbid": 211, "correlation": 184015 } }, { "ph": "s", "id": 184015, "pid": 76337, "tid": -914061504, "ts": 1716454224286907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224342494, "dur": 5, "args": { "External id": 184017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184017, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184017, "pid": 5, "tid": 7, "ts": 1716454224342494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286920, "dur": 5, "args": { "External id": 184017, "cbid": 211, "correlation": 184017 } }, { "ph": "s", "id": 184017, "pid": 76337, "tid": -914061504, "ts": 1716454224286920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224342500, "dur": 6, "args": { "External id": 184023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184023, "pid": 5, "tid": 7, "ts": 1716454224342500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224286948, "dur": 8, "args": { "External id": 184023, "cbid": 211, "correlation": 184023 } }, { "ph": "s", "id": 184023, "pid": 76337, "tid": -914061504, "ts": 1716454224286948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224342507, "dur": 20, "args": { "External id": 184032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184032, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184032, "pid": 5, "tid": 7, "ts": 1716454224342507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287040, "dur": 14, "args": { "External id": 184032, "cbid": 211, "correlation": 184032 } }, { "ph": "s", "id": 184032, "pid": 76337, "tid": -914061504, "ts": 1716454224287040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224342529, "dur": 10, "args": { "External id": 184054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184054, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 184054, "pid": 5, "tid": 7, "ts": 1716454224342529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287098, "dur": 10, "args": { "External id": 184054, "cbid": 211, "correlation": 184054 } }, { "ph": "s", "id": 184054, "pid": 76337, "tid": -914061504, "ts": 1716454224287098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287188, "dur": 2, "args": { "External id": 184065, "cbid": 251, "correlation": 184065 } }, { "ph": "f", "id": 184065, "pid": 76337, "tid": -914061504, "ts": 1716454224287188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287192, "dur": 0, "args": { "External id": 184066, "cbid": 251, "correlation": 184066 } }, { "ph": "f", "id": 184066, "pid": 76337, "tid": -914061504, "ts": 1716454224287192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224342540, "dur": 53, "args": { "External id": 184067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184067, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 184067, "pid": 5, "tid": 7, "ts": 1716454224342540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287196, "dur": 14, "args": { "External id": 184067, "cbid": 211, "correlation": 184067 } }, { "ph": "s", "id": 184067, "pid": 76337, "tid": -914061504, "ts": 1716454224287196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287266, "dur": 1, "args": { "External id": 184078, "cbid": 251, "correlation": 184078 } }, { "ph": "f", "id": 184078, "pid": 76337, "tid": -914061504, "ts": 1716454224287266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287270, "dur": 0, "args": { "External id": 184079, "cbid": 251, "correlation": 184079 } }, { "ph": "f", "id": 184079, "pid": 76337, "tid": -914061504, "ts": 1716454224287270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224342594, "dur": 52, "args": { "External id": 184080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184080, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 184080, "pid": 5, "tid": 7, "ts": 1716454224342594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287271, "dur": 12, "args": { "External id": 184080, "cbid": 211, "correlation": 184080 } }, { "ph": "s", "id": 184080, "pid": 76337, "tid": -914061504, "ts": 1716454224287271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287338, "dur": 1, "args": { "External id": 184091, "cbid": 251, "correlation": 184091 } }, { "ph": "f", "id": 184091, "pid": 76337, "tid": -914061504, "ts": 1716454224287338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287342, "dur": 0, "args": { "External id": 184092, "cbid": 251, "correlation": 184092 } }, { "ph": "f", "id": 184092, "pid": 76337, "tid": -914061504, "ts": 1716454224287342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224342648, "dur": 53, "args": { "External id": 184093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184093, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 184093, "pid": 5, "tid": 7, "ts": 1716454224342648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287344, "dur": 11, "args": { "External id": 184093, "cbid": 211, "correlation": 184093 } }, { "ph": "s", "id": 184093, "pid": 76337, "tid": -914061504, "ts": 1716454224287344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224342701, "dur": 54, "args": { "External id": 184118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184118, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184118, "pid": 5, "tid": 7, "ts": 1716454224342701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287426, "dur": 13, "args": { "External id": 184118, "cbid": 211, "correlation": 184118 } }, { "ph": "s", "id": 184118, "pid": 76337, "tid": -914061504, "ts": 1716454224287426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287526, "dur": 1, "args": { "External id": 184136, "cbid": 251, "correlation": 184136 } }, { "ph": "f", "id": 184136, "pid": 76337, "tid": -914061504, "ts": 1716454224287526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224342757, "dur": 62, "args": { "External id": 184138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184138, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 184138, "pid": 5, "tid": 7, "ts": 1716454224342757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287532, "dur": 14, "args": { "External id": 184138, "cbid": 211, "correlation": 184138 } }, { "ph": "s", "id": 184138, "pid": 76337, "tid": -914061504, "ts": 1716454224287532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224342821, "dur": 6, "args": { "External id": 184146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184146, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184146, "pid": 5, "tid": 7, "ts": 1716454224342821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287602, "dur": 12, "args": { "External id": 184146, "cbid": 211, "correlation": 184146 } }, { "ph": "s", "id": 184146, "pid": 76337, "tid": -914061504, "ts": 1716454224287602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224342828, "dur": 7, "args": { "External id": 184154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184154, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184154, "pid": 5, "tid": 7, "ts": 1716454224342828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287641, "dur": 8, "args": { "External id": 184154, "cbid": 211, "correlation": 184154 } }, { "ph": "s", "id": 184154, "pid": 76337, "tid": -914061504, "ts": 1716454224287641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224342837, "dur": 8, "args": { "External id": 184165, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184165, "pid": 5, "tid": 7, "ts": 1716454224342837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287712, "dur": 12, "args": { "External id": 184165, "cbid": 211, "correlation": 184165 } }, { "ph": "s", "id": 184165, "pid": 76337, "tid": -914061504, "ts": 1716454224287712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224342846, "dur": 8, "args": { "External id": 184187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184187, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 184187, "pid": 5, "tid": 7, "ts": 1716454224342846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287745, "dur": 8, "args": { "External id": 184187, "cbid": 211, "correlation": 184187 } }, { "ph": "s", "id": 184187, "pid": 76337, "tid": -914061504, "ts": 1716454224287745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287830, "dur": 2, "args": { "External id": 184198, "cbid": 251, "correlation": 184198 } }, { "ph": "f", "id": 184198, "pid": 76337, "tid": -914061504, "ts": 1716454224287830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224342856, "dur": 1, "args": { "External id": 184199, "device": 5, "context": 1, "stream": 7, "correlation": 184199, "bytes": 480, "memory bandwidth (GB/s)": 0.3 } }, { "ph": "f", "id": 184199, "pid": 5, "tid": 7, "ts": 1716454224342856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224287836, "dur": 12, "args": { "External id": 184199, "cbid": 51, "correlation": 184199 } }, { "ph": "s", "id": 184199, "pid": 76337, "tid": -914061504, "ts": 1716454224287836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224342860, "dur": 36, "args": { "External id": 184200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184200, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 184200, "pid": 5, "tid": 7, "ts": 1716454224342860, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287849, "dur": 12, "args": { "External id": 184200, "cbid": 211, "correlation": 184200 } }, { "ph": "s", "id": 184200, "pid": 76337, "tid": -914061504, "ts": 1716454224287849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287921, "dur": 1, "args": { "External id": 184211, "cbid": 251, "correlation": 184211 } }, { "ph": "f", "id": 184211, "pid": 76337, "tid": -914061504, "ts": 1716454224287921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224287926, "dur": 0, "args": { "External id": 184212, "cbid": 251, "correlation": 184212 } }, { "ph": "f", "id": 184212, "pid": 76337, "tid": -914061504, "ts": 1716454224287926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224342897, "dur": 12, "args": { "External id": 184213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184213, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184213, "pid": 5, "tid": 7, "ts": 1716454224342897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287927, "dur": 12, "args": { "External id": 184213, "cbid": 211, "correlation": 184213 } }, { "ph": "s", "id": 184213, "pid": 76337, "tid": -914061504, "ts": 1716454224287927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224342910, "dur": 5, "args": { "External id": 184215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184215, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184215, "pid": 5, "tid": 7, "ts": 1716454224342910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224287941, "dur": 7, "args": { "External id": 184215, "cbid": 211, "correlation": 184215 } }, { "ph": "s", "id": 184215, "pid": 76337, "tid": -914061504, "ts": 1716454224287941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288008, "dur": 1, "args": { "External id": 184226, "cbid": 251, "correlation": 184226 } }, { "ph": "f", "id": 184226, "pid": 76337, "tid": -914061504, "ts": 1716454224288008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288011, "dur": 0, "args": { "External id": 184227, "cbid": 251, "correlation": 184227 } }, { "ph": "f", "id": 184227, "pid": 76337, "tid": -914061504, "ts": 1716454224288011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224342917, "dur": 8, "args": { "External id": 184228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184228, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184228, "pid": 5, "tid": 7, "ts": 1716454224342917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288013, "dur": 13, "args": { "External id": 184228, "cbid": 211, "correlation": 184228 } }, { "ph": "s", "id": 184228, "pid": 76337, "tid": -914061504, "ts": 1716454224288013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224342926, "dur": 3, "args": { "External id": 184230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184230, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184230, "pid": 5, "tid": 7, "ts": 1716454224342926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288027, "dur": 6, "args": { "External id": 184230, "cbid": 211, "correlation": 184230 } }, { "ph": "s", "id": 184230, "pid": 76337, "tid": -914061504, "ts": 1716454224288027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224342931, "dur": 20, "args": { "External id": 184255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184255, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 184255, "pid": 5, "tid": 7, "ts": 1716454224342931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288107, "dur": 12, "args": { "External id": 184255, "cbid": 211, "correlation": 184255 } }, { "ph": "s", "id": 184255, "pid": 76337, "tid": -914061504, "ts": 1716454224288107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288213, "dur": 2, "args": { "External id": 184273, "cbid": 251, "correlation": 184273 } }, { "ph": "f", "id": 184273, "pid": 76337, "tid": -914061504, "ts": 1716454224288213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224342952, "dur": 1, "args": { "External id": 184275, "device": 5, "context": 1, "stream": 7, "correlation": 184275, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 184275, "pid": 5, "tid": 7, "ts": 1716454224342952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224288219, "dur": 10, "args": { "External id": 184275, "cbid": 51, "correlation": 184275 } }, { "ph": "s", "id": 184275, "pid": 76337, "tid": -914061504, "ts": 1716454224288219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224342956, "dur": 36, "args": { "External id": 184276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184276, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 184276, "pid": 5, "tid": 7, "ts": 1716454224342956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288231, "dur": 12, "args": { "External id": 184276, "cbid": 211, "correlation": 184276 } }, { "ph": "s", "id": 184276, "pid": 76337, "tid": -914061504, "ts": 1716454224288231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224342993, "dur": 4, "args": { "External id": 184284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184284, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184284, "pid": 5, "tid": 7, "ts": 1716454224342993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288302, "dur": 12, "args": { "External id": 184284, "cbid": 211, "correlation": 184284 } }, { "ph": "s", "id": 184284, "pid": 76337, "tid": -914061504, "ts": 1716454224288302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224342998, "dur": 8, "args": { "External id": 184292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184292, "pid": 5, "tid": 7, "ts": 1716454224342998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288344, "dur": 10, "args": { "External id": 184292, "cbid": 211, "correlation": 184292 } }, { "ph": "s", "id": 184292, "pid": 76337, "tid": -914061504, "ts": 1716454224288344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224343008, "dur": 8, "args": { "External id": 184314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184314, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 184314, "pid": 5, "tid": 7, "ts": 1716454224343008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288396, "dur": 11, "args": { "External id": 184314, "cbid": 211, "correlation": 184314 } }, { "ph": "s", "id": 184314, "pid": 76337, "tid": -914061504, "ts": 1716454224288396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288488, "dur": 1, "args": { "External id": 184330, "cbid": 251, "correlation": 184330 } }, { "ph": "f", "id": 184330, "pid": 76337, "tid": -914061504, "ts": 1716454224288488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288493, "dur": 0, "args": { "External id": 184332, "cbid": 251, "correlation": 184332 } }, { "ph": "f", "id": 184332, "pid": 76337, "tid": -914061504, "ts": 1716454224288493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224343017, "dur": 188, "args": { "External id": 184333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184333, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184333, "pid": 5, "tid": 7, "ts": 1716454224343017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288495, "dur": 13, "args": { "External id": 184333, "cbid": 211, "correlation": 184333 } }, { "ph": "s", "id": 184333, "pid": 76337, "tid": -914061504, "ts": 1716454224288495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343207, "dur": 21, "args": { "External id": 184341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184341, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184341, "pid": 5, "tid": 7, "ts": 1716454224343207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288562, "dur": 12, "args": { "External id": 184341, "cbid": 211, "correlation": 184341 } }, { "ph": "s", "id": 184341, "pid": 76337, "tid": -914061504, "ts": 1716454224288562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343229, "dur": 21, "args": { "External id": 184349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184349, "pid": 5, "tid": 7, "ts": 1716454224343229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288592, "dur": 8, "args": { "External id": 184349, "cbid": 211, "correlation": 184349 } }, { "ph": "s", "id": 184349, "pid": 76337, "tid": -914061504, "ts": 1716454224288592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224288674, "dur": 1, "args": { "External id": 184365, "cbid": 251, "correlation": 184365 } }, { "ph": "f", "id": 184365, "pid": 76337, "tid": -914061504, "ts": 1716454224288674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224343252, "dur": 1, "args": { "External id": 184367, "device": 5, "context": 1, "stream": 7, "correlation": 184367, "bytes": 120, "memory bandwidth (GB/s)": 0.0797872340425532 } }, { "ph": "f", "id": 184367, "pid": 5, "tid": 7, "ts": 1716454224343252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224288679, "dur": 9, "args": { "External id": 184367, "cbid": 51, "correlation": 184367 } }, { "ph": "s", "id": 184367, "pid": 76337, "tid": -914061504, "ts": 1716454224288679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224343256, "dur": 109, "args": { "External id": 184368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184368, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 184368, "pid": 5, "tid": 7, "ts": 1716454224343256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288690, "dur": 12, "args": { "External id": 184368, "cbid": 211, "correlation": 184368 } }, { "ph": "s", "id": 184368, "pid": 76337, "tid": -914061504, "ts": 1716454224288690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224343366, "dur": 5, "args": { "External id": 184376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184376, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184376, "pid": 5, "tid": 7, "ts": 1716454224343366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288732, "dur": 10, "args": { "External id": 184376, "cbid": 211, "correlation": 184376 } }, { "ph": "s", "id": 184376, "pid": 76337, "tid": -914061504, "ts": 1716454224288732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343373, "dur": 10, "args": { "External id": 184387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184387, "pid": 5, "tid": 7, "ts": 1716454224343373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288799, "dur": 12, "args": { "External id": 184387, "cbid": 211, "correlation": 184387 } }, { "ph": "s", "id": 184387, "pid": 76337, "tid": -914061504, "ts": 1716454224288799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224288866, "dur": 0, "args": { "External id": 184399, "cbid": 317, "correlation": 184399 } }, { "ph": "f", "id": 184399, "pid": 76337, "tid": -914061504, "ts": 1716454224288866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224288867, "dur": 0, "args": { "External id": 184400, "cbid": 203, "correlation": 184400 } }, { "ph": "f", "id": 184400, "pid": 76337, "tid": -914061504, "ts": 1716454224288867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224288868, "dur": 0, "args": { "External id": 184401, "cbid": 205, "correlation": 184401 } }, { "ph": "f", "id": 184401, "pid": 76337, "tid": -914061504, "ts": 1716454224288868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343384, "dur": 6, "args": { "External id": 184405, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184405, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184405, "pid": 5, "tid": 7, "ts": 1716454224343384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288884, "dur": 12, "args": { "External id": 184405, "cbid": 211, "correlation": 184405 } }, { "ph": "s", "id": 184405, "pid": 76337, "tid": -914061504, "ts": 1716454224288884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224343391, "dur": 37, "args": { "External id": 184407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184407, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 184407, "pid": 5, "tid": 7, "ts": 1716454224343391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288903, "dur": 7, "args": { "External id": 184407, "cbid": 211, "correlation": 184407 } }, { "ph": "s", "id": 184407, "pid": 76337, "tid": -914061504, "ts": 1716454224288903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343429, "dur": 6, "args": { "External id": 184409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184409, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184409, "pid": 5, "tid": 7, "ts": 1716454224343429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288913, "dur": 6, "args": { "External id": 184409, "cbid": 211, "correlation": 184409 } }, { "ph": "s", "id": 184409, "pid": 76337, "tid": -914061504, "ts": 1716454224288913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343436, "dur": 7, "args": { "External id": 184415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184415, "pid": 5, "tid": 7, "ts": 1716454224343436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288941, "dur": 8, "args": { "External id": 184415, "cbid": 211, "correlation": 184415 } }, { "ph": "s", "id": 184415, "pid": 76337, "tid": -914061504, "ts": 1716454224288941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224343444, "dur": 5, "args": { "External id": 184423, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184423, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184423, "pid": 5, "tid": 7, "ts": 1716454224343444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224288981, "dur": 9, "args": { "External id": 184423, "cbid": 211, "correlation": 184423 } }, { "ph": "s", "id": 184423, "pid": 76337, "tid": -914061504, "ts": 1716454224288981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224343450, "dur": 11, "args": { "External id": 184443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184443, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 184443, "pid": 5, "tid": 7, "ts": 1716454224343450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289056, "dur": 12, "args": { "External id": 184443, "cbid": 211, "correlation": 184443 } }, { "ph": "s", "id": 184443, "pid": 76337, "tid": -914061504, "ts": 1716454224289056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224343462, "dur": 4, "args": { "External id": 184455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184455, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 184455, "pid": 5, "tid": 7, "ts": 1716454224343462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289078, "dur": 6, "args": { "External id": 184455, "cbid": 211, "correlation": 184455 } }, { "ph": "s", "id": 184455, "pid": 76337, "tid": -914061504, "ts": 1716454224289078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343468, "dur": 8, "args": { "External id": 184458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184458, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184458, "pid": 5, "tid": 7, "ts": 1716454224343468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289096, "dur": 6, "args": { "External id": 184458, "cbid": 211, "correlation": 184458 } }, { "ph": "s", "id": 184458, "pid": 76337, "tid": -914061504, "ts": 1716454224289096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224343477, "dur": 5, "args": { "External id": 184467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184467, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184467, "pid": 5, "tid": 7, "ts": 1716454224343477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289135, "dur": 10, "args": { "External id": 184467, "cbid": 211, "correlation": 184467 } }, { "ph": "s", "id": 184467, "pid": 76337, "tid": -914061504, "ts": 1716454224289135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224289191, "dur": 0, "args": { "External id": 184477, "cbid": 317, "correlation": 184477 } }, { "ph": "f", "id": 184477, "pid": 76337, "tid": -914061504, "ts": 1716454224289191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224289191, "dur": 0, "args": { "External id": 184478, "cbid": 203, "correlation": 184478 } }, { "ph": "f", "id": 184478, "pid": 76337, "tid": -914061504, "ts": 1716454224289191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224289192, "dur": 0, "args": { "External id": 184479, "cbid": 205, "correlation": 184479 } }, { "ph": "f", "id": 184479, "pid": 76337, "tid": -914061504, "ts": 1716454224289192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343484, "dur": 5, "args": { "External id": 184483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184483, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184483, "pid": 5, "tid": 7, "ts": 1716454224343484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289206, "dur": 12, "args": { "External id": 184483, "cbid": 211, "correlation": 184483 } }, { "ph": "s", "id": 184483, "pid": 76337, "tid": -914061504, "ts": 1716454224289206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343490, "dur": 160, "args": { "External id": 184485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184485, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184485, "pid": 5, "tid": 7, "ts": 1716454224343490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289220, "dur": 6, "args": { "External id": 184485, "cbid": 211, "correlation": 184485 } }, { "ph": "s", "id": 184485, "pid": 76337, "tid": -914061504, "ts": 1716454224289220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224343652, "dur": 1, "args": { "External id": 184487, "device": 5, "context": 1, "stream": 7, "correlation": 184487, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 184487, "pid": 5, "tid": 7, "ts": 1716454224343652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224289232, "dur": 6, "args": { "External id": 184487, "cbid": 51, "correlation": 184487 } }, { "ph": "s", "id": 184487, "pid": 76337, "tid": -914061504, "ts": 1716454224289232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224343655, "dur": 265, "args": { "External id": 184488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184488, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184488, "pid": 5, "tid": 7, "ts": 1716454224343655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289239, "dur": 6, "args": { "External id": 184488, "cbid": 211, "correlation": 184488 } }, { "ph": "s", "id": 184488, "pid": 76337, "tid": -914061504, "ts": 1716454224289239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343921, "dur": 5, "args": { "External id": 184490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184490, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184490, "pid": 5, "tid": 7, "ts": 1716454224343921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289249, "dur": 5, "args": { "External id": 184490, "cbid": 211, "correlation": 184490 } }, { "ph": "s", "id": 184490, "pid": 76337, "tid": -914061504, "ts": 1716454224289249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343928, "dur": 6, "args": { "External id": 184496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184496, "pid": 5, "tid": 7, "ts": 1716454224343928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289277, "dur": 9, "args": { "External id": 184496, "cbid": 211, "correlation": 184496 } }, { "ph": "s", "id": 184496, "pid": 76337, "tid": -914061504, "ts": 1716454224289277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224343936, "dur": 3, "args": { "External id": 184504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184504, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 184504, "pid": 5, "tid": 7, "ts": 1716454224343936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289322, "dur": 10, "args": { "External id": 184504, "cbid": 211, "correlation": 184504 } }, { "ph": "s", "id": 184504, "pid": 76337, "tid": -914061504, "ts": 1716454224289322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224289389, "dur": 1, "args": { "External id": 184520, "cbid": 251, "correlation": 184520 } }, { "ph": "f", "id": 184520, "pid": 76337, "tid": -914061504, "ts": 1716454224289389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224289394, "dur": 0, "args": { "External id": 184522, "cbid": 251, "correlation": 184522 } }, { "ph": "f", "id": 184522, "pid": 76337, "tid": -914061504, "ts": 1716454224289394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224343940, "dur": 12, "args": { "External id": 184523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184523, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184523, "pid": 5, "tid": 7, "ts": 1716454224343940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289395, "dur": 11, "args": { "External id": 184523, "cbid": 211, "correlation": 184523 } }, { "ph": "s", "id": 184523, "pid": 76337, "tid": -914061504, "ts": 1716454224289395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224343954, "dur": 5, "args": { "External id": 184525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184525, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184525, "pid": 5, "tid": 7, "ts": 1716454224343954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289408, "dur": 5, "args": { "External id": 184525, "cbid": 211, "correlation": 184525 } }, { "ph": "s", "id": 184525, "pid": 76337, "tid": -914061504, "ts": 1716454224289408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343960, "dur": 6, "args": { "External id": 184535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184535, "pid": 5, "tid": 7, "ts": 1716454224343960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289467, "dur": 13, "args": { "External id": 184535, "cbid": 211, "correlation": 184535 } }, { "ph": "s", "id": 184535, "pid": 76337, "tid": -914061504, "ts": 1716454224289467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224343967, "dur": 10, "args": { "External id": 184555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184555, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 184555, "pid": 5, "tid": 7, "ts": 1716454224343967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289535, "dur": 11, "args": { "External id": 184555, "cbid": 211, "correlation": 184555 } }, { "ph": "s", "id": 184555, "pid": 76337, "tid": -914061504, "ts": 1716454224289535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224343978, "dur": 4, "args": { "External id": 184567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184567, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 184567, "pid": 5, "tid": 7, "ts": 1716454224343978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289555, "dur": 6, "args": { "External id": 184567, "cbid": 211, "correlation": 184567 } }, { "ph": "s", "id": 184567, "pid": 76337, "tid": -914061504, "ts": 1716454224289555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224343983, "dur": 7, "args": { "External id": 184570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184570, "pid": 5, "tid": 7, "ts": 1716454224343983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289574, "dur": 7, "args": { "External id": 184570, "cbid": 211, "correlation": 184570 } }, { "ph": "s", "id": 184570, "pid": 76337, "tid": -914061504, "ts": 1716454224289574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224343991, "dur": 4, "args": { "External id": 184579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184579, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184579, "pid": 5, "tid": 7, "ts": 1716454224343991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289615, "dur": 9, "args": { "External id": 184579, "cbid": 211, "correlation": 184579 } }, { "ph": "s", "id": 184579, "pid": 76337, "tid": -914061504, "ts": 1716454224289615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224289677, "dur": 0, "args": { "External id": 184589, "cbid": 317, "correlation": 184589 } }, { "ph": "f", "id": 184589, "pid": 76337, "tid": -914061504, "ts": 1716454224289677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224289678, "dur": 0, "args": { "External id": 184590, "cbid": 203, "correlation": 184590 } }, { "ph": "f", "id": 184590, "pid": 76337, "tid": -914061504, "ts": 1716454224289678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224289678, "dur": 0, "args": { "External id": 184591, "cbid": 205, "correlation": 184591 } }, { "ph": "f", "id": 184591, "pid": 76337, "tid": -914061504, "ts": 1716454224289678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224343997, "dur": 5, "args": { "External id": 184595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184595, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184595, "pid": 5, "tid": 7, "ts": 1716454224343997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289693, "dur": 12, "args": { "External id": 184595, "cbid": 211, "correlation": 184595 } }, { "ph": "s", "id": 184595, "pid": 76337, "tid": -914061504, "ts": 1716454224289693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224344003, "dur": 159, "args": { "External id": 184597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184597, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184597, "pid": 5, "tid": 7, "ts": 1716454224344003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289707, "dur": 5, "args": { "External id": 184597, "cbid": 211, "correlation": 184597 } }, { "ph": "s", "id": 184597, "pid": 76337, "tid": -914061504, "ts": 1716454224289707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224344164, "dur": 1, "args": { "External id": 184599, "device": 5, "context": 1, "stream": 7, "correlation": 184599, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 184599, "pid": 5, "tid": 7, "ts": 1716454224344164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224289718, "dur": 6, "args": { "External id": 184599, "cbid": 51, "correlation": 184599 } }, { "ph": "s", "id": 184599, "pid": 76337, "tid": -914061504, "ts": 1716454224289718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224344168, "dur": 255, "args": { "External id": 184600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184600, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184600, "pid": 5, "tid": 7, "ts": 1716454224344168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289725, "dur": 6, "args": { "External id": 184600, "cbid": 211, "correlation": 184600 } }, { "ph": "s", "id": 184600, "pid": 76337, "tid": -914061504, "ts": 1716454224289725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224344424, "dur": 6, "args": { "External id": 184602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184602, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184602, "pid": 5, "tid": 7, "ts": 1716454224344424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289735, "dur": 6, "args": { "External id": 184602, "cbid": 211, "correlation": 184602 } }, { "ph": "s", "id": 184602, "pid": 76337, "tid": -914061504, "ts": 1716454224289735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224344431, "dur": 6, "args": { "External id": 184608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184608, "pid": 5, "tid": 7, "ts": 1716454224344431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289764, "dur": 8, "args": { "External id": 184608, "cbid": 211, "correlation": 184608 } }, { "ph": "s", "id": 184608, "pid": 76337, "tid": -914061504, "ts": 1716454224289764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224344439, "dur": 5, "args": { "External id": 184616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184616, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184616, "pid": 5, "tid": 7, "ts": 1716454224344439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289797, "dur": 8, "args": { "External id": 184616, "cbid": 211, "correlation": 184616 } }, { "ph": "s", "id": 184616, "pid": 76337, "tid": -914061504, "ts": 1716454224289797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224344445, "dur": 4, "args": { "External id": 184624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184624, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184624, "pid": 5, "tid": 7, "ts": 1716454224344445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289826, "dur": 8, "args": { "External id": 184624, "cbid": 211, "correlation": 184624 } }, { "ph": "s", "id": 184624, "pid": 76337, "tid": -914061504, "ts": 1716454224289826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224344451, "dur": 12, "args": { "External id": 184633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184633, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184633, "pid": 5, "tid": 7, "ts": 1716454224344451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224289926, "dur": 14, "args": { "External id": 184633, "cbid": 211, "correlation": 184633 } }, { "ph": "s", "id": 184633, "pid": 76337, "tid": -914061504, "ts": 1716454224289926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224344464, "dur": 12, "args": { "External id": 184653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184653, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 184653, "pid": 5, "tid": 7, "ts": 1716454224344464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290014, "dur": 12, "args": { "External id": 184653, "cbid": 211, "correlation": 184653 } }, { "ph": "s", "id": 184653, "pid": 76337, "tid": -914061504, "ts": 1716454224290014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224344477, "dur": 4, "args": { "External id": 184665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184665, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184665, "pid": 5, "tid": 7, "ts": 1716454224344477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290036, "dur": 7, "args": { "External id": 184665, "cbid": 211, "correlation": 184665 } }, { "ph": "s", "id": 184665, "pid": 76337, "tid": -914061504, "ts": 1716454224290036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224344482, "dur": 10, "args": { "External id": 184668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184668, "pid": 5, "tid": 7, "ts": 1716454224344482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290056, "dur": 6, "args": { "External id": 184668, "cbid": 211, "correlation": 184668 } }, { "ph": "s", "id": 184668, "pid": 76337, "tid": -914061504, "ts": 1716454224290056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224344493, "dur": 6, "args": { "External id": 184677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184677, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184677, "pid": 5, "tid": 7, "ts": 1716454224344493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290095, "dur": 10, "args": { "External id": 184677, "cbid": 211, "correlation": 184677 } }, { "ph": "s", "id": 184677, "pid": 76337, "tid": -914061504, "ts": 1716454224290095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224290152, "dur": 0, "args": { "External id": 184687, "cbid": 317, "correlation": 184687 } }, { "ph": "f", "id": 184687, "pid": 76337, "tid": -914061504, "ts": 1716454224290152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224290153, "dur": 0, "args": { "External id": 184688, "cbid": 203, "correlation": 184688 } }, { "ph": "f", "id": 184688, "pid": 76337, "tid": -914061504, "ts": 1716454224290153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224290153, "dur": 0, "args": { "External id": 184689, "cbid": 205, "correlation": 184689 } }, { "ph": "f", "id": 184689, "pid": 76337, "tid": -914061504, "ts": 1716454224290153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224344501, "dur": 6, "args": { "External id": 184693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184693, "pid": 5, "tid": 7, "ts": 1716454224344501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290169, "dur": 11, "args": { "External id": 184693, "cbid": 211, "correlation": 184693 } }, { "ph": "s", "id": 184693, "pid": 76337, "tid": -914061504, "ts": 1716454224290169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224344509, "dur": 314, "args": { "External id": 184695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184695, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184695, "pid": 5, "tid": 7, "ts": 1716454224344509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290183, "dur": 5, "args": { "External id": 184695, "cbid": 211, "correlation": 184695 } }, { "ph": "s", "id": 184695, "pid": 76337, "tid": -914061504, "ts": 1716454224290183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224344825, "dur": 1, "args": { "External id": 184697, "device": 5, "context": 1, "stream": 7, "correlation": 184697, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 184697, "pid": 5, "tid": 7, "ts": 1716454224344825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224290194, "dur": 7, "args": { "External id": 184697, "cbid": 51, "correlation": 184697 } }, { "ph": "s", "id": 184697, "pid": 76337, "tid": -914061504, "ts": 1716454224290194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224344829, "dur": 488, "args": { "External id": 184698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184698, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184698, "pid": 5, "tid": 7, "ts": 1716454224344829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290202, "dur": 6, "args": { "External id": 184698, "cbid": 211, "correlation": 184698 } }, { "ph": "s", "id": 184698, "pid": 76337, "tid": -914061504, "ts": 1716454224290202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345318, "dur": 6, "args": { "External id": 184700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184700, "pid": 5, "tid": 7, "ts": 1716454224345318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290212, "dur": 5, "args": { "External id": 184700, "cbid": 211, "correlation": 184700 } }, { "ph": "s", "id": 184700, "pid": 76337, "tid": -914061504, "ts": 1716454224290212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345325, "dur": 6, "args": { "External id": 184706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184706, "pid": 5, "tid": 7, "ts": 1716454224345325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290241, "dur": 10, "args": { "External id": 184706, "cbid": 211, "correlation": 184706 } }, { "ph": "s", "id": 184706, "pid": 76337, "tid": -914061504, "ts": 1716454224290241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224345332, "dur": 3, "args": { "External id": 184714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184714, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 184714, "pid": 5, "tid": 7, "ts": 1716454224345332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290285, "dur": 9, "args": { "External id": 184714, "cbid": 211, "correlation": 184714 } }, { "ph": "s", "id": 184714, "pid": 76337, "tid": -914061504, "ts": 1716454224290285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224290350, "dur": 1, "args": { "External id": 184730, "cbid": 251, "correlation": 184730 } }, { "ph": "f", "id": 184730, "pid": 76337, "tid": -914061504, "ts": 1716454224290350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224290355, "dur": 0, "args": { "External id": 184732, "cbid": 251, "correlation": 184732 } }, { "ph": "f", "id": 184732, "pid": 76337, "tid": -914061504, "ts": 1716454224290355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224345336, "dur": 11, "args": { "External id": 184733, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184733, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184733, "pid": 5, "tid": 7, "ts": 1716454224345336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290357, "dur": 11, "args": { "External id": 184733, "cbid": 211, "correlation": 184733 } }, { "ph": "s", "id": 184733, "pid": 76337, "tid": -914061504, "ts": 1716454224290357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224345349, "dur": 4, "args": { "External id": 184735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184735, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184735, "pid": 5, "tid": 7, "ts": 1716454224345349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290370, "dur": 5, "args": { "External id": 184735, "cbid": 211, "correlation": 184735 } }, { "ph": "s", "id": 184735, "pid": 76337, "tid": -914061504, "ts": 1716454224290370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345355, "dur": 6, "args": { "External id": 184745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184745, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184745, "pid": 5, "tid": 7, "ts": 1716454224345355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290426, "dur": 12, "args": { "External id": 184745, "cbid": 211, "correlation": 184745 } }, { "ph": "s", "id": 184745, "pid": 76337, "tid": -914061504, "ts": 1716454224290426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224345361, "dur": 9, "args": { "External id": 184765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184765, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 184765, "pid": 5, "tid": 7, "ts": 1716454224345361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290492, "dur": 12, "args": { "External id": 184765, "cbid": 211, "correlation": 184765 } }, { "ph": "s", "id": 184765, "pid": 76337, "tid": -914061504, "ts": 1716454224290492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224345372, "dur": 4, "args": { "External id": 184777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184777, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 184777, "pid": 5, "tid": 7, "ts": 1716454224345372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290514, "dur": 6, "args": { "External id": 184777, "cbid": 211, "correlation": 184777 } }, { "ph": "s", "id": 184777, "pid": 76337, "tid": -914061504, "ts": 1716454224290514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345377, "dur": 7, "args": { "External id": 184780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184780, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184780, "pid": 5, "tid": 7, "ts": 1716454224345377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290533, "dur": 6, "args": { "External id": 184780, "cbid": 211, "correlation": 184780 } }, { "ph": "s", "id": 184780, "pid": 76337, "tid": -914061504, "ts": 1716454224290533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224345385, "dur": 4, "args": { "External id": 184789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184789, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184789, "pid": 5, "tid": 7, "ts": 1716454224345385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290574, "dur": 9, "args": { "External id": 184789, "cbid": 211, "correlation": 184789 } }, { "ph": "s", "id": 184789, "pid": 76337, "tid": -914061504, "ts": 1716454224290574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224290636, "dur": 0, "args": { "External id": 184799, "cbid": 317, "correlation": 184799 } }, { "ph": "f", "id": 184799, "pid": 76337, "tid": -914061504, "ts": 1716454224290636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224290637, "dur": 0, "args": { "External id": 184800, "cbid": 203, "correlation": 184800 } }, { "ph": "f", "id": 184800, "pid": 76337, "tid": -914061504, "ts": 1716454224290637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224290637, "dur": 0, "args": { "External id": 184801, "cbid": 205, "correlation": 184801 } }, { "ph": "f", "id": 184801, "pid": 76337, "tid": -914061504, "ts": 1716454224290637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345391, "dur": 5, "args": { "External id": 184805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184805, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184805, "pid": 5, "tid": 7, "ts": 1716454224345391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290651, "dur": 13, "args": { "External id": 184805, "cbid": 211, "correlation": 184805 } }, { "ph": "s", "id": 184805, "pid": 76337, "tid": -914061504, "ts": 1716454224290651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345397, "dur": 159, "args": { "External id": 184807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184807, "pid": 5, "tid": 7, "ts": 1716454224345397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290666, "dur": 5, "args": { "External id": 184807, "cbid": 211, "correlation": 184807 } }, { "ph": "s", "id": 184807, "pid": 76337, "tid": -914061504, "ts": 1716454224290666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224345558, "dur": 1, "args": { "External id": 184809, "device": 5, "context": 1, "stream": 7, "correlation": 184809, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 184809, "pid": 5, "tid": 7, "ts": 1716454224345558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224290677, "dur": 7, "args": { "External id": 184809, "cbid": 51, "correlation": 184809 } }, { "ph": "s", "id": 184809, "pid": 76337, "tid": -914061504, "ts": 1716454224290677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224345562, "dur": 254, "args": { "External id": 184810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184810, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184810, "pid": 5, "tid": 7, "ts": 1716454224345562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290686, "dur": 6, "args": { "External id": 184810, "cbid": 211, "correlation": 184810 } }, { "ph": "s", "id": 184810, "pid": 76337, "tid": -914061504, "ts": 1716454224290686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345817, "dur": 6, "args": { "External id": 184812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184812, "pid": 5, "tid": 7, "ts": 1716454224345817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290695, "dur": 5, "args": { "External id": 184812, "cbid": 211, "correlation": 184812 } }, { "ph": "s", "id": 184812, "pid": 76337, "tid": -914061504, "ts": 1716454224290695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345824, "dur": 6, "args": { "External id": 184818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184818, "pid": 5, "tid": 7, "ts": 1716454224345824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290724, "dur": 8, "args": { "External id": 184818, "cbid": 211, "correlation": 184818 } }, { "ph": "s", "id": 184818, "pid": 76337, "tid": -914061504, "ts": 1716454224290724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224290782, "dur": 0, "args": { "External id": 184828, "cbid": 317, "correlation": 184828 } }, { "ph": "f", "id": 184828, "pid": 76337, "tid": -914061504, "ts": 1716454224290782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224290782, "dur": 0, "args": { "External id": 184829, "cbid": 203, "correlation": 184829 } }, { "ph": "f", "id": 184829, "pid": 76337, "tid": -914061504, "ts": 1716454224290782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224290783, "dur": 0, "args": { "External id": 184830, "cbid": 205, "correlation": 184830 } }, { "ph": "f", "id": 184830, "pid": 76337, "tid": -914061504, "ts": 1716454224290783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345832, "dur": 7, "args": { "External id": 184834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184834, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184834, "pid": 5, "tid": 7, "ts": 1716454224345832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290797, "dur": 12, "args": { "External id": 184834, "cbid": 211, "correlation": 184834 } }, { "ph": "s", "id": 184834, "pid": 76337, "tid": -914061504, "ts": 1716454224290797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224345840, "dur": 3, "args": { "External id": 184836, "device": 5, "context": 1, "stream": 7, "correlation": 184836, "bytes": 4800, "memory bandwidth (GB/s)": 1.4150943396226414 } }, { "ph": "f", "id": 184836, "pid": 5, "tid": 7, "ts": 1716454224345840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224290815, "dur": 14, "args": { "External id": 184836, "cbid": 51, "correlation": 184836 } }, { "ph": "s", "id": 184836, "pid": 76337, "tid": -914061504, "ts": 1716454224290815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224345844, "dur": 96, "args": { "External id": 184837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184837, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 184837, "pid": 5, "tid": 7, "ts": 1716454224345844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290830, "dur": 7, "args": { "External id": 184837, "cbid": 211, "correlation": 184837 } }, { "ph": "s", "id": 184837, "pid": 76337, "tid": -914061504, "ts": 1716454224290830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224345942, "dur": 5, "args": { "External id": 184839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184839, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184839, "pid": 5, "tid": 7, "ts": 1716454224345942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290841, "dur": 5, "args": { "External id": 184839, "cbid": 211, "correlation": 184839 } }, { "ph": "s", "id": 184839, "pid": 76337, "tid": -914061504, "ts": 1716454224290841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345948, "dur": 6, "args": { "External id": 184845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184845, "pid": 5, "tid": 7, "ts": 1716454224345948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290869, "dur": 8, "args": { "External id": 184845, "cbid": 211, "correlation": 184845 } }, { "ph": "s", "id": 184845, "pid": 76337, "tid": -914061504, "ts": 1716454224290869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224345956, "dur": 5, "args": { "External id": 184853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184853, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184853, "pid": 5, "tid": 7, "ts": 1716454224345956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290897, "dur": 8, "args": { "External id": 184853, "cbid": 211, "correlation": 184853 } }, { "ph": "s", "id": 184853, "pid": 76337, "tid": -914061504, "ts": 1716454224290897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224345962, "dur": 4, "args": { "External id": 184861, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184861, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184861, "pid": 5, "tid": 7, "ts": 1716454224345962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290926, "dur": 8, "args": { "External id": 184861, "cbid": 211, "correlation": 184861 } }, { "ph": "s", "id": 184861, "pid": 76337, "tid": -914061504, "ts": 1716454224290926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224345967, "dur": 11, "args": { "External id": 184870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184870, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184870, "pid": 5, "tid": 7, "ts": 1716454224345967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224290970, "dur": 45, "args": { "External id": 184870, "cbid": 211, "correlation": 184870 } }, { "ph": "s", "id": 184870, "pid": 76337, "tid": -914061504, "ts": 1716454224290970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224345980, "dur": 12, "args": { "External id": 184890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184890, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 184890, "pid": 5, "tid": 7, "ts": 1716454224345980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291078, "dur": 11, "args": { "External id": 184890, "cbid": 211, "correlation": 184890 } }, { "ph": "s", "id": 184890, "pid": 76337, "tid": -914061504, "ts": 1716454224291078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224345993, "dur": 4, "args": { "External id": 184902, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184902, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184902, "pid": 5, "tid": 7, "ts": 1716454224345993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291100, "dur": 6, "args": { "External id": 184902, "cbid": 211, "correlation": 184902 } }, { "ph": "s", "id": 184902, "pid": 76337, "tid": -914061504, "ts": 1716454224291100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224345999, "dur": 10, "args": { "External id": 184905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184905, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184905, "pid": 5, "tid": 7, "ts": 1716454224345999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291118, "dur": 6, "args": { "External id": 184905, "cbid": 211, "correlation": 184905 } }, { "ph": "s", "id": 184905, "pid": 76337, "tid": -914061504, "ts": 1716454224291118, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224346010, "dur": 6, "args": { "External id": 184914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184914, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184914, "pid": 5, "tid": 7, "ts": 1716454224346010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291155, "dur": 10, "args": { "External id": 184914, "cbid": 211, "correlation": 184914 } }, { "ph": "s", "id": 184914, "pid": 76337, "tid": -914061504, "ts": 1716454224291155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224291211, "dur": 0, "args": { "External id": 184924, "cbid": 317, "correlation": 184924 } }, { "ph": "f", "id": 184924, "pid": 76337, "tid": -914061504, "ts": 1716454224291211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224291212, "dur": 0, "args": { "External id": 184925, "cbid": 203, "correlation": 184925 } }, { "ph": "f", "id": 184925, "pid": 76337, "tid": -914061504, "ts": 1716454224291212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224291212, "dur": 0, "args": { "External id": 184926, "cbid": 205, "correlation": 184926 } }, { "ph": "f", "id": 184926, "pid": 76337, "tid": -914061504, "ts": 1716454224291212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224346018, "dur": 6, "args": { "External id": 184930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184930, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184930, "pid": 5, "tid": 7, "ts": 1716454224346018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291227, "dur": 11, "args": { "External id": 184930, "cbid": 211, "correlation": 184930 } }, { "ph": "s", "id": 184930, "pid": 76337, "tid": -914061504, "ts": 1716454224291227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224346026, "dur": 314, "args": { "External id": 184932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184932, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184932, "pid": 5, "tid": 7, "ts": 1716454224346026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291241, "dur": 5, "args": { "External id": 184932, "cbid": 211, "correlation": 184932 } }, { "ph": "s", "id": 184932, "pid": 76337, "tid": -914061504, "ts": 1716454224291241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224346342, "dur": 1, "args": { "External id": 184934, "device": 5, "context": 1, "stream": 7, "correlation": 184934, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 184934, "pid": 5, "tid": 7, "ts": 1716454224346342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224291251, "dur": 6, "args": { "External id": 184934, "cbid": 51, "correlation": 184934 } }, { "ph": "s", "id": 184934, "pid": 76337, "tid": -914061504, "ts": 1716454224291251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224346345, "dur": 489, "args": { "External id": 184935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184935, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184935, "pid": 5, "tid": 7, "ts": 1716454224346345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291259, "dur": 6, "args": { "External id": 184935, "cbid": 211, "correlation": 184935 } }, { "ph": "s", "id": 184935, "pid": 76337, "tid": -914061504, "ts": 1716454224291259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224346836, "dur": 5, "args": { "External id": 184937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184937, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 184937, "pid": 5, "tid": 7, "ts": 1716454224346836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291269, "dur": 6, "args": { "External id": 184937, "cbid": 211, "correlation": 184937 } }, { "ph": "s", "id": 184937, "pid": 76337, "tid": -914061504, "ts": 1716454224291269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224346842, "dur": 7, "args": { "External id": 184943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184943, "pid": 5, "tid": 7, "ts": 1716454224346842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291297, "dur": 8, "args": { "External id": 184943, "cbid": 211, "correlation": 184943 } }, { "ph": "s", "id": 184943, "pid": 76337, "tid": -914061504, "ts": 1716454224291297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224346850, "dur": 3, "args": { "External id": 184951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184951, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 184951, "pid": 5, "tid": 7, "ts": 1716454224346850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291342, "dur": 10, "args": { "External id": 184951, "cbid": 211, "correlation": 184951 } }, { "ph": "s", "id": 184951, "pid": 76337, "tid": -914061504, "ts": 1716454224291342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224291404, "dur": 1, "args": { "External id": 184967, "cbid": 251, "correlation": 184967 } }, { "ph": "f", "id": 184967, "pid": 76337, "tid": -914061504, "ts": 1716454224291404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224291410, "dur": 0, "args": { "External id": 184969, "cbid": 251, "correlation": 184969 } }, { "ph": "f", "id": 184969, "pid": 76337, "tid": -914061504, "ts": 1716454224291410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224346855, "dur": 12, "args": { "External id": 184970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184970, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184970, "pid": 5, "tid": 7, "ts": 1716454224346855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291411, "dur": 15, "args": { "External id": 184970, "cbid": 211, "correlation": 184970 } }, { "ph": "s", "id": 184970, "pid": 76337, "tid": -914061504, "ts": 1716454224291411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224346868, "dur": 5, "args": { "External id": 184972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184972, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 184972, "pid": 5, "tid": 7, "ts": 1716454224346868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291428, "dur": 5, "args": { "External id": 184972, "cbid": 211, "correlation": 184972 } }, { "ph": "s", "id": 184972, "pid": 76337, "tid": -914061504, "ts": 1716454224291428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224346875, "dur": 6, "args": { "External id": 184982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 184982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 184982, "pid": 5, "tid": 7, "ts": 1716454224346875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291485, "dur": 11, "args": { "External id": 184982, "cbid": 211, "correlation": 184982 } }, { "ph": "s", "id": 184982, "pid": 76337, "tid": -914061504, "ts": 1716454224291485, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224346882, "dur": 9, "args": { "External id": 185002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185002, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185002, "pid": 5, "tid": 7, "ts": 1716454224346882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291551, "dur": 10, "args": { "External id": 185002, "cbid": 211, "correlation": 185002 } }, { "ph": "s", "id": 185002, "pid": 76337, "tid": -914061504, "ts": 1716454224291551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224346892, "dur": 4, "args": { "External id": 185014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185014, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 185014, "pid": 5, "tid": 7, "ts": 1716454224346892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291571, "dur": 6, "args": { "External id": 185014, "cbid": 211, "correlation": 185014 } }, { "ph": "s", "id": 185014, "pid": 76337, "tid": -914061504, "ts": 1716454224291571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224346897, "dur": 6, "args": { "External id": 185017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185017, "pid": 5, "tid": 7, "ts": 1716454224346897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291591, "dur": 6, "args": { "External id": 185017, "cbid": 211, "correlation": 185017 } }, { "ph": "s", "id": 185017, "pid": 76337, "tid": -914061504, "ts": 1716454224291591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224346905, "dur": 4, "args": { "External id": 185026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185026, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185026, "pid": 5, "tid": 7, "ts": 1716454224346905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291630, "dur": 10, "args": { "External id": 185026, "cbid": 211, "correlation": 185026 } }, { "ph": "s", "id": 185026, "pid": 76337, "tid": -914061504, "ts": 1716454224291630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224291696, "dur": 0, "args": { "External id": 185036, "cbid": 317, "correlation": 185036 } }, { "ph": "f", "id": 185036, "pid": 76337, "tid": -914061504, "ts": 1716454224291696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224291697, "dur": 0, "args": { "External id": 185037, "cbid": 203, "correlation": 185037 } }, { "ph": "f", "id": 185037, "pid": 76337, "tid": -914061504, "ts": 1716454224291697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224291698, "dur": 0, "args": { "External id": 185038, "cbid": 205, "correlation": 185038 } }, { "ph": "f", "id": 185038, "pid": 76337, "tid": -914061504, "ts": 1716454224291698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224346911, "dur": 5, "args": { "External id": 185042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185042, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185042, "pid": 5, "tid": 7, "ts": 1716454224346911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291711, "dur": 12, "args": { "External id": 185042, "cbid": 211, "correlation": 185042 } }, { "ph": "s", "id": 185042, "pid": 76337, "tid": -914061504, "ts": 1716454224291711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224346917, "dur": 159, "args": { "External id": 185044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185044, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185044, "pid": 5, "tid": 7, "ts": 1716454224346917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291726, "dur": 5, "args": { "External id": 185044, "cbid": 211, "correlation": 185044 } }, { "ph": "s", "id": 185044, "pid": 76337, "tid": -914061504, "ts": 1716454224291726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224347078, "dur": 1, "args": { "External id": 185046, "device": 5, "context": 1, "stream": 7, "correlation": 185046, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 185046, "pid": 5, "tid": 7, "ts": 1716454224347078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224291737, "dur": 7, "args": { "External id": 185046, "cbid": 51, "correlation": 185046 } }, { "ph": "s", "id": 185046, "pid": 76337, "tid": -914061504, "ts": 1716454224291737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224347082, "dur": 256, "args": { "External id": 185047, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185047, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185047, "pid": 5, "tid": 7, "ts": 1716454224347082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291746, "dur": 6, "args": { "External id": 185047, "cbid": 211, "correlation": 185047 } }, { "ph": "s", "id": 185047, "pid": 76337, "tid": -914061504, "ts": 1716454224291746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224347338, "dur": 6, "args": { "External id": 185049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185049, "pid": 5, "tid": 7, "ts": 1716454224347338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291755, "dur": 5, "args": { "External id": 185049, "cbid": 211, "correlation": 185049 } }, { "ph": "s", "id": 185049, "pid": 76337, "tid": -914061504, "ts": 1716454224291755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224347345, "dur": 6, "args": { "External id": 185055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185055, "pid": 5, "tid": 7, "ts": 1716454224347345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291783, "dur": 9, "args": { "External id": 185055, "cbid": 211, "correlation": 185055 } }, { "ph": "s", "id": 185055, "pid": 76337, "tid": -914061504, "ts": 1716454224291783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224291843, "dur": 0, "args": { "External id": 185065, "cbid": 317, "correlation": 185065 } }, { "ph": "f", "id": 185065, "pid": 76337, "tid": -914061504, "ts": 1716454224291843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224291844, "dur": 0, "args": { "External id": 185066, "cbid": 203, "correlation": 185066 } }, { "ph": "f", "id": 185066, "pid": 76337, "tid": -914061504, "ts": 1716454224291844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224291845, "dur": 0, "args": { "External id": 185067, "cbid": 205, "correlation": 185067 } }, { "ph": "f", "id": 185067, "pid": 76337, "tid": -914061504, "ts": 1716454224291845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224347353, "dur": 7, "args": { "External id": 185071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185071, "pid": 5, "tid": 7, "ts": 1716454224347353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291856, "dur": 12, "args": { "External id": 185071, "cbid": 211, "correlation": 185071 } }, { "ph": "s", "id": 185071, "pid": 76337, "tid": -914061504, "ts": 1716454224291856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224347361, "dur": 3, "args": { "External id": 185073, "device": 5, "context": 1, "stream": 7, "correlation": 185073, "bytes": 4800, "memory bandwidth (GB/s)": 1.3888888888888888 } }, { "ph": "f", "id": 185073, "pid": 5, "tid": 7, "ts": 1716454224347361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224291873, "dur": 10, "args": { "External id": 185073, "cbid": 51, "correlation": 185073 } }, { "ph": "s", "id": 185073, "pid": 76337, "tid": -914061504, "ts": 1716454224291873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224347366, "dur": 95, "args": { "External id": 185074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185074, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 185074, "pid": 5, "tid": 7, "ts": 1716454224347366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291884, "dur": 7, "args": { "External id": 185074, "cbid": 211, "correlation": 185074 } }, { "ph": "s", "id": 185074, "pid": 76337, "tid": -914061504, "ts": 1716454224291884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224347462, "dur": 5, "args": { "External id": 185076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185076, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185076, "pid": 5, "tid": 7, "ts": 1716454224347462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291894, "dur": 6, "args": { "External id": 185076, "cbid": 211, "correlation": 185076 } }, { "ph": "s", "id": 185076, "pid": 76337, "tid": -914061504, "ts": 1716454224291894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224347469, "dur": 6, "args": { "External id": 185082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185082, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185082, "pid": 5, "tid": 7, "ts": 1716454224347469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291920, "dur": 8, "args": { "External id": 185082, "cbid": 211, "correlation": 185082 } }, { "ph": "s", "id": 185082, "pid": 76337, "tid": -914061504, "ts": 1716454224291920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224347476, "dur": 5, "args": { "External id": 185090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185090, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185090, "pid": 5, "tid": 7, "ts": 1716454224347476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291950, "dur": 8, "args": { "External id": 185090, "cbid": 211, "correlation": 185090 } }, { "ph": "s", "id": 185090, "pid": 76337, "tid": -914061504, "ts": 1716454224291950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224347482, "dur": 4, "args": { "External id": 185098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185098, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185098, "pid": 5, "tid": 7, "ts": 1716454224347482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224291987, "dur": 9, "args": { "External id": 185098, "cbid": 211, "correlation": 185098 } }, { "ph": "s", "id": 185098, "pid": 76337, "tid": -914061504, "ts": 1716454224291987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224347488, "dur": 11, "args": { "External id": 185107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185107, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185107, "pid": 5, "tid": 7, "ts": 1716454224347488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292032, "dur": 11, "args": { "External id": 185107, "cbid": 211, "correlation": 185107 } }, { "ph": "s", "id": 185107, "pid": 76337, "tid": -914061504, "ts": 1716454224292032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224347500, "dur": 12, "args": { "External id": 185127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185127, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185127, "pid": 5, "tid": 7, "ts": 1716454224347500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292106, "dur": 12, "args": { "External id": 185127, "cbid": 211, "correlation": 185127 } }, { "ph": "s", "id": 185127, "pid": 76337, "tid": -914061504, "ts": 1716454224292106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224347514, "dur": 4, "args": { "External id": 185139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185139, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185139, "pid": 5, "tid": 7, "ts": 1716454224347514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292127, "dur": 6, "args": { "External id": 185139, "cbid": 211, "correlation": 185139 } }, { "ph": "s", "id": 185139, "pid": 76337, "tid": -914061504, "ts": 1716454224292127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224347520, "dur": 10, "args": { "External id": 185142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185142, "pid": 5, "tid": 7, "ts": 1716454224347520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292146, "dur": 7, "args": { "External id": 185142, "cbid": 211, "correlation": 185142 } }, { "ph": "s", "id": 185142, "pid": 76337, "tid": -914061504, "ts": 1716454224292146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224347531, "dur": 6, "args": { "External id": 185151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185151, "pid": 5, "tid": 7, "ts": 1716454224347531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292183, "dur": 10, "args": { "External id": 185151, "cbid": 211, "correlation": 185151 } }, { "ph": "s", "id": 185151, "pid": 76337, "tid": -914061504, "ts": 1716454224292183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224292235, "dur": 0, "args": { "External id": 185161, "cbid": 317, "correlation": 185161 } }, { "ph": "f", "id": 185161, "pid": 76337, "tid": -914061504, "ts": 1716454224292235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224292236, "dur": 0, "args": { "External id": 185162, "cbid": 203, "correlation": 185162 } }, { "ph": "f", "id": 185162, "pid": 76337, "tid": -914061504, "ts": 1716454224292236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224292237, "dur": 0, "args": { "External id": 185163, "cbid": 205, "correlation": 185163 } }, { "ph": "f", "id": 185163, "pid": 76337, "tid": -914061504, "ts": 1716454224292237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224347539, "dur": 6, "args": { "External id": 185167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185167, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185167, "pid": 5, "tid": 7, "ts": 1716454224347539, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292251, "dur": 11, "args": { "External id": 185167, "cbid": 211, "correlation": 185167 } }, { "ph": "s", "id": 185167, "pid": 76337, "tid": -914061504, "ts": 1716454224292251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224347547, "dur": 314, "args": { "External id": 185169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185169, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185169, "pid": 5, "tid": 7, "ts": 1716454224347547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292265, "dur": 5, "args": { "External id": 185169, "cbid": 211, "correlation": 185169 } }, { "ph": "s", "id": 185169, "pid": 76337, "tid": -914061504, "ts": 1716454224292265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224347863, "dur": 1, "args": { "External id": 185171, "device": 5, "context": 1, "stream": 7, "correlation": 185171, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 185171, "pid": 5, "tid": 7, "ts": 1716454224347863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224292275, "dur": 6, "args": { "External id": 185171, "cbid": 51, "correlation": 185171 } }, { "ph": "s", "id": 185171, "pid": 76337, "tid": -914061504, "ts": 1716454224292275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224347866, "dur": 490, "args": { "External id": 185172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185172, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185172, "pid": 5, "tid": 7, "ts": 1716454224347866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292282, "dur": 6, "args": { "External id": 185172, "cbid": 211, "correlation": 185172 } }, { "ph": "s", "id": 185172, "pid": 76337, "tid": -914061504, "ts": 1716454224292282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348357, "dur": 6, "args": { "External id": 185174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185174, "pid": 5, "tid": 7, "ts": 1716454224348357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292292, "dur": 6, "args": { "External id": 185174, "cbid": 211, "correlation": 185174 } }, { "ph": "s", "id": 185174, "pid": 76337, "tid": -914061504, "ts": 1716454224292292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224348364, "dur": 6, "args": { "External id": 185180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185180, "pid": 5, "tid": 7, "ts": 1716454224348364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292320, "dur": 8, "args": { "External id": 185180, "cbid": 211, "correlation": 185180 } }, { "ph": "s", "id": 185180, "pid": 76337, "tid": -914061504, "ts": 1716454224292320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224348371, "dur": 3, "args": { "External id": 185188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185188, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 185188, "pid": 5, "tid": 7, "ts": 1716454224348371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292363, "dur": 9, "args": { "External id": 185188, "cbid": 211, "correlation": 185188 } }, { "ph": "s", "id": 185188, "pid": 76337, "tid": -914061504, "ts": 1716454224292363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224292426, "dur": 1, "args": { "External id": 185204, "cbid": 251, "correlation": 185204 } }, { "ph": "f", "id": 185204, "pid": 76337, "tid": -914061504, "ts": 1716454224292426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224292431, "dur": 0, "args": { "External id": 185206, "cbid": 251, "correlation": 185206 } }, { "ph": "f", "id": 185206, "pid": 76337, "tid": -914061504, "ts": 1716454224292431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224348376, "dur": 12, "args": { "External id": 185207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185207, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185207, "pid": 5, "tid": 7, "ts": 1716454224348376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292433, "dur": 11, "args": { "External id": 185207, "cbid": 211, "correlation": 185207 } }, { "ph": "s", "id": 185207, "pid": 76337, "tid": -914061504, "ts": 1716454224292433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224348389, "dur": 5, "args": { "External id": 185209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185209, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185209, "pid": 5, "tid": 7, "ts": 1716454224348389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292446, "dur": 6, "args": { "External id": 185209, "cbid": 211, "correlation": 185209 } }, { "ph": "s", "id": 185209, "pid": 76337, "tid": -914061504, "ts": 1716454224292446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224348395, "dur": 6, "args": { "External id": 185219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185219, "pid": 5, "tid": 7, "ts": 1716454224348395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292502, "dur": 12, "args": { "External id": 185219, "cbid": 211, "correlation": 185219 } }, { "ph": "s", "id": 185219, "pid": 76337, "tid": -914061504, "ts": 1716454224292502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224348402, "dur": 9, "args": { "External id": 185239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185239, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185239, "pid": 5, "tid": 7, "ts": 1716454224348402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292567, "dur": 11, "args": { "External id": 185239, "cbid": 211, "correlation": 185239 } }, { "ph": "s", "id": 185239, "pid": 76337, "tid": -914061504, "ts": 1716454224292567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224348413, "dur": 4, "args": { "External id": 185251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185251, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 185251, "pid": 5, "tid": 7, "ts": 1716454224348413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292588, "dur": 6, "args": { "External id": 185251, "cbid": 211, "correlation": 185251 } }, { "ph": "s", "id": 185251, "pid": 76337, "tid": -914061504, "ts": 1716454224292588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224348418, "dur": 7, "args": { "External id": 185254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185254, "pid": 5, "tid": 7, "ts": 1716454224348418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292608, "dur": 6, "args": { "External id": 185254, "cbid": 211, "correlation": 185254 } }, { "ph": "s", "id": 185254, "pid": 76337, "tid": -914061504, "ts": 1716454224292608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224348426, "dur": 4, "args": { "External id": 185263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185263, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185263, "pid": 5, "tid": 7, "ts": 1716454224348426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292646, "dur": 10, "args": { "External id": 185263, "cbid": 211, "correlation": 185263 } }, { "ph": "s", "id": 185263, "pid": 76337, "tid": -914061504, "ts": 1716454224292646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224292713, "dur": 0, "args": { "External id": 185273, "cbid": 317, "correlation": 185273 } }, { "ph": "f", "id": 185273, "pid": 76337, "tid": -914061504, "ts": 1716454224292713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224292714, "dur": 0, "args": { "External id": 185274, "cbid": 203, "correlation": 185274 } }, { "ph": "f", "id": 185274, "pid": 76337, "tid": -914061504, "ts": 1716454224292714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224292715, "dur": 0, "args": { "External id": 185275, "cbid": 205, "correlation": 185275 } }, { "ph": "f", "id": 185275, "pid": 76337, "tid": -914061504, "ts": 1716454224292715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348432, "dur": 5, "args": { "External id": 185279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185279, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185279, "pid": 5, "tid": 7, "ts": 1716454224348432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292728, "dur": 12, "args": { "External id": 185279, "cbid": 211, "correlation": 185279 } }, { "ph": "s", "id": 185279, "pid": 76337, "tid": -914061504, "ts": 1716454224292728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348438, "dur": 159, "args": { "External id": 185281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185281, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185281, "pid": 5, "tid": 7, "ts": 1716454224348438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292742, "dur": 5, "args": { "External id": 185281, "cbid": 211, "correlation": 185281 } }, { "ph": "s", "id": 185281, "pid": 76337, "tid": -914061504, "ts": 1716454224292742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224348599, "dur": 1, "args": { "External id": 185283, "device": 5, "context": 1, "stream": 7, "correlation": 185283, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 185283, "pid": 5, "tid": 7, "ts": 1716454224348599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224292753, "dur": 7, "args": { "External id": 185283, "cbid": 51, "correlation": 185283 } }, { "ph": "s", "id": 185283, "pid": 76337, "tid": -914061504, "ts": 1716454224292753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224348603, "dur": 254, "args": { "External id": 185284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185284, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185284, "pid": 5, "tid": 7, "ts": 1716454224348603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292761, "dur": 6, "args": { "External id": 185284, "cbid": 211, "correlation": 185284 } }, { "ph": "s", "id": 185284, "pid": 76337, "tid": -914061504, "ts": 1716454224292761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348859, "dur": 6, "args": { "External id": 185286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185286, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185286, "pid": 5, "tid": 7, "ts": 1716454224348859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292771, "dur": 5, "args": { "External id": 185286, "cbid": 211, "correlation": 185286 } }, { "ph": "s", "id": 185286, "pid": 76337, "tid": -914061504, "ts": 1716454224292771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224348866, "dur": 6, "args": { "External id": 185292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185292, "pid": 5, "tid": 7, "ts": 1716454224348866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292799, "dur": 9, "args": { "External id": 185292, "cbid": 211, "correlation": 185292 } }, { "ph": "s", "id": 185292, "pid": 76337, "tid": -914061504, "ts": 1716454224292799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224292857, "dur": 0, "args": { "External id": 185302, "cbid": 317, "correlation": 185302 } }, { "ph": "f", "id": 185302, "pid": 76337, "tid": -914061504, "ts": 1716454224292857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224292858, "dur": 0, "args": { "External id": 185303, "cbid": 203, "correlation": 185303 } }, { "ph": "f", "id": 185303, "pid": 76337, "tid": -914061504, "ts": 1716454224292858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224292859, "dur": 0, "args": { "External id": 185304, "cbid": 205, "correlation": 185304 } }, { "ph": "f", "id": 185304, "pid": 76337, "tid": -914061504, "ts": 1716454224292859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348874, "dur": 7, "args": { "External id": 185308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185308, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185308, "pid": 5, "tid": 7, "ts": 1716454224348874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292871, "dur": 11, "args": { "External id": 185308, "cbid": 211, "correlation": 185308 } }, { "ph": "s", "id": 185308, "pid": 76337, "tid": -914061504, "ts": 1716454224292871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224348882, "dur": 3, "args": { "External id": 185310, "device": 5, "context": 1, "stream": 7, "correlation": 185310, "bytes": 4800, "memory bandwidth (GB/s)": 1.530612244897959 } }, { "ph": "f", "id": 185310, "pid": 5, "tid": 7, "ts": 1716454224348882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224292887, "dur": 10, "args": { "External id": 185310, "cbid": 51, "correlation": 185310 } }, { "ph": "s", "id": 185310, "pid": 76337, "tid": -914061504, "ts": 1716454224292887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224348886, "dur": 93, "args": { "External id": 185311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185311, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 185311, "pid": 5, "tid": 7, "ts": 1716454224348886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292899, "dur": 6, "args": { "External id": 185311, "cbid": 211, "correlation": 185311 } }, { "ph": "s", "id": 185311, "pid": 76337, "tid": -914061504, "ts": 1716454224292899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224348981, "dur": 5, "args": { "External id": 185313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185313, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185313, "pid": 5, "tid": 7, "ts": 1716454224348981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292908, "dur": 5, "args": { "External id": 185313, "cbid": 211, "correlation": 185313 } }, { "ph": "s", "id": 185313, "pid": 76337, "tid": -914061504, "ts": 1716454224292908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224348987, "dur": 6, "args": { "External id": 185319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185319, "pid": 5, "tid": 7, "ts": 1716454224348987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292935, "dur": 8, "args": { "External id": 185319, "cbid": 211, "correlation": 185319 } }, { "ph": "s", "id": 185319, "pid": 76337, "tid": -914061504, "ts": 1716454224292935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224348994, "dur": 5, "args": { "External id": 185327, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185327, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185327, "pid": 5, "tid": 7, "ts": 1716454224348994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224292963, "dur": 8, "args": { "External id": 185327, "cbid": 211, "correlation": 185327 } }, { "ph": "s", "id": 185327, "pid": 76337, "tid": -914061504, "ts": 1716454224292963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224349001, "dur": 4, "args": { "External id": 185335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185335, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 185335, "pid": 5, "tid": 7, "ts": 1716454224349001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293000, "dur": 9, "args": { "External id": 185335, "cbid": 211, "correlation": 185335 } }, { "ph": "s", "id": 185335, "pid": 76337, "tid": -914061504, "ts": 1716454224293000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224349006, "dur": 14, "args": { "External id": 185346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185346, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185346, "pid": 5, "tid": 7, "ts": 1716454224349006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293102, "dur": 14, "args": { "External id": 185346, "cbid": 211, "correlation": 185346 } }, { "ph": "s", "id": 185346, "pid": 76337, "tid": -914061504, "ts": 1716454224293102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224293159, "dur": 0, "args": { "External id": 185356, "cbid": 317, "correlation": 185356 } }, { "ph": "f", "id": 185356, "pid": 76337, "tid": -914061504, "ts": 1716454224293159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224293160, "dur": 0, "args": { "External id": 185357, "cbid": 203, "correlation": 185357 } }, { "ph": "f", "id": 185357, "pid": 76337, "tid": -914061504, "ts": 1716454224293160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224293160, "dur": 0, "args": { "External id": 185358, "cbid": 205, "correlation": 185358 } }, { "ph": "f", "id": 185358, "pid": 76337, "tid": -914061504, "ts": 1716454224293160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224349022, "dur": 8, "args": { "External id": 185362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185362, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185362, "pid": 5, "tid": 7, "ts": 1716454224349022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293176, "dur": 11, "args": { "External id": 185362, "cbid": 211, "correlation": 185362 } }, { "ph": "s", "id": 185362, "pid": 76337, "tid": -914061504, "ts": 1716454224293176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224349031, "dur": 160, "args": { "External id": 185364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185364, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185364, "pid": 5, "tid": 7, "ts": 1716454224349031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293189, "dur": 5, "args": { "External id": 185364, "cbid": 211, "correlation": 185364 } }, { "ph": "s", "id": 185364, "pid": 76337, "tid": -914061504, "ts": 1716454224293189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224349193, "dur": 1, "args": { "External id": 185366, "device": 5, "context": 1, "stream": 7, "correlation": 185366, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 185366, "pid": 5, "tid": 7, "ts": 1716454224349193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224293200, "dur": 6, "args": { "External id": 185366, "cbid": 51, "correlation": 185366 } }, { "ph": "s", "id": 185366, "pid": 76337, "tid": -914061504, "ts": 1716454224293200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224349197, "dur": 643, "args": { "External id": 185367, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185367, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185367, "pid": 5, "tid": 7, "ts": 1716454224349197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293207, "dur": 6, "args": { "External id": 185367, "cbid": 211, "correlation": 185367 } }, { "ph": "s", "id": 185367, "pid": 76337, "tid": -914061504, "ts": 1716454224293207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224349842, "dur": 12, "args": { "External id": 185369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185369, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185369, "pid": 5, "tid": 7, "ts": 1716454224349842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293217, "dur": 6, "args": { "External id": 185369, "cbid": 211, "correlation": 185369 } }, { "ph": "s", "id": 185369, "pid": 76337, "tid": -914061504, "ts": 1716454224293217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224349855, "dur": 15, "args": { "External id": 185375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185375, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185375, "pid": 5, "tid": 7, "ts": 1716454224349855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293247, "dur": 9, "args": { "External id": 185375, "cbid": 211, "correlation": 185375 } }, { "ph": "s", "id": 185375, "pid": 76337, "tid": -914061504, "ts": 1716454224293247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224349871, "dur": 30, "args": { "External id": 185384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185384, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185384, "pid": 5, "tid": 7, "ts": 1716454224349871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293345, "dur": 13, "args": { "External id": 185384, "cbid": 211, "correlation": 185384 } }, { "ph": "s", "id": 185384, "pid": 76337, "tid": -914061504, "ts": 1716454224293345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224349902, "dur": 31, "args": { "External id": 185404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185404, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185404, "pid": 5, "tid": 7, "ts": 1716454224349902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293417, "dur": 12, "args": { "External id": 185404, "cbid": 211, "correlation": 185404 } }, { "ph": "s", "id": 185404, "pid": 76337, "tid": -914061504, "ts": 1716454224293417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224349934, "dur": 5, "args": { "External id": 185416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185416, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185416, "pid": 5, "tid": 7, "ts": 1716454224349934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293439, "dur": 6, "args": { "External id": 185416, "cbid": 211, "correlation": 185416 } }, { "ph": "s", "id": 185416, "pid": 76337, "tid": -914061504, "ts": 1716454224293439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224349940, "dur": 30, "args": { "External id": 185419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185419, "pid": 5, "tid": 7, "ts": 1716454224349940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293457, "dur": 6, "args": { "External id": 185419, "cbid": 211, "correlation": 185419 } }, { "ph": "s", "id": 185419, "pid": 76337, "tid": -914061504, "ts": 1716454224293457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224349972, "dur": 22, "args": { "External id": 185428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185428, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185428, "pid": 5, "tid": 7, "ts": 1716454224349972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293496, "dur": 10, "args": { "External id": 185428, "cbid": 211, "correlation": 185428 } }, { "ph": "s", "id": 185428, "pid": 76337, "tid": -914061504, "ts": 1716454224293496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224293550, "dur": 0, "args": { "External id": 185438, "cbid": 317, "correlation": 185438 } }, { "ph": "f", "id": 185438, "pid": 76337, "tid": -914061504, "ts": 1716454224293550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224293551, "dur": 0, "args": { "External id": 185439, "cbid": 203, "correlation": 185439 } }, { "ph": "f", "id": 185439, "pid": 76337, "tid": -914061504, "ts": 1716454224293551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224293552, "dur": 0, "args": { "External id": 185440, "cbid": 205, "correlation": 185440 } }, { "ph": "f", "id": 185440, "pid": 76337, "tid": -914061504, "ts": 1716454224293552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224349994, "dur": 22, "args": { "External id": 185444, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185444, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185444, "pid": 5, "tid": 7, "ts": 1716454224349994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293567, "dur": 12, "args": { "External id": 185444, "cbid": 211, "correlation": 185444 } }, { "ph": "s", "id": 185444, "pid": 76337, "tid": -914061504, "ts": 1716454224293567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224350017, "dur": 313, "args": { "External id": 185446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185446, "pid": 5, "tid": 7, "ts": 1716454224350017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293582, "dur": 6, "args": { "External id": 185446, "cbid": 211, "correlation": 185446 } }, { "ph": "s", "id": 185446, "pid": 76337, "tid": -914061504, "ts": 1716454224293582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224350333, "dur": 1, "args": { "External id": 185448, "device": 5, "context": 1, "stream": 7, "correlation": 185448, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 185448, "pid": 5, "tid": 7, "ts": 1716454224350333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224293593, "dur": 7, "args": { "External id": 185448, "cbid": 51, "correlation": 185448 } }, { "ph": "s", "id": 185448, "pid": 76337, "tid": -914061504, "ts": 1716454224293593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224350336, "dur": 1228, "args": { "External id": 185449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185449, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185449, "pid": 5, "tid": 7, "ts": 1716454224350336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293601, "dur": 6, "args": { "External id": 185449, "cbid": 211, "correlation": 185449 } }, { "ph": "s", "id": 185449, "pid": 76337, "tid": -914061504, "ts": 1716454224293601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224351566, "dur": 13, "args": { "External id": 185451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185451, "pid": 5, "tid": 7, "ts": 1716454224351566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293611, "dur": 5, "args": { "External id": 185451, "cbid": 211, "correlation": 185451 } }, { "ph": "s", "id": 185451, "pid": 76337, "tid": -914061504, "ts": 1716454224293611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224351580, "dur": 15, "args": { "External id": 185457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185457, "pid": 5, "tid": 7, "ts": 1716454224351580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293640, "dur": 9, "args": { "External id": 185457, "cbid": 211, "correlation": 185457 } }, { "ph": "s", "id": 185457, "pid": 76337, "tid": -914061504, "ts": 1716454224293640, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224351596, "dur": 3, "args": { "External id": 185465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185465, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 185465, "pid": 5, "tid": 7, "ts": 1716454224351596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293684, "dur": 10, "args": { "External id": 185465, "cbid": 211, "correlation": 185465 } }, { "ph": "s", "id": 185465, "pid": 76337, "tid": -914061504, "ts": 1716454224293684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224293748, "dur": 1, "args": { "External id": 185481, "cbid": 251, "correlation": 185481 } }, { "ph": "f", "id": 185481, "pid": 76337, "tid": -914061504, "ts": 1716454224293748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224293753, "dur": 0, "args": { "External id": 185483, "cbid": 251, "correlation": 185483 } }, { "ph": "f", "id": 185483, "pid": 76337, "tid": -914061504, "ts": 1716454224293753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224351601, "dur": 12, "args": { "External id": 185484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185484, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185484, "pid": 5, "tid": 7, "ts": 1716454224351601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293755, "dur": 11, "args": { "External id": 185484, "cbid": 211, "correlation": 185484 } }, { "ph": "s", "id": 185484, "pid": 76337, "tid": -914061504, "ts": 1716454224293755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224351614, "dur": 5, "args": { "External id": 185486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185486, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185486, "pid": 5, "tid": 7, "ts": 1716454224351614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293768, "dur": 5, "args": { "External id": 185486, "cbid": 211, "correlation": 185486 } }, { "ph": "s", "id": 185486, "pid": 76337, "tid": -914061504, "ts": 1716454224293768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224351620, "dur": 17, "args": { "External id": 185496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185496, "pid": 5, "tid": 7, "ts": 1716454224351620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293824, "dur": 12, "args": { "External id": 185496, "cbid": 211, "correlation": 185496 } }, { "ph": "s", "id": 185496, "pid": 76337, "tid": -914061504, "ts": 1716454224293824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224351638, "dur": 19, "args": { "External id": 185516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185516, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185516, "pid": 5, "tid": 7, "ts": 1716454224351638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293890, "dur": 11, "args": { "External id": 185516, "cbid": 211, "correlation": 185516 } }, { "ph": "s", "id": 185516, "pid": 76337, "tid": -914061504, "ts": 1716454224293890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224351658, "dur": 4, "args": { "External id": 185528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185528, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 185528, "pid": 5, "tid": 7, "ts": 1716454224351658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293911, "dur": 6, "args": { "External id": 185528, "cbid": 211, "correlation": 185528 } }, { "ph": "s", "id": 185528, "pid": 76337, "tid": -914061504, "ts": 1716454224293911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224351664, "dur": 16, "args": { "External id": 185531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185531, "pid": 5, "tid": 7, "ts": 1716454224351664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293929, "dur": 6, "args": { "External id": 185531, "cbid": 211, "correlation": 185531 } }, { "ph": "s", "id": 185531, "pid": 76337, "tid": -914061504, "ts": 1716454224293929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224351681, "dur": 12, "args": { "External id": 185540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185540, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185540, "pid": 5, "tid": 7, "ts": 1716454224351681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224293969, "dur": 19, "args": { "External id": 185540, "cbid": 211, "correlation": 185540 } }, { "ph": "s", "id": 185540, "pid": 76337, "tid": -914061504, "ts": 1716454224293969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224294042, "dur": 0, "args": { "External id": 185550, "cbid": 317, "correlation": 185550 } }, { "ph": "f", "id": 185550, "pid": 76337, "tid": -914061504, "ts": 1716454224294042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224294043, "dur": 0, "args": { "External id": 185551, "cbid": 203, "correlation": 185551 } }, { "ph": "f", "id": 185551, "pid": 76337, "tid": -914061504, "ts": 1716454224294043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224294044, "dur": 0, "args": { "External id": 185552, "cbid": 205, "correlation": 185552 } }, { "ph": "f", "id": 185552, "pid": 76337, "tid": -914061504, "ts": 1716454224294044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224351694, "dur": 11, "args": { "External id": 185556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185556, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185556, "pid": 5, "tid": 7, "ts": 1716454224351694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294058, "dur": 12, "args": { "External id": 185556, "cbid": 211, "correlation": 185556 } }, { "ph": "s", "id": 185556, "pid": 76337, "tid": -914061504, "ts": 1716454224294058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224351706, "dur": 160, "args": { "External id": 185558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185558, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185558, "pid": 5, "tid": 7, "ts": 1716454224351706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294072, "dur": 5, "args": { "External id": 185558, "cbid": 211, "correlation": 185558 } }, { "ph": "s", "id": 185558, "pid": 76337, "tid": -914061504, "ts": 1716454224294072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224351868, "dur": 1, "args": { "External id": 185560, "device": 5, "context": 1, "stream": 7, "correlation": 185560, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 185560, "pid": 5, "tid": 7, "ts": 1716454224351868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224294083, "dur": 7, "args": { "External id": 185560, "cbid": 51, "correlation": 185560 } }, { "ph": "s", "id": 185560, "pid": 76337, "tid": -914061504, "ts": 1716454224294083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224351872, "dur": 640, "args": { "External id": 185561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185561, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185561, "pid": 5, "tid": 7, "ts": 1716454224351872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294091, "dur": 7, "args": { "External id": 185561, "cbid": 211, "correlation": 185561 } }, { "ph": "s", "id": 185561, "pid": 76337, "tid": -914061504, "ts": 1716454224294091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224352513, "dur": 13, "args": { "External id": 185563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185563, "pid": 5, "tid": 7, "ts": 1716454224352513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294102, "dur": 5, "args": { "External id": 185563, "cbid": 211, "correlation": 185563 } }, { "ph": "s", "id": 185563, "pid": 76337, "tid": -914061504, "ts": 1716454224294102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224352528, "dur": 14, "args": { "External id": 185569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185569, "pid": 5, "tid": 7, "ts": 1716454224352528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294130, "dur": 8, "args": { "External id": 185569, "cbid": 211, "correlation": 185569 } }, { "ph": "s", "id": 185569, "pid": 76337, "tid": -914061504, "ts": 1716454224294130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224294188, "dur": 0, "args": { "External id": 185579, "cbid": 317, "correlation": 185579 } }, { "ph": "f", "id": 185579, "pid": 76337, "tid": -914061504, "ts": 1716454224294188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224294189, "dur": 0, "args": { "External id": 185580, "cbid": 203, "correlation": 185580 } }, { "ph": "f", "id": 185580, "pid": 76337, "tid": -914061504, "ts": 1716454224294189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224294190, "dur": 0, "args": { "External id": 185581, "cbid": 205, "correlation": 185581 } }, { "ph": "f", "id": 185581, "pid": 76337, "tid": -914061504, "ts": 1716454224294190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224352543, "dur": 21, "args": { "External id": 185585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185585, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185585, "pid": 5, "tid": 7, "ts": 1716454224352543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294205, "dur": 11, "args": { "External id": 185585, "cbid": 211, "correlation": 185585 } }, { "ph": "s", "id": 185585, "pid": 76337, "tid": -914061504, "ts": 1716454224294205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224352566, "dur": 4, "args": { "External id": 185587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 185587, "pid": 5, "tid": 7, "ts": 1716454224352566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294221, "dur": 6, "args": { "External id": 185587, "cbid": 211, "correlation": 185587 } }, { "ph": "s", "id": 185587, "pid": 76337, "tid": -914061504, "ts": 1716454224294221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224294231, "dur": 0, "args": { "External id": 185588, "cbid": 51, "correlation": 185588 } }, { "ph": "s", "id": 185588, "pid": 76337, "tid": -914061504, "ts": 1716454224294231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224352571, "dur": 173, "args": { "External id": 185589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185589, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 185589, "pid": 5, "tid": 7, "ts": 1716454224352571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294231, "dur": 6, "args": { "External id": 185589, "cbid": 211, "correlation": 185589 } }, { "ph": "s", "id": 185589, "pid": 76337, "tid": -914061504, "ts": 1716454224294231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224352745, "dur": 15, "args": { "External id": 185594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185594, "pid": 5, "tid": 7, "ts": 1716454224352745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294258, "dur": 9, "args": { "External id": 185594, "cbid": 211, "correlation": 185594 } }, { "ph": "s", "id": 185594, "pid": 76337, "tid": -914061504, "ts": 1716454224294258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224352762, "dur": 12, "args": { "External id": 185602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185602, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185602, "pid": 5, "tid": 7, "ts": 1716454224352762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294287, "dur": 8, "args": { "External id": 185602, "cbid": 211, "correlation": 185602 } }, { "ph": "s", "id": 185602, "pid": 76337, "tid": -914061504, "ts": 1716454224294287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224352774, "dur": 10, "args": { "External id": 185610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185610, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185610, "pid": 5, "tid": 7, "ts": 1716454224352774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294316, "dur": 8, "args": { "External id": 185610, "cbid": 211, "correlation": 185610 } }, { "ph": "s", "id": 185610, "pid": 76337, "tid": -914061504, "ts": 1716454224294316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224352786, "dur": 18, "args": { "External id": 185630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185630, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 185630, "pid": 5, "tid": 7, "ts": 1716454224352786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294397, "dur": 13, "args": { "External id": 185630, "cbid": 211, "correlation": 185630 } }, { "ph": "s", "id": 185630, "pid": 76337, "tid": -914061504, "ts": 1716454224294397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224352805, "dur": 4, "args": { "External id": 185642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185642, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 185642, "pid": 5, "tid": 7, "ts": 1716454224352805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294420, "dur": 6, "args": { "External id": 185642, "cbid": 211, "correlation": 185642 } }, { "ph": "s", "id": 185642, "pid": 76337, "tid": -914061504, "ts": 1716454224294420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224352810, "dur": 16, "args": { "External id": 185645, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185645, "pid": 5, "tid": 7, "ts": 1716454224352810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294438, "dur": 6, "args": { "External id": 185645, "cbid": 211, "correlation": 185645 } }, { "ph": "s", "id": 185645, "pid": 76337, "tid": -914061504, "ts": 1716454224294438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224294495, "dur": 0, "args": { "External id": 185656, "cbid": 317, "correlation": 185656 } }, { "ph": "f", "id": 185656, "pid": 76337, "tid": -914061504, "ts": 1716454224294495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224294496, "dur": 0, "args": { "External id": 185657, "cbid": 203, "correlation": 185657 } }, { "ph": "f", "id": 185657, "pid": 76337, "tid": -914061504, "ts": 1716454224294496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224294497, "dur": 0, "args": { "External id": 185658, "cbid": 205, "correlation": 185658 } }, { "ph": "f", "id": 185658, "pid": 76337, "tid": -914061504, "ts": 1716454224294497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224352828, "dur": 11, "args": { "External id": 185662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185662, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185662, "pid": 5, "tid": 7, "ts": 1716454224352828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294511, "dur": 12, "args": { "External id": 185662, "cbid": 211, "correlation": 185662 } }, { "ph": "s", "id": 185662, "pid": 76337, "tid": -914061504, "ts": 1716454224294511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224352841, "dur": 3, "args": { "External id": 185664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185664, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 185664, "pid": 5, "tid": 7, "ts": 1716454224352841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294528, "dur": 6, "args": { "External id": 185664, "cbid": 211, "correlation": 185664 } }, { "ph": "s", "id": 185664, "pid": 76337, "tid": -914061504, "ts": 1716454224294528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224294536, "dur": 0, "args": { "External id": 185665, "cbid": 51, "correlation": 185665 } }, { "ph": "s", "id": 185665, "pid": 76337, "tid": -914061504, "ts": 1716454224294536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224352845, "dur": 89, "args": { "External id": 185666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185666, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 185666, "pid": 5, "tid": 7, "ts": 1716454224352845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294537, "dur": 5, "args": { "External id": 185666, "cbid": 211, "correlation": 185666 } }, { "ph": "s", "id": 185666, "pid": 76337, "tid": -914061504, "ts": 1716454224294537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224352935, "dur": 16, "args": { "External id": 185671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185671, "pid": 5, "tid": 7, "ts": 1716454224352935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294564, "dur": 8, "args": { "External id": 185671, "cbid": 211, "correlation": 185671 } }, { "ph": "s", "id": 185671, "pid": 76337, "tid": -914061504, "ts": 1716454224294564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224352952, "dur": 83, "args": { "External id": 185680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185680, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185680, "pid": 5, "tid": 7, "ts": 1716454224352952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294645, "dur": 16, "args": { "External id": 185680, "cbid": 211, "correlation": 185680 } }, { "ph": "s", "id": 185680, "pid": 76337, "tid": -914061504, "ts": 1716454224294645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224353036, "dur": 29, "args": { "External id": 185702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185702, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185702, "pid": 5, "tid": 7, "ts": 1716454224353036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294705, "dur": 11, "args": { "External id": 185702, "cbid": 211, "correlation": 185702 } }, { "ph": "s", "id": 185702, "pid": 76337, "tid": -914061504, "ts": 1716454224294705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224294802, "dur": 2, "args": { "External id": 185713, "cbid": 251, "correlation": 185713 } }, { "ph": "f", "id": 185713, "pid": 76337, "tid": -914061504, "ts": 1716454224294802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224353067, "dur": 140, "args": { "External id": 185714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185714, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185714, "pid": 5, "tid": 7, "ts": 1716454224353067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294809, "dur": 14, "args": { "External id": 185714, "cbid": 211, "correlation": 185714 } }, { "ph": "s", "id": 185714, "pid": 76337, "tid": -914061504, "ts": 1716454224294809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224294880, "dur": 1, "args": { "External id": 185725, "cbid": 251, "correlation": 185725 } }, { "ph": "f", "id": 185725, "pid": 76337, "tid": -914061504, "ts": 1716454224294880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224353208, "dur": 157, "args": { "External id": 185726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185726, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185726, "pid": 5, "tid": 7, "ts": 1716454224353208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294884, "dur": 11, "args": { "External id": 185726, "cbid": 211, "correlation": 185726 } }, { "ph": "s", "id": 185726, "pid": 76337, "tid": -914061504, "ts": 1716454224294884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224294949, "dur": 1, "args": { "External id": 185737, "cbid": 251, "correlation": 185737 } }, { "ph": "f", "id": 185737, "pid": 76337, "tid": -914061504, "ts": 1716454224294949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224353366, "dur": 135, "args": { "External id": 185738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185738, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185738, "pid": 5, "tid": 7, "ts": 1716454224353366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224294953, "dur": 12, "args": { "External id": 185738, "cbid": 211, "correlation": 185738 } }, { "ph": "s", "id": 185738, "pid": 76337, "tid": -914061504, "ts": 1716454224294953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224353503, "dur": 332, "args": { "External id": 185763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185763, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185763, "pid": 5, "tid": 7, "ts": 1716454224353503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295051, "dur": 14, "args": { "External id": 185763, "cbid": 211, "correlation": 185763 } }, { "ph": "s", "id": 185763, "pid": 76337, "tid": -914061504, "ts": 1716454224295051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295154, "dur": 1, "args": { "External id": 185781, "cbid": 251, "correlation": 185781 } }, { "ph": "f", "id": 185781, "pid": 76337, "tid": -914061504, "ts": 1716454224295154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224353836, "dur": 164, "args": { "External id": 185783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185783, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185783, "pid": 5, "tid": 7, "ts": 1716454224353836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295160, "dur": 14, "args": { "External id": 185783, "cbid": 211, "correlation": 185783 } }, { "ph": "s", "id": 185783, "pid": 76337, "tid": -914061504, "ts": 1716454224295160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224354002, "dur": 19, "args": { "External id": 185791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185791, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185791, "pid": 5, "tid": 7, "ts": 1716454224354002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295231, "dur": 12, "args": { "External id": 185791, "cbid": 211, "correlation": 185791 } }, { "ph": "s", "id": 185791, "pid": 76337, "tid": -914061504, "ts": 1716454224295231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224354023, "dur": 27, "args": { "External id": 185799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185799, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185799, "pid": 5, "tid": 7, "ts": 1716454224354023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295270, "dur": 8, "args": { "External id": 185799, "cbid": 211, "correlation": 185799 } }, { "ph": "s", "id": 185799, "pid": 76337, "tid": -914061504, "ts": 1716454224295270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224354051, "dur": 19, "args": { "External id": 185810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185810, "pid": 5, "tid": 7, "ts": 1716454224354051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295342, "dur": 13, "args": { "External id": 185810, "cbid": 211, "correlation": 185810 } }, { "ph": "s", "id": 185810, "pid": 76337, "tid": -914061504, "ts": 1716454224295342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224354071, "dur": 16, "args": { "External id": 185832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185832, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185832, "pid": 5, "tid": 7, "ts": 1716454224354071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295374, "dur": 7, "args": { "External id": 185832, "cbid": 211, "correlation": 185832 } }, { "ph": "s", "id": 185832, "pid": 76337, "tid": -914061504, "ts": 1716454224295374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295458, "dur": 1, "args": { "External id": 185843, "cbid": 251, "correlation": 185843 } }, { "ph": "f", "id": 185843, "pid": 76337, "tid": -914061504, "ts": 1716454224295458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224354089, "dur": 88, "args": { "External id": 185844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185844, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 185844, "pid": 5, "tid": 7, "ts": 1716454224354089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295464, "dur": 14, "args": { "External id": 185844, "cbid": 211, "correlation": 185844 } }, { "ph": "s", "id": 185844, "pid": 76337, "tid": -914061504, "ts": 1716454224295464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295535, "dur": 1, "args": { "External id": 185855, "cbid": 251, "correlation": 185855 } }, { "ph": "f", "id": 185855, "pid": 76337, "tid": -914061504, "ts": 1716454224295535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295538, "dur": 0, "args": { "External id": 185856, "cbid": 251, "correlation": 185856 } }, { "ph": "f", "id": 185856, "pid": 76337, "tid": -914061504, "ts": 1716454224295538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224354178, "dur": 12, "args": { "External id": 185857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185857, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185857, "pid": 5, "tid": 7, "ts": 1716454224354178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295540, "dur": 12, "args": { "External id": 185857, "cbid": 211, "correlation": 185857 } }, { "ph": "s", "id": 185857, "pid": 76337, "tid": -914061504, "ts": 1716454224295540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224354192, "dur": 5, "args": { "External id": 185859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185859, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185859, "pid": 5, "tid": 7, "ts": 1716454224354192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295553, "dur": 6, "args": { "External id": 185859, "cbid": 211, "correlation": 185859 } }, { "ph": "s", "id": 185859, "pid": 76337, "tid": -914061504, "ts": 1716454224295553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295611, "dur": 1, "args": { "External id": 185870, "cbid": 251, "correlation": 185870 } }, { "ph": "f", "id": 185870, "pid": 76337, "tid": -914061504, "ts": 1716454224295611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295615, "dur": 0, "args": { "External id": 185871, "cbid": 251, "correlation": 185871 } }, { "ph": "f", "id": 185871, "pid": 76337, "tid": -914061504, "ts": 1716454224295615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224354199, "dur": 8, "args": { "External id": 185872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185872, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185872, "pid": 5, "tid": 7, "ts": 1716454224354199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295616, "dur": 12, "args": { "External id": 185872, "cbid": 211, "correlation": 185872 } }, { "ph": "s", "id": 185872, "pid": 76337, "tid": -914061504, "ts": 1716454224295616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224354208, "dur": 3, "args": { "External id": 185874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185874, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185874, "pid": 5, "tid": 7, "ts": 1716454224354208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295630, "dur": 6, "args": { "External id": 185874, "cbid": 211, "correlation": 185874 } }, { "ph": "s", "id": 185874, "pid": 76337, "tid": -914061504, "ts": 1716454224295630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224354213, "dur": 54, "args": { "External id": 185899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185899, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185899, "pid": 5, "tid": 7, "ts": 1716454224354213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295707, "dur": 12, "args": { "External id": 185899, "cbid": 211, "correlation": 185899 } }, { "ph": "s", "id": 185899, "pid": 76337, "tid": -914061504, "ts": 1716454224295707, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224295806, "dur": 1, "args": { "External id": 185917, "cbid": 251, "correlation": 185917 } }, { "ph": "f", "id": 185917, "pid": 76337, "tid": -914061504, "ts": 1716454224295806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224354268, "dur": 88, "args": { "External id": 185919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185919, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 185919, "pid": 5, "tid": 7, "ts": 1716454224354268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295813, "dur": 14, "args": { "External id": 185919, "cbid": 211, "correlation": 185919 } }, { "ph": "s", "id": 185919, "pid": 76337, "tid": -914061504, "ts": 1716454224295813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224354357, "dur": 9, "args": { "External id": 185927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185927, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185927, "pid": 5, "tid": 7, "ts": 1716454224354357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295882, "dur": 12, "args": { "External id": 185927, "cbid": 211, "correlation": 185927 } }, { "ph": "s", "id": 185927, "pid": 76337, "tid": -914061504, "ts": 1716454224295882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224354367, "dur": 20, "args": { "External id": 185935, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185935, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185935, "pid": 5, "tid": 7, "ts": 1716454224354367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295923, "dur": 9, "args": { "External id": 185935, "cbid": 211, "correlation": 185935 } }, { "ph": "s", "id": 185935, "pid": 76337, "tid": -914061504, "ts": 1716454224295923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224354389, "dur": 17, "args": { "External id": 185957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185957, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185957, "pid": 5, "tid": 7, "ts": 1716454224354389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224295982, "dur": 11, "args": { "External id": 185957, "cbid": 211, "correlation": 185957 } }, { "ph": "s", "id": 185957, "pid": 76337, "tid": -914061504, "ts": 1716454224295982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224296073, "dur": 1, "args": { "External id": 185973, "cbid": 251, "correlation": 185973 } }, { "ph": "f", "id": 185973, "pid": 76337, "tid": -914061504, "ts": 1716454224296073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224296078, "dur": 0, "args": { "External id": 185975, "cbid": 251, "correlation": 185975 } }, { "ph": "f", "id": 185975, "pid": 76337, "tid": -914061504, "ts": 1716454224296078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224354407, "dur": 495, "args": { "External id": 185976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185976, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 185976, "pid": 5, "tid": 7, "ts": 1716454224354407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296079, "dur": 13, "args": { "External id": 185976, "cbid": 211, "correlation": 185976 } }, { "ph": "s", "id": 185976, "pid": 76337, "tid": -914061504, "ts": 1716454224296079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224354904, "dur": 65, "args": { "External id": 185984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185984, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185984, "pid": 5, "tid": 7, "ts": 1716454224354904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296148, "dur": 12, "args": { "External id": 185984, "cbid": 211, "correlation": 185984 } }, { "ph": "s", "id": 185984, "pid": 76337, "tid": -914061504, "ts": 1716454224296148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224354970, "dur": 66, "args": { "External id": 185992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 185992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 185992, "pid": 5, "tid": 7, "ts": 1716454224354970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296181, "dur": 9, "args": { "External id": 185992, "cbid": 211, "correlation": 185992 } }, { "ph": "s", "id": 185992, "pid": 76337, "tid": -914061504, "ts": 1716454224296181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224296262, "dur": 1, "args": { "External id": 186008, "cbid": 251, "correlation": 186008 } }, { "ph": "f", "id": 186008, "pid": 76337, "tid": -914061504, "ts": 1716454224296262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224355039, "dur": 1, "args": { "External id": 186010, "device": 5, "context": 1, "stream": 7, "correlation": 186010, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 186010, "pid": 5, "tid": 7, "ts": 1716454224355039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224296267, "dur": 10, "args": { "External id": 186010, "cbid": 51, "correlation": 186010 } }, { "ph": "s", "id": 186010, "pid": 76337, "tid": -914061504, "ts": 1716454224296267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224355042, "dur": 267, "args": { "External id": 186011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186011, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186011, "pid": 5, "tid": 7, "ts": 1716454224355042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296279, "dur": 12, "args": { "External id": 186011, "cbid": 211, "correlation": 186011 } }, { "ph": "s", "id": 186011, "pid": 76337, "tid": -914061504, "ts": 1716454224296279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224355311, "dur": 14, "args": { "External id": 186019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186019, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186019, "pid": 5, "tid": 7, "ts": 1716454224355311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296322, "dur": 10, "args": { "External id": 186019, "cbid": 211, "correlation": 186019 } }, { "ph": "s", "id": 186019, "pid": 76337, "tid": -914061504, "ts": 1716454224296322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224355326, "dur": 37, "args": { "External id": 186030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186030, "pid": 5, "tid": 7, "ts": 1716454224355326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296391, "dur": 12, "args": { "External id": 186030, "cbid": 211, "correlation": 186030 } }, { "ph": "s", "id": 186030, "pid": 76337, "tid": -914061504, "ts": 1716454224296391, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224296456, "dur": 0, "args": { "External id": 186042, "cbid": 317, "correlation": 186042 } }, { "ph": "f", "id": 186042, "pid": 76337, "tid": -914061504, "ts": 1716454224296456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224296457, "dur": 0, "args": { "External id": 186043, "cbid": 203, "correlation": 186043 } }, { "ph": "f", "id": 186043, "pid": 76337, "tid": -914061504, "ts": 1716454224296457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224296458, "dur": 0, "args": { "External id": 186044, "cbid": 205, "correlation": 186044 } }, { "ph": "f", "id": 186044, "pid": 76337, "tid": -914061504, "ts": 1716454224296458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224355365, "dur": 13, "args": { "External id": 186048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186048, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186048, "pid": 5, "tid": 7, "ts": 1716454224355365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296473, "dur": 12, "args": { "External id": 186048, "cbid": 211, "correlation": 186048 } }, { "ph": "s", "id": 186048, "pid": 76337, "tid": -914061504, "ts": 1716454224296473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224355379, "dur": 4, "args": { "External id": 186050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186050, "pid": 5, "tid": 7, "ts": 1716454224355379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296490, "dur": 6, "args": { "External id": 186050, "cbid": 211, "correlation": 186050 } }, { "ph": "s", "id": 186050, "pid": 76337, "tid": -914061504, "ts": 1716454224296490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224296499, "dur": 0, "args": { "External id": 186051, "cbid": 51, "correlation": 186051 } }, { "ph": "s", "id": 186051, "pid": 76337, "tid": -914061504, "ts": 1716454224296499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224355384, "dur": 96, "args": { "External id": 186052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186052, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186052, "pid": 5, "tid": 7, "ts": 1716454224355384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296500, "dur": 5, "args": { "External id": 186052, "cbid": 211, "correlation": 186052 } }, { "ph": "s", "id": 186052, "pid": 76337, "tid": -914061504, "ts": 1716454224296500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224355481, "dur": 16, "args": { "External id": 186057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186057, "pid": 5, "tid": 7, "ts": 1716454224355481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296527, "dur": 9, "args": { "External id": 186057, "cbid": 211, "correlation": 186057 } }, { "ph": "s", "id": 186057, "pid": 76337, "tid": -914061504, "ts": 1716454224296527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224355497, "dur": 12, "args": { "External id": 186065, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186065, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186065, "pid": 5, "tid": 7, "ts": 1716454224355497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296559, "dur": 8, "args": { "External id": 186065, "cbid": 211, "correlation": 186065 } }, { "ph": "s", "id": 186065, "pid": 76337, "tid": -914061504, "ts": 1716454224296559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224355510, "dur": 30, "args": { "External id": 186074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186074, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186074, "pid": 5, "tid": 7, "ts": 1716454224355510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296600, "dur": 10, "args": { "External id": 186074, "cbid": 211, "correlation": 186074 } }, { "ph": "s", "id": 186074, "pid": 76337, "tid": -914061504, "ts": 1716454224296600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224355542, "dur": 31, "args": { "External id": 186094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186094, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 186094, "pid": 5, "tid": 7, "ts": 1716454224355542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296671, "dur": 12, "args": { "External id": 186094, "cbid": 211, "correlation": 186094 } }, { "ph": "s", "id": 186094, "pid": 76337, "tid": -914061504, "ts": 1716454224296671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224355574, "dur": 5, "args": { "External id": 186106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186106, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186106, "pid": 5, "tid": 7, "ts": 1716454224355574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296692, "dur": 7, "args": { "External id": 186106, "cbid": 211, "correlation": 186106 } }, { "ph": "s", "id": 186106, "pid": 76337, "tid": -914061504, "ts": 1716454224296692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224355581, "dur": 30, "args": { "External id": 186109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186109, "pid": 5, "tid": 7, "ts": 1716454224355581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296711, "dur": 6, "args": { "External id": 186109, "cbid": 211, "correlation": 186109 } }, { "ph": "s", "id": 186109, "pid": 76337, "tid": -914061504, "ts": 1716454224296711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224355613, "dur": 21, "args": { "External id": 186118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186118, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186118, "pid": 5, "tid": 7, "ts": 1716454224355613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296750, "dur": 9, "args": { "External id": 186118, "cbid": 211, "correlation": 186118 } }, { "ph": "s", "id": 186118, "pid": 76337, "tid": -914061504, "ts": 1716454224296750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224296806, "dur": 0, "args": { "External id": 186128, "cbid": 317, "correlation": 186128 } }, { "ph": "f", "id": 186128, "pid": 76337, "tid": -914061504, "ts": 1716454224296806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224296807, "dur": 0, "args": { "External id": 186129, "cbid": 203, "correlation": 186129 } }, { "ph": "f", "id": 186129, "pid": 76337, "tid": -914061504, "ts": 1716454224296807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224296808, "dur": 0, "args": { "External id": 186130, "cbid": 205, "correlation": 186130 } }, { "ph": "f", "id": 186130, "pid": 76337, "tid": -914061504, "ts": 1716454224296808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224355635, "dur": 21, "args": { "External id": 186134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186134, "pid": 5, "tid": 7, "ts": 1716454224355635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296821, "dur": 12, "args": { "External id": 186134, "cbid": 211, "correlation": 186134 } }, { "ph": "s", "id": 186134, "pid": 76337, "tid": -914061504, "ts": 1716454224296821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224355658, "dur": 314, "args": { "External id": 186136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186136, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186136, "pid": 5, "tid": 7, "ts": 1716454224355658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296835, "dur": 5, "args": { "External id": 186136, "cbid": 211, "correlation": 186136 } }, { "ph": "s", "id": 186136, "pid": 76337, "tid": -914061504, "ts": 1716454224296835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224355975, "dur": 1, "args": { "External id": 186138, "device": 5, "context": 1, "stream": 7, "correlation": 186138, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 186138, "pid": 5, "tid": 7, "ts": 1716454224355975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224296847, "dur": 8, "args": { "External id": 186138, "cbid": 51, "correlation": 186138 } }, { "ph": "s", "id": 186138, "pid": 76337, "tid": -914061504, "ts": 1716454224296847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224355978, "dur": 1243, "args": { "External id": 186139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186139, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186139, "pid": 5, "tid": 7, "ts": 1716454224355978, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296855, "dur": 6, "args": { "External id": 186139, "cbid": 211, "correlation": 186139 } }, { "ph": "s", "id": 186139, "pid": 76337, "tid": -914061504, "ts": 1716454224296855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224357223, "dur": 13, "args": { "External id": 186141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186141, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186141, "pid": 5, "tid": 7, "ts": 1716454224357223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296865, "dur": 5, "args": { "External id": 186141, "cbid": 211, "correlation": 186141 } }, { "ph": "s", "id": 186141, "pid": 76337, "tid": -914061504, "ts": 1716454224296865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224357237, "dur": 14, "args": { "External id": 186147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186147, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186147, "pid": 5, "tid": 7, "ts": 1716454224357237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296893, "dur": 9, "args": { "External id": 186147, "cbid": 211, "correlation": 186147 } }, { "ph": "s", "id": 186147, "pid": 76337, "tid": -914061504, "ts": 1716454224296893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224357253, "dur": 3, "args": { "External id": 186155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186155, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 186155, "pid": 5, "tid": 7, "ts": 1716454224357253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224296937, "dur": 9, "args": { "External id": 186155, "cbid": 211, "correlation": 186155 } }, { "ph": "s", "id": 186155, "pid": 76337, "tid": -914061504, "ts": 1716454224296937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224297010, "dur": 1, "args": { "External id": 186171, "cbid": 251, "correlation": 186171 } }, { "ph": "f", "id": 186171, "pid": 76337, "tid": -914061504, "ts": 1716454224297010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224297015, "dur": 0, "args": { "External id": 186173, "cbid": 251, "correlation": 186173 } }, { "ph": "f", "id": 186173, "pid": 76337, "tid": -914061504, "ts": 1716454224297015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224357257, "dur": 13, "args": { "External id": 186174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186174, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186174, "pid": 5, "tid": 7, "ts": 1716454224357257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297017, "dur": 11, "args": { "External id": 186174, "cbid": 211, "correlation": 186174 } }, { "ph": "s", "id": 186174, "pid": 76337, "tid": -914061504, "ts": 1716454224297017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224357272, "dur": 5, "args": { "External id": 186176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186176, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186176, "pid": 5, "tid": 7, "ts": 1716454224357272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297030, "dur": 5, "args": { "External id": 186176, "cbid": 211, "correlation": 186176 } }, { "ph": "s", "id": 186176, "pid": 76337, "tid": -914061504, "ts": 1716454224297030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224357278, "dur": 16, "args": { "External id": 186186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186186, "pid": 5, "tid": 7, "ts": 1716454224357278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297088, "dur": 12, "args": { "External id": 186186, "cbid": 211, "correlation": 186186 } }, { "ph": "s", "id": 186186, "pid": 76337, "tid": -914061504, "ts": 1716454224297088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224357295, "dur": 18, "args": { "External id": 186206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186206, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 186206, "pid": 5, "tid": 7, "ts": 1716454224357295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297154, "dur": 11, "args": { "External id": 186206, "cbid": 211, "correlation": 186206 } }, { "ph": "s", "id": 186206, "pid": 76337, "tid": -914061504, "ts": 1716454224297154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224357314, "dur": 4, "args": { "External id": 186218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186218, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 186218, "pid": 5, "tid": 7, "ts": 1716454224357314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297175, "dur": 6, "args": { "External id": 186218, "cbid": 211, "correlation": 186218 } }, { "ph": "s", "id": 186218, "pid": 76337, "tid": -914061504, "ts": 1716454224297175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224357320, "dur": 16, "args": { "External id": 186221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186221, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186221, "pid": 5, "tid": 7, "ts": 1716454224357320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297193, "dur": 7, "args": { "External id": 186221, "cbid": 211, "correlation": 186221 } }, { "ph": "s", "id": 186221, "pid": 76337, "tid": -914061504, "ts": 1716454224297193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224357338, "dur": 11, "args": { "External id": 186230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186230, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186230, "pid": 5, "tid": 7, "ts": 1716454224357338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297233, "dur": 10, "args": { "External id": 186230, "cbid": 211, "correlation": 186230 } }, { "ph": "s", "id": 186230, "pid": 76337, "tid": -914061504, "ts": 1716454224297233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224297295, "dur": 0, "args": { "External id": 186240, "cbid": 317, "correlation": 186240 } }, { "ph": "f", "id": 186240, "pid": 76337, "tid": -914061504, "ts": 1716454224297295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224297295, "dur": 0, "args": { "External id": 186241, "cbid": 203, "correlation": 186241 } }, { "ph": "f", "id": 186241, "pid": 76337, "tid": -914061504, "ts": 1716454224297295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224297296, "dur": 0, "args": { "External id": 186242, "cbid": 205, "correlation": 186242 } }, { "ph": "f", "id": 186242, "pid": 76337, "tid": -914061504, "ts": 1716454224297296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224357349, "dur": 11, "args": { "External id": 186246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186246, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186246, "pid": 5, "tid": 7, "ts": 1716454224357349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297310, "dur": 12, "args": { "External id": 186246, "cbid": 211, "correlation": 186246 } }, { "ph": "s", "id": 186246, "pid": 76337, "tid": -914061504, "ts": 1716454224297310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224357362, "dur": 161, "args": { "External id": 186248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186248, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186248, "pid": 5, "tid": 7, "ts": 1716454224357362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297325, "dur": 5, "args": { "External id": 186248, "cbid": 211, "correlation": 186248 } }, { "ph": "s", "id": 186248, "pid": 76337, "tid": -914061504, "ts": 1716454224297325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224357525, "dur": 1, "args": { "External id": 186250, "device": 5, "context": 1, "stream": 7, "correlation": 186250, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 186250, "pid": 5, "tid": 7, "ts": 1716454224357525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224297335, "dur": 6, "args": { "External id": 186250, "cbid": 51, "correlation": 186250 } }, { "ph": "s", "id": 186250, "pid": 76337, "tid": -914061504, "ts": 1716454224297335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224357529, "dur": 642, "args": { "External id": 186251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186251, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186251, "pid": 5, "tid": 7, "ts": 1716454224357529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297343, "dur": 6, "args": { "External id": 186251, "cbid": 211, "correlation": 186251 } }, { "ph": "s", "id": 186251, "pid": 76337, "tid": -914061504, "ts": 1716454224297343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224358172, "dur": 13, "args": { "External id": 186253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186253, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186253, "pid": 5, "tid": 7, "ts": 1716454224358172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297353, "dur": 5, "args": { "External id": 186253, "cbid": 211, "correlation": 186253 } }, { "ph": "s", "id": 186253, "pid": 76337, "tid": -914061504, "ts": 1716454224297353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224358186, "dur": 14, "args": { "External id": 186259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186259, "pid": 5, "tid": 7, "ts": 1716454224358186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297382, "dur": 8, "args": { "External id": 186259, "cbid": 211, "correlation": 186259 } }, { "ph": "s", "id": 186259, "pid": 76337, "tid": -914061504, "ts": 1716454224297382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224297441, "dur": 0, "args": { "External id": 186269, "cbid": 317, "correlation": 186269 } }, { "ph": "f", "id": 186269, "pid": 76337, "tid": -914061504, "ts": 1716454224297441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224297442, "dur": 0, "args": { "External id": 186270, "cbid": 203, "correlation": 186270 } }, { "ph": "f", "id": 186270, "pid": 76337, "tid": -914061504, "ts": 1716454224297442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224297442, "dur": 0, "args": { "External id": 186271, "cbid": 205, "correlation": 186271 } }, { "ph": "f", "id": 186271, "pid": 76337, "tid": -914061504, "ts": 1716454224297442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224358202, "dur": 21, "args": { "External id": 186275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186275, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186275, "pid": 5, "tid": 7, "ts": 1716454224358202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297454, "dur": 12, "args": { "External id": 186275, "cbid": 211, "correlation": 186275 } }, { "ph": "s", "id": 186275, "pid": 76337, "tid": -914061504, "ts": 1716454224297454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224358224, "dur": 4, "args": { "External id": 186277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186277, "pid": 5, "tid": 7, "ts": 1716454224358224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297470, "dur": 6, "args": { "External id": 186277, "cbid": 211, "correlation": 186277 } }, { "ph": "s", "id": 186277, "pid": 76337, "tid": -914061504, "ts": 1716454224297470, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224297479, "dur": 0, "args": { "External id": 186278, "cbid": 51, "correlation": 186278 } }, { "ph": "s", "id": 186278, "pid": 76337, "tid": -914061504, "ts": 1716454224297479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224358229, "dur": 169, "args": { "External id": 186279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186279, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186279, "pid": 5, "tid": 7, "ts": 1716454224358229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297479, "dur": 5, "args": { "External id": 186279, "cbid": 211, "correlation": 186279 } }, { "ph": "s", "id": 186279, "pid": 76337, "tid": -914061504, "ts": 1716454224297479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224358399, "dur": 16, "args": { "External id": 186284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186284, "pid": 5, "tid": 7, "ts": 1716454224358399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297504, "dur": 9, "args": { "External id": 186284, "cbid": 211, "correlation": 186284 } }, { "ph": "s", "id": 186284, "pid": 76337, "tid": -914061504, "ts": 1716454224297504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224358416, "dur": 12, "args": { "External id": 186292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186292, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186292, "pid": 5, "tid": 7, "ts": 1716454224358416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297533, "dur": 8, "args": { "External id": 186292, "cbid": 211, "correlation": 186292 } }, { "ph": "s", "id": 186292, "pid": 76337, "tid": -914061504, "ts": 1716454224297533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224358429, "dur": 10, "args": { "External id": 186300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186300, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186300, "pid": 5, "tid": 7, "ts": 1716454224358429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297561, "dur": 9, "args": { "External id": 186300, "cbid": 211, "correlation": 186300 } }, { "ph": "s", "id": 186300, "pid": 76337, "tid": -914061504, "ts": 1716454224297561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224358440, "dur": 18, "args": { "External id": 186320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186320, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 186320, "pid": 5, "tid": 7, "ts": 1716454224358440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297643, "dur": 12, "args": { "External id": 186320, "cbid": 211, "correlation": 186320 } }, { "ph": "s", "id": 186320, "pid": 76337, "tid": -914061504, "ts": 1716454224297643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224358460, "dur": 5, "args": { "External id": 186332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186332, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 186332, "pid": 5, "tid": 7, "ts": 1716454224358460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297665, "dur": 7, "args": { "External id": 186332, "cbid": 211, "correlation": 186332 } }, { "ph": "s", "id": 186332, "pid": 76337, "tid": -914061504, "ts": 1716454224297665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224358466, "dur": 17, "args": { "External id": 186335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186335, "pid": 5, "tid": 7, "ts": 1716454224358466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297683, "dur": 6, "args": { "External id": 186335, "cbid": 211, "correlation": 186335 } }, { "ph": "s", "id": 186335, "pid": 76337, "tid": -914061504, "ts": 1716454224297683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224297739, "dur": 0, "args": { "External id": 186346, "cbid": 317, "correlation": 186346 } }, { "ph": "f", "id": 186346, "pid": 76337, "tid": -914061504, "ts": 1716454224297739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224297740, "dur": 0, "args": { "External id": 186347, "cbid": 203, "correlation": 186347 } }, { "ph": "f", "id": 186347, "pid": 76337, "tid": -914061504, "ts": 1716454224297740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224297741, "dur": 0, "args": { "External id": 186348, "cbid": 205, "correlation": 186348 } }, { "ph": "f", "id": 186348, "pid": 76337, "tid": -914061504, "ts": 1716454224297741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224358484, "dur": 11, "args": { "External id": 186352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186352, "pid": 5, "tid": 7, "ts": 1716454224358484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297754, "dur": 12, "args": { "External id": 186352, "cbid": 211, "correlation": 186352 } }, { "ph": "s", "id": 186352, "pid": 76337, "tid": -914061504, "ts": 1716454224297754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224358496, "dur": 4, "args": { "External id": 186354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186354, "pid": 5, "tid": 7, "ts": 1716454224358496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297770, "dur": 6, "args": { "External id": 186354, "cbid": 211, "correlation": 186354 } }, { "ph": "s", "id": 186354, "pid": 76337, "tid": -914061504, "ts": 1716454224297770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224297779, "dur": 0, "args": { "External id": 186355, "cbid": 51, "correlation": 186355 } }, { "ph": "s", "id": 186355, "pid": 76337, "tid": -914061504, "ts": 1716454224297779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224358501, "dur": 89, "args": { "External id": 186356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186356, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186356, "pid": 5, "tid": 7, "ts": 1716454224358501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297779, "dur": 5, "args": { "External id": 186356, "cbid": 211, "correlation": 186356 } }, { "ph": "s", "id": 186356, "pid": 76337, "tid": -914061504, "ts": 1716454224297779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224358591, "dur": 15, "args": { "External id": 186361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186361, "pid": 5, "tid": 7, "ts": 1716454224358591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297805, "dur": 9, "args": { "External id": 186361, "cbid": 211, "correlation": 186361 } }, { "ph": "s", "id": 186361, "pid": 76337, "tid": -914061504, "ts": 1716454224297805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224358608, "dur": 83, "args": { "External id": 186370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186370, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186370, "pid": 5, "tid": 7, "ts": 1716454224358608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297887, "dur": 14, "args": { "External id": 186370, "cbid": 211, "correlation": 186370 } }, { "ph": "s", "id": 186370, "pid": 76337, "tid": -914061504, "ts": 1716454224297887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224358692, "dur": 30, "args": { "External id": 186392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186392, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186392, "pid": 5, "tid": 7, "ts": 1716454224358692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224297944, "dur": 10, "args": { "External id": 186392, "cbid": 211, "correlation": 186392 } }, { "ph": "s", "id": 186392, "pid": 76337, "tid": -914061504, "ts": 1716454224297944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298043, "dur": 1, "args": { "External id": 186403, "cbid": 251, "correlation": 186403 } }, { "ph": "f", "id": 186403, "pid": 76337, "tid": -914061504, "ts": 1716454224298043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224358723, "dur": 162, "args": { "External id": 186404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186404, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186404, "pid": 5, "tid": 7, "ts": 1716454224358723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298049, "dur": 14, "args": { "External id": 186404, "cbid": 211, "correlation": 186404 } }, { "ph": "s", "id": 186404, "pid": 76337, "tid": -914061504, "ts": 1716454224298049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298120, "dur": 1, "args": { "External id": 186415, "cbid": 251, "correlation": 186415 } }, { "ph": "f", "id": 186415, "pid": 76337, "tid": -914061504, "ts": 1716454224298120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224358886, "dur": 158, "args": { "External id": 186416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186416, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186416, "pid": 5, "tid": 7, "ts": 1716454224358886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298125, "dur": 11, "args": { "External id": 186416, "cbid": 211, "correlation": 186416 } }, { "ph": "s", "id": 186416, "pid": 76337, "tid": -914061504, "ts": 1716454224298125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298190, "dur": 1, "args": { "External id": 186427, "cbid": 251, "correlation": 186427 } }, { "ph": "f", "id": 186427, "pid": 76337, "tid": -914061504, "ts": 1716454224298190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224359046, "dur": 157, "args": { "External id": 186428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186428, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186428, "pid": 5, "tid": 7, "ts": 1716454224359046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298194, "dur": 11, "args": { "External id": 186428, "cbid": 211, "correlation": 186428 } }, { "ph": "s", "id": 186428, "pid": 76337, "tid": -914061504, "ts": 1716454224298194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224359204, "dur": 332, "args": { "External id": 186453, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186453, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186453, "pid": 5, "tid": 7, "ts": 1716454224359204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298278, "dur": 13, "args": { "External id": 186453, "cbid": 211, "correlation": 186453 } }, { "ph": "s", "id": 186453, "pid": 76337, "tid": -914061504, "ts": 1716454224298278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298377, "dur": 1, "args": { "External id": 186471, "cbid": 251, "correlation": 186471 } }, { "ph": "f", "id": 186471, "pid": 76337, "tid": -914061504, "ts": 1716454224298377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224359538, "dur": 163, "args": { "External id": 186473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186473, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186473, "pid": 5, "tid": 7, "ts": 1716454224359538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298383, "dur": 13, "args": { "External id": 186473, "cbid": 211, "correlation": 186473 } }, { "ph": "s", "id": 186473, "pid": 76337, "tid": -914061504, "ts": 1716454224298383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224359703, "dur": 19, "args": { "External id": 186481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186481, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186481, "pid": 5, "tid": 7, "ts": 1716454224359703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298454, "dur": 12, "args": { "External id": 186481, "cbid": 211, "correlation": 186481 } }, { "ph": "s", "id": 186481, "pid": 76337, "tid": -914061504, "ts": 1716454224298454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224359723, "dur": 27, "args": { "External id": 186489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186489, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186489, "pid": 5, "tid": 7, "ts": 1716454224359723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298493, "dur": 8, "args": { "External id": 186489, "cbid": 211, "correlation": 186489 } }, { "ph": "s", "id": 186489, "pid": 76337, "tid": -914061504, "ts": 1716454224298493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224359752, "dur": 19, "args": { "External id": 186500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186500, "pid": 5, "tid": 7, "ts": 1716454224359752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298565, "dur": 12, "args": { "External id": 186500, "cbid": 211, "correlation": 186500 } }, { "ph": "s", "id": 186500, "pid": 76337, "tid": -914061504, "ts": 1716454224298565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224359772, "dur": 16, "args": { "External id": 186522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186522, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186522, "pid": 5, "tid": 7, "ts": 1716454224359772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298596, "dur": 8, "args": { "External id": 186522, "cbid": 211, "correlation": 186522 } }, { "ph": "s", "id": 186522, "pid": 76337, "tid": -914061504, "ts": 1716454224298596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298681, "dur": 1, "args": { "External id": 186533, "cbid": 251, "correlation": 186533 } }, { "ph": "f", "id": 186533, "pid": 76337, "tid": -914061504, "ts": 1716454224298681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224359790, "dur": 87, "args": { "External id": 186534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186534, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186534, "pid": 5, "tid": 7, "ts": 1716454224359790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298687, "dur": 13, "args": { "External id": 186534, "cbid": 211, "correlation": 186534 } }, { "ph": "s", "id": 186534, "pid": 76337, "tid": -914061504, "ts": 1716454224298687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298756, "dur": 1, "args": { "External id": 186545, "cbid": 251, "correlation": 186545 } }, { "ph": "f", "id": 186545, "pid": 76337, "tid": -914061504, "ts": 1716454224298756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298760, "dur": 0, "args": { "External id": 186546, "cbid": 251, "correlation": 186546 } }, { "ph": "f", "id": 186546, "pid": 76337, "tid": -914061504, "ts": 1716454224298760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224359878, "dur": 12, "args": { "External id": 186547, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186547, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186547, "pid": 5, "tid": 7, "ts": 1716454224359878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298761, "dur": 12, "args": { "External id": 186547, "cbid": 211, "correlation": 186547 } }, { "ph": "s", "id": 186547, "pid": 76337, "tid": -914061504, "ts": 1716454224298761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224359892, "dur": 5, "args": { "External id": 186549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186549, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186549, "pid": 5, "tid": 7, "ts": 1716454224359892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298775, "dur": 6, "args": { "External id": 186549, "cbid": 211, "correlation": 186549 } }, { "ph": "s", "id": 186549, "pid": 76337, "tid": -914061504, "ts": 1716454224298775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298833, "dur": 1, "args": { "External id": 186560, "cbid": 251, "correlation": 186560 } }, { "ph": "f", "id": 186560, "pid": 76337, "tid": -914061504, "ts": 1716454224298833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224298836, "dur": 0, "args": { "External id": 186561, "cbid": 251, "correlation": 186561 } }, { "ph": "f", "id": 186561, "pid": 76337, "tid": -914061504, "ts": 1716454224298836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224359899, "dur": 8, "args": { "External id": 186562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186562, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186562, "pid": 5, "tid": 7, "ts": 1716454224359899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298838, "dur": 12, "args": { "External id": 186562, "cbid": 211, "correlation": 186562 } }, { "ph": "s", "id": 186562, "pid": 76337, "tid": -914061504, "ts": 1716454224298838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224359908, "dur": 3, "args": { "External id": 186564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186564, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186564, "pid": 5, "tid": 7, "ts": 1716454224359908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298851, "dur": 5, "args": { "External id": 186564, "cbid": 211, "correlation": 186564 } }, { "ph": "s", "id": 186564, "pid": 76337, "tid": -914061504, "ts": 1716454224298851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224359913, "dur": 54, "args": { "External id": 186589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186589, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186589, "pid": 5, "tid": 7, "ts": 1716454224359913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224298927, "dur": 13, "args": { "External id": 186589, "cbid": 211, "correlation": 186589 } }, { "ph": "s", "id": 186589, "pid": 76337, "tid": -914061504, "ts": 1716454224298927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224299036, "dur": 1, "args": { "External id": 186607, "cbid": 251, "correlation": 186607 } }, { "ph": "f", "id": 186607, "pid": 76337, "tid": -914061504, "ts": 1716454224299036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224359968, "dur": 89, "args": { "External id": 186609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186609, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186609, "pid": 5, "tid": 7, "ts": 1716454224359968, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299042, "dur": 14, "args": { "External id": 186609, "cbid": 211, "correlation": 186609 } }, { "ph": "s", "id": 186609, "pid": 76337, "tid": -914061504, "ts": 1716454224299042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224360059, "dur": 10, "args": { "External id": 186617, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186617, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186617, "pid": 5, "tid": 7, "ts": 1716454224360059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299112, "dur": 12, "args": { "External id": 186617, "cbid": 211, "correlation": 186617 } }, { "ph": "s", "id": 186617, "pid": 76337, "tid": -914061504, "ts": 1716454224299112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224360070, "dur": 21, "args": { "External id": 186625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186625, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186625, "pid": 5, "tid": 7, "ts": 1716454224360070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299153, "dur": 9, "args": { "External id": 186625, "cbid": 211, "correlation": 186625 } }, { "ph": "s", "id": 186625, "pid": 76337, "tid": -914061504, "ts": 1716454224299153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224360092, "dur": 17, "args": { "External id": 186647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186647, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186647, "pid": 5, "tid": 7, "ts": 1716454224360092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299204, "dur": 10, "args": { "External id": 186647, "cbid": 211, "correlation": 186647 } }, { "ph": "s", "id": 186647, "pid": 76337, "tid": -914061504, "ts": 1716454224299204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224299290, "dur": 1, "args": { "External id": 186663, "cbid": 251, "correlation": 186663 } }, { "ph": "f", "id": 186663, "pid": 76337, "tid": -914061504, "ts": 1716454224299290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224299296, "dur": 0, "args": { "External id": 186665, "cbid": 251, "correlation": 186665 } }, { "ph": "f", "id": 186665, "pid": 76337, "tid": -914061504, "ts": 1716454224299296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224360110, "dur": 494, "args": { "External id": 186666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186666, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186666, "pid": 5, "tid": 7, "ts": 1716454224360110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299297, "dur": 13, "args": { "External id": 186666, "cbid": 211, "correlation": 186666 } }, { "ph": "s", "id": 186666, "pid": 76337, "tid": -914061504, "ts": 1716454224299297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224360605, "dur": 66, "args": { "External id": 186674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186674, "pid": 5, "tid": 7, "ts": 1716454224360605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299363, "dur": 12, "args": { "External id": 186674, "cbid": 211, "correlation": 186674 } }, { "ph": "s", "id": 186674, "pid": 76337, "tid": -914061504, "ts": 1716454224299363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224360673, "dur": 66, "args": { "External id": 186682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186682, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186682, "pid": 5, "tid": 7, "ts": 1716454224360673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299394, "dur": 8, "args": { "External id": 186682, "cbid": 211, "correlation": 186682 } }, { "ph": "s", "id": 186682, "pid": 76337, "tid": -914061504, "ts": 1716454224299394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224299472, "dur": 1, "args": { "External id": 186698, "cbid": 251, "correlation": 186698 } }, { "ph": "f", "id": 186698, "pid": 76337, "tid": -914061504, "ts": 1716454224299472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224360740, "dur": 1, "args": { "External id": 186700, "device": 5, "context": 1, "stream": 7, "correlation": 186700, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 186700, "pid": 5, "tid": 7, "ts": 1716454224360740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224299477, "dur": 9, "args": { "External id": 186700, "cbid": 51, "correlation": 186700 } }, { "ph": "s", "id": 186700, "pid": 76337, "tid": -914061504, "ts": 1716454224299477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224360744, "dur": 265, "args": { "External id": 186701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186701, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186701, "pid": 5, "tid": 7, "ts": 1716454224360744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299487, "dur": 11, "args": { "External id": 186701, "cbid": 211, "correlation": 186701 } }, { "ph": "s", "id": 186701, "pid": 76337, "tid": -914061504, "ts": 1716454224299487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224361010, "dur": 14, "args": { "External id": 186709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186709, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186709, "pid": 5, "tid": 7, "ts": 1716454224361010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299529, "dur": 11, "args": { "External id": 186709, "cbid": 211, "correlation": 186709 } }, { "ph": "s", "id": 186709, "pid": 76337, "tid": -914061504, "ts": 1716454224299529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224361025, "dur": 38, "args": { "External id": 186720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186720, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186720, "pid": 5, "tid": 7, "ts": 1716454224361025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299598, "dur": 12, "args": { "External id": 186720, "cbid": 211, "correlation": 186720 } }, { "ph": "s", "id": 186720, "pid": 76337, "tid": -914061504, "ts": 1716454224299598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224299662, "dur": 0, "args": { "External id": 186732, "cbid": 317, "correlation": 186732 } }, { "ph": "f", "id": 186732, "pid": 76337, "tid": -914061504, "ts": 1716454224299662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224299663, "dur": 0, "args": { "External id": 186733, "cbid": 203, "correlation": 186733 } }, { "ph": "f", "id": 186733, "pid": 76337, "tid": -914061504, "ts": 1716454224299663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224299664, "dur": 0, "args": { "External id": 186734, "cbid": 205, "correlation": 186734 } }, { "ph": "f", "id": 186734, "pid": 76337, "tid": -914061504, "ts": 1716454224299664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224361065, "dur": 14, "args": { "External id": 186738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186738, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186738, "pid": 5, "tid": 7, "ts": 1716454224361065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299679, "dur": 12, "args": { "External id": 186738, "cbid": 211, "correlation": 186738 } }, { "ph": "s", "id": 186738, "pid": 76337, "tid": -914061504, "ts": 1716454224299679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224361080, "dur": 4, "args": { "External id": 186740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186740, "pid": 5, "tid": 7, "ts": 1716454224361080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299696, "dur": 6, "args": { "External id": 186740, "cbid": 211, "correlation": 186740 } }, { "ph": "s", "id": 186740, "pid": 76337, "tid": -914061504, "ts": 1716454224299696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224299704, "dur": 0, "args": { "External id": 186741, "cbid": 51, "correlation": 186741 } }, { "ph": "s", "id": 186741, "pid": 76337, "tid": -914061504, "ts": 1716454224299704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224361085, "dur": 94, "args": { "External id": 186742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186742, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186742, "pid": 5, "tid": 7, "ts": 1716454224361085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299705, "dur": 5, "args": { "External id": 186742, "cbid": 211, "correlation": 186742 } }, { "ph": "s", "id": 186742, "pid": 76337, "tid": -914061504, "ts": 1716454224299705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224361181, "dur": 16, "args": { "External id": 186747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186747, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186747, "pid": 5, "tid": 7, "ts": 1716454224361181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299732, "dur": 9, "args": { "External id": 186747, "cbid": 211, "correlation": 186747 } }, { "ph": "s", "id": 186747, "pid": 76337, "tid": -914061504, "ts": 1716454224299732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224361198, "dur": 13, "args": { "External id": 186755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186755, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186755, "pid": 5, "tid": 7, "ts": 1716454224361198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299764, "dur": 8, "args": { "External id": 186755, "cbid": 211, "correlation": 186755 } }, { "ph": "s", "id": 186755, "pid": 76337, "tid": -914061504, "ts": 1716454224299764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224361213, "dur": 25, "args": { "External id": 186764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186764, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186764, "pid": 5, "tid": 7, "ts": 1716454224361213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299802, "dur": 11, "args": { "External id": 186764, "cbid": 211, "correlation": 186764 } }, { "ph": "s", "id": 186764, "pid": 76337, "tid": -914061504, "ts": 1716454224299802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224361239, "dur": 24, "args": { "External id": 186784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186784, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 186784, "pid": 5, "tid": 7, "ts": 1716454224361239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299873, "dur": 11, "args": { "External id": 186784, "cbid": 211, "correlation": 186784 } }, { "ph": "s", "id": 186784, "pid": 76337, "tid": -914061504, "ts": 1716454224299873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224361264, "dur": 5, "args": { "External id": 186796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186796, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186796, "pid": 5, "tid": 7, "ts": 1716454224361264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299895, "dur": 7, "args": { "External id": 186796, "cbid": 211, "correlation": 186796 } }, { "ph": "s", "id": 186796, "pid": 76337, "tid": -914061504, "ts": 1716454224299895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224361270, "dur": 25, "args": { "External id": 186799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186799, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186799, "pid": 5, "tid": 7, "ts": 1716454224361270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299913, "dur": 6, "args": { "External id": 186799, "cbid": 211, "correlation": 186799 } }, { "ph": "s", "id": 186799, "pid": 76337, "tid": -914061504, "ts": 1716454224299913, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224361296, "dur": 17, "args": { "External id": 186808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186808, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186808, "pid": 5, "tid": 7, "ts": 1716454224361296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224299953, "dur": 9, "args": { "External id": 186808, "cbid": 211, "correlation": 186808 } }, { "ph": "s", "id": 186808, "pid": 76337, "tid": -914061504, "ts": 1716454224299953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224300017, "dur": 0, "args": { "External id": 186818, "cbid": 317, "correlation": 186818 } }, { "ph": "f", "id": 186818, "pid": 76337, "tid": -914061504, "ts": 1716454224300017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224300018, "dur": 0, "args": { "External id": 186819, "cbid": 203, "correlation": 186819 } }, { "ph": "f", "id": 186819, "pid": 76337, "tid": -914061504, "ts": 1716454224300018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224300018, "dur": 0, "args": { "External id": 186820, "cbid": 205, "correlation": 186820 } }, { "ph": "f", "id": 186820, "pid": 76337, "tid": -914061504, "ts": 1716454224300018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224361315, "dur": 17, "args": { "External id": 186824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186824, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186824, "pid": 5, "tid": 7, "ts": 1716454224361315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300034, "dur": 12, "args": { "External id": 186824, "cbid": 211, "correlation": 186824 } }, { "ph": "s", "id": 186824, "pid": 76337, "tid": -914061504, "ts": 1716454224300034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224361333, "dur": 238, "args": { "External id": 186826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186826, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186826, "pid": 5, "tid": 7, "ts": 1716454224361333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300049, "dur": 5, "args": { "External id": 186826, "cbid": 211, "correlation": 186826 } }, { "ph": "s", "id": 186826, "pid": 76337, "tid": -914061504, "ts": 1716454224300049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224361573, "dur": 1, "args": { "External id": 186828, "device": 5, "context": 1, "stream": 7, "correlation": 186828, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 186828, "pid": 5, "tid": 7, "ts": 1716454224361573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224300060, "dur": 8, "args": { "External id": 186828, "cbid": 51, "correlation": 186828 } }, { "ph": "s", "id": 186828, "pid": 76337, "tid": -914061504, "ts": 1716454224300060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224361577, "dur": 806, "args": { "External id": 186829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186829, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186829, "pid": 5, "tid": 7, "ts": 1716454224361577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300069, "dur": 6, "args": { "External id": 186829, "cbid": 211, "correlation": 186829 } }, { "ph": "s", "id": 186829, "pid": 76337, "tid": -914061504, "ts": 1716454224300069, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224362384, "dur": 13, "args": { "External id": 186831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186831, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186831, "pid": 5, "tid": 7, "ts": 1716454224362384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300080, "dur": 5, "args": { "External id": 186831, "cbid": 211, "correlation": 186831 } }, { "ph": "s", "id": 186831, "pid": 76337, "tid": -914061504, "ts": 1716454224300080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224362399, "dur": 14, "args": { "External id": 186837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186837, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186837, "pid": 5, "tid": 7, "ts": 1716454224362399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300108, "dur": 8, "args": { "External id": 186837, "cbid": 211, "correlation": 186837 } }, { "ph": "s", "id": 186837, "pid": 76337, "tid": -914061504, "ts": 1716454224300108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224362414, "dur": 3, "args": { "External id": 186845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186845, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 186845, "pid": 5, "tid": 7, "ts": 1716454224362414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300153, "dur": 9, "args": { "External id": 186845, "cbid": 211, "correlation": 186845 } }, { "ph": "s", "id": 186845, "pid": 76337, "tid": -914061504, "ts": 1716454224300153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224300218, "dur": 1, "args": { "External id": 186861, "cbid": 251, "correlation": 186861 } }, { "ph": "f", "id": 186861, "pid": 76337, "tid": -914061504, "ts": 1716454224300218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224300224, "dur": 0, "args": { "External id": 186863, "cbid": 251, "correlation": 186863 } }, { "ph": "f", "id": 186863, "pid": 76337, "tid": -914061504, "ts": 1716454224300224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224362419, "dur": 14, "args": { "External id": 186864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186864, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186864, "pid": 5, "tid": 7, "ts": 1716454224362419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300225, "dur": 11, "args": { "External id": 186864, "cbid": 211, "correlation": 186864 } }, { "ph": "s", "id": 186864, "pid": 76337, "tid": -914061504, "ts": 1716454224300225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224362434, "dur": 5, "args": { "External id": 186866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186866, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186866, "pid": 5, "tid": 7, "ts": 1716454224362434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300238, "dur": 6, "args": { "External id": 186866, "cbid": 211, "correlation": 186866 } }, { "ph": "s", "id": 186866, "pid": 76337, "tid": -914061504, "ts": 1716454224300238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224362441, "dur": 16, "args": { "External id": 186876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186876, "pid": 5, "tid": 7, "ts": 1716454224362441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300297, "dur": 12, "args": { "External id": 186876, "cbid": 211, "correlation": 186876 } }, { "ph": "s", "id": 186876, "pid": 76337, "tid": -914061504, "ts": 1716454224300297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224362458, "dur": 19, "args": { "External id": 186896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186896, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 186896, "pid": 5, "tid": 7, "ts": 1716454224362458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300364, "dur": 10, "args": { "External id": 186896, "cbid": 211, "correlation": 186896 } }, { "ph": "s", "id": 186896, "pid": 76337, "tid": -914061504, "ts": 1716454224300364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224362478, "dur": 4, "args": { "External id": 186908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186908, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 186908, "pid": 5, "tid": 7, "ts": 1716454224362478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300384, "dur": 7, "args": { "External id": 186908, "cbid": 211, "correlation": 186908 } }, { "ph": "s", "id": 186908, "pid": 76337, "tid": -914061504, "ts": 1716454224300384, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224362484, "dur": 16, "args": { "External id": 186911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186911, "pid": 5, "tid": 7, "ts": 1716454224362484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300403, "dur": 7, "args": { "External id": 186911, "cbid": 211, "correlation": 186911 } }, { "ph": "s", "id": 186911, "pid": 76337, "tid": -914061504, "ts": 1716454224300403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224362501, "dur": 10, "args": { "External id": 186920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186920, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186920, "pid": 5, "tid": 7, "ts": 1716454224362501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300444, "dur": 10, "args": { "External id": 186920, "cbid": 211, "correlation": 186920 } }, { "ph": "s", "id": 186920, "pid": 76337, "tid": -914061504, "ts": 1716454224300444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224300506, "dur": 0, "args": { "External id": 186930, "cbid": 317, "correlation": 186930 } }, { "ph": "f", "id": 186930, "pid": 76337, "tid": -914061504, "ts": 1716454224300506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224300507, "dur": 0, "args": { "External id": 186931, "cbid": 203, "correlation": 186931 } }, { "ph": "f", "id": 186931, "pid": 76337, "tid": -914061504, "ts": 1716454224300507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224300508, "dur": 0, "args": { "External id": 186932, "cbid": 205, "correlation": 186932 } }, { "ph": "f", "id": 186932, "pid": 76337, "tid": -914061504, "ts": 1716454224300508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224362513, "dur": 12, "args": { "External id": 186936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186936, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186936, "pid": 5, "tid": 7, "ts": 1716454224362513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300521, "dur": 11, "args": { "External id": 186936, "cbid": 211, "correlation": 186936 } }, { "ph": "s", "id": 186936, "pid": 76337, "tid": -914061504, "ts": 1716454224300521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224362525, "dur": 159, "args": { "External id": 186938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186938, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186938, "pid": 5, "tid": 7, "ts": 1716454224362525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300535, "dur": 5, "args": { "External id": 186938, "cbid": 211, "correlation": 186938 } }, { "ph": "s", "id": 186938, "pid": 76337, "tid": -914061504, "ts": 1716454224300535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224362687, "dur": 1, "args": { "External id": 186940, "device": 5, "context": 1, "stream": 7, "correlation": 186940, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 186940, "pid": 5, "tid": 7, "ts": 1716454224362687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224300546, "dur": 7, "args": { "External id": 186940, "cbid": 51, "correlation": 186940 } }, { "ph": "s", "id": 186940, "pid": 76337, "tid": -914061504, "ts": 1716454224300546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224362691, "dur": 641, "args": { "External id": 186941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186941, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 186941, "pid": 5, "tid": 7, "ts": 1716454224362691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300554, "dur": 6, "args": { "External id": 186941, "cbid": 211, "correlation": 186941 } }, { "ph": "s", "id": 186941, "pid": 76337, "tid": -914061504, "ts": 1716454224300554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224363333, "dur": 12, "args": { "External id": 186943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186943, "pid": 5, "tid": 7, "ts": 1716454224363333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300565, "dur": 5, "args": { "External id": 186943, "cbid": 211, "correlation": 186943 } }, { "ph": "s", "id": 186943, "pid": 76337, "tid": -914061504, "ts": 1716454224300565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224363347, "dur": 14, "args": { "External id": 186949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186949, "pid": 5, "tid": 7, "ts": 1716454224363347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300593, "dur": 10, "args": { "External id": 186949, "cbid": 211, "correlation": 186949 } }, { "ph": "s", "id": 186949, "pid": 76337, "tid": -914061504, "ts": 1716454224300593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224300652, "dur": 0, "args": { "External id": 186959, "cbid": 317, "correlation": 186959 } }, { "ph": "f", "id": 186959, "pid": 76337, "tid": -914061504, "ts": 1716454224300652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224300653, "dur": 0, "args": { "External id": 186960, "cbid": 203, "correlation": 186960 } }, { "ph": "f", "id": 186960, "pid": 76337, "tid": -914061504, "ts": 1716454224300653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224300654, "dur": 0, "args": { "External id": 186961, "cbid": 205, "correlation": 186961 } }, { "ph": "f", "id": 186961, "pid": 76337, "tid": -914061504, "ts": 1716454224300654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224363362, "dur": 17, "args": { "External id": 186965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186965, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186965, "pid": 5, "tid": 7, "ts": 1716454224363362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300668, "dur": 12, "args": { "External id": 186965, "cbid": 211, "correlation": 186965 } }, { "ph": "s", "id": 186965, "pid": 76337, "tid": -914061504, "ts": 1716454224300668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224363380, "dur": 4, "args": { "External id": 186967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 186967, "pid": 5, "tid": 7, "ts": 1716454224363380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300685, "dur": 6, "args": { "External id": 186967, "cbid": 211, "correlation": 186967 } }, { "ph": "s", "id": 186967, "pid": 76337, "tid": -914061504, "ts": 1716454224300685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224300693, "dur": 0, "args": { "External id": 186968, "cbid": 51, "correlation": 186968 } }, { "ph": "s", "id": 186968, "pid": 76337, "tid": -914061504, "ts": 1716454224300693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224363385, "dur": 128, "args": { "External id": 186969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186969, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 186969, "pid": 5, "tid": 7, "ts": 1716454224363385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300694, "dur": 5, "args": { "External id": 186969, "cbid": 211, "correlation": 186969 } }, { "ph": "s", "id": 186969, "pid": 76337, "tid": -914061504, "ts": 1716454224300694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224363515, "dur": 15, "args": { "External id": 186974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186974, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186974, "pid": 5, "tid": 7, "ts": 1716454224363515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300720, "dur": 8, "args": { "External id": 186974, "cbid": 211, "correlation": 186974 } }, { "ph": "s", "id": 186974, "pid": 76337, "tid": -914061504, "ts": 1716454224300720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224363532, "dur": 13, "args": { "External id": 186982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186982, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186982, "pid": 5, "tid": 7, "ts": 1716454224363532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300749, "dur": 8, "args": { "External id": 186982, "cbid": 211, "correlation": 186982 } }, { "ph": "s", "id": 186982, "pid": 76337, "tid": -914061504, "ts": 1716454224300749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224363546, "dur": 10, "args": { "External id": 186990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 186990, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 186990, "pid": 5, "tid": 7, "ts": 1716454224363546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300778, "dur": 8, "args": { "External id": 186990, "cbid": 211, "correlation": 186990 } }, { "ph": "s", "id": 186990, "pid": 76337, "tid": -914061504, "ts": 1716454224300778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224363557, "dur": 18, "args": { "External id": 187010, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187010, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 187010, "pid": 5, "tid": 7, "ts": 1716454224363557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300865, "dur": 12, "args": { "External id": 187010, "cbid": 211, "correlation": 187010 } }, { "ph": "s", "id": 187010, "pid": 76337, "tid": -914061504, "ts": 1716454224300865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224363577, "dur": 4, "args": { "External id": 187022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187022, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 187022, "pid": 5, "tid": 7, "ts": 1716454224363577, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300887, "dur": 6, "args": { "External id": 187022, "cbid": 211, "correlation": 187022 } }, { "ph": "s", "id": 187022, "pid": 76337, "tid": -914061504, "ts": 1716454224300887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224363583, "dur": 17, "args": { "External id": 187025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187025, "pid": 5, "tid": 7, "ts": 1716454224363583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300905, "dur": 7, "args": { "External id": 187025, "cbid": 211, "correlation": 187025 } }, { "ph": "s", "id": 187025, "pid": 76337, "tid": -914061504, "ts": 1716454224300905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224300962, "dur": 0, "args": { "External id": 187036, "cbid": 317, "correlation": 187036 } }, { "ph": "f", "id": 187036, "pid": 76337, "tid": -914061504, "ts": 1716454224300962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224300963, "dur": 0, "args": { "External id": 187037, "cbid": 203, "correlation": 187037 } }, { "ph": "f", "id": 187037, "pid": 76337, "tid": -914061504, "ts": 1716454224300963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224300964, "dur": 0, "args": { "External id": 187038, "cbid": 205, "correlation": 187038 } }, { "ph": "f", "id": 187038, "pid": 76337, "tid": -914061504, "ts": 1716454224300964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224363601, "dur": 11, "args": { "External id": 187042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187042, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187042, "pid": 5, "tid": 7, "ts": 1716454224363601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224300985, "dur": 12, "args": { "External id": 187042, "cbid": 211, "correlation": 187042 } }, { "ph": "s", "id": 187042, "pid": 76337, "tid": -914061504, "ts": 1716454224300985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224363613, "dur": 3, "args": { "External id": 187044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187044, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187044, "pid": 5, "tid": 7, "ts": 1716454224363613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301002, "dur": 6, "args": { "External id": 187044, "cbid": 211, "correlation": 187044 } }, { "ph": "s", "id": 187044, "pid": 76337, "tid": -914061504, "ts": 1716454224301002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224301010, "dur": 0, "args": { "External id": 187045, "cbid": 51, "correlation": 187045 } }, { "ph": "s", "id": 187045, "pid": 76337, "tid": -914061504, "ts": 1716454224301010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224363618, "dur": 89, "args": { "External id": 187046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187046, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 187046, "pid": 5, "tid": 7, "ts": 1716454224363618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301011, "dur": 6, "args": { "External id": 187046, "cbid": 211, "correlation": 187046 } }, { "ph": "s", "id": 187046, "pid": 76337, "tid": -914061504, "ts": 1716454224301011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224363708, "dur": 15, "args": { "External id": 187051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187051, "pid": 5, "tid": 7, "ts": 1716454224363708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301039, "dur": 8, "args": { "External id": 187051, "cbid": 211, "correlation": 187051 } }, { "ph": "s", "id": 187051, "pid": 76337, "tid": -914061504, "ts": 1716454224301039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224363724, "dur": 83, "args": { "External id": 187060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187060, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187060, "pid": 5, "tid": 7, "ts": 1716454224363724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301124, "dur": 14, "args": { "External id": 187060, "cbid": 211, "correlation": 187060 } }, { "ph": "s", "id": 187060, "pid": 76337, "tid": -914061504, "ts": 1716454224301124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224363808, "dur": 29, "args": { "External id": 187082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187082, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187082, "pid": 5, "tid": 7, "ts": 1716454224363808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301183, "dur": 10, "args": { "External id": 187082, "cbid": 211, "correlation": 187082 } }, { "ph": "s", "id": 187082, "pid": 76337, "tid": -914061504, "ts": 1716454224301183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224301281, "dur": 1, "args": { "External id": 187093, "cbid": 251, "correlation": 187093 } }, { "ph": "f", "id": 187093, "pid": 76337, "tid": -914061504, "ts": 1716454224301281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224363839, "dur": 162, "args": { "External id": 187094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187094, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187094, "pid": 5, "tid": 7, "ts": 1716454224363839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301287, "dur": 13, "args": { "External id": 187094, "cbid": 211, "correlation": 187094 } }, { "ph": "s", "id": 187094, "pid": 76337, "tid": -914061504, "ts": 1716454224301287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224301358, "dur": 1, "args": { "External id": 187105, "cbid": 251, "correlation": 187105 } }, { "ph": "f", "id": 187105, "pid": 76337, "tid": -914061504, "ts": 1716454224301358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224364003, "dur": 156, "args": { "External id": 187106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187106, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187106, "pid": 5, "tid": 7, "ts": 1716454224364003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301362, "dur": 11, "args": { "External id": 187106, "cbid": 211, "correlation": 187106 } }, { "ph": "s", "id": 187106, "pid": 76337, "tid": -914061504, "ts": 1716454224301362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224301426, "dur": 1, "args": { "External id": 187117, "cbid": 251, "correlation": 187117 } }, { "ph": "f", "id": 187117, "pid": 76337, "tid": -914061504, "ts": 1716454224301426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224364159, "dur": 158, "args": { "External id": 187118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187118, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187118, "pid": 5, "tid": 7, "ts": 1716454224364159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301430, "dur": 11, "args": { "External id": 187118, "cbid": 211, "correlation": 187118 } }, { "ph": "s", "id": 187118, "pid": 76337, "tid": -914061504, "ts": 1716454224301430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224364318, "dur": 331, "args": { "External id": 187143, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187143, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187143, "pid": 5, "tid": 7, "ts": 1716454224364318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301518, "dur": 13, "args": { "External id": 187143, "cbid": 211, "correlation": 187143 } }, { "ph": "s", "id": 187143, "pid": 76337, "tid": -914061504, "ts": 1716454224301518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224301623, "dur": 1, "args": { "External id": 187161, "cbid": 251, "correlation": 187161 } }, { "ph": "f", "id": 187161, "pid": 76337, "tid": -914061504, "ts": 1716454224301623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224364650, "dur": 165, "args": { "External id": 187163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187163, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187163, "pid": 5, "tid": 7, "ts": 1716454224364650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301629, "dur": 13, "args": { "External id": 187163, "cbid": 211, "correlation": 187163 } }, { "ph": "s", "id": 187163, "pid": 76337, "tid": -914061504, "ts": 1716454224301629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224364817, "dur": 19, "args": { "External id": 187171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187171, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187171, "pid": 5, "tid": 7, "ts": 1716454224364817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301699, "dur": 13, "args": { "External id": 187171, "cbid": 211, "correlation": 187171 } }, { "ph": "s", "id": 187171, "pid": 76337, "tid": -914061504, "ts": 1716454224301699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224364837, "dur": 27, "args": { "External id": 187179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187179, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187179, "pid": 5, "tid": 7, "ts": 1716454224364837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301739, "dur": 8, "args": { "External id": 187179, "cbid": 211, "correlation": 187179 } }, { "ph": "s", "id": 187179, "pid": 76337, "tid": -914061504, "ts": 1716454224301739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224364866, "dur": 18, "args": { "External id": 187190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187190, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187190, "pid": 5, "tid": 7, "ts": 1716454224364866, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301814, "dur": 13, "args": { "External id": 187190, "cbid": 211, "correlation": 187190 } }, { "ph": "s", "id": 187190, "pid": 76337, "tid": -914061504, "ts": 1716454224301814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224364885, "dur": 16, "args": { "External id": 187212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187212, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187212, "pid": 5, "tid": 7, "ts": 1716454224364885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301845, "dur": 7, "args": { "External id": 187212, "cbid": 211, "correlation": 187212 } }, { "ph": "s", "id": 187212, "pid": 76337, "tid": -914061504, "ts": 1716454224301845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224301930, "dur": 1, "args": { "External id": 187223, "cbid": 251, "correlation": 187223 } }, { "ph": "f", "id": 187223, "pid": 76337, "tid": -914061504, "ts": 1716454224301930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224364902, "dur": 88, "args": { "External id": 187224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187224, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187224, "pid": 5, "tid": 7, "ts": 1716454224364902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224301935, "dur": 13, "args": { "External id": 187224, "cbid": 211, "correlation": 187224 } }, { "ph": "s", "id": 187224, "pid": 76337, "tid": -914061504, "ts": 1716454224301935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302017, "dur": 1, "args": { "External id": 187235, "cbid": 251, "correlation": 187235 } }, { "ph": "f", "id": 187235, "pid": 76337, "tid": -914061504, "ts": 1716454224302017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302021, "dur": 0, "args": { "External id": 187236, "cbid": 251, "correlation": 187236 } }, { "ph": "f", "id": 187236, "pid": 76337, "tid": -914061504, "ts": 1716454224302021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224364991, "dur": 12, "args": { "External id": 187237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187237, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187237, "pid": 5, "tid": 7, "ts": 1716454224364991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302023, "dur": 13, "args": { "External id": 187237, "cbid": 211, "correlation": 187237 } }, { "ph": "s", "id": 187237, "pid": 76337, "tid": -914061504, "ts": 1716454224302023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224365005, "dur": 6, "args": { "External id": 187239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187239, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187239, "pid": 5, "tid": 7, "ts": 1716454224365005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302038, "dur": 6, "args": { "External id": 187239, "cbid": 211, "correlation": 187239 } }, { "ph": "s", "id": 187239, "pid": 76337, "tid": -914061504, "ts": 1716454224302038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302097, "dur": 1, "args": { "External id": 187250, "cbid": 251, "correlation": 187250 } }, { "ph": "f", "id": 187250, "pid": 76337, "tid": -914061504, "ts": 1716454224302097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302100, "dur": 0, "args": { "External id": 187251, "cbid": 251, "correlation": 187251 } }, { "ph": "f", "id": 187251, "pid": 76337, "tid": -914061504, "ts": 1716454224302100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224365012, "dur": 8, "args": { "External id": 187252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187252, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187252, "pid": 5, "tid": 7, "ts": 1716454224365012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302102, "dur": 12, "args": { "External id": 187252, "cbid": 211, "correlation": 187252 } }, { "ph": "s", "id": 187252, "pid": 76337, "tid": -914061504, "ts": 1716454224302102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224365022, "dur": 3, "args": { "External id": 187254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187254, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187254, "pid": 5, "tid": 7, "ts": 1716454224365022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302116, "dur": 5, "args": { "External id": 187254, "cbid": 211, "correlation": 187254 } }, { "ph": "s", "id": 187254, "pid": 76337, "tid": -914061504, "ts": 1716454224302116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224365026, "dur": 54, "args": { "External id": 187279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187279, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187279, "pid": 5, "tid": 7, "ts": 1716454224365026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302191, "dur": 12, "args": { "External id": 187279, "cbid": 211, "correlation": 187279 } }, { "ph": "s", "id": 187279, "pid": 76337, "tid": -914061504, "ts": 1716454224302191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302290, "dur": 1, "args": { "External id": 187297, "cbid": 251, "correlation": 187297 } }, { "ph": "f", "id": 187297, "pid": 76337, "tid": -914061504, "ts": 1716454224302290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224365081, "dur": 90, "args": { "External id": 187299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187299, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187299, "pid": 5, "tid": 7, "ts": 1716454224365081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302296, "dur": 15, "args": { "External id": 187299, "cbid": 211, "correlation": 187299 } }, { "ph": "s", "id": 187299, "pid": 76337, "tid": -914061504, "ts": 1716454224302296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224365173, "dur": 9, "args": { "External id": 187307, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187307, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187307, "pid": 5, "tid": 7, "ts": 1716454224365173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302368, "dur": 12, "args": { "External id": 187307, "cbid": 211, "correlation": 187307 } }, { "ph": "s", "id": 187307, "pid": 76337, "tid": -914061504, "ts": 1716454224302368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224365183, "dur": 21, "args": { "External id": 187315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187315, "pid": 5, "tid": 7, "ts": 1716454224365183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302411, "dur": 9, "args": { "External id": 187315, "cbid": 211, "correlation": 187315 } }, { "ph": "s", "id": 187315, "pid": 76337, "tid": -914061504, "ts": 1716454224302411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224365206, "dur": 18, "args": { "External id": 187337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187337, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187337, "pid": 5, "tid": 7, "ts": 1716454224365206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302463, "dur": 10, "args": { "External id": 187337, "cbid": 211, "correlation": 187337 } }, { "ph": "s", "id": 187337, "pid": 76337, "tid": -914061504, "ts": 1716454224302463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302550, "dur": 1, "args": { "External id": 187353, "cbid": 251, "correlation": 187353 } }, { "ph": "f", "id": 187353, "pid": 76337, "tid": -914061504, "ts": 1716454224302550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302555, "dur": 0, "args": { "External id": 187355, "cbid": 251, "correlation": 187355 } }, { "ph": "f", "id": 187355, "pid": 76337, "tid": -914061504, "ts": 1716454224302555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224365225, "dur": 493, "args": { "External id": 187356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187356, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187356, "pid": 5, "tid": 7, "ts": 1716454224365225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302557, "dur": 14, "args": { "External id": 187356, "cbid": 211, "correlation": 187356 } }, { "ph": "s", "id": 187356, "pid": 76337, "tid": -914061504, "ts": 1716454224302557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224365720, "dur": 65, "args": { "External id": 187364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187364, "pid": 5, "tid": 7, "ts": 1716454224365720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302628, "dur": 12, "args": { "External id": 187364, "cbid": 211, "correlation": 187364 } }, { "ph": "s", "id": 187364, "pid": 76337, "tid": -914061504, "ts": 1716454224302628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224365787, "dur": 67, "args": { "External id": 187372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187372, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187372, "pid": 5, "tid": 7, "ts": 1716454224365787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302662, "dur": 9, "args": { "External id": 187372, "cbid": 211, "correlation": 187372 } }, { "ph": "s", "id": 187372, "pid": 76337, "tid": -914061504, "ts": 1716454224302662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224302742, "dur": 1, "args": { "External id": 187388, "cbid": 251, "correlation": 187388 } }, { "ph": "f", "id": 187388, "pid": 76337, "tid": -914061504, "ts": 1716454224302742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224365855, "dur": 1, "args": { "External id": 187390, "device": 5, "context": 1, "stream": 7, "correlation": 187390, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 187390, "pid": 5, "tid": 7, "ts": 1716454224365855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224302746, "dur": 11, "args": { "External id": 187390, "cbid": 51, "correlation": 187390 } }, { "ph": "s", "id": 187390, "pid": 76337, "tid": -914061504, "ts": 1716454224302746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224365859, "dur": 267, "args": { "External id": 187391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187391, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187391, "pid": 5, "tid": 7, "ts": 1716454224365859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302759, "dur": 11, "args": { "External id": 187391, "cbid": 211, "correlation": 187391 } }, { "ph": "s", "id": 187391, "pid": 76337, "tid": -914061504, "ts": 1716454224302759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224366127, "dur": 14, "args": { "External id": 187399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187399, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187399, "pid": 5, "tid": 7, "ts": 1716454224366127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302800, "dur": 11, "args": { "External id": 187399, "cbid": 211, "correlation": 187399 } }, { "ph": "s", "id": 187399, "pid": 76337, "tid": -914061504, "ts": 1716454224302800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224366143, "dur": 37, "args": { "External id": 187410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187410, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187410, "pid": 5, "tid": 7, "ts": 1716454224366143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302870, "dur": 12, "args": { "External id": 187410, "cbid": 211, "correlation": 187410 } }, { "ph": "s", "id": 187410, "pid": 76337, "tid": -914061504, "ts": 1716454224302870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224302934, "dur": 0, "args": { "External id": 187422, "cbid": 317, "correlation": 187422 } }, { "ph": "f", "id": 187422, "pid": 76337, "tid": -914061504, "ts": 1716454224302934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224302935, "dur": 0, "args": { "External id": 187423, "cbid": 203, "correlation": 187423 } }, { "ph": "f", "id": 187423, "pid": 76337, "tid": -914061504, "ts": 1716454224302935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224302935, "dur": 0, "args": { "External id": 187424, "cbid": 205, "correlation": 187424 } }, { "ph": "f", "id": 187424, "pid": 76337, "tid": -914061504, "ts": 1716454224302935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224366182, "dur": 13, "args": { "External id": 187428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187428, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187428, "pid": 5, "tid": 7, "ts": 1716454224366182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302951, "dur": 13, "args": { "External id": 187428, "cbid": 211, "correlation": 187428 } }, { "ph": "s", "id": 187428, "pid": 76337, "tid": -914061504, "ts": 1716454224302951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224366196, "dur": 4, "args": { "External id": 187430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187430, "pid": 5, "tid": 7, "ts": 1716454224366196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302968, "dur": 13, "args": { "External id": 187430, "cbid": 211, "correlation": 187430 } }, { "ph": "s", "id": 187430, "pid": 76337, "tid": -914061504, "ts": 1716454224302968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224302984, "dur": 0, "args": { "External id": 187431, "cbid": 51, "correlation": 187431 } }, { "ph": "s", "id": 187431, "pid": 76337, "tid": -914061504, "ts": 1716454224302984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224366201, "dur": 96, "args": { "External id": 187432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187432, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 187432, "pid": 5, "tid": 7, "ts": 1716454224366201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224302985, "dur": 6, "args": { "External id": 187432, "cbid": 211, "correlation": 187432 } }, { "ph": "s", "id": 187432, "pid": 76337, "tid": -914061504, "ts": 1716454224302985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224366299, "dur": 16, "args": { "External id": 187437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187437, "pid": 5, "tid": 7, "ts": 1716454224366299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303014, "dur": 9, "args": { "External id": 187437, "cbid": 211, "correlation": 187437 } }, { "ph": "s", "id": 187437, "pid": 76337, "tid": -914061504, "ts": 1716454224303014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224366316, "dur": 12, "args": { "External id": 187445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187445, "pid": 5, "tid": 7, "ts": 1716454224366316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303045, "dur": 9, "args": { "External id": 187445, "cbid": 211, "correlation": 187445 } }, { "ph": "s", "id": 187445, "pid": 76337, "tid": -914061504, "ts": 1716454224303045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224366329, "dur": 55, "args": { "External id": 187456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187456, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187456, "pid": 5, "tid": 7, "ts": 1716454224366329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303113, "dur": 12, "args": { "External id": 187456, "cbid": 211, "correlation": 187456 } }, { "ph": "s", "id": 187456, "pid": 76337, "tid": -914061504, "ts": 1716454224303113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224303169, "dur": 0, "args": { "External id": 187466, "cbid": 317, "correlation": 187466 } }, { "ph": "f", "id": 187466, "pid": 76337, "tid": -914061504, "ts": 1716454224303169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224303170, "dur": 0, "args": { "External id": 187467, "cbid": 203, "correlation": 187467 } }, { "ph": "f", "id": 187467, "pid": 76337, "tid": -914061504, "ts": 1716454224303170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224303170, "dur": 0, "args": { "External id": 187468, "cbid": 205, "correlation": 187468 } }, { "ph": "f", "id": 187468, "pid": 76337, "tid": -914061504, "ts": 1716454224303170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224366385, "dur": 38, "args": { "External id": 187472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187472, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187472, "pid": 5, "tid": 7, "ts": 1716454224366385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303187, "dur": 12, "args": { "External id": 187472, "cbid": 211, "correlation": 187472 } }, { "ph": "s", "id": 187472, "pid": 76337, "tid": -914061504, "ts": 1716454224303187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224366424, "dur": 159, "args": { "External id": 187474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187474, "pid": 5, "tid": 7, "ts": 1716454224366424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303201, "dur": 6, "args": { "External id": 187474, "cbid": 211, "correlation": 187474 } }, { "ph": "s", "id": 187474, "pid": 76337, "tid": -914061504, "ts": 1716454224303201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224366585, "dur": 1975, "args": { "External id": 187476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187476, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187476, "pid": 5, "tid": 7, "ts": 1716454224366585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303214, "dur": 9, "args": { "External id": 187476, "cbid": 211, "correlation": 187476 } }, { "ph": "s", "id": 187476, "pid": 76337, "tid": -914061504, "ts": 1716454224303214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224368561, "dur": 38, "args": { "External id": 187478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187478, "pid": 5, "tid": 7, "ts": 1716454224368561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303227, "dur": 5, "args": { "External id": 187478, "cbid": 211, "correlation": 187478 } }, { "ph": "s", "id": 187478, "pid": 76337, "tid": -914061504, "ts": 1716454224303227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224368601, "dur": 59, "args": { "External id": 187484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187484, "pid": 5, "tid": 7, "ts": 1716454224368601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303256, "dur": 8, "args": { "External id": 187484, "cbid": 211, "correlation": 187484 } }, { "ph": "s", "id": 187484, "pid": 76337, "tid": -914061504, "ts": 1716454224303256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224368661, "dur": 82, "args": { "External id": 187493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187493, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187493, "pid": 5, "tid": 7, "ts": 1716454224368661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303349, "dur": 14, "args": { "External id": 187493, "cbid": 211, "correlation": 187493 } }, { "ph": "s", "id": 187493, "pid": 76337, "tid": -914061504, "ts": 1716454224303349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224368745, "dur": 73, "args": { "External id": 187513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187513, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 187513, "pid": 5, "tid": 7, "ts": 1716454224368745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303422, "dur": 11, "args": { "External id": 187513, "cbid": 211, "correlation": 187513 } }, { "ph": "s", "id": 187513, "pid": 76337, "tid": -914061504, "ts": 1716454224303422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224368819, "dur": 5, "args": { "External id": 187525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187525, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 187525, "pid": 5, "tid": 7, "ts": 1716454224368819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303442, "dur": 6, "args": { "External id": 187525, "cbid": 211, "correlation": 187525 } }, { "ph": "s", "id": 187525, "pid": 76337, "tid": -914061504, "ts": 1716454224303442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224368825, "dur": 81, "args": { "External id": 187528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187528, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187528, "pid": 5, "tid": 7, "ts": 1716454224368825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303462, "dur": 7, "args": { "External id": 187528, "cbid": 211, "correlation": 187528 } }, { "ph": "s", "id": 187528, "pid": 76337, "tid": -914061504, "ts": 1716454224303462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224368907, "dur": 53, "args": { "External id": 187537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187537, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187537, "pid": 5, "tid": 7, "ts": 1716454224368907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303503, "dur": 10, "args": { "External id": 187537, "cbid": 211, "correlation": 187537 } }, { "ph": "s", "id": 187537, "pid": 76337, "tid": -914061504, "ts": 1716454224303503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224303555, "dur": 0, "args": { "External id": 187547, "cbid": 317, "correlation": 187547 } }, { "ph": "f", "id": 187547, "pid": 76337, "tid": -914061504, "ts": 1716454224303555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224303555, "dur": 0, "args": { "External id": 187548, "cbid": 203, "correlation": 187548 } }, { "ph": "f", "id": 187548, "pid": 76337, "tid": -914061504, "ts": 1716454224303555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224303556, "dur": 0, "args": { "External id": 187549, "cbid": 205, "correlation": 187549 } }, { "ph": "f", "id": 187549, "pid": 76337, "tid": -914061504, "ts": 1716454224303556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224368962, "dur": 57, "args": { "External id": 187553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187553, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187553, "pid": 5, "tid": 7, "ts": 1716454224368962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303573, "dur": 11, "args": { "External id": 187553, "cbid": 211, "correlation": 187553 } }, { "ph": "s", "id": 187553, "pid": 76337, "tid": -914061504, "ts": 1716454224303573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224369021, "dur": 121, "args": { "External id": 187555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187555, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187555, "pid": 5, "tid": 7, "ts": 1716454224369021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303587, "dur": 5, "args": { "External id": 187555, "cbid": 211, "correlation": 187555 } }, { "ph": "s", "id": 187555, "pid": 76337, "tid": -914061504, "ts": 1716454224303587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224369142, "dur": 1880, "args": { "External id": 187557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187557, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187557, "pid": 5, "tid": 7, "ts": 1716454224369142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303598, "dur": 7, "args": { "External id": 187557, "cbid": 211, "correlation": 187557 } }, { "ph": "s", "id": 187557, "pid": 76337, "tid": -914061504, "ts": 1716454224303598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224371024, "dur": 19, "args": { "External id": 187559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187559, "pid": 5, "tid": 7, "ts": 1716454224371024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303609, "dur": 5, "args": { "External id": 187559, "cbid": 211, "correlation": 187559 } }, { "ph": "s", "id": 187559, "pid": 76337, "tid": -914061504, "ts": 1716454224303609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224371044, "dur": 32, "args": { "External id": 187565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187565, "pid": 5, "tid": 7, "ts": 1716454224371044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303636, "dur": 8, "args": { "External id": 187565, "cbid": 211, "correlation": 187565 } }, { "ph": "s", "id": 187565, "pid": 76337, "tid": -914061504, "ts": 1716454224303636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224371078, "dur": 4, "args": { "External id": 187573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187573, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 187573, "pid": 5, "tid": 7, "ts": 1716454224371078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303680, "dur": 9, "args": { "External id": 187573, "cbid": 211, "correlation": 187573 } }, { "ph": "s", "id": 187573, "pid": 76337, "tid": -914061504, "ts": 1716454224303680, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224303749, "dur": 1, "args": { "External id": 187589, "cbid": 251, "correlation": 187589 } }, { "ph": "f", "id": 187589, "pid": 76337, "tid": -914061504, "ts": 1716454224303749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224303755, "dur": 0, "args": { "External id": 187591, "cbid": 251, "correlation": 187591 } }, { "ph": "f", "id": 187591, "pid": 76337, "tid": -914061504, "ts": 1716454224303755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224371083, "dur": 12, "args": { "External id": 187592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187592, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 187592, "pid": 5, "tid": 7, "ts": 1716454224371083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303757, "dur": 11, "args": { "External id": 187592, "cbid": 211, "correlation": 187592 } }, { "ph": "s", "id": 187592, "pid": 76337, "tid": -914061504, "ts": 1716454224303757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224371096, "dur": 5, "args": { "External id": 187594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187594, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 187594, "pid": 5, "tid": 7, "ts": 1716454224371096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303770, "dur": 5, "args": { "External id": 187594, "cbid": 211, "correlation": 187594 } }, { "ph": "s", "id": 187594, "pid": 76337, "tid": -914061504, "ts": 1716454224303770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224371102, "dur": 29, "args": { "External id": 187604, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187604, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187604, "pid": 5, "tid": 7, "ts": 1716454224371102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303831, "dur": 12, "args": { "External id": 187604, "cbid": 211, "correlation": 187604 } }, { "ph": "s", "id": 187604, "pid": 76337, "tid": -914061504, "ts": 1716454224303831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224371132, "dur": 30, "args": { "External id": 187624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187624, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 187624, "pid": 5, "tid": 7, "ts": 1716454224371132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303897, "dur": 11, "args": { "External id": 187624, "cbid": 211, "correlation": 187624 } }, { "ph": "s", "id": 187624, "pid": 76337, "tid": -914061504, "ts": 1716454224303897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224371164, "dur": 4, "args": { "External id": 187636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187636, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 187636, "pid": 5, "tid": 7, "ts": 1716454224371164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303918, "dur": 6, "args": { "External id": 187636, "cbid": 211, "correlation": 187636 } }, { "ph": "s", "id": 187636, "pid": 76337, "tid": -914061504, "ts": 1716454224303918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224371169, "dur": 29, "args": { "External id": 187639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187639, "pid": 5, "tid": 7, "ts": 1716454224371169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303936, "dur": 6, "args": { "External id": 187639, "cbid": 211, "correlation": 187639 } }, { "ph": "s", "id": 187639, "pid": 76337, "tid": -914061504, "ts": 1716454224303936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224371199, "dur": 20, "args": { "External id": 187648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187648, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187648, "pid": 5, "tid": 7, "ts": 1716454224371199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224303983, "dur": 11, "args": { "External id": 187648, "cbid": 211, "correlation": 187648 } }, { "ph": "s", "id": 187648, "pid": 76337, "tid": -914061504, "ts": 1716454224303983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224304049, "dur": 0, "args": { "External id": 187658, "cbid": 317, "correlation": 187658 } }, { "ph": "f", "id": 187658, "pid": 76337, "tid": -914061504, "ts": 1716454224304049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224304050, "dur": 0, "args": { "External id": 187659, "cbid": 203, "correlation": 187659 } }, { "ph": "f", "id": 187659, "pid": 76337, "tid": -914061504, "ts": 1716454224304050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224304050, "dur": 0, "args": { "External id": 187660, "cbid": 205, "correlation": 187660 } }, { "ph": "f", "id": 187660, "pid": 76337, "tid": -914061504, "ts": 1716454224304050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224371221, "dur": 21, "args": { "External id": 187664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187664, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187664, "pid": 5, "tid": 7, "ts": 1716454224371221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304067, "dur": 12, "args": { "External id": 187664, "cbid": 211, "correlation": 187664 } }, { "ph": "s", "id": 187664, "pid": 76337, "tid": -914061504, "ts": 1716454224304067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224371243, "dur": 43, "args": { "External id": 187666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187666, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187666, "pid": 5, "tid": 7, "ts": 1716454224371243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304082, "dur": 5, "args": { "External id": 187666, "cbid": 211, "correlation": 187666 } }, { "ph": "s", "id": 187666, "pid": 76337, "tid": -914061504, "ts": 1716454224304082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224371288, "dur": 640, "args": { "External id": 187668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187668, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187668, "pid": 5, "tid": 7, "ts": 1716454224371288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304093, "dur": 6, "args": { "External id": 187668, "cbid": 211, "correlation": 187668 } }, { "ph": "s", "id": 187668, "pid": 76337, "tid": -914061504, "ts": 1716454224304093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224371929, "dur": 23, "args": { "External id": 187670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187670, "pid": 5, "tid": 7, "ts": 1716454224371929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304104, "dur": 5, "args": { "External id": 187670, "cbid": 211, "correlation": 187670 } }, { "ph": "s", "id": 187670, "pid": 76337, "tid": -914061504, "ts": 1716454224304104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224371953, "dur": 32, "args": { "External id": 187676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187676, "pid": 5, "tid": 7, "ts": 1716454224371953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304132, "dur": 8, "args": { "External id": 187676, "cbid": 211, "correlation": 187676 } }, { "ph": "s", "id": 187676, "pid": 76337, "tid": -914061504, "ts": 1716454224304132, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224304190, "dur": 0, "args": { "External id": 187686, "cbid": 317, "correlation": 187686 } }, { "ph": "f", "id": 187686, "pid": 76337, "tid": -914061504, "ts": 1716454224304190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224304191, "dur": 0, "args": { "External id": 187687, "cbid": 203, "correlation": 187687 } }, { "ph": "f", "id": 187687, "pid": 76337, "tid": -914061504, "ts": 1716454224304191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224304192, "dur": 0, "args": { "External id": 187688, "cbid": 205, "correlation": 187688 } }, { "ph": "f", "id": 187688, "pid": 76337, "tid": -914061504, "ts": 1716454224304192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224371986, "dur": 56, "args": { "External id": 187692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187692, "pid": 5, "tid": 7, "ts": 1716454224371986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304206, "dur": 12, "args": { "External id": 187692, "cbid": 211, "correlation": 187692 } }, { "ph": "s", "id": 187692, "pid": 76337, "tid": -914061504, "ts": 1716454224304206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224372044, "dur": 263, "args": { "External id": 187694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187694, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187694, "pid": 5, "tid": 7, "ts": 1716454224372044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304224, "dur": 8, "args": { "External id": 187694, "cbid": 211, "correlation": 187694 } }, { "ph": "s", "id": 187694, "pid": 76337, "tid": -914061504, "ts": 1716454224304224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224372308, "dur": 23, "args": { "External id": 187696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187696, "pid": 5, "tid": 7, "ts": 1716454224372308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304235, "dur": 5, "args": { "External id": 187696, "cbid": 211, "correlation": 187696 } }, { "ph": "s", "id": 187696, "pid": 76337, "tid": -914061504, "ts": 1716454224304235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224372332, "dur": 32, "args": { "External id": 187702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187702, "pid": 5, "tid": 7, "ts": 1716454224372332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304261, "dur": 8, "args": { "External id": 187702, "cbid": 211, "correlation": 187702 } }, { "ph": "s", "id": 187702, "pid": 76337, "tid": -914061504, "ts": 1716454224304261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224372365, "dur": 27, "args": { "External id": 187710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187710, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187710, "pid": 5, "tid": 7, "ts": 1716454224372365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304289, "dur": 8, "args": { "External id": 187710, "cbid": 211, "correlation": 187710 } }, { "ph": "s", "id": 187710, "pid": 76337, "tid": -914061504, "ts": 1716454224304289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224372394, "dur": 19, "args": { "External id": 187718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187718, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187718, "pid": 5, "tid": 7, "ts": 1716454224372394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304318, "dur": 9, "args": { "External id": 187718, "cbid": 211, "correlation": 187718 } }, { "ph": "s", "id": 187718, "pid": 76337, "tid": -914061504, "ts": 1716454224304318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224372414, "dur": 31, "args": { "External id": 187738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187738, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 187738, "pid": 5, "tid": 7, "ts": 1716454224372414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304401, "dur": 13, "args": { "External id": 187738, "cbid": 211, "correlation": 187738 } }, { "ph": "s", "id": 187738, "pid": 76337, "tid": -914061504, "ts": 1716454224304401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224372447, "dur": 4, "args": { "External id": 187750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187750, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 187750, "pid": 5, "tid": 7, "ts": 1716454224372447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304423, "dur": 6, "args": { "External id": 187750, "cbid": 211, "correlation": 187750 } }, { "ph": "s", "id": 187750, "pid": 76337, "tid": -914061504, "ts": 1716454224304423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224372453, "dur": 30, "args": { "External id": 187753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187753, "pid": 5, "tid": 7, "ts": 1716454224372453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304440, "dur": 7, "args": { "External id": 187753, "cbid": 211, "correlation": 187753 } }, { "ph": "s", "id": 187753, "pid": 76337, "tid": -914061504, "ts": 1716454224304440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224304498, "dur": 0, "args": { "External id": 187764, "cbid": 317, "correlation": 187764 } }, { "ph": "f", "id": 187764, "pid": 76337, "tid": -914061504, "ts": 1716454224304498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224304499, "dur": 0, "args": { "External id": 187765, "cbid": 203, "correlation": 187765 } }, { "ph": "f", "id": 187765, "pid": 76337, "tid": -914061504, "ts": 1716454224304499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224304500, "dur": 0, "args": { "External id": 187766, "cbid": 205, "correlation": 187766 } }, { "ph": "f", "id": 187766, "pid": 76337, "tid": -914061504, "ts": 1716454224304500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224372484, "dur": 21, "args": { "External id": 187770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187770, "pid": 5, "tid": 7, "ts": 1716454224372484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304514, "dur": 12, "args": { "External id": 187770, "cbid": 211, "correlation": 187770 } }, { "ph": "s", "id": 187770, "pid": 76337, "tid": -914061504, "ts": 1716454224304514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224372507, "dur": 104, "args": { "External id": 187772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187772, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187772, "pid": 5, "tid": 7, "ts": 1716454224372507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304532, "dur": 7, "args": { "External id": 187772, "cbid": 211, "correlation": 187772 } }, { "ph": "s", "id": 187772, "pid": 76337, "tid": -914061504, "ts": 1716454224304532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224372612, "dur": 23, "args": { "External id": 187774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187774, "pid": 5, "tid": 7, "ts": 1716454224372612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304543, "dur": 5, "args": { "External id": 187774, "cbid": 211, "correlation": 187774 } }, { "ph": "s", "id": 187774, "pid": 76337, "tid": -914061504, "ts": 1716454224304543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224372636, "dur": 32, "args": { "External id": 187780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187780, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187780, "pid": 5, "tid": 7, "ts": 1716454224372636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304571, "dur": 8, "args": { "External id": 187780, "cbid": 211, "correlation": 187780 } }, { "ph": "s", "id": 187780, "pid": 76337, "tid": -914061504, "ts": 1716454224304571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224372669, "dur": 192, "args": { "External id": 187789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187789, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187789, "pid": 5, "tid": 7, "ts": 1716454224372669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304652, "dur": 15, "args": { "External id": 187789, "cbid": 211, "correlation": 187789 } }, { "ph": "s", "id": 187789, "pid": 76337, "tid": -914061504, "ts": 1716454224304652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224372863, "dur": 64, "args": { "External id": 187811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187811, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187811, "pid": 5, "tid": 7, "ts": 1716454224372863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304711, "dur": 10, "args": { "External id": 187811, "cbid": 211, "correlation": 187811 } }, { "ph": "s", "id": 187811, "pid": 76337, "tid": -914061504, "ts": 1716454224304711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224304801, "dur": 1, "args": { "External id": 187822, "cbid": 251, "correlation": 187822 } }, { "ph": "f", "id": 187822, "pid": 76337, "tid": -914061504, "ts": 1716454224304801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224372928, "dur": 152, "args": { "External id": 187823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187823, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187823, "pid": 5, "tid": 7, "ts": 1716454224372928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304807, "dur": 13, "args": { "External id": 187823, "cbid": 211, "correlation": 187823 } }, { "ph": "s", "id": 187823, "pid": 76337, "tid": -914061504, "ts": 1716454224304807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224304876, "dur": 1, "args": { "External id": 187834, "cbid": 251, "correlation": 187834 } }, { "ph": "f", "id": 187834, "pid": 76337, "tid": -914061504, "ts": 1716454224304876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224373081, "dur": 146, "args": { "External id": 187835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187835, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187835, "pid": 5, "tid": 7, "ts": 1716454224373081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304880, "dur": 11, "args": { "External id": 187835, "cbid": 211, "correlation": 187835 } }, { "ph": "s", "id": 187835, "pid": 76337, "tid": -914061504, "ts": 1716454224304880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224304945, "dur": 1, "args": { "External id": 187846, "cbid": 251, "correlation": 187846 } }, { "ph": "f", "id": 187846, "pid": 76337, "tid": -914061504, "ts": 1716454224304945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224373229, "dur": 143, "args": { "External id": 187847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187847, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187847, "pid": 5, "tid": 7, "ts": 1716454224373229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224304949, "dur": 12, "args": { "External id": 187847, "cbid": 211, "correlation": 187847 } }, { "ph": "s", "id": 187847, "pid": 76337, "tid": -914061504, "ts": 1716454224304949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224373374, "dur": 1910, "args": { "External id": 187868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187868, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 187868, "pid": 5, "tid": 7, "ts": 1716454224373374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305040, "dur": 14, "args": { "External id": 187868, "cbid": 211, "correlation": 187868 } }, { "ph": "s", "id": 187868, "pid": 76337, "tid": -914061504, "ts": 1716454224305040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305140, "dur": 1, "args": { "External id": 187886, "cbid": 251, "correlation": 187886 } }, { "ph": "f", "id": 187886, "pid": 76337, "tid": -914061504, "ts": 1716454224305140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224375285, "dur": 149, "args": { "External id": 187888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187888, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 187888, "pid": 5, "tid": 7, "ts": 1716454224375285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305146, "dur": 14, "args": { "External id": 187888, "cbid": 211, "correlation": 187888 } }, { "ph": "s", "id": 187888, "pid": 76337, "tid": -914061504, "ts": 1716454224305146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224375435, "dur": 35, "args": { "External id": 187896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187896, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187896, "pid": 5, "tid": 7, "ts": 1716454224375435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305217, "dur": 12, "args": { "External id": 187896, "cbid": 211, "correlation": 187896 } }, { "ph": "s", "id": 187896, "pid": 76337, "tid": -914061504, "ts": 1716454224305217, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224375471, "dur": 51, "args": { "External id": 187904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187904, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187904, "pid": 5, "tid": 7, "ts": 1716454224375471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305256, "dur": 9, "args": { "External id": 187904, "cbid": 211, "correlation": 187904 } }, { "ph": "s", "id": 187904, "pid": 76337, "tid": -914061504, "ts": 1716454224305256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224375523, "dur": 29, "args": { "External id": 187915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187915, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187915, "pid": 5, "tid": 7, "ts": 1716454224375523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305327, "dur": 12, "args": { "External id": 187915, "cbid": 211, "correlation": 187915 } }, { "ph": "s", "id": 187915, "pid": 76337, "tid": -914061504, "ts": 1716454224305327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224375553, "dur": 33, "args": { "External id": 187937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187937, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 187937, "pid": 5, "tid": 7, "ts": 1716454224375553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305358, "dur": 8, "args": { "External id": 187937, "cbid": 211, "correlation": 187937 } }, { "ph": "s", "id": 187937, "pid": 76337, "tid": -914061504, "ts": 1716454224305358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305442, "dur": 1, "args": { "External id": 187948, "cbid": 251, "correlation": 187948 } }, { "ph": "f", "id": 187948, "pid": 76337, "tid": -914061504, "ts": 1716454224305442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224375588, "dur": 76, "args": { "External id": 187949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187949, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 187949, "pid": 5, "tid": 7, "ts": 1716454224375588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305448, "dur": 14, "args": { "External id": 187949, "cbid": 211, "correlation": 187949 } }, { "ph": "s", "id": 187949, "pid": 76337, "tid": -914061504, "ts": 1716454224305448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305517, "dur": 1, "args": { "External id": 187960, "cbid": 251, "correlation": 187960 } }, { "ph": "f", "id": 187960, "pid": 76337, "tid": -914061504, "ts": 1716454224305517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305520, "dur": 0, "args": { "External id": 187961, "cbid": 251, "correlation": 187961 } }, { "ph": "f", "id": 187961, "pid": 76337, "tid": -914061504, "ts": 1716454224305520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224375666, "dur": 11, "args": { "External id": 187962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187962, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 187962, "pid": 5, "tid": 7, "ts": 1716454224375666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305522, "dur": 12, "args": { "External id": 187962, "cbid": 211, "correlation": 187962 } }, { "ph": "s", "id": 187962, "pid": 76337, "tid": -914061504, "ts": 1716454224305522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224375678, "dur": 5, "args": { "External id": 187964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187964, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 187964, "pid": 5, "tid": 7, "ts": 1716454224375678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305536, "dur": 6, "args": { "External id": 187964, "cbid": 211, "correlation": 187964 } }, { "ph": "s", "id": 187964, "pid": 76337, "tid": -914061504, "ts": 1716454224305536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305593, "dur": 1, "args": { "External id": 187975, "cbid": 251, "correlation": 187975 } }, { "ph": "f", "id": 187975, "pid": 76337, "tid": -914061504, "ts": 1716454224305593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305596, "dur": 0, "args": { "External id": 187976, "cbid": 251, "correlation": 187976 } }, { "ph": "f", "id": 187976, "pid": 76337, "tid": -914061504, "ts": 1716454224305596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224375684, "dur": 7, "args": { "External id": 187977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187977, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 187977, "pid": 5, "tid": 7, "ts": 1716454224375684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305598, "dur": 12, "args": { "External id": 187977, "cbid": 211, "correlation": 187977 } }, { "ph": "s", "id": 187977, "pid": 76337, "tid": -914061504, "ts": 1716454224305598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224375692, "dur": 3, "args": { "External id": 187979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 187979, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 187979, "pid": 5, "tid": 7, "ts": 1716454224375692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305612, "dur": 5, "args": { "External id": 187979, "cbid": 211, "correlation": 187979 } }, { "ph": "s", "id": 187979, "pid": 76337, "tid": -914061504, "ts": 1716454224305612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224375697, "dur": 91, "args": { "External id": 188000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188000, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 188000, "pid": 5, "tid": 7, "ts": 1716454224375697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305684, "dur": 13, "args": { "External id": 188000, "cbid": 211, "correlation": 188000 } }, { "ph": "s", "id": 188000, "pid": 76337, "tid": -914061504, "ts": 1716454224305684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224305782, "dur": 1, "args": { "External id": 188018, "cbid": 251, "correlation": 188018 } }, { "ph": "f", "id": 188018, "pid": 76337, "tid": -914061504, "ts": 1716454224305782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224375789, "dur": 96, "args": { "External id": 188020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188020, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188020, "pid": 5, "tid": 7, "ts": 1716454224375789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305788, "dur": 13, "args": { "External id": 188020, "cbid": 211, "correlation": 188020 } }, { "ph": "s", "id": 188020, "pid": 76337, "tid": -914061504, "ts": 1716454224305788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224375887, "dur": 19, "args": { "External id": 188028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188028, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188028, "pid": 5, "tid": 7, "ts": 1716454224375887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305857, "dur": 13, "args": { "External id": 188028, "cbid": 211, "correlation": 188028 } }, { "ph": "s", "id": 188028, "pid": 76337, "tid": -914061504, "ts": 1716454224305857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224375907, "dur": 37, "args": { "External id": 188036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188036, "pid": 5, "tid": 7, "ts": 1716454224375907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305898, "dur": 9, "args": { "External id": 188036, "cbid": 211, "correlation": 188036 } }, { "ph": "s", "id": 188036, "pid": 76337, "tid": -914061504, "ts": 1716454224305898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224375945, "dur": 34, "args": { "External id": 188058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188058, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188058, "pid": 5, "tid": 7, "ts": 1716454224375945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224305949, "dur": 10, "args": { "External id": 188058, "cbid": 211, "correlation": 188058 } }, { "ph": "s", "id": 188058, "pid": 76337, "tid": -914061504, "ts": 1716454224305949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224306052, "dur": 1, "args": { "External id": 188074, "cbid": 251, "correlation": 188074 } }, { "ph": "f", "id": 188074, "pid": 76337, "tid": -914061504, "ts": 1716454224306052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224306057, "dur": 0, "args": { "External id": 188076, "cbid": 251, "correlation": 188076 } }, { "ph": "f", "id": 188076, "pid": 76337, "tid": -914061504, "ts": 1716454224306057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224375980, "dur": 532, "args": { "External id": 188077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188077, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 188077, "pid": 5, "tid": 7, "ts": 1716454224375980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306062, "dur": 13, "args": { "External id": 188077, "cbid": 211, "correlation": 188077 } }, { "ph": "s", "id": 188077, "pid": 76337, "tid": -914061504, "ts": 1716454224306062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224376514, "dur": 124, "args": { "External id": 188085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188085, "pid": 5, "tid": 7, "ts": 1716454224376514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306129, "dur": 12, "args": { "External id": 188085, "cbid": 211, "correlation": 188085 } }, { "ph": "s", "id": 188085, "pid": 76337, "tid": -914061504, "ts": 1716454224306129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224376639, "dur": 129, "args": { "External id": 188093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188093, "pid": 5, "tid": 7, "ts": 1716454224376639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306160, "dur": 8, "args": { "External id": 188093, "cbid": 211, "correlation": 188093 } }, { "ph": "s", "id": 188093, "pid": 76337, "tid": -914061504, "ts": 1716454224306160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224306236, "dur": 1, "args": { "External id": 188109, "cbid": 251, "correlation": 188109 } }, { "ph": "f", "id": 188109, "pid": 76337, "tid": -914061504, "ts": 1716454224306236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224376770, "dur": 299, "args": { "External id": 188111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188111, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188111, "pid": 5, "tid": 7, "ts": 1716454224376770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306242, "dur": 12, "args": { "External id": 188111, "cbid": 211, "correlation": 188111 } }, { "ph": "s", "id": 188111, "pid": 76337, "tid": -914061504, "ts": 1716454224306242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224377070, "dur": 27, "args": { "External id": 188119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188119, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188119, "pid": 5, "tid": 7, "ts": 1716454224377070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306284, "dur": 10, "args": { "External id": 188119, "cbid": 211, "correlation": 188119 } }, { "ph": "s", "id": 188119, "pid": 76337, "tid": -914061504, "ts": 1716454224306284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224377099, "dur": 80, "args": { "External id": 188130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188130, "pid": 5, "tid": 7, "ts": 1716454224377099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306350, "dur": 13, "args": { "External id": 188130, "cbid": 211, "correlation": 188130 } }, { "ph": "s", "id": 188130, "pid": 76337, "tid": -914061504, "ts": 1716454224306350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224306414, "dur": 0, "args": { "External id": 188142, "cbid": 317, "correlation": 188142 } }, { "ph": "f", "id": 188142, "pid": 76337, "tid": -914061504, "ts": 1716454224306414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224306414, "dur": 0, "args": { "External id": 188143, "cbid": 203, "correlation": 188143 } }, { "ph": "f", "id": 188143, "pid": 76337, "tid": -914061504, "ts": 1716454224306414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224306415, "dur": 0, "args": { "External id": 188144, "cbid": 205, "correlation": 188144 } }, { "ph": "f", "id": 188144, "pid": 76337, "tid": -914061504, "ts": 1716454224306415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224377180, "dur": 24, "args": { "External id": 188148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188148, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188148, "pid": 5, "tid": 7, "ts": 1716454224377180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306430, "dur": 12, "args": { "External id": 188148, "cbid": 211, "correlation": 188148 } }, { "ph": "s", "id": 188148, "pid": 76337, "tid": -914061504, "ts": 1716454224306430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224377205, "dur": 118, "args": { "External id": 188150, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188150, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188150, "pid": 5, "tid": 7, "ts": 1716454224377205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306448, "dur": 6, "args": { "External id": 188150, "cbid": 211, "correlation": 188150 } }, { "ph": "s", "id": 188150, "pid": 76337, "tid": -914061504, "ts": 1716454224306448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224377325, "dur": 25, "args": { "External id": 188152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188152, "pid": 5, "tid": 7, "ts": 1716454224377325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306458, "dur": 6, "args": { "External id": 188152, "cbid": 211, "correlation": 188152 } }, { "ph": "s", "id": 188152, "pid": 76337, "tid": -914061504, "ts": 1716454224306458, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224377351, "dur": 32, "args": { "External id": 188158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188158, "pid": 5, "tid": 7, "ts": 1716454224377351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306486, "dur": 8, "args": { "External id": 188158, "cbid": 211, "correlation": 188158 } }, { "ph": "s", "id": 188158, "pid": 76337, "tid": -914061504, "ts": 1716454224306486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224377385, "dur": 27, "args": { "External id": 188166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188166, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188166, "pid": 5, "tid": 7, "ts": 1716454224377385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306518, "dur": 8, "args": { "External id": 188166, "cbid": 211, "correlation": 188166 } }, { "ph": "s", "id": 188166, "pid": 76337, "tid": -914061504, "ts": 1716454224306518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224377413, "dur": 54, "args": { "External id": 188175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188175, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188175, "pid": 5, "tid": 7, "ts": 1716454224377413, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306555, "dur": 11, "args": { "External id": 188175, "cbid": 211, "correlation": 188175 } }, { "ph": "s", "id": 188175, "pid": 76337, "tid": -914061504, "ts": 1716454224306555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224377468, "dur": 53, "args": { "External id": 188195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188195, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 188195, "pid": 5, "tid": 7, "ts": 1716454224377468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306626, "dur": 11, "args": { "External id": 188195, "cbid": 211, "correlation": 188195 } }, { "ph": "s", "id": 188195, "pid": 76337, "tid": -914061504, "ts": 1716454224306626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224377522, "dur": 5, "args": { "External id": 188207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188207, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 188207, "pid": 5, "tid": 7, "ts": 1716454224377522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306648, "dur": 6, "args": { "External id": 188207, "cbid": 211, "correlation": 188207 } }, { "ph": "s", "id": 188207, "pid": 76337, "tid": -914061504, "ts": 1716454224306648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224377528, "dur": 57, "args": { "External id": 188210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188210, "pid": 5, "tid": 7, "ts": 1716454224377528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306667, "dur": 6, "args": { "External id": 188210, "cbid": 211, "correlation": 188210 } }, { "ph": "s", "id": 188210, "pid": 76337, "tid": -914061504, "ts": 1716454224306667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224377586, "dur": 37, "args": { "External id": 188219, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188219, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188219, "pid": 5, "tid": 7, "ts": 1716454224377586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306705, "dur": 10, "args": { "External id": 188219, "cbid": 211, "correlation": 188219 } }, { "ph": "s", "id": 188219, "pid": 76337, "tid": -914061504, "ts": 1716454224306705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224306756, "dur": 0, "args": { "External id": 188229, "cbid": 317, "correlation": 188229 } }, { "ph": "f", "id": 188229, "pid": 76337, "tid": -914061504, "ts": 1716454224306756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224306756, "dur": 0, "args": { "External id": 188230, "cbid": 203, "correlation": 188230 } }, { "ph": "f", "id": 188230, "pid": 76337, "tid": -914061504, "ts": 1716454224306756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224306757, "dur": 0, "args": { "External id": 188231, "cbid": 205, "correlation": 188231 } }, { "ph": "f", "id": 188231, "pid": 76337, "tid": -914061504, "ts": 1716454224306757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224377624, "dur": 39, "args": { "External id": 188235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188235, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188235, "pid": 5, "tid": 7, "ts": 1716454224377624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306773, "dur": 11, "args": { "External id": 188235, "cbid": 211, "correlation": 188235 } }, { "ph": "s", "id": 188235, "pid": 76337, "tid": -914061504, "ts": 1716454224306773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224377665, "dur": 82, "args": { "External id": 188237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188237, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188237, "pid": 5, "tid": 7, "ts": 1716454224377665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306787, "dur": 5, "args": { "External id": 188237, "cbid": 211, "correlation": 188237 } }, { "ph": "s", "id": 188237, "pid": 76337, "tid": -914061504, "ts": 1716454224306787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224377748, "dur": 1266, "args": { "External id": 188239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188239, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188239, "pid": 5, "tid": 7, "ts": 1716454224377748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306797, "dur": 6, "args": { "External id": 188239, "cbid": 211, "correlation": 188239 } }, { "ph": "s", "id": 188239, "pid": 76337, "tid": -914061504, "ts": 1716454224306797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224379016, "dur": 22, "args": { "External id": 188241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188241, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188241, "pid": 5, "tid": 7, "ts": 1716454224379016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306807, "dur": 5, "args": { "External id": 188241, "cbid": 211, "correlation": 188241 } }, { "ph": "s", "id": 188241, "pid": 76337, "tid": -914061504, "ts": 1716454224306807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224379040, "dur": 33, "args": { "External id": 188247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188247, "pid": 5, "tid": 7, "ts": 1716454224379040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306835, "dur": 8, "args": { "External id": 188247, "cbid": 211, "correlation": 188247 } }, { "ph": "s", "id": 188247, "pid": 76337, "tid": -914061504, "ts": 1716454224306835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224379074, "dur": 3, "args": { "External id": 188255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188255, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 188255, "pid": 5, "tid": 7, "ts": 1716454224379074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306878, "dur": 9, "args": { "External id": 188255, "cbid": 211, "correlation": 188255 } }, { "ph": "s", "id": 188255, "pid": 76337, "tid": -914061504, "ts": 1716454224306878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224306943, "dur": 1, "args": { "External id": 188271, "cbid": 251, "correlation": 188271 } }, { "ph": "f", "id": 188271, "pid": 76337, "tid": -914061504, "ts": 1716454224306943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224306948, "dur": 0, "args": { "External id": 188273, "cbid": 251, "correlation": 188273 } }, { "ph": "f", "id": 188273, "pid": 76337, "tid": -914061504, "ts": 1716454224306948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224379079, "dur": 12, "args": { "External id": 188274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188274, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 188274, "pid": 5, "tid": 7, "ts": 1716454224379079, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306950, "dur": 11, "args": { "External id": 188274, "cbid": 211, "correlation": 188274 } }, { "ph": "s", "id": 188274, "pid": 76337, "tid": -914061504, "ts": 1716454224306950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224379091, "dur": 5, "args": { "External id": 188276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188276, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 188276, "pid": 5, "tid": 7, "ts": 1716454224379091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224306964, "dur": 6, "args": { "External id": 188276, "cbid": 211, "correlation": 188276 } }, { "ph": "s", "id": 188276, "pid": 76337, "tid": -914061504, "ts": 1716454224306964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224379098, "dur": 28, "args": { "External id": 188286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188286, "pid": 5, "tid": 7, "ts": 1716454224379098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307031, "dur": 13, "args": { "External id": 188286, "cbid": 211, "correlation": 188286 } }, { "ph": "s", "id": 188286, "pid": 76337, "tid": -914061504, "ts": 1716454224307031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224379128, "dur": 31, "args": { "External id": 188306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188306, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 188306, "pid": 5, "tid": 7, "ts": 1716454224379128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307098, "dur": 11, "args": { "External id": 188306, "cbid": 211, "correlation": 188306 } }, { "ph": "s", "id": 188306, "pid": 76337, "tid": -914061504, "ts": 1716454224307098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224379160, "dur": 4, "args": { "External id": 188318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188318, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 188318, "pid": 5, "tid": 7, "ts": 1716454224379160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307120, "dur": 6, "args": { "External id": 188318, "cbid": 211, "correlation": 188318 } }, { "ph": "s", "id": 188318, "pid": 76337, "tid": -914061504, "ts": 1716454224307120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224379165, "dur": 29, "args": { "External id": 188321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188321, "pid": 5, "tid": 7, "ts": 1716454224379165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307139, "dur": 6, "args": { "External id": 188321, "cbid": 211, "correlation": 188321 } }, { "ph": "s", "id": 188321, "pid": 76337, "tid": -914061504, "ts": 1716454224307139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224379196, "dur": 20, "args": { "External id": 188330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188330, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188330, "pid": 5, "tid": 7, "ts": 1716454224379196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307179, "dur": 10, "args": { "External id": 188330, "cbid": 211, "correlation": 188330 } }, { "ph": "s", "id": 188330, "pid": 76337, "tid": -914061504, "ts": 1716454224307179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224307242, "dur": 0, "args": { "External id": 188340, "cbid": 317, "correlation": 188340 } }, { "ph": "f", "id": 188340, "pid": 76337, "tid": -914061504, "ts": 1716454224307242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224307243, "dur": 0, "args": { "External id": 188341, "cbid": 203, "correlation": 188341 } }, { "ph": "f", "id": 188341, "pid": 76337, "tid": -914061504, "ts": 1716454224307243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224307243, "dur": 0, "args": { "External id": 188342, "cbid": 205, "correlation": 188342 } }, { "ph": "f", "id": 188342, "pid": 76337, "tid": -914061504, "ts": 1716454224307243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224379217, "dur": 23, "args": { "External id": 188346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188346, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188346, "pid": 5, "tid": 7, "ts": 1716454224379217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307257, "dur": 13, "args": { "External id": 188346, "cbid": 211, "correlation": 188346 } }, { "ph": "s", "id": 188346, "pid": 76337, "tid": -914061504, "ts": 1716454224307257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224379242, "dur": 44, "args": { "External id": 188348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188348, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188348, "pid": 5, "tid": 7, "ts": 1716454224379242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307273, "dur": 5, "args": { "External id": 188348, "cbid": 211, "correlation": 188348 } }, { "ph": "s", "id": 188348, "pid": 76337, "tid": -914061504, "ts": 1716454224307273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224379287, "dur": 637, "args": { "External id": 188350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188350, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188350, "pid": 5, "tid": 7, "ts": 1716454224379287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307284, "dur": 6, "args": { "External id": 188350, "cbid": 211, "correlation": 188350 } }, { "ph": "s", "id": 188350, "pid": 76337, "tid": -914061504, "ts": 1716454224307284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224379925, "dur": 21, "args": { "External id": 188352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188352, "pid": 5, "tid": 7, "ts": 1716454224379925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307293, "dur": 5, "args": { "External id": 188352, "cbid": 211, "correlation": 188352 } }, { "ph": "s", "id": 188352, "pid": 76337, "tid": -914061504, "ts": 1716454224307293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224379948, "dur": 32, "args": { "External id": 188358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188358, "pid": 5, "tid": 7, "ts": 1716454224379948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307322, "dur": 8, "args": { "External id": 188358, "cbid": 211, "correlation": 188358 } }, { "ph": "s", "id": 188358, "pid": 76337, "tid": -914061504, "ts": 1716454224307322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224307380, "dur": 0, "args": { "External id": 188368, "cbid": 317, "correlation": 188368 } }, { "ph": "f", "id": 188368, "pid": 76337, "tid": -914061504, "ts": 1716454224307380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224307380, "dur": 0, "args": { "External id": 188369, "cbid": 203, "correlation": 188369 } }, { "ph": "f", "id": 188369, "pid": 76337, "tid": -914061504, "ts": 1716454224307380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224307381, "dur": 0, "args": { "External id": 188370, "cbid": 205, "correlation": 188370 } }, { "ph": "f", "id": 188370, "pid": 76337, "tid": -914061504, "ts": 1716454224307381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224379981, "dur": 38, "args": { "External id": 188374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188374, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188374, "pid": 5, "tid": 7, "ts": 1716454224379981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307393, "dur": 12, "args": { "External id": 188374, "cbid": 211, "correlation": 188374 } }, { "ph": "s", "id": 188374, "pid": 76337, "tid": -914061504, "ts": 1716454224307393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224380020, "dur": 186, "args": { "External id": 188376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188376, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188376, "pid": 5, "tid": 7, "ts": 1716454224380020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307411, "dur": 7, "args": { "External id": 188376, "cbid": 211, "correlation": 188376 } }, { "ph": "s", "id": 188376, "pid": 76337, "tid": -914061504, "ts": 1716454224307411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224380207, "dur": 22, "args": { "External id": 188378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188378, "pid": 5, "tid": 7, "ts": 1716454224380207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307421, "dur": 5, "args": { "External id": 188378, "cbid": 211, "correlation": 188378 } }, { "ph": "s", "id": 188378, "pid": 76337, "tid": -914061504, "ts": 1716454224307421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224380231, "dur": 32, "args": { "External id": 188384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188384, "pid": 5, "tid": 7, "ts": 1716454224380231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307447, "dur": 8, "args": { "External id": 188384, "cbid": 211, "correlation": 188384 } }, { "ph": "s", "id": 188384, "pid": 76337, "tid": -914061504, "ts": 1716454224307447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224380265, "dur": 27, "args": { "External id": 188392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188392, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188392, "pid": 5, "tid": 7, "ts": 1716454224380265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307476, "dur": 9, "args": { "External id": 188392, "cbid": 211, "correlation": 188392 } }, { "ph": "s", "id": 188392, "pid": 76337, "tid": -914061504, "ts": 1716454224307476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224380293, "dur": 20, "args": { "External id": 188400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188400, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188400, "pid": 5, "tid": 7, "ts": 1716454224380293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307506, "dur": 8, "args": { "External id": 188400, "cbid": 211, "correlation": 188400 } }, { "ph": "s", "id": 188400, "pid": 76337, "tid": -914061504, "ts": 1716454224307506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224380314, "dur": 30, "args": { "External id": 188420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188420, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 188420, "pid": 5, "tid": 7, "ts": 1716454224380314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307588, "dur": 12, "args": { "External id": 188420, "cbid": 211, "correlation": 188420 } }, { "ph": "s", "id": 188420, "pid": 76337, "tid": -914061504, "ts": 1716454224307588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224380346, "dur": 4, "args": { "External id": 188432, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188432, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 188432, "pid": 5, "tid": 7, "ts": 1716454224380346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307610, "dur": 6, "args": { "External id": 188432, "cbid": 211, "correlation": 188432 } }, { "ph": "s", "id": 188432, "pid": 76337, "tid": -914061504, "ts": 1716454224307610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224380351, "dur": 30, "args": { "External id": 188435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188435, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188435, "pid": 5, "tid": 7, "ts": 1716454224380351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307628, "dur": 6, "args": { "External id": 188435, "cbid": 211, "correlation": 188435 } }, { "ph": "s", "id": 188435, "pid": 76337, "tid": -914061504, "ts": 1716454224307628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224307685, "dur": 0, "args": { "External id": 188446, "cbid": 317, "correlation": 188446 } }, { "ph": "f", "id": 188446, "pid": 76337, "tid": -914061504, "ts": 1716454224307685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224307686, "dur": 0, "args": { "External id": 188447, "cbid": 203, "correlation": 188447 } }, { "ph": "f", "id": 188447, "pid": 76337, "tid": -914061504, "ts": 1716454224307686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224307686, "dur": 0, "args": { "External id": 188448, "cbid": 205, "correlation": 188448 } }, { "ph": "f", "id": 188448, "pid": 76337, "tid": -914061504, "ts": 1716454224307686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224380382, "dur": 22, "args": { "External id": 188452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188452, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188452, "pid": 5, "tid": 7, "ts": 1716454224380382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307699, "dur": 12, "args": { "External id": 188452, "cbid": 211, "correlation": 188452 } }, { "ph": "s", "id": 188452, "pid": 76337, "tid": -914061504, "ts": 1716454224307699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224380406, "dur": 103, "args": { "External id": 188454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188454, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188454, "pid": 5, "tid": 7, "ts": 1716454224380406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307717, "dur": 7, "args": { "External id": 188454, "cbid": 211, "correlation": 188454 } }, { "ph": "s", "id": 188454, "pid": 76337, "tid": -914061504, "ts": 1716454224307717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224380510, "dur": 25, "args": { "External id": 188456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188456, "pid": 5, "tid": 7, "ts": 1716454224380510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307727, "dur": 5, "args": { "External id": 188456, "cbid": 211, "correlation": 188456 } }, { "ph": "s", "id": 188456, "pid": 76337, "tid": -914061504, "ts": 1716454224307727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224380536, "dur": 32, "args": { "External id": 188462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188462, "pid": 5, "tid": 7, "ts": 1716454224380536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307754, "dur": 8, "args": { "External id": 188462, "cbid": 211, "correlation": 188462 } }, { "ph": "s", "id": 188462, "pid": 76337, "tid": -914061504, "ts": 1716454224307754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224380569, "dur": 196, "args": { "External id": 188471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188471, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188471, "pid": 5, "tid": 7, "ts": 1716454224380569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307835, "dur": 13, "args": { "External id": 188471, "cbid": 211, "correlation": 188471 } }, { "ph": "s", "id": 188471, "pid": 76337, "tid": -914061504, "ts": 1716454224307835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224380766, "dur": 64, "args": { "External id": 188493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188493, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188493, "pid": 5, "tid": 7, "ts": 1716454224380766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307890, "dur": 10, "args": { "External id": 188493, "cbid": 211, "correlation": 188493 } }, { "ph": "s", "id": 188493, "pid": 76337, "tid": -914061504, "ts": 1716454224307890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224307988, "dur": 1, "args": { "External id": 188504, "cbid": 251, "correlation": 188504 } }, { "ph": "f", "id": 188504, "pid": 76337, "tid": -914061504, "ts": 1716454224307988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224380831, "dur": 152, "args": { "External id": 188505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188505, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188505, "pid": 5, "tid": 7, "ts": 1716454224380831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224307994, "dur": 14, "args": { "External id": 188505, "cbid": 211, "correlation": 188505 } }, { "ph": "s", "id": 188505, "pid": 76337, "tid": -914061504, "ts": 1716454224307994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308065, "dur": 1, "args": { "External id": 188516, "cbid": 251, "correlation": 188516 } }, { "ph": "f", "id": 188516, "pid": 76337, "tid": -914061504, "ts": 1716454224308065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224380984, "dur": 144, "args": { "External id": 188517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188517, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188517, "pid": 5, "tid": 7, "ts": 1716454224380984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308070, "dur": 12, "args": { "External id": 188517, "cbid": 211, "correlation": 188517 } }, { "ph": "s", "id": 188517, "pid": 76337, "tid": -914061504, "ts": 1716454224308070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308135, "dur": 1, "args": { "External id": 188528, "cbid": 251, "correlation": 188528 } }, { "ph": "f", "id": 188528, "pid": 76337, "tid": -914061504, "ts": 1716454224308135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224381129, "dur": 143, "args": { "External id": 188529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188529, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188529, "pid": 5, "tid": 7, "ts": 1716454224381129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308139, "dur": 11, "args": { "External id": 188529, "cbid": 211, "correlation": 188529 } }, { "ph": "s", "id": 188529, "pid": 76337, "tid": -914061504, "ts": 1716454224308139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224381273, "dur": 1909, "args": { "External id": 188550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188550, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 188550, "pid": 5, "tid": 7, "ts": 1716454224381273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308221, "dur": 13, "args": { "External id": 188550, "cbid": 211, "correlation": 188550 } }, { "ph": "s", "id": 188550, "pid": 76337, "tid": -914061504, "ts": 1716454224308221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308318, "dur": 1, "args": { "External id": 188568, "cbid": 251, "correlation": 188568 } }, { "ph": "f", "id": 188568, "pid": 76337, "tid": -914061504, "ts": 1716454224308318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224383183, "dur": 147, "args": { "External id": 188570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188570, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 188570, "pid": 5, "tid": 7, "ts": 1716454224383183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308324, "dur": 14, "args": { "External id": 188570, "cbid": 211, "correlation": 188570 } }, { "ph": "s", "id": 188570, "pid": 76337, "tid": -914061504, "ts": 1716454224308324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224383332, "dur": 35, "args": { "External id": 188578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188578, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188578, "pid": 5, "tid": 7, "ts": 1716454224383332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308394, "dur": 11, "args": { "External id": 188578, "cbid": 211, "correlation": 188578 } }, { "ph": "s", "id": 188578, "pid": 76337, "tid": -914061504, "ts": 1716454224308394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224383369, "dur": 51, "args": { "External id": 188586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188586, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188586, "pid": 5, "tid": 7, "ts": 1716454224383369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308433, "dur": 9, "args": { "External id": 188586, "cbid": 211, "correlation": 188586 } }, { "ph": "s", "id": 188586, "pid": 76337, "tid": -914061504, "ts": 1716454224308433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224383422, "dur": 29, "args": { "External id": 188597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188597, "pid": 5, "tid": 7, "ts": 1716454224383422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308504, "dur": 12, "args": { "External id": 188597, "cbid": 211, "correlation": 188597 } }, { "ph": "s", "id": 188597, "pid": 76337, "tid": -914061504, "ts": 1716454224308504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224383452, "dur": 34, "args": { "External id": 188619, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188619, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188619, "pid": 5, "tid": 7, "ts": 1716454224383452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308536, "dur": 7, "args": { "External id": 188619, "cbid": 211, "correlation": 188619 } }, { "ph": "s", "id": 188619, "pid": 76337, "tid": -914061504, "ts": 1716454224308536, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308620, "dur": 1, "args": { "External id": 188630, "cbid": 251, "correlation": 188630 } }, { "ph": "f", "id": 188630, "pid": 76337, "tid": -914061504, "ts": 1716454224308620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224383487, "dur": 90, "args": { "External id": 188631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188631, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188631, "pid": 5, "tid": 7, "ts": 1716454224383487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308625, "dur": 14, "args": { "External id": 188631, "cbid": 211, "correlation": 188631 } }, { "ph": "s", "id": 188631, "pid": 76337, "tid": -914061504, "ts": 1716454224308625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308695, "dur": 1, "args": { "External id": 188642, "cbid": 251, "correlation": 188642 } }, { "ph": "f", "id": 188642, "pid": 76337, "tid": -914061504, "ts": 1716454224308695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308699, "dur": 0, "args": { "External id": 188643, "cbid": 251, "correlation": 188643 } }, { "ph": "f", "id": 188643, "pid": 76337, "tid": -914061504, "ts": 1716454224308699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224383578, "dur": 11, "args": { "External id": 188644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188644, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 188644, "pid": 5, "tid": 7, "ts": 1716454224383578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308700, "dur": 12, "args": { "External id": 188644, "cbid": 211, "correlation": 188644 } }, { "ph": "s", "id": 188644, "pid": 76337, "tid": -914061504, "ts": 1716454224308700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224383590, "dur": 5, "args": { "External id": 188646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188646, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 188646, "pid": 5, "tid": 7, "ts": 1716454224383590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308714, "dur": 6, "args": { "External id": 188646, "cbid": 211, "correlation": 188646 } }, { "ph": "s", "id": 188646, "pid": 76337, "tid": -914061504, "ts": 1716454224308714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308772, "dur": 1, "args": { "External id": 188657, "cbid": 251, "correlation": 188657 } }, { "ph": "f", "id": 188657, "pid": 76337, "tid": -914061504, "ts": 1716454224308772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308775, "dur": 0, "args": { "External id": 188658, "cbid": 251, "correlation": 188658 } }, { "ph": "f", "id": 188658, "pid": 76337, "tid": -914061504, "ts": 1716454224308775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224383597, "dur": 7, "args": { "External id": 188659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188659, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 188659, "pid": 5, "tid": 7, "ts": 1716454224383597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308776, "dur": 13, "args": { "External id": 188659, "cbid": 211, "correlation": 188659 } }, { "ph": "s", "id": 188659, "pid": 76337, "tid": -914061504, "ts": 1716454224308776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224383605, "dur": 3, "args": { "External id": 188661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188661, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 188661, "pid": 5, "tid": 7, "ts": 1716454224383605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308791, "dur": 5, "args": { "External id": 188661, "cbid": 211, "correlation": 188661 } }, { "ph": "s", "id": 188661, "pid": 76337, "tid": -914061504, "ts": 1716454224308791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224383609, "dur": 90, "args": { "External id": 188682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188682, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 188682, "pid": 5, "tid": 7, "ts": 1716454224383609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308864, "dur": 12, "args": { "External id": 188682, "cbid": 211, "correlation": 188682 } }, { "ph": "s", "id": 188682, "pid": 76337, "tid": -914061504, "ts": 1716454224308864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224308961, "dur": 1, "args": { "External id": 188700, "cbid": 251, "correlation": 188700 } }, { "ph": "f", "id": 188700, "pid": 76337, "tid": -914061504, "ts": 1716454224308961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224383701, "dur": 97, "args": { "External id": 188702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188702, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188702, "pid": 5, "tid": 7, "ts": 1716454224383701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224308967, "dur": 22, "args": { "External id": 188702, "cbid": 211, "correlation": 188702 } }, { "ph": "s", "id": 188702, "pid": 76337, "tid": -914061504, "ts": 1716454224308967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224383799, "dur": 19, "args": { "External id": 188710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188710, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188710, "pid": 5, "tid": 7, "ts": 1716454224383799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309046, "dur": 12, "args": { "External id": 188710, "cbid": 211, "correlation": 188710 } }, { "ph": "s", "id": 188710, "pid": 76337, "tid": -914061504, "ts": 1716454224309046, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224383819, "dur": 38, "args": { "External id": 188718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188718, "pid": 5, "tid": 7, "ts": 1716454224383819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309088, "dur": 9, "args": { "External id": 188718, "cbid": 211, "correlation": 188718 } }, { "ph": "s", "id": 188718, "pid": 76337, "tid": -914061504, "ts": 1716454224309088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224383858, "dur": 35, "args": { "External id": 188740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188740, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188740, "pid": 5, "tid": 7, "ts": 1716454224383858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309138, "dur": 10, "args": { "External id": 188740, "cbid": 211, "correlation": 188740 } }, { "ph": "s", "id": 188740, "pid": 76337, "tid": -914061504, "ts": 1716454224309138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224309225, "dur": 1, "args": { "External id": 188756, "cbid": 251, "correlation": 188756 } }, { "ph": "f", "id": 188756, "pid": 76337, "tid": -914061504, "ts": 1716454224309225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224309230, "dur": 0, "args": { "External id": 188758, "cbid": 251, "correlation": 188758 } }, { "ph": "f", "id": 188758, "pid": 76337, "tid": -914061504, "ts": 1716454224309230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224383894, "dur": 531, "args": { "External id": 188759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188759, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 188759, "pid": 5, "tid": 7, "ts": 1716454224383894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309234, "dur": 13, "args": { "External id": 188759, "cbid": 211, "correlation": 188759 } }, { "ph": "s", "id": 188759, "pid": 76337, "tid": -914061504, "ts": 1716454224309234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224384427, "dur": 126, "args": { "External id": 188767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188767, "pid": 5, "tid": 7, "ts": 1716454224384427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309299, "dur": 13, "args": { "External id": 188767, "cbid": 211, "correlation": 188767 } }, { "ph": "s", "id": 188767, "pid": 76337, "tid": -914061504, "ts": 1716454224309299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224384554, "dur": 130, "args": { "External id": 188775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188775, "pid": 5, "tid": 7, "ts": 1716454224384554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309329, "dur": 9, "args": { "External id": 188775, "cbid": 211, "correlation": 188775 } }, { "ph": "s", "id": 188775, "pid": 76337, "tid": -914061504, "ts": 1716454224309329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224309406, "dur": 1, "args": { "External id": 188791, "cbid": 251, "correlation": 188791 } }, { "ph": "f", "id": 188791, "pid": 76337, "tid": -914061504, "ts": 1716454224309406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224384685, "dur": 303, "args": { "External id": 188793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188793, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188793, "pid": 5, "tid": 7, "ts": 1716454224384685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309412, "dur": 12, "args": { "External id": 188793, "cbid": 211, "correlation": 188793 } }, { "ph": "s", "id": 188793, "pid": 76337, "tid": -914061504, "ts": 1716454224309412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224384990, "dur": 26, "args": { "External id": 188801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188801, "pid": 5, "tid": 7, "ts": 1716454224384990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309455, "dur": 9, "args": { "External id": 188801, "cbid": 211, "correlation": 188801 } }, { "ph": "s", "id": 188801, "pid": 76337, "tid": -914061504, "ts": 1716454224309455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224385018, "dur": 81, "args": { "External id": 188812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188812, "pid": 5, "tid": 7, "ts": 1716454224385018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309522, "dur": 12, "args": { "External id": 188812, "cbid": 211, "correlation": 188812 } }, { "ph": "s", "id": 188812, "pid": 76337, "tid": -914061504, "ts": 1716454224309522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224309586, "dur": 0, "args": { "External id": 188824, "cbid": 317, "correlation": 188824 } }, { "ph": "f", "id": 188824, "pid": 76337, "tid": -914061504, "ts": 1716454224309586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224309587, "dur": 0, "args": { "External id": 188825, "cbid": 203, "correlation": 188825 } }, { "ph": "f", "id": 188825, "pid": 76337, "tid": -914061504, "ts": 1716454224309587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224309588, "dur": 0, "args": { "External id": 188826, "cbid": 205, "correlation": 188826 } }, { "ph": "f", "id": 188826, "pid": 76337, "tid": -914061504, "ts": 1716454224309588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224385100, "dur": 23, "args": { "External id": 188830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188830, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188830, "pid": 5, "tid": 7, "ts": 1716454224385100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309603, "dur": 12, "args": { "External id": 188830, "cbid": 211, "correlation": 188830 } }, { "ph": "s", "id": 188830, "pid": 76337, "tid": -914061504, "ts": 1716454224309603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224385125, "dur": 120, "args": { "External id": 188832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188832, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188832, "pid": 5, "tid": 7, "ts": 1716454224385125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309621, "dur": 6, "args": { "External id": 188832, "cbid": 211, "correlation": 188832 } }, { "ph": "s", "id": 188832, "pid": 76337, "tid": -914061504, "ts": 1716454224309621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224385246, "dur": 24, "args": { "External id": 188834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188834, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188834, "pid": 5, "tid": 7, "ts": 1716454224385246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309631, "dur": 5, "args": { "External id": 188834, "cbid": 211, "correlation": 188834 } }, { "ph": "s", "id": 188834, "pid": 76337, "tid": -914061504, "ts": 1716454224309631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224385271, "dur": 33, "args": { "External id": 188840, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188840, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188840, "pid": 5, "tid": 7, "ts": 1716454224385271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309660, "dur": 8, "args": { "External id": 188840, "cbid": 211, "correlation": 188840 } }, { "ph": "s", "id": 188840, "pid": 76337, "tid": -914061504, "ts": 1716454224309660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224385305, "dur": 27, "args": { "External id": 188848, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188848, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188848, "pid": 5, "tid": 7, "ts": 1716454224385305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309692, "dur": 8, "args": { "External id": 188848, "cbid": 211, "correlation": 188848 } }, { "ph": "s", "id": 188848, "pid": 76337, "tid": -914061504, "ts": 1716454224309692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224385333, "dur": 45, "args": { "External id": 188857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188857, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188857, "pid": 5, "tid": 7, "ts": 1716454224385333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309728, "dur": 10, "args": { "External id": 188857, "cbid": 211, "correlation": 188857 } }, { "ph": "s", "id": 188857, "pid": 76337, "tid": -914061504, "ts": 1716454224309728, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224385379, "dur": 42, "args": { "External id": 188877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188877, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 188877, "pid": 5, "tid": 7, "ts": 1716454224385379, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309800, "dur": 11, "args": { "External id": 188877, "cbid": 211, "correlation": 188877 } }, { "ph": "s", "id": 188877, "pid": 76337, "tid": -914061504, "ts": 1716454224309800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224385423, "dur": 5, "args": { "External id": 188889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188889, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 188889, "pid": 5, "tid": 7, "ts": 1716454224385423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309821, "dur": 6, "args": { "External id": 188889, "cbid": 211, "correlation": 188889 } }, { "ph": "s", "id": 188889, "pid": 76337, "tid": -914061504, "ts": 1716454224309821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224385429, "dur": 43, "args": { "External id": 188892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188892, "pid": 5, "tid": 7, "ts": 1716454224385429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309840, "dur": 7, "args": { "External id": 188892, "cbid": 211, "correlation": 188892 } }, { "ph": "s", "id": 188892, "pid": 76337, "tid": -914061504, "ts": 1716454224309840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224385473, "dur": 31, "args": { "External id": 188901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188901, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188901, "pid": 5, "tid": 7, "ts": 1716454224385473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309879, "dur": 10, "args": { "External id": 188901, "cbid": 211, "correlation": 188901 } }, { "ph": "s", "id": 188901, "pid": 76337, "tid": -914061504, "ts": 1716454224309879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224309931, "dur": 0, "args": { "External id": 188911, "cbid": 317, "correlation": 188911 } }, { "ph": "f", "id": 188911, "pid": 76337, "tid": -914061504, "ts": 1716454224309931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224309932, "dur": 0, "args": { "External id": 188912, "cbid": 203, "correlation": 188912 } }, { "ph": "f", "id": 188912, "pid": 76337, "tid": -914061504, "ts": 1716454224309932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224309932, "dur": 0, "args": { "External id": 188913, "cbid": 205, "correlation": 188913 } }, { "ph": "f", "id": 188913, "pid": 76337, "tid": -914061504, "ts": 1716454224309932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224385505, "dur": 31, "args": { "External id": 188917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188917, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188917, "pid": 5, "tid": 7, "ts": 1716454224385505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309948, "dur": 11, "args": { "External id": 188917, "cbid": 211, "correlation": 188917 } }, { "ph": "s", "id": 188917, "pid": 76337, "tid": -914061504, "ts": 1716454224309948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224385538, "dur": 63, "args": { "External id": 188919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188919, "pid": 5, "tid": 7, "ts": 1716454224385538, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309961, "dur": 5, "args": { "External id": 188919, "cbid": 211, "correlation": 188919 } }, { "ph": "s", "id": 188919, "pid": 76337, "tid": -914061504, "ts": 1716454224309961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224385603, "dur": 966, "args": { "External id": 188921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188921, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 188921, "pid": 5, "tid": 7, "ts": 1716454224385603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309972, "dur": 15, "args": { "External id": 188921, "cbid": 211, "correlation": 188921 } }, { "ph": "s", "id": 188921, "pid": 76337, "tid": -914061504, "ts": 1716454224309972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224386570, "dur": 21, "args": { "External id": 188923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188923, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188923, "pid": 5, "tid": 7, "ts": 1716454224386570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224309991, "dur": 5, "args": { "External id": 188923, "cbid": 211, "correlation": 188923 } }, { "ph": "s", "id": 188923, "pid": 76337, "tid": -914061504, "ts": 1716454224309991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224386592, "dur": 33, "args": { "External id": 188929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188929, "pid": 5, "tid": 7, "ts": 1716454224386592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310019, "dur": 8, "args": { "External id": 188929, "cbid": 211, "correlation": 188929 } }, { "ph": "s", "id": 188929, "pid": 76337, "tid": -914061504, "ts": 1716454224310019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224386626, "dur": 3, "args": { "External id": 188937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188937, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 188937, "pid": 5, "tid": 7, "ts": 1716454224386626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310064, "dur": 9, "args": { "External id": 188937, "cbid": 211, "correlation": 188937 } }, { "ph": "s", "id": 188937, "pid": 76337, "tid": -914061504, "ts": 1716454224310064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224310129, "dur": 1, "args": { "External id": 188953, "cbid": 251, "correlation": 188953 } }, { "ph": "f", "id": 188953, "pid": 76337, "tid": -914061504, "ts": 1716454224310129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224310134, "dur": 0, "args": { "External id": 188955, "cbid": 251, "correlation": 188955 } }, { "ph": "f", "id": 188955, "pid": 76337, "tid": -914061504, "ts": 1716454224310134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224386631, "dur": 12, "args": { "External id": 188956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188956, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 188956, "pid": 5, "tid": 7, "ts": 1716454224386631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310135, "dur": 12, "args": { "External id": 188956, "cbid": 211, "correlation": 188956 } }, { "ph": "s", "id": 188956, "pid": 76337, "tid": -914061504, "ts": 1716454224310135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224386644, "dur": 5, "args": { "External id": 188958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188958, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 188958, "pid": 5, "tid": 7, "ts": 1716454224386644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310149, "dur": 5, "args": { "External id": 188958, "cbid": 211, "correlation": 188958 } }, { "ph": "s", "id": 188958, "pid": 76337, "tid": -914061504, "ts": 1716454224310149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224386651, "dur": 29, "args": { "External id": 188968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188968, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 188968, "pid": 5, "tid": 7, "ts": 1716454224386651, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310206, "dur": 12, "args": { "External id": 188968, "cbid": 211, "correlation": 188968 } }, { "ph": "s", "id": 188968, "pid": 76337, "tid": -914061504, "ts": 1716454224310206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224386680, "dur": 31, "args": { "External id": 188988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 188988, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 188988, "pid": 5, "tid": 7, "ts": 1716454224386680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310272, "dur": 10, "args": { "External id": 188988, "cbid": 211, "correlation": 188988 } }, { "ph": "s", "id": 188988, "pid": 76337, "tid": -914061504, "ts": 1716454224310272, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224386713, "dur": 4, "args": { "External id": 189000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189000, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 189000, "pid": 5, "tid": 7, "ts": 1716454224386713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310292, "dur": 7, "args": { "External id": 189000, "cbid": 211, "correlation": 189000 } }, { "ph": "s", "id": 189000, "pid": 76337, "tid": -914061504, "ts": 1716454224310292, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224386718, "dur": 30, "args": { "External id": 189003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189003, "pid": 5, "tid": 7, "ts": 1716454224386718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310312, "dur": 6, "args": { "External id": 189003, "cbid": 211, "correlation": 189003 } }, { "ph": "s", "id": 189003, "pid": 76337, "tid": -914061504, "ts": 1716454224310312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224386750, "dur": 20, "args": { "External id": 189012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189012, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189012, "pid": 5, "tid": 7, "ts": 1716454224386750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310352, "dur": 9, "args": { "External id": 189012, "cbid": 211, "correlation": 189012 } }, { "ph": "s", "id": 189012, "pid": 76337, "tid": -914061504, "ts": 1716454224310352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224310414, "dur": 0, "args": { "External id": 189022, "cbid": 317, "correlation": 189022 } }, { "ph": "f", "id": 189022, "pid": 76337, "tid": -914061504, "ts": 1716454224310414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224310414, "dur": 0, "args": { "External id": 189023, "cbid": 203, "correlation": 189023 } }, { "ph": "f", "id": 189023, "pid": 76337, "tid": -914061504, "ts": 1716454224310414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224310415, "dur": 0, "args": { "External id": 189024, "cbid": 205, "correlation": 189024 } }, { "ph": "f", "id": 189024, "pid": 76337, "tid": -914061504, "ts": 1716454224310415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224386771, "dur": 22, "args": { "External id": 189028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189028, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189028, "pid": 5, "tid": 7, "ts": 1716454224386771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310429, "dur": 12, "args": { "External id": 189028, "cbid": 211, "correlation": 189028 } }, { "ph": "s", "id": 189028, "pid": 76337, "tid": -914061504, "ts": 1716454224310429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224386794, "dur": 44, "args": { "External id": 189030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189030, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189030, "pid": 5, "tid": 7, "ts": 1716454224386794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310444, "dur": 6, "args": { "External id": 189030, "cbid": 211, "correlation": 189030 } }, { "ph": "s", "id": 189030, "pid": 76337, "tid": -914061504, "ts": 1716454224310444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224386840, "dur": 646, "args": { "External id": 189032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189032, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189032, "pid": 5, "tid": 7, "ts": 1716454224386840, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310456, "dur": 6, "args": { "External id": 189032, "cbid": 211, "correlation": 189032 } }, { "ph": "s", "id": 189032, "pid": 76337, "tid": -914061504, "ts": 1716454224310456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224387487, "dur": 22, "args": { "External id": 189034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189034, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189034, "pid": 5, "tid": 7, "ts": 1716454224387487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310465, "dur": 5, "args": { "External id": 189034, "cbid": 211, "correlation": 189034 } }, { "ph": "s", "id": 189034, "pid": 76337, "tid": -914061504, "ts": 1716454224310465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224387510, "dur": 33, "args": { "External id": 189040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189040, "pid": 5, "tid": 7, "ts": 1716454224387510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310493, "dur": 9, "args": { "External id": 189040, "cbid": 211, "correlation": 189040 } }, { "ph": "s", "id": 189040, "pid": 76337, "tid": -914061504, "ts": 1716454224310493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224310551, "dur": 0, "args": { "External id": 189050, "cbid": 317, "correlation": 189050 } }, { "ph": "f", "id": 189050, "pid": 76337, "tid": -914061504, "ts": 1716454224310551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224310552, "dur": 0, "args": { "External id": 189051, "cbid": 203, "correlation": 189051 } }, { "ph": "f", "id": 189051, "pid": 76337, "tid": -914061504, "ts": 1716454224310552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224310553, "dur": 0, "args": { "External id": 189052, "cbid": 205, "correlation": 189052 } }, { "ph": "f", "id": 189052, "pid": 76337, "tid": -914061504, "ts": 1716454224310553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224387544, "dur": 30, "args": { "External id": 189056, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189056, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189056, "pid": 5, "tid": 7, "ts": 1716454224387544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310566, "dur": 12, "args": { "External id": 189056, "cbid": 211, "correlation": 189056 } }, { "ph": "s", "id": 189056, "pid": 76337, "tid": -914061504, "ts": 1716454224310566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224387575, "dur": 153, "args": { "External id": 189058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189058, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189058, "pid": 5, "tid": 7, "ts": 1716454224387575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310584, "dur": 7, "args": { "External id": 189058, "cbid": 211, "correlation": 189058 } }, { "ph": "s", "id": 189058, "pid": 76337, "tid": -914061504, "ts": 1716454224310584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224387730, "dur": 25, "args": { "External id": 189060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189060, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189060, "pid": 5, "tid": 7, "ts": 1716454224387730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310596, "dur": 5, "args": { "External id": 189060, "cbid": 211, "correlation": 189060 } }, { "ph": "s", "id": 189060, "pid": 76337, "tid": -914061504, "ts": 1716454224310596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224387756, "dur": 32, "args": { "External id": 189066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189066, "pid": 5, "tid": 7, "ts": 1716454224387756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310621, "dur": 8, "args": { "External id": 189066, "cbid": 211, "correlation": 189066 } }, { "ph": "s", "id": 189066, "pid": 76337, "tid": -914061504, "ts": 1716454224310621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224387789, "dur": 27, "args": { "External id": 189074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189074, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189074, "pid": 5, "tid": 7, "ts": 1716454224387789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310650, "dur": 8, "args": { "External id": 189074, "cbid": 211, "correlation": 189074 } }, { "ph": "s", "id": 189074, "pid": 76337, "tid": -914061504, "ts": 1716454224310650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224387817, "dur": 20, "args": { "External id": 189082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189082, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189082, "pid": 5, "tid": 7, "ts": 1716454224387817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310678, "dur": 8, "args": { "External id": 189082, "cbid": 211, "correlation": 189082 } }, { "ph": "s", "id": 189082, "pid": 76337, "tid": -914061504, "ts": 1716454224310678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224387839, "dur": 31, "args": { "External id": 189102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189102, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 189102, "pid": 5, "tid": 7, "ts": 1716454224387839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310761, "dur": 12, "args": { "External id": 189102, "cbid": 211, "correlation": 189102 } }, { "ph": "s", "id": 189102, "pid": 76337, "tid": -914061504, "ts": 1716454224310761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224387871, "dur": 5, "args": { "External id": 189114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189114, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 189114, "pid": 5, "tid": 7, "ts": 1716454224387871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310782, "dur": 6, "args": { "External id": 189114, "cbid": 211, "correlation": 189114 } }, { "ph": "s", "id": 189114, "pid": 76337, "tid": -914061504, "ts": 1716454224310782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224387877, "dur": 31, "args": { "External id": 189117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189117, "pid": 5, "tid": 7, "ts": 1716454224387877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310801, "dur": 6, "args": { "External id": 189117, "cbid": 211, "correlation": 189117 } }, { "ph": "s", "id": 189117, "pid": 76337, "tid": -914061504, "ts": 1716454224310801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224310857, "dur": 0, "args": { "External id": 189128, "cbid": 317, "correlation": 189128 } }, { "ph": "f", "id": 189128, "pid": 76337, "tid": -914061504, "ts": 1716454224310857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224310858, "dur": 0, "args": { "External id": 189129, "cbid": 203, "correlation": 189129 } }, { "ph": "f", "id": 189129, "pid": 76337, "tid": -914061504, "ts": 1716454224310858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224310859, "dur": 0, "args": { "External id": 189130, "cbid": 205, "correlation": 189130 } }, { "ph": "f", "id": 189130, "pid": 76337, "tid": -914061504, "ts": 1716454224310859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224387909, "dur": 22, "args": { "External id": 189134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189134, "pid": 5, "tid": 7, "ts": 1716454224387909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310872, "dur": 12, "args": { "External id": 189134, "cbid": 211, "correlation": 189134 } }, { "ph": "s", "id": 189134, "pid": 76337, "tid": -914061504, "ts": 1716454224310872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224387933, "dur": 106, "args": { "External id": 189136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189136, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189136, "pid": 5, "tid": 7, "ts": 1716454224387933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310889, "dur": 6, "args": { "External id": 189136, "cbid": 211, "correlation": 189136 } }, { "ph": "s", "id": 189136, "pid": 76337, "tid": -914061504, "ts": 1716454224310889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224388040, "dur": 24, "args": { "External id": 189138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189138, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189138, "pid": 5, "tid": 7, "ts": 1716454224388040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310900, "dur": 5, "args": { "External id": 189138, "cbid": 211, "correlation": 189138 } }, { "ph": "s", "id": 189138, "pid": 76337, "tid": -914061504, "ts": 1716454224310900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224388065, "dur": 33, "args": { "External id": 189144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189144, "pid": 5, "tid": 7, "ts": 1716454224388065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224310926, "dur": 8, "args": { "External id": 189144, "cbid": 211, "correlation": 189144 } }, { "ph": "s", "id": 189144, "pid": 76337, "tid": -914061504, "ts": 1716454224310926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224388099, "dur": 199, "args": { "External id": 189153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189153, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189153, "pid": 5, "tid": 7, "ts": 1716454224388099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311015, "dur": 14, "args": { "External id": 189153, "cbid": 211, "correlation": 189153 } }, { "ph": "s", "id": 189153, "pid": 76337, "tid": -914061504, "ts": 1716454224311015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224388299, "dur": 66, "args": { "External id": 189175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189175, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189175, "pid": 5, "tid": 7, "ts": 1716454224388299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311073, "dur": 10, "args": { "External id": 189175, "cbid": 211, "correlation": 189175 } }, { "ph": "s", "id": 189175, "pid": 76337, "tid": -914061504, "ts": 1716454224311073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311160, "dur": 1, "args": { "External id": 189186, "cbid": 251, "correlation": 189186 } }, { "ph": "f", "id": 189186, "pid": 76337, "tid": -914061504, "ts": 1716454224311160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224388366, "dur": 151, "args": { "External id": 189187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189187, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189187, "pid": 5, "tid": 7, "ts": 1716454224388366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311165, "dur": 13, "args": { "External id": 189187, "cbid": 211, "correlation": 189187 } }, { "ph": "s", "id": 189187, "pid": 76337, "tid": -914061504, "ts": 1716454224311165, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311235, "dur": 1, "args": { "External id": 189198, "cbid": 251, "correlation": 189198 } }, { "ph": "f", "id": 189198, "pid": 76337, "tid": -914061504, "ts": 1716454224311235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224388519, "dur": 145, "args": { "External id": 189199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189199, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189199, "pid": 5, "tid": 7, "ts": 1716454224388519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311239, "dur": 12, "args": { "External id": 189199, "cbid": 211, "correlation": 189199 } }, { "ph": "s", "id": 189199, "pid": 76337, "tid": -914061504, "ts": 1716454224311239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311306, "dur": 1, "args": { "External id": 189210, "cbid": 251, "correlation": 189210 } }, { "ph": "f", "id": 189210, "pid": 76337, "tid": -914061504, "ts": 1716454224311306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224388665, "dur": 145, "args": { "External id": 189211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189211, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189211, "pid": 5, "tid": 7, "ts": 1716454224388665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311310, "dur": 11, "args": { "External id": 189211, "cbid": 211, "correlation": 189211 } }, { "ph": "s", "id": 189211, "pid": 76337, "tid": -914061504, "ts": 1716454224311310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224388812, "dur": 1944, "args": { "External id": 189232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189232, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 189232, "pid": 5, "tid": 7, "ts": 1716454224388812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311389, "dur": 13, "args": { "External id": 189232, "cbid": 211, "correlation": 189232 } }, { "ph": "s", "id": 189232, "pid": 76337, "tid": -914061504, "ts": 1716454224311389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311486, "dur": 1, "args": { "External id": 189250, "cbid": 251, "correlation": 189250 } }, { "ph": "f", "id": 189250, "pid": 76337, "tid": -914061504, "ts": 1716454224311486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224390757, "dur": 149, "args": { "External id": 189252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189252, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 189252, "pid": 5, "tid": 7, "ts": 1716454224390757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311492, "dur": 14, "args": { "External id": 189252, "cbid": 211, "correlation": 189252 } }, { "ph": "s", "id": 189252, "pid": 76337, "tid": -914061504, "ts": 1716454224311492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224390908, "dur": 35, "args": { "External id": 189260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189260, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189260, "pid": 5, "tid": 7, "ts": 1716454224390908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311561, "dur": 12, "args": { "External id": 189260, "cbid": 211, "correlation": 189260 } }, { "ph": "s", "id": 189260, "pid": 76337, "tid": -914061504, "ts": 1716454224311561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224390945, "dur": 52, "args": { "External id": 189268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189268, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189268, "pid": 5, "tid": 7, "ts": 1716454224390945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311600, "dur": 9, "args": { "External id": 189268, "cbid": 211, "correlation": 189268 } }, { "ph": "s", "id": 189268, "pid": 76337, "tid": -914061504, "ts": 1716454224311600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224390997, "dur": 31, "args": { "External id": 189279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189279, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189279, "pid": 5, "tid": 7, "ts": 1716454224390997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311670, "dur": 12, "args": { "External id": 189279, "cbid": 211, "correlation": 189279 } }, { "ph": "s", "id": 189279, "pid": 76337, "tid": -914061504, "ts": 1716454224311670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224391029, "dur": 34, "args": { "External id": 189301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189301, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189301, "pid": 5, "tid": 7, "ts": 1716454224391029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311701, "dur": 8, "args": { "External id": 189301, "cbid": 211, "correlation": 189301 } }, { "ph": "s", "id": 189301, "pid": 76337, "tid": -914061504, "ts": 1716454224311701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311784, "dur": 1, "args": { "External id": 189312, "cbid": 251, "correlation": 189312 } }, { "ph": "f", "id": 189312, "pid": 76337, "tid": -914061504, "ts": 1716454224311784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224391065, "dur": 91, "args": { "External id": 189313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189313, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189313, "pid": 5, "tid": 7, "ts": 1716454224391065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311790, "dur": 14, "args": { "External id": 189313, "cbid": 211, "correlation": 189313 } }, { "ph": "s", "id": 189313, "pid": 76337, "tid": -914061504, "ts": 1716454224311790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311858, "dur": 1, "args": { "External id": 189324, "cbid": 251, "correlation": 189324 } }, { "ph": "f", "id": 189324, "pid": 76337, "tid": -914061504, "ts": 1716454224311858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311862, "dur": 0, "args": { "External id": 189325, "cbid": 251, "correlation": 189325 } }, { "ph": "f", "id": 189325, "pid": 76337, "tid": -914061504, "ts": 1716454224311862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224391158, "dur": 11, "args": { "External id": 189326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189326, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 189326, "pid": 5, "tid": 7, "ts": 1716454224391158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311863, "dur": 12, "args": { "External id": 189326, "cbid": 211, "correlation": 189326 } }, { "ph": "s", "id": 189326, "pid": 76337, "tid": -914061504, "ts": 1716454224311863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224391170, "dur": 5, "args": { "External id": 189328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189328, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 189328, "pid": 5, "tid": 7, "ts": 1716454224391170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311877, "dur": 6, "args": { "External id": 189328, "cbid": 211, "correlation": 189328 } }, { "ph": "s", "id": 189328, "pid": 76337, "tid": -914061504, "ts": 1716454224311877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311934, "dur": 1, "args": { "External id": 189339, "cbid": 251, "correlation": 189339 } }, { "ph": "f", "id": 189339, "pid": 76337, "tid": -914061504, "ts": 1716454224311934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224311937, "dur": 0, "args": { "External id": 189340, "cbid": 251, "correlation": 189340 } }, { "ph": "f", "id": 189340, "pid": 76337, "tid": -914061504, "ts": 1716454224311937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224391177, "dur": 7, "args": { "External id": 189341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189341, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 189341, "pid": 5, "tid": 7, "ts": 1716454224391177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311939, "dur": 12, "args": { "External id": 189341, "cbid": 211, "correlation": 189341 } }, { "ph": "s", "id": 189341, "pid": 76337, "tid": -914061504, "ts": 1716454224311939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224391186, "dur": 4, "args": { "External id": 189343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189343, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 189343, "pid": 5, "tid": 7, "ts": 1716454224391186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224311952, "dur": 7, "args": { "External id": 189343, "cbid": 211, "correlation": 189343 } }, { "ph": "s", "id": 189343, "pid": 76337, "tid": -914061504, "ts": 1716454224311952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224391190, "dur": 92, "args": { "External id": 189364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189364, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 189364, "pid": 5, "tid": 7, "ts": 1716454224391190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312035, "dur": 13, "args": { "External id": 189364, "cbid": 211, "correlation": 189364 } }, { "ph": "s", "id": 189364, "pid": 76337, "tid": -914061504, "ts": 1716454224312035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224312133, "dur": 1, "args": { "External id": 189382, "cbid": 251, "correlation": 189382 } }, { "ph": "f", "id": 189382, "pid": 76337, "tid": -914061504, "ts": 1716454224312133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224391283, "dur": 97, "args": { "External id": 189384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189384, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189384, "pid": 5, "tid": 7, "ts": 1716454224391283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312139, "dur": 13, "args": { "External id": 189384, "cbid": 211, "correlation": 189384 } }, { "ph": "s", "id": 189384, "pid": 76337, "tid": -914061504, "ts": 1716454224312139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224391381, "dur": 19, "args": { "External id": 189392, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189392, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189392, "pid": 5, "tid": 7, "ts": 1716454224391381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312210, "dur": 12, "args": { "External id": 189392, "cbid": 211, "correlation": 189392 } }, { "ph": "s", "id": 189392, "pid": 76337, "tid": -914061504, "ts": 1716454224312210, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224391402, "dur": 38, "args": { "External id": 189400, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189400, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189400, "pid": 5, "tid": 7, "ts": 1716454224391402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312251, "dur": 10, "args": { "External id": 189400, "cbid": 211, "correlation": 189400 } }, { "ph": "s", "id": 189400, "pid": 76337, "tid": -914061504, "ts": 1716454224312251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224391441, "dur": 34, "args": { "External id": 189422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189422, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189422, "pid": 5, "tid": 7, "ts": 1716454224391441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312302, "dur": 10, "args": { "External id": 189422, "cbid": 211, "correlation": 189422 } }, { "ph": "s", "id": 189422, "pid": 76337, "tid": -914061504, "ts": 1716454224312302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224312392, "dur": 1, "args": { "External id": 189438, "cbid": 251, "correlation": 189438 } }, { "ph": "f", "id": 189438, "pid": 76337, "tid": -914061504, "ts": 1716454224312392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224312397, "dur": 0, "args": { "External id": 189440, "cbid": 251, "correlation": 189440 } }, { "ph": "f", "id": 189440, "pid": 76337, "tid": -914061504, "ts": 1716454224312397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224391477, "dur": 538, "args": { "External id": 189441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189441, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 189441, "pid": 5, "tid": 7, "ts": 1716454224391477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312400, "dur": 14, "args": { "External id": 189441, "cbid": 211, "correlation": 189441 } }, { "ph": "s", "id": 189441, "pid": 76337, "tid": -914061504, "ts": 1716454224312400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224392016, "dur": 124, "args": { "External id": 189449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189449, "pid": 5, "tid": 7, "ts": 1716454224392016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312465, "dur": 12, "args": { "External id": 189449, "cbid": 211, "correlation": 189449 } }, { "ph": "s", "id": 189449, "pid": 76337, "tid": -914061504, "ts": 1716454224312465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224392141, "dur": 129, "args": { "External id": 189457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189457, "pid": 5, "tid": 7, "ts": 1716454224392141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312495, "dur": 8, "args": { "External id": 189457, "cbid": 211, "correlation": 189457 } }, { "ph": "s", "id": 189457, "pid": 76337, "tid": -914061504, "ts": 1716454224312495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224312573, "dur": 1, "args": { "External id": 189473, "cbid": 251, "correlation": 189473 } }, { "ph": "f", "id": 189473, "pid": 76337, "tid": -914061504, "ts": 1716454224312573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224392272, "dur": 309, "args": { "External id": 189475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189475, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189475, "pid": 5, "tid": 7, "ts": 1716454224392272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312578, "dur": 12, "args": { "External id": 189475, "cbid": 211, "correlation": 189475 } }, { "ph": "s", "id": 189475, "pid": 76337, "tid": -914061504, "ts": 1716454224312578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224392583, "dur": 28, "args": { "External id": 189483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189483, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189483, "pid": 5, "tid": 7, "ts": 1716454224392583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312621, "dur": 10, "args": { "External id": 189483, "cbid": 211, "correlation": 189483 } }, { "ph": "s", "id": 189483, "pid": 76337, "tid": -914061504, "ts": 1716454224312621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224392612, "dur": 82, "args": { "External id": 189494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189494, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189494, "pid": 5, "tid": 7, "ts": 1716454224392612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312688, "dur": 12, "args": { "External id": 189494, "cbid": 211, "correlation": 189494 } }, { "ph": "s", "id": 189494, "pid": 76337, "tid": -914061504, "ts": 1716454224312688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224312750, "dur": 0, "args": { "External id": 189506, "cbid": 317, "correlation": 189506 } }, { "ph": "f", "id": 189506, "pid": 76337, "tid": -914061504, "ts": 1716454224312750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224312751, "dur": 0, "args": { "External id": 189507, "cbid": 203, "correlation": 189507 } }, { "ph": "f", "id": 189507, "pid": 76337, "tid": -914061504, "ts": 1716454224312751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224312751, "dur": 0, "args": { "External id": 189508, "cbid": 205, "correlation": 189508 } }, { "ph": "f", "id": 189508, "pid": 76337, "tid": -914061504, "ts": 1716454224312751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224392695, "dur": 23, "args": { "External id": 189512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189512, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189512, "pid": 5, "tid": 7, "ts": 1716454224392695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312766, "dur": 12, "args": { "External id": 189512, "cbid": 211, "correlation": 189512 } }, { "ph": "s", "id": 189512, "pid": 76337, "tid": -914061504, "ts": 1716454224312766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224392719, "dur": 121, "args": { "External id": 189514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189514, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189514, "pid": 5, "tid": 7, "ts": 1716454224392719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312785, "dur": 6, "args": { "External id": 189514, "cbid": 211, "correlation": 189514 } }, { "ph": "s", "id": 189514, "pid": 76337, "tid": -914061504, "ts": 1716454224312785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224392841, "dur": 25, "args": { "External id": 189516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189516, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189516, "pid": 5, "tid": 7, "ts": 1716454224392841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312794, "dur": 5, "args": { "External id": 189516, "cbid": 211, "correlation": 189516 } }, { "ph": "s", "id": 189516, "pid": 76337, "tid": -914061504, "ts": 1716454224312794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224392867, "dur": 33, "args": { "External id": 189522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189522, "pid": 5, "tid": 7, "ts": 1716454224392867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312822, "dur": 8, "args": { "External id": 189522, "cbid": 211, "correlation": 189522 } }, { "ph": "s", "id": 189522, "pid": 76337, "tid": -914061504, "ts": 1716454224312822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224392901, "dur": 27, "args": { "External id": 189530, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189530, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189530, "pid": 5, "tid": 7, "ts": 1716454224392901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312853, "dur": 9, "args": { "External id": 189530, "cbid": 211, "correlation": 189530 } }, { "ph": "s", "id": 189530, "pid": 76337, "tid": -914061504, "ts": 1716454224312853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224392929, "dur": 102, "args": { "External id": 189541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189541, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189541, "pid": 5, "tid": 7, "ts": 1716454224392929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312918, "dur": 11, "args": { "External id": 189541, "cbid": 211, "correlation": 189541 } }, { "ph": "s", "id": 189541, "pid": 76337, "tid": -914061504, "ts": 1716454224312918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224312972, "dur": 7, "args": { "External id": 189551, "cbid": 317, "correlation": 189551 } }, { "ph": "f", "id": 189551, "pid": 76337, "tid": -914061504, "ts": 1716454224312972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224312980, "dur": 0, "args": { "External id": 189552, "cbid": 203, "correlation": 189552 } }, { "ph": "f", "id": 189552, "pid": 76337, "tid": -914061504, "ts": 1716454224312980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224312981, "dur": 0, "args": { "External id": 189553, "cbid": 205, "correlation": 189553 } }, { "ph": "f", "id": 189553, "pid": 76337, "tid": -914061504, "ts": 1716454224312981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224393033, "dur": 73, "args": { "External id": 189557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189557, "pid": 5, "tid": 7, "ts": 1716454224393033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224312995, "dur": 12, "args": { "External id": 189557, "cbid": 211, "correlation": 189557 } }, { "ph": "s", "id": 189557, "pid": 76337, "tid": -914061504, "ts": 1716454224312995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224393107, "dur": 45, "args": { "External id": 189559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189559, "pid": 5, "tid": 7, "ts": 1716454224393107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313010, "dur": 6, "args": { "External id": 189559, "cbid": 211, "correlation": 189559 } }, { "ph": "s", "id": 189559, "pid": 76337, "tid": -914061504, "ts": 1716454224313010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224393153, "dur": 4, "args": { "External id": 189561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189561, "pid": 5, "tid": 7, "ts": 1716454224393153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313021, "dur": 6, "args": { "External id": 189561, "cbid": 211, "correlation": 189561 } }, { "ph": "s", "id": 189561, "pid": 76337, "tid": -914061504, "ts": 1716454224313021, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224313030, "dur": 0, "args": { "External id": 189562, "cbid": 51, "correlation": 189562 } }, { "ph": "s", "id": 189562, "pid": 76337, "tid": -914061504, "ts": 1716454224313030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224393158, "dur": 2245, "args": { "External id": 189563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189563, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189563, "pid": 5, "tid": 7, "ts": 1716454224393158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313031, "dur": 5, "args": { "External id": 189563, "cbid": 211, "correlation": 189563 } }, { "ph": "s", "id": 189563, "pid": 76337, "tid": -914061504, "ts": 1716454224313031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224395404, "dur": 114, "args": { "External id": 189568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189568, "pid": 5, "tid": 7, "ts": 1716454224395404, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313059, "dur": 9, "args": { "External id": 189568, "cbid": 211, "correlation": 189568 } }, { "ph": "s", "id": 189568, "pid": 76337, "tid": -914061504, "ts": 1716454224313059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224395520, "dur": 167, "args": { "External id": 189577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189577, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189577, "pid": 5, "tid": 7, "ts": 1716454224395520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313152, "dur": 14, "args": { "External id": 189577, "cbid": 211, "correlation": 189577 } }, { "ph": "s", "id": 189577, "pid": 76337, "tid": -914061504, "ts": 1716454224313152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224395688, "dur": 128, "args": { "External id": 189597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189597, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 189597, "pid": 5, "tid": 7, "ts": 1716454224395688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313224, "dur": 11, "args": { "External id": 189597, "cbid": 211, "correlation": 189597 } }, { "ph": "s", "id": 189597, "pid": 76337, "tid": -914061504, "ts": 1716454224313224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224395817, "dur": 4, "args": { "External id": 189609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189609, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 189609, "pid": 5, "tid": 7, "ts": 1716454224395817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313245, "dur": 6, "args": { "External id": 189609, "cbid": 211, "correlation": 189609 } }, { "ph": "s", "id": 189609, "pid": 76337, "tid": -914061504, "ts": 1716454224313245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224395823, "dur": 159, "args": { "External id": 189612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189612, "pid": 5, "tid": 7, "ts": 1716454224395823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313264, "dur": 7, "args": { "External id": 189612, "cbid": 211, "correlation": 189612 } }, { "ph": "s", "id": 189612, "pid": 76337, "tid": -914061504, "ts": 1716454224313264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224395983, "dur": 101, "args": { "External id": 189621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189621, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189621, "pid": 5, "tid": 7, "ts": 1716454224395983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313304, "dur": 11, "args": { "External id": 189621, "cbid": 211, "correlation": 189621 } }, { "ph": "s", "id": 189621, "pid": 76337, "tid": -914061504, "ts": 1716454224313304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224313357, "dur": 0, "args": { "External id": 189631, "cbid": 317, "correlation": 189631 } }, { "ph": "f", "id": 189631, "pid": 76337, "tid": -914061504, "ts": 1716454224313357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224313357, "dur": 0, "args": { "External id": 189632, "cbid": 203, "correlation": 189632 } }, { "ph": "f", "id": 189632, "pid": 76337, "tid": -914061504, "ts": 1716454224313357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224313358, "dur": 0, "args": { "External id": 189633, "cbid": 205, "correlation": 189633 } }, { "ph": "f", "id": 189633, "pid": 76337, "tid": -914061504, "ts": 1716454224313358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224396086, "dur": 112, "args": { "External id": 189637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189637, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189637, "pid": 5, "tid": 7, "ts": 1716454224396086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313374, "dur": 11, "args": { "External id": 189637, "cbid": 211, "correlation": 189637 } }, { "ph": "s", "id": 189637, "pid": 76337, "tid": -914061504, "ts": 1716454224313374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224396199, "dur": 34, "args": { "External id": 189639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189639, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189639, "pid": 5, "tid": 7, "ts": 1716454224396199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313388, "dur": 5, "args": { "External id": 189639, "cbid": 211, "correlation": 189639 } }, { "ph": "s", "id": 189639, "pid": 76337, "tid": -914061504, "ts": 1716454224313388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224396235, "dur": 4, "args": { "External id": 189641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189641, "pid": 5, "tid": 7, "ts": 1716454224396235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313398, "dur": 5, "args": { "External id": 189641, "cbid": 211, "correlation": 189641 } }, { "ph": "s", "id": 189641, "pid": 76337, "tid": -914061504, "ts": 1716454224313398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224313407, "dur": 0, "args": { "External id": 189642, "cbid": 51, "correlation": 189642 } }, { "ph": "s", "id": 189642, "pid": 76337, "tid": -914061504, "ts": 1716454224313407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224396240, "dur": 2008, "args": { "External id": 189643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189643, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189643, "pid": 5, "tid": 7, "ts": 1716454224396240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313408, "dur": 7, "args": { "External id": 189643, "cbid": 211, "correlation": 189643 } }, { "ph": "s", "id": 189643, "pid": 76337, "tid": -914061504, "ts": 1716454224313408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224398249, "dur": 59, "args": { "External id": 189648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189648, "pid": 5, "tid": 7, "ts": 1716454224398249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313438, "dur": 8, "args": { "External id": 189648, "cbid": 211, "correlation": 189648 } }, { "ph": "s", "id": 189648, "pid": 76337, "tid": -914061504, "ts": 1716454224313438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224398310, "dur": 3, "args": { "External id": 189656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189656, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189656, "pid": 5, "tid": 7, "ts": 1716454224398310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313481, "dur": 9, "args": { "External id": 189656, "cbid": 211, "correlation": 189656 } }, { "ph": "s", "id": 189656, "pid": 76337, "tid": -914061504, "ts": 1716454224313481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224313547, "dur": 1, "args": { "External id": 189672, "cbid": 251, "correlation": 189672 } }, { "ph": "f", "id": 189672, "pid": 76337, "tid": -914061504, "ts": 1716454224313547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224313552, "dur": 0, "args": { "External id": 189674, "cbid": 251, "correlation": 189674 } }, { "ph": "f", "id": 189674, "pid": 76337, "tid": -914061504, "ts": 1716454224313552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224398315, "dur": 11, "args": { "External id": 189675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189675, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 189675, "pid": 5, "tid": 7, "ts": 1716454224398315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313554, "dur": 12, "args": { "External id": 189675, "cbid": 211, "correlation": 189675 } }, { "ph": "s", "id": 189675, "pid": 76337, "tid": -914061504, "ts": 1716454224313554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224398327, "dur": 5, "args": { "External id": 189677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189677, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 189677, "pid": 5, "tid": 7, "ts": 1716454224398327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313568, "dur": 5, "args": { "External id": 189677, "cbid": 211, "correlation": 189677 } }, { "ph": "s", "id": 189677, "pid": 76337, "tid": -914061504, "ts": 1716454224313568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224398334, "dur": 55, "args": { "External id": 189687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189687, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189687, "pid": 5, "tid": 7, "ts": 1716454224398334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313625, "dur": 12, "args": { "External id": 189687, "cbid": 211, "correlation": 189687 } }, { "ph": "s", "id": 189687, "pid": 76337, "tid": -914061504, "ts": 1716454224313625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224398390, "dur": 51, "args": { "External id": 189707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189707, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 189707, "pid": 5, "tid": 7, "ts": 1716454224398390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313692, "dur": 11, "args": { "External id": 189707, "cbid": 211, "correlation": 189707 } }, { "ph": "s", "id": 189707, "pid": 76337, "tid": -914061504, "ts": 1716454224313692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224398442, "dur": 4, "args": { "External id": 189719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189719, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189719, "pid": 5, "tid": 7, "ts": 1716454224398442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313712, "dur": 7, "args": { "External id": 189719, "cbid": 211, "correlation": 189719 } }, { "ph": "s", "id": 189719, "pid": 76337, "tid": -914061504, "ts": 1716454224313712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224398447, "dur": 56, "args": { "External id": 189722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189722, "pid": 5, "tid": 7, "ts": 1716454224398447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313732, "dur": 6, "args": { "External id": 189722, "cbid": 211, "correlation": 189722 } }, { "ph": "s", "id": 189722, "pid": 76337, "tid": -914061504, "ts": 1716454224313732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224398504, "dur": 37, "args": { "External id": 189731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189731, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189731, "pid": 5, "tid": 7, "ts": 1716454224398504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313773, "dur": 9, "args": { "External id": 189731, "cbid": 211, "correlation": 189731 } }, { "ph": "s", "id": 189731, "pid": 76337, "tid": -914061504, "ts": 1716454224313773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224313835, "dur": 0, "args": { "External id": 189741, "cbid": 317, "correlation": 189741 } }, { "ph": "f", "id": 189741, "pid": 76337, "tid": -914061504, "ts": 1716454224313835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224313836, "dur": 0, "args": { "External id": 189742, "cbid": 203, "correlation": 189742 } }, { "ph": "f", "id": 189742, "pid": 76337, "tid": -914061504, "ts": 1716454224313836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224313837, "dur": 0, "args": { "External id": 189743, "cbid": 205, "correlation": 189743 } }, { "ph": "f", "id": 189743, "pid": 76337, "tid": -914061504, "ts": 1716454224313837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224398543, "dur": 40, "args": { "External id": 189747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189747, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189747, "pid": 5, "tid": 7, "ts": 1716454224398543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313855, "dur": 12, "args": { "External id": 189747, "cbid": 211, "correlation": 189747 } }, { "ph": "s", "id": 189747, "pid": 76337, "tid": -914061504, "ts": 1716454224313855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224398584, "dur": 14, "args": { "External id": 189749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189749, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189749, "pid": 5, "tid": 7, "ts": 1716454224398584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313870, "dur": 6, "args": { "External id": 189749, "cbid": 211, "correlation": 189749 } }, { "ph": "s", "id": 189749, "pid": 76337, "tid": -914061504, "ts": 1716454224313870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224398600, "dur": 4, "args": { "External id": 189751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189751, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189751, "pid": 5, "tid": 7, "ts": 1716454224398600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313880, "dur": 5, "args": { "External id": 189751, "cbid": 211, "correlation": 189751 } }, { "ph": "s", "id": 189751, "pid": 76337, "tid": -914061504, "ts": 1716454224313880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224313888, "dur": 0, "args": { "External id": 189752, "cbid": 51, "correlation": 189752 } }, { "ph": "s", "id": 189752, "pid": 76337, "tid": -914061504, "ts": 1716454224313888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224398604, "dur": 697, "args": { "External id": 189753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189753, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189753, "pid": 5, "tid": 7, "ts": 1716454224398604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313889, "dur": 5, "args": { "External id": 189753, "cbid": 211, "correlation": 189753 } }, { "ph": "s", "id": 189753, "pid": 76337, "tid": -914061504, "ts": 1716454224313889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224399303, "dur": 60, "args": { "External id": 189758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189758, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189758, "pid": 5, "tid": 7, "ts": 1716454224399303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313916, "dur": 9, "args": { "External id": 189758, "cbid": 211, "correlation": 189758 } }, { "ph": "s", "id": 189758, "pid": 76337, "tid": -914061504, "ts": 1716454224313916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224313981, "dur": 0, "args": { "External id": 189768, "cbid": 317, "correlation": 189768 } }, { "ph": "f", "id": 189768, "pid": 76337, "tid": -914061504, "ts": 1716454224313981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224313982, "dur": 0, "args": { "External id": 189769, "cbid": 203, "correlation": 189769 } }, { "ph": "f", "id": 189769, "pid": 76337, "tid": -914061504, "ts": 1716454224313982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224313983, "dur": 0, "args": { "External id": 189770, "cbid": 205, "correlation": 189770 } }, { "ph": "f", "id": 189770, "pid": 76337, "tid": -914061504, "ts": 1716454224313983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224399364, "dur": 3, "args": { "External id": 189774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189774, "pid": 5, "tid": 7, "ts": 1716454224399364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224313998, "dur": 12, "args": { "External id": 189774, "cbid": 211, "correlation": 189774 } }, { "ph": "s", "id": 189774, "pid": 76337, "tid": -914061504, "ts": 1716454224313998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224314014, "dur": 0, "args": { "External id": 189775, "cbid": 51, "correlation": 189775 } }, { "ph": "s", "id": 189775, "pid": 76337, "tid": -914061504, "ts": 1716454224314014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454224399369, "dur": 265, "args": { "External id": 189776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189776, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189776, "pid": 5, "tid": 7, "ts": 1716454224399369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314015, "dur": 8, "args": { "External id": 189776, "cbid": 211, "correlation": 189776 } }, { "ph": "s", "id": 189776, "pid": 76337, "tid": -914061504, "ts": 1716454224314015, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224399635, "dur": 59, "args": { "External id": 189781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189781, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189781, "pid": 5, "tid": 7, "ts": 1716454224399635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314044, "dur": 8, "args": { "External id": 189781, "cbid": 211, "correlation": 189781 } }, { "ph": "s", "id": 189781, "pid": 76337, "tid": -914061504, "ts": 1716454224314044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224399695, "dur": 51, "args": { "External id": 189789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189789, "pid": 5, "tid": 7, "ts": 1716454224399695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314074, "dur": 8, "args": { "External id": 189789, "cbid": 211, "correlation": 189789 } }, { "ph": "s", "id": 189789, "pid": 76337, "tid": -914061504, "ts": 1716454224314074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224399747, "dur": 35, "args": { "External id": 189797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189797, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189797, "pid": 5, "tid": 7, "ts": 1716454224399747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314102, "dur": 8, "args": { "External id": 189797, "cbid": 211, "correlation": 189797 } }, { "ph": "s", "id": 189797, "pid": 76337, "tid": -914061504, "ts": 1716454224314102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224399783, "dur": 51, "args": { "External id": 189817, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189817, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 189817, "pid": 5, "tid": 7, "ts": 1716454224399783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314184, "dur": 12, "args": { "External id": 189817, "cbid": 211, "correlation": 189817 } }, { "ph": "s", "id": 189817, "pid": 76337, "tid": -914061504, "ts": 1716454224314184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224399836, "dur": 4, "args": { "External id": 189829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189829, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 189829, "pid": 5, "tid": 7, "ts": 1716454224399836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314206, "dur": 6, "args": { "External id": 189829, "cbid": 211, "correlation": 189829 } }, { "ph": "s", "id": 189829, "pid": 76337, "tid": -914061504, "ts": 1716454224314206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224399841, "dur": 55, "args": { "External id": 189832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189832, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189832, "pid": 5, "tid": 7, "ts": 1716454224399841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314224, "dur": 6, "args": { "External id": 189832, "cbid": 211, "correlation": 189832 } }, { "ph": "s", "id": 189832, "pid": 76337, "tid": -914061504, "ts": 1716454224314224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224314280, "dur": 0, "args": { "External id": 189843, "cbid": 317, "correlation": 189843 } }, { "ph": "f", "id": 189843, "pid": 76337, "tid": -914061504, "ts": 1716454224314280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224314281, "dur": 0, "args": { "External id": 189844, "cbid": 203, "correlation": 189844 } }, { "ph": "f", "id": 189844, "pid": 76337, "tid": -914061504, "ts": 1716454224314281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224314282, "dur": 0, "args": { "External id": 189845, "cbid": 205, "correlation": 189845 } }, { "ph": "f", "id": 189845, "pid": 76337, "tid": -914061504, "ts": 1716454224314282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314313, "dur": 2, "args": { "External id": 189849, "cbid": 251, "correlation": 189849 } }, { "ph": "f", "id": 189849, "pid": 76337, "tid": -914061504, "ts": 1716454224314313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314317, "dur": 1, "args": { "External id": 189850, "cbid": 251, "correlation": 189850 } }, { "ph": "f", "id": 189850, "pid": 76337, "tid": -914061504, "ts": 1716454224314317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314318, "dur": 1, "args": { "External id": 189851, "cbid": 251, "correlation": 189851 } }, { "ph": "f", "id": 189851, "pid": 76337, "tid": -914061504, "ts": 1716454224314318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314320, "dur": 1, "args": { "External id": 189852, "cbid": 251, "correlation": 189852 } }, { "ph": "f", "id": 189852, "pid": 76337, "tid": -914061504, "ts": 1716454224314320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314322, "dur": 1, "args": { "External id": 189853, "cbid": 251, "correlation": 189853 } }, { "ph": "f", "id": 189853, "pid": 76337, "tid": -914061504, "ts": 1716454224314322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314324, "dur": 2, "args": { "External id": 189854, "cbid": 251, "correlation": 189854 } }, { "ph": "f", "id": 189854, "pid": 76337, "tid": -914061504, "ts": 1716454224314324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314326, "dur": 1, "args": { "External id": 189855, "cbid": 251, "correlation": 189855 } }, { "ph": "f", "id": 189855, "pid": 76337, "tid": -914061504, "ts": 1716454224314326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314328, "dur": 1, "args": { "External id": 189856, "cbid": 251, "correlation": 189856 } }, { "ph": "f", "id": 189856, "pid": 76337, "tid": -914061504, "ts": 1716454224314328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314331, "dur": 0, "args": { "External id": 189857, "cbid": 251, "correlation": 189857 } }, { "ph": "f", "id": 189857, "pid": 76337, "tid": -914061504, "ts": 1716454224314331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224399897, "dur": 116, "args": { "External id": 189858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189858, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 189858, "pid": 5, "tid": 7, "ts": 1716454224399897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314334, "dur": 13, "args": { "External id": 189858, "cbid": 211, "correlation": 189858 } }, { "ph": "s", "id": 189858, "pid": 76337, "tid": -914061504, "ts": 1716454224314334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224400015, "dur": 60, "args": { "External id": 189864, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189864, "pid": 5, "tid": 7, "ts": 1716454224400015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314370, "dur": 9, "args": { "External id": 189864, "cbid": 211, "correlation": 189864 } }, { "ph": "s", "id": 189864, "pid": 76337, "tid": -914061504, "ts": 1716454224314370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224400076, "dur": 609, "args": { "External id": 189873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189873, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189873, "pid": 5, "tid": 7, "ts": 1716454224400076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314451, "dur": 14, "args": { "External id": 189873, "cbid": 211, "correlation": 189873 } }, { "ph": "s", "id": 189873, "pid": 76337, "tid": -914061504, "ts": 1716454224314451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224400687, "dur": 181, "args": { "External id": 189895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189895, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189895, "pid": 5, "tid": 7, "ts": 1716454224400687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314509, "dur": 11, "args": { "External id": 189895, "cbid": 211, "correlation": 189895 } }, { "ph": "s", "id": 189895, "pid": 76337, "tid": -914061504, "ts": 1716454224314509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314597, "dur": 1, "args": { "External id": 189906, "cbid": 251, "correlation": 189906 } }, { "ph": "f", "id": 189906, "pid": 76337, "tid": -914061504, "ts": 1716454224314597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224400869, "dur": 197, "args": { "External id": 189907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189907, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189907, "pid": 5, "tid": 7, "ts": 1716454224400869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314602, "dur": 14, "args": { "External id": 189907, "cbid": 211, "correlation": 189907 } }, { "ph": "s", "id": 189907, "pid": 76337, "tid": -914061504, "ts": 1716454224314602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314672, "dur": 1, "args": { "External id": 189918, "cbid": 251, "correlation": 189918 } }, { "ph": "f", "id": 189918, "pid": 76337, "tid": -914061504, "ts": 1716454224314672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224401068, "dur": 187, "args": { "External id": 189919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189919, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189919, "pid": 5, "tid": 7, "ts": 1716454224401068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314676, "dur": 12, "args": { "External id": 189919, "cbid": 211, "correlation": 189919 } }, { "ph": "s", "id": 189919, "pid": 76337, "tid": -914061504, "ts": 1716454224314676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314740, "dur": 1, "args": { "External id": 189930, "cbid": 251, "correlation": 189930 } }, { "ph": "f", "id": 189930, "pid": 76337, "tid": -914061504, "ts": 1716454224314740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224401256, "dur": 185, "args": { "External id": 189931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189931, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189931, "pid": 5, "tid": 7, "ts": 1716454224401256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314744, "dur": 12, "args": { "External id": 189931, "cbid": 211, "correlation": 189931 } }, { "ph": "s", "id": 189931, "pid": 76337, "tid": -914061504, "ts": 1716454224314744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224401443, "dur": 18686, "args": { "External id": 189952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189952, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 189952, "pid": 5, "tid": 7, "ts": 1716454224401443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314828, "dur": 13, "args": { "External id": 189952, "cbid": 211, "correlation": 189952 } }, { "ph": "s", "id": 189952, "pid": 76337, "tid": -914061504, "ts": 1716454224314828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224314925, "dur": 1, "args": { "External id": 189970, "cbid": 251, "correlation": 189970 } }, { "ph": "f", "id": 189970, "pid": 76337, "tid": -914061504, "ts": 1716454224314925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224420131, "dur": 203, "args": { "External id": 189972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189972, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 189972, "pid": 5, "tid": 7, "ts": 1716454224420131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224314930, "dur": 13, "args": { "External id": 189972, "cbid": 211, "correlation": 189972 } }, { "ph": "s", "id": 189972, "pid": 76337, "tid": -914061504, "ts": 1716454224314930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224420335, "dur": 67, "args": { "External id": 189980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189980, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189980, "pid": 5, "tid": 7, "ts": 1716454224420335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315008, "dur": 14, "args": { "External id": 189980, "cbid": 211, "correlation": 189980 } }, { "ph": "s", "id": 189980, "pid": 76337, "tid": -914061504, "ts": 1716454224315008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224420403, "dur": 96, "args": { "External id": 189988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189988, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189988, "pid": 5, "tid": 7, "ts": 1716454224420403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315048, "dur": 9, "args": { "External id": 189988, "cbid": 211, "correlation": 189988 } }, { "ph": "s", "id": 189988, "pid": 76337, "tid": -914061504, "ts": 1716454224315048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224420500, "dur": 53, "args": { "External id": 189999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 189999, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 189999, "pid": 5, "tid": 7, "ts": 1716454224420500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315122, "dur": 12, "args": { "External id": 189999, "cbid": 211, "correlation": 189999 } }, { "ph": "s", "id": 189999, "pid": 76337, "tid": -914061504, "ts": 1716454224315122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224420555, "dur": 93, "args": { "External id": 190021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190021, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190021, "pid": 5, "tid": 7, "ts": 1716454224420555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315153, "dur": 8, "args": { "External id": 190021, "cbid": 211, "correlation": 190021 } }, { "ph": "s", "id": 190021, "pid": 76337, "tid": -914061504, "ts": 1716454224315153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315237, "dur": 1, "args": { "External id": 190032, "cbid": 251, "correlation": 190032 } }, { "ph": "f", "id": 190032, "pid": 76337, "tid": -914061504, "ts": 1716454224315237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224420649, "dur": 106, "args": { "External id": 190033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190033, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190033, "pid": 5, "tid": 7, "ts": 1716454224420649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315242, "dur": 13, "args": { "External id": 190033, "cbid": 211, "correlation": 190033 } }, { "ph": "s", "id": 190033, "pid": 76337, "tid": -914061504, "ts": 1716454224315242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315316, "dur": 1, "args": { "External id": 190044, "cbid": 251, "correlation": 190044 } }, { "ph": "f", "id": 190044, "pid": 76337, "tid": -914061504, "ts": 1716454224315316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315320, "dur": 0, "args": { "External id": 190045, "cbid": 251, "correlation": 190045 } }, { "ph": "f", "id": 190045, "pid": 76337, "tid": -914061504, "ts": 1716454224315320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224420756, "dur": 10, "args": { "External id": 190046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190046, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 190046, "pid": 5, "tid": 7, "ts": 1716454224420756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315322, "dur": 14, "args": { "External id": 190046, "cbid": 211, "correlation": 190046 } }, { "ph": "s", "id": 190046, "pid": 76337, "tid": -914061504, "ts": 1716454224315322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224420767, "dur": 5, "args": { "External id": 190048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190048, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190048, "pid": 5, "tid": 7, "ts": 1716454224420767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315340, "dur": 7, "args": { "External id": 190048, "cbid": 211, "correlation": 190048 } }, { "ph": "s", "id": 190048, "pid": 76337, "tid": -914061504, "ts": 1716454224315340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315402, "dur": 1, "args": { "External id": 190059, "cbid": 251, "correlation": 190059 } }, { "ph": "f", "id": 190059, "pid": 76337, "tid": -914061504, "ts": 1716454224315402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315405, "dur": 0, "args": { "External id": 190060, "cbid": 251, "correlation": 190060 } }, { "ph": "f", "id": 190060, "pid": 76337, "tid": -914061504, "ts": 1716454224315405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224420774, "dur": 6, "args": { "External id": 190061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190061, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 190061, "pid": 5, "tid": 7, "ts": 1716454224420774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315407, "dur": 12, "args": { "External id": 190061, "cbid": 211, "correlation": 190061 } }, { "ph": "s", "id": 190061, "pid": 76337, "tid": -914061504, "ts": 1716454224315407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224420781, "dur": 3, "args": { "External id": 190063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190063, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190063, "pid": 5, "tid": 7, "ts": 1716454224420781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315421, "dur": 5, "args": { "External id": 190063, "cbid": 211, "correlation": 190063 } }, { "ph": "s", "id": 190063, "pid": 76337, "tid": -914061504, "ts": 1716454224315421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224420786, "dur": 157, "args": { "External id": 190084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190084, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190084, "pid": 5, "tid": 7, "ts": 1716454224420786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315495, "dur": 13, "args": { "External id": 190084, "cbid": 211, "correlation": 190084 } }, { "ph": "s", "id": 190084, "pid": 76337, "tid": -914061504, "ts": 1716454224315495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315591, "dur": 1, "args": { "External id": 190102, "cbid": 251, "correlation": 190102 } }, { "ph": "f", "id": 190102, "pid": 76337, "tid": -914061504, "ts": 1716454224315591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224420944, "dur": 108, "args": { "External id": 190104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190104, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190104, "pid": 5, "tid": 7, "ts": 1716454224420944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315597, "dur": 14, "args": { "External id": 190104, "cbid": 211, "correlation": 190104 } }, { "ph": "s", "id": 190104, "pid": 76337, "tid": -914061504, "ts": 1716454224315597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224421054, "dur": 35, "args": { "External id": 190112, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190112, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190112, "pid": 5, "tid": 7, "ts": 1716454224421054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315667, "dur": 12, "args": { "External id": 190112, "cbid": 211, "correlation": 190112 } }, { "ph": "s", "id": 190112, "pid": 76337, "tid": -914061504, "ts": 1716454224315667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224421090, "dur": 67, "args": { "External id": 190120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190120, "pid": 5, "tid": 7, "ts": 1716454224421090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315708, "dur": 10, "args": { "External id": 190120, "cbid": 211, "correlation": 190120 } }, { "ph": "s", "id": 190120, "pid": 76337, "tid": -914061504, "ts": 1716454224315708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224421158, "dur": 93, "args": { "External id": 190142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190142, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190142, "pid": 5, "tid": 7, "ts": 1716454224421158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315759, "dur": 10, "args": { "External id": 190142, "cbid": 211, "correlation": 190142 } }, { "ph": "s", "id": 190142, "pid": 76337, "tid": -914061504, "ts": 1716454224315759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224315845, "dur": 1, "args": { "External id": 190158, "cbid": 251, "correlation": 190158 } }, { "ph": "f", "id": 190158, "pid": 76337, "tid": -914061504, "ts": 1716454224315845, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224421253, "dur": 583, "args": { "External id": 190160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190160, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190160, "pid": 5, "tid": 7, "ts": 1716454224421253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315851, "dur": 13, "args": { "External id": 190160, "cbid": 211, "correlation": 190160 } }, { "ph": "s", "id": 190160, "pid": 76337, "tid": -914061504, "ts": 1716454224315851, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224421837, "dur": 245, "args": { "External id": 190168, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190168, "pid": 5, "tid": 7, "ts": 1716454224421837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315916, "dur": 13, "args": { "External id": 190168, "cbid": 211, "correlation": 190168 } }, { "ph": "s", "id": 190168, "pid": 76337, "tid": -914061504, "ts": 1716454224315916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224422084, "dur": 252, "args": { "External id": 190176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190176, "pid": 5, "tid": 7, "ts": 1716454224422084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224315948, "dur": 8, "args": { "External id": 190176, "cbid": 211, "correlation": 190176 } }, { "ph": "s", "id": 190176, "pid": 76337, "tid": -914061504, "ts": 1716454224315948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316041, "dur": 1, "args": { "External id": 190192, "cbid": 251, "correlation": 190192 } }, { "ph": "f", "id": 190192, "pid": 76337, "tid": -914061504, "ts": 1716454224316041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316046, "dur": 0, "args": { "External id": 190194, "cbid": 251, "correlation": 190194 } }, { "ph": "f", "id": 190194, "pid": 76337, "tid": -914061504, "ts": 1716454224316046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224422337, "dur": 359, "args": { "External id": 190195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190195, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 190195, "pid": 5, "tid": 7, "ts": 1716454224422337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316049, "dur": 14, "args": { "External id": 190195, "cbid": 211, "correlation": 190195 } }, { "ph": "s", "id": 190195, "pid": 76337, "tid": -914061504, "ts": 1716454224316049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224422697, "dur": 50, "args": { "External id": 190203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190203, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190203, "pid": 5, "tid": 7, "ts": 1716454224422697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316092, "dur": 10, "args": { "External id": 190203, "cbid": 211, "correlation": 190203 } }, { "ph": "s", "id": 190203, "pid": 76337, "tid": -914061504, "ts": 1716454224316092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224422748, "dur": 159, "args": { "External id": 190214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190214, "pid": 5, "tid": 7, "ts": 1716454224422748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316160, "dur": 13, "args": { "External id": 190214, "cbid": 211, "correlation": 190214 } }, { "ph": "s", "id": 190214, "pid": 76337, "tid": -914061504, "ts": 1716454224316160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224316224, "dur": 0, "args": { "External id": 190226, "cbid": 317, "correlation": 190226 } }, { "ph": "f", "id": 190226, "pid": 76337, "tid": -914061504, "ts": 1716454224316224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224316225, "dur": 0, "args": { "External id": 190227, "cbid": 203, "correlation": 190227 } }, { "ph": "f", "id": 190227, "pid": 76337, "tid": -914061504, "ts": 1716454224316225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224316226, "dur": 0, "args": { "External id": 190228, "cbid": 205, "correlation": 190228 } }, { "ph": "f", "id": 190228, "pid": 76337, "tid": -914061504, "ts": 1716454224316226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316250, "dur": 1, "args": { "External id": 190232, "cbid": 251, "correlation": 190232 } }, { "ph": "f", "id": 190232, "pid": 76337, "tid": -914061504, "ts": 1716454224316250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316252, "dur": 0, "args": { "External id": 190233, "cbid": 251, "correlation": 190233 } }, { "ph": "f", "id": 190233, "pid": 76337, "tid": -914061504, "ts": 1716454224316252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316252, "dur": 0, "args": { "External id": 190234, "cbid": 251, "correlation": 190234 } }, { "ph": "f", "id": 190234, "pid": 76337, "tid": -914061504, "ts": 1716454224316252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316253, "dur": 0, "args": { "External id": 190235, "cbid": 251, "correlation": 190235 } }, { "ph": "f", "id": 190235, "pid": 76337, "tid": -914061504, "ts": 1716454224316253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316254, "dur": 0, "args": { "External id": 190236, "cbid": 251, "correlation": 190236 } }, { "ph": "f", "id": 190236, "pid": 76337, "tid": -914061504, "ts": 1716454224316254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316255, "dur": 0, "args": { "External id": 190237, "cbid": 251, "correlation": 190237 } }, { "ph": "f", "id": 190237, "pid": 76337, "tid": -914061504, "ts": 1716454224316255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316256, "dur": 0, "args": { "External id": 190238, "cbid": 251, "correlation": 190238 } }, { "ph": "f", "id": 190238, "pid": 76337, "tid": -914061504, "ts": 1716454224316256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316257, "dur": 0, "args": { "External id": 190239, "cbid": 251, "correlation": 190239 } }, { "ph": "f", "id": 190239, "pid": 76337, "tid": -914061504, "ts": 1716454224316257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316258, "dur": 0, "args": { "External id": 190240, "cbid": 251, "correlation": 190240 } }, { "ph": "f", "id": 190240, "pid": 76337, "tid": -914061504, "ts": 1716454224316258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224422909, "dur": 115, "args": { "External id": 190241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190241, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190241, "pid": 5, "tid": 7, "ts": 1716454224422909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316261, "dur": 12, "args": { "External id": 190241, "cbid": 211, "correlation": 190241 } }, { "ph": "s", "id": 190241, "pid": 76337, "tid": -914061504, "ts": 1716454224316261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224423025, "dur": 59, "args": { "External id": 190247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190247, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190247, "pid": 5, "tid": 7, "ts": 1716454224423025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316296, "dur": 9, "args": { "External id": 190247, "cbid": 211, "correlation": 190247 } }, { "ph": "s", "id": 190247, "pid": 76337, "tid": -914061504, "ts": 1716454224316296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224423086, "dur": 50, "args": { "External id": 190255, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190255, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190255, "pid": 5, "tid": 7, "ts": 1716454224423086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316327, "dur": 8, "args": { "External id": 190255, "cbid": 211, "correlation": 190255 } }, { "ph": "s", "id": 190255, "pid": 76337, "tid": -914061504, "ts": 1716454224316327, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224423137, "dur": 99, "args": { "External id": 190264, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190264, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190264, "pid": 5, "tid": 7, "ts": 1716454224423137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316367, "dur": 10, "args": { "External id": 190264, "cbid": 211, "correlation": 190264 } }, { "ph": "s", "id": 190264, "pid": 76337, "tid": -914061504, "ts": 1716454224316367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224423237, "dur": 92, "args": { "External id": 190284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190284, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 190284, "pid": 5, "tid": 7, "ts": 1716454224423237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316439, "dur": 11, "args": { "External id": 190284, "cbid": 211, "correlation": 190284 } }, { "ph": "s", "id": 190284, "pid": 76337, "tid": -914061504, "ts": 1716454224316439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224423331, "dur": 5, "args": { "External id": 190296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190296, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190296, "pid": 5, "tid": 7, "ts": 1716454224423331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316460, "dur": 8, "args": { "External id": 190296, "cbid": 211, "correlation": 190296 } }, { "ph": "s", "id": 190296, "pid": 76337, "tid": -914061504, "ts": 1716454224316460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224423336, "dur": 108, "args": { "External id": 190299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190299, "pid": 5, "tid": 7, "ts": 1716454224423336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316480, "dur": 6, "args": { "External id": 190299, "cbid": 211, "correlation": 190299 } }, { "ph": "s", "id": 190299, "pid": 76337, "tid": -914061504, "ts": 1716454224316480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224423446, "dur": 70, "args": { "External id": 190308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190308, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190308, "pid": 5, "tid": 7, "ts": 1716454224423446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316520, "dur": 10, "args": { "External id": 190308, "cbid": 211, "correlation": 190308 } }, { "ph": "s", "id": 190308, "pid": 76337, "tid": -914061504, "ts": 1716454224316520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224316572, "dur": 0, "args": { "External id": 190318, "cbid": 317, "correlation": 190318 } }, { "ph": "f", "id": 190318, "pid": 76337, "tid": -914061504, "ts": 1716454224316572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224316573, "dur": 0, "args": { "External id": 190319, "cbid": 203, "correlation": 190319 } }, { "ph": "f", "id": 190319, "pid": 76337, "tid": -914061504, "ts": 1716454224316573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224316574, "dur": 0, "args": { "External id": 190320, "cbid": 205, "correlation": 190320 } }, { "ph": "f", "id": 190320, "pid": 76337, "tid": -914061504, "ts": 1716454224316574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224423517, "dur": 75, "args": { "External id": 190324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190324, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190324, "pid": 5, "tid": 7, "ts": 1716454224423517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316589, "dur": 11, "args": { "External id": 190324, "cbid": 211, "correlation": 190324 } }, { "ph": "s", "id": 190324, "pid": 76337, "tid": -914061504, "ts": 1716454224316589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224423594, "dur": 24, "args": { "External id": 190326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190326, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190326, "pid": 5, "tid": 7, "ts": 1716454224423594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316603, "dur": 5, "args": { "External id": 190326, "cbid": 211, "correlation": 190326 } }, { "ph": "s", "id": 190326, "pid": 76337, "tid": -914061504, "ts": 1716454224316603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224423620, "dur": 4, "args": { "External id": 190328, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 190328, "pid": 5, "tid": 7, "ts": 1716454224423620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316613, "dur": 6, "args": { "External id": 190328, "cbid": 211, "correlation": 190328 } }, { "ph": "s", "id": 190328, "pid": 76337, "tid": -914061504, "ts": 1716454224316613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224316622, "dur": 0, "args": { "External id": 190329, "cbid": 51, "correlation": 190329 } }, { "ph": "s", "id": 190329, "pid": 76337, "tid": -914061504, "ts": 1716454224316622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224423625, "dur": 1373, "args": { "External id": 190330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190330, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190330, "pid": 5, "tid": 7, "ts": 1716454224423625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316623, "dur": 5, "args": { "External id": 190330, "cbid": 211, "correlation": 190330 } }, { "ph": "s", "id": 190330, "pid": 76337, "tid": -914061504, "ts": 1716454224316623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224425000, "dur": 59, "args": { "External id": 190335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190335, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190335, "pid": 5, "tid": 7, "ts": 1716454224425000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316650, "dur": 8, "args": { "External id": 190335, "cbid": 211, "correlation": 190335 } }, { "ph": "s", "id": 190335, "pid": 76337, "tid": -914061504, "ts": 1716454224316650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224425061, "dur": 3, "args": { "External id": 190343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190343, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 190343, "pid": 5, "tid": 7, "ts": 1716454224425061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316694, "dur": 10, "args": { "External id": 190343, "cbid": 211, "correlation": 190343 } }, { "ph": "s", "id": 190343, "pid": 76337, "tid": -914061504, "ts": 1716454224316694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316760, "dur": 2, "args": { "External id": 190359, "cbid": 251, "correlation": 190359 } }, { "ph": "f", "id": 190359, "pid": 76337, "tid": -914061504, "ts": 1716454224316760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224316766, "dur": 0, "args": { "External id": 190361, "cbid": 251, "correlation": 190361 } }, { "ph": "f", "id": 190361, "pid": 76337, "tid": -914061504, "ts": 1716454224316766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224425065, "dur": 12, "args": { "External id": 190362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190362, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 190362, "pid": 5, "tid": 7, "ts": 1716454224425065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316768, "dur": 11, "args": { "External id": 190362, "cbid": 211, "correlation": 190362 } }, { "ph": "s", "id": 190362, "pid": 76337, "tid": -914061504, "ts": 1716454224316768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224425078, "dur": 5, "args": { "External id": 190364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190364, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190364, "pid": 5, "tid": 7, "ts": 1716454224425078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316781, "dur": 6, "args": { "External id": 190364, "cbid": 211, "correlation": 190364 } }, { "ph": "s", "id": 190364, "pid": 76337, "tid": -914061504, "ts": 1716454224316781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224425085, "dur": 54, "args": { "External id": 190374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190374, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190374, "pid": 5, "tid": 7, "ts": 1716454224425085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316840, "dur": 12, "args": { "External id": 190374, "cbid": 211, "correlation": 190374 } }, { "ph": "s", "id": 190374, "pid": 76337, "tid": -914061504, "ts": 1716454224316840, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224425140, "dur": 51, "args": { "External id": 190394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190394, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 190394, "pid": 5, "tid": 7, "ts": 1716454224425140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316906, "dur": 12, "args": { "External id": 190394, "cbid": 211, "correlation": 190394 } }, { "ph": "s", "id": 190394, "pid": 76337, "tid": -914061504, "ts": 1716454224316906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224425193, "dur": 4, "args": { "External id": 190406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190406, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 190406, "pid": 5, "tid": 7, "ts": 1716454224425193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316927, "dur": 6, "args": { "External id": 190406, "cbid": 211, "correlation": 190406 } }, { "ph": "s", "id": 190406, "pid": 76337, "tid": -914061504, "ts": 1716454224316927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224425198, "dur": 56, "args": { "External id": 190409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190409, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190409, "pid": 5, "tid": 7, "ts": 1716454224425198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316946, "dur": 6, "args": { "External id": 190409, "cbid": 211, "correlation": 190409 } }, { "ph": "s", "id": 190409, "pid": 76337, "tid": -914061504, "ts": 1716454224316946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224425255, "dur": 38, "args": { "External id": 190418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190418, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190418, "pid": 5, "tid": 7, "ts": 1716454224425255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224316995, "dur": 10, "args": { "External id": 190418, "cbid": 211, "correlation": 190418 } }, { "ph": "s", "id": 190418, "pid": 76337, "tid": -914061504, "ts": 1716454224316995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224317059, "dur": 0, "args": { "External id": 190428, "cbid": 317, "correlation": 190428 } }, { "ph": "f", "id": 190428, "pid": 76337, "tid": -914061504, "ts": 1716454224317059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224317060, "dur": 0, "args": { "External id": 190429, "cbid": 203, "correlation": 190429 } }, { "ph": "f", "id": 190429, "pid": 76337, "tid": -914061504, "ts": 1716454224317060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224317061, "dur": 0, "args": { "External id": 190430, "cbid": 205, "correlation": 190430 } }, { "ph": "f", "id": 190430, "pid": 76337, "tid": -914061504, "ts": 1716454224317061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224425294, "dur": 40, "args": { "External id": 190434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190434, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190434, "pid": 5, "tid": 7, "ts": 1716454224425294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317075, "dur": 12, "args": { "External id": 190434, "cbid": 211, "correlation": 190434 } }, { "ph": "s", "id": 190434, "pid": 76337, "tid": -914061504, "ts": 1716454224317075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224425335, "dur": 14, "args": { "External id": 190436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190436, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190436, "pid": 5, "tid": 7, "ts": 1716454224425335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317089, "dur": 5, "args": { "External id": 190436, "cbid": 211, "correlation": 190436 } }, { "ph": "s", "id": 190436, "pid": 76337, "tid": -914061504, "ts": 1716454224317089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224425351, "dur": 3, "args": { "External id": 190438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190438, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 190438, "pid": 5, "tid": 7, "ts": 1716454224425351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317099, "dur": 5, "args": { "External id": 190438, "cbid": 211, "correlation": 190438 } }, { "ph": "s", "id": 190438, "pid": 76337, "tid": -914061504, "ts": 1716454224317099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224317107, "dur": 0, "args": { "External id": 190439, "cbid": 51, "correlation": 190439 } }, { "ph": "s", "id": 190439, "pid": 76337, "tid": -914061504, "ts": 1716454224317107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224425356, "dur": 702, "args": { "External id": 190440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190440, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190440, "pid": 5, "tid": 7, "ts": 1716454224425356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317108, "dur": 5, "args": { "External id": 190440, "cbid": 211, "correlation": 190440 } }, { "ph": "s", "id": 190440, "pid": 76337, "tid": -914061504, "ts": 1716454224317108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224426059, "dur": 61, "args": { "External id": 190445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190445, "pid": 5, "tid": 7, "ts": 1716454224426059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317135, "dur": 8, "args": { "External id": 190445, "cbid": 211, "correlation": 190445 } }, { "ph": "s", "id": 190445, "pid": 76337, "tid": -914061504, "ts": 1716454224317135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224317192, "dur": 0, "args": { "External id": 190455, "cbid": 317, "correlation": 190455 } }, { "ph": "f", "id": 190455, "pid": 76337, "tid": -914061504, "ts": 1716454224317192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224317193, "dur": 0, "args": { "External id": 190456, "cbid": 203, "correlation": 190456 } }, { "ph": "f", "id": 190456, "pid": 76337, "tid": -914061504, "ts": 1716454224317193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224317193, "dur": 0, "args": { "External id": 190457, "cbid": 205, "correlation": 190457 } }, { "ph": "f", "id": 190457, "pid": 76337, "tid": -914061504, "ts": 1716454224317193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224426121, "dur": 74, "args": { "External id": 190461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190461, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190461, "pid": 5, "tid": 7, "ts": 1716454224426121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317206, "dur": 12, "args": { "External id": 190461, "cbid": 211, "correlation": 190461 } }, { "ph": "s", "id": 190461, "pid": 76337, "tid": -914061504, "ts": 1716454224317206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224426196, "dur": 208, "args": { "External id": 190463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190463, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190463, "pid": 5, "tid": 7, "ts": 1716454224426196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317225, "dur": 8, "args": { "External id": 190463, "cbid": 211, "correlation": 190463 } }, { "ph": "s", "id": 190463, "pid": 76337, "tid": -914061504, "ts": 1716454224317225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224426405, "dur": 40, "args": { "External id": 190465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190465, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190465, "pid": 5, "tid": 7, "ts": 1716454224426405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317238, "dur": 202, "args": { "External id": 190465, "cbid": 211, "correlation": 190465 } }, { "ph": "s", "id": 190465, "pid": 76337, "tid": -914061504, "ts": 1716454224317238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224426447, "dur": 59, "args": { "External id": 190471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190471, "pid": 5, "tid": 7, "ts": 1716454224426447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317462, "dur": 9, "args": { "External id": 190471, "cbid": 211, "correlation": 190471 } }, { "ph": "s", "id": 190471, "pid": 76337, "tid": -914061504, "ts": 1716454224317462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224426507, "dur": 49, "args": { "External id": 190479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190479, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190479, "pid": 5, "tid": 7, "ts": 1716454224426507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317493, "dur": 8, "args": { "External id": 190479, "cbid": 211, "correlation": 190479 } }, { "ph": "s", "id": 190479, "pid": 76337, "tid": -914061504, "ts": 1716454224317493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224426558, "dur": 35, "args": { "External id": 190487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190487, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190487, "pid": 5, "tid": 7, "ts": 1716454224426558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317520, "dur": 30, "args": { "External id": 190487, "cbid": 211, "correlation": 190487 } }, { "ph": "s", "id": 190487, "pid": 76337, "tid": -914061504, "ts": 1716454224317520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224426594, "dur": 54, "args": { "External id": 190507, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190507, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 190507, "pid": 5, "tid": 7, "ts": 1716454224426594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317626, "dur": 13, "args": { "External id": 190507, "cbid": 211, "correlation": 190507 } }, { "ph": "s", "id": 190507, "pid": 76337, "tid": -914061504, "ts": 1716454224317626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224426649, "dur": 4, "args": { "External id": 190519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190519, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 190519, "pid": 5, "tid": 7, "ts": 1716454224426649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317650, "dur": 6, "args": { "External id": 190519, "cbid": 211, "correlation": 190519 } }, { "ph": "s", "id": 190519, "pid": 76337, "tid": -914061504, "ts": 1716454224317650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224426655, "dur": 56, "args": { "External id": 190522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190522, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190522, "pid": 5, "tid": 7, "ts": 1716454224426655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317667, "dur": 6, "args": { "External id": 190522, "cbid": 211, "correlation": 190522 } }, { "ph": "s", "id": 190522, "pid": 76337, "tid": -914061504, "ts": 1716454224317667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224317725, "dur": 0, "args": { "External id": 190533, "cbid": 317, "correlation": 190533 } }, { "ph": "f", "id": 190533, "pid": 76337, "tid": -914061504, "ts": 1716454224317725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224317726, "dur": 0, "args": { "External id": 190534, "cbid": 203, "correlation": 190534 } }, { "ph": "f", "id": 190534, "pid": 76337, "tid": -914061504, "ts": 1716454224317726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224317727, "dur": 0, "args": { "External id": 190535, "cbid": 205, "correlation": 190535 } }, { "ph": "f", "id": 190535, "pid": 76337, "tid": -914061504, "ts": 1716454224317727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317750, "dur": 1, "args": { "External id": 190539, "cbid": 251, "correlation": 190539 } }, { "ph": "f", "id": 190539, "pid": 76337, "tid": -914061504, "ts": 1716454224317750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317752, "dur": 0, "args": { "External id": 190540, "cbid": 251, "correlation": 190540 } }, { "ph": "f", "id": 190540, "pid": 76337, "tid": -914061504, "ts": 1716454224317752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317752, "dur": 0, "args": { "External id": 190541, "cbid": 251, "correlation": 190541 } }, { "ph": "f", "id": 190541, "pid": 76337, "tid": -914061504, "ts": 1716454224317752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317753, "dur": 0, "args": { "External id": 190542, "cbid": 251, "correlation": 190542 } }, { "ph": "f", "id": 190542, "pid": 76337, "tid": -914061504, "ts": 1716454224317753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317754, "dur": 0, "args": { "External id": 190543, "cbid": 251, "correlation": 190543 } }, { "ph": "f", "id": 190543, "pid": 76337, "tid": -914061504, "ts": 1716454224317754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317755, "dur": 0, "args": { "External id": 190544, "cbid": 251, "correlation": 190544 } }, { "ph": "f", "id": 190544, "pid": 76337, "tid": -914061504, "ts": 1716454224317755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317756, "dur": 0, "args": { "External id": 190545, "cbid": 251, "correlation": 190545 } }, { "ph": "f", "id": 190545, "pid": 76337, "tid": -914061504, "ts": 1716454224317756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317756, "dur": 0, "args": { "External id": 190546, "cbid": 251, "correlation": 190546 } }, { "ph": "f", "id": 190546, "pid": 76337, "tid": -914061504, "ts": 1716454224317756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224317758, "dur": 0, "args": { "External id": 190547, "cbid": 251, "correlation": 190547 } }, { "ph": "f", "id": 190547, "pid": 76337, "tid": -914061504, "ts": 1716454224317758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224426712, "dur": 113, "args": { "External id": 190548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190548, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190548, "pid": 5, "tid": 7, "ts": 1716454224426712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317760, "dur": 13, "args": { "External id": 190548, "cbid": 211, "correlation": 190548 } }, { "ph": "s", "id": 190548, "pid": 76337, "tid": -914061504, "ts": 1716454224317760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224426826, "dur": 60, "args": { "External id": 190554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190554, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190554, "pid": 5, "tid": 7, "ts": 1716454224426826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317795, "dur": 9, "args": { "External id": 190554, "cbid": 211, "correlation": 190554 } }, { "ph": "s", "id": 190554, "pid": 76337, "tid": -914061504, "ts": 1716454224317795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224426887, "dur": 557, "args": { "External id": 190563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190563, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190563, "pid": 5, "tid": 7, "ts": 1716454224426887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317877, "dur": 15, "args": { "External id": 190563, "cbid": 211, "correlation": 190563 } }, { "ph": "s", "id": 190563, "pid": 76337, "tid": -914061504, "ts": 1716454224317877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224427445, "dur": 182, "args": { "External id": 190585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190585, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190585, "pid": 5, "tid": 7, "ts": 1716454224427445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224317936, "dur": 10, "args": { "External id": 190585, "cbid": 211, "correlation": 190585 } }, { "ph": "s", "id": 190585, "pid": 76337, "tid": -914061504, "ts": 1716454224317936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224318030, "dur": 1, "args": { "External id": 190596, "cbid": 251, "correlation": 190596 } }, { "ph": "f", "id": 190596, "pid": 76337, "tid": -914061504, "ts": 1716454224318030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224427628, "dur": 196, "args": { "External id": 190597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190597, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190597, "pid": 5, "tid": 7, "ts": 1716454224427628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318035, "dur": 15, "args": { "External id": 190597, "cbid": 211, "correlation": 190597 } }, { "ph": "s", "id": 190597, "pid": 76337, "tid": -914061504, "ts": 1716454224318035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224318105, "dur": 1, "args": { "External id": 190608, "cbid": 251, "correlation": 190608 } }, { "ph": "f", "id": 190608, "pid": 76337, "tid": -914061504, "ts": 1716454224318105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224427826, "dur": 189, "args": { "External id": 190609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190609, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190609, "pid": 5, "tid": 7, "ts": 1716454224427826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318109, "dur": 11, "args": { "External id": 190609, "cbid": 211, "correlation": 190609 } }, { "ph": "s", "id": 190609, "pid": 76337, "tid": -914061504, "ts": 1716454224318109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224318171, "dur": 1, "args": { "External id": 190620, "cbid": 251, "correlation": 190620 } }, { "ph": "f", "id": 190620, "pid": 76337, "tid": -914061504, "ts": 1716454224318171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224428016, "dur": 188, "args": { "External id": 190621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190621, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190621, "pid": 5, "tid": 7, "ts": 1716454224428016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318176, "dur": 11, "args": { "External id": 190621, "cbid": 211, "correlation": 190621 } }, { "ph": "s", "id": 190621, "pid": 76337, "tid": -914061504, "ts": 1716454224318176, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224428206, "dur": 18742, "args": { "External id": 190642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190642, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190642, "pid": 5, "tid": 7, "ts": 1716454224428206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318257, "dur": 12, "args": { "External id": 190642, "cbid": 211, "correlation": 190642 } }, { "ph": "s", "id": 190642, "pid": 76337, "tid": -914061504, "ts": 1716454224318257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224318354, "dur": 1, "args": { "External id": 190660, "cbid": 251, "correlation": 190660 } }, { "ph": "f", "id": 190660, "pid": 76337, "tid": -914061504, "ts": 1716454224318354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224446949, "dur": 204, "args": { "External id": 190662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190662, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190662, "pid": 5, "tid": 7, "ts": 1716454224446949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318359, "dur": 13, "args": { "External id": 190662, "cbid": 211, "correlation": 190662 } }, { "ph": "s", "id": 190662, "pid": 76337, "tid": -914061504, "ts": 1716454224318359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224447155, "dur": 66, "args": { "External id": 190670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190670, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190670, "pid": 5, "tid": 7, "ts": 1716454224447155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318428, "dur": 13, "args": { "External id": 190670, "cbid": 211, "correlation": 190670 } }, { "ph": "s", "id": 190670, "pid": 76337, "tid": -914061504, "ts": 1716454224318428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224447222, "dur": 96, "args": { "External id": 190678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190678, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190678, "pid": 5, "tid": 7, "ts": 1716454224447222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318468, "dur": 13, "args": { "External id": 190678, "cbid": 211, "correlation": 190678 } }, { "ph": "s", "id": 190678, "pid": 76337, "tid": -914061504, "ts": 1716454224318468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224447320, "dur": 54, "args": { "External id": 190689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190689, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190689, "pid": 5, "tid": 7, "ts": 1716454224447320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318544, "dur": 83, "args": { "External id": 190689, "cbid": 211, "correlation": 190689 } }, { "ph": "s", "id": 190689, "pid": 76337, "tid": -914061504, "ts": 1716454224318544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224447375, "dur": 92, "args": { "External id": 190711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190711, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190711, "pid": 5, "tid": 7, "ts": 1716454224447375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224318646, "dur": 1945, "args": { "External id": 190711, "cbid": 211, "correlation": 190711 } }, { "ph": "s", "id": 190711, "pid": 76337, "tid": -914061504, "ts": 1716454224318646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224320671, "dur": 1, "args": { "External id": 190722, "cbid": 251, "correlation": 190722 } }, { "ph": "f", "id": 190722, "pid": 76337, "tid": -914061504, "ts": 1716454224320671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224447469, "dur": 105, "args": { "External id": 190723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190723, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190723, "pid": 5, "tid": 7, "ts": 1716454224447469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320676, "dur": 67, "args": { "External id": 190723, "cbid": 211, "correlation": 190723 } }, { "ph": "s", "id": 190723, "pid": 76337, "tid": -914061504, "ts": 1716454224320676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224320802, "dur": 1, "args": { "External id": 190734, "cbid": 251, "correlation": 190734 } }, { "ph": "f", "id": 190734, "pid": 76337, "tid": -914061504, "ts": 1716454224320802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224320806, "dur": 0, "args": { "External id": 190735, "cbid": 251, "correlation": 190735 } }, { "ph": "f", "id": 190735, "pid": 76337, "tid": -914061504, "ts": 1716454224320806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224447575, "dur": 10, "args": { "External id": 190736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190736, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 190736, "pid": 5, "tid": 7, "ts": 1716454224447575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320807, "dur": 14, "args": { "External id": 190736, "cbid": 211, "correlation": 190736 } }, { "ph": "s", "id": 190736, "pid": 76337, "tid": -914061504, "ts": 1716454224320807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224447587, "dur": 5, "args": { "External id": 190738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190738, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190738, "pid": 5, "tid": 7, "ts": 1716454224447587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320823, "dur": 6, "args": { "External id": 190738, "cbid": 211, "correlation": 190738 } }, { "ph": "s", "id": 190738, "pid": 76337, "tid": -914061504, "ts": 1716454224320823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224320883, "dur": 1, "args": { "External id": 190749, "cbid": 251, "correlation": 190749 } }, { "ph": "f", "id": 190749, "pid": 76337, "tid": -914061504, "ts": 1716454224320883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224320887, "dur": 0, "args": { "External id": 190750, "cbid": 251, "correlation": 190750 } }, { "ph": "f", "id": 190750, "pid": 76337, "tid": -914061504, "ts": 1716454224320887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224447593, "dur": 6, "args": { "External id": 190751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190751, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 190751, "pid": 5, "tid": 7, "ts": 1716454224447593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320888, "dur": 12, "args": { "External id": 190751, "cbid": 211, "correlation": 190751 } }, { "ph": "s", "id": 190751, "pid": 76337, "tid": -914061504, "ts": 1716454224320888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224447600, "dur": 3, "args": { "External id": 190753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190753, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190753, "pid": 5, "tid": 7, "ts": 1716454224447600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320901, "dur": 5, "args": { "External id": 190753, "cbid": 211, "correlation": 190753 } }, { "ph": "s", "id": 190753, "pid": 76337, "tid": -914061504, "ts": 1716454224320901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224447605, "dur": 157, "args": { "External id": 190774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190774, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190774, "pid": 5, "tid": 7, "ts": 1716454224447605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224320983, "dur": 13, "args": { "External id": 190774, "cbid": 211, "correlation": 190774 } }, { "ph": "s", "id": 190774, "pid": 76337, "tid": -914061504, "ts": 1716454224320983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321081, "dur": 1, "args": { "External id": 190792, "cbid": 251, "correlation": 190792 } }, { "ph": "f", "id": 190792, "pid": 76337, "tid": -914061504, "ts": 1716454224321081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224447764, "dur": 106, "args": { "External id": 190794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190794, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190794, "pid": 5, "tid": 7, "ts": 1716454224447764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321087, "dur": 13, "args": { "External id": 190794, "cbid": 211, "correlation": 190794 } }, { "ph": "s", "id": 190794, "pid": 76337, "tid": -914061504, "ts": 1716454224321087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224447871, "dur": 35, "args": { "External id": 190802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190802, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190802, "pid": 5, "tid": 7, "ts": 1716454224447871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321156, "dur": 13, "args": { "External id": 190802, "cbid": 211, "correlation": 190802 } }, { "ph": "s", "id": 190802, "pid": 76337, "tid": -914061504, "ts": 1716454224321156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224447908, "dur": 67, "args": { "External id": 190810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190810, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190810, "pid": 5, "tid": 7, "ts": 1716454224447908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321197, "dur": 9, "args": { "External id": 190810, "cbid": 211, "correlation": 190810 } }, { "ph": "s", "id": 190810, "pid": 76337, "tid": -914061504, "ts": 1716454224321197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224447976, "dur": 93, "args": { "External id": 190832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190832, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190832, "pid": 5, "tid": 7, "ts": 1716454224447976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321248, "dur": 10, "args": { "External id": 190832, "cbid": 211, "correlation": 190832 } }, { "ph": "s", "id": 190832, "pid": 76337, "tid": -914061504, "ts": 1716454224321248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321334, "dur": 1, "args": { "External id": 190848, "cbid": 251, "correlation": 190848 } }, { "ph": "f", "id": 190848, "pid": 76337, "tid": -914061504, "ts": 1716454224321334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224448070, "dur": 577, "args": { "External id": 190850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190850, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 190850, "pid": 5, "tid": 7, "ts": 1716454224448070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321340, "dur": 12, "args": { "External id": 190850, "cbid": 211, "correlation": 190850 } }, { "ph": "s", "id": 190850, "pid": 76337, "tid": -914061504, "ts": 1716454224321340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224448648, "dur": 245, "args": { "External id": 190858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190858, "pid": 5, "tid": 7, "ts": 1716454224448648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321405, "dur": 13, "args": { "External id": 190858, "cbid": 211, "correlation": 190858 } }, { "ph": "s", "id": 190858, "pid": 76337, "tid": -914061504, "ts": 1716454224321405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224448894, "dur": 253, "args": { "External id": 190866, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190866, "pid": 5, "tid": 7, "ts": 1716454224448894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321436, "dur": 8, "args": { "External id": 190866, "cbid": 211, "correlation": 190866 } }, { "ph": "s", "id": 190866, "pid": 76337, "tid": -914061504, "ts": 1716454224321436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321518, "dur": 1, "args": { "External id": 190882, "cbid": 251, "correlation": 190882 } }, { "ph": "f", "id": 190882, "pid": 76337, "tid": -914061504, "ts": 1716454224321518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321523, "dur": 0, "args": { "External id": 190884, "cbid": 251, "correlation": 190884 } }, { "ph": "f", "id": 190884, "pid": 76337, "tid": -914061504, "ts": 1716454224321523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224449148, "dur": 359, "args": { "External id": 190885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190885, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 190885, "pid": 5, "tid": 7, "ts": 1716454224449148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321525, "dur": 13, "args": { "External id": 190885, "cbid": 211, "correlation": 190885 } }, { "ph": "s", "id": 190885, "pid": 76337, "tid": -914061504, "ts": 1716454224321525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224449508, "dur": 50, "args": { "External id": 190893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190893, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190893, "pid": 5, "tid": 7, "ts": 1716454224449508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321568, "dur": 10, "args": { "External id": 190893, "cbid": 211, "correlation": 190893 } }, { "ph": "s", "id": 190893, "pid": 76337, "tid": -914061504, "ts": 1716454224321568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224449560, "dur": 160, "args": { "External id": 190904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190904, "pid": 5, "tid": 7, "ts": 1716454224449560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321635, "dur": 225, "args": { "External id": 190904, "cbid": 211, "correlation": 190904 } }, { "ph": "s", "id": 190904, "pid": 76337, "tid": -914061504, "ts": 1716454224321635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224321912, "dur": 0, "args": { "External id": 190916, "cbid": 317, "correlation": 190916 } }, { "ph": "f", "id": 190916, "pid": 76337, "tid": -914061504, "ts": 1716454224321912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224321913, "dur": 0, "args": { "External id": 190917, "cbid": 203, "correlation": 190917 } }, { "ph": "f", "id": 190917, "pid": 76337, "tid": -914061504, "ts": 1716454224321913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224321914, "dur": 0, "args": { "External id": 190918, "cbid": 205, "correlation": 190918 } }, { "ph": "f", "id": 190918, "pid": 76337, "tid": -914061504, "ts": 1716454224321914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321936, "dur": 1, "args": { "External id": 190922, "cbid": 251, "correlation": 190922 } }, { "ph": "f", "id": 190922, "pid": 76337, "tid": -914061504, "ts": 1716454224321936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321938, "dur": 0, "args": { "External id": 190923, "cbid": 251, "correlation": 190923 } }, { "ph": "f", "id": 190923, "pid": 76337, "tid": -914061504, "ts": 1716454224321938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321938, "dur": 0, "args": { "External id": 190924, "cbid": 251, "correlation": 190924 } }, { "ph": "f", "id": 190924, "pid": 76337, "tid": -914061504, "ts": 1716454224321938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321939, "dur": 0, "args": { "External id": 190925, "cbid": 251, "correlation": 190925 } }, { "ph": "f", "id": 190925, "pid": 76337, "tid": -914061504, "ts": 1716454224321939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321940, "dur": 0, "args": { "External id": 190926, "cbid": 251, "correlation": 190926 } }, { "ph": "f", "id": 190926, "pid": 76337, "tid": -914061504, "ts": 1716454224321940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321940, "dur": 0, "args": { "External id": 190927, "cbid": 251, "correlation": 190927 } }, { "ph": "f", "id": 190927, "pid": 76337, "tid": -914061504, "ts": 1716454224321940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321941, "dur": 0, "args": { "External id": 190928, "cbid": 251, "correlation": 190928 } }, { "ph": "f", "id": 190928, "pid": 76337, "tid": -914061504, "ts": 1716454224321941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321942, "dur": 0, "args": { "External id": 190929, "cbid": 251, "correlation": 190929 } }, { "ph": "f", "id": 190929, "pid": 76337, "tid": -914061504, "ts": 1716454224321942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224321943, "dur": 0, "args": { "External id": 190930, "cbid": 251, "correlation": 190930 } }, { "ph": "f", "id": 190930, "pid": 76337, "tid": -914061504, "ts": 1716454224321943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224449720, "dur": 115, "args": { "External id": 190931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190931, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 190931, "pid": 5, "tid": 7, "ts": 1716454224449720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224321945, "dur": 42, "args": { "External id": 190931, "cbid": 211, "correlation": 190931 } }, { "ph": "s", "id": 190931, "pid": 76337, "tid": -914061504, "ts": 1716454224321945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224449836, "dur": 61, "args": { "External id": 190937, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190937, "pid": 5, "tid": 7, "ts": 1716454224449836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322011, "dur": 104, "args": { "External id": 190937, "cbid": 211, "correlation": 190937 } }, { "ph": "s", "id": 190937, "pid": 76337, "tid": -914061504, "ts": 1716454224322011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224449898, "dur": 50, "args": { "External id": 190945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190945, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190945, "pid": 5, "tid": 7, "ts": 1716454224449898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322139, "dur": 285, "args": { "External id": 190945, "cbid": 211, "correlation": 190945 } }, { "ph": "s", "id": 190945, "pid": 76337, "tid": -914061504, "ts": 1716454224322139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224449949, "dur": 98, "args": { "External id": 190954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190954, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190954, "pid": 5, "tid": 7, "ts": 1716454224449949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322455, "dur": 10, "args": { "External id": 190954, "cbid": 211, "correlation": 190954 } }, { "ph": "s", "id": 190954, "pid": 76337, "tid": -914061504, "ts": 1716454224322455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224450048, "dur": 95, "args": { "External id": 190974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190974, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 190974, "pid": 5, "tid": 7, "ts": 1716454224450048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322527, "dur": 12, "args": { "External id": 190974, "cbid": 211, "correlation": 190974 } }, { "ph": "s", "id": 190974, "pid": 76337, "tid": -914061504, "ts": 1716454224322527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224450144, "dur": 4, "args": { "External id": 190986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190986, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 190986, "pid": 5, "tid": 7, "ts": 1716454224450144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322550, "dur": 10, "args": { "External id": 190986, "cbid": 211, "correlation": 190986 } }, { "ph": "s", "id": 190986, "pid": 76337, "tid": -914061504, "ts": 1716454224322550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224450150, "dur": 109, "args": { "External id": 190989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190989, "pid": 5, "tid": 7, "ts": 1716454224450150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322572, "dur": 110, "args": { "External id": 190989, "cbid": 211, "correlation": 190989 } }, { "ph": "s", "id": 190989, "pid": 76337, "tid": -914061504, "ts": 1716454224322572, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224450261, "dur": 69, "args": { "External id": 190998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 190998, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 190998, "pid": 5, "tid": 7, "ts": 1716454224450261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322716, "dur": 11, "args": { "External id": 190998, "cbid": 211, "correlation": 190998 } }, { "ph": "s", "id": 190998, "pid": 76337, "tid": -914061504, "ts": 1716454224322716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224322769, "dur": 0, "args": { "External id": 191008, "cbid": 317, "correlation": 191008 } }, { "ph": "f", "id": 191008, "pid": 76337, "tid": -914061504, "ts": 1716454224322769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224322770, "dur": 0, "args": { "External id": 191009, "cbid": 203, "correlation": 191009 } }, { "ph": "f", "id": 191009, "pid": 76337, "tid": -914061504, "ts": 1716454224322770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224322771, "dur": 0, "args": { "External id": 191010, "cbid": 205, "correlation": 191010 } }, { "ph": "f", "id": 191010, "pid": 76337, "tid": -914061504, "ts": 1716454224322771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224450331, "dur": 76, "args": { "External id": 191014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191014, "pid": 5, "tid": 7, "ts": 1716454224450331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322784, "dur": 12, "args": { "External id": 191014, "cbid": 211, "correlation": 191014 } }, { "ph": "s", "id": 191014, "pid": 76337, "tid": -914061504, "ts": 1716454224322784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224450409, "dur": 24, "args": { "External id": 191016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191016, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191016, "pid": 5, "tid": 7, "ts": 1716454224450409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322798, "dur": 5, "args": { "External id": 191016, "cbid": 211, "correlation": 191016 } }, { "ph": "s", "id": 191016, "pid": 76337, "tid": -914061504, "ts": 1716454224322798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224450434, "dur": 4, "args": { "External id": 191018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191018, "pid": 5, "tid": 7, "ts": 1716454224450434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322809, "dur": 6, "args": { "External id": 191018, "cbid": 211, "correlation": 191018 } }, { "ph": "s", "id": 191018, "pid": 76337, "tid": -914061504, "ts": 1716454224322809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224322817, "dur": 0, "args": { "External id": 191019, "cbid": 51, "correlation": 191019 } }, { "ph": "s", "id": 191019, "pid": 76337, "tid": -914061504, "ts": 1716454224322817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224450439, "dur": 1375, "args": { "External id": 191020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191020, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191020, "pid": 5, "tid": 7, "ts": 1716454224450439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322818, "dur": 5, "args": { "External id": 191020, "cbid": 211, "correlation": 191020 } }, { "ph": "s", "id": 191020, "pid": 76337, "tid": -914061504, "ts": 1716454224322818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224451816, "dur": 59, "args": { "External id": 191025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191025, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191025, "pid": 5, "tid": 7, "ts": 1716454224451816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322845, "dur": 9, "args": { "External id": 191025, "cbid": 211, "correlation": 191025 } }, { "ph": "s", "id": 191025, "pid": 76337, "tid": -914061504, "ts": 1716454224322845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224451876, "dur": 4, "args": { "External id": 191033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191033, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191033, "pid": 5, "tid": 7, "ts": 1716454224451876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322889, "dur": 9, "args": { "External id": 191033, "cbid": 211, "correlation": 191033 } }, { "ph": "s", "id": 191033, "pid": 76337, "tid": -914061504, "ts": 1716454224322889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224322955, "dur": 1, "args": { "External id": 191049, "cbid": 251, "correlation": 191049 } }, { "ph": "f", "id": 191049, "pid": 76337, "tid": -914061504, "ts": 1716454224322955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224322960, "dur": 0, "args": { "External id": 191051, "cbid": 251, "correlation": 191051 } }, { "ph": "f", "id": 191051, "pid": 76337, "tid": -914061504, "ts": 1716454224322960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224451881, "dur": 11, "args": { "External id": 191052, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191052, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 191052, "pid": 5, "tid": 7, "ts": 1716454224451881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322962, "dur": 20, "args": { "External id": 191052, "cbid": 211, "correlation": 191052 } }, { "ph": "s", "id": 191052, "pid": 76337, "tid": -914061504, "ts": 1716454224322962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224451893, "dur": 5, "args": { "External id": 191054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191054, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 191054, "pid": 5, "tid": 7, "ts": 1716454224451893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224322984, "dur": 6, "args": { "External id": 191054, "cbid": 211, "correlation": 191054 } }, { "ph": "s", "id": 191054, "pid": 76337, "tid": -914061504, "ts": 1716454224322984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224451900, "dur": 55, "args": { "External id": 191064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191064, "pid": 5, "tid": 7, "ts": 1716454224451900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323043, "dur": 553, "args": { "External id": 191064, "cbid": 211, "correlation": 191064 } }, { "ph": "s", "id": 191064, "pid": 76337, "tid": -914061504, "ts": 1716454224323043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224451956, "dur": 53, "args": { "External id": 191084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191084, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 191084, "pid": 5, "tid": 7, "ts": 1716454224451956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323651, "dur": 11, "args": { "External id": 191084, "cbid": 211, "correlation": 191084 } }, { "ph": "s", "id": 191084, "pid": 76337, "tid": -914061504, "ts": 1716454224323651, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224452011, "dur": 4, "args": { "External id": 191096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191096, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191096, "pid": 5, "tid": 7, "ts": 1716454224452011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323671, "dur": 6, "args": { "External id": 191096, "cbid": 211, "correlation": 191096 } }, { "ph": "s", "id": 191096, "pid": 76337, "tid": -914061504, "ts": 1716454224323671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224452016, "dur": 56, "args": { "External id": 191099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191099, "pid": 5, "tid": 7, "ts": 1716454224452016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323691, "dur": 6, "args": { "External id": 191099, "cbid": 211, "correlation": 191099 } }, { "ph": "s", "id": 191099, "pid": 76337, "tid": -914061504, "ts": 1716454224323691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224452074, "dur": 37, "args": { "External id": 191108, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191108, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191108, "pid": 5, "tid": 7, "ts": 1716454224452074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323732, "dur": 10, "args": { "External id": 191108, "cbid": 211, "correlation": 191108 } }, { "ph": "s", "id": 191108, "pid": 76337, "tid": -914061504, "ts": 1716454224323732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224323794, "dur": 0, "args": { "External id": 191118, "cbid": 317, "correlation": 191118 } }, { "ph": "f", "id": 191118, "pid": 76337, "tid": -914061504, "ts": 1716454224323794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224323795, "dur": 0, "args": { "External id": 191119, "cbid": 203, "correlation": 191119 } }, { "ph": "f", "id": 191119, "pid": 76337, "tid": -914061504, "ts": 1716454224323795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224323796, "dur": 0, "args": { "External id": 191120, "cbid": 205, "correlation": 191120 } }, { "ph": "f", "id": 191120, "pid": 76337, "tid": -914061504, "ts": 1716454224323796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224452112, "dur": 40, "args": { "External id": 191124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191124, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191124, "pid": 5, "tid": 7, "ts": 1716454224452112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323810, "dur": 12, "args": { "External id": 191124, "cbid": 211, "correlation": 191124 } }, { "ph": "s", "id": 191124, "pid": 76337, "tid": -914061504, "ts": 1716454224323810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224452154, "dur": 14, "args": { "External id": 191126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191126, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191126, "pid": 5, "tid": 7, "ts": 1716454224452154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323824, "dur": 5, "args": { "External id": 191126, "cbid": 211, "correlation": 191126 } }, { "ph": "s", "id": 191126, "pid": 76337, "tid": -914061504, "ts": 1716454224323824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224452169, "dur": 3, "args": { "External id": 191128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191128, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191128, "pid": 5, "tid": 7, "ts": 1716454224452169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323833, "dur": 6, "args": { "External id": 191128, "cbid": 211, "correlation": 191128 } }, { "ph": "s", "id": 191128, "pid": 76337, "tid": -914061504, "ts": 1716454224323833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224323843, "dur": 0, "args": { "External id": 191129, "cbid": 51, "correlation": 191129 } }, { "ph": "s", "id": 191129, "pid": 76337, "tid": -914061504, "ts": 1716454224323843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224452174, "dur": 703, "args": { "External id": 191130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191130, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191130, "pid": 5, "tid": 7, "ts": 1716454224452174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323844, "dur": 5, "args": { "External id": 191130, "cbid": 211, "correlation": 191130 } }, { "ph": "s", "id": 191130, "pid": 76337, "tid": -914061504, "ts": 1716454224323844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224452878, "dur": 60, "args": { "External id": 191135, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191135, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191135, "pid": 5, "tid": 7, "ts": 1716454224452878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323871, "dur": 9, "args": { "External id": 191135, "cbid": 211, "correlation": 191135 } }, { "ph": "s", "id": 191135, "pid": 76337, "tid": -914061504, "ts": 1716454224323871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224323929, "dur": 0, "args": { "External id": 191145, "cbid": 317, "correlation": 191145 } }, { "ph": "f", "id": 191145, "pid": 76337, "tid": -914061504, "ts": 1716454224323929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224323930, "dur": 0, "args": { "External id": 191146, "cbid": 203, "correlation": 191146 } }, { "ph": "f", "id": 191146, "pid": 76337, "tid": -914061504, "ts": 1716454224323930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224323930, "dur": 0, "args": { "External id": 191147, "cbid": 205, "correlation": 191147 } }, { "ph": "f", "id": 191147, "pid": 76337, "tid": -914061504, "ts": 1716454224323930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224452939, "dur": 75, "args": { "External id": 191151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191151, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191151, "pid": 5, "tid": 7, "ts": 1716454224452939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323943, "dur": 11, "args": { "External id": 191151, "cbid": 211, "correlation": 191151 } }, { "ph": "s", "id": 191151, "pid": 76337, "tid": -914061504, "ts": 1716454224323943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224453016, "dur": 209, "args": { "External id": 191153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191153, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191153, "pid": 5, "tid": 7, "ts": 1716454224453016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323960, "dur": 6, "args": { "External id": 191153, "cbid": 211, "correlation": 191153 } }, { "ph": "s", "id": 191153, "pid": 76337, "tid": -914061504, "ts": 1716454224323960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224453226, "dur": 39, "args": { "External id": 191155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191155, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191155, "pid": 5, "tid": 7, "ts": 1716454224453226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224323971, "dur": 14, "args": { "External id": 191155, "cbid": 211, "correlation": 191155 } }, { "ph": "s", "id": 191155, "pid": 76337, "tid": -914061504, "ts": 1716454224323971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224453266, "dur": 59, "args": { "External id": 191161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191161, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191161, "pid": 5, "tid": 7, "ts": 1716454224453266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324006, "dur": 516, "args": { "External id": 191161, "cbid": 211, "correlation": 191161 } }, { "ph": "s", "id": 191161, "pid": 76337, "tid": -914061504, "ts": 1716454224324006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224453326, "dur": 50, "args": { "External id": 191169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191169, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191169, "pid": 5, "tid": 7, "ts": 1716454224453326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324544, "dur": 8, "args": { "External id": 191169, "cbid": 211, "correlation": 191169 } }, { "ph": "s", "id": 191169, "pid": 76337, "tid": -914061504, "ts": 1716454224324544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224453377, "dur": 35, "args": { "External id": 191177, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191177, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191177, "pid": 5, "tid": 7, "ts": 1716454224453377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324573, "dur": 9, "args": { "External id": 191177, "cbid": 211, "correlation": 191177 } }, { "ph": "s", "id": 191177, "pid": 76337, "tid": -914061504, "ts": 1716454224324573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224453414, "dur": 52, "args": { "External id": 191197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191197, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 191197, "pid": 5, "tid": 7, "ts": 1716454224453414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324656, "dur": 13, "args": { "External id": 191197, "cbid": 211, "correlation": 191197 } }, { "ph": "s", "id": 191197, "pid": 76337, "tid": -914061504, "ts": 1716454224324656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224453468, "dur": 5, "args": { "External id": 191209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191209, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191209, "pid": 5, "tid": 7, "ts": 1716454224453468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324679, "dur": 6, "args": { "External id": 191209, "cbid": 211, "correlation": 191209 } }, { "ph": "s", "id": 191209, "pid": 76337, "tid": -914061504, "ts": 1716454224324679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224453474, "dur": 56, "args": { "External id": 191212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191212, "pid": 5, "tid": 7, "ts": 1716454224453474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324696, "dur": 7, "args": { "External id": 191212, "cbid": 211, "correlation": 191212 } }, { "ph": "s", "id": 191212, "pid": 76337, "tid": -914061504, "ts": 1716454224324696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224324753, "dur": 0, "args": { "External id": 191223, "cbid": 317, "correlation": 191223 } }, { "ph": "f", "id": 191223, "pid": 76337, "tid": -914061504, "ts": 1716454224324753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224324754, "dur": 0, "args": { "External id": 191224, "cbid": 203, "correlation": 191224 } }, { "ph": "f", "id": 191224, "pid": 76337, "tid": -914061504, "ts": 1716454224324754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224324755, "dur": 0, "args": { "External id": 191225, "cbid": 205, "correlation": 191225 } }, { "ph": "f", "id": 191225, "pid": 76337, "tid": -914061504, "ts": 1716454224324755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324777, "dur": 1, "args": { "External id": 191229, "cbid": 251, "correlation": 191229 } }, { "ph": "f", "id": 191229, "pid": 76337, "tid": -914061504, "ts": 1716454224324777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324779, "dur": 0, "args": { "External id": 191230, "cbid": 251, "correlation": 191230 } }, { "ph": "f", "id": 191230, "pid": 76337, "tid": -914061504, "ts": 1716454224324779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324780, "dur": 0, "args": { "External id": 191231, "cbid": 251, "correlation": 191231 } }, { "ph": "f", "id": 191231, "pid": 76337, "tid": -914061504, "ts": 1716454224324780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324781, "dur": 0, "args": { "External id": 191232, "cbid": 251, "correlation": 191232 } }, { "ph": "f", "id": 191232, "pid": 76337, "tid": -914061504, "ts": 1716454224324781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324781, "dur": 0, "args": { "External id": 191233, "cbid": 251, "correlation": 191233 } }, { "ph": "f", "id": 191233, "pid": 76337, "tid": -914061504, "ts": 1716454224324781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324782, "dur": 0, "args": { "External id": 191234, "cbid": 251, "correlation": 191234 } }, { "ph": "f", "id": 191234, "pid": 76337, "tid": -914061504, "ts": 1716454224324782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324783, "dur": 0, "args": { "External id": 191235, "cbid": 251, "correlation": 191235 } }, { "ph": "f", "id": 191235, "pid": 76337, "tid": -914061504, "ts": 1716454224324783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324783, "dur": 0, "args": { "External id": 191236, "cbid": 251, "correlation": 191236 } }, { "ph": "f", "id": 191236, "pid": 76337, "tid": -914061504, "ts": 1716454224324783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224324785, "dur": 0, "args": { "External id": 191237, "cbid": 251, "correlation": 191237 } }, { "ph": "f", "id": 191237, "pid": 76337, "tid": -914061504, "ts": 1716454224324785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224453532, "dur": 115, "args": { "External id": 191238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191238, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 191238, "pid": 5, "tid": 7, "ts": 1716454224453532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324787, "dur": 12, "args": { "External id": 191238, "cbid": 211, "correlation": 191238 } }, { "ph": "s", "id": 191238, "pid": 76337, "tid": -914061504, "ts": 1716454224324787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224453648, "dur": 60, "args": { "External id": 191244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191244, "pid": 5, "tid": 7, "ts": 1716454224453648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324822, "dur": 9, "args": { "External id": 191244, "cbid": 211, "correlation": 191244 } }, { "ph": "s", "id": 191244, "pid": 76337, "tid": -914061504, "ts": 1716454224324822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224453709, "dur": 606, "args": { "External id": 191253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191253, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191253, "pid": 5, "tid": 7, "ts": 1716454224453709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324903, "dur": 14, "args": { "External id": 191253, "cbid": 211, "correlation": 191253 } }, { "ph": "s", "id": 191253, "pid": 76337, "tid": -914061504, "ts": 1716454224324903, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224454316, "dur": 182, "args": { "External id": 191275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191275, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191275, "pid": 5, "tid": 7, "ts": 1716454224454316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224324960, "dur": 11, "args": { "External id": 191275, "cbid": 211, "correlation": 191275 } }, { "ph": "s", "id": 191275, "pid": 76337, "tid": -914061504, "ts": 1716454224324960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224325053, "dur": 1, "args": { "External id": 191286, "cbid": 251, "correlation": 191286 } }, { "ph": "f", "id": 191286, "pid": 76337, "tid": -914061504, "ts": 1716454224325053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224454500, "dur": 194, "args": { "External id": 191287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191287, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191287, "pid": 5, "tid": 7, "ts": 1716454224454500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325059, "dur": 14, "args": { "External id": 191287, "cbid": 211, "correlation": 191287 } }, { "ph": "s", "id": 191287, "pid": 76337, "tid": -914061504, "ts": 1716454224325059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224325127, "dur": 1, "args": { "External id": 191298, "cbid": 251, "correlation": 191298 } }, { "ph": "f", "id": 191298, "pid": 76337, "tid": -914061504, "ts": 1716454224325127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224454696, "dur": 190, "args": { "External id": 191299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191299, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191299, "pid": 5, "tid": 7, "ts": 1716454224454696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325130, "dur": 11, "args": { "External id": 191299, "cbid": 211, "correlation": 191299 } }, { "ph": "s", "id": 191299, "pid": 76337, "tid": -914061504, "ts": 1716454224325130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224325192, "dur": 1, "args": { "External id": 191310, "cbid": 251, "correlation": 191310 } }, { "ph": "f", "id": 191310, "pid": 76337, "tid": -914061504, "ts": 1716454224325192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224454887, "dur": 187, "args": { "External id": 191311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191311, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191311, "pid": 5, "tid": 7, "ts": 1716454224454887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325196, "dur": 11, "args": { "External id": 191311, "cbid": 211, "correlation": 191311 } }, { "ph": "s", "id": 191311, "pid": 76337, "tid": -914061504, "ts": 1716454224325196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224455076, "dur": 18741, "args": { "External id": 191332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191332, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 191332, "pid": 5, "tid": 7, "ts": 1716454224455076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325278, "dur": 12, "args": { "External id": 191332, "cbid": 211, "correlation": 191332 } }, { "ph": "s", "id": 191332, "pid": 76337, "tid": -914061504, "ts": 1716454224325278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224325374, "dur": 1, "args": { "External id": 191350, "cbid": 251, "correlation": 191350 } }, { "ph": "f", "id": 191350, "pid": 76337, "tid": -914061504, "ts": 1716454224325374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224473818, "dur": 203, "args": { "External id": 191352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191352, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191352, "pid": 5, "tid": 7, "ts": 1716454224473818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325380, "dur": 13, "args": { "External id": 191352, "cbid": 211, "correlation": 191352 } }, { "ph": "s", "id": 191352, "pid": 76337, "tid": -914061504, "ts": 1716454224325380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224474023, "dur": 66, "args": { "External id": 191360, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191360, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191360, "pid": 5, "tid": 7, "ts": 1716454224474023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325449, "dur": 18, "args": { "External id": 191360, "cbid": 211, "correlation": 191360 } }, { "ph": "s", "id": 191360, "pid": 76337, "tid": -914061504, "ts": 1716454224325449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224474090, "dur": 96, "args": { "External id": 191368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191368, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191368, "pid": 5, "tid": 7, "ts": 1716454224474090, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325494, "dur": 113, "args": { "External id": 191368, "cbid": 211, "correlation": 191368 } }, { "ph": "s", "id": 191368, "pid": 76337, "tid": -914061504, "ts": 1716454224325494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224474188, "dur": 55, "args": { "External id": 191379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191379, "pid": 5, "tid": 7, "ts": 1716454224474188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224325670, "dur": 1810, "args": { "External id": 191379, "cbid": 211, "correlation": 191379 } }, { "ph": "s", "id": 191379, "pid": 76337, "tid": -914061504, "ts": 1716454224325670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224474244, "dur": 93, "args": { "External id": 191401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191401, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191401, "pid": 5, "tid": 7, "ts": 1716454224474244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327501, "dur": 122, "args": { "External id": 191401, "cbid": 211, "correlation": 191401 } }, { "ph": "s", "id": 191401, "pid": 76337, "tid": -914061504, "ts": 1716454224327501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224327701, "dur": 1, "args": { "External id": 191412, "cbid": 251, "correlation": 191412 } }, { "ph": "f", "id": 191412, "pid": 76337, "tid": -914061504, "ts": 1716454224327701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224474338, "dur": 104, "args": { "External id": 191413, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191413, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191413, "pid": 5, "tid": 7, "ts": 1716454224474338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327706, "dur": 13, "args": { "External id": 191413, "cbid": 211, "correlation": 191413 } }, { "ph": "s", "id": 191413, "pid": 76337, "tid": -914061504, "ts": 1716454224327706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224327777, "dur": 1, "args": { "External id": 191424, "cbid": 251, "correlation": 191424 } }, { "ph": "f", "id": 191424, "pid": 76337, "tid": -914061504, "ts": 1716454224327777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224327781, "dur": 0, "args": { "External id": 191425, "cbid": 251, "correlation": 191425 } }, { "ph": "f", "id": 191425, "pid": 76337, "tid": -914061504, "ts": 1716454224327781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224474443, "dur": 11, "args": { "External id": 191426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191426, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191426, "pid": 5, "tid": 7, "ts": 1716454224474443, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327783, "dur": 12, "args": { "External id": 191426, "cbid": 211, "correlation": 191426 } }, { "ph": "s", "id": 191426, "pid": 76337, "tid": -914061504, "ts": 1716454224327783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224474455, "dur": 5, "args": { "External id": 191428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191428, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 191428, "pid": 5, "tid": 7, "ts": 1716454224474455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327797, "dur": 6, "args": { "External id": 191428, "cbid": 211, "correlation": 191428 } }, { "ph": "s", "id": 191428, "pid": 76337, "tid": -914061504, "ts": 1716454224327797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224327858, "dur": 1, "args": { "External id": 191439, "cbid": 251, "correlation": 191439 } }, { "ph": "f", "id": 191439, "pid": 76337, "tid": -914061504, "ts": 1716454224327858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224327861, "dur": 0, "args": { "External id": 191440, "cbid": 251, "correlation": 191440 } }, { "ph": "f", "id": 191440, "pid": 76337, "tid": -914061504, "ts": 1716454224327861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224474462, "dur": 6, "args": { "External id": 191441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191441, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191441, "pid": 5, "tid": 7, "ts": 1716454224474462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327863, "dur": 12, "args": { "External id": 191441, "cbid": 211, "correlation": 191441 } }, { "ph": "s", "id": 191441, "pid": 76337, "tid": -914061504, "ts": 1716454224327863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224474469, "dur": 3, "args": { "External id": 191443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191443, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 191443, "pid": 5, "tid": 7, "ts": 1716454224474469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327876, "dur": 6, "args": { "External id": 191443, "cbid": 211, "correlation": 191443 } }, { "ph": "s", "id": 191443, "pid": 76337, "tid": -914061504, "ts": 1716454224327876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224474474, "dur": 157, "args": { "External id": 191464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191464, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 191464, "pid": 5, "tid": 7, "ts": 1716454224474474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224327950, "dur": 12, "args": { "External id": 191464, "cbid": 211, "correlation": 191464 } }, { "ph": "s", "id": 191464, "pid": 76337, "tid": -914061504, "ts": 1716454224327950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328053, "dur": 1, "args": { "External id": 191482, "cbid": 251, "correlation": 191482 } }, { "ph": "f", "id": 191482, "pid": 76337, "tid": -914061504, "ts": 1716454224328053, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224474633, "dur": 106, "args": { "External id": 191484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191484, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 191484, "pid": 5, "tid": 7, "ts": 1716454224474633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328058, "dur": 14, "args": { "External id": 191484, "cbid": 211, "correlation": 191484 } }, { "ph": "s", "id": 191484, "pid": 76337, "tid": -914061504, "ts": 1716454224328058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224474740, "dur": 35, "args": { "External id": 191492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191492, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191492, "pid": 5, "tid": 7, "ts": 1716454224474740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328128, "dur": 13, "args": { "External id": 191492, "cbid": 211, "correlation": 191492 } }, { "ph": "s", "id": 191492, "pid": 76337, "tid": -914061504, "ts": 1716454224328128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224474776, "dur": 67, "args": { "External id": 191500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191500, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191500, "pid": 5, "tid": 7, "ts": 1716454224474776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328170, "dur": 9, "args": { "External id": 191500, "cbid": 211, "correlation": 191500 } }, { "ph": "s", "id": 191500, "pid": 76337, "tid": -914061504, "ts": 1716454224328170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224474844, "dur": 93, "args": { "External id": 191522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191522, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191522, "pid": 5, "tid": 7, "ts": 1716454224474844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328220, "dur": 10, "args": { "External id": 191522, "cbid": 211, "correlation": 191522 } }, { "ph": "s", "id": 191522, "pid": 76337, "tid": -914061504, "ts": 1716454224328220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328308, "dur": 1, "args": { "External id": 191538, "cbid": 251, "correlation": 191538 } }, { "ph": "f", "id": 191538, "pid": 76337, "tid": -914061504, "ts": 1716454224328308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224474938, "dur": 577, "args": { "External id": 191540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191540, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 191540, "pid": 5, "tid": 7, "ts": 1716454224474938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328314, "dur": 12, "args": { "External id": 191540, "cbid": 211, "correlation": 191540 } }, { "ph": "s", "id": 191540, "pid": 76337, "tid": -914061504, "ts": 1716454224328314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224475517, "dur": 244, "args": { "External id": 191548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191548, "pid": 5, "tid": 7, "ts": 1716454224475517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328378, "dur": 13, "args": { "External id": 191548, "cbid": 211, "correlation": 191548 } }, { "ph": "s", "id": 191548, "pid": 76337, "tid": -914061504, "ts": 1716454224328378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224475762, "dur": 253, "args": { "External id": 191556, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191556, "pid": 5, "tid": 7, "ts": 1716454224475762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328409, "dur": 8, "args": { "External id": 191556, "cbid": 211, "correlation": 191556 } }, { "ph": "s", "id": 191556, "pid": 76337, "tid": -914061504, "ts": 1716454224328409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328489, "dur": 1, "args": { "External id": 191572, "cbid": 251, "correlation": 191572 } }, { "ph": "f", "id": 191572, "pid": 76337, "tid": -914061504, "ts": 1716454224328489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328495, "dur": 0, "args": { "External id": 191574, "cbid": 251, "correlation": 191574 } }, { "ph": "f", "id": 191574, "pid": 76337, "tid": -914061504, "ts": 1716454224328495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224476016, "dur": 360, "args": { "External id": 191575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191575, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191575, "pid": 5, "tid": 7, "ts": 1716454224476016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328498, "dur": 13, "args": { "External id": 191575, "cbid": 211, "correlation": 191575 } }, { "ph": "s", "id": 191575, "pid": 76337, "tid": -914061504, "ts": 1716454224328498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224476378, "dur": 50, "args": { "External id": 191583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191583, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191583, "pid": 5, "tid": 7, "ts": 1716454224476378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328539, "dur": 148, "args": { "External id": 191583, "cbid": 211, "correlation": 191583 } }, { "ph": "s", "id": 191583, "pid": 76337, "tid": -914061504, "ts": 1716454224328539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224476429, "dur": 159, "args": { "External id": 191594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191594, "pid": 5, "tid": 7, "ts": 1716454224476429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328745, "dur": 66, "args": { "External id": 191594, "cbid": 211, "correlation": 191594 } }, { "ph": "s", "id": 191594, "pid": 76337, "tid": -914061504, "ts": 1716454224328745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224328864, "dur": 0, "args": { "External id": 191606, "cbid": 317, "correlation": 191606 } }, { "ph": "f", "id": 191606, "pid": 76337, "tid": -914061504, "ts": 1716454224328864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224328865, "dur": 0, "args": { "External id": 191607, "cbid": 203, "correlation": 191607 } }, { "ph": "f", "id": 191607, "pid": 76337, "tid": -914061504, "ts": 1716454224328865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224328865, "dur": 0, "args": { "External id": 191608, "cbid": 205, "correlation": 191608 } }, { "ph": "f", "id": 191608, "pid": 76337, "tid": -914061504, "ts": 1716454224328865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328889, "dur": 1, "args": { "External id": 191612, "cbid": 251, "correlation": 191612 } }, { "ph": "f", "id": 191612, "pid": 76337, "tid": -914061504, "ts": 1716454224328889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328891, "dur": 0, "args": { "External id": 191613, "cbid": 251, "correlation": 191613 } }, { "ph": "f", "id": 191613, "pid": 76337, "tid": -914061504, "ts": 1716454224328891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328892, "dur": 0, "args": { "External id": 191614, "cbid": 251, "correlation": 191614 } }, { "ph": "f", "id": 191614, "pid": 76337, "tid": -914061504, "ts": 1716454224328892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328893, "dur": 0, "args": { "External id": 191615, "cbid": 251, "correlation": 191615 } }, { "ph": "f", "id": 191615, "pid": 76337, "tid": -914061504, "ts": 1716454224328893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328893, "dur": 0, "args": { "External id": 191616, "cbid": 251, "correlation": 191616 } }, { "ph": "f", "id": 191616, "pid": 76337, "tid": -914061504, "ts": 1716454224328893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328894, "dur": 0, "args": { "External id": 191617, "cbid": 251, "correlation": 191617 } }, { "ph": "f", "id": 191617, "pid": 76337, "tid": -914061504, "ts": 1716454224328894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328895, "dur": 0, "args": { "External id": 191618, "cbid": 251, "correlation": 191618 } }, { "ph": "f", "id": 191618, "pid": 76337, "tid": -914061504, "ts": 1716454224328895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328896, "dur": 0, "args": { "External id": 191619, "cbid": 251, "correlation": 191619 } }, { "ph": "f", "id": 191619, "pid": 76337, "tid": -914061504, "ts": 1716454224328896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224328897, "dur": 0, "args": { "External id": 191620, "cbid": 251, "correlation": 191620 } }, { "ph": "f", "id": 191620, "pid": 76337, "tid": -914061504, "ts": 1716454224328897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224476589, "dur": 117, "args": { "External id": 191621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191621, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 191621, "pid": 5, "tid": 7, "ts": 1716454224476589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328899, "dur": 42, "args": { "External id": 191621, "cbid": 211, "correlation": 191621 } }, { "ph": "s", "id": 191621, "pid": 76337, "tid": -914061504, "ts": 1716454224328899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224476707, "dur": 60, "args": { "External id": 191627, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191627, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191627, "pid": 5, "tid": 7, "ts": 1716454224476707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224328963, "dur": 276, "args": { "External id": 191627, "cbid": 211, "correlation": 191627 } }, { "ph": "s", "id": 191627, "pid": 76337, "tid": -914061504, "ts": 1716454224328963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224476768, "dur": 50, "args": { "External id": 191635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191635, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191635, "pid": 5, "tid": 7, "ts": 1716454224476768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329264, "dur": 11, "args": { "External id": 191635, "cbid": 211, "correlation": 191635 } }, { "ph": "s", "id": 191635, "pid": 76337, "tid": -914061504, "ts": 1716454224329264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224476819, "dur": 51, "args": { "External id": 191655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191655, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 191655, "pid": 5, "tid": 7, "ts": 1716454224476819, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329345, "dur": 12, "args": { "External id": 191655, "cbid": 211, "correlation": 191655 } }, { "ph": "s", "id": 191655, "pid": 76337, "tid": -914061504, "ts": 1716454224329345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224476872, "dur": 5, "args": { "External id": 191667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191667, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191667, "pid": 5, "tid": 7, "ts": 1716454224476872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329368, "dur": 7, "args": { "External id": 191667, "cbid": 211, "correlation": 191667 } }, { "ph": "s", "id": 191667, "pid": 76337, "tid": -914061504, "ts": 1716454224329368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224476878, "dur": 56, "args": { "External id": 191670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191670, "pid": 5, "tid": 7, "ts": 1716454224476878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329386, "dur": 101, "args": { "External id": 191670, "cbid": 211, "correlation": 191670 } }, { "ph": "s", "id": 191670, "pid": 76337, "tid": -914061504, "ts": 1716454224329386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224476936, "dur": 37, "args": { "External id": 191679, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191679, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191679, "pid": 5, "tid": 7, "ts": 1716454224476936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329526, "dur": 11, "args": { "External id": 191679, "cbid": 211, "correlation": 191679 } }, { "ph": "s", "id": 191679, "pid": 76337, "tid": -914061504, "ts": 1716454224329526, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224329581, "dur": 0, "args": { "External id": 191689, "cbid": 317, "correlation": 191689 } }, { "ph": "f", "id": 191689, "pid": 76337, "tid": -914061504, "ts": 1716454224329581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224329582, "dur": 0, "args": { "External id": 191690, "cbid": 203, "correlation": 191690 } }, { "ph": "f", "id": 191690, "pid": 76337, "tid": -914061504, "ts": 1716454224329582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224329583, "dur": 0, "args": { "External id": 191691, "cbid": 205, "correlation": 191691 } }, { "ph": "f", "id": 191691, "pid": 76337, "tid": -914061504, "ts": 1716454224329583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224476974, "dur": 40, "args": { "External id": 191695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191695, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191695, "pid": 5, "tid": 7, "ts": 1716454224476974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329599, "dur": 12, "args": { "External id": 191695, "cbid": 211, "correlation": 191695 } }, { "ph": "s", "id": 191695, "pid": 76337, "tid": -914061504, "ts": 1716454224329599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224477016, "dur": 3, "args": { "External id": 191697, "device": 5, "context": 1, "stream": 7, "correlation": 191697, "bytes": 46080, "memory bandwidth (GB/s)": 12.307692307692308 } }, { "ph": "f", "id": 191697, "pid": 5, "tid": 7, "ts": 1716454224477016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224329615, "dur": 20, "args": { "External id": 191697, "cbid": 51, "correlation": 191697 } }, { "ph": "s", "id": 191697, "pid": 76337, "tid": -914061504, "ts": 1716454224329615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224329640, "dur": 2, "args": { "External id": 191699, "cbid": 200, "correlation": 191699 } }, { "ph": "f", "id": 191699, "pid": 76337, "tid": -914061504, "ts": 1716454224329640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224329642, "dur": 0, "args": { "External id": 191700, "cbid": 200, "correlation": 191700 } }, { "ph": "f", "id": 191700, "pid": 76337, "tid": -914061504, "ts": 1716454224329642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224329643, "dur": 0, "args": { "External id": 191701, "cbid": 200, "correlation": 191701 } }, { "ph": "f", "id": 191701, "pid": 76337, "tid": -914061504, "ts": 1716454224329643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224329643, "dur": 0, "args": { "External id": 191702, "cbid": 200, "correlation": 191702 } }, { "ph": "f", "id": 191702, "pid": 76337, "tid": -914061504, "ts": 1716454224329643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454224329644, "dur": 4, "args": { "External id": 191703, "cbid": 15, "correlation": 191703 } }, { "ph": "f", "id": 191703, "pid": 76337, "tid": -914061504, "ts": 1716454224329644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224329649, "dur": 1, "args": { "External id": 191704, "cbid": 251, "correlation": 191704 } }, { "ph": "f", "id": 191704, "pid": 76337, "tid": -914061504, "ts": 1716454224329649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454224477021, "dur": 23, "args": { "External id": 191705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191705, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191705, "pid": 5, "tid": 7, "ts": 1716454224477021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329652, "dur": 8, "args": { "External id": 191705, "cbid": 211, "correlation": 191705 } }, { "ph": "s", "id": 191705, "pid": 76337, "tid": -914061504, "ts": 1716454224329652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224477045, "dur": 4, "args": { "External id": 191707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191707, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 191707, "pid": 5, "tid": 7, "ts": 1716454224477045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329666, "dur": 6, "args": { "External id": 191707, "cbid": 211, "correlation": 191707 } }, { "ph": "s", "id": 191707, "pid": 76337, "tid": -914061504, "ts": 1716454224329666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224329676, "dur": 0, "args": { "External id": 191708, "cbid": 51, "correlation": 191708 } }, { "ph": "s", "id": 191708, "pid": 76337, "tid": -914061504, "ts": 1716454224329676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224477051, "dur": 191, "args": { "External id": 191709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191709, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191709, "pid": 5, "tid": 7, "ts": 1716454224477051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329677, "dur": 194, "args": { "External id": 191709, "cbid": 211, "correlation": 191709 } }, { "ph": "s", "id": 191709, "pid": 76337, "tid": -914061504, "ts": 1716454224329677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224477243, "dur": 7, "args": { "External id": 191710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191710, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191710, "pid": 5, "tid": 7, "ts": 1716454224477243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329875, "dur": 5, "args": { "External id": 191710, "cbid": 211, "correlation": 191710 } }, { "ph": "s", "id": 191710, "pid": 76337, "tid": -914061504, "ts": 1716454224329875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224477251, "dur": 5, "args": { "External id": 191716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191716, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 191716, "pid": 5, "tid": 7, "ts": 1716454224477251, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224329905, "dur": 9, "args": { "External id": 191716, "cbid": 211, "correlation": 191716 } }, { "ph": "s", "id": 191716, "pid": 76337, "tid": -914061504, "ts": 1716454224329905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477258, "dur": 3, "args": { "External id": 191724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191724, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191724, "pid": 5, "tid": 7, "ts": 1716454224477258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331609, "dur": 17, "args": { "External id": 191724, "cbid": 211, "correlation": 191724 } }, { "ph": "s", "id": 191724, "pid": 76337, "tid": -914061504, "ts": 1716454224331609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477262, "dur": 3, "args": { "External id": 191732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191732, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191732, "pid": 5, "tid": 7, "ts": 1716454224477262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331650, "dur": 11, "args": { "External id": 191732, "cbid": 211, "correlation": 191732 } }, { "ph": "s", "id": 191732, "pid": 76337, "tid": -914061504, "ts": 1716454224331650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477267, "dur": 3, "args": { "External id": 191740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191740, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191740, "pid": 5, "tid": 7, "ts": 1716454224477267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331678, "dur": 8, "args": { "External id": 191740, "cbid": 211, "correlation": 191740 } }, { "ph": "s", "id": 191740, "pid": 76337, "tid": -914061504, "ts": 1716454224331678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477270, "dur": 3, "args": { "External id": 191749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191749, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191749, "pid": 5, "tid": 7, "ts": 1716454224477270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331864, "dur": 14, "args": { "External id": 191749, "cbid": 211, "correlation": 191749 } }, { "ph": "s", "id": 191749, "pid": 76337, "tid": -914061504, "ts": 1716454224331864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477275, "dur": 3, "args": { "External id": 191758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191758, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191758, "pid": 5, "tid": 7, "ts": 1716454224477275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331893, "dur": 7, "args": { "External id": 191758, "cbid": 211, "correlation": 191758 } }, { "ph": "s", "id": 191758, "pid": 76337, "tid": -914061504, "ts": 1716454224331893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477279, "dur": 3, "args": { "External id": 191766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191766, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191766, "pid": 5, "tid": 7, "ts": 1716454224477279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224331919, "dur": 8, "args": { "External id": 191766, "cbid": 211, "correlation": 191766 } }, { "ph": "s", "id": 191766, "pid": 76337, "tid": -914061504, "ts": 1716454224331919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477283, "dur": 3, "args": { "External id": 191774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191774, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191774, "pid": 5, "tid": 7, "ts": 1716454224477283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224332191, "dur": 16, "args": { "External id": 191774, "cbid": 211, "correlation": 191774 } }, { "ph": "s", "id": 191774, "pid": 76337, "tid": -914061504, "ts": 1716454224332191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477288, "dur": 3, "args": { "External id": 191782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191782, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191782, "pid": 5, "tid": 7, "ts": 1716454224477288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224332222, "dur": 7, "args": { "External id": 191782, "cbid": 211, "correlation": 191782 } }, { "ph": "s", "id": 191782, "pid": 76337, "tid": -914061504, "ts": 1716454224332222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224477293, "dur": 1, "args": { "External id": 191792, "device": 5, "context": 1, "stream": 7, "correlation": 191792, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 191792, "pid": 5, "tid": 7, "ts": 1716454224477293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224332288, "dur": 34, "args": { "External id": 191792, "cbid": 41, "correlation": 191792 } }, { "ph": "s", "id": 191792, "pid": 76337, "tid": -914061504, "ts": 1716454224332288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224332323, "dur": 145001, "args": { "External id": 191793, "cbid": 131, "correlation": 191793 } }, { "ph": "f", "id": 191793, "pid": 76337, "tid": -914061504, "ts": 1716454224332323, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224477592, "dur": 3, "args": { "External id": 191801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191801, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191801, "pid": 5, "tid": 7, "ts": 1716454224477592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224477540, "dur": 54, "args": { "External id": 191801, "cbid": 211, "correlation": 191801 } }, { "ph": "s", "id": 191801, "pid": 76337, "tid": -914061504, "ts": 1716454224477540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224477702, "dur": 3, "args": { "External id": 191810, "device": 5, "context": 1, "stream": 7, "correlation": 191810, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 191810, "pid": 5, "tid": 7, "ts": 1716454224477702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224477662, "dur": 41, "args": { "External id": 191810, "cbid": 41, "correlation": 191810 } }, { "ph": "s", "id": 191810, "pid": 76337, "tid": -914061504, "ts": 1716454224477662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224477809, "dur": 4, "args": { "External id": 191820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191820, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 191820, "pid": 5, "tid": 7, "ts": 1716454224477809, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224477792, "dur": 19, "args": { "External id": 191820, "cbid": 211, "correlation": 191820 } }, { "ph": "s", "id": 191820, "pid": 76337, "tid": -914061504, "ts": 1716454224477792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224477903, "dur": 1, "args": { "External id": 191830, "device": 5, "context": 1, "stream": 7, "correlation": 191830, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 191830, "pid": 5, "tid": 7, "ts": 1716454224477903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224477878, "dur": 24, "args": { "External id": 191830, "cbid": 41, "correlation": 191830 } }, { "ph": "s", "id": 191830, "pid": 76337, "tid": -914061504, "ts": 1716454224477878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224477902, "dur": 8, "args": { "External id": 191831, "cbid": 131, "correlation": 191831 } }, { "ph": "f", "id": 191831, "pid": 76337, "tid": -914061504, "ts": 1716454224477902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224477992, "dur": 3, "args": { "External id": 191838, "device": 5, "context": 1, "stream": 7, "correlation": 191838, "bytes": 98304, "memory bandwidth (GB/s)": 30.425255338904364 } }, { "ph": "f", "id": 191838, "pid": 5, "tid": 7, "ts": 1716454224477992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224477959, "dur": 33, "args": { "External id": 191838, "cbid": 41, "correlation": 191838 } }, { "ph": "s", "id": 191838, "pid": 76337, "tid": -914061504, "ts": 1716454224477959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224478108, "dur": 3, "args": { "External id": 191857, "device": 5, "context": 1, "stream": 7, "correlation": 191857, "bytes": 16, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 191857, "pid": 5, "tid": 7, "ts": 1716454224478108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224478083, "dur": 25, "args": { "External id": 191857, "cbid": 41, "correlation": 191857 } }, { "ph": "s", "id": 191857, "pid": 76337, "tid": -914061504, "ts": 1716454224478083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224478150, "dur": 3, "args": { "External id": 191863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191863, "pid": 5, "tid": 7, "ts": 1716454224478150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478139, "dur": 12, "args": { "External id": 191863, "cbid": 211, "correlation": 191863 } }, { "ph": "s", "id": 191863, "pid": 76337, "tid": -914061504, "ts": 1716454224478139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454224478164, "dur": 6, "args": { "External id": 191865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191865, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 191865, "pid": 5, "tid": 7, "ts": 1716454224478164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478154, "dur": 9, "args": { "External id": 191865, "cbid": 211, "correlation": 191865 } }, { "ph": "s", "id": 191865, "pid": 76337, "tid": -914061504, "ts": 1716454224478154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454224478172, "dur": 3, "args": { "External id": 191867, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191867, "pid": 5, "tid": 7, "ts": 1716454224478172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478164, "dur": 7, "args": { "External id": 191867, "cbid": 211, "correlation": 191867 } }, { "ph": "s", "id": 191867, "pid": 76337, "tid": -914061504, "ts": 1716454224478164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224478211, "dur": 2, "args": { "External id": 191875, "device": 5, "context": 1, "stream": 7, "correlation": 191875, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 191875, "pid": 5, "tid": 7, "ts": 1716454224478211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224478196, "dur": 14, "args": { "External id": 191875, "cbid": 41, "correlation": 191875 } }, { "ph": "s", "id": 191875, "pid": 76337, "tid": -914061504, "ts": 1716454224478196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224478264, "dur": 3, "args": { "External id": 191889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191889, "pid": 5, "tid": 7, "ts": 1716454224478264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478253, "dur": 12, "args": { "External id": 191889, "cbid": 211, "correlation": 191889 } }, { "ph": "s", "id": 191889, "pid": 76337, "tid": -914061504, "ts": 1716454224478253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224478287, "dur": 2, "args": { "External id": 191903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191903, "pid": 5, "tid": 7, "ts": 1716454224478287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478279, "dur": 7, "args": { "External id": 191903, "cbid": 211, "correlation": 191903 } }, { "ph": "s", "id": 191903, "pid": 76337, "tid": -914061504, "ts": 1716454224478279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224478329, "dur": 6, "args": { "External id": 191910, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191910, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191910, "pid": 5, "tid": 7, "ts": 1716454224478329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478318, "dur": 11, "args": { "External id": 191910, "cbid": 211, "correlation": 191910 } }, { "ph": "s", "id": 191910, "pid": 76337, "tid": -914061504, "ts": 1716454224478318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224478339, "dur": 6, "args": { "External id": 191913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191913, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191913, "pid": 5, "tid": 7, "ts": 1716454224478339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478332, "dur": 7, "args": { "External id": 191913, "cbid": 211, "correlation": 191913 } }, { "ph": "s", "id": 191913, "pid": 76337, "tid": -914061504, "ts": 1716454224478332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454224478350, "dur": 3, "args": { "External id": 191915, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191915, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191915, "pid": 5, "tid": 7, "ts": 1716454224478350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478341, "dur": 8, "args": { "External id": 191915, "cbid": 211, "correlation": 191915 } }, { "ph": "s", "id": 191915, "pid": 76337, "tid": -914061504, "ts": 1716454224478341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224478371, "dur": 2, "args": { "External id": 191918, "device": 5, "context": 1, "stream": 7, "correlation": 191918, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 191918, "pid": 5, "tid": 7, "ts": 1716454224478371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224478357, "dur": 13, "args": { "External id": 191918, "cbid": 41, "correlation": 191918 } }, { "ph": "s", "id": 191918, "pid": 76337, "tid": -914061504, "ts": 1716454224478357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224478429, "dur": 4, "args": { "External id": 191934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191934, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 191934, "pid": 5, "tid": 7, "ts": 1716454224478429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478416, "dur": 14, "args": { "External id": 191934, "cbid": 211, "correlation": 191934 } }, { "ph": "s", "id": 191934, "pid": 76337, "tid": -914061504, "ts": 1716454224478416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224478453, "dur": 3, "args": { "External id": 191939, "device": 5, "context": 1, "stream": 7, "correlation": 191939, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 191939, "pid": 5, "tid": 7, "ts": 1716454224478453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224478434, "dur": 18, "args": { "External id": 191939, "cbid": 41, "correlation": 191939 } }, { "ph": "s", "id": 191939, "pid": 76337, "tid": -914061504, "ts": 1716454224478434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224478518, "dur": 1, "args": { "External id": 191945, "device": 5, "context": 1, "stream": 7, "correlation": 191945, "bytes": 1, "memory bandwidth (GB/s)": 0.0005896226415094339 } }, { "ph": "f", "id": 191945, "pid": 5, "tid": 7, "ts": 1716454224478518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224478463, "dur": 64, "args": { "External id": 191945, "cbid": 41, "correlation": 191945 } }, { "ph": "s", "id": 191945, "pid": 76337, "tid": -914061504, "ts": 1716454224478463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224478528, "dur": 4, "args": { "External id": 191946, "cbid": 131, "correlation": 191946 } }, { "ph": "f", "id": 191946, "pid": 76337, "tid": -914061504, "ts": 1716454224478528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224478581, "dur": 3, "args": { "External id": 191954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191954, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191954, "pid": 5, "tid": 7, "ts": 1716454224478581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478568, "dur": 16, "args": { "External id": 191954, "cbid": 211, "correlation": 191954 } }, { "ph": "s", "id": 191954, "pid": 76337, "tid": -914061504, "ts": 1716454224478568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224478618, "dur": 3, "args": { "External id": 191964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191964, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191964, "pid": 5, "tid": 7, "ts": 1716454224478618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478608, "dur": 8, "args": { "External id": 191964, "cbid": 211, "correlation": 191964 } }, { "ph": "s", "id": 191964, "pid": 76337, "tid": -914061504, "ts": 1716454224478608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224478645, "dur": 3, "args": { "External id": 191973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191973, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191973, "pid": 5, "tid": 7, "ts": 1716454224478645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478635, "dur": 9, "args": { "External id": 191973, "cbid": 211, "correlation": 191973 } }, { "ph": "s", "id": 191973, "pid": 76337, "tid": -914061504, "ts": 1716454224478635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224478776, "dur": 12, "args": { "External id": 191983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191983, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191983, "pid": 5, "tid": 7, "ts": 1716454224478776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478762, "dur": 14, "args": { "External id": 191983, "cbid": 211, "correlation": 191983 } }, { "ph": "s", "id": 191983, "pid": 76337, "tid": -914061504, "ts": 1716454224478762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224478817, "dur": 3, "args": { "External id": 191991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 191991, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 191991, "pid": 5, "tid": 7, "ts": 1716454224478817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478808, "dur": 9, "args": { "External id": 191991, "cbid": 211, "correlation": 191991 } }, { "ph": "s", "id": 191991, "pid": 76337, "tid": -914061504, "ts": 1716454224478808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224478871, "dur": 12, "args": { "External id": 192001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192001, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192001, "pid": 5, "tid": 7, "ts": 1716454224478871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478854, "dur": 17, "args": { "External id": 192001, "cbid": 211, "correlation": 192001 } }, { "ph": "s", "id": 192001, "pid": 76337, "tid": -914061504, "ts": 1716454224478854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224478907, "dur": 10, "args": { "External id": 192009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192009, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192009, "pid": 5, "tid": 7, "ts": 1716454224478907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478896, "dur": 10, "args": { "External id": 192009, "cbid": 211, "correlation": 192009 } }, { "ph": "s", "id": 192009, "pid": 76337, "tid": -914061504, "ts": 1716454224478896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224478937, "dur": 3, "args": { "External id": 192018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192018, "pid": 5, "tid": 7, "ts": 1716454224478937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478927, "dur": 9, "args": { "External id": 192018, "cbid": 211, "correlation": 192018 } }, { "ph": "s", "id": 192018, "pid": 76337, "tid": -914061504, "ts": 1716454224478927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224478962, "dur": 5, "args": { "External id": 192027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192027, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192027, "pid": 5, "tid": 7, "ts": 1716454224478962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224478953, "dur": 8, "args": { "External id": 192027, "cbid": 211, "correlation": 192027 } }, { "ph": "s", "id": 192027, "pid": 76337, "tid": -914061504, "ts": 1716454224478953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224479019, "dur": 8, "args": { "External id": 192037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192037, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192037, "pid": 5, "tid": 7, "ts": 1716454224479019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479007, "dur": 11, "args": { "External id": 192037, "cbid": 211, "correlation": 192037 } }, { "ph": "s", "id": 192037, "pid": 76337, "tid": -914061504, "ts": 1716454224479007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479376, "dur": 3, "args": { "External id": 192046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192046, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192046, "pid": 5, "tid": 7, "ts": 1716454224479376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479361, "dur": 15, "args": { "External id": 192046, "cbid": 211, "correlation": 192046 } }, { "ph": "s", "id": 192046, "pid": 76337, "tid": -914061504, "ts": 1716454224479361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479407, "dur": 3, "args": { "External id": 192054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192054, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192054, "pid": 5, "tid": 7, "ts": 1716454224479407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479397, "dur": 10, "args": { "External id": 192054, "cbid": 211, "correlation": 192054 } }, { "ph": "s", "id": 192054, "pid": 76337, "tid": -914061504, "ts": 1716454224479397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224479459, "dur": 1, "args": { "External id": 192064, "device": 5, "context": 1, "stream": 7, "correlation": 192064, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 192064, "pid": 5, "tid": 7, "ts": 1716454224479459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224479443, "dur": 14, "args": { "External id": 192064, "cbid": 41, "correlation": 192064 } }, { "ph": "s", "id": 192064, "pid": 76337, "tid": -914061504, "ts": 1716454224479443, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224479458, "dur": 8, "args": { "External id": 192065, "cbid": 131, "correlation": 192065 } }, { "ph": "f", "id": 192065, "pid": 76337, "tid": -914061504, "ts": 1716454224479458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479549, "dur": 3, "args": { "External id": 192073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192073, "pid": 5, "tid": 7, "ts": 1716454224479549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479533, "dur": 16, "args": { "External id": 192073, "cbid": 211, "correlation": 192073 } }, { "ph": "s", "id": 192073, "pid": 76337, "tid": -914061504, "ts": 1716454224479533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224479621, "dur": 3, "args": { "External id": 192082, "device": 5, "context": 1, "stream": 7, "correlation": 192082, "bytes": 8, "memory bandwidth (GB/s)": 0.002577319587628866 } }, { "ph": "f", "id": 192082, "pid": 5, "tid": 7, "ts": 1716454224479621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224479603, "dur": 17, "args": { "External id": 192082, "cbid": 41, "correlation": 192082 } }, { "ph": "s", "id": 192082, "pid": 76337, "tid": -914061504, "ts": 1716454224479603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224479692, "dur": 3, "args": { "External id": 192092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192092, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192092, "pid": 5, "tid": 7, "ts": 1716454224479692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479678, "dur": 14, "args": { "External id": 192092, "cbid": 211, "correlation": 192092 } }, { "ph": "s", "id": 192092, "pid": 76337, "tid": -914061504, "ts": 1716454224479678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224479744, "dur": 1, "args": { "External id": 192102, "device": 5, "context": 1, "stream": 7, "correlation": 192102, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 192102, "pid": 5, "tid": 7, "ts": 1716454224479744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224479730, "dur": 13, "args": { "External id": 192102, "cbid": 41, "correlation": 192102 } }, { "ph": "s", "id": 192102, "pid": 76337, "tid": -914061504, "ts": 1716454224479730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224479743, "dur": 7, "args": { "External id": 192103, "cbid": 131, "correlation": 192103 } }, { "ph": "f", "id": 192103, "pid": 76337, "tid": -914061504, "ts": 1716454224479743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224479803, "dur": 3, "args": { "External id": 192110, "device": 5, "context": 1, "stream": 7, "correlation": 192110, "bytes": 98304, "memory bandwidth (GB/s)": 31.670103092783506 } }, { "ph": "f", "id": 192110, "pid": 5, "tid": 7, "ts": 1716454224479803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224479783, "dur": 20, "args": { "External id": 192110, "cbid": 41, "correlation": 192110 } }, { "ph": "s", "id": 192110, "pid": 76337, "tid": -914061504, "ts": 1716454224479783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224479851, "dur": 1, "args": { "External id": 192121, "device": 5, "context": 1, "stream": 7, "correlation": 192121, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 192121, "pid": 5, "tid": 7, "ts": 1716454224479851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224479838, "dur": 11, "args": { "External id": 192121, "cbid": 41, "correlation": 192121 } }, { "ph": "s", "id": 192121, "pid": 76337, "tid": -914061504, "ts": 1716454224479838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224479850, "dur": 8, "args": { "External id": 192122, "cbid": 131, "correlation": 192122 } }, { "ph": "f", "id": 192122, "pid": 76337, "tid": -914061504, "ts": 1716454224479850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479901, "dur": 3, "args": { "External id": 192130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192130, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192130, "pid": 5, "tid": 7, "ts": 1716454224479901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479887, "dur": 14, "args": { "External id": 192130, "cbid": 211, "correlation": 192130 } }, { "ph": "s", "id": 192130, "pid": 76337, "tid": -914061504, "ts": 1716454224479887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479929, "dur": 3, "args": { "External id": 192140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192140, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192140, "pid": 5, "tid": 7, "ts": 1716454224479929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479920, "dur": 8, "args": { "External id": 192140, "cbid": 211, "correlation": 192140 } }, { "ph": "s", "id": 192140, "pid": 76337, "tid": -914061504, "ts": 1716454224479920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224479952, "dur": 3, "args": { "External id": 192149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192149, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192149, "pid": 5, "tid": 7, "ts": 1716454224479952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224479942, "dur": 8, "args": { "External id": 192149, "cbid": 211, "correlation": 192149 } }, { "ph": "s", "id": 192149, "pid": 76337, "tid": -914061504, "ts": 1716454224479942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224480029, "dur": 6, "args": { "External id": 192157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192157, "pid": 5, "tid": 7, "ts": 1716454224480029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480016, "dur": 14, "args": { "External id": 192157, "cbid": 211, "correlation": 192157 } }, { "ph": "s", "id": 192157, "pid": 76337, "tid": -914061504, "ts": 1716454224480016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480070, "dur": 3, "args": { "External id": 192166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192166, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192166, "pid": 5, "tid": 7, "ts": 1716454224480070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480060, "dur": 9, "args": { "External id": 192166, "cbid": 211, "correlation": 192166 } }, { "ph": "s", "id": 192166, "pid": 76337, "tid": -914061504, "ts": 1716454224480060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480093, "dur": 3, "args": { "External id": 192175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192175, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192175, "pid": 5, "tid": 7, "ts": 1716454224480093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480084, "dur": 7, "args": { "External id": 192175, "cbid": 211, "correlation": 192175 } }, { "ph": "s", "id": 192175, "pid": 76337, "tid": -914061504, "ts": 1716454224480084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480171, "dur": 3, "args": { "External id": 192183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192183, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192183, "pid": 5, "tid": 7, "ts": 1716454224480171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480160, "dur": 10, "args": { "External id": 192183, "cbid": 211, "correlation": 192183 } }, { "ph": "s", "id": 192183, "pid": 76337, "tid": -914061504, "ts": 1716454224480160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224480234, "dur": 1, "args": { "External id": 192191, "device": 5, "context": 1, "stream": 7, "correlation": 192191, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 192191, "pid": 5, "tid": 7, "ts": 1716454224480234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480216, "dur": 28, "args": { "External id": 192191, "cbid": 41, "correlation": 192191 } }, { "ph": "s", "id": 192191, "pid": 76337, "tid": -914061504, "ts": 1716454224480216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480245, "dur": 3, "args": { "External id": 192192, "cbid": 131, "correlation": 192192 } }, { "ph": "f", "id": 192192, "pid": 76337, "tid": -914061504, "ts": 1716454224480245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224480310, "dur": 1, "args": { "External id": 192202, "device": 5, "context": 1, "stream": 7, "correlation": 192202, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 192202, "pid": 5, "tid": 7, "ts": 1716454224480310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480297, "dur": 11, "args": { "External id": 192202, "cbid": 41, "correlation": 192202 } }, { "ph": "s", "id": 192202, "pid": 76337, "tid": -914061504, "ts": 1716454224480297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480309, "dur": 7, "args": { "External id": 192203, "cbid": 131, "correlation": 192203 } }, { "ph": "f", "id": 192203, "pid": 76337, "tid": -914061504, "ts": 1716454224480309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224480367, "dur": 1, "args": { "External id": 192212, "device": 5, "context": 1, "stream": 7, "correlation": 192212, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 192212, "pid": 5, "tid": 7, "ts": 1716454224480367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480356, "dur": 9, "args": { "External id": 192212, "cbid": 41, "correlation": 192212 } }, { "ph": "s", "id": 192212, "pid": 76337, "tid": -914061504, "ts": 1716454224480356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480365, "dur": 8, "args": { "External id": 192213, "cbid": 131, "correlation": 192213 } }, { "ph": "f", "id": 192213, "pid": 76337, "tid": -914061504, "ts": 1716454224480365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224480445, "dur": 4, "args": { "External id": 192220, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192220, "pid": 5, "tid": 7, "ts": 1716454224480445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480428, "dur": 18, "args": { "External id": 192220, "cbid": 211, "correlation": 192220 } }, { "ph": "s", "id": 192220, "pid": 76337, "tid": -914061504, "ts": 1716454224480428, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454224480486, "dur": 4, "args": { "External id": 192240, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192240, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192240, "pid": 5, "tid": 7, "ts": 1716454224480486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480473, "dur": 13, "args": { "External id": 192240, "cbid": 211, "correlation": 192240 } }, { "ph": "s", "id": 192240, "pid": 76337, "tid": -914061504, "ts": 1716454224480473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480487, "dur": 0, "args": { "External id": 192241, "cbid": 11, "correlation": 192241 } }, { "ph": "f", "id": 192241, "pid": 76337, "tid": -914061504, "ts": 1716454224480487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480487, "dur": 0, "args": { "External id": 192242, "cbid": 11, "correlation": 192242 } }, { "ph": "f", "id": 192242, "pid": 76337, "tid": -914061504, "ts": 1716454224480487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224480504, "dur": 1, "args": { "External id": 192245, "device": 5, "context": 1, "stream": 7, "correlation": 192245, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 192245, "pid": 5, "tid": 7, "ts": 1716454224480504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480489, "dur": 24, "args": { "External id": 192245, "cbid": 41, "correlation": 192245 } }, { "ph": "s", "id": 192245, "pid": 76337, "tid": -914061504, "ts": 1716454224480489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480513, "dur": 3, "args": { "External id": 192246, "cbid": 131, "correlation": 192246 } }, { "ph": "f", "id": 192246, "pid": 76337, "tid": -914061504, "ts": 1716454224480513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224480543, "dur": 3, "args": { "External id": 192270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192270, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192270, "pid": 5, "tid": 7, "ts": 1716454224480543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480533, "dur": 9, "args": { "External id": 192270, "cbid": 211, "correlation": 192270 } }, { "ph": "s", "id": 192270, "pid": 76337, "tid": -914061504, "ts": 1716454224480533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480543, "dur": 0, "args": { "External id": 192271, "cbid": 11, "correlation": 192271 } }, { "ph": "f", "id": 192271, "pid": 76337, "tid": -914061504, "ts": 1716454224480543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480543, "dur": 0, "args": { "External id": 192272, "cbid": 11, "correlation": 192272 } }, { "ph": "f", "id": 192272, "pid": 76337, "tid": -914061504, "ts": 1716454224480543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224480545, "dur": 1, "args": { "External id": 192274, "cbid": 200, "correlation": 192274 } }, { "ph": "f", "id": 192274, "pid": 76337, "tid": -914061504, "ts": 1716454224480545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454224480555, "dur": 4, "args": { "External id": 192276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192276, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192276, "pid": 5, "tid": 7, "ts": 1716454224480555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480547, "dur": 8, "args": { "External id": 192276, "cbid": 211, "correlation": 192276 } }, { "ph": "s", "id": 192276, "pid": 76337, "tid": -914061504, "ts": 1716454224480547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480556, "dur": 0, "args": { "External id": 192277, "cbid": 11, "correlation": 192277 } }, { "ph": "f", "id": 192277, "pid": 76337, "tid": -914061504, "ts": 1716454224480556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224480556, "dur": 0, "args": { "External id": 192278, "cbid": 11, "correlation": 192278 } }, { "ph": "f", "id": 192278, "pid": 76337, "tid": -914061504, "ts": 1716454224480556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224480602, "dur": 1, "args": { "External id": 192285, "device": 5, "context": 1, "stream": 7, "correlation": 192285, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 192285, "pid": 5, "tid": 7, "ts": 1716454224480602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480588, "dur": 21, "args": { "External id": 192285, "cbid": 41, "correlation": 192285 } }, { "ph": "s", "id": 192285, "pid": 76337, "tid": -914061504, "ts": 1716454224480588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480611, "dur": 3, "args": { "External id": 192286, "cbid": 131, "correlation": 192286 } }, { "ph": "f", "id": 192286, "pid": 76337, "tid": -914061504, "ts": 1716454224480611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224480663, "dur": 1, "args": { "External id": 192296, "device": 5, "context": 1, "stream": 7, "correlation": 192296, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 192296, "pid": 5, "tid": 7, "ts": 1716454224480663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224480649, "dur": 12, "args": { "External id": 192296, "cbid": 41, "correlation": 192296 } }, { "ph": "s", "id": 192296, "pid": 76337, "tid": -914061504, "ts": 1716454224480649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224480662, "dur": 7, "args": { "External id": 192297, "cbid": 131, "correlation": 192297 } }, { "ph": "f", "id": 192297, "pid": 76337, "tid": -914061504, "ts": 1716454224480662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224480738, "dur": 5, "args": { "External id": 192304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192304, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192304, "pid": 5, "tid": 7, "ts": 1716454224480738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480723, "dur": 15, "args": { "External id": 192304, "cbid": 211, "correlation": 192304 } }, { "ph": "s", "id": 192304, "pid": 76337, "tid": -914061504, "ts": 1716454224480723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480817, "dur": 3, "args": { "External id": 192313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192313, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192313, "pid": 5, "tid": 7, "ts": 1716454224480817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480799, "dur": 19, "args": { "External id": 192313, "cbid": 211, "correlation": 192313 } }, { "ph": "s", "id": 192313, "pid": 76337, "tid": -914061504, "ts": 1716454224480799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480855, "dur": 3, "args": { "External id": 192321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192321, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192321, "pid": 5, "tid": 7, "ts": 1716454224480855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480845, "dur": 10, "args": { "External id": 192321, "cbid": 211, "correlation": 192321 } }, { "ph": "s", "id": 192321, "pid": 76337, "tid": -914061504, "ts": 1716454224480845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480891, "dur": 4, "args": { "External id": 192329, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192329, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192329, "pid": 5, "tid": 7, "ts": 1716454224480891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480879, "dur": 12, "args": { "External id": 192329, "cbid": 211, "correlation": 192329 } }, { "ph": "s", "id": 192329, "pid": 76337, "tid": -914061504, "ts": 1716454224480879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480923, "dur": 4, "args": { "External id": 192337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192337, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192337, "pid": 5, "tid": 7, "ts": 1716454224480923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480912, "dur": 11, "args": { "External id": 192337, "cbid": 211, "correlation": 192337 } }, { "ph": "s", "id": 192337, "pid": 76337, "tid": -914061504, "ts": 1716454224480912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480949, "dur": 3, "args": { "External id": 192345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192345, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192345, "pid": 5, "tid": 7, "ts": 1716454224480949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480941, "dur": 8, "args": { "External id": 192345, "cbid": 211, "correlation": 192345 } }, { "ph": "s", "id": 192345, "pid": 76337, "tid": -914061504, "ts": 1716454224480941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224480982, "dur": 3, "args": { "External id": 192353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192353, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192353, "pid": 5, "tid": 7, "ts": 1716454224480982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480966, "dur": 17, "args": { "External id": 192353, "cbid": 211, "correlation": 192353 } }, { "ph": "s", "id": 192353, "pid": 76337, "tid": -914061504, "ts": 1716454224480966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224481007, "dur": 4, "args": { "External id": 192361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192361, "pid": 5, "tid": 7, "ts": 1716454224481007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224480998, "dur": 8, "args": { "External id": 192361, "cbid": 211, "correlation": 192361 } }, { "ph": "s", "id": 192361, "pid": 76337, "tid": -914061504, "ts": 1716454224480998, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224481027, "dur": 5, "args": { "External id": 192369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192369, "pid": 5, "tid": 7, "ts": 1716454224481027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481018, "dur": 7, "args": { "External id": 192369, "cbid": 211, "correlation": 192369 } }, { "ph": "s", "id": 192369, "pid": 76337, "tid": -914061504, "ts": 1716454224481018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481046, "dur": 3, "args": { "External id": 192377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192377, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192377, "pid": 5, "tid": 7, "ts": 1716454224481046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481038, "dur": 7, "args": { "External id": 192377, "cbid": 211, "correlation": 192377 } }, { "ph": "s", "id": 192377, "pid": 76337, "tid": -914061504, "ts": 1716454224481038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481110, "dur": 3, "args": { "External id": 192385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192385, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192385, "pid": 5, "tid": 7, "ts": 1716454224481110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481099, "dur": 11, "args": { "External id": 192385, "cbid": 211, "correlation": 192385 } }, { "ph": "s", "id": 192385, "pid": 76337, "tid": -914061504, "ts": 1716454224481099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224481136, "dur": 4, "args": { "External id": 192393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192393, "pid": 5, "tid": 7, "ts": 1716454224481136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481128, "dur": 8, "args": { "External id": 192393, "cbid": 211, "correlation": 192393 } }, { "ph": "s", "id": 192393, "pid": 76337, "tid": -914061504, "ts": 1716454224481128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224481159, "dur": 4, "args": { "External id": 192401, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192401, "pid": 5, "tid": 7, "ts": 1716454224481159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481150, "dur": 7, "args": { "External id": 192401, "cbid": 211, "correlation": 192401 } }, { "ph": "s", "id": 192401, "pid": 76337, "tid": -914061504, "ts": 1716454224481150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481178, "dur": 3, "args": { "External id": 192409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192409, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 192409, "pid": 5, "tid": 7, "ts": 1716454224481178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481170, "dur": 6, "args": { "External id": 192409, "cbid": 211, "correlation": 192409 } }, { "ph": "s", "id": 192409, "pid": 76337, "tid": -914061504, "ts": 1716454224481170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224481618, "dur": 5, "args": { "External id": 192418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192418, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192418, "pid": 5, "tid": 7, "ts": 1716454224481618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481601, "dur": 17, "args": { "External id": 192418, "cbid": 211, "correlation": 192418 } }, { "ph": "s", "id": 192418, "pid": 76337, "tid": -914061504, "ts": 1716454224481601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224481657, "dur": 5, "args": { "External id": 192427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192427, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192427, "pid": 5, "tid": 7, "ts": 1716454224481657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481646, "dur": 10, "args": { "External id": 192427, "cbid": 211, "correlation": 192427 } }, { "ph": "s", "id": 192427, "pid": 76337, "tid": -914061504, "ts": 1716454224481646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224481805, "dur": 3, "args": { "External id": 192443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192443, "pid": 5, "tid": 7, "ts": 1716454224481805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481790, "dur": 15, "args": { "External id": 192443, "cbid": 211, "correlation": 192443 } }, { "ph": "s", "id": 192443, "pid": 76337, "tid": -914061504, "ts": 1716454224481790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481838, "dur": 3, "args": { "External id": 192451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192451, "pid": 5, "tid": 7, "ts": 1716454224481838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481829, "dur": 9, "args": { "External id": 192451, "cbid": 211, "correlation": 192451 } }, { "ph": "s", "id": 192451, "pid": 76337, "tid": -914061504, "ts": 1716454224481829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481871, "dur": 3, "args": { "External id": 192459, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192459, "pid": 5, "tid": 7, "ts": 1716454224481871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481861, "dur": 8, "args": { "External id": 192459, "cbid": 211, "correlation": 192459 } }, { "ph": "s", "id": 192459, "pid": 76337, "tid": -914061504, "ts": 1716454224481861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224481906, "dur": 4, "args": { "External id": 192467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192467, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192467, "pid": 5, "tid": 7, "ts": 1716454224481906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481895, "dur": 9, "args": { "External id": 192467, "cbid": 211, "correlation": 192467 } }, { "ph": "s", "id": 192467, "pid": 76337, "tid": -914061504, "ts": 1716454224481895, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224481962, "dur": 4, "args": { "External id": 192479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192479, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192479, "pid": 5, "tid": 7, "ts": 1716454224481962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224481948, "dur": 13, "args": { "External id": 192479, "cbid": 211, "correlation": 192479 } }, { "ph": "s", "id": 192479, "pid": 76337, "tid": -914061504, "ts": 1716454224481948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224482019, "dur": 4, "args": { "External id": 192490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192490, "pid": 5, "tid": 7, "ts": 1716454224482019, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482006, "dur": 13, "args": { "External id": 192490, "cbid": 211, "correlation": 192490 } }, { "ph": "s", "id": 192490, "pid": 76337, "tid": -914061504, "ts": 1716454224482006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224482052, "dur": 3, "args": { "External id": 192498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192498, "pid": 5, "tid": 7, "ts": 1716454224482052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482041, "dur": 10, "args": { "External id": 192498, "cbid": 211, "correlation": 192498 } }, { "ph": "s", "id": 192498, "pid": 76337, "tid": -914061504, "ts": 1716454224482041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224482088, "dur": 5, "args": { "External id": 192506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192506, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192506, "pid": 5, "tid": 7, "ts": 1716454224482088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482077, "dur": 11, "args": { "External id": 192506, "cbid": 211, "correlation": 192506 } }, { "ph": "s", "id": 192506, "pid": 76337, "tid": -914061504, "ts": 1716454224482077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224482121, "dur": 5, "args": { "External id": 192514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192514, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192514, "pid": 5, "tid": 7, "ts": 1716454224482121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482111, "dur": 10, "args": { "External id": 192514, "cbid": 211, "correlation": 192514 } }, { "ph": "s", "id": 192514, "pid": 76337, "tid": -914061504, "ts": 1716454224482111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224482153, "dur": 4, "args": { "External id": 192523, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192523, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192523, "pid": 5, "tid": 7, "ts": 1716454224482153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482142, "dur": 11, "args": { "External id": 192523, "cbid": 211, "correlation": 192523 } }, { "ph": "s", "id": 192523, "pid": 76337, "tid": -914061504, "ts": 1716454224482142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224482216, "dur": 4, "args": { "External id": 192536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192536, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192536, "pid": 5, "tid": 7, "ts": 1716454224482216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482203, "dur": 13, "args": { "External id": 192536, "cbid": 211, "correlation": 192536 } }, { "ph": "s", "id": 192536, "pid": 76337, "tid": -914061504, "ts": 1716454224482203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224482256, "dur": 5, "args": { "External id": 192546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192546, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 192546, "pid": 5, "tid": 7, "ts": 1716454224482256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482246, "dur": 10, "args": { "External id": 192546, "cbid": 211, "correlation": 192546 } }, { "ph": "s", "id": 192546, "pid": 76337, "tid": -914061504, "ts": 1716454224482246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224482396, "dur": 4, "args": { "External id": 192563, "cbid": 251, "correlation": 192563 } }, { "ph": "f", "id": 192563, "pid": 76337, "tid": -914061504, "ts": 1716454224482396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454224482426, "dur": 11, "args": { "External id": 192565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192565, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192565, "pid": 5, "tid": 7, "ts": 1716454224482426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482410, "dur": 18, "args": { "External id": 192565, "cbid": 211, "correlation": 192565 } }, { "ph": "s", "id": 192565, "pid": 76337, "tid": -914061504, "ts": 1716454224482410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224482499, "dur": 4, "args": { "External id": 192573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192573, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192573, "pid": 5, "tid": 7, "ts": 1716454224482499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482486, "dur": 12, "args": { "External id": 192573, "cbid": 211, "correlation": 192573 } }, { "ph": "s", "id": 192573, "pid": 76337, "tid": -914061504, "ts": 1716454224482486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224482559, "dur": 2, "args": { "External id": 192589, "cbid": 251, "correlation": 192589 } }, { "ph": "f", "id": 192589, "pid": 76337, "tid": -914061504, "ts": 1716454224482559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224482565, "dur": 0, "args": { "External id": 192591, "cbid": 251, "correlation": 192591 } }, { "ph": "f", "id": 192591, "pid": 76337, "tid": -914061504, "ts": 1716454224482565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224482582, "dur": 13, "args": { "External id": 192592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192592, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192592, "pid": 5, "tid": 7, "ts": 1716454224482582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482567, "dur": 14, "args": { "External id": 192592, "cbid": 211, "correlation": 192592 } }, { "ph": "s", "id": 192592, "pid": 76337, "tid": -914061504, "ts": 1716454224482567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224482597, "dur": 5, "args": { "External id": 192594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192594, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192594, "pid": 5, "tid": 7, "ts": 1716454224482597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482586, "dur": 9, "args": { "External id": 192594, "cbid": 211, "correlation": 192594 } }, { "ph": "s", "id": 192594, "pid": 76337, "tid": -914061504, "ts": 1716454224482586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224482711, "dur": 1, "args": { "External id": 192604, "cbid": 317, "correlation": 192604 } }, { "ph": "f", "id": 192604, "pid": 76337, "tid": -914061504, "ts": 1716454224482711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224482714, "dur": 1, "args": { "External id": 192605, "cbid": 203, "correlation": 192605 } }, { "ph": "f", "id": 192605, "pid": 76337, "tid": -914061504, "ts": 1716454224482714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224482716, "dur": 1, "args": { "External id": 192606, "cbid": 205, "correlation": 192606 } }, { "ph": "f", "id": 192606, "pid": 76337, "tid": -914061504, "ts": 1716454224482716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224482771, "dur": 7, "args": { "External id": 192610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192610, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192610, "pid": 5, "tid": 7, "ts": 1716454224482771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482756, "dur": 15, "args": { "External id": 192610, "cbid": 211, "correlation": 192610 } }, { "ph": "s", "id": 192610, "pid": 76337, "tid": -914061504, "ts": 1716454224482756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224482782, "dur": 4, "args": { "External id": 192612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192612, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 192612, "pid": 5, "tid": 7, "ts": 1716454224482782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482775, "dur": 6, "args": { "External id": 192612, "cbid": 211, "correlation": 192612 } }, { "ph": "s", "id": 192612, "pid": 76337, "tid": -914061504, "ts": 1716454224482775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224482803, "dur": 3, "args": { "External id": 192614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192614, "pid": 5, "tid": 7, "ts": 1716454224482803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482793, "dur": 8, "args": { "External id": 192614, "cbid": 211, "correlation": 192614 } }, { "ph": "s", "id": 192614, "pid": 76337, "tid": -914061504, "ts": 1716454224482793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224482808, "dur": 0, "args": { "External id": 192615, "cbid": 51, "correlation": 192615 } }, { "ph": "s", "id": 192615, "pid": 76337, "tid": -914061504, "ts": 1716454224482808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224482818, "dur": 86, "args": { "External id": 192616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192616, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192616, "pid": 5, "tid": 7, "ts": 1716454224482818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482810, "dur": 7, "args": { "External id": 192616, "cbid": 211, "correlation": 192616 } }, { "ph": "s", "id": 192616, "pid": 76337, "tid": -914061504, "ts": 1716454224482810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224482905, "dur": 60, "args": { "External id": 192621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192621, "pid": 5, "tid": 7, "ts": 1716454224482905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224482850, "dur": 11, "args": { "External id": 192621, "cbid": 211, "correlation": 192621 } }, { "ph": "s", "id": 192621, "pid": 76337, "tid": -914061504, "ts": 1716454224482850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224484757, "dur": 51, "args": { "External id": 192641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192641, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 192641, "pid": 5, "tid": 7, "ts": 1716454224484757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484739, "dur": 18, "args": { "External id": 192641, "cbid": 211, "correlation": 192641 } }, { "ph": "s", "id": 192641, "pid": 76337, "tid": -914061504, "ts": 1716454224484739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224484810, "dur": 4, "args": { "External id": 192653, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192653, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192653, "pid": 5, "tid": 7, "ts": 1716454224484810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484769, "dur": 8, "args": { "External id": 192653, "cbid": 211, "correlation": 192653 } }, { "ph": "s", "id": 192653, "pid": 76337, "tid": -914061504, "ts": 1716454224484769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224484815, "dur": 57, "args": { "External id": 192656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192656, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192656, "pid": 5, "tid": 7, "ts": 1716454224484815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484794, "dur": 9, "args": { "External id": 192656, "cbid": 211, "correlation": 192656 } }, { "ph": "s", "id": 192656, "pid": 76337, "tid": -914061504, "ts": 1716454224484794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224484873, "dur": 36, "args": { "External id": 192665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192665, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192665, "pid": 5, "tid": 7, "ts": 1716454224484873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484841, "dur": 11, "args": { "External id": 192665, "cbid": 211, "correlation": 192665 } }, { "ph": "s", "id": 192665, "pid": 76337, "tid": -914061504, "ts": 1716454224484841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224484897, "dur": 0, "args": { "External id": 192675, "cbid": 317, "correlation": 192675 } }, { "ph": "f", "id": 192675, "pid": 76337, "tid": -914061504, "ts": 1716454224484897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224484898, "dur": 0, "args": { "External id": 192676, "cbid": 203, "correlation": 192676 } }, { "ph": "f", "id": 192676, "pid": 76337, "tid": -914061504, "ts": 1716454224484898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224484899, "dur": 0, "args": { "External id": 192677, "cbid": 205, "correlation": 192677 } }, { "ph": "f", "id": 192677, "pid": 76337, "tid": -914061504, "ts": 1716454224484899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224484930, "dur": 40, "args": { "External id": 192681, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192681, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192681, "pid": 5, "tid": 7, "ts": 1716454224484930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484918, "dur": 12, "args": { "External id": 192681, "cbid": 211, "correlation": 192681 } }, { "ph": "s", "id": 192681, "pid": 76337, "tid": -914061504, "ts": 1716454224484918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224484972, "dur": 15, "args": { "External id": 192683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192683, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192683, "pid": 5, "tid": 7, "ts": 1716454224484972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484932, "dur": 6, "args": { "External id": 192683, "cbid": 211, "correlation": 192683 } }, { "ph": "s", "id": 192683, "pid": 76337, "tid": -914061504, "ts": 1716454224484932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224484988, "dur": 3, "args": { "External id": 192685, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192685, "pid": 5, "tid": 7, "ts": 1716454224484988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484945, "dur": 7, "args": { "External id": 192685, "cbid": 211, "correlation": 192685 } }, { "ph": "s", "id": 192685, "pid": 76337, "tid": -914061504, "ts": 1716454224484945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224484956, "dur": 0, "args": { "External id": 192686, "cbid": 51, "correlation": 192686 } }, { "ph": "s", "id": 192686, "pid": 76337, "tid": -914061504, "ts": 1716454224484956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224484993, "dur": 705, "args": { "External id": 192687, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192687, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192687, "pid": 5, "tid": 7, "ts": 1716454224484993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484957, "dur": 6, "args": { "External id": 192687, "cbid": 211, "correlation": 192687 } }, { "ph": "s", "id": 192687, "pid": 76337, "tid": -914061504, "ts": 1716454224484957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224485699, "dur": 60, "args": { "External id": 192692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192692, "pid": 5, "tid": 7, "ts": 1716454224485699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224484994, "dur": 11, "args": { "External id": 192692, "cbid": 211, "correlation": 192692 } }, { "ph": "s", "id": 192692, "pid": 76337, "tid": -914061504, "ts": 1716454224484994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224485760, "dur": 4, "args": { "External id": 192700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192700, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192700, "pid": 5, "tid": 7, "ts": 1716454224485760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485040, "dur": 9, "args": { "External id": 192700, "cbid": 211, "correlation": 192700 } }, { "ph": "s", "id": 192700, "pid": 76337, "tid": -914061504, "ts": 1716454224485040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485107, "dur": 2, "args": { "External id": 192716, "cbid": 251, "correlation": 192716 } }, { "ph": "f", "id": 192716, "pid": 76337, "tid": -914061504, "ts": 1716454224485107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485113, "dur": 0, "args": { "External id": 192718, "cbid": 251, "correlation": 192718 } }, { "ph": "f", "id": 192718, "pid": 76337, "tid": -914061504, "ts": 1716454224485113, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224485765, "dur": 9, "args": { "External id": 192719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192719, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 192719, "pid": 5, "tid": 7, "ts": 1716454224485765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485115, "dur": 11, "args": { "External id": 192719, "cbid": 211, "correlation": 192719 } }, { "ph": "s", "id": 192719, "pid": 76337, "tid": -914061504, "ts": 1716454224485115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224485775, "dur": 4, "args": { "External id": 192721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192721, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 192721, "pid": 5, "tid": 7, "ts": 1716454224485775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485128, "dur": 6, "args": { "External id": 192721, "cbid": 211, "correlation": 192721 } }, { "ph": "s", "id": 192721, "pid": 76337, "tid": -914061504, "ts": 1716454224485128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224485780, "dur": 54, "args": { "External id": 192731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192731, "pid": 5, "tid": 7, "ts": 1716454224485780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485186, "dur": 12, "args": { "External id": 192731, "cbid": 211, "correlation": 192731 } }, { "ph": "s", "id": 192731, "pid": 76337, "tid": -914061504, "ts": 1716454224485186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224485836, "dur": 51, "args": { "External id": 192751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192751, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 192751, "pid": 5, "tid": 7, "ts": 1716454224485836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485252, "dur": 11, "args": { "External id": 192751, "cbid": 211, "correlation": 192751 } }, { "ph": "s", "id": 192751, "pid": 76337, "tid": -914061504, "ts": 1716454224485252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224485888, "dur": 4, "args": { "External id": 192763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192763, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192763, "pid": 5, "tid": 7, "ts": 1716454224485888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485273, "dur": 6, "args": { "External id": 192763, "cbid": 211, "correlation": 192763 } }, { "ph": "s", "id": 192763, "pid": 76337, "tid": -914061504, "ts": 1716454224485273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224485894, "dur": 56, "args": { "External id": 192766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192766, "pid": 5, "tid": 7, "ts": 1716454224485894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485290, "dur": 7, "args": { "External id": 192766, "cbid": 211, "correlation": 192766 } }, { "ph": "s", "id": 192766, "pid": 76337, "tid": -914061504, "ts": 1716454224485290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224485951, "dur": 37, "args": { "External id": 192775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192775, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192775, "pid": 5, "tid": 7, "ts": 1716454224485951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485331, "dur": 10, "args": { "External id": 192775, "cbid": 211, "correlation": 192775 } }, { "ph": "s", "id": 192775, "pid": 76337, "tid": -914061504, "ts": 1716454224485331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224485407, "dur": 0, "args": { "External id": 192785, "cbid": 317, "correlation": 192785 } }, { "ph": "f", "id": 192785, "pid": 76337, "tid": -914061504, "ts": 1716454224485407, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224485408, "dur": 0, "args": { "External id": 192786, "cbid": 203, "correlation": 192786 } }, { "ph": "f", "id": 192786, "pid": 76337, "tid": -914061504, "ts": 1716454224485408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224485408, "dur": 0, "args": { "External id": 192787, "cbid": 205, "correlation": 192787 } }, { "ph": "f", "id": 192787, "pid": 76337, "tid": -914061504, "ts": 1716454224485408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224485989, "dur": 39, "args": { "External id": 192791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192791, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192791, "pid": 5, "tid": 7, "ts": 1716454224485989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485422, "dur": 12, "args": { "External id": 192791, "cbid": 211, "correlation": 192791 } }, { "ph": "s", "id": 192791, "pid": 76337, "tid": -914061504, "ts": 1716454224485422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224486029, "dur": 14, "args": { "External id": 192793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192793, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192793, "pid": 5, "tid": 7, "ts": 1716454224486029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485437, "dur": 5, "args": { "External id": 192793, "cbid": 211, "correlation": 192793 } }, { "ph": "s", "id": 192793, "pid": 76337, "tid": -914061504, "ts": 1716454224485437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224486045, "dur": 3, "args": { "External id": 192795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192795, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192795, "pid": 5, "tid": 7, "ts": 1716454224486045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485446, "dur": 6, "args": { "External id": 192795, "cbid": 211, "correlation": 192795 } }, { "ph": "s", "id": 192795, "pid": 76337, "tid": -914061504, "ts": 1716454224485446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224485455, "dur": 0, "args": { "External id": 192796, "cbid": 51, "correlation": 192796 } }, { "ph": "s", "id": 192796, "pid": 76337, "tid": -914061504, "ts": 1716454224485455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224486050, "dur": 697, "args": { "External id": 192797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192797, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192797, "pid": 5, "tid": 7, "ts": 1716454224486050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485456, "dur": 5, "args": { "External id": 192797, "cbid": 211, "correlation": 192797 } }, { "ph": "s", "id": 192797, "pid": 76337, "tid": -914061504, "ts": 1716454224485456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224486749, "dur": 60, "args": { "External id": 192802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192802, "pid": 5, "tid": 7, "ts": 1716454224486749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485484, "dur": 9, "args": { "External id": 192802, "cbid": 211, "correlation": 192802 } }, { "ph": "s", "id": 192802, "pid": 76337, "tid": -914061504, "ts": 1716454224485484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224486810, "dur": 50, "args": { "External id": 192810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192810, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192810, "pid": 5, "tid": 7, "ts": 1716454224486810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485517, "dur": 8, "args": { "External id": 192810, "cbid": 211, "correlation": 192810 } }, { "ph": "s", "id": 192810, "pid": 76337, "tid": -914061504, "ts": 1716454224485517, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224486861, "dur": 36, "args": { "External id": 192818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192818, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192818, "pid": 5, "tid": 7, "ts": 1716454224486861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485547, "dur": 10, "args": { "External id": 192818, "cbid": 211, "correlation": 192818 } }, { "ph": "s", "id": 192818, "pid": 76337, "tid": -914061504, "ts": 1716454224485547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224486898, "dur": 52, "args": { "External id": 192838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192838, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 192838, "pid": 5, "tid": 7, "ts": 1716454224486898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485635, "dur": 13, "args": { "External id": 192838, "cbid": 211, "correlation": 192838 } }, { "ph": "s", "id": 192838, "pid": 76337, "tid": -914061504, "ts": 1716454224485635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224486952, "dur": 4, "args": { "External id": 192850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192850, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 192850, "pid": 5, "tid": 7, "ts": 1716454224486952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485659, "dur": 7, "args": { "External id": 192850, "cbid": 211, "correlation": 192850 } }, { "ph": "s", "id": 192850, "pid": 76337, "tid": -914061504, "ts": 1716454224485659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224486957, "dur": 55, "args": { "External id": 192853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192853, "pid": 5, "tid": 7, "ts": 1716454224486957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485676, "dur": 7, "args": { "External id": 192853, "cbid": 211, "correlation": 192853 } }, { "ph": "s", "id": 192853, "pid": 76337, "tid": -914061504, "ts": 1716454224485676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224485735, "dur": 0, "args": { "External id": 192864, "cbid": 317, "correlation": 192864 } }, { "ph": "f", "id": 192864, "pid": 76337, "tid": -914061504, "ts": 1716454224485735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224485736, "dur": 0, "args": { "External id": 192865, "cbid": 203, "correlation": 192865 } }, { "ph": "f", "id": 192865, "pid": 76337, "tid": -914061504, "ts": 1716454224485736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224485736, "dur": 0, "args": { "External id": 192866, "cbid": 205, "correlation": 192866 } }, { "ph": "f", "id": 192866, "pid": 76337, "tid": -914061504, "ts": 1716454224485736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485772, "dur": 2, "args": { "External id": 192870, "cbid": 251, "correlation": 192870 } }, { "ph": "f", "id": 192870, "pid": 76337, "tid": -914061504, "ts": 1716454224485772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485775, "dur": 1, "args": { "External id": 192871, "cbid": 251, "correlation": 192871 } }, { "ph": "f", "id": 192871, "pid": 76337, "tid": -914061504, "ts": 1716454224485775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485777, "dur": 2, "args": { "External id": 192872, "cbid": 251, "correlation": 192872 } }, { "ph": "f", "id": 192872, "pid": 76337, "tid": -914061504, "ts": 1716454224485777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485780, "dur": 1, "args": { "External id": 192873, "cbid": 251, "correlation": 192873 } }, { "ph": "f", "id": 192873, "pid": 76337, "tid": -914061504, "ts": 1716454224485780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485782, "dur": 1, "args": { "External id": 192874, "cbid": 251, "correlation": 192874 } }, { "ph": "f", "id": 192874, "pid": 76337, "tid": -914061504, "ts": 1716454224485782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485784, "dur": 1, "args": { "External id": 192875, "cbid": 251, "correlation": 192875 } }, { "ph": "f", "id": 192875, "pid": 76337, "tid": -914061504, "ts": 1716454224485784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485786, "dur": 1, "args": { "External id": 192876, "cbid": 251, "correlation": 192876 } }, { "ph": "f", "id": 192876, "pid": 76337, "tid": -914061504, "ts": 1716454224485786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485788, "dur": 1, "args": { "External id": 192877, "cbid": 251, "correlation": 192877 } }, { "ph": "f", "id": 192877, "pid": 76337, "tid": -914061504, "ts": 1716454224485788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224485791, "dur": 0, "args": { "External id": 192878, "cbid": 251, "correlation": 192878 } }, { "ph": "f", "id": 192878, "pid": 76337, "tid": -914061504, "ts": 1716454224485791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224487013, "dur": 116, "args": { "External id": 192879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192879, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 192879, "pid": 5, "tid": 7, "ts": 1716454224487013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485795, "dur": 16, "args": { "External id": 192879, "cbid": 211, "correlation": 192879 } }, { "ph": "s", "id": 192879, "pid": 76337, "tid": -914061504, "ts": 1716454224485795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224487131, "dur": 60, "args": { "External id": 192885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192885, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192885, "pid": 5, "tid": 7, "ts": 1716454224487131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485834, "dur": 9, "args": { "External id": 192885, "cbid": 211, "correlation": 192885 } }, { "ph": "s", "id": 192885, "pid": 76337, "tid": -914061504, "ts": 1716454224485834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224487192, "dur": 498, "args": { "External id": 192894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192894, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192894, "pid": 5, "tid": 7, "ts": 1716454224487192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224485935, "dur": 15, "args": { "External id": 192894, "cbid": 211, "correlation": 192894 } }, { "ph": "s", "id": 192894, "pid": 76337, "tid": -914061504, "ts": 1716454224485935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224487691, "dur": 182, "args": { "External id": 192916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192916, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 192916, "pid": 5, "tid": 7, "ts": 1716454224487691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486020, "dur": 13, "args": { "External id": 192916, "cbid": 211, "correlation": 192916 } }, { "ph": "s", "id": 192916, "pid": 76337, "tid": -914061504, "ts": 1716454224486020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486146, "dur": 2, "args": { "External id": 192927, "cbid": 251, "correlation": 192927 } }, { "ph": "f", "id": 192927, "pid": 76337, "tid": -914061504, "ts": 1716454224486146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224487875, "dur": 198, "args": { "External id": 192928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192928, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192928, "pid": 5, "tid": 7, "ts": 1716454224487875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486153, "dur": 14, "args": { "External id": 192928, "cbid": 211, "correlation": 192928 } }, { "ph": "s", "id": 192928, "pid": 76337, "tid": -914061504, "ts": 1716454224486153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486230, "dur": 1, "args": { "External id": 192939, "cbid": 251, "correlation": 192939 } }, { "ph": "f", "id": 192939, "pid": 76337, "tid": -914061504, "ts": 1716454224486230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224488074, "dur": 186, "args": { "External id": 192940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192940, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192940, "pid": 5, "tid": 7, "ts": 1716454224488074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486234, "dur": 13, "args": { "External id": 192940, "cbid": 211, "correlation": 192940 } }, { "ph": "s", "id": 192940, "pid": 76337, "tid": -914061504, "ts": 1716454224486234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486300, "dur": 1, "args": { "External id": 192951, "cbid": 251, "correlation": 192951 } }, { "ph": "f", "id": 192951, "pid": 76337, "tid": -914061504, "ts": 1716454224486300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224488261, "dur": 187, "args": { "External id": 192952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192952, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192952, "pid": 5, "tid": 7, "ts": 1716454224488261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486304, "dur": 11, "args": { "External id": 192952, "cbid": 211, "correlation": 192952 } }, { "ph": "s", "id": 192952, "pid": 76337, "tid": -914061504, "ts": 1716454224486304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224488450, "dur": 18689, "args": { "External id": 192973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192973, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 192973, "pid": 5, "tid": 7, "ts": 1716454224488450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486423, "dur": 16, "args": { "External id": 192973, "cbid": 211, "correlation": 192973 } }, { "ph": "s", "id": 192973, "pid": 76337, "tid": -914061504, "ts": 1716454224486423, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486544, "dur": 2, "args": { "External id": 192991, "cbid": 251, "correlation": 192991 } }, { "ph": "f", "id": 192991, "pid": 76337, "tid": -914061504, "ts": 1716454224486544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224507140, "dur": 200, "args": { "External id": 192993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 192993, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 192993, "pid": 5, "tid": 7, "ts": 1716454224507140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486551, "dur": 13, "args": { "External id": 192993, "cbid": 211, "correlation": 192993 } }, { "ph": "s", "id": 192993, "pid": 76337, "tid": -914061504, "ts": 1716454224486551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224507342, "dur": 67, "args": { "External id": 193001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193001, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193001, "pid": 5, "tid": 7, "ts": 1716454224507342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486624, "dur": 13, "args": { "External id": 193001, "cbid": 211, "correlation": 193001 } }, { "ph": "s", "id": 193001, "pid": 76337, "tid": -914061504, "ts": 1716454224486624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224507410, "dur": 96, "args": { "External id": 193009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193009, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193009, "pid": 5, "tid": 7, "ts": 1716454224507410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486665, "dur": 9, "args": { "External id": 193009, "cbid": 211, "correlation": 193009 } }, { "ph": "s", "id": 193009, "pid": 76337, "tid": -914061504, "ts": 1716454224486665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224507507, "dur": 55, "args": { "External id": 193020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193020, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193020, "pid": 5, "tid": 7, "ts": 1716454224507507, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486756, "dur": 14, "args": { "External id": 193020, "cbid": 211, "correlation": 193020 } }, { "ph": "s", "id": 193020, "pid": 76337, "tid": -914061504, "ts": 1716454224486756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224507564, "dur": 92, "args": { "External id": 193042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193042, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193042, "pid": 5, "tid": 7, "ts": 1716454224507564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486792, "dur": 8, "args": { "External id": 193042, "cbid": 211, "correlation": 193042 } }, { "ph": "s", "id": 193042, "pid": 76337, "tid": -914061504, "ts": 1716454224486792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486877, "dur": 1, "args": { "External id": 193053, "cbid": 251, "correlation": 193053 } }, { "ph": "f", "id": 193053, "pid": 76337, "tid": -914061504, "ts": 1716454224486877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224507657, "dur": 105, "args": { "External id": 193054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193054, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193054, "pid": 5, "tid": 7, "ts": 1716454224507657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486882, "dur": 13, "args": { "External id": 193054, "cbid": 211, "correlation": 193054 } }, { "ph": "s", "id": 193054, "pid": 76337, "tid": -914061504, "ts": 1716454224486882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486964, "dur": 1, "args": { "External id": 193065, "cbid": 251, "correlation": 193065 } }, { "ph": "f", "id": 193065, "pid": 76337, "tid": -914061504, "ts": 1716454224486964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224486969, "dur": 0, "args": { "External id": 193066, "cbid": 251, "correlation": 193066 } }, { "ph": "f", "id": 193066, "pid": 76337, "tid": -914061504, "ts": 1716454224486969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224507764, "dur": 10, "args": { "External id": 193067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193067, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 193067, "pid": 5, "tid": 7, "ts": 1716454224507764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224486971, "dur": 26, "args": { "External id": 193067, "cbid": 211, "correlation": 193067 } }, { "ph": "s", "id": 193067, "pid": 76337, "tid": -914061504, "ts": 1716454224486971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224507775, "dur": 5, "args": { "External id": 193069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193069, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 193069, "pid": 5, "tid": 7, "ts": 1716454224507775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487001, "dur": 8, "args": { "External id": 193069, "cbid": 211, "correlation": 193069 } }, { "ph": "s", "id": 193069, "pid": 76337, "tid": -914061504, "ts": 1716454224487001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487067, "dur": 1, "args": { "External id": 193080, "cbid": 251, "correlation": 193080 } }, { "ph": "f", "id": 193080, "pid": 76337, "tid": -914061504, "ts": 1716454224487067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487070, "dur": 0, "args": { "External id": 193081, "cbid": 251, "correlation": 193081 } }, { "ph": "f", "id": 193081, "pid": 76337, "tid": -914061504, "ts": 1716454224487070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224507782, "dur": 6, "args": { "External id": 193082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193082, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 193082, "pid": 5, "tid": 7, "ts": 1716454224507782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487072, "dur": 13, "args": { "External id": 193082, "cbid": 211, "correlation": 193082 } }, { "ph": "s", "id": 193082, "pid": 76337, "tid": -914061504, "ts": 1716454224487072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224507789, "dur": 3, "args": { "External id": 193084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193084, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 193084, "pid": 5, "tid": 7, "ts": 1716454224507789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487086, "dur": 5, "args": { "External id": 193084, "cbid": 211, "correlation": 193084 } }, { "ph": "s", "id": 193084, "pid": 76337, "tid": -914061504, "ts": 1716454224487086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224507794, "dur": 156, "args": { "External id": 193105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193105, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193105, "pid": 5, "tid": 7, "ts": 1716454224507794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487159, "dur": 12, "args": { "External id": 193105, "cbid": 211, "correlation": 193105 } }, { "ph": "s", "id": 193105, "pid": 76337, "tid": -914061504, "ts": 1716454224487159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487257, "dur": 2, "args": { "External id": 193123, "cbid": 251, "correlation": 193123 } }, { "ph": "f", "id": 193123, "pid": 76337, "tid": -914061504, "ts": 1716454224487257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224507952, "dur": 106, "args": { "External id": 193125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193125, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193125, "pid": 5, "tid": 7, "ts": 1716454224507952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487263, "dur": 15, "args": { "External id": 193125, "cbid": 211, "correlation": 193125 } }, { "ph": "s", "id": 193125, "pid": 76337, "tid": -914061504, "ts": 1716454224487263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224508060, "dur": 34, "args": { "External id": 193133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193133, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193133, "pid": 5, "tid": 7, "ts": 1716454224508060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487339, "dur": 12, "args": { "External id": 193133, "cbid": 211, "correlation": 193133 } }, { "ph": "s", "id": 193133, "pid": 76337, "tid": -914061504, "ts": 1716454224487339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224508095, "dur": 68, "args": { "External id": 193141, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193141, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193141, "pid": 5, "tid": 7, "ts": 1716454224508095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487381, "dur": 9, "args": { "External id": 193141, "cbid": 211, "correlation": 193141 } }, { "ph": "s", "id": 193141, "pid": 76337, "tid": -914061504, "ts": 1716454224487381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224508165, "dur": 92, "args": { "External id": 193163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193163, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193163, "pid": 5, "tid": 7, "ts": 1716454224508165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487432, "dur": 10, "args": { "External id": 193163, "cbid": 211, "correlation": 193163 } }, { "ph": "s", "id": 193163, "pid": 76337, "tid": -914061504, "ts": 1716454224487432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487523, "dur": 1, "args": { "External id": 193179, "cbid": 251, "correlation": 193179 } }, { "ph": "f", "id": 193179, "pid": 76337, "tid": -914061504, "ts": 1716454224487523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224508259, "dur": 576, "args": { "External id": 193181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193181, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193181, "pid": 5, "tid": 7, "ts": 1716454224508259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487529, "dur": 13, "args": { "External id": 193181, "cbid": 211, "correlation": 193181 } }, { "ph": "s", "id": 193181, "pid": 76337, "tid": -914061504, "ts": 1716454224487529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224508836, "dur": 245, "args": { "External id": 193189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193189, "pid": 5, "tid": 7, "ts": 1716454224508836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487611, "dur": 15, "args": { "External id": 193189, "cbid": 211, "correlation": 193189 } }, { "ph": "s", "id": 193189, "pid": 76337, "tid": -914061504, "ts": 1716454224487611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224509083, "dur": 250, "args": { "External id": 193197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193197, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193197, "pid": 5, "tid": 7, "ts": 1716454224509083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487648, "dur": 8, "args": { "External id": 193197, "cbid": 211, "correlation": 193197 } }, { "ph": "s", "id": 193197, "pid": 76337, "tid": -914061504, "ts": 1716454224487648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487733, "dur": 2, "args": { "External id": 193213, "cbid": 251, "correlation": 193213 } }, { "ph": "f", "id": 193213, "pid": 76337, "tid": -914061504, "ts": 1716454224487733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487739, "dur": 0, "args": { "External id": 193215, "cbid": 251, "correlation": 193215 } }, { "ph": "f", "id": 193215, "pid": 76337, "tid": -914061504, "ts": 1716454224487739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224509334, "dur": 355, "args": { "External id": 193216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193216, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 193216, "pid": 5, "tid": 7, "ts": 1716454224509334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487744, "dur": 13, "args": { "External id": 193216, "cbid": 211, "correlation": 193216 } }, { "ph": "s", "id": 193216, "pid": 76337, "tid": -914061504, "ts": 1716454224487744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224509691, "dur": 49, "args": { "External id": 193224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193224, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193224, "pid": 5, "tid": 7, "ts": 1716454224509691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487787, "dur": 10, "args": { "External id": 193224, "cbid": 211, "correlation": 193224 } }, { "ph": "s", "id": 193224, "pid": 76337, "tid": -914061504, "ts": 1716454224487787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224509742, "dur": 157, "args": { "External id": 193235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193235, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193235, "pid": 5, "tid": 7, "ts": 1716454224509742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487861, "dur": 13, "args": { "External id": 193235, "cbid": 211, "correlation": 193235 } }, { "ph": "s", "id": 193235, "pid": 76337, "tid": -914061504, "ts": 1716454224487861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224487928, "dur": 0, "args": { "External id": 193247, "cbid": 317, "correlation": 193247 } }, { "ph": "f", "id": 193247, "pid": 76337, "tid": -914061504, "ts": 1716454224487928, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224487929, "dur": 0, "args": { "External id": 193248, "cbid": 203, "correlation": 193248 } }, { "ph": "f", "id": 193248, "pid": 76337, "tid": -914061504, "ts": 1716454224487929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224487930, "dur": 0, "args": { "External id": 193249, "cbid": 205, "correlation": 193249 } }, { "ph": "f", "id": 193249, "pid": 76337, "tid": -914061504, "ts": 1716454224487930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487952, "dur": 1, "args": { "External id": 193253, "cbid": 251, "correlation": 193253 } }, { "ph": "f", "id": 193253, "pid": 76337, "tid": -914061504, "ts": 1716454224487952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487954, "dur": 0, "args": { "External id": 193254, "cbid": 251, "correlation": 193254 } }, { "ph": "f", "id": 193254, "pid": 76337, "tid": -914061504, "ts": 1716454224487954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487955, "dur": 0, "args": { "External id": 193255, "cbid": 251, "correlation": 193255 } }, { "ph": "f", "id": 193255, "pid": 76337, "tid": -914061504, "ts": 1716454224487955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487956, "dur": 0, "args": { "External id": 193256, "cbid": 251, "correlation": 193256 } }, { "ph": "f", "id": 193256, "pid": 76337, "tid": -914061504, "ts": 1716454224487956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487956, "dur": 1, "args": { "External id": 193257, "cbid": 251, "correlation": 193257 } }, { "ph": "f", "id": 193257, "pid": 76337, "tid": -914061504, "ts": 1716454224487956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487958, "dur": 0, "args": { "External id": 193258, "cbid": 251, "correlation": 193258 } }, { "ph": "f", "id": 193258, "pid": 76337, "tid": -914061504, "ts": 1716454224487958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487958, "dur": 0, "args": { "External id": 193259, "cbid": 251, "correlation": 193259 } }, { "ph": "f", "id": 193259, "pid": 76337, "tid": -914061504, "ts": 1716454224487958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487959, "dur": 0, "args": { "External id": 193260, "cbid": 251, "correlation": 193260 } }, { "ph": "f", "id": 193260, "pid": 76337, "tid": -914061504, "ts": 1716454224487959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224487960, "dur": 0, "args": { "External id": 193261, "cbid": 251, "correlation": 193261 } }, { "ph": "f", "id": 193261, "pid": 76337, "tid": -914061504, "ts": 1716454224487960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224509900, "dur": 113, "args": { "External id": 193262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193262, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193262, "pid": 5, "tid": 7, "ts": 1716454224509900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224487963, "dur": 22, "args": { "External id": 193262, "cbid": 211, "correlation": 193262 } }, { "ph": "s", "id": 193262, "pid": 76337, "tid": -914061504, "ts": 1716454224487963, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224510015, "dur": 60, "args": { "External id": 193268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193268, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193268, "pid": 5, "tid": 7, "ts": 1716454224510015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488007, "dur": 9, "args": { "External id": 193268, "cbid": 211, "correlation": 193268 } }, { "ph": "s", "id": 193268, "pid": 76337, "tid": -914061504, "ts": 1716454224488007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224510076, "dur": 50, "args": { "External id": 193276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193276, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193276, "pid": 5, "tid": 7, "ts": 1716454224510076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488042, "dur": 8, "args": { "External id": 193276, "cbid": 211, "correlation": 193276 } }, { "ph": "s", "id": 193276, "pid": 76337, "tid": -914061504, "ts": 1716454224488042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224510127, "dur": 51, "args": { "External id": 193296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193296, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 193296, "pid": 5, "tid": 7, "ts": 1716454224510127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488115, "dur": 13, "args": { "External id": 193296, "cbid": 211, "correlation": 193296 } }, { "ph": "s", "id": 193296, "pid": 76337, "tid": -914061504, "ts": 1716454224488115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224510179, "dur": 4, "args": { "External id": 193308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193308, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193308, "pid": 5, "tid": 7, "ts": 1716454224510179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488138, "dur": 7, "args": { "External id": 193308, "cbid": 211, "correlation": 193308 } }, { "ph": "s", "id": 193308, "pid": 76337, "tid": -914061504, "ts": 1716454224488138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224510185, "dur": 57, "args": { "External id": 193311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193311, "pid": 5, "tid": 7, "ts": 1716454224510185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488156, "dur": 6, "args": { "External id": 193311, "cbid": 211, "correlation": 193311 } }, { "ph": "s", "id": 193311, "pid": 76337, "tid": -914061504, "ts": 1716454224488156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224510244, "dur": 37, "args": { "External id": 193320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193320, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193320, "pid": 5, "tid": 7, "ts": 1716454224510244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488196, "dur": 10, "args": { "External id": 193320, "cbid": 211, "correlation": 193320 } }, { "ph": "s", "id": 193320, "pid": 76337, "tid": -914061504, "ts": 1716454224488196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224488247, "dur": 0, "args": { "External id": 193330, "cbid": 317, "correlation": 193330 } }, { "ph": "f", "id": 193330, "pid": 76337, "tid": -914061504, "ts": 1716454224488247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224488247, "dur": 0, "args": { "External id": 193331, "cbid": 203, "correlation": 193331 } }, { "ph": "f", "id": 193331, "pid": 76337, "tid": -914061504, "ts": 1716454224488247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224488248, "dur": 0, "args": { "External id": 193332, "cbid": 205, "correlation": 193332 } }, { "ph": "f", "id": 193332, "pid": 76337, "tid": -914061504, "ts": 1716454224488248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224510282, "dur": 41, "args": { "External id": 193336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193336, "pid": 5, "tid": 7, "ts": 1716454224510282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488262, "dur": 12, "args": { "External id": 193336, "cbid": 211, "correlation": 193336 } }, { "ph": "s", "id": 193336, "pid": 76337, "tid": -914061504, "ts": 1716454224488262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224510324, "dur": 15, "args": { "External id": 193338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193338, "pid": 5, "tid": 7, "ts": 1716454224510324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488277, "dur": 6, "args": { "External id": 193338, "cbid": 211, "correlation": 193338 } }, { "ph": "s", "id": 193338, "pid": 76337, "tid": -914061504, "ts": 1716454224488277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224510340, "dur": 3, "args": { "External id": 193340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193340, "pid": 5, "tid": 7, "ts": 1716454224510340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488287, "dur": 5, "args": { "External id": 193340, "cbid": 211, "correlation": 193340 } }, { "ph": "s", "id": 193340, "pid": 76337, "tid": -914061504, "ts": 1716454224488287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224488296, "dur": 0, "args": { "External id": 193341, "cbid": 51, "correlation": 193341 } }, { "ph": "s", "id": 193341, "pid": 76337, "tid": -914061504, "ts": 1716454224488296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224510345, "dur": 703, "args": { "External id": 193342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193342, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193342, "pid": 5, "tid": 7, "ts": 1716454224510345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488296, "dur": 5, "args": { "External id": 193342, "cbid": 211, "correlation": 193342 } }, { "ph": "s", "id": 193342, "pid": 76337, "tid": -914061504, "ts": 1716454224488296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224511049, "dur": 59, "args": { "External id": 193347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193347, "pid": 5, "tid": 7, "ts": 1716454224511049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488323, "dur": 10, "args": { "External id": 193347, "cbid": 211, "correlation": 193347 } }, { "ph": "s", "id": 193347, "pid": 76337, "tid": -914061504, "ts": 1716454224488323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224511110, "dur": 3, "args": { "External id": 193355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193355, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193355, "pid": 5, "tid": 7, "ts": 1716454224511110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488366, "dur": 10, "args": { "External id": 193355, "cbid": 211, "correlation": 193355 } }, { "ph": "s", "id": 193355, "pid": 76337, "tid": -914061504, "ts": 1716454224488366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224488433, "dur": 1, "args": { "External id": 193371, "cbid": 251, "correlation": 193371 } }, { "ph": "f", "id": 193371, "pid": 76337, "tid": -914061504, "ts": 1716454224488433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224488438, "dur": 0, "args": { "External id": 193373, "cbid": 251, "correlation": 193373 } }, { "ph": "f", "id": 193373, "pid": 76337, "tid": -914061504, "ts": 1716454224488438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224511115, "dur": 11, "args": { "External id": 193374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193374, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 193374, "pid": 5, "tid": 7, "ts": 1716454224511115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488439, "dur": 11, "args": { "External id": 193374, "cbid": 211, "correlation": 193374 } }, { "ph": "s", "id": 193374, "pid": 76337, "tid": -914061504, "ts": 1716454224488439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224511128, "dur": 5, "args": { "External id": 193376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193376, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 193376, "pid": 5, "tid": 7, "ts": 1716454224511128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488453, "dur": 5, "args": { "External id": 193376, "cbid": 211, "correlation": 193376 } }, { "ph": "s", "id": 193376, "pid": 76337, "tid": -914061504, "ts": 1716454224488453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224511134, "dur": 52, "args": { "External id": 193386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193386, "pid": 5, "tid": 7, "ts": 1716454224511134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488511, "dur": 12, "args": { "External id": 193386, "cbid": 211, "correlation": 193386 } }, { "ph": "s", "id": 193386, "pid": 76337, "tid": -914061504, "ts": 1716454224488511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224511188, "dur": 50, "args": { "External id": 193406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193406, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 193406, "pid": 5, "tid": 7, "ts": 1716454224511188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488578, "dur": 11, "args": { "External id": 193406, "cbid": 211, "correlation": 193406 } }, { "ph": "s", "id": 193406, "pid": 76337, "tid": -914061504, "ts": 1716454224488578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224511239, "dur": 4, "args": { "External id": 193418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193418, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193418, "pid": 5, "tid": 7, "ts": 1716454224511239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488598, "dur": 10, "args": { "External id": 193418, "cbid": 211, "correlation": 193418 } }, { "ph": "s", "id": 193418, "pid": 76337, "tid": -914061504, "ts": 1716454224488598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224511244, "dur": 55, "args": { "External id": 193421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193421, "pid": 5, "tid": 7, "ts": 1716454224511244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488620, "dur": 7, "args": { "External id": 193421, "cbid": 211, "correlation": 193421 } }, { "ph": "s", "id": 193421, "pid": 76337, "tid": -914061504, "ts": 1716454224488620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224511301, "dur": 36, "args": { "External id": 193430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193430, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193430, "pid": 5, "tid": 7, "ts": 1716454224511301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488661, "dur": 10, "args": { "External id": 193430, "cbid": 211, "correlation": 193430 } }, { "ph": "s", "id": 193430, "pid": 76337, "tid": -914061504, "ts": 1716454224488661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224488726, "dur": 0, "args": { "External id": 193440, "cbid": 317, "correlation": 193440 } }, { "ph": "f", "id": 193440, "pid": 76337, "tid": -914061504, "ts": 1716454224488726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224488727, "dur": 0, "args": { "External id": 193441, "cbid": 203, "correlation": 193441 } }, { "ph": "f", "id": 193441, "pid": 76337, "tid": -914061504, "ts": 1716454224488727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224488728, "dur": 0, "args": { "External id": 193442, "cbid": 205, "correlation": 193442 } }, { "ph": "f", "id": 193442, "pid": 76337, "tid": -914061504, "ts": 1716454224488728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224511338, "dur": 40, "args": { "External id": 193446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193446, "pid": 5, "tid": 7, "ts": 1716454224511338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488741, "dur": 12, "args": { "External id": 193446, "cbid": 211, "correlation": 193446 } }, { "ph": "s", "id": 193446, "pid": 76337, "tid": -914061504, "ts": 1716454224488741, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224511380, "dur": 14, "args": { "External id": 193448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193448, "pid": 5, "tid": 7, "ts": 1716454224511380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488756, "dur": 5, "args": { "External id": 193448, "cbid": 211, "correlation": 193448 } }, { "ph": "s", "id": 193448, "pid": 76337, "tid": -914061504, "ts": 1716454224488756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224511395, "dur": 3, "args": { "External id": 193450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193450, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193450, "pid": 5, "tid": 7, "ts": 1716454224511395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488765, "dur": 5, "args": { "External id": 193450, "cbid": 211, "correlation": 193450 } }, { "ph": "s", "id": 193450, "pid": 76337, "tid": -914061504, "ts": 1716454224488765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224488774, "dur": 0, "args": { "External id": 193451, "cbid": 51, "correlation": 193451 } }, { "ph": "s", "id": 193451, "pid": 76337, "tid": -914061504, "ts": 1716454224488774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224511400, "dur": 697, "args": { "External id": 193452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193452, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193452, "pid": 5, "tid": 7, "ts": 1716454224511400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488774, "dur": 6, "args": { "External id": 193452, "cbid": 211, "correlation": 193452 } }, { "ph": "s", "id": 193452, "pid": 76337, "tid": -914061504, "ts": 1716454224488774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224512098, "dur": 59, "args": { "External id": 193457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193457, "pid": 5, "tid": 7, "ts": 1716454224512098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488802, "dur": 8, "args": { "External id": 193457, "cbid": 211, "correlation": 193457 } }, { "ph": "s", "id": 193457, "pid": 76337, "tid": -914061504, "ts": 1716454224488802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224512159, "dur": 50, "args": { "External id": 193465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193465, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193465, "pid": 5, "tid": 7, "ts": 1716454224512159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488835, "dur": 8, "args": { "External id": 193465, "cbid": 211, "correlation": 193465 } }, { "ph": "s", "id": 193465, "pid": 76337, "tid": -914061504, "ts": 1716454224488835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224512209, "dur": 35, "args": { "External id": 193473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193473, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193473, "pid": 5, "tid": 7, "ts": 1716454224512209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488863, "dur": 8, "args": { "External id": 193473, "cbid": 211, "correlation": 193473 } }, { "ph": "s", "id": 193473, "pid": 76337, "tid": -914061504, "ts": 1716454224488863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224512246, "dur": 52, "args": { "External id": 193493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193493, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 193493, "pid": 5, "tid": 7, "ts": 1716454224512246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488942, "dur": 12, "args": { "External id": 193493, "cbid": 211, "correlation": 193493 } }, { "ph": "s", "id": 193493, "pid": 76337, "tid": -914061504, "ts": 1716454224488942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224512299, "dur": 4, "args": { "External id": 193505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193505, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193505, "pid": 5, "tid": 7, "ts": 1716454224512299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488964, "dur": 6, "args": { "External id": 193505, "cbid": 211, "correlation": 193505 } }, { "ph": "s", "id": 193505, "pid": 76337, "tid": -914061504, "ts": 1716454224488964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224512304, "dur": 55, "args": { "External id": 193508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193508, "pid": 5, "tid": 7, "ts": 1716454224512304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224488990, "dur": 7, "args": { "External id": 193508, "cbid": 211, "correlation": 193508 } }, { "ph": "s", "id": 193508, "pid": 76337, "tid": -914061504, "ts": 1716454224488990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224489048, "dur": 0, "args": { "External id": 193519, "cbid": 317, "correlation": 193519 } }, { "ph": "f", "id": 193519, "pid": 76337, "tid": -914061504, "ts": 1716454224489048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224489049, "dur": 0, "args": { "External id": 193520, "cbid": 203, "correlation": 193520 } }, { "ph": "f", "id": 193520, "pid": 76337, "tid": -914061504, "ts": 1716454224489049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224489050, "dur": 0, "args": { "External id": 193521, "cbid": 205, "correlation": 193521 } }, { "ph": "f", "id": 193521, "pid": 76337, "tid": -914061504, "ts": 1716454224489050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489070, "dur": 1, "args": { "External id": 193525, "cbid": 251, "correlation": 193525 } }, { "ph": "f", "id": 193525, "pid": 76337, "tid": -914061504, "ts": 1716454224489070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489072, "dur": 0, "args": { "External id": 193526, "cbid": 251, "correlation": 193526 } }, { "ph": "f", "id": 193526, "pid": 76337, "tid": -914061504, "ts": 1716454224489072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489073, "dur": 0, "args": { "External id": 193527, "cbid": 251, "correlation": 193527 } }, { "ph": "f", "id": 193527, "pid": 76337, "tid": -914061504, "ts": 1716454224489073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489074, "dur": 0, "args": { "External id": 193528, "cbid": 251, "correlation": 193528 } }, { "ph": "f", "id": 193528, "pid": 76337, "tid": -914061504, "ts": 1716454224489074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489074, "dur": 0, "args": { "External id": 193529, "cbid": 251, "correlation": 193529 } }, { "ph": "f", "id": 193529, "pid": 76337, "tid": -914061504, "ts": 1716454224489074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489075, "dur": 0, "args": { "External id": 193530, "cbid": 251, "correlation": 193530 } }, { "ph": "f", "id": 193530, "pid": 76337, "tid": -914061504, "ts": 1716454224489075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489076, "dur": 0, "args": { "External id": 193531, "cbid": 251, "correlation": 193531 } }, { "ph": "f", "id": 193531, "pid": 76337, "tid": -914061504, "ts": 1716454224489076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489076, "dur": 0, "args": { "External id": 193532, "cbid": 251, "correlation": 193532 } }, { "ph": "f", "id": 193532, "pid": 76337, "tid": -914061504, "ts": 1716454224489076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489078, "dur": 0, "args": { "External id": 193533, "cbid": 251, "correlation": 193533 } }, { "ph": "f", "id": 193533, "pid": 76337, "tid": -914061504, "ts": 1716454224489078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224512360, "dur": 112, "args": { "External id": 193534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193534, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193534, "pid": 5, "tid": 7, "ts": 1716454224512360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489080, "dur": 13, "args": { "External id": 193534, "cbid": 211, "correlation": 193534 } }, { "ph": "s", "id": 193534, "pid": 76337, "tid": -914061504, "ts": 1716454224489080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224512473, "dur": 60, "args": { "External id": 193540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193540, "pid": 5, "tid": 7, "ts": 1716454224512473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489115, "dur": 9, "args": { "External id": 193540, "cbid": 211, "correlation": 193540 } }, { "ph": "s", "id": 193540, "pid": 76337, "tid": -914061504, "ts": 1716454224489115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224512534, "dur": 653, "args": { "External id": 193549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193549, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193549, "pid": 5, "tid": 7, "ts": 1716454224512534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489198, "dur": 13, "args": { "External id": 193549, "cbid": 211, "correlation": 193549 } }, { "ph": "s", "id": 193549, "pid": 76337, "tid": -914061504, "ts": 1716454224489198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224513188, "dur": 182, "args": { "External id": 193571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193571, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193571, "pid": 5, "tid": 7, "ts": 1716454224513188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489258, "dur": 11, "args": { "External id": 193571, "cbid": 211, "correlation": 193571 } }, { "ph": "s", "id": 193571, "pid": 76337, "tid": -914061504, "ts": 1716454224489258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489346, "dur": 1, "args": { "External id": 193582, "cbid": 251, "correlation": 193582 } }, { "ph": "f", "id": 193582, "pid": 76337, "tid": -914061504, "ts": 1716454224489346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224513371, "dur": 196, "args": { "External id": 193583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193583, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193583, "pid": 5, "tid": 7, "ts": 1716454224513371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489351, "dur": 13, "args": { "External id": 193583, "cbid": 211, "correlation": 193583 } }, { "ph": "s", "id": 193583, "pid": 76337, "tid": -914061504, "ts": 1716454224489351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489422, "dur": 1, "args": { "External id": 193594, "cbid": 251, "correlation": 193594 } }, { "ph": "f", "id": 193594, "pid": 76337, "tid": -914061504, "ts": 1716454224489422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224513568, "dur": 185, "args": { "External id": 193595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193595, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193595, "pid": 5, "tid": 7, "ts": 1716454224513568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489427, "dur": 13, "args": { "External id": 193595, "cbid": 211, "correlation": 193595 } }, { "ph": "s", "id": 193595, "pid": 76337, "tid": -914061504, "ts": 1716454224489427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489490, "dur": 1, "args": { "External id": 193606, "cbid": 251, "correlation": 193606 } }, { "ph": "f", "id": 193606, "pid": 76337, "tid": -914061504, "ts": 1716454224489490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224513755, "dur": 186, "args": { "External id": 193607, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193607, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193607, "pid": 5, "tid": 7, "ts": 1716454224513755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489495, "dur": 11, "args": { "External id": 193607, "cbid": 211, "correlation": 193607 } }, { "ph": "s", "id": 193607, "pid": 76337, "tid": -914061504, "ts": 1716454224489495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224513942, "dur": 18696, "args": { "External id": 193628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193628, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193628, "pid": 5, "tid": 7, "ts": 1716454224513942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489574, "dur": 14, "args": { "External id": 193628, "cbid": 211, "correlation": 193628 } }, { "ph": "s", "id": 193628, "pid": 76337, "tid": -914061504, "ts": 1716454224489574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489673, "dur": 1, "args": { "External id": 193646, "cbid": 251, "correlation": 193646 } }, { "ph": "f", "id": 193646, "pid": 76337, "tid": -914061504, "ts": 1716454224489673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224532638, "dur": 203, "args": { "External id": 193648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193648, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193648, "pid": 5, "tid": 7, "ts": 1716454224532638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489679, "dur": 14, "args": { "External id": 193648, "cbid": 211, "correlation": 193648 } }, { "ph": "s", "id": 193648, "pid": 76337, "tid": -914061504, "ts": 1716454224489679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224532842, "dur": 66, "args": { "External id": 193656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193656, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193656, "pid": 5, "tid": 7, "ts": 1716454224532842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489750, "dur": 12, "args": { "External id": 193656, "cbid": 211, "correlation": 193656 } }, { "ph": "s", "id": 193656, "pid": 76337, "tid": -914061504, "ts": 1716454224489750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224532910, "dur": 97, "args": { "External id": 193664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193664, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193664, "pid": 5, "tid": 7, "ts": 1716454224532910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489790, "dur": 8, "args": { "External id": 193664, "cbid": 211, "correlation": 193664 } }, { "ph": "s", "id": 193664, "pid": 76337, "tid": -914061504, "ts": 1716454224489790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224533008, "dur": 54, "args": { "External id": 193675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193675, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193675, "pid": 5, "tid": 7, "ts": 1716454224533008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489860, "dur": 12, "args": { "External id": 193675, "cbid": 211, "correlation": 193675 } }, { "ph": "s", "id": 193675, "pid": 76337, "tid": -914061504, "ts": 1716454224489860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224533064, "dur": 92, "args": { "External id": 193697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193697, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193697, "pid": 5, "tid": 7, "ts": 1716454224533064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489894, "dur": 7, "args": { "External id": 193697, "cbid": 211, "correlation": 193697 } }, { "ph": "s", "id": 193697, "pid": 76337, "tid": -914061504, "ts": 1716454224489894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224489986, "dur": 1, "args": { "External id": 193708, "cbid": 251, "correlation": 193708 } }, { "ph": "f", "id": 193708, "pid": 76337, "tid": -914061504, "ts": 1716454224489986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224533157, "dur": 105, "args": { "External id": 193709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193709, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193709, "pid": 5, "tid": 7, "ts": 1716454224533157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224489992, "dur": 13, "args": { "External id": 193709, "cbid": 211, "correlation": 193709 } }, { "ph": "s", "id": 193709, "pid": 76337, "tid": -914061504, "ts": 1716454224489992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490065, "dur": 1, "args": { "External id": 193720, "cbid": 251, "correlation": 193720 } }, { "ph": "f", "id": 193720, "pid": 76337, "tid": -914061504, "ts": 1716454224490065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490069, "dur": 0, "args": { "External id": 193721, "cbid": 251, "correlation": 193721 } }, { "ph": "f", "id": 193721, "pid": 76337, "tid": -914061504, "ts": 1716454224490069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224533264, "dur": 10, "args": { "External id": 193722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193722, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 193722, "pid": 5, "tid": 7, "ts": 1716454224533264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490071, "dur": 12, "args": { "External id": 193722, "cbid": 211, "correlation": 193722 } }, { "ph": "s", "id": 193722, "pid": 76337, "tid": -914061504, "ts": 1716454224490071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224533275, "dur": 5, "args": { "External id": 193724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193724, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 193724, "pid": 5, "tid": 7, "ts": 1716454224533275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490084, "dur": 7, "args": { "External id": 193724, "cbid": 211, "correlation": 193724 } }, { "ph": "s", "id": 193724, "pid": 76337, "tid": -914061504, "ts": 1716454224490084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490146, "dur": 1, "args": { "External id": 193735, "cbid": 251, "correlation": 193735 } }, { "ph": "f", "id": 193735, "pid": 76337, "tid": -914061504, "ts": 1716454224490146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490149, "dur": 0, "args": { "External id": 193736, "cbid": 251, "correlation": 193736 } }, { "ph": "f", "id": 193736, "pid": 76337, "tid": -914061504, "ts": 1716454224490149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224533282, "dur": 6, "args": { "External id": 193737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193737, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 193737, "pid": 5, "tid": 7, "ts": 1716454224533282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490150, "dur": 11, "args": { "External id": 193737, "cbid": 211, "correlation": 193737 } }, { "ph": "s", "id": 193737, "pid": 76337, "tid": -914061504, "ts": 1716454224490150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224533289, "dur": 3, "args": { "External id": 193739, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193739, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 193739, "pid": 5, "tid": 7, "ts": 1716454224533289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490163, "dur": 5, "args": { "External id": 193739, "cbid": 211, "correlation": 193739 } }, { "ph": "s", "id": 193739, "pid": 76337, "tid": -914061504, "ts": 1716454224490163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224533294, "dur": 154, "args": { "External id": 193760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193760, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193760, "pid": 5, "tid": 7, "ts": 1716454224533294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490238, "dur": 12, "args": { "External id": 193760, "cbid": 211, "correlation": 193760 } }, { "ph": "s", "id": 193760, "pid": 76337, "tid": -914061504, "ts": 1716454224490238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490333, "dur": 1, "args": { "External id": 193778, "cbid": 251, "correlation": 193778 } }, { "ph": "f", "id": 193778, "pid": 76337, "tid": -914061504, "ts": 1716454224490333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224533449, "dur": 106, "args": { "External id": 193780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193780, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193780, "pid": 5, "tid": 7, "ts": 1716454224533449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490340, "dur": 13, "args": { "External id": 193780, "cbid": 211, "correlation": 193780 } }, { "ph": "s", "id": 193780, "pid": 76337, "tid": -914061504, "ts": 1716454224490340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224533556, "dur": 35, "args": { "External id": 193788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193788, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193788, "pid": 5, "tid": 7, "ts": 1716454224533556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490408, "dur": 12, "args": { "External id": 193788, "cbid": 211, "correlation": 193788 } }, { "ph": "s", "id": 193788, "pid": 76337, "tid": -914061504, "ts": 1716454224490408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224533592, "dur": 66, "args": { "External id": 193796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193796, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193796, "pid": 5, "tid": 7, "ts": 1716454224533592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490449, "dur": 9, "args": { "External id": 193796, "cbid": 211, "correlation": 193796 } }, { "ph": "s", "id": 193796, "pid": 76337, "tid": -914061504, "ts": 1716454224490449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224533659, "dur": 92, "args": { "External id": 193818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193818, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193818, "pid": 5, "tid": 7, "ts": 1716454224533659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490501, "dur": 10, "args": { "External id": 193818, "cbid": 211, "correlation": 193818 } }, { "ph": "s", "id": 193818, "pid": 76337, "tid": -914061504, "ts": 1716454224490501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490586, "dur": 1, "args": { "External id": 193834, "cbid": 251, "correlation": 193834 } }, { "ph": "f", "id": 193834, "pid": 76337, "tid": -914061504, "ts": 1716454224490586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224533753, "dur": 576, "args": { "External id": 193836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193836, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193836, "pid": 5, "tid": 7, "ts": 1716454224533753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490592, "dur": 12, "args": { "External id": 193836, "cbid": 211, "correlation": 193836 } }, { "ph": "s", "id": 193836, "pid": 76337, "tid": -914061504, "ts": 1716454224490592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224534331, "dur": 242, "args": { "External id": 193844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193844, "pid": 5, "tid": 7, "ts": 1716454224534331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490656, "dur": 12, "args": { "External id": 193844, "cbid": 211, "correlation": 193844 } }, { "ph": "s", "id": 193844, "pid": 76337, "tid": -914061504, "ts": 1716454224490656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224534574, "dur": 251, "args": { "External id": 193852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193852, "pid": 5, "tid": 7, "ts": 1716454224534574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490686, "dur": 9, "args": { "External id": 193852, "cbid": 211, "correlation": 193852 } }, { "ph": "s", "id": 193852, "pid": 76337, "tid": -914061504, "ts": 1716454224490686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490768, "dur": 1, "args": { "External id": 193868, "cbid": 251, "correlation": 193868 } }, { "ph": "f", "id": 193868, "pid": 76337, "tid": -914061504, "ts": 1716454224490768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490773, "dur": 0, "args": { "External id": 193870, "cbid": 251, "correlation": 193870 } }, { "ph": "f", "id": 193870, "pid": 76337, "tid": -914061504, "ts": 1716454224490773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224534827, "dur": 358, "args": { "External id": 193871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193871, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 193871, "pid": 5, "tid": 7, "ts": 1716454224534827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490776, "dur": 13, "args": { "External id": 193871, "cbid": 211, "correlation": 193871 } }, { "ph": "s", "id": 193871, "pid": 76337, "tid": -914061504, "ts": 1716454224490776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224535186, "dur": 50, "args": { "External id": 193879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193879, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193879, "pid": 5, "tid": 7, "ts": 1716454224535186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490818, "dur": 10, "args": { "External id": 193879, "cbid": 211, "correlation": 193879 } }, { "ph": "s", "id": 193879, "pid": 76337, "tid": -914061504, "ts": 1716454224490818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224535238, "dur": 157, "args": { "External id": 193890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193890, "pid": 5, "tid": 7, "ts": 1716454224535238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490886, "dur": 13, "args": { "External id": 193890, "cbid": 211, "correlation": 193890 } }, { "ph": "s", "id": 193890, "pid": 76337, "tid": -914061504, "ts": 1716454224490886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224490951, "dur": 0, "args": { "External id": 193902, "cbid": 317, "correlation": 193902 } }, { "ph": "f", "id": 193902, "pid": 76337, "tid": -914061504, "ts": 1716454224490951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224490951, "dur": 0, "args": { "External id": 193903, "cbid": 203, "correlation": 193903 } }, { "ph": "f", "id": 193903, "pid": 76337, "tid": -914061504, "ts": 1716454224490951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224490952, "dur": 0, "args": { "External id": 193904, "cbid": 205, "correlation": 193904 } }, { "ph": "f", "id": 193904, "pid": 76337, "tid": -914061504, "ts": 1716454224490952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490980, "dur": 1, "args": { "External id": 193908, "cbid": 251, "correlation": 193908 } }, { "ph": "f", "id": 193908, "pid": 76337, "tid": -914061504, "ts": 1716454224490980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490982, "dur": 0, "args": { "External id": 193909, "cbid": 251, "correlation": 193909 } }, { "ph": "f", "id": 193909, "pid": 76337, "tid": -914061504, "ts": 1716454224490982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490983, "dur": 0, "args": { "External id": 193910, "cbid": 251, "correlation": 193910 } }, { "ph": "f", "id": 193910, "pid": 76337, "tid": -914061504, "ts": 1716454224490983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490984, "dur": 0, "args": { "External id": 193911, "cbid": 251, "correlation": 193911 } }, { "ph": "f", "id": 193911, "pid": 76337, "tid": -914061504, "ts": 1716454224490984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490985, "dur": 0, "args": { "External id": 193912, "cbid": 251, "correlation": 193912 } }, { "ph": "f", "id": 193912, "pid": 76337, "tid": -914061504, "ts": 1716454224490985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490985, "dur": 0, "args": { "External id": 193913, "cbid": 251, "correlation": 193913 } }, { "ph": "f", "id": 193913, "pid": 76337, "tid": -914061504, "ts": 1716454224490985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490986, "dur": 0, "args": { "External id": 193914, "cbid": 251, "correlation": 193914 } }, { "ph": "f", "id": 193914, "pid": 76337, "tid": -914061504, "ts": 1716454224490986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490987, "dur": 0, "args": { "External id": 193915, "cbid": 251, "correlation": 193915 } }, { "ph": "f", "id": 193915, "pid": 76337, "tid": -914061504, "ts": 1716454224490987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224490988, "dur": 0, "args": { "External id": 193916, "cbid": 251, "correlation": 193916 } }, { "ph": "f", "id": 193916, "pid": 76337, "tid": -914061504, "ts": 1716454224490988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224535396, "dur": 116, "args": { "External id": 193917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193917, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 193917, "pid": 5, "tid": 7, "ts": 1716454224535396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224490990, "dur": 13, "args": { "External id": 193917, "cbid": 211, "correlation": 193917 } }, { "ph": "s", "id": 193917, "pid": 76337, "tid": -914061504, "ts": 1716454224490990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224535514, "dur": 59, "args": { "External id": 193923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193923, "pid": 5, "tid": 7, "ts": 1716454224535514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491026, "dur": 9, "args": { "External id": 193923, "cbid": 211, "correlation": 193923 } }, { "ph": "s", "id": 193923, "pid": 76337, "tid": -914061504, "ts": 1716454224491026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224535574, "dur": 50, "args": { "External id": 193931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193931, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193931, "pid": 5, "tid": 7, "ts": 1716454224535574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491059, "dur": 8, "args": { "External id": 193931, "cbid": 211, "correlation": 193931 } }, { "ph": "s", "id": 193931, "pid": 76337, "tid": -914061504, "ts": 1716454224491059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224491130, "dur": 0, "args": { "External id": 193941, "cbid": 317, "correlation": 193941 } }, { "ph": "f", "id": 193941, "pid": 76337, "tid": -914061504, "ts": 1716454224491130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224491131, "dur": 0, "args": { "External id": 193942, "cbid": 203, "correlation": 193942 } }, { "ph": "f", "id": 193942, "pid": 76337, "tid": -914061504, "ts": 1716454224491131, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224491132, "dur": 0, "args": { "External id": 193943, "cbid": 205, "correlation": 193943 } }, { "ph": "f", "id": 193943, "pid": 76337, "tid": -914061504, "ts": 1716454224491132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224535625, "dur": 42, "args": { "External id": 193947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193947, "pid": 5, "tid": 7, "ts": 1716454224535625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491148, "dur": 12, "args": { "External id": 193947, "cbid": 211, "correlation": 193947 } }, { "ph": "s", "id": 193947, "pid": 76337, "tid": -914061504, "ts": 1716454224491148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224535668, "dur": 15, "args": { "External id": 193949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193949, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193949, "pid": 5, "tid": 7, "ts": 1716454224535668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491163, "dur": 5, "args": { "External id": 193949, "cbid": 211, "correlation": 193949 } }, { "ph": "s", "id": 193949, "pid": 76337, "tid": -914061504, "ts": 1716454224491163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224535685, "dur": 1, "args": { "External id": 193951, "device": 5, "context": 1, "stream": 7, "correlation": 193951, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 193951, "pid": 5, "tid": 7, "ts": 1716454224535685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224491181, "dur": 21, "args": { "External id": 193951, "cbid": 51, "correlation": 193951 } }, { "ph": "s", "id": 193951, "pid": 76337, "tid": -914061504, "ts": 1716454224491181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224535688, "dur": 364, "args": { "External id": 193952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193952, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 193952, "pid": 5, "tid": 7, "ts": 1716454224535688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491204, "dur": 10, "args": { "External id": 193952, "cbid": 211, "correlation": 193952 } }, { "ph": "s", "id": 193952, "pid": 76337, "tid": -914061504, "ts": 1716454224491204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536054, "dur": 13, "args": { "External id": 193954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193954, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193954, "pid": 5, "tid": 7, "ts": 1716454224536054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491222, "dur": 7, "args": { "External id": 193954, "cbid": 211, "correlation": 193954 } }, { "ph": "s", "id": 193954, "pid": 76337, "tid": -914061504, "ts": 1716454224491222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224536068, "dur": 15, "args": { "External id": 193960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193960, "pid": 5, "tid": 7, "ts": 1716454224536068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491253, "dur": 8, "args": { "External id": 193960, "cbid": 211, "correlation": 193960 } }, { "ph": "s", "id": 193960, "pid": 76337, "tid": -914061504, "ts": 1716454224491253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224536084, "dur": 19, "args": { "External id": 193980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193980, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 193980, "pid": 5, "tid": 7, "ts": 1716454224536084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491349, "dur": 12, "args": { "External id": 193980, "cbid": 211, "correlation": 193980 } }, { "ph": "s", "id": 193980, "pid": 76337, "tid": -914061504, "ts": 1716454224491349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224536105, "dur": 4, "args": { "External id": 193992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193992, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 193992, "pid": 5, "tid": 7, "ts": 1716454224536105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491371, "dur": 6, "args": { "External id": 193992, "cbid": 211, "correlation": 193992 } }, { "ph": "s", "id": 193992, "pid": 76337, "tid": -914061504, "ts": 1716454224491371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224536110, "dur": 17, "args": { "External id": 193995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 193995, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 193995, "pid": 5, "tid": 7, "ts": 1716454224536110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491390, "dur": 6, "args": { "External id": 193995, "cbid": 211, "correlation": 193995 } }, { "ph": "s", "id": 193995, "pid": 76337, "tid": -914061504, "ts": 1716454224491390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224536129, "dur": 12, "args": { "External id": 194004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194004, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194004, "pid": 5, "tid": 7, "ts": 1716454224536129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491429, "dur": 11, "args": { "External id": 194004, "cbid": 211, "correlation": 194004 } }, { "ph": "s", "id": 194004, "pid": 76337, "tid": -914061504, "ts": 1716454224491429, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224491490, "dur": 0, "args": { "External id": 194014, "cbid": 317, "correlation": 194014 } }, { "ph": "f", "id": 194014, "pid": 76337, "tid": -914061504, "ts": 1716454224491490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224491491, "dur": 0, "args": { "External id": 194015, "cbid": 203, "correlation": 194015 } }, { "ph": "f", "id": 194015, "pid": 76337, "tid": -914061504, "ts": 1716454224491491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224491492, "dur": 0, "args": { "External id": 194016, "cbid": 205, "correlation": 194016 } }, { "ph": "f", "id": 194016, "pid": 76337, "tid": -914061504, "ts": 1716454224491492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536142, "dur": 11, "args": { "External id": 194020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194020, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194020, "pid": 5, "tid": 7, "ts": 1716454224536142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491508, "dur": 12, "args": { "External id": 194020, "cbid": 211, "correlation": 194020 } }, { "ph": "s", "id": 194020, "pid": 76337, "tid": -914061504, "ts": 1716454224491508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536155, "dur": 24, "args": { "External id": 194022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194022, "pid": 5, "tid": 7, "ts": 1716454224536155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491522, "dur": 5, "args": { "External id": 194022, "cbid": 211, "correlation": 194022 } }, { "ph": "s", "id": 194022, "pid": 76337, "tid": -914061504, "ts": 1716454224491522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224536180, "dur": 4, "args": { "External id": 194024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 194024, "pid": 5, "tid": 7, "ts": 1716454224536180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491533, "dur": 6, "args": { "External id": 194024, "cbid": 211, "correlation": 194024 } }, { "ph": "s", "id": 194024, "pid": 76337, "tid": -914061504, "ts": 1716454224491533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224491543, "dur": 0, "args": { "External id": 194025, "cbid": 51, "correlation": 194025 } }, { "ph": "s", "id": 194025, "pid": 76337, "tid": -914061504, "ts": 1716454224491543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224536185, "dur": 358, "args": { "External id": 194026, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194026, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194026, "pid": 5, "tid": 7, "ts": 1716454224536185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491544, "dur": 7, "args": { "External id": 194026, "cbid": 211, "correlation": 194026 } }, { "ph": "s", "id": 194026, "pid": 76337, "tid": -914061504, "ts": 1716454224491544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536545, "dur": 20, "args": { "External id": 194027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194027, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194027, "pid": 5, "tid": 7, "ts": 1716454224536545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491554, "dur": 5, "args": { "External id": 194027, "cbid": 211, "correlation": 194027 } }, { "ph": "s", "id": 194027, "pid": 76337, "tid": -914061504, "ts": 1716454224491554, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224536566, "dur": 32, "args": { "External id": 194033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194033, "pid": 5, "tid": 7, "ts": 1716454224536566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491581, "dur": 9, "args": { "External id": 194033, "cbid": 211, "correlation": 194033 } }, { "ph": "s", "id": 194033, "pid": 76337, "tid": -914061504, "ts": 1716454224491581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224536599, "dur": 3, "args": { "External id": 194041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194041, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 194041, "pid": 5, "tid": 7, "ts": 1716454224536599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491627, "dur": 10, "args": { "External id": 194041, "cbid": 211, "correlation": 194041 } }, { "ph": "s", "id": 194041, "pid": 76337, "tid": -914061504, "ts": 1716454224491627, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224491696, "dur": 1, "args": { "External id": 194057, "cbid": 251, "correlation": 194057 } }, { "ph": "f", "id": 194057, "pid": 76337, "tid": -914061504, "ts": 1716454224491696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224491701, "dur": 0, "args": { "External id": 194059, "cbid": 251, "correlation": 194059 } }, { "ph": "f", "id": 194059, "pid": 76337, "tid": -914061504, "ts": 1716454224491701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224536604, "dur": 13, "args": { "External id": 194060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194060, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 194060, "pid": 5, "tid": 7, "ts": 1716454224536604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491703, "dur": 11, "args": { "External id": 194060, "cbid": 211, "correlation": 194060 } }, { "ph": "s", "id": 194060, "pid": 76337, "tid": -914061504, "ts": 1716454224491703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224536618, "dur": 5, "args": { "External id": 194062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194062, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 194062, "pid": 5, "tid": 7, "ts": 1716454224536618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491717, "dur": 5, "args": { "External id": 194062, "cbid": 211, "correlation": 194062 } }, { "ph": "s", "id": 194062, "pid": 76337, "tid": -914061504, "ts": 1716454224491717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224536624, "dur": 30, "args": { "External id": 194072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194072, "pid": 5, "tid": 7, "ts": 1716454224536624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491775, "dur": 13, "args": { "External id": 194072, "cbid": 211, "correlation": 194072 } }, { "ph": "s", "id": 194072, "pid": 76337, "tid": -914061504, "ts": 1716454224491775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224536655, "dur": 31, "args": { "External id": 194092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194092, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 194092, "pid": 5, "tid": 7, "ts": 1716454224536655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491848, "dur": 11, "args": { "External id": 194092, "cbid": 211, "correlation": 194092 } }, { "ph": "s", "id": 194092, "pid": 76337, "tid": -914061504, "ts": 1716454224491848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224536687, "dur": 4, "args": { "External id": 194104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194104, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 194104, "pid": 5, "tid": 7, "ts": 1716454224536687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491871, "dur": 6, "args": { "External id": 194104, "cbid": 211, "correlation": 194104 } }, { "ph": "s", "id": 194104, "pid": 76337, "tid": -914061504, "ts": 1716454224491871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224536693, "dur": 29, "args": { "External id": 194107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194107, "pid": 5, "tid": 7, "ts": 1716454224536693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491891, "dur": 6, "args": { "External id": 194107, "cbid": 211, "correlation": 194107 } }, { "ph": "s", "id": 194107, "pid": 76337, "tid": -914061504, "ts": 1716454224491891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224536724, "dur": 20, "args": { "External id": 194116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194116, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194116, "pid": 5, "tid": 7, "ts": 1716454224536724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224491932, "dur": 10, "args": { "External id": 194116, "cbid": 211, "correlation": 194116 } }, { "ph": "s", "id": 194116, "pid": 76337, "tid": -914061504, "ts": 1716454224491932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224492004, "dur": 0, "args": { "External id": 194126, "cbid": 317, "correlation": 194126 } }, { "ph": "f", "id": 194126, "pid": 76337, "tid": -914061504, "ts": 1716454224492004, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224492005, "dur": 0, "args": { "External id": 194127, "cbid": 203, "correlation": 194127 } }, { "ph": "f", "id": 194127, "pid": 76337, "tid": -914061504, "ts": 1716454224492005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224492005, "dur": 0, "args": { "External id": 194128, "cbid": 205, "correlation": 194128 } }, { "ph": "f", "id": 194128, "pid": 76337, "tid": -914061504, "ts": 1716454224492005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536746, "dur": 23, "args": { "External id": 194132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194132, "pid": 5, "tid": 7, "ts": 1716454224536746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492023, "dur": 13, "args": { "External id": 194132, "cbid": 211, "correlation": 194132 } }, { "ph": "s", "id": 194132, "pid": 76337, "tid": -914061504, "ts": 1716454224492023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224536770, "dur": 44, "args": { "External id": 194134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194134, "pid": 5, "tid": 7, "ts": 1716454224536770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492039, "dur": 5, "args": { "External id": 194134, "cbid": 211, "correlation": 194134 } }, { "ph": "s", "id": 194134, "pid": 76337, "tid": -914061504, "ts": 1716454224492039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224536816, "dur": 655, "args": { "External id": 194136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194136, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194136, "pid": 5, "tid": 7, "ts": 1716454224536816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492053, "dur": 9, "args": { "External id": 194136, "cbid": 211, "correlation": 194136 } }, { "ph": "s", "id": 194136, "pid": 76337, "tid": -914061504, "ts": 1716454224492053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224537472, "dur": 21, "args": { "External id": 194138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194138, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194138, "pid": 5, "tid": 7, "ts": 1716454224537472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492066, "dur": 5, "args": { "External id": 194138, "cbid": 211, "correlation": 194138 } }, { "ph": "s", "id": 194138, "pid": 76337, "tid": -914061504, "ts": 1716454224492066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224537494, "dur": 33, "args": { "External id": 194144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194144, "pid": 5, "tid": 7, "ts": 1716454224537494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492095, "dur": 12, "args": { "External id": 194144, "cbid": 211, "correlation": 194144 } }, { "ph": "s", "id": 194144, "pid": 76337, "tid": -914061504, "ts": 1716454224492095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224492158, "dur": 0, "args": { "External id": 194154, "cbid": 317, "correlation": 194154 } }, { "ph": "f", "id": 194154, "pid": 76337, "tid": -914061504, "ts": 1716454224492158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224492159, "dur": 0, "args": { "External id": 194155, "cbid": 203, "correlation": 194155 } }, { "ph": "f", "id": 194155, "pid": 76337, "tid": -914061504, "ts": 1716454224492159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224492160, "dur": 0, "args": { "External id": 194156, "cbid": 205, "correlation": 194156 } }, { "ph": "f", "id": 194156, "pid": 76337, "tid": -914061504, "ts": 1716454224492160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492181, "dur": 1, "args": { "External id": 194160, "cbid": 251, "correlation": 194160 } }, { "ph": "f", "id": 194160, "pid": 76337, "tid": -914061504, "ts": 1716454224492181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492183, "dur": 0, "args": { "External id": 194161, "cbid": 251, "correlation": 194161 } }, { "ph": "f", "id": 194161, "pid": 76337, "tid": -914061504, "ts": 1716454224492183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492184, "dur": 0, "args": { "External id": 194162, "cbid": 251, "correlation": 194162 } }, { "ph": "f", "id": 194162, "pid": 76337, "tid": -914061504, "ts": 1716454224492184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492185, "dur": 0, "args": { "External id": 194163, "cbid": 251, "correlation": 194163 } }, { "ph": "f", "id": 194163, "pid": 76337, "tid": -914061504, "ts": 1716454224492185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492186, "dur": 0, "args": { "External id": 194164, "cbid": 251, "correlation": 194164 } }, { "ph": "f", "id": 194164, "pid": 76337, "tid": -914061504, "ts": 1716454224492186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492187, "dur": 0, "args": { "External id": 194165, "cbid": 251, "correlation": 194165 } }, { "ph": "f", "id": 194165, "pid": 76337, "tid": -914061504, "ts": 1716454224492187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492188, "dur": 0, "args": { "External id": 194166, "cbid": 251, "correlation": 194166 } }, { "ph": "f", "id": 194166, "pid": 76337, "tid": -914061504, "ts": 1716454224492188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492188, "dur": 0, "args": { "External id": 194167, "cbid": 251, "correlation": 194167 } }, { "ph": "f", "id": 194167, "pid": 76337, "tid": -914061504, "ts": 1716454224492188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492190, "dur": 0, "args": { "External id": 194168, "cbid": 251, "correlation": 194168 } }, { "ph": "f", "id": 194168, "pid": 76337, "tid": -914061504, "ts": 1716454224492190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224537529, "dur": 52, "args": { "External id": 194169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194169, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 194169, "pid": 5, "tid": 7, "ts": 1716454224537529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492192, "dur": 12, "args": { "External id": 194169, "cbid": 211, "correlation": 194169 } }, { "ph": "s", "id": 194169, "pid": 76337, "tid": -914061504, "ts": 1716454224492192, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224537582, "dur": 32, "args": { "External id": 194175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194175, "pid": 5, "tid": 7, "ts": 1716454224537582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492224, "dur": 11, "args": { "External id": 194175, "cbid": 211, "correlation": 194175 } }, { "ph": "s", "id": 194175, "pid": 76337, "tid": -914061504, "ts": 1716454224492224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224537615, "dur": 27, "args": { "External id": 194183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194183, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194183, "pid": 5, "tid": 7, "ts": 1716454224537615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492256, "dur": 8, "args": { "External id": 194183, "cbid": 211, "correlation": 194183 } }, { "ph": "s", "id": 194183, "pid": 76337, "tid": -914061504, "ts": 1716454224492256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224537643, "dur": 20, "args": { "External id": 194191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194191, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194191, "pid": 5, "tid": 7, "ts": 1716454224537643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492288, "dur": 8, "args": { "External id": 194191, "cbid": 211, "correlation": 194191 } }, { "ph": "s", "id": 194191, "pid": 76337, "tid": -914061504, "ts": 1716454224492288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224537664, "dur": 30, "args": { "External id": 194211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194211, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 194211, "pid": 5, "tid": 7, "ts": 1716454224537664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492371, "dur": 13, "args": { "External id": 194211, "cbid": 211, "correlation": 194211 } }, { "ph": "s", "id": 194211, "pid": 76337, "tid": -914061504, "ts": 1716454224492371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224537695, "dur": 4, "args": { "External id": 194223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194223, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 194223, "pid": 5, "tid": 7, "ts": 1716454224537695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492394, "dur": 6, "args": { "External id": 194223, "cbid": 211, "correlation": 194223 } }, { "ph": "s", "id": 194223, "pid": 76337, "tid": -914061504, "ts": 1716454224492394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224537701, "dur": 29, "args": { "External id": 194226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194226, "pid": 5, "tid": 7, "ts": 1716454224537701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492411, "dur": 6, "args": { "External id": 194226, "cbid": 211, "correlation": 194226 } }, { "ph": "s", "id": 194226, "pid": 76337, "tid": -914061504, "ts": 1716454224492411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224492472, "dur": 0, "args": { "External id": 194237, "cbid": 317, "correlation": 194237 } }, { "ph": "f", "id": 194237, "pid": 76337, "tid": -914061504, "ts": 1716454224492472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224492473, "dur": 0, "args": { "External id": 194238, "cbid": 203, "correlation": 194238 } }, { "ph": "f", "id": 194238, "pid": 76337, "tid": -914061504, "ts": 1716454224492473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224492473, "dur": 0, "args": { "External id": 194239, "cbid": 205, "correlation": 194239 } }, { "ph": "f", "id": 194239, "pid": 76337, "tid": -914061504, "ts": 1716454224492473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224537732, "dur": 22, "args": { "External id": 194243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194243, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194243, "pid": 5, "tid": 7, "ts": 1716454224537732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492489, "dur": 12, "args": { "External id": 194243, "cbid": 211, "correlation": 194243 } }, { "ph": "s", "id": 194243, "pid": 76337, "tid": -914061504, "ts": 1716454224492489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224537755, "dur": 121, "args": { "External id": 194245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194245, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194245, "pid": 5, "tid": 7, "ts": 1716454224537755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492509, "dur": 8, "args": { "External id": 194245, "cbid": 211, "correlation": 194245 } }, { "ph": "s", "id": 194245, "pid": 76337, "tid": -914061504, "ts": 1716454224492509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224537877, "dur": 23, "args": { "External id": 194247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194247, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194247, "pid": 5, "tid": 7, "ts": 1716454224537877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492521, "dur": 5, "args": { "External id": 194247, "cbid": 211, "correlation": 194247 } }, { "ph": "s", "id": 194247, "pid": 76337, "tid": -914061504, "ts": 1716454224492521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224537902, "dur": 32, "args": { "External id": 194253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194253, "pid": 5, "tid": 7, "ts": 1716454224537902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492550, "dur": 8, "args": { "External id": 194253, "cbid": 211, "correlation": 194253 } }, { "ph": "s", "id": 194253, "pid": 76337, "tid": -914061504, "ts": 1716454224492550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224537935, "dur": 164, "args": { "External id": 194262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194262, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194262, "pid": 5, "tid": 7, "ts": 1716454224537935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492633, "dur": 14, "args": { "External id": 194262, "cbid": 211, "correlation": 194262 } }, { "ph": "s", "id": 194262, "pid": 76337, "tid": -914061504, "ts": 1716454224492633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224538100, "dur": 66, "args": { "External id": 194284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194284, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194284, "pid": 5, "tid": 7, "ts": 1716454224538100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492695, "dur": 10, "args": { "External id": 194284, "cbid": 211, "correlation": 194284 } }, { "ph": "s", "id": 194284, "pid": 76337, "tid": -914061504, "ts": 1716454224492695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492787, "dur": 1, "args": { "External id": 194295, "cbid": 251, "correlation": 194295 } }, { "ph": "f", "id": 194295, "pid": 76337, "tid": -914061504, "ts": 1716454224492787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224538167, "dur": 158, "args": { "External id": 194296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194296, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194296, "pid": 5, "tid": 7, "ts": 1716454224538167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492792, "dur": 13, "args": { "External id": 194296, "cbid": 211, "correlation": 194296 } }, { "ph": "s", "id": 194296, "pid": 76337, "tid": -914061504, "ts": 1716454224492792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492862, "dur": 1, "args": { "External id": 194307, "cbid": 251, "correlation": 194307 } }, { "ph": "f", "id": 194307, "pid": 76337, "tid": -914061504, "ts": 1716454224492862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224538326, "dur": 148, "args": { "External id": 194308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194308, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194308, "pid": 5, "tid": 7, "ts": 1716454224538326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492866, "dur": 11, "args": { "External id": 194308, "cbid": 211, "correlation": 194308 } }, { "ph": "s", "id": 194308, "pid": 76337, "tid": -914061504, "ts": 1716454224492866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224492930, "dur": 1, "args": { "External id": 194319, "cbid": 251, "correlation": 194319 } }, { "ph": "f", "id": 194319, "pid": 76337, "tid": -914061504, "ts": 1716454224492930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224538475, "dur": 143, "args": { "External id": 194320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194320, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194320, "pid": 5, "tid": 7, "ts": 1716454224538475, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224492934, "dur": 12, "args": { "External id": 194320, "cbid": 211, "correlation": 194320 } }, { "ph": "s", "id": 194320, "pid": 76337, "tid": -914061504, "ts": 1716454224492934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224538620, "dur": 1946, "args": { "External id": 194341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194341, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 194341, "pid": 5, "tid": 7, "ts": 1716454224538620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493027, "dur": 15, "args": { "External id": 194341, "cbid": 211, "correlation": 194341 } }, { "ph": "s", "id": 194341, "pid": 76337, "tid": -914061504, "ts": 1716454224493027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493128, "dur": 1, "args": { "External id": 194359, "cbid": 251, "correlation": 194359 } }, { "ph": "f", "id": 194359, "pid": 76337, "tid": -914061504, "ts": 1716454224493128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224540568, "dur": 146, "args": { "External id": 194361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194361, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 194361, "pid": 5, "tid": 7, "ts": 1716454224540568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493134, "dur": 14, "args": { "External id": 194361, "cbid": 211, "correlation": 194361 } }, { "ph": "s", "id": 194361, "pid": 76337, "tid": -914061504, "ts": 1716454224493134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224540715, "dur": 35, "args": { "External id": 194369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194369, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194369, "pid": 5, "tid": 7, "ts": 1716454224540715, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493206, "dur": 12, "args": { "External id": 194369, "cbid": 211, "correlation": 194369 } }, { "ph": "s", "id": 194369, "pid": 76337, "tid": -914061504, "ts": 1716454224493206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224540751, "dur": 51, "args": { "External id": 194377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194377, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194377, "pid": 5, "tid": 7, "ts": 1716454224540751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493246, "dur": 9, "args": { "External id": 194377, "cbid": 211, "correlation": 194377 } }, { "ph": "s", "id": 194377, "pid": 76337, "tid": -914061504, "ts": 1716454224493246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224540803, "dur": 31, "args": { "External id": 194388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194388, "pid": 5, "tid": 7, "ts": 1716454224540803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493318, "dur": 13, "args": { "External id": 194388, "cbid": 211, "correlation": 194388 } }, { "ph": "s", "id": 194388, "pid": 76337, "tid": -914061504, "ts": 1716454224493318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224540835, "dur": 34, "args": { "External id": 194410, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194410, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194410, "pid": 5, "tid": 7, "ts": 1716454224540835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493350, "dur": 7, "args": { "External id": 194410, "cbid": 211, "correlation": 194410 } }, { "ph": "s", "id": 194410, "pid": 76337, "tid": -914061504, "ts": 1716454224493350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493433, "dur": 1, "args": { "External id": 194421, "cbid": 251, "correlation": 194421 } }, { "ph": "f", "id": 194421, "pid": 76337, "tid": -914061504, "ts": 1716454224493433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224540871, "dur": 90, "args": { "External id": 194422, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194422, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194422, "pid": 5, "tid": 7, "ts": 1716454224540871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493438, "dur": 14, "args": { "External id": 194422, "cbid": 211, "correlation": 194422 } }, { "ph": "s", "id": 194422, "pid": 76337, "tid": -914061504, "ts": 1716454224493438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493508, "dur": 1, "args": { "External id": 194433, "cbid": 251, "correlation": 194433 } }, { "ph": "f", "id": 194433, "pid": 76337, "tid": -914061504, "ts": 1716454224493508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493511, "dur": 0, "args": { "External id": 194434, "cbid": 251, "correlation": 194434 } }, { "ph": "f", "id": 194434, "pid": 76337, "tid": -914061504, "ts": 1716454224493511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224540962, "dur": 12, "args": { "External id": 194435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194435, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 194435, "pid": 5, "tid": 7, "ts": 1716454224540962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493513, "dur": 12, "args": { "External id": 194435, "cbid": 211, "correlation": 194435 } }, { "ph": "s", "id": 194435, "pid": 76337, "tid": -914061504, "ts": 1716454224493513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224540975, "dur": 5, "args": { "External id": 194437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194437, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 194437, "pid": 5, "tid": 7, "ts": 1716454224540975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493527, "dur": 7, "args": { "External id": 194437, "cbid": 211, "correlation": 194437 } }, { "ph": "s", "id": 194437, "pid": 76337, "tid": -914061504, "ts": 1716454224493527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493587, "dur": 1, "args": { "External id": 194448, "cbid": 251, "correlation": 194448 } }, { "ph": "f", "id": 194448, "pid": 76337, "tid": -914061504, "ts": 1716454224493587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493590, "dur": 0, "args": { "External id": 194449, "cbid": 251, "correlation": 194449 } }, { "ph": "f", "id": 194449, "pid": 76337, "tid": -914061504, "ts": 1716454224493590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224540981, "dur": 7, "args": { "External id": 194450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194450, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 194450, "pid": 5, "tid": 7, "ts": 1716454224540981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493592, "dur": 12, "args": { "External id": 194450, "cbid": 211, "correlation": 194450 } }, { "ph": "s", "id": 194450, "pid": 76337, "tid": -914061504, "ts": 1716454224493592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224540990, "dur": 4, "args": { "External id": 194452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194452, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 194452, "pid": 5, "tid": 7, "ts": 1716454224540990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493605, "dur": 5, "args": { "External id": 194452, "cbid": 211, "correlation": 194452 } }, { "ph": "s", "id": 194452, "pid": 76337, "tid": -914061504, "ts": 1716454224493605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224540995, "dur": 90, "args": { "External id": 194473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194473, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 194473, "pid": 5, "tid": 7, "ts": 1716454224540995, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493678, "dur": 13, "args": { "External id": 194473, "cbid": 211, "correlation": 194473 } }, { "ph": "s", "id": 194473, "pid": 76337, "tid": -914061504, "ts": 1716454224493678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224493779, "dur": 1, "args": { "External id": 194491, "cbid": 251, "correlation": 194491 } }, { "ph": "f", "id": 194491, "pid": 76337, "tid": -914061504, "ts": 1716454224493779, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224541086, "dur": 98, "args": { "External id": 194493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194493, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194493, "pid": 5, "tid": 7, "ts": 1716454224541086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493785, "dur": 13, "args": { "External id": 194493, "cbid": 211, "correlation": 194493 } }, { "ph": "s", "id": 194493, "pid": 76337, "tid": -914061504, "ts": 1716454224493785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224541186, "dur": 19, "args": { "External id": 194501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194501, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194501, "pid": 5, "tid": 7, "ts": 1716454224541186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493855, "dur": 12, "args": { "External id": 194501, "cbid": 211, "correlation": 194501 } }, { "ph": "s", "id": 194501, "pid": 76337, "tid": -914061504, "ts": 1716454224493855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224541206, "dur": 37, "args": { "External id": 194509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194509, "pid": 5, "tid": 7, "ts": 1716454224541206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493896, "dur": 9, "args": { "External id": 194509, "cbid": 211, "correlation": 194509 } }, { "ph": "s", "id": 194509, "pid": 76337, "tid": -914061504, "ts": 1716454224493896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224541245, "dur": 34, "args": { "External id": 194531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194531, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194531, "pid": 5, "tid": 7, "ts": 1716454224541245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224493950, "dur": 12, "args": { "External id": 194531, "cbid": 211, "correlation": 194531 } }, { "ph": "s", "id": 194531, "pid": 76337, "tid": -914061504, "ts": 1716454224493950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224494050, "dur": 1, "args": { "External id": 194547, "cbid": 251, "correlation": 194547 } }, { "ph": "f", "id": 194547, "pid": 76337, "tid": -914061504, "ts": 1716454224494050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224494055, "dur": 0, "args": { "External id": 194549, "cbid": 251, "correlation": 194549 } }, { "ph": "f", "id": 194549, "pid": 76337, "tid": -914061504, "ts": 1716454224494055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224541281, "dur": 540, "args": { "External id": 194550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194550, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 194550, "pid": 5, "tid": 7, "ts": 1716454224541281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494059, "dur": 13, "args": { "External id": 194550, "cbid": 211, "correlation": 194550 } }, { "ph": "s", "id": 194550, "pid": 76337, "tid": -914061504, "ts": 1716454224494059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224541822, "dur": 125, "args": { "External id": 194558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194558, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194558, "pid": 5, "tid": 7, "ts": 1716454224541822, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494126, "dur": 13, "args": { "External id": 194558, "cbid": 211, "correlation": 194558 } }, { "ph": "s", "id": 194558, "pid": 76337, "tid": -914061504, "ts": 1716454224494126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224541948, "dur": 127, "args": { "External id": 194566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194566, "pid": 5, "tid": 7, "ts": 1716454224541948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494158, "dur": 8, "args": { "External id": 194566, "cbid": 211, "correlation": 194566 } }, { "ph": "s", "id": 194566, "pid": 76337, "tid": -914061504, "ts": 1716454224494158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224494236, "dur": 1, "args": { "External id": 194582, "cbid": 251, "correlation": 194582 } }, { "ph": "f", "id": 194582, "pid": 76337, "tid": -914061504, "ts": 1716454224494236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224542077, "dur": 257, "args": { "External id": 194584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194584, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194584, "pid": 5, "tid": 7, "ts": 1716454224542077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494241, "dur": 12, "args": { "External id": 194584, "cbid": 211, "correlation": 194584 } }, { "ph": "s", "id": 194584, "pid": 76337, "tid": -914061504, "ts": 1716454224494241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224542335, "dur": 27, "args": { "External id": 194592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194592, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194592, "pid": 5, "tid": 7, "ts": 1716454224542335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494283, "dur": 10, "args": { "External id": 194592, "cbid": 211, "correlation": 194592 } }, { "ph": "s", "id": 194592, "pid": 76337, "tid": -914061504, "ts": 1716454224494283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224542364, "dur": 81, "args": { "External id": 194603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194603, "pid": 5, "tid": 7, "ts": 1716454224542364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494351, "dur": 13, "args": { "External id": 194603, "cbid": 211, "correlation": 194603 } }, { "ph": "s", "id": 194603, "pid": 76337, "tid": -914061504, "ts": 1716454224494351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224494416, "dur": 0, "args": { "External id": 194615, "cbid": 317, "correlation": 194615 } }, { "ph": "f", "id": 194615, "pid": 76337, "tid": -914061504, "ts": 1716454224494416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224494417, "dur": 0, "args": { "External id": 194616, "cbid": 203, "correlation": 194616 } }, { "ph": "f", "id": 194616, "pid": 76337, "tid": -914061504, "ts": 1716454224494417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224494418, "dur": 0, "args": { "External id": 194617, "cbid": 205, "correlation": 194617 } }, { "ph": "f", "id": 194617, "pid": 76337, "tid": -914061504, "ts": 1716454224494418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224542446, "dur": 25, "args": { "External id": 194621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194621, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194621, "pid": 5, "tid": 7, "ts": 1716454224542446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494433, "dur": 12, "args": { "External id": 194621, "cbid": 211, "correlation": 194621 } }, { "ph": "s", "id": 194621, "pid": 76337, "tid": -914061504, "ts": 1716454224494433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224542473, "dur": 120, "args": { "External id": 194623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194623, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194623, "pid": 5, "tid": 7, "ts": 1716454224542473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494451, "dur": 7, "args": { "External id": 194623, "cbid": 211, "correlation": 194623 } }, { "ph": "s", "id": 194623, "pid": 76337, "tid": -914061504, "ts": 1716454224494451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224542594, "dur": 23, "args": { "External id": 194625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194625, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194625, "pid": 5, "tid": 7, "ts": 1716454224542594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494463, "dur": 5, "args": { "External id": 194625, "cbid": 211, "correlation": 194625 } }, { "ph": "s", "id": 194625, "pid": 76337, "tid": -914061504, "ts": 1716454224494463, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224542618, "dur": 32, "args": { "External id": 194631, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194631, "pid": 5, "tid": 7, "ts": 1716454224542618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494490, "dur": 8, "args": { "External id": 194631, "cbid": 211, "correlation": 194631 } }, { "ph": "s", "id": 194631, "pid": 76337, "tid": -914061504, "ts": 1716454224494490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224542652, "dur": 27, "args": { "External id": 194639, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194639, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194639, "pid": 5, "tid": 7, "ts": 1716454224542652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494522, "dur": 8, "args": { "External id": 194639, "cbid": 211, "correlation": 194639 } }, { "ph": "s", "id": 194639, "pid": 76337, "tid": -914061504, "ts": 1716454224494522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224542680, "dur": 31, "args": { "External id": 194659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194659, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 194659, "pid": 5, "tid": 7, "ts": 1716454224542680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494599, "dur": 13, "args": { "External id": 194659, "cbid": 211, "correlation": 194659 } }, { "ph": "s", "id": 194659, "pid": 76337, "tid": -914061504, "ts": 1716454224494599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224542713, "dur": 5, "args": { "External id": 194671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194671, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 194671, "pid": 5, "tid": 7, "ts": 1716454224542713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494621, "dur": 6, "args": { "External id": 194671, "cbid": 211, "correlation": 194671 } }, { "ph": "s", "id": 194671, "pid": 76337, "tid": -914061504, "ts": 1716454224494621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224542719, "dur": 30, "args": { "External id": 194674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194674, "pid": 5, "tid": 7, "ts": 1716454224542719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494639, "dur": 6, "args": { "External id": 194674, "cbid": 211, "correlation": 194674 } }, { "ph": "s", "id": 194674, "pid": 76337, "tid": -914061504, "ts": 1716454224494639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224542751, "dur": 21, "args": { "External id": 194683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194683, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194683, "pid": 5, "tid": 7, "ts": 1716454224542751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494678, "dur": 9, "args": { "External id": 194683, "cbid": 211, "correlation": 194683 } }, { "ph": "s", "id": 194683, "pid": 76337, "tid": -914061504, "ts": 1716454224494678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224494729, "dur": 0, "args": { "External id": 194693, "cbid": 317, "correlation": 194693 } }, { "ph": "f", "id": 194693, "pid": 76337, "tid": -914061504, "ts": 1716454224494729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224494730, "dur": 0, "args": { "External id": 194694, "cbid": 203, "correlation": 194694 } }, { "ph": "f", "id": 194694, "pid": 76337, "tid": -914061504, "ts": 1716454224494730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224494731, "dur": 0, "args": { "External id": 194695, "cbid": 205, "correlation": 194695 } }, { "ph": "f", "id": 194695, "pid": 76337, "tid": -914061504, "ts": 1716454224494731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224542773, "dur": 22, "args": { "External id": 194699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194699, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194699, "pid": 5, "tid": 7, "ts": 1716454224542773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494743, "dur": 11, "args": { "External id": 194699, "cbid": 211, "correlation": 194699 } }, { "ph": "s", "id": 194699, "pid": 76337, "tid": -914061504, "ts": 1716454224494743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224542796, "dur": 44, "args": { "External id": 194701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194701, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194701, "pid": 5, "tid": 7, "ts": 1716454224542796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494757, "dur": 6, "args": { "External id": 194701, "cbid": 211, "correlation": 194701 } }, { "ph": "s", "id": 194701, "pid": 76337, "tid": -914061504, "ts": 1716454224494757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224542841, "dur": 656, "args": { "External id": 194703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194703, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194703, "pid": 5, "tid": 7, "ts": 1716454224542841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494769, "dur": 6, "args": { "External id": 194703, "cbid": 211, "correlation": 194703 } }, { "ph": "s", "id": 194703, "pid": 76337, "tid": -914061504, "ts": 1716454224494769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224543498, "dur": 21, "args": { "External id": 194705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194705, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194705, "pid": 5, "tid": 7, "ts": 1716454224543498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494779, "dur": 5, "args": { "External id": 194705, "cbid": 211, "correlation": 194705 } }, { "ph": "s", "id": 194705, "pid": 76337, "tid": -914061504, "ts": 1716454224494779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224543521, "dur": 33, "args": { "External id": 194711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194711, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194711, "pid": 5, "tid": 7, "ts": 1716454224543521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494805, "dur": 9, "args": { "External id": 194711, "cbid": 211, "correlation": 194711 } }, { "ph": "s", "id": 194711, "pid": 76337, "tid": -914061504, "ts": 1716454224494805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224543555, "dur": 4, "args": { "External id": 194719, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194719, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 194719, "pid": 5, "tid": 7, "ts": 1716454224543555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494848, "dur": 9, "args": { "External id": 194719, "cbid": 211, "correlation": 194719 } }, { "ph": "s", "id": 194719, "pid": 76337, "tid": -914061504, "ts": 1716454224494848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224494914, "dur": 1, "args": { "External id": 194735, "cbid": 251, "correlation": 194735 } }, { "ph": "f", "id": 194735, "pid": 76337, "tid": -914061504, "ts": 1716454224494914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224494919, "dur": 0, "args": { "External id": 194737, "cbid": 251, "correlation": 194737 } }, { "ph": "f", "id": 194737, "pid": 76337, "tid": -914061504, "ts": 1716454224494919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224543560, "dur": 13, "args": { "External id": 194738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194738, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 194738, "pid": 5, "tid": 7, "ts": 1716454224543560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494921, "dur": 11, "args": { "External id": 194738, "cbid": 211, "correlation": 194738 } }, { "ph": "s", "id": 194738, "pid": 76337, "tid": -914061504, "ts": 1716454224494921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224543574, "dur": 5, "args": { "External id": 194740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 194740, "pid": 5, "tid": 7, "ts": 1716454224543574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224494934, "dur": 5, "args": { "External id": 194740, "cbid": 211, "correlation": 194740 } }, { "ph": "s", "id": 194740, "pid": 76337, "tid": -914061504, "ts": 1716454224494934, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224543581, "dur": 29, "args": { "External id": 194750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194750, "pid": 5, "tid": 7, "ts": 1716454224543581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495006, "dur": 14, "args": { "External id": 194750, "cbid": 211, "correlation": 194750 } }, { "ph": "s", "id": 194750, "pid": 76337, "tid": -914061504, "ts": 1716454224495006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224543612, "dur": 31, "args": { "External id": 194770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194770, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 194770, "pid": 5, "tid": 7, "ts": 1716454224543612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495075, "dur": 11, "args": { "External id": 194770, "cbid": 211, "correlation": 194770 } }, { "ph": "s", "id": 194770, "pid": 76337, "tid": -914061504, "ts": 1716454224495075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224543644, "dur": 5, "args": { "External id": 194782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194782, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 194782, "pid": 5, "tid": 7, "ts": 1716454224543644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495096, "dur": 7, "args": { "External id": 194782, "cbid": 211, "correlation": 194782 } }, { "ph": "s", "id": 194782, "pid": 76337, "tid": -914061504, "ts": 1716454224495096, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224543649, "dur": 30, "args": { "External id": 194785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194785, "pid": 5, "tid": 7, "ts": 1716454224543649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495115, "dur": 6, "args": { "External id": 194785, "cbid": 211, "correlation": 194785 } }, { "ph": "s", "id": 194785, "pid": 76337, "tid": -914061504, "ts": 1716454224495115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224543681, "dur": 20, "args": { "External id": 194794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194794, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194794, "pid": 5, "tid": 7, "ts": 1716454224543681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495156, "dur": 10, "args": { "External id": 194794, "cbid": 211, "correlation": 194794 } }, { "ph": "s", "id": 194794, "pid": 76337, "tid": -914061504, "ts": 1716454224495156, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224495219, "dur": 0, "args": { "External id": 194804, "cbid": 317, "correlation": 194804 } }, { "ph": "f", "id": 194804, "pid": 76337, "tid": -914061504, "ts": 1716454224495219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224495219, "dur": 0, "args": { "External id": 194805, "cbid": 203, "correlation": 194805 } }, { "ph": "f", "id": 194805, "pid": 76337, "tid": -914061504, "ts": 1716454224495219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224495220, "dur": 0, "args": { "External id": 194806, "cbid": 205, "correlation": 194806 } }, { "ph": "f", "id": 194806, "pid": 76337, "tid": -914061504, "ts": 1716454224495220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224543702, "dur": 23, "args": { "External id": 194810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194810, "pid": 5, "tid": 7, "ts": 1716454224543702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495234, "dur": 12, "args": { "External id": 194810, "cbid": 211, "correlation": 194810 } }, { "ph": "s", "id": 194810, "pid": 76337, "tid": -914061504, "ts": 1716454224495234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224543726, "dur": 44, "args": { "External id": 194812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194812, "pid": 5, "tid": 7, "ts": 1716454224543726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495248, "dur": 5, "args": { "External id": 194812, "cbid": 211, "correlation": 194812 } }, { "ph": "s", "id": 194812, "pid": 76337, "tid": -914061504, "ts": 1716454224495248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224543772, "dur": 646, "args": { "External id": 194814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194814, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194814, "pid": 5, "tid": 7, "ts": 1716454224543772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495259, "dur": 6, "args": { "External id": 194814, "cbid": 211, "correlation": 194814 } }, { "ph": "s", "id": 194814, "pid": 76337, "tid": -914061504, "ts": 1716454224495259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224544419, "dur": 21, "args": { "External id": 194816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194816, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194816, "pid": 5, "tid": 7, "ts": 1716454224544419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495269, "dur": 5, "args": { "External id": 194816, "cbid": 211, "correlation": 194816 } }, { "ph": "s", "id": 194816, "pid": 76337, "tid": -914061504, "ts": 1716454224495269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224544442, "dur": 33, "args": { "External id": 194822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194822, "pid": 5, "tid": 7, "ts": 1716454224544442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495297, "dur": 8, "args": { "External id": 194822, "cbid": 211, "correlation": 194822 } }, { "ph": "s", "id": 194822, "pid": 76337, "tid": -914061504, "ts": 1716454224495297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224544476, "dur": 26, "args": { "External id": 194830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194830, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194830, "pid": 5, "tid": 7, "ts": 1716454224544476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495329, "dur": 8, "args": { "External id": 194830, "cbid": 211, "correlation": 194830 } }, { "ph": "s", "id": 194830, "pid": 76337, "tid": -914061504, "ts": 1716454224495329, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224544503, "dur": 20, "args": { "External id": 194838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194838, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194838, "pid": 5, "tid": 7, "ts": 1716454224544503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495359, "dur": 8, "args": { "External id": 194838, "cbid": 211, "correlation": 194838 } }, { "ph": "s", "id": 194838, "pid": 76337, "tid": -914061504, "ts": 1716454224495359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224544524, "dur": 30, "args": { "External id": 194858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194858, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 194858, "pid": 5, "tid": 7, "ts": 1716454224544524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495437, "dur": 12, "args": { "External id": 194858, "cbid": 211, "correlation": 194858 } }, { "ph": "s", "id": 194858, "pid": 76337, "tid": -914061504, "ts": 1716454224495437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224544555, "dur": 5, "args": { "External id": 194870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194870, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 194870, "pid": 5, "tid": 7, "ts": 1716454224544555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495459, "dur": 6, "args": { "External id": 194870, "cbid": 211, "correlation": 194870 } }, { "ph": "s", "id": 194870, "pid": 76337, "tid": -914061504, "ts": 1716454224495459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224544561, "dur": 30, "args": { "External id": 194873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194873, "pid": 5, "tid": 7, "ts": 1716454224544561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495477, "dur": 6, "args": { "External id": 194873, "cbid": 211, "correlation": 194873 } }, { "ph": "s", "id": 194873, "pid": 76337, "tid": -914061504, "ts": 1716454224495477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224495535, "dur": 0, "args": { "External id": 194884, "cbid": 317, "correlation": 194884 } }, { "ph": "f", "id": 194884, "pid": 76337, "tid": -914061504, "ts": 1716454224495535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224495535, "dur": 0, "args": { "External id": 194885, "cbid": 203, "correlation": 194885 } }, { "ph": "f", "id": 194885, "pid": 76337, "tid": -914061504, "ts": 1716454224495535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224495536, "dur": 0, "args": { "External id": 194886, "cbid": 205, "correlation": 194886 } }, { "ph": "f", "id": 194886, "pid": 76337, "tid": -914061504, "ts": 1716454224495536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224544593, "dur": 22, "args": { "External id": 194890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194890, "pid": 5, "tid": 7, "ts": 1716454224544593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495549, "dur": 12, "args": { "External id": 194890, "cbid": 211, "correlation": 194890 } }, { "ph": "s", "id": 194890, "pid": 76337, "tid": -914061504, "ts": 1716454224495549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224544616, "dur": 116, "args": { "External id": 194892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194892, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194892, "pid": 5, "tid": 7, "ts": 1716454224544616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495567, "dur": 6, "args": { "External id": 194892, "cbid": 211, "correlation": 194892 } }, { "ph": "s", "id": 194892, "pid": 76337, "tid": -914061504, "ts": 1716454224495567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224544734, "dur": 22, "args": { "External id": 194894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194894, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194894, "pid": 5, "tid": 7, "ts": 1716454224544734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495577, "dur": 9, "args": { "External id": 194894, "cbid": 211, "correlation": 194894 } }, { "ph": "s", "id": 194894, "pid": 76337, "tid": -914061504, "ts": 1716454224495577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224544757, "dur": 32, "args": { "External id": 194900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194900, "pid": 5, "tid": 7, "ts": 1716454224544757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495608, "dur": 10, "args": { "External id": 194900, "cbid": 211, "correlation": 194900 } }, { "ph": "s", "id": 194900, "pid": 76337, "tid": -914061504, "ts": 1716454224495608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224544791, "dur": 171, "args": { "External id": 194909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194909, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194909, "pid": 5, "tid": 7, "ts": 1716454224544791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495690, "dur": 14, "args": { "External id": 194909, "cbid": 211, "correlation": 194909 } }, { "ph": "s", "id": 194909, "pid": 76337, "tid": -914061504, "ts": 1716454224495690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224544963, "dur": 65, "args": { "External id": 194931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194931, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 194931, "pid": 5, "tid": 7, "ts": 1716454224544963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495752, "dur": 11, "args": { "External id": 194931, "cbid": 211, "correlation": 194931 } }, { "ph": "s", "id": 194931, "pid": 76337, "tid": -914061504, "ts": 1716454224495752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224495841, "dur": 2, "args": { "External id": 194942, "cbid": 251, "correlation": 194942 } }, { "ph": "f", "id": 194942, "pid": 76337, "tid": -914061504, "ts": 1716454224495841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224545029, "dur": 155, "args": { "External id": 194943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194943, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194943, "pid": 5, "tid": 7, "ts": 1716454224545029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495847, "dur": 13, "args": { "External id": 194943, "cbid": 211, "correlation": 194943 } }, { "ph": "s", "id": 194943, "pid": 76337, "tid": -914061504, "ts": 1716454224495847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224495918, "dur": 1, "args": { "External id": 194954, "cbid": 251, "correlation": 194954 } }, { "ph": "f", "id": 194954, "pid": 76337, "tid": -914061504, "ts": 1716454224495918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224545185, "dur": 145, "args": { "External id": 194955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194955, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194955, "pid": 5, "tid": 7, "ts": 1716454224545185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224495922, "dur": 11, "args": { "External id": 194955, "cbid": 211, "correlation": 194955 } }, { "ph": "s", "id": 194955, "pid": 76337, "tid": -914061504, "ts": 1716454224495922, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224495996, "dur": 1, "args": { "External id": 194966, "cbid": 251, "correlation": 194966 } }, { "ph": "f", "id": 194966, "pid": 76337, "tid": -914061504, "ts": 1716454224495996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224545332, "dur": 142, "args": { "External id": 194967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194967, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 194967, "pid": 5, "tid": 7, "ts": 1716454224545332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496000, "dur": 12, "args": { "External id": 194967, "cbid": 211, "correlation": 194967 } }, { "ph": "s", "id": 194967, "pid": 76337, "tid": -914061504, "ts": 1716454224496000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224545476, "dur": 1946, "args": { "External id": 194988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 194988, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 194988, "pid": 5, "tid": 7, "ts": 1716454224545476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496084, "dur": 13, "args": { "External id": 194988, "cbid": 211, "correlation": 194988 } }, { "ph": "s", "id": 194988, "pid": 76337, "tid": -914061504, "ts": 1716454224496084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496182, "dur": 1, "args": { "External id": 195006, "cbid": 251, "correlation": 195006 } }, { "ph": "f", "id": 195006, "pid": 76337, "tid": -914061504, "ts": 1716454224496182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224547424, "dur": 147, "args": { "External id": 195008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195008, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195008, "pid": 5, "tid": 7, "ts": 1716454224547424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496188, "dur": 13, "args": { "External id": 195008, "cbid": 211, "correlation": 195008 } }, { "ph": "s", "id": 195008, "pid": 76337, "tid": -914061504, "ts": 1716454224496188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224547572, "dur": 36, "args": { "External id": 195016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195016, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195016, "pid": 5, "tid": 7, "ts": 1716454224547572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496259, "dur": 13, "args": { "External id": 195016, "cbid": 211, "correlation": 195016 } }, { "ph": "s", "id": 195016, "pid": 76337, "tid": -914061504, "ts": 1716454224496259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224547609, "dur": 51, "args": { "External id": 195024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195024, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195024, "pid": 5, "tid": 7, "ts": 1716454224547609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496298, "dur": 9, "args": { "External id": 195024, "cbid": 211, "correlation": 195024 } }, { "ph": "s", "id": 195024, "pid": 76337, "tid": -914061504, "ts": 1716454224496298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224547661, "dur": 30, "args": { "External id": 195035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195035, "pid": 5, "tid": 7, "ts": 1716454224547661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496370, "dur": 12, "args": { "External id": 195035, "cbid": 211, "correlation": 195035 } }, { "ph": "s", "id": 195035, "pid": 76337, "tid": -914061504, "ts": 1716454224496370, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224547692, "dur": 34, "args": { "External id": 195057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195057, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195057, "pid": 5, "tid": 7, "ts": 1716454224547692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496400, "dur": 7, "args": { "External id": 195057, "cbid": 211, "correlation": 195057 } }, { "ph": "s", "id": 195057, "pid": 76337, "tid": -914061504, "ts": 1716454224496400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496489, "dur": 1, "args": { "External id": 195068, "cbid": 251, "correlation": 195068 } }, { "ph": "f", "id": 195068, "pid": 76337, "tid": -914061504, "ts": 1716454224496489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224547728, "dur": 89, "args": { "External id": 195069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195069, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195069, "pid": 5, "tid": 7, "ts": 1716454224547728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496494, "dur": 13, "args": { "External id": 195069, "cbid": 211, "correlation": 195069 } }, { "ph": "s", "id": 195069, "pid": 76337, "tid": -914061504, "ts": 1716454224496494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496562, "dur": 1, "args": { "External id": 195080, "cbid": 251, "correlation": 195080 } }, { "ph": "f", "id": 195080, "pid": 76337, "tid": -914061504, "ts": 1716454224496562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496567, "dur": 0, "args": { "External id": 195081, "cbid": 251, "correlation": 195081 } }, { "ph": "f", "id": 195081, "pid": 76337, "tid": -914061504, "ts": 1716454224496567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224547818, "dur": 12, "args": { "External id": 195082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195082, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 195082, "pid": 5, "tid": 7, "ts": 1716454224547818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496569, "dur": 12, "args": { "External id": 195082, "cbid": 211, "correlation": 195082 } }, { "ph": "s", "id": 195082, "pid": 76337, "tid": -914061504, "ts": 1716454224496569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224547832, "dur": 5, "args": { "External id": 195084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195084, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 195084, "pid": 5, "tid": 7, "ts": 1716454224547832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496582, "dur": 6, "args": { "External id": 195084, "cbid": 211, "correlation": 195084 } }, { "ph": "s", "id": 195084, "pid": 76337, "tid": -914061504, "ts": 1716454224496582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496640, "dur": 1, "args": { "External id": 195095, "cbid": 251, "correlation": 195095 } }, { "ph": "f", "id": 195095, "pid": 76337, "tid": -914061504, "ts": 1716454224496640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496644, "dur": 0, "args": { "External id": 195096, "cbid": 251, "correlation": 195096 } }, { "ph": "f", "id": 195096, "pid": 76337, "tid": -914061504, "ts": 1716454224496644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224547838, "dur": 7, "args": { "External id": 195097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195097, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 195097, "pid": 5, "tid": 7, "ts": 1716454224547838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496645, "dur": 12, "args": { "External id": 195097, "cbid": 211, "correlation": 195097 } }, { "ph": "s", "id": 195097, "pid": 76337, "tid": -914061504, "ts": 1716454224496645, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224547846, "dur": 4, "args": { "External id": 195099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195099, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 195099, "pid": 5, "tid": 7, "ts": 1716454224547846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496658, "dur": 6, "args": { "External id": 195099, "cbid": 211, "correlation": 195099 } }, { "ph": "s", "id": 195099, "pid": 76337, "tid": -914061504, "ts": 1716454224496658, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224547851, "dur": 94, "args": { "External id": 195120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195120, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 195120, "pid": 5, "tid": 7, "ts": 1716454224547851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496732, "dur": 12, "args": { "External id": 195120, "cbid": 211, "correlation": 195120 } }, { "ph": "s", "id": 195120, "pid": 76337, "tid": -914061504, "ts": 1716454224496732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224496832, "dur": 1, "args": { "External id": 195138, "cbid": 251, "correlation": 195138 } }, { "ph": "f", "id": 195138, "pid": 76337, "tid": -914061504, "ts": 1716454224496832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224547946, "dur": 99, "args": { "External id": 195140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195140, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195140, "pid": 5, "tid": 7, "ts": 1716454224547946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496838, "dur": 13, "args": { "External id": 195140, "cbid": 211, "correlation": 195140 } }, { "ph": "s", "id": 195140, "pid": 76337, "tid": -914061504, "ts": 1716454224496838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224548047, "dur": 19, "args": { "External id": 195148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195148, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195148, "pid": 5, "tid": 7, "ts": 1716454224548047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496906, "dur": 12, "args": { "External id": 195148, "cbid": 211, "correlation": 195148 } }, { "ph": "s", "id": 195148, "pid": 76337, "tid": -914061504, "ts": 1716454224496906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224548068, "dur": 37, "args": { "External id": 195156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195156, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195156, "pid": 5, "tid": 7, "ts": 1716454224548068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224496948, "dur": 9, "args": { "External id": 195156, "cbid": 211, "correlation": 195156 } }, { "ph": "s", "id": 195156, "pid": 76337, "tid": -914061504, "ts": 1716454224496948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224548106, "dur": 35, "args": { "External id": 195178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195178, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195178, "pid": 5, "tid": 7, "ts": 1716454224548106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497008, "dur": 11, "args": { "External id": 195178, "cbid": 211, "correlation": 195178 } }, { "ph": "s", "id": 195178, "pid": 76337, "tid": -914061504, "ts": 1716454224497008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224497098, "dur": 1, "args": { "External id": 195194, "cbid": 251, "correlation": 195194 } }, { "ph": "f", "id": 195194, "pid": 76337, "tid": -914061504, "ts": 1716454224497098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224497103, "dur": 0, "args": { "External id": 195196, "cbid": 251, "correlation": 195196 } }, { "ph": "f", "id": 195196, "pid": 76337, "tid": -914061504, "ts": 1716454224497103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224548143, "dur": 540, "args": { "External id": 195197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195197, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195197, "pid": 5, "tid": 7, "ts": 1716454224548143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497106, "dur": 13, "args": { "External id": 195197, "cbid": 211, "correlation": 195197 } }, { "ph": "s", "id": 195197, "pid": 76337, "tid": -914061504, "ts": 1716454224497106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224548684, "dur": 126, "args": { "External id": 195205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195205, "pid": 5, "tid": 7, "ts": 1716454224548684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497171, "dur": 13, "args": { "External id": 195205, "cbid": 211, "correlation": 195205 } }, { "ph": "s", "id": 195205, "pid": 76337, "tid": -914061504, "ts": 1716454224497171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224548811, "dur": 128, "args": { "External id": 195213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195213, "pid": 5, "tid": 7, "ts": 1716454224548811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497201, "dur": 8, "args": { "External id": 195213, "cbid": 211, "correlation": 195213 } }, { "ph": "s", "id": 195213, "pid": 76337, "tid": -914061504, "ts": 1716454224497201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224497278, "dur": 2, "args": { "External id": 195229, "cbid": 251, "correlation": 195229 } }, { "ph": "f", "id": 195229, "pid": 76337, "tid": -914061504, "ts": 1716454224497278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224548940, "dur": 302, "args": { "External id": 195231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195231, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195231, "pid": 5, "tid": 7, "ts": 1716454224548940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497284, "dur": 12, "args": { "External id": 195231, "cbid": 211, "correlation": 195231 } }, { "ph": "s", "id": 195231, "pid": 76337, "tid": -914061504, "ts": 1716454224497284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224549243, "dur": 27, "args": { "External id": 195239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195239, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195239, "pid": 5, "tid": 7, "ts": 1716454224549243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497325, "dur": 10, "args": { "External id": 195239, "cbid": 211, "correlation": 195239 } }, { "ph": "s", "id": 195239, "pid": 76337, "tid": -914061504, "ts": 1716454224497325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224549271, "dur": 81, "args": { "External id": 195250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195250, "pid": 5, "tid": 7, "ts": 1716454224549271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497393, "dur": 12, "args": { "External id": 195250, "cbid": 211, "correlation": 195250 } }, { "ph": "s", "id": 195250, "pid": 76337, "tid": -914061504, "ts": 1716454224497393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224497456, "dur": 0, "args": { "External id": 195262, "cbid": 317, "correlation": 195262 } }, { "ph": "f", "id": 195262, "pid": 76337, "tid": -914061504, "ts": 1716454224497456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224497457, "dur": 0, "args": { "External id": 195263, "cbid": 203, "correlation": 195263 } }, { "ph": "f", "id": 195263, "pid": 76337, "tid": -914061504, "ts": 1716454224497457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224497457, "dur": 0, "args": { "External id": 195264, "cbid": 205, "correlation": 195264 } }, { "ph": "f", "id": 195264, "pid": 76337, "tid": -914061504, "ts": 1716454224497457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549354, "dur": 23, "args": { "External id": 195268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195268, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195268, "pid": 5, "tid": 7, "ts": 1716454224549354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497472, "dur": 13, "args": { "External id": 195268, "cbid": 211, "correlation": 195268 } }, { "ph": "s", "id": 195268, "pid": 76337, "tid": -914061504, "ts": 1716454224497472, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224549378, "dur": 121, "args": { "External id": 195270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195270, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195270, "pid": 5, "tid": 7, "ts": 1716454224549378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497491, "dur": 7, "args": { "External id": 195270, "cbid": 211, "correlation": 195270 } }, { "ph": "s", "id": 195270, "pid": 76337, "tid": -914061504, "ts": 1716454224497491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549500, "dur": 22, "args": { "External id": 195272, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195272, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195272, "pid": 5, "tid": 7, "ts": 1716454224549500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497502, "dur": 5, "args": { "External id": 195272, "cbid": 211, "correlation": 195272 } }, { "ph": "s", "id": 195272, "pid": 76337, "tid": -914061504, "ts": 1716454224497502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224549523, "dur": 32, "args": { "External id": 195278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195278, "pid": 5, "tid": 7, "ts": 1716454224549523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497530, "dur": 9, "args": { "External id": 195278, "cbid": 211, "correlation": 195278 } }, { "ph": "s", "id": 195278, "pid": 76337, "tid": -914061504, "ts": 1716454224497530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224549556, "dur": 26, "args": { "External id": 195286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195286, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195286, "pid": 5, "tid": 7, "ts": 1716454224549556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497561, "dur": 8, "args": { "External id": 195286, "cbid": 211, "correlation": 195286 } }, { "ph": "s", "id": 195286, "pid": 76337, "tid": -914061504, "ts": 1716454224497561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224497637, "dur": 0, "args": { "External id": 195296, "cbid": 317, "correlation": 195296 } }, { "ph": "f", "id": 195296, "pid": 76337, "tid": -914061504, "ts": 1716454224497637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224497638, "dur": 0, "args": { "External id": 195297, "cbid": 203, "correlation": 195297 } }, { "ph": "f", "id": 195297, "pid": 76337, "tid": -914061504, "ts": 1716454224497638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224497638, "dur": 0, "args": { "External id": 195298, "cbid": 205, "correlation": 195298 } }, { "ph": "f", "id": 195298, "pid": 76337, "tid": -914061504, "ts": 1716454224497638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549584, "dur": 24, "args": { "External id": 195302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195302, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195302, "pid": 5, "tid": 7, "ts": 1716454224549584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497652, "dur": 12, "args": { "External id": 195302, "cbid": 211, "correlation": 195302 } }, { "ph": "s", "id": 195302, "pid": 76337, "tid": -914061504, "ts": 1716454224497652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549610, "dur": 44, "args": { "External id": 195304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195304, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195304, "pid": 5, "tid": 7, "ts": 1716454224549610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497667, "dur": 5, "args": { "External id": 195304, "cbid": 211, "correlation": 195304 } }, { "ph": "s", "id": 195304, "pid": 76337, "tid": -914061504, "ts": 1716454224497667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224549656, "dur": 234, "args": { "External id": 195306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195306, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 195306, "pid": 5, "tid": 7, "ts": 1716454224549656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497679, "dur": 7, "args": { "External id": 195306, "cbid": 211, "correlation": 195306 } }, { "ph": "s", "id": 195306, "pid": 76337, "tid": -914061504, "ts": 1716454224497679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549891, "dur": 7, "args": { "External id": 195308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195308, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195308, "pid": 5, "tid": 7, "ts": 1716454224549891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497690, "dur": 5, "args": { "External id": 195308, "cbid": 211, "correlation": 195308 } }, { "ph": "s", "id": 195308, "pid": 76337, "tid": -914061504, "ts": 1716454224497690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224549899, "dur": 9, "args": { "External id": 195314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195314, "pid": 5, "tid": 7, "ts": 1716454224549899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497716, "dur": 8, "args": { "External id": 195314, "cbid": 211, "correlation": 195314 } }, { "ph": "s", "id": 195314, "pid": 76337, "tid": -914061504, "ts": 1716454224497716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224549909, "dur": 12, "args": { "External id": 195334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195334, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 195334, "pid": 5, "tid": 7, "ts": 1716454224549909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497809, "dur": 12, "args": { "External id": 195334, "cbid": 211, "correlation": 195334 } }, { "ph": "s", "id": 195334, "pid": 76337, "tid": -914061504, "ts": 1716454224497809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224549923, "dur": 5, "args": { "External id": 195346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195346, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 195346, "pid": 5, "tid": 7, "ts": 1716454224549923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497831, "dur": 7, "args": { "External id": 195346, "cbid": 211, "correlation": 195346 } }, { "ph": "s", "id": 195346, "pid": 76337, "tid": -914061504, "ts": 1716454224497831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224549929, "dur": 12, "args": { "External id": 195349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195349, "pid": 5, "tid": 7, "ts": 1716454224549929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497849, "dur": 7, "args": { "External id": 195349, "cbid": 211, "correlation": 195349 } }, { "ph": "s", "id": 195349, "pid": 76337, "tid": -914061504, "ts": 1716454224497849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224549942, "dur": 7, "args": { "External id": 195358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195358, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195358, "pid": 5, "tid": 7, "ts": 1716454224549942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497889, "dur": 10, "args": { "External id": 195358, "cbid": 211, "correlation": 195358 } }, { "ph": "s", "id": 195358, "pid": 76337, "tid": -914061504, "ts": 1716454224497889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224497945, "dur": 0, "args": { "External id": 195368, "cbid": 317, "correlation": 195368 } }, { "ph": "f", "id": 195368, "pid": 76337, "tid": -914061504, "ts": 1716454224497945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224497945, "dur": 0, "args": { "External id": 195369, "cbid": 203, "correlation": 195369 } }, { "ph": "f", "id": 195369, "pid": 76337, "tid": -914061504, "ts": 1716454224497945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224497946, "dur": 0, "args": { "External id": 195370, "cbid": 205, "correlation": 195370 } }, { "ph": "f", "id": 195370, "pid": 76337, "tid": -914061504, "ts": 1716454224497946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549950, "dur": 6, "args": { "External id": 195374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195374, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195374, "pid": 5, "tid": 7, "ts": 1716454224549950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497962, "dur": 19, "args": { "External id": 195374, "cbid": 211, "correlation": 195374 } }, { "ph": "s", "id": 195374, "pid": 76337, "tid": -914061504, "ts": 1716454224497962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224549957, "dur": 84, "args": { "External id": 195376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195376, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195376, "pid": 5, "tid": 7, "ts": 1716454224549957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224497984, "dur": 6, "args": { "External id": 195376, "cbid": 211, "correlation": 195376 } }, { "ph": "s", "id": 195376, "pid": 76337, "tid": -914061504, "ts": 1716454224497984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224550043, "dur": 1, "args": { "External id": 195378, "device": 5, "context": 1, "stream": 7, "correlation": 195378, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 195378, "pid": 5, "tid": 7, "ts": 1716454224550043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224497997, "dur": 9, "args": { "External id": 195378, "cbid": 51, "correlation": 195378 } }, { "ph": "s", "id": 195378, "pid": 76337, "tid": -914061504, "ts": 1716454224497997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224550047, "dur": 543, "args": { "External id": 195379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195379, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195379, "pid": 5, "tid": 7, "ts": 1716454224550047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498007, "dur": 8, "args": { "External id": 195379, "cbid": 211, "correlation": 195379 } }, { "ph": "s", "id": 195379, "pid": 76337, "tid": -914061504, "ts": 1716454224498007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224550591, "dur": 12, "args": { "External id": 195381, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195381, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195381, "pid": 5, "tid": 7, "ts": 1716454224550591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498019, "dur": 5, "args": { "External id": 195381, "cbid": 211, "correlation": 195381 } }, { "ph": "s", "id": 195381, "pid": 76337, "tid": -914061504, "ts": 1716454224498019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224550605, "dur": 14, "args": { "External id": 195387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195387, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195387, "pid": 5, "tid": 7, "ts": 1716454224550605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498047, "dur": 8, "args": { "External id": 195387, "cbid": 211, "correlation": 195387 } }, { "ph": "s", "id": 195387, "pid": 76337, "tid": -914061504, "ts": 1716454224498047, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224550620, "dur": 3, "args": { "External id": 195395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195395, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 195395, "pid": 5, "tid": 7, "ts": 1716454224550620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498095, "dur": 10, "args": { "External id": 195395, "cbid": 211, "correlation": 195395 } }, { "ph": "s", "id": 195395, "pid": 76337, "tid": -914061504, "ts": 1716454224498095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224498159, "dur": 1, "args": { "External id": 195411, "cbid": 251, "correlation": 195411 } }, { "ph": "f", "id": 195411, "pid": 76337, "tid": -914061504, "ts": 1716454224498159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224498164, "dur": 0, "args": { "External id": 195413, "cbid": 251, "correlation": 195413 } }, { "ph": "f", "id": 195413, "pid": 76337, "tid": -914061504, "ts": 1716454224498164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224550625, "dur": 13, "args": { "External id": 195414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195414, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195414, "pid": 5, "tid": 7, "ts": 1716454224550625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498166, "dur": 11, "args": { "External id": 195414, "cbid": 211, "correlation": 195414 } }, { "ph": "s", "id": 195414, "pid": 76337, "tid": -914061504, "ts": 1716454224498166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224550639, "dur": 5, "args": { "External id": 195416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195416, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195416, "pid": 5, "tid": 7, "ts": 1716454224550639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498179, "dur": 6, "args": { "External id": 195416, "cbid": 211, "correlation": 195416 } }, { "ph": "s", "id": 195416, "pid": 76337, "tid": -914061504, "ts": 1716454224498179, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224550646, "dur": 17, "args": { "External id": 195426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195426, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195426, "pid": 5, "tid": 7, "ts": 1716454224550646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498237, "dur": 12, "args": { "External id": 195426, "cbid": 211, "correlation": 195426 } }, { "ph": "s", "id": 195426, "pid": 76337, "tid": -914061504, "ts": 1716454224498237, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224550664, "dur": 18, "args": { "External id": 195446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195446, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 195446, "pid": 5, "tid": 7, "ts": 1716454224550664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498303, "dur": 10, "args": { "External id": 195446, "cbid": 211, "correlation": 195446 } }, { "ph": "s", "id": 195446, "pid": 76337, "tid": -914061504, "ts": 1716454224498303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224550684, "dur": 4, "args": { "External id": 195458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195458, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 195458, "pid": 5, "tid": 7, "ts": 1716454224550684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498324, "dur": 6, "args": { "External id": 195458, "cbid": 211, "correlation": 195458 } }, { "ph": "s", "id": 195458, "pid": 76337, "tid": -914061504, "ts": 1716454224498324, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224550690, "dur": 17, "args": { "External id": 195461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195461, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195461, "pid": 5, "tid": 7, "ts": 1716454224550690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498343, "dur": 6, "args": { "External id": 195461, "cbid": 211, "correlation": 195461 } }, { "ph": "s", "id": 195461, "pid": 76337, "tid": -914061504, "ts": 1716454224498343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224550708, "dur": 11, "args": { "External id": 195470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195470, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195470, "pid": 5, "tid": 7, "ts": 1716454224550708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498383, "dur": 11, "args": { "External id": 195470, "cbid": 211, "correlation": 195470 } }, { "ph": "s", "id": 195470, "pid": 76337, "tid": -914061504, "ts": 1716454224498383, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224498446, "dur": 0, "args": { "External id": 195480, "cbid": 317, "correlation": 195480 } }, { "ph": "f", "id": 195480, "pid": 76337, "tid": -914061504, "ts": 1716454224498446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224498447, "dur": 0, "args": { "External id": 195481, "cbid": 203, "correlation": 195481 } }, { "ph": "f", "id": 195481, "pid": 76337, "tid": -914061504, "ts": 1716454224498447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224498448, "dur": 0, "args": { "External id": 195482, "cbid": 205, "correlation": 195482 } }, { "ph": "f", "id": 195482, "pid": 76337, "tid": -914061504, "ts": 1716454224498448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224550720, "dur": 11, "args": { "External id": 195486, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195486, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195486, "pid": 5, "tid": 7, "ts": 1716454224550720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498465, "dur": 12, "args": { "External id": 195486, "cbid": 211, "correlation": 195486 } }, { "ph": "s", "id": 195486, "pid": 76337, "tid": -914061504, "ts": 1716454224498465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224550733, "dur": 163, "args": { "External id": 195488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195488, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195488, "pid": 5, "tid": 7, "ts": 1716454224550733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498479, "dur": 5, "args": { "External id": 195488, "cbid": 211, "correlation": 195488 } }, { "ph": "s", "id": 195488, "pid": 76337, "tid": -914061504, "ts": 1716454224498479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224550898, "dur": 1, "args": { "External id": 195490, "device": 5, "context": 1, "stream": 7, "correlation": 195490, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 195490, "pid": 5, "tid": 7, "ts": 1716454224550898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224498491, "dur": 6, "args": { "External id": 195490, "cbid": 51, "correlation": 195490 } }, { "ph": "s", "id": 195490, "pid": 76337, "tid": -914061504, "ts": 1716454224498491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224550902, "dur": 662, "args": { "External id": 195491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195491, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195491, "pid": 5, "tid": 7, "ts": 1716454224550902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498499, "dur": 6, "args": { "External id": 195491, "cbid": 211, "correlation": 195491 } }, { "ph": "s", "id": 195491, "pid": 76337, "tid": -914061504, "ts": 1716454224498499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224551565, "dur": 12, "args": { "External id": 195493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195493, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195493, "pid": 5, "tid": 7, "ts": 1716454224551565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498509, "dur": 5, "args": { "External id": 195493, "cbid": 211, "correlation": 195493 } }, { "ph": "s", "id": 195493, "pid": 76337, "tid": -914061504, "ts": 1716454224498509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224551578, "dur": 14, "args": { "External id": 195499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195499, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195499, "pid": 5, "tid": 7, "ts": 1716454224551578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498538, "dur": 9, "args": { "External id": 195499, "cbid": 211, "correlation": 195499 } }, { "ph": "s", "id": 195499, "pid": 76337, "tid": -914061504, "ts": 1716454224498538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224498596, "dur": 0, "args": { "External id": 195509, "cbid": 317, "correlation": 195509 } }, { "ph": "f", "id": 195509, "pid": 76337, "tid": -914061504, "ts": 1716454224498596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224498597, "dur": 0, "args": { "External id": 195510, "cbid": 203, "correlation": 195510 } }, { "ph": "f", "id": 195510, "pid": 76337, "tid": -914061504, "ts": 1716454224498597, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224498598, "dur": 0, "args": { "External id": 195511, "cbid": 205, "correlation": 195511 } }, { "ph": "f", "id": 195511, "pid": 76337, "tid": -914061504, "ts": 1716454224498598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224551594, "dur": 8, "args": { "External id": 195515, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195515, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195515, "pid": 5, "tid": 7, "ts": 1716454224551594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498612, "dur": 11, "args": { "External id": 195515, "cbid": 211, "correlation": 195515 } }, { "ph": "s", "id": 195515, "pid": 76337, "tid": -914061504, "ts": 1716454224498612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224551604, "dur": 4, "args": { "External id": 195517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195517, "pid": 5, "tid": 7, "ts": 1716454224551604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498628, "dur": 6, "args": { "External id": 195517, "cbid": 211, "correlation": 195517 } }, { "ph": "s", "id": 195517, "pid": 76337, "tid": -914061504, "ts": 1716454224498628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224498638, "dur": 0, "args": { "External id": 195518, "cbid": 51, "correlation": 195518 } }, { "ph": "s", "id": 195518, "pid": 76337, "tid": -914061504, "ts": 1716454224498638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224551609, "dur": 57, "args": { "External id": 195519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195519, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 195519, "pid": 5, "tid": 7, "ts": 1716454224551609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498639, "dur": 5, "args": { "External id": 195519, "cbid": 211, "correlation": 195519 } }, { "ph": "s", "id": 195519, "pid": 76337, "tid": -914061504, "ts": 1716454224498639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224551667, "dur": 14, "args": { "External id": 195524, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195524, "pid": 5, "tid": 7, "ts": 1716454224551667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498664, "dur": 8, "args": { "External id": 195524, "cbid": 211, "correlation": 195524 } }, { "ph": "s", "id": 195524, "pid": 76337, "tid": -914061504, "ts": 1716454224498664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224551682, "dur": 12, "args": { "External id": 195532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195532, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195532, "pid": 5, "tid": 7, "ts": 1716454224551682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498694, "dur": 8, "args": { "External id": 195532, "cbid": 211, "correlation": 195532 } }, { "ph": "s", "id": 195532, "pid": 76337, "tid": -914061504, "ts": 1716454224498694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224551696, "dur": 10, "args": { "External id": 195540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195540, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195540, "pid": 5, "tid": 7, "ts": 1716454224551696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498722, "dur": 8, "args": { "External id": 195540, "cbid": 211, "correlation": 195540 } }, { "ph": "s", "id": 195540, "pid": 76337, "tid": -914061504, "ts": 1716454224498722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224551707, "dur": 19, "args": { "External id": 195560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195560, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 195560, "pid": 5, "tid": 7, "ts": 1716454224551707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498804, "dur": 12, "args": { "External id": 195560, "cbid": 211, "correlation": 195560 } }, { "ph": "s", "id": 195560, "pid": 76337, "tid": -914061504, "ts": 1716454224498804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224551727, "dur": 5, "args": { "External id": 195572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195572, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 195572, "pid": 5, "tid": 7, "ts": 1716454224551727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498826, "dur": 6, "args": { "External id": 195572, "cbid": 211, "correlation": 195572 } }, { "ph": "s", "id": 195572, "pid": 76337, "tid": -914061504, "ts": 1716454224498826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224551734, "dur": 17, "args": { "External id": 195575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195575, "pid": 5, "tid": 7, "ts": 1716454224551734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498844, "dur": 6, "args": { "External id": 195575, "cbid": 211, "correlation": 195575 } }, { "ph": "s", "id": 195575, "pid": 76337, "tid": -914061504, "ts": 1716454224498844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224498904, "dur": 0, "args": { "External id": 195586, "cbid": 317, "correlation": 195586 } }, { "ph": "f", "id": 195586, "pid": 76337, "tid": -914061504, "ts": 1716454224498904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224498904, "dur": 0, "args": { "External id": 195587, "cbid": 203, "correlation": 195587 } }, { "ph": "f", "id": 195587, "pid": 76337, "tid": -914061504, "ts": 1716454224498904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224498905, "dur": 0, "args": { "External id": 195588, "cbid": 205, "correlation": 195588 } }, { "ph": "f", "id": 195588, "pid": 76337, "tid": -914061504, "ts": 1716454224498905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224551752, "dur": 12, "args": { "External id": 195592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195592, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195592, "pid": 5, "tid": 7, "ts": 1716454224551752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498921, "dur": 12, "args": { "External id": 195592, "cbid": 211, "correlation": 195592 } }, { "ph": "s", "id": 195592, "pid": 76337, "tid": -914061504, "ts": 1716454224498921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224551765, "dur": 3, "args": { "External id": 195594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195594, "pid": 5, "tid": 7, "ts": 1716454224551765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498937, "dur": 9, "args": { "External id": 195594, "cbid": 211, "correlation": 195594 } }, { "ph": "s", "id": 195594, "pid": 76337, "tid": -914061504, "ts": 1716454224498937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224498950, "dur": 0, "args": { "External id": 195595, "cbid": 51, "correlation": 195595 } }, { "ph": "s", "id": 195595, "pid": 76337, "tid": -914061504, "ts": 1716454224498950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224551770, "dur": 97, "args": { "External id": 195596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195596, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 195596, "pid": 5, "tid": 7, "ts": 1716454224551770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498951, "dur": 6, "args": { "External id": 195596, "cbid": 211, "correlation": 195596 } }, { "ph": "s", "id": 195596, "pid": 76337, "tid": -914061504, "ts": 1716454224498951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224551868, "dur": 15, "args": { "External id": 195601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195601, "pid": 5, "tid": 7, "ts": 1716454224551868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224498987, "dur": 10, "args": { "External id": 195601, "cbid": 211, "correlation": 195601 } }, { "ph": "s", "id": 195601, "pid": 76337, "tid": -914061504, "ts": 1716454224498987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224551885, "dur": 84, "args": { "External id": 195610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195610, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195610, "pid": 5, "tid": 7, "ts": 1716454224551885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499070, "dur": 14, "args": { "External id": 195610, "cbid": 211, "correlation": 195610 } }, { "ph": "s", "id": 195610, "pid": 76337, "tid": -914061504, "ts": 1716454224499070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224551970, "dur": 31, "args": { "External id": 195632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195632, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195632, "pid": 5, "tid": 7, "ts": 1716454224551970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499126, "dur": 11, "args": { "External id": 195632, "cbid": 211, "correlation": 195632 } }, { "ph": "s", "id": 195632, "pid": 76337, "tid": -914061504, "ts": 1716454224499126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499219, "dur": 1, "args": { "External id": 195643, "cbid": 251, "correlation": 195643 } }, { "ph": "f", "id": 195643, "pid": 76337, "tid": -914061504, "ts": 1716454224499219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224552003, "dur": 162, "args": { "External id": 195644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195644, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195644, "pid": 5, "tid": 7, "ts": 1716454224552003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499224, "dur": 14, "args": { "External id": 195644, "cbid": 211, "correlation": 195644 } }, { "ph": "s", "id": 195644, "pid": 76337, "tid": -914061504, "ts": 1716454224499224, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499298, "dur": 1, "args": { "External id": 195655, "cbid": 251, "correlation": 195655 } }, { "ph": "f", "id": 195655, "pid": 76337, "tid": -914061504, "ts": 1716454224499298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224552166, "dur": 158, "args": { "External id": 195656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195656, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195656, "pid": 5, "tid": 7, "ts": 1716454224552166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499302, "dur": 11, "args": { "External id": 195656, "cbid": 211, "correlation": 195656 } }, { "ph": "s", "id": 195656, "pid": 76337, "tid": -914061504, "ts": 1716454224499302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499367, "dur": 1, "args": { "External id": 195667, "cbid": 251, "correlation": 195667 } }, { "ph": "f", "id": 195667, "pid": 76337, "tid": -914061504, "ts": 1716454224499367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224552325, "dur": 159, "args": { "External id": 195668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195668, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195668, "pid": 5, "tid": 7, "ts": 1716454224552325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499371, "dur": 11, "args": { "External id": 195668, "cbid": 211, "correlation": 195668 } }, { "ph": "s", "id": 195668, "pid": 76337, "tid": -914061504, "ts": 1716454224499371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224552485, "dur": 339, "args": { "External id": 195693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195693, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195693, "pid": 5, "tid": 7, "ts": 1716454224552485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499457, "dur": 13, "args": { "External id": 195693, "cbid": 211, "correlation": 195693 } }, { "ph": "s", "id": 195693, "pid": 76337, "tid": -914061504, "ts": 1716454224499457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499561, "dur": 1, "args": { "External id": 195711, "cbid": 251, "correlation": 195711 } }, { "ph": "f", "id": 195711, "pid": 76337, "tid": -914061504, "ts": 1716454224499561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224552825, "dur": 164, "args": { "External id": 195713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195713, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195713, "pid": 5, "tid": 7, "ts": 1716454224552825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499567, "dur": 13, "args": { "External id": 195713, "cbid": 211, "correlation": 195713 } }, { "ph": "s", "id": 195713, "pid": 76337, "tid": -914061504, "ts": 1716454224499567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224552991, "dur": 20, "args": { "External id": 195721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195721, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195721, "pid": 5, "tid": 7, "ts": 1716454224552991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499636, "dur": 13, "args": { "External id": 195721, "cbid": 211, "correlation": 195721 } }, { "ph": "s", "id": 195721, "pid": 76337, "tid": -914061504, "ts": 1716454224499636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224553012, "dur": 27, "args": { "External id": 195729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195729, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195729, "pid": 5, "tid": 7, "ts": 1716454224553012, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499675, "dur": 8, "args": { "External id": 195729, "cbid": 211, "correlation": 195729 } }, { "ph": "s", "id": 195729, "pid": 76337, "tid": -914061504, "ts": 1716454224499675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224553040, "dur": 19, "args": { "External id": 195740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195740, "pid": 5, "tid": 7, "ts": 1716454224553040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499748, "dur": 13, "args": { "External id": 195740, "cbid": 211, "correlation": 195740 } }, { "ph": "s", "id": 195740, "pid": 76337, "tid": -914061504, "ts": 1716454224499748, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224553061, "dur": 16, "args": { "External id": 195762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195762, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195762, "pid": 5, "tid": 7, "ts": 1716454224553061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499783, "dur": 8, "args": { "External id": 195762, "cbid": 211, "correlation": 195762 } }, { "ph": "s", "id": 195762, "pid": 76337, "tid": -914061504, "ts": 1716454224499783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499871, "dur": 2, "args": { "External id": 195773, "cbid": 251, "correlation": 195773 } }, { "ph": "f", "id": 195773, "pid": 76337, "tid": -914061504, "ts": 1716454224499871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224553078, "dur": 89, "args": { "External id": 195774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195774, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195774, "pid": 5, "tid": 7, "ts": 1716454224553078, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499878, "dur": 15, "args": { "External id": 195774, "cbid": 211, "correlation": 195774 } }, { "ph": "s", "id": 195774, "pid": 76337, "tid": -914061504, "ts": 1716454224499878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499953, "dur": 1, "args": { "External id": 195785, "cbid": 251, "correlation": 195785 } }, { "ph": "f", "id": 195785, "pid": 76337, "tid": -914061504, "ts": 1716454224499953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224499956, "dur": 0, "args": { "External id": 195786, "cbid": 251, "correlation": 195786 } }, { "ph": "f", "id": 195786, "pid": 76337, "tid": -914061504, "ts": 1716454224499956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224553168, "dur": 12, "args": { "External id": 195787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195787, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195787, "pid": 5, "tid": 7, "ts": 1716454224553168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499958, "dur": 12, "args": { "External id": 195787, "cbid": 211, "correlation": 195787 } }, { "ph": "s", "id": 195787, "pid": 76337, "tid": -914061504, "ts": 1716454224499958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224553182, "dur": 5, "args": { "External id": 195789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195789, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195789, "pid": 5, "tid": 7, "ts": 1716454224553182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224499971, "dur": 13, "args": { "External id": 195789, "cbid": 211, "correlation": 195789 } }, { "ph": "s", "id": 195789, "pid": 76337, "tid": -914061504, "ts": 1716454224499971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500039, "dur": 1, "args": { "External id": 195800, "cbid": 251, "correlation": 195800 } }, { "ph": "f", "id": 195800, "pid": 76337, "tid": -914061504, "ts": 1716454224500039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500043, "dur": 0, "args": { "External id": 195801, "cbid": 251, "correlation": 195801 } }, { "ph": "f", "id": 195801, "pid": 76337, "tid": -914061504, "ts": 1716454224500043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224553188, "dur": 8, "args": { "External id": 195802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195802, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195802, "pid": 5, "tid": 7, "ts": 1716454224553188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500045, "dur": 12, "args": { "External id": 195802, "cbid": 211, "correlation": 195802 } }, { "ph": "s", "id": 195802, "pid": 76337, "tid": -914061504, "ts": 1716454224500045, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224553198, "dur": 3, "args": { "External id": 195804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195804, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195804, "pid": 5, "tid": 7, "ts": 1716454224553198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500058, "dur": 5, "args": { "External id": 195804, "cbid": 211, "correlation": 195804 } }, { "ph": "s", "id": 195804, "pid": 76337, "tid": -914061504, "ts": 1716454224500058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224553203, "dur": 55, "args": { "External id": 195829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195829, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195829, "pid": 5, "tid": 7, "ts": 1716454224553203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500134, "dur": 13, "args": { "External id": 195829, "cbid": 211, "correlation": 195829 } }, { "ph": "s", "id": 195829, "pid": 76337, "tid": -914061504, "ts": 1716454224500134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500236, "dur": 2, "args": { "External id": 195847, "cbid": 251, "correlation": 195847 } }, { "ph": "f", "id": 195847, "pid": 76337, "tid": -914061504, "ts": 1716454224500236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224553259, "dur": 91, "args": { "External id": 195849, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195849, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195849, "pid": 5, "tid": 7, "ts": 1716454224553259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500244, "dur": 15, "args": { "External id": 195849, "cbid": 211, "correlation": 195849 } }, { "ph": "s", "id": 195849, "pid": 76337, "tid": -914061504, "ts": 1716454224500244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224553352, "dur": 10, "args": { "External id": 195857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195857, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195857, "pid": 5, "tid": 7, "ts": 1716454224553352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500315, "dur": 12, "args": { "External id": 195857, "cbid": 211, "correlation": 195857 } }, { "ph": "s", "id": 195857, "pid": 76337, "tid": -914061504, "ts": 1716454224500315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224553363, "dur": 20, "args": { "External id": 195865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195865, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195865, "pid": 5, "tid": 7, "ts": 1716454224553363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500357, "dur": 9, "args": { "External id": 195865, "cbid": 211, "correlation": 195865 } }, { "ph": "s", "id": 195865, "pid": 76337, "tid": -914061504, "ts": 1716454224500357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224553385, "dur": 17, "args": { "External id": 195887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195887, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195887, "pid": 5, "tid": 7, "ts": 1716454224553385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500411, "dur": 10, "args": { "External id": 195887, "cbid": 211, "correlation": 195887 } }, { "ph": "s", "id": 195887, "pid": 76337, "tid": -914061504, "ts": 1716454224500411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500505, "dur": 2, "args": { "External id": 195903, "cbid": 251, "correlation": 195903 } }, { "ph": "f", "id": 195903, "pid": 76337, "tid": -914061504, "ts": 1716454224500505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500510, "dur": 0, "args": { "External id": 195905, "cbid": 251, "correlation": 195905 } }, { "ph": "f", "id": 195905, "pid": 76337, "tid": -914061504, "ts": 1716454224500510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224553403, "dur": 497, "args": { "External id": 195906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195906, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 195906, "pid": 5, "tid": 7, "ts": 1716454224553403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500513, "dur": 15, "args": { "External id": 195906, "cbid": 211, "correlation": 195906 } }, { "ph": "s", "id": 195906, "pid": 76337, "tid": -914061504, "ts": 1716454224500513, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224553901, "dur": 66, "args": { "External id": 195914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195914, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195914, "pid": 5, "tid": 7, "ts": 1716454224553901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500581, "dur": 13, "args": { "External id": 195914, "cbid": 211, "correlation": 195914 } }, { "ph": "s", "id": 195914, "pid": 76337, "tid": -914061504, "ts": 1716454224500581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224553969, "dur": 69, "args": { "External id": 195922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195922, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195922, "pid": 5, "tid": 7, "ts": 1716454224553969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500612, "dur": 8, "args": { "External id": 195922, "cbid": 211, "correlation": 195922 } }, { "ph": "s", "id": 195922, "pid": 76337, "tid": -914061504, "ts": 1716454224500612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224500693, "dur": 1, "args": { "External id": 195938, "cbid": 251, "correlation": 195938 } }, { "ph": "f", "id": 195938, "pid": 76337, "tid": -914061504, "ts": 1716454224500693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224554040, "dur": 1, "args": { "External id": 195940, "device": 5, "context": 1, "stream": 7, "correlation": 195940, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 195940, "pid": 5, "tid": 7, "ts": 1716454224554040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224500698, "dur": 14, "args": { "External id": 195940, "cbid": 51, "correlation": 195940 } }, { "ph": "s", "id": 195940, "pid": 76337, "tid": -914061504, "ts": 1716454224500698, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224554044, "dur": 271, "args": { "External id": 195941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195941, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195941, "pid": 5, "tid": 7, "ts": 1716454224554044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500714, "dur": 11, "args": { "External id": 195941, "cbid": 211, "correlation": 195941 } }, { "ph": "s", "id": 195941, "pid": 76337, "tid": -914061504, "ts": 1716454224500714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224554316, "dur": 16, "args": { "External id": 195949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195949, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195949, "pid": 5, "tid": 7, "ts": 1716454224554316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500757, "dur": 10, "args": { "External id": 195949, "cbid": 211, "correlation": 195949 } }, { "ph": "s", "id": 195949, "pid": 76337, "tid": -914061504, "ts": 1716454224500757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224554333, "dur": 38, "args": { "External id": 195960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195960, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195960, "pid": 5, "tid": 7, "ts": 1716454224554333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500826, "dur": 12, "args": { "External id": 195960, "cbid": 211, "correlation": 195960 } }, { "ph": "s", "id": 195960, "pid": 76337, "tid": -914061504, "ts": 1716454224500826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224500893, "dur": 0, "args": { "External id": 195972, "cbid": 317, "correlation": 195972 } }, { "ph": "f", "id": 195972, "pid": 76337, "tid": -914061504, "ts": 1716454224500893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224500893, "dur": 0, "args": { "External id": 195973, "cbid": 203, "correlation": 195973 } }, { "ph": "f", "id": 195973, "pid": 76337, "tid": -914061504, "ts": 1716454224500893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224500894, "dur": 0, "args": { "External id": 195974, "cbid": 205, "correlation": 195974 } }, { "ph": "f", "id": 195974, "pid": 76337, "tid": -914061504, "ts": 1716454224500894, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224554373, "dur": 13, "args": { "External id": 195978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195978, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195978, "pid": 5, "tid": 7, "ts": 1716454224554373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500914, "dur": 12, "args": { "External id": 195978, "cbid": 211, "correlation": 195978 } }, { "ph": "s", "id": 195978, "pid": 76337, "tid": -914061504, "ts": 1716454224500914, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224554387, "dur": 4, "args": { "External id": 195980, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 195980, "pid": 5, "tid": 7, "ts": 1716454224554387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500931, "dur": 6, "args": { "External id": 195980, "cbid": 211, "correlation": 195980 } }, { "ph": "s", "id": 195980, "pid": 76337, "tid": -914061504, "ts": 1716454224500931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224500940, "dur": 0, "args": { "External id": 195981, "cbid": 51, "correlation": 195981 } }, { "ph": "s", "id": 195981, "pid": 76337, "tid": -914061504, "ts": 1716454224500940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224554392, "dur": 97, "args": { "External id": 195982, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195982, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 195982, "pid": 5, "tid": 7, "ts": 1716454224554392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500941, "dur": 5, "args": { "External id": 195982, "cbid": 211, "correlation": 195982 } }, { "ph": "s", "id": 195982, "pid": 76337, "tid": -914061504, "ts": 1716454224500941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224554491, "dur": 16, "args": { "External id": 195987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195987, "pid": 5, "tid": 7, "ts": 1716454224554491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224500969, "dur": 17, "args": { "External id": 195987, "cbid": 211, "correlation": 195987 } }, { "ph": "s", "id": 195987, "pid": 76337, "tid": -914061504, "ts": 1716454224500969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224554508, "dur": 11, "args": { "External id": 195995, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 195995, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 195995, "pid": 5, "tid": 7, "ts": 1716454224554508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501011, "dur": 9, "args": { "External id": 195995, "cbid": 211, "correlation": 195995 } }, { "ph": "s", "id": 195995, "pid": 76337, "tid": -914061504, "ts": 1716454224501011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224554521, "dur": 19, "args": { "External id": 196015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196015, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196015, "pid": 5, "tid": 7, "ts": 1716454224554521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501091, "dur": 13, "args": { "External id": 196015, "cbid": 211, "correlation": 196015 } }, { "ph": "s", "id": 196015, "pid": 76337, "tid": -914061504, "ts": 1716454224501091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224554542, "dur": 4, "args": { "External id": 196027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196027, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196027, "pid": 5, "tid": 7, "ts": 1716454224554542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501116, "dur": 7, "args": { "External id": 196027, "cbid": 211, "correlation": 196027 } }, { "ph": "s", "id": 196027, "pid": 76337, "tid": -914061504, "ts": 1716454224501116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224554547, "dur": 19, "args": { "External id": 196030, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196030, "pid": 5, "tid": 7, "ts": 1716454224554547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501135, "dur": 7, "args": { "External id": 196030, "cbid": 211, "correlation": 196030 } }, { "ph": "s", "id": 196030, "pid": 76337, "tid": -914061504, "ts": 1716454224501135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224554567, "dur": 11, "args": { "External id": 196039, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196039, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196039, "pid": 5, "tid": 7, "ts": 1716454224554567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501175, "dur": 10, "args": { "External id": 196039, "cbid": 211, "correlation": 196039 } }, { "ph": "s", "id": 196039, "pid": 76337, "tid": -914061504, "ts": 1716454224501175, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224501228, "dur": 0, "args": { "External id": 196049, "cbid": 317, "correlation": 196049 } }, { "ph": "f", "id": 196049, "pid": 76337, "tid": -914061504, "ts": 1716454224501228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224501229, "dur": 0, "args": { "External id": 196050, "cbid": 203, "correlation": 196050 } }, { "ph": "f", "id": 196050, "pid": 76337, "tid": -914061504, "ts": 1716454224501229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224501229, "dur": 0, "args": { "External id": 196051, "cbid": 205, "correlation": 196051 } }, { "ph": "f", "id": 196051, "pid": 76337, "tid": -914061504, "ts": 1716454224501229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224554580, "dur": 11, "args": { "External id": 196055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196055, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196055, "pid": 5, "tid": 7, "ts": 1716454224554580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501243, "dur": 12, "args": { "External id": 196055, "cbid": 211, "correlation": 196055 } }, { "ph": "s", "id": 196055, "pid": 76337, "tid": -914061504, "ts": 1716454224501243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224554592, "dur": 162, "args": { "External id": 196057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196057, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196057, "pid": 5, "tid": 7, "ts": 1716454224554592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501258, "dur": 5, "args": { "External id": 196057, "cbid": 211, "correlation": 196057 } }, { "ph": "s", "id": 196057, "pid": 76337, "tid": -914061504, "ts": 1716454224501258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224554757, "dur": 1, "args": { "External id": 196059, "device": 5, "context": 1, "stream": 7, "correlation": 196059, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 196059, "pid": 5, "tid": 7, "ts": 1716454224554757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224501270, "dur": 7, "args": { "External id": 196059, "cbid": 51, "correlation": 196059 } }, { "ph": "s", "id": 196059, "pid": 76337, "tid": -914061504, "ts": 1716454224501270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224554761, "dur": 664, "args": { "External id": 196060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196060, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196060, "pid": 5, "tid": 7, "ts": 1716454224554761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501278, "dur": 6, "args": { "External id": 196060, "cbid": 211, "correlation": 196060 } }, { "ph": "s", "id": 196060, "pid": 76337, "tid": -914061504, "ts": 1716454224501278, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224555426, "dur": 13, "args": { "External id": 196062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196062, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196062, "pid": 5, "tid": 7, "ts": 1716454224555426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501289, "dur": 5, "args": { "External id": 196062, "cbid": 211, "correlation": 196062 } }, { "ph": "s", "id": 196062, "pid": 76337, "tid": -914061504, "ts": 1716454224501289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224555441, "dur": 15, "args": { "External id": 196068, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196068, "pid": 5, "tid": 7, "ts": 1716454224555441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501318, "dur": 9, "args": { "External id": 196068, "cbid": 211, "correlation": 196068 } }, { "ph": "s", "id": 196068, "pid": 76337, "tid": -914061504, "ts": 1716454224501318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224555457, "dur": 3, "args": { "External id": 196076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196076, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 196076, "pid": 5, "tid": 7, "ts": 1716454224555457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501362, "dur": 9, "args": { "External id": 196076, "cbid": 211, "correlation": 196076 } }, { "ph": "s", "id": 196076, "pid": 76337, "tid": -914061504, "ts": 1716454224501362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224501430, "dur": 1, "args": { "External id": 196092, "cbid": 251, "correlation": 196092 } }, { "ph": "f", "id": 196092, "pid": 76337, "tid": -914061504, "ts": 1716454224501430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224501435, "dur": 0, "args": { "External id": 196094, "cbid": 251, "correlation": 196094 } }, { "ph": "f", "id": 196094, "pid": 76337, "tid": -914061504, "ts": 1716454224501435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224555462, "dur": 13, "args": { "External id": 196095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196095, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196095, "pid": 5, "tid": 7, "ts": 1716454224555462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501437, "dur": 12, "args": { "External id": 196095, "cbid": 211, "correlation": 196095 } }, { "ph": "s", "id": 196095, "pid": 76337, "tid": -914061504, "ts": 1716454224501437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224555477, "dur": 5, "args": { "External id": 196097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196097, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196097, "pid": 5, "tid": 7, "ts": 1716454224555477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501452, "dur": 6, "args": { "External id": 196097, "cbid": 211, "correlation": 196097 } }, { "ph": "s", "id": 196097, "pid": 76337, "tid": -914061504, "ts": 1716454224501452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224555483, "dur": 17, "args": { "External id": 196107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196107, "pid": 5, "tid": 7, "ts": 1716454224555483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501510, "dur": 12, "args": { "External id": 196107, "cbid": 211, "correlation": 196107 } }, { "ph": "s", "id": 196107, "pid": 76337, "tid": -914061504, "ts": 1716454224501510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224555502, "dur": 18, "args": { "External id": 196127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196127, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196127, "pid": 5, "tid": 7, "ts": 1716454224555502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501577, "dur": 10, "args": { "External id": 196127, "cbid": 211, "correlation": 196127 } }, { "ph": "s", "id": 196127, "pid": 76337, "tid": -914061504, "ts": 1716454224501577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224555521, "dur": 4, "args": { "External id": 196139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196139, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196139, "pid": 5, "tid": 7, "ts": 1716454224555521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501597, "dur": 7, "args": { "External id": 196139, "cbid": 211, "correlation": 196139 } }, { "ph": "s", "id": 196139, "pid": 76337, "tid": -914061504, "ts": 1716454224501597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224555526, "dur": 16, "args": { "External id": 196142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196142, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196142, "pid": 5, "tid": 7, "ts": 1716454224555526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501616, "dur": 6, "args": { "External id": 196142, "cbid": 211, "correlation": 196142 } }, { "ph": "s", "id": 196142, "pid": 76337, "tid": -914061504, "ts": 1716454224501616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224555544, "dur": 11, "args": { "External id": 196151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196151, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196151, "pid": 5, "tid": 7, "ts": 1716454224555544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501656, "dur": 9, "args": { "External id": 196151, "cbid": 211, "correlation": 196151 } }, { "ph": "s", "id": 196151, "pid": 76337, "tid": -914061504, "ts": 1716454224501656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224501718, "dur": 0, "args": { "External id": 196161, "cbid": 317, "correlation": 196161 } }, { "ph": "f", "id": 196161, "pid": 76337, "tid": -914061504, "ts": 1716454224501718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224501719, "dur": 0, "args": { "External id": 196162, "cbid": 203, "correlation": 196162 } }, { "ph": "f", "id": 196162, "pid": 76337, "tid": -914061504, "ts": 1716454224501719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224501719, "dur": 0, "args": { "External id": 196163, "cbid": 205, "correlation": 196163 } }, { "ph": "f", "id": 196163, "pid": 76337, "tid": -914061504, "ts": 1716454224501719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224555556, "dur": 12, "args": { "External id": 196167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196167, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196167, "pid": 5, "tid": 7, "ts": 1716454224555556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501733, "dur": 12, "args": { "External id": 196167, "cbid": 211, "correlation": 196167 } }, { "ph": "s", "id": 196167, "pid": 76337, "tid": -914061504, "ts": 1716454224501733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224555569, "dur": 162, "args": { "External id": 196169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196169, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196169, "pid": 5, "tid": 7, "ts": 1716454224555569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501747, "dur": 6, "args": { "External id": 196169, "cbid": 211, "correlation": 196169 } }, { "ph": "s", "id": 196169, "pid": 76337, "tid": -914061504, "ts": 1716454224501747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224555733, "dur": 1, "args": { "External id": 196171, "device": 5, "context": 1, "stream": 7, "correlation": 196171, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 196171, "pid": 5, "tid": 7, "ts": 1716454224555733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224501759, "dur": 6, "args": { "External id": 196171, "cbid": 51, "correlation": 196171 } }, { "ph": "s", "id": 196171, "pid": 76337, "tid": -914061504, "ts": 1716454224501759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224555737, "dur": 648, "args": { "External id": 196172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196172, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196172, "pid": 5, "tid": 7, "ts": 1716454224555737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501766, "dur": 6, "args": { "External id": 196172, "cbid": 211, "correlation": 196172 } }, { "ph": "s", "id": 196172, "pid": 76337, "tid": -914061504, "ts": 1716454224501766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224556386, "dur": 13, "args": { "External id": 196174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196174, "pid": 5, "tid": 7, "ts": 1716454224556386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501778, "dur": 5, "args": { "External id": 196174, "cbid": 211, "correlation": 196174 } }, { "ph": "s", "id": 196174, "pid": 76337, "tid": -914061504, "ts": 1716454224501778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224556400, "dur": 15, "args": { "External id": 196180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196180, "pid": 5, "tid": 7, "ts": 1716454224556400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501806, "dur": 9, "args": { "External id": 196180, "cbid": 211, "correlation": 196180 } }, { "ph": "s", "id": 196180, "pid": 76337, "tid": -914061504, "ts": 1716454224501806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224556416, "dur": 12, "args": { "External id": 196188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196188, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196188, "pid": 5, "tid": 7, "ts": 1716454224556416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501838, "dur": 8, "args": { "External id": 196188, "cbid": 211, "correlation": 196188 } }, { "ph": "s", "id": 196188, "pid": 76337, "tid": -914061504, "ts": 1716454224501838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224556429, "dur": 10, "args": { "External id": 196196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196196, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196196, "pid": 5, "tid": 7, "ts": 1716454224556429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501868, "dur": 8, "args": { "External id": 196196, "cbid": 211, "correlation": 196196 } }, { "ph": "s", "id": 196196, "pid": 76337, "tid": -914061504, "ts": 1716454224501868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224556441, "dur": 18, "args": { "External id": 196216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196216, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196216, "pid": 5, "tid": 7, "ts": 1716454224556441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501950, "dur": 13, "args": { "External id": 196216, "cbid": 211, "correlation": 196216 } }, { "ph": "s", "id": 196216, "pid": 76337, "tid": -914061504, "ts": 1716454224501950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224556460, "dur": 4, "args": { "External id": 196228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196228, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196228, "pid": 5, "tid": 7, "ts": 1716454224556460, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224501982, "dur": 7, "args": { "External id": 196228, "cbid": 211, "correlation": 196228 } }, { "ph": "s", "id": 196228, "pid": 76337, "tid": -914061504, "ts": 1716454224501982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224556466, "dur": 16, "args": { "External id": 196231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196231, "pid": 5, "tid": 7, "ts": 1716454224556466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502000, "dur": 6, "args": { "External id": 196231, "cbid": 211, "correlation": 196231 } }, { "ph": "s", "id": 196231, "pid": 76337, "tid": -914061504, "ts": 1716454224502000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224502058, "dur": 0, "args": { "External id": 196242, "cbid": 317, "correlation": 196242 } }, { "ph": "f", "id": 196242, "pid": 76337, "tid": -914061504, "ts": 1716454224502058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224502059, "dur": 0, "args": { "External id": 196243, "cbid": 203, "correlation": 196243 } }, { "ph": "f", "id": 196243, "pid": 76337, "tid": -914061504, "ts": 1716454224502059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224502060, "dur": 0, "args": { "External id": 196244, "cbid": 205, "correlation": 196244 } }, { "ph": "f", "id": 196244, "pid": 76337, "tid": -914061504, "ts": 1716454224502060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224556483, "dur": 12, "args": { "External id": 196248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196248, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196248, "pid": 5, "tid": 7, "ts": 1716454224556483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502074, "dur": 12, "args": { "External id": 196248, "cbid": 211, "correlation": 196248 } }, { "ph": "s", "id": 196248, "pid": 76337, "tid": -914061504, "ts": 1716454224502074, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224556496, "dur": 4, "args": { "External id": 196250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 196250, "pid": 5, "tid": 7, "ts": 1716454224556496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502090, "dur": 6, "args": { "External id": 196250, "cbid": 211, "correlation": 196250 } }, { "ph": "s", "id": 196250, "pid": 76337, "tid": -914061504, "ts": 1716454224502090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224502099, "dur": 0, "args": { "External id": 196251, "cbid": 51, "correlation": 196251 } }, { "ph": "s", "id": 196251, "pid": 76337, "tid": -914061504, "ts": 1716454224502099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224556501, "dur": 95, "args": { "External id": 196252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196252, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 196252, "pid": 5, "tid": 7, "ts": 1716454224556501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502099, "dur": 5, "args": { "External id": 196252, "cbid": 211, "correlation": 196252 } }, { "ph": "s", "id": 196252, "pid": 76337, "tid": -914061504, "ts": 1716454224502099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224556598, "dur": 16, "args": { "External id": 196257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196257, "pid": 5, "tid": 7, "ts": 1716454224556598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502127, "dur": 8, "args": { "External id": 196257, "cbid": 211, "correlation": 196257 } }, { "ph": "s", "id": 196257, "pid": 76337, "tid": -914061504, "ts": 1716454224502127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224556615, "dur": 83, "args": { "External id": 196266, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196266, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196266, "pid": 5, "tid": 7, "ts": 1716454224556615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502213, "dur": 14, "args": { "External id": 196266, "cbid": 211, "correlation": 196266 } }, { "ph": "s", "id": 196266, "pid": 76337, "tid": -914061504, "ts": 1716454224502213, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224556698, "dur": 31, "args": { "External id": 196288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196288, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196288, "pid": 5, "tid": 7, "ts": 1716454224556698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502273, "dur": 10, "args": { "External id": 196288, "cbid": 211, "correlation": 196288 } }, { "ph": "s", "id": 196288, "pid": 76337, "tid": -914061504, "ts": 1716454224502273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224502367, "dur": 1, "args": { "External id": 196299, "cbid": 251, "correlation": 196299 } }, { "ph": "f", "id": 196299, "pid": 76337, "tid": -914061504, "ts": 1716454224502367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224556731, "dur": 165, "args": { "External id": 196300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196300, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196300, "pid": 5, "tid": 7, "ts": 1716454224556731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502373, "dur": 13, "args": { "External id": 196300, "cbid": 211, "correlation": 196300 } }, { "ph": "s", "id": 196300, "pid": 76337, "tid": -914061504, "ts": 1716454224502373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224502446, "dur": 1, "args": { "External id": 196311, "cbid": 251, "correlation": 196311 } }, { "ph": "f", "id": 196311, "pid": 76337, "tid": -914061504, "ts": 1716454224502446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224556897, "dur": 158, "args": { "External id": 196312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196312, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196312, "pid": 5, "tid": 7, "ts": 1716454224556897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502450, "dur": 12, "args": { "External id": 196312, "cbid": 211, "correlation": 196312 } }, { "ph": "s", "id": 196312, "pid": 76337, "tid": -914061504, "ts": 1716454224502450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224502517, "dur": 1, "args": { "External id": 196323, "cbid": 251, "correlation": 196323 } }, { "ph": "f", "id": 196323, "pid": 76337, "tid": -914061504, "ts": 1716454224502517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224557057, "dur": 157, "args": { "External id": 196324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196324, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196324, "pid": 5, "tid": 7, "ts": 1716454224557057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502521, "dur": 14, "args": { "External id": 196324, "cbid": 211, "correlation": 196324 } }, { "ph": "s", "id": 196324, "pid": 76337, "tid": -914061504, "ts": 1716454224502521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224557215, "dur": 336, "args": { "External id": 196349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196349, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196349, "pid": 5, "tid": 7, "ts": 1716454224557215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502612, "dur": 13, "args": { "External id": 196349, "cbid": 211, "correlation": 196349 } }, { "ph": "s", "id": 196349, "pid": 76337, "tid": -914061504, "ts": 1716454224502612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224502717, "dur": 1, "args": { "External id": 196367, "cbid": 251, "correlation": 196367 } }, { "ph": "f", "id": 196367, "pid": 76337, "tid": -914061504, "ts": 1716454224502717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224557553, "dur": 167, "args": { "External id": 196369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196369, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196369, "pid": 5, "tid": 7, "ts": 1716454224557553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502723, "dur": 13, "args": { "External id": 196369, "cbid": 211, "correlation": 196369 } }, { "ph": "s", "id": 196369, "pid": 76337, "tid": -914061504, "ts": 1716454224502723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224557721, "dur": 20, "args": { "External id": 196377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196377, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196377, "pid": 5, "tid": 7, "ts": 1716454224557721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502793, "dur": 12, "args": { "External id": 196377, "cbid": 211, "correlation": 196377 } }, { "ph": "s", "id": 196377, "pid": 76337, "tid": -914061504, "ts": 1716454224502793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224557742, "dur": 27, "args": { "External id": 196385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196385, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196385, "pid": 5, "tid": 7, "ts": 1716454224557742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502832, "dur": 9, "args": { "External id": 196385, "cbid": 211, "correlation": 196385 } }, { "ph": "s", "id": 196385, "pid": 76337, "tid": -914061504, "ts": 1716454224502832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224557771, "dur": 19, "args": { "External id": 196396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196396, "pid": 5, "tid": 7, "ts": 1716454224557771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502905, "dur": 14, "args": { "External id": 196396, "cbid": 211, "correlation": 196396 } }, { "ph": "s", "id": 196396, "pid": 76337, "tid": -914061504, "ts": 1716454224502905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224557791, "dur": 16, "args": { "External id": 196418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196418, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196418, "pid": 5, "tid": 7, "ts": 1716454224557791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224502936, "dur": 7, "args": { "External id": 196418, "cbid": 211, "correlation": 196418 } }, { "ph": "s", "id": 196418, "pid": 76337, "tid": -914061504, "ts": 1716454224502936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503031, "dur": 1, "args": { "External id": 196429, "cbid": 251, "correlation": 196429 } }, { "ph": "f", "id": 196429, "pid": 76337, "tid": -914061504, "ts": 1716454224503031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224557808, "dur": 89, "args": { "External id": 196430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196430, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 196430, "pid": 5, "tid": 7, "ts": 1716454224557808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503037, "dur": 13, "args": { "External id": 196430, "cbid": 211, "correlation": 196430 } }, { "ph": "s", "id": 196430, "pid": 76337, "tid": -914061504, "ts": 1716454224503037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503111, "dur": 1, "args": { "External id": 196441, "cbid": 251, "correlation": 196441 } }, { "ph": "f", "id": 196441, "pid": 76337, "tid": -914061504, "ts": 1716454224503111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503115, "dur": 0, "args": { "External id": 196442, "cbid": 251, "correlation": 196442 } }, { "ph": "f", "id": 196442, "pid": 76337, "tid": -914061504, "ts": 1716454224503115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224557898, "dur": 12, "args": { "External id": 196443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196443, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196443, "pid": 5, "tid": 7, "ts": 1716454224557898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503117, "dur": 12, "args": { "External id": 196443, "cbid": 211, "correlation": 196443 } }, { "ph": "s", "id": 196443, "pid": 76337, "tid": -914061504, "ts": 1716454224503117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224557912, "dur": 6, "args": { "External id": 196445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196445, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196445, "pid": 5, "tid": 7, "ts": 1716454224557912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503130, "dur": 6, "args": { "External id": 196445, "cbid": 211, "correlation": 196445 } }, { "ph": "s", "id": 196445, "pid": 76337, "tid": -914061504, "ts": 1716454224503130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503189, "dur": 1, "args": { "External id": 196456, "cbid": 251, "correlation": 196456 } }, { "ph": "f", "id": 196456, "pid": 76337, "tid": -914061504, "ts": 1716454224503189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503192, "dur": 0, "args": { "External id": 196457, "cbid": 251, "correlation": 196457 } }, { "ph": "f", "id": 196457, "pid": 76337, "tid": -914061504, "ts": 1716454224503192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224557919, "dur": 8, "args": { "External id": 196458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196458, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196458, "pid": 5, "tid": 7, "ts": 1716454224557919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503194, "dur": 11, "args": { "External id": 196458, "cbid": 211, "correlation": 196458 } }, { "ph": "s", "id": 196458, "pid": 76337, "tid": -914061504, "ts": 1716454224503194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224557929, "dur": 4, "args": { "External id": 196460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196460, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196460, "pid": 5, "tid": 7, "ts": 1716454224557929, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503207, "dur": 6, "args": { "External id": 196460, "cbid": 211, "correlation": 196460 } }, { "ph": "s", "id": 196460, "pid": 76337, "tid": -914061504, "ts": 1716454224503207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224557933, "dur": 54, "args": { "External id": 196485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196485, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196485, "pid": 5, "tid": 7, "ts": 1716454224557933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503283, "dur": 12, "args": { "External id": 196485, "cbid": 211, "correlation": 196485 } }, { "ph": "s", "id": 196485, "pid": 76337, "tid": -914061504, "ts": 1716454224503283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503381, "dur": 1, "args": { "External id": 196503, "cbid": 251, "correlation": 196503 } }, { "ph": "f", "id": 196503, "pid": 76337, "tid": -914061504, "ts": 1716454224503381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224557989, "dur": 91, "args": { "External id": 196505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196505, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 196505, "pid": 5, "tid": 7, "ts": 1716454224557989, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503387, "dur": 14, "args": { "External id": 196505, "cbid": 211, "correlation": 196505 } }, { "ph": "s", "id": 196505, "pid": 76337, "tid": -914061504, "ts": 1716454224503387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224558082, "dur": 10, "args": { "External id": 196513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196513, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196513, "pid": 5, "tid": 7, "ts": 1716454224558082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503456, "dur": 13, "args": { "External id": 196513, "cbid": 211, "correlation": 196513 } }, { "ph": "s", "id": 196513, "pid": 76337, "tid": -914061504, "ts": 1716454224503456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224558093, "dur": 20, "args": { "External id": 196521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196521, "pid": 5, "tid": 7, "ts": 1716454224558093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503497, "dur": 9, "args": { "External id": 196521, "cbid": 211, "correlation": 196521 } }, { "ph": "s", "id": 196521, "pid": 76337, "tid": -914061504, "ts": 1716454224503497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224558114, "dur": 17, "args": { "External id": 196543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196543, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196543, "pid": 5, "tid": 7, "ts": 1716454224558114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503551, "dur": 11, "args": { "External id": 196543, "cbid": 211, "correlation": 196543 } }, { "ph": "s", "id": 196543, "pid": 76337, "tid": -914061504, "ts": 1716454224503551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503643, "dur": 1, "args": { "External id": 196559, "cbid": 251, "correlation": 196559 } }, { "ph": "f", "id": 196559, "pid": 76337, "tid": -914061504, "ts": 1716454224503643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503648, "dur": 0, "args": { "External id": 196561, "cbid": 251, "correlation": 196561 } }, { "ph": "f", "id": 196561, "pid": 76337, "tid": -914061504, "ts": 1716454224503648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224558133, "dur": 494, "args": { "External id": 196562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196562, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196562, "pid": 5, "tid": 7, "ts": 1716454224558133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503650, "dur": 15, "args": { "External id": 196562, "cbid": 211, "correlation": 196562 } }, { "ph": "s", "id": 196562, "pid": 76337, "tid": -914061504, "ts": 1716454224503650, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224558628, "dur": 66, "args": { "External id": 196570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196570, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196570, "pid": 5, "tid": 7, "ts": 1716454224558628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503722, "dur": 13, "args": { "External id": 196570, "cbid": 211, "correlation": 196570 } }, { "ph": "s", "id": 196570, "pid": 76337, "tid": -914061504, "ts": 1716454224503722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224558696, "dur": 69, "args": { "External id": 196578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196578, "pid": 5, "tid": 7, "ts": 1716454224558696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503754, "dur": 9, "args": { "External id": 196578, "cbid": 211, "correlation": 196578 } }, { "ph": "s", "id": 196578, "pid": 76337, "tid": -914061504, "ts": 1716454224503754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224503834, "dur": 1, "args": { "External id": 196594, "cbid": 251, "correlation": 196594 } }, { "ph": "f", "id": 196594, "pid": 76337, "tid": -914061504, "ts": 1716454224503834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224558767, "dur": 1, "args": { "External id": 196596, "device": 5, "context": 1, "stream": 7, "correlation": 196596, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 196596, "pid": 5, "tid": 7, "ts": 1716454224558767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224503839, "dur": 10, "args": { "External id": 196596, "cbid": 51, "correlation": 196596 } }, { "ph": "s", "id": 196596, "pid": 76337, "tid": -914061504, "ts": 1716454224503839, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224558770, "dur": 272, "args": { "External id": 196597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196597, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 196597, "pid": 5, "tid": 7, "ts": 1716454224558770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503850, "dur": 12, "args": { "External id": 196597, "cbid": 211, "correlation": 196597 } }, { "ph": "s", "id": 196597, "pid": 76337, "tid": -914061504, "ts": 1716454224503850, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224559044, "dur": 14, "args": { "External id": 196605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196605, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196605, "pid": 5, "tid": 7, "ts": 1716454224559044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503892, "dur": 10, "args": { "External id": 196605, "cbid": 211, "correlation": 196605 } }, { "ph": "s", "id": 196605, "pid": 76337, "tid": -914061504, "ts": 1716454224503892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224559059, "dur": 38, "args": { "External id": 196616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196616, "pid": 5, "tid": 7, "ts": 1716454224559059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224503961, "dur": 20, "args": { "External id": 196616, "cbid": 211, "correlation": 196616 } }, { "ph": "s", "id": 196616, "pid": 76337, "tid": -914061504, "ts": 1716454224503961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224504034, "dur": 0, "args": { "External id": 196628, "cbid": 317, "correlation": 196628 } }, { "ph": "f", "id": 196628, "pid": 76337, "tid": -914061504, "ts": 1716454224504034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224504035, "dur": 0, "args": { "External id": 196629, "cbid": 203, "correlation": 196629 } }, { "ph": "f", "id": 196629, "pid": 76337, "tid": -914061504, "ts": 1716454224504035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224504036, "dur": 0, "args": { "External id": 196630, "cbid": 205, "correlation": 196630 } }, { "ph": "f", "id": 196630, "pid": 76337, "tid": -914061504, "ts": 1716454224504036, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559098, "dur": 14, "args": { "External id": 196634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196634, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196634, "pid": 5, "tid": 7, "ts": 1716454224559098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504051, "dur": 13, "args": { "External id": 196634, "cbid": 211, "correlation": 196634 } }, { "ph": "s", "id": 196634, "pid": 76337, "tid": -914061504, "ts": 1716454224504051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224559114, "dur": 4, "args": { "External id": 196636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 196636, "pid": 5, "tid": 7, "ts": 1716454224559114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504070, "dur": 6, "args": { "External id": 196636, "cbid": 211, "correlation": 196636 } }, { "ph": "s", "id": 196636, "pid": 76337, "tid": -914061504, "ts": 1716454224504070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224504078, "dur": 0, "args": { "External id": 196637, "cbid": 51, "correlation": 196637 } }, { "ph": "s", "id": 196637, "pid": 76337, "tid": -914061504, "ts": 1716454224504078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224559119, "dur": 98, "args": { "External id": 196638, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196638, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 196638, "pid": 5, "tid": 7, "ts": 1716454224559119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504079, "dur": 5, "args": { "External id": 196638, "cbid": 211, "correlation": 196638 } }, { "ph": "s", "id": 196638, "pid": 76337, "tid": -914061504, "ts": 1716454224504079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224559219, "dur": 16, "args": { "External id": 196643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196643, "pid": 5, "tid": 7, "ts": 1716454224559219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504107, "dur": 9, "args": { "External id": 196643, "cbid": 211, "correlation": 196643 } }, { "ph": "s", "id": 196643, "pid": 76337, "tid": -914061504, "ts": 1716454224504107, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224559236, "dur": 11, "args": { "External id": 196651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196651, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196651, "pid": 5, "tid": 7, "ts": 1716454224559236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504139, "dur": 8, "args": { "External id": 196651, "cbid": 211, "correlation": 196651 } }, { "ph": "s", "id": 196651, "pid": 76337, "tid": -914061504, "ts": 1716454224504139, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224504211, "dur": 0, "args": { "External id": 196661, "cbid": 317, "correlation": 196661 } }, { "ph": "f", "id": 196661, "pid": 76337, "tid": -914061504, "ts": 1716454224504211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224504212, "dur": 0, "args": { "External id": 196662, "cbid": 203, "correlation": 196662 } }, { "ph": "f", "id": 196662, "pid": 76337, "tid": -914061504, "ts": 1716454224504212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224504212, "dur": 0, "args": { "External id": 196663, "cbid": 205, "correlation": 196663 } }, { "ph": "f", "id": 196663, "pid": 76337, "tid": -914061504, "ts": 1716454224504212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559249, "dur": 12, "args": { "External id": 196667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196667, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196667, "pid": 5, "tid": 7, "ts": 1716454224559249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504227, "dur": 12, "args": { "External id": 196667, "cbid": 211, "correlation": 196667 } }, { "ph": "s", "id": 196667, "pid": 76337, "tid": -914061504, "ts": 1716454224504227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559262, "dur": 162, "args": { "External id": 196669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196669, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196669, "pid": 5, "tid": 7, "ts": 1716454224559262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504242, "dur": 5, "args": { "External id": 196669, "cbid": 211, "correlation": 196669 } }, { "ph": "s", "id": 196669, "pid": 76337, "tid": -914061504, "ts": 1716454224504242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224559427, "dur": 1, "args": { "External id": 196671, "device": 5, "context": 1, "stream": 7, "correlation": 196671, "bytes": 960, "memory bandwidth (GB/s)": 0.5769230769230769 } }, { "ph": "f", "id": 196671, "pid": 5, "tid": 7, "ts": 1716454224559427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224504253, "dur": 7, "args": { "External id": 196671, "cbid": 51, "correlation": 196671 } }, { "ph": "s", "id": 196671, "pid": 76337, "tid": -914061504, "ts": 1716454224504253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224559430, "dur": 198, "args": { "External id": 196672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196672, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 196672, "pid": 5, "tid": 7, "ts": 1716454224559430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504262, "dur": 7, "args": { "External id": 196672, "cbid": 211, "correlation": 196672 } }, { "ph": "s", "id": 196672, "pid": 76337, "tid": -914061504, "ts": 1716454224504262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559630, "dur": 6, "args": { "External id": 196674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196674, "pid": 5, "tid": 7, "ts": 1716454224559630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504274, "dur": 5, "args": { "External id": 196674, "cbid": 211, "correlation": 196674 } }, { "ph": "s", "id": 196674, "pid": 76337, "tid": -914061504, "ts": 1716454224504274, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224559638, "dur": 6, "args": { "External id": 196680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196680, "pid": 5, "tid": 7, "ts": 1716454224559638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504301, "dur": 9, "args": { "External id": 196680, "cbid": 211, "correlation": 196680 } }, { "ph": "s", "id": 196680, "pid": 76337, "tid": -914061504, "ts": 1716454224504301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224559645, "dur": 11, "args": { "External id": 196700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196700, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196700, "pid": 5, "tid": 7, "ts": 1716454224559645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504402, "dur": 13, "args": { "External id": 196700, "cbid": 211, "correlation": 196700 } }, { "ph": "s", "id": 196700, "pid": 76337, "tid": -914061504, "ts": 1716454224504402, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224559657, "dur": 4, "args": { "External id": 196712, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196712, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196712, "pid": 5, "tid": 7, "ts": 1716454224559657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504426, "dur": 6, "args": { "External id": 196712, "cbid": 211, "correlation": 196712 } }, { "ph": "s", "id": 196712, "pid": 76337, "tid": -914061504, "ts": 1716454224504426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224559663, "dur": 8, "args": { "External id": 196715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196715, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196715, "pid": 5, "tid": 7, "ts": 1716454224559663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504444, "dur": 7, "args": { "External id": 196715, "cbid": 211, "correlation": 196715 } }, { "ph": "s", "id": 196715, "pid": 76337, "tid": -914061504, "ts": 1716454224504444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224559673, "dur": 5, "args": { "External id": 196724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196724, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196724, "pid": 5, "tid": 7, "ts": 1716454224559673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504488, "dur": 10, "args": { "External id": 196724, "cbid": 211, "correlation": 196724 } }, { "ph": "s", "id": 196724, "pid": 76337, "tid": -914061504, "ts": 1716454224504488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224504543, "dur": 0, "args": { "External id": 196734, "cbid": 317, "correlation": 196734 } }, { "ph": "f", "id": 196734, "pid": 76337, "tid": -914061504, "ts": 1716454224504543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224504543, "dur": 0, "args": { "External id": 196735, "cbid": 203, "correlation": 196735 } }, { "ph": "f", "id": 196735, "pid": 76337, "tid": -914061504, "ts": 1716454224504543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224504544, "dur": 0, "args": { "External id": 196736, "cbid": 205, "correlation": 196736 } }, { "ph": "f", "id": 196736, "pid": 76337, "tid": -914061504, "ts": 1716454224504544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559679, "dur": 5, "args": { "External id": 196740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196740, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196740, "pid": 5, "tid": 7, "ts": 1716454224559679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504559, "dur": 11, "args": { "External id": 196740, "cbid": 211, "correlation": 196740 } }, { "ph": "s", "id": 196740, "pid": 76337, "tid": -914061504, "ts": 1716454224504559, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224559686, "dur": 163, "args": { "External id": 196742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196742, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196742, "pid": 5, "tid": 7, "ts": 1716454224559686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504573, "dur": 5, "args": { "External id": 196742, "cbid": 211, "correlation": 196742 } }, { "ph": "s", "id": 196742, "pid": 76337, "tid": -914061504, "ts": 1716454224504573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224559850, "dur": 1, "args": { "External id": 196744, "device": 5, "context": 1, "stream": 7, "correlation": 196744, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 196744, "pid": 5, "tid": 7, "ts": 1716454224559850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224504584, "dur": 6, "args": { "External id": 196744, "cbid": 51, "correlation": 196744 } }, { "ph": "s", "id": 196744, "pid": 76337, "tid": -914061504, "ts": 1716454224504584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224559854, "dur": 270, "args": { "External id": 196745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196745, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196745, "pid": 5, "tid": 7, "ts": 1716454224559854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504591, "dur": 6, "args": { "External id": 196745, "cbid": 211, "correlation": 196745 } }, { "ph": "s", "id": 196745, "pid": 76337, "tid": -914061504, "ts": 1716454224504591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560126, "dur": 6, "args": { "External id": 196747, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196747, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196747, "pid": 5, "tid": 7, "ts": 1716454224560126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504603, "dur": 6, "args": { "External id": 196747, "cbid": 211, "correlation": 196747 } }, { "ph": "s", "id": 196747, "pid": 76337, "tid": -914061504, "ts": 1716454224504603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224560133, "dur": 6, "args": { "External id": 196753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196753, "pid": 5, "tid": 7, "ts": 1716454224560133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504632, "dur": 8, "args": { "External id": 196753, "cbid": 211, "correlation": 196753 } }, { "ph": "s", "id": 196753, "pid": 76337, "tid": -914061504, "ts": 1716454224504632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224560140, "dur": 3, "args": { "External id": 196761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196761, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 196761, "pid": 5, "tid": 7, "ts": 1716454224560140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504675, "dur": 9, "args": { "External id": 196761, "cbid": 211, "correlation": 196761 } }, { "ph": "s", "id": 196761, "pid": 76337, "tid": -914061504, "ts": 1716454224504675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224504743, "dur": 1, "args": { "External id": 196777, "cbid": 251, "correlation": 196777 } }, { "ph": "f", "id": 196777, "pid": 76337, "tid": -914061504, "ts": 1716454224504743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224504748, "dur": 0, "args": { "External id": 196779, "cbid": 251, "correlation": 196779 } }, { "ph": "f", "id": 196779, "pid": 76337, "tid": -914061504, "ts": 1716454224504748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224560144, "dur": 13, "args": { "External id": 196780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196780, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196780, "pid": 5, "tid": 7, "ts": 1716454224560144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504750, "dur": 12, "args": { "External id": 196780, "cbid": 211, "correlation": 196780 } }, { "ph": "s", "id": 196780, "pid": 76337, "tid": -914061504, "ts": 1716454224504750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224560159, "dur": 5, "args": { "External id": 196782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196782, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196782, "pid": 5, "tid": 7, "ts": 1716454224560159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504764, "dur": 5, "args": { "External id": 196782, "cbid": 211, "correlation": 196782 } }, { "ph": "s", "id": 196782, "pid": 76337, "tid": -914061504, "ts": 1716454224504764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224560165, "dur": 6, "args": { "External id": 196792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196792, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196792, "pid": 5, "tid": 7, "ts": 1716454224560165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504821, "dur": 12, "args": { "External id": 196792, "cbid": 211, "correlation": 196792 } }, { "ph": "s", "id": 196792, "pid": 76337, "tid": -914061504, "ts": 1716454224504821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224560172, "dur": 10, "args": { "External id": 196812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196812, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196812, "pid": 5, "tid": 7, "ts": 1716454224560172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504887, "dur": 11, "args": { "External id": 196812, "cbid": 211, "correlation": 196812 } }, { "ph": "s", "id": 196812, "pid": 76337, "tid": -914061504, "ts": 1716454224504887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224560183, "dur": 4, "args": { "External id": 196824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196824, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196824, "pid": 5, "tid": 7, "ts": 1716454224560183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504909, "dur": 6, "args": { "External id": 196824, "cbid": 211, "correlation": 196824 } }, { "ph": "s", "id": 196824, "pid": 76337, "tid": -914061504, "ts": 1716454224504909, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224560188, "dur": 7, "args": { "External id": 196827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196827, "pid": 5, "tid": 7, "ts": 1716454224560188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504928, "dur": 6, "args": { "External id": 196827, "cbid": 211, "correlation": 196827 } }, { "ph": "s", "id": 196827, "pid": 76337, "tid": -914061504, "ts": 1716454224504928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224560196, "dur": 5, "args": { "External id": 196836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196836, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196836, "pid": 5, "tid": 7, "ts": 1716454224560196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224504968, "dur": 17, "args": { "External id": 196836, "cbid": 211, "correlation": 196836 } }, { "ph": "s", "id": 196836, "pid": 76337, "tid": -914061504, "ts": 1716454224504968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224505040, "dur": 0, "args": { "External id": 196846, "cbid": 317, "correlation": 196846 } }, { "ph": "f", "id": 196846, "pid": 76337, "tid": -914061504, "ts": 1716454224505040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224505041, "dur": 0, "args": { "External id": 196847, "cbid": 203, "correlation": 196847 } }, { "ph": "f", "id": 196847, "pid": 76337, "tid": -914061504, "ts": 1716454224505041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224505042, "dur": 0, "args": { "External id": 196848, "cbid": 205, "correlation": 196848 } }, { "ph": "f", "id": 196848, "pid": 76337, "tid": -914061504, "ts": 1716454224505042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560202, "dur": 5, "args": { "External id": 196852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196852, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196852, "pid": 5, "tid": 7, "ts": 1716454224560202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505056, "dur": 12, "args": { "External id": 196852, "cbid": 211, "correlation": 196852 } }, { "ph": "s", "id": 196852, "pid": 76337, "tid": -914061504, "ts": 1716454224505056, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560208, "dur": 161, "args": { "External id": 196854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196854, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196854, "pid": 5, "tid": 7, "ts": 1716454224560208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505071, "dur": 5, "args": { "External id": 196854, "cbid": 211, "correlation": 196854 } }, { "ph": "s", "id": 196854, "pid": 76337, "tid": -914061504, "ts": 1716454224505071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224560372, "dur": 1, "args": { "External id": 196856, "device": 5, "context": 1, "stream": 7, "correlation": 196856, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 196856, "pid": 5, "tid": 7, "ts": 1716454224560372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224505082, "dur": 6, "args": { "External id": 196856, "cbid": 51, "correlation": 196856 } }, { "ph": "s", "id": 196856, "pid": 76337, "tid": -914061504, "ts": 1716454224505082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224560375, "dur": 260, "args": { "External id": 196857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196857, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196857, "pid": 5, "tid": 7, "ts": 1716454224560375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505089, "dur": 6, "args": { "External id": 196857, "cbid": 211, "correlation": 196857 } }, { "ph": "s", "id": 196857, "pid": 76337, "tid": -914061504, "ts": 1716454224505089, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560637, "dur": 6, "args": { "External id": 196859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196859, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196859, "pid": 5, "tid": 7, "ts": 1716454224560637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505099, "dur": 5, "args": { "External id": 196859, "cbid": 211, "correlation": 196859 } }, { "ph": "s", "id": 196859, "pid": 76337, "tid": -914061504, "ts": 1716454224505099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224560644, "dur": 6, "args": { "External id": 196865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196865, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196865, "pid": 5, "tid": 7, "ts": 1716454224560644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505127, "dur": 9, "args": { "External id": 196865, "cbid": 211, "correlation": 196865 } }, { "ph": "s", "id": 196865, "pid": 76337, "tid": -914061504, "ts": 1716454224505127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224560652, "dur": 5, "args": { "External id": 196873, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196873, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196873, "pid": 5, "tid": 7, "ts": 1716454224560652, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505161, "dur": 8, "args": { "External id": 196873, "cbid": 211, "correlation": 196873 } }, { "ph": "s", "id": 196873, "pid": 76337, "tid": -914061504, "ts": 1716454224505161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224560658, "dur": 4, "args": { "External id": 196881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196881, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196881, "pid": 5, "tid": 7, "ts": 1716454224560658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505189, "dur": 9, "args": { "External id": 196881, "cbid": 211, "correlation": 196881 } }, { "ph": "s", "id": 196881, "pid": 76337, "tid": -914061504, "ts": 1716454224505189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224560664, "dur": 9, "args": { "External id": 196901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196901, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 196901, "pid": 5, "tid": 7, "ts": 1716454224560664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505265, "dur": 12, "args": { "External id": 196901, "cbid": 211, "correlation": 196901 } }, { "ph": "s", "id": 196901, "pid": 76337, "tid": -914061504, "ts": 1716454224505265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224560675, "dur": 4, "args": { "External id": 196913, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196913, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 196913, "pid": 5, "tid": 7, "ts": 1716454224560675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505286, "dur": 6, "args": { "External id": 196913, "cbid": 211, "correlation": 196913 } }, { "ph": "s", "id": 196913, "pid": 76337, "tid": -914061504, "ts": 1716454224505286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224560680, "dur": 7, "args": { "External id": 196916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196916, "pid": 5, "tid": 7, "ts": 1716454224560680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505304, "dur": 7, "args": { "External id": 196916, "cbid": 211, "correlation": 196916 } }, { "ph": "s", "id": 196916, "pid": 76337, "tid": -914061504, "ts": 1716454224505304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224560688, "dur": 5, "args": { "External id": 196925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196925, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196925, "pid": 5, "tid": 7, "ts": 1716454224560688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505342, "dur": 9, "args": { "External id": 196925, "cbid": 211, "correlation": 196925 } }, { "ph": "s", "id": 196925, "pid": 76337, "tid": -914061504, "ts": 1716454224505342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224505394, "dur": 0, "args": { "External id": 196935, "cbid": 317, "correlation": 196935 } }, { "ph": "f", "id": 196935, "pid": 76337, "tid": -914061504, "ts": 1716454224505394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224505394, "dur": 0, "args": { "External id": 196936, "cbid": 203, "correlation": 196936 } }, { "ph": "f", "id": 196936, "pid": 76337, "tid": -914061504, "ts": 1716454224505394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224505395, "dur": 0, "args": { "External id": 196937, "cbid": 205, "correlation": 196937 } }, { "ph": "f", "id": 196937, "pid": 76337, "tid": -914061504, "ts": 1716454224505395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560694, "dur": 5, "args": { "External id": 196941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196941, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196941, "pid": 5, "tid": 7, "ts": 1716454224560694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505409, "dur": 11, "args": { "External id": 196941, "cbid": 211, "correlation": 196941 } }, { "ph": "s", "id": 196941, "pid": 76337, "tid": -914061504, "ts": 1716454224505409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224560700, "dur": 163, "args": { "External id": 196943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196943, "pid": 5, "tid": 7, "ts": 1716454224560700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505422, "dur": 5, "args": { "External id": 196943, "cbid": 211, "correlation": 196943 } }, { "ph": "s", "id": 196943, "pid": 76337, "tid": -914061504, "ts": 1716454224505422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224560865, "dur": 1, "args": { "External id": 196945, "device": 5, "context": 1, "stream": 7, "correlation": 196945, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 196945, "pid": 5, "tid": 7, "ts": 1716454224560865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224505433, "dur": 6, "args": { "External id": 196945, "cbid": 51, "correlation": 196945 } }, { "ph": "s", "id": 196945, "pid": 76337, "tid": -914061504, "ts": 1716454224505433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224560869, "dur": 258, "args": { "External id": 196946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196946, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196946, "pid": 5, "tid": 7, "ts": 1716454224560869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505440, "dur": 6, "args": { "External id": 196946, "cbid": 211, "correlation": 196946 } }, { "ph": "s", "id": 196946, "pid": 76337, "tid": -914061504, "ts": 1716454224505440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561128, "dur": 6, "args": { "External id": 196948, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196948, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 196948, "pid": 5, "tid": 7, "ts": 1716454224561128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505449, "dur": 5, "args": { "External id": 196948, "cbid": 211, "correlation": 196948 } }, { "ph": "s", "id": 196948, "pid": 76337, "tid": -914061504, "ts": 1716454224505449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224561135, "dur": 6, "args": { "External id": 196954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196954, "pid": 5, "tid": 7, "ts": 1716454224561135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505477, "dur": 8, "args": { "External id": 196954, "cbid": 211, "correlation": 196954 } }, { "ph": "s", "id": 196954, "pid": 76337, "tid": -914061504, "ts": 1716454224505477, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224561143, "dur": 3, "args": { "External id": 196962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196962, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 196962, "pid": 5, "tid": 7, "ts": 1716454224561143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505520, "dur": 10, "args": { "External id": 196962, "cbid": 211, "correlation": 196962 } }, { "ph": "s", "id": 196962, "pid": 76337, "tid": -914061504, "ts": 1716454224505520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224505582, "dur": 1, "args": { "External id": 196978, "cbid": 251, "correlation": 196978 } }, { "ph": "f", "id": 196978, "pid": 76337, "tid": -914061504, "ts": 1716454224505582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224505587, "dur": 0, "args": { "External id": 196980, "cbid": 251, "correlation": 196980 } }, { "ph": "f", "id": 196980, "pid": 76337, "tid": -914061504, "ts": 1716454224505587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224561147, "dur": 10, "args": { "External id": 196981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196981, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196981, "pid": 5, "tid": 7, "ts": 1716454224561147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505588, "dur": 11, "args": { "External id": 196981, "cbid": 211, "correlation": 196981 } }, { "ph": "s", "id": 196981, "pid": 76337, "tid": -914061504, "ts": 1716454224505588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224561159, "dur": 4, "args": { "External id": 196983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196983, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 196983, "pid": 5, "tid": 7, "ts": 1716454224561159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505601, "dur": 9, "args": { "External id": 196983, "cbid": 211, "correlation": 196983 } }, { "ph": "s", "id": 196983, "pid": 76337, "tid": -914061504, "ts": 1716454224505601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224561164, "dur": 6, "args": { "External id": 196993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 196993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 196993, "pid": 5, "tid": 7, "ts": 1716454224561164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505661, "dur": 12, "args": { "External id": 196993, "cbid": 211, "correlation": 196993 } }, { "ph": "s", "id": 196993, "pid": 76337, "tid": -914061504, "ts": 1716454224505661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224561171, "dur": 10, "args": { "External id": 197013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197013, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197013, "pid": 5, "tid": 7, "ts": 1716454224561171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505727, "dur": 10, "args": { "External id": 197013, "cbid": 211, "correlation": 197013 } }, { "ph": "s", "id": 197013, "pid": 76337, "tid": -914061504, "ts": 1716454224505727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224561182, "dur": 4, "args": { "External id": 197025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197025, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197025, "pid": 5, "tid": 7, "ts": 1716454224561182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505747, "dur": 6, "args": { "External id": 197025, "cbid": 211, "correlation": 197025 } }, { "ph": "s", "id": 197025, "pid": 76337, "tid": -914061504, "ts": 1716454224505747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224561187, "dur": 7, "args": { "External id": 197028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197028, "pid": 5, "tid": 7, "ts": 1716454224561187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505767, "dur": 6, "args": { "External id": 197028, "cbid": 211, "correlation": 197028 } }, { "ph": "s", "id": 197028, "pid": 76337, "tid": -914061504, "ts": 1716454224505767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224561194, "dur": 5, "args": { "External id": 197037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197037, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197037, "pid": 5, "tid": 7, "ts": 1716454224561194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505806, "dur": 11, "args": { "External id": 197037, "cbid": 211, "correlation": 197037 } }, { "ph": "s", "id": 197037, "pid": 76337, "tid": -914061504, "ts": 1716454224505806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224505870, "dur": 0, "args": { "External id": 197047, "cbid": 317, "correlation": 197047 } }, { "ph": "f", "id": 197047, "pid": 76337, "tid": -914061504, "ts": 1716454224505870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224505870, "dur": 0, "args": { "External id": 197048, "cbid": 203, "correlation": 197048 } }, { "ph": "f", "id": 197048, "pid": 76337, "tid": -914061504, "ts": 1716454224505870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224505871, "dur": 0, "args": { "External id": 197049, "cbid": 205, "correlation": 197049 } }, { "ph": "f", "id": 197049, "pid": 76337, "tid": -914061504, "ts": 1716454224505871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561200, "dur": 5, "args": { "External id": 197053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197053, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197053, "pid": 5, "tid": 7, "ts": 1716454224561200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505885, "dur": 12, "args": { "External id": 197053, "cbid": 211, "correlation": 197053 } }, { "ph": "s", "id": 197053, "pid": 76337, "tid": -914061504, "ts": 1716454224505885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561207, "dur": 162, "args": { "External id": 197055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197055, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197055, "pid": 5, "tid": 7, "ts": 1716454224561207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505899, "dur": 5, "args": { "External id": 197055, "cbid": 211, "correlation": 197055 } }, { "ph": "s", "id": 197055, "pid": 76337, "tid": -914061504, "ts": 1716454224505899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224561371, "dur": 1, "args": { "External id": 197057, "device": 5, "context": 1, "stream": 7, "correlation": 197057, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 197057, "pid": 5, "tid": 7, "ts": 1716454224561371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224505910, "dur": 7, "args": { "External id": 197057, "cbid": 51, "correlation": 197057 } }, { "ph": "s", "id": 197057, "pid": 76337, "tid": -914061504, "ts": 1716454224505910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224561374, "dur": 259, "args": { "External id": 197058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197058, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197058, "pid": 5, "tid": 7, "ts": 1716454224561374, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505918, "dur": 6, "args": { "External id": 197058, "cbid": 211, "correlation": 197058 } }, { "ph": "s", "id": 197058, "pid": 76337, "tid": -914061504, "ts": 1716454224505918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561635, "dur": 6, "args": { "External id": 197060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197060, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197060, "pid": 5, "tid": 7, "ts": 1716454224561635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505929, "dur": 5, "args": { "External id": 197060, "cbid": 211, "correlation": 197060 } }, { "ph": "s", "id": 197060, "pid": 76337, "tid": -914061504, "ts": 1716454224505929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224561641, "dur": 6, "args": { "External id": 197066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197066, "pid": 5, "tid": 7, "ts": 1716454224561641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505957, "dur": 9, "args": { "External id": 197066, "cbid": 211, "correlation": 197066 } }, { "ph": "s", "id": 197066, "pid": 76337, "tid": -914061504, "ts": 1716454224505957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224561649, "dur": 5, "args": { "External id": 197074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197074, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197074, "pid": 5, "tid": 7, "ts": 1716454224561649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224505999, "dur": 10, "args": { "External id": 197074, "cbid": 211, "correlation": 197074 } }, { "ph": "s", "id": 197074, "pid": 76337, "tid": -914061504, "ts": 1716454224505999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224561655, "dur": 4, "args": { "External id": 197082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197082, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197082, "pid": 5, "tid": 7, "ts": 1716454224561655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506032, "dur": 8, "args": { "External id": 197082, "cbid": 211, "correlation": 197082 } }, { "ph": "s", "id": 197082, "pid": 76337, "tid": -914061504, "ts": 1716454224506032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224561661, "dur": 10, "args": { "External id": 197102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197102, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197102, "pid": 5, "tid": 7, "ts": 1716454224561661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506136, "dur": 12, "args": { "External id": 197102, "cbid": 211, "correlation": 197102 } }, { "ph": "s", "id": 197102, "pid": 76337, "tid": -914061504, "ts": 1716454224506136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224561672, "dur": 4, "args": { "External id": 197114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197114, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197114, "pid": 5, "tid": 7, "ts": 1716454224561672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506158, "dur": 6, "args": { "External id": 197114, "cbid": 211, "correlation": 197114 } }, { "ph": "s", "id": 197114, "pid": 76337, "tid": -914061504, "ts": 1716454224506158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224561677, "dur": 6, "args": { "External id": 197117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197117, "pid": 5, "tid": 7, "ts": 1716454224561677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506177, "dur": 6, "args": { "External id": 197117, "cbid": 211, "correlation": 197117 } }, { "ph": "s", "id": 197117, "pid": 76337, "tid": -914061504, "ts": 1716454224506177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224561685, "dur": 5, "args": { "External id": 197126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197126, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197126, "pid": 5, "tid": 7, "ts": 1716454224561685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506214, "dur": 10, "args": { "External id": 197126, "cbid": 211, "correlation": 197126 } }, { "ph": "s", "id": 197126, "pid": 76337, "tid": -914061504, "ts": 1716454224506214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224506267, "dur": 0, "args": { "External id": 197136, "cbid": 317, "correlation": 197136 } }, { "ph": "f", "id": 197136, "pid": 76337, "tid": -914061504, "ts": 1716454224506267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224506268, "dur": 0, "args": { "External id": 197137, "cbid": 203, "correlation": 197137 } }, { "ph": "f", "id": 197137, "pid": 76337, "tid": -914061504, "ts": 1716454224506268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224506269, "dur": 0, "args": { "External id": 197138, "cbid": 205, "correlation": 197138 } }, { "ph": "f", "id": 197138, "pid": 76337, "tid": -914061504, "ts": 1716454224506269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561690, "dur": 5, "args": { "External id": 197142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197142, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197142, "pid": 5, "tid": 7, "ts": 1716454224561690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506283, "dur": 11, "args": { "External id": 197142, "cbid": 211, "correlation": 197142 } }, { "ph": "s", "id": 197142, "pid": 76337, "tid": -914061504, "ts": 1716454224506283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224561697, "dur": 162, "args": { "External id": 197144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197144, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197144, "pid": 5, "tid": 7, "ts": 1716454224561697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506297, "dur": 5, "args": { "External id": 197144, "cbid": 211, "correlation": 197144 } }, { "ph": "s", "id": 197144, "pid": 76337, "tid": -914061504, "ts": 1716454224506297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224561861, "dur": 1, "args": { "External id": 197146, "device": 5, "context": 1, "stream": 7, "correlation": 197146, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 197146, "pid": 5, "tid": 7, "ts": 1716454224561861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224506308, "dur": 6, "args": { "External id": 197146, "cbid": 51, "correlation": 197146 } }, { "ph": "s", "id": 197146, "pid": 76337, "tid": -914061504, "ts": 1716454224506308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224561865, "dur": 259, "args": { "External id": 197147, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197147, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197147, "pid": 5, "tid": 7, "ts": 1716454224561865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506315, "dur": 6, "args": { "External id": 197147, "cbid": 211, "correlation": 197147 } }, { "ph": "s", "id": 197147, "pid": 76337, "tid": -914061504, "ts": 1716454224506315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562125, "dur": 5, "args": { "External id": 197149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197149, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197149, "pid": 5, "tid": 7, "ts": 1716454224562125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506325, "dur": 5, "args": { "External id": 197149, "cbid": 211, "correlation": 197149 } }, { "ph": "s", "id": 197149, "pid": 76337, "tid": -914061504, "ts": 1716454224506325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562132, "dur": 6, "args": { "External id": 197155, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197155, "pid": 5, "tid": 7, "ts": 1716454224562132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506354, "dur": 8, "args": { "External id": 197155, "cbid": 211, "correlation": 197155 } }, { "ph": "s", "id": 197155, "pid": 76337, "tid": -914061504, "ts": 1716454224506354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224562139, "dur": 3, "args": { "External id": 197163, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197163, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 197163, "pid": 5, "tid": 7, "ts": 1716454224562139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506397, "dur": 9, "args": { "External id": 197163, "cbid": 211, "correlation": 197163 } }, { "ph": "s", "id": 197163, "pid": 76337, "tid": -914061504, "ts": 1716454224506397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224506458, "dur": 1, "args": { "External id": 197179, "cbid": 251, "correlation": 197179 } }, { "ph": "f", "id": 197179, "pid": 76337, "tid": -914061504, "ts": 1716454224506458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224506463, "dur": 0, "args": { "External id": 197181, "cbid": 251, "correlation": 197181 } }, { "ph": "f", "id": 197181, "pid": 76337, "tid": -914061504, "ts": 1716454224506463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224562143, "dur": 10, "args": { "External id": 197182, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197182, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197182, "pid": 5, "tid": 7, "ts": 1716454224562143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506465, "dur": 12, "args": { "External id": 197182, "cbid": 211, "correlation": 197182 } }, { "ph": "s", "id": 197182, "pid": 76337, "tid": -914061504, "ts": 1716454224506465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224562155, "dur": 4, "args": { "External id": 197184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197184, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197184, "pid": 5, "tid": 7, "ts": 1716454224562155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506479, "dur": 5, "args": { "External id": 197184, "cbid": 211, "correlation": 197184 } }, { "ph": "s", "id": 197184, "pid": 76337, "tid": -914061504, "ts": 1716454224506479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562160, "dur": 6, "args": { "External id": 197194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197194, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197194, "pid": 5, "tid": 7, "ts": 1716454224562160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506535, "dur": 12, "args": { "External id": 197194, "cbid": 211, "correlation": 197194 } }, { "ph": "s", "id": 197194, "pid": 76337, "tid": -914061504, "ts": 1716454224506535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224562167, "dur": 9, "args": { "External id": 197214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197214, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197214, "pid": 5, "tid": 7, "ts": 1716454224562167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506600, "dur": 11, "args": { "External id": 197214, "cbid": 211, "correlation": 197214 } }, { "ph": "s", "id": 197214, "pid": 76337, "tid": -914061504, "ts": 1716454224506600, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224562178, "dur": 4, "args": { "External id": 197226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197226, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197226, "pid": 5, "tid": 7, "ts": 1716454224562178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506621, "dur": 6, "args": { "External id": 197226, "cbid": 211, "correlation": 197226 } }, { "ph": "s", "id": 197226, "pid": 76337, "tid": -914061504, "ts": 1716454224506621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562183, "dur": 7, "args": { "External id": 197229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197229, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197229, "pid": 5, "tid": 7, "ts": 1716454224562183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506639, "dur": 6, "args": { "External id": 197229, "cbid": 211, "correlation": 197229 } }, { "ph": "s", "id": 197229, "pid": 76337, "tid": -914061504, "ts": 1716454224506639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224562191, "dur": 5, "args": { "External id": 197238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197238, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197238, "pid": 5, "tid": 7, "ts": 1716454224562191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506678, "dur": 11, "args": { "External id": 197238, "cbid": 211, "correlation": 197238 } }, { "ph": "s", "id": 197238, "pid": 76337, "tid": -914061504, "ts": 1716454224506678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224506743, "dur": 0, "args": { "External id": 197248, "cbid": 317, "correlation": 197248 } }, { "ph": "f", "id": 197248, "pid": 76337, "tid": -914061504, "ts": 1716454224506743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224506744, "dur": 0, "args": { "External id": 197249, "cbid": 203, "correlation": 197249 } }, { "ph": "f", "id": 197249, "pid": 76337, "tid": -914061504, "ts": 1716454224506744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224506745, "dur": 0, "args": { "External id": 197250, "cbid": 205, "correlation": 197250 } }, { "ph": "f", "id": 197250, "pid": 76337, "tid": -914061504, "ts": 1716454224506745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562197, "dur": 5, "args": { "External id": 197254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197254, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197254, "pid": 5, "tid": 7, "ts": 1716454224562197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506758, "dur": 12, "args": { "External id": 197254, "cbid": 211, "correlation": 197254 } }, { "ph": "s", "id": 197254, "pid": 76337, "tid": -914061504, "ts": 1716454224506758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562203, "dur": 162, "args": { "External id": 197256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197256, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197256, "pid": 5, "tid": 7, "ts": 1716454224562203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506772, "dur": 5, "args": { "External id": 197256, "cbid": 211, "correlation": 197256 } }, { "ph": "s", "id": 197256, "pid": 76337, "tid": -914061504, "ts": 1716454224506772, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224562367, "dur": 1, "args": { "External id": 197258, "device": 5, "context": 1, "stream": 7, "correlation": 197258, "bytes": 240, "memory bandwidth (GB/s)": 0.15296367112810708 } }, { "ph": "f", "id": 197258, "pid": 5, "tid": 7, "ts": 1716454224562367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224506783, "dur": 7, "args": { "External id": 197258, "cbid": 51, "correlation": 197258 } }, { "ph": "s", "id": 197258, "pid": 76337, "tid": -914061504, "ts": 1716454224506783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224562371, "dur": 258, "args": { "External id": 197259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197259, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197259, "pid": 5, "tid": 7, "ts": 1716454224562371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506791, "dur": 6, "args": { "External id": 197259, "cbid": 211, "correlation": 197259 } }, { "ph": "s", "id": 197259, "pid": 76337, "tid": -914061504, "ts": 1716454224506791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562630, "dur": 5, "args": { "External id": 197261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197261, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197261, "pid": 5, "tid": 7, "ts": 1716454224562630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506801, "dur": 5, "args": { "External id": 197261, "cbid": 211, "correlation": 197261 } }, { "ph": "s", "id": 197261, "pid": 76337, "tid": -914061504, "ts": 1716454224506801, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562637, "dur": 6, "args": { "External id": 197267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197267, "pid": 5, "tid": 7, "ts": 1716454224562637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506829, "dur": 9, "args": { "External id": 197267, "cbid": 211, "correlation": 197267 } }, { "ph": "s", "id": 197267, "pid": 76337, "tid": -914061504, "ts": 1716454224506829, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224562644, "dur": 5, "args": { "External id": 197275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197275, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197275, "pid": 5, "tid": 7, "ts": 1716454224562644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506862, "dur": 8, "args": { "External id": 197275, "cbid": 211, "correlation": 197275 } }, { "ph": "s", "id": 197275, "pid": 76337, "tid": -914061504, "ts": 1716454224506862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224562650, "dur": 4, "args": { "External id": 197283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197283, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197283, "pid": 5, "tid": 7, "ts": 1716454224562650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224506891, "dur": 9, "args": { "External id": 197283, "cbid": 211, "correlation": 197283 } }, { "ph": "s", "id": 197283, "pid": 76337, "tid": -914061504, "ts": 1716454224506891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224562656, "dur": 10, "args": { "External id": 197303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197303, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197303, "pid": 5, "tid": 7, "ts": 1716454224562656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507057, "dur": 14, "args": { "External id": 197303, "cbid": 211, "correlation": 197303 } }, { "ph": "s", "id": 197303, "pid": 76337, "tid": -914061504, "ts": 1716454224507057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224562667, "dur": 4, "args": { "External id": 197315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197315, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197315, "pid": 5, "tid": 7, "ts": 1716454224562667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507081, "dur": 6, "args": { "External id": 197315, "cbid": 211, "correlation": 197315 } }, { "ph": "s", "id": 197315, "pid": 76337, "tid": -914061504, "ts": 1716454224507081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562672, "dur": 7, "args": { "External id": 197318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197318, "pid": 5, "tid": 7, "ts": 1716454224562672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507100, "dur": 7, "args": { "External id": 197318, "cbid": 211, "correlation": 197318 } }, { "ph": "s", "id": 197318, "pid": 76337, "tid": -914061504, "ts": 1716454224507100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224507159, "dur": 0, "args": { "External id": 197329, "cbid": 317, "correlation": 197329 } }, { "ph": "f", "id": 197329, "pid": 76337, "tid": -914061504, "ts": 1716454224507159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224507160, "dur": 0, "args": { "External id": 197330, "cbid": 203, "correlation": 197330 } }, { "ph": "f", "id": 197330, "pid": 76337, "tid": -914061504, "ts": 1716454224507160, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224507161, "dur": 0, "args": { "External id": 197331, "cbid": 205, "correlation": 197331 } }, { "ph": "f", "id": 197331, "pid": 76337, "tid": -914061504, "ts": 1716454224507161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562680, "dur": 5, "args": { "External id": 197335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197335, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197335, "pid": 5, "tid": 7, "ts": 1716454224562680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507178, "dur": 13, "args": { "External id": 197335, "cbid": 211, "correlation": 197335 } }, { "ph": "s", "id": 197335, "pid": 76337, "tid": -914061504, "ts": 1716454224507178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224562687, "dur": 37, "args": { "External id": 197337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197337, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 197337, "pid": 5, "tid": 7, "ts": 1716454224562687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507198, "dur": 9, "args": { "External id": 197337, "cbid": 211, "correlation": 197337 } }, { "ph": "s", "id": 197337, "pid": 76337, "tid": -914061504, "ts": 1716454224507198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224562725, "dur": 5, "args": { "External id": 197339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197339, "pid": 5, "tid": 7, "ts": 1716454224562725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507211, "dur": 5, "args": { "External id": 197339, "cbid": 211, "correlation": 197339 } }, { "ph": "s", "id": 197339, "pid": 76337, "tid": -914061504, "ts": 1716454224507211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224562731, "dur": 6, "args": { "External id": 197345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197345, "pid": 5, "tid": 7, "ts": 1716454224562731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507239, "dur": 8, "args": { "External id": 197345, "cbid": 211, "correlation": 197345 } }, { "ph": "s", "id": 197345, "pid": 76337, "tid": -914061504, "ts": 1716454224507239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224562738, "dur": 20, "args": { "External id": 197354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197354, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197354, "pid": 5, "tid": 7, "ts": 1716454224562738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507321, "dur": 14, "args": { "External id": 197354, "cbid": 211, "correlation": 197354 } }, { "ph": "s", "id": 197354, "pid": 76337, "tid": -914061504, "ts": 1716454224507321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224562759, "dur": 11, "args": { "External id": 197376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197376, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 197376, "pid": 5, "tid": 7, "ts": 1716454224562759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507378, "dur": 11, "args": { "External id": 197376, "cbid": 211, "correlation": 197376 } }, { "ph": "s", "id": 197376, "pid": 76337, "tid": -914061504, "ts": 1716454224507378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507468, "dur": 2, "args": { "External id": 197387, "cbid": 251, "correlation": 197387 } }, { "ph": "f", "id": 197387, "pid": 76337, "tid": -914061504, "ts": 1716454224507468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507473, "dur": 0, "args": { "External id": 197388, "cbid": 251, "correlation": 197388 } }, { "ph": "f", "id": 197388, "pid": 76337, "tid": -914061504, "ts": 1716454224507473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224562771, "dur": 54, "args": { "External id": 197389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197389, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 197389, "pid": 5, "tid": 7, "ts": 1716454224562771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507476, "dur": 15, "args": { "External id": 197389, "cbid": 211, "correlation": 197389 } }, { "ph": "s", "id": 197389, "pid": 76337, "tid": -914061504, "ts": 1716454224507476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507548, "dur": 1, "args": { "External id": 197400, "cbid": 251, "correlation": 197400 } }, { "ph": "f", "id": 197400, "pid": 76337, "tid": -914061504, "ts": 1716454224507548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507551, "dur": 0, "args": { "External id": 197401, "cbid": 251, "correlation": 197401 } }, { "ph": "f", "id": 197401, "pid": 76337, "tid": -914061504, "ts": 1716454224507551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224562826, "dur": 53, "args": { "External id": 197402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197402, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 197402, "pid": 5, "tid": 7, "ts": 1716454224562826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507553, "dur": 11, "args": { "External id": 197402, "cbid": 211, "correlation": 197402 } }, { "ph": "s", "id": 197402, "pid": 76337, "tid": -914061504, "ts": 1716454224507553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507618, "dur": 1, "args": { "External id": 197413, "cbid": 251, "correlation": 197413 } }, { "ph": "f", "id": 197413, "pid": 76337, "tid": -914061504, "ts": 1716454224507618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507622, "dur": 0, "args": { "External id": 197414, "cbid": 251, "correlation": 197414 } }, { "ph": "f", "id": 197414, "pid": 76337, "tid": -914061504, "ts": 1716454224507622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224562881, "dur": 54, "args": { "External id": 197415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197415, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 197415, "pid": 5, "tid": 7, "ts": 1716454224562881, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507624, "dur": 12, "args": { "External id": 197415, "cbid": 211, "correlation": 197415 } }, { "ph": "s", "id": 197415, "pid": 76337, "tid": -914061504, "ts": 1716454224507624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224562936, "dur": 55, "args": { "External id": 197440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197440, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197440, "pid": 5, "tid": 7, "ts": 1716454224562936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507708, "dur": 12, "args": { "External id": 197440, "cbid": 211, "correlation": 197440 } }, { "ph": "s", "id": 197440, "pid": 76337, "tid": -914061504, "ts": 1716454224507708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224507807, "dur": 1, "args": { "External id": 197458, "cbid": 251, "correlation": 197458 } }, { "ph": "f", "id": 197458, "pid": 76337, "tid": -914061504, "ts": 1716454224507807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224562992, "dur": 63, "args": { "External id": 197460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197460, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 197460, "pid": 5, "tid": 7, "ts": 1716454224562992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507813, "dur": 13, "args": { "External id": 197460, "cbid": 211, "correlation": 197460 } }, { "ph": "s", "id": 197460, "pid": 76337, "tid": -914061504, "ts": 1716454224507813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563056, "dur": 6, "args": { "External id": 197468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197468, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197468, "pid": 5, "tid": 7, "ts": 1716454224563056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507882, "dur": 13, "args": { "External id": 197468, "cbid": 211, "correlation": 197468 } }, { "ph": "s", "id": 197468, "pid": 76337, "tid": -914061504, "ts": 1716454224507882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563064, "dur": 7, "args": { "External id": 197476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197476, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197476, "pid": 5, "tid": 7, "ts": 1716454224563064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507920, "dur": 8, "args": { "External id": 197476, "cbid": 211, "correlation": 197476 } }, { "ph": "s", "id": 197476, "pid": 76337, "tid": -914061504, "ts": 1716454224507920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563072, "dur": 7, "args": { "External id": 197487, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197487, "pid": 5, "tid": 7, "ts": 1716454224563072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224507999, "dur": 13, "args": { "External id": 197487, "cbid": 211, "correlation": 197487 } }, { "ph": "s", "id": 197487, "pid": 76337, "tid": -914061504, "ts": 1716454224507999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224563081, "dur": 9, "args": { "External id": 197509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197509, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 197509, "pid": 5, "tid": 7, "ts": 1716454224563081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508031, "dur": 9, "args": { "External id": 197509, "cbid": 211, "correlation": 197509 } }, { "ph": "s", "id": 197509, "pid": 76337, "tid": -914061504, "ts": 1716454224508031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508118, "dur": 2, "args": { "External id": 197520, "cbid": 251, "correlation": 197520 } }, { "ph": "f", "id": 197520, "pid": 76337, "tid": -914061504, "ts": 1716454224508118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224563092, "dur": 1, "args": { "External id": 197521, "device": 5, "context": 1, "stream": 7, "correlation": 197521, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 197521, "pid": 5, "tid": 7, "ts": 1716454224563092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224508124, "dur": 10, "args": { "External id": 197521, "cbid": 51, "correlation": 197521 } }, { "ph": "s", "id": 197521, "pid": 76337, "tid": -914061504, "ts": 1716454224508124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224563096, "dur": 37, "args": { "External id": 197522, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197522, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 197522, "pid": 5, "tid": 7, "ts": 1716454224563096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508136, "dur": 12, "args": { "External id": 197522, "cbid": 211, "correlation": 197522 } }, { "ph": "s", "id": 197522, "pid": 76337, "tid": -914061504, "ts": 1716454224508136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508207, "dur": 1, "args": { "External id": 197533, "cbid": 251, "correlation": 197533 } }, { "ph": "f", "id": 197533, "pid": 76337, "tid": -914061504, "ts": 1716454224508207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508210, "dur": 0, "args": { "External id": 197534, "cbid": 251, "correlation": 197534 } }, { "ph": "f", "id": 197534, "pid": 76337, "tid": -914061504, "ts": 1716454224508210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224563134, "dur": 12, "args": { "External id": 197535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197535, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197535, "pid": 5, "tid": 7, "ts": 1716454224563134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508212, "dur": 12, "args": { "External id": 197535, "cbid": 211, "correlation": 197535 } }, { "ph": "s", "id": 197535, "pid": 76337, "tid": -914061504, "ts": 1716454224508212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224563147, "dur": 5, "args": { "External id": 197537, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197537, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197537, "pid": 5, "tid": 7, "ts": 1716454224563147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508225, "dur": 6, "args": { "External id": 197537, "cbid": 211, "correlation": 197537 } }, { "ph": "s", "id": 197537, "pid": 76337, "tid": -914061504, "ts": 1716454224508225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508283, "dur": 1, "args": { "External id": 197548, "cbid": 251, "correlation": 197548 } }, { "ph": "f", "id": 197548, "pid": 76337, "tid": -914061504, "ts": 1716454224508283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508287, "dur": 0, "args": { "External id": 197549, "cbid": 251, "correlation": 197549 } }, { "ph": "f", "id": 197549, "pid": 76337, "tid": -914061504, "ts": 1716454224508287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224563154, "dur": 8, "args": { "External id": 197550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197550, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197550, "pid": 5, "tid": 7, "ts": 1716454224563154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508289, "dur": 11, "args": { "External id": 197550, "cbid": 211, "correlation": 197550 } }, { "ph": "s", "id": 197550, "pid": 76337, "tid": -914061504, "ts": 1716454224508289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224563163, "dur": 4, "args": { "External id": 197552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197552, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197552, "pid": 5, "tid": 7, "ts": 1716454224563163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508302, "dur": 5, "args": { "External id": 197552, "cbid": 211, "correlation": 197552 } }, { "ph": "s", "id": 197552, "pid": 76337, "tid": -914061504, "ts": 1716454224508302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224563168, "dur": 20, "args": { "External id": 197577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197577, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 197577, "pid": 5, "tid": 7, "ts": 1716454224563168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508380, "dur": 13, "args": { "External id": 197577, "cbid": 211, "correlation": 197577 } }, { "ph": "s", "id": 197577, "pid": 76337, "tid": -914061504, "ts": 1716454224508380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508483, "dur": 2, "args": { "External id": 197595, "cbid": 251, "correlation": 197595 } }, { "ph": "f", "id": 197595, "pid": 76337, "tid": -914061504, "ts": 1716454224508483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224563190, "dur": 1, "args": { "External id": 197597, "device": 5, "context": 1, "stream": 7, "correlation": 197597, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 197597, "pid": 5, "tid": 7, "ts": 1716454224563190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224508489, "dur": 11, "args": { "External id": 197597, "cbid": 51, "correlation": 197597 } }, { "ph": "s", "id": 197597, "pid": 76337, "tid": -914061504, "ts": 1716454224508489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224563194, "dur": 37, "args": { "External id": 197598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197598, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 197598, "pid": 5, "tid": 7, "ts": 1716454224563194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508501, "dur": 13, "args": { "External id": 197598, "cbid": 211, "correlation": 197598 } }, { "ph": "s", "id": 197598, "pid": 76337, "tid": -914061504, "ts": 1716454224508501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563232, "dur": 4, "args": { "External id": 197606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197606, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197606, "pid": 5, "tid": 7, "ts": 1716454224563232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508573, "dur": 12, "args": { "External id": 197606, "cbid": 211, "correlation": 197606 } }, { "ph": "s", "id": 197606, "pid": 76337, "tid": -914061504, "ts": 1716454224508573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563237, "dur": 8, "args": { "External id": 197614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197614, "pid": 5, "tid": 7, "ts": 1716454224563237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508615, "dur": 9, "args": { "External id": 197614, "cbid": 211, "correlation": 197614 } }, { "ph": "s", "id": 197614, "pid": 76337, "tid": -914061504, "ts": 1716454224508615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224563247, "dur": 8, "args": { "External id": 197636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197636, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 197636, "pid": 5, "tid": 7, "ts": 1716454224563247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508666, "dur": 10, "args": { "External id": 197636, "cbid": 211, "correlation": 197636 } }, { "ph": "s", "id": 197636, "pid": 76337, "tid": -914061504, "ts": 1716454224508666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508758, "dur": 1, "args": { "External id": 197652, "cbid": 251, "correlation": 197652 } }, { "ph": "f", "id": 197652, "pid": 76337, "tid": -914061504, "ts": 1716454224508758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508763, "dur": 0, "args": { "External id": 197654, "cbid": 251, "correlation": 197654 } }, { "ph": "f", "id": 197654, "pid": 76337, "tid": -914061504, "ts": 1716454224508763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224563256, "dur": 190, "args": { "External id": 197655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197655, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197655, "pid": 5, "tid": 7, "ts": 1716454224563256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508765, "dur": 13, "args": { "External id": 197655, "cbid": 211, "correlation": 197655 } }, { "ph": "s", "id": 197655, "pid": 76337, "tid": -914061504, "ts": 1716454224508765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563448, "dur": 21, "args": { "External id": 197663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197663, "pid": 5, "tid": 7, "ts": 1716454224563448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508832, "dur": 12, "args": { "External id": 197663, "cbid": 211, "correlation": 197663 } }, { "ph": "s", "id": 197663, "pid": 76337, "tid": -914061504, "ts": 1716454224508832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563471, "dur": 21, "args": { "External id": 197671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197671, "pid": 5, "tid": 7, "ts": 1716454224563471, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508864, "dur": 8, "args": { "External id": 197671, "cbid": 211, "correlation": 197671 } }, { "ph": "s", "id": 197671, "pid": 76337, "tid": -914061504, "ts": 1716454224508864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224508945, "dur": 1, "args": { "External id": 197687, "cbid": 251, "correlation": 197687 } }, { "ph": "f", "id": 197687, "pid": 76337, "tid": -914061504, "ts": 1716454224508945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224563494, "dur": 1, "args": { "External id": 197689, "device": 5, "context": 1, "stream": 7, "correlation": 197689, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 197689, "pid": 5, "tid": 7, "ts": 1716454224563494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224508951, "dur": 9, "args": { "External id": 197689, "cbid": 51, "correlation": 197689 } }, { "ph": "s", "id": 197689, "pid": 76337, "tid": -914061504, "ts": 1716454224508951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224563497, "dur": 110, "args": { "External id": 197690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197690, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 197690, "pid": 5, "tid": 7, "ts": 1716454224563497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224508961, "dur": 22, "args": { "External id": 197690, "cbid": 211, "correlation": 197690 } }, { "ph": "s", "id": 197690, "pid": 76337, "tid": -914061504, "ts": 1716454224508961, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563609, "dur": 6, "args": { "External id": 197698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197698, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197698, "pid": 5, "tid": 7, "ts": 1716454224563609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509016, "dur": 11, "args": { "External id": 197698, "cbid": 211, "correlation": 197698 } }, { "ph": "s", "id": 197698, "pid": 76337, "tid": -914061504, "ts": 1716454224509016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563616, "dur": 10, "args": { "External id": 197709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197709, "pid": 5, "tid": 7, "ts": 1716454224563616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509086, "dur": 13, "args": { "External id": 197709, "cbid": 211, "correlation": 197709 } }, { "ph": "s", "id": 197709, "pid": 76337, "tid": -914061504, "ts": 1716454224509086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224509153, "dur": 0, "args": { "External id": 197721, "cbid": 317, "correlation": 197721 } }, { "ph": "f", "id": 197721, "pid": 76337, "tid": -914061504, "ts": 1716454224509153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224509154, "dur": 0, "args": { "External id": 197722, "cbid": 203, "correlation": 197722 } }, { "ph": "f", "id": 197722, "pid": 76337, "tid": -914061504, "ts": 1716454224509154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224509155, "dur": 0, "args": { "External id": 197723, "cbid": 205, "correlation": 197723 } }, { "ph": "f", "id": 197723, "pid": 76337, "tid": -914061504, "ts": 1716454224509155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224563627, "dur": 5, "args": { "External id": 197727, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197727, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197727, "pid": 5, "tid": 7, "ts": 1716454224563627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509170, "dur": 12, "args": { "External id": 197727, "cbid": 211, "correlation": 197727 } }, { "ph": "s", "id": 197727, "pid": 76337, "tid": -914061504, "ts": 1716454224509170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224563634, "dur": 37, "args": { "External id": 197729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197729, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 197729, "pid": 5, "tid": 7, "ts": 1716454224563634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509189, "dur": 7, "args": { "External id": 197729, "cbid": 211, "correlation": 197729 } }, { "ph": "s", "id": 197729, "pid": 76337, "tid": -914061504, "ts": 1716454224509189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224563673, "dur": 6, "args": { "External id": 197731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197731, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197731, "pid": 5, "tid": 7, "ts": 1716454224563673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509200, "dur": 6, "args": { "External id": 197731, "cbid": 211, "correlation": 197731 } }, { "ph": "s", "id": 197731, "pid": 76337, "tid": -914061504, "ts": 1716454224509200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563680, "dur": 7, "args": { "External id": 197737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197737, "pid": 5, "tid": 7, "ts": 1716454224563680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509227, "dur": 8, "args": { "External id": 197737, "cbid": 211, "correlation": 197737 } }, { "ph": "s", "id": 197737, "pid": 76337, "tid": -914061504, "ts": 1716454224509227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563689, "dur": 6, "args": { "External id": 197745, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197745, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197745, "pid": 5, "tid": 7, "ts": 1716454224563689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509259, "dur": 8, "args": { "External id": 197745, "cbid": 211, "correlation": 197745 } }, { "ph": "s", "id": 197745, "pid": 76337, "tid": -914061504, "ts": 1716454224509259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224563696, "dur": 11, "args": { "External id": 197765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197765, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197765, "pid": 5, "tid": 7, "ts": 1716454224563696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509332, "dur": 12, "args": { "External id": 197765, "cbid": 211, "correlation": 197765 } }, { "ph": "s", "id": 197765, "pid": 76337, "tid": -914061504, "ts": 1716454224509332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224563709, "dur": 4, "args": { "External id": 197777, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197777, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197777, "pid": 5, "tid": 7, "ts": 1716454224563709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509354, "dur": 7, "args": { "External id": 197777, "cbid": 211, "correlation": 197777 } }, { "ph": "s", "id": 197777, "pid": 76337, "tid": -914061504, "ts": 1716454224509354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224563714, "dur": 9, "args": { "External id": 197780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197780, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197780, "pid": 5, "tid": 7, "ts": 1716454224563714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509373, "dur": 7, "args": { "External id": 197780, "cbid": 211, "correlation": 197780 } }, { "ph": "s", "id": 197780, "pid": 76337, "tid": -914061504, "ts": 1716454224509373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224563724, "dur": 5, "args": { "External id": 197789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197789, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197789, "pid": 5, "tid": 7, "ts": 1716454224563724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509412, "dur": 10, "args": { "External id": 197789, "cbid": 211, "correlation": 197789 } }, { "ph": "s", "id": 197789, "pid": 76337, "tid": -914061504, "ts": 1716454224509412, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224509464, "dur": 0, "args": { "External id": 197799, "cbid": 317, "correlation": 197799 } }, { "ph": "f", "id": 197799, "pid": 76337, "tid": -914061504, "ts": 1716454224509464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224509465, "dur": 0, "args": { "External id": 197800, "cbid": 203, "correlation": 197800 } }, { "ph": "f", "id": 197800, "pid": 76337, "tid": -914061504, "ts": 1716454224509465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224509466, "dur": 0, "args": { "External id": 197801, "cbid": 205, "correlation": 197801 } }, { "ph": "f", "id": 197801, "pid": 76337, "tid": -914061504, "ts": 1716454224509466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224563731, "dur": 5, "args": { "External id": 197805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197805, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197805, "pid": 5, "tid": 7, "ts": 1716454224563731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509479, "dur": 11, "args": { "External id": 197805, "cbid": 211, "correlation": 197805 } }, { "ph": "s", "id": 197805, "pid": 76337, "tid": -914061504, "ts": 1716454224509479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224563737, "dur": 162, "args": { "External id": 197807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197807, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197807, "pid": 5, "tid": 7, "ts": 1716454224563737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509493, "dur": 5, "args": { "External id": 197807, "cbid": 211, "correlation": 197807 } }, { "ph": "s", "id": 197807, "pid": 76337, "tid": -914061504, "ts": 1716454224509493, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224563902, "dur": 1, "args": { "External id": 197809, "device": 5, "context": 1, "stream": 7, "correlation": 197809, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 197809, "pid": 5, "tid": 7, "ts": 1716454224563902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224509504, "dur": 7, "args": { "External id": 197809, "cbid": 51, "correlation": 197809 } }, { "ph": "s", "id": 197809, "pid": 76337, "tid": -914061504, "ts": 1716454224509504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224563905, "dur": 269, "args": { "External id": 197810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197810, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197810, "pid": 5, "tid": 7, "ts": 1716454224563905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509512, "dur": 6, "args": { "External id": 197810, "cbid": 211, "correlation": 197810 } }, { "ph": "s", "id": 197810, "pid": 76337, "tid": -914061504, "ts": 1716454224509512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564176, "dur": 6, "args": { "External id": 197812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197812, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197812, "pid": 5, "tid": 7, "ts": 1716454224564176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509522, "dur": 5, "args": { "External id": 197812, "cbid": 211, "correlation": 197812 } }, { "ph": "s", "id": 197812, "pid": 76337, "tid": -914061504, "ts": 1716454224509522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224564183, "dur": 6, "args": { "External id": 197818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197818, "pid": 5, "tid": 7, "ts": 1716454224564183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509549, "dur": 9, "args": { "External id": 197818, "cbid": 211, "correlation": 197818 } }, { "ph": "s", "id": 197818, "pid": 76337, "tid": -914061504, "ts": 1716454224509549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224564191, "dur": 3, "args": { "External id": 197826, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197826, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 197826, "pid": 5, "tid": 7, "ts": 1716454224564191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509593, "dur": 10, "args": { "External id": 197826, "cbid": 211, "correlation": 197826 } }, { "ph": "s", "id": 197826, "pid": 76337, "tid": -914061504, "ts": 1716454224509593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224509658, "dur": 1, "args": { "External id": 197842, "cbid": 251, "correlation": 197842 } }, { "ph": "f", "id": 197842, "pid": 76337, "tid": -914061504, "ts": 1716454224509658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224509664, "dur": 0, "args": { "External id": 197844, "cbid": 251, "correlation": 197844 } }, { "ph": "f", "id": 197844, "pid": 76337, "tid": -914061504, "ts": 1716454224509664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224564195, "dur": 13, "args": { "External id": 197845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197845, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197845, "pid": 5, "tid": 7, "ts": 1716454224564195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509666, "dur": 11, "args": { "External id": 197845, "cbid": 211, "correlation": 197845 } }, { "ph": "s", "id": 197845, "pid": 76337, "tid": -914061504, "ts": 1716454224509666, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224564209, "dur": 5, "args": { "External id": 197847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197847, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197847, "pid": 5, "tid": 7, "ts": 1716454224564209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509679, "dur": 5, "args": { "External id": 197847, "cbid": 211, "correlation": 197847 } }, { "ph": "s", "id": 197847, "pid": 76337, "tid": -914061504, "ts": 1716454224509679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224564215, "dur": 6, "args": { "External id": 197857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197857, "pid": 5, "tid": 7, "ts": 1716454224564215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509736, "dur": 12, "args": { "External id": 197857, "cbid": 211, "correlation": 197857 } }, { "ph": "s", "id": 197857, "pid": 76337, "tid": -914061504, "ts": 1716454224509736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224564222, "dur": 10, "args": { "External id": 197877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197877, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197877, "pid": 5, "tid": 7, "ts": 1716454224564222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509802, "dur": 11, "args": { "External id": 197877, "cbid": 211, "correlation": 197877 } }, { "ph": "s", "id": 197877, "pid": 76337, "tid": -914061504, "ts": 1716454224509802, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224564233, "dur": 3, "args": { "External id": 197889, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197889, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 197889, "pid": 5, "tid": 7, "ts": 1716454224564233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509823, "dur": 6, "args": { "External id": 197889, "cbid": 211, "correlation": 197889 } }, { "ph": "s", "id": 197889, "pid": 76337, "tid": -914061504, "ts": 1716454224509823, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224564238, "dur": 7, "args": { "External id": 197892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197892, "pid": 5, "tid": 7, "ts": 1716454224564238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509842, "dur": 6, "args": { "External id": 197892, "cbid": 211, "correlation": 197892 } }, { "ph": "s", "id": 197892, "pid": 76337, "tid": -914061504, "ts": 1716454224509842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224564246, "dur": 5, "args": { "External id": 197901, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197901, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197901, "pid": 5, "tid": 7, "ts": 1716454224564246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509882, "dur": 9, "args": { "External id": 197901, "cbid": 211, "correlation": 197901 } }, { "ph": "s", "id": 197901, "pid": 76337, "tid": -914061504, "ts": 1716454224509882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224509944, "dur": 0, "args": { "External id": 197911, "cbid": 317, "correlation": 197911 } }, { "ph": "f", "id": 197911, "pid": 76337, "tid": -914061504, "ts": 1716454224509944, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224509945, "dur": 0, "args": { "External id": 197912, "cbid": 203, "correlation": 197912 } }, { "ph": "f", "id": 197912, "pid": 76337, "tid": -914061504, "ts": 1716454224509945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224509945, "dur": 0, "args": { "External id": 197913, "cbid": 205, "correlation": 197913 } }, { "ph": "f", "id": 197913, "pid": 76337, "tid": -914061504, "ts": 1716454224509945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564252, "dur": 5, "args": { "External id": 197917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197917, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197917, "pid": 5, "tid": 7, "ts": 1716454224564252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509959, "dur": 13, "args": { "External id": 197917, "cbid": 211, "correlation": 197917 } }, { "ph": "s", "id": 197917, "pid": 76337, "tid": -914061504, "ts": 1716454224509959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564259, "dur": 163, "args": { "External id": 197919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197919, "pid": 5, "tid": 7, "ts": 1716454224564259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224509982, "dur": 6, "args": { "External id": 197919, "cbid": 211, "correlation": 197919 } }, { "ph": "s", "id": 197919, "pid": 76337, "tid": -914061504, "ts": 1716454224509982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224564423, "dur": 1, "args": { "External id": 197921, "device": 5, "context": 1, "stream": 7, "correlation": 197921, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 197921, "pid": 5, "tid": 7, "ts": 1716454224564423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224509993, "dur": 6, "args": { "External id": 197921, "cbid": 51, "correlation": 197921 } }, { "ph": "s", "id": 197921, "pid": 76337, "tid": -914061504, "ts": 1716454224509993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224564427, "dur": 259, "args": { "External id": 197922, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197922, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197922, "pid": 5, "tid": 7, "ts": 1716454224564427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510001, "dur": 6, "args": { "External id": 197922, "cbid": 211, "correlation": 197922 } }, { "ph": "s", "id": 197922, "pid": 76337, "tid": -914061504, "ts": 1716454224510001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564688, "dur": 6, "args": { "External id": 197924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197924, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197924, "pid": 5, "tid": 7, "ts": 1716454224564688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510010, "dur": 6, "args": { "External id": 197924, "cbid": 211, "correlation": 197924 } }, { "ph": "s", "id": 197924, "pid": 76337, "tid": -914061504, "ts": 1716454224510010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224564695, "dur": 6, "args": { "External id": 197930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197930, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197930, "pid": 5, "tid": 7, "ts": 1716454224564695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510040, "dur": 9, "args": { "External id": 197930, "cbid": 211, "correlation": 197930 } }, { "ph": "s", "id": 197930, "pid": 76337, "tid": -914061504, "ts": 1716454224510040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224564703, "dur": 4, "args": { "External id": 197938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197938, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197938, "pid": 5, "tid": 7, "ts": 1716454224564703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510073, "dur": 8, "args": { "External id": 197938, "cbid": 211, "correlation": 197938 } }, { "ph": "s", "id": 197938, "pid": 76337, "tid": -914061504, "ts": 1716454224510073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224564709, "dur": 4, "args": { "External id": 197946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197946, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 197946, "pid": 5, "tid": 7, "ts": 1716454224564709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510102, "dur": 8, "args": { "External id": 197946, "cbid": 211, "correlation": 197946 } }, { "ph": "s", "id": 197946, "pid": 76337, "tid": -914061504, "ts": 1716454224510102, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224564714, "dur": 11, "args": { "External id": 197955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197955, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197955, "pid": 5, "tid": 7, "ts": 1716454224564714, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510191, "dur": 14, "args": { "External id": 197955, "cbid": 211, "correlation": 197955 } }, { "ph": "s", "id": 197955, "pid": 76337, "tid": -914061504, "ts": 1716454224510191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224564727, "dur": 12, "args": { "External id": 197975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197975, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 197975, "pid": 5, "tid": 7, "ts": 1716454224564727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510260, "dur": 11, "args": { "External id": 197975, "cbid": 211, "correlation": 197975 } }, { "ph": "s", "id": 197975, "pid": 76337, "tid": -914061504, "ts": 1716454224510260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224564740, "dur": 4, "args": { "External id": 197987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197987, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 197987, "pid": 5, "tid": 7, "ts": 1716454224564740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510281, "dur": 6, "args": { "External id": 197987, "cbid": 211, "correlation": 197987 } }, { "ph": "s", "id": 197987, "pid": 76337, "tid": -914061504, "ts": 1716454224510281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224564746, "dur": 10, "args": { "External id": 197990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197990, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197990, "pid": 5, "tid": 7, "ts": 1716454224564746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510299, "dur": 7, "args": { "External id": 197990, "cbid": 211, "correlation": 197990 } }, { "ph": "s", "id": 197990, "pid": 76337, "tid": -914061504, "ts": 1716454224510299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224564757, "dur": 6, "args": { "External id": 197999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 197999, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 197999, "pid": 5, "tid": 7, "ts": 1716454224564757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510339, "dur": 9, "args": { "External id": 197999, "cbid": 211, "correlation": 197999 } }, { "ph": "s", "id": 197999, "pid": 76337, "tid": -914061504, "ts": 1716454224510339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224510391, "dur": 0, "args": { "External id": 198009, "cbid": 317, "correlation": 198009 } }, { "ph": "f", "id": 198009, "pid": 76337, "tid": -914061504, "ts": 1716454224510391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224510392, "dur": 0, "args": { "External id": 198010, "cbid": 203, "correlation": 198010 } }, { "ph": "f", "id": 198010, "pid": 76337, "tid": -914061504, "ts": 1716454224510392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224510393, "dur": 0, "args": { "External id": 198011, "cbid": 205, "correlation": 198011 } }, { "ph": "f", "id": 198011, "pid": 76337, "tid": -914061504, "ts": 1716454224510393, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564765, "dur": 6, "args": { "External id": 198015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198015, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198015, "pid": 5, "tid": 7, "ts": 1716454224564765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510407, "dur": 12, "args": { "External id": 198015, "cbid": 211, "correlation": 198015 } }, { "ph": "s", "id": 198015, "pid": 76337, "tid": -914061504, "ts": 1716454224510407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224564773, "dur": 320, "args": { "External id": 198017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198017, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198017, "pid": 5, "tid": 7, "ts": 1716454224564773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510422, "dur": 5, "args": { "External id": 198017, "cbid": 211, "correlation": 198017 } }, { "ph": "s", "id": 198017, "pid": 76337, "tid": -914061504, "ts": 1716454224510422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224565095, "dur": 1, "args": { "External id": 198019, "device": 5, "context": 1, "stream": 7, "correlation": 198019, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 198019, "pid": 5, "tid": 7, "ts": 1716454224565095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224510433, "dur": 8, "args": { "External id": 198019, "cbid": 51, "correlation": 198019 } }, { "ph": "s", "id": 198019, "pid": 76337, "tid": -914061504, "ts": 1716454224510433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224565099, "dur": 494, "args": { "External id": 198020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198020, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198020, "pid": 5, "tid": 7, "ts": 1716454224565099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510442, "dur": 6, "args": { "External id": 198020, "cbid": 211, "correlation": 198020 } }, { "ph": "s", "id": 198020, "pid": 76337, "tid": -914061504, "ts": 1716454224510442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224565594, "dur": 6, "args": { "External id": 198022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198022, "pid": 5, "tid": 7, "ts": 1716454224565594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510451, "dur": 5, "args": { "External id": 198022, "cbid": 211, "correlation": 198022 } }, { "ph": "s", "id": 198022, "pid": 76337, "tid": -914061504, "ts": 1716454224510451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224565601, "dur": 6, "args": { "External id": 198028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198028, "pid": 5, "tid": 7, "ts": 1716454224565601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510481, "dur": 9, "args": { "External id": 198028, "cbid": 211, "correlation": 198028 } }, { "ph": "s", "id": 198028, "pid": 76337, "tid": -914061504, "ts": 1716454224510481, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224565609, "dur": 3, "args": { "External id": 198036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198036, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 198036, "pid": 5, "tid": 7, "ts": 1716454224565609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510524, "dur": 9, "args": { "External id": 198036, "cbid": 211, "correlation": 198036 } }, { "ph": "s", "id": 198036, "pid": 76337, "tid": -914061504, "ts": 1716454224510524, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224510587, "dur": 1, "args": { "External id": 198052, "cbid": 251, "correlation": 198052 } }, { "ph": "f", "id": 198052, "pid": 76337, "tid": -914061504, "ts": 1716454224510587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224510592, "dur": 0, "args": { "External id": 198054, "cbid": 251, "correlation": 198054 } }, { "ph": "f", "id": 198054, "pid": 76337, "tid": -914061504, "ts": 1716454224510592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224565614, "dur": 11, "args": { "External id": 198055, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198055, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198055, "pid": 5, "tid": 7, "ts": 1716454224565614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510594, "dur": 11, "args": { "External id": 198055, "cbid": 211, "correlation": 198055 } }, { "ph": "s", "id": 198055, "pid": 76337, "tid": -914061504, "ts": 1716454224510594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224565626, "dur": 5, "args": { "External id": 198057, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198057, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198057, "pid": 5, "tid": 7, "ts": 1716454224565626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510606, "dur": 6, "args": { "External id": 198057, "cbid": 211, "correlation": 198057 } }, { "ph": "s", "id": 198057, "pid": 76337, "tid": -914061504, "ts": 1716454224510606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224565632, "dur": 6, "args": { "External id": 198067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198067, "pid": 5, "tid": 7, "ts": 1716454224565632, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510663, "dur": 12, "args": { "External id": 198067, "cbid": 211, "correlation": 198067 } }, { "ph": "s", "id": 198067, "pid": 76337, "tid": -914061504, "ts": 1716454224510663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224565639, "dur": 9, "args": { "External id": 198087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198087, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198087, "pid": 5, "tid": 7, "ts": 1716454224565639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510729, "dur": 11, "args": { "External id": 198087, "cbid": 211, "correlation": 198087 } }, { "ph": "s", "id": 198087, "pid": 76337, "tid": -914061504, "ts": 1716454224510729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224565650, "dur": 4, "args": { "External id": 198099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198099, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 198099, "pid": 5, "tid": 7, "ts": 1716454224565650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510750, "dur": 6, "args": { "External id": 198099, "cbid": 211, "correlation": 198099 } }, { "ph": "s", "id": 198099, "pid": 76337, "tid": -914061504, "ts": 1716454224510750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224565655, "dur": 7, "args": { "External id": 198102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198102, "pid": 5, "tid": 7, "ts": 1716454224565655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510769, "dur": 6, "args": { "External id": 198102, "cbid": 211, "correlation": 198102 } }, { "ph": "s", "id": 198102, "pid": 76337, "tid": -914061504, "ts": 1716454224510769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224565663, "dur": 5, "args": { "External id": 198111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198111, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198111, "pid": 5, "tid": 7, "ts": 1716454224565663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510808, "dur": 10, "args": { "External id": 198111, "cbid": 211, "correlation": 198111 } }, { "ph": "s", "id": 198111, "pid": 76337, "tid": -914061504, "ts": 1716454224510808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224510871, "dur": 0, "args": { "External id": 198121, "cbid": 317, "correlation": 198121 } }, { "ph": "f", "id": 198121, "pid": 76337, "tid": -914061504, "ts": 1716454224510871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224510872, "dur": 0, "args": { "External id": 198122, "cbid": 203, "correlation": 198122 } }, { "ph": "f", "id": 198122, "pid": 76337, "tid": -914061504, "ts": 1716454224510872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224510873, "dur": 0, "args": { "External id": 198123, "cbid": 205, "correlation": 198123 } }, { "ph": "f", "id": 198123, "pid": 76337, "tid": -914061504, "ts": 1716454224510873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224565669, "dur": 5, "args": { "External id": 198127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198127, "pid": 5, "tid": 7, "ts": 1716454224565669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510887, "dur": 12, "args": { "External id": 198127, "cbid": 211, "correlation": 198127 } }, { "ph": "s", "id": 198127, "pid": 76337, "tid": -914061504, "ts": 1716454224510887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224565675, "dur": 161, "args": { "External id": 198129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198129, "pid": 5, "tid": 7, "ts": 1716454224565675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510901, "dur": 5, "args": { "External id": 198129, "cbid": 211, "correlation": 198129 } }, { "ph": "s", "id": 198129, "pid": 76337, "tid": -914061504, "ts": 1716454224510901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224565838, "dur": 1, "args": { "External id": 198131, "device": 5, "context": 1, "stream": 7, "correlation": 198131, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 198131, "pid": 5, "tid": 7, "ts": 1716454224565838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224510911, "dur": 8, "args": { "External id": 198131, "cbid": 51, "correlation": 198131 } }, { "ph": "s", "id": 198131, "pid": 76337, "tid": -914061504, "ts": 1716454224510911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224565842, "dur": 258, "args": { "External id": 198132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198132, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198132, "pid": 5, "tid": 7, "ts": 1716454224565842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510920, "dur": 6, "args": { "External id": 198132, "cbid": 211, "correlation": 198132 } }, { "ph": "s", "id": 198132, "pid": 76337, "tid": -914061504, "ts": 1716454224510920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224566102, "dur": 6, "args": { "External id": 198134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198134, "pid": 5, "tid": 7, "ts": 1716454224566102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510930, "dur": 5, "args": { "External id": 198134, "cbid": 211, "correlation": 198134 } }, { "ph": "s", "id": 198134, "pid": 76337, "tid": -914061504, "ts": 1716454224510930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224566109, "dur": 6, "args": { "External id": 198140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198140, "pid": 5, "tid": 7, "ts": 1716454224566109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224510958, "dur": 10, "args": { "External id": 198140, "cbid": 211, "correlation": 198140 } }, { "ph": "s", "id": 198140, "pid": 76337, "tid": -914061504, "ts": 1716454224510958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224511027, "dur": 0, "args": { "External id": 198150, "cbid": 317, "correlation": 198150 } }, { "ph": "f", "id": 198150, "pid": 76337, "tid": -914061504, "ts": 1716454224511027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224511028, "dur": 0, "args": { "External id": 198151, "cbid": 203, "correlation": 198151 } }, { "ph": "f", "id": 198151, "pid": 76337, "tid": -914061504, "ts": 1716454224511028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224511028, "dur": 0, "args": { "External id": 198152, "cbid": 205, "correlation": 198152 } }, { "ph": "f", "id": 198152, "pid": 76337, "tid": -914061504, "ts": 1716454224511028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224566117, "dur": 7, "args": { "External id": 198156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198156, "pid": 5, "tid": 7, "ts": 1716454224566117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511043, "dur": 12, "args": { "External id": 198156, "cbid": 211, "correlation": 198156 } }, { "ph": "s", "id": 198156, "pid": 76337, "tid": -914061504, "ts": 1716454224511043, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224566125, "dur": 3, "args": { "External id": 198158, "device": 5, "context": 1, "stream": 7, "correlation": 198158, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 198158, "pid": 5, "tid": 7, "ts": 1716454224566125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224511062, "dur": 21, "args": { "External id": 198158, "cbid": 51, "correlation": 198158 } }, { "ph": "s", "id": 198158, "pid": 76337, "tid": -914061504, "ts": 1716454224511062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224566129, "dur": 97, "args": { "External id": 198159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198159, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 198159, "pid": 5, "tid": 7, "ts": 1716454224566129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511084, "dur": 6, "args": { "External id": 198159, "cbid": 211, "correlation": 198159 } }, { "ph": "s", "id": 198159, "pid": 76337, "tid": -914061504, "ts": 1716454224511084, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224566228, "dur": 6, "args": { "External id": 198161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198161, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198161, "pid": 5, "tid": 7, "ts": 1716454224566228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511095, "dur": 5, "args": { "External id": 198161, "cbid": 211, "correlation": 198161 } }, { "ph": "s", "id": 198161, "pid": 76337, "tid": -914061504, "ts": 1716454224511095, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224566235, "dur": 6, "args": { "External id": 198167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198167, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198167, "pid": 5, "tid": 7, "ts": 1716454224566235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511123, "dur": 9, "args": { "External id": 198167, "cbid": 211, "correlation": 198167 } }, { "ph": "s", "id": 198167, "pid": 76337, "tid": -914061504, "ts": 1716454224511123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224566242, "dur": 5, "args": { "External id": 198175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198175, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198175, "pid": 5, "tid": 7, "ts": 1716454224566242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511153, "dur": 9, "args": { "External id": 198175, "cbid": 211, "correlation": 198175 } }, { "ph": "s", "id": 198175, "pid": 76337, "tid": -914061504, "ts": 1716454224511153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224566248, "dur": 4, "args": { "External id": 198183, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198183, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198183, "pid": 5, "tid": 7, "ts": 1716454224566248, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511183, "dur": 8, "args": { "External id": 198183, "cbid": 211, "correlation": 198183 } }, { "ph": "s", "id": 198183, "pid": 76337, "tid": -914061504, "ts": 1716454224511183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224566254, "dur": 11, "args": { "External id": 198192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198192, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198192, "pid": 5, "tid": 7, "ts": 1716454224566254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511227, "dur": 10, "args": { "External id": 198192, "cbid": 211, "correlation": 198192 } }, { "ph": "s", "id": 198192, "pid": 76337, "tid": -914061504, "ts": 1716454224511227, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224566266, "dur": 12, "args": { "External id": 198212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198212, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198212, "pid": 5, "tid": 7, "ts": 1716454224566266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511296, "dur": 11, "args": { "External id": 198212, "cbid": 211, "correlation": 198212 } }, { "ph": "s", "id": 198212, "pid": 76337, "tid": -914061504, "ts": 1716454224511296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224566280, "dur": 4, "args": { "External id": 198224, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198224, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198224, "pid": 5, "tid": 7, "ts": 1716454224566280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511319, "dur": 6, "args": { "External id": 198224, "cbid": 211, "correlation": 198224 } }, { "ph": "s", "id": 198224, "pid": 76337, "tid": -914061504, "ts": 1716454224511319, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224566285, "dur": 10, "args": { "External id": 198227, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198227, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198227, "pid": 5, "tid": 7, "ts": 1716454224566285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511336, "dur": 7, "args": { "External id": 198227, "cbid": 211, "correlation": 198227 } }, { "ph": "s", "id": 198227, "pid": 76337, "tid": -914061504, "ts": 1716454224511336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224566297, "dur": 6, "args": { "External id": 198236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198236, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198236, "pid": 5, "tid": 7, "ts": 1716454224566297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511375, "dur": 10, "args": { "External id": 198236, "cbid": 211, "correlation": 198236 } }, { "ph": "s", "id": 198236, "pid": 76337, "tid": -914061504, "ts": 1716454224511375, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224511428, "dur": 0, "args": { "External id": 198246, "cbid": 317, "correlation": 198246 } }, { "ph": "f", "id": 198246, "pid": 76337, "tid": -914061504, "ts": 1716454224511428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224511428, "dur": 0, "args": { "External id": 198247, "cbid": 203, "correlation": 198247 } }, { "ph": "f", "id": 198247, "pid": 76337, "tid": -914061504, "ts": 1716454224511428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224511429, "dur": 0, "args": { "External id": 198248, "cbid": 205, "correlation": 198248 } }, { "ph": "f", "id": 198248, "pid": 76337, "tid": -914061504, "ts": 1716454224511429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224566305, "dur": 6, "args": { "External id": 198252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198252, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198252, "pid": 5, "tid": 7, "ts": 1716454224566305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511442, "dur": 11, "args": { "External id": 198252, "cbid": 211, "correlation": 198252 } }, { "ph": "s", "id": 198252, "pid": 76337, "tid": -914061504, "ts": 1716454224511442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224566313, "dur": 321, "args": { "External id": 198254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198254, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198254, "pid": 5, "tid": 7, "ts": 1716454224566313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511456, "dur": 5, "args": { "External id": 198254, "cbid": 211, "correlation": 198254 } }, { "ph": "s", "id": 198254, "pid": 76337, "tid": -914061504, "ts": 1716454224511456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224566635, "dur": 1, "args": { "External id": 198256, "device": 5, "context": 1, "stream": 7, "correlation": 198256, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 198256, "pid": 5, "tid": 7, "ts": 1716454224566635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224511467, "dur": 6, "args": { "External id": 198256, "cbid": 51, "correlation": 198256 } }, { "ph": "s", "id": 198256, "pid": 76337, "tid": -914061504, "ts": 1716454224511467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224566639, "dur": 497, "args": { "External id": 198257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198257, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198257, "pid": 5, "tid": 7, "ts": 1716454224566639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511474, "dur": 6, "args": { "External id": 198257, "cbid": 211, "correlation": 198257 } }, { "ph": "s", "id": 198257, "pid": 76337, "tid": -914061504, "ts": 1716454224511474, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567137, "dur": 6, "args": { "External id": 198259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198259, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198259, "pid": 5, "tid": 7, "ts": 1716454224567137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511484, "dur": 5, "args": { "External id": 198259, "cbid": 211, "correlation": 198259 } }, { "ph": "s", "id": 198259, "pid": 76337, "tid": -914061504, "ts": 1716454224511484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567144, "dur": 6, "args": { "External id": 198265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198265, "pid": 5, "tid": 7, "ts": 1716454224567144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511512, "dur": 9, "args": { "External id": 198265, "cbid": 211, "correlation": 198265 } }, { "ph": "s", "id": 198265, "pid": 76337, "tid": -914061504, "ts": 1716454224511512, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224567152, "dur": 3, "args": { "External id": 198273, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198273, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 198273, "pid": 5, "tid": 7, "ts": 1716454224567152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511555, "dur": 10, "args": { "External id": 198273, "cbid": 211, "correlation": 198273 } }, { "ph": "s", "id": 198273, "pid": 76337, "tid": -914061504, "ts": 1716454224511555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224511617, "dur": 1, "args": { "External id": 198289, "cbid": 251, "correlation": 198289 } }, { "ph": "f", "id": 198289, "pid": 76337, "tid": -914061504, "ts": 1716454224511617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224511623, "dur": 0, "args": { "External id": 198291, "cbid": 251, "correlation": 198291 } }, { "ph": "f", "id": 198291, "pid": 76337, "tid": -914061504, "ts": 1716454224511623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224567156, "dur": 13, "args": { "External id": 198292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198292, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198292, "pid": 5, "tid": 7, "ts": 1716454224567156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511624, "dur": 11, "args": { "External id": 198292, "cbid": 211, "correlation": 198292 } }, { "ph": "s", "id": 198292, "pid": 76337, "tid": -914061504, "ts": 1716454224511624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224567171, "dur": 5, "args": { "External id": 198294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198294, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198294, "pid": 5, "tid": 7, "ts": 1716454224567171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511637, "dur": 5, "args": { "External id": 198294, "cbid": 211, "correlation": 198294 } }, { "ph": "s", "id": 198294, "pid": 76337, "tid": -914061504, "ts": 1716454224511637, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567178, "dur": 6, "args": { "External id": 198304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198304, "pid": 5, "tid": 7, "ts": 1716454224567178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511693, "dur": 12, "args": { "External id": 198304, "cbid": 211, "correlation": 198304 } }, { "ph": "s", "id": 198304, "pid": 76337, "tid": -914061504, "ts": 1716454224511693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224567185, "dur": 9, "args": { "External id": 198324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198324, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198324, "pid": 5, "tid": 7, "ts": 1716454224567185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511759, "dur": 11, "args": { "External id": 198324, "cbid": 211, "correlation": 198324 } }, { "ph": "s", "id": 198324, "pid": 76337, "tid": -914061504, "ts": 1716454224511759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224567195, "dur": 4, "args": { "External id": 198336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198336, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 198336, "pid": 5, "tid": 7, "ts": 1716454224567195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511780, "dur": 6, "args": { "External id": 198336, "cbid": 211, "correlation": 198336 } }, { "ph": "s", "id": 198336, "pid": 76337, "tid": -914061504, "ts": 1716454224511780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567200, "dur": 7, "args": { "External id": 198339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198339, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198339, "pid": 5, "tid": 7, "ts": 1716454224567200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511798, "dur": 6, "args": { "External id": 198339, "cbid": 211, "correlation": 198339 } }, { "ph": "s", "id": 198339, "pid": 76337, "tid": -914061504, "ts": 1716454224511798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224567208, "dur": 5, "args": { "External id": 198348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198348, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198348, "pid": 5, "tid": 7, "ts": 1716454224567208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511838, "dur": 10, "args": { "External id": 198348, "cbid": 211, "correlation": 198348 } }, { "ph": "s", "id": 198348, "pid": 76337, "tid": -914061504, "ts": 1716454224511838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224511900, "dur": 0, "args": { "External id": 198358, "cbid": 317, "correlation": 198358 } }, { "ph": "f", "id": 198358, "pid": 76337, "tid": -914061504, "ts": 1716454224511900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224511901, "dur": 0, "args": { "External id": 198359, "cbid": 203, "correlation": 198359 } }, { "ph": "f", "id": 198359, "pid": 76337, "tid": -914061504, "ts": 1716454224511901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224511901, "dur": 0, "args": { "External id": 198360, "cbid": 205, "correlation": 198360 } }, { "ph": "f", "id": 198360, "pid": 76337, "tid": -914061504, "ts": 1716454224511901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567214, "dur": 5, "args": { "External id": 198364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198364, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198364, "pid": 5, "tid": 7, "ts": 1716454224567214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511916, "dur": 12, "args": { "External id": 198364, "cbid": 211, "correlation": 198364 } }, { "ph": "s", "id": 198364, "pid": 76337, "tid": -914061504, "ts": 1716454224511916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567221, "dur": 162, "args": { "External id": 198366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198366, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198366, "pid": 5, "tid": 7, "ts": 1716454224567221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511931, "dur": 5, "args": { "External id": 198366, "cbid": 211, "correlation": 198366 } }, { "ph": "s", "id": 198366, "pid": 76337, "tid": -914061504, "ts": 1716454224511931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224567385, "dur": 1, "args": { "External id": 198368, "device": 5, "context": 1, "stream": 7, "correlation": 198368, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 198368, "pid": 5, "tid": 7, "ts": 1716454224567385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224511941, "dur": 6, "args": { "External id": 198368, "cbid": 51, "correlation": 198368 } }, { "ph": "s", "id": 198368, "pid": 76337, "tid": -914061504, "ts": 1716454224511941, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224567388, "dur": 260, "args": { "External id": 198369, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198369, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198369, "pid": 5, "tid": 7, "ts": 1716454224567388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511949, "dur": 6, "args": { "External id": 198369, "cbid": 211, "correlation": 198369 } }, { "ph": "s", "id": 198369, "pid": 76337, "tid": -914061504, "ts": 1716454224511949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567649, "dur": 6, "args": { "External id": 198371, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198371, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198371, "pid": 5, "tid": 7, "ts": 1716454224567649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511960, "dur": 6, "args": { "External id": 198371, "cbid": 211, "correlation": 198371 } }, { "ph": "s", "id": 198371, "pid": 76337, "tid": -914061504, "ts": 1716454224511960, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567657, "dur": 6, "args": { "External id": 198377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198377, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198377, "pid": 5, "tid": 7, "ts": 1716454224567657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224511997, "dur": 9, "args": { "External id": 198377, "cbid": 211, "correlation": 198377 } }, { "ph": "s", "id": 198377, "pid": 76337, "tid": -914061504, "ts": 1716454224511997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224512057, "dur": 0, "args": { "External id": 198387, "cbid": 317, "correlation": 198387 } }, { "ph": "f", "id": 198387, "pid": 76337, "tid": -914061504, "ts": 1716454224512057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224512058, "dur": 0, "args": { "External id": 198388, "cbid": 203, "correlation": 198388 } }, { "ph": "f", "id": 198388, "pid": 76337, "tid": -914061504, "ts": 1716454224512058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224512059, "dur": 0, "args": { "External id": 198389, "cbid": 205, "correlation": 198389 } }, { "ph": "f", "id": 198389, "pid": 76337, "tid": -914061504, "ts": 1716454224512059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567664, "dur": 7, "args": { "External id": 198393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198393, "pid": 5, "tid": 7, "ts": 1716454224567664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512071, "dur": 11, "args": { "External id": 198393, "cbid": 211, "correlation": 198393 } }, { "ph": "s", "id": 198393, "pid": 76337, "tid": -914061504, "ts": 1716454224512071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224567673, "dur": 3, "args": { "External id": 198395, "device": 5, "context": 1, "stream": 7, "correlation": 198395, "bytes": 4800, "memory bandwidth (GB/s)": 1.3513513513513513 } }, { "ph": "f", "id": 198395, "pid": 5, "tid": 7, "ts": 1716454224567673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224512088, "dur": 10, "args": { "External id": 198395, "cbid": 51, "correlation": 198395 } }, { "ph": "s", "id": 198395, "pid": 76337, "tid": -914061504, "ts": 1716454224512088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224567677, "dur": 94, "args": { "External id": 198396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198396, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 198396, "pid": 5, "tid": 7, "ts": 1716454224567677, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512098, "dur": 6, "args": { "External id": 198396, "cbid": 211, "correlation": 198396 } }, { "ph": "s", "id": 198396, "pid": 76337, "tid": -914061504, "ts": 1716454224512098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567772, "dur": 6, "args": { "External id": 198398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198398, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198398, "pid": 5, "tid": 7, "ts": 1716454224567772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512108, "dur": 6, "args": { "External id": 198398, "cbid": 211, "correlation": 198398 } }, { "ph": "s", "id": 198398, "pid": 76337, "tid": -914061504, "ts": 1716454224512108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567780, "dur": 6, "args": { "External id": 198404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198404, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198404, "pid": 5, "tid": 7, "ts": 1716454224567780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512135, "dur": 8, "args": { "External id": 198404, "cbid": 211, "correlation": 198404 } }, { "ph": "s", "id": 198404, "pid": 76337, "tid": -914061504, "ts": 1716454224512135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224567787, "dur": 5, "args": { "External id": 198412, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198412, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198412, "pid": 5, "tid": 7, "ts": 1716454224567787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512166, "dur": 8, "args": { "External id": 198412, "cbid": 211, "correlation": 198412 } }, { "ph": "s", "id": 198412, "pid": 76337, "tid": -914061504, "ts": 1716454224512166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224567793, "dur": 4, "args": { "External id": 198420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198420, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198420, "pid": 5, "tid": 7, "ts": 1716454224567793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512194, "dur": 8, "args": { "External id": 198420, "cbid": 211, "correlation": 198420 } }, { "ph": "s", "id": 198420, "pid": 76337, "tid": -914061504, "ts": 1716454224512194, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224567799, "dur": 11, "args": { "External id": 198429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198429, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198429, "pid": 5, "tid": 7, "ts": 1716454224567799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512238, "dur": 10, "args": { "External id": 198429, "cbid": 211, "correlation": 198429 } }, { "ph": "s", "id": 198429, "pid": 76337, "tid": -914061504, "ts": 1716454224512238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224567812, "dur": 12, "args": { "External id": 198449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198449, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198449, "pid": 5, "tid": 7, "ts": 1716454224567812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512308, "dur": 12, "args": { "External id": 198449, "cbid": 211, "correlation": 198449 } }, { "ph": "s", "id": 198449, "pid": 76337, "tid": -914061504, "ts": 1716454224512308, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224567825, "dur": 4, "args": { "External id": 198461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198461, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198461, "pid": 5, "tid": 7, "ts": 1716454224567825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512330, "dur": 6, "args": { "External id": 198461, "cbid": 211, "correlation": 198461 } }, { "ph": "s", "id": 198461, "pid": 76337, "tid": -914061504, "ts": 1716454224512330, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224567831, "dur": 11, "args": { "External id": 198464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198464, "pid": 5, "tid": 7, "ts": 1716454224567831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512347, "dur": 6, "args": { "External id": 198464, "cbid": 211, "correlation": 198464 } }, { "ph": "s", "id": 198464, "pid": 76337, "tid": -914061504, "ts": 1716454224512347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224567843, "dur": 6, "args": { "External id": 198473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198473, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198473, "pid": 5, "tid": 7, "ts": 1716454224567843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512385, "dur": 9, "args": { "External id": 198473, "cbid": 211, "correlation": 198473 } }, { "ph": "s", "id": 198473, "pid": 76337, "tid": -914061504, "ts": 1716454224512385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224512437, "dur": 0, "args": { "External id": 198483, "cbid": 317, "correlation": 198483 } }, { "ph": "f", "id": 198483, "pid": 76337, "tid": -914061504, "ts": 1716454224512437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224512438, "dur": 0, "args": { "External id": 198484, "cbid": 203, "correlation": 198484 } }, { "ph": "f", "id": 198484, "pid": 76337, "tid": -914061504, "ts": 1716454224512438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224512438, "dur": 0, "args": { "External id": 198485, "cbid": 205, "correlation": 198485 } }, { "ph": "f", "id": 198485, "pid": 76337, "tid": -914061504, "ts": 1716454224512438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567850, "dur": 7, "args": { "External id": 198489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198489, "pid": 5, "tid": 7, "ts": 1716454224567850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512451, "dur": 12, "args": { "External id": 198489, "cbid": 211, "correlation": 198489 } }, { "ph": "s", "id": 198489, "pid": 76337, "tid": -914061504, "ts": 1716454224512451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224567858, "dur": 319, "args": { "External id": 198491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198491, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198491, "pid": 5, "tid": 7, "ts": 1716454224567858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512466, "dur": 5, "args": { "External id": 198491, "cbid": 211, "correlation": 198491 } }, { "ph": "s", "id": 198491, "pid": 76337, "tid": -914061504, "ts": 1716454224512466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224568180, "dur": 1, "args": { "External id": 198493, "device": 5, "context": 1, "stream": 7, "correlation": 198493, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 198493, "pid": 5, "tid": 7, "ts": 1716454224568180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224512476, "dur": 6, "args": { "External id": 198493, "cbid": 51, "correlation": 198493 } }, { "ph": "s", "id": 198493, "pid": 76337, "tid": -914061504, "ts": 1716454224512476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224568183, "dur": 495, "args": { "External id": 198494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198494, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198494, "pid": 5, "tid": 7, "ts": 1716454224568183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512484, "dur": 6, "args": { "External id": 198494, "cbid": 211, "correlation": 198494 } }, { "ph": "s", "id": 198494, "pid": 76337, "tid": -914061504, "ts": 1716454224512484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224568680, "dur": 5, "args": { "External id": 198496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198496, "pid": 5, "tid": 7, "ts": 1716454224568680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512494, "dur": 5, "args": { "External id": 198496, "cbid": 211, "correlation": 198496 } }, { "ph": "s", "id": 198496, "pid": 76337, "tid": -914061504, "ts": 1716454224512494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224568687, "dur": 6, "args": { "External id": 198502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198502, "pid": 5, "tid": 7, "ts": 1716454224568687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512522, "dur": 9, "args": { "External id": 198502, "cbid": 211, "correlation": 198502 } }, { "ph": "s", "id": 198502, "pid": 76337, "tid": -914061504, "ts": 1716454224512522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224568695, "dur": 3, "args": { "External id": 198510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198510, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 198510, "pid": 5, "tid": 7, "ts": 1716454224568695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512567, "dur": 9, "args": { "External id": 198510, "cbid": 211, "correlation": 198510 } }, { "ph": "s", "id": 198510, "pid": 76337, "tid": -914061504, "ts": 1716454224512567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224512628, "dur": 1, "args": { "External id": 198526, "cbid": 251, "correlation": 198526 } }, { "ph": "f", "id": 198526, "pid": 76337, "tid": -914061504, "ts": 1716454224512628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224512633, "dur": 0, "args": { "External id": 198528, "cbid": 251, "correlation": 198528 } }, { "ph": "f", "id": 198528, "pid": 76337, "tid": -914061504, "ts": 1716454224512633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224568699, "dur": 13, "args": { "External id": 198529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198529, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198529, "pid": 5, "tid": 7, "ts": 1716454224568699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512635, "dur": 11, "args": { "External id": 198529, "cbid": 211, "correlation": 198529 } }, { "ph": "s", "id": 198529, "pid": 76337, "tid": -914061504, "ts": 1716454224512635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224568713, "dur": 5, "args": { "External id": 198531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198531, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198531, "pid": 5, "tid": 7, "ts": 1716454224568713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512647, "dur": 5, "args": { "External id": 198531, "cbid": 211, "correlation": 198531 } }, { "ph": "s", "id": 198531, "pid": 76337, "tid": -914061504, "ts": 1716454224512647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224568720, "dur": 6, "args": { "External id": 198541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198541, "pid": 5, "tid": 7, "ts": 1716454224568720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512702, "dur": 12, "args": { "External id": 198541, "cbid": 211, "correlation": 198541 } }, { "ph": "s", "id": 198541, "pid": 76337, "tid": -914061504, "ts": 1716454224512702, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224568727, "dur": 9, "args": { "External id": 198561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198561, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198561, "pid": 5, "tid": 7, "ts": 1716454224568727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512769, "dur": 11, "args": { "External id": 198561, "cbid": 211, "correlation": 198561 } }, { "ph": "s", "id": 198561, "pid": 76337, "tid": -914061504, "ts": 1716454224512769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224568737, "dur": 4, "args": { "External id": 198573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198573, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 198573, "pid": 5, "tid": 7, "ts": 1716454224568737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512789, "dur": 6, "args": { "External id": 198573, "cbid": 211, "correlation": 198573 } }, { "ph": "s", "id": 198573, "pid": 76337, "tid": -914061504, "ts": 1716454224512789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224568742, "dur": 6, "args": { "External id": 198576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198576, "pid": 5, "tid": 7, "ts": 1716454224568742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512808, "dur": 7, "args": { "External id": 198576, "cbid": 211, "correlation": 198576 } }, { "ph": "s", "id": 198576, "pid": 76337, "tid": -914061504, "ts": 1716454224512808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224568750, "dur": 5, "args": { "External id": 198585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198585, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198585, "pid": 5, "tid": 7, "ts": 1716454224568750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512848, "dur": 10, "args": { "External id": 198585, "cbid": 211, "correlation": 198585 } }, { "ph": "s", "id": 198585, "pid": 76337, "tid": -914061504, "ts": 1716454224512848, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224512910, "dur": 0, "args": { "External id": 198595, "cbid": 317, "correlation": 198595 } }, { "ph": "f", "id": 198595, "pid": 76337, "tid": -914061504, "ts": 1716454224512910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224512911, "dur": 0, "args": { "External id": 198596, "cbid": 203, "correlation": 198596 } }, { "ph": "f", "id": 198596, "pid": 76337, "tid": -914061504, "ts": 1716454224512911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224512911, "dur": 0, "args": { "External id": 198597, "cbid": 205, "correlation": 198597 } }, { "ph": "f", "id": 198597, "pid": 76337, "tid": -914061504, "ts": 1716454224512911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224568756, "dur": 5, "args": { "External id": 198601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198601, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198601, "pid": 5, "tid": 7, "ts": 1716454224568756, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512926, "dur": 12, "args": { "External id": 198601, "cbid": 211, "correlation": 198601 } }, { "ph": "s", "id": 198601, "pid": 76337, "tid": -914061504, "ts": 1716454224512926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224568762, "dur": 163, "args": { "External id": 198603, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198603, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198603, "pid": 5, "tid": 7, "ts": 1716454224568762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512940, "dur": 5, "args": { "External id": 198603, "cbid": 211, "correlation": 198603 } }, { "ph": "s", "id": 198603, "pid": 76337, "tid": -914061504, "ts": 1716454224512940, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224568927, "dur": 1, "args": { "External id": 198605, "device": 5, "context": 1, "stream": 7, "correlation": 198605, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 198605, "pid": 5, "tid": 7, "ts": 1716454224568927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224512951, "dur": 6, "args": { "External id": 198605, "cbid": 51, "correlation": 198605 } }, { "ph": "s", "id": 198605, "pid": 76337, "tid": -914061504, "ts": 1716454224512951, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224568931, "dur": 259, "args": { "External id": 198606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198606, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198606, "pid": 5, "tid": 7, "ts": 1716454224568931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512958, "dur": 6, "args": { "External id": 198606, "cbid": 211, "correlation": 198606 } }, { "ph": "s", "id": 198606, "pid": 76337, "tid": -914061504, "ts": 1716454224512958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224569191, "dur": 6, "args": { "External id": 198608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198608, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198608, "pid": 5, "tid": 7, "ts": 1716454224569191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224512968, "dur": 13, "args": { "External id": 198608, "cbid": 211, "correlation": 198608 } }, { "ph": "s", "id": 198608, "pid": 76337, "tid": -914061504, "ts": 1716454224512968, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224569198, "dur": 6, "args": { "External id": 198614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198614, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198614, "pid": 5, "tid": 7, "ts": 1716454224569198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513004, "dur": 9, "args": { "External id": 198614, "cbid": 211, "correlation": 198614 } }, { "ph": "s", "id": 198614, "pid": 76337, "tid": -914061504, "ts": 1716454224513004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224513063, "dur": 0, "args": { "External id": 198624, "cbid": 317, "correlation": 198624 } }, { "ph": "f", "id": 198624, "pid": 76337, "tid": -914061504, "ts": 1716454224513063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224513064, "dur": 0, "args": { "External id": 198625, "cbid": 203, "correlation": 198625 } }, { "ph": "f", "id": 198625, "pid": 76337, "tid": -914061504, "ts": 1716454224513064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224513065, "dur": 0, "args": { "External id": 198626, "cbid": 205, "correlation": 198626 } }, { "ph": "f", "id": 198626, "pid": 76337, "tid": -914061504, "ts": 1716454224513065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224569206, "dur": 8, "args": { "External id": 198630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198630, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198630, "pid": 5, "tid": 7, "ts": 1716454224569206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513077, "dur": 12, "args": { "External id": 198630, "cbid": 211, "correlation": 198630 } }, { "ph": "s", "id": 198630, "pid": 76337, "tid": -914061504, "ts": 1716454224513077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224569215, "dur": 3, "args": { "External id": 198632, "device": 5, "context": 1, "stream": 7, "correlation": 198632, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 198632, "pid": 5, "tid": 7, "ts": 1716454224569215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224513094, "dur": 10, "args": { "External id": 198632, "cbid": 51, "correlation": 198632 } }, { "ph": "s", "id": 198632, "pid": 76337, "tid": -914061504, "ts": 1716454224513094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224569219, "dur": 93, "args": { "External id": 198633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198633, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 198633, "pid": 5, "tid": 7, "ts": 1716454224569219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513105, "dur": 6, "args": { "External id": 198633, "cbid": 211, "correlation": 198633 } }, { "ph": "s", "id": 198633, "pid": 76337, "tid": -914061504, "ts": 1716454224513105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224569314, "dur": 5, "args": { "External id": 198635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198635, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198635, "pid": 5, "tid": 7, "ts": 1716454224569314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513115, "dur": 6, "args": { "External id": 198635, "cbid": 211, "correlation": 198635 } }, { "ph": "s", "id": 198635, "pid": 76337, "tid": -914061504, "ts": 1716454224513115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224569320, "dur": 6, "args": { "External id": 198641, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198641, "pid": 5, "tid": 7, "ts": 1716454224569320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513142, "dur": 8, "args": { "External id": 198641, "cbid": 211, "correlation": 198641 } }, { "ph": "s", "id": 198641, "pid": 76337, "tid": -914061504, "ts": 1716454224513142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224569328, "dur": 5, "args": { "External id": 198649, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198649, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198649, "pid": 5, "tid": 7, "ts": 1716454224569328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513171, "dur": 8, "args": { "External id": 198649, "cbid": 211, "correlation": 198649 } }, { "ph": "s", "id": 198649, "pid": 76337, "tid": -914061504, "ts": 1716454224513171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224569334, "dur": 4, "args": { "External id": 198657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198657, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 198657, "pid": 5, "tid": 7, "ts": 1716454224569334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513199, "dur": 8, "args": { "External id": 198657, "cbid": 211, "correlation": 198657 } }, { "ph": "s", "id": 198657, "pid": 76337, "tid": -914061504, "ts": 1716454224513199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224569339, "dur": 14, "args": { "External id": 198668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198668, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198668, "pid": 5, "tid": 7, "ts": 1716454224569339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513290, "dur": 14, "args": { "External id": 198668, "cbid": 211, "correlation": 198668 } }, { "ph": "s", "id": 198668, "pid": 76337, "tid": -914061504, "ts": 1716454224513290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224513348, "dur": 0, "args": { "External id": 198678, "cbid": 317, "correlation": 198678 } }, { "ph": "f", "id": 198678, "pid": 76337, "tid": -914061504, "ts": 1716454224513348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224513349, "dur": 0, "args": { "External id": 198679, "cbid": 203, "correlation": 198679 } }, { "ph": "f", "id": 198679, "pid": 76337, "tid": -914061504, "ts": 1716454224513349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224513349, "dur": 0, "args": { "External id": 198680, "cbid": 205, "correlation": 198680 } }, { "ph": "f", "id": 198680, "pid": 76337, "tid": -914061504, "ts": 1716454224513349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224569355, "dur": 9, "args": { "External id": 198684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198684, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198684, "pid": 5, "tid": 7, "ts": 1716454224569355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513365, "dur": 12, "args": { "External id": 198684, "cbid": 211, "correlation": 198684 } }, { "ph": "s", "id": 198684, "pid": 76337, "tid": -914061504, "ts": 1716454224513365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224569365, "dur": 162, "args": { "External id": 198686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198686, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198686, "pid": 5, "tid": 7, "ts": 1716454224569365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513380, "dur": 5, "args": { "External id": 198686, "cbid": 211, "correlation": 198686 } }, { "ph": "s", "id": 198686, "pid": 76337, "tid": -914061504, "ts": 1716454224513380, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224569529, "dur": 1, "args": { "External id": 198688, "device": 5, "context": 1, "stream": 7, "correlation": 198688, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 198688, "pid": 5, "tid": 7, "ts": 1716454224569529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224513390, "dur": 6, "args": { "External id": 198688, "cbid": 51, "correlation": 198688 } }, { "ph": "s", "id": 198688, "pid": 76337, "tid": -914061504, "ts": 1716454224513390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224569533, "dur": 648, "args": { "External id": 198689, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198689, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198689, "pid": 5, "tid": 7, "ts": 1716454224569533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513397, "dur": 6, "args": { "External id": 198689, "cbid": 211, "correlation": 198689 } }, { "ph": "s", "id": 198689, "pid": 76337, "tid": -914061504, "ts": 1716454224513397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224570183, "dur": 13, "args": { "External id": 198691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198691, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198691, "pid": 5, "tid": 7, "ts": 1716454224570183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513407, "dur": 5, "args": { "External id": 198691, "cbid": 211, "correlation": 198691 } }, { "ph": "s", "id": 198691, "pid": 76337, "tid": -914061504, "ts": 1716454224513407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224570197, "dur": 14, "args": { "External id": 198697, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198697, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198697, "pid": 5, "tid": 7, "ts": 1716454224570197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513436, "dur": 9, "args": { "External id": 198697, "cbid": 211, "correlation": 198697 } }, { "ph": "s", "id": 198697, "pid": 76337, "tid": -914061504, "ts": 1716454224513436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224570213, "dur": 30, "args": { "External id": 198706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198706, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198706, "pid": 5, "tid": 7, "ts": 1716454224570213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513527, "dur": 13, "args": { "External id": 198706, "cbid": 211, "correlation": 198706 } }, { "ph": "s", "id": 198706, "pid": 76337, "tid": -914061504, "ts": 1716454224513527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224570244, "dur": 31, "args": { "External id": 198726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198726, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198726, "pid": 5, "tid": 7, "ts": 1716454224570244, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513595, "dur": 11, "args": { "External id": 198726, "cbid": 211, "correlation": 198726 } }, { "ph": "s", "id": 198726, "pid": 76337, "tid": -914061504, "ts": 1716454224513595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224570276, "dur": 4, "args": { "External id": 198738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198738, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198738, "pid": 5, "tid": 7, "ts": 1716454224570276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513619, "dur": 7, "args": { "External id": 198738, "cbid": 211, "correlation": 198738 } }, { "ph": "s", "id": 198738, "pid": 76337, "tid": -914061504, "ts": 1716454224513619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224570282, "dur": 30, "args": { "External id": 198741, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198741, "pid": 5, "tid": 7, "ts": 1716454224570282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513638, "dur": 6, "args": { "External id": 198741, "cbid": 211, "correlation": 198741 } }, { "ph": "s", "id": 198741, "pid": 76337, "tid": -914061504, "ts": 1716454224513638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224570313, "dur": 21, "args": { "External id": 198750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198750, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198750, "pid": 5, "tid": 7, "ts": 1716454224570313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513678, "dur": 10, "args": { "External id": 198750, "cbid": 211, "correlation": 198750 } }, { "ph": "s", "id": 198750, "pid": 76337, "tid": -914061504, "ts": 1716454224513678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224513733, "dur": 0, "args": { "External id": 198760, "cbid": 317, "correlation": 198760 } }, { "ph": "f", "id": 198760, "pid": 76337, "tid": -914061504, "ts": 1716454224513733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224513734, "dur": 0, "args": { "External id": 198761, "cbid": 203, "correlation": 198761 } }, { "ph": "f", "id": 198761, "pid": 76337, "tid": -914061504, "ts": 1716454224513734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224513735, "dur": 0, "args": { "External id": 198762, "cbid": 205, "correlation": 198762 } }, { "ph": "f", "id": 198762, "pid": 76337, "tid": -914061504, "ts": 1716454224513735, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224570335, "dur": 22, "args": { "External id": 198766, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198766, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198766, "pid": 5, "tid": 7, "ts": 1716454224570335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513751, "dur": 11, "args": { "External id": 198766, "cbid": 211, "correlation": 198766 } }, { "ph": "s", "id": 198766, "pid": 76337, "tid": -914061504, "ts": 1716454224513751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224570359, "dur": 319, "args": { "External id": 198768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198768, "pid": 5, "tid": 7, "ts": 1716454224570359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513765, "dur": 5, "args": { "External id": 198768, "cbid": 211, "correlation": 198768 } }, { "ph": "s", "id": 198768, "pid": 76337, "tid": -914061504, "ts": 1716454224513765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224570680, "dur": 1, "args": { "External id": 198770, "device": 5, "context": 1, "stream": 7, "correlation": 198770, "bytes": 960, "memory bandwidth (GB/s)": 0.600375234521576 } }, { "ph": "f", "id": 198770, "pid": 5, "tid": 7, "ts": 1716454224570680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224513776, "dur": 7, "args": { "External id": 198770, "cbid": 51, "correlation": 198770 } }, { "ph": "s", "id": 198770, "pid": 76337, "tid": -914061504, "ts": 1716454224513776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224570684, "dur": 1240, "args": { "External id": 198771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198771, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198771, "pid": 5, "tid": 7, "ts": 1716454224570684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513784, "dur": 6, "args": { "External id": 198771, "cbid": 211, "correlation": 198771 } }, { "ph": "s", "id": 198771, "pid": 76337, "tid": -914061504, "ts": 1716454224513784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224571925, "dur": 12, "args": { "External id": 198773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198773, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198773, "pid": 5, "tid": 7, "ts": 1716454224571925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513794, "dur": 5, "args": { "External id": 198773, "cbid": 211, "correlation": 198773 } }, { "ph": "s", "id": 198773, "pid": 76337, "tid": -914061504, "ts": 1716454224513794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224571939, "dur": 15, "args": { "External id": 198779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198779, "pid": 5, "tid": 7, "ts": 1716454224571939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513824, "dur": 10, "args": { "External id": 198779, "cbid": 211, "correlation": 198779 } }, { "ph": "s", "id": 198779, "pid": 76337, "tid": -914061504, "ts": 1716454224513824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224571955, "dur": 3, "args": { "External id": 198787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198787, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 198787, "pid": 5, "tid": 7, "ts": 1716454224571955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513869, "dur": 10, "args": { "External id": 198787, "cbid": 211, "correlation": 198787 } }, { "ph": "s", "id": 198787, "pid": 76337, "tid": -914061504, "ts": 1716454224513869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224513937, "dur": 1, "args": { "External id": 198803, "cbid": 251, "correlation": 198803 } }, { "ph": "f", "id": 198803, "pid": 76337, "tid": -914061504, "ts": 1716454224513937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224513942, "dur": 0, "args": { "External id": 198805, "cbid": 251, "correlation": 198805 } }, { "ph": "f", "id": 198805, "pid": 76337, "tid": -914061504, "ts": 1716454224513942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224571960, "dur": 12, "args": { "External id": 198806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198806, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198806, "pid": 5, "tid": 7, "ts": 1716454224571960, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513944, "dur": 11, "args": { "External id": 198806, "cbid": 211, "correlation": 198806 } }, { "ph": "s", "id": 198806, "pid": 76337, "tid": -914061504, "ts": 1716454224513944, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224571973, "dur": 5, "args": { "External id": 198808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198808, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198808, "pid": 5, "tid": 7, "ts": 1716454224571973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224513957, "dur": 5, "args": { "External id": 198808, "cbid": 211, "correlation": 198808 } }, { "ph": "s", "id": 198808, "pid": 76337, "tid": -914061504, "ts": 1716454224513957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224571980, "dur": 16, "args": { "External id": 198818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198818, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198818, "pid": 5, "tid": 7, "ts": 1716454224571980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514025, "dur": 13, "args": { "External id": 198818, "cbid": 211, "correlation": 198818 } }, { "ph": "s", "id": 198818, "pid": 76337, "tid": -914061504, "ts": 1716454224514025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224571997, "dur": 17, "args": { "External id": 198838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198838, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198838, "pid": 5, "tid": 7, "ts": 1716454224571997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514094, "dur": 11, "args": { "External id": 198838, "cbid": 211, "correlation": 198838 } }, { "ph": "s", "id": 198838, "pid": 76337, "tid": -914061504, "ts": 1716454224514094, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224572016, "dur": 4, "args": { "External id": 198850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198850, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 198850, "pid": 5, "tid": 7, "ts": 1716454224572016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514114, "dur": 6, "args": { "External id": 198850, "cbid": 211, "correlation": 198850 } }, { "ph": "s", "id": 198850, "pid": 76337, "tid": -914061504, "ts": 1716454224514114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224572022, "dur": 17, "args": { "External id": 198853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198853, "pid": 5, "tid": 7, "ts": 1716454224572022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514134, "dur": 7, "args": { "External id": 198853, "cbid": 211, "correlation": 198853 } }, { "ph": "s", "id": 198853, "pid": 76337, "tid": -914061504, "ts": 1716454224514134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224572040, "dur": 11, "args": { "External id": 198862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198862, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198862, "pid": 5, "tid": 7, "ts": 1716454224572040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514174, "dur": 10, "args": { "External id": 198862, "cbid": 211, "correlation": 198862 } }, { "ph": "s", "id": 198862, "pid": 76337, "tid": -914061504, "ts": 1716454224514174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224514236, "dur": 0, "args": { "External id": 198872, "cbid": 317, "correlation": 198872 } }, { "ph": "f", "id": 198872, "pid": 76337, "tid": -914061504, "ts": 1716454224514236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224514237, "dur": 0, "args": { "External id": 198873, "cbid": 203, "correlation": 198873 } }, { "ph": "f", "id": 198873, "pid": 76337, "tid": -914061504, "ts": 1716454224514237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224514238, "dur": 0, "args": { "External id": 198874, "cbid": 205, "correlation": 198874 } }, { "ph": "f", "id": 198874, "pid": 76337, "tid": -914061504, "ts": 1716454224514238, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224572052, "dur": 11, "args": { "External id": 198878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198878, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198878, "pid": 5, "tid": 7, "ts": 1716454224572052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514255, "dur": 12, "args": { "External id": 198878, "cbid": 211, "correlation": 198878 } }, { "ph": "s", "id": 198878, "pid": 76337, "tid": -914061504, "ts": 1716454224514255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224572064, "dur": 163, "args": { "External id": 198880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198880, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198880, "pid": 5, "tid": 7, "ts": 1716454224572064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514270, "dur": 5, "args": { "External id": 198880, "cbid": 211, "correlation": 198880 } }, { "ph": "s", "id": 198880, "pid": 76337, "tid": -914061504, "ts": 1716454224514270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224572230, "dur": 1, "args": { "External id": 198882, "device": 5, "context": 1, "stream": 7, "correlation": 198882, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 198882, "pid": 5, "tid": 7, "ts": 1716454224572230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224514281, "dur": 8, "args": { "External id": 198882, "cbid": 51, "correlation": 198882 } }, { "ph": "s", "id": 198882, "pid": 76337, "tid": -914061504, "ts": 1716454224514281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224572233, "dur": 649, "args": { "External id": 198883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198883, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 198883, "pid": 5, "tid": 7, "ts": 1716454224572233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514290, "dur": 7, "args": { "External id": 198883, "cbid": 211, "correlation": 198883 } }, { "ph": "s", "id": 198883, "pid": 76337, "tid": -914061504, "ts": 1716454224514290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224572883, "dur": 12, "args": { "External id": 198885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198885, "pid": 5, "tid": 7, "ts": 1716454224572883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514300, "dur": 5, "args": { "External id": 198885, "cbid": 211, "correlation": 198885 } }, { "ph": "s", "id": 198885, "pid": 76337, "tid": -914061504, "ts": 1716454224514300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224572897, "dur": 15, "args": { "External id": 198891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198891, "pid": 5, "tid": 7, "ts": 1716454224572897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514328, "dur": 9, "args": { "External id": 198891, "cbid": 211, "correlation": 198891 } }, { "ph": "s", "id": 198891, "pid": 76337, "tid": -914061504, "ts": 1716454224514328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224514387, "dur": 0, "args": { "External id": 198901, "cbid": 317, "correlation": 198901 } }, { "ph": "f", "id": 198901, "pid": 76337, "tid": -914061504, "ts": 1716454224514387, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224514388, "dur": 0, "args": { "External id": 198902, "cbid": 203, "correlation": 198902 } }, { "ph": "f", "id": 198902, "pid": 76337, "tid": -914061504, "ts": 1716454224514388, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224514389, "dur": 0, "args": { "External id": 198903, "cbid": 205, "correlation": 198903 } }, { "ph": "f", "id": 198903, "pid": 76337, "tid": -914061504, "ts": 1716454224514389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224572913, "dur": 21, "args": { "External id": 198907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198907, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198907, "pid": 5, "tid": 7, "ts": 1716454224572913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514403, "dur": 11, "args": { "External id": 198907, "cbid": 211, "correlation": 198907 } }, { "ph": "s", "id": 198907, "pid": 76337, "tid": -914061504, "ts": 1716454224514403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224572935, "dur": 4, "args": { "External id": 198909, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198909, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 198909, "pid": 5, "tid": 7, "ts": 1716454224572935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514420, "dur": 6, "args": { "External id": 198909, "cbid": 211, "correlation": 198909 } }, { "ph": "s", "id": 198909, "pid": 76337, "tid": -914061504, "ts": 1716454224514420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224514430, "dur": 0, "args": { "External id": 198910, "cbid": 51, "correlation": 198910 } }, { "ph": "s", "id": 198910, "pid": 76337, "tid": -914061504, "ts": 1716454224514430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224572941, "dur": 175, "args": { "External id": 198911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198911, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 198911, "pid": 5, "tid": 7, "ts": 1716454224572941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514430, "dur": 6, "args": { "External id": 198911, "cbid": 211, "correlation": 198911 } }, { "ph": "s", "id": 198911, "pid": 76337, "tid": -914061504, "ts": 1716454224514430, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224573117, "dur": 16, "args": { "External id": 198916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198916, "pid": 5, "tid": 7, "ts": 1716454224573117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514457, "dur": 8, "args": { "External id": 198916, "cbid": 211, "correlation": 198916 } }, { "ph": "s", "id": 198916, "pid": 76337, "tid": -914061504, "ts": 1716454224514457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224573135, "dur": 11, "args": { "External id": 198924, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198924, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198924, "pid": 5, "tid": 7, "ts": 1716454224573135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514486, "dur": 8, "args": { "External id": 198924, "cbid": 211, "correlation": 198924 } }, { "ph": "s", "id": 198924, "pid": 76337, "tid": -914061504, "ts": 1716454224514486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224573147, "dur": 10, "args": { "External id": 198932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198932, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198932, "pid": 5, "tid": 7, "ts": 1716454224573147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514514, "dur": 8, "args": { "External id": 198932, "cbid": 211, "correlation": 198932 } }, { "ph": "s", "id": 198932, "pid": 76337, "tid": -914061504, "ts": 1716454224514514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224573158, "dur": 20, "args": { "External id": 198952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198952, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 198952, "pid": 5, "tid": 7, "ts": 1716454224573158, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514597, "dur": 12, "args": { "External id": 198952, "cbid": 211, "correlation": 198952 } }, { "ph": "s", "id": 198952, "pid": 76337, "tid": -914061504, "ts": 1716454224514597, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224573179, "dur": 5, "args": { "External id": 198964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198964, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 198964, "pid": 5, "tid": 7, "ts": 1716454224573179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514619, "dur": 6, "args": { "External id": 198964, "cbid": 211, "correlation": 198964 } }, { "ph": "s", "id": 198964, "pid": 76337, "tid": -914061504, "ts": 1716454224514619, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224573186, "dur": 17, "args": { "External id": 198967, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198967, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198967, "pid": 5, "tid": 7, "ts": 1716454224573186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514636, "dur": 7, "args": { "External id": 198967, "cbid": 211, "correlation": 198967 } }, { "ph": "s", "id": 198967, "pid": 76337, "tid": -914061504, "ts": 1716454224514636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224514694, "dur": 0, "args": { "External id": 198978, "cbid": 317, "correlation": 198978 } }, { "ph": "f", "id": 198978, "pid": 76337, "tid": -914061504, "ts": 1716454224514694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224514695, "dur": 0, "args": { "External id": 198979, "cbid": 203, "correlation": 198979 } }, { "ph": "f", "id": 198979, "pid": 76337, "tid": -914061504, "ts": 1716454224514695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224514696, "dur": 0, "args": { "External id": 198980, "cbid": 205, "correlation": 198980 } }, { "ph": "f", "id": 198980, "pid": 76337, "tid": -914061504, "ts": 1716454224514696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224573204, "dur": 11, "args": { "External id": 198984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198984, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198984, "pid": 5, "tid": 7, "ts": 1716454224573204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514709, "dur": 12, "args": { "External id": 198984, "cbid": 211, "correlation": 198984 } }, { "ph": "s", "id": 198984, "pid": 76337, "tid": -914061504, "ts": 1716454224514709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224573216, "dur": 3, "args": { "External id": 198986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 198986, "pid": 5, "tid": 7, "ts": 1716454224573216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514725, "dur": 5, "args": { "External id": 198986, "cbid": 211, "correlation": 198986 } }, { "ph": "s", "id": 198986, "pid": 76337, "tid": -914061504, "ts": 1716454224514725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224514733, "dur": 0, "args": { "External id": 198987, "cbid": 51, "correlation": 198987 } }, { "ph": "s", "id": 198987, "pid": 76337, "tid": -914061504, "ts": 1716454224514733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224573221, "dur": 93, "args": { "External id": 198988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198988, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 198988, "pid": 5, "tid": 7, "ts": 1716454224573221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514734, "dur": 5, "args": { "External id": 198988, "cbid": 211, "correlation": 198988 } }, { "ph": "s", "id": 198988, "pid": 76337, "tid": -914061504, "ts": 1716454224514734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224573315, "dur": 15, "args": { "External id": 198993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 198993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 198993, "pid": 5, "tid": 7, "ts": 1716454224573315, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514761, "dur": 9, "args": { "External id": 198993, "cbid": 211, "correlation": 198993 } }, { "ph": "s", "id": 198993, "pid": 76337, "tid": -914061504, "ts": 1716454224514761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224573332, "dur": 83, "args": { "External id": 199002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199002, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199002, "pid": 5, "tid": 7, "ts": 1716454224573332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514843, "dur": 15, "args": { "External id": 199002, "cbid": 211, "correlation": 199002 } }, { "ph": "s", "id": 199002, "pid": 76337, "tid": -914061504, "ts": 1716454224514843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224573417, "dur": 30, "args": { "External id": 199024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199024, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199024, "pid": 5, "tid": 7, "ts": 1716454224573417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224514905, "dur": 11, "args": { "External id": 199024, "cbid": 211, "correlation": 199024 } }, { "ph": "s", "id": 199024, "pid": 76337, "tid": -914061504, "ts": 1716454224514905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515005, "dur": 2, "args": { "External id": 199035, "cbid": 251, "correlation": 199035 } }, { "ph": "f", "id": 199035, "pid": 76337, "tid": -914061504, "ts": 1716454224515005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224573448, "dur": 163, "args": { "External id": 199036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199036, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199036, "pid": 5, "tid": 7, "ts": 1716454224573448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515011, "dur": 13, "args": { "External id": 199036, "cbid": 211, "correlation": 199036 } }, { "ph": "s", "id": 199036, "pid": 76337, "tid": -914061504, "ts": 1716454224515011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515083, "dur": 1, "args": { "External id": 199047, "cbid": 251, "correlation": 199047 } }, { "ph": "f", "id": 199047, "pid": 76337, "tid": -914061504, "ts": 1716454224515083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224573612, "dur": 157, "args": { "External id": 199048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199048, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199048, "pid": 5, "tid": 7, "ts": 1716454224573612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515087, "dur": 12, "args": { "External id": 199048, "cbid": 211, "correlation": 199048 } }, { "ph": "s", "id": 199048, "pid": 76337, "tid": -914061504, "ts": 1716454224515087, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515153, "dur": 1, "args": { "External id": 199059, "cbid": 251, "correlation": 199059 } }, { "ph": "f", "id": 199059, "pid": 76337, "tid": -914061504, "ts": 1716454224515153, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224573770, "dur": 136, "args": { "External id": 199060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199060, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199060, "pid": 5, "tid": 7, "ts": 1716454224573770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515157, "dur": 11, "args": { "External id": 199060, "cbid": 211, "correlation": 199060 } }, { "ph": "s", "id": 199060, "pid": 76337, "tid": -914061504, "ts": 1716454224515157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224573908, "dur": 334, "args": { "External id": 199085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199085, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199085, "pid": 5, "tid": 7, "ts": 1716454224573908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515244, "dur": 14, "args": { "External id": 199085, "cbid": 211, "correlation": 199085 } }, { "ph": "s", "id": 199085, "pid": 76337, "tid": -914061504, "ts": 1716454224515244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515346, "dur": 1, "args": { "External id": 199103, "cbid": 251, "correlation": 199103 } }, { "ph": "f", "id": 199103, "pid": 76337, "tid": -914061504, "ts": 1716454224515346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224574243, "dur": 167, "args": { "External id": 199105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199105, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199105, "pid": 5, "tid": 7, "ts": 1716454224574243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515352, "dur": 13, "args": { "External id": 199105, "cbid": 211, "correlation": 199105 } }, { "ph": "s", "id": 199105, "pid": 76337, "tid": -914061504, "ts": 1716454224515352, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224574412, "dur": 19, "args": { "External id": 199113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199113, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199113, "pid": 5, "tid": 7, "ts": 1716454224574412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515422, "dur": 12, "args": { "External id": 199113, "cbid": 211, "correlation": 199113 } }, { "ph": "s", "id": 199113, "pid": 76337, "tid": -914061504, "ts": 1716454224515422, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224574432, "dur": 27, "args": { "External id": 199121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199121, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199121, "pid": 5, "tid": 7, "ts": 1716454224574432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515461, "dur": 8, "args": { "External id": 199121, "cbid": 211, "correlation": 199121 } }, { "ph": "s", "id": 199121, "pid": 76337, "tid": -914061504, "ts": 1716454224515461, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224574461, "dur": 19, "args": { "External id": 199132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199132, "pid": 5, "tid": 7, "ts": 1716454224574461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515534, "dur": 12, "args": { "External id": 199132, "cbid": 211, "correlation": 199132 } }, { "ph": "s", "id": 199132, "pid": 76337, "tid": -914061504, "ts": 1716454224515534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224574481, "dur": 16, "args": { "External id": 199154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199154, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199154, "pid": 5, "tid": 7, "ts": 1716454224574481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515566, "dur": 8, "args": { "External id": 199154, "cbid": 211, "correlation": 199154 } }, { "ph": "s", "id": 199154, "pid": 76337, "tid": -914061504, "ts": 1716454224515566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515650, "dur": 2, "args": { "External id": 199165, "cbid": 251, "correlation": 199165 } }, { "ph": "f", "id": 199165, "pid": 76337, "tid": -914061504, "ts": 1716454224515650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224574499, "dur": 89, "args": { "External id": 199166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199166, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199166, "pid": 5, "tid": 7, "ts": 1716454224574499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515656, "dur": 13, "args": { "External id": 199166, "cbid": 211, "correlation": 199166 } }, { "ph": "s", "id": 199166, "pid": 76337, "tid": -914061504, "ts": 1716454224515656, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515726, "dur": 1, "args": { "External id": 199177, "cbid": 251, "correlation": 199177 } }, { "ph": "f", "id": 199177, "pid": 76337, "tid": -914061504, "ts": 1716454224515726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515730, "dur": 0, "args": { "External id": 199178, "cbid": 251, "correlation": 199178 } }, { "ph": "f", "id": 199178, "pid": 76337, "tid": -914061504, "ts": 1716454224515730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224574589, "dur": 13, "args": { "External id": 199179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199179, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199179, "pid": 5, "tid": 7, "ts": 1716454224574589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515732, "dur": 12, "args": { "External id": 199179, "cbid": 211, "correlation": 199179 } }, { "ph": "s", "id": 199179, "pid": 76337, "tid": -914061504, "ts": 1716454224515732, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224574603, "dur": 6, "args": { "External id": 199181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199181, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199181, "pid": 5, "tid": 7, "ts": 1716454224574603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515745, "dur": 6, "args": { "External id": 199181, "cbid": 211, "correlation": 199181 } }, { "ph": "s", "id": 199181, "pid": 76337, "tid": -914061504, "ts": 1716454224515745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515802, "dur": 1, "args": { "External id": 199192, "cbid": 251, "correlation": 199192 } }, { "ph": "f", "id": 199192, "pid": 76337, "tid": -914061504, "ts": 1716454224515802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224515805, "dur": 0, "args": { "External id": 199193, "cbid": 251, "correlation": 199193 } }, { "ph": "f", "id": 199193, "pid": 76337, "tid": -914061504, "ts": 1716454224515805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224574610, "dur": 9, "args": { "External id": 199194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199194, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199194, "pid": 5, "tid": 7, "ts": 1716454224574610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515807, "dur": 12, "args": { "External id": 199194, "cbid": 211, "correlation": 199194 } }, { "ph": "s", "id": 199194, "pid": 76337, "tid": -914061504, "ts": 1716454224515807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224574620, "dur": 4, "args": { "External id": 199196, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199196, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199196, "pid": 5, "tid": 7, "ts": 1716454224574620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515820, "dur": 5, "args": { "External id": 199196, "cbid": 211, "correlation": 199196 } }, { "ph": "s", "id": 199196, "pid": 76337, "tid": -914061504, "ts": 1716454224515820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224574625, "dur": 54, "args": { "External id": 199221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199221, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199221, "pid": 5, "tid": 7, "ts": 1716454224574625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224515897, "dur": 13, "args": { "External id": 199221, "cbid": 211, "correlation": 199221 } }, { "ph": "s", "id": 199221, "pid": 76337, "tid": -914061504, "ts": 1716454224515897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224516003, "dur": 1, "args": { "External id": 199239, "cbid": 251, "correlation": 199239 } }, { "ph": "f", "id": 199239, "pid": 76337, "tid": -914061504, "ts": 1716454224516003, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224574681, "dur": 92, "args": { "External id": 199241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199241, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199241, "pid": 5, "tid": 7, "ts": 1716454224574681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516010, "dur": 15, "args": { "External id": 199241, "cbid": 211, "correlation": 199241 } }, { "ph": "s", "id": 199241, "pid": 76337, "tid": -914061504, "ts": 1716454224516010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224574774, "dur": 9, "args": { "External id": 199249, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199249, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199249, "pid": 5, "tid": 7, "ts": 1716454224574774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516080, "dur": 12, "args": { "External id": 199249, "cbid": 211, "correlation": 199249 } }, { "ph": "s", "id": 199249, "pid": 76337, "tid": -914061504, "ts": 1716454224516080, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224574784, "dur": 21, "args": { "External id": 199257, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199257, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199257, "pid": 5, "tid": 7, "ts": 1716454224574784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516122, "dur": 9, "args": { "External id": 199257, "cbid": 211, "correlation": 199257 } }, { "ph": "s", "id": 199257, "pid": 76337, "tid": -914061504, "ts": 1716454224516122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224574807, "dur": 18, "args": { "External id": 199279, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199279, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199279, "pid": 5, "tid": 7, "ts": 1716454224574807, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516174, "dur": 10, "args": { "External id": 199279, "cbid": 211, "correlation": 199279 } }, { "ph": "s", "id": 199279, "pid": 76337, "tid": -914061504, "ts": 1716454224516174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224516261, "dur": 1, "args": { "External id": 199295, "cbid": 251, "correlation": 199295 } }, { "ph": "f", "id": 199295, "pid": 76337, "tid": -914061504, "ts": 1716454224516261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224516266, "dur": 0, "args": { "External id": 199297, "cbid": 251, "correlation": 199297 } }, { "ph": "f", "id": 199297, "pid": 76337, "tid": -914061504, "ts": 1716454224516266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224574826, "dur": 499, "args": { "External id": 199298, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199298, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199298, "pid": 5, "tid": 7, "ts": 1716454224574826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516268, "dur": 13, "args": { "External id": 199298, "cbid": 211, "correlation": 199298 } }, { "ph": "s", "id": 199298, "pid": 76337, "tid": -914061504, "ts": 1716454224516268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224575326, "dur": 67, "args": { "External id": 199306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199306, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199306, "pid": 5, "tid": 7, "ts": 1716454224575326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516334, "dur": 12, "args": { "External id": 199306, "cbid": 211, "correlation": 199306 } }, { "ph": "s", "id": 199306, "pid": 76337, "tid": -914061504, "ts": 1716454224516334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224575394, "dur": 67, "args": { "External id": 199314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199314, "pid": 5, "tid": 7, "ts": 1716454224575394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516366, "dur": 9, "args": { "External id": 199314, "cbid": 211, "correlation": 199314 } }, { "ph": "s", "id": 199314, "pid": 76337, "tid": -914061504, "ts": 1716454224516366, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224516446, "dur": 1, "args": { "External id": 199330, "cbid": 251, "correlation": 199330 } }, { "ph": "f", "id": 199330, "pid": 76337, "tid": -914061504, "ts": 1716454224516446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224575463, "dur": 1, "args": { "External id": 199332, "device": 5, "context": 1, "stream": 7, "correlation": 199332, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 199332, "pid": 5, "tid": 7, "ts": 1716454224575463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224516451, "dur": 9, "args": { "External id": 199332, "cbid": 51, "correlation": 199332 } }, { "ph": "s", "id": 199332, "pid": 76337, "tid": -914061504, "ts": 1716454224516451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224575467, "dur": 273, "args": { "External id": 199333, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199333, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199333, "pid": 5, "tid": 7, "ts": 1716454224575467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516462, "dur": 12, "args": { "External id": 199333, "cbid": 211, "correlation": 199333 } }, { "ph": "s", "id": 199333, "pid": 76337, "tid": -914061504, "ts": 1716454224516462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224575741, "dur": 14, "args": { "External id": 199341, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199341, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199341, "pid": 5, "tid": 7, "ts": 1716454224575741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516504, "dur": 10, "args": { "External id": 199341, "cbid": 211, "correlation": 199341 } }, { "ph": "s", "id": 199341, "pid": 76337, "tid": -914061504, "ts": 1716454224516504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224575757, "dur": 38, "args": { "External id": 199352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199352, "pid": 5, "tid": 7, "ts": 1716454224575757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516574, "dur": 12, "args": { "External id": 199352, "cbid": 211, "correlation": 199352 } }, { "ph": "s", "id": 199352, "pid": 76337, "tid": -914061504, "ts": 1716454224516574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224516639, "dur": 0, "args": { "External id": 199364, "cbid": 317, "correlation": 199364 } }, { "ph": "f", "id": 199364, "pid": 76337, "tid": -914061504, "ts": 1716454224516639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224516640, "dur": 0, "args": { "External id": 199365, "cbid": 203, "correlation": 199365 } }, { "ph": "f", "id": 199365, "pid": 76337, "tid": -914061504, "ts": 1716454224516640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224516641, "dur": 0, "args": { "External id": 199366, "cbid": 205, "correlation": 199366 } }, { "ph": "f", "id": 199366, "pid": 76337, "tid": -914061504, "ts": 1716454224516641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224575796, "dur": 14, "args": { "External id": 199370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199370, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199370, "pid": 5, "tid": 7, "ts": 1716454224575796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516657, "dur": 13, "args": { "External id": 199370, "cbid": 211, "correlation": 199370 } }, { "ph": "s", "id": 199370, "pid": 76337, "tid": -914061504, "ts": 1716454224516657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224575811, "dur": 4, "args": { "External id": 199372, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199372, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199372, "pid": 5, "tid": 7, "ts": 1716454224575811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516674, "dur": 6, "args": { "External id": 199372, "cbid": 211, "correlation": 199372 } }, { "ph": "s", "id": 199372, "pid": 76337, "tid": -914061504, "ts": 1716454224516674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224516683, "dur": 0, "args": { "External id": 199373, "cbid": 51, "correlation": 199373 } }, { "ph": "s", "id": 199373, "pid": 76337, "tid": -914061504, "ts": 1716454224516683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224575816, "dur": 97, "args": { "External id": 199374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199374, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 199374, "pid": 5, "tid": 7, "ts": 1716454224575816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516684, "dur": 5, "args": { "External id": 199374, "cbid": 211, "correlation": 199374 } }, { "ph": "s", "id": 199374, "pid": 76337, "tid": -914061504, "ts": 1716454224516684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224575914, "dur": 16, "args": { "External id": 199379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199379, "pid": 5, "tid": 7, "ts": 1716454224575914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516710, "dur": 9, "args": { "External id": 199379, "cbid": 211, "correlation": 199379 } }, { "ph": "s", "id": 199379, "pid": 76337, "tid": -914061504, "ts": 1716454224516710, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224575932, "dur": 11, "args": { "External id": 199387, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199387, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199387, "pid": 5, "tid": 7, "ts": 1716454224575932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516742, "dur": 8, "args": { "External id": 199387, "cbid": 211, "correlation": 199387 } }, { "ph": "s", "id": 199387, "pid": 76337, "tid": -914061504, "ts": 1716454224516742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224575945, "dur": 30, "args": { "External id": 199396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199396, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199396, "pid": 5, "tid": 7, "ts": 1716454224575945, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516781, "dur": 10, "args": { "External id": 199396, "cbid": 211, "correlation": 199396 } }, { "ph": "s", "id": 199396, "pid": 76337, "tid": -914061504, "ts": 1716454224516781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224575976, "dur": 31, "args": { "External id": 199416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199416, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 199416, "pid": 5, "tid": 7, "ts": 1716454224575976, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516853, "dur": 11, "args": { "External id": 199416, "cbid": 211, "correlation": 199416 } }, { "ph": "s", "id": 199416, "pid": 76337, "tid": -914061504, "ts": 1716454224516853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224576008, "dur": 5, "args": { "External id": 199428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199428, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199428, "pid": 5, "tid": 7, "ts": 1716454224576008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516880, "dur": 7, "args": { "External id": 199428, "cbid": 211, "correlation": 199428 } }, { "ph": "s", "id": 199428, "pid": 76337, "tid": -914061504, "ts": 1716454224516880, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224576015, "dur": 31, "args": { "External id": 199431, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199431, "pid": 5, "tid": 7, "ts": 1716454224576015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516899, "dur": 7, "args": { "External id": 199431, "cbid": 211, "correlation": 199431 } }, { "ph": "s", "id": 199431, "pid": 76337, "tid": -914061504, "ts": 1716454224516899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224576048, "dur": 21, "args": { "External id": 199440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199440, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199440, "pid": 5, "tid": 7, "ts": 1716454224576048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224516939, "dur": 10, "args": { "External id": 199440, "cbid": 211, "correlation": 199440 } }, { "ph": "s", "id": 199440, "pid": 76337, "tid": -914061504, "ts": 1716454224516939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224516999, "dur": 0, "args": { "External id": 199450, "cbid": 317, "correlation": 199450 } }, { "ph": "f", "id": 199450, "pid": 76337, "tid": -914061504, "ts": 1716454224516999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224516999, "dur": 0, "args": { "External id": 199451, "cbid": 203, "correlation": 199451 } }, { "ph": "f", "id": 199451, "pid": 76337, "tid": -914061504, "ts": 1716454224516999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224517000, "dur": 0, "args": { "External id": 199452, "cbid": 205, "correlation": 199452 } }, { "ph": "f", "id": 199452, "pid": 76337, "tid": -914061504, "ts": 1716454224517000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224576070, "dur": 21, "args": { "External id": 199456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199456, "pid": 5, "tid": 7, "ts": 1716454224576070, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517016, "dur": 12, "args": { "External id": 199456, "cbid": 211, "correlation": 199456 } }, { "ph": "s", "id": 199456, "pid": 76337, "tid": -914061504, "ts": 1716454224517016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224576093, "dur": 319, "args": { "External id": 199458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199458, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199458, "pid": 5, "tid": 7, "ts": 1716454224576093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517031, "dur": 5, "args": { "External id": 199458, "cbid": 211, "correlation": 199458 } }, { "ph": "s", "id": 199458, "pid": 76337, "tid": -914061504, "ts": 1716454224517031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224576414, "dur": 1, "args": { "External id": 199460, "device": 5, "context": 1, "stream": 7, "correlation": 199460, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 199460, "pid": 5, "tid": 7, "ts": 1716454224576414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224517042, "dur": 6, "args": { "External id": 199460, "cbid": 51, "correlation": 199460 } }, { "ph": "s", "id": 199460, "pid": 76337, "tid": -914061504, "ts": 1716454224517042, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224576418, "dur": 1257, "args": { "External id": 199461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199461, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199461, "pid": 5, "tid": 7, "ts": 1716454224576418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517050, "dur": 6, "args": { "External id": 199461, "cbid": 211, "correlation": 199461 } }, { "ph": "s", "id": 199461, "pid": 76337, "tid": -914061504, "ts": 1716454224517050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224577676, "dur": 12, "args": { "External id": 199463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199463, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199463, "pid": 5, "tid": 7, "ts": 1716454224577676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517060, "dur": 6, "args": { "External id": 199463, "cbid": 211, "correlation": 199463 } }, { "ph": "s", "id": 199463, "pid": 76337, "tid": -914061504, "ts": 1716454224517060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224577690, "dur": 15, "args": { "External id": 199469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199469, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199469, "pid": 5, "tid": 7, "ts": 1716454224577690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517092, "dur": 9, "args": { "External id": 199469, "cbid": 211, "correlation": 199469 } }, { "ph": "s", "id": 199469, "pid": 76337, "tid": -914061504, "ts": 1716454224517092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224577706, "dur": 3, "args": { "External id": 199477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199477, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 199477, "pid": 5, "tid": 7, "ts": 1716454224577706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517137, "dur": 10, "args": { "External id": 199477, "cbid": 211, "correlation": 199477 } }, { "ph": "s", "id": 199477, "pid": 76337, "tid": -914061504, "ts": 1716454224517137, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224517202, "dur": 1, "args": { "External id": 199493, "cbid": 251, "correlation": 199493 } }, { "ph": "f", "id": 199493, "pid": 76337, "tid": -914061504, "ts": 1716454224517202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224517207, "dur": 0, "args": { "External id": 199495, "cbid": 251, "correlation": 199495 } }, { "ph": "f", "id": 199495, "pid": 76337, "tid": -914061504, "ts": 1716454224517207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224577711, "dur": 13, "args": { "External id": 199496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199496, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199496, "pid": 5, "tid": 7, "ts": 1716454224577711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517209, "dur": 12, "args": { "External id": 199496, "cbid": 211, "correlation": 199496 } }, { "ph": "s", "id": 199496, "pid": 76337, "tid": -914061504, "ts": 1716454224517209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224577725, "dur": 5, "args": { "External id": 199498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199498, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199498, "pid": 5, "tid": 7, "ts": 1716454224577725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517222, "dur": 6, "args": { "External id": 199498, "cbid": 211, "correlation": 199498 } }, { "ph": "s", "id": 199498, "pid": 76337, "tid": -914061504, "ts": 1716454224517222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224577732, "dur": 17, "args": { "External id": 199508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199508, "pid": 5, "tid": 7, "ts": 1716454224577732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517281, "dur": 12, "args": { "External id": 199508, "cbid": 211, "correlation": 199508 } }, { "ph": "s", "id": 199508, "pid": 76337, "tid": -914061504, "ts": 1716454224517281, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224577750, "dur": 17, "args": { "External id": 199528, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199528, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 199528, "pid": 5, "tid": 7, "ts": 1716454224577750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517347, "dur": 10, "args": { "External id": 199528, "cbid": 211, "correlation": 199528 } }, { "ph": "s", "id": 199528, "pid": 76337, "tid": -914061504, "ts": 1716454224517347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224577769, "dur": 4, "args": { "External id": 199540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199540, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 199540, "pid": 5, "tid": 7, "ts": 1716454224577769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517367, "dur": 6, "args": { "External id": 199540, "cbid": 211, "correlation": 199540 } }, { "ph": "s", "id": 199540, "pid": 76337, "tid": -914061504, "ts": 1716454224517367, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224577774, "dur": 17, "args": { "External id": 199543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199543, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199543, "pid": 5, "tid": 7, "ts": 1716454224577774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517387, "dur": 6, "args": { "External id": 199543, "cbid": 211, "correlation": 199543 } }, { "ph": "s", "id": 199543, "pid": 76337, "tid": -914061504, "ts": 1716454224517387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224577792, "dur": 11, "args": { "External id": 199552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199552, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199552, "pid": 5, "tid": 7, "ts": 1716454224577792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517426, "dur": 10, "args": { "External id": 199552, "cbid": 211, "correlation": 199552 } }, { "ph": "s", "id": 199552, "pid": 76337, "tid": -914061504, "ts": 1716454224517426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224517489, "dur": 0, "args": { "External id": 199562, "cbid": 317, "correlation": 199562 } }, { "ph": "f", "id": 199562, "pid": 76337, "tid": -914061504, "ts": 1716454224517489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224517490, "dur": 0, "args": { "External id": 199563, "cbid": 203, "correlation": 199563 } }, { "ph": "f", "id": 199563, "pid": 76337, "tid": -914061504, "ts": 1716454224517490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224517490, "dur": 0, "args": { "External id": 199564, "cbid": 205, "correlation": 199564 } }, { "ph": "f", "id": 199564, "pid": 76337, "tid": -914061504, "ts": 1716454224517490, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224577804, "dur": 12, "args": { "External id": 199568, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199568, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199568, "pid": 5, "tid": 7, "ts": 1716454224577804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517505, "dur": 12, "args": { "External id": 199568, "cbid": 211, "correlation": 199568 } }, { "ph": "s", "id": 199568, "pid": 76337, "tid": -914061504, "ts": 1716454224517505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224577817, "dur": 162, "args": { "External id": 199570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199570, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199570, "pid": 5, "tid": 7, "ts": 1716454224577817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517518, "dur": 5, "args": { "External id": 199570, "cbid": 211, "correlation": 199570 } }, { "ph": "s", "id": 199570, "pid": 76337, "tid": -914061504, "ts": 1716454224517518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224577981, "dur": 1, "args": { "External id": 199572, "device": 5, "context": 1, "stream": 7, "correlation": 199572, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 199572, "pid": 5, "tid": 7, "ts": 1716454224577981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224517530, "dur": 6, "args": { "External id": 199572, "cbid": 51, "correlation": 199572 } }, { "ph": "s", "id": 199572, "pid": 76337, "tid": -914061504, "ts": 1716454224517530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224577985, "dur": 650, "args": { "External id": 199573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199573, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199573, "pid": 5, "tid": 7, "ts": 1716454224577985, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517537, "dur": 6, "args": { "External id": 199573, "cbid": 211, "correlation": 199573 } }, { "ph": "s", "id": 199573, "pid": 76337, "tid": -914061504, "ts": 1716454224517537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224578636, "dur": 12, "args": { "External id": 199575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199575, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199575, "pid": 5, "tid": 7, "ts": 1716454224578636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517548, "dur": 5, "args": { "External id": 199575, "cbid": 211, "correlation": 199575 } }, { "ph": "s", "id": 199575, "pid": 76337, "tid": -914061504, "ts": 1716454224517548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224578649, "dur": 15, "args": { "External id": 199581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199581, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199581, "pid": 5, "tid": 7, "ts": 1716454224578649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517576, "dur": 9, "args": { "External id": 199581, "cbid": 211, "correlation": 199581 } }, { "ph": "s", "id": 199581, "pid": 76337, "tid": -914061504, "ts": 1716454224517576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224517634, "dur": 0, "args": { "External id": 199591, "cbid": 317, "correlation": 199591 } }, { "ph": "f", "id": 199591, "pid": 76337, "tid": -914061504, "ts": 1716454224517634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224517635, "dur": 0, "args": { "External id": 199592, "cbid": 203, "correlation": 199592 } }, { "ph": "f", "id": 199592, "pid": 76337, "tid": -914061504, "ts": 1716454224517635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224517635, "dur": 0, "args": { "External id": 199593, "cbid": 205, "correlation": 199593 } }, { "ph": "f", "id": 199593, "pid": 76337, "tid": -914061504, "ts": 1716454224517635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224578665, "dur": 22, "args": { "External id": 199597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199597, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199597, "pid": 5, "tid": 7, "ts": 1716454224578665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517647, "dur": 11, "args": { "External id": 199597, "cbid": 211, "correlation": 199597 } }, { "ph": "s", "id": 199597, "pid": 76337, "tid": -914061504, "ts": 1716454224517647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224578689, "dur": 4, "args": { "External id": 199599, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199599, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199599, "pid": 5, "tid": 7, "ts": 1716454224578689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517662, "dur": 6, "args": { "External id": 199599, "cbid": 211, "correlation": 199599 } }, { "ph": "s", "id": 199599, "pid": 76337, "tid": -914061504, "ts": 1716454224517662, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224517670, "dur": 0, "args": { "External id": 199600, "cbid": 51, "correlation": 199600 } }, { "ph": "s", "id": 199600, "pid": 76337, "tid": -914061504, "ts": 1716454224517670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224578694, "dur": 173, "args": { "External id": 199601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199601, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 199601, "pid": 5, "tid": 7, "ts": 1716454224578694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517671, "dur": 5, "args": { "External id": 199601, "cbid": 211, "correlation": 199601 } }, { "ph": "s", "id": 199601, "pid": 76337, "tid": -914061504, "ts": 1716454224517671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224578869, "dur": 16, "args": { "External id": 199606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199606, "pid": 5, "tid": 7, "ts": 1716454224578869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517697, "dur": 8, "args": { "External id": 199606, "cbid": 211, "correlation": 199606 } }, { "ph": "s", "id": 199606, "pid": 76337, "tid": -914061504, "ts": 1716454224517697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224578886, "dur": 12, "args": { "External id": 199614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199614, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199614, "pid": 5, "tid": 7, "ts": 1716454224578886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517725, "dur": 8, "args": { "External id": 199614, "cbid": 211, "correlation": 199614 } }, { "ph": "s", "id": 199614, "pid": 76337, "tid": -914061504, "ts": 1716454224517725, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224578900, "dur": 10, "args": { "External id": 199622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199622, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199622, "pid": 5, "tid": 7, "ts": 1716454224578900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517754, "dur": 8, "args": { "External id": 199622, "cbid": 211, "correlation": 199622 } }, { "ph": "s", "id": 199622, "pid": 76337, "tid": -914061504, "ts": 1716454224517754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224578911, "dur": 18, "args": { "External id": 199642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199642, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 199642, "pid": 5, "tid": 7, "ts": 1716454224578911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517835, "dur": 13, "args": { "External id": 199642, "cbid": 211, "correlation": 199642 } }, { "ph": "s", "id": 199642, "pid": 76337, "tid": -914061504, "ts": 1716454224517835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224578930, "dur": 5, "args": { "External id": 199654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199654, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 199654, "pid": 5, "tid": 7, "ts": 1716454224578930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517858, "dur": 6, "args": { "External id": 199654, "cbid": 211, "correlation": 199654 } }, { "ph": "s", "id": 199654, "pid": 76337, "tid": -914061504, "ts": 1716454224517858, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224578936, "dur": 16, "args": { "External id": 199657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199657, "pid": 5, "tid": 7, "ts": 1716454224578936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517876, "dur": 6, "args": { "External id": 199657, "cbid": 211, "correlation": 199657 } }, { "ph": "s", "id": 199657, "pid": 76337, "tid": -914061504, "ts": 1716454224517876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224517933, "dur": 0, "args": { "External id": 199668, "cbid": 317, "correlation": 199668 } }, { "ph": "f", "id": 199668, "pid": 76337, "tid": -914061504, "ts": 1716454224517933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224517934, "dur": 0, "args": { "External id": 199669, "cbid": 203, "correlation": 199669 } }, { "ph": "f", "id": 199669, "pid": 76337, "tid": -914061504, "ts": 1716454224517934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224517935, "dur": 0, "args": { "External id": 199670, "cbid": 205, "correlation": 199670 } }, { "ph": "f", "id": 199670, "pid": 76337, "tid": -914061504, "ts": 1716454224517935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224578954, "dur": 11, "args": { "External id": 199674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199674, "pid": 5, "tid": 7, "ts": 1716454224578954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517949, "dur": 11, "args": { "External id": 199674, "cbid": 211, "correlation": 199674 } }, { "ph": "s", "id": 199674, "pid": 76337, "tid": -914061504, "ts": 1716454224517949, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224578966, "dur": 3, "args": { "External id": 199676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199676, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199676, "pid": 5, "tid": 7, "ts": 1716454224578966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517964, "dur": 6, "args": { "External id": 199676, "cbid": 211, "correlation": 199676 } }, { "ph": "s", "id": 199676, "pid": 76337, "tid": -914061504, "ts": 1716454224517964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224517972, "dur": 7, "args": { "External id": 199677, "cbid": 51, "correlation": 199677 } }, { "ph": "s", "id": 199677, "pid": 76337, "tid": -914061504, "ts": 1716454224517972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224578971, "dur": 91, "args": { "External id": 199678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199678, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 199678, "pid": 5, "tid": 7, "ts": 1716454224578971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224517980, "dur": 6, "args": { "External id": 199678, "cbid": 211, "correlation": 199678 } }, { "ph": "s", "id": 199678, "pid": 76337, "tid": -914061504, "ts": 1716454224517980, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224579063, "dur": 16, "args": { "External id": 199683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199683, "pid": 5, "tid": 7, "ts": 1716454224579063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518009, "dur": 8, "args": { "External id": 199683, "cbid": 211, "correlation": 199683 } }, { "ph": "s", "id": 199683, "pid": 76337, "tid": -914061504, "ts": 1716454224518009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224579081, "dur": 82, "args": { "External id": 199692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199692, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199692, "pid": 5, "tid": 7, "ts": 1716454224579081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518090, "dur": 15, "args": { "External id": 199692, "cbid": 211, "correlation": 199692 } }, { "ph": "s", "id": 199692, "pid": 76337, "tid": -914061504, "ts": 1716454224518090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224579164, "dur": 30, "args": { "External id": 199714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199714, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199714, "pid": 5, "tid": 7, "ts": 1716454224579164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518151, "dur": 10, "args": { "External id": 199714, "cbid": 211, "correlation": 199714 } }, { "ph": "s", "id": 199714, "pid": 76337, "tid": -914061504, "ts": 1716454224518151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518240, "dur": 1, "args": { "External id": 199725, "cbid": 251, "correlation": 199725 } }, { "ph": "f", "id": 199725, "pid": 76337, "tid": -914061504, "ts": 1716454224518240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224579196, "dur": 164, "args": { "External id": 199726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199726, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199726, "pid": 5, "tid": 7, "ts": 1716454224579196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518246, "dur": 13, "args": { "External id": 199726, "cbid": 211, "correlation": 199726 } }, { "ph": "s", "id": 199726, "pid": 76337, "tid": -914061504, "ts": 1716454224518246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518316, "dur": 1, "args": { "External id": 199737, "cbid": 251, "correlation": 199737 } }, { "ph": "f", "id": 199737, "pid": 76337, "tid": -914061504, "ts": 1716454224518316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224579361, "dur": 157, "args": { "External id": 199738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199738, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199738, "pid": 5, "tid": 7, "ts": 1716454224579361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518320, "dur": 11, "args": { "External id": 199738, "cbid": 211, "correlation": 199738 } }, { "ph": "s", "id": 199738, "pid": 76337, "tid": -914061504, "ts": 1716454224518320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518385, "dur": 1, "args": { "External id": 199749, "cbid": 251, "correlation": 199749 } }, { "ph": "f", "id": 199749, "pid": 76337, "tid": -914061504, "ts": 1716454224518385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224579520, "dur": 159, "args": { "External id": 199750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199750, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199750, "pid": 5, "tid": 7, "ts": 1716454224579520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518389, "dur": 12, "args": { "External id": 199750, "cbid": 211, "correlation": 199750 } }, { "ph": "s", "id": 199750, "pid": 76337, "tid": -914061504, "ts": 1716454224518389, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224579680, "dur": 334, "args": { "External id": 199775, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199775, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199775, "pid": 5, "tid": 7, "ts": 1716454224579680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518473, "dur": 13, "args": { "External id": 199775, "cbid": 211, "correlation": 199775 } }, { "ph": "s", "id": 199775, "pid": 76337, "tid": -914061504, "ts": 1716454224518473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518573, "dur": 1, "args": { "External id": 199793, "cbid": 251, "correlation": 199793 } }, { "ph": "f", "id": 199793, "pid": 76337, "tid": -914061504, "ts": 1716454224518573, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224580016, "dur": 167, "args": { "External id": 199795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199795, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199795, "pid": 5, "tid": 7, "ts": 1716454224580016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518578, "dur": 13, "args": { "External id": 199795, "cbid": 211, "correlation": 199795 } }, { "ph": "s", "id": 199795, "pid": 76337, "tid": -914061504, "ts": 1716454224518578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224580184, "dur": 19, "args": { "External id": 199803, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199803, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199803, "pid": 5, "tid": 7, "ts": 1716454224580184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518647, "dur": 13, "args": { "External id": 199803, "cbid": 211, "correlation": 199803 } }, { "ph": "s", "id": 199803, "pid": 76337, "tid": -914061504, "ts": 1716454224518647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224580205, "dur": 27, "args": { "External id": 199811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199811, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199811, "pid": 5, "tid": 7, "ts": 1716454224580205, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518686, "dur": 8, "args": { "External id": 199811, "cbid": 211, "correlation": 199811 } }, { "ph": "s", "id": 199811, "pid": 76337, "tid": -914061504, "ts": 1716454224518686, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224580234, "dur": 18, "args": { "External id": 199822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199822, "pid": 5, "tid": 7, "ts": 1716454224580234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518758, "dur": 12, "args": { "External id": 199822, "cbid": 211, "correlation": 199822 } }, { "ph": "s", "id": 199822, "pid": 76337, "tid": -914061504, "ts": 1716454224518758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224580253, "dur": 16, "args": { "External id": 199844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199844, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199844, "pid": 5, "tid": 7, "ts": 1716454224580253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518788, "dur": 8, "args": { "External id": 199844, "cbid": 211, "correlation": 199844 } }, { "ph": "s", "id": 199844, "pid": 76337, "tid": -914061504, "ts": 1716454224518788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518873, "dur": 1, "args": { "External id": 199855, "cbid": 251, "correlation": 199855 } }, { "ph": "f", "id": 199855, "pid": 76337, "tid": -914061504, "ts": 1716454224518873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224580271, "dur": 89, "args": { "External id": 199856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199856, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199856, "pid": 5, "tid": 7, "ts": 1716454224580271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518879, "dur": 13, "args": { "External id": 199856, "cbid": 211, "correlation": 199856 } }, { "ph": "s", "id": 199856, "pid": 76337, "tid": -914061504, "ts": 1716454224518879, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518947, "dur": 1, "args": { "External id": 199867, "cbid": 251, "correlation": 199867 } }, { "ph": "f", "id": 199867, "pid": 76337, "tid": -914061504, "ts": 1716454224518947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224518950, "dur": 0, "args": { "External id": 199868, "cbid": 251, "correlation": 199868 } }, { "ph": "f", "id": 199868, "pid": 76337, "tid": -914061504, "ts": 1716454224518950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224580362, "dur": 13, "args": { "External id": 199869, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199869, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199869, "pid": 5, "tid": 7, "ts": 1716454224580362, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518952, "dur": 12, "args": { "External id": 199869, "cbid": 211, "correlation": 199869 } }, { "ph": "s", "id": 199869, "pid": 76337, "tid": -914061504, "ts": 1716454224518952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224580376, "dur": 6, "args": { "External id": 199871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199871, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199871, "pid": 5, "tid": 7, "ts": 1716454224580376, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224518966, "dur": 6, "args": { "External id": 199871, "cbid": 211, "correlation": 199871 } }, { "ph": "s", "id": 199871, "pid": 76337, "tid": -914061504, "ts": 1716454224518966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519034, "dur": 1, "args": { "External id": 199882, "cbid": 251, "correlation": 199882 } }, { "ph": "f", "id": 199882, "pid": 76337, "tid": -914061504, "ts": 1716454224519034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519038, "dur": 0, "args": { "External id": 199883, "cbid": 251, "correlation": 199883 } }, { "ph": "f", "id": 199883, "pid": 76337, "tid": -914061504, "ts": 1716454224519038, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224580382, "dur": 8, "args": { "External id": 199884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199884, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199884, "pid": 5, "tid": 7, "ts": 1716454224580382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519039, "dur": 12, "args": { "External id": 199884, "cbid": 211, "correlation": 199884 } }, { "ph": "s", "id": 199884, "pid": 76337, "tid": -914061504, "ts": 1716454224519039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224580392, "dur": 4, "args": { "External id": 199886, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199886, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199886, "pid": 5, "tid": 7, "ts": 1716454224580392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519053, "dur": 6, "args": { "External id": 199886, "cbid": 211, "correlation": 199886 } }, { "ph": "s", "id": 199886, "pid": 76337, "tid": -914061504, "ts": 1716454224519053, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224580397, "dur": 55, "args": { "External id": 199911, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199911, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199911, "pid": 5, "tid": 7, "ts": 1716454224580397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519130, "dur": 12, "args": { "External id": 199911, "cbid": 211, "correlation": 199911 } }, { "ph": "s", "id": 199911, "pid": 76337, "tid": -914061504, "ts": 1716454224519130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519228, "dur": 1, "args": { "External id": 199929, "cbid": 251, "correlation": 199929 } }, { "ph": "f", "id": 199929, "pid": 76337, "tid": -914061504, "ts": 1716454224519228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224580453, "dur": 91, "args": { "External id": 199931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199931, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 199931, "pid": 5, "tid": 7, "ts": 1716454224580453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519234, "dur": 14, "args": { "External id": 199931, "cbid": 211, "correlation": 199931 } }, { "ph": "s", "id": 199931, "pid": 76337, "tid": -914061504, "ts": 1716454224519234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224580546, "dur": 10, "args": { "External id": 199939, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199939, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199939, "pid": 5, "tid": 7, "ts": 1716454224580546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519303, "dur": 13, "args": { "External id": 199939, "cbid": 211, "correlation": 199939 } }, { "ph": "s", "id": 199939, "pid": 76337, "tid": -914061504, "ts": 1716454224519303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224580557, "dur": 21, "args": { "External id": 199947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199947, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199947, "pid": 5, "tid": 7, "ts": 1716454224580557, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519344, "dur": 9, "args": { "External id": 199947, "cbid": 211, "correlation": 199947 } }, { "ph": "s", "id": 199947, "pid": 76337, "tid": -914061504, "ts": 1716454224519344, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224580580, "dur": 18, "args": { "External id": 199969, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199969, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199969, "pid": 5, "tid": 7, "ts": 1716454224580580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519397, "dur": 10, "args": { "External id": 199969, "cbid": 211, "correlation": 199969 } }, { "ph": "s", "id": 199969, "pid": 76337, "tid": -914061504, "ts": 1716454224519397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519484, "dur": 1, "args": { "External id": 199985, "cbid": 251, "correlation": 199985 } }, { "ph": "f", "id": 199985, "pid": 76337, "tid": -914061504, "ts": 1716454224519484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519489, "dur": 0, "args": { "External id": 199987, "cbid": 251, "correlation": 199987 } }, { "ph": "f", "id": 199987, "pid": 76337, "tid": -914061504, "ts": 1716454224519489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224580599, "dur": 498, "args": { "External id": 199988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199988, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 199988, "pid": 5, "tid": 7, "ts": 1716454224580599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519491, "dur": 12, "args": { "External id": 199988, "cbid": 211, "correlation": 199988 } }, { "ph": "s", "id": 199988, "pid": 76337, "tid": -914061504, "ts": 1716454224519491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224581098, "dur": 67, "args": { "External id": 199996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 199996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 199996, "pid": 5, "tid": 7, "ts": 1716454224581098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519555, "dur": 13, "args": { "External id": 199996, "cbid": 211, "correlation": 199996 } }, { "ph": "s", "id": 199996, "pid": 76337, "tid": -914061504, "ts": 1716454224519555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224581166, "dur": 68, "args": { "External id": 200004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200004, "pid": 5, "tid": 7, "ts": 1716454224581166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519587, "dur": 8, "args": { "External id": 200004, "cbid": 211, "correlation": 200004 } }, { "ph": "s", "id": 200004, "pid": 76337, "tid": -914061504, "ts": 1716454224519587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224519668, "dur": 1, "args": { "External id": 200020, "cbid": 251, "correlation": 200020 } }, { "ph": "f", "id": 200020, "pid": 76337, "tid": -914061504, "ts": 1716454224519668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224581236, "dur": 1, "args": { "External id": 200022, "device": 5, "context": 1, "stream": 7, "correlation": 200022, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 200022, "pid": 5, "tid": 7, "ts": 1716454224581236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224519673, "dur": 9, "args": { "External id": 200022, "cbid": 51, "correlation": 200022 } }, { "ph": "s", "id": 200022, "pid": 76337, "tid": -914061504, "ts": 1716454224519673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224581240, "dur": 270, "args": { "External id": 200023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200023, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200023, "pid": 5, "tid": 7, "ts": 1716454224581240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519684, "dur": 11, "args": { "External id": 200023, "cbid": 211, "correlation": 200023 } }, { "ph": "s", "id": 200023, "pid": 76337, "tid": -914061504, "ts": 1716454224519684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224581511, "dur": 14, "args": { "External id": 200031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200031, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200031, "pid": 5, "tid": 7, "ts": 1716454224581511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519727, "dur": 10, "args": { "External id": 200031, "cbid": 211, "correlation": 200031 } }, { "ph": "s", "id": 200031, "pid": 76337, "tid": -914061504, "ts": 1716454224519727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224581526, "dur": 37, "args": { "External id": 200042, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200042, "pid": 5, "tid": 7, "ts": 1716454224581526, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519794, "dur": 12, "args": { "External id": 200042, "cbid": 211, "correlation": 200042 } }, { "ph": "s", "id": 200042, "pid": 76337, "tid": -914061504, "ts": 1716454224519794, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224519858, "dur": 0, "args": { "External id": 200054, "cbid": 317, "correlation": 200054 } }, { "ph": "f", "id": 200054, "pid": 76337, "tid": -914061504, "ts": 1716454224519858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224519859, "dur": 0, "args": { "External id": 200055, "cbid": 203, "correlation": 200055 } }, { "ph": "f", "id": 200055, "pid": 76337, "tid": -914061504, "ts": 1716454224519859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224519859, "dur": 0, "args": { "External id": 200056, "cbid": 205, "correlation": 200056 } }, { "ph": "f", "id": 200056, "pid": 76337, "tid": -914061504, "ts": 1716454224519859, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224581565, "dur": 14, "args": { "External id": 200060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200060, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200060, "pid": 5, "tid": 7, "ts": 1716454224581565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519875, "dur": 12, "args": { "External id": 200060, "cbid": 211, "correlation": 200060 } }, { "ph": "s", "id": 200060, "pid": 76337, "tid": -914061504, "ts": 1716454224519875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224581580, "dur": 4, "args": { "External id": 200062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200062, "pid": 5, "tid": 7, "ts": 1716454224581580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519892, "dur": 6, "args": { "External id": 200062, "cbid": 211, "correlation": 200062 } }, { "ph": "s", "id": 200062, "pid": 76337, "tid": -914061504, "ts": 1716454224519892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224519900, "dur": 0, "args": { "External id": 200063, "cbid": 51, "correlation": 200063 } }, { "ph": "s", "id": 200063, "pid": 76337, "tid": -914061504, "ts": 1716454224519900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224581586, "dur": 97, "args": { "External id": 200064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200064, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200064, "pid": 5, "tid": 7, "ts": 1716454224581586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519901, "dur": 5, "args": { "External id": 200064, "cbid": 211, "correlation": 200064 } }, { "ph": "s", "id": 200064, "pid": 76337, "tid": -914061504, "ts": 1716454224519901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224581684, "dur": 17, "args": { "External id": 200069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200069, "pid": 5, "tid": 7, "ts": 1716454224581684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519928, "dur": 8, "args": { "External id": 200069, "cbid": 211, "correlation": 200069 } }, { "ph": "s", "id": 200069, "pid": 76337, "tid": -914061504, "ts": 1716454224519928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224581702, "dur": 13, "args": { "External id": 200077, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200077, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200077, "pid": 5, "tid": 7, "ts": 1716454224581702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224519959, "dur": 9, "args": { "External id": 200077, "cbid": 211, "correlation": 200077 } }, { "ph": "s", "id": 200077, "pid": 76337, "tid": -914061504, "ts": 1716454224519959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224581716, "dur": 24, "args": { "External id": 200086, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200086, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200086, "pid": 5, "tid": 7, "ts": 1716454224581716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520007, "dur": 12, "args": { "External id": 200086, "cbid": 211, "correlation": 200086 } }, { "ph": "s", "id": 200086, "pid": 76337, "tid": -914061504, "ts": 1716454224520007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224581742, "dur": 24, "args": { "External id": 200106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200106, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 200106, "pid": 5, "tid": 7, "ts": 1716454224581742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520082, "dur": 12, "args": { "External id": 200106, "cbid": 211, "correlation": 200106 } }, { "ph": "s", "id": 200106, "pid": 76337, "tid": -914061504, "ts": 1716454224520082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224581767, "dur": 5, "args": { "External id": 200118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200118, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200118, "pid": 5, "tid": 7, "ts": 1716454224581767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520104, "dur": 6, "args": { "External id": 200118, "cbid": 211, "correlation": 200118 } }, { "ph": "s", "id": 200118, "pid": 76337, "tid": -914061504, "ts": 1716454224520104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224581773, "dur": 26, "args": { "External id": 200121, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200121, "pid": 5, "tid": 7, "ts": 1716454224581773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520123, "dur": 7, "args": { "External id": 200121, "cbid": 211, "correlation": 200121 } }, { "ph": "s", "id": 200121, "pid": 76337, "tid": -914061504, "ts": 1716454224520123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224581800, "dur": 17, "args": { "External id": 200130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200130, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200130, "pid": 5, "tid": 7, "ts": 1716454224581800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520162, "dur": 10, "args": { "External id": 200130, "cbid": 211, "correlation": 200130 } }, { "ph": "s", "id": 200130, "pid": 76337, "tid": -914061504, "ts": 1716454224520162, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224520214, "dur": 0, "args": { "External id": 200140, "cbid": 317, "correlation": 200140 } }, { "ph": "f", "id": 200140, "pid": 76337, "tid": -914061504, "ts": 1716454224520214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224520215, "dur": 0, "args": { "External id": 200141, "cbid": 203, "correlation": 200141 } }, { "ph": "f", "id": 200141, "pid": 76337, "tid": -914061504, "ts": 1716454224520215, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224520216, "dur": 0, "args": { "External id": 200142, "cbid": 205, "correlation": 200142 } }, { "ph": "f", "id": 200142, "pid": 76337, "tid": -914061504, "ts": 1716454224520216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224581818, "dur": 18, "args": { "External id": 200146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200146, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200146, "pid": 5, "tid": 7, "ts": 1716454224581818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520231, "dur": 11, "args": { "External id": 200146, "cbid": 211, "correlation": 200146 } }, { "ph": "s", "id": 200146, "pid": 76337, "tid": -914061504, "ts": 1716454224520231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224581837, "dur": 240, "args": { "External id": 200148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200148, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200148, "pid": 5, "tid": 7, "ts": 1716454224581837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520245, "dur": 5, "args": { "External id": 200148, "cbid": 211, "correlation": 200148 } }, { "ph": "s", "id": 200148, "pid": 76337, "tid": -914061504, "ts": 1716454224520245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224582080, "dur": 1, "args": { "External id": 200150, "device": 5, "context": 1, "stream": 7, "correlation": 200150, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 200150, "pid": 5, "tid": 7, "ts": 1716454224582080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224520256, "dur": 9, "args": { "External id": 200150, "cbid": 51, "correlation": 200150 } }, { "ph": "s", "id": 200150, "pid": 76337, "tid": -914061504, "ts": 1716454224520256, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224582084, "dur": 815, "args": { "External id": 200151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200151, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200151, "pid": 5, "tid": 7, "ts": 1716454224582084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520266, "dur": 6, "args": { "External id": 200151, "cbid": 211, "correlation": 200151 } }, { "ph": "s", "id": 200151, "pid": 76337, "tid": -914061504, "ts": 1716454224520266, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224582900, "dur": 14, "args": { "External id": 200153, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200153, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200153, "pid": 5, "tid": 7, "ts": 1716454224582900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520277, "dur": 5, "args": { "External id": 200153, "cbid": 211, "correlation": 200153 } }, { "ph": "s", "id": 200153, "pid": 76337, "tid": -914061504, "ts": 1716454224520277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224582915, "dur": 15, "args": { "External id": 200159, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200159, "pid": 5, "tid": 7, "ts": 1716454224582915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520304, "dur": 8, "args": { "External id": 200159, "cbid": 211, "correlation": 200159 } }, { "ph": "s", "id": 200159, "pid": 76337, "tid": -914061504, "ts": 1716454224520304, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224582931, "dur": 4, "args": { "External id": 200167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200167, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 200167, "pid": 5, "tid": 7, "ts": 1716454224582931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520348, "dur": 9, "args": { "External id": 200167, "cbid": 211, "correlation": 200167 } }, { "ph": "s", "id": 200167, "pid": 76337, "tid": -914061504, "ts": 1716454224520348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224520417, "dur": 1, "args": { "External id": 200183, "cbid": 251, "correlation": 200183 } }, { "ph": "f", "id": 200183, "pid": 76337, "tid": -914061504, "ts": 1716454224520417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224520422, "dur": 0, "args": { "External id": 200185, "cbid": 251, "correlation": 200185 } }, { "ph": "f", "id": 200185, "pid": 76337, "tid": -914061504, "ts": 1716454224520422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224582936, "dur": 13, "args": { "External id": 200186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200186, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200186, "pid": 5, "tid": 7, "ts": 1716454224582936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520424, "dur": 11, "args": { "External id": 200186, "cbid": 211, "correlation": 200186 } }, { "ph": "s", "id": 200186, "pid": 76337, "tid": -914061504, "ts": 1716454224520424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224582951, "dur": 5, "args": { "External id": 200188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200188, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200188, "pid": 5, "tid": 7, "ts": 1716454224582951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520437, "dur": 5, "args": { "External id": 200188, "cbid": 211, "correlation": 200188 } }, { "ph": "s", "id": 200188, "pid": 76337, "tid": -914061504, "ts": 1716454224520437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224582957, "dur": 17, "args": { "External id": 200198, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200198, "pid": 5, "tid": 7, "ts": 1716454224582957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520495, "dur": 12, "args": { "External id": 200198, "cbid": 211, "correlation": 200198 } }, { "ph": "s", "id": 200198, "pid": 76337, "tid": -914061504, "ts": 1716454224520495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224582975, "dur": 17, "args": { "External id": 200218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200218, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 200218, "pid": 5, "tid": 7, "ts": 1716454224582975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520561, "dur": 11, "args": { "External id": 200218, "cbid": 211, "correlation": 200218 } }, { "ph": "s", "id": 200218, "pid": 76337, "tid": -914061504, "ts": 1716454224520561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224582994, "dur": 4, "args": { "External id": 200230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200230, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 200230, "pid": 5, "tid": 7, "ts": 1716454224582994, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520582, "dur": 6, "args": { "External id": 200230, "cbid": 211, "correlation": 200230 } }, { "ph": "s", "id": 200230, "pid": 76337, "tid": -914061504, "ts": 1716454224520582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224582999, "dur": 17, "args": { "External id": 200233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200233, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200233, "pid": 5, "tid": 7, "ts": 1716454224582999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520601, "dur": 7, "args": { "External id": 200233, "cbid": 211, "correlation": 200233 } }, { "ph": "s", "id": 200233, "pid": 76337, "tid": -914061504, "ts": 1716454224520601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224583017, "dur": 11, "args": { "External id": 200242, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200242, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200242, "pid": 5, "tid": 7, "ts": 1716454224583017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520642, "dur": 9, "args": { "External id": 200242, "cbid": 211, "correlation": 200242 } }, { "ph": "s", "id": 200242, "pid": 76337, "tid": -914061504, "ts": 1716454224520642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224520704, "dur": 0, "args": { "External id": 200252, "cbid": 317, "correlation": 200252 } }, { "ph": "f", "id": 200252, "pid": 76337, "tid": -914061504, "ts": 1716454224520704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224520705, "dur": 0, "args": { "External id": 200253, "cbid": 203, "correlation": 200253 } }, { "ph": "f", "id": 200253, "pid": 76337, "tid": -914061504, "ts": 1716454224520705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224520705, "dur": 0, "args": { "External id": 200254, "cbid": 205, "correlation": 200254 } }, { "ph": "f", "id": 200254, "pid": 76337, "tid": -914061504, "ts": 1716454224520705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224583028, "dur": 11, "args": { "External id": 200258, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200258, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200258, "pid": 5, "tid": 7, "ts": 1716454224583028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520719, "dur": 12, "args": { "External id": 200258, "cbid": 211, "correlation": 200258 } }, { "ph": "s", "id": 200258, "pid": 76337, "tid": -914061504, "ts": 1716454224520719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224583041, "dur": 163, "args": { "External id": 200260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200260, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200260, "pid": 5, "tid": 7, "ts": 1716454224583041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520734, "dur": 5, "args": { "External id": 200260, "cbid": 211, "correlation": 200260 } }, { "ph": "s", "id": 200260, "pid": 76337, "tid": -914061504, "ts": 1716454224520734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224583206, "dur": 1, "args": { "External id": 200262, "device": 5, "context": 1, "stream": 7, "correlation": 200262, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 200262, "pid": 5, "tid": 7, "ts": 1716454224583206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224520745, "dur": 6, "args": { "External id": 200262, "cbid": 51, "correlation": 200262 } }, { "ph": "s", "id": 200262, "pid": 76337, "tid": -914061504, "ts": 1716454224520745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224583210, "dur": 650, "args": { "External id": 200263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200263, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200263, "pid": 5, "tid": 7, "ts": 1716454224583210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520752, "dur": 6, "args": { "External id": 200263, "cbid": 211, "correlation": 200263 } }, { "ph": "s", "id": 200263, "pid": 76337, "tid": -914061504, "ts": 1716454224520752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224583861, "dur": 14, "args": { "External id": 200265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200265, "pid": 5, "tid": 7, "ts": 1716454224583861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520763, "dur": 5, "args": { "External id": 200265, "cbid": 211, "correlation": 200265 } }, { "ph": "s", "id": 200265, "pid": 76337, "tid": -914061504, "ts": 1716454224520763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224583876, "dur": 15, "args": { "External id": 200271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200271, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200271, "pid": 5, "tid": 7, "ts": 1716454224583876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520792, "dur": 8, "args": { "External id": 200271, "cbid": 211, "correlation": 200271 } }, { "ph": "s", "id": 200271, "pid": 76337, "tid": -914061504, "ts": 1716454224520792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224520850, "dur": 0, "args": { "External id": 200281, "cbid": 317, "correlation": 200281 } }, { "ph": "f", "id": 200281, "pid": 76337, "tid": -914061504, "ts": 1716454224520850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224520851, "dur": 0, "args": { "External id": 200282, "cbid": 203, "correlation": 200282 } }, { "ph": "f", "id": 200282, "pid": 76337, "tid": -914061504, "ts": 1716454224520851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224520852, "dur": 0, "args": { "External id": 200283, "cbid": 205, "correlation": 200283 } }, { "ph": "f", "id": 200283, "pid": 76337, "tid": -914061504, "ts": 1716454224520852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224583892, "dur": 17, "args": { "External id": 200287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200287, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200287, "pid": 5, "tid": 7, "ts": 1716454224583892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520865, "dur": 13, "args": { "External id": 200287, "cbid": 211, "correlation": 200287 } }, { "ph": "s", "id": 200287, "pid": 76337, "tid": -914061504, "ts": 1716454224520865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224583911, "dur": 4, "args": { "External id": 200289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200289, "pid": 5, "tid": 7, "ts": 1716454224583911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520882, "dur": 5, "args": { "External id": 200289, "cbid": 211, "correlation": 200289 } }, { "ph": "s", "id": 200289, "pid": 76337, "tid": -914061504, "ts": 1716454224520882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224520890, "dur": 0, "args": { "External id": 200290, "cbid": 51, "correlation": 200290 } }, { "ph": "s", "id": 200290, "pid": 76337, "tid": -914061504, "ts": 1716454224520890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224583916, "dur": 132, "args": { "External id": 200291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200291, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200291, "pid": 5, "tid": 7, "ts": 1716454224583916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520891, "dur": 5, "args": { "External id": 200291, "cbid": 211, "correlation": 200291 } }, { "ph": "s", "id": 200291, "pid": 76337, "tid": -914061504, "ts": 1716454224520891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224584049, "dur": 16, "args": { "External id": 200296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200296, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200296, "pid": 5, "tid": 7, "ts": 1716454224584049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520916, "dur": 9, "args": { "External id": 200296, "cbid": 211, "correlation": 200296 } }, { "ph": "s", "id": 200296, "pid": 76337, "tid": -914061504, "ts": 1716454224520916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224584066, "dur": 12, "args": { "External id": 200304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200304, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200304, "pid": 5, "tid": 7, "ts": 1716454224584066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520945, "dur": 7, "args": { "External id": 200304, "cbid": 211, "correlation": 200304 } }, { "ph": "s", "id": 200304, "pid": 76337, "tid": -914061504, "ts": 1716454224520945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224584080, "dur": 10, "args": { "External id": 200312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200312, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200312, "pid": 5, "tid": 7, "ts": 1716454224584080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224520972, "dur": 16, "args": { "External id": 200312, "cbid": 211, "correlation": 200312 } }, { "ph": "s", "id": 200312, "pid": 76337, "tid": -914061504, "ts": 1716454224520972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224584092, "dur": 18, "args": { "External id": 200332, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200332, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 200332, "pid": 5, "tid": 7, "ts": 1716454224584092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521068, "dur": 14, "args": { "External id": 200332, "cbid": 211, "correlation": 200332 } }, { "ph": "s", "id": 200332, "pid": 76337, "tid": -914061504, "ts": 1716454224521068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224584111, "dur": 4, "args": { "External id": 200344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200344, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 200344, "pid": 5, "tid": 7, "ts": 1716454224584111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521092, "dur": 6, "args": { "External id": 200344, "cbid": 211, "correlation": 200344 } }, { "ph": "s", "id": 200344, "pid": 76337, "tid": -914061504, "ts": 1716454224521092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224584117, "dur": 17, "args": { "External id": 200347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200347, "pid": 5, "tid": 7, "ts": 1716454224584117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521109, "dur": 6, "args": { "External id": 200347, "cbid": 211, "correlation": 200347 } }, { "ph": "s", "id": 200347, "pid": 76337, "tid": -914061504, "ts": 1716454224521109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224521167, "dur": 0, "args": { "External id": 200358, "cbid": 317, "correlation": 200358 } }, { "ph": "f", "id": 200358, "pid": 76337, "tid": -914061504, "ts": 1716454224521167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224521167, "dur": 0, "args": { "External id": 200359, "cbid": 203, "correlation": 200359 } }, { "ph": "f", "id": 200359, "pid": 76337, "tid": -914061504, "ts": 1716454224521167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224521168, "dur": 0, "args": { "External id": 200360, "cbid": 205, "correlation": 200360 } }, { "ph": "f", "id": 200360, "pid": 76337, "tid": -914061504, "ts": 1716454224521168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224584135, "dur": 11, "args": { "External id": 200364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200364, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200364, "pid": 5, "tid": 7, "ts": 1716454224584135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521182, "dur": 11, "args": { "External id": 200364, "cbid": 211, "correlation": 200364 } }, { "ph": "s", "id": 200364, "pid": 76337, "tid": -914061504, "ts": 1716454224521182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224584147, "dur": 3, "args": { "External id": 200366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200366, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200366, "pid": 5, "tid": 7, "ts": 1716454224584147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521197, "dur": 5, "args": { "External id": 200366, "cbid": 211, "correlation": 200366 } }, { "ph": "s", "id": 200366, "pid": 76337, "tid": -914061504, "ts": 1716454224521197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224521205, "dur": 0, "args": { "External id": 200367, "cbid": 51, "correlation": 200367 } }, { "ph": "s", "id": 200367, "pid": 76337, "tid": -914061504, "ts": 1716454224521205, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224584152, "dur": 91, "args": { "External id": 200368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200368, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200368, "pid": 5, "tid": 7, "ts": 1716454224584152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521206, "dur": 5, "args": { "External id": 200368, "cbid": 211, "correlation": 200368 } }, { "ph": "s", "id": 200368, "pid": 76337, "tid": -914061504, "ts": 1716454224521206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224584245, "dur": 15, "args": { "External id": 200373, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200373, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200373, "pid": 5, "tid": 7, "ts": 1716454224584245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521233, "dur": 8, "args": { "External id": 200373, "cbid": 211, "correlation": 200373 } }, { "ph": "s", "id": 200373, "pid": 76337, "tid": -914061504, "ts": 1716454224521233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224584262, "dur": 84, "args": { "External id": 200382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200382, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200382, "pid": 5, "tid": 7, "ts": 1716454224584262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521313, "dur": 14, "args": { "External id": 200382, "cbid": 211, "correlation": 200382 } }, { "ph": "s", "id": 200382, "pid": 76337, "tid": -914061504, "ts": 1716454224521313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224584347, "dur": 30, "args": { "External id": 200404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200404, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200404, "pid": 5, "tid": 7, "ts": 1716454224584347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521369, "dur": 11, "args": { "External id": 200404, "cbid": 211, "correlation": 200404 } }, { "ph": "s", "id": 200404, "pid": 76337, "tid": -914061504, "ts": 1716454224521369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224521459, "dur": 1, "args": { "External id": 200415, "cbid": 251, "correlation": 200415 } }, { "ph": "f", "id": 200415, "pid": 76337, "tid": -914061504, "ts": 1716454224521459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224584378, "dur": 142, "args": { "External id": 200416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200416, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200416, "pid": 5, "tid": 7, "ts": 1716454224584378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521464, "dur": 13, "args": { "External id": 200416, "cbid": 211, "correlation": 200416 } }, { "ph": "s", "id": 200416, "pid": 76337, "tid": -914061504, "ts": 1716454224521464, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224521535, "dur": 1, "args": { "External id": 200427, "cbid": 251, "correlation": 200427 } }, { "ph": "f", "id": 200427, "pid": 76337, "tid": -914061504, "ts": 1716454224521535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224584522, "dur": 159, "args": { "External id": 200428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200428, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200428, "pid": 5, "tid": 7, "ts": 1716454224584522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521539, "dur": 11, "args": { "External id": 200428, "cbid": 211, "correlation": 200428 } }, { "ph": "s", "id": 200428, "pid": 76337, "tid": -914061504, "ts": 1716454224521539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224521602, "dur": 1, "args": { "External id": 200439, "cbid": 251, "correlation": 200439 } }, { "ph": "f", "id": 200439, "pid": 76337, "tid": -914061504, "ts": 1716454224521602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224584682, "dur": 158, "args": { "External id": 200440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200440, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200440, "pid": 5, "tid": 7, "ts": 1716454224584682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521606, "dur": 11, "args": { "External id": 200440, "cbid": 211, "correlation": 200440 } }, { "ph": "s", "id": 200440, "pid": 76337, "tid": -914061504, "ts": 1716454224521606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224584842, "dur": 336, "args": { "External id": 200465, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200465, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200465, "pid": 5, "tid": 7, "ts": 1716454224584842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521691, "dur": 12, "args": { "External id": 200465, "cbid": 211, "correlation": 200465 } }, { "ph": "s", "id": 200465, "pid": 76337, "tid": -914061504, "ts": 1716454224521691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224521791, "dur": 1, "args": { "External id": 200483, "cbid": 251, "correlation": 200483 } }, { "ph": "f", "id": 200483, "pid": 76337, "tid": -914061504, "ts": 1716454224521791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224585179, "dur": 166, "args": { "External id": 200485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200485, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200485, "pid": 5, "tid": 7, "ts": 1716454224585179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521797, "dur": 14, "args": { "External id": 200485, "cbid": 211, "correlation": 200485 } }, { "ph": "s", "id": 200485, "pid": 76337, "tid": -914061504, "ts": 1716454224521797, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224585347, "dur": 19, "args": { "External id": 200493, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200493, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200493, "pid": 5, "tid": 7, "ts": 1716454224585347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521866, "dur": 12, "args": { "External id": 200493, "cbid": 211, "correlation": 200493 } }, { "ph": "s", "id": 200493, "pid": 76337, "tid": -914061504, "ts": 1716454224521866, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224585367, "dur": 27, "args": { "External id": 200501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200501, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200501, "pid": 5, "tid": 7, "ts": 1716454224585367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521905, "dur": 8, "args": { "External id": 200501, "cbid": 211, "correlation": 200501 } }, { "ph": "s", "id": 200501, "pid": 76337, "tid": -914061504, "ts": 1716454224521905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224585396, "dur": 18, "args": { "External id": 200512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200512, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200512, "pid": 5, "tid": 7, "ts": 1716454224585396, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224521984, "dur": 14, "args": { "External id": 200512, "cbid": 211, "correlation": 200512 } }, { "ph": "s", "id": 200512, "pid": 76337, "tid": -914061504, "ts": 1716454224521984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224585415, "dur": 16, "args": { "External id": 200534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200534, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200534, "pid": 5, "tid": 7, "ts": 1716454224585415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522020, "dur": 8, "args": { "External id": 200534, "cbid": 211, "correlation": 200534 } }, { "ph": "s", "id": 200534, "pid": 76337, "tid": -914061504, "ts": 1716454224522020, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522107, "dur": 1, "args": { "External id": 200545, "cbid": 251, "correlation": 200545 } }, { "ph": "f", "id": 200545, "pid": 76337, "tid": -914061504, "ts": 1716454224522107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224585433, "dur": 90, "args": { "External id": 200546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200546, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200546, "pid": 5, "tid": 7, "ts": 1716454224585433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522113, "dur": 13, "args": { "External id": 200546, "cbid": 211, "correlation": 200546 } }, { "ph": "s", "id": 200546, "pid": 76337, "tid": -914061504, "ts": 1716454224522113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522180, "dur": 1, "args": { "External id": 200557, "cbid": 251, "correlation": 200557 } }, { "ph": "f", "id": 200557, "pid": 76337, "tid": -914061504, "ts": 1716454224522180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522184, "dur": 0, "args": { "External id": 200558, "cbid": 251, "correlation": 200558 } }, { "ph": "f", "id": 200558, "pid": 76337, "tid": -914061504, "ts": 1716454224522184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224585524, "dur": 12, "args": { "External id": 200559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200559, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200559, "pid": 5, "tid": 7, "ts": 1716454224585524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522186, "dur": 12, "args": { "External id": 200559, "cbid": 211, "correlation": 200559 } }, { "ph": "s", "id": 200559, "pid": 76337, "tid": -914061504, "ts": 1716454224522186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224585537, "dur": 6, "args": { "External id": 200561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200561, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200561, "pid": 5, "tid": 7, "ts": 1716454224585537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522200, "dur": 6, "args": { "External id": 200561, "cbid": 211, "correlation": 200561 } }, { "ph": "s", "id": 200561, "pid": 76337, "tid": -914061504, "ts": 1716454224522200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522257, "dur": 1, "args": { "External id": 200572, "cbid": 251, "correlation": 200572 } }, { "ph": "f", "id": 200572, "pid": 76337, "tid": -914061504, "ts": 1716454224522257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522260, "dur": 0, "args": { "External id": 200573, "cbid": 251, "correlation": 200573 } }, { "ph": "f", "id": 200573, "pid": 76337, "tid": -914061504, "ts": 1716454224522260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224585545, "dur": 8, "args": { "External id": 200574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200574, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200574, "pid": 5, "tid": 7, "ts": 1716454224585545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522262, "dur": 11, "args": { "External id": 200574, "cbid": 211, "correlation": 200574 } }, { "ph": "s", "id": 200574, "pid": 76337, "tid": -914061504, "ts": 1716454224522262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224585554, "dur": 4, "args": { "External id": 200576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200576, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200576, "pid": 5, "tid": 7, "ts": 1716454224585554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522275, "dur": 5, "args": { "External id": 200576, "cbid": 211, "correlation": 200576 } }, { "ph": "s", "id": 200576, "pid": 76337, "tid": -914061504, "ts": 1716454224522275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224585559, "dur": 56, "args": { "External id": 200601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200601, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200601, "pid": 5, "tid": 7, "ts": 1716454224585559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522351, "dur": 12, "args": { "External id": 200601, "cbid": 211, "correlation": 200601 } }, { "ph": "s", "id": 200601, "pid": 76337, "tid": -914061504, "ts": 1716454224522351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522449, "dur": 1, "args": { "External id": 200619, "cbid": 251, "correlation": 200619 } }, { "ph": "f", "id": 200619, "pid": 76337, "tid": -914061504, "ts": 1716454224522449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224585616, "dur": 90, "args": { "External id": 200621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200621, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200621, "pid": 5, "tid": 7, "ts": 1716454224585616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522455, "dur": 13, "args": { "External id": 200621, "cbid": 211, "correlation": 200621 } }, { "ph": "s", "id": 200621, "pid": 76337, "tid": -914061504, "ts": 1716454224522455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224585708, "dur": 10, "args": { "External id": 200629, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200629, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200629, "pid": 5, "tid": 7, "ts": 1716454224585708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522525, "dur": 12, "args": { "External id": 200629, "cbid": 211, "correlation": 200629 } }, { "ph": "s", "id": 200629, "pid": 76337, "tid": -914061504, "ts": 1716454224522525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224585719, "dur": 21, "args": { "External id": 200637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200637, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200637, "pid": 5, "tid": 7, "ts": 1716454224585719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522566, "dur": 9, "args": { "External id": 200637, "cbid": 211, "correlation": 200637 } }, { "ph": "s", "id": 200637, "pid": 76337, "tid": -914061504, "ts": 1716454224522566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224585741, "dur": 18, "args": { "External id": 200659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200659, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200659, "pid": 5, "tid": 7, "ts": 1716454224585741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522618, "dur": 10, "args": { "External id": 200659, "cbid": 211, "correlation": 200659 } }, { "ph": "s", "id": 200659, "pid": 76337, "tid": -914061504, "ts": 1716454224522618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522704, "dur": 1, "args": { "External id": 200675, "cbid": 251, "correlation": 200675 } }, { "ph": "f", "id": 200675, "pid": 76337, "tid": -914061504, "ts": 1716454224522704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522709, "dur": 0, "args": { "External id": 200677, "cbid": 251, "correlation": 200677 } }, { "ph": "f", "id": 200677, "pid": 76337, "tid": -914061504, "ts": 1716454224522709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224585761, "dur": 497, "args": { "External id": 200678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200678, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200678, "pid": 5, "tid": 7, "ts": 1716454224585761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522711, "dur": 13, "args": { "External id": 200678, "cbid": 211, "correlation": 200678 } }, { "ph": "s", "id": 200678, "pid": 76337, "tid": -914061504, "ts": 1716454224522711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224586259, "dur": 66, "args": { "External id": 200686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200686, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200686, "pid": 5, "tid": 7, "ts": 1716454224586259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522776, "dur": 12, "args": { "External id": 200686, "cbid": 211, "correlation": 200686 } }, { "ph": "s", "id": 200686, "pid": 76337, "tid": -914061504, "ts": 1716454224522776, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224586326, "dur": 67, "args": { "External id": 200694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200694, "pid": 5, "tid": 7, "ts": 1716454224586326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522806, "dur": 8, "args": { "External id": 200694, "cbid": 211, "correlation": 200694 } }, { "ph": "s", "id": 200694, "pid": 76337, "tid": -914061504, "ts": 1716454224522806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224522884, "dur": 1, "args": { "External id": 200710, "cbid": 251, "correlation": 200710 } }, { "ph": "f", "id": 200710, "pid": 76337, "tid": -914061504, "ts": 1716454224522884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224586395, "dur": 1, "args": { "External id": 200712, "device": 5, "context": 1, "stream": 7, "correlation": 200712, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 200712, "pid": 5, "tid": 7, "ts": 1716454224586395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224522889, "dur": 10, "args": { "External id": 200712, "cbid": 51, "correlation": 200712 } }, { "ph": "s", "id": 200712, "pid": 76337, "tid": -914061504, "ts": 1716454224522889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224586399, "dur": 271, "args": { "External id": 200713, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200713, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200713, "pid": 5, "tid": 7, "ts": 1716454224586399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522900, "dur": 11, "args": { "External id": 200713, "cbid": 211, "correlation": 200713 } }, { "ph": "s", "id": 200713, "pid": 76337, "tid": -914061504, "ts": 1716454224522900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224586672, "dur": 14, "args": { "External id": 200721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200721, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200721, "pid": 5, "tid": 7, "ts": 1716454224586672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224522942, "dur": 11, "args": { "External id": 200721, "cbid": 211, "correlation": 200721 } }, { "ph": "s", "id": 200721, "pid": 76337, "tid": -914061504, "ts": 1716454224522942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224586687, "dur": 38, "args": { "External id": 200732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200732, "pid": 5, "tid": 7, "ts": 1716454224586687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523018, "dur": 13, "args": { "External id": 200732, "cbid": 211, "correlation": 200732 } }, { "ph": "s", "id": 200732, "pid": 76337, "tid": -914061504, "ts": 1716454224523018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224523082, "dur": 0, "args": { "External id": 200744, "cbid": 317, "correlation": 200744 } }, { "ph": "f", "id": 200744, "pid": 76337, "tid": -914061504, "ts": 1716454224523082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224523083, "dur": 0, "args": { "External id": 200745, "cbid": 203, "correlation": 200745 } }, { "ph": "f", "id": 200745, "pid": 76337, "tid": -914061504, "ts": 1716454224523083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224523084, "dur": 0, "args": { "External id": 200746, "cbid": 205, "correlation": 200746 } }, { "ph": "f", "id": 200746, "pid": 76337, "tid": -914061504, "ts": 1716454224523084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224586726, "dur": 13, "args": { "External id": 200750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200750, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200750, "pid": 5, "tid": 7, "ts": 1716454224586726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523100, "dur": 12, "args": { "External id": 200750, "cbid": 211, "correlation": 200750 } }, { "ph": "s", "id": 200750, "pid": 76337, "tid": -914061504, "ts": 1716454224523100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224586740, "dur": 4, "args": { "External id": 200752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 200752, "pid": 5, "tid": 7, "ts": 1716454224586740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523117, "dur": 6, "args": { "External id": 200752, "cbid": 211, "correlation": 200752 } }, { "ph": "s", "id": 200752, "pid": 76337, "tid": -914061504, "ts": 1716454224523117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224523126, "dur": 0, "args": { "External id": 200753, "cbid": 51, "correlation": 200753 } }, { "ph": "s", "id": 200753, "pid": 76337, "tid": -914061504, "ts": 1716454224523126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224586745, "dur": 98, "args": { "External id": 200754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200754, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200754, "pid": 5, "tid": 7, "ts": 1716454224586745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523126, "dur": 5, "args": { "External id": 200754, "cbid": 211, "correlation": 200754 } }, { "ph": "s", "id": 200754, "pid": 76337, "tid": -914061504, "ts": 1716454224523126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224586844, "dur": 16, "args": { "External id": 200759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200759, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200759, "pid": 5, "tid": 7, "ts": 1716454224586844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523154, "dur": 8, "args": { "External id": 200759, "cbid": 211, "correlation": 200759 } }, { "ph": "s", "id": 200759, "pid": 76337, "tid": -914061504, "ts": 1716454224523154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224586862, "dur": 11, "args": { "External id": 200767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200767, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200767, "pid": 5, "tid": 7, "ts": 1716454224586862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523185, "dur": 8, "args": { "External id": 200767, "cbid": 211, "correlation": 200767 } }, { "ph": "s", "id": 200767, "pid": 76337, "tid": -914061504, "ts": 1716454224523185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224586874, "dur": 58, "args": { "External id": 200778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200778, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200778, "pid": 5, "tid": 7, "ts": 1716454224586874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523247, "dur": 12, "args": { "External id": 200778, "cbid": 211, "correlation": 200778 } }, { "ph": "s", "id": 200778, "pid": 76337, "tid": -914061504, "ts": 1716454224523247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224523303, "dur": 0, "args": { "External id": 200788, "cbid": 317, "correlation": 200788 } }, { "ph": "f", "id": 200788, "pid": 76337, "tid": -914061504, "ts": 1716454224523303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224523304, "dur": 0, "args": { "External id": 200789, "cbid": 203, "correlation": 200789 } }, { "ph": "f", "id": 200789, "pid": 76337, "tid": -914061504, "ts": 1716454224523304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224523304, "dur": 0, "args": { "External id": 200790, "cbid": 205, "correlation": 200790 } }, { "ph": "f", "id": 200790, "pid": 76337, "tid": -914061504, "ts": 1716454224523304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224586933, "dur": 38, "args": { "External id": 200794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200794, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200794, "pid": 5, "tid": 7, "ts": 1716454224586933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523320, "dur": 11, "args": { "External id": 200794, "cbid": 211, "correlation": 200794 } }, { "ph": "s", "id": 200794, "pid": 76337, "tid": -914061504, "ts": 1716454224523320, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224586972, "dur": 163, "args": { "External id": 200796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200796, "pid": 5, "tid": 7, "ts": 1716454224586972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523334, "dur": 5, "args": { "External id": 200796, "cbid": 211, "correlation": 200796 } }, { "ph": "s", "id": 200796, "pid": 76337, "tid": -914061504, "ts": 1716454224523334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224587137, "dur": 1970, "args": { "External id": 200798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200798, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200798, "pid": 5, "tid": 7, "ts": 1716454224587137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523346, "dur": 8, "args": { "External id": 200798, "cbid": 211, "correlation": 200798 } }, { "ph": "s", "id": 200798, "pid": 76337, "tid": -914061504, "ts": 1716454224523346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224589107, "dur": 39, "args": { "External id": 200800, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200800, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200800, "pid": 5, "tid": 7, "ts": 1716454224589107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523358, "dur": 5, "args": { "External id": 200800, "cbid": 211, "correlation": 200800 } }, { "ph": "s", "id": 200800, "pid": 76337, "tid": -914061504, "ts": 1716454224523358, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224589148, "dur": 58, "args": { "External id": 200806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200806, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200806, "pid": 5, "tid": 7, "ts": 1716454224589148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523385, "dur": 9, "args": { "External id": 200806, "cbid": 211, "correlation": 200806 } }, { "ph": "s", "id": 200806, "pid": 76337, "tid": -914061504, "ts": 1716454224523385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224589207, "dur": 86, "args": { "External id": 200815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200815, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200815, "pid": 5, "tid": 7, "ts": 1716454224589207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523475, "dur": 13, "args": { "External id": 200815, "cbid": 211, "correlation": 200815 } }, { "ph": "s", "id": 200815, "pid": 76337, "tid": -914061504, "ts": 1716454224523475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224589295, "dur": 73, "args": { "External id": 200835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200835, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 200835, "pid": 5, "tid": 7, "ts": 1716454224589295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523545, "dur": 12, "args": { "External id": 200835, "cbid": 211, "correlation": 200835 } }, { "ph": "s", "id": 200835, "pid": 76337, "tid": -914061504, "ts": 1716454224523545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224589369, "dur": 5, "args": { "External id": 200847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200847, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 200847, "pid": 5, "tid": 7, "ts": 1716454224589369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523567, "dur": 6, "args": { "External id": 200847, "cbid": 211, "correlation": 200847 } }, { "ph": "s", "id": 200847, "pid": 76337, "tid": -914061504, "ts": 1716454224523567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224589375, "dur": 81, "args": { "External id": 200850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200850, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200850, "pid": 5, "tid": 7, "ts": 1716454224589375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523585, "dur": 7, "args": { "External id": 200850, "cbid": 211, "correlation": 200850 } }, { "ph": "s", "id": 200850, "pid": 76337, "tid": -914061504, "ts": 1716454224523585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224589457, "dur": 54, "args": { "External id": 200859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200859, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200859, "pid": 5, "tid": 7, "ts": 1716454224589457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523629, "dur": 10, "args": { "External id": 200859, "cbid": 211, "correlation": 200859 } }, { "ph": "s", "id": 200859, "pid": 76337, "tid": -914061504, "ts": 1716454224523629, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224523682, "dur": 0, "args": { "External id": 200869, "cbid": 317, "correlation": 200869 } }, { "ph": "f", "id": 200869, "pid": 76337, "tid": -914061504, "ts": 1716454224523682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224523683, "dur": 0, "args": { "External id": 200870, "cbid": 203, "correlation": 200870 } }, { "ph": "f", "id": 200870, "pid": 76337, "tid": -914061504, "ts": 1716454224523683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224523683, "dur": 0, "args": { "External id": 200871, "cbid": 205, "correlation": 200871 } }, { "ph": "f", "id": 200871, "pid": 76337, "tid": -914061504, "ts": 1716454224523683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224589512, "dur": 55, "args": { "External id": 200875, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200875, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200875, "pid": 5, "tid": 7, "ts": 1716454224589512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523700, "dur": 12, "args": { "External id": 200875, "cbid": 211, "correlation": 200875 } }, { "ph": "s", "id": 200875, "pid": 76337, "tid": -914061504, "ts": 1716454224523700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224589569, "dur": 123, "args": { "External id": 200877, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200877, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200877, "pid": 5, "tid": 7, "ts": 1716454224589569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523715, "dur": 5, "args": { "External id": 200877, "cbid": 211, "correlation": 200877 } }, { "ph": "s", "id": 200877, "pid": 76337, "tid": -914061504, "ts": 1716454224523715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224589693, "dur": 1895, "args": { "External id": 200879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200879, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200879, "pid": 5, "tid": 7, "ts": 1716454224589693, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523726, "dur": 6, "args": { "External id": 200879, "cbid": 211, "correlation": 200879 } }, { "ph": "s", "id": 200879, "pid": 76337, "tid": -914061504, "ts": 1716454224523726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224591589, "dur": 20, "args": { "External id": 200881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200881, "pid": 5, "tid": 7, "ts": 1716454224591589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523736, "dur": 5, "args": { "External id": 200881, "cbid": 211, "correlation": 200881 } }, { "ph": "s", "id": 200881, "pid": 76337, "tid": -914061504, "ts": 1716454224523736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224591610, "dur": 33, "args": { "External id": 200887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200887, "pid": 5, "tid": 7, "ts": 1716454224591610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523764, "dur": 8, "args": { "External id": 200887, "cbid": 211, "correlation": 200887 } }, { "ph": "s", "id": 200887, "pid": 76337, "tid": -914061504, "ts": 1716454224523764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224591645, "dur": 4, "args": { "External id": 200895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200895, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 200895, "pid": 5, "tid": 7, "ts": 1716454224591645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523808, "dur": 10, "args": { "External id": 200895, "cbid": 211, "correlation": 200895 } }, { "ph": "s", "id": 200895, "pid": 76337, "tid": -914061504, "ts": 1716454224523808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224523874, "dur": 1, "args": { "External id": 200911, "cbid": 251, "correlation": 200911 } }, { "ph": "f", "id": 200911, "pid": 76337, "tid": -914061504, "ts": 1716454224523874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224523879, "dur": 0, "args": { "External id": 200913, "cbid": 251, "correlation": 200913 } }, { "ph": "f", "id": 200913, "pid": 76337, "tid": -914061504, "ts": 1716454224523879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224591650, "dur": 12, "args": { "External id": 200914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200914, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 200914, "pid": 5, "tid": 7, "ts": 1716454224591650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523881, "dur": 11, "args": { "External id": 200914, "cbid": 211, "correlation": 200914 } }, { "ph": "s", "id": 200914, "pid": 76337, "tid": -914061504, "ts": 1716454224523881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224591663, "dur": 5, "args": { "External id": 200916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200916, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 200916, "pid": 5, "tid": 7, "ts": 1716454224591663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523894, "dur": 5, "args": { "External id": 200916, "cbid": 211, "correlation": 200916 } }, { "ph": "s", "id": 200916, "pid": 76337, "tid": -914061504, "ts": 1716454224523894, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224591670, "dur": 30, "args": { "External id": 200926, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200926, "pid": 5, "tid": 7, "ts": 1716454224591670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224523952, "dur": 13, "args": { "External id": 200926, "cbid": 211, "correlation": 200926 } }, { "ph": "s", "id": 200926, "pid": 76337, "tid": -914061504, "ts": 1716454224523952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224591701, "dur": 31, "args": { "External id": 200946, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200946, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 200946, "pid": 5, "tid": 7, "ts": 1716454224591701, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524027, "dur": 11, "args": { "External id": 200946, "cbid": 211, "correlation": 200946 } }, { "ph": "s", "id": 200946, "pid": 76337, "tid": -914061504, "ts": 1716454224524027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224591734, "dur": 5, "args": { "External id": 200958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200958, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 200958, "pid": 5, "tid": 7, "ts": 1716454224591734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524048, "dur": 6, "args": { "External id": 200958, "cbid": 211, "correlation": 200958 } }, { "ph": "s", "id": 200958, "pid": 76337, "tid": -914061504, "ts": 1716454224524048, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224591740, "dur": 30, "args": { "External id": 200961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200961, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200961, "pid": 5, "tid": 7, "ts": 1716454224591740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524066, "dur": 6, "args": { "External id": 200961, "cbid": 211, "correlation": 200961 } }, { "ph": "s", "id": 200961, "pid": 76337, "tid": -914061504, "ts": 1716454224524066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224591772, "dur": 21, "args": { "External id": 200970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200970, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200970, "pid": 5, "tid": 7, "ts": 1716454224591772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524106, "dur": 10, "args": { "External id": 200970, "cbid": 211, "correlation": 200970 } }, { "ph": "s", "id": 200970, "pid": 76337, "tid": -914061504, "ts": 1716454224524106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224524169, "dur": 0, "args": { "External id": 200980, "cbid": 317, "correlation": 200980 } }, { "ph": "f", "id": 200980, "pid": 76337, "tid": -914061504, "ts": 1716454224524169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224524169, "dur": 0, "args": { "External id": 200981, "cbid": 203, "correlation": 200981 } }, { "ph": "f", "id": 200981, "pid": 76337, "tid": -914061504, "ts": 1716454224524169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224524170, "dur": 0, "args": { "External id": 200982, "cbid": 205, "correlation": 200982 } }, { "ph": "f", "id": 200982, "pid": 76337, "tid": -914061504, "ts": 1716454224524170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224591793, "dur": 23, "args": { "External id": 200986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200986, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200986, "pid": 5, "tid": 7, "ts": 1716454224591793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524186, "dur": 11, "args": { "External id": 200986, "cbid": 211, "correlation": 200986 } }, { "ph": "s", "id": 200986, "pid": 76337, "tid": -914061504, "ts": 1716454224524186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224591818, "dur": 44, "args": { "External id": 200988, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200988, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200988, "pid": 5, "tid": 7, "ts": 1716454224591818, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524200, "dur": 5, "args": { "External id": 200988, "cbid": 211, "correlation": 200988 } }, { "ph": "s", "id": 200988, "pid": 76337, "tid": -914061504, "ts": 1716454224524200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224591864, "dur": 646, "args": { "External id": 200990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200990, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 200990, "pid": 5, "tid": 7, "ts": 1716454224591864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524212, "dur": 6, "args": { "External id": 200990, "cbid": 211, "correlation": 200990 } }, { "ph": "s", "id": 200990, "pid": 76337, "tid": -914061504, "ts": 1716454224524212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224592511, "dur": 22, "args": { "External id": 200992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200992, "pid": 5, "tid": 7, "ts": 1716454224592511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524222, "dur": 5, "args": { "External id": 200992, "cbid": 211, "correlation": 200992 } }, { "ph": "s", "id": 200992, "pid": 76337, "tid": -914061504, "ts": 1716454224524222, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224592534, "dur": 33, "args": { "External id": 200998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 200998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 200998, "pid": 5, "tid": 7, "ts": 1716454224592534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524250, "dur": 10, "args": { "External id": 200998, "cbid": 211, "correlation": 200998 } }, { "ph": "s", "id": 200998, "pid": 76337, "tid": -914061504, "ts": 1716454224524250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224524309, "dur": 0, "args": { "External id": 201008, "cbid": 317, "correlation": 201008 } }, { "ph": "f", "id": 201008, "pid": 76337, "tid": -914061504, "ts": 1716454224524309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224524309, "dur": 0, "args": { "External id": 201009, "cbid": 203, "correlation": 201009 } }, { "ph": "f", "id": 201009, "pid": 76337, "tid": -914061504, "ts": 1716454224524309, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224524310, "dur": 0, "args": { "External id": 201010, "cbid": 205, "correlation": 201010 } }, { "ph": "f", "id": 201010, "pid": 76337, "tid": -914061504, "ts": 1716454224524310, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224592568, "dur": 56, "args": { "External id": 201014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201014, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201014, "pid": 5, "tid": 7, "ts": 1716454224592568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524323, "dur": 11, "args": { "External id": 201014, "cbid": 211, "correlation": 201014 } }, { "ph": "s", "id": 201014, "pid": 76337, "tid": -914061504, "ts": 1716454224524323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224592625, "dur": 270, "args": { "External id": 201016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201016, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201016, "pid": 5, "tid": 7, "ts": 1716454224592625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524341, "dur": 8, "args": { "External id": 201016, "cbid": 211, "correlation": 201016 } }, { "ph": "s", "id": 201016, "pid": 76337, "tid": -914061504, "ts": 1716454224524341, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224592896, "dur": 22, "args": { "External id": 201018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201018, "pid": 5, "tid": 7, "ts": 1716454224592896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524353, "dur": 6, "args": { "External id": 201018, "cbid": 211, "correlation": 201018 } }, { "ph": "s", "id": 201018, "pid": 76337, "tid": -914061504, "ts": 1716454224524353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224592919, "dur": 32, "args": { "External id": 201024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201024, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201024, "pid": 5, "tid": 7, "ts": 1716454224592919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524379, "dur": 8, "args": { "External id": 201024, "cbid": 211, "correlation": 201024 } }, { "ph": "s", "id": 201024, "pid": 76337, "tid": -914061504, "ts": 1716454224524379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224592953, "dur": 27, "args": { "External id": 201032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201032, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201032, "pid": 5, "tid": 7, "ts": 1716454224592953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524408, "dur": 8, "args": { "External id": 201032, "cbid": 211, "correlation": 201032 } }, { "ph": "s", "id": 201032, "pid": 76337, "tid": -914061504, "ts": 1716454224524408, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224592981, "dur": 20, "args": { "External id": 201040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201040, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201040, "pid": 5, "tid": 7, "ts": 1716454224592981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524437, "dur": 8, "args": { "External id": 201040, "cbid": 211, "correlation": 201040 } }, { "ph": "s", "id": 201040, "pid": 76337, "tid": -914061504, "ts": 1716454224524437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224593002, "dur": 30, "args": { "External id": 201060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201060, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 201060, "pid": 5, "tid": 7, "ts": 1716454224593002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524520, "dur": 12, "args": { "External id": 201060, "cbid": 211, "correlation": 201060 } }, { "ph": "s", "id": 201060, "pid": 76337, "tid": -914061504, "ts": 1716454224524520, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224593033, "dur": 5, "args": { "External id": 201072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201072, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 201072, "pid": 5, "tid": 7, "ts": 1716454224593033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524542, "dur": 6, "args": { "External id": 201072, "cbid": 211, "correlation": 201072 } }, { "ph": "s", "id": 201072, "pid": 76337, "tid": -914061504, "ts": 1716454224524542, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224593039, "dur": 30, "args": { "External id": 201075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201075, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201075, "pid": 5, "tid": 7, "ts": 1716454224593039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524560, "dur": 7, "args": { "External id": 201075, "cbid": 211, "correlation": 201075 } }, { "ph": "s", "id": 201075, "pid": 76337, "tid": -914061504, "ts": 1716454224524560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224524617, "dur": 0, "args": { "External id": 201086, "cbid": 317, "correlation": 201086 } }, { "ph": "f", "id": 201086, "pid": 76337, "tid": -914061504, "ts": 1716454224524617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224524617, "dur": 0, "args": { "External id": 201087, "cbid": 203, "correlation": 201087 } }, { "ph": "f", "id": 201087, "pid": 76337, "tid": -914061504, "ts": 1716454224524617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224524618, "dur": 0, "args": { "External id": 201088, "cbid": 205, "correlation": 201088 } }, { "ph": "f", "id": 201088, "pid": 76337, "tid": -914061504, "ts": 1716454224524618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224593071, "dur": 21, "args": { "External id": 201092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201092, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201092, "pid": 5, "tid": 7, "ts": 1716454224593071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524632, "dur": 11, "args": { "External id": 201092, "cbid": 211, "correlation": 201092 } }, { "ph": "s", "id": 201092, "pid": 76337, "tid": -914061504, "ts": 1716454224524632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224593094, "dur": 105, "args": { "External id": 201094, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201094, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201094, "pid": 5, "tid": 7, "ts": 1716454224593094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524649, "dur": 6, "args": { "External id": 201094, "cbid": 211, "correlation": 201094 } }, { "ph": "s", "id": 201094, "pid": 76337, "tid": -914061504, "ts": 1716454224524649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224593200, "dur": 23, "args": { "External id": 201096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201096, "pid": 5, "tid": 7, "ts": 1716454224593200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524660, "dur": 5, "args": { "External id": 201096, "cbid": 211, "correlation": 201096 } }, { "ph": "s", "id": 201096, "pid": 76337, "tid": -914061504, "ts": 1716454224524660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224593225, "dur": 33, "args": { "External id": 201102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201102, "pid": 5, "tid": 7, "ts": 1716454224593225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524687, "dur": 8, "args": { "External id": 201102, "cbid": 211, "correlation": 201102 } }, { "ph": "s", "id": 201102, "pid": 76337, "tid": -914061504, "ts": 1716454224524687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224593259, "dur": 199, "args": { "External id": 201111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201111, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201111, "pid": 5, "tid": 7, "ts": 1716454224593259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524769, "dur": 14, "args": { "External id": 201111, "cbid": 211, "correlation": 201111 } }, { "ph": "s", "id": 201111, "pid": 76337, "tid": -914061504, "ts": 1716454224524769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224593459, "dur": 65, "args": { "External id": 201133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201133, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201133, "pid": 5, "tid": 7, "ts": 1716454224593459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524826, "dur": 10, "args": { "External id": 201133, "cbid": 211, "correlation": 201133 } }, { "ph": "s", "id": 201133, "pid": 76337, "tid": -914061504, "ts": 1716454224524826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224524916, "dur": 1, "args": { "External id": 201144, "cbid": 251, "correlation": 201144 } }, { "ph": "f", "id": 201144, "pid": 76337, "tid": -914061504, "ts": 1716454224524916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224593525, "dur": 152, "args": { "External id": 201145, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201145, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201145, "pid": 5, "tid": 7, "ts": 1716454224593525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224524921, "dur": 13, "args": { "External id": 201145, "cbid": 211, "correlation": 201145 } }, { "ph": "s", "id": 201145, "pid": 76337, "tid": -914061504, "ts": 1716454224524921, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224524998, "dur": 1, "args": { "External id": 201156, "cbid": 251, "correlation": 201156 } }, { "ph": "f", "id": 201156, "pid": 76337, "tid": -914061504, "ts": 1716454224524998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224593679, "dur": 147, "args": { "External id": 201157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201157, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201157, "pid": 5, "tid": 7, "ts": 1716454224593679, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525002, "dur": 13, "args": { "External id": 201157, "cbid": 211, "correlation": 201157 } }, { "ph": "s", "id": 201157, "pid": 76337, "tid": -914061504, "ts": 1716454224525002, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525071, "dur": 1, "args": { "External id": 201168, "cbid": 251, "correlation": 201168 } }, { "ph": "f", "id": 201168, "pid": 76337, "tid": -914061504, "ts": 1716454224525071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224593827, "dur": 145, "args": { "External id": 201169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201169, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201169, "pid": 5, "tid": 7, "ts": 1716454224593827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525075, "dur": 11, "args": { "External id": 201169, "cbid": 211, "correlation": 201169 } }, { "ph": "s", "id": 201169, "pid": 76337, "tid": -914061504, "ts": 1716454224525075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224593973, "dur": 1948, "args": { "External id": 201190, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201190, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 201190, "pid": 5, "tid": 7, "ts": 1716454224593973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525160, "dur": 14, "args": { "External id": 201190, "cbid": 211, "correlation": 201190 } }, { "ph": "s", "id": 201190, "pid": 76337, "tid": -914061504, "ts": 1716454224525160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525260, "dur": 2, "args": { "External id": 201208, "cbid": 251, "correlation": 201208 } }, { "ph": "f", "id": 201208, "pid": 76337, "tid": -914061504, "ts": 1716454224525260, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224595923, "dur": 150, "args": { "External id": 201210, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201210, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 201210, "pid": 5, "tid": 7, "ts": 1716454224595923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525267, "dur": 13, "args": { "External id": 201210, "cbid": 211, "correlation": 201210 } }, { "ph": "s", "id": 201210, "pid": 76337, "tid": -914061504, "ts": 1716454224525267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224596074, "dur": 35, "args": { "External id": 201218, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201218, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201218, "pid": 5, "tid": 7, "ts": 1716454224596074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525337, "dur": 12, "args": { "External id": 201218, "cbid": 211, "correlation": 201218 } }, { "ph": "s", "id": 201218, "pid": 76337, "tid": -914061504, "ts": 1716454224525337, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224596111, "dur": 50, "args": { "External id": 201226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201226, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201226, "pid": 5, "tid": 7, "ts": 1716454224596111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525376, "dur": 8, "args": { "External id": 201226, "cbid": 211, "correlation": 201226 } }, { "ph": "s", "id": 201226, "pid": 76337, "tid": -914061504, "ts": 1716454224525376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224596162, "dur": 30, "args": { "External id": 201237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201237, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201237, "pid": 5, "tid": 7, "ts": 1716454224596162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525447, "dur": 12, "args": { "External id": 201237, "cbid": 211, "correlation": 201237 } }, { "ph": "s", "id": 201237, "pid": 76337, "tid": -914061504, "ts": 1716454224525447, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224596193, "dur": 34, "args": { "External id": 201259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201259, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201259, "pid": 5, "tid": 7, "ts": 1716454224596193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525478, "dur": 8, "args": { "External id": 201259, "cbid": 211, "correlation": 201259 } }, { "ph": "s", "id": 201259, "pid": 76337, "tid": -914061504, "ts": 1716454224525478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525562, "dur": 1, "args": { "External id": 201270, "cbid": 251, "correlation": 201270 } }, { "ph": "f", "id": 201270, "pid": 76337, "tid": -914061504, "ts": 1716454224525562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224596229, "dur": 91, "args": { "External id": 201271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201271, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201271, "pid": 5, "tid": 7, "ts": 1716454224596229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525568, "dur": 12, "args": { "External id": 201271, "cbid": 211, "correlation": 201271 } }, { "ph": "s", "id": 201271, "pid": 76337, "tid": -914061504, "ts": 1716454224525568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525636, "dur": 1, "args": { "External id": 201282, "cbid": 251, "correlation": 201282 } }, { "ph": "f", "id": 201282, "pid": 76337, "tid": -914061504, "ts": 1716454224525636, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525639, "dur": 0, "args": { "External id": 201283, "cbid": 251, "correlation": 201283 } }, { "ph": "f", "id": 201283, "pid": 76337, "tid": -914061504, "ts": 1716454224525639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224596321, "dur": 11, "args": { "External id": 201284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201284, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 201284, "pid": 5, "tid": 7, "ts": 1716454224596321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525641, "dur": 12, "args": { "External id": 201284, "cbid": 211, "correlation": 201284 } }, { "ph": "s", "id": 201284, "pid": 76337, "tid": -914061504, "ts": 1716454224525641, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224596333, "dur": 5, "args": { "External id": 201286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201286, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201286, "pid": 5, "tid": 7, "ts": 1716454224596333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525654, "dur": 6, "args": { "External id": 201286, "cbid": 211, "correlation": 201286 } }, { "ph": "s", "id": 201286, "pid": 76337, "tid": -914061504, "ts": 1716454224525654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525712, "dur": 1, "args": { "External id": 201297, "cbid": 251, "correlation": 201297 } }, { "ph": "f", "id": 201297, "pid": 76337, "tid": -914061504, "ts": 1716454224525712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525716, "dur": 0, "args": { "External id": 201298, "cbid": 251, "correlation": 201298 } }, { "ph": "f", "id": 201298, "pid": 76337, "tid": -914061504, "ts": 1716454224525716, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224596339, "dur": 8, "args": { "External id": 201299, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201299, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 201299, "pid": 5, "tid": 7, "ts": 1716454224596339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525718, "dur": 11, "args": { "External id": 201299, "cbid": 211, "correlation": 201299 } }, { "ph": "s", "id": 201299, "pid": 76337, "tid": -914061504, "ts": 1716454224525718, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224596348, "dur": 4, "args": { "External id": 201301, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201301, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201301, "pid": 5, "tid": 7, "ts": 1716454224596348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525731, "dur": 5, "args": { "External id": 201301, "cbid": 211, "correlation": 201301 } }, { "ph": "s", "id": 201301, "pid": 76337, "tid": -914061504, "ts": 1716454224525731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224596354, "dur": 93, "args": { "External id": 201322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201322, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 201322, "pid": 5, "tid": 7, "ts": 1716454224596354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525803, "dur": 13, "args": { "External id": 201322, "cbid": 211, "correlation": 201322 } }, { "ph": "s", "id": 201322, "pid": 76337, "tid": -914061504, "ts": 1716454224525803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224525899, "dur": 1, "args": { "External id": 201340, "cbid": 251, "correlation": 201340 } }, { "ph": "f", "id": 201340, "pid": 76337, "tid": -914061504, "ts": 1716454224525899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224596448, "dur": 98, "args": { "External id": 201342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201342, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201342, "pid": 5, "tid": 7, "ts": 1716454224596448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525904, "dur": 14, "args": { "External id": 201342, "cbid": 211, "correlation": 201342 } }, { "ph": "s", "id": 201342, "pid": 76337, "tid": -914061504, "ts": 1716454224525904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224596548, "dur": 19, "args": { "External id": 201350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201350, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201350, "pid": 5, "tid": 7, "ts": 1716454224596548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224525983, "dur": 13, "args": { "External id": 201350, "cbid": 211, "correlation": 201350 } }, { "ph": "s", "id": 201350, "pid": 76337, "tid": -914061504, "ts": 1716454224525983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224596568, "dur": 37, "args": { "External id": 201358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201358, "pid": 5, "tid": 7, "ts": 1716454224596568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526027, "dur": 9, "args": { "External id": 201358, "cbid": 211, "correlation": 201358 } }, { "ph": "s", "id": 201358, "pid": 76337, "tid": -914061504, "ts": 1716454224526027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224596607, "dur": 35, "args": { "External id": 201380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201380, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201380, "pid": 5, "tid": 7, "ts": 1716454224596607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526078, "dur": 10, "args": { "External id": 201380, "cbid": 211, "correlation": 201380 } }, { "ph": "s", "id": 201380, "pid": 76337, "tid": -914061504, "ts": 1716454224526078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224526167, "dur": 2, "args": { "External id": 201396, "cbid": 251, "correlation": 201396 } }, { "ph": "f", "id": 201396, "pid": 76337, "tid": -914061504, "ts": 1716454224526167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224526172, "dur": 0, "args": { "External id": 201398, "cbid": 251, "correlation": 201398 } }, { "ph": "f", "id": 201398, "pid": 76337, "tid": -914061504, "ts": 1716454224526172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224596643, "dur": 540, "args": { "External id": 201399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201399, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 201399, "pid": 5, "tid": 7, "ts": 1716454224596643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526177, "dur": 13, "args": { "External id": 201399, "cbid": 211, "correlation": 201399 } }, { "ph": "s", "id": 201399, "pid": 76337, "tid": -914061504, "ts": 1716454224526177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224597184, "dur": 126, "args": { "External id": 201407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201407, "pid": 5, "tid": 7, "ts": 1716454224597184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526242, "dur": 12, "args": { "External id": 201407, "cbid": 211, "correlation": 201407 } }, { "ph": "s", "id": 201407, "pid": 76337, "tid": -914061504, "ts": 1716454224526242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224597312, "dur": 128, "args": { "External id": 201415, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201415, "pid": 5, "tid": 7, "ts": 1716454224597312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526275, "dur": 8, "args": { "External id": 201415, "cbid": 211, "correlation": 201415 } }, { "ph": "s", "id": 201415, "pid": 76337, "tid": -914061504, "ts": 1716454224526275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224526352, "dur": 1, "args": { "External id": 201431, "cbid": 251, "correlation": 201431 } }, { "ph": "f", "id": 201431, "pid": 76337, "tid": -914061504, "ts": 1716454224526352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224597441, "dur": 304, "args": { "External id": 201433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201433, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201433, "pid": 5, "tid": 7, "ts": 1716454224597441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526357, "dur": 13, "args": { "External id": 201433, "cbid": 211, "correlation": 201433 } }, { "ph": "s", "id": 201433, "pid": 76337, "tid": -914061504, "ts": 1716454224526357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224597746, "dur": 27, "args": { "External id": 201441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201441, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201441, "pid": 5, "tid": 7, "ts": 1716454224597746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526399, "dur": 10, "args": { "External id": 201441, "cbid": 211, "correlation": 201441 } }, { "ph": "s", "id": 201441, "pid": 76337, "tid": -914061504, "ts": 1716454224526399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224597774, "dur": 81, "args": { "External id": 201452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201452, "pid": 5, "tid": 7, "ts": 1716454224597774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526467, "dur": 12, "args": { "External id": 201452, "cbid": 211, "correlation": 201452 } }, { "ph": "s", "id": 201452, "pid": 76337, "tid": -914061504, "ts": 1716454224526467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224526529, "dur": 0, "args": { "External id": 201464, "cbid": 317, "correlation": 201464 } }, { "ph": "f", "id": 201464, "pid": 76337, "tid": -914061504, "ts": 1716454224526529, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224526530, "dur": 0, "args": { "External id": 201465, "cbid": 203, "correlation": 201465 } }, { "ph": "f", "id": 201465, "pid": 76337, "tid": -914061504, "ts": 1716454224526530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224526530, "dur": 0, "args": { "External id": 201466, "cbid": 205, "correlation": 201466 } }, { "ph": "f", "id": 201466, "pid": 76337, "tid": -914061504, "ts": 1716454224526530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224597857, "dur": 22, "args": { "External id": 201470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201470, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201470, "pid": 5, "tid": 7, "ts": 1716454224597857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526546, "dur": 12, "args": { "External id": 201470, "cbid": 211, "correlation": 201470 } }, { "ph": "s", "id": 201470, "pid": 76337, "tid": -914061504, "ts": 1716454224526546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224597880, "dur": 121, "args": { "External id": 201472, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201472, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201472, "pid": 5, "tid": 7, "ts": 1716454224597880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526565, "dur": 7, "args": { "External id": 201472, "cbid": 211, "correlation": 201472 } }, { "ph": "s", "id": 201472, "pid": 76337, "tid": -914061504, "ts": 1716454224526565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224598002, "dur": 24, "args": { "External id": 201474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201474, "pid": 5, "tid": 7, "ts": 1716454224598002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526576, "dur": 5, "args": { "External id": 201474, "cbid": 211, "correlation": 201474 } }, { "ph": "s", "id": 201474, "pid": 76337, "tid": -914061504, "ts": 1716454224526576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224598027, "dur": 32, "args": { "External id": 201480, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201480, "pid": 5, "tid": 7, "ts": 1716454224598027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526603, "dur": 9, "args": { "External id": 201480, "cbid": 211, "correlation": 201480 } }, { "ph": "s", "id": 201480, "pid": 76337, "tid": -914061504, "ts": 1716454224526603, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224598061, "dur": 27, "args": { "External id": 201488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201488, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201488, "pid": 5, "tid": 7, "ts": 1716454224598061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526635, "dur": 8, "args": { "External id": 201488, "cbid": 211, "correlation": 201488 } }, { "ph": "s", "id": 201488, "pid": 76337, "tid": -914061504, "ts": 1716454224526635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224598089, "dur": 54, "args": { "External id": 201497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201497, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201497, "pid": 5, "tid": 7, "ts": 1716454224598089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526673, "dur": 10, "args": { "External id": 201497, "cbid": 211, "correlation": 201497 } }, { "ph": "s", "id": 201497, "pid": 76337, "tid": -914061504, "ts": 1716454224526673, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224598144, "dur": 52, "args": { "External id": 201517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201517, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 201517, "pid": 5, "tid": 7, "ts": 1716454224598144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526742, "dur": 11, "args": { "External id": 201517, "cbid": 211, "correlation": 201517 } }, { "ph": "s", "id": 201517, "pid": 76337, "tid": -914061504, "ts": 1716454224526742, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224598198, "dur": 4, "args": { "External id": 201529, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201529, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201529, "pid": 5, "tid": 7, "ts": 1716454224598198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526763, "dur": 7, "args": { "External id": 201529, "cbid": 211, "correlation": 201529 } }, { "ph": "s", "id": 201529, "pid": 76337, "tid": -914061504, "ts": 1716454224526763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224598204, "dur": 56, "args": { "External id": 201532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201532, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201532, "pid": 5, "tid": 7, "ts": 1716454224598204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526782, "dur": 6, "args": { "External id": 201532, "cbid": 211, "correlation": 201532 } }, { "ph": "s", "id": 201532, "pid": 76337, "tid": -914061504, "ts": 1716454224526782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224598261, "dur": 37, "args": { "External id": 201541, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201541, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201541, "pid": 5, "tid": 7, "ts": 1716454224598261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526822, "dur": 9, "args": { "External id": 201541, "cbid": 211, "correlation": 201541 } }, { "ph": "s", "id": 201541, "pid": 76337, "tid": -914061504, "ts": 1716454224526822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224526874, "dur": 0, "args": { "External id": 201551, "cbid": 317, "correlation": 201551 } }, { "ph": "f", "id": 201551, "pid": 76337, "tid": -914061504, "ts": 1716454224526874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224526875, "dur": 0, "args": { "External id": 201552, "cbid": 203, "correlation": 201552 } }, { "ph": "f", "id": 201552, "pid": 76337, "tid": -914061504, "ts": 1716454224526875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224526875, "dur": 0, "args": { "External id": 201553, "cbid": 205, "correlation": 201553 } }, { "ph": "f", "id": 201553, "pid": 76337, "tid": -914061504, "ts": 1716454224526875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224598300, "dur": 40, "args": { "External id": 201557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201557, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201557, "pid": 5, "tid": 7, "ts": 1716454224598300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526890, "dur": 11, "args": { "External id": 201557, "cbid": 211, "correlation": 201557 } }, { "ph": "s", "id": 201557, "pid": 76337, "tid": -914061504, "ts": 1716454224526890, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224598341, "dur": 83, "args": { "External id": 201559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201559, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201559, "pid": 5, "tid": 7, "ts": 1716454224598341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526904, "dur": 5, "args": { "External id": 201559, "cbid": 211, "correlation": 201559 } }, { "ph": "s", "id": 201559, "pid": 76337, "tid": -914061504, "ts": 1716454224526904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224598426, "dur": 1282, "args": { "External id": 201561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201561, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201561, "pid": 5, "tid": 7, "ts": 1716454224598426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526915, "dur": 7, "args": { "External id": 201561, "cbid": 211, "correlation": 201561 } }, { "ph": "s", "id": 201561, "pid": 76337, "tid": -914061504, "ts": 1716454224526915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224599710, "dur": 20, "args": { "External id": 201563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201563, "pid": 5, "tid": 7, "ts": 1716454224599710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526926, "dur": 5, "args": { "External id": 201563, "cbid": 211, "correlation": 201563 } }, { "ph": "s", "id": 201563, "pid": 76337, "tid": -914061504, "ts": 1716454224526926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224599731, "dur": 33, "args": { "External id": 201569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201569, "pid": 5, "tid": 7, "ts": 1716454224599731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224526953, "dur": 9, "args": { "External id": 201569, "cbid": 211, "correlation": 201569 } }, { "ph": "s", "id": 201569, "pid": 76337, "tid": -914061504, "ts": 1716454224526953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224599766, "dur": 3, "args": { "External id": 201577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201577, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 201577, "pid": 5, "tid": 7, "ts": 1716454224599766, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527005, "dur": 11, "args": { "External id": 201577, "cbid": 211, "correlation": 201577 } }, { "ph": "s", "id": 201577, "pid": 76337, "tid": -914061504, "ts": 1716454224527005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224527071, "dur": 1, "args": { "External id": 201593, "cbid": 251, "correlation": 201593 } }, { "ph": "f", "id": 201593, "pid": 76337, "tid": -914061504, "ts": 1716454224527071, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224527076, "dur": 0, "args": { "External id": 201595, "cbid": 251, "correlation": 201595 } }, { "ph": "f", "id": 201595, "pid": 76337, "tid": -914061504, "ts": 1716454224527076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224599771, "dur": 13, "args": { "External id": 201596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201596, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 201596, "pid": 5, "tid": 7, "ts": 1716454224599771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527078, "dur": 11, "args": { "External id": 201596, "cbid": 211, "correlation": 201596 } }, { "ph": "s", "id": 201596, "pid": 76337, "tid": -914061504, "ts": 1716454224527078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224599785, "dur": 5, "args": { "External id": 201598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201598, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201598, "pid": 5, "tid": 7, "ts": 1716454224599785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527091, "dur": 5, "args": { "External id": 201598, "cbid": 211, "correlation": 201598 } }, { "ph": "s", "id": 201598, "pid": 76337, "tid": -914061504, "ts": 1716454224527091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224599791, "dur": 28, "args": { "External id": 201608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201608, "pid": 5, "tid": 7, "ts": 1716454224599791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527149, "dur": 13, "args": { "External id": 201608, "cbid": 211, "correlation": 201608 } }, { "ph": "s", "id": 201608, "pid": 76337, "tid": -914061504, "ts": 1716454224527149, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224599821, "dur": 31, "args": { "External id": 201628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201628, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 201628, "pid": 5, "tid": 7, "ts": 1716454224599821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527214, "dur": 11, "args": { "External id": 201628, "cbid": 211, "correlation": 201628 } }, { "ph": "s", "id": 201628, "pid": 76337, "tid": -914061504, "ts": 1716454224527214, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224599853, "dur": 4, "args": { "External id": 201640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201640, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 201640, "pid": 5, "tid": 7, "ts": 1716454224599853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527235, "dur": 6, "args": { "External id": 201640, "cbid": 211, "correlation": 201640 } }, { "ph": "s", "id": 201640, "pid": 76337, "tid": -914061504, "ts": 1716454224527235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224599858, "dur": 31, "args": { "External id": 201643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201643, "pid": 5, "tid": 7, "ts": 1716454224599858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527253, "dur": 7, "args": { "External id": 201643, "cbid": 211, "correlation": 201643 } }, { "ph": "s", "id": 201643, "pid": 76337, "tid": -914061504, "ts": 1716454224527253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224599891, "dur": 20, "args": { "External id": 201652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201652, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201652, "pid": 5, "tid": 7, "ts": 1716454224599891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527295, "dur": 10, "args": { "External id": 201652, "cbid": 211, "correlation": 201652 } }, { "ph": "s", "id": 201652, "pid": 76337, "tid": -914061504, "ts": 1716454224527295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224527357, "dur": 0, "args": { "External id": 201662, "cbid": 317, "correlation": 201662 } }, { "ph": "f", "id": 201662, "pid": 76337, "tid": -914061504, "ts": 1716454224527357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224527358, "dur": 0, "args": { "External id": 201663, "cbid": 203, "correlation": 201663 } }, { "ph": "f", "id": 201663, "pid": 76337, "tid": -914061504, "ts": 1716454224527358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224527358, "dur": 0, "args": { "External id": 201664, "cbid": 205, "correlation": 201664 } }, { "ph": "f", "id": 201664, "pid": 76337, "tid": -914061504, "ts": 1716454224527358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224599912, "dur": 22, "args": { "External id": 201668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201668, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201668, "pid": 5, "tid": 7, "ts": 1716454224599912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527373, "dur": 12, "args": { "External id": 201668, "cbid": 211, "correlation": 201668 } }, { "ph": "s", "id": 201668, "pid": 76337, "tid": -914061504, "ts": 1716454224527373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224599935, "dur": 44, "args": { "External id": 201670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201670, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201670, "pid": 5, "tid": 7, "ts": 1716454224599935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527387, "dur": 5, "args": { "External id": 201670, "cbid": 211, "correlation": 201670 } }, { "ph": "s", "id": 201670, "pid": 76337, "tid": -914061504, "ts": 1716454224527387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224599980, "dur": 645, "args": { "External id": 201672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201672, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201672, "pid": 5, "tid": 7, "ts": 1716454224599980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527398, "dur": 8, "args": { "External id": 201672, "cbid": 211, "correlation": 201672 } }, { "ph": "s", "id": 201672, "pid": 76337, "tid": -914061504, "ts": 1716454224527398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224600627, "dur": 22, "args": { "External id": 201674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201674, "pid": 5, "tid": 7, "ts": 1716454224600627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527410, "dur": 6, "args": { "External id": 201674, "cbid": 211, "correlation": 201674 } }, { "ph": "s", "id": 201674, "pid": 76337, "tid": -914061504, "ts": 1716454224527410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224600649, "dur": 32, "args": { "External id": 201680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201680, "pid": 5, "tid": 7, "ts": 1716454224600649, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527439, "dur": 8, "args": { "External id": 201680, "cbid": 211, "correlation": 201680 } }, { "ph": "s", "id": 201680, "pid": 76337, "tid": -914061504, "ts": 1716454224527439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224527497, "dur": 0, "args": { "External id": 201690, "cbid": 317, "correlation": 201690 } }, { "ph": "f", "id": 201690, "pid": 76337, "tid": -914061504, "ts": 1716454224527497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224527498, "dur": 0, "args": { "External id": 201691, "cbid": 203, "correlation": 201691 } }, { "ph": "f", "id": 201691, "pid": 76337, "tid": -914061504, "ts": 1716454224527498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224527498, "dur": 0, "args": { "External id": 201692, "cbid": 205, "correlation": 201692 } }, { "ph": "f", "id": 201692, "pid": 76337, "tid": -914061504, "ts": 1716454224527498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224600683, "dur": 38, "args": { "External id": 201696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201696, "pid": 5, "tid": 7, "ts": 1716454224600683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527511, "dur": 12, "args": { "External id": 201696, "cbid": 211, "correlation": 201696 } }, { "ph": "s", "id": 201696, "pid": 76337, "tid": -914061504, "ts": 1716454224527511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224600722, "dur": 190, "args": { "External id": 201698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201698, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201698, "pid": 5, "tid": 7, "ts": 1716454224600722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527529, "dur": 6, "args": { "External id": 201698, "cbid": 211, "correlation": 201698 } }, { "ph": "s", "id": 201698, "pid": 76337, "tid": -914061504, "ts": 1716454224527529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224600914, "dur": 22, "args": { "External id": 201700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201700, "pid": 5, "tid": 7, "ts": 1716454224600914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527538, "dur": 5, "args": { "External id": 201700, "cbid": 211, "correlation": 201700 } }, { "ph": "s", "id": 201700, "pid": 76337, "tid": -914061504, "ts": 1716454224527538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224600937, "dur": 32, "args": { "External id": 201706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201706, "pid": 5, "tid": 7, "ts": 1716454224600937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527564, "dur": 8, "args": { "External id": 201706, "cbid": 211, "correlation": 201706 } }, { "ph": "s", "id": 201706, "pid": 76337, "tid": -914061504, "ts": 1716454224527564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224600971, "dur": 27, "args": { "External id": 201714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201714, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201714, "pid": 5, "tid": 7, "ts": 1716454224600971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527593, "dur": 8, "args": { "External id": 201714, "cbid": 211, "correlation": 201714 } }, { "ph": "s", "id": 201714, "pid": 76337, "tid": -914061504, "ts": 1716454224527593, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224600999, "dur": 19, "args": { "External id": 201722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201722, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201722, "pid": 5, "tid": 7, "ts": 1716454224600999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527622, "dur": 8, "args": { "External id": 201722, "cbid": 211, "correlation": 201722 } }, { "ph": "s", "id": 201722, "pid": 76337, "tid": -914061504, "ts": 1716454224527622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224601020, "dur": 30, "args": { "External id": 201742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201742, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 201742, "pid": 5, "tid": 7, "ts": 1716454224601020, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527703, "dur": 13, "args": { "External id": 201742, "cbid": 211, "correlation": 201742 } }, { "ph": "s", "id": 201742, "pid": 76337, "tid": -914061504, "ts": 1716454224527703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224601051, "dur": 5, "args": { "External id": 201754, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201754, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 201754, "pid": 5, "tid": 7, "ts": 1716454224601051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527726, "dur": 6, "args": { "External id": 201754, "cbid": 211, "correlation": 201754 } }, { "ph": "s", "id": 201754, "pid": 76337, "tid": -914061504, "ts": 1716454224527726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224601057, "dur": 30, "args": { "External id": 201757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201757, "pid": 5, "tid": 7, "ts": 1716454224601057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527743, "dur": 6, "args": { "External id": 201757, "cbid": 211, "correlation": 201757 } }, { "ph": "s", "id": 201757, "pid": 76337, "tid": -914061504, "ts": 1716454224527743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224527800, "dur": 0, "args": { "External id": 201768, "cbid": 317, "correlation": 201768 } }, { "ph": "f", "id": 201768, "pid": 76337, "tid": -914061504, "ts": 1716454224527800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224527801, "dur": 0, "args": { "External id": 201769, "cbid": 203, "correlation": 201769 } }, { "ph": "f", "id": 201769, "pid": 76337, "tid": -914061504, "ts": 1716454224527801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224527801, "dur": 0, "args": { "External id": 201770, "cbid": 205, "correlation": 201770 } }, { "ph": "f", "id": 201770, "pid": 76337, "tid": -914061504, "ts": 1716454224527801, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224601089, "dur": 21, "args": { "External id": 201774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201774, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201774, "pid": 5, "tid": 7, "ts": 1716454224601089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527815, "dur": 11, "args": { "External id": 201774, "cbid": 211, "correlation": 201774 } }, { "ph": "s", "id": 201774, "pid": 76337, "tid": -914061504, "ts": 1716454224527815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224601111, "dur": 105, "args": { "External id": 201776, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201776, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201776, "pid": 5, "tid": 7, "ts": 1716454224601111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527832, "dur": 6, "args": { "External id": 201776, "cbid": 211, "correlation": 201776 } }, { "ph": "s", "id": 201776, "pid": 76337, "tid": -914061504, "ts": 1716454224527832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224601218, "dur": 23, "args": { "External id": 201778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201778, "pid": 5, "tid": 7, "ts": 1716454224601218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527842, "dur": 5, "args": { "External id": 201778, "cbid": 211, "correlation": 201778 } }, { "ph": "s", "id": 201778, "pid": 76337, "tid": -914061504, "ts": 1716454224527842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224601242, "dur": 32, "args": { "External id": 201784, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201784, "pid": 5, "tid": 7, "ts": 1716454224601242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527870, "dur": 8, "args": { "External id": 201784, "cbid": 211, "correlation": 201784 } }, { "ph": "s", "id": 201784, "pid": 76337, "tid": -914061504, "ts": 1716454224527870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224601276, "dur": 190, "args": { "External id": 201793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201793, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201793, "pid": 5, "tid": 7, "ts": 1716454224601276, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224527950, "dur": 15, "args": { "External id": 201793, "cbid": 211, "correlation": 201793 } }, { "ph": "s", "id": 201793, "pid": 76337, "tid": -914061504, "ts": 1716454224527950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224601467, "dur": 65, "args": { "External id": 201815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201815, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201815, "pid": 5, "tid": 7, "ts": 1716454224601467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528016, "dur": 11, "args": { "External id": 201815, "cbid": 211, "correlation": 201815 } }, { "ph": "s", "id": 201815, "pid": 76337, "tid": -914061504, "ts": 1716454224528016, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528103, "dur": 1, "args": { "External id": 201826, "cbid": 251, "correlation": 201826 } }, { "ph": "f", "id": 201826, "pid": 76337, "tid": -914061504, "ts": 1716454224528103, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224601534, "dur": 153, "args": { "External id": 201827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201827, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201827, "pid": 5, "tid": 7, "ts": 1716454224601534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528109, "dur": 14, "args": { "External id": 201827, "cbid": 211, "correlation": 201827 } }, { "ph": "s", "id": 201827, "pid": 76337, "tid": -914061504, "ts": 1716454224528109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528179, "dur": 1, "args": { "External id": 201838, "cbid": 251, "correlation": 201838 } }, { "ph": "f", "id": 201838, "pid": 76337, "tid": -914061504, "ts": 1716454224528179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224601688, "dur": 146, "args": { "External id": 201839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201839, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201839, "pid": 5, "tid": 7, "ts": 1716454224601688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528182, "dur": 11, "args": { "External id": 201839, "cbid": 211, "correlation": 201839 } }, { "ph": "s", "id": 201839, "pid": 76337, "tid": -914061504, "ts": 1716454224528182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528247, "dur": 1, "args": { "External id": 201850, "cbid": 251, "correlation": 201850 } }, { "ph": "f", "id": 201850, "pid": 76337, "tid": -914061504, "ts": 1716454224528247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224601836, "dur": 145, "args": { "External id": 201851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201851, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201851, "pid": 5, "tid": 7, "ts": 1716454224601836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528251, "dur": 12, "args": { "External id": 201851, "cbid": 211, "correlation": 201851 } }, { "ph": "s", "id": 201851, "pid": 76337, "tid": -914061504, "ts": 1716454224528251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224601982, "dur": 1951, "args": { "External id": 201872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201872, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 201872, "pid": 5, "tid": 7, "ts": 1716454224601982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528332, "dur": 12, "args": { "External id": 201872, "cbid": 211, "correlation": 201872 } }, { "ph": "s", "id": 201872, "pid": 76337, "tid": -914061504, "ts": 1716454224528332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528430, "dur": 1, "args": { "External id": 201890, "cbid": 251, "correlation": 201890 } }, { "ph": "f", "id": 201890, "pid": 76337, "tid": -914061504, "ts": 1716454224528430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224603935, "dur": 150, "args": { "External id": 201892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201892, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 201892, "pid": 5, "tid": 7, "ts": 1716454224603935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528436, "dur": 13, "args": { "External id": 201892, "cbid": 211, "correlation": 201892 } }, { "ph": "s", "id": 201892, "pid": 76337, "tid": -914061504, "ts": 1716454224528436, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224604086, "dur": 35, "args": { "External id": 201900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201900, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201900, "pid": 5, "tid": 7, "ts": 1716454224604086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528505, "dur": 13, "args": { "External id": 201900, "cbid": 211, "correlation": 201900 } }, { "ph": "s", "id": 201900, "pid": 76337, "tid": -914061504, "ts": 1716454224528505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224604123, "dur": 51, "args": { "External id": 201908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201908, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201908, "pid": 5, "tid": 7, "ts": 1716454224604123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528545, "dur": 8, "args": { "External id": 201908, "cbid": 211, "correlation": 201908 } }, { "ph": "s", "id": 201908, "pid": 76337, "tid": -914061504, "ts": 1716454224528545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224604175, "dur": 30, "args": { "External id": 201919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201919, "pid": 5, "tid": 7, "ts": 1716454224604175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528615, "dur": 13, "args": { "External id": 201919, "cbid": 211, "correlation": 201919 } }, { "ph": "s", "id": 201919, "pid": 76337, "tid": -914061504, "ts": 1716454224528615, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224604206, "dur": 34, "args": { "External id": 201941, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201941, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 201941, "pid": 5, "tid": 7, "ts": 1716454224604206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528646, "dur": 7, "args": { "External id": 201941, "cbid": 211, "correlation": 201941 } }, { "ph": "s", "id": 201941, "pid": 76337, "tid": -914061504, "ts": 1716454224528646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528731, "dur": 1, "args": { "External id": 201952, "cbid": 251, "correlation": 201952 } }, { "ph": "f", "id": 201952, "pid": 76337, "tid": -914061504, "ts": 1716454224528731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224604242, "dur": 90, "args": { "External id": 201953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201953, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 201953, "pid": 5, "tid": 7, "ts": 1716454224604242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528736, "dur": 13, "args": { "External id": 201953, "cbid": 211, "correlation": 201953 } }, { "ph": "s", "id": 201953, "pid": 76337, "tid": -914061504, "ts": 1716454224528736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528805, "dur": 1, "args": { "External id": 201964, "cbid": 251, "correlation": 201964 } }, { "ph": "f", "id": 201964, "pid": 76337, "tid": -914061504, "ts": 1716454224528805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528810, "dur": 0, "args": { "External id": 201965, "cbid": 251, "correlation": 201965 } }, { "ph": "f", "id": 201965, "pid": 76337, "tid": -914061504, "ts": 1716454224528810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224604333, "dur": 12, "args": { "External id": 201966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201966, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 201966, "pid": 5, "tid": 7, "ts": 1716454224604333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528812, "dur": 11, "args": { "External id": 201966, "cbid": 211, "correlation": 201966 } }, { "ph": "s", "id": 201966, "pid": 76337, "tid": -914061504, "ts": 1716454224528812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224604346, "dur": 5, "args": { "External id": 201968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201968, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201968, "pid": 5, "tid": 7, "ts": 1716454224604346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528825, "dur": 5, "args": { "External id": 201968, "cbid": 211, "correlation": 201968 } }, { "ph": "s", "id": 201968, "pid": 76337, "tid": -914061504, "ts": 1716454224528825, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528882, "dur": 1, "args": { "External id": 201979, "cbid": 251, "correlation": 201979 } }, { "ph": "f", "id": 201979, "pid": 76337, "tid": -914061504, "ts": 1716454224528882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224528885, "dur": 0, "args": { "External id": 201980, "cbid": 251, "correlation": 201980 } }, { "ph": "f", "id": 201980, "pid": 76337, "tid": -914061504, "ts": 1716454224528885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224604352, "dur": 7, "args": { "External id": 201981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201981, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 201981, "pid": 5, "tid": 7, "ts": 1716454224604352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528886, "dur": 11, "args": { "External id": 201981, "cbid": 211, "correlation": 201981 } }, { "ph": "s", "id": 201981, "pid": 76337, "tid": -914061504, "ts": 1716454224528886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224604361, "dur": 4, "args": { "External id": 201983, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 201983, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 201983, "pid": 5, "tid": 7, "ts": 1716454224604361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528899, "dur": 6, "args": { "External id": 201983, "cbid": 211, "correlation": 201983 } }, { "ph": "s", "id": 201983, "pid": 76337, "tid": -914061504, "ts": 1716454224528899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224604366, "dur": 91, "args": { "External id": 202004, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202004, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 202004, "pid": 5, "tid": 7, "ts": 1716454224604366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224528981, "dur": 13, "args": { "External id": 202004, "cbid": 211, "correlation": 202004 } }, { "ph": "s", "id": 202004, "pid": 76337, "tid": -914061504, "ts": 1716454224528981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224529080, "dur": 1, "args": { "External id": 202022, "cbid": 251, "correlation": 202022 } }, { "ph": "f", "id": 202022, "pid": 76337, "tid": -914061504, "ts": 1716454224529080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224604459, "dur": 98, "args": { "External id": 202024, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202024, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202024, "pid": 5, "tid": 7, "ts": 1716454224604459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529086, "dur": 13, "args": { "External id": 202024, "cbid": 211, "correlation": 202024 } }, { "ph": "s", "id": 202024, "pid": 76337, "tid": -914061504, "ts": 1716454224529086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224604558, "dur": 19, "args": { "External id": 202032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202032, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202032, "pid": 5, "tid": 7, "ts": 1716454224604558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529154, "dur": 13, "args": { "External id": 202032, "cbid": 211, "correlation": 202032 } }, { "ph": "s", "id": 202032, "pid": 76337, "tid": -914061504, "ts": 1716454224529154, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224604578, "dur": 38, "args": { "External id": 202040, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202040, "pid": 5, "tid": 7, "ts": 1716454224604578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529197, "dur": 9, "args": { "External id": 202040, "cbid": 211, "correlation": 202040 } }, { "ph": "s", "id": 202040, "pid": 76337, "tid": -914061504, "ts": 1716454224529197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224604618, "dur": 35, "args": { "External id": 202062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202062, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202062, "pid": 5, "tid": 7, "ts": 1716454224604618, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529248, "dur": 10, "args": { "External id": 202062, "cbid": 211, "correlation": 202062 } }, { "ph": "s", "id": 202062, "pid": 76337, "tid": -914061504, "ts": 1716454224529248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224529337, "dur": 1, "args": { "External id": 202078, "cbid": 251, "correlation": 202078 } }, { "ph": "f", "id": 202078, "pid": 76337, "tid": -914061504, "ts": 1716454224529337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224529342, "dur": 0, "args": { "External id": 202080, "cbid": 251, "correlation": 202080 } }, { "ph": "f", "id": 202080, "pid": 76337, "tid": -914061504, "ts": 1716454224529342, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224604654, "dur": 538, "args": { "External id": 202081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202081, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 202081, "pid": 5, "tid": 7, "ts": 1716454224604654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529345, "dur": 13, "args": { "External id": 202081, "cbid": 211, "correlation": 202081 } }, { "ph": "s", "id": 202081, "pid": 76337, "tid": -914061504, "ts": 1716454224529345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224605193, "dur": 127, "args": { "External id": 202089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202089, "pid": 5, "tid": 7, "ts": 1716454224605193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529411, "dur": 13, "args": { "External id": 202089, "cbid": 211, "correlation": 202089 } }, { "ph": "s", "id": 202089, "pid": 76337, "tid": -914061504, "ts": 1716454224529411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224605321, "dur": 130, "args": { "External id": 202097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202097, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202097, "pid": 5, "tid": 7, "ts": 1716454224605321, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529442, "dur": 8, "args": { "External id": 202097, "cbid": 211, "correlation": 202097 } }, { "ph": "s", "id": 202097, "pid": 76337, "tid": -914061504, "ts": 1716454224529442, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224529517, "dur": 1, "args": { "External id": 202113, "cbid": 251, "correlation": 202113 } }, { "ph": "f", "id": 202113, "pid": 76337, "tid": -914061504, "ts": 1716454224529517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224605452, "dur": 309, "args": { "External id": 202115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202115, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202115, "pid": 5, "tid": 7, "ts": 1716454224605452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529523, "dur": 12, "args": { "External id": 202115, "cbid": 211, "correlation": 202115 } }, { "ph": "s", "id": 202115, "pid": 76337, "tid": -914061504, "ts": 1716454224529523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224605763, "dur": 27, "args": { "External id": 202123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202123, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202123, "pid": 5, "tid": 7, "ts": 1716454224605763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529565, "dur": 9, "args": { "External id": 202123, "cbid": 211, "correlation": 202123 } }, { "ph": "s", "id": 202123, "pid": 76337, "tid": -914061504, "ts": 1716454224529565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224605791, "dur": 82, "args": { "External id": 202134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202134, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202134, "pid": 5, "tid": 7, "ts": 1716454224605791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529631, "dur": 12, "args": { "External id": 202134, "cbid": 211, "correlation": 202134 } }, { "ph": "s", "id": 202134, "pid": 76337, "tid": -914061504, "ts": 1716454224529631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224529694, "dur": 0, "args": { "External id": 202146, "cbid": 317, "correlation": 202146 } }, { "ph": "f", "id": 202146, "pid": 76337, "tid": -914061504, "ts": 1716454224529694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224529695, "dur": 0, "args": { "External id": 202147, "cbid": 203, "correlation": 202147 } }, { "ph": "f", "id": 202147, "pid": 76337, "tid": -914061504, "ts": 1716454224529695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224529696, "dur": 0, "args": { "External id": 202148, "cbid": 205, "correlation": 202148 } }, { "ph": "f", "id": 202148, "pid": 76337, "tid": -914061504, "ts": 1716454224529696, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224605875, "dur": 24, "args": { "External id": 202152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202152, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202152, "pid": 5, "tid": 7, "ts": 1716454224605875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529711, "dur": 12, "args": { "External id": 202152, "cbid": 211, "correlation": 202152 } }, { "ph": "s", "id": 202152, "pid": 76337, "tid": -914061504, "ts": 1716454224529711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224605900, "dur": 121, "args": { "External id": 202154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202154, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202154, "pid": 5, "tid": 7, "ts": 1716454224605900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529729, "dur": 6, "args": { "External id": 202154, "cbid": 211, "correlation": 202154 } }, { "ph": "s", "id": 202154, "pid": 76337, "tid": -914061504, "ts": 1716454224529729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224606022, "dur": 22, "args": { "External id": 202156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202156, "pid": 5, "tid": 7, "ts": 1716454224606022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529739, "dur": 5, "args": { "External id": 202156, "cbid": 211, "correlation": 202156 } }, { "ph": "s", "id": 202156, "pid": 76337, "tid": -914061504, "ts": 1716454224529739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224606045, "dur": 33, "args": { "External id": 202162, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202162, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202162, "pid": 5, "tid": 7, "ts": 1716454224606045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529768, "dur": 8, "args": { "External id": 202162, "cbid": 211, "correlation": 202162 } }, { "ph": "s", "id": 202162, "pid": 76337, "tid": -914061504, "ts": 1716454224529768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224606080, "dur": 26, "args": { "External id": 202170, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202170, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202170, "pid": 5, "tid": 7, "ts": 1716454224606080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529799, "dur": 8, "args": { "External id": 202170, "cbid": 211, "correlation": 202170 } }, { "ph": "s", "id": 202170, "pid": 76337, "tid": -914061504, "ts": 1716454224529799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224606107, "dur": 45, "args": { "External id": 202179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202179, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202179, "pid": 5, "tid": 7, "ts": 1716454224606107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529836, "dur": 10, "args": { "External id": 202179, "cbid": 211, "correlation": 202179 } }, { "ph": "s", "id": 202179, "pid": 76337, "tid": -914061504, "ts": 1716454224529836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224606154, "dur": 42, "args": { "External id": 202199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202199, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 202199, "pid": 5, "tid": 7, "ts": 1716454224606154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529906, "dur": 12, "args": { "External id": 202199, "cbid": 211, "correlation": 202199 } }, { "ph": "s", "id": 202199, "pid": 76337, "tid": -914061504, "ts": 1716454224529906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224606197, "dur": 5, "args": { "External id": 202211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202211, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 202211, "pid": 5, "tid": 7, "ts": 1716454224606197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529928, "dur": 6, "args": { "External id": 202211, "cbid": 211, "correlation": 202211 } }, { "ph": "s", "id": 202211, "pid": 76337, "tid": -914061504, "ts": 1716454224529928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224606203, "dur": 44, "args": { "External id": 202214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202214, "pid": 5, "tid": 7, "ts": 1716454224606203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529946, "dur": 7, "args": { "External id": 202214, "cbid": 211, "correlation": 202214 } }, { "ph": "s", "id": 202214, "pid": 76337, "tid": -914061504, "ts": 1716454224529946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224606249, "dur": 29, "args": { "External id": 202223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202223, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202223, "pid": 5, "tid": 7, "ts": 1716454224606249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224529994, "dur": 10, "args": { "External id": 202223, "cbid": 211, "correlation": 202223 } }, { "ph": "s", "id": 202223, "pid": 76337, "tid": -914061504, "ts": 1716454224529994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224530048, "dur": 0, "args": { "External id": 202233, "cbid": 317, "correlation": 202233 } }, { "ph": "f", "id": 202233, "pid": 76337, "tid": -914061504, "ts": 1716454224530048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224530049, "dur": 0, "args": { "External id": 202234, "cbid": 203, "correlation": 202234 } }, { "ph": "f", "id": 202234, "pid": 76337, "tid": -914061504, "ts": 1716454224530049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224530049, "dur": 0, "args": { "External id": 202235, "cbid": 205, "correlation": 202235 } }, { "ph": "f", "id": 202235, "pid": 76337, "tid": -914061504, "ts": 1716454224530049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224606280, "dur": 31, "args": { "External id": 202239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202239, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202239, "pid": 5, "tid": 7, "ts": 1716454224606280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530065, "dur": 11, "args": { "External id": 202239, "cbid": 211, "correlation": 202239 } }, { "ph": "s", "id": 202239, "pid": 76337, "tid": -914061504, "ts": 1716454224530065, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224606312, "dur": 64, "args": { "External id": 202241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202241, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202241, "pid": 5, "tid": 7, "ts": 1716454224606312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530079, "dur": 5, "args": { "External id": 202241, "cbid": 211, "correlation": 202241 } }, { "ph": "s", "id": 202241, "pid": 76337, "tid": -914061504, "ts": 1716454224530079, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224606377, "dur": 975, "args": { "External id": 202243, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202243, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202243, "pid": 5, "tid": 7, "ts": 1716454224606377, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530090, "dur": 6, "args": { "External id": 202243, "cbid": 211, "correlation": 202243 } }, { "ph": "s", "id": 202243, "pid": 76337, "tid": -914061504, "ts": 1716454224530090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224607354, "dur": 20, "args": { "External id": 202245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202245, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202245, "pid": 5, "tid": 7, "ts": 1716454224607354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530100, "dur": 5, "args": { "External id": 202245, "cbid": 211, "correlation": 202245 } }, { "ph": "s", "id": 202245, "pid": 76337, "tid": -914061504, "ts": 1716454224530100, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224607375, "dur": 33, "args": { "External id": 202251, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202251, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202251, "pid": 5, "tid": 7, "ts": 1716454224607375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530128, "dur": 9, "args": { "External id": 202251, "cbid": 211, "correlation": 202251 } }, { "ph": "s", "id": 202251, "pid": 76337, "tid": -914061504, "ts": 1716454224530128, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224607410, "dur": 3, "args": { "External id": 202259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202259, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 202259, "pid": 5, "tid": 7, "ts": 1716454224607410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530171, "dur": 9, "args": { "External id": 202259, "cbid": 211, "correlation": 202259 } }, { "ph": "s", "id": 202259, "pid": 76337, "tid": -914061504, "ts": 1716454224530171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224530234, "dur": 1, "args": { "External id": 202275, "cbid": 251, "correlation": 202275 } }, { "ph": "f", "id": 202275, "pid": 76337, "tid": -914061504, "ts": 1716454224530234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224530239, "dur": 0, "args": { "External id": 202277, "cbid": 251, "correlation": 202277 } }, { "ph": "f", "id": 202277, "pid": 76337, "tid": -914061504, "ts": 1716454224530239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224607414, "dur": 12, "args": { "External id": 202278, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202278, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 202278, "pid": 5, "tid": 7, "ts": 1716454224607414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530241, "dur": 11, "args": { "External id": 202278, "cbid": 211, "correlation": 202278 } }, { "ph": "s", "id": 202278, "pid": 76337, "tid": -914061504, "ts": 1716454224530241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224607428, "dur": 5, "args": { "External id": 202280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202280, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 202280, "pid": 5, "tid": 7, "ts": 1716454224607428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530254, "dur": 6, "args": { "External id": 202280, "cbid": 211, "correlation": 202280 } }, { "ph": "s", "id": 202280, "pid": 76337, "tid": -914061504, "ts": 1716454224530254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224607435, "dur": 30, "args": { "External id": 202290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202290, "pid": 5, "tid": 7, "ts": 1716454224607435, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530312, "dur": 12, "args": { "External id": 202290, "cbid": 211, "correlation": 202290 } }, { "ph": "s", "id": 202290, "pid": 76337, "tid": -914061504, "ts": 1716454224530312, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224607466, "dur": 32, "args": { "External id": 202310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202310, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 202310, "pid": 5, "tid": 7, "ts": 1716454224607466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530377, "dur": 10, "args": { "External id": 202310, "cbid": 211, "correlation": 202310 } }, { "ph": "s", "id": 202310, "pid": 76337, "tid": -914061504, "ts": 1716454224530377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224607499, "dur": 4, "args": { "External id": 202322, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202322, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 202322, "pid": 5, "tid": 7, "ts": 1716454224607499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530397, "dur": 6, "args": { "External id": 202322, "cbid": 211, "correlation": 202322 } }, { "ph": "s", "id": 202322, "pid": 76337, "tid": -914061504, "ts": 1716454224530397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224607505, "dur": 30, "args": { "External id": 202325, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202325, "pid": 5, "tid": 7, "ts": 1716454224607505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530417, "dur": 6, "args": { "External id": 202325, "cbid": 211, "correlation": 202325 } }, { "ph": "s", "id": 202325, "pid": 76337, "tid": -914061504, "ts": 1716454224530417, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224607536, "dur": 20, "args": { "External id": 202334, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202334, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202334, "pid": 5, "tid": 7, "ts": 1716454224607536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530456, "dur": 10, "args": { "External id": 202334, "cbid": 211, "correlation": 202334 } }, { "ph": "s", "id": 202334, "pid": 76337, "tid": -914061504, "ts": 1716454224530456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224530520, "dur": 0, "args": { "External id": 202344, "cbid": 317, "correlation": 202344 } }, { "ph": "f", "id": 202344, "pid": 76337, "tid": -914061504, "ts": 1716454224530520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224530521, "dur": 0, "args": { "External id": 202345, "cbid": 203, "correlation": 202345 } }, { "ph": "f", "id": 202345, "pid": 76337, "tid": -914061504, "ts": 1716454224530521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224530522, "dur": 0, "args": { "External id": 202346, "cbid": 205, "correlation": 202346 } }, { "ph": "f", "id": 202346, "pid": 76337, "tid": -914061504, "ts": 1716454224530522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224607558, "dur": 22, "args": { "External id": 202350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202350, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202350, "pid": 5, "tid": 7, "ts": 1716454224607558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530535, "dur": 12, "args": { "External id": 202350, "cbid": 211, "correlation": 202350 } }, { "ph": "s", "id": 202350, "pid": 76337, "tid": -914061504, "ts": 1716454224530535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224607581, "dur": 44, "args": { "External id": 202352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202352, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202352, "pid": 5, "tid": 7, "ts": 1716454224607581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530549, "dur": 5, "args": { "External id": 202352, "cbid": 211, "correlation": 202352 } }, { "ph": "s", "id": 202352, "pid": 76337, "tid": -914061504, "ts": 1716454224530549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224607626, "dur": 649, "args": { "External id": 202354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202354, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202354, "pid": 5, "tid": 7, "ts": 1716454224607626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530560, "dur": 7, "args": { "External id": 202354, "cbid": 211, "correlation": 202354 } }, { "ph": "s", "id": 202354, "pid": 76337, "tid": -914061504, "ts": 1716454224530560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224608277, "dur": 24, "args": { "External id": 202356, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202356, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202356, "pid": 5, "tid": 7, "ts": 1716454224608277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530571, "dur": 5, "args": { "External id": 202356, "cbid": 211, "correlation": 202356 } }, { "ph": "s", "id": 202356, "pid": 76337, "tid": -914061504, "ts": 1716454224530571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224608302, "dur": 34, "args": { "External id": 202362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202362, "pid": 5, "tid": 7, "ts": 1716454224608302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530598, "dur": 8, "args": { "External id": 202362, "cbid": 211, "correlation": 202362 } }, { "ph": "s", "id": 202362, "pid": 76337, "tid": -914061504, "ts": 1716454224530598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224530655, "dur": 0, "args": { "External id": 202372, "cbid": 317, "correlation": 202372 } }, { "ph": "f", "id": 202372, "pid": 76337, "tid": -914061504, "ts": 1716454224530655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224530656, "dur": 0, "args": { "External id": 202373, "cbid": 203, "correlation": 202373 } }, { "ph": "f", "id": 202373, "pid": 76337, "tid": -914061504, "ts": 1716454224530656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224530657, "dur": 0, "args": { "External id": 202374, "cbid": 205, "correlation": 202374 } }, { "ph": "f", "id": 202374, "pid": 76337, "tid": -914061504, "ts": 1716454224530657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224608337, "dur": 30, "args": { "External id": 202378, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202378, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202378, "pid": 5, "tid": 7, "ts": 1716454224608337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530670, "dur": 11, "args": { "External id": 202378, "cbid": 211, "correlation": 202378 } }, { "ph": "s", "id": 202378, "pid": 76337, "tid": -914061504, "ts": 1716454224530670, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224608369, "dur": 153, "args": { "External id": 202380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202380, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202380, "pid": 5, "tid": 7, "ts": 1716454224608369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530687, "dur": 6, "args": { "External id": 202380, "cbid": 211, "correlation": 202380 } }, { "ph": "s", "id": 202380, "pid": 76337, "tid": -914061504, "ts": 1716454224530687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224608523, "dur": 21, "args": { "External id": 202382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202382, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202382, "pid": 5, "tid": 7, "ts": 1716454224608523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530697, "dur": 5, "args": { "External id": 202382, "cbid": 211, "correlation": 202382 } }, { "ph": "s", "id": 202382, "pid": 76337, "tid": -914061504, "ts": 1716454224530697, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224608546, "dur": 33, "args": { "External id": 202388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202388, "pid": 5, "tid": 7, "ts": 1716454224608546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530723, "dur": 8, "args": { "External id": 202388, "cbid": 211, "correlation": 202388 } }, { "ph": "s", "id": 202388, "pid": 76337, "tid": -914061504, "ts": 1716454224530723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224608580, "dur": 27, "args": { "External id": 202396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202396, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202396, "pid": 5, "tid": 7, "ts": 1716454224608580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530751, "dur": 8, "args": { "External id": 202396, "cbid": 211, "correlation": 202396 } }, { "ph": "s", "id": 202396, "pid": 76337, "tid": -914061504, "ts": 1716454224530751, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224608608, "dur": 19, "args": { "External id": 202404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202404, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202404, "pid": 5, "tid": 7, "ts": 1716454224608608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530780, "dur": 8, "args": { "External id": 202404, "cbid": 211, "correlation": 202404 } }, { "ph": "s", "id": 202404, "pid": 76337, "tid": -914061504, "ts": 1716454224530780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224608629, "dur": 29, "args": { "External id": 202424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202424, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 202424, "pid": 5, "tid": 7, "ts": 1716454224608629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530861, "dur": 13, "args": { "External id": 202424, "cbid": 211, "correlation": 202424 } }, { "ph": "s", "id": 202424, "pid": 76337, "tid": -914061504, "ts": 1716454224530861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224608659, "dur": 5, "args": { "External id": 202436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202436, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 202436, "pid": 5, "tid": 7, "ts": 1716454224608659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530883, "dur": 6, "args": { "External id": 202436, "cbid": 211, "correlation": 202436 } }, { "ph": "s", "id": 202436, "pid": 76337, "tid": -914061504, "ts": 1716454224530883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224608666, "dur": 32, "args": { "External id": 202439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202439, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202439, "pid": 5, "tid": 7, "ts": 1716454224608666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530901, "dur": 7, "args": { "External id": 202439, "cbid": 211, "correlation": 202439 } }, { "ph": "s", "id": 202439, "pid": 76337, "tid": -914061504, "ts": 1716454224530901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224530957, "dur": 0, "args": { "External id": 202450, "cbid": 317, "correlation": 202450 } }, { "ph": "f", "id": 202450, "pid": 76337, "tid": -914061504, "ts": 1716454224530957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224530958, "dur": 0, "args": { "External id": 202451, "cbid": 203, "correlation": 202451 } }, { "ph": "f", "id": 202451, "pid": 76337, "tid": -914061504, "ts": 1716454224530958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224530959, "dur": 0, "args": { "External id": 202452, "cbid": 205, "correlation": 202452 } }, { "ph": "f", "id": 202452, "pid": 76337, "tid": -914061504, "ts": 1716454224530959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224608698, "dur": 22, "args": { "External id": 202456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202456, "pid": 5, "tid": 7, "ts": 1716454224608698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530973, "dur": 21, "args": { "External id": 202456, "cbid": 211, "correlation": 202456 } }, { "ph": "s", "id": 202456, "pid": 76337, "tid": -914061504, "ts": 1716454224530973, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224608722, "dur": 106, "args": { "External id": 202458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202458, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202458, "pid": 5, "tid": 7, "ts": 1716454224608722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224530999, "dur": 6, "args": { "External id": 202458, "cbid": 211, "correlation": 202458 } }, { "ph": "s", "id": 202458, "pid": 76337, "tid": -914061504, "ts": 1716454224530999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224608829, "dur": 24, "args": { "External id": 202460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202460, "pid": 5, "tid": 7, "ts": 1716454224608829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531008, "dur": 5, "args": { "External id": 202460, "cbid": 211, "correlation": 202460 } }, { "ph": "s", "id": 202460, "pid": 76337, "tid": -914061504, "ts": 1716454224531008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224608854, "dur": 32, "args": { "External id": 202466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202466, "pid": 5, "tid": 7, "ts": 1716454224608854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531037, "dur": 9, "args": { "External id": 202466, "cbid": 211, "correlation": 202466 } }, { "ph": "s", "id": 202466, "pid": 76337, "tid": -914061504, "ts": 1716454224531037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224608887, "dur": 203, "args": { "External id": 202475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202475, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202475, "pid": 5, "tid": 7, "ts": 1716454224608887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531119, "dur": 14, "args": { "External id": 202475, "cbid": 211, "correlation": 202475 } }, { "ph": "s", "id": 202475, "pid": 76337, "tid": -914061504, "ts": 1716454224531119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224609092, "dur": 66, "args": { "External id": 202497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202497, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202497, "pid": 5, "tid": 7, "ts": 1716454224609092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531178, "dur": 10, "args": { "External id": 202497, "cbid": 211, "correlation": 202497 } }, { "ph": "s", "id": 202497, "pid": 76337, "tid": -914061504, "ts": 1716454224531178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531264, "dur": 2, "args": { "External id": 202508, "cbid": 251, "correlation": 202508 } }, { "ph": "f", "id": 202508, "pid": 76337, "tid": -914061504, "ts": 1716454224531264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224609159, "dur": 153, "args": { "External id": 202509, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202509, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202509, "pid": 5, "tid": 7, "ts": 1716454224609159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531270, "dur": 13, "args": { "External id": 202509, "cbid": 211, "correlation": 202509 } }, { "ph": "s", "id": 202509, "pid": 76337, "tid": -914061504, "ts": 1716454224531270, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531341, "dur": 1, "args": { "External id": 202520, "cbid": 251, "correlation": 202520 } }, { "ph": "f", "id": 202520, "pid": 76337, "tid": -914061504, "ts": 1716454224531341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224609314, "dur": 146, "args": { "External id": 202521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202521, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202521, "pid": 5, "tid": 7, "ts": 1716454224609314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531345, "dur": 11, "args": { "External id": 202521, "cbid": 211, "correlation": 202521 } }, { "ph": "s", "id": 202521, "pid": 76337, "tid": -914061504, "ts": 1716454224531345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531411, "dur": 1, "args": { "External id": 202532, "cbid": 251, "correlation": 202532 } }, { "ph": "f", "id": 202532, "pid": 76337, "tid": -914061504, "ts": 1716454224531411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224609461, "dur": 146, "args": { "External id": 202533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202533, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202533, "pid": 5, "tid": 7, "ts": 1716454224609461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531416, "dur": 12, "args": { "External id": 202533, "cbid": 211, "correlation": 202533 } }, { "ph": "s", "id": 202533, "pid": 76337, "tid": -914061504, "ts": 1716454224531416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224609609, "dur": 1969, "args": { "External id": 202554, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202554, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 202554, "pid": 5, "tid": 7, "ts": 1716454224609609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531495, "dur": 12, "args": { "External id": 202554, "cbid": 211, "correlation": 202554 } }, { "ph": "s", "id": 202554, "pid": 76337, "tid": -914061504, "ts": 1716454224531495, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531593, "dur": 1, "args": { "External id": 202572, "cbid": 251, "correlation": 202572 } }, { "ph": "f", "id": 202572, "pid": 76337, "tid": -914061504, "ts": 1716454224531593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224611579, "dur": 150, "args": { "External id": 202574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202574, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 202574, "pid": 5, "tid": 7, "ts": 1716454224611579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531599, "dur": 13, "args": { "External id": 202574, "cbid": 211, "correlation": 202574 } }, { "ph": "s", "id": 202574, "pid": 76337, "tid": -914061504, "ts": 1716454224531599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224611730, "dur": 36, "args": { "External id": 202582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202582, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202582, "pid": 5, "tid": 7, "ts": 1716454224611730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531667, "dur": 12, "args": { "External id": 202582, "cbid": 211, "correlation": 202582 } }, { "ph": "s", "id": 202582, "pid": 76337, "tid": -914061504, "ts": 1716454224531667, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224611767, "dur": 51, "args": { "External id": 202590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202590, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202590, "pid": 5, "tid": 7, "ts": 1716454224611767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531706, "dur": 9, "args": { "External id": 202590, "cbid": 211, "correlation": 202590 } }, { "ph": "s", "id": 202590, "pid": 76337, "tid": -914061504, "ts": 1716454224531706, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224611820, "dur": 30, "args": { "External id": 202601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202601, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202601, "pid": 5, "tid": 7, "ts": 1716454224611820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531777, "dur": 12, "args": { "External id": 202601, "cbid": 211, "correlation": 202601 } }, { "ph": "s", "id": 202601, "pid": 76337, "tid": -914061504, "ts": 1716454224531777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224611851, "dur": 35, "args": { "External id": 202623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202623, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202623, "pid": 5, "tid": 7, "ts": 1716454224611851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531808, "dur": 7, "args": { "External id": 202623, "cbid": 211, "correlation": 202623 } }, { "ph": "s", "id": 202623, "pid": 76337, "tid": -914061504, "ts": 1716454224531808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531891, "dur": 1, "args": { "External id": 202634, "cbid": 251, "correlation": 202634 } }, { "ph": "f", "id": 202634, "pid": 76337, "tid": -914061504, "ts": 1716454224531891, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224611888, "dur": 91, "args": { "External id": 202635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202635, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202635, "pid": 5, "tid": 7, "ts": 1716454224611888, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531896, "dur": 13, "args": { "External id": 202635, "cbid": 211, "correlation": 202635 } }, { "ph": "s", "id": 202635, "pid": 76337, "tid": -914061504, "ts": 1716454224531896, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531965, "dur": 1, "args": { "External id": 202646, "cbid": 251, "correlation": 202646 } }, { "ph": "f", "id": 202646, "pid": 76337, "tid": -914061504, "ts": 1716454224531965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224531969, "dur": 0, "args": { "External id": 202647, "cbid": 251, "correlation": 202647 } }, { "ph": "f", "id": 202647, "pid": 76337, "tid": -914061504, "ts": 1716454224531969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224611980, "dur": 12, "args": { "External id": 202648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202648, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 202648, "pid": 5, "tid": 7, "ts": 1716454224611980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531971, "dur": 20, "args": { "External id": 202648, "cbid": 211, "correlation": 202648 } }, { "ph": "s", "id": 202648, "pid": 76337, "tid": -914061504, "ts": 1716454224531971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224611993, "dur": 5, "args": { "External id": 202650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202650, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 202650, "pid": 5, "tid": 7, "ts": 1716454224611993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224531992, "dur": 6, "args": { "External id": 202650, "cbid": 211, "correlation": 202650 } }, { "ph": "s", "id": 202650, "pid": 76337, "tid": -914061504, "ts": 1716454224531992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532052, "dur": 1, "args": { "External id": 202661, "cbid": 251, "correlation": 202661 } }, { "ph": "f", "id": 202661, "pid": 76337, "tid": -914061504, "ts": 1716454224532052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532055, "dur": 0, "args": { "External id": 202662, "cbid": 251, "correlation": 202662 } }, { "ph": "f", "id": 202662, "pid": 76337, "tid": -914061504, "ts": 1716454224532055, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224611999, "dur": 7, "args": { "External id": 202663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202663, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 202663, "pid": 5, "tid": 7, "ts": 1716454224611999, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532057, "dur": 12, "args": { "External id": 202663, "cbid": 211, "correlation": 202663 } }, { "ph": "s", "id": 202663, "pid": 76337, "tid": -914061504, "ts": 1716454224532057, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224612008, "dur": 4, "args": { "External id": 202665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202665, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 202665, "pid": 5, "tid": 7, "ts": 1716454224612008, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532071, "dur": 5, "args": { "External id": 202665, "cbid": 211, "correlation": 202665 } }, { "ph": "s", "id": 202665, "pid": 76337, "tid": -914061504, "ts": 1716454224532071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224612013, "dur": 92, "args": { "External id": 202686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202686, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 202686, "pid": 5, "tid": 7, "ts": 1716454224612013, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532143, "dur": 12, "args": { "External id": 202686, "cbid": 211, "correlation": 202686 } }, { "ph": "s", "id": 202686, "pid": 76337, "tid": -914061504, "ts": 1716454224532143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532240, "dur": 1, "args": { "External id": 202704, "cbid": 251, "correlation": 202704 } }, { "ph": "f", "id": 202704, "pid": 76337, "tid": -914061504, "ts": 1716454224532240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224612106, "dur": 97, "args": { "External id": 202706, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202706, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202706, "pid": 5, "tid": 7, "ts": 1716454224612106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532245, "dur": 13, "args": { "External id": 202706, "cbid": 211, "correlation": 202706 } }, { "ph": "s", "id": 202706, "pid": 76337, "tid": -914061504, "ts": 1716454224532245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224612204, "dur": 19, "args": { "External id": 202714, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202714, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202714, "pid": 5, "tid": 7, "ts": 1716454224612204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532314, "dur": 13, "args": { "External id": 202714, "cbid": 211, "correlation": 202714 } }, { "ph": "s", "id": 202714, "pid": 76337, "tid": -914061504, "ts": 1716454224532314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224612225, "dur": 37, "args": { "External id": 202722, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202722, "pid": 5, "tid": 7, "ts": 1716454224612225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532355, "dur": 9, "args": { "External id": 202722, "cbid": 211, "correlation": 202722 } }, { "ph": "s", "id": 202722, "pid": 76337, "tid": -914061504, "ts": 1716454224532355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224612263, "dur": 35, "args": { "External id": 202744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202744, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202744, "pid": 5, "tid": 7, "ts": 1716454224612263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532407, "dur": 10, "args": { "External id": 202744, "cbid": 211, "correlation": 202744 } }, { "ph": "s", "id": 202744, "pid": 76337, "tid": -914061504, "ts": 1716454224532407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532495, "dur": 1, "args": { "External id": 202760, "cbid": 251, "correlation": 202760 } }, { "ph": "f", "id": 202760, "pid": 76337, "tid": -914061504, "ts": 1716454224532495, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532499, "dur": 0, "args": { "External id": 202762, "cbid": 251, "correlation": 202762 } }, { "ph": "f", "id": 202762, "pid": 76337, "tid": -914061504, "ts": 1716454224532499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224612300, "dur": 543, "args": { "External id": 202763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202763, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 202763, "pid": 5, "tid": 7, "ts": 1716454224612300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532503, "dur": 12, "args": { "External id": 202763, "cbid": 211, "correlation": 202763 } }, { "ph": "s", "id": 202763, "pid": 76337, "tid": -914061504, "ts": 1716454224532503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224612844, "dur": 126, "args": { "External id": 202771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202771, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202771, "pid": 5, "tid": 7, "ts": 1716454224612844, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532567, "dur": 13, "args": { "External id": 202771, "cbid": 211, "correlation": 202771 } }, { "ph": "s", "id": 202771, "pid": 76337, "tid": -914061504, "ts": 1716454224532567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224612972, "dur": 127, "args": { "External id": 202779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202779, "pid": 5, "tid": 7, "ts": 1716454224612972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532598, "dur": 8, "args": { "External id": 202779, "cbid": 211, "correlation": 202779 } }, { "ph": "s", "id": 202779, "pid": 76337, "tid": -914061504, "ts": 1716454224532598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224532675, "dur": 1, "args": { "External id": 202795, "cbid": 251, "correlation": 202795 } }, { "ph": "f", "id": 202795, "pid": 76337, "tid": -914061504, "ts": 1716454224532675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224613101, "dur": 306, "args": { "External id": 202797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202797, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202797, "pid": 5, "tid": 7, "ts": 1716454224613101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532681, "dur": 12, "args": { "External id": 202797, "cbid": 211, "correlation": 202797 } }, { "ph": "s", "id": 202797, "pid": 76337, "tid": -914061504, "ts": 1716454224532681, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224613408, "dur": 27, "args": { "External id": 202805, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202805, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202805, "pid": 5, "tid": 7, "ts": 1716454224613408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532724, "dur": 10, "args": { "External id": 202805, "cbid": 211, "correlation": 202805 } }, { "ph": "s", "id": 202805, "pid": 76337, "tid": -914061504, "ts": 1716454224532724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224613437, "dur": 82, "args": { "External id": 202816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202816, "pid": 5, "tid": 7, "ts": 1716454224613437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532791, "dur": 12, "args": { "External id": 202816, "cbid": 211, "correlation": 202816 } }, { "ph": "s", "id": 202816, "pid": 76337, "tid": -914061504, "ts": 1716454224532791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224532853, "dur": 0, "args": { "External id": 202828, "cbid": 317, "correlation": 202828 } }, { "ph": "f", "id": 202828, "pid": 76337, "tid": -914061504, "ts": 1716454224532853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224532854, "dur": 0, "args": { "External id": 202829, "cbid": 203, "correlation": 202829 } }, { "ph": "f", "id": 202829, "pid": 76337, "tid": -914061504, "ts": 1716454224532854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224532855, "dur": 0, "args": { "External id": 202830, "cbid": 205, "correlation": 202830 } }, { "ph": "f", "id": 202830, "pid": 76337, "tid": -914061504, "ts": 1716454224532855, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224613520, "dur": 23, "args": { "External id": 202834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202834, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202834, "pid": 5, "tid": 7, "ts": 1716454224613520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532869, "dur": 13, "args": { "External id": 202834, "cbid": 211, "correlation": 202834 } }, { "ph": "s", "id": 202834, "pid": 76337, "tid": -914061504, "ts": 1716454224532869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224613545, "dur": 121, "args": { "External id": 202836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202836, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202836, "pid": 5, "tid": 7, "ts": 1716454224613545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532889, "dur": 6, "args": { "External id": 202836, "cbid": 211, "correlation": 202836 } }, { "ph": "s", "id": 202836, "pid": 76337, "tid": -914061504, "ts": 1716454224532889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224613667, "dur": 23, "args": { "External id": 202838, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202838, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202838, "pid": 5, "tid": 7, "ts": 1716454224613667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532899, "dur": 5, "args": { "External id": 202838, "cbid": 211, "correlation": 202838 } }, { "ph": "s", "id": 202838, "pid": 76337, "tid": -914061504, "ts": 1716454224532899, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224613692, "dur": 33, "args": { "External id": 202844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202844, "pid": 5, "tid": 7, "ts": 1716454224613692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532927, "dur": 8, "args": { "External id": 202844, "cbid": 211, "correlation": 202844 } }, { "ph": "s", "id": 202844, "pid": 76337, "tid": -914061504, "ts": 1716454224532927, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224613726, "dur": 27, "args": { "External id": 202852, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202852, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202852, "pid": 5, "tid": 7, "ts": 1716454224613726, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224532958, "dur": 8, "args": { "External id": 202852, "cbid": 211, "correlation": 202852 } }, { "ph": "s", "id": 202852, "pid": 76337, "tid": -914061504, "ts": 1716454224532958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224613754, "dur": 102, "args": { "External id": 202863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202863, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202863, "pid": 5, "tid": 7, "ts": 1716454224613754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533029, "dur": 12, "args": { "External id": 202863, "cbid": 211, "correlation": 202863 } }, { "ph": "s", "id": 202863, "pid": 76337, "tid": -914061504, "ts": 1716454224533029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224533086, "dur": 0, "args": { "External id": 202873, "cbid": 317, "correlation": 202873 } }, { "ph": "f", "id": 202873, "pid": 76337, "tid": -914061504, "ts": 1716454224533086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224533086, "dur": 0, "args": { "External id": 202874, "cbid": 203, "correlation": 202874 } }, { "ph": "f", "id": 202874, "pid": 76337, "tid": -914061504, "ts": 1716454224533086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224533087, "dur": 0, "args": { "External id": 202875, "cbid": 205, "correlation": 202875 } }, { "ph": "f", "id": 202875, "pid": 76337, "tid": -914061504, "ts": 1716454224533087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224613858, "dur": 75, "args": { "External id": 202879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202879, "pid": 5, "tid": 7, "ts": 1716454224613858, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533101, "dur": 11, "args": { "External id": 202879, "cbid": 211, "correlation": 202879 } }, { "ph": "s", "id": 202879, "pid": 76337, "tid": -914061504, "ts": 1716454224533101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224613934, "dur": 45, "args": { "External id": 202881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202881, "pid": 5, "tid": 7, "ts": 1716454224613934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533115, "dur": 5, "args": { "External id": 202881, "cbid": 211, "correlation": 202881 } }, { "ph": "s", "id": 202881, "pid": 76337, "tid": -914061504, "ts": 1716454224533115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224613980, "dur": 4, "args": { "External id": 202883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 202883, "pid": 5, "tid": 7, "ts": 1716454224613980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533126, "dur": 6, "args": { "External id": 202883, "cbid": 211, "correlation": 202883 } }, { "ph": "s", "id": 202883, "pid": 76337, "tid": -914061504, "ts": 1716454224533126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224533134, "dur": 0, "args": { "External id": 202884, "cbid": 51, "correlation": 202884 } }, { "ph": "s", "id": 202884, "pid": 76337, "tid": -914061504, "ts": 1716454224533134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224613986, "dur": 2239, "args": { "External id": 202885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202885, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202885, "pid": 5, "tid": 7, "ts": 1716454224613986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533135, "dur": 5, "args": { "External id": 202885, "cbid": 211, "correlation": 202885 } }, { "ph": "s", "id": 202885, "pid": 76337, "tid": -914061504, "ts": 1716454224533135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224616226, "dur": 113, "args": { "External id": 202890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202890, "pid": 5, "tid": 7, "ts": 1716454224616226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533163, "dur": 9, "args": { "External id": 202890, "cbid": 211, "correlation": 202890 } }, { "ph": "s", "id": 202890, "pid": 76337, "tid": -914061504, "ts": 1716454224533163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224616340, "dur": 168, "args": { "External id": 202899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202899, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202899, "pid": 5, "tid": 7, "ts": 1716454224616340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533254, "dur": 13, "args": { "External id": 202899, "cbid": 211, "correlation": 202899 } }, { "ph": "s", "id": 202899, "pid": 76337, "tid": -914061504, "ts": 1716454224533254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224616509, "dur": 131, "args": { "External id": 202919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202919, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 202919, "pid": 5, "tid": 7, "ts": 1716454224616509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533325, "dur": 11, "args": { "External id": 202919, "cbid": 211, "correlation": 202919 } }, { "ph": "s", "id": 202919, "pid": 76337, "tid": -914061504, "ts": 1716454224533325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224616642, "dur": 4, "args": { "External id": 202931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202931, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 202931, "pid": 5, "tid": 7, "ts": 1716454224616642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533346, "dur": 6, "args": { "External id": 202931, "cbid": 211, "correlation": 202931 } }, { "ph": "s", "id": 202931, "pid": 76337, "tid": -914061504, "ts": 1716454224533346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224616647, "dur": 159, "args": { "External id": 202934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202934, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202934, "pid": 5, "tid": 7, "ts": 1716454224616647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533364, "dur": 7, "args": { "External id": 202934, "cbid": 211, "correlation": 202934 } }, { "ph": "s", "id": 202934, "pid": 76337, "tid": -914061504, "ts": 1716454224533364, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224616808, "dur": 101, "args": { "External id": 202943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202943, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202943, "pid": 5, "tid": 7, "ts": 1716454224616808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533404, "dur": 10, "args": { "External id": 202943, "cbid": 211, "correlation": 202943 } }, { "ph": "s", "id": 202943, "pid": 76337, "tid": -914061504, "ts": 1716454224533404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224533456, "dur": 0, "args": { "External id": 202953, "cbid": 317, "correlation": 202953 } }, { "ph": "f", "id": 202953, "pid": 76337, "tid": -914061504, "ts": 1716454224533456, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224533457, "dur": 0, "args": { "External id": 202954, "cbid": 203, "correlation": 202954 } }, { "ph": "f", "id": 202954, "pid": 76337, "tid": -914061504, "ts": 1716454224533457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224533458, "dur": 0, "args": { "External id": 202955, "cbid": 205, "correlation": 202955 } }, { "ph": "f", "id": 202955, "pid": 76337, "tid": -914061504, "ts": 1716454224533458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224616910, "dur": 111, "args": { "External id": 202959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202959, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202959, "pid": 5, "tid": 7, "ts": 1716454224616910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533475, "dur": 12, "args": { "External id": 202959, "cbid": 211, "correlation": 202959 } }, { "ph": "s", "id": 202959, "pid": 76337, "tid": -914061504, "ts": 1716454224533475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224617022, "dur": 34, "args": { "External id": 202961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202961, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202961, "pid": 5, "tid": 7, "ts": 1716454224617022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533489, "dur": 5, "args": { "External id": 202961, "cbid": 211, "correlation": 202961 } }, { "ph": "s", "id": 202961, "pid": 76337, "tid": -914061504, "ts": 1716454224533489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224617057, "dur": 4, "args": { "External id": 202963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 202963, "pid": 5, "tid": 7, "ts": 1716454224617057, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533499, "dur": 5, "args": { "External id": 202963, "cbid": 211, "correlation": 202963 } }, { "ph": "s", "id": 202963, "pid": 76337, "tid": -914061504, "ts": 1716454224533499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224533508, "dur": 0, "args": { "External id": 202964, "cbid": 51, "correlation": 202964 } }, { "ph": "s", "id": 202964, "pid": 76337, "tid": -914061504, "ts": 1716454224533508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224617062, "dur": 2023, "args": { "External id": 202965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202965, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 202965, "pid": 5, "tid": 7, "ts": 1716454224617062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533509, "dur": 5, "args": { "External id": 202965, "cbid": 211, "correlation": 202965 } }, { "ph": "s", "id": 202965, "pid": 76337, "tid": -914061504, "ts": 1716454224533509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224619086, "dur": 58, "args": { "External id": 202970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 202970, "pid": 5, "tid": 7, "ts": 1716454224619086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533538, "dur": 8, "args": { "External id": 202970, "cbid": 211, "correlation": 202970 } }, { "ph": "s", "id": 202970, "pid": 76337, "tid": -914061504, "ts": 1716454224533538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224619146, "dur": 3, "args": { "External id": 202978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202978, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 202978, "pid": 5, "tid": 7, "ts": 1716454224619146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533581, "dur": 9, "args": { "External id": 202978, "cbid": 211, "correlation": 202978 } }, { "ph": "s", "id": 202978, "pid": 76337, "tid": -914061504, "ts": 1716454224533581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224533645, "dur": 1, "args": { "External id": 202994, "cbid": 251, "correlation": 202994 } }, { "ph": "f", "id": 202994, "pid": 76337, "tid": -914061504, "ts": 1716454224533645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224533650, "dur": 0, "args": { "External id": 202996, "cbid": 251, "correlation": 202996 } }, { "ph": "f", "id": 202996, "pid": 76337, "tid": -914061504, "ts": 1716454224533650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224619150, "dur": 12, "args": { "External id": 202997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202997, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 202997, "pid": 5, "tid": 7, "ts": 1716454224619150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533652, "dur": 11, "args": { "External id": 202997, "cbid": 211, "correlation": 202997 } }, { "ph": "s", "id": 202997, "pid": 76337, "tid": -914061504, "ts": 1716454224533652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224619163, "dur": 5, "args": { "External id": 202999, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 202999, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 202999, "pid": 5, "tid": 7, "ts": 1716454224619163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533665, "dur": 6, "args": { "External id": 202999, "cbid": 211, "correlation": 202999 } }, { "ph": "s", "id": 202999, "pid": 76337, "tid": -914061504, "ts": 1716454224533665, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224619170, "dur": 54, "args": { "External id": 203009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203009, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203009, "pid": 5, "tid": 7, "ts": 1716454224619170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533724, "dur": 12, "args": { "External id": 203009, "cbid": 211, "correlation": 203009 } }, { "ph": "s", "id": 203009, "pid": 76337, "tid": -914061504, "ts": 1716454224533724, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224619225, "dur": 51, "args": { "External id": 203029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203029, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 203029, "pid": 5, "tid": 7, "ts": 1716454224619225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533791, "dur": 11, "args": { "External id": 203029, "cbid": 211, "correlation": 203029 } }, { "ph": "s", "id": 203029, "pid": 76337, "tid": -914061504, "ts": 1716454224533791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224619278, "dur": 4, "args": { "External id": 203041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203041, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203041, "pid": 5, "tid": 7, "ts": 1716454224619278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533812, "dur": 6, "args": { "External id": 203041, "cbid": 211, "correlation": 203041 } }, { "ph": "s", "id": 203041, "pid": 76337, "tid": -914061504, "ts": 1716454224533812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224619283, "dur": 57, "args": { "External id": 203044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203044, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203044, "pid": 5, "tid": 7, "ts": 1716454224619283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533830, "dur": 6, "args": { "External id": 203044, "cbid": 211, "correlation": 203044 } }, { "ph": "s", "id": 203044, "pid": 76337, "tid": -914061504, "ts": 1716454224533830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224619341, "dur": 37, "args": { "External id": 203053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203053, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203053, "pid": 5, "tid": 7, "ts": 1716454224619341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533870, "dur": 11, "args": { "External id": 203053, "cbid": 211, "correlation": 203053 } }, { "ph": "s", "id": 203053, "pid": 76337, "tid": -914061504, "ts": 1716454224533870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224533934, "dur": 0, "args": { "External id": 203063, "cbid": 317, "correlation": 203063 } }, { "ph": "f", "id": 203063, "pid": 76337, "tid": -914061504, "ts": 1716454224533934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224533935, "dur": 0, "args": { "External id": 203064, "cbid": 203, "correlation": 203064 } }, { "ph": "f", "id": 203064, "pid": 76337, "tid": -914061504, "ts": 1716454224533935, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224533936, "dur": 0, "args": { "External id": 203065, "cbid": 205, "correlation": 203065 } }, { "ph": "f", "id": 203065, "pid": 76337, "tid": -914061504, "ts": 1716454224533936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224619380, "dur": 40, "args": { "External id": 203069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203069, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203069, "pid": 5, "tid": 7, "ts": 1716454224619380, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533952, "dur": 12, "args": { "External id": 203069, "cbid": 211, "correlation": 203069 } }, { "ph": "s", "id": 203069, "pid": 76337, "tid": -914061504, "ts": 1716454224533952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224619421, "dur": 14, "args": { "External id": 203071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203071, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203071, "pid": 5, "tid": 7, "ts": 1716454224619421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533967, "dur": 13, "args": { "External id": 203071, "cbid": 211, "correlation": 203071 } }, { "ph": "s", "id": 203071, "pid": 76337, "tid": -914061504, "ts": 1716454224533967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224619437, "dur": 3, "args": { "External id": 203073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203073, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203073, "pid": 5, "tid": 7, "ts": 1716454224619437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533985, "dur": 6, "args": { "External id": 203073, "cbid": 211, "correlation": 203073 } }, { "ph": "s", "id": 203073, "pid": 76337, "tid": -914061504, "ts": 1716454224533985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224533994, "dur": 0, "args": { "External id": 203074, "cbid": 51, "correlation": 203074 } }, { "ph": "s", "id": 203074, "pid": 76337, "tid": -914061504, "ts": 1716454224533994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224619441, "dur": 703, "args": { "External id": 203075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203075, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203075, "pid": 5, "tid": 7, "ts": 1716454224619441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224533995, "dur": 5, "args": { "External id": 203075, "cbid": 211, "correlation": 203075 } }, { "ph": "s", "id": 203075, "pid": 76337, "tid": -914061504, "ts": 1716454224533995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224620145, "dur": 60, "args": { "External id": 203080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203080, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203080, "pid": 5, "tid": 7, "ts": 1716454224620145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534023, "dur": 9, "args": { "External id": 203080, "cbid": 211, "correlation": 203080 } }, { "ph": "s", "id": 203080, "pid": 76337, "tid": -914061504, "ts": 1716454224534023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224534083, "dur": 0, "args": { "External id": 203090, "cbid": 317, "correlation": 203090 } }, { "ph": "f", "id": 203090, "pid": 76337, "tid": -914061504, "ts": 1716454224534083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224534083, "dur": 0, "args": { "External id": 203091, "cbid": 203, "correlation": 203091 } }, { "ph": "f", "id": 203091, "pid": 76337, "tid": -914061504, "ts": 1716454224534083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224534084, "dur": 0, "args": { "External id": 203092, "cbid": 205, "correlation": 203092 } }, { "ph": "f", "id": 203092, "pid": 76337, "tid": -914061504, "ts": 1716454224534084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224620206, "dur": 4, "args": { "External id": 203096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203096, "pid": 5, "tid": 7, "ts": 1716454224620206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534099, "dur": 11, "args": { "External id": 203096, "cbid": 211, "correlation": 203096 } }, { "ph": "s", "id": 203096, "pid": 76337, "tid": -914061504, "ts": 1716454224534099, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224534115, "dur": 0, "args": { "External id": 203097, "cbid": 51, "correlation": 203097 } }, { "ph": "s", "id": 203097, "pid": 76337, "tid": -914061504, "ts": 1716454224534115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454224620211, "dur": 268, "args": { "External id": 203098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203098, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203098, "pid": 5, "tid": 7, "ts": 1716454224620211, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534116, "dur": 8, "args": { "External id": 203098, "cbid": 211, "correlation": 203098 } }, { "ph": "s", "id": 203098, "pid": 76337, "tid": -914061504, "ts": 1716454224534116, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224620481, "dur": 59, "args": { "External id": 203103, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203103, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203103, "pid": 5, "tid": 7, "ts": 1716454224620481, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534145, "dur": 8, "args": { "External id": 203103, "cbid": 211, "correlation": 203103 } }, { "ph": "s", "id": 203103, "pid": 76337, "tid": -914061504, "ts": 1716454224534145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224620542, "dur": 51, "args": { "External id": 203111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203111, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203111, "pid": 5, "tid": 7, "ts": 1716454224620542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534173, "dur": 8, "args": { "External id": 203111, "cbid": 211, "correlation": 203111 } }, { "ph": "s", "id": 203111, "pid": 76337, "tid": -914061504, "ts": 1716454224534173, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224620594, "dur": 35, "args": { "External id": 203119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203119, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203119, "pid": 5, "tid": 7, "ts": 1716454224620594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534202, "dur": 8, "args": { "External id": 203119, "cbid": 211, "correlation": 203119 } }, { "ph": "s", "id": 203119, "pid": 76337, "tid": -914061504, "ts": 1716454224534202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224620631, "dur": 51, "args": { "External id": 203139, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203139, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 203139, "pid": 5, "tid": 7, "ts": 1716454224620631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534285, "dur": 12, "args": { "External id": 203139, "cbid": 211, "correlation": 203139 } }, { "ph": "s", "id": 203139, "pid": 76337, "tid": -914061504, "ts": 1716454224534285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224620683, "dur": 4, "args": { "External id": 203151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203151, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203151, "pid": 5, "tid": 7, "ts": 1716454224620683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534307, "dur": 6, "args": { "External id": 203151, "cbid": 211, "correlation": 203151 } }, { "ph": "s", "id": 203151, "pid": 76337, "tid": -914061504, "ts": 1716454224534307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224620688, "dur": 56, "args": { "External id": 203154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203154, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203154, "pid": 5, "tid": 7, "ts": 1716454224620688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534325, "dur": 6, "args": { "External id": 203154, "cbid": 211, "correlation": 203154 } }, { "ph": "s", "id": 203154, "pid": 76337, "tid": -914061504, "ts": 1716454224534325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224534382, "dur": 0, "args": { "External id": 203165, "cbid": 317, "correlation": 203165 } }, { "ph": "f", "id": 203165, "pid": 76337, "tid": -914061504, "ts": 1716454224534382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224534382, "dur": 0, "args": { "External id": 203166, "cbid": 203, "correlation": 203166 } }, { "ph": "f", "id": 203166, "pid": 76337, "tid": -914061504, "ts": 1716454224534382, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224534383, "dur": 0, "args": { "External id": 203167, "cbid": 205, "correlation": 203167 } }, { "ph": "f", "id": 203167, "pid": 76337, "tid": -914061504, "ts": 1716454224534383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534415, "dur": 2, "args": { "External id": 203171, "cbid": 251, "correlation": 203171 } }, { "ph": "f", "id": 203171, "pid": 76337, "tid": -914061504, "ts": 1716454224534415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534419, "dur": 1, "args": { "External id": 203172, "cbid": 251, "correlation": 203172 } }, { "ph": "f", "id": 203172, "pid": 76337, "tid": -914061504, "ts": 1716454224534419, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534420, "dur": 0, "args": { "External id": 203173, "cbid": 251, "correlation": 203173 } }, { "ph": "f", "id": 203173, "pid": 76337, "tid": -914061504, "ts": 1716454224534420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534422, "dur": 1, "args": { "External id": 203174, "cbid": 251, "correlation": 203174 } }, { "ph": "f", "id": 203174, "pid": 76337, "tid": -914061504, "ts": 1716454224534422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534423, "dur": 1, "args": { "External id": 203175, "cbid": 251, "correlation": 203175 } }, { "ph": "f", "id": 203175, "pid": 76337, "tid": -914061504, "ts": 1716454224534423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534425, "dur": 1, "args": { "External id": 203176, "cbid": 251, "correlation": 203176 } }, { "ph": "f", "id": 203176, "pid": 76337, "tid": -914061504, "ts": 1716454224534425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534427, "dur": 1, "args": { "External id": 203177, "cbid": 251, "correlation": 203177 } }, { "ph": "f", "id": 203177, "pid": 76337, "tid": -914061504, "ts": 1716454224534427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534429, "dur": 1, "args": { "External id": 203178, "cbid": 251, "correlation": 203178 } }, { "ph": "f", "id": 203178, "pid": 76337, "tid": -914061504, "ts": 1716454224534429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534432, "dur": 0, "args": { "External id": 203179, "cbid": 251, "correlation": 203179 } }, { "ph": "f", "id": 203179, "pid": 76337, "tid": -914061504, "ts": 1716454224534432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224620745, "dur": 114, "args": { "External id": 203180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203180, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203180, "pid": 5, "tid": 7, "ts": 1716454224620745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534434, "dur": 13, "args": { "External id": 203180, "cbid": 211, "correlation": 203180 } }, { "ph": "s", "id": 203180, "pid": 76337, "tid": -914061504, "ts": 1716454224534434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224620861, "dur": 59, "args": { "External id": 203186, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203186, "pid": 5, "tid": 7, "ts": 1716454224620861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534471, "dur": 9, "args": { "External id": 203186, "cbid": 211, "correlation": 203186 } }, { "ph": "s", "id": 203186, "pid": 76337, "tid": -914061504, "ts": 1716454224534471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224620922, "dur": 568, "args": { "External id": 203195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203195, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203195, "pid": 5, "tid": 7, "ts": 1716454224620922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534553, "dur": 14, "args": { "External id": 203195, "cbid": 211, "correlation": 203195 } }, { "ph": "s", "id": 203195, "pid": 76337, "tid": -914061504, "ts": 1716454224534553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224621491, "dur": 183, "args": { "External id": 203217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203217, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203217, "pid": 5, "tid": 7, "ts": 1716454224621491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534616, "dur": 12, "args": { "External id": 203217, "cbid": 211, "correlation": 203217 } }, { "ph": "s", "id": 203217, "pid": 76337, "tid": -914061504, "ts": 1716454224534616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534706, "dur": 1, "args": { "External id": 203228, "cbid": 251, "correlation": 203228 } }, { "ph": "f", "id": 203228, "pid": 76337, "tid": -914061504, "ts": 1716454224534706, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224621675, "dur": 199, "args": { "External id": 203229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203229, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203229, "pid": 5, "tid": 7, "ts": 1716454224621675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534711, "dur": 14, "args": { "External id": 203229, "cbid": 211, "correlation": 203229 } }, { "ph": "s", "id": 203229, "pid": 76337, "tid": -914061504, "ts": 1716454224534711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534783, "dur": 1, "args": { "External id": 203240, "cbid": 251, "correlation": 203240 } }, { "ph": "f", "id": 203240, "pid": 76337, "tid": -914061504, "ts": 1716454224534783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224621875, "dur": 188, "args": { "External id": 203241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203241, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203241, "pid": 5, "tid": 7, "ts": 1716454224621875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534787, "dur": 11, "args": { "External id": 203241, "cbid": 211, "correlation": 203241 } }, { "ph": "s", "id": 203241, "pid": 76337, "tid": -914061504, "ts": 1716454224534787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224534850, "dur": 1, "args": { "External id": 203252, "cbid": 251, "correlation": 203252 } }, { "ph": "f", "id": 203252, "pid": 76337, "tid": -914061504, "ts": 1716454224534850, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224622065, "dur": 187, "args": { "External id": 203253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203253, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203253, "pid": 5, "tid": 7, "ts": 1716454224622065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534854, "dur": 11, "args": { "External id": 203253, "cbid": 211, "correlation": 203253 } }, { "ph": "s", "id": 203253, "pid": 76337, "tid": -914061504, "ts": 1716454224534854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224622254, "dur": 18715, "args": { "External id": 203274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203274, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203274, "pid": 5, "tid": 7, "ts": 1716454224622254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224534938, "dur": 14, "args": { "External id": 203274, "cbid": 211, "correlation": 203274 } }, { "ph": "s", "id": 203274, "pid": 76337, "tid": -914061504, "ts": 1716454224534938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535044, "dur": 1, "args": { "External id": 203292, "cbid": 251, "correlation": 203292 } }, { "ph": "f", "id": 203292, "pid": 76337, "tid": -914061504, "ts": 1716454224535044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224640970, "dur": 203, "args": { "External id": 203294, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203294, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203294, "pid": 5, "tid": 7, "ts": 1716454224640970, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535050, "dur": 13, "args": { "External id": 203294, "cbid": 211, "correlation": 203294 } }, { "ph": "s", "id": 203294, "pid": 76337, "tid": -914061504, "ts": 1716454224535050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224641175, "dur": 67, "args": { "External id": 203302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203302, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203302, "pid": 5, "tid": 7, "ts": 1716454224641175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535119, "dur": 13, "args": { "External id": 203302, "cbid": 211, "correlation": 203302 } }, { "ph": "s", "id": 203302, "pid": 76337, "tid": -914061504, "ts": 1716454224535119, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224641243, "dur": 97, "args": { "External id": 203310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203310, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203310, "pid": 5, "tid": 7, "ts": 1716454224641243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535158, "dur": 9, "args": { "External id": 203310, "cbid": 211, "correlation": 203310 } }, { "ph": "s", "id": 203310, "pid": 76337, "tid": -914061504, "ts": 1716454224535158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224641341, "dur": 54, "args": { "External id": 203321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203321, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203321, "pid": 5, "tid": 7, "ts": 1716454224641341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535232, "dur": 12, "args": { "External id": 203321, "cbid": 211, "correlation": 203321 } }, { "ph": "s", "id": 203321, "pid": 76337, "tid": -914061504, "ts": 1716454224535232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224641397, "dur": 93, "args": { "External id": 203343, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203343, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203343, "pid": 5, "tid": 7, "ts": 1716454224641397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535263, "dur": 8, "args": { "External id": 203343, "cbid": 211, "correlation": 203343 } }, { "ph": "s", "id": 203343, "pid": 76337, "tid": -914061504, "ts": 1716454224535263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535346, "dur": 1, "args": { "External id": 203354, "cbid": 251, "correlation": 203354 } }, { "ph": "f", "id": 203354, "pid": 76337, "tid": -914061504, "ts": 1716454224535346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224641491, "dur": 104, "args": { "External id": 203355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203355, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203355, "pid": 5, "tid": 7, "ts": 1716454224641491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535351, "dur": 13, "args": { "External id": 203355, "cbid": 211, "correlation": 203355 } }, { "ph": "s", "id": 203355, "pid": 76337, "tid": -914061504, "ts": 1716454224535351, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535425, "dur": 1, "args": { "External id": 203366, "cbid": 251, "correlation": 203366 } }, { "ph": "f", "id": 203366, "pid": 76337, "tid": -914061504, "ts": 1716454224535425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535429, "dur": 0, "args": { "External id": 203367, "cbid": 251, "correlation": 203367 } }, { "ph": "f", "id": 203367, "pid": 76337, "tid": -914061504, "ts": 1716454224535429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224641596, "dur": 10, "args": { "External id": 203368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203368, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 203368, "pid": 5, "tid": 7, "ts": 1716454224641596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535431, "dur": 13, "args": { "External id": 203368, "cbid": 211, "correlation": 203368 } }, { "ph": "s", "id": 203368, "pid": 76337, "tid": -914061504, "ts": 1716454224535431, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224641608, "dur": 5, "args": { "External id": 203370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203370, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 203370, "pid": 5, "tid": 7, "ts": 1716454224641608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535448, "dur": 7, "args": { "External id": 203370, "cbid": 211, "correlation": 203370 } }, { "ph": "s", "id": 203370, "pid": 76337, "tid": -914061504, "ts": 1716454224535448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535511, "dur": 1, "args": { "External id": 203381, "cbid": 251, "correlation": 203381 } }, { "ph": "f", "id": 203381, "pid": 76337, "tid": -914061504, "ts": 1716454224535511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535514, "dur": 0, "args": { "External id": 203382, "cbid": 251, "correlation": 203382 } }, { "ph": "f", "id": 203382, "pid": 76337, "tid": -914061504, "ts": 1716454224535514, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224641614, "dur": 6, "args": { "External id": 203383, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203383, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 203383, "pid": 5, "tid": 7, "ts": 1716454224641614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535515, "dur": 13, "args": { "External id": 203383, "cbid": 211, "correlation": 203383 } }, { "ph": "s", "id": 203383, "pid": 76337, "tid": -914061504, "ts": 1716454224535515, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224641622, "dur": 3, "args": { "External id": 203385, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203385, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 203385, "pid": 5, "tid": 7, "ts": 1716454224641622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535530, "dur": 5, "args": { "External id": 203385, "cbid": 211, "correlation": 203385 } }, { "ph": "s", "id": 203385, "pid": 76337, "tid": -914061504, "ts": 1716454224535530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224641627, "dur": 158, "args": { "External id": 203406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203406, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203406, "pid": 5, "tid": 7, "ts": 1716454224641627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535602, "dur": 12, "args": { "External id": 203406, "cbid": 211, "correlation": 203406 } }, { "ph": "s", "id": 203406, "pid": 76337, "tid": -914061504, "ts": 1716454224535602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535698, "dur": 1, "args": { "External id": 203424, "cbid": 251, "correlation": 203424 } }, { "ph": "f", "id": 203424, "pid": 76337, "tid": -914061504, "ts": 1716454224535698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224641786, "dur": 108, "args": { "External id": 203426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203426, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203426, "pid": 5, "tid": 7, "ts": 1716454224641786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535704, "dur": 14, "args": { "External id": 203426, "cbid": 211, "correlation": 203426 } }, { "ph": "s", "id": 203426, "pid": 76337, "tid": -914061504, "ts": 1716454224535704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224641895, "dur": 35, "args": { "External id": 203434, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203434, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203434, "pid": 5, "tid": 7, "ts": 1716454224641895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535773, "dur": 13, "args": { "External id": 203434, "cbid": 211, "correlation": 203434 } }, { "ph": "s", "id": 203434, "pid": 76337, "tid": -914061504, "ts": 1716454224535773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224641931, "dur": 66, "args": { "External id": 203442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203442, "pid": 5, "tid": 7, "ts": 1716454224641931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535814, "dur": 9, "args": { "External id": 203442, "cbid": 211, "correlation": 203442 } }, { "ph": "s", "id": 203442, "pid": 76337, "tid": -914061504, "ts": 1716454224535814, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224641998, "dur": 93, "args": { "External id": 203464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203464, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203464, "pid": 5, "tid": 7, "ts": 1716454224641998, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535865, "dur": 10, "args": { "External id": 203464, "cbid": 211, "correlation": 203464 } }, { "ph": "s", "id": 203464, "pid": 76337, "tid": -914061504, "ts": 1716454224535865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224535951, "dur": 1, "args": { "External id": 203480, "cbid": 251, "correlation": 203480 } }, { "ph": "f", "id": 203480, "pid": 76337, "tid": -914061504, "ts": 1716454224535951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224642092, "dur": 577, "args": { "External id": 203482, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203482, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203482, "pid": 5, "tid": 7, "ts": 1716454224642092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224535956, "dur": 12, "args": { "External id": 203482, "cbid": 211, "correlation": 203482 } }, { "ph": "s", "id": 203482, "pid": 76337, "tid": -914061504, "ts": 1716454224535956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224642671, "dur": 245, "args": { "External id": 203490, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203490, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203490, "pid": 5, "tid": 7, "ts": 1716454224642671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536029, "dur": 13, "args": { "External id": 203490, "cbid": 211, "correlation": 203490 } }, { "ph": "s", "id": 203490, "pid": 76337, "tid": -914061504, "ts": 1716454224536029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224642918, "dur": 253, "args": { "External id": 203498, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203498, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203498, "pid": 5, "tid": 7, "ts": 1716454224642918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536060, "dur": 8, "args": { "External id": 203498, "cbid": 211, "correlation": 203498 } }, { "ph": "s", "id": 203498, "pid": 76337, "tid": -914061504, "ts": 1716454224536060, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536143, "dur": 1, "args": { "External id": 203514, "cbid": 251, "correlation": 203514 } }, { "ph": "f", "id": 203514, "pid": 76337, "tid": -914061504, "ts": 1716454224536143, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536148, "dur": 0, "args": { "External id": 203516, "cbid": 251, "correlation": 203516 } }, { "ph": "f", "id": 203516, "pid": 76337, "tid": -914061504, "ts": 1716454224536148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224643172, "dur": 360, "args": { "External id": 203517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203517, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 203517, "pid": 5, "tid": 7, "ts": 1716454224643172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536151, "dur": 12, "args": { "External id": 203517, "cbid": 211, "correlation": 203517 } }, { "ph": "s", "id": 203517, "pid": 76337, "tid": -914061504, "ts": 1716454224536151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224643533, "dur": 50, "args": { "External id": 203525, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203525, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203525, "pid": 5, "tid": 7, "ts": 1716454224643533, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536193, "dur": 10, "args": { "External id": 203525, "cbid": 211, "correlation": 203525 } }, { "ph": "s", "id": 203525, "pid": 76337, "tid": -914061504, "ts": 1716454224536193, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224643584, "dur": 159, "args": { "External id": 203536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203536, "pid": 5, "tid": 7, "ts": 1716454224643584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536260, "dur": 12, "args": { "External id": 203536, "cbid": 211, "correlation": 203536 } }, { "ph": "s", "id": 203536, "pid": 76337, "tid": -914061504, "ts": 1716454224536260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224536324, "dur": 0, "args": { "External id": 203548, "cbid": 317, "correlation": 203548 } }, { "ph": "f", "id": 203548, "pid": 76337, "tid": -914061504, "ts": 1716454224536324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224536325, "dur": 0, "args": { "External id": 203549, "cbid": 203, "correlation": 203549 } }, { "ph": "f", "id": 203549, "pid": 76337, "tid": -914061504, "ts": 1716454224536325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224536326, "dur": 0, "args": { "External id": 203550, "cbid": 205, "correlation": 203550 } }, { "ph": "f", "id": 203550, "pid": 76337, "tid": -914061504, "ts": 1716454224536326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536350, "dur": 1, "args": { "External id": 203554, "cbid": 251, "correlation": 203554 } }, { "ph": "f", "id": 203554, "pid": 76337, "tid": -914061504, "ts": 1716454224536350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536352, "dur": 0, "args": { "External id": 203555, "cbid": 251, "correlation": 203555 } }, { "ph": "f", "id": 203555, "pid": 76337, "tid": -914061504, "ts": 1716454224536352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536353, "dur": 0, "args": { "External id": 203556, "cbid": 251, "correlation": 203556 } }, { "ph": "f", "id": 203556, "pid": 76337, "tid": -914061504, "ts": 1716454224536353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536353, "dur": 0, "args": { "External id": 203557, "cbid": 251, "correlation": 203557 } }, { "ph": "f", "id": 203557, "pid": 76337, "tid": -914061504, "ts": 1716454224536353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536354, "dur": 0, "args": { "External id": 203558, "cbid": 251, "correlation": 203558 } }, { "ph": "f", "id": 203558, "pid": 76337, "tid": -914061504, "ts": 1716454224536354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536355, "dur": 0, "args": { "External id": 203559, "cbid": 251, "correlation": 203559 } }, { "ph": "f", "id": 203559, "pid": 76337, "tid": -914061504, "ts": 1716454224536355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536356, "dur": 0, "args": { "External id": 203560, "cbid": 251, "correlation": 203560 } }, { "ph": "f", "id": 203560, "pid": 76337, "tid": -914061504, "ts": 1716454224536356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536357, "dur": 0, "args": { "External id": 203561, "cbid": 251, "correlation": 203561 } }, { "ph": "f", "id": 203561, "pid": 76337, "tid": -914061504, "ts": 1716454224536357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536358, "dur": 0, "args": { "External id": 203562, "cbid": 251, "correlation": 203562 } }, { "ph": "f", "id": 203562, "pid": 76337, "tid": -914061504, "ts": 1716454224536358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224643745, "dur": 115, "args": { "External id": 203563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203563, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203563, "pid": 5, "tid": 7, "ts": 1716454224643745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536360, "dur": 12, "args": { "External id": 203563, "cbid": 211, "correlation": 203563 } }, { "ph": "s", "id": 203563, "pid": 76337, "tid": -914061504, "ts": 1716454224536360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224643861, "dur": 60, "args": { "External id": 203569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203569, "pid": 5, "tid": 7, "ts": 1716454224643861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536396, "dur": 8, "args": { "External id": 203569, "cbid": 211, "correlation": 203569 } }, { "ph": "s", "id": 203569, "pid": 76337, "tid": -914061504, "ts": 1716454224536396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224643922, "dur": 50, "args": { "External id": 203577, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203577, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203577, "pid": 5, "tid": 7, "ts": 1716454224643922, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536427, "dur": 9, "args": { "External id": 203577, "cbid": 211, "correlation": 203577 } }, { "ph": "s", "id": 203577, "pid": 76337, "tid": -914061504, "ts": 1716454224536427, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224643973, "dur": 98, "args": { "External id": 203586, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203586, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203586, "pid": 5, "tid": 7, "ts": 1716454224643973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536466, "dur": 10, "args": { "External id": 203586, "cbid": 211, "correlation": 203586 } }, { "ph": "s", "id": 203586, "pid": 76337, "tid": -914061504, "ts": 1716454224536466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224644072, "dur": 92, "args": { "External id": 203606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203606, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 203606, "pid": 5, "tid": 7, "ts": 1716454224644072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536539, "dur": 11, "args": { "External id": 203606, "cbid": 211, "correlation": 203606 } }, { "ph": "s", "id": 203606, "pid": 76337, "tid": -914061504, "ts": 1716454224536539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224644165, "dur": 4, "args": { "External id": 203618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203618, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 203618, "pid": 5, "tid": 7, "ts": 1716454224644165, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536560, "dur": 6, "args": { "External id": 203618, "cbid": 211, "correlation": 203618 } }, { "ph": "s", "id": 203618, "pid": 76337, "tid": -914061504, "ts": 1716454224536560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224644171, "dur": 108, "args": { "External id": 203621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203621, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203621, "pid": 5, "tid": 7, "ts": 1716454224644171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536579, "dur": 7, "args": { "External id": 203621, "cbid": 211, "correlation": 203621 } }, { "ph": "s", "id": 203621, "pid": 76337, "tid": -914061504, "ts": 1716454224536579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224644281, "dur": 70, "args": { "External id": 203630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203630, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203630, "pid": 5, "tid": 7, "ts": 1716454224644281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536618, "dur": 10, "args": { "External id": 203630, "cbid": 211, "correlation": 203630 } }, { "ph": "s", "id": 203630, "pid": 76337, "tid": -914061504, "ts": 1716454224536618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224536669, "dur": 0, "args": { "External id": 203640, "cbid": 317, "correlation": 203640 } }, { "ph": "f", "id": 203640, "pid": 76337, "tid": -914061504, "ts": 1716454224536669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224536670, "dur": 0, "args": { "External id": 203641, "cbid": 203, "correlation": 203641 } }, { "ph": "f", "id": 203641, "pid": 76337, "tid": -914061504, "ts": 1716454224536670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224536671, "dur": 0, "args": { "External id": 203642, "cbid": 205, "correlation": 203642 } }, { "ph": "f", "id": 203642, "pid": 76337, "tid": -914061504, "ts": 1716454224536671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224644352, "dur": 75, "args": { "External id": 203646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203646, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203646, "pid": 5, "tid": 7, "ts": 1716454224644352, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536687, "dur": 11, "args": { "External id": 203646, "cbid": 211, "correlation": 203646 } }, { "ph": "s", "id": 203646, "pid": 76337, "tid": -914061504, "ts": 1716454224536687, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224644429, "dur": 24, "args": { "External id": 203648, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203648, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203648, "pid": 5, "tid": 7, "ts": 1716454224644429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536701, "dur": 5, "args": { "External id": 203648, "cbid": 211, "correlation": 203648 } }, { "ph": "s", "id": 203648, "pid": 76337, "tid": -914061504, "ts": 1716454224536701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224644454, "dur": 4, "args": { "External id": 203650, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203650, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203650, "pid": 5, "tid": 7, "ts": 1716454224644454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536712, "dur": 5, "args": { "External id": 203650, "cbid": 211, "correlation": 203650 } }, { "ph": "s", "id": 203650, "pid": 76337, "tid": -914061504, "ts": 1716454224536712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224536720, "dur": 0, "args": { "External id": 203651, "cbid": 51, "correlation": 203651 } }, { "ph": "s", "id": 203651, "pid": 76337, "tid": -914061504, "ts": 1716454224536720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224644459, "dur": 1371, "args": { "External id": 203652, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203652, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203652, "pid": 5, "tid": 7, "ts": 1716454224644459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536721, "dur": 5, "args": { "External id": 203652, "cbid": 211, "correlation": 203652 } }, { "ph": "s", "id": 203652, "pid": 76337, "tid": -914061504, "ts": 1716454224536721, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224645832, "dur": 59, "args": { "External id": 203657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203657, "pid": 5, "tid": 7, "ts": 1716454224645832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536749, "dur": 8, "args": { "External id": 203657, "cbid": 211, "correlation": 203657 } }, { "ph": "s", "id": 203657, "pid": 76337, "tid": -914061504, "ts": 1716454224536749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224645893, "dur": 3, "args": { "External id": 203665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203665, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203665, "pid": 5, "tid": 7, "ts": 1716454224645893, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536792, "dur": 9, "args": { "External id": 203665, "cbid": 211, "correlation": 203665 } }, { "ph": "s", "id": 203665, "pid": 76337, "tid": -914061504, "ts": 1716454224536792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536857, "dur": 1, "args": { "External id": 203681, "cbid": 251, "correlation": 203681 } }, { "ph": "f", "id": 203681, "pid": 76337, "tid": -914061504, "ts": 1716454224536857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224536862, "dur": 0, "args": { "External id": 203683, "cbid": 251, "correlation": 203683 } }, { "ph": "f", "id": 203683, "pid": 76337, "tid": -914061504, "ts": 1716454224536862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224645898, "dur": 11, "args": { "External id": 203684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203684, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 203684, "pid": 5, "tid": 7, "ts": 1716454224645898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536864, "dur": 11, "args": { "External id": 203684, "cbid": 211, "correlation": 203684 } }, { "ph": "s", "id": 203684, "pid": 76337, "tid": -914061504, "ts": 1716454224536864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224645910, "dur": 5, "args": { "External id": 203686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203686, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 203686, "pid": 5, "tid": 7, "ts": 1716454224645910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536878, "dur": 6, "args": { "External id": 203686, "cbid": 211, "correlation": 203686 } }, { "ph": "s", "id": 203686, "pid": 76337, "tid": -914061504, "ts": 1716454224536878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224645917, "dur": 55, "args": { "External id": 203696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203696, "pid": 5, "tid": 7, "ts": 1716454224645917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224536936, "dur": 12, "args": { "External id": 203696, "cbid": 211, "correlation": 203696 } }, { "ph": "s", "id": 203696, "pid": 76337, "tid": -914061504, "ts": 1716454224536936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224645973, "dur": 52, "args": { "External id": 203716, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203716, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 203716, "pid": 5, "tid": 7, "ts": 1716454224645973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537010, "dur": 12, "args": { "External id": 203716, "cbid": 211, "correlation": 203716 } }, { "ph": "s", "id": 203716, "pid": 76337, "tid": -914061504, "ts": 1716454224537010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224646027, "dur": 4, "args": { "External id": 203728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203728, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203728, "pid": 5, "tid": 7, "ts": 1716454224646027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537032, "dur": 7, "args": { "External id": 203728, "cbid": 211, "correlation": 203728 } }, { "ph": "s", "id": 203728, "pid": 76337, "tid": -914061504, "ts": 1716454224537032, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224646032, "dur": 54, "args": { "External id": 203731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203731, "pid": 5, "tid": 7, "ts": 1716454224646032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537051, "dur": 6, "args": { "External id": 203731, "cbid": 211, "correlation": 203731 } }, { "ph": "s", "id": 203731, "pid": 76337, "tid": -914061504, "ts": 1716454224537051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224646087, "dur": 36, "args": { "External id": 203740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203740, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203740, "pid": 5, "tid": 7, "ts": 1716454224646087, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537092, "dur": 10, "args": { "External id": 203740, "cbid": 211, "correlation": 203740 } }, { "ph": "s", "id": 203740, "pid": 76337, "tid": -914061504, "ts": 1716454224537092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224537155, "dur": 0, "args": { "External id": 203750, "cbid": 317, "correlation": 203750 } }, { "ph": "f", "id": 203750, "pid": 76337, "tid": -914061504, "ts": 1716454224537155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224537155, "dur": 0, "args": { "External id": 203751, "cbid": 203, "correlation": 203751 } }, { "ph": "f", "id": 203751, "pid": 76337, "tid": -914061504, "ts": 1716454224537155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224537156, "dur": 0, "args": { "External id": 203752, "cbid": 205, "correlation": 203752 } }, { "ph": "f", "id": 203752, "pid": 76337, "tid": -914061504, "ts": 1716454224537156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224646125, "dur": 41, "args": { "External id": 203756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203756, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203756, "pid": 5, "tid": 7, "ts": 1716454224646125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537170, "dur": 12, "args": { "External id": 203756, "cbid": 211, "correlation": 203756 } }, { "ph": "s", "id": 203756, "pid": 76337, "tid": -914061504, "ts": 1716454224537170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224646167, "dur": 14, "args": { "External id": 203758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203758, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203758, "pid": 5, "tid": 7, "ts": 1716454224646167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537185, "dur": 6, "args": { "External id": 203758, "cbid": 211, "correlation": 203758 } }, { "ph": "s", "id": 203758, "pid": 76337, "tid": -914061504, "ts": 1716454224537185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224646183, "dur": 3, "args": { "External id": 203760, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203760, "pid": 5, "tid": 7, "ts": 1716454224646183, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537195, "dur": 5, "args": { "External id": 203760, "cbid": 211, "correlation": 203760 } }, { "ph": "s", "id": 203760, "pid": 76337, "tid": -914061504, "ts": 1716454224537195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224537203, "dur": 0, "args": { "External id": 203761, "cbid": 51, "correlation": 203761 } }, { "ph": "s", "id": 203761, "pid": 76337, "tid": -914061504, "ts": 1716454224537203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224646187, "dur": 701, "args": { "External id": 203762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203762, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203762, "pid": 5, "tid": 7, "ts": 1716454224646187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537204, "dur": 5, "args": { "External id": 203762, "cbid": 211, "correlation": 203762 } }, { "ph": "s", "id": 203762, "pid": 76337, "tid": -914061504, "ts": 1716454224537204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224646890, "dur": 60, "args": { "External id": 203767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203767, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203767, "pid": 5, "tid": 7, "ts": 1716454224646890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537231, "dur": 9, "args": { "External id": 203767, "cbid": 211, "correlation": 203767 } }, { "ph": "s", "id": 203767, "pid": 76337, "tid": -914061504, "ts": 1716454224537231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224537289, "dur": 0, "args": { "External id": 203777, "cbid": 317, "correlation": 203777 } }, { "ph": "f", "id": 203777, "pid": 76337, "tid": -914061504, "ts": 1716454224537289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224537290, "dur": 0, "args": { "External id": 203778, "cbid": 203, "correlation": 203778 } }, { "ph": "f", "id": 203778, "pid": 76337, "tid": -914061504, "ts": 1716454224537290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224537291, "dur": 0, "args": { "External id": 203779, "cbid": 205, "correlation": 203779 } }, { "ph": "f", "id": 203779, "pid": 76337, "tid": -914061504, "ts": 1716454224537291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224646951, "dur": 77, "args": { "External id": 203783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203783, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203783, "pid": 5, "tid": 7, "ts": 1716454224646951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537303, "dur": 11, "args": { "External id": 203783, "cbid": 211, "correlation": 203783 } }, { "ph": "s", "id": 203783, "pid": 76337, "tid": -914061504, "ts": 1716454224537303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224647029, "dur": 211, "args": { "External id": 203785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203785, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203785, "pid": 5, "tid": 7, "ts": 1716454224647029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537322, "dur": 7, "args": { "External id": 203785, "cbid": 211, "correlation": 203785 } }, { "ph": "s", "id": 203785, "pid": 76337, "tid": -914061504, "ts": 1716454224537322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224647241, "dur": 39, "args": { "External id": 203787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203787, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203787, "pid": 5, "tid": 7, "ts": 1716454224647241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537334, "dur": 145, "args": { "External id": 203787, "cbid": 211, "correlation": 203787 } }, { "ph": "s", "id": 203787, "pid": 76337, "tid": -914061504, "ts": 1716454224537334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224647282, "dur": 60, "args": { "External id": 203793, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203793, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203793, "pid": 5, "tid": 7, "ts": 1716454224647282, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537500, "dur": 9, "args": { "External id": 203793, "cbid": 211, "correlation": 203793 } }, { "ph": "s", "id": 203793, "pid": 76337, "tid": -914061504, "ts": 1716454224537500, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224647344, "dur": 50, "args": { "External id": 203801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203801, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203801, "pid": 5, "tid": 7, "ts": 1716454224647344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537530, "dur": 8, "args": { "External id": 203801, "cbid": 211, "correlation": 203801 } }, { "ph": "s", "id": 203801, "pid": 76337, "tid": -914061504, "ts": 1716454224537530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224647395, "dur": 36, "args": { "External id": 203809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203809, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203809, "pid": 5, "tid": 7, "ts": 1716454224647395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537560, "dur": 29, "args": { "External id": 203809, "cbid": 211, "correlation": 203809 } }, { "ph": "s", "id": 203809, "pid": 76337, "tid": -914061504, "ts": 1716454224537560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224647432, "dur": 51, "args": { "External id": 203829, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203829, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 203829, "pid": 5, "tid": 7, "ts": 1716454224647432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537668, "dur": 13, "args": { "External id": 203829, "cbid": 211, "correlation": 203829 } }, { "ph": "s", "id": 203829, "pid": 76337, "tid": -914061504, "ts": 1716454224537668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224647485, "dur": 4, "args": { "External id": 203841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203841, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 203841, "pid": 5, "tid": 7, "ts": 1716454224647485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537690, "dur": 6, "args": { "External id": 203841, "cbid": 211, "correlation": 203841 } }, { "ph": "s", "id": 203841, "pid": 76337, "tid": -914061504, "ts": 1716454224537690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224647491, "dur": 55, "args": { "External id": 203844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203844, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203844, "pid": 5, "tid": 7, "ts": 1716454224647491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537709, "dur": 7, "args": { "External id": 203844, "cbid": 211, "correlation": 203844 } }, { "ph": "s", "id": 203844, "pid": 76337, "tid": -914061504, "ts": 1716454224537709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224537767, "dur": 0, "args": { "External id": 203855, "cbid": 317, "correlation": 203855 } }, { "ph": "f", "id": 203855, "pid": 76337, "tid": -914061504, "ts": 1716454224537767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224537768, "dur": 0, "args": { "External id": 203856, "cbid": 203, "correlation": 203856 } }, { "ph": "f", "id": 203856, "pid": 76337, "tid": -914061504, "ts": 1716454224537768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224537769, "dur": 0, "args": { "External id": 203857, "cbid": 205, "correlation": 203857 } }, { "ph": "f", "id": 203857, "pid": 76337, "tid": -914061504, "ts": 1716454224537769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537790, "dur": 1, "args": { "External id": 203861, "cbid": 251, "correlation": 203861 } }, { "ph": "f", "id": 203861, "pid": 76337, "tid": -914061504, "ts": 1716454224537790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537792, "dur": 0, "args": { "External id": 203862, "cbid": 251, "correlation": 203862 } }, { "ph": "f", "id": 203862, "pid": 76337, "tid": -914061504, "ts": 1716454224537792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537792, "dur": 0, "args": { "External id": 203863, "cbid": 251, "correlation": 203863 } }, { "ph": "f", "id": 203863, "pid": 76337, "tid": -914061504, "ts": 1716454224537792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537793, "dur": 0, "args": { "External id": 203864, "cbid": 251, "correlation": 203864 } }, { "ph": "f", "id": 203864, "pid": 76337, "tid": -914061504, "ts": 1716454224537793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537794, "dur": 0, "args": { "External id": 203865, "cbid": 251, "correlation": 203865 } }, { "ph": "f", "id": 203865, "pid": 76337, "tid": -914061504, "ts": 1716454224537794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537794, "dur": 0, "args": { "External id": 203866, "cbid": 251, "correlation": 203866 } }, { "ph": "f", "id": 203866, "pid": 76337, "tid": -914061504, "ts": 1716454224537794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537795, "dur": 0, "args": { "External id": 203867, "cbid": 251, "correlation": 203867 } }, { "ph": "f", "id": 203867, "pid": 76337, "tid": -914061504, "ts": 1716454224537795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537796, "dur": 0, "args": { "External id": 203868, "cbid": 251, "correlation": 203868 } }, { "ph": "f", "id": 203868, "pid": 76337, "tid": -914061504, "ts": 1716454224537796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224537797, "dur": 0, "args": { "External id": 203869, "cbid": 251, "correlation": 203869 } }, { "ph": "f", "id": 203869, "pid": 76337, "tid": -914061504, "ts": 1716454224537797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224647547, "dur": 112, "args": { "External id": 203870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203870, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203870, "pid": 5, "tid": 7, "ts": 1716454224647547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537799, "dur": 13, "args": { "External id": 203870, "cbid": 211, "correlation": 203870 } }, { "ph": "s", "id": 203870, "pid": 76337, "tid": -914061504, "ts": 1716454224537799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224647661, "dur": 59, "args": { "External id": 203876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203876, "pid": 5, "tid": 7, "ts": 1716454224647661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537835, "dur": 9, "args": { "External id": 203876, "cbid": 211, "correlation": 203876 } }, { "ph": "s", "id": 203876, "pid": 76337, "tid": -914061504, "ts": 1716454224537835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224647721, "dur": 577, "args": { "External id": 203885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203885, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203885, "pid": 5, "tid": 7, "ts": 1716454224647721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537918, "dur": 14, "args": { "External id": 203885, "cbid": 211, "correlation": 203885 } }, { "ph": "s", "id": 203885, "pid": 76337, "tid": -914061504, "ts": 1716454224537918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224648299, "dur": 182, "args": { "External id": 203907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203907, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203907, "pid": 5, "tid": 7, "ts": 1716454224648299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224537983, "dur": 11, "args": { "External id": 203907, "cbid": 211, "correlation": 203907 } }, { "ph": "s", "id": 203907, "pid": 76337, "tid": -914061504, "ts": 1716454224537983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224538072, "dur": 1, "args": { "External id": 203918, "cbid": 251, "correlation": 203918 } }, { "ph": "f", "id": 203918, "pid": 76337, "tid": -914061504, "ts": 1716454224538072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224648483, "dur": 196, "args": { "External id": 203919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203919, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203919, "pid": 5, "tid": 7, "ts": 1716454224648483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538077, "dur": 13, "args": { "External id": 203919, "cbid": 211, "correlation": 203919 } }, { "ph": "s", "id": 203919, "pid": 76337, "tid": -914061504, "ts": 1716454224538077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224538144, "dur": 1, "args": { "External id": 203930, "cbid": 251, "correlation": 203930 } }, { "ph": "f", "id": 203930, "pid": 76337, "tid": -914061504, "ts": 1716454224538144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224648681, "dur": 190, "args": { "External id": 203931, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203931, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203931, "pid": 5, "tid": 7, "ts": 1716454224648681, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538148, "dur": 11, "args": { "External id": 203931, "cbid": 211, "correlation": 203931 } }, { "ph": "s", "id": 203931, "pid": 76337, "tid": -914061504, "ts": 1716454224538148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224538212, "dur": 1, "args": { "External id": 203942, "cbid": 251, "correlation": 203942 } }, { "ph": "f", "id": 203942, "pid": 76337, "tid": -914061504, "ts": 1716454224538212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224648872, "dur": 189, "args": { "External id": 203943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203943, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203943, "pid": 5, "tid": 7, "ts": 1716454224648872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538216, "dur": 11, "args": { "External id": 203943, "cbid": 211, "correlation": 203943 } }, { "ph": "s", "id": 203943, "pid": 76337, "tid": -914061504, "ts": 1716454224538216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224649062, "dur": 18775, "args": { "External id": 203964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203964, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 203964, "pid": 5, "tid": 7, "ts": 1716454224649062, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538296, "dur": 12, "args": { "External id": 203964, "cbid": 211, "correlation": 203964 } }, { "ph": "s", "id": 203964, "pid": 76337, "tid": -914061504, "ts": 1716454224538296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224538392, "dur": 1, "args": { "External id": 203982, "cbid": 251, "correlation": 203982 } }, { "ph": "f", "id": 203982, "pid": 76337, "tid": -914061504, "ts": 1716454224538392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224667838, "dur": 200, "args": { "External id": 203984, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203984, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 203984, "pid": 5, "tid": 7, "ts": 1716454224667838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538397, "dur": 13, "args": { "External id": 203984, "cbid": 211, "correlation": 203984 } }, { "ph": "s", "id": 203984, "pid": 76337, "tid": -914061504, "ts": 1716454224538397, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224668040, "dur": 66, "args": { "External id": 203992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 203992, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 203992, "pid": 5, "tid": 7, "ts": 1716454224668040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538467, "dur": 12, "args": { "External id": 203992, "cbid": 211, "correlation": 203992 } }, { "ph": "s", "id": 203992, "pid": 76337, "tid": -914061504, "ts": 1716454224538467, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224668108, "dur": 97, "args": { "External id": 204000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204000, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204000, "pid": 5, "tid": 7, "ts": 1716454224668108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538506, "dur": 9, "args": { "External id": 204000, "cbid": 211, "correlation": 204000 } }, { "ph": "s", "id": 204000, "pid": 76337, "tid": -914061504, "ts": 1716454224538506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224668206, "dur": 54, "args": { "External id": 204011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204011, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204011, "pid": 5, "tid": 7, "ts": 1716454224668206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538579, "dur": 50, "args": { "External id": 204011, "cbid": 211, "correlation": 204011 } }, { "ph": "s", "id": 204011, "pid": 76337, "tid": -914061504, "ts": 1716454224538579, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224668261, "dur": 93, "args": { "External id": 204033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204033, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204033, "pid": 5, "tid": 7, "ts": 1716454224668261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224538648, "dur": 1928, "args": { "External id": 204033, "cbid": 211, "correlation": 204033 } }, { "ph": "s", "id": 204033, "pid": 76337, "tid": -914061504, "ts": 1716454224538648, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224540654, "dur": 1, "args": { "External id": 204044, "cbid": 251, "correlation": 204044 } }, { "ph": "f", "id": 204044, "pid": 76337, "tid": -914061504, "ts": 1716454224540654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224668355, "dur": 105, "args": { "External id": 204045, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204045, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204045, "pid": 5, "tid": 7, "ts": 1716454224668355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540659, "dur": 65, "args": { "External id": 204045, "cbid": 211, "correlation": 204045 } }, { "ph": "s", "id": 204045, "pid": 76337, "tid": -914061504, "ts": 1716454224540659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224540784, "dur": 1, "args": { "External id": 204056, "cbid": 251, "correlation": 204056 } }, { "ph": "f", "id": 204056, "pid": 76337, "tid": -914061504, "ts": 1716454224540784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224540788, "dur": 0, "args": { "External id": 204057, "cbid": 251, "correlation": 204057 } }, { "ph": "f", "id": 204057, "pid": 76337, "tid": -914061504, "ts": 1716454224540788, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224668461, "dur": 10, "args": { "External id": 204058, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204058, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 204058, "pid": 5, "tid": 7, "ts": 1716454224668461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540789, "dur": 12, "args": { "External id": 204058, "cbid": 211, "correlation": 204058 } }, { "ph": "s", "id": 204058, "pid": 76337, "tid": -914061504, "ts": 1716454224540789, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224668473, "dur": 5, "args": { "External id": 204060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204060, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204060, "pid": 5, "tid": 7, "ts": 1716454224668473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540803, "dur": 6, "args": { "External id": 204060, "cbid": 211, "correlation": 204060 } }, { "ph": "s", "id": 204060, "pid": 76337, "tid": -914061504, "ts": 1716454224540803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224540863, "dur": 1, "args": { "External id": 204071, "cbid": 251, "correlation": 204071 } }, { "ph": "f", "id": 204071, "pid": 76337, "tid": -914061504, "ts": 1716454224540863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224540867, "dur": 0, "args": { "External id": 204072, "cbid": 251, "correlation": 204072 } }, { "ph": "f", "id": 204072, "pid": 76337, "tid": -914061504, "ts": 1716454224540867, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224668480, "dur": 6, "args": { "External id": 204073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204073, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 204073, "pid": 5, "tid": 7, "ts": 1716454224668480, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540868, "dur": 12, "args": { "External id": 204073, "cbid": 211, "correlation": 204073 } }, { "ph": "s", "id": 204073, "pid": 76337, "tid": -914061504, "ts": 1716454224540868, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224668487, "dur": 3, "args": { "External id": 204075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204075, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204075, "pid": 5, "tid": 7, "ts": 1716454224668487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540882, "dur": 6, "args": { "External id": 204075, "cbid": 211, "correlation": 204075 } }, { "ph": "s", "id": 204075, "pid": 76337, "tid": -914061504, "ts": 1716454224540882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224668492, "dur": 157, "args": { "External id": 204096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204096, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204096, "pid": 5, "tid": 7, "ts": 1716454224668492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224540955, "dur": 16, "args": { "External id": 204096, "cbid": 211, "correlation": 204096 } }, { "ph": "s", "id": 204096, "pid": 76337, "tid": -914061504, "ts": 1716454224540955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541064, "dur": 1, "args": { "External id": 204114, "cbid": 251, "correlation": 204114 } }, { "ph": "f", "id": 204114, "pid": 76337, "tid": -914061504, "ts": 1716454224541064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224668650, "dur": 107, "args": { "External id": 204116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204116, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204116, "pid": 5, "tid": 7, "ts": 1716454224668650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541070, "dur": 14, "args": { "External id": 204116, "cbid": 211, "correlation": 204116 } }, { "ph": "s", "id": 204116, "pid": 76337, "tid": -914061504, "ts": 1716454224541070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224668758, "dur": 35, "args": { "External id": 204124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204124, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204124, "pid": 5, "tid": 7, "ts": 1716454224668758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541141, "dur": 12, "args": { "External id": 204124, "cbid": 211, "correlation": 204124 } }, { "ph": "s", "id": 204124, "pid": 76337, "tid": -914061504, "ts": 1716454224541141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224668795, "dur": 66, "args": { "External id": 204132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204132, "pid": 5, "tid": 7, "ts": 1716454224668795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541183, "dur": 9, "args": { "External id": 204132, "cbid": 211, "correlation": 204132 } }, { "ph": "s", "id": 204132, "pid": 76337, "tid": -914061504, "ts": 1716454224541183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224668862, "dur": 93, "args": { "External id": 204154, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204154, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204154, "pid": 5, "tid": 7, "ts": 1716454224668862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541233, "dur": 10, "args": { "External id": 204154, "cbid": 211, "correlation": 204154 } }, { "ph": "s", "id": 204154, "pid": 76337, "tid": -914061504, "ts": 1716454224541233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541319, "dur": 1, "args": { "External id": 204170, "cbid": 251, "correlation": 204170 } }, { "ph": "f", "id": 204170, "pid": 76337, "tid": -914061504, "ts": 1716454224541319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224668956, "dur": 579, "args": { "External id": 204172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204172, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204172, "pid": 5, "tid": 7, "ts": 1716454224668956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541325, "dur": 13, "args": { "External id": 204172, "cbid": 211, "correlation": 204172 } }, { "ph": "s", "id": 204172, "pid": 76337, "tid": -914061504, "ts": 1716454224541325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224669536, "dur": 243, "args": { "External id": 204180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204180, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204180, "pid": 5, "tid": 7, "ts": 1716454224669536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541390, "dur": 12, "args": { "External id": 204180, "cbid": 211, "correlation": 204180 } }, { "ph": "s", "id": 204180, "pid": 76337, "tid": -914061504, "ts": 1716454224541390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224669781, "dur": 251, "args": { "External id": 204188, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204188, "pid": 5, "tid": 7, "ts": 1716454224669781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541420, "dur": 9, "args": { "External id": 204188, "cbid": 211, "correlation": 204188 } }, { "ph": "s", "id": 204188, "pid": 76337, "tid": -914061504, "ts": 1716454224541420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541501, "dur": 1, "args": { "External id": 204204, "cbid": 251, "correlation": 204204 } }, { "ph": "f", "id": 204204, "pid": 76337, "tid": -914061504, "ts": 1716454224541501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541506, "dur": 0, "args": { "External id": 204206, "cbid": 251, "correlation": 204206 } }, { "ph": "f", "id": 204206, "pid": 76337, "tid": -914061504, "ts": 1716454224541506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224670033, "dur": 360, "args": { "External id": 204207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204207, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 204207, "pid": 5, "tid": 7, "ts": 1716454224670033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541509, "dur": 12, "args": { "External id": 204207, "cbid": 211, "correlation": 204207 } }, { "ph": "s", "id": 204207, "pid": 76337, "tid": -914061504, "ts": 1716454224541509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224670395, "dur": 50, "args": { "External id": 204215, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204215, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204215, "pid": 5, "tid": 7, "ts": 1716454224670395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541551, "dur": 10, "args": { "External id": 204215, "cbid": 211, "correlation": 204215 } }, { "ph": "s", "id": 204215, "pid": 76337, "tid": -914061504, "ts": 1716454224541551, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224670446, "dur": 159, "args": { "External id": 204226, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204226, "pid": 5, "tid": 7, "ts": 1716454224670446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541617, "dur": 215, "args": { "External id": 204226, "cbid": 211, "correlation": 204226 } }, { "ph": "s", "id": 204226, "pid": 76337, "tid": -914061504, "ts": 1716454224541617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224541884, "dur": 0, "args": { "External id": 204238, "cbid": 317, "correlation": 204238 } }, { "ph": "f", "id": 204238, "pid": 76337, "tid": -914061504, "ts": 1716454224541884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224541885, "dur": 0, "args": { "External id": 204239, "cbid": 203, "correlation": 204239 } }, { "ph": "f", "id": 204239, "pid": 76337, "tid": -914061504, "ts": 1716454224541885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224541885, "dur": 0, "args": { "External id": 204240, "cbid": 205, "correlation": 204240 } }, { "ph": "f", "id": 204240, "pid": 76337, "tid": -914061504, "ts": 1716454224541885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541908, "dur": 1, "args": { "External id": 204244, "cbid": 251, "correlation": 204244 } }, { "ph": "f", "id": 204244, "pid": 76337, "tid": -914061504, "ts": 1716454224541908, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541910, "dur": 0, "args": { "External id": 204245, "cbid": 251, "correlation": 204245 } }, { "ph": "f", "id": 204245, "pid": 76337, "tid": -914061504, "ts": 1716454224541910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541911, "dur": 0, "args": { "External id": 204246, "cbid": 251, "correlation": 204246 } }, { "ph": "f", "id": 204246, "pid": 76337, "tid": -914061504, "ts": 1716454224541911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541912, "dur": 0, "args": { "External id": 204247, "cbid": 251, "correlation": 204247 } }, { "ph": "f", "id": 204247, "pid": 76337, "tid": -914061504, "ts": 1716454224541912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541912, "dur": 0, "args": { "External id": 204248, "cbid": 251, "correlation": 204248 } }, { "ph": "f", "id": 204248, "pid": 76337, "tid": -914061504, "ts": 1716454224541912, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541913, "dur": 0, "args": { "External id": 204249, "cbid": 251, "correlation": 204249 } }, { "ph": "f", "id": 204249, "pid": 76337, "tid": -914061504, "ts": 1716454224541913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541914, "dur": 0, "args": { "External id": 204250, "cbid": 251, "correlation": 204250 } }, { "ph": "f", "id": 204250, "pid": 76337, "tid": -914061504, "ts": 1716454224541914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541915, "dur": 0, "args": { "External id": 204251, "cbid": 251, "correlation": 204251 } }, { "ph": "f", "id": 204251, "pid": 76337, "tid": -914061504, "ts": 1716454224541915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224541916, "dur": 0, "args": { "External id": 204252, "cbid": 251, "correlation": 204252 } }, { "ph": "f", "id": 204252, "pid": 76337, "tid": -914061504, "ts": 1716454224541916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224670606, "dur": 115, "args": { "External id": 204253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204253, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204253, "pid": 5, "tid": 7, "ts": 1716454224670606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541918, "dur": 39, "args": { "External id": 204253, "cbid": 211, "correlation": 204253 } }, { "ph": "s", "id": 204253, "pid": 76337, "tid": -914061504, "ts": 1716454224541918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224670722, "dur": 61, "args": { "External id": 204259, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204259, "pid": 5, "tid": 7, "ts": 1716454224670722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224541988, "dur": 97, "args": { "External id": 204259, "cbid": 211, "correlation": 204259 } }, { "ph": "s", "id": 204259, "pid": 76337, "tid": -914061504, "ts": 1716454224541988, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224670784, "dur": 50, "args": { "External id": 204267, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204267, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204267, "pid": 5, "tid": 7, "ts": 1716454224670784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542109, "dur": 233, "args": { "External id": 204267, "cbid": 211, "correlation": 204267 } }, { "ph": "s", "id": 204267, "pid": 76337, "tid": -914061504, "ts": 1716454224542109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224670836, "dur": 99, "args": { "External id": 204276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204276, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204276, "pid": 5, "tid": 7, "ts": 1716454224670836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542374, "dur": 10, "args": { "External id": 204276, "cbid": 211, "correlation": 204276 } }, { "ph": "s", "id": 204276, "pid": 76337, "tid": -914061504, "ts": 1716454224542374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224670936, "dur": 93, "args": { "External id": 204296, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204296, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 204296, "pid": 5, "tid": 7, "ts": 1716454224670936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542446, "dur": 12, "args": { "External id": 204296, "cbid": 211, "correlation": 204296 } }, { "ph": "s", "id": 204296, "pid": 76337, "tid": -914061504, "ts": 1716454224542446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224671031, "dur": 5, "args": { "External id": 204308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204308, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204308, "pid": 5, "tid": 7, "ts": 1716454224671031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542468, "dur": 12, "args": { "External id": 204308, "cbid": 211, "correlation": 204308 } }, { "ph": "s", "id": 204308, "pid": 76337, "tid": -914061504, "ts": 1716454224542468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224671037, "dur": 108, "args": { "External id": 204311, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204311, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204311, "pid": 5, "tid": 7, "ts": 1716454224671037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542492, "dur": 109, "args": { "External id": 204311, "cbid": 211, "correlation": 204311 } }, { "ph": "s", "id": 204311, "pid": 76337, "tid": -914061504, "ts": 1716454224542492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224671147, "dur": 68, "args": { "External id": 204320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204320, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204320, "pid": 5, "tid": 7, "ts": 1716454224671147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542635, "dur": 10, "args": { "External id": 204320, "cbid": 211, "correlation": 204320 } }, { "ph": "s", "id": 204320, "pid": 76337, "tid": -914061504, "ts": 1716454224542635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224542686, "dur": 0, "args": { "External id": 204330, "cbid": 317, "correlation": 204330 } }, { "ph": "f", "id": 204330, "pid": 76337, "tid": -914061504, "ts": 1716454224542686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224542687, "dur": 0, "args": { "External id": 204331, "cbid": 203, "correlation": 204331 } }, { "ph": "f", "id": 204331, "pid": 76337, "tid": -914061504, "ts": 1716454224542687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224542688, "dur": 0, "args": { "External id": 204332, "cbid": 205, "correlation": 204332 } }, { "ph": "f", "id": 204332, "pid": 76337, "tid": -914061504, "ts": 1716454224542688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224671217, "dur": 76, "args": { "External id": 204336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204336, "pid": 5, "tid": 7, "ts": 1716454224671217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542703, "dur": 12, "args": { "External id": 204336, "cbid": 211, "correlation": 204336 } }, { "ph": "s", "id": 204336, "pid": 76337, "tid": -914061504, "ts": 1716454224542703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224671294, "dur": 24, "args": { "External id": 204338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204338, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204338, "pid": 5, "tid": 7, "ts": 1716454224671294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542717, "dur": 6, "args": { "External id": 204338, "cbid": 211, "correlation": 204338 } }, { "ph": "s", "id": 204338, "pid": 76337, "tid": -914061504, "ts": 1716454224542717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224671320, "dur": 4, "args": { "External id": 204340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204340, "pid": 5, "tid": 7, "ts": 1716454224671320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542729, "dur": 6, "args": { "External id": 204340, "cbid": 211, "correlation": 204340 } }, { "ph": "s", "id": 204340, "pid": 76337, "tid": -914061504, "ts": 1716454224542729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224542737, "dur": 0, "args": { "External id": 204341, "cbid": 51, "correlation": 204341 } }, { "ph": "s", "id": 204341, "pid": 76337, "tid": -914061504, "ts": 1716454224542737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224671325, "dur": 1372, "args": { "External id": 204342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204342, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204342, "pid": 5, "tid": 7, "ts": 1716454224671325, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542738, "dur": 5, "args": { "External id": 204342, "cbid": 211, "correlation": 204342 } }, { "ph": "s", "id": 204342, "pid": 76337, "tid": -914061504, "ts": 1716454224542738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224672698, "dur": 59, "args": { "External id": 204347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204347, "pid": 5, "tid": 7, "ts": 1716454224672698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542765, "dur": 9, "args": { "External id": 204347, "cbid": 211, "correlation": 204347 } }, { "ph": "s", "id": 204347, "pid": 76337, "tid": -914061504, "ts": 1716454224542765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224672758, "dur": 3, "args": { "External id": 204355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204355, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204355, "pid": 5, "tid": 7, "ts": 1716454224672758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542809, "dur": 9, "args": { "External id": 204355, "cbid": 211, "correlation": 204355 } }, { "ph": "s", "id": 204355, "pid": 76337, "tid": -914061504, "ts": 1716454224542809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224542876, "dur": 1, "args": { "External id": 204371, "cbid": 251, "correlation": 204371 } }, { "ph": "f", "id": 204371, "pid": 76337, "tid": -914061504, "ts": 1716454224542876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224542882, "dur": 0, "args": { "External id": 204373, "cbid": 251, "correlation": 204373 } }, { "ph": "f", "id": 204373, "pid": 76337, "tid": -914061504, "ts": 1716454224542882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224672763, "dur": 11, "args": { "External id": 204374, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204374, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 204374, "pid": 5, "tid": 7, "ts": 1716454224672763, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542884, "dur": 12, "args": { "External id": 204374, "cbid": 211, "correlation": 204374 } }, { "ph": "s", "id": 204374, "pid": 76337, "tid": -914061504, "ts": 1716454224542884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224672775, "dur": 5, "args": { "External id": 204376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204376, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204376, "pid": 5, "tid": 7, "ts": 1716454224672775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542898, "dur": 6, "args": { "External id": 204376, "cbid": 211, "correlation": 204376 } }, { "ph": "s", "id": 204376, "pid": 76337, "tid": -914061504, "ts": 1716454224542898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224672782, "dur": 54, "args": { "External id": 204386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204386, "pid": 5, "tid": 7, "ts": 1716454224672782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224542955, "dur": 553, "args": { "External id": 204386, "cbid": 211, "correlation": 204386 } }, { "ph": "s", "id": 204386, "pid": 76337, "tid": -914061504, "ts": 1716454224542955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224672837, "dur": 52, "args": { "External id": 204406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204406, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 204406, "pid": 5, "tid": 7, "ts": 1716454224672837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543565, "dur": 11, "args": { "External id": 204406, "cbid": 211, "correlation": 204406 } }, { "ph": "s", "id": 204406, "pid": 76337, "tid": -914061504, "ts": 1716454224543565, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224672890, "dur": 4, "args": { "External id": 204418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204418, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204418, "pid": 5, "tid": 7, "ts": 1716454224672890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543585, "dur": 7, "args": { "External id": 204418, "cbid": 211, "correlation": 204418 } }, { "ph": "s", "id": 204418, "pid": 76337, "tid": -914061504, "ts": 1716454224543585, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224672895, "dur": 54, "args": { "External id": 204421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204421, "pid": 5, "tid": 7, "ts": 1716454224672895, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543604, "dur": 7, "args": { "External id": 204421, "cbid": 211, "correlation": 204421 } }, { "ph": "s", "id": 204421, "pid": 76337, "tid": -914061504, "ts": 1716454224543604, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224672951, "dur": 36, "args": { "External id": 204430, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204430, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204430, "pid": 5, "tid": 7, "ts": 1716454224672951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543644, "dur": 10, "args": { "External id": 204430, "cbid": 211, "correlation": 204430 } }, { "ph": "s", "id": 204430, "pid": 76337, "tid": -914061504, "ts": 1716454224543644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224543707, "dur": 0, "args": { "External id": 204440, "cbid": 317, "correlation": 204440 } }, { "ph": "f", "id": 204440, "pid": 76337, "tid": -914061504, "ts": 1716454224543707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224543708, "dur": 0, "args": { "External id": 204441, "cbid": 203, "correlation": 204441 } }, { "ph": "f", "id": 204441, "pid": 76337, "tid": -914061504, "ts": 1716454224543708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224543709, "dur": 0, "args": { "External id": 204442, "cbid": 205, "correlation": 204442 } }, { "ph": "f", "id": 204442, "pid": 76337, "tid": -914061504, "ts": 1716454224543709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224672988, "dur": 40, "args": { "External id": 204446, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204446, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204446, "pid": 5, "tid": 7, "ts": 1716454224672988, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543723, "dur": 12, "args": { "External id": 204446, "cbid": 211, "correlation": 204446 } }, { "ph": "s", "id": 204446, "pid": 76337, "tid": -914061504, "ts": 1716454224543723, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224673029, "dur": 14, "args": { "External id": 204448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204448, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204448, "pid": 5, "tid": 7, "ts": 1716454224673029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543737, "dur": 5, "args": { "External id": 204448, "cbid": 211, "correlation": 204448 } }, { "ph": "s", "id": 204448, "pid": 76337, "tid": -914061504, "ts": 1716454224543737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224673045, "dur": 3, "args": { "External id": 204450, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204450, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204450, "pid": 5, "tid": 7, "ts": 1716454224673045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543747, "dur": 6, "args": { "External id": 204450, "cbid": 211, "correlation": 204450 } }, { "ph": "s", "id": 204450, "pid": 76337, "tid": -914061504, "ts": 1716454224543747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224543756, "dur": 0, "args": { "External id": 204451, "cbid": 51, "correlation": 204451 } }, { "ph": "s", "id": 204451, "pid": 76337, "tid": -914061504, "ts": 1716454224543756, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224673050, "dur": 702, "args": { "External id": 204452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204452, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204452, "pid": 5, "tid": 7, "ts": 1716454224673050, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543757, "dur": 5, "args": { "External id": 204452, "cbid": 211, "correlation": 204452 } }, { "ph": "s", "id": 204452, "pid": 76337, "tid": -914061504, "ts": 1716454224543757, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224673753, "dur": 60, "args": { "External id": 204457, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204457, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204457, "pid": 5, "tid": 7, "ts": 1716454224673753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543785, "dur": 9, "args": { "External id": 204457, "cbid": 211, "correlation": 204457 } }, { "ph": "s", "id": 204457, "pid": 76337, "tid": -914061504, "ts": 1716454224543785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224543842, "dur": 0, "args": { "External id": 204467, "cbid": 317, "correlation": 204467 } }, { "ph": "f", "id": 204467, "pid": 76337, "tid": -914061504, "ts": 1716454224543842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224543842, "dur": 0, "args": { "External id": 204468, "cbid": 203, "correlation": 204468 } }, { "ph": "f", "id": 204468, "pid": 76337, "tid": -914061504, "ts": 1716454224543842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224543843, "dur": 0, "args": { "External id": 204469, "cbid": 205, "correlation": 204469 } }, { "ph": "f", "id": 204469, "pid": 76337, "tid": -914061504, "ts": 1716454224543843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224673814, "dur": 75, "args": { "External id": 204473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204473, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204473, "pid": 5, "tid": 7, "ts": 1716454224673814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543855, "dur": 11, "args": { "External id": 204473, "cbid": 211, "correlation": 204473 } }, { "ph": "s", "id": 204473, "pid": 76337, "tid": -914061504, "ts": 1716454224543855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224673890, "dur": 206, "args": { "External id": 204475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204475, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204475, "pid": 5, "tid": 7, "ts": 1716454224673890, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543873, "dur": 6, "args": { "External id": 204475, "cbid": 211, "correlation": 204475 } }, { "ph": "s", "id": 204475, "pid": 76337, "tid": -914061504, "ts": 1716454224543873, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224674097, "dur": 39, "args": { "External id": 204477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204477, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204477, "pid": 5, "tid": 7, "ts": 1716454224674097, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543884, "dur": 5, "args": { "External id": 204477, "cbid": 211, "correlation": 204477 } }, { "ph": "s", "id": 204477, "pid": 76337, "tid": -914061504, "ts": 1716454224543884, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224674137, "dur": 60, "args": { "External id": 204483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204483, "pid": 5, "tid": 7, "ts": 1716454224674137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224543910, "dur": 516, "args": { "External id": 204483, "cbid": 211, "correlation": 204483 } }, { "ph": "s", "id": 204483, "pid": 76337, "tid": -914061504, "ts": 1716454224543910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224674198, "dur": 50, "args": { "External id": 204491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204491, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204491, "pid": 5, "tid": 7, "ts": 1716454224674198, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544449, "dur": 9, "args": { "External id": 204491, "cbid": 211, "correlation": 204491 } }, { "ph": "s", "id": 204491, "pid": 76337, "tid": -914061504, "ts": 1716454224544449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224674249, "dur": 35, "args": { "External id": 204499, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204499, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204499, "pid": 5, "tid": 7, "ts": 1716454224674249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544479, "dur": 8, "args": { "External id": 204499, "cbid": 211, "correlation": 204499 } }, { "ph": "s", "id": 204499, "pid": 76337, "tid": -914061504, "ts": 1716454224544479, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224674286, "dur": 50, "args": { "External id": 204519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204519, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 204519, "pid": 5, "tid": 7, "ts": 1716454224674286, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544561, "dur": 13, "args": { "External id": 204519, "cbid": 211, "correlation": 204519 } }, { "ph": "s", "id": 204519, "pid": 76337, "tid": -914061504, "ts": 1716454224544561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224674337, "dur": 4, "args": { "External id": 204531, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204531, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204531, "pid": 5, "tid": 7, "ts": 1716454224674337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544584, "dur": 6, "args": { "External id": 204531, "cbid": 211, "correlation": 204531 } }, { "ph": "s", "id": 204531, "pid": 76337, "tid": -914061504, "ts": 1716454224544584, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224674343, "dur": 55, "args": { "External id": 204534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204534, "pid": 5, "tid": 7, "ts": 1716454224674343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544601, "dur": 7, "args": { "External id": 204534, "cbid": 211, "correlation": 204534 } }, { "ph": "s", "id": 204534, "pid": 76337, "tid": -914061504, "ts": 1716454224544601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224544659, "dur": 0, "args": { "External id": 204545, "cbid": 317, "correlation": 204545 } }, { "ph": "f", "id": 204545, "pid": 76337, "tid": -914061504, "ts": 1716454224544659, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224544660, "dur": 0, "args": { "External id": 204546, "cbid": 203, "correlation": 204546 } }, { "ph": "f", "id": 204546, "pid": 76337, "tid": -914061504, "ts": 1716454224544660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224544660, "dur": 0, "args": { "External id": 204547, "cbid": 205, "correlation": 204547 } }, { "ph": "f", "id": 204547, "pid": 76337, "tid": -914061504, "ts": 1716454224544660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544682, "dur": 1, "args": { "External id": 204551, "cbid": 251, "correlation": 204551 } }, { "ph": "f", "id": 204551, "pid": 76337, "tid": -914061504, "ts": 1716454224544682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544684, "dur": 0, "args": { "External id": 204552, "cbid": 251, "correlation": 204552 } }, { "ph": "f", "id": 204552, "pid": 76337, "tid": -914061504, "ts": 1716454224544684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544685, "dur": 0, "args": { "External id": 204553, "cbid": 251, "correlation": 204553 } }, { "ph": "f", "id": 204553, "pid": 76337, "tid": -914061504, "ts": 1716454224544685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544686, "dur": 0, "args": { "External id": 204554, "cbid": 251, "correlation": 204554 } }, { "ph": "f", "id": 204554, "pid": 76337, "tid": -914061504, "ts": 1716454224544686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544686, "dur": 0, "args": { "External id": 204555, "cbid": 251, "correlation": 204555 } }, { "ph": "f", "id": 204555, "pid": 76337, "tid": -914061504, "ts": 1716454224544686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544687, "dur": 0, "args": { "External id": 204556, "cbid": 251, "correlation": 204556 } }, { "ph": "f", "id": 204556, "pid": 76337, "tid": -914061504, "ts": 1716454224544687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544688, "dur": 0, "args": { "External id": 204557, "cbid": 251, "correlation": 204557 } }, { "ph": "f", "id": 204557, "pid": 76337, "tid": -914061504, "ts": 1716454224544688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544688, "dur": 0, "args": { "External id": 204558, "cbid": 251, "correlation": 204558 } }, { "ph": "f", "id": 204558, "pid": 76337, "tid": -914061504, "ts": 1716454224544688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544690, "dur": 0, "args": { "External id": 204559, "cbid": 251, "correlation": 204559 } }, { "ph": "f", "id": 204559, "pid": 76337, "tid": -914061504, "ts": 1716454224544690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224674399, "dur": 113, "args": { "External id": 204560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204560, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204560, "pid": 5, "tid": 7, "ts": 1716454224674399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544692, "dur": 12, "args": { "External id": 204560, "cbid": 211, "correlation": 204560 } }, { "ph": "s", "id": 204560, "pid": 76337, "tid": -914061504, "ts": 1716454224544692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224674513, "dur": 59, "args": { "External id": 204566, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204566, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204566, "pid": 5, "tid": 7, "ts": 1716454224674513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544727, "dur": 9, "args": { "External id": 204566, "cbid": 211, "correlation": 204566 } }, { "ph": "s", "id": 204566, "pid": 76337, "tid": -914061504, "ts": 1716454224544727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224674574, "dur": 600, "args": { "External id": 204575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204575, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204575, "pid": 5, "tid": 7, "ts": 1716454224674574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544810, "dur": 15, "args": { "External id": 204575, "cbid": 211, "correlation": 204575 } }, { "ph": "s", "id": 204575, "pid": 76337, "tid": -914061504, "ts": 1716454224544810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224675175, "dur": 183, "args": { "External id": 204597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204597, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204597, "pid": 5, "tid": 7, "ts": 1716454224675175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544867, "dur": 11, "args": { "External id": 204597, "cbid": 211, "correlation": 204597 } }, { "ph": "s", "id": 204597, "pid": 76337, "tid": -914061504, "ts": 1716454224544867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224544954, "dur": 1, "args": { "External id": 204608, "cbid": 251, "correlation": 204608 } }, { "ph": "f", "id": 204608, "pid": 76337, "tid": -914061504, "ts": 1716454224544954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224675359, "dur": 199, "args": { "External id": 204609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204609, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204609, "pid": 5, "tid": 7, "ts": 1716454224675359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224544959, "dur": 21, "args": { "External id": 204609, "cbid": 211, "correlation": 204609 } }, { "ph": "s", "id": 204609, "pid": 76337, "tid": -914061504, "ts": 1716454224544959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224545037, "dur": 1, "args": { "External id": 204620, "cbid": 251, "correlation": 204620 } }, { "ph": "f", "id": 204620, "pid": 76337, "tid": -914061504, "ts": 1716454224545037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224675560, "dur": 190, "args": { "External id": 204621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204621, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204621, "pid": 5, "tid": 7, "ts": 1716454224675560, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545041, "dur": 12, "args": { "External id": 204621, "cbid": 211, "correlation": 204621 } }, { "ph": "s", "id": 204621, "pid": 76337, "tid": -914061504, "ts": 1716454224545041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224545105, "dur": 1, "args": { "External id": 204632, "cbid": 251, "correlation": 204632 } }, { "ph": "f", "id": 204632, "pid": 76337, "tid": -914061504, "ts": 1716454224545105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224675751, "dur": 187, "args": { "External id": 204633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204633, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204633, "pid": 5, "tid": 7, "ts": 1716454224675751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545109, "dur": 12, "args": { "External id": 204633, "cbid": 211, "correlation": 204633 } }, { "ph": "s", "id": 204633, "pid": 76337, "tid": -914061504, "ts": 1716454224545109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224675940, "dur": 18734, "args": { "External id": 204654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204654, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204654, "pid": 5, "tid": 7, "ts": 1716454224675940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545188, "dur": 12, "args": { "External id": 204654, "cbid": 211, "correlation": 204654 } }, { "ph": "s", "id": 204654, "pid": 76337, "tid": -914061504, "ts": 1716454224545188, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224545284, "dur": 1, "args": { "External id": 204672, "cbid": 251, "correlation": 204672 } }, { "ph": "f", "id": 204672, "pid": 76337, "tid": -914061504, "ts": 1716454224545284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224694675, "dur": 203, "args": { "External id": 204674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204674, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204674, "pid": 5, "tid": 7, "ts": 1716454224694675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545290, "dur": 13, "args": { "External id": 204674, "cbid": 211, "correlation": 204674 } }, { "ph": "s", "id": 204674, "pid": 76337, "tid": -914061504, "ts": 1716454224545290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224694880, "dur": 66, "args": { "External id": 204682, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204682, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204682, "pid": 5, "tid": 7, "ts": 1716454224694880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545359, "dur": 13, "args": { "External id": 204682, "cbid": 211, "correlation": 204682 } }, { "ph": "s", "id": 204682, "pid": 76337, "tid": -914061504, "ts": 1716454224545359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224694947, "dur": 97, "args": { "External id": 204690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204690, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204690, "pid": 5, "tid": 7, "ts": 1716454224694947, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545399, "dur": 85, "args": { "External id": 204690, "cbid": 211, "correlation": 204690 } }, { "ph": "s", "id": 204690, "pid": 76337, "tid": -914061504, "ts": 1716454224545399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224695046, "dur": 54, "args": { "External id": 204701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204701, "pid": 5, "tid": 7, "ts": 1716454224695046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224545546, "dur": 1887, "args": { "External id": 204701, "cbid": 211, "correlation": 204701 } }, { "ph": "s", "id": 204701, "pid": 76337, "tid": -914061504, "ts": 1716454224545546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224695101, "dur": 93, "args": { "External id": 204723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204723, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204723, "pid": 5, "tid": 7, "ts": 1716454224695101, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547453, "dur": 126, "args": { "External id": 204723, "cbid": 211, "correlation": 204723 } }, { "ph": "s", "id": 204723, "pid": 76337, "tid": -914061504, "ts": 1716454224547453, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224547655, "dur": 1, "args": { "External id": 204734, "cbid": 251, "correlation": 204734 } }, { "ph": "f", "id": 204734, "pid": 76337, "tid": -914061504, "ts": 1716454224547655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224695195, "dur": 104, "args": { "External id": 204735, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204735, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204735, "pid": 5, "tid": 7, "ts": 1716454224695195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547661, "dur": 13, "args": { "External id": 204735, "cbid": 211, "correlation": 204735 } }, { "ph": "s", "id": 204735, "pid": 76337, "tid": -914061504, "ts": 1716454224547661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224547733, "dur": 1, "args": { "External id": 204746, "cbid": 251, "correlation": 204746 } }, { "ph": "f", "id": 204746, "pid": 76337, "tid": -914061504, "ts": 1716454224547733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224547737, "dur": 0, "args": { "External id": 204747, "cbid": 251, "correlation": 204747 } }, { "ph": "f", "id": 204747, "pid": 76337, "tid": -914061504, "ts": 1716454224547737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224695300, "dur": 11, "args": { "External id": 204748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204748, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 204748, "pid": 5, "tid": 7, "ts": 1716454224695300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547739, "dur": 12, "args": { "External id": 204748, "cbid": 211, "correlation": 204748 } }, { "ph": "s", "id": 204748, "pid": 76337, "tid": -914061504, "ts": 1716454224547739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224695312, "dur": 5, "args": { "External id": 204750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204750, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204750, "pid": 5, "tid": 7, "ts": 1716454224695312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547753, "dur": 6, "args": { "External id": 204750, "cbid": 211, "correlation": 204750 } }, { "ph": "s", "id": 204750, "pid": 76337, "tid": -914061504, "ts": 1716454224547753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224547813, "dur": 1, "args": { "External id": 204761, "cbid": 251, "correlation": 204761 } }, { "ph": "f", "id": 204761, "pid": 76337, "tid": -914061504, "ts": 1716454224547813, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224547816, "dur": 0, "args": { "External id": 204762, "cbid": 251, "correlation": 204762 } }, { "ph": "f", "id": 204762, "pid": 76337, "tid": -914061504, "ts": 1716454224547816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224695318, "dur": 6, "args": { "External id": 204763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204763, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 204763, "pid": 5, "tid": 7, "ts": 1716454224695318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547818, "dur": 12, "args": { "External id": 204763, "cbid": 211, "correlation": 204763 } }, { "ph": "s", "id": 204763, "pid": 76337, "tid": -914061504, "ts": 1716454224547818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224695326, "dur": 3, "args": { "External id": 204765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204765, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 204765, "pid": 5, "tid": 7, "ts": 1716454224695326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547831, "dur": 6, "args": { "External id": 204765, "cbid": 211, "correlation": 204765 } }, { "ph": "s", "id": 204765, "pid": 76337, "tid": -914061504, "ts": 1716454224547831, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224695331, "dur": 155, "args": { "External id": 204786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204786, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204786, "pid": 5, "tid": 7, "ts": 1716454224695331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224547905, "dur": 12, "args": { "External id": 204786, "cbid": 211, "correlation": 204786 } }, { "ph": "s", "id": 204786, "pid": 76337, "tid": -914061504, "ts": 1716454224547905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548009, "dur": 1, "args": { "External id": 204804, "cbid": 251, "correlation": 204804 } }, { "ph": "f", "id": 204804, "pid": 76337, "tid": -914061504, "ts": 1716454224548009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224695487, "dur": 107, "args": { "External id": 204806, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204806, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204806, "pid": 5, "tid": 7, "ts": 1716454224695487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548014, "dur": 14, "args": { "External id": 204806, "cbid": 211, "correlation": 204806 } }, { "ph": "s", "id": 204806, "pid": 76337, "tid": -914061504, "ts": 1716454224548014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224695596, "dur": 34, "args": { "External id": 204814, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204814, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204814, "pid": 5, "tid": 7, "ts": 1716454224695596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548085, "dur": 12, "args": { "External id": 204814, "cbid": 211, "correlation": 204814 } }, { "ph": "s", "id": 204814, "pid": 76337, "tid": -914061504, "ts": 1716454224548085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224695631, "dur": 67, "args": { "External id": 204822, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204822, "pid": 5, "tid": 7, "ts": 1716454224695631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548126, "dur": 10, "args": { "External id": 204822, "cbid": 211, "correlation": 204822 } }, { "ph": "s", "id": 204822, "pid": 76337, "tid": -914061504, "ts": 1716454224548126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224695699, "dur": 93, "args": { "External id": 204844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204844, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204844, "pid": 5, "tid": 7, "ts": 1716454224695699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548177, "dur": 10, "args": { "External id": 204844, "cbid": 211, "correlation": 204844 } }, { "ph": "s", "id": 204844, "pid": 76337, "tid": -914061504, "ts": 1716454224548177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548263, "dur": 1, "args": { "External id": 204860, "cbid": 251, "correlation": 204860 } }, { "ph": "f", "id": 204860, "pid": 76337, "tid": -914061504, "ts": 1716454224548263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224695794, "dur": 580, "args": { "External id": 204862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204862, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 204862, "pid": 5, "tid": 7, "ts": 1716454224695794, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548269, "dur": 12, "args": { "External id": 204862, "cbid": 211, "correlation": 204862 } }, { "ph": "s", "id": 204862, "pid": 76337, "tid": -914061504, "ts": 1716454224548269, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224696375, "dur": 248, "args": { "External id": 204870, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204870, "pid": 5, "tid": 7, "ts": 1716454224696375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548334, "dur": 13, "args": { "External id": 204870, "cbid": 211, "correlation": 204870 } }, { "ph": "s", "id": 204870, "pid": 76337, "tid": -914061504, "ts": 1716454224548334, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224696624, "dur": 250, "args": { "External id": 204878, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204878, "pid": 5, "tid": 7, "ts": 1716454224696624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548365, "dur": 8, "args": { "External id": 204878, "cbid": 211, "correlation": 204878 } }, { "ph": "s", "id": 204878, "pid": 76337, "tid": -914061504, "ts": 1716454224548365, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548447, "dur": 1, "args": { "External id": 204894, "cbid": 251, "correlation": 204894 } }, { "ph": "f", "id": 204894, "pid": 76337, "tid": -914061504, "ts": 1716454224548447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548452, "dur": 0, "args": { "External id": 204896, "cbid": 251, "correlation": 204896 } }, { "ph": "f", "id": 204896, "pid": 76337, "tid": -914061504, "ts": 1716454224548452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224696875, "dur": 360, "args": { "External id": 204897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204897, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 204897, "pid": 5, "tid": 7, "ts": 1716454224696875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548454, "dur": 12, "args": { "External id": 204897, "cbid": 211, "correlation": 204897 } }, { "ph": "s", "id": 204897, "pid": 76337, "tid": -914061504, "ts": 1716454224548454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224697236, "dur": 50, "args": { "External id": 204905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204905, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204905, "pid": 5, "tid": 7, "ts": 1716454224697236, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548497, "dur": 195, "args": { "External id": 204905, "cbid": 211, "correlation": 204905 } }, { "ph": "s", "id": 204905, "pid": 76337, "tid": -914061504, "ts": 1716454224548497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224697288, "dur": 159, "args": { "External id": 204916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204916, "pid": 5, "tid": 7, "ts": 1716454224697288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548750, "dur": 70, "args": { "External id": 204916, "cbid": 211, "correlation": 204916 } }, { "ph": "s", "id": 204916, "pid": 76337, "tid": -914061504, "ts": 1716454224548750, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224548873, "dur": 0, "args": { "External id": 204928, "cbid": 317, "correlation": 204928 } }, { "ph": "f", "id": 204928, "pid": 76337, "tid": -914061504, "ts": 1716454224548873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224548874, "dur": 0, "args": { "External id": 204929, "cbid": 203, "correlation": 204929 } }, { "ph": "f", "id": 204929, "pid": 76337, "tid": -914061504, "ts": 1716454224548874, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224548875, "dur": 0, "args": { "External id": 204930, "cbid": 205, "correlation": 204930 } }, { "ph": "f", "id": 204930, "pid": 76337, "tid": -914061504, "ts": 1716454224548875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548898, "dur": 1, "args": { "External id": 204934, "cbid": 251, "correlation": 204934 } }, { "ph": "f", "id": 204934, "pid": 76337, "tid": -914061504, "ts": 1716454224548898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548900, "dur": 0, "args": { "External id": 204935, "cbid": 251, "correlation": 204935 } }, { "ph": "f", "id": 204935, "pid": 76337, "tid": -914061504, "ts": 1716454224548900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548900, "dur": 0, "args": { "External id": 204936, "cbid": 251, "correlation": 204936 } }, { "ph": "f", "id": 204936, "pid": 76337, "tid": -914061504, "ts": 1716454224548900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548901, "dur": 0, "args": { "External id": 204937, "cbid": 251, "correlation": 204937 } }, { "ph": "f", "id": 204937, "pid": 76337, "tid": -914061504, "ts": 1716454224548901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548902, "dur": 1, "args": { "External id": 204938, "cbid": 251, "correlation": 204938 } }, { "ph": "f", "id": 204938, "pid": 76337, "tid": -914061504, "ts": 1716454224548902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548903, "dur": 0, "args": { "External id": 204939, "cbid": 251, "correlation": 204939 } }, { "ph": "f", "id": 204939, "pid": 76337, "tid": -914061504, "ts": 1716454224548903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548904, "dur": 0, "args": { "External id": 204940, "cbid": 251, "correlation": 204940 } }, { "ph": "f", "id": 204940, "pid": 76337, "tid": -914061504, "ts": 1716454224548904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548905, "dur": 0, "args": { "External id": 204941, "cbid": 251, "correlation": 204941 } }, { "ph": "f", "id": 204941, "pid": 76337, "tid": -914061504, "ts": 1716454224548905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224548906, "dur": 0, "args": { "External id": 204942, "cbid": 251, "correlation": 204942 } }, { "ph": "f", "id": 204942, "pid": 76337, "tid": -914061504, "ts": 1716454224548906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224697448, "dur": 115, "args": { "External id": 204943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204943, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 204943, "pid": 5, "tid": 7, "ts": 1716454224697448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548908, "dur": 41, "args": { "External id": 204943, "cbid": 211, "correlation": 204943 } }, { "ph": "s", "id": 204943, "pid": 76337, "tid": -914061504, "ts": 1716454224548908, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224697565, "dur": 60, "args": { "External id": 204949, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204949, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204949, "pid": 5, "tid": 7, "ts": 1716454224697565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224548971, "dur": 280, "args": { "External id": 204949, "cbid": 211, "correlation": 204949 } }, { "ph": "s", "id": 204949, "pid": 76337, "tid": -914061504, "ts": 1716454224548971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224697626, "dur": 51, "args": { "External id": 204957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204957, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204957, "pid": 5, "tid": 7, "ts": 1716454224697626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549276, "dur": 9, "args": { "External id": 204957, "cbid": 211, "correlation": 204957 } }, { "ph": "s", "id": 204957, "pid": 76337, "tid": -914061504, "ts": 1716454224549276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224697678, "dur": 53, "args": { "External id": 204977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204977, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 204977, "pid": 5, "tid": 7, "ts": 1716454224697678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549356, "dur": 12, "args": { "External id": 204977, "cbid": 211, "correlation": 204977 } }, { "ph": "s", "id": 204977, "pid": 76337, "tid": -914061504, "ts": 1716454224549356, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224697732, "dur": 4, "args": { "External id": 204989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204989, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 204989, "pid": 5, "tid": 7, "ts": 1716454224697732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549378, "dur": 6, "args": { "External id": 204989, "cbid": 211, "correlation": 204989 } }, { "ph": "s", "id": 204989, "pid": 76337, "tid": -914061504, "ts": 1716454224549378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224697738, "dur": 56, "args": { "External id": 204992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 204992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 204992, "pid": 5, "tid": 7, "ts": 1716454224697738, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549396, "dur": 110, "args": { "External id": 204992, "cbid": 211, "correlation": 204992 } }, { "ph": "s", "id": 204992, "pid": 76337, "tid": -914061504, "ts": 1716454224549396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224697795, "dur": 38, "args": { "External id": 205001, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205001, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205001, "pid": 5, "tid": 7, "ts": 1716454224697795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549547, "dur": 10, "args": { "External id": 205001, "cbid": 211, "correlation": 205001 } }, { "ph": "s", "id": 205001, "pid": 76337, "tid": -914061504, "ts": 1716454224549547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224549602, "dur": 0, "args": { "External id": 205011, "cbid": 317, "correlation": 205011 } }, { "ph": "f", "id": 205011, "pid": 76337, "tid": -914061504, "ts": 1716454224549602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224549604, "dur": 0, "args": { "External id": 205012, "cbid": 203, "correlation": 205012 } }, { "ph": "f", "id": 205012, "pid": 76337, "tid": -914061504, "ts": 1716454224549604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224549604, "dur": 0, "args": { "External id": 205013, "cbid": 205, "correlation": 205013 } }, { "ph": "f", "id": 205013, "pid": 76337, "tid": -914061504, "ts": 1716454224549604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224697835, "dur": 42, "args": { "External id": 205017, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205017, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205017, "pid": 5, "tid": 7, "ts": 1716454224697835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549620, "dur": 13, "args": { "External id": 205017, "cbid": 211, "correlation": 205017 } }, { "ph": "s", "id": 205017, "pid": 76337, "tid": -914061504, "ts": 1716454224549620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224697878, "dur": 3, "args": { "External id": 205019, "device": 5, "context": 1, "stream": 7, "correlation": 205019, "bytes": 46080, "memory bandwidth (GB/s)": 11.900826446280991 } }, { "ph": "f", "id": 205019, "pid": 5, "tid": 7, "ts": 1716454224697878, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224549636, "dur": 16, "args": { "External id": 205019, "cbid": 51, "correlation": 205019 } }, { "ph": "s", "id": 205019, "pid": 76337, "tid": -914061504, "ts": 1716454224549636, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224549662, "dur": 2, "args": { "External id": 205021, "cbid": 200, "correlation": 205021 } }, { "ph": "f", "id": 205021, "pid": 76337, "tid": -914061504, "ts": 1716454224549662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224549664, "dur": 0, "args": { "External id": 205022, "cbid": 200, "correlation": 205022 } }, { "ph": "f", "id": 205022, "pid": 76337, "tid": -914061504, "ts": 1716454224549664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224549665, "dur": 0, "args": { "External id": 205023, "cbid": 200, "correlation": 205023 } }, { "ph": "f", "id": 205023, "pid": 76337, "tid": -914061504, "ts": 1716454224549665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224549665, "dur": 0, "args": { "External id": 205024, "cbid": 200, "correlation": 205024 } }, { "ph": "f", "id": 205024, "pid": 76337, "tid": -914061504, "ts": 1716454224549665, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncGetAttributes", "pid": 76337, "tid": -914061504, "ts": 1716454224549667, "dur": 4, "args": { "External id": 205025, "cbid": 15, "correlation": 205025 } }, { "ph": "f", "id": 205025, "pid": 76337, "tid": -914061504, "ts": 1716454224549667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224549672, "dur": 1, "args": { "External id": 205026, "cbid": 251, "correlation": 205026 } }, { "ph": "f", "id": 205026, "pid": 76337, "tid": -914061504, "ts": 1716454224549672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStruct, tensorTransformParams, int, unsigned long, __half const*, __half*, float, float)", "pid": 5, "tid": 7, "ts": 1716454224697883, "dur": 25, "args": { "External id": 205027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205027, "registers per thread": 22, "shared memory": 0, "blocks per SM": 2, "warps per SM": 64, "grid": [160, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205027, "pid": 5, "tid": 7, "ts": 1716454224697883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549675, "dur": 9, "args": { "External id": 205027, "cbid": 211, "correlation": 205027 } }, { "ph": "s", "id": 205027, "pid": 76337, "tid": -914061504, "ts": 1716454224549675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224697909, "dur": 4, "args": { "External id": 205029, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 205029, "pid": 5, "tid": 7, "ts": 1716454224697909, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549690, "dur": 6, "args": { "External id": 205029, "cbid": 211, "correlation": 205029 } }, { "ph": "s", "id": 205029, "pid": 76337, "tid": -914061504, "ts": 1716454224549690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224549699, "dur": 0, "args": { "External id": 205030, "cbid": 51, "correlation": 205030 } }, { "ph": "s", "id": 205030, "pid": 76337, "tid": -914061504, "ts": 1716454224549699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224697914, "dur": 191, "args": { "External id": 205031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205031, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205031, "pid": 5, "tid": 7, "ts": 1716454224697914, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549700, "dur": 199, "args": { "External id": 205031, "cbid": 211, "correlation": 205031 } }, { "ph": "s", "id": 205031, "pid": 76337, "tid": -914061504, "ts": 1716454224549700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224698107, "dur": 7, "args": { "External id": 205032, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205032, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205032, "pid": 5, "tid": 7, "ts": 1716454224698107, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549902, "dur": 5, "args": { "External id": 205032, "cbid": 211, "correlation": 205032 } }, { "ph": "s", "id": 205032, "pid": 76337, "tid": -914061504, "ts": 1716454224549902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224698115, "dur": 5, "args": { "External id": 205038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205038, "registers per thread": 16, "shared memory": 0, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [192, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 15 } }, { "ph": "f", "id": 205038, "pid": 5, "tid": 7, "ts": 1716454224698115, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224549931, "dur": 9, "args": { "External id": 205038, "cbid": 211, "correlation": 205038 } }, { "ph": "s", "id": 205038, "pid": 76337, "tid": -914061504, "ts": 1716454224549931, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698121, "dur": 3, "args": { "External id": 205046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205046, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205046, "pid": 5, "tid": 7, "ts": 1716454224698121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551623, "dur": 15, "args": { "External id": 205046, "cbid": 211, "correlation": 205046 } }, { "ph": "s", "id": 205046, "pid": 76337, "tid": -914061504, "ts": 1716454224551623, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698125, "dur": 3, "args": { "External id": 205054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205054, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205054, "pid": 5, "tid": 7, "ts": 1716454224698125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551664, "dur": 10, "args": { "External id": 205054, "cbid": 211, "correlation": 205054 } }, { "ph": "s", "id": 205054, "pid": 76337, "tid": -914061504, "ts": 1716454224551664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698130, "dur": 3, "args": { "External id": 205062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205062, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205062, "pid": 5, "tid": 7, "ts": 1716454224698130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551691, "dur": 8, "args": { "External id": 205062, "cbid": 211, "correlation": 205062 } }, { "ph": "s", "id": 205062, "pid": 76337, "tid": -914061504, "ts": 1716454224551691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698134, "dur": 3, "args": { "External id": 205071, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205071, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205071, "pid": 5, "tid": 7, "ts": 1716454224698134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551863, "dur": 14, "args": { "External id": 205071, "cbid": 211, "correlation": 205071 } }, { "ph": "s", "id": 205071, "pid": 76337, "tid": -914061504, "ts": 1716454224551863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698138, "dur": 3, "args": { "External id": 205080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205080, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205080, "pid": 5, "tid": 7, "ts": 1716454224698138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551892, "dur": 7, "args": { "External id": 205080, "cbid": 211, "correlation": 205080 } }, { "ph": "s", "id": 205080, "pid": 76337, "tid": -914061504, "ts": 1716454224551892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698142, "dur": 3, "args": { "External id": 205088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205088, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205088, "pid": 5, "tid": 7, "ts": 1716454224698142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224551918, "dur": 8, "args": { "External id": 205088, "cbid": 211, "correlation": 205088 } }, { "ph": "s", "id": 205088, "pid": 76337, "tid": -914061504, "ts": 1716454224551918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698146, "dur": 3, "args": { "External id": 205096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205096, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205096, "pid": 5, "tid": 7, "ts": 1716454224698146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224552183, "dur": 15, "args": { "External id": 205096, "cbid": 211, "correlation": 205096 } }, { "ph": "s", "id": 205096, "pid": 76337, "tid": -914061504, "ts": 1716454224552183, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698150, "dur": 3, "args": { "External id": 205104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205104, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205104, "pid": 5, "tid": 7, "ts": 1716454224698150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224552215, "dur": 8, "args": { "External id": 205104, "cbid": 211, "correlation": 205104 } }, { "ph": "s", "id": 205104, "pid": 76337, "tid": -914061504, "ts": 1716454224552215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698155, "dur": 1, "args": { "External id": 205114, "device": 5, "context": 1, "stream": 7, "correlation": 205114, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 205114, "pid": 5, "tid": 7, "ts": 1716454224698155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224552280, "dur": 35, "args": { "External id": 205114, "cbid": 41, "correlation": 205114 } }, { "ph": "s", "id": 205114, "pid": 76337, "tid": -914061504, "ts": 1716454224552280, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224552316, "dur": 145856, "args": { "External id": 205115, "cbid": 131, "correlation": 205115 } }, { "ph": "f", "id": 205115, "pid": 76337, "tid": -914061504, "ts": 1716454224552316, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224698360, "dur": 3, "args": { "External id": 205123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205123, "pid": 5, "tid": 7, "ts": 1716454224698360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698332, "dur": 31, "args": { "External id": 205123, "cbid": 211, "correlation": 205123 } }, { "ph": "s", "id": 205123, "pid": 76337, "tid": -914061504, "ts": 1716454224698332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698458, "dur": 3, "args": { "External id": 205132, "device": 5, "context": 1, "stream": 7, "correlation": 205132, "bytes": 8, "memory bandwidth (GB/s)": 0.002380952380952381 } }, { "ph": "f", "id": 205132, "pid": 5, "tid": 7, "ts": 1716454224698458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224698426, "dur": 33, "args": { "External id": 205132, "cbid": 41, "correlation": 205132 } }, { "ph": "s", "id": 205132, "pid": 76337, "tid": -914061504, "ts": 1716454224698426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224698565, "dur": 4, "args": { "External id": 205142, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205142, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205142, "pid": 5, "tid": 7, "ts": 1716454224698565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698545, "dur": 22, "args": { "External id": 205142, "cbid": 211, "correlation": 205142 } }, { "ph": "s", "id": 205142, "pid": 76337, "tid": -914061504, "ts": 1716454224698545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698645, "dur": 1, "args": { "External id": 205152, "device": 5, "context": 1, "stream": 7, "correlation": 205152, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 205152, "pid": 5, "tid": 7, "ts": 1716454224698645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224698626, "dur": 17, "args": { "External id": 205152, "cbid": 41, "correlation": 205152 } }, { "ph": "s", "id": 205152, "pid": 76337, "tid": -914061504, "ts": 1716454224698626, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224698644, "dur": 8, "args": { "External id": 205153, "cbid": 131, "correlation": 205153 } }, { "ph": "f", "id": 205153, "pid": 76337, "tid": -914061504, "ts": 1716454224698644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698722, "dur": 3, "args": { "External id": 205160, "device": 5, "context": 1, "stream": 7, "correlation": 205160, "bytes": 98304, "memory bandwidth (GB/s)": 30.415841584158414 } }, { "ph": "f", "id": 205160, "pid": 5, "tid": 7, "ts": 1716454224698722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224698694, "dur": 28, "args": { "External id": 205160, "cbid": 41, "correlation": 205160 } }, { "ph": "s", "id": 205160, "pid": 76337, "tid": -914061504, "ts": 1716454224698694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698815, "dur": 3, "args": { "External id": 205179, "device": 5, "context": 1, "stream": 7, "correlation": 205179, "bytes": 16, "memory bandwidth (GB/s)": 0.005263157894736842 } }, { "ph": "f", "id": 205179, "pid": 5, "tid": 7, "ts": 1716454224698815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224698796, "dur": 18, "args": { "External id": 205179, "cbid": 41, "correlation": 205179 } }, { "ph": "s", "id": 205179, "pid": 76337, "tid": -914061504, "ts": 1716454224698796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_set_info(int, int, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224698853, "dur": 3, "args": { "External id": 205185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205185, "pid": 5, "tid": 7, "ts": 1716454224698853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698842, "dur": 11, "args": { "External id": 205185, "cbid": 211, "correlation": 205185 } }, { "ph": "s", "id": 205185, "pid": 76337, "tid": -914061504, "ts": 1716454224698842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void getf2_cta_32x32(int, int, int, int, float*, int, int*, int*)", "pid": 5, "tid": 7, "ts": 1716454224698868, "dur": 6, "args": { "External id": 205187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205187, "registers per thread": 26, "shared memory": 4480, "blocks per SM": 0.0125, "warps per SM": 0.4, "grid": [1, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 205187, "pid": 5, "tid": 7, "ts": 1716454224698868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698857, "dur": 10, "args": { "External id": 205187, "cbid": 211, "correlation": 205187 } }, { "ph": "s", "id": 205187, "pid": 76337, "tid": -914061504, "ts": 1716454224698857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "getrf2_reset_info(int*)", "pid": 5, "tid": 7, "ts": 1716454224698877, "dur": 3, "args": { "External id": 205189, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.000390625, "grid": [1, 1, 1], "block": [1, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205189, "pid": 5, "tid": 7, "ts": 1716454224698877, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698869, "dur": 7, "args": { "External id": 205189, "cbid": 211, "correlation": 205189 } }, { "ph": "s", "id": 205189, "pid": 76337, "tid": -914061504, "ts": 1716454224698869, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224698911, "dur": 2, "args": { "External id": 205197, "device": 5, "context": 1, "stream": 7, "correlation": 205197, "bytes": 8, "memory bandwidth (GB/s)": 0.002717391304347826 } }, { "ph": "f", "id": 205197, "pid": 5, "tid": 7, "ts": 1716454224698911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224698897, "dur": 13, "args": { "External id": 205197, "cbid": 41, "correlation": 205197 } }, { "ph": "s", "id": 205197, "pid": 76337, "tid": -914061504, "ts": 1716454224698897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224698956, "dur": 3, "args": { "External id": 205211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205211, "pid": 5, "tid": 7, "ts": 1716454224698956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698946, "dur": 12, "args": { "External id": 205211, "cbid": 211, "correlation": 205211 } }, { "ph": "s", "id": 205211, "pid": 76337, "tid": -914061504, "ts": 1716454224698946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224698986, "dur": 2, "args": { "External id": 205225, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205225, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.025, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205225, "pid": 5, "tid": 7, "ts": 1716454224698986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224698971, "dur": 15, "args": { "External id": 205225, "cbid": 211, "correlation": 205225 } }, { "ph": "s", "id": 205225, "pid": 76337, "tid": -914061504, "ts": 1716454224698971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_lower_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224699022, "dur": 6, "args": { "External id": 205232, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205232, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205232, "pid": 5, "tid": 7, "ts": 1716454224699022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699012, "dur": 11, "args": { "External id": 205232, "cbid": 211, "correlation": 205232 } }, { "ph": "s", "id": 205232, "pid": 76337, "tid": -914061504, "ts": 1716454224699012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void trsm_batch_left_upper_kernel(cublasTrsmBatchParams, float const* const*, float* const*, float const*, float)", "pid": 5, "tid": 7, "ts": 1716454224699033, "dur": 6, "args": { "External id": 205235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205235, "registers per thread": 56, "shared memory": 192, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [32, 8, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205235, "pid": 5, "tid": 7, "ts": 1716454224699033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699025, "dur": 7, "args": { "External id": 205235, "cbid": 211, "correlation": 205235 } }, { "ph": "s", "id": 205235, "pid": 76337, "tid": -914061504, "ts": 1716454224699025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void laswp_kernel_reverse(int, float* const*, int, int, int, int const*, int, int, int)", "pid": 5, "tid": 7, "ts": 1716454224699042, "dur": 3, "args": { "External id": 205237, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205237, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [1, 256, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205237, "pid": 5, "tid": 7, "ts": 1716454224699042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699034, "dur": 7, "args": { "External id": 205237, "cbid": 211, "correlation": 205237 } }, { "ph": "s", "id": 205237, "pid": 76337, "tid": -914061504, "ts": 1716454224699034, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224699064, "dur": 2, "args": { "External id": 205240, "device": 5, "context": 1, "stream": 7, "correlation": 205240, "bytes": 8, "memory bandwidth (GB/s)": 0.0027472527472527475 } }, { "ph": "f", "id": 205240, "pid": 5, "tid": 7, "ts": 1716454224699064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224699050, "dur": 13, "args": { "External id": 205240, "cbid": 41, "correlation": 205240 } }, { "ph": "s", "id": 205240, "pid": 76337, "tid": -914061504, "ts": 1716454224699050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#11}::operator()() const::{lambda(bool)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224699121, "dur": 4, "args": { "External id": 205256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205256, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205256, "pid": 5, "tid": 7, "ts": 1716454224699121, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699108, "dur": 13, "args": { "External id": 205256, "cbid": 211, "correlation": 205256 } }, { "ph": "s", "id": 205256, "pid": 76337, "tid": -914061504, "ts": 1716454224699108, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224699141, "dur": 3, "args": { "External id": 205261, "device": 5, "context": 1, "stream": 7, "correlation": 205261, "bytes": 1, "memory bandwidth (GB/s)": 0.0003125 } }, { "ph": "f", "id": 205261, "pid": 5, "tid": 7, "ts": 1716454224699141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224699126, "dur": 13, "args": { "External id": 205261, "cbid": 41, "correlation": 205261 } }, { "ph": "s", "id": 205261, "pid": 76337, "tid": -914061504, "ts": 1716454224699126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224699172, "dur": 1, "args": { "External id": 205267, "device": 5, "context": 1, "stream": 7, "correlation": 205267, "bytes": 1, "memory bandwidth (GB/s)": 0.0005896226415094339 } }, { "ph": "f", "id": 205267, "pid": 5, "tid": 7, "ts": 1716454224699172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224699150, "dur": 31, "args": { "External id": 205267, "cbid": 41, "correlation": 205267 } }, { "ph": "s", "id": 205267, "pid": 76337, "tid": -914061504, "ts": 1716454224699150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224699181, "dur": 3, "args": { "External id": 205268, "cbid": 131, "correlation": 205268 } }, { "ph": "f", "id": 205268, "pid": 76337, "tid": -914061504, "ts": 1716454224699181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699243, "dur": 3, "args": { "External id": 205276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205276, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205276, "pid": 5, "tid": 7, "ts": 1716454224699243, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699225, "dur": 18, "args": { "External id": 205276, "cbid": 211, "correlation": 205276 } }, { "ph": "s", "id": 205276, "pid": 76337, "tid": -914061504, "ts": 1716454224699225, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699280, "dur": 3, "args": { "External id": 205286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205286, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205286, "pid": 5, "tid": 7, "ts": 1716454224699280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699267, "dur": 12, "args": { "External id": 205286, "cbid": 211, "correlation": 205286 } }, { "ph": "s", "id": 205286, "pid": 76337, "tid": -914061504, "ts": 1716454224699267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699307, "dur": 3, "args": { "External id": 205295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205295, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205295, "pid": 5, "tid": 7, "ts": 1716454224699307, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699297, "dur": 9, "args": { "External id": 205295, "cbid": 211, "correlation": 205295 } }, { "ph": "s", "id": 205295, "pid": 76337, "tid": -914061504, "ts": 1716454224699297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224699428, "dur": 12, "args": { "External id": 205305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205305, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205305, "pid": 5, "tid": 7, "ts": 1716454224699428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699410, "dur": 19, "args": { "External id": 205305, "cbid": 211, "correlation": 205305 } }, { "ph": "s", "id": 205305, "pid": 76337, "tid": -914061504, "ts": 1716454224699410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699470, "dur": 3, "args": { "External id": 205313, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205313, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205313, "pid": 5, "tid": 7, "ts": 1716454224699470, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699460, "dur": 9, "args": { "External id": 205313, "cbid": 211, "correlation": 205313 } }, { "ph": "s", "id": 205313, "pid": 76337, "tid": -914061504, "ts": 1716454224699460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224699517, "dur": 12, "args": { "External id": 205323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205323, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205323, "pid": 5, "tid": 7, "ts": 1716454224699517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699503, "dur": 14, "args": { "External id": 205323, "cbid": 211, "correlation": 205323 } }, { "ph": "s", "id": 205323, "pid": 76337, "tid": -914061504, "ts": 1716454224699503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224699550, "dur": 10, "args": { "External id": 205331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205331, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205331, "pid": 5, "tid": 7, "ts": 1716454224699550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699540, "dur": 9, "args": { "External id": 205331, "cbid": 211, "correlation": 205331 } }, { "ph": "s", "id": 205331, "pid": 76337, "tid": -914061504, "ts": 1716454224699540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699581, "dur": 3, "args": { "External id": 205340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205340, "pid": 5, "tid": 7, "ts": 1716454224699581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699570, "dur": 12, "args": { "External id": 205340, "cbid": 211, "correlation": 205340 } }, { "ph": "s", "id": 205340, "pid": 76337, "tid": -914061504, "ts": 1716454224699570, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, at::detail::Array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224699610, "dur": 5, "args": { "External id": 205349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205349, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205349, "pid": 5, "tid": 7, "ts": 1716454224699610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699601, "dur": 8, "args": { "External id": 205349, "cbid": 211, "correlation": 205349 } }, { "ph": "s", "id": 205349, "pid": 76337, "tid": -914061504, "ts": 1716454224699601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224699650, "dur": 8, "args": { "External id": 205359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205359, "registers per thread": 32, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205359, "pid": 5, "tid": 7, "ts": 1716454224699650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699639, "dur": 10, "args": { "External id": 205359, "cbid": 211, "correlation": 205359 } }, { "ph": "s", "id": 205359, "pid": 76337, "tid": -914061504, "ts": 1716454224699639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224699973, "dur": 3, "args": { "External id": 205368, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205368, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205368, "pid": 5, "tid": 7, "ts": 1716454224699973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224699957, "dur": 24, "args": { "External id": 205368, "cbid": 211, "correlation": 205368 } }, { "ph": "s", "id": 205368, "pid": 76337, "tid": -914061504, "ts": 1716454224699957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700015, "dur": 3, "args": { "External id": 205376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205376, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205376, "pid": 5, "tid": 7, "ts": 1716454224700015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700004, "dur": 11, "args": { "External id": 205376, "cbid": 211, "correlation": 205376 } }, { "ph": "s", "id": 205376, "pid": 76337, "tid": -914061504, "ts": 1716454224700004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700067, "dur": 1, "args": { "External id": 205386, "device": 5, "context": 1, "stream": 7, "correlation": 205386, "bytes": 8, "memory bandwidth (GB/s)": 0.005211726384364821 } }, { "ph": "f", "id": 205386, "pid": 5, "tid": 7, "ts": 1716454224700067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700052, "dur": 12, "args": { "External id": 205386, "cbid": 41, "correlation": 205386 } }, { "ph": "s", "id": 205386, "pid": 76337, "tid": -914061504, "ts": 1716454224700052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700066, "dur": 8, "args": { "External id": 205387, "cbid": 131, "correlation": 205387 } }, { "ph": "f", "id": 205387, "pid": 76337, "tid": -914061504, "ts": 1716454224700066, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700155, "dur": 2, "args": { "External id": 205395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205395, "pid": 5, "tid": 7, "ts": 1716454224700155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700141, "dur": 14, "args": { "External id": 205395, "cbid": 211, "correlation": 205395 } }, { "ph": "s", "id": 205395, "pid": 76337, "tid": -914061504, "ts": 1716454224700141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700227, "dur": 3, "args": { "External id": 205404, "device": 5, "context": 1, "stream": 7, "correlation": 205404, "bytes": 8, "memory bandwidth (GB/s)": 0.002551020408163265 } }, { "ph": "f", "id": 205404, "pid": 5, "tid": 7, "ts": 1716454224700227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700209, "dur": 18, "args": { "External id": 205404, "cbid": 41, "correlation": 205404 } }, { "ph": "s", "id": 205404, "pid": 76337, "tid": -914061504, "ts": 1716454224700209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224700297, "dur": 3, "args": { "External id": 205414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205414, "registers per thread": 18, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205414, "pid": 5, "tid": 7, "ts": 1716454224700297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700283, "dur": 13, "args": { "External id": 205414, "cbid": 211, "correlation": 205414 } }, { "ph": "s", "id": 205414, "pid": 76337, "tid": -914061504, "ts": 1716454224700283, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700349, "dur": 1, "args": { "External id": 205424, "device": 5, "context": 1, "stream": 7, "correlation": 205424, "bytes": 8, "memory bandwidth (GB/s)": 0.005208333333333333 } }, { "ph": "f", "id": 205424, "pid": 5, "tid": 7, "ts": 1716454224700349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700335, "dur": 12, "args": { "External id": 205424, "cbid": 41, "correlation": 205424 } }, { "ph": "s", "id": 205424, "pid": 76337, "tid": -914061504, "ts": 1716454224700335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700348, "dur": 8, "args": { "External id": 205425, "cbid": 131, "correlation": 205425 } }, { "ph": "f", "id": 205425, "pid": 76337, "tid": -914061504, "ts": 1716454224700348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700410, "dur": 3, "args": { "External id": 205432, "device": 5, "context": 1, "stream": 7, "correlation": 205432, "bytes": 98304, "memory bandwidth (GB/s)": 31.670103092783506 } }, { "ph": "f", "id": 205432, "pid": 5, "tid": 7, "ts": 1716454224700410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700390, "dur": 19, "args": { "External id": 205432, "cbid": 41, "correlation": 205432 } }, { "ph": "s", "id": 205432, "pid": 76337, "tid": -914061504, "ts": 1716454224700390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700457, "dur": 1, "args": { "External id": 205443, "device": 5, "context": 1, "stream": 7, "correlation": 205443, "bytes": 2, "memory bandwidth (GB/s)": 0.0013020833333333333 } }, { "ph": "f", "id": 205443, "pid": 5, "tid": 7, "ts": 1716454224700457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700445, "dur": 10, "args": { "External id": 205443, "cbid": 41, "correlation": 205443 } }, { "ph": "s", "id": 205443, "pid": 76337, "tid": -914061504, "ts": 1716454224700445, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700455, "dur": 8, "args": { "External id": 205444, "cbid": 131, "correlation": 205444 } }, { "ph": "f", "id": 205444, "pid": 76337, "tid": -914061504, "ts": 1716454224700455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700505, "dur": 3, "args": { "External id": 205452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205452, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205452, "pid": 5, "tid": 7, "ts": 1716454224700505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700492, "dur": 12, "args": { "External id": 205452, "cbid": 211, "correlation": 205452 } }, { "ph": "s", "id": 205452, "pid": 76337, "tid": -914061504, "ts": 1716454224700492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700534, "dur": 3, "args": { "External id": 205462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205462, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205462, "pid": 5, "tid": 7, "ts": 1716454224700534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700525, "dur": 8, "args": { "External id": 205462, "cbid": 211, "correlation": 205462 } }, { "ph": "s", "id": 205462, "pid": 76337, "tid": -914061504, "ts": 1716454224700525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700556, "dur": 3, "args": { "External id": 205471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205471, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205471, "pid": 5, "tid": 7, "ts": 1716454224700556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700547, "dur": 7, "args": { "External id": 205471, "cbid": 211, "correlation": 205471 } }, { "ph": "s", "id": 205471, "pid": 76337, "tid": -914061504, "ts": 1716454224700547, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224700628, "dur": 5, "args": { "External id": 205479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205479, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205479, "pid": 5, "tid": 7, "ts": 1716454224700628, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700611, "dur": 18, "args": { "External id": 205479, "cbid": 211, "correlation": 205479 } }, { "ph": "s", "id": 205479, "pid": 76337, "tid": -914061504, "ts": 1716454224700611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700670, "dur": 3, "args": { "External id": 205488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205488, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205488, "pid": 5, "tid": 7, "ts": 1716454224700670, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700659, "dur": 10, "args": { "External id": 205488, "cbid": 211, "correlation": 205488 } }, { "ph": "s", "id": 205488, "pid": 76337, "tid": -914061504, "ts": 1716454224700659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700694, "dur": 3, "args": { "External id": 205497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205497, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205497, "pid": 5, "tid": 7, "ts": 1716454224700694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700685, "dur": 7, "args": { "External id": 205497, "cbid": 211, "correlation": 205497 } }, { "ph": "s", "id": 205497, "pid": 76337, "tid": -914061504, "ts": 1716454224700685, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224700755, "dur": 3, "args": { "External id": 205505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205505, "registers per thread": 17, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205505, "pid": 5, "tid": 7, "ts": 1716454224700755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224700744, "dur": 9, "args": { "External id": 205505, "cbid": 211, "correlation": 205505 } }, { "ph": "s", "id": 205505, "pid": 76337, "tid": -914061504, "ts": 1716454224700744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224700816, "dur": 1, "args": { "External id": 205513, "device": 5, "context": 1, "stream": 7, "correlation": 205513, "bytes": 8, "memory bandwidth (GB/s)": 0.004464285714285714 } }, { "ph": "f", "id": 205513, "pid": 5, "tid": 7, "ts": 1716454224700816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700798, "dur": 28, "args": { "External id": 205513, "cbid": 41, "correlation": 205513 } }, { "ph": "s", "id": 205513, "pid": 76337, "tid": -914061504, "ts": 1716454224700798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700827, "dur": 3, "args": { "External id": 205514, "cbid": 131, "correlation": 205514 } }, { "ph": "f", "id": 205514, "pid": 76337, "tid": -914061504, "ts": 1716454224700827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700889, "dur": 1, "args": { "External id": 205524, "device": 5, "context": 1, "stream": 7, "correlation": 205524, "bytes": 42, "memory bandwidth (GB/s)": 0.02734375 } }, { "ph": "f", "id": 205524, "pid": 5, "tid": 7, "ts": 1716454224700889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700876, "dur": 10, "args": { "External id": 205524, "cbid": 41, "correlation": 205524 } }, { "ph": "s", "id": 205524, "pid": 76337, "tid": -914061504, "ts": 1716454224700876, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700887, "dur": 7, "args": { "External id": 205525, "cbid": 131, "correlation": 205525 } }, { "ph": "f", "id": 205525, "pid": 76337, "tid": -914061504, "ts": 1716454224700887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224700943, "dur": 1, "args": { "External id": 205534, "device": 5, "context": 1, "stream": 7, "correlation": 205534, "bytes": 8, "memory bandwidth (GB/s)": 0.005319148936170213 } }, { "ph": "f", "id": 205534, "pid": 5, "tid": 7, "ts": 1716454224700943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224700933, "dur": 8, "args": { "External id": 205534, "cbid": 41, "correlation": 205534 } }, { "ph": "s", "id": 205534, "pid": 76337, "tid": -914061504, "ts": 1716454224700933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224700942, "dur": 8, "args": { "External id": 205535, "cbid": 131, "correlation": 205535 } }, { "ph": "f", "id": 205535, "pid": 76337, "tid": -914061504, "ts": 1716454224700942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701029, "dur": 4, "args": { "External id": 205542, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205542, "pid": 5, "tid": 7, "ts": 1716454224701029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701009, "dur": 22, "args": { "External id": 205542, "cbid": 211, "correlation": 205542 } }, { "ph": "s", "id": 205542, "pid": 76337, "tid": -914061504, "ts": 1716454224701009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceReduceSingleTileKernel::Policy600, at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int>(at_cuda_detail::cub::TransformInputIterator, bool*, long>, int*, int, at_cuda_detail::cub::Sum, int)", "pid": 5, "tid": 7, "ts": 1716454224701067, "dur": 4, "args": { "External id": 205562, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205562, "registers per thread": 40, "shared memory": 44, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205562, "pid": 5, "tid": 7, "ts": 1716454224701067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701055, "dur": 12, "args": { "External id": 205562, "cbid": 211, "correlation": 205562 } }, { "ph": "s", "id": 205562, "pid": 76337, "tid": -914061504, "ts": 1716454224701055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701068, "dur": 0, "args": { "External id": 205563, "cbid": 11, "correlation": 205563 } }, { "ph": "f", "id": 205563, "pid": 76337, "tid": -914061504, "ts": 1716454224701068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701069, "dur": 0, "args": { "External id": 205564, "cbid": 11, "correlation": 205564 } }, { "ph": "f", "id": 205564, "pid": 76337, "tid": -914061504, "ts": 1716454224701069, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224701082, "dur": 1, "args": { "External id": 205567, "device": 5, "context": 1, "stream": 7, "correlation": 205567, "bytes": 4, "memory bandwidth (GB/s)": 0.0023584905660377358 } }, { "ph": "f", "id": 205567, "pid": 5, "tid": 7, "ts": 1716454224701082, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224701070, "dur": 21, "args": { "External id": 205567, "cbid": 41, "correlation": 205567 } }, { "ph": "s", "id": 205567, "pid": 76337, "tid": -914061504, "ts": 1716454224701070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224701092, "dur": 3, "args": { "External id": 205568, "cbid": 131, "correlation": 205568 } }, { "ph": "f", "id": 205568, "pid": 76337, "tid": -914061504, "ts": 1716454224701092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceCompactInitKernel, int*>(at_cuda_detail::cub::ScanTileState, int, int*)", "pid": 5, "tid": 7, "ts": 1716454224701120, "dur": 3, "args": { "External id": 205592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205592, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205592, "pid": 5, "tid": 7, "ts": 1716454224701120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701110, "dur": 9, "args": { "External id": 205592, "cbid": 211, "correlation": 205592 } }, { "ph": "s", "id": 205592, "pid": 76337, "tid": -914061504, "ts": 1716454224701110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701120, "dur": 0, "args": { "External id": 205593, "cbid": 11, "correlation": 205593 } }, { "ph": "f", "id": 205593, "pid": 76337, "tid": -914061504, "ts": 1716454224701120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701120, "dur": 0, "args": { "External id": 205594, "cbid": 11, "correlation": 205594 } }, { "ph": "f", "id": 205594, "pid": 76337, "tid": -914061504, "ts": 1716454224701120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 76337, "tid": -914061504, "ts": 1716454224701122, "dur": 1, "args": { "External id": 205596, "cbid": 200, "correlation": 205596 } }, { "ph": "f", "id": 205596, "pid": 76337, "tid": -914061504, "ts": 1716454224701122, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at_cuda_detail::cub::DeviceSelectSweepKernel, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>::PtxSelectIfPolicyT, at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, false>(at_cuda_detail::cub::CountingInputIterator, at_cuda_detail::cub::TransformInputIterator, bool*, long>, long*, int*, at_cuda_detail::cub::ScanTileState, at_cuda_detail::cub::NullType, at_cuda_detail::cub::NullType, int, int)", "pid": 5, "tid": 7, "ts": 1716454224701132, "dur": 4, "args": { "External id": 205598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205598, "registers per thread": 40, "shared memory": 5120, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205598, "pid": 5, "tid": 7, "ts": 1716454224701132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701125, "dur": 8, "args": { "External id": 205598, "cbid": 211, "correlation": 205598 } }, { "ph": "s", "id": 205598, "pid": 76337, "tid": -914061504, "ts": 1716454224701125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701133, "dur": 0, "args": { "External id": 205599, "cbid": 11, "correlation": 205599 } }, { "ph": "f", "id": 205599, "pid": 76337, "tid": -914061504, "ts": 1716454224701133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaPeekAtLastError", "pid": 76337, "tid": -914061504, "ts": 1716454224701134, "dur": 0, "args": { "External id": 205600, "cbid": 11, "correlation": 205600 } }, { "ph": "f", "id": 205600, "pid": 76337, "tid": -914061504, "ts": 1716454224701134, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 5, "tid": 7, "ts": 1716454224701171, "dur": 1, "args": { "External id": 205607, "device": 5, "context": 1, "stream": 7, "correlation": 205607, "bytes": 8, "memory bandwidth (GB/s)": 0.004807692307692308 } }, { "ph": "f", "id": 205607, "pid": 5, "tid": 7, "ts": 1716454224701171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224701159, "dur": 20, "args": { "External id": 205607, "cbid": 41, "correlation": 205607 } }, { "ph": "s", "id": 205607, "pid": 76337, "tid": -914061504, "ts": 1716454224701159, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224701180, "dur": 3, "args": { "External id": 205608, "cbid": 131, "correlation": 205608 } }, { "ph": "f", "id": 205608, "pid": 76337, "tid": -914061504, "ts": 1716454224701180, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 5, "tid": 7, "ts": 1716454224701230, "dur": 1, "args": { "External id": 205618, "device": 5, "context": 1, "stream": 7, "correlation": 205618, "bytes": 8, "memory bandwidth (GB/s)": 0.0047169811320754715 } }, { "ph": "f", "id": 205618, "pid": 5, "tid": 7, "ts": 1716454224701230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224701218, "dur": 9, "args": { "External id": 205618, "cbid": 41, "correlation": 205618 } }, { "ph": "s", "id": 205618, "pid": 76337, "tid": -914061504, "ts": 1716454224701218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 76337, "tid": -914061504, "ts": 1716454224701229, "dur": 8, "args": { "External id": 205619, "cbid": 131, "correlation": 205619 } }, { "ph": "f", "id": 205619, "pid": 76337, "tid": -914061504, "ts": 1716454224701229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void at::native::index_elementwise_kernel<128, 4, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_index_kernel >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1}>(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef, at::native::index_kernel_impl >(at::TensorIteratorBase&, c10::ArrayRef, c10::ArrayRef)::{lambda(char*, char*, long)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701299, "dur": 5, "args": { "External id": 205626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205626, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205626, "pid": 5, "tid": 7, "ts": 1716454224701299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701284, "dur": 16, "args": { "External id": 205626, "cbid": 211, "correlation": 205626 } }, { "ph": "s", "id": 205626, "pid": 76337, "tid": -914061504, "ts": 1716454224701284, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, c10::Half)::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701370, "dur": 3, "args": { "External id": 205635, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205635, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205635, "pid": 5, "tid": 7, "ts": 1716454224701370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701357, "dur": 12, "args": { "External id": 205635, "cbid": 211, "correlation": 205635 } }, { "ph": "s", "id": 205635, "pid": 76337, "tid": -914061504, "ts": 1716454224701357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, at::detail::Array >(int, at::native::CUDAFunctorOnSelf_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701405, "dur": 3, "args": { "External id": 205643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205643, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205643, "pid": 5, "tid": 7, "ts": 1716454224701405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701394, "dur": 10, "args": { "External id": 205643, "cbid": 211, "correlation": 205643 } }, { "ph": "s", "id": 205643, "pid": 76337, "tid": -914061504, "ts": 1716454224701394, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701436, "dur": 4, "args": { "External id": 205651, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205651, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205651, "pid": 5, "tid": 7, "ts": 1716454224701436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701425, "dur": 11, "args": { "External id": 205651, "cbid": 211, "correlation": 205651 } }, { "ph": "s", "id": 205651, "pid": 76337, "tid": -914061504, "ts": 1716454224701425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#5}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701467, "dur": 4, "args": { "External id": 205659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205659, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205659, "pid": 5, "tid": 7, "ts": 1716454224701467, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701456, "dur": 9, "args": { "External id": 205659, "cbid": 211, "correlation": 205659 } }, { "ph": "s", "id": 205659, "pid": 76337, "tid": -914061504, "ts": 1716454224701456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701492, "dur": 3, "args": { "External id": 205667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205667, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205667, "pid": 5, "tid": 7, "ts": 1716454224701492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701483, "dur": 8, "args": { "External id": 205667, "cbid": 211, "correlation": 205667 } }, { "ph": "s", "id": 205667, "pid": 76337, "tid": -914061504, "ts": 1716454224701483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701522, "dur": 3, "args": { "External id": 205675, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205675, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205675, "pid": 5, "tid": 7, "ts": 1716454224701522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701509, "dur": 14, "args": { "External id": 205675, "cbid": 211, "correlation": 205675 } }, { "ph": "s", "id": 205675, "pid": 76337, "tid": -914061504, "ts": 1716454224701509, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701546, "dur": 4, "args": { "External id": 205683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205683, "pid": 5, "tid": 7, "ts": 1716454224701546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701537, "dur": 8, "args": { "External id": 205683, "cbid": 211, "correlation": 205683 } }, { "ph": "s", "id": 205683, "pid": 76337, "tid": -914061504, "ts": 1716454224701537, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701565, "dur": 5, "args": { "External id": 205691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205691, "pid": 5, "tid": 7, "ts": 1716454224701565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701557, "dur": 6, "args": { "External id": 205691, "cbid": 211, "correlation": 205691 } }, { "ph": "s", "id": 205691, "pid": 76337, "tid": -914061504, "ts": 1716454224701557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701584, "dur": 3, "args": { "External id": 205699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205699, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205699, "pid": 5, "tid": 7, "ts": 1716454224701584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701576, "dur": 7, "args": { "External id": 205699, "cbid": 211, "correlation": 205699 } }, { "ph": "s", "id": 205699, "pid": 76337, "tid": -914061504, "ts": 1716454224701576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnOther_add, at::detail::Array >(int, at::native::CUDAFunctorOnOther_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701644, "dur": 3, "args": { "External id": 205707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205707, "registers per thread": 17, "shared memory": 0, "blocks per SM": 0.3, "warps per SM": 1.2, "grid": [24, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 205707, "pid": 5, "tid": 7, "ts": 1716454224701644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701631, "dur": 12, "args": { "External id": 205707, "cbid": 211, "correlation": 205707 } }, { "ph": "s", "id": 205707, "pid": 76337, "tid": -914061504, "ts": 1716454224701631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701669, "dur": 4, "args": { "External id": 205715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205715, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205715, "pid": 5, "tid": 7, "ts": 1716454224701669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701661, "dur": 7, "args": { "External id": 205715, "cbid": 211, "correlation": 205715 } }, { "ph": "s", "id": 205715, "pid": 76337, "tid": -914061504, "ts": 1716454224701661, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224701692, "dur": 4, "args": { "External id": 205723, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205723, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205723, "pid": 5, "tid": 7, "ts": 1716454224701692, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701683, "dur": 8, "args": { "External id": 205723, "cbid": 211, "correlation": 205723 } }, { "ph": "s", "id": 205723, "pid": 76337, "tid": -914061504, "ts": 1716454224701683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224701710, "dur": 3, "args": { "External id": 205731, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205731, "registers per thread": 19, "shared memory": 0, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [96, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 205731, "pid": 5, "tid": 7, "ts": 1716454224701710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224701703, "dur": 6, "args": { "External id": 205731, "cbid": 211, "correlation": 205731 } }, { "ph": "s", "id": 205731, "pid": 76337, "tid": -914061504, "ts": 1716454224701703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224702142, "dur": 5, "args": { "External id": 205740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205740, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205740, "pid": 5, "tid": 7, "ts": 1716454224702142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702124, "dur": 19, "args": { "External id": 205740, "cbid": 211, "correlation": 205740 } }, { "ph": "s", "id": 205740, "pid": 76337, "tid": -914061504, "ts": 1716454224702124, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224702182, "dur": 5, "args": { "External id": 205749, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205749, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205749, "pid": 5, "tid": 7, "ts": 1716454224702182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702171, "dur": 10, "args": { "External id": 205749, "cbid": 211, "correlation": 205749 } }, { "ph": "s", "id": 205749, "pid": 76337, "tid": -914061504, "ts": 1716454224702171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 5, "tid": 7, "ts": 1716454224702312, "dur": 3, "args": { "External id": 205765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205765, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.075, "grid": [3, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205765, "pid": 5, "tid": 7, "ts": 1716454224702312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702298, "dur": 14, "args": { "External id": 205765, "cbid": 211, "correlation": 205765 } }, { "ph": "s", "id": 205765, "pid": 76337, "tid": -914061504, "ts": 1716454224702298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702346, "dur": 3, "args": { "External id": 205773, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205773, "pid": 5, "tid": 7, "ts": 1716454224702346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702336, "dur": 8, "args": { "External id": 205773, "cbid": 211, "correlation": 205773 } }, { "ph": "s", "id": 205773, "pid": 76337, "tid": -914061504, "ts": 1716454224702336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702381, "dur": 3, "args": { "External id": 205781, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205781, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205781, "pid": 5, "tid": 7, "ts": 1716454224702381, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702371, "dur": 9, "args": { "External id": 205781, "cbid": 211, "correlation": 205781 } }, { "ph": "s", "id": 205781, "pid": 76337, "tid": -914061504, "ts": 1716454224702371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702414, "dur": 4, "args": { "External id": 205789, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205789, "registers per thread": 19, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205789, "pid": 5, "tid": 7, "ts": 1716454224702414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702403, "dur": 10, "args": { "External id": 205789, "cbid": 211, "correlation": 205789 } }, { "ph": "s", "id": 205789, "pid": 76337, "tid": -914061504, "ts": 1716454224702403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1} const&)::{lambda(int)#2})", "pid": 5, "tid": 7, "ts": 1716454224702469, "dur": 4, "args": { "External id": 205801, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205801, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.05, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205801, "pid": 5, "tid": 7, "ts": 1716454224702469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702456, "dur": 13, "args": { "External id": 205801, "cbid": 211, "correlation": 205801 } }, { "ph": "s", "id": 205801, "pid": 76337, "tid": -914061504, "ts": 1716454224702456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224702519, "dur": 4, "args": { "External id": 205812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205812, "pid": 5, "tid": 7, "ts": 1716454224702519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702502, "dur": 17, "args": { "External id": 205812, "cbid": 211, "correlation": 205812 } }, { "ph": "s", "id": 205812, "pid": 76337, "tid": -914061504, "ts": 1716454224702502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702550, "dur": 3, "args": { "External id": 205820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205820, "pid": 5, "tid": 7, "ts": 1716454224702550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702541, "dur": 8, "args": { "External id": 205820, "cbid": 211, "correlation": 205820 } }, { "ph": "s", "id": 205820, "pid": 76337, "tid": -914061504, "ts": 1716454224702541, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::sin_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702588, "dur": 5, "args": { "External id": 205828, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205828, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205828, "pid": 5, "tid": 7, "ts": 1716454224702588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702575, "dur": 14, "args": { "External id": 205828, "cbid": 211, "correlation": 205828 } }, { "ph": "s", "id": 205828, "pid": 76337, "tid": -914061504, "ts": 1716454224702575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::cos_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224702622, "dur": 5, "args": { "External id": 205836, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205836, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.0375, "warps per SM": 0.15, "grid": [3, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205836, "pid": 5, "tid": 7, "ts": 1716454224702622, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702611, "dur": 10, "args": { "External id": 205836, "cbid": 211, "correlation": 205836 } }, { "ph": "s", "id": 205836, "pid": 76337, "tid": -914061504, "ts": 1716454224702611, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224702655, "dur": 4, "args": { "External id": 205845, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205845, "registers per thread": 22, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205845, "pid": 5, "tid": 7, "ts": 1716454224702655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702642, "dur": 13, "args": { "External id": 205845, "cbid": 211, "correlation": 205845 } }, { "ph": "s", "id": 205845, "pid": 76337, "tid": -914061504, "ts": 1716454224702642, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224702720, "dur": 4, "args": { "External id": 205858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205858, "registers per thread": 24, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205858, "pid": 5, "tid": 7, "ts": 1716454224702720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702705, "dur": 16, "args": { "External id": 205858, "cbid": 211, "correlation": 205858 } }, { "ph": "s", "id": 205858, "pid": 76337, "tid": -914061504, "ts": 1716454224702705, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 5, "tid": 7, "ts": 1716454224702764, "dur": 5, "args": { "External id": 205868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205868, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.0625, "warps per SM": 0.25, "grid": [5, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 205868, "pid": 5, "tid": 7, "ts": 1716454224702764, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702753, "dur": 11, "args": { "External id": 205868, "cbid": 211, "correlation": 205868 } }, { "ph": "s", "id": 205868, "pid": 76337, "tid": -914061504, "ts": 1716454224702753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224702896, "dur": 6, "args": { "External id": 205885, "cbid": 251, "correlation": 205885 } }, { "ph": "f", "id": 205885, "pid": 76337, "tid": -914061504, "ts": 1716454224702896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void gemmSN_TN_kernel, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half> >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched<__half const>, cublasGemvTensorStridedBatched<__half>, float>)", "pid": 5, "tid": 7, "ts": 1716454224702926, "dur": 12, "args": { "External id": 205887, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205887, "registers per thread": 72, "shared memory": 13824, "blocks per SM": 2, "warps per SM": 8, "grid": [160, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 205887, "pid": 5, "tid": 7, "ts": 1716454224702926, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702910, "dur": 17, "args": { "External id": 205887, "cbid": 211, "correlation": 205887 } }, { "ph": "s", "id": 205887, "pid": 76337, "tid": -914061504, "ts": 1716454224702910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224703001, "dur": 4, "args": { "External id": 205895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205895, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 205895, "pid": 5, "tid": 7, "ts": 1716454224703001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224702984, "dur": 16, "args": { "External id": 205895, "cbid": 211, "correlation": 205895 } }, { "ph": "s", "id": 205895, "pid": 76337, "tid": -914061504, "ts": 1716454224702984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224703061, "dur": 2, "args": { "External id": 205911, "cbid": 251, "correlation": 205911 } }, { "ph": "f", "id": 205911, "pid": 76337, "tid": -914061504, "ts": 1716454224703061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224703067, "dur": 0, "args": { "External id": 205913, "cbid": 251, "correlation": 205913 } }, { "ph": "f", "id": 205913, "pid": 76337, "tid": -914061504, "ts": 1716454224703067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224703084, "dur": 13, "args": { "External id": 205914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205914, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 205914, "pid": 5, "tid": 7, "ts": 1716454224703084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703070, "dur": 14, "args": { "External id": 205914, "cbid": 211, "correlation": 205914 } }, { "ph": "s", "id": 205914, "pid": 76337, "tid": -914061504, "ts": 1716454224703070, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224703100, "dur": 5, "args": { "External id": 205916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205916, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 205916, "pid": 5, "tid": 7, "ts": 1716454224703100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703090, "dur": 8, "args": { "External id": 205916, "cbid": 211, "correlation": 205916 } }, { "ph": "s", "id": 205916, "pid": 76337, "tid": -914061504, "ts": 1716454224703090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224703204, "dur": 1, "args": { "External id": 205926, "cbid": 317, "correlation": 205926 } }, { "ph": "f", "id": 205926, "pid": 76337, "tid": -914061504, "ts": 1716454224703204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224703206, "dur": 1, "args": { "External id": 205927, "cbid": 203, "correlation": 205927 } }, { "ph": "f", "id": 205927, "pid": 76337, "tid": -914061504, "ts": 1716454224703206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224703208, "dur": 1, "args": { "External id": 205928, "cbid": 205, "correlation": 205928 } }, { "ph": "f", "id": 205928, "pid": 76337, "tid": -914061504, "ts": 1716454224703208, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224703265, "dur": 7, "args": { "External id": 205932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205932, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 9.6, "warps per SM": 76.8, "grid": [96, 1, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205932, "pid": 5, "tid": 7, "ts": 1716454224703265, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703251, "dur": 14, "args": { "External id": 205932, "cbid": 211, "correlation": 205932 } }, { "ph": "s", "id": 205932, "pid": 76337, "tid": -914061504, "ts": 1716454224703251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224703277, "dur": 4, "args": { "External id": 205934, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205934, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 4, "warps per SM": 32, "grid": [1, 1, 320], "block": [256, 1, 1], "est. achieved occupancy %": 50 } }, { "ph": "f", "id": 205934, "pid": 5, "tid": 7, "ts": 1716454224703277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703268, "dur": 7, "args": { "External id": 205934, "cbid": 211, "correlation": 205934 } }, { "ph": "s", "id": 205934, "pid": 76337, "tid": -914061504, "ts": 1716454224703268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224703297, "dur": 3, "args": { "External id": 205936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 205936, "pid": 5, "tid": 7, "ts": 1716454224703297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703288, "dur": 8, "args": { "External id": 205936, "cbid": 211, "correlation": 205936 } }, { "ph": "s", "id": 205936, "pid": 76337, "tid": -914061504, "ts": 1716454224703288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224703303, "dur": 0, "args": { "External id": 205937, "cbid": 51, "correlation": 205937 } }, { "ph": "s", "id": 205937, "pid": 76337, "tid": -914061504, "ts": 1716454224703303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224703314, "dur": 84, "args": { "External id": 205938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205938, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 6, "warps per SM": 24, "grid": [96, 5, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 205938, "pid": 5, "tid": 7, "ts": 1716454224703314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703305, "dur": 8, "args": { "External id": 205938, "cbid": 211, "correlation": 205938 } }, { "ph": "s", "id": 205938, "pid": 76337, "tid": -914061504, "ts": 1716454224703305, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224703400, "dur": 60, "args": { "External id": 205943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205943, "pid": 5, "tid": 7, "ts": 1716454224703400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224703343, "dur": 10, "args": { "External id": 205943, "cbid": 211, "correlation": 205943 } }, { "ph": "s", "id": 205943, "pid": 76337, "tid": -914061504, "ts": 1716454224703343, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224705168, "dur": 52, "args": { "External id": 205963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205963, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 205963, "pid": 5, "tid": 7, "ts": 1716454224705168, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705147, "dur": 21, "args": { "External id": 205963, "cbid": 211, "correlation": 205963 } }, { "ph": "s", "id": 205963, "pid": 76337, "tid": -914061504, "ts": 1716454224705147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224705222, "dur": 5, "args": { "External id": 205975, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205975, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 205975, "pid": 5, "tid": 7, "ts": 1716454224705222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705180, "dur": 8, "args": { "External id": 205975, "cbid": 211, "correlation": 205975 } }, { "ph": "s", "id": 205975, "pid": 76337, "tid": -914061504, "ts": 1716454224705180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224705227, "dur": 56, "args": { "External id": 205978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205978, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205978, "pid": 5, "tid": 7, "ts": 1716454224705227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705206, "dur": 7, "args": { "External id": 205978, "cbid": 211, "correlation": 205978 } }, { "ph": "s", "id": 205978, "pid": 76337, "tid": -914061504, "ts": 1716454224705206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224705285, "dur": 36, "args": { "External id": 205987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 205987, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 205987, "pid": 5, "tid": 7, "ts": 1716454224705285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705252, "dur": 10, "args": { "External id": 205987, "cbid": 211, "correlation": 205987 } }, { "ph": "s", "id": 205987, "pid": 76337, "tid": -914061504, "ts": 1716454224705252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224705311, "dur": 0, "args": { "External id": 205997, "cbid": 317, "correlation": 205997 } }, { "ph": "f", "id": 205997, "pid": 76337, "tid": -914061504, "ts": 1716454224705311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224705312, "dur": 0, "args": { "External id": 205998, "cbid": 203, "correlation": 205998 } }, { "ph": "f", "id": 205998, "pid": 76337, "tid": -914061504, "ts": 1716454224705312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224705313, "dur": 0, "args": { "External id": 205999, "cbid": 205, "correlation": 205999 } }, { "ph": "f", "id": 205999, "pid": 76337, "tid": -914061504, "ts": 1716454224705313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224705344, "dur": 40, "args": { "External id": 206003, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206003, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206003, "pid": 5, "tid": 7, "ts": 1716454224705344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705331, "dur": 13, "args": { "External id": 206003, "cbid": 211, "correlation": 206003 } }, { "ph": "s", "id": 206003, "pid": 76337, "tid": -914061504, "ts": 1716454224705331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224705386, "dur": 14, "args": { "External id": 206005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206005, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206005, "pid": 5, "tid": 7, "ts": 1716454224705386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705347, "dur": 6, "args": { "External id": 206005, "cbid": 211, "correlation": 206005 } }, { "ph": "s", "id": 206005, "pid": 76337, "tid": -914061504, "ts": 1716454224705347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224705401, "dur": 3, "args": { "External id": 206007, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206007, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206007, "pid": 5, "tid": 7, "ts": 1716454224705401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705359, "dur": 6, "args": { "External id": 206007, "cbid": 211, "correlation": 206007 } }, { "ph": "s", "id": 206007, "pid": 76337, "tid": -914061504, "ts": 1716454224705359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224705368, "dur": 0, "args": { "External id": 206008, "cbid": 51, "correlation": 206008 } }, { "ph": "s", "id": 206008, "pid": 76337, "tid": -914061504, "ts": 1716454224705368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224705405, "dur": 706, "args": { "External id": 206009, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206009, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206009, "pid": 5, "tid": 7, "ts": 1716454224705405, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705369, "dur": 7, "args": { "External id": 206009, "cbid": 211, "correlation": 206009 } }, { "ph": "s", "id": 206009, "pid": 76337, "tid": -914061504, "ts": 1716454224705369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224706112, "dur": 59, "args": { "External id": 206014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206014, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206014, "pid": 5, "tid": 7, "ts": 1716454224706112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705401, "dur": 9, "args": { "External id": 206014, "cbid": 211, "correlation": 206014 } }, { "ph": "s", "id": 206014, "pid": 76337, "tid": -914061504, "ts": 1716454224705401, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224706173, "dur": 4, "args": { "External id": 206022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206022, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206022, "pid": 5, "tid": 7, "ts": 1716454224706173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705444, "dur": 9, "args": { "External id": 206022, "cbid": 211, "correlation": 206022 } }, { "ph": "s", "id": 206022, "pid": 76337, "tid": -914061504, "ts": 1716454224705444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224705513, "dur": 1, "args": { "External id": 206038, "cbid": 251, "correlation": 206038 } }, { "ph": "f", "id": 206038, "pid": 76337, "tid": -914061504, "ts": 1716454224705513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224705519, "dur": 0, "args": { "External id": 206040, "cbid": 251, "correlation": 206040 } }, { "ph": "f", "id": 206040, "pid": 76337, "tid": -914061504, "ts": 1716454224705519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224706178, "dur": 9, "args": { "External id": 206041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206041, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 206041, "pid": 5, "tid": 7, "ts": 1716454224706178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705521, "dur": 11, "args": { "External id": 206041, "cbid": 211, "correlation": 206041 } }, { "ph": "s", "id": 206041, "pid": 76337, "tid": -914061504, "ts": 1716454224705521, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224706188, "dur": 4, "args": { "External id": 206043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206043, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 206043, "pid": 5, "tid": 7, "ts": 1716454224706188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705534, "dur": 7, "args": { "External id": 206043, "cbid": 211, "correlation": 206043 } }, { "ph": "s", "id": 206043, "pid": 76337, "tid": -914061504, "ts": 1716454224705534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224706193, "dur": 55, "args": { "External id": 206053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206053, "pid": 5, "tid": 7, "ts": 1716454224706193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705596, "dur": 12, "args": { "External id": 206053, "cbid": 211, "correlation": 206053 } }, { "ph": "s", "id": 206053, "pid": 76337, "tid": -914061504, "ts": 1716454224705596, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224706250, "dur": 51, "args": { "External id": 206073, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206073, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 206073, "pid": 5, "tid": 7, "ts": 1716454224706250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705663, "dur": 10, "args": { "External id": 206073, "cbid": 211, "correlation": 206073 } }, { "ph": "s", "id": 206073, "pid": 76337, "tid": -914061504, "ts": 1716454224705663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224706302, "dur": 4, "args": { "External id": 206085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206085, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206085, "pid": 5, "tid": 7, "ts": 1716454224706302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705683, "dur": 7, "args": { "External id": 206085, "cbid": 211, "correlation": 206085 } }, { "ph": "s", "id": 206085, "pid": 76337, "tid": -914061504, "ts": 1716454224705683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224706308, "dur": 56, "args": { "External id": 206088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206088, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206088, "pid": 5, "tid": 7, "ts": 1716454224706308, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705704, "dur": 6, "args": { "External id": 206088, "cbid": 211, "correlation": 206088 } }, { "ph": "s", "id": 206088, "pid": 76337, "tid": -914061504, "ts": 1716454224705704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224706364, "dur": 37, "args": { "External id": 206097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206097, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206097, "pid": 5, "tid": 7, "ts": 1716454224706364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705744, "dur": 10, "args": { "External id": 206097, "cbid": 211, "correlation": 206097 } }, { "ph": "s", "id": 206097, "pid": 76337, "tid": -914061504, "ts": 1716454224705744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224705814, "dur": 0, "args": { "External id": 206107, "cbid": 317, "correlation": 206107 } }, { "ph": "f", "id": 206107, "pid": 76337, "tid": -914061504, "ts": 1716454224705814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224705815, "dur": 0, "args": { "External id": 206108, "cbid": 203, "correlation": 206108 } }, { "ph": "f", "id": 206108, "pid": 76337, "tid": -914061504, "ts": 1716454224705815, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224705816, "dur": 0, "args": { "External id": 206109, "cbid": 205, "correlation": 206109 } }, { "ph": "f", "id": 206109, "pid": 76337, "tid": -914061504, "ts": 1716454224705816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224706402, "dur": 41, "args": { "External id": 206113, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206113, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206113, "pid": 5, "tid": 7, "ts": 1716454224706402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705830, "dur": 13, "args": { "External id": 206113, "cbid": 211, "correlation": 206113 } }, { "ph": "s", "id": 206113, "pid": 76337, "tid": -914061504, "ts": 1716454224705830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224706445, "dur": 14, "args": { "External id": 206115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206115, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206115, "pid": 5, "tid": 7, "ts": 1716454224706445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705845, "dur": 5, "args": { "External id": 206115, "cbid": 211, "correlation": 206115 } }, { "ph": "s", "id": 206115, "pid": 76337, "tid": -914061504, "ts": 1716454224705845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224706461, "dur": 4, "args": { "External id": 206117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206117, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206117, "pid": 5, "tid": 7, "ts": 1716454224706461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705854, "dur": 5, "args": { "External id": 206117, "cbid": 211, "correlation": 206117 } }, { "ph": "s", "id": 206117, "pid": 76337, "tid": -914061504, "ts": 1716454224705854, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224705862, "dur": 0, "args": { "External id": 206118, "cbid": 51, "correlation": 206118 } }, { "ph": "s", "id": 206118, "pid": 76337, "tid": -914061504, "ts": 1716454224705862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224706466, "dur": 699, "args": { "External id": 206119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206119, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206119, "pid": 5, "tid": 7, "ts": 1716454224706466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705863, "dur": 5, "args": { "External id": 206119, "cbid": 211, "correlation": 206119 } }, { "ph": "s", "id": 206119, "pid": 76337, "tid": -914061504, "ts": 1716454224705863, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224707166, "dur": 59, "args": { "External id": 206124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206124, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206124, "pid": 5, "tid": 7, "ts": 1716454224707166, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705892, "dur": 8, "args": { "External id": 206124, "cbid": 211, "correlation": 206124 } }, { "ph": "s", "id": 206124, "pid": 76337, "tid": -914061504, "ts": 1716454224705892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224707226, "dur": 50, "args": { "External id": 206132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206132, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206132, "pid": 5, "tid": 7, "ts": 1716454224707226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705924, "dur": 8, "args": { "External id": 206132, "cbid": 211, "correlation": 206132 } }, { "ph": "s", "id": 206132, "pid": 76337, "tid": -914061504, "ts": 1716454224705924, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224707278, "dur": 35, "args": { "External id": 206140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206140, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206140, "pid": 5, "tid": 7, "ts": 1716454224707278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224705956, "dur": 9, "args": { "External id": 206140, "cbid": 211, "correlation": 206140 } }, { "ph": "s", "id": 206140, "pid": 76337, "tid": -914061504, "ts": 1716454224705956, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224707314, "dur": 53, "args": { "External id": 206160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206160, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 206160, "pid": 5, "tid": 7, "ts": 1716454224707314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706050, "dur": 13, "args": { "External id": 206160, "cbid": 211, "correlation": 206160 } }, { "ph": "s", "id": 206160, "pid": 76337, "tid": -914061504, "ts": 1716454224706050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224707368, "dur": 4, "args": { "External id": 206172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206172, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206172, "pid": 5, "tid": 7, "ts": 1716454224707368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706072, "dur": 6, "args": { "External id": 206172, "cbid": 211, "correlation": 206172 } }, { "ph": "s", "id": 206172, "pid": 76337, "tid": -914061504, "ts": 1716454224706072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224707373, "dur": 56, "args": { "External id": 206175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206175, "pid": 5, "tid": 7, "ts": 1716454224707373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706090, "dur": 7, "args": { "External id": 206175, "cbid": 211, "correlation": 206175 } }, { "ph": "s", "id": 206175, "pid": 76337, "tid": -914061504, "ts": 1716454224706090, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224706149, "dur": 0, "args": { "External id": 206186, "cbid": 317, "correlation": 206186 } }, { "ph": "f", "id": 206186, "pid": 76337, "tid": -914061504, "ts": 1716454224706149, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224706150, "dur": 0, "args": { "External id": 206187, "cbid": 203, "correlation": 206187 } }, { "ph": "f", "id": 206187, "pid": 76337, "tid": -914061504, "ts": 1716454224706150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224706150, "dur": 0, "args": { "External id": 206188, "cbid": 205, "correlation": 206188 } }, { "ph": "f", "id": 206188, "pid": 76337, "tid": -914061504, "ts": 1716454224706150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706185, "dur": 2, "args": { "External id": 206192, "cbid": 251, "correlation": 206192 } }, { "ph": "f", "id": 206192, "pid": 76337, "tid": -914061504, "ts": 1716454224706185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706189, "dur": 1, "args": { "External id": 206193, "cbid": 251, "correlation": 206193 } }, { "ph": "f", "id": 206193, "pid": 76337, "tid": -914061504, "ts": 1716454224706189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706191, "dur": 1, "args": { "External id": 206194, "cbid": 251, "correlation": 206194 } }, { "ph": "f", "id": 206194, "pid": 76337, "tid": -914061504, "ts": 1716454224706191, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706193, "dur": 1, "args": { "External id": 206195, "cbid": 251, "correlation": 206195 } }, { "ph": "f", "id": 206195, "pid": 76337, "tid": -914061504, "ts": 1716454224706193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706195, "dur": 1, "args": { "External id": 206196, "cbid": 251, "correlation": 206196 } }, { "ph": "f", "id": 206196, "pid": 76337, "tid": -914061504, "ts": 1716454224706195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706197, "dur": 1, "args": { "External id": 206197, "cbid": 251, "correlation": 206197 } }, { "ph": "f", "id": 206197, "pid": 76337, "tid": -914061504, "ts": 1716454224706197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706199, "dur": 1, "args": { "External id": 206198, "cbid": 251, "correlation": 206198 } }, { "ph": "f", "id": 206198, "pid": 76337, "tid": -914061504, "ts": 1716454224706199, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706201, "dur": 1, "args": { "External id": 206199, "cbid": 251, "correlation": 206199 } }, { "ph": "f", "id": 206199, "pid": 76337, "tid": -914061504, "ts": 1716454224706201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706204, "dur": 0, "args": { "External id": 206200, "cbid": 251, "correlation": 206200 } }, { "ph": "f", "id": 206200, "pid": 76337, "tid": -914061504, "ts": 1716454224706204, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224707430, "dur": 115, "args": { "External id": 206201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206201, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206201, "pid": 5, "tid": 7, "ts": 1716454224707430, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706208, "dur": 14, "args": { "External id": 206201, "cbid": 211, "correlation": 206201 } }, { "ph": "s", "id": 206201, "pid": 76337, "tid": -914061504, "ts": 1716454224706208, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224707547, "dur": 60, "args": { "External id": 206207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206207, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206207, "pid": 5, "tid": 7, "ts": 1716454224707547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706246, "dur": 9, "args": { "External id": 206207, "cbid": 211, "correlation": 206207 } }, { "ph": "s", "id": 206207, "pid": 76337, "tid": -914061504, "ts": 1716454224706246, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224707609, "dur": 560, "args": { "External id": 206216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206216, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206216, "pid": 5, "tid": 7, "ts": 1716454224707609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706347, "dur": 27, "args": { "External id": 206216, "cbid": 211, "correlation": 206216 } }, { "ph": "s", "id": 206216, "pid": 76337, "tid": -914061504, "ts": 1716454224706347, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224708170, "dur": 183, "args": { "External id": 206238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206238, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206238, "pid": 5, "tid": 7, "ts": 1716454224708170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706433, "dur": 12, "args": { "External id": 206238, "cbid": 211, "correlation": 206238 } }, { "ph": "s", "id": 206238, "pid": 76337, "tid": -914061504, "ts": 1716454224706433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706550, "dur": 2, "args": { "External id": 206249, "cbid": 251, "correlation": 206249 } }, { "ph": "f", "id": 206249, "pid": 76337, "tid": -914061504, "ts": 1716454224706550, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224708354, "dur": 196, "args": { "External id": 206250, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206250, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206250, "pid": 5, "tid": 7, "ts": 1716454224708354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706558, "dur": 15, "args": { "External id": 206250, "cbid": 211, "correlation": 206250 } }, { "ph": "s", "id": 206250, "pid": 76337, "tid": -914061504, "ts": 1716454224706558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706631, "dur": 1, "args": { "External id": 206261, "cbid": 251, "correlation": 206261 } }, { "ph": "f", "id": 206261, "pid": 76337, "tid": -914061504, "ts": 1716454224706631, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224708551, "dur": 190, "args": { "External id": 206262, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206262, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206262, "pid": 5, "tid": 7, "ts": 1716454224708551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706635, "dur": 11, "args": { "External id": 206262, "cbid": 211, "correlation": 206262 } }, { "ph": "s", "id": 206262, "pid": 76337, "tid": -914061504, "ts": 1716454224706635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706700, "dur": 1, "args": { "External id": 206273, "cbid": 251, "correlation": 206273 } }, { "ph": "f", "id": 206273, "pid": 76337, "tid": -914061504, "ts": 1716454224706700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224708742, "dur": 188, "args": { "External id": 206274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206274, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206274, "pid": 5, "tid": 7, "ts": 1716454224708742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706704, "dur": 12, "args": { "External id": 206274, "cbid": 211, "correlation": 206274 } }, { "ph": "s", "id": 206274, "pid": 76337, "tid": -914061504, "ts": 1716454224706704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224708932, "dur": 18774, "args": { "External id": 206295, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206295, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206295, "pid": 5, "tid": 7, "ts": 1716454224708932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706816, "dur": 14, "args": { "External id": 206295, "cbid": 211, "correlation": 206295 } }, { "ph": "s", "id": 206295, "pid": 76337, "tid": -914061504, "ts": 1716454224706816, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224706932, "dur": 2, "args": { "External id": 206313, "cbid": 251, "correlation": 206313 } }, { "ph": "f", "id": 206313, "pid": 76337, "tid": -914061504, "ts": 1716454224706932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224727707, "dur": 210, "args": { "External id": 206315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206315, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206315, "pid": 5, "tid": 7, "ts": 1716454224727707, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224706939, "dur": 14, "args": { "External id": 206315, "cbid": 211, "correlation": 206315 } }, { "ph": "s", "id": 206315, "pid": 76337, "tid": -914061504, "ts": 1716454224706939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224727919, "dur": 67, "args": { "External id": 206323, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206323, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206323, "pid": 5, "tid": 7, "ts": 1716454224727919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707022, "dur": 13, "args": { "External id": 206323, "cbid": 211, "correlation": 206323 } }, { "ph": "s", "id": 206323, "pid": 76337, "tid": -914061504, "ts": 1716454224707022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224727987, "dur": 96, "args": { "External id": 206331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206331, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206331, "pid": 5, "tid": 7, "ts": 1716454224727987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707062, "dur": 9, "args": { "External id": 206331, "cbid": 211, "correlation": 206331 } }, { "ph": "s", "id": 206331, "pid": 76337, "tid": -914061504, "ts": 1716454224707062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224728084, "dur": 54, "args": { "External id": 206342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206342, "pid": 5, "tid": 7, "ts": 1716454224728084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707148, "dur": 14, "args": { "External id": 206342, "cbid": 211, "correlation": 206342 } }, { "ph": "s", "id": 206342, "pid": 76337, "tid": -914061504, "ts": 1716454224707148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224728140, "dur": 93, "args": { "External id": 206364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206364, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206364, "pid": 5, "tid": 7, "ts": 1716454224728140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707182, "dur": 7, "args": { "External id": 206364, "cbid": 211, "correlation": 206364 } }, { "ph": "s", "id": 206364, "pid": 76337, "tid": -914061504, "ts": 1716454224707182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707266, "dur": 1, "args": { "External id": 206375, "cbid": 251, "correlation": 206375 } }, { "ph": "f", "id": 206375, "pid": 76337, "tid": -914061504, "ts": 1716454224707266, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224728234, "dur": 103, "args": { "External id": 206376, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206376, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206376, "pid": 5, "tid": 7, "ts": 1716454224728234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707271, "dur": 12, "args": { "External id": 206376, "cbid": 211, "correlation": 206376 } }, { "ph": "s", "id": 206376, "pid": 76337, "tid": -914061504, "ts": 1716454224707271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707354, "dur": 2, "args": { "External id": 206387, "cbid": 251, "correlation": 206387 } }, { "ph": "f", "id": 206387, "pid": 76337, "tid": -914061504, "ts": 1716454224707354, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707359, "dur": 0, "args": { "External id": 206388, "cbid": 251, "correlation": 206388 } }, { "ph": "f", "id": 206388, "pid": 76337, "tid": -914061504, "ts": 1716454224707359, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224728338, "dur": 10, "args": { "External id": 206389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206389, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 206389, "pid": 5, "tid": 7, "ts": 1716454224728338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707361, "dur": 14, "args": { "External id": 206389, "cbid": 211, "correlation": 206389 } }, { "ph": "s", "id": 206389, "pid": 76337, "tid": -914061504, "ts": 1716454224707361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224728350, "dur": 5, "args": { "External id": 206391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206391, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 206391, "pid": 5, "tid": 7, "ts": 1716454224728350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707379, "dur": 7, "args": { "External id": 206391, "cbid": 211, "correlation": 206391 } }, { "ph": "s", "id": 206391, "pid": 76337, "tid": -914061504, "ts": 1716454224707379, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707441, "dur": 1, "args": { "External id": 206402, "cbid": 251, "correlation": 206402 } }, { "ph": "f", "id": 206402, "pid": 76337, "tid": -914061504, "ts": 1716454224707441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707444, "dur": 0, "args": { "External id": 206403, "cbid": 251, "correlation": 206403 } }, { "ph": "f", "id": 206403, "pid": 76337, "tid": -914061504, "ts": 1716454224707444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224728356, "dur": 6, "args": { "External id": 206404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206404, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 206404, "pid": 5, "tid": 7, "ts": 1716454224728356, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707446, "dur": 12, "args": { "External id": 206404, "cbid": 211, "correlation": 206404 } }, { "ph": "s", "id": 206404, "pid": 76337, "tid": -914061504, "ts": 1716454224707446, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224728363, "dur": 3, "args": { "External id": 206406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206406, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 206406, "pid": 5, "tid": 7, "ts": 1716454224728363, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707460, "dur": 5, "args": { "External id": 206406, "cbid": 211, "correlation": 206406 } }, { "ph": "s", "id": 206406, "pid": 76337, "tid": -914061504, "ts": 1716454224707460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224728368, "dur": 156, "args": { "External id": 206427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206427, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206427, "pid": 5, "tid": 7, "ts": 1716454224728368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707535, "dur": 13, "args": { "External id": 206427, "cbid": 211, "correlation": 206427 } }, { "ph": "s", "id": 206427, "pid": 76337, "tid": -914061504, "ts": 1716454224707535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707633, "dur": 2, "args": { "External id": 206445, "cbid": 251, "correlation": 206445 } }, { "ph": "f", "id": 206445, "pid": 76337, "tid": -914061504, "ts": 1716454224707633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224728525, "dur": 107, "args": { "External id": 206447, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206447, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206447, "pid": 5, "tid": 7, "ts": 1716454224728525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707639, "dur": 15, "args": { "External id": 206447, "cbid": 211, "correlation": 206447 } }, { "ph": "s", "id": 206447, "pid": 76337, "tid": -914061504, "ts": 1716454224707639, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224728634, "dur": 34, "args": { "External id": 206455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206455, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206455, "pid": 5, "tid": 7, "ts": 1716454224728634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707712, "dur": 12, "args": { "External id": 206455, "cbid": 211, "correlation": 206455 } }, { "ph": "s", "id": 206455, "pid": 76337, "tid": -914061504, "ts": 1716454224707712, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224728669, "dur": 68, "args": { "External id": 206463, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206463, "pid": 5, "tid": 7, "ts": 1716454224728669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707753, "dur": 9, "args": { "External id": 206463, "cbid": 211, "correlation": 206463 } }, { "ph": "s", "id": 206463, "pid": 76337, "tid": -914061504, "ts": 1716454224707753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224728739, "dur": 93, "args": { "External id": 206485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206485, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206485, "pid": 5, "tid": 7, "ts": 1716454224728739, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707805, "dur": 10, "args": { "External id": 206485, "cbid": 211, "correlation": 206485 } }, { "ph": "s", "id": 206485, "pid": 76337, "tid": -914061504, "ts": 1716454224707805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224707897, "dur": 1, "args": { "External id": 206501, "cbid": 251, "correlation": 206501 } }, { "ph": "f", "id": 206501, "pid": 76337, "tid": -914061504, "ts": 1716454224707897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224728833, "dur": 580, "args": { "External id": 206503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206503, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206503, "pid": 5, "tid": 7, "ts": 1716454224728833, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707902, "dur": 12, "args": { "External id": 206503, "cbid": 211, "correlation": 206503 } }, { "ph": "s", "id": 206503, "pid": 76337, "tid": -914061504, "ts": 1716454224707902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224729414, "dur": 247, "args": { "External id": 206511, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206511, "pid": 5, "tid": 7, "ts": 1716454224729414, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224707990, "dur": 15, "args": { "External id": 206511, "cbid": 211, "correlation": 206511 } }, { "ph": "s", "id": 206511, "pid": 76337, "tid": -914061504, "ts": 1716454224707990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224729662, "dur": 252, "args": { "External id": 206519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206519, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206519, "pid": 5, "tid": 7, "ts": 1716454224729662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708026, "dur": 8, "args": { "External id": 206519, "cbid": 211, "correlation": 206519 } }, { "ph": "s", "id": 206519, "pid": 76337, "tid": -914061504, "ts": 1716454224708026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708111, "dur": 2, "args": { "External id": 206535, "cbid": 251, "correlation": 206535 } }, { "ph": "f", "id": 206535, "pid": 76337, "tid": -914061504, "ts": 1716454224708111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708116, "dur": 0, "args": { "External id": 206537, "cbid": 251, "correlation": 206537 } }, { "ph": "f", "id": 206537, "pid": 76337, "tid": -914061504, "ts": 1716454224708116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224729916, "dur": 360, "args": { "External id": 206538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206538, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 206538, "pid": 5, "tid": 7, "ts": 1716454224729916, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708120, "dur": 14, "args": { "External id": 206538, "cbid": 211, "correlation": 206538 } }, { "ph": "s", "id": 206538, "pid": 76337, "tid": -914061504, "ts": 1716454224708120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224730277, "dur": 49, "args": { "External id": 206546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206546, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206546, "pid": 5, "tid": 7, "ts": 1716454224730277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708164, "dur": 10, "args": { "External id": 206546, "cbid": 211, "correlation": 206546 } }, { "ph": "s", "id": 206546, "pid": 76337, "tid": -914061504, "ts": 1716454224708164, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224730328, "dur": 158, "args": { "External id": 206557, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206557, "pid": 5, "tid": 7, "ts": 1716454224730328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708235, "dur": 13, "args": { "External id": 206557, "cbid": 211, "correlation": 206557 } }, { "ph": "s", "id": 206557, "pid": 76337, "tid": -914061504, "ts": 1716454224708235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224708301, "dur": 0, "args": { "External id": 206569, "cbid": 317, "correlation": 206569 } }, { "ph": "f", "id": 206569, "pid": 76337, "tid": -914061504, "ts": 1716454224708301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224708302, "dur": 0, "args": { "External id": 206570, "cbid": 203, "correlation": 206570 } }, { "ph": "f", "id": 206570, "pid": 76337, "tid": -914061504, "ts": 1716454224708302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224708303, "dur": 0, "args": { "External id": 206571, "cbid": 205, "correlation": 206571 } }, { "ph": "f", "id": 206571, "pid": 76337, "tid": -914061504, "ts": 1716454224708303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708327, "dur": 1, "args": { "External id": 206575, "cbid": 251, "correlation": 206575 } }, { "ph": "f", "id": 206575, "pid": 76337, "tid": -914061504, "ts": 1716454224708327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708329, "dur": 0, "args": { "External id": 206576, "cbid": 251, "correlation": 206576 } }, { "ph": "f", "id": 206576, "pid": 76337, "tid": -914061504, "ts": 1716454224708329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708330, "dur": 0, "args": { "External id": 206577, "cbid": 251, "correlation": 206577 } }, { "ph": "f", "id": 206577, "pid": 76337, "tid": -914061504, "ts": 1716454224708330, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708331, "dur": 0, "args": { "External id": 206578, "cbid": 251, "correlation": 206578 } }, { "ph": "f", "id": 206578, "pid": 76337, "tid": -914061504, "ts": 1716454224708331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708332, "dur": 0, "args": { "External id": 206579, "cbid": 251, "correlation": 206579 } }, { "ph": "f", "id": 206579, "pid": 76337, "tid": -914061504, "ts": 1716454224708332, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708333, "dur": 0, "args": { "External id": 206580, "cbid": 251, "correlation": 206580 } }, { "ph": "f", "id": 206580, "pid": 76337, "tid": -914061504, "ts": 1716454224708333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708334, "dur": 0, "args": { "External id": 206581, "cbid": 251, "correlation": 206581 } }, { "ph": "f", "id": 206581, "pid": 76337, "tid": -914061504, "ts": 1716454224708334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708335, "dur": 0, "args": { "External id": 206582, "cbid": 251, "correlation": 206582 } }, { "ph": "f", "id": 206582, "pid": 76337, "tid": -914061504, "ts": 1716454224708335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708336, "dur": 0, "args": { "External id": 206583, "cbid": 251, "correlation": 206583 } }, { "ph": "f", "id": 206583, "pid": 76337, "tid": -914061504, "ts": 1716454224708336, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224730487, "dur": 117, "args": { "External id": 206584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206584, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206584, "pid": 5, "tid": 7, "ts": 1716454224730487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708339, "dur": 13, "args": { "External id": 206584, "cbid": 211, "correlation": 206584 } }, { "ph": "s", "id": 206584, "pid": 76337, "tid": -914061504, "ts": 1716454224708339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224730605, "dur": 59, "args": { "External id": 206590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206590, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206590, "pid": 5, "tid": 7, "ts": 1716454224730605, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708374, "dur": 9, "args": { "External id": 206590, "cbid": 211, "correlation": 206590 } }, { "ph": "s", "id": 206590, "pid": 76337, "tid": -914061504, "ts": 1716454224708374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224730666, "dur": 50, "args": { "External id": 206598, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206598, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206598, "pid": 5, "tid": 7, "ts": 1716454224730666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708407, "dur": 9, "args": { "External id": 206598, "cbid": 211, "correlation": 206598 } }, { "ph": "s", "id": 206598, "pid": 76337, "tid": -914061504, "ts": 1716454224708407, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224730717, "dur": 53, "args": { "External id": 206618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206618, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 206618, "pid": 5, "tid": 7, "ts": 1716454224730717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708482, "dur": 12, "args": { "External id": 206618, "cbid": 211, "correlation": 206618 } }, { "ph": "s", "id": 206618, "pid": 76337, "tid": -914061504, "ts": 1716454224708482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224730771, "dur": 5, "args": { "External id": 206630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206630, "pid": 5, "tid": 7, "ts": 1716454224730771, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708504, "dur": 6, "args": { "External id": 206630, "cbid": 211, "correlation": 206630 } }, { "ph": "s", "id": 206630, "pid": 76337, "tid": -914061504, "ts": 1716454224708504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224730777, "dur": 56, "args": { "External id": 206633, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206633, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206633, "pid": 5, "tid": 7, "ts": 1716454224730777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708522, "dur": 7, "args": { "External id": 206633, "cbid": 211, "correlation": 206633 } }, { "ph": "s", "id": 206633, "pid": 76337, "tid": -914061504, "ts": 1716454224708522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224730835, "dur": 37, "args": { "External id": 206642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206642, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206642, "pid": 5, "tid": 7, "ts": 1716454224730835, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708562, "dur": 10, "args": { "External id": 206642, "cbid": 211, "correlation": 206642 } }, { "ph": "s", "id": 206642, "pid": 76337, "tid": -914061504, "ts": 1716454224708562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224708614, "dur": 0, "args": { "External id": 206652, "cbid": 317, "correlation": 206652 } }, { "ph": "f", "id": 206652, "pid": 76337, "tid": -914061504, "ts": 1716454224708614, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224708615, "dur": 0, "args": { "External id": 206653, "cbid": 203, "correlation": 206653 } }, { "ph": "f", "id": 206653, "pid": 76337, "tid": -914061504, "ts": 1716454224708615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224708616, "dur": 0, "args": { "External id": 206654, "cbid": 205, "correlation": 206654 } }, { "ph": "f", "id": 206654, "pid": 76337, "tid": -914061504, "ts": 1716454224708616, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224730873, "dur": 40, "args": { "External id": 206658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206658, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206658, "pid": 5, "tid": 7, "ts": 1716454224730873, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708630, "dur": 12, "args": { "External id": 206658, "cbid": 211, "correlation": 206658 } }, { "ph": "s", "id": 206658, "pid": 76337, "tid": -914061504, "ts": 1716454224708630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224730915, "dur": 14, "args": { "External id": 206660, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206660, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206660, "pid": 5, "tid": 7, "ts": 1716454224730915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708644, "dur": 5, "args": { "External id": 206660, "cbid": 211, "correlation": 206660 } }, { "ph": "s", "id": 206660, "pid": 76337, "tid": -914061504, "ts": 1716454224708644, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224730931, "dur": 4, "args": { "External id": 206662, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206662, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206662, "pid": 5, "tid": 7, "ts": 1716454224730931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708654, "dur": 6, "args": { "External id": 206662, "cbid": 211, "correlation": 206662 } }, { "ph": "s", "id": 206662, "pid": 76337, "tid": -914061504, "ts": 1716454224708654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224708663, "dur": 0, "args": { "External id": 206663, "cbid": 51, "correlation": 206663 } }, { "ph": "s", "id": 206663, "pid": 76337, "tid": -914061504, "ts": 1716454224708663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224730936, "dur": 706, "args": { "External id": 206664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206664, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206664, "pid": 5, "tid": 7, "ts": 1716454224730936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708664, "dur": 5, "args": { "External id": 206664, "cbid": 211, "correlation": 206664 } }, { "ph": "s", "id": 206664, "pid": 76337, "tid": -914061504, "ts": 1716454224708664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224731643, "dur": 59, "args": { "External id": 206669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206669, "pid": 5, "tid": 7, "ts": 1716454224731643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708691, "dur": 9, "args": { "External id": 206669, "cbid": 211, "correlation": 206669 } }, { "ph": "s", "id": 206669, "pid": 76337, "tid": -914061504, "ts": 1716454224708691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224731704, "dur": 4, "args": { "External id": 206677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206677, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206677, "pid": 5, "tid": 7, "ts": 1716454224731704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708734, "dur": 10, "args": { "External id": 206677, "cbid": 211, "correlation": 206677 } }, { "ph": "s", "id": 206677, "pid": 76337, "tid": -914061504, "ts": 1716454224708734, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708800, "dur": 1, "args": { "External id": 206693, "cbid": 251, "correlation": 206693 } }, { "ph": "f", "id": 206693, "pid": 76337, "tid": -914061504, "ts": 1716454224708800, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224708805, "dur": 0, "args": { "External id": 206695, "cbid": 251, "correlation": 206695 } }, { "ph": "f", "id": 206695, "pid": 76337, "tid": -914061504, "ts": 1716454224708805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224731709, "dur": 11, "args": { "External id": 206696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206696, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 206696, "pid": 5, "tid": 7, "ts": 1716454224731709, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708807, "dur": 11, "args": { "External id": 206696, "cbid": 211, "correlation": 206696 } }, { "ph": "s", "id": 206696, "pid": 76337, "tid": -914061504, "ts": 1716454224708807, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224731721, "dur": 5, "args": { "External id": 206698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206698, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 206698, "pid": 5, "tid": 7, "ts": 1716454224731721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708821, "dur": 5, "args": { "External id": 206698, "cbid": 211, "correlation": 206698 } }, { "ph": "s", "id": 206698, "pid": 76337, "tid": -914061504, "ts": 1716454224708821, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224731728, "dur": 54, "args": { "External id": 206708, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206708, "pid": 5, "tid": 7, "ts": 1716454224731728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708878, "dur": 13, "args": { "External id": 206708, "cbid": 211, "correlation": 206708 } }, { "ph": "s", "id": 206708, "pid": 76337, "tid": -914061504, "ts": 1716454224708878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224731783, "dur": 51, "args": { "External id": 206728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206728, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 206728, "pid": 5, "tid": 7, "ts": 1716454224731783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708946, "dur": 11, "args": { "External id": 206728, "cbid": 211, "correlation": 206728 } }, { "ph": "s", "id": 206728, "pid": 76337, "tid": -914061504, "ts": 1716454224708946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224731836, "dur": 4, "args": { "External id": 206740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206740, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206740, "pid": 5, "tid": 7, "ts": 1716454224731836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708966, "dur": 6, "args": { "External id": 206740, "cbid": 211, "correlation": 206740 } }, { "ph": "s", "id": 206740, "pid": 76337, "tid": -914061504, "ts": 1716454224708966, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224731841, "dur": 55, "args": { "External id": 206743, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206743, "pid": 5, "tid": 7, "ts": 1716454224731841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224708994, "dur": 7, "args": { "External id": 206743, "cbid": 211, "correlation": 206743 } }, { "ph": "s", "id": 206743, "pid": 76337, "tid": -914061504, "ts": 1716454224708994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224731898, "dur": 37, "args": { "External id": 206752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206752, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206752, "pid": 5, "tid": 7, "ts": 1716454224731898, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709036, "dur": 10, "args": { "External id": 206752, "cbid": 211, "correlation": 206752 } }, { "ph": "s", "id": 206752, "pid": 76337, "tid": -914061504, "ts": 1716454224709036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224709099, "dur": 0, "args": { "External id": 206762, "cbid": 317, "correlation": 206762 } }, { "ph": "f", "id": 206762, "pid": 76337, "tid": -914061504, "ts": 1716454224709099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224709099, "dur": 0, "args": { "External id": 206763, "cbid": 203, "correlation": 206763 } }, { "ph": "f", "id": 206763, "pid": 76337, "tid": -914061504, "ts": 1716454224709099, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224709100, "dur": 0, "args": { "External id": 206764, "cbid": 205, "correlation": 206764 } }, { "ph": "f", "id": 206764, "pid": 76337, "tid": -914061504, "ts": 1716454224709100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224731936, "dur": 39, "args": { "External id": 206768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206768, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206768, "pid": 5, "tid": 7, "ts": 1716454224731936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709113, "dur": 12, "args": { "External id": 206768, "cbid": 211, "correlation": 206768 } }, { "ph": "s", "id": 206768, "pid": 76337, "tid": -914061504, "ts": 1716454224709113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224731977, "dur": 14, "args": { "External id": 206770, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206770, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206770, "pid": 5, "tid": 7, "ts": 1716454224731977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709129, "dur": 6, "args": { "External id": 206770, "cbid": 211, "correlation": 206770 } }, { "ph": "s", "id": 206770, "pid": 76337, "tid": -914061504, "ts": 1716454224709129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224731992, "dur": 3, "args": { "External id": 206772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206772, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206772, "pid": 5, "tid": 7, "ts": 1716454224731992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709138, "dur": 5, "args": { "External id": 206772, "cbid": 211, "correlation": 206772 } }, { "ph": "s", "id": 206772, "pid": 76337, "tid": -914061504, "ts": 1716454224709138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224709146, "dur": 0, "args": { "External id": 206773, "cbid": 51, "correlation": 206773 } }, { "ph": "s", "id": 206773, "pid": 76337, "tid": -914061504, "ts": 1716454224709146, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224731997, "dur": 700, "args": { "External id": 206774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206774, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206774, "pid": 5, "tid": 7, "ts": 1716454224731997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709147, "dur": 5, "args": { "External id": 206774, "cbid": 211, "correlation": 206774 } }, { "ph": "s", "id": 206774, "pid": 76337, "tid": -914061504, "ts": 1716454224709147, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224732698, "dur": 59, "args": { "External id": 206779, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206779, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206779, "pid": 5, "tid": 7, "ts": 1716454224732698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709174, "dur": 9, "args": { "External id": 206779, "cbid": 211, "correlation": 206779 } }, { "ph": "s", "id": 206779, "pid": 76337, "tid": -914061504, "ts": 1716454224709174, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224732759, "dur": 50, "args": { "External id": 206787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206787, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206787, "pid": 5, "tid": 7, "ts": 1716454224732759, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709207, "dur": 8, "args": { "External id": 206787, "cbid": 211, "correlation": 206787 } }, { "ph": "s", "id": 206787, "pid": 76337, "tid": -914061504, "ts": 1716454224709207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224732810, "dur": 36, "args": { "External id": 206795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206795, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206795, "pid": 5, "tid": 7, "ts": 1716454224732810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709236, "dur": 8, "args": { "External id": 206795, "cbid": 211, "correlation": 206795 } }, { "ph": "s", "id": 206795, "pid": 76337, "tid": -914061504, "ts": 1716454224709236, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224732847, "dur": 52, "args": { "External id": 206815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206815, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 206815, "pid": 5, "tid": 7, "ts": 1716454224732847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709314, "dur": 13, "args": { "External id": 206815, "cbid": 211, "correlation": 206815 } }, { "ph": "s", "id": 206815, "pid": 76337, "tid": -914061504, "ts": 1716454224709314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224732900, "dur": 4, "args": { "External id": 206827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206827, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 206827, "pid": 5, "tid": 7, "ts": 1716454224732900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709336, "dur": 6, "args": { "External id": 206827, "cbid": 211, "correlation": 206827 } }, { "ph": "s", "id": 206827, "pid": 76337, "tid": -914061504, "ts": 1716454224709336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224732905, "dur": 55, "args": { "External id": 206830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206830, "pid": 5, "tid": 7, "ts": 1716454224732905, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709354, "dur": 6, "args": { "External id": 206830, "cbid": 211, "correlation": 206830 } }, { "ph": "s", "id": 206830, "pid": 76337, "tid": -914061504, "ts": 1716454224709354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224709410, "dur": 0, "args": { "External id": 206841, "cbid": 317, "correlation": 206841 } }, { "ph": "f", "id": 206841, "pid": 76337, "tid": -914061504, "ts": 1716454224709410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224709411, "dur": 0, "args": { "External id": 206842, "cbid": 203, "correlation": 206842 } }, { "ph": "f", "id": 206842, "pid": 76337, "tid": -914061504, "ts": 1716454224709411, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224709412, "dur": 0, "args": { "External id": 206843, "cbid": 205, "correlation": 206843 } }, { "ph": "f", "id": 206843, "pid": 76337, "tid": -914061504, "ts": 1716454224709412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709434, "dur": 1, "args": { "External id": 206847, "cbid": 251, "correlation": 206847 } }, { "ph": "f", "id": 206847, "pid": 76337, "tid": -914061504, "ts": 1716454224709434, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709436, "dur": 0, "args": { "External id": 206848, "cbid": 251, "correlation": 206848 } }, { "ph": "f", "id": 206848, "pid": 76337, "tid": -914061504, "ts": 1716454224709436, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709437, "dur": 0, "args": { "External id": 206849, "cbid": 251, "correlation": 206849 } }, { "ph": "f", "id": 206849, "pid": 76337, "tid": -914061504, "ts": 1716454224709437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709438, "dur": 0, "args": { "External id": 206850, "cbid": 251, "correlation": 206850 } }, { "ph": "f", "id": 206850, "pid": 76337, "tid": -914061504, "ts": 1716454224709438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709438, "dur": 0, "args": { "External id": 206851, "cbid": 251, "correlation": 206851 } }, { "ph": "f", "id": 206851, "pid": 76337, "tid": -914061504, "ts": 1716454224709438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709439, "dur": 0, "args": { "External id": 206852, "cbid": 251, "correlation": 206852 } }, { "ph": "f", "id": 206852, "pid": 76337, "tid": -914061504, "ts": 1716454224709439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709440, "dur": 0, "args": { "External id": 206853, "cbid": 251, "correlation": 206853 } }, { "ph": "f", "id": 206853, "pid": 76337, "tid": -914061504, "ts": 1716454224709440, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709441, "dur": 0, "args": { "External id": 206854, "cbid": 251, "correlation": 206854 } }, { "ph": "f", "id": 206854, "pid": 76337, "tid": -914061504, "ts": 1716454224709441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709442, "dur": 0, "args": { "External id": 206855, "cbid": 251, "correlation": 206855 } }, { "ph": "f", "id": 206855, "pid": 76337, "tid": -914061504, "ts": 1716454224709442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224732962, "dur": 112, "args": { "External id": 206856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206856, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206856, "pid": 5, "tid": 7, "ts": 1716454224732962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709444, "dur": 12, "args": { "External id": 206856, "cbid": 211, "correlation": 206856 } }, { "ph": "s", "id": 206856, "pid": 76337, "tid": -914061504, "ts": 1716454224709444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224733075, "dur": 59, "args": { "External id": 206862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206862, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206862, "pid": 5, "tid": 7, "ts": 1716454224733075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709480, "dur": 8, "args": { "External id": 206862, "cbid": 211, "correlation": 206862 } }, { "ph": "s", "id": 206862, "pid": 76337, "tid": -914061504, "ts": 1716454224709480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224733136, "dur": 567, "args": { "External id": 206871, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206871, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206871, "pid": 5, "tid": 7, "ts": 1716454224733136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709561, "dur": 15, "args": { "External id": 206871, "cbid": 211, "correlation": 206871 } }, { "ph": "s", "id": 206871, "pid": 76337, "tid": -914061504, "ts": 1716454224709561, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224733704, "dur": 182, "args": { "External id": 206893, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206893, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206893, "pid": 5, "tid": 7, "ts": 1716454224733704, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709617, "dur": 11, "args": { "External id": 206893, "cbid": 211, "correlation": 206893 } }, { "ph": "s", "id": 206893, "pid": 76337, "tid": -914061504, "ts": 1716454224709617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709703, "dur": 1, "args": { "External id": 206904, "cbid": 251, "correlation": 206904 } }, { "ph": "f", "id": 206904, "pid": 76337, "tid": -914061504, "ts": 1716454224709703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224733887, "dur": 197, "args": { "External id": 206905, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206905, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206905, "pid": 5, "tid": 7, "ts": 1716454224733887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709708, "dur": 14, "args": { "External id": 206905, "cbid": 211, "correlation": 206905 } }, { "ph": "s", "id": 206905, "pid": 76337, "tid": -914061504, "ts": 1716454224709708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709777, "dur": 1, "args": { "External id": 206916, "cbid": 251, "correlation": 206916 } }, { "ph": "f", "id": 206916, "pid": 76337, "tid": -914061504, "ts": 1716454224709777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224734085, "dur": 188, "args": { "External id": 206917, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206917, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206917, "pid": 5, "tid": 7, "ts": 1716454224734085, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709781, "dur": 11, "args": { "External id": 206917, "cbid": 211, "correlation": 206917 } }, { "ph": "s", "id": 206917, "pid": 76337, "tid": -914061504, "ts": 1716454224709781, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224709843, "dur": 1, "args": { "External id": 206928, "cbid": 251, "correlation": 206928 } }, { "ph": "f", "id": 206928, "pid": 76337, "tid": -914061504, "ts": 1716454224709843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224734274, "dur": 186, "args": { "External id": 206929, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206929, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206929, "pid": 5, "tid": 7, "ts": 1716454224734274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709847, "dur": 11, "args": { "External id": 206929, "cbid": 211, "correlation": 206929 } }, { "ph": "s", "id": 206929, "pid": 76337, "tid": -914061504, "ts": 1716454224709847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224734462, "dur": 18799, "args": { "External id": 206950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206950, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 206950, "pid": 5, "tid": 7, "ts": 1716454224734462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224709928, "dur": 12, "args": { "External id": 206950, "cbid": 211, "correlation": 206950 } }, { "ph": "s", "id": 206950, "pid": 76337, "tid": -914061504, "ts": 1716454224709928, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710034, "dur": 1, "args": { "External id": 206968, "cbid": 251, "correlation": 206968 } }, { "ph": "f", "id": 206968, "pid": 76337, "tid": -914061504, "ts": 1716454224710034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224753262, "dur": 205, "args": { "External id": 206970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206970, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 206970, "pid": 5, "tid": 7, "ts": 1716454224753262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710040, "dur": 14, "args": { "External id": 206970, "cbid": 211, "correlation": 206970 } }, { "ph": "s", "id": 206970, "pid": 76337, "tid": -914061504, "ts": 1716454224710040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224753469, "dur": 66, "args": { "External id": 206978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206978, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206978, "pid": 5, "tid": 7, "ts": 1716454224753469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710112, "dur": 13, "args": { "External id": 206978, "cbid": 211, "correlation": 206978 } }, { "ph": "s", "id": 206978, "pid": 76337, "tid": -914061504, "ts": 1716454224710112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224753537, "dur": 96, "args": { "External id": 206986, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206986, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206986, "pid": 5, "tid": 7, "ts": 1716454224753537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710152, "dur": 9, "args": { "External id": 206986, "cbid": 211, "correlation": 206986 } }, { "ph": "s", "id": 206986, "pid": 76337, "tid": -914061504, "ts": 1716454224710152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224753634, "dur": 55, "args": { "External id": 206997, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 206997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 206997, "pid": 5, "tid": 7, "ts": 1716454224753634, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710226, "dur": 12, "args": { "External id": 206997, "cbid": 211, "correlation": 206997 } }, { "ph": "s", "id": 206997, "pid": 76337, "tid": -914061504, "ts": 1716454224710226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224753690, "dur": 93, "args": { "External id": 207019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207019, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207019, "pid": 5, "tid": 7, "ts": 1716454224753690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710257, "dur": 8, "args": { "External id": 207019, "cbid": 211, "correlation": 207019 } }, { "ph": "s", "id": 207019, "pid": 76337, "tid": -914061504, "ts": 1716454224710257, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710343, "dur": 1, "args": { "External id": 207030, "cbid": 251, "correlation": 207030 } }, { "ph": "f", "id": 207030, "pid": 76337, "tid": -914061504, "ts": 1716454224710343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224753785, "dur": 103, "args": { "External id": 207031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207031, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207031, "pid": 5, "tid": 7, "ts": 1716454224753785, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710348, "dur": 13, "args": { "External id": 207031, "cbid": 211, "correlation": 207031 } }, { "ph": "s", "id": 207031, "pid": 76337, "tid": -914061504, "ts": 1716454224710348, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710420, "dur": 1, "args": { "External id": 207042, "cbid": 251, "correlation": 207042 } }, { "ph": "f", "id": 207042, "pid": 76337, "tid": -914061504, "ts": 1716454224710420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710423, "dur": 0, "args": { "External id": 207043, "cbid": 251, "correlation": 207043 } }, { "ph": "f", "id": 207043, "pid": 76337, "tid": -914061504, "ts": 1716454224710423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224753889, "dur": 10, "args": { "External id": 207044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207044, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 207044, "pid": 5, "tid": 7, "ts": 1716454224753889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710425, "dur": 12, "args": { "External id": 207044, "cbid": 211, "correlation": 207044 } }, { "ph": "s", "id": 207044, "pid": 76337, "tid": -914061504, "ts": 1716454224710425, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224753901, "dur": 5, "args": { "External id": 207046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207046, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 207046, "pid": 5, "tid": 7, "ts": 1716454224753901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710439, "dur": 6, "args": { "External id": 207046, "cbid": 211, "correlation": 207046 } }, { "ph": "s", "id": 207046, "pid": 76337, "tid": -914061504, "ts": 1716454224710439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710500, "dur": 1, "args": { "External id": 207057, "cbid": 251, "correlation": 207057 } }, { "ph": "f", "id": 207057, "pid": 76337, "tid": -914061504, "ts": 1716454224710500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710503, "dur": 0, "args": { "External id": 207058, "cbid": 251, "correlation": 207058 } }, { "ph": "f", "id": 207058, "pid": 76337, "tid": -914061504, "ts": 1716454224710503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224753907, "dur": 6, "args": { "External id": 207059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207059, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 207059, "pid": 5, "tid": 7, "ts": 1716454224753907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710504, "dur": 13, "args": { "External id": 207059, "cbid": 211, "correlation": 207059 } }, { "ph": "s", "id": 207059, "pid": 76337, "tid": -914061504, "ts": 1716454224710504, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224753915, "dur": 3, "args": { "External id": 207061, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207061, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 207061, "pid": 5, "tid": 7, "ts": 1716454224753915, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710519, "dur": 5, "args": { "External id": 207061, "cbid": 211, "correlation": 207061 } }, { "ph": "s", "id": 207061, "pid": 76337, "tid": -914061504, "ts": 1716454224710519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224753920, "dur": 156, "args": { "External id": 207082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207082, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207082, "pid": 5, "tid": 7, "ts": 1716454224753920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710592, "dur": 12, "args": { "External id": 207082, "cbid": 211, "correlation": 207082 } }, { "ph": "s", "id": 207082, "pid": 76337, "tid": -914061504, "ts": 1716454224710592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710689, "dur": 1, "args": { "External id": 207100, "cbid": 251, "correlation": 207100 } }, { "ph": "f", "id": 207100, "pid": 76337, "tid": -914061504, "ts": 1716454224710689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224754077, "dur": 107, "args": { "External id": 207102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207102, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207102, "pid": 5, "tid": 7, "ts": 1716454224754077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710695, "dur": 14, "args": { "External id": 207102, "cbid": 211, "correlation": 207102 } }, { "ph": "s", "id": 207102, "pid": 76337, "tid": -914061504, "ts": 1716454224710695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224754186, "dur": 35, "args": { "External id": 207110, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207110, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207110, "pid": 5, "tid": 7, "ts": 1716454224754186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710764, "dur": 11, "args": { "External id": 207110, "cbid": 211, "correlation": 207110 } }, { "ph": "s", "id": 207110, "pid": 76337, "tid": -914061504, "ts": 1716454224710764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224754222, "dur": 66, "args": { "External id": 207118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207118, "pid": 5, "tid": 7, "ts": 1716454224754222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710805, "dur": 10, "args": { "External id": 207118, "cbid": 211, "correlation": 207118 } }, { "ph": "s", "id": 207118, "pid": 76337, "tid": -914061504, "ts": 1716454224710805, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224754290, "dur": 93, "args": { "External id": 207140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207140, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207140, "pid": 5, "tid": 7, "ts": 1716454224754290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710857, "dur": 11, "args": { "External id": 207140, "cbid": 211, "correlation": 207140 } }, { "ph": "s", "id": 207140, "pid": 76337, "tid": -914061504, "ts": 1716454224710857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224710942, "dur": 1, "args": { "External id": 207156, "cbid": 251, "correlation": 207156 } }, { "ph": "f", "id": 207156, "pid": 76337, "tid": -914061504, "ts": 1716454224710942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224754384, "dur": 579, "args": { "External id": 207158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207158, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207158, "pid": 5, "tid": 7, "ts": 1716454224754384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224710948, "dur": 13, "args": { "External id": 207158, "cbid": 211, "correlation": 207158 } }, { "ph": "s", "id": 207158, "pid": 76337, "tid": -914061504, "ts": 1716454224710948, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224754964, "dur": 244, "args": { "External id": 207166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207166, "pid": 5, "tid": 7, "ts": 1716454224754964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711023, "dur": 13, "args": { "External id": 207166, "cbid": 211, "correlation": 207166 } }, { "ph": "s", "id": 207166, "pid": 76337, "tid": -914061504, "ts": 1716454224711023, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224755209, "dur": 253, "args": { "External id": 207174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207174, "pid": 5, "tid": 7, "ts": 1716454224755209, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711054, "dur": 8, "args": { "External id": 207174, "cbid": 211, "correlation": 207174 } }, { "ph": "s", "id": 207174, "pid": 76337, "tid": -914061504, "ts": 1716454224711054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711136, "dur": 1, "args": { "External id": 207190, "cbid": 251, "correlation": 207190 } }, { "ph": "f", "id": 207190, "pid": 76337, "tid": -914061504, "ts": 1716454224711136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711140, "dur": 0, "args": { "External id": 207192, "cbid": 251, "correlation": 207192 } }, { "ph": "f", "id": 207192, "pid": 76337, "tid": -914061504, "ts": 1716454224711140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224755464, "dur": 363, "args": { "External id": 207193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207193, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 207193, "pid": 5, "tid": 7, "ts": 1716454224755464, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711143, "dur": 14, "args": { "External id": 207193, "cbid": 211, "correlation": 207193 } }, { "ph": "s", "id": 207193, "pid": 76337, "tid": -914061504, "ts": 1716454224711143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224755828, "dur": 50, "args": { "External id": 207201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207201, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207201, "pid": 5, "tid": 7, "ts": 1716454224755828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711185, "dur": 10, "args": { "External id": 207201, "cbid": 211, "correlation": 207201 } }, { "ph": "s", "id": 207201, "pid": 76337, "tid": -914061504, "ts": 1716454224711185, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224755879, "dur": 160, "args": { "External id": 207212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207212, "pid": 5, "tid": 7, "ts": 1716454224755879, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711253, "dur": 12, "args": { "External id": 207212, "cbid": 211, "correlation": 207212 } }, { "ph": "s", "id": 207212, "pid": 76337, "tid": -914061504, "ts": 1716454224711253, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224711318, "dur": 0, "args": { "External id": 207224, "cbid": 317, "correlation": 207224 } }, { "ph": "f", "id": 207224, "pid": 76337, "tid": -914061504, "ts": 1716454224711318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224711318, "dur": 0, "args": { "External id": 207225, "cbid": 203, "correlation": 207225 } }, { "ph": "f", "id": 207225, "pid": 76337, "tid": -914061504, "ts": 1716454224711318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224711319, "dur": 0, "args": { "External id": 207226, "cbid": 205, "correlation": 207226 } }, { "ph": "f", "id": 207226, "pid": 76337, "tid": -914061504, "ts": 1716454224711319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711341, "dur": 1, "args": { "External id": 207230, "cbid": 251, "correlation": 207230 } }, { "ph": "f", "id": 207230, "pid": 76337, "tid": -914061504, "ts": 1716454224711341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711343, "dur": 0, "args": { "External id": 207231, "cbid": 251, "correlation": 207231 } }, { "ph": "f", "id": 207231, "pid": 76337, "tid": -914061504, "ts": 1716454224711343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711344, "dur": 0, "args": { "External id": 207232, "cbid": 251, "correlation": 207232 } }, { "ph": "f", "id": 207232, "pid": 76337, "tid": -914061504, "ts": 1716454224711344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711345, "dur": 0, "args": { "External id": 207233, "cbid": 251, "correlation": 207233 } }, { "ph": "f", "id": 207233, "pid": 76337, "tid": -914061504, "ts": 1716454224711345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711346, "dur": 0, "args": { "External id": 207234, "cbid": 251, "correlation": 207234 } }, { "ph": "f", "id": 207234, "pid": 76337, "tid": -914061504, "ts": 1716454224711346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711347, "dur": 0, "args": { "External id": 207235, "cbid": 251, "correlation": 207235 } }, { "ph": "f", "id": 207235, "pid": 76337, "tid": -914061504, "ts": 1716454224711347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711348, "dur": 0, "args": { "External id": 207236, "cbid": 251, "correlation": 207236 } }, { "ph": "f", "id": 207236, "pid": 76337, "tid": -914061504, "ts": 1716454224711348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711349, "dur": 0, "args": { "External id": 207237, "cbid": 251, "correlation": 207237 } }, { "ph": "f", "id": 207237, "pid": 76337, "tid": -914061504, "ts": 1716454224711349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224711350, "dur": 0, "args": { "External id": 207238, "cbid": 251, "correlation": 207238 } }, { "ph": "f", "id": 207238, "pid": 76337, "tid": -914061504, "ts": 1716454224711350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224756040, "dur": 116, "args": { "External id": 207239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207239, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207239, "pid": 5, "tid": 7, "ts": 1716454224756040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711353, "dur": 12, "args": { "External id": 207239, "cbid": 211, "correlation": 207239 } }, { "ph": "s", "id": 207239, "pid": 76337, "tid": -914061504, "ts": 1716454224711353, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224756157, "dur": 60, "args": { "External id": 207245, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207245, "pid": 5, "tid": 7, "ts": 1716454224756157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711387, "dur": 9, "args": { "External id": 207245, "cbid": 211, "correlation": 207245 } }, { "ph": "s", "id": 207245, "pid": 76337, "tid": -914061504, "ts": 1716454224711387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224756218, "dur": 50, "args": { "External id": 207253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207253, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207253, "pid": 5, "tid": 7, "ts": 1716454224756218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711418, "dur": 8, "args": { "External id": 207253, "cbid": 211, "correlation": 207253 } }, { "ph": "s", "id": 207253, "pid": 76337, "tid": -914061504, "ts": 1716454224711418, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224711491, "dur": 0, "args": { "External id": 207263, "cbid": 317, "correlation": 207263 } }, { "ph": "f", "id": 207263, "pid": 76337, "tid": -914061504, "ts": 1716454224711491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224711492, "dur": 0, "args": { "External id": 207264, "cbid": 203, "correlation": 207264 } }, { "ph": "f", "id": 207264, "pid": 76337, "tid": -914061504, "ts": 1716454224711492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224711493, "dur": 0, "args": { "External id": 207265, "cbid": 205, "correlation": 207265 } }, { "ph": "f", "id": 207265, "pid": 76337, "tid": -914061504, "ts": 1716454224711493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224756269, "dur": 42, "args": { "External id": 207269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207269, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207269, "pid": 5, "tid": 7, "ts": 1716454224756269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711510, "dur": 12, "args": { "External id": 207269, "cbid": 211, "correlation": 207269 } }, { "ph": "s", "id": 207269, "pid": 76337, "tid": -914061504, "ts": 1716454224711510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224756313, "dur": 14, "args": { "External id": 207271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207271, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207271, "pid": 5, "tid": 7, "ts": 1716454224756313, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711525, "dur": 5, "args": { "External id": 207271, "cbid": 211, "correlation": 207271 } }, { "ph": "s", "id": 207271, "pid": 76337, "tid": -914061504, "ts": 1716454224711525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224756329, "dur": 1, "args": { "External id": 207273, "device": 5, "context": 1, "stream": 7, "correlation": 207273, "bytes": 1536, "memory bandwidth (GB/s)": 0.9056603773584906 } }, { "ph": "f", "id": 207273, "pid": 5, "tid": 7, "ts": 1716454224756329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224711544, "dur": 17, "args": { "External id": 207273, "cbid": 51, "correlation": 207273 } }, { "ph": "s", "id": 207273, "pid": 76337, "tid": -914061504, "ts": 1716454224711544, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224756333, "dur": 363, "args": { "External id": 207274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207274, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 7.2, "warps per SM": 57.6, "grid": [2, 96, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207274, "pid": 5, "tid": 7, "ts": 1716454224756333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711563, "dur": 9, "args": { "External id": 207274, "cbid": 211, "correlation": 207274 } }, { "ph": "s", "id": 207274, "pid": 76337, "tid": -914061504, "ts": 1716454224711563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224756698, "dur": 12, "args": { "External id": 207276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207276, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207276, "pid": 5, "tid": 7, "ts": 1716454224756698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711581, "dur": 8, "args": { "External id": 207276, "cbid": 211, "correlation": 207276 } }, { "ph": "s", "id": 207276, "pid": 76337, "tid": -914061504, "ts": 1716454224711581, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224756712, "dur": 15, "args": { "External id": 207282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207282, "pid": 5, "tid": 7, "ts": 1716454224756712, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711612, "dur": 8, "args": { "External id": 207282, "cbid": 211, "correlation": 207282 } }, { "ph": "s", "id": 207282, "pid": 76337, "tid": -914061504, "ts": 1716454224711612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224756728, "dur": 19, "args": { "External id": 207302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207302, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 207302, "pid": 5, "tid": 7, "ts": 1716454224756728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711704, "dur": 12, "args": { "External id": 207302, "cbid": 211, "correlation": 207302 } }, { "ph": "s", "id": 207302, "pid": 76337, "tid": -914061504, "ts": 1716454224711704, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224756749, "dur": 5, "args": { "External id": 207314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207314, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 207314, "pid": 5, "tid": 7, "ts": 1716454224756749, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711726, "dur": 6, "args": { "External id": 207314, "cbid": 211, "correlation": 207314 } }, { "ph": "s", "id": 207314, "pid": 76337, "tid": -914061504, "ts": 1716454224711726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224756755, "dur": 18, "args": { "External id": 207317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207317, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207317, "pid": 5, "tid": 7, "ts": 1716454224756755, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711745, "dur": 7, "args": { "External id": 207317, "cbid": 211, "correlation": 207317 } }, { "ph": "s", "id": 207317, "pid": 76337, "tid": -914061504, "ts": 1716454224711745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224756773, "dur": 11, "args": { "External id": 207326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207326, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207326, "pid": 5, "tid": 7, "ts": 1716454224756773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711784, "dur": 10, "args": { "External id": 207326, "cbid": 211, "correlation": 207326 } }, { "ph": "s", "id": 207326, "pid": 76337, "tid": -914061504, "ts": 1716454224711784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224711841, "dur": 0, "args": { "External id": 207336, "cbid": 317, "correlation": 207336 } }, { "ph": "f", "id": 207336, "pid": 76337, "tid": -914061504, "ts": 1716454224711841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224711841, "dur": 0, "args": { "External id": 207337, "cbid": 203, "correlation": 207337 } }, { "ph": "f", "id": 207337, "pid": 76337, "tid": -914061504, "ts": 1716454224711841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224711842, "dur": 0, "args": { "External id": 207338, "cbid": 205, "correlation": 207338 } }, { "ph": "f", "id": 207338, "pid": 76337, "tid": -914061504, "ts": 1716454224711842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224756786, "dur": 11, "args": { "External id": 207342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207342, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [24, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207342, "pid": 5, "tid": 7, "ts": 1716454224756786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711857, "dur": 12, "args": { "External id": 207342, "cbid": 211, "correlation": 207342 } }, { "ph": "s", "id": 207342, "pid": 76337, "tid": -914061504, "ts": 1716454224711857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224756798, "dur": 24, "args": { "External id": 207344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 10, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207344, "pid": 5, "tid": 7, "ts": 1716454224756798, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711871, "dur": 5, "args": { "External id": 207344, "cbid": 211, "correlation": 207344 } }, { "ph": "s", "id": 207344, "pid": 76337, "tid": -914061504, "ts": 1716454224711871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224756824, "dur": 3, "args": { "External id": 207346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.05, "warps per SM": 0.4, "grid": [4, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 1 } }, { "ph": "f", "id": 207346, "pid": 5, "tid": 7, "ts": 1716454224756824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711882, "dur": 6, "args": { "External id": 207346, "cbid": 211, "correlation": 207346 } }, { "ph": "s", "id": 207346, "pid": 76337, "tid": -914061504, "ts": 1716454224711882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224711892, "dur": 0, "args": { "External id": 207347, "cbid": 51, "correlation": 207347 } }, { "ph": "s", "id": 207347, "pid": 76337, "tid": -914061504, "ts": 1716454224711892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224756829, "dur": 359, "args": { "External id": 207348, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207348, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207348, "pid": 5, "tid": 7, "ts": 1716454224756829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711893, "dur": 7, "args": { "External id": 207348, "cbid": 211, "correlation": 207348 } }, { "ph": "s", "id": 207348, "pid": 76337, "tid": -914061504, "ts": 1716454224711893, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224757190, "dur": 21, "args": { "External id": 207349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207349, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207349, "pid": 5, "tid": 7, "ts": 1716454224757190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711902, "dur": 5, "args": { "External id": 207349, "cbid": 211, "correlation": 207349 } }, { "ph": "s", "id": 207349, "pid": 76337, "tid": -914061504, "ts": 1716454224711902, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224757212, "dur": 33, "args": { "External id": 207355, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207355, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207355, "pid": 5, "tid": 7, "ts": 1716454224757212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711930, "dur": 9, "args": { "External id": 207355, "cbid": 211, "correlation": 207355 } }, { "ph": "s", "id": 207355, "pid": 76337, "tid": -914061504, "ts": 1716454224711930, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224757246, "dur": 3, "args": { "External id": 207363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207363, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 207363, "pid": 5, "tid": 7, "ts": 1716454224757246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224711983, "dur": 11, "args": { "External id": 207363, "cbid": 211, "correlation": 207363 } }, { "ph": "s", "id": 207363, "pid": 76337, "tid": -914061504, "ts": 1716454224711983, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712051, "dur": 1, "args": { "External id": 207379, "cbid": 251, "correlation": 207379 } }, { "ph": "f", "id": 207379, "pid": 76337, "tid": -914061504, "ts": 1716454224712051, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712056, "dur": 0, "args": { "External id": 207381, "cbid": 251, "correlation": 207381 } }, { "ph": "f", "id": 207381, "pid": 76337, "tid": -914061504, "ts": 1716454224712056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224757250, "dur": 12, "args": { "External id": 207382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207382, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 207382, "pid": 5, "tid": 7, "ts": 1716454224757250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712059, "dur": 12, "args": { "External id": 207382, "cbid": 211, "correlation": 207382 } }, { "ph": "s", "id": 207382, "pid": 76337, "tid": -914061504, "ts": 1716454224712059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224757264, "dur": 5, "args": { "External id": 207384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207384, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 207384, "pid": 5, "tid": 7, "ts": 1716454224757264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712072, "dur": 6, "args": { "External id": 207384, "cbid": 211, "correlation": 207384 } }, { "ph": "s", "id": 207384, "pid": 76337, "tid": -914061504, "ts": 1716454224712072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224757271, "dur": 30, "args": { "External id": 207394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207394, "pid": 5, "tid": 7, "ts": 1716454224757271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712131, "dur": 12, "args": { "External id": 207394, "cbid": 211, "correlation": 207394 } }, { "ph": "s", "id": 207394, "pid": 76337, "tid": -914061504, "ts": 1716454224712131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224757302, "dur": 30, "args": { "External id": 207414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207414, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 207414, "pid": 5, "tid": 7, "ts": 1716454224757302, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712198, "dur": 11, "args": { "External id": 207414, "cbid": 211, "correlation": 207414 } }, { "ph": "s", "id": 207414, "pid": 76337, "tid": -914061504, "ts": 1716454224712198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224757333, "dur": 4, "args": { "External id": 207426, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207426, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 207426, "pid": 5, "tid": 7, "ts": 1716454224757333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712219, "dur": 6, "args": { "External id": 207426, "cbid": 211, "correlation": 207426 } }, { "ph": "s", "id": 207426, "pid": 76337, "tid": -914061504, "ts": 1716454224712219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224757338, "dur": 30, "args": { "External id": 207429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207429, "pid": 5, "tid": 7, "ts": 1716454224757338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712238, "dur": 6, "args": { "External id": 207429, "cbid": 211, "correlation": 207429 } }, { "ph": "s", "id": 207429, "pid": 76337, "tid": -914061504, "ts": 1716454224712238, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224757369, "dur": 22, "args": { "External id": 207438, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207438, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207438, "pid": 5, "tid": 7, "ts": 1716454224757369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712279, "dur": 9, "args": { "External id": 207438, "cbid": 211, "correlation": 207438 } }, { "ph": "s", "id": 207438, "pid": 76337, "tid": -914061504, "ts": 1716454224712279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224712343, "dur": 0, "args": { "External id": 207448, "cbid": 317, "correlation": 207448 } }, { "ph": "f", "id": 207448, "pid": 76337, "tid": -914061504, "ts": 1716454224712343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224712343, "dur": 0, "args": { "External id": 207449, "cbid": 203, "correlation": 207449 } }, { "ph": "f", "id": 207449, "pid": 76337, "tid": -914061504, "ts": 1716454224712343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224712344, "dur": 0, "args": { "External id": 207450, "cbid": 205, "correlation": 207450 } }, { "ph": "f", "id": 207450, "pid": 76337, "tid": -914061504, "ts": 1716454224712344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224757392, "dur": 23, "args": { "External id": 207454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207454, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207454, "pid": 5, "tid": 7, "ts": 1716454224757392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712362, "dur": 12, "args": { "External id": 207454, "cbid": 211, "correlation": 207454 } }, { "ph": "s", "id": 207454, "pid": 76337, "tid": -914061504, "ts": 1716454224712362, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224757417, "dur": 44, "args": { "External id": 207456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207456, "pid": 5, "tid": 7, "ts": 1716454224757417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712377, "dur": 5, "args": { "External id": 207456, "cbid": 211, "correlation": 207456 } }, { "ph": "s", "id": 207456, "pid": 76337, "tid": -914061504, "ts": 1716454224712377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224757463, "dur": 659, "args": { "External id": 207458, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207458, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207458, "pid": 5, "tid": 7, "ts": 1716454224757463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712392, "dur": 9, "args": { "External id": 207458, "cbid": 211, "correlation": 207458 } }, { "ph": "s", "id": 207458, "pid": 76337, "tid": -914061504, "ts": 1716454224712392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224758123, "dur": 23, "args": { "External id": 207460, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207460, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207460, "pid": 5, "tid": 7, "ts": 1716454224758123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712405, "dur": 5, "args": { "External id": 207460, "cbid": 211, "correlation": 207460 } }, { "ph": "s", "id": 207460, "pid": 76337, "tid": -914061504, "ts": 1716454224712405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224758147, "dur": 33, "args": { "External id": 207466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207466, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207466, "pid": 5, "tid": 7, "ts": 1716454224758147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712434, "dur": 8, "args": { "External id": 207466, "cbid": 211, "correlation": 207466 } }, { "ph": "s", "id": 207466, "pid": 76337, "tid": -914061504, "ts": 1716454224712434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224712492, "dur": 0, "args": { "External id": 207476, "cbid": 317, "correlation": 207476 } }, { "ph": "f", "id": 207476, "pid": 76337, "tid": -914061504, "ts": 1716454224712492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224712493, "dur": 0, "args": { "External id": 207477, "cbid": 203, "correlation": 207477 } }, { "ph": "f", "id": 207477, "pid": 76337, "tid": -914061504, "ts": 1716454224712493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224712493, "dur": 0, "args": { "External id": 207478, "cbid": 205, "correlation": 207478 } }, { "ph": "f", "id": 207478, "pid": 76337, "tid": -914061504, "ts": 1716454224712493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712515, "dur": 1, "args": { "External id": 207482, "cbid": 251, "correlation": 207482 } }, { "ph": "f", "id": 207482, "pid": 76337, "tid": -914061504, "ts": 1716454224712515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712517, "dur": 0, "args": { "External id": 207483, "cbid": 251, "correlation": 207483 } }, { "ph": "f", "id": 207483, "pid": 76337, "tid": -914061504, "ts": 1716454224712517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712518, "dur": 0, "args": { "External id": 207484, "cbid": 251, "correlation": 207484 } }, { "ph": "f", "id": 207484, "pid": 76337, "tid": -914061504, "ts": 1716454224712518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712519, "dur": 0, "args": { "External id": 207485, "cbid": 251, "correlation": 207485 } }, { "ph": "f", "id": 207485, "pid": 76337, "tid": -914061504, "ts": 1716454224712519, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712520, "dur": 0, "args": { "External id": 207486, "cbid": 251, "correlation": 207486 } }, { "ph": "f", "id": 207486, "pid": 76337, "tid": -914061504, "ts": 1716454224712520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712521, "dur": 0, "args": { "External id": 207487, "cbid": 251, "correlation": 207487 } }, { "ph": "f", "id": 207487, "pid": 76337, "tid": -914061504, "ts": 1716454224712521, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712522, "dur": 0, "args": { "External id": 207488, "cbid": 251, "correlation": 207488 } }, { "ph": "f", "id": 207488, "pid": 76337, "tid": -914061504, "ts": 1716454224712522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712523, "dur": 0, "args": { "External id": 207489, "cbid": 251, "correlation": 207489 } }, { "ph": "f", "id": 207489, "pid": 76337, "tid": -914061504, "ts": 1716454224712523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224712524, "dur": 0, "args": { "External id": 207490, "cbid": 251, "correlation": 207490 } }, { "ph": "f", "id": 207490, "pid": 76337, "tid": -914061504, "ts": 1716454224712524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224758182, "dur": 52, "args": { "External id": 207491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207491, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 6, "warps per SM": 24, "grid": [12, 5, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207491, "pid": 5, "tid": 7, "ts": 1716454224758182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712527, "dur": 12, "args": { "External id": 207491, "cbid": 211, "correlation": 207491 } }, { "ph": "s", "id": 207491, "pid": 76337, "tid": -914061504, "ts": 1716454224712527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224758235, "dur": 32, "args": { "External id": 207497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207497, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207497, "pid": 5, "tid": 7, "ts": 1716454224758235, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712558, "dur": 8, "args": { "External id": 207497, "cbid": 211, "correlation": 207497 } }, { "ph": "s", "id": 207497, "pid": 76337, "tid": -914061504, "ts": 1716454224712558, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224758268, "dur": 27, "args": { "External id": 207505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207505, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207505, "pid": 5, "tid": 7, "ts": 1716454224758268, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712588, "dur": 8, "args": { "External id": 207505, "cbid": 211, "correlation": 207505 } }, { "ph": "s", "id": 207505, "pid": 76337, "tid": -914061504, "ts": 1716454224712588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224758296, "dur": 19, "args": { "External id": 207513, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207513, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207513, "pid": 5, "tid": 7, "ts": 1716454224758296, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712616, "dur": 9, "args": { "External id": 207513, "cbid": 211, "correlation": 207513 } }, { "ph": "s", "id": 207513, "pid": 76337, "tid": -914061504, "ts": 1716454224712616, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224758317, "dur": 29, "args": { "External id": 207533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207533, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 207533, "pid": 5, "tid": 7, "ts": 1716454224758317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712699, "dur": 12, "args": { "External id": 207533, "cbid": 211, "correlation": 207533 } }, { "ph": "s", "id": 207533, "pid": 76337, "tid": -914061504, "ts": 1716454224712699, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224758347, "dur": 4, "args": { "External id": 207545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207545, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 207545, "pid": 5, "tid": 7, "ts": 1716454224758347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712720, "dur": 7, "args": { "External id": 207545, "cbid": 211, "correlation": 207545 } }, { "ph": "s", "id": 207545, "pid": 76337, "tid": -914061504, "ts": 1716454224712720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224758353, "dur": 29, "args": { "External id": 207548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207548, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207548, "pid": 5, "tid": 7, "ts": 1716454224758353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712738, "dur": 6, "args": { "External id": 207548, "cbid": 211, "correlation": 207548 } }, { "ph": "s", "id": 207548, "pid": 76337, "tid": -914061504, "ts": 1716454224712738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224712796, "dur": 0, "args": { "External id": 207559, "cbid": 317, "correlation": 207559 } }, { "ph": "f", "id": 207559, "pid": 76337, "tid": -914061504, "ts": 1716454224712796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224712797, "dur": 0, "args": { "External id": 207560, "cbid": 203, "correlation": 207560 } }, { "ph": "f", "id": 207560, "pid": 76337, "tid": -914061504, "ts": 1716454224712797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224712797, "dur": 0, "args": { "External id": 207561, "cbid": 205, "correlation": 207561 } }, { "ph": "f", "id": 207561, "pid": 76337, "tid": -914061504, "ts": 1716454224712797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224758383, "dur": 23, "args": { "External id": 207565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207565, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207565, "pid": 5, "tid": 7, "ts": 1716454224758383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712812, "dur": 13, "args": { "External id": 207565, "cbid": 211, "correlation": 207565 } }, { "ph": "s", "id": 207565, "pid": 76337, "tid": -914061504, "ts": 1716454224712812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224758408, "dur": 121, "args": { "External id": 207567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207567, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207567, "pid": 5, "tid": 7, "ts": 1716454224758408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712834, "dur": 9, "args": { "External id": 207567, "cbid": 211, "correlation": 207567 } }, { "ph": "s", "id": 207567, "pid": 76337, "tid": -914061504, "ts": 1716454224712834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224758531, "dur": 23, "args": { "External id": 207569, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207569, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207569, "pid": 5, "tid": 7, "ts": 1716454224758531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712846, "dur": 5, "args": { "External id": 207569, "cbid": 211, "correlation": 207569 } }, { "ph": "s", "id": 207569, "pid": 76337, "tid": -914061504, "ts": 1716454224712846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224758555, "dur": 33, "args": { "External id": 207575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207575, "pid": 5, "tid": 7, "ts": 1716454224758555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712874, "dur": 8, "args": { "External id": 207575, "cbid": 211, "correlation": 207575 } }, { "ph": "s", "id": 207575, "pid": 76337, "tid": -914061504, "ts": 1716454224712874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224758589, "dur": 199, "args": { "External id": 207584, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207584, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207584, "pid": 5, "tid": 7, "ts": 1716454224758589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224712957, "dur": 15, "args": { "External id": 207584, "cbid": 211, "correlation": 207584 } }, { "ph": "s", "id": 207584, "pid": 76337, "tid": -914061504, "ts": 1716454224712957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224758789, "dur": 66, "args": { "External id": 207606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207606, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207606, "pid": 5, "tid": 7, "ts": 1716454224758789, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713025, "dur": 11, "args": { "External id": 207606, "cbid": 211, "correlation": 207606 } }, { "ph": "s", "id": 207606, "pid": 76337, "tid": -914061504, "ts": 1716454224713025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713116, "dur": 1, "args": { "External id": 207617, "cbid": 251, "correlation": 207617 } }, { "ph": "f", "id": 207617, "pid": 76337, "tid": -914061504, "ts": 1716454224713116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224758856, "dur": 157, "args": { "External id": 207618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207618, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207618, "pid": 5, "tid": 7, "ts": 1716454224758856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713121, "dur": 13, "args": { "External id": 207618, "cbid": 211, "correlation": 207618 } }, { "ph": "s", "id": 207618, "pid": 76337, "tid": -914061504, "ts": 1716454224713121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713192, "dur": 1, "args": { "External id": 207629, "cbid": 251, "correlation": 207629 } }, { "ph": "f", "id": 207629, "pid": 76337, "tid": -914061504, "ts": 1716454224713192, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224759014, "dur": 149, "args": { "External id": 207630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207630, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207630, "pid": 5, "tid": 7, "ts": 1716454224759014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713196, "dur": 11, "args": { "External id": 207630, "cbid": 211, "correlation": 207630 } }, { "ph": "s", "id": 207630, "pid": 76337, "tid": -914061504, "ts": 1716454224713196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713261, "dur": 1, "args": { "External id": 207641, "cbid": 251, "correlation": 207641 } }, { "ph": "f", "id": 207641, "pid": 76337, "tid": -914061504, "ts": 1716454224713261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224759164, "dur": 147, "args": { "External id": 207642, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207642, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207642, "pid": 5, "tid": 7, "ts": 1716454224759164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713265, "dur": 12, "args": { "External id": 207642, "cbid": 211, "correlation": 207642 } }, { "ph": "s", "id": 207642, "pid": 76337, "tid": -914061504, "ts": 1716454224713265, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224759312, "dur": 1965, "args": { "External id": 207663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207663, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207663, "pid": 5, "tid": 7, "ts": 1716454224759312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713350, "dur": 13, "args": { "External id": 207663, "cbid": 211, "correlation": 207663 } }, { "ph": "s", "id": 207663, "pid": 76337, "tid": -914061504, "ts": 1716454224713350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713450, "dur": 1, "args": { "External id": 207681, "cbid": 251, "correlation": 207681 } }, { "ph": "f", "id": 207681, "pid": 76337, "tid": -914061504, "ts": 1716454224713450, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224761278, "dur": 147, "args": { "External id": 207683, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207683, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 207683, "pid": 5, "tid": 7, "ts": 1716454224761278, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713456, "dur": 13, "args": { "External id": 207683, "cbid": 211, "correlation": 207683 } }, { "ph": "s", "id": 207683, "pid": 76337, "tid": -914061504, "ts": 1716454224713456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224761427, "dur": 35, "args": { "External id": 207691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207691, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207691, "pid": 5, "tid": 7, "ts": 1716454224761427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713527, "dur": 13, "args": { "External id": 207691, "cbid": 211, "correlation": 207691 } }, { "ph": "s", "id": 207691, "pid": 76337, "tid": -914061504, "ts": 1716454224713527, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224761463, "dur": 50, "args": { "External id": 207699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207699, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207699, "pid": 5, "tid": 7, "ts": 1716454224761463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713566, "dur": 10, "args": { "External id": 207699, "cbid": 211, "correlation": 207699 } }, { "ph": "s", "id": 207699, "pid": 76337, "tid": -914061504, "ts": 1716454224713566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224761515, "dur": 31, "args": { "External id": 207710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207710, "pid": 5, "tid": 7, "ts": 1716454224761515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713638, "dur": 13, "args": { "External id": 207710, "cbid": 211, "correlation": 207710 } }, { "ph": "s", "id": 207710, "pid": 76337, "tid": -914061504, "ts": 1716454224713638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224761547, "dur": 35, "args": { "External id": 207732, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207732, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207732, "pid": 5, "tid": 7, "ts": 1716454224761547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713669, "dur": 9, "args": { "External id": 207732, "cbid": 211, "correlation": 207732 } }, { "ph": "s", "id": 207732, "pid": 76337, "tid": -914061504, "ts": 1716454224713669, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713754, "dur": 1, "args": { "External id": 207743, "cbid": 251, "correlation": 207743 } }, { "ph": "f", "id": 207743, "pid": 76337, "tid": -914061504, "ts": 1716454224713754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224761583, "dur": 91, "args": { "External id": 207744, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207744, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207744, "pid": 5, "tid": 7, "ts": 1716454224761583, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713759, "dur": 13, "args": { "External id": 207744, "cbid": 211, "correlation": 207744 } }, { "ph": "s", "id": 207744, "pid": 76337, "tid": -914061504, "ts": 1716454224713759, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713828, "dur": 1, "args": { "External id": 207755, "cbid": 251, "correlation": 207755 } }, { "ph": "f", "id": 207755, "pid": 76337, "tid": -914061504, "ts": 1716454224713828, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713831, "dur": 0, "args": { "External id": 207756, "cbid": 251, "correlation": 207756 } }, { "ph": "f", "id": 207756, "pid": 76337, "tid": -914061504, "ts": 1716454224713831, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224761676, "dur": 11, "args": { "External id": 207757, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207757, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 207757, "pid": 5, "tid": 7, "ts": 1716454224761676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713833, "dur": 12, "args": { "External id": 207757, "cbid": 211, "correlation": 207757 } }, { "ph": "s", "id": 207757, "pid": 76337, "tid": -914061504, "ts": 1716454224713833, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224761688, "dur": 5, "args": { "External id": 207759, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207759, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 207759, "pid": 5, "tid": 7, "ts": 1716454224761688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713847, "dur": 7, "args": { "External id": 207759, "cbid": 211, "correlation": 207759 } }, { "ph": "s", "id": 207759, "pid": 76337, "tid": -914061504, "ts": 1716454224713847, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713906, "dur": 1, "args": { "External id": 207770, "cbid": 251, "correlation": 207770 } }, { "ph": "f", "id": 207770, "pid": 76337, "tid": -914061504, "ts": 1716454224713906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224713910, "dur": 0, "args": { "External id": 207771, "cbid": 251, "correlation": 207771 } }, { "ph": "f", "id": 207771, "pid": 76337, "tid": -914061504, "ts": 1716454224713910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224761694, "dur": 7, "args": { "External id": 207772, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207772, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 207772, "pid": 5, "tid": 7, "ts": 1716454224761694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713911, "dur": 12, "args": { "External id": 207772, "cbid": 211, "correlation": 207772 } }, { "ph": "s", "id": 207772, "pid": 76337, "tid": -914061504, "ts": 1716454224713911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224761703, "dur": 4, "args": { "External id": 207774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207774, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 207774, "pid": 5, "tid": 7, "ts": 1716454224761703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224713925, "dur": 6, "args": { "External id": 207774, "cbid": 211, "correlation": 207774 } }, { "ph": "s", "id": 207774, "pid": 76337, "tid": -914061504, "ts": 1716454224713925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224761708, "dur": 93, "args": { "External id": 207795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207795, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 207795, "pid": 5, "tid": 7, "ts": 1716454224761708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714007, "dur": 13, "args": { "External id": 207795, "cbid": 211, "correlation": 207795 } }, { "ph": "s", "id": 207795, "pid": 76337, "tid": -914061504, "ts": 1716454224714007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224714106, "dur": 1, "args": { "External id": 207813, "cbid": 251, "correlation": 207813 } }, { "ph": "f", "id": 207813, "pid": 76337, "tid": -914061504, "ts": 1716454224714106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224761802, "dur": 100, "args": { "External id": 207815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207815, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207815, "pid": 5, "tid": 7, "ts": 1716454224761802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714112, "dur": 14, "args": { "External id": 207815, "cbid": 211, "correlation": 207815 } }, { "ph": "s", "id": 207815, "pid": 76337, "tid": -914061504, "ts": 1716454224714112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224761904, "dur": 20, "args": { "External id": 207823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207823, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207823, "pid": 5, "tid": 7, "ts": 1716454224761904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714182, "dur": 12, "args": { "External id": 207823, "cbid": 211, "correlation": 207823 } }, { "ph": "s", "id": 207823, "pid": 76337, "tid": -914061504, "ts": 1716454224714182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224761925, "dur": 37, "args": { "External id": 207831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207831, "pid": 5, "tid": 7, "ts": 1716454224761925, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714223, "dur": 9, "args": { "External id": 207831, "cbid": 211, "correlation": 207831 } }, { "ph": "s", "id": 207831, "pid": 76337, "tid": -914061504, "ts": 1716454224714223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224761963, "dur": 35, "args": { "External id": 207853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207853, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207853, "pid": 5, "tid": 7, "ts": 1716454224761963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714275, "dur": 11, "args": { "External id": 207853, "cbid": 211, "correlation": 207853 } }, { "ph": "s", "id": 207853, "pid": 76337, "tid": -914061504, "ts": 1716454224714275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224714364, "dur": 1, "args": { "External id": 207869, "cbid": 251, "correlation": 207869 } }, { "ph": "f", "id": 207869, "pid": 76337, "tid": -914061504, "ts": 1716454224714364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224714369, "dur": 0, "args": { "External id": 207871, "cbid": 251, "correlation": 207871 } }, { "ph": "f", "id": 207871, "pid": 76337, "tid": -914061504, "ts": 1716454224714369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224762000, "dur": 545, "args": { "External id": 207872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207872, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 207872, "pid": 5, "tid": 7, "ts": 1716454224762000, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714373, "dur": 13, "args": { "External id": 207872, "cbid": 211, "correlation": 207872 } }, { "ph": "s", "id": 207872, "pid": 76337, "tid": -914061504, "ts": 1716454224714373, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224762546, "dur": 127, "args": { "External id": 207880, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207880, "pid": 5, "tid": 7, "ts": 1716454224762546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714438, "dur": 12, "args": { "External id": 207880, "cbid": 211, "correlation": 207880 } }, { "ph": "s", "id": 207880, "pid": 76337, "tid": -914061504, "ts": 1716454224714438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224762674, "dur": 127, "args": { "External id": 207888, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207888, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207888, "pid": 5, "tid": 7, "ts": 1716454224762674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714469, "dur": 9, "args": { "External id": 207888, "cbid": 211, "correlation": 207888 } }, { "ph": "s", "id": 207888, "pid": 76337, "tid": -914061504, "ts": 1716454224714469, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224714547, "dur": 1, "args": { "External id": 207904, "cbid": 251, "correlation": 207904 } }, { "ph": "f", "id": 207904, "pid": 76337, "tid": -914061504, "ts": 1716454224714547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224762802, "dur": 306, "args": { "External id": 207906, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207906, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207906, "pid": 5, "tid": 7, "ts": 1716454224762802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714553, "dur": 12, "args": { "External id": 207906, "cbid": 211, "correlation": 207906 } }, { "ph": "s", "id": 207906, "pid": 76337, "tid": -914061504, "ts": 1716454224714553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224763110, "dur": 27, "args": { "External id": 207914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207914, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207914, "pid": 5, "tid": 7, "ts": 1716454224763110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714595, "dur": 9, "args": { "External id": 207914, "cbid": 211, "correlation": 207914 } }, { "ph": "s", "id": 207914, "pid": 76337, "tid": -914061504, "ts": 1716454224714595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224763138, "dur": 82, "args": { "External id": 207925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207925, "pid": 5, "tid": 7, "ts": 1716454224763138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714663, "dur": 13, "args": { "External id": 207925, "cbid": 211, "correlation": 207925 } }, { "ph": "s", "id": 207925, "pid": 76337, "tid": -914061504, "ts": 1716454224714663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224714728, "dur": 0, "args": { "External id": 207937, "cbid": 317, "correlation": 207937 } }, { "ph": "f", "id": 207937, "pid": 76337, "tid": -914061504, "ts": 1716454224714728, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224714729, "dur": 0, "args": { "External id": 207938, "cbid": 203, "correlation": 207938 } }, { "ph": "f", "id": 207938, "pid": 76337, "tid": -914061504, "ts": 1716454224714729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224714730, "dur": 0, "args": { "External id": 207939, "cbid": 205, "correlation": 207939 } }, { "ph": "f", "id": 207939, "pid": 76337, "tid": -914061504, "ts": 1716454224714730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224763222, "dur": 22, "args": { "External id": 207943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207943, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207943, "pid": 5, "tid": 7, "ts": 1716454224763222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714744, "dur": 12, "args": { "External id": 207943, "cbid": 211, "correlation": 207943 } }, { "ph": "s", "id": 207943, "pid": 76337, "tid": -914061504, "ts": 1716454224714744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224763245, "dur": 121, "args": { "External id": 207945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207945, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 207945, "pid": 5, "tid": 7, "ts": 1716454224763245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714763, "dur": 8, "args": { "External id": 207945, "cbid": 211, "correlation": 207945 } }, { "ph": "s", "id": 207945, "pid": 76337, "tid": -914061504, "ts": 1716454224714763, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224763368, "dur": 22, "args": { "External id": 207947, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207947, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207947, "pid": 5, "tid": 7, "ts": 1716454224763368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714775, "dur": 5, "args": { "External id": 207947, "cbid": 211, "correlation": 207947 } }, { "ph": "s", "id": 207947, "pid": 76337, "tid": -914061504, "ts": 1716454224714775, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224763391, "dur": 33, "args": { "External id": 207953, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207953, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207953, "pid": 5, "tid": 7, "ts": 1716454224763391, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714803, "dur": 8, "args": { "External id": 207953, "cbid": 211, "correlation": 207953 } }, { "ph": "s", "id": 207953, "pid": 76337, "tid": -914061504, "ts": 1716454224714803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224763425, "dur": 26, "args": { "External id": 207961, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207961, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207961, "pid": 5, "tid": 7, "ts": 1716454224763425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714834, "dur": 8, "args": { "External id": 207961, "cbid": 211, "correlation": 207961 } }, { "ph": "s", "id": 207961, "pid": 76337, "tid": -914061504, "ts": 1716454224714834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224763453, "dur": 30, "args": { "External id": 207981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207981, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 207981, "pid": 5, "tid": 7, "ts": 1716454224763453, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714906, "dur": 12, "args": { "External id": 207981, "cbid": 211, "correlation": 207981 } }, { "ph": "s", "id": 207981, "pid": 76337, "tid": -914061504, "ts": 1716454224714906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224763484, "dur": 5, "args": { "External id": 207993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207993, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 207993, "pid": 5, "tid": 7, "ts": 1716454224763484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714929, "dur": 6, "args": { "External id": 207993, "cbid": 211, "correlation": 207993 } }, { "ph": "s", "id": 207993, "pid": 76337, "tid": -914061504, "ts": 1716454224714929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224763491, "dur": 31, "args": { "External id": 207996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 207996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 207996, "pid": 5, "tid": 7, "ts": 1716454224763491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714947, "dur": 7, "args": { "External id": 207996, "cbid": 211, "correlation": 207996 } }, { "ph": "s", "id": 207996, "pid": 76337, "tid": -914061504, "ts": 1716454224714947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224763523, "dur": 22, "args": { "External id": 208005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208005, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208005, "pid": 5, "tid": 7, "ts": 1716454224763523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224714994, "dur": 10, "args": { "External id": 208005, "cbid": 211, "correlation": 208005 } }, { "ph": "s", "id": 208005, "pid": 76337, "tid": -914061504, "ts": 1716454224714994, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224715047, "dur": 0, "args": { "External id": 208015, "cbid": 317, "correlation": 208015 } }, { "ph": "f", "id": 208015, "pid": 76337, "tid": -914061504, "ts": 1716454224715047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224715048, "dur": 0, "args": { "External id": 208016, "cbid": 203, "correlation": 208016 } }, { "ph": "f", "id": 208016, "pid": 76337, "tid": -914061504, "ts": 1716454224715048, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224715049, "dur": 0, "args": { "External id": 208017, "cbid": 205, "correlation": 208017 } }, { "ph": "f", "id": 208017, "pid": 76337, "tid": -914061504, "ts": 1716454224715049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224763546, "dur": 22, "args": { "External id": 208021, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208021, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208021, "pid": 5, "tid": 7, "ts": 1716454224763546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715061, "dur": 13, "args": { "External id": 208021, "cbid": 211, "correlation": 208021 } }, { "ph": "s", "id": 208021, "pid": 76337, "tid": -914061504, "ts": 1716454224715061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224763569, "dur": 45, "args": { "External id": 208023, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208023, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208023, "pid": 5, "tid": 7, "ts": 1716454224763569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715077, "dur": 5, "args": { "External id": 208023, "cbid": 211, "correlation": 208023 } }, { "ph": "s", "id": 208023, "pid": 76337, "tid": -914061504, "ts": 1716454224715077, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224763615, "dur": 658, "args": { "External id": 208025, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208025, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208025, "pid": 5, "tid": 7, "ts": 1716454224763615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715088, "dur": 6, "args": { "External id": 208025, "cbid": 211, "correlation": 208025 } }, { "ph": "s", "id": 208025, "pid": 76337, "tid": -914061504, "ts": 1716454224715088, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224764274, "dur": 21, "args": { "External id": 208027, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208027, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208027, "pid": 5, "tid": 7, "ts": 1716454224764274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715097, "dur": 5, "args": { "External id": 208027, "cbid": 211, "correlation": 208027 } }, { "ph": "s", "id": 208027, "pid": 76337, "tid": -914061504, "ts": 1716454224715097, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224764297, "dur": 32, "args": { "External id": 208033, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208033, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208033, "pid": 5, "tid": 7, "ts": 1716454224764297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715125, "dur": 9, "args": { "External id": 208033, "cbid": 211, "correlation": 208033 } }, { "ph": "s", "id": 208033, "pid": 76337, "tid": -914061504, "ts": 1716454224715125, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224764331, "dur": 3, "args": { "External id": 208041, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208041, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 208041, "pid": 5, "tid": 7, "ts": 1716454224764331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715168, "dur": 10, "args": { "External id": 208041, "cbid": 211, "correlation": 208041 } }, { "ph": "s", "id": 208041, "pid": 76337, "tid": -914061504, "ts": 1716454224715168, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224715234, "dur": 1, "args": { "External id": 208057, "cbid": 251, "correlation": 208057 } }, { "ph": "f", "id": 208057, "pid": 76337, "tid": -914061504, "ts": 1716454224715234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224715240, "dur": 0, "args": { "External id": 208059, "cbid": 251, "correlation": 208059 } }, { "ph": "f", "id": 208059, "pid": 76337, "tid": -914061504, "ts": 1716454224715240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224764335, "dur": 12, "args": { "External id": 208060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208060, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 208060, "pid": 5, "tid": 7, "ts": 1716454224764335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715242, "dur": 11, "args": { "External id": 208060, "cbid": 211, "correlation": 208060 } }, { "ph": "s", "id": 208060, "pid": 76337, "tid": -914061504, "ts": 1716454224715242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224764349, "dur": 5, "args": { "External id": 208062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208062, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 208062, "pid": 5, "tid": 7, "ts": 1716454224764349, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715255, "dur": 5, "args": { "External id": 208062, "cbid": 211, "correlation": 208062 } }, { "ph": "s", "id": 208062, "pid": 76337, "tid": -914061504, "ts": 1716454224715255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224764355, "dur": 30, "args": { "External id": 208072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208072, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208072, "pid": 5, "tid": 7, "ts": 1716454224764355, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715313, "dur": 13, "args": { "External id": 208072, "cbid": 211, "correlation": 208072 } }, { "ph": "s", "id": 208072, "pid": 76337, "tid": -914061504, "ts": 1716454224715313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224764386, "dur": 30, "args": { "External id": 208092, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208092, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 208092, "pid": 5, "tid": 7, "ts": 1716454224764386, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715381, "dur": 10, "args": { "External id": 208092, "cbid": 211, "correlation": 208092 } }, { "ph": "s", "id": 208092, "pid": 76337, "tid": -914061504, "ts": 1716454224715381, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224764418, "dur": 4, "args": { "External id": 208104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208104, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 208104, "pid": 5, "tid": 7, "ts": 1716454224764418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715400, "dur": 6, "args": { "External id": 208104, "cbid": 211, "correlation": 208104 } }, { "ph": "s", "id": 208104, "pid": 76337, "tid": -914061504, "ts": 1716454224715400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224764423, "dur": 30, "args": { "External id": 208107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208107, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208107, "pid": 5, "tid": 7, "ts": 1716454224764423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715421, "dur": 6, "args": { "External id": 208107, "cbid": 211, "correlation": 208107 } }, { "ph": "s", "id": 208107, "pid": 76337, "tid": -914061504, "ts": 1716454224715421, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224764455, "dur": 20, "args": { "External id": 208116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208116, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208116, "pid": 5, "tid": 7, "ts": 1716454224764455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715460, "dur": 10, "args": { "External id": 208116, "cbid": 211, "correlation": 208116 } }, { "ph": "s", "id": 208116, "pid": 76337, "tid": -914061504, "ts": 1716454224715460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224715524, "dur": 0, "args": { "External id": 208126, "cbid": 317, "correlation": 208126 } }, { "ph": "f", "id": 208126, "pid": 76337, "tid": -914061504, "ts": 1716454224715524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224715525, "dur": 0, "args": { "External id": 208127, "cbid": 203, "correlation": 208127 } }, { "ph": "f", "id": 208127, "pid": 76337, "tid": -914061504, "ts": 1716454224715525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224715525, "dur": 0, "args": { "External id": 208128, "cbid": 205, "correlation": 208128 } }, { "ph": "f", "id": 208128, "pid": 76337, "tid": -914061504, "ts": 1716454224715525, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224764476, "dur": 22, "args": { "External id": 208132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208132, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208132, "pid": 5, "tid": 7, "ts": 1716454224764476, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715539, "dur": 12, "args": { "External id": 208132, "cbid": 211, "correlation": 208132 } }, { "ph": "s", "id": 208132, "pid": 76337, "tid": -914061504, "ts": 1716454224715539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224764500, "dur": 44, "args": { "External id": 208134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208134, "pid": 5, "tid": 7, "ts": 1716454224764500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715553, "dur": 5, "args": { "External id": 208134, "cbid": 211, "correlation": 208134 } }, { "ph": "s", "id": 208134, "pid": 76337, "tid": -914061504, "ts": 1716454224715553, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224764546, "dur": 650, "args": { "External id": 208136, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208136, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208136, "pid": 5, "tid": 7, "ts": 1716454224764546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715564, "dur": 7, "args": { "External id": 208136, "cbid": 211, "correlation": 208136 } }, { "ph": "s", "id": 208136, "pid": 76337, "tid": -914061504, "ts": 1716454224715564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224765197, "dur": 22, "args": { "External id": 208138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208138, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208138, "pid": 5, "tid": 7, "ts": 1716454224765197, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715574, "dur": 5, "args": { "External id": 208138, "cbid": 211, "correlation": 208138 } }, { "ph": "s", "id": 208138, "pid": 76337, "tid": -914061504, "ts": 1716454224715574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224765220, "dur": 33, "args": { "External id": 208144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208144, "pid": 5, "tid": 7, "ts": 1716454224765220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715602, "dur": 8, "args": { "External id": 208144, "cbid": 211, "correlation": 208144 } }, { "ph": "s", "id": 208144, "pid": 76337, "tid": -914061504, "ts": 1716454224715602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224765255, "dur": 27, "args": { "External id": 208152, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208152, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208152, "pid": 5, "tid": 7, "ts": 1716454224765255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715635, "dur": 8, "args": { "External id": 208152, "cbid": 211, "correlation": 208152 } }, { "ph": "s", "id": 208152, "pid": 76337, "tid": -914061504, "ts": 1716454224715635, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224765283, "dur": 20, "args": { "External id": 208160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208160, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208160, "pid": 5, "tid": 7, "ts": 1716454224765283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715664, "dur": 8, "args": { "External id": 208160, "cbid": 211, "correlation": 208160 } }, { "ph": "s", "id": 208160, "pid": 76337, "tid": -914061504, "ts": 1716454224715664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224765304, "dur": 30, "args": { "External id": 208180, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208180, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 208180, "pid": 5, "tid": 7, "ts": 1716454224765304, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715743, "dur": 12, "args": { "External id": 208180, "cbid": 211, "correlation": 208180 } }, { "ph": "s", "id": 208180, "pid": 76337, "tid": -914061504, "ts": 1716454224715743, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224765335, "dur": 4, "args": { "External id": 208192, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208192, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 208192, "pid": 5, "tid": 7, "ts": 1716454224765335, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715765, "dur": 7, "args": { "External id": 208192, "cbid": 211, "correlation": 208192 } }, { "ph": "s", "id": 208192, "pid": 76337, "tid": -914061504, "ts": 1716454224715765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224765340, "dur": 30, "args": { "External id": 208195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208195, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208195, "pid": 5, "tid": 7, "ts": 1716454224765340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715783, "dur": 6, "args": { "External id": 208195, "cbid": 211, "correlation": 208195 } }, { "ph": "s", "id": 208195, "pid": 76337, "tid": -914061504, "ts": 1716454224715783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224715841, "dur": 0, "args": { "External id": 208206, "cbid": 317, "correlation": 208206 } }, { "ph": "f", "id": 208206, "pid": 76337, "tid": -914061504, "ts": 1716454224715841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224715842, "dur": 0, "args": { "External id": 208207, "cbid": 203, "correlation": 208207 } }, { "ph": "f", "id": 208207, "pid": 76337, "tid": -914061504, "ts": 1716454224715842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224715843, "dur": 0, "args": { "External id": 208208, "cbid": 205, "correlation": 208208 } }, { "ph": "f", "id": 208208, "pid": 76337, "tid": -914061504, "ts": 1716454224715843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224765372, "dur": 22, "args": { "External id": 208212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208212, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208212, "pid": 5, "tid": 7, "ts": 1716454224765372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715855, "dur": 12, "args": { "External id": 208212, "cbid": 211, "correlation": 208212 } }, { "ph": "s", "id": 208212, "pid": 76337, "tid": -914061504, "ts": 1716454224715855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224765395, "dur": 117, "args": { "External id": 208214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208214, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208214, "pid": 5, "tid": 7, "ts": 1716454224765395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715874, "dur": 6, "args": { "External id": 208214, "cbid": 211, "correlation": 208214 } }, { "ph": "s", "id": 208214, "pid": 76337, "tid": -914061504, "ts": 1716454224715874, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224765513, "dur": 21, "args": { "External id": 208216, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208216, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208216, "pid": 5, "tid": 7, "ts": 1716454224765513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715883, "dur": 5, "args": { "External id": 208216, "cbid": 211, "correlation": 208216 } }, { "ph": "s", "id": 208216, "pid": 76337, "tid": -914061504, "ts": 1716454224715883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224765535, "dur": 32, "args": { "External id": 208222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208222, "pid": 5, "tid": 7, "ts": 1716454224765535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224715910, "dur": 9, "args": { "External id": 208222, "cbid": 211, "correlation": 208222 } }, { "ph": "s", "id": 208222, "pid": 76337, "tid": -914061504, "ts": 1716454224715910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224765569, "dur": 199, "args": { "External id": 208231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208231, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208231, "pid": 5, "tid": 7, "ts": 1716454224765569, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716000, "dur": 14, "args": { "External id": 208231, "cbid": 211, "correlation": 208231 } }, { "ph": "s", "id": 208231, "pid": 76337, "tid": -914061504, "ts": 1716454224716000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224765769, "dur": 66, "args": { "External id": 208253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208253, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208253, "pid": 5, "tid": 7, "ts": 1716454224765769, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716058, "dur": 10, "args": { "External id": 208253, "cbid": 211, "correlation": 208253 } }, { "ph": "s", "id": 208253, "pid": 76337, "tid": -914061504, "ts": 1716454224716058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716146, "dur": 1, "args": { "External id": 208264, "cbid": 251, "correlation": 208264 } }, { "ph": "f", "id": 208264, "pid": 76337, "tid": -914061504, "ts": 1716454224716146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224765836, "dur": 154, "args": { "External id": 208265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208265, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208265, "pid": 5, "tid": 7, "ts": 1716454224765836, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716152, "dur": 13, "args": { "External id": 208265, "cbid": 211, "correlation": 208265 } }, { "ph": "s", "id": 208265, "pid": 76337, "tid": -914061504, "ts": 1716454224716152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716222, "dur": 1, "args": { "External id": 208276, "cbid": 251, "correlation": 208276 } }, { "ph": "f", "id": 208276, "pid": 76337, "tid": -914061504, "ts": 1716454224716222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224765992, "dur": 149, "args": { "External id": 208277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208277, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208277, "pid": 5, "tid": 7, "ts": 1716454224765992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716226, "dur": 11, "args": { "External id": 208277, "cbid": 211, "correlation": 208277 } }, { "ph": "s", "id": 208277, "pid": 76337, "tid": -914061504, "ts": 1716454224716226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716292, "dur": 1, "args": { "External id": 208288, "cbid": 251, "correlation": 208288 } }, { "ph": "f", "id": 208288, "pid": 76337, "tid": -914061504, "ts": 1716454224716292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224766142, "dur": 145, "args": { "External id": 208289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208289, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208289, "pid": 5, "tid": 7, "ts": 1716454224766142, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716295, "dur": 10, "args": { "External id": 208289, "cbid": 211, "correlation": 208289 } }, { "ph": "s", "id": 208289, "pid": 76337, "tid": -914061504, "ts": 1716454224716295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224766289, "dur": 1972, "args": { "External id": 208310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208310, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 208310, "pid": 5, "tid": 7, "ts": 1716454224766289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716377, "dur": 13, "args": { "External id": 208310, "cbid": 211, "correlation": 208310 } }, { "ph": "s", "id": 208310, "pid": 76337, "tid": -914061504, "ts": 1716454224716377, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716474, "dur": 1, "args": { "External id": 208328, "cbid": 251, "correlation": 208328 } }, { "ph": "f", "id": 208328, "pid": 76337, "tid": -914061504, "ts": 1716454224716474, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224768262, "dur": 148, "args": { "External id": 208330, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208330, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 208330, "pid": 5, "tid": 7, "ts": 1716454224768262, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716480, "dur": 13, "args": { "External id": 208330, "cbid": 211, "correlation": 208330 } }, { "ph": "s", "id": 208330, "pid": 76337, "tid": -914061504, "ts": 1716454224716480, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224768412, "dur": 35, "args": { "External id": 208338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208338, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208338, "pid": 5, "tid": 7, "ts": 1716454224768412, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716550, "dur": 13, "args": { "External id": 208338, "cbid": 211, "correlation": 208338 } }, { "ph": "s", "id": 208338, "pid": 76337, "tid": -914061504, "ts": 1716454224716550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224768448, "dur": 50, "args": { "External id": 208346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208346, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208346, "pid": 5, "tid": 7, "ts": 1716454224768448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716591, "dur": 9, "args": { "External id": 208346, "cbid": 211, "correlation": 208346 } }, { "ph": "s", "id": 208346, "pid": 76337, "tid": -914061504, "ts": 1716454224716591, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224768499, "dur": 30, "args": { "External id": 208357, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208357, "pid": 5, "tid": 7, "ts": 1716454224768499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716663, "dur": 13, "args": { "External id": 208357, "cbid": 211, "correlation": 208357 } }, { "ph": "s", "id": 208357, "pid": 76337, "tid": -914061504, "ts": 1716454224716663, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224768530, "dur": 34, "args": { "External id": 208379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208379, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208379, "pid": 5, "tid": 7, "ts": 1716454224768530, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716693, "dur": 7, "args": { "External id": 208379, "cbid": 211, "correlation": 208379 } }, { "ph": "s", "id": 208379, "pid": 76337, "tid": -914061504, "ts": 1716454224716693, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716777, "dur": 1, "args": { "External id": 208390, "cbid": 251, "correlation": 208390 } }, { "ph": "f", "id": 208390, "pid": 76337, "tid": -914061504, "ts": 1716454224716777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224768566, "dur": 89, "args": { "External id": 208391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208391, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208391, "pid": 5, "tid": 7, "ts": 1716454224768566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716782, "dur": 13, "args": { "External id": 208391, "cbid": 211, "correlation": 208391 } }, { "ph": "s", "id": 208391, "pid": 76337, "tid": -914061504, "ts": 1716454224716782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716851, "dur": 1, "args": { "External id": 208402, "cbid": 251, "correlation": 208402 } }, { "ph": "f", "id": 208402, "pid": 76337, "tid": -914061504, "ts": 1716454224716851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716854, "dur": 0, "args": { "External id": 208403, "cbid": 251, "correlation": 208403 } }, { "ph": "f", "id": 208403, "pid": 76337, "tid": -914061504, "ts": 1716454224716854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224768656, "dur": 12, "args": { "External id": 208404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208404, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 208404, "pid": 5, "tid": 7, "ts": 1716454224768656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716856, "dur": 13, "args": { "External id": 208404, "cbid": 211, "correlation": 208404 } }, { "ph": "s", "id": 208404, "pid": 76337, "tid": -914061504, "ts": 1716454224716856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224768669, "dur": 5, "args": { "External id": 208406, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208406, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 208406, "pid": 5, "tid": 7, "ts": 1716454224768669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716870, "dur": 6, "args": { "External id": 208406, "cbid": 211, "correlation": 208406 } }, { "ph": "s", "id": 208406, "pid": 76337, "tid": -914061504, "ts": 1716454224716870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716927, "dur": 1, "args": { "External id": 208417, "cbid": 251, "correlation": 208417 } }, { "ph": "f", "id": 208417, "pid": 76337, "tid": -914061504, "ts": 1716454224716927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224716931, "dur": 0, "args": { "External id": 208418, "cbid": 251, "correlation": 208418 } }, { "ph": "f", "id": 208418, "pid": 76337, "tid": -914061504, "ts": 1716454224716931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224768675, "dur": 7, "args": { "External id": 208419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208419, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 208419, "pid": 5, "tid": 7, "ts": 1716454224768675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716932, "dur": 12, "args": { "External id": 208419, "cbid": 211, "correlation": 208419 } }, { "ph": "s", "id": 208419, "pid": 76337, "tid": -914061504, "ts": 1716454224716932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224768684, "dur": 3, "args": { "External id": 208421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208421, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 208421, "pid": 5, "tid": 7, "ts": 1716454224768684, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224716945, "dur": 5, "args": { "External id": 208421, "cbid": 211, "correlation": 208421 } }, { "ph": "s", "id": 208421, "pid": 76337, "tid": -914061504, "ts": 1716454224716945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224768688, "dur": 94, "args": { "External id": 208442, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208442, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 208442, "pid": 5, "tid": 7, "ts": 1716454224768688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717030, "dur": 13, "args": { "External id": 208442, "cbid": 211, "correlation": 208442 } }, { "ph": "s", "id": 208442, "pid": 76337, "tid": -914061504, "ts": 1716454224717030, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224717128, "dur": 1, "args": { "External id": 208460, "cbid": 251, "correlation": 208460 } }, { "ph": "f", "id": 208460, "pid": 76337, "tid": -914061504, "ts": 1716454224717128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224768783, "dur": 99, "args": { "External id": 208462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208462, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208462, "pid": 5, "tid": 7, "ts": 1716454224768783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717133, "dur": 13, "args": { "External id": 208462, "cbid": 211, "correlation": 208462 } }, { "ph": "s", "id": 208462, "pid": 76337, "tid": -914061504, "ts": 1716454224717133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224768884, "dur": 19, "args": { "External id": 208470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208470, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208470, "pid": 5, "tid": 7, "ts": 1716454224768884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717202, "dur": 13, "args": { "External id": 208470, "cbid": 211, "correlation": 208470 } }, { "ph": "s", "id": 208470, "pid": 76337, "tid": -914061504, "ts": 1716454224717202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224768904, "dur": 37, "args": { "External id": 208478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208478, "pid": 5, "tid": 7, "ts": 1716454224768904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717244, "dur": 9, "args": { "External id": 208478, "cbid": 211, "correlation": 208478 } }, { "ph": "s", "id": 208478, "pid": 76337, "tid": -914061504, "ts": 1716454224717244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224768942, "dur": 35, "args": { "External id": 208500, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208500, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208500, "pid": 5, "tid": 7, "ts": 1716454224768942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717295, "dur": 10, "args": { "External id": 208500, "cbid": 211, "correlation": 208500 } }, { "ph": "s", "id": 208500, "pid": 76337, "tid": -914061504, "ts": 1716454224717295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224717384, "dur": 1, "args": { "External id": 208516, "cbid": 251, "correlation": 208516 } }, { "ph": "f", "id": 208516, "pid": 76337, "tid": -914061504, "ts": 1716454224717384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224717389, "dur": 0, "args": { "External id": 208518, "cbid": 251, "correlation": 208518 } }, { "ph": "f", "id": 208518, "pid": 76337, "tid": -914061504, "ts": 1716454224717389, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224768979, "dur": 543, "args": { "External id": 208519, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208519, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 208519, "pid": 5, "tid": 7, "ts": 1716454224768979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717392, "dur": 12, "args": { "External id": 208519, "cbid": 211, "correlation": 208519 } }, { "ph": "s", "id": 208519, "pid": 76337, "tid": -914061504, "ts": 1716454224717392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224769523, "dur": 124, "args": { "External id": 208527, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208527, "pid": 5, "tid": 7, "ts": 1716454224769523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717456, "dur": 13, "args": { "External id": 208527, "cbid": 211, "correlation": 208527 } }, { "ph": "s", "id": 208527, "pid": 76337, "tid": -914061504, "ts": 1716454224717456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224769648, "dur": 126, "args": { "External id": 208535, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208535, "pid": 5, "tid": 7, "ts": 1716454224769648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717487, "dur": 8, "args": { "External id": 208535, "cbid": 211, "correlation": 208535 } }, { "ph": "s", "id": 208535, "pid": 76337, "tid": -914061504, "ts": 1716454224717487, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224717563, "dur": 1, "args": { "External id": 208551, "cbid": 251, "correlation": 208551 } }, { "ph": "f", "id": 208551, "pid": 76337, "tid": -914061504, "ts": 1716454224717563, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224769775, "dur": 307, "args": { "External id": 208553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208553, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208553, "pid": 5, "tid": 7, "ts": 1716454224769775, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717568, "dur": 12, "args": { "External id": 208553, "cbid": 211, "correlation": 208553 } }, { "ph": "s", "id": 208553, "pid": 76337, "tid": -914061504, "ts": 1716454224717568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224770084, "dur": 27, "args": { "External id": 208561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208561, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208561, "pid": 5, "tid": 7, "ts": 1716454224770084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717610, "dur": 10, "args": { "External id": 208561, "cbid": 211, "correlation": 208561 } }, { "ph": "s", "id": 208561, "pid": 76337, "tid": -914061504, "ts": 1716454224717610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224770112, "dur": 82, "args": { "External id": 208572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208572, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208572, "pid": 5, "tid": 7, "ts": 1716454224770112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717679, "dur": 12, "args": { "External id": 208572, "cbid": 211, "correlation": 208572 } }, { "ph": "s", "id": 208572, "pid": 76337, "tid": -914061504, "ts": 1716454224717679, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224717741, "dur": 0, "args": { "External id": 208584, "cbid": 317, "correlation": 208584 } }, { "ph": "f", "id": 208584, "pid": 76337, "tid": -914061504, "ts": 1716454224717741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224717742, "dur": 0, "args": { "External id": 208585, "cbid": 203, "correlation": 208585 } }, { "ph": "f", "id": 208585, "pid": 76337, "tid": -914061504, "ts": 1716454224717742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224717742, "dur": 0, "args": { "External id": 208586, "cbid": 205, "correlation": 208586 } }, { "ph": "f", "id": 208586, "pid": 76337, "tid": -914061504, "ts": 1716454224717742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770196, "dur": 23, "args": { "External id": 208590, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208590, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208590, "pid": 5, "tid": 7, "ts": 1716454224770196, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717758, "dur": 12, "args": { "External id": 208590, "cbid": 211, "correlation": 208590 } }, { "ph": "s", "id": 208590, "pid": 76337, "tid": -914061504, "ts": 1716454224717758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224770219, "dur": 122, "args": { "External id": 208592, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208592, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208592, "pid": 5, "tid": 7, "ts": 1716454224770219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717777, "dur": 6, "args": { "External id": 208592, "cbid": 211, "correlation": 208592 } }, { "ph": "s", "id": 208592, "pid": 76337, "tid": -914061504, "ts": 1716454224717777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770343, "dur": 21, "args": { "External id": 208594, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208594, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208594, "pid": 5, "tid": 7, "ts": 1716454224770343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717787, "dur": 5, "args": { "External id": 208594, "cbid": 211, "correlation": 208594 } }, { "ph": "s", "id": 208594, "pid": 76337, "tid": -914061504, "ts": 1716454224717787, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224770365, "dur": 33, "args": { "External id": 208600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208600, "pid": 5, "tid": 7, "ts": 1716454224770365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717815, "dur": 8, "args": { "External id": 208600, "cbid": 211, "correlation": 208600 } }, { "ph": "s", "id": 208600, "pid": 76337, "tid": -914061504, "ts": 1716454224717815, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224770399, "dur": 27, "args": { "External id": 208608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208608, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208608, "pid": 5, "tid": 7, "ts": 1716454224770399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717846, "dur": 8, "args": { "External id": 208608, "cbid": 211, "correlation": 208608 } }, { "ph": "s", "id": 208608, "pid": 76337, "tid": -914061504, "ts": 1716454224717846, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224717917, "dur": 0, "args": { "External id": 208618, "cbid": 317, "correlation": 208618 } }, { "ph": "f", "id": 208618, "pid": 76337, "tid": -914061504, "ts": 1716454224717917, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224717918, "dur": 0, "args": { "External id": 208619, "cbid": 203, "correlation": 208619 } }, { "ph": "f", "id": 208619, "pid": 76337, "tid": -914061504, "ts": 1716454224717918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224717918, "dur": 0, "args": { "External id": 208620, "cbid": 205, "correlation": 208620 } }, { "ph": "f", "id": 208620, "pid": 76337, "tid": -914061504, "ts": 1716454224717918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770427, "dur": 23, "args": { "External id": 208624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208624, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208624, "pid": 5, "tid": 7, "ts": 1716454224770427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717933, "dur": 12, "args": { "External id": 208624, "cbid": 211, "correlation": 208624 } }, { "ph": "s", "id": 208624, "pid": 76337, "tid": -914061504, "ts": 1716454224717933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770452, "dur": 46, "args": { "External id": 208626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208626, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208626, "pid": 5, "tid": 7, "ts": 1716454224770452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717947, "dur": 5, "args": { "External id": 208626, "cbid": 211, "correlation": 208626 } }, { "ph": "s", "id": 208626, "pid": 76337, "tid": -914061504, "ts": 1716454224717947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224770498, "dur": 236, "args": { "External id": 208628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208628, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 0.75, "warps per SM": 3, "grid": [5, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 208628, "pid": 5, "tid": 7, "ts": 1716454224770498, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717959, "dur": 6, "args": { "External id": 208628, "cbid": 211, "correlation": 208628 } }, { "ph": "s", "id": 208628, "pid": 76337, "tid": -914061504, "ts": 1716454224717959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770736, "dur": 7, "args": { "External id": 208630, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208630, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208630, "pid": 5, "tid": 7, "ts": 1716454224770736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224717969, "dur": 14, "args": { "External id": 208630, "cbid": 211, "correlation": 208630 } }, { "ph": "s", "id": 208630, "pid": 76337, "tid": -914061504, "ts": 1716454224717969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224770744, "dur": 9, "args": { "External id": 208636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208636, "pid": 5, "tid": 7, "ts": 1716454224770744, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718006, "dur": 9, "args": { "External id": 208636, "cbid": 211, "correlation": 208636 } }, { "ph": "s", "id": 208636, "pid": 76337, "tid": -914061504, "ts": 1716454224718006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224770754, "dur": 12, "args": { "External id": 208656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208656, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 208656, "pid": 5, "tid": 7, "ts": 1716454224770754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718098, "dur": 13, "args": { "External id": 208656, "cbid": 211, "correlation": 208656 } }, { "ph": "s", "id": 208656, "pid": 76337, "tid": -914061504, "ts": 1716454224718098, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224770768, "dur": 4, "args": { "External id": 208668, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208668, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 208668, "pid": 5, "tid": 7, "ts": 1716454224770768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718121, "dur": 6, "args": { "External id": 208668, "cbid": 211, "correlation": 208668 } }, { "ph": "s", "id": 208668, "pid": 76337, "tid": -914061504, "ts": 1716454224718121, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224770773, "dur": 13, "args": { "External id": 208671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208671, "pid": 5, "tid": 7, "ts": 1716454224770773, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718138, "dur": 6, "args": { "External id": 208671, "cbid": 211, "correlation": 208671 } }, { "ph": "s", "id": 208671, "pid": 76337, "tid": -914061504, "ts": 1716454224718138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224770787, "dur": 7, "args": { "External id": 208680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208680, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208680, "pid": 5, "tid": 7, "ts": 1716454224770787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718178, "dur": 10, "args": { "External id": 208680, "cbid": 211, "correlation": 208680 } }, { "ph": "s", "id": 208680, "pid": 76337, "tid": -914061504, "ts": 1716454224718178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224718230, "dur": 0, "args": { "External id": 208690, "cbid": 317, "correlation": 208690 } }, { "ph": "f", "id": 208690, "pid": 76337, "tid": -914061504, "ts": 1716454224718230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224718231, "dur": 0, "args": { "External id": 208691, "cbid": 203, "correlation": 208691 } }, { "ph": "f", "id": 208691, "pid": 76337, "tid": -914061504, "ts": 1716454224718231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224718232, "dur": 0, "args": { "External id": 208692, "cbid": 205, "correlation": 208692 } }, { "ph": "f", "id": 208692, "pid": 76337, "tid": -914061504, "ts": 1716454224718232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770795, "dur": 6, "args": { "External id": 208696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208696, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208696, "pid": 5, "tid": 7, "ts": 1716454224770795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718248, "dur": 12, "args": { "External id": 208696, "cbid": 211, "correlation": 208696 } }, { "ph": "s", "id": 208696, "pid": 76337, "tid": -914061504, "ts": 1716454224718248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224770802, "dur": 84, "args": { "External id": 208698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208698, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 20, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208698, "pid": 5, "tid": 7, "ts": 1716454224770802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718263, "dur": 5, "args": { "External id": 208698, "cbid": 211, "correlation": 208698 } }, { "ph": "s", "id": 208698, "pid": 76337, "tid": -914061504, "ts": 1716454224718263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224770889, "dur": 1, "args": { "External id": 208700, "device": 5, "context": 1, "stream": 7, "correlation": 208700, "bytes": 960, "memory bandwidth (GB/s)": 0.6 } }, { "ph": "f", "id": 208700, "pid": 5, "tid": 7, "ts": 1716454224770889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224718275, "dur": 8, "args": { "External id": 208700, "cbid": 51, "correlation": 208700 } }, { "ph": "s", "id": 208700, "pid": 76337, "tid": -914061504, "ts": 1716454224718275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224770892, "dur": 545, "args": { "External id": 208701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208701, "registers per thread": 236, "shared memory": 18432, "blocks per SM": 30, "warps per SM": 120, "grid": [10, 12, 20], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208701, "pid": 5, "tid": 7, "ts": 1716454224770892, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718285, "dur": 8, "args": { "External id": 208701, "cbid": 211, "correlation": 208701 } }, { "ph": "s", "id": 208701, "pid": 76337, "tid": -914061504, "ts": 1716454224718285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224771439, "dur": 12, "args": { "External id": 208703, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208703, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208703, "pid": 5, "tid": 7, "ts": 1716454224771439, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718298, "dur": 5, "args": { "External id": 208703, "cbid": 211, "correlation": 208703 } }, { "ph": "s", "id": 208703, "pid": 76337, "tid": -914061504, "ts": 1716454224718298, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224771452, "dur": 15, "args": { "External id": 208709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208709, "pid": 5, "tid": 7, "ts": 1716454224771452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718325, "dur": 8, "args": { "External id": 208709, "cbid": 211, "correlation": 208709 } }, { "ph": "s", "id": 208709, "pid": 76337, "tid": -914061504, "ts": 1716454224718325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224771468, "dur": 3, "args": { "External id": 208717, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208717, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 208717, "pid": 5, "tid": 7, "ts": 1716454224771468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718369, "dur": 9, "args": { "External id": 208717, "cbid": 211, "correlation": 208717 } }, { "ph": "s", "id": 208717, "pid": 76337, "tid": -914061504, "ts": 1716454224718369, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224718432, "dur": 1, "args": { "External id": 208733, "cbid": 251, "correlation": 208733 } }, { "ph": "f", "id": 208733, "pid": 76337, "tid": -914061504, "ts": 1716454224718432, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224718437, "dur": 0, "args": { "External id": 208735, "cbid": 251, "correlation": 208735 } }, { "ph": "f", "id": 208735, "pid": 76337, "tid": -914061504, "ts": 1716454224718437, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224771473, "dur": 13, "args": { "External id": 208736, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208736, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208736, "pid": 5, "tid": 7, "ts": 1716454224771473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718439, "dur": 12, "args": { "External id": 208736, "cbid": 211, "correlation": 208736 } }, { "ph": "s", "id": 208736, "pid": 76337, "tid": -914061504, "ts": 1716454224718439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224771487, "dur": 5, "args": { "External id": 208738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208738, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208738, "pid": 5, "tid": 7, "ts": 1716454224771487, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718452, "dur": 5, "args": { "External id": 208738, "cbid": 211, "correlation": 208738 } }, { "ph": "s", "id": 208738, "pid": 76337, "tid": -914061504, "ts": 1716454224718452, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224771494, "dur": 17, "args": { "External id": 208748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208748, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208748, "pid": 5, "tid": 7, "ts": 1716454224771494, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718510, "dur": 11, "args": { "External id": 208748, "cbid": 211, "correlation": 208748 } }, { "ph": "s", "id": 208748, "pid": 76337, "tid": -914061504, "ts": 1716454224718510, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224771512, "dur": 18, "args": { "External id": 208768, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208768, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 208768, "pid": 5, "tid": 7, "ts": 1716454224771512, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718576, "dur": 11, "args": { "External id": 208768, "cbid": 211, "correlation": 208768 } }, { "ph": "s", "id": 208768, "pid": 76337, "tid": -914061504, "ts": 1716454224718576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224771531, "dur": 5, "args": { "External id": 208780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208780, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 208780, "pid": 5, "tid": 7, "ts": 1716454224771531, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718598, "dur": 6, "args": { "External id": 208780, "cbid": 211, "correlation": 208780 } }, { "ph": "s", "id": 208780, "pid": 76337, "tid": -914061504, "ts": 1716454224718598, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224771537, "dur": 17, "args": { "External id": 208783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208783, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208783, "pid": 5, "tid": 7, "ts": 1716454224771537, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718617, "dur": 6, "args": { "External id": 208783, "cbid": 211, "correlation": 208783 } }, { "ph": "s", "id": 208783, "pid": 76337, "tid": -914061504, "ts": 1716454224718617, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224771555, "dur": 11, "args": { "External id": 208792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208792, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208792, "pid": 5, "tid": 7, "ts": 1716454224771555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718657, "dur": 9, "args": { "External id": 208792, "cbid": 211, "correlation": 208792 } }, { "ph": "s", "id": 208792, "pid": 76337, "tid": -914061504, "ts": 1716454224718657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224718719, "dur": 0, "args": { "External id": 208802, "cbid": 317, "correlation": 208802 } }, { "ph": "f", "id": 208802, "pid": 76337, "tid": -914061504, "ts": 1716454224718719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224718720, "dur": 0, "args": { "External id": 208803, "cbid": 203, "correlation": 208803 } }, { "ph": "f", "id": 208803, "pid": 76337, "tid": -914061504, "ts": 1716454224718720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224718720, "dur": 0, "args": { "External id": 208804, "cbid": 205, "correlation": 208804 } }, { "ph": "f", "id": 208804, "pid": 76337, "tid": -914061504, "ts": 1716454224718720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224771567, "dur": 11, "args": { "External id": 208808, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208808, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208808, "pid": 5, "tid": 7, "ts": 1716454224771567, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718738, "dur": 13, "args": { "External id": 208808, "cbid": 211, "correlation": 208808 } }, { "ph": "s", "id": 208808, "pid": 76337, "tid": -914061504, "ts": 1716454224718738, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224771579, "dur": 165, "args": { "External id": 208810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208810, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208810, "pid": 5, "tid": 7, "ts": 1716454224771579, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718753, "dur": 5, "args": { "External id": 208810, "cbid": 211, "correlation": 208810 } }, { "ph": "s", "id": 208810, "pid": 76337, "tid": -914061504, "ts": 1716454224718753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224771746, "dur": 1, "args": { "External id": 208812, "device": 5, "context": 1, "stream": 7, "correlation": 208812, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 208812, "pid": 5, "tid": 7, "ts": 1716454224771746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224718765, "dur": 7, "args": { "External id": 208812, "cbid": 51, "correlation": 208812 } }, { "ph": "s", "id": 208812, "pid": 76337, "tid": -914061504, "ts": 1716454224718765, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224771750, "dur": 666, "args": { "External id": 208813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208813, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208813, "pid": 5, "tid": 7, "ts": 1716454224771750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718773, "dur": 6, "args": { "External id": 208813, "cbid": 211, "correlation": 208813 } }, { "ph": "s", "id": 208813, "pid": 76337, "tid": -914061504, "ts": 1716454224718773, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224772417, "dur": 12, "args": { "External id": 208815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208815, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208815, "pid": 5, "tid": 7, "ts": 1716454224772417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718783, "dur": 5, "args": { "External id": 208815, "cbid": 211, "correlation": 208815 } }, { "ph": "s", "id": 208815, "pid": 76337, "tid": -914061504, "ts": 1716454224718783, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224772431, "dur": 15, "args": { "External id": 208821, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208821, "pid": 5, "tid": 7, "ts": 1716454224772431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718812, "dur": 9, "args": { "External id": 208821, "cbid": 211, "correlation": 208821 } }, { "ph": "s", "id": 208821, "pid": 76337, "tid": -914061504, "ts": 1716454224718812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224718871, "dur": 0, "args": { "External id": 208831, "cbid": 317, "correlation": 208831 } }, { "ph": "f", "id": 208831, "pid": 76337, "tid": -914061504, "ts": 1716454224718871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224718872, "dur": 0, "args": { "External id": 208832, "cbid": 203, "correlation": 208832 } }, { "ph": "f", "id": 208832, "pid": 76337, "tid": -914061504, "ts": 1716454224718872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224718872, "dur": 0, "args": { "External id": 208833, "cbid": 205, "correlation": 208833 } }, { "ph": "f", "id": 208833, "pid": 76337, "tid": -914061504, "ts": 1716454224718872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224772447, "dur": 8, "args": { "External id": 208837, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208837, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 12, "warps per SM": 96, "grid": [6, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208837, "pid": 5, "tid": 7, "ts": 1716454224772447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718888, "dur": 13, "args": { "External id": 208837, "cbid": 211, "correlation": 208837 } }, { "ph": "s", "id": 208837, "pid": 76337, "tid": -914061504, "ts": 1716454224718888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224772457, "dur": 3, "args": { "External id": 208839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 208839, "pid": 5, "tid": 7, "ts": 1716454224772457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718907, "dur": 6, "args": { "External id": 208839, "cbid": 211, "correlation": 208839 } }, { "ph": "s", "id": 208839, "pid": 76337, "tid": -914061504, "ts": 1716454224718907, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224718916, "dur": 0, "args": { "External id": 208840, "cbid": 51, "correlation": 208840 } }, { "ph": "s", "id": 208840, "pid": 76337, "tid": -914061504, "ts": 1716454224718916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224772461, "dur": 57, "args": { "External id": 208841, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208841, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.5, "warps per SM": 6, "grid": [12, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 208841, "pid": 5, "tid": 7, "ts": 1716454224772461, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718916, "dur": 5, "args": { "External id": 208841, "cbid": 211, "correlation": 208841 } }, { "ph": "s", "id": 208841, "pid": 76337, "tid": -914061504, "ts": 1716454224718916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224772520, "dur": 14, "args": { "External id": 208846, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208846, "pid": 5, "tid": 7, "ts": 1716454224772520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718943, "dur": 8, "args": { "External id": 208846, "cbid": 211, "correlation": 208846 } }, { "ph": "s", "id": 208846, "pid": 76337, "tid": -914061504, "ts": 1716454224718943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224772536, "dur": 12, "args": { "External id": 208854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208854, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208854, "pid": 5, "tid": 7, "ts": 1716454224772536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224718971, "dur": 17, "args": { "External id": 208854, "cbid": 211, "correlation": 208854 } }, { "ph": "s", "id": 208854, "pid": 76337, "tid": -914061504, "ts": 1716454224718971, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224772549, "dur": 10, "args": { "External id": 208862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208862, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208862, "pid": 5, "tid": 7, "ts": 1716454224772549, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719011, "dur": 8, "args": { "External id": 208862, "cbid": 211, "correlation": 208862 } }, { "ph": "s", "id": 208862, "pid": 76337, "tid": -914061504, "ts": 1716454224719011, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224772561, "dur": 18, "args": { "External id": 208882, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208882, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 208882, "pid": 5, "tid": 7, "ts": 1716454224772561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719091, "dur": 12, "args": { "External id": 208882, "cbid": 211, "correlation": 208882 } }, { "ph": "s", "id": 208882, "pid": 76337, "tid": -914061504, "ts": 1716454224719091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224772580, "dur": 4, "args": { "External id": 208894, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208894, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 208894, "pid": 5, "tid": 7, "ts": 1716454224772580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719114, "dur": 6, "args": { "External id": 208894, "cbid": 211, "correlation": 208894 } }, { "ph": "s", "id": 208894, "pid": 76337, "tid": -914061504, "ts": 1716454224719114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224772586, "dur": 17, "args": { "External id": 208897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208897, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208897, "pid": 5, "tid": 7, "ts": 1716454224772586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719131, "dur": 6, "args": { "External id": 208897, "cbid": 211, "correlation": 208897 } }, { "ph": "s", "id": 208897, "pid": 76337, "tid": -914061504, "ts": 1716454224719131, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224719189, "dur": 0, "args": { "External id": 208908, "cbid": 317, "correlation": 208908 } }, { "ph": "f", "id": 208908, "pid": 76337, "tid": -914061504, "ts": 1716454224719189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224719190, "dur": 0, "args": { "External id": 208909, "cbid": 203, "correlation": 208909 } }, { "ph": "f", "id": 208909, "pid": 76337, "tid": -914061504, "ts": 1716454224719190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224719190, "dur": 0, "args": { "External id": 208910, "cbid": 205, "correlation": 208910 } }, { "ph": "f", "id": 208910, "pid": 76337, "tid": -914061504, "ts": 1716454224719190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224772604, "dur": 11, "args": { "External id": 208914, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208914, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208914, "pid": 5, "tid": 7, "ts": 1716454224772604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719207, "dur": 12, "args": { "External id": 208914, "cbid": 211, "correlation": 208914 } }, { "ph": "s", "id": 208914, "pid": 76337, "tid": -914061504, "ts": 1716454224719207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224772617, "dur": 3, "args": { "External id": 208916, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208916, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 208916, "pid": 5, "tid": 7, "ts": 1716454224772617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719223, "dur": 5, "args": { "External id": 208916, "cbid": 211, "correlation": 208916 } }, { "ph": "s", "id": 208916, "pid": 76337, "tid": -914061504, "ts": 1716454224719223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224719232, "dur": 0, "args": { "External id": 208917, "cbid": 51, "correlation": 208917 } }, { "ph": "s", "id": 208917, "pid": 76337, "tid": -914061504, "ts": 1716454224719232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224772621, "dur": 97, "args": { "External id": 208918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208918, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 208918, "pid": 5, "tid": 7, "ts": 1716454224772621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719233, "dur": 6, "args": { "External id": 208918, "cbid": 211, "correlation": 208918 } }, { "ph": "s", "id": 208918, "pid": 76337, "tid": -914061504, "ts": 1716454224719233, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224772720, "dur": 16, "args": { "External id": 208923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208923, "pid": 5, "tid": 7, "ts": 1716454224772720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719261, "dur": 8, "args": { "External id": 208923, "cbid": 211, "correlation": 208923 } }, { "ph": "s", "id": 208923, "pid": 76337, "tid": -914061504, "ts": 1716454224719261, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224772737, "dur": 85, "args": { "External id": 208932, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208932, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208932, "pid": 5, "tid": 7, "ts": 1716454224772737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719342, "dur": 14, "args": { "External id": 208932, "cbid": 211, "correlation": 208932 } }, { "ph": "s", "id": 208932, "pid": 76337, "tid": -914061504, "ts": 1716454224719342, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224772824, "dur": 31, "args": { "External id": 208954, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208954, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 208954, "pid": 5, "tid": 7, "ts": 1716454224772824, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719398, "dur": 10, "args": { "External id": 208954, "cbid": 211, "correlation": 208954 } }, { "ph": "s", "id": 208954, "pid": 76337, "tid": -914061504, "ts": 1716454224719398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224719485, "dur": 1, "args": { "External id": 208965, "cbid": 251, "correlation": 208965 } }, { "ph": "f", "id": 208965, "pid": 76337, "tid": -914061504, "ts": 1716454224719485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224772856, "dur": 166, "args": { "External id": 208966, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208966, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208966, "pid": 5, "tid": 7, "ts": 1716454224772856, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719491, "dur": 13, "args": { "External id": 208966, "cbid": 211, "correlation": 208966 } }, { "ph": "s", "id": 208966, "pid": 76337, "tid": -914061504, "ts": 1716454224719491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224719562, "dur": 1, "args": { "External id": 208977, "cbid": 251, "correlation": 208977 } }, { "ph": "f", "id": 208977, "pid": 76337, "tid": -914061504, "ts": 1716454224719562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224773023, "dur": 159, "args": { "External id": 208978, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208978, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208978, "pid": 5, "tid": 7, "ts": 1716454224773023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719566, "dur": 11, "args": { "External id": 208978, "cbid": 211, "correlation": 208978 } }, { "ph": "s", "id": 208978, "pid": 76337, "tid": -914061504, "ts": 1716454224719566, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224719630, "dur": 1, "args": { "External id": 208989, "cbid": 251, "correlation": 208989 } }, { "ph": "f", "id": 208989, "pid": 76337, "tid": -914061504, "ts": 1716454224719630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224773184, "dur": 158, "args": { "External id": 208990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 208990, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 208990, "pid": 5, "tid": 7, "ts": 1716454224773184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719634, "dur": 12, "args": { "External id": 208990, "cbid": 211, "correlation": 208990 } }, { "ph": "s", "id": 208990, "pid": 76337, "tid": -914061504, "ts": 1716454224719634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224773343, "dur": 338, "args": { "External id": 209015, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209015, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209015, "pid": 5, "tid": 7, "ts": 1716454224773343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719720, "dur": 13, "args": { "External id": 209015, "cbid": 211, "correlation": 209015 } }, { "ph": "s", "id": 209015, "pid": 76337, "tid": -914061504, "ts": 1716454224719720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224719821, "dur": 1, "args": { "External id": 209033, "cbid": 251, "correlation": 209033 } }, { "ph": "f", "id": 209033, "pid": 76337, "tid": -914061504, "ts": 1716454224719821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224773682, "dur": 167, "args": { "External id": 209035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209035, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209035, "pid": 5, "tid": 7, "ts": 1716454224773682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719827, "dur": 13, "args": { "External id": 209035, "cbid": 211, "correlation": 209035 } }, { "ph": "s", "id": 209035, "pid": 76337, "tid": -914061504, "ts": 1716454224719827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224773851, "dur": 19, "args": { "External id": 209043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209043, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209043, "pid": 5, "tid": 7, "ts": 1716454224773851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719897, "dur": 12, "args": { "External id": 209043, "cbid": 211, "correlation": 209043 } }, { "ph": "s", "id": 209043, "pid": 76337, "tid": -914061504, "ts": 1716454224719897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224773871, "dur": 27, "args": { "External id": 209051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209051, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209051, "pid": 5, "tid": 7, "ts": 1716454224773871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224719936, "dur": 9, "args": { "External id": 209051, "cbid": 211, "correlation": 209051 } }, { "ph": "s", "id": 209051, "pid": 76337, "tid": -914061504, "ts": 1716454224719936, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224773900, "dur": 18, "args": { "External id": 209062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209062, "pid": 5, "tid": 7, "ts": 1716454224773900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720017, "dur": 13, "args": { "External id": 209062, "cbid": 211, "correlation": 209062 } }, { "ph": "s", "id": 209062, "pid": 76337, "tid": -914061504, "ts": 1716454224720017, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224773919, "dur": 16, "args": { "External id": 209084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209084, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209084, "pid": 5, "tid": 7, "ts": 1716454224773919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720049, "dur": 7, "args": { "External id": 209084, "cbid": 211, "correlation": 209084 } }, { "ph": "s", "id": 209084, "pid": 76337, "tid": -914061504, "ts": 1716454224720049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720135, "dur": 1, "args": { "External id": 209095, "cbid": 251, "correlation": 209095 } }, { "ph": "f", "id": 209095, "pid": 76337, "tid": -914061504, "ts": 1716454224720135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224773938, "dur": 89, "args": { "External id": 209096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209096, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209096, "pid": 5, "tid": 7, "ts": 1716454224773938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720141, "dur": 16, "args": { "External id": 209096, "cbid": 211, "correlation": 209096 } }, { "ph": "s", "id": 209096, "pid": 76337, "tid": -914061504, "ts": 1716454224720141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720213, "dur": 1, "args": { "External id": 209107, "cbid": 251, "correlation": 209107 } }, { "ph": "f", "id": 209107, "pid": 76337, "tid": -914061504, "ts": 1716454224720213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720217, "dur": 0, "args": { "External id": 209108, "cbid": 251, "correlation": 209108 } }, { "ph": "f", "id": 209108, "pid": 76337, "tid": -914061504, "ts": 1716454224720217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224774028, "dur": 12, "args": { "External id": 209109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209109, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209109, "pid": 5, "tid": 7, "ts": 1716454224774028, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720219, "dur": 12, "args": { "External id": 209109, "cbid": 211, "correlation": 209109 } }, { "ph": "s", "id": 209109, "pid": 76337, "tid": -914061504, "ts": 1716454224720219, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224774042, "dur": 6, "args": { "External id": 209111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209111, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209111, "pid": 5, "tid": 7, "ts": 1716454224774042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720232, "dur": 6, "args": { "External id": 209111, "cbid": 211, "correlation": 209111 } }, { "ph": "s", "id": 209111, "pid": 76337, "tid": -914061504, "ts": 1716454224720232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720291, "dur": 1, "args": { "External id": 209122, "cbid": 251, "correlation": 209122 } }, { "ph": "f", "id": 209122, "pid": 76337, "tid": -914061504, "ts": 1716454224720291, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720295, "dur": 0, "args": { "External id": 209123, "cbid": 251, "correlation": 209123 } }, { "ph": "f", "id": 209123, "pid": 76337, "tid": -914061504, "ts": 1716454224720295, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224774049, "dur": 8, "args": { "External id": 209124, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209124, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209124, "pid": 5, "tid": 7, "ts": 1716454224774049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720296, "dur": 12, "args": { "External id": 209124, "cbid": 211, "correlation": 209124 } }, { "ph": "s", "id": 209124, "pid": 76337, "tid": -914061504, "ts": 1716454224720296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224774059, "dur": 3, "args": { "External id": 209126, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209126, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209126, "pid": 5, "tid": 7, "ts": 1716454224774059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720310, "dur": 5, "args": { "External id": 209126, "cbid": 211, "correlation": 209126 } }, { "ph": "s", "id": 209126, "pid": 76337, "tid": -914061504, "ts": 1716454224720310, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224774063, "dur": 55, "args": { "External id": 209151, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209151, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209151, "pid": 5, "tid": 7, "ts": 1716454224774063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720387, "dur": 13, "args": { "External id": 209151, "cbid": 211, "correlation": 209151 } }, { "ph": "s", "id": 209151, "pid": 76337, "tid": -914061504, "ts": 1716454224720387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720486, "dur": 2, "args": { "External id": 209169, "cbid": 251, "correlation": 209169 } }, { "ph": "f", "id": 209169, "pid": 76337, "tid": -914061504, "ts": 1716454224720486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224774120, "dur": 92, "args": { "External id": 209171, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209171, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209171, "pid": 5, "tid": 7, "ts": 1716454224774120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720494, "dur": 14, "args": { "External id": 209171, "cbid": 211, "correlation": 209171 } }, { "ph": "s", "id": 209171, "pid": 76337, "tid": -914061504, "ts": 1716454224720494, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224774213, "dur": 9, "args": { "External id": 209179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209179, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209179, "pid": 5, "tid": 7, "ts": 1716454224774213, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720564, "dur": 12, "args": { "External id": 209179, "cbid": 211, "correlation": 209179 } }, { "ph": "s", "id": 209179, "pid": 76337, "tid": -914061504, "ts": 1716454224720564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224774224, "dur": 22, "args": { "External id": 209187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209187, "pid": 5, "tid": 7, "ts": 1716454224774224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720606, "dur": 9, "args": { "External id": 209187, "cbid": 211, "correlation": 209187 } }, { "ph": "s", "id": 209187, "pid": 76337, "tid": -914061504, "ts": 1716454224720606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224774247, "dur": 18, "args": { "External id": 209209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209209, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209209, "pid": 5, "tid": 7, "ts": 1716454224774247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720657, "dur": 10, "args": { "External id": 209209, "cbid": 211, "correlation": 209209 } }, { "ph": "s", "id": 209209, "pid": 76337, "tid": -914061504, "ts": 1716454224720657, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720745, "dur": 2, "args": { "External id": 209225, "cbid": 251, "correlation": 209225 } }, { "ph": "f", "id": 209225, "pid": 76337, "tid": -914061504, "ts": 1716454224720745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720752, "dur": 0, "args": { "External id": 209227, "cbid": 251, "correlation": 209227 } }, { "ph": "f", "id": 209227, "pid": 76337, "tid": -914061504, "ts": 1716454224720752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224774267, "dur": 497, "args": { "External id": 209228, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209228, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209228, "pid": 5, "tid": 7, "ts": 1716454224774267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720754, "dur": 15, "args": { "External id": 209228, "cbid": 211, "correlation": 209228 } }, { "ph": "s", "id": 209228, "pid": 76337, "tid": -914061504, "ts": 1716454224720754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224774765, "dur": 66, "args": { "External id": 209236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209236, "pid": 5, "tid": 7, "ts": 1716454224774765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720822, "dur": 12, "args": { "External id": 209236, "cbid": 211, "correlation": 209236 } }, { "ph": "s", "id": 209236, "pid": 76337, "tid": -914061504, "ts": 1716454224720822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224774832, "dur": 68, "args": { "External id": 209244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209244, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209244, "pid": 5, "tid": 7, "ts": 1716454224774832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720853, "dur": 8, "args": { "External id": 209244, "cbid": 211, "correlation": 209244 } }, { "ph": "s", "id": 209244, "pid": 76337, "tid": -914061504, "ts": 1716454224720853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224720933, "dur": 1, "args": { "External id": 209260, "cbid": 251, "correlation": 209260 } }, { "ph": "f", "id": 209260, "pid": 76337, "tid": -914061504, "ts": 1716454224720933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224774902, "dur": 1, "args": { "External id": 209262, "device": 5, "context": 1, "stream": 7, "correlation": 209262, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 209262, "pid": 5, "tid": 7, "ts": 1716454224774902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224720939, "dur": 12, "args": { "External id": 209262, "cbid": 51, "correlation": 209262 } }, { "ph": "s", "id": 209262, "pid": 76337, "tid": -914061504, "ts": 1716454224720939, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224774906, "dur": 271, "args": { "External id": 209263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209263, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209263, "pid": 5, "tid": 7, "ts": 1716454224774906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224720953, "dur": 11, "args": { "External id": 209263, "cbid": 211, "correlation": 209263 } }, { "ph": "s", "id": 209263, "pid": 76337, "tid": -914061504, "ts": 1716454224720953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224775178, "dur": 14, "args": { "External id": 209271, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209271, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209271, "pid": 5, "tid": 7, "ts": 1716454224775178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721005, "dur": 11, "args": { "External id": 209271, "cbid": 211, "correlation": 209271 } }, { "ph": "s", "id": 209271, "pid": 76337, "tid": -914061504, "ts": 1716454224721005, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224775193, "dur": 38, "args": { "External id": 209282, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209282, "pid": 5, "tid": 7, "ts": 1716454224775193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721073, "dur": 12, "args": { "External id": 209282, "cbid": 211, "correlation": 209282 } }, { "ph": "s", "id": 209282, "pid": 76337, "tid": -914061504, "ts": 1716454224721073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224721138, "dur": 0, "args": { "External id": 209294, "cbid": 317, "correlation": 209294 } }, { "ph": "f", "id": 209294, "pid": 76337, "tid": -914061504, "ts": 1716454224721138, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224721139, "dur": 0, "args": { "External id": 209295, "cbid": 203, "correlation": 209295 } }, { "ph": "f", "id": 209295, "pid": 76337, "tid": -914061504, "ts": 1716454224721139, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224721140, "dur": 0, "args": { "External id": 209296, "cbid": 205, "correlation": 209296 } }, { "ph": "f", "id": 209296, "pid": 76337, "tid": -914061504, "ts": 1716454224721140, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224775233, "dur": 13, "args": { "External id": 209300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209300, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209300, "pid": 5, "tid": 7, "ts": 1716454224775233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721155, "dur": 12, "args": { "External id": 209300, "cbid": 211, "correlation": 209300 } }, { "ph": "s", "id": 209300, "pid": 76337, "tid": -914061504, "ts": 1716454224721155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224775247, "dur": 4, "args": { "External id": 209302, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209302, "pid": 5, "tid": 7, "ts": 1716454224775247, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721172, "dur": 6, "args": { "External id": 209302, "cbid": 211, "correlation": 209302 } }, { "ph": "s", "id": 209302, "pid": 76337, "tid": -914061504, "ts": 1716454224721172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224721181, "dur": 0, "args": { "External id": 209303, "cbid": 51, "correlation": 209303 } }, { "ph": "s", "id": 209303, "pid": 76337, "tid": -914061504, "ts": 1716454224721181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224775252, "dur": 99, "args": { "External id": 209304, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209304, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 209304, "pid": 5, "tid": 7, "ts": 1716454224775252, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721181, "dur": 5, "args": { "External id": 209304, "cbid": 211, "correlation": 209304 } }, { "ph": "s", "id": 209304, "pid": 76337, "tid": -914061504, "ts": 1716454224721181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224775353, "dur": 16, "args": { "External id": 209309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209309, "pid": 5, "tid": 7, "ts": 1716454224775353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721209, "dur": 8, "args": { "External id": 209309, "cbid": 211, "correlation": 209309 } }, { "ph": "s", "id": 209309, "pid": 76337, "tid": -914061504, "ts": 1716454224721209, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224775370, "dur": 11, "args": { "External id": 209317, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209317, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209317, "pid": 5, "tid": 7, "ts": 1716454224775370, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721240, "dur": 10, "args": { "External id": 209317, "cbid": 211, "correlation": 209317 } }, { "ph": "s", "id": 209317, "pid": 76337, "tid": -914061504, "ts": 1716454224721240, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224775383, "dur": 19, "args": { "External id": 209337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209337, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 209337, "pid": 5, "tid": 7, "ts": 1716454224775383, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721313, "dur": 12, "args": { "External id": 209337, "cbid": 211, "correlation": 209337 } }, { "ph": "s", "id": 209337, "pid": 76337, "tid": -914061504, "ts": 1716454224721313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224775403, "dur": 4, "args": { "External id": 209349, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209349, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 209349, "pid": 5, "tid": 7, "ts": 1716454224775403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721335, "dur": 7, "args": { "External id": 209349, "cbid": 211, "correlation": 209349 } }, { "ph": "s", "id": 209349, "pid": 76337, "tid": -914061504, "ts": 1716454224721335, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224775408, "dur": 18, "args": { "External id": 209352, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209352, "pid": 5, "tid": 7, "ts": 1716454224775408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721354, "dur": 6, "args": { "External id": 209352, "cbid": 211, "correlation": 209352 } }, { "ph": "s", "id": 209352, "pid": 76337, "tid": -914061504, "ts": 1716454224721354, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224775428, "dur": 12, "args": { "External id": 209361, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209361, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209361, "pid": 5, "tid": 7, "ts": 1716454224775428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721392, "dur": 10, "args": { "External id": 209361, "cbid": 211, "correlation": 209361 } }, { "ph": "s", "id": 209361, "pid": 76337, "tid": -914061504, "ts": 1716454224721392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224721444, "dur": 0, "args": { "External id": 209371, "cbid": 317, "correlation": 209371 } }, { "ph": "f", "id": 209371, "pid": 76337, "tid": -914061504, "ts": 1716454224721444, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224721445, "dur": 0, "args": { "External id": 209372, "cbid": 203, "correlation": 209372 } }, { "ph": "f", "id": 209372, "pid": 76337, "tid": -914061504, "ts": 1716454224721445, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224721446, "dur": 0, "args": { "External id": 209373, "cbid": 205, "correlation": 209373 } }, { "ph": "f", "id": 209373, "pid": 76337, "tid": -914061504, "ts": 1716454224721446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224775442, "dur": 11, "args": { "External id": 209377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209377, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209377, "pid": 5, "tid": 7, "ts": 1716454224775442, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721459, "dur": 11, "args": { "External id": 209377, "cbid": 211, "correlation": 209377 } }, { "ph": "s", "id": 209377, "pid": 76337, "tid": -914061504, "ts": 1716454224721459, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224775454, "dur": 164, "args": { "External id": 209379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209379, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209379, "pid": 5, "tid": 7, "ts": 1716454224775454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721473, "dur": 5, "args": { "External id": 209379, "cbid": 211, "correlation": 209379 } }, { "ph": "s", "id": 209379, "pid": 76337, "tid": -914061504, "ts": 1716454224721473, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224775620, "dur": 1, "args": { "External id": 209381, "device": 5, "context": 1, "stream": 7, "correlation": 209381, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 209381, "pid": 5, "tid": 7, "ts": 1716454224775620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224721484, "dur": 8, "args": { "External id": 209381, "cbid": 51, "correlation": 209381 } }, { "ph": "s", "id": 209381, "pid": 76337, "tid": -914061504, "ts": 1716454224721484, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224775624, "dur": 665, "args": { "External id": 209382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209382, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209382, "pid": 5, "tid": 7, "ts": 1716454224775624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721492, "dur": 6, "args": { "External id": 209382, "cbid": 211, "correlation": 209382 } }, { "ph": "s", "id": 209382, "pid": 76337, "tid": -914061504, "ts": 1716454224721492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224776290, "dur": 14, "args": { "External id": 209384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209384, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209384, "pid": 5, "tid": 7, "ts": 1716454224776290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721503, "dur": 5, "args": { "External id": 209384, "cbid": 211, "correlation": 209384 } }, { "ph": "s", "id": 209384, "pid": 76337, "tid": -914061504, "ts": 1716454224721503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224776305, "dur": 15, "args": { "External id": 209390, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209390, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209390, "pid": 5, "tid": 7, "ts": 1716454224776305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721530, "dur": 9, "args": { "External id": 209390, "cbid": 211, "correlation": 209390 } }, { "ph": "s", "id": 209390, "pid": 76337, "tid": -914061504, "ts": 1716454224721530, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224776322, "dur": 3, "args": { "External id": 209398, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209398, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 209398, "pid": 5, "tid": 7, "ts": 1716454224776322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721574, "dur": 9, "args": { "External id": 209398, "cbid": 211, "correlation": 209398 } }, { "ph": "s", "id": 209398, "pid": 76337, "tid": -914061504, "ts": 1716454224721574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224721638, "dur": 1, "args": { "External id": 209414, "cbid": 251, "correlation": 209414 } }, { "ph": "f", "id": 209414, "pid": 76337, "tid": -914061504, "ts": 1716454224721638, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224721644, "dur": 0, "args": { "External id": 209416, "cbid": 251, "correlation": 209416 } }, { "ph": "f", "id": 209416, "pid": 76337, "tid": -914061504, "ts": 1716454224721644, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224776326, "dur": 13, "args": { "External id": 209417, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209417, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209417, "pid": 5, "tid": 7, "ts": 1716454224776326, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721646, "dur": 11, "args": { "External id": 209417, "cbid": 211, "correlation": 209417 } }, { "ph": "s", "id": 209417, "pid": 76337, "tid": -914061504, "ts": 1716454224721646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224776341, "dur": 5, "args": { "External id": 209419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209419, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209419, "pid": 5, "tid": 7, "ts": 1716454224776341, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721659, "dur": 5, "args": { "External id": 209419, "cbid": 211, "correlation": 209419 } }, { "ph": "s", "id": 209419, "pid": 76337, "tid": -914061504, "ts": 1716454224721659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224776347, "dur": 17, "args": { "External id": 209429, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209429, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209429, "pid": 5, "tid": 7, "ts": 1716454224776347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721716, "dur": 12, "args": { "External id": 209429, "cbid": 211, "correlation": 209429 } }, { "ph": "s", "id": 209429, "pid": 76337, "tid": -914061504, "ts": 1716454224721716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224776366, "dur": 18, "args": { "External id": 209449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209449, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 209449, "pid": 5, "tid": 7, "ts": 1716454224776366, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721782, "dur": 12, "args": { "External id": 209449, "cbid": 211, "correlation": 209449 } }, { "ph": "s", "id": 209449, "pid": 76337, "tid": -914061504, "ts": 1716454224721782, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224776385, "dur": 4, "args": { "External id": 209461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209461, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 209461, "pid": 5, "tid": 7, "ts": 1716454224776385, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721803, "dur": 6, "args": { "External id": 209461, "cbid": 211, "correlation": 209461 } }, { "ph": "s", "id": 209461, "pid": 76337, "tid": -914061504, "ts": 1716454224721803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224776390, "dur": 17, "args": { "External id": 209464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209464, "pid": 5, "tid": 7, "ts": 1716454224776390, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721822, "dur": 6, "args": { "External id": 209464, "cbid": 211, "correlation": 209464 } }, { "ph": "s", "id": 209464, "pid": 76337, "tid": -914061504, "ts": 1716454224721822, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224776408, "dur": 11, "args": { "External id": 209473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209473, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209473, "pid": 5, "tid": 7, "ts": 1716454224776408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721862, "dur": 10, "args": { "External id": 209473, "cbid": 211, "correlation": 209473 } }, { "ph": "s", "id": 209473, "pid": 76337, "tid": -914061504, "ts": 1716454224721862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224721923, "dur": 0, "args": { "External id": 209483, "cbid": 317, "correlation": 209483 } }, { "ph": "f", "id": 209483, "pid": 76337, "tid": -914061504, "ts": 1716454224721923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224721924, "dur": 0, "args": { "External id": 209484, "cbid": 203, "correlation": 209484 } }, { "ph": "f", "id": 209484, "pid": 76337, "tid": -914061504, "ts": 1716454224721924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224721924, "dur": 0, "args": { "External id": 209485, "cbid": 205, "correlation": 209485 } }, { "ph": "f", "id": 209485, "pid": 76337, "tid": -914061504, "ts": 1716454224721924, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224776421, "dur": 11, "args": { "External id": 209489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209489, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209489, "pid": 5, "tid": 7, "ts": 1716454224776421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721938, "dur": 13, "args": { "External id": 209489, "cbid": 211, "correlation": 209489 } }, { "ph": "s", "id": 209489, "pid": 76337, "tid": -914061504, "ts": 1716454224721938, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224776433, "dur": 164, "args": { "External id": 209491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209491, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209491, "pid": 5, "tid": 7, "ts": 1716454224776433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721954, "dur": 5, "args": { "External id": 209491, "cbid": 211, "correlation": 209491 } }, { "ph": "s", "id": 209491, "pid": 76337, "tid": -914061504, "ts": 1716454224721954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224776600, "dur": 1, "args": { "External id": 209493, "device": 5, "context": 1, "stream": 7, "correlation": 209493, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 209493, "pid": 5, "tid": 7, "ts": 1716454224776600, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224721964, "dur": 6, "args": { "External id": 209493, "cbid": 51, "correlation": 209493 } }, { "ph": "s", "id": 209493, "pid": 76337, "tid": -914061504, "ts": 1716454224721964, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224776603, "dur": 654, "args": { "External id": 209494, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209494, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209494, "pid": 5, "tid": 7, "ts": 1716454224776603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721972, "dur": 14, "args": { "External id": 209494, "cbid": 211, "correlation": 209494 } }, { "ph": "s", "id": 209494, "pid": 76337, "tid": -914061504, "ts": 1716454224721972, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224777259, "dur": 13, "args": { "External id": 209496, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209496, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209496, "pid": 5, "tid": 7, "ts": 1716454224777259, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224721990, "dur": 6, "args": { "External id": 209496, "cbid": 211, "correlation": 209496 } }, { "ph": "s", "id": 209496, "pid": 76337, "tid": -914061504, "ts": 1716454224721990, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224777273, "dur": 15, "args": { "External id": 209502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209502, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209502, "pid": 5, "tid": 7, "ts": 1716454224777273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722019, "dur": 9, "args": { "External id": 209502, "cbid": 211, "correlation": 209502 } }, { "ph": "s", "id": 209502, "pid": 76337, "tid": -914061504, "ts": 1716454224722019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224777289, "dur": 12, "args": { "External id": 209510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209510, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209510, "pid": 5, "tid": 7, "ts": 1716454224777289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722052, "dur": 8, "args": { "External id": 209510, "cbid": 211, "correlation": 209510 } }, { "ph": "s", "id": 209510, "pid": 76337, "tid": -914061504, "ts": 1716454224722052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224777303, "dur": 10, "args": { "External id": 209518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209518, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209518, "pid": 5, "tid": 7, "ts": 1716454224777303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722081, "dur": 9, "args": { "External id": 209518, "cbid": 211, "correlation": 209518 } }, { "ph": "s", "id": 209518, "pid": 76337, "tid": -914061504, "ts": 1716454224722081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224777314, "dur": 18, "args": { "External id": 209538, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209538, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 209538, "pid": 5, "tid": 7, "ts": 1716454224777314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722160, "dur": 12, "args": { "External id": 209538, "cbid": 211, "correlation": 209538 } }, { "ph": "s", "id": 209538, "pid": 76337, "tid": -914061504, "ts": 1716454224722160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224777333, "dur": 4, "args": { "External id": 209550, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209550, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 209550, "pid": 5, "tid": 7, "ts": 1716454224777333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722182, "dur": 6, "args": { "External id": 209550, "cbid": 211, "correlation": 209550 } }, { "ph": "s", "id": 209550, "pid": 76337, "tid": -914061504, "ts": 1716454224722182, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224777339, "dur": 16, "args": { "External id": 209553, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209553, "pid": 5, "tid": 7, "ts": 1716454224777339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722200, "dur": 7, "args": { "External id": 209553, "cbid": 211, "correlation": 209553 } }, { "ph": "s", "id": 209553, "pid": 76337, "tid": -914061504, "ts": 1716454224722200, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224722256, "dur": 0, "args": { "External id": 209564, "cbid": 317, "correlation": 209564 } }, { "ph": "f", "id": 209564, "pid": 76337, "tid": -914061504, "ts": 1716454224722256, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224722257, "dur": 0, "args": { "External id": 209565, "cbid": 203, "correlation": 209565 } }, { "ph": "f", "id": 209565, "pid": 76337, "tid": -914061504, "ts": 1716454224722257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224722258, "dur": 0, "args": { "External id": 209566, "cbid": 205, "correlation": 209566 } }, { "ph": "f", "id": 209566, "pid": 76337, "tid": -914061504, "ts": 1716454224722258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224777357, "dur": 11, "args": { "External id": 209570, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209570, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209570, "pid": 5, "tid": 7, "ts": 1716454224777357, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722271, "dur": 11, "args": { "External id": 209570, "cbid": 211, "correlation": 209570 } }, { "ph": "s", "id": 209570, "pid": 76337, "tid": -914061504, "ts": 1716454224722271, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224777369, "dur": 4, "args": { "External id": 209572, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209572, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209572, "pid": 5, "tid": 7, "ts": 1716454224777369, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722286, "dur": 7, "args": { "External id": 209572, "cbid": 211, "correlation": 209572 } }, { "ph": "s", "id": 209572, "pid": 76337, "tid": -914061504, "ts": 1716454224722286, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224722296, "dur": 0, "args": { "External id": 209573, "cbid": 51, "correlation": 209573 } }, { "ph": "s", "id": 209573, "pid": 76337, "tid": -914061504, "ts": 1716454224722296, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224777375, "dur": 95, "args": { "External id": 209574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209574, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 209574, "pid": 5, "tid": 7, "ts": 1716454224777375, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722297, "dur": 5, "args": { "External id": 209574, "cbid": 211, "correlation": 209574 } }, { "ph": "s", "id": 209574, "pid": 76337, "tid": -914061504, "ts": 1716454224722297, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224777472, "dur": 16, "args": { "External id": 209579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209579, "pid": 5, "tid": 7, "ts": 1716454224777472, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722323, "dur": 8, "args": { "External id": 209579, "cbid": 211, "correlation": 209579 } }, { "ph": "s", "id": 209579, "pid": 76337, "tid": -914061504, "ts": 1716454224722323, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224777489, "dur": 85, "args": { "External id": 209588, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209588, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209588, "pid": 5, "tid": 7, "ts": 1716454224777489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722404, "dur": 14, "args": { "External id": 209588, "cbid": 211, "correlation": 209588 } }, { "ph": "s", "id": 209588, "pid": 76337, "tid": -914061504, "ts": 1716454224722404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224777576, "dur": 30, "args": { "External id": 209610, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209610, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209610, "pid": 5, "tid": 7, "ts": 1716454224777576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722460, "dur": 10, "args": { "External id": 209610, "cbid": 211, "correlation": 209610 } }, { "ph": "s", "id": 209610, "pid": 76337, "tid": -914061504, "ts": 1716454224722460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224722547, "dur": 1, "args": { "External id": 209621, "cbid": 251, "correlation": 209621 } }, { "ph": "f", "id": 209621, "pid": 76337, "tid": -914061504, "ts": 1716454224722547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224777607, "dur": 166, "args": { "External id": 209622, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209622, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209622, "pid": 5, "tid": 7, "ts": 1716454224777607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722552, "dur": 13, "args": { "External id": 209622, "cbid": 211, "correlation": 209622 } }, { "ph": "s", "id": 209622, "pid": 76337, "tid": -914061504, "ts": 1716454224722552, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224722621, "dur": 1, "args": { "External id": 209633, "cbid": 251, "correlation": 209633 } }, { "ph": "f", "id": 209633, "pid": 76337, "tid": -914061504, "ts": 1716454224722621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224777774, "dur": 164, "args": { "External id": 209634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209634, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209634, "pid": 5, "tid": 7, "ts": 1716454224777774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722625, "dur": 11, "args": { "External id": 209634, "cbid": 211, "correlation": 209634 } }, { "ph": "s", "id": 209634, "pid": 76337, "tid": -914061504, "ts": 1716454224722625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224722689, "dur": 1, "args": { "External id": 209645, "cbid": 251, "correlation": 209645 } }, { "ph": "f", "id": 209645, "pid": 76337, "tid": -914061504, "ts": 1716454224722689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224777939, "dur": 161, "args": { "External id": 209646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209646, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209646, "pid": 5, "tid": 7, "ts": 1716454224777939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722694, "dur": 12, "args": { "External id": 209646, "cbid": 211, "correlation": 209646 } }, { "ph": "s", "id": 209646, "pid": 76337, "tid": -914061504, "ts": 1716454224722694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224778102, "dur": 343, "args": { "External id": 209671, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209671, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209671, "pid": 5, "tid": 7, "ts": 1716454224778102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722777, "dur": 12, "args": { "External id": 209671, "cbid": 211, "correlation": 209671 } }, { "ph": "s", "id": 209671, "pid": 76337, "tid": -914061504, "ts": 1716454224722777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224722875, "dur": 1, "args": { "External id": 209689, "cbid": 251, "correlation": 209689 } }, { "ph": "f", "id": 209689, "pid": 76337, "tid": -914061504, "ts": 1716454224722875, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224778446, "dur": 166, "args": { "External id": 209691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209691, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209691, "pid": 5, "tid": 7, "ts": 1716454224778446, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722881, "dur": 13, "args": { "External id": 209691, "cbid": 211, "correlation": 209691 } }, { "ph": "s", "id": 209691, "pid": 76337, "tid": -914061504, "ts": 1716454224722881, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224778613, "dur": 19, "args": { "External id": 209699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209699, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209699, "pid": 5, "tid": 7, "ts": 1716454224778613, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224722952, "dur": 12, "args": { "External id": 209699, "cbid": 211, "correlation": 209699 } }, { "ph": "s", "id": 209699, "pid": 76337, "tid": -914061504, "ts": 1716454224722952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224778633, "dur": 27, "args": { "External id": 209707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209707, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209707, "pid": 5, "tid": 7, "ts": 1716454224778633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723000, "dur": 9, "args": { "External id": 209707, "cbid": 211, "correlation": 209707 } }, { "ph": "s", "id": 209707, "pid": 76337, "tid": -914061504, "ts": 1716454224723000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224778662, "dur": 19, "args": { "External id": 209718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209718, "pid": 5, "tid": 7, "ts": 1716454224778662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723072, "dur": 12, "args": { "External id": 209718, "cbid": 211, "correlation": 209718 } }, { "ph": "s", "id": 209718, "pid": 76337, "tid": -914061504, "ts": 1716454224723072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224778682, "dur": 16, "args": { "External id": 209740, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209740, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209740, "pid": 5, "tid": 7, "ts": 1716454224778682, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723104, "dur": 8, "args": { "External id": 209740, "cbid": 211, "correlation": 209740 } }, { "ph": "s", "id": 209740, "pid": 76337, "tid": -914061504, "ts": 1716454224723104, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723189, "dur": 1, "args": { "External id": 209751, "cbid": 251, "correlation": 209751 } }, { "ph": "f", "id": 209751, "pid": 76337, "tid": -914061504, "ts": 1716454224723189, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224778700, "dur": 90, "args": { "External id": 209752, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209752, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209752, "pid": 5, "tid": 7, "ts": 1716454224778700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723195, "dur": 14, "args": { "External id": 209752, "cbid": 211, "correlation": 209752 } }, { "ph": "s", "id": 209752, "pid": 76337, "tid": -914061504, "ts": 1716454224723195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723263, "dur": 1, "args": { "External id": 209763, "cbid": 251, "correlation": 209763 } }, { "ph": "f", "id": 209763, "pid": 76337, "tid": -914061504, "ts": 1716454224723263, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723267, "dur": 0, "args": { "External id": 209764, "cbid": 251, "correlation": 209764 } }, { "ph": "f", "id": 209764, "pid": 76337, "tid": -914061504, "ts": 1716454224723267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224778790, "dur": 12, "args": { "External id": 209765, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209765, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209765, "pid": 5, "tid": 7, "ts": 1716454224778790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723268, "dur": 12, "args": { "External id": 209765, "cbid": 211, "correlation": 209765 } }, { "ph": "s", "id": 209765, "pid": 76337, "tid": -914061504, "ts": 1716454224723268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224778804, "dur": 5, "args": { "External id": 209767, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209767, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209767, "pid": 5, "tid": 7, "ts": 1716454224778804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723282, "dur": 6, "args": { "External id": 209767, "cbid": 211, "correlation": 209767 } }, { "ph": "s", "id": 209767, "pid": 76337, "tid": -914061504, "ts": 1716454224723282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723340, "dur": 1, "args": { "External id": 209778, "cbid": 251, "correlation": 209778 } }, { "ph": "f", "id": 209778, "pid": 76337, "tid": -914061504, "ts": 1716454224723340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723343, "dur": 0, "args": { "External id": 209779, "cbid": 251, "correlation": 209779 } }, { "ph": "f", "id": 209779, "pid": 76337, "tid": -914061504, "ts": 1716454224723343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224778811, "dur": 8, "args": { "External id": 209780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209780, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209780, "pid": 5, "tid": 7, "ts": 1716454224778811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723345, "dur": 12, "args": { "External id": 209780, "cbid": 211, "correlation": 209780 } }, { "ph": "s", "id": 209780, "pid": 76337, "tid": -914061504, "ts": 1716454224723345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224778821, "dur": 3, "args": { "External id": 209782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209782, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209782, "pid": 5, "tid": 7, "ts": 1716454224778821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723359, "dur": 6, "args": { "External id": 209782, "cbid": 211, "correlation": 209782 } }, { "ph": "s", "id": 209782, "pid": 76337, "tid": -914061504, "ts": 1716454224723359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224778825, "dur": 56, "args": { "External id": 209807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209807, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209807, "pid": 5, "tid": 7, "ts": 1716454224778825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723435, "dur": 13, "args": { "External id": 209807, "cbid": 211, "correlation": 209807 } }, { "ph": "s", "id": 209807, "pid": 76337, "tid": -914061504, "ts": 1716454224723435, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723532, "dur": 1, "args": { "External id": 209825, "cbid": 251, "correlation": 209825 } }, { "ph": "f", "id": 209825, "pid": 76337, "tid": -914061504, "ts": 1716454224723532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224778882, "dur": 92, "args": { "External id": 209827, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209827, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209827, "pid": 5, "tid": 7, "ts": 1716454224778882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723538, "dur": 14, "args": { "External id": 209827, "cbid": 211, "correlation": 209827 } }, { "ph": "s", "id": 209827, "pid": 76337, "tid": -914061504, "ts": 1716454224723538, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224778975, "dur": 10, "args": { "External id": 209835, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209835, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209835, "pid": 5, "tid": 7, "ts": 1716454224778975, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723608, "dur": 12, "args": { "External id": 209835, "cbid": 211, "correlation": 209835 } }, { "ph": "s", "id": 209835, "pid": 76337, "tid": -914061504, "ts": 1716454224723608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224778986, "dur": 21, "args": { "External id": 209843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209843, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209843, "pid": 5, "tid": 7, "ts": 1716454224778986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723649, "dur": 9, "args": { "External id": 209843, "cbid": 211, "correlation": 209843 } }, { "ph": "s", "id": 209843, "pid": 76337, "tid": -914061504, "ts": 1716454224723649, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224779009, "dur": 17, "args": { "External id": 209865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209865, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209865, "pid": 5, "tid": 7, "ts": 1716454224779009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723701, "dur": 10, "args": { "External id": 209865, "cbid": 211, "correlation": 209865 } }, { "ph": "s", "id": 209865, "pid": 76337, "tid": -914061504, "ts": 1716454224723701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723787, "dur": 1, "args": { "External id": 209881, "cbid": 251, "correlation": 209881 } }, { "ph": "f", "id": 209881, "pid": 76337, "tid": -914061504, "ts": 1716454224723787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723792, "dur": 0, "args": { "External id": 209883, "cbid": 251, "correlation": 209883 } }, { "ph": "f", "id": 209883, "pid": 76337, "tid": -914061504, "ts": 1716454224723792, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224779027, "dur": 493, "args": { "External id": 209884, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209884, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 209884, "pid": 5, "tid": 7, "ts": 1716454224779027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723795, "dur": 12, "args": { "External id": 209884, "cbid": 211, "correlation": 209884 } }, { "ph": "s", "id": 209884, "pid": 76337, "tid": -914061504, "ts": 1716454224723795, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224779522, "dur": 65, "args": { "External id": 209892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209892, "pid": 5, "tid": 7, "ts": 1716454224779522, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723860, "dur": 12, "args": { "External id": 209892, "cbid": 211, "correlation": 209892 } }, { "ph": "s", "id": 209892, "pid": 76337, "tid": -914061504, "ts": 1716454224723860, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224779588, "dur": 68, "args": { "External id": 209900, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209900, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209900, "pid": 5, "tid": 7, "ts": 1716454224779588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723889, "dur": 9, "args": { "External id": 209900, "cbid": 211, "correlation": 209900 } }, { "ph": "s", "id": 209900, "pid": 76337, "tid": -914061504, "ts": 1716454224723889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224723971, "dur": 10, "args": { "External id": 209916, "cbid": 251, "correlation": 209916 } }, { "ph": "f", "id": 209916, "pid": 76337, "tid": -914061504, "ts": 1716454224723971, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224779658, "dur": 1, "args": { "External id": 209918, "device": 5, "context": 1, "stream": 7, "correlation": 209918, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 209918, "pid": 5, "tid": 7, "ts": 1716454224779658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224723985, "dur": 11, "args": { "External id": 209918, "cbid": 51, "correlation": 209918 } }, { "ph": "s", "id": 209918, "pid": 76337, "tid": -914061504, "ts": 1716454224723985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224779662, "dur": 273, "args": { "External id": 209919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209919, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209919, "pid": 5, "tid": 7, "ts": 1716454224779662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224723997, "dur": 12, "args": { "External id": 209919, "cbid": 211, "correlation": 209919 } }, { "ph": "s", "id": 209919, "pid": 76337, "tid": -914061504, "ts": 1716454224723997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224779936, "dur": 14, "args": { "External id": 209927, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209927, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209927, "pid": 5, "tid": 7, "ts": 1716454224779936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724041, "dur": 10, "args": { "External id": 209927, "cbid": 211, "correlation": 209927 } }, { "ph": "s", "id": 209927, "pid": 76337, "tid": -914061504, "ts": 1716454224724041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224779951, "dur": 38, "args": { "External id": 209938, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209938, "pid": 5, "tid": 7, "ts": 1716454224779951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724109, "dur": 12, "args": { "External id": 209938, "cbid": 211, "correlation": 209938 } }, { "ph": "s", "id": 209938, "pid": 76337, "tid": -914061504, "ts": 1716454224724109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224724172, "dur": 0, "args": { "External id": 209950, "cbid": 317, "correlation": 209950 } }, { "ph": "f", "id": 209950, "pid": 76337, "tid": -914061504, "ts": 1716454224724172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224724173, "dur": 0, "args": { "External id": 209951, "cbid": 203, "correlation": 209951 } }, { "ph": "f", "id": 209951, "pid": 76337, "tid": -914061504, "ts": 1716454224724173, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224724174, "dur": 0, "args": { "External id": 209952, "cbid": 205, "correlation": 209952 } }, { "ph": "f", "id": 209952, "pid": 76337, "tid": -914061504, "ts": 1716454224724174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224779991, "dur": 13, "args": { "External id": 209956, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209956, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209956, "pid": 5, "tid": 7, "ts": 1716454224779991, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724189, "dur": 13, "args": { "External id": 209956, "cbid": 211, "correlation": 209956 } }, { "ph": "s", "id": 209956, "pid": 76337, "tid": -914061504, "ts": 1716454224724189, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224780006, "dur": 4, "args": { "External id": 209958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 209958, "pid": 5, "tid": 7, "ts": 1716454224780006, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724207, "dur": 6, "args": { "External id": 209958, "cbid": 211, "correlation": 209958 } }, { "ph": "s", "id": 209958, "pid": 76337, "tid": -914061504, "ts": 1716454224724207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224724215, "dur": 0, "args": { "External id": 209959, "cbid": 51, "correlation": 209959 } }, { "ph": "s", "id": 209959, "pid": 76337, "tid": -914061504, "ts": 1716454224724215, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224780011, "dur": 99, "args": { "External id": 209960, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209960, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 209960, "pid": 5, "tid": 7, "ts": 1716454224780011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724216, "dur": 5, "args": { "External id": 209960, "cbid": 211, "correlation": 209960 } }, { "ph": "s", "id": 209960, "pid": 76337, "tid": -914061504, "ts": 1716454224724216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224780111, "dur": 17, "args": { "External id": 209965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209965, "pid": 5, "tid": 7, "ts": 1716454224780111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724244, "dur": 8, "args": { "External id": 209965, "cbid": 211, "correlation": 209965 } }, { "ph": "s", "id": 209965, "pid": 76337, "tid": -914061504, "ts": 1716454224724244, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224780129, "dur": 13, "args": { "External id": 209973, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209973, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209973, "pid": 5, "tid": 7, "ts": 1716454224780129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724275, "dur": 8, "args": { "External id": 209973, "cbid": 211, "correlation": 209973 } }, { "ph": "s", "id": 209973, "pid": 76337, "tid": -914061504, "ts": 1716454224724275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224724344, "dur": 0, "args": { "External id": 209983, "cbid": 317, "correlation": 209983 } }, { "ph": "f", "id": 209983, "pid": 76337, "tid": -914061504, "ts": 1716454224724344, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224724345, "dur": 0, "args": { "External id": 209984, "cbid": 203, "correlation": 209984 } }, { "ph": "f", "id": 209984, "pid": 76337, "tid": -914061504, "ts": 1716454224724345, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224724346, "dur": 0, "args": { "External id": 209985, "cbid": 205, "correlation": 209985 } }, { "ph": "f", "id": 209985, "pid": 76337, "tid": -914061504, "ts": 1716454224724346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224780144, "dur": 12, "args": { "External id": 209989, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209989, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209989, "pid": 5, "tid": 7, "ts": 1716454224780144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724360, "dur": 12, "args": { "External id": 209989, "cbid": 211, "correlation": 209989 } }, { "ph": "s", "id": 209989, "pid": 76337, "tid": -914061504, "ts": 1716454224724360, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224780157, "dur": 164, "args": { "External id": 209991, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209991, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209991, "pid": 5, "tid": 7, "ts": 1716454224780157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724374, "dur": 5, "args": { "External id": 209991, "cbid": 211, "correlation": 209991 } }, { "ph": "s", "id": 209991, "pid": 76337, "tid": -914061504, "ts": 1716454224724374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224780324, "dur": 1, "args": { "External id": 209993, "device": 5, "context": 1, "stream": 7, "correlation": 209993, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 209993, "pid": 5, "tid": 7, "ts": 1716454224780324, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224724386, "dur": 7, "args": { "External id": 209993, "cbid": 51, "correlation": 209993 } }, { "ph": "s", "id": 209993, "pid": 76337, "tid": -914061504, "ts": 1716454224724386, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224780327, "dur": 199, "args": { "External id": 209994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209994, "registers per thread": 144, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 209994, "pid": 5, "tid": 7, "ts": 1716454224780327, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724395, "dur": 8, "args": { "External id": 209994, "cbid": 211, "correlation": 209994 } }, { "ph": "s", "id": 209994, "pid": 76337, "tid": -914061504, "ts": 1716454224724395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224780528, "dur": 7, "args": { "External id": 209996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 209996, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 209996, "pid": 5, "tid": 7, "ts": 1716454224780528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724409, "dur": 5, "args": { "External id": 209996, "cbid": 211, "correlation": 209996 } }, { "ph": "s", "id": 209996, "pid": 76337, "tid": -914061504, "ts": 1716454224724409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224780536, "dur": 6, "args": { "External id": 210002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210002, "pid": 5, "tid": 7, "ts": 1716454224780536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724437, "dur": 8, "args": { "External id": 210002, "cbid": 211, "correlation": 210002 } }, { "ph": "s", "id": 210002, "pid": 76337, "tid": -914061504, "ts": 1716454224724437, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224780543, "dur": 11, "args": { "External id": 210022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210022, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210022, "pid": 5, "tid": 7, "ts": 1716454224780543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724528, "dur": 12, "args": { "External id": 210022, "cbid": 211, "correlation": 210022 } }, { "ph": "s", "id": 210022, "pid": 76337, "tid": -914061504, "ts": 1716454224724528, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224780556, "dur": 4, "args": { "External id": 210034, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210034, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210034, "pid": 5, "tid": 7, "ts": 1716454224780556, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724550, "dur": 6, "args": { "External id": 210034, "cbid": 211, "correlation": 210034 } }, { "ph": "s", "id": 210034, "pid": 76337, "tid": -914061504, "ts": 1716454224724550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224780561, "dur": 9, "args": { "External id": 210037, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210037, "pid": 5, "tid": 7, "ts": 1716454224780561, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724569, "dur": 6, "args": { "External id": 210037, "cbid": 211, "correlation": 210037 } }, { "ph": "s", "id": 210037, "pid": 76337, "tid": -914061504, "ts": 1716454224724569, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224780571, "dur": 5, "args": { "External id": 210046, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210046, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210046, "pid": 5, "tid": 7, "ts": 1716454224780571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724609, "dur": 10, "args": { "External id": 210046, "cbid": 211, "correlation": 210046 } }, { "ph": "s", "id": 210046, "pid": 76337, "tid": -914061504, "ts": 1716454224724609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224724661, "dur": 0, "args": { "External id": 210056, "cbid": 317, "correlation": 210056 } }, { "ph": "f", "id": 210056, "pid": 76337, "tid": -914061504, "ts": 1716454224724661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224724661, "dur": 0, "args": { "External id": 210057, "cbid": 203, "correlation": 210057 } }, { "ph": "f", "id": 210057, "pid": 76337, "tid": -914061504, "ts": 1716454224724661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224724662, "dur": 0, "args": { "External id": 210058, "cbid": 205, "correlation": 210058 } }, { "ph": "f", "id": 210058, "pid": 76337, "tid": -914061504, "ts": 1716454224724662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224780578, "dur": 5, "args": { "External id": 210062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210062, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210062, "pid": 5, "tid": 7, "ts": 1716454224780578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724677, "dur": 12, "args": { "External id": 210062, "cbid": 211, "correlation": 210062 } }, { "ph": "s", "id": 210062, "pid": 76337, "tid": -914061504, "ts": 1716454224724677, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224780584, "dur": 164, "args": { "External id": 210064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210064, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210064, "pid": 5, "tid": 7, "ts": 1716454224780584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724692, "dur": 5, "args": { "External id": 210064, "cbid": 211, "correlation": 210064 } }, { "ph": "s", "id": 210064, "pid": 76337, "tid": -914061504, "ts": 1716454224724692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224780750, "dur": 1, "args": { "External id": 210066, "device": 5, "context": 1, "stream": 7, "correlation": 210066, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 210066, "pid": 5, "tid": 7, "ts": 1716454224780750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224724703, "dur": 6, "args": { "External id": 210066, "cbid": 51, "correlation": 210066 } }, { "ph": "s", "id": 210066, "pid": 76337, "tid": -914061504, "ts": 1716454224724703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224780753, "dur": 272, "args": { "External id": 210067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210067, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210067, "pid": 5, "tid": 7, "ts": 1716454224780753, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724711, "dur": 6, "args": { "External id": 210067, "cbid": 211, "correlation": 210067 } }, { "ph": "s", "id": 210067, "pid": 76337, "tid": -914061504, "ts": 1716454224724711, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781027, "dur": 6, "args": { "External id": 210069, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210069, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210069, "pid": 5, "tid": 7, "ts": 1716454224781027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724720, "dur": 5, "args": { "External id": 210069, "cbid": 211, "correlation": 210069 } }, { "ph": "s", "id": 210069, "pid": 76337, "tid": -914061504, "ts": 1716454224724720, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224781034, "dur": 6, "args": { "External id": 210075, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210075, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210075, "pid": 5, "tid": 7, "ts": 1716454224781034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724749, "dur": 8, "args": { "External id": 210075, "cbid": 211, "correlation": 210075 } }, { "ph": "s", "id": 210075, "pid": 76337, "tid": -914061504, "ts": 1716454224724749, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224781042, "dur": 3, "args": { "External id": 210083, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210083, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 210083, "pid": 5, "tid": 7, "ts": 1716454224781042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724792, "dur": 9, "args": { "External id": 210083, "cbid": 211, "correlation": 210083 } }, { "ph": "s", "id": 210083, "pid": 76337, "tid": -914061504, "ts": 1716454224724792, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224724857, "dur": 1, "args": { "External id": 210099, "cbid": 251, "correlation": 210099 } }, { "ph": "f", "id": 210099, "pid": 76337, "tid": -914061504, "ts": 1716454224724857, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224724862, "dur": 0, "args": { "External id": 210101, "cbid": 251, "correlation": 210101 } }, { "ph": "f", "id": 210101, "pid": 76337, "tid": -914061504, "ts": 1716454224724862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224781046, "dur": 14, "args": { "External id": 210102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210102, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210102, "pid": 5, "tid": 7, "ts": 1716454224781046, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724864, "dur": 11, "args": { "External id": 210102, "cbid": 211, "correlation": 210102 } }, { "ph": "s", "id": 210102, "pid": 76337, "tid": -914061504, "ts": 1716454224724864, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224781061, "dur": 5, "args": { "External id": 210104, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210104, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210104, "pid": 5, "tid": 7, "ts": 1716454224781061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724877, "dur": 6, "args": { "External id": 210104, "cbid": 211, "correlation": 210104 } }, { "ph": "s", "id": 210104, "pid": 76337, "tid": -914061504, "ts": 1716454224724877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224781068, "dur": 6, "args": { "External id": 210114, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210114, "pid": 5, "tid": 7, "ts": 1716454224781068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224724935, "dur": 12, "args": { "External id": 210114, "cbid": 211, "correlation": 210114 } }, { "ph": "s", "id": 210114, "pid": 76337, "tid": -914061504, "ts": 1716454224724935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224781075, "dur": 10, "args": { "External id": 210134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210134, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210134, "pid": 5, "tid": 7, "ts": 1716454224781075, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725009, "dur": 11, "args": { "External id": 210134, "cbid": 211, "correlation": 210134 } }, { "ph": "s", "id": 210134, "pid": 76337, "tid": -914061504, "ts": 1716454224725009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224781086, "dur": 4, "args": { "External id": 210146, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210146, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210146, "pid": 5, "tid": 7, "ts": 1716454224781086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725031, "dur": 7, "args": { "External id": 210146, "cbid": 211, "correlation": 210146 } }, { "ph": "s", "id": 210146, "pid": 76337, "tid": -914061504, "ts": 1716454224725031, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224781091, "dur": 7, "args": { "External id": 210149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210149, "pid": 5, "tid": 7, "ts": 1716454224781091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725050, "dur": 6, "args": { "External id": 210149, "cbid": 211, "correlation": 210149 } }, { "ph": "s", "id": 210149, "pid": 76337, "tid": -914061504, "ts": 1716454224725050, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224781100, "dur": 5, "args": { "External id": 210158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210158, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210158, "pid": 5, "tid": 7, "ts": 1716454224781100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725091, "dur": 10, "args": { "External id": 210158, "cbid": 211, "correlation": 210158 } }, { "ph": "s", "id": 210158, "pid": 76337, "tid": -914061504, "ts": 1716454224725091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224725154, "dur": 0, "args": { "External id": 210168, "cbid": 317, "correlation": 210168 } }, { "ph": "f", "id": 210168, "pid": 76337, "tid": -914061504, "ts": 1716454224725154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224725154, "dur": 0, "args": { "External id": 210169, "cbid": 203, "correlation": 210169 } }, { "ph": "f", "id": 210169, "pid": 76337, "tid": -914061504, "ts": 1716454224725154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224725155, "dur": 0, "args": { "External id": 210170, "cbid": 205, "correlation": 210170 } }, { "ph": "f", "id": 210170, "pid": 76337, "tid": -914061504, "ts": 1716454224725155, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781106, "dur": 5, "args": { "External id": 210174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210174, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210174, "pid": 5, "tid": 7, "ts": 1716454224781106, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725169, "dur": 12, "args": { "External id": 210174, "cbid": 211, "correlation": 210174 } }, { "ph": "s", "id": 210174, "pid": 76337, "tid": -914061504, "ts": 1716454224725169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781112, "dur": 163, "args": { "External id": 210176, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210176, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210176, "pid": 5, "tid": 7, "ts": 1716454224781112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725184, "dur": 6, "args": { "External id": 210176, "cbid": 211, "correlation": 210176 } }, { "ph": "s", "id": 210176, "pid": 76337, "tid": -914061504, "ts": 1716454224725184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224781277, "dur": 1, "args": { "External id": 210178, "device": 5, "context": 1, "stream": 7, "correlation": 210178, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 210178, "pid": 5, "tid": 7, "ts": 1716454224781277, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224725195, "dur": 6, "args": { "External id": 210178, "cbid": 51, "correlation": 210178 } }, { "ph": "s", "id": 210178, "pid": 76337, "tid": -914061504, "ts": 1716454224725195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224781281, "dur": 261, "args": { "External id": 210179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210179, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210179, "pid": 5, "tid": 7, "ts": 1716454224781281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725202, "dur": 6, "args": { "External id": 210179, "cbid": 211, "correlation": 210179 } }, { "ph": "s", "id": 210179, "pid": 76337, "tid": -914061504, "ts": 1716454224725202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781543, "dur": 6, "args": { "External id": 210181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210181, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210181, "pid": 5, "tid": 7, "ts": 1716454224781543, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725212, "dur": 5, "args": { "External id": 210181, "cbid": 211, "correlation": 210181 } }, { "ph": "s", "id": 210181, "pid": 76337, "tid": -914061504, "ts": 1716454224725212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224781551, "dur": 6, "args": { "External id": 210187, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210187, "pid": 5, "tid": 7, "ts": 1716454224781551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725241, "dur": 8, "args": { "External id": 210187, "cbid": 211, "correlation": 210187 } }, { "ph": "s", "id": 210187, "pid": 76337, "tid": -914061504, "ts": 1716454224725241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224781558, "dur": 5, "args": { "External id": 210195, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210195, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210195, "pid": 5, "tid": 7, "ts": 1716454224781558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725273, "dur": 8, "args": { "External id": 210195, "cbid": 211, "correlation": 210195 } }, { "ph": "s", "id": 210195, "pid": 76337, "tid": -914061504, "ts": 1716454224725273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224781565, "dur": 4, "args": { "External id": 210203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210203, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210203, "pid": 5, "tid": 7, "ts": 1716454224781565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725303, "dur": 8, "args": { "External id": 210203, "cbid": 211, "correlation": 210203 } }, { "ph": "s", "id": 210203, "pid": 76337, "tid": -914061504, "ts": 1716454224725303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224781570, "dur": 10, "args": { "External id": 210223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210223, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210223, "pid": 5, "tid": 7, "ts": 1716454224781570, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725376, "dur": 12, "args": { "External id": 210223, "cbid": 211, "correlation": 210223 } }, { "ph": "s", "id": 210223, "pid": 76337, "tid": -914061504, "ts": 1716454224725376, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224781582, "dur": 4, "args": { "External id": 210235, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210235, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210235, "pid": 5, "tid": 7, "ts": 1716454224781582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725398, "dur": 6, "args": { "External id": 210235, "cbid": 211, "correlation": 210235 } }, { "ph": "s", "id": 210235, "pid": 76337, "tid": -914061504, "ts": 1716454224725398, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224781587, "dur": 7, "args": { "External id": 210238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210238, "pid": 5, "tid": 7, "ts": 1716454224781587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725416, "dur": 6, "args": { "External id": 210238, "cbid": 211, "correlation": 210238 } }, { "ph": "s", "id": 210238, "pid": 76337, "tid": -914061504, "ts": 1716454224725416, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224781595, "dur": 5, "args": { "External id": 210247, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210247, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210247, "pid": 5, "tid": 7, "ts": 1716454224781595, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725454, "dur": 9, "args": { "External id": 210247, "cbid": 211, "correlation": 210247 } }, { "ph": "s", "id": 210247, "pid": 76337, "tid": -914061504, "ts": 1716454224725454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224725504, "dur": 0, "args": { "External id": 210257, "cbid": 317, "correlation": 210257 } }, { "ph": "f", "id": 210257, "pid": 76337, "tid": -914061504, "ts": 1716454224725504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224725505, "dur": 0, "args": { "External id": 210258, "cbid": 203, "correlation": 210258 } }, { "ph": "f", "id": 210258, "pid": 76337, "tid": -914061504, "ts": 1716454224725505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224725505, "dur": 0, "args": { "External id": 210259, "cbid": 205, "correlation": 210259 } }, { "ph": "f", "id": 210259, "pid": 76337, "tid": -914061504, "ts": 1716454224725505, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781601, "dur": 5, "args": { "External id": 210263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210263, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210263, "pid": 5, "tid": 7, "ts": 1716454224781601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725518, "dur": 11, "args": { "External id": 210263, "cbid": 211, "correlation": 210263 } }, { "ph": "s", "id": 210263, "pid": 76337, "tid": -914061504, "ts": 1716454224725518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224781607, "dur": 163, "args": { "External id": 210265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210265, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210265, "pid": 5, "tid": 7, "ts": 1716454224781607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725532, "dur": 6, "args": { "External id": 210265, "cbid": 211, "correlation": 210265 } }, { "ph": "s", "id": 210265, "pid": 76337, "tid": -914061504, "ts": 1716454224725532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224781772, "dur": 1, "args": { "External id": 210267, "device": 5, "context": 1, "stream": 7, "correlation": 210267, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 210267, "pid": 5, "tid": 7, "ts": 1716454224781772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224725543, "dur": 6, "args": { "External id": 210267, "cbid": 51, "correlation": 210267 } }, { "ph": "s", "id": 210267, "pid": 76337, "tid": -914061504, "ts": 1716454224725543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224781776, "dur": 260, "args": { "External id": 210268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210268, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210268, "pid": 5, "tid": 7, "ts": 1716454224781776, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725550, "dur": 6, "args": { "External id": 210268, "cbid": 211, "correlation": 210268 } }, { "ph": "s", "id": 210268, "pid": 76337, "tid": -914061504, "ts": 1716454224725550, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782037, "dur": 6, "args": { "External id": 210270, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210270, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210270, "pid": 5, "tid": 7, "ts": 1716454224782037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725560, "dur": 5, "args": { "External id": 210270, "cbid": 211, "correlation": 210270 } }, { "ph": "s", "id": 210270, "pid": 76337, "tid": -914061504, "ts": 1716454224725560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224782044, "dur": 6, "args": { "External id": 210276, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210276, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210276, "pid": 5, "tid": 7, "ts": 1716454224782044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725589, "dur": 9, "args": { "External id": 210276, "cbid": 211, "correlation": 210276 } }, { "ph": "s", "id": 210276, "pid": 76337, "tid": -914061504, "ts": 1716454224725589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224782052, "dur": 3, "args": { "External id": 210284, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210284, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 210284, "pid": 5, "tid": 7, "ts": 1716454224782052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725633, "dur": 10, "args": { "External id": 210284, "cbid": 211, "correlation": 210284 } }, { "ph": "s", "id": 210284, "pid": 76337, "tid": -914061504, "ts": 1716454224725633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224725695, "dur": 1, "args": { "External id": 210300, "cbid": 251, "correlation": 210300 } }, { "ph": "f", "id": 210300, "pid": 76337, "tid": -914061504, "ts": 1716454224725695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224725699, "dur": 0, "args": { "External id": 210302, "cbid": 251, "correlation": 210302 } }, { "ph": "f", "id": 210302, "pid": 76337, "tid": -914061504, "ts": 1716454224725699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224782056, "dur": 10, "args": { "External id": 210303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210303, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210303, "pid": 5, "tid": 7, "ts": 1716454224782056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725701, "dur": 10, "args": { "External id": 210303, "cbid": 211, "correlation": 210303 } }, { "ph": "s", "id": 210303, "pid": 76337, "tid": -914061504, "ts": 1716454224725701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224782068, "dur": 4, "args": { "External id": 210305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210305, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210305, "pid": 5, "tid": 7, "ts": 1716454224782068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725714, "dur": 5, "args": { "External id": 210305, "cbid": 211, "correlation": 210305 } }, { "ph": "s", "id": 210305, "pid": 76337, "tid": -914061504, "ts": 1716454224725714, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224782073, "dur": 6, "args": { "External id": 210315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210315, "pid": 5, "tid": 7, "ts": 1716454224782073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725770, "dur": 12, "args": { "External id": 210315, "cbid": 211, "correlation": 210315 } }, { "ph": "s", "id": 210315, "pid": 76337, "tid": -914061504, "ts": 1716454224725770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224782080, "dur": 10, "args": { "External id": 210335, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210335, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210335, "pid": 5, "tid": 7, "ts": 1716454224782080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725835, "dur": 10, "args": { "External id": 210335, "cbid": 211, "correlation": 210335 } }, { "ph": "s", "id": 210335, "pid": 76337, "tid": -914061504, "ts": 1716454224725835, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224782091, "dur": 4, "args": { "External id": 210347, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210347, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210347, "pid": 5, "tid": 7, "ts": 1716454224782091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725855, "dur": 6, "args": { "External id": 210347, "cbid": 211, "correlation": 210347 } }, { "ph": "s", "id": 210347, "pid": 76337, "tid": -914061504, "ts": 1716454224725855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224782096, "dur": 7, "args": { "External id": 210350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210350, "pid": 5, "tid": 7, "ts": 1716454224782096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725872, "dur": 6, "args": { "External id": 210350, "cbid": 211, "correlation": 210350 } }, { "ph": "s", "id": 210350, "pid": 76337, "tid": -914061504, "ts": 1716454224725872, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224782104, "dur": 5, "args": { "External id": 210359, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210359, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210359, "pid": 5, "tid": 7, "ts": 1716454224782104, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725912, "dur": 10, "args": { "External id": 210359, "cbid": 211, "correlation": 210359 } }, { "ph": "s", "id": 210359, "pid": 76337, "tid": -914061504, "ts": 1716454224725912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224725981, "dur": 0, "args": { "External id": 210369, "cbid": 317, "correlation": 210369 } }, { "ph": "f", "id": 210369, "pid": 76337, "tid": -914061504, "ts": 1716454224725981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224725982, "dur": 0, "args": { "External id": 210370, "cbid": 203, "correlation": 210370 } }, { "ph": "f", "id": 210370, "pid": 76337, "tid": -914061504, "ts": 1716454224725982, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224725983, "dur": 0, "args": { "External id": 210371, "cbid": 205, "correlation": 210371 } }, { "ph": "f", "id": 210371, "pid": 76337, "tid": -914061504, "ts": 1716454224725983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782110, "dur": 5, "args": { "External id": 210375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210375, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210375, "pid": 5, "tid": 7, "ts": 1716454224782110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224725997, "dur": 12, "args": { "External id": 210375, "cbid": 211, "correlation": 210375 } }, { "ph": "s", "id": 210375, "pid": 76337, "tid": -914061504, "ts": 1716454224725997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782116, "dur": 162, "args": { "External id": 210377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210377, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210377, "pid": 5, "tid": 7, "ts": 1716454224782116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726012, "dur": 5, "args": { "External id": 210377, "cbid": 211, "correlation": 210377 } }, { "ph": "s", "id": 210377, "pid": 76337, "tid": -914061504, "ts": 1716454224726012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224782281, "dur": 1, "args": { "External id": 210379, "device": 5, "context": 1, "stream": 7, "correlation": 210379, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 210379, "pid": 5, "tid": 7, "ts": 1716454224782281, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224726022, "dur": 6, "args": { "External id": 210379, "cbid": 51, "correlation": 210379 } }, { "ph": "s", "id": 210379, "pid": 76337, "tid": -914061504, "ts": 1716454224726022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224782285, "dur": 261, "args": { "External id": 210380, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210380, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210380, "pid": 5, "tid": 7, "ts": 1716454224782285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726029, "dur": 7, "args": { "External id": 210380, "cbid": 211, "correlation": 210380 } }, { "ph": "s", "id": 210380, "pid": 76337, "tid": -914061504, "ts": 1716454224726029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782546, "dur": 6, "args": { "External id": 210382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210382, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210382, "pid": 5, "tid": 7, "ts": 1716454224782546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726040, "dur": 5, "args": { "External id": 210382, "cbid": 211, "correlation": 210382 } }, { "ph": "s", "id": 210382, "pid": 76337, "tid": -914061504, "ts": 1716454224726040, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224782554, "dur": 6, "args": { "External id": 210388, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210388, "pid": 5, "tid": 7, "ts": 1716454224782554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726068, "dur": 8, "args": { "External id": 210388, "cbid": 211, "correlation": 210388 } }, { "ph": "s", "id": 210388, "pid": 76337, "tid": -914061504, "ts": 1716454224726068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224782562, "dur": 5, "args": { "External id": 210396, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210396, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210396, "pid": 5, "tid": 7, "ts": 1716454224782562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726101, "dur": 8, "args": { "External id": 210396, "cbid": 211, "correlation": 210396 } }, { "ph": "s", "id": 210396, "pid": 76337, "tid": -914061504, "ts": 1716454224726101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224782568, "dur": 4, "args": { "External id": 210404, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210404, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210404, "pid": 5, "tid": 7, "ts": 1716454224782568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726129, "dur": 9, "args": { "External id": 210404, "cbid": 211, "correlation": 210404 } }, { "ph": "s", "id": 210404, "pid": 76337, "tid": -914061504, "ts": 1716454224726129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224782574, "dur": 10, "args": { "External id": 210424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210424, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210424, "pid": 5, "tid": 7, "ts": 1716454224782574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726232, "dur": 13, "args": { "External id": 210424, "cbid": 211, "correlation": 210424 } }, { "ph": "s", "id": 210424, "pid": 76337, "tid": -914061504, "ts": 1716454224726232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224782585, "dur": 4, "args": { "External id": 210436, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210436, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210436, "pid": 5, "tid": 7, "ts": 1716454224782585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726255, "dur": 6, "args": { "External id": 210436, "cbid": 211, "correlation": 210436 } }, { "ph": "s", "id": 210436, "pid": 76337, "tid": -914061504, "ts": 1716454224726255, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224782590, "dur": 6, "args": { "External id": 210439, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210439, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210439, "pid": 5, "tid": 7, "ts": 1716454224782590, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726273, "dur": 6, "args": { "External id": 210439, "cbid": 211, "correlation": 210439 } }, { "ph": "s", "id": 210439, "pid": 76337, "tid": -914061504, "ts": 1716454224726273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224782598, "dur": 5, "args": { "External id": 210448, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210448, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210448, "pid": 5, "tid": 7, "ts": 1716454224782598, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726311, "dur": 10, "args": { "External id": 210448, "cbid": 211, "correlation": 210448 } }, { "ph": "s", "id": 210448, "pid": 76337, "tid": -914061504, "ts": 1716454224726311, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224726364, "dur": 0, "args": { "External id": 210458, "cbid": 317, "correlation": 210458 } }, { "ph": "f", "id": 210458, "pid": 76337, "tid": -914061504, "ts": 1716454224726364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224726365, "dur": 0, "args": { "External id": 210459, "cbid": 203, "correlation": 210459 } }, { "ph": "f", "id": 210459, "pid": 76337, "tid": -914061504, "ts": 1716454224726365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224726365, "dur": 0, "args": { "External id": 210460, "cbid": 205, "correlation": 210460 } }, { "ph": "f", "id": 210460, "pid": 76337, "tid": -914061504, "ts": 1716454224726365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782604, "dur": 5, "args": { "External id": 210464, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210464, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210464, "pid": 5, "tid": 7, "ts": 1716454224782604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726378, "dur": 12, "args": { "External id": 210464, "cbid": 211, "correlation": 210464 } }, { "ph": "s", "id": 210464, "pid": 76337, "tid": -914061504, "ts": 1716454224726378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224782610, "dur": 164, "args": { "External id": 210466, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210466, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210466, "pid": 5, "tid": 7, "ts": 1716454224782610, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726393, "dur": 5, "args": { "External id": 210466, "cbid": 211, "correlation": 210466 } }, { "ph": "s", "id": 210466, "pid": 76337, "tid": -914061504, "ts": 1716454224726393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224782777, "dur": 1, "args": { "External id": 210468, "device": 5, "context": 1, "stream": 7, "correlation": 210468, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 210468, "pid": 5, "tid": 7, "ts": 1716454224782777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224726403, "dur": 6, "args": { "External id": 210468, "cbid": 51, "correlation": 210468 } }, { "ph": "s", "id": 210468, "pid": 76337, "tid": -914061504, "ts": 1716454224726403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224782780, "dur": 260, "args": { "External id": 210469, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210469, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210469, "pid": 5, "tid": 7, "ts": 1716454224782780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726410, "dur": 6, "args": { "External id": 210469, "cbid": 211, "correlation": 210469 } }, { "ph": "s", "id": 210469, "pid": 76337, "tid": -914061504, "ts": 1716454224726410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783042, "dur": 6, "args": { "External id": 210471, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210471, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210471, "pid": 5, "tid": 7, "ts": 1716454224783042, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726420, "dur": 5, "args": { "External id": 210471, "cbid": 211, "correlation": 210471 } }, { "ph": "s", "id": 210471, "pid": 76337, "tid": -914061504, "ts": 1716454224726420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783049, "dur": 6, "args": { "External id": 210477, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210477, "pid": 5, "tid": 7, "ts": 1716454224783049, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726448, "dur": 8, "args": { "External id": 210477, "cbid": 211, "correlation": 210477 } }, { "ph": "s", "id": 210477, "pid": 76337, "tid": -914061504, "ts": 1716454224726448, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783056, "dur": 3, "args": { "External id": 210485, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210485, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 210485, "pid": 5, "tid": 7, "ts": 1716454224783056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726492, "dur": 10, "args": { "External id": 210485, "cbid": 211, "correlation": 210485 } }, { "ph": "s", "id": 210485, "pid": 76337, "tid": -914061504, "ts": 1716454224726492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224726553, "dur": 1, "args": { "External id": 210501, "cbid": 251, "correlation": 210501 } }, { "ph": "f", "id": 210501, "pid": 76337, "tid": -914061504, "ts": 1716454224726553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224726559, "dur": 0, "args": { "External id": 210503, "cbid": 251, "correlation": 210503 } }, { "ph": "f", "id": 210503, "pid": 76337, "tid": -914061504, "ts": 1716454224726559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224783061, "dur": 10, "args": { "External id": 210504, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210504, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210504, "pid": 5, "tid": 7, "ts": 1716454224783061, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726560, "dur": 11, "args": { "External id": 210504, "cbid": 211, "correlation": 210504 } }, { "ph": "s", "id": 210504, "pid": 76337, "tid": -914061504, "ts": 1716454224726560, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224783072, "dur": 4, "args": { "External id": 210506, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210506, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210506, "pid": 5, "tid": 7, "ts": 1716454224783072, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726573, "dur": 6, "args": { "External id": 210506, "cbid": 211, "correlation": 210506 } }, { "ph": "s", "id": 210506, "pid": 76337, "tid": -914061504, "ts": 1716454224726573, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783077, "dur": 6, "args": { "External id": 210516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210516, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210516, "pid": 5, "tid": 7, "ts": 1716454224783077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726630, "dur": 12, "args": { "External id": 210516, "cbid": 211, "correlation": 210516 } }, { "ph": "s", "id": 210516, "pid": 76337, "tid": -914061504, "ts": 1716454224726630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224783084, "dur": 9, "args": { "External id": 210536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210536, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210536, "pid": 5, "tid": 7, "ts": 1716454224783084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726695, "dur": 11, "args": { "External id": 210536, "cbid": 211, "correlation": 210536 } }, { "ph": "s", "id": 210536, "pid": 76337, "tid": -914061504, "ts": 1716454224726695, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224783095, "dur": 4, "args": { "External id": 210548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210548, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210548, "pid": 5, "tid": 7, "ts": 1716454224783095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726716, "dur": 6, "args": { "External id": 210548, "cbid": 211, "correlation": 210548 } }, { "ph": "s", "id": 210548, "pid": 76337, "tid": -914061504, "ts": 1716454224726716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783100, "dur": 7, "args": { "External id": 210551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210551, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210551, "pid": 5, "tid": 7, "ts": 1716454224783100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726735, "dur": 6, "args": { "External id": 210551, "cbid": 211, "correlation": 210551 } }, { "ph": "s", "id": 210551, "pid": 76337, "tid": -914061504, "ts": 1716454224726735, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783108, "dur": 5, "args": { "External id": 210560, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210560, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210560, "pid": 5, "tid": 7, "ts": 1716454224783108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726774, "dur": 11, "args": { "External id": 210560, "cbid": 211, "correlation": 210560 } }, { "ph": "s", "id": 210560, "pid": 76337, "tid": -914061504, "ts": 1716454224726774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224726838, "dur": 0, "args": { "External id": 210570, "cbid": 317, "correlation": 210570 } }, { "ph": "f", "id": 210570, "pid": 76337, "tid": -914061504, "ts": 1716454224726838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224726838, "dur": 0, "args": { "External id": 210571, "cbid": 203, "correlation": 210571 } }, { "ph": "f", "id": 210571, "pid": 76337, "tid": -914061504, "ts": 1716454224726838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224726839, "dur": 0, "args": { "External id": 210572, "cbid": 205, "correlation": 210572 } }, { "ph": "f", "id": 210572, "pid": 76337, "tid": -914061504, "ts": 1716454224726839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783114, "dur": 5, "args": { "External id": 210576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210576, "pid": 5, "tid": 7, "ts": 1716454224783114, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726853, "dur": 12, "args": { "External id": 210576, "cbid": 211, "correlation": 210576 } }, { "ph": "s", "id": 210576, "pid": 76337, "tid": -914061504, "ts": 1716454224726853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783120, "dur": 163, "args": { "External id": 210578, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210578, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210578, "pid": 5, "tid": 7, "ts": 1716454224783120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726867, "dur": 5, "args": { "External id": 210578, "cbid": 211, "correlation": 210578 } }, { "ph": "s", "id": 210578, "pid": 76337, "tid": -914061504, "ts": 1716454224726867, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224783285, "dur": 1, "args": { "External id": 210580, "device": 5, "context": 1, "stream": 7, "correlation": 210580, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 210580, "pid": 5, "tid": 7, "ts": 1716454224783285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224726878, "dur": 7, "args": { "External id": 210580, "cbid": 51, "correlation": 210580 } }, { "ph": "s", "id": 210580, "pid": 76337, "tid": -914061504, "ts": 1716454224726878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224783289, "dur": 261, "args": { "External id": 210581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210581, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210581, "pid": 5, "tid": 7, "ts": 1716454224783289, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726886, "dur": 6, "args": { "External id": 210581, "cbid": 211, "correlation": 210581 } }, { "ph": "s", "id": 210581, "pid": 76337, "tid": -914061504, "ts": 1716454224726886, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783551, "dur": 6, "args": { "External id": 210583, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210583, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210583, "pid": 5, "tid": 7, "ts": 1716454224783551, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726897, "dur": 5, "args": { "External id": 210583, "cbid": 211, "correlation": 210583 } }, { "ph": "s", "id": 210583, "pid": 76337, "tid": -914061504, "ts": 1716454224726897, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783558, "dur": 6, "args": { "External id": 210589, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210589, "pid": 5, "tid": 7, "ts": 1716454224783558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726925, "dur": 9, "args": { "External id": 210589, "cbid": 211, "correlation": 210589 } }, { "ph": "s", "id": 210589, "pid": 76337, "tid": -914061504, "ts": 1716454224726925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783566, "dur": 5, "args": { "External id": 210597, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210597, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210597, "pid": 5, "tid": 7, "ts": 1716454224783566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726958, "dur": 8, "args": { "External id": 210597, "cbid": 211, "correlation": 210597 } }, { "ph": "s", "id": 210597, "pid": 76337, "tid": -914061504, "ts": 1716454224726958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783572, "dur": 4, "args": { "External id": 210605, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210605, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210605, "pid": 5, "tid": 7, "ts": 1716454224783572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224726996, "dur": 9, "args": { "External id": 210605, "cbid": 211, "correlation": 210605 } }, { "ph": "s", "id": 210605, "pid": 76337, "tid": -914061504, "ts": 1716454224726996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224783578, "dur": 10, "args": { "External id": 210625, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210625, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 210625, "pid": 5, "tid": 7, "ts": 1716454224783578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727138, "dur": 13, "args": { "External id": 210625, "cbid": 211, "correlation": 210625 } }, { "ph": "s", "id": 210625, "pid": 76337, "tid": -914061504, "ts": 1716454224727138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224783589, "dur": 4, "args": { "External id": 210637, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210637, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 210637, "pid": 5, "tid": 7, "ts": 1716454224783589, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727161, "dur": 7, "args": { "External id": 210637, "cbid": 211, "correlation": 210637 } }, { "ph": "s", "id": 210637, "pid": 76337, "tid": -914061504, "ts": 1716454224727161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783594, "dur": 7, "args": { "External id": 210640, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210640, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210640, "pid": 5, "tid": 7, "ts": 1716454224783594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727180, "dur": 7, "args": { "External id": 210640, "cbid": 211, "correlation": 210640 } }, { "ph": "s", "id": 210640, "pid": 76337, "tid": -914061504, "ts": 1716454224727180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224727239, "dur": 0, "args": { "External id": 210651, "cbid": 317, "correlation": 210651 } }, { "ph": "f", "id": 210651, "pid": 76337, "tid": -914061504, "ts": 1716454224727239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224727240, "dur": 0, "args": { "External id": 210652, "cbid": 203, "correlation": 210652 } }, { "ph": "f", "id": 210652, "pid": 76337, "tid": -914061504, "ts": 1716454224727240, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224727241, "dur": 0, "args": { "External id": 210653, "cbid": 205, "correlation": 210653 } }, { "ph": "f", "id": 210653, "pid": 76337, "tid": -914061504, "ts": 1716454224727241, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783602, "dur": 5, "args": { "External id": 210657, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210657, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210657, "pid": 5, "tid": 7, "ts": 1716454224783602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727258, "dur": 12, "args": { "External id": 210657, "cbid": 211, "correlation": 210657 } }, { "ph": "s", "id": 210657, "pid": 76337, "tid": -914061504, "ts": 1716454224727258, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224783608, "dur": 37, "args": { "External id": 210659, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210659, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 210659, "pid": 5, "tid": 7, "ts": 1716454224783608, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727277, "dur": 9, "args": { "External id": 210659, "cbid": 211, "correlation": 210659 } }, { "ph": "s", "id": 210659, "pid": 76337, "tid": -914061504, "ts": 1716454224727277, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224783647, "dur": 5, "args": { "External id": 210661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210661, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210661, "pid": 5, "tid": 7, "ts": 1716454224783647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727290, "dur": 5, "args": { "External id": 210661, "cbid": 211, "correlation": 210661 } }, { "ph": "s", "id": 210661, "pid": 76337, "tid": -914061504, "ts": 1716454224727290, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224783653, "dur": 6, "args": { "External id": 210667, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210667, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210667, "pid": 5, "tid": 7, "ts": 1716454224783653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727318, "dur": 9, "args": { "External id": 210667, "cbid": 211, "correlation": 210667 } }, { "ph": "s", "id": 210667, "pid": 76337, "tid": -914061504, "ts": 1716454224727318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224783661, "dur": 20, "args": { "External id": 210676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210676, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210676, "pid": 5, "tid": 7, "ts": 1716454224783661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727400, "dur": 14, "args": { "External id": 210676, "cbid": 211, "correlation": 210676 } }, { "ph": "s", "id": 210676, "pid": 76337, "tid": -914061504, "ts": 1716454224727400, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224783683, "dur": 11, "args": { "External id": 210698, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210698, "registers per thread": 32, "shared memory": 24, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [768, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 60 } }, { "ph": "f", "id": 210698, "pid": 5, "tid": 7, "ts": 1716454224783683, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727457, "dur": 11, "args": { "External id": 210698, "cbid": 211, "correlation": 210698 } }, { "ph": "s", "id": 210698, "pid": 76337, "tid": -914061504, "ts": 1716454224727457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727547, "dur": 2, "args": { "External id": 210709, "cbid": 251, "correlation": 210709 } }, { "ph": "f", "id": 210709, "pid": 76337, "tid": -914061504, "ts": 1716454224727547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727552, "dur": 0, "args": { "External id": 210710, "cbid": 251, "correlation": 210710 } }, { "ph": "f", "id": 210710, "pid": 76337, "tid": -914061504, "ts": 1716454224727552, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224783695, "dur": 55, "args": { "External id": 210711, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210711, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 210711, "pid": 5, "tid": 7, "ts": 1716454224783695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727555, "dur": 14, "args": { "External id": 210711, "cbid": 211, "correlation": 210711 } }, { "ph": "s", "id": 210711, "pid": 76337, "tid": -914061504, "ts": 1716454224727555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727626, "dur": 1, "args": { "External id": 210722, "cbid": 251, "correlation": 210722 } }, { "ph": "f", "id": 210722, "pid": 76337, "tid": -914061504, "ts": 1716454224727626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727629, "dur": 0, "args": { "External id": 210723, "cbid": 251, "correlation": 210723 } }, { "ph": "f", "id": 210723, "pid": 76337, "tid": -914061504, "ts": 1716454224727629, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224783751, "dur": 53, "args": { "External id": 210724, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210724, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 210724, "pid": 5, "tid": 7, "ts": 1716454224783751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727631, "dur": 11, "args": { "External id": 210724, "cbid": 211, "correlation": 210724 } }, { "ph": "s", "id": 210724, "pid": 76337, "tid": -914061504, "ts": 1716454224727631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727695, "dur": 1, "args": { "External id": 210735, "cbid": 251, "correlation": 210735 } }, { "ph": "f", "id": 210735, "pid": 76337, "tid": -914061504, "ts": 1716454224727695, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727699, "dur": 0, "args": { "External id": 210736, "cbid": 251, "correlation": 210736 } }, { "ph": "f", "id": 210736, "pid": 76337, "tid": -914061504, "ts": 1716454224727699, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224783806, "dur": 54, "args": { "External id": 210737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210737, "registers per thread": 186, "shared memory": 32768, "blocks per SM": 1.5, "warps per SM": 6, "grid": [10, 12, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 210737, "pid": 5, "tid": 7, "ts": 1716454224783806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727701, "dur": 12, "args": { "External id": 210737, "cbid": 211, "correlation": 210737 } }, { "ph": "s", "id": 210737, "pid": 76337, "tid": -914061504, "ts": 1716454224727701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224783861, "dur": 58, "args": { "External id": 210762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210762, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 2.4, "warps per SM": 9.6, "grid": [3, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210762, "pid": 5, "tid": 7, "ts": 1716454224783861, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727785, "dur": 12, "args": { "External id": 210762, "cbid": 211, "correlation": 210762 } }, { "ph": "s", "id": 210762, "pid": 76337, "tid": -914061504, "ts": 1716454224727785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224727883, "dur": 1, "args": { "External id": 210780, "cbid": 251, "correlation": 210780 } }, { "ph": "f", "id": 210780, "pid": 76337, "tid": -914061504, "ts": 1716454224727883, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224783921, "dur": 64, "args": { "External id": 210782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210782, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 0.75, "warps per SM": 3, "grid": [10, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 210782, "pid": 5, "tid": 7, "ts": 1716454224783921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727888, "dur": 13, "args": { "External id": 210782, "cbid": 211, "correlation": 210782 } }, { "ph": "s", "id": 210782, "pid": 76337, "tid": -914061504, "ts": 1716454224727888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783986, "dur": 6, "args": { "External id": 210790, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210790, "registers per thread": 17, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210790, "pid": 5, "tid": 7, "ts": 1716454224783986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224727958, "dur": 13, "args": { "External id": 210790, "cbid": 211, "correlation": 210790 } }, { "ph": "s", "id": 210790, "pid": 76337, "tid": -914061504, "ts": 1716454224727958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224783993, "dur": 7, "args": { "External id": 210798, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210798, "registers per thread": 19, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210798, "pid": 5, "tid": 7, "ts": 1716454224783993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728007, "dur": 10, "args": { "External id": 210798, "cbid": 211, "correlation": 210798 } }, { "ph": "s", "id": 210798, "pid": 76337, "tid": -914061504, "ts": 1716454224728007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784002, "dur": 7, "args": { "External id": 210809, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210809, "pid": 5, "tid": 7, "ts": 1716454224784002, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728082, "dur": 12, "args": { "External id": 210809, "cbid": 211, "correlation": 210809 } }, { "ph": "s", "id": 210809, "pid": 76337, "tid": -914061504, "ts": 1716454224728082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224784010, "dur": 9, "args": { "External id": 210831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210831, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 210831, "pid": 5, "tid": 7, "ts": 1716454224784010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728115, "dur": 7, "args": { "External id": 210831, "cbid": 211, "correlation": 210831 } }, { "ph": "s", "id": 210831, "pid": 76337, "tid": -914061504, "ts": 1716454224728115, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728200, "dur": 2, "args": { "External id": 210842, "cbid": 251, "correlation": 210842 } }, { "ph": "f", "id": 210842, "pid": 76337, "tid": -914061504, "ts": 1716454224728200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224784021, "dur": 1, "args": { "External id": 210843, "device": 5, "context": 1, "stream": 7, "correlation": 210843, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 210843, "pid": 5, "tid": 7, "ts": 1716454224784021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224728206, "dur": 11, "args": { "External id": 210843, "cbid": 51, "correlation": 210843 } }, { "ph": "s", "id": 210843, "pid": 76337, "tid": -914061504, "ts": 1716454224728206, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224784025, "dur": 37, "args": { "External id": 210844, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210844, "registers per thread": 106, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 210844, "pid": 5, "tid": 7, "ts": 1716454224784025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728218, "dur": 13, "args": { "External id": 210844, "cbid": 211, "correlation": 210844 } }, { "ph": "s", "id": 210844, "pid": 76337, "tid": -914061504, "ts": 1716454224728218, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728290, "dur": 1, "args": { "External id": 210855, "cbid": 251, "correlation": 210855 } }, { "ph": "f", "id": 210855, "pid": 76337, "tid": -914061504, "ts": 1716454224728290, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728294, "dur": 0, "args": { "External id": 210856, "cbid": 251, "correlation": 210856 } }, { "ph": "f", "id": 210856, "pid": 76337, "tid": -914061504, "ts": 1716454224728294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224784063, "dur": 11, "args": { "External id": 210857, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210857, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210857, "pid": 5, "tid": 7, "ts": 1716454224784063, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728295, "dur": 12, "args": { "External id": 210857, "cbid": 211, "correlation": 210857 } }, { "ph": "s", "id": 210857, "pid": 76337, "tid": -914061504, "ts": 1716454224728295, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224784076, "dur": 6, "args": { "External id": 210859, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210859, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210859, "pid": 5, "tid": 7, "ts": 1716454224784076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728309, "dur": 7, "args": { "External id": 210859, "cbid": 211, "correlation": 210859 } }, { "ph": "s", "id": 210859, "pid": 76337, "tid": -914061504, "ts": 1716454224728309, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728367, "dur": 1, "args": { "External id": 210870, "cbid": 251, "correlation": 210870 } }, { "ph": "f", "id": 210870, "pid": 76337, "tid": -914061504, "ts": 1716454224728367, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728371, "dur": 0, "args": { "External id": 210871, "cbid": 251, "correlation": 210871 } }, { "ph": "f", "id": 210871, "pid": 76337, "tid": -914061504, "ts": 1716454224728371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224784083, "dur": 8, "args": { "External id": 210872, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210872, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210872, "pid": 5, "tid": 7, "ts": 1716454224784083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728372, "dur": 11, "args": { "External id": 210872, "cbid": 211, "correlation": 210872 } }, { "ph": "s", "id": 210872, "pid": 76337, "tid": -914061504, "ts": 1716454224728372, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224784093, "dur": 4, "args": { "External id": 210874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210874, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210874, "pid": 5, "tid": 7, "ts": 1716454224784093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728385, "dur": 5, "args": { "External id": 210874, "cbid": 211, "correlation": 210874 } }, { "ph": "s", "id": 210874, "pid": 76337, "tid": -914061504, "ts": 1716454224728385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224784098, "dur": 19, "args": { "External id": 210899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210899, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 1.6, "warps per SM": 6.4, "grid": [2, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 10 } }, { "ph": "f", "id": 210899, "pid": 5, "tid": 7, "ts": 1716454224784098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728465, "dur": 12, "args": { "External id": 210899, "cbid": 211, "correlation": 210899 } }, { "ph": "s", "id": 210899, "pid": 76337, "tid": -914061504, "ts": 1716454224728465, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728565, "dur": 2, "args": { "External id": 210917, "cbid": 251, "correlation": 210917 } }, { "ph": "f", "id": 210917, "pid": 76337, "tid": -914061504, "ts": 1716454224728565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224784119, "dur": 1, "args": { "External id": 210919, "device": 5, "context": 1, "stream": 7, "correlation": 210919, "bytes": 480, "memory bandwidth (GB/s)": 0.30612244897959184 } }, { "ph": "f", "id": 210919, "pid": 5, "tid": 7, "ts": 1716454224784119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224728571, "dur": 9, "args": { "External id": 210919, "cbid": 51, "correlation": 210919 } }, { "ph": "s", "id": 210919, "pid": 76337, "tid": -914061504, "ts": 1716454224728571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x64_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224784123, "dur": 37, "args": { "External id": 210920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210920, "registers per thread": 110, "shared memory": 16896, "blocks per SM": 3, "warps per SM": 12, "grid": [20, 6, 2], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 210920, "pid": 5, "tid": 7, "ts": 1716454224784123, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728582, "dur": 12, "args": { "External id": 210920, "cbid": 211, "correlation": 210920 } }, { "ph": "s", "id": 210920, "pid": 76337, "tid": -914061504, "ts": 1716454224728582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224784161, "dur": 4, "args": { "External id": 210928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210928, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210928, "pid": 5, "tid": 7, "ts": 1716454224784161, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728652, "dur": 13, "args": { "External id": 210928, "cbid": 211, "correlation": 210928 } }, { "ph": "s", "id": 210928, "pid": 76337, "tid": -914061504, "ts": 1716454224728652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784167, "dur": 8, "args": { "External id": 210936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 210936, "pid": 5, "tid": 7, "ts": 1716454224784167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728694, "dur": 9, "args": { "External id": 210936, "cbid": 211, "correlation": 210936 } }, { "ph": "s", "id": 210936, "pid": 76337, "tid": -914061504, "ts": 1716454224728694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224784176, "dur": 8, "args": { "External id": 210958, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210958, "registers per thread": 32, "shared memory": 24, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [384, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 30 } }, { "ph": "f", "id": 210958, "pid": 5, "tid": 7, "ts": 1716454224784176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728747, "dur": 10, "args": { "External id": 210958, "cbid": 211, "correlation": 210958 } }, { "ph": "s", "id": 210958, "pid": 76337, "tid": -914061504, "ts": 1716454224728747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728838, "dur": 1, "args": { "External id": 210974, "cbid": 251, "correlation": 210974 } }, { "ph": "f", "id": 210974, "pid": 76337, "tid": -914061504, "ts": 1716454224728838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224728843, "dur": 0, "args": { "External id": 210976, "cbid": 251, "correlation": 210976 } }, { "ph": "f", "id": 210976, "pid": 76337, "tid": -914061504, "ts": 1716454224728843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224784186, "dur": 191, "args": { "External id": 210977, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210977, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 3, "warps per SM": 12, "grid": [24, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 210977, "pid": 5, "tid": 7, "ts": 1716454224784186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728845, "dur": 13, "args": { "External id": 210977, "cbid": 211, "correlation": 210977 } }, { "ph": "s", "id": 210977, "pid": 76337, "tid": -914061504, "ts": 1716454224728845, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784378, "dur": 20, "args": { "External id": 210985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210985, "pid": 5, "tid": 7, "ts": 1716454224784378, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728912, "dur": 13, "args": { "External id": 210985, "cbid": 211, "correlation": 210985 } }, { "ph": "s", "id": 210985, "pid": 76337, "tid": -914061504, "ts": 1716454224728912, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784400, "dur": 23, "args": { "External id": 210993, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 210993, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 210993, "pid": 5, "tid": 7, "ts": 1716454224784400, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224728942, "dur": 8, "args": { "External id": 210993, "cbid": 211, "correlation": 210993 } }, { "ph": "s", "id": 210993, "pid": 76337, "tid": -914061504, "ts": 1716454224728942, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224729034, "dur": 1, "args": { "External id": 211009, "cbid": 251, "correlation": 211009 } }, { "ph": "f", "id": 211009, "pid": 76337, "tid": -914061504, "ts": 1716454224729034, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224784425, "dur": 1, "args": { "External id": 211011, "device": 5, "context": 1, "stream": 7, "correlation": 211011, "bytes": 120, "memory bandwidth (GB/s)": 0.078125 } }, { "ph": "f", "id": 211011, "pid": 5, "tid": 7, "ts": 1716454224784425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224729039, "dur": 8, "args": { "External id": 211011, "cbid": 51, "correlation": 211011 } }, { "ph": "s", "id": 211011, "pid": 76337, "tid": -914061504, "ts": 1716454224729039, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224784429, "dur": 111, "args": { "External id": 211012, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211012, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 1.875, "warps per SM": 7.5, "grid": [10, 3, 5], "block": [128, 1, 1], "est. achieved occupancy %": 12 } }, { "ph": "f", "id": 211012, "pid": 5, "tid": 7, "ts": 1716454224784429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729049, "dur": 13, "args": { "External id": 211012, "cbid": 211, "correlation": 211012 } }, { "ph": "s", "id": 211012, "pid": 76337, "tid": -914061504, "ts": 1716454224729049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224784541, "dur": 6, "args": { "External id": 211020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211020, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211020, "pid": 5, "tid": 7, "ts": 1716454224784541, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729093, "dur": 10, "args": { "External id": 211020, "cbid": 211, "correlation": 211020 } }, { "ph": "s", "id": 211020, "pid": 76337, "tid": -914061504, "ts": 1716454224729093, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784548, "dur": 10, "args": { "External id": 211031, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211031, "pid": 5, "tid": 7, "ts": 1716454224784548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729163, "dur": 12, "args": { "External id": 211031, "cbid": 211, "correlation": 211031 } }, { "ph": "s", "id": 211031, "pid": 76337, "tid": -914061504, "ts": 1716454224729163, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224729226, "dur": 0, "args": { "External id": 211043, "cbid": 317, "correlation": 211043 } }, { "ph": "f", "id": 211043, "pid": 76337, "tid": -914061504, "ts": 1716454224729226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224729227, "dur": 0, "args": { "External id": 211044, "cbid": 203, "correlation": 211044 } }, { "ph": "f", "id": 211044, "pid": 76337, "tid": -914061504, "ts": 1716454224729227, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224729228, "dur": 0, "args": { "External id": 211045, "cbid": 205, "correlation": 211045 } }, { "ph": "f", "id": 211045, "pid": 76337, "tid": -914061504, "ts": 1716454224729228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224784559, "dur": 6, "args": { "External id": 211049, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211049, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211049, "pid": 5, "tid": 7, "ts": 1716454224784559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729243, "dur": 12, "args": { "External id": 211049, "cbid": 211, "correlation": 211049 } }, { "ph": "s", "id": 211049, "pid": 76337, "tid": -914061504, "ts": 1716454224729243, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224784566, "dur": 37, "args": { "External id": 211051, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211051, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 1.5, "warps per SM": 6, "grid": [20, 6, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 211051, "pid": 5, "tid": 7, "ts": 1716454224784566, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729263, "dur": 6, "args": { "External id": 211051, "cbid": 211, "correlation": 211051 } }, { "ph": "s", "id": 211051, "pid": 76337, "tid": -914061504, "ts": 1716454224729263, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224784604, "dur": 6, "args": { "External id": 211053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211053, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211053, "pid": 5, "tid": 7, "ts": 1716454224784604, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729273, "dur": 5, "args": { "External id": 211053, "cbid": 211, "correlation": 211053 } }, { "ph": "s", "id": 211053, "pid": 76337, "tid": -914061504, "ts": 1716454224729273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784611, "dur": 7, "args": { "External id": 211059, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211059, "pid": 5, "tid": 7, "ts": 1716454224784611, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729300, "dur": 9, "args": { "External id": 211059, "cbid": 211, "correlation": 211059 } }, { "ph": "s", "id": 211059, "pid": 76337, "tid": -914061504, "ts": 1716454224729300, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224784620, "dur": 5, "args": { "External id": 211067, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211067, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211067, "pid": 5, "tid": 7, "ts": 1716454224784620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729331, "dur": 8, "args": { "External id": 211067, "cbid": 211, "correlation": 211067 } }, { "ph": "s", "id": 211067, "pid": 76337, "tid": -914061504, "ts": 1716454224729331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224784626, "dur": 11, "args": { "External id": 211087, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211087, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211087, "pid": 5, "tid": 7, "ts": 1716454224784626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729404, "dur": 12, "args": { "External id": 211087, "cbid": 211, "correlation": 211087 } }, { "ph": "s", "id": 211087, "pid": 76337, "tid": -914061504, "ts": 1716454224729404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224784639, "dur": 5, "args": { "External id": 211099, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211099, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 211099, "pid": 5, "tid": 7, "ts": 1716454224784639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729426, "dur": 6, "args": { "External id": 211099, "cbid": 211, "correlation": 211099 } }, { "ph": "s", "id": 211099, "pid": 76337, "tid": -914061504, "ts": 1716454224729426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224784645, "dur": 9, "args": { "External id": 211102, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211102, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211102, "pid": 5, "tid": 7, "ts": 1716454224784645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729444, "dur": 7, "args": { "External id": 211102, "cbid": 211, "correlation": 211102 } }, { "ph": "s", "id": 211102, "pid": 76337, "tid": -914061504, "ts": 1716454224729444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224784655, "dur": 5, "args": { "External id": 211111, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211111, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211111, "pid": 5, "tid": 7, "ts": 1716454224784655, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729483, "dur": 9, "args": { "External id": 211111, "cbid": 211, "correlation": 211111 } }, { "ph": "s", "id": 211111, "pid": 76337, "tid": -914061504, "ts": 1716454224729483, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224729534, "dur": 0, "args": { "External id": 211121, "cbid": 317, "correlation": 211121 } }, { "ph": "f", "id": 211121, "pid": 76337, "tid": -914061504, "ts": 1716454224729534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224729535, "dur": 0, "args": { "External id": 211122, "cbid": 203, "correlation": 211122 } }, { "ph": "f", "id": 211122, "pid": 76337, "tid": -914061504, "ts": 1716454224729535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224729535, "dur": 0, "args": { "External id": 211123, "cbid": 205, "correlation": 211123 } }, { "ph": "f", "id": 211123, "pid": 76337, "tid": -914061504, "ts": 1716454224729535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224784662, "dur": 5, "args": { "External id": 211127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211127, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211127, "pid": 5, "tid": 7, "ts": 1716454224784662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729548, "dur": 12, "args": { "External id": 211127, "cbid": 211, "correlation": 211127 } }, { "ph": "s", "id": 211127, "pid": 76337, "tid": -914061504, "ts": 1716454224729548, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224784668, "dur": 163, "args": { "External id": 211129, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211129, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211129, "pid": 5, "tid": 7, "ts": 1716454224784668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729563, "dur": 5, "args": { "External id": 211129, "cbid": 211, "correlation": 211129 } }, { "ph": "s", "id": 211129, "pid": 76337, "tid": -914061504, "ts": 1716454224729563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224784834, "dur": 1, "args": { "External id": 211131, "device": 5, "context": 1, "stream": 7, "correlation": 211131, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211131, "pid": 5, "tid": 7, "ts": 1716454224784834, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224729574, "dur": 7, "args": { "External id": 211131, "cbid": 51, "correlation": 211131 } }, { "ph": "s", "id": 211131, "pid": 76337, "tid": -914061504, "ts": 1716454224729574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224784837, "dur": 272, "args": { "External id": 211132, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211132, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211132, "pid": 5, "tid": 7, "ts": 1716454224784837, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729582, "dur": 7, "args": { "External id": 211132, "cbid": 211, "correlation": 211132 } }, { "ph": "s", "id": 211132, "pid": 76337, "tid": -914061504, "ts": 1716454224729582, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785111, "dur": 5, "args": { "External id": 211134, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211134, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211134, "pid": 5, "tid": 7, "ts": 1716454224785111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729592, "dur": 5, "args": { "External id": 211134, "cbid": 211, "correlation": 211134 } }, { "ph": "s", "id": 211134, "pid": 76337, "tid": -914061504, "ts": 1716454224729592, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224785118, "dur": 6, "args": { "External id": 211140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211140, "pid": 5, "tid": 7, "ts": 1716454224785118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729620, "dur": 9, "args": { "External id": 211140, "cbid": 211, "correlation": 211140 } }, { "ph": "s", "id": 211140, "pid": 76337, "tid": -914061504, "ts": 1716454224729620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224785125, "dur": 3, "args": { "External id": 211148, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211148, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 211148, "pid": 5, "tid": 7, "ts": 1716454224785125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729664, "dur": 9, "args": { "External id": 211148, "cbid": 211, "correlation": 211148 } }, { "ph": "s", "id": 211148, "pid": 76337, "tid": -914061504, "ts": 1716454224729664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224729727, "dur": 1, "args": { "External id": 211164, "cbid": 251, "correlation": 211164 } }, { "ph": "f", "id": 211164, "pid": 76337, "tid": -914061504, "ts": 1716454224729727, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224729732, "dur": 0, "args": { "External id": 211166, "cbid": 251, "correlation": 211166 } }, { "ph": "f", "id": 211166, "pid": 76337, "tid": -914061504, "ts": 1716454224729732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224785130, "dur": 13, "args": { "External id": 211167, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211167, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211167, "pid": 5, "tid": 7, "ts": 1716454224785130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729733, "dur": 11, "args": { "External id": 211167, "cbid": 211, "correlation": 211167 } }, { "ph": "s", "id": 211167, "pid": 76337, "tid": -914061504, "ts": 1716454224729733, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224785144, "dur": 5, "args": { "External id": 211169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211169, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211169, "pid": 5, "tid": 7, "ts": 1716454224785144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729746, "dur": 5, "args": { "External id": 211169, "cbid": 211, "correlation": 211169 } }, { "ph": "s", "id": 211169, "pid": 76337, "tid": -914061504, "ts": 1716454224729746, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224785150, "dur": 6, "args": { "External id": 211179, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211179, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211179, "pid": 5, "tid": 7, "ts": 1716454224785150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729803, "dur": 13, "args": { "External id": 211179, "cbid": 211, "correlation": 211179 } }, { "ph": "s", "id": 211179, "pid": 76337, "tid": -914061504, "ts": 1716454224729803, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224785157, "dur": 10, "args": { "External id": 211199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211199, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211199, "pid": 5, "tid": 7, "ts": 1716454224785157, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729871, "dur": 10, "args": { "External id": 211199, "cbid": 211, "correlation": 211199 } }, { "ph": "s", "id": 211199, "pid": 76337, "tid": -914061504, "ts": 1716454224729871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224785169, "dur": 4, "args": { "External id": 211211, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211211, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 211211, "pid": 5, "tid": 7, "ts": 1716454224785169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729891, "dur": 6, "args": { "External id": 211211, "cbid": 211, "correlation": 211211 } }, { "ph": "s", "id": 211211, "pid": 76337, "tid": -914061504, "ts": 1716454224729891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224785174, "dur": 7, "args": { "External id": 211214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211214, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211214, "pid": 5, "tid": 7, "ts": 1716454224785174, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729911, "dur": 7, "args": { "External id": 211214, "cbid": 211, "correlation": 211214 } }, { "ph": "s", "id": 211214, "pid": 76337, "tid": -914061504, "ts": 1716454224729911, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224785182, "dur": 5, "args": { "External id": 211223, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211223, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211223, "pid": 5, "tid": 7, "ts": 1716454224785182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224729950, "dur": 10, "args": { "External id": 211223, "cbid": 211, "correlation": 211223 } }, { "ph": "s", "id": 211223, "pid": 76337, "tid": -914061504, "ts": 1716454224729950, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224730022, "dur": 0, "args": { "External id": 211233, "cbid": 317, "correlation": 211233 } }, { "ph": "f", "id": 211233, "pid": 76337, "tid": -914061504, "ts": 1716454224730022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224730023, "dur": 0, "args": { "External id": 211234, "cbid": 203, "correlation": 211234 } }, { "ph": "f", "id": 211234, "pid": 76337, "tid": -914061504, "ts": 1716454224730023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224730024, "dur": 0, "args": { "External id": 211235, "cbid": 205, "correlation": 211235 } }, { "ph": "f", "id": 211235, "pid": 76337, "tid": -914061504, "ts": 1716454224730024, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785188, "dur": 5, "args": { "External id": 211239, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211239, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211239, "pid": 5, "tid": 7, "ts": 1716454224785188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730037, "dur": 12, "args": { "External id": 211239, "cbid": 211, "correlation": 211239 } }, { "ph": "s", "id": 211239, "pid": 76337, "tid": -914061504, "ts": 1716454224730037, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785195, "dur": 164, "args": { "External id": 211241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211241, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211241, "pid": 5, "tid": 7, "ts": 1716454224785195, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730052, "dur": 6, "args": { "External id": 211241, "cbid": 211, "correlation": 211241 } }, { "ph": "s", "id": 211241, "pid": 76337, "tid": -914061504, "ts": 1716454224730052, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224785361, "dur": 1, "args": { "External id": 211243, "device": 5, "context": 1, "stream": 7, "correlation": 211243, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211243, "pid": 5, "tid": 7, "ts": 1716454224785361, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224730064, "dur": 6, "args": { "External id": 211243, "cbid": 51, "correlation": 211243 } }, { "ph": "s", "id": 211243, "pid": 76337, "tid": -914061504, "ts": 1716454224730064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224785365, "dur": 261, "args": { "External id": 211244, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211244, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211244, "pid": 5, "tid": 7, "ts": 1716454224785365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730071, "dur": 6, "args": { "External id": 211244, "cbid": 211, "correlation": 211244 } }, { "ph": "s", "id": 211244, "pid": 76337, "tid": -914061504, "ts": 1716454224730071, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785627, "dur": 6, "args": { "External id": 211246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211246, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211246, "pid": 5, "tid": 7, "ts": 1716454224785627, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730081, "dur": 5, "args": { "External id": 211246, "cbid": 211, "correlation": 211246 } }, { "ph": "s", "id": 211246, "pid": 76337, "tid": -914061504, "ts": 1716454224730081, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224785635, "dur": 6, "args": { "External id": 211252, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211252, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211252, "pid": 5, "tid": 7, "ts": 1716454224785635, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730110, "dur": 8, "args": { "External id": 211252, "cbid": 211, "correlation": 211252 } }, { "ph": "s", "id": 211252, "pid": 76337, "tid": -914061504, "ts": 1716454224730110, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224785642, "dur": 5, "args": { "External id": 211260, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211260, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211260, "pid": 5, "tid": 7, "ts": 1716454224785642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730142, "dur": 8, "args": { "External id": 211260, "cbid": 211, "correlation": 211260 } }, { "ph": "s", "id": 211260, "pid": 76337, "tid": -914061504, "ts": 1716454224730142, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224785648, "dur": 4, "args": { "External id": 211268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211268, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211268, "pid": 5, "tid": 7, "ts": 1716454224785648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730171, "dur": 8, "args": { "External id": 211268, "cbid": 211, "correlation": 211268 } }, { "ph": "s", "id": 211268, "pid": 76337, "tid": -914061504, "ts": 1716454224730171, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224785654, "dur": 11, "args": { "External id": 211277, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211277, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211277, "pid": 5, "tid": 7, "ts": 1716454224785654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730260, "dur": 14, "args": { "External id": 211277, "cbid": 211, "correlation": 211277 } }, { "ph": "s", "id": 211277, "pid": 76337, "tid": -914061504, "ts": 1716454224730260, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224785667, "dur": 12, "args": { "External id": 211297, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211297, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211297, "pid": 5, "tid": 7, "ts": 1716454224785667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730328, "dur": 11, "args": { "External id": 211297, "cbid": 211, "correlation": 211297 } }, { "ph": "s", "id": 211297, "pid": 76337, "tid": -914061504, "ts": 1716454224730328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224785680, "dur": 4, "args": { "External id": 211309, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211309, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211309, "pid": 5, "tid": 7, "ts": 1716454224785680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730349, "dur": 6, "args": { "External id": 211309, "cbid": 211, "correlation": 211309 } }, { "ph": "s", "id": 211309, "pid": 76337, "tid": -914061504, "ts": 1716454224730349, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224785686, "dur": 10, "args": { "External id": 211312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211312, "pid": 5, "tid": 7, "ts": 1716454224785686, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730368, "dur": 6, "args": { "External id": 211312, "cbid": 211, "correlation": 211312 } }, { "ph": "s", "id": 211312, "pid": 76337, "tid": -914061504, "ts": 1716454224730368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224785697, "dur": 6, "args": { "External id": 211321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211321, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211321, "pid": 5, "tid": 7, "ts": 1716454224785697, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730406, "dur": 9, "args": { "External id": 211321, "cbid": 211, "correlation": 211321 } }, { "ph": "s", "id": 211321, "pid": 76337, "tid": -914061504, "ts": 1716454224730406, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224730458, "dur": 0, "args": { "External id": 211331, "cbid": 317, "correlation": 211331 } }, { "ph": "f", "id": 211331, "pid": 76337, "tid": -914061504, "ts": 1716454224730458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224730458, "dur": 0, "args": { "External id": 211332, "cbid": 203, "correlation": 211332 } }, { "ph": "f", "id": 211332, "pid": 76337, "tid": -914061504, "ts": 1716454224730458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224730459, "dur": 0, "args": { "External id": 211333, "cbid": 205, "correlation": 211333 } }, { "ph": "f", "id": 211333, "pid": 76337, "tid": -914061504, "ts": 1716454224730459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785705, "dur": 7, "args": { "External id": 211337, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211337, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211337, "pid": 5, "tid": 7, "ts": 1716454224785705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730475, "dur": 11, "args": { "External id": 211337, "cbid": 211, "correlation": 211337 } }, { "ph": "s", "id": 211337, "pid": 76337, "tid": -914061504, "ts": 1716454224730475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224785713, "dur": 322, "args": { "External id": 211339, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211339, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211339, "pid": 5, "tid": 7, "ts": 1716454224785713, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730488, "dur": 5, "args": { "External id": 211339, "cbid": 211, "correlation": 211339 } }, { "ph": "s", "id": 211339, "pid": 76337, "tid": -914061504, "ts": 1716454224730488, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224786037, "dur": 1, "args": { "External id": 211341, "device": 5, "context": 1, "stream": 7, "correlation": 211341, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 211341, "pid": 5, "tid": 7, "ts": 1716454224786037, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224730499, "dur": 7, "args": { "External id": 211341, "cbid": 51, "correlation": 211341 } }, { "ph": "s", "id": 211341, "pid": 76337, "tid": -914061504, "ts": 1716454224730499, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224786040, "dur": 498, "args": { "External id": 211342, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211342, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211342, "pid": 5, "tid": 7, "ts": 1716454224786040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730508, "dur": 6, "args": { "External id": 211342, "cbid": 211, "correlation": 211342 } }, { "ph": "s", "id": 211342, "pid": 76337, "tid": -914061504, "ts": 1716454224730508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224786540, "dur": 6, "args": { "External id": 211344, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211344, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211344, "pid": 5, "tid": 7, "ts": 1716454224786540, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730518, "dur": 5, "args": { "External id": 211344, "cbid": 211, "correlation": 211344 } }, { "ph": "s", "id": 211344, "pid": 76337, "tid": -914061504, "ts": 1716454224730518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224786547, "dur": 6, "args": { "External id": 211350, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211350, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211350, "pid": 5, "tid": 7, "ts": 1716454224786547, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730545, "dur": 9, "args": { "External id": 211350, "cbid": 211, "correlation": 211350 } }, { "ph": "s", "id": 211350, "pid": 76337, "tid": -914061504, "ts": 1716454224730545, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224786554, "dur": 3, "args": { "External id": 211358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211358, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 211358, "pid": 5, "tid": 7, "ts": 1716454224786554, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730589, "dur": 10, "args": { "External id": 211358, "cbid": 211, "correlation": 211358 } }, { "ph": "s", "id": 211358, "pid": 76337, "tid": -914061504, "ts": 1716454224730589, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224730653, "dur": 1, "args": { "External id": 211374, "cbid": 251, "correlation": 211374 } }, { "ph": "f", "id": 211374, "pid": 76337, "tid": -914061504, "ts": 1716454224730653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224730658, "dur": 0, "args": { "External id": 211376, "cbid": 251, "correlation": 211376 } }, { "ph": "f", "id": 211376, "pid": 76337, "tid": -914061504, "ts": 1716454224730658, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224786559, "dur": 11, "args": { "External id": 211377, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211377, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211377, "pid": 5, "tid": 7, "ts": 1716454224786559, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730660, "dur": 11, "args": { "External id": 211377, "cbid": 211, "correlation": 211377 } }, { "ph": "s", "id": 211377, "pid": 76337, "tid": -914061504, "ts": 1716454224730660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224786572, "dur": 5, "args": { "External id": 211379, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211379, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211379, "pid": 5, "tid": 7, "ts": 1716454224786572, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730674, "dur": 6, "args": { "External id": 211379, "cbid": 211, "correlation": 211379 } }, { "ph": "s", "id": 211379, "pid": 76337, "tid": -914061504, "ts": 1716454224730674, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224786578, "dur": 6, "args": { "External id": 211389, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211389, "pid": 5, "tid": 7, "ts": 1716454224786578, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730731, "dur": 12, "args": { "External id": 211389, "cbid": 211, "correlation": 211389 } }, { "ph": "s", "id": 211389, "pid": 76337, "tid": -914061504, "ts": 1716454224730731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224786585, "dur": 9, "args": { "External id": 211409, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211409, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211409, "pid": 5, "tid": 7, "ts": 1716454224786585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730796, "dur": 11, "args": { "External id": 211409, "cbid": 211, "correlation": 211409 } }, { "ph": "s", "id": 211409, "pid": 76337, "tid": -914061504, "ts": 1716454224730796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224786596, "dur": 4, "args": { "External id": 211421, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211421, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 211421, "pid": 5, "tid": 7, "ts": 1716454224786596, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730818, "dur": 6, "args": { "External id": 211421, "cbid": 211, "correlation": 211421 } }, { "ph": "s", "id": 211421, "pid": 76337, "tid": -914061504, "ts": 1716454224730818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224786601, "dur": 7, "args": { "External id": 211424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211424, "pid": 5, "tid": 7, "ts": 1716454224786601, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730837, "dur": 6, "args": { "External id": 211424, "cbid": 211, "correlation": 211424 } }, { "ph": "s", "id": 211424, "pid": 76337, "tid": -914061504, "ts": 1716454224730837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224786609, "dur": 5, "args": { "External id": 211433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211433, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211433, "pid": 5, "tid": 7, "ts": 1716454224786609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730877, "dur": 10, "args": { "External id": 211433, "cbid": 211, "correlation": 211433 } }, { "ph": "s", "id": 211433, "pid": 76337, "tid": -914061504, "ts": 1716454224730877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224730939, "dur": 0, "args": { "External id": 211443, "cbid": 317, "correlation": 211443 } }, { "ph": "f", "id": 211443, "pid": 76337, "tid": -914061504, "ts": 1716454224730939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224730940, "dur": 0, "args": { "External id": 211444, "cbid": 203, "correlation": 211444 } }, { "ph": "f", "id": 211444, "pid": 76337, "tid": -914061504, "ts": 1716454224730940, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224730941, "dur": 0, "args": { "External id": 211445, "cbid": 205, "correlation": 211445 } }, { "ph": "f", "id": 211445, "pid": 76337, "tid": -914061504, "ts": 1716454224730941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224786615, "dur": 5, "args": { "External id": 211449, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211449, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211449, "pid": 5, "tid": 7, "ts": 1716454224786615, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730954, "dur": 12, "args": { "External id": 211449, "cbid": 211, "correlation": 211449 } }, { "ph": "s", "id": 211449, "pid": 76337, "tid": -914061504, "ts": 1716454224730954, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224786621, "dur": 163, "args": { "External id": 211451, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211451, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211451, "pid": 5, "tid": 7, "ts": 1716454224786621, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730969, "dur": 13, "args": { "External id": 211451, "cbid": 211, "correlation": 211451 } }, { "ph": "s", "id": 211451, "pid": 76337, "tid": -914061504, "ts": 1716454224730969, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224786787, "dur": 1, "args": { "External id": 211453, "device": 5, "context": 1, "stream": 7, "correlation": 211453, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211453, "pid": 5, "tid": 7, "ts": 1716454224786787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224730987, "dur": 8, "args": { "External id": 211453, "cbid": 51, "correlation": 211453 } }, { "ph": "s", "id": 211453, "pid": 76337, "tid": -914061504, "ts": 1716454224730987, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224786790, "dur": 261, "args": { "External id": 211454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211454, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211454, "pid": 5, "tid": 7, "ts": 1716454224786790, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224730996, "dur": 7, "args": { "External id": 211454, "cbid": 211, "correlation": 211454 } }, { "ph": "s", "id": 211454, "pid": 76337, "tid": -914061504, "ts": 1716454224730996, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224787052, "dur": 6, "args": { "External id": 211456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211456, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211456, "pid": 5, "tid": 7, "ts": 1716454224787052, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731007, "dur": 5, "args": { "External id": 211456, "cbid": 211, "correlation": 211456 } }, { "ph": "s", "id": 211456, "pid": 76337, "tid": -914061504, "ts": 1716454224731007, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224787059, "dur": 6, "args": { "External id": 211462, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211462, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211462, "pid": 5, "tid": 7, "ts": 1716454224787059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731035, "dur": 9, "args": { "External id": 211462, "cbid": 211, "correlation": 211462 } }, { "ph": "s", "id": 211462, "pid": 76337, "tid": -914061504, "ts": 1716454224731035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224731094, "dur": 0, "args": { "External id": 211472, "cbid": 317, "correlation": 211472 } }, { "ph": "f", "id": 211472, "pid": 76337, "tid": -914061504, "ts": 1716454224731094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224731094, "dur": 0, "args": { "External id": 211473, "cbid": 203, "correlation": 211473 } }, { "ph": "f", "id": 211473, "pid": 76337, "tid": -914061504, "ts": 1716454224731094, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224731095, "dur": 0, "args": { "External id": 211474, "cbid": 205, "correlation": 211474 } }, { "ph": "f", "id": 211474, "pid": 76337, "tid": -914061504, "ts": 1716454224731095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224787067, "dur": 8, "args": { "External id": 211478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211478, "pid": 5, "tid": 7, "ts": 1716454224787067, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731109, "dur": 12, "args": { "External id": 211478, "cbid": 211, "correlation": 211478 } }, { "ph": "s", "id": 211478, "pid": 76337, "tid": -914061504, "ts": 1716454224731109, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224787076, "dur": 3, "args": { "External id": 211480, "device": 5, "context": 1, "stream": 7, "correlation": 211480, "bytes": 4800, "memory bandwidth (GB/s)": 1.5 } }, { "ph": "f", "id": 211480, "pid": 5, "tid": 7, "ts": 1716454224787076, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224731127, "dur": 13, "args": { "External id": 211480, "cbid": 51, "correlation": 211480 } }, { "ph": "s", "id": 211480, "pid": 76337, "tid": -914061504, "ts": 1716454224731127, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224787080, "dur": 98, "args": { "External id": 211481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211481, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 211481, "pid": 5, "tid": 7, "ts": 1716454224787080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731141, "dur": 7, "args": { "External id": 211481, "cbid": 211, "correlation": 211481 } }, { "ph": "s", "id": 211481, "pid": 76337, "tid": -914061504, "ts": 1716454224731141, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224787179, "dur": 6, "args": { "External id": 211483, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211483, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211483, "pid": 5, "tid": 7, "ts": 1716454224787179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731152, "dur": 5, "args": { "External id": 211483, "cbid": 211, "correlation": 211483 } }, { "ph": "s", "id": 211483, "pid": 76337, "tid": -914061504, "ts": 1716454224731152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224787186, "dur": 6, "args": { "External id": 211489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211489, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211489, "pid": 5, "tid": 7, "ts": 1716454224787186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731178, "dur": 8, "args": { "External id": 211489, "cbid": 211, "correlation": 211489 } }, { "ph": "s", "id": 211489, "pid": 76337, "tid": -914061504, "ts": 1716454224731178, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224787194, "dur": 5, "args": { "External id": 211497, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211497, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211497, "pid": 5, "tid": 7, "ts": 1716454224787194, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731207, "dur": 7, "args": { "External id": 211497, "cbid": 211, "correlation": 211497 } }, { "ph": "s", "id": 211497, "pid": 76337, "tid": -914061504, "ts": 1716454224731207, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224787200, "dur": 4, "args": { "External id": 211505, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211505, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211505, "pid": 5, "tid": 7, "ts": 1716454224787200, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731235, "dur": 9, "args": { "External id": 211505, "cbid": 211, "correlation": 211505 } }, { "ph": "s", "id": 211505, "pid": 76337, "tid": -914061504, "ts": 1716454224731235, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224787206, "dur": 11, "args": { "External id": 211514, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211514, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211514, "pid": 5, "tid": 7, "ts": 1716454224787206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731279, "dur": 10, "args": { "External id": 211514, "cbid": 211, "correlation": 211514 } }, { "ph": "s", "id": 211514, "pid": 76337, "tid": -914061504, "ts": 1716454224731279, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224787218, "dur": 12, "args": { "External id": 211534, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211534, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211534, "pid": 5, "tid": 7, "ts": 1716454224787218, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731350, "dur": 12, "args": { "External id": 211534, "cbid": 211, "correlation": 211534 } }, { "ph": "s", "id": 211534, "pid": 76337, "tid": -914061504, "ts": 1716454224731350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224787231, "dur": 4, "args": { "External id": 211546, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211546, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211546, "pid": 5, "tid": 7, "ts": 1716454224787231, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731371, "dur": 6, "args": { "External id": 211546, "cbid": 211, "correlation": 211546 } }, { "ph": "s", "id": 211546, "pid": 76337, "tid": -914061504, "ts": 1716454224731371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224787237, "dur": 11, "args": { "External id": 211549, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211549, "pid": 5, "tid": 7, "ts": 1716454224787237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731388, "dur": 7, "args": { "External id": 211549, "cbid": 211, "correlation": 211549 } }, { "ph": "s", "id": 211549, "pid": 76337, "tid": -914061504, "ts": 1716454224731388, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224787249, "dur": 6, "args": { "External id": 211558, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211558, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211558, "pid": 5, "tid": 7, "ts": 1716454224787249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731426, "dur": 9, "args": { "External id": 211558, "cbid": 211, "correlation": 211558 } }, { "ph": "s", "id": 211558, "pid": 76337, "tid": -914061504, "ts": 1716454224731426, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224731477, "dur": 0, "args": { "External id": 211568, "cbid": 317, "correlation": 211568 } }, { "ph": "f", "id": 211568, "pid": 76337, "tid": -914061504, "ts": 1716454224731477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224731478, "dur": 0, "args": { "External id": 211569, "cbid": 203, "correlation": 211569 } }, { "ph": "f", "id": 211569, "pid": 76337, "tid": -914061504, "ts": 1716454224731478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224731479, "dur": 0, "args": { "External id": 211570, "cbid": 205, "correlation": 211570 } }, { "ph": "f", "id": 211570, "pid": 76337, "tid": -914061504, "ts": 1716454224731479, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224787257, "dur": 6, "args": { "External id": 211574, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211574, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211574, "pid": 5, "tid": 7, "ts": 1716454224787257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731492, "dur": 12, "args": { "External id": 211574, "cbid": 211, "correlation": 211574 } }, { "ph": "s", "id": 211574, "pid": 76337, "tid": -914061504, "ts": 1716454224731492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224787264, "dur": 322, "args": { "External id": 211576, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211576, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211576, "pid": 5, "tid": 7, "ts": 1716454224787264, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731507, "dur": 5, "args": { "External id": 211576, "cbid": 211, "correlation": 211576 } }, { "ph": "s", "id": 211576, "pid": 76337, "tid": -914061504, "ts": 1716454224731507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224787588, "dur": 1, "args": { "External id": 211578, "device": 5, "context": 1, "stream": 7, "correlation": 211578, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211578, "pid": 5, "tid": 7, "ts": 1716454224787588, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224731518, "dur": 6, "args": { "External id": 211578, "cbid": 51, "correlation": 211578 } }, { "ph": "s", "id": 211578, "pid": 76337, "tid": -914061504, "ts": 1716454224731518, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224787592, "dur": 500, "args": { "External id": 211579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211579, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211579, "pid": 5, "tid": 7, "ts": 1716454224787592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731525, "dur": 6, "args": { "External id": 211579, "cbid": 211, "correlation": 211579 } }, { "ph": "s", "id": 211579, "pid": 76337, "tid": -914061504, "ts": 1716454224731525, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788093, "dur": 5, "args": { "External id": 211581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211581, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211581, "pid": 5, "tid": 7, "ts": 1716454224788093, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731534, "dur": 5, "args": { "External id": 211581, "cbid": 211, "correlation": 211581 } }, { "ph": "s", "id": 211581, "pid": 76337, "tid": -914061504, "ts": 1716454224731534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788100, "dur": 6, "args": { "External id": 211587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211587, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211587, "pid": 5, "tid": 7, "ts": 1716454224788100, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731563, "dur": 8, "args": { "External id": 211587, "cbid": 211, "correlation": 211587 } }, { "ph": "s", "id": 211587, "pid": 76337, "tid": -914061504, "ts": 1716454224731563, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224788108, "dur": 3, "args": { "External id": 211595, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211595, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 211595, "pid": 5, "tid": 7, "ts": 1716454224788108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731606, "dur": 9, "args": { "External id": 211595, "cbid": 211, "correlation": 211595 } }, { "ph": "s", "id": 211595, "pid": 76337, "tid": -914061504, "ts": 1716454224731606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224731668, "dur": 1, "args": { "External id": 211611, "cbid": 251, "correlation": 211611 } }, { "ph": "f", "id": 211611, "pid": 76337, "tid": -914061504, "ts": 1716454224731668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224731673, "dur": 0, "args": { "External id": 211613, "cbid": 251, "correlation": 211613 } }, { "ph": "f", "id": 211613, "pid": 76337, "tid": -914061504, "ts": 1716454224731673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224788112, "dur": 13, "args": { "External id": 211614, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211614, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211614, "pid": 5, "tid": 7, "ts": 1716454224788112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731675, "dur": 11, "args": { "External id": 211614, "cbid": 211, "correlation": 211614 } }, { "ph": "s", "id": 211614, "pid": 76337, "tid": -914061504, "ts": 1716454224731675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224788126, "dur": 5, "args": { "External id": 211616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211616, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211616, "pid": 5, "tid": 7, "ts": 1716454224788126, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731688, "dur": 5, "args": { "External id": 211616, "cbid": 211, "correlation": 211616 } }, { "ph": "s", "id": 211616, "pid": 76337, "tid": -914061504, "ts": 1716454224731688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788133, "dur": 6, "args": { "External id": 211626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211626, "pid": 5, "tid": 7, "ts": 1716454224788133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731744, "dur": 13, "args": { "External id": 211626, "cbid": 211, "correlation": 211626 } }, { "ph": "s", "id": 211626, "pid": 76337, "tid": -914061504, "ts": 1716454224731744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224788141, "dur": 9, "args": { "External id": 211646, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211646, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211646, "pid": 5, "tid": 7, "ts": 1716454224788141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731812, "dur": 11, "args": { "External id": 211646, "cbid": 211, "correlation": 211646 } }, { "ph": "s", "id": 211646, "pid": 76337, "tid": -914061504, "ts": 1716454224731812, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224788151, "dur": 4, "args": { "External id": 211658, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211658, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 211658, "pid": 5, "tid": 7, "ts": 1716454224788151, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731832, "dur": 6, "args": { "External id": 211658, "cbid": 211, "correlation": 211658 } }, { "ph": "s", "id": 211658, "pid": 76337, "tid": -914061504, "ts": 1716454224731832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788156, "dur": 7, "args": { "External id": 211661, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211661, "pid": 5, "tid": 7, "ts": 1716454224788156, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731852, "dur": 6, "args": { "External id": 211661, "cbid": 211, "correlation": 211661 } }, { "ph": "s", "id": 211661, "pid": 76337, "tid": -914061504, "ts": 1716454224731852, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224788164, "dur": 5, "args": { "External id": 211670, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211670, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211670, "pid": 5, "tid": 7, "ts": 1716454224788164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731892, "dur": 10, "args": { "External id": 211670, "cbid": 211, "correlation": 211670 } }, { "ph": "s", "id": 211670, "pid": 76337, "tid": -914061504, "ts": 1716454224731892, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224731955, "dur": 0, "args": { "External id": 211680, "cbid": 317, "correlation": 211680 } }, { "ph": "f", "id": 211680, "pid": 76337, "tid": -914061504, "ts": 1716454224731955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224731956, "dur": 0, "args": { "External id": 211681, "cbid": 203, "correlation": 211681 } }, { "ph": "f", "id": 211681, "pid": 76337, "tid": -914061504, "ts": 1716454224731956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224731956, "dur": 0, "args": { "External id": 211682, "cbid": 205, "correlation": 211682 } }, { "ph": "f", "id": 211682, "pid": 76337, "tid": -914061504, "ts": 1716454224731956, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788170, "dur": 5, "args": { "External id": 211686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211686, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211686, "pid": 5, "tid": 7, "ts": 1716454224788170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731970, "dur": 21, "args": { "External id": 211686, "cbid": 211, "correlation": 211686 } }, { "ph": "s", "id": 211686, "pid": 76337, "tid": -914061504, "ts": 1716454224731970, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788177, "dur": 164, "args": { "External id": 211688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211688, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211688, "pid": 5, "tid": 7, "ts": 1716454224788177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224731993, "dur": 5, "args": { "External id": 211688, "cbid": 211, "correlation": 211688 } }, { "ph": "s", "id": 211688, "pid": 76337, "tid": -914061504, "ts": 1716454224731993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224788343, "dur": 1, "args": { "External id": 211690, "device": 5, "context": 1, "stream": 7, "correlation": 211690, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211690, "pid": 5, "tid": 7, "ts": 1716454224788343, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224732004, "dur": 7, "args": { "External id": 211690, "cbid": 51, "correlation": 211690 } }, { "ph": "s", "id": 211690, "pid": 76337, "tid": -914061504, "ts": 1716454224732004, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224788346, "dur": 261, "args": { "External id": 211691, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211691, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211691, "pid": 5, "tid": 7, "ts": 1716454224788346, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732012, "dur": 6, "args": { "External id": 211691, "cbid": 211, "correlation": 211691 } }, { "ph": "s", "id": 211691, "pid": 76337, "tid": -914061504, "ts": 1716454224732012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788609, "dur": 6, "args": { "External id": 211693, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211693, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211693, "pid": 5, "tid": 7, "ts": 1716454224788609, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732022, "dur": 5, "args": { "External id": 211693, "cbid": 211, "correlation": 211693 } }, { "ph": "s", "id": 211693, "pid": 76337, "tid": -914061504, "ts": 1716454224732022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788617, "dur": 6, "args": { "External id": 211699, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211699, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211699, "pid": 5, "tid": 7, "ts": 1716454224788617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732051, "dur": 8, "args": { "External id": 211699, "cbid": 211, "correlation": 211699 } }, { "ph": "s", "id": 211699, "pid": 76337, "tid": -914061504, "ts": 1716454224732051, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224732110, "dur": 0, "args": { "External id": 211709, "cbid": 317, "correlation": 211709 } }, { "ph": "f", "id": 211709, "pid": 76337, "tid": -914061504, "ts": 1716454224732110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224732111, "dur": 0, "args": { "External id": 211710, "cbid": 203, "correlation": 211710 } }, { "ph": "f", "id": 211710, "pid": 76337, "tid": -914061504, "ts": 1716454224732111, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224732112, "dur": 0, "args": { "External id": 211711, "cbid": 205, "correlation": 211711 } }, { "ph": "f", "id": 211711, "pid": 76337, "tid": -914061504, "ts": 1716454224732112, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788624, "dur": 8, "args": { "External id": 211715, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211715, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211715, "pid": 5, "tid": 7, "ts": 1716454224788624, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732123, "dur": 11, "args": { "External id": 211715, "cbid": 211, "correlation": 211715 } }, { "ph": "s", "id": 211715, "pid": 76337, "tid": -914061504, "ts": 1716454224732123, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224788633, "dur": 3, "args": { "External id": 211717, "device": 5, "context": 1, "stream": 7, "correlation": 211717, "bytes": 4800, "memory bandwidth (GB/s)": 1.4563106796116505 } }, { "ph": "f", "id": 211717, "pid": 5, "tid": 7, "ts": 1716454224788633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224732140, "dur": 10, "args": { "External id": 211717, "cbid": 51, "correlation": 211717 } }, { "ph": "s", "id": 211717, "pid": 76337, "tid": -914061504, "ts": 1716454224732140, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224788637, "dur": 94, "args": { "External id": 211718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211718, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 211718, "pid": 5, "tid": 7, "ts": 1716454224788637, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732151, "dur": 6, "args": { "External id": 211718, "cbid": 211, "correlation": 211718 } }, { "ph": "s", "id": 211718, "pid": 76337, "tid": -914061504, "ts": 1716454224732151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788733, "dur": 6, "args": { "External id": 211720, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211720, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211720, "pid": 5, "tid": 7, "ts": 1716454224788733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732160, "dur": 5, "args": { "External id": 211720, "cbid": 211, "correlation": 211720 } }, { "ph": "s", "id": 211720, "pid": 76337, "tid": -914061504, "ts": 1716454224732160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788740, "dur": 6, "args": { "External id": 211726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211726, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211726, "pid": 5, "tid": 7, "ts": 1716454224788740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732187, "dur": 9, "args": { "External id": 211726, "cbid": 211, "correlation": 211726 } }, { "ph": "s", "id": 211726, "pid": 76337, "tid": -914061504, "ts": 1716454224732187, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224788748, "dur": 5, "args": { "External id": 211734, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211734, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211734, "pid": 5, "tid": 7, "ts": 1716454224788748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732216, "dur": 8, "args": { "External id": 211734, "cbid": 211, "correlation": 211734 } }, { "ph": "s", "id": 211734, "pid": 76337, "tid": -914061504, "ts": 1716454224732216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224788754, "dur": 4, "args": { "External id": 211742, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211742, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211742, "pid": 5, "tid": 7, "ts": 1716454224788754, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732245, "dur": 8, "args": { "External id": 211742, "cbid": 211, "correlation": 211742 } }, { "ph": "s", "id": 211742, "pid": 76337, "tid": -914061504, "ts": 1716454224732245, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224788760, "dur": 11, "args": { "External id": 211751, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211751, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211751, "pid": 5, "tid": 7, "ts": 1716454224788760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732287, "dur": 10, "args": { "External id": 211751, "cbid": 211, "correlation": 211751 } }, { "ph": "s", "id": 211751, "pid": 76337, "tid": -914061504, "ts": 1716454224732287, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224788772, "dur": 12, "args": { "External id": 211771, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211771, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211771, "pid": 5, "tid": 7, "ts": 1716454224788772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732357, "dur": 11, "args": { "External id": 211771, "cbid": 211, "correlation": 211771 } }, { "ph": "s", "id": 211771, "pid": 76337, "tid": -914061504, "ts": 1716454224732357, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224788786, "dur": 4, "args": { "External id": 211783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211783, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211783, "pid": 5, "tid": 7, "ts": 1716454224788786, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732378, "dur": 6, "args": { "External id": 211783, "cbid": 211, "correlation": 211783 } }, { "ph": "s", "id": 211783, "pid": 76337, "tid": -914061504, "ts": 1716454224732378, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224788791, "dur": 11, "args": { "External id": 211786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211786, "pid": 5, "tid": 7, "ts": 1716454224788791, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732395, "dur": 7, "args": { "External id": 211786, "cbid": 211, "correlation": 211786 } }, { "ph": "s", "id": 211786, "pid": 76337, "tid": -914061504, "ts": 1716454224732395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224788804, "dur": 6, "args": { "External id": 211795, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211795, "registers per thread": 24, "shared memory": 0, "blocks per SM": 24, "warps per SM": 96, "grid": [1920, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211795, "pid": 5, "tid": 7, "ts": 1716454224788804, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732432, "dur": 9, "args": { "External id": 211795, "cbid": 211, "correlation": 211795 } }, { "ph": "s", "id": 211795, "pid": 76337, "tid": -914061504, "ts": 1716454224732432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224732482, "dur": 0, "args": { "External id": 211805, "cbid": 317, "correlation": 211805 } }, { "ph": "f", "id": 211805, "pid": 76337, "tid": -914061504, "ts": 1716454224732482, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224732483, "dur": 0, "args": { "External id": 211806, "cbid": 203, "correlation": 211806 } }, { "ph": "f", "id": 211806, "pid": 76337, "tid": -914061504, "ts": 1716454224732483, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224732484, "dur": 0, "args": { "External id": 211807, "cbid": 205, "correlation": 211807 } }, { "ph": "f", "id": 211807, "pid": 76337, "tid": -914061504, "ts": 1716454224732484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788812, "dur": 7, "args": { "External id": 211811, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211811, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211811, "pid": 5, "tid": 7, "ts": 1716454224788812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732498, "dur": 11, "args": { "External id": 211811, "cbid": 211, "correlation": 211811 } }, { "ph": "s", "id": 211811, "pid": 76337, "tid": -914061504, "ts": 1716454224732498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224788820, "dur": 322, "args": { "External id": 211813, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211813, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211813, "pid": 5, "tid": 7, "ts": 1716454224788820, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732511, "dur": 5, "args": { "External id": 211813, "cbid": 211, "correlation": 211813 } }, { "ph": "s", "id": 211813, "pid": 76337, "tid": -914061504, "ts": 1716454224732511, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224789144, "dur": 1, "args": { "External id": 211815, "device": 5, "context": 1, "stream": 7, "correlation": 211815, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 211815, "pid": 5, "tid": 7, "ts": 1716454224789144, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224732522, "dur": 6, "args": { "External id": 211815, "cbid": 51, "correlation": 211815 } }, { "ph": "s", "id": 211815, "pid": 76337, "tid": -914061504, "ts": 1716454224732522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224789148, "dur": 499, "args": { "External id": 211816, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211816, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211816, "pid": 5, "tid": 7, "ts": 1716454224789148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732529, "dur": 6, "args": { "External id": 211816, "cbid": 211, "correlation": 211816 } }, { "ph": "s", "id": 211816, "pid": 76337, "tid": -914061504, "ts": 1716454224732529, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224789648, "dur": 5, "args": { "External id": 211818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211818, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211818, "pid": 5, "tid": 7, "ts": 1716454224789648, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732539, "dur": 6, "args": { "External id": 211818, "cbid": 211, "correlation": 211818 } }, { "ph": "s", "id": 211818, "pid": 76337, "tid": -914061504, "ts": 1716454224732539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224789654, "dur": 7, "args": { "External id": 211824, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211824, "pid": 5, "tid": 7, "ts": 1716454224789654, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732568, "dur": 8, "args": { "External id": 211824, "cbid": 211, "correlation": 211824 } }, { "ph": "s", "id": 211824, "pid": 76337, "tid": -914061504, "ts": 1716454224732568, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224789662, "dur": 3, "args": { "External id": 211832, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211832, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 211832, "pid": 5, "tid": 7, "ts": 1716454224789662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732612, "dur": 9, "args": { "External id": 211832, "cbid": 211, "correlation": 211832 } }, { "ph": "s", "id": 211832, "pid": 76337, "tid": -914061504, "ts": 1716454224732612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224732674, "dur": 2, "args": { "External id": 211848, "cbid": 251, "correlation": 211848 } }, { "ph": "f", "id": 211848, "pid": 76337, "tid": -914061504, "ts": 1716454224732674, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224732680, "dur": 0, "args": { "External id": 211850, "cbid": 251, "correlation": 211850 } }, { "ph": "f", "id": 211850, "pid": 76337, "tid": -914061504, "ts": 1716454224732680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224789667, "dur": 12, "args": { "External id": 211851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211851, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211851, "pid": 5, "tid": 7, "ts": 1716454224789667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732682, "dur": 12, "args": { "External id": 211851, "cbid": 211, "correlation": 211851 } }, { "ph": "s", "id": 211851, "pid": 76337, "tid": -914061504, "ts": 1716454224732682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224789680, "dur": 5, "args": { "External id": 211853, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211853, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211853, "pid": 5, "tid": 7, "ts": 1716454224789680, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732696, "dur": 5, "args": { "External id": 211853, "cbid": 211, "correlation": 211853 } }, { "ph": "s", "id": 211853, "pid": 76337, "tid": -914061504, "ts": 1716454224732696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224789687, "dur": 6, "args": { "External id": 211863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211863, "pid": 5, "tid": 7, "ts": 1716454224789687, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732752, "dur": 12, "args": { "External id": 211863, "cbid": 211, "correlation": 211863 } }, { "ph": "s", "id": 211863, "pid": 76337, "tid": -914061504, "ts": 1716454224732752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224789694, "dur": 9, "args": { "External id": 211883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211883, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 211883, "pid": 5, "tid": 7, "ts": 1716454224789694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732817, "dur": 11, "args": { "External id": 211883, "cbid": 211, "correlation": 211883 } }, { "ph": "s", "id": 211883, "pid": 76337, "tid": -914061504, "ts": 1716454224732817, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224789705, "dur": 4, "args": { "External id": 211895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211895, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 211895, "pid": 5, "tid": 7, "ts": 1716454224789705, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732838, "dur": 6, "args": { "External id": 211895, "cbid": 211, "correlation": 211895 } }, { "ph": "s", "id": 211895, "pid": 76337, "tid": -914061504, "ts": 1716454224732838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224789710, "dur": 6, "args": { "External id": 211898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211898, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211898, "pid": 5, "tid": 7, "ts": 1716454224789710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732857, "dur": 7, "args": { "External id": 211898, "cbid": 211, "correlation": 211898 } }, { "ph": "s", "id": 211898, "pid": 76337, "tid": -914061504, "ts": 1716454224732857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224789718, "dur": 5, "args": { "External id": 211907, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211907, "registers per thread": 24, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211907, "pid": 5, "tid": 7, "ts": 1716454224789718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732898, "dur": 10, "args": { "External id": 211907, "cbid": 211, "correlation": 211907 } }, { "ph": "s", "id": 211907, "pid": 76337, "tid": -914061504, "ts": 1716454224732898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224732961, "dur": 0, "args": { "External id": 211917, "cbid": 317, "correlation": 211917 } }, { "ph": "f", "id": 211917, "pid": 76337, "tid": -914061504, "ts": 1716454224732961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224732962, "dur": 0, "args": { "External id": 211918, "cbid": 203, "correlation": 211918 } }, { "ph": "f", "id": 211918, "pid": 76337, "tid": -914061504, "ts": 1716454224732962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224732962, "dur": 0, "args": { "External id": 211919, "cbid": 205, "correlation": 211919 } }, { "ph": "f", "id": 211919, "pid": 76337, "tid": -914061504, "ts": 1716454224732962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224789724, "dur": 5, "args": { "External id": 211923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211923, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211923, "pid": 5, "tid": 7, "ts": 1716454224789724, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224732984, "dur": 13, "args": { "External id": 211923, "cbid": 211, "correlation": 211923 } }, { "ph": "s", "id": 211923, "pid": 76337, "tid": -914061504, "ts": 1716454224732984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224789730, "dur": 163, "args": { "External id": 211925, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211925, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211925, "pid": 5, "tid": 7, "ts": 1716454224789730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733000, "dur": 5, "args": { "External id": 211925, "cbid": 211, "correlation": 211925 } }, { "ph": "s", "id": 211925, "pid": 76337, "tid": -914061504, "ts": 1716454224733000, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224789896, "dur": 1, "args": { "External id": 211927, "device": 5, "context": 1, "stream": 7, "correlation": 211927, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 211927, "pid": 5, "tid": 7, "ts": 1716454224789896, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224733010, "dur": 8, "args": { "External id": 211927, "cbid": 51, "correlation": 211927 } }, { "ph": "s", "id": 211927, "pid": 76337, "tid": -914061504, "ts": 1716454224733010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224789900, "dur": 261, "args": { "External id": 211928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211928, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 1.125, "warps per SM": 9, "grid": [5, 6, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 211928, "pid": 5, "tid": 7, "ts": 1716454224789900, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733019, "dur": 6, "args": { "External id": 211928, "cbid": 211, "correlation": 211928 } }, { "ph": "s", "id": 211928, "pid": 76337, "tid": -914061504, "ts": 1716454224733019, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224790162, "dur": 6, "args": { "External id": 211930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211930, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211930, "pid": 5, "tid": 7, "ts": 1716454224790162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733028, "dur": 5, "args": { "External id": 211930, "cbid": 211, "correlation": 211930 } }, { "ph": "s", "id": 211930, "pid": 76337, "tid": -914061504, "ts": 1716454224733028, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224790169, "dur": 6, "args": { "External id": 211936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211936, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211936, "pid": 5, "tid": 7, "ts": 1716454224790169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733058, "dur": 9, "args": { "External id": 211936, "cbid": 211, "correlation": 211936 } }, { "ph": "s", "id": 211936, "pid": 76337, "tid": -914061504, "ts": 1716454224733058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224733117, "dur": 0, "args": { "External id": 211946, "cbid": 317, "correlation": 211946 } }, { "ph": "f", "id": 211946, "pid": 76337, "tid": -914061504, "ts": 1716454224733117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224733118, "dur": 0, "args": { "External id": 211947, "cbid": 203, "correlation": 211947 } }, { "ph": "f", "id": 211947, "pid": 76337, "tid": -914061504, "ts": 1716454224733118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224733119, "dur": 0, "args": { "External id": 211948, "cbid": 205, "correlation": 211948 } }, { "ph": "f", "id": 211948, "pid": 76337, "tid": -914061504, "ts": 1716454224733119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224790177, "dur": 8, "args": { "External id": 211952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211952, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 16, "warps per SM": 128, "grid": [2, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211952, "pid": 5, "tid": 7, "ts": 1716454224790177, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733130, "dur": 12, "args": { "External id": 211952, "cbid": 211, "correlation": 211952 } }, { "ph": "s", "id": 211952, "pid": 76337, "tid": -914061504, "ts": 1716454224733130, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224790186, "dur": 3, "args": { "External id": 211954, "device": 5, "context": 1, "stream": 7, "correlation": 211954, "bytes": 4800, "memory bandwidth (GB/s)": 1.4851485148514851 } }, { "ph": "f", "id": 211954, "pid": 5, "tid": 7, "ts": 1716454224790186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224733148, "dur": 9, "args": { "External id": 211954, "cbid": 51, "correlation": 211954 } }, { "ph": "s", "id": 211954, "pid": 76337, "tid": -914061504, "ts": 1716454224733148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224790190, "dur": 94, "args": { "External id": 211955, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211955, "registers per thread": 154, "shared memory": 16384, "blocks per SM": 15, "warps per SM": 60, "grid": [20, 6, 10], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 211955, "pid": 5, "tid": 7, "ts": 1716454224790190, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733158, "dur": 6, "args": { "External id": 211955, "cbid": 211, "correlation": 211955 } }, { "ph": "s", "id": 211955, "pid": 76337, "tid": -914061504, "ts": 1716454224733158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224790285, "dur": 6, "args": { "External id": 211957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211957, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 8, "warps per SM": 64, "grid": [2, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211957, "pid": 5, "tid": 7, "ts": 1716454224790285, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733167, "dur": 5, "args": { "External id": 211957, "cbid": 211, "correlation": 211957 } }, { "ph": "s", "id": 211957, "pid": 76337, "tid": -914061504, "ts": 1716454224733167, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224790292, "dur": 6, "args": { "External id": 211963, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211963, "pid": 5, "tid": 7, "ts": 1716454224790292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733195, "dur": 8, "args": { "External id": 211963, "cbid": 211, "correlation": 211963 } }, { "ph": "s", "id": 211963, "pid": 76337, "tid": -914061504, "ts": 1716454224733195, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224790300, "dur": 5, "args": { "External id": 211971, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211971, "registers per thread": 19, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211971, "pid": 5, "tid": 7, "ts": 1716454224790300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733223, "dur": 8, "args": { "External id": 211971, "cbid": 211, "correlation": 211971 } }, { "ph": "s", "id": 211971, "pid": 76337, "tid": -914061504, "ts": 1716454224733223, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224790306, "dur": 4, "args": { "External id": 211979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211979, "registers per thread": 17, "shared memory": 0, "blocks per SM": 12, "warps per SM": 48, "grid": [960, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 75 } }, { "ph": "f", "id": 211979, "pid": 5, "tid": 7, "ts": 1716454224790306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733252, "dur": 8, "args": { "External id": 211979, "cbid": 211, "correlation": 211979 } }, { "ph": "s", "id": 211979, "pid": 76337, "tid": -914061504, "ts": 1716454224733252, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224790312, "dur": 14, "args": { "External id": 211990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 211990, "registers per thread": 24, "shared memory": 0, "blocks per SM": 16, "warps per SM": 256, "grid": [2, 1, 640], "block": [8, 16, 4], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 211990, "pid": 5, "tid": 7, "ts": 1716454224790312, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733340, "dur": 24, "args": { "External id": 211990, "cbid": 211, "correlation": 211990 } }, { "ph": "s", "id": 211990, "pid": 76337, "tid": -914061504, "ts": 1716454224733340, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224733408, "dur": 0, "args": { "External id": 212000, "cbid": 317, "correlation": 212000 } }, { "ph": "f", "id": 212000, "pid": 76337, "tid": -914061504, "ts": 1716454224733408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224733409, "dur": 0, "args": { "External id": 212001, "cbid": 203, "correlation": 212001 } }, { "ph": "f", "id": 212001, "pid": 76337, "tid": -914061504, "ts": 1716454224733409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224733410, "dur": 0, "args": { "External id": 212002, "cbid": 205, "correlation": 212002 } }, { "ph": "f", "id": 212002, "pid": 76337, "tid": -914061504, "ts": 1716454224733410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224790328, "dur": 9, "args": { "External id": 212006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212006, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212006, "pid": 5, "tid": 7, "ts": 1716454224790328, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733424, "dur": 12, "args": { "External id": 212006, "cbid": 211, "correlation": 212006 } }, { "ph": "s", "id": 212006, "pid": 76337, "tid": -914061504, "ts": 1716454224733424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224790338, "dur": 164, "args": { "External id": 212008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212008, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212008, "pid": 5, "tid": 7, "ts": 1716454224790338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733438, "dur": 6, "args": { "External id": 212008, "cbid": 211, "correlation": 212008 } }, { "ph": "s", "id": 212008, "pid": 76337, "tid": -914061504, "ts": 1716454224733438, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224790504, "dur": 1, "args": { "External id": 212010, "device": 5, "context": 1, "stream": 7, "correlation": 212010, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 212010, "pid": 5, "tid": 7, "ts": 1716454224790504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224733449, "dur": 6, "args": { "External id": 212010, "cbid": 51, "correlation": 212010 } }, { "ph": "s", "id": 212010, "pid": 76337, "tid": -914061504, "ts": 1716454224733449, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224790508, "dur": 654, "args": { "External id": 212011, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212011, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212011, "pid": 5, "tid": 7, "ts": 1716454224790508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733457, "dur": 6, "args": { "External id": 212011, "cbid": 211, "correlation": 212011 } }, { "ph": "s", "id": 212011, "pid": 76337, "tid": -914061504, "ts": 1716454224733457, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224791164, "dur": 12, "args": { "External id": 212013, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212013, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212013, "pid": 5, "tid": 7, "ts": 1716454224791164, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733466, "dur": 5, "args": { "External id": 212013, "cbid": 211, "correlation": 212013 } }, { "ph": "s", "id": 212013, "pid": 76337, "tid": -914061504, "ts": 1716454224733466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224791178, "dur": 14, "args": { "External id": 212019, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212019, "pid": 5, "tid": 7, "ts": 1716454224791178, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733496, "dur": 9, "args": { "External id": 212019, "cbid": 211, "correlation": 212019 } }, { "ph": "s", "id": 212019, "pid": 76337, "tid": -914061504, "ts": 1716454224733496, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224791193, "dur": 29, "args": { "External id": 212028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212028, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212028, "pid": 5, "tid": 7, "ts": 1716454224791193, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733586, "dur": 13, "args": { "External id": 212028, "cbid": 211, "correlation": 212028 } }, { "ph": "s", "id": 212028, "pid": 76337, "tid": -914061504, "ts": 1716454224733586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224791224, "dur": 30, "args": { "External id": 212048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212048, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212048, "pid": 5, "tid": 7, "ts": 1716454224791224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733654, "dur": 11, "args": { "External id": 212048, "cbid": 211, "correlation": 212048 } }, { "ph": "s", "id": 212048, "pid": 76337, "tid": -914061504, "ts": 1716454224733654, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224791255, "dur": 4, "args": { "External id": 212060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212060, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212060, "pid": 5, "tid": 7, "ts": 1716454224791255, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733675, "dur": 6, "args": { "External id": 212060, "cbid": 211, "correlation": 212060 } }, { "ph": "s", "id": 212060, "pid": 76337, "tid": -914061504, "ts": 1716454224733675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224791261, "dur": 30, "args": { "External id": 212063, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212063, "pid": 5, "tid": 7, "ts": 1716454224791261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733694, "dur": 6, "args": { "External id": 212063, "cbid": 211, "correlation": 212063 } }, { "ph": "s", "id": 212063, "pid": 76337, "tid": -914061504, "ts": 1716454224733694, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224791292, "dur": 21, "args": { "External id": 212072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212072, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212072, "pid": 5, "tid": 7, "ts": 1716454224791292, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733731, "dur": 10, "args": { "External id": 212072, "cbid": 211, "correlation": 212072 } }, { "ph": "s", "id": 212072, "pid": 76337, "tid": -914061504, "ts": 1716454224733731, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224733782, "dur": 0, "args": { "External id": 212082, "cbid": 317, "correlation": 212082 } }, { "ph": "f", "id": 212082, "pid": 76337, "tid": -914061504, "ts": 1716454224733782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224733783, "dur": 0, "args": { "External id": 212083, "cbid": 203, "correlation": 212083 } }, { "ph": "f", "id": 212083, "pid": 76337, "tid": -914061504, "ts": 1716454224733783, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224733784, "dur": 0, "args": { "External id": 212084, "cbid": 205, "correlation": 212084 } }, { "ph": "f", "id": 212084, "pid": 76337, "tid": -914061504, "ts": 1716454224733784, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224791314, "dur": 23, "args": { "External id": 212088, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212088, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212088, "pid": 5, "tid": 7, "ts": 1716454224791314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733799, "dur": 12, "args": { "External id": 212088, "cbid": 211, "correlation": 212088 } }, { "ph": "s", "id": 212088, "pid": 76337, "tid": -914061504, "ts": 1716454224733799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224791338, "dur": 324, "args": { "External id": 212090, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212090, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212090, "pid": 5, "tid": 7, "ts": 1716454224791338, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733813, "dur": 5, "args": { "External id": 212090, "cbid": 211, "correlation": 212090 } }, { "ph": "s", "id": 212090, "pid": 76337, "tid": -914061504, "ts": 1716454224733813, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224791664, "dur": 1, "args": { "External id": 212092, "device": 5, "context": 1, "stream": 7, "correlation": 212092, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 212092, "pid": 5, "tid": 7, "ts": 1716454224791664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224733824, "dur": 7, "args": { "External id": 212092, "cbid": 51, "correlation": 212092 } }, { "ph": "s", "id": 212092, "pid": 76337, "tid": -914061504, "ts": 1716454224733824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224791668, "dur": 1248, "args": { "External id": 212093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212093, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212093, "pid": 5, "tid": 7, "ts": 1716454224791668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733832, "dur": 7, "args": { "External id": 212093, "cbid": 211, "correlation": 212093 } }, { "ph": "s", "id": 212093, "pid": 76337, "tid": -914061504, "ts": 1716454224733832, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224792918, "dur": 13, "args": { "External id": 212095, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212095, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212095, "pid": 5, "tid": 7, "ts": 1716454224792918, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733844, "dur": 5, "args": { "External id": 212095, "cbid": 211, "correlation": 212095 } }, { "ph": "s", "id": 212095, "pid": 76337, "tid": -914061504, "ts": 1716454224733844, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224792932, "dur": 15, "args": { "External id": 212101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212101, "pid": 5, "tid": 7, "ts": 1716454224792932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733871, "dur": 8, "args": { "External id": 212101, "cbid": 211, "correlation": 212101 } }, { "ph": "s", "id": 212101, "pid": 76337, "tid": -914061504, "ts": 1716454224733871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224792948, "dur": 3, "args": { "External id": 212109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212109, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 212109, "pid": 5, "tid": 7, "ts": 1716454224792948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733915, "dur": 9, "args": { "External id": 212109, "cbid": 211, "correlation": 212109 } }, { "ph": "s", "id": 212109, "pid": 76337, "tid": -914061504, "ts": 1716454224733915, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224733987, "dur": 2, "args": { "External id": 212125, "cbid": 251, "correlation": 212125 } }, { "ph": "f", "id": 212125, "pid": 76337, "tid": -914061504, "ts": 1716454224733987, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224733993, "dur": 0, "args": { "External id": 212127, "cbid": 251, "correlation": 212127 } }, { "ph": "f", "id": 212127, "pid": 76337, "tid": -914061504, "ts": 1716454224733993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224792953, "dur": 12, "args": { "External id": 212128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212128, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212128, "pid": 5, "tid": 7, "ts": 1716454224792953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224733995, "dur": 12, "args": { "External id": 212128, "cbid": 211, "correlation": 212128 } }, { "ph": "s", "id": 212128, "pid": 76337, "tid": -914061504, "ts": 1716454224733995, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224792966, "dur": 5, "args": { "External id": 212130, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212130, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212130, "pid": 5, "tid": 7, "ts": 1716454224792966, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734008, "dur": 5, "args": { "External id": 212130, "cbid": 211, "correlation": 212130 } }, { "ph": "s", "id": 212130, "pid": 76337, "tid": -914061504, "ts": 1716454224734008, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224792972, "dur": 17, "args": { "External id": 212140, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212140, "pid": 5, "tid": 7, "ts": 1716454224792972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734066, "dur": 12, "args": { "External id": 212140, "cbid": 211, "correlation": 212140 } }, { "ph": "s", "id": 212140, "pid": 76337, "tid": -914061504, "ts": 1716454224734066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224792990, "dur": 17, "args": { "External id": 212160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212160, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212160, "pid": 5, "tid": 7, "ts": 1716454224792990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734134, "dur": 11, "args": { "External id": 212160, "cbid": 211, "correlation": 212160 } }, { "ph": "s", "id": 212160, "pid": 76337, "tid": -914061504, "ts": 1716454224734134, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224793009, "dur": 4, "args": { "External id": 212172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212172, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 212172, "pid": 5, "tid": 7, "ts": 1716454224793009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734155, "dur": 6, "args": { "External id": 212172, "cbid": 211, "correlation": 212172 } }, { "ph": "s", "id": 212172, "pid": 76337, "tid": -914061504, "ts": 1716454224734155, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224793014, "dur": 17, "args": { "External id": 212175, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212175, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212175, "pid": 5, "tid": 7, "ts": 1716454224793014, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734172, "dur": 6, "args": { "External id": 212175, "cbid": 211, "correlation": 212175 } }, { "ph": "s", "id": 212175, "pid": 76337, "tid": -914061504, "ts": 1716454224734172, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224793032, "dur": 11, "args": { "External id": 212184, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212184, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212184, "pid": 5, "tid": 7, "ts": 1716454224793032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734212, "dur": 10, "args": { "External id": 212184, "cbid": 211, "correlation": 212184 } }, { "ph": "s", "id": 212184, "pid": 76337, "tid": -914061504, "ts": 1716454224734212, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224734273, "dur": 0, "args": { "External id": 212194, "cbid": 317, "correlation": 212194 } }, { "ph": "f", "id": 212194, "pid": 76337, "tid": -914061504, "ts": 1716454224734273, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224734274, "dur": 0, "args": { "External id": 212195, "cbid": 203, "correlation": 212195 } }, { "ph": "f", "id": 212195, "pid": 76337, "tid": -914061504, "ts": 1716454224734274, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224734275, "dur": 0, "args": { "External id": 212196, "cbid": 205, "correlation": 212196 } }, { "ph": "f", "id": 212196, "pid": 76337, "tid": -914061504, "ts": 1716454224734275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224793045, "dur": 12, "args": { "External id": 212200, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212200, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212200, "pid": 5, "tid": 7, "ts": 1716454224793045, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734289, "dur": 12, "args": { "External id": 212200, "cbid": 211, "correlation": 212200 } }, { "ph": "s", "id": 212200, "pid": 76337, "tid": -914061504, "ts": 1716454224734289, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224793058, "dur": 165, "args": { "External id": 212202, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212202, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212202, "pid": 5, "tid": 7, "ts": 1716454224793058, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734303, "dur": 5, "args": { "External id": 212202, "cbid": 211, "correlation": 212202 } }, { "ph": "s", "id": 212202, "pid": 76337, "tid": -914061504, "ts": 1716454224734303, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224793225, "dur": 1, "args": { "External id": 212204, "device": 5, "context": 1, "stream": 7, "correlation": 212204, "bytes": 960, "memory bandwidth (GB/s)": 0.5878750765462339 } }, { "ph": "f", "id": 212204, "pid": 5, "tid": 7, "ts": 1716454224793225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224734314, "dur": 6, "args": { "External id": 212204, "cbid": 51, "correlation": 212204 } }, { "ph": "s", "id": 212204, "pid": 76337, "tid": -914061504, "ts": 1716454224734314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224793229, "dur": 652, "args": { "External id": 212205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212205, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212205, "pid": 5, "tid": 7, "ts": 1716454224793229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734321, "dur": 6, "args": { "External id": 212205, "cbid": 211, "correlation": 212205 } }, { "ph": "s", "id": 212205, "pid": 76337, "tid": -914061504, "ts": 1716454224734321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224793882, "dur": 13, "args": { "External id": 212207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212207, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212207, "pid": 5, "tid": 7, "ts": 1716454224793882, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734331, "dur": 5, "args": { "External id": 212207, "cbid": 211, "correlation": 212207 } }, { "ph": "s", "id": 212207, "pid": 76337, "tid": -914061504, "ts": 1716454224734331, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224793897, "dur": 15, "args": { "External id": 212213, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212213, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212213, "pid": 5, "tid": 7, "ts": 1716454224793897, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734359, "dur": 8, "args": { "External id": 212213, "cbid": 211, "correlation": 212213 } }, { "ph": "s", "id": 212213, "pid": 76337, "tid": -914061504, "ts": 1716454224734359, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224734417, "dur": 0, "args": { "External id": 212223, "cbid": 317, "correlation": 212223 } }, { "ph": "f", "id": 212223, "pid": 76337, "tid": -914061504, "ts": 1716454224734417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224734418, "dur": 0, "args": { "External id": 212224, "cbid": 203, "correlation": 212224 } }, { "ph": "f", "id": 212224, "pid": 76337, "tid": -914061504, "ts": 1716454224734418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224734418, "dur": 0, "args": { "External id": 212225, "cbid": 205, "correlation": 212225 } }, { "ph": "f", "id": 212225, "pid": 76337, "tid": -914061504, "ts": 1716454224734418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224793913, "dur": 21, "args": { "External id": 212229, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212229, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212229, "pid": 5, "tid": 7, "ts": 1716454224793913, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734432, "dur": 12, "args": { "External id": 212229, "cbid": 211, "correlation": 212229 } }, { "ph": "s", "id": 212229, "pid": 76337, "tid": -914061504, "ts": 1716454224734432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224793936, "dur": 4, "args": { "External id": 212231, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212231, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212231, "pid": 5, "tid": 7, "ts": 1716454224793936, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734450, "dur": 6, "args": { "External id": 212231, "cbid": 211, "correlation": 212231 } }, { "ph": "s", "id": 212231, "pid": 76337, "tid": -914061504, "ts": 1716454224734450, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224734460, "dur": 0, "args": { "External id": 212232, "cbid": 51, "correlation": 212232 } }, { "ph": "s", "id": 212232, "pid": 76337, "tid": -914061504, "ts": 1716454224734460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224793941, "dur": 175, "args": { "External id": 212233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212233, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 212233, "pid": 5, "tid": 7, "ts": 1716454224793941, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734460, "dur": 5, "args": { "External id": 212233, "cbid": 211, "correlation": 212233 } }, { "ph": "s", "id": 212233, "pid": 76337, "tid": -914061504, "ts": 1716454224734460, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224794117, "dur": 16, "args": { "External id": 212238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212238, "pid": 5, "tid": 7, "ts": 1716454224794117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734486, "dur": 8, "args": { "External id": 212238, "cbid": 211, "correlation": 212238 } }, { "ph": "s", "id": 212238, "pid": 76337, "tid": -914061504, "ts": 1716454224734486, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224794135, "dur": 12, "args": { "External id": 212246, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212246, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212246, "pid": 5, "tid": 7, "ts": 1716454224794135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734514, "dur": 8, "args": { "External id": 212246, "cbid": 211, "correlation": 212246 } }, { "ph": "s", "id": 212246, "pid": 76337, "tid": -914061504, "ts": 1716454224734514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224794148, "dur": 10, "args": { "External id": 212254, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212254, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212254, "pid": 5, "tid": 7, "ts": 1716454224794148, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734543, "dur": 8, "args": { "External id": 212254, "cbid": 211, "correlation": 212254 } }, { "ph": "s", "id": 212254, "pid": 76337, "tid": -914061504, "ts": 1716454224734543, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224794159, "dur": 19, "args": { "External id": 212274, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212274, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212274, "pid": 5, "tid": 7, "ts": 1716454224794159, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734624, "dur": 13, "args": { "External id": 212274, "cbid": 211, "correlation": 212274 } }, { "ph": "s", "id": 212274, "pid": 76337, "tid": -914061504, "ts": 1716454224734624, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224794179, "dur": 5, "args": { "External id": 212286, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212286, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 212286, "pid": 5, "tid": 7, "ts": 1716454224794179, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734647, "dur": 6, "args": { "External id": 212286, "cbid": 211, "correlation": 212286 } }, { "ph": "s", "id": 212286, "pid": 76337, "tid": -914061504, "ts": 1716454224734647, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224794185, "dur": 16, "args": { "External id": 212289, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212289, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212289, "pid": 5, "tid": 7, "ts": 1716454224794185, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734664, "dur": 7, "args": { "External id": 212289, "cbid": 211, "correlation": 212289 } }, { "ph": "s", "id": 212289, "pid": 76337, "tid": -914061504, "ts": 1716454224734664, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224734721, "dur": 0, "args": { "External id": 212300, "cbid": 317, "correlation": 212300 } }, { "ph": "f", "id": 212300, "pid": 76337, "tid": -914061504, "ts": 1716454224734721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224734722, "dur": 0, "args": { "External id": 212301, "cbid": 203, "correlation": 212301 } }, { "ph": "f", "id": 212301, "pid": 76337, "tid": -914061504, "ts": 1716454224734722, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224734723, "dur": 0, "args": { "External id": 212302, "cbid": 205, "correlation": 212302 } }, { "ph": "f", "id": 212302, "pid": 76337, "tid": -914061504, "ts": 1716454224734723, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224794203, "dur": 12, "args": { "External id": 212306, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212306, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212306, "pid": 5, "tid": 7, "ts": 1716454224794203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734736, "dur": 12, "args": { "External id": 212306, "cbid": 211, "correlation": 212306 } }, { "ph": "s", "id": 212306, "pid": 76337, "tid": -914061504, "ts": 1716454224734736, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224794216, "dur": 4, "args": { "External id": 212308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212308, "pid": 5, "tid": 7, "ts": 1716454224794216, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734752, "dur": 6, "args": { "External id": 212308, "cbid": 211, "correlation": 212308 } }, { "ph": "s", "id": 212308, "pid": 76337, "tid": -914061504, "ts": 1716454224734752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224734760, "dur": 0, "args": { "External id": 212309, "cbid": 51, "correlation": 212309 } }, { "ph": "s", "id": 212309, "pid": 76337, "tid": -914061504, "ts": 1716454224734760, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224794221, "dur": 91, "args": { "External id": 212310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212310, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 212310, "pid": 5, "tid": 7, "ts": 1716454224794221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734761, "dur": 5, "args": { "External id": 212310, "cbid": 211, "correlation": 212310 } }, { "ph": "s", "id": 212310, "pid": 76337, "tid": -914061504, "ts": 1716454224734761, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224794314, "dur": 16, "args": { "External id": 212315, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212315, "pid": 5, "tid": 7, "ts": 1716454224794314, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734788, "dur": 8, "args": { "External id": 212315, "cbid": 211, "correlation": 212315 } }, { "ph": "s", "id": 212315, "pid": 76337, "tid": -914061504, "ts": 1716454224734788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224794331, "dur": 84, "args": { "External id": 212324, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212324, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212324, "pid": 5, "tid": 7, "ts": 1716454224794331, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734870, "dur": 16, "args": { "External id": 212324, "cbid": 211, "correlation": 212324 } }, { "ph": "s", "id": 212324, "pid": 76337, "tid": -914061504, "ts": 1716454224734870, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224794417, "dur": 30, "args": { "External id": 212346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212346, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212346, "pid": 5, "tid": 7, "ts": 1716454224794417, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224734929, "dur": 11, "args": { "External id": 212346, "cbid": 211, "correlation": 212346 } }, { "ph": "s", "id": 212346, "pid": 76337, "tid": -914061504, "ts": 1716454224734929, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735029, "dur": 2, "args": { "External id": 212357, "cbid": 251, "correlation": 212357 } }, { "ph": "f", "id": 212357, "pid": 76337, "tid": -914061504, "ts": 1716454224735029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224794449, "dur": 167, "args": { "External id": 212358, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212358, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212358, "pid": 5, "tid": 7, "ts": 1716454224794449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735036, "dur": 14, "args": { "External id": 212358, "cbid": 211, "correlation": 212358 } }, { "ph": "s", "id": 212358, "pid": 76337, "tid": -914061504, "ts": 1716454224735036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735108, "dur": 1, "args": { "External id": 212369, "cbid": 251, "correlation": 212369 } }, { "ph": "f", "id": 212369, "pid": 76337, "tid": -914061504, "ts": 1716454224735108, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224794617, "dur": 159, "args": { "External id": 212370, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212370, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212370, "pid": 5, "tid": 7, "ts": 1716454224794617, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735111, "dur": 11, "args": { "External id": 212370, "cbid": 211, "correlation": 212370 } }, { "ph": "s", "id": 212370, "pid": 76337, "tid": -914061504, "ts": 1716454224735111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735176, "dur": 1, "args": { "External id": 212381, "cbid": 251, "correlation": 212381 } }, { "ph": "f", "id": 212381, "pid": 76337, "tid": -914061504, "ts": 1716454224735176, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224794777, "dur": 160, "args": { "External id": 212382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212382, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212382, "pid": 5, "tid": 7, "ts": 1716454224794777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735180, "dur": 11, "args": { "External id": 212382, "cbid": 211, "correlation": 212382 } }, { "ph": "s", "id": 212382, "pid": 76337, "tid": -914061504, "ts": 1716454224735180, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224794938, "dur": 339, "args": { "External id": 212407, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212407, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212407, "pid": 5, "tid": 7, "ts": 1716454224794938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735267, "dur": 13, "args": { "External id": 212407, "cbid": 211, "correlation": 212407 } }, { "ph": "s", "id": 212407, "pid": 76337, "tid": -914061504, "ts": 1716454224735267, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735368, "dur": 1, "args": { "External id": 212425, "cbid": 251, "correlation": 212425 } }, { "ph": "f", "id": 212425, "pid": 76337, "tid": -914061504, "ts": 1716454224735368, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224795279, "dur": 168, "args": { "External id": 212427, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212427, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212427, "pid": 5, "tid": 7, "ts": 1716454224795279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735374, "dur": 14, "args": { "External id": 212427, "cbid": 211, "correlation": 212427 } }, { "ph": "s", "id": 212427, "pid": 76337, "tid": -914061504, "ts": 1716454224735374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224795448, "dur": 19, "args": { "External id": 212435, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212435, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212435, "pid": 5, "tid": 7, "ts": 1716454224795448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735444, "dur": 11, "args": { "External id": 212435, "cbid": 211, "correlation": 212435 } }, { "ph": "s", "id": 212435, "pid": 76337, "tid": -914061504, "ts": 1716454224735444, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224795469, "dur": 27, "args": { "External id": 212443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212443, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212443, "pid": 5, "tid": 7, "ts": 1716454224795469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735482, "dur": 8, "args": { "External id": 212443, "cbid": 211, "correlation": 212443 } }, { "ph": "s", "id": 212443, "pid": 76337, "tid": -914061504, "ts": 1716454224735482, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224795497, "dur": 18, "args": { "External id": 212454, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212454, "pid": 5, "tid": 7, "ts": 1716454224795497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735555, "dur": 12, "args": { "External id": 212454, "cbid": 211, "correlation": 212454 } }, { "ph": "s", "id": 212454, "pid": 76337, "tid": -914061504, "ts": 1716454224735555, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224795517, "dur": 16, "args": { "External id": 212476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212476, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212476, "pid": 5, "tid": 7, "ts": 1716454224795517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735587, "dur": 7, "args": { "External id": 212476, "cbid": 211, "correlation": 212476 } }, { "ph": "s", "id": 212476, "pid": 76337, "tid": -914061504, "ts": 1716454224735587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735673, "dur": 1, "args": { "External id": 212487, "cbid": 251, "correlation": 212487 } }, { "ph": "f", "id": 212487, "pid": 76337, "tid": -914061504, "ts": 1716454224735673, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224795534, "dur": 90, "args": { "External id": 212488, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212488, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212488, "pid": 5, "tid": 7, "ts": 1716454224795534, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735678, "dur": 14, "args": { "External id": 212488, "cbid": 211, "correlation": 212488 } }, { "ph": "s", "id": 212488, "pid": 76337, "tid": -914061504, "ts": 1716454224735678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735748, "dur": 1, "args": { "External id": 212499, "cbid": 251, "correlation": 212499 } }, { "ph": "f", "id": 212499, "pid": 76337, "tid": -914061504, "ts": 1716454224735748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735752, "dur": 0, "args": { "External id": 212500, "cbid": 251, "correlation": 212500 } }, { "ph": "f", "id": 212500, "pid": 76337, "tid": -914061504, "ts": 1716454224735752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224795625, "dur": 12, "args": { "External id": 212501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212501, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212501, "pid": 5, "tid": 7, "ts": 1716454224795625, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735754, "dur": 12, "args": { "External id": 212501, "cbid": 211, "correlation": 212501 } }, { "ph": "s", "id": 212501, "pid": 76337, "tid": -914061504, "ts": 1716454224735754, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224795639, "dur": 6, "args": { "External id": 212503, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212503, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212503, "pid": 5, "tid": 7, "ts": 1716454224795639, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735767, "dur": 6, "args": { "External id": 212503, "cbid": 211, "correlation": 212503 } }, { "ph": "s", "id": 212503, "pid": 76337, "tid": -914061504, "ts": 1716454224735767, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735825, "dur": 1, "args": { "External id": 212514, "cbid": 251, "correlation": 212514 } }, { "ph": "f", "id": 212514, "pid": 76337, "tid": -914061504, "ts": 1716454224735825, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224735829, "dur": 0, "args": { "External id": 212515, "cbid": 251, "correlation": 212515 } }, { "ph": "f", "id": 212515, "pid": 76337, "tid": -914061504, "ts": 1716454224735829, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224795646, "dur": 8, "args": { "External id": 212516, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212516, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212516, "pid": 5, "tid": 7, "ts": 1716454224795646, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735830, "dur": 12, "args": { "External id": 212516, "cbid": 211, "correlation": 212516 } }, { "ph": "s", "id": 212516, "pid": 76337, "tid": -914061504, "ts": 1716454224735830, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224795656, "dur": 3, "args": { "External id": 212518, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212518, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212518, "pid": 5, "tid": 7, "ts": 1716454224795656, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735843, "dur": 5, "args": { "External id": 212518, "cbid": 211, "correlation": 212518 } }, { "ph": "s", "id": 212518, "pid": 76337, "tid": -914061504, "ts": 1716454224735843, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224795660, "dur": 56, "args": { "External id": 212543, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212543, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212543, "pid": 5, "tid": 7, "ts": 1716454224795660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224735920, "dur": 13, "args": { "External id": 212543, "cbid": 211, "correlation": 212543 } }, { "ph": "s", "id": 212543, "pid": 76337, "tid": -914061504, "ts": 1716454224735920, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224736029, "dur": 2, "args": { "External id": 212561, "cbid": 251, "correlation": 212561 } }, { "ph": "f", "id": 212561, "pid": 76337, "tid": -914061504, "ts": 1716454224736029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224795717, "dur": 91, "args": { "External id": 212563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212563, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212563, "pid": 5, "tid": 7, "ts": 1716454224795717, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736035, "dur": 14, "args": { "External id": 212563, "cbid": 211, "correlation": 212563 } }, { "ph": "s", "id": 212563, "pid": 76337, "tid": -914061504, "ts": 1716454224736035, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224795810, "dur": 10, "args": { "External id": 212571, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212571, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212571, "pid": 5, "tid": 7, "ts": 1716454224795810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736106, "dur": 12, "args": { "External id": 212571, "cbid": 211, "correlation": 212571 } }, { "ph": "s", "id": 212571, "pid": 76337, "tid": -914061504, "ts": 1716454224736106, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224795821, "dur": 21, "args": { "External id": 212579, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212579, "pid": 5, "tid": 7, "ts": 1716454224795821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736148, "dur": 9, "args": { "External id": 212579, "cbid": 211, "correlation": 212579 } }, { "ph": "s", "id": 212579, "pid": 76337, "tid": -914061504, "ts": 1716454224736148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224795843, "dur": 17, "args": { "External id": 212601, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212601, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212601, "pid": 5, "tid": 7, "ts": 1716454224795843, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736199, "dur": 10, "args": { "External id": 212601, "cbid": 211, "correlation": 212601 } }, { "ph": "s", "id": 212601, "pid": 76337, "tid": -914061504, "ts": 1716454224736199, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224736288, "dur": 1, "args": { "External id": 212617, "cbid": 251, "correlation": 212617 } }, { "ph": "f", "id": 212617, "pid": 76337, "tid": -914061504, "ts": 1716454224736288, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224736293, "dur": 0, "args": { "External id": 212619, "cbid": 251, "correlation": 212619 } }, { "ph": "f", "id": 212619, "pid": 76337, "tid": -914061504, "ts": 1716454224736293, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224795862, "dur": 496, "args": { "External id": 212620, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212620, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212620, "pid": 5, "tid": 7, "ts": 1716454224795862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736294, "dur": 13, "args": { "External id": 212620, "cbid": 211, "correlation": 212620 } }, { "ph": "s", "id": 212620, "pid": 76337, "tid": -914061504, "ts": 1716454224736294, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224796360, "dur": 66, "args": { "External id": 212628, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212628, "pid": 5, "tid": 7, "ts": 1716454224796360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736361, "dur": 12, "args": { "External id": 212628, "cbid": 211, "correlation": 212628 } }, { "ph": "s", "id": 212628, "pid": 76337, "tid": -914061504, "ts": 1716454224736361, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224796427, "dur": 65, "args": { "External id": 212636, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212636, "pid": 5, "tid": 7, "ts": 1716454224796427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736393, "dur": 8, "args": { "External id": 212636, "cbid": 211, "correlation": 212636 } }, { "ph": "s", "id": 212636, "pid": 76337, "tid": -914061504, "ts": 1716454224736393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224736473, "dur": 1, "args": { "External id": 212652, "cbid": 251, "correlation": 212652 } }, { "ph": "f", "id": 212652, "pid": 76337, "tid": -914061504, "ts": 1716454224736473, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224796493, "dur": 1, "args": { "External id": 212654, "device": 5, "context": 1, "stream": 7, "correlation": 212654, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 212654, "pid": 5, "tid": 7, "ts": 1716454224796493, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224736478, "dur": 10, "args": { "External id": 212654, "cbid": 51, "correlation": 212654 } }, { "ph": "s", "id": 212654, "pid": 76337, "tid": -914061504, "ts": 1716454224736478, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224796497, "dur": 274, "args": { "External id": 212655, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212655, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212655, "pid": 5, "tid": 7, "ts": 1716454224796497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736489, "dur": 11, "args": { "External id": 212655, "cbid": 211, "correlation": 212655 } }, { "ph": "s", "id": 212655, "pid": 76337, "tid": -914061504, "ts": 1716454224736489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224796772, "dur": 13, "args": { "External id": 212663, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212663, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212663, "pid": 5, "tid": 7, "ts": 1716454224796772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736533, "dur": 10, "args": { "External id": 212663, "cbid": 211, "correlation": 212663 } }, { "ph": "s", "id": 212663, "pid": 76337, "tid": -914061504, "ts": 1716454224736533, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224796787, "dur": 38, "args": { "External id": 212674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212674, "pid": 5, "tid": 7, "ts": 1716454224796787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736602, "dur": 12, "args": { "External id": 212674, "cbid": 211, "correlation": 212674 } }, { "ph": "s", "id": 212674, "pid": 76337, "tid": -914061504, "ts": 1716454224736602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224736666, "dur": 0, "args": { "External id": 212686, "cbid": 317, "correlation": 212686 } }, { "ph": "f", "id": 212686, "pid": 76337, "tid": -914061504, "ts": 1716454224736666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224736667, "dur": 0, "args": { "External id": 212687, "cbid": 203, "correlation": 212687 } }, { "ph": "f", "id": 212687, "pid": 76337, "tid": -914061504, "ts": 1716454224736667, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224736668, "dur": 0, "args": { "External id": 212688, "cbid": 205, "correlation": 212688 } }, { "ph": "f", "id": 212688, "pid": 76337, "tid": -914061504, "ts": 1716454224736668, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224796826, "dur": 14, "args": { "External id": 212692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212692, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212692, "pid": 5, "tid": 7, "ts": 1716454224796826, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736684, "dur": 12, "args": { "External id": 212692, "cbid": 211, "correlation": 212692 } }, { "ph": "s", "id": 212692, "pid": 76337, "tid": -914061504, "ts": 1716454224736684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224796841, "dur": 4, "args": { "External id": 212694, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212694, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212694, "pid": 5, "tid": 7, "ts": 1716454224796841, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736700, "dur": 6, "args": { "External id": 212694, "cbid": 211, "correlation": 212694 } }, { "ph": "s", "id": 212694, "pid": 76337, "tid": -914061504, "ts": 1716454224736700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224736708, "dur": 0, "args": { "External id": 212695, "cbid": 51, "correlation": 212695 } }, { "ph": "s", "id": 212695, "pid": 76337, "tid": -914061504, "ts": 1716454224736708, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224796847, "dur": 98, "args": { "External id": 212696, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212696, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 212696, "pid": 5, "tid": 7, "ts": 1716454224796847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736709, "dur": 5, "args": { "External id": 212696, "cbid": 211, "correlation": 212696 } }, { "ph": "s", "id": 212696, "pid": 76337, "tid": -914061504, "ts": 1716454224736709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224796946, "dur": 17, "args": { "External id": 212701, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212701, "pid": 5, "tid": 7, "ts": 1716454224796946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736737, "dur": 8, "args": { "External id": 212701, "cbid": 211, "correlation": 212701 } }, { "ph": "s", "id": 212701, "pid": 76337, "tid": -914061504, "ts": 1716454224736737, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224796964, "dur": 11, "args": { "External id": 212709, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212709, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212709, "pid": 5, "tid": 7, "ts": 1716454224796964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736769, "dur": 9, "args": { "External id": 212709, "cbid": 211, "correlation": 212709 } }, { "ph": "s", "id": 212709, "pid": 76337, "tid": -914061504, "ts": 1716454224736769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224796977, "dur": 30, "args": { "External id": 212718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212718, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212718, "pid": 5, "tid": 7, "ts": 1716454224796977, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736809, "dur": 10, "args": { "External id": 212718, "cbid": 211, "correlation": 212718 } }, { "ph": "s", "id": 212718, "pid": 76337, "tid": -914061504, "ts": 1716454224736809, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224797009, "dur": 31, "args": { "External id": 212738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212738, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212738, "pid": 5, "tid": 7, "ts": 1716454224797009, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736883, "dur": 11, "args": { "External id": 212738, "cbid": 211, "correlation": 212738 } }, { "ph": "s", "id": 212738, "pid": 76337, "tid": -914061504, "ts": 1716454224736883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224797041, "dur": 5, "args": { "External id": 212750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212750, "registers per thread": 20, "shared memory": 0, "blocks per SM": 1, "warps per SM": 8, "grid": [80, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212750, "pid": 5, "tid": 7, "ts": 1716454224797041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736904, "dur": 6, "args": { "External id": 212750, "cbid": 211, "correlation": 212750 } }, { "ph": "s", "id": 212750, "pid": 76337, "tid": -914061504, "ts": 1716454224736904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224797047, "dur": 31, "args": { "External id": 212753, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212753, "pid": 5, "tid": 7, "ts": 1716454224797047, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736923, "dur": 8, "args": { "External id": 212753, "cbid": 211, "correlation": 212753 } }, { "ph": "s", "id": 212753, "pid": 76337, "tid": -914061504, "ts": 1716454224736923, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224797080, "dur": 21, "args": { "External id": 212762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212762, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212762, "pid": 5, "tid": 7, "ts": 1716454224797080, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224736962, "dur": 9, "args": { "External id": 212762, "cbid": 211, "correlation": 212762 } }, { "ph": "s", "id": 212762, "pid": 76337, "tid": -914061504, "ts": 1716454224736962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224737025, "dur": 0, "args": { "External id": 212772, "cbid": 317, "correlation": 212772 } }, { "ph": "f", "id": 212772, "pid": 76337, "tid": -914061504, "ts": 1716454224737025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224737026, "dur": 0, "args": { "External id": 212773, "cbid": 203, "correlation": 212773 } }, { "ph": "f", "id": 212773, "pid": 76337, "tid": -914061504, "ts": 1716454224737026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224737027, "dur": 0, "args": { "External id": 212774, "cbid": 205, "correlation": 212774 } }, { "ph": "f", "id": 212774, "pid": 76337, "tid": -914061504, "ts": 1716454224737027, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224797102, "dur": 22, "args": { "External id": 212778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212778, "pid": 5, "tid": 7, "ts": 1716454224797102, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737041, "dur": 12, "args": { "External id": 212778, "cbid": 211, "correlation": 212778 } }, { "ph": "s", "id": 212778, "pid": 76337, "tid": -914061504, "ts": 1716454224737041, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224797125, "dur": 323, "args": { "External id": 212780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212780, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 1280, "warps per SM": 10240, "grid": [1, 80, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212780, "pid": 5, "tid": 7, "ts": 1716454224797125, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737055, "dur": 5, "args": { "External id": 212780, "cbid": 211, "correlation": 212780 } }, { "ph": "s", "id": 212780, "pid": 76337, "tid": -914061504, "ts": 1716454224737055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224797451, "dur": 1, "args": { "External id": 212782, "device": 5, "context": 1, "stream": 7, "correlation": 212782, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 212782, "pid": 5, "tid": 7, "ts": 1716454224797451, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224737067, "dur": 7, "args": { "External id": 212782, "cbid": 51, "correlation": 212782 } }, { "ph": "s", "id": 212782, "pid": 76337, "tid": -914061504, "ts": 1716454224737067, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224797455, "dur": 1265, "args": { "External id": 212783, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212783, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212783, "pid": 5, "tid": 7, "ts": 1716454224797455, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737075, "dur": 6, "args": { "External id": 212783, "cbid": 211, "correlation": 212783 } }, { "ph": "s", "id": 212783, "pid": 76337, "tid": -914061504, "ts": 1716454224737075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224798721, "dur": 13, "args": { "External id": 212785, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212785, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212785, "pid": 5, "tid": 7, "ts": 1716454224798721, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737086, "dur": 5, "args": { "External id": 212785, "cbid": 211, "correlation": 212785 } }, { "ph": "s", "id": 212785, "pid": 76337, "tid": -914061504, "ts": 1716454224737086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224798736, "dur": 15, "args": { "External id": 212791, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212791, "pid": 5, "tid": 7, "ts": 1716454224798736, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737114, "dur": 9, "args": { "External id": 212791, "cbid": 211, "correlation": 212791 } }, { "ph": "s", "id": 212791, "pid": 76337, "tid": -914061504, "ts": 1716454224737114, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224798752, "dur": 3, "args": { "External id": 212799, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212799, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 212799, "pid": 5, "tid": 7, "ts": 1716454224798752, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737158, "dur": 9, "args": { "External id": 212799, "cbid": 211, "correlation": 212799 } }, { "ph": "s", "id": 212799, "pid": 76337, "tid": -914061504, "ts": 1716454224737158, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224737226, "dur": 1, "args": { "External id": 212815, "cbid": 251, "correlation": 212815 } }, { "ph": "f", "id": 212815, "pid": 76337, "tid": -914061504, "ts": 1716454224737226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224737232, "dur": 0, "args": { "External id": 212817, "cbid": 251, "correlation": 212817 } }, { "ph": "f", "id": 212817, "pid": 76337, "tid": -914061504, "ts": 1716454224737232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224798757, "dur": 13, "args": { "External id": 212818, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212818, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212818, "pid": 5, "tid": 7, "ts": 1716454224798757, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737234, "dur": 11, "args": { "External id": 212818, "cbid": 211, "correlation": 212818 } }, { "ph": "s", "id": 212818, "pid": 76337, "tid": -914061504, "ts": 1716454224737234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224798772, "dur": 5, "args": { "External id": 212820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212820, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212820, "pid": 5, "tid": 7, "ts": 1716454224798772, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737247, "dur": 5, "args": { "External id": 212820, "cbid": 211, "correlation": 212820 } }, { "ph": "s", "id": 212820, "pid": 76337, "tid": -914061504, "ts": 1716454224737247, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224798778, "dur": 17, "args": { "External id": 212830, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212830, "pid": 5, "tid": 7, "ts": 1716454224798778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737306, "dur": 12, "args": { "External id": 212830, "cbid": 211, "correlation": 212830 } }, { "ph": "s", "id": 212830, "pid": 76337, "tid": -914061504, "ts": 1716454224737306, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224798796, "dur": 18, "args": { "External id": 212850, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212850, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212850, "pid": 5, "tid": 7, "ts": 1716454224798796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737374, "dur": 11, "args": { "External id": 212850, "cbid": 211, "correlation": 212850 } }, { "ph": "s", "id": 212850, "pid": 76337, "tid": -914061504, "ts": 1716454224737374, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224798816, "dur": 4, "args": { "External id": 212862, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212862, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 212862, "pid": 5, "tid": 7, "ts": 1716454224798816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737395, "dur": 6, "args": { "External id": 212862, "cbid": 211, "correlation": 212862 } }, { "ph": "s", "id": 212862, "pid": 76337, "tid": -914061504, "ts": 1716454224737395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224798821, "dur": 16, "args": { "External id": 212865, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212865, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212865, "pid": 5, "tid": 7, "ts": 1716454224798821, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737413, "dur": 7, "args": { "External id": 212865, "cbid": 211, "correlation": 212865 } }, { "ph": "s", "id": 212865, "pid": 76337, "tid": -914061504, "ts": 1716454224737413, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224798839, "dur": 11, "args": { "External id": 212874, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212874, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212874, "pid": 5, "tid": 7, "ts": 1716454224798839, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737454, "dur": 10, "args": { "External id": 212874, "cbid": 211, "correlation": 212874 } }, { "ph": "s", "id": 212874, "pid": 76337, "tid": -914061504, "ts": 1716454224737454, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224737516, "dur": 0, "args": { "External id": 212884, "cbid": 317, "correlation": 212884 } }, { "ph": "f", "id": 212884, "pid": 76337, "tid": -914061504, "ts": 1716454224737516, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224737517, "dur": 0, "args": { "External id": 212885, "cbid": 203, "correlation": 212885 } }, { "ph": "f", "id": 212885, "pid": 76337, "tid": -914061504, "ts": 1716454224737517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224737518, "dur": 0, "args": { "External id": 212886, "cbid": 205, "correlation": 212886 } }, { "ph": "f", "id": 212886, "pid": 76337, "tid": -914061504, "ts": 1716454224737518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224798851, "dur": 11, "args": { "External id": 212890, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212890, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212890, "pid": 5, "tid": 7, "ts": 1716454224798851, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737532, "dur": 11, "args": { "External id": 212890, "cbid": 211, "correlation": 212890 } }, { "ph": "s", "id": 212890, "pid": 76337, "tid": -914061504, "ts": 1716454224737532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224798864, "dur": 165, "args": { "External id": 212892, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212892, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212892, "pid": 5, "tid": 7, "ts": 1716454224798864, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737546, "dur": 5, "args": { "External id": 212892, "cbid": 211, "correlation": 212892 } }, { "ph": "s", "id": 212892, "pid": 76337, "tid": -914061504, "ts": 1716454224737546, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224799030, "dur": 1, "args": { "External id": 212894, "device": 5, "context": 1, "stream": 7, "correlation": 212894, "bytes": 960, "memory bandwidth (GB/s)": 0.5 } }, { "ph": "f", "id": 212894, "pid": 5, "tid": 7, "ts": 1716454224799030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224737557, "dur": 7, "args": { "External id": 212894, "cbid": 51, "correlation": 212894 } }, { "ph": "s", "id": 212894, "pid": 76337, "tid": -914061504, "ts": 1716454224737557, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224799035, "dur": 653, "args": { "External id": 212895, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212895, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 212895, "pid": 5, "tid": 7, "ts": 1716454224799035, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737564, "dur": 7, "args": { "External id": 212895, "cbid": 211, "correlation": 212895 } }, { "ph": "s", "id": 212895, "pid": 76337, "tid": -914061504, "ts": 1716454224737564, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224799688, "dur": 14, "args": { "External id": 212897, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212897, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212897, "pid": 5, "tid": 7, "ts": 1716454224799688, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737576, "dur": 5, "args": { "External id": 212897, "cbid": 211, "correlation": 212897 } }, { "ph": "s", "id": 212897, "pid": 76337, "tid": -914061504, "ts": 1716454224737576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224799703, "dur": 16, "args": { "External id": 212903, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212903, "pid": 5, "tid": 7, "ts": 1716454224799703, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737605, "dur": 9, "args": { "External id": 212903, "cbid": 211, "correlation": 212903 } }, { "ph": "s", "id": 212903, "pid": 76337, "tid": -914061504, "ts": 1716454224737605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224737662, "dur": 0, "args": { "External id": 212913, "cbid": 317, "correlation": 212913 } }, { "ph": "f", "id": 212913, "pid": 76337, "tid": -914061504, "ts": 1716454224737662, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224737663, "dur": 0, "args": { "External id": 212914, "cbid": 203, "correlation": 212914 } }, { "ph": "f", "id": 212914, "pid": 76337, "tid": -914061504, "ts": 1716454224737663, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224737664, "dur": 0, "args": { "External id": 212915, "cbid": 205, "correlation": 212915 } }, { "ph": "f", "id": 212915, "pid": 76337, "tid": -914061504, "ts": 1716454224737664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224799720, "dur": 21, "args": { "External id": 212919, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212919, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [6, 80, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212919, "pid": 5, "tid": 7, "ts": 1716454224799720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737676, "dur": 11, "args": { "External id": 212919, "cbid": 211, "correlation": 212919 } }, { "ph": "s", "id": 212919, "pid": 76337, "tid": -914061504, "ts": 1716454224737676, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224799743, "dur": 4, "args": { "External id": 212921, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212921, "pid": 5, "tid": 7, "ts": 1716454224799743, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737692, "dur": 6, "args": { "External id": 212921, "cbid": 211, "correlation": 212921 } }, { "ph": "s", "id": 212921, "pid": 76337, "tid": -914061504, "ts": 1716454224737692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224737700, "dur": 0, "args": { "External id": 212922, "cbid": 51, "correlation": 212922 } }, { "ph": "s", "id": 212922, "pid": 76337, "tid": -914061504, "ts": 1716454224737700, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224799748, "dur": 172, "args": { "External id": 212923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212923, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 212923, "pid": 5, "tid": 7, "ts": 1716454224799748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737701, "dur": 5, "args": { "External id": 212923, "cbid": 211, "correlation": 212923 } }, { "ph": "s", "id": 212923, "pid": 76337, "tid": -914061504, "ts": 1716454224737701, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224799921, "dur": 16, "args": { "External id": 212928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212928, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212928, "pid": 5, "tid": 7, "ts": 1716454224799921, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737727, "dur": 8, "args": { "External id": 212928, "cbid": 211, "correlation": 212928 } }, { "ph": "s", "id": 212928, "pid": 76337, "tid": -914061504, "ts": 1716454224737727, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224799939, "dur": 13, "args": { "External id": 212936, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212936, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212936, "pid": 5, "tid": 7, "ts": 1716454224799939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737755, "dur": 8, "args": { "External id": 212936, "cbid": 211, "correlation": 212936 } }, { "ph": "s", "id": 212936, "pid": 76337, "tid": -914061504, "ts": 1716454224737755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224799953, "dur": 10, "args": { "External id": 212944, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212944, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212944, "pid": 5, "tid": 7, "ts": 1716454224799953, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737784, "dur": 8, "args": { "External id": 212944, "cbid": 211, "correlation": 212944 } }, { "ph": "s", "id": 212944, "pid": 76337, "tid": -914061504, "ts": 1716454224737784, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224799964, "dur": 18, "args": { "External id": 212964, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212964, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 212964, "pid": 5, "tid": 7, "ts": 1716454224799964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737865, "dur": 13, "args": { "External id": 212964, "cbid": 211, "correlation": 212964 } }, { "ph": "s", "id": 212964, "pid": 76337, "tid": -914061504, "ts": 1716454224737865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224799984, "dur": 4, "args": { "External id": 212976, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212976, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 212976, "pid": 5, "tid": 7, "ts": 1716454224799984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737888, "dur": 6, "args": { "External id": 212976, "cbid": 211, "correlation": 212976 } }, { "ph": "s", "id": 212976, "pid": 76337, "tid": -914061504, "ts": 1716454224737888, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224799990, "dur": 16, "args": { "External id": 212979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212979, "pid": 5, "tid": 7, "ts": 1716454224799990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737905, "dur": 6, "args": { "External id": 212979, "cbid": 211, "correlation": 212979 } }, { "ph": "s", "id": 212979, "pid": 76337, "tid": -914061504, "ts": 1716454224737905, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224737962, "dur": 0, "args": { "External id": 212990, "cbid": 317, "correlation": 212990 } }, { "ph": "f", "id": 212990, "pid": 76337, "tid": -914061504, "ts": 1716454224737962, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224737963, "dur": 0, "args": { "External id": 212991, "cbid": 203, "correlation": 212991 } }, { "ph": "f", "id": 212991, "pid": 76337, "tid": -914061504, "ts": 1716454224737963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224737964, "dur": 0, "args": { "External id": 212992, "cbid": 205, "correlation": 212992 } }, { "ph": "f", "id": 212992, "pid": 76337, "tid": -914061504, "ts": 1716454224737964, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224800007, "dur": 12, "args": { "External id": 212996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212996, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 212996, "pid": 5, "tid": 7, "ts": 1716454224800007, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224737985, "dur": 12, "args": { "External id": 212996, "cbid": 211, "correlation": 212996 } }, { "ph": "s", "id": 212996, "pid": 76337, "tid": -914061504, "ts": 1716454224737985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224800021, "dur": 3, "args": { "External id": 212998, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 212998, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 212998, "pid": 5, "tid": 7, "ts": 1716454224800021, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738001, "dur": 6, "args": { "External id": 212998, "cbid": 211, "correlation": 212998 } }, { "ph": "s", "id": 212998, "pid": 76337, "tid": -914061504, "ts": 1716454224738001, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224738009, "dur": 0, "args": { "External id": 212999, "cbid": 51, "correlation": 212999 } }, { "ph": "s", "id": 212999, "pid": 76337, "tid": -914061504, "ts": 1716454224738009, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224800025, "dur": 91, "args": { "External id": 213000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213000, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 213000, "pid": 5, "tid": 7, "ts": 1716454224800025, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738010, "dur": 5, "args": { "External id": 213000, "cbid": 211, "correlation": 213000 } }, { "ph": "s", "id": 213000, "pid": 76337, "tid": -914061504, "ts": 1716454224738010, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224800118, "dur": 16, "args": { "External id": 213005, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213005, "pid": 5, "tid": 7, "ts": 1716454224800118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738038, "dur": 8, "args": { "External id": 213005, "cbid": 211, "correlation": 213005 } }, { "ph": "s", "id": 213005, "pid": 76337, "tid": -914061504, "ts": 1716454224738038, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224800135, "dur": 85, "args": { "External id": 213014, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213014, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213014, "pid": 5, "tid": 7, "ts": 1716454224800135, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738120, "dur": 14, "args": { "External id": 213014, "cbid": 211, "correlation": 213014 } }, { "ph": "s", "id": 213014, "pid": 76337, "tid": -914061504, "ts": 1716454224738120, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224800221, "dur": 30, "args": { "External id": 213036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213036, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213036, "pid": 5, "tid": 7, "ts": 1716454224800221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738177, "dur": 10, "args": { "External id": 213036, "cbid": 211, "correlation": 213036 } }, { "ph": "s", "id": 213036, "pid": 76337, "tid": -914061504, "ts": 1716454224738177, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738270, "dur": 1, "args": { "External id": 213047, "cbid": 251, "correlation": 213047 } }, { "ph": "f", "id": 213047, "pid": 76337, "tid": -914061504, "ts": 1716454224738270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224800253, "dur": 165, "args": { "External id": 213048, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213048, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213048, "pid": 5, "tid": 7, "ts": 1716454224800253, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738276, "dur": 13, "args": { "External id": 213048, "cbid": 211, "correlation": 213048 } }, { "ph": "s", "id": 213048, "pid": 76337, "tid": -914061504, "ts": 1716454224738276, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738347, "dur": 1, "args": { "External id": 213059, "cbid": 251, "correlation": 213059 } }, { "ph": "f", "id": 213059, "pid": 76337, "tid": -914061504, "ts": 1716454224738347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224800420, "dur": 161, "args": { "External id": 213060, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213060, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213060, "pid": 5, "tid": 7, "ts": 1716454224800420, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738350, "dur": 11, "args": { "External id": 213060, "cbid": 211, "correlation": 213060 } }, { "ph": "s", "id": 213060, "pid": 76337, "tid": -914061504, "ts": 1716454224738350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738416, "dur": 1, "args": { "External id": 213071, "cbid": 251, "correlation": 213071 } }, { "ph": "f", "id": 213071, "pid": 76337, "tid": -914061504, "ts": 1716454224738416, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224800582, "dur": 159, "args": { "External id": 213072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213072, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213072, "pid": 5, "tid": 7, "ts": 1716454224800582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738420, "dur": 11, "args": { "External id": 213072, "cbid": 211, "correlation": 213072 } }, { "ph": "s", "id": 213072, "pid": 76337, "tid": -914061504, "ts": 1716454224738420, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224800742, "dur": 339, "args": { "External id": 213097, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213097, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213097, "pid": 5, "tid": 7, "ts": 1716454224800742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738506, "dur": 14, "args": { "External id": 213097, "cbid": 211, "correlation": 213097 } }, { "ph": "s", "id": 213097, "pid": 76337, "tid": -914061504, "ts": 1716454224738506, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738606, "dur": 1, "args": { "External id": 213115, "cbid": 251, "correlation": 213115 } }, { "ph": "f", "id": 213115, "pid": 76337, "tid": -914061504, "ts": 1716454224738606, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224801083, "dur": 166, "args": { "External id": 213117, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213117, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213117, "pid": 5, "tid": 7, "ts": 1716454224801083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738612, "dur": 14, "args": { "External id": 213117, "cbid": 211, "correlation": 213117 } }, { "ph": "s", "id": 213117, "pid": 76337, "tid": -914061504, "ts": 1716454224738612, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224801250, "dur": 19, "args": { "External id": 213125, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213125, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213125, "pid": 5, "tid": 7, "ts": 1716454224801250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738682, "dur": 12, "args": { "External id": 213125, "cbid": 211, "correlation": 213125 } }, { "ph": "s", "id": 213125, "pid": 76337, "tid": -914061504, "ts": 1716454224738682, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224801271, "dur": 28, "args": { "External id": 213133, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213133, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213133, "pid": 5, "tid": 7, "ts": 1716454224801271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738722, "dur": 8, "args": { "External id": 213133, "cbid": 211, "correlation": 213133 } }, { "ph": "s", "id": 213133, "pid": 76337, "tid": -914061504, "ts": 1716454224738722, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224801300, "dur": 18, "args": { "External id": 213144, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213144, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213144, "pid": 5, "tid": 7, "ts": 1716454224801300, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738793, "dur": 12, "args": { "External id": 213144, "cbid": 211, "correlation": 213144 } }, { "ph": "s", "id": 213144, "pid": 76337, "tid": -914061504, "ts": 1716454224738793, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224801319, "dur": 16, "args": { "External id": 213166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213166, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213166, "pid": 5, "tid": 7, "ts": 1716454224801319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738824, "dur": 8, "args": { "External id": 213166, "cbid": 211, "correlation": 213166 } }, { "ph": "s", "id": 213166, "pid": 76337, "tid": -914061504, "ts": 1716454224738824, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738911, "dur": 1, "args": { "External id": 213177, "cbid": 251, "correlation": 213177 } }, { "ph": "f", "id": 213177, "pid": 76337, "tid": -914061504, "ts": 1716454224738911, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224801337, "dur": 89, "args": { "External id": 213178, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213178, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213178, "pid": 5, "tid": 7, "ts": 1716454224801337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738916, "dur": 14, "args": { "External id": 213178, "cbid": 211, "correlation": 213178 } }, { "ph": "s", "id": 213178, "pid": 76337, "tid": -914061504, "ts": 1716454224738916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738993, "dur": 1, "args": { "External id": 213189, "cbid": 251, "correlation": 213189 } }, { "ph": "f", "id": 213189, "pid": 76337, "tid": -914061504, "ts": 1716454224738993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224738997, "dur": 0, "args": { "External id": 213190, "cbid": 251, "correlation": 213190 } }, { "ph": "f", "id": 213190, "pid": 76337, "tid": -914061504, "ts": 1716454224738997, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224801428, "dur": 12, "args": { "External id": 213191, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213191, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213191, "pid": 5, "tid": 7, "ts": 1716454224801428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224738999, "dur": 12, "args": { "External id": 213191, "cbid": 211, "correlation": 213191 } }, { "ph": "s", "id": 213191, "pid": 76337, "tid": -914061504, "ts": 1716454224738999, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224801441, "dur": 5, "args": { "External id": 213193, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213193, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213193, "pid": 5, "tid": 7, "ts": 1716454224801441, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739013, "dur": 6, "args": { "External id": 213193, "cbid": 211, "correlation": 213193 } }, { "ph": "s", "id": 213193, "pid": 76337, "tid": -914061504, "ts": 1716454224739013, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739073, "dur": 1, "args": { "External id": 213204, "cbid": 251, "correlation": 213204 } }, { "ph": "f", "id": 213204, "pid": 76337, "tid": -914061504, "ts": 1716454224739073, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739077, "dur": 0, "args": { "External id": 213205, "cbid": 251, "correlation": 213205 } }, { "ph": "f", "id": 213205, "pid": 76337, "tid": -914061504, "ts": 1716454224739077, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224801448, "dur": 8, "args": { "External id": 213206, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213206, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213206, "pid": 5, "tid": 7, "ts": 1716454224801448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739078, "dur": 12, "args": { "External id": 213206, "cbid": 211, "correlation": 213206 } }, { "ph": "s", "id": 213206, "pid": 76337, "tid": -914061504, "ts": 1716454224739078, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224801458, "dur": 3, "args": { "External id": 213208, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213208, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213208, "pid": 5, "tid": 7, "ts": 1716454224801458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739092, "dur": 5, "args": { "External id": 213208, "cbid": 211, "correlation": 213208 } }, { "ph": "s", "id": 213208, "pid": 76337, "tid": -914061504, "ts": 1716454224739092, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224801462, "dur": 56, "args": { "External id": 213233, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213233, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213233, "pid": 5, "tid": 7, "ts": 1716454224801462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739169, "dur": 13, "args": { "External id": 213233, "cbid": 211, "correlation": 213233 } }, { "ph": "s", "id": 213233, "pid": 76337, "tid": -914061504, "ts": 1716454224739169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739269, "dur": 1, "args": { "External id": 213251, "cbid": 251, "correlation": 213251 } }, { "ph": "f", "id": 213251, "pid": 76337, "tid": -914061504, "ts": 1716454224739269, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224801520, "dur": 91, "args": { "External id": 213253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213253, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213253, "pid": 5, "tid": 7, "ts": 1716454224801520, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739275, "dur": 13, "args": { "External id": 213253, "cbid": 211, "correlation": 213253 } }, { "ph": "s", "id": 213253, "pid": 76337, "tid": -914061504, "ts": 1716454224739275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224801612, "dur": 9, "args": { "External id": 213261, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213261, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213261, "pid": 5, "tid": 7, "ts": 1716454224801612, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739345, "dur": 12, "args": { "External id": 213261, "cbid": 211, "correlation": 213261 } }, { "ph": "s", "id": 213261, "pid": 76337, "tid": -914061504, "ts": 1716454224739345, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224801623, "dur": 21, "args": { "External id": 213269, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213269, "pid": 5, "tid": 7, "ts": 1716454224801623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739387, "dur": 9, "args": { "External id": 213269, "cbid": 211, "correlation": 213269 } }, { "ph": "s", "id": 213269, "pid": 76337, "tid": -914061504, "ts": 1716454224739387, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224801645, "dur": 17, "args": { "External id": 213291, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213291, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213291, "pid": 5, "tid": 7, "ts": 1716454224801645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739440, "dur": 10, "args": { "External id": 213291, "cbid": 211, "correlation": 213291 } }, { "ph": "s", "id": 213291, "pid": 76337, "tid": -914061504, "ts": 1716454224739440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739527, "dur": 1, "args": { "External id": 213307, "cbid": 251, "correlation": 213307 } }, { "ph": "f", "id": 213307, "pid": 76337, "tid": -914061504, "ts": 1716454224739527, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739532, "dur": 0, "args": { "External id": 213309, "cbid": 251, "correlation": 213309 } }, { "ph": "f", "id": 213309, "pid": 76337, "tid": -914061504, "ts": 1716454224739532, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224801664, "dur": 498, "args": { "External id": 213310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213310, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213310, "pid": 5, "tid": 7, "ts": 1716454224801664, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739534, "dur": 12, "args": { "External id": 213310, "cbid": 211, "correlation": 213310 } }, { "ph": "s", "id": 213310, "pid": 76337, "tid": -914061504, "ts": 1716454224739534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224802163, "dur": 65, "args": { "External id": 213318, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213318, "pid": 5, "tid": 7, "ts": 1716454224802163, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739599, "dur": 12, "args": { "External id": 213318, "cbid": 211, "correlation": 213318 } }, { "ph": "s", "id": 213318, "pid": 76337, "tid": -914061504, "ts": 1716454224739599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224802229, "dur": 68, "args": { "External id": 213326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213326, "pid": 5, "tid": 7, "ts": 1716454224802229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739631, "dur": 8, "args": { "External id": 213326, "cbid": 211, "correlation": 213326 } }, { "ph": "s", "id": 213326, "pid": 76337, "tid": -914061504, "ts": 1716454224739631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224739710, "dur": 1, "args": { "External id": 213342, "cbid": 251, "correlation": 213342 } }, { "ph": "f", "id": 213342, "pid": 76337, "tid": -914061504, "ts": 1716454224739710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224802299, "dur": 1, "args": { "External id": 213344, "device": 5, "context": 1, "stream": 7, "correlation": 213344, "bytes": 240, "memory bandwidth (GB/s)": 0.15306122448979592 } }, { "ph": "f", "id": 213344, "pid": 5, "tid": 7, "ts": 1716454224802299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224739715, "dur": 9, "args": { "External id": 213344, "cbid": 51, "correlation": 213344 } }, { "ph": "s", "id": 213344, "pid": 76337, "tid": -914061504, "ts": 1716454224739715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224802303, "dur": 271, "args": { "External id": 213345, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213345, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213345, "pid": 5, "tid": 7, "ts": 1716454224802303, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739726, "dur": 11, "args": { "External id": 213345, "cbid": 211, "correlation": 213345 } }, { "ph": "s", "id": 213345, "pid": 76337, "tid": -914061504, "ts": 1716454224739726, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224802575, "dur": 14, "args": { "External id": 213353, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213353, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213353, "pid": 5, "tid": 7, "ts": 1716454224802575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739768, "dur": 11, "args": { "External id": 213353, "cbid": 211, "correlation": 213353 } }, { "ph": "s", "id": 213353, "pid": 76337, "tid": -914061504, "ts": 1716454224739768, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224802591, "dur": 38, "args": { "External id": 213364, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213364, "pid": 5, "tid": 7, "ts": 1716454224802591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739837, "dur": 12, "args": { "External id": 213364, "cbid": 211, "correlation": 213364 } }, { "ph": "s", "id": 213364, "pid": 76337, "tid": -914061504, "ts": 1716454224739837, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224739903, "dur": 0, "args": { "External id": 213376, "cbid": 317, "correlation": 213376 } }, { "ph": "f", "id": 213376, "pid": 76337, "tid": -914061504, "ts": 1716454224739903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224739903, "dur": 0, "args": { "External id": 213377, "cbid": 203, "correlation": 213377 } }, { "ph": "f", "id": 213377, "pid": 76337, "tid": -914061504, "ts": 1716454224739903, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224739904, "dur": 0, "args": { "External id": 213378, "cbid": 205, "correlation": 213378 } }, { "ph": "f", "id": 213378, "pid": 76337, "tid": -914061504, "ts": 1716454224739904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224802630, "dur": 13, "args": { "External id": 213382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213382, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213382, "pid": 5, "tid": 7, "ts": 1716454224802630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739919, "dur": 13, "args": { "External id": 213382, "cbid": 211, "correlation": 213382 } }, { "ph": "s", "id": 213382, "pid": 76337, "tid": -914061504, "ts": 1716454224739919, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224802645, "dur": 4, "args": { "External id": 213384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213384, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213384, "pid": 5, "tid": 7, "ts": 1716454224802645, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739937, "dur": 6, "args": { "External id": 213384, "cbid": 211, "correlation": 213384 } }, { "ph": "s", "id": 213384, "pid": 76337, "tid": -914061504, "ts": 1716454224739937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224739946, "dur": 0, "args": { "External id": 213385, "cbid": 51, "correlation": 213385 } }, { "ph": "s", "id": 213385, "pid": 76337, "tid": -914061504, "ts": 1716454224739946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224802650, "dur": 98, "args": { "External id": 213386, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213386, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 213386, "pid": 5, "tid": 7, "ts": 1716454224802650, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739946, "dur": 5, "args": { "External id": 213386, "cbid": 211, "correlation": 213386 } }, { "ph": "s", "id": 213386, "pid": 76337, "tid": -914061504, "ts": 1716454224739946, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224802750, "dur": 16, "args": { "External id": 213391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213391, "pid": 5, "tid": 7, "ts": 1716454224802750, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224739982, "dur": 9, "args": { "External id": 213391, "cbid": 211, "correlation": 213391 } }, { "ph": "s", "id": 213391, "pid": 76337, "tid": -914061504, "ts": 1716454224739982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224802768, "dur": 12, "args": { "External id": 213399, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213399, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213399, "pid": 5, "tid": 7, "ts": 1716454224802768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740014, "dur": 8, "args": { "External id": 213399, "cbid": 211, "correlation": 213399 } }, { "ph": "s", "id": 213399, "pid": 76337, "tid": -914061504, "ts": 1716454224740014, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224802781, "dur": 24, "args": { "External id": 213408, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213408, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213408, "pid": 5, "tid": 7, "ts": 1716454224802781, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740054, "dur": 10, "args": { "External id": 213408, "cbid": 211, "correlation": 213408 } }, { "ph": "s", "id": 213408, "pid": 76337, "tid": -914061504, "ts": 1716454224740054, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224802806, "dur": 24, "args": { "External id": 213428, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213428, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 213428, "pid": 5, "tid": 7, "ts": 1716454224802806, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740126, "dur": 12, "args": { "External id": 213428, "cbid": 211, "correlation": 213428 } }, { "ph": "s", "id": 213428, "pid": 76337, "tid": -914061504, "ts": 1716454224740126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224802832, "dur": 5, "args": { "External id": 213440, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213440, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 213440, "pid": 5, "tid": 7, "ts": 1716454224802832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740148, "dur": 6, "args": { "External id": 213440, "cbid": 211, "correlation": 213440 } }, { "ph": "s", "id": 213440, "pid": 76337, "tid": -914061504, "ts": 1716454224740148, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224802838, "dur": 25, "args": { "External id": 213443, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213443, "registers per thread": 16, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213443, "pid": 5, "tid": 7, "ts": 1716454224802838, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740166, "dur": 6, "args": { "External id": 213443, "cbid": 211, "correlation": 213443 } }, { "ph": "s", "id": 213443, "pid": 76337, "tid": -914061504, "ts": 1716454224740166, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224802865, "dur": 18, "args": { "External id": 213452, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213452, "registers per thread": 24, "shared memory": 0, "blocks per SM": 72, "warps per SM": 288, "grid": [5760, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213452, "pid": 5, "tid": 7, "ts": 1716454224802865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740204, "dur": 9, "args": { "External id": 213452, "cbid": 211, "correlation": 213452 } }, { "ph": "s", "id": 213452, "pid": 76337, "tid": -914061504, "ts": 1716454224740204, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224740257, "dur": 0, "args": { "External id": 213462, "cbid": 317, "correlation": 213462 } }, { "ph": "f", "id": 213462, "pid": 76337, "tid": -914061504, "ts": 1716454224740257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224740257, "dur": 0, "args": { "External id": 213463, "cbid": 203, "correlation": 213463 } }, { "ph": "f", "id": 213463, "pid": 76337, "tid": -914061504, "ts": 1716454224740257, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224740258, "dur": 0, "args": { "External id": 213464, "cbid": 205, "correlation": 213464 } }, { "ph": "f", "id": 213464, "pid": 76337, "tid": -914061504, "ts": 1716454224740258, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224802884, "dur": 17, "args": { "External id": 213468, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213468, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213468, "pid": 5, "tid": 7, "ts": 1716454224802884, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740273, "dur": 12, "args": { "External id": 213468, "cbid": 211, "correlation": 213468 } }, { "ph": "s", "id": 213468, "pid": 76337, "tid": -914061504, "ts": 1716454224740273, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224802902, "dur": 242, "args": { "External id": 213470, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213470, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 960, "warps per SM": 7680, "grid": [1, 60, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213470, "pid": 5, "tid": 7, "ts": 1716454224802902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740288, "dur": 5, "args": { "External id": 213470, "cbid": 211, "correlation": 213470 } }, { "ph": "s", "id": 213470, "pid": 76337, "tid": -914061504, "ts": 1716454224740288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224803146, "dur": 1, "args": { "External id": 213472, "device": 5, "context": 1, "stream": 7, "correlation": 213472, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 213472, "pid": 5, "tid": 7, "ts": 1716454224803146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224740299, "dur": 8, "args": { "External id": 213472, "cbid": 51, "correlation": 213472 } }, { "ph": "s", "id": 213472, "pid": 76337, "tid": -914061504, "ts": 1716454224740299, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224803150, "dur": 813, "args": { "External id": 213473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213473, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 24, 2], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213473, "pid": 5, "tid": 7, "ts": 1716454224803150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740307, "dur": 6, "args": { "External id": 213473, "cbid": 211, "correlation": 213473 } }, { "ph": "s", "id": 213473, "pid": 76337, "tid": -914061504, "ts": 1716454224740307, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224803965, "dur": 14, "args": { "External id": 213475, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213475, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213475, "pid": 5, "tid": 7, "ts": 1716454224803965, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740318, "dur": 5, "args": { "External id": 213475, "cbid": 211, "correlation": 213475 } }, { "ph": "s", "id": 213475, "pid": 76337, "tid": -914061504, "ts": 1716454224740318, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224803980, "dur": 15, "args": { "External id": 213481, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213481, "pid": 5, "tid": 7, "ts": 1716454224803980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740346, "dur": 8, "args": { "External id": 213481, "cbid": 211, "correlation": 213481 } }, { "ph": "s", "id": 213481, "pid": 76337, "tid": -914061504, "ts": 1716454224740346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224803996, "dur": 4, "args": { "External id": 213489, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213489, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 213489, "pid": 5, "tid": 7, "ts": 1716454224803996, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740390, "dur": 9, "args": { "External id": 213489, "cbid": 211, "correlation": 213489 } }, { "ph": "s", "id": 213489, "pid": 76337, "tid": -914061504, "ts": 1716454224740390, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224740454, "dur": 1, "args": { "External id": 213505, "cbid": 251, "correlation": 213505 } }, { "ph": "f", "id": 213505, "pid": 76337, "tid": -914061504, "ts": 1716454224740454, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224740459, "dur": 0, "args": { "External id": 213507, "cbid": 251, "correlation": 213507 } }, { "ph": "f", "id": 213507, "pid": 76337, "tid": -914061504, "ts": 1716454224740459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224804001, "dur": 14, "args": { "External id": 213508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213508, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213508, "pid": 5, "tid": 7, "ts": 1716454224804001, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740462, "dur": 11, "args": { "External id": 213508, "cbid": 211, "correlation": 213508 } }, { "ph": "s", "id": 213508, "pid": 76337, "tid": -914061504, "ts": 1716454224740462, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224804016, "dur": 5, "args": { "External id": 213510, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213510, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213510, "pid": 5, "tid": 7, "ts": 1716454224804016, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740475, "dur": 5, "args": { "External id": 213510, "cbid": 211, "correlation": 213510 } }, { "ph": "s", "id": 213510, "pid": 76337, "tid": -914061504, "ts": 1716454224740475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224804022, "dur": 17, "args": { "External id": 213520, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213520, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213520, "pid": 5, "tid": 7, "ts": 1716454224804022, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740534, "dur": 12, "args": { "External id": 213520, "cbid": 211, "correlation": 213520 } }, { "ph": "s", "id": 213520, "pid": 76337, "tid": -914061504, "ts": 1716454224740534, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224804041, "dur": 17, "args": { "External id": 213540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213540, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 213540, "pid": 5, "tid": 7, "ts": 1716454224804041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740599, "dur": 11, "args": { "External id": 213540, "cbid": 211, "correlation": 213540 } }, { "ph": "s", "id": 213540, "pid": 76337, "tid": -914061504, "ts": 1716454224740599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224804060, "dur": 4, "args": { "External id": 213552, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213552, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 213552, "pid": 5, "tid": 7, "ts": 1716454224804060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740620, "dur": 6, "args": { "External id": 213552, "cbid": 211, "correlation": 213552 } }, { "ph": "s", "id": 213552, "pid": 76337, "tid": -914061504, "ts": 1716454224740620, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224804065, "dur": 16, "args": { "External id": 213555, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213555, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213555, "pid": 5, "tid": 7, "ts": 1716454224804065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740638, "dur": 6, "args": { "External id": 213555, "cbid": 211, "correlation": 213555 } }, { "ph": "s", "id": 213555, "pid": 76337, "tid": -914061504, "ts": 1716454224740638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224804083, "dur": 11, "args": { "External id": 213564, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213564, "registers per thread": 24, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213564, "pid": 5, "tid": 7, "ts": 1716454224804083, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740678, "dur": 10, "args": { "External id": 213564, "cbid": 211, "correlation": 213564 } }, { "ph": "s", "id": 213564, "pid": 76337, "tid": -914061504, "ts": 1716454224740678, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224740740, "dur": 0, "args": { "External id": 213574, "cbid": 317, "correlation": 213574 } }, { "ph": "f", "id": 213574, "pid": 76337, "tid": -914061504, "ts": 1716454224740740, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224740741, "dur": 0, "args": { "External id": 213575, "cbid": 203, "correlation": 213575 } }, { "ph": "f", "id": 213575, "pid": 76337, "tid": -914061504, "ts": 1716454224740741, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224740742, "dur": 0, "args": { "External id": 213576, "cbid": 205, "correlation": 213576 } }, { "ph": "f", "id": 213576, "pid": 76337, "tid": -914061504, "ts": 1716454224740742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224804095, "dur": 12, "args": { "External id": 213580, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213580, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213580, "pid": 5, "tid": 7, "ts": 1716454224804095, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740755, "dur": 12, "args": { "External id": 213580, "cbid": 211, "correlation": 213580 } }, { "ph": "s", "id": 213580, "pid": 76337, "tid": -914061504, "ts": 1716454224740755, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224804109, "dur": 164, "args": { "External id": 213582, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213582, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213582, "pid": 5, "tid": 7, "ts": 1716454224804109, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740769, "dur": 5, "args": { "External id": 213582, "cbid": 211, "correlation": 213582 } }, { "ph": "s", "id": 213582, "pid": 76337, "tid": -914061504, "ts": 1716454224740769, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224804275, "dur": 1, "args": { "External id": 213584, "device": 5, "context": 1, "stream": 7, "correlation": 213584, "bytes": 960, "memory bandwidth (GB/s)": 0.5882352941176471 } }, { "ph": "f", "id": 213584, "pid": 5, "tid": 7, "ts": 1716454224804275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224740780, "dur": 7, "args": { "External id": 213584, "cbid": 51, "correlation": 213584 } }, { "ph": "s", "id": 213584, "pid": 76337, "tid": -914061504, "ts": 1716454224740780, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224804279, "dur": 653, "args": { "External id": 213585, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213585, "registers per thread": 254, "shared memory": 41984, "blocks per SM": 4.5, "warps per SM": 36, "grid": [5, 24, 3], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213585, "pid": 5, "tid": 7, "ts": 1716454224804279, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740788, "dur": 6, "args": { "External id": 213585, "cbid": 211, "correlation": 213585 } }, { "ph": "s", "id": 213585, "pid": 76337, "tid": -914061504, "ts": 1716454224740788, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224804933, "dur": 12, "args": { "External id": 213587, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213587, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213587, "pid": 5, "tid": 7, "ts": 1716454224804933, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740798, "dur": 5, "args": { "External id": 213587, "cbid": 211, "correlation": 213587 } }, { "ph": "s", "id": 213587, "pid": 76337, "tid": -914061504, "ts": 1716454224740798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224804946, "dur": 15, "args": { "External id": 213593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213593, "pid": 5, "tid": 7, "ts": 1716454224804946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740827, "dur": 9, "args": { "External id": 213593, "cbid": 211, "correlation": 213593 } }, { "ph": "s", "id": 213593, "pid": 76337, "tid": -914061504, "ts": 1716454224740827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224740885, "dur": 0, "args": { "External id": 213603, "cbid": 317, "correlation": 213603 } }, { "ph": "f", "id": 213603, "pid": 76337, "tid": -914061504, "ts": 1716454224740885, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224740886, "dur": 0, "args": { "External id": 213604, "cbid": 203, "correlation": 213604 } }, { "ph": "f", "id": 213604, "pid": 76337, "tid": -914061504, "ts": 1716454224740886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224740887, "dur": 0, "args": { "External id": 213605, "cbid": 205, "correlation": 213605 } }, { "ph": "f", "id": 213605, "pid": 76337, "tid": -914061504, "ts": 1716454224740887, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224804963, "dur": 17, "args": { "External id": 213609, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213609, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 36, "warps per SM": 288, "grid": [6, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213609, "pid": 5, "tid": 7, "ts": 1716454224804963, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740901, "dur": 11, "args": { "External id": 213609, "cbid": 211, "correlation": 213609 } }, { "ph": "s", "id": 213609, "pid": 76337, "tid": -914061504, "ts": 1716454224740901, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224804981, "dur": 4, "args": { "External id": 213611, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213611, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213611, "pid": 5, "tid": 7, "ts": 1716454224804981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740916, "dur": 6, "args": { "External id": 213611, "cbid": 211, "correlation": 213611 } }, { "ph": "s", "id": 213611, "pid": 76337, "tid": -914061504, "ts": 1716454224740916, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224740925, "dur": 0, "args": { "External id": 213612, "cbid": 51, "correlation": 213612 } }, { "ph": "s", "id": 213612, "pid": 76337, "tid": -914061504, "ts": 1716454224740925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224804986, "dur": 132, "args": { "External id": 213613, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213613, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 213613, "pid": 5, "tid": 7, "ts": 1716454224804986, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740926, "dur": 5, "args": { "External id": 213613, "cbid": 211, "correlation": 213613 } }, { "ph": "s", "id": 213613, "pid": 76337, "tid": -914061504, "ts": 1716454224740926, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224805120, "dur": 16, "args": { "External id": 213618, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213618, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213618, "pid": 5, "tid": 7, "ts": 1716454224805120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740952, "dur": 8, "args": { "External id": 213618, "cbid": 211, "correlation": 213618 } }, { "ph": "s", "id": 213618, "pid": 76337, "tid": -914061504, "ts": 1716454224740952, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224805137, "dur": 12, "args": { "External id": 213626, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213626, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213626, "pid": 5, "tid": 7, "ts": 1716454224805137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224740991, "dur": 9, "args": { "External id": 213626, "cbid": 211, "correlation": 213626 } }, { "ph": "s", "id": 213626, "pid": 76337, "tid": -914061504, "ts": 1716454224740991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224805150, "dur": 10, "args": { "External id": 213634, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213634, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213634, "pid": 5, "tid": 7, "ts": 1716454224805150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741022, "dur": 8, "args": { "External id": 213634, "cbid": 211, "correlation": 213634 } }, { "ph": "s", "id": 213634, "pid": 76337, "tid": -914061504, "ts": 1716454224741022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224805162, "dur": 19, "args": { "External id": 213654, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213654, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 213654, "pid": 5, "tid": 7, "ts": 1716454224805162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741105, "dur": 12, "args": { "External id": 213654, "cbid": 211, "correlation": 213654 } }, { "ph": "s", "id": 213654, "pid": 76337, "tid": -914061504, "ts": 1716454224741105, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224805182, "dur": 4, "args": { "External id": 213666, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213666, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 213666, "pid": 5, "tid": 7, "ts": 1716454224805182, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741126, "dur": 6, "args": { "External id": 213666, "cbid": 211, "correlation": 213666 } }, { "ph": "s", "id": 213666, "pid": 76337, "tid": -914061504, "ts": 1716454224741126, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224805188, "dur": 17, "args": { "External id": 213669, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213669, "pid": 5, "tid": 7, "ts": 1716454224805188, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741145, "dur": 6, "args": { "External id": 213669, "cbid": 211, "correlation": 213669 } }, { "ph": "s", "id": 213669, "pid": 76337, "tid": -914061504, "ts": 1716454224741145, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224741201, "dur": 0, "args": { "External id": 213680, "cbid": 317, "correlation": 213680 } }, { "ph": "f", "id": 213680, "pid": 76337, "tid": -914061504, "ts": 1716454224741201, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224741202, "dur": 0, "args": { "External id": 213681, "cbid": 203, "correlation": 213681 } }, { "ph": "f", "id": 213681, "pid": 76337, "tid": -914061504, "ts": 1716454224741202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224741203, "dur": 0, "args": { "External id": 213682, "cbid": 205, "correlation": 213682 } }, { "ph": "f", "id": 213682, "pid": 76337, "tid": -914061504, "ts": 1716454224741203, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224805206, "dur": 13, "args": { "External id": 213686, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213686, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213686, "pid": 5, "tid": 7, "ts": 1716454224805206, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741216, "dur": 12, "args": { "External id": 213686, "cbid": 211, "correlation": 213686 } }, { "ph": "s", "id": 213686, "pid": 76337, "tid": -914061504, "ts": 1716454224741216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224805220, "dur": 3, "args": { "External id": 213688, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213688, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213688, "pid": 5, "tid": 7, "ts": 1716454224805220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741232, "dur": 6, "args": { "External id": 213688, "cbid": 211, "correlation": 213688 } }, { "ph": "s", "id": 213688, "pid": 76337, "tid": -914061504, "ts": 1716454224741232, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224741241, "dur": 0, "args": { "External id": 213689, "cbid": 51, "correlation": 213689 } }, { "ph": "s", "id": 213689, "pid": 76337, "tid": -914061504, "ts": 1716454224741241, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224805224, "dur": 91, "args": { "External id": 213690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213690, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 213690, "pid": 5, "tid": 7, "ts": 1716454224805224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741242, "dur": 5, "args": { "External id": 213690, "cbid": 211, "correlation": 213690 } }, { "ph": "s", "id": 213690, "pid": 76337, "tid": -914061504, "ts": 1716454224741242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224805317, "dur": 16, "args": { "External id": 213695, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213695, "pid": 5, "tid": 7, "ts": 1716454224805317, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741268, "dur": 8, "args": { "External id": 213695, "cbid": 211, "correlation": 213695 } }, { "ph": "s", "id": 213695, "pid": 76337, "tid": -914061504, "ts": 1716454224741268, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224805334, "dur": 85, "args": { "External id": 213704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213704, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213704, "pid": 5, "tid": 7, "ts": 1716454224805334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741350, "dur": 13, "args": { "External id": 213704, "cbid": 211, "correlation": 213704 } }, { "ph": "s", "id": 213704, "pid": 76337, "tid": -914061504, "ts": 1716454224741350, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224805421, "dur": 30, "args": { "External id": 213726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213726, "registers per thread": 32, "shared memory": 24, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [3072, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213726, "pid": 5, "tid": 7, "ts": 1716454224805421, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741405, "dur": 10, "args": { "External id": 213726, "cbid": 211, "correlation": 213726 } }, { "ph": "s", "id": 213726, "pid": 76337, "tid": -914061504, "ts": 1716454224741405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224741496, "dur": 1, "args": { "External id": 213737, "cbid": 251, "correlation": 213737 } }, { "ph": "f", "id": 213737, "pid": 76337, "tid": -914061504, "ts": 1716454224741496, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224805452, "dur": 166, "args": { "External id": 213738, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213738, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213738, "pid": 5, "tid": 7, "ts": 1716454224805452, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741502, "dur": 13, "args": { "External id": 213738, "cbid": 211, "correlation": 213738 } }, { "ph": "s", "id": 213738, "pid": 76337, "tid": -914061504, "ts": 1716454224741502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224741571, "dur": 1, "args": { "External id": 213749, "cbid": 251, "correlation": 213749 } }, { "ph": "f", "id": 213749, "pid": 76337, "tid": -914061504, "ts": 1716454224741571, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224805619, "dur": 159, "args": { "External id": 213750, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213750, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213750, "pid": 5, "tid": 7, "ts": 1716454224805619, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741575, "dur": 12, "args": { "External id": 213750, "cbid": 211, "correlation": 213750 } }, { "ph": "s", "id": 213750, "pid": 76337, "tid": -914061504, "ts": 1716454224741575, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224741642, "dur": 1, "args": { "External id": 213761, "cbid": 251, "correlation": 213761 } }, { "ph": "f", "id": 213761, "pid": 76337, "tid": -914061504, "ts": 1716454224741642, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224805780, "dur": 161, "args": { "External id": 213762, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213762, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213762, "pid": 5, "tid": 7, "ts": 1716454224805780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741646, "dur": 11, "args": { "External id": 213762, "cbid": 211, "correlation": 213762 } }, { "ph": "s", "id": 213762, "pid": 76337, "tid": -914061504, "ts": 1716454224741646, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224805942, "dur": 337, "args": { "External id": 213787, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213787, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 9.6, "warps per SM": 38.4, "grid": [12, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213787, "pid": 5, "tid": 7, "ts": 1716454224805942, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741730, "dur": 13, "args": { "External id": 213787, "cbid": 211, "correlation": 213787 } }, { "ph": "s", "id": 213787, "pid": 76337, "tid": -914061504, "ts": 1716454224741730, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224741830, "dur": 1, "args": { "External id": 213805, "cbid": 251, "correlation": 213805 } }, { "ph": "f", "id": 213805, "pid": 76337, "tid": -914061504, "ts": 1716454224741830, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224806280, "dur": 166, "args": { "External id": 213807, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213807, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [10, 24, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213807, "pid": 5, "tid": 7, "ts": 1716454224806280, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741836, "dur": 14, "args": { "External id": 213807, "cbid": 211, "correlation": 213807 } }, { "ph": "s", "id": 213807, "pid": 76337, "tid": -914061504, "ts": 1716454224741836, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224806448, "dur": 19, "args": { "External id": 213815, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213815, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213815, "pid": 5, "tid": 7, "ts": 1716454224806448, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741906, "dur": 12, "args": { "External id": 213815, "cbid": 211, "correlation": 213815 } }, { "ph": "s", "id": 213815, "pid": 76337, "tid": -914061504, "ts": 1716454224741906, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224806468, "dur": 28, "args": { "External id": 213823, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213823, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213823, "pid": 5, "tid": 7, "ts": 1716454224806468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224741945, "dur": 8, "args": { "External id": 213823, "cbid": 211, "correlation": 213823 } }, { "ph": "s", "id": 213823, "pid": 76337, "tid": -914061504, "ts": 1716454224741945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224806497, "dur": 18, "args": { "External id": 213834, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213834, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213834, "pid": 5, "tid": 7, "ts": 1716454224806497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742027, "dur": 14, "args": { "External id": 213834, "cbid": 211, "correlation": 213834 } }, { "ph": "s", "id": 213834, "pid": 76337, "tid": -914061504, "ts": 1716454224742027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224806517, "dur": 16, "args": { "External id": 213856, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213856, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213856, "pid": 5, "tid": 7, "ts": 1716454224806517, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742059, "dur": 7, "args": { "External id": 213856, "cbid": 211, "correlation": 213856 } }, { "ph": "s", "id": 213856, "pid": 76337, "tid": -914061504, "ts": 1716454224742059, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742146, "dur": 1, "args": { "External id": 213867, "cbid": 251, "correlation": 213867 } }, { "ph": "f", "id": 213867, "pid": 76337, "tid": -914061504, "ts": 1716454224742146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224806535, "dur": 90, "args": { "External id": 213868, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213868, "registers per thread": 250, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213868, "pid": 5, "tid": 7, "ts": 1716454224806535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742151, "dur": 13, "args": { "External id": 213868, "cbid": 211, "correlation": 213868 } }, { "ph": "s", "id": 213868, "pid": 76337, "tid": -914061504, "ts": 1716454224742151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742220, "dur": 1, "args": { "External id": 213879, "cbid": 251, "correlation": 213879 } }, { "ph": "f", "id": 213879, "pid": 76337, "tid": -914061504, "ts": 1716454224742220, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742224, "dur": 0, "args": { "External id": 213880, "cbid": 251, "correlation": 213880 } }, { "ph": "f", "id": 213880, "pid": 76337, "tid": -914061504, "ts": 1716454224742224, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224806626, "dur": 12, "args": { "External id": 213881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213881, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213881, "pid": 5, "tid": 7, "ts": 1716454224806626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742226, "dur": 12, "args": { "External id": 213881, "cbid": 211, "correlation": 213881 } }, { "ph": "s", "id": 213881, "pid": 76337, "tid": -914061504, "ts": 1716454224742226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224806640, "dur": 6, "args": { "External id": 213883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213883, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213883, "pid": 5, "tid": 7, "ts": 1716454224806640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742239, "dur": 7, "args": { "External id": 213883, "cbid": 211, "correlation": 213883 } }, { "ph": "s", "id": 213883, "pid": 76337, "tid": -914061504, "ts": 1716454224742239, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742298, "dur": 1, "args": { "External id": 213894, "cbid": 251, "correlation": 213894 } }, { "ph": "f", "id": 213894, "pid": 76337, "tid": -914061504, "ts": 1716454224742298, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742301, "dur": 0, "args": { "External id": 213895, "cbid": 251, "correlation": 213895 } }, { "ph": "f", "id": 213895, "pid": 76337, "tid": -914061504, "ts": 1716454224742301, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224806647, "dur": 9, "args": { "External id": 213896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213896, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 2, "warps per SM": 8, "grid": [8, 5, 4], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213896, "pid": 5, "tid": 7, "ts": 1716454224806647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742302, "dur": 11, "args": { "External id": 213896, "cbid": 211, "correlation": 213896 } }, { "ph": "s", "id": 213896, "pid": 76337, "tid": -914061504, "ts": 1716454224742302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224806657, "dur": 3, "args": { "External id": 213898, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213898, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 8, "grid": [40, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213898, "pid": 5, "tid": 7, "ts": 1716454224806657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742315, "dur": 5, "args": { "External id": 213898, "cbid": 211, "correlation": 213898 } }, { "ph": "s", "id": 213898, "pid": 76337, "tid": -914061504, "ts": 1716454224742315, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224806661, "dur": 55, "args": { "External id": 213923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213923, "registers per thread": 168, "shared memory": 33280, "blocks per SM": 4.8, "warps per SM": 19.2, "grid": [6, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 213923, "pid": 5, "tid": 7, "ts": 1716454224806661, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742392, "dur": 13, "args": { "External id": 213923, "cbid": 211, "correlation": 213923 } }, { "ph": "s", "id": 213923, "pid": 76337, "tid": -914061504, "ts": 1716454224742392, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742491, "dur": 1, "args": { "External id": 213941, "cbid": 251, "correlation": 213941 } }, { "ph": "f", "id": 213941, "pid": 76337, "tid": -914061504, "ts": 1716454224742491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224806718, "dur": 92, "args": { "External id": 213943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213943, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 0.75, "warps per SM": 6, "grid": [5, 12, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 213943, "pid": 5, "tid": 7, "ts": 1716454224806718, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742498, "dur": 14, "args": { "External id": 213943, "cbid": 211, "correlation": 213943 } }, { "ph": "s", "id": 213943, "pid": 76337, "tid": -914061504, "ts": 1716454224742498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224806812, "dur": 10, "args": { "External id": 213951, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213951, "registers per thread": 17, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213951, "pid": 5, "tid": 7, "ts": 1716454224806812, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742567, "dur": 12, "args": { "External id": 213951, "cbid": 211, "correlation": 213951 } }, { "ph": "s", "id": 213951, "pid": 76337, "tid": -914061504, "ts": 1716454224742567, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224806823, "dur": 22, "args": { "External id": 213959, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213959, "pid": 5, "tid": 7, "ts": 1716454224806823, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742608, "dur": 9, "args": { "External id": 213959, "cbid": 211, "correlation": 213959 } }, { "ph": "s", "id": 213959, "pid": 76337, "tid": -914061504, "ts": 1716454224742608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224806846, "dur": 18, "args": { "External id": 213981, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 213981, "registers per thread": 32, "shared memory": 24, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [1536, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 213981, "pid": 5, "tid": 7, "ts": 1716454224806846, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742659, "dur": 10, "args": { "External id": 213981, "cbid": 211, "correlation": 213981 } }, { "ph": "s", "id": 213981, "pid": 76337, "tid": -914061504, "ts": 1716454224742659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742746, "dur": 1, "args": { "External id": 213997, "cbid": 251, "correlation": 213997 } }, { "ph": "f", "id": 213997, "pid": 76337, "tid": -914061504, "ts": 1716454224742746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742751, "dur": 0, "args": { "External id": 213999, "cbid": 251, "correlation": 213999 } }, { "ph": "f", "id": 213999, "pid": 76337, "tid": -914061504, "ts": 1716454224742751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_tensorop_f16_s884gemm_relu_f16_128x128_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224806865, "dur": 499, "args": { "External id": 214000, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214000, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214000, "pid": 5, "tid": 7, "ts": 1716454224806865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742753, "dur": 12, "args": { "External id": 214000, "cbid": 211, "correlation": 214000 } }, { "ph": "s", "id": 214000, "pid": 76337, "tid": -914061504, "ts": 1716454224742753, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224807365, "dur": 66, "args": { "External id": 214008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214008, "pid": 5, "tid": 7, "ts": 1716454224807365, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742818, "dur": 12, "args": { "External id": 214008, "cbid": 211, "correlation": 214008 } }, { "ph": "s", "id": 214008, "pid": 76337, "tid": -914061504, "ts": 1716454224742818, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224807433, "dur": 65, "args": { "External id": 214016, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214016, "pid": 5, "tid": 7, "ts": 1716454224807433, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742849, "dur": 8, "args": { "External id": 214016, "cbid": 211, "correlation": 214016 } }, { "ph": "s", "id": 214016, "pid": 76337, "tid": -914061504, "ts": 1716454224742849, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224742927, "dur": 1, "args": { "External id": 214032, "cbid": 251, "correlation": 214032 } }, { "ph": "f", "id": 214032, "pid": 76337, "tid": -914061504, "ts": 1716454224742927, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 5, "tid": 7, "ts": 1716454224807500, "dur": 1, "args": { "External id": 214034, "device": 5, "context": 1, "stream": 7, "correlation": 214034, "bytes": 240, "memory bandwidth (GB/s)": 0.15625 } }, { "ph": "f", "id": 214034, "pid": 5, "tid": 7, "ts": 1716454224807500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224742932, "dur": 9, "args": { "External id": 214034, "cbid": 51, "correlation": 214034 } }, { "ph": "s", "id": 214034, "pid": 76337, "tid": -914061504, "ts": 1716454224742932, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_256x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224807504, "dur": 272, "args": { "External id": 214035, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214035, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 12, 4], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 214035, "pid": 5, "tid": 7, "ts": 1716454224807504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742943, "dur": 12, "args": { "External id": 214035, "cbid": 211, "correlation": 214035 } }, { "ph": "s", "id": 214035, "pid": 76337, "tid": -914061504, "ts": 1716454224742943, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224807778, "dur": 14, "args": { "External id": 214043, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214043, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214043, "pid": 5, "tid": 7, "ts": 1716454224807778, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224742993, "dur": 12, "args": { "External id": 214043, "cbid": 211, "correlation": 214043 } }, { "ph": "s", "id": 214043, "pid": 76337, "tid": -914061504, "ts": 1716454224742993, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224807793, "dur": 38, "args": { "External id": 214054, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214054, "pid": 5, "tid": 7, "ts": 1716454224807793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743062, "dur": 12, "args": { "External id": 214054, "cbid": 211, "correlation": 214054 } }, { "ph": "s", "id": 214054, "pid": 76337, "tid": -914061504, "ts": 1716454224743062, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224743127, "dur": 0, "args": { "External id": 214066, "cbid": 317, "correlation": 214066 } }, { "ph": "f", "id": 214066, "pid": 76337, "tid": -914061504, "ts": 1716454224743127, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224743128, "dur": 0, "args": { "External id": 214067, "cbid": 203, "correlation": 214067 } }, { "ph": "f", "id": 214067, "pid": 76337, "tid": -914061504, "ts": 1716454224743128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224743128, "dur": 0, "args": { "External id": 214068, "cbid": 205, "correlation": 214068 } }, { "ph": "f", "id": 214068, "pid": 76337, "tid": -914061504, "ts": 1716454224743128, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224807832, "dur": 13, "args": { "External id": 214072, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214072, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 24, "warps per SM": 192, "grid": [6, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214072, "pid": 5, "tid": 7, "ts": 1716454224807832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743143, "dur": 13, "args": { "External id": 214072, "cbid": 211, "correlation": 214072 } }, { "ph": "s", "id": 214072, "pid": 76337, "tid": -914061504, "ts": 1716454224743143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224807847, "dur": 4, "args": { "External id": 214074, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214074, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.0125, "warps per SM": 0.1, "grid": [1, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 214074, "pid": 5, "tid": 7, "ts": 1716454224807847, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743161, "dur": 6, "args": { "External id": 214074, "cbid": 211, "correlation": 214074 } }, { "ph": "s", "id": 214074, "pid": 76337, "tid": -914061504, "ts": 1716454224743161, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224743169, "dur": 0, "args": { "External id": 214075, "cbid": 51, "correlation": 214075 } }, { "ph": "s", "id": 214075, "pid": 76337, "tid": -914061504, "ts": 1716454224743169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224807852, "dur": 98, "args": { "External id": 214076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214076, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 1.5, "warps per SM": 6, "grid": [6, 20, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 214076, "pid": 5, "tid": 7, "ts": 1716454224807852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743170, "dur": 5, "args": { "External id": 214076, "cbid": 211, "correlation": 214076 } }, { "ph": "s", "id": 214076, "pid": 76337, "tid": -914061504, "ts": 1716454224743170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224807951, "dur": 17, "args": { "External id": 214081, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214081, "registers per thread": 16, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214081, "pid": 5, "tid": 7, "ts": 1716454224807951, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743197, "dur": 9, "args": { "External id": 214081, "cbid": 211, "correlation": 214081 } }, { "ph": "s", "id": 214081, "pid": 76337, "tid": -914061504, "ts": 1716454224743197, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224807969, "dur": 12, "args": { "External id": 214089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214089, "registers per thread": 19, "shared memory": 0, "blocks per SM": 48, "warps per SM": 192, "grid": [3840, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214089, "pid": 5, "tid": 7, "ts": 1716454224807969, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743229, "dur": 8, "args": { "External id": 214089, "cbid": 211, "correlation": 214089 } }, { "ph": "s", "id": 214089, "pid": 76337, "tid": -914061504, "ts": 1716454224743229, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224807983, "dur": 57, "args": { "External id": 214100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214100, "registers per thread": 24, "shared memory": 0, "blocks per SM": 64, "warps per SM": 1024, "grid": [2, 1, 2560], "block": [16, 32, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214100, "pid": 5, "tid": 7, "ts": 1716454224807983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743293, "dur": 12, "args": { "External id": 214100, "cbid": 211, "correlation": 214100 } }, { "ph": "s", "id": 214100, "pid": 76337, "tid": -914061504, "ts": 1716454224743293, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224743350, "dur": 0, "args": { "External id": 214110, "cbid": 317, "correlation": 214110 } }, { "ph": "f", "id": 214110, "pid": 76337, "tid": -914061504, "ts": 1716454224743350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224743351, "dur": 0, "args": { "External id": 214111, "cbid": 203, "correlation": 214111 } }, { "ph": "f", "id": 214111, "pid": 76337, "tid": -914061504, "ts": 1716454224743351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224743351, "dur": 0, "args": { "External id": 214112, "cbid": 205, "correlation": 214112 } }, { "ph": "f", "id": 214112, "pid": 76337, "tid": -914061504, "ts": 1716454224743351, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224808041, "dur": 39, "args": { "External id": 214116, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214116, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214116, "pid": 5, "tid": 7, "ts": 1716454224808041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743368, "dur": 11, "args": { "External id": 214116, "cbid": 211, "correlation": 214116 } }, { "ph": "s", "id": 214116, "pid": 76337, "tid": -914061504, "ts": 1716454224743368, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224808081, "dur": 164, "args": { "External id": 214118, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214118, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 640, "warps per SM": 5120, "grid": [1, 40, 1280], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214118, "pid": 5, "tid": 7, "ts": 1716454224808081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743382, "dur": 5, "args": { "External id": 214118, "cbid": 211, "correlation": 214118 } }, { "ph": "s", "id": 214118, "pid": 76337, "tid": -914061504, "ts": 1716454224743382, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224808246, "dur": 1974, "args": { "External id": 214120, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214120, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 6, "warps per SM": 24, "grid": [10, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214120, "pid": 5, "tid": 7, "ts": 1716454224808246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743393, "dur": 8, "args": { "External id": 214120, "cbid": 211, "correlation": 214120 } }, { "ph": "s", "id": 214120, "pid": 76337, "tid": -914061504, "ts": 1716454224743393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224810222, "dur": 38, "args": { "External id": 214122, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214122, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214122, "pid": 5, "tid": 7, "ts": 1716454224810222, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743405, "dur": 5, "args": { "External id": 214122, "cbid": 211, "correlation": 214122 } }, { "ph": "s", "id": 214122, "pid": 76337, "tid": -914061504, "ts": 1716454224743405, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224810261, "dur": 59, "args": { "External id": 214128, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214128, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214128, "pid": 5, "tid": 7, "ts": 1716454224810261, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743432, "dur": 8, "args": { "External id": 214128, "cbid": 211, "correlation": 214128 } }, { "ph": "s", "id": 214128, "pid": 76337, "tid": -914061504, "ts": 1716454224743432, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224810322, "dur": 86, "args": { "External id": 214137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214137, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214137, "pid": 5, "tid": 7, "ts": 1716454224810322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743522, "dur": 13, "args": { "External id": 214137, "cbid": 211, "correlation": 214137 } }, { "ph": "s", "id": 214137, "pid": 76337, "tid": -914061504, "ts": 1716454224743522, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224810409, "dur": 74, "args": { "External id": 214157, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214157, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 214157, "pid": 5, "tid": 7, "ts": 1716454224810409, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743610, "dur": 12, "args": { "External id": 214157, "cbid": 211, "correlation": 214157 } }, { "ph": "s", "id": 214157, "pid": 76337, "tid": -914061504, "ts": 1716454224743610, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224810485, "dur": 5, "args": { "External id": 214169, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214169, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.75, "warps per SM": 6, "grid": [60, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 9 } }, { "ph": "f", "id": 214169, "pid": 5, "tid": 7, "ts": 1716454224810485, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743632, "dur": 6, "args": { "External id": 214169, "cbid": 211, "correlation": 214169 } }, { "ph": "s", "id": 214169, "pid": 76337, "tid": -914061504, "ts": 1716454224743632, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224810491, "dur": 83, "args": { "External id": 214172, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214172, "registers per thread": 16, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214172, "pid": 5, "tid": 7, "ts": 1716454224810491, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743652, "dur": 7, "args": { "External id": 214172, "cbid": 211, "correlation": 214172 } }, { "ph": "s", "id": 214172, "pid": 76337, "tid": -914061504, "ts": 1716454224743652, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224810576, "dur": 53, "args": { "External id": 214181, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214181, "registers per thread": 24, "shared memory": 0, "blocks per SM": 288, "warps per SM": 1152, "grid": [23040, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214181, "pid": 5, "tid": 7, "ts": 1716454224810576, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743692, "dur": 10, "args": { "External id": 214181, "cbid": 211, "correlation": 214181 } }, { "ph": "s", "id": 214181, "pid": 76337, "tid": -914061504, "ts": 1716454224743692, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224743746, "dur": 0, "args": { "External id": 214191, "cbid": 317, "correlation": 214191 } }, { "ph": "f", "id": 214191, "pid": 76337, "tid": -914061504, "ts": 1716454224743746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224743746, "dur": 0, "args": { "External id": 214192, "cbid": 203, "correlation": 214192 } }, { "ph": "f", "id": 214192, "pid": 76337, "tid": -914061504, "ts": 1716454224743746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224743747, "dur": 0, "args": { "External id": 214193, "cbid": 205, "correlation": 214193 } }, { "ph": "f", "id": 214193, "pid": 76337, "tid": -914061504, "ts": 1716454224743747, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224810630, "dur": 58, "args": { "External id": 214197, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214197, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214197, "pid": 5, "tid": 7, "ts": 1716454224810630, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743764, "dur": 12, "args": { "External id": 214197, "cbid": 211, "correlation": 214197 } }, { "ph": "s", "id": 214197, "pid": 76337, "tid": -914061504, "ts": 1716454224743764, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224810689, "dur": 123, "args": { "External id": 214199, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214199, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 480, "warps per SM": 3840, "grid": [1, 60, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214199, "pid": 5, "tid": 7, "ts": 1716454224810689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743779, "dur": 5, "args": { "External id": 214199, "cbid": 211, "correlation": 214199 } }, { "ph": "s", "id": 214199, "pid": 76337, "tid": -914061504, "ts": 1716454224743779, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224810814, "dur": 1905, "args": { "External id": 214201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214201, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214201, "pid": 5, "tid": 7, "ts": 1716454224810814, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743790, "dur": 6, "args": { "External id": 214201, "cbid": 211, "correlation": 214201 } }, { "ph": "s", "id": 214201, "pid": 76337, "tid": -914061504, "ts": 1716454224743790, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224812720, "dur": 21, "args": { "External id": 214203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214203, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214203, "pid": 5, "tid": 7, "ts": 1716454224812720, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743800, "dur": 5, "args": { "External id": 214203, "cbid": 211, "correlation": 214203 } }, { "ph": "s", "id": 214203, "pid": 76337, "tid": -914061504, "ts": 1716454224743800, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224812742, "dur": 33, "args": { "External id": 214209, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214209, "pid": 5, "tid": 7, "ts": 1716454224812742, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743827, "dur": 8, "args": { "External id": 214209, "cbid": 211, "correlation": 214209 } }, { "ph": "s", "id": 214209, "pid": 76337, "tid": -914061504, "ts": 1716454224743827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224812777, "dur": 3, "args": { "External id": 214217, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214217, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 214217, "pid": 5, "tid": 7, "ts": 1716454224812777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743871, "dur": 9, "args": { "External id": 214217, "cbid": 211, "correlation": 214217 } }, { "ph": "s", "id": 214217, "pid": 76337, "tid": -914061504, "ts": 1716454224743871, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224743937, "dur": 1, "args": { "External id": 214233, "cbid": 251, "correlation": 214233 } }, { "ph": "f", "id": 214233, "pid": 76337, "tid": -914061504, "ts": 1716454224743937, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224743943, "dur": 0, "args": { "External id": 214235, "cbid": 251, "correlation": 214235 } }, { "ph": "f", "id": 214235, "pid": 76337, "tid": -914061504, "ts": 1716454224743943, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224812782, "dur": 12, "args": { "External id": 214236, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214236, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 214236, "pid": 5, "tid": 7, "ts": 1716454224812782, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743945, "dur": 11, "args": { "External id": 214236, "cbid": 211, "correlation": 214236 } }, { "ph": "s", "id": 214236, "pid": 76337, "tid": -914061504, "ts": 1716454224743945, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224812795, "dur": 5, "args": { "External id": 214238, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214238, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 214238, "pid": 5, "tid": 7, "ts": 1716454224812795, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224743959, "dur": 5, "args": { "External id": 214238, "cbid": 211, "correlation": 214238 } }, { "ph": "s", "id": 214238, "pid": 76337, "tid": -914061504, "ts": 1716454224743959, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224812802, "dur": 28, "args": { "External id": 214248, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214248, "pid": 5, "tid": 7, "ts": 1716454224812802, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744025, "dur": 13, "args": { "External id": 214248, "cbid": 211, "correlation": 214248 } }, { "ph": "s", "id": 214248, "pid": 76337, "tid": -914061504, "ts": 1716454224744025, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224812832, "dur": 32, "args": { "External id": 214268, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214268, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 214268, "pid": 5, "tid": 7, "ts": 1716454224812832, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744091, "dur": 11, "args": { "External id": 214268, "cbid": 211, "correlation": 214268 } }, { "ph": "s", "id": 214268, "pid": 76337, "tid": -914061504, "ts": 1716454224744091, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224812865, "dur": 4, "args": { "External id": 214280, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214280, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 214280, "pid": 5, "tid": 7, "ts": 1716454224812865, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744112, "dur": 6, "args": { "External id": 214280, "cbid": 211, "correlation": 214280 } }, { "ph": "s", "id": 214280, "pid": 76337, "tid": -914061504, "ts": 1716454224744112, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224812871, "dur": 29, "args": { "External id": 214283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214283, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214283, "pid": 5, "tid": 7, "ts": 1716454224812871, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744129, "dur": 6, "args": { "External id": 214283, "cbid": 211, "correlation": 214283 } }, { "ph": "s", "id": 214283, "pid": 76337, "tid": -914061504, "ts": 1716454224744129, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224812901, "dur": 20, "args": { "External id": 214292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214292, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214292, "pid": 5, "tid": 7, "ts": 1716454224812901, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744169, "dur": 9, "args": { "External id": 214292, "cbid": 211, "correlation": 214292 } }, { "ph": "s", "id": 214292, "pid": 76337, "tid": -914061504, "ts": 1716454224744169, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224744232, "dur": 0, "args": { "External id": 214302, "cbid": 317, "correlation": 214302 } }, { "ph": "f", "id": 214302, "pid": 76337, "tid": -914061504, "ts": 1716454224744232, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224744233, "dur": 0, "args": { "External id": 214303, "cbid": 203, "correlation": 214303 } }, { "ph": "f", "id": 214303, "pid": 76337, "tid": -914061504, "ts": 1716454224744233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224744234, "dur": 0, "args": { "External id": 214304, "cbid": 205, "correlation": 214304 } }, { "ph": "f", "id": 214304, "pid": 76337, "tid": -914061504, "ts": 1716454224744234, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224812923, "dur": 22, "args": { "External id": 214308, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214308, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214308, "pid": 5, "tid": 7, "ts": 1716454224812923, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744249, "dur": 12, "args": { "External id": 214308, "cbid": 211, "correlation": 214308 } }, { "ph": "s", "id": 214308, "pid": 76337, "tid": -914061504, "ts": 1716454224744249, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224812946, "dur": 45, "args": { "External id": 214310, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214310, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214310, "pid": 5, "tid": 7, "ts": 1716454224812946, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744264, "dur": 5, "args": { "External id": 214310, "cbid": 211, "correlation": 214310 } }, { "ph": "s", "id": 214310, "pid": 76337, "tid": -914061504, "ts": 1716454224744264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224812992, "dur": 650, "args": { "External id": 214312, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214312, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214312, "pid": 5, "tid": 7, "ts": 1716454224812992, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744275, "dur": 6, "args": { "External id": 214312, "cbid": 211, "correlation": 214312 } }, { "ph": "s", "id": 214312, "pid": 76337, "tid": -914061504, "ts": 1716454224744275, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224813643, "dur": 22, "args": { "External id": 214314, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214314, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214314, "pid": 5, "tid": 7, "ts": 1716454224813643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744285, "dur": 5, "args": { "External id": 214314, "cbid": 211, "correlation": 214314 } }, { "ph": "s", "id": 214314, "pid": 76337, "tid": -914061504, "ts": 1716454224744285, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224813666, "dur": 33, "args": { "External id": 214320, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214320, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214320, "pid": 5, "tid": 7, "ts": 1716454224813666, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744314, "dur": 8, "args": { "External id": 214320, "cbid": 211, "correlation": 214320 } }, { "ph": "s", "id": 214320, "pid": 76337, "tid": -914061504, "ts": 1716454224744314, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224744372, "dur": 0, "args": { "External id": 214330, "cbid": 317, "correlation": 214330 } }, { "ph": "f", "id": 214330, "pid": 76337, "tid": -914061504, "ts": 1716454224744372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224744372, "dur": 0, "args": { "External id": 214331, "cbid": 203, "correlation": 214331 } }, { "ph": "f", "id": 214331, "pid": 76337, "tid": -914061504, "ts": 1716454224744372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224744373, "dur": 0, "args": { "External id": 214332, "cbid": 205, "correlation": 214332 } }, { "ph": "f", "id": 214332, "pid": 76337, "tid": -914061504, "ts": 1716454224744373, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224813700, "dur": 57, "args": { "External id": 214336, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214336, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 144, "warps per SM": 1152, "grid": [24, 60, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214336, "pid": 5, "tid": 7, "ts": 1716454224813700, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744385, "dur": 12, "args": { "External id": 214336, "cbid": 211, "correlation": 214336 } }, { "ph": "s", "id": 214336, "pid": 76337, "tid": -914061504, "ts": 1716454224744385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224813758, "dur": 272, "args": { "External id": 214338, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214338, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214338, "pid": 5, "tid": 7, "ts": 1716454224813758, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744403, "dur": 7, "args": { "External id": 214338, "cbid": 211, "correlation": 214338 } }, { "ph": "s", "id": 214338, "pid": 76337, "tid": -914061504, "ts": 1716454224744403, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224814031, "dur": 21, "args": { "External id": 214340, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214340, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214340, "pid": 5, "tid": 7, "ts": 1716454224814031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744414, "dur": 5, "args": { "External id": 214340, "cbid": 211, "correlation": 214340 } }, { "ph": "s", "id": 214340, "pid": 76337, "tid": -914061504, "ts": 1716454224744414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224814054, "dur": 32, "args": { "External id": 214346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214346, "pid": 5, "tid": 7, "ts": 1716454224814054, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744439, "dur": 9, "args": { "External id": 214346, "cbid": 211, "correlation": 214346 } }, { "ph": "s", "id": 214346, "pid": 76337, "tid": -914061504, "ts": 1716454224744439, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224814088, "dur": 27, "args": { "External id": 214354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214354, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214354, "pid": 5, "tid": 7, "ts": 1716454224814088, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744468, "dur": 8, "args": { "External id": 214354, "cbid": 211, "correlation": 214354 } }, { "ph": "s", "id": 214354, "pid": 76337, "tid": -914061504, "ts": 1716454224744468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224814116, "dur": 20, "args": { "External id": 214362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214362, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214362, "pid": 5, "tid": 7, "ts": 1716454224814116, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744497, "dur": 8, "args": { "External id": 214362, "cbid": 211, "correlation": 214362 } }, { "ph": "s", "id": 214362, "pid": 76337, "tid": -914061504, "ts": 1716454224744497, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224814137, "dur": 31, "args": { "External id": 214382, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214382, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 214382, "pid": 5, "tid": 7, "ts": 1716454224814137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744578, "dur": 12, "args": { "External id": 214382, "cbid": 211, "correlation": 214382 } }, { "ph": "s", "id": 214382, "pid": 76337, "tid": -914061504, "ts": 1716454224744578, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224814169, "dur": 4, "args": { "External id": 214394, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214394, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 214394, "pid": 5, "tid": 7, "ts": 1716454224814169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744601, "dur": 6, "args": { "External id": 214394, "cbid": 211, "correlation": 214394 } }, { "ph": "s", "id": 214394, "pid": 76337, "tid": -914061504, "ts": 1716454224744601, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224814175, "dur": 31, "args": { "External id": 214397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214397, "pid": 5, "tid": 7, "ts": 1716454224814175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744618, "dur": 6, "args": { "External id": 214397, "cbid": 211, "correlation": 214397 } }, { "ph": "s", "id": 214397, "pid": 76337, "tid": -914061504, "ts": 1716454224744618, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224744675, "dur": 0, "args": { "External id": 214408, "cbid": 317, "correlation": 214408 } }, { "ph": "f", "id": 214408, "pid": 76337, "tid": -914061504, "ts": 1716454224744675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224744675, "dur": 0, "args": { "External id": 214409, "cbid": 203, "correlation": 214409 } }, { "ph": "f", "id": 214409, "pid": 76337, "tid": -914061504, "ts": 1716454224744675, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224744676, "dur": 0, "args": { "External id": 214410, "cbid": 205, "correlation": 214410 } }, { "ph": "f", "id": 214410, "pid": 76337, "tid": -914061504, "ts": 1716454224744676, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224814207, "dur": 21, "args": { "External id": 214414, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214414, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214414, "pid": 5, "tid": 7, "ts": 1716454224814207, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744690, "dur": 13, "args": { "External id": 214414, "cbid": 211, "correlation": 214414 } }, { "ph": "s", "id": 214414, "pid": 76337, "tid": -914061504, "ts": 1716454224744690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224814229, "dur": 106, "args": { "External id": 214416, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214416, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214416, "pid": 5, "tid": 7, "ts": 1716454224814229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744709, "dur": 6, "args": { "External id": 214416, "cbid": 211, "correlation": 214416 } }, { "ph": "s", "id": 214416, "pid": 76337, "tid": -914061504, "ts": 1716454224744709, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224814337, "dur": 23, "args": { "External id": 214418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214418, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214418, "pid": 5, "tid": 7, "ts": 1716454224814337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744719, "dur": 5, "args": { "External id": 214418, "cbid": 211, "correlation": 214418 } }, { "ph": "s", "id": 214418, "pid": 76337, "tid": -914061504, "ts": 1716454224744719, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224814360, "dur": 32, "args": { "External id": 214424, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214424, "pid": 5, "tid": 7, "ts": 1716454224814360, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744747, "dur": 9, "args": { "External id": 214424, "cbid": 211, "correlation": 214424 } }, { "ph": "s", "id": 214424, "pid": 76337, "tid": -914061504, "ts": 1716454224744747, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224814394, "dur": 178, "args": { "External id": 214433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214433, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214433, "pid": 5, "tid": 7, "ts": 1716454224814394, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744828, "dur": 14, "args": { "External id": 214433, "cbid": 211, "correlation": 214433 } }, { "ph": "s", "id": 214433, "pid": 76337, "tid": -914061504, "ts": 1716454224744828, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224814574, "dur": 66, "args": { "External id": 214455, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214455, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214455, "pid": 5, "tid": 7, "ts": 1716454224814574, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744885, "dur": 10, "args": { "External id": 214455, "cbid": 211, "correlation": 214455 } }, { "ph": "s", "id": 214455, "pid": 76337, "tid": -914061504, "ts": 1716454224744885, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224744984, "dur": 2, "args": { "External id": 214466, "cbid": 251, "correlation": 214466 } }, { "ph": "f", "id": 214466, "pid": 76337, "tid": -914061504, "ts": 1716454224744984, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224814641, "dur": 155, "args": { "External id": 214467, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214467, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214467, "pid": 5, "tid": 7, "ts": 1716454224814641, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224744991, "dur": 15, "args": { "External id": 214467, "cbid": 211, "correlation": 214467 } }, { "ph": "s", "id": 214467, "pid": 76337, "tid": -914061504, "ts": 1716454224744991, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745064, "dur": 1, "args": { "External id": 214478, "cbid": 251, "correlation": 214478 } }, { "ph": "f", "id": 214478, "pid": 76337, "tid": -914061504, "ts": 1716454224745064, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224814797, "dur": 149, "args": { "External id": 214479, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214479, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214479, "pid": 5, "tid": 7, "ts": 1716454224814797, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745068, "dur": 11, "args": { "External id": 214479, "cbid": 211, "correlation": 214479 } }, { "ph": "s", "id": 214479, "pid": 76337, "tid": -914061504, "ts": 1716454224745068, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745132, "dur": 1, "args": { "External id": 214490, "cbid": 251, "correlation": 214490 } }, { "ph": "f", "id": 214490, "pid": 76337, "tid": -914061504, "ts": 1716454224745132, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224814948, "dur": 143, "args": { "External id": 214491, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214491, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214491, "pid": 5, "tid": 7, "ts": 1716454224814948, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745136, "dur": 11, "args": { "External id": 214491, "cbid": 211, "correlation": 214491 } }, { "ph": "s", "id": 214491, "pid": 76337, "tid": -914061504, "ts": 1716454224745136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224815092, "dur": 1974, "args": { "External id": 214512, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214512, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 214512, "pid": 5, "tid": 7, "ts": 1716454224815092, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745220, "dur": 13, "args": { "External id": 214512, "cbid": 211, "correlation": 214512 } }, { "ph": "s", "id": 214512, "pid": 76337, "tid": -914061504, "ts": 1716454224745220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745319, "dur": 1, "args": { "External id": 214530, "cbid": 251, "correlation": 214530 } }, { "ph": "f", "id": 214530, "pid": 76337, "tid": -914061504, "ts": 1716454224745319, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224817068, "dur": 148, "args": { "External id": 214532, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214532, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 214532, "pid": 5, "tid": 7, "ts": 1716454224817068, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745325, "dur": 13, "args": { "External id": 214532, "cbid": 211, "correlation": 214532 } }, { "ph": "s", "id": 214532, "pid": 76337, "tid": -914061504, "ts": 1716454224745325, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224817217, "dur": 35, "args": { "External id": 214540, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214540, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214540, "pid": 5, "tid": 7, "ts": 1716454224817217, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745395, "dur": 13, "args": { "External id": 214540, "cbid": 211, "correlation": 214540 } }, { "ph": "s", "id": 214540, "pid": 76337, "tid": -914061504, "ts": 1716454224745395, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224817254, "dur": 50, "args": { "External id": 214548, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214548, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214548, "pid": 5, "tid": 7, "ts": 1716454224817254, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745434, "dur": 8, "args": { "External id": 214548, "cbid": 211, "correlation": 214548 } }, { "ph": "s", "id": 214548, "pid": 76337, "tid": -914061504, "ts": 1716454224745434, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224817305, "dur": 30, "args": { "External id": 214559, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214559, "pid": 5, "tid": 7, "ts": 1716454224817305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745508, "dur": 13, "args": { "External id": 214559, "cbid": 211, "correlation": 214559 } }, { "ph": "s", "id": 214559, "pid": 76337, "tid": -914061504, "ts": 1716454224745508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224817337, "dur": 34, "args": { "External id": 214581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214581, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214581, "pid": 5, "tid": 7, "ts": 1716454224817337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745539, "dur": 7, "args": { "External id": 214581, "cbid": 211, "correlation": 214581 } }, { "ph": "s", "id": 214581, "pid": 76337, "tid": -914061504, "ts": 1716454224745539, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745623, "dur": 1, "args": { "External id": 214592, "cbid": 251, "correlation": 214592 } }, { "ph": "f", "id": 214592, "pid": 76337, "tid": -914061504, "ts": 1716454224745623, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224817372, "dur": 91, "args": { "External id": 214593, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214593, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214593, "pid": 5, "tid": 7, "ts": 1716454224817372, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745628, "dur": 13, "args": { "External id": 214593, "cbid": 211, "correlation": 214593 } }, { "ph": "s", "id": 214593, "pid": 76337, "tid": -914061504, "ts": 1716454224745628, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745698, "dur": 1, "args": { "External id": 214604, "cbid": 251, "correlation": 214604 } }, { "ph": "f", "id": 214604, "pid": 76337, "tid": -914061504, "ts": 1716454224745698, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745702, "dur": 0, "args": { "External id": 214605, "cbid": 251, "correlation": 214605 } }, { "ph": "f", "id": 214605, "pid": 76337, "tid": -914061504, "ts": 1716454224745702, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224817465, "dur": 11, "args": { "External id": 214606, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214606, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 214606, "pid": 5, "tid": 7, "ts": 1716454224817465, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745703, "dur": 11, "args": { "External id": 214606, "cbid": 211, "correlation": 214606 } }, { "ph": "s", "id": 214606, "pid": 76337, "tid": -914061504, "ts": 1716454224745703, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224817477, "dur": 5, "args": { "External id": 214608, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214608, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 214608, "pid": 5, "tid": 7, "ts": 1716454224817477, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745717, "dur": 6, "args": { "External id": 214608, "cbid": 211, "correlation": 214608 } }, { "ph": "s", "id": 214608, "pid": 76337, "tid": -914061504, "ts": 1716454224745717, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745774, "dur": 1, "args": { "External id": 214619, "cbid": 251, "correlation": 214619 } }, { "ph": "f", "id": 214619, "pid": 76337, "tid": -914061504, "ts": 1716454224745774, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745777, "dur": 0, "args": { "External id": 214620, "cbid": 251, "correlation": 214620 } }, { "ph": "f", "id": 214620, "pid": 76337, "tid": -914061504, "ts": 1716454224745777, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224817484, "dur": 7, "args": { "External id": 214621, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214621, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 214621, "pid": 5, "tid": 7, "ts": 1716454224817484, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745778, "dur": 12, "args": { "External id": 214621, "cbid": 211, "correlation": 214621 } }, { "ph": "s", "id": 214621, "pid": 76337, "tid": -914061504, "ts": 1716454224745778, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224817492, "dur": 4, "args": { "External id": 214623, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214623, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 214623, "pid": 5, "tid": 7, "ts": 1716454224817492, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745791, "dur": 6, "args": { "External id": 214623, "cbid": 211, "correlation": 214623 } }, { "ph": "s", "id": 214623, "pid": 76337, "tid": -914061504, "ts": 1716454224745791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224817497, "dur": 93, "args": { "External id": 214644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214644, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 214644, "pid": 5, "tid": 7, "ts": 1716454224817497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745865, "dur": 12, "args": { "External id": 214644, "cbid": 211, "correlation": 214644 } }, { "ph": "s", "id": 214644, "pid": 76337, "tid": -914061504, "ts": 1716454224745865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224745961, "dur": 1, "args": { "External id": 214662, "cbid": 251, "correlation": 214662 } }, { "ph": "f", "id": 214662, "pid": 76337, "tid": -914061504, "ts": 1716454224745961, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224817591, "dur": 98, "args": { "External id": 214664, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214664, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214664, "pid": 5, "tid": 7, "ts": 1716454224817591, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224745967, "dur": 21, "args": { "External id": 214664, "cbid": 211, "correlation": 214664 } }, { "ph": "s", "id": 214664, "pid": 76337, "tid": -914061504, "ts": 1716454224745967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224817691, "dur": 19, "args": { "External id": 214672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214672, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214672, "pid": 5, "tid": 7, "ts": 1716454224817691, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746044, "dur": 13, "args": { "External id": 214672, "cbid": 211, "correlation": 214672 } }, { "ph": "s", "id": 214672, "pid": 76337, "tid": -914061504, "ts": 1716454224746044, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224817711, "dur": 39, "args": { "External id": 214680, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214680, "pid": 5, "tid": 7, "ts": 1716454224817711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746086, "dur": 9, "args": { "External id": 214680, "cbid": 211, "correlation": 214680 } }, { "ph": "s", "id": 214680, "pid": 76337, "tid": -914061504, "ts": 1716454224746086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224817751, "dur": 35, "args": { "External id": 214702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214702, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214702, "pid": 5, "tid": 7, "ts": 1716454224817751, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746135, "dur": 10, "args": { "External id": 214702, "cbid": 211, "correlation": 214702 } }, { "ph": "s", "id": 214702, "pid": 76337, "tid": -914061504, "ts": 1716454224746135, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224746225, "dur": 1, "args": { "External id": 214718, "cbid": 251, "correlation": 214718 } }, { "ph": "f", "id": 214718, "pid": 76337, "tid": -914061504, "ts": 1716454224746225, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224746230, "dur": 0, "args": { "External id": 214720, "cbid": 251, "correlation": 214720 } }, { "ph": "f", "id": 214720, "pid": 76337, "tid": -914061504, "ts": 1716454224746230, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224817787, "dur": 545, "args": { "External id": 214721, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214721, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 214721, "pid": 5, "tid": 7, "ts": 1716454224817787, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746234, "dur": 13, "args": { "External id": 214721, "cbid": 211, "correlation": 214721 } }, { "ph": "s", "id": 214721, "pid": 76337, "tid": -914061504, "ts": 1716454224746234, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224818333, "dur": 127, "args": { "External id": 214729, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214729, "pid": 5, "tid": 7, "ts": 1716454224818333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746301, "dur": 12, "args": { "External id": 214729, "cbid": 211, "correlation": 214729 } }, { "ph": "s", "id": 214729, "pid": 76337, "tid": -914061504, "ts": 1716454224746301, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224818462, "dur": 130, "args": { "External id": 214737, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214737, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214737, "pid": 5, "tid": 7, "ts": 1716454224818462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746332, "dur": 8, "args": { "External id": 214737, "cbid": 211, "correlation": 214737 } }, { "ph": "s", "id": 214737, "pid": 76337, "tid": -914061504, "ts": 1716454224746332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224746408, "dur": 1, "args": { "External id": 214753, "cbid": 251, "correlation": 214753 } }, { "ph": "f", "id": 214753, "pid": 76337, "tid": -914061504, "ts": 1716454224746408, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224818593, "dur": 309, "args": { "External id": 214755, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214755, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214755, "pid": 5, "tid": 7, "ts": 1716454224818593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746414, "dur": 12, "args": { "External id": 214755, "cbid": 211, "correlation": 214755 } }, { "ph": "s", "id": 214755, "pid": 76337, "tid": -914061504, "ts": 1716454224746414, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224818904, "dur": 27, "args": { "External id": 214763, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214763, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214763, "pid": 5, "tid": 7, "ts": 1716454224818904, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746456, "dur": 10, "args": { "External id": 214763, "cbid": 211, "correlation": 214763 } }, { "ph": "s", "id": 214763, "pid": 76337, "tid": -914061504, "ts": 1716454224746456, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224818932, "dur": 82, "args": { "External id": 214774, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214774, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214774, "pid": 5, "tid": 7, "ts": 1716454224818932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746523, "dur": 12, "args": { "External id": 214774, "cbid": 211, "correlation": 214774 } }, { "ph": "s", "id": 214774, "pid": 76337, "tid": -914061504, "ts": 1716454224746523, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224746585, "dur": 0, "args": { "External id": 214786, "cbid": 317, "correlation": 214786 } }, { "ph": "f", "id": 214786, "pid": 76337, "tid": -914061504, "ts": 1716454224746585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224746585, "dur": 0, "args": { "External id": 214787, "cbid": 203, "correlation": 214787 } }, { "ph": "f", "id": 214787, "pid": 76337, "tid": -914061504, "ts": 1716454224746585, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224746586, "dur": 0, "args": { "External id": 214788, "cbid": 205, "correlation": 214788 } }, { "ph": "f", "id": 214788, "pid": 76337, "tid": -914061504, "ts": 1716454224746586, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224819015, "dur": 23, "args": { "External id": 214792, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214792, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214792, "pid": 5, "tid": 7, "ts": 1716454224819015, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746602, "dur": 12, "args": { "External id": 214792, "cbid": 211, "correlation": 214792 } }, { "ph": "s", "id": 214792, "pid": 76337, "tid": -914061504, "ts": 1716454224746602, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224819039, "dur": 122, "args": { "External id": 214794, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214794, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214794, "pid": 5, "tid": 7, "ts": 1716454224819039, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746621, "dur": 6, "args": { "External id": 214794, "cbid": 211, "correlation": 214794 } }, { "ph": "s", "id": 214794, "pid": 76337, "tid": -914061504, "ts": 1716454224746621, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224819162, "dur": 24, "args": { "External id": 214796, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214796, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214796, "pid": 5, "tid": 7, "ts": 1716454224819162, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746631, "dur": 5, "args": { "External id": 214796, "cbid": 211, "correlation": 214796 } }, { "ph": "s", "id": 214796, "pid": 76337, "tid": -914061504, "ts": 1716454224746631, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224819187, "dur": 32, "args": { "External id": 214802, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214802, "pid": 5, "tid": 7, "ts": 1716454224819187, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746659, "dur": 8, "args": { "External id": 214802, "cbid": 211, "correlation": 214802 } }, { "ph": "s", "id": 214802, "pid": 76337, "tid": -914061504, "ts": 1716454224746659, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224819221, "dur": 27, "args": { "External id": 214810, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214810, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214810, "pid": 5, "tid": 7, "ts": 1716454224819221, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746690, "dur": 9, "args": { "External id": 214810, "cbid": 211, "correlation": 214810 } }, { "ph": "s", "id": 214810, "pid": 76337, "tid": -914061504, "ts": 1716454224746690, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224819249, "dur": 55, "args": { "External id": 214819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214819, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214819, "pid": 5, "tid": 7, "ts": 1716454224819249, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746729, "dur": 10, "args": { "External id": 214819, "cbid": 211, "correlation": 214819 } }, { "ph": "s", "id": 214819, "pid": 76337, "tid": -914061504, "ts": 1716454224746729, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224819305, "dur": 52, "args": { "External id": 214839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214839, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 214839, "pid": 5, "tid": 7, "ts": 1716454224819305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746799, "dur": 12, "args": { "External id": 214839, "cbid": 211, "correlation": 214839 } }, { "ph": "s", "id": 214839, "pid": 76337, "tid": -914061504, "ts": 1716454224746799, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224819358, "dur": 4, "args": { "External id": 214851, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214851, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.5, "warps per SM": 4, "grid": [40, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 214851, "pid": 5, "tid": 7, "ts": 1716454224819358, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746820, "dur": 6, "args": { "External id": 214851, "cbid": 211, "correlation": 214851 } }, { "ph": "s", "id": 214851, "pid": 76337, "tid": -914061504, "ts": 1716454224746820, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224819364, "dur": 58, "args": { "External id": 214854, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214854, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214854, "pid": 5, "tid": 7, "ts": 1716454224819364, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746838, "dur": 6, "args": { "External id": 214854, "cbid": 211, "correlation": 214854 } }, { "ph": "s", "id": 214854, "pid": 76337, "tid": -914061504, "ts": 1716454224746838, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224819423, "dur": 37, "args": { "External id": 214863, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214863, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214863, "pid": 5, "tid": 7, "ts": 1716454224819423, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746878, "dur": 10, "args": { "External id": 214863, "cbid": 211, "correlation": 214863 } }, { "ph": "s", "id": 214863, "pid": 76337, "tid": -914061504, "ts": 1716454224746878, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224746931, "dur": 0, "args": { "External id": 214873, "cbid": 317, "correlation": 214873 } }, { "ph": "f", "id": 214873, "pid": 76337, "tid": -914061504, "ts": 1716454224746931, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224746932, "dur": 0, "args": { "External id": 214874, "cbid": 203, "correlation": 214874 } }, { "ph": "f", "id": 214874, "pid": 76337, "tid": -914061504, "ts": 1716454224746932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224746932, "dur": 0, "args": { "External id": 214875, "cbid": 205, "correlation": 214875 } }, { "ph": "f", "id": 214875, "pid": 76337, "tid": -914061504, "ts": 1716454224746932, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224819462, "dur": 39, "args": { "External id": 214879, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214879, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214879, "pid": 5, "tid": 7, "ts": 1716454224819462, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746947, "dur": 12, "args": { "External id": 214879, "cbid": 211, "correlation": 214879 } }, { "ph": "s", "id": 214879, "pid": 76337, "tid": -914061504, "ts": 1716454224746947, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224819502, "dur": 84, "args": { "External id": 214881, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214881, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 320, "warps per SM": 2560, "grid": [1, 40, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214881, "pid": 5, "tid": 7, "ts": 1716454224819502, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746962, "dur": 5, "args": { "External id": 214881, "cbid": 211, "correlation": 214881 } }, { "ph": "s", "id": 214881, "pid": 76337, "tid": -914061504, "ts": 1716454224746962, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224819587, "dur": 1288, "args": { "External id": 214883, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214883, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214883, "pid": 5, "tid": 7, "ts": 1716454224819587, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746973, "dur": 21, "args": { "External id": 214883, "cbid": 211, "correlation": 214883 } }, { "ph": "s", "id": 214883, "pid": 76337, "tid": -914061504, "ts": 1716454224746973, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224820876, "dur": 21, "args": { "External id": 214885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214885, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214885, "pid": 5, "tid": 7, "ts": 1716454224820876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224746997, "dur": 6, "args": { "External id": 214885, "cbid": 211, "correlation": 214885 } }, { "ph": "s", "id": 214885, "pid": 76337, "tid": -914061504, "ts": 1716454224746997, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224820899, "dur": 34, "args": { "External id": 214891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214891, "pid": 5, "tid": 7, "ts": 1716454224820899, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747027, "dur": 8, "args": { "External id": 214891, "cbid": 211, "correlation": 214891 } }, { "ph": "s", "id": 214891, "pid": 76337, "tid": -914061504, "ts": 1716454224747027, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224820934, "dur": 3, "args": { "External id": 214899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214899, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 214899, "pid": 5, "tid": 7, "ts": 1716454224820934, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747072, "dur": 9, "args": { "External id": 214899, "cbid": 211, "correlation": 214899 } }, { "ph": "s", "id": 214899, "pid": 76337, "tid": -914061504, "ts": 1716454224747072, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224747137, "dur": 1, "args": { "External id": 214915, "cbid": 251, "correlation": 214915 } }, { "ph": "f", "id": 214915, "pid": 76337, "tid": -914061504, "ts": 1716454224747137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224747141, "dur": 0, "args": { "External id": 214917, "cbid": 251, "correlation": 214917 } }, { "ph": "f", "id": 214917, "pid": 76337, "tid": -914061504, "ts": 1716454224747141, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224820939, "dur": 12, "args": { "External id": 214918, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214918, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 214918, "pid": 5, "tid": 7, "ts": 1716454224820939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747143, "dur": 12, "args": { "External id": 214918, "cbid": 211, "correlation": 214918 } }, { "ph": "s", "id": 214918, "pid": 76337, "tid": -914061504, "ts": 1716454224747143, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224820952, "dur": 5, "args": { "External id": 214920, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214920, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 214920, "pid": 5, "tid": 7, "ts": 1716454224820952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747157, "dur": 5, "args": { "External id": 214920, "cbid": 211, "correlation": 214920 } }, { "ph": "s", "id": 214920, "pid": 76337, "tid": -914061504, "ts": 1716454224747157, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224820959, "dur": 29, "args": { "External id": 214930, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214930, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214930, "pid": 5, "tid": 7, "ts": 1716454224820959, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747216, "dur": 12, "args": { "External id": 214930, "cbid": 211, "correlation": 214930 } }, { "ph": "s", "id": 214930, "pid": 76337, "tid": -914061504, "ts": 1716454224747216, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224820990, "dur": 32, "args": { "External id": 214950, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214950, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 214950, "pid": 5, "tid": 7, "ts": 1716454224820990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747282, "dur": 11, "args": { "External id": 214950, "cbid": 211, "correlation": 214950 } }, { "ph": "s", "id": 214950, "pid": 76337, "tid": -914061504, "ts": 1716454224747282, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224821023, "dur": 4, "args": { "External id": 214962, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214962, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 214962, "pid": 5, "tid": 7, "ts": 1716454224821023, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747302, "dur": 7, "args": { "External id": 214962, "cbid": 211, "correlation": 214962 } }, { "ph": "s", "id": 214962, "pid": 76337, "tid": -914061504, "ts": 1716454224747302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224821029, "dur": 30, "args": { "External id": 214965, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214965, "pid": 5, "tid": 7, "ts": 1716454224821029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747322, "dur": 8, "args": { "External id": 214965, "cbid": 211, "correlation": 214965 } }, { "ph": "s", "id": 214965, "pid": 76337, "tid": -914061504, "ts": 1716454224747322, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224821060, "dur": 20, "args": { "External id": 214974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214974, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214974, "pid": 5, "tid": 7, "ts": 1716454224821060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747363, "dur": 9, "args": { "External id": 214974, "cbid": 211, "correlation": 214974 } }, { "ph": "s", "id": 214974, "pid": 76337, "tid": -914061504, "ts": 1716454224747363, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224747425, "dur": 0, "args": { "External id": 214984, "cbid": 317, "correlation": 214984 } }, { "ph": "f", "id": 214984, "pid": 76337, "tid": -914061504, "ts": 1716454224747425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224747426, "dur": 0, "args": { "External id": 214985, "cbid": 203, "correlation": 214985 } }, { "ph": "f", "id": 214985, "pid": 76337, "tid": -914061504, "ts": 1716454224747426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224747426, "dur": 0, "args": { "External id": 214986, "cbid": 205, "correlation": 214986 } }, { "ph": "f", "id": 214986, "pid": 76337, "tid": -914061504, "ts": 1716454224747426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224821081, "dur": 23, "args": { "External id": 214990, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214990, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214990, "pid": 5, "tid": 7, "ts": 1716454224821081, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747440, "dur": 12, "args": { "External id": 214990, "cbid": 211, "correlation": 214990 } }, { "ph": "s", "id": 214990, "pid": 76337, "tid": -914061504, "ts": 1716454224747440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224821105, "dur": 45, "args": { "External id": 214992, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214992, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214992, "pid": 5, "tid": 7, "ts": 1716454224821105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747455, "dur": 5, "args": { "External id": 214992, "cbid": 211, "correlation": 214992 } }, { "ph": "s", "id": 214992, "pid": 76337, "tid": -914061504, "ts": 1716454224747455, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224821152, "dur": 650, "args": { "External id": 214994, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214994, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 214994, "pid": 5, "tid": 7, "ts": 1716454224821152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747466, "dur": 6, "args": { "External id": 214994, "cbid": 211, "correlation": 214994 } }, { "ph": "s", "id": 214994, "pid": 76337, "tid": -914061504, "ts": 1716454224747466, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224821803, "dur": 23, "args": { "External id": 214996, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 214996, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 214996, "pid": 5, "tid": 7, "ts": 1716454224821803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747476, "dur": 5, "args": { "External id": 214996, "cbid": 211, "correlation": 214996 } }, { "ph": "s", "id": 214996, "pid": 76337, "tid": -914061504, "ts": 1716454224747476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224821827, "dur": 34, "args": { "External id": 215002, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215002, "pid": 5, "tid": 7, "ts": 1716454224821827, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747503, "dur": 9, "args": { "External id": 215002, "cbid": 211, "correlation": 215002 } }, { "ph": "s", "id": 215002, "pid": 76337, "tid": -914061504, "ts": 1716454224747503, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224747564, "dur": 0, "args": { "External id": 215012, "cbid": 317, "correlation": 215012 } }, { "ph": "f", "id": 215012, "pid": 76337, "tid": -914061504, "ts": 1716454224747564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224747564, "dur": 0, "args": { "External id": 215013, "cbid": 203, "correlation": 215013 } }, { "ph": "f", "id": 215013, "pid": 76337, "tid": -914061504, "ts": 1716454224747564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224747565, "dur": 0, "args": { "External id": 215014, "cbid": 205, "correlation": 215014 } }, { "ph": "f", "id": 215014, "pid": 76337, "tid": -914061504, "ts": 1716454224747565, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224821862, "dur": 39, "args": { "External id": 215018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215018, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [24, 40, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215018, "pid": 5, "tid": 7, "ts": 1716454224821862, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747577, "dur": 11, "args": { "External id": 215018, "cbid": 211, "correlation": 215018 } }, { "ph": "s", "id": 215018, "pid": 76337, "tid": -914061504, "ts": 1716454224747577, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224821902, "dur": 192, "args": { "External id": 215020, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215020, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215020, "pid": 5, "tid": 7, "ts": 1716454224821902, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747594, "dur": 7, "args": { "External id": 215020, "cbid": 211, "correlation": 215020 } }, { "ph": "s", "id": 215020, "pid": 76337, "tid": -914061504, "ts": 1716454224747594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224822096, "dur": 22, "args": { "External id": 215022, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215022, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215022, "pid": 5, "tid": 7, "ts": 1716454224822096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747605, "dur": 5, "args": { "External id": 215022, "cbid": 211, "correlation": 215022 } }, { "ph": "s", "id": 215022, "pid": 76337, "tid": -914061504, "ts": 1716454224747605, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224822119, "dur": 32, "args": { "External id": 215028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215028, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215028, "pid": 5, "tid": 7, "ts": 1716454224822119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747630, "dur": 8, "args": { "External id": 215028, "cbid": 211, "correlation": 215028 } }, { "ph": "s", "id": 215028, "pid": 76337, "tid": -914061504, "ts": 1716454224747630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224822152, "dur": 27, "args": { "External id": 215036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215036, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215036, "pid": 5, "tid": 7, "ts": 1716454224822152, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747660, "dur": 7, "args": { "External id": 215036, "cbid": 211, "correlation": 215036 } }, { "ph": "s", "id": 215036, "pid": 76337, "tid": -914061504, "ts": 1716454224747660, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224822181, "dur": 20, "args": { "External id": 215044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215044, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215044, "pid": 5, "tid": 7, "ts": 1716454224822181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747688, "dur": 8, "args": { "External id": 215044, "cbid": 211, "correlation": 215044 } }, { "ph": "s", "id": 215044, "pid": 76337, "tid": -914061504, "ts": 1716454224747688, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224822202, "dur": 30, "args": { "External id": 215064, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215064, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 215064, "pid": 5, "tid": 7, "ts": 1716454224822202, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747770, "dur": 12, "args": { "External id": 215064, "cbid": 211, "correlation": 215064 } }, { "ph": "s", "id": 215064, "pid": 76337, "tid": -914061504, "ts": 1716454224747770, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224822233, "dur": 4, "args": { "External id": 215076, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215076, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 215076, "pid": 5, "tid": 7, "ts": 1716454224822233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747791, "dur": 6, "args": { "External id": 215076, "cbid": 211, "correlation": 215076 } }, { "ph": "s", "id": 215076, "pid": 76337, "tid": -914061504, "ts": 1716454224747791, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224822239, "dur": 30, "args": { "External id": 215079, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215079, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215079, "pid": 5, "tid": 7, "ts": 1716454224822239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747810, "dur": 6, "args": { "External id": 215079, "cbid": 211, "correlation": 215079 } }, { "ph": "s", "id": 215079, "pid": 76337, "tid": -914061504, "ts": 1716454224747810, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224747868, "dur": 0, "args": { "External id": 215090, "cbid": 317, "correlation": 215090 } }, { "ph": "f", "id": 215090, "pid": 76337, "tid": -914061504, "ts": 1716454224747868, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224747869, "dur": 0, "args": { "External id": 215091, "cbid": 203, "correlation": 215091 } }, { "ph": "f", "id": 215091, "pid": 76337, "tid": -914061504, "ts": 1716454224747869, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224747870, "dur": 0, "args": { "External id": 215092, "cbid": 205, "correlation": 215092 } }, { "ph": "f", "id": 215092, "pid": 76337, "tid": -914061504, "ts": 1716454224747870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224822271, "dur": 22, "args": { "External id": 215096, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215096, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215096, "pid": 5, "tid": 7, "ts": 1716454224822271, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747882, "dur": 12, "args": { "External id": 215096, "cbid": 211, "correlation": 215096 } }, { "ph": "s", "id": 215096, "pid": 76337, "tid": -914061504, "ts": 1716454224747882, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224822294, "dur": 106, "args": { "External id": 215098, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215098, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215098, "pid": 5, "tid": 7, "ts": 1716454224822294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747900, "dur": 7, "args": { "External id": 215098, "cbid": 211, "correlation": 215098 } }, { "ph": "s", "id": 215098, "pid": 76337, "tid": -914061504, "ts": 1716454224747900, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224822401, "dur": 22, "args": { "External id": 215100, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215100, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215100, "pid": 5, "tid": 7, "ts": 1716454224822401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747910, "dur": 5, "args": { "External id": 215100, "cbid": 211, "correlation": 215100 } }, { "ph": "s", "id": 215100, "pid": 76337, "tid": -914061504, "ts": 1716454224747910, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224822425, "dur": 31, "args": { "External id": 215106, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215106, "pid": 5, "tid": 7, "ts": 1716454224822425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224747937, "dur": 8, "args": { "External id": 215106, "cbid": 211, "correlation": 215106 } }, { "ph": "s", "id": 215106, "pid": 76337, "tid": -914061504, "ts": 1716454224747937, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224822457, "dur": 195, "args": { "External id": 215115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215115, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215115, "pid": 5, "tid": 7, "ts": 1716454224822457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748029, "dur": 14, "args": { "External id": 215115, "cbid": 211, "correlation": 215115 } }, { "ph": "s", "id": 215115, "pid": 76337, "tid": -914061504, "ts": 1716454224748029, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224822653, "dur": 65, "args": { "External id": 215137, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215137, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215137, "pid": 5, "tid": 7, "ts": 1716454224822653, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748085, "dur": 10, "args": { "External id": 215137, "cbid": 211, "correlation": 215137 } }, { "ph": "s", "id": 215137, "pid": 76337, "tid": -914061504, "ts": 1716454224748085, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748175, "dur": 1, "args": { "External id": 215148, "cbid": 251, "correlation": 215148 } }, { "ph": "f", "id": 215148, "pid": 76337, "tid": -914061504, "ts": 1716454224748175, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224822719, "dur": 152, "args": { "External id": 215149, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215149, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215149, "pid": 5, "tid": 7, "ts": 1716454224822719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748181, "dur": 13, "args": { "External id": 215149, "cbid": 211, "correlation": 215149 } }, { "ph": "s", "id": 215149, "pid": 76337, "tid": -914061504, "ts": 1716454224748181, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748250, "dur": 1, "args": { "External id": 215160, "cbid": 251, "correlation": 215160 } }, { "ph": "f", "id": 215160, "pid": 76337, "tid": -914061504, "ts": 1716454224748250, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224822872, "dur": 145, "args": { "External id": 215161, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215161, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215161, "pid": 5, "tid": 7, "ts": 1716454224822872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748254, "dur": 11, "args": { "External id": 215161, "cbid": 211, "correlation": 215161 } }, { "ph": "s", "id": 215161, "pid": 76337, "tid": -914061504, "ts": 1716454224748254, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748318, "dur": 1, "args": { "External id": 215172, "cbid": 251, "correlation": 215172 } }, { "ph": "f", "id": 215172, "pid": 76337, "tid": -914061504, "ts": 1716454224748318, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224823018, "dur": 147, "args": { "External id": 215173, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215173, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215173, "pid": 5, "tid": 7, "ts": 1716454224823018, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748321, "dur": 11, "args": { "External id": 215173, "cbid": 211, "correlation": 215173 } }, { "ph": "s", "id": 215173, "pid": 76337, "tid": -914061504, "ts": 1716454224748321, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224823167, "dur": 1965, "args": { "External id": 215194, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215194, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 215194, "pid": 5, "tid": 7, "ts": 1716454224823167, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748404, "dur": 13, "args": { "External id": 215194, "cbid": 211, "correlation": 215194 } }, { "ph": "s", "id": 215194, "pid": 76337, "tid": -914061504, "ts": 1716454224748404, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748499, "dur": 1, "args": { "External id": 215212, "cbid": 251, "correlation": 215212 } }, { "ph": "f", "id": 215212, "pid": 76337, "tid": -914061504, "ts": 1716454224748499, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224825133, "dur": 149, "args": { "External id": 215214, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215214, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 215214, "pid": 5, "tid": 7, "ts": 1716454224825133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748505, "dur": 13, "args": { "External id": 215214, "cbid": 211, "correlation": 215214 } }, { "ph": "s", "id": 215214, "pid": 76337, "tid": -914061504, "ts": 1716454224748505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224825283, "dur": 36, "args": { "External id": 215222, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215222, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215222, "pid": 5, "tid": 7, "ts": 1716454224825283, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748574, "dur": 12, "args": { "External id": 215222, "cbid": 211, "correlation": 215222 } }, { "ph": "s", "id": 215222, "pid": 76337, "tid": -914061504, "ts": 1716454224748574, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224825320, "dur": 50, "args": { "External id": 215230, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215230, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215230, "pid": 5, "tid": 7, "ts": 1716454224825320, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748613, "dur": 8, "args": { "External id": 215230, "cbid": 211, "correlation": 215230 } }, { "ph": "s", "id": 215230, "pid": 76337, "tid": -914061504, "ts": 1716454224748613, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224825371, "dur": 29, "args": { "External id": 215241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215241, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215241, "pid": 5, "tid": 7, "ts": 1716454224825371, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748684, "dur": 12, "args": { "External id": 215241, "cbid": 211, "correlation": 215241 } }, { "ph": "s", "id": 215241, "pid": 76337, "tid": -914061504, "ts": 1716454224748684, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224825402, "dur": 35, "args": { "External id": 215263, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215263, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215263, "pid": 5, "tid": 7, "ts": 1716454224825402, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748715, "dur": 7, "args": { "External id": 215263, "cbid": 211, "correlation": 215263 } }, { "ph": "s", "id": 215263, "pid": 76337, "tid": -914061504, "ts": 1716454224748715, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748799, "dur": 1, "args": { "External id": 215274, "cbid": 251, "correlation": 215274 } }, { "ph": "f", "id": 215274, "pid": 76337, "tid": -914061504, "ts": 1716454224748799, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224825438, "dur": 76, "args": { "External id": 215275, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215275, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215275, "pid": 5, "tid": 7, "ts": 1716454224825438, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748804, "dur": 14, "args": { "External id": 215275, "cbid": 211, "correlation": 215275 } }, { "ph": "s", "id": 215275, "pid": 76337, "tid": -914061504, "ts": 1716454224748804, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748872, "dur": 1, "args": { "External id": 215286, "cbid": 251, "correlation": 215286 } }, { "ph": "f", "id": 215286, "pid": 76337, "tid": -914061504, "ts": 1716454224748872, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748876, "dur": 0, "args": { "External id": 215287, "cbid": 251, "correlation": 215287 } }, { "ph": "f", "id": 215287, "pid": 76337, "tid": -914061504, "ts": 1716454224748876, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224825515, "dur": 11, "args": { "External id": 215288, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215288, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 215288, "pid": 5, "tid": 7, "ts": 1716454224825515, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748877, "dur": 12, "args": { "External id": 215288, "cbid": 211, "correlation": 215288 } }, { "ph": "s", "id": 215288, "pid": 76337, "tid": -914061504, "ts": 1716454224748877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224825528, "dur": 5, "args": { "External id": 215290, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215290, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 215290, "pid": 5, "tid": 7, "ts": 1716454224825528, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748891, "dur": 6, "args": { "External id": 215290, "cbid": 211, "correlation": 215290 } }, { "ph": "s", "id": 215290, "pid": 76337, "tid": -914061504, "ts": 1716454224748891, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748949, "dur": 1, "args": { "External id": 215301, "cbid": 251, "correlation": 215301 } }, { "ph": "f", "id": 215301, "pid": 76337, "tid": -914061504, "ts": 1716454224748949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224748952, "dur": 0, "args": { "External id": 215302, "cbid": 251, "correlation": 215302 } }, { "ph": "f", "id": 215302, "pid": 76337, "tid": -914061504, "ts": 1716454224748952, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224825535, "dur": 7, "args": { "External id": 215303, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215303, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 215303, "pid": 5, "tid": 7, "ts": 1716454224825535, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748953, "dur": 12, "args": { "External id": 215303, "cbid": 211, "correlation": 215303 } }, { "ph": "s", "id": 215303, "pid": 76337, "tid": -914061504, "ts": 1716454224748953, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224825544, "dur": 4, "args": { "External id": 215305, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215305, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 215305, "pid": 5, "tid": 7, "ts": 1716454224825544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224748967, "dur": 13, "args": { "External id": 215305, "cbid": 211, "correlation": 215305 } }, { "ph": "s", "id": 215305, "pid": 76337, "tid": -914061504, "ts": 1716454224748967, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224825548, "dur": 93, "args": { "External id": 215326, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215326, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 215326, "pid": 5, "tid": 7, "ts": 1716454224825548, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749049, "dur": 13, "args": { "External id": 215326, "cbid": 211, "correlation": 215326 } }, { "ph": "s", "id": 215326, "pid": 76337, "tid": -914061504, "ts": 1716454224749049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224749146, "dur": 1, "args": { "External id": 215344, "cbid": 251, "correlation": 215344 } }, { "ph": "f", "id": 215344, "pid": 76337, "tid": -914061504, "ts": 1716454224749146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224825643, "dur": 101, "args": { "External id": 215346, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215346, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215346, "pid": 5, "tid": 7, "ts": 1716454224825643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749152, "dur": 13, "args": { "External id": 215346, "cbid": 211, "correlation": 215346 } }, { "ph": "s", "id": 215346, "pid": 76337, "tid": -914061504, "ts": 1716454224749152, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224825745, "dur": 19, "args": { "External id": 215354, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215354, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215354, "pid": 5, "tid": 7, "ts": 1716454224825745, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749220, "dur": 12, "args": { "External id": 215354, "cbid": 211, "correlation": 215354 } }, { "ph": "s", "id": 215354, "pid": 76337, "tid": -914061504, "ts": 1716454224749220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224825765, "dur": 39, "args": { "External id": 215362, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215362, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215362, "pid": 5, "tid": 7, "ts": 1716454224825765, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749262, "dur": 10, "args": { "External id": 215362, "cbid": 211, "correlation": 215362 } }, { "ph": "s", "id": 215362, "pid": 76337, "tid": -914061504, "ts": 1716454224749262, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224825805, "dur": 35, "args": { "External id": 215384, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215384, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215384, "pid": 5, "tid": 7, "ts": 1716454224825805, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749313, "dur": 10, "args": { "External id": 215384, "cbid": 211, "correlation": 215384 } }, { "ph": "s", "id": 215384, "pid": 76337, "tid": -914061504, "ts": 1716454224749313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224749401, "dur": 1, "args": { "External id": 215400, "cbid": 251, "correlation": 215400 } }, { "ph": "f", "id": 215400, "pid": 76337, "tid": -914061504, "ts": 1716454224749401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224749406, "dur": 0, "args": { "External id": 215402, "cbid": 251, "correlation": 215402 } }, { "ph": "f", "id": 215402, "pid": 76337, "tid": -914061504, "ts": 1716454224749406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224825842, "dur": 541, "args": { "External id": 215403, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215403, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 215403, "pid": 5, "tid": 7, "ts": 1716454224825842, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749409, "dur": 13, "args": { "External id": 215403, "cbid": 211, "correlation": 215403 } }, { "ph": "s", "id": 215403, "pid": 76337, "tid": -914061504, "ts": 1716454224749409, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224826384, "dur": 125, "args": { "External id": 215411, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215411, "pid": 5, "tid": 7, "ts": 1716454224826384, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749476, "dur": 12, "args": { "External id": 215411, "cbid": 211, "correlation": 215411 } }, { "ph": "s", "id": 215411, "pid": 76337, "tid": -914061504, "ts": 1716454224749476, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224826511, "dur": 128, "args": { "External id": 215419, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215419, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215419, "pid": 5, "tid": 7, "ts": 1716454224826511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749505, "dur": 8, "args": { "External id": 215419, "cbid": 211, "correlation": 215419 } }, { "ph": "s", "id": 215419, "pid": 76337, "tid": -914061504, "ts": 1716454224749505, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224749582, "dur": 1, "args": { "External id": 215435, "cbid": 251, "correlation": 215435 } }, { "ph": "f", "id": 215435, "pid": 76337, "tid": -914061504, "ts": 1716454224749582, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224826640, "dur": 312, "args": { "External id": 215437, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215437, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215437, "pid": 5, "tid": 7, "ts": 1716454224826640, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749587, "dur": 12, "args": { "External id": 215437, "cbid": 211, "correlation": 215437 } }, { "ph": "s", "id": 215437, "pid": 76337, "tid": -914061504, "ts": 1716454224749587, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224826954, "dur": 27, "args": { "External id": 215445, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215445, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215445, "pid": 5, "tid": 7, "ts": 1716454224826954, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749630, "dur": 10, "args": { "External id": 215445, "cbid": 211, "correlation": 215445 } }, { "ph": "s", "id": 215445, "pid": 76337, "tid": -914061504, "ts": 1716454224749630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224826981, "dur": 82, "args": { "External id": 215456, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215456, "pid": 5, "tid": 7, "ts": 1716454224826981, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749696, "dur": 12, "args": { "External id": 215456, "cbid": 211, "correlation": 215456 } }, { "ph": "s", "id": 215456, "pid": 76337, "tid": -914061504, "ts": 1716454224749696, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224749760, "dur": 0, "args": { "External id": 215468, "cbid": 317, "correlation": 215468 } }, { "ph": "f", "id": 215468, "pid": 76337, "tid": -914061504, "ts": 1716454224749760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224749761, "dur": 0, "args": { "External id": 215469, "cbid": 203, "correlation": 215469 } }, { "ph": "f", "id": 215469, "pid": 76337, "tid": -914061504, "ts": 1716454224749761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224749761, "dur": 0, "args": { "External id": 215470, "cbid": 205, "correlation": 215470 } }, { "ph": "f", "id": 215470, "pid": 76337, "tid": -914061504, "ts": 1716454224749761, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224827065, "dur": 24, "args": { "External id": 215474, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215474, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215474, "pid": 5, "tid": 7, "ts": 1716454224827065, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749777, "dur": 12, "args": { "External id": 215474, "cbid": 211, "correlation": 215474 } }, { "ph": "s", "id": 215474, "pid": 76337, "tid": -914061504, "ts": 1716454224749777, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224827091, "dur": 120, "args": { "External id": 215476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215476, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215476, "pid": 5, "tid": 7, "ts": 1716454224827091, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749796, "dur": 7, "args": { "External id": 215476, "cbid": 211, "correlation": 215476 } }, { "ph": "s", "id": 215476, "pid": 76337, "tid": -914061504, "ts": 1716454224749796, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224827212, "dur": 23, "args": { "External id": 215478, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215478, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215478, "pid": 5, "tid": 7, "ts": 1716454224827212, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749806, "dur": 5, "args": { "External id": 215478, "cbid": 211, "correlation": 215478 } }, { "ph": "s", "id": 215478, "pid": 76337, "tid": -914061504, "ts": 1716454224749806, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224827237, "dur": 33, "args": { "External id": 215484, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215484, "pid": 5, "tid": 7, "ts": 1716454224827237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749834, "dur": 8, "args": { "External id": 215484, "cbid": 211, "correlation": 215484 } }, { "ph": "s", "id": 215484, "pid": 76337, "tid": -914061504, "ts": 1716454224749834, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224827272, "dur": 26, "args": { "External id": 215492, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215492, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215492, "pid": 5, "tid": 7, "ts": 1716454224827272, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749865, "dur": 8, "args": { "External id": 215492, "cbid": 211, "correlation": 215492 } }, { "ph": "s", "id": 215492, "pid": 76337, "tid": -914061504, "ts": 1716454224749865, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224827299, "dur": 50, "args": { "External id": 215501, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215501, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215501, "pid": 5, "tid": 7, "ts": 1716454224827299, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749904, "dur": 10, "args": { "External id": 215501, "cbid": 211, "correlation": 215501 } }, { "ph": "s", "id": 215501, "pid": 76337, "tid": -914061504, "ts": 1716454224749904, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224827350, "dur": 43, "args": { "External id": 215521, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215521, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 215521, "pid": 5, "tid": 7, "ts": 1716454224827350, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224749984, "dur": 12, "args": { "External id": 215521, "cbid": 211, "correlation": 215521 } }, { "ph": "s", "id": 215521, "pid": 76337, "tid": -914061504, "ts": 1716454224749984, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224827395, "dur": 5, "args": { "External id": 215533, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215533, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 215533, "pid": 5, "tid": 7, "ts": 1716454224827395, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750006, "dur": 6, "args": { "External id": 215533, "cbid": 211, "correlation": 215533 } }, { "ph": "s", "id": 215533, "pid": 76337, "tid": -914061504, "ts": 1716454224750006, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224827401, "dur": 44, "args": { "External id": 215536, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215536, "pid": 5, "tid": 7, "ts": 1716454224827401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750026, "dur": 7, "args": { "External id": 215536, "cbid": 211, "correlation": 215536 } }, { "ph": "s", "id": 215536, "pid": 76337, "tid": -914061504, "ts": 1716454224750026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224827447, "dur": 29, "args": { "External id": 215545, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215545, "registers per thread": 24, "shared memory": 0, "blocks per SM": 144, "warps per SM": 576, "grid": [11520, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215545, "pid": 5, "tid": 7, "ts": 1716454224827447, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750066, "dur": 11, "args": { "External id": 215545, "cbid": 211, "correlation": 215545 } }, { "ph": "s", "id": 215545, "pid": 76337, "tid": -914061504, "ts": 1716454224750066, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224750118, "dur": 0, "args": { "External id": 215555, "cbid": 317, "correlation": 215555 } }, { "ph": "f", "id": 215555, "pid": 76337, "tid": -914061504, "ts": 1716454224750118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224750119, "dur": 0, "args": { "External id": 215556, "cbid": 203, "correlation": 215556 } }, { "ph": "f", "id": 215556, "pid": 76337, "tid": -914061504, "ts": 1716454224750119, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224750120, "dur": 0, "args": { "External id": 215557, "cbid": 205, "correlation": 215557 } }, { "ph": "f", "id": 215557, "pid": 76337, "tid": -914061504, "ts": 1716454224750120, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224827478, "dur": 31, "args": { "External id": 215561, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215561, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215561, "pid": 5, "tid": 7, "ts": 1716454224827478, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750136, "dur": 11, "args": { "External id": 215561, "cbid": 211, "correlation": 215561 } }, { "ph": "s", "id": 215561, "pid": 76337, "tid": -914061504, "ts": 1716454224750136, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224827510, "dur": 64, "args": { "External id": 215563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215563, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 240, "warps per SM": 1920, "grid": [1, 30, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215563, "pid": 5, "tid": 7, "ts": 1716454224827510, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750150, "dur": 5, "args": { "External id": 215563, "cbid": 211, "correlation": 215563 } }, { "ph": "s", "id": 215563, "pid": 76337, "tid": -914061504, "ts": 1716454224750150, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224827575, "dur": 968, "args": { "External id": 215565, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215565, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215565, "pid": 5, "tid": 7, "ts": 1716454224827575, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750160, "dur": 6, "args": { "External id": 215565, "cbid": 211, "correlation": 215565 } }, { "ph": "s", "id": 215565, "pid": 76337, "tid": -914061504, "ts": 1716454224750160, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224828545, "dur": 21, "args": { "External id": 215567, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215567, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215567, "pid": 5, "tid": 7, "ts": 1716454224828545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750170, "dur": 6, "args": { "External id": 215567, "cbid": 211, "correlation": 215567 } }, { "ph": "s", "id": 215567, "pid": 76337, "tid": -914061504, "ts": 1716454224750170, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224828568, "dur": 33, "args": { "External id": 215573, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215573, "pid": 5, "tid": 7, "ts": 1716454224828568, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750198, "dur": 8, "args": { "External id": 215573, "cbid": 211, "correlation": 215573 } }, { "ph": "s", "id": 215573, "pid": 76337, "tid": -914061504, "ts": 1716454224750198, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224828602, "dur": 3, "args": { "External id": 215581, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215581, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 215581, "pid": 5, "tid": 7, "ts": 1716454224828602, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750242, "dur": 9, "args": { "External id": 215581, "cbid": 211, "correlation": 215581 } }, { "ph": "s", "id": 215581, "pid": 76337, "tid": -914061504, "ts": 1716454224750242, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224750306, "dur": 1, "args": { "External id": 215597, "cbid": 251, "correlation": 215597 } }, { "ph": "f", "id": 215597, "pid": 76337, "tid": -914061504, "ts": 1716454224750306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224750311, "dur": 0, "args": { "External id": 215599, "cbid": 251, "correlation": 215599 } }, { "ph": "f", "id": 215599, "pid": 76337, "tid": -914061504, "ts": 1716454224750311, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224828607, "dur": 12, "args": { "External id": 215600, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215600, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 215600, "pid": 5, "tid": 7, "ts": 1716454224828607, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750313, "dur": 11, "args": { "External id": 215600, "cbid": 211, "correlation": 215600 } }, { "ph": "s", "id": 215600, "pid": 76337, "tid": -914061504, "ts": 1716454224750313, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224828620, "dur": 5, "args": { "External id": 215602, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215602, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 215602, "pid": 5, "tid": 7, "ts": 1716454224828620, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750328, "dur": 5, "args": { "External id": 215602, "cbid": 211, "correlation": 215602 } }, { "ph": "s", "id": 215602, "pid": 76337, "tid": -914061504, "ts": 1716454224750328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224828626, "dur": 29, "args": { "External id": 215612, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215612, "pid": 5, "tid": 7, "ts": 1716454224828626, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750385, "dur": 12, "args": { "External id": 215612, "cbid": 211, "correlation": 215612 } }, { "ph": "s", "id": 215612, "pid": 76337, "tid": -914061504, "ts": 1716454224750385, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224828657, "dur": 31, "args": { "External id": 215632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215632, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 215632, "pid": 5, "tid": 7, "ts": 1716454224828657, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750451, "dur": 11, "args": { "External id": 215632, "cbid": 211, "correlation": 215632 } }, { "ph": "s", "id": 215632, "pid": 76337, "tid": -914061504, "ts": 1716454224750451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224828689, "dur": 4, "args": { "External id": 215644, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215644, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 215644, "pid": 5, "tid": 7, "ts": 1716454224828689, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750471, "dur": 7, "args": { "External id": 215644, "cbid": 211, "correlation": 215644 } }, { "ph": "s", "id": 215644, "pid": 76337, "tid": -914061504, "ts": 1716454224750471, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224828694, "dur": 29, "args": { "External id": 215647, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215647, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215647, "pid": 5, "tid": 7, "ts": 1716454224828694, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750491, "dur": 7, "args": { "External id": 215647, "cbid": 211, "correlation": 215647 } }, { "ph": "s", "id": 215647, "pid": 76337, "tid": -914061504, "ts": 1716454224750491, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224828725, "dur": 20, "args": { "External id": 215656, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215656, "registers per thread": 24, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215656, "pid": 5, "tid": 7, "ts": 1716454224828725, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750532, "dur": 9, "args": { "External id": 215656, "cbid": 211, "correlation": 215656 } }, { "ph": "s", "id": 215656, "pid": 76337, "tid": -914061504, "ts": 1716454224750532, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224750593, "dur": 0, "args": { "External id": 215666, "cbid": 317, "correlation": 215666 } }, { "ph": "f", "id": 215666, "pid": 76337, "tid": -914061504, "ts": 1716454224750593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224750593, "dur": 0, "args": { "External id": 215667, "cbid": 203, "correlation": 215667 } }, { "ph": "f", "id": 215667, "pid": 76337, "tid": -914061504, "ts": 1716454224750593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224750594, "dur": 0, "args": { "External id": 215668, "cbid": 205, "correlation": 215668 } }, { "ph": "f", "id": 215668, "pid": 76337, "tid": -914061504, "ts": 1716454224750594, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224828746, "dur": 23, "args": { "External id": 215672, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215672, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215672, "pid": 5, "tid": 7, "ts": 1716454224828746, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750608, "dur": 12, "args": { "External id": 215672, "cbid": 211, "correlation": 215672 } }, { "ph": "s", "id": 215672, "pid": 76337, "tid": -914061504, "ts": 1716454224750608, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224828770, "dur": 44, "args": { "External id": 215674, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215674, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215674, "pid": 5, "tid": 7, "ts": 1716454224828770, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750622, "dur": 6, "args": { "External id": 215674, "cbid": 211, "correlation": 215674 } }, { "ph": "s", "id": 215674, "pid": 76337, "tid": -914061504, "ts": 1716454224750622, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224828816, "dur": 648, "args": { "External id": 215676, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215676, "registers per thread": 254, "shared memory": 17408, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215676, "pid": 5, "tid": 7, "ts": 1716454224828816, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750634, "dur": 6, "args": { "External id": 215676, "cbid": 211, "correlation": 215676 } }, { "ph": "s", "id": 215676, "pid": 76337, "tid": -914061504, "ts": 1716454224750634, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224829466, "dur": 22, "args": { "External id": 215678, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215678, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215678, "pid": 5, "tid": 7, "ts": 1716454224829466, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750643, "dur": 5, "args": { "External id": 215678, "cbid": 211, "correlation": 215678 } }, { "ph": "s", "id": 215678, "pid": 76337, "tid": -914061504, "ts": 1716454224750643, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224829489, "dur": 33, "args": { "External id": 215684, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215684, "pid": 5, "tid": 7, "ts": 1716454224829489, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750671, "dur": 9, "args": { "External id": 215684, "cbid": 211, "correlation": 215684 } }, { "ph": "s", "id": 215684, "pid": 76337, "tid": -914061504, "ts": 1716454224750671, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224750729, "dur": 0, "args": { "External id": 215694, "cbid": 317, "correlation": 215694 } }, { "ph": "f", "id": 215694, "pid": 76337, "tid": -914061504, "ts": 1716454224750729, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224750730, "dur": 0, "args": { "External id": 215695, "cbid": 203, "correlation": 215695 } }, { "ph": "f", "id": 215695, "pid": 76337, "tid": -914061504, "ts": 1716454224750730, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224750731, "dur": 0, "args": { "External id": 215696, "cbid": 205, "correlation": 215696 } }, { "ph": "f", "id": 215696, "pid": 76337, "tid": -914061504, "ts": 1716454224750731, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224829523, "dur": 30, "args": { "External id": 215700, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215700, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 72, "warps per SM": 576, "grid": [24, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215700, "pid": 5, "tid": 7, "ts": 1716454224829523, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750744, "dur": 11, "args": { "External id": 215700, "cbid": 211, "correlation": 215700 } }, { "ph": "s", "id": 215700, "pid": 76337, "tid": -914061504, "ts": 1716454224750744, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224829555, "dur": 153, "args": { "External id": 215702, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215702, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215702, "pid": 5, "tid": 7, "ts": 1716454224829555, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750762, "dur": 6, "args": { "External id": 215702, "cbid": 211, "correlation": 215702 } }, { "ph": "s", "id": 215702, "pid": 76337, "tid": -914061504, "ts": 1716454224750762, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224829710, "dur": 22, "args": { "External id": 215704, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215704, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215704, "pid": 5, "tid": 7, "ts": 1716454224829710, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750771, "dur": 5, "args": { "External id": 215704, "cbid": 211, "correlation": 215704 } }, { "ph": "s", "id": 215704, "pid": 76337, "tid": -914061504, "ts": 1716454224750771, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224829733, "dur": 33, "args": { "External id": 215710, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215710, "pid": 5, "tid": 7, "ts": 1716454224829733, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750798, "dur": 8, "args": { "External id": 215710, "cbid": 211, "correlation": 215710 } }, { "ph": "s", "id": 215710, "pid": 76337, "tid": -914061504, "ts": 1716454224750798, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224829767, "dur": 27, "args": { "External id": 215718, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215718, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215718, "pid": 5, "tid": 7, "ts": 1716454224829767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750826, "dur": 9, "args": { "External id": 215718, "cbid": 211, "correlation": 215718 } }, { "ph": "s", "id": 215718, "pid": 76337, "tid": -914061504, "ts": 1716454224750826, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224829796, "dur": 20, "args": { "External id": 215726, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215726, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215726, "pid": 5, "tid": 7, "ts": 1716454224829796, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750855, "dur": 8, "args": { "External id": 215726, "cbid": 211, "correlation": 215726 } }, { "ph": "s", "id": 215726, "pid": 76337, "tid": -914061504, "ts": 1716454224750855, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224829817, "dur": 30, "args": { "External id": 215746, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215746, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 215746, "pid": 5, "tid": 7, "ts": 1716454224829817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750935, "dur": 12, "args": { "External id": 215746, "cbid": 211, "correlation": 215746 } }, { "ph": "s", "id": 215746, "pid": 76337, "tid": -914061504, "ts": 1716454224750935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224829848, "dur": 4, "args": { "External id": 215758, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215758, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 215758, "pid": 5, "tid": 7, "ts": 1716454224829848, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750957, "dur": 6, "args": { "External id": 215758, "cbid": 211, "correlation": 215758 } }, { "ph": "s", "id": 215758, "pid": 76337, "tid": -914061504, "ts": 1716454224750957, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224829854, "dur": 31, "args": { "External id": 215761, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215761, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215761, "pid": 5, "tid": 7, "ts": 1716454224829854, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224750982, "dur": 9, "args": { "External id": 215761, "cbid": 211, "correlation": 215761 } }, { "ph": "s", "id": 215761, "pid": 76337, "tid": -914061504, "ts": 1716454224750982, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224751043, "dur": 0, "args": { "External id": 215772, "cbid": 317, "correlation": 215772 } }, { "ph": "f", "id": 215772, "pid": 76337, "tid": -914061504, "ts": 1716454224751043, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224751044, "dur": 0, "args": { "External id": 215773, "cbid": 203, "correlation": 215773 } }, { "ph": "f", "id": 215773, "pid": 76337, "tid": -914061504, "ts": 1716454224751044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224751044, "dur": 0, "args": { "External id": 215774, "cbid": 205, "correlation": 215774 } }, { "ph": "f", "id": 215774, "pid": 76337, "tid": -914061504, "ts": 1716454224751044, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224829886, "dur": 23, "args": { "External id": 215778, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215778, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215778, "pid": 5, "tid": 7, "ts": 1716454224829886, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751058, "dur": 12, "args": { "External id": 215778, "cbid": 211, "correlation": 215778 } }, { "ph": "s", "id": 215778, "pid": 76337, "tid": -914061504, "ts": 1716454224751058, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224829910, "dur": 105, "args": { "External id": 215780, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215780, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215780, "pid": 5, "tid": 7, "ts": 1716454224829910, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751075, "dur": 6, "args": { "External id": 215780, "cbid": 211, "correlation": 215780 } }, { "ph": "s", "id": 215780, "pid": 76337, "tid": -914061504, "ts": 1716454224751075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224830017, "dur": 22, "args": { "External id": 215782, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215782, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215782, "pid": 5, "tid": 7, "ts": 1716454224830017, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751086, "dur": 5, "args": { "External id": 215782, "cbid": 211, "correlation": 215782 } }, { "ph": "s", "id": 215782, "pid": 76337, "tid": -914061504, "ts": 1716454224751086, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224830041, "dur": 32, "args": { "External id": 215788, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215788, "pid": 5, "tid": 7, "ts": 1716454224830041, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751113, "dur": 8, "args": { "External id": 215788, "cbid": 211, "correlation": 215788 } }, { "ph": "s", "id": 215788, "pid": 76337, "tid": -914061504, "ts": 1716454224751113, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224830074, "dur": 163, "args": { "External id": 215797, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215797, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215797, "pid": 5, "tid": 7, "ts": 1716454224830074, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751196, "dur": 13, "args": { "External id": 215797, "cbid": 211, "correlation": 215797 } }, { "ph": "s", "id": 215797, "pid": 76337, "tid": -914061504, "ts": 1716454224751196, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224830239, "dur": 65, "args": { "External id": 215819, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215819, "registers per thread": 32, "shared memory": 24, "blocks per SM": 153.6, "warps per SM": 614.4, "grid": [12288, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215819, "pid": 5, "tid": 7, "ts": 1716454224830239, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751251, "dur": 10, "args": { "External id": 215819, "cbid": 211, "correlation": 215819 } }, { "ph": "s", "id": 215819, "pid": 76337, "tid": -914061504, "ts": 1716454224751251, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224751340, "dur": 1, "args": { "External id": 215830, "cbid": 251, "correlation": 215830 } }, { "ph": "f", "id": 215830, "pid": 76337, "tid": -914061504, "ts": 1716454224751340, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224830306, "dur": 150, "args": { "External id": 215831, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215831, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215831, "pid": 5, "tid": 7, "ts": 1716454224830306, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751346, "dur": 13, "args": { "External id": 215831, "cbid": 211, "correlation": 215831 } }, { "ph": "s", "id": 215831, "pid": 76337, "tid": -914061504, "ts": 1716454224751346, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224751415, "dur": 1, "args": { "External id": 215842, "cbid": 251, "correlation": 215842 } }, { "ph": "f", "id": 215842, "pid": 76337, "tid": -914061504, "ts": 1716454224751415, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224830457, "dur": 144, "args": { "External id": 215843, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215843, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215843, "pid": 5, "tid": 7, "ts": 1716454224830457, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751419, "dur": 12, "args": { "External id": 215843, "cbid": 211, "correlation": 215843 } }, { "ph": "s", "id": 215843, "pid": 76337, "tid": -914061504, "ts": 1716454224751419, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224751486, "dur": 1, "args": { "External id": 215854, "cbid": 251, "correlation": 215854 } }, { "ph": "f", "id": 215854, "pid": 76337, "tid": -914061504, "ts": 1716454224751486, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224830603, "dur": 144, "args": { "External id": 215855, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215855, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 6, "warps per SM": 24, "grid": [5, 96, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215855, "pid": 5, "tid": 7, "ts": 1716454224830603, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751490, "dur": 11, "args": { "External id": 215855, "cbid": 211, "correlation": 215855 } }, { "ph": "s", "id": 215855, "pid": 76337, "tid": -914061504, "ts": 1716454224751490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224830748, "dur": 1969, "args": { "External id": 215876, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215876, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 215876, "pid": 5, "tid": 7, "ts": 1716454224830748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751571, "dur": 13, "args": { "External id": 215876, "cbid": 211, "correlation": 215876 } }, { "ph": "s", "id": 215876, "pid": 76337, "tid": -914061504, "ts": 1716454224751571, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224751669, "dur": 1, "args": { "External id": 215894, "cbid": 251, "correlation": 215894 } }, { "ph": "f", "id": 215894, "pid": 76337, "tid": -914061504, "ts": 1716454224751669, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224832719, "dur": 150, "args": { "External id": 215896, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215896, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3, "warps per SM": 24, "grid": [5, 48, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 215896, "pid": 5, "tid": 7, "ts": 1716454224832719, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751675, "dur": 14, "args": { "External id": 215896, "cbid": 211, "correlation": 215896 } }, { "ph": "s", "id": 215896, "pid": 76337, "tid": -914061504, "ts": 1716454224751675, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224832870, "dur": 35, "args": { "External id": 215904, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215904, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215904, "pid": 5, "tid": 7, "ts": 1716454224832870, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751745, "dur": 12, "args": { "External id": 215904, "cbid": 211, "correlation": 215904 } }, { "ph": "s", "id": 215904, "pid": 76337, "tid": -914061504, "ts": 1716454224751745, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224832907, "dur": 50, "args": { "External id": 215912, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215912, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215912, "pid": 5, "tid": 7, "ts": 1716454224832907, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751785, "dur": 9, "args": { "External id": 215912, "cbid": 211, "correlation": 215912 } }, { "ph": "s", "id": 215912, "pid": 76337, "tid": -914061504, "ts": 1716454224751785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224832958, "dur": 30, "args": { "External id": 215923, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215923, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215923, "pid": 5, "tid": 7, "ts": 1716454224832958, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751856, "dur": 12, "args": { "External id": 215923, "cbid": 211, "correlation": 215923 } }, { "ph": "s", "id": 215923, "pid": 76337, "tid": -914061504, "ts": 1716454224751856, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224832990, "dur": 35, "args": { "External id": 215945, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215945, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 215945, "pid": 5, "tid": 7, "ts": 1716454224832990, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751887, "dur": 8, "args": { "External id": 215945, "cbid": 211, "correlation": 215945 } }, { "ph": "s", "id": 215945, "pid": 76337, "tid": -914061504, "ts": 1716454224751887, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224751973, "dur": 8, "args": { "External id": 215956, "cbid": 251, "correlation": 215956 } }, { "ph": "f", "id": 215956, "pid": 76337, "tid": -914061504, "ts": 1716454224751973, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224833026, "dur": 91, "args": { "External id": 215957, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215957, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 215957, "pid": 5, "tid": 7, "ts": 1716454224833026, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224751985, "dur": 14, "args": { "External id": 215957, "cbid": 211, "correlation": 215957 } }, { "ph": "s", "id": 215957, "pid": 76337, "tid": -914061504, "ts": 1716454224751985, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752056, "dur": 1, "args": { "External id": 215968, "cbid": 251, "correlation": 215968 } }, { "ph": "f", "id": 215968, "pid": 76337, "tid": -914061504, "ts": 1716454224752056, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752060, "dur": 0, "args": { "External id": 215969, "cbid": 251, "correlation": 215969 } }, { "ph": "f", "id": 215969, "pid": 76337, "tid": -914061504, "ts": 1716454224752060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224833118, "dur": 11, "args": { "External id": 215970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215970, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 215970, "pid": 5, "tid": 7, "ts": 1716454224833118, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752061, "dur": 12, "args": { "External id": 215970, "cbid": 211, "correlation": 215970 } }, { "ph": "s", "id": 215970, "pid": 76337, "tid": -914061504, "ts": 1716454224752061, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224833130, "dur": 5, "args": { "External id": 215972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215972, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 215972, "pid": 5, "tid": 7, "ts": 1716454224833130, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752075, "dur": 6, "args": { "External id": 215972, "cbid": 211, "correlation": 215972 } }, { "ph": "s", "id": 215972, "pid": 76337, "tid": -914061504, "ts": 1716454224752075, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752133, "dur": 1, "args": { "External id": 215983, "cbid": 251, "correlation": 215983 } }, { "ph": "f", "id": 215983, "pid": 76337, "tid": -914061504, "ts": 1716454224752133, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752136, "dur": 0, "args": { "External id": 215984, "cbid": 251, "correlation": 215984 } }, { "ph": "f", "id": 215984, "pid": 76337, "tid": -914061504, "ts": 1716454224752136, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224833137, "dur": 7, "args": { "External id": 215985, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215985, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 3, 4], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 215985, "pid": 5, "tid": 7, "ts": 1716454224833137, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752138, "dur": 12, "args": { "External id": 215985, "cbid": 211, "correlation": 215985 } }, { "ph": "s", "id": 215985, "pid": 76337, "tid": -914061504, "ts": 1716454224752138, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224833145, "dur": 4, "args": { "External id": 215987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 215987, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 4, "grid": [20, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 6 } }, { "ph": "f", "id": 215987, "pid": 5, "tid": 7, "ts": 1716454224833145, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752151, "dur": 5, "args": { "External id": 215987, "cbid": 211, "correlation": 215987 } }, { "ph": "s", "id": 215987, "pid": 76337, "tid": -914061504, "ts": 1716454224752151, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224833150, "dur": 94, "args": { "External id": 216008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216008, "registers per thread": 168, "shared memory": 24960, "blocks per SM": 19.2, "warps per SM": 76.8, "grid": [24, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216008, "pid": 5, "tid": 7, "ts": 1716454224833150, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752226, "dur": 13, "args": { "External id": 216008, "cbid": 211, "correlation": 216008 } }, { "ph": "s", "id": 216008, "pid": 76337, "tid": -914061504, "ts": 1716454224752226, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752322, "dur": 1, "args": { "External id": 216026, "cbid": 251, "correlation": 216026 } }, { "ph": "f", "id": 216026, "pid": 76337, "tid": -914061504, "ts": 1716454224752322, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224833245, "dur": 86, "args": { "External id": 216028, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216028, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216028, "pid": 5, "tid": 7, "ts": 1716454224833245, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752328, "dur": 14, "args": { "External id": 216028, "cbid": 211, "correlation": 216028 } }, { "ph": "s", "id": 216028, "pid": 76337, "tid": -914061504, "ts": 1716454224752328, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224833333, "dur": 19, "args": { "External id": 216036, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216036, "registers per thread": 17, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216036, "pid": 5, "tid": 7, "ts": 1716454224833333, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752399, "dur": 12, "args": { "External id": 216036, "cbid": 211, "correlation": 216036 } }, { "ph": "s", "id": 216036, "pid": 76337, "tid": -914061504, "ts": 1716454224752399, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224833353, "dur": 37, "args": { "External id": 216044, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216044, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216044, "pid": 5, "tid": 7, "ts": 1716454224833353, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752440, "dur": 10, "args": { "External id": 216044, "cbid": 211, "correlation": 216044 } }, { "ph": "s", "id": 216044, "pid": 76337, "tid": -914061504, "ts": 1716454224752440, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224833392, "dur": 35, "args": { "External id": 216066, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216066, "registers per thread": 32, "shared memory": 24, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [6144, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216066, "pid": 5, "tid": 7, "ts": 1716454224833392, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752490, "dur": 10, "args": { "External id": 216066, "cbid": 211, "correlation": 216066 } }, { "ph": "s", "id": 216066, "pid": 76337, "tid": -914061504, "ts": 1716454224752490, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752580, "dur": 1, "args": { "External id": 216082, "cbid": 251, "correlation": 216082 } }, { "ph": "f", "id": 216082, "pid": 76337, "tid": -914061504, "ts": 1716454224752580, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752584, "dur": 0, "args": { "External id": 216084, "cbid": 251, "correlation": 216084 } }, { "ph": "f", "id": 216084, "pid": 76337, "tid": -914061504, "ts": 1716454224752584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224833428, "dur": 543, "args": { "External id": 216085, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216085, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 12, "warps per SM": 96, "grid": [40, 24, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 216085, "pid": 5, "tid": 7, "ts": 1716454224833428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752588, "dur": 14, "args": { "External id": 216085, "cbid": 211, "correlation": 216085 } }, { "ph": "s", "id": 216085, "pid": 76337, "tid": -914061504, "ts": 1716454224752588, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224833972, "dur": 124, "args": { "External id": 216093, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216093, "pid": 5, "tid": 7, "ts": 1716454224833972, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752653, "dur": 12, "args": { "External id": 216093, "cbid": 211, "correlation": 216093 } }, { "ph": "s", "id": 216093, "pid": 76337, "tid": -914061504, "ts": 1716454224752653, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224834098, "dur": 130, "args": { "External id": 216101, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216101, "pid": 5, "tid": 7, "ts": 1716454224834098, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752683, "dur": 8, "args": { "External id": 216101, "cbid": 211, "correlation": 216101 } }, { "ph": "s", "id": 216101, "pid": 76337, "tid": -914061504, "ts": 1716454224752683, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224752760, "dur": 1, "args": { "External id": 216117, "cbid": 251, "correlation": 216117 } }, { "ph": "f", "id": 216117, "pid": 76337, "tid": -914061504, "ts": 1716454224752760, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224834229, "dur": 305, "args": { "External id": 216119, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216119, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 3, "warps per SM": 12, "grid": [5, 48, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216119, "pid": 5, "tid": 7, "ts": 1716454224834229, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752766, "dur": 12, "args": { "External id": 216119, "cbid": 211, "correlation": 216119 } }, { "ph": "s", "id": 216119, "pid": 76337, "tid": -914061504, "ts": 1716454224752766, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224834536, "dur": 27, "args": { "External id": 216127, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216127, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216127, "pid": 5, "tid": 7, "ts": 1716454224834536, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752808, "dur": 9, "args": { "External id": 216127, "cbid": 211, "correlation": 216127 } }, { "ph": "s", "id": 216127, "pid": 76337, "tid": -914061504, "ts": 1716454224752808, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224834564, "dur": 81, "args": { "External id": 216138, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216138, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216138, "pid": 5, "tid": 7, "ts": 1716454224834564, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752875, "dur": 12, "args": { "External id": 216138, "cbid": 211, "correlation": 216138 } }, { "ph": "s", "id": 216138, "pid": 76337, "tid": -914061504, "ts": 1716454224752875, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224752938, "dur": 0, "args": { "External id": 216150, "cbid": 317, "correlation": 216150 } }, { "ph": "f", "id": 216150, "pid": 76337, "tid": -914061504, "ts": 1716454224752938, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224752939, "dur": 0, "args": { "External id": 216151, "cbid": 203, "correlation": 216151 } }, { "ph": "f", "id": 216151, "pid": 76337, "tid": -914061504, "ts": 1716454224752939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224752939, "dur": 0, "args": { "External id": 216152, "cbid": 205, "correlation": 216152 } }, { "ph": "f", "id": 216152, "pid": 76337, "tid": -914061504, "ts": 1716454224752939, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224834647, "dur": 22, "args": { "External id": 216156, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216156, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216156, "pid": 5, "tid": 7, "ts": 1716454224834647, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752955, "dur": 12, "args": { "External id": 216156, "cbid": 211, "correlation": 216156 } }, { "ph": "s", "id": 216156, "pid": 76337, "tid": -914061504, "ts": 1716454224752955, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x256x64_stage1_warpsize1x4x2_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224834671, "dur": 121, "args": { "External id": 216158, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216158, "registers per thread": 255, "shared memory": 40960, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216158, "pid": 5, "tid": 7, "ts": 1716454224834671, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752981, "dur": 7, "args": { "External id": 216158, "cbid": 211, "correlation": 216158 } }, { "ph": "s", "id": 216158, "pid": 76337, "tid": -914061504, "ts": 1716454224752981, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224834793, "dur": 23, "args": { "External id": 216160, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216160, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 48, "warps per SM": 384, "grid": [24, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216160, "pid": 5, "tid": 7, "ts": 1716454224834793, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224752992, "dur": 6, "args": { "External id": 216160, "cbid": 211, "correlation": 216160 } }, { "ph": "s", "id": 216160, "pid": 76337, "tid": -914061504, "ts": 1716454224752992, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224834817, "dur": 33, "args": { "External id": 216166, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216166, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216166, "pid": 5, "tid": 7, "ts": 1716454224834817, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753022, "dur": 9, "args": { "External id": 216166, "cbid": 211, "correlation": 216166 } }, { "ph": "s", "id": 216166, "pid": 76337, "tid": -914061504, "ts": 1716454224753022, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224834852, "dur": 26, "args": { "External id": 216174, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216174, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96, "warps per SM": 384, "grid": [7680, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216174, "pid": 5, "tid": 7, "ts": 1716454224834852, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753055, "dur": 8, "args": { "External id": 216174, "cbid": 211, "correlation": 216174 } }, { "ph": "s", "id": 216174, "pid": 76337, "tid": -914061504, "ts": 1716454224753055, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::upsample_nearest2d_out_frame(c10::Half const*, c10::Half*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)", "pid": 5, "tid": 7, "ts": 1716454224834880, "dur": 102, "args": { "External id": 216185, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216185, "registers per thread": 24, "shared memory": 0, "blocks per SM": 128, "warps per SM": 2048, "grid": [2, 4, 1280], "block": [32, 16, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216185, "pid": 5, "tid": 7, "ts": 1716454224834880, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753117, "dur": 11, "args": { "External id": 216185, "cbid": 211, "correlation": 216185 } }, { "ph": "s", "id": 216185, "pid": 76337, "tid": -914061504, "ts": 1716454224753117, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224753171, "dur": 0, "args": { "External id": 216195, "cbid": 317, "correlation": 216195 } }, { "ph": "f", "id": 216195, "pid": 76337, "tid": -914061504, "ts": 1716454224753171, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224753172, "dur": 0, "args": { "External id": 216196, "cbid": 203, "correlation": 216196 } }, { "ph": "f", "id": 216196, "pid": 76337, "tid": -914061504, "ts": 1716454224753172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224753172, "dur": 0, "args": { "External id": 216197, "cbid": 205, "correlation": 216197 } }, { "ph": "f", "id": 216197, "pid": 76337, "tid": -914061504, "ts": 1716454224753172, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224834983, "dur": 75, "args": { "External id": 216201, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216201, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216201, "pid": 5, "tid": 7, "ts": 1716454224834983, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753186, "dur": 11, "args": { "External id": 216201, "cbid": 211, "correlation": 216201 } }, { "ph": "s", "id": 216201, "pid": 76337, "tid": -914061504, "ts": 1716454224753186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224835059, "dur": 44, "args": { "External id": 216203, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216203, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 160, "warps per SM": 1280, "grid": [1, 20, 640], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216203, "pid": 5, "tid": 7, "ts": 1716454224835059, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753201, "dur": 6, "args": { "External id": 216203, "cbid": 211, "correlation": 216203 } }, { "ph": "s", "id": 216203, "pid": 76337, "tid": -914061504, "ts": 1716454224753201, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224835105, "dur": 4, "args": { "External id": 216205, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216205, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216205, "pid": 5, "tid": 7, "ts": 1716454224835105, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753211, "dur": 6, "args": { "External id": 216205, "cbid": 211, "correlation": 216205 } }, { "ph": "s", "id": 216205, "pid": 76337, "tid": -914061504, "ts": 1716454224753211, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224753220, "dur": 0, "args": { "External id": 216206, "cbid": 51, "correlation": 216206 } }, { "ph": "s", "id": 216206, "pid": 76337, "tid": -914061504, "ts": 1716454224753220, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224835110, "dur": 2235, "args": { "External id": 216207, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216207, "registers per thread": 242, "shared memory": 41472, "blocks per SM": 12, "warps per SM": 48, "grid": [96, 10, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216207, "pid": 5, "tid": 7, "ts": 1716454224835110, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753221, "dur": 5, "args": { "External id": 216207, "cbid": 211, "correlation": 216207 } }, { "ph": "s", "id": 216207, "pid": 76337, "tid": -914061504, "ts": 1716454224753221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224837347, "dur": 114, "args": { "External id": 216212, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216212, "pid": 5, "tid": 7, "ts": 1716454224837347, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753248, "dur": 9, "args": { "External id": 216212, "cbid": 211, "correlation": 216212 } }, { "ph": "s", "id": 216212, "pid": 76337, "tid": -914061504, "ts": 1716454224753248, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224837463, "dur": 169, "args": { "External id": 216221, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216221, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216221, "pid": 5, "tid": 7, "ts": 1716454224837463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753339, "dur": 14, "args": { "External id": 216221, "cbid": 211, "correlation": 216221 } }, { "ph": "s", "id": 216221, "pid": 76337, "tid": -914061504, "ts": 1716454224753339, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224837633, "dur": 128, "args": { "External id": 216241, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216241, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 216241, "pid": 5, "tid": 7, "ts": 1716454224837633, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753411, "dur": 11, "args": { "External id": 216241, "cbid": 211, "correlation": 216241 } }, { "ph": "s", "id": 216241, "pid": 76337, "tid": -914061504, "ts": 1716454224753411, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224837762, "dur": 5, "args": { "External id": 216253, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216253, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.375, "warps per SM": 3, "grid": [30, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 216253, "pid": 5, "tid": 7, "ts": 1716454224837762, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753433, "dur": 6, "args": { "External id": 216253, "cbid": 211, "correlation": 216253 } }, { "ph": "s", "id": 216253, "pid": 76337, "tid": -914061504, "ts": 1716454224753433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224837768, "dur": 160, "args": { "External id": 216256, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216256, "registers per thread": 16, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216256, "pid": 5, "tid": 7, "ts": 1716454224837768, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753451, "dur": 8, "args": { "External id": 216256, "cbid": 211, "correlation": 216256 } }, { "ph": "s", "id": 216256, "pid": 76337, "tid": -914061504, "ts": 1716454224753451, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224837930, "dur": 102, "args": { "External id": 216265, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216265, "registers per thread": 24, "shared memory": 0, "blocks per SM": 576, "warps per SM": 2304, "grid": [46080, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216265, "pid": 5, "tid": 7, "ts": 1716454224837930, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753492, "dur": 10, "args": { "External id": 216265, "cbid": 211, "correlation": 216265 } }, { "ph": "s", "id": 216265, "pid": 76337, "tid": -914061504, "ts": 1716454224753492, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224753544, "dur": 0, "args": { "External id": 216275, "cbid": 317, "correlation": 216275 } }, { "ph": "f", "id": 216275, "pid": 76337, "tid": -914061504, "ts": 1716454224753544, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224753545, "dur": 0, "args": { "External id": 216276, "cbid": 203, "correlation": 216276 } }, { "ph": "f", "id": 216276, "pid": 76337, "tid": -914061504, "ts": 1716454224753545, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224753546, "dur": 0, "args": { "External id": 216277, "cbid": 205, "correlation": 216277 } }, { "ph": "f", "id": 216277, "pid": 76337, "tid": -914061504, "ts": 1716454224753546, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224838033, "dur": 111, "args": { "External id": 216281, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216281, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 288, "warps per SM": 2304, "grid": [96, 30, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216281, "pid": 5, "tid": 7, "ts": 1716454224838033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753562, "dur": 11, "args": { "External id": 216281, "cbid": 211, "correlation": 216281 } }, { "ph": "s", "id": 216281, "pid": 76337, "tid": -914061504, "ts": 1716454224753562, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224838146, "dur": 34, "args": { "External id": 216283, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216283, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 120, "warps per SM": 960, "grid": [1, 30, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216283, "pid": 5, "tid": 7, "ts": 1716454224838146, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753576, "dur": 5, "args": { "External id": 216283, "cbid": 211, "correlation": 216283 } }, { "ph": "s", "id": 216283, "pid": 76337, "tid": -914061504, "ts": 1716454224753576, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224838181, "dur": 3, "args": { "External id": 216285, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216285, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216285, "pid": 5, "tid": 7, "ts": 1716454224838181, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753586, "dur": 5, "args": { "External id": 216285, "cbid": 211, "correlation": 216285 } }, { "ph": "s", "id": 216285, "pid": 76337, "tid": -914061504, "ts": 1716454224753586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224753594, "dur": 0, "args": { "External id": 216286, "cbid": 51, "correlation": 216286 } }, { "ph": "s", "id": 216286, "pid": 76337, "tid": -914061504, "ts": 1716454224753594, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224838186, "dur": 2023, "args": { "External id": 216287, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216287, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216287, "pid": 5, "tid": 7, "ts": 1716454224838186, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753595, "dur": 6, "args": { "External id": 216287, "cbid": 211, "correlation": 216287 } }, { "ph": "s", "id": 216287, "pid": 76337, "tid": -914061504, "ts": 1716454224753595, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224840210, "dur": 59, "args": { "External id": 216292, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216292, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216292, "pid": 5, "tid": 7, "ts": 1716454224840210, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753625, "dur": 8, "args": { "External id": 216292, "cbid": 211, "correlation": 216292 } }, { "ph": "s", "id": 216292, "pid": 76337, "tid": -914061504, "ts": 1716454224753625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224840270, "dur": 3, "args": { "External id": 216300, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216300, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216300, "pid": 5, "tid": 7, "ts": 1716454224840270, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753668, "dur": 9, "args": { "External id": 216300, "cbid": 211, "correlation": 216300 } }, { "ph": "s", "id": 216300, "pid": 76337, "tid": -914061504, "ts": 1716454224753668, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224753732, "dur": 1, "args": { "External id": 216316, "cbid": 251, "correlation": 216316 } }, { "ph": "f", "id": 216316, "pid": 76337, "tid": -914061504, "ts": 1716454224753732, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224753737, "dur": 0, "args": { "External id": 216318, "cbid": 251, "correlation": 216318 } }, { "ph": "f", "id": 216318, "pid": 76337, "tid": -914061504, "ts": 1716454224753737, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224840275, "dur": 11, "args": { "External id": 216319, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216319, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 216319, "pid": 5, "tid": 7, "ts": 1716454224840275, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753739, "dur": 11, "args": { "External id": 216319, "cbid": 211, "correlation": 216319 } }, { "ph": "s", "id": 216319, "pid": 76337, "tid": -914061504, "ts": 1716454224753739, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224840287, "dur": 5, "args": { "External id": 216321, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216321, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 216321, "pid": 5, "tid": 7, "ts": 1716454224840287, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753752, "dur": 6, "args": { "External id": 216321, "cbid": 211, "correlation": 216321 } }, { "ph": "s", "id": 216321, "pid": 76337, "tid": -914061504, "ts": 1716454224753752, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224840294, "dur": 53, "args": { "External id": 216331, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216331, "pid": 5, "tid": 7, "ts": 1716454224840294, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753811, "dur": 12, "args": { "External id": 216331, "cbid": 211, "correlation": 216331 } }, { "ph": "s", "id": 216331, "pid": 76337, "tid": -914061504, "ts": 1716454224753811, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224840348, "dur": 51, "args": { "External id": 216351, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216351, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 216351, "pid": 5, "tid": 7, "ts": 1716454224840348, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753877, "dur": 11, "args": { "External id": 216351, "cbid": 211, "correlation": 216351 } }, { "ph": "s", "id": 216351, "pid": 76337, "tid": -914061504, "ts": 1716454224753877, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224840401, "dur": 4, "args": { "External id": 216363, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216363, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216363, "pid": 5, "tid": 7, "ts": 1716454224840401, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753898, "dur": 6, "args": { "External id": 216363, "cbid": 211, "correlation": 216363 } }, { "ph": "s", "id": 216363, "pid": 76337, "tid": -914061504, "ts": 1716454224753898, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224840406, "dur": 56, "args": { "External id": 216366, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216366, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216366, "pid": 5, "tid": 7, "ts": 1716454224840406, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753918, "dur": 6, "args": { "External id": 216366, "cbid": 211, "correlation": 216366 } }, { "ph": "s", "id": 216366, "pid": 76337, "tid": -914061504, "ts": 1716454224753918, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224840463, "dur": 36, "args": { "External id": 216375, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216375, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216375, "pid": 5, "tid": 7, "ts": 1716454224840463, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224753958, "dur": 10, "args": { "External id": 216375, "cbid": 211, "correlation": 216375 } }, { "ph": "s", "id": 216375, "pid": 76337, "tid": -914061504, "ts": 1716454224753958, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224754031, "dur": 0, "args": { "External id": 216385, "cbid": 317, "correlation": 216385 } }, { "ph": "f", "id": 216385, "pid": 76337, "tid": -914061504, "ts": 1716454224754031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224754032, "dur": 0, "args": { "External id": 216386, "cbid": 203, "correlation": 216386 } }, { "ph": "f", "id": 216386, "pid": 76337, "tid": -914061504, "ts": 1716454224754032, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224754033, "dur": 0, "args": { "External id": 216387, "cbid": 205, "correlation": 216387 } }, { "ph": "f", "id": 216387, "pid": 76337, "tid": -914061504, "ts": 1716454224754033, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224840501, "dur": 40, "args": { "External id": 216391, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216391, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216391, "pid": 5, "tid": 7, "ts": 1716454224840501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754049, "dur": 13, "args": { "External id": 216391, "cbid": 211, "correlation": 216391 } }, { "ph": "s", "id": 216391, "pid": 76337, "tid": -914061504, "ts": 1716454224754049, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224840542, "dur": 14, "args": { "External id": 216393, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216393, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216393, "pid": 5, "tid": 7, "ts": 1716454224840542, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754064, "dur": 5, "args": { "External id": 216393, "cbid": 211, "correlation": 216393 } }, { "ph": "s", "id": 216393, "pid": 76337, "tid": -914061504, "ts": 1716454224754064, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224840558, "dur": 3, "args": { "External id": 216395, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216395, "pid": 5, "tid": 7, "ts": 1716454224840558, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754073, "dur": 5, "args": { "External id": 216395, "cbid": 211, "correlation": 216395 } }, { "ph": "s", "id": 216395, "pid": 76337, "tid": -914061504, "ts": 1716454224754073, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224754082, "dur": 0, "args": { "External id": 216396, "cbid": 51, "correlation": 216396 } }, { "ph": "s", "id": 216396, "pid": 76337, "tid": -914061504, "ts": 1716454224754082, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224840562, "dur": 704, "args": { "External id": 216397, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216397, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216397, "pid": 5, "tid": 7, "ts": 1716454224840562, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754083, "dur": 5, "args": { "External id": 216397, "cbid": 211, "correlation": 216397 } }, { "ph": "s", "id": 216397, "pid": 76337, "tid": -914061504, "ts": 1716454224754083, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224841267, "dur": 60, "args": { "External id": 216402, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216402, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216402, "pid": 5, "tid": 7, "ts": 1716454224841267, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754111, "dur": 9, "args": { "External id": 216402, "cbid": 211, "correlation": 216402 } }, { "ph": "s", "id": 216402, "pid": 76337, "tid": -914061504, "ts": 1716454224754111, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224754169, "dur": 0, "args": { "External id": 216412, "cbid": 317, "correlation": 216412 } }, { "ph": "f", "id": 216412, "pid": 76337, "tid": -914061504, "ts": 1716454224754169, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224754170, "dur": 0, "args": { "External id": 216413, "cbid": 203, "correlation": 216413 } }, { "ph": "f", "id": 216413, "pid": 76337, "tid": -914061504, "ts": 1716454224754170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224754170, "dur": 0, "args": { "External id": 216414, "cbid": 205, "correlation": 216414 } }, { "ph": "f", "id": 216414, "pid": 76337, "tid": -914061504, "ts": 1716454224754170, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224841329, "dur": 3, "args": { "External id": 216418, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216418, "pid": 5, "tid": 7, "ts": 1716454224841329, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754186, "dur": 11, "args": { "External id": 216418, "cbid": 211, "correlation": 216418 } }, { "ph": "s", "id": 216418, "pid": 76337, "tid": -914061504, "ts": 1716454224754186, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224754202, "dur": 0, "args": { "External id": 216419, "cbid": 51, "correlation": 216419 } }, { "ph": "s", "id": 216419, "pid": 76337, "tid": -914061504, "ts": 1716454224754202, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1", "pid": 5, "tid": 7, "ts": 1716454224841334, "dur": 264, "args": { "External id": 216420, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216420, "registers per thread": 226, "shared memory": 49152, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [96, 3, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216420, "pid": 5, "tid": 7, "ts": 1716454224841334, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754203, "dur": 7, "args": { "External id": 216420, "cbid": 211, "correlation": 216420 } }, { "ph": "s", "id": 216420, "pid": 76337, "tid": -914061504, "ts": 1716454224754203, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224841599, "dur": 60, "args": { "External id": 216425, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216425, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216425, "pid": 5, "tid": 7, "ts": 1716454224841599, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754231, "dur": 8, "args": { "External id": 216425, "cbid": 211, "correlation": 216425 } }, { "ph": "s", "id": 216425, "pid": 76337, "tid": -914061504, "ts": 1716454224754231, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224841660, "dur": 50, "args": { "External id": 216433, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216433, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216433, "pid": 5, "tid": 7, "ts": 1716454224841660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754259, "dur": 8, "args": { "External id": 216433, "cbid": 211, "correlation": 216433 } }, { "ph": "s", "id": 216433, "pid": 76337, "tid": -914061504, "ts": 1716454224754259, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224841711, "dur": 35, "args": { "External id": 216441, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216441, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216441, "pid": 5, "tid": 7, "ts": 1716454224841711, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754288, "dur": 8, "args": { "External id": 216441, "cbid": 211, "correlation": 216441 } }, { "ph": "s", "id": 216441, "pid": 76337, "tid": -914061504, "ts": 1716454224754288, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224841748, "dur": 53, "args": { "External id": 216461, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216461, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 216461, "pid": 5, "tid": 7, "ts": 1716454224841748, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754371, "dur": 12, "args": { "External id": 216461, "cbid": 211, "correlation": 216461 } }, { "ph": "s", "id": 216461, "pid": 76337, "tid": -914061504, "ts": 1716454224754371, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224841803, "dur": 4, "args": { "External id": 216473, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216473, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216473, "pid": 5, "tid": 7, "ts": 1716454224841803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754393, "dur": 6, "args": { "External id": 216473, "cbid": 211, "correlation": 216473 } }, { "ph": "s", "id": 216473, "pid": 76337, "tid": -914061504, "ts": 1716454224754393, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224841808, "dur": 55, "args": { "External id": 216476, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216476, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216476, "pid": 5, "tid": 7, "ts": 1716454224841808, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754410, "dur": 7, "args": { "External id": 216476, "cbid": 211, "correlation": 216476 } }, { "ph": "s", "id": 216476, "pid": 76337, "tid": -914061504, "ts": 1716454224754410, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224754468, "dur": 0, "args": { "External id": 216487, "cbid": 317, "correlation": 216487 } }, { "ph": "f", "id": 216487, "pid": 76337, "tid": -914061504, "ts": 1716454224754468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224754468, "dur": 0, "args": { "External id": 216488, "cbid": 203, "correlation": 216488 } }, { "ph": "f", "id": 216488, "pid": 76337, "tid": -914061504, "ts": 1716454224754468, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224754469, "dur": 0, "args": { "External id": 216489, "cbid": 205, "correlation": 216489 } }, { "ph": "f", "id": 216489, "pid": 76337, "tid": -914061504, "ts": 1716454224754469, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754497, "dur": 2, "args": { "External id": 216493, "cbid": 251, "correlation": 216493 } }, { "ph": "f", "id": 216493, "pid": 76337, "tid": -914061504, "ts": 1716454224754497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754500, "dur": 0, "args": { "External id": 216494, "cbid": 251, "correlation": 216494 } }, { "ph": "f", "id": 216494, "pid": 76337, "tid": -914061504, "ts": 1716454224754500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754501, "dur": 0, "args": { "External id": 216495, "cbid": 251, "correlation": 216495 } }, { "ph": "f", "id": 216495, "pid": 76337, "tid": -914061504, "ts": 1716454224754501, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754503, "dur": 0, "args": { "External id": 216496, "cbid": 251, "correlation": 216496 } }, { "ph": "f", "id": 216496, "pid": 76337, "tid": -914061504, "ts": 1716454224754503, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754504, "dur": 1, "args": { "External id": 216497, "cbid": 251, "correlation": 216497 } }, { "ph": "f", "id": 216497, "pid": 76337, "tid": -914061504, "ts": 1716454224754504, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754506, "dur": 1, "args": { "External id": 216498, "cbid": 251, "correlation": 216498 } }, { "ph": "f", "id": 216498, "pid": 76337, "tid": -914061504, "ts": 1716454224754506, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754508, "dur": 1, "args": { "External id": 216499, "cbid": 251, "correlation": 216499 } }, { "ph": "f", "id": 216499, "pid": 76337, "tid": -914061504, "ts": 1716454224754508, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754509, "dur": 1, "args": { "External id": 216500, "cbid": 251, "correlation": 216500 } }, { "ph": "f", "id": 216500, "pid": 76337, "tid": -914061504, "ts": 1716454224754509, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754511, "dur": 0, "args": { "External id": 216501, "cbid": 251, "correlation": 216501 } }, { "ph": "f", "id": 216501, "pid": 76337, "tid": -914061504, "ts": 1716454224754511, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224841863, "dur": 115, "args": { "External id": 216502, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216502, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216502, "pid": 5, "tid": 7, "ts": 1716454224841863, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754514, "dur": 13, "args": { "External id": 216502, "cbid": 211, "correlation": 216502 } }, { "ph": "s", "id": 216502, "pid": 76337, "tid": -914061504, "ts": 1716454224754514, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224841979, "dur": 59, "args": { "External id": 216508, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216508, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216508, "pid": 5, "tid": 7, "ts": 1716454224841979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754549, "dur": 9, "args": { "External id": 216508, "cbid": 211, "correlation": 216508 } }, { "ph": "s", "id": 216508, "pid": 76337, "tid": -914061504, "ts": 1716454224754549, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224842040, "dur": 483, "args": { "External id": 216517, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216517, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216517, "pid": 5, "tid": 7, "ts": 1716454224842040, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754633, "dur": 14, "args": { "External id": 216517, "cbid": 211, "correlation": 216517 } }, { "ph": "s", "id": 216517, "pid": 76337, "tid": -914061504, "ts": 1716454224754633, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224842524, "dur": 182, "args": { "External id": 216539, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216539, "registers per thread": 32, "shared memory": 24, "blocks per SM": 614.4, "warps per SM": 2457.6, "grid": [49152, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216539, "pid": 5, "tid": 7, "ts": 1716454224842524, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754691, "dur": 10, "args": { "External id": 216539, "cbid": 211, "correlation": 216539 } }, { "ph": "s", "id": 216539, "pid": 76337, "tid": -914061504, "ts": 1716454224754691, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754780, "dur": 1, "args": { "External id": 216550, "cbid": 251, "correlation": 216550 } }, { "ph": "f", "id": 216550, "pid": 76337, "tid": -914061504, "ts": 1716454224754780, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224842708, "dur": 197, "args": { "External id": 216551, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216551, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216551, "pid": 5, "tid": 7, "ts": 1716454224842708, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754785, "dur": 13, "args": { "External id": 216551, "cbid": 211, "correlation": 216551 } }, { "ph": "s", "id": 216551, "pid": 76337, "tid": -914061504, "ts": 1716454224754785, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754853, "dur": 1, "args": { "External id": 216562, "cbid": 251, "correlation": 216562 } }, { "ph": "f", "id": 216562, "pid": 76337, "tid": -914061504, "ts": 1716454224754853, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224842906, "dur": 189, "args": { "External id": 216563, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216563, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216563, "pid": 5, "tid": 7, "ts": 1716454224842906, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754857, "dur": 12, "args": { "External id": 216563, "cbid": 211, "correlation": 216563 } }, { "ph": "s", "id": 216563, "pid": 76337, "tid": -914061504, "ts": 1716454224754857, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224754920, "dur": 1, "args": { "External id": 216574, "cbid": 251, "correlation": 216574 } }, { "ph": "f", "id": 216574, "pid": 76337, "tid": -914061504, "ts": 1716454224754920, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224843096, "dur": 187, "args": { "External id": 216575, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216575, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216575, "pid": 5, "tid": 7, "ts": 1716454224843096, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224754925, "dur": 12, "args": { "External id": 216575, "cbid": 211, "correlation": 216575 } }, { "ph": "s", "id": 216575, "pid": 76337, "tid": -914061504, "ts": 1716454224754925, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224843284, "dur": 18744, "args": { "External id": 216596, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216596, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 76.8, "warps per SM": 307.2, "grid": [96, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216596, "pid": 5, "tid": 7, "ts": 1716454224843284, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755018, "dur": 14, "args": { "External id": 216596, "cbid": 211, "correlation": 216596 } }, { "ph": "s", "id": 216596, "pid": 76337, "tid": -914061504, "ts": 1716454224755018, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755117, "dur": 1, "args": { "External id": 216614, "cbid": 251, "correlation": 216614 } }, { "ph": "f", "id": 216614, "pid": 76337, "tid": -914061504, "ts": 1716454224755117, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224862030, "dur": 206, "args": { "External id": 216616, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216616, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [3, 384, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216616, "pid": 5, "tid": 7, "ts": 1716454224862030, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755122, "dur": 13, "args": { "External id": 216616, "cbid": 211, "correlation": 216616 } }, { "ph": "s", "id": 216616, "pid": 76337, "tid": -914061504, "ts": 1716454224755122, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224862237, "dur": 66, "args": { "External id": 216624, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216624, "registers per thread": 17, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216624, "pid": 5, "tid": 7, "ts": 1716454224862237, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755191, "dur": 12, "args": { "External id": 216624, "cbid": 211, "correlation": 216624 } }, { "ph": "s", "id": 216624, "pid": 76337, "tid": -914061504, "ts": 1716454224755191, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224862305, "dur": 97, "args": { "External id": 216632, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216632, "registers per thread": 19, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216632, "pid": 5, "tid": 7, "ts": 1716454224862305, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755230, "dur": 8, "args": { "External id": 216632, "cbid": 211, "correlation": 216632 } }, { "ph": "s", "id": 216632, "pid": 76337, "tid": -914061504, "ts": 1716454224755230, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224862403, "dur": 54, "args": { "External id": 216643, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216643, "pid": 5, "tid": 7, "ts": 1716454224862403, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755302, "dur": 13, "args": { "External id": 216643, "cbid": 211, "correlation": 216643 } }, { "ph": "s", "id": 216643, "pid": 76337, "tid": -914061504, "ts": 1716454224755302, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224862459, "dur": 92, "args": { "External id": 216665, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216665, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216665, "pid": 5, "tid": 7, "ts": 1716454224862459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755333, "dur": 8, "args": { "External id": 216665, "cbid": 211, "correlation": 216665 } }, { "ph": "s", "id": 216665, "pid": 76337, "tid": -914061504, "ts": 1716454224755333, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755418, "dur": 1, "args": { "External id": 216676, "cbid": 251, "correlation": 216676 } }, { "ph": "f", "id": 216676, "pid": 76337, "tid": -914061504, "ts": 1716454224755418, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224862553, "dur": 106, "args": { "External id": 216677, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216677, "registers per thread": 250, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216677, "pid": 5, "tid": 7, "ts": 1716454224862553, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755424, "dur": 12, "args": { "External id": 216677, "cbid": 211, "correlation": 216677 } }, { "ph": "s", "id": 216677, "pid": 76337, "tid": -914061504, "ts": 1716454224755424, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755497, "dur": 1, "args": { "External id": 216688, "cbid": 251, "correlation": 216688 } }, { "ph": "f", "id": 216688, "pid": 76337, "tid": -914061504, "ts": 1716454224755497, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755500, "dur": 0, "args": { "External id": 216689, "cbid": 251, "correlation": 216689 } }, { "ph": "f", "id": 216689, "pid": 76337, "tid": -914061504, "ts": 1716454224755500, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224862660, "dur": 10, "args": { "External id": 216690, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216690, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 216690, "pid": 5, "tid": 7, "ts": 1716454224862660, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755502, "dur": 13, "args": { "External id": 216690, "cbid": 211, "correlation": 216690 } }, { "ph": "s", "id": 216690, "pid": 76337, "tid": -914061504, "ts": 1716454224755502, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224862672, "dur": 5, "args": { "External id": 216692, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216692, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 216692, "pid": 5, "tid": 7, "ts": 1716454224862672, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755519, "dur": 7, "args": { "External id": 216692, "cbid": 211, "correlation": 216692 } }, { "ph": "s", "id": 216692, "pid": 76337, "tid": -914061504, "ts": 1716454224755519, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755581, "dur": 1, "args": { "External id": 216703, "cbid": 251, "correlation": 216703 } }, { "ph": "f", "id": 216703, "pid": 76337, "tid": -914061504, "ts": 1716454224755581, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755584, "dur": 0, "args": { "External id": 216704, "cbid": 251, "correlation": 216704 } }, { "ph": "f", "id": 216704, "pid": 76337, "tid": -914061504, "ts": 1716454224755584, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_f16_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224862678, "dur": 6, "args": { "External id": 216705, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216705, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 1.2, "warps per SM": 4.8, "grid": [8, 2, 6], "block": [128, 1, 1], "est. achieved occupancy %": 8 } }, { "ph": "f", "id": 216705, "pid": 5, "tid": 7, "ts": 1716454224862678, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755586, "dur": 12, "args": { "External id": 216705, "cbid": 211, "correlation": 216705 } }, { "ph": "s", "id": 216705, "pid": 76337, "tid": -914061504, "ts": 1716454224755586, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __half const*, __half const*, __half*, float const*, float const*, __half const*, __half const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224862685, "dur": 3, "args": { "External id": 216707, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216707, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 216707, "pid": 5, "tid": 7, "ts": 1716454224862685, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755599, "dur": 5, "args": { "External id": 216707, "cbid": 211, "correlation": 216707 } }, { "ph": "s", "id": 216707, "pid": 76337, "tid": -914061504, "ts": 1716454224755599, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void attention_kernel_batched >(AttentionKernel::Params)", "pid": 5, "tid": 7, "ts": 1716454224862690, "dur": 157, "args": { "External id": 216728, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216728, "registers per thread": 168, "shared memory": 17152, "blocks per SM": 38.4, "warps per SM": 153.6, "grid": [48, 8, 8], "block": [32, 4, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216728, "pid": 5, "tid": 7, "ts": 1716454224862690, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755672, "dur": 12, "args": { "External id": 216728, "cbid": 211, "correlation": 216728 } }, { "ph": "s", "id": 216728, "pid": 76337, "tid": -914061504, "ts": 1716454224755672, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224755767, "dur": 2, "args": { "External id": 216746, "cbid": 251, "correlation": 216746 } }, { "ph": "f", "id": 216746, "pid": 76337, "tid": -914061504, "ts": 1716454224755767, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224862849, "dur": 107, "args": { "External id": 216748, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216748, "registers per thread": 142, "shared memory": 25088, "blocks per SM": 12, "warps per SM": 48, "grid": [5, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216748, "pid": 5, "tid": 7, "ts": 1716454224862849, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755774, "dur": 13, "args": { "External id": 216748, "cbid": 211, "correlation": 216748 } }, { "ph": "s", "id": 216748, "pid": 76337, "tid": -914061504, "ts": 1716454224755774, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224862957, "dur": 35, "args": { "External id": 216756, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216756, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216756, "pid": 5, "tid": 7, "ts": 1716454224862957, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755842, "dur": 12, "args": { "External id": 216756, "cbid": 211, "correlation": 216756 } }, { "ph": "s", "id": 216756, "pid": 76337, "tid": -914061504, "ts": 1716454224755842, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224862993, "dur": 66, "args": { "External id": 216764, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216764, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216764, "pid": 5, "tid": 7, "ts": 1716454224862993, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755883, "dur": 9, "args": { "External id": 216764, "cbid": 211, "correlation": 216764 } }, { "ph": "s", "id": 216764, "pid": 76337, "tid": -914061504, "ts": 1716454224755883, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::vectorized_layer_norm_kernel(int, float, c10::Half const*, c10::Half const*, c10::Half const*, float*, float*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224863060, "dur": 92, "args": { "External id": 216786, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216786, "registers per thread": 32, "shared memory": 24, "blocks per SM": 307.2, "warps per SM": 1228.8, "grid": [24576, 1, 1], "block": [32, 4, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216786, "pid": 5, "tid": 7, "ts": 1716454224863060, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224755935, "dur": 10, "args": { "External id": 216786, "cbid": 211, "correlation": 216786 } }, { "ph": "s", "id": 216786, "pid": 76337, "tid": -914061504, "ts": 1716454224755935, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756029, "dur": 1, "args": { "External id": 216802, "cbid": 251, "correlation": 216802 } }, { "ph": "f", "id": 216802, "pid": 76337, "tid": -914061504, "ts": 1716454224756029, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x128_ldg8_relu_f2f_stages_32x1_tn", "pid": 5, "tid": 7, "ts": 1716454224863154, "dur": 578, "args": { "External id": 216804, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216804, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 48, "warps per SM": 192, "grid": [20, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216804, "pid": 5, "tid": 7, "ts": 1716454224863154, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756036, "dur": 13, "args": { "External id": 216804, "cbid": 211, "correlation": 216804 } }, { "ph": "s", "id": 216804, "pid": 76337, "tid": -914061504, "ts": 1716454224756036, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::GeluCUDAKernelImpl(at::TensorIteratorBase&, at::native::GeluType)::{lambda()#2}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224863734, "dur": 244, "args": { "External id": 216812, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216812, "pid": 5, "tid": 7, "ts": 1716454224863734, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756101, "dur": 12, "args": { "External id": 216812, "cbid": 211, "correlation": 216812 } }, { "ph": "s", "id": 216812, "pid": 76337, "tid": -914061504, "ts": 1716454224756101, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224863979, "dur": 252, "args": { "External id": 216820, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 768, "warps per SM": 3072, "grid": [61440, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216820, "pid": 5, "tid": 7, "ts": 1716454224863979, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756133, "dur": 8, "args": { "External id": 216820, "cbid": 211, "correlation": 216820 } }, { "ph": "s", "id": 216820, "pid": 76337, "tid": -914061504, "ts": 1716454224756133, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756214, "dur": 1, "args": { "External id": 216836, "cbid": 251, "correlation": 216836 } }, { "ph": "f", "id": 216836, "pid": 76337, "tid": -914061504, "ts": 1716454224756214, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756219, "dur": 0, "args": { "External id": 216838, "cbid": 251, "correlation": 216838 } }, { "ph": "f", "id": 216838, "pid": 76337, "tid": -914061504, "ts": 1716454224756219, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_128x256_ldg8_relu_f2f_tn", "pid": 5, "tid": 7, "ts": 1716454224864233, "dur": 358, "args": { "External id": 216839, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216839, "registers per thread": 254, "shared memory": 65536, "blocks per SM": 3.6, "warps per SM": 28.8, "grid": [3, 96, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 } }, { "ph": "f", "id": 216839, "pid": 5, "tid": 7, "ts": 1716454224864233, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756221, "dur": 13, "args": { "External id": 216839, "cbid": 211, "correlation": 216839 } }, { "ph": "s", "id": 216839, "pid": 76337, "tid": -914061504, "ts": 1716454224756221, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224864592, "dur": 50, "args": { "External id": 216847, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216847, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216847, "pid": 5, "tid": 7, "ts": 1716454224864592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756264, "dur": 10, "args": { "External id": 216847, "cbid": 211, "correlation": 216847 } }, { "ph": "s", "id": 216847, "pid": 76337, "tid": -914061504, "ts": 1716454224756264, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#10}::operator()() const::{lambda(c10::Half)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224864643, "dur": 159, "args": { "External id": 216858, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216858, "pid": 5, "tid": 7, "ts": 1716454224864643, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756332, "dur": 12, "args": { "External id": 216858, "cbid": 211, "correlation": 216858 } }, { "ph": "s", "id": 216858, "pid": 76337, "tid": -914061504, "ts": 1716454224756332, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224756397, "dur": 0, "args": { "External id": 216870, "cbid": 317, "correlation": 216870 } }, { "ph": "f", "id": 216870, "pid": 76337, "tid": -914061504, "ts": 1716454224756397, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224756398, "dur": 0, "args": { "External id": 216871, "cbid": 203, "correlation": 216871 } }, { "ph": "f", "id": 216871, "pid": 76337, "tid": -914061504, "ts": 1716454224756398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224756399, "dur": 0, "args": { "External id": 216872, "cbid": 205, "correlation": 216872 } }, { "ph": "f", "id": 216872, "pid": 76337, "tid": -914061504, "ts": 1716454224756399, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756422, "dur": 1, "args": { "External id": 216876, "cbid": 251, "correlation": 216876 } }, { "ph": "f", "id": 216876, "pid": 76337, "tid": -914061504, "ts": 1716454224756422, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756424, "dur": 0, "args": { "External id": 216877, "cbid": 251, "correlation": 216877 } }, { "ph": "f", "id": 216877, "pid": 76337, "tid": -914061504, "ts": 1716454224756424, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756425, "dur": 0, "args": { "External id": 216878, "cbid": 251, "correlation": 216878 } }, { "ph": "f", "id": 216878, "pid": 76337, "tid": -914061504, "ts": 1716454224756425, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756426, "dur": 0, "args": { "External id": 216879, "cbid": 251, "correlation": 216879 } }, { "ph": "f", "id": 216879, "pid": 76337, "tid": -914061504, "ts": 1716454224756426, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756427, "dur": 0, "args": { "External id": 216880, "cbid": 251, "correlation": 216880 } }, { "ph": "f", "id": 216880, "pid": 76337, "tid": -914061504, "ts": 1716454224756427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756427, "dur": 0, "args": { "External id": 216881, "cbid": 251, "correlation": 216881 } }, { "ph": "f", "id": 216881, "pid": 76337, "tid": -914061504, "ts": 1716454224756427, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756428, "dur": 0, "args": { "External id": 216882, "cbid": 251, "correlation": 216882 } }, { "ph": "f", "id": 216882, "pid": 76337, "tid": -914061504, "ts": 1716454224756428, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756429, "dur": 0, "args": { "External id": 216883, "cbid": 251, "correlation": 216883 } }, { "ph": "f", "id": 216883, "pid": 76337, "tid": -914061504, "ts": 1716454224756429, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224756431, "dur": 0, "args": { "External id": 216884, "cbid": 251, "correlation": 216884 } }, { "ph": "f", "id": 216884, "pid": 76337, "tid": -914061504, "ts": 1716454224756431, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "volta_fp16_s884gemm_fp16_64x128_ldg8_f2f_nn", "pid": 5, "tid": 7, "ts": 1716454224864803, "dur": 114, "args": { "External id": 216885, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216885, "registers per thread": 138, "shared memory": 24832, "blocks per SM": 14.4, "warps per SM": 57.6, "grid": [48, 3, 8], "block": [128, 1, 1], "est. achieved occupancy %": 19 } }, { "ph": "f", "id": 216885, "pid": 5, "tid": 7, "ts": 1716454224864803, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756433, "dur": 12, "args": { "External id": 216885, "cbid": 211, "correlation": 216885 } }, { "ph": "s", "id": 216885, "pid": 76337, "tid": -914061504, "ts": 1716454224756433, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224864919, "dur": 60, "args": { "External id": 216891, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216891, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216891, "pid": 5, "tid": 7, "ts": 1716454224864919, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756468, "dur": 9, "args": { "External id": 216891, "cbid": 211, "correlation": 216891 } }, { "ph": "s", "id": 216891, "pid": 76337, "tid": -914061504, "ts": 1716454224756468, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224864980, "dur": 50, "args": { "External id": 216899, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216899, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216899, "pid": 5, "tid": 7, "ts": 1716454224864980, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756501, "dur": 8, "args": { "External id": 216899, "cbid": 211, "correlation": 216899 } }, { "ph": "s", "id": 216899, "pid": 76337, "tid": -914061504, "ts": 1716454224756501, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy(c10::Half*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 5, "tid": 7, "ts": 1716454224865031, "dur": 97, "args": { "External id": 216908, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216908, "registers per thread": 32, "shared memory": 0, "blocks per SM": 4, "warps per SM": 64, "grid": [160, 2, 1], "block": [512, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216908, "pid": 5, "tid": 7, "ts": 1716454224865031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756540, "dur": 10, "args": { "External id": 216908, "cbid": 211, "correlation": 216908 } }, { "ph": "s", "id": 216908, "pid": 76337, "tid": -914061504, "ts": 1716454224756540, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224865129, "dur": 92, "args": { "External id": 216928, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216928, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 216928, "pid": 5, "tid": 7, "ts": 1716454224865129, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756609, "dur": 11, "args": { "External id": 216928, "cbid": 211, "correlation": 216928 } }, { "ph": "s", "id": 216928, "pid": 76337, "tid": -914061504, "ts": 1716454224756609, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224865223, "dur": 4, "args": { "External id": 216940, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216940, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 2, "grid": [20, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 216940, "pid": 5, "tid": 7, "ts": 1716454224865223, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756630, "dur": 75, "args": { "External id": 216940, "cbid": 211, "correlation": 216940 } }, { "ph": "s", "id": 216940, "pid": 76337, "tid": -914061504, "ts": 1716454224756630, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224865228, "dur": 109, "args": { "External id": 216943, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216943, "pid": 5, "tid": 7, "ts": 1716454224865228, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756716, "dur": 7, "args": { "External id": 216943, "cbid": 211, "correlation": 216943 } }, { "ph": "s", "id": 216943, "pid": 76337, "tid": -914061504, "ts": 1716454224756716, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224865339, "dur": 69, "args": { "External id": 216952, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216952, "registers per thread": 24, "shared memory": 0, "blocks per SM": 384, "warps per SM": 1536, "grid": [30720, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216952, "pid": 5, "tid": 7, "ts": 1716454224865339, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756758, "dur": 10, "args": { "External id": 216952, "cbid": 211, "correlation": 216952 } }, { "ph": "s", "id": 216952, "pid": 76337, "tid": -914061504, "ts": 1716454224756758, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224756810, "dur": 0, "args": { "External id": 216962, "cbid": 317, "correlation": 216962 } }, { "ph": "f", "id": 216962, "pid": 76337, "tid": -914061504, "ts": 1716454224756810, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224756811, "dur": 0, "args": { "External id": 216963, "cbid": 203, "correlation": 216963 } }, { "ph": "f", "id": 216963, "pid": 76337, "tid": -914061504, "ts": 1716454224756811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224756811, "dur": 0, "args": { "External id": 216964, "cbid": 205, "correlation": 216964 } }, { "ph": "f", "id": 216964, "pid": 76337, "tid": -914061504, "ts": 1716454224756811, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224865410, "dur": 77, "args": { "External id": 216968, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216968, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216968, "pid": 5, "tid": 7, "ts": 1716454224865410, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756827, "dur": 11, "args": { "External id": 216968, "cbid": 211, "correlation": 216968 } }, { "ph": "s", "id": 216968, "pid": 76337, "tid": -914061504, "ts": 1716454224756827, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224865488, "dur": 24, "args": { "External id": 216970, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216970, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 80, "warps per SM": 640, "grid": [1, 20, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216970, "pid": 5, "tid": 7, "ts": 1716454224865488, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756841, "dur": 6, "args": { "External id": 216970, "cbid": 211, "correlation": 216970 } }, { "ph": "s", "id": 216970, "pid": 76337, "tid": -914061504, "ts": 1716454224756841, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224865513, "dur": 4, "args": { "External id": 216972, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216972, "pid": 5, "tid": 7, "ts": 1716454224865513, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756853, "dur": 6, "args": { "External id": 216972, "cbid": 211, "correlation": 216972 } }, { "ph": "s", "id": 216972, "pid": 76337, "tid": -914061504, "ts": 1716454224756853, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224756861, "dur": 0, "args": { "External id": 216973, "cbid": 51, "correlation": 216973 } }, { "ph": "s", "id": 216973, "pid": 76337, "tid": -914061504, "ts": 1716454224756861, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224865518, "dur": 1369, "args": { "External id": 216974, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216974, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 216974, "pid": 5, "tid": 7, "ts": 1716454224865518, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756862, "dur": 5, "args": { "External id": 216974, "cbid": 211, "correlation": 216974 } }, { "ph": "s", "id": 216974, "pid": 76337, "tid": -914061504, "ts": 1716454224756862, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224866889, "dur": 60, "args": { "External id": 216979, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216979, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 216979, "pid": 5, "tid": 7, "ts": 1716454224866889, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756889, "dur": 9, "args": { "External id": 216979, "cbid": 211, "correlation": 216979 } }, { "ph": "s", "id": 216979, "pid": 76337, "tid": -914061504, "ts": 1716454224756889, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224866950, "dur": 4, "args": { "External id": 216987, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 216987, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.25, "warps per SM": 1, "grid": [20, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 216987, "pid": 5, "tid": 7, "ts": 1716454224866950, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224756933, "dur": 9, "args": { "External id": 216987, "cbid": 211, "correlation": 216987 } }, { "ph": "s", "id": 216987, "pid": 76337, "tid": -914061504, "ts": 1716454224756933, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224757005, "dur": 1, "args": { "External id": 217003, "cbid": 251, "correlation": 217003 } }, { "ph": "f", "id": 217003, "pid": 76337, "tid": -914061504, "ts": 1716454224757005, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 76337, "tid": -914061504, "ts": 1716454224757011, "dur": 0, "args": { "External id": 217005, "cbid": 251, "correlation": 217005 } }, { "ph": "f", "id": 217005, "pid": 76337, "tid": -914061504, "ts": 1716454224757011, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel(cutlass_70_wmma_tensorop_s161616gemm_f16_32x32_64x2_tn_align8::Params)", "pid": 5, "tid": 7, "ts": 1716454224866955, "dur": 11, "args": { "External id": 217006, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217006, "registers per thread": 96, "shared memory": 17408, "blocks per SM": 0.8, "warps per SM": 3.2, "grid": [8, 2, 4], "block": [128, 1, 1], "est. achieved occupancy %": 5 } }, { "ph": "f", "id": 217006, "pid": 5, "tid": 7, "ts": 1716454224866955, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757012, "dur": 12, "args": { "External id": 217006, "cbid": 211, "correlation": 217006 } }, { "ph": "s", "id": 217006, "pid": 76337, "tid": -914061504, "ts": 1716454224757012, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, true, false>(cublasSplitKParams, float const*, __half const*, __half*, float const*, float const*, __half const*, float const*, __half*, void*, long, float*, int*)", "pid": 5, "tid": 7, "ts": 1716454224866967, "dur": 5, "args": { "External id": 217008, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217008, "registers per thread": 32, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 2, "grid": [10, 1, 1], "block": [32, 16, 1], "est. achieved occupancy %": 3 } }, { "ph": "f", "id": 217008, "pid": 5, "tid": 7, "ts": 1716454224866967, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757026, "dur": 170, "args": { "External id": 217008, "cbid": 211, "correlation": 217008 } }, { "ph": "s", "id": 217008, "pid": 76337, "tid": -914061504, "ts": 1716454224757026, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224866974, "dur": 56, "args": { "External id": 217018, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217018, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217018, "pid": 5, "tid": 7, "ts": 1716454224866974, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757250, "dur": 12, "args": { "External id": 217018, "cbid": 211, "correlation": 217018 } }, { "ph": "s", "id": 217018, "pid": 76337, "tid": -914061504, "ts": 1716454224757250, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::RowwiseMomentsCUDAKernel(long, c10::Half, c10::Half const*, c10::Half*, c10::Half*)", "pid": 5, "tid": 7, "ts": 1716454224867031, "dur": 52, "args": { "External id": 217038, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217038, "registers per thread": 25, "shared memory": 768, "blocks per SM": 3.2, "warps per SM": 51.2, "grid": [256, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 80 } }, { "ph": "f", "id": 217038, "pid": 5, "tid": 7, "ts": 1716454224867031, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757316, "dur": 11, "args": { "External id": 217038, "cbid": 211, "correlation": 217038 } }, { "ph": "s", "id": 217038, "pid": 76337, "tid": -914061504, "ts": 1716454224757316, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::ComputeFusedParamsCUDAKernel(long, long, long, c10::Half const*, c10::Half const*, c10::Half const*, c10::Half const*, at::AccumulateType::type*, at::AccumulateType::type*)", "pid": 5, "tid": 7, "ts": 1716454224867084, "dur": 4, "args": { "External id": 217050, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217050, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.125, "warps per SM": 1, "grid": [10, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217050, "pid": 5, "tid": 7, "ts": 1716454224867084, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757336, "dur": 6, "args": { "External id": 217050, "cbid": 211, "correlation": 217050 } }, { "ph": "s", "id": 217050, "pid": 76337, "tid": -914061504, "ts": 1716454224757336, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1}>(at::TensorIteratorBase&, at::native::(anonymous namespace)::GroupNormKernelImplInternal(at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, long, c10::Half, at::Tensor&, at::Tensor&, at::Tensor&)::{lambda(c10::Half, float, float)#1} const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224867089, "dur": 56, "args": { "External id": 217053, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217053, "pid": 5, "tid": 7, "ts": 1716454224867089, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757355, "dur": 7, "args": { "External id": 217053, "cbid": 211, "correlation": 217053 } }, { "ph": "s", "id": 217053, "pid": 76337, "tid": -914061504, "ts": 1716454224757355, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::silu_kernel(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#3}::operator()() const::{lambda(c10::Half)#1}, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224867147, "dur": 37, "args": { "External id": 217062, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217062, "registers per thread": 24, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217062, "pid": 5, "tid": 7, "ts": 1716454224867147, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757396, "dur": 10, "args": { "External id": 217062, "cbid": 211, "correlation": 217062 } }, { "ph": "s", "id": 217062, "pid": 76337, "tid": -914061504, "ts": 1716454224757396, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224757458, "dur": 0, "args": { "External id": 217072, "cbid": 317, "correlation": 217072 } }, { "ph": "f", "id": 217072, "pid": 76337, "tid": -914061504, "ts": 1716454224757458, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224757459, "dur": 0, "args": { "External id": 217073, "cbid": 203, "correlation": 217073 } }, { "ph": "f", "id": 217073, "pid": 76337, "tid": -914061504, "ts": 1716454224757459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224757459, "dur": 0, "args": { "External id": 217074, "cbid": 205, "correlation": 217074 } }, { "ph": "f", "id": 217074, "pid": 76337, "tid": -914061504, "ts": 1716454224757459, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224867184, "dur": 40, "args": { "External id": 217078, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217078, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217078, "pid": 5, "tid": 7, "ts": 1716454224867184, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757475, "dur": 12, "args": { "External id": 217078, "cbid": 211, "correlation": 217078 } }, { "ph": "s", "id": 217078, "pid": 76337, "tid": -914061504, "ts": 1716454224757475, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224867226, "dur": 14, "args": { "External id": 217080, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217080, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 40, "warps per SM": 320, "grid": [1, 10, 320], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217080, "pid": 5, "tid": 7, "ts": 1716454224867226, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757489, "dur": 5, "args": { "External id": 217080, "cbid": 211, "correlation": 217080 } }, { "ph": "s", "id": 217080, "pid": 76337, "tid": -914061504, "ts": 1716454224757489, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "pid": 5, "tid": 7, "ts": 1716454224867242, "dur": 3, "args": { "External id": 217082, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217082, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.1625, "warps per SM": 1.3, "grid": [13, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 2 } }, { "ph": "f", "id": 217082, "pid": 5, "tid": 7, "ts": 1716454224867242, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757498, "dur": 6, "args": { "External id": 217082, "cbid": 211, "correlation": 217082 } }, { "ph": "s", "id": 217082, "pid": 76337, "tid": -914061504, "ts": 1716454224757498, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 76337, "tid": -914061504, "ts": 1716454224757507, "dur": 0, "args": { "External id": 217083, "cbid": 51, "correlation": 217083 } }, { "ph": "s", "id": 217083, "pid": 76337, "tid": -914061504, "ts": 1716454224757507, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "cudnn_volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc2nchw_tn_v1", "pid": 5, "tid": 7, "ts": 1716454224867246, "dur": 702, "args": { "External id": 217084, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217084, "registers per thread": 254, "shared memory": 33280, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [192, 3, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217084, "pid": 5, "tid": 7, "ts": 1716454224867246, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757508, "dur": 5, "args": { "External id": 217084, "cbid": 211, "correlation": 217084 } }, { "ph": "s", "id": 217084, "pid": 76337, "tid": -914061504, "ts": 1716454224757508, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224867949, "dur": 60, "args": { "External id": 217089, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217089, "pid": 5, "tid": 7, "ts": 1716454224867949, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757535, "dur": 8, "args": { "External id": 217089, "cbid": 211, "correlation": 217089 } }, { "ph": "s", "id": 217089, "pid": 76337, "tid": -914061504, "ts": 1716454224757535, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 76337, "tid": -914061504, "ts": 1716454224757592, "dur": 0, "args": { "External id": 217099, "cbid": 317, "correlation": 217099 } }, { "ph": "f", "id": 217099, "pid": 76337, "tid": -914061504, "ts": 1716454224757592, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetPriority", "pid": 76337, "tid": -914061504, "ts": 1716454224757593, "dur": 0, "args": { "External id": 217100, "cbid": 203, "correlation": 217100 } }, { "ph": "f", "id": 217100, "pid": 76337, "tid": -914061504, "ts": 1716454224757593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetStreamPriorityRange", "pid": 76337, "tid": -914061504, "ts": 1716454224757593, "dur": 0, "args": { "External id": 217101, "cbid": 205, "correlation": 217101 } }, { "ph": "f", "id": 217101, "pid": 76337, "tid": -914061504, "ts": 1716454224757593, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224868010, "dur": 75, "args": { "External id": 217105, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217105, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 192, "warps per SM": 1536, "grid": [96, 20, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217105, "pid": 5, "tid": 7, "ts": 1716454224868010, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757606, "dur": 12, "args": { "External id": 217105, "cbid": 211, "correlation": 217105 } }, { "ph": "s", "id": 217105, "pid": 76337, "tid": -914061504, "ts": 1716454224757606, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "sm70_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage1_warpsize2x2x1_g1_tensor8x8x4_t1r1s1_execute_kernel_cudnn", "pid": 5, "tid": 7, "ts": 1716454224868086, "dur": 209, "args": { "External id": 217107, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217107, "registers per thread": 252, "shared memory": 17408, "blocks per SM": 7.2, "warps per SM": 28.8, "grid": [3, 192, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 } }, { "ph": "f", "id": 217107, "pid": 5, "tid": 7, "ts": 1716454224868086, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757625, "dur": 8, "args": { "External id": 217107, "cbid": 211, "correlation": 217107 } }, { "ph": "s", "id": 217107, "pid": 76337, "tid": -914061504, "ts": 1716454224757625, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_params_t, __half const*, __half*)", "pid": 5, "tid": 7, "ts": 1716454224868297, "dur": 39, "args": { "External id": 217109, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217109, "registers per thread": 32, "shared memory": 2112, "blocks per SM": 96, "warps per SM": 768, "grid": [96, 10, 8], "block": [256, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217109, "pid": 5, "tid": 7, "ts": 1716454224868297, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224757638, "dur": 493, "args": { "External id": 217109, "cbid": 211, "correlation": 217109 } }, { "ph": "s", "id": 217109, "pid": 76337, "tid": -914061504, "ts": 1716454224757638, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 5, "tid": 7, "ts": 1716454224868337, "dur": 59, "args": { "External id": 217115, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217115, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217115, "pid": 5, "tid": 7, "ts": 1716454224868337, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758153, "dur": 10, "args": { "External id": 217115, "cbid": 211, "correlation": 217115 } }, { "ph": "s", "id": 217115, "pid": 76337, "tid": -914061504, "ts": 1716454224758153, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array >(int, at::native::CUDAFunctor_add, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224868398, "dur": 50, "args": { "External id": 217123, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217123, "registers per thread": 19, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217123, "pid": 5, "tid": 7, "ts": 1716454224868398, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758184, "dur": 9, "args": { "External id": 217123, "cbid": 211, "correlation": 217123 } }, { "ph": "s", "id": 217123, "pid": 76337, "tid": -914061504, "ts": 1716454224758184, "cat": "ac2g", "name": "ac2g" }, { "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 5, "tid": 7, "ts": 1716454224868449, "dur": 36, "args": { "External id": 217131, "queued": 0, "device": 5, "context": 1, "stream": 7, "correlation": 217131, "registers per thread": 17, "shared memory": 0, "blocks per SM": 192, "warps per SM": 768, "grid": [15360, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 } }, { "ph": "f", "id": 217131, "pid": 5, "tid": 7, "ts": 1716454224868449, "cat": "ac2g", "name": "ac2g", "bp": "e" }, { "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 76337, "tid": -914061504, "ts": 1716454224758213, "dur": 29, "args": { "External id": 217131, "cbid": 211, "correlation": 217131 } }, { "ph": "s", "id": 217131, "pid": 76337, "tid": -914061504, "ts": 1716454224758213, "cat": "ac2g", "name": "ac2g" }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 76337, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 76337, "tid": 0, "args": { "labels": "CPU" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 76337, "tid": 0, "args": { "sort_index": 76337 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 0, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 0, "tid": 0, "args": { "labels": "GPU 0" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 0, "tid": 0, "args": { "sort_index": 16777216 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 1, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 1, "tid": 0, "args": { "labels": "GPU 1" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 1, "tid": 0, "args": { "sort_index": 16777217 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 2, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 2, "tid": 0, "args": { "labels": "GPU 2" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 2, "tid": 0, "args": { "sort_index": 16777218 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 3, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 3, "tid": 0, "args": { "labels": "GPU 3" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 3, "tid": 0, "args": { "sort_index": 16777219 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 4, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 4, "tid": 0, "args": { "labels": "GPU 4" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 4, "tid": 0, "args": { "sort_index": 16777220 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 5, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 5, "tid": 0, "args": { "labels": "GPU 5" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 5, "tid": 0, "args": { "sort_index": 16777221 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 6, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 6, "tid": 0, "args": { "labels": "GPU 6" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 6, "tid": 0, "args": { "sort_index": 16777222 } }, { "name": "process_name", "ph": "M", "ts": 1716454215749449, "pid": 7, "tid": 0, "args": { "name": "python" } }, { "name": "process_labels", "ph": "M", "ts": 1716454215749449, "pid": 7, "tid": 0, "args": { "labels": "GPU 7" } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 7, "tid": 0, "args": { "sort_index": 16777223 } }, { "name": "thread_name", "ph": "M", "ts": 1716454215749449, "pid": 5, "tid": 7, "args": { "name": "stream 7 " } }, { "name": "thread_sort_index", "ph": "M", "ts": 1716454215749449, "pid": 5, "tid": 7, "args": { "sort_index": 7 } }, { "ph": "X", "cat": "Trace", "ts": 1716454215749281, "dur": 11255661, "pid": "Spans", "tid": "PyTorch Profiler", "name": "PyTorch Profiler (0)", "args": { "Op count": 0 } }, { "name": "process_sort_index", "ph": "M", "ts": 1716454215749281, "pid": "Spans", "tid": 0, "args": { "sort_index": 536870912 } }, { "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 1716454215749281 }, { "name": "Record Window End", "ph": "i", "s": "g", "pid": "", "tid": "", "ts": 1716454227005160 } ], "traceName": "trace.json" }